diff --git a/tools/bazel.rc b/.bazelrc
similarity index 88%
rename from tools/bazel.rc
rename to .bazelrc
index 1fdf51f53e29c7111cf89c016400b710051cf9c6..17285afdb381018d0054e771475327b1f7ed9866 100644
--- a/tools/bazel.rc
+++ b/.bazelrc
@@ -25,12 +25,14 @@ build --define framework_shared_object=true
 # If you would like to use a local MKL instead of downloading, please set the
 # environment variable "TF_MKL_ROOT" every time before build.
 build:mkl --define=build_with_mkl=true --define=enable_mkl=true
+build:mkl --define=tensorflow_mkldnn_contraction_kernel=0
 build:mkl -c opt
 
 # This config option is used to enable MKL-DNN open source library only,
 # without depending on MKL binary version.
 build:mkl_open_source_only --define=build_with_mkl_dnn_only=true
 build:mkl_open_source_only --define=build_with_mkl=true --define=enable_mkl=true
+build:mkl_open_source_only --define=tensorflow_mkldnn_contraction_kernel=0
 
 build:download_clang --crosstool_top=@local_config_download_clang//:toolchain
 build:download_clang --define=using_clang=true
@@ -76,10 +78,9 @@ build:nonccl --define=no_nccl_support=true
 
 build --define=use_fast_cpp_protos=true
 build --define=allow_oversize_protos=true
-build --define=grpc_no_ares=true
 
 build --spawn_strategy=standalone
-build --genrule_strategy=standalone
+build --strategy=Genrule=standalone
 build -c opt
 
 # Other build flags.
@@ -89,7 +90,21 @@ build --define=grpc_no_ares=true
 build:dynamic_kernels --define=dynamic_loaded_kernels=true
 build:dynamic_kernels --copt=-DAUTOLOAD_DYNAMIC_KERNELS
 
+# Build TF with C++ 17 features.
+build:c++17 --cxxopt=-std=c++1z
+build:c++17 --cxxopt=-stdlib=libc++
+build:c++1z --cxxopt=-std=c++1z
+build:c++1z --cxxopt=-stdlib=libc++
+
 # Default paths for TF_SYSTEM_LIBS
 build --define=PREFIX=/usr
 build --define=LIBDIR=$(PREFIX)/lib
 build --define=INCLUDEDIR=$(PREFIX)/include
+
+# Default options should come above this line
+
+# Options from ./configure
+try-import %workspace%/.tf_configure.bazelrc
+
+# Put user-specific options in .bazelrc.user
+try-import %workspace%/.bazelrc.user
diff --git a/.gitignore b/.gitignore
index 90324058600bee46af56e49028977971848a80de..e1d352c238a1b2d4febe0f5d4a30cfa0c942f7e7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,7 @@
 .DS_Store
 .ipynb_checkpoints
 node_modules
-/.bazelrc
+/.bazelrc.user
 /.tf_configure.bazelrc
 /bazel-*
 /bazel_pip
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 4a296f265f7b9521c46d350cec26ff199f43eb6c..b978f89f9e1d79dd4f7481711a59c2b94e8bf01b 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -150,41 +150,45 @@ may exist in your changes.
 
 There are two ways to run TensorFlow unit tests.
 
-1. Using tools and libraries installed directly on your system.
+1.  Using tools and libraries installed directly on your system.
 
-   Refer to the
-   [CPU-only developer Dockerfile](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/docker/Dockerfile.devel) and
-   [GPU developer Dockerfile](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/docker/Dockerfile.devel-gpu)
-   for the required packages. Alternatively, use the said
-   [Docker images](https://hub.docker.com/r/tensorflow/tensorflow/tags/), e.g.,
-   `tensorflow/tensorflow:nightly-devel` and `tensorflow/tensorflow:nightly-devel-gpu`
-   for development to avoid installing the packages directly on your system.
+    Refer to the
+    [CPU-only developer Dockerfile](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/docker/Dockerfile.devel)
+    and
+    [GPU developer Dockerfile](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/docker/Dockerfile.devel-gpu)
+    for the required packages. Alternatively, use the said
+    [Docker images](https://hub.docker.com/r/tensorflow/tensorflow/tags/), e.g.,
+    `tensorflow/tensorflow:nightly-devel` and
+    `tensorflow/tensorflow:nightly-devel-gpu` for development to avoid
+    installing the packages directly on your system (in which case remember to
+    change directory from `/root` to `/tensorflow` once you get into the running
+    container so `bazel` can find the `tensorflow` workspace).
 
-   Once you have the packages installed, you can run a specific unit test in
-   bazel by doing as follows:
+    Once you have the packages installed, you can run a specific unit test in
+    bazel by doing as follows:
 
-   If the tests are to be run on GPU, add CUDA paths to LD_LIBRARY_PATH and add
-   the `cuda` option flag
+    If the tests are to be run on GPU, add CUDA paths to LD_LIBRARY_PATH and add
+    the `cuda` option flag
 
-   ```bash
-   export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH"
+    ```bash
+    export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH"
 
-   export flags="--config=opt --config=cuda -k"
-   ```
+    export flags="--config=opt --config=cuda -k"
+    ```
 
-   For example, to run all tests under tensorflow/python, do:
+    For example, to run all tests under tensorflow/python, do:
 
-   ```bash
-   bazel test ${flags} //tensorflow/python/...
-   ```
+    ```bash
+    bazel test ${flags} //tensorflow/python/...
+    ```
 
-2. Using [Docker](https://www.docker.com) and TensorFlow's CI scripts.
+2.  Using [Docker](https://www.docker.com) and TensorFlow's CI scripts.
 
-   ```bash
-   # Install Docker first, then this will build and run cpu tests
-   tensorflow/tools/ci_build/ci_build.sh CPU bazel test //tensorflow/...
-   ```
-
-   See
-   [TensorFlow Builds](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/tools/ci_build) for details.
+    ```bash
+    # Install Docker first, then this will build and run cpu tests
+    tensorflow/tools/ci_build/ci_build.sh CPU bazel test //tensorflow/...
+    ```
 
+    See
+    [TensorFlow Builds](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/tools/ci_build)
+    for details.
diff --git a/README.md b/README.md
index 044174947a094d43a51f7140dd40ec0f17801d40..96a8ecf4f693d5634da63f4ecc6f4e9c35751f5b 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,8 @@ organization for the purposes of conducting machine learning and deep neural
 networks research.  The system is general enough to be applicable in a wide
 variety of other domains, as well.
 
-TensorFlow provides stable Python API and C APIs as well as without API backwards compatibility guarantee like C++, Go, Java, JavaScript and Swift.
+TensorFlow provides stable Python and C APIs as well as non-guaranteed backwards
+compatible API's for C++, Go, Java, JavaScript and Swift.
 
 Keep up to date with release announcements and security updates by
 subscribing to
@@ -57,21 +58,24 @@ Simply run `pip install tf-nightly` or `pip install tf-nightly-gpu` in a clean
 environment to install the nightly TensorFlow build. We support CPU and GPU
 packages on Linux, Mac, and Windows.
 
-
 #### *Try your first TensorFlow program*
+
 ```shell
 $ python
 ```
+
 ```python
 >>> import tensorflow as tf
 >>> tf.enable_eager_execution()
->>> tf.add(1, 2)
+>>> tf.add(1, 2).numpy()
 3
 >>> hello = tf.constant('Hello, TensorFlow!')
 >>> hello.numpy()
 'Hello, TensorFlow!'
 ```
-Learn more examples about how to do specific tasks in TensorFlow at the [tutorials page of tensorflow.org](https://www.tensorflow.org/tutorials/).
+
+Learn more examples about how to do specific tasks in TensorFlow at the
+[tutorials page of tensorflow.org](https://www.tensorflow.org/tutorials/).
 
 ## Contribution guidelines
 
@@ -113,11 +117,12 @@ The TensorFlow project strives to abide by generally accepted best practices in
 Build Type                                                                                                                                                                                      | Status                                                                                                                                                                                   | Artifacts
 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------
 **IBM s390x**                                                                                                                                                                                   | [![Build Status](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/badge/icon)](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/)                                                        | TBA
-**IBM ppc64le CPU**                                                                                                                                                                             | [![Build Status](http://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Build/badge/icon)](http://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Build/)                                    | TBA
-**IBM ppc64le GPU** Nightly                                                                                                                                                                     | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/)            | [Nightly](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/)
-**IBM ppc64le GPU** Stable Release                                                                                                                                                              | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/)                  | [Release](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/)
+**Linux ppc64le CPU** Nightly                                                                                                                                                                   | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Build/)                                  | [Nightly](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Nightly_Artifact/)
+**Linux ppc64le CPU** Stable Release                                                                                                                                                            | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/)                  | [Release](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/)
+**Linux ppc64le GPU** Nightly                                                                                                                                                                   | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Build/)                                  | [Nightly](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/)
+**Linux ppc64le GPU** Stable Release                                                                                                                                                            | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/)                  | [Release](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/)
 **Linux CPU with Intel® MKL-DNN** Nightly                                                                                                                                                       | [![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-linux-cpu/badge/icon)](https://tensorflow-ci.intel.com/job/tensorflow-mkl-linux-cpu/)                                | [Nightly](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-whl-nightly/)
-**Linux CPU with Intel® MKL-DNN** Python 2.7<br> **Linux CPU with Intel® MKL-DNN** Python 3.4<br> **Linux CPU with Intel® MKL-DNN** Python 3.5<br> **Linux CPU with Intel® MKL-DNN** Python 3.6 | [![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-release-whl/badge/icon)](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-release-whl/lastStableBuild) | [1.11.0 py2.7](https://storage.googleapis.com/intel-optimized-tensorflow/tensorflow-1.11.0-cp27-cp27mu-linux_x86_64.whl)<br>[1.11.0 py3.4](https://storage.googleapis.com/intel-optimized-tensorflow/tensorflow-1.11.0-cp34-cp34m-linux_x86_64.whl)<br>[1.11.0 py3.5](https://storage.googleapis.com/intel-optimized-tensorflow/tensorflow-1.11.0-cp35-cp35m-linux_x86_64.whl)<br>[1.11.0 py3.6](https://storage.googleapis.com/intel-optimized-tensorflow/tensorflow-1.11.0-cp36-cp36m-linux_x86_64.whl)
+**Linux CPU with Intel® MKL-DNN** Python 2.7<br> **Linux CPU with Intel® MKL-DNN** Python 3.4<br> **Linux CPU with Intel® MKL-DNN** Python 3.5<br> **Linux CPU with Intel® MKL-DNN** Python 3.6 | [![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-release-whl/badge/icon)](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-release-whl/lastStableBuild) | [1.12.0 py2.7](https://storage.googleapis.com/intel-optimized-tensorflow/tensorflow-1.12.0-cp27-cp27mu-linux_x86_64.whl)<br>[1.12.0 py3.4](https://storage.googleapis.com/intel-optimized-tensorflow/tensorflow-1.12.0-cp34-cp34m-linux_x86_64.whl)<br>[1.12.0 py3.5](https://storage.googleapis.com/intel-optimized-tensorflow/tensorflow-1.12.0-cp35-cp35m-linux_x86_64.whl)<br>[1.12.0 py3.6](https://storage.googleapis.com/intel-optimized-tensorflow/tensorflow-1.12.0-cp36-cp36m-linux_x86_64.whl)
 
 ## For more information
 
diff --git a/RELEASE.md b/RELEASE.md
index b13b071bd6cf4d3a260c8e248a67d23e1a688498..0a56e6909870e398c9d6349576cd2f8e6734f072 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -7,6 +7,8 @@
     Serving.
 *   Keras models now support evaluating with a `tf.data.Dataset`.
 *   TensorFlow binaries are built with XLA support linked in by default.
+*   Ignite Dataset added to contrib/ignite that allows to work with Apache
+    Ignite.
 
 ## Bug Fixes and Other Changes
 
@@ -280,50 +282,76 @@ Ag Ramesh, Alex Wiltschko, Alexander Pantyukhin, Amogh Mannekote, An Jiaoyang, A
 
 ## Bug Fixes and Other Changes
 
-* `tfe.Network` is deprecated. Please inherit from `tf.keras.Model`.
-* Layered variable names have changed in the following conditions:
-  * Using `tf.keras.layers` with custom variable scopes.
-  * Using `tf.layers` in  a subclassed `tf.keras.Model` class. See
-    [here](https://www.tensorflow.org/versions/r1.9/api_docs/python/tf/layers) for more details
-* `tf.data`:
-  * `Dataset.from_generator()` now accepts an `args` list, in order to create nested generators.
-  * `Dataset.list_files()` now produces determinstic results when `shuffle=False` or a `seed` is passed.
-  * `tf.contrib.data.sample_from_datasets()` and `tf.contrib.data.choose_from_datasets()` make it easier to sample or deterministically choose elements from multiple datasets.
-  * `tf.contrib.data.make_csv_dataset()` now supports line breaks in quoted strings, and two infrequently used arguments removed.
-  * (C++) `DatasetBase::DebugString()` is now `const`.
-  * (C++) `DatasetBase::MakeIterator()` has been renamed to `DatasetBase::MakeIteratorInternal()`.
-  * (C++) `IteratorBase::Initialize()` method was added to support raising errors during iterator construction.
-* Eager Execution:
-  * Added the ability to pause recording operations for gradient computation via `tf.GradientTape.stop_recording`.
-  * Updated documentation, introductory notebooks.
-* `tf.keras`:
-  * Move Keras code out of _impl folder and remove API files.
-  * `tf.keras.Model.save_weights` now saves in TensorFlow format by default.
-  * Enable dataset iterators to be passed to `tf.keras.Model` training/eval methods.
-* TensorFlow Debugger (tfdbg) CLI: fix an issue in which the TensorBoard Debugger Plugin could not handle total source file size exceeding gRPC message size limit (4 MB).
-* `tf.contrib`:
-  * `tf.contrib.framework.zero_initializer` supports ResourceVariable.
-  * Adding "constrained_optimization" to tensorflow/contrib.
-* Other:
-  * Add GCS Configuration Ops.
-  * Changing signature of `MakeIterator` to enable propagating error status.
-  * KL divergence for two Dirichlet distributions.
-  * More consistent GcsFileSystem behavior for certain reads past EOF.
-  * Update benchmark for tf.scan to match ranges across eager and graph modes.
-  * Fixed bug in `tf.reduce_prod gradient` for complex dtypes.
-  * Allow the use of '.' in variables (e.g. "hparams.parse('a.b=1.0')"), which would previously raise an error. This will correspond to an attribute name with an embedded '.' symbol (e.g. 'a.b'), which can only be accessed indirectly (e.g. through getattr and setattr).  To set this up the user will first need to explicitly add the variable to the hparam object (e.g. "hparams.add_hparam(name='a.b', value=0.0)").
-  * Benchmark for tf.scan in graph and eager modes.
-  * Added complex128 support to FFT, FFT2D, FFT3D, IFFT, IFFT2D, and IFFT3D.
-  * Making ids unique in `nn.embedding_lookup_sparse`. This helps to reduce RPC calls for looking up the embeddings when there are repeated ids in the batch.
-  * Support indicator column in boosted trees.
-  * Prevent `tf.gradients()` from backpropagating through integer tensors.
-  * LinearOperator[1D,2D,3D]Circulant added to `tensorflow.linalg`.
-  * Conv3D, Conv3DBackpropInput, Conv3DBackpropFilter now supports arbitrary.
-  * Added `tf.train.Checkpoint` for reading/writing object-based checkpoints.
-  * Added LinearOperatorKronecker, a dense-free implementation of the Kronecker Product.
-  * Allow LinearOperator to broadcast.
-  * SavedModelBuilder will now deduplicate asset names that point to files with the same basename and the same contents. Note that this may result in new asset files included in SavedModels in cases where assets with the same name but different contents were previously overwriting each other.
-
+*   `tfe.Network` is deprecated. Please inherit from `tf.keras.Model`.
+*   Layered variable names have changed in the following conditions:
+    *   Using `tf.keras.layers` with custom variable scopes.
+    *   Using `tf.layers` in a subclassed `tf.keras.Model` class. See
+        [here](https://www.tensorflow.org/versions/r1.9/api_docs/python/tf/layers)
+        for more details
+*   `tf.data`:
+    *   `Dataset.from_generator()` now accepts an `args` list, in order to
+        create nested generators.
+    *   `Dataset.list_files()` now produces deterministic results when
+        `shuffle=False` or a `seed` is passed.
+    *   `tf.contrib.data.sample_from_datasets()` and
+        `tf.contrib.data.choose_from_datasets()` make it easier to sample or
+        deterministically choose elements from multiple datasets.
+    *   `tf.contrib.data.make_csv_dataset()` now supports line breaks in quoted
+        strings, and two infrequently used arguments removed.
+    *   (C++) `DatasetBase::DebugString()` is now `const`.
+    *   (C++) `DatasetBase::MakeIterator()` has been renamed to
+        `DatasetBase::MakeIteratorInternal()`.
+    *   (C++) `IteratorBase::Initialize()` method was added to support raising
+        errors during iterator construction.
+*   Eager Execution:
+    *   Added the ability to pause recording operations for gradient computation
+        via `tf.GradientTape.stop_recording`.
+    *   Updated documentation, introductory notebooks.
+*   `tf.keras`:
+    *   Move Keras code out of _impl folder and remove API files.
+    *   `tf.keras.Model.save_weights` now saves in TensorFlow format by default.
+    *   Enable dataset iterators to be passed to `tf.keras.Model` training/eval
+        methods.
+*   TensorFlow Debugger (tfdbg) CLI: fix an issue in which the TensorBoard
+    Debugger Plugin could not handle total source file size exceeding gRPC
+    message size limit (4 MB).
+*   `tf.contrib`:
+    *   `tf.contrib.framework.zero_initializer` supports ResourceVariable.
+    *   Adding "constrained_optimization" to tensorflow/contrib.
+*   Other:
+    *   Add GCS Configuration Ops.
+    *   Changing signature of `MakeIterator` to enable propagating error status.
+    *   KL divergence for two Dirichlet distributions.
+    *   More consistent GcsFileSystem behavior for certain reads past EOF.
+    *   Update benchmark for tf.scan to match ranges across eager and graph
+        modes.
+    *   Fixed bug in `tf.reduce_prod gradient` for complex dtypes.
+    *   Allow the use of '.' in variables (e.g. "hparams.parse('a.b=1.0')"),
+        which would previously raise an error. This will correspond to an
+        attribute name with an embedded '.' symbol (e.g. 'a.b'), which can only
+        be accessed indirectly (e.g. through getattr and setattr). To set this
+        up the user will first need to explicitly add the variable to the hparam
+        object (e.g. "hparams.add_hparam(name='a.b', value=0.0)").
+    *   Benchmark for tf.scan in graph and eager modes.
+    *   Added complex128 support to FFT, FFT2D, FFT3D, IFFT, IFFT2D, and IFFT3D.
+    *   Making ids unique in `nn.embedding_lookup_sparse`. This helps to reduce
+        RPC calls for looking up the embeddings when there are repeated ids in
+        the batch.
+    *   Support indicator column in boosted trees.
+    *   Prevent `tf.gradients()` from backpropagating through integer tensors.
+    *   LinearOperator[1D,2D,3D]Circulant added to `tensorflow.linalg`.
+    *   Conv3D, Conv3DBackpropInput, Conv3DBackpropFilter now supports
+        arbitrary.
+    *   Added `tf.train.Checkpoint` for reading/writing object-based
+        checkpoints.
+    *   Added LinearOperatorKronecker, a dense-free implementation of the
+        Kronecker Product.
+    *   Allow LinearOperator to broadcast.
+    *   SavedModelBuilder will now deduplicate asset names that point to files
+        with the same basename and the same contents. Note that this may result
+        in new asset files included in SavedModels in cases where assets with
+        the same name but different contents were previously overwriting each
+        other.
 
 ## Thanks to our Contributors
 
@@ -821,7 +849,7 @@ answered questions, and were part of inspiring discussions.
 * Remove `tf.contrib.data.Iterator.from_dataset()` method. Use
   `Dataset.make_initializable_iterator()` instead.
 * Remove seldom used and unnecessary `tf.contrib.data.Iterator.dispose_op()`.
-* Reorder some TFGAN loss functions in a non-backwards compatible way.
+* Reorder some TF-GAN loss functions in a non-backwards compatible way.
 
 ## Known Issues
 * In Python 3, `Dataset.from_generator()` does not support Unicode strings.
diff --git a/WORKSPACE b/WORKSPACE
index 7cc08e0164a202581ad7ebbe107a9e19410e70e4..9f07b9fd47136d058cc4039ed6948db539485039 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -1,14 +1,14 @@
 workspace(name = "org_tensorflow")
 
-load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive", "http_file")
 
 http_archive(
     name = "io_bazel_rules_closure",
-    sha256 = "a38539c5b5c358548e75b44141b4ab637bba7c4dc02b46b1f62a96d6433f56ae",
-    strip_prefix = "rules_closure-dbb96841cc0a5fb2664c37822803b06dab20c7d1",
+    sha256 = "43c9b882fa921923bcba764453f4058d102bece35a37c9f6383c713004aacff1",
+    strip_prefix = "rules_closure-9889e2348259a5aad7e805547c1a0cf311cfcd91",
     urls = [
-        "https://mirror.bazel.build/github.com/bazelbuild/rules_closure/archive/dbb96841cc0a5fb2664c37822803b06dab20c7d1.tar.gz",
-        "https://github.com/bazelbuild/rules_closure/archive/dbb96841cc0a5fb2664c37822803b06dab20c7d1.tar.gz",  # 2018-04-13
+        "https://mirror.bazel.build/github.com/bazelbuild/rules_closure/archive/9889e2348259a5aad7e805547c1a0cf311cfcd91.tar.gz",
+        "https://github.com/bazelbuild/rules_closure/archive/9889e2348259a5aad7e805547c1a0cf311cfcd91.tar.gz",  # 2018-12-21
     ],
 )
 
@@ -16,38 +16,52 @@ load("@io_bazel_rules_closure//closure:defs.bzl", "closure_repositories")
 
 closure_repositories()
 
-http_archive(
-    name = "base_images_docker",
-    sha256 = "e2b1b7254270bb7605e814a9dbf6d1e4ae04a11136ff1714fbfdabe3f87f7cf9",
-    strip_prefix = "base-images-docker-12801524f867e657fbb5d1a74f31618aff181ac6",
-    urls = ["https://github.com/GoogleCloudPlatform/base-images-docker/archive/12801524f867e657fbb5d1a74f31618aff181ac6.tar.gz"],
-)
+load("//third_party/toolchains/preconfig/generate:archives.bzl",
+     "bazel_toolchains_archive")
 
-http_archive(
-    name = "bazel_toolchains",
-    sha256 = "15b5858b1b5541ec44df31b94c3b8672815b31d71215a98398761ea9f4c4eedb",
-    strip_prefix = "bazel-toolchains-6200b238c9c2d137c0d9a7262c80cc71d98e692b",
-    urls = [
-        "https://github.com/bazelbuild/bazel-toolchains/archive/6200b238c9c2d137c0d9a7262c80cc71d98e692b.tar.gz",
-    ],
+bazel_toolchains_archive()
+
+load(
+    "@bazel_toolchains//repositories:repositories.bzl",
+    bazel_toolchains_repositories = "repositories",
 )
 
-http_archive(
-    name = "io_bazel_rules_docker",
-    sha256 = "29d109605e0d6f9c892584f07275b8c9260803bf0c6fcb7de2623b2bedc910bd",
-    strip_prefix = "rules_docker-0.5.1",
-    urls = ["https://github.com/bazelbuild/rules_docker/archive/v0.5.1.tar.gz"],
+bazel_toolchains_repositories()
+
+load(
+    "@io_bazel_rules_docker//repositories:repositories.bzl",
+    container_repositories = "repositories",
 )
 
-load("//third_party/toolchains/preconfig/generate:workspace.bzl", "remote_config_workspace")
+container_repositories()
+
+load("//third_party/toolchains/preconfig/generate:workspace.bzl",
+     "remote_config_workspace")
 
 remote_config_workspace()
 
+# Apple and Swift rules.
+http_archive(
+    name = "build_bazel_rules_apple",
+    sha256 = "73b4980a318d203d3307f850e27e66ec5cc8d223147a3475a6f11597eb6438a5",
+    strip_prefix = "rules_apple-0.13.0",
+    urls = ["https://github.com/bazelbuild/rules_apple/archive/0.13.0.tar.gz"],
+)
+http_file(
+    name = "xctestrunner",
+    executable = 1,
+    urls = ["https://github.com/google/xctestrunner/releases/download/0.2.6/ios_test_runner.par"],
+)
+load("@build_bazel_rules_apple//apple:repositories.bzl", "apple_rules_dependencies")
+apple_rules_dependencies()
+load("@build_bazel_rules_swift//swift:repositories.bzl", "swift_rules_dependencies")
+swift_rules_dependencies()
+
 # We must check the bazel version before trying to parse any other BUILD
 # files, in case the parsing of those build files depends on the bazel
 # version we require here.
 load("//tensorflow:version_check.bzl", "check_bazel_version_at_least")
-check_bazel_version_at_least("0.15.0")
+check_bazel_version_at_least("0.19.0")
 
 load("//tensorflow:workspace.bzl", "tf_workspace")
 
@@ -108,4 +122,3 @@ http_archive(
         "http://download.tensorflow.org/models/speech_commands_v0.01.zip",
     ],
 )
-
diff --git a/tensorflow/opensource_only/arm_compiler.BUILD b/arm_compiler.BUILD
similarity index 100%
rename from tensorflow/opensource_only/arm_compiler.BUILD
rename to arm_compiler.BUILD
diff --git a/configure.py b/configure.py
index 6c905a0be3d685b5921dfbc5bddfbe6471a82625..3eb09a1ae905b70dc5d02fab7c316f73c79633dd 100644
--- a/configure.py
+++ b/configure.py
@@ -33,7 +33,7 @@ except ImportError:
   from distutils.spawn import find_executable as which
 # pylint: enable=g-import-not-at-top
 
-_DEFAULT_CUDA_VERSION = '9.0'
+_DEFAULT_CUDA_VERSION = '10.0'
 _DEFAULT_CUDNN_VERSION = '7'
 _DEFAULT_CUDA_COMPUTE_CAPABILITIES = '3.5,7.0'
 _DEFAULT_CUDA_PATH = '/usr/local/cuda'
@@ -55,6 +55,12 @@ NCCL_LIB_PATHS = [
     'lib64/', 'lib/powerpc64le-linux-gnu/', 'lib/x86_64-linux-gnu/', ''
 ]
 
+# List of files to be configured for using Bazel on Apple platforms.
+APPLE_BAZEL_FILES = [
+    'tensorflow/lite/experimental/objc/BUILD',
+    'tensorflow/lite/experimental/swift/BUILD'
+]
+
 if platform.machine() == 'ppc64le':
   _DEFAULT_TENSORRT_PATH_LINUX = '/usr/lib/powerpc64le-linux-gnu/'
 else:
@@ -255,18 +261,7 @@ def setup_python(environ_cp):
 def reset_tf_configure_bazelrc():
   """Reset file that contains customized config settings."""
   open(_TF_BAZELRC, 'w').close()
-  bazelrc_path = os.path.join(_TF_WORKSPACE_ROOT, '.bazelrc')
-
-  data = []
-  if os.path.exists(bazelrc_path):
-    with open(bazelrc_path, 'r') as f:
-      data = f.read().splitlines()
-  with open(bazelrc_path, 'w') as f:
-    for l in data:
-      if _TF_BAZELRC_FILENAME in l:
-        continue
-      f.write('%s\n' % l)
-    f.write('import %%workspace%%/%s\n' % _TF_BAZELRC_FILENAME)
+
 
 def cleanup_makefile():
   """Delete any leftover BUILD files from the Makefile build.
@@ -488,11 +483,14 @@ def check_bazel_version(min_version, max_version):
   if curr_version_int < min_version_int:
     print('Please upgrade your bazel installation to version %s or higher to '
           'build TensorFlow!' % min_version)
-    sys.exit(0)
-  if curr_version_int > max_version_int:
+    sys.exit(1)
+  if (curr_version_int > max_version_int and
+      'TF_IGNORE_MAX_BAZEL_VERSION' not in os.environ):
     print('Please downgrade your bazel installation to version %s or lower to '
-          'build TensorFlow!' % max_version)
-    sys.exit(0)
+          'build TensorFlow! To downgrade: download the installer for the old '
+          'version (from https://github.com/bazelbuild/bazel/releases) then '
+          'run the installer.' % max_version)
+    sys.exit(1)
   return curr_version
 
 
@@ -794,8 +792,7 @@ def set_gcc_host_compiler_path(environ_cp):
       environ_cp,
       var_name='GCC_HOST_COMPILER_PATH',
       var_default=default_gcc_host_compiler_path,
-      ask_for_var=
-      'Please specify which gcc should be used by nvcc as the host compiler.',
+      ask_for_var='Please specify which gcc should be used by nvcc as the host compiler.',
       check_success=os.path.exists,
       error_msg='Invalid gcc path. %s cannot be found.',
   )
@@ -1246,6 +1243,7 @@ def set_tf_nccl_install_path(environ_cp):
   environ_cp['TF_NCCL_VERSION'] = tf_nccl_version
   write_action_env_to_bazelrc('TF_NCCL_VERSION', tf_nccl_version)
 
+
 def get_native_cuda_compute_capabilities(environ_cp):
   """Get native cuda compute capabilities.
 
@@ -1282,13 +1280,15 @@ def set_tf_cuda_compute_capabilities(environ_cp):
 
     ask_cuda_compute_capabilities = (
         'Please specify a list of comma-separated '
-        'Cuda compute capabilities you want to '
+        'CUDA compute capabilities you want to '
         'build with.\nYou can find the compute '
         'capability of your device at: '
         'https://developer.nvidia.com/cuda-gpus.\nPlease'
         ' note that each additional compute '
         'capability significantly increases your '
-        'build time and binary size. [Default is: %s]: ' %
+        'build time and binary size, and that '
+        'TensorFlow only supports compute '
+        'capabilities >= 3.5 [Default is: %s]: ' %
         default_cuda_compute_capabilities)
     tf_cuda_compute_capabilities = get_from_env_or_user_or_default(
         environ_cp, 'TF_CUDA_COMPUTE_CAPABILITIES',
@@ -1301,12 +1301,14 @@ def set_tf_cuda_compute_capabilities(environ_cp):
     for compute_capability in tf_cuda_compute_capabilities.split(','):
       m = re.match('[0-9]+.[0-9]+', compute_capability)
       if not m:
-        print('Invalid compute capability: ' % compute_capability)
+        print('Invalid compute capability: %s' % compute_capability)
         all_valid = False
       else:
-        ver = int(m.group(0).split('.')[0])
-        if ver < 3:
-          print('Only compute capabilities 3.0 or higher are supported.')
+        ver = float(m.group(0))
+        if ver < 3.5:
+          print('ERROR: TensorFlow only supports CUDA compute capabilities 3.5 '
+                'and higher. Please re-specify the list of compute '
+                'capabilities excluding version %s.' % ver)
           all_valid = False
 
     if all_valid:
@@ -1491,7 +1493,35 @@ def set_other_mpi_vars(environ_cp):
   else:
     raise ValueError(
         'Cannot find the MPI library file in %s/lib or %s/lib64 or %s/lib32' %
-        mpi_home, mpi_home, mpi_home)
+        (mpi_home, mpi_home, mpi_home))
+
+def system_specific_test_config(env):
+  """Add default test flags required for TF tests to bazelrc."""
+  write_to_bazelrc('test --flaky_test_attempts=3')
+  write_to_bazelrc('test --test_size_filters=small,medium')
+  write_to_bazelrc(
+      'test --test_tag_filters=-benchmark-test,-no_oss,-oss_serial')
+  write_to_bazelrc('test --build_tag_filters=-benchmark-test,-no_oss')
+  if is_windows():
+    if env.get('TF_NEED_CUDA', None) == 1:
+      write_to_bazelrc(
+          'test --test_tag_filters=-no_windows,-no_windows_gpu,-no_gpu')
+      write_to_bazelrc(
+          'test --build_tag_filters=-no_windows,-no_windows_gpu,-no_gpu')
+    else:
+      write_to_bazelrc('test --test_tag_filters=-no_windows,-gpu')
+      write_to_bazelrc('test --build_tag_filters=-no_windows,-gpu')
+  elif is_macos():
+    write_to_bazelrc('test --test_tag_filters=-gpu,-nomac,-no_mac')
+    write_to_bazelrc('test --build_tag_filters=-gpu,-nomac,-no_mac')
+  elif is_linux():
+    if env.get('TF_NEED_CUDA', None) == 1:
+      write_to_bazelrc('test --test_tag_filters=-no_gpu')
+      write_to_bazelrc('test --build_tag_filters=-no_gpu')
+      write_to_bazelrc('test --test_env=LD_LIBRARY_PATH')
+    else:
+      write_to_bazelrc('test --test_tag_filters=-gpu')
+      write_to_bazelrc('test --build_tag_filters=-gpu')
 
 
 def set_system_libs_flag(environ_cp):
@@ -1522,10 +1552,6 @@ def set_windows_build_flags(environ_cp):
   # The host and target platforms are the same in Windows build. So we don't
   # have to distinct them. This avoids building the same targets twice.
   write_to_bazelrc('build --distinct_host_configuration=false')
-  # Enable short object file path to avoid long path issue on Windows.
-  # TODO(pcloudy): Remove this flag when upgrading Bazel to 0.16.0
-  # Short object file path will be enabled by default.
-  write_to_bazelrc('build --experimental_shortened_obj_file_path=true')
 
   if get_var(
       environ_cp, 'TF_OVERRIDE_EIGEN_STRONG_INLINE', 'Eigen strong inline',
@@ -1546,6 +1572,23 @@ def config_info_line(name, help_text):
   print('\t--config=%-12s\t# %s' % (name, help_text))
 
 
+def configure_apple_bazel_rules():
+  """Configures Bazel rules for building on Apple platforms.
+
+  Enables analyzing and building Apple Bazel rules on Apple platforms. This
+  function will only be executed if `is_macos()` is true.
+  """
+  if not is_macos():
+    return
+  for filepath in APPLE_BAZEL_FILES:
+    print(
+        'Configuring %s file to analyze and build Bazel rules on Apple platforms.'
+        % filepath)
+    existing_filepath = os.path.join(_TF_WORKSPACE_ROOT, filepath + '.apple')
+    renamed_filepath = os.path.join(_TF_WORKSPACE_ROOT, filepath)
+    os.rename(existing_filepath, renamed_filepath)
+
+
 def main():
   global _TF_WORKSPACE_ROOT
   global _TF_BAZELRC
@@ -1565,11 +1608,9 @@ def main():
   # environment variables.
   environ_cp = dict(os.environ)
 
-  check_bazel_version('0.15.0', '0.20.0')
+  check_bazel_version('0.19.0', '0.22.0')
 
   reset_tf_configure_bazelrc()
-  # Explicitly import tools/bazel.rc, this is needed for Bazel 0.19.0 or later
-  write_to_bazelrc('import %workspace%/tools/bazel.rc')
 
   cleanup_makefile()
   setup_python(environ_cp)
@@ -1588,6 +1629,8 @@ def main():
 
   if is_macos():
     environ_cp['TF_NEED_TENSORRT'] = '0'
+  else:
+    environ_cp['TF_CONFIGURE_APPLE_BAZEL_RULES'] = '0'
 
   # The numpy package on ppc64le uses OpenBLAS which has multi-threading
   # issues that lead to incorrect answers.  Set OMP_NUM_THREADS=1 at
@@ -1690,6 +1733,16 @@ def main():
     create_android_ndk_rule(environ_cp)
     create_android_sdk_rule(environ_cp)
 
+  system_specific_test_config(os.environ)
+
+  if get_var(
+      environ_cp, 'TF_CONFIGURE_APPLE_BAZEL_RULES',
+      'Configure Bazel rules for Apple platforms', False,
+      ('Would you like to configure Bazel rules for building on Apple platforms?'
+      ), 'Configuring Bazel rules for Apple platforms.',
+      'Not configuring Bazel rules for Apple platforms.'):
+    configure_apple_bazel_rules()
+
   print('Preconfigured Bazel build configs. You can use any of the below by '
         'adding "--config=<>" to your build command. See .bazelrc for more '
         'details.')
@@ -1698,14 +1751,15 @@ def main():
   config_info_line('gdr', 'Build with GDR support.')
   config_info_line('verbs', 'Build with libverbs support.')
   config_info_line('ngraph', 'Build with Intel nGraph support.')
-  config_info_line('dynamic_kernels',
-                   '(Experimental) Build kernels into separate shared objects.')
+  config_info_line(
+      'dynamic_kernels',
+      '(Experimental) Build kernels into separate shared objects.')
 
   print('Preconfigured Bazel build configs to DISABLE default on features:')
   config_info_line('noaws', 'Disable AWS S3 filesystem support.')
   config_info_line('nogcp', 'Disable GCP support.')
   config_info_line('nohdfs', 'Disable HDFS support.')
-  config_info_line('noignite', 'Disable Apacha Ignite support.')
+  config_info_line('noignite', 'Disable Apache Ignite support.')
   config_info_line('nokafka', 'Disable Apache Kafka support.')
   config_info_line('nonccl', 'Disable NVIDIA NCCL support.')
 
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index fd4b94202aad24a82abef8abd16431f61a8326f0..f53982f1efc9885cc12dcc672ad819c762aca378 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -40,12 +40,16 @@ load(
 
 # @unused
 TENSORFLOW_API_INIT_FILES_V2 = (
-    TENSORFLOW_API_INIT_FILES + get_compat_files(TENSORFLOW_API_INIT_FILES_V1, 1)
+    TENSORFLOW_API_INIT_FILES +
+    get_compat_files(TENSORFLOW_API_INIT_FILES, 2) +
+    get_compat_files(TENSORFLOW_API_INIT_FILES_V1, 1)
 )
 
 # @unused
-TENSORFLOW_API_INIT_FILES_V1_WITH_COMPAT = (
-    TENSORFLOW_API_INIT_FILES_V1 + get_compat_files(TENSORFLOW_API_INIT_FILES_V1, 1)
+TENSORFLOW_API_INIT_FILES_V1 = (
+    TENSORFLOW_API_INIT_FILES_V1 +
+    get_compat_files(TENSORFLOW_API_INIT_FILES, 2) +
+    get_compat_files(TENSORFLOW_API_INIT_FILES_V1, 1)
 )
 
 # Config setting used when building for products
@@ -90,6 +94,12 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
+config_setting(
+    name = "emscripten",
+    values = {"crosstool_top": "//external:android/emscripten"},
+    visibility = ["//visibility:public"],
+)
+
 config_setting(
     name = "raspberry_pi_armeabi",
     values = {
@@ -202,6 +212,12 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
+config_setting(
+    name = "arm",
+    values = {"cpu": "arm"},
+    visibility = ["//visibility:public"],
+)
+
 config_setting(
     name = "freebsd",
     values = {"cpu": "freebsd"},
@@ -267,6 +283,15 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
+# By default, XLA GPU is compiled into tensorflow when building with
+# --config=cuda even when `with_xla_support` is false. The config setting
+# here allows us to override the behavior if needed.
+config_setting(
+    name = "no_xla_deps_in_cuda",
+    define_values = {"no_xla_deps_in_cuda": "true"},
+    visibility = ["//visibility:public"],
+)
+
 config_setting(
     name = "with_gdr_support",
     define_values = {"with_gdr_support": "true"},
@@ -328,6 +353,13 @@ config_setting(
     },
 )
 
+config_setting(
+    name = "using_rocm_hipcc",
+    define_values = {
+        "using_rocm_hipcc": "true",
+    },
+)
+
 config_setting(
     name = "with_mpi_support",
     values = {"define": "with_mpi_support=true"},
@@ -355,17 +387,18 @@ config_setting(
     define_values = {"tf_api_version": "2"},
 )
 
+# This flag is defined for select statements that match both
+# on 'windows' and 'api_version_2'. In this case, bazel requires
+# having a flag which is a superset of these two.
+config_setting(
+    name = "windows_and_api_version_2",
+    define_values = {"tf_api_version": "2"},
+    values = {"cpu": "x64_windows"},
+)
+
 package_group(
     name = "internal",
-    packages = [
-        "-//third_party/tensorflow/python/estimator",
-        "//learning/meta_rank/...",
-        "//tensorflow/...",
-        "//tensorflow_estimator/contrib/...",
-        "//tensorflow_fold/llgtm/...",
-        "//tensorflow_text/...",
-        "//third_party/py/tensor2tensor/...",
-    ],
+    packages = ["//tensorflow/..."],
 )
 
 load(
@@ -429,8 +462,7 @@ tf_cc_shared_object(
         "//tensorflow:darwin": [],
         "//tensorflow:windows": [],
         "//conditions:default": [
-            "-Wl,--version-script",  #  This line must be directly followed by the version_script.lds file
-            "$(location //tensorflow:tf_framework_version_script.lds)",
+            "-Wl,--version-script,$(location //tensorflow:tf_framework_version_script.lds)",
         ],
     }),
     linkstatic = 1,
@@ -464,15 +496,13 @@ tf_cc_shared_object(
     name = "libtensorflow.so",
     linkopts = select({
         "//tensorflow:darwin": [
-            "-Wl,-exported_symbols_list",  # This line must be directly followed by the exported_symbols.lds file
-            "$(location //tensorflow/c:exported_symbols.lds)",
+            "-Wl,-exported_symbols_list,$(location //tensorflow/c:exported_symbols.lds)",
             "-Wl,-install_name,@rpath/libtensorflow.so",
         ],
         "//tensorflow:windows": [],
         "//conditions:default": [
             "-z defs",
-            "-Wl,--version-script",  #  This line must be directly followed by the version_script.lds file
-            "$(location //tensorflow/c:version_script.lds)",
+            "-Wl,--version-script,$(location //tensorflow/c:version_script.lds)",
         ],
     }),
     visibility = ["//visibility:public"],
@@ -490,14 +520,12 @@ tf_cc_shared_object(
     name = "libtensorflow_cc.so",
     linkopts = select({
         "//tensorflow:darwin": [
-            "-Wl,-exported_symbols_list",  # This line must be directly followed by the exported_symbols.lds file
-            "$(location //tensorflow:tf_exported_symbols.lds)",
+            "-Wl,-exported_symbols_list,$(location //tensorflow:tf_exported_symbols.lds)",
         ],
         "//tensorflow:windows": [],
         "//conditions:default": [
             "-z defs",
-            "-Wl,--version-script",  #  This line must be directly followed by the version_script.lds file
-            "$(location //tensorflow:tf_version_script.lds)",
+            "-Wl,--version-script,$(location //tensorflow:tf_version_script.lds)",
         ],
     }),
     visibility = ["//visibility:public"],
@@ -574,13 +602,20 @@ gen_api_init_files(
     name = "tf_python_api_gen_v1",
     srcs = [
         "api_template_v1.__init__.py",
+        "compat_template.__init__.py",
         "compat_template_v1.__init__.py",
     ],
     api_version = 1,
-    compat_api_versions = [1],
-    compat_init_templates = ["compat_template_v1.__init__.py"],
+    compat_api_versions = [
+        1,
+        2,
+    ],
+    compat_init_templates = [
+        "compat_template_v1.__init__.py",
+        "compat_template.__init__.py",
+    ],
     output_dir = "_api/v1/",
-    output_files = TENSORFLOW_API_INIT_FILES_V1_WITH_COMPAT,
+    output_files = TENSORFLOW_API_INIT_FILES_V1,
     output_package = "tensorflow._api.v1",
     root_file_name = "v1.py",
     root_init_template = "api_template_v1.__init__.py",
@@ -590,11 +625,18 @@ gen_api_init_files(
     name = "tf_python_api_gen_v2",
     srcs = [
         "api_template.__init__.py",
+        "compat_template.__init__.py",
         "compat_template_v1.__init__.py",
     ],
     api_version = 2,
-    compat_api_versions = [1],
-    compat_init_templates = ["compat_template_v1.__init__.py"],
+    compat_api_versions = [
+        1,
+        2,
+    ],
+    compat_init_templates = [
+        "compat_template_v1.__init__.py",
+        "compat_template.__init__.py",
+    ],
     output_dir = "_api/v2/",
     output_files = TENSORFLOW_API_INIT_FILES_V2,
     output_package = "tensorflow._api.v2",
@@ -606,9 +648,11 @@ py_library(
     name = "tensorflow_py",
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
-    deps = [
+    deps = select({
+        "api_version_2": [],
+        "//conditions:default": ["//tensorflow/contrib:contrib_py"],
+    }) + [
         ":tensorflow_py_no_contrib",
-        "//tensorflow/contrib:contrib_py",
         "//tensorflow/python/estimator:estimator_py",
     ],
 )
@@ -618,7 +662,11 @@ py_library(
     srcs = select({
         "api_version_2": [":tf_python_api_gen_v2"],
         "//conditions:default": [":tf_python_api_gen_v1"],
-    }) + [":root_init_gen"],
+    }) + [":root_init_gen"] + [
+        "//tensorflow/python/keras/api:keras_python_api_gen",
+        "//tensorflow/python/keras/api:keras_python_api_gen_compat_v1",
+        "//tensorflow/python/keras/api:keras_python_api_gen_compat_v2",
+    ],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = ["//tensorflow/python:no_contrib"],
diff --git a/tensorflow/api_template.__init__.py b/tensorflow/api_template.__init__.py
index d81cf067eb07e88e2b8a86cf5643674235eb3f3b..ddcacfcbe2d4d8b089f10f1a771384dc8c4fd199 100644
--- a/tensorflow/api_template.__init__.py
+++ b/tensorflow/api_template.__init__.py
@@ -18,27 +18,84 @@ from __future__ import absolute_import as _absolute_import
 from __future__ import division as _division
 from __future__ import print_function as _print_function
 
+import distutils as _distutils
+import inspect as _inspect
 import os as _os
-
-# pylint: disable=g-bad-import-order
-from tensorflow.python.tools import component_api_helper as _component_api_helper
-_component_api_helper.package_hook(
-    parent_package_str=__name__,
-    child_package_str=('tensorflow_estimator.python.estimator.api.estimator'))
+import site as _site
+import sys as _sys
 
 # API IMPORTS PLACEHOLDER
 
 # Make sure directory containing top level submodules is in
 # the __path__ so that "from tensorflow.foo import bar" works.
 # We're using bitwise, but there's nothing special about that.
-_tf_api_dir = _os.path.dirname(_os.path.dirname(bitwise.__file__))  # pylint: disable=undefined-variable
-if _tf_api_dir not in __path__:
+_API_MODULE = bitwise  # pylint: disable=undefined-variable
+_current_module = _sys.modules[__name__]
+_tf_api_dir = _os.path.dirname(_os.path.dirname(_API_MODULE.__file__))
+if not hasattr(_current_module, '__path__'):
+  __path__ = [_tf_api_dir]
+elif _tf_api_dir not in __path__:
   __path__.append(_tf_api_dir)
 
+# pylint: disable=g-bad-import-order
+from tensorflow.python.tools import component_api_helper as _component_api_helper
+_component_api_helper.package_hook(
+    parent_package_str=__name__,
+    child_package_str=('tensorboard.summary._tf.summary'),
+    error_msg="Limited tf.summary API due to missing TensorBoard installation")
+_component_api_helper.package_hook(
+    parent_package_str=__name__,
+    child_package_str=(
+        'tensorflow_estimator.python.estimator.api._v2.estimator'))
+
+if not hasattr(_current_module, 'estimator'):
+  _component_api_helper.package_hook(
+      parent_package_str=__name__,
+      child_package_str=(
+          'tensorflow_estimator.python.estimator.api.estimator'))
+_component_api_helper.package_hook(
+    parent_package_str=__name__,
+    child_package_str=('tensorflow.python.keras.api._v2.keras'))
+
 # Enable TF2 behaviors
-from tensorflow.python.compat import compat as _compat  # pylint: disable=g-import-not-at-top
+from tensorflow.python.compat import v2_compat as _compat  # pylint: disable=g-import-not-at-top
 _compat.enable_v2_behavior()
 
+
+# Load all plugin libraries from site-packages/tensorflow-plugins if we are
+# running under pip.
+# TODO(gunan): Enable setting an environment variable to define arbitrary plugin
+# directories.
+# TODO(gunan): Find a better location for this code snippet.
+from tensorflow.python.framework import load_library as _ll
+from tensorflow.python.lib.io import file_io as _fi
+
+# Get sitepackages directories for the python installation.
+_site_packages_dirs = []
+_site_packages_dirs += [_site.USER_SITE]
+_site_packages_dirs += [_p for _p in _sys.path if 'site-packages' in _p]
+if 'getsitepackages' in dir(_site):
+  _site_packages_dirs += _site.getsitepackages()
+
+if 'sysconfig' in dir(_distutils):
+  _site_packages_dirs += [_distutils.sysconfig.get_python_lib()]
+
+_site_packages_dirs = list(set(_site_packages_dirs))
+
+# Find the location of this exact file.
+_current_file_location = _inspect.getfile(_inspect.currentframe())
+
+def _running_from_pip_package():
+  return any(
+      _current_file_location.startswith(dir_) for dir_ in _site_packages_dirs)
+
+if _running_from_pip_package():
+  for s in _site_packages_dirs:
+    # TODO(gunan): Add sanity checks to loaded modules here.
+    plugin_dir = _os.path.join(s, 'tensorflow-plugins')
+    if _fi.file_exists(plugin_dir):
+      _ll.load_library(plugin_dir)
+
 # These symbols appear because we import the python package which
 # in turn imports from tensorflow.core and tensorflow.python. They
 # must come from this module. So python adds these symbols for the
@@ -59,4 +116,11 @@ try:
   del compiler
 except NameError:
   pass
+
+# Add module aliases
+if hasattr(_current_module, 'keras'):
+  losses = keras.losses
+  metrics = keras.metrics
+  optimizers = keras.optimizers
+
 # pylint: enable=undefined-variable
diff --git a/tensorflow/api_template_v1.__init__.py b/tensorflow/api_template_v1.__init__.py
index 65bdb6cb1b5e6fb0656a12b932d767aeacfccd29..5eb25a81b7f765f551bc4f1b7ba99b35dbc6b7bb 100644
--- a/tensorflow/api_template_v1.__init__.py
+++ b/tensorflow/api_template_v1.__init__.py
@@ -18,20 +18,42 @@ from __future__ import absolute_import as _absolute_import
 from __future__ import division as _division
 from __future__ import print_function as _print_function
 
+import distutils as _distutils
+import inspect as _inspect
 import os as _os
+import site as _site
+import sys as _sys
 
 # pylint: disable=g-bad-import-order
 from tensorflow.python import pywrap_tensorflow  # pylint: disable=unused-import
 
+# API IMPORTS PLACEHOLDER
+
 from tensorflow.python.tools import component_api_helper as _component_api_helper
 _component_api_helper.package_hook(
     parent_package_str=__name__,
-    child_package_str=('tensorflow_estimator.python.estimator.api.estimator'))
-
-# API IMPORTS PLACEHOLDER
+    child_package_str=(
+        'tensorflow_estimator.python.estimator.api._v1.estimator'))
 
+_current_module = _sys.modules[__name__]
+if not hasattr(_current_module, 'estimator'):
+  _component_api_helper.package_hook(
+      parent_package_str=__name__,
+      child_package_str=(
+          'tensorflow_estimator.python.estimator.api.estimator'))
+_component_api_helper.package_hook(
+    parent_package_str=__name__,
+    child_package_str=('tensorflow.python.keras.api._v1.keras'))
 from tensorflow.python.util.lazy_loader import LazyLoader  # pylint: disable=g-import-not-at-top
-contrib = LazyLoader('contrib', globals(), 'tensorflow.contrib')
+_CONTRIB_WARNING = """
+WARNING: The TensorFlow contrib module will not be included in TensorFlow 2.0.
+For more information, please see:
+  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
+  * https://github.com/tensorflow/addons
+If you depend on functionality not listed there, please file an issue.
+"""
+contrib = LazyLoader('contrib', globals(), 'tensorflow.contrib',
+                     _CONTRIB_WARNING)
 del LazyLoader
 # The templated code that replaces the placeholder above sometimes
 # sets the __all__ variable. If it does, we have to be sure to add
@@ -40,14 +62,53 @@ if '__all__' in vars():
   vars()['__all__'].append('contrib')
 
 from tensorflow.python.platform import flags  # pylint: disable=g-import-not-at-top
+# The 'app' module will be imported as part of the placeholder section above.
 app.flags = flags  # pylint: disable=undefined-variable
 
+# Also use 'app' module (choice is arbitrary) to derive the API directory below.
+_API_MODULE = app  # pylint: disable=undefined-variable
+
 # Make sure directory containing top level submodules is in
 # the __path__ so that "from tensorflow.foo import bar" works.
-_tf_api_dir = _os.path.dirname(_os.path.dirname(app.__file__))  # pylint: disable=undefined-variable
-if _tf_api_dir not in __path__:
+_tf_api_dir = _os.path.dirname(_os.path.dirname(_API_MODULE.__file__))
+if not hasattr(_current_module, '__path__'):
+  __path__ = [_tf_api_dir]
+elif _tf_api_dir not in __path__:
   __path__.append(_tf_api_dir)
 
+# Load all plugin libraries from site-packages/tensorflow-plugins if we are
+# running under pip.
+# TODO(gunan): Enable setting an environment variable to define arbitrary plugin
+# directories.
+# TODO(gunan): Find a better location for this code snippet.
+from tensorflow.python.framework import load_library as _ll
+from tensorflow.python.lib.io import file_io as _fi
+
+# Get sitepackages directories for the python installation.
+_site_packages_dirs = []
+_site_packages_dirs += [_site.USER_SITE]
+_site_packages_dirs += [_p for _p in _sys.path if 'site-packages' in _p]
+if 'getsitepackages' in dir(_site):
+  _site_packages_dirs += _site.getsitepackages()
+
+if 'sysconfig' in dir(_distutils):
+  _site_packages_dirs += [_distutils.sysconfig.get_python_lib()]
+
+_site_packages_dirs = list(set(_site_packages_dirs))
+
+# Find the location of this exact file.
+_current_file_location = _inspect.getfile(_inspect.currentframe())
+
+def _running_from_pip_package():
+  return any(
+      _current_file_location.startswith(dir_) for dir_ in _site_packages_dirs)
+
+if _running_from_pip_package():
+  for s in _site_packages_dirs:
+    # TODO(gunan): Add sanity checks to loaded modules here.
+    plugin_dir = _os.path.join(s, 'tensorflow-plugins')
+    if _fi.file_exists(plugin_dir):
+      _ll.load_library(plugin_dir)
 
 # These symbols appear because we import the python package which
 # in turn imports from tensorflow.core and tensorflow.python. They
diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD
index 25df970ecab0757f23465ab19e7f45de0c759458..ef7863dc0d5cbd57da30baa6e04278c2a0354b25 100644
--- a/tensorflow/c/BUILD
+++ b/tensorflow/c/BUILD
@@ -67,6 +67,23 @@ tf_cuda_library(
 
 tf_cuda_library(
     name = "c_api",
+    hdrs = ["c_api.h"],
+    copts = tf_copts(),
+    visibility = ["//visibility:public"],
+    deps = [
+        ":c_api_no_xla",
+        ":c_api_internal",
+    ] + select({
+        "//tensorflow:with_xla_support": [
+            "//tensorflow/compiler/tf2xla:xla_compiler",
+            "//tensorflow/compiler/jit",
+        ],
+        "//conditions:default": [],
+    }),
+)
+
+tf_cuda_library(
+    name = "c_api_no_xla",
     srcs = [
         "c_api.cc",
         "c_api_function.cc",
@@ -75,15 +92,13 @@ tf_cuda_library(
         "c_api.h",
     ],
     copts = tf_copts(),
-    visibility = ["//visibility:public"],
-    deps = select({
+    visibility = ["//tensorflow/c:__subpackages__"],
+    deps = [":c_api_internal"] + select({
         "//tensorflow:android": [
-            ":c_api_internal",
             "//tensorflow/core:android_tensorflow_lib_lite",
         ],
         "//conditions:default": [
-            ":c_api_internal",
-            "//tensorflow/cc/saved_model:loader",
+            "//tensorflow/cc/saved_model:loader_lite",
             "//tensorflow/cc:gradients",
             "//tensorflow/cc:ops",
             "//tensorflow/cc:grad_ops",
@@ -97,13 +112,8 @@ tf_cuda_library(
             "//tensorflow/core:lib",
             "//tensorflow/core:lib_internal",
             "//tensorflow/core/distributed_runtime:server_lib",
+            "//tensorflow/core/kernels:logging_ops",
         ],
-    }) + select({
-        "//tensorflow:with_xla_support": [
-            "//tensorflow/compiler/tf2xla:xla_compiler",
-            "//tensorflow/compiler/jit",
-        ],
-        "//conditions:default": [],
     }),
 )
 
@@ -123,13 +133,13 @@ tf_cuda_library(
         "//tensorflow/c/eager:c_api",
         "//tensorflow/c/eager:c_api_internal",
         "//tensorflow/compiler/jit:flags",
-        "//tensorflow/contrib/tpu:all_ops",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_platform",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/common_runtime/eager:attr_builder",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -156,8 +166,8 @@ tf_cuda_library(
     hdrs = ["tf_status_helper.h"],
     visibility = ["//visibility:public"],
     deps = [
-        ":c_api",
         ":c_api_internal",
+        ":c_api_no_xla",
         "//tensorflow/core:lib",
     ],
 )
@@ -190,14 +200,12 @@ tf_cuda_library(
             ":c_api",
             ":tf_status_helper",
             "//tensorflow/core:android_tensorflow_lib_lite",
-            "//tensorflow/core:platform_env",
             "//tensorflow/core:lib",
         ],
         "//conditions:default": [
             ":c_api",
             ":tf_status_helper",
             "//tensorflow/core:framework",
-            "//tensorflow/core:platform_env",
             "//tensorflow/core:lib",
         ],
     }) + [":c_api_internal"],
@@ -215,13 +223,13 @@ tf_cuda_library(
     visibility = ["//visibility:public"],
     deps = select({
         "//tensorflow:android": [
-            ":c_api",
+            ":c_api_no_xla",
             ":c_api_internal",
             ":tf_status_helper",
             "//tensorflow/core:android_tensorflow_lib_lite",
         ],
         "//conditions:default": [
-            ":c_api",
+            ":c_api_no_xla",
             ":c_api_internal",
             ":tf_status_helper",
             "//tensorflow/core:framework",
@@ -251,6 +259,18 @@ tf_cuda_library(
     ],
 )
 
+tf_cc_test(
+    name = "c_test",
+    srcs = ["c_test.c"],
+    extra_copts = ["-std=c11"],
+    deps = [
+        ":c_api",
+        ":c_api_experimental",
+        ":env",
+        ":kernels",
+    ],
+)
+
 tf_cuda_cc_test(
     name = "c_api_test",
     size = "small",
@@ -279,13 +299,23 @@ tf_cuda_cc_test(
         "//tensorflow/cc/saved_model:signature_constants",
         "//tensorflow/cc/saved_model:tag_constants",
         "//tensorflow/compiler/jit",
+        "//tensorflow/core:array_ops_op_lib",
+        "//tensorflow/core:bitwise_ops_op_lib",
+        "//tensorflow/core:control_flow_ops_op_lib",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:direct_session",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
+        "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core:lib",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:nn_ops_op_lib",
+        "//tensorflow/core:no_op_op_lib",
         "//tensorflow/core:proto_text",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:sendrecv_ops_op_lib",
+        "//tensorflow/core:spectral_ops_op_lib",
+        "//tensorflow/core:state_ops_op_lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core/kernels:array",
@@ -309,6 +339,7 @@ tf_cc_test(
     deps = [
         ":c_api",
         ":c_api_experimental",
+        ":c_api_internal",
         ":c_test_util",
         "//tensorflow/c/eager:c_api",
         "//tensorflow/c/eager:c_api_test_util",
@@ -325,6 +356,7 @@ tf_cc_test(
     srcs = ["c_api_function_test.cc"],
     deps = [
         ":c_api",
+        ":c_api_internal",
         ":c_test_util",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc
index 94d18eb8b04e3534be547aca5cfbb32da40ffbf6..245d7ba2b186895532953aa61ebfc3fc6bf635a7 100644
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/cc/ops/while_loop.h"
 #include "tensorflow/cc/saved_model/loader.h"
 #include "tensorflow/core/framework/op_gen_lib.h"
+#include "tensorflow/core/kernels/logging_ops.h"
 #endif
 #include "tensorflow/c/c_api_internal.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
@@ -257,6 +258,74 @@ int64_t TF_Dim(const TF_Tensor* t, int dim_index) {
 size_t TF_TensorByteSize(const TF_Tensor* t) { return t->buffer->size(); }
 void* TF_TensorData(const TF_Tensor* t) { return t->buffer->data(); }
 
+int64_t TF_TensorElementCount(const TF_Tensor* t) {
+  int64_t result = 1;
+  int rank = TF_NumDims(t);
+  for (int dim = 0; dim < rank; ++dim) {
+    result *= TF_Dim(t, dim);
+  }
+  return result;
+}
+
+// Returns the number of elements that would be present in a tensor with the
+// given shape.
+static int64_t ShapeNumElements(const int64_t* dims, int num_dims) {
+  int64_t result = 1;
+  for (int dim = 0; dim < num_dims; ++dim) {
+    result *= dims[dim];
+  }
+  return result;
+}
+
+static void UnrefIfNonNull(::tensorflow::TensorBuffer* buf) {
+  if (buf != nullptr) {
+    buf->Unref();
+  }
+}
+
+static void RefIfNonNull(::tensorflow::TensorBuffer* buf) {
+  if (buf != nullptr) {
+    buf->Ref();
+  }
+}
+
+void TF_TensorBitcastFrom(const TF_Tensor* from, TF_DataType type,
+                          TF_Tensor* to, const int64_t* new_dims,
+                          int num_new_dims, TF_Status* status) {
+  TF_SetStatus(status, TF_OK, "");
+  size_t in_size = TF_DataTypeSize(TF_TensorType(from));
+  if (in_size == 0) {
+    TF_SetStatus(status, TF_INVALID_ARGUMENT,
+                 "input tensor has a zero-sized data type");
+    return;
+  }
+  size_t out_size = TF_DataTypeSize(type);
+  if (out_size == 0) {
+    TF_SetStatus(status, TF_INVALID_ARGUMENT,
+                 "output tensor has a zero-sized data type");
+    return;
+  }
+
+  if (ShapeNumElements(new_dims, num_new_dims) * out_size !=
+      TF_TensorElementCount(from) * in_size) {
+    TF_SetStatus(status, TF_INVALID_ARGUMENT,
+                 "input tensor is not compatible with output shape");
+    return;
+  }
+
+  tensorflow::TensorShapeProto p;
+  for (int i = 0; i < num_new_dims; ++i) {
+    p.add_dim()->set_size(new_dims[i]);
+  }
+  to->shape = tensorflow::TensorShape(p);
+  to->dtype = type;
+  if (to->buffer != from->buffer) {
+    UnrefIfNonNull(to->buffer);
+    to->buffer = from->buffer;
+    RefIfNonNull(to->buffer);
+  }
+}
+
 // --------------------------------------------------------------------------
 size_t TF_StringEncode(const char* src, size_t src_len, char* dst,
                        size_t dst_len, TF_Status* status) {
@@ -488,6 +557,7 @@ static TF_Tensor* EmptyTensor(TF_DataType dtype, const TensorShape& shape) {
 // Non-static for testing.
 TF_Tensor* TF_TensorFromTensor(const tensorflow::Tensor& src,
                                TF_Status* status) {
+  TF_SetStatus(status, TF_OK, "");
   if (!src.IsInitialized()) {
     status->status = FailedPrecondition(
         "attempt to use a tensor with an uninitialized value");
@@ -571,7 +641,7 @@ TF_Tensor* TF_TensorFromTensor(const tensorflow::Tensor& src,
                       dimvec.size(), base, size, DeleteArray, base);
 }
 
-Status MessageToBuffer(const tensorflow::protobuf::Message& in,
+Status MessageToBuffer(const tensorflow::protobuf::MessageLite& in,
                        TF_Buffer* out) {
   if (out->data != nullptr) {
     return InvalidArgument("Passing non-empty TF_Buffer is invalid.");
@@ -1241,6 +1311,13 @@ void TF_SetAttrTypeList(TF_OperationDescription* desc, const char* attr_name,
                      reinterpret_cast<const DataType*>(values), num_values));
 }
 
+void TF_SetAttrPlaceholder(TF_OperationDescription* desc, const char* attr_name,
+                           const char* placeholder) {
+  tensorflow::AttrValue attr_value;
+  attr_value.set_placeholder(placeholder);
+  desc->node_builder.Attr(attr_name, attr_value);
+}
+
 void TF_SetAttrFuncName(TF_OperationDescription* desc, const char* attr_name,
                         const char* value, size_t length) {
   tensorflow::NameAttrList func_name;
@@ -2880,6 +2957,16 @@ const char* TF_ServerTarget(TF_Server* server) {
 #endif
 }
 
-void TF_DeleteServer(TF_Server* server) { delete server; }
+void TF_DeleteServer(TF_Server* server) {
+#ifndef __ANDROID__
+  delete server;
+#endif
+}
+
+void TF_RegisterLogListener(void (*listener)(const char*)) {
+#ifndef __ANDROID__
+  tensorflow::logging::RegisterListener(listener);
+#endif
+}
 
 }  // end extern "C"
diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h
index c7abba85521fccec07983cd5ab4f94a8368d6181..051de3a7dc0f8c630b6c81d2cfa960e5279c93c0 100644
--- a/tensorflow/c/c_api.h
+++ b/tensorflow/c/c_api.h
@@ -272,6 +272,39 @@ TF_CAPI_EXPORT extern size_t TF_TensorByteSize(const TF_Tensor*);
 // Return a pointer to the underlying data buffer.
 TF_CAPI_EXPORT extern void* TF_TensorData(const TF_Tensor*);
 
+// Returns the number of elements in the tensor.
+TF_CAPI_EXPORT extern int64_t TF_TensorElementCount(const TF_Tensor* tensor);
+
+// Copy the internal data representation of `from` to `to`. `new_dims` and
+// `num_new_dims` specify the new shape of the `to` tensor, `type` specifies its
+// data type. On success, *status is set to TF_OK and the two tensors share the
+// same data buffer.
+//
+// This call requires that the `from` tensor and the given type and shape (dims
+// and num_dims) are "compatible" (i.e. they occupy the same number of bytes).
+// Specifically, given from_type_size = TF_DataTypeSize(TF_TensorType(from)):
+//
+// ShapeElementCount(dims, num_dims) * TF_DataTypeSize(type)
+//
+// must equal
+//
+// TF_TensorElementCount(from) * from_type_size
+//
+// where TF_ShapeElementCount would be the number of elements in a tensor with
+// the given shape.
+//
+// In addition, this function requires:
+//   * TF_DataTypeSize(TF_TensorType(from)) != 0
+//   * TF_DataTypeSize(type) != 0
+//
+// If any of the requirements are not met, *status is set to
+// TF_INVALID_ARGUMENT.
+TF_CAPI_EXPORT extern void TF_TensorBitcastFrom(const TF_Tensor* from,
+                                                TF_DataType type, TF_Tensor* to,
+                                                const int64_t* new_dims,
+                                                int num_new_dims,
+                                                TF_Status* status);
+
 // --------------------------------------------------------------------------
 // Encode the string `src` (`src_len` bytes long) into `dst` in the format
 // required by TF_STRING tensors. Does not write to memory more than `dst_len`
@@ -516,6 +549,10 @@ TF_CAPI_EXPORT extern void TF_SetAttrTypeList(TF_OperationDescription* desc,
                                               const char* attr_name,
                                               const TF_DataType* values,
                                               int num_values);
+TF_CAPI_EXPORT extern void TF_SetAttrPlaceholder(TF_OperationDescription* desc,
+                                                 const char* attr_name,
+                                                 const char* placeholder);
+
 // Set a 'func' attribute to the specified name.
 // `value` must point to a string of length `length` bytes.
 TF_CAPI_EXPORT extern void TF_SetAttrFuncName(TF_OperationDescription* desc,
@@ -1277,6 +1314,28 @@ TF_CAPI_EXPORT extern TF_Function* TF_GraphToFunction(
     int noutputs, const TF_Output* outputs, const char* const* output_names,
     const TF_FunctionOptions* opts, const char* description, TF_Status* status);
 
+// Similar to TF_GraphToFunction but allows specifying control outputs of the
+// function.
+//
+//  The arguments of TF_GraphToFunction have the same meaning, but the new
+//  arguments are as follows:
+//
+//    ncontrol_outputs: Number of control outputs of the function.
+//    control_outputs: vector of TF_Operation objects to be marked as control
+//      outputs of the function. Operations marked as control outputs are
+//      guaranteed to execute.
+//    control_output_names: Optional. If not nullptr, vector of strings, one
+//      per control output, with their names to be added to the function's
+//      OpDef.
+TF_CAPI_EXPORT extern TF_Function* TF_GraphToFunctionWithControlOutputs(
+    const TF_Graph* fn_body, const char* fn_name,
+    unsigned char append_hash_to_fn_name, int num_opers,
+    const TF_Operation* const* opers, int ninputs, const TF_Output* inputs,
+    int noutputs, const TF_Output* outputs, const char* const* output_names,
+    int ncontrol_outputs, const TF_Operation* const* control_outputs,
+    const char* const* control_output_names, const TF_FunctionOptions* opts,
+    const char* description, TF_Status* status);
+
 // Returns the name of the graph function.
 // The return value points to memory that is only usable until the next
 // mutation to *func.
@@ -1710,6 +1769,14 @@ TF_CAPI_EXPORT extern const char* TF_ServerTarget(TF_Server* server);
 // it will be stopped and joined.
 TF_CAPI_EXPORT extern void TF_DeleteServer(TF_Server* server);
 
+// Register a listener method that processes printed messages.
+//
+// If any listeners are registered, the print operator will call all listeners
+// with the printed messages and immediately return without writing to the
+// logs.
+TF_CAPI_EXPORT extern void TF_RegisterLogListener(
+    void (*listener)(const char*));
+
 #ifdef __cplusplus
 } /* end extern "C" */
 #endif
diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc
index 38e29aa74a90f4e85d1369b6928a5a58c531b2da..7ff4084decc686b067226ecaecf2af29d51d42f2 100644
--- a/tensorflow/c/c_api_experimental.cc
+++ b/tensorflow/c/c_api_experimental.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/c/c_api_experimental.h"
 
+#include "absl/strings/substitute.h"
 #include "tensorflow/c/c_api.h"
 #include "tensorflow/c/c_api_internal.h"
 #include "tensorflow/c/eager/c_api.h"
@@ -66,7 +67,8 @@ void TF_EnableXLACompilation(TF_SessionOptions* options, unsigned char enable) {
 }
 
 TF_Buffer* TF_CreateConfig(unsigned char enable_xla_compilation,
-                           unsigned char gpu_memory_allow_growth) {
+                           unsigned char gpu_memory_allow_growth,
+                           unsigned int num_cpu_devices) {
   tensorflow::ConfigProto config;
   auto* optimizer_options =
       config.mutable_graph_options()->mutable_optimizer_options();
@@ -87,6 +89,8 @@ TF_Buffer* TF_CreateConfig(unsigned char enable_xla_compilation,
   auto* gpu_options = config.mutable_gpu_options();
   gpu_options->set_allow_growth(gpu_memory_allow_growth);
 
+  (*config.mutable_device_count())["CPU"] = num_cpu_devices;
+
   // TODO(b/113217601): This is needed for EagerContext::runner_ to use a
   // threadpool, so that we avoid the possibility of running the runner_ in the
   // threadpool of GPU event mgr, as that can trigger more callbacks to be
@@ -125,6 +129,14 @@ const char* TF_GraphDebugString(TF_Graph* graph, size_t* len) {
   return ret;
 }
 
+char* TF_FunctionDebugString(TF_Function* func, size_t* len) {
+  const auto& debug_str = func->fdef.DebugString();
+  *len = debug_str.size();
+  char* ret = static_cast<char*>(malloc(*len + 1));
+  memcpy(ret, debug_str.c_str(), *len + 1);
+  return ret;
+}
+
 // On success, returns a set of TF_Function instances from `text_proto` of
 // GraphDef type. These functions must be deleted by calling TF_DeleteFunction.
 //
@@ -8535,8 +8547,9 @@ TFE_Context* TFE_CreateContextFromSession(TF_Session* session,
 
   // Reduce GPU memory allocation, and set appropriate config options for TFE
   // context.
-  auto* config =
-      TF_CreateConfig(/*xla*/ false, /* gpu_memory_allow_growth */ true);
+  auto* config = TF_CreateConfig(
+      /*xla*/ false, /* gpu_memory_allow_growth */ true, /* num_cpu_devices */
+      10);
   TFE_ContextOptionsSetConfig(opts, config->data, config->length, status);
   if (!status->status.ok()) {
     CHECK(!config);
@@ -8733,6 +8746,12 @@ static void CheckOk(TF_Status* status) {
 
 void TFE_TensorHandlePrintDebugString(TFE_TensorHandle* handle) {
   auto* status = TF_NewStatus();
+  if (!TFE_TensorHandleIsConcrete(handle)) {
+    VLOG(1) << "Symbolic tensor: " << handle;
+    TF_DeleteStatus(status);
+    return;
+  }
+
   TF_Tensor* t = TFE_TensorHandleResolve(handle, status);
   CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
 
@@ -8744,6 +8763,11 @@ void TFE_TensorHandlePrintDebugString(TFE_TensorHandle* handle) {
   TF_DeleteStatus(status);
 }
 
+void TFE_OpPrintDebugString(TFE_Op* op) {
+  VLOG(1) << "TFE_OpPrintDebugString() over " << op;
+  LOG(INFO) << op->operation.DebugString();
+}
+
 struct TFE_ExecuteOpNotification {
   TFE_ExecuteOpNotification() : status(TF_NewStatus(), TF_DeleteStatus) {}
   tensorflow::Notification n;
@@ -8886,3 +8910,240 @@ TFE_TensorHandle* TFE_NewTensorHandleFromScalar(TF_DataType dtype_arg,
   std::memcpy(tensorflow::TensorCApi::Buffer(tensor)->data(), data, len);
   return new TFE_TensorHandle(tensor, nullptr, nullptr);
 }
+
+namespace {
+tensorflow::Status EnableCollectiveOps(const tensorflow::ServerDef& server_def,
+                                       TFE_Context* ctx) {
+  // We don't use the TF_RETURN_IF_ERROR macro directly since that destroys the
+  // server object (which currently CHECK-fails) and we miss the error, instead,
+  // we log the error, and then return to allow the user to see the error
+  // message.
+#define LOG_AND_RETURN_IF_ERROR(...)                    \
+  do {                                                  \
+    const ::tensorflow::Status _status = (__VA_ARGS__); \
+    if (TF_PREDICT_FALSE(!_status.ok())) {              \
+      LOG(ERROR) << _status.error_message();            \
+      return _status;                                   \
+    }                                                   \
+  } while (0);
+
+  std::unique_ptr<tensorflow::ServerInterface> server;
+  LOG_AND_RETURN_IF_ERROR(tensorflow::NewServer(server_def, &server));
+
+  tensorflow::GrpcServer* grpc_server =
+      dynamic_cast<tensorflow::GrpcServer*>(server.get());
+  if (grpc_server == nullptr) {
+    LOG_AND_RETURN_IF_ERROR(tensorflow::errors::Internal(
+        "Currently, TFE_NewContext only supports tensorflow::GrpcServer."));
+  }
+
+  LOG_AND_RETURN_IF_ERROR(grpc_server->Start());
+
+  LOG_AND_RETURN_IF_ERROR(ctx->context.StoreCollectiveOpsServer(
+      std::move(server), grpc_server->worker_env()->device_mgr,
+      grpc_server->worker_env()->collective_executor_mgr));
+
+  return tensorflow::Status::OK();
+#undef LOG_AND_RETURN_IF_ERROR
+}
+}  // namespace
+
+// Set server_def on the context, possibly updating it.
+TF_CAPI_EXPORT extern void TFE_EnableCollectiveOps(TFE_Context* ctx,
+                                                   const void* proto,
+                                                   size_t proto_len,
+                                                   TF_Status* status) {
+  tensorflow::ServerDef server_def;
+  if (!server_def.ParseFromArray(proto, proto_len)) {
+    status->status = tensorflow::errors::InvalidArgument(
+        "Invalid tensorflow.ServerDef protocol buffer");
+    return;
+  }
+  status->status = EnableCollectiveOps(server_def, ctx);
+}
+
+std::string tensorflow::getTF_OutputDebugString(TF_Output node) {
+  return absl::Substitute("TF_Output($0, $1)", node.oper, node.index);
+}
+
+using tensorflow::getTF_OutputDebugString;
+
+TFE_TensorHandle* TFE_NewTensorHandleFromTFOutput(TF_Output t,
+                                                  TF_DataType dtype) {
+  auto ret = new TFE_TensorHandle(t, dtype);
+  VLOG(1) << "Storing TFOutput " << getTF_OutputDebugString(t)
+          << " into tensor handle " << ret << " with internal handle "
+          << ret->handle;
+  return ret;
+}
+
+unsigned char TFE_TensorHandleIsConcrete(TFE_TensorHandle* handle) {
+  assert(handle->handle != nullptr);
+  return handle->handle->getSymbolicTensor() == nullptr;
+}
+
+TF_Output TFE_GetTFOutputFromTensorHandle(TFE_TensorHandle* handle,
+                                          TF_Status* status) {
+  if (TFE_TensorHandleIsConcrete(handle)) {
+    status->status =
+        tensorflow::errors::Internal("Not a symbolic tensor: ", handle);
+    return TF_Output{nullptr, -1};
+  }
+
+  auto* sym_tensor = handle->handle->getSymbolicTensor();
+  CHECK(sym_tensor != nullptr);
+  auto ret = TF_Output{sym_tensor->oper, sym_tensor->index};
+  VLOG(1) << "Retrieving " << getTF_OutputDebugString(ret)
+          << " from tensor handle " << handle;
+  CHECK_GE(sym_tensor->index, 0);
+  return ret;
+}
+
+TFE_TraceContext* TFE_NewTraceContext(TF_Graph* graph) {
+  return new TFE_TraceContext(graph);
+}
+
+void TFE_DeleteTraceContext(TFE_TraceContext* trace_ctx) { delete trace_ctx; }
+
+// If `handle` is already symbolic, return it. Otherwise map it to a new
+// symbolic tensor (a PlaceHolder op) and return that.
+static TF_Output getOrCreateSymbolicTensor(TFE_TraceContext* trace_ctx,
+                                           tensorflow::TensorHandle* handle,
+                                           TF_Status* status) {
+  VLOG(1) << "Getting symbolic tensor for input tensor handle " << handle
+          << ": " << handle->DebugString();
+
+  auto* sym_tensor = handle->getSymbolicTensor();
+  if (sym_tensor != nullptr) {
+    auto ret = TF_Output{sym_tensor->oper, sym_tensor->index};
+    VLOG(1) << "This handle is a symbolic tensor " << sym_tensor << ": "
+            << getTF_OutputDebugString(ret);
+    return ret;
+  }
+
+  auto find_it = trace_ctx->input_tensor_map.find(handle);
+  if (find_it != trace_ctx->input_tensor_map.end()) {
+    VLOG(1) << "There exists a map entry from this concrete tensor to: "
+            << getTF_OutputDebugString(find_it->second);
+    return find_it->second;
+  }
+
+  auto node_name = tensorflow::strings::StrCat("additional_input_",
+                                               trace_ctx->node_counter++);
+  VLOG(1) << "Adding a place holder node named " << node_name;
+  auto* desc =
+      TF_NewOperation(trace_ctx->graph, "Placeholder", node_name.c_str());
+  TF_SetAttrType(desc, "dtype",
+                 static_cast<TF_DataType>(handle->dtype) /*TF_FLOAT*/);
+  auto* result = TF_FinishOperation(desc, status);
+  if (!status->status.ok()) {
+    return TF_Output{nullptr, -1};
+  }
+
+  auto ret = TF_Output{result, 0};
+  VLOG(1) << "Creating a new map entry to map to: "
+          << getTF_OutputDebugString(ret);
+  trace_ctx->input_tensor_map[handle] = ret;
+  // `handle` could be destroyed before it's read from `input_tensor_map` (say
+  // during a subsequent TFE_FinalizeInputTensorsFromTraceContext() call), so we
+  // increment its ref count to extend its life span to that of `trace_ctx`.
+  handle->Ref();
+  VLOG(1) << "Ref count for handle " << handle
+          << " is 1?: " << handle->RefCountIsOne();
+  return ret;
+}
+
+TF_Operation* TFE_AddEagerOpToGraph(TFE_Op* op, TFE_TraceContext* trace_ctx,
+                                    TFE_TensorHandle** retvals,
+                                    int* num_retvals, TF_Status* status) {
+  VLOG(1) << "Calling TFE_AddEagerOpToGraph() with op " << op << ": "
+          << op->operation.DebugString();
+
+  const auto& op_type = op->operation.Name();
+  auto op_name =
+      tensorflow::strings::StrCat(op_type, "_", trace_ctx->node_counter++);
+  auto* desc =
+      TF_NewOperation(trace_ctx->graph, op_type.c_str(), op_name.c_str());
+
+  VLOG(1) << "Adding attrs.";
+  tensorflow::AttrValueMap attrs;
+  op->operation.Attrs().FillAttrValueMap(&attrs);
+  for (const auto& attr : attrs) {
+    desc->node_builder.Attr(attr.first, attr.second);
+  }
+
+  VLOG(1) << "Adding inputs.";
+  const auto& inputs = op->operation.Inputs();
+  size_t inputIndex = 0;
+  const tensorflow::OpDef& op_def = desc->node_builder.op_def();
+  for (const tensorflow::OpDef::ArgDef& input_arg : op_def.input_arg()) {
+    // TODO(bgogul): Add support for number attributes.
+    DCHECK(input_arg.number_attr().empty())
+        << "Number attributes is not implemented yet.";
+    if (input_arg.type_list_attr().empty()) {
+      auto symbolic_input =
+          getOrCreateSymbolicTensor(trace_ctx, inputs[inputIndex++], status);
+      if (!status->status.ok()) return nullptr;
+      TF_AddInput(desc, symbolic_input);
+      continue;
+    }
+    const std::string& type_list_attr = input_arg.type_list_attr();
+    const auto& attr_value = attrs[type_list_attr];
+    DCHECK(attr_value.value_case() == tensorflow::AttrValue::kList)
+        << "Type list attribute should be a list!";
+    std::vector<TF_Output> list_inputs(attr_value.list().type_size());
+    for (TF_Output& list_input : list_inputs) {
+      list_input =
+          getOrCreateSymbolicTensor(trace_ctx, inputs[inputIndex++], status);
+      if (!status->status.ok()) return nullptr;
+    }
+    TF_AddInputList(desc, list_inputs.data(), list_inputs.size());
+  }
+
+  auto* graph_op = TF_FinishOperation(desc, status);
+  if (!status->status.ok()) return nullptr;
+
+  VLOG(1) << "Op finalized; setting return tensors.";
+  *num_retvals = TF_OperationNumOutputs(graph_op);
+  VLOG(1) << "This op has " << *num_retvals << " outputs.";
+  for (int i = 0; i < *num_retvals; ++i) {
+    auto output = TF_Output{graph_op, i};
+    auto dtype = TF_OperationOutputType(output);
+    retvals[i] = TFE_NewTensorHandleFromTFOutput(output, dtype);
+  }
+  return graph_op;
+}
+
+int TFE_FinalizeInputTensorsFromTraceContext(TFE_TraceContext* trace_ctx) {
+  if (trace_ctx->input_tensors == nullptr) {
+    trace_ctx->input_tensors =
+        new std::vector<std::pair<tensorflow::TensorHandle*, TF_Output>>();
+    trace_ctx->input_tensors->reserve(trace_ctx->input_tensor_map.size());
+
+    for (auto input : trace_ctx->input_tensor_map) {
+      trace_ctx->input_tensors->emplace_back(input.first, input.second);
+    }
+  }
+  return trace_ctx->input_tensor_map.size();
+}
+
+TF_Output TFE_GetInputGraphNodeFromTraceContext(TFE_TraceContext* trace_ctx,
+                                                unsigned int idx) {
+  CHECK(trace_ctx->input_tensors != nullptr);
+  CHECK(trace_ctx->input_tensors->size() > idx);
+  return trace_ctx->input_tensors->at(idx).second;
+}
+
+TFE_TensorHandle* TFE_ConsumeInputConcreteTensorFromTraceContext(
+    TFE_TraceContext* trace_ctx, unsigned int idx) {
+  CHECK(trace_ctx->input_tensors != nullptr);
+  CHECK(trace_ctx->input_tensors->size() > idx);
+  auto* handle = trace_ctx->input_tensors->at(idx).first;
+  VLOG(1) << "Ref count for internal handle " << handle
+          << " is 1?: " << handle->RefCountIsOne();
+  handle->Ref();
+  auto* ret = new TFE_TensorHandle(handle);
+  VLOG(1) << "Returning a new tensor handle " << ret << ": "
+          << handle->DebugString();
+  return ret;
+}
diff --git a/tensorflow/c/c_api_experimental.h b/tensorflow/c/c_api_experimental.h
index 3e3a485eb763b871b0551414c4ef04746b2ed9a3..8d1a8b82fbaf9901b6d9aecf6d092ae298c8dba3 100644
--- a/tensorflow/c/c_api_experimental.h
+++ b/tensorflow/c/c_api_experimental.h
@@ -67,9 +67,10 @@ TF_CAPI_EXPORT extern void TF_EnableXLACompilation(TF_SessionOptions* options,
 // a) ConfigProto.optimizer_options.global_jit_level is set to to ON_1 if
 // `enable_xla_compilation` is non-zero, and OFF otherwise.
 // b) ConfigProto.gpu_options.allow_growth is set to `gpu_memory_allow_growth`.
+// c) ConfigProto.device_count is set to `num_cpu_devices`.
 TF_CAPI_EXPORT extern TF_Buffer* TF_CreateConfig(
-    unsigned char enable_xla_compilation,
-    unsigned char gpu_memory_allow_growth);
+    unsigned char enable_xla_compilation, unsigned char gpu_memory_allow_growth,
+    unsigned int num_cpu_devices);
 
 // Create a serialized tensorflow.RunOptions proto, where RunOptions.trace_level
 // is set to FULL_TRACE if `enable_full_trace` is non-zero, and NO_TRACE
@@ -83,6 +84,15 @@ TF_CAPI_EXPORT extern TF_Buffer* TF_CreateRunOptions(
 TF_CAPI_EXPORT extern const char* TF_GraphDebugString(TF_Graph* graph,
                                                       size_t* len);
 
+// Returns the function content in a human-readable format, with length set in
+// `len`. The format is subject to change in the future.
+// The returned string is heap-allocated, and caller should call free() on it.
+//
+// Do not return const char*, because some foreign language binding
+// (e.g. swift) cannot then call free() on the returned pointer.
+TF_CAPI_EXPORT extern char* TF_FunctionDebugString(TF_Function* func,
+                                                   size_t* len);
+
 // Creates a stack of data set + iterator nodes, currently hard-coded to return
 // a sequence of 3 float values <42.0, 43.0, 44.0> over 3 calls. On success,
 // returns the IteratorGetNext node, which caller can run or feed into an node.
@@ -180,6 +190,8 @@ TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_DequeueVariantTensor(
 TF_CAPI_EXPORT extern void TFE_TensorHandlePrintDebugString(
     TFE_TensorHandle* handle);
 
+TF_CAPI_EXPORT extern void TFE_OpPrintDebugString(TFE_Op* op);
+
 typedef struct TFE_ExecuteOpNotification TFE_ExecuteOpNotification;
 
 // Allows invoking a kernel asynchronously, and explicitly returns a
@@ -246,6 +258,62 @@ TF_CAPI_EXPORT int TF_PickUnusedPortOrDie(void);
 TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_NewTensorHandleFromScalar(
     TF_DataType dtype, void* scalar, size_t len);
 
+// Specify the server_def that enables collective ops.
+// This is different to the above function in that it doesn't create remote
+// contexts, and remotely executing ops is not possible. It just enables
+// communication for collective ops.
+TF_CAPI_EXPORT extern void TFE_EnableCollectiveOps(TFE_Context* ctx,
+                                                   const void* proto,
+                                                   size_t proto_len,
+                                                   TF_Status* status);
+
+// Create a symbolic tensor from the input graph node.
+TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_NewTensorHandleFromTFOutput(
+    TF_Output t, TF_DataType data_type);
+
+// Returns 0 if the input tensor handle represents a symbolic tensor (i.e., a
+// graph node). Otherwise returns non-0.
+TF_CAPI_EXPORT extern unsigned char TFE_TensorHandleIsConcrete(
+    TFE_TensorHandle* handle);
+
+// If `handle` is a symbolic tensor, return the corresponding graph node
+// represented by TF_Output. Otherwise, return an error status.
+TF_CAPI_EXPORT extern TF_Output TFE_GetTFOutputFromTensorHandle(
+    TFE_TensorHandle* handle, TF_Status* status);
+
+typedef struct TFE_TraceContext TFE_TraceContext;
+
+// A trace context contains a trace graph, to which TFE_AddEagerOpToGraph()
+// calls add graph nodes as a way to symbolically execute the eager ops.
+//
+// It also contains a hash map from concrete input tensors to symbolic
+// tensors. That map will be used to create input tensors to the trace graph.
+TF_CAPI_EXPORT extern TFE_TraceContext* TFE_NewTraceContext(TF_Graph* graph);
+
+TF_CAPI_EXPORT extern void TFE_DeleteTraceContext(TFE_TraceContext* trace_ctx);
+
+// Symbolically executes `op`, by adding a corresponding node to the graph
+// associated with `trace_ctx`. This graph node outputs a set of symbolic
+// tensors in `retvals` and `num_retvals`. Returns the corresponding graph
+// operation on success, otherwise returns nullptr.
+TF_CAPI_EXPORT extern TF_Operation* TFE_AddEagerOpToGraph(
+    TFE_Op* op, TFE_TraceContext* trace_ctx, TFE_TensorHandle** retvals,
+    int* num_retvals, TF_Status* status);
+
+// Finalizes the trace graph and its inputs, and returns the number of inputs.
+// After this call, the next two APIs can be called to iterate over the input
+// tensors.
+TF_CAPI_EXPORT extern int TFE_FinalizeInputTensorsFromTraceContext(
+    TFE_TraceContext* trace_ctx);
+
+TF_CAPI_EXPORT extern TF_Output TFE_GetInputGraphNodeFromTraceContext(
+    TFE_TraceContext* trace_ctx, unsigned int idx);
+
+// Each input tensor should be consumed at most once.
+TF_CAPI_EXPORT extern TFE_TensorHandle*
+TFE_ConsumeInputConcreteTensorFromTraceContext(TFE_TraceContext* trace_ctx,
+                                               unsigned int idx);
+
 #ifdef __cplusplus
 } /* end extern "C" */
 #endif
diff --git a/tensorflow/c/c_api_experimental_test.cc b/tensorflow/c/c_api_experimental_test.cc
index daa7701b7fe7e8ce757b6504329cf6434ad39778..c54021a7517ebbdd00405cbfa9cee8f3f6616cca 100644
--- a/tensorflow/c/c_api_experimental_test.cc
+++ b/tensorflow/c/c_api_experimental_test.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/c/c_api_experimental.h"
+#include "tensorflow/c/c_api_internal.h"
 #include "tensorflow/c/c_test_util.h"
 #include "tensorflow/c/eager/c_api.h"
 #include "tensorflow/c/eager/c_api_test_util.h"
@@ -296,5 +297,178 @@ TEST(CAPI_EXPERIMENTAL, TFE_ExecuteOpInNewThreadTest_Blocking) {
   TF_DeleteStatus(status);
 }
 
+TEST(CAPI_EXPERIMENTAL, SymbolicTensor) {
+  TF_Status* status = TF_NewStatus();
+  auto node = TF_Output{nullptr, 1};
+  auto* sym_handle = TFE_NewTensorHandleFromTFOutput(node, TF_FLOAT);
+  TFE_TensorHandlePrintDebugString(sym_handle);
+  CHECK_EQ(TFE_TensorHandleDataType(sym_handle), TF_FLOAT);
+  ASSERT_FALSE(TFE_TensorHandleIsConcrete(sym_handle));
+  auto same_node = TFE_GetTFOutputFromTensorHandle(sym_handle, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  ASSERT_EQ(same_node.oper, node.oper);
+  ASSERT_EQ(same_node.index, node.index);
+  TFE_DeleteTensorHandle(sym_handle);
+
+  TFE_TensorHandle* m = TestMatrixTensorHandle();
+  ASSERT_TRUE(TFE_TensorHandleIsConcrete(m));
+  (void)TFE_GetTFOutputFromTensorHandle(m, status);
+  CHECK_EQ(TF_INTERNAL, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteTensorHandle(m);
+
+  TF_DeleteStatus(status);
+}
+
+class AddEagerOpToGraphTest : public ::testing::Test {
+ protected:
+  AddEagerOpToGraphTest()
+      : status_(TF_NewStatus()),
+        eager_ctx_(nullptr),
+        graph_(TF_NewGraph()),
+        trace_ctx_(TFE_NewTraceContext(graph_)) {
+    TFE_ContextOptions* opts = TFE_NewContextOptions();
+    CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_);
+    eager_ctx_ = TFE_NewContext(opts, status_);
+    CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_);
+    TFE_DeleteContextOptions(opts);
+  }
+
+  ~AddEagerOpToGraphTest() override {
+    TFE_DeleteTraceContext(trace_ctx_);
+    TF_DeleteGraph(graph_);
+    TFE_DeleteContext(eager_ctx_);
+    TF_DeleteStatus(status_);
+  }
+
+  template <typename Callable>
+  void AddEagerOpToGraphAndCheck(TFE_Op* op, Callable checker) {
+    TFE_TensorHandle* retvals[5];
+    int num_retvals = 5;
+    // Symbolically execute this op, which adds a graph node to `trace_ctx_`.
+    TF_Operation* graph_op =
+        TFE_AddEagerOpToGraph(op, trace_ctx_, retvals, &num_retvals, status_);
+    CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_);
+    CHECK_NOTNULL(graph_op);
+    // Check the expectations.
+    checker(graph_op);
+    for (int i = 0; i < num_retvals; ++i) {
+      TFE_DeleteTensorHandle(retvals[i]);
+    }
+  }
+
+  TF_Status* status_;
+  TFE_Context* eager_ctx_;
+  TF_Graph* graph_;
+  TFE_TraceContext* trace_ctx_;
+};
+
+TEST_F(AddEagerOpToGraphTest, DebugPrintAndSymbolicExecution) {
+  TFE_TensorHandle* m = TestMatrixTensorHandle();
+  TFE_Op* op = MatMulOp(eager_ctx_, m, m);
+
+  CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_);
+  TFE_OpPrintDebugString(op);
+
+  TFE_TensorHandle* retvals[5];
+  int num_retvals = 5;
+  // Symbolically execute this op, which adds a graph node to `trace_ctx`.
+  TFE_AddEagerOpToGraph(op, trace_ctx_, retvals, &num_retvals, status_);
+  CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_);
+
+  int num_inputs = TFE_FinalizeInputTensorsFromTraceContext(trace_ctx_);
+  CHECK_EQ(num_inputs, 1);
+  auto input_sym_tensor = TFE_GetInputGraphNodeFromTraceContext(trace_ctx_,
+                                                                /*idx*/ 0);
+
+  LOG(INFO) << tensorflow::getTF_OutputDebugString(input_sym_tensor);
+  auto handle = TFE_ConsumeInputConcreteTensorFromTraceContext(trace_ctx_,
+                                                               /*idx*/ 0);
+  TFE_TensorHandlePrintDebugString(handle);
+  TFE_DeleteTensorHandle(handle);
+
+  CHECK_EQ(num_retvals, 1);
+  CHECK_EQ(TFE_TensorHandleDataType(retvals[0]), TF_FLOAT);
+
+  TFE_DeleteTensorHandle(retvals[0]);
+  TFE_DeleteTensorHandle(m);
+  TFE_DeleteOp(op);
+}
+
+TEST_F(AddEagerOpToGraphTest, ValueAttributesArePreserved) {
+  // Create MinOp
+  TFE_TensorHandle* axis = TestAxisTensorHandle();
+  TFE_Op* op = MinOp(eager_ctx_, axis, axis);
+  CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_);
+
+  // Check the attributes set by the call to MinOp above.
+  AddEagerOpToGraphAndCheck(op, [this, &axis](TF_Operation* graph_op) {
+    unsigned char value;
+    TF_OperationGetAttrBool(graph_op, "keep_dims", &value, status_);
+    CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_);
+    CHECK_EQ(value, 1);
+    TF_DataType dtype;
+    TF_OperationGetAttrType(graph_op, "Tidx", &dtype, status_);
+    CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_);
+    CHECK_EQ(dtype, TF_INT32);
+    TF_OperationGetAttrType(graph_op, "T", &dtype, status_);
+    CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_);
+    CHECK_EQ(dtype, TFE_TensorHandleDataType(axis));
+  });
+  TFE_DeleteTensorHandle(axis);
+  TFE_DeleteOp(op);
+}
+
+TEST_F(AddEagerOpToGraphTest, ListAttributesArePreserved) {
+  // Create a "Squeeze" operator with list attributes.
+  TFE_TensorHandle* axis = TestAxisTensorHandle();
+  TFE_Op* squeeze = TFE_NewOp(eager_ctx_, "Squeeze", status_);
+  CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_);
+  TFE_OpAddInput(squeeze, axis, status_);
+  TFE_OpSetAttrType(squeeze, "T", TF_INT32);
+  std::vector<int64_t> boundaries = {1, 2, 3, 4};
+  TFE_OpSetAttrIntList(squeeze, "squeeze_dims", boundaries.data(),
+                       boundaries.size());
+  // Check attributes are preserved.
+  AddEagerOpToGraphAndCheck(
+      squeeze, [this, &boundaries](TF_Operation* squeeze_graph_op) {
+        TF_DataType dtype;
+        TF_OperationGetAttrType(squeeze_graph_op, "T", &dtype, status_);
+        CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_);
+        CHECK_EQ(dtype, TF_INT32);
+        std::unique_ptr<int64_t[]> list(new int64_t[boundaries.size()]);
+        TF_OperationGetAttrIntList(squeeze_graph_op, "squeeze_dims", list.get(),
+                                   boundaries.size(), status_);
+        CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_);
+        EXPECT_TRUE(std::equal(list.get(), list.get() + boundaries.size(),
+                               boundaries.begin()));
+      });
+  TFE_DeleteTensorHandle(axis);
+  TFE_DeleteOp(squeeze);
+}
+
+TEST_F(AddEagerOpToGraphTest, ListInputsAreAddedCorrectly) {
+  TFE_TensorHandle* scalar = TestScalarTensorHandle();
+  TFE_Op* identityn = TFE_NewOp(eager_ctx_, "IdentityN", status_);
+  CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_);
+  constexpr size_t kNumInputs = 3;
+  for (size_t i = 0; i < kNumInputs; ++i) {
+    TFE_OpAddInput(identityn, scalar, status_);
+  }
+  TF_DataType types[kNumInputs] = {TF_FLOAT, TF_FLOAT, TF_FLOAT};
+  TFE_OpSetAttrTypeList(identityn, "T", types, kNumInputs);
+  AddEagerOpToGraphAndCheck(
+      identityn, [this, kNumInputs](TF_Operation* graph_op) {
+        EXPECT_EQ(TF_OperationNumInputs(graph_op), kNumInputs);
+        EXPECT_EQ(TF_OperationInputListLength(graph_op, "input", status_),
+                  kNumInputs);
+        CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_);
+        EXPECT_EQ(TF_OperationOutputListLength(graph_op, "output", status_),
+                  kNumInputs);
+        CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_);
+      });
+  TFE_DeleteTensorHandle(scalar);
+  TFE_DeleteOp(identityn);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/c/c_api_function.cc b/tensorflow/c/c_api_function.cc
index 28b9f8df9c873ee394eb6a241dd9ac06ba6c8796..03d65ecefd4a9ba5a23a94ed902dfba6dd4fbda9 100644
--- a/tensorflow/c/c_api_function.cc
+++ b/tensorflow/c/c_api_function.cc
@@ -162,6 +162,11 @@ Status FillFunctionBody(
     const std::vector<const Node*>& body_nodes,
     const std::unordered_map<string, string>& tensor_renaming,
     FunctionDef* fdef) {
+  std::unordered_set<string> func_attr_names;
+  for (const auto& func_attr : fdef->signature().attr()) {
+    func_attr_names.insert(func_attr.name());
+  }
+
   std::vector<const Edge*> in_edges;
   std::vector<const Edge*> control_edges;
   for (const Node* node : body_nodes) {
@@ -243,6 +248,48 @@ Status FillFunctionBody(
     if (node->op_def().is_stateful()) {
       fdef->mutable_signature()->set_is_stateful(true);
     }
+
+    // If this node has any attributes with placeholder value, add the
+    // attribute to FunctionDef signature.
+    for (const auto& iter : node->attrs()) {
+      if (iter.second.placeholder().empty()) {
+        continue;
+      }
+
+      // If we already added the attribute, skip it.
+      string func_attr_name = iter.second.placeholder();
+      if (func_attr_names.find(func_attr_name) != func_attr_names.end()) {
+        continue;
+      }
+
+      // This node's attribute is a placeholder value, so it does not have type
+      // information. We check node's OpDef for attribute type.
+      string node_attr_name = iter.first;
+      const OpDef::AttrDef* node_attr_def = nullptr;
+      for (const auto& node_attr : node->op_def().attr()) {
+        if (node_attr.name() == node_attr_name) {
+          node_attr_def = &node_attr;
+        }
+      }
+      if (!node_attr_def) {
+#ifdef TENSORFLOW_LITE_PROTOS
+        return errors::Unimplemented(
+            "Placeholder value is not supported for attributes not in OpDef. "
+            "Attribute: ",
+            node_attr_name);
+#else
+        return errors::Unimplemented(
+            "Placeholder value is not supported for attributes not in OpDef. "
+            "Attribute: ",
+            node_attr_name, ", OpDef: ", node->op_def().DebugString());
+#endif
+      }
+      OpDef::AttrDef* attr_def = fdef->mutable_signature()->add_attr();
+      attr_def->set_name(func_attr_name);
+      attr_def->set_type(node_attr_def->type());
+
+      func_attr_names.insert(func_attr_name);
+    }
   }
   return Status::OK();
 }
@@ -255,6 +302,8 @@ Status GraphToFunctionDef(const Graph& fn_body, const string& fn_name,
                           const std::vector<OutputTensor>& inputs,
                           const std::vector<OutputTensor>& outputs,
                           const std::vector<string>& output_names,
+                          const std::vector<const Node*>& control_outputs,
+                          const std::vector<string>& control_output_names,
                           const char* description, FunctionDef* fdef) {
   if (!output_names.empty()) {
     DCHECK_EQ(output_names.size(), outputs.size());
@@ -378,6 +427,29 @@ Status GraphToFunctionDef(const Graph& fn_body, const string& fn_name,
     fdef->mutable_signature()->set_name(fn_name);
   }
 
+  if (!control_output_names.empty() &&
+      (control_outputs.size() != control_output_names.size())) {
+    return InvalidArgument(
+        "Expected number of control outputs (", control_outputs.size(),
+        ") and the number of control output names (",
+        control_output_names.size(), ") to match but they do not.");
+  }
+  std::unordered_set<string> control_output_names_set;
+  for (int i = 0; i < control_outputs.size(); ++i) {
+    string signature_name;
+    if (!control_output_names.empty()) {
+      signature_name = control_output_names[i];
+    } else {
+      signature_name = control_outputs[i]->name();
+    }
+    if (!control_output_names_set.insert(signature_name).second) {
+      return errors::InvalidArgument("Repeated control output name: ",
+                                     signature_name);
+    }
+    fdef->mutable_signature()->add_control_output(signature_name);
+    (*fdef->mutable_control_ret())[signature_name] = control_outputs[i]->name();
+  }
+
   return Status::OK();
 }
 
@@ -485,14 +557,14 @@ Status ComputeBodyNodes(
 using tensorflow::Node;
 using tensorflow::string;
 
-TF_Function* TF_GraphToFunction(const TF_Graph* fn_body, const char* fn_name,
-                                unsigned char append_hash_to_fn_name,
-                                int num_opers, const TF_Operation* const* opers,
-                                int ninputs, const TF_Output* inputs,
-                                int noutputs, const TF_Output* outputs,
-                                const char* const* output_names,
-                                const TF_FunctionOptions* opts,
-                                const char* description, TF_Status* status) {
+TF_Function* TF_GraphToFunctionWithControlOutputs(
+    const TF_Graph* fn_body, const char* fn_name,
+    unsigned char append_hash_to_fn_name, int num_opers,
+    const TF_Operation* const* opers, int ninputs, const TF_Output* inputs,
+    int noutputs, const TF_Output* outputs, const char* const* output_names,
+    int ncontrol_outputs, const TF_Operation* const* control_outputs,
+    const char* const* control_output_names, const TF_FunctionOptions* opts,
+    const char* description, TF_Status* status) {
   tensorflow::mutex_lock l(*const_cast<tensorflow::mutex*>(&fn_body->mu));
 
   // Process inputs.
@@ -517,19 +589,34 @@ TF_Function* TF_GraphToFunction(const TF_Graph* fn_body, const char* fn_name,
     }
   }
 
+  // Process control output names.
+  std::vector<string> control_output_names_vec;
+  if (control_output_names) {
+    control_output_names_vec.reserve(ncontrol_outputs);
+    for (int i = 0; i < ncontrol_outputs; ++i) {
+      control_output_names_vec.push_back(string(output_names[i]));
+    }
+  }
+
   // Compute body nodes.
   std::vector<const Node*> body_nodes;
   status->status = tensorflow::ComputeBodyNodes(
       fn_body, fn_name, num_opers, opers, input_nodes, &body_nodes);
   if (!status->status.ok()) return nullptr;
 
+  // Compute body nodes.
+  std::vector<const Node*> control_output_nodes;
+  for (int i = 0; i < ncontrol_outputs; ++i) {
+    control_output_nodes.push_back(&control_outputs[i]->node);
+  }
+
   // Do the actual function creation.
   TF_Function* tf_function = new TF_Function();
   DCHECK(append_hash_to_fn_name <= 1);
   status->status = tensorflow::GraphToFunctionDef(
       fn_body->graph, fn_name, append_hash_to_fn_name != 0, body_nodes,
-      input_tensors, output_tensors, output_names_vec, description,
-      &tf_function->fdef);
+      input_tensors, output_tensors, output_names_vec, control_output_nodes,
+      control_output_names_vec, description, &tf_function->fdef);
   if (!status->status.ok()) {
     TF_DeleteFunction(tf_function);
     return nullptr;
@@ -537,6 +624,20 @@ TF_Function* TF_GraphToFunction(const TF_Graph* fn_body, const char* fn_name,
   return tf_function;
 }
 
+TF_Function* TF_GraphToFunction(const TF_Graph* fn_body, const char* fn_name,
+                                unsigned char append_hash_to_fn_name,
+                                int num_opers, const TF_Operation* const* opers,
+                                int ninputs, const TF_Output* inputs,
+                                int noutputs, const TF_Output* outputs,
+                                const char* const* output_names,
+                                const TF_FunctionOptions* opts,
+                                const char* description, TF_Status* status) {
+  return TF_GraphToFunctionWithControlOutputs(
+      fn_body, fn_name, append_hash_to_fn_name, num_opers, opers, ninputs,
+      inputs, noutputs, outputs, output_names, 0, nullptr, nullptr, opts,
+      description, status);
+}
+
 const char* TF_FunctionName(TF_Function* func) {
   return func->fdef.signature().name().c_str();
 }
diff --git a/tensorflow/c/c_api_function_test.cc b/tensorflow/c/c_api_function_test.cc
index 73fe73769bc1219ce865149d67d333c53371ccc5..946f8c4a2c3fb25f908d809e00bf579b40a8668b 100644
--- a/tensorflow/c/c_api_function_test.cc
+++ b/tensorflow/c/c_api_function_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/c/c_api.h"
 
+#include "tensorflow/c/c_api_internal.h"
 #include "tensorflow/c/c_test_util.h"
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/op_def.pb.h"
@@ -1230,6 +1231,53 @@ void DefineFunction(const char* name, TF_Function** func,
   ASSERT_NE(*func, nullptr);
 }
 
+REGISTER_OP("CustomOp")
+    .Output("output: float32")
+    .Attr("index: int")
+    .SetShapeFn(tensorflow::shape_inference::UnknownShape);
+
+void NodeWithPlaceholderAttrHelper(TF_Graph* graph, TF_Status* s,
+                                   const char* name, const char* placeholder,
+                                   TF_Operation** op) {
+  TF_OperationDescription* desc = TF_NewOperation(graph, "CustomOp", name);
+  TF_SetAttrPlaceholder(desc, "index", placeholder);
+  *op = TF_FinishOperation(desc, s);
+  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
+  ASSERT_NE(*op, nullptr);
+}
+
+TEST_F(CApiFunctionTest, GraphToFunctionDefWithPlaceholderAttr) {
+  std::unique_ptr<TF_Graph, decltype(&TF_DeleteGraph)> func_graph(
+      TF_NewGraph(), TF_DeleteGraph);
+  std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> s(TF_NewStatus(),
+                                                           TF_DeleteStatus);
+
+  TF_Operation *node1, *node2, *node3;
+  NodeWithPlaceholderAttrHelper(func_graph.get(), s.get(), "node1", "v1",
+                                &node1);
+  NodeWithPlaceholderAttrHelper(func_graph.get(), s.get(), "node2", "v1",
+                                &node2);
+  NodeWithPlaceholderAttrHelper(func_graph.get(), s.get(), "node3", "v2",
+                                &node3);
+
+  TF_Output inputs[] = {};
+  TF_Output outputs[] = {{node1, 0}, {node2, 0}, {node3, 0}};
+  func_ = TF_GraphToFunction(
+      func_graph.get(), "func", /*append_hash_to_fn_name=*/false, -1,
+      /*opers=*/nullptr, 0, inputs, 3, outputs,
+      /*output_names=*/nullptr,
+      /*opts=*/nullptr, /*description=*/nullptr, s.get());
+  ASSERT_EQ(TF_OK, TF_GetCode(s.get())) << TF_Message(s.get());
+  ASSERT_NE(func_, nullptr);
+
+  // Verify that FunctionDef has 2 attributes, "v1" and "v2".
+  ASSERT_EQ(func_->fdef.signature().attr().size(), 2);
+  EXPECT_EQ(func_->fdef.signature().attr(0).name(), "v1");
+  EXPECT_EQ(func_->fdef.signature().attr(0).type(), "int");
+  EXPECT_EQ(func_->fdef.signature().attr(1).name(), "v2");
+  EXPECT_EQ(func_->fdef.signature().attr(1).type(), "int");
+}
+
 TEST_F(CApiFunctionTest, SetGradientAndRun) {
   // Define the function and its grad
   DefineFunction(func_name_, &func_);
diff --git a/tensorflow/c/c_api_internal.h b/tensorflow/c/c_api_internal.h
index 5ba26d3c585350aa510f9970cbfc246a9a108543..d520b6b76849e562def6abd8be0510d3b4797e8c 100644
--- a/tensorflow/c/c_api_internal.h
+++ b/tensorflow/c/c_api_internal.h
@@ -204,7 +204,8 @@ Status TF_TensorToTensor(const TF_Tensor* src, Tensor* dst);
 
 TF_Tensor* TF_TensorFromTensor(const Tensor& src, TF_Status* status);
 
-Status MessageToBuffer(const tensorflow::protobuf::Message& in, TF_Buffer* out);
+Status MessageToBuffer(const tensorflow::protobuf::MessageLite& in,
+                       TF_Buffer* out);
 
 // Set the shapes and types of the output's handle.
 //
@@ -228,6 +229,8 @@ void RecordMutation(TF_Graph* graph, const TF_Operation& op,
 bool ExtendSessionGraphHelper(TF_Session* session, TF_Status* status)
     LOCKS_EXCLUDED(session->graph->mu, session->mu);
 
+std::string getTF_OutputDebugString(TF_Output node);
+
 }  // end namespace tensorflow
 
 #endif  // TENSORFLOW_C_C_API_INTERNAL_H_
diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc
index d5934a10395ae094f65d3bc8b6cd7b94dbd32410..2be03bf0de6277fc63c353ad6dc63bec096a6993 100644
--- a/tensorflow/c/c_api_test.cc
+++ b/tensorflow/c/c_api_test.cc
@@ -163,6 +163,7 @@ TEST(CAPI, AllocateTensor) {
   EXPECT_EQ(dims[0], TF_Dim(t, 0));
   EXPECT_EQ(dims[1], TF_Dim(t, 1));
   EXPECT_EQ(num_bytes, TF_TensorByteSize(t));
+  EXPECT_EQ(6, TF_TensorElementCount(t));
   TF_DeleteTensor(t);
 }
 
@@ -1467,6 +1468,41 @@ TEST(CAPI, DeletingNullPointerIsSafe) {
   TF_DeleteStatus(status);
 }
 
+TEST(CAPI, TestBitcastFrom_Reshape) {
+  int64_t dims[] = {2, 3};
+  TF_Tensor* a =
+      TF_AllocateTensor(TF_UINT64, dims, 2, 6 * TF_DataTypeSize(TF_UINT64));
+  TF_Tensor* b =
+      TF_AllocateTensor(TF_UINT64, nullptr, 0, TF_DataTypeSize(TF_UINT64));
+  EXPECT_NE(a, nullptr);
+  EXPECT_NE(b, nullptr);
+
+  EXPECT_EQ(6, TF_TensorElementCount(a));
+  EXPECT_EQ(1, TF_TensorElementCount(b));
+  EXPECT_EQ(6 * TF_DataTypeSize(TF_UINT64), TF_TensorByteSize(a));
+  EXPECT_EQ(TF_DataTypeSize(TF_UINT64), TF_TensorByteSize(b));
+
+  int64_t new_dims[] = {3, 2};
+  TF_Status* status = TF_NewStatus();
+  TF_TensorBitcastFrom(a, TF_UINT64, b, new_dims, 2, status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status));
+  TF_DeleteStatus(status);
+
+  EXPECT_EQ(6, TF_TensorElementCount(a));
+  EXPECT_EQ(6, TF_TensorElementCount(b));
+  EXPECT_EQ(6 * TF_DataTypeSize(TF_UINT64), TF_TensorByteSize(a));
+  EXPECT_EQ(6 * TF_DataTypeSize(TF_UINT64), TF_TensorByteSize(b));
+
+  // Check that a write to one tensor shows up in the other.
+  *(static_cast<int64_t*>(TF_TensorData(a))) = 4;
+  EXPECT_EQ(4, *(static_cast<int64_t*>(TF_TensorData(b))));
+  *(static_cast<int64_t*>(TF_TensorData(b))) = 6;
+  EXPECT_EQ(6, *(static_cast<int64_t*>(TF_TensorData(a))));
+
+  TF_DeleteTensor(a);
+  TF_DeleteTensor(b);
+}
+
 REGISTER_OP("TestOpWithNoGradient")
     .Input("x: T")
     .Output("y: T")
diff --git a/tensorflow/c/c_test.c b/tensorflow/c/c_test.c
new file mode 100644
index 0000000000000000000000000000000000000000..7468122cd567270c8454f886e478be34c2c15cbf
--- /dev/null
+++ b/tensorflow/c/c_test.c
@@ -0,0 +1,93 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <limits.h>
+#include <memory.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+#include "tensorflow/c/c_api.h"
+#include "tensorflow/c/c_api_experimental.h"
+#include "tensorflow/c/env.h"
+#include "tensorflow/c/kernels.h"
+
+// A create function. This will never actually get called in this test, it's
+// just nice to know that it compiles.
+void* create(TF_OpKernelConstruction* ctx) {
+  TF_DataType type;
+  TF_Status* s = TF_NewStatus();
+  TF_OpKernelConstruction_GetAttrType(ctx, "foobar", &type, s);
+  TF_DeleteStatus(s);
+  return NULL;
+}
+
+// A compute function. This will never actually get called in this test, it's
+// just nice to know that it compiles.
+void compute(void* kernel, TF_OpKernelContext* ctx) {
+  TF_Tensor* input;
+  TF_Status* s = TF_NewStatus();
+  TF_GetInput(ctx, 0, &input, s);
+  TF_DeleteTensor(input);
+  TF_DeleteStatus(s);
+}
+
+// Exercises tensorflow's C API.
+int main(int argc, char** argv) {
+  TF_InitMain(argv[0], &argc, &argv);
+
+  struct TF_StringStream* s = TF_GetLocalTempDirectories();
+  const char* path;
+
+  if (!TF_StringStreamNext(s, &path)) {
+    fprintf(stderr, "TF_GetLocalTempDirectories returned no results\n");
+    return 1;
+  }
+
+  char file_name[100];
+  struct timeval t;
+  if (gettimeofday(&t, NULL)) {
+    perror("gettimeofday failed");
+    return 1;
+  }
+  snprintf(file_name, sizeof(file_name), "test-%d-%ld.txt", getpid(), t.tv_sec);
+
+  size_t length = 2 + strlen(path) + strlen(file_name);
+  char* full_path = malloc(length);
+  snprintf(full_path, length, "%s/%s", path, file_name);
+
+  TF_WritableFileHandle* h;
+  TF_Status* status = TF_NewStatus();
+  TF_NewWritableFile(full_path, &h, status);
+  if (TF_GetCode(status) != TF_OK) {
+    fprintf(stderr, "TF_NewWritableFile failed: %s\n", TF_Message(status));
+    return 1;
+  }
+  fprintf(stderr, "wrote %s\n", full_path);
+  free(full_path);
+  TF_CloseWritableFile(h, status);
+  if (TF_GetCode(status) != TF_OK) {
+    fprintf(stderr, "TF_CloseWritableFile failed: %s\n", TF_Message(status));
+  }
+  TF_StringStreamDone(s);
+
+  TF_KernelBuilder* b =
+      TF_NewKernelBuilder("SomeOp", "SomeDevice", &create, &compute, NULL);
+  TF_RegisterKernelBuilder("someKernel", b, status);
+
+  TF_DeleteStatus(status);
+  return 0;
+}
diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD
index c34a84fcfee9b6ba9a7be86ae16e2856a2d343c7..282f0da302fac89c6fae9f8b5aa4b3c33ab93532 100644
--- a/tensorflow/c/eager/BUILD
+++ b/tensorflow/c/eager/BUILD
@@ -3,11 +3,19 @@ licenses(["notice"])  # Apache 2.0
 
 load(
     "//tensorflow:tensorflow.bzl",
-    "tf_cuda_cc_test",
-    "tf_cc_test",
     "tf_copts",
-    "tfe_xla_copts",
+    "tf_cuda_cc_test",
     "tf_cuda_library",
+    "tfe_xla_copts",
+)
+load(
+    "//tensorflow/core:platform/default/build_config.bzl",
+    "tf_additional_device_tracer_test_flags",
+    "tf_kernel_tests_linkstatic",
+)
+load(
+    "//tensorflow/core:platform/default/build_config_root.bzl",
+    "tf_cuda_tests_tags",
 )
 
 tf_cuda_library(
@@ -62,6 +70,7 @@ tf_cuda_library(
         "//tensorflow/core/distributed_runtime:remote_device",
         "//tensorflow/core/distributed_runtime:server_lib",
         "//tensorflow/core/distributed_runtime:worker_env",
+        "//tensorflow/core/profiler/lib:profiler_session",
         "//tensorflow/core:gpu_runtime",
     ],
 )
@@ -101,6 +110,7 @@ tf_cuda_library(
         "//tensorflow/core/distributed_runtime/rpc:grpc_worker_service",
         "//tensorflow/core/distributed_runtime/rpc:rpc_rendezvous_mgr",
         "//tensorflow/core/distributed_runtime/rpc/eager:grpc_eager_client",
+        "//tensorflow/core/profiler/lib:profiler_session",
     ],
 )
 
@@ -148,6 +158,88 @@ tf_cuda_cc_test(
     ],
 )
 
+tf_cuda_library(
+    name = "c_api_experimental",
+    srcs = [
+        "c_api_experimental.cc",
+    ],
+    hdrs = ["c_api_experimental.h"],
+    copts = tf_copts() + tfe_xla_copts(),
+    visibility = ["//visibility:public"],
+    deps = select({
+        "//tensorflow:android": [
+            "//tensorflow/core:android_tensorflow_lib_lite",
+        ],
+        "//conditions:default": [
+            ":c_api",
+            ":c_api_internal",
+            "//tensorflow/c:c_api",
+            "//tensorflow/c:c_api_internal",
+            "//tensorflow/core:core_cpu",
+            "//tensorflow/core/common_runtime/eager:attr_builder",
+            "//tensorflow/core/common_runtime/eager:context",
+            "//tensorflow/core/common_runtime/eager:eager_executor",
+            "//tensorflow/core/common_runtime/eager:execute",
+            "//tensorflow/core/common_runtime/eager:kernel_and_device",
+            "//tensorflow/core/common_runtime/eager:tensor_handle",
+            "//tensorflow/core/common_runtime/eager:copy_to_device_node",
+            "//tensorflow/core:core_cpu_internal",
+            "//tensorflow/core:framework",
+            "//tensorflow/core:framework_internal",
+            "//tensorflow/core:lib",
+            "//tensorflow/core:lib_internal",
+            "//tensorflow/core:protos_all_cc",
+        ],
+    }) + select({
+        "//tensorflow:with_xla_support": [
+            "//tensorflow/compiler/tf2xla:xla_compiler",
+            "//tensorflow/compiler/jit",
+            "//tensorflow/compiler/jit:xla_device",
+        ],
+        "//conditions:default": [],
+    }) + [
+        "@com_google_absl//absl/memory",
+        "//tensorflow/core/common_runtime/eager:eager_operation",
+        "//tensorflow/core/distributed_runtime/eager:eager_client",
+        "//tensorflow/core/distributed_runtime/rpc/eager:grpc_eager_client",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_channel",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_worker_cache",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_worker_service",
+        "//tensorflow/core/distributed_runtime/rpc:rpc_rendezvous_mgr",
+        "//tensorflow/core/distributed_runtime:remote_device",
+        "//tensorflow/core/distributed_runtime:server_lib",
+        "//tensorflow/core/distributed_runtime:worker_env",
+        "//tensorflow/core/profiler/rpc:profiler_server",
+        "//tensorflow/core/profiler/rpc/client:capture_profile",
+        "//tensorflow/core:gpu_runtime",
+    ],
+)
+
+tf_cuda_cc_test(
+    name = "c_api_experimental_test",
+    size = "small",
+    srcs = [
+        "c_api_experimental_test.cc",
+    ],
+    args =
+        ["--heap_check=local"] + tf_additional_device_tracer_test_flags(),
+    linkstatic = tf_kernel_tests_linkstatic(),
+    tags = tf_cuda_tests_tags() + ["nomac"],
+    deps = [
+        ":c_api_experimental",
+        ":c_api_test_util",
+        "//tensorflow/c:c_test_util",
+        "//tensorflow/cc/profiler",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core/profiler:protos_all_cc",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
 cc_library(
     name = "tape",
     hdrs = ["tape.h"],
diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index 027d752f420238da867cb9d8c116640e1730caaa..45701c7fcf02d5e6ec464ae10d4d20f20ba1d9f0 100755
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -356,6 +356,8 @@ TFE_TensorHandle* TFE_NewTensorHandle(TF_Tensor* t, TF_Status* status) {
 
 void TFE_DeleteTensorHandle(TFE_TensorHandle* h) {
   if (h == nullptr) return;
+  VLOG(1) << "Deleting tensor handle " << h << " with internal handle "
+          << h->handle;
   if (h->handle) {
     h->handle->Unref();
   }
@@ -443,15 +445,15 @@ TF_Tensor* TFE_TensorHandleResolve(TFE_TensorHandle* h, TF_Status* status) {
     return nullptr;
   }
   // TODO(agarwal): move this implementation inside TFE_TensorHandle.
-  tensorflow::Device* d = nullptr;
-  tensorflow::Device* op_device = nullptr;
   const tensorflow::Tensor* t = nullptr;
-  status->status = h->handle->TensorAndDevice(&t, &d, &op_device);
-  if (!status->status.ok()) return nullptr;
   tensorflow::TensorHandle* h_cpu = nullptr;
-  if (!IsCPU(d)) {
-    status->status = h->handle->CopyToDevice(
-        h->handle->Context(), h->handle->Context()->HostCPU(), &h_cpu);
+  tensorflow::Device* d = nullptr;
+  tensorflow::Device* op_device = nullptr;
+
+  if (h->handle->IsRemote()) {
+    status->status = EagerCopyToDevice(
+        h->handle, h->handle->Context(),
+        h->handle->Context()->HostCPU()->name().c_str(), &h_cpu);
     if (!status->status.ok()) {
       return nullptr;
     }
@@ -460,6 +462,22 @@ TF_Tensor* TFE_TensorHandleResolve(TFE_TensorHandle* h, TF_Status* status) {
       h_cpu->Unref();
       return nullptr;
     }
+  } else {
+    status->status = h->handle->TensorAndDevice(&t, &d, &op_device);
+    if (!status->status.ok()) return nullptr;
+
+    if (!IsCPU(d)) {
+      status->status = h->handle->CopyToDevice(
+          h->handle->Context(), h->handle->Context()->HostCPU(), &h_cpu);
+      if (!status->status.ok()) {
+        return nullptr;
+      }
+      status->status = h_cpu->TensorAndDevice(&t, &d, &op_device);
+      if (!status->status.ok()) {
+        h_cpu->Unref();
+        return nullptr;
+      }
+    }
   }
   TF_Tensor* retval = tensorflow::TF_TensorFromTensor(*t, status);
   if (h_cpu != nullptr) {
@@ -696,6 +714,7 @@ void TFE_OpSetAttrFunctionList(TFE_Op* op, const char* attr_name,
 
 void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
                  TF_Status* status) {
+  VLOG(1) << "Calling TFE_Execute() on op " << op;
   tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2> handle_retvals(
       *num_retvals);
   status->status =
@@ -738,12 +757,18 @@ void TFE_ContextAddFunction(TFE_Context* ctx, TF_Function* function,
   status->status = ctx->context.AddFunctionDef(function->fdef);
 }
 
+unsigned char TFE_ContextHasFunction(TFE_Context* ctx, const char* name) {
+  return ctx->context.FindFunctionDef(name) != nullptr;
+}
+
 void TFE_ContextEnableRunMetadata(TFE_Context* ctx) {
-  ctx->context.SetShouldStoreMetadata(true);
+  ctx->context.SetShouldStoreGraphs(true);
+  ctx->context.SetShouldStoreStepStats(true);
 }
 
 void TFE_ContextDisableRunMetadata(TFE_Context* ctx) {
-  ctx->context.SetShouldStoreMetadata(false);
+  ctx->context.SetShouldStoreGraphs(false);
+  ctx->context.SetShouldStoreStepStats(false);
 }
 
 }  // extern "C"
@@ -774,7 +799,7 @@ void TFE_ContextExportRunMetadata(TFE_Context* ctx, TF_Buffer* buf,
   if (!status->status.ok()) return;
   tensorflow::mutex_lock ml(*ctx->context.MetadataMu());
   status->status = MessageToBuffer(*ctx->context.RunMetadataProto(), buf);
-  ctx->context.RunMetadataProto()->Clear();
+  ctx->context.ClearRunMetadata();
 }
 
 namespace {
diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h
index f80ae5a6d02d4d613c95cf8486e0fc0aeed3affc..044dfb7415b027b707af05a197fdb41fe1f6d2e5 100755
--- a/tensorflow/c/eager/c_api.h
+++ b/tensorflow/c/eager/c_api.h
@@ -170,23 +170,11 @@ TF_CAPI_EXPORT extern int64_t TFE_TensorHandleDim(TFE_TensorHandle* h,
                                                   int dim_index,
                                                   TF_Status* status);
 
-// Returns the device of the operation that produced `h`.
-// If `h` was produced by a copy, returns the destination device of
-// the copy. Note that returned device name is not always the device
-// holding the tensor handle's memory. If you want the latter, use
-// TFE_TensorHandleBackingDeviceName.
-// This function will block till the operation that produces `h` has completed.
-//
-// Device on which the kernel of the operation that produced `h` ran.
-//
-// If `h` was produced by a copy, returns the destination device of
-// the copy.
-//
-// Note that returned device name is not always the device that owns the memory
-// that backs the tensor handle. For the latter see
-// TFE_TensorHandleBackingDeviceName.
-//
-// This function will block till the operation that produces `h` has completed.
+// Returns the device of the operation that produced `h`. If `h` was produced by
+// a copy, returns the destination device of the copy. Note that the returned
+// device name is not always the device holding the tensor handle's memory. If
+// you want the latter, use TFE_TensorHandleBackingDeviceName. This function
+// will block till the operation that produces `h` has completed.
 TF_CAPI_EXPORT extern const char* TFE_TensorHandleDeviceName(
     TFE_TensorHandle* h, TF_Status* status);
 
@@ -405,6 +393,10 @@ TF_CAPI_EXPORT extern void TFE_ContextAddFunction(TFE_Context* ctx,
                                                   TF_Function* function,
                                                   TF_Status* status);
 
+// Checks whether a function is registered under `name`.
+TF_CAPI_EXPORT unsigned char TFE_ContextHasFunction(TFE_Context* ctx,
+                                                    const char* name);
+
 // Enables tracing of RunMetadata on the ops executed from this context.
 TF_CAPI_EXPORT extern void TFE_ContextEnableRunMetadata(TFE_Context* ctx);
 
diff --git a/tensorflow/c/eager/c_api_debug.cc b/tensorflow/c/eager/c_api_debug.cc
index 52b0824552855860dfb138f3ac9a5d3afa7dc965..ffcd5ace0b98597363abe63201bf6c328a03212f 100644
--- a/tensorflow/c/eager/c_api_debug.cc
+++ b/tensorflow/c/eager/c_api_debug.cc
@@ -83,7 +83,7 @@ TF_CAPI_EXPORT extern TFE_TensorDebugInfo* TFE_TensorHandleTensorDebugInfo(
       }
     }
 
-    if (xla::ShapeUtil::IsTuple(padded_shape)) {
+    if (padded_shape.IsTuple()) {
       if (xla::ShapeUtil::TupleElementCount(padded_shape) != 2) {
         // Currently, the only case of XlaTensor containing a tuple shape is to
         // represent 64 bit ints, doubles, and complex numbers (we don't support
@@ -99,7 +99,7 @@ TF_CAPI_EXPORT extern TFE_TensorDebugInfo* TFE_TensorHandleTensorDebugInfo(
       xla::Shape shape0 = xla::ShapeUtil::GetTupleElementShape(padded_shape, 0);
       const xla::Shape& shape1 =
           xla::ShapeUtil::GetTupleElementShape(padded_shape, 1);
-      if (xla::ShapeUtil::IsTuple(shape0) || xla::ShapeUtil::IsTuple(shape1)) {
+      if (shape0.IsTuple() || shape1.IsTuple()) {
         status->status = tensorflow::errors::InvalidArgument(
             "XlaTensors should not contain nested tuples. Shape: ",
             padded_shape.DebugString());
diff --git a/tensorflow/c/eager/c_api_experimental.cc b/tensorflow/c/eager/c_api_experimental.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ff798593b5f2f77339b668668ff6dafb9f44a2b3
--- /dev/null
+++ b/tensorflow/c/eager/c_api_experimental.cc
@@ -0,0 +1,93 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/c/eager/c_api_experimental.h"
+
+#include "tensorflow/c/c_api.h"
+#include "tensorflow/c/eager/c_api_internal.h"
+#include "tensorflow/core/profiler/rpc/client/capture_profile.h"
+#include "tensorflow/core/profiler/rpc/profiler_server.h"
+
+using tensorflow::string;
+
+void TFE_OpConsumeInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status) {
+  op->operation.ConsumeInput(h->handle);
+}
+
+TFE_Profiler* TFE_NewProfiler(TFE_ProfilerContext* ctx) {
+  return new TFE_Profiler(ctx);
+}
+
+bool TFE_ProfilerIsOk(TFE_Profiler* profiler) {
+  return profiler->profiler->Status().ok();
+}
+
+void TFE_DeleteProfiler(TFE_Profiler* profiler) { delete profiler; }
+
+void TFE_ProfilerSerializeToString(TFE_Context* ctx, TFE_Profiler* profiler,
+                                   TF_Buffer* buf, TF_Status* status) {
+  TFE_ContextAsyncWait(ctx, status);
+  if (!status->status.ok()) return;
+  string content;
+  status->status = profiler->profiler->SerializeToString(&content);
+  void* data = tensorflow::port::Malloc(content.length());
+  content.copy(static_cast<char*>(data), content.length(), 0);
+  buf->data = data;
+  buf->length = content.length();
+  buf->data_deallocator = [](void* data, size_t length) {
+    tensorflow::port::Free(data);
+  };
+}
+
+TFE_ProfilerContext* TFE_NewProfilerContext() {
+  return new TFE_ProfilerContext;
+}
+
+void TFE_ProfilerContextSetEagerContext(TFE_ProfilerContext* profiler_context,
+                                        TFE_Context* eager_context) {
+  profiler_context->profiler_context.eager_context = &eager_context->context;
+}
+
+void TFE_DeleteProfilerContext(TFE_ProfilerContext* profiler_context) {
+  delete profiler_context;
+}
+
+void TFE_StartProfilerServer(TFE_ProfilerContext* context, int port) {
+  // Release child thread intentionally. The child thread can be terminate by
+  // terminating the main thread.
+  tensorflow::StartProfilerServer(&context->profiler_context, port).release();
+}
+
+void TFE_ContextEnableGraphCollection(TFE_Context* ctx) {
+  ctx->context.SetShouldStoreGraphs(true);
+}
+
+void TFE_ContextDisableGraphCollection(TFE_Context* ctx) {
+  ctx->context.SetShouldStoreGraphs(false);
+}
+
+bool TFE_ProfilerClientStartTracing(char* service_addr, char* logdir,
+                                    char* worker_list, bool include_dataset_ops,
+                                    int duration_ms, int num_tracing_attempts) {
+  tensorflow::Status s =
+      tensorflow::profiler::client::ValidateHostPortPair(service_addr);
+  if (!s.ok()) {
+    return false;
+  }
+  s = tensorflow::profiler::client::StartTracing(
+      service_addr, logdir, worker_list, include_dataset_ops, duration_ms,
+      num_tracing_attempts);
+  return s.ok();
+}
diff --git a/tensorflow/c/eager/c_api_experimental.h b/tensorflow/c/eager/c_api_experimental.h
new file mode 100644
index 0000000000000000000000000000000000000000..89523793d37b89ee49c4db844a85f019381ff730
--- /dev/null
+++ b/tensorflow/c/eager/c_api_experimental.h
@@ -0,0 +1,94 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_C_EAGER_C_API_EXPERIMENTAL_H_
+#define TENSORFLOW_C_EAGER_C_API_EXPERIMENTAL_H_
+
+#include "tensorflow/c/c_api.h"
+#include "tensorflow/c/eager/c_api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+TF_CAPI_EXPORT extern void TFE_OpConsumeInput(TFE_Op* op, TFE_TensorHandle* h,
+                                              TF_Status* status);
+
+typedef struct TFE_ProfilerContext TFE_ProfilerContext;
+
+// A profiler which will start profiling when creating the object and will stop
+// when the object is destroyed. It will profile all operations run under the
+// given TFE_Context. Multiple instance of it can be created, but at most one
+// of them will profile for each TFE_Context.
+// Thread-safety: TFE_Profiler is thread-safe.
+typedef struct TFE_Profiler TFE_Profiler;
+
+TF_CAPI_EXPORT extern TFE_Profiler* TFE_NewProfiler(TFE_ProfilerContext* ctx);
+TF_CAPI_EXPORT extern bool TFE_ProfilerIsOk(TFE_Profiler* profiler);
+TF_CAPI_EXPORT extern void TFE_DeleteProfiler(TFE_Profiler* profiler);
+
+// The output string is a binary string of tensorflow.tpu.Trace. User can write
+// the string to file for offline analysis by tensorboard.
+TF_CAPI_EXPORT extern void TFE_ProfilerSerializeToString(TFE_Context* ctx,
+                                                         TFE_Profiler* profiler,
+                                                         TF_Buffer* buf,
+                                                         TF_Status* status);
+
+// Return a new profiler context object.
+TF_CAPI_EXPORT extern TFE_ProfilerContext* TFE_NewProfilerContext(void);
+
+// Set the eager context in TFE_ProfilerServerOptions
+TF_CAPI_EXPORT extern void TFE_ProfilerContextSetEagerContext(
+    TFE_ProfilerContext* profiler_context, TFE_Context* eager_context);
+
+// Destroy a profiler context object.
+TF_CAPI_EXPORT extern void TFE_DeleteProfilerContext(
+    TFE_ProfilerContext* profiler_context);
+
+// Start a profiler grpc server which listens to specified port. It will start
+// the server on its own thread. It can be shutdown by terminating tensorflow.
+// It can be used in both Eager mode and graph mode. Creating multiple profiler
+// server is allowed. The service defined in
+// tensorflow/contrib/tpu/profiler/tpu_profiler.proto. Please use
+// tensorflow/contrib/tpu/profiler/capture_tpu_profile to capture tracable
+// file following
+// https://cloud.google.com/tpu/docs/cloud-tpu-tools#capture_trace.
+TF_CAPI_EXPORT extern void TFE_StartProfilerServer(TFE_ProfilerContext* context,
+                                                   int port);
+
+// Enables only graph collection in RunMetadata on the functions executed from
+// this context.
+TF_CAPI_EXPORT extern void TFE_ContextEnableGraphCollection(TFE_Context* ctx);
+
+// Disables only graph collection in RunMetadata on the functions executed from
+// this context.
+TF_CAPI_EXPORT extern void TFE_ContextDisableGraphCollection(TFE_Context* ctx);
+
+// Send a grpc request to profiler server (service_addr) to perform on-demand
+// profiling and save the result into logdir which can be visualized by
+// TensorBoard. worker_list is the list of worker TPUs separated by ','. Set
+// include_dataset_opts to false to profile longer traces. It will block the
+// caller thread until receives tracing result.
+// This API is designed for TensorBoard, for end user, please use
+// tensorflow/contrib/tpu/profiler/capture_tpu_profile instead following
+// https://cloud.google.com/tpu/docs/cloud-tpu-tools#capture_trace.
+TF_CAPI_EXPORT extern bool TFE_ProfilerClientStartTracing(
+    char* service_addr, char* logdir, char* worker_list,
+    bool include_dataset_ops, int duration_ms, int num_tracing_attempts);
+
+#ifdef __cplusplus
+} /* end extern "C" */
+#endif
+
+#endif  // TENSORFLOW_C_EAGER_C_API_EXPERIMENTAL_H_
diff --git a/tensorflow/c/eager/c_api_experimental_test.cc b/tensorflow/c/eager/c_api_experimental_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d85048caa7c7f727271352883cb834a2575bd251
--- /dev/null
+++ b/tensorflow/c/eager/c_api_experimental_test.cc
@@ -0,0 +1,129 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/c/eager/c_api_experimental.h"
+
+#include <string.h>
+#include "tensorflow/c/eager/c_api_test_util.h"
+#include "tensorflow/cc/profiler/profiler.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/profiler/trace_events.pb.h"
+
+using tensorflow::string;
+
+namespace tensorflow {
+namespace {
+
+static bool HasSubstr(absl::string_view base, absl::string_view substr) {
+  bool ok = str_util::StrContains(base, substr);
+  EXPECT_TRUE(ok) << base << ", expected substring " << substr;
+  return ok;
+}
+
+void ExecuteWithProfiling(bool async) {
+  TF_Status* status = TF_NewStatus();
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_ContextOptionsSetAsync(opts, static_cast<unsigned char>(async));
+  TFE_Context* ctx = TFE_NewContext(opts, status);
+  TFE_ProfilerContext* profiler_context = TFE_NewProfilerContext();
+  TFE_ProfilerContextSetEagerContext(profiler_context, ctx);
+  TFE_Profiler* profiler = TFE_NewProfiler(profiler_context);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteContextOptions(opts);
+  TFE_DeleteProfilerContext(profiler_context);
+
+  TFE_TensorHandle* m = TestMatrixTensorHandle();
+  TFE_Op* matmul = MatMulOp(ctx, m, m);
+  TFE_TensorHandle* retvals[1] = {nullptr};
+  int num_retvals = 1;
+
+  // Run op on GPU if it is present.
+  string gpu_device_name;
+  if (GetDeviceName(ctx, &gpu_device_name, "GPU")) {
+    TFE_OpSetDevice(matmul, gpu_device_name.c_str(), status);
+    ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
+    const char* device_name = TFE_OpGetDevice(matmul, status);
+    ASSERT_TRUE(strstr(device_name, "GPU:0") != nullptr);
+  }
+
+  TFE_Execute(matmul, &retvals[0], &num_retvals, status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteOp(matmul);
+  TFE_DeleteTensorHandle(m);
+
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  ASSERT_EQ(1, num_retvals);
+  TF_Buffer* profiler_result = TF_NewBuffer();
+  TFE_ProfilerSerializeToString(ctx, profiler, profiler_result, status);
+  TFE_DeleteProfiler(profiler);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  profiler::Trace profile_proto;
+  EXPECT_TRUE(profile_proto.ParseFromString(
+      {reinterpret_cast<const char*>(profiler_result->data),
+       profiler_result->length}));
+  string profile_proto_str = profile_proto.DebugString();
+  if (!gpu_device_name.empty()) {
+    EXPECT_TRUE(HasSubstr(profile_proto_str, "GPU:0"));
+    // device name with "stream:all" is collected by Device Tracer.
+    EXPECT_TRUE(HasSubstr(profile_proto_str, "stream:all"));
+  }
+  EXPECT_TRUE(HasSubstr(profile_proto_str, "CPU:0"));
+  TF_DeleteBuffer(profiler_result);
+
+  TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status);
+  TFE_DeleteTensorHandle(retvals[0]);
+  TFE_DeleteContext(ctx);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  float product[4] = {0};
+  EXPECT_EQ(sizeof(product), TF_TensorByteSize(t));
+  memcpy(&product[0], TF_TensorData(t), TF_TensorByteSize(t));
+  TF_DeleteTensor(t);
+  EXPECT_EQ(7, product[0]);
+  EXPECT_EQ(10, product[1]);
+  EXPECT_EQ(15, product[2]);
+  EXPECT_EQ(22, product[3]);
+  TF_DeleteStatus(status);
+}
+TEST(CAPI, ExecuteWithTracing) { ExecuteWithProfiling(false); }
+TEST(CAPI, ExecuteWithTracingAsync) { ExecuteWithProfiling(true); }
+
+TEST(CAPI, MultipleProfilerSession) {
+  TF_Status* status = TF_NewStatus();
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_ContextOptionsSetAsync(opts, static_cast<unsigned char>(false));
+  TFE_Context* ctx = TFE_NewContext(opts, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteContextOptions(opts);
+
+  TFE_ProfilerContext* profiler_context = TFE_NewProfilerContext();
+  TFE_ProfilerContextSetEagerContext(profiler_context, ctx);
+
+  TFE_Profiler* profiler1 = TFE_NewProfiler(profiler_context);
+  EXPECT_TRUE(TFE_ProfilerIsOk(profiler1));
+
+  TFE_Profiler* profiler2 = TFE_NewProfiler(profiler_context);
+  EXPECT_FALSE(TFE_ProfilerIsOk(profiler2));
+
+  TFE_DeleteProfiler(profiler1);
+  TFE_DeleteProfiler(profiler2);
+  TFE_DeleteProfilerContext(profiler_context);
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h
index 67bc1bcd24605f8363d6a7c8d5d6a0836a42fc82..a563e4b8f50f2a90497736f4cb9ca234400bfa04 100644
--- a/tensorflow/c/eager/c_api_internal.h
+++ b/tensorflow/c/eager/c_api_internal.h
@@ -52,6 +52,7 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/profiler/lib/profiler_session.h"
 #include "tensorflow/core/public/version.h"
 
 struct TFE_ContextOptions {
@@ -82,6 +83,12 @@ struct TFE_TensorHandle {
   TFE_TensorHandle(tensorflow::TensorHandle* handle) : handle(handle) {}
 
   tensorflow::TensorHandle* handle;
+
+  // Create a symbolic tensor.
+  TFE_TensorHandle(TF_Output t, TF_DataType dtype)
+      : handle(new tensorflow::TensorHandle(
+            tensorflow::OutputGraphNode{t.oper, t.index},
+            static_cast<tensorflow::DataType>(dtype))) {}
 };
 
 struct TFE_TensorDebugInfo {
@@ -100,6 +107,18 @@ struct TFE_Op {
   tensorflow::EagerOperation operation;
 };
 
+struct TFE_ProfilerContext {
+  tensorflow::ProfilerContext profiler_context;
+};
+
+struct TFE_Profiler {
+  TFE_Profiler(TFE_ProfilerContext* ctx) {
+    profiler = tensorflow::ProfilerSession::Create(&ctx->profiler_context);
+  }
+
+  std::unique_ptr<tensorflow::ProfilerSession> profiler;
+};
+
 namespace tensorflow {
 // Set an AttrValue on the op. Doesn't handle the list types.
 void SetOpAttrValueScalar(TFE_Context* ctx, TFE_Op* op,
@@ -107,4 +126,24 @@ void SetOpAttrValueScalar(TFE_Context* ctx, TFE_Op* op,
                           const char* attr_name, TF_Status* status);
 }  // namespace tensorflow
 
+struct TFE_TraceContext {
+  TF_Graph* const graph;
+
+  unsigned int node_counter = 0;
+  // Each tensor handle will have its ref count incremented when it's added as a
+  // map key, and decremented when this object is destroyed.
+  std::map<tensorflow::TensorHandle*, TF_Output> input_tensor_map;
+  std::vector<std::pair<tensorflow::TensorHandle*, TF_Output>>* input_tensors =
+      nullptr;
+
+  TFE_TraceContext(TF_Graph* graph) : graph(graph) {}
+
+  ~TFE_TraceContext() {
+    delete input_tensors;
+    for (auto input : input_tensor_map) {
+      input.first->Unref();
+    }
+  }
+};
+
 #endif  // TENSORFLOW_C_EAGER_C_API_INTERNAL_H_
diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc
index 6b39b79ee82f9c7baaf856e573a42b7da65691e5..3d1ca4fb4b561a03ea9d879b1876fb1fd08a3139 100644
--- a/tensorflow/c/eager/c_api_test.cc
+++ b/tensorflow/c/eager/c_api_test.cc
@@ -175,13 +175,8 @@ void TestRemoteExecute(bool async) {
   TFE_Execute(matmul, &retvals[0], &num_retvals, status);
   EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
 
-  auto* retval_task0 = TFE_TensorHandleCopyToDevice(
-      retvals[0], ctx, "/job:localhost/replica:0/task:0/device:CPU:0", status);
-  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
-
-  TF_Tensor* t = TFE_TensorHandleResolve(retval_task0, status);
+  TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status);
   ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
-  TFE_DeleteTensorHandle(retval_task0);
   float product[4] = {0};
   EXPECT_EQ(sizeof(product), TF_TensorByteSize(t));
   memcpy(&product[0], TF_TensorData(t), TF_TensorByteSize(t));
diff --git a/tensorflow/c/env.cc b/tensorflow/c/env.cc
index 07b9e8b940c55caf62ae0b81b884bf313d335459..1c35ff9001d0ee1ab0fbae9e1bcc07116fab1065 100644
--- a/tensorflow/c/env.cc
+++ b/tensorflow/c/env.cc
@@ -159,3 +159,25 @@ TF_CAPI_EXPORT extern uint64_t TF_NowMicros(void) {
 TF_CAPI_EXPORT extern uint64_t TF_NowSeconds(void) {
   return ::tensorflow::Env::Default()->NowSeconds();
 }
+
+void TF_DefaultThreadOptions(TF_ThreadOptions* options) {
+  options->stack_size = 0;
+  options->guard_size = 0;
+  options->numa_node = -1;
+}
+
+TF_Thread* TF_StartThread(const TF_ThreadOptions* options,
+                          const char* thread_name, void (*work_func)(void*),
+                          void* param) {
+  ::tensorflow::ThreadOptions cc_options;
+  cc_options.stack_size = options->stack_size;
+  cc_options.guard_size = options->guard_size;
+  cc_options.numa_node = options->numa_node;
+  return reinterpret_cast<TF_Thread*>(::tensorflow::Env::Default()->StartThread(
+      cc_options, thread_name, [=]() { (*work_func)(param); }));
+}
+
+void TF_JoinThread(TF_Thread* thread) {
+  // ::tensorflow::Thread joins on destruction
+  delete reinterpret_cast<::tensorflow::Thread*>(thread);
+}
diff --git a/tensorflow/c/env.h b/tensorflow/c/env.h
index 9d27c5da37735042c7476b591e57486dbde33152..73078fcbbc5ae4c042f4a992655072a838e42915 100644
--- a/tensorflow/c/env.h
+++ b/tensorflow/c/env.h
@@ -13,6 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
 #ifndef TENSORFLOW_C_ENV_H_
 #define TENSORFLOW_C_ENV_H_
 
@@ -21,13 +25,14 @@ limitations under the License.
 // --------------------------------------------------------------------------
 // C API for tensorflow::Env.
 
-struct TF_WritableFileHandle;
-struct TF_StringStream;
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+typedef struct TF_WritableFileHandle TF_WritableFileHandle;
+typedef struct TF_StringStream TF_StringStream;
+typedef struct TF_Thread TF_Thread;
+
 typedef struct TF_FileStatistics {
   // The length of the file in bytes.
   int64_t length;
@@ -37,6 +42,20 @@ typedef struct TF_FileStatistics {
   bool is_directory;
 } TF_FileStatistics;
 
+typedef struct TF_ThreadOptions {
+  // Thread stack size to use (in bytes), zero implies that the system default
+  // will be used.
+  size_t stack_size;
+
+  // Guard area size to use near thread stacks to use (in bytes), zero implies
+  // that the system default will be used.
+  size_t guard_size;
+
+  // The NUMA node to use, -1 implies that there should be no NUMA affinity for
+  // this thread.
+  int numa_node;
+} TF_ThreadOptions;
+
 // Creates the specified directory. Typical status code are:
 //  * TF_OK - successfully created the directory
 //  * TF_ALREADY_EXISTS - directory already exists
@@ -150,6 +169,25 @@ TF_CAPI_EXPORT extern uint64_t TF_NowMicros(void);
 // Returns the number of seconds since the Unix epoch.
 TF_CAPI_EXPORT extern uint64_t TF_NowSeconds(void);
 
+// Populates a TF_ThreadOptions struct with system-default values.
+TF_CAPI_EXPORT extern void TF_DefaultThreadOptions(TF_ThreadOptions* options);
+
+// Returns a new thread that is running work_func and is identified
+// (for debugging/performance-analysis) by thread_name.
+//
+// The given param (which may be null) is passed to work_func when the thread
+// starts. In this way, data may be passed from the thread back to the caller.
+//
+// Caller takes ownership of the result and must call TF_JoinThread on it
+// eventually.
+TF_CAPI_EXPORT extern TF_Thread* TF_StartThread(const TF_ThreadOptions* options,
+                                                const char* thread_name,
+                                                void (*work_func)(void*),
+                                                void* param);
+
+// Waits for the given thread to finish execution, then deletes it.
+TF_CAPI_EXPORT extern void TF_JoinThread(TF_Thread* thread);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/tensorflow/c/env_test.cc b/tensorflow/c/env_test.cc
index e2206c6befd2167346c64032940d6e8c631e4a3e..687ad024137352662759ec1f43df87e89faca353 100644
--- a/tensorflow/c/env_test.cc
+++ b/tensorflow/c/env_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "tensorflow/c/c_api.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -98,3 +99,29 @@ TEST(TestEnv, TestTimeFunctions) {
   ASSERT_GE(TF_NowMicros(), 946684800 * 1e6);
   ASSERT_GE(TF_NowNanos(), 946684800 * 1e9);
 }
+
+namespace {
+
+struct SomeThreadData {
+  ::tensorflow::mutex mu;
+  bool did_work = false;
+};
+
+void SomeThreadFunc(void* data) {
+  auto* real_data = static_cast<SomeThreadData*>(data);
+  ::tensorflow::mutex_lock l(real_data->mu);
+  real_data->did_work = true;
+}
+
+}  // namespace
+
+TEST(TestEnv, TestThreads) {
+  TF_ThreadOptions options;
+  TF_DefaultThreadOptions(&options);
+  SomeThreadData data;
+  TF_Thread* thread =
+      TF_StartThread(&options, "SomeThreadName", &SomeThreadFunc, &data);
+  TF_JoinThread(thread);
+  ::tensorflow::mutex_lock l(data.mu);
+  ASSERT_TRUE(data.did_work);
+}
diff --git a/tensorflow/c/kernels.cc b/tensorflow/c/kernels.cc
index 2a4eaecb6cf2740a522b1e849d1306ebde6c4577..71181ae430ab64106e2a75937bd54fbf2efc61ac 100644
--- a/tensorflow/c/kernels.cc
+++ b/tensorflow/c/kernels.cc
@@ -48,9 +48,10 @@ TF_KernelBuilder* TF_NewKernelBuilder(
 }
 
 void TF_DeleteKernelBuilder(TF_KernelBuilder* builder) {
-  DCHECK_NE(builder, nullptr);
-  delete builder->cc_builder;
-  delete builder;
+  if (builder != nullptr) {
+    delete builder->cc_builder;
+    delete builder;
+  }
 }
 
 namespace tensorflow {
@@ -158,3 +159,41 @@ void TF_SetOutput(TF_OpKernelContext* ctx, int i, const TF_Tensor* tensor,
     cc_ctx->set_output(i, cc_tensor);
   }
 }
+
+void TF_OpKernelConstruction_Failure(TF_OpKernelConstruction* ctx,
+                                     TF_Status* status) {
+  auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelConstruction*>(ctx);
+  ::tensorflow::Status s(::tensorflow::StatusFromTF_Status(status));
+  cc_ctx->CtxFailure(s);
+}
+
+void TF_OpKernelContext_Failure(TF_OpKernelContext* ctx, TF_Status* status) {
+  auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelContext*>(ctx);
+  ::tensorflow::Status s(::tensorflow::StatusFromTF_Status(status));
+  cc_ctx->CtxFailure(s);
+}
+
+#define DEFINE_TF_GETATTR(func, c_type, cc_type)                               \
+  void TF_OpKernelConstruction_GetAttr##func(TF_OpKernelConstruction* ctx,     \
+                                             const char* attr_name,            \
+                                             c_type* val, TF_Status* status) { \
+    TF_SetStatus(status, TF_OK, "");                                           \
+    cc_type v;                                                                 \
+    auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelConstruction*>(ctx); \
+    ::tensorflow::Status s = cc_ctx->GetAttr(attr_name, &v);                   \
+    ::tensorflow::Set_TF_Status_from_Status(status, s);                        \
+    if (s.ok()) {                                                              \
+      *val = static_cast<c_type>(v);                                           \
+    }                                                                          \
+  }
+
+DEFINE_TF_GETATTR(Type, TF_DataType, tensorflow::DataType)
+
+TF_DataType TF_ExpectedOutputDataType(TF_OpKernelContext* ctx, int i) {
+  auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelContext*>(ctx);
+  return static_cast<TF_DataType>(cc_ctx->expected_output_dtype(i));
+}
+
+int64_t TF_StepId(TF_OpKernelContext* ctx) {
+  return reinterpret_cast<::tensorflow::OpKernelContext*>(ctx)->step_id();
+}
diff --git a/tensorflow/c/kernels.h b/tensorflow/c/kernels.h
index 1a91aa184f11ac8e45b38a1d106c7b445747a7c1..c47bfa8aa3a721d422a0a1536b924f3e53793193 100644
--- a/tensorflow/c/kernels.h
+++ b/tensorflow/c/kernels.h
@@ -35,9 +35,9 @@ extern "C" {
 // `TF_RegisterKernelBuilder`, which will allow TF to construct user-provided
 // kernels when necessary.
 
-struct TF_KernelBuilder;
-struct TF_OpKernelConstruction;
-struct TF_OpKernelContext;
+typedef struct TF_KernelBuilder TF_KernelBuilder;
+typedef struct TF_OpKernelConstruction TF_OpKernelConstruction;
+typedef struct TF_OpKernelContext TF_OpKernelContext;
 
 // Allocates a new kernel builder and returns a pointer to it.
 //
@@ -111,6 +111,32 @@ TF_CAPI_EXPORT extern void TF_SetOutput(TF_OpKernelContext* ctx, int i,
                                         const TF_Tensor* tensor,
                                         TF_Status* status);
 
+// Notifies the given OpKernelConstruction that kernel construction has failed.
+TF_CAPI_EXPORT extern void TF_OpKernelConstruction_Failure(
+    TF_OpKernelConstruction* ctx, TF_Status* status);
+
+// Notifies the given OpKernelContext that the kernel's compute function has
+// failed.
+TF_CAPI_EXPORT extern void TF_OpKernelContext_Failure(TF_OpKernelContext* ctx,
+                                                      TF_Status* status);
+
+// Returns the expected output data type of the ith output. If i < 0 or
+// i >= TF_NumOutputs(ctx), the program aborts.
+TF_CAPI_EXPORT extern TF_DataType TF_ExpectedOutputDataType(
+    TF_OpKernelContext* ctx, int i);
+
+// Returns the step ID of the given context.
+TF_CAPI_EXPORT extern int64_t TF_StepId(TF_OpKernelContext* ctx);
+
+// Interprets the named kernel construction attribute as a TF_DataType and
+// places it into *val. *status is set to TF_OK.
+//
+// If the attribute could not be found or could not be interpreted as
+// TF_DataType, *status is populated with an error.
+TF_CAPI_EXPORT extern void TF_OpKernelConstruction_GetAttrType(
+    TF_OpKernelConstruction* ctx, const char* attr_name, TF_DataType* val,
+    TF_Status* status);
+
 #ifdef __cplusplus
 } /* end extern "C" */
 #endif
diff --git a/tensorflow/c/kernels_test.cc b/tensorflow/c/kernels_test.cc
index e659ee3c3d258a626ccf03a782ec031b5a703a48..608887722f7bca44c884a3426d5e378e9387a530 100644
--- a/tensorflow/c/kernels_test.cc
+++ b/tensorflow/c/kernels_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/c/kernels.h"
 
 #include "tensorflow/c/c_api.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/kernel_def.pb.h"
 #include "tensorflow/core/framework/node_def.pb_text.h"
 #include "tensorflow/core/framework/op.h"
@@ -35,12 +36,24 @@ static void* MyCreateFunc(TF_OpKernelConstruction* ctx) {
   struct MyCustomKernel* s = new struct MyCustomKernel;
   s->created = true;
   s->compute_called = false;
+
+  // Exercise attribute reads.
+  TF_DataType type;
+  TF_Status* status = TF_NewStatus();
+  TF_OpKernelConstruction_GetAttrType(ctx, "SomeDataTypeAttr", &type, status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status));
+  EXPECT_EQ(TF_FLOAT, type);
+  TF_DeleteStatus(status);
+
   return s;
 }
 
 static void MyComputeFunc(void* kernel, TF_OpKernelContext* ctx) {
   struct MyCustomKernel* s = static_cast<struct MyCustomKernel*>(kernel);
   s->compute_called = true;
+  if (ctx != nullptr) {
+    EXPECT_EQ(43, TF_StepId(ctx));
+  }
 }
 
 static void MyDeleteFunc(void* kernel) {
@@ -61,6 +74,11 @@ static std::unique_ptr<OpKernel> GetFakeKernel(const char* device_name,
   def.set_device(device_name);
   def.add_input("input1");
   def.add_input("input2");
+
+  AttrValue v;
+  v.set_type(DataType::DT_FLOAT);
+  (*def.mutable_attr())["SomeDataTypeAttr"] = v;
+
   return CreateOpKernel(DeviceType(device_name), nullptr, nullptr, def, 1,
                         status);
 }
@@ -75,7 +93,8 @@ TEST(TestKernel, TestRegisterKernelBuilder) {
   REGISTER_OP(op_name)
       .Input("input1: double")
       .Input("input2: uint8")
-      .Output("output1: uint8");
+      .Output("output1: uint8")
+      .Attr("SomeDataTypeAttr: type");
 
   TF_KernelBuilder* builder = TF_NewKernelBuilder(
       op_name, device_name, &MyCreateFunc, &MyComputeFunc, &MyDeleteFunc);
@@ -126,7 +145,8 @@ TEST(TestKernel, TestInputAndOutputCount) {
   REGISTER_OP(op_name)
       .Input("input1: double")
       .Input("input2: uint8")
-      .Output("output1: uint8");
+      .Output("output1: uint8")
+      .Attr("SomeDataTypeAttr: type");
 
   static int num_inputs = 0;
   static int num_outputs = 0;
@@ -155,6 +175,8 @@ TEST(TestKernel, TestInputAndOutputCount) {
     TF_SetOutput(ctx, 24, input, s);
     EXPECT_EQ(TF_OUT_OF_RANGE, TF_GetCode(s));
 
+    EXPECT_EQ(TF_UINT8, TF_ExpectedOutputDataType(ctx, 0));
+
     TF_DeleteStatus(s);
     if (input != nullptr) {
       TF_DeleteTensor(input);
@@ -175,6 +197,7 @@ TEST(TestKernel, TestInputAndOutputCount) {
     OpKernelContext::Params p;
     DummyDevice dummy_device(nullptr, false);
     p.device = &dummy_device;
+    p.step_id = 43;
 
     Tensor t(tensorflow::uint8(123));
 
@@ -200,4 +223,8 @@ TEST(TestKernel, TestInputAndOutputCount) {
   }
 }
 
+TEST(TestKernel, DeleteKernelBuilderIsOkOnNull) {
+  TF_DeleteKernelBuilder(nullptr);
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD
index a09becc49b10d2c58f98fbcc11df5190f794c1d4..4c4d587fce04d101b3cc8faebcc3ba04f2f1d0cf 100644
--- a/tensorflow/cc/BUILD
+++ b/tensorflow/cc/BUILD
@@ -150,6 +150,7 @@ cc_library_with_android_deps(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:ops",
         "//tensorflow/core:protos_all_cc",
     ],
 )
@@ -586,6 +587,25 @@ tf_gen_op_wrappers_cc(
     pkg = "//tensorflow/core",
 )
 
+tf_gen_op_wrappers_cc(
+    name = "tpu_ops",
+    include_internal_ops = 1,
+    op_lib_names = [
+        "tpu_configuration_ops",
+        "tpu_cross_replica_ops",
+        "tpu_embedding_ops",
+        "tpu_functional_ops",
+        "tpu_heartbeat_ops",
+        "tpu_host_compute_ops",
+        "tpu_infeed_ops",
+        "tpu_outfeed_ops",
+        "tpu_ordinal_selector_ops",
+        "tpu_replication_ops",
+    ],
+    pkg = "//tensorflow/core",
+    visibility = ["//tensorflow:internal"],
+)
+
 cc_library_with_android_deps(
     name = "cc_op_gen_main",
     srcs = [
diff --git a/tensorflow/cc/framework/cc_op_gen.cc b/tensorflow/cc/framework/cc_op_gen.cc
index 39593370d1c243e84dc5b6091724d1d404c102b0..43a33cbea6e1e4a50f61cc7d6d8d70cac6a603d2 100644
--- a/tensorflow/cc/framework/cc_op_gen.cc
+++ b/tensorflow/cc/framework/cc_op_gen.cc
@@ -321,6 +321,7 @@ std::pair<const char*, bool> AttrTypeName(StringPiece attr_type) {
           {"tensor", {"TensorProto", true}},
           {"list(tensor)", {"gtl::ArraySlice<TensorProto>", true}},
           {"func", {"NameAttrList", true}},
+          {"list(func)", {"gtl::ArraySlice<NameAttrList>", true}},
       };
 
   auto entry = attr_type_map->find(attr_type);
diff --git a/tensorflow/cc/framework/gradients.cc b/tensorflow/cc/framework/gradients.cc
index affd90b1bcc7cb4a8b3ffed6aeeb4bd480f5e314..a7e645e8b556f14f0c7a51d2eba6ab1e2256b837 100644
--- a/tensorflow/cc/framework/gradients.cc
+++ b/tensorflow/cc/framework/gradients.cc
@@ -96,7 +96,7 @@ class SymbolicGradientBuilder {
   // Used to identify nodes at which to stop backprop.
   std::unordered_set<int> GetStopBackpropNodes(
       const std::vector<bool>& reachable_nodes,
-      std::unordered_set<int> output_nodes);
+      const std::unordered_set<int>& output_nodes);
 
   const Scope& scope_;
   const ops::GradOpRegistry* registry_;
@@ -167,7 +167,6 @@ Status SymbolicGradientBuilder::BackpropAlongEdge(const Output& dst_grad,
 std::vector<bool> SymbolicGradientBuilder::GetReachableNodes() {
   std::vector<bool> reachable_nodes(scope_.graph()->num_node_ids(), false);
   std::deque<Node*> queue;
-  std::vector<bool> visited(scope_.graph()->num_node_ids(), false);
   for (const Output& out : outputs_) {
     if (!reachable_nodes[out.node()->id()]) {
       queue.push_back(out.node());
@@ -180,10 +179,10 @@ std::vector<bool> SymbolicGradientBuilder::GetReachableNodes() {
     queue.pop_front();
     for (const Edge* e : n->in_edges()) {
       if (e->IsControlEdge()) continue;
-      if (visited[e->src()->id()]) continue;
-      queue.push_back(e->src());
-      reachable_nodes[e->src()->id()] = true;
-      visited[e->src()->id()] = true;
+      if (!reachable_nodes[e->src()->id()]) {
+        queue.push_back(e->src());
+        reachable_nodes[e->src()->id()] = true;
+      }
     }
   }
   return reachable_nodes;
@@ -191,7 +190,7 @@ std::vector<bool> SymbolicGradientBuilder::GetReachableNodes() {
 
 std::unordered_set<int> SymbolicGradientBuilder::GetStopBackpropNodes(
     const std::vector<bool>& reachable_nodes,
-    std::unordered_set<int> output_nodes) {
+    const std::unordered_set<int>& output_nodes) {
   // Output nodes that get transitively consumed by other `outputs_` are stored
   // in `internal_outputs`.
   std::unordered_set<int> internal_outputs;
diff --git a/tensorflow/cc/gradients/image_grad.cc b/tensorflow/cc/gradients/image_grad.cc
index 882709e1e2817431a32c453fe0f35f2b2e6c69b0..05c287bdc62cdb8be7208ce3975f280aaa816766 100644
--- a/tensorflow/cc/gradients/image_grad.cc
+++ b/tensorflow/cc/gradients/image_grad.cc
@@ -69,6 +69,23 @@ Status ResizeBicubicGradHelper(const Scope& scope, const Operation& op,
 }
 REGISTER_GRADIENT_OP("ResizeBicubic", ResizeBicubicGradHelper);
 
+Status ScaleAndTranslateGradHelper(const Scope& scope, const Operation& op,
+                                   const std::vector<Output>& grad_inputs,
+                                   std::vector<Output>* grad_outputs) {
+  string kernel_type;
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(op.node()->attrs(), "kernel_type", &kernel_type));
+  grad_outputs->push_back(internal::ScaleAndTranslateGrad(
+      scope, grad_inputs[0], op.input(0), op.input(2), op.input(3),
+      internal::ScaleAndTranslateGrad::KernelType(kernel_type)));
+
+  grad_outputs->push_back(NoGradient());
+  grad_outputs->push_back(NoGradient());
+  grad_outputs->push_back(NoGradient());
+  return scope.status();
+}
+REGISTER_GRADIENT_OP("ScaleAndTranslate", ScaleAndTranslateGradHelper);
+
 }  // anonymous namespace
 }  // namespace ops
 }  // namespace tensorflow
diff --git a/tensorflow/cc/gradients/image_grad_test.cc b/tensorflow/cc/gradients/image_grad_test.cc
index 2e55c7561b030c50bd67bd53fd0d55710085c5d2..1d150226538093467e092e02f38090a327f9c9b6 100644
--- a/tensorflow/cc/gradients/image_grad_test.cc
+++ b/tensorflow/cc/gradients/image_grad_test.cc
@@ -30,6 +30,7 @@ using ops::Const;
 using ops::ResizeBicubic;
 using ops::ResizeBilinear;
 using ops::ResizeNearestNeighbor;
+using ops::ScaleAndTranslate;
 
 class ImageGradTest : public ::testing::Test {
  protected:
@@ -153,5 +154,45 @@ TEST_F(ImageGradTest, TestBicubic) {
   TestResize<double, float, double>(RESIZE_BICUBIC);
 }
 
+class ScaleAndTranslateGradTest : public ::testing::Test {
+ protected:
+  ScaleAndTranslateGradTest() : scope_(Scope::NewRootScope()) {}
+
+  template <typename T>
+  Tensor MakeData(const TensorShape& data_shape) {
+    DataType data_type = DataTypeToEnum<T>::v();
+    Tensor data(data_type, data_shape);
+    auto data_flat = data.flat<T>();
+    for (int i = 0; i < data_flat.size(); ++i) {
+      data_flat(i) = T(i);
+    }
+    return data;
+  }
+
+  template <typename T>
+  void MakeOp(const Tensor& x_data, const Input& y_shape, Output* x,
+              Output* y) {
+    *x = Const<T>(scope_, x_data);
+    *y = ScaleAndTranslate(scope_, *x, y_shape, {1.8f, 2.1f}, {0.5f, 0.7f});
+    TF_ASSERT_OK(scope_.status());
+  }
+
+  template <typename X_T, typename Y_T, typename JAC_T>
+  void TestResize() {
+    TensorShape x_shape({1, 2, 3, 1});
+    Tensor x_data = MakeData<X_T>(x_shape);
+    Output x, y;
+    MakeOp<X_T>(x_data, {4, 6}, &x, &y);
+    JAC_T max_error;
+    TF_ASSERT_OK((ComputeGradientError<X_T, Y_T, JAC_T>(
+        scope_, x, x_data, y, {1, 4, 6, 1}, &max_error)));
+    EXPECT_LT(max_error, 1e-3);
+  }
+
+  Scope scope_;
+};
+
+TEST_F(ScaleAndTranslateGradTest, Works) { TestResize<float, float, float>(); }
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/cc/profiler/BUILD b/tensorflow/cc/profiler/BUILD
index cf65fe1ab99b49207a64e86310178141b30d07d7..e9838d9aba6554b40082187057851e9c896f8352 100644
--- a/tensorflow/cc/profiler/BUILD
+++ b/tensorflow/cc/profiler/BUILD
@@ -10,7 +10,7 @@ tf_cuda_cc_test(
     name = "profiler_test",
     srcs = ["profiler_test.cc"],
     tags = [
-        "noguitar",  # b/77649654
+        "nogpu",  # b/77649654
     ],
     deps = [
         ":profiler",
diff --git a/tensorflow/cc/saved_model/BUILD b/tensorflow/cc/saved_model/BUILD
index 52345a376cc29ee47ccb9888c9bb26292468b5a9..dedd55f16afb879ea966dc89d14d88ee15d9e83e 100644
--- a/tensorflow/cc/saved_model/BUILD
+++ b/tensorflow/cc/saved_model/BUILD
@@ -81,6 +81,7 @@ cc_library(
     ] + if_not_mobile([
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:lib",
+        "//tensorflow/core:ops",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:tensorflow",
     ]) + if_android([
diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc
index 85d3dd01fa51b3c3ba6fcbf5faac03f1ff5630e2..66260fcf4a9b24f78d45010c6e86d4ee398b6d3d 100644
--- a/tensorflow/cc/saved_model/loader.cc
+++ b/tensorflow/cc/saved_model/loader.cc
@@ -21,11 +21,11 @@ limitations under the License.
 #include "tensorflow/cc/saved_model/reader.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/monitoring/counter.h"
+#include "tensorflow/core/lib/monitoring/sampler.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/protobuf_internal.h"
-#include "tensorflow/core/protobuf/saved_model.pb.h"
 #include "tensorflow/core/protobuf/saver.pb.h"
 #include "tensorflow/core/public/session.h"
 #include "tensorflow/core/public/session_options.h"
@@ -42,9 +42,28 @@ auto* load_latency = monitoring::Counter<1>::New(
     "/tensorflow/cc/saved_model/load_latency",
     "Latency in microseconds for SavedModels that were successfully loaded.",
     "model_path");
+auto* load_latency_by_stage = monitoring::Sampler<2>::New(
+    {
+        "/tensorflow/cc/saved_model/load_latency_by_stage",  // metric name
+        "Distribution of wall time spent (in microseconds) in each stage "
+        "(restore graph from disk, run init graph op, etc) when loading the "
+        "model",
+        "model_path",
+        "stage",
+    },
+    // Scale of 10, power of 1.8 with bucket count 33 (~20 minutes).
+    monitoring::Buckets::Exponential(10, 1.8, 33));
+
 constexpr char kLoadAttemptFail[] = "fail";
 constexpr char kLoadAttemptSuccess[] = "success";
 
+uint64 GetLatencyMicroseconds(const uint64 start_microseconds) {
+  const uint64 end_microseconds = Env::Default()->NowMicros();
+  // Avoid clock skew.
+  if (end_microseconds < start_microseconds) return 0;
+  return end_microseconds - start_microseconds;
+}
+
 Status LoadMetaGraphIntoSession(const MetaGraphDef& meta_graph_def,
                                 const SessionOptions& session_options,
                                 std::unique_ptr<Session>* session) {
@@ -242,6 +261,7 @@ Status LoadSavedModelInternal(const SessionOptions& session_options,
                               const string& export_dir,
                               const std::unordered_set<string>& tags,
                               SavedModelBundle* const bundle) {
+  const uint64 read_start_microseconds = Env::Default()->NowMicros();
   TF_RETURN_IF_ERROR(ReadMetaGraphDefFromSavedModel(export_dir, tags,
                                                     &bundle->meta_graph_def));
 
@@ -256,12 +276,23 @@ Status LoadSavedModelInternal(const SessionOptions& session_options,
                  bundle->meta_graph_def.saver_def().restore_op_name(),
                  bundle->meta_graph_def.saver_def().filename_tensor_name(),
                  asset_file_defs, bundle->session.get()));
+  // Record walltime spent in restoring graph from disk, but postpone metric
+  // increments until graph init finishes.
+  const uint64 restore_graph_walltime =
+      GetLatencyMicroseconds(read_start_microseconds);
+
+  const uint64 graph_init_start_microseconds = Env::Default()->NowMicros();
   string init_op_name;
   TF_RETURN_IF_ERROR(
       GetInitOp(export_dir, bundle->meta_graph_def, &init_op_name));
   TF_RETURN_IF_ERROR(RunInitOp(run_options, export_dir, bundle->meta_graph_def,
                                asset_file_defs, bundle->session.get(),
                                init_op_name));
+  load_latency_by_stage->GetCell(export_dir, "restore_graph")
+      ->Add(restore_graph_walltime);
+  // Record wall time spent in init op.
+  load_latency_by_stage->GetCell(export_dir, "init_graph")
+      ->Add(GetLatencyMicroseconds(graph_init_start_microseconds));
   return Status::OK();
 }
 
@@ -275,16 +306,10 @@ Status LoadSavedModel(const SessionOptions& session_options,
   const uint64 start_microseconds = Env::Default()->NowMicros();
   const Status status = LoadSavedModelInternal(session_options, run_options,
                                                export_dir, tags, bundle);
-  const uint64 load_latency_microsecs = [&]() -> uint64 {
-    const uint64 end_microseconds = Env::Default()->NowMicros();
-    // Avoid clock skew.
-    if (end_microseconds < start_microseconds) return 0;
-    return end_microseconds - start_microseconds;
-  }();
   auto log_and_count = [&](const string& status_str) {
     LOG(INFO) << "SavedModel load for tags { " << str_util::Join(tags, " ")
               << " }; Status: " << status_str << ". Took "
-              << load_latency_microsecs << " microseconds.";
+              << GetLatencyMicroseconds(start_microseconds) << " microseconds.";
     load_attempt_count->GetCell(export_dir, status_str)->IncrementBy(1);
   };
   if (status.ok()) {
@@ -292,7 +317,8 @@ Status LoadSavedModel(const SessionOptions& session_options,
   } else {
     log_and_count(kLoadAttemptFail);
   }
-  load_latency->GetCell(export_dir)->IncrementBy(load_latency_microsecs);
+  load_latency->GetCell(export_dir)
+      ->IncrementBy(GetLatencyMicroseconds(start_microseconds));
   return status;
 }
 
diff --git a/tensorflow/cc/tools/freeze_saved_model.cc b/tensorflow/cc/tools/freeze_saved_model.cc
index 23e9dc40d23899b9cef168c9128b6d8ed1be3ee9..eeb910178902ca883ed211379ba3f188c139f92e 100644
--- a/tensorflow/cc/tools/freeze_saved_model.cc
+++ b/tensorflow/cc/tools/freeze_saved_model.cc
@@ -124,7 +124,9 @@ Status GetVariableNameToTensorMap(
     return Status::OK();
   }
   std::vector<string> variable_names;
+  variable_names.reserve(variable_names_set.size());
   std::vector<string> tensor_names;
+  tensor_names.reserve(variable_names_set.size());
   for (const string& node_name : variable_names_set) {
     variable_names.push_back(node_name);
     NodeDef* node_def = name_to_node_map.at(node_name);
diff --git a/tensorflow/compat_template.__init__.py b/tensorflow/compat_template.__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2cf68c9cd8396987899b4f34f21b994b4722ead4
--- /dev/null
+++ b/tensorflow/compat_template.__init__.py
@@ -0,0 +1,56 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Bring in all of the public TensorFlow interface into this module."""
+
+from __future__ import absolute_import as _absolute_import
+from __future__ import division as _division
+from __future__ import print_function as _print_function
+
+import os as _os
+import sys as _sys
+
+# pylint: disable=g-bad-import-order
+
+# API IMPORTS PLACEHOLDER
+
+from tensorflow.python.tools import component_api_helper as _component_api_helper
+_component_api_helper.package_hook(
+    parent_package_str=__name__,
+    child_package_str=('tensorboard.summary._tf.summary'),
+    error_msg=(
+        "Limited tf.compat.v2.summary API due to missing TensorBoard "
+        "installation"))
+_component_api_helper.package_hook(
+    parent_package_str=__name__,
+    child_package_str=(
+        'tensorflow_estimator.python.estimator.api._v2.estimator'))
+_component_api_helper.package_hook(
+    parent_package_str=__name__,
+    child_package_str=('tensorflow.python.keras.api._v2.keras'))
+
+# We would like the following to work for fully enabling 2.0 in a 1.0 install:
+#
+# import tensorflow.compat.v2 as tf
+# tf.enable_v2_behavior()
+#
+# This make this one symbol available directly.
+from tensorflow.python.compat.v2_compat import enable_v2_behavior  # pylint: disable=g-import-not-at-top
+
+# Add module aliases
+_current_module = _sys.modules[__name__]
+if hasattr(_current_module, 'keras'):
+  losses = keras.losses
+  metrics = keras.metrics
+  optimizers = keras.optimizers
diff --git a/tensorflow/compat_template_v1.__init__.py b/tensorflow/compat_template_v1.__init__.py
index 7df80ec01245a7fe820c79d5879458c4cd0a93cb..9549a71c41a0ba2aac58abd8cfb182aa4eaf3b4f 100644
--- a/tensorflow/compat_template_v1.__init__.py
+++ b/tensorflow/compat_template_v1.__init__.py
@@ -23,12 +23,15 @@ import os as _os
 # pylint: disable=g-bad-import-order
 from tensorflow.python import pywrap_tensorflow  # pylint: disable=unused-import
 
+# API IMPORTS PLACEHOLDER
+
 from tensorflow.python.tools import component_api_helper as _component_api_helper
 _component_api_helper.package_hook(
     parent_package_str=__name__,
-    child_package_str=('tensorflow_estimator.python.estimator.api.estimator'))
-
-# API IMPORTS PLACEHOLDER
-
+    child_package_str=(
+        'tensorflow_estimator.python.estimator.api._v1.estimator'))
+_component_api_helper.package_hook(
+    parent_package_str=__name__,
+    child_package_str=('tensorflow.python.keras.api._v1.keras'))
 from tensorflow.python.platform import flags  # pylint: disable=g-import-not-at-top
 app.flags = flags  # pylint: disable=undefined-variable
diff --git a/tensorflow/compiler/aot/BUILD b/tensorflow/compiler/aot/BUILD
index 16151e77737429f4fbf690fc34b12a70bacebdc4..af016bf80e7a10d8729a1eb385466af48b5810cd 100644
--- a/tensorflow/compiler/aot/BUILD
+++ b/tensorflow/compiler/aot/BUILD
@@ -30,6 +30,7 @@ cc_library(
         "flags.h",
     ],
     deps = [
+        ":aot_only_var_handle_op",
         ":embedded_protocol_buffers",
         "//tensorflow/compiler/tf2xla",
         "//tensorflow/compiler/tf2xla:cpu_function_runtime",
@@ -71,6 +72,7 @@ tf_cc_test(
         ":tfcompile_lib",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "@com_google_absl//absl/strings",
@@ -205,6 +207,15 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "aot_only_var_handle_op",
+    srcs = ["aot_only_var_handle_op.cc"],
+    deps = [
+        "//tensorflow/compiler/tf2xla:xla_compiler",
+    ],
+    alwayslink = 1,
+)
+
 tf_cc_test(
     name = "benchmark_test",
     srcs = ["benchmark_test.cc"],
diff --git a/tensorflow/compiler/aot/aot_only_var_handle_op.cc b/tensorflow/compiler/aot/aot_only_var_handle_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0ce36a979f424610a5aa952afa8db2245ed971a9
--- /dev/null
+++ b/tensorflow/compiler/aot/aot_only_var_handle_op.cc
@@ -0,0 +1,56 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/xla_context.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+
+namespace tensorflow {
+namespace {
+
+// Implementation of varhandle that binds a VarHandleOp to an XlaResource of the
+// same name. It is not safe to use this op in a JIT context.
+class XlaAotOnlyVarHandleOp : public XlaOpKernel {
+ public:
+  explicit XlaAotOnlyVarHandleOp(OpKernelConstruction* c);
+  void Compile(XlaOpKernelContext* context) override;
+
+ private:
+  string name_;
+};
+
+XlaAotOnlyVarHandleOp::XlaAotOnlyVarHandleOp(OpKernelConstruction* c)
+    : XlaOpKernel(c) {
+  OP_REQUIRES_OK(c, c->GetAttr("shared_name", &name_));
+}
+
+void XlaAotOnlyVarHandleOp::Compile(XlaOpKernelContext* context) {
+  // Look for a resource of the same name. TF also keys that on the container
+  // and type attributes, but that doesn't seem necessary.
+  for (const auto& resource : context->xla_context()->resources()) {
+    if (resource->kind() == XlaResource::kVariable &&
+        resource->name() == name_) {
+      context->SetResourceOutput(0, resource.get());
+      return;
+    }
+  }
+  context->SetStatus(
+      errors::InvalidArgument("Variable: ", name_, " not configured"));
+}
+}  // namespace
+
+REGISTER_XLA_OP(Name("VarHandleOp").CompilationOnly(), XlaAotOnlyVarHandleOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/aot/codegen.cc b/tensorflow/compiler/aot/codegen.cc
index ab1c1be344e2257721507543bc7647d4ff4becb2..da0598736a7d6b7f55458d76ca30fa6ad46a74f9 100644
--- a/tensorflow/compiler/aot/codegen.cc
+++ b/tensorflow/compiler/aot/codegen.cc
@@ -129,7 +129,7 @@ Status AddRewritesForShape(int i, const xla::Shape& shape,
   TF_RETURN_IF_ERROR(XLATypeToCpp(shape.element_type(), &type));
   std::vector<string> dim_vars;
   string dim_sizes, indices;
-  if (xla::ShapeUtil::Rank(shape) == 0 ||
+  if (shape.rank() == 0 ||
       (shape.dimensions_size() == 1 && shape.dimensions(0) == 1)) {
     dim_sizes = "[1]";
     indices = "[0]";
@@ -168,12 +168,12 @@ Status GenArgMethods(const tf2xla::Config& config,
                      const xla::ProgramShapeProto& ps,
                      const CompileResult& compile_result, string* methods) {
   size_t num_args = ps.parameters_size();
-  if (config.feed_size() != num_args) {
-    return errors::InvalidArgument("mismatch between feed_size(",
-                                   config.feed_size(), ") and num_args(",
-                                   num_args, ")");
+  if (config.feed_size() + config.variable_size() != num_args) {
+    return errors::InvalidArgument(
+        "mismatch between feed_size(", config.feed_size(), ")+variable_size(",
+        config.variable_size(), ") and num_args(", num_args, ")");
   }
-  for (int i = 0; i < num_args; ++i) {
+  for (int i = 0; i < config.feed_size(); ++i) {
     std::vector<std::pair<string, string>> rewrites;
     TF_RETURN_IF_ERROR(
         AddRewritesForShape(i, xla::Shape(ps.parameters(i)), &rewrites));
@@ -212,12 +212,14 @@ Status GenResultMethods(const tf2xla::Config& config,
     // tuple result, and we rely on this to simplify code generation.
     return errors::Internal("codegen requires the XLA result to be a tuple");
   }
-  if (config.fetch_size() != ps.result().tuple_shapes_size()) {
+  size_t num_results = ps.result().tuple_shapes_size();
+  if (config.fetch_size() + config.variable_size() != num_results) {
     return errors::InvalidArgument("mismatch between fetch_size(",
-                                   config.feed_size(), ") and tuple_size(",
+                                   config.fetch_size(), ")+variable_size(",
+                                   config.variable_size(), ") and tuple_size(",
                                    ps.result().tuple_shapes_size(), ")");
   }
-  for (int i = 0; i < ps.result().tuple_shapes_size(); ++i) {
+  for (int i = 0; i < config.fetch_size(); ++i) {
     std::vector<std::pair<string, string>> rewrites;
     TF_RETURN_IF_ERROR(AddRewritesForShape(
         i, xla::Shape(ps.result().tuple_shapes(i)), &rewrites));
@@ -245,6 +247,51 @@ Status GenResultMethods(const tf2xla::Config& config,
   return Status::OK();
 }
 
+// Generate methods for variables.
+Status GenVariableMethods(const tf2xla::Config& config,
+                          const xla::ProgramShapeProto& ps, string* methods) {
+  size_t num_args = ps.parameters_size();
+  for (int i = config.feed_size(); i < num_args; ++i) {
+    std::vector<std::pair<string, string>> rewrites;
+    TF_RETURN_IF_ERROR(
+        AddRewritesForShape(i, xla::Shape(ps.parameters(i)), &rewrites));
+    const string code = R"(
+  void set_var_{{NAME}}_input_data({{TYPE}}* data) {
+    set_arg_data({{I}}, data);
+  }
+)";
+    const tf2xla::Variable& var = config.variable(i - config.feed_size());
+    *methods += RewriteWithName(
+        var.name().empty() ? var.node_name() : var.name(), code, rewrites);
+  }
+  size_t num_results = ps.result().tuple_shapes_size();
+  for (int i = config.fetch_size(); i < num_results; ++i) {
+    std::vector<std::pair<string, string>> rewrites;
+    TF_RETURN_IF_ERROR(AddRewritesForShape(
+        i, xla::Shape(ps.result().tuple_shapes(i)), &rewrites));
+    string code = R"(
+  {{TYPE}}* var_{{NAME}}_result_data() {
+    return static_cast<{{TYPE}}*>(result_data({{I}}));
+  }
+  {{TYPE}}& var_{{NAME}}_result({{DIM_VARS}}) {
+    return (*static_cast<{{TYPE}}(*){{DIM_SIZES}}>(
+        result_data({{I}}))){{INDICES}};
+  }
+  const {{TYPE}}* var_{{NAME}}_result_data() const {
+    return static_cast<const {{TYPE}}*>(result_data({{I}}));
+  }
+  const {{TYPE}}& var_{{NAME}}_result({{DIM_VARS}}) const {
+    return (*static_cast<const {{TYPE}}(*){{DIM_SIZES}}>(
+        result_data({{I}}))){{INDICES}};
+  }
+)";
+    const tf2xla::Variable& var = config.variable(i - config.fetch_size());
+    *methods += RewriteWithName(
+        var.name().empty() ? var.node_name() : var.name(), code, rewrites);
+  }
+  return Status::OK();
+}
+
 // Generates code implementing {Arg,Result}Names(), where T is one of
 // tf2xla::{Feed,Fetch}. Each feed or fetch name results in a C-style string
 // literal in the array, with nullptr terminating the array.
@@ -291,6 +338,14 @@ Status ValidateFeedFetchCppNames(const tf2xla::Config& config) {
       TF_RETURN_IF_ERROR(ValidateCppIdent(fetch.name(), "fetch name"));
     }
   }
+  for (const tf2xla::Variable& variable : config.variable()) {
+    if (!variable.name().empty()) {
+      TF_RETURN_IF_ERROR(ValidateCppIdent(variable.name(), "variable name"));
+    } else {
+      TF_RETURN_IF_ERROR(
+          ValidateCppIdent(variable.node_name(), "variable name"));
+    }
+  }
   return Status::OK();
 }
 
@@ -339,9 +394,10 @@ Status GenerateHeader(const CodegenOpts& opts, const tf2xla::Config& config,
   std::vector<BufferInfo> buffer_infos_for_temps =
       ExtractTempBufferInfos(buffer_infos);
   const xla::ProgramShapeProto& ps = compile_result.program_shape;
-  string methods_arg, methods_result;
+  string methods_arg, methods_result, methods_variable;
   TF_RETURN_IF_ERROR(GenArgMethods(config, ps, compile_result, &methods_arg));
   TF_RETURN_IF_ERROR(GenResultMethods(config, ps, &methods_result));
+  TF_RETURN_IF_ERROR(GenVariableMethods(config, ps, &methods_variable));
   const size_t arg_bytes_aligned = cpu_function_runtime::AlignedBufferBytes(
       buffer_infos_for_args.data(), buffer_infos_for_args.size(),
       /*allocate_entry_params=*/true);
@@ -384,8 +440,9 @@ Status GenerateHeader(const CodegenOpts& opts, const tf2xla::Config& config,
   // calling HloProfilePrinter::profile_counters_size.
   const string assign_profile_counters_size =
       opts.gen_hlo_profile_printer_data
-          ? "data->set_profile_counters_size("
-            "data->hlo_profile_printer_data()->profile_counters_size());"
+          ? "set_static_data_profile_counters_size(data, "
+            "get_static_data_hlo_profile_printer_data(data)->"
+            "profile_counters_size());"
           : "";
 
   // Use a poor-man's text templating mechanism; first populate the full header
@@ -449,7 +506,7 @@ extern "C" void {{ENTRY}}(
 //   arg bytes aligned:  {{ARG_BYTES_ALIGNED}}
 //   temp bytes total:   {{TEMP_BYTES_TOTAL}}
 //   temp bytes aligned: {{TEMP_BYTES_ALIGNED}}
-class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction {
+class {{CLASS}} final : public tensorflow::XlaCompiledCpuFunction {
  public:
   // Number of input arguments for the compiled computation.
   static constexpr size_t kNumArgs = {{ARG_NUM}};
@@ -464,16 +521,17 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction {
     static XlaCompiledCpuFunction::StaticData* kStaticData = [](){
       XlaCompiledCpuFunction::StaticData* data =
         new XlaCompiledCpuFunction::StaticData;
-      data->set_raw_function({{ENTRY}});
-      data->set_buffer_infos(BufferInfos());
-      data->set_num_buffers(kNumBuffers);
-      data->set_arg_index_table(ArgIndexToBufferIndex());
-      data->set_num_args(kNumArgs);
-      data->set_result_index(kResultIndex);
-      data->set_arg_names(StaticArgNames());
-      data->set_result_names(StaticResultNames());
-      data->set_program_shape(StaticProgramShape());
-      data->set_hlo_profile_printer_data(StaticHloProfilePrinterData());
+      set_static_data_raw_function(data, {{ENTRY}});
+      set_static_data_buffer_infos(data, BufferInfos());
+      set_static_data_num_buffers(data, kNumBuffers);
+      set_static_data_arg_index_table(data, ArgIndexToBufferIndex());
+      set_static_data_num_args(data, kNumArgs);
+      set_static_data_result_index(data, kResultIndex);
+      set_static_data_arg_names(data, StaticArgNames());
+      set_static_data_result_names(data, StaticResultNames());
+      set_static_data_program_shape(data, StaticProgramShape());
+      set_static_data_hlo_profile_printer_data(
+          data, StaticHloProfilePrinterData());
 {{ASSIGN_PROFILE_COUNTERS_SIZE}}
       return data;
     }();
@@ -521,6 +579,21 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction {
   // buffers are managed internally, and may change after each call to Run.
 {{METHODS_RESULT}}
 
+  // Methods for managing variable buffers. Buffers are in row-major order. The
+  // input and output buffers may or may not be identical.
+  //
+  // void set_var_X_data(T* data)
+  //   Sets the buffer for variable X.
+  //
+  // T* var_X_data()
+  //   Returns the buffer of type T for variable X.
+  //
+  // T& var_X(...dim indices...)
+  //   Returns a reference to the value of type T for variable X,
+  //   with dim indices specifying which value. No bounds checking is performed
+  //   on dim indices.
+{{METHODS_VARIABLE}}
+
  private:
   // Number of buffers for the compiled computation.
   static constexpr size_t kNumBuffers = {{NUM_BUFFERS}};
@@ -587,6 +660,7 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction {
        include_hlo_profile_printer_data_proto},
       {"{{METHODS_ARG}}\n", methods_arg},
       {"{{METHODS_RESULT}}\n", methods_result},
+      {"{{METHODS_VARIABLE}}\n", methods_variable},
       {"{{NS_END}}\n", ns_end},
       {"{{NS_START}}\n", ns_start},
       {"{{PROGRAM_SHAPE}}", xla::ShapeUtil::HumanString(xla::ProgramShape(ps))},
diff --git a/tensorflow/compiler/aot/codegen_test.cc b/tensorflow/compiler/aot/codegen_test.cc
index c1788ca32a1d099284eeb870f9513891051fd29e..5580e55b691bd10698b63d86bc0194b25da743b9 100644
--- a/tensorflow/compiler/aot/codegen_test.cc
+++ b/tensorflow/compiler/aot/codegen_test.cc
@@ -22,6 +22,8 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "llvm/Support/TargetSelect.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/io/path.h"
@@ -172,6 +174,15 @@ TEST(CodegenTest, Golden) {
   tf2xla::Fetch* fetch = config.add_fetch();
   fetch->mutable_id()->set_node_name("fetch0");
   fetch->set_name("myfetch");
+  tf2xla::Variable* variable = config.add_variable();
+  variable->set_node_name("myvar");
+  variable->mutable_shape()->add_dim()->set_size(1);
+  variable->set_type(DT_FLOAT);
+  tf2xla::Variable* variable2 = config.add_variable();
+  variable2->set_node_name("my/var");
+  variable2->set_name("myvar2");
+  variable2->mutable_shape()->add_dim()->set_size(5);
+  variable2->set_type(DT_INT32);
   CompileResult compile_result;
   compile_result.aot.reset(new xla::cpu::CpuAotCompilationResult(
       {},
@@ -186,9 +197,14 @@ TEST(CodegenTest, Golden) {
           {
               xla::ShapeUtil::MakeShape(xla::F32, {1, 2}),
               xla::ShapeUtil::MakeShape(xla::S64, {3, 4}),
+              xla::ShapeUtil::MakeShape(xla::F32, {1}),
+              xla::ShapeUtil::MakeShape(xla::S32, {5}),
           },
-          xla::ShapeUtil::MakeTupleShape(
-              {xla::ShapeUtil::MakeShape(xla::U32, {5, 6})}))
+          xla::ShapeUtil::MakeTupleShape({
+              xla::ShapeUtil::MakeShape(xla::U32, {5, 6}),
+              xla::ShapeUtil::MakeShape(xla::F32, {1}),
+              xla::ShapeUtil::MakeShape(xla::S32, {5}),
+          }))
           .ToProto();
   compile_result.entry_point = "entry_point";
   compile_result.pointer_size = 8;
diff --git a/tensorflow/compiler/aot/codegen_test_h.golden b/tensorflow/compiler/aot/codegen_test_h.golden
index 968afad65ed6d4b5510687df484b7ce6743f6a85..b5f33d690d492489e9090786cd341e035ae7ca15 100644
--- a/tensorflow/compiler/aot/codegen_test_h.golden
+++ b/tensorflow/compiler/aot/codegen_test_h.golden
@@ -52,14 +52,14 @@ namespace bar {
 //   is guaranteed that no thread may call a non-const method.
 //
 // The logical function signature is:
-//   ((unknown): f32[1,2], (unknown): s64[3,4]) -> (u32[5,6])
+//   ((unknown): f32[1,2], (unknown): s64[3,4], (unknown): f32[1], (unknown): s32[5]) -> (u32[5,6], f32[1], s32[5])
 //
 // Memory stats:
 //   arg bytes total:    104
 //   arg bytes aligned:  192
 //   temp bytes total:   126
 //   temp bytes aligned: 320
-class MyClass : public tensorflow::XlaCompiledCpuFunction {
+class MyClass final : public tensorflow::XlaCompiledCpuFunction {
  public:
   // Number of input arguments for the compiled computation.
   static constexpr size_t kNumArgs = 2;
@@ -74,16 +74,17 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction {
     static XlaCompiledCpuFunction::StaticData* kStaticData = [](){
       XlaCompiledCpuFunction::StaticData* data =
         new XlaCompiledCpuFunction::StaticData;
-      data->set_raw_function(entry_point);
-      data->set_buffer_infos(BufferInfos());
-      data->set_num_buffers(kNumBuffers);
-      data->set_arg_index_table(ArgIndexToBufferIndex());
-      data->set_num_args(kNumArgs);
-      data->set_result_index(kResultIndex);
-      data->set_arg_names(StaticArgNames());
-      data->set_result_names(StaticResultNames());
-      data->set_program_shape(StaticProgramShape());
-      data->set_hlo_profile_printer_data(StaticHloProfilePrinterData());
+      set_static_data_raw_function(data, entry_point);
+      set_static_data_buffer_infos(data, BufferInfos());
+      set_static_data_num_buffers(data, kNumBuffers);
+      set_static_data_arg_index_table(data, ArgIndexToBufferIndex());
+      set_static_data_num_args(data, kNumArgs);
+      set_static_data_result_index(data, kResultIndex);
+      set_static_data_arg_names(data, StaticArgNames());
+      set_static_data_result_names(data, StaticResultNames());
+      set_static_data_program_shape(data, StaticProgramShape());
+      set_static_data_hlo_profile_printer_data(
+          data, StaticHloProfilePrinterData());
 
       return data;
     }();
@@ -213,6 +214,58 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction {
         result_data(0)))[dim0][dim1];
   }
 
+  // Methods for managing variable buffers. Buffers are in row-major order. The
+  // input and output buffers may or may not be identical.
+  //
+  // void set_var_X_data(T* data)
+  //   Sets the buffer for variable X.
+  //
+  // T* var_X_data()
+  //   Returns the buffer of type T for variable X.
+  //
+  // T& var_X(...dim indices...)
+  //   Returns a reference to the value of type T for variable X,
+  //   with dim indices specifying which value. No bounds checking is performed
+  //   on dim indices.
+
+  void set_var_myvar_input_data(float* data) {
+    set_arg_data(2, data);
+  }
+
+  void set_var_myvar2_input_data(tensorflow::int32* data) {
+    set_arg_data(3, data);
+  }
+
+  float* var_myvar_result_data() {
+    return static_cast<float*>(result_data(1));
+  }
+  float& var_myvar_result() {
+    return (*static_cast<float(*)[1]>(
+        result_data(1)))[0];
+  }
+  const float* var_myvar_result_data() const {
+    return static_cast<const float*>(result_data(1));
+  }
+  const float& var_myvar_result() const {
+    return (*static_cast<const float(*)[1]>(
+        result_data(1)))[0];
+  }
+
+  tensorflow::int32* var_myvar2_result_data() {
+    return static_cast<tensorflow::int32*>(result_data(2));
+  }
+  tensorflow::int32& var_myvar2_result(size_t dim0) {
+    return (*static_cast<tensorflow::int32(*)[5]>(
+        result_data(2)))[dim0];
+  }
+  const tensorflow::int32* var_myvar2_result_data() const {
+    return static_cast<const tensorflow::int32*>(result_data(2));
+  }
+  const tensorflow::int32& var_myvar2_result(size_t dim0) const {
+    return (*static_cast<const tensorflow::int32(*)[5]>(
+        result_data(2)))[dim0];
+  }
+
  private:
   // Number of buffers for the compiled computation.
   static constexpr size_t kNumBuffers = 6;
@@ -256,7 +309,7 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction {
   static const xla::ProgramShapeProto* StaticProgramShape() {
     static const xla::ProgramShapeProto* kShape = []() {
     xla::ProgramShapeProto* proto = new xla::ProgramShapeProto;
-    proto->ParseFromArray(&__tfcompile_foo_bar_MyClass_ProgramShapeProto_protobuf_array_contents[0], 52);
+    proto->ParseFromArray(&__tfcompile_foo_bar_MyClass_ProgramShapeProto_protobuf_array_contents[0], 132);
     return proto;
   }();
     return kShape;
diff --git a/tensorflow/compiler/aot/codegen_test_o.golden b/tensorflow/compiler/aot/codegen_test_o.golden
index ce8e5ec8c96a2c3696f14b8eea206d648182ecb5..2884597abcf29583e6192296b0e4ce6825d7c01a 100644
Binary files a/tensorflow/compiler/aot/codegen_test_o.golden and b/tensorflow/compiler/aot/codegen_test_o.golden differ
diff --git a/tensorflow/compiler/aot/compile.cc b/tensorflow/compiler/aot/compile.cc
index 9fc223bdc7c0e207ce2005cb86250aa77e709df8..0e46a9f5e9d68fa2174f7bd9b9fa7c3a82dfb715 100644
--- a/tensorflow/compiler/aot/compile.cc
+++ b/tensorflow/compiler/aot/compile.cc
@@ -108,10 +108,13 @@ Status CompileGraph(const GraphDef& graph_def, const tf2xla::Config& config,
                         computation.Snapshot());
     // Serialize the HloSnapshot deterministically so that all the outputs of a
     // tf_library genrule are deterministic.
-    string proto;
-    TF_RET_CHECK(SerializeToStringDeterministic(*module, &proto));
+    const size_t size = module->ByteSizeLong();
+    auto serialized = absl::make_unique<char[]>(size);
+    TF_RET_CHECK(
+        SerializeToBufferDeterministic(*module, serialized.get(), size));
     TF_RETURN_IF_ERROR(
-        WriteStringToFile(Env::Default(), flags.out_session_module, proto));
+        WriteStringToFile(Env::Default(), flags.out_session_module,
+                          absl::string_view(serialized.get(), size)));
   }
   xla::cpu::CpuAotCompilationOptions aot_opts(
       flags.target_triple, flags.target_cpu, flags.target_features,
diff --git a/tensorflow/compiler/aot/tests/BUILD b/tensorflow/compiler/aot/tests/BUILD
index 10fa33ab5e84dcbc1629bee6214e8969046f19c2..444264ba6e1f59c33551796025ba845c62c02d43 100644
--- a/tensorflow/compiler/aot/tests/BUILD
+++ b/tensorflow/compiler/aot/tests/BUILD
@@ -69,6 +69,7 @@ genrule(
         "test_graph_tfmatmulandadd.pb",
         "test_graph_tfsplits.pb",
         "test_graph_tftop_k.pb",
+        "test_graph_tfvariable.pb",
     ],
     # Set CUDA_VISIBLE_DEVICES='' to prevent the code we launch from using any
     # GPUs which might be present.  This is important because builds may run
@@ -222,6 +223,17 @@ tf_library(
     ],
 )
 
+tf_library(
+    name = "test_graph_tfvariable",
+    testonly = 1,
+    config = "test_graph_tfvariable.config.pbtxt",
+    cpp_class = "VariableComp",
+    graph = "test_graph_tfvariable.pb",
+    tags = [
+        "manual",
+    ],
+)
+
 tf_cc_test(
     name = "tfcompile_test",
     srcs = ["tfcompile_test.cc"],
@@ -241,6 +253,7 @@ tf_cc_test(
         ":test_graph_tfmatmulandadd_with_profiling",
         ":test_graph_tfsplits",
         ":test_graph_tftop_k",
+        ":test_graph_tfvariable",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:xla_data_proto",
diff --git a/tensorflow/compiler/aot/tests/make_test_graphs.py b/tensorflow/compiler/aot/tests/make_test_graphs.py
index 64b861a73091642b03573543a5c55618bf33915d..42f8812def0503824416d92daa2db71a64c3db88 100644
--- a/tensorflow/compiler/aot/tests/make_test_graphs.py
+++ b/tensorflow/compiler/aot/tests/make_test_graphs.py
@@ -50,7 +50,7 @@ def tfadd_with_ckpt(out_dir):
   y = variables.VariableV1(constant_op.constant([0]), name='y_saved')
   math_ops.add(x, y, name='x_y_sum')
 
-  init_op = variables.initialize_all_variables()
+  init_op = variables.global_variables_initializer()
   saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V1)
   with session.Session() as sess:
     sess.run(init_op)
@@ -65,7 +65,7 @@ def tfadd_with_ckpt_saver(out_dir):
   y = variables.VariableV1(constant_op.constant([0]), name='y_saved')
   math_ops.add(x, y, name='x_y_sum')
 
-  init_op = variables.initialize_all_variables()
+  init_op = variables.global_variables_initializer()
   saver = saver_lib.Saver(name='abcprefix', write_version=saver_pb2.SaverDef.V1)
   with session.Session() as sess:
     sess.run(init_op)
@@ -149,6 +149,14 @@ def tftop_k(_):
   array_ops.identity(output[1], name='indices')
 
 
+def tfvariable(_):
+  x = variables.Variable(1000.0, name='x')
+  old_x = x.value()
+  with ops.control_dependencies([old_x]):
+    new_x = x.assign_add(42.0)
+  array_ops.stack([old_x, new_x], name='result')
+
+
 def write_graph(build_graph, out_dir):
   """Build a graph using build_graph and write it out."""
   g = ops.Graph()
@@ -171,6 +179,7 @@ def main(_):
   write_graph(tfmatmulandadd, FLAGS.out_dir)
   write_graph(tfsplits, FLAGS.out_dir)
   write_graph(tftop_k, FLAGS.out_dir)
+  write_graph(tfvariable, FLAGS.out_dir)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/compiler/aot/tests/test_graph_tfvariable.config.pbtxt b/tensorflow/compiler/aot/tests/test_graph_tfvariable.config.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9b4c4215a330b014f595edde001aba73ad7d8263
--- /dev/null
+++ b/tensorflow/compiler/aot/tests/test_graph_tfvariable.config.pbtxt
@@ -0,0 +1,12 @@
+# Text form of tensorflow.tf2xla.Config proto.
+fetch {
+  id { node_name: "result" }
+}
+
+variable {
+  node_name: "x"
+  shape {
+    dim { size: 1 }
+  }
+  type: DT_FLOAT
+}
diff --git a/tensorflow/compiler/aot/tests/tfcompile_test.cc b/tensorflow/compiler/aot/tests/tfcompile_test.cc
index 4dd79e5882d7da61be029735ef2b165908c599f9..5f9316f3933713e12fc5960b9adfecc6e9bd99b5 100644
--- a/tensorflow/compiler/aot/tests/tfcompile_test.cc
+++ b/tensorflow/compiler/aot/tests/tfcompile_test.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/compiler/aot/tests/test_graph_tfmatmulandadd_with_profiling.h"
 #include "tensorflow/compiler/aot/tests/test_graph_tfsplits.h"
 #include "tensorflow/compiler/aot/tests/test_graph_tftop_k.h"
+#include "tensorflow/compiler/aot/tests/test_graph_tfvariable.h"
 #include "tensorflow/compiler/xla/service/hlo_profile_printer.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
@@ -473,6 +474,28 @@ TEST(TFCompileTest, TopK) {
   EXPECT_EQ(expected_indices[1], fn.result1(1));
 }
 
+TEST(TFCompileTest, Variable) {
+  Eigen::ThreadPool tp(1);
+  Eigen::ThreadPoolDevice device(&tp, tp.NumThreads());
+
+  VariableComp fn;
+  float x = 23;
+  fn.set_var_x_input_data(&x);
+
+  fn.set_thread_pool(&device);
+  fn.Run();
+  EXPECT_EQ(fn.result0(0, 0), 23);
+  EXPECT_EQ(fn.result0(1, 0), 65);
+  EXPECT_EQ(fn.var_x_result(), 65);
+
+  EXPECT_EQ(x, 23);
+  x = fn.var_x_result();
+  fn.Run();
+  EXPECT_EQ(fn.result0(0, 0), 65);
+  EXPECT_EQ(fn.result0(1, 0), 107);
+  EXPECT_EQ(fn.var_x_result(), 107);
+}
+
 TEST(TFCompileTest, AssertEqAndReturnDiff) {
   // Assert is converted into a no-op in XLA, so there is no failure even if the
   // two args are different.
diff --git a/tensorflow/compiler/aot/tfcompile.bzl b/tensorflow/compiler/aot/tfcompile.bzl
index 2dc3e8c9113b37bf9d575ad66783f4ab49478af4..2abe3e29b78dbbe719637b13418704acc213d050 100644
--- a/tensorflow/compiler/aot/tfcompile.bzl
+++ b/tensorflow/compiler/aot/tfcompile.bzl
@@ -207,7 +207,7 @@ def tf_library(
         #
         # Note that setting the local=1 attribute on a *test target* causes the
         # test infrastructure to skip that test.  However this is a genrule, not
-        # a test target, and runs with --genrule_strategy=forced_forge, meaning
+        # a test target, and runs with --strategy=Genrule=forced_forge, meaning
         # the local=1 attribute is ignored, and the genrule is still run.
         #
         # https://www.bazel.io/versions/master/docs/be/general.html#genrule
@@ -283,7 +283,7 @@ def tf_library(
     )
 
     # Variables used for gen_test and gen_benchmark.
-    cpp_class_split = cpp_class.rsplit("::", maxsplit = 2)
+    cpp_class_split = cpp_class.rsplit("::", 2)
     if len(cpp_class_split) == 1:
         no_ns_name = cpp_class_split[0]
     else:
diff --git a/tensorflow/compiler/aot/tfcompile_main.cc b/tensorflow/compiler/aot/tfcompile_main.cc
index d548de8c44285f6d21dd778db464a31e1b19645b..0b6ab7e723d6e3a55da2f1c30b75f44cbdaa75bb 100644
--- a/tensorflow/compiler/aot/tfcompile_main.cc
+++ b/tensorflow/compiler/aot/tfcompile_main.cc
@@ -136,6 +136,10 @@ int main(int argc, char** argv) {
 
   tensorflow::string usage = tensorflow::tfcompile::kUsageHeader;
   usage += tensorflow::Flags::Usage(argv[0], flag_list);
+  if (argc > 1 && absl::string_view(argv[1]) == "--help") {
+    std::cerr << usage << "\n";
+    return 0;
+  }
   bool parsed_flags_ok = tensorflow::Flags::Parse(&argc, argv, flag_list);
   QCHECK(parsed_flags_ok) << "\n" << usage;
 
diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index 15dcbb2641eca031e82db9aa58dee6a14ab0a2cc..121de401cefb2b56b984944dde769f226590dc67 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -175,12 +175,22 @@ cc_library(
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/service:stream_pool",
+        "//tensorflow/core:array_ops_op_lib",
+        "//tensorflow/core:control_flow_ops_op_lib",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
+        "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:nn_ops_op_lib",
+        "//tensorflow/core:no_op_op_lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:resource_variable_ops_op_lib",
+        "//tensorflow/core:sendrecv_ops_op_lib",
+        "//tensorflow/core:state_ops_op_lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/core/kernels:cast_op",
         "//tensorflow/core/kernels:constant_op",
@@ -198,9 +208,11 @@ cc_library(
         "//tensorflow/core/kernels:variable_ops",
         "//tensorflow/core/kernels/data:generator_dataset_op",
         "//tensorflow/core/kernels/data:iterator_ops",
+        "//tensorflow/core/kernels/data:optional_ops",
         "//tensorflow/core/kernels/data:prefetch_dataset_op",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/types:optional",
     ],
 )
 
@@ -271,7 +283,6 @@ cc_library(
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:dump_graph",
         "//tensorflow/compiler/tf2xla:xla_compiler",
-        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
@@ -454,7 +465,6 @@ cc_library(
         "//tensorflow/compiler/tf2xla:tf2xla_util",
         "//tensorflow/core:framework",
         "//tensorflow/core:graph",
-        "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:optional",
@@ -515,6 +525,7 @@ cc_library(
         "//tensorflow/compiler/jit/ops:xla_ops",
         "//tensorflow/compiler/tf2xla:dump_graph",
         "//tensorflow/compiler/tf2xla:resource_operation_table",
+        "//tensorflow/compiler/tf2xla:side_effect_util",
         "//tensorflow/compiler/tf2xla:tf2xla_util",
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/tf2xla/cc:xla_jit_ops",
@@ -613,6 +624,7 @@ tf_cc_test(
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:cc_ops_internal",
         "//tensorflow/cc:function_ops",
+        "//tensorflow/cc:functional_ops",
         "//tensorflow/cc:ops",
         "//tensorflow/cc:resource_variable_ops",
         "//tensorflow/cc:scope",
@@ -625,15 +637,16 @@ tf_cc_test(
         "//tensorflow/compiler/tf2xla/cc:xla_ops",
         "//tensorflow/compiler/tf2xla/kernels:xla_dummy_ops",
         "//tensorflow/compiler/tf2xla/kernels:xla_ops",
+        "//tensorflow/compiler/xla:test",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:session_options",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
-        "//tensorflow/core/grappler/optimizers/data:graph_utils",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
diff --git a/tensorflow/compiler/jit/build_xla_ops_pass.cc b/tensorflow/compiler/jit/build_xla_ops_pass.cc
index 9f4042630edaec1b9519b6434d859a48372e8b15..285b1efa53d91922c9fa161cfd2de34e1434d0c4 100644
--- a/tensorflow/compiler/jit/build_xla_ops_pass.cc
+++ b/tensorflow/compiler/jit/build_xla_ops_pass.cc
@@ -115,6 +115,13 @@ void MergeOutgoingControlEdges(const Scope& s, Node* old_node, Node* new_node) {
     return;
   }
 
+  if (ctrl_edges.size() == 1 && ctrl_edges.front()->dst()->IsSink()) {
+    // Avoid creating a Merge node if we can just add an edge to _SINK
+    // instead.
+    s.graph()->AddControlEdge(new_node, s.graph()->sink_node());
+    return;
+  }
+
   // We can't merge control edges directly so we instead first "convert" them to
   // normal values that can be merged, merge the values and then "convert" the
   // merged value back into control.
diff --git a/tensorflow/compiler/jit/build_xla_ops_pass_test.cc b/tensorflow/compiler/jit/build_xla_ops_pass_test.cc
index 48a23a4c1711ac88a329723c46559112d5a39dbd..c14c7465c55b7d350d6b3a6853cef6692140ce78 100644
--- a/tensorflow/compiler/jit/build_xla_ops_pass_test.cc
+++ b/tensorflow/compiler/jit/build_xla_ops_pass_test.cc
@@ -24,7 +24,6 @@ limitations under the License.
 #include "tensorflow/compiler/jit/node_matchers.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/graph/algorithm.h"
-#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
@@ -69,6 +68,8 @@ Status BuildXlaOps(const Scope& s, std::unique_ptr<Graph>* result) {
     }
   }
 
+  FixupSourceAndSinkEdges(graph.get());
+
   GraphOptimizationPassOptions opt_options;
   opt_options.graph = &graph;
   BuildXlaOpsPass pass(/*enable_lazy_compilation=*/true);
@@ -224,5 +225,23 @@ TEST_F(BuildXlaOpsTest, OnXlaDevice) {
   ASSERT_NE(write_op_new, nullptr);
   EXPECT_THAT(write_op_new, assign_var);
 }
+
+TEST_F(BuildXlaOpsTest, NoExtraMergeForEdgeToSink) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  FunctionDefLibrary flib_def =
+      CreateFunctionDefLibWithConstFunction("cluster_0");
+  TF_ASSERT_OK(root.graph()->AddFunctionLibrary(flib_def));
+  Node* call;
+  TF_ASSERT_OK(MakeXlaCompiledKernel(root.graph(), "cluster_0", "C", &call));
+
+  std::unique_ptr<Graph> graph;
+  TF_ASSERT_OK(BuildXlaOps(root, &graph));
+
+  Node* sink_node = graph->sink_node();
+  EXPECT_THAT(sink_node, NodeWith(CtrlDeps(NodeWith(Op("_XlaRun")),
+                                           NodeWith(Op("cluster_0")),
+                                           NodeWith(Op("NoOp")))));
+}
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/deadness_analysis.cc b/tensorflow/compiler/jit/deadness_analysis.cc
index 0562838f628c66b1eb03af9d2a5139c01dca31c5..4397eea9af266cbd0392f08323e59077c9395150 100644
--- a/tensorflow/compiler/jit/deadness_analysis.cc
+++ b/tensorflow/compiler/jit/deadness_analysis.cc
@@ -20,7 +20,10 @@ limitations under the License.
 #include "absl/strings/str_join.h"
 #include "tensorflow/compiler/jit/deadness_analysis_internal.h"
 #include "tensorflow/compiler/jit/xla_cluster_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/graph/control_flow.h"
 #include "tensorflow/core/graph/tensor_id.h"
 #include "tensorflow/core/lib/hash/hash.h"
 
@@ -110,7 +113,11 @@ class Predicate {
   enum class Kind { kAnd, kOr, kNot, kAndRecurrence, kSymbol };
 
   virtual string ToString() const = 0;
-  int64 hash() const { return hash_; }
+
+  // An ID assigned to the Predicate at construction time.  Conceptually like a
+  // pointer, except that it is stable across runs.
+  int64 id() const { return id_; }
+
   virtual absl::Span<Predicate* const> GetOperands() const = 0;
 
   virtual Kind kind() const = 0;
@@ -123,29 +130,19 @@ class Predicate {
   static void Visit(Predicate* p, const FunctionTy& func);
 
  protected:
-  explicit Predicate(int64 hash) : hash_(hash) {}
+  explicit Predicate(int64 id) : id_(id) {}
 
  private:
-  const int64 hash_;
+  const int64 id_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(Predicate);
 };
 
-int64 HashPredicateSequence(Predicate::Kind kind,
-                            absl::Span<Predicate* const> preds) {
-  int64 hash = ::tensorflow::hash<Predicate::Kind>()(kind);
-  for (Predicate* pred : preds) {
-    hash = Hash64Combine(hash, pred->hash());
-  }
-  return hash;
-}
-
 // Represents a logical conjunction of a set of predicates.
 class AndPredicate : public Predicate {
  public:
-  explicit AndPredicate(std::vector<Predicate*> operands)
-      : Predicate(HashPredicateSequence(Kind::kAnd, operands)),
-        operands_(std::move(operands)) {}
+  explicit AndPredicate(int64 id, std::vector<Predicate*> operands)
+      : Predicate(id), operands_(std::move(operands)) {}
 
   string ToString() const override {
     if (operands().empty()) {
@@ -174,9 +171,8 @@ class AndPredicate : public Predicate {
 // Represents a logical disjunction of a set of predicates.
 class OrPredicate : public Predicate {
  public:
-  explicit OrPredicate(std::vector<Predicate*> operands)
-      : Predicate(HashPredicateSequence(Kind::kOr, operands)),
-        operands_(std::move(operands)) {}
+  explicit OrPredicate(int64 id, std::vector<Predicate*> operands)
+      : Predicate(id), operands_(std::move(operands)) {}
 
   string ToString() const override {
     if (operands().empty()) {
@@ -204,9 +200,8 @@ class OrPredicate : public Predicate {
 // Represents a logical negation of a set of predicates.
 class NotPredicate : public Predicate {
  public:
-  explicit NotPredicate(Predicate* operand)
-      : Predicate(HashPredicateSequence(Kind::kNot, {operand})),
-        operands_({operand}) {}
+  explicit NotPredicate(int64 id, Predicate* operand)
+      : Predicate(id), operands_({operand}) {}
 
   string ToString() const override {
     return absl::StrCat("~", operand()->ToString());
@@ -222,29 +217,38 @@ class NotPredicate : public Predicate {
   std::array<Predicate*, 1> operands_;
 };
 
-// Represents an infinite list of predicates.
+// Represents the liveness of an induction variable.  For users inside the loop
+// this represents the "current" liveness of the induction variable.  For users
+// outside the loop it represents the "last" liveness of the induction variable.
+//
+// More concretely, an and recurrence {S,&,X}<loop> represents the liveness of V
+// in the following graph:
 //
-// An AndRecurrence with start = S and step = X is printed as {S,&,X} and stands
-// for the list of predicates:
+//   V = Merge(S', V_NextIt)
+//   V = Op(V, X')
+//   V_NextIt = NextIteration(V)
 //
-//   S, S & GenSym(X,1), S & GenSym(X,1) & GenSym(X,2), ...
+// where Predicate(S') = S and Predicate(X') = X.
 //
-// where GenSym(<expression>, <id>) renames every SymbolPredicate in
-// <expression> by appending <id> to it, in effect creating a "fresh" symbol.
-// This means {P,&,Q} is not equal to "P on the first iteration; P&Q on
-// subsequent iterations".
+// `X` may contain symbolic predicates and the operations corresponding to these
+// symbolic predicates are either in frame `loop` or outside it.  The symbols
+// that are inside frame `loop` are loop variant (i.e. can have different
+// liveness in each loop iteration) and the symbols that are outside frame
+// `loop` are loop invariant (i.e. have the same liveness across all
+// iterations).
 class AndRecurrencePredicate : public Predicate {
  public:
-  explicit AndRecurrencePredicate(Predicate* start, Predicate* step)
-      : Predicate(HashPredicateSequence(Kind::kAndRecurrence, {start, step})),
-        operands_({start, step}) {}
+  explicit AndRecurrencePredicate(int64 id, Predicate* start, Predicate* step,
+                                  std::vector<string> frame)
+      : Predicate(id), operands_({start, step}), frame_(std::move(frame)) {}
 
   Predicate* start() const { return operands_[0]; }
   Predicate* step() const { return operands_[1]; }
+  absl::Span<const string> frame() const { return frame_; }
 
   string ToString() const override {
     return absl::StrCat("{", start()->ToString(), ",&,", step()->ToString(),
-                        "}");
+                        "}<", absl::StrJoin(frame(), ";"), ">");
   }
 
   Kind kind() const override { return Kind::kAndRecurrence; }
@@ -255,6 +259,7 @@ class AndRecurrencePredicate : public Predicate {
 
  private:
   std::array<Predicate*, 2> operands_;
+  std::vector<string> frame_;
 };
 
 // Represents an uninterpreted symbol in a logical predicate.
@@ -264,8 +269,8 @@ class AndRecurrencePredicate : public Predicate {
 // symbols.
 class SymbolPredicate : public Predicate {
  public:
-  explicit SymbolPredicate(TensorId tensor_id, bool must_be_true)
-      : Predicate(Hash(tensor_id, must_be_true)),
+  explicit SymbolPredicate(int64 id, TensorId tensor_id, bool must_be_true)
+      : Predicate(id),
         tensor_id_(std::move(tensor_id)),
         must_be_true_(must_be_true) {}
 
@@ -281,20 +286,13 @@ class SymbolPredicate : public Predicate {
   // "tensor_id() is live and evaluates to true".
   //
   // If `must_be_true()` is false then this SymbolPredicate represents the
-  // proposition "tensor_id() is live (and may evalutate to any value)"
+  // proposition "tensor_id() is live (and may evaluate to any value)"
   TensorId tensor_id() const { return tensor_id_; }
   bool must_be_true() const { return must_be_true_; }
 
  private:
   TensorId tensor_id_;
   bool must_be_true_;
-
-  static int64 Hash(const TensorId tensor_id, bool must_be_true) {
-    return Hash64Combine(
-        ::tensorflow::hash<bool>()(must_be_true),
-        Hash64Combine(::tensorflow::hash<Predicate::Kind>()(Kind::kSymbol),
-                      TensorId::Hasher{}(tensor_id)));
-  }
 };
 
 template <typename FunctionTy>
@@ -333,34 +331,58 @@ class PredicateFactory {
   }
 
   Predicate* MakeNotPredicate(Predicate* pred) {
-    SignatureForNot signature = pred;
-    auto it = interned_not_instances_.find(signature);
-    if (it == interned_not_instances_.end()) {
-      std::unique_ptr<Predicate> new_pred = Make<NotPredicate>(pred);
-      Predicate* new_pred_ptr = new_pred.get();
-      interned_not_instances_.emplace(signature, std::move(new_pred));
-      return new_pred_ptr;
-    } else {
-      return it->second.get();
+    auto it = make_not_predicate_cache_.find(pred);
+    if (it != make_not_predicate_cache_.end()) {
+      return it->second;
     }
+
+    Predicate* result = MakeNotPredicateImpl(pred);
+
+    bool insert_successful =
+        make_not_predicate_cache_.insert({pred, result}).second;
+    (void)insert_successful;
+    DCHECK(insert_successful);
+
+    return result;
   }
 
-  Predicate* MakeAndRecurrencePredicate(Predicate* start, Predicate* step) {
-    auto it = interned_and_rec_instances_.find({start, step});
+  Predicate* MakeAndRecurrencePredicate(Predicate* start, Predicate* step,
+                                        std::vector<string> frame) {
+    SignatureForAndRec signature(start, step, std::move(frame));
+    auto it = interned_and_rec_instances_.find(signature);
     if (it != interned_and_rec_instances_.end()) {
       return it->second.get();
     }
 
-    std::unique_ptr<Predicate> new_pred =
-        Make<AndRecurrencePredicate>(start, step);
+    std::unique_ptr<Predicate> new_pred = Make<AndRecurrencePredicate>(
+        std::get<0>(signature), std::get<1>(signature), std::get<2>(signature));
     Predicate* new_pred_ptr = new_pred.get();
-    CHECK(interned_and_rec_instances_
-              .emplace(SignatureForAndRec(start, step), std::move(new_pred))
-              .second);
+    bool inserted =
+        interned_and_rec_instances_.emplace(signature, std::move(new_pred))
+            .second;
+    (void)inserted;
+    DCHECK(inserted);
     return new_pred_ptr;
   }
 
-  Predicate* MakeSymbolPredicate(TensorId tensor_id, bool must_be_true) {
+  Status MakeSymbolPredicate(Node* node, int output_idx, bool must_be_true,
+                             Predicate** predicate) {
+    TensorId tensor_id(node->name(), output_idx);
+
+    bool is_boolean_tensor = node->output_type(tensor_id.index()) == DT_BOOL;
+    TF_RET_CHECK(!must_be_true || is_boolean_tensor);
+
+    if (node->type_string() == "Const" && must_be_true) {
+      const TensorProto* proto = nullptr;
+      TF_RETURN_IF_ERROR(GetNodeAttr(node->def(), "value", &proto));
+
+      Tensor tensor(proto->dtype());
+      TF_RET_CHECK(tensor.FromProto(*proto));
+
+      *predicate = tensor.scalar<bool>()() ? MakeTrue() : MakeFalse();
+      return Status::OK();
+    }
+
     SignatureForSymbol signature = {tensor_id, must_be_true};
     auto it = interned_symbol_instances_.find(signature);
     if (it == interned_symbol_instances_.end()) {
@@ -369,20 +391,70 @@ class PredicateFactory {
       Predicate* new_pred_ptr = new_pred.get();
       interned_symbol_instances_.emplace(std::move(signature),
                                          std::move(new_pred));
-      return new_pred_ptr;
+      *predicate = new_pred_ptr;
     } else {
-      return it->second.get();
+      *predicate = it->second.get();
     }
+
+    return Status::OK();
   }
 
   Predicate* MakeTrue() { return MakeAndPredicate({}); }
   Predicate* MakeFalse() { return MakeOrPredicate({}); }
 
+  ~PredicateFactory() {
+    DCHECK_EQ(stack_depth_, 0) << "Unnested IncrementStackDepth?";
+  }
+
  private:
+  Predicate* MakeNotPredicateImpl(Predicate* pred) {
+    IncrementStackDepth stack_frame(this);
+    if (!stack_frame.HasOverflowed()) {
+      if (Predicate* simplified = SimplifyUsingDeMorgan(pred)) {
+        return simplified;
+      }
+
+      // ~~A => A
+      if (auto* not_pred = dynamic_cast<NotPredicate*>(pred)) {
+        return not_pred->operand();
+      }
+    }
+
+    SignatureForNot signature = pred;
+    auto it = interned_not_instances_.find(signature);
+    if (it == interned_not_instances_.end()) {
+      std::unique_ptr<Predicate> new_pred = Make<NotPredicate>(pred);
+      Predicate* new_pred_ptr = new_pred.get();
+      interned_not_instances_.emplace(signature, std::move(new_pred));
+      return new_pred_ptr;
+    } else {
+      return it->second.get();
+    }
+  }
+
+  Predicate* SimplifyUsingDeMorgan(Predicate* pred) {
+    // ~(A & B & C & ...) => ~A | ~B | ~C | ~...
+    // ~(A | B | C | ...) -> ~A & ~B & ~C & ~...
+    Predicate::Kind kind = pred->kind();
+
+    if (kind == Predicate::Kind::kAnd || kind == Predicate::Kind::kOr) {
+      std::vector<Predicate*> new_operands;
+      absl::c_transform(pred->GetOperands(), std::back_inserter(new_operands),
+                        [&](Predicate* p) { return MakeNotPredicate(p); });
+      return kind == Predicate::Kind::kOr ? MakeAndPredicate(new_operands)
+                                          : MakeOrPredicate(new_operands);
+    }
+
+    return nullptr;
+  }
+
   template <typename PredicateT, typename... Args>
   std::unique_ptr<Predicate> Make(Args&&... args) {
+    // If we ever expose the Predicate class outside this .cc file then we may
+    // want to make this hard to misuse (by accidentally passing in an arbitrary
+    // integer to the Predicate constructor for instance).
     return std::unique_ptr<PredicateT>(
-        new PredicateT(std::forward<Args>(args)...));
+        new PredicateT(id_counter_++, std::forward<Args>(args)...));
   }
 
   Predicate* MakeAndOrImpl(absl::Span<Predicate* const> operands, bool is_and);
@@ -402,7 +474,8 @@ class PredicateFactory {
   using SignatureForAndOr =
       std::pair<Predicate::Kind, absl::Span<Predicate* const>>;
   using SignatureForNot = Predicate*;
-  using SignatureForAndRec = std::pair<Predicate*, Predicate*>;
+  using SignatureForAndRec =
+      std::tuple<Predicate*, Predicate*, std::vector<string>>;
   using SignatureForSymbol = std::pair<SafeTensorId, bool>;
 
   struct HashSignatureForAndOr {
@@ -422,6 +495,36 @@ class PredicateFactory {
     }
   };
 
+  // Used to limit recursion to avoid blowing up the stack and cap compile time.
+  class IncrementStackDepth {
+   public:
+    explicit IncrementStackDepth(PredicateFactory* parent) : parent_(parent) {
+      parent_->stack_depth_++;
+    }
+
+    bool HasOverflowed() const {
+      const int kMaxStackDepth = 8;
+      return parent_->stack_depth_ >= kMaxStackDepth;
+    }
+
+    ~IncrementStackDepth() { parent_->stack_depth_--; }
+
+   private:
+    PredicateFactory* parent_;
+  };
+
+  // A cache for the MakeNotPredicate function.
+  //
+  // NB! This is *not* the same as `interned_not_instances_`.
+  // `interned_not_instances_` maps ensures pointer identity for `NotPredicate`
+  // instances, i.e., it ensures there at most one instance of Not(predicate)
+  // for any given predicate whereas `make_not_predicate_cache_` simply caches
+  // the result of the `MakeNotPredicate` function.  The values in
+  // `interned_not_instances_` are always instance of `NotPredicate` whereas the
+  // values in `make_not_predicate_cache_` may not be (for instance it will map
+  // Not(Not(A)) to A).
+  absl::flat_hash_map<Predicate*, Predicate*> make_not_predicate_cache_;
+
   absl::flat_hash_map<SignatureForAndOr, std::unique_ptr<Predicate>,
                       HashSignatureForAndOr>
       interned_and_or_instances_;
@@ -432,13 +535,15 @@ class PredicateFactory {
   absl::flat_hash_map<SignatureForSymbol, std::unique_ptr<Predicate>,
                       HashSignatureForSymbol>
       interned_symbol_instances_;
+  int64 id_counter_ = 0;
+  int stack_depth_ = 0;
 };
 
 Predicate* PredicateFactory::MakeInternedAndOr(
     std::vector<Predicate*> simplified_ops, Predicate::Kind pred_kind) {
   std::stable_sort(
       simplified_ops.begin(), simplified_ops.end(),
-      [](Predicate* a, Predicate* b) { return a->hash() < b->hash(); });
+      [](Predicate* a, Predicate* b) { return a->id() < b->id(); });
 
   auto it = interned_and_or_instances_.find({pred_kind, simplified_ops});
   if (it != interned_and_or_instances_.end()) {
@@ -466,6 +571,13 @@ Predicate* PredicateFactory::MakeAndOrImpl(
     absl::Span<Predicate* const> operands, bool is_and) {
   Predicate::Kind pred_kind =
       is_and ? Predicate::Kind::kAnd : Predicate::Kind::kOr;
+
+  IncrementStackDepth stack_frame(this);
+  if (stack_frame.HasOverflowed()) {
+    return MakeInternedAndOr(
+        std::vector<Predicate*>(operands.begin(), operands.end()), pred_kind);
+  }
+
   Predicate::Kind other_pred_kind =
       is_and ? Predicate::Kind::kOr : Predicate::Kind::kAnd;
   absl::flat_hash_set<Predicate*> simplified_ops_set;
@@ -494,16 +606,31 @@ Predicate* PredicateFactory::MakeAndOrImpl(
 
   // Simplify "A&~A=>False" and "A|~A=>True".
   absl::flat_hash_set<Predicate*> negated_ops;
-  for (Predicate* op : simplified_ops) {
-    if (op->kind() == Predicate::Kind::kNot) {
-      negated_ops.insert(dynamic_cast<NotPredicate&>(*op).operand());
-    }
-  }
-
   for (Predicate* op : simplified_ops) {
     if (negated_ops.count(op)) {
+      // Simple case:
+      //
+      //   A & ~A & ... == False
+      //   A | ~A | ... == True
       return is_and ? MakeFalse() : MakeTrue();
     }
+
+    Predicate* negated_op = MakeNotPredicate(op);
+    if (negated_op->kind() == pred_kind) {
+      // Slightly more complicated case:
+      //
+      //   (~A | ~B | ~C) & A & B & C & ... ==
+      //   ~(A & B & C) & (A & B & C) & ... == False
+      //
+      //   (~A & ~B & ~C) | A | B | C | ... ==
+      //   ~(A | B | C) | (A | B | C) | ... == True
+      if (absl::c_all_of(negated_op->GetOperands(), [&](Predicate* p) {
+            return simplified_ops_set.contains(p);
+          })) {
+        return is_and ? MakeFalse() : MakeTrue();
+      }
+    }
+    negated_ops.insert(negated_op);
   }
 
   // If all ops contain the same subop, then factor it out thanks to the
@@ -619,6 +746,7 @@ class DeadnessAnalysisImpl : public DeadnessAnalysis {
   const Graph& graph_;
   absl::flat_hash_map<TensorId, Predicate*, TensorId::Hasher> predicate_map_;
   PredicateFactory predicate_factory_;
+  std::vector<ControlFlowInfo> control_flow_info_;
   bool vlog_;
 };
 
@@ -661,9 +789,12 @@ Status DeadnessAnalysisImpl::HandleSwitch(Node* n,
   TF_RETURN_IF_ERROR(GetInputPreds(n, EdgeKind::kDataAndControl, &input_preds));
   const Edge* pred_edge;
   TF_RETURN_IF_ERROR(n->input_edge(1, &pred_edge));
-  Predicate* true_switch = predicate_factory_.MakeSymbolPredicate(
-      TensorId(pred_edge->src()->name(), pred_edge->src_output()),
-      /*must_be_true=*/true);
+
+  Predicate* true_switch;
+  TF_RETURN_IF_ERROR(predicate_factory_.MakeSymbolPredicate(
+      pred_edge->src(), pred_edge->src_output(),
+      /*must_be_true=*/true, &true_switch));
+
   Predicate* false_switch = predicate_factory_.MakeNotPredicate(true_switch);
 
   // Output 0 is alive iff all inputs are alive and the condition is false.
@@ -761,6 +892,23 @@ Predicate* DeduceStepPredicate(PredicateFactory* predicate_factory,
 
   return found_sym ? predicate_factory->MakeAndPredicate(and_ops) : nullptr;
 }
+
+Status GetFullFrame(const Node* n, absl::Span<const ControlFlowInfo> cfi_infos,
+                    std::vector<string>* frame) {
+  int depth = 0;
+  for (const ControlFlowInfo* cfi_iter = &cfi_infos[n->id()]; !n->IsSource();
+       n = cfi_iter->parent_frame, cfi_iter = &cfi_infos[n->id()]) {
+    frame->push_back(cfi_iter->frame_name);
+
+    if (depth++ > 5000) {
+      return errors::Internal(
+          "Frame of depth > 5000:  Probably malformed graph or a bug in "
+          "BuildControlFlowInfo");
+    }
+  }
+
+  return Status::OK();
+}
 }  // namespace
 
 Status DeadnessAnalysisImpl::HandleMerge(Node* n,
@@ -783,8 +931,10 @@ Status DeadnessAnalysisImpl::HandleMerge(Node* n,
     if (has_unvisited_backedge) {
       // We're visiting this merge for the first time and it has an unvisited
       // backedge.
-      Predicate* input_data_pred = predicate_factory_.MakeSymbolPredicate(
-          TensorId(n->name(), 0), /*must_be_true=*/false);
+      Predicate* input_data_pred;
+      TF_RETURN_IF_ERROR(predicate_factory_.MakeSymbolPredicate(
+          n, /*output_idx=*/0, /*must_be_true=*/false, &input_data_pred));
+
       SetPredicate(n, {0, 1, Graph::kControlSlot}, input_data_pred,
                    should_revisit);
       return Status::OK();
@@ -825,8 +975,10 @@ Status DeadnessAnalysisImpl::HandleMerge(Node* n,
 
         Predicate* start =
             predicate_factory_.MakeOrPredicate(non_recurrent_inputs);
-        Predicate* and_rec =
-            predicate_factory_.MakeAndRecurrencePredicate(start, step);
+        std::vector<string> frame;
+        TF_RETURN_IF_ERROR(GetFullFrame(n, control_flow_info_, &frame));
+        Predicate* and_rec = predicate_factory_.MakeAndRecurrencePredicate(
+            start, step, std::move(frame));
         SetPredicate(n, {0, 1, Graph::kControlSlot}, and_rec, should_revisit);
         return Status::OK();
       }
@@ -841,8 +993,10 @@ Status DeadnessAnalysisImpl::HandleRecv(Node* n,
   // acquire a dead signal from a _Send.
   std::vector<Predicate*> input_preds;
   TF_RETURN_IF_ERROR(GetInputPreds(n, EdgeKind::kDataAndControl, &input_preds));
-  input_preds.push_back(predicate_factory_.MakeSymbolPredicate(
-      TensorId(n->name(), 0), /*must_be_true=*/false));
+  Predicate* signal_is_alive;
+  TF_RETURN_IF_ERROR(predicate_factory_.MakeSymbolPredicate(
+      n, /*output_idx=*/0, /*must_be_true=*/false, &signal_is_alive));
+  input_preds.push_back(signal_is_alive);
   SetPredicate(n, {0, Graph::kControlSlot},
                predicate_factory_.MakeAndPredicate(input_preds),
                should_revisit);
@@ -892,6 +1046,24 @@ Status DeadnessAnalysisImpl::Populate() {
 
 Status DeadnessAnalysisImpl::PopulateWithReversePostOrder(
     absl::Span<Node* const> rpo) {
+  std::vector<string> unreachable_nodes;
+  // Compute the loop structure of the graph.
+  TF_RETURN_IF_ERROR(
+      BuildControlFlowInfo(&graph_, &control_flow_info_, &unreachable_nodes));
+
+  // Do some opportunistic error checking:
+  if (!unreachable_nodes.empty()) {
+    if (unreachable_nodes.size() > 5) {
+      unreachable_nodes.erase(unreachable_nodes.begin() + 5,
+                              unreachable_nodes.end());
+    }
+
+    return errors::InvalidArgument(
+        "Found unreachable nodes, most likely source and sink nodes not "
+        "connected: ",
+        absl::StrJoin(unreachable_nodes, ", "));
+  }
+
   // This an abstract interpretation over the deadness propagation semantics of
   // the graph executor.
   //
diff --git a/tensorflow/compiler/jit/deadness_analysis_test.cc b/tensorflow/compiler/jit/deadness_analysis_test.cc
index 8a73101c184e6190921fd7729742922bd96f4bcf..38a5118d9a721b814e1b52ce4202d4fb783e3ac3 100644
--- a/tensorflow/compiler/jit/deadness_analysis_test.cc
+++ b/tensorflow/compiler/jit/deadness_analysis_test.cc
@@ -123,10 +123,9 @@ InductionVarInfo CreateInductionVariable(const Scope& root,
   Output increment_by = ops::Const(root.WithOpName(prefix + "/incr"), 1);
   Output final_value = ops::Const(root.WithOpName(prefix + "/final"), 10);
   Output loop_cond_expr =
-      ops::Less(root.WithOpName(prefix + "/less"), iv.output, final_value);
-  Output loop_cond =
-      ops::LoopCond(root.WithOpName(prefix + "/cond"), loop_cond_expr);
-  ops::Switch latch(root.WithOpName(prefix + "/latch"), iv.output, loop_cond);
+      ops::Less(root.WithOpName(prefix + "/cond"), iv.output, final_value);
+  ops::Switch latch(root.WithOpName(prefix + "/latch"), iv.output,
+                    loop_cond_expr);
   ops::internal::Exit exit(root.WithOpName(prefix + "/exit"),
                            latch.output_false);
   Output iv_next = ops::Add(root.WithOpName(prefix + "/ivnext"),
@@ -140,7 +139,7 @@ InductionVarInfo CreateInductionVariable(const Scope& root,
   root.graph()->AddControlEdge(iv.output.node(), increment_by.node());
   root.graph()->AddControlEdge(iv.output.node(), final_value.node());
 
-  return {iv.output, loop_cond};
+  return {iv.output, loop_cond_expr};
 }
 
 InductionVarInfo CreateInductionVariable(const Scope& root,
@@ -515,24 +514,27 @@ TEST(DeadnessAnalysisTest, Loop) {
 
     // In theory we should be able to tell that iv0/cond:0 and iv1/cond:0
     // produce the same deadness.  But we're not that smart today.
-    EXPECT_EQ(predicate_map[ControlOutputFor(iv0)], "{#true,&,*iv0/cond:0}");
-    EXPECT_EQ(predicate_map[ControlOutputFor(iv1)], "{#true,&,*iv1/cond:0}");
-    EXPECT_EQ(predicate_map[ControlOutputFor(iv2)], "{#true,&,*iv2/cond:0}");
+    EXPECT_EQ(predicate_map[ControlOutputFor(iv0)],
+              "{#true,&,*iv0/cond:0}<fr0>");
+    EXPECT_EQ(predicate_map[ControlOutputFor(iv1)],
+              "{#true,&,*iv1/cond:0}<fr0>");
+    EXPECT_EQ(predicate_map[ControlOutputFor(iv2)],
+              "{#true,&,*iv2/cond:0}<fr0>");
     EXPECT_EQ(predicate_map[ControlOutputFor(add0)],
-              "({#true,&,*iv1/cond:0} & {#true,&,*iv0/cond:0})");
+              "({#true,&,*iv0/cond:0}<fr0> & {#true,&,*iv1/cond:0}<fr0>)");
     EXPECT_EQ(predicate_map[ControlOutputFor(add1)],
-              "({#true,&,*iv1/cond:0} & {#true,&,*iv2/cond:0})");
+              "({#true,&,*iv1/cond:0}<fr0> & {#true,&,*iv2/cond:0}<fr0>)");
   }
 }
 
 TEST(DeadnessAnalysisTest, ControlEquivalentLoopBodies) {
   Scope root = Scope::NewRootScope().ExitOnError();
-  InductionVarInfo iv = CreateInductionVariable(root, "iv0", "frame", 0);
+  InductionVarInfo iv = CreateInductionVariable(root, "iv0", "loop", 0);
   Output dependent_iv0 =
-      CreateDependentLoopInvariantValue(root, "div0", "frame", iv.loop_cond, 0)
+      CreateDependentLoopInvariantValue(root, "div0", "loop", iv.loop_cond, 0)
           .induction_var;
   Output dependent_iv1 =
-      CreateDependentLoopInvariantValue(root, "div1", "frame", iv.loop_cond, 0)
+      CreateDependentLoopInvariantValue(root, "div1", "loop", iv.loop_cond, 0)
           .induction_var;
   Output add0 = ops::Add(root.WithOpName("add0"), dependent_iv0, dependent_iv1);
 
@@ -549,13 +551,13 @@ TEST(DeadnessAnalysisTest, ControlEquivalentLoopBodies) {
     TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map));
 
     EXPECT_EQ(predicate_map[ControlOutputFor(iv.induction_var)],
-              "{#true,&,*iv0/cond:0}");
+              "{#true,&,*iv0/cond:0}<loop>");
     EXPECT_EQ(predicate_map[ControlOutputFor(dependent_iv0)],
-              "{#true,&,(*iv0/cond:0 & iv0/iv:0)}");
+              "{#true,&,(iv0/iv:0 & *iv0/cond:0)}<loop>");
     EXPECT_EQ(predicate_map[ControlOutputFor(dependent_iv1)],
-              "{#true,&,(*iv0/cond:0 & iv0/iv:0)}");
+              "{#true,&,(iv0/iv:0 & *iv0/cond:0)}<loop>");
     EXPECT_EQ(predicate_map[ControlOutputFor(add0)],
-              "{#true,&,(*iv0/cond:0 & iv0/iv:0)}");
+              "{#true,&,(iv0/iv:0 & *iv0/cond:0)}<loop>");
   }
 }
 
@@ -595,32 +597,33 @@ TEST(DeadnessAnalysisTest, LoopInvariantPredicateOnBackedge) {
 TEST(DeadnessAnalysisTest, ControlEquivalentNestedLoopBodies) {
   Scope root = Scope::NewRootScope().ExitOnError();
   InductionVarInfo iv_outer =
-      CreateInductionVariable(root, "iv_outer", "frame", 0);
+      CreateInductionVariable(root, "iv_outer", "outer_loop", 0);
+  Output enter_constant_outer_loop = ops::internal::Enter(
+      root.WithOpName("constant_enter_outer_loop"),
+      ops::Const(root.WithOpName("constant"), 5), "outer_loop",
+      ops::internal::Enter::Attrs().IsConstant(true));
   ops::Switch inner_value(root.WithOpName("outer_is_live"),
-                          ops::Const(root.WithOpName("constant"), 5),
-                          iv_outer.loop_cond);
+                          enter_constant_outer_loop, iv_outer.loop_cond);
   InductionVarInfo iv_inner = CreateInductionVariable(
-      root, "iv_inner", "frame",
-      ops::internal::Enter(root.WithOpName("iv_inner/enter"),
-                           inner_value.output_true, "frame_inner"));
+      root, "iv_inner", "inner_loop", inner_value.output_true);
 
   Output dependent_outer_iv0 =
-      CreateDependentLoopInvariantValue(root, "dependent_outer_iv0", "frame",
-                                        iv_outer.loop_cond, 0)
+      CreateDependentLoopInvariantValue(root, "dependent_outer_iv0",
+                                        "outer_loop", iv_outer.loop_cond, 0)
           .induction_var;
   Output dependent_outer_iv1 =
-      CreateDependentLoopInvariantValue(root, "dependent_outer_iv1", "frame",
-                                        iv_outer.loop_cond, 0)
+      CreateDependentLoopInvariantValue(root, "dependent_outer_iv1",
+                                        "outer_loop", iv_outer.loop_cond, 0)
           .induction_var;
 
-  Output dependent_inner_iv0 =
-      CreateDependentLoopInvariantValue(root, "dependent_inner_iv0", "frame",
-                                        iv_inner.loop_cond, dependent_outer_iv0)
-          .induction_var;
-  Output dependent_inner_iv1 =
-      CreateDependentLoopInvariantValue(root, "dependent_inner_iv1", "frame",
-                                        iv_inner.loop_cond, dependent_outer_iv1)
-          .induction_var;
+  Output dependent_inner_iv0 = CreateDependentLoopInvariantValue(
+                                   root, "dependent_inner_iv0", "inner_loop",
+                                   iv_inner.loop_cond, dependent_outer_iv0)
+                                   .induction_var;
+  Output dependent_inner_iv1 = CreateDependentLoopInvariantValue(
+                                   root, "dependent_inner_iv1", "inner_loop",
+                                   iv_inner.loop_cond, dependent_outer_iv1)
+                                   .induction_var;
 
   Output add0 = ops::Add(root.WithOpName("add0"), dependent_inner_iv0,
                          dependent_inner_iv1);
@@ -638,46 +641,51 @@ TEST(DeadnessAnalysisTest, ControlEquivalentNestedLoopBodies) {
     TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map));
 
     EXPECT_EQ(predicate_map[ControlOutputFor(iv_outer.induction_var)],
-              "{#true,&,*iv_outer/cond:0}");
+              "{#true,&,*iv_outer/cond:0}<outer_loop>");
     EXPECT_EQ(predicate_map[ControlOutputFor(iv_inner.induction_var)],
-              "{(*iv_outer/cond:0 & {#true,&,*iv_outer/cond:0}),&,"
-              "*iv_inner/cond:0}");
+              "{(*iv_outer/cond:0 & "
+              "{#true,&,*iv_outer/cond:0}<outer_loop>),&,*iv_inner/"
+              "cond:0}<inner_loop;outer_loop>");
 
     EXPECT_EQ(predicate_map[ControlOutputFor(dependent_inner_iv0)],
-              "{{#true,&,(iv_outer/iv:0 & *iv_outer/cond:0)},&,"
-              "(*iv_inner/cond:0 & iv_inner/iv:0)}");
+              "{{#true,&,(iv_outer/iv:0 & "
+              "*iv_outer/cond:0)}<outer_loop>,&,(iv_inner/iv:0 & "
+              "*iv_inner/cond:0)}<inner_loop;outer_loop>");
+
     EXPECT_EQ(predicate_map[ControlOutputFor(dependent_inner_iv1)],
-              "{{#true,&,(iv_outer/iv:0 & *iv_outer/cond:0)},&,"
-              "(*iv_inner/cond:0 & iv_inner/iv:0)}");
+              "{{#true,&,(iv_outer/iv:0 & "
+              "*iv_outer/cond:0)}<outer_loop>,&,(iv_inner/iv:0 & "
+              "*iv_inner/cond:0)}<inner_loop;outer_loop>");
     EXPECT_EQ(predicate_map[ControlOutputFor(add0)],
-              "{{#true,&,(iv_outer/iv:0 & *iv_outer/cond:0)},&,"
-              "(*iv_inner/cond:0 & iv_inner/iv:0)}");
+              "{{#true,&,(iv_outer/iv:0 & "
+              "*iv_outer/cond:0)}<outer_loop>,&,(iv_inner/iv:0 & "
+              "*iv_inner/cond:0)}<inner_loop;outer_loop>");
   }
 }
 
 TEST(DeadnessAnalysisTest, ControlNonEquivalentNestedLoopBodies) {
   Scope root = Scope::NewRootScope().ExitOnError();
-  InductionVarInfo iv_outer_0 =
-      CreateInductionVariable(root, "iv_outer_0", "frame", 0);
-  ops::Switch inner_value_0(root.WithOpName("outer_0_is_live"),
-                            ops::Const(root.WithOpName("constant"), 5),
-                            iv_outer_0.loop_cond);
-  InductionVarInfo iv_inner_0 = CreateInductionVariable(
-      root, "iv_inner_0", "frame",
-      ops::internal::Enter(root.WithOpName("iv_inner_0/enter"),
-                           inner_value_0.output_true, "frame_inner"));
-
-  InductionVarInfo iv_outer_1 =
-      CreateInductionVariable(root, "iv_outer_1", "frame", 1);
-  ops::Switch inner_init_value_1(root.WithOpName("outer_1_is_live"),
-                                 ops::Const(root.WithOpName("constant"), 5),
-                                 iv_outer_1.loop_cond);
-  InductionVarInfo iv_inner_1 = CreateInductionVariable(
-      root, "iv_inner_1", "frame",
-      ops::internal::Enter(root.WithOpName("iv_inner_1/enter"),
-                           inner_init_value_1.output_true, "frame_inner"));
-  Output add0 = ops::Add(root.WithOpName("add0"), iv_inner_0.induction_var,
-                         iv_inner_1.induction_var);
+
+  std::array<Output, 2> outer_iv;
+  std::array<Output, 2> inner_iv;
+
+  for (int i : {0, 1}) {
+    InductionVarInfo iv_outer =
+        CreateInductionVariable(root, "iv_outer", "outer_loop", 0);
+    Output enter_constant_outer_loop = ops::internal::Enter(
+        root.WithOpName("constant_enter_outer_loop"),
+        ops::Const(root.WithOpName("constant"), 5), "outer_loop",
+        ops::internal::Enter::Attrs().IsConstant(true));
+    ops::Switch inner_value(root.WithOpName("outer_is_live"),
+                            enter_constant_outer_loop, iv_outer.loop_cond);
+    InductionVarInfo iv_inner = CreateInductionVariable(
+        root, "iv_inner", "inner_loop", inner_value.output_true);
+
+    outer_iv[i] = iv_outer.induction_var;
+    inner_iv[i] = iv_inner.induction_var;
+  }
+
+  Output add0 = ops::Add(root.WithOpName("add0"), inner_iv[0], inner_iv[1]);
 
   VLogGraphIfAsked(*root.graph());
 
@@ -692,21 +700,77 @@ TEST(DeadnessAnalysisTest, ControlNonEquivalentNestedLoopBodies) {
     PredicateMapTy predicate_map;
     TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map));
 
-    EXPECT_EQ(predicate_map[ControlOutputFor(iv_outer_0.induction_var)],
-              "{#true,&,*iv_outer_0/cond:0}");
-    EXPECT_EQ(predicate_map[ControlOutputFor(iv_inner_0.induction_var)],
-              "{(*iv_outer_0/cond:0 & {#true,&,*iv_outer_0/cond:0}),&,"
-              "*iv_inner_0/cond:0}");
-    EXPECT_EQ(predicate_map[ControlOutputFor(iv_outer_1.induction_var)],
-              "{#true,&,*iv_outer_1/cond:0}");
-    EXPECT_EQ(predicate_map[ControlOutputFor(iv_inner_1.induction_var)],
-              "{(*iv_outer_1/cond:0 & {#true,&,*iv_outer_1/cond:0}),&,"
-              "*iv_inner_1/cond:0}");
+    EXPECT_EQ(predicate_map[ControlOutputFor(outer_iv[0])],
+              "{#true,&,*iv_outer/cond:0}<outer_loop>");
+    EXPECT_EQ(predicate_map[ControlOutputFor(inner_iv[0])],
+              "{(*iv_outer/cond:0 & "
+              "{#true,&,*iv_outer/cond:0}<outer_loop>),&,*iv_inner/"
+              "cond:0}<inner_loop;outer_loop>");
+    EXPECT_EQ(predicate_map[ControlOutputFor(outer_iv[1])],
+              "{#true,&,*iv_outer/cond_1:0}<outer_loop>");
+    EXPECT_EQ(predicate_map[ControlOutputFor(inner_iv[1])],
+              "{(*iv_outer/cond_1:0 & "
+              "{#true,&,*iv_outer/cond_1:0}<outer_loop>),&,*iv_inner/"
+              "cond_1:0}<inner_loop;outer_loop>");
     EXPECT_EQ(predicate_map[ControlOutputFor(add0)],
-              "({(*iv_outer_1/cond:0 & {#true,&,*iv_outer_1/cond:0}),&,"
-              "*iv_inner_1/cond:0} & "
-              "{(*iv_outer_0/cond:0 & {#true,&,*iv_outer_0/cond:0}),&,"
-              "*iv_inner_0/cond:0})");
+              "({(*iv_outer/cond:0 & "
+              "{#true,&,*iv_outer/cond:0}<outer_loop>),&,*iv_inner/"
+              "cond:0}<inner_loop;outer_loop> & {(*iv_outer/cond_1:0 & "
+              "{#true,&,*iv_outer/cond_1:0}<outer_loop>),&,*iv_inner/"
+              "cond_1:0}<inner_loop;outer_loop>)");
+  }
+}
+
+TEST(DeadnessAnalysisTest, AndRecurrenceNeedsFrameName) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+  InductionVarInfo iv_0 = CreateInductionVariable(root, "iv_0", "frame_0", 10);
+  InductionVarInfo iv_1 = CreateInductionVariable(root, "iv_1", "frame_1", 9);
+
+  Output init = CreateSwitch(root, "init").output_true;
+  Output step = CreateSwitch(root, "step").output_true;
+
+  std::array<Output, 2> exits;
+  std::array<Output, 2> next_iterations;
+
+  for (int i : {0, 1}) {
+    Output init_enter = ops::internal::Enter(
+        root.WithOpName(absl::StrCat("init_enter_frame_", i)), init,
+        absl::StrCat("frame_", i),
+        ops::internal::Enter::Attrs().IsConstant(true));
+    Output step_enter = ops::internal::Enter(
+        root.WithOpName(absl::StrCat("step_enter_frame_", i)), step,
+        absl::StrCat("frame_", i),
+        ops::internal::Enter::Attrs().IsConstant(true));
+
+    ops::Merge iv(root.WithOpName(absl::StrCat("expr_", i)),
+                  {init_enter, init_enter});
+    Output add = ops::Add(root.WithOpName(absl::StrCat("add_", i)), iv.output,
+                          step_enter);
+    next_iterations[i] = ops::NextIteration(
+        root.WithOpName(absl::StrCat("expr_", i, "_next_iteration")), add);
+    EXPECT_TRUE(
+        root.graph()
+            ->UpdateEdge(next_iterations[i].node(), 0, iv.output.node(), 1)
+            .ok());
+    exits[i] = ops::internal::Exit(root.WithOpName(absl::StrCat("exit_", i)),
+                                   iv.output);
+  }
+
+  FixupSourceAndSinkEdges(root.graph());
+
+  {
+    PredicateMapTy predicate_map;
+    TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map));
+
+    EXPECT_NE(predicate_map[ControlOutputFor(exits[0])],
+              predicate_map[ControlOutputFor(exits[1])]);
+    EXPECT_NE(predicate_map[ControlOutputFor(exits[0])], "");
+    EXPECT_NE(predicate_map[ControlOutputFor(exits[1])], "");
+
+    EXPECT_NE(predicate_map[ControlOutputFor(next_iterations[0])],
+              predicate_map[ControlOutputFor(next_iterations[1])]);
+    EXPECT_NE(predicate_map[ControlOutputFor(next_iterations[0])], "");
+    EXPECT_NE(predicate_map[ControlOutputFor(next_iterations[1])], "");
   }
 }
 
@@ -818,5 +882,82 @@ TEST(DeadnessAnalysisTest, RecvVsSwitchText) {
   EXPECT_EQ(predicate_map[logical_and_output_0], "(recv:0 & *recv:0)");
 }
 
+TEST(DeadnessAnalysisTest, DeMorgan) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  Output cond_0 = ops::Placeholder(root.WithOpName("cond_0"), DT_BOOL);
+  Output cond_1 = ops::Placeholder(root.WithOpName("cond_1"), DT_BOOL);
+  Output value = ops::Placeholder(root.WithOpName("value"), DT_FLOAT);
+
+  ops::Switch sw_0(root.WithOpName("switch_0"), value, cond_0);
+  ops::Switch sw_1(root.WithOpName("switch_1"), value, cond_1);
+
+  Output and_0_1 =
+      ops::Add(root.WithOpName("and_0_1"), sw_0.output_true, sw_1.output_true);
+
+  Output or_not0_not1 = ops::Merge(root.WithOpName("or_not0_not1"),
+                                   {sw_0.output_false, sw_1.output_false})
+                            .output;
+
+  // Predicate(should_always_be_dead) =
+  // (A & B) & (~A | ~B) = (A & B) & ~(A & B) = False
+  Output should_always_be_dead =
+      ops::Add(root.WithOpName("should_always_be_dead"), and_0_1, or_not0_not1);
+
+  // Predicate(should_always_be_dead) =
+  // (A & B) | (~A | ~B) = (A & B) | ~(A & B) = True
+  Output should_always_be_alive =
+      ops::Merge(root.WithOpName("should_always_be_alive"),
+                 {and_0_1, or_not0_not1})
+          .output;
+
+  std::unique_ptr<DeadnessAnalysis> result;
+  TF_ASSERT_OK(AnalyzeDeadness(root.graph(), &result));
+
+  PredicateMapTy predicate_map;
+  TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map));
+
+  EXPECT_EQ(predicate_map[ControlOutputFor(should_always_be_dead)], "#false");
+  EXPECT_EQ(predicate_map[ControlOutputFor(should_always_be_alive)], "#true");
+}
+
+TEST(DeadnessAnalysisTest, ConstantTrueSwitchCondition) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  Output constant_true = ops::Const(root.WithOpName("const_true"), true);
+  Output value = ops::Placeholder(root.WithOpName("value"), DT_FLOAT);
+  ops::Switch sw(root.WithOpName("switch"), value, constant_true);
+
+  Output id_false = ops::Identity(root.WithOpName("id_false"), sw.output_false);
+  Output id_true = ops::Identity(root.WithOpName("id_true"), sw.output_true);
+
+  FixupSourceAndSinkEdges(root.graph());
+
+  PredicateMapTy predicate_map;
+  TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map));
+
+  EXPECT_EQ(predicate_map[ControlOutputFor(id_false)], "#false");
+  EXPECT_EQ(predicate_map[ControlOutputFor(id_true)], "#true");
+}
+
+TEST(DeadnessAnalysisTest, ConstantFalseSwitchCondition) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  Output constant_false = ops::Const(root.WithOpName("const_false"), false);
+  Output value = ops::Placeholder(root.WithOpName("value"), DT_FLOAT);
+  ops::Switch sw(root.WithOpName("switch"), value, constant_false);
+
+  Output id_false = ops::Identity(root.WithOpName("id_false"), sw.output_false);
+  Output id_true = ops::Identity(root.WithOpName("id_true"), sw.output_true);
+
+  FixupSourceAndSinkEdges(root.graph());
+
+  PredicateMapTy predicate_map;
+  TF_ASSERT_OK(ComputePredicates(*root.graph(), &predicate_map));
+
+  EXPECT_EQ(predicate_map[ControlOutputFor(id_false)], "#true");
+  EXPECT_EQ(predicate_map[ControlOutputFor(id_true)], "#false");
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
index f478832781cb1dc045d9163d4a6f5e5f64a8a705..d0d7a3f3785469acd79a83b6897668f94fc6ea2e 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
@@ -779,7 +779,8 @@ Status Encapsulator::Subgraph::RecordArg(
   if (inserted) {
     NodeDef arg_def;
     NodeDefBuilder builder(
-        absl::StrCat(src_node->name(), "_", src_slot, "_arg"), kArgOp);
+        absl::StrCat(src_node->name(), "_", src_slot, "_arg"), kArgOp,
+        NodeDebugInfo(src_node->def()));
     DataType dtype = edge->dst()->input_type(edge->dst_input());
     builder.Attr("T", dtype);
     builder.Attr("index", arg_index);
@@ -814,7 +815,8 @@ Status Encapsulator::Subgraph::RecordResult(
   if (inserted) {
     NodeDef ret_def;
     NodeDefBuilder builder(
-        absl::StrCat(src_node->name(), "_", src_slot, "_retval"), kRetValOp);
+        absl::StrCat(src_node->name(), "_", src_slot, "_retval"), kRetValOp,
+        NodeDebugInfo(src_node->def()));
     DataType dtype = src_node->output_type(src_slot);
     builder.Attr("T", dtype);
     builder.Attr("index", ret_index);
@@ -974,6 +976,7 @@ Status Encapsulator::Subgraph::AddHostComputes(
       }
 
       NodeDef host_compute_def;
+      // TODO(shikharagarwal): What source node should we use for errors?
       NodeDefBuilder builder(absl::StrCat("outside_compilation_",
                                           oc_subgraph_name, "_host_compute"),
                              kHostComputeOp);
@@ -1005,13 +1008,15 @@ Status Encapsulator::Subgraph::AddHostComputes(
       // subgraph.
       for (const auto& src_node : oc_subgraph.control_inputs) {
         Node* src_image = node_images.at(src_node);
-        graph_->AddControlEdge(src_image, host_compute);
+        graph_->AddControlEdge(src_image, host_compute,
+                               /* allow_duplicates= */ true);
       }
 
       // Connect the _HostCompute node to its ancestor host compute nodes.
       for (const auto& ancestor_name : host_compute_ancestors) {
         Node* ancestor = host_compute_node[ancestor_name];
-        graph_->AddControlEdge(ancestor, host_compute);
+        graph_->AddControlEdge(ancestor, host_compute,
+                               /* allow_duplicates= */ true);
       }
 
       // Connect the consumers in the subgraph to the _HostCompute node.
@@ -1028,7 +1033,8 @@ Status Encapsulator::Subgraph::AddHostComputes(
       // node.
       for (const auto& dst_node : oc_subgraph.control_outputs) {
         Node* dst_image = node_images.at(dst_node);
-        graph_->AddControlEdge(host_compute, dst_image);
+        graph_->AddControlEdge(host_compute, dst_image,
+                               /* allow_duplicates= */ true);
       }
     }
   }
@@ -1040,6 +1046,7 @@ Status Encapsulator::Subgraph::MakeSequencingNode(const string& subgraph_name,
                                                   Graph* graph_out) {
   if (sequencer_ == nullptr) {
     NodeDef seq_def;
+    // TODO(shikharagarwal): What source node should we use for errors?
     NodeDefBuilder builder(absl::StrCat(subgraph_name, "_sequencer"), "NoOp");
     builder.Attr(kXlaHostTransferSequencerAttr, subgraph_name);
     builder.Device(device_);
@@ -1055,7 +1062,8 @@ Status Encapsulator::Subgraph::MakeSequencingNode(const string& subgraph_name,
 void Encapsulator::Subgraph::ConnectSequencerToCallNode(Graph* graph_out) {
   if (sequencer_ != nullptr) {
     VLOG(2) << "ConnectSequencerToCallNode";
-    graph_out->AddControlEdge(sequencer_, call_node_);
+    graph_out->AddControlEdge(sequencer_, call_node_,
+                              /* allow_duplicates= */ true);
   }
 }
 
@@ -1214,7 +1222,8 @@ Status Encapsulator::Subgraph::AddHostComputeKeyPlaceholder(
   GraphDefBuilder::Options options(graph_out, /*status=*/nullptr);
   NodeDef key_def;
   NodeDefBuilder builder(
-      absl::StrCat(call_node_def_.name(), "_key_placeholder"), "Placeholder");
+      absl::StrCat(call_node_def_.name(), "_key_placeholder"), "Placeholder",
+      NodeDebugInfo(call_node_def_));
   builder.Attr("dtype", DT_STRING);
   builder.Attr("shape", shape_proto);
   builder.Attr("_host_compute_call_node", call_node_def_.name());
@@ -1248,6 +1257,7 @@ Status Encapsulator::Subgraph::AddRecvAtHostNode(
   }
 
   NodeDef recv_def;
+  // TODO(shikharagarwal): What source node should we use for errors?
   NodeDefBuilder builder(absl::StrCat("outside_compilation_", subgraph_name,
                                       "_", oc_subgraph_name, "_recv"),
                          kRecvAtHostOp);
@@ -1273,7 +1283,8 @@ Status Encapsulator::Subgraph::AddRecvAtHostNode(
   // completes. This has no effect on execution order but prevents the
   // RecvAtHost being pruned.
   TF_RETURN_IF_ERROR(MakeSequencingNode(subgraph_name, graph_out));
-  graph_out->AddControlEdge(oc_subgraph->recv_at_host, sequencer_);
+  graph_out->AddControlEdge(oc_subgraph->recv_at_host, sequencer_,
+                            true /* skip duplicates check */);
 
   return Status::OK();
 }
@@ -1303,6 +1314,7 @@ Status Encapsulator::Subgraph::AddSendFromHostNode(
   }
 
   NodeDef send_def;
+  // TODO(shikharagarwal): What source node should we use for errors?
   NodeDefBuilder builder(absl::StrCat("outside_compilation_", subgraph_name,
                                       "_", oc_subgraph_name, "_send"),
                          kSendFromHostOp);
@@ -1329,7 +1341,8 @@ Status Encapsulator::Subgraph::AddSendFromHostNode(
   // subgraph completes. This has no effect on execution order but prevents the
   // RecvAtHost being pruned.
   TF_RETURN_IF_ERROR(MakeSequencingNode(subgraph_name, graph_out));
-  graph_out->AddControlEdge(oc_subgraph->send_from_host, sequencer_);
+  graph_out->AddControlEdge(oc_subgraph->send_from_host, sequencer_,
+                            /* allow_duplicates= */ true);
 
   return Status::OK();
 }
@@ -1439,7 +1452,8 @@ Status Encapsulator::CopySubgraphEdges(
         src_func_id == dst_func_id) {
       Graph* g = subgraphs_[src_func_id].GetGraph();
       if (edge->IsControlEdge()) {
-        g->AddControlEdge(src_image, dst_image);
+        g->AddControlEdge(src_image, dst_image,
+                          /* allow_duplicates= */ true);
       } else {
         g->AddEdge(src_image, edge->src_output(), dst_image, edge->dst_input());
       }
@@ -1725,7 +1739,8 @@ Status Encapsulator::CopyEdgeToOutputGraph(
     if (edges_added
             ->emplace(OutputTensor(src_image, -1), InputTensor(dst_image, -1))
             .second) {
-      graph_out->AddControlEdge(src_image, dst_image);
+      graph_out->AddControlEdge(src_image, dst_image,
+                                /* allow_duplicates= */ true);
     }
 
     return Status::OK();
@@ -1754,7 +1769,8 @@ Status Encapsulator::AddCallNodeDependencies(Graph* graph_out) {
     const string& subgraph = ancestors.first;
     for (const string& ancestor : ancestors.second) {
       graph_out->AddControlEdge(subgraphs_[ancestor].GetCallNode(),
-                                subgraphs_[subgraph].GetCallNode());
+                                subgraphs_[subgraph].GetCallNode(),
+                                /* allow_duplicates= */ true);
     }
   }
   return Status::OK();
@@ -1833,8 +1849,9 @@ Node* AddDummyShapedNode(const Node* src_node, int src_port,
   // Add any Enter nodes required to bring the constant to the correct control
   // flow frame.
   while (!control_flow_info[src_node->id()].frame_name.empty()) {
+    NodeDebugInfo debug_info(*src_node);
     NodeBuilder enter_builder(options.GetNameForOp("Enter"), "Enter",
-                              options.op_registry());
+                              options.op_registry(), &debug_info);
     enter_builder.Attr("frame_name",
                        control_flow_info[src_node->id()].frame_name);
     enter_builder.Attr("is_constant", true);
@@ -2018,7 +2035,8 @@ Status Encapsulator::DoStaticShapeInferenceForOutsideCompilationSend(
             return errors::InvalidArgument(
                 "Shape inference is not possible for outside_compilation "
                 "SendFromHost node ",
-                send_node->name(), " because shape of node ", n->name(),
+                send_node->name(), " because shape of node ",
+                FormatNodeForError(*n),
                 " will not be known at compilation time.");
           }
         }
@@ -2047,8 +2065,7 @@ Status Encapsulator::DoStaticShapeInferenceForOutsideCompilationSend(
         return errors::Internal(
             "Internal assumption failed while rewriting an outside_compilation "
             "cluster that contains a while loop. Logic assumes back-edge is to "
-            "port 1 of a 2-input "
-            "Merge node.");
+            "port 1 of a 2-input Merge node.");
       }
       // Connect the existing edge to both inputs of the Merge node so that the
       // graph will be well-formed.
@@ -2121,7 +2138,8 @@ Status CheckClusterDependencyForCycles(
     const string& ancestor, const string& successor,
     const std::unordered_map<string, std::unordered_set<string>>& ancestors,
     const std::unordered_map<Node*, PathDetails>& node_ancestors_map,
-    GraphCycles* cycle_detector, std::map<string, int>* cycle_detector_map) {
+    GraphCycles* cycle_detector,
+    std::unordered_map<string, int>* cycle_detector_map) {
   if (cycle_detector_map->find(ancestor) == cycle_detector_map->end()) {
     (*cycle_detector_map)[ancestor] = cycle_detector->NewNode();
   }
@@ -2165,7 +2183,7 @@ Status Encapsulator::FindClusterDependencies() {
   // We check that clusters are acyclic using this cycle detector.
   GraphCycles cycle_detector;
   // Map from cluster name to cycle detector node id.
-  std::map<string, int> cycle_detector_map;
+  std::unordered_map<string, int> cycle_detector_map;
   // Process the nodes in topologically-sorted order.
   std::vector<Node*> nodes;
   GetReversePostOrder(*graph_in_, &nodes);
@@ -2527,7 +2545,33 @@ Status EncapsulateSubgraphsPass::Run(
             std::vector<int>* input_permutation,
             std::vector<int>* output_permutation, NodeDef* node) {
         // Optimize the subgraph.
-        OptimizeGraph(flr, subgraph);
+        // Do not constant fold nodes that output DT_VARIANT type tensors.
+        // XLA does not support Const nodes of Variant type since it needs
+        // to know the original ops to be able to compile them to the relevant
+        // XLA form.
+        // TODO(srbs): This filter is a little conservative. E.g. a subgraph of
+        // the form:
+        //                          Const
+        //                            |
+        // EmptyTensorList -> TensorListPushBack -> TensorListPopBack -> Op
+        //                                                  |
+        //                                        (Discard popped list)
+        //
+        // Would have been reduced to "Const -> Op" without this filter.
+        // However since we are only allowed to specify the filter at the "Node"
+        // level there is no good way to allow the above behavior. So we
+        // disallow any sort of constant folding on Variant nodes for now.
+        auto cf_consider_fn = [](const Node* n) {
+          for (const auto& output_arg : n->op_def().output_arg()) {
+            if (output_arg.type() == DT_VARIANT) {
+              return false;
+            }
+          }
+          return true;
+        };
+        GraphOptimizer::Options graph_optimizer_options;
+        graph_optimizer_options.cf_consider_fn = cf_consider_fn;
+        OptimizeGraph(flr, subgraph, graph_optimizer_options);
 
         const int num_args = input_permutation->size();
         std::vector<bool> const_args(num_args);
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
index de89be9a3555960dabe7bacd17226c15ae888ae6..261519de3478c8b3e30d206a15944b5a686598e2 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
@@ -25,6 +25,8 @@ limitations under the License.
 #include "tensorflow/compiler/jit/encapsulate_util.h"
 #include "tensorflow/compiler/jit/extract_outside_compilation_pass.h"
 #include "tensorflow/compiler/tf2xla/side_effect_util.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/framework/graph_to_functiondef.h"
 #include "tensorflow/core/graph/graph_constructor.h"
@@ -32,6 +34,8 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/public/version.h"
 #include "tensorflow/core/util/equal_graph_def.h"
 
 namespace tensorflow {
@@ -299,26 +303,10 @@ REGISTER_OP("XlaHostCompute")
     .Attr("Toutputs: list(type) >= 0")
     .Attr("ancestors: list(string) >= 0")
     .Attr("key: string")
-    .Attr("shape_inference_graph: string = ''")
+    .Attr("shape_inference_graph: func")
     .Attr("shapes: list(shape) >= 0")
     .SetShapeFn(::tensorflow::shape_inference::UnknownShape);
 
-REGISTER_OP("_XlaSendFromHost")
-    .Input("inputs: Tinputs")
-    .Input("dynamic_key: string")
-    .Attr("Tinputs: list(type) >= 0")
-    .Attr("key: string")
-    .Attr("device_ordinal: int")
-    .SetShapeFn(::tensorflow::shape_inference::UnknownShape);
-
-REGISTER_OP("_XlaRecvAtHost")
-    .Input("dynamic_key: string")
-    .Output("outputs: Toutputs")
-    .Attr("Toutputs: list(type) >= 0")
-    .Attr("key: string")
-    .Attr("device_ordinal: int")
-    .SetShapeFn(::tensorflow::shape_inference::UnknownShape);
-
 REGISTER_OP("InputTest")
     .Output("o: float")
     .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
@@ -510,12 +498,20 @@ Status Encapsulate(GraphDef* graphdef, FunctionDefLibrary* library,
   s = ConvertGraphDefToGraph(options, *graphdef, graph.get());
   if (!s.ok()) return s;
 
-  s = PerformStaticShapeInferenceBeforeEncapsulation(
-      graph.get(), "_encapsulate", "_outside");
+  s = PerformStaticShapeInferenceBeforeEncapsulation(graph.get());
   if (!s.ok()) return s;
 
-  s = PreprocessForEncapsulation(graph.get(), "_encapsulate", "_outside");
-  if (!s.ok()) return s;
+  // Create FunctionLibraryRuntime.
+  SessionOptions session_options;
+  std::vector<std::unique_ptr<Device>> devices;
+  TF_CHECK_OK(DeviceFactory::AddDevices(
+      session_options, "/job:localhost/replica:0/task:0", &devices));
+  OptimizerOptions opts;
+  auto device_mgr = absl::make_unique<DeviceMgr>(std::move(devices));
+  auto pflr = absl::make_unique<ProcessFunctionLibraryRuntime>(
+      device_mgr.get(), Env::Default(), TF_GRAPH_DEF_VERSION, lib_def.get(),
+      opts, /*default_thread_pool=*/nullptr, /*cluster_flr=*/nullptr);
+  auto flr = pflr->GetFLR("/job:localhost/replica:0/task:0/cpu:0");
 
   std::unique_ptr<Graph> graph_out;
   s = EncapsulateSubgraphsInFunctions(
@@ -542,7 +538,7 @@ Status Encapsulate(GraphDef* graphdef, FunctionDefLibrary* library,
                                     std::map<string, int>{}});
   }
   s = ExtractOutsideCompilation("_encapsulate", "_outside", clusters,
-                                graph_out.get(), lib_def.get());
+                                graph_out.get(), flr, lib_def.get());
   if (!s.ok()) return s;
 
   GraphDef graphdef_out;
@@ -550,6 +546,14 @@ Status Encapsulate(GraphDef* graphdef, FunctionDefLibrary* library,
   graphdef->Swap(&graphdef_out);
 
   *library = lib_def->ToProto();
+  // Remove "_xla_inferred_shapes" attr. They are added by
+  // `PerformStaticShapeInferenceBeforeEncapsulation`.
+  for (FunctionDef& fdef : *library->mutable_function()) {
+    for (NodeDef& node_def : *fdef.mutable_node_def()) {
+      node_def.mutable_attr()->erase("_xla_inferred_shapes");
+    }
+  }
+
   return s;
 }
 
@@ -901,18 +905,22 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) {
   {
     GraphDefBuilder shape(GraphDefBuilder::kFailImmediately);
     Node* key_constant = KeyPlaceholder("F1", shape.opts());
-    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
-                            {DT_FLOAT, DT_FLOAT}, shape.opts());
+    Node* recv = RecvAtHost(
+        ops::NodeOut(key_constant, 0), "F1", "O1", {DT_FLOAT, DT_FLOAT},
+        shape.opts().WithAttr(kXlaHasHostTransferAttrName, true));
     Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1),
                      shape.opts()
                          .WithName("E")
                          .WithAttr("_encapsulate", "F1")
                          .WithAttr("_outside", "O1"));
-    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape.opts());
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
+                 shape.opts().WithAttr(kXlaHasHostTransferAttrName, true));
     TF_EXPECT_OK(
         AddGraphDefToFunctionLibrary(shape, "F1_O1", &library_expected));
   }
 
+  NameAttrList shape_inference_graph;
+  shape_inference_graph.set_name("_outside_compilation_shape_inference_F1_O1");
   *library_expected.add_function() = test::function::XTimesTwo();
   *library_expected.add_function() = FunctionDefHelper::Create(
       "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval_retval:float"}, {},
@@ -931,10 +939,11 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) {
             {"Toutputs", absl::Span<const DataType>({DT_FLOAT})},
             {"ancestors", absl::Span<const string>({})},
             {"key", "host_compute_channel_F1_O1"},
-            {"shape_inference_graph",
-             "_outside_compilation_shape_inference_F1_O1"},
+            {"shape_inference_graph", shape_inference_graph},
             {"shapes", absl::Span<const DataType>({})},
-            {"_outside_compilation_subgraph", "O1"}},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}},
            {"c"}},
       },
       {{"f_0_retval_retval", "F:o:0"}});
@@ -948,16 +957,18 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) {
 
     Node* key_constant =
         KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
-    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
-                            {DT_FLOAT, DT_FLOAT}, b2.opts());
+    Node* recv = RecvAtHost(
+        ops::NodeOut(key_constant, 0), "F1", "O1", {DT_FLOAT, DT_FLOAT},
+        b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
     Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1),
                      b2.opts()
                          .WithName("E")
-                         .WithControlInputs({recv, b})
+                         .WithControlInputs({recv})
                          .WithAttr("_encapsulate", "F1")
                          .WithAttr("_outside", "O1"));
     Node* send = SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
-                              b2.opts().WithControlInput(e));
+                              b2.opts().WithControlInput(e).WithAttr(
+                                  kXlaHasHostTransferAttrName, true));
 
     Node* s = Sequencer(
         b2.opts().WithName("F1_sequencer").WithControlInputs({recv, send}),
@@ -966,9 +977,9 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) {
     NodeBuilder node_builder("F1", "F1", lib_def.get());
     node_builder.Input(a).Input(b);
     Node* call =
-        b2.opts().WithControlInputs({s}).FinalizeBuilder(&node_builder);
+        b2.opts().WithControlInputs({s, b}).FinalizeBuilder(&node_builder);
 
-    Binary(a, call, b2.opts().WithName("G").WithControlInputs({e}));
+    Binary(a, call, b2.opts().WithName("G").WithControlInputs({call}));
     TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
   }
 
@@ -1022,14 +1033,16 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
   {
     GraphDefBuilder shape1(GraphDefBuilder::kFailImmediately);
     Node* key_constant = KeyPlaceholder("F1", shape1.opts());
-    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
-                            {DT_FLOAT, DT_FLOAT}, shape1.opts());
+    Node* recv = RecvAtHost(
+        ops::NodeOut(key_constant, 0), "F1", "O1", {DT_FLOAT, DT_FLOAT},
+        shape1.opts().WithAttr(kXlaHasHostTransferAttrName, true));
     Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1),
                      shape1.opts()
                          .WithName("E")
                          .WithAttr("_encapsulate", "F1")
                          .WithAttr("_outside", "O1"));
-    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape1.opts());
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
+                 shape1.opts().WithAttr(kXlaHasHostTransferAttrName, true));
     TF_EXPECT_OK(
         AddGraphDefToFunctionLibrary(shape1, "F1_O1", &library_expected));
   }
@@ -1037,33 +1050,45 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
   {
     GraphDefBuilder shape2(GraphDefBuilder::kFailImmediately);
     Node* key_constant = KeyPlaceholder("F1", shape2.opts());
-    Node* recv1 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
-                             {DT_FLOAT, DT_FLOAT}, shape2.opts());
+    Node* recv1 = RecvAtHost(
+        ops::NodeOut(key_constant, 0), "F1", "O1", {DT_FLOAT, DT_FLOAT},
+        shape2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
     Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1),
                      shape2.opts()
                          .WithName("E")
                          .WithAttr("_encapsulate", "F1")
                          .WithAttr("_outside", "O1"));
-    Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2",
-                             {DT_FLOAT, DT_FLOAT}, shape2.opts());
+    Node* recv2 = RecvAtHost(
+        ops::NodeOut(key_constant, 0), "F1", "O2", {DT_FLOAT, DT_FLOAT},
+        shape2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    Node* g = Binary(e, ops::NodeOut(recv2, 0),
+                     shape2.opts()
+                         .WithName("G")
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O2"));
     Node* h = Binary(ops::NodeOut(recv2, 1), e,
                      shape2.opts()
                          .WithName("H")
                          .WithAttr("_encapsulate", "F1")
                          .WithAttr("_outside", "O2"));
-    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O2", {h}, shape2.opts());
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O2", {g, h},
+                 shape2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
     TF_EXPECT_OK(
         AddGraphDefToFunctionLibrary(shape2, "F1_O2", &library_expected));
   }
 
+  NameAttrList shape_inference_graph1, shape_inference_graph2;
+  shape_inference_graph1.set_name("_outside_compilation_shape_inference_F1_O1");
+  shape_inference_graph2.set_name("_outside_compilation_shape_inference_F1_O2");
   *library_expected.add_function() = FunctionDefHelper::Create(
-      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"i_0_retval_retval:float"}, {},
+      "F1", {"a_0_arg:float", "b_0_arg:float"},
+      {"g_0_retval_retval:float", "i_0_retval_retval:float"}, {},
       {
           {{"C"}, "UnaryTest", {"a_0_arg"}},
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}, {}},
           {{"I"},
            "UnaryTest",
-           {"outside_compilation_O2_host_compute:outputs:0"}},
+           {"outside_compilation_O2_host_compute:outputs:1"}},
           {{"F"},
            "BinaryTest",
            {"C:o:0", "outside_compilation_O1_host_compute:outputs:0"},
@@ -1073,13 +1098,14 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
            "XlaHostCompute",
            {"F:o:0", "D:o:0"},
            {{"Tinputs", absl::Span<const DataType>({DT_FLOAT, DT_FLOAT})},
-            {"Toutputs", absl::Span<const DataType>({DT_FLOAT})},
+            {"Toutputs", absl::Span<const DataType>({DT_FLOAT, DT_FLOAT})},
             {"ancestors", absl::Span<const string>({})},
             {"key", "host_compute_channel_F1_O2"},
-            {"shape_inference_graph",
-             "_outside_compilation_shape_inference_F1_O2"},
+            {"shape_inference_graph", shape_inference_graph2},
             {"shapes", absl::Span<const DataType>({})},
-            {"_outside_compilation_subgraph", "O2"}},
+            {"_outside_compilation_subgraph", "O2"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}},
            {"F"}},
           {{"outside_compilation_O1_host_compute"},
            "XlaHostCompute",
@@ -1088,13 +1114,15 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
             {"Toutputs", absl::Span<const DataType>({DT_FLOAT})},
             {"ancestors", absl::Span<const string>({})},
             {"key", "host_compute_channel_F1_O1"},
-            {"shape_inference_graph",
-             "_outside_compilation_shape_inference_F1_O1"},
+            {"shape_inference_graph", shape_inference_graph1},
             {"shapes", absl::Span<const DataType>({})},
-            {"_outside_compilation_subgraph", "O1"}},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}},
            {"D"}},
       },
-      {{"i_0_retval_retval", "I:o:0"}});
+      {{"g_0_retval_retval", "outside_compilation_O2_host_compute:outputs:0"},
+       {"i_0_retval_retval", "I:o:0"}});
 
   {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
@@ -1105,19 +1133,22 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
 
     Node* key_constant =
         KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
-    Node* recv1 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
-                             {DT_FLOAT, DT_FLOAT}, b2.opts());
+    Node* recv1 = RecvAtHost(
+        ops::NodeOut(key_constant, 0), "F1", "O1", {DT_FLOAT, DT_FLOAT},
+        b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
     Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1),
                      b2.opts()
                          .WithName("E")
-                         .WithControlInputs({recv1, b})
+                         .WithControlInputs({recv1})
                          .WithAttr("_encapsulate", "F1")
                          .WithAttr("_outside", "O1"));
     Node* send1 = SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
-                               b2.opts().WithControlInput(e));
+                               b2.opts().WithControlInput(e).WithAttr(
+                                   kXlaHasHostTransferAttrName, true));
 
-    Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2",
-                             {DT_FLOAT, DT_FLOAT}, b2.opts());
+    Node* recv2 = RecvAtHost(
+        ops::NodeOut(key_constant, 0), "F1", "O2", {DT_FLOAT, DT_FLOAT},
+        b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
     Node* g = Binary(e, ops::NodeOut(recv2, 0),
                      b2.opts()
                          .WithName("G")
@@ -1130,7 +1161,8 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
                          .WithAttr("_encapsulate", "F1")
                          .WithAttr("_outside", "O2"));
     Node* send2 =
-        SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O2", {h}, b2.opts());
+        SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O2", {g, h},
+                     b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
 
     Node* s = Sequencer(b2.opts()
                             .WithName("F1_sequencer")
@@ -1139,12 +1171,13 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
 
     NodeBuilder node_builder("F1", "F1", lib_def.get());
     node_builder.Input(a).Input(b);
-    Node* call = b2.opts().WithControlInput(s).FinalizeBuilder(&node_builder);
+    Node* call =
+        b2.opts().WithControlInputs({s, b}).FinalizeBuilder(&node_builder);
 
-    Binary(g, call, b2.opts().WithName("J"));
+    Binary(ops::NodeOut(call, 0), ops::NodeOut(call, 1),
+           b2.opts().WithName("J"));
     TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
   }
-
   TF_EXPECT_GRAPH_EQ(graphdef_expected, graphdef);
   TF_EXPECT_FUNCTIONDEFLIBRARY_EQ(library_expected, library);
 }
@@ -1196,7 +1229,9 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
 
   *library_expected.add_function() = FunctionDefHelper::Create(
       "F1", {"a_0_arg:float", "b_0_arg:float"},
-      {"f_0_retval_retval:float", "d_0_retval_retval:float"}, {},
+      {"e_0_retval_retval:float", "f_0_retval_retval:float",
+       "d_0_retval_retval:float"},
+      {},
       {
           {{"C"}, "UnaryTest", {"a_0_arg"}},
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
@@ -1212,35 +1247,41 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
             {"Toutputs", absl::Span<const DataType>({DT_FLOAT})},
             {"ancestors", absl::Span<const string>({})},
             {"key", "host_compute_channel_F1_O1"},
-            {"shape_inference_graph", ""},
+            {"shape_inference_graph", NameAttrList()},
             {"shapes",
              absl::Span<const TensorShapeProto>({shape_proto_expected})},
-            {"_outside_compilation_subgraph", "O1"}},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}},
            {"D"}},
       },
-      {{"d_0_retval_retval", "D:o:0"}, {"f_0_retval_retval", "F:o:0"}});
+      {{"e_0_retval_retval", "outside_compilation_O1_host_compute:outputs:0"},
+       {"d_0_retval_retval", "D:o:0"},
+       {"f_0_retval_retval", "F:o:0"}});
 
   *library_expected.add_function() = FunctionDefHelper::Create(
-      "F2", {"f_0_arg:float", "bridge_e_g_0_arg:float"},
-      {"i_0_retval_retval:float", "g_0_retval_retval:float"}, {},
+      "F2", {"e_0_arg:float", "f_0_arg:float", "d_0_arg:float"},
+      {"g_0_retval_retval:float", "i_0_retval_retval:float"}, {},
       {
-          {{"G"}, "BinaryTest", {"bridge_e_g_0_arg", "f_0_arg"}},
+          {{"G"}, "BinaryTest", {"e_0_arg", "f_0_arg"}},
           {{"I"},
            "BinaryTest",
            {"f_0_arg", "outside_compilation_O1_host_compute:outputs:0"}},
           {{"outside_compilation_O1_host_compute"},
            "XlaHostCompute",
-           {"G:o:0"},
-           {{"Tinputs", absl::Span<const DataType>({DT_FLOAT})},
+           {"d_0_arg", "G:o:0"},
+           {{"Tinputs", absl::Span<const DataType>({DT_FLOAT, DT_FLOAT})},
             {"Toutputs", absl::Span<const DataType>({DT_FLOAT})},
             {"ancestors", absl::Span<const string>({})},
             {"key", "host_compute_channel_F2_O1"},
-            {"shape_inference_graph", ""},
+            {"shape_inference_graph", NameAttrList()},
             {"shapes",
              absl::Span<const TensorShapeProto>({shape_proto_expected})},
-            {"_outside_compilation_subgraph", "O1"}}},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}}},
       },
-      {{"i_0_retval_retval", "I:o:0"}, {"g_0_retval_retval", "G:o:0"}});
+      {{"g_0_retval_retval", "G:o:0"}, {"i_0_retval_retval", "I:o:0"}});
 
   {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
@@ -1251,16 +1292,18 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
 
     Node* key_constant1 =
         KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
-    Node* recv1 = RecvAtHost(ops::NodeOut(key_constant1, 0), "F1", "O1",
-                             {DT_FLOAT, DT_FLOAT}, b2.opts());
+    Node* recv1 = RecvAtHost(
+        ops::NodeOut(key_constant1, 0), "F1", "O1", {DT_FLOAT, DT_FLOAT},
+        b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
     Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1),
                      b2.opts()
                          .WithName("E")
-                         .WithControlInputs({recv1, b})
+                         .WithControlInputs({recv1})
                          .WithAttr("_encapsulate", "F1")
                          .WithAttr("_outside", "O1"));
     Node* send1 = SendFromHost(ops::NodeOut(key_constant1, 0), "F1", "O1", {e},
-                               b2.opts().WithControlInput(e));
+                               b2.opts().WithControlInput(e).WithAttr(
+                                   kXlaHasHostTransferAttrName, true));
     Node* s1 = Sequencer(
         b2.opts().WithName("F1_sequencer").WithControlInputs({recv1, send1}),
         "F1");
@@ -1268,29 +1311,33 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
     NodeBuilder node_builder1("F1", "F1", lib_def.get());
     node_builder1.Input(a).Input(b);
     Node* call1 =
-        b2.opts().WithControlInput(s1).FinalizeBuilder(&node_builder1);
+        b2.opts().WithControlInputs({s1, b}).FinalizeBuilder(&node_builder1);
 
     Node* key_constant2 =
         KeyPlaceholder("F2", b2.opts().WithName("F2_key_placeholder"));
-    Node* recv2 = RecvAtHost(ops::NodeOut(key_constant2, 0), "F2", "O1",
-                             {DT_FLOAT}, b2.opts());
-    Node* h = Binary(ops::NodeOut(call1, 1), recv2,
+    Node* recv2 = RecvAtHost(
+        ops::NodeOut(key_constant2, 0), "F2", "O1", {DT_FLOAT, DT_FLOAT},
+        b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    Node* h = Binary(recv2, ops::NodeOut(recv2, 1),
                      b2.opts()
                          .WithName("H")
                          .WithAttr("_encapsulate", "F2")
                          .WithAttr("_outside", "O1"));
-    Node* send2 = SendFromHost(ops::NodeOut(key_constant2, 0), "F2", "O1", {h},
-                               b2.opts());
+    Node* send2 =
+        SendFromHost(ops::NodeOut(key_constant2, 0), "F2", "O1", {h},
+                     b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
 
     Node* s2 = Sequencer(
         b2.opts().WithName("F2_sequencer").WithControlInputs({recv2, send2}),
         "F2");
     NodeBuilder node_builder2("F2", "F2", lib_def.get());
-    node_builder2.Input(call1).Input(e);
+    node_builder2.Input(call1)
+        .Input(ops::NodeOut(call1, 1))
+        .Input(ops::NodeOut(call1, 2));
     Node* call2 = b2.opts()
-                      .WithControlInputs({s2, e, call1})
+                      .WithControlInputs({s2, call1})
                       .FinalizeBuilder(&node_builder2);
-    Binary(ops::NodeOut(call2, 1), call2, b2.opts().WithName("J"));
+    Binary(call2, ops::NodeOut(call2, 1), b2.opts().WithName("J"));
     TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
   }
 
@@ -1326,8 +1373,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) {
     Node* h = Unary(g, b1.opts()
                            .WithName("H")
                            .WithAttr("_encapsulate", "F2")
-                           .WithAttr("_outside", "O1")
-                           .WithControlInput(e));
+                           .WithAttr("_outside", "O1"));
     Node* i = Unary(h, b1.opts().WithName("I").WithAttr("_encapsulate", "F2"));
     Binary(f, i, b1.opts().WithName("J"));
     TF_EXPECT_OK(b1.ToGraphDef(&graphdef));
@@ -1358,10 +1404,12 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) {
             {"Toutputs", absl::Span<const DataType>({DT_FLOAT})},
             {"ancestors", absl::Span<const string>({})},
             {"key", "host_compute_channel_F1_O1"},
-            {"shape_inference_graph", ""},
+            {"shape_inference_graph", NameAttrList()},
             {"shapes",
              absl::Span<const TensorShapeProto>({shape_proto_expected})},
-            {"_outside_compilation_subgraph", "O1"}},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}},
            {"D"}},
       },
       {{"f_0_retval_retval", "F:o:0"}});
@@ -1380,10 +1428,12 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) {
             {"Toutputs", absl::Span<const DataType>({DT_FLOAT})},
             {"ancestors", absl::Span<const string>({})},
             {"key", "host_compute_channel_F2_O1"},
-            {"shape_inference_graph", ""},
+            {"shape_inference_graph", NameAttrList()},
             {"shapes",
              absl::Span<const TensorShapeProto>({shape_proto_expected})},
-            {"_outside_compilation_subgraph", "O1"}}},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}}},
       },
       {{"i_0_retval_retval", "I:o:0"}});
 
@@ -1401,7 +1451,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) {
     Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1),
                      b2.opts()
                          .WithName("E")
-                         .WithControlInputs({recv1, b})
+                         .WithControlInputs({recv1})
                          .WithAttr("_encapsulate", "F1")
                          .WithAttr("_outside", "O1"));
     Node* send1 = SendFromHost(ops::NodeOut(key_constant1, 0), "F1", "O1", {e},
@@ -1413,7 +1463,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) {
     NodeBuilder node_builder1("F1", "F1", lib_def.get());
     node_builder1.Input(a).Input(b);
     Node* call1 =
-        b2.opts().WithControlInput(s1).FinalizeBuilder(&node_builder1);
+        b2.opts().WithControlInputs({s1, b}).FinalizeBuilder(&node_builder1);
 
     Node* key_constant2 =
         KeyPlaceholder("F2", b2.opts().WithName("F2_key_placeholder"));
@@ -1422,8 +1472,7 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) {
     Node* h = Unary(recv2, b2.opts()
                                .WithName("H")
                                .WithAttr("_encapsulate", "F2")
-                               .WithAttr("_outside", "O1")
-                               .WithControlInput(e));
+                               .WithAttr("_outside", "O1"));
     Node* send2 = SendFromHost(ops::NodeOut(key_constant2, 0), "F2", "O1", {h},
                                b2.opts());
 
@@ -1484,15 +1533,17 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) {
            {"D:o:0", "outside_compilation_O1_host_compute:outputs:0"}},
           {{"outside_compilation_O1_host_compute"},
            "XlaHostCompute",
-           {},
-           {{"Tinputs", absl::Span<const DataType>({})},
+           {"a_0_arg"},
+           {{"Tinputs", absl::Span<const DataType>({DT_FLOAT})},
             {"Toutputs", absl::Span<const DataType>({DT_FLOAT})},
             {"ancestors", absl::Span<const string>({})},
             {"key", "host_compute_channel_F1_O1"},
-            {"shape_inference_graph", ""},
+            {"shape_inference_graph", NameAttrList()},
             {"shapes",
              absl::Span<const TensorShapeProto>({shape_proto_expected})},
-            {"_outside_compilation_subgraph", "O1"}}},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}}},
       },
       {{"f_0_retval_retval", "F:o:0"}});
 
@@ -1503,16 +1554,19 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) {
     Node* a = InputShaped(b2.opts().WithName("A"));
     Node* b = Input(b2.opts().WithName("B"));
 
-    Node* e = Unary(a, b2.opts()
-                           .WithName("E")
-                           .WithAttr("_encapsulate", "F1")
-                           .WithAttr("_outside", "O1"));
     Node* key_constant =
         KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
+    Node* recv1 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                             {DT_FLOAT}, b2.opts());
+    Node* e = Unary(recv1, b2.opts()
+                               .WithName("E")
+                               .WithAttr("_encapsulate", "F1")
+                               .WithAttr("_outside", "O1"));
     Node* send1 =
         SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, b2.opts());
     Node* s1 = Sequencer(
-        b2.opts().WithName("F1_sequencer").WithControlInput(send1), "F1");
+        b2.opts().WithName("F1_sequencer").WithControlInputs({send1, recv1}),
+        "F1");
     NodeBuilder node_builder1("F1", "F1", lib_def.get());
     node_builder1.Input(a).Input(b);
     Node* call1 =
@@ -1569,15 +1623,17 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) {
            {"D:o:0", "outside_compilation_O1_host_compute:outputs:0"}},
           {{"outside_compilation_O1_host_compute"},
            "XlaHostCompute",
-           {},
-           {{"Tinputs", absl::Span<const DataType>({})},
+           {"a_0_arg"},
+           {{"Tinputs", absl::Span<const DataType>({DT_FLOAT})},
             {"Toutputs", absl::Span<const DataType>({DT_FLOAT})},
             {"ancestors", absl::Span<const string>({})},
             {"key", "host_compute_channel_F1_O1"},
-            {"shape_inference_graph", ""},
+            {"shape_inference_graph", NameAttrList()},
             {"shapes",
              absl::Span<const TensorShapeProto>({shape_proto_expected})},
-            {"_outside_compilation_subgraph", "O1"}},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}},
            {"D"}},
       },
       {{"f_0_retval_retval", "F:o:0"}});
@@ -1591,13 +1647,13 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) {
 
     Node* key_constant =
         KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
-    Node* recv1 =
-        RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1", {}, b2.opts());
-    Node* e = Unary(a, b2.opts()
-                           .WithName("E")
-                           .WithControlInput(recv1)
-                           .WithAttr("_encapsulate", "F1")
-                           .WithAttr("_outside", "O1"));
+    Node* recv1 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                             {DT_FLOAT}, b2.opts());
+    Node* e = Unary(recv1, b2.opts()
+                               .WithName("E")
+                               .WithControlInput(recv1)
+                               .WithAttr("_encapsulate", "F1")
+                               .WithAttr("_outside", "O1"));
     Node* send1 =
         SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, b2.opts());
     Node* s1 = Sequencer(
@@ -1644,8 +1700,27 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) {
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
 
+  {
+    GraphDefBuilder shape1(GraphDefBuilder::kFailImmediately);
+    Node* key_constant = KeyPlaceholder("F1", shape1.opts());
+    Node* recv1 =
+        RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1", {DT_FLOAT},
+                   shape1.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    Node* e = Unary(ops::NodeOut(recv1, 0), shape1.opts()
+                                                .WithName("E")
+                                                .WithAttr("_encapsulate", "F1")
+                                                .WithAttr("_outside", "O1"));
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
+                 shape1.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    TF_EXPECT_OK(
+        AddGraphDefToFunctionLibrary(shape1, "F1_O1", &library_expected));
+  }
+
+  NameAttrList shape_inference_graph;
+  shape_inference_graph.set_name("_outside_compilation_shape_inference_F1_O1");
   *library_expected.add_function() = FunctionDefHelper::Create(
-      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval_retval:float"}, {},
+      "F1", {"a_0_arg:float", "b_0_arg:float"},
+      {"e_0_retval_retval:float", "f_0_retval_retval:float"}, {},
       {
           {{"C"}, "UnaryTest", {"a_0_arg"}},
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
@@ -1654,14 +1729,17 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) {
            "XlaHostCompute",
            {"D:o:0"},
            {{"Tinputs", absl::Span<const DataType>({DT_FLOAT})},
-            {"Toutputs", absl::Span<const DataType>({})},
+            {"Toutputs", absl::Span<const DataType>({DT_FLOAT})},
             {"ancestors", absl::Span<const string>({})},
             {"key", "host_compute_channel_F1_O1"},
-            {"shape_inference_graph", ""},
+            {"shape_inference_graph", shape_inference_graph},
             {"shapes", absl::Span<const TensorShapeProto>({})},
-            {"_outside_compilation_subgraph", "O1"}}},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}}},
       },
-      {{"f_0_retval_retval", "F:o:0"}});
+      {{"e_0_retval_retval", "outside_compilation_O1_host_compute:outputs:0"},
+       {"f_0_retval_retval", "F:o:0"}});
 
   {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
@@ -1678,14 +1756,17 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) {
                                .WithName("E")
                                .WithAttr("_encapsulate", "F1")
                                .WithAttr("_outside", "O1"));
+    Node* send1 =
+        SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, b2.opts());
     Node* s1 = Sequencer(
-        b2.opts().WithName("F1_sequencer").WithControlInput(recv1), "F1");
+        b2.opts().WithName("F1_sequencer").WithControlInputs({recv1, send1}),
+        "F1");
     NodeBuilder node_builder1("F1", "F1", lib_def.get());
     node_builder1.Input(a).Input(b);
     Node* call1 =
         b2.opts().WithControlInput(s1).FinalizeBuilder(&node_builder1);
 
-    Binary(e, call1, b2.opts().WithName("G"));
+    Binary(call1, ops::NodeOut(call1, 1), b2.opts().WithName("G"));
     TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
   }
 
@@ -1722,8 +1803,27 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) {
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
 
+  {
+    GraphDefBuilder shape1(GraphDefBuilder::kFailImmediately);
+    Node* key_constant = KeyPlaceholder("F1", shape1.opts());
+    Node* recv1 =
+        RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1", {DT_FLOAT},
+                   shape1.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    Node* e = Unary(ops::NodeOut(recv1, 0), shape1.opts()
+                                                .WithName("E")
+                                                .WithAttr("_encapsulate", "F1")
+                                                .WithAttr("_outside", "O1"));
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
+                 shape1.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    TF_EXPECT_OK(
+        AddGraphDefToFunctionLibrary(shape1, "F1_O1", &library_expected));
+  }
+
+  NameAttrList shape_inference_graph;
+  shape_inference_graph.set_name("_outside_compilation_shape_inference_F1_O1");
   *library_expected.add_function() = FunctionDefHelper::Create(
-      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval_retval:float"}, {},
+      "F1", {"a_0_arg:float", "b_0_arg:float"},
+      {"e_0_retval_retval:float", "f_0_retval_retval:float"}, {},
       {
           {{"C"}, "UnaryTest", {"a_0_arg"}},
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
@@ -1736,14 +1836,17 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) {
            "XlaHostCompute",
            {"D:o:0"},
            {{"Tinputs", absl::Span<const DataType>({DT_FLOAT})},
-            {"Toutputs", absl::Span<const DataType>({})},
+            {"Toutputs", absl::Span<const DataType>({DT_FLOAT})},
             {"ancestors", absl::Span<const string>({})},
             {"key", "host_compute_channel_F1_O1"},
-            {"shape_inference_graph", ""},
+            {"shape_inference_graph", shape_inference_graph},
             {"shapes", absl::Span<const TensorShapeProto>({})},
-            {"_outside_compilation_subgraph", "O1"}}},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}}},
       },
-      {{"f_0_retval_retval", "F:o:0"}});
+      {{"e_0_retval_retval", "outside_compilation_O1_host_compute:outputs:0"},
+       {"f_0_retval_retval", "F:o:0"}});
 
   {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
@@ -1760,7 +1863,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) {
                                .WithName("E")
                                .WithAttr("_encapsulate", "F1")
                                .WithAttr("_outside", "O1"));
-    Node* send1 = SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {},
+    Node* send1 = SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
                                b2.opts().WithControlInput(e));
     Node* s1 = Sequencer(
         b2.opts().WithName("F1_sequencer").WithControlInputs({recv1, send1}),
@@ -1770,7 +1873,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) {
     Node* call1 =
         b2.opts().WithControlInput(s1).FinalizeBuilder(&node_builder1);
 
-    Binary(e, call1, b2.opts().WithName("G"));
+    Binary(call1, ops::NodeOut(call1, 1), b2.opts().WithName("G"));
     TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
   }
 
@@ -1813,22 +1916,45 @@ TEST(EncapsulateSubgraphsTest,
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
 
+  {
+    GraphDefBuilder shape1(GraphDefBuilder::kFailImmediately);
+    Node* key_constant = KeyPlaceholder("F1", shape1.opts());
+    Node* recv1 =
+        RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1", {DT_FLOAT},
+                   shape1.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    Node* e = Unary(ops::NodeOut(recv1, 0), shape1.opts()
+                                                .WithName("E")
+                                                .WithAttr("_encapsulate", "F1")
+                                                .WithAttr("_outside", "O1"));
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
+                 shape1.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    TF_EXPECT_OK(
+        AddGraphDefToFunctionLibrary(shape1, "F1_O1", &library_expected));
+  }
+
   {
     GraphDefBuilder shape2(GraphDefBuilder::kFailImmediately);
     Node* key_constant = KeyPlaceholder("F1", shape2.opts());
-    Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2",
-                             {DT_FLOAT}, shape2.opts());
+    Node* recv2 =
+        RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2", {DT_FLOAT},
+                   shape2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
     Node* g = Unary(ops::NodeOut(recv2, 0), shape2.opts()
                                                 .WithName("G")
                                                 .WithAttr("_encapsulate", "F1")
                                                 .WithAttr("_outside", "O2"));
-    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O2", {g}, shape2.opts());
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O2", {g},
+                 shape2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
     TF_EXPECT_OK(
         AddGraphDefToFunctionLibrary(shape2, "F1_O2", &library_expected));
   }
 
+  NameAttrList shape_inference_graph1;
+  shape_inference_graph1.set_name("_outside_compilation_shape_inference_F1_O1");
+  NameAttrList shape_inference_graph2;
+  shape_inference_graph2.set_name("_outside_compilation_shape_inference_F1_O2");
   *library_expected.add_function() = FunctionDefHelper::Create(
-      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"h_0_retval_retval:float"}, {},
+      "F1", {"a_0_arg:float", "b_0_arg:float"},
+      {"e_0_retval_retval:float", "h_0_retval_retval:float"}, {},
       {
           {{"C"}, "UnaryTest", {"a_0_arg"}},
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
@@ -1836,6 +1962,18 @@ TEST(EncapsulateSubgraphsTest,
           {{"H"},
            "UnaryTest",
            {"outside_compilation_O2_host_compute:outputs:0"}},
+          {{"outside_compilation_O1_host_compute"},
+           "XlaHostCompute",
+           {"a_0_arg"},
+           {{"Tinputs", absl::Span<const DataType>({DT_FLOAT})},
+            {"Toutputs", absl::Span<const DataType>({DT_FLOAT})},
+            {"ancestors", absl::Span<const string>({})},
+            {"key", "host_compute_channel_F1_O1"},
+            {"shape_inference_graph", shape_inference_graph1},
+            {"shapes", absl::Span<const TensorShapeProto>({})},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}}},
           {{"outside_compilation_O2_host_compute"},
            "XlaHostCompute",
            {"F:o:0"},
@@ -1843,12 +1981,14 @@ TEST(EncapsulateSubgraphsTest,
             {"Toutputs", absl::Span<const DataType>({DT_FLOAT})},
             {"ancestors", absl::Span<const string>({})},
             {"key", "host_compute_channel_F1_O2"},
-            {"shape_inference_graph",
-             "_outside_compilation_shape_inference_F1_O2"},
+            {"shape_inference_graph", shape_inference_graph2},
             {"shapes", absl::Span<const TensorShapeProto>({})},
-            {"_outside_compilation_subgraph", "O2"}}},
+            {"_outside_compilation_subgraph", "O2"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}}},
       },
-      {{"h_0_retval_retval", "H:o:0"}});
+      {{"e_0_retval_retval", "outside_compilation_O1_host_compute:outputs:0"},
+       {"h_0_retval_retval", "H:o:0"}});
 
   {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
@@ -1856,30 +1996,39 @@ TEST(EncapsulateSubgraphsTest,
     GraphDefBuilder b2(GraphDefBuilder::kFailImmediately, lib_def.get());
     Node* a = Input(b2.opts().WithName("A"));
     Node* b = Input(b2.opts().WithName("B"));
-
-    Node* e = Unary(a, b2.opts()
-                           .WithName("E")
-                           .WithAttr("_encapsulate", "F1")
-                           .WithAttr("_outside", "O1"));
     Node* key_constant =
         KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
-    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2",
-                            {DT_FLOAT}, b2.opts());
-    Node* g = Unary(recv, b2.opts()
-                              .WithName("G")
-                              .WithAttr("_encapsulate", "F1")
-                              .WithAttr("_outside", "O2")
-                              .WithControlInput(e));
-    Node* send =
-        SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O2", {g}, b2.opts());
-    Node* s1 = Sequencer(
-        b2.opts().WithName("F1_sequencer").WithControlInputs({recv, send}),
-        "F1");
+    Node* recv1 =
+        RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1", {DT_FLOAT},
+                   b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+
+    Node* e = Unary(recv1, b2.opts()
+                               .WithName("E")
+                               .WithAttr("_encapsulate", "F1")
+                               .WithAttr("_outside", "O1"));
+    Node* send1 =
+        SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
+                     b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    Node* recv2 =
+        RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2", {DT_FLOAT},
+                   b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    Node* g = Unary(recv2, b2.opts()
+                               .WithName("G")
+                               .WithAttr("_encapsulate", "F1")
+                               .WithAttr("_outside", "O2")
+                               .WithControlInput(e));
+    Node* send2 =
+        SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O2", {g},
+                     b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    Node* s1 = Sequencer(b2.opts()
+                             .WithName("F1_sequencer")
+                             .WithControlInputs({recv1, send1, recv2, send2}),
+                         "F1");
     NodeBuilder node_builder1("F1", "F1", lib_def.get());
     node_builder1.Input(a).Input(b).ControlInput(s1);
     Node* call1 = b2.opts().FinalizeBuilder(&node_builder1);
 
-    Binary(e, call1, b2.opts().WithName("I"));
+    Binary(call1, ops::NodeOut(call1, 1), b2.opts().WithName("I"));
     TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
   }
 
@@ -1925,19 +2074,24 @@ TEST(EncapsulateSubgraphsTest,
   {
     GraphDefBuilder shape1(GraphDefBuilder::kFailImmediately);
     Node* key_constant = KeyPlaceholder("F1", shape1.opts());
-    Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
-                             {DT_FLOAT}, shape1.opts());
+    Node* recv2 =
+        RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1", {DT_FLOAT},
+                   shape1.opts().WithAttr(kXlaHasHostTransferAttrName, true));
     Node* e = Unary(ops::NodeOut(recv2, 0), shape1.opts()
                                                 .WithName("E")
                                                 .WithAttr("_encapsulate", "F1")
                                                 .WithAttr("_outside", "O1"));
-    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape1.opts());
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
+                 shape1.opts().WithAttr(kXlaHasHostTransferAttrName, true));
     TF_EXPECT_OK(
         AddGraphDefToFunctionLibrary(shape1, "F1_O1", &library_expected));
   }
 
+  NameAttrList shape_inference_graph;
+  shape_inference_graph.set_name("_outside_compilation_shape_inference_F1_O1");
   *library_expected.add_function() = FunctionDefHelper::Create(
-      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"h_0_retval_retval:float"}, {},
+      "F1", {"a_0_arg:float", "b_0_arg:float"},
+      {"e_0_retval_retval:float", "h_0_retval_retval:float"}, {},
       {
           {{"C"}, "UnaryTest", {"a_0_arg"}},
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
@@ -1945,6 +2099,18 @@ TEST(EncapsulateSubgraphsTest,
            "UnaryTest",
            {"outside_compilation_O1_host_compute:outputs:0"}},
           {{"H"}, "UnaryTest", {"F:o:0"}},
+          {{"outside_compilation_O2_host_compute"},
+           "XlaHostCompute",
+           {"a_0_arg"},
+           {{"Tinputs", absl::Span<const DataType>({DT_FLOAT})},
+            {"Toutputs", absl::Span<const DataType>({})},
+            {"ancestors", absl::Span<const string>({})},
+            {"key", "host_compute_channel_F1_O2"},
+            {"shape_inference_graph", NameAttrList()},
+            {"shapes", absl::Span<const TensorShapeProto>({})},
+            {"_outside_compilation_subgraph", "O2"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}}},
           {{"outside_compilation_O1_host_compute"},
            "XlaHostCompute",
            {"D:o:0"},
@@ -1952,12 +2118,14 @@ TEST(EncapsulateSubgraphsTest,
             {"Toutputs", absl::Span<const DataType>({DT_FLOAT})},
             {"ancestors", absl::Span<const string>({})},
             {"key", "host_compute_channel_F1_O1"},
-            {"shape_inference_graph",
-             "_outside_compilation_shape_inference_F1_O1"},
+            {"shape_inference_graph", shape_inference_graph},
             {"shapes", absl::Span<const TensorShapeProto>({})},
-            {"_outside_compilation_subgraph", "O1"}}},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}}},
       },
-      {{"h_0_retval_retval", "H:o:0"}});
+      {{"e_0_retval_retval", "outside_compilation_O1_host_compute:outputs:0"},
+       {"h_0_retval_retval", "H:o:0"}});
 
   {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
@@ -1968,27 +2136,33 @@ TEST(EncapsulateSubgraphsTest,
 
     Node* key_constant =
         KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
-    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
-                            {DT_FLOAT}, b2.opts());
-    Node* e = Unary(recv, b2.opts()
-                              .WithName("E")
-                              .WithAttr("_encapsulate", "F1")
-                              .WithAttr("_outside", "O1"));
+    Node* recv1 =
+        RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1", {DT_FLOAT},
+                   b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    Node* e = Unary(recv1, b2.opts()
+                               .WithName("E")
+                               .WithAttr("_encapsulate", "F1")
+                               .WithAttr("_outside", "O1"));
     Node* send =
-        SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, b2.opts());
-    /*Node* g =*/Unary(a, b2.opts()
-                              .WithName("G")
-                              .WithAttr("_encapsulate", "F1")
-                              .WithAttr("_outside", "O2")
-                              .WithControlInput(e));
-    Node* s1 = Sequencer(
-        b2.opts().WithName("F1_sequencer").WithControlInputs({recv, send}),
-        "F1");
+        SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
+                     b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    Node* recv2 =
+        RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2", {DT_FLOAT},
+                   b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    /*Node* g =*/Unary(recv2, b2.opts()
+                                  .WithName("G")
+                                  .WithAttr("_encapsulate", "F1")
+                                  .WithAttr("_outside", "O2")
+                                  .WithControlInput(e));
+    Node* s1 = Sequencer(b2.opts()
+                             .WithName("F1_sequencer")
+                             .WithControlInputs({recv1, recv2, send}),
+                         "F1");
     NodeBuilder node_builder1("F1", "F1", lib_def.get());
     node_builder1.Input(a).Input(b).ControlInput(s1);
     Node* call1 = b2.opts().FinalizeBuilder(&node_builder1);
 
-    Binary(e, call1, b2.opts().WithName("I"));
+    Binary(call1, ops::NodeOut(call1, 1), b2.opts().WithName("I"));
     TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
   }
 
@@ -2039,19 +2213,24 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) {
   {
     GraphDefBuilder shape1(GraphDefBuilder::kFailImmediately);
     Node* key_constant = KeyPlaceholder("F1", shape1.opts());
-    Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
-                             {DT_FLOAT}, shape1.opts());
+    Node* recv2 =
+        RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1", {DT_FLOAT},
+                   shape1.opts().WithAttr(kXlaHasHostTransferAttrName, true));
     Node* e = Unary(ops::NodeOut(recv2, 0), shape1.opts()
                                                 .WithName("E")
                                                 .WithAttr("_encapsulate", "F1")
                                                 .WithAttr("_outside", "O1"));
-    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape1.opts());
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
+                 shape1.opts().WithAttr(kXlaHasHostTransferAttrName, true));
     TF_EXPECT_OK(
         AddGraphDefToFunctionLibrary(shape1, "F1_O1", &library_expected));
   }
 
+  NameAttrList shape_inference_graph;
+  shape_inference_graph.set_name("_outside_compilation_shape_inference_F1_O1");
   *library_expected.add_function() = FunctionDefHelper::Create(
-      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"h_0_retval_retval:float"}, {},
+      "F1", {"a_0_arg:float", "b_0_arg:float"},
+      {"e_0_retval_retval:float", "h_0_retval_retval:float"}, {},
       {{{"C"}, "UnaryTest", {"a_0_arg"}},
        {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
        {{"F"}, "UnaryTest", {"outside_compilation_O1_host_compute:outputs:0"}},
@@ -2063,10 +2242,11 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) {
          {"Toutputs", absl::Span<const DataType>({DT_FLOAT})},
          {"ancestors", absl::Span<const string>({})},
          {"key", "host_compute_channel_F1_O1"},
-         {"shape_inference_graph",
-          "_outside_compilation_shape_inference_F1_O1"},
+         {"shape_inference_graph", shape_inference_graph},
          {"shapes", absl::Span<const TensorShapeProto>({})},
-         {"_outside_compilation_subgraph", "O1"}}},
+         {"_outside_compilation_subgraph", "O1"},
+         {"_xla_token_input_nodes",
+          absl::Span<const string>({"_xla_token_arg_node"})}}},
        {{"outside_compilation_O2_host_compute"},
         "XlaHostCompute",
         {"D:o:0"},
@@ -2074,9 +2254,11 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) {
          {"Toutputs", absl::Span<const DataType>({})},
          {"ancestors", absl::Span<const string>({})},
          {"key", "host_compute_channel_F1_O2"},
-         {"shape_inference_graph", ""},
+         {"shape_inference_graph", NameAttrList()},
          {"shapes", absl::Span<const TensorShapeProto>({})},
-         {"_outside_compilation_subgraph", "O2"}},
+         {"_outside_compilation_subgraph", "O2"},
+         {"_xla_token_input_nodes",
+          absl::Span<const string>({"_xla_token_arg_node"})}},
         {}},
        {{"outside_compilation_O3_host_compute"},
         "XlaHostCompute",
@@ -2085,11 +2267,14 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) {
          {"Toutputs", absl::Span<const DataType>({})},
          {"ancestors", absl::Span<const string>({})},
          {"key", "host_compute_channel_F1_O3"},
-         {"shape_inference_graph", ""},
+         {"shape_inference_graph", NameAttrList()},
          {"shapes", absl::Span<const TensorShapeProto>({})},
-         {"_outside_compilation_subgraph", "O3"}},
+         {"_outside_compilation_subgraph", "O3"},
+         {"_xla_token_input_nodes",
+          absl::Span<const string>({"_xla_token_arg_node"})}},
         {}}},
-      {{"h_0_retval_retval", "H:o:0"}});
+      {{"e_0_retval_retval", "outside_compilation_O1_host_compute:outputs:0"},
+       {"h_0_retval_retval", "H:o:0"}});
 
   {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
@@ -2100,23 +2285,27 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) {
 
     Node* key_constant =
         KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
-    Node* recv1 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
-                             {DT_FLOAT}, b2.opts());
+    Node* recv1 =
+        RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1", {DT_FLOAT},
+                   b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
     Node* e = Unary(recv1, b2.opts()
                                .WithName("E")
                                .WithAttr("_encapsulate", "F1")
                                .WithAttr("_outside", "O1"));
     Node* send =
-        SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, b2.opts());
-    Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2",
-                             {DT_FLOAT}, b2.opts());
+        SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
+                     b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    Node* recv2 =
+        RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2", {DT_FLOAT},
+                   b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
     Node* g = Unary(recv2, b2.opts()
                                .WithName("G")
                                .WithAttr("_encapsulate", "F1")
                                .WithAttr("_outside", "O2")
                                .WithControlInput(e));
-    Node* recv3 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O3",
-                             {DT_FLOAT}, b2.opts());
+    Node* recv3 =
+        RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O3", {DT_FLOAT},
+                   b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
     /*Node* i =*/Binary(recv3, e,
                         b2.opts()
                             .WithName("I")
@@ -2131,7 +2320,7 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) {
     node_builder1.Input(a).Input(b).ControlInput(s1);
     Node* call1 = b2.opts().FinalizeBuilder(&node_builder1);
 
-    Binary(e, call1, b2.opts().WithName("J"));
+    Binary(call1, ops::NodeOut(call1, 1), b2.opts().WithName("J"));
     TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
   }
 
@@ -2167,14 +2356,46 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputsOrOutputs) {
   FunctionDefLibrary library_expected;
   GraphDef graphdef_expected;
 
+  {
+    GraphDefBuilder shape1(GraphDefBuilder::kFailImmediately);
+    Node* key_constant = KeyPlaceholder("F1", shape1.opts());
+    Node* recv2 =
+        RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1", {DT_FLOAT},
+                   shape1.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    Node* e = Unary(ops::NodeOut(recv2, 0), shape1.opts()
+                                                .WithName("E")
+                                                .WithAttr("_encapsulate", "F1")
+                                                .WithAttr("_outside", "O1"));
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
+                 shape1.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    TF_EXPECT_OK(
+        AddGraphDefToFunctionLibrary(shape1, "F1_O1", &library_expected));
+  }
+
+  NameAttrList shape_inference_graph;
+  shape_inference_graph.set_name("_outside_compilation_shape_inference_F1_O1");
   *library_expected.add_function() = FunctionDefHelper::Create(
-      "F1", {"a_0_arg:float", "b_0_arg:float"}, {"f_0_retval_retval:float"}, {},
+      "F1", {"a_0_arg:float", "b_0_arg:float"},
+      {"e_0_retval_retval:float", "f_0_retval_retval:float"}, {},
       {
           {{"C"}, "UnaryTest", {"a_0_arg"}},
           {{"D"}, "BinaryTest", {"b_0_arg", "C:o:0"}},
           {{"F"}, "UnaryTest", {"D:o:0"}},
+          {{"outside_compilation_O1_host_compute"},
+           "XlaHostCompute",
+           {"a_0_arg"},
+           {{"Tinputs", absl::Span<const DataType>({DT_FLOAT})},
+            {"Toutputs", absl::Span<const DataType>({DT_FLOAT})},
+            {"ancestors", absl::Span<const string>({})},
+            {"key", "host_compute_channel_F1_O1"},
+            {"shape_inference_graph", shape_inference_graph},
+            {"shapes", absl::Span<const TensorShapeProto>({})},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}}},
       },
-      {{"f_0_retval_retval", "F:o:0"}});
+      {{"e_0_retval_retval", "outside_compilation_O1_host_compute:outputs:0"},
+       {"f_0_retval_retval", "F:o:0"}});
 
   {
     std::unique_ptr<FunctionLibraryDefinition> lib_def(
@@ -2183,15 +2404,26 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputsOrOutputs) {
     Node* a = Input(b2.opts().WithName("A"));
     Node* b = Input(b2.opts().WithName("B"));
 
-    Node* e = Unary(a, b2.opts()
-                           .WithName("E")
-                           .WithAttr("_encapsulate", "F1")
-                           .WithAttr("_outside", "O1"));
+    Node* key_constant =
+        KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
+    Node* recv =
+        RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1", {DT_FLOAT},
+                   b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    Node* e = Unary(recv, b2.opts()
+                              .WithName("E")
+                              .WithAttr("_encapsulate", "F1")
+                              .WithAttr("_outside", "O1"));
+    Node* send =
+        SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
+                     b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    Node* s = Sequencer(
+        b2.opts().WithName("F1_sequencer").WithControlInputs({recv, send}),
+        "F1");
     NodeBuilder node_builder1("F1", "F1", lib_def.get());
-    node_builder1.Input(a).Input(b);
+    node_builder1.Input(a).Input(b).ControlInput(s);
     Node* call1 = b2.opts().FinalizeBuilder(&node_builder1);
 
-    Binary(e, call1, b2.opts().WithName("G"));
+    Binary(call1, ops::NodeOut(call1, 1), b2.opts().WithName("G"));
     TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
   }
 
@@ -2236,20 +2468,22 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) {
   {
     GraphDefBuilder shape(GraphDefBuilder::kFailImmediately);
     Node* key_constant = KeyPlaceholder("F1", shape.opts());
-    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
-                            {DT_FLOAT}, shape.opts());
-    Node* a = InputShaped(shape.opts().WithName("A"));
-    Node* c = Unary(a, shape.opts().WithName("C"));
-    Node* e = BinaryUnknownShape(c, recv,
+    Node* recv = RecvAtHost(
+        ops::NodeOut(key_constant, 0), "F1", "O1", {DT_FLOAT, DT_FLOAT},
+        shape.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    Node* e = BinaryUnknownShape(recv, ops::NodeOut(recv, 1),
                                  shape.opts()
                                      .WithName("E")
                                      .WithAttr("_encapsulate", "F1")
                                      .WithAttr("_outside", "O1"));
-    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape.opts());
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
+                 shape.opts().WithAttr(kXlaHasHostTransferAttrName, true));
     TF_EXPECT_OK(
         AddGraphDefToFunctionLibrary(shape, "F1_O1", &library_expected));
   }
 
+  NameAttrList shape_inference_graph;
+  shape_inference_graph.set_name("_outside_compilation_shape_inference_F1_O1");
   *library_expected.add_function() = test::function::XTimesTwo();
   *library_expected.add_function() = FunctionDefHelper::Create(
       "F1", {"b_0_arg:float", "c_0_arg:float"}, {"f_0_retval_retval:float"}, {},
@@ -2262,15 +2496,16 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) {
            {"outside_compilation_O1_host_compute"}},
           {{"outside_compilation_O1_host_compute"},
            "XlaHostCompute",
-           {"c:o:0"},
-           {{"Tinputs", absl::Span<const DataType>({DT_FLOAT})},
+           {"c_0_arg", "c:o:0"},
+           {{"Tinputs", absl::Span<const DataType>({DT_FLOAT, DT_FLOAT})},
             {"Toutputs", absl::Span<const DataType>({DT_FLOAT})},
             {"ancestors", absl::Span<const string>({})},
             {"key", "host_compute_channel_F1_O1"},
-            {"shape_inference_graph",
-             "_outside_compilation_shape_inference_F1_O1"},
+            {"shape_inference_graph", shape_inference_graph},
             {"shapes", absl::Span<const DataType>({})},
-            {"_outside_compilation_subgraph", "O1"}},
+            {"_outside_compilation_subgraph", "O1"},
+            {"_xla_token_input_nodes",
+             absl::Span<const string>({"_xla_token_arg_node"})}},
            {"c"}},
       },
       {{"f_0_retval_retval", "F:o:0"}});
@@ -2285,16 +2520,18 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) {
 
     Node* key_constant =
         KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
-    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
-                            {DT_FLOAT}, b2.opts());
-    Node* e = BinaryUnknownShape(c, ops::NodeOut(recv, 0),
+    Node* recv = RecvAtHost(
+        ops::NodeOut(key_constant, 0), "F1", "O1", {DT_FLOAT, DT_FLOAT},
+        b2.opts().WithAttr(kXlaHasHostTransferAttrName, true));
+    Node* e = BinaryUnknownShape(recv, ops::NodeOut(recv, 1),
                                  b2.opts()
                                      .WithName("E")
-                                     .WithControlInputs({recv, b})
+                                     .WithControlInputs({recv})
                                      .WithAttr("_encapsulate", "F1")
                                      .WithAttr("_outside", "O1"));
     Node* send = SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
-                              b2.opts().WithControlInput(e));
+                              b2.opts().WithControlInput(e).WithAttr(
+                                  kXlaHasHostTransferAttrName, true));
 
     Node* s = Sequencer(
         b2.opts().WithName("F1_sequencer").WithControlInputs({recv, send}),
@@ -2303,9 +2540,9 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) {
     NodeBuilder node_builder("F1", "F1", lib_def.get());
     node_builder.Input(b).Input(c);
     Node* call =
-        b2.opts().WithControlInputs({s, c}).FinalizeBuilder(&node_builder);
+        b2.opts().WithControlInputs({s, b, c}).FinalizeBuilder(&node_builder);
 
-    Binary(a, call, b2.opts().WithName("G").WithControlInputs({e}));
+    Binary(a, call, b2.opts().WithName("G").WithControlInputs({call}));
     TF_EXPECT_OK(b2.ToGraphDef(&graphdef_expected));
   }
 
diff --git a/tensorflow/compiler/jit/encapsulate_util.cc b/tensorflow/compiler/jit/encapsulate_util.cc
index 1f4b9c90a4ff0b1166cdb7b5942771b350740ef3..2264806d6bdabd9f26d9f83b681524399f996317 100644
--- a/tensorflow/compiler/jit/encapsulate_util.cc
+++ b/tensorflow/compiler/jit/encapsulate_util.cc
@@ -62,517 +62,6 @@ void ReplaceAttr(Node* n, const string& attr_name, const T& value) {
   n->AddAttr(attr_name, value);
 }
 
-// Step 1a ~ 1d for PreprocessForEncapsulation(). See comments of
-// PreprocessForEncapsulation() for details.
-Status ProcessControlEdges(Graph* g, const string& xla_computation_attr_name,
-                           const string& outside_compilation_attr_name) {
-  // Gather edges to remove. We should not remove the edge while iterating.
-  std::vector<const Edge*> edges_to_remove;
-  for (const Edge* e : g->edges()) {
-    if (!e->IsControlEdge()) {
-      continue;
-    }
-
-    auto src_xla_computation =
-        GetStringAttr(*e->src(), xla_computation_attr_name);
-    auto dst_xla_computation =
-        GetStringAttr(*e->dst(), xla_computation_attr_name);
-    auto src_outside_compilation =
-        GetStringAttr(*e->src(), outside_compilation_attr_name);
-    auto dst_outside_compilation =
-        GetStringAttr(*e->dst(), outside_compilation_attr_name);
-
-    if (!src_xla_computation && !dst_xla_computation) {
-      continue;
-    } else if (src_xla_computation && !dst_xla_computation) {
-      if (src_outside_compilation) {
-        // Case 1c: outside compilation to host computation control edge.
-        edges_to_remove.push_back(e);
-
-        TF_RETURN_IF_ERROR(AppendToListAttr<string>(
-            e->dst(), kXlaControlDependenciesAttrName, e->src()->name()));
-      }
-    } else if (!src_xla_computation && dst_xla_computation) {
-      if (dst_outside_compilation) {
-        // Case 1c: host computation control to outside compilation edge.
-        edges_to_remove.push_back(e);
-
-        TF_RETURN_IF_ERROR(AppendToListAttr<string>(
-            e->dst(), kXlaControlDependenciesAttrName, e->src()->name()));
-      }
-    } else {  // src_xla_computation && dst_xla_computation
-      if (*src_xla_computation != *dst_xla_computation) {
-        if (src_outside_compilation && dst_outside_compilation) {
-          // Case 1b: outside compilation to outside compilation control edge.
-          edges_to_remove.push_back(e);
-
-          TF_RETURN_IF_ERROR(AppendToListAttr<string>(
-              e->dst(), kXlaControlDependenciesAttrName, e->src()->name()));
-        } else if (src_outside_compilation && !dst_outside_compilation) {
-          // Case 1a: outside compilation to another XLA computaition control
-          // edge.
-          TF_RETURN_IF_ERROR(AppendToListAttr<string>(
-              e->src(), kXlaConnectedToOtherXlaComputationAttrName,
-              *dst_xla_computation));
-        } else if (!src_outside_compilation && dst_outside_compilation) {
-          // Case 1a: another XLA computaition to outside compilation control
-          // edge.
-          TF_RETURN_IF_ERROR(AppendToListAttr<string>(
-              e->dst(), kXlaConnectedFromOtherXlaComputationAttrName,
-              *src_xla_computation));
-        }
-      }
-    }
-  }
-
-  for (auto e : edges_to_remove) {
-    g->RemoveEdge(e);
-  }
-  return Status::OK();
-}
-
-// Step 2 for PreprocessForEncapsulation(). See comments of
-// PreprocessForEncapsulation() for details.
-Status ProcessXlaToXlaDataEdges(Graph* g,
-                                const string& xla_computation_attr_name,
-                                const string& outside_compilation_attr_name) {
-  // Gather edges between XLA computations. Notice that we do not store `Edge*`
-  // directly because we remove some nodes while adding Identity nodes, and
-  // those Edge pointers might be invalidated.
-  struct EdgeInfo {
-    int dst_input, dst_node_id;
-  };
-  std::vector<EdgeInfo> edges;
-  for (const Edge* e : g->edges()) {
-    if (e->IsControlEdge()) {
-      continue;
-    }
-
-    auto src_xla_computation =
-        GetStringAttr(*e->src(), xla_computation_attr_name);
-    auto dst_xla_computation =
-        GetStringAttr(*e->dst(), xla_computation_attr_name);
-    auto src_outside_compilation =
-        GetStringAttr(*e->src(), outside_compilation_attr_name);
-    auto dst_outside_compilation =
-        GetStringAttr(*e->dst(), outside_compilation_attr_name);
-    if (!src_xla_computation || !dst_xla_computation) {
-      continue;
-    }
-
-    if (*src_xla_computation != *dst_xla_computation) {
-      if (src_outside_compilation || dst_outside_compilation) {
-        edges.push_back(EdgeInfo{e->dst_input(), e->dst()->id()});
-        VLOG(4) << "XLA -> XLA edge: " << e->DebugString();
-      }
-    }
-  }
-
-  // For each XLA -> XLA edge, add an Identity node between src and dst.
-  for (int i = 0; i < edges.size(); i++) {
-    Node* dst = g->FindNodeId(edges[i].dst_node_id);
-    const Edge* e;
-    TF_RETURN_IF_ERROR(dst->input_edge(edges[i].dst_input, &e));
-    Node* src = e->src();
-    int src_output = e->src_output(), dst_input = e->dst_input();
-    g->RemoveEdge(e);
-
-    // Create Identity node, and connect it between `src` and `dst`.
-    string identity_node_name =
-        absl::StrCat("bridge_", src->name(), "_", dst->name());
-    DataType dtype = src->output_type(src_output);
-    TF_ASSIGN_OR_RETURN(Node * identity_node,
-                        BuildIdentityNode(g, identity_node_name, dtype, src,
-                                          /*requested_device=*/absl::nullopt));
-    identity_node->AddAttr(kBridgeSourceNodeAttrName, src->name());
-    g->AddEdge(src, src_output, identity_node, 0);
-    g->AddEdge(identity_node, 0, dst, dst_input);
-
-    // Replace `e->dst()` because its input node changed.
-    NodeDef new_def = dst->def();
-    *new_def.mutable_input(dst_input) = identity_node->name();
-    TF_ASSIGN_OR_RETURN(Node * dst_replace_node, ReplaceNode(g, dst, new_def));
-
-    // Other edge in `edges` might have `e->dst()` as src or dst
-    // node. Before removing `e->dst()`, replace those edges with corresponding
-    // edges for `dst_replace_node`.
-    for (int j = i + 1; j < edges.size(); j++) {
-      if (edges[j].dst_node_id == edges[i].dst_node_id) {
-        edges[j].dst_node_id = dst_replace_node->id();
-      }
-    }
-  }
-  return Status::OK();
-}
-
-// Step 3 for PreprocessForEncapsulation(). See comments of
-// PreprocessForEncapsulation() for details.
-Status ProcessDataEdgeBetweenOutsideCompilationAndHostComputation(
-    Graph* g, const string& xla_computation_attr_name,
-    const string& outside_compilation_attr_name) {
-  // Gather edges between outside compilation and host computation. Notice that
-  // we do not store `Edge*` directly because we remove some nodes while adding
-  // Identity nodes, and those Edge pointers might be invalidated.
-  struct EdgeInfo {
-    int dst_input, dst_node_id;
-    bool is_host_to_outside_compilation;
-  };
-  std::vector<EdgeInfo> edges;
-  for (const Edge* e : g->edges()) {
-    if (e->IsControlEdge()) {
-      continue;
-    }
-
-    if (e->src()->attrs().Find(xla_computation_attr_name) == nullptr &&
-        e->dst()->attrs().Find(xla_computation_attr_name) != nullptr &&
-        e->dst()->attrs().Find(outside_compilation_attr_name) != nullptr) {
-      edges.push_back(EdgeInfo{e->dst_input(), e->dst()->id(),
-                               /*is_host_to_outside_compilation=*/true});
-      VLOG(4) << "Host -> oc edge: " << e->DebugString();
-    } else if (e->dst()->attrs().Find(xla_computation_attr_name) == nullptr &&
-               e->src()->attrs().Find(xla_computation_attr_name) != nullptr &&
-               e->src()->attrs().Find(outside_compilation_attr_name) !=
-                   nullptr) {
-      edges.push_back(EdgeInfo{e->dst_input(), e->dst()->id(),
-                               /*is_host_to_outside_compilation=*/false});
-      VLOG(4) << "Oc -> host edge: " << e->DebugString();
-    }
-  }
-
-  // Remove the edge from host to outside compilation. Add a placeholder as
-  // outside compilation node input.
-  std::map<std::pair<string, int>, Node*> placeholders;
-  for (int i = 0; i < edges.size(); i++) {
-    Node* dst = g->FindNodeId(edges[i].dst_node_id);
-    const Edge* e;
-    TF_RETURN_IF_ERROR(dst->input_edge(edges[i].dst_input, &e));
-    Node* src = e->src();
-    int src_output = e->src_output(), dst_input = e->dst_input();
-    g->RemoveEdge(e);
-
-    // Find or create placeholder node.
-    string new_name =
-        edges[i].is_host_to_outside_compilation
-            ? absl::StrCat(src->name(), "_host_to_oc_placeholder_", src_output)
-            : absl::StrCat(src->name(), "_oc_to_host_placeholder_", src_output);
-    auto placeholder_index = std::make_pair(src->name(), src_output);
-    auto iter = placeholders.find(placeholder_index);
-    Node* placeholder_node;
-    if (iter == placeholders.end()) {
-      NodeDefBuilder placeholder_builder(new_name, "Placeholder");
-      placeholder_builder.Attr("dtype", src->output_type(src_output));
-      if (edges[i].is_host_to_outside_compilation) {
-        placeholder_builder.Attr(kHostToOutsideCompilationOriginalNodeAttrName,
-                                 src->name());
-        placeholder_builder.Attr(kHostToOutsideCompilationSrcOutputAttrName,
-                                 src_output);
-        // If this placeholder node is in outside compilation, we need to set
-        // `xla_computation_attr_name` and `outside_compilation_attr_name`.
-        string xla_computation_attr, outside_compilation_attr;
-        TF_RETURN_IF_ERROR(GetNodeAttr(dst->attrs(), xla_computation_attr_name,
-                                       &xla_computation_attr));
-        TF_RETURN_IF_ERROR(GetNodeAttr(dst->attrs(),
-                                       outside_compilation_attr_name,
-                                       &outside_compilation_attr));
-        placeholder_builder.Attr(xla_computation_attr_name,
-                                 xla_computation_attr);
-        placeholder_builder.Attr(outside_compilation_attr_name,
-                                 outside_compilation_attr);
-      } else {
-        placeholder_builder.Attr(kOutsideCompilationToHostOriginalNodeAttrName,
-                                 src->name());
-        placeholder_builder.Attr(kOutsideCompilationToHostSrcOutputAttrName,
-                                 src_output);
-      }
-      NodeDef placeholder_def;
-      TF_RETURN_IF_ERROR(placeholder_builder.Finalize(&placeholder_def));
-      Status s;
-      placeholder_node = g->AddNode(placeholder_def, &s);
-      TF_RETURN_IF_ERROR(s);
-      placeholders[placeholder_index] = placeholder_node;
-    } else {
-      placeholder_node = iter->second;
-    }
-    g->AddEdge(placeholder_node, 0, dst, dst_input);
-
-    // Replace `e->dst()` because its input node changed.
-    NodeDef new_def = dst->def();
-    *new_def.mutable_input(dst_input) = placeholder_node->name();
-    TF_ASSIGN_OR_RETURN(Node * dst_replace_node, ReplaceNode(g, dst, new_def));
-
-    // Other edge in `edges` might have `e->dst()` as src or dst
-    // node. Before removing `e->dst()`, replace those edges with corresponding
-    // edges for `dst_replace_node`.
-    for (int j = i + 1; j < edges.size(); j++) {
-      if (edges[j].dst_node_id == edges[i].dst_node_id) {
-        edges[j].dst_node_id = dst_replace_node->id();
-      }
-    }
-  }
-  return Status::OK();
-}
-
-// Step 1 for `PostprocessForEncapsulation`. See comments of
-// `PostprocessForEncapsulation` for details.
-Status RemovePlaceholderBetweenOutsideCompilationAndHostComputation(Graph* g) {
-  // Gather all outside compilation to host computation nodes.
-  struct PlaceHolderNodeInfo {
-    Node* n;
-    bool is_host_to_oc;
-  };
-  std::vector<PlaceHolderNodeInfo> placeholder_nodes;
-  for (Node* n : g->nodes()) {
-    if (n->type_string() == "Placeholder") {
-      if (HasNodeAttr(n->def(),
-                      kOutsideCompilationToHostOriginalNodeAttrName)) {
-        placeholder_nodes.push_back({n, false});
-      } else if (HasNodeAttr(n->def(),
-                             kHostToOutsideCompilationOriginalNodeAttrName)) {
-        placeholder_nodes.push_back({n, true});
-      }
-    }
-  }
-
-  // Remove the placeholder nodes, and reconnect original edge.
-  auto node_name_index = g->BuildNodeNameIndex();
-  for (auto placeholder_iter : placeholder_nodes) {
-    Node* n = placeholder_iter.n;
-
-    string node_name;
-    int node_src_output;
-    if (placeholder_iter.is_host_to_oc) {
-      TF_RETURN_IF_ERROR(
-          GetNodeAttr(n->attrs(), kHostToOutsideCompilationOriginalNodeAttrName,
-                      &node_name));
-      TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(),
-                                     kHostToOutsideCompilationSrcOutputAttrName,
-                                     &node_src_output));
-    } else {
-      TF_RETURN_IF_ERROR(
-          GetNodeAttr(n->attrs(), kOutsideCompilationToHostOriginalNodeAttrName,
-                      &node_name));
-      TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(),
-                                     kOutsideCompilationToHostSrcOutputAttrName,
-                                     &node_src_output));
-    }
-    auto iter = node_name_index.find(node_name);
-    if (iter == node_name_index.end()) {
-      return errors::Internal(
-          "Cannot find original node for oc -> host placeholder node ",
-          node_name);
-    }
-
-    // Change all usage node to use the original node instead.
-    Node* original_node = iter->second;
-    std::vector<const Edge*> control_edges;
-    std::vector<OutEdgeInfo> data_edges;
-    for (auto e : n->out_edges()) {
-      if (e->IsControlEdge()) {
-        control_edges.push_back(e);
-      } else {
-        data_edges.push_back({e->dst(), e->src_output(), e->dst_input()});
-      }
-    }
-    for (const Edge* e : control_edges) {
-      g->AddControlEdge(original_node, e->dst());
-      g->RemoveEdge(e);
-    }
-    for (int i = 0; i < data_edges.size(); i++) {
-      Node* dst = data_edges[i].dst;
-      NodeDef new_def = dst->def();
-      int dst_input = data_edges[i].dst_input;
-      *new_def.mutable_input(dst_input) =
-          absl::StrCat(original_node->name(), ":", node_src_output);
-      TF_ASSIGN_OR_RETURN(Node * replace_node, ReplaceNode(g, dst, new_def));
-
-      const Edge* edge_to_replace = nullptr;
-      TF_RETURN_IF_ERROR(replace_node->input_edge(dst_input, &edge_to_replace));
-      g->RemoveEdge(edge_to_replace);
-      g->AddEdge(original_node, node_src_output, replace_node, dst_input);
-
-      // Other edges might have `dst` as dst node. Update those edges with
-      // `replace_node`.
-      for (int j = i + 1; j < data_edges.size(); j++) {
-        if (data_edges[j].dst == dst) {
-          data_edges[j].dst = replace_node;
-        }
-      }
-
-      // Other placeholder node might have `dst` as original node. Update
-      // `node_name_index` with `replace_node`.
-      node_name_index[replace_node->name()] = replace_node;
-    }
-
-    // Remove placeholder node.
-    g->RemoveNode(n);
-  }
-  return Status::OK();
-}
-
-// Step 2 for `PostprocessForEncapsulation`. See comments of
-// `PostprocessForEncapsulation` for details.
-Status RemoveIdentityBetweenDifferentXlaComputation(Graph* g) {
-  // Gather Identity nodes to remove.
-  std::vector<Node*> bridge_nodes;
-  for (Node* n : g->nodes()) {
-    if (n->type_string() == "Identity" &&
-        HasNodeAttr(n->def(), kBridgeSourceNodeAttrName)) {
-      bridge_nodes.push_back(n);
-    }
-  }
-
-  // Remove the identity nodes, and reconnect the original edge.
-  for (int i = 0; i < bridge_nodes.size(); i++) {
-    Node* n = bridge_nodes[i];
-    const Edge* src_edge = nullptr;
-    TF_RETURN_IF_ERROR(n->input_edge(0, &src_edge));
-
-    // Change all usage node to use the original node instead.
-    std::vector<const Edge*> control_edges;
-    std::vector<OutEdgeInfo> data_edges;
-    for (auto e : n->out_edges()) {
-      if (e->IsControlEdge()) {
-        control_edges.push_back(e);
-      } else {
-        data_edges.push_back({e->dst(), e->src_output(), e->dst_input()});
-      }
-    }
-    for (const Edge* e : control_edges) {
-      g->AddControlEdge(src_edge->src(), e->dst());
-      g->RemoveEdge(e);
-    }
-    for (int j = 0; j < data_edges.size(); j++) {
-      Node* dst = data_edges[j].dst;
-      NodeDef new_def = dst->def();
-      int dst_input = data_edges[j].dst_input;
-      *new_def.mutable_input(dst_input) =
-          absl::StrCat(src_edge->src()->name(), ":", src_edge->src_output());
-      TF_ASSIGN_OR_RETURN(Node * replace_node, ReplaceNode(g, dst, new_def));
-
-      const Edge* edge_to_replace = nullptr;
-      TF_RETURN_IF_ERROR(replace_node->input_edge(dst_input, &edge_to_replace));
-      g->RemoveEdge(edge_to_replace);
-      g->AddEdge(src_edge->src(), src_edge->src_output(), replace_node,
-                 dst_input);
-
-      // Other edges might have `dst` as dst node. Update those edges with
-      // `replace_node`.
-      for (int k = j + 1; k < data_edges.size(); k++) {
-        if (data_edges[k].dst == dst) {
-          data_edges[k].dst = replace_node;
-        }
-      }
-
-      // The node we replaced might be in `bridge_nodes`. If so, update
-      // `bridge_nodes` to use the replaced node.
-      for (int k = i + 1; k < bridge_nodes.size(); k++) {
-        if (bridge_nodes[k] == dst) {
-          bridge_nodes[k] = replace_node;
-        }
-      }
-    }
-
-    // Remove Identity node.
-    g->RemoveNode(n);
-  }
-  return Status::OK();
-}
-
-// Step 3 for `PostprocessForEncapsulation`. See comments of
-// `PostprocessForEncapsulation` for details.
-// We do not need to worry about removed nodes in step 1 and 2;
-// `PreprocessForEncapsulation` will not record control dependencies for those
-// remvoed nodes in the first place.
-Status AddControlDependencies(
-    Graph* g, const std::unordered_map<string, string>& cluster_node_names) {
-  auto node_name_index = g->BuildNodeNameIndex();
-
-  // Reconnect outside compilation to outside compilation control edge.
-  for (Node* n : g->nodes()) {
-    std::vector<string> control_deps;
-    Status s =
-        GetNodeAttr(n->attrs(), kXlaControlDependenciesAttrName, &control_deps);
-    if (!s.ok()) {
-      if (s.code() != error::NOT_FOUND) {
-        return s;
-      } else {
-        continue;
-      }
-    } else {
-      n->ClearAttr(kXlaControlDependenciesAttrName);
-      for (const string& control_input : control_deps) {
-        auto iter = node_name_index.find(control_input);
-        if (iter == node_name_index.end()) {
-          return errors::Internal("Cannot find original node for ",
-                                  control_input);
-        }
-        g->AddControlEdge(iter->second, n);
-      }
-    }
-  }
-
-  // Reconnect outside compilation to XLA computation control edge.
-  for (Node* n : g->nodes()) {
-    std::vector<string> control_deps;
-    Status s = GetNodeAttr(
-        n->attrs(), kXlaConnectedToOtherXlaComputationAttrName, &control_deps);
-    if (!s.ok()) {
-      if (s.code() != error::NOT_FOUND) {
-        return s;
-      } else {
-        continue;
-      }
-    } else {
-      n->ClearAttr(kXlaConnectedToOtherXlaComputationAttrName);
-      for (const string& control_input : control_deps) {
-        auto iter = cluster_node_names.find(control_input);
-        if (iter == cluster_node_names.end()) {
-          return errors::Internal("Cannot find cluster node for ",
-                                  control_input);
-        }
-        auto iter2 = node_name_index.find(iter->second);
-        if (iter2 == node_name_index.end()) {
-          return errors::Internal("Cannot find cluster node for ",
-                                  iter->second);
-        }
-        g->AddControlEdge(n, iter2->second);
-      }
-    }
-  }
-
-  // Reconnect XLA computation to outside compilation control edge.
-  for (Node* n : g->nodes()) {
-    std::vector<string> control_deps;
-    Status s =
-        GetNodeAttr(n->attrs(), kXlaConnectedFromOtherXlaComputationAttrName,
-                    &control_deps);
-    if (!s.ok()) {
-      if (s.code() != error::NOT_FOUND) {
-        return s;
-      } else {
-        continue;
-      }
-    } else {
-      n->ClearAttr(kXlaConnectedFromOtherXlaComputationAttrName);
-      for (const string& control_input : control_deps) {
-        auto iter = cluster_node_names.find(control_input);
-        if (iter == cluster_node_names.end()) {
-          return errors::Internal("Cannot find cluster node for ",
-                                  control_input);
-        }
-        auto iter2 = node_name_index.find(iter->second);
-        if (iter2 == node_name_index.end()) {
-          return errors::Internal("Cannot find cluster node for ",
-                                  iter->second);
-        }
-        g->AddControlEdge(iter2->second, n);
-      }
-    }
-  }
-
-  return Status::OK();
-}
-
 // Step 1 for `PreprocessEdgesBetweenOutsideCompilations`. See comments of
 // `PreprocessEdgesBetweenOutsideCompilations` for details.
 Status PreprocessControlEdgesBetweenOutsideCompilations(
@@ -811,20 +300,6 @@ Status PostprocessControlEdgesBetweenOutsideCompilations(
 
 const char kXlaInferredShapesAttrName[] = "_xla_inferred_shapes";
 
-const char kXlaConnectedToOtherXlaComputationAttrName[] =
-    "_xla_connected_to_other_xla_computation";
-const char kXlaConnectedFromOtherXlaComputationAttrName[] =
-    "_xla_connected_from_other_xla_computation";
-const char kXlaControlDependenciesAttrName[] = "_xla_control_dependencies";
-const char kBridgeSourceNodeAttrName[] = "_xla_bridge_src";
-const char kOutsideCompilationToHostOriginalNodeAttrName[] =
-    "_xla_oc_to_host_node_name";
-const char kOutsideCompilationToHostSrcOutputAttrName[] =
-    "_xla_oc_to_host_src_output";
-const char kHostToOutsideCompilationOriginalNodeAttrName[] =
-    "_xla_host_to_oc_node_name";
-const char kHostToOutsideCompilationSrcOutputAttrName[] =
-    "_xla_host_to_oc_src_output";
 const char kXlaConnectedToXlaComputationAttrName[] =
     "_xla_connected_to_xla_computation";
 const char kXlaConnectedFromXlaComputationAttrName[] =
@@ -835,32 +310,7 @@ const char kOutsideCompilationSrcOutputAttrName[] = "_xla_oc_to_oc_src_output";
 const char kXlaControlDependenciesWithinXlaClusterAttrName[] =
     "_xla_control_dependencies_within_xla_cluster";
 
-Status PerformStaticShapeInferenceBeforeEncapsulation(
-    Graph* g, const string& xla_computation_attr_name,
-    const string& outside_compilation_attr_name) {
-  // Find all outside compilation to XLA computation data edges.
-  std::unordered_set<Node*> outside_compilation_send_nodes;
-  for (auto e : g->edges()) {
-    if (e->IsControlEdge()) {
-      continue;
-    }
-
-    auto src_computation = GetStringAttr(*e->src(), xla_computation_attr_name);
-    auto dst_computation = GetStringAttr(*e->dst(), xla_computation_attr_name);
-    if (!src_computation || !dst_computation ||
-        *src_computation != *dst_computation) {
-      continue;
-    }
-
-    auto src_outside_compilation =
-        GetStringAttr(*e->src(), outside_compilation_attr_name);
-    auto dst_outside_compilation =
-        GetStringAttr(*e->dst(), outside_compilation_attr_name);
-    if (src_outside_compilation && !dst_outside_compilation) {
-      outside_compilation_send_nodes.insert(e->src());
-    }
-  }
-
+Status PerformStaticShapeInferenceBeforeEncapsulation(Graph* g) {
   // Perform shape inference.
   std::map<int, InferredShape> arg_shapes;
   GraphShapeInfo shape_info;
@@ -868,55 +318,21 @@ Status PerformStaticShapeInferenceBeforeEncapsulation(
       InferShapes(g, arg_shapes, /*fnlib_def=*/nullptr, &shape_info));
 
   // Add attribute for output shapes.
-  for (Node* n : outside_compilation_send_nodes) {
-    auto iter = shape_info.find(n->name());
-    if (iter == shape_info.end()) {
-      continue;
-    }
-
+  auto node_name_index = g->BuildNodeNameIndex();
+  for (auto iter : shape_info) {
     std::vector<PartialTensorShape> output_shapes;
-    std::transform(iter->second.begin(), iter->second.end(),
+    std::transform(iter.second.begin(), iter.second.end(),
                    std::back_inserter(output_shapes),
                    [](const InferredShape& inferred_shape) {
                      return inferred_shape.shape;
                    });
+    Node* n = node_name_index[iter.first];
     n->AddAttr(kXlaInferredShapesAttrName, output_shapes);
   }
 
   return Status::OK();
 }
 
-Status PreprocessForEncapsulation(Graph* g,
-                                  const string& xla_computation_attr_name,
-                                  const string& outside_compilation_attr_name) {
-  TF_RETURN_IF_ERROR(ProcessControlEdges(g, xla_computation_attr_name,
-                                         outside_compilation_attr_name));
-  TF_RETURN_IF_ERROR(ProcessXlaToXlaDataEdges(g, xla_computation_attr_name,
-                                              outside_compilation_attr_name));
-  TF_RETURN_IF_ERROR(ProcessDataEdgeBetweenOutsideCompilationAndHostComputation(
-      g, xla_computation_attr_name, outside_compilation_attr_name));
-  return Status::OK();
-}
-
-Status PostprocessForEncapsulation(
-    Graph* g, const string& xla_computation_attr_name,
-    const string& outside_compilation_attr_name,
-    const std::unordered_map<string, XlaClusterInfo>& clusters) {
-  // The `node` pointer in `XlaClusterInfo` might be invalidated in step 1/2,
-  // but the node name won't change. Record cluster node name for
-  // `AddControlDependencies`.
-  std::unordered_map<string, string> cluster_node_names;
-  for (const auto& iter : clusters) {
-    cluster_node_names[iter.first] = iter.second.node->name();
-  }
-
-  TF_RETURN_IF_ERROR(
-      RemovePlaceholderBetweenOutsideCompilationAndHostComputation(g));
-  TF_RETURN_IF_ERROR(RemoveIdentityBetweenDifferentXlaComputation(g));
-  TF_RETURN_IF_ERROR(AddControlDependencies(g, cluster_node_names));
-  return Status::OK();
-}
-
 Status PreprocessEdgesBetweenOutsideCompilations(
     Graph* g, const string& outside_compilation_attr_name) {
   // Remove edges from source node to outside compilation nodes, and edges
diff --git a/tensorflow/compiler/jit/encapsulate_util.h b/tensorflow/compiler/jit/encapsulate_util.h
index e363bc5754ac395bae262dc67a780a0173efaf5e..c9f16d14168163e11bb19092f566f1de8724aca3 100644
--- a/tensorflow/compiler/jit/encapsulate_util.h
+++ b/tensorflow/compiler/jit/encapsulate_util.h
@@ -27,51 +27,13 @@ namespace tensorflow {
 // a list of PartialTensorShape objects.
 extern const char kXlaInferredShapesAttrName[];
 
-// Infer output shapes for outside compilation nodes which have output data
-// edges to XLA computation nodes. These shapes will be used later by XLA
-// compiler as output shapes of the outside compilation's XlaHostCompute op.
-// XLA computation nodes will be mark by attr `xla_computation_attr_name`;
-// outside compilation nodes will be marked by both attr
-// `xla_computation_attr_name` and `outside_compilation_attr_name`.
-//
-// Those outside compilation nodes will be marked with attribute
-// `kXlaInferredShapesAttrName`.
+// Infers output shapes for all nodes in graph `g`. The output shapes will be
+// stored in node attribute `kXlaInferredShapesAttrName`.
 //
 // We have to perform shape inference before encapsulation because after
 // encapsulation, some nodes will be encapsulated into function call, and shape
 // inference does not handle function call at the moment.
-Status PerformStaticShapeInferenceBeforeEncapsulation(
-    Graph* g, const string& xla_computation_attr_name,
-    const string& outside_compilation_attr_name);
-
-// Attribute indicating that some ops in other XLA computation has control
-// dependency on this node. Attribute value will be a list of string (XLA
-// computation names).
-extern const char kXlaConnectedToOtherXlaComputationAttrName[];
-
-// Attribute indicating that this node has control dependency on some ops in
-// other XLA computation. Attribute value will be a list of string (XLA
-// computation names).
-extern const char kXlaConnectedFromOtherXlaComputationAttrName[];
-
-// Attribute indicating that this node has control dependencies on some other
-// nodes. Attribute value will be a list of string (node names).
-extern const char kXlaControlDependenciesAttrName[];
-
-// Attribute indicating that this is an Identity node added to act as a bridge
-// between different XLA computations. Attribute value will be string (source
-// node name).
-extern const char kBridgeSourceNodeAttrName[];
-
-// Attribute indicating that this is an Placeholder node added to act as a
-// temporary input node for an outside compilation node. Attribute value will be
-// string (original input node name).
-extern const char kOutsideCompilationToHostOriginalNodeAttrName[];
-
-// Attribute indicating that this is an Placeholder node added to act as a
-// temporary input node for an outside compilation node. Attribute value will be
-// int (src_output for original edge).
-extern const char kOutsideCompilationToHostSrcOutputAttrName[];
+Status PerformStaticShapeInferenceBeforeEncapsulation(Graph* g);
 
 // Attribute indicating that some ops in this node's XLA computation has control
 // dependency on this node. Attribute value will always be "true".
@@ -81,16 +43,6 @@ extern const char kXlaConnectedToXlaComputationAttrName[];
 // this node's XLA computation. Attribute value will always be "true".
 extern const char kXlaConnectedFromXlaComputationAttrName[];
 
-// Attribute indicating that this is an Placeholder node added to act as a
-// temporary input node for an host node. Attribute value will be string
-// (original input node name).
-extern const char kHostToOutsideCompilationOriginalNodeAttrName[];
-
-// Attribute indicating that this is an Placeholder node added to act as a
-// temporary input node for a host node. Attribute value will be int (src_output
-// for original edge).
-extern const char kHostToOutsideCompilationSrcOutputAttrName[];
-
 // Attribute indicating that this is an Placeholder node added to act as a
 // temporary input node for an outside compilation node. Attribute value will be
 // string (original input node name).
@@ -106,27 +58,6 @@ extern const char kOutsideCompilationSrcOutputAttrName[];
 // (node names).
 extern const char kXlaControlDependenciesWithinXlaClusterAttrName[];
 
-// Preprocesses edges between different XLA clusters for encapsulation. It will
-// perform the following operations in order:
-//
-// 1a. For control edges between outside compilation and another XLA
-//     computation, add attr "kXlaConnected{From, To}OtherXlaComputationAttrName
-//     = XLA computation node name" to the outside compilation node.
-// 1b. For control edges between different outside compilations (in different
-//     XLA computations), remove the edge and add attr
-//     "kXlaControlDependenciesAttrName = src node name" to dst node.
-// 1c. For control edges between outside compilation and host computation,
-//     remove the edge and add attr "kXlaControlDependenciesAttrName = src node
-//     name" to dst node.
-// 2. For data edges between different XLA computations, if either src or dst
-//    is outside compilation, add an Identity node in between the edge. The
-//    identity node will have attr kBridgeSourceNodeAttrName.
-// 3. For data edges between outside compilation and host computation, remove
-//    the edge and create a Placeholder node as dst node's input.
-Status PreprocessForEncapsulation(Graph* g,
-                                  const string& xla_computation_attr_name,
-                                  const string& outside_compilation_attr_name);
-
 // Information for XLA computation.
 struct XlaClusterInfo {
   // Add an explicitly-defined default constructor for this class.
@@ -158,24 +89,6 @@ struct XlaClusterInfo {
   const std::map<string, int> host_compute_core;
 };
 
-// Postprocesses edges between different XLA clusters for encapsulation. This
-// function reverts what `PreprocessForEncapsulation` did. It will perform the
-// following operations in order:
-//
-// 1. Remove Placeholder nodes between outside compilation and host computation
-//     (created in `PreprocessForEncapsulation` step 3).
-// 2. Remove Identity nodes created in `PreprocessForEncapsulation` step 2.
-// 3a. Reconnect control edges between outside compilation and another XLA
-//     computation (marked by `PreprocessForEncapsulation` step 1a).
-// 3b. Reconnect control edges between different outside compilations (marked by
-//     `PreprocessForEncapsulation` step 1b).
-// 3c. Reconnect control edges between outside compilation and host computation
-//     (marked by `PreprocessForEncapsulation` step 1c).
-Status PostprocessForEncapsulation(
-    Graph* g, const string& xla_computation_attr_name,
-    const string& outside_compilation_attr_name,
-    const std::unordered_map<string, XlaClusterInfo>& clusters);
-
 // Preprocesses edges within the same XLA cluster. It will perform the following
 // operations in order:
 //
diff --git a/tensorflow/compiler/jit/encapsulate_util_test.cc b/tensorflow/compiler/jit/encapsulate_util_test.cc
index 3b8b49cb92f3e453883a8e64e12ce3748a5173f6..6d1661222e3eaf9df4f9f91f2b426c80b55245b2 100644
--- a/tensorflow/compiler/jit/encapsulate_util_test.cc
+++ b/tensorflow/compiler/jit/encapsulate_util_test.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
-#include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -38,24 +37,11 @@ TEST(PerformStaticShapeInferenceBeforeEncapsulationTest, Basic) {
   Graph g(OpRegistry::Global());
   TF_CHECK_OK(s.ToGraph(&g));
 
-  // "add" node is outside compilation node, "identity" node is XLA node.
-  auto node_index = g.BuildNodeNameIndex();
-  Node *add_node = node_index["add"], *identity_node = node_index["identity"];
-  add_node->AddAttr("_xla", "cluster");
-  add_node->AddAttr("_oc", "cluster");
-  identity_node->AddAttr("_xla", "cluster");
-  TF_CHECK_OK(
-      PerformStaticShapeInferenceBeforeEncapsulation(&g, "_xla", "_oc"));
+  TF_CHECK_OK(PerformStaticShapeInferenceBeforeEncapsulation(&g));
 
-  // Check that only "add" node now has _xla_inferred_shapes attr.
-  std::vector<Node *> nodes_with_inferred_shape;
-  for (Node *n : g.nodes()) {
-    if (HasNodeAttr(n->def(), kXlaInferredShapesAttrName)) {
-      nodes_with_inferred_shape.push_back(n);
-    }
-  }
-  EXPECT_EQ(nodes_with_inferred_shape.size(), 1);
-  EXPECT_EQ(nodes_with_inferred_shape[0], add_node);
+  // Check that "add" node now has _xla_inferred_shapes attr.
+  auto node_index = g.BuildNodeNameIndex();
+  Node *add_node = node_index["add"];
   std::vector<PartialTensorShape> output_shapes;
   TF_CHECK_OK(GetNodeAttr(add_node->attrs(), kXlaInferredShapesAttrName,
                           &output_shapes));
@@ -66,329 +52,4 @@ TEST(PerformStaticShapeInferenceBeforeEncapsulationTest, Basic) {
   EXPECT_EQ(shape_proto.dim(0).size(), 2);
 }
 
-TEST(PreprocessForEncapsulationTest, ControlEdges) {
-  // Build the graph:
-  // "const_0" and "const_1" in host computation
-  // "add" = "const_0" + "const_1" in XLA computation 0
-  // "identity0" = "add" in XLA computation 0 & outside compilation 0
-  // "identity1" = "identity0" in XLA computation 0
-  // "identity2" = "identity1" in host computation
-  // "identity3" = "identity2" in XLA computation 1
-  // "identity4" = "identity3" in XLA computation 1 & outside compilation 1
-  // "identity5" = "identity4" in XLA computation 1
-  // "identity6" = "identity5" in host computation
-  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  Output const_0 = ops::Const(s.WithOpName("const_0"), 1, {});
-  Output const_1 = ops::Const(s.WithOpName("const_1"), 2, {});
-  Output add = ops::Add(s.WithOpName("add"), const_0, const_1);
-  Output identity0 = ops::Identity(s.WithOpName("identity0"), add);
-  Output identity1 = ops::Identity(s.WithOpName("identity1"), identity0);
-  Output identity2 = ops::Identity(s.WithOpName("identity2"), identity1);
-  Output identity3 = ops::Identity(s.WithOpName("identity3"), identity2);
-  Output identity4 = ops::Identity(s.WithOpName("identity4"), identity3);
-  Output identity5 = ops::Identity(s.WithOpName("identity5"), identity4);
-  Graph g(OpRegistry::Global());
-  TF_CHECK_OK(s.ToGraph(&g));
-  auto node_index = g.BuildNodeNameIndex();
-
-  // Set XLA computation/outside compilation attr, and add control edges.
-  Node *const0_node = node_index["const_0"], *add_node = node_index["add"],
-       *identity0_node = node_index["identity0"],
-       *identity1_node = node_index["identity1"],
-       *identity2_node = node_index["identity2"],
-       *identity3_node = node_index["identity3"],
-       *identity4_node = node_index["identity4"],
-       *identity5_node = node_index["identity5"];
-  add_node->AddAttr("_xla", "0");
-  identity0_node->AddAttr("_xla", "0");
-  identity0_node->AddAttr("_oc", "0");
-  identity1_node->AddAttr("_xla", "0");
-  identity3_node->AddAttr("_xla", "1");
-  identity4_node->AddAttr("_xla", "1");
-  identity4_node->AddAttr("_oc", "0");
-  identity5_node->AddAttr("_xla", "1");
-  // Case 1a: control edges between outside compilation and another XLA
-  // computation.
-  g.AddControlEdge(identity0_node, identity3_node);
-  g.AddControlEdge(identity1_node, identity4_node);
-  // Case 1b: control edges between different outside compilations.
-  g.AddControlEdge(identity0_node, identity4_node);
-  // Case 1c: control edges between outside compilation and host computation.
-  g.AddControlEdge(const0_node, identity0_node);
-  g.AddControlEdge(identity0_node, identity2_node);
-
-  TF_CHECK_OK(PreprocessForEncapsulation(&g, "_xla", "_oc"));
-
-  // Case 1a: add attr "_xla_control_deps_{from/to} = XLA computation node name"
-  // to the outside compilation node.
-  std::vector<string> attr;
-  TF_CHECK_OK(GetNodeAttr(identity0_node->def(),
-                          kXlaConnectedToOtherXlaComputationAttrName, &attr));
-  EXPECT_EQ(attr.size(), 1);
-  EXPECT_EQ(attr[0], "1");
-  attr.clear();
-  TF_CHECK_OK(GetNodeAttr(identity4_node->def(),
-                          kXlaConnectedFromOtherXlaComputationAttrName, &attr));
-  EXPECT_EQ(attr.size(), 1);
-  EXPECT_EQ(attr[0], "0");
-  // Case 1b: add attr "_xla_control_deps = src node name" to dst node.
-  attr.clear();
-  TF_CHECK_OK(GetNodeAttr(identity4_node->def(),
-                          kXlaControlDependenciesAttrName, &attr));
-  EXPECT_EQ(attr.size(), 1);
-  EXPECT_EQ(attr[0], "identity0");
-  // Case 1c: add attr "_xla_control_deps = src node name" to dst node.
-  attr.clear();
-  TF_CHECK_OK(GetNodeAttr(identity0_node->def(),
-                          kXlaControlDependenciesAttrName, &attr));
-  EXPECT_EQ(attr.size(), 1);
-  EXPECT_EQ(attr[0], "const_0");
-  attr.clear();
-  TF_CHECK_OK(GetNodeAttr(identity2_node->def(),
-                          kXlaControlDependenciesAttrName, &attr));
-  EXPECT_EQ(attr.size(), 1);
-  EXPECT_EQ(attr[0], "identity0");
-}
-
-TEST(PreprocessForEncapsulationTest, DataEdges) {
-  // Build the graph:
-  // "const_0" and "const_1" in host computation
-  // "identityn0" = ("const_0", "const_1") in host computation 0
-  // "add0" = "const_0" + "const_1" in XLA computation 0
-  // "add1" = "add0" + "const_0" in XLA computation 0 & outside compilation 0
-  // "identity0" = "add1" in XLA computation 0
-  // "add2" = "add1" + "identity0" in host computation
-  // "add3" = "add1" + "add2" in XLA computation 1
-  // "add4" = "identity0" + "add2" in XLA computation 1 & outside compilation 0
-  // "add5" = "identityn0"[0] + "identityn0"[1] in XLA computation 1 &
-  //                                               outside compilation 0
-  // "identityn1" = ("identityn0"[0], "identityn0"[1]) in XLA computation 1 &
-  //                                                   outside compilation 0
-  // "identity1" = "add4" in XLA computation 1
-  // "identity2" = "identity1" in host computation
-  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  Output const_0 = ops::Const(s.WithOpName("const_0"), 1, {});
-  Output const_1 = ops::Const(s.WithOpName("const_1"), 2, {});
-  auto identityn0 =
-      ops::IdentityN(s.WithOpName("identityn_0"), {const_0, const_1});
-  Output add0 = ops::Add(s.WithOpName("add0"), const_0, const_1);
-  Output add1 = ops::Add(s.WithOpName("add1"), add0, const_0);
-  Output identity0 = ops::Identity(s.WithOpName("identity0"), add1);
-  Output add2 = ops::Add(s.WithOpName("add2"), add1, identity0);
-  Output add3 = ops::Add(s.WithOpName("add3"), add1, add2);
-  Output add4 = ops::Add(s.WithOpName("add4"), identity0, add2);
-  Output add5 = ops::Add(s.WithOpName("add5"), identityn0[0], identityn0[1]);
-  auto identityn1 = ops::IdentityN(s.WithOpName("identityn_1"),
-                                   {identityn0[0], identityn0[1]});
-  Output identity1 = ops::Identity(s.WithOpName("identity1"), add4);
-  Output identity2 = ops::Identity(s.WithOpName("identity2"), add4);
-  Graph g(OpRegistry::Global());
-  TF_CHECK_OK(s.ToGraph(&g));
-  auto node_index = g.BuildNodeNameIndex();
-
-  // Set XLA computation/outside compilation attr.
-  Node *add0_node = node_index["add0"], *add1_node = node_index["add1"],
-       *identity0_node = node_index["identity0"],
-       *add3_node = node_index["add3"], *add4_node = node_index["add4"],
-       *add5_node = node_index["add5"],
-       *identityn1_node = node_index["identityn_1"],
-       *identity1_node = node_index["identity1"];
-  add0_node->AddAttr("_xla", "0");
-  add1_node->AddAttr("_xla", "0");
-  add1_node->AddAttr("_oc", "0");
-  identity0_node->AddAttr("_xla", "0");
-  add3_node->AddAttr("_xla", "1");
-  add4_node->AddAttr("_xla", "1");
-  add4_node->AddAttr("_oc", "0");
-  add5_node->AddAttr("_xla", "1");
-  add5_node->AddAttr("_oc", "0");
-  identityn1_node->AddAttr("_xla", "1");
-  identityn1_node->AddAttr("_oc", "0");
-  identity1_node->AddAttr("_xla", "1");
-
-  TF_CHECK_OK(PreprocessForEncapsulation(&g, "_xla", "_oc"));
-
-  // Check input nodes for related data edges.
-  node_index = g.BuildNodeNameIndex();
-  // Step 2: add an Identity node between different XLA computations.
-  Node *bridge_add1_add3 = node_index["bridge_add1_add3"];
-  EXPECT_NE(bridge_add1_add3, nullptr);
-  string str;
-  TF_CHECK_OK(
-      GetNodeAttr(bridge_add1_add3->attrs(), kBridgeSourceNodeAttrName, &str));
-  EXPECT_EQ(str, "add1");
-  Node *bridge_identity0_add4 = node_index["bridge_identity0_add4"];
-  EXPECT_NE(bridge_identity0_add4, nullptr);
-  // Step 3: add placeholder for edges between host computation and outside
-  // compilation.
-  EXPECT_EQ(bridge_add1_add3->def().input(0), "add1_oc_to_host_placeholder_0");
-  Node *add1_oc_to_host_placeholder =
-      node_index["add1_oc_to_host_placeholder_0"];
-  TF_CHECK_OK(GetNodeAttr(add1_oc_to_host_placeholder->attrs(),
-                          kOutsideCompilationToHostOriginalNodeAttrName, &str));
-  EXPECT_EQ(str, "add1");
-  int i;
-  TF_CHECK_OK(GetNodeAttr(add1_oc_to_host_placeholder->attrs(),
-                          kOutsideCompilationToHostSrcOutputAttrName, &i));
-  EXPECT_EQ(i, 0);
-  add4_node = node_index["add4"];
-  ASSERT_NE(add4_node, nullptr);
-  EXPECT_EQ(add4_node->def().input(0),
-            "bridge_identity0_add4_host_to_oc_placeholder_0");
-  Node *identity0_host_to_oc_placeholder =
-      node_index["bridge_identity0_add4_host_to_oc_placeholder_0"];
-  TF_CHECK_OK(GetNodeAttr(identity0_host_to_oc_placeholder->attrs(),
-                          kHostToOutsideCompilationOriginalNodeAttrName, &str));
-  EXPECT_EQ(str, "bridge_identity0_add4");
-  TF_CHECK_OK(GetNodeAttr(identity0_host_to_oc_placeholder->attrs(),
-                          kHostToOutsideCompilationSrcOutputAttrName, &i));
-  EXPECT_EQ(i, 0);
-
-  // Check different placeholder nodes are created for different src_output.
-  Node *placeholder0 = node_index["identityn_0_host_to_oc_placeholder_0"],
-       *placeholder1 = node_index["identityn_0_host_to_oc_placeholder_1"];
-  EXPECT_NE(placeholder0, nullptr);
-  EXPECT_NE(placeholder1, nullptr);
-  // Check we only have 2 placeholder nodes created for "identityn_0".
-  int placeholder_count = 0;
-  for (Node *n : g.nodes()) {
-    if (HasNodeAttr(n->def(), kHostToOutsideCompilationOriginalNodeAttrName)) {
-      string attr;
-      TF_CHECK_OK(GetNodeAttr(
-          n->attrs(), kHostToOutsideCompilationOriginalNodeAttrName, &attr));
-      if (attr == "identityn_0") {
-        ++placeholder_count;
-      }
-    }
-  }
-  EXPECT_EQ(placeholder_count, 2);
-}
-
-TEST(PostprocessForEncapsulationTest, ControlEdges) {
-  // Build the graph:
-  // "const0"
-  // "identity0" = "const0" (XLA computation 0)
-  // "identity1" = "identity0"
-  // "identity2" = "identity1" (XLA computation 1)
-  // "identity3" = "identity2"
-  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  Output const0 = ops::Const(s.WithOpName("const0"), 1, {});
-  Output identity0 = ops::Identity(s.WithOpName("identity0"), const0);
-  Output identity1 = ops::Identity(s.WithOpName("identity1"), identity0);
-  Output identity2 = ops::Identity(s.WithOpName("identity2"), identity1);
-  Output identity3 = ops::Identity(s.WithOpName("identity3"), identity2);
-  Graph g(OpRegistry::Global());
-  TF_CHECK_OK(s.ToGraph(&g));
-  auto node_index = g.BuildNodeNameIndex();
-
-  // Set XLA computation/outside compilation attr, and add control edges.
-  Node *const0_node = node_index["const0"],
-       *identity0_node = node_index["identity0"],
-       *identity1_node = node_index["identity1"],
-       *identity2_node = node_index["identity2"],
-       *identity3_node = node_index["identity3"];
-  identity1_node->AddAttr(kXlaConnectedFromOtherXlaComputationAttrName,
-                          std::vector<string>{"0"});
-  identity1_node->AddAttr(kXlaConnectedToOtherXlaComputationAttrName,
-                          std::vector<string>{"1"});
-  identity3_node->AddAttr(kXlaControlDependenciesAttrName,
-                          std::vector<string>{"const0", "identity1"});
-
-  std::unordered_map<string, XlaClusterInfo> clusters;
-  clusters["0"].node = identity0_node;
-  clusters["1"].node = identity2_node;
-  TF_CHECK_OK(PostprocessForEncapsulation(&g, "_xla", "_oc", clusters));
-
-  // Case 3a: we have control edge identity0 -> identity1, and identity1 ->
-  // identity2.
-  bool edge_identity0_identity1 = false, edge_identity1_identity2 = false;
-  for (const Edge *e : g.edges()) {
-    if (!e->IsControlEdge()) {
-      continue;
-    }
-    if (e->src() == identity0_node && e->dst() == identity1_node) {
-      edge_identity0_identity1 = true;
-    } else if (e->src() == identity1_node && e->dst() == identity2_node) {
-      edge_identity1_identity2 = true;
-    }
-  }
-  EXPECT_TRUE(edge_identity0_identity1);
-  EXPECT_TRUE(edge_identity1_identity2);
-  // Case 3b: we have control edge const0 -> identity3, and identity1 ->
-  // identity3.
-  bool edge_const0_identity3 = false, edge_identity1_identity3 = false;
-  for (const Edge *e : g.edges()) {
-    if (!e->IsControlEdge()) {
-      continue;
-    }
-    if (e->src() == const0_node && e->dst() == identity3_node) {
-      edge_const0_identity3 = true;
-    } else if (e->src() == identity1_node && e->dst() == identity3_node) {
-      edge_identity1_identity3 = true;
-    }
-  }
-  EXPECT_TRUE(edge_const0_identity3);
-  EXPECT_TRUE(edge_identity1_identity3);
-}
-
-TEST(PostprocessForEncapsulationTest, DataEdges) {
-  // Build the graph:
-  // "const0" in outside compilation "0"
-  // "placeholder0" (for "const0") in host computation
-  // "add0" = "placeholder0" + "placeholder0" in host computation
-  // "placeholder1" (for "add0") in outside compilation 1
-  // "add1" = "placeholder1" + "placeholder1" in outside compilation 1
-  //
-  // "bridge" = "placeholder0" in host computation
-  // "placeholder2" (for "bridge") in outside compilation 1
-  // "add2" = "placeholder2" + "placeholder2" in outside compilation 1
-  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  Output const0 = ops::Const(s.WithOpName("const0"), 1, {});
-  Output placeholder0 =
-      ops::Placeholder(s.WithOpName("placeholder0"), DT_INT32);
-  Output add0 = ops::Add(s.WithOpName("add0"), placeholder0, placeholder0);
-  Output placeholder1 =
-      ops::Placeholder(s.WithOpName("placeholder1"), DT_INT32);
-  Output add1 = ops::Add(s.WithOpName("add1"), placeholder1, placeholder1);
-  Output bridge = ops::Identity(s.WithOpName("bridge"), placeholder0);
-  Output placeholder2 =
-      ops::Placeholder(s.WithOpName("placeholder2"), DT_INT32);
-  Output add2 = ops::Add(s.WithOpName("add2"), placeholder2, placeholder2);
-  Graph g(OpRegistry::Global());
-  TF_CHECK_OK(s.ToGraph(&g));
-  auto node_index = g.BuildNodeNameIndex();
-
-  // Set related attributes.
-  Node *placeholder0_node = node_index["placeholder0"];
-  placeholder0_node->AddAttr(kOutsideCompilationToHostOriginalNodeAttrName,
-                             "const0");
-  placeholder0_node->AddAttr(kOutsideCompilationToHostSrcOutputAttrName, 0);
-  Node *placeholder1_node = node_index["placeholder1"];
-  placeholder1_node->AddAttr(kHostToOutsideCompilationOriginalNodeAttrName,
-                             "add0");
-  placeholder1_node->AddAttr(kHostToOutsideCompilationSrcOutputAttrName, 0);
-  Node *bridge_node = node_index["bridge"];
-  bridge_node->AddAttr(kBridgeSourceNodeAttrName, "const0");
-  Node *placeholder2_node = node_index["placeholder2"];
-  placeholder2_node->AddAttr(kHostToOutsideCompilationOriginalNodeAttrName,
-                             "bridge");
-  placeholder2_node->AddAttr(kHostToOutsideCompilationSrcOutputAttrName, 0);
-
-  std::unordered_map<string, XlaClusterInfo> clusters;
-  TF_CHECK_OK(PostprocessForEncapsulation(&g, "_xla", "_oc", clusters));
-
-  // Result graph should be:
-  // "add0" = "const0" + "const0"
-  // "add1" = "add0" + "add0"
-  // "add2" = "const0" + "const0"
-  node_index = g.BuildNodeNameIndex();
-  EXPECT_EQ(node_index.size(), 6);
-  EXPECT_EQ(node_index["add0"]->def().input(0), "const0:0");
-  EXPECT_EQ(node_index["add0"]->def().input(1), "const0:0");
-  EXPECT_EQ(node_index["add1"]->def().input(0), "add0:0");
-  EXPECT_EQ(node_index["add1"]->def().input(1), "add0:0");
-  EXPECT_EQ(node_index["add2"]->def().input(0), "const0:0");
-  EXPECT_EQ(node_index["add2"]->def().input(1), "const0:0");
-}
-
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc b/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc
index d334100aa4a915a87fb05d371e0e3379a7ee05f2..f0c9d573451952a398dce190e102a33270a4d739 100644
--- a/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc
+++ b/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc
@@ -15,13 +15,17 @@ limitations under the License.
 
 #include "tensorflow/compiler/jit/encapsulate_xla_computations_pass.h"
 
+#include "absl/algorithm/container.h"
 #include "absl/container/flat_hash_set.h"
 #include "absl/memory/memory.h"
+#include "absl/strings/ascii.h"
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h"
 #include "tensorflow/compiler/tf2xla/dump_graph.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/lib/strings/proto_serialization.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -36,6 +40,25 @@ namespace {
 
 const char* const kXlaClusterOutput = "XlaClusterOutput";
 
+bool IsCpuGpuCompile(const Graph* graph) {
+  for (Node* n : graph->nodes()) {
+    string name;
+    // Only consider nodes being compiled.
+    if (!GetNodeAttr(n->attrs(),
+                     EncapsulateXlaComputationsPass::kXlaClusterAttr, &name)
+             .ok())
+      continue;
+    // Early return for any node with a device that is not a CPU or GPU.
+    DeviceNameUtils::ParsedName parsed;
+    if (DeviceNameUtils::ParseFullName(n->requested_device(), &parsed)) {
+      if (parsed.type != DEVICE_CPU && parsed.type != DEVICE_GPU) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
 // Checks if a graph node is marked to be a guaranteed constant.
 bool is_guaranteed_constant(const Node& n) {
   bool guaranteed_constant = false;
@@ -173,10 +196,11 @@ Status RewriteSubgraph(const std::vector<OutputTensor>& arg_source_tensors,
   // Nondeterminism in serialization would not lead to incorrect results, but
   // may cause spurious cache misses. DeterministicSerialization is a
   // best-effort deterministic serialization.
-  string serialized;
-  TF_RET_CHECK(SerializeToStringDeterministic(gdef, &serialized));
-  uint64 fingerprint = Fingerprint64(serialized);
-  LOG(INFO) << "Subgraph fingerprint:" << fingerprint;
+  const size_t size = gdef.ByteSizeLong();
+  auto serialized = absl::make_unique<char[]>(size);
+  TF_RET_CHECK(SerializeToBufferDeterministic(gdef, serialized.get(), size));
+  uint64 fingerprint = Fingerprint64(absl::string_view(serialized.get(), size));
+  VLOG(1) << "Subgraph fingerprint:" << fingerprint;
   call_def->set_op(absl::StrCat(call_def->op(), "_", fingerprint));
   return Status::OK();
 }
@@ -297,6 +321,7 @@ Status RewriteSubgraph(const std::vector<OutputTensor>& arg_source_tensors,
 
     NodeDef def;
     def.set_name(launch->name());
+    MergeDebugInfo(NodeDebugInfo(launch->def()), &def);
 
     // Target the XLA CPU/GPU backends.
     VLOG(2) << "Replacing with XlaLaunch";
@@ -350,12 +375,19 @@ Status EncapsulateXlaComputationsPass::Run(
           << dump_graph::DumpGraphToFile("encapsulate_xla_computations_before",
                                          **options.graph, options.flib_def);
 
-  TF_RETURN_IF_ERROR(Encapsulate(options.graph, options.flib_def));
+  const char* additional_help =
+      IsCpuGpuCompile(options.graph->get())
+          ? xla::status_macros::kPossibleAutoJitAlternative
+          : "";
+
+  TF_RETURN_WITH_CONTEXT_IF_ERROR(Encapsulate(options.graph, options.flib_def),
+                                  additional_help);
   VLOG(1) << "EncapsulateXlaComputations() half-way: "
           << dump_graph::DumpGraphToFile("encapsulate_xla_computations_halfway",
                                          **options.graph, options.flib_def);
 
-  TF_RETURN_IF_ERROR(BuildXlaLaunchOps(options.graph->get()));
+  TF_RETURN_WITH_CONTEXT_IF_ERROR(BuildXlaLaunchOps(options.graph->get()),
+                                  additional_help);
   VLOG(1) << "EncapsulateXlaComputations() finished: "
           << dump_graph::DumpGraphToFile("encapsulate_xla_computations_after",
                                          **options.graph, options.flib_def);
diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass.cc b/tensorflow/compiler/jit/extract_outside_compilation_pass.cc
index e3c7e2f89be9b37b51a633dabb099969c181013f..2a770c527b2fae91352fd17dacb13495a3a73f34 100644
--- a/tensorflow/compiler/jit/extract_outside_compilation_pass.cc
+++ b/tensorflow/compiler/jit/extract_outside_compilation_pass.cc
@@ -20,14 +20,17 @@ limitations under the License.
 #include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h"
 #include "tensorflow/compiler/jit/encapsulate_util.h"
 #include "tensorflow/compiler/tf2xla/dump_graph.h"
+#include "tensorflow/compiler/tf2xla/side_effect_util.h"
 #include "tensorflow/compiler/tf2xla/tf2xla_util.h"
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/graph_to_functiondef.h"
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
 
 namespace tensorflow {
 
@@ -98,9 +101,12 @@ xla::StatusOr<Node*> BuildRecvAtHostNode(
   recv_at_host_builder.Attr("Toutputs", recv_at_host_dtypes);
   // The correct device_ordinal will be inserted during replication in a
   // subsequent rewrite.
-  recv_at_host_builder.Attr("device_ordinal", 0);
+  AttrValue device_ordinal_value;
+  device_ordinal_value.set_placeholder("device_ordinal");
+  recv_at_host_builder.Attr("device_ordinal", device_ordinal_value);
   recv_at_host_builder.Attr(
       "key", absl::StrCat("host_compute_channel_", oc_cluster_name));
+  recv_at_host_builder.Attr(kXlaHasHostTransferAttrName, true);
   recv_at_host_builder.Input(key_placeholder->name(), 0, DT_STRING);
   TF_RETURN_IF_ERROR(recv_at_host_builder.Finalize(&recv_at_host_def));
   Status s;
@@ -197,9 +203,12 @@ xla::StatusOr<Node*> BuildSendFromHostNode(
   send_from_host_builder.Attr("Tinputs", send_from_host_dtypes);
   // The correct device_ordinal will be inserted during replication in a
   // subsequent rewrite.
-  send_from_host_builder.Attr("device_ordinal", 0);
+  AttrValue device_ordinal_value;
+  device_ordinal_value.set_placeholder("device_ordinal");
+  send_from_host_builder.Attr("device_ordinal", device_ordinal_value);
   send_from_host_builder.Attr(
       "key", absl::StrCat("host_compute_channel_", oc_cluster_name));
+  send_from_host_builder.Attr(kXlaHasHostTransferAttrName, true);
   std::vector<NodeDefBuilder::NodeOut> inputs(send_from_host_dtypes.size());
   for (auto* n : ret_nodes) {
     int index;
@@ -300,6 +309,10 @@ xla::StatusOr<NodeDef> BuildXlaHostComputeNodeDef(
     host_compute_builder.Attr("tpu_core", core);
   }
 
+  // Set input tokens.
+  host_compute_builder.Attr(kXlaTokenInputNodesAttrName,
+                            std::vector<string>{kXlaTokenArgNodeName});
+
   // Populate inputs.
   std::vector<DataType> input_dtypes;
   TF_RETURN_IF_ERROR(GetNodeAttr(call_node->attrs(), "Tinputs", &input_dtypes));
@@ -322,6 +335,38 @@ xla::StatusOr<NodeDef> BuildXlaHostComputeNodeDef(
   return new_def;
 }
 
+Status ValidateOutsideCompilationCallNode(Node* call_node) {
+  // DT_INT64 as input/output for outside compilation is not supported yet:
+  // b/120809951.
+  for (const Edge* e : call_node->in_edges()) {
+    if (e->IsControlEdge()) {
+      continue;
+    }
+    DataType dtype = e->src()->output_type(e->src_output());
+    if (dtype == DT_INT64) {
+      return errors::Unimplemented(
+          "int64 input for outside compilation is not supported yet: "
+          "b/120809951. Please cast output of node ",
+          e->src()->DebugString(),
+          " to int32 before feeding it into outside compilation.");
+    }
+  }
+  for (const Edge* e : call_node->out_edges()) {
+    if (e->IsControlEdge()) {
+      continue;
+    }
+    DataType dtype = e->dst()->input_type(e->dst_input());
+    if (dtype == DT_INT64) {
+      return errors::Unimplemented(
+          "int64 output for outside compilation is not supported yet: "
+          "b/120809951. Please cast input of node ",
+          e->dst()->DebugString(),
+          " to int32 before returning it from outside compilation.");
+    }
+  }
+  return Status::OK();
+}
+
 // Replace outside compilation function call node with XlaHostCompute node.
 // If the function call node has no input/output edges, we will just remove it
 // and not create a XlaHostCompute node.
@@ -357,6 +402,51 @@ Status ReplaceOrRemoveOutsideCompilationCallNode(
   return Status::OK();
 }
 
+// Resets "device_ordinal" attr to placeholder value for related nodes
+// (XlaRecvAtHost nodes; XlaSendFromHost nodes; If/While/FuncCall nodes
+// containing XlaRecvAtHost/XlaSendFromHost).
+Status ResetDeviceOrdinalToPlaceholderValue(Graph* g) {
+  AttrValue device_ordinal_value;
+  device_ordinal_value.set_placeholder("device_ordinal");
+  for (Node* n : g->nodes()) {
+    if (!HasNodeAttr(n->def(), kXlaHasHostTransferAttrName)) {
+      continue;
+    }
+
+    if (n->type_string() == "_XlaRecvAtHost" ||
+        n->type_string() == "_XlaSendFromHost") {
+      n->ClearAttr("device_ordinal");
+      n->AddAttr("device_ordinal", device_ordinal_value);
+    } else if (n->type_string() == "If") {
+      for (const string& attr_name :
+           std::vector<string>{"then_branch", "else_branch"}) {
+        NameAttrList branch_func;
+        TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), attr_name, &branch_func));
+        (*branch_func.mutable_attr())["device_ordinal"] = device_ordinal_value;
+        n->ClearAttr(attr_name);
+        n->AddAttr(attr_name, branch_func);
+      }
+    } else if (n->type_string() == "While") {
+      for (const string& attr_name : std::vector<string>{"cond", "body"}) {
+        NameAttrList branch_func;
+        TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), attr_name, &branch_func));
+        (*branch_func.mutable_attr())["device_ordinal"] = device_ordinal_value;
+        n->ClearAttr(attr_name);
+        n->AddAttr(attr_name, branch_func);
+      }
+    } else if (HasNodeAttr(n->def(), "device_ordinal")) {
+      // Function call node containing outside compilation.
+      n->ClearAttr("device_ordinal");
+      n->AddAttr("device_ordinal", device_ordinal_value);
+    } else {
+      return errors::Internal("Unknown node marked with ",
+                              kXlaHasHostTransferAttrName, ": ",
+                              n->DebugString());
+    }
+  }
+  return Status::OK();
+}
+
 // For an XLA computation, builds host side graph given all outside compilation
 // graphs inside it. The host side graph contains:
 // 1) a "sequencer" node (we will add control edge between XlaRecvAtHost and
@@ -368,8 +458,8 @@ Status ReplaceOrRemoveOutsideCompilationCallNode(
 Status ConstructHostGraph(
     const string& xla_cluster_name, const string& outside_compilation_attr_name,
     const std::vector<string>& outside_compilation_host_graphs,
-    FunctionLibraryDefinition* fld, std::unique_ptr<Graph>* host_graph) {
-  host_graph->reset(new Graph(fld));
+    FunctionLibraryDefinition* fld, const string& host_graph_func_name) {
+  Graph host_graph(fld);
 
   // Create sequencer node in host graph.
   NodeDefBuilder sequencer_builder(absl::StrCat(xla_cluster_name, "_sequencer"),
@@ -378,24 +468,34 @@ Status ConstructHostGraph(
   NodeDef sequencer_def;
   TF_RETURN_IF_ERROR(sequencer_builder.Finalize(&sequencer_def));
   Status s;
-  Node* sequencer = (*host_graph)->AddNode(sequencer_def, &s);
+  Node* sequencer = host_graph.AddNode(sequencer_def, &s);
   TF_RETURN_IF_ERROR(s);
 
   // Create key placeholder in host graph.
   TF_ASSIGN_OR_RETURN(
       Node * key_placeholder,
-      AddHostComputeKeyPlaceholder(xla_cluster_name, host_graph->get()));
+      AddHostComputeKeyPlaceholder(xla_cluster_name, &host_graph));
 
   // For each outside compilation graph, copy them to host graph with the
   // following changes:
   // a) Use key_placeholder in host graph instead of its own.
-  // b) Add control edge from RecvAtHost/SendFromHost to sequencer.
+  // b) Add control edge from host transfer nodes (XlaRecvAtHost,
+  //    XlaSendFromHost, If/While nodes containing
+  //    XlaRecvAtHost/XlaSendFromHost) to sequencer node.
   // c) Clear node_def.device(), so device placer won't get confused.
   for (const string& host_func : outside_compilation_host_graphs) {
     VLOG(4) << "Expanding host graph " << host_func;
+    // Temporarily use "0" as "device_ordinal". It will be reset to placeholder
+    // value after we expanded all host graphs. We cannot just use placeholder
+    // value here because FunctionDef instantiation does not allow placeholder
+    // value for attributes.
+    AttrValue device_ordinal_attr;
+    device_ordinal_attr.set_i(0);
+    protobuf::Map<string, AttrValue> attrs;
+    attrs["device_ordinal"] = device_ordinal_attr;
     FunctionBody* host_fbody = nullptr;
     TF_RETURN_IF_ERROR(FunctionDefToBodyHelper(
-        *fld->Find(host_func), AttrSlice(), fld,
+        *fld->Find(host_func), AttrSlice(&attrs), fld,
         [&](const string& op, const OpDef** sig) {
           return fld->LookUpOpDef(op, sig);
         },
@@ -408,8 +508,8 @@ Status ConstructHostGraph(
     FixupSourceAndSinkEdges(host_fbody->graph);
 
     std::map<const Node*, Node*> node_map;
-    node_map[host_fbody->graph->source_node()] = (*host_graph)->source_node();
-    node_map[host_fbody->graph->sink_node()] = (*host_graph)->sink_node();
+    node_map[host_fbody->graph->source_node()] = host_graph.source_node();
+    node_map[host_fbody->graph->sink_node()] = host_graph.sink_node();
     Status s;
     ReverseDFS(
         *host_fbody->graph, /*enter=*/nullptr,
@@ -431,7 +531,7 @@ Status ConstructHostGraph(
             NodeDef copy_def = n->def();
             // Change c).
             copy_def.clear_device();
-            copy = (*host_graph)->AddNode(copy_def, &s);
+            copy = host_graph.AddNode(copy_def, &s);
             if (!s.ok()) {
               return;
             }
@@ -446,22 +546,23 @@ Status ConstructHostGraph(
                                    e->src()->DebugString());
               return;
             }
-            (*host_graph)
-                ->AddEdge(node_map[e->src()], e->src_output(), copy,
-                          e->dst_input());
+            host_graph.AddEdge(node_map[e->src()], e->src_output(), copy,
+                               e->dst_input());
           }
 
           // Change b).
-          if (copy->type_string() == "_XlaRecvAtHost" ||
-              copy->type_string() == "_XlaSendFromHost") {
-            (*host_graph)->AddControlEdge(copy, sequencer);
+          if (HasNodeAttr(copy->def(), kXlaHasHostTransferAttrName)) {
+            host_graph.AddControlEdge(copy, sequencer);
           }
         },
         NodeComparatorID());
+
     if (!s.ok()) {
       return s;
     }
   }
+  // Reset "device_ordinal" to placeholder value.
+  TF_RETURN_IF_ERROR(ResetDeviceOrdinalToPlaceholderValue(&host_graph));
 
   // sequencer and key_placeholder might be dead nodes. Prune them if necessary.
   // - sequencer should be pruned iff it has no input control edges from
@@ -470,21 +571,30 @@ Status ConstructHostGraph(
   // - key_placeholder should be pruned iff there's no RecvAtHost/SendFromHost.
   //   We don't need to do anything special.
   if (!sequencer->in_edges().empty()) {
-    (*host_graph)->AddControlEdge(sequencer, (*host_graph)->sink_node());
+    host_graph.AddControlEdge(sequencer, host_graph.sink_node());
   }
   PruneForReverseReachability(
-      host_graph->get(),
-      std::unordered_set<const Node*>{(*host_graph)->sink_node()});
+      &host_graph, std::unordered_set<const Node*>{host_graph.sink_node()});
 
   // Postprocess edges between different outside compilations.
   TF_RETURN_IF_ERROR(PostprocessEdgesBetweenOutsideCompilations(
-      host_graph->get(), outside_compilation_attr_name));
+      &host_graph, outside_compilation_attr_name));
 
   if (VLOG_IS_ON(4)) {
     dump_graph::DumpGraphToFile(
         absl::StrCat("extract_outside_compilation_host_graph_for_",
                      xla_cluster_name),
-        **host_graph, fld);
+        host_graph, fld);
+  }
+
+  FunctionDef host_graph_fdef;
+  TF_RETURN_IF_ERROR(
+      GraphToFunctionDef(host_graph, host_graph_func_name, &host_graph_fdef));
+  if (fld->Find(host_graph_func_name)) {
+    TF_RETURN_IF_ERROR(
+        fld->ReplaceFunction(host_graph_func_name, host_graph_fdef));
+  } else {
+    TF_RETURN_IF_ERROR(fld->AddFunctionDef(host_graph_fdef));
   }
 
   return Status::OK();
@@ -492,8 +602,28 @@ Status ConstructHostGraph(
 
 // Expand XLA computation's outside compilation host side graph into main graph.
 // Add a control edge between sequencer node and the XLA computation node.
-Status ExpandHostGraphIntoMainGraph(Graph* main_graph, Graph* host_graph,
+Status ExpandHostGraphIntoMainGraph(Graph* main_graph,
+                                    FunctionLibraryDefinition* fld,
+                                    const string& host_graph_func_name,
                                     Node* xla_computation_node) {
+  // Temporarily use "0" as "device_ordinal". It will be rewritten with the
+  // correct value in a later pass. We cannot just use placeholder value here
+  // because FunctionDef instantiation does not allow placeholder value for
+  // attributes.
+  AttrValue device_ordinal_attr;
+  device_ordinal_attr.set_i(0);
+  protobuf::Map<string, AttrValue> attrs;
+  attrs["device_ordinal"] = device_ordinal_attr;
+  FunctionBody* fbody = nullptr;
+  TF_RETURN_IF_ERROR(FunctionDefToBodyHelper(
+      *fld->Find(host_graph_func_name), AttrSlice(&attrs), fld,
+      [&](const string& op, const OpDef** sig) {
+        return fld->LookUpOpDef(op, sig);
+      },
+      &fbody));
+  std::unique_ptr<FunctionBody> fbody_deleter(fbody);
+  Graph* host_graph = fbody->graph;
+
   // We use ReverseDFS() to copy nodes. Make sure all nodes are reverse
   // reachable from sink node so all nodes will be copied.
   // TODO(b/77601805): consolidate copy graph functions.
@@ -545,23 +675,25 @@ Status ExpandHostGraphIntoMainGraph(Graph* main_graph, Graph* host_graph,
   return s;
 }
 
-// Rewrites shape inference graph for outside compilation.
-// 1. If the outside compilation is a "top-level" one (not in a function of any
-//    If/While/etc.), this shape inference graph might have host computation to
-//    outside compilation placeholder nodes, which will cause shape inference to
-//    fail. However, those nodes are not in `host_graph` any more (because we
-//    have executed `PostprocessForEncapsultion`). In this case, we clear the
-//    graph, and copy SendFromHost with all its predecessors from `host_graph`.
-//    This case is detected by whether the SendFromHost node exists in
-//    `host_graph` as well.
-// 2. Remove control edges, and prune nodes that are not useful for shape
-//    inference.
+// Rewrites shape inference graph for outside compilation:
+// 1) If XlaSendFromHost also exists in `host_graph`, copy nodes from
+//    `host_graph`. Because we might still have outside compilation to outside
+//    compilation placeholder nodes in shape inference graph, which will prevent
+//    us from inferring XlaSendFromHost shape. But in `host_graph`, we already
+//    removed those placeholder nodes.
+// 2) Remove control edges.
+// 3) Prune nodes that are not useful for shape inference.
 Status RewriteShapeInferenceGraph(const string& shape_inference_graph_name,
                                   Graph* host_graph,
                                   FunctionLibraryDefinition* fld) {
+  // Use "0" as "device_ordinal". It does not matter for shape inference.
+  AttrValue device_ordinal_attr;
+  device_ordinal_attr.set_i(0);
+  protobuf::Map<string, AttrValue> attrs;
+  attrs["device_ordinal"] = device_ordinal_attr;
   FunctionBody* fbody = nullptr;
   TF_RETURN_IF_ERROR(FunctionDefToBodyHelper(
-      *fld->Find(shape_inference_graph_name), AttrSlice(), fld,
+      *fld->Find(shape_inference_graph_name), AttrSlice(&attrs), fld,
       [&](const string& op, const OpDef** sig) {
         return fld->LookUpOpDef(op, sig);
       },
@@ -650,6 +782,7 @@ Status RewriteShapeInferenceGraph(const string& shape_inference_graph_name,
       g->RemoveEdge(e);
     }
   }
+
   // Nodes that are not reverse reachable from SendFromHost are not useful for
   // shape inference. Prune them.
   PruneForReverseReachability(g,
@@ -669,6 +802,681 @@ Status RewriteShapeInferenceGraph(const string& shape_inference_graph_name,
   return Status::OK();
 }
 
+// Builds XlaSendToHost node which sends cond predicate to host.
+xla::StatusOr<Node*> BuildSendIfPredNode(const string& name,
+                                         const string& host_transfer_key,
+                                         Node* pred_node, Graph* g) {
+  NodeDefBuilder send_pred_builder(name, "XlaSendToHost");
+  send_pred_builder.Attr("Tinput", DT_BOOL);
+  send_pred_builder.Attr("key", absl::StrCat(host_transfer_key, "_dtoh_0"));
+  send_pred_builder.Attr(kXlaTokenInputNodesAttrName,
+                         std::vector<string>{kXlaTokenArgNodeName});
+  send_pred_builder.Input(pred_node->name(), 0, DT_BOOL);
+  NodeDef send_pred_def;
+  TF_RETURN_IF_ERROR(send_pred_builder.Finalize(&send_pred_def));
+  Status s;
+  Node* send_pred_node = g->AddNode(send_pred_def, &s);
+  TF_RETURN_IF_ERROR(s);
+  g->AddEdge(pred_node, 0, send_pred_node, 0);
+  return send_pred_node;
+}
+
+// Replaces key placeholder node with an _Arg node.
+Status ReplaceKeyPlaceholderWithArgNode(const string& xla_cluster_name,
+                                        const string& func_name,
+                                        FunctionLibraryDefinition* fld) {
+  // Temporarily use "0" as "device_ordinal". It will be reset to placeholder
+  // value after rewriting.
+  AttrValue device_ordinal_attr;
+  device_ordinal_attr.set_i(0);
+  protobuf::Map<string, AttrValue> attrs;
+  attrs["device_ordinal"] = device_ordinal_attr;
+  FunctionBody* fbody = nullptr;
+  TF_RETURN_IF_ERROR(FunctionDefToBodyHelper(
+      *fld->Find(func_name), AttrSlice(&attrs), fld,
+      [&](const string& op, const OpDef** sig) {
+        return fld->LookUpOpDef(op, sig);
+      },
+      &fbody));
+  std::unique_ptr<FunctionBody> fbody_deleter(fbody);
+  Graph* g = fbody->graph;
+
+  // Find or create the key placeholder node.
+  Node* key_placeholder = nullptr;
+  for (Node* n : g->nodes()) {
+    if (IsKeyPlaceholderNode(*n)) {
+      key_placeholder = n;
+      break;
+    }
+  }
+  if (!key_placeholder) {
+    TF_ASSIGN_OR_RETURN(key_placeholder,
+                        AddHostComputeKeyPlaceholder(xla_cluster_name, g));
+  }
+
+  // Build the _Arg node, and replace key placeholder node with it.
+  NodeDefBuilder arg_builder("key_arg", FunctionLibraryDefinition::kArgOp);
+  arg_builder.Attr("T", DT_STRING);
+  arg_builder.Attr("index", 0);
+  NodeDef arg_def;
+  TF_RETURN_IF_ERROR(arg_builder.Finalize(&arg_def));
+  TF_RETURN_IF_ERROR(ReplaceNode(g, key_placeholder, arg_def).status());
+
+  // Reset "device_ordinal" to placeholder value.
+  TF_RETURN_IF_ERROR(ResetDeviceOrdinalToPlaceholderValue(g));
+
+  FunctionDef replace_fdef;
+  TF_RETURN_IF_ERROR(GraphToFunctionDef(*g, func_name, &replace_fdef));
+  TF_RETURN_IF_ERROR(fld->ReplaceFunction(func_name, replace_fdef));
+  return Status::OK();
+}
+
+// Builds host side graph for If node.
+Status BuildHostGraphForIfNode(const string& xla_cluster_attr_name,
+                               const string& outside_compilation_attr_name,
+                               const string& xla_cluster_name,
+                               const string& if_node_name,
+                               const string& host_transfer_key,
+                               const string& host_graph_func_name,
+                               FunctionLibraryDefinition* fld,
+                               const string& then_branch_host_func_name,
+                               const string& else_branch_host_func_name) {
+  Graph host_graph(fld);
+  string outside_compilation_name = absl::StrCat("oc_if_", if_node_name);
+  AttrValue device_ordinal_value;
+  device_ordinal_value.set_placeholder("device_ordinal");
+
+  // Step 1: add key placeholder node.
+  TF_ASSIGN_OR_RETURN(
+      Node * key_placeholder,
+      AddHostComputeKeyPlaceholder(xla_cluster_name, &host_graph));
+
+  // Step 2: build XlaRecvAtHost node to recv predicate.
+  NodeDefBuilder recv_pred_builder(
+      absl::StrCat("recv_oc_if_pred_", if_node_name), "_XlaRecvAtHost");
+  recv_pred_builder.Attr("Toutputs", std::vector<DataType>{DT_BOOL});
+  recv_pred_builder.Attr("key", host_transfer_key);
+  recv_pred_builder.Attr("device_ordinal", device_ordinal_value);
+  recv_pred_builder.Attr(xla_cluster_attr_name, xla_cluster_name);
+  recv_pred_builder.Attr(outside_compilation_attr_name,
+                         outside_compilation_name);
+  recv_pred_builder.Attr(kXlaHasHostTransferAttrName, true);
+  recv_pred_builder.Input(key_placeholder->name(), 0, DT_STRING);
+  NodeDef recv_pred_def;
+  TF_RETURN_IF_ERROR(recv_pred_builder.Finalize(&recv_pred_def));
+  Status s;
+  Node* recv_pred_node = host_graph.AddNode(recv_pred_def, &s);
+  TF_RETURN_IF_ERROR(s);
+  host_graph.AddEdge(key_placeholder, 0, recv_pred_node, 0);
+
+  // Step 3: rewrite `{then, else}_branch_host_func_name`, replace key
+  // placeholder with an _Arg node.
+  TF_RETURN_IF_ERROR(ReplaceKeyPlaceholderWithArgNode(
+      xla_cluster_name, then_branch_host_func_name, fld));
+  TF_RETURN_IF_ERROR(ReplaceKeyPlaceholderWithArgNode(
+      xla_cluster_name, else_branch_host_func_name, fld));
+
+  // Step 4: build If node to choose between `{then, else}_branch_host_graph`.
+  NodeDefBuilder if_builder(absl::StrCat("oc_if_", if_node_name), "If");
+  if_builder.Attr("Tcond", DT_BOOL);
+  if_builder.Attr("Tin", std::vector<DataType>{DT_STRING});
+  if_builder.Attr("Tout", std::vector<DataType>{});
+  NameAttrList host_then_branch, host_else_branch;
+  host_then_branch.set_name(then_branch_host_func_name);
+  (*host_then_branch.mutable_attr())["device_ordinal"] = device_ordinal_value;
+  host_else_branch.set_name(else_branch_host_func_name);
+  (*host_else_branch.mutable_attr())["device_ordinal"] = device_ordinal_value;
+  if_builder.Attr("then_branch", host_then_branch);
+  if_builder.Attr("else_branch", host_else_branch);
+  if_builder.Attr(kXlaHasHostTransferAttrName, true);
+  if_builder.Attr(xla_cluster_attr_name, xla_cluster_name);
+  if_builder.Attr(outside_compilation_attr_name, outside_compilation_name);
+  if_builder.Input(recv_pred_node->name(), 0, DT_BOOL);
+  std::vector<NodeDefBuilder::NodeOut> if_inputs{
+      {key_placeholder->name(), 0, DT_STRING}};
+  if_builder.Input(if_inputs);
+  NodeDef if_def;
+  TF_RETURN_IF_ERROR(if_builder.Finalize(&if_def));
+  Node* if_node = host_graph.AddNode(if_def, &s);
+  TF_RETURN_IF_ERROR(s);
+  host_graph.AddEdge(recv_pred_node, 0, if_node, 0);
+  host_graph.AddEdge(key_placeholder, 0, if_node, 1);
+
+  // Convert `host_graph` to function, and add a "device_ordinal" attr.
+  FunctionDef oc_host_graph_fdef;
+  TF_RETURN_IF_ERROR(GraphToFunctionDef(host_graph, host_graph_func_name,
+                                        &oc_host_graph_fdef));
+  if (fld->Find(host_graph_func_name)) {
+    TF_RETURN_IF_ERROR(
+        fld->ReplaceFunction(host_graph_func_name, oc_host_graph_fdef));
+  } else {
+    TF_RETURN_IF_ERROR(fld->AddFunctionDef(oc_host_graph_fdef));
+  }
+
+  return Status::OK();
+}
+
+// Rewrites loop cond to add a node which sends loop cond to host.
+Status AddSendLoopPredToLoopCond(FunctionLibraryDefinition* fld,
+                                 const NameAttrList& loop_cond_func,
+                                 const string& while_node_name,
+                                 const string& host_transfer_key) {
+  // Instantiate the loop cond function.
+  FunctionBody* fbody = nullptr;
+  TF_RETURN_IF_ERROR(FunctionDefToBodyHelper(
+      *fld->Find(loop_cond_func.name()), AttrSlice(&loop_cond_func.attr()), fld,
+      [&](const string& op, const OpDef** sig) {
+        return fld->LookUpOpDef(op, sig);
+      },
+      &fbody));
+  std::unique_ptr<FunctionBody> fbody_deleter(fbody);
+  Graph* g = fbody->graph;
+
+  // Find the _Retval node and the loop cond node.
+  Node* ret_node = nullptr;
+  for (Node* n : g->nodes()) {
+    if (n->type_string() == "_Retval") {
+      if (ret_node) {
+        return errors::Internal("Multiple return node for loop cond function ",
+                                loop_cond_func.name(), ": ",
+                                ret_node->DebugString(), " and ",
+                                n->DebugString());
+      } else {
+        ret_node = n;
+      }
+    }
+  }
+  if (!ret_node) {
+    return errors::Internal("No _Retval node for loop cond function ",
+                            loop_cond_func.name());
+  }
+  Node* loop_cond;
+  TF_RETURN_IF_ERROR(ret_node->input_node(0, &loop_cond));
+
+  // Build the XlaSendToHost node.
+  NodeDefBuilder send_loop_cond_builder(
+      absl::StrCat("send_oc_while_cond_", while_node_name), "XlaSendToHost");
+  send_loop_cond_builder.Attr("Tinput", DT_BOOL);
+  send_loop_cond_builder.Attr("key",
+                              absl::StrCat(host_transfer_key, "_dtoh_0"));
+  send_loop_cond_builder.Attr(kXlaTokenInputNodesAttrName,
+                              std::vector<string>{kXlaTokenArgNodeName});
+  send_loop_cond_builder.Input(loop_cond->name(), 0, DT_BOOL);
+  NodeDef send_loop_cond_def;
+  TF_RETURN_IF_ERROR(send_loop_cond_builder.Finalize(&send_loop_cond_def));
+  Status s;
+  Node* send_loop_cond_node = g->AddNode(send_loop_cond_def, &s);
+  TF_RETURN_IF_ERROR(s);
+  g->AddEdge(loop_cond, 0, send_loop_cond_node, 0);
+
+  // Replace original function.
+  FunctionDef replace_fdef;
+  TF_RETURN_IF_ERROR(
+      GraphToFunctionDef(*g, loop_cond_func.name(), &replace_fdef));
+  TF_RETURN_IF_ERROR(fld->ReplaceFunction(loop_cond_func.name(), replace_fdef));
+
+  return Status::OK();
+}
+
+// Rewrites while loop cond function for host.
+Status RewriteHostWhileLoopCond(
+    const string& cond_host_func_name, const string& while_node_name,
+    const string& host_transfer_key, const string& xla_cluster_attr_name,
+    const string& xla_cluster_name, const string& outside_compilation_attr_name,
+    const string& outside_compilation_name, FunctionLibraryDefinition* fld) {
+  // Replace key placeholder node with _Arg node.
+  TF_RETURN_IF_ERROR(ReplaceKeyPlaceholderWithArgNode(
+      xla_cluster_name, cond_host_func_name, fld));
+
+  // Instantiate cond function.
+  AttrValue device_ordinal_temp_value;
+  device_ordinal_temp_value.set_i(0);
+  protobuf::Map<string, AttrValue> attrs;
+  attrs["device_ordinal"] = device_ordinal_temp_value;
+  FunctionBody* cond_fbody = nullptr;
+  TF_RETURN_IF_ERROR(FunctionDefToBodyHelper(
+      *fld->Find(cond_host_func_name), AttrSlice(&attrs), fld,
+      [&](const string& op, const OpDef** sig) {
+        return fld->LookUpOpDef(op, sig);
+      },
+      &cond_fbody));
+  std::unique_ptr<FunctionBody> cond_fbody_deleter(cond_fbody);
+  Graph* cond_graph = cond_fbody->graph;
+  Node* key_arg = nullptr;
+  for (Node* n : cond_graph->nodes()) {
+    if (n->type_string() == "_Arg") {
+      key_arg = n;
+    }
+  }
+  if (!key_arg) {
+    return errors::Internal(
+        "No _Arg node found for host compute key in function ",
+        cond_host_func_name);
+  }
+
+  // Add an XlaRecvAtHost node to use as cond function return value.
+  // We don't need to set kXlaHasHostTransferAttrName for this node, because
+  // it's already added for the "While" node on the host.
+  NodeDefBuilder recv_pred_builder(
+      absl::StrCat("recv_oc_while_cond_", while_node_name), "_XlaRecvAtHost");
+  recv_pred_builder.Attr("Toutputs", std::vector<DataType>{DT_BOOL});
+  recv_pred_builder.Attr("key", host_transfer_key);
+  AttrValue device_ordinal_value;
+  device_ordinal_value.set_placeholder("device_ordinal");
+  recv_pred_builder.Attr("device_ordinal", device_ordinal_value);
+  recv_pred_builder.Attr(xla_cluster_attr_name, xla_cluster_name);
+  recv_pred_builder.Attr(outside_compilation_attr_name,
+                         outside_compilation_name);
+  recv_pred_builder.Input(key_arg->name(), 0, DT_STRING);
+  NodeDef recv_pred_def;
+  TF_RETURN_IF_ERROR(recv_pred_builder.Finalize(&recv_pred_def));
+  Status s;
+  Node* recv_pred_node = cond_graph->AddNode(recv_pred_def, &s);
+  TF_RETURN_IF_ERROR(s);
+  cond_graph->AddEdge(key_arg, 0, recv_pred_node, 0);
+  NodeDefBuilder ret_builder(
+      absl::StrCat("recv_oc_while_cond_ret_", while_node_name), "_Retval");
+  ret_builder.Attr("T", DT_BOOL);
+  ret_builder.Attr("index", 0);
+  ret_builder.Input(recv_pred_node->name(), 0, DT_BOOL);
+  NodeDef ret_def;
+  TF_RETURN_IF_ERROR(ret_builder.Finalize(&ret_def));
+  Node* ret_node = cond_graph->AddNode(ret_def, &s);
+  TF_RETURN_IF_ERROR(s);
+  cond_graph->AddEdge(recv_pred_node, 0, ret_node, 0);
+
+  // Reset device_ordinal to placeholder value.
+  TF_RETURN_IF_ERROR(ResetDeviceOrdinalToPlaceholderValue(cond_graph));
+
+  // Replace original function.
+  FunctionDef cond_replace_fdef;
+  TF_RETURN_IF_ERROR(
+      GraphToFunctionDef(*cond_graph, cond_host_func_name, &cond_replace_fdef));
+  TF_RETURN_IF_ERROR(
+      fld->ReplaceFunction(cond_host_func_name, cond_replace_fdef));
+
+  return Status::OK();
+}
+
+// Rewrites while loop body function for host.
+Status RewriteHostWhileLoopBody(
+    const string& body_host_func_name, const string& while_node_name,
+    const string& host_transfer_key, const string& xla_cluster_attr_name,
+    const string& xla_cluster_name, const string& outside_compilation_attr_name,
+    const string& outside_compilation_name, FunctionLibraryDefinition* fld) {
+  // Replace key placeholder node with _Arg node.
+  TF_RETURN_IF_ERROR(ReplaceKeyPlaceholderWithArgNode(
+      xla_cluster_name, body_host_func_name, fld));
+
+  // Instantiate body function.
+  AttrValue device_ordinal_temp_value;
+  device_ordinal_temp_value.set_i(0);
+  protobuf::Map<string, AttrValue> attrs;
+  attrs["device_ordinal"] = device_ordinal_temp_value;
+  FunctionBody* body_fbody = nullptr;
+  TF_RETURN_IF_ERROR(FunctionDefToBodyHelper(
+      *fld->Find(body_host_func_name), AttrSlice(&attrs), fld,
+      [&](const string& op, const OpDef** sig) {
+        return fld->LookUpOpDef(op, sig);
+      },
+      &body_fbody));
+  std::unique_ptr<FunctionBody> body_fbody_deleter(body_fbody);
+  Graph* body_graph = body_fbody->graph;
+  Node* key_arg = nullptr;
+  for (Node* n : body_graph->nodes()) {
+    if (n->type_string() == "_Arg") {
+      key_arg = n;
+    }
+  }
+  if (!key_arg) {
+    return errors::Internal(
+        "No _Arg node found for host compute key in function ",
+        body_host_func_name);
+  }
+
+  // Add a _Retval node to loop body.
+  NodeDefBuilder ret_builder(
+      absl::StrCat("recv_oc_while_body_ret_", while_node_name), "_Retval");
+  ret_builder.Attr("T", DT_STRING);
+  ret_builder.Attr("index", 0);
+  ret_builder.Input(key_arg->name(), 0, DT_STRING);
+  NodeDef ret_def;
+  TF_RETURN_IF_ERROR(ret_builder.Finalize(&ret_def));
+  Status s;
+  Node* ret_node = body_graph->AddNode(ret_def, &s);
+  TF_RETURN_IF_ERROR(s);
+  body_graph->AddEdge(key_arg, 0, ret_node, 0);
+
+  // Reset device_ordinal to placeholder value.
+  TF_RETURN_IF_ERROR(ResetDeviceOrdinalToPlaceholderValue(body_graph));
+
+  // Replace original function.
+  FunctionDef body_replace_fdef;
+  TF_RETURN_IF_ERROR(
+      GraphToFunctionDef(*body_graph, body_host_func_name, &body_replace_fdef));
+  TF_RETURN_IF_ERROR(
+      fld->ReplaceFunction(body_host_func_name, body_replace_fdef));
+
+  return Status::OK();
+}
+
+// Builds host side graph for while node.
+Status BuildHostGraphForWhileNode(
+    const string& xla_cluster_attr_name,
+    const string& outside_compilation_attr_name, const string& xla_cluster_name,
+    const string& while_node_name, const string& host_transfer_key,
+    const string& host_graph_func_name, FunctionLibraryDefinition* fld,
+    const string& cond_host_func_name, const string& body_host_func_name) {
+  Graph host_graph(fld);
+  string outside_compilation_name = absl::StrCat("oc_while_", while_node_name);
+
+  // Step 1: add key placeholder node.
+  TF_ASSIGN_OR_RETURN(
+      Node * key_placeholder,
+      AddHostComputeKeyPlaceholder(xla_cluster_name, &host_graph));
+
+  // Step 2: rewrite cond function.
+  TF_RETURN_IF_ERROR(RewriteHostWhileLoopCond(
+      cond_host_func_name, while_node_name, host_transfer_key,
+      xla_cluster_attr_name, xla_cluster_name, outside_compilation_attr_name,
+      outside_compilation_name, fld));
+
+  // Step 3: rewrite body function.
+  TF_RETURN_IF_ERROR(RewriteHostWhileLoopBody(
+      body_host_func_name, while_node_name, host_transfer_key,
+      xla_cluster_attr_name, xla_cluster_name, outside_compilation_attr_name,
+      outside_compilation_name, fld));
+
+  // Step 4: build While node.
+  NodeDefBuilder while_builder(absl::StrCat("oc_while_", while_node_name),
+                               "While");
+  while_builder.Attr("T", std::vector<DataType>{DT_STRING});
+  NameAttrList func;
+  AttrValue device_ordinal_value;
+  device_ordinal_value.set_placeholder("device_ordinal");
+  (*func.mutable_attr())["device_ordinal"] = device_ordinal_value;
+  func.set_name(cond_host_func_name);
+  while_builder.Attr("cond", func);
+  func.set_name(body_host_func_name);
+  while_builder.Attr("body", func);
+  while_builder.Attr(kXlaHasHostTransferAttrName, true);
+  while_builder.Attr(xla_cluster_attr_name, xla_cluster_name);
+  while_builder.Attr(outside_compilation_attr_name, outside_compilation_name);
+  std::vector<NodeDefBuilder::NodeOut> while_inputs{
+      {key_placeholder->name(), 0, DT_STRING}};
+  while_builder.Input(while_inputs);
+  NodeDef while_def;
+  TF_RETURN_IF_ERROR(while_builder.Finalize(&while_def));
+  Status s;
+  Node* while_node = host_graph.AddNode(while_def, &s);
+  TF_RETURN_IF_ERROR(s);
+  host_graph.AddEdge(key_placeholder, 0, while_node, 0);
+
+  // Convert `host_graph` to function.
+  FunctionDef oc_host_graph_fdef;
+  TF_RETURN_IF_ERROR(GraphToFunctionDef(host_graph, host_graph_func_name,
+                                        &oc_host_graph_fdef));
+  if (fld->Find(host_graph_func_name)) {
+    TF_RETURN_IF_ERROR(
+        fld->ReplaceFunction(host_graph_func_name, oc_host_graph_fdef));
+  } else {
+    TF_RETURN_IF_ERROR(fld->AddFunctionDef(oc_host_graph_fdef));
+  }
+
+  return Status::OK();
+}
+
+// Builds host graph for func call nodes.
+Status BuildHostGraphForFuncCallNode(const string& func_call_node_name,
+                                     const string& xla_cluster_name,
+                                     const string& func_call_host_func_name,
+                                     const string& host_graph_func_name,
+                                     FunctionLibraryDefinition* fld) {
+  Graph host_graph(fld);
+  AttrValue device_ordinal_value;
+  device_ordinal_value.set_placeholder("device_ordinal");
+
+  // Step 1: add key placeholder node.
+  TF_ASSIGN_OR_RETURN(
+      Node * key_placeholder,
+      AddHostComputeKeyPlaceholder(xla_cluster_name, &host_graph));
+
+  // Step 2: rewrite `host_func_name`, replace key placeholder with an _Arg
+  // node.
+  TF_RETURN_IF_ERROR(ReplaceKeyPlaceholderWithArgNode(
+      xla_cluster_name, func_call_host_func_name, fld));
+
+  // Step 3: build a function call node with `host_func_name`, with
+  // `key_placeholder` as input.
+  NodeDefBuilder call_builder(absl::StrCat("oc_call_", func_call_node_name),
+                              func_call_host_func_name, fld);
+  call_builder.Input(key_placeholder->name(), 0, DT_STRING);
+  call_builder.Attr("device_ordinal", device_ordinal_value);
+  call_builder.Attr(kXlaHasHostTransferAttrName, true);
+  NodeDef call_def;
+  TF_RETURN_IF_ERROR(call_builder.Finalize(&call_def));
+  Status s;
+  Node* call_node = host_graph.AddNode(call_def, &s);
+  TF_RETURN_IF_ERROR(s);
+  host_graph.AddEdge(key_placeholder, 0, call_node, 0);
+
+  // Convert `host_graph` to function, and add a "device_ordinal" attr.
+  FunctionDef oc_host_graph_fdef;
+  TF_RETURN_IF_ERROR(GraphToFunctionDef(host_graph, host_graph_func_name,
+                                        &oc_host_graph_fdef));
+  if (fld->Find(host_graph_func_name)) {
+    TF_RETURN_IF_ERROR(
+        fld->ReplaceFunction(host_graph_func_name, oc_host_graph_fdef));
+  } else {
+    TF_RETURN_IF_ERROR(fld->AddFunctionDef(oc_host_graph_fdef));
+  }
+
+  return Status::OK();
+}
+
+Status ExtractOutsideCompilationForNodesWithAssociatedFunctions(
+    Graph* g, const string& xla_cluster_attr_name,
+    const string& outside_compilation_attr_name, const string& xla_cluster_name,
+    const std::map<string, int>& host_compute_core, FunctionLibraryRuntime* flr,
+    FunctionLibraryDefinition* fld, std::vector<string>* host_graphs,
+    std::vector<string>* shape_inference_graphs,
+    bool* has_outside_compilation) {
+  std::vector<Node*> if_nodes, while_nodes, func_call_nodes;
+  for (Node* n : g->nodes()) {
+    if (n->type_string() == "If") {
+      if_nodes.push_back(n);
+    } else if (n->type_string() == "While") {
+      while_nodes.push_back(n);
+    } else if (fld->Contains(n->type_string())) {
+      func_call_nodes.push_back(n);
+    } else if (n->type_string() == FunctionLibraryDefinition::kGradientOp) {
+      // Only gradient for user-defined function should be considered as
+      // function call node.
+      NameAttrList original_func;
+      TF_RETURN_IF_ERROR(GetNodeAttr(
+          n->def(), FunctionLibraryDefinition::kFuncAttr, &original_func));
+      if (fld->Contains(original_func.name())) {
+        func_call_nodes.push_back(n);
+      }
+    }
+  }
+
+  for (Node* n : func_call_nodes) {
+    // Extract outside compilation for the function call.
+    bool func_has_outside_compilation = false;
+    NameAttrList func;
+    func.set_name(n->type_string());
+    typedef protobuf::Map<string, AttrValue> AttrMap;
+    *func.mutable_attr() = AttrMap(n->attrs().begin(), n->attrs().end());
+    string new_func_name = absl::StrCat(n->name(), "_oc");
+    string host_func_name = absl::StrCat("oc_func_call_host_", n->name());
+    TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction(
+        xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name,
+        func, new_func_name, host_func_name, host_compute_core, flr, fld,
+        shape_inference_graphs, &func_has_outside_compilation));
+
+    // If the function call does not have outside compilation, nothing to do.
+    if (!func_has_outside_compilation) {
+      continue;
+    }
+
+    *has_outside_compilation = true;
+
+    // Change `n` to call the new function directly.
+    NodeDefBuilder replace_builder(n->name(), new_func_name, fld);
+    for (const Edge* e : n->in_edges()) {
+      if (e->IsControlEdge()) {
+        continue;
+      }
+      replace_builder.Input(e->src()->name(), e->src_output(),
+                            e->src()->output_type(e->src_output()));
+    }
+    for (const auto& attr : n->attrs()) {
+      replace_builder.Attr(attr.first, attr.second);
+    }
+    NodeDef replace_def;
+    TF_RETURN_IF_ERROR(replace_builder.Finalize(&replace_def));
+    TF_ASSIGN_OR_RETURN(Node * replace, ReplaceNode(g, n, replace_def));
+    replace->AddAttr(kXlaTokenInputNodesAttrName,
+                     std::vector<string>{kXlaTokenArgNodeName});
+
+    // Build host side graph for the function call.
+    string oc_host_graph_name =
+        absl::StrCat("oc_func_host_graph_", replace->name());
+    TF_RETURN_IF_ERROR(
+        BuildHostGraphForFuncCallNode(replace->name(), xla_cluster_name,
+                                      host_func_name, oc_host_graph_name, fld));
+
+    // Record the host graph.
+    host_graphs->push_back(oc_host_graph_name);
+  }
+
+  for (Node* n : if_nodes) {
+    // Instantiate "then_branch" and "else_branch".
+    NameAttrList then_branch, else_branch;
+    TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "then_branch", &then_branch));
+    TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "else_branch", &else_branch));
+
+    // Extract outside compilation for then_branch and else_branch.
+    bool then_branch_has_outside_compilation = false;
+    bool else_branch_has_outside_compilation = false;
+    string then_branch_host_func_name =
+               absl::StrCat("oc_then_branch_host_if_", n->name()),
+           else_branch_host_func_name =
+               absl::StrCat("oc_else_branch_host_if_", n->name());
+    string then_branch_xla_func_name = absl::StrCat(then_branch.name(), "_oc"),
+           else_branch_xla_func_name = absl::StrCat(else_branch.name(), "_oc");
+    TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction(
+        xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name,
+        then_branch, then_branch_xla_func_name, then_branch_host_func_name,
+        host_compute_core, flr, fld, shape_inference_graphs,
+        &then_branch_has_outside_compilation));
+    TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction(
+        xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name,
+        else_branch, else_branch_xla_func_name, else_branch_host_func_name,
+        host_compute_core, flr, fld, shape_inference_graphs,
+        &else_branch_has_outside_compilation));
+
+    // If then/else branch do not have outside compilation, nothing to do.
+    if (!then_branch_has_outside_compilation &&
+        !else_branch_has_outside_compilation) {
+      continue;
+    }
+
+    *has_outside_compilation = true;
+
+    // Change If node to call the new functions.
+    then_branch.set_name(then_branch_xla_func_name);
+    n->ClearAttr("then_branch");
+    n->AddAttr("then_branch", then_branch);
+    else_branch.set_name(else_branch_xla_func_name);
+    n->ClearAttr("else_branch");
+    n->AddAttr("else_branch", else_branch);
+
+    string host_transfer_key = absl::StrCat("oc_if_pred_", n->name());
+
+    // XLA computation: add a SendToHost node to send cond predicate.
+    Node* pred_node;
+    TF_RETURN_IF_ERROR(n->input_node(0, &pred_node));
+    TF_ASSIGN_OR_RETURN(
+        Node * send_pred_node,
+        BuildSendIfPredNode(absl::StrCat("send_oc_if_pred_", n->name()),
+                            host_transfer_key, pred_node, g));
+    n->AddAttr(kXlaTokenInputNodesAttrName,
+               std::vector<string>{send_pred_node->name()});
+
+    // Add a control edge from `send_pred_node` to If node, so XlaCompiler will
+    // visit If node after `send_pred_node`, thus the token output for
+    // `send_pred_node` has been generated.
+    g->AddControlEdge(send_pred_node, n);
+
+    // Build host side graph for the "If" node.
+    string oc_host_graph_name = absl::StrCat("oc_if_host_graph_", n->name());
+    TF_RETURN_IF_ERROR(BuildHostGraphForIfNode(
+        xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name,
+        n->name(), host_transfer_key, oc_host_graph_name, fld,
+        then_branch_host_func_name, else_branch_host_func_name));
+    host_graphs->push_back(oc_host_graph_name);
+  }
+
+  for (Node* n : while_nodes) {
+    // Instantiate "cond" and "body".
+    NameAttrList cond, body;
+    TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "cond", &cond));
+    TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "body", &body));
+
+    // Extract outside compilation for cond and body.
+    bool cond_has_outside_compilation = false;
+    bool body_has_outside_compilation = false;
+    string cond_host_func_name = absl::StrCat("oc_cond_host_while_", n->name()),
+           body_host_func_name = absl::StrCat("oc_body_host_while_", n->name());
+    string cond_xla_func_name = absl::StrCat(cond.name(), "_oc"),
+           body_xla_func_name = absl::StrCat(body.name(), "_oc");
+    TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction(
+        xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name,
+        cond, cond_xla_func_name, cond_host_func_name, host_compute_core, flr,
+        fld, shape_inference_graphs, &cond_has_outside_compilation));
+    TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction(
+        xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name,
+        body, body_xla_func_name, body_host_func_name, host_compute_core, flr,
+        fld, shape_inference_graphs, &body_has_outside_compilation));
+
+    // If cond/body do not have outside compilation, nothing to do.
+    if (!cond_has_outside_compilation && !body_has_outside_compilation) {
+      continue;
+    }
+
+    *has_outside_compilation = true;
+
+    // Change While node to call the new functions.
+    cond.set_name(cond_xla_func_name);
+    n->ClearAttr("cond");
+    n->AddAttr("cond", cond);
+    body.set_name(body_xla_func_name);
+    n->ClearAttr("body");
+    n->AddAttr("body", body);
+
+    string host_transfer_key = absl::StrCat("oc_while_pred_", n->name());
+
+    // XLA computation: rewrite cond function to add a SendToHost node to send
+    // loop predicate.
+    TF_RETURN_IF_ERROR(
+        AddSendLoopPredToLoopCond(fld, cond, n->name(), host_transfer_key));
+    n->AddAttr(kXlaTokenInputNodesAttrName,
+               std::vector<string>{kXlaTokenArgNodeName});
+
+    // Build host side graph for the "While" node.
+    string oc_host_graph_name = absl::StrCat("oc_while_host_graph_", n->name());
+    TF_RETURN_IF_ERROR(BuildHostGraphForWhileNode(
+        xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name,
+        n->name(), host_transfer_key, oc_host_graph_name, fld,
+        cond_host_func_name, body_host_func_name));
+    host_graphs->push_back(oc_host_graph_name);
+  }
+
+  return Status::OK();
+}
+
 }  // namespace
 
 Status RewriteOutsideCompilationSubgraphFn::operator()(
@@ -755,12 +1563,15 @@ Status RewriteOutsideCompilationSubgraphFn::operator()(
   // it with HostCompute node later.
   AddNodeAttr("_outside_compilation_subgraph", old_name, node_def);
   if (shapes) {
-    AddNodeAttr("shape_inference_graph", "", node_def);
+    NameAttrList shape_inference_graph;
+    AddNodeAttr("shape_inference_graph", shape_inference_graph, node_def);
     AddNodeAttr("shapes", *shapes, node_def);
   } else {
     string shape_inference_func_name =
         absl::StrCat("_outside_compilation_shape_inference_", new_name);
-    AddNodeAttr("shape_inference_graph", shape_inference_func_name, node_def);
+    NameAttrList shape_inference_graph;
+    shape_inference_graph.set_name(shape_inference_func_name);
+    AddNodeAttr("shape_inference_graph", shape_inference_graph, node_def);
     AddNodeAttr("shapes", std::vector<TensorShapeProto>{}, node_def);
   }
   AddNodeAttr("ancestors", std::vector<string>{}, node_def);
@@ -775,36 +1586,34 @@ Status ExtractOutsideCompilationForFunction(
     const string& xla_cluster_attr_name,
     const string& outside_compilation_attr_name, const string& xla_cluster_name,
     const NameAttrList& func_name_attrs, const string& new_func_name,
-    const std::map<string, int>& host_compute_core,
-    FunctionLibraryDefinition* fld, std::unique_ptr<Graph>* host_graph,
-    std::vector<string>* shape_inference_graphs,
+    const string& host_graph_func_name,
+    const std::map<string, int>& host_compute_core, FunctionLibraryRuntime* flr,
+    FunctionLibraryDefinition* fld, std::vector<string>* shape_inference_graphs,
     bool* has_outside_compilation) {
-  // Early return if function does not have any outside compilation nodes.
+  // Convert the function to graph.
   const string& func_name = func_name_attrs.name();
-  const FunctionDef* fdef = fld->Find(func_name);
-  if (!fdef) {
-    return errors::Internal("Cannot find function ", func_name);
-  }
+  FunctionLibraryRuntime::Handle handle;
+  TF_RETURN_IF_ERROR(
+      flr->Instantiate(func_name, AttrSlice(&func_name_attrs.attr()), &handle));
+  Status ret_status = Status::OK();
+  auto cleanup_handle = gtl::MakeCleanup([&]() {
+    auto s = flr->ReleaseHandle(handle);
+    if (!s.ok()) {
+      ret_status.Update(s);
+    }
+  });
+  const FunctionBody* fbody = flr->GetFunctionBody(handle);
+
+  // Check if we have outside compilation nodes.
   *has_outside_compilation = false;
-  for (auto& node_def : fdef->node_def()) {
-    if (HasNodeAttr(node_def, outside_compilation_attr_name)) {
+  for (Node* n : fbody->graph->nodes()) {
+    if (HasNodeAttr(n->def(), outside_compilation_attr_name)) {
       *has_outside_compilation = true;
       break;
     }
   }
-  if (!has_outside_compilation) {
-    return Status::OK();
-  }
-
-  // Convert the function to graph.
-  FunctionBody* fbody = nullptr;
-  TF_RETURN_IF_ERROR(FunctionDefToBodyHelper(
-      *fld->Find(func_name), AttrSlice(&func_name_attrs.attr()), fld,
-      [&](const string& op, const OpDef** sig) {
-        return fld->LookUpOpDef(op, sig);
-      },
-      &fbody));
-  std::unique_ptr<FunctionBody> fbody_deleter(fbody);
+  // We cannot early return here, because we might have outside compilation in
+  // If/While function body.
 
   // Preprocess edges between different outside compilations. They will be
   // restored in `ConstructHostGraph()`.
@@ -835,11 +1644,11 @@ Status ExtractOutsideCompilationForFunction(
       // If we could not infer shapes for XlaSendFromHost inputs statically, we
       // will set the "shape_inference_graph" attribute. In that case, copy
       // outside compilation subgraph as shape inference graph in `fld`.
-      string shape_inference_graph;
+      NameAttrList shape_inference_graph;
       TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "shape_inference_graph",
                                      &shape_inference_graph));
-      if (!shape_inference_graph.empty()) {
-        shape_inference_graphs->push_back(shape_inference_graph);
+      if (!shape_inference_graph.name().empty()) {
+        shape_inference_graphs->push_back(shape_inference_graph.name());
 
         const FunctionDef* xla_fdef = fld->Find(n->name());
         if (!xla_fdef) {
@@ -847,9 +1656,9 @@ Status ExtractOutsideCompilationForFunction(
         }
         FunctionDef shape_inference_fdef = *xla_fdef;
         shape_inference_fdef.mutable_signature()->set_name(
-            shape_inference_graph);
-        if (fld->Find(shape_inference_graph)) {
-          TF_RETURN_IF_ERROR(fld->ReplaceFunction(shape_inference_graph,
+            shape_inference_graph.name());
+        if (fld->Find(shape_inference_graph.name())) {
+          TF_RETURN_IF_ERROR(fld->ReplaceFunction(shape_inference_graph.name(),
                                                   shape_inference_fdef));
         } else {
           TF_RETURN_IF_ERROR(fld->AddFunctionDef(shape_inference_fdef));
@@ -858,21 +1667,22 @@ Status ExtractOutsideCompilationForFunction(
     }
   }
   for (Node* n : outside_compilation_nodes) {
+    TF_RETURN_IF_ERROR(ValidateOutsideCompilationCallNode(n));
     TF_RETURN_IF_ERROR(ReplaceOrRemoveOutsideCompilationCallNode(
         graph_out.get(), n, host_compute_core));
   }
-  if (VLOG_IS_ON(4)) {
-    dump_graph::DumpGraphToFile(
-        absl::StrCat("extract_outside_compilation_for_func_after_", func_name),
-        *graph_out, fld);
-  }
+
+  // Handle nodes with associated functions.
+  TF_RETURN_IF_ERROR(ExtractOutsideCompilationForNodesWithAssociatedFunctions(
+      graph_out.get(), xla_cluster_attr_name, outside_compilation_attr_name,
+      xla_cluster_name, host_compute_core, flr, fld,
+      &outside_compilation_host_graphs, shape_inference_graphs,
+      has_outside_compilation));
 
   // Construct host graph.
-  if (!outside_compilation_host_graphs.empty()) {
-    TF_RETURN_IF_ERROR(
-        ConstructHostGraph(xla_cluster_name, outside_compilation_attr_name,
-                           outside_compilation_host_graphs, fld, host_graph));
-  }
+  TF_RETURN_IF_ERROR(ConstructHostGraph(
+      xla_cluster_name, outside_compilation_attr_name,
+      outside_compilation_host_graphs, fld, host_graph_func_name));
 
   // Remove the outside compilation graphs from function library.
   for (const string& func : outside_compilation_host_graphs) {
@@ -883,20 +1693,31 @@ Status ExtractOutsideCompilationForFunction(
   FunctionDef updated_fdef;
   TF_RETURN_IF_ERROR(
       GraphToFunctionDef(*graph_out, new_func_name, &updated_fdef));
+  const FunctionDef* original_fdef = fld->Find(func_name);
+  if (original_fdef) {
+    for (const auto& attr : original_fdef->attr()) {
+      (*updated_fdef.mutable_attr())[attr.first] = attr.second;
+    }
+  }
   if (fld->Find(new_func_name)) {
     TF_RETURN_IF_ERROR(fld->ReplaceFunction(new_func_name, updated_fdef));
   } else {
     TF_RETURN_IF_ERROR(fld->AddFunctionDef(updated_fdef));
   }
+  if (VLOG_IS_ON(4)) {
+    dump_graph::DumpGraphToFile(
+        absl::StrCat("extract_outside_compilation_for_func_after_", func_name),
+        *graph_out, fld);
+  }
 
-  return Status::OK();
+  return ret_status;
 }
 
 Status ExtractOutsideCompilation(
     const string& xla_cluster_attr_name,
     const string& outside_compilation_attr_name,
     const std::unordered_map<string, XlaClusterInfo>& clusters, Graph* g,
-    FunctionLibraryDefinition* fld) {
+    FunctionLibraryRuntime* flr, FunctionLibraryDefinition* fld) {
   if (VLOG_IS_ON(4)) {
     dump_graph::DumpGraphToFile("extract_outside_compilation_before", *g, fld);
   }
@@ -909,24 +1730,17 @@ Status ExtractOutsideCompilation(
     auto const& host_compute_core = iter.second.host_compute_core;
 
     bool has_outside_compilation;
-    std::unique_ptr<Graph> host_graph;
+    string host_graph_func_name = absl::StrCat("oc_host_graph_", n->name());
     TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction(
         xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name,
-        func_name_attrs, func_name_attrs.name(), host_compute_core, fld,
-        &host_graph, &shape_inference_graphs, &has_outside_compilation));
-    if (host_graph) {
-      TF_RETURN_IF_ERROR(ExpandHostGraphIntoMainGraph(g, host_graph.get(), n));
-    }
-  }
-
-  if (VLOG_IS_ON(4)) {
-    dump_graph::DumpGraphToFile("extract_outside_compilation_expanded", *g,
-                                fld);
+        func_name_attrs, func_name_attrs.name(), host_graph_func_name,
+        host_compute_core, flr, fld, &shape_inference_graphs,
+        &has_outside_compilation));
+    TF_RETURN_IF_ERROR(
+        ExpandHostGraphIntoMainGraph(g, fld, host_graph_func_name, n));
+    TF_RETURN_IF_ERROR(fld->RemoveFunction(host_graph_func_name));
   }
 
-  TF_RETURN_IF_ERROR(PostprocessForEncapsulation(
-      g, xla_cluster_attr_name, outside_compilation_attr_name, clusters));
-
   for (auto shape_inference_graph_name : shape_inference_graphs) {
     TF_RETURN_IF_ERROR(
         RewriteShapeInferenceGraph(shape_inference_graph_name, g, fld));
diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass.h b/tensorflow/compiler/jit/extract_outside_compilation_pass.h
index 2a4f07cca213d999202024294f5d8f94527059c3..d64cc2a103ed040cbf413ac736f97f84459e869b 100644
--- a/tensorflow/compiler/jit/extract_outside_compilation_pass.h
+++ b/tensorflow/compiler/jit/extract_outside_compilation_pass.h
@@ -88,9 +88,10 @@ Status ExtractOutsideCompilationForFunction(
     const string& xla_cluster_attr_name,
     const string& outside_compilation_attr_name, const string& xla_cluster_name,
     const NameAttrList& func_name_attrs, const string& new_func_name,
-    const std::map<string, int>& host_compute_core,
-    FunctionLibraryDefinition* fld, std::unique_ptr<Graph>* host_graph,
-    std::vector<string>* shape_inference_graphs, bool* has_outside_compilation);
+    const string& host_graph_func_name,
+    const std::map<string, int>& host_compute_core, FunctionLibraryRuntime* flr,
+    FunctionLibraryDefinition* fld, std::vector<string>* shape_inference_graphs,
+    bool* has_outside_compilation);
 
 // Rewrites XLA computation in `clusters` to replace outside compilation nodes
 // with XlaHostCompute, and moves those outside compilations into `g`. If shapes
@@ -100,7 +101,7 @@ Status ExtractOutsideCompilation(
     const string& xla_cluster_attr_name,
     const string& outside_compilation_attr_name,
     const std::unordered_map<string, XlaClusterInfo>& clusters, Graph* g,
-    FunctionLibraryDefinition* fld);
+    FunctionLibraryRuntime* flr, FunctionLibraryDefinition* fld);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc b/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc
index bff956100da661b679b4557fce53671e6cef88c5..7c3a24feff81b21a5d2347d21fb80988bc3e6065 100644
--- a/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc
+++ b/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc
@@ -19,8 +19,11 @@ limitations under the License.
 #include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/cc/ops/array_ops.h"
 #include "tensorflow/cc/ops/function_ops.h"
+#include "tensorflow/cc/ops/functional_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/compiler/jit/encapsulate_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/function.h"
@@ -29,6 +32,8 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/public/version.h"
 
 namespace tensorflow {
 
@@ -109,10 +114,10 @@ TEST(RewriteOutsideCompilationSubgraphFnTest, Basic) {
   }
   EXPECT_TRUE(has_control_edge_to_send_from_host);
   // Verify step 7: necessary attrs added to call_node_def.
-  string shape_inference_graph;
+  NameAttrList shape_inference_graph;
   TF_CHECK_OK(GetNodeAttr(AttrSlice(&call_node_def.attr()),
                           "shape_inference_graph", &shape_inference_graph));
-  EXPECT_EQ(shape_inference_graph,
+  EXPECT_EQ(shape_inference_graph.name(),
             "_outside_compilation_shape_inference_cluster_0");
 }
 
@@ -220,7 +225,42 @@ TEST(RewriteOutsideCompilationSubgraphFnTest, ShapesInferred) {
   EXPECT_EQ(shapes[0].dim_size(), 1);
 }
 
-TEST(ExtractOutsideCompilationForFunctionTest, Basic) {
+class ExtractOutsideCompilationForFunctionTest : public ::testing::Test {
+ public:
+  void SetUp() override {
+    SessionOptions session_options;
+    std::vector<std::unique_ptr<Device>> devices;
+    TF_CHECK_OK(DeviceFactory::AddDevices(
+        session_options, "/job:localhost/replica:0/task:0", &devices));
+    device_mgr_ = absl::make_unique<DeviceMgr>(std::move(devices));
+  }
+
+  Status ExtractOutsideCompilationTest(
+      const string &xla_cluster_attr_name,
+      const string &outside_compilation_attr_name,
+      const string &xla_cluster_name, const NameAttrList &func_name_attrs,
+      const string &new_func_name, const string &host_graph_func_name,
+      const std::map<string, int> &host_compute_core,
+      FunctionLibraryDefinition *fld,
+      std::vector<string> *shape_inference_graphs,
+      bool *has_outside_compilation) {
+    OptimizerOptions opts;
+    pflr_ = absl::make_unique<ProcessFunctionLibraryRuntime>(
+        device_mgr_.get(), Env::Default(), TF_GRAPH_DEF_VERSION, fld, opts,
+        /*default_thread_pool=*/nullptr, /*cluster_flr=*/nullptr);
+    auto flr = pflr_->GetFLR("/job:localhost/replica:0/task:0/cpu:0");
+    return ExtractOutsideCompilationForFunction(
+        xla_cluster_attr_name, outside_compilation_attr_name, xla_cluster_name,
+        func_name_attrs, new_func_name, host_graph_func_name, host_compute_core,
+        flr, fld, shape_inference_graphs, has_outside_compilation);
+  }
+
+ private:
+  std::unique_ptr<DeviceMgr> device_mgr_;
+  std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
+};
+
+TEST_F(ExtractOutsideCompilationForFunctionTest, Basic) {
   // Build the XLA computation func.
   // "const0"
   // "identity0" = "const0" (outside compilation cluster "0")
@@ -249,27 +289,26 @@ TEST(ExtractOutsideCompilationForFunctionTest, Basic) {
 
   protobuf::Map<string, tensorflow::AttrValue> attrs;
   std::map<string, int> host_compute_core = {{"0", 1}, {"1", 0}};
-  std::unique_ptr<Graph> host_graph;
   std::vector<string> shape_inference_graphs;
   bool has_outside_compilation;
   NameAttrList name_attrs;
   name_attrs.set_name("cluster");
   *name_attrs.mutable_attr() = attrs;
-  TF_CHECK_OK(ExtractOutsideCompilationForFunction(
-      "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten",
-      host_compute_core, &fld, &host_graph, &shape_inference_graphs,
+  TF_CHECK_OK(ExtractOutsideCompilationTest(
+      "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten", "host_graph",
+      host_compute_core, &fld, &shape_inference_graphs,
       &has_outside_compilation));
 
   // Get rewritten XLA computation function.
-  FunctionBody *fbody = nullptr;
-  TF_CHECK_OK(FunctionDefToBodyHelper(*fld.Find("cluster_rewritten"),
-                                      AttrSlice(), &fld,
-                                      [&](const string &op, const OpDef **sig) {
-                                        return fld.LookUpOpDef(op, sig);
-                                      },
-                                      &fbody));
-  std::unique_ptr<FunctionBody> fbody_deleter(fbody);
-  auto node_name_index = fbody->graph->BuildNodeNameIndex();
+  FunctionBody *xla_fbody = nullptr;
+  TF_CHECK_OK(FunctionDefToBodyHelper(
+      *fld.Find("cluster_rewritten"), AttrSlice(), &fld,
+      [&](const string &op, const OpDef **sig) {
+        return fld.LookUpOpDef(op, sig);
+      },
+      &xla_fbody));
+  std::unique_ptr<FunctionBody> xla_fbody_deleter(xla_fbody);
+  auto node_name_index = xla_fbody->graph->BuildNodeNameIndex();
 
   // Check XlaHostCompute nodes.
   Node *host_compute_0 = node_name_index["outside_compilation_0_host_compute"];
@@ -292,18 +331,31 @@ TEST(ExtractOutsideCompilationForFunctionTest, Basic) {
   EXPECT_EQ(shapes[0].dim_size(), 1);
   // Check XlaHostCompute nodes' "shape_inference_graph" attr. Both should have
   // empty values.
-  string shape_inference_graph;
+  NameAttrList shape_inference_graph;
   TF_CHECK_OK(GetNodeAttr(host_compute_0->attrs(), "shape_inference_graph",
                           &shape_inference_graph));
-  EXPECT_EQ(shape_inference_graph, "");
+  EXPECT_EQ(shape_inference_graph.name(), "");
   TF_CHECK_OK(GetNodeAttr(host_compute_1->attrs(), "shape_inference_graph",
                           &shape_inference_graph));
-  EXPECT_EQ(shape_inference_graph, "");
+  EXPECT_EQ(shape_inference_graph.name(), "");
 
   // Check `shape_inference_graphs`.
   EXPECT_EQ(shape_inference_graphs.size(), 0);
 
-  // Check `host_graph`: verify we have key placeholder and sequencer.
+  // Check host graph: verify we have key placeholder and sequencer.
+  FunctionBody *host_fbody = nullptr;
+  AttrValue device_ordinal_temp_value;
+  device_ordinal_temp_value.set_i(0);
+  protobuf::Map<string, AttrValue> host_func_attrs;
+  host_func_attrs["device_ordinal"] = device_ordinal_temp_value;
+  TF_CHECK_OK(FunctionDefToBodyHelper(
+      *fld.Find("host_graph"), AttrSlice(&host_func_attrs), &fld,
+      [&](const string &op, const OpDef **sig) {
+        return fld.LookUpOpDef(op, sig);
+      },
+      &host_fbody));
+  std::unique_ptr<FunctionBody> host_fbody_deleter(host_fbody);
+  Graph *host_graph = host_fbody->graph;
   Node *key_placeholder = nullptr, *sequencer = nullptr;
   for (Node *n : host_graph->nodes()) {
     if (n->type_string() == "Placeholder" &&
@@ -348,7 +400,7 @@ TEST(ExtractOutsideCompilationForFunctionTest, Basic) {
   }
 }
 
-TEST(ExtractOutsideCompilationForFunctionTest, NoHostGraph) {
+TEST_F(ExtractOutsideCompilationForFunctionTest, NoHostGraph) {
   // Build the XLA computation func.
   // "const0"
   FunctionDefLibrary fdl;
@@ -365,25 +417,37 @@ TEST(ExtractOutsideCompilationForFunctionTest, NoHostGraph) {
 
   protobuf::Map<string, tensorflow::AttrValue> attrs;
   std::map<string, int> host_compute_core = {{"0", 1}, {"1", 0}};
-  std::unique_ptr<Graph> host_graph;
   std::vector<string> shape_inference_graphs;
   bool has_outside_compilation;
   NameAttrList name_attrs;
   name_attrs.set_name("cluster");
   *name_attrs.mutable_attr() = attrs;
-  TF_CHECK_OK(ExtractOutsideCompilationForFunction(
-      "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten",
-      host_compute_core, &fld, &host_graph, &shape_inference_graphs,
+  TF_CHECK_OK(ExtractOutsideCompilationTest(
+      "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten", "host_graph",
+      host_compute_core, &fld, &shape_inference_graphs,
       &has_outside_compilation));
 
-  // Check `host_graph` is empty.
-  EXPECT_FALSE(host_graph);
+  // Check host graph is empty.
+  FunctionBody *host_fbody = nullptr;
+  AttrValue device_ordinal_temp_value;
+  device_ordinal_temp_value.set_i(0);
+  protobuf::Map<string, AttrValue> host_func_attrs;
+  host_func_attrs["device_ordinal"] = device_ordinal_temp_value;
+  TF_CHECK_OK(FunctionDefToBodyHelper(
+      *fld.Find("host_graph"), AttrSlice(&host_func_attrs), &fld,
+      [&](const string &op, const OpDef **sig) {
+        return fld.LookUpOpDef(op, sig);
+      },
+      &host_fbody));
+  std::unique_ptr<FunctionBody> host_fbody_deleter(host_fbody);
+  Graph *host_graph = host_fbody->graph;
+  EXPECT_EQ(host_graph->num_nodes(), 2);
 }
 
-TEST(ExtractOutsideCompilationForFunctionTest, XlaHostComputeRemoved) {
+TEST_F(ExtractOutsideCompilationForFunctionTest, XlaHostComputeRemoved) {
   // Build the XLA computation func.
   // "const0"
-  // "const1" (outside compilation clsuter "0")
+  // "const1" (outside compilation cluster "0")
   FunctionDefLibrary fdl;
   {
     tensorflow::Scope s = tensorflow::Scope::NewRootScope();
@@ -401,31 +465,43 @@ TEST(ExtractOutsideCompilationForFunctionTest, XlaHostComputeRemoved) {
 
   protobuf::Map<string, tensorflow::AttrValue> attrs;
   std::map<string, int> host_compute_core = {{"0", 1}, {"1", 0}};
-  std::unique_ptr<Graph> host_graph;
   std::vector<string> shape_inference_graphs;
   bool has_outside_compilation;
   NameAttrList name_attrs;
   name_attrs.set_name("cluster");
   *name_attrs.mutable_attr() = attrs;
-  TF_CHECK_OK(ExtractOutsideCompilationForFunction(
-      "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten",
-      host_compute_core, &fld, &host_graph, &shape_inference_graphs,
+  TF_CHECK_OK(ExtractOutsideCompilationTest(
+      "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten", "host_graph",
+      host_compute_core, &fld, &shape_inference_graphs,
       &has_outside_compilation));
 
   // Check rewritten XLA graph: verify that we have no XlaHostCompute.
-  FunctionBody *fbody = nullptr;
-  TF_CHECK_OK(FunctionDefToBodyHelper(*fld.Find("cluster_rewritten"),
-                                      AttrSlice(), &fld,
-                                      [&](const string &op, const OpDef **sig) {
-                                        return fld.LookUpOpDef(op, sig);
-                                      },
-                                      &fbody));
-  std::unique_ptr<FunctionBody> fbody_deleter(fbody);
-  for (Node *n : fbody->graph->nodes()) {
+  FunctionBody *xla_fbody = nullptr;
+  TF_CHECK_OK(FunctionDefToBodyHelper(
+      *fld.Find("cluster_rewritten"), AttrSlice(), &fld,
+      [&](const string &op, const OpDef **sig) {
+        return fld.LookUpOpDef(op, sig);
+      },
+      &xla_fbody));
+  std::unique_ptr<FunctionBody> xla_fbody_deleter(xla_fbody);
+  for (Node *n : xla_fbody->graph->nodes()) {
     EXPECT_NE(n->type_string(), "XlaHostCompute");
   }
 
-  // Check `host_graph`: verify we have no placeholder, but we have "const1".
+  // Check host graph: verify we have no placeholder, but we have "const1".
+  FunctionBody *host_fbody = nullptr;
+  AttrValue device_ordinal_temp_value;
+  device_ordinal_temp_value.set_i(0);
+  protobuf::Map<string, AttrValue> host_func_attrs;
+  host_func_attrs["device_ordinal"] = device_ordinal_temp_value;
+  TF_CHECK_OK(FunctionDefToBodyHelper(
+      *fld.Find("host_graph"), AttrSlice(&host_func_attrs), &fld,
+      [&](const string &op, const OpDef **sig) {
+        return fld.LookUpOpDef(op, sig);
+      },
+      &host_fbody));
+  std::unique_ptr<FunctionBody> host_fbody_deleter(host_fbody);
+  Graph *host_graph = host_fbody->graph;
   int num_key_placeholders = 0;
   for (Node *n : host_graph->nodes()) {
     if (n->type_string() == "Placeholder" &&
@@ -438,4 +514,468 @@ TEST(ExtractOutsideCompilationForFunctionTest, XlaHostComputeRemoved) {
   EXPECT_NE(node_name_index.find("const1"), node_name_index.end());
 }
 
+REGISTER_OP("XlaSendToHost")
+    .Input("input: Tinput")
+    .Attr("Tinput: type")
+    .Attr("key: string")
+    .SetIsStateful();
+
+REGISTER_OP("XlaRecvFromHost")
+    .Output("output: Toutput")
+    .Attr("Toutput: type")
+    .Attr("shape: shape")
+    .Attr("key: string")
+    .SetIsStateful();
+
+TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInIf) {
+  // Build the XLA computation func.
+  // "const0" (bool)
+  // "const1" (int32)
+  // "if0" (pred = "const0", input = "const1", then_branch = "true_fn",
+  //        else_branch = "false_fn")
+  FunctionDefLibrary fdl;
+  {
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output arg = ops::_Arg(s.WithOpName("arg"), DT_INT32, 0);
+    Output identity = ops::Identity(s.WithOpName("identity_true_fn"), arg);
+    ops::_Retval retval(s.WithOpName("retval"), identity, 0);
+    std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+    TF_CHECK_OK(s.ToGraph(g.get()));
+    auto node_name_image = g->BuildNodeNameIndex();
+    node_name_image["identity_true_fn"]->AddAttr("_oc", "0");
+    PartialTensorShape shape({2});
+    node_name_image["identity_true_fn"]->AddAttr(
+        kXlaInferredShapesAttrName, std::vector<PartialTensorShape>{shape});
+
+    FunctionDef *true_fn_fdef = fdl.add_function();
+    TF_CHECK_OK(GraphToFunctionDef(*g, "true_fn", true_fn_fdef));
+  }
+  {
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output arg = ops::_Arg(s.WithOpName("arg"), DT_INT32, 0);
+    Output identity = ops::Identity(s.WithOpName("identity_false_fn"), arg);
+    ops::_Retval retval(s.WithOpName("retval"), identity, 0);
+    std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+    TF_CHECK_OK(s.ToGraph(g.get()));
+    auto node_name_image = g->BuildNodeNameIndex();
+    node_name_image["identity_false_fn"]->AddAttr("_oc", "0");
+    PartialTensorShape shape({2});
+    node_name_image["identity_false_fn"]->AddAttr(
+        kXlaInferredShapesAttrName, std::vector<PartialTensorShape>{shape});
+
+    FunctionDef *false_fn_fdef = fdl.add_function();
+    TF_CHECK_OK(GraphToFunctionDef(*g, "false_fn", false_fn_fdef));
+  }
+  {
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output cond = ops::Const(s.WithOpName("const0"), true, {2});
+    Output input = ops::Const(s.WithOpName("const1"), 1, {2});
+    NameAttrList true_fn;
+    true_fn.set_name("true_fn");
+    NameAttrList false_fn;
+    false_fn.set_name("false_fn");
+    auto if_op = ops::If(s.WithOpName("if"), cond,
+                         std::initializer_list<Input>{cond, input}, {DT_INT32},
+                         true_fn, false_fn);
+    ops::_Retval retval(s.WithOpName("retval"), if_op.output[0], 0);
+    std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+    TF_CHECK_OK(s.ToGraph(g.get()));
+
+    FunctionDef *xla_fdef = fdl.add_function();
+    TF_CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef));
+  }
+  FunctionLibraryDefinition fld(OpRegistry::Global(), fdl);
+
+  protobuf::Map<string, tensorflow::AttrValue> attrs;
+  std::map<string, int> host_compute_core;
+  std::vector<string> shape_inference_graphs;
+  bool has_outside_compilation;
+  NameAttrList name_attrs;
+  name_attrs.set_name("cluster");
+  *name_attrs.mutable_attr() = attrs;
+  TF_CHECK_OK(ExtractOutsideCompilationTest(
+      "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten", "host_graph",
+      host_compute_core, &fld, &shape_inference_graphs,
+      &has_outside_compilation));
+
+  // Check host graph.
+  {
+    FunctionBody *host_fbody = nullptr;
+    AttrValue device_ordinal_temp_value;
+    device_ordinal_temp_value.set_i(0);
+    protobuf::Map<string, AttrValue> host_func_attrs;
+    host_func_attrs["device_ordinal"] = device_ordinal_temp_value;
+    TF_CHECK_OK(FunctionDefToBodyHelper(
+        *fld.Find("host_graph"), AttrSlice(&host_func_attrs), &fld,
+        [&](const string &op, const OpDef **sig) {
+          return fld.LookUpOpDef(op, sig);
+        },
+        &host_fbody));
+    std::unique_ptr<FunctionBody> host_fbody_deleter(host_fbody);
+    Graph *host_graph = host_fbody->graph;
+    auto node_name_index = host_graph->BuildNodeNameIndex();
+
+    // Verify we have XlaRecvAtHost to receive "If" predicate.
+    Node *recv_if_pred_node = node_name_index["recv_oc_if_pred_if"];
+    EXPECT_NE(recv_if_pred_node, nullptr);
+
+    // Verify we have an "If" to choose outside compilation between then_branch
+    // and else_branch, and it has `recv_if_pred_node` as cond input.
+    Node *if_oc_node = node_name_index["oc_if_if"];
+    EXPECT_NE(if_oc_node, nullptr);
+    Node *if_oc_node_cond_input;
+    TF_CHECK_OK(if_oc_node->input_node(0, &if_oc_node_cond_input));
+    EXPECT_EQ(if_oc_node_cond_input, recv_if_pred_node);
+
+    // Check that then_branch outside compilation has node "identity_true_fn".
+    const FunctionDef *true_def = fld.Find("oc_then_branch_host_if_if");
+    EXPECT_NE(true_def, nullptr);
+    bool has_identity_true_fn_node = false;
+    for (const auto &node_def : true_def->node_def()) {
+      if (node_def.name() == "identity_true_fn") {
+        has_identity_true_fn_node = true;
+        break;
+      }
+    }
+    EXPECT_TRUE(has_identity_true_fn_node);
+
+    // Check that else_branch outside compilation has node "identity_false_fn".
+    const FunctionDef *false_def = fld.Find("oc_else_branch_host_if_if");
+    EXPECT_NE(false_def, nullptr);
+    bool has_identity_false_fn_node = false;
+    for (const auto &node_def : false_def->node_def()) {
+      if (node_def.name() == "identity_false_fn") {
+        has_identity_false_fn_node = true;
+        break;
+      }
+    }
+    EXPECT_TRUE(has_identity_false_fn_node);
+  }
+
+  // Check XLA graph.
+  {
+    FunctionBody *xla_fbody = nullptr;
+    TF_CHECK_OK(FunctionDefToBodyHelper(
+        *fld.Find("cluster_rewritten"), AttrSlice(), &fld,
+        [&](const string &op, const OpDef **sig) {
+          return fld.LookUpOpDef(op, sig);
+        },
+        &xla_fbody));
+    std::unique_ptr<FunctionBody> xla_fbody_deleter(xla_fbody);
+    Graph *xla_graph = xla_fbody->graph;
+    auto node_name_index = xla_graph->BuildNodeNameIndex();
+
+    // Check that we have XlaSendToHost to send cond predicate to host, and
+    // there is a control edge to If node.
+    Node *send_if_pred_node = node_name_index["send_oc_if_pred_if"];
+    EXPECT_NE(send_if_pred_node, nullptr);
+    bool has_control_edge_to_if = false;
+    for (const Edge *e : send_if_pred_node->out_edges()) {
+      if (e->IsControlEdge() && e->dst()->name() == "if") {
+        has_control_edge_to_if = true;
+        break;
+      }
+    }
+    EXPECT_TRUE(has_control_edge_to_if);
+
+    // Check that the "If" node now has `send_if_pred_node` as attribute
+    // _xla_token_input_nodes.
+    Node *if_node = node_name_index["if"];
+    EXPECT_NE(if_node, nullptr);
+    std::vector<string> token_inputs;
+    TF_CHECK_OK(
+        GetNodeAttr(if_node->def(), "_xla_token_input_nodes", &token_inputs));
+    EXPECT_THAT(token_inputs, ::testing::ElementsAre("send_oc_if_pred_if"));
+  }
+}
+
+TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInWhile) {
+  // Build the XLA computation func.
+  // "const0" (bool)
+  // "while0" (input = "const0", cond = "cond_fn", body = "body_fn")
+  FunctionDefLibrary fdl;
+  {
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output arg = ops::_Arg(s.WithOpName("arg"), DT_BOOL, 0);
+    Output identity = ops::Identity(s.WithOpName("identity_cond_fn"), arg);
+    ops::_Retval retval(s.WithOpName("retval"), identity, 0);
+    std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+    TF_CHECK_OK(s.ToGraph(g.get()));
+    auto node_name_image = g->BuildNodeNameIndex();
+    node_name_image["identity_cond_fn"]->AddAttr("_oc", "0");
+    PartialTensorShape shape({2});
+    node_name_image["identity_cond_fn"]->AddAttr(
+        kXlaInferredShapesAttrName, std::vector<PartialTensorShape>{shape});
+
+    FunctionDef *cond_fn_fdef = fdl.add_function();
+    TF_CHECK_OK(GraphToFunctionDef(*g, "cond_fn", cond_fn_fdef));
+  }
+  {
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output arg = ops::_Arg(s.WithOpName("arg"), DT_BOOL, 0);
+    Output identity = ops::Identity(s.WithOpName("identity_body_fn"), arg);
+    ops::_Retval retval(s.WithOpName("retval"), identity, 0);
+    std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+    TF_CHECK_OK(s.ToGraph(g.get()));
+    auto node_name_image = g->BuildNodeNameIndex();
+    node_name_image["identity_body_fn"]->AddAttr("_oc", "0");
+    PartialTensorShape shape({2});
+    node_name_image["identity_body_fn"]->AddAttr(
+        kXlaInferredShapesAttrName, std::vector<PartialTensorShape>{shape});
+
+    FunctionDef *body_fn_fdef = fdl.add_function();
+    TF_CHECK_OK(GraphToFunctionDef(*g, "body_fn", body_fn_fdef));
+  }
+  {
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output input = ops::Const(s.WithOpName("const0"), true, {2});
+    NameAttrList cond_fn;
+    cond_fn.set_name("cond_fn");
+    NameAttrList body_fn;
+    body_fn.set_name("body_fn");
+    auto while_op =
+        ops::While(s.WithOpName("while"), std::initializer_list<Input>{input},
+                   cond_fn, body_fn);
+    ops::_Retval retval(s.WithOpName("retval"), while_op.output[0], 0);
+    std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+    TF_CHECK_OK(s.ToGraph(g.get()));
+
+    FunctionDef *xla_fdef = fdl.add_function();
+    TF_CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef));
+  }
+  FunctionLibraryDefinition fld(OpRegistry::Global(), fdl);
+
+  protobuf::Map<string, tensorflow::AttrValue> attrs;
+  std::map<string, int> host_compute_core;
+  std::vector<string> shape_inference_graphs;
+  bool has_outside_compilation;
+  NameAttrList name_attrs;
+  name_attrs.set_name("cluster");
+  *name_attrs.mutable_attr() = attrs;
+  TF_CHECK_OK(ExtractOutsideCompilationTest(
+      "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten", "host_graph",
+      host_compute_core, &fld, &shape_inference_graphs,
+      &has_outside_compilation));
+
+  // Check host graph.
+  {
+    FunctionBody *host_fbody = nullptr;
+    AttrValue device_ordinal_temp_value;
+    device_ordinal_temp_value.set_i(0);
+    protobuf::Map<string, AttrValue> host_func_attrs;
+    host_func_attrs["device_ordinal"] = device_ordinal_temp_value;
+    TF_CHECK_OK(FunctionDefToBodyHelper(
+        *fld.Find("host_graph"), AttrSlice(&host_func_attrs), &fld,
+        [&](const string &op, const OpDef **sig) {
+          return fld.LookUpOpDef(op, sig);
+        },
+        &host_fbody));
+    std::unique_ptr<FunctionBody> host_fbody_deleter(host_fbody);
+    Graph *host_graph = host_fbody->graph;
+    auto node_name_index = host_graph->BuildNodeNameIndex();
+
+    // Verify we have an "While" to execute outside compilation.
+    Node *while_oc_node = node_name_index["oc_while_while"];
+    EXPECT_NE(while_oc_node, nullptr);
+
+    // Check that cond outside compilation has node "identity_cond_fn".
+    const FunctionDef *cond_def = fld.Find("oc_cond_host_while_while");
+    EXPECT_NE(cond_def, nullptr);
+    bool has_identity_cond_fn_node = false;
+    for (const auto &node_def : cond_def->node_def()) {
+      if (node_def.name() == "identity_cond_fn") {
+        has_identity_cond_fn_node = true;
+        break;
+      }
+    }
+    EXPECT_TRUE(has_identity_cond_fn_node);
+
+    // Check that body outside compilation has node "identity_body_fn".
+    const FunctionDef *body_def = fld.Find("oc_body_host_while_while");
+    EXPECT_NE(body_def, nullptr);
+    bool has_identity_body_fn_node = false;
+    for (const auto &node_def : body_def->node_def()) {
+      if (node_def.name() == "identity_body_fn") {
+        has_identity_body_fn_node = true;
+        break;
+      }
+    }
+    EXPECT_TRUE(has_identity_body_fn_node);
+  }
+
+  // Check XLA graph.
+  {
+    // Verify that rewritten cond fn has XlaSendToHost to send loop predicate to
+    // host.
+    const FunctionDef *cond_def = fld.Find("cond_fn_oc");
+    EXPECT_NE(cond_def, nullptr);
+    bool has_send_oc_while_cond_node = false;
+    for (const auto &node_def : cond_def->node_def()) {
+      if (node_def.name() == "send_oc_while_cond_while") {
+        has_send_oc_while_cond_node = true;
+        break;
+      }
+    }
+    EXPECT_TRUE(has_send_oc_while_cond_node);
+  }
+}
+
+TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInFunction) {
+  // Build the XLA computation func.
+  // "const0" (int32)
+  // "fn" (input = "const0")
+  FunctionDefLibrary fdl;
+  {
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output arg = ops::_Arg(s.WithOpName("arg"), DT_INT32, 0);
+    Output identity = ops::Identity(s.WithOpName("identity"), arg);
+    ops::_Retval retval(s.WithOpName("retval"), identity, 0);
+    std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+    TF_CHECK_OK(s.ToGraph(g.get()));
+    auto node_name_image = g->BuildNodeNameIndex();
+    node_name_image["identity"]->AddAttr("_oc", "0");
+    PartialTensorShape shape({2});
+    node_name_image["identity"]->AddAttr(
+        kXlaInferredShapesAttrName, std::vector<PartialTensorShape>{shape});
+
+    FunctionDef *true_fn_fdef = fdl.add_function();
+    TF_CHECK_OK(GraphToFunctionDef(*g, "fn", true_fn_fdef));
+  }
+  FunctionLibraryDefinition fld(OpRegistry::Global(), fdl);
+  {
+    std::unique_ptr<Graph> g(new Graph(&fld));
+
+    tensorflow::TensorProto tensor_proto;
+    tensor_proto.set_dtype(tensorflow::DT_INT32);
+    tensorflow::TensorShapeProto shape;
+    shape.add_dim()->set_size(2);
+    *tensor_proto.mutable_tensor_shape() = shape;
+    for (int i = 0; i < 2; ++i) {
+      tensor_proto.add_int_val(1);
+    }
+    NodeDef const_def;
+    TF_CHECK_OK(NodeDefBuilder("const", "Const")
+                    .Attr("dtype", DT_INT32)
+                    .Attr("value", tensor_proto)
+                    .Finalize(&const_def));
+    Status s;
+    Node *const_node = g->AddNode(const_def, &s);
+    TF_CHECK_OK(s);
+
+    NodeDef fn_def;
+    TF_CHECK_OK(NodeDefBuilder("fn", "fn", &fld)
+                    .Input("const", 0, DT_INT32)
+                    .Finalize(&fn_def));
+    Node *fn_node = g->AddNode(fn_def, &s);
+    TF_CHECK_OK(s);
+    g->AddEdge(const_node, 0, fn_node, 0);
+
+    NodeDef ret_def;
+    TF_CHECK_OK(NodeDefBuilder("ret", "_Retval")
+                    .Attr("index", 0)
+                    .Attr("T", DT_INT32)
+                    .Input("fn", 0, DT_INT32)
+                    .Finalize(&ret_def));
+    Node *ret_node = g->AddNode(ret_def, &s);
+    TF_CHECK_OK(s);
+    g->AddEdge(fn_node, 0, ret_node, 0);
+
+    FunctionDef *xla_fdef = fdl.add_function();
+    TF_CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef));
+    TF_CHECK_OK(fld.AddFunctionDef(*xla_fdef));
+  }
+
+  protobuf::Map<string, tensorflow::AttrValue> attrs;
+  std::map<string, int> host_compute_core;
+  std::vector<string> shape_inference_graphs;
+  bool has_outside_compilation;
+  NameAttrList name_attrs;
+  name_attrs.set_name("cluster");
+  *name_attrs.mutable_attr() = attrs;
+  TF_CHECK_OK(ExtractOutsideCompilationTest(
+      "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten", "host_graph",
+      host_compute_core, &fld, &shape_inference_graphs,
+      &has_outside_compilation));
+
+  // Check host graph.
+  {
+    FunctionBody *host_fbody = nullptr;
+    AttrValue device_ordinal_temp_value;
+    device_ordinal_temp_value.set_i(0);
+    protobuf::Map<string, AttrValue> host_func_attrs;
+    host_func_attrs["device_ordinal"] = device_ordinal_temp_value;
+    TF_CHECK_OK(FunctionDefToBodyHelper(
+        *fld.Find("host_graph"), AttrSlice(&host_func_attrs), &fld,
+        [&](const string &op, const OpDef **sig) {
+          return fld.LookUpOpDef(op, sig);
+        },
+        &host_fbody));
+    std::unique_ptr<FunctionBody> host_fbody_deleter(host_fbody);
+    Graph *host_graph = host_fbody->graph;
+    auto node_name_index = host_graph->BuildNodeNameIndex();
+
+    // Verify we have call node for outside compilation in `fn`.
+    Node *call_node = node_name_index["oc_call_fn"];
+    EXPECT_NE(call_node, nullptr);
+
+    FunctionBody *call_fbody = nullptr;
+    TF_CHECK_OK(FunctionDefToBodyHelper(
+        *fld.Find("oc_func_call_host_fn"), AttrSlice(&host_func_attrs), &fld,
+        [&](const string &op, const OpDef **sig) {
+          return fld.LookUpOpDef(op, sig);
+        },
+        &call_fbody));
+    std::unique_ptr<FunctionBody> call_fbody_deleter(call_fbody);
+
+    // Verify we have _XlaRecvAtHost and _XlaSendFromHost nodes.
+    bool has_recv = false, has_send = false;
+    for (Node *n : call_fbody->graph->nodes()) {
+      if (n->type_string() == "_XlaRecvAtHost") {
+        has_recv = true;
+      } else if (n->type_string() == "_XlaSendFromHost") {
+        has_send = true;
+      }
+    }
+    EXPECT_TRUE(has_recv);
+    EXPECT_TRUE(has_send);
+  }
+
+  // Check XLA graph.
+  {
+    FunctionBody *xla_fbody = nullptr;
+    TF_CHECK_OK(FunctionDefToBodyHelper(
+        *fld.Find("cluster_rewritten"), AttrSlice(), &fld,
+        [&](const string &op, const OpDef **sig) {
+          return fld.LookUpOpDef(op, sig);
+        },
+        &xla_fbody));
+    std::unique_ptr<FunctionBody> xla_fbody_deleter(xla_fbody);
+    Graph *xla_graph = xla_fbody->graph;
+    auto node_name_index = xla_graph->BuildNodeNameIndex();
+
+    // Check that we have call node.
+    Node *fn_node = node_name_index["fn"];
+    EXPECT_NE(fn_node, nullptr);
+    EXPECT_EQ(fn_node->type_string(), "fn_oc");
+
+    FunctionBody *call_fbody = nullptr;
+    TF_CHECK_OK(FunctionDefToBodyHelper(
+        *fld.Find("fn_oc"), AttrSlice(), &fld,
+        [&](const string &op, const OpDef **sig) {
+          return fld.LookUpOpDef(op, sig);
+        },
+        &call_fbody));
+    std::unique_ptr<FunctionBody> call_fbody_deleter(call_fbody);
+
+    // Verify we have XlaHostCompute nodes.
+    bool has_hc = false;
+    for (Node *n : call_fbody->graph->nodes()) {
+      if (n->type_string() == "XlaHostCompute") {
+        has_hc = true;
+      }
+    }
+    EXPECT_TRUE(has_hc);
+  }
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/flags.cc b/tensorflow/compiler/jit/flags.cc
index 98e344b3a080aa8aab27cd41564a90427bac151e..fba69dfccc31e01e73d8f86006b41ce5e3283f15 100644
--- a/tensorflow/compiler/jit/flags.cc
+++ b/tensorflow/compiler/jit/flags.cc
@@ -68,7 +68,12 @@ void AppendMarkForCompilationPassFlagsInternal(std::vector<Flag>* flag_list) {
       Flag("tf_xla_fusion_only",
            &mark_for_compilation_flags->tf_xla_fusion_only,
            "enable fusion of element-wise operations only using XLA when "
-           "global_jit_level is ON*.")};
+           "global_jit_level is ON*."),
+      Flag("tf_xla_disable_deadness_safety_checks_for_debugging",
+           &mark_for_compilation_flags
+                ->tf_xla_disable_deadness_safety_checks_for_debugging,
+           "Disable deadness related safety checks when clustering (this is "
+           "unsound).")};
   flag_list->insert(flag_list->end(), new_flags.begin(), new_flags.end());
 }
 
@@ -89,6 +94,8 @@ void AllocateAndParseFlags() {
   mark_for_compilation_flags->tf_xla_clustering_fuel =
       std::numeric_limits<int64>::max();
   mark_for_compilation_flags->tf_xla_fusion_only = false;
+  mark_for_compilation_flags
+      ->tf_xla_disable_deadness_safety_checks_for_debugging = false;
 
   device_flags = new XlaDeviceFlags;
   device_flags->tf_xla_compile_on_demand = false;
diff --git a/tensorflow/compiler/jit/flags.h b/tensorflow/compiler/jit/flags.h
index 5ddea588eef5270880d91623dc05893da265960a..ed7810fcfd85c17db70d42e691446b60dc696939 100644
--- a/tensorflow/compiler/jit/flags.h
+++ b/tensorflow/compiler/jit/flags.h
@@ -25,27 +25,39 @@ namespace tensorflow {
 
 // Flags associated with the XLA bridge's mark_for_compilation_pass module.
 struct MarkForCompilationPassFlags {
-  int32 tf_xla_auto_jit;  // Control compilation of operators into XLA
-                          // computations on CPU and GPU devices.  0 = use
-                          // ConfigProto setting; -1 = off; 1 = on for things
-                          // very likely to be improved; 2 = on for everything.
-                          // Experimental.
-  int32 tf_xla_min_cluster_size;  // Minimum number of operators in an XLA
-                                  // compilation. Ignored for operators placed
-                                  // on an XLA device or operators explicitly
-                                  // marked for compilation.
-  int32 tf_xla_max_cluster_size;  // Maximum number of operators in an XLA
-                                  // compilation.
-  bool tf_xla_clustering_debug;   // Dump graphs during XLA compilation.
-  bool tf_xla_cpu_global_jit;     // Enables global JIT compilation for CPU
-                                  // via SessionOptions.
-  int64 tf_xla_clustering_fuel;   // "Compiler fuel" for clustering.  Only this
-                                  // many ops will be marked as eligible for
-                                  // clustering.
-  bool tf_xla_fusion_only;  // This flag is effective only when global_jit_level
-                            // is set to ON* and overrides its behavior. If
-                            // true, enable fusion of element-wise operations
-                            // only using XLA.
+  // Control compilation of operators into XLA computations on CPU and GPU
+  // devices.  0 = use ConfigProto setting; -1 = off; 1 = on for things very
+  // likely to be improved; 2 = on for everything.
+  //
+  // Experimental.
+  int32 tf_xla_auto_jit;
+
+  // Minimum number of operators in an XLA compilation. Ignored for operators
+  // placed on an XLA device or operators explicitly marked for compilation.
+  int32 tf_xla_min_cluster_size;
+
+  // Maximum number of operators in an XLA compilation.
+  int32 tf_xla_max_cluster_size;
+
+  // Dump graphs during XLA compilation.
+  bool tf_xla_clustering_debug;
+
+  // Enables global JIT compilation for CPU via SessionOptions.
+  bool tf_xla_cpu_global_jit;
+
+  // "Compiler fuel" for clustering.  Only this many ops will be marked as
+  // eligible for clustering.
+  int64 tf_xla_clustering_fuel;
+
+  // tf_xla_fusion_only is effective only when global_jit_level is set to ON*
+  // and overrides its behavior. If true, enable fusion of element-wise
+  // operations only using XLA.
+  bool tf_xla_fusion_only;
+
+  // If tf_xla_disable_deadness_safety_checks_for_debugging is set to true then
+  // we do not do deadness related safety checks.  This is unsound in general,
+  // but can be used as a debugging aid.
+  bool tf_xla_disable_deadness_safety_checks_for_debugging;
 };
 
 // Flags associated with the XLA bridge's xla_device module.
diff --git a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc
index ce53f70b79d97ab087fefe542920b33f883632a2..5287fd175df206970b9fa73bc6b0176eddcdcaa9 100644
--- a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc
+++ b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.h"
+#include <iterator>
 #include "absl/algorithm/container.h"
 #include "absl/container/inlined_vector.h"
 #include "absl/strings/str_cat.h"
@@ -144,7 +145,9 @@ SliceInputs MakeSliceIndexAndSizeInt64(const Scope& host_scope,
 // same constant value.  This helps make the generated GraphDef more readable.
 class ConstantCache {
  public:
-  explicit ConstantCache(const Scope& s) : scope_(s) {}
+  explicit ConstantCache(const Scope& s,
+                         const std::vector<const Edge*>& control_deps)
+      : scope_(s), control_deps_(control_deps) {}
 
   Output Get1DHostConstant(int64 constant) {
     auto it = cache_.find(constant);
@@ -152,6 +155,9 @@ class ConstantCache {
       Output new_const =
           ops::Const(scope_.WithOpName("const_", constant), {constant});
       it = cache_.insert({constant, new_const}).first;
+      for (const Edge* e : control_deps_) {
+        scope_.graph()->AddControlEdge(e->src(), new_const.node());
+      }
     }
     return it->second;
   }
@@ -159,11 +165,13 @@ class ConstantCache {
  private:
   Scope scope_;
   std::unordered_map<int, Output> cache_;
+  std::vector<const Edge*> control_deps_;
 };
 
 // Returns a node computing the size of the Slice op with inputs `slice_inputs`.
 Status ComputeSliceSize(const Scope& host_scope,
-                        const SliceInputs& slice_inputs, Output* size) {
+                        const SliceInputs& slice_inputs,
+                        std::vector<const Edge*> control_deps, Output* size) {
   // If slice_size[i] >= 0 then slice_size[i] = slice_size[i].
   //
   // If slice_size[i] == -1 then slice_size[i] = input_size[i] -
@@ -183,7 +191,7 @@ Status ComputeSliceSize(const Scope& host_scope,
       ops::Shape(host_scope.WithOpName("input_shape"), slice_inputs.input,
                  ops::Shape::OutType(DT_INT64));
 
-  ConstantCache constant_pool(host_scope);
+  ConstantCache constant_pool(host_scope, control_deps);
 
   std::vector<Output> slice_size;
   for (int i = 0; i < slice_inputs.size_as_vector.size(); i++) {
@@ -209,11 +217,16 @@ Status ComputeSliceSize(const Scope& host_scope,
   }
 
   // Trivial ConcatV2 nodes (with exactly one input) are disallowed.
-  *size =
-      slice_size.size() == 1
-          ? slice_size[0]
-          : ops::Concat(host_scope.WithOpName("slice_size"), slice_size,
-                        ops::Const(host_scope.WithOpName("concat_axis"), 0));
+  if (slice_size.size() == 1) {
+    *size = slice_size[0];
+  } else {
+    auto concat_axis = ops::Const(host_scope.WithOpName("concat_axis"), 0);
+    for (const Edge* e : control_deps) {
+      host_scope.graph()->AddControlEdge(e->src(), concat_axis.node());
+    }
+    *size = ops::Concat(host_scope.WithOpName("slice_size"), slice_size,
+                        concat_axis);
+  }
   return Status::OK();
 }
 
@@ -234,12 +247,21 @@ Status ConvertTensorFlowSliceToStaticShapedSlice(
           .NewSubScope(absl::StrCat(slice->name(), "/static_shaped_slice"));
   Scope host_scope = main_scope.WithAssignedDevice(host_name);
 
+  // In the future we may want to be clever here and avoid the extra Cast ops.
   SliceInputs slice_inputs_int64 =
       MakeSliceIndexAndSizeInt64(host_scope, slice_inputs);
 
+  // Create a list of all control dependencies to be copied when possibly
+  // replacing nodes related to slice_size.
+  Node* old_size;
+  std::vector<const Edge*> old_size_ctrl_deps;
+  TF_RETURN_IF_ERROR(slice->input_node(2, &old_size));
+  absl::c_copy_if(old_size->in_edges(), std::back_inserter(old_size_ctrl_deps),
+                  [](const Edge* e) { return e->IsControlEdge(); });
+
   Output slice_size;
-  TF_RETURN_IF_ERROR(
-      ComputeSliceSize(host_scope, slice_inputs_int64, &slice_size));
+  TF_RETURN_IF_ERROR(ComputeSliceSize(host_scope, slice_inputs_int64,
+                                      old_size_ctrl_deps, &slice_size));
 
   *result =
       ops::Slice(main_scope.WithAssignedDevice(slice->assigned_device_name())
@@ -291,9 +313,9 @@ Status RewriteSlice(Graph* g, Node* slice, const SliceInputs& slice_inputs,
   return Status::OK();
 }
 
-// Return true if `n` is a slice we can rewrite to have a static shape
+// Return true if `n` is a slice we should rewrite to have a static shape
 // (i.e. have the output shape only depend on the "size" input).
-xla::StatusOr<bool> IsRewritableSlice(Node* n) {
+xla::StatusOr<bool> ShouldRewriteSlice(Node* n) {
   if (n->type_string() != "Slice") {
     return false;
   }
@@ -311,14 +333,20 @@ xla::StatusOr<bool> IsRewritableSlice(Node* n) {
 
   // If slice_size[i] < -1 for any i then executing the slice will throw an
   // error, and we don't do anything here.
-  return absl::c_all_of(slice_inputs->size_as_vector,
-                        [](int64 size_i) { return size_i >= -1; });
+  bool slice_size_has_error = absl::c_all_of(
+      slice_inputs->size_as_vector, [](int64 size_i) { return size_i >= -1; });
+  if (!slice_size_has_error) {
+    return false;
+  }
+
+  // No point in rewriting slices that have both size and begin as constants.
+  return !slice_inputs->begin.node()->IsConstant();
 }
 
 Status FindAndRewriteSlices(Graph* g, bool* changed) {
   std::vector<Node*> slices_to_rewrite;
   for (Node* n : g->nodes()) {
-    TF_ASSIGN_OR_RETURN(bool is_rewritable, IsRewritableSlice(n));
+    TF_ASSIGN_OR_RETURN(bool is_rewritable, ShouldRewriteSlice(n));
     if (is_rewritable) {
       slices_to_rewrite.push_back(n);
     }
diff --git a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass_test.cc b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass_test.cc
index a2f1b831ad7605237e23c15cc43b337e06265553..2add2c13f92f561904163012ee16cc17ce5badce 100644
--- a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass_test.cc
+++ b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass_test.cc
@@ -401,5 +401,57 @@ TEST(SliceToDynamicSliceRewriteTest, SliceWithSliceBegin) {
                      Name("begin/static_shaped_slice/static_shaped_slice"))),
                  _)));
 }
+
+// New constants being created need to have control dependencies copied to
+// ensure correct control flow analysis in TF V2.
+TEST(SliceToDynamicSliceRewriteTest, WithControlDepsToConstant) {
+  Scope root = Scope::NewRootScope()
+                   .ExitOnError()
+                   .WithAssignedDevice(kDeviceName)
+                   .WithXlaCluster("cluster_0");
+
+  Output input = ops::Placeholder(root.WithOpName("input"), DT_FLOAT);
+  Output begin = ops::Placeholder(root.WithOpName("begin"), DT_INT32);
+  Output size = ops::Const(root.WithOpName("size"), {-1});
+  Output slice = ops::Slice(root.WithOpName("slice"), input, begin, size);
+
+  // Add an additional dependency that should still exist in with the new size
+  // variables.
+  Output dependency = ops::Placeholder(root.WithOpName("dependency"), DT_BOOL);
+  root.graph()->AddControlEdge(dependency.node(), size.node());
+
+  std::unique_ptr<Graph> result;
+  TF_ASSERT_OK(IncreaseDynamismForAutoJit(root, &result));
+
+  // Check that the new constants have control dependencies.
+  Node* const_0 = testing::FindNodeByName(result.get(),
+                                          "slice/static_shaped_slice/const_0");
+  EXPECT_NE(const_0, nullptr);
+  EXPECT_THAT(const_0,
+              NodeWith(Op("Const"), CtrlDeps(NodeWith(Op("Placeholder"),
+                                                      Name("dependency")))));
+}
+
+TEST(SliceToDynamicSliceRewriteTest, DontRewriteSliceWithConstBegin) {
+  Scope root = Scope::NewRootScope()
+                   .ExitOnError()
+                   .WithAssignedDevice(kDeviceName)
+                   .WithXlaCluster("cluster_0");
+
+  Output input = ops::Placeholder(root.WithOpName("input"), DT_FLOAT);
+  Output begin = ops::Const(root.WithOpName("begin"), {10, 10});
+  Output size = ops::Const(root.WithOpName("size"), {-1, 500});
+  Output slice = ops::Slice(root.WithOpName("slice"), input, begin, size);
+
+  std::unique_ptr<Graph> result;
+  TF_ASSERT_OK(IncreaseDynamismForAutoJit(root, &result));
+
+  Node* slice_node = testing::FindNodeByName(result.get(), "slice");
+  EXPECT_THAT(slice_node,
+              NodeWith(Op("Slice"), Inputs(Out(NodeWith(Op("Placeholder"))),
+                                           Out(NodeWith(Op("Const"))),
+                                           Out(NodeWith(Op("Const"))))));
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/kernels/BUILD b/tensorflow/compiler/jit/kernels/BUILD
index 0583774714c6db7a2fa515fc8a0d304e1898db97..d0fa2c40be9d6b13ec736a9d6483dae0b4f0f45e 100644
--- a/tensorflow/compiler/jit/kernels/BUILD
+++ b/tensorflow/compiler/jit/kernels/BUILD
@@ -19,12 +19,14 @@ cc_library(
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:tf2xla_util",
         "//tensorflow/compiler/tf2xla:xla_compiler",
+        "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:state_ops_op_lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/core/kernels:variable_ops",
         "@com_google_absl//absl/container:flat_hash_map",
diff --git a/tensorflow/compiler/jit/kernels/xla_ops.cc b/tensorflow/compiler/jit/kernels/xla_ops.cc
index ad71df5a694a5f8da94675049df1062a7edb6253..997ef6e14bb9bd16ddac13eaf67368966818b29e 100644
--- a/tensorflow/compiler/jit/kernels/xla_ops.cc
+++ b/tensorflow/compiler/jit/kernels/xla_ops.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
 #include "tensorflow/core/common_runtime/function.h"
@@ -35,6 +36,8 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/kernels/variable_ops.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/util/stream_executor_util.h"
@@ -304,10 +307,19 @@ void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) {
   xla::LocalExecutable* executable;
   std::map<int, OptionalTensor> variables;
 
-  OP_REQUIRES_OK(
-      ctx, CompileToLocalExecutable(ctx, function_, platform_info_, resources_,
-                                    constants_, /*lazy=*/false, &client,
-                                    &variables, &kernel, &executable));
+  {
+    Status s = CompileToLocalExecutable(
+        ctx, function_, platform_info_, resources_, constants_, /*lazy=*/false,
+        &client, &variables, &kernel, &executable);
+    if (!s.ok() && (platform_info_.device_type().type_string() == DEVICE_CPU ||
+                    platform_info_.device_type().type_string() == DEVICE_GPU)) {
+      // Suggest auto jit if the failure was with GPU or CPU.
+      errors::AppendToMessage(&s,
+                              xla::status_macros::kPossibleAutoJitAlternative);
+    }
+
+    OP_REQUIRES_OK(ctx, s);
+  }
 
   se::Stream* stream =
       ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr;
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index 6618e3a58ab7b6374ed775cd6e4e18a6a4975588..d9a83049d6352f04f9237f21b44bdb5ea18e518a 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -34,6 +34,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/graph_def_util.h"
 #include "tensorflow/core/framework/memory_types.h"
 #include "tensorflow/core/framework/node_def.pb.h"
@@ -41,7 +42,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/control_flow.h"
-#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/public/version.h"
@@ -677,12 +678,28 @@ Status MarkForCompilationPass::Run(
   VLOG(1) << "flags->tf_xla_auto_jit = " << flags->tf_xla_auto_jit;
   const FunctionLibraryDefinition* fld = options.flib_def;
 
+  // Deadness analysis expects a graph with source and sink edges properly
+  // connected but sometimes the incoming graph does not follow this invariant.
+  // So fix up the source and sink edges before calling into deadness analysis.
+  FixupSourceAndSinkEdges(options.graph->get());
+
   std::unique_ptr<DeadnessAnalysis> deadness;
   {
     XLA_SCOPED_LOGGING_TIMER_LEVEL("DeadnessAnalysis", 1);
     TF_RETURN_IF_ERROR(DeadnessAnalysis::Run(**options.graph, &deadness));
   }
 
+  bool deadness_analysis_disabled =
+      GetMarkForCompilationPassFlags()
+          ->tf_xla_disable_deadness_safety_checks_for_debugging;
+
+  if (deadness_analysis_disabled) {
+    LOG(WARNING) << "Deadness analysis was manually disabled via "
+                    "--tf_xla_disable_deadness_safety_checks_for_debugging; "
+                    "auto-clustering "
+                    "is unsound!";
+  }
+
   auto is_compilable = [&](const Node* node, const DeviceType& device_type) {
     const XlaOpRegistry::DeviceRegistration* registration;
     if (!XlaOpRegistry::GetCompilationDevice(device_type.type(),
@@ -715,9 +732,12 @@ Status MarkForCompilationPass::Run(
     // and some are dead) then don't compile it.  XLA cannot represent the
     // deadness semantics of these nodes correctly and auto-clustering these
     // nodes can cause deadness to propagate to nodes that should be live.
-    if (node->IsMerge() || deadness->HasInputsWithMismatchingDeadness(*node)) {
-      VLOG(2) << "Rejecting " << node->name() << ": mismatching deadness.";
-      return false;
+    if (!deadness_analysis_disabled) {
+      if (node->IsMerge() ||
+          deadness->HasInputsWithMismatchingDeadness(*node)) {
+        VLOG(2) << "Rejecting " << node->name() << ": mismatching deadness.";
+        return false;
+      }
     }
 
     // Check for fusable ops only if requested.
@@ -1145,6 +1165,29 @@ Status MarkForCompilationPass::RunImpl(
   if (flags->tf_xla_clustering_debug) {
     dump_graph::DumpGraphToFile("mark_for_compilation", **options.graph,
                                 options.flib_def);
+
+    // We also dump out an annoated version of the TF graph where the nodes
+    // names are prefixed with the cluster names.  This can help visualizing the
+    // clustering decisions on TensorBoard.
+    Graph new_graph((*options.graph)->op_registry());
+    CopyGraph(**options.graph, &new_graph);
+
+    for (Node* n : new_graph.nodes()) {
+      if (absl::optional<absl::string_view> cluster_name =
+              GetXlaClusterForNode(*n)) {
+        n->set_name(absl::StrCat(*cluster_name, "/", n->name()));
+      } else if (n->type_string() == "VarHandleOp") {
+        n->set_name(absl::StrCat("varhandle/", n->name()));
+      } else {
+        // There is room for improvement here.  In particular, it may help to
+        // split these unclustered nodes into classes where every node in a
+        // specific class has edges to and from the same set of clusters.
+        n->set_name(absl::StrCat("unclustered/", n->name()));
+      }
+    }
+
+    dump_graph::DumpGraphToFile("mark_for_compilation_annotated", new_graph,
+                                options.flib_def);
   }
 
   VLogClusteringSummary(*graph);
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
index bf2c5508ea9e987e80093f4c2e15d3ff5191126f..c2b6250f738fafa35b2c5f79e97cf1281b50a316 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
@@ -151,7 +151,7 @@ TEST(XlaCompilationTest, CompilableCycles) {
   EXPECT_EQ(clusters["A"], clusters["C"]);
 }
 
-TEST(XlaCompilationTest, Complex128Unsupported) {
+TEST(XlaCompilationTest, StringUnsupported) {
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
   GraphDef graphdef;
   {
@@ -159,10 +159,10 @@ TEST(XlaCompilationTest, Complex128Unsupported) {
     Node* a = ops::SourceOp(
         "Const", builder.opts()
                      .WithName("A")
-                     .WithAttr("dtype", DT_COMPLEX128)
-                     .WithAttr("value", Tensor(DT_COMPLEX128, TensorShape())));
-    Node* b = ops::UnaryOp("Neg", a, builder.opts().WithName("B"));
-    ops::BinaryOp("MatMul", a, b, builder.opts().WithName("C"));
+                     .WithAttr("dtype", DT_STRING)
+                     .WithAttr("value", Tensor(DT_STRING, TensorShape())));
+    Node* b = ops::UnaryOp("EncodeBase64", a, builder.opts().WithName("B"));
+    ops::BinaryOp("StringSplit", a, b, builder.opts().WithName("C"));
     TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
diff --git a/tensorflow/compiler/jit/partially_decluster_pass.cc b/tensorflow/compiler/jit/partially_decluster_pass.cc
index 42ea3926e16ae791dbe1bede3b8742383db7667c..e1fd2aaee2822daeffb415d053c9c4f56002a856 100644
--- a/tensorflow/compiler/jit/partially_decluster_pass.cc
+++ b/tensorflow/compiler/jit/partially_decluster_pass.cc
@@ -120,6 +120,7 @@ Status PartiallyDeclusterNode(Graph* graph, Node* n) {
 
   NodeDef ndef = n->def();
   ndef.set_name(absl::StrCat(n->name(), "/declustered"));
+  MergeDebugInfo(NodeDebugInfo(n->def()), &ndef);
   RemoveFromXlaCluster(&ndef);
   Status s;
   Node* cloned_node = graph->AddNode(ndef, &s);
diff --git a/tensorflow/compiler/jit/partially_decluster_pass_test.cc b/tensorflow/compiler/jit/partially_decluster_pass_test.cc
index 38a54cc5efae35ad77b6dc8039c653e920cfc071..1d81a8f4fcbf050663626b1f7660afd71f4027bc 100644
--- a/tensorflow/compiler/jit/partially_decluster_pass_test.cc
+++ b/tensorflow/compiler/jit/partially_decluster_pass_test.cc
@@ -33,7 +33,6 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/graph/graph_def_builder_util.h"
-#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 
diff --git a/tensorflow/compiler/jit/shape_inference.cc b/tensorflow/compiler/jit/shape_inference.cc
index 80c691fe490c1092315708a2da754d367d585300..a27e0d9f2a6ecddfdbdb29be673084d77a178d8a 100644
--- a/tensorflow/compiler/jit/shape_inference.cc
+++ b/tensorflow/compiler/jit/shape_inference.cc
@@ -53,7 +53,15 @@ Status PropagateShapes(const Graph& graph,
     // shapes, even if no shape function is registered for a node.
     Status status = shape_refiner->AddNode(n);
     if (!status.ok()) {
-      VLOG(1) << "Shape inference failed for node: " << status;
+      VLOG(1) << "Shape inference failed for node " << n->name() << ": "
+              << status;
+    } else {
+      shape_inference::InferenceContext* context = shape_refiner->GetContext(n);
+      for (int i = 0; i < n->num_outputs(); i++) {
+        shape_inference::ShapeHandle handle = context->output(i);
+        VLOG(4) << "Output " << i << " for node " << n->name() << ": "
+                << context->DebugString(handle);
+      }
     }
 
     if (n->type_string() == "_Arg") {
diff --git a/tensorflow/compiler/jit/xla_cluster_util.cc b/tensorflow/compiler/jit/xla_cluster_util.cc
index fef28fc810cb4e544fe3f271f0b96cebd8a96779..3adcfef4dacecb343812cefc3a893a65c74ca101 100644
--- a/tensorflow/compiler/jit/xla_cluster_util.cc
+++ b/tensorflow/compiler/jit/xla_cluster_util.cc
@@ -19,9 +19,9 @@ limitations under the License.
 
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/jit/resource_operation_safety_analysis.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/graph/control_flow.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/util/device_name_utils.h"
 
 namespace tensorflow {
@@ -43,7 +43,7 @@ string DescribeCycle(const GraphCycles* cycles, const Graph& graph, int src,
     return "";
   }
 
-  auto node_name = [cycles, &graph](int node_id) {
+  auto node_name = [&graph](int node_id) {
     if (!FastBoundsCheck(node_id, graph.num_node_ids())) {
       return string("(null)");
     }
diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc
index 3df5479a55e841380ca7b8cdd0add9fd17487091..611515cf33bc1abe21e06eb7f1513800276e095b 100644
--- a/tensorflow/compiler/jit/xla_compilation_cache.cc
+++ b/tensorflow/compiler/jit/xla_compilation_cache.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <numeric>
 
 #include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/tf2xla/dump_graph.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
@@ -38,6 +39,8 @@ limitations under the License.
 
 namespace tensorflow {
 
+constexpr int64 XlaCompilationCache::kDefaultCompilationThreshold;
+
 XlaCompilationCache::XlaCompilationCache(xla::LocalClient* client,
                                          DeviceType device_type)
     : client_(client), device_type_(std::move(device_type)) {}
@@ -60,7 +63,7 @@ XlaCompilationCache::~XlaCompilationCache() {
   // about?
 }
 
-string XlaCompilationCache::DebugString() {
+string XlaCompilationCache::DebugString() const {
   return "XLA JIT compilation cache";
 }
 
@@ -68,9 +71,9 @@ string XlaCompilationCache::DebugString() {
 // arguments in the supplied list.
 string XlaCompilationCache::Signature::HumanString() const {
   string result = name;
-  for (const auto& a : arg_types) {
-    absl::StrAppend(&result, ",", DataTypeString(a.first),
-                    a.second.DebugString());
+  for (const auto& a : arg_shapes) {
+    absl::StrAppend(&result, ",", DataTypeString(a.first));
+    absl::StrAppend(&result, " [", absl::StrJoin(a.second, ","), "]");
   }
 
   for (const auto& v : arg_values) {
@@ -81,7 +84,7 @@ string XlaCompilationCache::Signature::HumanString() const {
 
 bool XlaCompilationCache::Signature::operator==(const Signature& other) const {
   if (name != other.name) return false;
-  if (arg_types != other.arg_types) return false;
+  if (arg_shapes != other.arg_shapes) return false;
 
   if (arg_values.size() != other.arg_values.size()) return false;
   for (int i = 0; i < arg_values.size(); ++i) {
@@ -97,10 +100,10 @@ bool XlaCompilationCache::Signature::operator==(const Signature& other) const {
 uint64 XlaCompilationCache::Signature::Hash::operator()(
     const XlaCompilationCache::Signature& signature) const {
   uint64 h = std::hash<string>()(signature.name);
-  for (const auto& arg : signature.arg_types) {
+  for (const auto& arg : signature.arg_shapes) {
     h = Hash64Combine(h, std::hash<int>()(static_cast<int>(arg.first)));
-    h = Hash64Combine(h, std::hash<int>()(arg.second.dims()));
-    for (int dim : arg.second.dim_sizes()) {
+    h = Hash64Combine(h, std::hash<int>()(arg.second.size()));
+    for (int dim : arg.second) {
       h = Hash64Combine(h, std::hash<int>()(dim));
     }
   }
@@ -124,7 +127,7 @@ XlaCompilationCache::BuildSignature(
         break;
       case XlaCompiler::Argument::kParameter:
       case XlaCompiler::Argument::kResource:
-        signature.arg_types.emplace_back(arg.type, arg.shape);
+        signature.arg_shapes.emplace_back(arg.type, arg.DimensionSizes());
         break;
       default:
         return errors::InvalidArgument(
diff --git a/tensorflow/compiler/jit/xla_compilation_cache.h b/tensorflow/compiler/jit/xla_compilation_cache.h
index 846d0c963dbfdf55f51120f2f138d12f5f63839b..7748b4700f39da4f952278ca6c6d2cadff4d3fb8 100644
--- a/tensorflow/compiler/jit/xla_compilation_cache.h
+++ b/tensorflow/compiler/jit/xla_compilation_cache.h
@@ -88,14 +88,16 @@ class XlaCompilationCache : public ResourceBase {
   xla::LocalClient* client() const { return client_; }
   const DeviceType& device_type() const { return device_type_; }
 
-  string DebugString() override;
+  string DebugString() const override;
 
   // Describes the types, shapes and any compile-time constant arguments
   // to a kernel. Key that uniquely identifies a compilation output.
   struct Signature {
     string name;
 
-    std::vector<std::pair<DataType, TensorShape>> arg_types;
+    // List of Tensor types & shapes for compile-time constant arguments to the
+    // compilation, ordered by argument number.
+    std::vector<std::pair<DataType, std::vector<int64>>> arg_shapes;
 
     // List of Tensor values for compile-time constant arguments to the
     // compilation, ordered by argument number. Tensors must be in host memory.
diff --git a/tensorflow/compiler/jit/xla_cpu_device.cc b/tensorflow/compiler/jit/xla_cpu_device.cc
index e9770647e7ba96cc1db026d12d5f11f52ce98d35..94dc61d55fb047c0ea81d98fde24cb55387c27d7 100644
--- a/tensorflow/compiler/jit/xla_cpu_device.cc
+++ b/tensorflow/compiler/jit/xla_cpu_device.cc
@@ -83,9 +83,9 @@ REGISTER_LOCAL_DEVICE_FACTORY(DEVICE_XLA_CPU, XlaCpuDeviceFactory);
 
 // Kernel registrations
 
-constexpr std::array<DataType, 12> kAllXlaCpuTypes = {
+constexpr std::array<DataType, 13> kAllXlaCpuTypes = {
     {DT_UINT8, DT_QUINT8, DT_INT8, DT_QINT8, DT_INT32, DT_QINT32, DT_INT64,
-     DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_BOOL}};
+     DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128, DT_BOOL}};
 
 REGISTER_XLA_LAUNCH_KERNEL(DEVICE_XLA_CPU, XlaLocalLaunchOp, kAllXlaCpuTypes);
 REGISTER_XLA_COMPILE_KERNEL(DEVICE_XLA_CPU, XlaCompileOp, kAllXlaCpuTypes);
diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc
index 4201ff91a89b1bee370e6a43337c51abe3bf974a..56c4220f12b54be09821eca4590df52e8e71850b 100644
--- a/tensorflow/compiler/jit/xla_device.cc
+++ b/tensorflow/compiler/jit/xla_device.cc
@@ -102,7 +102,8 @@ XlaDeviceAllocator* XlaDeviceAllocatorState::GetOrCreateXlaDeviceAllocator(
   }
 
   std::unique_ptr<XlaDeviceAllocator> alloc =
-      absl::make_unique<XlaDeviceAllocator>();
+      absl::make_unique<XlaDeviceAllocator>(
+          backend->stream_executors()[device_ordinal]);
   XlaDeviceAllocator* alloc_ptr = alloc.get();
   state.allocators_[{backend, device_ordinal}] = std::move(alloc);
   return alloc_ptr;
@@ -201,7 +202,8 @@ XlaDevice::XlaDevice(const SessionOptions& session_options,
       jit_device_name_(options.compilation_device_name),
       platform_(options.platform),
       use_multiple_streams_(options.use_multiple_streams),
-      shape_representation_fn_(options.shape_representation_fn) {
+      shape_representation_fn_(options.shape_representation_fn),
+      allowed_devices_(options.allowed_devices) {
   VLOG(1) << "Created XLA device " << options.compilation_device_name << " "
           << this;
   thread_pool_.reset(new thread::ThreadPool(session_options.env, "xla_device",
@@ -218,9 +220,6 @@ XlaDevice::XlaDevice(const SessionOptions& session_options,
 XlaDevice::~XlaDevice() {
   VLOG(1) << "Destroying XLA device " << jit_device_name_ << " " << this;
   mutex_lock lock(mu_);
-  while (outstanding_asynchronous_operations_ > 0) {
-    outstanding_asynchronous_operations_cv_.wait(lock);
-  }
   if (device_context_) {
     device_context_->Unref();
   }
@@ -234,7 +233,8 @@ xla::LocalClient* XlaDevice::client() const {
 
   // TODO(b/78468222): This can fail, at least when the backend is GPU and
   // there is no GPU on the host.
-  return xla::ClientLibrary::GetOrCreateLocalClient(platform_).ValueOrDie();
+  return xla::ClientLibrary::GetOrCreateLocalClient(platform_, allowed_devices_)
+      .ValueOrDie();
 }
 
 Allocator* XlaDevice::GetAllocator(AllocatorAttributes attr) {
@@ -396,12 +396,6 @@ Status XlaDevice::Sync() {
   if (!stream) return Status::OK();
 
   Status status = stream->BlockHostUntilDone();
-  {
-    mutex_lock lock(mu_);
-    while (outstanding_asynchronous_operations_ > 0) {
-      outstanding_asynchronous_operations_cv_.wait(lock);
-    }
-  }
   TF_RETURN_IF_ERROR(status);
   if (!stream->ok()) {
     return errors::Internal("XlaDevice::Sync() failed.");
@@ -410,6 +404,8 @@ Status XlaDevice::Sync() {
   return Status::OK();
 }
 
+// TODO(b/112409994): This is no longer necessary. Consolidate it with the
+// synchronous version.
 void XlaDevice::Sync(const DoneCallback& done) {
   VLOG(1) << "XlaDevice::Sync (asynchronous)";
   std::shared_ptr<se::Stream> stream;
@@ -422,14 +418,20 @@ void XlaDevice::Sync(const DoneCallback& done) {
     return;
   }
 
+  // The call to ThenEnqueueOnBackgroundThread below enqueues a host callback at
+  // the end of the stream, after everything that has already been enqueued
+  // there at this moment. When the host callback is called, everything before
+  // it must have already finished, and the host callback will then place the
+  // task below onto a background thread. (See the implementation of
+  // ThenEnqueueOnBackgroundThread for details.) Therefore, when the done
+  // callback is finally called from that background thread, we know for sure
+  // that everything enqueued onto the stream (i.e., the device) at this very
+  // moment--when ThenEnqueueOnBackgroundThread is called--will have finished.
+  // This achieves a device-wide sync.
   stream->ThenEnqueueOnBackgroundThread(
-      [this, stream, done](se::StreamExecutor*) {
+      [stream, done](se::StreamExecutor*) {
         tracing::ScopedActivity activity("XlaDevice::Sync::Callback",
                                          /*is_expensive=*/true);
-        mutex_lock lock(mu_);
-        while (outstanding_asynchronous_operations_ > 0) {
-          outstanding_asynchronous_operations_cv_.wait(lock);
-        }
         done(stream->ok() ? Status::OK()
                           : errors::Internal("XlaDevice::Sync() failed."));
       });
@@ -468,57 +470,50 @@ Status XlaDevice::MakeTensorFromProto(const TensorProto& tensor_proto,
   return status;
 }
 
-void XlaDevice::SetRequiresSyncOnCompletion(bool sync_on_completion) {
+void XlaDevice::SetAllowsSyncOnCompletion(bool sync_on_completion) {
   mutex_lock lock(mu_);
   sync_on_completion_ = sync_on_completion;
 }
 
-bool XlaDevice::RequiresSyncOnCompletion() const {
+bool XlaDevice::AllowsSyncOnCompletion() const {
   mutex_lock lock(mu_);
   return sync_on_completion_;
 }
 
-XlaDevice::AsynchronousOperationHandle::AsynchronousOperationHandle(
-    XlaDevice* device)
-    : device_(device) {
-  mutex_lock lock(device_->mu_);
-  ++device_->outstanding_asynchronous_operations_;
+void XlaDevice::SetHandleDeviceErrorCallback(std::function<Status()> callback) {
+  mutex_lock lock(mu_);
+  device_error_callback_ = callback;
 }
 
-XlaDevice::AsynchronousOperationHandle::~AsynchronousOperationHandle() {
-  if (device_) {
-    mutex_lock lock(device_->mu_);
-    --device_->outstanding_asynchronous_operations_;
-    device_->outstanding_asynchronous_operations_cv_.notify_all();
+Status XlaDevice::HandleDeviceError() {
+  std::function<Status()> local_device_error_callback;
+  {
+    mutex_lock lock(mu_);
+    local_device_error_callback = device_error_callback_;
   }
+  if (local_device_error_callback != nullptr) {
+    return local_device_error_callback();
+  }
+  return Status::OK();
 }
 
-XlaDevice::AsynchronousOperationHandle::AsynchronousOperationHandle(
-    const XlaDevice::AsynchronousOperationHandle& other)
-    : device_(other.device_) {
-  mutex_lock lock(device_->mu_);
-  ++device_->outstanding_asynchronous_operations_;
-}
-
-XlaDevice::AsynchronousOperationHandle::AsynchronousOperationHandle(
-    XlaDevice::AsynchronousOperationHandle&& other)
-    : device_(other.device_) {
-  other.device_ = nullptr;
-}
-
-XlaDevice::AsynchronousOperationHandle& XlaDevice::AsynchronousOperationHandle::
-operator=(const XlaDevice::AsynchronousOperationHandle& other) {
-  device_ = other.device_;
-  mutex_lock lock(device_->mu_);
-  ++device_->outstanding_asynchronous_operations_;
-  return *this;
-}
-
-XlaDevice::AsynchronousOperationHandle& XlaDevice::AsynchronousOperationHandle::
-operator=(XlaDevice::AsynchronousOperationHandle&& other) {
-  device_ = other.device_;
-  other.device_ = nullptr;
-  return *this;
+Status XlaDevice::RefreshStatus() {
+  std::shared_ptr<se::Stream> stream;
+  {
+    mutex_lock lock(mu_);
+    stream = stream_;
+  }
+  if (!stream) {
+    return Status::OK();
+  }
+  Status status = stream->RefreshStatus();
+  if (!status.ok()) {
+    // Ignore errors from HandleDeviceError, since by definition the status is
+    // already non-ok, so there's nothing extra to report if HandleDeviceError
+    // itself returns an error.
+    HandleDeviceError().IgnoreError();
+  }
+  return status;
 }
 
 XlaDeviceOpRegistrations* RegisterXlaDeviceKernels(const char* device,
diff --git a/tensorflow/compiler/jit/xla_device.h b/tensorflow/compiler/jit/xla_device.h
index c8bb276cdb9673fdcba4cc15a9f33ecd3ae96dbb..977f5f5cf151d979d025c2966012445af04fc502 100644
--- a/tensorflow/compiler/jit/xla_device.h
+++ b/tensorflow/compiler/jit/xla_device.h
@@ -24,7 +24,9 @@ limitations under the License.
 
 #ifndef TENSORFLOW_COMPILER_JIT_XLA_DEVICE_H_
 #define TENSORFLOW_COMPILER_JIT_XLA_DEVICE_H_
+#include <set>
 
+#include "absl/types/optional.h"
 #include "tensorflow/compiler/jit/xla_device_context.h"
 #include "tensorflow/compiler/jit/xla_tensor.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
@@ -123,6 +125,11 @@ class XlaDevice : public LocalDevice {
     // If padded_shape_fn is empty, a default implementation that returns
     // the logical on-device shape without padding is used.
     PaddedShapeFn padded_shape_fn;
+
+    // Set of devices to use. This controls which of the devices on the given
+    // platform will have resources allocated. For GPUs this will be
+    // filled from visible_gpu_devices list from session configuration.
+    absl::optional<std::set<int>> allowed_devices;
   };
 
   // Creates a new XLA Device.
@@ -160,35 +167,16 @@ class XlaDevice : public LocalDevice {
   Status UseGpuDeviceInfo() LOCKS_EXCLUDED(mu_);
 
   // Instructs this XlaDevice to return 'sync_on_completion' for
-  // RequiresSyncOnCompletion().
-  void SetRequiresSyncOnCompletion(bool sync_on_completion) LOCKS_EXCLUDED(mu_);
-
-  bool RequiresSyncOnCompletion() const override LOCKS_EXCLUDED(mu_);
+  // AllowsSyncOnCompletion().
+  void SetAllowsSyncOnCompletion(bool sync_on_completion) LOCKS_EXCLUDED(mu_);
+  bool AllowsSyncOnCompletion() const override LOCKS_EXCLUDED(mu_);
 
-  // A simple RAII handle. On construction the device's
-  // outstanding_asynchronous_operations_ field is incremented; on destruction
-  // it is decremented.
-  class AsynchronousOperationHandle {
-   public:
-    AsynchronousOperationHandle(XlaDevice* device);
-    ~AsynchronousOperationHandle();
-    AsynchronousOperationHandle(const AsynchronousOperationHandle& other);
-    AsynchronousOperationHandle(AsynchronousOperationHandle&& other);
-    AsynchronousOperationHandle& operator=(
-        const AsynchronousOperationHandle& other);
-    AsynchronousOperationHandle& operator=(AsynchronousOperationHandle&& other);
+  // Installs an error handling callback when RefreshStatus sees !status.ok().
+  void SetHandleDeviceErrorCallback(std::function<Status()> callback);
 
-   private:
-    XlaDevice* device_ = nullptr;
-  };
-
-  AsynchronousOperationHandle CreateAsynchronousOperationHandle() {
-    return AsynchronousOperationHandle(this);
-  }
+  Status RefreshStatus() override LOCKS_EXCLUDED(mu_);
 
  private:
-  friend class AsynchronousOperationHandle;
-
   xla::LocalClient* client() const;
   Allocator* GetAllocatorLocked(AllocatorAttributes attr)
       EXCLUSIVE_LOCKS_REQUIRED(mu_);
@@ -202,6 +190,9 @@ class XlaDevice : public LocalDevice {
   static Status GetMetadataFromDevice(DeviceBase* device,
                                       const XlaDevice::Metadata** metadata);
 
+  // Handles error when RefreshStatus sees !status.ok().
+  Status HandleDeviceError();
+
   mutable mutex mu_;
   // The metadata of this XlaDevice.
   const Metadata xla_metadata_;
@@ -248,14 +239,17 @@ class XlaDevice : public LocalDevice {
   // Thread pool used for running closures
   std::unique_ptr<thread::ThreadPool> thread_pool_;
 
-  // True if the device requires XlaDevice::Sync to be called on completion
+  // True if the device allows XlaDevice::Sync to be called on completion
   // regardless of status.
-  bool sync_on_completion_ GUARDED_BY(mu_) = false;
+  bool sync_on_completion_ GUARDED_BY(mu_) = true;
+
+  // A callback that will be invoked when RefreshStatus sees a status error.
+  std::function<Status()> device_error_callback_ GUARDED_BY(mu_);
 
-  // Count of outstanding asynchronous operations which must be zero on Sync()
-  // completion.
-  int64 outstanding_asynchronous_operations_ GUARDED_BY(mu_) = 0;
-  condition_variable outstanding_asynchronous_operations_cv_;
+  // Set of devices to use. This controls which of the devices on the given
+  // platform will have resources allocated. For GPUs this will be
+  // filled from visible_gpu_devices list from session configuration.
+  absl::optional<std::set<int>> allowed_devices_;
 };
 
 // Builds OpKernel registrations on 'device' for the JIT operators
diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc
index 6e6532731e64bd42ee56aa719748988f321e0f17..05b9c511866d3ca48ec3519bee8a4dbf6086f6ac 100644
--- a/tensorflow/compiler/jit/xla_device_context.cc
+++ b/tensorflow/compiler/jit/xla_device_context.cc
@@ -29,7 +29,10 @@ limitations under the License.
 namespace tensorflow {
 
 // The allocator used for Tensors assigned to the XLA device.
-XlaDeviceAllocator::XlaDeviceAllocator() {}
+XlaDeviceAllocator::XlaDeviceAllocator(
+    stream_executor::StreamExecutor* stream_executor)
+    : stream_executor_(stream_executor) {}
+
 XlaDeviceAllocator::~XlaDeviceAllocator() = default;
 
 string XlaDeviceAllocator::Name() { return "xla"; }
@@ -48,7 +51,21 @@ void XlaDeviceAllocator::DeallocateRaw(void* ptr) {
   delete XlaTensor::FromOpaquePointer(ptr);
 }
 
-void XlaDeviceAllocator::GetStats(AllocatorStats* stats) { stats->Clear(); }
+absl::optional<AllocatorStats> XlaDeviceAllocator::GetStats() {
+  absl::optional<stream_executor::AllocatorStats> se_stats =
+      stream_executor_->GetAllocatorStats();
+  if (!se_stats) {
+    return absl::nullopt;
+  }
+
+  tensorflow::AllocatorStats tf_stats;
+  tf_stats.num_allocs = se_stats->num_allocs;
+  tf_stats.bytes_in_use = se_stats->bytes_in_use;
+  tf_stats.peak_bytes_in_use = se_stats->peak_bytes_in_use;
+  tf_stats.largest_alloc_size = se_stats->largest_alloc_size;
+  tf_stats.bytes_limit = se_stats->bytes_limit;
+  return tf_stats;
+}
 
 XlaDeviceContext::XlaDeviceContext(
     std::shared_ptr<se::Stream> compute_stream,
@@ -79,6 +96,13 @@ XlaDeviceContext::XlaDeviceContext(
   }
 }
 
+void XlaDeviceContext::CopyTensorInSameDevice(const Tensor* input_tensor,
+                                              Device* device,
+                                              Tensor* output_tensor,
+                                              StatusCallback done) const {
+  done(errors::Unimplemented("XLA->XLA same-device copies not implemented."));
+}
+
 void XlaDeviceContext::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
                                              Device* device,
                                              Tensor* device_tensor,
@@ -124,7 +148,7 @@ void XlaDeviceContext::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
         xla::ShapeUtil::MakeShape(shape.element_type(),
                                   xla::AsInt64Slice(shape.dimensions())));
 
-    VLOG(1) << "Transfer to device as literal: " << literal.ToString() << " "
+    VLOG(2) << "Transfer to device as literal: " << literal.ToString() << " "
             << xla_tensor->shaped_buffer().ToString();
     if (UseMultipleStreams() &&
         !transfer_manager_->CanShapedBufferBeAccessedNow(
@@ -207,7 +231,7 @@ void XlaDeviceContext::CopyDeviceTensorToCPU(const Tensor* device_tensor,
       device_to_host_stream_.get(), xla_tensor->shaped_buffer(), literal,
       [ref, xla_tensor, done](xla::Status status) {
         done([&]() -> Status {
-          VLOG(1) << "Transfer from device as literal: "
+          VLOG(2) << "Transfer from device as literal: "
                   << xla_tensor->shaped_buffer().ToString();
           return status;
         }());
diff --git a/tensorflow/compiler/jit/xla_device_context.h b/tensorflow/compiler/jit/xla_device_context.h
index 1e18df197a2dd65590c5181b4dae4481dca36641..1ce64ad323b4827adc2f4d48841315fbde43e532 100644
--- a/tensorflow/compiler/jit/xla_device_context.h
+++ b/tensorflow/compiler/jit/xla_device_context.h
@@ -34,14 +34,18 @@ namespace tensorflow {
 // empty, XlaTensor.
 class XlaDeviceAllocator : public Allocator {
  public:
-  XlaDeviceAllocator();
+  XlaDeviceAllocator(se::StreamExecutor* stream_executor);
   ~XlaDeviceAllocator() override;
 
   string Name() override;
 
   void* AllocateRaw(size_t alignment, size_t num_bytes) override;
   void DeallocateRaw(void* ptr) override;
-  void GetStats(AllocatorStats* stats) override;
+  absl::optional<AllocatorStats> GetStats() override;
+
+ private:
+  // The stream executor of the device.
+  se::StreamExecutor* stream_executor_;
 };
 
 // Helper class for managing data transfers between host and XLA devices.
@@ -62,6 +66,9 @@ class XlaDeviceContext : public DeviceContext {
   void CopyDeviceTensorToCPU(const Tensor* device_tensor,
                              absl::string_view tensor_name, Device* device,
                              Tensor* cpu_tensor, StatusCallback done) override;
+  void CopyTensorInSameDevice(const Tensor* input_tensor, Device* device,
+                              Tensor* output_tensor,
+                              StatusCallback done) const override;
 
   xla::LocalClient* client() const { return client_; }
   se::Stream* stream() const { return stream_.get(); }
diff --git a/tensorflow/compiler/jit/xla_device_ops.h b/tensorflow/compiler/jit/xla_device_ops.h
index 927f983ba9ef23c8509523f42366c0c89c29db9f..09e04d22def9c39f45c2737c1d4a5e7787e3fdc0 100644
--- a/tensorflow/compiler/jit/xla_device_ops.h
+++ b/tensorflow/compiler/jit/xla_device_ops.h
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/control_flow_ops.h"
 #include "tensorflow/core/kernels/data/generator_dataset_op.h"
 #include "tensorflow/core/kernels/data/iterator_ops.h"
+#include "tensorflow/core/kernels/data/optional_ops.h"
 #include "tensorflow/core/kernels/data/prefetch_dataset_op.h"
 #include "tensorflow/core/kernels/fifo_queue.h"
 #include "tensorflow/core/kernels/function_ops.h"
@@ -241,6 +242,8 @@ class XlaAssignVariableOp : public OpKernel {
                           data::AnonymousIteratorHandleOp);                    \
   REGISTER_KERNEL_BUILDER(Name("IteratorGetNext").Device(DEVICE),              \
                           data::IteratorGetNextOp);                            \
+  REGISTER_KERNEL_BUILDER(Name("IteratorGetNextAsOptional").Device(DEVICE),    \
+                          data::IteratorGetNextAsOptionalOp);                  \
   REGISTER_KERNEL_BUILDER(Name("IteratorGetNextSync").Device(DEVICE),          \
                           data::IteratorGetNextSyncOp);                        \
   REGISTER_KERNEL_BUILDER(Name("IteratorToStringHandle")                       \
@@ -251,6 +254,15 @@ class XlaAssignVariableOp : public OpKernel {
                               .Device(DEVICE)                                  \
                               .HostMemory("string_handle"),                    \
                           data::IteratorFromStringHandleOp);                   \
+  REGISTER_KERNEL_BUILDER(Name("OptionalNone").Device(DEVICE),                 \
+                          data::OptionalNoneOp);                               \
+  REGISTER_KERNEL_BUILDER(Name("OptionalFromValue").Device(DEVICE),            \
+                          data::OptionalFromValueOp);                          \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("OptionalHasValue").Device(DEVICE).HostMemory("has_value"),         \
+      data::OptionalHasValueOp);                                               \
+  REGISTER_KERNEL_BUILDER(Name("OptionalGetValue").Device(DEVICE),             \
+                          data::OptionalGetValueOp);                           \
   REGISTER_KERNEL_BUILDER(Name(FunctionLibraryDefinition::kArgOp)              \
                               .Device(DEVICE)                                  \
                               .HostMemory("output")                            \
diff --git a/tensorflow/compiler/jit/xla_gpu_device.cc b/tensorflow/compiler/jit/xla_gpu_device.cc
index 0191315a66f4d331e54fadc9dc6a073a05fd67ef..b29f6a009b9e9fdba76ac55386a4bec2f339cc0e 100644
--- a/tensorflow/compiler/jit/xla_gpu_device.cc
+++ b/tensorflow/compiler/jit/xla_gpu_device.cc
@@ -29,6 +29,30 @@ limitations under the License.
 
 namespace tensorflow {
 
+// Returns a set containing the device ids contained in visible_device_list or
+// nullopt if it is empty. It returns error in case of malformed configuration
+// string.
+static xla::StatusOr<absl::optional<std::set<int>>> ParseVisibleDeviceList(
+    const string& visible_device_list) {
+  std::set<int> gpu_ids;
+  if (visible_device_list.empty()) {
+    return {{absl::nullopt}};
+  }
+  const std::vector<string> visible_devices =
+      absl::StrSplit(visible_device_list, ',');
+  for (const string& platform_gpu_id_str : visible_devices) {
+    int32 platform_gpu_id;
+    if (!absl::SimpleAtoi(platform_gpu_id_str, &platform_gpu_id)) {
+      return errors::InvalidArgument(
+          "Could not parse entry in 'visible_device_list': '",
+          platform_gpu_id_str,
+          "'. visible_device_list = ", visible_device_list);
+    }
+    gpu_ids.insert(platform_gpu_id);
+  }
+  return {{gpu_ids}};
+}
+
 class XlaGpuDeviceFactory : public DeviceFactory {
  public:
   Status CreateDevices(const SessionOptions& options, const string& name_prefix,
@@ -57,33 +81,16 @@ Status XlaGpuDeviceFactory::CreateDevices(
   }
   string allowed_gpus =
       session_options.config.gpu_options().visible_device_list();
-  std::set<int> gpu_ids;
-  int num_visible_devices = platform.ValueOrDie()->VisibleDeviceCount();
-  if (allowed_gpus.empty()) {
-    for (int i = 0; i < num_visible_devices; ++i) {
-      gpu_ids.insert(i);
-    }
-  } else {
-    // For loop below is copied from gpu/gpu_device.cc. It validates
-    // the visible_device_list and populates gpu_ids set.
-    const std::vector<string> visible_devices =
-        absl::StrSplit(allowed_gpus, ',');
-    for (const string& platform_gpu_id_str : visible_devices) {
-      int32 platform_gpu_id;
-      if (!absl::SimpleAtoi(platform_gpu_id_str, &platform_gpu_id)) {
-        return errors::InvalidArgument(
-            "Could not parse entry in 'visible_device_list': '",
-            platform_gpu_id_str, "'. visible_device_list = ", allowed_gpus);
-      }
-      if (platform_gpu_id < 0 || platform_gpu_id >= num_visible_devices) {
-        return errors::InvalidArgument(
-            "'visible_device_list' listed an invalid GPU id '", platform_gpu_id,
-            "' but visible device count is ", num_visible_devices);
-      }
-      gpu_ids.insert(platform_gpu_id);
+  absl::optional<std::set<int>> gpu_ids =
+      ParseVisibleDeviceList(allowed_gpus).ValueOrDie();
+  if (!gpu_ids) {
+    gpu_ids.emplace();
+    // Fill the gpu_ids set with all devices if config string is empty.
+    for (int i = 0; i < platform.ValueOrDie()->VisibleDeviceCount(); ++i) {
+      gpu_ids->insert(i);
     }
   }
-  for (int i : gpu_ids) {
+  for (int i : *gpu_ids) {
     XlaDevice::Options options;
     options.platform = platform.ValueOrDie();
     options.device_name_prefix = name_prefix;
@@ -91,6 +98,7 @@ Status XlaGpuDeviceFactory::CreateDevices(
     options.device_ordinal = i;
     options.compilation_device_name = DEVICE_GPU_XLA_JIT;
     options.use_multiple_streams = true;
+    options.allowed_devices = gpu_ids;
     auto device = absl::make_unique<XlaDevice>(session_options, options);
 
     Status status = device->UseGpuDeviceInfo();
diff --git a/tensorflow/compiler/jit/xla_interpreter_device.cc b/tensorflow/compiler/jit/xla_interpreter_device.cc
index 4007309ed1c57b663dca5bac0df11260bf1327f3..e1a582406153d2af447fa9d4ebcaf0bf0842b132 100644
--- a/tensorflow/compiler/jit/xla_interpreter_device.cc
+++ b/tensorflow/compiler/jit/xla_interpreter_device.cc
@@ -26,9 +26,9 @@ namespace tensorflow {
 const char* const DEVICE_XLA_INTERPRETER = "XLA_INTERPRETER";
 const char* const DEVICE_INTERPRETER_XLA_JIT = "XLA_INTERPRETER_JIT";
 
-constexpr std::array<DataType, 9> kExecAllTypes = {
+constexpr std::array<DataType, 10> kExecAllTypes = {
     {DT_INT8, DT_INT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64,
-     DT_BOOL, DT_BFLOAT16}};
+     DT_COMPLEX128, DT_BOOL, DT_BFLOAT16}};
 
 class XlaInterpreterDeviceFactory : public DeviceFactory {
  public:
diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc
index 3b0bda4caa161a7561a3098b89420329998ff8a7..c64981053fad2dbf1e8bcd623a940ded8b4d9150 100644
--- a/tensorflow/compiler/jit/xla_launch_util.cc
+++ b/tensorflow/compiler/jit/xla_launch_util.cc
@@ -237,7 +237,7 @@ void XlaComputationLaunchContext::PopulateInputs(
 
     const xla::Shape on_device_shape =
         client_->backend().transfer_manager()->HostShapeToDeviceShape(shape);
-    if (xla::ShapeUtil::IsTuple(on_device_shape)) {
+    if (on_device_shape.IsTuple()) {
       const XlaTensor* xla_tensor = XlaTensor::FromTensor(t);
       CHECK(xla_tensor && xla_tensor->has_shaped_buffer());
       arg_ptrs_[i] = const_cast<ShapedBuffer*>(&xla_tensor->shaped_buffer());
@@ -274,7 +274,7 @@ Status XlaComputationLaunchContext::PopulateOutputs(
   // If the on-host-shape isn't a tuple, create a new single-element tuple
   // buffer with a nullptr root index table. This allows the code below to treat
   // output as a tuple unconditionally.
-  if (!xla::ShapeUtil::IsTuple(output.on_host_shape())) {
+  if (!output.on_host_shape().IsTuple()) {
     ShapedBuffer nontuple_buffer = output.release();
     ShapedBuffer buffer(
         xla::ShapeUtil::MakeTupleShape({nontuple_buffer.on_host_shape()}),
@@ -377,7 +377,7 @@ Status XlaComputationLaunchContext::PopulateOutputs(
     }
 
     if (VLOG_IS_ON(3)) {
-      VLOG(3) << ctx->mutable_output(i)->DebugString();
+      VLOG(3) << ctx->mutable_output(i)->DeviceSafeDebugString();
     }
   }
 
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index 093b61629cd0b04d5d8488139b8d7262b739f86d..7c1e0daf0b7b418530367cb80fbd18b93e8e5f5e 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -72,7 +72,7 @@ py_test(
 
 tf_xla_py_test(
     name = "adadelta_test",
-    size = "large",
+    size = "medium",
     srcs = ["adadelta_test.py"],
     deps = [
         ":xla_test",
@@ -230,6 +230,7 @@ tf_xla_py_test(
         "//tensorflow/python:framework",
         "//tensorflow/python:platform_test",
         "//tensorflow/python:random_ops",
+        "//tensorflow/python:standard_ops",
     ],
 )
 
@@ -242,9 +243,33 @@ tf_xla_py_test(
         ":xla_test",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:framework",
+        "//tensorflow/python:map_fn",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:training",
+    ],
+)
+
+tf_xla_py_test(
+    name = "self_adjoint_eig_op_test",
+    size = "medium",
+    srcs = ["self_adjoint_eig_op_test.py"],
+    # TODO(kuny): remove it after b/124377352 is fixed.
+    disabled_backends = [
+        "cpu",
+        "gpu",
+        "cpu_ondemand",
+    ],
+    tags = ["optonly"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:map_fn",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
         "//tensorflow/python:training",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -277,10 +302,9 @@ tf_xla_py_test(
     ],
 )
 
-# This test is large because occasionally the cpu test is long for testConcatLargeNumberOfTensors
 tf_xla_py_test(
     name = "concat_ops_test",
-    size = "large",
+    size = "medium",
     srcs = ["concat_ops_test.py"],
     deps = [
         ":xla_test",
@@ -406,7 +430,7 @@ tf_xla_py_test(
 
 tf_xla_py_test(
     name = "eager_test",
-    size = "large",
+    size = "medium",
     srcs = ["eager_test.py"],
     deps = [
         ":xla_test",
@@ -677,6 +701,7 @@ tf_xla_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
         "//tensorflow/python:random_ops",
+        "//tensorflow/python:standard_ops",
     ],
 )
 
@@ -826,6 +851,7 @@ tf_xla_py_test(
         ":xla_test",
         "//tensorflow/python:framework",
         "//tensorflow/python:platform_test",
+        "//tensorflow/python:standard_ops",
         "//tensorflow/python:stateless_random_ops",
     ],
 )
@@ -1188,11 +1214,18 @@ tf_xla_py_test(
 
 tf_xla_py_test(
     name = "quantized_ops_test",
-    size = "small",
+    size = "medium",
     srcs = ["quantized_ops_test.py"],
+    disabled_backends = [
+        "cpu",
+        "cpu_ondemand",
+    ],
     deps = [
         ":xla_test",
+        "//tensorflow/compiler/tf2xla/python:xla",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:bitwise_ops",
+        "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
diff --git a/tensorflow/compiler/tests/adadelta_test.py b/tensorflow/compiler/tests/adadelta_test.py
index b7b7fda293b69d6f0cec61d0d234277636a3670d..6cf16cc07ff503c4f3e008cfb720224abe5e9166 100644
--- a/tensorflow/compiler/tests/adadelta_test.py
+++ b/tensorflow/compiler/tests/adadelta_test.py
@@ -32,10 +32,18 @@ class AdadeltaOptimizerTest(xla_test.XLATestCase):
 
   def testBasic(self):
     num_updates = 4  # number of ADADELTA steps to perform
+    if "CPU" in self.device:
+      # To avoid timeout on CPU.
+      all_grad = [0.2, 0.01]
+      all_lr = [1.0, 0.1]
+    else:
+      all_grad = [0.2, 0.1, 0.01]
+      all_lr = [1.0, 0.5, 0.1]
+
     for dtype in self.float_types:
       with self.cached_session(), self.test_scope():
-        for grad in [0.2, 0.1, 0.01]:
-          for lr in [1.0, 0.5, 0.1]:
+        for grad in all_grad:
+          for lr in all_lr:
             var0_init = [1.0, 2.0]
             var1_init = [3.0, 4.0]
             var0 = resource_variable_ops.ResourceVariable(
diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py
index 9a5423c1b2a5df7880453cbb328f6a8174066255..c829c50b5518b29c96c0b0117a6cd143911bd1fc 100644
--- a/tensorflow/compiler/tests/binary_ops_test.py
+++ b/tensorflow/compiler/tests/binary_ops_test.py
@@ -311,6 +311,30 @@ class BinaryOpsTest(xla_test.XLATestCase):
           dtype(7),
           expected=np.array([[-6], [-5]], dtype=dtype))
 
+      if dtype in [np.float32, np.float64]:
+        x = np.array([
+            -0.0, 0.0, -0.0, +0.0, np.inf, np.inf, -np.inf, -np.inf, 2.0, 2.0,
+            1.0
+        ],
+                     dtype=dtype)
+        y = np.array(
+            [-0.0, 0.0, +0.0, -0.0, 1.0, -1.0, 1.0, -1.0, 2.0, 1.0, 2.0],
+            dtype=dtype)
+        expected = np.nextafter(x, y)
+
+        # We use assertAllEqual to expose any bugs hidden by relative or
+        # absolute error tolerances.
+        def NextAfterEqualityTest(result, expected, rtol):
+          del rtol
+          return self.assertAllEqual(result, expected)
+
+        self._testBinary(
+            math_ops.nextafter,
+            x,
+            y,
+            expected=expected,
+            equality_test=NextAfterEqualityTest)
+
       # min/max not supported for complex
       if dtype not in self.complex_types | {np.uint8, np.int8}:
         self._testBinary(
@@ -400,7 +424,7 @@ class BinaryOpsTest(xla_test.XLATestCase):
 
   def testComplexOps(self):
     for dtype in self.complex_types:
-      ctypes = {np.complex64: np.float32}
+      ctypes = {np.complex64: np.float32, np.complex128: np.float64}
       self._testBinary(
           math_ops.complex,
           np.array([[[[-1, 2], [2, 0]]]], dtype=ctypes[dtype]),
diff --git a/tensorflow/compiler/tests/build_defs.bzl b/tensorflow/compiler/tests/build_defs.bzl
index 447a7de2cb6526a5dcf7789d4f2bffb5e733e8c0..ed580f95b6c2f57dfdf46cfcd64cabb452980c5d 100644
--- a/tensorflow/compiler/tests/build_defs.bzl
+++ b/tensorflow/compiler/tests/build_defs.bzl
@@ -5,6 +5,7 @@ load("//tensorflow/compiler/tests:plugin.bzl", "plugins")
 load(
     "//tensorflow/core:platform/default/build_config_root.bzl",
     "tf_cuda_tests_tags",
+    "tf_exec_compatible_with",
 )
 
 def all_backends():
@@ -64,7 +65,7 @@ def tf_xla_py_test(
         if backend == "cpu":
             backend_args += [
                 "--test_device=XLA_CPU",
-                "--types=DT_HALF,DT_FLOAT,DT_DOUBLE,DT_UINT8,DT_QUINT8,DT_INT8,DT_QINT8,DT_INT32,DT_QINT32,DT_INT64,DT_BOOL,DT_COMPLEX64",
+                "--types=DT_HALF,DT_FLOAT,DT_DOUBLE,DT_UINT8,DT_QUINT8,DT_INT8,DT_QINT8,DT_INT32,DT_QINT32,DT_INT64,DT_BOOL,DT_COMPLEX64,DT_COMPLEX128",
             ]
         elif backend == "gpu":
             backend_args += [
@@ -84,6 +85,7 @@ def tf_xla_py_test(
         else:
             fail("Unknown backend {}".format(backend))
 
+        test_tags = tags + backend_tags
         native.py_test(
             name = test_name,
             srcs = srcs,
@@ -92,7 +94,8 @@ def tf_xla_py_test(
             main = "{}.py".format(name) if main == None else main,
             data = data + backend_data,
             deps = deps + backend_deps,
-            tags = tags + backend_tags,
+            tags = test_tags,
+            exec_compatible_with = tf_exec_compatible_with({"tags": test_tags}),
             **kwargs
         )
         test_names.append(test_name)
diff --git a/tensorflow/compiler/tests/categorical_op_test.py b/tensorflow/compiler/tests/categorical_op_test.py
index 5d5e486f616937601214aa169a4c329ab78932c8..eec69ea7d2d9af9ff570f927fb25b668ccce2b97 100644
--- a/tensorflow/compiler/tests/categorical_op_test.py
+++ b/tensorflow/compiler/tests/categorical_op_test.py
@@ -119,7 +119,7 @@ class CategoricalTest(xla_test.XLATestCase):
 
   def testSamplingCorrectness(self):
     np.random.seed(1618)  # Make it reproducible.
-    num_samples = 21000
+    num_samples = 40000
 
     rand_probs = np.random.dirichlet([1., 1., 2., 3.])
     rand_probs2 = np.random.dirichlet([1., 4., 5.], size=3)  # batched
diff --git a/tensorflow/compiler/tests/concat_ops_test.py b/tensorflow/compiler/tests/concat_ops_test.py
index 2187f57960f80300d631bdc7eb8fe5e9c8dddeea..76750decd2963ea12680a46d7340f48e8b011fa9 100644
--- a/tensorflow/compiler/tests/concat_ops_test.py
+++ b/tensorflow/compiler/tests/concat_ops_test.py
@@ -294,6 +294,9 @@ class ConcatTest(xla_test.XLATestCase):
   # The purpose of this is to ensure that XLA on GPU will not run out of memory
   # with too many arguments.
   def testConcatLargeNumberOfTensors(self):
+    if "CPU" in self.device:
+      self.skipTest("This test can time out on CPU, so we will just allow "
+                    "other backends to catch this specific error.")
     with self.cached_session():
       with self.test_scope():
         for concat_dim in range(2):
diff --git a/tensorflow/compiler/tests/dense_layer_test.py b/tensorflow/compiler/tests/dense_layer_test.py
index bf5ea7b1fb6fb3c774c4db20d059f131990d20d3..b7d08df9f7d144b71fd0b09535e10b8f596ea6ca 100644
--- a/tensorflow/compiler/tests/dense_layer_test.py
+++ b/tensorflow/compiler/tests/dense_layer_test.py
@@ -72,7 +72,7 @@ class DenseLayerTest(test.TestCase):
       x = array_ops.placeholder(shape=[None, None, 3], dtype=np.float32)
       y = layers.dense(x, 3)
 
-      self.evaluate(variables.initialize_all_variables())
+      self.evaluate(variables.global_variables_initializer())
       run_metadata = config_pb2.RunMetadata()
       test_utils.RunWithWarmup(
           sess,
@@ -97,7 +97,7 @@ class DenseLayerTest(test.TestCase):
       with jit_scope():
         y = layers.dense(x, 3)
 
-      self.evaluate(variables.initialize_all_variables())
+      self.evaluate(variables.global_variables_initializer())
       run_metadata = config_pb2.RunMetadata()
       test_utils.RunWithWarmup(
           sess,
@@ -126,7 +126,7 @@ class DenseLayerTest(test.TestCase):
       with jit_scope():
         y = layers.dense(x, 3)
 
-      self.evaluate(variables.initialize_all_variables())
+      self.evaluate(variables.global_variables_initializer())
       run_metadata = config_pb2.RunMetadata()
       test_utils.RunWithWarmup(
           sess,
diff --git a/tensorflow/compiler/tests/depthwise_conv_op_test.py b/tensorflow/compiler/tests/depthwise_conv_op_test.py
index 174bfa9efbcd7dcb4f895237eb01c17bc4a3a6b4..90146e6b27ca31304a2549ec247412341efe390c 100644
--- a/tensorflow/compiler/tests/depthwise_conv_op_test.py
+++ b/tensorflow/compiler/tests/depthwise_conv_op_test.py
@@ -350,8 +350,13 @@ class DepthwiseConv2DTest(xla_test.XLATestCase):
       self._CompareBackpropInput(input_size, filter_size, output_size, stride,
                                  padding)
 
-  def _CompareBackpropFilter(self, input_sizes, filter_sizes, output_sizes,
-                             stride, padding):
+  def _CompareBackpropFilter(self,
+                             input_sizes,
+                             filter_sizes,
+                             output_sizes,
+                             stride,
+                             padding,
+                             data_format="NHWC"):
     x0 = np.random.rand(*input_sizes).astype(np.float32)
     x2 = np.random.rand(*output_sizes).astype(np.float32)
 
@@ -360,13 +365,30 @@ class DepthwiseConv2DTest(xla_test.XLATestCase):
         t0 = array_ops.placeholder(np.float32, shape=input_sizes)
         t1 = constant_op.constant(filter_sizes, shape=[len(filter_sizes)])
         t2 = array_ops.placeholder(np.float32, shape=output_sizes)
+        native_t0 = t0
+        native_t2 = t2
+        strides = [1, stride, stride, 1]
+
         if use_xla:
+          if data_format == "NCHW":
+            # Transpose from NWHC input to NCHW
+            # Ex. [4, 5, 5, 48] to [4, 48, 5, 5]
+            native_t0 = array_ops.transpose(t0, [0, 3, 1, 2])
+            native_t2 = array_ops.transpose(t2, [0, 3, 1, 2])
+            strides = [1, 1, stride, stride]
           with self.test_scope():
             backprop = nn_ops.depthwise_conv2d_native_backprop_filter(
-                t0, t1, t2, strides=[1, stride, stride, 1], padding=padding)
+                native_t0,
+                t1,
+                native_t2,
+                strides=strides,
+                padding=padding,
+                data_format=data_format)
         else:
+          # For CPU, the format NCHW is not supported. Therefore we always use
+          # NHWC here.
           backprop = nn_ops.depthwise_conv2d_native_backprop_filter(
-              t0, t1, t2, strides=[1, stride, stride, 1], padding=padding)
+              native_t0, t1, native_t2, strides=strides, padding=padding)
         ret = backprop.eval({t0: x0, t2: x2})
         self.assertShapeEqual(ret, backprop)
         return ret
@@ -379,11 +401,24 @@ class DepthwiseConv2DTest(xla_test.XLATestCase):
     for index, (input_size, filter_size, output_size, stride,
                 padding) in enumerate(ConfigsToTest()):
       print("Testing DepthwiseConv2DFilterGradCompare,", index, "th config:",
-            input_size, "*", filter_size, "stride:", stride, "padding:",
-            padding)
+            input_size, "*", filter_size, "producing output", output_size,
+            "stride:", stride, "padding:", padding)
       self._CompareBackpropFilter(input_size, filter_size, output_size,
                                   stride, padding)
 
+  def testDepthwiseConv2DFilterGradFormatNCHWCompare(self):
+    for index, (input_size, filter_size, output_size, stride,
+                padding) in enumerate(ConfigsToTest()):
+      print("Testing DepthwiseConv2DFilterGradFormatNCHWCompare,", index,
+            "th config:", input_size, "*", filter_size, "producing output",
+            output_size, "stride:", stride, "padding:", padding)
+      self._CompareBackpropFilter(
+          input_size,
+          filter_size,
+          output_size,
+          stride,
+          padding,
+          data_format="NCHW")
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/compiler/tests/eager_test.py b/tensorflow/compiler/tests/eager_test.py
index 2af32b537ba53723370faf81aebf308a465718c7..632eccbb097b4e84f10f926e89d7fa439c8a38cd 100644
--- a/tensorflow/compiler/tests/eager_test.py
+++ b/tensorflow/compiler/tests/eager_test.py
@@ -24,6 +24,7 @@ from tensorflow.compiler.tests import xla_test
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -31,7 +32,9 @@ from tensorflow.python.framework import ops
 from tensorflow.python.layers import convolutional
 from tensorflow.python.layers import pooling
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gen_random_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
@@ -463,7 +466,7 @@ class EagerFunctionTest(xla_test.XLATestCase):
       def f(x, y):
         return x[0::2, y:, ...]
 
-      x = array_ops.ones([2, 3, 4])
+      x = array_ops.ones([2, 3, 4], dtype=dtypes.float32)
       y = array_ops.ones([], dtype=dtypes.int32)
       with backprop.GradientTape() as tape:
         tape.watch(x)
@@ -479,15 +482,15 @@ class EagerFunctionTest(xla_test.XLATestCase):
 
       @function.defun
       def times_two(x):
-        return 2 * x
+        return 2. * x
 
       @function.defun
       def two_x_plus_1(x):
-        return times_two(x) + 1
+        return times_two(x) + 1.
 
-      x = constant_op.constant([2, 3, 4])
+      x = constant_op.constant([2., 3., 4.])
       y = two_x_plus_1(x)
-      self.assertAllEqual([5, 7, 9], y.numpy())
+      self.assertAllEqual([5., 7., 9.], y.numpy())
 
   def testNestedDefunWithVariable(self):
     with self.test_scope():
@@ -506,7 +509,7 @@ class EagerFunctionTest(xla_test.XLATestCase):
       x = constant_op.constant(3.0)
       y = f(x)
 
-    self.assertEqual(75, y.numpy())
+    self.assertEqual(75.0, y.numpy())
 
   def testNestedDefunInGradientTape(self):
     with self.test_scope():
@@ -555,6 +558,71 @@ class EagerFunctionTest(xla_test.XLATestCase):
     self.assertEqual(9, dy_v0.numpy())
     self.assertEqual(15, dy_v1.numpy())
 
+  def testWhileInDefun(self):
+    with self.test_scope():
+      @def_function.function
+      def f(start):
+        c = lambda x: math_ops.less(x, 13.0)
+        b = lambda x: math_ops.add(x, 1.0)
+        return control_flow_ops.while_loop(c, b, [start])
+
+      y = f(constant_op.constant(3.0))
+    self.assertEqual(13.0, y.numpy())
+
+  def testAutoGraphWhileInDefun(self):
+    with self.test_scope():
+      @def_function.function
+      def f(start):
+        x = start
+        while x < 13.0:
+          x += 1.0
+        return x
+
+      y = f(constant_op.constant(3.0))
+    self.assertEqual(13.0, y.numpy())
+
+  def testCondInDefun(self):
+    with self.test_scope():
+      @def_function.function
+      def f(pred, value):
+        fn1 = lambda: math_ops.add(value, 1.0)
+        fn2 = lambda: math_ops.subtract(value, 1.0)
+        return control_flow_ops.cond(pred, fn1, fn2)
+
+      plus_one = f(constant_op.constant(True), constant_op.constant(10.0))
+      minus_one = f(constant_op.constant(False), constant_op.constant(10.0))
+    self.assertEqual(11.0, plus_one.numpy())
+    self.assertEqual(9.0, minus_one.numpy())
+
+  def testAutoGraphCondInDefun(self):
+    with self.test_scope():
+      @def_function.function
+      def f(pred, value):
+        if pred:
+          return value + 1.0
+        else:
+          return value - 1.0
+
+      plus_one = f(constant_op.constant(True), constant_op.constant(10.0))
+      minus_one = f(constant_op.constant(False), constant_op.constant(10.0))
+    self.assertEqual(11.0, plus_one.numpy())
+    self.assertEqual(9.0, minus_one.numpy())
+
+  def testScanInDefun(self):
+    with self.test_scope():
+      elems = constant_op.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name='data')
+      v = constant_op.constant(2.0, name='v')
+
+      @def_function.function
+      def f(y):
+        # pylint: disable=unnecessary-lambda
+        return functional_ops.scan(
+            lambda a, x: math_ops.multiply(a, x), y, initializer=v)
+        # pylint: enable=unnecessary-lambda
+
+      r = f(elems)
+      self.assertAllEqual([2., 4., 12., 48., 240., 1440.], self.evaluate(r))
+
 
 class ExcessivePaddingTest(xla_test.XLATestCase):
   """Test that eager execution works with TPU flattened tensors.
diff --git a/tensorflow/compiler/tests/fused_batchnorm_test.py b/tensorflow/compiler/tests/fused_batchnorm_test.py
index 374942a0b339b816944ea5529e4f84134b60017b..56a8e1b1667f154f6cec475ee0f4f8b308121c09 100644
--- a/tensorflow/compiler/tests/fused_batchnorm_test.py
+++ b/tensorflow/compiler/tests/fused_batchnorm_test.py
@@ -191,6 +191,20 @@ class FusedBatchNormTest(xla_test.XLATestCase, parameterized.TestCase):
     mean_val = np.random.random_sample(scale_shape).astype(np.float32)
     var_val = np.random.random_sample(scale_shape).astype(np.float32)
     epsilon = 0.001
+
+    # The TensorFlow FusedBatchNormGrad training operation takes two inputs with
+    # implementation defined values.  In theory the only correct value these
+    # inputs are the corresponding reserve_space_{1|2} outputs from the
+    # FusedBatchNorm training operation.  However, in practice, we rely on the
+    # first one being mean on {C|G}PU, and the second one being variance on CPU
+    # and inverse(sqrt(variance + epsilon)) on GPU (we test this assumption
+    # separately).
+    reserve_space_1_val = mean_val
+    if self.device == "XLA_GPU":
+      reserve_space_2_val = np.reciprocal(np.sqrt(var_val + epsilon))
+    else:
+      reserve_space_2_val = var_val
+
     data_format_src = "NHWC"
     grad_x_ref, grad_scale_ref, grad_offset_ref = self._reference_grad(
         x_val, grad_val, scale_val, mean_val, var_val, epsilon, data_format_src)
@@ -207,18 +221,26 @@ class FusedBatchNormTest(xla_test.XLATestCase, parameterized.TestCase):
           np.float32, shape=x_val_converted.shape, name="grad")
       x = array_ops.placeholder(
           np.float32, shape=x_val_converted.shape, name="x")
-      mean = array_ops.placeholder(np.float32, shape=scale_shape, name="mean")
-      var = array_ops.placeholder(np.float32, shape=scale_shape, name="var")
+      reserve_space_1 = array_ops.placeholder(
+          np.float32, shape=scale_shape, name="reserve_space_1")
+      reserve_space_2 = array_ops.placeholder(
+          np.float32, shape=scale_shape, name="reserve_space_2")
       scale = array_ops.placeholder(np.float32, shape=scale_shape, name="scale")
       grad_x, grad_scale, grad_offset, _, _ = gen_nn_ops.fused_batch_norm_grad(
-          grad, x, scale, mean, var, data_format=data_format, is_training=True)
+          grad,
+          x,
+          scale,
+          reserve_space_1,
+          reserve_space_2,
+          data_format=data_format,
+          is_training=True)
 
       grad_x_val, grad_scale_val, grad_offset_val = sess.run(
           [grad_x, grad_scale, grad_offset], {
               grad: grad_val_converted,
               x: x_val_converted,
-              mean: mean_val,
-              var: var_val,
+              reserve_space_1: reserve_space_1_val,
+              reserve_space_2: reserve_space_2_val,
               scale: scale_val
           })
 
diff --git a/tensorflow/compiler/tests/image_ops_test.py b/tensorflow/compiler/tests/image_ops_test.py
index 0e2d840418156d825e2d141018e49f42374c8fee..42e688174fce9e939feb09e1767ebab31e30a6ee 100644
--- a/tensorflow/compiler/tests/image_ops_test.py
+++ b/tensorflow/compiler/tests/image_ops_test.py
@@ -403,6 +403,117 @@ class AdjustSaturationTest(xla_test.XLATestCase):
           self.assertAllClose(y_fused, y_baseline, rtol=2e-5, atol=1e-5)
 
 
+class ResizeNearestNeighborTest(xla_test.XLATestCase):
+  # TODO(ilch): Wrap each test with `for dtype in self.float_types:`
+  # Some work to understand how that should be done was presented here:
+  # cl/227850213
+
+  def _assertForwardOpMatchesExpected(self,
+                                      image_np,
+                                      target_shape,
+                                      expected=None,
+                                      large_tolerance=False,
+                                      align_corners=True):
+    if expected is None:
+      self.fail("expected must be specified")
+    with self.cached_session() as sess, self.test_scope():
+      image = array_ops.placeholder(image_np.dtype)
+      resized = gen_image_ops.resize_nearest_neighbor(
+          image, target_shape, align_corners=align_corners)
+      out = sess.run(resized, {image: image_np[np.newaxis, :, :, np.newaxis]})
+      if large_tolerance:
+        self.assertAllClose(
+            expected[np.newaxis, :, :, np.newaxis], out, rtol=2e-4, atol=2e-4)
+      else:
+        self.assertAllClose(expected[np.newaxis, :, :, np.newaxis], out)
+
+  def testAlignCorners2x2To1x1(self):
+    self._assertForwardOpMatchesExpected(
+        np.array([[1, 2], [3, 4]], dtype=np.float32), [1, 1],
+        expected=np.array([[1]], dtype=np.float32))
+
+  def testAlignCorners1x1To2x2(self):
+    self._assertForwardOpMatchesExpected(
+        np.array([[1]], dtype=np.float32), [2, 2],
+        expected=np.array([[1, 1], [1, 1]], dtype=np.float32))
+
+  def testAlignCorners1x1To3x3(self):
+    self._assertForwardOpMatchesExpected(
+        np.array([[1]], dtype=np.float32), [3, 3],
+        expected=np.array([[1, 1, 1], [1, 1, 1], [1, 1, 1]], dtype=np.float32))
+
+  def testAlignCorners2x2To3x3(self):
+    self._assertForwardOpMatchesExpected(
+        np.array([[1, 2], [3, 4]], dtype=np.float32), [3, 3],
+        expected=np.array([[1, 2, 2], [3, 4, 4], [3, 4, 4]], dtype=np.float32))
+
+  def testAlignCorners2x2To4x4(self):
+    self._assertForwardOpMatchesExpected(
+        np.array([[1, 2], [3, 4]], dtype=np.float32), [4, 4],
+        expected=np.array(
+            [[1, 1, 2, 2], [1, 1, 2, 2], [3, 3, 4, 4], [3, 3, 4, 4]],
+            dtype=np.float32), large_tolerance=True)
+
+  def testAlignCorners3x3To2x2(self):
+    self._assertForwardOpMatchesExpected(
+        np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.float32), [2, 2],
+        expected=np.array([[1, 3], [7, 9]], dtype=np.float32))
+
+  def testAlignCorners4x4To3x3(self):
+    self._assertForwardOpMatchesExpected(
+        np.array(
+            [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]],
+            dtype=np.float32), [3, 3],
+        expected=np.array([[1, 3, 4], [9, 11, 12], [13, 15, 16]],
+                          dtype=np.float32))
+
+  def testAlignCorners3x3To4x4(self):
+    self._assertForwardOpMatchesExpected(
+        np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.float32), [4, 4],
+        expected=np.array(
+            [[1, 2, 2, 3], [4, 5, 5, 6], [4, 5, 5, 6], [7, 8, 8, 9]],
+            dtype=np.float32))
+
+  def testAlignCorners3x3To6x6(self):
+    self._assertForwardOpMatchesExpected(
+        np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.float32), [6, 6],
+        expected=np.array(
+            [[1, 1, 2, 2, 3, 3], [1, 1, 2, 2, 3, 3], [4, 4, 5, 5, 6, 6],
+             [4, 4, 5, 5, 6, 6], [7, 7, 8, 8, 9, 9], [7, 7, 8, 8, 9, 9]],
+            dtype=np.float32))
+
+  def testAlignCorners3x3To9x9(self):
+    # The expected matrix might look uneven in terms of how many of each number
+    # there is, but this is an artifact of doing the dilation and convolution
+    # iteratively. The behavior is less esoteric in the 3x3To12x12 case below.
+    self._assertForwardOpMatchesExpected(
+        np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.float32), [9, 9],
+        expected=np.array(
+            [[1, 2, 2, 2, 2, 3, 3, 3, 3], [4, 5, 5, 5, 5, 6, 6, 6, 6],
+             [4, 5, 5, 5, 5, 6, 6, 6, 6], [4, 5, 5, 5, 5, 6, 6, 6, 6],
+             [4, 5, 5, 5, 5, 6, 6, 6, 6], [7, 8, 8, 8, 8, 9, 9, 9, 9],
+             [7, 8, 8, 8, 8, 9, 9, 9, 9], [7, 8, 8, 8, 8, 9, 9, 9, 9],
+             [7, 8, 8, 8, 8, 9, 9, 9, 9]],
+            dtype=np.float32))
+
+  def testAlignCorners3x3To12x12(self):
+    self._assertForwardOpMatchesExpected(
+        np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.float32), [12, 12],
+        expected=np.array([[1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3],
+                           [1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3],
+                           [1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3],
+                           [4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6],
+                           [4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6],
+                           [4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6],
+                           [4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6],
+                           [4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6],
+                           [4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6],
+                           [7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9],
+                           [7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9],
+                           [7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9]],
+                          dtype=np.float32))
+
+
 class ResizeBilinearTest(xla_test.XLATestCase):
 
   def _assertForwardOpMatchesExpected(self,
@@ -444,14 +555,14 @@ class ResizeBilinearTest(xla_test.XLATestCase):
       self.assertAllCloseAccordingToType(expected[np.newaxis, :, :, np.newaxis],
                                          out)
 
-  def testAlignCorners1x2To3x2(self):
+  def testAlignCorners1x2To3x3(self):
     for dtype in self.float_types:
       self._assertForwardOpMatchesExpected(
           np.array([[1, 2]], dtype=dtype), [3, 3],
           expected=np.array([[1, 1.5, 2], [1, 1.5, 2], [1, 1.5, 2]],
                             dtype=np.float32))
 
-  def testAlignCorners1x2To3x2Grad(self):
+  def testAlignCorners1x2To3x3Grad(self):
     for dtype in self.float_types:
       self._assertBackwardOpMatchesExpected(
           np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float32),
diff --git a/tensorflow/compiler/tests/matrix_band_part_test.py b/tensorflow/compiler/tests/matrix_band_part_test.py
index c61965b97fc142ce452cf28def8c937f692d2f84..0eec070a906670ff36c772edda22f8291b5b734a 100644
--- a/tensorflow/compiler/tests/matrix_band_part_test.py
+++ b/tensorflow/compiler/tests/matrix_band_part_test.py
@@ -167,6 +167,11 @@ class MatrixBandPartTest(xla_test.XLATestCase, parameterized.TestCase):
       },
   )
   def testMatrixBandPart(self, batch_shape, rows, cols):
+    # TODO(b/125505881): Disabled due to LLVM backend crash.
+    if self.device == 'XLA_CPU' and cols == 7 and rows == 1 and batch_shape == [
+        1, 3, 2
+    ]:
+      pass
     for dtype in self.float_types:
       with self.cached_session():
         mat = np.ones(batch_shape + [rows, cols]).astype(dtype)
diff --git a/tensorflow/compiler/tests/plugin.bzl b/tensorflow/compiler/tests/plugin.bzl
index fbc8781a3e59faecf985cde5114bf56a041c4be0..46a854d1459b7ea9d9fe3cf7689faee557c2cf84 100644
--- a/tensorflow/compiler/tests/plugin.bzl
+++ b/tensorflow/compiler/tests/plugin.bzl
@@ -18,13 +18,12 @@
 #   git update-index --assume-unchanged tensorflow/compiler/tests/plugin.bzl
 
 plugins = {
-  #"example": {
-  #  "device":"XLA_MY_DEVICE",
-  #  "types":"DT_FLOAT,DT_HALF,DT_INT32",
-  #   "tags":[],
-  #   "args":["--disabled_manifest=tensorflow/compiler/plugin/example/disabled_manifest.txt"],
-  #   "data":["//tensorflow/compiler/plugin/example:disabled_manifest.txt"],
-  #   "deps":[],
-  #},
+    #"example": {
+    #  "device":"XLA_MY_DEVICE",
+    #  "types":"DT_FLOAT,DT_HALF,DT_INT32",
+    #   "tags":[],
+    #   "args":["--disabled_manifest=tensorflow/compiler/plugin/example/disabled_manifest.txt"],
+    #   "data":["//tensorflow/compiler/plugin/example:disabled_manifest.txt"],
+    #   "deps":[],
+    #},
 }
-
diff --git a/tensorflow/compiler/tests/quantized_ops_test.py b/tensorflow/compiler/tests/quantized_ops_test.py
index 80c338513bc9ff6b8e56c5ad6b904af9e06a3715..cd9b728ab314d29e4eb585e00a9131024ea3a207 100644
--- a/tensorflow/compiler/tests/quantized_ops_test.py
+++ b/tensorflow/compiler/tests/quantized_ops_test.py
@@ -18,11 +18,16 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import math
 import numpy as np
 
 from tensorflow.compiler.tests import xla_test
+from tensorflow.compiler.tf2xla.python import xla
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import bitwise_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import googletest
 
@@ -44,5 +49,55 @@ class QuantizedOpsTest(xla_test.XLATestCase):
         self.assertAllEqual(value, expected)
 
 
+class DeuantizedOpsTest(xla_test.XLATestCase):
+
+  def pack_uint8_r2_to_uint32(self, test_input):
+    num_rows, num_columns = test_input.get_shape().as_list()
+    num_output_columns = int(math.ceil(num_columns / 4.0))
+    padding_input = array_ops.pad(
+        math_ops.cast(test_input, dtype=dtypes.uint8),
+        constant_op.constant([[
+            0,
+            0,
+        ], [0, num_output_columns * 4 - num_columns]]))
+    output = array_ops.zeros([num_rows, num_output_columns],
+                             dtype=dtypes.uint32)
+    num_elements_per_pack = 4
+    shift_bits = 8
+
+    iota_r1 = math_ops.range(num_output_columns * num_elements_per_pack)
+
+    for p in range(num_elements_per_pack):
+      selected_index = math_ops.equal(
+          math_ops.mod(iota_r1, num_elements_per_pack), p)
+      gather_index = array_ops.boolean_mask(iota_r1, selected_index)
+      gathered_input = array_ops.gather(padding_input, gather_index, axis=1)
+      total_shift_bits = shift_bits * (num_elements_per_pack - p - 1)
+      left_shift_input = bitwise_ops.left_shift(
+          math_ops.cast(gathered_input, dtype=dtypes.uint32), total_shift_bits)
+      output = bitwise_ops.bitwise_or(output, left_shift_input)
+    return output
+
+  def testDequantizeQuint8(self):
+    num_rows = 100
+    num_columns = 3547
+    random_input = np.random.normal(128.0, 10.0, [num_rows, num_columns])
+    with self.cached_session() as session:
+      with ops.device("CPU"):
+        test_input = ops.convert_to_tensor(random_input, dtype=dtypes.float32)
+        transposed_input = array_ops.transpose(test_input, [1, 0])
+        quantized_input = array_ops.quantize(transposed_input, 0.0, 255.0,
+                                             dtypes.quint8)
+        packed_input = self.pack_uint8_r2_to_uint32(quantized_input.output)
+      with self.test_scope():
+        transposed_quantized_output = xla.dequantize(packed_input, 0.0, 255.0,
+                                                     "MIN_COMBINED", True)
+        quantized_output = array_ops.slice(transposed_quantized_output, [0, 0],
+                                           [num_rows, num_columns])
+
+    value = session.run(quantized_output)
+    self.assertAllClose(value, random_input, 1.0)
+
+
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/compiler/tests/random_ops_test.py b/tensorflow/compiler/tests/random_ops_test.py
index 97ffad34c00b8ec16eb1ec109ba5d980e0ce673d..34f2465ba63f235f893db9dd6930ac252c3e7226 100644
--- a/tensorflow/compiler/tests/random_ops_test.py
+++ b/tensorflow/compiler/tests/random_ops_test.py
@@ -122,8 +122,8 @@ class RandomOpsTest(xla_test.XLATestCase):
         beta = (b - mu) / sigma
         z = normal_cdf(beta) - normal_cdf(alpha)
 
-        self.assertTrue((y >= a).sum() == count)
-        self.assertTrue((y <= b).sum() == count)
+        self.assertEqual((y >= a).sum(), count)
+        self.assertEqual((y <= b).sum(), count)
 
         # For more information on these calculations, see:
         # Burkardt, John. "The Truncated Normal Distribution".
diff --git a/tensorflow/compiler/tests/randomized_tests.cc b/tensorflow/compiler/tests/randomized_tests.cc
index d23fd125163d1afe8c7fd5e008d4b617ff4b2874..1521cc760b85b176acb27c1489640e92ef90e247 100644
--- a/tensorflow/compiler/tests/randomized_tests.cc
+++ b/tensorflow/compiler/tests/randomized_tests.cc
@@ -63,6 +63,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/bfloat16/bfloat16.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
@@ -80,6 +81,7 @@ int64 tf_xla_random_seed = 0;
 int32 tf_xla_test_repetitions = 20;
 int64 tf_xla_max_tensor_size = 10000LL;
 string* tf_xla_test_device_ptr;  // initial value set in main()
+string* tf_xla_reference_device_ptr;  // initial value set in main()
 bool tf_xla_test_use_jit = true;
 
 string LocalDeviceToFullDeviceName(const string& device) {
@@ -321,6 +323,9 @@ class OpTest : public ::testing::Test {
   // for use as reduction indices.
   Tensor RandomReductionIndices(int rank);
 
+  // Returns a random bit.
+  bool RandomBool();
+
   struct WindowedSpatialDims {
     Padding padding;
     std::vector<int64> kernel_dims;
@@ -453,6 +458,11 @@ std::vector<int64> OpTest::RandomDims(int min_rank, int max_rank,
   return dims;
 }
 
+bool OpTest::RandomBool() {
+  std::bernoulli_distribution d(0.5);
+  return d(generator());
+}
+
 Tensor OpTest::RandomTensor(DataType dtype, bool needs_unique_values,
                             absl::Span<const int64> shape) {
   Tensor tensor(dtype, TensorShape(shape));
@@ -760,8 +770,22 @@ Status TensorsAreEqualImpl(const Tensor& x, const Tensor& y) {
   for (int i = 0; i < Tx.size(); ++i) {
     if (Tx(i) != Ty(i)) {
       return errors::InvalidArgument(absl::StrCat(
-          i, "-th tensor element isn't equal: ", Tx(i), " vs. ", Ty(i),
-          ". x = ", x.DebugString(), "y = ", y.DebugString()));
+          i, "-th tensor element isn't equal: ", Str(Tx(i)), " vs. ",
+          Str(Ty(i)), ". x = ", x.DebugString(), "y = ", y.DebugString()));
+    }
+  }
+  return Status::OK();
+}
+
+Status TensorsAreEqualImplBfloat16(const Tensor& x, const Tensor& y) {
+  auto Tx = x.flat<bfloat16>();
+  auto Ty = y.flat<bfloat16>();
+  for (int i = 0; i < Tx.size(); ++i) {
+    if (Tx(i) != Ty(i)) {
+      return errors::InvalidArgument(absl::StrCat(
+          i, "-th tensor element isn't equal: ", static_cast<float>(Tx(i)),
+          " vs. ", static_cast<float>(Ty(i)), ". x = ", x.DebugString(),
+          "y = ", y.DebugString()));
     }
   }
   return Status::OK();
@@ -797,6 +821,8 @@ Status TensorsAreClose(const Tensor& a, const Tensor& b, double atol,
       return TensorsAreEqualImpl<int64>(a, b);
     case DT_BOOL:
       return TensorsAreEqualImpl<bool>(a, b);
+    case DT_BFLOAT16:
+      return TensorsAreEqualImplBfloat16(a, b);
     default:
       LOG(FATAL) << "Unexpected type : " << DataTypeString(a.dtype());
   }
@@ -829,8 +855,8 @@ OpTest::TestResult OpTest::ExpectTfAndXlaOutputsAreClose(
     VLOG(1) << "Input: " << input_tensors.back().DebugString();
   }
 
-  string cpu_device =
-      LocalDeviceToFullDeviceName(absl::StrCat(DEVICE_CPU, ":0"));
+  string reference_device =
+      LocalDeviceToFullDeviceName(*tf_xla_reference_device_ptr);
   string test_device = LocalDeviceToFullDeviceName(*tf_xla_test_device_ptr);
 
   DeviceNameUtils::ParsedName parsed_name;
@@ -845,9 +871,9 @@ OpTest::TestResult OpTest::ExpectTfAndXlaOutputsAreClose(
   std::vector<string> expected_inputs, test_inputs;
   std::vector<string> expected_fetches, test_fetches;
   Status status = builder.BuildGraph(
-      absl::StrCat("test", num_tests_, "_expected"), cpu_device,
-      /* use_jit= */ false, &graph, /* test_node_def= */ nullptr,
-      &expected_inputs, &expected_fetches);
+      absl::StrCat("test", num_tests_, "_expected"), reference_device,
+      /*use_jit=*/false, &graph, /*test_node_def=*/nullptr, &expected_inputs,
+      &expected_fetches);
   if (!status.ok()) {
     LOG(ERROR) << "Expected graph construction failed: " << status;
     return kFatalError;
@@ -1371,6 +1397,19 @@ TEST_F(OpTest, Cast) {
   });
 }
 
+TEST_F(OpTest, CastBF16) {
+  Repeatedly([this]() {
+    DataType src_type, dst_type;
+    src_type = Choose<DataType>({DT_FLOAT});
+    dst_type = Choose<DataType>({DT_BFLOAT16});
+    return ExpectTfAndXlaOutputsAreClose(OpTestBuilder("Cast")
+                                             .RandomInput(src_type)
+                                             .Attr("SrcT", src_type)
+                                             .Attr("DstT", dst_type)
+                                             .Attr("Truncate", true));
+  });
+}
+
 TEST_F(OpTest, Ceil) {
   Repeatedly([this]() {
     return ExpectTfAndXlaOutputsAreClose(
@@ -3346,11 +3385,41 @@ TEST_F(OpTest, ZerosLike) {
   });
 }
 
+// Example failing run:
+//   --tf_xla_reference_device=GPU:0
+//   --tf_xla_test_use_jit=true --tf_xla_test_device=GPU:0
+//   --tf_xla_test_repetitions=2
+//   --gunit_filter='OpTest.FusedBatchNormTraining'
+//   --tf_xla_random_seed=2838146746
+TEST_F(OpTest, FusedBatchNormTraining) {
+  bool is_nhwc = RandomBool();
+  std::vector<int64> x_dims = RandomDims(/*min_rank=*/4, /*max_rank=*/4,
+                                         /*min_size=*/5, /*max_size=*/20);
+  std::vector<int64> scale_dims = {x_dims[is_nhwc ? 3 : 1]};
+  std::vector<int64> offset_dims = {x_dims[is_nhwc ? 3 : 1]};
+  std::vector<int64> mean_dims = {0};
+  std::vector<int64> variance_dims = {0};
+  DataType type = DT_FLOAT;
+  Repeatedly([&] {
+    return ExpectTfAndXlaOutputsAreClose(
+        OpTestBuilder("FusedBatchNorm")
+            .RandomInput(type, x_dims)
+            .RandomInput(type, scale_dims)
+            .RandomInput(type, offset_dims)
+            .RandomInput(type, mean_dims)
+            .RandomInput(type, variance_dims)
+            .Attr("T", type)
+            .Attr("data_format", is_nhwc ? "NHWC" : "NCHW")
+            .Attr("epsilon", static_cast<float>(1.001e-05))
+            .Attr("is_training", true));
+  });
+}
 }  // anonymous namespace
 }  // namespace tensorflow
 
 int main(int argc, char** argv) {
   tensorflow::tf_xla_test_device_ptr = new tensorflow::string("GPU:0");
+  tensorflow::tf_xla_reference_device_ptr = new tensorflow::string("CPU:0");
   std::vector<tensorflow::Flag> flag_list = {
       tensorflow::Flag(
           "tf_xla_random_seed", &tensorflow::tf_xla_random_seed,
@@ -3366,6 +3435,9 @@ int main(int argc, char** argv) {
                        "Maximum number of elements for random input tensors."),
       tensorflow::Flag("tf_xla_test_device", tensorflow::tf_xla_test_device_ptr,
                        "Tensorflow device type to use for test"),
+      tensorflow::Flag("tf_xla_reference_device",
+                       tensorflow::tf_xla_reference_device_ptr,
+                       "Tensorflow device type to use for reference"),
       tensorflow::Flag("tf_xla_test_use_jit", &tensorflow::tf_xla_test_use_jit,
                        "Use JIT compilation for the operator under test"),
   };
diff --git a/tensorflow/compiler/tests/scatter_nd_op_test.py b/tensorflow/compiler/tests/scatter_nd_op_test.py
index 693f8513bc54e30060a2e963abd504768535a50a..a9a87b8fb3104f8b9870c41e2aa28b0c48c12921 100644
--- a/tensorflow/compiler/tests/scatter_nd_op_test.py
+++ b/tensorflow/compiler/tests/scatter_nd_op_test.py
@@ -134,6 +134,12 @@ class ScatterNdTest(xla_test.XLATestCase):
     expected = np.array([0, 11, 0, 10, 9, 0, 0, 12], dtype=np.int32)
     self.assertAllEqual(expected, self._runScatterNd(indices, updates, [8]))
 
+  def testRepeatedIndices(self):
+    indices = np.array([[0], [1], [0], [1]], dtype=np.int32)
+    updates = np.array([9, 10, 11, 12], dtype=np.float32)
+    expected = np.array([20, 22], dtype=np.int32)
+    self.assertAllEqual(expected, self._runScatterNd(indices, updates, [2]))
+
   def testSimple2(self):
     indices = np.array([[1, 0], [1, 1]], dtype=np.int32)
     updates = np.array([11., 12.], dtype=np.float32)
diff --git a/tensorflow/compiler/tests/self_adjoint_eig_op_test.py b/tensorflow/compiler/tests/self_adjoint_eig_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..cfb5c82b22ea1d7400b54045edee0ca0782ce979
--- /dev/null
+++ b/tensorflow/compiler/tests/self_adjoint_eig_op_test.py
@@ -0,0 +1,62 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.ops.self_adjoint_eig."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.compiler.tests import xla_test
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.platform import test
+
+
+class SelfAdjointEigOpTest(xla_test.XLATestCase, parameterized.TestCase):
+
+  def _test(self, dtype, shape):
+    np.random.seed(1)
+    x_np = np.random.uniform(
+        low=-1.0, high=1.0, size=np.prod(shape)).reshape(shape).astype(dtype)
+    x_np = x_np + np.swapaxes(x_np, -1, -2)
+    n = shape[-1]
+
+    e_np, _ = np.linalg.eigh(x_np)
+    with self.cached_session() as sess:
+      x_tf = array_ops.placeholder(dtype)
+      with self.test_scope():
+        e, v = linalg_ops.self_adjoint_eig(x_tf)
+      e_val, v_val = sess.run([e, v], feed_dict={x_tf: x_np})
+
+      v_diff = np.matmul(v_val, np.swapaxes(v_val, -1, -2)) - np.eye(n)
+      self.assertAlmostEqual(np.mean(v_diff**2), 0.0, delta=1e-6)
+      self.assertAlmostEqual(np.mean((e_val - e_np)**2), 0.0, delta=1e-6)
+
+  SIZES = [1, 2, 5, 10, 32]
+  DTYPES = [np.float32]
+  PARAMS = itertools.product(SIZES, DTYPES)
+
+  @parameterized.parameters(*PARAMS)
+  def testSelfAdjointEig(self, n, dtype):
+    for batch_dims in [(), (3,)] + [(3, 2)] * (n < 10):
+      self._test(dtype, batch_dims + (n, n))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/compiler/tests/stateless_random_ops_test.py b/tensorflow/compiler/tests/stateless_random_ops_test.py
index ee7ca7e6f196e114ff18e2597145e5c198980b08..df5914a518e06e4190c623a14287de8daefebd40 100644
--- a/tensorflow/compiler/tests/stateless_random_ops_test.py
+++ b/tensorflow/compiler/tests/stateless_random_ops_test.py
@@ -167,8 +167,8 @@ class StatelessRandomOpsTest(xla_test.XLATestCase):
         beta = (b - mu) / sigma
         z = normal_cdf(beta) - normal_cdf(alpha)
 
-        self.assertTrue((y >= a).sum() == n)
-        self.assertTrue((y <= b).sum() == n)
+        self.assertEqual((y >= a).sum(), n)
+        self.assertEqual((y <= b).sum(), n)
 
         # For more information on these calculations, see:
         # Burkardt, John. "The Truncated Normal Distribution".
diff --git a/tensorflow/compiler/tests/tensor_list_ops_test.py b/tensorflow/compiler/tests/tensor_list_ops_test.py
index 5c079d595c440cac644f5461154509abe7b1d1ed..a380715301b08ce2186c97b678b7235b9121d178 100644
--- a/tensorflow/compiler/tests/tensor_list_ops_test.py
+++ b/tensorflow/compiler/tests/tensor_list_ops_test.py
@@ -23,24 +23,20 @@ from tensorflow.compiler.tests import xla_test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import list_ops
 from tensorflow.python.platform import test
 
 
-def scalar_shape():
-  return ops.convert_to_tensor([], dtype=dtypes.int32)
-
-
 class ListOpsTest(xla_test.XLATestCase):
 
   def testElementShape(self):
     with self.cached_session() as sess, self.test_scope():
       dim = array_ops.placeholder(dtypes.int32)
-      l = list_ops.tensor_list_reserve(
-          element_shape=(dim, 15), num_elements=20,
-          element_dtype=dtypes.float32)
+      l = list_ops.empty_tensor_list(
+          element_shape=(dim, 15),
+          element_dtype=dtypes.float32,
+          max_num_elements=20)
       e32 = list_ops.tensor_list_element_shape(l, shape_type=dtypes.int32)
       e64 = list_ops.tensor_list_element_shape(l, shape_type=dtypes.int64)
       self.assertAllEqual(sess.run(e32, {dim: 10}), (10, 15))
@@ -48,25 +44,44 @@ class ListOpsTest(xla_test.XLATestCase):
 
   def testPushPop(self):
     with self.cached_session() as sess, self.test_scope():
-      num = array_ops.placeholder(dtypes.int32)
-      l = list_ops.tensor_list_reserve(
-          element_shape=(7, 15), num_elements=num, element_dtype=dtypes.float32)
+      l = list_ops.empty_tensor_list(
+          element_shape=(7, 15),
+          element_dtype=dtypes.float32,
+          max_num_elements=10)
       l = list_ops.tensor_list_push_back(
           l, constant_op.constant(1.0, shape=(7, 15)))
       l = list_ops.tensor_list_push_back(
           l, constant_op.constant(2.0, shape=(7, 15)))
       l, e2 = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
       _, e1 = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
-      self.assertAllEqual(sess.run(e2, {num: 10}), 2.0 * np.ones((7, 15)))
-      self.assertAllEqual(sess.run(e1, {num: 10}), 1.0 * np.ones((7, 15)))
+      self.assertAllEqual(sess.run(e2), 2.0 * np.ones((7, 15)))
+      self.assertAllEqual(sess.run(e1), 1.0 * np.ones((7, 15)))
+
+  def testDoNotConstantFoldVariants(self):
+    with self.cached_session() as sess, self.test_scope():
+      val = array_ops.placeholder(dtype=dtypes.float32)
+      l = list_ops.empty_tensor_list(
+          element_shape=(7, 15),
+          element_dtype=dtypes.float32,
+          max_num_elements=10)
+      # Note: Pushing a Placeholder will force the constant folding code
+      # to build a Const node with a DT_VARIANT output. This tests that XLA
+      # passes a cf_consider_fn which prevent folding such nodes.
+      l = list_ops.tensor_list_push_back(
+          l, array_ops.fill(value=val, dims=(7, 15)))
+      l = list_ops.tensor_list_push_back(
+          l, constant_op.constant(2.0, shape=(7, 15)))
+      l, e2 = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
+      _, e1 = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
+      self.assertAllEqual(sess.run(e2, {val: 1.0}), 2.0 * np.ones((7, 15)))
+      self.assertAllEqual(sess.run(e1, {val: 1.0}), 1.0 * np.ones((7, 15)))
 
   def testPushPopSeparateLists(self):
     with self.cached_session() as sess, self.test_scope():
-      num = array_ops.placeholder(dtypes.int32)
-      l = list_ops.tensor_list_reserve(
-          element_shape=scalar_shape(),
-          num_elements=num,
-          element_dtype=dtypes.float32)
+      l = list_ops.empty_tensor_list(
+          element_shape=[],
+          element_dtype=dtypes.float32,
+          max_num_elements=20)
       l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0))
       l2 = list_ops.tensor_list_push_back(l, constant_op.constant(2.0))
       l3 = list_ops.tensor_list_push_back(l, constant_op.constant(3.0))
@@ -75,22 +90,125 @@ class ListOpsTest(xla_test.XLATestCase):
       l2, e22 = list_ops.tensor_list_pop_back(l2, element_dtype=dtypes.float32)
       l3, e31 = list_ops.tensor_list_pop_back(l3, element_dtype=dtypes.float32)
       l3, e32 = list_ops.tensor_list_pop_back(l3, element_dtype=dtypes.float32)
-      result = sess.run([e11, [e21, e22], [e31, e32]], {num: 20})
+      result = sess.run([e11, [e21, e22], [e31, e32]])
       self.assertEqual(result, [1.0, [2.0, 1.0], [3.0, 1.0]])
 
-  def testEmptyTensorList(self):
-    dim = 7
+  def testEmptyTensorListNoMax(self):
     with self.cached_session() as sess, self.test_scope():
-      p = array_ops.placeholder(dtypes.int32)
       l = list_ops.empty_tensor_list(
-          element_shape=(p, 15), element_dtype=dtypes.float32)
+          element_shape=(7, 15), element_dtype=dtypes.float32)
       l = list_ops.tensor_list_push_back(
-          l, constant_op.constant(1.0, shape=(dim, 15)))
+          l, constant_op.constant(1.0, shape=(7, 15)))
       _, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
       with self.assertRaisesRegexp(errors.InvalidArgumentError,
-                                   "Use TensorListReserve instead"):
-        self.assertEqual(sess.run(e, {p: dim}), 1.0 * np.ones((dim, 15)))
+                                   "Set the max number of elements"):
+        self.assertAllEqual(sess.run(e), 1.0 * np.ones((7, 15)))
 
+  def testEmptyTensorListMax(self):
+    with self.cached_session() as sess, self.test_scope():
+      l = list_ops.empty_tensor_list(
+          element_shape=(10, 15), element_dtype=dtypes.float32,
+          max_num_elements=2)
+      l = list_ops.tensor_list_push_back(
+          l, array_ops.fill(value=3.0, dims=(10, 15)))
+      _, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
+      self.assertAllEqual(sess.run(e), 3.0 * np.ones((10, 15)))
+
+  def testListFromTensor(self):
+    with self.cached_session(), self.test_scope():
+      t = constant_op.constant([1.0, 2.0])
+      l = list_ops.tensor_list_from_tensor(t, element_shape=[])
+      e = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
+      self.assertAllEqual(e, 1.0)
+      l, e0 = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
+      self.assertAllEqual(e0, 2.0)
+      l, e1 = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
+      self.assertAllEqual(e1, 1.0)
+      self.assertAllEqual(list_ops.tensor_list_length(l), 0)
+
+  def testGetSet(self):
+    with self.cached_session(), self.test_scope():
+      t = constant_op.constant([1.0, 2.0])
+      l = list_ops.tensor_list_from_tensor(t, element_shape=[])
+      e0 = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
+      self.assertAllEqual(e0, 1.0)
+      l = list_ops.tensor_list_set_item(l, 0, 3.0)
+      t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+      self.assertAllEqual(t, [3.0, 2.0])
+
+  def testSetDoesNotUpdatePushIndex(self):
+    with self.cached_session(), self.test_scope():
+      l = list_ops.empty_tensor_list(
+          element_shape=[], element_dtype=dtypes.float32, max_num_elements=2)
+      # SetItem should not change the push index.
+      l = list_ops.tensor_list_set_item(l, 1, 3.)
+      l = list_ops.tensor_list_push_back(l, 5.)
+      l = list_ops.tensor_list_push_back(l, 7.)
+      t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+      self.assertAllEqual(t, [5., 7.])
+
+  def testGetSetReserved(self):
+    with self.cached_session(), self.test_scope():
+      l = list_ops.tensor_list_reserve(
+          element_dtype=dtypes.float32, element_shape=[], num_elements=2)
+      e0 = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
+      self.assertAllEqual(e0, 0.0)
+      l = list_ops.tensor_list_set_item(l, 0, 3.0)
+      t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+      self.assertAllEqual(t, [3.0, 0.0])
+
+  def testSetStackReservedUnknownElementShape(self):
+    with self.cached_session(), self.test_scope():
+      l = list_ops.tensor_list_reserve(
+          element_dtype=dtypes.float32, element_shape=None, num_elements=2)
+      l = list_ops.tensor_list_set_item(l, 0, [3.0, 4.0])
+      t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+      self.assertAllEqual(t, [[3.0, 4.0], [0., 0.]])
+
+  def testPushInEmptyListWithUnknownElementShape(self):
+    with self.cached_session(), self.test_scope():
+      l = list_ops.empty_tensor_list(
+          element_dtype=dtypes.float32, element_shape=None, max_num_elements=2)
+      l = list_ops.tensor_list_push_back(l, [3.0, 4.0])
+      # Pushing an element with a different shape should raise an error.
+      with self.assertRaisesRegexp(errors.InvalidArgumentError, "Shape"):
+        l = list_ops.tensor_list_push_back(l, 5.)
+        self.evaluate(
+            list_ops.tensor_list_stack(l, element_dtype=dtypes.float32))
+
+  def testGetSetReservedNonScalar(self):
+    with self.cached_session() as sess, self.test_scope():
+      l = list_ops.tensor_list_reserve(
+          element_dtype=dtypes.float32,
+          element_shape=(7, 15),
+          num_elements=2)
+      l = list_ops.tensor_list_set_item(
+          l, 0, constant_op.constant(1.0, shape=(7, 15)))
+      e1 = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
+      e2 = list_ops.tensor_list_get_item(l, 1, element_dtype=dtypes.float32)
+      self.assertAllEqual(sess.run(e1), np.ones((7, 15)))
+      self.assertAllEqual(sess.run(e2), np.zeros((7, 15)))
+
+  def testStack(self):
+    with self.cached_session(), self.test_scope():
+      l = list_ops.empty_tensor_list(
+          element_dtype=dtypes.float32,
+          element_shape=[],
+          max_num_elements=2)
+      l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0))
+      e = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
+      self.assertAllEqual(e, 1.0)
+      l = list_ops.tensor_list_push_back(l, constant_op.constant(2.0))
+      t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+      self.assertAllEqual(t.shape.as_list(), [None])
+      self.assertAllEqual(t, [1.0, 2.0])
+
+  def testStackWithUninitializedTensors(self):
+    with self.cached_session(), self.test_scope():
+      l = list_ops.tensor_list_reserve(
+          element_dtype=dtypes.float32, element_shape=[], num_elements=3)
+      t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+      self.assertAllEqual(t, [0., 0., 0.])
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py
index 95c9e7ffd4651642781143c2c1940b0e51e1e470..f2e0eac2d99fe3b71ecabd4b9977817c5f9c372c 100644
--- a/tensorflow/compiler/tests/unary_ops_test.py
+++ b/tensorflow/compiler/tests/unary_ops_test.py
@@ -72,6 +72,7 @@ class UnaryOpsTest(xla_test.XLATestCase):
         output = op(pinp)
       result = session.run(output, {pinp: inp})
       if equality_test is None:
+        self.assertEqual(output.dtype, expected.dtype)
         self.assertAllCloseAccordingToType(
             result, expected, rtol=rtol, atol=atol, bfloat16_rtol=0.03)
       else:
@@ -260,7 +261,8 @@ class UnaryOpsTest(xla_test.XLATestCase):
       self._assertOpOutputMatchesExpected(
           math_ops.log1p,
           np.array([[1e-14, 1e-15, 0.6]], dtype=dtype),
-          expected=np.log1p(np.array([[1e-14, 1e-15, 0.6]], dtype=dtype)),
+          expected=np.log1p(np.array([[1e-14, 1e-15, 0.6]],
+                                     dtype=dtype)).astype(dtype),
           rtol=1e-4,
           atol=1e-6)
 
@@ -391,6 +393,11 @@ class UnaryOpsTest(xla_test.XLATestCase):
           expected=np.array(
               [[-0.66666669, -0.5, 0, 0.5, 0.66666669]], dtype=dtype))
 
+      self._assertOpOutputMatchesExpected(
+          math_ops.sign,
+          np.array([[-2.0, -1.0, -0.0, +0.0, 1.0, 2.0]], dtype=dtype),
+          expected=np.array([[-1.0, -1.0, -0.0, +0.0, 1.0, 1.0]], dtype=dtype))
+
       self._assertOpOutputMatchesExpected(
           math_ops.is_finite,
           np.array(
@@ -647,7 +654,7 @@ class UnaryOpsTest(xla_test.XLATestCase):
           np.array([1, 2j, 2 - 3j, 4 + 5j], dtype=dtype),
           expected=np.tan(np.array([1, 2j, 2 - 3j, 4 + 5j], dtype=dtype)))
 
-      ctypes = {np.complex64: np.float32}
+      ctypes = {np.complex64: np.float32, np.complex128: np.float64}
       self._assertOpOutputMatchesExpected(
           math_ops.abs,
           np.array([[3 - 4j, -1j, np.inf]], dtype=dtype),
@@ -705,7 +712,7 @@ class UnaryOpsTest(xla_test.XLATestCase):
       self._assertOpOutputMatchesExpected(
           math_ops.abs,
           np.array([[2, -1]], dtype=dtype),
-          expected=np.array([[2, 1]], dtype=dtype))
+          expected=np.array([[2, 1]], dtype=np.real(dtype(0)).dtype))
 
       self._assertOpOutputMatchesExpected(
           math_ops.negative,
@@ -743,6 +750,10 @@ class UnaryOpsTest(xla_test.XLATestCase):
           np.array(
               [[np.NINF, -2, -1, 0, 0.5, 1, 2, np.inf, np.nan]], dtype=dtype),
           expected=np.array([[0, 0, 0, 0, 0, 0, 0, 0, 1]], dtype=np.bool))
+      self._assertOpOutputMatchesExpected(
+          math_ops.sign,
+          np.array([[np.nan]], dtype=dtype),
+          expected=np.array([[0.0]], dtype=dtype))
 
   def testLogicalOps(self):
     self._assertOpOutputMatchesExpected(
@@ -760,7 +771,7 @@ class UnaryOpsTest(xla_test.XLATestCase):
         lambda x: gen_nn_ops.bias_add_grad(x, data_format="NCHW"),
         np.array(
             [[[1., 2.], [3., 4.]], [[5., 6.], [7., 8.]]], dtype=np.float32),
-        expected=np.array([10., 26.], dtype=np.float32))
+        expected=np.array([14., 22.], dtype=np.float32))
 
   def testCast(self):
     shapes = [[], [4], [2, 3], [2, 0, 4]]
@@ -811,6 +822,12 @@ class UnaryOpsTest(xla_test.XLATestCase):
         np.array([1, 2, 0], np.int32),
         expected=np.array([2, 0, 1], dtype=np.int32))
 
+  def testInvertPermutationTwiceIsNoop(self):
+    self._assertOpOutputMatchesExpected(
+        lambda x: array_ops.invert_permutation(array_ops.invert_permutation(x)),
+        np.array([1, 2, 0], np.int32),
+        expected=np.array([1, 2, 0], dtype=np.int32))
+
   def testRank(self):
     rank_op = lambda x: array_ops.rank_internal(x, optimize=False)
     for dtype in self.numeric_types:
@@ -865,6 +882,17 @@ class UnaryOpsTest(xla_test.XLATestCase):
           np.array([[-1], [1], [4]], dtype=dtype),
           expected=np.int32(3))
 
+  def testSizeWithInt64OutType(self):
+
+    def size_op(x):
+      return array_ops.size_internal(x, optimize=False, out_type=np.int64)
+
+    for dtype in self.numeric_types:
+      self._assertOpOutputMatchesExpected(
+          size_op,
+          np.array([[-1], [1], [4]], dtype=dtype),
+          expected=np.int64(3))
+
   def testUnpack(self):
     self._assertOpOutputMatchesExpected(
         array_ops.unstack,
@@ -974,7 +1002,7 @@ class UnaryOpsTest(xla_test.XLATestCase):
   def _assertSoftplusMatchesExpected(self, features, dtype):
     features = np.array(features, dtype=dtype)
     zero = np.asarray(0).astype(dtype)
-    expected = np.logaddexp(zero, features)
+    expected = np.logaddexp(zero, features).astype(dtype)
     self._assertOpOutputMatchesExpected(
         nn_ops.softplus, features, expected=expected, rtol=1e-6, atol=9.1e-6)
 
diff --git a/tensorflow/compiler/tests/variable_ops_test.py b/tensorflow/compiler/tests/variable_ops_test.py
index fcd7ac5ba1ca5049246e93e6f5f76746fb28c6b8..18c5870e0decb686f4df1c16bbb4a340c93ad21d 100644
--- a/tensorflow/compiler/tests/variable_ops_test.py
+++ b/tensorflow/compiler/tests/variable_ops_test.py
@@ -485,7 +485,7 @@ class SliceAssignTest(xla_test.XLATestCase):
       checker2[None] = [6]  # new axis
 
   def testUninitialized(self):
-    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+    with self.assertRaisesRegexp(errors.FailedPreconditionError,
                                  "uninitialized variable"):
       with self.test_session() as sess, self.test_scope():
         v = resource_variable_ops.ResourceVariable([1, 2])
diff --git a/tensorflow/compiler/tests/xla_ops_test.py b/tensorflow/compiler/tests/xla_ops_test.py
index 4cf88fc523735cc2d22e085afb83790c7ebb48e4..28274ff799de2c85e1e80512cadbe0206cb640a4 100644
--- a/tensorflow/compiler/tests/xla_ops_test.py
+++ b/tensorflow/compiler/tests/xla_ops_test.py
@@ -319,7 +319,7 @@ class XlaOpsTest(xla_test.XLATestCase, parameterized.TestCase):
         session.run(output)
       self.assertRegexpMatches(
           invalid_arg_error.exception.message,
-          (r'^start_indices must be a vector with length equal to input rank, '
+          (r'start_indices must be a vector with length equal to input rank, '
            r'but input rank is 3 and start_indices has shape \[2\].*'))
 
   def testDynamicSliceWithIncorrectSizeIndicesShape(self):
@@ -332,7 +332,7 @@ class XlaOpsTest(xla_test.XLATestCase, parameterized.TestCase):
         session.run(output)
       self.assertRegexpMatches(
           invalid_arg_error.exception.message,
-          (r'^size_indices must be a vector with length equal to input rank, '
+          (r'size_indices must be a vector with length equal to input rank, '
            r'but input rank is 3 and size_indices has shape \[2\].*'))
 
 
diff --git a/tensorflow/compiler/tf2tensorrt/BUILD b/tensorflow/compiler/tf2tensorrt/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..63cad6a159c3a9b0da9e3bb86ff250dd29e45729
--- /dev/null
+++ b/tensorflow/compiler/tf2tensorrt/BUILD
@@ -0,0 +1,445 @@
+# Description:
+#   Wrap NVIDIA TensorRT (http://developer.nvidia.com/tensorrt) with tensorflow
+#   and provide TensorRT operators and converter package.
+#   APIs are meant to change over time.
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cc_test",
+    "tf_copts",
+    "tf_cuda_library",
+    "tf_custom_op_library",
+    "tf_custom_op_library_additional_deps",
+    "tf_gen_op_libs",
+    "tf_gen_op_wrapper_py",
+)
+load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
+load(
+    "@local_config_tensorrt//:build_defs.bzl",
+    "if_tensorrt",
+)
+
+tf_cuda_cc_test(
+    name = "tensorrt_test_cc",
+    size = "small",
+    srcs = ["tensorrt_test.cc"],
+    tags = [
+        "no_windows",
+        "nomac",
+    ],
+    deps = [
+        "//tensorflow/core:gpu_init",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:stream_executor",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ] + if_tensorrt([
+        "@local_config_cuda//cuda:cuda_headers",
+        "@local_config_tensorrt//:tensorrt",
+    ]),
+)
+
+tf_custom_op_library(
+    name = "python/ops/_trt_ops.so",
+    srcs = [
+        "ops/get_serialized_resource_op.cc",
+        "ops/trt_engine_op.cc",
+    ],
+    deps = [
+        "//tensorflow/core:lib_proto_parsing",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:tensorrt",
+    ]),
+)
+
+cc_library(
+    name = "trt_op_kernels",
+    srcs = [
+        "kernels/get_serialized_resource_op.cc",
+        "kernels/trt_engine_op.cc",
+    ],
+    copts = tf_copts(),
+    visibility = ["//visibility:public"],
+    deps = [
+        ":test_utils",
+        ":trt_allocator",
+        ":trt_conversion",
+        ":trt_logging",
+        ":trt_plugins",
+        ":trt_resources",
+        ":utils",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        "//tensorflow/core:gpu_headers_lib",
+        "//tensorflow/core:lib_proto_parsing",
+        "//tensorflow/core:stream_executor_headers_lib",
+        "//tensorflow/core/grappler/costs:graph_properties",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:tensorrt",
+    ]) + tf_custom_op_library_additional_deps(),
+    alwayslink = 1,
+)
+
+tf_cuda_cc_test(
+    name = "get_serialized_resource_op_test",
+    size = "small",
+    srcs = ["kernels/get_serialized_resource_op_test.cc"],
+    tags = [
+        "no_cuda_on_cpu_tap",
+        "no_windows",
+        "nomac",
+    ],
+    deps = [
+        # TODO(laigd): consider splitting get_serialized_resource_op out from
+        # TF-TRT.
+        ":trt_op_kernels",
+        ":trt_op_libs",
+        ":trt_resources",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/kernels:ops_testutil",
+    ],
+)
+
+tf_gen_op_libs(
+    op_lib_names = [
+        "trt_engine_op",
+        "get_serialized_resource_op",
+    ],
+)
+
+cc_library(
+    name = "trt_op_libs",
+    deps = [
+        ":get_serialized_resource_op_op_lib",
+        ":trt_engine_op_op_lib",
+    ],
+)
+
+tf_cuda_library(
+    name = "trt_logging",
+    srcs = ["utils/trt_logger.cc"],
+    hdrs = ["utils/trt_logger.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:lib_proto_parsing",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:tensorrt",
+    ]),
+)
+
+tf_gen_op_wrapper_py(
+    name = "trt_ops",
+    deps = [
+        ":trt_op_libs",
+    ],
+)
+
+tf_custom_op_py_library(
+    name = "trt_ops_loader",
+    srcs = ["python/ops/trt_ops.py"],
+    dso = [
+        "python/ops/_trt_ops.so",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:tensorrt",
+    ]),
+    kernels = [
+        ":trt_op_kernels",
+        ":trt_op_libs",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:resources",
+    ],
+)
+
+tf_cuda_library(
+    name = "trt_resources",
+    srcs = [
+        "utils/trt_int8_calibrator.cc",
+        "utils/trt_resources.cc",
+    ],
+    hdrs = [
+        "utils/trt_int8_calibrator.h",
+        "utils/trt_lru_cache.h",
+        "utils/trt_resources.h",
+    ],
+    deps = [
+        ":trt_allocator",
+        ":trt_logging",
+        ":utils",
+        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:framework_lite",
+        "//tensorflow/core:lib_proto_parsing",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:tensorrt",
+    ]),
+)
+
+tf_cuda_library(
+    name = "trt_allocator",
+    srcs = ["utils/trt_allocator.cc"],
+    hdrs = ["utils/trt_allocator.h"],
+    deps = [
+        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:framework_lite",
+        "//tensorflow/core:lib_proto_parsing",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:tensorrt",
+    ]),
+)
+
+tf_cc_test(
+    name = "trt_allocator_test",
+    size = "small",
+    srcs = ["utils/trt_allocator_test.cc"],
+    tags = [
+        "no_windows",
+        "nomac",
+    ],
+    deps = [
+        ":trt_allocator",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+tf_cc_test(
+    name = "trt_lru_cache_test",
+    size = "small",
+    srcs = ["utils/trt_lru_cache_test.cc"],
+    tags = [
+        "no_windows",
+        "nomac",
+    ],
+    deps = [
+        ":trt_resources",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+# Library for the node-level conversion portion of TensorRT operation creation
+tf_cuda_library(
+    name = "trt_conversion",
+    srcs = [
+        "convert/convert_graph.cc",
+        "convert/convert_nodes.cc",
+        "convert/trt_optimization_pass.cc",
+    ],
+    hdrs = [
+        "convert/convert_graph.h",
+        "convert/convert_nodes.h",
+        "convert/trt_optimization_pass.h",
+    ],
+    deps = [
+        ":segment",
+        ":test_utils",
+        ":trt_allocator",
+        ":trt_plugins",
+        ":trt_logging",
+        ":trt_resources",
+        ":utils",
+        "@com_google_absl//absl/strings",
+        "//tensorflow/core/grappler/clusters:cluster",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_lite",
+        "//tensorflow/core:gpu_runtime",
+        "//tensorflow/core:graph",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:devices",
+        "//tensorflow/core/grappler/clusters:virtual_cluster",
+        "//tensorflow/core/grappler/costs:graph_properties",
+        "//tensorflow/core/grappler/optimizers:meta_optimizer",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:tensorrt",
+    ]) + tf_custom_op_library_additional_deps(),
+)
+
+tf_cuda_cc_test(
+    name = "convert_graph_test",
+    size = "medium",
+    srcs = ["convert/convert_graph_test.cc"],
+    tags = [
+        "no_cuda_on_cpu_tap",
+        "no_windows",
+        "nomac",
+    ],
+    deps = [
+        ":trt_conversion",
+        "@com_google_googletest//:gtest",
+        "@com_google_absl//absl/strings",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:scope",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler/clusters:cluster",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_base",
+        "//tensorflow/core:direct_session",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:tensorrt",
+    ]),
+)
+
+tf_cuda_cc_test(
+    name = "convert_nodes_test",
+    size = "medium",
+    srcs = ["convert/convert_nodes_test.cc"],
+    tags = [
+        "no_cuda_on_cpu_tap",
+        "no_windows",
+        "nomac",
+    ],
+    deps = [
+        ":trt_logging",
+        ":trt_conversion",
+        ":trt_plugins",
+        "@com_google_googletest//:gtest",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:ops",
+        "//tensorflow/cc:scope",
+        "//tensorflow/core/grappler/costs:graph_properties",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_base",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:tensor_testutil",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ] + if_tensorrt([
+        "@local_config_cuda//cuda:cuda_headers",
+        "@local_config_tensorrt//:tensorrt",
+    ]),
+)
+
+# Library for the segmenting portion of TensorRT operation creation
+cc_library(
+    name = "segment",
+    srcs = ["segment/segment.cc"],
+    hdrs = [
+        "segment/segment.h",
+        "segment/union_find.h",
+    ],
+    copts = tf_copts(),
+    deps = [
+        "//tensorflow/core:graph",
+        "//tensorflow/core:lib_proto_parsing",
+        "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/strings",
+        "@protobuf_archive//:protobuf_headers",
+    ],
+)
+
+tf_cuda_cc_test(
+    name = "segment_test",
+    size = "small",
+    srcs = ["segment/segment_test.cc"],
+    tags = [
+        "no_cuda_on_cpu_tap",
+        "no_windows",
+        "nomac",
+    ],
+    deps = [
+        ":segment",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:scope",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:ops",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
+# Library for the plugin factory
+tf_cuda_library(
+    name = "trt_plugins",
+    srcs = [
+        "plugin/trt_plugin.cc",
+        "plugin/trt_plugin_factory.cc",
+        "plugin/trt_plugin_utils.cc",
+    ],
+    hdrs = [
+        "plugin/trt_plugin.h",
+        "plugin/trt_plugin_factory.h",
+        "plugin/trt_plugin_utils.h",
+    ],
+    deps = [
+        "//tensorflow/core:framework_lite",
+        "//tensorflow/core:lib_proto_parsing",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:tensorrt",
+    ]),
+)
+
+tf_cuda_cc_test(
+    name = "trt_plugin_factory_test",
+    size = "small",
+    srcs = ["plugin/trt_plugin_factory_test.cc"],
+    tags = [
+        "no_cuda_on_cpu_tap",
+        "no_windows",
+        "nomac",
+    ],
+    deps = [
+        ":trt_plugins",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ] + if_tensorrt([
+        "@local_config_cuda//cuda:cuda_headers",
+        "@local_config_tensorrt//:tensorrt",
+    ]),
+)
+
+cc_library(
+    name = "utils",
+    srcs = ["convert/utils.cc"],
+    hdrs = ["convert/utils.h"],
+    copts = tf_copts(),
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_proto_parsing",
+    ],
+)
+
+cc_library(
+    name = "test_utils",
+    srcs = ["utils/test_utils.cc"],
+    hdrs = ["utils/test_utils.h"],
+    deps = [
+        "//tensorflow/core:lib_proto_parsing",
+        "@com_googlesource_code_re2//:re2",
+    ],
+)
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc
similarity index 88%
rename from tensorflow/contrib/tensorrt/convert/convert_graph.cc
rename to tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc
index ae211a93c3279ff1d6de2f9c9a4b849fc8cd578d..1f3cae3fda0cd7be296882b7b17ea47554edace8 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/convert/convert_graph.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/convert_graph.h"
 
 #include <fstream>
 #include <list>
@@ -24,13 +24,13 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
-#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
-#include "tensorflow/contrib/tensorrt/convert/utils.h"
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_resource_manager.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_resources.h"
-#include "tensorflow/contrib/tensorrt/segment/segment.h"
-#include "tensorflow/contrib/tensorrt/test/utils.h"
+#include "absl/strings/str_cat.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.h"
+#include "tensorflow/compiler/tf2tensorrt/segment/segment.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/test_utils.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_resources.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"
@@ -63,8 +63,8 @@ limitations under the License.
 namespace tensorflow {
 namespace tensorrt {
 namespace convert {
-using ::tensorflow::strings::StrAppend;
-using ::tensorflow::strings::StrCat;
+using absl::StrAppend;
+using absl::StrCat;
 
 // Returns compiled TRT version information {Maj, Min, Patch}
 std::vector<int> GetLinkedTensorRTVersion() {
@@ -82,63 +82,81 @@ std::vector<int> GetLoadedTensorRTVersion() {
 }
 
 TrtCandidateSelector::TrtCandidateSelector(
-    const grappler::GraphProperties& graph_properties, int precision_mode)
+    const grappler::GraphProperties& graph_properties,
+    TrtPrecisionMode precision_mode)
     : graph_properties_(graph_properties), precision_mode_(precision_mode) {}
 
 Status TrtCandidateSelector::IsTensorRTCandidate(const tensorflow::Node* node) {
   // TODO(laigd): move this set to TrtNodeValidator where it should belong.
   // LINT.IfChange
-  static const std::set<string> candidate_ops = {
-      "Identity",
-      "Snapshot",
-      "Const",
-      "Conv2D",
-      "MaxPool",
-      "BiasAdd",
-      "Relu",
-      "Sigmoid",
-      "Tanh",
+  static const auto* candidate_ops = new std::set<string>{
+      "Abs",
+      "Acos",
+      "Acosh",
       "Add",
-      "Mul",
-      "Sub",
-      "Rsqrt",
-      "Pad",
-      "Mean",
+      "Asin",
+      "Asinh",
+      "Atan",
+      "Atanh",
       "AvgPool",
+      "BatchMatMul",
+      "BiasAdd",
+      "Ceil",
       "ConcatV2",
+      "Const",
+      "Conv2D",
+      "Conv2DBackpropInput",
+      "Cos",
+      "Cosh",
       "DepthwiseConv2dNative",
-      "FusedBatchNorm",
-      "FusedBatchNormV2",
       "Div",
-      "RealDiv",
-      "Rsqrt",
-      "Reciprocal",
       "Exp",
+      "ExpandDims",
+      "Floor",
+      "FusedBatchNorm",
+      "FusedBatchNormV2",
+      "GatherV2",
+      "Identity",
+      "LeakyRelu",
       "Log",
-      "Sqrt",
-      "Abs",
-      "Neg",
-      "Transpose",
-      "Reshape",
       "MatMul",
-      "BatchMatMul",
-      "Softmax",
-      "Minimum",
-      "Maximum",
-      "TopKV2",
-      "Sum",
-      "Prod",
       "Max",
+      "Maximum",
+      "MaxPool",
+      "Mean",
       "Min",
+      "Minimum",
+      "Mul",
+      "Neg",
+      "Pad",
+      "Prod",
+      "RealDiv",
+      "Reciprocal",
+      "Relu",
       "Relu6",
+      "Reshape",
+      "Rsqrt",
+      "Sigmoid",
+      "Sin",
+      "Sinh",
+      "Slice",
+      "Snapshot",
+      "Softmax",
+      "Sqrt",
       "Square",
-      "ExpandDims",
       "Squeeze",
+      "StridedSlice",
+      "Sub",
+      "Sum",
+      "Tan",
+      "Tanh",
+      "TopKV2",
+      "Transpose",
   };
   bool is_supported_op_type =
-      (candidate_ops.count(node->type_string()) ||
+      (candidate_ops->count(node->type_string()) ||
        PluginFactoryTensorRT::GetInstance()->IsPlugin(node->type_string()));
-  static const std::set<string> quantize_ops = {
+  static const auto* quantize_ops = new std::set<string>{
       "QuantizeAndDequantizeV2",
       "QuantizeAndDequantizeV3",
       "FakeQuantWithMinMaxVars",
@@ -147,10 +165,11 @@ Status TrtCandidateSelector::IsTensorRTCandidate(const tensorflow::Node* node) {
   // In INT8 mode, we will always apply the quantization ranges provided by
   // these ops to the relevant tensors. This happens regardless of the value of
   // use_calibration.
-  if (precision_mode_ == INT8MODE && quantize_ops.count(node->type_string())) {
+  if (precision_mode_ == TrtPrecisionMode::INT8 &&
+      quantize_ops->count(node->type_string())) {
     is_supported_op_type = true;
   }
-  // LINT.ThenChange(//tensorflow/contrib/tensorrt/convert/convert_nodes.cc)
+  // LINT.ThenChange(//tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc)
   if (!is_supported_op_type) {
     return errors::Unimplemented("Op type ", node->type_string(),
                                  " is not supported");
@@ -184,60 +203,11 @@ tensorflow::Status BuildNodeMap(
 
 }  // namespace
 
-// Function to get calibration from ResourceMgr and put them into nodedef.
-tensorflow::Status ConvertCalibGraphToInferGraph(
-    const tensorflow::GraphDef& graph_def, tensorflow::GraphDef* infer_graph,
-    bool is_dyn_op) {
-  LOG(INFO) << "Starting Calib Conversion";
-  infer_graph->CopyFrom(graph_def);
-  auto trt_rm = TRTResourceManager::instance();
-  auto calib_rm = trt_rm->getManager("TRTCalibration");
-  int num_nodes = infer_graph->node_size();
-  if (!is_dyn_op) {
-    LOG(WARNING) << "Construction of static int8 engine is not implemented "
-                    "yet!. Dynamic engine will be constructed";
-  }
-  for (int i = 0; i < num_nodes; ++i) {
-    auto n = infer_graph->mutable_node(i);
-    if (n->op() == "TRTEngineOp") {
-      VLOG(1) << "Processing " << n->name();
-      const string& container_name = n->attr().at("segment_funcdef_name").s();
-      TRTCalibrationResource* cres = nullptr;
-      auto status = calib_rm->Lookup(container_name, "Calibrator", &cres);
-      if (!status.ok()) {
-        LOG(ERROR) << "Could not get Calibration information. Did you run with "
-                      "calibration data?";
-        return tensorflow::errors::FailedPrecondition(
-            "Need to run graph with calibration data first!");
-      }
-      if (cres->calibrator_) {
-        cres->calibrator_->waitAndSetDone();
-        cres->thr_->join();
-        const auto& calibration_table =
-            cres->calibrator_->getCalibrationTableAsString();
-        if (!calibration_table.size()) {
-          LOG(ERROR) << "Calibration table is empty";
-          return tensorflow::errors::Unknown(
-              "Calibration table is missing. This shouldn't have happened!");
-        }
-        n->mutable_attr()->at("calibration_data").set_s(calibration_table);
-      } else {
-        LOG(ERROR) << "Can't get TRTCalibrator from resource manager!";
-        return tensorflow::errors::Unknown(
-            "Can't get TRTCalibrator from resource manager!");
-      }
-      cres->Unref();
-      TF_RETURN_IF_ERROR(calib_rm->Cleanup(container_name));
-    }
-  }
-  return tensorflow::Status::OK();
-}
-
 tensorflow::Status ConvertGraphDefToTensorRT(
     const tensorflow::GraphDef& graph_def,
     const std::vector<string>& output_names, size_t max_batch_size,
     size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def,
-    int precision_mode, int minimum_segment_size, bool is_dyn_op,
+    TrtPrecisionMode precision_mode, int minimum_segment_size, bool is_dyn_op,
     int max_cached_engines, std::vector<int> cached_engine_batches,
     bool use_calibration) {
   // Create GrapplerItem.
@@ -297,7 +267,7 @@ tensorflow::Status ConvertGraphDefToTensorRT(
   parameters["max_batch_size"].set_i(max_batch_size);
   parameters["is_dynamic_op"].set_b(is_dyn_op);
   parameters["max_workspace_size_bytes"].set_i(max_workspace_size_bytes);
-  TF_RETURN_IF_ERROR(GetPrecisionModeName(
+  TF_RETURN_IF_ERROR(TrtPrecisionModeToName(
       precision_mode, parameters["precision_mode"].mutable_s()));
   parameters["maximum_cached_engines"].set_i(max_cached_engines);
   if (!cached_engine_batches.empty()) {
@@ -322,17 +292,23 @@ tensorflow::Status ConvertGraphDefToTensorRT(
   return Status::OK();
 }
 
+struct EdgePtrCompare {
+  bool operator()(const tensorflow::Edge* lhs,
+                  const tensorflow::Edge* rhs) const {
+    return lhs->id() < rhs->id();
+  }
+};
+
 // Function to get subsegment information structure.
 tensorflow::Status GetEngineInfo(
     const tensorflow::Graph* g,
     const tensorflow::grappler::GraphProperties& graph_properties,
-    const std::set<string>& segment_nodes,
+    const std::set<const Node*>& segment_nodes,
     const std::unordered_map<string, tensorflow::Node*>& node_map,
     const std::vector<tensorflow::Node*>& reverse_topo_order,
     EngineInfo* info) {
-  std::vector<int> subgraph_node_ids;  // Topologically sorted node ids.
-  std::set<string> subgraph_node_names = segment_nodes;
-  std::set<int> added_const_node_ids;  // Used to prevent double insertion.
+  std::vector<const Node*> subgraph_nodes;  // Topologically sorted nodes.
+  std::set<const Node*> added_const_nodes;  // Used to prevent double insertion.
   std::set<string> segment_devices;
 
   // Map from src_node_name+port to the unique port numbers of the TRT op, where
@@ -344,26 +320,45 @@ tensorflow::Status GetEngineInfo(
   std::unordered_map<string, int> input_to_engine_port, output_to_engine_port;
   for (auto it = reverse_topo_order.rbegin(); it != reverse_topo_order.rend();
        ++it) {
-    const auto& node_name = (*it)->name();
-    if (segment_nodes.count(node_name) == 0) continue;
-    auto node = *it;
+    const Node* node = *it;
+    if (segment_nodes.count(node) == 0) continue;
     auto node_device = node->requested_device();
     if (!node_device.empty()) {
-      segment_devices.insert(node_device);
+      // If device is CPU, treat as if no device was assigned. Don't add CPU to
+      // segment_device because that would cause a segfault in
+      // GetDeviceAndAllocator. This is because GetDeviceAndAllocator assumes
+      // any already set device is a GPU.
+      DeviceNameUtils::ParsedName parsed_name;
+      DeviceNameUtils::ParseFullName(node_device, &parsed_name);
+      if (parsed_name.type == "CPU") {
+        VLOG(1) << "Node " << node->name() << " was assigned to the CPU. "
+                << "Attempting to place on GPU.";
+      } else {
+        segment_devices.insert(node_device);
+      }
     } else {
       if (node->has_assigned_device_name()) {
+        // It appears that nodes will not have assigned devices at this point in
+        // execution.
         segment_devices.insert(node->assigned_device_name());
       } else {
         VLOG(2) << "Node " << node->name()
                 << " neither have requested device nor assigned device";
       }
     }
+    subgraph_nodes.push_back(node);
+
     const int node_id = node->id();
-    subgraph_node_ids.push_back(node_id);
-    // Create input connections.
-    for (const auto edge : node->in_edges()) {
+    const string& node_name = node->name();
+
+    // Create input connections. Sort edges first to make determnistic since
+    // in_edges is a set of pointers.
+    std::vector<const tensorflow::Edge*> in_edges(node->in_edges().begin(),
+                                                  node->in_edges().end());
+    std::sort(in_edges.begin(), in_edges.end(), EdgePtrCompare());
+    for (const auto edge : in_edges) {
       auto input_node = edge->src();
-      if (input_node->IsSource() || segment_nodes.count(input_node->name())) {
+      if (input_node->IsSource() || segment_nodes.count(input_node)) {
         continue;
       }
       if (edge->IsControlEdge()) {
@@ -380,12 +375,11 @@ tensorflow::Status GetEngineInfo(
         //
         // Note that the segmenter already ensure that the constant data input
         // is valid and suppported by the engine.
-        if (!added_const_node_ids.insert(input_node->id()).second) {
+        if (!added_const_nodes.insert(input_node).second) {
           // Already added before.
           continue;
         }
         VLOG(1) << "Adding const node " << input_node->name();
-        QCHECK(subgraph_node_names.insert(input_node->name()).second);
         // Since we already add (duplicate) the const input node to the segment
         // graphdef, it's now not a data dependency any more, but to make the
         // dependency correct we still add a control dependency.
@@ -409,10 +403,14 @@ tensorflow::Status GetEngineInfo(
             node_id, edge->dst_input(), /*input_edge=*/true, port);
       }
     }
-    // Create output connections.
-    for (const auto edge : node->out_edges()) {
+    // Create output connections. Sort edges first to make determnistic since
+    // out_edges is a set of pointers.
+    std::vector<const tensorflow::Edge*> out_edges(node->out_edges().begin(),
+                                                   node->out_edges().end());
+    std::sort(out_edges.begin(), out_edges.end(), EdgePtrCompare());
+    for (const auto edge : out_edges) {
       auto output_node = edge->dst();
-      if (output_node->IsSink() || segment_nodes.count(output_node->name())) {
+      if (output_node->IsSink() || segment_nodes.count(output_node)) {
         continue;
       }
       if (edge->IsControlEdge()) {
@@ -440,12 +438,11 @@ tensorflow::Status GetEngineInfo(
   }  // For each segment node in topological order.
 
   // Construct the const nodes first.
-  subgraph_node_ids.insert(subgraph_node_ids.begin(),
-                           added_const_node_ids.begin(),
-                           added_const_node_ids.end());
+  subgraph_nodes.insert(subgraph_nodes.begin(), added_const_nodes.begin(),
+                        added_const_nodes.end());
   TF_RETURN_IF_ERROR(ConvertSegmentToGraphDef(
-      g, graph_properties, subgraph_node_names, subgraph_node_ids,
-      &info->connections, &info->segment_graph_def, &info->engine_name));
+      g, graph_properties, subgraph_nodes, &info->connections,
+      &info->segment_graph_def, &info->engine_name));
   // TODO(sami): This should not happen once segmenter is updated.
   if (segment_devices.size() == 1) {
     info->device = *segment_devices.begin();
@@ -566,6 +563,18 @@ tensorflow::Status CreateTRTNode(const std::vector<EngineInfo>& infos, int pos,
         }
         input_shape_protos.at(conn.port_number) = in_shape;
         input_shapes.at(conn.port_number) = conn.outside_shape;
+        // Shape must be fully defined (excluding batch dimension) for static
+        // mode.
+        if (info.engine_type == EngineInfo::EngineType::TRTStatic) {
+          for (int i = 1; i < conn.outside_shape.dims(); i++) {
+            if (conn.outside_shape.dim_size(i) <= 0) {
+              return tensorflow::errors::Internal(
+                  "Input shapes must be fully defined when in static mode. "
+                  "Please try is_dynamic_op=True (shape was ",
+                  conn.outside_shape.DebugString(), ")");
+            }
+          }
+        }
 
         // Rewrire data input if it's not found in original graph.
         tensorflow::Node* input_node = graph->FindNodeId(conn.outside_id);
@@ -597,7 +606,7 @@ tensorflow::Status CreateTRTNode(const std::vector<EngineInfo>& infos, int pos,
   }
 
   const bool calibrate_int8 =
-      (info.precision_mode == INT8MODE && info.use_calibration);
+      (info.precision_mode == TrtPrecisionMode::INT8 && info.use_calibration);
   // Build the engine and get its serialized representation.
   string segment_string;
   if (info.engine_type == EngineInfo::EngineType::TRTStatic || calibrate_int8) {
@@ -610,14 +619,15 @@ tensorflow::Status CreateTRTNode(const std::vector<EngineInfo>& infos, int pos,
     TrtUniquePtrType<nvinfer1::ICudaEngine> engine;
     // TODO(sami): What happens if 1st dim is not batch?
     TF_RETURN_IF_ERROR(ConvertGraphDefToEngine(
-        info.segment_graph_def, calibrate_int8 ? FP32MODE : info.precision_mode,
+        info.segment_graph_def,
+        calibrate_int8 ? TrtPrecisionMode::FP32 : info.precision_mode,
         max_batch_size, info.max_workspace_size_bytes, input_shapes,
         &trt_logger, alloc, /*calibrator=*/nullptr, &engine,
         info.use_calibration,
         /*convert_successfully=*/nullptr));
     TrtUniquePtrType<nvinfer1::IHostMemory> engine_data(engine->serialize());
-    segment_string =
-        string((const char*)engine_data->data(), engine_data->size());
+    segment_string = string(static_cast<const char*>(engine_data->data()),
+                            engine_data->size());
     if (calibrate_int8) {
       // See above comment about why not putting this inside the 'else' branch.
       segment_string = info.segment_graph_def.SerializeAsString();
@@ -626,14 +636,8 @@ tensorflow::Status CreateTRTNode(const std::vector<EngineInfo>& infos, int pos,
     segment_string = info.segment_graph_def.SerializeAsString();
   }
 
-  // TODO(aaroey): use enum instead, and add a helper method to do the
-  // conversion.
   string prec_string;
-  TF_RETURN_IF_ERROR(GetPrecisionModeName(info.precision_mode, &prec_string));
-  if (info.precision_mode == INT8MODE && calibrate_int8 &&
-      !TRTResourceManager::instance()->getManager("TRTCalibration")) {
-    LOG(ERROR) << "Failed to construct calibration storage";
-  }
+  TF_RETURN_IF_ERROR(TrtPrecisionModeToName(info.precision_mode, &prec_string));
   tensorflow::NodeDefBuilder node_builder(info.engine_name, "TRTEngineOp");
   if (!info.device.empty()) node_builder.Device(info.device);
   if (VLOG_IS_ON(1)) {
@@ -649,7 +653,7 @@ tensorflow::Status CreateTRTNode(const std::vector<EngineInfo>& infos, int pos,
   }
 
   if (info.engine_type == EngineInfo::EngineType::TRTStatic &&
-      info.cached_engine_batches.size()) {
+      !info.cached_engine_batches.empty()) {
     LOG(WARNING) << "Cached engine batches are ignored for static engines";
   }
   tensorflow::NodeDef trt_node;
@@ -663,7 +667,6 @@ tensorflow::Status CreateTRTNode(const std::vector<EngineInfo>& infos, int pos,
           .Attr("serialized_segment", segment_string)
           .Attr("calibration_data", "")
           .Attr("max_cached_engines_count", info.maximum_cached_engines)
-          .Attr("cached_engine_batches", {max_batch_size})
           .Attr("workspace_size_bytes", info.max_workspace_size_bytes)
           .Attr("precision_mode", prec_string)
           .Attr("use_calibration", info.use_calibration)
@@ -815,6 +818,12 @@ tensorflow::Status RegisterSegmentFunctionToFunctionLibrary(
   auto native_segment = fdeflib.add_function();
   TF_RETURN_IF_ERROR(tensorflow::GraphToFunctionDef(
       sgraph, StrCat(engine_name, "_native_segment"), native_segment));
+  // Set kIntsonDeviceAttr to true so that all TRTEngineOp outputs are always on
+  // a GPU device as expected. Otherwise, some of the tensors of type DT_INT32
+  // would be on host if the op generating the tensor has host memory tag set.
+  (*native_segment
+        ->mutable_attr())[FunctionLibraryDefinition::kIntsOnDeviceAttr]
+      .set_b(true);
   if (VLOG_IS_ON(7)) {
     VLOG(7) << engine_name << " Function_Def ";
     VLOG(7) << native_segment->DebugString();
@@ -936,7 +945,8 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) {
       continue;
     }
     curr_engine.precision_mode = params.precision_mode;
-    if (params.use_calibration && params.precision_mode != INT8MODE) {
+    if (params.use_calibration &&
+        params.precision_mode != TrtPrecisionMode::INT8) {
       return errors::InvalidArgument(
           "Calibration with FP32 or FP16 is not supported.");
     }
@@ -1005,27 +1015,31 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) {
     cudaSetDevice(cuda_device_id);
     auto status = CreateTRTNode(engine_segments, i, params.max_batch_size,
                                 &graph, alloc.get(), &engine_nodes);
-    // If status is ok, we successfully added the node to the graph and can
-    // remove segment ops. Otherwise graph is not modified.
+
     string msg = StrCat("TensorRT node ", engine.engine_name,
                         " added for segment ", i, " consisting of ",
                         converted_segments.at(i).first.size(), " nodes");
     if (status.ok()) {
       LOG(INFO) << msg << " succeeded.";
-      for (auto node_name : converted_segments.at(i).first) {
-        graph.RemoveNode(node_map.at(node_name));
-      }
     } else {
       // Graph is not modified.
       LOG(WARNING) << msg << " failed: " << status << ". Fallback to TF...";
     }
     if (VLOG_IS_ON(1)) {
       msg = "Segment consists of nodes: ";
-      for (const string& node_name : converted_segments.at(i).first) {
-        StrAppend(&msg, node_name, ", ");
+      for (const Node* node : converted_segments.at(i).first) {
+        StrAppend(&msg, node->name(), ", ");
       }
       VLOG(1) << msg;
     }
+
+    // If status is ok, we successfully added the node to the graph and can
+    // remove segment ops. Otherwise graph is not modified.
+    if (status.ok()) {
+      for (const Node* node : converted_segments.at(i).first) {
+        graph.RemoveNode(const_cast<Node*>(node));
+      }
+    }
   }
   cudaSetDevice(old_cuda_device);
   graph.ToGraphDef(params.output_graph_def);
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.h b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h
similarity index 82%
rename from tensorflow/contrib/tensorrt/convert/convert_graph.h
rename to tensorflow/compiler/tf2tensorrt/convert/convert_graph.h
index 1f39f56f6392ba33af3d74fec12c326ed4451cb6..80f68d36a3ab894e97586687ee9ab93dddc73c50 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.h
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_GRAPH_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_GRAPH_H_
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_CONVERT_GRAPH_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_CONVERT_GRAPH_H_
 
 #include <vector>
 
-#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
@@ -36,7 +36,7 @@ namespace convert {
 class TrtCandidateSelector {
  public:
   TrtCandidateSelector(const grappler::GraphProperties& graph_properties,
-                       int precision_mode);
+                       TrtPrecisionMode precision_mode);
 
   // Returns OK iff 'node' is a TF-TRT conversion candidate, which will be added
   // to TRT subgraph and later converted into TRT engine.
@@ -52,7 +52,7 @@ class TrtCandidateSelector {
   const grappler::GraphProperties& graph_properties_;
 
   // Quantization ops are only converted when using quantized precisions.
-  const int precision_mode_;
+  const TrtPrecisionMode precision_mode_;
 };
 
 struct ConversionParams {
@@ -61,7 +61,7 @@ struct ConversionParams {
         max_batch_size(1),
         max_workspace_size_bytes(1 << 30),
         output_graph_def(nullptr),
-        precision_mode(1),
+        precision_mode(TrtPrecisionMode::FP32),
         minimum_segment_size(3),
         graph_properties(nullptr),
         cluster(nullptr),
@@ -74,7 +74,7 @@ struct ConversionParams {
   size_t max_batch_size;
   size_t max_workspace_size_bytes;
   tensorflow::GraphDef* output_graph_def;
-  int precision_mode;
+  TrtPrecisionMode precision_mode;
   int minimum_segment_size;
   const tensorflow::grappler::GraphProperties* graph_properties;
   const tensorflow::grappler::Cluster* cluster;
@@ -85,12 +85,6 @@ struct ConversionParams {
   std::vector<int> cached_engine_batches;  // list of cached engines
 };
 
-// This method extracts calibration information from the resource managers
-// and puts them in to engine nodedefs.
-tensorflow::Status ConvertCalibGraphToInferGraph(
-    const tensorflow::GraphDef& graph_def, tensorflow::GraphDef* new_graph_def,
-    bool is_dyn_op);
-
 // - max_batch_size: maximum batch size which can be used for inference for
 //   optimization targets inference run with max batch size.
 // - max_workspace_size_bytes: The upper bound of memory allowance for engine
@@ -99,9 +93,10 @@ tensorflow::Status ConvertGraphDefToTensorRT(
     const tensorflow::GraphDef& graph_def,
     const std::vector<string>& output_names, size_t max_batch_size,
     size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def,
-    int precision_mode = 1, int minimum_segment_size = 3,
-    bool is_dyn_op = false, int max_cached_engines = 1,
-    std::vector<int> cached_engine_batches = {}, bool use_calibration = true);
+    TrtPrecisionMode precision_mode = TrtPrecisionMode::FP32,
+    int minimum_segment_size = 3, bool is_dyn_op = false,
+    int max_cached_engines = 1, std::vector<int> cached_engine_batches = {},
+    bool use_calibration = true);
 
 // Method to call from optimization pass
 tensorflow::Status ConvertAfterShapes(ConversionParams& params);
@@ -123,4 +118,4 @@ std::pair<int, tensorflow::Allocator*> GetDeviceAndAllocator(
 #endif  // GOOGLE_TENSORRT
 #endif  // GOOGLE_CUDA
 
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_GRAPH_H_
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_CONVERT_GRAPH_H_
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc
similarity index 95%
rename from tensorflow/contrib/tensorrt/convert/convert_graph_test.cc
rename to tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc
index 2d2bfeb192c1893824c7b30bfad593c62c203392..1a754181debf41865190aa7f9ca6a76efea98181 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph_test.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc
@@ -13,13 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/convert/convert_graph.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/convert_graph.h"
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/cc/ops/standard_ops.h"
-#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/framework/tensor_shape.h"
@@ -75,7 +75,7 @@ TEST(TrtCandidateSelector, Basics) {
                                          feed, const_1, matmul_attrs);
 
   // Unsupported op.
-  auto unsupported_op = ops::Sin(s.WithOpName("sin"), feed);
+  auto unsupported_op = ops::Erf(s.WithOpName("sin"), feed);
 
   // Incompatible input.
   auto incompatible_feed = ops::Placeholder(s.WithOpName("feed"), DT_DOUBLE);
@@ -98,7 +98,8 @@ TEST(TrtCandidateSelector, Basics) {
   grappler::GraphProperties graph_properties(item);
   TF_EXPECT_OK(graph_properties.InferStatically(true));
 
-  for (const int precision_mode : {FP32MODE, INT8MODE}) {
+  for (const TrtPrecisionMode precision_mode :
+       {TrtPrecisionMode::FP32, TrtPrecisionMode::INT8}) {
     TrtCandidateSelector selector(graph_properties, precision_mode);
     TF_EXPECT_OK(selector.IsTensorRTCandidate(matmul.operation.node()));
     ExpectStatus(
@@ -107,13 +108,13 @@ TEST(TrtCandidateSelector, Basics) {
         "transpose_a is not supported for TensorRT FullyConnected "
         "(op: MatMul), at: incompatible_matmul");
     ExpectStatus(selector.IsTensorRTCandidate(unsupported_op.operation.node()),
-                 error::UNIMPLEMENTED, "Op type Sin is not supported");
+                 error::UNIMPLEMENTED, "Op type Erf is not supported");
     ExpectStatus(
         selector.IsTensorRTCandidate(
             matmul_with_incompatible_input.operation.node()),
         error::INTERNAL,
         "Failed to convert input with index 0 to a TRT_TensorOrWeights");
-    if (precision_mode == INT8MODE) {
+    if (precision_mode == TrtPrecisionMode::INT8) {
       TF_EXPECT_OK(selector.IsTensorRTCandidate(quantize.operation.node()));
     } else {
       ExpectStatus(selector.IsTensorRTCandidate(quantize.operation.node()),
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc
similarity index 73%
rename from tensorflow/contrib/tensorrt/convert/convert_nodes.cc
rename to tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc
index 777a80bbc4da7a260cf85d0a7bc5ec16f4cd3cab..9a2ac8c3e5f1d149baf5de25c940e24a8acc9125 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h"
 
 #include <algorithm>
 #include <cstring>
@@ -24,11 +24,13 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
-#include "tensorflow/contrib/tensorrt/convert/utils.h"
-#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_resource_manager.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_resources.h"
+#include "absl/strings/match.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_resources.h"
 #include "tensorflow/core/framework/node_def.pb.h"  // NOLINT
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/tensor.pb.h"        // NOLINT
@@ -43,6 +45,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/tensor_coding.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -80,10 +83,16 @@ namespace tensorrt {
 const char* const kInputPHName = "TensorRTInputPH_";
 const char* const kOutputPHName = "TensorRTOutputPH_";
 
+bool IsEngineInput(absl::string_view name) {
+  return absl::StartsWith(name, kInputPHName);
+}
+bool IsEngineOutput(absl::string_view name) {
+  return absl::StartsWith(name, kOutputPHName);
+}
+
 namespace convert {
-using ::tensorflow::str_util::Split;
-using ::tensorflow::strings::StrAppend;
-using ::tensorflow::strings::StrCat;
+using absl::StrAppend;
+using absl::StrCat;
 
 inline tensorflow::Status ConvertDType(tensorflow::DataType tf_dtype,
                                        nvinfer1::DataType* trt_dtype) {
@@ -183,6 +192,15 @@ Status ValidateTensorProperties(const string& producer_node_type,
   *trt_dims = TensorShapeToTrtDims(shape, /*ignore_first_dim=*/true);
   *batch_size = shape.dim_size(0);
 
+  // Don't convert empty tensors (dim value of 0).
+  for (int d = 1; d < shape.dims(); ++d) {
+    if (shape.dim_size(d) == 0) {
+      return errors::Unimplemented(
+          "Input tensor with shape ", shape.DebugString(),
+          " is an empty tensor, which is not supported by TRT");
+    }
+  }
+
   if (validation_only) return Status::OK();
   // Following are validations at runtime.
 
@@ -286,8 +304,8 @@ Status Converter::GetTrtBroadcastShape(
 
   const int max_nb_dims = nvinfer1::Dims::MAX_DIMS + 1;
   auto compute_output_dims =
-      [max_nb_dims](const TRT_TensorOrWeights& input, int broadcast_num_dims,
-                    int* output_dims_array, nvinfer1::Dims* output_dims) {
+      [](const TRT_TensorOrWeights& input, int broadcast_num_dims,
+         int* output_dims_array, nvinfer1::Dims* output_dims) {
         const nvinfer1::Dims input_dims = input.GetTrtDims();
         std::fill(output_dims_array, output_dims_array + max_nb_dims, 1);
         std::copy(input_dims.d, input_dims.d + input_dims.nbDims,
@@ -334,6 +352,67 @@ Status Converter::GetTrtBroadcastShape(
   return Status::OK();
 }
 
+nvinfer1::ITensor* Converter::CreateConstantLayer(
+    const TRT_ShapedWeights& weights, const nvinfer1::Dims& dims) {
+  nvinfer1::Weights trt_weights = weights.GetTrtWeights();
+  nvinfer1::IConstantLayer* layer = network()->addConstant(dims, trt_weights);
+  if (!layer) return nullptr;
+  const nvinfer1::DataType trt_dtype = trt_weights.type;
+  nvinfer1::ITensor* trt_tensor = layer->getOutput(0);
+  // TODO(laigd): there is a bug in TensorRT 5.0 library that, if we don't set
+  // the data type below, it will always be kFLOAT regardless what the data type
+  // of the weights is. Once NVIDIA fixes this bug, we should remove the data
+  // type setting logic below and test should still pass.
+  trt_tensor->setType(trt_dtype);
+  return trt_tensor;
+}
+
+tensorflow::Status CreateBroadcastableScalarConstant(
+    OpConverterParams* params, float value, const nvinfer1::Dims& dims,
+    const nvinfer1::ITensor** tensor) {
+  // In order to be broadcastable, the number of dims has to match.
+  nvinfer1::Dims broadcastable_dims(dims);
+  for (int i = 0; i < broadcastable_dims.nbDims; i++) {
+    broadcastable_dims.d[i] = 1;
+  }
+  TRT_ShapedWeights weights = params->weight_store->GetTempWeights(
+      tensorflow::DataType::DT_FLOAT, broadcastable_dims);
+  auto weights_ptr =
+      static_cast<float*>(const_cast<void*>(weights.GetValues()));
+  weights_ptr[0] = value;
+  *tensor = params->converter->CreateConstantLayer(weights, broadcastable_dims);
+  TFTRT_RETURN_ERROR_IF_NULLPTR(*tensor, params->node_def.name());
+  params->converter->ProvideQuantizationRange(
+      const_cast<nvinfer1::ITensor*>(*tensor), value, value);
+  return Status::OK();
+}
+
+// Convert an axis from TF format to TRT format while validating. TF format
+// includes the batch dimension, while TRT does not. TF can also use negative
+// indices.
+// TODO(tmorris): Use this method in more ops.
+tensorflow::Status ConvertAxis(int tf_axis, int trt_nb_dims,
+                               absl::string_view node_name, int* trt_axis) {
+  const int tf_nb_dims = trt_nb_dims + 1;
+  // Check bounds.
+  if (tf_axis < -tf_nb_dims || tf_axis >= tf_nb_dims) {
+    return tensorflow::errors::InvalidArgument(
+        "Axis value of ", tf_axis, " is out of bounds, must be in range [",
+        -tf_nb_dims, ", ", tf_nb_dims, "), at ", node_name);
+  }
+  // Make negative axis positive.
+  if (tf_axis < 0) tf_axis += tf_nb_dims;
+  // Don't allow axis to be the batch dimension.
+  if (tf_axis == 0) {
+    return tensorflow::errors::Unimplemented(
+        "TensorRT does not allow manipulation of the batch dimension, at ",
+        node_name);
+  }
+  // Remove batch dimension.
+  *trt_axis = tf_axis - 1;
+  return Status::OK();
+}
+
 inline bool DimsEqual(const nvinfer1::Dims& dim_l,
                       const nvinfer1::Dims& dim_r) {
   if (dim_l.nbDims != dim_r.nbDims) {
@@ -347,6 +426,15 @@ inline bool DimsEqual(const nvinfer1::Dims& dim_l,
   return true;
 }
 
+bool AllLengthsEqual(const std::vector<std::vector<int>>& inputs) {
+  if (inputs.size() == 0) return true;
+  int length = inputs.at(0).size();
+  for (int i = 1; i < inputs.size(); i++) {
+    if (inputs.at(i).size() != length) return false;
+  }
+  return true;
+}
+
 inline nvinfer1::Dims GetTrtDimsForTensor(const tensorflow::Tensor& tensor) {
   nvinfer1::Dims dims;
   dims.nbDims = tensor.dims();
@@ -484,6 +572,16 @@ class TRT_TensorOrWeights::SimpleITensor : public nvinfer1::ITensor {
   float getDynamicRange() const override { return 0; }
 #endif
 
+#if NV_TENSORRT_MAJOR > 5 || (NV_TENSORRT_MAJOR == 5 && NV_TENSORRT_MINOR >= 1)
+  bool dynamicRangeIsSet() const override { return true; }
+
+  void resetDynamicRange() override {}
+
+  float getDynamicRangeMin() const override { return 0.f; }
+
+  float getDynamicRangeMax() const override { return 0.f; }
+#endif
+
  private:
   nvinfer1::DataType trt_dtype_;
   nvinfer1::Dims trt_dims_;
@@ -632,6 +730,11 @@ bool TFAttrs::get<bool>(const string& key) const {
   return this->at(key)->b();
 }
 
+template <>
+int TFAttrs::get<int>(const string& key) const {
+  return this->at(key)->i();
+}
+
 // TODO(jie): reorder4 & reorder2 should be merged?
 // TODO(aaroey): fix the order of parameters.
 template <typename T>
@@ -843,7 +946,7 @@ Status TrtNodeValidator::ConvertConstToWeights(
 }
 
 Converter::Converter(nvinfer1::INetworkDefinition* trt_network,
-                     int precision_mode, bool use_calibration)
+                     TrtPrecisionMode precision_mode, bool use_calibration)
     : trt_network_(trt_network),
       precision_mode_(precision_mode),
       use_calibration_(use_calibration) {
@@ -870,13 +973,15 @@ Status Converter::ConvertNode(const NodeDef& node_def) {
   for (size_t i = 0; i < outputs.size(); ++i) {
     TRT_TensorOrWeights& output = outputs[i];
     string output_name = node_def.name();
-    if (i != 0) output_name = StrCat(output_name, ":", i);
+    if (i != 0) absl::StrAppend(&output_name, ":", i);
     // We need to check the name before setting it. If the input is one of the
     // engine input, setting the name here will overwrite engine input
     // bindings which will cause runtime error.
+    // TODO(tmorris): Remove this work-around once we use TRT's IIdentityLayer
+    // in ConvertIdentity.
     if (output.is_tensor()) {
       const char* tensor_name = output.tensor()->getName();
-      if (!tensorflow::str_util::StartsWith(tensor_name, kInputPHName)) {
+      if (!IsEngineInput(tensor_name)) {
         // TRT initializes tensor names as "(Unnamed ITensor* N)". We rename
         // them to match their corresponding TensorFlow name.
         // Note: ITensors that we create internally within TF-TRT which are
@@ -922,22 +1027,45 @@ Status Converter::AddInputTensor(const string& name, nvinfer1::DataType dtype,
 }
 
 Status Converter::RenameAndMarkOutputTensors(
-    const std::vector<std::pair<string, string>>& output_tensors) {
+    const std::vector<Converter::EngineOutputInfo>& output_tensors) {
   for (const auto& output : output_tensors) {
     TRT_TensorOrWeights tensor_or_weights;
-    TF_RETURN_IF_ERROR(GetTensorOrWeights(output.first, &tensor_or_weights));
+    TF_RETURN_IF_ERROR(
+        GetTensorOrWeights(output.source_tensor_name, &tensor_or_weights));
     if (!tensor_or_weights.is_tensor()) {
-      return errors::InvalidArgument("Output ", output.first,
+      return errors::InvalidArgument("Output ", output.source_tensor_name,
                                      " is weights not tensor");
     }
     nvinfer1::ITensor* tensor = tensor_or_weights.tensor();
     if (tensor == nullptr) {
-      return errors::NotFound("Output tensor not found: ", output.first);
+      return errors::NotFound("Output tensor not found: ",
+                              output.source_tensor_name);
     }
-    tensor->setName(output.second.c_str());
-    VLOG(1) << "Marking output tensor " << output.first << ", as output tensor "
-            << output.second;
+    // Check if this tensor has already been marked as an input or output.
+    //
+    // ConvertIdentity can cause the same tensor to be repeated in
+    // output_tensors, which can cause us to overwrite the name of the output
+    // tensor binding. For example, if we rename OutputPH_0 to OutputPH_1 then
+    // we won't be able to locate OutputPH_0 during runtime. To fix this,
+    // duplicate the tensor using no-op shuffle.
+    //
+    // TODO(tmorris): Remove this work-around once we use TRT's IIdentityLayer
+    // in ConvertIdentity.
+    if (IsEngineInput(tensor->getName()) || IsEngineOutput(tensor->getName())) {
+      // Using shuffle layer for identity by not setting reshape or transpose.
+      nvinfer1::IShuffleLayer* layer = network()->addShuffle(*tensor);
+      TFTRT_RETURN_ERROR_IF_NULLPTR(
+          layer, StrCat("Output Copy for ", tensor->getName()));
+      MarkQuantizationRangesAsInferrable(tensor, layer->getOutput(0));
+      tensor = layer->getOutput(0);
+    }
+    tensor->setName(output.dest_node_name.c_str());
     network()->markOutput(*tensor);
+    // Set type after marking as output. TRT only supports setType for engine
+    // outputs and inputs (type is inferred otherwise).
+    tensor->setType(output.trt_dtype);
+    VLOG(1) << "Marking output TRT tensor " << output.source_tensor_name
+            << ", which feeds TF node " << output.dest_node_name;
   }
   return Status::OK();
 }
@@ -1081,11 +1209,9 @@ Status Converter::PrepareTensorForShape(const TRT_TensorOrWeights& input,
       *tensor = layer->getOutput(0);
     }
   } else {
-    nvinfer1::IConstantLayer* layer =
-        this->network()->addConstant(dims, input.weights().GetTrtWeights());
-    TFTRT_RETURN_ERROR_IF_NULLPTR(layer, "TF-TRT Internal Reshape");
-    *tensor = layer->getOutput(0);
-    if (precision_mode() == INT8MODE && !use_calibration()) {
+    *tensor = CreateConstantLayer(input.weights(), dims);
+    TFTRT_RETURN_ERROR_IF_NULLPTR(*tensor, "TF-TRT Internal Reshape");
+    if (precision_mode() == TrtPrecisionMode::INT8 && !use_calibration()) {
       // If we are in int8 mode and not calibrating, we need to explicitly set a
       // quantization range for the output tensor of the IConstantLayer. Here we
       // set the range to [min(weights), max(weights)].
@@ -1120,7 +1246,7 @@ void Converter::ProvideQuantizationRange(nvinfer1::ITensor* tensor,
 }
 
 void Converter::MaybeApplyQuantizationRanges() {
-  if (precision_mode() != INT8MODE) return;
+  if (precision_mode() != TrtPrecisionMode::INT8) return;
 
   // Infer ranges across marked ops.
   PropagateQuantizationRanges();
@@ -1243,6 +1369,39 @@ Status Converter::GetInputs(const tensorflow::NodeDef& node_def,
   return tensorflow::Status::OK();
 }
 
+// Checks that the number of inputs match, and enforces that the inputs marked
+// as true are constant weights. true means that the input must be a weight,
+// while false means the input must be a tensor. In the future, false will mean
+// the input can be a tensor or weight.
+tensorflow::Status CheckInputsWeights(
+    const OpConverterParams& params,
+    const std::vector<std::pair<string, bool>>& inputs_is_weight) {
+  const auto& inputs = params.inputs;
+  const auto& node_def = params.node_def;
+  if (inputs.size() != inputs_is_weight.size()) {
+    return tensorflow::errors::InvalidArgument(
+        node_def.op(), " got ", inputs.size(), " inputs but expected ",
+        inputs_is_weight.size(), ", at ", node_def.name());
+  }
+  for (int i = 0; i < inputs.size(); i++) {
+    if (inputs_is_weight[i].second && inputs.at(i).is_tensor()) {
+      return tensorflow::errors::Unimplemented(
+          "The input \"", inputs_is_weight[i].first, "\" for ", node_def.op(),
+          " must be a constant, at ", node_def.name());
+    }
+    // TODO(tmorris): Remove this check and provide a method to automatically
+    // retrive an input as a tensor, converting via CreateConstantLayer if it
+    // was originally a weight. We will want a caching mechanism to prevent many
+    // duplicate constants from being created.
+    if (!inputs_is_weight[i].second && inputs.at(i).is_weights()) {
+      return tensorflow::errors::Unimplemented(
+          "The input \"", inputs_is_weight[i].first, "\" for ", node_def.op(),
+          " must be a tensor, at ", node_def.name());
+    }
+  }
+  return tensorflow::Status::OK();
+}
+
 TRT_ShapedWeights ConvertFP32ToFP16(TrtWeightStore* store,
                                     const TRT_ShapedWeights& weights_src) {
   auto dtype_new = tensorflow::DataType::DT_HALF;
@@ -1435,7 +1594,7 @@ Status BinaryTensorOpWeight(OpConverterParams* params,
         const_cast<nvinfer1::ITensor*>(tensor), permutation, &tensor));
   }
 
-  if (params->converter->precision_mode() == FP16MODE) {
+  if (params->converter->precision_mode() == TrtPrecisionMode::FP16) {
     weights = ConvertFP32ToFP16(params->weight_store, weights);
   }
 
@@ -1478,7 +1637,7 @@ Status BinaryTensorOpWeight(OpConverterParams* params,
       // Because of this issue, fall back to BinaryTensorOpTensor if we are
       // doing INT8 with no calibration. There is most likely no performance
       // penalty by falling back here.
-      if (params->converter->precision_mode() == INT8MODE &&
+      if (params->converter->precision_mode() == TrtPrecisionMode::INT8 &&
           !params->converter->use_calibration()) {
         return errors::Unimplemented(
             "Intermediate quantization range cannot be determined without"
@@ -1528,80 +1687,126 @@ Status BinaryTensorOpWeight(OpConverterParams* params,
   return tensorflow::Status::OK();
 }
 
-enum class ConvolutionType { DEFAULT, DEPTHWISE_CONV };
-
-tensorflow::Status ConvertConv2DHelper(OpConverterParams* params, int group) {
+tensorflow::Status ConvertConv2DHelper(OpConverterParams* params, int group,
+                                       bool is_conv2d_backprop_input) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
-  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
+  TRT_TensorOrWeights backprop_output_size;
+  const nvinfer1::ITensor* tensor = nullptr;
+  if (is_conv2d_backprop_input) {
+    // In the case when Conv2dBackpropInput is used for conv2d_transpose, these
+    // inputs correspond to: output size, filter, and input.
+    TF_RETURN_IF_ERROR(CheckInputsWeights(
+        *params,
+        {{"input_sizes", true}, {"filter", true}, {"out_backprop", false}}));
+    backprop_output_size = inputs.at(0);
+    tensor = inputs.at(2).tensor();
+  } else {
+    TF_RETURN_IF_ERROR(
+        CheckInputsWeights(*params, {{"input", false}, {"filter", true}}));
+    tensor = inputs.at(0).tensor();
+  }
+  TRT_ShapedWeights weights_rsck = inputs.at(1).weights();
+  if (weights_rsck.shape_.nbDims != 4) {
+    return tensorflow::errors::InvalidArgument(
+        "Conv2D expects kernel of dimension 4, at " + node_def.name());
+  }
   TFAttrs attrs(node_def);
-
-  int h_index = 2;
-  int w_index = 3;
   auto data_format = attrs.get<string>("data_format");
-  if (data_format == "NHWC") {
+  int c_index = (data_format == "NHWC") ? 3 : 1;
+  int h_index = (data_format == "NHWC") ? 1 : 2;
+  int w_index = (data_format == "NHWC") ? 2 : 3;
+  auto tf_dilations = attrs.get<std::vector<int>>("dilations");
+  if (tf_dilations.size() != 4) {
+    return tensorflow::errors::InvalidArgument(
+        "Convolution dilations field must specify 4 dimensions, at ",
+        node_def.name());
+  }
+  if (tf_dilations[0] != 1 || tf_dilations[c_index] != 1) {
+    return tensorflow::errors::Unimplemented(
+        "Dilation rate must be 1 for batch and channel dimensions, at ",
+        node_def.name());
+  }
+  const nvinfer1::DimsHW dilation(tf_dilations[h_index], tf_dilations[w_index]);
+  if (is_conv2d_backprop_input && (dilation.d[0] != 1 || dilation.d[1] != 1)) {
+    return tensorflow::errors::Unimplemented(
+        "Dilation with Conv2DBackpropInput (conv2d_transpose) is not supported",
+        ", at ", node_def.name());
+  }
+
+  const auto tf_stride = attrs.get<std::vector<int>>("strides");
+  if (tf_stride.size() != 4) {
+    return tensorflow::errors::InvalidArgument(
+        "Convolution strides field must specify 4 dimensions, at ",
+        node_def.name());
+  }
+  if (tf_stride[0] != 1 || tf_stride[c_index] != 1) {
+    return tensorflow::errors::Unimplemented(
+        "Stride must be 1 for batch and channel dimensions, at ",
+        node_def.name());
+  }
+  const nvinfer1::DimsHW stride(tf_stride[h_index], tf_stride[w_index]);
+  if (params->validation_only) return tensorflow::Status::OK();
+
+  // Transpose to NCHW (NCHW is required for IConvLayer).
+  const bool need_transpose = (data_format == "NHWC");
+  if (need_transpose) {
     TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
         const_cast<nvinfer1::ITensor*>(tensor), {0, 3, 1, 2}, &tensor));
-    h_index = 1;
-    w_index = 2;
-    // TODO(jie): transpose it
   }
-
-  // tensor after transpose (NCHW)
+  // Dimensions of transposed tensor.
   const auto tensor_dim = tensor->getDimensions();
 
-  int num_groups = group;
-  if (num_groups == 0) num_groups = tensor_dim.d[0];  // depthwise convolution
-  VLOG(2) << "groups count: " << num_groups;
+  // group == 0 signifies that this is a depthwise convolution, so set
+  // num_groups to size of input's channel dim. For a non-depthwise conv,
+  // num_groups will be 1.
+  const int num_groups = (group == 0) ? tensor_dim.d[0] : group;
 
-  TRT_ShapedWeights weights_rsck = inputs.at(1).weights();
-  VLOG(2) << "weight shape: " << weights_rsck.DebugString();
-  if (weights_rsck.shape_.nbDims != 4) {
-    return tensorflow::errors::Internal(
-        "Conv2D expects kernel of dimension 4, at: " + node_def.name());
-  }
-  if (params->converter->precision_mode() == FP16MODE) {
-    weights_rsck =
-        ConvertFP32ToFP16(params->weight_store, inputs.at(1).weights());
+  if (params->converter->precision_mode() == TrtPrecisionMode::FP16) {
+    weights_rsck = ConvertFP32ToFP16(params->weight_store, weights_rsck);
   }
-
+  // For conv, TF weights are RSCK, and TRT expects KCRS.
+  // For backprop, TF weights are RSKC, and TRT expects CKRS.
+  // Therefore, this reorder will work for both cases.
   TRT_ShapedWeights weights =
       params->weight_store->GetTempWeights(weights_rsck);
   ReorderRSCKToKCRS(weights_rsck, &weights, num_groups);
   TRT_ShapedWeights biases(weights.type_);
-  const int noutput = weights.shape_.d[0] * num_groups;
+  const int output_axis = is_conv2d_backprop_input ? 1 : 0;
+  const int noutput = weights.shape_.d[output_axis] * num_groups;
   nvinfer1::DimsHW kernel_size;
   kernel_size.h() = weights.shape_.d[2];
   kernel_size.w() = weights.shape_.d[3];
-  VLOG(2) << "RSCK: " << weights.DebugString();
-  VLOG(2) << "kernel size: " << kernel_size.h() << ", " << kernel_size.w();
-
-  // TODO(jie): stride. (NHWC/NCHW)
-  const auto tf_stride = attrs.get<std::vector<int>>("strides");
-  VLOG(2) << "h_INDEX" << h_index << ", w_index " << w_index;
-  VLOG(2) << "stride: " << tf_stride[0] << tf_stride[1] << tf_stride[2]
-          << tf_stride[3];
-  const nvinfer1::DimsHW stride(tf_stride[h_index], tf_stride[w_index]);
 
+  // Add padding.
   std::vector<std::pair<int, int>> padding;
-  // TODO(jie): padding.
   if (attrs.get<string>("padding") == "SAME") {
-    // This is NCHW tensor with no batch dimension.
-    //  1 -> h
-    //  2 -> w
-    padding = CreateSamePadding(
-        stride, kernel_size,
-        {static_cast<int>(tensor_dim.d[1]), static_cast<int>(tensor_dim.d[2])});
+    nvinfer1::DimsHW effective_kernel_size = kernel_size;
+    effective_kernel_size.h() += (kernel_size.h() - 1) * (dilation.h() - 1);
+    effective_kernel_size.w() += (kernel_size.w() - 1) * (dilation.w() - 1);
+    std::vector<int64_t> input_dims;
+    if (is_conv2d_backprop_input) {
+      // For backprop, calculate padding based on "input_sizes" input, which
+      // actually corresponds to output size. ("input_sizes" makes sense in the
+      // context of Conv2DBackpropInput).
+      // We use h_index and w_index instead of 1 and 2 because we havent
+      // transposed backprop_output_size along with the input.
+      auto output_size_weights = static_cast<int*>(
+          const_cast<void*>(backprop_output_size.weights().GetValues()));
+      input_dims = {output_size_weights[h_index], output_size_weights[w_index]};
+    } else {
+      // Use 1 and 2 because tensor_dim has the dimensions of the transposed
+      // input.
+      input_dims = {static_cast<int>(tensor_dim.d[1]),
+                    static_cast<int>(tensor_dim.d[2])};
+    }
+    padding = CreateSamePadding(stride, effective_kernel_size, input_dims);
   } else {
     padding = {{0, 0}, {0, 0}};
   }
-
   if (padding[0].first != padding[0].second ||
       padding[1].first != padding[1].second) {
-    // TODO(jie): handle asymmetric padding
-    VLOG(2) << "Padding!!!: " << padding[0].first << padding[0].second
-            << padding[1].first << padding[1].second;
-    VLOG(2) << "TENSOR before: " << DebugString(tensor->getDimensions());
+    // Handle asymmetric padding.
     auto pad_layer = params->converter->network()->addPadding(
         *const_cast<nvinfer1::ITensor*>(tensor),
         nvinfer1::DimsHW(padding[0].first, padding[1].first),
@@ -1611,24 +1816,38 @@ tensorflow::Status ConvertConv2DHelper(OpConverterParams* params, int group) {
         const_cast<nvinfer1::ITensor*>(tensor), pad_layer->getOutput(0));
     padding = {{0, 0}, {0, 0}};
     tensor = pad_layer->getOutput(0);
-    VLOG(2) << "TENSOR after: " << DebugString(tensor->getDimensions());
   }
 
-  nvinfer1::IConvolutionLayer* layer =
-      params->converter->network()->addConvolution(
-          *const_cast<nvinfer1::ITensor*>(tensor), noutput, kernel_size,
-          weights.GetTrtWeights(), biases.GetTrtWeights());
-  TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
+  // Add convolution.
+  nvinfer1::ILayer* conv_layer = nullptr;
+  if (is_conv2d_backprop_input) {
+    nvinfer1::IDeconvolutionLayer* layer =
+        params->converter->network()->addDeconvolution(
+            *const_cast<nvinfer1::ITensor*>(tensor), noutput, kernel_size,
+            weights.GetTrtWeights(), biases.GetTrtWeights());
+    TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
+    layer->setStride(stride);
+    layer->setPadding({padding[0].first, padding[1].first});
+    layer->setName(node_def.name().c_str());
+    layer->setNbGroups(num_groups);
+    conv_layer = layer;
+  } else {
+    nvinfer1::IConvolutionLayer* layer =
+        params->converter->network()->addConvolution(
+            *const_cast<nvinfer1::ITensor*>(tensor), noutput, kernel_size,
+            weights.GetTrtWeights(), biases.GetTrtWeights());
+    TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
+    layer->setStride(stride);
+    layer->setPadding({padding[0].first, padding[1].first});
+    layer->setName(node_def.name().c_str());
+    layer->setNbGroups(num_groups);
+    layer->setDilation(dilation);
+    conv_layer = layer;
+  }
+  const nvinfer1::ITensor* output_tensor = conv_layer->getOutput(0);
 
-  layer->setStride(stride);
-  layer->setPadding({padding[0].first, padding[1].first});
-  layer->setName(node_def.name().c_str());
-  layer->setNbGroups(num_groups);
-  const nvinfer1::ITensor* output_tensor = layer->getOutput(0);
-  VLOG(2) << "TENSOR out: " << DebugString(output_tensor->getDimensions());
-  VLOG(2) << "data_format: " << data_format;
-  if (data_format == "NHWC") {
-    // TODO(jie): transpose it back!
+  // Restore transpose.
+  if (need_transpose) {
     TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
         const_cast<nvinfer1::ITensor*>(output_tensor), {0, 2, 3, 1},
         &output_tensor));
@@ -1638,18 +1857,6 @@ tensorflow::Status ConvertConv2DHelper(OpConverterParams* params, int group) {
   return tensorflow::Status::OK();
 }
 
-tensorflow::Status ConvertConv2DHelper(OpConverterParams* params,
-                                       ConvolutionType type) {
-  switch (type) {
-    case ConvolutionType::DEFAULT:
-      return ConvertConv2DHelper(params, 1);
-    case ConvolutionType::DEPTHWISE_CONV:
-      return ConvertConv2DHelper(params, 0);
-  }
-  return tensorflow::errors::Unimplemented("unsupported convolution type at, " +
-                                           params->node_def.name());
-}
-
 Status BinaryTensorOpTensor(OpConverterParams* params,
                             const TRT_TensorOrWeights& operand_l,
                             const TRT_TensorOrWeights& operand_r) {
@@ -1677,6 +1884,13 @@ Status BinaryTensorOpTensor(OpConverterParams* params,
         "Unsupported binary op broadcast scheme for op ", node_def.name(), ": ",
         status.error_message());
   }
+  TFAttrs attrs(node_def);
+  nvinfer1::DataType dtype = attrs.get<nvinfer1::DataType>("T");
+  if (dtype == nvinfer1::DataType::kINT32) {
+    return errors::Unimplemented("Binary op ", node_def.op(),
+                                 " does not support INT32, at ",
+                                 node_def.name());
+  }
   if (params->validation_only) return Status::OK();
 
   const nvinfer1::ITensor* tensor_l = nullptr;
@@ -1693,8 +1907,6 @@ Status BinaryTensorOpTensor(OpConverterParams* params,
   }
 
   // Check type consistency.
-  TFAttrs attrs(node_def);
-  nvinfer1::DataType dtype = attrs.get<nvinfer1::DataType>("T");
   TFTRT_CHECK_EQ_TYPE(tensor_l->getType(), dtype)
       << DebugString(tensor_l->getType()) << " vs " << DebugString(dtype);
   TFTRT_CHECK_EQ_TYPE(tensor_r->getType(), dtype)
@@ -1754,12 +1966,8 @@ tensorflow::Status ConvertPlugin(OpConverterParams* params) {
 
 tensorflow::Status ConvertTranspose(OpConverterParams* params) {
   const auto& inputs = params->inputs;
-  if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
-      !inputs.at(1).is_weights()) {
-    return tensorflow::errors::InvalidArgument(
-        "Input expects tensor and weights, at ", params->node_def.name());
-  }
-
+  TF_RETURN_IF_ERROR(
+      CheckInputsWeights(*params, {{"x", false}, {"perm", true}}));
   // Get the permutation from weights.
   TRT_ShapedWeights weights = inputs.at(1).weights();
   const int* weights_ptr =
@@ -1792,11 +2000,8 @@ tensorflow::Status ConvertTranspose(OpConverterParams* params) {
 tensorflow::Status ConvertReshape(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
-  if (inputs.size() != 2 || !inputs.at(1).is_weights()) {
-    return tensorflow::errors::InvalidArgument(
-        "Input expects weights for shape, at ", node_def.name());
-  }
-
+  TF_RETURN_IF_ERROR(
+      CheckInputsWeights(*params, {{"tensor", false}, {"shape", true}}));
   TRT_TensorOrWeights input_tensor = inputs.at(0);
   TRT_ShapedWeights weights = inputs.at(1).weights();
   if (weights.count() == 0) {
@@ -1892,18 +2097,8 @@ tensorflow::Status ConvertReshape(OpConverterParams* params) {
 tensorflow::Status ConvertExpandDims(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
-  if (inputs.size() != 2) {
-    return tensorflow::errors::InvalidArgument(
-        "Two inputs expected for ExpandDims, at ", node_def.name());
-  }
-  if (inputs.at(0).is_weights()) {
-    return tensorflow::errors::Unimplemented(
-        "ExpandDims expects tensor for input, at ", node_def.name());
-  }
-  if (!inputs.at(1).is_weights()) {
-    return tensorflow::errors::InvalidArgument(
-        "ExpandDims expects weights for axis, at ", node_def.name());
-  }
+  TF_RETURN_IF_ERROR(
+      CheckInputsWeights(*params, {{"input", false}, {"axis", true}}));
   // Get input shape as vector.
   TRT_TensorOrWeights input_tensor = inputs.at(0);
   const nvinfer1::Dims dims = input_tensor.GetTrtDims();
@@ -1953,14 +2148,7 @@ tensorflow::Status ConvertExpandDims(OpConverterParams* params) {
 tensorflow::Status ConvertSqueeze(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
-  if (inputs.size() != 1) {
-    return tensorflow::errors::InvalidArgument(
-        "One input expected for Squeeze, at ", node_def.name());
-  }
-  if (inputs.at(0).is_weights()) {
-    return tensorflow::errors::Unimplemented(
-        "Squeeze expects tensor for input, at ", node_def.name());
-  }
+  TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}}));
   // Get input shape.
   TRT_TensorOrWeights input_tensor = inputs.at(0);
   const nvinfer1::Dims dims = input_tensor.GetTrtDims();
@@ -1971,7 +2159,7 @@ tensorflow::Status ConvertSqueeze(OpConverterParams* params) {
   // Mark axes to remove by setting them to 0.
   TFAttrs attrs(node_def);
   auto squeeze_dims = attrs.get<std::vector<int>>("squeeze_dims");
-  if (squeeze_dims.size() == 0) {
+  if (squeeze_dims.empty()) {
     return tensorflow::errors::Unimplemented(
         "Squeeze is only implemented for explicit dims, at ", node_def.name());
   }
@@ -2016,20 +2204,371 @@ tensorflow::Status ConvertSqueeze(OpConverterParams* params) {
   return tensorflow::Status::OK();
 }
 
+tensorflow::Status ConvertStridedSliceHelper(OpConverterParams* params,
+                                             const TRT_TensorOrWeights& input,
+                                             std::vector<int> begin,
+                                             std::vector<int> size,
+                                             const std::vector<int>& stride) {
+  const auto& node_def = params->node_def;
+  // Get input dims.
+  nvinfer1::Dims dims = input.GetTrtDims();
+  std::vector<int> input_dims(dims.d, dims.d + dims.nbDims);
+  // Temporarily add batch dimension so that indexes line up properly.
+  input_dims.insert(input_dims.begin(), -1);
+  // Check bounds.
+  for (int i = 1; i < input_dims.size(); i++) {
+    if (begin[i] < 0 || begin[i] > input_dims[i]) {
+      return tensorflow::errors::InvalidArgument(
+          "\"begin\" for dimension ", std::to_string(i), " in ", node_def.op(),
+          " is out of range, at ", node_def.name());
+    }
+    const int end = begin[i] + size[i];
+    if (end < 0 || end > input_dims[i]) {
+      return tensorflow::errors::InvalidArgument(
+          "\"begin\" + \"size\" for dimension ", std::to_string(i), " in ",
+          node_def.op(), " is out of range, at ", node_def.name());
+    }
+    if (size[i] <= 0) {
+      return tensorflow::errors::InvalidArgument(
+          "\"size\" cannot be negative or zero for ", node_def.op(), ", at ",
+          node_def.name());
+    }
+  }
+// TRT 5.1 adds a slice layer. For older versions, we attempt to use the
+// padding layer with negative padding.
+#if NV_TENSORRT_MAJOR > 5 || (NV_TENSORRT_MAJOR == 5 && NV_TENSORRT_MINOR >= 1)
+  // Use ISliceLayer.
+  nvinfer1::Dims begin_dims, size_dims, stride_dims;
+  TF_RETURN_IF_ERROR(TensorShapeArrayToTrtDims(begin, &begin_dims,
+                                               /*ignore_first_dim=*/true));
+  TF_RETURN_IF_ERROR(TensorShapeArrayToTrtDims(size, &size_dims,
+                                               /*ignore_first_dim=*/true));
+  TF_RETURN_IF_ERROR(TensorShapeArrayToTrtDims(stride, &stride_dims,
+                                               /*ignore_first_dim=*/true));
+  if (params->validation_only) return Status::OK();
+
+  nvinfer1::ISliceLayer* layer = params->converter->network()->addSlice(
+      *const_cast<nvinfer1::ITensor*>(input.tensor()), begin_dims, size_dims,
+      stride_dims);
+  params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0)));
+  return tensorflow::Status::OK();
+#else
+  // Use IPaddingLayer.
+  // Strides must be 1 in this case.
+  for (int x : stride) {
+    if (x != 1) {
+      return tensorflow::errors::Unimplemented(
+          "Strides other than 1 are not supported with this version of TRT, "
+          "at ",
+          node_def.name());
+    }
+  }
+  // Rank must be 2, 3 or 4.
+  if (input_dims.size() > 4) {
+    return tensorflow::errors::Unimplemented(node_def.op(),
+                                             " for tensors with rank > 4 is "
+                                             "not supported in this version of "
+                                             "TRT, at ",
+                                             node_def.name());
+  }
+  // Reshape if necessary to 4-D, since IPaddingLayer requires a 4-D input.
+  const bool need_reshape = (input_dims.size() != 4);
+  int reshape_dims_added = 0;
+  nvinfer1::Dims reshape_dims;
+  if (need_reshape) {
+    // Add new dims after batch dim until tensor is 4D.
+    while (input_dims.size() < 4) {
+      input_dims.insert(input_dims.begin() + 1, 1);
+      begin.insert(begin.begin() + 1, 0);
+      size.insert(size.begin() + 1, 1);
+      reshape_dims_added++;
+    }
+    TF_RETURN_IF_ERROR(TensorShapeArrayToTrtDims(input_dims, &reshape_dims,
+                                                 /*ignore_first_dim=*/true));
+  }
+  // Find dimensions which need to be sliced.
+  std::vector<int> pad_dims;
+  for (int i = 1; i < input_dims.size(); i++) {
+    if ((begin[i] != 0) || (begin[i] + size[i] != input_dims[i])) {
+      pad_dims.push_back(i);
+    }
+  }
+  if (pad_dims.empty()) {
+    // No dimensions are changed, so this is a no-op. We could just return the
+    // input without creating a new layer. TRT will crash if an empty engine
+    // with no layers is attempted to be created, so we add a no-op shuffle to
+    // prevent our unit tests from breaking.
+    // TODO(tmorris): Allow empty engines in the unit tests and return the input
+    // as output here.
+    if (params->validation_only) return Status::OK();
+    nvinfer1::IShuffleLayer* layer = params->converter->network()->addShuffle(
+        *const_cast<nvinfer1::ITensor*>(input.tensor()));
+    params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0)));
+    return tensorflow::Status::OK();
+  } else if (pad_dims.size() == 1) {
+    // Only one dim is modified but we have to have 2, mark a second dim which
+    // will have padding of 0. The dim we add is chosen to avoid an unecessary
+    // transpose.
+    if (pad_dims[0] != 2) {
+      pad_dims.push_back(2);
+    } else {
+      pad_dims.push_back(3);
+    }
+  } else if (pad_dims.size() > 2) {
+    return tensorflow::errors::Unimplemented(
+        node_def.op(),
+        " can only modify up to 2 dimensions in this version of TRT, at ",
+        node_def.name());
+  }
+  std::sort(pad_dims.begin(), pad_dims.end());
+  // Convert to pre/post padding values. Since TRT does not have a StridedSlice
+  // or Slice layer prior to 5.1, we instead create an IPaddingLayer with
+  // negative padding.
+  nvinfer1::DimsHW pre_padding, post_padding;
+  for (int i = 0; i < pad_dims.size(); i++) {
+    const int axis = pad_dims[i];
+    pre_padding.d[i] = -begin[axis];
+    post_padding.d[i] = (begin[axis] + size[axis]) - input_dims[axis];
+  }
+
+  // IPaddingLayer will always apply the padding to dims 2,3 (input format is
+  // NCHW).
+  const bool need_transpose = !(pad_dims[0] == 2 && pad_dims[1] == 3);
+  std::vector<int> transpose_order(input_dims.size());
+  std::vector<int> inv_transpose_order(input_dims.size());
+  if (need_transpose) {
+    if (pad_dims[0] == 1 && pad_dims[1] == 3) {
+      transpose_order = {0, 2, 1, 3};
+      inv_transpose_order = {0, 2, 1, 3};
+    } else if (pad_dims[0] == 1 && pad_dims[1] == 2) {
+      transpose_order = {0, 3, 1, 2};
+      inv_transpose_order = {0, 2, 3, 1};
+    }
+  }
+  if (params->validation_only) return Status::OK();
+
+  // Start conversion.
+  nvinfer1::ITensor* tensor = const_cast<nvinfer1::ITensor*>(input.tensor());
+  if (need_reshape) {
+    const nvinfer1::ITensor* output_tensor = nullptr;
+    TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape(
+        input, reshape_dims, &output_tensor));
+    tensor = const_cast<nvinfer1::ITensor*>(output_tensor);
+  }
+  if (need_transpose) {
+    const nvinfer1::ITensor* output_tensor = nullptr;
+    TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
+        tensor, transpose_order, &output_tensor));
+    tensor = const_cast<nvinfer1::ITensor*>(output_tensor);
+  }
+  // Add padding layer
+  nvinfer1::IPaddingLayer* layer = params->converter->network()->addPadding(
+      *const_cast<nvinfer1::ITensor*>(tensor), pre_padding, post_padding);
+  TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
+  params->converter->MarkQuantizationRangesAsInferrable(tensor,
+                                                        layer->getOutput(0));
+  tensor = layer->getOutput(0);
+  // Restore transpose
+  if (need_transpose) {
+    const nvinfer1::ITensor* output_tensor = nullptr;
+    TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
+        tensor, inv_transpose_order, &output_tensor));
+    tensor = const_cast<nvinfer1::ITensor*>(output_tensor);
+  }
+  // Restore reshape
+  if (need_reshape) {
+    // Calculate output dimensions
+    for (int i = 0; i < pad_dims.size(); i++) {
+      const int axis = pad_dims[i];
+      input_dims[axis] = size[axis];
+    }
+    // Remove added 1 dimensions
+    for (int i = 0; i < reshape_dims_added; i++) {
+      int value = input_dims[1];
+      if (value != 1) {
+        return tensorflow::errors::Internal(
+            "StridedSlice error when reshaping, at ", node_def.name());
+      }
+      input_dims.erase(input_dims.begin() + 1);
+    }
+
+    nvinfer1::Dims new_dims;
+    TF_RETURN_IF_ERROR(TensorShapeArrayToTrtDims(input_dims, &new_dims,
+                                                 /*ignore_first_dim=*/true));
+    const nvinfer1::ITensor* output_tensor = nullptr;
+    TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape(
+        TRT_TensorOrWeights(tensor), new_dims, &output_tensor));
+    tensor = const_cast<nvinfer1::ITensor*>(output_tensor);
+  }
+
+  params->outputs->push_back(
+      TRT_TensorOrWeights(const_cast<nvinfer1::ITensor*>(tensor)));
+  return tensorflow::Status::OK();
+#endif
+}
+
+tensorflow::Status ConvertSlice(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
+  TF_RETURN_IF_ERROR(CheckInputsWeights(
+      *params, {{"input", false}, {"begin", true}, {"size", true}}));
+  std::vector<int> begin = inputs.at(1).weights().ToVector<int>();
+  std::vector<int> size = inputs.at(2).weights().ToVector<int>();
+  // Get input dims.
+  nvinfer1::Dims dims = inputs.at(0).GetTrtDims();
+  std::vector<int> input_dims(dims.d, dims.d + dims.nbDims);
+  // Add batch dimension so that indexes line up properly.
+  input_dims.insert(input_dims.begin(), inputs.at(0).batch_size());
+  if (!AllLengthsEqual({input_dims, begin, size})) {
+    return tensorflow::errors::InvalidArgument(
+        "Length of begin and size arguments must equal rank of input for "
+        "Slice, at ",
+        node_def.name());
+  }
+  // Check that batch dimension is unmodified.
+  const bool begin_is_modified = begin[0] != 0;
+  // If size[0]s is not -1, we can only know if the batch dimension is
+  // unmodified when the batch size is defined. When the batch size is
+  // undefined, we don't convert to be safe.
+  const bool batch_size_is_defined = input_dims[0] > 0;
+  const bool size_is_modified =
+      size[0] != -1 && (!batch_size_is_defined ||
+                        (batch_size_is_defined && size[0] != input_dims[0]));
+  if (begin_is_modified || size_is_modified) {
+    return tensorflow::errors::Unimplemented(
+        "TensorRT does not allow modifications to the batch dimension, at ",
+        node_def.name());
+  }
+  // Size of -1 signifies to take all remaining elements.
+  for (int i = 1; i < input_dims.size(); i++) {
+    if (size[i] == -1) {
+      size[i] = input_dims[i] - begin[i];
+    }
+  }
+  // Stride is 1 for all dims.
+  std::vector<int> stride(begin.size(), 1);
+  return ConvertStridedSliceHelper(params, inputs.at(0), begin, size, stride);
+}
+
+tensorflow::Status ConvertStridedSlice(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
+  TF_RETURN_IF_ERROR(CheckInputsWeights(
+      *params,
+      {{"input", false}, {"begin", true}, {"end", true}, {"strides", true}}));
+  // Get input dims.
+  nvinfer1::Dims dims = inputs.at(0).GetTrtDims();
+  std::vector<int> input_dims(dims.d, dims.d + dims.nbDims);
+  // Add batch dimension so that indexes line up properly.
+  input_dims.insert(input_dims.begin(), inputs.at(0).batch_size());
+  // Get begin and end bounds per axis.
+  std::vector<int> begin = inputs.at(1).weights().ToVector<int>();
+  std::vector<int> end = inputs.at(2).weights().ToVector<int>();
+  std::vector<int> stride = inputs.at(3).weights().ToVector<int>();
+  if (!AllLengthsEqual({input_dims, begin, end, stride})) {
+    return tensorflow::errors::InvalidArgument(
+        "Length of begin, end, and stride arguments must equal rank of input "
+        "for StridedSlice, at ",
+        node_def.name());
+  }
+  // Unsupported mask options.
+  TFAttrs attrs(node_def);
+  for (const string& attr :
+       {"ellipsis_mask", "new_axis_mask", "shrink_axis_mask"}) {
+    int attr_val = attrs.get<int>(attr);
+    if (attr_val != 0) {
+      return tensorflow::errors::Unimplemented(
+          attr, " is not supported for StridedSlice, at ", node_def.name());
+    }
+  }
+  const int begin_mask = attrs.get<int>("begin_mask");
+  const int end_mask = attrs.get<int>("end_mask");
+  // Check that batch dimension is unmodified.
+  const bool begin_is_modified = !(begin_mask & 1) && begin[0] != 0;
+  const bool stride_is_modified = stride[0] != 1;
+  // If the batch size is -1 and the end mask is not set, we can only know if
+  // the batch dimension is unmodified when the batch size is defined. When the
+  // batch size is undefined, we don't convert to be safe.
+  const bool batch_size_is_defined = input_dims[0] > 0;
+  const bool end_is_modified =
+      !(end_mask & 1) && (!batch_size_is_defined ||
+                          (batch_size_is_defined && end[0] != input_dims[0]));
+  if (begin_is_modified || stride_is_modified || end_is_modified) {
+    return tensorflow::errors::Unimplemented(
+        "TensorRT does not allow modifications to the batch dimension, at ",
+        node_def.name());
+  }
+  // Standarize begin and end bounds by applying masks, making negative values
+  // positive, and correcting out of bounds ranges (StridedSlice does this
+  // silently).
+  for (int i = 1; i < input_dims.size(); i++) {
+    // Begin
+    if ((1 << i) & begin_mask) {
+      begin[i] = 0;
+    } else if (begin[i] < 0) {
+      begin[i] += input_dims[i];
+    }
+    begin[i] = std::max(0, std::min(begin[i], input_dims[i]));
+    // End
+    if ((1 << i) & end_mask) {
+      end[i] = input_dims[i];
+    } else if (end[i] < 0) {
+      end[i] += input_dims[i];
+    }
+    end[i] = std::max(0, std::min(end[i], input_dims[i]));
+  }
+  // Negative or zero strides currently not supported.
+  for (int i = 0; i < input_dims.size(); i++) {
+    if (stride[i] <= 0) {
+      return tensorflow::errors::Unimplemented(
+          "Negative or zero stride values are not supported for StridedSlice, "
+          "at ",
+          node_def.name());
+    }
+  }
+  // TRT Slice layer uses (begin, size) instead of (begin, end)
+  std::vector<int> size(input_dims.size());
+  for (int i = 0; i < input_dims.size(); i++) {
+    // Divide by stride (round up)
+    size[i] = (end[i] - begin[i] + stride[i] - 1) / stride[i];
+  }
+  return ConvertStridedSliceHelper(params, inputs.at(0), begin, size, stride);
+}
+
 tensorflow::Status ConvertConv2D(OpConverterParams* params) {
-  return ConvertConv2DHelper(params, ConvolutionType::DEFAULT);
+  return ConvertConv2DHelper(params, 1, /*is_conv2d_backprop_input=*/false);
 }
 
 tensorflow::Status ConvertConv2DDepthwise(OpConverterParams* params) {
-  return ConvertConv2DHelper(params, ConvolutionType::DEPTHWISE_CONV);
+  return ConvertConv2DHelper(params, 0, /*is_conv2d_backprop_input=*/false);
+}
+
+tensorflow::Status ConvertConv2DBackpropInput(OpConverterParams* params) {
+  return ConvertConv2DHelper(params, 1, /*is_conv2d_backprop_input=*/true);
 }
 
 tensorflow::Status ConvertPool(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
-  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
+  TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}}));
+  nvinfer1::PoolingType type;
+  if (node_def.op() == "MaxPool") {
+    type = nvinfer1::PoolingType::kMAX;
+  } else if (node_def.op() == "AvgPool") {
+    type = nvinfer1::PoolingType::kAVERAGE;
+  } else {
+    return tensorflow::errors::Unimplemented(
+        "Unsupported pooling type: ", node_def.op(), ", at ", node_def.name());
+  }
   TFAttrs attrs(node_def);
+  const string padding_type = attrs.get<string>("padding");
+  if ((padding_type != "SAME") && (padding_type != "VALID")) {
+    return tensorflow::errors::Unimplemented(
+        "Unsupported padding type: ", padding_type, ", at ", node_def.name());
+  }
+  if (params->validation_only) return Status::OK();
 
+  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
   int h_index = 2;
   int w_index = 3;
   const auto data_format = attrs.get<string>("data_format");
@@ -2040,16 +2579,6 @@ tensorflow::Status ConvertPool(OpConverterParams* params) {
         const_cast<nvinfer1::ITensor*>(tensor), {0, 3, 1, 2}, &tensor));
   }
 
-  nvinfer1::PoolingType type;
-  if (node_def.op() == "MaxPool") {
-    type = nvinfer1::PoolingType::kMAX;
-  } else if (node_def.op() == "AvgPool") {
-    type = nvinfer1::PoolingType::kAVERAGE;
-  } else {
-    return tensorflow::errors::Unimplemented("Unsupported pool type: ",
-                                             node_def.op());
-  }
-
   const auto tf_stride = attrs.get<std::vector<int>>("strides");
   const nvinfer1::DimsHW stride(tf_stride[h_index], tf_stride[w_index]);
 
@@ -2058,7 +2587,6 @@ tensorflow::Status ConvertPool(OpConverterParams* params) {
 
   auto tensor_dim = tensor->getDimensions();
   std::vector<std::pair<int, int>> padding;
-  const string padding_type = attrs.get<string>("padding");
   if (padding_type == "SAME") {
     // This is NCHW tensor with no batch dimension.
     //  1 -> h
@@ -2068,9 +2596,6 @@ tensorflow::Status ConvertPool(OpConverterParams* params) {
         {static_cast<int>(tensor_dim.d[1]), static_cast<int>(tensor_dim.d[2])});
   } else if (padding_type == "VALID") {
     padding = {{0, 0}, {0, 0}};
-  } else {
-    return tensorflow::errors::Unimplemented("Unsupported padding type: ",
-                                             padding_type);
   }
 
   if (padding[0].first != padding[0].second ||
@@ -2112,7 +2637,9 @@ tensorflow::Status ConvertPool(OpConverterParams* params) {
   return tensorflow::Status::OK();
 }
 
-tensorflow::Status ConvertActivation(OpConverterParams* params) {
+// TODO(tmorris): Use ActivationType::kLEAKY_RELU in TRT 5.1+ once perf
+// improves.
+tensorflow::Status ConvertLeakyRelu(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
   if (inputs.size() != 1) {
@@ -2124,6 +2651,47 @@ tensorflow::Status ConvertActivation(OpConverterParams* params) {
         node_def.op(), " is only implemented for tensors, at ",
         node_def.name());
   }
+  TFAttrs attrs(node_def);
+  const float alpha = attrs.get<float>("alpha");
+  if (alpha < 0.0f || alpha > 1.0f) {
+    return tensorflow::errors::Unimplemented(
+        "Alpha value for LeakyRelu must be between 0 and 1, at ",
+        node_def.name());
+  }
+  if (params->validation_only) return tensorflow::Status::OK();
+
+  // Input Tensor
+  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
+  // Create const for alpha.
+  const nvinfer1::ITensor* const_alpha_tensor = nullptr;
+  TF_RETURN_IF_ERROR(CreateBroadcastableScalarConstant(
+      params, alpha, tensor->getDimensions(), &const_alpha_tensor));
+  // alpha * x
+  nvinfer1::IElementWiseLayer* mul_layer =
+      params->converter->network()->addElementWise(
+          *const_cast<nvinfer1::ITensor*>(tensor),
+          *const_cast<nvinfer1::ITensor*>(const_alpha_tensor),
+          nvinfer1::ElementWiseOperation::kPROD);
+  TFTRT_RETURN_ERROR_IF_NULLPTR(mul_layer, node_def.name());
+  // max(x, alpha * x)
+  nvinfer1::IElementWiseLayer* max_layer =
+      params->converter->network()->addElementWise(
+          *const_cast<nvinfer1::ITensor*>(tensor),
+          *const_cast<nvinfer1::ITensor*>(mul_layer->getOutput(0)),
+          nvinfer1::ElementWiseOperation::kMAX);
+  TFTRT_RETURN_ERROR_IF_NULLPTR(max_layer, node_def.name());
+  nvinfer1::ITensor* output_tensor = max_layer->getOutput(0);
+  params->converter->MarkQuantizationRangesAsInferrable(
+      output_tensor, const_cast<nvinfer1::ITensor*>(mul_layer->getOutput(0)));
+
+  params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  return Status::OK();
+}
+
+tensorflow::Status ConvertActivation(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
+  TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}}));
   static const std::unordered_map<string, nvinfer1::ActivationType> ops{
       {"Relu", nvinfer1::ActivationType::kRELU},
       {"Sigmoid", nvinfer1::ActivationType::kSIGMOID},
@@ -2157,19 +2725,19 @@ tensorflow::Status ConvertActivation(OpConverterParams* params) {
 Status ConvertQuantize(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
-  if ((inputs.size() == 0) ||
-      (node_def.op() == "FakeQuantWithMinMaxArgs" && inputs.size() != 1) ||
-      (node_def.op() == "FakeQuantWithMinMaxVars" && inputs.size() != 3) ||
-      (node_def.op() == "QuantizeAndDequantizeV2" && inputs.size() != 3) ||
-      (node_def.op() == "QuantizeAndDequantizeV3" && inputs.size() != 4)) {
-    return errors::InvalidArgument("Invalid number of inputs for ",
-                                   node_def.op(), ", at ", node_def.name());
-  }
-  if (inputs.at(0).is_weights()) {
-    // TensorRT will automatically quantize weights, so we will ignore ranges
-    // for weights.
-    params->outputs->push_back(inputs.at(0));
-    return Status::OK();
+  if (node_def.op() == "FakeQuantWithMinMaxArgs") {
+    TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}}));
+  } else if (node_def.op() == "FakeQuantWithMinMaxVars") {
+    TF_RETURN_IF_ERROR(CheckInputsWeights(
+        *params, {{"input", false}, {"min", true}, {"max", true}}));
+  } else if (node_def.op() == "QuantizeAndDequantizeV2") {
+    TF_RETURN_IF_ERROR(CheckInputsWeights(
+        *params, {{"input", false}, {"input_min", true}, {"input_max", true}}));
+  } else if (node_def.op() == "QuantizeAndDequantizeV3") {
+    TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false},
+                                                    {"input_min", true},
+                                                    {"input_max", true},
+                                                    {"num_bits", true}}));
   }
   float min_range = 0.0f;
   float max_range = 0.0f;
@@ -2186,11 +2754,6 @@ Status ConvertQuantize(OpConverterParams* params) {
              node_def.op() == "QuantizeAndDequantizeV2" ||
              node_def.op() == "QuantizeAndDequantizeV3") {
     // Get ranges via inputs.
-    if (!inputs.at(1).is_weights() || !inputs.at(2).is_weights()) {
-      return errors::InvalidArgument("Min and max inputs for ", node_def.op(),
-                                     " must be weights not tensors, at ",
-                                     node_def.name());
-    }
     auto get_weights_value = [&inputs](int index) {
       auto raw_weights = static_cast<float*>(
           const_cast<void*>(inputs.at(index).weights().GetValues()));
@@ -2221,20 +2784,11 @@ Status ConvertQuantize(OpConverterParams* params) {
   return Status::OK();
 }
 
-// TODO(pdavoodi): we should update relu6 implementation once TensorRT supports
-// Relu6 natively.
+// TODO(tmorris): Use ActivationType::kCLIP in TRT 5.1+ once perf improves.
 tensorflow::Status ConvertRelu6(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
-  if (inputs.size() != 1) {
-    return tensorflow::errors::InvalidArgument(
-        "Invalid number of inputs for Relu6, at ", node_def.name());
-  }
-  if (inputs.at(0).is_weights()) {
-    return tensorflow::errors::Unimplemented(
-        "Relu6 is only implemented for tensors, not weights, at ",
-        node_def.name());
-  }
+  TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"input", false}}));
   if (params->validation_only) return Status::OK();
   // ***************************************************************************
   // TensorRT does not implement Relu6 natively. This function converts Relu6 op
@@ -2258,35 +2812,18 @@ tensorflow::Status ConvertRelu6(OpConverterParams* params) {
   params->converter->ProvideQuantizationRange(relu_layer->getOutput(0), 0.0f,
                                               6.0f);
 
-  // Create a constant layer to store the floating point weight i.e. 6.0f This
-  // tensor will be broadcasted uniformly during elementwise `min` operation.
-  // The constant has to have the same rank as the input in order for TRT to
-  // broadcast
-  nvinfer1::Dims dims;
-  dims.nbDims = relu_layer->getOutput(0)->getDimensions().nbDims;
-  for (int i = 0; i < dims.nbDims; i++) {
-    dims.d[i] = 1;
-  }
-  TRT_ShapedWeights weights = params->weight_store->GetTempWeights(
-      tensorflow::DataType::DT_FLOAT, dims);
-  auto weights_ptr =
-      static_cast<float*>(const_cast<void*>(weights.GetValues()));
-  weights_ptr[0] = 6.0f;
-  nvinfer1::IConstantLayer* const6_layer =
-      params->converter->network()->addConstant(dims, weights.GetTrtWeights());
-  TFTRT_RETURN_ERROR_IF_NULLPTR(const6_layer, node_def.name());
-  params->converter->ProvideQuantizationRange(const6_layer->getOutput(0), 0.0f,
-                                              6.0f);
+  // Create a constant layer to store the floating point weight i.e. 6.0f
+  const nvinfer1::ITensor* const6_tensor = nullptr;
+  TF_RETURN_IF_ERROR(CreateBroadcastableScalarConstant(
+      params, 6.0f, relu_layer->getOutput(0)->getDimensions(), &const6_tensor));
 
   // ElementWise Min Operation
   // Min op is a nop for INT8 execution path, as the input tensor
   // to this layer will only have values in range [0.f, 6.0f].
-  const nvinfer1::ITensor* tensor_l = relu_layer->getOutput(0);
-  const nvinfer1::ITensor* tensor_r = const6_layer->getOutput(0);
   nvinfer1::IElementWiseLayer* relu6_layer =
       params->converter->network()->addElementWise(
-          *const_cast<nvinfer1::ITensor*>(tensor_l),
-          *const_cast<nvinfer1::ITensor*>(tensor_r),
+          *const_cast<nvinfer1::ITensor*>(relu_layer->getOutput(0)),
+          *const_cast<nvinfer1::ITensor*>(const6_tensor),
           nvinfer1::ElementWiseOperation::kMIN);
   TFTRT_RETURN_ERROR_IF_NULLPTR(relu6_layer, node_def.name());
   nvinfer1::ITensor* output_tensor = relu6_layer->getOutput(0);
@@ -2299,17 +2836,20 @@ tensorflow::Status ConvertRelu6(OpConverterParams* params) {
 tensorflow::Status ConvertBiasAdd(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
-  if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
-      !inputs.at(1).is_weights()) {
-    return errors::InvalidArgument("Input expects tensor and weights, at ",
-                                   node_def.name());
+  TF_RETURN_IF_ERROR(
+      CheckInputsWeights(*params, {{"value", false}, {"bias", true}}));
+  TFAttrs attrs(node_def);
+  tensorflow::DataType tf_dtype = attrs.get<tensorflow::DataType>("T");
+  if (tf_dtype != DataType::DT_FLOAT && tf_dtype != DataType::DT_HALF) {
+    return errors::Unimplemented("Data type is not supported, for node ",
+                                 node_def.name(), " got ",
+                                 DataTypeString(tf_dtype));
   }
   if (params->validation_only) return Status::OK();
 
   nvinfer1::ITensor* tensor =
       const_cast<nvinfer1::ITensor*>(inputs.at(0).tensor());
   const nvinfer1::Dims original_dims = tensor->getDimensions();
-  TFAttrs attrs(node_def);
   const string data_format = attrs.get<string>("data_format");
   const int channel_index =
       (data_format == "NHWC" ? original_dims.nbDims - 1 : 0);
@@ -2355,7 +2895,7 @@ tensorflow::Status ConvertBiasAdd(OpConverterParams* params) {
   }
 
   TRT_ShapedWeights weights = inputs.at(1).weights();
-  if (params->converter->precision_mode() == FP16MODE) {
+  if (params->converter->precision_mode() == TrtPrecisionMode::FP16) {
     weights = ConvertFP32ToFP16(params->weight_store, weights);
   }
   nvinfer1::ScaleMode mode = nvinfer1::ScaleMode::kCHANNEL;
@@ -2399,43 +2939,69 @@ tensorflow::Status ConvertBiasAdd(OpConverterParams* params) {
   return Status::OK();
 }
 
-Status GetTensorDimsWithProtoShape(const Tensor& tensor,
-                                   int tensor_proto_array_len,
-                                   nvinfer1::Dims* dims) {
+void GetTensorDimsWithProtoShape(const Tensor& tensor, nvinfer1::Dims* dims) {
   if (tensor.dims() > 0) {
     *dims = GetTrtDimsForTensor(tensor);
-    if (TrtDimsNumElements(*dims) != tensor_proto_array_len &&
-        tensor_proto_array_len != 1) {
-      return errors::InvalidArgument(
-          "Broadcast on weights only supports kCHANNEL and kUNIFORM");
-    }
   } else {
     dims->nbDims = 1;
     // No dimension provided. Flatten it.
-    dims->d[0] = tensor_proto_array_len;
+    dims->d[0] = tensor.NumElements();
     dims->type[0] = nvinfer1::DimensionType::kSPATIAL;
     for (int i = 1; i < nvinfer1::Dims::MAX_DIMS; ++i) {
       dims->d[i] = 0;
     }
   }
-  return Status::OK();
 }
 
-template <typename CType>
-Status TfTensorToTrtWeights(const DataType dtype, const Tensor& tensor,
-                            const CType* tensor_proto_array,
-                            int tensor_proto_array_len, TrtWeightStore* store,
+Status TfTensorToTrtWeights(const Tensor& tensor, TrtWeightStore* weight_store,
                             TRT_ShapedWeights* weights) {
+  const DataType dtype = tensor.dtype();
+
+  // We always convert the integer constants to INT32, since TRT INT8 is for
+  // quantized inference.
+  //
+  // TODO(aaroey): FP16 will remain in half format and is not converted to
+  // FP32, but the converter currently uses all float weights as FP32. Fix
+  // this.
+  const DataType converted_dtype =
+      (dtype == DT_INT16 || dtype == DT_INT8 || dtype == DT_UINT8 ? DT_INT32
+                                                                  : dtype);
+
+  // Verify that the dtype is supported by TensorRT. Otherwise, return an error.
+  nvinfer1::DataType trt_dtype;
+  TF_RETURN_IF_ERROR(ConvertDType(converted_dtype, &trt_dtype));
+
+  if (tensor.NumElements() == 0) {
+    // Return empty weights having converted dtype.
+    *weights = TRT_ShapedWeights(converted_dtype);
+    return Status::OK();
+  }
+
   nvinfer1::Dims weight_dims;
-  TF_RETURN_IF_ERROR(GetTensorDimsWithProtoShape(tensor, tensor_proto_array_len,
-                                                 &weight_dims));
-  *weights = store->GetTempWeights(dtype, weight_dims);
-  void* dst = const_cast<void*>(weights->GetValues());
-  if (tensor_proto_array_len == 1) {
-    std::fill_n((CType*)dst, TrtDimsNumElements(weight_dims),
-                *tensor_proto_array);
+  GetTensorDimsWithProtoShape(tensor, &weight_dims);
+  *weights = weight_store->GetTempWeights(converted_dtype, weight_dims);
+
+  // Copy the tensor directly if the tensor does not require cast to the
+  // supported type.
+  if (converted_dtype == dtype) {
+    char* dst = static_cast<char*>(const_cast<void*>(weights->GetValues()));
+    memcpy(dst, tensor.tensor_data().data(), tensor.TotalBytes());
+    return Status::OK();
+  }
+
+  // Copy tensor elements after casting them to the converted DataType.
+  int32* dst = static_cast<int32*>(const_cast<void*>(weights->GetValues()));
+  if (dtype == DT_INT16) {
+    const int16* src = tensor.flat<int16>().data();
+    std::copy(src, src + tensor.NumElements(), dst);
+  } else if (dtype == DT_INT8) {
+    const int8* src = tensor.flat<int8>().data();
+    std::copy(src, src + tensor.NumElements(), dst);
   } else {
-    memcpy(dst, tensor_proto_array, weights->size_bytes());
+    // dtype can only be DT_UINT8 at this point.
+    TFTRT_CHECK_EQ_TYPE(dtype, DT_UINT8);
+    const uint8* src = tensor.flat<uint8>().data();
+    std::copy(src, src + tensor.NumElements(), dst);
   }
   return Status::OK();
 }
@@ -2453,15 +3019,6 @@ tensorflow::Status ConvertConst(OpConverterParams* params) {
         "Constant node is expected to have empty input list: ",
         node_def.name());
   }
-  TFAttrs attrs(node_def);
-  const DataType dtype = attrs.get<tensorflow::DataType>("dtype");
-  // We always convert the integer constants to kINT32, since TRT kINT8 is for
-  // quantized inference.
-  const DataType converted_dtype =
-      (dtype == DT_INT16 || dtype == DT_INT8 || dtype == DT_UINT8 ? DT_INT32
-                                                                  : dtype);
-  nvinfer1::DataType trt_dtype;
-  TF_RETURN_IF_ERROR(ConvertDType(converted_dtype, &trt_dtype));
 
   // Create shaped weights as output
   const auto& tensor_proto = node_def.attr().at("value").tensor();
@@ -2471,78 +3028,18 @@ tensorflow::Status ConvertConst(OpConverterParams* params) {
                                         node_def.name());
   }
 
-  TRT_ShapedWeights weights(converted_dtype);
-  if (tensor.NumElements() == 0) {
-    // Do nothing.
-  } else if (!tensor_proto.float_val().empty()) {
-    TF_RETURN_IF_ERROR(TfTensorToTrtWeights(
-        converted_dtype, tensor, tensor_proto.float_val().begin(),
-        tensor_proto.float_val_size(), params->weight_store, &weights));
-  } else if (!tensor_proto.int_val().empty()) {
-    TF_RETURN_IF_ERROR(TfTensorToTrtWeights(
-        converted_dtype, tensor, tensor_proto.int_val().begin(),
-        tensor_proto.int_val_size(), params->weight_store, &weights));
-  } else if (!tensor_proto.half_val().empty()) {
-    // TODO(aaroey): implement fp16 conversion.
-    return errors::Unimplemented("fp16 constant is not supported yet.");
-  } else if (!tensor_proto.tensor_content().empty()) {
-    // TODO(aaroey): fp16 will remain in half format and is not converted to
-    // fp32, but the converter currently uses all float weights as fp32. Fix
-    // this.
-    const auto& content = tensor_proto.tensor_content();
-    if (content.size() > 0) {
-      const int dtype_size = tensorflow::DataTypeSize(dtype);
-      if (content.size() % dtype_size != 0) {
-        return errors::FailedPrecondition("Tensor content size ",
-                                          content.size(),
-                                          " is not a multiple of ", dtype_size);
-      }
-      nvinfer1::Dims weights_dim;
-      TF_RETURN_IF_ERROR(GetTensorDimsWithProtoShape(
-          tensor, content.size() / dtype_size, &weights_dim));
-      const int64_t size_bytes = TrtDimsNumElements(weights_dim) * dtype_size;
-      if (content.size() != size_bytes) {
-        return errors::FailedPrecondition(
-            "Tensor size and TensorProto content size mismatch: ", size_bytes,
-            " vs ", content.size());
-      } else if (tensor.NumElements() != content.size() / dtype_size) {
-        return errors::FailedPrecondition(
-            "Tensor elements count and TensorProto content size mismatch: ",
-            tensor.NumElements(), " vs ", content.size() / dtype_size);
-      }
-      weights =
-          params->weight_store->GetTempWeights(converted_dtype, weights_dim);
-      if (dtype_size == tensorflow::DataTypeSize(converted_dtype)) {
-        port::CopyToArray(content, static_cast<char*>(
-                                       const_cast<void*>(weights.GetValues())));
-      } else {
-        // Copy out the weights as original data type.
-        std::vector<uint8_t> temp_weights(content.size());
-        port::CopyToArray(content,
-                          reinterpret_cast<char*>(temp_weights.data()));
-        int32* dst =
-            static_cast<int32*>(const_cast<void*>(weights.GetValues()));
-        // Copy to the weight store as converted data type.
-        if (dtype == DT_INT16) {
-          int16* data = reinterpret_cast<int16*>(temp_weights.data());
-          std::copy(data, data + tensor.NumElements(), dst);
-        } else if (dtype == DT_INT8) {
-          int8* data = reinterpret_cast<int8*>(temp_weights.data());
-          std::copy(data, data + tensor.NumElements(), dst);
-        } else if (dtype == DT_UINT8) {
-          uint8* data = reinterpret_cast<uint8*>(temp_weights.data());
-          std::copy(data, data + tensor.NumElements(), dst);
-        } else {
-          return errors::FailedPrecondition(
-              "Unexpected data type: ", DataTypeString(dtype),
-              " at: ", node_def.name());
-        }
-      }
-    }
-  } else {
-    return errors::Unimplemented("Not supported constant type, at ",
-                                 node_def.name());
+  TFAttrs attrs(node_def);
+  const DataType dtype = attrs.get<tensorflow::DataType>("dtype");
+  if (dtype != tensor.dtype()) {
+    return errors::InvalidArgument("DataType mismatch between attr (",
+                                   DataTypeString(dtype), ") and tensor (",
+                                   DataTypeString(tensor.dtype()), ")");
   }
+
+  TRT_ShapedWeights weights;
+  TF_RETURN_IF_ERROR(
+      TfTensorToTrtWeights(tensor, params->weight_store, &weights));
+
   if (params->outputs != nullptr) {
     params->outputs->push_back(TRT_TensorOrWeights(weights));
   }
@@ -2560,9 +3057,13 @@ tensorflow::Status ConvertIdentity(OpConverterParams* params) {
 Status ConvertBinary(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
+  // TODO(tmorris): Enable once false is updated to mean either tensor or weight
+  // TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}, {"y",
+  // false}}));
   if (inputs.size() != 2) {
-    return errors::InvalidArgument("Binary ops require two inputs, at ",
-                                   node_def.name());
+    return tensorflow::errors::InvalidArgument(
+        node_def.op(), " got ", inputs.size(), " inputs but expected 2, at ",
+        node_def.name());
   }
 
   // Constant folding should have been done by TensorFlow
@@ -2601,62 +3102,104 @@ Status ConvertBinary(OpConverterParams* params) {
   return status;
 }
 
-tensorflow::Status ConvertUnary(OpConverterParams* params) {
+tensorflow::Status ConvertRsqrt(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
-  static const std::unordered_map<string, nvinfer1::UnaryOperation> ops{
-      {"Neg", nvinfer1::UnaryOperation::kNEG},
-      {"Exp", nvinfer1::UnaryOperation::kEXP},
-      {"Log", nvinfer1::UnaryOperation::kLOG},
-      {"Sqrt", nvinfer1::UnaryOperation::kSQRT},
-      {"Abs", nvinfer1::UnaryOperation::kABS},
-      {"Reciprocal", nvinfer1::UnaryOperation::kRECIP},
-  };
+  TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}}));
+  if (params->validation_only) return tensorflow::Status::OK();
 
-  if (inputs.size() != 1) {
-    return tensorflow::errors::FailedPrecondition(
-        "Unary ops require single tensor input, at ", node_def.name());
+  // TODO(tmorris): params->converter is null during validation. Allow
+  // precision_mode and use_calibration to be accessed during validation and
+  // include this check in validation.
+  // We will need a quantization range for intermediate tensor if not using
+  // calibration.
+  //
+  //   x -> [Sqrt] -> sqrt(x) -> [Recip] -> 1/sqrt(x)
+  //                     ^
+  //               need range here
+  if (params->converter->precision_mode() == TrtPrecisionMode::INT8 &&
+      !params->converter->use_calibration()) {
+    return errors::Unimplemented(
+        "Intermediate quantization range cannot be determined without"
+        " calibration for Rsqrt, consider replacing with "
+        "Sqrt -> FakeQuant -> Reciprocal ops, at ",
+        node_def.name());
   }
+  // Start conversion.
+  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
+  // Sqrt
+  nvinfer1::IUnaryLayer* sqrt_layer = params->converter->network()->addUnary(
+      *const_cast<nvinfer1::ITensor*>(tensor), nvinfer1::UnaryOperation::kSQRT);
+  TFTRT_RETURN_ERROR_IF_NULLPTR(sqrt_layer, node_def.name());
+  // Recip
+  nvinfer1::IUnaryLayer* recip_layer = params->converter->network()->addUnary(
+      *sqrt_layer->getOutput(0), nvinfer1::UnaryOperation::kRECIP);
+  TFTRT_RETURN_ERROR_IF_NULLPTR(recip_layer, node_def.name());
+  params->outputs->push_back(TRT_TensorOrWeights(recip_layer->getOutput(0)));
+  return tensorflow::Status::OK();
+}
 
-  // TODO(jie): check type
-  const nvinfer1::ITensor* tensor = nullptr;
-  TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape(
-      inputs.at(0), inputs.at(0).GetTrtDims(), &tensor));
+const std::unordered_map<string, nvinfer1::UnaryOperation>*
+UnaryOperationMap() {
+  static auto* const m =
+      new std::unordered_map<string, nvinfer1::UnaryOperation>({
+        {"Neg", nvinfer1::UnaryOperation::kNEG},
+            {"Exp", nvinfer1::UnaryOperation::kEXP},
+            {"Log", nvinfer1::UnaryOperation::kLOG},
+            {"Sqrt", nvinfer1::UnaryOperation::kSQRT},
+            {"Abs", nvinfer1::UnaryOperation::kABS},
+            {"Reciprocal", nvinfer1::UnaryOperation::kRECIP},
+#if NV_TENSORRT_MAJOR > 5 || (NV_TENSORRT_MAJOR == 5 && NV_TENSORRT_MINOR >= 1)
+            {"Sin", nvinfer1::UnaryOperation::kSIN},
+            {"Cos", nvinfer1::UnaryOperation::kCOS},
+            {"Tan", nvinfer1::UnaryOperation::kTAN},
+            {"Sinh", nvinfer1::UnaryOperation::kSINH},
+            {"Cosh", nvinfer1::UnaryOperation::kCOSH},
+            {"Asin", nvinfer1::UnaryOperation::kASIN},
+            {"Acos", nvinfer1::UnaryOperation::kACOS},
+            {"Atan", nvinfer1::UnaryOperation::kATAN},
+            {"Asinh", nvinfer1::UnaryOperation::kASINH},
+            {"Acosh", nvinfer1::UnaryOperation::kACOSH},
+            {"Atanh", nvinfer1::UnaryOperation::kATANH},
+            {"Ceil", nvinfer1::UnaryOperation::kCEIL},
+            {"Floor", nvinfer1::UnaryOperation::kFLOOR},
+#endif
+      });
+  return m;
+}
 
-  nvinfer1::IUnaryLayer* layer;
-  if (node_def.op() == "Rsqrt") {
-    // We will need a quantization range for intermediate tensor if not using
-    // calibration.
-    //
-    //   x -> [Sqrt] -> sqrt(x) -> [Recip] -> 1/sqrt(x)
-    //                     ^
-    //               need range here
-    if (params->converter->precision_mode() == INT8MODE &&
-        !params->converter->use_calibration()) {
-      return errors::Unimplemented(
-          "Intermediate quantization range cannot be determined without"
-          " calibration for Rsqrt, consider replacing with "
-          "Sqrt -> FakeQuant -> Reciprocal ops, at ",
-          node_def.name());
-    }
-    layer = params->converter->network()->addUnary(
-        *const_cast<nvinfer1::ITensor*>(tensor),
-        nvinfer1::UnaryOperation::kSQRT);
-    TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
-    tensor = layer->getOutput(0);
-    layer = params->converter->network()->addUnary(
-        *const_cast<nvinfer1::ITensor*>(tensor),
-        nvinfer1::UnaryOperation::kRECIP);
-  } else if (ops.count(node_def.op()) != 0) {
-    layer = params->converter->network()->addUnary(
-        *const_cast<nvinfer1::ITensor*>(tensor), ops.at(node_def.op()));
-  } else {
-    return tensorflow::errors::InvalidArgument(
-        "Binary op: ", node_def.op(), " not supported, at ", node_def.name());
+tensorflow::Status ConvertUnary(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
+  TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}}));
+  auto op_pair = UnaryOperationMap()->find(node_def.op());
+  if (op_pair == UnaryOperationMap()->end()) {
+    return tensorflow::errors::Unimplemented(
+        "Unary op: ", node_def.op(), " not supported at: ", node_def.name());
   }
+  if (params->validation_only) return tensorflow::Status::OK();
 
+  // Start conversion.
+  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
+  nvinfer1::IUnaryLayer* layer = params->converter->network()->addUnary(
+      *const_cast<nvinfer1::ITensor*>(tensor), op_pair->second);
   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
+
+  // Set quantization ranges.
+  if (node_def.op() == "Sin" || node_def.op() == "Cos") {
+    params->converter->ProvideQuantizationRange(output_tensor, -1.0f, 1.0f);
+  } else if (node_def.op() == "Asin" || node_def.op() == "Atan") {
+    params->converter->ProvideQuantizationRange(output_tensor, -M_PI_2, M_PI_2);
+  } else if (node_def.op() == "Acos") {
+    params->converter->ProvideQuantizationRange(output_tensor, 0.0f, M_PI);
+  } else if (node_def.op() == "Neg" || node_def.op() == "Abs") {
+    // Neg and Abs will have same range as input since TRT uses symmetric
+    // quantization.
+    // TODO(tmorris): Should we infer ranges for Ceil and Floor as well?
+    params->converter->MarkQuantizationRangesAsInferrable(
+        const_cast<nvinfer1::ITensor*>(tensor), output_tensor);
+  }
   params->outputs->push_back(
       TRT_TensorOrWeights(const_cast<nvinfer1::ITensor*>(output_tensor)));
   return tensorflow::Status::OK();
@@ -2665,14 +3208,7 @@ tensorflow::Status ConvertUnary(OpConverterParams* params) {
 tensorflow::Status ConvertSquare(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
-  if (inputs.size() != 1) {
-    return tensorflow::errors::InvalidArgument("Square expects one input, at ",
-                                               node_def.name());
-  }
-  if (inputs.at(0).is_weights()) {
-    return tensorflow::errors::Unimplemented(
-        "Square is only implemented for tensors, at ", node_def.name());
-  }
+  TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}}));
   if (params->validation_only) return Status::OK();
 
   // Constant 2 with same rank as input
@@ -2685,18 +3221,15 @@ tensorflow::Status ConvertSquare(OpConverterParams* params) {
   auto weights_ptr =
       static_cast<float*>(const_cast<void*>(weights.GetValues()));
   weights_ptr[0] = 2.f;
-  nvinfer1::IConstantLayer* const2_layer =
-      params->converter->network()->addConstant(dims, weights.GetTrtWeights());
-  TFTRT_RETURN_ERROR_IF_NULLPTR(const2_layer, node_def.name());
+  nvinfer1::ITensor* const2_tensor =
+      params->converter->CreateConstantLayer(weights, dims);
+  TFTRT_RETURN_ERROR_IF_NULLPTR(const2_tensor, node_def.name());
 
   // ElementWise Pow Operation
-  const nvinfer1::ITensor* tensor_l = inputs.at(0).tensor();
-  const nvinfer1::ITensor* tensor_r = const2_layer->getOutput(0);
   nvinfer1::IElementWiseLayer* layer =
       params->converter->network()->addElementWise(
-          *const_cast<nvinfer1::ITensor*>(tensor_l),
-          *const_cast<nvinfer1::ITensor*>(tensor_r),
-          nvinfer1::ElementWiseOperation::kPOW);
+          *const_cast<nvinfer1::ITensor*>(inputs.at(0).tensor()),
+          *const2_tensor, nvinfer1::ElementWiseOperation::kPOW);
   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
 
@@ -2707,11 +3240,8 @@ tensorflow::Status ConvertSquare(OpConverterParams* params) {
 tensorflow::Status ConvertReduce(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
-  if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
-      !inputs.at(1).is_weights()) {
-    return tensorflow::errors::InvalidArgument(
-        "Input expects tensor and weights, at", node_def.name());
-  }
+  TF_RETURN_IF_ERROR(
+      CheckInputsWeights(*params, {{"input", false}, {"axis", true}}));
 
   const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
   TRT_ShapedWeights index_list = inputs.at(1).weights();
@@ -2772,12 +3302,8 @@ tensorflow::Status ConvertReduce(OpConverterParams* params) {
 tensorflow::Status ConvertPad(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
-  // TODO(aaroey): make a routine for this check and reuse it.
-  if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
-      !inputs.at(1).is_weights()) {
-    return tensorflow::errors::InvalidArgument(
-        "Input expects tensor and weights, at", node_def.name());
-  }
+  TF_RETURN_IF_ERROR(
+      CheckInputsWeights(*params, {{"tensor", false}, {"paddings", true}}));
 
   // Implement tensor binaryOp weight [channel wise] for now;
   const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
@@ -2814,7 +3340,7 @@ tensorflow::Status ConvertPad(OpConverterParams* params) {
   }
 
   // No padding at all, we should exit
-  if (pad_index.size() == 0) {
+  if (pad_index.empty()) {
     params->outputs->push_back(inputs.at(0));
     return tensorflow::Status::OK();
   }
@@ -2837,6 +3363,7 @@ tensorflow::Status ConvertPad(OpConverterParams* params) {
     return tensorflow::errors::Unimplemented(
         "Padding layer does not support padding on dimension 1 and 3 yet");
   }
+  if (params->validation_only) return Status::OK();
 
   bool legit_pad = true;
   nvinfer1::DimsHW pre_padding(0, 0);
@@ -2940,6 +3467,7 @@ tensorflow::Status ConvertConcat(OpConverterParams* params) {
 
     inputs_vec.push_back(tensor_i);
   }
+  if (params->validation_only) return tensorflow::Status::OK();
 
   // nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
   nvinfer1::IConcatenationLayer* layer =
@@ -2956,17 +3484,30 @@ tensorflow::Status ConvertConcat(OpConverterParams* params) {
 tensorflow::Status ConvertFusedBatchNorm(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
+  TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false},
+                                                  {"scale", true},
+                                                  {"offset", true},
+                                                  {"mean", true},
+                                                  {"variance", true}}));
   TFAttrs attrs(node_def);
   float epsilon = attrs.get<float>("epsilon");
   auto data_format = attrs.get<string>("data_format");
   if (data_format != "NCHW") {
     return tensorflow::errors::Unimplemented(
-        "only data_format=NCHW is supported, at " + node_def.name());
+        node_def.op(), " only supports data_format=NCHW, at ", node_def.name());
   }
   bool is_training = attrs.get<bool>("is_training");
   if (is_training) {
+    // Trying to use batchnorm in training mode is a very common problem.
+    // Because the error message will only be printed in VLOG(1) by the
+    // segmenter, we issue a special warning so that users will actually see it.
+    LOG(WARNING) << node_def.op() << " only supports is_training=false. If you "
+                 << "are using Keras, please call "
+                 << "keras.backend.set_learning_phase(0) before constructing "
+                 << "your model. At " << node_def.name();
     return tensorflow::errors::Unimplemented(
-        "only is_training=false is supported, at " + node_def.name());
+        node_def.op(), " only supports is_training=false, at ",
+        node_def.name());
   }
   nvinfer1::ITensor const* tensor = inputs.at(0).tensor();
 
@@ -2981,7 +3522,7 @@ tensorflow::Status ConvertFusedBatchNorm(OpConverterParams* params) {
   for (int i = 1; i < 5; i++) {
     if (inputs.at(i).weights().type_ != parameter_type) {
       return tensorflow::errors::Unimplemented(
-          "Inconsistent parameter type for batchnormis not supported, at: " +
+          "Inconsistent parameter type for batchnorm is not supported, at: " +
           node_def.name());
     }
   }
@@ -2989,7 +3530,7 @@ tensorflow::Status ConvertFusedBatchNorm(OpConverterParams* params) {
   TRT_ShapedWeights dummy_power_weights(parameter_type);
   size_t nweight = 0;
   for (int i = 1; i < 5; i++) {
-    nweight = std::max(nweight, (size_t)inputs.at(i).weights().count());
+    nweight = std::max<size_t>(nweight, inputs.at(i).weights().count());
   }
   TRT_ShapedWeights* ptr_shape_weights = nullptr;
   for (int i = 1; i < 5; i++) {
@@ -3001,6 +3542,8 @@ tensorflow::Status ConvertFusedBatchNorm(OpConverterParams* params) {
           "Inconsistent batchnorm parameter count, at: " + node_def.name());
     }
   }
+  if (params->validation_only) return Status::OK();
+
   //  We could technically have two weights with different shape.
   //  that requires two addScale op, arguably less performant
   TRT_ShapedWeights combined_scale_weights =
@@ -3072,6 +3615,29 @@ tensorflow::Status ConvertFusedBatchNorm(OpConverterParams* params) {
   return tensorflow::Status::OK();
 }
 
+tensorflow::Status ConvertGather(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
+  TF_RETURN_IF_ERROR(CheckInputsWeights(
+      *params, {{"params", false}, {"indices", false}, {"axis", true}}));
+  absl::Span<const int> axis = inputs.at(2).weights().GetSpan<int>();
+  if (axis.size() != 1) {
+    return tensorflow::errors::InvalidArgument(
+        "Axis for GatherV2 must be a scalar, at ", node_def.name());
+  }
+  int trt_axis = 0;
+  TF_RETURN_IF_ERROR(ConvertAxis(axis[0], inputs.at(0).GetTrtDims().nbDims,
+                                 node_def.name(), &trt_axis));
+  if (params->validation_only) return Status::OK();
+
+  nvinfer1::IGatherLayer* layer = params->converter->network()->addGather(
+      *const_cast<nvinfer1::ITensor*>(inputs.at(0).tensor()),
+      *const_cast<nvinfer1::ITensor*>(inputs.at(1).tensor()), trt_axis);
+  TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
+  params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0)));
+  return Status::OK();
+}
+
 tensorflow::Status ConvertMatMulHelper(OpConverterParams* params,
                                        TRT_TensorOrWeights tensor_input,
                                        TRT_ShapedWeights weights_raw,
@@ -3122,14 +3688,9 @@ tensorflow::Status ConvertMatMulHelper(OpConverterParams* params,
 tensorflow::Status ConvertMatMul(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
-  if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
-      !inputs.at(1).is_weights()) {
-    return errors::InvalidArgument("Input expects tensor and weights, at ",
-                                   node_def.name());
-  }
+  TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"a", false}, {"b", true}}));
 
   TFAttrs attrs(node_def);
-  // TODO(jie): INT32 should be converted?
   tensorflow::DataType tf_dtype = attrs.get<tensorflow::DataType>("T");
   if (tf_dtype != DataType::DT_FLOAT && tf_dtype != DataType::DT_HALF) {
     return errors::Unimplemented("Data type is not supported, for node ",
@@ -3153,9 +3714,16 @@ tensorflow::Status ConvertMatMul(OpConverterParams* params) {
 tensorflow::Status ConvertBatchMatMul(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
+  // TODO(tmorris): Enable once false is updated to mean either tensor or weight
+  // TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}, {"y",
+  // false}}));
+  if (inputs.size() != 2) {
+    return tensorflow::errors::InvalidArgument(
+        node_def.op(), " got ", inputs.size(), " inputs but expected 2, at ",
+        node_def.name());
+  }
   TFAttrs attrs(node_def);
 
-  // TODO(jie): INT32 should be converted?
   tensorflow::DataType tf_dtype = attrs.get<tensorflow::DataType>("T");
   if (tf_dtype != tensorflow::DataType::DT_FLOAT &&
       tf_dtype != tensorflow::DataType::DT_HALF) {
@@ -3225,6 +3793,7 @@ tensorflow::Status ConvertBatchMatMul(OpConverterParams* params) {
 tensorflow::Status ConvertSoftmax(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
+  TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"logits", false}}));
   const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
 
   int nbDims = tensor->getDimensions().nbDims;
@@ -3233,6 +3802,8 @@ tensorflow::Status ConvertSoftmax(OpConverterParams* params) {
         "TensorRT Softmax cannot apply on batch dimension, at" +
         node_def.name());
   }
+  if (params->validation_only) return Status::OK();
+
   nvinfer1::ISoftMaxLayer* layer = params->converter->network()->addSoftMax(
       *const_cast<nvinfer1::ITensor*>(tensor));
   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
@@ -3248,31 +3819,36 @@ tensorflow::Status ConvertSoftmax(OpConverterParams* params) {
 
 tensorflow::Status ConvertTopK(OpConverterParams* params) {
   const auto& inputs = params->inputs;
+  if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
+      !inputs.at(1).is_weights()) {
+    return errors::InvalidArgument("Input expects tensor and weights, at ",
+                                   params->node_def.name());
+  }
+
   const auto& node_def = params->node_def;
+  TF_RETURN_IF_ERROR(
+      CheckInputsWeights(*params, {{"input", false}, {"k", true}}));
   const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
-
-  int nbDims = tensor->getDimensions().nbDims;
-  if (nbDims == 0) {
-    return tensorflow::errors::InvalidArgument(
-        "TensorRT TopK cannot apply on batch dimension, at" + node_def.name());
+  const int num_dims = tensor->getDimensions().nbDims;
+  if (num_dims == 0) {
+    return errors::InvalidArgument(
+        "TensorRT TopK cannot apply on batch dimension, at", node_def.name());
   }
 
   TRT_ShapedWeights k_w = inputs.at(1).weights();
-  int k = *(static_cast<int*>(const_cast<void*>(k_w.GetValues())));
-
-  nvinfer1::TopKOperation op;
-  uint32_t reducedAxes = 0;
-  if (node_def.op() == "TopKV2") {
-    op = nvinfer1::TopKOperation::kMAX;
-    reducedAxes |= 1 << (nbDims - 1);
-  } else {
-    return tensorflow::errors::Unimplemented(
-        "Operation: " + node_def.op() +
-        " not implemented, at: " + node_def.name());
+  if (k_w.count() != 1) {
+    return errors::InvalidArgument("k value of TopK should be a scalar, at",
+                                   node_def.name());
   }
+  // Note that ITopKLayer always have sorted outputs, so we don't need to handle
+  // the 'sorted' attribute of the node.
+  if (params->validation_only) return Status::OK();
 
+  const nvinfer1::TopKOperation op = nvinfer1::TopKOperation::kMAX;
+  const int k = *(static_cast<int*>(const_cast<void*>(k_w.GetValues())));
+  const uint32_t reduce_axes = 1 << (num_dims - 1);
   nvinfer1::ITopKLayer* layer = params->converter->network()->addTopK(
-      *const_cast<nvinfer1::ITensor*>(tensor), op, k, reducedAxes);
+      *const_cast<nvinfer1::ITensor*>(tensor), op, k, reduce_axes);
   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
 
   nvinfer1::ITensor* output_value_tensor = layer->getOutput(0);
@@ -3286,14 +3862,25 @@ static void RegisterValidatableOpConverters(
     std::unordered_map<string, OpConverter>* registration) {
   // TODO(laigd): support all op types.
   (*registration)["BiasAdd"] = ConvertBiasAdd;
+  (*registration)["ConcatV2"] = ConvertConcat;
   (*registration)["Const"] = ConvertConst;
-  (*registration)["Transpose"] = ConvertTranspose;
-  (*registration)["Reshape"] = ConvertReshape;
+  (*registration)["Conv2D"] = ConvertConv2D;
+  (*registration)["Conv2DBackpropInput"] = ConvertConv2DBackpropInput;
+  (*registration)["DepthwiseConv2dNative"] = ConvertConv2DDepthwise;
+  (*registration)["ExpandDims"] = ConvertExpandDims;
+  (*registration)["GatherV2"] = ConvertGather;
+  (*registration)["LeakyRelu"] = ConvertLeakyRelu;
   (*registration)["MatMul"] = ConvertMatMul;
+  (*registration)["Pad"] = ConvertPad;
   (*registration)["Relu6"] = ConvertRelu6;
+  (*registration)["Reshape"] = ConvertReshape;
+  (*registration)["Rsqrt"] = ConvertRsqrt;
+  (*registration)["Slice"] = ConvertSlice;
   (*registration)["Square"] = ConvertSquare;
-  (*registration)["ExpandDims"] = ConvertExpandDims;
   (*registration)["Squeeze"] = ConvertSqueeze;
+  (*registration)["StridedSlice"] = ConvertStridedSlice;
+  (*registration)["Transpose"] = ConvertTranspose;
+  (*registration)["TopKV2"] = ConvertTopK;
 
   for (auto quantization_op_type :
        {"QuantizeAndDequantizeV2", "QuantizeAndDequantizeV3",
@@ -3307,6 +3894,15 @@ static void RegisterValidatableOpConverters(
   for (auto activation_op_type : {"Relu", "Sigmoid", "Tanh"}) {
     (*registration)[activation_op_type] = ConvertActivation;
   }
+  for (auto pool_op_type : {"AvgPool", "MaxPool"}) {
+    (*registration)[pool_op_type] = ConvertPool;
+  }
+  for (auto normalization_op_type : {"FusedBatchNorm", "FusedBatchNormV2"}) {
+    (*registration)[normalization_op_type] = ConvertFusedBatchNorm;
+  }
+  for (auto unary_op_pair : *UnaryOperationMap()) {
+    (*registration)[unary_op_pair.first] = ConvertUnary;
+  }
 }
 
 void TrtNodeValidator::RegisterOpValidators() {
@@ -3315,29 +3911,10 @@ void TrtNodeValidator::RegisterOpValidators() {
 
 void Converter::RegisterOpConverters() {
   RegisterValidatableOpConverters(&op_registry_);
-
-  op_registry_["Conv2D"] = ConvertConv2D;
-  op_registry_["DepthwiseConv2dNative"] = ConvertConv2DDepthwise;
-  op_registry_["MaxPool"] = ConvertPool;
-  op_registry_["AvgPool"] = ConvertPool;
   // TODO(ben,jie): this is a temp hack.
   op_registry_["Identity"] = ConvertIdentity;  // Identity should be removed
   op_registry_["Snapshot"] = ConvertIdentity;  // Snapshot should be removed
 
-  op_registry_["Pad"] = ConvertPad;
-
-  op_registry_["ConcatV2"] = ConvertConcat;
-  op_registry_["FusedBatchNorm"] = ConvertFusedBatchNorm;
-  op_registry_["FusedBatchNormV2"] = ConvertFusedBatchNorm;
-
-  op_registry_["Rsqrt"] = ConvertUnary;
-  op_registry_["Reciprocal"] = ConvertUnary;
-  op_registry_["Exp"] = ConvertUnary;
-  op_registry_["Log"] = ConvertUnary;
-  op_registry_["Sqrt"] = ConvertUnary;
-  op_registry_["Abs"] = ConvertUnary;
-  op_registry_["Neg"] = ConvertUnary;
-
   op_registry_["Sum"] = ConvertReduce;
   op_registry_["Prod"] = ConvertReduce;
   op_registry_["Max"] = ConvertReduce;
@@ -3345,14 +3922,13 @@ void Converter::RegisterOpConverters() {
   op_registry_["Mean"] = ConvertReduce;
   op_registry_["Softmax"] = ConvertSoftmax;
   op_registry_["BatchMatMul"] = ConvertBatchMatMul;
-  op_registry_["TopKV2"] = ConvertTopK;
 
   plugin_converter_ = ConvertPlugin;
 }
 
 tensorflow::Status ConvertGraphDefToEngine(
-    const tensorflow::GraphDef& gdef, int precision_mode, int max_batch_size,
-    size_t max_workspace_size_bytes,
+    const tensorflow::GraphDef& gdef, TrtPrecisionMode precision_mode,
+    int max_batch_size, size_t max_workspace_size_bytes,
     const std::vector<tensorflow::PartialTensorShape>& input_shapes,
     Logger* logger, nvinfer1::IGpuAllocator* allocator,
     TRTInt8Calibrator* calibrator,
@@ -3367,9 +3943,13 @@ tensorflow::Status ConvertGraphDefToEngine(
   builder->setMaxBatchSize(max_batch_size);
   builder->setMaxWorkspaceSize(max_workspace_size_bytes);
   builder->setGpuAllocator(allocator);
-  if (precision_mode == FP16MODE) {
-    builder->setHalf2Mode(true);
-  } else if (precision_mode == INT8MODE) {
+  if (precision_mode == TrtPrecisionMode::FP16) {
+    builder->setFp16Mode(true);
+  } else if (precision_mode == TrtPrecisionMode::INT8) {
+    // Setting FP16 mode as well allows TRT to also consider FP16 kernels and
+    // use them in situations where they are faster than INT8 or where INT8 is
+    // not supported for a given layer.
+    builder->setFp16Mode(true);
     builder->setInt8Mode(true);
     if (use_calibration) {
       builder->setInt8Calibrator(calibrator);
@@ -3389,15 +3969,14 @@ tensorflow::Status ConvertGraphDefToEngine(
   // Build the network
   VLOG(1) << "Starting engine conversion ";
   Converter converter(trt_network.get(), precision_mode, use_calibration);
-  std::vector<std::pair<string, string>> output_tensors;
+  std::vector<Converter::EngineOutputInfo> output_tensors;
   // Graph nodes are already topologically sorted during construction
   for (const auto& node_def : gdef.node()) {
     string node_name = node_def.name();
     VLOG(2) << "Converting op name=" << node_name << ", op=" << node_def.op();
-    if (tensorflow::str_util::StartsWith(node_name, kInputPHName) &&
-        (node_def.op() == "Placeholder")) {
+    if (IsEngineInput(node_name) && (node_def.op() == "Placeholder")) {
       int32 slot_number = -1;
-      if (!tensorflow::strings::safe_strto32(
+      if (!tensorflow::strings::safe_strto32(  // non-absl ok
               node_name.c_str() + strlen(kInputPHName), &slot_number)) {
         return tensorflow::errors::InvalidArgument(
             "Failed to parse slot number from ", node_name);
@@ -3423,18 +4002,23 @@ tensorflow::Status ConvertGraphDefToEngine(
       // engines offline, by calling sess.run() and cache/serialize the engines.
       TF_RETURN_IF_ERROR(
           converter.AddInputTensor(node_name, trt_dtype, trt_dims, batch_size));
-    } else if (tensorflow::str_util::StartsWith(node_name, kOutputPHName) &&
-               (node_def.op() == "Identity")) {
+    } else if (IsEngineOutput(node_name) && (node_def.op() == "Identity")) {
       int32 slot_number = -1;
-      if (!tensorflow::strings::safe_strto32(
+      if (!tensorflow::strings::safe_strto32(  // non-absl ok
               node_name.c_str() + strlen(kOutputPHName), &slot_number)) {
         return tensorflow::errors::InvalidArgument(
             "Failed to parse slot number from ", node_name);
       }
+      // Get output type that TensorFlow expects
+      TFAttrs attrs(node_def);
+      tensorflow::DataType tf_dtype = attrs.get<tensorflow::DataType>("T");
+      nvinfer1::DataType trt_dtype;
+      TF_RETURN_IF_ERROR(ConvertDType(tf_dtype, &trt_dtype));
       if (output_tensors.size() <= slot_number) {
         output_tensors.resize(slot_number + 1);
       }
-      output_tensors.at(slot_number) = {node_def.input(0), node_name};
+      output_tensors.at(slot_number) = {node_def.input(0), node_name,
+                                        trt_dtype};
     } else {
       VLOG(2) << "Converting node: " << node_def.name() << " , "
               << node_def.op();
@@ -3460,8 +4044,7 @@ tensorflow::Status ConvertGraphDefToEngine(
 tensorflow::Status ConvertSegmentToGraphDef(
     const tensorflow::Graph* graph,
     const tensorflow::grappler::GraphProperties& graph_properties,
-    const std::set<string>& subgraph_node_names,
-    const std::vector<int>& subgraph_node_ids,  // In topological order
+    const std::vector<const Node*>& subgraph_nodes,  // In topological order
     std::vector<EngineConnection>* connections,
     tensorflow::GraphDef* segment_def, string* common_scope) {
   std::set<string> marker_nodes;
@@ -3524,8 +4107,10 @@ tensorflow::Status ConvertSegmentToGraphDef(
       marker_nodes.insert(node_name);
       auto seg_node = segment_def->add_node();
       tensorflow::NodeDefBuilder builder(node_name, "Identity");
-      auto status = builder.Input(connection.inside_node_name, 0, dtype)
-                        .Finalize(seg_node);
+      auto status =
+          builder
+              .Input(connection.inside_node_name, connection.inside_port, dtype)
+              .Finalize(seg_node);
       VLOG(1) << "Constructing output " << node_name << " for the edge "
               << connection.inside_node_name << ":" << connection.inside_port
               << " -> " << connection.outside_node_name << ":"
@@ -3535,13 +4120,12 @@ tensorflow::Status ConvertSegmentToGraphDef(
 
   std::unordered_map<int, int> old_to_new_id_map;
   // Copy internal nodes to new graphdef
-  string local_scope = graph->FindNodeId(*subgraph_node_ids.begin())->name();
-  for (const auto node_id : subgraph_node_ids) {
-    const auto node = graph->FindNodeId(node_id);
+  string local_scope = subgraph_nodes.front()->name();
+  for (const Node* node : subgraph_nodes) {
     local_scope = GetCommonNameScope(local_scope, node->name());
-    old_to_new_id_map[node_id] = segment_def->node_size();
+    old_to_new_id_map[node->id()] = segment_def->node_size();
     auto snode = segment_def->add_node();
-    snode->CopyFrom(node->def());
+    *snode = node->def();
     VLOG(2) << "Copying " << snode->name() << " to subgraph";
   }
   // Update the inputs of the new input nodes to point to placeholder nodes.
@@ -3557,6 +4141,11 @@ tensorflow::Status ConvertSegmentToGraphDef(
             << placeholder_name;
     snode->set_input(connection.inside_port, placeholder_name);
   }
+  std::set<string> subgraph_node_names;
+  for (const Node* node : subgraph_nodes) {
+    subgraph_node_names.insert(node->name());
+  }
+
   // Remove control inputs that are not inside the segment.
   for (int i = 0; i < segment_def->node_size(); ++i) {
     auto snode = segment_def->mutable_node(i);
@@ -3567,7 +4156,7 @@ tensorflow::Status ConvertSegmentToGraphDef(
       TensorId input = ParseTensorName(snode->input(input_idx));
       if (!subgraph_node_names.count(
               string(input.first.data(), input.first.size())) &&
-          !str_util::StartsWith(input.first, kInputPHName)) {
+          !IsEngineInput(input.first)) {
         if (input.second == Graph::kControlSlot) {
           VLOG(1) << "... removing control inputs " << input.first
                   << " from subgraph.";
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.h b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h
similarity index 86%
rename from tensorflow/contrib/tensorrt/convert/convert_nodes.h
rename to tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h
index 54e19b73957bccdae2b23bd3556de9ad00b864e5..7b37173090519ff6fadd956942d7ea12a0644981 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.h
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_NODES_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_NODES_H_
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_CONVERT_NODES_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_CONVERT_NODES_H_
 
 #include <set>
 #include <string>
@@ -22,11 +22,11 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
-#include "tensorflow/contrib/tensorrt/convert/utils.h"
-#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_allocator.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_resources.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_resources.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
@@ -92,7 +92,7 @@ struct EngineInfo {
   EngineInfo()
       : engine_type(EngineType::TRTStatic),
         max_workspace_size_bytes(0),
-        precision_mode(FP32MODE),
+        precision_mode(TrtPrecisionMode::FP32),
         use_calibration(true) {}
 
   string engine_name;
@@ -109,7 +109,7 @@ struct EngineInfo {
   int64 max_workspace_size_bytes;
   int maximum_cached_engines;
   std::vector<int> cached_engine_batches;
-  int precision_mode;
+  TrtPrecisionMode precision_mode;
   bool use_calibration;
 };
 
@@ -128,8 +128,7 @@ struct EngineInfo {
 tensorflow::Status ConvertSegmentToGraphDef(
     const tensorflow::Graph* graph,
     const tensorflow::grappler::GraphProperties& graph_properties,
-    const std::set<string>& subgraph_node_names,
-    const std::vector<int>& subgraph_node_ids,
+    const std::vector<const Node*>& subgraph_nodes,
     std::vector<EngineConnection>* connections,
     tensorflow::GraphDef* segment_def, string* common_scope);
 
@@ -142,8 +141,8 @@ tensorflow::Status ConvertSegmentToGraphDef(
 //   is successful. This is different than successfully building the engine:
 //   building can still fail afterwards.
 tensorflow::Status ConvertGraphDefToEngine(
-    const tensorflow::GraphDef& gdef, int precision_mode, int max_batch_size,
-    size_t max_workspace_size_bytes,
+    const tensorflow::GraphDef& gdef, TrtPrecisionMode precision_mode,
+    int max_batch_size, size_t max_workspace_size_bytes,
     const std::vector<tensorflow::PartialTensorShape>& input_shapes,
     Logger* logger, nvinfer1::IGpuAllocator* allocator,
     TRTInt8Calibrator* calibrator,
@@ -159,7 +158,10 @@ class OutputEdgeValidator {
   bool operator()(const tensorflow::Edge* out_edge) const;
 };
 
+string DebugString(const nvinfer1::DimensionType type);
+string DebugString(const nvinfer1::DataType trt_dtype);
 string DebugString(const nvinfer1::Dims& dims);
+string DebugString(const nvinfer1::Permutation& permutation, int len);
 string DebugString(const nvinfer1::ITensor& tensor);
 int64_t TrtDimsNumElements(const nvinfer1::Dims& dims);
 
@@ -176,6 +178,8 @@ class TRT_ShapedWeights {
 
   nvinfer1::Weights GetTrtWeights() const;
 
+  // Returns the raw pointer to the underlying buffer which holds the weights
+  // value.
   void* GetValues() const {
     return const_cast<char*>(tensor_.tensor_data().data());
   }
@@ -186,6 +190,17 @@ class TRT_ShapedWeights {
 
   string DebugString() const;
 
+  template <typename T>
+  absl::Span<const T> GetSpan() const {
+    return absl::Span<const T>(tensor_.flat<T>().data(), count());
+  }
+
+  template <typename T>
+  std::vector<T> ToVector() const {
+    auto span = GetSpan<T>();
+    return std::vector<T>(span.data(), span.data() + span.size());
+  }
+
   // TODO(aaroey): make these private.
   nvinfer1::Dims shape_;  // Note: shape.type[] is not used.
   tensorflow::DataType type_;
@@ -195,6 +210,10 @@ class TRT_ShapedWeights {
   // underlying buffer.
   TRT_ShapedWeights(DataType type, nvinfer1::Dims dims, Tensor tensor);
 
+  // All weights should be stored inside TrtWeightStore to make sure lifetime of
+  // all the underlying tensors are available until the engine is built. For
+  // this reason, tensor_ should never be reassigned to a different value that
+  // is not already present in the TrtWeightStore.
   Tensor tensor_;
 
   friend class TrtWeightStore;
@@ -394,8 +413,21 @@ class TrtNodeValidator {
 // Class to convert TF nodes to TRT network.
 class Converter {
  public:
-  Converter(nvinfer1::INetworkDefinition* trt_network, int precision_mode,
-            bool use_calibration);
+  // Used for Converter::RenameAndMarkOutputTensors()
+  struct EngineOutputInfo {
+    // The TRT tensor name which produces the output.
+    string source_tensor_name;
+    // The TensorFlow node name which is receiving the output from the TRT
+    // engine. This should always be the Identity node created in
+    // ConvertSegmentToGraphDef.
+    string dest_node_name;
+    // Output type. TensorRT requires this to be explicitly set for engine
+    // outputs.
+    nvinfer1::DataType trt_dtype;
+  };
+
+  Converter(nvinfer1::INetworkDefinition* trt_network,
+            TrtPrecisionMode precision_mode, bool use_calibration);
 
   //////////////////////////////////////////////////////////////////////////////
   // Methods used by the TRT engine builder to build a TRT network from a TF
@@ -409,13 +441,10 @@ class Converter {
   Status AddInputTensor(const string& name, nvinfer1::DataType dtype,
                         const nvinfer1::Dims& dims, int batch_size);
 
-  // Mark the tensors with names specified by output_tensors[i].first as output
-  // of the TRT network, and set their names in the TRT network as
-  // output_tensors[i].second. The tensor names (output_tensors[i].first) are
-  // standard TF tensor names, i.e. node names followed by output slot number
-  // (or just the node name if the tensor is the first output of the node).
+  // Mark the tensors with names specified by source_tensor_name as output of
+  // the TRT network, and set their names in the TRT network as dest_node_name.
   Status RenameAndMarkOutputTensors(
-      const std::vector<std::pair<string, string>>& output_tensors);
+      const std::vector<EngineOutputInfo>& output_tensors);
 
   //////////////////////////////////////////////////////////////////////////////
   // Methods used by op converters to convert individual TF node and add layers
@@ -426,7 +455,7 @@ class Converter {
   nvinfer1::INetworkDefinition* network() { return trt_network_; }
 
   // What precision are we targeting?
-  int precision_mode() const { return precision_mode_; }
+  TrtPrecisionMode precision_mode() const { return precision_mode_; }
 
   // Calibration will be or was previously performed on this network?
   bool use_calibration() const { return use_calibration_; }
@@ -469,6 +498,11 @@ class Converter {
                               nvinfer1::Dims* operand_l_new_dims,
                               nvinfer1::Dims* operand_r_new_dims) const;
 
+  // Creates an IConstantLayer using 'weights' whose dimensions are specified by
+  // 'dims', and returns the output ITensor.
+  nvinfer1::ITensor* CreateConstantLayer(const TRT_ShapedWeights& weights,
+                                         const nvinfer1::Dims& dims);
+
  private:
   // Verify the provided batch_size is consistent with batch_size_ and update it
   // if necessary.
@@ -523,7 +557,7 @@ class Converter {
   std::vector<std::pair<nvinfer1::ITensor*, nvinfer1::ITensor*>>
       quantization_infer_;
 
-  const int precision_mode_;
+  const TrtPrecisionMode precision_mode_;
 
   const bool use_calibration_;
 
@@ -537,6 +571,9 @@ class Converter {
   friend class OpConverterTest;
 };
 
+// Map of all supported UnaryOperations
+const std::unordered_map<string, nvinfer1::UnaryOperation>* UnaryOperationMap();
+
 }  // namespace convert
 }  // namespace tensorrt
 }  // namespace tensorflow
@@ -544,4 +581,4 @@ class Converter {
 #endif  // GOOGLE_TENSORRT
 #endif  // GOOGLE_CUDA
 
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_CONVERT_CONVERT_NODES_H_
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_CONVERT_NODES_H_
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc
similarity index 56%
rename from tensorflow/contrib/tensorrt/convert/convert_nodes_test.cc
rename to tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc
index c37a43dd5def9daf3c5d70720c6db2aab20db077..45afc76d758ab5052da78879b27380e2c1ccb5b9 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes_test.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h"
 
 #include <memory>
 #include <unordered_map>
@@ -21,11 +21,16 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "absl/strings/match.h"
+#include "absl/strings/numbers.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
+#include "absl/types/span.h"
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/cc/ops/standard_ops.h"
-#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h"
 #include "tensorflow/core/framework/node_def.pb.h"  // NOLINT
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor.pb.h"  // NOLINT
@@ -35,7 +40,9 @@ limitations under the License.
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/protobuf/config.pb.h"  // NOLINT
 #include "tensorflow/core/public/session.h"
@@ -50,9 +57,10 @@ namespace tensorflow {
 namespace tensorrt {
 namespace convert {
 
-using ::tensorflow::strings::StrCat;
+using absl::StrCat;
 using ::testing::ElementsAre;
 using ::testing::ElementsAreArray;
+using ::testing::NanSensitiveFloatNear;
 
 // TODO(laigd): put this into some test utils file.
 void ExpectStatus(Status status, error::Code code = error::OK,
@@ -152,7 +160,7 @@ void ExpectTrtDimsEqualsArray(const std::vector<int>& lhs,
 }
 
 template <typename T>
-void ExpectArrayNear(const std::vector<T>& lhs, const std::vector<T>& rhs) {
+void ExpectArrayNear(const std::vector<T>& lhs, absl::Span<const T> rhs) {
   ASSERT_EQ(lhs.size(), rhs.size());
   for (int i = 0; i < lhs.size(); i++) {
     EXPECT_FLOAT_EQ(lhs[i], rhs[i]);
@@ -163,7 +171,7 @@ void ExpectArrayNear(const std::vector<T>& lhs, const std::vector<T>& rhs) {
 // EXPECT_FLOAT_EQ.
 template <>
 void ExpectArrayNear(const std::vector<Eigen::half>& lhs,
-                     const std::vector<Eigen::half>& rhs) {
+                     absl::Span<const Eigen::half> rhs) {
   ASSERT_EQ(lhs.size(), rhs.size());
   for (int i = 0; i < lhs.size(); i++) {
     EXPECT_FLOAT_EQ(Eigen::half_impl::half_to_float(lhs[i]),
@@ -234,6 +242,16 @@ class FakeITensor : public nvinfer1::ITensor {
   float getDynamicRange() const override { return dynamic_range_; }
 #endif
 
+#if NV_TENSORRT_MAJOR > 5 || (NV_TENSORRT_MAJOR == 5 && NV_TENSORRT_MINOR >= 1)
+  bool dynamicRangeIsSet() const override { return true; }
+
+  void resetDynamicRange() override {}
+
+  float getDynamicRangeMin() const override { return 0.f; }
+
+  float getDynamicRangeMax() const override { return 0.f; }
+#endif
+
  private:
   string name_;
   nvinfer1::Dims dims_;
@@ -364,9 +382,6 @@ TEST(TRT_TensorOrWeights_Test, Basic) {
       EXPECT_EQ(false, ptr->is_tensor());
       EXPECT_EQ(true, ptr->is_weights());
       EXPECT_TRUE(TrtShapedWeightsEquals(weights, ptr->weights()));
-
-      nvinfer1::Dims dims;
-      dims.nbDims = 0;
       ExpectTrtDimsEqualsArray({}, ptr->GetTrtDims());
     }
   }
@@ -481,8 +496,7 @@ class ConverterTest : public ::testing::Test {
   ConverterTest() {
     builder_.reset(nvinfer1::createInferBuilder(logger_));
     network_.reset(builder_->createNetwork());
-    converter_.reset(new Converter(network_.get(),
-                                   /*precision_mode=*/FP32MODE,
+    converter_.reset(new Converter(network_.get(), TrtPrecisionMode::FP32,
                                    /*use_calibration=*/false));
     weight_store_ = &converter_->weight_store_;
   }
@@ -784,7 +798,7 @@ TEST_F(ConverterTest, MaybeApplyQuantizationRanges) {
   // input -> infer1 -> infer2 -> infer3
   FakeITensor input, infer_1, infer_2, infer_3;
   FakeITensor not_infer;
-  Converter int8_converter(/*trt_network=*/nullptr, INT8MODE,
+  Converter int8_converter(/*trt_network=*/nullptr, TrtPrecisionMode::INT8,
                            /*use_calibration=*/true);
   int8_converter.ProvideQuantizationRange(&input, -5.0f, 5.0f);
   int8_converter.ProvideQuantizationRange(&not_infer, -100.0f, 100.0f);
@@ -915,6 +929,97 @@ TEST_F(ConverterTest, GetTrtBroadcastShape) {
                  "(tensor #dims 4 vs broadcast #dims 5)");
 }
 
+TEST_F(ConverterTest, CreateConstantLayer) {
+  for (auto dtype : {DT_FLOAT, DT_INT32}) {
+    TRT_ShapedWeights weights =
+        weight_store_->GetTempWeights(dtype, GetTestDims({2, 3, 5}));
+    nvinfer1::ITensor* tensor =
+        converter_->CreateConstantLayer(weights, GetTestDims({3, 10}));
+    ASSERT_NE(nullptr, tensor);
+    EXPECT_EQ(TfDataTypeToTrt(dtype), tensor->getType())
+        << "Expected " << DebugString(TfDataTypeToTrt(dtype)) << " vs. actual "
+        << DebugString(tensor->getType());
+    ExpectTrtDimsEqualsArray({3, 10}, tensor->getDimensions());
+  }
+}
+
+class ConvertGraphDefToEngineTest : public ::testing::Test {
+ public:
+  Status RunConvertGraphDefToEngine(Scope* s) {
+    GraphDef gdef;
+    TF_EXPECT_OK(s->ToGraphDef(&gdef));
+    std::vector<tensorflow::PartialTensorShape> input_shapes;
+    int batch_size = -1;
+    for (const NodeDef& node : gdef.node()) {
+      absl::string_view node_name(node.name());
+      if (str_util::ConsumePrefix(&node_name, kInputPHName)) {
+        int port = -1;
+        EXPECT_TRUE(absl::SimpleAtoi(node_name, &port)) << node.name();
+        if (input_shapes.size() < port + 1) input_shapes.resize(port + 1);
+        input_shapes[port] =
+            PartialTensorShape(node.attr().at("shape").shape());
+        if (batch_size == -1) {
+          batch_size = input_shapes[port].dim_size(0);
+        } else {
+          EXPECT_EQ(batch_size, input_shapes[port].dim_size(0));
+        }
+      }
+    }
+    // TODO(laigd): execute the engine and get outputs.
+    return ConvertGraphDefToEngine(
+        gdef, TrtPrecisionMode::FP32, /*max_batch_size=*/1,
+        /*max_workspace_size_bytes=*/64 << 20, input_shapes, &logger_,
+        /*allocator=*/nullptr, /*calibrator=*/nullptr, &engine_,
+        /*use_calibration=*/false, /*convert_successfully=*/nullptr);
+  }
+
+ protected:
+  TrtUniquePtrType<nvinfer1::ICudaEngine> engine_;
+
+ private:
+  Logger logger_;
+};
+
+TEST_F(ConvertGraphDefToEngineTest, IdentityGraph) {
+  Scope s = Scope::NewRootScope();
+  auto input = ops::Placeholder(s.WithOpName(StrCat(kInputPHName, 0)), DT_FLOAT,
+                                ops::Placeholder::Shape({1, 1}));
+  auto output = ops::Identity(s.WithOpName("identity1"), input);
+  output = ops::Identity(s.WithOpName("identity2"), output);
+  output = ops::Identity(s.WithOpName(StrCat(kOutputPHName, 0)), output);
+  // If the converter marks the input tensor as output tensor, the conversion
+  // below will fail with:
+  // > TensorRTOutputPH_0 cannot be both input and output
+  // > Network must have at least one output
+  TF_EXPECT_OK(RunConvertGraphDefToEngine(&s));
+}
+
+// Input/output data format for OpConverterTest::BuildAndRun().
+struct InputOutputData {
+  void* Buffer() const {
+    return const_cast<char*>(tensor.tensor_data().data());
+  }
+
+  size_t TotalBytes() const { return tensor.TotalBytes(); }
+
+  const char* name;
+  Tensor tensor;
+};
+
+template <typename T>
+Tensor ConstructTensor(int data_size, const T& value = T()) {
+  std::vector<T> values(data_size, value);
+  return test::AsTensor<T>(values);
+}
+
+using DataVec = std::vector<InputOutputData>;
+
+template <typename T>
+inline absl::Span<const T> GetSpanForData(const InputOutputData& data) {
+  const auto& tensor_map = data.tensor.flat<T>();
+  return absl::Span<const T>(tensor_map.data(), tensor_map.size());
+}
+
 // Class to test various op converters, using both a TrtNodeValidator and
 // Converter.
 class OpConverterTest : public ::testing::Test {
@@ -940,11 +1045,11 @@ class OpConverterTest : public ::testing::Test {
     builder_.reset(nvinfer1::createInferBuilder(logger_));
     network_.reset(builder_->createNetwork());
     builder_->setMaxBatchSize(1);
+    builder_->setMaxWorkspaceSize(1 << 26);
 
     // Reset the validator and converter.
     validator_.reset(new TrtNodeValidator);
-    converter_.reset(new Converter(network_.get(),
-                                   /*precision_mode=*/FP32MODE,
+    converter_.reset(new Converter(network_.get(), TrtPrecisionMode::FP32,
                                    /*use_calibration=*/false));
 
     // Reset other related artifacts.
@@ -953,14 +1058,14 @@ class OpConverterTest : public ::testing::Test {
   }
 
   // TODO(laigd): test fp16 and int8 support.
-  template <typename T>
-  void BuildAndRun(
-      const std::vector<std::pair<const char*, const std::vector<T>>>&
-          input_data,
-      const char* output_name, std::vector<T>* output_data) {
+  void BuildAndRun(const DataVec& input_data, DataVec* output_data) {
     // Mark the output tensor as TRT engine output.
-    TF_EXPECT_OK(converter_->RenameAndMarkOutputTensors(
-        {{string(output_name), string(output_name)}}));
+    std::vector<Converter::EngineOutputInfo> output_info;
+    for (const auto& data : *output_data) {
+      output_info.push_back(
+          {data.name, data.name, TfDataTypeToTrt(data.tensor.dtype())});
+    }
+    TF_EXPECT_OK(converter_->RenameAndMarkOutputTensors(output_info));
 
     // Build the TRT engine.
     ASSERT_EQ(nullptr, engine_.get());
@@ -968,31 +1073,44 @@ class OpConverterTest : public ::testing::Test {
     CHECK_NOTNULL(engine_.get());
 
     // Execute the TRT engine.
-    ASSERT_LE(input_data.size() + 1, 3);
-    void* buffers[3];
-    for (const auto name_and_data : input_data) {
-      const int input_size = name_and_data.second.size() * sizeof(T);
-      const int input_index = engine_->getBindingIndex(name_and_data.first);
-      ASSERT_EQ(0, cudaMalloc(&buffers[input_index], input_size));
-      ASSERT_EQ(
-          0, cudaMemcpyAsync(buffers[input_index], name_and_data.second.data(),
-                             input_size, cudaMemcpyHostToDevice, stream_));
+    const int num_bindings = input_data.size() + output_data->size();
+    std::vector<void*> buffers(num_bindings);
+
+    for (const auto& data : input_data) {
+      const int input_index = engine_->getBindingIndex(data.name);
+      ASSERT_EQ(0, cudaMalloc(&buffers[input_index], data.TotalBytes()));
+      ASSERT_EQ(0, cudaMemcpyAsync(buffers[input_index], data.Buffer(),
+                                   data.TotalBytes(), cudaMemcpyHostToDevice,
+                                   stream_));
+    }
+    struct SizeAndIndex {
+      SizeAndIndex(int in_size, int in_index)
+          : size(in_size), index(in_index) {}
+      int size;
+      int index;
+    };
+    std::vector<SizeAndIndex> output_infos;
+    for (const auto& data : *output_data) {
+      const int output_index = engine_->getBindingIndex(data.name);
+      output_infos.emplace_back(data.TotalBytes(), output_index);
+      ASSERT_EQ(0, cudaMalloc(&buffers[output_index], data.TotalBytes()));
     }
 
-    const int output_size = output_data->size() * sizeof(T);
-    const int output_index = engine_->getBindingIndex(output_name);
-    ASSERT_EQ(0, cudaMalloc(&buffers[output_index], output_size));
-
-    ASSERT_EQ(engine_->getNbBindings(), input_data.size() + 1);
-
+    ASSERT_EQ(engine_->getNbBindings(), num_bindings);
     TrtUniquePtrType<nvinfer1::IExecutionContext> execution_context(
         engine_->createExecutionContext());
-    execution_context->enqueue(/*batchSize=*/1, buffers, stream_, nullptr);
-    ASSERT_EQ(0, cudaMemcpyAsync(output_data->data(), buffers[output_index],
-                                 output_size, cudaMemcpyDeviceToHost, stream_));
+    execution_context->enqueue(/*batchSize=*/1, buffers.data(), stream_,
+                               nullptr);
+
+    for (int i = 0; i < output_infos.size(); ++i) {
+      const auto& output_info = output_infos[i];
+      ASSERT_EQ(0, cudaMemcpyAsync(output_data->at(i).Buffer(),
+                                   buffers[output_info.index], output_info.size,
+                                   cudaMemcpyDeviceToHost, stream_));
+    }
     cudaStreamSynchronize(stream_);
 
-    for (int i = 0; i < input_data.size() + 1; ++i) {
+    for (int i = 0; i < num_bindings; ++i) {
       ASSERT_EQ(0, cudaFree(buffers[i]));
     }
   }
@@ -1111,6 +1229,30 @@ class OpConverterTest : public ::testing::Test {
   std::unordered_map<string, NodeDef> validator_inputs_;
 };
 
+template <typename T>
+void CopyTensorElements(const Tensor& tensor, protobuf::RepeatedField<T>* out) {
+  out->Clear();
+  if (tensor.NumElements() == 0) return;
+
+  // TensorProto does not need to have all the elements present and can truncate
+  // trailing elements with the same value for compressed representation. Such
+  // elements are derived based on the tensor shape.
+  const auto flat = tensor.flat<T>();
+  int64 last_index = 0;
+  for (int64 i = 0; i < tensor.NumElements(); ++i) {
+    if (flat(i) != flat(last_index)) {
+      last_index = i;
+    }
+  }
+
+  int num_out_elements = last_index + 1;
+  out->Reserve(num_out_elements);
+  out->AddNAlreadyReserved(num_out_elements);
+  const T* src = flat.data();
+  T* dst = out->mutable_data();
+  std::copy(src, src + num_out_elements, dst);
+}
+
 template <DataType dtype, typename InputCType, typename OutputCType>
 void TestConvertConst(OpConverterTest* test) {
   NodeDef node_def;
@@ -1123,11 +1265,23 @@ void TestConvertConst(OpConverterTest* test) {
                             const std::vector<OutputCType>& expected_value) {
     test->Reset();
 
-    auto& attr = *node_def.mutable_attr();
+    TensorProto* tensor_attr =
+        (*node_def.mutable_attr())["value"].mutable_tensor();
+    tensor_attr->Clear();
+
     if (as_tensor_content) {
-      tensor.AsProtoTensorContent(attr["value"].mutable_tensor());
+      tensor.AsProtoTensorContent(tensor_attr);
     } else {
-      tensor.AsProtoField(attr["value"].mutable_tensor());
+      tensor.shape().AsProto(tensor_attr->mutable_tensor_shape());
+      tensor_attr->set_dtype(tensor.dtype());
+
+      if (tensor.dtype() == DT_FLOAT) {
+        CopyTensorElements<float>(tensor, tensor_attr->mutable_float_val());
+      } else if (tensor.dtype() == DT_INT32) {
+        CopyTensorElements<int32>(tensor, tensor_attr->mutable_int_val());
+      } else {
+        tensor.AsProtoField(tensor_attr);
+      }
     }
     test->RunValidationAndConversion(node_def);
     TRT_TensorOrWeights output;
@@ -1140,8 +1294,7 @@ void TestConvertConst(OpConverterTest* test) {
   {
     // By default empty tensor will pick DT_FLOAT as data type and we fix it
     // here.
-    attr["value"].mutable_tensor()->set_dtype(dtype);
-    Tensor t;  // Empty tensor.
+    Tensor t(dtype);  // Empty tensor.
     reset_and_test(t, false, {}, {});
   }
   {
@@ -1160,6 +1313,22 @@ void TestConvertConst(OpConverterTest* test) {
     reset_and_test(t, false, {2, 3}, {1, 2, 3, 4, 5, 6});
     reset_and_test(t, true, {2, 3}, {1, 2, 3, 4, 5, 6});
   }
+  {
+    // Set all tensor elements to the same value. Such tensors are encoded
+    // using a single element list in tensor proto.
+    Tensor t = ::tensorflow::test::AsTensor<InputCType>({1, 1, 1, 1, 1, 1},
+                                                        TensorShape({2, 3}));
+    reset_and_test(t, false, {2, 3}, {1, 1, 1, 1, 1, 1});
+    reset_and_test(t, true, {2, 3}, {1, 1, 1, 1, 1, 1});
+  }
+  {
+    // Set trailing tensor elements to the same value. Such tensors are
+    // encoded by truncating all equal elements except the first one.
+    Tensor t = ::tensorflow::test::AsTensor<InputCType>({2, 2, 1, 1, 1, 1},
+                                                        TensorShape({2, 3}));
+    reset_and_test(t, false, {2, 3}, {2, 2, 1, 1, 1, 1});
+    reset_and_test(t, true, {2, 3}, {2, 2, 1, 1, 1, 1});
+  }
 }
 
 TEST_F(OpConverterTest, ConvertConst) {
@@ -1189,7 +1358,7 @@ TEST_F(OpConverterTest, ConvertTranspose) {
     NodeDef node_def = MakeNodeDef("my_transpose", "Transpose", {});
     RunValidationAndConversion(
         node_def, error::INVALID_ARGUMENT,
-        "Input expects tensor and weights, at my_transpose");
+        "Transpose got 0 inputs but expected 2, at my_transpose");
   }
 
   // Get the NodeDef for Transpose.
@@ -1205,8 +1374,8 @@ TEST_F(OpConverterTest, ConvertTranspose) {
     AddTestTensor("input", {1, 2, 3});
     AddTestTensor("weights", {3});
     RunValidationAndConversion(
-        node_def, error::INVALID_ARGUMENT,
-        "Input expects tensor and weights, at my_transpose");
+        node_def, error::UNIMPLEMENTED,
+        "The input \"perm\" for Transpose must be a constant, at my_transpose");
   }
   {
     // Transpose at batch dimension, should fail.
@@ -1236,10 +1405,12 @@ TEST_F(OpConverterTest, ConvertTranspose) {
     EXPECT_TRUE(output.is_tensor());
     ExpectTrtDimsEqualsArray({3, 1, 2}, output.tensor()->getDimensions());
 
-    std::vector<float> output_data(6);
-    BuildAndRun<float>({{"input", {1, 2, 3, 4, 5, 6}}}, "my_transpose",
-                       &output_data);
-    EXPECT_THAT(output_data, ElementsAre(1, 4, 2, 5, 3, 6));
+    const DataVec input_data{
+        {"input", test::AsTensor<float>({1, 2, 3, 4, 5, 6})}};
+    DataVec output_data{{"my_transpose", ConstructTensor<float>(6)}};
+    BuildAndRun(input_data, &output_data);
+    EXPECT_THAT(GetSpanForData<float>(output_data[0]),
+                ElementsAre(1, 4, 2, 5, 3, 6));
   }
 }
 
@@ -1249,7 +1420,7 @@ TEST_F(OpConverterTest, ConvertReshape) {
     NodeDef node_def = MakeNodeDef("my_reshape", "Reshape", {});
     RunValidationAndConversion(
         node_def, error::INVALID_ARGUMENT,
-        "Input expects weights for shape, at my_reshape");
+        "Reshape got 0 inputs but expected 2, at my_reshape");
   }
 
   // Get the NodeDef for Reshape.
@@ -1265,8 +1436,8 @@ TEST_F(OpConverterTest, ConvertReshape) {
     AddTestTensor("input", {1, 2, 3});
     AddTestTensor("weights", {3});
     RunValidationAndConversion(
-        node_def, error::INVALID_ARGUMENT,
-        "Input expects weights for shape, at my_reshape");
+        node_def, error::UNIMPLEMENTED,
+        "The input \"shape\" for Reshape must be a constant, at my_reshape");
   }
   {
     // Reshape to scalar, should fail.
@@ -1279,11 +1450,6 @@ TEST_F(OpConverterTest, ConvertReshape) {
   }
 
   struct TestParams {
-    TestParams(int input_batch_size, const std::vector<int>& input_tensor_dims,
-               const std::vector<int>& input_shape)
-        : batch_size(input_batch_size),
-          tensor_dims(input_tensor_dims),
-          shape(input_shape) {}
     int batch_size;
     std::vector<int> tensor_dims;
     std::vector<int> shape;
@@ -1326,10 +1492,12 @@ TEST_F(OpConverterTest, ConvertReshape) {
     EXPECT_TRUE(output.is_tensor());
     ExpectTrtDimsEqualsArray({1, 3, 2}, output.tensor()->getDimensions());
 
-    std::vector<float> output_data(6);
-    BuildAndRun<float>({{"input", {1, 2, 3, 4, 5, 6}}}, "my_reshape",
-                       &output_data);
-    EXPECT_THAT(output_data, ElementsAre(1, 2, 3, 4, 5, 6));
+    const DataVec input_data{
+        {"input", test::AsTensor<float>({1, 2, 3, 4, 5, 6})}};
+    DataVec output_data{{"my_reshape", ConstructTensor<float>(6)}};
+    BuildAndRun(input_data, &output_data);
+    EXPECT_THAT(GetSpanForData<float>(output_data[0]),
+                ElementsAre(1, 2, 3, 4, 5, 6));
   }
 }
 
@@ -1339,7 +1507,7 @@ TEST_F(OpConverterTest, ConvertMatMul) {
     NodeDef node_def = MakeNodeDef("my_matmul", "MatMul", {});
     RunValidationAndConversion(
         node_def, error::INVALID_ARGUMENT,
-        "Input expects tensor and weights, at my_matmul");
+        "MatMul got 0 inputs but expected 2, at my_matmul");
   }
 
   // Get the NodeDef for MatMul.
@@ -1389,12 +1557,13 @@ TEST_F(OpConverterTest, ConvertMatMul) {
     EXPECT_TRUE(output.is_tensor());
     ExpectTrtDimsEqualsArray({2}, output.tensor()->getDimensions());
 
-    std::vector<float> output_data(2);
-    BuildAndRun<float>({{"input", {0, 1}}}, "my_matmul", &output_data);
+    const DataVec input_data{{"input", test::AsTensor<float>({0, 1})}};
+    DataVec output_data{{"my_matmul", ConstructTensor<float>(2)}};
+    BuildAndRun(input_data, &output_data);
     if (transpose_b) {
-      EXPECT_THAT(output_data, ElementsAre(1, 3));
+      EXPECT_THAT(GetSpanForData<float>(output_data[0]), ElementsAre(1, 3));
     } else {
-      EXPECT_THAT(output_data, ElementsAre(2, 3));
+      EXPECT_THAT(GetSpanForData<float>(output_data[0]), ElementsAre(2, 3));
     }
   }
 }
@@ -1448,23 +1617,28 @@ void TestConvertBiasAdd(OpConverterTest* test) {
       const int num_input = TrtDimsNumElements(GetTestDims(dims_array));
       ASSERT_EQ(trt_input_rank > 1 ? 6 : (data_format == "NHWC" ? 3 : 2),
                 num_input);
-      std::vector<CType> output_data(num_input);
-      test->BuildAndRun<CType>(
-          {{"input", std::vector<CType>(num_input, CType(0))}}, "my_biasadd",
-          &output_data);
+
+      const DataVec input_data{
+          {"input", ConstructTensor<CType>(num_input, CType(0))}};
+      DataVec output_data{{"my_biasadd", ConstructTensor<CType>(num_input)}};
+      test->BuildAndRun(input_data, &output_data);
       if (trt_input_rank == 1) {
         if (data_format == "NHWC") {
-          EXPECT_THAT(output_data, ElementsAre(CType(1), CType(2), CType(3)));
+          EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
+                      ElementsAre(CType(1), CType(2), CType(3)));
         } else {
-          EXPECT_THAT(output_data, ElementsAre(CType(1), CType(2)));
+          EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
+                      ElementsAre(CType(1), CType(2)));
         }
       } else {
         if (data_format == "NHWC") {
-          EXPECT_THAT(output_data, ElementsAre(CType(1), CType(2), CType(3),
-                                               CType(1), CType(2), CType(3)));
+          EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
+                      ElementsAre(CType(1), CType(2), CType(3), CType(1),
+                                  CType(2), CType(3)));
         } else {
-          EXPECT_THAT(output_data, ElementsAre(CType(1), CType(1), CType(1),
-                                               CType(2), CType(2), CType(2)));
+          EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
+                      ElementsAre(CType(1), CType(1), CType(1), CType(2),
+                                  CType(2), CType(2)));
         }
       }
     }
@@ -1477,7 +1651,7 @@ TEST_F(OpConverterTest, ConvertBiasAdd) {
     NodeDef node_def = MakeNodeDef("my_biasadd", "BiasAdd", {});
     RunValidationAndConversion(
         node_def, error::INVALID_ARGUMENT,
-        "Input expects tensor and weights, at my_biasadd");
+        "BiasAdd got 0 inputs but expected 2, at my_biasadd");
   }
 
   // OK. Note that kINT32 is not supported by IScaleLayer, so we don't test
@@ -1542,21 +1716,25 @@ void TestBinaryTensorOpWeightNoBroadcast(OpConverterTest* test) {
     EXPECT_TRUE(output.is_tensor());
     ExpectTrtDimsEqualsArray({1, 1, 2}, output.tensor()->getDimensions());
 
-    std::vector<CType> output_data(2);
-    test->BuildAndRun<CType>(
-        {{"input",
-          /*input_data=*/swap_inputs ? operand2 : operand1}},
-        "my_binary", &output_data);
+    const DataVec input_data{
+        {"input", test::AsTensor<CType>(swap_inputs ? operand2 : operand1)}};
+    DataVec output_data{{"my_binary", ConstructTensor<CType>(2)}};
+    test->BuildAndRun(input_data, &output_data);
     if (node_def.op() == "Add") {
-      EXPECT_THAT(output_data, ElementsAre(CType(5), CType(10.5)));
+      EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
+                  ElementsAre(CType(5), CType(10.5)));
     } else if (node_def.op() == "Sub") {
-      EXPECT_THAT(output_data, ElementsAre(CType(1), CType(4.5)));
+      EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
+                  ElementsAre(CType(1), CType(4.5)));
     } else if (node_def.op() == "Mul") {
-      EXPECT_THAT(output_data, ElementsAre(CType(6), CType(22.5)));
+      EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
+                  ElementsAre(CType(6), CType(22.5)));
     } else if (node_def.op() == "Div") {
-      EXPECT_THAT(output_data, ElementsAre(CType(1.5), CType(2.5)));
+      EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
+                  ElementsAre(CType(1.5), CType(2.5)));
     } else if (node_def.op() == "RealDiv") {
-      EXPECT_THAT(output_data, ElementsAre(CType(1.5), CType(2.5)));
+      EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
+                  ElementsAre(CType(1.5), CType(2.5)));
     } else {
       ASSERT_TRUE(false);
     }
@@ -1591,13 +1769,14 @@ void TestBinaryTensorOpWeightWithChannelWiseBroadcast(OpConverterTest* test) {
     EXPECT_TRUE(output.is_tensor());
     ExpectTrtDimsEqualsArray({2, 1, 2}, output.tensor()->getDimensions());
 
-    std::vector<CType> output_data(4);
-    test->BuildAndRun<CType>({{"input", input}}, "my_binary", &output_data);
+    const DataVec input_data{{"input", test::AsTensor<CType>(input)}};
+    DataVec output_data{{"my_binary", ConstructTensor<CType>(4)}};
+    test->BuildAndRun(input_data, &output_data);
     if (weights_dims.size() == 1) {
-      EXPECT_THAT(output_data,
+      EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
                   ElementsAre(CType(11), CType(22), CType(13), CType(24)));
     } else {
-      EXPECT_THAT(output_data,
+      EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
                   ElementsAre(CType(11), CType(12), CType(23), CType(24)));
     }
   }
@@ -1625,9 +1804,10 @@ void TestBinaryTensorOpWeightWithUniformlyBroadcast(OpConverterTest* test) {
   EXPECT_TRUE(output.is_tensor());
   ExpectTrtDimsEqualsArray({2, 1, 2}, output.tensor()->getDimensions());
 
-  std::vector<CType> output_data(4);
-  test->BuildAndRun<CType>({{"input", input}}, "my_binary", &output_data);
-  EXPECT_THAT(output_data,
+  const DataVec input_data{{"input", test::AsTensor<CType>(input)}};
+  DataVec output_data{{"my_binary", ConstructTensor<CType>(4)}};
+  test->BuildAndRun(input_data, &output_data);
+  EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
               ElementsAre(CType(11), CType(12), CType(13), CType(14)));
 }
 
@@ -1675,17 +1855,19 @@ void TestBinaryTensorOpWeightFallback(OpConverterTest* test,
   // Check the result of running the engine.
   const int expected_num_outputs =
       TrtDimsNumElements(GetTestDims(expected_output_dims));
-  std::vector<CType> output_data(expected_num_outputs);
-  test->BuildAndRun<CType>(
-      {{"input",
-        /*input_data=*/std::vector<CType>(num_inputs, CType(2))}},
-      "my_binary", &output_data);
+  const DataVec input_data{
+      {"input", ConstructTensor<CType>(num_inputs, CType(2))}};
+  DataVec output_data{
+      {"my_binary", ConstructTensor<CType>(expected_num_outputs)}};
+  test->BuildAndRun(input_data, &output_data);
   if (node_def.op() == "Add") {
-    EXPECT_THAT(output_data, ElementsAreArray(std::vector<CType>(
-                                 expected_num_outputs, CType(3))));
+    EXPECT_THAT(
+        GetSpanForData<CType>(output_data[0]),
+        ElementsAreArray(std::vector<CType>(expected_num_outputs, CType(3))));
   } else if (node_def.op() == "Minimum") {
-    EXPECT_THAT(output_data, ElementsAreArray(std::vector<CType>(
-                                 expected_num_outputs, CType(1))));
+    EXPECT_THAT(
+        GetSpanForData<CType>(output_data[0]),
+        ElementsAreArray(std::vector<CType>(expected_num_outputs, CType(1))));
   } else {
     ASSERT_TRUE(false);
   }
@@ -1712,32 +1894,33 @@ void TestBinaryTensorOpTensor(OpConverterTest* test) {
   EXPECT_TRUE(output.is_tensor());
   ExpectTrtDimsEqualsArray({2, 2}, output.tensor()->getDimensions());
 
-  std::vector<CType> output_data(4);
+  const DataVec input_data{
+      {"input1", test::AsTensor<CType>({CType(3), CType(6)})},
+      {"input2", test::AsTensor<CType>({CType(2), CType(3)})}};
+  DataVec output_data{{"my_binary", ConstructTensor<CType>(4)}};
   // After broadcasting first input becomes {3, 6, 3, 6} and second input
   // becomes {2, 3, 2, 3}.
-  test->BuildAndRun<CType>(
-      {{"input1", {CType(3), CType(6)}}, {"input2", {CType(2), CType(3)}}},
-      "my_binary", &output_data);
+  test->BuildAndRun(input_data, &output_data);
   if (node_def.op() == "Add") {
-    EXPECT_THAT(output_data,
+    EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
                 ElementsAre(CType(5), CType(8), CType(6), CType(9)));
   } else if (node_def.op() == "Sub") {
-    EXPECT_THAT(output_data,
+    EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
                 ElementsAre(CType(1), CType(4), CType(0), CType(3)));
   } else if (node_def.op() == "Mul") {
-    EXPECT_THAT(output_data,
+    EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
                 ElementsAre(CType(6), CType(12), CType(9), CType(18)));
   } else if (node_def.op() == "Div") {
-    EXPECT_THAT(output_data,
+    EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
                 ElementsAre(CType(1.5), CType(3), CType(1), CType(2)));
   } else if (node_def.op() == "RealDiv") {
-    EXPECT_THAT(output_data,
+    EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
                 ElementsAre(CType(1.5), CType(3), CType(1), CType(2)));
   } else if (node_def.op() == "Minimum") {
-    EXPECT_THAT(output_data,
+    EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
                 ElementsAre(CType(2), CType(2), CType(3), CType(3)));
   } else if (node_def.op() == "Maximum") {
-    EXPECT_THAT(output_data,
+    EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
                 ElementsAre(CType(3), CType(6), CType(3), CType(6)));
   } else {
     ASSERT_TRUE(false);
@@ -1751,7 +1934,9 @@ TEST_F(OpConverterTest, ConvertBinary) {
     NodeDef node_def = MakeNodeDef("my_add", "Add", {num_inputs, "input"});
     AddTestTensor("input", {1}, /*batch_size=*/1, nvinfer1::DataType::kFLOAT);
     RunValidationAndConversion(node_def, error::INVALID_ARGUMENT,
-                               "Binary ops require two inputs, at my_add");
+                               StrCat("Add got ", std::to_string(num_inputs),
+                                      " inputs but expected 2, at my_add")
+                                   .c_str());
   }
   {
     // Both inputs are weights.
@@ -1821,14 +2006,18 @@ TEST_F(OpConverterTest, ConvertBinary) {
 }
 
 TEST_F(OpConverterTest, ConvertQuantize) {
-  for (const string& op :
-       {"FakeQuantWithMinMaxArgs", "FakeQuantWithMinMaxVars",
-        "QuantizeAndDequantizeV2", "QuantizeAndDequantizeV3"}) {
+  const std::pair<string, int> op_with_num_inputs[4] = {
+      {"FakeQuantWithMinMaxArgs", 1},
+      {"FakeQuantWithMinMaxVars", 3},
+      {"QuantizeAndDequantizeV2", 3},
+      {"QuantizeAndDequantizeV3", 4}};
+  for (const auto& pair : op_with_num_inputs) {
     // Input list is empty, should fail.
-    NodeDef node_def = MakeNodeDef("my_quantize", op, {});
+    NodeDef node_def = MakeNodeDef("my_quantize", pair.first, {});
     RunValidationAndConversion(
         node_def, error::INVALID_ARGUMENT,
-        StrCat("Invalid number of inputs for ", op, ", at my_quantize")
+        StrCat(pair.first, " got 0 inputs but expected ",
+               std::to_string(pair.second), ", at my_quantize")
             .c_str());
   }
   {
@@ -1915,9 +2104,9 @@ TEST_F(OpConverterTest, ConvertQuantize) {
     AddTestTensor("weights_min", {1});
     AddTestTensor("weights_max", {1});
     RunValidationAndConversion(
-        node_def, error::INVALID_ARGUMENT,
-        "Min and max inputs for QuantizeAndDequantizeV2 must be weights not "
-        "tensors, at my_quantize");
+        node_def, error::UNIMPLEMENTED,
+        "The input \"input_min\" for QuantizeAndDequantizeV2 must be a constant"
+        ", at my_quantize");
   }
   {
     // QuantizeAndDequantizeV3 ranges set via inputs, ok.
@@ -1944,46 +2133,6 @@ TEST_F(OpConverterTest, ConvertQuantize) {
   }
 }
 
-TEST_F(OpConverterTest, ConvertRelu6) {
-  {
-    // Input list is empty, should fail.
-    NodeDef node_def = MakeNodeDef("my_relu6", "Relu6", {});
-    RunValidationAndConversion(
-        node_def, error::INVALID_ARGUMENT,
-        "Invalid number of inputs for Relu6, at my_relu6");
-  }
-
-  // Get the NodeDef for Relu6.
-  Scope s = Scope::NewRootScope();
-  auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
-  auto relu6 = ops::Relu6(s.WithOpName("my_relu6"), input);
-  const NodeDef node_def = relu6.operation.node()->def();
-  {
-    // Input is weights, should fail.
-    Reset();
-    AddTestWeights<float>("input", {1}, {1.0f});
-    RunValidationAndConversion(
-        node_def, error::UNIMPLEMENTED,
-        "Relu6 is only implemented for tensors, not weights, at my_relu6");
-  }
-  {
-    // Clip tensor values and set quantization ranges, ok.
-    Reset();
-    AddTestTensor("input", {1, 2, 3});
-    RunValidationAndConversion(node_def);
-    TRT_TensorOrWeights output;
-    TF_EXPECT_OK(GetTensorOrWeights("my_relu6", &output));
-    EXPECT_TRUE(output.is_tensor());
-    auto ranges = quantization_ranges();
-    EXPECT_EQ(ranges[output.tensor()], 6.0f);
-
-    std::vector<float> output_data(6);
-    BuildAndRun<float>({{"input", {-100, -1, 0, 3, 5, 9}}}, "my_relu6",
-                       &output_data);
-    EXPECT_THAT(output_data, ElementsAre(0, 0, 0, 3, 5, 6));
-  }
-}
-
 template <DataType dtype>
 void TestConvertSquare(OpConverterTest* test) {
   test->Reset();
@@ -2002,24 +2151,26 @@ void TestConvertSquare(OpConverterTest* test) {
   ExpectTrtDimsEqualsArray({1, 20}, output.tensor()->getDimensions());
 
   const int num_inputs = 20;
-  std::vector<CType> input_data(num_inputs);
-  std::vector<CType> expected_output_data(num_inputs);
+  std::vector<CType> inputs(num_inputs);
+  std::vector<CType> expected_outputs(num_inputs);
   for (int i = 0; i < 20; i++) {
     const CType value = CType(i - 9);
-    input_data[i] = value;
-    expected_output_data[i] = value * value;
+    inputs[i] = value;
+    expected_outputs[i] = value * value;
   }
-  std::vector<CType> output_data(num_inputs);
-  test->BuildAndRun<CType>({{"input", input_data}}, "my_square", &output_data);
-  ExpectArrayNear(expected_output_data, output_data);
+  const DataVec input_data{{"input", test::AsTensor<CType>(inputs)}};
+  DataVec output_data{{"my_square", ConstructTensor<CType>(num_inputs)}};
+  test->BuildAndRun(input_data, &output_data);
+  ExpectArrayNear(expected_outputs, GetSpanForData<CType>(output_data[0]));
 }
 
 TEST_F(OpConverterTest, ConvertSquare) {
   {
     // Input list is empty, should fail.
     NodeDef node_def = MakeNodeDef("my_square", "Square", {});
-    RunValidationAndConversion(node_def, error::INVALID_ARGUMENT,
-                               "Square expects one input, at my_square");
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Square got 0 inputs but expected 1, at my_square");
   }
   {
     // Input is weights, should fail.
@@ -2031,7 +2182,7 @@ TEST_F(OpConverterTest, ConvertSquare) {
     AddTestWeights<float>("input", {1, 2, 3}, {1, 2, 3, 4, -5, 6});
     RunValidationAndConversion(
         node_def, error::UNIMPLEMENTED,
-        "Square is only implemented for tensors, at my_square");
+        "The input \"x\" for Square must be a tensor, at my_square");
   }
 
   // OK. Note that kINT32 is not supported by IElementWiseLayer, so we don't
@@ -2047,7 +2198,7 @@ TEST_F(OpConverterTest, ConvertActivation) {
     // Input list is empty, should fail.
     NodeDef node_def = MakeNodeDef("my_act", "Relu", {});
     RunValidationAndConversion(node_def, error::INVALID_ARGUMENT,
-                               "Relu expects one input, at my_act");
+                               "Relu got 0 inputs but expected 1, at my_act");
   }
   {
     // Input is weights, should fail.
@@ -2059,16 +2210,26 @@ TEST_F(OpConverterTest, ConvertActivation) {
     AddTestWeights<int32>("input", {1, 2, 3}, {-3, -2, -1, 0, 1, 2});
     RunValidationAndConversion(
         node_def, error::UNIMPLEMENTED,
-        "Relu is only implemented for tensors, at my_act");
+        "The input \"input\" for Relu must be a tensor, at my_act");
   }
 
+  constexpr float kAlpha = 0.2f;
+
   // Get nodedef for activation layer.
   auto get_act_nodedef = [](string op_name) -> NodeDef {
     Scope s = Scope::NewRootScope();
     auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
-    if (op_name == "Relu") {
+    if (op_name == "LeakyRelu") {
+      // LeakyRelu does not have a C++ API
+      NodeDef node_def = MakeNodeDef("my_act", "LeakyRelu", {"input"});
+      (*node_def.mutable_attr())["alpha"].set_f(kAlpha);
+      return node_def;
+    } else if (op_name == "Relu") {
       auto act = ops::Relu(s.WithOpName("my_act"), input);
       return act.operation.node()->def();
+    } else if (op_name == "Relu6") {
+      auto act = ops::Relu6(s.WithOpName("my_act"), input);
+      return act.operation.node()->def();
     } else if (op_name == "Sigmoid") {
       auto act = ops::Sigmoid(s.WithOpName("my_act"), input);
       return act.operation.node()->def();
@@ -2081,8 +2242,12 @@ TEST_F(OpConverterTest, ConvertActivation) {
   };
   // Get expected output for activation layer.
   auto get_act_output = [](string op_name, float input) -> float {
-    if (op_name == "Relu") {
+    if (op_name == "LeakyRelu") {
+      return (input > 0.0f) ? input : input * kAlpha;
+    } else if (op_name == "Relu") {
       return (input > 0.0f) ? input : 0.0f;
+    } else if (op_name == "Relu6") {
+      return std::min(std::max(input, 0.0f), 6.0f);
     } else if (op_name == "Sigmoid") {
       return 1.0f / (1.0f + std::exp(-input));
     } else if (op_name == "Tanh") {
@@ -2093,7 +2258,8 @@ TEST_F(OpConverterTest, ConvertActivation) {
   };
 
   // Ok.
-  for (string op_name : {"Relu", "Sigmoid", "Tanh"}) {
+  for (const string& op_name :
+       {"LeakyRelu", "Relu", "Relu6", "Sigmoid", "Tanh"}) {
     Reset();
     NodeDef node_def = get_act_nodedef(op_name);
     AddTestTensor("input", {1, 2, 3});
@@ -2102,13 +2268,20 @@ TEST_F(OpConverterTest, ConvertActivation) {
     TF_EXPECT_OK(GetTensorOrWeights("my_act", &output));
     EXPECT_TRUE(output.is_tensor());
     ExpectTrtDimsEqualsArray({1, 2, 3}, output.tensor()->getDimensions());
+    if (op_name == "Relu6") {
+      // Relu6 should set quantization range automatically.
+      auto ranges = quantization_ranges();
+      EXPECT_EQ(ranges[output.tensor()], 6.0f);
+    }
 
-    const std::vector<float> input_data = {-100, -2, -1, 0, 1, 100};
-    std::vector<float> output_data(6);
-    BuildAndRun<float>({{"input", input_data}}, "my_act", &output_data);
-    for (int i = 0; i < input_data.size(); i++) {
-      const float expected_output = get_act_output(op_name, input_data[i]);
-      EXPECT_FLOAT_EQ(output_data[i], expected_output);
+    const std::vector<float> input = {-100, -2, -1, 0, 1, 100};
+    const DataVec input_data{{"input", test::AsTensor<float>(input)}};
+    DataVec output_data{{"my_act", ConstructTensor<float>(6)}};
+    BuildAndRun(input_data, &output_data);
+    for (int i = 0; i < input.size(); i++) {
+      const float expected_output = get_act_output(op_name, input[i]);
+      EXPECT_FLOAT_EQ(GetSpanForData<float>(output_data[0])[i],
+                      expected_output);
     }
   }
 }
@@ -2119,7 +2292,7 @@ TEST_F(OpConverterTest, ConvertExpandDims) {
     NodeDef node_def = MakeNodeDef("my_expanddims", "ExpandDims", {});
     RunValidationAndConversion(
         node_def, error::INVALID_ARGUMENT,
-        "Two inputs expected for ExpandDims, at my_expanddims");
+        "ExpandDims got 0 inputs but expected 2, at my_expanddims");
   }
 
   // Get the NodeDef for ExpandDims.
@@ -2129,24 +2302,23 @@ TEST_F(OpConverterTest, ConvertExpandDims) {
   auto expanddims =
       ops::ExpandDims(s.WithOpName("my_expanddims"), input, weights);
   const NodeDef& node_def = expanddims.operation.node()->def();
-
   {
     // Input is weights, should fail.
     Reset();
     AddTestWeights<int32>("input", {1, 2, 3}, {1, 2, 3, 4, 5, 6});
     AddTestWeights<int32>("weights", {1}, {1});
-    RunValidationAndConversion(
-        node_def, error::UNIMPLEMENTED,
-        "ExpandDims expects tensor for input, at my_expanddims");
+    RunValidationAndConversion(node_def, error::UNIMPLEMENTED,
+                               "The input \"input\" for ExpandDims must be a "
+                               "tensor, at my_expanddims");
   }
   {
     // Axis is a tensor, should fail.
     Reset();
     AddTestTensor("input", {1, 2, 3});
     AddTestTensor("weights", {3});
-    RunValidationAndConversion(
-        node_def, error::INVALID_ARGUMENT,
-        "ExpandDims expects weights for axis, at my_expanddims");
+    RunValidationAndConversion(node_def, error::UNIMPLEMENTED,
+                               "The input \"axis\" for ExpandDims must be a "
+                               "constant, at my_expanddims");
   }
   {
     // Add dim at batch dimension, should fail.
@@ -2193,11 +2365,6 @@ TEST_F(OpConverterTest, ConvertExpandDims) {
   }
 
   struct TestParams {
-    TestParams(const std::vector<int>& input_dims, int axis,
-               const std::vector<int>& expected_output_dims)
-        : input_dims(input_dims),
-          axis(axis),
-          expected_output_dims(expected_output_dims) {}
     std::vector<int> input_dims;
     int axis;
     std::vector<int> expected_output_dims;
@@ -2222,10 +2389,12 @@ TEST_F(OpConverterTest, ConvertExpandDims) {
     ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims,
                              output.tensor()->getDimensions());
 
-    std::vector<float> output_data(6);
-    BuildAndRun<float>({{"input", {1, 2, 3, 4, 5, 6}}}, "my_expanddims",
-                       &output_data);
-    EXPECT_THAT(output_data, ElementsAre(1, 2, 3, 4, 5, 6));
+    const DataVec input_data{
+        {"input", test::AsTensor<float>({1, 2, 3, 4, 5, 6})}};
+    DataVec output_data{{"my_expanddims", ConstructTensor<float>(6)}};
+    BuildAndRun(input_data, &output_data);
+    EXPECT_THAT(GetSpanForData<float>(output_data[0]),
+                ElementsAre(1, 2, 3, 4, 5, 6));
   }
 }
 
@@ -2233,8 +2402,9 @@ TEST_F(OpConverterTest, ConvertSqueeze) {
   {
     // Input list is empty, should fail.
     NodeDef node_def = MakeNodeDef("my_squeeze", "Squeeze", {});
-    RunValidationAndConversion(node_def, error::INVALID_ARGUMENT,
-                               "One input expected for Squeeze, at my_squeeze");
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Squeeze got 0 inputs but expected 1, at my_squeeze");
   }
   {
     // No attrs, should fail.
@@ -2254,7 +2424,7 @@ TEST_F(OpConverterTest, ConvertSqueeze) {
     Scope s = Scope::NewRootScope();
     auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
     ops::Squeeze::Attrs squeeze_attrs;
-    squeeze_attrs.axis_ = gtl::ArraySlice<int>(axis);
+    squeeze_attrs.axis_ = gtl::ArraySlice<int>(axis);  // non-absl ok
     auto squeeze =
         ops::Squeeze(s.WithOpName("my_squeeze"), input, squeeze_attrs);
     return squeeze.operation.node()->def();
@@ -2267,7 +2437,7 @@ TEST_F(OpConverterTest, ConvertSqueeze) {
     AddTestWeights<float>("input", {1, 2, 3}, {1, 2, 3, 4, 5, 6});
     RunValidationAndConversion(
         node_def, error::UNIMPLEMENTED,
-        "Squeeze expects tensor for input, at my_squeeze");
+        "The input \"input\" for Squeeze must be a tensor, at my_squeeze");
   }
   {
     // Squeeze batch dim, should fail.
@@ -2307,11 +2477,6 @@ TEST_F(OpConverterTest, ConvertSqueeze) {
   }
 
   struct TestParams {
-    TestParams(const std::vector<int>& input_dims, const std::vector<int>& axis,
-               const std::vector<int>& expected_output_dims)
-        : input_dims(input_dims),
-          axis(axis),
-          expected_output_dims(expected_output_dims) {}
     std::vector<int> input_dims;
     std::vector<int> axis;
     std::vector<int> expected_output_dims;
@@ -2342,10 +2507,1117 @@ TEST_F(OpConverterTest, ConvertSqueeze) {
     ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims,
                              output.tensor()->getDimensions());
 
-    std::vector<float> output_data(6);
-    BuildAndRun<float>({{"input", {1, 2, 3, 4, 5, 6}}}, "my_squeeze",
-                       &output_data);
-    EXPECT_THAT(output_data, ElementsAre(1, 2, 3, 4, 5, 6));
+    const DataVec input_data{
+        {"input", test::AsTensor<float>({1, 2, 3, 4, 5, 6})}};
+    DataVec output_data{{"my_squeeze", ConstructTensor<float>(6)}};
+    BuildAndRun(input_data, &output_data);
+    EXPECT_THAT(GetSpanForData<float>(output_data[0]),
+                ElementsAre(1, 2, 3, 4, 5, 6));
+  }
+}
+
+TEST_F(OpConverterTest, ConvertStridedSlice) {
+  {
+    // Input list is empty, should fail.
+    NodeDef node_def = MakeNodeDef("my_strided_slice", "StridedSlice", {});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "StridedSlice got 0 inputs but expected 4, at my_strided_slice");
+  }
+
+  // Get nodedef for StridedSlice layer.
+  auto get_strided_slice_nodedef =
+      [](int begin_mask = 0, int end_mask = 0, int ellipsis_mask = 0,
+         int new_axis_mask = 0, int shrink_axis_mask = 0) -> NodeDef {
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+    auto begin = ops::Placeholder(s.WithOpName("begin"), DT_INT32);
+    auto end = ops::Placeholder(s.WithOpName("end"), DT_INT32);
+    auto strides = ops::Placeholder(s.WithOpName("strides"), DT_INT32);
+    ops::StridedSlice::Attrs attrs = ops::StridedSlice::Attrs()
+                                         .BeginMask(begin_mask)
+                                         .EndMask(end_mask)
+                                         .EllipsisMask(ellipsis_mask)
+                                         .NewAxisMask(new_axis_mask)
+                                         .ShrinkAxisMask(shrink_axis_mask);
+    auto strided_slice = ops::StridedSlice(s.WithOpName("my_strided_slice"),
+                                           input, begin, end, strides, attrs);
+    return strided_slice.operation.node()->def();
+  };
+
+  {
+    // Input is weights, should fail.
+    Reset();
+    NodeDef node_def = get_strided_slice_nodedef();
+    AddTestWeights<int32>("input", {1, 2, 3}, {1, 2, 3, 4, 5, 6});
+    AddTestWeights<int32>("begin", {4}, {0, 0, 0, 0});
+    AddTestWeights<int32>("end", {4}, {1, 1, 2, 3});
+    AddTestWeights<int32>("strides", {4}, {1, 1, 1, 1});
+    RunValidationAndConversion(node_def, error::UNIMPLEMENTED,
+                               "The input \"input\" for StridedSlice must be a "
+                               "tensor, at my_strided_slice");
+  }
+  {
+    // Begin, end, strides are tensors, should fail.
+    Reset();
+    NodeDef node_def = get_strided_slice_nodedef();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestTensor("begin", {4});
+    AddTestTensor("end", {4});
+    AddTestTensor("strides", {4});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "The input \"begin\" for StridedSlice must be a constant, at "
+        "my_strided_slice");
+  }
+  {
+    // Non-zero ellipsis_mask, should fail.
+    Reset();
+    NodeDef node_def = get_strided_slice_nodedef(
+        /*begin_mask=*/0, /*end_mask=*/0, /*ellipsis_mask=*/2,
+        /*new_axis_mask=*/0, /*shrink_axis_mask=*/0);
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<int32>("begin", {4}, {0, 0, 0, 0});
+    AddTestWeights<int32>("end", {4}, {1, 1, 2, 3});
+    AddTestWeights<int32>("strides", {4}, {1, 1, 1, 1});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "ellipsis_mask is not supported for StridedSlice, at "
+        "my_strided_slice");
+  }
+  {
+    // Modify batch dim, should fail.
+    Reset();
+    NodeDef node_def = get_strided_slice_nodedef();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<int32>("begin", {4}, {0, 0, 0, 0});
+    AddTestWeights<int32>("end", {4}, {0, 1, 2, 3});
+    AddTestWeights<int32>("strides", {4}, {1, 1, 1, 1});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "TensorRT does not allow modifications to the batch dimension, at "
+        "my_strided_slice");
+  }
+  {
+    // Dynamic batch size without end_mask, should fail.
+    Reset();
+    NodeDef node_def = get_strided_slice_nodedef();
+    AddTestTensor("input", {1, 2, 3}, /*batch_size=*/-1);
+    AddTestWeights<int32>("begin", {4}, {0, 0, 0, 0});
+    AddTestWeights<int32>("end", {4}, {1, 1, 2, 3});
+    AddTestWeights<int32>("strides", {4}, {1, 1, 1, 1});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "TensorRT does not allow modifications to the batch dimension, at "
+        "my_strided_slice");
+  }
+  {
+    // Dynamic batch size but using end_mask, ok.
+    Reset();
+    NodeDef node_def = get_strided_slice_nodedef(/*begin_mask=*/0,
+                                                 /*end_mask=*/1);
+    AddTestTensor("input", {1, 2, 3}, /*batch_size=*/-1);
+    AddTestWeights<int32>("begin", {4}, {0, 0, 0, 0});
+    AddTestWeights<int32>("end", {4}, {0, 1, 2, 2});
+    AddTestWeights<int32>("strides", {4}, {1, 1, 1, 1});
+    RunValidationAndConversion(node_def);
+  }
+// TRT 5.1+ supports strides
+#if NV_TENSORRT_MAJOR > 5 || (NV_TENSORRT_MAJOR == 5 && NV_TENSORRT_MINOR >= 1)
+  {
+    // Negative strides, should fail.
+    Reset();
+    NodeDef node_def = get_strided_slice_nodedef();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<int32>("begin", {4}, {0, 0, 0, 0});
+    AddTestWeights<int32>("end", {4}, {1, 1, 2, 3});
+    AddTestWeights<int32>("strides", {4}, {1, 1, 1, -1});
+    RunValidationAndConversion(node_def, error::UNIMPLEMENTED,
+                               "Negative or zero stride values are not "
+                               "supported for StridedSlice, at "
+                               "my_strided_slice");
+  }
+#else
+  {
+    // Stride is not 1, should fail.
+    Reset();
+    NodeDef node_def = get_strided_slice_nodedef();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<int32>("begin", {4}, {0, 0, 0, 0});
+    AddTestWeights<int32>("end", {4}, {1, 1, 2, 3});
+    AddTestWeights<int32>("strides", {4}, {1, 2, 1, 3});
+    RunValidationAndConversion(node_def, error::UNIMPLEMENTED,
+                               "Strides other than 1 are not supported with "
+                               "this version of TRT, at my_strided_slice");
+  }
+#endif
+  {
+    // Size of sliced dim is negative, should fail.
+    Reset();
+    NodeDef node_def = get_strided_slice_nodedef();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<int32>("begin", {4}, {0, 0, 2, 0});
+    AddTestWeights<int32>("end", {4}, {1, 1, 0, 3});
+    AddTestWeights<int32>("strides", {4}, {1, 1, 1, 1});
+    RunValidationAndConversion(node_def, error::INVALID_ARGUMENT,
+                               "\"size\" cannot be negative or zero for "
+                               "StridedSlice, at my_strided_slice");
+  }
+
+  struct TestParams {
+    std::vector<int> input_dims;
+    std::vector<int> begin;
+    std::vector<int> end;
+    std::vector<int> strides;
+    int begin_mask;
+    int end_mask;
+    std::vector<int> expected_output_dims;
+    std::vector<float> expected_output;
+  };
+
+  auto get_mask = [](const std::vector<int>& mask) {
+    int result = 0;
+    for (int i = 0; i < mask.size(); i++) {
+      if (mask[i]) result += (1 << i);
+    }
+    return result;
+  };
+
+  // Same input is used for all tests.
+  const std::vector<float> ok_input = {1, 2, 3, 4, 5, 6};
+
+#if NV_TENSORRT_MAJOR > 5 || (NV_TENSORRT_MAJOR == 5 && NV_TENSORRT_MINOR >= 1)
+  const int kStridedSliceOKCases = 23;
+#else
+  const int kStridedSliceOKCases = 19;
+#endif
+  // Ok.
+  TestParams ok_params[kStridedSliceOKCases] = {
+    // 2D Crop.
+    TestParams{/*input_dims=*/{1, 2, 3}, /*begin=*/{0, 0, 0, 0},
+               /*end=*/{0, 0, 1, 2}, /*strides=*/{1, 1, 1, 1},
+               /*begin_mask=*/get_mask({0, 0, 0, 0}),
+               /*end_mask=*/get_mask({1, 1, 0, 0}),
+               /*expected_output_dims=*/{1, 1, 2}, /*expected_output=*/{1, 2}},
+    TestParams{
+        /*input_dims=*/{1, 2, 3},
+        /*begin=*/{0, 0, 1, 1}, /*end=*/{0, 0, 0, 0}, /*strides=*/{1, 1, 1, 1},
+        /*begin_mask=*/get_mask({0, 0, 0, 0}),
+        /*end_mask=*/get_mask({1, 1, 1, 1}), /*expected_output_dims=*/{1, 1, 2},
+        /*expected_output=*/{5, 6}},
+    TestParams{
+        /*input_dims=*/{1, 2, 3},
+        /*begin=*/{0, 0, 1, 1}, /*end=*/{0, 1, 2, 3}, /*strides=*/{1, 1, 1, 1},
+        /*begin_mask=*/get_mask({0, 0, 0, 0}),
+        /*end_mask=*/get_mask({1, 1, 0, 0}), /*expected_output_dims=*/{1, 1, 2},
+        /*expected_output=*/{5, 6}},
+    // 2D Crop, with transpose.
+    TestParams{
+        /*input_dims=*/{2, 3, 1},
+        /*begin=*/{0, 0, 0, 0}, /*end=*/{0, 1, 2, 1}, /*strides=*/{1, 1, 1, 1},
+        /*begin_mask=*/get_mask({0, 0, 0, 0}),
+        /*end_mask=*/get_mask({1, 0, 0, 0}), /*expected_output_dims=*/{1, 2, 1},
+        /*expected_output=*/{1, 2}},
+    TestParams{
+        /*input_dims=*/{2, 3, 1},
+        /*begin=*/{0, 1, 1, 0}, /*end=*/{0, 2, 3, 1}, /*strides=*/{1, 1, 1, 1},
+        /*begin_mask=*/get_mask({0, 0, 0, 0}),
+        /*end_mask=*/get_mask({1, 0, 0, 0}), /*expected_output_dims=*/{1, 2, 1},
+        /*expected_output=*/{5, 6}},
+    TestParams{
+        /*input_dims=*/{2, 1, 3},
+        /*begin=*/{0, 0, 0, 0}, /*end=*/{0, 1, 1, 2}, /*strides=*/{1, 1, 1, 1},
+        /*begin_mask=*/get_mask({0, 0, 0, 0}),
+        /*end_mask=*/get_mask({1, 0, 0, 0}), /*expected_output_dims=*/{1, 1, 2},
+        /*expected_output=*/{1, 2}},
+    TestParams{
+        /*input_dims=*/{2, 1, 3},
+        /*begin=*/{0, 1, 0, 1}, /*end=*/{0, 2, 1, 3}, /*strides=*/{1, 1, 1, 1},
+        /*begin_mask=*/get_mask({0, 0, 0, 0}),
+        /*end_mask=*/get_mask({1, 0, 0, 0}), /*expected_output_dims=*/{1, 1, 2},
+        /*expected_output=*/{5, 6}},
+    // 2D Crop, with reshape.
+    TestParams{/*input_dims=*/{2, 3},
+               /*begin=*/{0, 0, 0}, /*end=*/{0, 1, 2}, /*strides=*/{1, 1, 1},
+               /*begin_mask=*/get_mask({0, 0, 0}),
+               /*end_mask=*/get_mask({1, 0, 0}),
+               /*expected_output_dims=*/{1, 2},
+               /*expected_output=*/{1, 2}},
+    TestParams{/*input_dims=*/{2, 3},
+               /*begin=*/{0, 1, 1}, /*end=*/{0, 0, 0}, /*strides=*/{1, 1, 1},
+               /*begin_mask=*/get_mask({0, 0, 0}),
+               /*end_mask=*/get_mask({1, 1, 1}),
+               /*expected_output_dims=*/{1, 2},
+               /*expected_output=*/{5, 6}},
+    // 1D Crop.
+    TestParams{
+        /*input_dims=*/{1, 2, 3},
+        /*begin=*/{0, 0, 0, 0}, /*end=*/{0, 0, 0, 2}, /*strides=*/{1, 1, 1, 1},
+        /*begin_mask=*/get_mask({0, 0, 0, 0}),
+        /*end_mask=*/get_mask({1, 1, 1, 0}), /*expected_output_dims=*/{1, 2, 2},
+        /*expected_output=*/{1, 2, 4, 5}},
+    TestParams{
+        /*input_dims=*/{1, 2, 3},
+        /*begin=*/{0, 0, 1, 0}, /*end=*/{0, 0, 0, 0}, /*strides=*/{1, 1, 1, 1},
+        /*begin_mask=*/get_mask({0, 0, 0, 0}),
+        /*end_mask=*/get_mask({1, 1, 1, 1}), /*expected_output_dims=*/{1, 1, 3},
+        /*expected_output=*/{4, 5, 6}},
+    // 1D Crop, with transpose.
+    TestParams{
+        /*input_dims=*/{2, 3, 1},
+        /*begin=*/{0, 0, 0, 0}, /*end=*/{0, 1, 0, 0}, /*strides=*/{1, 1, 1, 1},
+        /*begin_mask=*/get_mask({0, 0, 0, 0}),
+        /*end_mask=*/get_mask({1, 0, 1, 1}), /*expected_output_dims=*/{1, 3, 1},
+        /*expected_output=*/{1, 2, 3}},
+    TestParams{
+        /*input_dims=*/{2, 3, 1},
+        /*begin=*/{0, 1, 0, 0}, /*end=*/{0, 0, 0, 0}, /*strides=*/{1, 1, 1, 1},
+        /*begin_mask=*/get_mask({0, 0, 0, 0}),
+        /*end_mask=*/get_mask({1, 1, 1, 1}), /*expected_output_dims=*/{1, 3, 1},
+        /*expected_output=*/{4, 5, 6}},
+    // 1D Crop, with reshape.
+    TestParams{/*input_dims=*/{6},
+               /*begin=*/{0, 0}, /*end=*/{0, 3}, /*strides=*/{1, 1},
+               /*begin_mask=*/get_mask({0, 0}), /*end_mask=*/get_mask({1, 0}),
+               /*expected_output_dims=*/{3},
+               /*expected_output=*/{1, 2, 3}},
+    TestParams{/*input_dims=*/{1, 6},
+               /*begin=*/{0, 0, 2}, /*end=*/{0, 0, 5}, /*strides=*/{1, 1, 1},
+               /*begin_mask=*/get_mask({0, 0, 0}),
+               /*end_mask=*/get_mask({1, 1, 0}),
+               /*expected_output_dims=*/{1, 3},
+               /*expected_output=*/{3, 4, 5}},
+    TestParams{/*input_dims=*/{6, 1},
+               /*begin=*/{0, 2, 0}, /*end=*/{0, 5, 0}, /*strides=*/{1, 1, 1},
+               /*begin_mask=*/get_mask({0, 0, 0}),
+               /*end_mask=*/get_mask({1, 0, 1}),
+               /*expected_output_dims=*/{3, 1},
+               /*expected_output=*/{3, 4, 5}},
+    // Negative axis.
+    TestParams{/*input_dims=*/{6, 1},
+               /*begin=*/{0, -6, 0}, /*end=*/{0, -3, 0}, /*strides=*/{1, 1, 1},
+               /*begin_mask=*/get_mask({0, 0, 0}),
+               /*end_mask=*/get_mask({1, 0, 1}),
+               /*expected_output_dims=*/{3, 1},
+               /*expected_output=*/{1, 2, 3}},
+    TestParams{/*input_dims=*/{6, 1},
+               /*begin=*/{0, 0, 0}, /*end=*/{0, -1, 0}, /*strides=*/{1, 1, 1},
+               /*begin_mask=*/get_mask({0, 0, 0}),
+               /*end_mask=*/get_mask({1, 0, 1}),
+               /*expected_output_dims=*/{5, 1},
+               /*expected_output=*/{1, 2, 3, 4, 5}},
+    // Clamp out of bounds begin and end.
+    TestParams{/*input_dims=*/{1, 2, 3}, /*begin=*/{0, 0, -9999, -9},
+               /*end=*/{0, 1, 1000, 4}, /*strides=*/{1, 1, 1, 1},
+               /*begin_mask=*/get_mask({0, 0, 0, 0}),
+               /*end_mask=*/get_mask({1, 0, 0, 0}),
+               /*expected_output_dims=*/{1, 2, 3},
+               /*expected_output=*/{1, 2, 3, 4, 5, 6}},
+#if NV_TENSORRT_MAJOR > 5 || (NV_TENSORRT_MAJOR == 5 && NV_TENSORRT_MINOR >= 1)
+    // Strides
+    TestParams{/*input_dims=*/{6},
+               /*begin=*/{0, 0}, /*end=*/{0, 5}, /*strides=*/{1, 2},
+               /*begin_mask=*/get_mask({0, 0}), /*end_mask=*/get_mask({1, 0}),
+               /*expected_output_dims=*/{3},
+               /*expected_output=*/{1, 3, 5}},
+    TestParams{/*input_dims=*/{6},
+               /*begin=*/{0, 0}, /*end=*/{0, 6}, /*strides=*/{1, 2},
+               /*begin_mask=*/get_mask({0, 0}), /*end_mask=*/get_mask({1, 0}),
+               /*expected_output_dims=*/{3},
+               /*expected_output=*/{1, 3, 5}},
+    TestParams{/*input_dims=*/{6},
+               /*begin=*/{0, 1}, /*end=*/{0, 6}, /*strides=*/{1, 2},
+               /*begin_mask=*/get_mask({0, 0}), /*end_mask=*/get_mask({1, 0}),
+               /*expected_output_dims=*/{3},
+               /*expected_output=*/{2, 4, 6}},
+    TestParams{/*input_dims=*/{6},
+               /*begin=*/{0, 2}, /*end=*/{0, 6}, /*strides=*/{1, 3},
+               /*begin_mask=*/get_mask({0, 0}), /*end_mask=*/get_mask({1, 0}),
+               /*expected_output_dims=*/{2},
+               /*expected_output=*/{3, 6}},
+#endif
+  };
+
+  for (int i = 0; i < kStridedSliceOKCases; i++) {
+    Reset();
+    NodeDef node_def = get_strided_slice_nodedef(ok_params[i].begin_mask,
+                                                 ok_params[i].end_mask);
+    AddTestTensor("input", ok_params[i].input_dims);
+    AddTestWeights<int32>("begin",
+                          {static_cast<int>(ok_params[i].begin.size())},
+                          ok_params[i].begin);
+    AddTestWeights<int32>("end", {static_cast<int>(ok_params[i].end.size())},
+                          ok_params[i].end);
+    AddTestWeights<int32>("strides",
+                          {static_cast<int>(ok_params[i].strides.size())},
+                          ok_params[i].strides);
+    RunValidationAndConversion(node_def);
+
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(GetTensorOrWeights("my_strided_slice", &output));
+    EXPECT_TRUE(output.is_tensor());
+    ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims,
+                             output.tensor()->getDimensions());
+
+    const DataVec input_data{{"input", test::AsTensor<float>(ok_input)}};
+    DataVec output_data{
+        {"my_strided_slice",
+         ConstructTensor<float>(ok_params[i].expected_output.size())}};
+    BuildAndRun(input_data, &output_data);
+    EXPECT_THAT(GetSpanForData<float>(output_data[0]),
+                ElementsAreArray(ok_params[i].expected_output));
+  }
+}
+
+TEST_F(OpConverterTest, ConvertSlice) {
+  // Get nodedef for Slice layer.
+  auto get_slice_nodedef = []() -> NodeDef {
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+    auto begin = ops::Placeholder(s.WithOpName("begin"), DT_INT32);
+    auto size = ops::Placeholder(s.WithOpName("size"), DT_INT32);
+    auto slice = ops::Slice(s.WithOpName("my_slice"), input, begin, size);
+    return slice.operation.node()->def();
+  };
+
+  {
+    // Begin is below bounds, should fail.
+    Reset();
+    NodeDef node_def = get_slice_nodedef();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<int32>("begin", {4}, {0, 0, -1, 0});
+    AddTestWeights<int32>("size", {4}, {1, 1, 2, 3});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "\"begin\" for dimension 2 in Slice is out of range, at my_slice");
+  }
+  {
+    // Begin is above bounds, should fail.
+    Reset();
+    NodeDef node_def = get_slice_nodedef();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<int32>("begin", {4}, {0, 0, 3, 0});
+    AddTestWeights<int32>("size", {4}, {1, 1, 2, 3});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "\"begin\" for dimension 2 in Slice is out of range, at my_slice");
+  }
+  {
+    // Size is below bounds, should fail.
+    Reset();
+    NodeDef node_def = get_slice_nodedef();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<int32>("begin", {4}, {0, 0, 0, 0});
+    AddTestWeights<int32>("size", {4}, {1, 1, 2, -2});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "\"begin\" + \"size\" for dimension 3 in Slice is out of range, at "
+        "my_slice");
+  }
+  {
+    // Size is above bounds, should fail.
+    Reset();
+    NodeDef node_def = get_slice_nodedef();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<int32>("begin", {4}, {0, 0, 0, 0});
+    AddTestWeights<int32>("size", {4}, {1, 1, 3, 3});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "\"begin\" + \"size\" for dimension 2 in Slice is out of range, at "
+        "my_slice");
+  }
+  {
+    // Modify batch dim, should fail.
+    Reset();
+    NodeDef node_def = get_slice_nodedef();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<int32>("begin", {4}, {0, 0, 0, 0});
+    AddTestWeights<int32>("size", {4}, {0, 1, 2, 3});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "TensorRT does not allow modifications to the batch dimension, at "
+        "my_slice");
+  }
+  {
+    // Dynamic batch size with size[0] not -1, should fail.
+    Reset();
+    NodeDef node_def = get_slice_nodedef();
+    AddTestTensor("input", {1, 2, 3}, /*batch_size=*/-1);
+    AddTestWeights<int32>("begin", {4}, {0, 0, 0, 0});
+    AddTestWeights<int32>("size", {4}, {1, 1, 2, 3});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "TensorRT does not allow modifications to the batch dimension, at "
+        "my_slice");
+  }
+  {
+    // Dynamic batch size but using size[0] of -1, ok.
+    Reset();
+    NodeDef node_def = get_slice_nodedef();
+    AddTestTensor("input", {1, 2, 3}, /*batch_size=*/-1);
+    AddTestWeights<int32>("begin", {4}, {0, 0, 0, 0});
+    AddTestWeights<int32>("size", {4}, {-1, 1, 2, 2});
+    RunValidationAndConversion(node_def);
+  }
+
+  struct TestParams {
+    std::vector<int> input_dims;
+    std::vector<int> begin;
+    std::vector<int> size;
+    std::vector<int> expected_output_dims;
+    std::vector<int> expected_output;
+  };
+
+  // Ok.
+  const int kSliceOKCases = 5;
+  TestParams ok_params[kSliceOKCases] = {
+      TestParams{{1, 2, 3},
+                 {0, 0, 0, 0},
+                 {-1, -1, -1, -1},
+                 {1, 2, 3},
+                 {1, 2, 3, 4, 5, 6}},
+      TestParams{
+          {1, 2, 3}, {0, 0, 0, 0}, {1, 1, 2, 3}, {1, 2, 3}, {1, 2, 3, 4, 5, 6}},
+      TestParams{
+          {1, 2, 3}, {0, 0, 0, 0}, {1, -1, 2, 2}, {1, 2, 2}, {1, 2, 4, 5}},
+      TestParams{{6}, {0, 1}, {1, 5}, {5}, {2, 3, 4, 5, 6}},
+      TestParams{{6}, {0, 1}, {-1, 3}, {3}, {2, 3, 4}},
+  };
+
+  for (int i = 0; i < kSliceOKCases; i++) {
+    Reset();
+    NodeDef node_def = get_slice_nodedef();
+    AddTestTensor("input", ok_params[i].input_dims);
+    AddTestWeights<int32>("begin",
+                          {static_cast<int>(ok_params[i].begin.size())},
+                          ok_params[i].begin);
+    AddTestWeights<int32>("size", {static_cast<int>(ok_params[i].size.size())},
+                          ok_params[i].size);
+    RunValidationAndConversion(node_def);
+
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(GetTensorOrWeights("my_slice", &output));
+    EXPECT_TRUE(output.is_tensor());
+    ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims,
+                             output.tensor()->getDimensions());
+
+    const DataVec input_data{
+        {"input", test::AsTensor<float>({1, 2, 3, 4, 5, 6})}};
+    DataVec output_data{{"my_slice", ConstructTensor<float>(
+                                         ok_params[i].expected_output.size())}};
+    BuildAndRun(input_data, &output_data);
+    EXPECT_THAT(GetSpanForData<float>(output_data[0]),
+                ElementsAreArray(ok_params[i].expected_output));
+  }
+}
+
+TEST_F(OpConverterTest, ConvertConv2D) {
+  {
+    // Input list is empty, should fail.
+    NodeDef node_def = MakeNodeDef("my_conv2d", "Conv2D", {});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Conv2D got 0 inputs but expected 2, at my_conv2d");
+  }
+
+  // Get nodedef for Conv2D layer.
+  auto get_conv2d_nodedef =
+      [](std::vector<int> strides = {1, 1, 1, 1}, string padding = "SAME",
+         string data_format = "NCHW", std::vector<int> dilations = {1, 1, 1, 1},
+         bool is_conv2d_backprop_input = false) -> NodeDef {
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+    auto filter = ops::Placeholder(s.WithOpName("weights"), DT_FLOAT);
+    if (is_conv2d_backprop_input) {
+      auto input_sizes =
+          ops::Placeholder(s.WithOpName("input_sizes"), DT_INT32);
+      ops::Conv2DBackpropInput::Attrs attrs = ops::Conv2DBackpropInput::Attrs()
+                                                  .DataFormat(data_format)
+                                                  .Dilations(dilations);
+      auto conv2d =
+          ops::Conv2DBackpropInput(s.WithOpName("my_conv2d"), input_sizes,
+                                   filter, input, strides, padding, attrs);
+      return conv2d.operation.node()->def();
+    } else {
+      ops::Conv2D::Attrs attrs =
+          ops::Conv2D::Attrs().DataFormat(data_format).Dilations(dilations);
+      auto conv2d = ops::Conv2D(s.WithOpName("my_conv2d"), input, filter,
+                                strides, padding, attrs);
+      return conv2d.operation.node()->def();
+    }
+  };
+
+  {
+    // Input is weights, should fail.
+    Reset();
+    NodeDef node_def = get_conv2d_nodedef();
+    AddTestWeights<float>("input", {1, 2, 3}, {1, 2, 3, 4, 5, 6});
+    AddTestWeights<float>("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "The input \"input\" for Conv2D must be a tensor, at my_conv2d");
+  }
+  {
+    // Filter is tensor, should fail.
+    Reset();
+    NodeDef node_def = get_conv2d_nodedef();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestTensor("weights", {3, 3, 1, 1});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "The input \"filter\" for Conv2D must be a constant, at my_conv2d");
+  }
+  {
+    // Filter is not 4D, should fail.
+    Reset();
+    NodeDef node_def = get_conv2d_nodedef();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<float>("weights", {3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Conv2D expects kernel of dimension 4, at my_conv2d");
+  }
+  {
+    // Dilations is not 4D, should fail.
+    Reset();
+    NodeDef node_def =
+        get_conv2d_nodedef({1, 1, 1, 1}, "SAME", "NCHW", {1, 1, 1});
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<float>("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Convolution dilations field must specify 4 dimensions, at my_conv2d");
+  }
+  {
+    // Dilation value is not 1 for channel, should fail.
+    Reset();
+    NodeDef node_def =
+        get_conv2d_nodedef({1, 1, 1, 1}, "SAME", "NCHW", {1, 2, 1, 1});
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<float>("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9});
+    RunValidationAndConversion(node_def, error::UNIMPLEMENTED,
+                               "Dilation rate must be 1 for batch and channel "
+                               "dimensions, at my_conv2d");
+  }
+  {
+    // Dilation value is not 1 for channel (NHWC), should fail.
+    Reset();
+    NodeDef node_def =
+        get_conv2d_nodedef({1, 1, 1, 1}, "SAME", "NHWC", {1, 1, 1, 2});
+    AddTestTensor("input", {2, 3, 1});
+    AddTestWeights<float>("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9});
+    RunValidationAndConversion(node_def, error::UNIMPLEMENTED,
+                               "Dilation rate must be 1 for batch and channel "
+                               "dimensions, at my_conv2d");
+  }
+  {
+    // Dilation + Conv2DBackpropInput, should fail.
+    Reset();
+    NodeDef node_def =
+        get_conv2d_nodedef({1, 1, 1, 1}, "SAME", "NHWC", {1, 1, 2, 1}, true);
+    AddTestTensor("input", {2, 3, 1});
+    AddTestWeights<float>("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9});
+    AddTestWeights<int>("input_sizes", {4}, {1, 2, 3, 1});
+    RunValidationAndConversion(node_def, error::UNIMPLEMENTED,
+                               "Dilation with Conv2DBackpropInput "
+                               "(conv2d_transpose) is not supported, "
+                               "at my_conv2d");
+  }
+  {
+    // Strides is not 4D, should fail.
+    Reset();
+    NodeDef node_def =
+        get_conv2d_nodedef({1, 1, 1}, "SAME", "NCHW", {1, 1, 1, 1});
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<float>("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Convolution strides field must specify 4 dimensions, at my_conv2d");
+  }
+  {
+    // Stride value is not 1 for channel, should fail.
+    Reset();
+    NodeDef node_def =
+        get_conv2d_nodedef({1, 2, 1, 1}, "SAME", "NCHW", {1, 1, 1, 1});
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<float>("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "Stride must be 1 for batch and channel dimensions, at my_conv2d");
+  }
+
+  struct TestParams {
+    std::vector<int> input_dims;
+    std::vector<float> input;
+    std::vector<int> filter_dims;
+    std::vector<float> filter;
+    std::vector<int> strides;
+    string padding;
+    string data_format;
+    std::vector<int> dilations;
+    bool is_conv2d_backprop_input;
+    std::vector<int> expected_output_dims;
+    std::vector<float> expected_output;
+  };
+
+  // Ok.
+  const int kConv2DOKCases = 7;
+  TestParams ok_params[kConv2DOKCases] = {
+      // Basic
+      TestParams{/*input_dims=*/{1, 2, 3},
+                 /*input=*/{0, 1, 2, 3, 3, 4},
+                 /*filter_dims=*/{1, 2, 1, 1},
+                 /*filter=*/{-1, 1},
+                 /*strides=*/{1, 1, 1, 1},
+                 /*padding=*/"VALID",
+                 /*data_format=*/"NCHW",
+                 /*dilations=*/{1, 1, 1, 1},
+                 /*is_conv2d_backprop_input=*/false,
+                 /*expected_output_dims=*/{1, 2, 2},
+                 /*expected_output=*/{1, 1, 0, 1}},
+      // SAME padding (Asymmetric)
+      TestParams{/*input_dims=*/{1, 2, 3},
+                 /*input=*/{0, 1, 2, 3, 3, 4},
+                 /*filter_dims=*/{1, 2, 1, 1},
+                 /*filter=*/{-1, 1},
+                 /*strides=*/{1, 1, 1, 1},
+                 /*padding=*/"SAME",
+                 /*data_format=*/"NCHW",
+                 /*dilations=*/{1, 1, 1, 1},
+                 /*is_conv2d_backprop_input=*/false,
+                 /*expected_output_dims=*/{1, 2, 3},
+                 /*expected_output=*/{1, 1, -2, 0, 1, -4}},
+      // SAME padding (Symmetric)
+      TestParams{/*input_dims=*/{1, 2, 3},
+                 /*input=*/{0, 1, 2, 3, 3, 4},
+                 /*filter_dims=*/{1, 3, 1, 1},
+                 /*filter=*/{-1, 0, 1},
+                 /*strides=*/{1, 1, 1, 1},
+                 /*padding=*/"SAME",
+                 /*data_format=*/"NCHW",
+                 /*dilations=*/{1, 1, 1, 1},
+                 /*is_conv2d_backprop_input=*/false,
+                 /*expected_output_dims=*/{1, 2, 3},
+                 /*expected_output=*/{1, 2, -1, 3, 1, -3}},
+      // NHWC
+      TestParams{/*input_dims=*/{2, 3, 1},
+                 /*input=*/{0, 1, 2, 3, 3, 4},
+                 /*filter_dims=*/{1, 2, 1, 1},
+                 /*filter=*/{-1, 1},
+                 /*strides=*/{1, 1, 1, 1},
+                 /*padding=*/"VALID",
+                 /*data_format=*/"NHWC",
+                 /*dilations=*/{1, 1, 1, 1},
+                 /*is_conv2d_backprop_input=*/false,
+                 /*expected_output_dims=*/{2, 2, 1},
+                 /*expected_output=*/{1, 1, 0, 1}},
+      // Dilated
+      TestParams{/*input_dims=*/{1, 2, 3},
+                 /*input=*/{0, 1, 2, 3, 3, 4},
+                 /*filter_dims=*/{1, 2, 1, 1},
+                 /*filter=*/{-1, 1},
+                 /*strides=*/{1, 1, 1, 1},
+                 /*padding=*/"VALID",
+                 /*data_format=*/"NCHW",
+                 /*dilations=*/{1, 1, 1, 2},
+                 /*is_conv2d_backprop_input=*/false,
+                 /*expected_output_dims=*/{1, 2, 1},
+                 /*expected_output=*/{2, 1}},
+      // Strided
+      TestParams{/*input_dims=*/{1, 2, 4},
+                 /*input=*/{0, 1, 2, 2, 3, 4, 4, 7},
+                 /*filter_dims=*/{1, 2, 1, 1},
+                 /*filter=*/{-1, 1},
+                 /*strides=*/{1, 1, 1, 2},
+                 /*padding=*/"VALID",
+                 /*data_format=*/"NCHW",
+                 /*dilations=*/{1, 1, 1, 1},
+                 /*is_conv2d_backprop_input=*/false,
+                 /*expected_output_dims=*/{1, 2, 2},
+                 /*expected_output=*/{1, 0, 1, 3}},
+      // Transpose Strided
+      TestParams{/*input_dims=*/{1, 2, 2},
+                 /*input=*/{0, 1, 2, 3},
+                 /*filter_dims=*/{1, 2, 1, 1},
+                 /*filter=*/{-1, 1},
+                 /*strides=*/{1, 1, 1, 2},
+                 /*padding=*/"SAME",
+                 /*data_format=*/"NCHW",
+                 /*dilations=*/{1, 1, 1, 1},
+                 /*is_conv2d_backprop_input=*/true,
+                 /*expected_output_dims=*/{1, 2, 4},
+                 /*expected_output=*/{0, 0, -1, 1, -2, 2, -3, 3}},
+  };
+
+  for (int i = 0; i < kConv2DOKCases; i++) {
+    Reset();
+    NodeDef node_def = get_conv2d_nodedef(
+        ok_params[i].strides, ok_params[i].padding, ok_params[i].data_format,
+        ok_params[i].dilations, ok_params[i].is_conv2d_backprop_input);
+    AddTestTensor("input", ok_params[i].input_dims);
+    AddTestWeights<float>("weights", ok_params[i].filter_dims,
+                          ok_params[i].filter);
+    if (ok_params[i].is_conv2d_backprop_input) {
+      AddTestWeights<float>(
+          "input_sizes",
+          {static_cast<int>(ok_params[i].expected_output.size())},
+          ok_params[i].expected_output);
+    }
+    RunValidationAndConversion(node_def);
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(GetTensorOrWeights("my_conv2d", &output));
+    EXPECT_TRUE(output.is_tensor());
+    ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims,
+                             output.tensor()->getDimensions());
+
+    const DataVec input_data{
+        {"input", test::AsTensor<float>(ok_params[i].input)}};
+    DataVec output_data{
+        {"my_conv2d",
+         ConstructTensor<float>(ok_params[i].expected_output.size())}};
+    BuildAndRun(input_data, &output_data);
+    EXPECT_THAT(GetSpanForData<float>(output_data[0]),
+                ElementsAreArray(ok_params[i].expected_output));
+  }
+}
+
+TEST_F(OpConverterTest, ConvertTopK) {
+  {
+    // Input list is empty, should fail.
+    NodeDef node_def = MakeNodeDef("my_topk", "TopKV2", {});
+    RunValidationAndConversion(node_def, error::INVALID_ARGUMENT,
+                               "Input expects tensor and weights, at my_topk");
+  }
+
+  for (const auto dtype : {DT_FLOAT, DT_INT32}) {
+    // Get the NodeDef for TopKV2.
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), dtype);
+    auto weights = ops::Placeholder(s.WithOpName("weights"), DT_INT32);
+    auto topk = ops::TopK(s.WithOpName("my_topk"), input, weights);
+    const NodeDef& node_def = topk.operation.node()->def();
+    {
+      // K is a tensor, should fail.
+      Reset();
+      AddTestTensor("input", {1, 2, 3}, /*batch_size=*/1,
+                    /*trt_dtype=*/TfDataTypeToTrt(dtype));
+      AddTestTensor("weights", {2});
+      RunValidationAndConversion(
+          node_def, error::INVALID_ARGUMENT,
+          "Input expects tensor and weights, at my_topk");
+    }
+    {
+      // Ok.
+      Reset();
+      AddTestTensor("input", {1, 2, 5});
+      AddTestWeights<int32>("weights", {1}, {2});
+      RunValidationAndConversion(node_def);
+      TRT_TensorOrWeights outputs[2];
+      TF_EXPECT_OK(GetTensorOrWeights("my_topk", &outputs[0]));
+      TF_EXPECT_OK(GetTensorOrWeights("my_topk:1", &outputs[1]));
+      for (auto& output : outputs) {
+        EXPECT_TRUE(output.is_tensor());
+        ExpectTrtDimsEqualsArray({1, 2, 2}, output.tensor()->getDimensions());
+      }
+
+      const DataVec input_data{
+          {"input", test::AsTensor<float>({-9, 3, 5, 1, 6, -5, 7, 1, 0, -1})}};
+      DataVec output_data{{"my_topk", ConstructTensor<float>(4)},
+                          {"my_topk:1", ConstructTensor<int32>(4)}};
+      BuildAndRun(input_data, &output_data);
+      EXPECT_THAT(GetSpanForData<float>(output_data[0]),
+                  ElementsAre(6, 5, 7, 1));
+      EXPECT_THAT(GetSpanForData<int32>(output_data[1]),
+                  ElementsAre(4, 2, 1, 2));
+    }
+  }
+}
+
+template <DataType dtype>
+void TestConvertGather(OpConverterTest* test) {
+  typedef typename EnumToDataType<dtype>::Type CType;
+
+  // Get the NodeDef for GatherV2.
+  Scope s = Scope::NewRootScope();
+  auto params = ops::Placeholder(s.WithOpName("params"), dtype);
+  auto indices = ops::Placeholder(s.WithOpName("indices"), DT_INT32);
+  auto axis = ops::Placeholder(s.WithOpName("axis"), DT_INT32);
+  auto gather = ops::GatherV2(s.WithOpName("my_gather"), params, indices, axis);
+  const NodeDef& node_def = gather.operation.node()->def();
+
+  struct TestParams {
+    std::vector<int> params_dims;
+    std::vector<int> indices_dims;
+    std::vector<int> indices;
+    int axis;
+    std::vector<int> expected_output_dims;
+    std::vector<int> expected_output;
+  };
+
+  // Input is the same {1, 2, 3, 4, 5, 6} for all cases.
+  const int kGatherOKCases = 5;
+  TestParams ok_params[kGatherOKCases] = {
+      // Vector indices (output is rank(params)).
+      TestParams{{1, 2, 3}, {1}, {0}, 3, {1, 2, 1}, {1, 4}},
+      TestParams{{1, 2, 3}, {1}, {1}, 3, {1, 2, 1}, {2, 5}},
+      TestParams{{1, 2, 3}, {1}, {2}, -1, {1, 2, 1}, {3, 6}},
+      TestParams{{1, 2, 3}, {3}, {2, 0, 1}, 3, {1, 2, 3}, {3, 1, 2, 6, 4, 5}},
+      // Higher rank indices (output is rank(params) + rank(indices) - 1).
+      TestParams{{1, 2, 3}, {1, 1}, {0}, 2, {1, 1, 1, 3}, {1, 2, 3}},
+  };
+
+  // Ok.
+  for (int i = 0; i < kGatherOKCases; i++) {
+    test->Reset();
+    test->AddTestTensor("params", ok_params[i].params_dims, 1,
+                        TfDataTypeToTrt(dtype));
+    test->AddTestTensor("indices", ok_params[i].indices_dims, 1,
+                        nvinfer1::DataType::kINT32);
+    test->AddTestWeights<int32>("axis", {1}, {ok_params[i].axis});
+    test->RunValidationAndConversion(node_def);
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(test->GetTensorOrWeights("my_gather", &output));
+    EXPECT_TRUE(output.is_tensor());
+    ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims,
+                             output.tensor()->getDimensions());
+
+    // Create input in CType and convert expected output to CType.
+    std::vector<CType> inputs = {CType(1), CType(2), CType(3),
+                                 CType(4), CType(5), CType(6)};
+    std::vector<CType> converted_expected_output(
+        ok_params[i].expected_output.begin(),
+        ok_params[i].expected_output.end());
+
+    const DataVec input_data{
+        {"params", test::AsTensor<CType>(inputs)},
+        {"indices", test::AsTensor<int32>(ok_params[i].indices)}};
+    DataVec output_data{
+        {"my_gather",
+         ConstructTensor<CType>(ok_params[i].expected_output.size())}};
+    test->BuildAndRun(input_data, &output_data);
+    EXPECT_THAT(GetSpanForData<CType>(output_data[0]),
+                ElementsAreArray(converted_expected_output));
+  }
+}
+
+TEST_F(OpConverterTest, ConvertGather) {
+  {
+    // Input list is empty, should fail.
+    NodeDef node_def = MakeNodeDef("my_gather", "GatherV2", {});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "GatherV2 got 0 inputs but expected 3, at my_gather");
+  }
+
+  // Get the NodeDef for GatherV2.
+  Scope s = Scope::NewRootScope();
+  auto params = ops::Placeholder(s.WithOpName("params"), DT_FLOAT);
+  auto indices = ops::Placeholder(s.WithOpName("indices"), DT_INT32);
+  auto axis = ops::Placeholder(s.WithOpName("axis"), DT_INT32);
+  auto gather = ops::GatherV2(s.WithOpName("my_gather"), params, indices, axis);
+  const NodeDef& node_def = gather.operation.node()->def();
+  {
+    // Axis is a tensor, should fail.
+    Reset();
+    AddTestTensor("params", {1, 2, 3});
+    AddTestTensor("indices", {2});
+    AddTestTensor("axis", {1});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "The input \"axis\" for GatherV2 must be a constant, at my_gather");
+  }
+  {
+    // Axis is out of bounds, should fail.
+    Reset();
+    AddTestTensor("params", {1, 2, 3});
+    AddTestTensor("indices", {2});
+    AddTestWeights<int32>("axis", {1}, {4});
+    RunValidationAndConversion(node_def, error::INVALID_ARGUMENT,
+                               "Axis value of 4 is out of bounds, must be in "
+                               "range [-4, 4), at my_gather");
+  }
+  {
+    // Axis is batch dimension, should fail.
+    Reset();
+    AddTestTensor("params", {1, 2, 3});
+    AddTestTensor("indices", {2});
+    AddTestWeights<int32>("axis", {1}, {0});
+    RunValidationAndConversion(node_def, error::UNIMPLEMENTED,
+                               "TensorRT does not allow manipulation of the "
+                               "batch dimension, at my_gather");
+  }
+
+  Reset();
+  TestConvertGather<DT_FLOAT>(this);
+  TestConvertGather<DT_HALF>(this);
+  TestConvertGather<DT_INT32>(this);
+}
+
+TEST_F(OpConverterTest, ConvertUnary) {
+  {
+    // Input list is empty, should fail.
+    NodeDef node_def = MakeNodeDef("my_unary", "Neg", {});
+    RunValidationAndConversion(node_def, error::INVALID_ARGUMENT,
+                               "Neg got 0 inputs but expected 1, at my_unary");
+  }
+  {
+    // Input is weights, should fail.
+    Reset();
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+    auto neg = ops::Neg(s.WithOpName("my_unary"), input);
+    const NodeDef& node_def = neg.operation.node()->def();
+    AddTestWeights<float>("input", {1, 2, 3}, {-3, -2, -1, 0, 1, 2});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "The input \"x\" for Neg must be a tensor, at my_unary");
+  }
+
+  // Get nodedef for unary layer.
+  auto get_unary_nodedef = [](string op_name) -> NodeDef {
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+    if (op_name == "Abs") {
+      auto unary = ops::Abs(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Acos") {
+      auto unary = ops::Acos(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Acosh") {
+      auto unary = ops::Acosh(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Asin") {
+      auto unary = ops::Asin(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Asinh") {
+      auto unary = ops::Asinh(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Atan") {
+      auto unary = ops::Atan(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Atanh") {
+      auto unary = ops::Atanh(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Ceil") {
+      auto unary = ops::Ceil(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Cos") {
+      auto unary = ops::Cos(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Cosh") {
+      auto unary = ops::Cosh(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Exp") {
+      auto unary = ops::Exp(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Floor") {
+      auto unary = ops::Floor(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Log") {
+      auto unary = ops::Log(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Neg") {
+      auto unary = ops::Neg(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Reciprocal") {
+      auto unary = ops::Reciprocal(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Rsqrt") {
+      auto unary = ops::Rsqrt(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Sin") {
+      auto unary = ops::Sin(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Sinh") {
+      auto unary = ops::Sinh(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Sqrt") {
+      auto unary = ops::Sqrt(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    } else if (op_name == "Tan") {
+      auto unary = ops::Tan(s.WithOpName("my_unary"), input);
+      return unary.operation.node()->def();
+    }
+    EXPECT_TRUE(false);
+    return NodeDef();
+  };
+  // Get expected output for unary layer.
+  auto get_unary_output = [](string op_name, float input) -> float {
+    if (op_name == "Abs") {
+      return std::abs(input);
+    } else if (op_name == "Acos") {
+      return std::acos(input);
+    } else if (op_name == "Acosh") {
+      return std::acosh(input);
+    } else if (op_name == "Asin") {
+      return std::asin(input);
+    } else if (op_name == "Asinh") {
+      return std::asinh(input);
+    } else if (op_name == "Atan") {
+      return std::atan(input);
+    } else if (op_name == "Atanh") {
+      return std::atanh(input);
+    } else if (op_name == "Ceil") {
+      return std::ceil(input);
+    } else if (op_name == "Cos") {
+      return std::cos(input);
+    } else if (op_name == "Cosh") {
+      return std::cosh(input);
+    } else if (op_name == "Exp") {
+      return std::exp(input);
+    } else if (op_name == "Floor") {
+      return std::floor(input);
+    } else if (op_name == "Log") {
+      return std::log(input);
+    } else if (op_name == "Neg") {
+      return -input;
+    } else if (op_name == "Reciprocal") {
+      return 1.0 / input;
+    } else if (op_name == "Rsqrt") {
+      return 1.0 / std::sqrt(input);
+    } else if (op_name == "Sin") {
+      return std::sin(input);
+    } else if (op_name == "Sinh") {
+      return std::sinh(input);
+    } else if (op_name == "Sqrt") {
+      return std::sqrt(input);
+    } else if (op_name == "Tan") {
+      return std::tan(input);
+    }
+    EXPECT_TRUE(false);
+    return 0;
+  };
+
+  // Get list of ops to test.
+  std::vector<string> ops_to_test;
+  // Add all ops supported by ConvertUnary.
+  auto* map = UnaryOperationMap();
+  ops_to_test.reserve(map->size());
+  for (auto& pair : *map) {
+    ops_to_test.push_back(pair.first);
+  }
+  // Add other unary ops to test.
+  ops_to_test.push_back("Rsqrt");
+  // Ok.
+  for (string op_name : ops_to_test) {
+    Reset();
+    NodeDef node_def = get_unary_nodedef(op_name);
+    AddTestTensor("input", {1, 2, 3});
+    RunValidationAndConversion(node_def);
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(GetTensorOrWeights("my_unary", &output));
+    EXPECT_TRUE(output.is_tensor());
+    ExpectTrtDimsEqualsArray({1, 2, 3}, output.tensor()->getDimensions());
+
+    const std::vector<float> input = {-0.9f, 0.6f, 0.0f, -3.5f, 100.0f, 2.9f};
+    const DataVec input_data{{"input", test::AsTensor<float>(input)}};
+    DataVec output_data{{"my_unary", ConstructTensor<float>(6)}};
+    BuildAndRun(input_data, &output_data);
+    for (int i = 0; i < input.size(); ++i) {
+      const float expected_output = get_unary_output(op_name, input[i]);
+      EXPECT_THAT(GetSpanForData<float>(output_data[0])[i],
+                  NanSensitiveFloatNear(expected_output, 0.0001));
+    }
   }
 }
 
diff --git a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc
similarity index 92%
rename from tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc
rename to tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc
index c1688d4db88a270dcd202989f89a677ed10576d9..0eedfcacb4c11c8dc63fcfc13f044586b99b3c76 100644
--- a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc
@@ -12,9 +12,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h"
-#include "tensorflow/contrib/tensorrt/convert/convert_graph.h"
-#include "tensorflow/contrib/tensorrt/convert/utils.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h"
+
+#include "absl/strings/str_cat.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/convert_graph.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
@@ -30,9 +32,9 @@ namespace tensorflow {
 namespace tensorrt {
 namespace convert {
 // TODO(sami): Remove VLOG messages once the code matures
+using absl::StrAppend;
+using absl::StrCat;
 using tensorflow::str_util::Uppercase;
-using tensorflow::strings::StrAppend;
-using tensorflow::strings::StrCat;
 
 tensorflow::Status TRTOptimizationPass::Init(
     const tensorflow::RewriterConfig_CustomGraphOptimizer* config) {
@@ -64,7 +66,7 @@ tensorflow::Status TRTOptimizationPass::Init(
     max_workspace_size_bytes_ = params.at("max_workspace_size_bytes").i();
   }
   if (params.count("precision_mode")) {
-    TF_RETURN_IF_ERROR(GetPrecisionMode(
+    TF_RETURN_IF_ERROR(TrtPrecisionModeFromName(
         Uppercase(params.at("precision_mode").s()), &precision_mode_));
   }
   if (params.count("use_calibration")) {
@@ -85,7 +87,7 @@ void TRTOptimizationPass::PrintDebugInfo(
     LOG(INFO) << offset << "type             = " << cluster->type();
     LOG(INFO) << offset << "num warmup steps = " << cluster->NumWarmupSteps();
     const auto dev_names = cluster->GetDeviceNames();
-    if (dev_names.size()) {
+    if (!dev_names.empty()) {
       LOG(INFO) << offset << " Device names:";
       for (const auto s : dev_names) {
         LOG(INFO) << offset2 << s;
@@ -101,7 +103,7 @@ void TRTOptimizationPass::PrintDebugInfo(
     }
 
     const auto dev_props = cluster->GetDevices();
-    if (dev_props.size()) {
+    if (!dev_props.empty()) {
       LOG(INFO) << offset << "Device properties:";
       for (auto k : dev_props) {
         LOG(INFO) << offset2 << k.first;
@@ -129,7 +131,7 @@ void TRTOptimizationPass::PrintDebugInfo(
     }
   }
   LOG(INFO) << "item: " << item.id;
-  if (item.feed.size()) {
+  if (!item.feed.empty()) {
     LOG(INFO) << offset << "Feeds  :";
     for (const auto& f : item.feed) {
       const auto& shape = f.second.shape();
@@ -138,7 +140,7 @@ void TRTOptimizationPass::PrintDebugInfo(
   } else {
     LOG(INFO) << offset << "No Feeds";
   }
-  if (item.fetch.size()) {
+  if (!item.fetch.empty()) {
     LOG(INFO) << offset << "Fetches  :";
     for (const auto& f : item.fetch) {
       LOG(INFO) << offset2 << f;
@@ -147,7 +149,7 @@ void TRTOptimizationPass::PrintDebugInfo(
     LOG(INFO) << offset << "No Fetches";
   }
 
-  if (item.init_ops.size()) {
+  if (!item.init_ops.empty()) {
     LOG(INFO) << offset << "init ops  :";
     for (const auto& f : item.init_ops) {
       LOG(INFO) << offset2 << f;
@@ -158,7 +160,7 @@ void TRTOptimizationPass::PrintDebugInfo(
   LOG(INFO) << "Save Op = " << item.save_op;
   LOG(INFO) << "Restore Op = " << item.restore_op;
   LOG(INFO) << "save_restore_loc_tensor = " << item.save_restore_loc_tensor;
-  if (item.keep_ops.size()) {
+  if (!item.keep_ops.empty()) {
     LOG(INFO) << offset << "keep ops  :";
     for (const auto& f : item.keep_ops) {
       LOG(INFO) << offset2 << f;
@@ -195,7 +197,7 @@ tensorflow::Status TRTOptimizationPass::Optimize(
     PrintDebugInfo(cluster, item);
   }
   int max_dim = -1;
-  if (item.feed.size()) {
+  if (!item.feed.empty()) {
     for (const auto& f : item.feed) {
       const auto& shape = f.second.shape();
       if (shape.dims() > 0) {
@@ -225,9 +227,10 @@ tensorflow::Status TRTOptimizationPass::Optimize(
   TF_RETURN_IF_ERROR(static_graph_properties.InferStatically(true));
   tensorflow::tensorrt::convert::ConversionParams cp;
 
-  if (use_calibration_ && precision_mode_ != INT8MODE) {
-    LOG(ERROR) << "Calibration with FP32 or FP16 is not implemented. "
-               << "Falling back to use_calibration = False.";
+  if (use_calibration_ && precision_mode_ != TrtPrecisionMode::INT8) {
+    VLOG(1) << "Calibration with FP32 or FP16 is not implemented. "
+            << "Falling back to use_calibration = False."
+            << "Note that the default value of use_calibration is True.";
     use_calibration_ = false;
   }
 
@@ -242,7 +245,7 @@ tensorflow::Status TRTOptimizationPass::Optimize(
     // If the last token is not an integer, it must be part of the name.
     // Otherwise it is port number.
     if (tokens.size() > 1 &&
-        !strings::safe_strto32(tokens.back(), &dumm_port)) {
+        !strings::safe_strto32(tokens.back(), &dumm_port)) {  // non-absl ok
       StrAppend(&s, ":", tokens.back());
     }
     nodes_to_preserve.push_back(s);
diff --git a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h
similarity index 87%
rename from tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h
rename to tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h
index 3e8dc0978e43e2e9ba07aaa09f74acfe8e59b9a7..b2aed2a37afb6c01863f5617bad0bafe004eec24 100644
--- a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h
+++ b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h
@@ -13,11 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_CONVERT_TRT_OPTIMIZATION_PASS_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_CONVERT_TRT_OPTIMIZATION_PASS_H_
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_OPTIMIZATION_PASS_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_OPTIMIZATION_PASS_H_
 
 #include <string>
 
+#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
 #include "tensorflow/core/platform/logging.h"
@@ -34,7 +35,7 @@ class TRTOptimizationPass : public tensorflow::grappler::CustomGraphOptimizer {
   TRTOptimizationPass(const string& name = "TRTOptimizationPass")
       : name_(name),
         minimum_segment_size_(3),
-        precision_mode_(0),
+        precision_mode_(TrtPrecisionMode::FP32),
         maximum_batch_size_(-1),
         is_dynamic_op_(false),
         max_cached_batches_(1),
@@ -62,7 +63,7 @@ class TRTOptimizationPass : public tensorflow::grappler::CustomGraphOptimizer {
  private:
   const string name_;
   int minimum_segment_size_;
-  int precision_mode_;
+  TrtPrecisionMode precision_mode_;
   int maximum_batch_size_;
   bool is_dynamic_op_;
   std::vector<int> batches_;
@@ -77,4 +78,4 @@ class TRTOptimizationPass : public tensorflow::grappler::CustomGraphOptimizer {
 
 #endif  // GOOGLE_CUDA
 #endif  // GOOGLE_TENSORRT
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_CONVERT_TRT_OPTIMIZATION_PASS_H_
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_TRT_OPTIMIZATION_PASS_H_
diff --git a/tensorflow/contrib/tensorrt/convert/utils.cc b/tensorflow/compiler/tf2tensorrt/convert/utils.cc
similarity index 73%
rename from tensorflow/contrib/tensorrt/convert/utils.cc
rename to tensorflow/compiler/tf2tensorrt/convert/utils.cc
index e7a1febb8c076891596741fe30721e7acca15a73..0ca3a5a4a58e6a3e29d3d515f496b8cb5e9f7eb0 100644
--- a/tensorflow/contrib/tensorrt/convert/utils.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/utils.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/convert/utils.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
 
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -34,33 +34,32 @@ bool IsGoogleTensorRTEnabled() {
 #endif
 }
 
-Status GetPrecisionModeName(const int precision_mode, string* name) {
-  switch (precision_mode) {
-    case FP32MODE:
+Status TrtPrecisionModeToName(TrtPrecisionMode mode, string* name) {
+  switch (mode) {
+    case TrtPrecisionMode::FP32:
       *name = "FP32";
       break;
-    case FP16MODE:
+    case TrtPrecisionMode::FP16:
       *name = "FP16";
       break;
-    case INT8MODE:
+    case TrtPrecisionMode::INT8:
       *name = "INT8";
       break;
     default:
-      return tensorflow::errors::OutOfRange("Unknown precision mode");
+      return errors::OutOfRange("Unknown precision mode");
   }
   return Status::OK();
 }
 
-Status GetPrecisionMode(const string& name, int* precision_mode) {
+Status TrtPrecisionModeFromName(const string& name, TrtPrecisionMode* mode) {
   if (name == "FP32") {
-    *precision_mode = FP32MODE;
+    *mode = TrtPrecisionMode::FP32;
   } else if (name == "FP16") {
-    *precision_mode = FP16MODE;
+    *mode = TrtPrecisionMode::FP16;
   } else if (name == "INT8") {
-    *precision_mode = INT8MODE;
+    *mode = TrtPrecisionMode::INT8;
   } else {
-    return tensorflow::errors::InvalidArgument("Invalid precision mode name: ",
-                                               name);
+    return errors::InvalidArgument("Invalid precision mode name: ", name);
   }
   return Status::OK();
 }
diff --git a/tensorflow/contrib/tensorrt/convert/utils.h b/tensorflow/compiler/tf2tensorrt/convert/utils.h
similarity index 72%
rename from tensorflow/contrib/tensorrt/convert/utils.h
rename to tensorflow/compiler/tf2tensorrt/convert/utils.h
index 0592f31462af2b20f3a13fe5119e89c2ba42dd8a..0aa602dda2f3e98095bf72b5810a246c690d6741 100644
--- a/tensorflow/contrib/tensorrt/convert/utils.h
+++ b/tensorflow/compiler/tf2tensorrt/convert/utils.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_CONVERT_UTILS_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_CONVERT_UTILS_H_
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_UTILS_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_UTILS_H_
 
 #include <memory>
 
@@ -35,16 +35,13 @@ using TrtUniquePtrType = std::unique_ptr<T, TrtDestroyer<T>>;
 
 bool IsGoogleTensorRTEnabled();
 
-// TODO(aaroey): use an enum instead.
-const int FP32MODE = 0;
-const int FP16MODE = 1;
-const int INT8MODE = 2;
+enum class TrtPrecisionMode { FP32, FP16, INT8 };
 
-Status GetPrecisionModeName(const int precision_mode, string* name);
+Status TrtPrecisionModeToName(TrtPrecisionMode mode, string* name);
 
-Status GetPrecisionMode(const string& name, int* precision_mode);
+Status TrtPrecisionModeFromName(const string& name, TrtPrecisionMode* mode);
 
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_CONVERT_UTILS_H_
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_CONVERT_UTILS_H_
diff --git a/tensorflow/compiler/tf2tensorrt/kernels/get_serialized_resource_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/get_serialized_resource_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..81406b6e301ca350a3e52c97f5fcb575e88c3a90
--- /dev/null
+++ b/tensorflow/compiler/tf2tensorrt/kernels/get_serialized_resource_op.cc
@@ -0,0 +1,69 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_resources.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/lib/core/refcount.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
+namespace tensorflow {
+namespace tensorrt {
+
+class GetSerializedResourceOp : public OpKernel {
+ public:
+  explicit GetSerializedResourceOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  ~GetSerializedResourceOp() override {}
+
+  void Compute(OpKernelContext* context) override {
+    // TODO(laigd): it will allocate the tensor on the device and copy the
+    // serialized string to that tensor, and later sess.run() will copy it back
+    // to host. We need to optimize this.
+    const string& container = context->input(0).scalar<string>()();
+    const string& resource_name = context->input(1).scalar<string>()();
+
+    // Get the resource.
+    SerializableResourceBase* resource = nullptr;
+    OP_REQUIRES_OK(context, context->resource_manager()->Lookup(
+                                container, resource_name, &resource));
+    ::tensorflow::core::ScopedUnref sc(resource);
+
+    // Serialize the resource as output.
+    string serialized_resource;
+    OP_REQUIRES_OK(context, resource->SerializeToString(&serialized_resource));
+
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, TensorShape({}), &output));
+    output->scalar<string>()() = serialized_resource;
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("GetSerializedResourceOp").Device(DEVICE_GPU),
+                        GetSerializedResourceOp);
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/compiler/tf2tensorrt/kernels/get_serialized_resource_op_test.cc b/tensorflow/compiler/tf2tensorrt/kernels/get_serialized_resource_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ec038ebda073c8050321d5668b15a2c6faa72a4b
--- /dev/null
+++ b/tensorflow/compiler/tf2tensorrt/kernels/get_serialized_resource_op_test.cc
@@ -0,0 +1,80 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <dirent.h>
+#include <string.h>
+#include <fstream>
+#include <vector>
+
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_resources.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/platform/test.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
+namespace tensorflow {
+namespace tensorrt {
+
+class GetSerializedResourceOpTest : public OpsTestBase {};
+
+TEST_F(GetSerializedResourceOpTest, Basic) {
+  // Create the GPU device.
+  std::unique_ptr<Device> device(
+      DeviceFactory::NewDevice("GPU", {}, "/job:worker/replica:0/task:0"));
+
+  // Create the resource.
+  class MySerializableResource : public SerializableResourceBase {
+   public:
+    string DebugString() const override { return ""; }
+    Status SerializeToString(string* serialized) override {
+      *serialized = "my_serialized_str";
+      return Status::OK();
+    }
+  };
+  const string container = "mycontainer";
+  const string resource_name = "myresource";
+  SerializableResourceBase* resource = new MySerializableResource();
+  ResourceMgr* rm = device->resource_manager();
+  EXPECT_TRUE(rm->Create(container, resource_name, resource).ok());
+
+  // Create the op.
+  SetDevice(DEVICE_GPU, std::move(device));
+  TF_ASSERT_OK(NodeDefBuilder("op", "GetSerializedResourceOp")
+                   .Input(FakeInput(DT_STRING))
+                   .Input(FakeInput(DT_STRING))
+                   .Finalize(node_def()));
+  TF_ASSERT_OK(InitOp());
+
+  // Execute the op.
+  AddInputFromArray<string>(TensorShape({}), {container});
+  AddInputFromArray<string>(TensorShape({}), {resource_name});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Verify the result.
+  // TODO(laigd): OpsTestBase::GetOutput() doesn't work.
+  Tensor* output = context_->mutable_output(0);
+  EXPECT_EQ("my_serialized_str", output->scalar<string>()());
+}
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc
similarity index 59%
rename from tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
rename to tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc
index bad568644bb1f8d01d4cb0a7c853ec47d6f19e45..f6d387c59cd04aa5c7ccad610290b7b1f1d2b11f 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
+++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc
@@ -12,35 +12,44 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/tensorrt/kernels/trt_engine_op.h"
-
 #include <algorithm>
-
-#include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
-#include "tensorflow/contrib/tensorrt/convert/utils.h"
-#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_resource_manager.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_resources.h"
-#include "tensorflow/contrib/tensorrt/test/utils.h"
+#include <memory>
+#include <vector>
+
+#include "absl/memory/memory.h"
+#include "absl/strings/str_cat.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/test_utils.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_resources.h"
+#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/graph_to_functiondef.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/lib/core/refcount.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/stream_executor.h"
+#include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/platform/types.h"
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
 #include "cuda/include/cuda_runtime_api.h"
+#include "tensorrt/include/NvInfer.h"
 
 namespace tensorflow {
 namespace tensorrt {
 static Logger logger;
+using absl::StrAppend;
+using absl::StrCat;
 using ::nvinfer1::IRuntime;
-using ::tensorflow::strings::StrAppend;
-using ::tensorflow::strings::StrCat;
 
 // A helper class to call done() when destructed for asynchronous execution.
 // Helps simultaneous execution of native and TRT engines.
@@ -53,6 +62,83 @@ class AsyncHelper : public tensorflow::core::RefCounted {
   AsyncOpKernel::DoneCallback done_;
 };
 
+//  This OP can construct TRTEngine on the fly and if construction of engine
+//  fails, executes equivalent subgraph as a TensorFlow function.
+class TRTEngineOp : public AsyncOpKernel {
+ public:
+  explicit TRTEngineOp(OpKernelConstruction* context);
+
+  void ComputeAsync(OpKernelContext* context,
+                    AsyncOpKernel::DoneCallback done) override;
+
+ private:
+  // Execute calibration
+  void ExecuteCalibration(OpKernelContext* ctx, AsyncHelper* helper);
+
+  // Construct a function handle for executing native funcdef graph
+  Status ConstructFunctionHandle(OpKernelContext* ctx);
+
+  // Execute replaced native segment as function Op.
+  void ExecuteNativeSegment(OpKernelContext* ctx, AsyncHelper* helper);
+
+  // Execute the tensorrt engine. Returns whether we need to retry by running
+  // the native segment.
+  bool ExecuteTrtEngine(OpKernelContext* ctx, EngineContext* engine_context);
+
+  // Allocate necessary resources for calibration
+  Status AllocateCalibrationResources(OpKernelContext* ctx,
+                                      SerializableResourceBase** cr);
+
+  // Get engine for the input shape
+  EngineContext* GetEngine(const std::vector<TensorShape>& input_shapes,
+                           OpKernelContext* ctx);
+
+  // Return engine batch in cached_engne_batch_sizes_ which is closest to input
+  // batch.
+  bool GetCompatibleCachedEngine(
+      const std::vector<TensorShape>& actual_input_shapes,
+      std::vector<TensorShape>* engine_input_shapes);
+
+  std::vector<string> input_nodes_;
+  std::vector<string> output_nodes_;
+
+  // serialized protobuf segment or trt engine depending on static_engine_ flag.
+  string serialized_segment_;
+
+  // Name of the function for TF native execution of the segment.
+  string funcdef_name_;
+
+  // GraphDef representation of the segment.
+  GraphDef segment_graph_;
+
+  // Engine Precision mode.
+  TrtPrecisionMode precision_mode_;
+
+  // Whether engine is constructed during the conversion or needs to be
+  // constructed from protobuf segment.
+  bool static_engine_;
+
+  // Whether to calibrate INT8 engine.
+  bool calibration_mode_;
+
+  // Batches of the cached engines
+  std::vector<int> cached_engine_batches_;
+
+  // Maximum number of cached engines
+  int max_cached_engines_;
+
+  int64 workspace_size_;
+  mutex engine_mutex_;
+  FunctionLibraryRuntime::Handle native_func_;
+
+  // The finalized calibrator for inference.
+  std::unique_ptr<TRTInt8Calibrator> calibrator_;
+
+  // If true, create calibration graph for INT8 mode. Otherwise, we are using
+  // user-provided quantization ranges.
+  bool use_calibration_;
+};
+
 #define TYPECASE(dt, X, Y)                                                \
   case dt: {                                                              \
     return (void*)X->flat<tensorflow::EnumToDataType<dt>::Type>().data(); \
@@ -123,20 +209,20 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context)
                  context->GetAttr("calibration_data", &calibration_data));
   OP_REQUIRES_OK(context,
                  context->GetAttr("segment_funcdef_name", &funcdef_name_));
-  OP_REQUIRES_OK(context, GetPrecisionMode(precision_string, &precision_mode_));
+  OP_REQUIRES_OK(context,
+                 TrtPrecisionModeFromName(precision_string, &precision_mode_));
   OP_REQUIRES_OK(context,
                  context->GetAttr("use_calibration", &use_calibration_));
-  calibration_mode_ = (use_calibration_ && precision_mode_ == INT8MODE &&
-                       calibration_data.size() == 0);
-  if (calibration_data.size()) {
+  calibration_mode_ =
+      (use_calibration_ && precision_mode_ == TrtPrecisionMode::INT8 &&
+       calibration_data.empty());
+  if (!calibration_data.empty()) {
     calibrator_.reset(new TRTInt8Calibrator(calibration_data));
     calibration_data.resize(0);
   }
   native_func_ = tensorflow::kInvalidHandle;
   OP_REQUIRES_OK(context, context->GetAttr("max_cached_engines_count",
                                            &max_cached_engines_));
-  OP_REQUIRES_OK(context,
-                 context->GetAttr("fixed_input_size", &fixed_input_size_));
   OP_REQUIRES_OK(context, context->GetAttr("cached_engine_batches",
                                            &cached_engine_batches_));
   std::sort(cached_engine_batches_.begin(), cached_engine_batches_.end());
@@ -167,6 +253,7 @@ void TRTEngineOp::ExecuteNativeSegment(OpKernelContext* ctx,
   opts.rendezvous = ctx->rendezvous();
   opts.cancellation_manager = ctx->cancellation_manager();
   opts.runner = ctx->runner();
+  inputs.reserve(ctx->num_inputs());
   for (int i = 0; i < ctx->num_inputs(); i++) {
     inputs.push_back(ctx->input(i));
   }
@@ -175,11 +262,13 @@ void TRTEngineOp::ExecuteNativeSegment(OpKernelContext* ctx,
   lib->Run(opts, native_func_, inputs, outputs,
            [this, ctx, outputs, helper](const tensorflow::Status& s) {
              tensorflow::core::ScopedUnref sc(helper);
-             VLOG(1) << "Native Segment completed";
              if (!s.ok()) {
+               LOG(ERROR) << "Failed to execute native segment " << this->name()
+                          << ": " << s;
                ctx->SetStatus(s);
                return;
              }
+             VLOG(1) << "Native Segment completed";
              for (size_t t = 0; t < outputs->size(); ++t) {
                ctx->set_output(t, outputs->at(t));
              }
@@ -194,19 +283,17 @@ void TRTEngineOp::ExecuteCalibration(OpKernelContext* ctx,
   VLOG(1) << "Executing TRT calibration: " << name();
   helper->Ref();
   tensorflow::core::ScopedUnref sc(helper);
-  // TODO(aaroey): remove the ResourceMgr singleton.
-  auto trt_rm = TRTResourceManager::instance();
-  auto res_mgr = trt_rm->getManager("TRTCalibration");
+  auto res_mgr = ctx->resource_manager();
   TRTCalibrationResource* calib_res = nullptr;
-  auto status = res_mgr->LookupOrCreate(
-      funcdef_name_, "Calibrator", &calib_res,
-      {[ctx, this](TRTCalibrationResource** cr) -> tensorflow::Status {
-        return this->AllocateCalibrationResources(ctx, cr);
-      }});
-  if (!status.ok()) {
-    ctx->SetStatus(status);
-    return;
-  }
+  OP_REQUIRES_OK(
+      ctx,
+      res_mgr->LookupOrCreate(
+          "TF_TRT_Calibration", name(),
+          reinterpret_cast<SerializableResourceBase**>(&calib_res),
+          {[ctx, this](SerializableResourceBase** cr) -> tensorflow::Status {
+            return this->AllocateCalibrationResources(ctx, cr);
+          }}));
+  tensorflow::core::ScopedUnref calib_sc(calib_res);
   int num_inputs = ctx->num_inputs();
   // Pass input data to calibrator
   std::unordered_map<string, void*> input_data;
@@ -219,7 +306,8 @@ void TRTEngineOp::ExecuteCalibration(OpKernelContext* ctx,
       return;
     }
     // Check the allocated buffer is sufficient for input
-    const auto device_tensor = dev_tensors_.at(i).AccessTensor(ctx);
+    const auto device_tensor =
+        calib_res->device_tensors_.at(i).AccessTensor(ctx);
     CHECK_EQ(t.TotalBytes(), device_tensor->TotalBytes());
     input_data.emplace(StrCat(kInputPHName, i), data_address);
   }
@@ -236,32 +324,34 @@ void TRTEngineOp::ExecuteCalibration(OpKernelContext* ctx,
   ExecuteNativeSegment(ctx, helper);
 }
 
-int TRTEngineOp::GetEngineBatch(OpKernelContext* ctx) {
-  int num_batch = ctx->input(0).shape().dim_size(0);
-  int smallest_engine = 0;
-  for (const auto i : cached_engine_batches_) {
-    if (i >= num_batch) {
-      smallest_engine = i;
-      break;
-    }
-  }
-  // TODO(sami): Need an LRU here
-  if (smallest_engine == 0) {
-    if (max_cached_engines_ > cached_engine_batches_.size()) {
-      smallest_engine = num_batch;
-      cached_engine_batches_.push_back(num_batch);
-      VLOG(1) << "Running with batch size " << num_batch;
-    } else {
-      string msg =
-          StrCat("Engine buffer is full. buffer limit=", max_cached_engines_,
-                 ", current entries=");
-      for (auto i : cached_engine_batches_) StrAppend(&msg, i, ",");
-      StrAppend(&msg, " requested batch=", num_batch);
-      LOG(WARNING) << msg;
-      return -1;
+bool TRTEngineOp::GetCompatibleCachedEngine(
+    const std::vector<TensorShape>& actual_input_shapes,
+    std::vector<TensorShape>* engine_input_shapes) {
+  const int batch_size = actual_input_shapes[0].dim_size(0);
+  int smallest_batch_size = -1;
+  // Output shape will always be the same as the input but we will overwrite the
+  // batch size.
+  *engine_input_shapes = actual_input_shapes;
+  for (const int cached_batch_size : cached_engine_batches_) {
+    // Check if compatible: batch <= cached batch.
+    //
+    // TODO(laigd): here it only compare the first dim a.k.a the batch size,
+    // we'll need to to support non-batch dimensions as well. This will be done
+    // as part of the offline conversion implementation.
+    if (batch_size <= cached_batch_size) {
+      // First case: first compatible engine found
+      // Second case: smaller batch size engine found
+      if ((smallest_batch_size == -1) ||
+          (cached_batch_size < smallest_batch_size)) {
+        smallest_batch_size = cached_batch_size;
+        // Overwrite batch size for output
+        for (int i = 0; i < engine_input_shapes->size(); i++) {
+          (*engine_input_shapes)[i].set_dim(0, smallest_batch_size);
+        }
+      }
     }
   }
-  return smallest_engine;
+  return (smallest_batch_size != -1);
 }
 
 void TRTEngineOp::ComputeAsync(OpKernelContext* ctx,
@@ -272,25 +362,21 @@ void TRTEngineOp::ComputeAsync(OpKernelContext* ctx,
     ExecuteCalibration(ctx, helper);
     return;
   }
-  const int smallest_engine = GetEngineBatch(ctx);
-  if (smallest_engine < 0) {
-    LOG(WARNING) << "Failed to get engine batch, running native segment for "
-                 << name();
-    ExecuteNativeSegment(ctx, helper);
-    return;
+  // Get shapes of inputs to engine.
+  std::vector<tensorflow::TensorShape> input_shapes;
+  input_shapes.reserve(ctx->num_inputs());
+  for (int i = 0; i < ctx->num_inputs(); ++i) {
+    input_shapes.push_back(ctx->input(i).shape());
   }
-
-  const int num_batch = ctx->input(0).shape().dim_size(0);
-  auto& engine_ctx_pair = GetEngine(smallest_engine, ctx);
-  auto& trt_engine_ptr = engine_ctx_pair.first;
-  if (!trt_engine_ptr) {
-    LOG(WARNING) << "Engine retrieval for batch size " << num_batch
+  EngineContext* engine_context = GetEngine(input_shapes, ctx);
+  if (!engine_context->cuda_engine) {
+    LOG(WARNING) << "Engine retrieval for input shapes: "
+                 << TensorShapeUtils::ShapeListString(input_shapes)
                  << " failed. Running native segment for " << name();
     ExecuteNativeSegment(ctx, helper);
     return;
   }
-  const bool retry = ExecuteTrtEngine(ctx, num_batch, trt_engine_ptr.get(),
-                                      engine_ctx_pair.second.get());
+  const bool retry = ExecuteTrtEngine(ctx, engine_context);
   if (retry) {
     LOG(WARNING) << "Failed to execute engine, "
                  << "retrying with native segment for " << name();
@@ -299,18 +385,19 @@ void TRTEngineOp::ComputeAsync(OpKernelContext* ctx,
   }
 }
 
-bool TRTEngineOp::ExecuteTrtEngine(
-    OpKernelContext* ctx, const int num_batch,
-    nvinfer1::ICudaEngine* trt_engine_ptr,
-    nvinfer1::IExecutionContext* trt_execution_context_ptr) {
+bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx,
+                                   EngineContext* engine_context) {
   VLOG(1) << "Executing TRT engine: " << name();
+  auto& cuda_engine = engine_context->cuda_engine;
   const bool kRetry = true;
+  // All inputs must have the same batch size, so just get it from the first
+  // input.
+  const int num_batch = ctx->input(0).shape().dim_size(0);
   const int num_binding = ctx->num_inputs() + ctx->num_outputs();
   std::vector<void*> buffers(num_binding);
   for (int i = 0; i < ctx->num_inputs(); i++) {
     const string input_name = StrCat(kInputPHName, i);
-    const int binding_index =
-        trt_engine_ptr->getBindingIndex(input_name.c_str());
+    const int binding_index = cuda_engine->getBindingIndex(input_name.c_str());
     if (binding_index == -1) {
       LOG(ERROR) << "Input node not found, at " << input_name;
       return kRetry;
@@ -323,10 +410,11 @@ bool TRTEngineOp::ExecuteTrtEngine(
                  << " vs " << input_shape.dim_size(0);
       return kRetry;
     }
-    auto dtype = trt_engine_ptr->getBindingDataType(binding_index);
+    auto dtype = cuda_engine->getBindingDataType(binding_index);
     switch (dtype) {
       case nvinfer1::DataType::kFLOAT:
-        buffers[binding_index] = (void*)(input_tensor.flat<float>().data());
+        buffers[binding_index] =
+            const_cast<float*>(input_tensor.flat<float>().data());
         break;
       case nvinfer1::DataType::kHALF:
         LOG(ERROR) << "FP16 inputs are not supported yet!";
@@ -335,10 +423,11 @@ bool TRTEngineOp::ExecuteTrtEngine(
         LOG(ERROR) << "INT8 inputs are not supported yet!";
         return kRetry;
       case nvinfer1::DataType::kINT32:
-        buffers[binding_index] = (void*)(input_tensor.flat<int32>().data());
+        buffers[binding_index] =
+            const_cast<int32*>(input_tensor.flat<int32>().data());
         break;
       default:
-        LOG(ERROR) << "Unknown TRT data type: " << int(dtype);
+        LOG(ERROR) << "Unknown TRT data type: " << static_cast<int>(dtype);
         return kRetry;
     }
   }
@@ -346,13 +435,12 @@ bool TRTEngineOp::ExecuteTrtEngine(
   for (int i = 0; i < ctx->num_outputs(); i++) {
     // Create an output tensor
     const string output_name = StrCat(kOutputPHName, i);
-    const int binding_index =
-        trt_engine_ptr->getBindingIndex(output_name.c_str());
+    const int binding_index = cuda_engine->getBindingIndex(output_name.c_str());
     Tensor* output_tensor = nullptr;
 
     TensorShape output_shape;
     if (binding_index != -1) {
-      auto dims = trt_engine_ptr->getBindingDimensions(binding_index);
+      auto dims = cuda_engine->getBindingDimensions(binding_index);
       std::vector<int> trt_shape(dims.nbDims + 1);
       trt_shape[0] = num_batch;
       for (int j = 0; j < dims.nbDims; j++) trt_shape[j + 1] = dims.d[j];
@@ -374,11 +462,11 @@ bool TRTEngineOp::ExecuteTrtEngine(
       // TODO(aaroey): ideally we should retry, fix this.
       return !kRetry;
     }
-    auto dtype = trt_engine_ptr->getBindingDataType(binding_index);
+    auto dtype = cuda_engine->getBindingDataType(binding_index);
     switch (dtype) {
       case nvinfer1::DataType::kFLOAT:
         buffers[binding_index] =
-            reinterpret_cast<void*>(output_tensor->flat<float>().data());
+            const_cast<float*>(output_tensor->flat<float>().data());
         break;
       case nvinfer1::DataType::kHALF:
         LOG(WARNING) << "half size is not supported yet!";
@@ -388,7 +476,7 @@ bool TRTEngineOp::ExecuteTrtEngine(
         return kRetry;
       case nvinfer1::DataType::kINT32:
         buffers[binding_index] =
-            reinterpret_cast<void*>(output_tensor->flat<int32>().data());
+            const_cast<int32*>(output_tensor->flat<int32>().data());
         break;
       default:
         LOG(WARNING) << "Unknown TRT data type: " << static_cast<int>(dtype);
@@ -402,9 +490,12 @@ bool TRTEngineOp::ExecuteTrtEngine(
                                                 ->implementation()
                                                 ->GpuStreamMemberHack()));
 
+  // nvinfer1::IExecutionContext::enqueue is not thread safe and we need a mutex
+  // for it.
+  tensorflow::mutex_lock lock(engine_context->mu);
   // TODO(jie): trt enqueue does not return error
-  auto ret = trt_execution_context_ptr->enqueue(num_batch, &buffers[0], *stream,
-                                                nullptr);
+  auto ret = engine_context->execution_context->enqueue(num_batch, &buffers[0],
+                                                        *stream, nullptr);
   if (!ret) {
     LOG(WARNING) << "Failed to enqueue batch for TRT engine: " << name();
     return kRetry;
@@ -414,50 +505,45 @@ bool TRTEngineOp::ExecuteTrtEngine(
   return !kRetry;
 }
 
-TRTEngineOp::~TRTEngineOp() {
-  // We need to manually destroy the engine and execution context before
-  // the allocator is destructed.
-  for (auto& eng : engine_map_) {
-    eng.second.first.reset();
-    eng.second.second.reset();
+EngineContext* TRTEngineOp::GetEngine(
+    const std::vector<TensorShape>& input_shapes, OpKernelContext* ctx) {
+  static EngineContext empty_context;
+  tensorflow::mutex_lock lock(engine_mutex_);
+  // TODO(tmorris): using first input to get batch size - is this reliable?
+  const int batch_size = input_shapes[0].dim_size(0);
+
+  // Get engine cache
+  TRTEngineCacheResource* cache_res = nullptr;
+  auto status = ctx->resource_manager()->LookupOrCreate(
+      "TRTEngineCache", funcdef_name_, &cache_res,
+      {[this, ctx](TRTEngineCacheResource** cr) -> tensorflow::Status {
+        *cr = new TRTEngineCacheResource(ctx, this->max_cached_engines_);
+        return Status::OK();
+      }});
+  if (!status.ok()) {
+    ctx->SetStatus(status);
+    return &empty_context;
   }
-  allocator_.reset();
-}
-
-nvinfer1::IGpuAllocator* TRTEngineOp::GetAllocator(OpKernelContext* ctx) {
-  if (allocator_) return allocator_.get();
-  auto device = ctx->device();
-  auto alloc = device->GetAllocator(tensorflow::AllocatorAttributes());
-  if (!alloc) {
-    LOG(ERROR) << "Can't find device allocator for gpu device "
-               << device->name();
-    return nullptr;
+  tensorflow::core::ScopedUnref sc(cache_res);
+  auto& cache = cache_res->cache_;
+  auto allocator = cache_res->allocator_.get();
+  if (allocator == nullptr) {
+    return &empty_context;
   }
-  allocator_.reset(new TRTDeviceAllocator(alloc));
-  return allocator_.get();
-}
-
-TRTEngineOp::EngineCtxPair& TRTEngineOp::GetEngine(int batch_size,
-                                                   OpKernelContext* ctx) {
-  static EngineCtxPair null_pair = {
-      TrtUniquePtrType<nvinfer1::ICudaEngine>(nullptr),
-      TrtUniquePtrType<nvinfer1::IExecutionContext>(nullptr)};
-  // TODO(sami): This method needs to be re-written to use resource manager and
-  // with LRU mechanism option.
-  tensorflow::mutex_lock lock(engine_mutex_);
 
+  // Handle the static engine case. For static engines, the cache will have a
+  // single element containing the only engine.
   if (static_engine_) {
-    if (engine_map_.size()) {
-      if (engine_map_.begin()->first >= batch_size) {
-        return engine_map_.begin()->second;
+    if (cache.size()) {
+      // Batch size of engine must be >= the input batch size
+      // TODO(tmorris): use match compatible function?
+      if (cache.begin()->first[0].dim_size(0) >= batch_size) {
+        return cache.begin()->second.get();
       }
-      return null_pair;
+      return &empty_context;
     }
+
     TrtUniquePtrType<IRuntime> infer(nvinfer1::createInferRuntime(logger));
-    auto allocator = GetAllocator(ctx);
-    if (allocator == nullptr) {
-      return null_pair;
-    }
     infer->setGpuAllocator(allocator);
     TrtUniquePtrType<nvinfer1::ICudaEngine> static_engine(
         infer->deserializeCudaEngine(serialized_segment_.c_str(),
@@ -465,62 +551,87 @@ TRTEngineOp::EngineCtxPair& TRTEngineOp::GetEngine(int batch_size,
                                      PluginFactoryTensorRT::GetInstance()));
     auto raw_static_engine = static_engine.get();
     const auto max_batch_size = raw_static_engine->getMaxBatchSize();
-    engine_map_[max_batch_size] = {
-        std::move(static_engine),
-        TrtUniquePtrType<nvinfer1::IExecutionContext>(
-            raw_static_engine->createExecutionContext())};
+    // Static engine will have max_batch_size for batch size so that all inputs
+    // will map to this single engine.
+    std::vector<TensorShape> engine_input_shapes(input_shapes);
+    for (int i = 0; i < engine_input_shapes.size(); i++) {
+      // TODO(tmorris): will all inputs have batch size as first dimension??
+      engine_input_shapes[i].set_dim(0, max_batch_size);
+    }
+    // TODO(laigd): here we assume engine_input_shapes matches the actual input
+    // shapes of the engine, we should verify that.
+    cache.emplace(engine_input_shapes,
+                  absl::make_unique<EngineContext>(
+                      std::move(static_engine),
+                      TrtUniquePtrType<nvinfer1::IExecutionContext>(
+                          raw_static_engine->createExecutionContext())));
     // Runtime is safe to delete after engine creation
     serialized_segment_.clear();
     if (max_batch_size < batch_size) {
-      return null_pair;
+      return &empty_context;
     }
-    return engine_map_.at(max_batch_size);
+    return cache.at(engine_input_shapes).get();
   }  // static_engine_
 
   // Handle the dynamic engine case.
-  auto engine_it = engine_map_.find(batch_size);
-  if (engine_it == engine_map_.end() &&
-      engine_map_.size() < (size_t)max_cached_engines_) {
-    nvinfer1::IGpuAllocator* allocator = nullptr;
-    allocator = GetAllocator(ctx);
-    if (allocator == nullptr) {
-      return null_pair;
-    }
-    std::vector<tensorflow::PartialTensorShape> shapes;
-    for (int i = 0; i < ctx->num_inputs(); ++i) {
-      shapes.emplace_back(ctx->input(i).shape());
+  // See if there is a compatible engine cached. The batch size should be <= the
+  // cached batch size.
+  std::vector<tensorflow::TensorShape> engine_input_shapes;
+  const bool matched_successfully =
+      GetCompatibleCachedEngine(input_shapes, &engine_input_shapes);
+  // If matched, use that engine. Otherwise, we will look in cache for that
+  // exact shape and possibly create a new engine if it is not in cache.
+  if (!matched_successfully) {
+    engine_input_shapes = input_shapes;
+    if (!cached_engine_batches_.empty()) {
+      // If user has explicitly defined cached_engine_batches, we should
+      // warn them that their input was non-compatible (batch size too high)
+      LOG(WARNING) << "No compatible cached engine was found for batch size: "
+                   << batch_size << ". A new engine will be created.";
+      cached_engine_batches_.push_back(batch_size);
     }
+  }
+
+  if (!cache.count(engine_input_shapes)) {
     TrtUniquePtrType<nvinfer1::ICudaEngine> engine;
     bool convert_successfully = false;
     LOG(INFO) << "Building a new TensorRT engine for " << name()
-              << " with batch size " << batch_size;
+              << " input shapes: "
+              << TensorShapeUtils::ShapeListString(engine_input_shapes);
+
+    // Convert to partial shapes
+    std::vector<PartialTensorShape> partial_shapes(engine_input_shapes.begin(),
+                                                   engine_input_shapes.end());
+
     // Up to this point, calibrator_ can never be empty, since otherwise it
     // means calibration_mode_ is true and this path won't get executed.
     auto status = convert::ConvertGraphDefToEngine(
-        segment_graph_, precision_mode_, batch_size, workspace_size_, shapes,
-        &logger, allocator, calibrator_.get(), &engine, use_calibration_,
-        &convert_successfully);
+        segment_graph_, precision_mode_, batch_size, workspace_size_,
+        partial_shapes, &logger, allocator, calibrator_.get(), &engine,
+        use_calibration_, &convert_successfully);
     if (!status.ok()) {
       if (convert_successfully) {
         // This means it fail to build the engine even when the network is built
         // successfully, probably due to internal issues. In this case we don't
         // retry in the future.
-        engine_map_[batch_size] = {nullptr, nullptr};
+        cache.emplace(engine_input_shapes, absl::make_unique<EngineContext>());
       }
       LOG(WARNING) << "Engine creation for batch size " << batch_size
                    << " failed " << status;
-      return null_pair;
+      return &empty_context;
     }
     VLOG(1) << "Conversion is done";
     TrtUniquePtrType<nvinfer1::IExecutionContext> exec_context(
         engine->createExecutionContext());
-    engine_map_[batch_size] = {std::move(engine), std::move(exec_context)};
+    cache.emplace(engine_input_shapes,
+                  absl::make_unique<EngineContext>(std::move(engine),
+                                                   std::move(exec_context)));
   }
-  return engine_map_.at(batch_size);
+  return cache.at(engine_input_shapes).get();
 }
 
 tensorflow::Status TRTEngineOp::AllocateCalibrationResources(
-    OpKernelContext* ctx, TRTCalibrationResource** cr) {
+    OpKernelContext* ctx, SerializableResourceBase** cr) {
   auto cres = new TRTCalibrationResource();
   *cr = cres;
   // Get the allocator.
@@ -536,7 +647,7 @@ tensorflow::Status TRTEngineOp::AllocateCalibrationResources(
   const int batch_size = ctx->input(0).dim_size(0);
   const int num_inputs = ctx->num_inputs();
   std::vector<tensorflow::PartialTensorShape> shapes;
-  dev_tensors_.resize(num_inputs);
+  cres->device_tensors_.resize(num_inputs);
   VLOG(1) << " Constructing calibrator";
   for (int i = 0; i < num_inputs; i++) {
     // allocate workspace on device for inputs
@@ -544,19 +655,19 @@ tensorflow::Status TRTEngineOp::AllocateCalibrationResources(
     shapes.emplace_back(t.shape());
     Tensor* device_tensor;
     TF_RETURN_IF_ERROR(ctx->allocate_persistent(
-        t.dtype(), t.shape(), &dev_tensors_.at(i), &device_tensor));
+        t.dtype(), t.shape(), &cres->device_tensors_.at(i), &device_tensor));
     CHECK_EQ(t.TotalBytes(), device_tensor->TotalBytes());
     void* device_address = GetTensorAddress(device_tensor);
     if (device_address == nullptr) {
       return tensorflow::errors::InvalidArgument(
           "Unsupported data type encountered in input ", i);
     }
-    device_buffers_.emplace(
+    cres->device_buffers_.emplace(
         StrCat(kInputPHName, i),
         std::pair<void*, size_t>(device_address, device_tensor->TotalBytes()));
   }
   cres->calibrator_.reset(
-      new TRTInt8Calibrator(device_buffers_, batch_size, name()));
+      new TRTInt8Calibrator(cres->device_buffers_, batch_size, name()));
   const string label(name());
   auto segment_graph = &segment_graph_;
   const int platform_gpu_id =
@@ -585,9 +696,10 @@ tensorflow::Status TRTEngineOp::AllocateCalibrationResources(
     // TODO(aaroey): maybe setting the max batch size using the python
     // calibration wrapper class.
     auto s = convert::ConvertGraphDefToEngine(
-        *segment_graph, INT8MODE, cres->calibrator_->getBatchSize(),
-        workspace_size_bytes, shapes, &cres->logger_, cres->allocator_.get(),
-        cres->calibrator_.get(), &cres->engine_,
+        *segment_graph, TrtPrecisionMode::INT8,
+        cres->calibrator_->getBatchSize(), workspace_size_bytes, shapes,
+        &cres->logger_, cres->allocator_.get(), cres->calibrator_.get(),
+        &cres->engine_,
         /*use_calibration=*/true,
         /*convert_successfully=*/nullptr);
     if (!s.ok()) {
diff --git a/tensorflow/compiler/tf2tensorrt/ops/get_serialized_resource_op.cc b/tensorflow/compiler/tf2tensorrt/ops/get_serialized_resource_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..59da73f5efc8eedc20c35cf35cb1eae6cda136c9
--- /dev/null
+++ b/tensorflow/compiler/tf2tensorrt/ops/get_serialized_resource_op.cc
@@ -0,0 +1,40 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+
+namespace tensorflow {
+
+REGISTER_OP("GetSerializedResourceOp")
+    .Input("container: string")
+    .Input("resource_name: string")
+    .Output("serialized_resource: string")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .SetIsStateful()
+    .Doc(R"doc(
+Gets a resource from a container managed by the resource manager and returns
+its serialized representation.
+)doc");
+
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/ops/trt_engine_op.cc
similarity index 80%
rename from tensorflow/contrib/tensorrt/ops/trt_engine_op.cc
rename to tensorflow/compiler/tf2tensorrt/ops/trt_engine_op.cc
index 92405906eb76b043bc08b68e25e16ab40197dddf..b84d2fe0b8cef3475f2a7d0f5383d5e11cde099a 100644
--- a/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc
+++ b/tensorflow/compiler/tf2tensorrt/ops/trt_engine_op.cc
@@ -28,16 +28,22 @@ namespace shape_inference {
 extern Status TRTEngineOpShapeInference(InferenceContext* c);
 }
 
+// NOTE: please try NOT to add/modify/remove attributes or inputs/outputs to the
+// list below, this will break backward compatibility!
+//
+// TODO(laigd): consider making this op stateful. The only problem is it uses TF
+// function which has to be stateless, but we can use function library as the
+// key to cache the instantiated functions for different executor subgraphs.
 REGISTER_OP("TRTEngineOp")
     .Attr("serialized_segment: string")
     .Attr("input_shapes: list(shape)")
     .Attr("output_shapes: list(shape)")
     .Attr("segment_funcdef_name: string")
-    .Attr("InT: list({int8,float16,float32})")
-    .Attr("OutT: list({int8,float16,float32})")
+    .Attr("InT: list({int8,float16,float32,int32})")
+    .Attr("OutT: list({int8,float16,float32,int32})")
     .Attr("static_engine: bool = true")
     .Attr("fixed_input_size: bool = true")
-    .Attr("cached_engine_batches: list(int) = []")
+    .Attr("cached_engine_batches: list(int) >= 0 = []")
     .Attr("max_cached_engines_count: int = 1")
     .Attr("workspace_size_bytes: int")
     .Attr("precision_mode: {'FP32', 'FP16', 'INT8'}")
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin.cc b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.cc
similarity index 96%
rename from tensorflow/contrib/tensorrt/plugin/trt_plugin.cc
rename to tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.cc
index 062f86e8bb4dc753925e4e2baf0bc80a5312a94f..a4341c530fffca88c82813cc2ace2c0ae1df5345 100644
--- a/tensorflow/contrib/tensorrt/plugin/trt_plugin.cc
+++ b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.cc
@@ -13,10 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h"
+
 #include <cassert>
 #include <cstring>
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h"
+
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_utils.h"
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin.h b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h
similarity index 92%
rename from tensorflow/contrib/tensorrt/plugin/trt_plugin.h
rename to tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h
index 754920b60ca7439513a91ad0354833a2482b29c1..f495d857037c79a1783f8eb232fb57c20e229169 100644
--- a/tensorflow/contrib/tensorrt/plugin/trt_plugin.h
+++ b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_H_
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_PLUGIN_TRT_PLUGIN_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_PLUGIN_TRT_PLUGIN_H_
 
 #include <iostream>
 #include <unordered_map>
@@ -71,4 +71,4 @@ class PluginTensorRT : public nvinfer1::IPlugin {
 #endif  // GOOGLE_TENSORRT
 #endif  // GOOGLE_CUDA
 
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_H_
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_PLUGIN_TRT_PLUGIN_H_
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.cc b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.cc
similarity index 96%
rename from tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.cc
rename to tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.cc
index cccc91226265ed139fb8db0b71c40b868f729562..871fb1210bd495dc3f5e8153bb6c3a361bf569f5 100644
--- a/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.cc
+++ b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.h"
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.h
similarity index 91%
rename from tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h
rename to tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.h
index bbae9fb65c22cf69d2e7954436fd04dd16f7f6c8..9aa99a40b80de92a4d9b9ad36e88e693b8aa42dc 100644
--- a/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h
+++ b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.h
@@ -13,14 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_FACTORY_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_FACTORY_H_
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_PLUGIN_TRT_PLUGIN_FACTORY_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_PLUGIN_TRT_PLUGIN_FACTORY_H_
 
 #include <memory>
 #include <unordered_map>
 
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin.h"
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_utils.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
@@ -99,4 +99,4 @@ class TrtPluginRegistrar {
 #endif  // GOOGLE_TENSORRT
 #endif  // GOOGLE_CUDA
 
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_FACTORY_H_
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_PLUGIN_TRT_PLUGIN_FACTORY_H_
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory_test.cc b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory_test.cc
similarity index 96%
rename from tensorflow/contrib/tensorrt/plugin/trt_plugin_factory_test.cc
rename to tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory_test.cc
index 129bdcdbc2f8d9d5215f45f381bcadf35e4fa75e..7d9c465c22beed0e252cbc26d6c533a0789d4f49 100644
--- a/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory_test.cc
+++ b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory_test.cc
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.h"
 
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/test.h"
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.cc b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_utils.cc
similarity index 94%
rename from tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.cc
rename to tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_utils.cc
index a8f60886c03c174a612e7a135b6eb7bb7cb9997a..f3d6b4ff476139693a5251ddf58a3200d8af8efc 100644
--- a/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.cc
+++ b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_utils.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_utils.h"
 #include <cassert>
 
 #if GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_utils.h
similarity index 82%
rename from tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h
rename to tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_utils.h
index 274ce42fec9283c643004d45fba461879fc5f2dc..e5eff15c19694093c7a5ea933a41375e8e01c8b9 100644
--- a/tensorflow/contrib/tensorrt/plugin/trt_plugin_utils.h
+++ b/tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_utils.h
@@ -13,12 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_UTILS_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_UTILS_H_
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_PLUGIN_TRT_PLUGIN_UTILS_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_PLUGIN_TRT_PLUGIN_UTILS_H_
 
 #include <functional>
 
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h"
 #include "tensorflow/core/platform/types.h"
 
 #if GOOGLE_CUDA
@@ -43,4 +43,4 @@ string ExtractOpName(const void* serial_data, size_t serial_length,
 #endif  // GOOGLE_TENSORRT
 #endif  // GOOGLE_CUDA
 
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_PLUGIN_TRT_PLUGIN_UTILS_H_
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_PLUGIN_TRT_PLUGIN_UTILS_H_
diff --git a/tensorflow/compiler/tf2tensorrt/python/ops/trt_ops.py b/tensorflow/compiler/tf2tensorrt/python/ops/trt_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..25fb3a13db9911673bac04652b8ed8ba842be93c
--- /dev/null
+++ b/tensorflow/compiler/tf2tensorrt/python/ops/trt_ops.py
@@ -0,0 +1,69 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Exposes the Python wrapper of TRTEngineOp."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import threading
+
+import platform
+from tensorflow.python.framework import errors
+
+_trt_ops_so = None
+_module_lock = threading.Lock()
+
+
+def load_trt_ops():
+  """Load TF-TRT op libraries so if it hasn't been loaded already."""
+  global _trt_ops_so
+
+  if platform.system() == "Windows":
+    raise RuntimeError("Windows platforms are not supported")
+
+  with _module_lock:
+    if _trt_ops_so:
+      return
+
+    try:
+      # pylint: disable=g-import-not-at-top,unused-variable
+      # This registers the TRT ops, it doesn't require loading TRT library.
+      from tensorflow.compiler.tf2tensorrt.ops.gen_trt_ops import trt_engine_op
+      # pylint: enable=g-import-not-at-top,unused-variable
+    except ImportError as e:
+      print("**** Failed to import TF-TRT ops. This is because the binary was "
+            "not built with CUDA or TensorRT enabled. ****")
+      raise e
+
+    # TODO(laigd): we should load TF-TRT kernels here as well after removing the
+    # swig binding.
+    try:
+      # pylint: disable=g-import-not-at-top
+      from tensorflow.python.framework import load_library
+      from tensorflow.python.platform import resource_loader
+      # pylint: enable=g-import-not-at-top
+
+      _trt_ops_so = load_library.load_op_library(
+          resource_loader.get_path_to_datafile("_trt_ops.so"))
+    except errors.NotFoundError as e:
+      no_trt_message = (
+          "**** Failed to initialize TensorRT. This is either because the "
+          "TensorRT installation path is not in LD_LIBRARY_PATH, or because "
+          "you do not have it installed. If not installed, please go to "
+          "https://developer.nvidia.com/tensorrt to download and install "
+          "TensorRT ****")
+      print(no_trt_message)
+      raise e
diff --git a/tensorflow/contrib/tensorrt/segment/segment.cc b/tensorflow/compiler/tf2tensorrt/segment/segment.cc
similarity index 92%
rename from tensorflow/contrib/tensorrt/segment/segment.cc
rename to tensorflow/compiler/tf2tensorrt/segment/segment.cc
index 6abc5226ccf96e472df77269bee6186726e5768d..3794929b1df3fa999de6ab218dc2ddfb96e4ac81 100644
--- a/tensorflow/contrib/tensorrt/segment/segment.cc
+++ b/tensorflow/compiler/tf2tensorrt/segment/segment.cc
@@ -13,14 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/segment/segment.h"
+#include "tensorflow/compiler/tf2tensorrt/segment/segment.h"
 
 #include <queue>
 #include <set>
 #include <unordered_map>
 #include <vector>
 
-#include "tensorflow/contrib/tensorrt/segment/union_find.h"
+#include "absl/strings/str_cat.h"
+#include "tensorflow/compiler/tf2tensorrt/segment/union_find.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_constructor.h"
@@ -29,11 +30,14 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/types.h"
 
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
 namespace tensorflow {
 namespace tensorrt {
 namespace segment {
-using ::tensorflow::strings::StrAppend;
-using ::tensorflow::strings::StrCat;
+using absl::StrAppend;
+using absl::StrCat;
 
 // A simple graph representation to mirror tensorflow::Graph. This structure
 // helps saving memory since segmenter modifies the graph in place, preventing
@@ -225,6 +229,24 @@ SimpleGraph::~SimpleGraph() {
   for (auto x : edges_) delete x;
 }
 
+// Define comparison functions for std::set with pointer keys so that behavior
+// is deterministic. When using std::set with pointer key types, the items are
+// sorted by pointer address which is non-deterministic. This can cause issues
+// for INT8 mode because the graph is converted twice and non-determinism may
+// cause a mismatch between the calibration tables of the conversions.
+struct SimpleEdgePtrCompare {
+  bool operator()(const SimpleEdge* lhs, const SimpleEdge* rhs) const {
+    return lhs->id() < rhs->id();
+  }
+};
+
+struct NodePtrCompare {
+  bool operator()(const tensorflow::Node* lhs,
+                  const tensorflow::Node* rhs) const {
+    return lhs->name() < rhs->name();
+  }
+};
+
 namespace {
 
 // Copied from TF ReverseDFS, which only works for tensorflow::Graph.
@@ -476,7 +498,7 @@ tensorflow::Status SegmentGraph(
     // nodes. Iterate since combining two nodes may unblock other
     // combining.
     while (true) {
-      std::set<const SimpleEdge*> contract_edges;
+      std::set<const SimpleEdge*, SimpleEdgePtrCompare> contract_edges;
       for (const SimpleEdge* out_edge : node->out_edges()) {
         VLOG(3) << "... out node " << out_edge->dst()->name() << " ( "
                 << out_edge->dst()->id() << " <- " << node->id() << " )";
@@ -530,7 +552,7 @@ tensorflow::Status SegmentGraph(
 
   // A map from the segment identifier (currently the name of the root node of
   // the segment tree) to the segment nodes set.
-  std::map<string, std::set<const tensorflow::Node*>> sg_map;
+  std::map<string, std::set<const tensorflow::Node*, NodePtrCompare>> sg_map;
 
   // A map from the segment identifier (currently the name of the root node of
   // the segment tree) to the device names that the nodes in the segment are
@@ -566,7 +588,8 @@ tensorflow::Status SegmentGraph(
   // --------------------------------- Step 2 ---------------------------------
   // Remove ineligible input/output nodes.
   for (auto& itr : sg_map) {
-    std::set<const tensorflow::Node*>& segment_nodes = itr.second;
+    std::set<const tensorflow::Node*, NodePtrCompare>& segment_nodes =
+        itr.second;
     VLOG(1) << "Segment original size: " << segment_nodes.size();
     while (true) {
       std::deque<const tensorflow::Node*> in_nodes_que, out_nodes_que;
@@ -618,8 +641,9 @@ tensorflow::Status SegmentGraph(
                               bool is_input_nodes,
                               std::deque<const tensorflow::Node*>* que) {
         // Run a BFS on the queue to find all the input/output nodes.
-        std::set<const tensorflow::Node*> visited;
-        std::set<const tensorflow::Node*> logged(que->begin(), que->end());
+        std::set<const tensorflow::Node*, NodePtrCompare> visited;
+        std::set<const tensorflow::Node*, NodePtrCompare> logged(que->begin(),
+                                                                 que->end());
         while (!que->empty()) {
           auto node = que->front();
           que->pop_front();
@@ -653,9 +677,11 @@ tensorflow::Status SegmentGraph(
   // --------------------------------- Step 3 ---------------------------------
   // Convert the segments into the expected return format
   for (const auto& itr : sg_map) {
-    const std::set<const tensorflow::Node*>& segment_nodes = itr.second;
+    const string& segment_root = itr.first;
+    // Return format does not require set comparator.
+    std::set<const Node*> segment_nodes(itr.second.begin(), itr.second.end());
     if (VLOG_IS_ON(1)) {
-      string s = "parent=" + itr.first + ":";
+      string s = "parent=" + segment_root + ":";
       for (auto node : segment_nodes) s += " " + node->name();
       VLOG(1) << "Segment " << segments->size() << ": " << s;
     }
@@ -668,12 +694,10 @@ tensorflow::Status SegmentGraph(
     }
 
     // TODO(sami): Make segmenter placement aware once trtscopes are in place
-    std::set<string> segment_node_names;
-    for (auto node : itr.second) segment_node_names.insert(node->name());
-    const auto& dev_itr = device_maps.find(itr.first);
+    const auto& dev_itr = device_maps.find(segment_root);
     if (dev_itr == device_maps.end() || dev_itr->second.empty()) {
       VLOG(1) << "No device assigned to segment " << segments->size();
-      segments->emplace_back(std::make_pair(segment_node_names, string()));
+      segments->emplace_back(std::make_pair(segment_nodes, string()));
     } else if (dev_itr->second.size() > 1) {
       string s("Segment ");
       StrAppend(&s, segments->size(), " has multiple devices attached: ");
@@ -682,10 +706,10 @@ tensorflow::Status SegmentGraph(
       }
       LOG(WARNING) << s << " choosing " << *(dev_itr->second.begin());
       segments->emplace_back(
-          std::make_pair(segment_node_names, *(dev_itr->second.begin())));
+          std::make_pair(segment_nodes, *(dev_itr->second.begin())));
     } else {
       segments->emplace_back(
-          std::make_pair(segment_node_names, *(dev_itr->second.begin())));
+          std::make_pair(segment_nodes, *(dev_itr->second.begin())));
     }
   }
   if (VLOG_IS_ON(1)) {
@@ -704,3 +728,6 @@ tensorflow::Status SegmentGraph(
 }  // namespace segment
 }  // namespace tensorrt
 }  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/segment/segment.h b/tensorflow/compiler/tf2tensorrt/segment/segment.h
similarity index 81%
rename from tensorflow/contrib/tensorrt/segment/segment.h
rename to tensorflow/compiler/tf2tensorrt/segment/segment.h
index b9693aad1b764515459db6833b05221ea5b3a2d1..9622ddd593990e93ba1b54e9dfd0052006e20ced 100644
--- a/tensorflow/contrib/tensorrt/segment/segment.h
+++ b/tensorflow/compiler/tf2tensorrt/segment/segment.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_SEGMENT_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_SEGMENT_H_
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_SEGMENT_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_SEGMENT_H_
 
 #include <set>
 #include <vector>
@@ -24,15 +24,17 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
 
-namespace tensorflow {
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 
+namespace tensorflow {
 namespace tensorrt {
 namespace segment {
 
-// Vector of segments, each entry contains a set of node names and a device name
-// in the segment.
-// TODO(aaroey): use node pointer instead of node name.
-using SegmentNodesVector = std::vector<std::pair<std::set<string>, string>>;
+// Vector of segments, each entry contains a set of node pointers and a device
+// name in the segment.
+using SegmentNodesVector =
+    std::vector<std::pair<std::set<const Node*>, string>>;
 
 struct SegmentOptions {
   // Segment must contain at least this many nodes.
@@ -60,4 +62,7 @@ tensorflow::Status SegmentGraph(
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_SEGMENT_H_
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_SEGMENT_H_
diff --git a/tensorflow/contrib/tensorrt/segment/segment_test.cc b/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc
similarity index 97%
rename from tensorflow/contrib/tensorrt/segment/segment_test.cc
rename to tensorflow/compiler/tf2tensorrt/segment/segment_test.cc
index 4805ef9c61a7784a1c08cf5eaf504691bc9dbedc..e11ad2719740d908f93ef580a6b308469365f402 100644
--- a/tensorflow/contrib/tensorrt/segment/segment_test.cc
+++ b/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/segment/segment.h"
+#include "tensorflow/compiler/tf2tensorrt/segment/segment.h"
 
 #include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/cc/ops/standard_ops.h"
@@ -26,6 +26,9 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/public/session.h"
 
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
 namespace tensorflow {
 namespace tensorrt {
 namespace segment {
@@ -75,7 +78,10 @@ class SegmentTest : public ::testing::Test {
                        const std::vector<std::set<string>>& expected_segments) {
     EXPECT_EQ(expected_segments.size(), segments.size());
     for (int i = 0; i < segments.size(); ++i) {
-      const auto& segment_node_names = segments[i].first;
+      std::set<string> segment_node_names;
+      for (const Node* node : segments[i].first) {
+        segment_node_names.insert(node->name());
+      }
       const auto& expected = expected_segments[i];
       for (const auto& name : expected) {
         EXPECT_TRUE(segment_node_names.count(name))
@@ -262,3 +268,6 @@ TEST_F(SegmentTest, BigIfElse) {
 }  // namespace segment
 }  // namespace tensorrt
 }  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/segment/union_find.h b/tensorflow/compiler/tf2tensorrt/segment/union_find.h
similarity index 92%
rename from tensorflow/contrib/tensorrt/segment/union_find.h
rename to tensorflow/compiler/tf2tensorrt/segment/union_find.h
index 1c64ebbb0ae532a4776ab8963515d19fd3b23b4c..6458ae692fd7c922b5fc3bea2e55b613447dbde0 100644
--- a/tensorflow/contrib/tensorrt/segment/union_find.h
+++ b/tensorflow/compiler/tf2tensorrt/segment/union_find.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_UNION_FIND_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_UNION_FIND_H_
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_UNION_FIND_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_UNION_FIND_H_
 
 namespace tensorflow {
 namespace tensorrt {
@@ -76,4 +76,4 @@ UnionFind<T>* UnionFind<T>::FindRoot() {
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_SEGMENT_UNION_FIND_H_
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_UNION_FIND_H_
diff --git a/tensorflow/contrib/tensorrt/tensorrt_test.cc b/tensorflow/compiler/tf2tensorrt/tensorrt_test.cc
similarity index 100%
rename from tensorflow/contrib/tensorrt/tensorrt_test.cc
rename to tensorflow/compiler/tf2tensorrt/tensorrt_test.cc
diff --git a/tensorflow/contrib/tensorrt/test/utils.cc b/tensorflow/compiler/tf2tensorrt/utils/test_utils.cc
similarity index 94%
rename from tensorflow/contrib/tensorrt/test/utils.cc
rename to tensorflow/compiler/tf2tensorrt/utils/test_utils.cc
index 276308b3a0a6ce864969afb0179c6a3f00d6b70b..dd3c09d7e42358a1f9e6cc13be6198de58e38963 100644
--- a/tensorflow/contrib/tensorrt/test/utils.cc
+++ b/tensorflow/compiler/tf2tensorrt/utils/test_utils.cc
@@ -13,13 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/test/utils.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/test_utils.h"
 
 #include <unordered_map>
 #include <vector>
 
 #include "re2/re2.h"
+#include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 namespace tensorrt {
diff --git a/tensorflow/contrib/tensorrt/test/utils.h b/tensorflow/compiler/tf2tensorrt/utils/test_utils.h
similarity index 85%
rename from tensorflow/contrib/tensorrt/test/utils.h
rename to tensorflow/compiler/tf2tensorrt/utils/test_utils.h
index 4bb4120206cfaae70107e55d1818e3af2f02717a..d85875991b79014c4f173d3157ed02e6c96f045c 100644
--- a/tensorflow/contrib/tensorrt/test/utils.h
+++ b/tensorflow/compiler/tf2tensorrt/utils/test_utils.h
@@ -13,11 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_TEST_UTILS_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_TEST_UTILS_H_
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TEST_UTILS_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TEST_UTILS_H_
 
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 namespace tensorrt {
@@ -41,4 +40,4 @@ string GetTestValue(const string& label);
 }  // namespace tensorrt
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_TEST_UTILS_H_
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TEST_UTILS_H_
diff --git a/tensorflow/contrib/tensorrt/resources/trt_allocator.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc
similarity index 98%
rename from tensorflow/contrib/tensorrt/resources/trt_allocator.cc
rename to tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc
index 7a2e93414aed56525eaeac876cdac20404bcf6ab..1636cdc30c4df157ed124b160449af645f917252 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_allocator.cc
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/resources/trt_allocator.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h"
 
 #include "tensorflow/core/platform/logging.h"
 
diff --git a/tensorflow/contrib/tensorrt/resources/trt_allocator.h b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h
similarity index 93%
rename from tensorflow/contrib/tensorrt/resources/trt_allocator.h
rename to tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h
index f857a9de055ee7668f0bf9bc97e030354505081b..59ffb42bad348c78cde32035aff8c7081528b3a6 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_allocator.h
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_ALLOCATOR_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_ALLOCATOR_H_
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_ALLOCATOR_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_ALLOCATOR_H_
 
 #include <unordered_map>
 
@@ -81,4 +81,4 @@ class TRTDeviceAllocator : public TRTBaseAllocator {
 
 #endif  // GOOGLE_TENSORRT
 #endif  // GOOGLE_CUDA
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_ALLOCATOR_H_
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_ALLOCATOR_H_
diff --git a/tensorflow/contrib/tensorrt/resources/trt_allocator_test.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator_test.cc
similarity index 80%
rename from tensorflow/contrib/tensorrt/resources/trt_allocator_test.cc
rename to tensorflow/compiler/tf2tensorrt/utils/trt_allocator_test.cc
index ad6b1d7d4c57d696d3dee3b479733e152e669211..e457c64928e5df84c7e2726ba3621420f013dbc9 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_allocator_test.cc
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_allocator_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/resources/trt_allocator.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h"
 
 #include "tensorflow/core/platform/test.h"
 
@@ -48,11 +48,14 @@ TEST(TRTAllocatorTest, Align) {
         513ul, 700ul, 12345ul, 1ul << 32}) {
     for (uint64_t alignment = 1; alignment <= space * 4; alignment *= 2) {
       for (const uintptr_t ptr_val :
-           {1ul, alignment == 1 ? 1ul : alignment - 1, alignment, alignment + 1,
-            alignment + (alignment / 2)}) {
+           {static_cast<uint64_t>(1),
+            alignment == 1 ? static_cast<uint64_t>(1) : alignment - 1,
+            alignment, alignment + 1, alignment + (alignment / 2)}) {
         if (ptr_val % alignment == 0) {
           for (const uint64_t size :
-               {1ul, space == 1 ? 1ul : space - 1, space, space + 1}) {
+               {static_cast<uint64_t>(1),
+                space == 1 ? static_cast<uint64_t>(1) : space - 1, space,
+                space + 1}) {
             EXPECT_EQ(space >= size, RunTest(alignment, size, ptr_val, space));
           }
         } else {
@@ -62,8 +65,10 @@ TEST(TRTAllocatorTest, Align) {
             EXPECT_TRUE(
                 RunTest(alignment, space - diff, ptr_val + diff, space - diff));
             for (const uint64_t size :
-                 {1ul, space - diff > 1 ? space - diff - 1 : 1ul, space - diff,
-                  space - diff + 1, space - 1}) {
+                 {static_cast<uint64_t>(1),
+                  space - diff > 1 ? space - diff - 1
+                                   : static_cast<uint64_t>(1),
+                  space - diff, space - diff + 1, space - 1}) {
               EXPECT_EQ(space - diff >= size,
                         RunTest(alignment, size, ptr_val, space));
             }
diff --git a/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.cc
similarity index 97%
rename from tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.cc
rename to tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.cc
index dab1dd9343be7d5b033a3e04bf0b49fbbf37e9e5..5213fced1ea9220422245172f5b4a3f584a2a566 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.cc
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h"
 
 #include <atomic>
 #include <unordered_map>
@@ -135,7 +135,7 @@ void TRTInt8Calibrator::setDone() {
 
 void TRTInt8Calibrator::writeCalibrationCache(const void* ptr,
                                               std::size_t length) {
-  calibration_table_ = string((const char*)ptr, length);
+  calibration_table_ = string(static_cast<const char*>(ptr), length);
   VLOG(1) << "Got calibration data for " << engine_name_ << " @" << ptr
           << " length=" << length;
 }
diff --git a/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h
similarity index 87%
rename from tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h
rename to tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h
index 65466c9741989fda5f82fc27d813d026f35fe386..aa70b07f8d79848c362275815004db32cca128be 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_INT8_CALIBRATOR_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_INT8_CALIBRATOR_H_
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_INT8_CALIBRATOR_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_INT8_CALIBRATOR_H_
 
 #include <atomic>
 #include <string>
@@ -34,7 +34,12 @@ namespace tensorrt {
 // TRTs pull model for calibration. When TRT implements a means for
 // a push calibration This class should be updated accordingly
 
+// IInt8EntropyCalibrator2 is prefferred for TRT 5.1+.
+#if NV_TENSORRT_MAJOR > 5 || (NV_TENSORRT_MAJOR == 5 && NV_TENSORRT_MINOR >= 1)
+struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator2 {
+#else
 struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator {
+#endif
  public:
   // Construct a calibrator for future calibration.
   TRTInt8Calibrator(
@@ -96,4 +101,4 @@ struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator {
 
 #endif
 #endif
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_INT8_CALIBRATOR_H_
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_INT8_CALIBRATOR_H_
diff --git a/tensorflow/contrib/tensorrt/log/trt_logger.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc
similarity index 90%
rename from tensorflow/contrib/tensorrt/log/trt_logger.cc
rename to tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc
index dda0dc9e712eb726800abfb6084f4f708d04825b..6bc842ed5ca7e03018157060a332338cdc926f14 100644
--- a/tensorflow/contrib/tensorrt/log/trt_logger.cc
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_logger.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h"
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
@@ -26,6 +26,9 @@ namespace tensorrt {
 void Logger::log(Severity severity, const char* msg) {
   // Suppress info-level messages
   switch (severity) {
+#if NV_TENSORRT_MAJOR > 5 || (NV_TENSORRT_MAJOR == 5 && NV_TENSORRT_MINOR >= 1)
+    case Severity::kVERBOSE:
+#endif
     case Severity::kINFO: {  // Mark TRT info messages as debug!
       VLOG(2) << name_ << " " << msg;
       break;
diff --git a/tensorflow/contrib/tensorrt/log/trt_logger.h b/tensorflow/compiler/tf2tensorrt/utils/trt_logger.h
similarity index 86%
rename from tensorflow/contrib/tensorrt/log/trt_logger.h
rename to tensorflow/compiler/tf2tensorrt/utils/trt_logger.h
index 96ccacb791e40143c5c4d9d691bb353702f9a28b..22f4de970a80765b0e1e7e8816134d83aaec7c73 100644
--- a/tensorflow/contrib/tensorrt/log/trt_logger.h
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_logger.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_LOG_TRT_LOGGER_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_LOG_TRT_LOGGER_H_
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_LOGGER_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_LOGGER_H_
 
 #include "tensorflow/core/platform/types.h"
 
@@ -41,4 +41,4 @@ class Logger : public nvinfer1::ILogger {
 #endif  // GOOGLE_TENSORRT
 #endif  // GOOGLE_CUDA
 
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_LOG_TRT_LOGGER_H_
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_LOGGER_H_
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h
new file mode 100644
index 0000000000000000000000000000000000000000..09c47b36b0ad8074e749342e7d08f139da7ea1f4
--- /dev/null
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h
@@ -0,0 +1,192 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_LRU_CACHE_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_LRU_CACHE_H_
+
+#include <list>
+#include <unordered_map>
+
+#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#include "tensorrt/include/NvInfer.h"
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
+
+namespace tensorflow {
+namespace tensorrt {
+
+template <class Key, class Value, class HashFunction>
+class LRUCache {
+ public:
+  typedef Value value_type;
+  typedef Key key_type;
+  typedef HashFunction hasher;
+  typedef typename std::unordered_map<key_type, value_type, hasher> map_type;
+  typedef typename map_type::iterator iterator;
+  typedef typename map_type::const_iterator const_iterator;
+
+  LRUCache() : capacity_(0) {}
+  explicit LRUCache(size_t capacity) : capacity_(capacity) {}
+
+  size_t capacity() const { return capacity_; }
+
+  void reserve(size_t capacity) {
+    capacity_ = capacity;
+    DiscardOld();
+  }
+
+  size_t size() const { return objects_.size(); }
+
+  size_t count(const key_type& key) const { return objects_.count(key); }
+
+  value_type& at(const key_type& key) { return Touch(key); }
+
+  const_iterator begin() const { return objects_.begin(); }
+  const_iterator end() const { return objects_.end(); }
+
+  iterator begin() { return objects_.begin(); }
+  iterator end() { return objects_.end(); }
+
+  template <typename... Args>
+  std::pair<iterator, bool> emplace(Args&&... args) {
+    DiscardOld(1);
+    std::pair<iterator, bool> result =
+        objects_.emplace(std::forward<Args>(args)...);
+    key_type key = result.first->first;
+    if (result.second) {
+      keys_.push_front(key);
+    } else {
+      TouchNoCheck(key);  // The key must exist in this case.
+    }
+    return result;
+  }
+
+ private:
+  std::unordered_map<key_type, value_type, hasher> objects_;
+  std::list<key_type> keys_;
+  size_t capacity_;
+  value_type not_found_value_;
+
+  value_type& Touch(const key_type& key) {
+    // Check that the key exists, and let it return std::out_of_range error if
+    // not.
+    value_type& value = objects_.at(key);
+    TouchNoCheck(key);
+    return value;
+  }
+
+  void TouchNoCheck(const key_type& key) {
+    auto rank = std::find(keys_.begin(), keys_.end(), key);
+    if (rank != keys_.begin()) {
+      keys_.erase(rank);
+      keys_.push_front(key);
+    }
+  }
+
+  // Creates n free positions in cache
+  tensorflow::Status DiscardOld(size_t n = 0) {
+    if (n > capacity_) {
+      return tensorflow::errors::Internal(
+          "Insufficient capacity in cache (capacity = ", capacity_,
+          ", requested ", n, ")");
+    }
+    while (objects_.size() > (capacity_ - n)) {
+      key_type discard_key = keys_.back();
+      keys_.pop_back();
+      objects_.erase(discard_key);
+    }
+    return tensorflow::Status::OK();
+  }
+};
+
+// Define a hash function for vector<TensorShape> because it is used as the key
+// for the engine cache.
+struct VectorTensorShapeHasher {
+  std::size_t operator()(
+      const std::vector<tensorflow::TensorShape>& key) const {
+    return std::hash<std::string>()(TensorShapeUtils::ShapeListString(key));
+  }
+};
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
+struct EngineContext {
+  EngineContext() {}  // Creates an empty context.
+  EngineContext(
+      TrtUniquePtrType<nvinfer1::ICudaEngine>&& input_cuda_engine,
+      TrtUniquePtrType<nvinfer1::IExecutionContext>&& input_execution_context)
+      : cuda_engine(std::move(input_cuda_engine)),
+        execution_context(std::move(input_execution_context)) {}
+
+  mutex mu;
+  TrtUniquePtrType<nvinfer1::ICudaEngine> cuda_engine;
+  TrtUniquePtrType<nvinfer1::IExecutionContext> execution_context
+      GUARDED_BY(mu);
+};
+
+class TRTEngineCacheResource : public tensorflow::ResourceBase {
+ public:
+  TRTEngineCacheResource(OpKernelContext* ctx, size_t capacity)
+      : cache_(capacity) {
+    auto device = ctx->device();
+    auto alloc = device->GetAllocator(tensorflow::AllocatorAttributes());
+    if (!alloc) {
+      LOG(ERROR) << "Can't find device allocator for gpu device "
+                 << device->name();
+      allocator_ = nullptr;
+    } else {
+      allocator_.reset(new TRTDeviceAllocator(alloc));
+    }
+  }
+
+  string DebugString() const override {
+    std::stringstream oss;
+    using std::dec;
+    using std::endl;
+    using std::hex;
+    oss << "TRTEngineCacheResource: ";
+    oss << "TRTBaseAllocator = " << hex << allocator_.get() << dec << ", ";
+    oss << "LRUCache = " << hex << &cache_ << dec << endl;
+    oss << "Containing " << cache_.size() << " entries: " << endl;
+    for (const auto& item : cache_) {
+      oss << TensorShapeUtils::ShapeListString(item.first) << ": " << hex
+          << "ICudaEngine: " << item.second.get()->cuda_engine.get() << ", "
+          << "IExecutionContext: " << item.second.get()->execution_context.get()
+          << dec << endl;
+    }
+    return oss.str();
+  }
+
+  // Keep device allocator for TRT.
+  std::unique_ptr<TRTBaseAllocator> allocator_;
+
+  // Declare cache after allocator so that it is destroyed before allocator is.
+  LRUCache<std::vector<TensorShape>, std::unique_ptr<EngineContext>,
+           VectorTensorShapeHasher>
+      cache_;
+};
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_LRU_CACHE_H_
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache_test.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0aa5eb8f7d4ad062c2d8622fa5aa55f823f80dd5
--- /dev/null
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache_test.cc
@@ -0,0 +1,57 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h"
+
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace tensorrt {
+
+TEST(LRUCacheTest, Basic) {
+  LRUCache<int, int, std::hash<int>> cache;
+  cache.reserve(2);
+  // Insert 10
+  cache.emplace(10, 100);
+  EXPECT_EQ(cache.size(), 1);
+  EXPECT_EQ(cache.count(10), 1);
+  EXPECT_EQ(cache.at(10), 100);
+  EXPECT_EQ(cache.count(100), 0);
+  // Insert 20
+  cache.emplace(20, 200);
+  EXPECT_EQ(cache.size(), 2);
+  EXPECT_EQ(cache.count(10), 1);
+  EXPECT_EQ(cache.count(20), 1);
+  EXPECT_EQ(cache.at(10), 100);
+  EXPECT_EQ(cache.at(20), 200);
+  EXPECT_EQ(cache.count(100), 0);
+  EXPECT_EQ(cache.count(200), 0);
+  // Insert 30, Evicting 10
+  cache.emplace(30, 300);
+  EXPECT_EQ(cache.count(10), 0);
+  EXPECT_EQ(cache.count(20), 1);
+  EXPECT_EQ(cache.count(30), 1);
+  // Touch 20
+  cache.at(20);
+  // Insert 40, Evicting 30
+  cache.emplace(40, 400);
+  EXPECT_EQ(cache.count(10), 0);
+  EXPECT_EQ(cache.count(20), 1);
+  EXPECT_EQ(cache.count(30), 0);
+  EXPECT_EQ(cache.count(40), 1);
+}
+
+}  // namespace tensorrt
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_resources.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_resources.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2e553079b19a3e5d0739cc6ac79a84f3b6a1fc4e
--- /dev/null
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_resources.cc
@@ -0,0 +1,61 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_resources.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+
+namespace tensorflow {
+namespace tensorrt {
+
+TRTCalibrationResource::~TRTCalibrationResource() {
+  VLOG(0) << "Destroying Calibration Resource " << std::endl << DebugString();
+  builder_.reset();
+  engine_.reset();
+  // We need to manually destroy the builder and engine before the allocator
+  // is destroyed.
+  allocator_.reset();
+}
+
+string TRTCalibrationResource::DebugString() const {
+  std::stringstream oss;
+  using std::dec;
+  using std::endl;
+  using std::hex;
+  oss << " Calibrator = " << hex << calibrator_.get() << dec << endl
+      << " Builder    = " << hex << builder_.get() << dec << endl
+      << " Engine     = " << hex << engine_.get() << dec << endl
+      << " Logger     = " << hex << &logger_ << dec << endl
+      << " Allocator  = " << hex << allocator_.get() << dec << endl
+      << " Thread     = " << hex << thr_.get() << dec << endl;
+  return oss.str();
+}
+
+Status TRTCalibrationResource::SerializeToString(string* serialized) {
+  calibrator_->waitAndSetDone();
+  thr_->join();
+  *serialized = calibrator_->getCalibrationTableAsString();
+  if (serialized->empty()) {
+    return tensorflow::errors::Unknown("Calibration table is empty.");
+  }
+  return Status::OK();
+}
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_resources.h b/tensorflow/compiler/tf2tensorrt/utils/trt_resources.h
new file mode 100644
index 0000000000000000000000000000000000000000..5e8d4b3b738df09b0c2ea82dcc06e9b23a708385
--- /dev/null
+++ b/tensorflow/compiler/tf2tensorrt/utils/trt_resources.h
@@ -0,0 +1,73 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_RESOURCES_H_
+#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_RESOURCES_H_
+
+#include <list>
+#include <sstream>
+#include <string>
+#include <thread>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
+#include "tensorrt/include/NvInfer.h"
+
+namespace tensorflow {
+namespace tensorrt {
+
+class SerializableResourceBase : public tensorflow::ResourceBase {
+ public:
+  virtual Status SerializeToString(string* serialized) = 0;
+};
+
+class TRTCalibrationResource : public SerializableResourceBase {
+ public:
+  ~TRTCalibrationResource() override;
+
+  string DebugString() const override;
+
+  Status SerializeToString(string* serialized) override;
+
+  // Lookup table for temporary staging areas of input tensors for calibration.
+  std::unordered_map<string, std::pair<void*, size_t>> device_buffers_;
+
+  // Temporary staging areas for calibration inputs.
+  std::vector<PersistentTensor> device_tensors_;
+
+  std::unique_ptr<TRTInt8Calibrator> calibrator_;
+  TrtUniquePtrType<nvinfer1::IBuilder> builder_;
+  TrtUniquePtrType<nvinfer1::ICudaEngine> engine_;
+  std::unique_ptr<TRTBaseAllocator> allocator_;
+  tensorflow::tensorrt::Logger logger_;
+  // TODO(sami): Use threadpool threads!
+  std::unique_ptr<std::thread> thr_;
+};
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
+#endif  // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_RESOURCES_H_
diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD
index 5a0d9b9af9d55a8dee809d3cf909bce39c3b8b6c..7d9e7b9fc1f7ea83d6aa982afb5df097b0bdbf77 100644
--- a/tensorflow/compiler/tf2xla/BUILD
+++ b/tensorflow/compiler/tf2xla/BUILD
@@ -1,6 +1,6 @@
 licenses(["notice"])  # Apache 2.0
 
-load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test")
+load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test", "tf_cuda_cc_test")
 
 package_group(
     name = "internal",
@@ -24,7 +24,7 @@ package(
 )
 
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured")
-load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library")
+load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library", "xla_py_proto_library")
 
 cc_library(
     name = "tf2xla_supported_ops_lib",
@@ -60,6 +60,14 @@ xla_proto_library(
     ],
 )
 
+xla_py_proto_library(
+    name = "tf2xla_py",
+    has_services = False,
+    api_version = 2,
+    visibility = ["//visibility:public"],
+    deps = [":tf2xla_proto"],
+)
+
 xla_proto_library(
     name = "host_compute_metadata_proto",
     srcs = ["host_compute_metadata.proto"],
@@ -204,6 +212,7 @@ cc_library(
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client",
         "//tensorflow/compiler/xla/client:client_library",
@@ -224,6 +233,7 @@ cc_library(
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
+        "@com_google_absl//absl/types:variant",
     ],
     alwayslink = 1,
 )
@@ -244,6 +254,7 @@ cc_library(
     deps = [
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
@@ -280,6 +291,7 @@ tf_cc_test(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
     ],
 )
 
@@ -314,11 +326,13 @@ tf_cc_test(
         ":tf2xla_util",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:function_ops",
+        "//tensorflow/cc:functional_ops",
         "//tensorflow/cc:ops",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:ops",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
@@ -443,6 +457,7 @@ cc_library(
     hdrs = [
         "dump_graph.h",
     ],
+    visibility = ["//visibility:public"],
     deps = [
         "//tensorflow/compiler/jit:flags",
         "//tensorflow/core:framework",
@@ -668,8 +683,31 @@ cc_library(
     name = "side_effect_util",
     srcs = ["side_effect_util.cc"],
     hdrs = ["side_effect_util.h"],
+    visibility = [":friends"],
     deps = [
         "//tensorflow/core:core_cpu",
         "@com_google_absl//absl/strings",
     ],
 )
+
+tf_cuda_cc_test(
+    name = "fused_batchnorm_reserve_space_test",
+    size = "medium",
+    srcs = ["fused_batchnorm_reserve_space_test.cc"],
+    deps = [
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/compiler/jit",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:tensorflow",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/kernels:ops_testutil",
+        "//tensorflow/core/kernels:ops_util",
+        "@com_google_absl//absl/algorithm:container",
+    ],
+)
diff --git a/tensorflow/compiler/tf2xla/cpu_function_runtime.h b/tensorflow/compiler/tf2xla/cpu_function_runtime.h
index dfc1e8b8aebcf3142e9f61f60171c6b58634c71d..78970fb39bae7067c7668baa2aec65732b5b2352 100644
--- a/tensorflow/compiler/tf2xla/cpu_function_runtime.h
+++ b/tensorflow/compiler/tf2xla/cpu_function_runtime.h
@@ -104,7 +104,7 @@ class BufferInfo {
  private:
   BufferInfo() = default;
 
-  enum class Kind : unsigned {
+  enum class Kind : uint64 {
     kConstant,
     kTempBuffer,
     kEntryParameter,
diff --git a/tensorflow/compiler/tf2xla/functionalize_cond.cc b/tensorflow/compiler/tf2xla/functionalize_cond.cc
index c693e42d26712d55852f45c806215fc1f1b9a030..8aa162be47c9181e215de6a2eb660215135ff6eb 100644
--- a/tensorflow/compiler/tf2xla/functionalize_cond.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_cond.cc
@@ -34,6 +34,8 @@ limitations under the License.
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/control_flow.h"
 #include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 
 using xla::StatusOr;
@@ -41,6 +43,43 @@ using xla::StatusOr;
 namespace tensorflow {
 namespace functionalize_cond {
 
+bool AncestorNode::operator<(const AncestorNode& other) const {
+  return (output_tensor.node->id() < other.output_tensor.node->id()) ||
+         (output_tensor.node->id() == other.output_tensor.node->id() &&
+          output_tensor.index < other.output_tensor.index) ||
+         (output_tensor.node->id() == other.output_tensor.node->id() &&
+          output_tensor.index == other.output_tensor.index &&
+          type < other.type);
+}
+
+bool AncestorNode::operator==(const AncestorNode& other) const {
+  return output_tensor.node->id() == other.output_tensor.node->id() &&
+         output_tensor.index == other.output_tensor.index && type == other.type;
+}
+
+size_t AncestorNode::Hash::operator()(const AncestorNode& ancestor) const {
+  size_t h = std::hash<int>()(ancestor.output_tensor.node->id());
+  h = Hash64Combine(h, std::hash<int>()(ancestor.output_tensor.index));
+  return Hash64Combine(h, std::hash<int>()(static_cast<int>(ancestor.type)));
+}
+
+typedef std::tuple<StateMap::CondId, StateMap::AncestorId, OutputTensor>
+    ClusterTuple;
+
+struct ClusterTupleLessThan {
+  bool operator()(const ClusterTuple& a, const ClusterTuple& b) const {
+    if (std::tie(std::get<0>(a), std::get<1>(a)) <
+        std::tie(std::get<0>(b), std::get<1>(b))) {
+      return true;
+    } else if (std::tie(std::get<0>(a), std::get<1>(a)) ==
+               std::tie(std::get<0>(b), std::get<1>(b))) {
+      return StateMap::OutputTensorLess()(std::get<2>(a), std::get<2>(b));
+    } else {
+      return false;
+    }
+  }
+};
+
 // TODO(jpienaar): Move to OutputTensor.
 string DebugString(const OutputTensor& tensor) {
   return absl::StrCat(tensor.node->name(), ":", tensor.index);
@@ -145,10 +184,10 @@ size_t StateMap::Hash::operator()(const StateMap::AncestorState& map) const {
   if (map.empty()) return 0;
   // Compute hash of the front element.
   auto it = map.begin();
-  size_t h = hash<Node*>()(*it);
+  size_t h = AncestorNode::Hash()(*it);
   for (++it; it != map.end(); ++it) {
     // Combine the has with the different elements in the map.
-    h = Hash64Combine(h, hash<Node*>()(*it));
+    h = Hash64Combine(h, AncestorNode::Hash()(*it));
   }
   return h;
 }
@@ -229,7 +268,17 @@ string StateMap::CondStateToString(StateMap::CondId id) const {
 }
 
 string StateMap::AncestorStateToString(const Node* node) const {
-  if (auto id = LookupAncestorId(node)) return NodesToString(*id);
+  if (auto id = LookupAncestorId(node)) {
+    return absl::StrCat(
+        "{",
+        absl::StrJoin(*id, ",",
+                      [](string* output, const AncestorNode& ancestor) {
+                        absl::StrAppend(output,
+                                        ancestor.output_tensor.node->name(),
+                                        ":", ancestor.output_tensor.index);
+                      }),
+        "}");
+  }
   return "{}";
 }
 
@@ -247,7 +296,9 @@ class Conditional {
   Status AddMerge(Node* m);
 
   // Constructs an If node from the merge nodes.
-  Status BuildAndReplace(Graph* graph, FunctionLibraryDefinition* library);
+  Status BuildAndReplace(
+      Graph* graph, FunctionLibraryDefinition* library,
+      std::unordered_map<Node*, OutputTensor>* merge_to_replacement);
 
  private:
   // Extracts the then/else bodies: creates new graphs with the nodes
@@ -262,10 +313,15 @@ class Conditional {
   Status BuildIfNode(Graph* graph, FunctionLibraryDefinition* library);
 
   // Adds input edges to If node.
-  Status AddInputEdges(Graph* graph);
+  Status AddInputEdges(
+      Graph* graph,
+      const std::unordered_map<Node*, OutputTensor>& merge_to_replacement);
 
   // Adds output edges from If node.
-  Status AddOutputEdges(Graph* graph);
+  // Record new output tensor for all Merge nodes in 'merge_to_replacement'.
+  Status AddOutputEdges(
+      Graph* graph,
+      std::unordered_map<Node*, OutputTensor>* merge_to_replacement);
 
   // Adds switch node that is part of this conditional.
   Status AddSwitch(Node* s);
@@ -640,7 +696,8 @@ Status Conditional::ExtractBodies(Graph* graph) {
 Status Conditional::BuildIfNode(Graph* graph,
                                 FunctionLibraryDefinition* library) {
   VLOG(2) << "Build cond function for " << name();
-  NodeDefBuilder builder(name(), "If", library);
+  NodeDebugInfo debug_info((*merges_.begin())->def());
+  NodeDefBuilder builder(name(), "If", library, &debug_info);
   const string branch_name[] = {"else_branch", "then_branch"};
   for (auto branch : {BranchType::kElseBranch, BranchType::kThenBranch}) {
     int branch_index = static_cast<int>(branch);
@@ -704,9 +761,9 @@ Status Conditional::BuildIfNode(Graph* graph,
   }
   builder.Device(predicate_.node->assigned_device_name());
   // Conditional should be the first input ...
-  builder.Input(NodeDefBuilder::NodeOut(predicate_.node->name(),
-                                        predicate_.index,
-                                        predicate_.node->output_type(0)));
+  builder.Input(
+      NodeDefBuilder::NodeOut(predicate_.node->name(), predicate_.index,
+                              predicate_.node->output_type(predicate_.index)));
   // ... followed by the other inputs.
   builder.Input(inputs);
 
@@ -719,12 +776,29 @@ Status Conditional::BuildIfNode(Graph* graph,
   return Status::OK();
 }
 
-Status Conditional::AddInputEdges(Graph* graph) {
+Status Conditional::AddInputEdges(
+    Graph* graph,
+    const std::unordered_map<Node*, OutputTensor>& merge_to_replacement) {
   VLOG(2) << "AddInputEdges for " << if_node_->name();
   int index = 0;
   // Add predicate input.
-  graph->AddEdge(const_cast<Node*>(predicate_.node), predicate_.index, if_node_,
-                 index++);
+  if (predicate_.node->IsMerge()) {
+    // If the predicate is a Merge node, we should not use Merge output as
+    // predicate. Instead, we should use the corresponding If output in
+    // 'merge_to_replacement'. Otherwise, this Conditional's If node is still
+    // connected to the predicate Merge node; and when we call
+    // DeleteReachableAndDeadNodes(), the predicate Merge node and this
+    // Conditional's If node will be removed.
+    auto iter = merge_to_replacement.find(predicate_.node);
+    if (iter == merge_to_replacement.end()) {
+      return errors::Internal("Cannot find replacement for Merge node ",
+                              predicate_.node->name());
+    }
+    graph->AddEdge(iter->second.node, iter->second.index, if_node_, index++);
+  } else {
+    graph->AddEdge(const_cast<Node*>(predicate_.node), predicate_.index,
+                   if_node_, index++);
+  }
   // Add function body inputs.
   for (auto& arg : cond_arg_nodes_) {
     if (arg.src_output == Graph::kControlSlot) {
@@ -739,7 +813,9 @@ Status Conditional::AddInputEdges(Graph* graph) {
   return Status::OK();
 }
 
-Status Conditional::AddOutputEdges(Graph* graph) {
+Status Conditional::AddOutputEdges(
+    Graph* graph,
+    std::unordered_map<Node*, OutputTensor>* merge_to_replacement) {
   VLOG(2) << "AddOutputEdges for " << if_node_->name();
   int i = 0;
   for (Node* node : merges_) {
@@ -763,6 +839,10 @@ Status Conditional::AddOutputEdges(Graph* graph) {
         graph->AddEdge(if_node_, i, dst, dst_input);
       }
     }
+
+    // Record corresponding output tensor in 'merge_to_replacement'.
+    (*merge_to_replacement)[node] = OutputTensor{if_node_, i};
+
     ++i;
   }
   for (Node* n : external_control_outputs_) {
@@ -772,8 +852,9 @@ Status Conditional::AddOutputEdges(Graph* graph) {
   return Status::OK();
 }
 
-Status Conditional::BuildAndReplace(Graph* graph,
-                                    FunctionLibraryDefinition* library) {
+Status Conditional::BuildAndReplace(
+    Graph* graph, FunctionLibraryDefinition* library,
+    std::unordered_map<Node*, OutputTensor>* merge_to_replacement) {
   VLOG(1) << "Build If and replace merge nodes "
           << NodesToString(this->merges_);
   if (replaced_) return Status::OK();
@@ -792,8 +873,8 @@ Status Conditional::BuildAndReplace(Graph* graph,
   }
 
   TF_RETURN_IF_ERROR(BuildIfNode(graph, library));
-  TF_RETURN_IF_ERROR(AddInputEdges(graph));
-  TF_RETURN_IF_ERROR(AddOutputEdges(graph));
+  TF_RETURN_IF_ERROR(AddInputEdges(graph, *merge_to_replacement));
+  TF_RETURN_IF_ERROR(AddOutputEdges(graph, merge_to_replacement));
   TF_RETURN_IF_ERROR(parent_->PropagateUpdatedState(if_node_));
 
   // Check that the if_node doesn't feed into itself.
@@ -935,6 +1016,10 @@ StatusOr<StateMap::CondId> FunctionalizeCond::JoinCondStatesMerge(
   VLOG(4) << "Joining (for merge) " << DebugString(src) << " and "
           << DebugString(dst);
   if (state_map_.IsEmpty(dst)) return src;
+  if (state_map_.IsEmpty(src)) {
+    return errors::Internal("Merge node ", merge->name(),
+                            " has input that's not in any CondContext.");
+  }
 
   if (state_map_.IsDead(src)) return src;
   if (state_map_.IsDead(dst)) return dst;
@@ -1169,8 +1254,17 @@ Status FunctionalizeCond::DetermineAncestorState(Node* dst) {
     if (other_id != id && other_id != nullptr) {
       state.insert(other_id->begin(), other_id->end());
     }
-    if (IsSwitch(src) || IsMerge(src)) {
-      state.insert(src);
+    if (IsMerge(src)) {
+      state.insert({{src, 0}, AncestorNode::AncestorNodeType::kMerge});
+    } else if (IsSwitch(src)) {
+      OutputTensor pred;
+      // For dead switch nodes, GetSwitchPredicate() will fail, and we use
+      // the switch node directly as ancestor.
+      if (GetSwitchPredicate(*src, &pred).ok()) {
+        state.insert({pred, AncestorNode::AncestorNodeType::kPred});
+      } else {
+        state.insert({{src, 0}, AncestorNode::AncestorNodeType::kSwitch});
+      }
     }
     return state_map_.GetAncestorId(state);
   };
@@ -1316,16 +1410,30 @@ Status FunctionalizeCond::FunctionalizeInternal() {
   // Sort the merge nodes from innermost outwards.
   SortMergeNodes(&merge_order);
 
-  // Cluster merge nodes by CondId and AncestorId in order of nesting.
-  using ClusterPair = std::pair<StateMap::CondId, StateMap::AncestorId>;
+  // Cluster merge nodes by (CondId, AncestorId, predicate) in order of
+  // nesting. (CondId, AncestorId) is not enough, e.g.
+  //   pred1 = array_ops.placeholder(dtypes.bool, name='pred1')
+  //   pred2 = array_ops.placeholder(dtypes.bool, name='pred2')
+  //   cond1 = control_flow_ops.cond(pred1, ...)
+  //   cond2 = control_flow_ops.cond(pred2, ...)
+  //   cond3 = control_flow_ops.cond(pred1, use cond1 and cond2)
+  //   cond4 = control_flow_ops.cond(pred2, use cond1 and cond2)
+  // cond3 and cond4 have the same (CondId, AncestorId), but they should not
+  // be merged into one "If" node (because they have different predicates).
   std::deque<std::vector<Node*>> merge_clusters;
-  std::map<ClusterPair, int> merge_cluster_index;
+  std::map<ClusterTuple, int, ClusterTupleLessThan> merge_cluster_index;
   for (Node* merge : merge_order) {
     auto cond_id = state_map_.LookupCondId(merge);
     if (state_map_.IsDead(cond_id)) continue;
 
-    ClusterPair key =
-        std::make_pair(cond_id, state_map_.LookupAncestorId(merge));
+    auto predicate = merge_to_predicate_.find(merge);
+    if (predicate == merge_to_predicate_.end()) {
+      return errors::Internal("Cannot find predicate for Merge node ",
+                              merge->name());
+    }
+
+    ClusterTuple key = std::make_tuple(
+        cond_id, state_map_.LookupAncestorId(merge), predicate->second);
     auto idx = merge_cluster_index.find(key);
     if (idx == merge_cluster_index.end()) {
       merge_cluster_index[key] = merge_clusters.size();
@@ -1344,7 +1452,8 @@ Status FunctionalizeCond::FunctionalizeInternal() {
     Conditional cond(merge_to_predicate_.at(cluster.front()), this,
                      &state_map_);
     for (Node* merge : cluster) TF_RETURN_IF_ERROR(cond.AddMerge(merge));
-    TF_RETURN_IF_ERROR(cond.BuildAndReplace(graph_, library_));
+    TF_RETURN_IF_ERROR(
+        cond.BuildAndReplace(graph_, library_, &merge_to_replacement_));
 
     if (VLOG_IS_ON(4)) DumpGraphWithCondState("after_extract");
   }
diff --git a/tensorflow/compiler/tf2xla/functionalize_cond.h b/tensorflow/compiler/tf2xla/functionalize_cond.h
index 8525d7af61b4471e53a9ae16b081060bfd234c9c..d85800fb8ee65a354716bf6601c6bc40eca9a10d 100644
--- a/tensorflow/compiler/tf2xla/functionalize_cond.h
+++ b/tensorflow/compiler/tf2xla/functionalize_cond.h
@@ -43,6 +43,33 @@ enum class BranchType {
   kNeither = 3,
 };
 
+// When we keep track of which switch/merge node's feed into a node, we record
+// 1) predicate for non-dead switch node,
+// 2) the switch node itself for dead switch node,
+// 3) the merge node itself for merge node.
+// Case 1) is an optimization. With this optimization, if there are nodes from
+// different switch nodes but those switch nodes have the same predicate, the
+// nodes will still have same AncestorState, and they will be clustered into a
+// single "If".
+struct AncestorNode {
+  enum class AncestorNodeType {
+    kPred = 0,
+    kSwitch = 1,
+    kMerge = 2,
+  };
+
+  OutputTensor output_tensor;
+  AncestorNodeType type;
+
+  // Compare two AncestorNodes by (node id, index, type).
+  bool operator<(const AncestorNode& other) const;
+  bool operator==(const AncestorNode& other) const;
+
+  struct Hash {
+    size_t operator()(const AncestorNode&) const;
+  };
+};
+
 // StateMap is responsible for mapping from each graph Node to
 // * a CondState, where each CondState is a map from predicate to branch (i,e.,
 //   what predicates have to hold or not hold).
@@ -68,7 +95,7 @@ class StateMap {
   using CondId = const CondState*;
 
   // Keep track of which switch/merge node's feed into a node's values.
-  using AncestorState = std::set<Node*>;
+  using AncestorState = std::set<AncestorNode>;
 
   // Every unique ID is mapped to a AncestorState.
   using AncestorId = const AncestorState*;
@@ -232,6 +259,9 @@ class FunctionalizeCond {
   // Mapping from merge nodes to predicate.
   std::unordered_map<Node*, OutputTensor> merge_to_predicate_;
 
+  // Mapping from merge nodes to corresponding If node outputs.
+  std::unordered_map<Node*, OutputTensor> merge_to_replacement_;
+
   FunctionLibraryDefinition* library_;
   Graph* graph_;
 
diff --git a/tensorflow/compiler/tf2xla/functionalize_cond_test.cc b/tensorflow/compiler/tf2xla/functionalize_cond_test.cc
index b0aabd63bbda784b3b7103a438ce025eea0cd93b..05fa1ee92dc172bd11cec9f99e3884996e00791f 100644
--- a/tensorflow/compiler/tf2xla/functionalize_cond_test.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_cond_test.cc
@@ -101,6 +101,17 @@ TEST_F(FunctionalizeCondTest, JoinCondStates) {
   TF_EXPECT_OK(t.status());
 }
 
+TEST_F(FunctionalizeCondTest, JoinCondStatesMergeWithInputNotInCondContext) {
+  Tensor val_tensor(DT_INT32, TensorShape());
+  val_tensor.flat<int>().setZero();
+  Node* val = test::graph::Constant(graph_.get(), val_tensor, "val");
+  Node* m = test::graph::Merge(graph_.get(), val, val);
+
+  StateMap::CondState cond_state;
+  auto joined_or = JoinCondStatesMerge(m, /*src=*/nullptr, &cond_state);
+  EXPECT_FALSE(joined_or.ok());
+}
+
 }  // namespace
 }  // namespace functionalize_cond
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/fused_batchnorm_reserve_space_test.cc b/tensorflow/compiler/tf2xla/fused_batchnorm_reserve_space_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4535ece374ceb801e450af98a21d5a4c5e8f2a29
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/fused_batchnorm_reserve_space_test.cc
@@ -0,0 +1,130 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <string>
+#include <vector>
+
+#include "absl/algorithm/container.h"
+#include "tensorflow/cc/ops/array_ops.h"
+#include "tensorflow/cc/ops/const_op.h"
+#include "tensorflow/cc/ops/nn_ops.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session.h"
+
+namespace tensorflow {
+namespace {
+Status GetTestDevice(Session* session, string* test_device) {
+  std::vector<DeviceAttributes> devices;
+  TF_RETURN_IF_ERROR(session->ListDevices(&devices));
+
+  bool found_cpu = absl::c_any_of(devices, [&](const DeviceAttributes& device) {
+    return device.device_type() == "CPU";
+  });
+
+  bool found_gpu = absl::c_any_of(devices, [&](const DeviceAttributes& device) {
+    return device.device_type() == "GPU";
+  });
+
+  if (!found_gpu && !found_cpu) {
+    return errors::Internal("Expected at least one CPU or GPU!");
+  }
+
+  *test_device = found_gpu ? "GPU" : "CPU";
+  VLOG(2) << "Using test device " << *test_device;
+  return Status::OK();
+}
+
+void FillZeros(Tensor* tensor) {
+  auto flat = tensor->flat<float>();
+  for (int i = 0; i < flat.size(); i++) {
+    flat.data()[i] = 0.0f;
+  }
+}
+
+// This tests check that the implementation outputs from FusedBatchnorm
+// training, reserve_space_{1|2}, are what we assume them to be in the TF/XLA
+// lowering.
+//
+// If this test starts failing then it doesn't indicate that TF/cudnn have
+// violated their contract, but it indicates that we need to update the TF/XLA
+// lowering for FusedBatchnorm training to match the new implementation defined
+// behavior.
+TEST(FusedBatchnormReserveSpaceTest, Test) {
+  using ::tensorflow::ops::Const;
+  using ::tensorflow::ops::FusedBatchNorm;
+
+  std::unique_ptr<tensorflow::Session> session(
+      tensorflow::NewSession(tensorflow::SessionOptions{}));
+
+  string test_device;
+  TF_ASSERT_OK(GetTestDevice(session.get(), &test_device));
+
+  Scope root = tensorflow::Scope::NewRootScope();
+  Output input = ops::Placeholder(root.WithOpName("input"), DT_FLOAT);
+
+  Tensor scale_data(DT_FLOAT, TensorShape({10}));
+  FillZeros(&scale_data);
+  Output scale =
+      Const(root.WithOpName("scale"), Input::Initializer(scale_data));
+
+  Tensor offset_data(DT_FLOAT, TensorShape({10}));
+  FillZeros(&offset_data);
+  Output offset =
+      Const(root.WithOpName("offset"), Input::Initializer(offset_data));
+
+  Tensor mean_data(DT_FLOAT, TensorShape({0}));
+  Output mean = Const(root.WithOpName("offset"), Input::Initializer(mean_data));
+
+  Tensor variance_data(DT_FLOAT, TensorShape({0}));
+  Output variance =
+      Const(root.WithOpName("variance"), Input::Initializer(variance_data));
+
+  string tf_device = absl::StrCat("/device:", test_device, ":0");
+  string xla_device = absl::StrCat("/device:XLA_", test_device, ":0");
+
+  FusedBatchNorm fused_batch_norm_tf(
+      root.WithOpName("fused_batch_norm_tf").WithDevice(tf_device), input,
+      scale, offset, mean, variance, FusedBatchNorm::Attrs{}.IsTraining(true));
+  FusedBatchNorm fused_batch_norm_xla(
+      root.WithOpName("fused_batch_norm_xla").WithDevice(xla_device), input,
+      scale, offset, mean, variance, FusedBatchNorm::Attrs{}.IsTraining(true));
+
+  tensorflow::GraphDef graph;
+  TF_ASSERT_OK(root.ToGraphDef(&graph));
+
+  TF_ASSERT_OK(session->Create(graph));
+
+  Tensor input_data(DT_FLOAT, TensorShape({10, 10, 10, 10}));
+  auto flat_input = input_data.flat<float>();
+  for (int i = 0; i < flat_input.size(); i++) {
+    flat_input.data()[i] = (i - 5) / 1000.0f;
+  }
+
+  std::vector<Tensor> results;
+  TF_ASSERT_OK(session->Run({{"input", input_data}},
+                            {fused_batch_norm_tf.reserve_space_1.name(),
+                             fused_batch_norm_xla.reserve_space_1.name(),
+                             fused_batch_norm_tf.reserve_space_2.name(),
+                             fused_batch_norm_xla.reserve_space_2.name()},
+                            {}, &results));
+
+  test::ExpectClose(results[0], results[1], /*atol=*/1e-4);
+  test::ExpectClose(results[2], results[3], /*atol=*/1e-4);
+}
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/graph_compiler.cc b/tensorflow/compiler/tf2xla/graph_compiler.cc
index efb75749722893100494e089c0beb96944e9f1d4..5e4699bbb6218089d2e76a36c7351bf7fbd23264 100644
--- a/tensorflow/compiler/tf2xla/graph_compiler.cc
+++ b/tensorflow/compiler/tf2xla/graph_compiler.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/dump_graph.h"
 #include "tensorflow/compiler/tf2xla/literal_util.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/side_effect_util.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
 #include "tensorflow/compiler/tf2xla/xla_context.h"
@@ -88,6 +89,9 @@ Status PrepareArguments(XlaOpKernelContext* ctx, Graph* graph,
       case XlaExpression::Kind::kResource:
         return errors::Unimplemented(
             "Resource as function argument is not yet implemented.");
+      case XlaExpression::Kind::kTensorList:
+        return errors::Unimplemented(
+            "TensorList as function argument is not yet implemented.");
       case XlaExpression::Kind::kInvalid:
         return errors::InvalidArgument("Invalid function argument");
     }
@@ -191,6 +195,9 @@ Status GraphCompiler::CompileFunctionalNode(Node* n,
   // into the functions.
   XlaOpKernelContext xla_op_context(op_context);
 
+  XlaContext& context = XlaContext::Get(op_context);
+  auto* b = context.builder();
+
   XlaCompiler* compiler = xla_op_context.compiler();
 
   NameAttrList func;
@@ -219,8 +226,12 @@ Status GraphCompiler::CompileFunctionalNode(Node* n,
   TF_RETURN_IF_ERROR(
       PrepareArguments(&xla_op_context, graph.get(), expressions, &arguments));
 
+  bool add_token_input_output =
+      HasNodeAttr(n->def(), kXlaTokenInputNodesAttrName);
+
   XlaCompiler::CompileOptions compile_options;
   compile_options.is_entry_computation = false;
+  compile_options.add_token_input_output = add_token_input_output;
   XlaCompiler::CompilationResult result;
   TF_RETURN_IF_ERROR(
       compiler->CompileFunction(compile_options, func, arguments, &result));
@@ -234,9 +245,19 @@ Status GraphCompiler::CompileFunctionalNode(Node* n,
     }
     handles.push_back(expressions[i]->handle());
   }
-
-  XlaContext& context = XlaContext::Get(op_context);
-  auto* b = context.builder();
+  if (add_token_input_output) {
+    std::vector<string> token_input_nodes;
+    TF_RETURN_IF_ERROR(
+        GetNodeAttr(n->def(), kXlaTokenInputNodesAttrName, &token_input_nodes));
+    std::vector<xla::XlaOp> token_inputs;
+    for (const string& node_name : token_input_nodes) {
+      auto token_or = compiler->GetNodeToken(node_name);
+      TF_RETURN_IF_ERROR(token_or.status());
+      token_inputs.push_back(token_or.ConsumeValueOrDie());
+    }
+    xla::XlaOp token_input = xla::AfterAll(b, token_inputs);
+    handles.push_back(token_input);
+  }
 
   auto output_handle = xla::Call(b, *result.computation, handles);
   // The output handle of `Call` computation is a tuple type. Unzip it so
@@ -251,6 +272,10 @@ Status GraphCompiler::CompileFunctionalNode(Node* n,
       ++computation_output;
     }
   }
+  if (add_token_input_output) {
+    TF_RETURN_IF_ERROR(compiler->SetNodeToken(
+        n->name(), xla::GetTupleElement(output_handle, computation_output)));
+  }
   return b->first_error();
 }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index 8bc329229648c5aced8d06c99b170803bb3a90f8..343568b2392595a2347bde41f0a2e2559fb1de19 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -1,16 +1,11 @@
+load("//tensorflow:tensorflow.bzl", "tf_copts", "tf_kernel_library")
+
 licenses(["notice"])  # Apache 2.0
 
 package(
     default_visibility = ["//tensorflow/compiler/tf2xla:internal"],
 )
 
-load("//tensorflow:tensorflow.bzl", "tf_copts")
-load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
-load(
-    "//third_party/mkl:build_defs.bzl",
-    "if_mkl",
-)
-
 tf_kernel_library(
     name = "xla_ops",
     srcs = [
@@ -39,6 +34,7 @@ tf_kernel_library(
         "dynamic_slice_ops.cc",
         "dynamic_stitch_op.cc",
         "elu_op.cc",
+        "empty_op.cc",
         "extract_image_patches_op.cc",
         "fake_param_op.cc",
         "fake_quantize_ops.cc",
@@ -106,15 +102,18 @@ tf_kernel_library(
         "variable_ops.cc",
         "xla_broadcast_helper_op.cc",
         "xla_conv_op.cc",
+        "xla_dequantize_op.cc",
         "xla_dot_op.cc",
         "xla_pad_op.cc",
         "xla_reduce_op.cc",
         "xla_select_and_scatter_op.cc",
+        "xla_self_adjoint_eig_op.cc",
     ],
     hdrs = [
         "index_ops.h",
         "shape_util.h",
     ],
+    tags = ["optonly"],
     deps = [
         ":conv_op_helpers",
         ":if_op",
@@ -122,12 +121,9 @@ tf_kernel_library(
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/tf2xla/lib:broadcast",
-        "//tensorflow/compiler/tf2xla/lib:cholesky",
-        "//tensorflow/compiler/tf2xla/lib:qr",
         "//tensorflow/compiler/tf2xla/lib:random",
         "//tensorflow/compiler/tf2xla/lib:scatter",
         "//tensorflow/compiler/tf2xla/lib:util",
-        "//tensorflow/compiler/tf2xla/lib:while_loop",
         "//tensorflow/compiler/tf2xla/ops:xla_ops",
         "//tensorflow/compiler/xla:array4d",
         "//tensorflow/compiler/xla:literal",
@@ -140,20 +136,38 @@ tf_kernel_library(
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/lib:cholesky",
         "//tensorflow/compiler/xla/client/lib:constants",
+        "//tensorflow/compiler/xla/client/lib:loops",
         "//tensorflow/compiler/xla/client/lib:math",
         "//tensorflow/compiler/xla/client/lib:matrix",
         "//tensorflow/compiler/xla/client/lib:pooling",
         "//tensorflow/compiler/xla/client/lib:prng",
+        "//tensorflow/compiler/xla/client/lib:qr",
+        "//tensorflow/compiler/xla/client/lib:quantize",
+        "//tensorflow/compiler/xla/client/lib:self_adjoint_eig",
         "//tensorflow/compiler/xla/client/lib:sorting",
-        "//tensorflow/compiler/xla/client/lib:triangular_solve",
+        "//tensorflow/core:bitwise_ops_op_lib",
+        "//tensorflow/core:control_flow_ops_op_lib",
+        "//tensorflow/core:data_flow_ops_op_lib",
         "//tensorflow/core:framework",
+        "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core:image_ops_op_lib",
         "//tensorflow/core:lib",
         "//tensorflow/core:linalg_ops_op_lib",
+        "//tensorflow/core:list_ops_op_lib",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:nn_ops_op_lib",
+        "//tensorflow/core:no_op_op_lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:random_ops_op_lib",
+        "//tensorflow/core:resource_variable_ops_op_lib",
+        "//tensorflow/core:sendrecv_ops_op_lib",
+        "//tensorflow/core:sparse_ops_op_lib",
         "//tensorflow/core:spectral_ops_op_lib",
+        "//tensorflow/core:state_ops_op_lib",
         "//tensorflow/core:stateless_random_ops_op_lib",
+        "//tensorflow/core:training_ops_op_lib",
         "//tensorflow/core/kernels:bounds_check",
         "//tensorflow/core/kernels:concat_lib",
         "//tensorflow/core/kernels:constant_op",
diff --git a/tensorflow/compiler/tf2xla/kernels/arg_op.cc b/tensorflow/compiler/tf2xla/kernels/arg_op.cc
index 795ea09831e183a26fb3498b9bbaf9c3adaef9ed..5554d7a377d38554058aa731770ee10e400bc535 100644
--- a/tensorflow/compiler/tf2xla/kernels/arg_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/arg_op.cc
@@ -53,7 +53,11 @@ class XlaArgOp : public XlaOpKernel {
     const XlaExpression& arg = ctx->xla_context()->args()[index_];
     OP_REQUIRES(ctx, arg.kind() != XlaExpression::Kind::kInvalid,
                 errors::InvalidArgument("Invalid/missing argument expression"));
-    ctx->SetOutputExpression(0, arg);
+    if (ctx->expected_output_dtype(0) == DT_VARIANT) {
+      ctx->SetTensorListOutput(0, arg.handle());
+    } else {
+      ctx->SetOutputExpression(0, arg);
+    }
   }
 
  private:
@@ -63,6 +67,8 @@ class XlaArgOp : public XlaOpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(XlaArgOp);
 };
 
-REGISTER_XLA_OP(Name("_Arg").AllowResourceTypes().CompilationOnly(), XlaArgOp);
+REGISTER_XLA_OP(
+    Name("_Arg").AllowResourceTypes().AllowVariantTypes().CompilationOnly(),
+    XlaArgOp);
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
index 0e2f335f3354e3ae6008bdc0ac0b80683fe479c1..f1d78c87527eb5f818dcf92209feabe33653a625 100644
--- a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
@@ -18,6 +18,8 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/lib/math.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/util/tensor_format.h"
 
@@ -34,6 +36,7 @@ class FusedBatchNormOp : public XlaOpKernel {
     OP_REQUIRES(
         ctx, FormatFromString(data_format_str, &data_format_),
         errors::InvalidArgument("Invalid data format: ", data_format_str));
+    is_on_gpu_ = ctx->device_type().type_string() == DEVICE_GPU_XLA_JIT;
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
@@ -71,7 +74,18 @@ class FusedBatchNormOp : public XlaOpKernel {
       // variance to the gradient. Here we maintain the same behavior by setting
       // them to the mean and variance calculated by BatchNormTraining.
       ctx->SetOutput(3, xla::GetTupleElement(output, 1));
-      ctx->SetOutput(4, xla::GetTupleElement(output, 2));
+      if (is_on_gpu_) {
+        // The last two outputs from the FusedBatchNorm training TensorFlow GPU
+        // op are implementation defined.  For now we rely on the in-practice
+        // behavior of the op:
+        //   output 3 is the mean
+        //   output 4 is rsqrt(variance + epsilon)
+        xla::XlaOp variance = xla::GetTupleElement(output, 2);
+        ctx->SetOutput(4, xla::Rsqrt(xla::Add(
+                              variance, xla::ScalarLike(variance, epsilon_))));
+      } else {
+        ctx->SetOutput(4, xla::GetTupleElement(output, 2));
+      }
     } else {
       xla::XlaOp output = xla::BatchNormInference(
           input, ctx->Input(1), ctx->Input(2), ctx->Input(3), ctx->Input(4),
@@ -89,6 +103,7 @@ class FusedBatchNormOp : public XlaOpKernel {
   float epsilon_;
   TensorFormat data_format_;
   bool is_training_;
+  bool is_on_gpu_;
 };
 
 REGISTER_XLA_OP(Name("FusedBatchNorm"), FusedBatchNormOp);
@@ -104,6 +119,7 @@ class FusedBatchNormGradOp : public XlaOpKernel {
     OP_REQUIRES(
         ctx, FormatFromString(data_format_str, &data_format_),
         errors::InvalidArgument("Invalid data format: ", data_format_str));
+    is_on_gpu_ = ctx->device_type().type_string() == DEVICE_GPU_XLA_JIT;
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
@@ -130,6 +146,22 @@ class FusedBatchNormGradOp : public XlaOpKernel {
     xla::XlaOp scale_backprop;
     xla::XlaOp offset_backprop;
     if (is_training_) {
+      if (is_on_gpu_) {
+        // The last two inputs to the FusedBatchNormGrad training TensorFlow GPU
+        // op are implementation defined.  For now we rely on the in-practice
+        // behavior of the op: input 3 is the mean input 4 is rsqrt(variance +
+        // epsilon)
+        //
+        // The XLA op expects:
+        //   input 3 is the mean
+        //   input 4 is the variance
+        //
+        // so we adjust input 4 here.
+        xla::XlaOp one = xla::ScalarLike(var, 1.0f);
+        xla::XlaOp epsilon = xla::ScalarLike(var, epsilon_);
+        var = xla::Sub(one / (var * var), epsilon);
+      }
+
       xla::XlaOp output =
           xla::BatchNormGrad(activations, scale, mean, var, grad_backprop,
                              epsilon_, feature_index);
@@ -158,9 +190,8 @@ class FusedBatchNormGradOp : public XlaOpKernel {
       offset_backprop = XlaHelpers::ConvertElementType(reduce, scale_dtype);
 
       // scratch1 = rsqrt(pop_var + epsilon)
-      auto neg_half = XlaHelpers::FloatLiteral(b, scale_dtype, -0.5);
-      auto scratch1 = xla::Pow(
-          xla::Add(var, xla::ConstantR0<float>(b, epsilon_)), neg_half);
+      auto epsilon = XlaHelpers::FloatLiteral(b, scale_dtype, epsilon_);
+      auto scratch1 = xla::Rsqrt(xla::Add(var, epsilon));
 
       // scratch2 = sum(y_backprop * (x - mean))
       auto mul =
@@ -187,6 +218,7 @@ class FusedBatchNormGradOp : public XlaOpKernel {
   TensorFormat data_format_;
   float epsilon_;
   bool is_training_;
+  bool is_on_gpu_;
 };
 
 REGISTER_XLA_OP(Name("FusedBatchNormGrad"), FusedBatchNormGradOp);
diff --git a/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc b/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc
index 46e5d68c78fd9ff26a88dc2a1484c3a67b76f4f3..6b675fa8a94e0bc932baaa359565cbc8e4614ee5 100644
--- a/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc
@@ -39,7 +39,7 @@ void BatchToSpace(XlaOpKernelContext* ctx, const xla::XlaOp& input,
 
   OP_REQUIRES(
       ctx,
-      xla::ShapeUtil::Rank(crops.shape()) == 2 &&
+      crops.shape().rank() == 2 &&
           block_rank == xla::ShapeUtil::GetDimension(crops.shape(), 0) &&
           2 == xla::ShapeUtil::GetDimension(crops.shape(), 1),
       errors::InvalidArgument("crops should have shape [", block_rank,
diff --git a/tensorflow/compiler/tf2xla/kernels/bias_ops.cc b/tensorflow/compiler/tf2xla/kernels/bias_ops.cc
index e7f369b761f36a717ea5fb536780af91a8955b1e..33bdf9aec3167b0277f3c1db18c9e247ed9bb5d1 100644
--- a/tensorflow/compiler/tf2xla/kernels/bias_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bias_ops.cc
@@ -48,8 +48,11 @@ class BiasOp : public XlaOpKernel {
     OP_REQUIRES(ctx, TensorShapeUtils::IsVector(bias_shape),
                 errors::InvalidArgument("Biases must be 1D: ",
                                         bias_shape.DebugString()));
-    int feature_dim = (data_format_ == FORMAT_NHWC) ? input_shape.dims() - 1
-                                                    : input_shape.dims() - 3;
+
+    // feature_dim is the channel (C) dimension of the data.
+    int feature_dim = (data_format_ == FORMAT_NHWC)
+                          ? input_shape.dims() - 1
+                          : /*data_format == FORMAT_NCHW*/ 1;
     OP_REQUIRES(
         ctx, feature_dim >= 0,
         errors::InvalidArgument("Input tensor does not have enough dimensions "
@@ -91,9 +94,10 @@ class BiasAddGradOp : public XlaOpKernel {
                 errors::InvalidArgument("Input tensor must be at least 2D: ",
                                         out_backprop_shape.DebugString()));
 
+    // feature_dim is the channel (C) dimension of the data.
     int feature_dim = (data_format_ == FORMAT_NHWC)
                           ? out_backprop_shape.dims() - 1
-                          : out_backprop_shape.dims() - 3;
+                          : /*data_format == FORMAT_NCHW*/ 1;
     OP_REQUIRES(
         ctx, feature_dim >= 0,
         errors::InvalidArgument("Input tensor does not have enough dimensions "
diff --git a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc
index 5e9280c1fe692037b0a842a92ef5a8c28b854a54..ad6b334326a470442c8c0d79b725345d4165be10 100644
--- a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc
@@ -20,7 +20,9 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/lib/math.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/types.h"
@@ -165,12 +167,8 @@ XLA_MAKE_BINARY(
     xla::Div(xla::Mul(rhs, XlaHelpers::FloatLiteral(b, input_type(0), 0.5)),
              lhs, extend_dimensions));
 
-static xla::XlaOp Square(xla::XlaBuilder* builder, const xla::XlaOp& x) {
-  return xla::Mul(x, x);
-}
-
 XLA_MAKE_BINARY(SquaredDifference,
-                Square(b, xla::Sub(lhs, rhs, extend_dimensions)));
+                xla::Square(xla::Sub(lhs, rhs, extend_dimensions)));
 
 XLA_MAKE_BINARY(TruncateDiv, xla::Div(lhs, rhs, extend_dimensions));
 XLA_MAKE_BINARY(TruncateMod, xla::Rem(lhs, rhs, extend_dimensions));
@@ -195,8 +193,8 @@ XLA_MAKE_BINARY(SoftplusGrad,
 // softsigngrad(gradients, features) = gradients / (1 + abs(features)) ** 2
 XLA_MAKE_BINARY(SoftsignGrad,
                 xla::Div(lhs,
-                         Square(b, xla::Add(XlaHelpers::One(b, input_type(0)),
-                                            xla::Abs(rhs)))));
+                         xla::Square(xla::Add(XlaHelpers::One(b, input_type(0)),
+                                              xla::Abs(rhs)))));
 
 XLA_MAKE_BINARY(TanhGrad,
                 xla::Mul(rhs, xla::Sub(XlaHelpers::One(b, input_type(0)),
@@ -204,6 +202,8 @@ XLA_MAKE_BINARY(TanhGrad,
 
 XLA_MAKE_BINARY(Pow, xla::Pow(lhs, rhs, extend_dimensions));
 
+XLA_MAKE_BINARY(NextAfter, xla::NextAfter(lhs, rhs));
+
 #undef XLA_MAKE_BINARY
 
 class ApproximateEqualOp : public XlaOpKernel {
diff --git a/tensorflow/compiler/tf2xla/kernels/cast_op.cc b/tensorflow/compiler/tf2xla/kernels/cast_op.cc
index 8cc2479dd555380da7500abe6b2aca380110333b..ca2152d6c103e05c06809d85d9529720ff112217 100644
--- a/tensorflow/compiler/tf2xla/kernels/cast_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/cast_op.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/compiler/tf2xla/lib/util.h"
 
 #include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
@@ -19,6 +20,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 
 namespace tensorflow {
@@ -31,6 +33,7 @@ class CastOp : public XlaOpKernel {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("DstT", &dst_dtype_));
     OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(src_dtype_, &src_type_));
     OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(dst_dtype_, &dst_type_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("Truncate", &use_truncation_));
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
@@ -48,6 +51,36 @@ class CastOp : public XlaOpKernel {
       // imaginary part.
       output = xla::ConvertElementType(xla::Real(input), dst_type_);
     } else {
+      if (use_truncation_) {
+        OP_REQUIRES(
+            ctx,
+            xla::primitive_util::IsFloatingPointType(src_type_) &&
+                xla::primitive_util::IsFloatingPointType(dst_type_),
+            errors::Unimplemented("Truncate attribute is only "
+                                  "implemented for floating point datatypes."));
+        int mantissa_difference =
+            xla::primitive_util::SignificandWidth(src_type_) -
+            xla::primitive_util::SignificandWidth(dst_type_);
+        OP_REQUIRES(ctx, mantissa_difference > 0,
+                    errors::Unimplemented(
+                        "Truncate attribute is only implemented in cases where "
+                        "dst datatype "
+                        "has fewer mantissa bits than the src datatype"));
+        int src_bitwidth = xla::primitive_util::BitWidth(src_type_);
+
+        // Bitcast to same-width integer, mask off the LSBs, bitcast back to the
+        // source datatype.
+        int64 mask = ~((1L << mantissa_difference) - 1);
+        xla::PrimitiveType same_width_int =
+            xla::primitive_util::UnsignedIntegralTypeForBitWidth(src_bitwidth);
+        OP_REQUIRES(ctx, same_width_int != xla::PRIMITIVE_TYPE_INVALID,
+                    errors::Unimplemented("Unexpected type bitwidth"));
+        input = xla::BitcastConvertType(
+            xla::And(
+                xla::BitcastConvertType(input, same_width_int),
+                ::tensorflow::IntegerLiteral(builder, same_width_int, mask)),
+            src_type_);
+      }
       output = xla::ConvertElementType(input, dst_type_);
     }
 
@@ -57,6 +90,7 @@ class CastOp : public XlaOpKernel {
  protected:
   DataType src_dtype_, dst_dtype_;
   xla::PrimitiveType src_type_, dst_type_;
+  bool use_truncation_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(CastOp);
 };
@@ -79,8 +113,8 @@ class BitcastOp : public XlaOpKernel {
     if (src_dtype_ == dst_dtype_) {
       output = input;
     } else {
-      // The only complex type in XLA is C64, so error out if the bitcast has a
-      // complex source or destination type and the bitcast is not trivial.
+      // Error out if the bitcast has a complex source or destination type and
+      // the bitcast is not trivial.
       OP_REQUIRES(ctx,
                   !xla::primitive_util::IsComplexType(src_type_) &&
                       !xla::primitive_util::IsComplexType(dst_type_),
diff --git a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
index 7199b9b6feb36dd45ef51f4c38463bc715fcc38a..a99c6ee4431852166eec0a71bb7ad74fd5c135d9 100644
--- a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/lib/prng.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -99,8 +100,8 @@ class CategoricalOp : public XlaOpKernel {
     xla::PrimitiveType xla_output_type;
     OP_REQUIRES_OK(ctx,
                    DataTypeToPrimitiveType(output_type(0), &xla_output_type));
-    xla::XlaOp argmax = XlaHelpers::ArgMax(softmax_entries, xla_output_type,
-                                           /*axis=*/class_dimension);
+    xla::XlaOp argmax = xla::ArgMax(softmax_entries, xla_output_type,
+                                    /*axis=*/class_dimension);
     if (num_samples == 1) {
       argmax = xla::Reshape(argmax, {batch_size, 1});
     }
@@ -112,9 +113,12 @@ class CategoricalOp : public XlaOpKernel {
                                     xla::PrimitiveType type,
                                     XlaOpKernelContext* ctx) {
     xla::XlaBuilder* builder = ctx->builder();
-    auto uniforms =
-        xla::RngUniform(XlaHelpers::Zero(builder, input_type(0)),
-                        XlaHelpers::One(builder, input_type(0)), uniform_shape);
+    // We want a number in (0, 1) rather than [0, 1) or (0, 1]:
+    // * log(-log(0)) is ∞.
+    // * log(-log(1)) is -∞.
+    auto uniforms = xla::RngUniform(
+        xla::MinPositiveNormalValue(builder, type),
+        xla::One(builder, uniform_shape.element_type()), uniform_shape);
     return xla::Log(-xla::Log(uniforms));
   }
 
@@ -143,9 +147,13 @@ class StatelessCategoricalOp : public CategoricalOp {
     if (uniform_shape.element_type() == xla::BF16) {
       uniform_shape.set_element_type(xla::F32);
     }
+    // We want a number in (0, 1) rather than [0, 1) or (0, 1]:
+    // * log(-log(0)) is ∞.
+    // * log(-log(1)) is -∞.
     auto uniforms = xla::StatelessRngUniform(
-        {seed0, seed1}, uniform_shape, XlaHelpers::Zero(builder, DT_FLOAT),
-        XlaHelpers::One(builder, DT_FLOAT));
+        {seed0, seed1}, uniform_shape,
+        xla::MinPositiveNormalValue(builder, uniform_shape.element_type()),
+        xla::One(builder, uniform_shape.element_type()));
     return xla::ConvertElementType(xla::Log(-xla::Log(uniforms)), type);
   }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/cholesky_op.cc b/tensorflow/compiler/tf2xla/kernels/cholesky_op.cc
index 9fcbc86adc0967cbb7fb73da8bdabc58b60953da..0ed3044efa5b1060d2b0ad2d5563b0e02ebf66ec 100644
--- a/tensorflow/compiler/tf2xla/kernels/cholesky_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/cholesky_op.cc
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/tf2xla/lib/cholesky.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/lib/cholesky.h"
 
 namespace tensorflow {
 namespace {
@@ -24,7 +24,7 @@ class CholeskyOp : public XlaOpKernel {
  public:
   explicit CholeskyOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
   void Compile(XlaOpKernelContext* ctx) override {
-    ctx->SetOutput(0, Cholesky(ctx->Input(0)));
+    ctx->SetOutput(0, xla::Cholesky(ctx->Input(0)));
   }
 };
 
diff --git a/tensorflow/compiler/tf2xla/kernels/concat_op.cc b/tensorflow/compiler/tf2xla/kernels/concat_op.cc
index cd7c7f4a82df7a65829787efcb1fd2f77870e945..91e4d9cea7cbf6075e30250587044174c4b8e7f4 100644
--- a/tensorflow/compiler/tf2xla/kernels/concat_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/concat_op.cc
@@ -24,13 +24,13 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/const_op.cc b/tensorflow/compiler/tf2xla/kernels/const_op.cc
index dff8af800229b9605bb93e0498bc5e5cf012f244..ff6c54e47c62f0555ef045e25051f6ec5a3c1d39 100644
--- a/tensorflow/compiler/tf2xla/kernels/const_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/const_op.cc
@@ -83,6 +83,17 @@ class ConstOp : public XlaOpKernel {
             return;
           }
           break;
+        case DT_COMPLEX128:
+          if (proto_.scomplex_val_size() == 2) {
+            ctx->SetOutput(
+                0,
+                xla::Broadcast(xla::ConstantR0<xla::complex128>(
+                                   b, xla::complex128(proto_.dcomplex_val(0),
+                                                      proto_.dcomplex_val(1))),
+                               shape.dim_sizes()));
+            return;
+          }
+          break;
         case DT_INT32:
           if (proto_.int_val_size() == 1) {
             ctx->SetOutput(
diff --git a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc
index 641fefafb357f6ad10483c454600f3dadd4f8cb7..e8b270c67a23b876612ab1dba92a8ae7a46a392d 100644
--- a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc
+++ b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc
@@ -26,13 +26,13 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/conv_grad_ops.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/util/padding.h"
@@ -203,7 +203,8 @@ Status ConvBackpropComputeDimensionsV2XlaShapes(
     StringPiece label, int num_spatial_dims, const xla::Shape& input_shape,
     const xla::Shape& filter_shape, const xla::Shape& out_backprop_shape,
     absl::Span<const int32> dilations, const std::vector<int32>& strides,
-    Padding padding, TensorFormat data_format, ConvBackpropDimensions* dims) {
+    Padding padding, TensorFormat data_format, ConvBackpropDimensions* dims,
+    absl::Span<const int64> explicit_paddings) {
   TensorShape input_tensor_shape, filter_tensor_shape,
       out_backprop_tensor_shape;
   TF_RETURN_IF_ERROR(XLAShapeToTensorShape(input_shape, &input_tensor_shape));
@@ -212,8 +213,8 @@ Status ConvBackpropComputeDimensionsV2XlaShapes(
       XLAShapeToTensorShape(out_backprop_shape, &out_backprop_tensor_shape));
   return ConvBackpropComputeDimensionsV2(
       label, num_spatial_dims, input_tensor_shape, filter_tensor_shape,
-      out_backprop_tensor_shape, dilations, strides, padding, data_format,
-      dims);
+      out_backprop_tensor_shape, dilations, strides, padding, explicit_paddings,
+      data_format, dims);
 }
 
 }  // anonymous namespace
@@ -227,6 +228,10 @@ xla::StatusOr<ConvOpAttrs> ConvOpAttrs::Create(int num_spatial_dims,
   TF_RETURN_IF_ERROR(ctx->GetAttr("dilations", &attrs.dilations));
   TF_RETURN_IF_ERROR(ctx->GetAttr("strides", &attrs.strides));
   TF_RETURN_IF_ERROR(ctx->GetAttr("padding", &attrs.padding));
+  if (attrs.padding == EXPLICIT) {
+    TF_RETURN_IF_ERROR(
+        ctx->GetAttr("explicit_paddings", &attrs.explicit_paddings));
+  }
 
   string data_format;
   TF_RETURN_IF_ERROR(ctx->GetAttr("data_format", &data_format));
@@ -298,6 +303,11 @@ xla::StatusOr<xla::XlaOp> MakeXlaForwardConvOp(StringPiece /*type_string*/,
     window_strides[i] = attrs.strides.at(dim);
     rhs_dilation[i] = attrs.dilations.at(dim);
 
+    if (attrs.padding == EXPLICIT) {
+      padding[i] = {attrs.explicit_paddings.at(dim * 2),
+                    attrs.explicit_paddings.at(dim * 2 + 1)};
+    }
+
     int64 unused_output_size;
     TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerboseV2(
         input_shape.dimensions(dim), filter_shape.dimensions(i),
@@ -332,7 +342,7 @@ xla::StatusOr<xla::XlaOp> MakeXlaBackpropInputConvOp(
   TF_RETURN_IF_ERROR(ConvBackpropComputeDimensionsV2XlaShapes(
       type_string, attrs.num_spatial_dims, input_shape, expanded_filter_shape,
       out_backprop_shape, attrs.dilations, attrs.strides, attrs.padding,
-      attrs.data_format, &dims));
+      attrs.data_format, &dims, attrs.explicit_paddings));
 
   // The input gradients are computed by a convolution of the output
   // gradients and the filter, with some appropriate padding. See the
@@ -392,23 +402,31 @@ xla::StatusOr<xla::XlaOp> MakeXlaBackpropFilterConvOp(
                       builder->GetShape(activations));
   TF_ASSIGN_OR_RETURN(xla::Shape out_backprop_shape,
                       builder->GetShape(gradients));
+  xla::XlaOp filter_backprop;
+
+  xla::Shape input_shape = activations_shape;
+  xla::Shape output_shape = out_backprop_shape;
+
+  TensorShape input_tensor_shape, filter_tensor_shape, output_tensor_shape;
+  TF_RETURN_IF_ERROR(XLAShapeToTensorShape(filter_shape, &filter_tensor_shape));
+  TF_RETURN_IF_ERROR(XLAShapeToTensorShape(input_shape, &input_tensor_shape));
+  TF_RETURN_IF_ERROR(XLAShapeToTensorShape(output_shape, &output_tensor_shape));
+
   const xla::Shape expanded_filter_shape =
       attrs.depthwise ? ExpandedFilterShapeForDepthwiseConvolution(filter_shape)
                       : filter_shape;
-
   // Reuse dimension computation logic from conv_grad_ops.cc.
   ConvBackpropDimensions dims;
-  TF_RETURN_IF_ERROR(ConvBackpropComputeDimensionsV2XlaShapes(
-      type_string, attrs.num_spatial_dims, activations_shape,
-      expanded_filter_shape, out_backprop_shape, attrs.dilations, attrs.strides,
-      attrs.padding, attrs.data_format, &dims));
-
   // The filter gradients are computed by a convolution of the input
   // activations and the output gradients, with some appropriate padding.
   // See the comment at the top of conv_grad_ops.h for details.
-
   xla::ConvolutionDimensionNumbers dnums;
 
+  TF_RETURN_IF_ERROR(ConvBackpropComputeDimensionsV2XlaShapes(
+      type_string, attrs.num_spatial_dims, activations_shape,
+      expanded_filter_shape, out_backprop_shape, attrs.dilations, attrs.strides,
+      attrs.padding, attrs.data_format, &dims, attrs.explicit_paddings));
+
   // The activations (inputs) form the LHS of the convolution.
   // Activations have shape: [batch, in_rows, in_cols, ..., in_depth]
   // For the gradient computation, we flip the roles of the batch and
@@ -420,6 +438,14 @@ xla::StatusOr<xla::XlaOp> MakeXlaBackpropFilterConvOp(
   int n_dim = GetTensorBatchDimIndex(num_dims, attrs.data_format);
   int c_dim = GetTensorFeatureDimIndex(num_dims, attrs.data_format);
 
+  bool use_batch_group_count =
+      filter_tensor_shape.dim_size(num_dims - 1) == 1 && attrs.depthwise;
+
+  std::vector<std::pair<int64, int64>> padding(attrs.num_spatial_dims);
+  std::vector<int64> rhs_dilation(attrs.num_spatial_dims);
+  std::vector<int64> window_strides(attrs.num_spatial_dims);
+  std::vector<int64> ones(attrs.num_spatial_dims, 1);
+
   // Swap n_dim and c_dim in the activations.
   dnums.set_input_batch_dimension(c_dim);
   dnums.set_input_feature_dimension(n_dim);
@@ -430,28 +456,32 @@ xla::StatusOr<xla::XlaOp> MakeXlaBackpropFilterConvOp(
   dnums.set_kernel_input_feature_dimension(n_dim);
   dnums.set_kernel_output_feature_dimension(c_dim);
 
-  std::vector<std::pair<int64, int64>> padding(attrs.num_spatial_dims);
-  std::vector<int64> rhs_dilation(attrs.num_spatial_dims);
-  std::vector<int64> window_strides(attrs.num_spatial_dims);
-  std::vector<int64> ones(attrs.num_spatial_dims, 1);
+  // The dimension swap below is needed because filter shape is KH,KW,F,DM.
+  if (use_batch_group_count) {
+    dnums.set_output_batch_dimension(attrs.num_spatial_dims + 1);
+    dnums.set_output_feature_dimension(attrs.num_spatial_dims);
+  } else {
+    dnums.set_output_batch_dimension(attrs.num_spatial_dims);
+    dnums.set_output_feature_dimension(attrs.num_spatial_dims + 1);
+  }
 
   // Tensorflow filter shape is [ H, W, ..., inC, outC ].
   for (int i = 0; i < attrs.num_spatial_dims; ++i) {
     dnums.add_output_spatial_dimensions(i);
   }
-  dnums.set_output_batch_dimension(attrs.num_spatial_dims);
-  dnums.set_output_feature_dimension(attrs.num_spatial_dims + 1);
 
-  for (int i = 0; i < attrs.num_spatial_dims; ++i) {
+  for (int64 i = 0; i < attrs.num_spatial_dims; ++i) {
     int64 dim = GetTensorSpatialDimIndex(num_dims, attrs.data_format, i);
     dnums.add_input_spatial_dimensions(dim);
     dnums.add_kernel_spatial_dimensions(dim);
+    rhs_dilation[i] = dims.spatial_dims[i].stride;
+    window_strides[i] = attrs.dilations[dim];
 
     // We will also need to pad the input with zeros such that after the
     // convolution, we get the right size for the filter.
     // The padded_in_rows should be such that when we convolve this with the
     // expanded_out_rows as a filter, we should get filter_rows back.
-    //
+
     const int64 padded_in_size =
         dims.spatial_dims[i].expanded_output_size +
         (dims.spatial_dims[i].filter_size - 1) * attrs.dilations[dim];
@@ -472,6 +502,8 @@ xla::StatusOr<xla::XlaOp> MakeXlaBackpropFilterConvOp(
     // We apply negative padding in this case.
     const int64 pad_total = padded_in_size - dims.spatial_dims[i].input_size;
 
+    // + For the EXPLICIT padding, we pad the top/left side with the explicit
+    //   padding and pad the bottom/right side with the remaining space.
     // + For the VALID padding, we don't pad anything on the top/left side
     //   and pad the bottom/right side with the remaining space.
     // + For the SAME padding, we pad top/left side the same as bottom/right
@@ -480,12 +512,12 @@ xla::StatusOr<xla::XlaOp> MakeXlaBackpropFilterConvOp(
     // In addition, if the padded input size is smaller than the input size,
     // we need to ignore some training elements of the input. We do this by
     // applying negative padding on the right/bottom.
-    const int64 pad_before =
-        attrs.padding == Padding::SAME ? std::max<int64>(pad_total / 2, 0) : 0;
-
+    const int64 pad_before = attrs.padding == Padding::EXPLICIT
+                                 ? attrs.explicit_paddings[2 * dim]
+                                 : attrs.padding == Padding::SAME
+                                       ? std::max<int64>(pad_total / 2, 0)
+                                       : 0;
     padding[i] = {pad_before, pad_total - pad_before};
-    rhs_dilation[i] = dims.spatial_dims[i].stride;
-    window_strides[i] = attrs.dilations[dim];
   }
 
   // Besides padding the input, we will also expand output_rows to
@@ -496,11 +528,14 @@ xla::StatusOr<xla::XlaOp> MakeXlaBackpropFilterConvOp(
   //
   // This is done by specifying the window dilation factors in the
   // convolution HLO below.
-  auto filter_backprop =
-      xla::ConvGeneralDilated(activations, gradients, window_strides, padding,
-                              /*lhs_dilation=*/ones, rhs_dilation, dnums);
 
-  if (attrs.depthwise) {
+  filter_backprop = xla::ConvGeneralDilated(
+      activations, gradients, window_strides, padding, /*lhs_dilation=*/ones,
+      rhs_dilation, dnums,
+      /*feature_group_count=*/1,
+      /*batch_group_count=*/use_batch_group_count ? dims.in_depth : 1);
+
+  if (!use_batch_group_count && attrs.depthwise) {
     filter_backprop = ContractFilterForDepthwiseBackprop(
         filter_shape, filter_backprop, activations.builder());
   }
diff --git a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.h b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.h
index 6e1b70a47850ae5c05939f8dfb7ec129c031df21..d893eca7f9ba07dded76eb215af4779080fa66b9 100644
--- a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.h
+++ b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.h
@@ -47,6 +47,7 @@ struct ConvOpAttrs {
   std::vector<int32> dilations;
   std::vector<int32> strides;
   Padding padding;
+  std::vector<int64> explicit_paddings;
   TensorFormat data_format;
 };
 
diff --git a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
index eafdba876ae9e2c38694f065cf83bb3725b8460e..52c3c2c4a903a8c51f6b511774bc0312d39df826 100644
--- a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
@@ -25,13 +25,13 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/lib/matrix.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/conv_grad_ops.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/util/padding.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc b/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc
index 6e6ba21daf5bf3eab5bfc15378e77b6dd253da7c..b119997cf39e210ed8e0ae730a08829e72b238b4 100644
--- a/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc
@@ -22,10 +22,10 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 
 namespace tensorflow {
 namespace {
diff --git a/tensorflow/compiler/tf2xla/kernels/empty_op.cc b/tensorflow/compiler/tf2xla/kernels/empty_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..00d2ce7c12fdc96483612059d1c792c847df04f3
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/empty_op.cc
@@ -0,0 +1,66 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// XLA-specific Empty Op.
+
+#include "tensorflow/compiler/tf2xla/type_util.h"
+#include "tensorflow/compiler/tf2xla/xla_helpers.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+
+namespace tensorflow {
+namespace {
+
+class EmptyOp : public XlaOpKernel {
+ public:
+  explicit EmptyOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_));
+    OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(dtype_, &type_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("init", &init_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    // The output of this Op is a tensor of shape 'shape' with each
+    // element set to the default value of 'dtype'. If 'init' is false then
+    // the result values may be left undefined, though we don't do that here.
+    const TensorShape shape_shape = ctx->InputShape("shape");
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsVector(shape_shape),
+        errors::InvalidArgument("shape must be a vector of int32, got shape ",
+                                shape_shape.DebugString()));
+
+    std::vector<int64> shape;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector("shape", &shape));
+
+    auto default_value = xla::Zero(ctx->builder(), type_);
+    auto result = xla::Broadcast(default_value, shape);
+    ctx->SetOutput(0, result);
+  }
+
+ private:
+  DataType dtype_;
+  xla::PrimitiveType type_;
+  bool init_;
+};
+
+REGISTER_XLA_OP(Name("Empty").CompileTimeConstantInput("shape"), EmptyOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/fft_ops.cc b/tensorflow/compiler/tf2xla/kernels/fft_ops.cc
index 6df8b5367d2390e65995beb1583b225755e6ee9f..a623585aad3b1b8f1f096ca527e7694d74f1ba46 100644
--- a/tensorflow/compiler/tf2xla/kernels/fft_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/fft_ops.cc
@@ -21,12 +21,12 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/conv_grad_ops.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/util/padding.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op.cc b/tensorflow/compiler/tf2xla/kernels/gather_op.cc
index 20b0de193dc060197f3062d3be0b8d45f7dcb9b1..6472045265e4d930a5da770a68f5c502192201ae 100644
--- a/tensorflow/compiler/tf2xla/kernels/gather_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/gather_op.cc
@@ -14,7 +14,6 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h"
-#include "tensorflow/compiler/tf2xla/lib/while_loop.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_context.h"
@@ -168,13 +167,13 @@ class GatherOp : public XlaOpKernel {
 
       OP_REQUIRES_OK(context, context->ConstantInputAsIntScalar(2, &axis));
       const auto params_dims = input_shape.dims();
-      if (axis < 0) {
-        axis += params_dims;
-      }
       OP_REQUIRES(
-          context, 0 <= axis && axis < params_dims,
+          context, -params_dims <= axis && axis < params_dims,
           errors::InvalidArgument("Expected axis in the range [", -params_dims,
                                   ", ", params_dims, "), but got ", axis));
+      if (axis < 0) {
+        axis += params_dims;
+      }
     }
 
     DataType index_type = input_type(1);
diff --git a/tensorflow/compiler/tf2xla/kernels/identity_op.cc b/tensorflow/compiler/tf2xla/kernels/identity_op.cc
index 19dd38c46ef154ea74bcbb6721dd04924702efcc..8b27e8e85a37bd5aa757b0cdd7e00e9fa3c0cf6e 100644
--- a/tensorflow/compiler/tf2xla/kernels/identity_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/identity_op.cc
@@ -38,9 +38,13 @@ class IdentityOp : public XlaOpKernel {
 
 // XLA_* devices also register a "real" Identity operator so we suppress the
 // dummy operator using CompilationOnly().
-REGISTER_XLA_OP(Name("Identity").AllowResourceTypes().CompilationOnly(),
-                IdentityOp);
-REGISTER_XLA_OP(Name("IdentityN").AllowResourceTypes().CompilationOnly(),
+REGISTER_XLA_OP(
+    Name("Identity").AllowResourceTypes().AllowVariantTypes().CompilationOnly(),
+    IdentityOp);
+REGISTER_XLA_OP(Name("IdentityN")
+                    .AllowResourceTypes()
+                    .AllowVariantTypes()
+                    .CompilationOnly(),
                 IdentityOp);
 REGISTER_XLA_OP(Name("PlaceholderWithDefault"), IdentityOp);
 REGISTER_XLA_OP(Name("PreventGradient"), IdentityOp);
diff --git a/tensorflow/compiler/tf2xla/kernels/if_op.cc b/tensorflow/compiler/tf2xla/kernels/if_op.cc
index b5e083912555c865b5eadc7697075c9ca4451ca9..aa5637e2669555da17af8bb05ab08beeba6a89c3 100644
--- a/tensorflow/compiler/tf2xla/kernels/if_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/if_op.cc
@@ -56,6 +56,7 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) {
   VLOG(1) << "Building If: " << input_types_.size() << " inputs";
 
   std::vector<XlaCompiler::Argument> arguments(input_types_.size());
+  int num_resource_args = 0;
   for (int i = 0; i < input_types_.size(); ++i) {
     XlaCompiler::Argument& arg = arguments[i];
     DataType type = ctx->input_type(i + 1);
@@ -79,14 +80,16 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) {
       arg.name = resource->name();
       VLOG(2) << "Resource " << resource->name()
               << " type: " << DataTypeString(arg.type)
-              << " shape: " << arg.shape.DebugString()
+              << " shape: " << arg.HumanString()
               << " initialized: " << arg.initialized;
+
+      num_resource_args++;
     } else {
       arg.kind = XlaCompiler::Argument::kParameter;
       arg.type = input_types_[i];
       arg.shape = ctx->InputShape(i + 1);
       VLOG(2) << "Arg type: " << DataTypeString(arg.type)
-              << " shape: " << arg.shape.DebugString();
+              << " shape: " << arg.HumanString();
     }
   }
 
@@ -147,12 +150,12 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) {
   OP_REQUIRES(ctx, then_result.xla_input_shapes.size() == 1,
               errors::FailedPrecondition("Expected one input shape"));
   xla::Shape then_input_shape = then_result.xla_input_shapes[0];
-  OP_REQUIRES(ctx, xla::ShapeUtil::IsTuple(then_input_shape),
+  OP_REQUIRES(ctx, then_input_shape.IsTuple(),
               errors::FailedPrecondition("Expected tuple shape"));
   OP_REQUIRES(ctx, else_result.xla_input_shapes.size() == 1,
               errors::FailedPrecondition("Expected one input shape"));
   xla::Shape else_input_shape = else_result.xla_input_shapes[0];
-  OP_REQUIRES(ctx, xla::ShapeUtil::IsTuple(else_input_shape),
+  OP_REQUIRES(ctx, else_input_shape.IsTuple(),
               errors::FailedPrecondition("Expected tuple shape"));
   OP_REQUIRES(ctx,
               xla::ShapeUtil::Compatible(then_input_shape, else_input_shape),
@@ -236,12 +239,16 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) {
     ctx->SetOutput(i, output_handle);
   }
   if (has_token_input_output_) {
-    // Set token output for this "if" op.
+    // Set token output for this "If" op. Token output is the last output of
+    // XLA computation, which comes after all "normal" TF outputs and resource
+    // updates. For "If" node, num of resource updates equals to number of
+    // resource args because we set `return_updated_values_for_all_resources`
+    // to true in XlaCompiler option.
     xla::XlaOp token_output =
-        xla::GetTupleElement(outputs, output_types_.size());
+        xla::GetTupleElement(outputs, output_types_.size() + num_resource_args);
     auto shape_or = b->GetShape(token_output);
     OP_REQUIRES_OK(ctx, shape_or.status());
-    OP_REQUIRES(ctx, xla::ShapeUtil::IsToken(shape_or.ValueOrDie()),
+    OP_REQUIRES(ctx, shape_or.ValueOrDie().IsToken(),
                 errors::FailedPrecondition(
                     "Token output is not token type: ",
                     xla::ShapeUtil::HumanString(shape_or.ValueOrDie())));
diff --git a/tensorflow/compiler/tf2xla/kernels/image_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_ops.cc
index e9bb0a77e99d144863b027bd214081316d61c314..92b20fe0ba5611ca5314cd954026f7b71ea75f84 100644
--- a/tensorflow/compiler/tf2xla/kernels/image_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/image_ops.cc
@@ -13,18 +13,20 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "absl/types/span.h"
 #include "tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h"
 #include "tensorflow/compiler/tf2xla/lib/util.h"
-#include "tensorflow/compiler/tf2xla/lib/while_loop.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/lib/loops.h"
 #include "tensorflow/compiler/xla/client/lib/sorting.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.pb.h"
 
 namespace tensorflow {
 namespace {
@@ -185,19 +187,20 @@ class AdjustContrastOpV2 : public XlaOpKernel {
                                         factor_shape.DebugString()));
 
     xla::XlaBuilder* b = context->builder();
-    xla::XlaOp input = context->Input(0);
-    xla::XlaOp factor = context->Input(1);
-
     DataType type = context->input_type(0);
 
+    xla::XlaOp input = context->Input(0);
+    xla::XlaOp factor = XlaHelpers::ConvertElementType(context->Input(1), type);
+
     const DataType accumulation_type = XlaHelpers::SumAccumulationType(type);
     auto converted = XlaHelpers::ConvertElementType(input, accumulation_type);
     auto reduce = xla::Reduce(converted, XlaHelpers::Zero(b, accumulation_type),
                               *context->GetOrCreateAdd(accumulation_type),
                               {height_dim, width_dim});
-    auto output = XlaHelpers::ConvertElementType(reduce, type);
-    output =
-        xla::Div(output, XlaHelpers::FloatLiteral(b, type, height * width));
+
+    auto output = xla::Div(
+        reduce, XlaHelpers::FloatLiteral(b, accumulation_type, height * width));
+    output = XlaHelpers::ConvertElementType(output, type);
 
     std::vector<int64> broadcast_dims(input_shape.dims() - 2);
     std::iota(broadcast_dims.begin(), broadcast_dims.end(), 0);
@@ -233,8 +236,10 @@ class AdjustSaturationOp : public XlaOpKernel {
                                 channels, " channels."));
 
     xla::XlaBuilder* b = context->builder();
-    xla::XlaOp input = context->Input(0);
-    xla::XlaOp scale = context->Input(1);
+    xla::XlaOp input =
+        XlaHelpers::ConvertElementType(context->Input(0), DT_FLOAT);
+    xla::XlaOp scale =
+        XlaHelpers::ConvertElementType(context->Input(1), DT_FLOAT);
 
     DataType type = context->input_type(0);
 
@@ -249,15 +254,17 @@ class AdjustSaturationOp : public XlaOpKernel {
                                       /*dimno=*/channel_dim);
     TensorShape channel_shape = input_shape;
     channel_shape.set_dim(channel_dim, 1);
-    auto hsv = RGBToHSV(context, b, {red, green, blue}, context->input_type(0),
-                        channel_shape);
+    auto hsv =
+        RGBToHSV(context, b, {red, green, blue}, DT_FLOAT, channel_shape);
 
-    hsv[1] = xla::Clamp(XlaHelpers::Zero(b, type), xla::Mul(hsv[1], scale),
-                        XlaHelpers::One(b, type));
+    hsv[1] = xla::Clamp(XlaHelpers::Zero(b, DT_FLOAT), xla::Mul(hsv[1], scale),
+                        XlaHelpers::One(b, DT_FLOAT));
 
-    auto rgb = HSVToRGB(context->builder(), hsv, context->input_type(0));
+    auto rgb = HSVToRGB(context->builder(), hsv, DT_FLOAT);
 
-    context->SetOutput(0, xla::ConcatInDim(b, rgb, channel_dim));
+    auto output = XlaHelpers::ConvertElementType(
+        xla::ConcatInDim(b, rgb, channel_dim), type);
+    context->SetOutput(0, output);
   }
 };
 REGISTER_XLA_OP(Name("AdjustSaturation"), AdjustSaturationOp);
@@ -283,8 +290,10 @@ class AdjustHueOp : public XlaOpKernel {
                                 channels, " channels."));
 
     xla::XlaBuilder* b = context->builder();
-    xla::XlaOp input = context->Input(0);
-    xla::XlaOp delta = context->Input(1);
+    xla::XlaOp input =
+        XlaHelpers::ConvertElementType(context->Input(0), DT_FLOAT);
+    xla::XlaOp delta =
+        XlaHelpers::ConvertElementType(context->Input(1), DT_FLOAT);
 
     DataType type = context->input_type(0);
 
@@ -299,20 +308,22 @@ class AdjustHueOp : public XlaOpKernel {
                                       /*dimno=*/channel_dim);
     TensorShape channel_shape = input_shape;
     channel_shape.set_dim(channel_dim, 1);
-    auto hsv = RGBToHSV(context, b, {red, green, blue}, context->input_type(0),
-                        channel_shape);
+    auto hsv =
+        RGBToHSV(context, b, {red, green, blue}, DT_FLOAT, channel_shape);
 
-    auto zero = XlaHelpers::Zero(b, type);
-    auto one = XlaHelpers::One(b, type);
+    auto zero = XlaHelpers::Zero(b, DT_FLOAT);
+    auto one = XlaHelpers::One(b, DT_FLOAT);
 
     auto& hue = hsv[0];
     hue = xla::Rem(xla::Add(hsv[0], delta), one);
     hue =
         xla::Select(xla::Lt(hue, zero), xla::Rem(xla::Add(one, hue), one), hue);
 
-    auto rgb = HSVToRGB(context->builder(), hsv, context->input_type(0));
+    auto rgb = HSVToRGB(context->builder(), hsv, DT_FLOAT);
 
-    context->SetOutput(0, xla::ConcatInDim(b, rgb, channel_dim));
+    auto output = XlaHelpers::ConvertElementType(
+        xla::ConcatInDim(b, rgb, channel_dim), type);
+    context->SetOutput(0, output);
   }
 };
 REGISTER_XLA_OP(Name("AdjustHue"), AdjustHueOp);
@@ -351,24 +362,26 @@ struct SuppressBodyFn {
     auto num_outputs_so_far = values[1];
     auto iou_mask = values[2];
     auto included_iou = values[3];
-    auto zero_r1 = xla::ConstantR1<int32>(builder, {0});
+    auto zero = xla::ConstantR0<int32>(builder, 0);
     // Determine if current elem is active using a slice.
-    auto row_idx_r1 = xla::Reshape(row_idx, {1});
-    auto active_elem = xla::DynamicSlice(included_iou, row_idx_r1, {1});
+    // TODO(b/118437727): The only reason we need an explicit vector is because
+    // some old GCCs can't deduce the right type for MakeConstSpan, and
+    // providing a single-value initializer list directly uses the wrong
+    // overload. Delete this once the deprecated overload is gone.
+    std::vector<xla::XlaOp> row_idx_vector = {row_idx};
+    auto active_elem = xla::DynamicSlice(included_iou, row_idx_vector, {1});
     active_elem = xla::Reshape(active_elem, {});
     // Increment output count iff current elem is not suppressed.
     num_outputs_so_far = xla::Select(
         active_elem, num_outputs_so_far + xla::ConstantR0<int32>(builder, 1),
         num_outputs_so_far);
     // Slice out the row_idx.
-    auto starts = xla::ConcatInDim(builder, {row_idx_r1, zero_r1}, 0);
-    auto row_iou = xla::DynamicSlice(iou_mask, starts, {1, num_boxes});
+    auto row_iou = xla::DynamicSlice(iou_mask, {row_idx, zero}, {1, num_boxes});
     // Remove the diagonal from consideration. An elem cannot suppress
     // itself.
-    auto update_starts = xla::ConcatInDim(builder, {zero_r1, row_idx_r1}, 0);
     row_iou = xla::DynamicUpdateSlice(
         row_iou, xla::ConstantR2FromArray2D<bool>(builder, {{false}}),
-        update_starts);
+        {zero, row_idx});
     // Create a suppression by inverting polarity.
     row_iou = xla::Reshape(row_iou, {num_boxes});
     auto supp_mask = xla::Not(row_iou);
@@ -505,9 +518,9 @@ class NonMaxSuppressionOp : public XlaOpKernel {
     init_values.push_back(included_iou);
 
     auto suppress_loop_result =
-        XlaWhileLoop(WhileCondFn(num_boxes, output_size),
-                     SuppressBodyFn(num_boxes), init_values, "suppress_loop",
-                     builder)
+        xla::WhileLoopHelper(WhileCondFn(num_boxes, output_size),
+                             SuppressBodyFn(num_boxes), init_values,
+                             "suppress_loop", builder)
             .ValueOrDie();
 
     xla::XlaOp included_score =
diff --git a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
index 5a10c52ba8b6d4fab73f0dda67cbd52fd625e76b..d19d48e5dd95962fe4a4e4026eaf6b06b7898564 100644
--- a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "absl/types/span.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
@@ -72,10 +73,10 @@ namespace {
 // from in_size to out_size.
 struct ResizeConvolutionDims {
   // Size of the kernel to use.
-  std::vector<int64> kernel_size;
+  std::vector<int64> kernel_size;  // k
 
   // Stride of the convolution to use.
-  std::vector<int64> stride;
+  std::vector<int64> stride;  // S
 };
 ResizeConvolutionDims ComputeResizeConvolutionParameters(
     absl::Span<const int64> in_size, absl::Span<const int64> out_size,
@@ -117,8 +118,10 @@ ResizeConvolutionDims ComputeResizeConvolutionParameters(
 //                        + dims.stride * (out_size - 1)
 int64 CalculateUpperPadding(int64 in_size, int64 out_size, int64 kernel_size,
                             int64 stride) {
-  return (2 * kernel_size - 1) + (out_size - 1) * stride - (kernel_size - 1) -
-         1 - (kernel_size * (in_size - 1));
+  int64 padding = (2 * kernel_size - 1) + (out_size - 1) * stride -
+                  (kernel_size - 1) - 1 - (kernel_size * (in_size - 1));
+
+  return padding;
 }
 
 // Form a 2D convolution kernel like:
@@ -132,53 +135,100 @@ int64 CalculateUpperPadding(int64 in_size, int64 out_size, int64 kernel_size,
 // If the 2D kernel would be very large, the 1D kernel can be applied once in
 // each dimension due to the symmetry of the kernel along all axis to reduce the
 // computational intensity.
-xla::XlaOp Make1DKernel(xla::XlaBuilder* builder, int64 n) {
+xla::XlaOp MakeBilinear1DKernel(xla::XlaBuilder* builder,
+                                xla::PrimitiveType type, int64 n) {
   std::vector<float> kernel(n * 2 - 1);
   for (int64 i = 0; i < n; ++i) {
     float v = (i + 1.0f) / n;
     kernel[i] = v;
     kernel[n * 2 - 2 - i] = v;
   }
-  return xla::ConstantR1<float>(builder, kernel);
+  return xla::ConvertElementType(xla::ConstantR1<float>(builder, kernel), type);
+}
+
+// Unlike the bilinear kernel, which is triangular, the nearest neighbor
+// kernel is a square. For example, a 1D kernel with n=3 would look like
+// [0 1 1 1 0]
+// and n=4 would look like
+// [0 0 1 1 1 1 0].
+// Note that in the second case, the kernel is not symmetric and we default
+// to the right (because an existing non TPU kernel
+// for nearest neighbor resize already chose to default to the right,
+// so we want to be consistent).
+xla::XlaOp MakeNearestNeighbor1DKernel(xla::XlaBuilder* builder,
+                                       xla::PrimitiveType type, int64 n) {
+  std::vector<float> kernel(n * 2 - 1, 0.0f);
+  std::fill(&kernel[n / 2], &kernel[(3 * n) / 2], 1.0f);
+
+  return xla::ConvertElementType(xla::ConstantR1<float>(builder, kernel), type);
 }
 
 // Kernels with more than 16 spatial elements are considered intense and the
-// kernel should applied to each dimension independently.
+// kernel should be applied to each dimension independently.
 const int64 kMax2DKernelSize = 16;
 
-xla::XlaOp MakeBilinearResizeKernel(xla::XlaBuilder* builder,
-                                    absl::Span<const int64> kernel_size,
-                                    int64 channels) {
-  auto depthwise_kernel = xla::Broadcast(
-      xla::Zero(builder, xla::F32),
-      {(2 * kernel_size[0] - 1), (2 * kernel_size[1] - 1), channels, 1});
-
-  return xla::Mul(
-      xla::Add(depthwise_kernel, Make1DKernel(builder, kernel_size[1]),
-               /*broadcast_dimensions=*/{1}),
-      Make1DKernel(builder, kernel_size[0]),
-      /*broadcast_dimensions=*/{0});
-}
+xla::XlaOp MakeGeneralResizeKernel(xla::XlaBuilder* builder,
+                                   xla::PrimitiveType type,
+                                   absl::Span<const int64> kernel_size,
+                                   int64 channels, bool is_kernel_bilinear) {
+  auto make_kernel_func =
+      is_kernel_bilinear ? MakeBilinear1DKernel : MakeNearestNeighbor1DKernel;
 
-xla::XlaOp MakeBilinearResizeKernelInDim(xla::XlaBuilder* builder,
-                                         absl::Span<const int64> kernel_size,
-                                         int64 channels, int64 dim) {
+  std::vector<int64> depthwise_kernel_sizes = {
+      (2 * kernel_size[0] - 1), (2 * kernel_size[1] - 1), channels, 1};
   auto depthwise_kernel =
-      xla::Broadcast(xla::Zero(builder, xla::F32),
-                     {dim == 0 ? (2 * kernel_size[0] - 1) : 1,
-                      dim == 1 ? (2 * kernel_size[1] - 1) : 1, channels, 1});
-  return xla::Add(depthwise_kernel, Make1DKernel(builder, kernel_size[dim]),
-                  /*broadcast_dimensions=*/{dim});
+      xla::BroadcastInDim(make_kernel_func(builder, type, kernel_size[1]),
+                          depthwise_kernel_sizes, /*broadcast_dimensions=*/{1});
+
+  return xla::Mul(depthwise_kernel,
+                  make_kernel_func(builder, type, kernel_size[0]),
+                  /*broadcast_dimensions=*/{0});
+}
+
+xla::XlaOp MakeGeneralResizeKernelInDim(xla::XlaBuilder* builder,
+                                        xla::PrimitiveType type,
+                                        absl::Span<const int64> kernel_size,
+                                        int64 channels, int64 dim,
+                                        bool is_kernel_bilinear) {
+  auto make_kernel_func =
+      is_kernel_bilinear ? MakeBilinear1DKernel : MakeNearestNeighbor1DKernel;
+
+  std::vector<int64> depthwise_kernel_sizes = {
+      dim == 0 ? (2 * kernel_size[0] - 1) : 1,
+      dim == 1 ? (2 * kernel_size[1] - 1) : 1, channels, 1};
+  return xla::BroadcastInDim(make_kernel_func(builder, type, kernel_size[dim]),
+                             depthwise_kernel_sizes,
+                             /*broadcast_dimensions=*/{dim});
+}
+
+xla::XlaOp BroadcastSpatialDimensions(xla::XlaBuilder* builder,
+                                      const xla::XlaOp& input,
+                                      int32 spatial_dimensions_offset,
+                                      absl::Span<const int64> in_size,
+                                      absl::Span<const int64> out_size) {
+  // Add broadcasts to handle expanding from a size == 1 dimension to a
+  // size > 1 dimension.
+  auto broadcast_shape_or_status = builder->GetShape(input);
+  if (!broadcast_shape_or_status.ok()) {
+    return builder->ReportError(broadcast_shape_or_status.status());
+  }
+  xla::Shape broadcast_shape = broadcast_shape_or_status.ValueOrDie();
+  for (int32 i = 0; i < in_size.size(); ++i) {
+    if (in_size[i] == 1 && out_size[i] > 1) {
+      broadcast_shape.set_dimensions(spatial_dimensions_offset + i,
+                                     out_size[i]);
+    }
+  }
+  return xla::BroadcastInDim(input, broadcast_shape.dimensions(),
+                             /*broadcast_dimensions=*/{0, 1, 2, 3});
 }
 
-xla::XlaOp ResizeUsingDilationAndConvolution(xla::XlaBuilder* builder,
-                                             const xla::XlaOp& input,
-                                             const int num_spatial_dims,
-                                             std::vector<int64> in_size,
-                                             std::vector<int64> out_size,
-                                             const int64 channels,
-                                             const bool align_corners) {
-  // Picture for a 1x3 to 1x4 resize:
+xla::XlaOp ResizeUsingDilationAndConvolution(
+    xla::XlaBuilder* builder, const xla::XlaOp& input, xla::PrimitiveType type,
+    const int num_spatial_dims, absl::Span<const int64> in_size,
+    absl::Span<const int64> out_size, const int64 channels,
+    const bool align_corners, bool is_kernel_bilinear) {
+  // Picture for a 1x3 to 1x4 bilinear resize:
   // stride = 2, kernel size = 3
   // Input:
   // 3 6 9
@@ -264,8 +314,8 @@ xla::XlaOp ResizeUsingDilationAndConvolution(xla::XlaBuilder* builder,
   // Split convolutions into independent dimensions if they would be a very
   // large kernel.
   if (dims.kernel_size[0] * dims.kernel_size[1] < kMax2DKernelSize) {
-    xla::XlaOp kernel =
-        MakeBilinearResizeKernel(builder, dims.kernel_size, channels);
+    xla::XlaOp kernel = MakeGeneralResizeKernel(builder, type, dims.kernel_size,
+                                                channels, is_kernel_bilinear);
     output =
         xla::ConvGeneralDilated(input_data, kernel, dims.stride,
                                 /*padding=*/
@@ -275,8 +325,8 @@ xla::XlaOp ResizeUsingDilationAndConvolution(xla::XlaBuilder* builder,
                                 /*rhs_dilation=*/{1, 1}, dimension_numbers,
                                 /*feature_group_count=*/channels);
   } else {
-    xla::XlaOp kernel0 =
-        MakeBilinearResizeKernelInDim(builder, dims.kernel_size, channels, 0);
+    xla::XlaOp kernel0 = MakeGeneralResizeKernelInDim(
+        builder, type, dims.kernel_size, channels, 0, is_kernel_bilinear);
     output = xla::ConvGeneralDilated(
         input_data, kernel0, {dims.stride[0], 1},
         /*padding=*/
@@ -284,8 +334,8 @@ xla::XlaOp ResizeUsingDilationAndConvolution(xla::XlaBuilder* builder,
         /*lhs_dilation=*/{dims.kernel_size[0], 1},
         /*rhs_dilation=*/{1, 1}, dimension_numbers,
         /*feature_group_count=*/channels);
-    xla::XlaOp kernel1 =
-        MakeBilinearResizeKernelInDim(builder, dims.kernel_size, channels, 1);
+    xla::XlaOp kernel1 = MakeGeneralResizeKernelInDim(
+        builder, type, dims.kernel_size, channels, 1, is_kernel_bilinear);
     output = xla::ConvGeneralDilated(
         output, kernel1, {1, dims.stride[1]},
         /*padding=*/
@@ -297,22 +347,15 @@ xla::XlaOp ResizeUsingDilationAndConvolution(xla::XlaBuilder* builder,
 
   // Add broadcasts to handle expanding from a size == 1 dimension to a
   // size > 1 dimension.
-  for (int i = 0; i < num_spatial_dims; ++i) {
-    if (in_size[i] == 1 && out_size[i] > 1) {
-      output = xla::Add(output, xla::ConstantR1<float>(builder, out_size[i], 0),
-                        /*broadcast_dimensions=*/{1 + i});
-    }
-  }
-  return output;
+  return BroadcastSpatialDimensions(
+      builder, output, /*spatial_dimensions_offset=*/1, in_size, out_size);
 }
 
-xla::XlaOp ResizeUsingDilationAndConvolutionGradOp(xla::XlaBuilder* builder,
-                                                   const xla::XlaOp& grad,
-                                                   const int num_spatial_dims,
-                                                   std::vector<int64> in_size,
-                                                   std::vector<int64> grad_size,
-                                                   const int64 channels,
-                                                   const bool align_corners) {
+xla::XlaOp ResizeUsingDilationAndConvolutionGradOp(
+    xla::XlaBuilder* builder, const xla::XlaOp& grad, xla::PrimitiveType type,
+    const int num_spatial_dims, absl::Span<const int64> in_size,
+    absl::Span<const int64> grad_size, const int64 channels,
+    const bool align_corners, bool is_kernel_bilinear) {
   ResizeConvolutionDims dims =
       ComputeResizeConvolutionParameters(in_size, grad_size, align_corners);
 
@@ -332,19 +375,14 @@ xla::XlaOp ResizeUsingDilationAndConvolutionGradOp(xla::XlaBuilder* builder,
   dimension_numbers.set_kernel_output_feature_dimension(num_spatial_dims);
   xla::XlaOp output;
   if (dims.kernel_size[0] * dims.kernel_size[1] < kMax2DKernelSize) {
-    xla::XlaOp kernel =
-        MakeBilinearResizeKernel(builder, dims.kernel_size, channels);
+    xla::XlaOp kernel = MakeGeneralResizeKernel(builder, type, dims.kernel_size,
+                                                channels, is_kernel_bilinear);
 
     // Broadcast the input kernel where the forward op expanded from a size == 1
     // dimension to a size > 1 dimension. This has the effect of summing the
     // gradient contributions in that dimension.
-    for (int i = 0; i < num_spatial_dims; ++i) {
-      if (in_size[i] == 1 && grad_size[i] > 1) {
-        kernel =
-            xla::Add(kernel, xla::ConstantR1<float>(builder, grad_size[i], 0),
-                     /*broadcast_dimensions=*/{i});
-      }
-    }
+    kernel = BroadcastSpatialDimensions(
+        builder, kernel, /*spatial_dimensions_offset=*/0, in_size, grad_size);
 
     output = xla::ConvGeneralDilated(
         grad, kernel, /*window_strides=*/dims.kernel_size,
@@ -355,23 +393,23 @@ xla::XlaOp ResizeUsingDilationAndConvolutionGradOp(xla::XlaBuilder* builder,
         /*rhs_dilation=*/{1, 1}, dimension_numbers,
         /*feature_group_count=*/channels);
   } else {
-    xla::XlaOp kernel0 =
-        MakeBilinearResizeKernelInDim(builder, dims.kernel_size, channels, 0);
-    xla::XlaOp kernel1 =
-        MakeBilinearResizeKernelInDim(builder, dims.kernel_size, channels, 1);
-
-    // Broadcast the input kernel where the forward op expanded from a size == 1
-    // dimension to a size > 1 dimension. This has the effect of summing the
-    // gradient contributions in that dimension.
+    xla::XlaOp kernel0 = MakeGeneralResizeKernelInDim(
+        builder, type, dims.kernel_size, channels, 0, is_kernel_bilinear);
+    xla::XlaOp kernel1 = MakeGeneralResizeKernelInDim(
+        builder, type, dims.kernel_size, channels, 1, is_kernel_bilinear);
+
+    // Broadcast the input kernel where the forward op expanded from a
+    // size == 1 dimension to a size > 1 dimension. This has the effect of
+    // summing the gradient contributions in that dimension.
     if (in_size[0] == 1 && grad_size[0] > 1) {
-      kernel0 =
-          xla::Add(kernel0, xla::ConstantR1<float>(builder, grad_size[0], 0),
-                   /*broadcast_dimensions=*/{0});
+      kernel0 = BroadcastSpatialDimensions(builder, kernel0,
+                                           /*spatial_dimensions_offset=*/0, {1},
+                                           {grad_size[0]});
     }
     if (in_size[1] == 1 && grad_size[1] > 1) {
-      kernel1 =
-          xla::Add(kernel0, xla::ConstantR1<float>(builder, grad_size[1], 0),
-                   /*broadcast_dimensions=*/{1});
+      kernel1 = BroadcastSpatialDimensions(builder, kernel0,
+                                           /*spatial_dimensions_offset=*/0,
+                                           in_size, grad_size);
     }
 
     output = xla::ConvGeneralDilated(
@@ -402,114 +440,148 @@ xla::XlaOp ResizeUsingDilationAndConvolutionGradOp(xla::XlaBuilder* builder,
     }
   }
   if (pad_output) {
-    output = xla::Pad(output, xla::ConstantR0<float>(builder, 0.0f), padding);
+    output = xla::Pad(output, xla::Zero(builder, type), padding);
   }
   return output;
 }
 
-class ResizeBilinearOp : public XlaOpKernel {
- public:
-  explicit ResizeBilinearOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("align_corners", &align_corners_));
-  }
-
-  void Compile(XlaOpKernelContext* ctx) override {
-    xla::XlaBuilder* b = ctx->builder();
-
-    TensorShape input_shape = ctx->InputShape(0);
-    OP_REQUIRES(ctx, input_shape.dims() == 4,
-                errors::InvalidArgument("input must be 4-dimensional",
-                                        input_shape.DebugString()));
-    const int64 batch = input_shape.dim_size(0);
-    std::vector<int64> in_size = {input_shape.dim_size(1),
-                                  input_shape.dim_size(2)};
-    const int64 channels = input_shape.dim_size(3);
-    OP_REQUIRES(ctx, in_size[0] > 0 && in_size[1] > 0,
-                errors::InvalidArgument("input size must be positive, got [",
-                                        in_size[0], ",", in_size[1], "]"));
-
-    std::vector<int64> out_size;
-    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &out_size));
-    OP_REQUIRES(ctx, out_size.size() == 2,
-                errors::InvalidArgument("output size must be length 2, got ",
-                                        out_size.size()));
-    OP_REQUIRES(ctx, out_size[0] > 0 && out_size[1] > 0,
-                errors::InvalidArgument("output size must be positive, got [",
-                                        out_size[0], ",", out_size[1], "]"));
-
-    const int num_spatial_dims = 2;
-
-    xla::XlaOp input = ctx->Input(0);
-
-    // If in_size[i] > 1 and out_size[i] == 1, slice out the first input in
-    // dimension i.
-    bool slice_input = false;
-    for (int i = 0; i < num_spatial_dims; ++i) {
-      if (in_size[i] > 1 && out_size[i] == 1) {
-        // If in_size[i] > 1 but out_size[i] == 1, then we slice out the first
-        // entry before resizing.
-        slice_input = true;
-        in_size[i] = 1;
-      }
-    }
-    if (slice_input) {
-      input =
-          xla::Slice(input, {0, 0, 0, 0},
-                     {batch, in_size[0], in_size[1], channels}, {1, 1, 1, 1});
+void GeneralCompile(XlaOpKernelContext* ctx, bool align_corners_,
+                    bool is_kernel_bilinear) {
+  xla::XlaBuilder* b = ctx->builder();
+
+  TensorShape input_shape = ctx->InputShape(0);
+  OP_REQUIRES(ctx, input_shape.dims() == 4,
+              errors::InvalidArgument("input must be 4-dimensional",
+                                      input_shape.DebugString()));
+  // First dimension always assumed to be batch
+  const int64 batch = input_shape.dim_size(0);
+  std::vector<int64> in_size = {input_shape.dim_size(1),
+                                input_shape.dim_size(2)};
+  // Last/4th dimension always assumed to be num channels
+  const int64 channels = input_shape.dim_size(3);
+  OP_REQUIRES(ctx, in_size[0] > 0 && in_size[1] > 0,
+              errors::InvalidArgument("input size must be positive, got [",
+                                      in_size[0], ",", in_size[1], "]"));
+
+  std::vector<int64> out_size;
+  OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &out_size));
+  OP_REQUIRES(ctx, out_size.size() == 2,
+              errors::InvalidArgument("output size must be length 2, got ",
+                                      out_size.size()));
+  OP_REQUIRES(ctx, out_size[0] > 0 && out_size[1] > 0,
+              errors::InvalidArgument("output size must be positive, got [",
+                                      out_size[0], ",", out_size[1], "]"));
+
+  const int num_spatial_dims = 2;
+
+  xla::XlaOp input = ctx->Input(0);
+  xla::PrimitiveType input_type = ctx->input_xla_type(0);
+
+  // If in_size[i] > 1 and out_size[i] == 1, slice out the first input in
+  // dimension i.
+  bool slice_input = false;
+  for (int i = 0; i < num_spatial_dims; ++i) {
+    if (in_size[i] > 1 && out_size[i] == 1) {
+      // If in_size[i] > 1 but out_size[i] == 1, then we slice out the first
+      // entry before resizing.
+      slice_input = true;
+      in_size[i] = 1;
     }
+  }
+  if (slice_input) {
+    input = xla::Slice(input, {0, 0, 0, 0},
+                       {batch, in_size[0], in_size[1], channels}, {1, 1, 1, 1});
+  }
 
-    // Output is always type float.
+  // Output is always type float if 'is_kernel_bilinear' is true.
+  if (is_kernel_bilinear) {
     input = xla::ConvertElementType(input, xla::F32);
+    input_type = xla::F32;
+  }
 
-    // Special Case:
-    // Instead of doing a ResizeUsingDilationAndConvolution directly,
-    // while (out_size[0]-1) = c * 2^x * (in_size[0]-1) for x>1 c>1, resize the
-    // image to 2*(in_size[0]-1)+1 x-times and then resize by scale c(int here).
-    // Instead of resizing directly we resize it iteratively.
-    //
-    // Since bilinear resize can be broken down as 2 sequential linear
-    // operations along different dimensions.
-    // Given sufficient numerical stability and a<e<c and b<f<d, bilinear resize
-    // from image of size axb -> cxd is same as resizing axb -> exf -> cxd.
-    // This does not work in the case of align_corners_=false because of special
-    // padding requirements that cause multiple resizes to be very different
-    // from a single resize.
-    //
-    // This makes the convolutions kernels smaller and the operation faster.
-    xla::XlaOp output = input;
-    while (in_size != out_size) {
-      if (in_size[0] != 1 && in_size[1] != 1) {
-        std::vector<float> k = {
-            (static_cast<float>(out_size[0]) - 1) / ((in_size[0] - 1) * 2),
-            (static_cast<float>(out_size[1]) - 1) / ((in_size[1] - 1) * 2)};
-        if ((k[0] == std::floor(k[0])) && (k[1] == std::floor(k[1])) &&
-            k[0] > 1 && k[1] > 1 && align_corners_) {
-          std::vector<int64> next_out_size = {(in_size[0] - 1) * 2 + 1,
-                                              (in_size[1] - 1) * 2 + 1};
-          output = ResizeUsingDilationAndConvolution(b, input, num_spatial_dims,
-                                                     in_size, next_out_size,
-                                                     channels, align_corners_);
-          input = output;
-          in_size = next_out_size;
-        } else {
-          output = ResizeUsingDilationAndConvolution(b, input, num_spatial_dims,
-                                                     in_size, out_size,
-                                                     channels, align_corners_);
-          in_size = out_size;
-        }
+  // Special Case:
+  // Instead of doing a ResizeUsingDilationAndConvolution directly,
+  // while (out_size[0]-1) = c * 2^x * (in_size[0]-1) for x>1 c>1, resize the
+  // image to 2*(in_size[0]-1)+1 x-times and then resize by scale c(int here).
+  // Instead of resizing directly we resize it iteratively.
+  //
+  // Since bilinear resize can be broken down as 2 sequential linear
+  // operations along different dimensions.
+  // Given sufficient numerical stability and a<e<c and b<f<d, bilinear resize
+  // from image of size axb -> cxd is same as resizing axb -> exf -> cxd.
+  // This does not work in the case of align_corners_=false because of special
+  // padding requirements that cause multiple resizes to be very different
+  // from a single resize.
+  //
+  // This makes the convolutions kernels smaller and the operation faster.
+  xla::XlaOp output = input;
+  while (in_size != out_size) {
+    if (in_size[0] != 1 && in_size[1] != 1) {
+      std::vector<float> k = {
+          (static_cast<float>(out_size[0]) - 1) / ((in_size[0] - 1) * 2),
+          (static_cast<float>(out_size[1]) - 1) / ((in_size[1] - 1) * 2)};
+      if ((k[0] == std::floor(k[0])) && (k[1] == std::floor(k[1])) &&
+          k[0] > 1 && k[1] > 1 && align_corners_) {
+        std::vector<int64> next_out_size = {(in_size[0] - 1) * 2 + 1,
+                                            (in_size[1] - 1) * 2 + 1};
+        output = ResizeUsingDilationAndConvolution(
+            b, input, input_type, num_spatial_dims, in_size, next_out_size,
+            channels, align_corners_, is_kernel_bilinear);
+        input = output;
+        in_size = next_out_size;
       } else {
-        output = ResizeUsingDilationAndConvolution(b, input, num_spatial_dims,
-                                                   in_size, out_size, channels,
-                                                   align_corners_);
+        output = ResizeUsingDilationAndConvolution(
+            b, input, input_type, num_spatial_dims, in_size, out_size, channels,
+            align_corners_, is_kernel_bilinear);
         in_size = out_size;
       }
+    } else {
+      output = ResizeUsingDilationAndConvolution(
+          b, input, input_type, num_spatial_dims, in_size, out_size, channels,
+          align_corners_, is_kernel_bilinear);
+      in_size = out_size;
     }
+  }
 
-    ctx->SetOutput(0, output);
+  ctx->SetOutput(0, output);
+}
+
+class ResizeNearestNeighborOp : public XlaOpKernel {
+ public:
+  explicit ResizeNearestNeighborOp(OpKernelConstruction* ctx)
+      : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("align_corners", &align_corners_));
+    OP_REQUIRES(
+        ctx, align_corners_ == true,
+        errors::Unimplemented("ResizeNearestNeighbor with align_corners=False "
+                              "is not yet implemented"));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    GeneralCompile(ctx, align_corners_, is_kernel_bilinear_);
   }
 
  private:
-  bool align_corners_;
+  bool align_corners_ = true;
+  bool is_kernel_bilinear_ = false;
+};
+
+REGISTER_XLA_OP(Name("ResizeNearestNeighbor").CompileTimeConstantInput("size"),
+                ResizeNearestNeighborOp);
+
+class ResizeBilinearOp : public XlaOpKernel {
+ public:
+  explicit ResizeBilinearOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("align_corners", &align_corners_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    GeneralCompile(ctx, align_corners_, is_kernel_bilinear_);
+  }
+
+ private:
+  bool align_corners_ = true;
+  bool is_kernel_bilinear_ = true;
 };
 
 REGISTER_XLA_OP(Name("ResizeBilinear").CompileTimeConstantInput("size"),
@@ -580,20 +652,20 @@ class ResizeBilinearGradOp : public XlaOpKernel {
           std::vector<int64> next_grad_size = {(in_size[0] - 1) * 2 + 1,
                                                (in_size[1] - 1) * 2 + 1};
           output = ResizeUsingDilationAndConvolutionGradOp(
-              b, grad, num_spatial_dims, in_size, next_grad_size, channels,
-              align_corners_);
+              b, grad, xla::F32, num_spatial_dims, in_size, next_grad_size,
+              channels, align_corners_, true);
           grad = output;
           in_size = next_grad_size;
         } else {
           output = ResizeUsingDilationAndConvolutionGradOp(
-              b, grad, num_spatial_dims, in_size, grad_size, channels,
-              align_corners_);
+              b, grad, xla::F32, num_spatial_dims, in_size, grad_size, channels,
+              align_corners_, true);
           in_size = grad_size;
         }
       } else {
         output = ResizeUsingDilationAndConvolutionGradOp(
-            b, grad, num_spatial_dims, in_size, grad_size, channels,
-            align_corners_);
+            b, grad, xla::F32, num_spatial_dims, in_size, grad_size, channels,
+            align_corners_, true);
         in_size = grad_size;
       }
     }
diff --git a/tensorflow/compiler/tf2xla/kernels/index_ops.cc b/tensorflow/compiler/tf2xla/kernels/index_ops.cc
index 843b6bb4e658af16fd753c1a20b35dd3d18df027..c1539f48d4f729510b2d930de91666a7c31f1ef0 100644
--- a/tensorflow/compiler/tf2xla/kernels/index_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/index_ops.cc
@@ -18,17 +18,16 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/kernels/index_ops.h"
 
 #include "tensorflow/compiler/tf2xla/type_util.h"
-#include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 
 namespace tensorflow {
 XlaArgMinMaxOp::XlaArgMinMaxOp(OpKernelConstruction* ctx, bool is_min)
@@ -66,9 +65,9 @@ void XlaArgMinMaxOp::Compile(XlaOpKernelContext* ctx) {
   xla::XlaOp input = ctx->Input(0);
   xla::XlaOp output;
   if (is_min_) {
-    output = XlaHelpers::ArgMin(input, index_xla_type, axis);
+    output = xla::ArgMin(input, index_xla_type, axis);
   } else {
-    output = XlaHelpers::ArgMax(input, index_xla_type, axis);
+    output = xla::ArgMax(input, index_xla_type, axis);
   }
 
   ctx->SetOutput(0, output);
diff --git a/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc b/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc
index e2c05b648bb194b1b452c527ddb1a2c5995b1217..e4bbdef6480104a1051acfc647644deb65c80171 100644
--- a/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc
+++ b/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc
@@ -16,16 +16,16 @@ limitations under the License.
 // Native XLA implementations of indexing ops.
 
 #include "tensorflow/compiler/tf2xla/type_util.h"
-#include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 
 namespace tensorflow {
 namespace {
@@ -74,7 +74,7 @@ class ArgMaxCustomCallOp : public XlaOpKernel {
     // shape isn't supported.
     if (!ctx->compiler()->options().allow_cpu_custom_calls ||
         (input_dims != 1 && input_dims != 2)) {
-      xla::XlaOp output = XlaHelpers::ArgMax(ctx->Input(0), output_type, axis);
+      xla::XlaOp output = xla::ArgMax(ctx->Input(0), output_type, axis);
       ctx->SetOutput(0, output);
       return;
     }
@@ -110,8 +110,8 @@ class ArgMaxCustomCallOp : public XlaOpKernel {
       auto shape_status = b.GetShape(arg);
       OP_REQUIRES_OK(ctx, shape_status.status());
       xla::Shape arg_shape = shape_status.ConsumeValueOrDie();
-      *arg_shape.mutable_layout() = xla::LayoutUtil::MakeDescendingLayout(
-          xla::ShapeUtil::Rank(arg_shape));
+      *arg_shape.mutable_layout() =
+          xla::LayoutUtil::MakeDescendingLayout(arg_shape.rank());
       arg_shapes.push_back(std::move(arg_shape));
     }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_1d.cc b/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_1d.cc
index 47cf8c6675bc120653c2a5ab6d4b07376dc382ee..39d96e748b3a2a852c03c0dd53ec175f0c66a43a 100644
--- a/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_1d.cc
+++ b/tensorflow/compiler/tf2xla/kernels/index_ops_kernel_argmax_float_1d.cc
@@ -25,9 +25,6 @@ limitations under the License.
 namespace tensorflow {
 
 EIGEN_STRONG_INLINE void argmax_float_1d_xla_impl(void* out, void** data) {
-  // data is managed by the JIT code so msan can't tell it's initialized.
-  TF_ANNOTATE_MEMORY_IS_INITIALIZED(data, 2 * sizeof(void*));
-
   float* input = static_cast<float*>(data[0]);
   int64 input_size = *static_cast<int64*>(data[1]);
 
diff --git a/tensorflow/compiler/tf2xla/kernels/matmul_op.cc b/tensorflow/compiler/tf2xla/kernels/matmul_op.cc
index 6440770c29894c951f010f6c1deb929f4fe79bbf..f36e0025250b3a196b31755a1ddf6620c415b6a3 100644
--- a/tensorflow/compiler/tf2xla/kernels/matmul_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/matmul_op.cc
@@ -24,8 +24,8 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-constexpr std::array<DataType, 5> kMatmulTypes = {
-    {DT_HALF, DT_BFLOAT16, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64}};
+constexpr std::array<DataType, 6> kMatmulTypes = {
+    {DT_HALF, DT_BFLOAT16, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128}};
 
 class MatMulOp : public XlaOpKernel {
  public:
diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc
index 90c0ebefb24ec2c4378782e9b15d3f57c33032a4..5a6569c8954d1686dc9d7577a66feb720241ea13 100644
--- a/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc
@@ -15,7 +15,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/lib/triangular_solve.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace tensorflow {
 namespace {
@@ -31,7 +32,10 @@ class MatrixTriangularSolveOp : public XlaOpKernel {
   void Compile(XlaOpKernelContext* ctx) override {
     auto result = xla::TriangularSolve(
         ctx->Input(0), ctx->Input(1), /*left_side=*/true,
-        /*lower=*/lower_, /*transpose_a=*/adjoint_, /*conjugate_a=*/adjoint_);
+        /*lower=*/lower_, /*unit_diagonal=*/false,
+        /*transpose_a=*/
+        adjoint_ ? xla::TriangularSolveOptions::ADJOINT
+                 : xla::TriangularSolveOptions::NO_TRANSPOSE);
     ctx->SetOutput(0, result);
   }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc b/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc
index f6b8534f4d7c537e5b708ee000e00cb92123584b..656f9b898f32dfc05215014f51c2bbaf07580836 100644
--- a/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc
@@ -38,8 +38,7 @@ class MirrorPadOp : public XlaOpKernel {
     // - [1, 2, 3, 3, 2] in symmetric mode.
     int64 excluded_edges = mode == MirrorPadMode::REFLECT ? 1 : 0;
     xla::XlaOp accum = t;
-    for (int64 dimno = xla::ShapeUtil::Rank(original_shape) - 1; dimno >= 0;
-         --dimno) {
+    for (int64 dimno = original_shape.rank() - 1; dimno >= 0; --dimno) {
       auto t_rev = xla::Rev(accum, {dimno});
       int64 lhs_padding = pad_literal.Get<int64>({dimno, 0});
       int64 rhs_padding = pad_literal.Get<int64>({dimno, 1});
diff --git a/tensorflow/compiler/tf2xla/kernels/pack_op.cc b/tensorflow/compiler/tf2xla/kernels/pack_op.cc
index a9b519d8928cc2807831fd6b4f12e60b7d58ea55..426a0941df57f19072d1cb9f3fa3d0079db465c5 100644
--- a/tensorflow/compiler/tf2xla/kernels/pack_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/pack_op.cc
@@ -24,12 +24,12 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
index 06c6cc37ec90192486ba15010bfeb763a9ffb987..23bb050a34d9246cdf73090aa6adfca054bf8bcf 100644
--- a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
@@ -26,10 +26,10 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/conv_grad_ops.h"
 #include "tensorflow/core/kernels/pooling_ops_common.h"
 
diff --git a/tensorflow/compiler/tf2xla/kernels/qr_op.cc b/tensorflow/compiler/tf2xla/kernels/qr_op.cc
index 7ea0afc1f53cbe4cfcc3f6121a4ecd55864c1b52..66ec40a946b8a063d84acd33daf81f52ea2c35ed 100644
--- a/tensorflow/compiler/tf2xla/kernels/qr_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/qr_op.cc
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/tf2xla/lib/qr.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/lib/qr.h"
 
 namespace tensorflow {
 namespace {
@@ -26,7 +26,7 @@ class QROp : public XlaOpKernel {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("full_matrices", &full_matrices_));
   }
   void Compile(XlaOpKernelContext* ctx) override {
-    auto result = QRDecomposition(ctx->Input(0), full_matrices_);
+    auto result = xla::QRDecomposition(ctx->Input(0), full_matrices_);
     if (!result.ok()) {
       ctx->SetStatus(result.status());
       return;
diff --git a/tensorflow/compiler/tf2xla/kernels/random_ops.cc b/tensorflow/compiler/tf2xla/kernels/random_ops.cc
index 8822e29f7e77b1cbc6fa6ca61d0062d9b1b0c36e..d6c70d4af1c2e921b70b0869f0163c8481017c7d 100644
--- a/tensorflow/compiler/tf2xla/kernels/random_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/random_ops.cc
@@ -20,12 +20,13 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h"
 #include "tensorflow/compiler/tf2xla/lib/random.h"
 #include "tensorflow/compiler/tf2xla/lib/util.h"
-#include "tensorflow/compiler/tf2xla/lib/while_loop.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/lib/loops.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -160,23 +161,30 @@ class RandomShuffleOp : public XlaOpKernel {
         -> xla::StatusOr<std::vector<xla::XlaOp>> {
       auto swaps = loop_vars[0];
       auto indices = loop_vars[1];
-      i = xla::Reshape(i, {1});
+      // TODO(b/118437727): The absl::Span nonsense is only necessary because
+      // the deprecated overload creates ambiguity for the single-element span
+      // case. Remove it once the deprecated overload is gone.
       // temp = indices[i]
-      auto temp = xla::DynamicSlice(indices, i, {1});
+      auto temp =
+          xla::DynamicSlice(indices, absl::Span<const xla::XlaOp>({i}), {1});
       // swap_index = swaps[i]
-      auto swap_index = xla::DynamicSlice(swaps, i, {1});
+      auto swap_index = xla::Reshape(
+          xla::DynamicSlice(swaps, absl::Span<const xla::XlaOp>({i}), {1}), {});
       // swap_value = indices[swaps[i]]
-      auto swap_value = xla::DynamicSlice(indices, swap_index, {1});
+      auto swap_value = xla::DynamicSlice(
+          indices, absl::Span<const xla::XlaOp>({swap_index}), {1});
       // indices[i] = indices[swaps[i]]
-      indices = xla::DynamicUpdateSlice(indices, swap_value, i);
+      indices = xla::DynamicUpdateSlice(indices, swap_value,
+                                        absl::Span<const xla::XlaOp>({i}));
       // indices[swaps[i]] = temp
-      indices = xla::DynamicUpdateSlice(indices, temp, swap_index);
+      indices = xla::DynamicUpdateSlice(
+          indices, temp, absl::Span<const xla::XlaOp>({swap_index}));
       return std::vector<xla::XlaOp>{swaps, indices};
     };
     // for i in range(n):
     auto swap_loop_result =
-        XlaForEachIndex(n, xla::S32, swap_body_fn, {swaps, indices},
-                        "indices_swap_loop", builder)
+        xla::ForEachIndex(n, xla::S32, swap_body_fn, {swaps, indices},
+                          "indices_swap_loop", builder)
             .ValueOrDie();
     auto swapped_indices = swap_loop_result[1];
 
@@ -272,9 +280,9 @@ class TruncatedNormalOp : public XlaOpKernel {
 
     xla::XlaBuilder* b = ctx->builder();
 
-    xla::XlaOp one = XlaHelpers::FloatLiteral(b, dtype, 1.0);
+    xla::XlaOp one = xla::One(b, xla_shape.element_type());
     xla::XlaOp min_positive =
-        XlaHelpers::FloatLiteral(b, dtype, std::numeric_limits<float>::min());
+        xla::MinPositiveNormalValue(b, xla_shape.element_type());
     auto uniform = xla::RngUniform(min_positive, one, xla_shape);
     ctx->SetOutput(0, TruncatedNormal(uniform));
   }
diff --git a/tensorflow/compiler/tf2xla/kernels/resampler_ops.cc b/tensorflow/compiler/tf2xla/kernels/resampler_ops.cc
index 54d34a38abc4948a1a08197d72e3e7f763649093..f9985d526033ca675c701a508a3d1576e46bc5f7 100644
--- a/tensorflow/compiler/tf2xla/kernels/resampler_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/resampler_ops.cc
@@ -125,7 +125,7 @@ XlaOp ConcatenateIota(xla::XlaBuilder* b, XlaOp indices,
   dimensions.back() = 1;
 
   auto batch_indices =
-      xla::Iota(b, xla::ShapeUtil::MakeShape(xla::U32, dimensions),
+      xla::Iota(b, xla::ShapeUtil::MakeShape(xla::S32, dimensions),
                 /*iota_dimension=*/0);
 
   return xla::ConcatInDim(b, {batch_indices, indices}, dimensions.size() - 1);
@@ -189,11 +189,53 @@ XlaOp ScatterToGradData(XlaOpKernelContext* ctx, XlaOp grad_data, XlaOp indices,
                       scatter_dim_numbers);
 }
 
+// Bounds samples to 0 if the warp image indices are out of the (-1, image_size)
+// bound.
+// The resulting dimension is given by 'result_dims'.
+XlaOp BoundSamples(XlaOpKernelContext* ctx, XlaOp warp,
+                   xla::PrimitiveType warp_type, TensorShape warp_shape,
+                   std::vector<int64> result_dims,
+                   std::vector<int64> broadcasted_dims, int64 last_warp_dim,
+                   xla::Shape data_shape, XlaOp sample) {
+  auto is_gt_minus_one =
+      xla::Gt(warp,
+              xla::ConvertElementType(
+                  xla::ConstantR1<float>(ctx->builder(), {-1, -1}), warp_type),
+              /*broadcast_dimensions=*/{warp_shape.dims() - 1});
+  auto is_lt_image_size = xla::Lt(
+      warp,
+      xla::ConvertElementType(
+          xla::ConstantR1<float>(
+              ctx->builder(),
+              {/*width=*/static_cast<float>(data_shape.dimensions(2)),
+               /*height=*/static_cast<float>(data_shape.dimensions(1))}),
+          warp_type),
+      /*broadcast_dimensions=*/{warp_shape.dims() - 1});
+
+  auto is_in_bound_padded_x_y = xla::And(is_gt_minus_one, is_lt_image_size);
+  // Reduce along last dimension. The resulting dimension is:
+  // [batch, dim_0, ...dim_n].
+  auto is_in_bound = xla::Reduce(
+      is_in_bound_padded_x_y, xla::ConstantR0<bool>(ctx->builder(), true),
+      xla::CreateScalarAndComputation(xla::PrimitiveType::PRED, ctx->builder()),
+      {last_warp_dim});
+
+  // Broadcast 'is_in_bound' to the same dimension as 'result_dims'.
+  auto broadcasted_is_in_bound =
+      xla::BroadcastInDim(is_in_bound, result_dims, broadcasted_dims);
+
+  // Set out of bound samples to zero.
+  auto zeros =
+      xla::Broadcast(xla::Zero(ctx->builder(), warp_type), result_dims);
+  return xla::Select(broadcasted_is_in_bound, sample, zeros);
+}
+
 // Build computation the backprop into input 'data'.
 // Where input:
 // grad_output is of dimension [batch, dim_0, ...dim_n, channel]
 // ratio is of dimension [batch, dim_0, ...dim_n, 2]
 // gather_indices is of dimension [batch, dim_0, ...dim_n, 3]
+// data_shape is of dimension [batch, x(width), y(height), channel]
 //
 // Output:
 // scatter-add to each 2x2 grad_data neighbor:
@@ -201,10 +243,12 @@ XlaOp ScatterToGradData(XlaOpKernelContext* ctx, XlaOp grad_data, XlaOp indices,
 //  grad_data[cx, fy, chan] += output_grad * (1 - dx) * dy
 //  grad_data[fx, cy, chan] += output_grad * dx * (1 - dy)
 //  grad_data[cx, cy, chan] += output_grad * (1 - dx) * (1 - dy)
-// where (dx, dy) is (1 - ratio).
+// where (dx, dy) is (1 - ratio). If (dx, dy) is out of bound, then the their
+// contribution is 0 to 'grad_data'.
 XlaOp CalculateGradData(XlaOpKernelContext* ctx, XlaOp grad_output, XlaOp ratio,
-                        XlaOp gather_indices, xla::PrimitiveType warp_type,
-                        TensorShape warp_shape, int64 data_channels,
+                        XlaOp gather_indices, XlaOp warp,
+                        xla::PrimitiveType warp_type, TensorShape warp_shape,
+                        int64 last_warp_dim, int64 data_channels,
                         xla::Shape data_shape) {
   // Weights tensor has dimension [batch, dim_0, ... dim_n, 4].
   auto weights = BilinearWeights(ctx, ratio, warp_shape, warp_type);
@@ -229,6 +273,18 @@ XlaOp CalculateGradData(XlaOpKernelContext* ctx, XlaOp grad_output, XlaOp ratio,
   std::iota(reshaped_weights_indices.begin(), reshaped_weights_indices.end(),
             0);
 
+  // Set out of bound weights to 0.
+  // The dimension of the reshaped_weight: [batch, dim_0, ...dim_n, 2, 2].
+  std::vector<int64> reshaped_result_dims(warp_dims.begin(),
+                                          warp_dims.end() - 1);
+  reshaped_result_dims.push_back(2);
+  reshaped_result_dims.push_back(2);
+  std::vector<int64> broadcasted_dims(warp_dims.size() - 1);
+  std::iota(broadcasted_dims.begin(), broadcasted_dims.end(), 0);
+  reshaped_weights = BoundSamples(ctx, warp, warp_type, warp_shape,
+                                  reshaped_result_dims, broadcasted_dims,
+                                  last_warp_dim, data_shape, reshaped_weights);
+
   // The dimension is [batch, dim_0, ..., dim_n, 2, 2, data_channel].
   auto broadcast_reshaped_weights = xla::BroadcastInDim(
       reshaped_weights, weights_with_channels_dims, reshaped_weights_indices);
@@ -245,18 +301,41 @@ XlaOp CalculateGradData(XlaOpKernelContext* ctx, XlaOp grad_output, XlaOp ratio,
   auto grad_data = xla::ConstantLiteral(
       ctx->builder(), xla::Literal::CreateFromShape(data_shape));
 
-  return ScatterToGradData(ctx, grad_data, gather_indices,
-                           grad_output_multiply_weights, warp_shape.dims(),
-                           warp_type);
+  // Pad grad data then slice it back.
+  //
+  // After left and right column 0-padding, the new dimension of padded data
+  // will be [batch, x+2, y+2, channel].
+  auto padded_grad_data =
+      xla::Pad(grad_data, xla::Zero(ctx->builder(), warp_type),
+               xla::MakeEdgePaddingConfig({{0, 0}, {1, 1}, {1, 1}, {0, 0}}));
+
+  auto shifting_value = xla::ConstantR1<int32>(
+      ctx->builder(), {/*batch=*/0, /*x(width)=*/1, /*y(height)=*/1});
+  auto shifted_gather_indices =
+      xla::Add(gather_indices, shifting_value, {last_warp_dim});
+
+  auto updated_grad_data = ScatterToGradData(
+      ctx, padded_grad_data, shifted_gather_indices,
+      grad_output_multiply_weights, warp_shape.dims(), warp_type);
+
+  const int64 batch_size = data_shape.dimensions(0);
+  const int64 width = data_shape.dimensions(1);
+  const int64 height = data_shape.dimensions(2);
+  // Slice out the result accounting for the padding.
+  return xla::Slice(
+      updated_grad_data, /*start_indices=*/{0, 1, 1, 0},
+      /*limit_indices=*/{batch_size, width + 1, height + 1, data_channels},
+      /*strides=*/{1, 1, 1, 1});
 }
 
 // Build computation for the backprop into input 'warp'.
 // Where input:
-// warp is of dimension [batch, dim_0, ...dim_n, 2]
-// grad_output is of dimension [batch, dim_0, ...dim_n, channel]
-// ratio is of dimension [batch, dim_0, ...dim_n, 2]
-// gather_indices is of dimension [batch, dim_0, ...dim_n, 3]
-// data is of dimension [batch, x, y, channel]
+//  warp is of dimension [batch, dim_0, ...dim_n, 2]
+//  grad_output is of dimension [batch, dim_0, ...dim_n, channel]
+//  ratio is of dimension [batch, dim_0, ...dim_n, 2]
+//  gather_indices is of dimension [batch, dim_0, ...dim_n, 3] where the last
+//  dimension of size 3 is for {batch, x(width), y(height)}.
+//  data is of dimension [batch, x, y, channel]
 //
 // Output (simplified by ignoring the batch dimensions):
 // Since the forward path has:
@@ -275,12 +354,12 @@ XlaOp CalculateGradData(XlaOpKernelContext* ctx, XlaOp grad_output, XlaOp ratio,
 //    grad_warp_x = py * (img_cxcy - img_fxcy) + (1-py) * (img_cxfy-img_fxfy)
 //    grad_warp_y = px * (img_cxcy - img_cxfy) + (1-px) * (img_fxcy-img_fxfy)
 //
-// where (px, py) is warp, (fx, fy) is the left top corner and (cx, cy) is the
+// where (px, py) is warp, (fx, fy) is the top left corner and (cx, cy) is the
 // bottom right corner in a 2x2 neighborhood.
 XlaOp CalculateGradWarp(XlaOpKernelContext* ctx, XlaOp grad_output, XlaOp ratio,
                         XlaOp gather_indices, XlaOp data,
                         TensorShape warp_shape, int64 data_channels,
-                        xla::PrimitiveType data_type) {
+                        xla::PrimitiveType data_type, xla::Shape data_shape) {
   auto warp_dims = warp_shape.dim_sizes();
   std::vector<int64> warp_dims_without_last_dims(warp_dims.begin(),
                                                  warp_dims.end() - 1);
@@ -289,12 +368,30 @@ XlaOp CalculateGradWarp(XlaOpKernelContext* ctx, XlaOp grad_output, XlaOp ratio,
   std::vector<int64> neighbor_broadcast_dims = warp_dims_without_last_dims;
   neighbor_broadcast_dims.push_back(4);
 
-  // The dimension is [batch, dim_0, ... dim_n, 4, data_channels]
-  auto neighbors_data = Gather2by2Neighbors(
-      ctx->builder(), data, gather_indices, data_channels, warp_shape.dims());
+  // With dimension [batch, dim_0, ...dim_n, 4]
+  auto neighbor_broadcast_shape =
+      xla::ShapeUtil::MakeShape(data_type, neighbor_broadcast_dims);
 
   const int64 last_warp_dim = warp_shape.dims() - 1;
 
+  // Pad data with 0, before gathering such that 0 will be returned for samples
+  // in the range of (-1, 0) or (image_dimension-1, image_dimension).
+  // After left and right column 0-padding, the new dimension of padded data
+  // will be [batch, x+2, y+2, channel].
+  auto padded_data =
+      xla::Pad(data, xla::Zero(ctx->builder(), data_type),
+               xla::MakeEdgePaddingConfig({{0, 0}, {1, 1}, {1, 1}, {0, 0}}));
+
+  auto shifting_value = xla::ConstantR1<int32>(
+      ctx->builder(), {/*batch=*/0, /*x(width)=*/1, /*y(height)=*/1});
+  auto shifted_gather_indices =
+      xla::Add(gather_indices, shifting_value, {last_warp_dim});
+
+  // The dimension is [batch, dim_0, ... dim_n, 4, data_channels]
+  auto neighbors_data =
+      Gather2by2Neighbors(ctx->builder(), padded_data, shifted_gather_indices,
+                          data_channels, warp_shape.dims());
+
   // Since we will be creating the dot product of:
   //  lhs: [batch, dim_0, ...dim_n, 4]
   // and
@@ -417,7 +514,7 @@ class ResamplerOp : public XlaOpKernel {
     // Find the coordinates of the top left corner for the 2x2 region to be
     // sampled from. The dimensions are [batch, dim_0, ... dim_n, 2] where the
     // last dimension of size 2 in turn is [x, y].
-    XlaOp top_left = xla::ConvertElementType(warp, xla::U32);
+    XlaOp top_left = xla::ConvertElementType(warp, xla::S32);
 
     auto gather_indices = ConcatenateIota(ctx->builder(), top_left, warp_shape);
 
@@ -526,7 +623,8 @@ class ResamplerGradOp : public XlaOpKernel {
                                           size, "]"));
     }
     // Last dimension of warp shape must be of size 2.
-    OP_REQUIRES(ctx, warp_shape.dim_size(warp_shape.dims() - 1) == 2,
+    const int64 last_warp_dim = warp_shape.dims() - 1;
+    OP_REQUIRES(ctx, warp_shape.dim_size(last_warp_dim) == 2,
                 errors::InvalidArgument(
                     "the last dimension of warp must be exactly size 2."));
     xla::PrimitiveType warp_type = ctx->input_xla_type(1);
@@ -549,24 +647,32 @@ class ResamplerGradOp : public XlaOpKernel {
     // Find the top left corner coordinate for the region to be sampled from.
     // The dimensions are [batch, dim_0, ... dim_n, 2] where the last dimension
     // of size 2 in turn is [x, y].
-    XlaOp top_left = xla::ConvertElementType(warp, xla::U32);
+    XlaOp top_left = xla::ConvertElementType(xla::Floor(warp), xla::S32);
 
-    // Dimensions are [batch, dim_0, ... dim_n, 2]
+    // Dimensions are [batch, dim_0, ... dim_n, 2].
     XlaOp ratio = warp - xla::ConvertElementType(top_left, warp_type);
 
     // Indices for gathering neighboring pixels.
     auto gather_indices = ConcatenateIota(ctx->builder(), top_left, warp_shape);
 
-    auto grad_data =
-        CalculateGradData(ctx, grad_output, ratio, gather_indices, warp_type,
-                          warp_shape, data_channels, data_shape);
+    auto grad_data = CalculateGradData(
+        ctx, grad_output, ratio, gather_indices, warp, warp_type, warp_shape,
+        last_warp_dim, data_channels, data_shape);
 
     auto grad_warp =
         CalculateGradWarp(ctx, grad_output, ratio, gather_indices, data,
-                          warp_shape, data_channels, data_type);
+                          warp_shape, data_channels, data_type, data_shape);
+    auto warp_dims = warp_shape.dim_sizes();
+    std::vector<int64> result_dims(warp_dims.begin(), warp_dims.end() - 1);
+    result_dims.push_back(2);
+    std::vector<int64> broadcasted_dims(warp_dims.size() - 1);
+    std::iota(broadcasted_dims.begin(), broadcasted_dims.end(), 0);
+    auto grad_warp_bounded =
+        BoundSamples(ctx, warp, warp_type, warp_shape, result_dims,
+                     broadcasted_dims, last_warp_dim, data_shape, grad_warp);
 
     ctx->SetOutput(0, grad_data);
-    ctx->SetOutput(1, grad_warp);
+    ctx->SetOutput(1, grad_warp_bounded);
   }
 };
 
diff --git a/tensorflow/compiler/tf2xla/kernels/retval_op.cc b/tensorflow/compiler/tf2xla/kernels/retval_op.cc
index e4046c795577983bff1a8053743bf4d3a258e583..1f417037284c87753b219ea5ce1d4edce0ce6336 100644
--- a/tensorflow/compiler/tf2xla/kernels/retval_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/retval_op.cc
@@ -37,10 +37,14 @@ class RetvalOp : public XlaOpKernel {
   void Compile(XlaOpKernelContext* ctx) override {
     const Tensor& input = ctx->op_kernel_context()->input(0);
 
-    OP_REQUIRES(ctx, input.dtype() == dtype_,
-                errors::InvalidArgument(
-                    "Type mismatch: actual ", DataTypeString(input.dtype()),
-                    " vs. expect ", DataTypeString(dtype_)));
+    // DT_VARIANT types represent Tensor Lists and are wrapped in a DT_UINT8
+    // tensor so we skip the check here.
+    if (dtype_ != DT_VARIANT) {
+      OP_REQUIRES(ctx, input.dtype() == dtype_,
+                  errors::InvalidArgument(
+                      "Type mismatch: actual ", DataTypeString(input.dtype()),
+                      " vs. expect ", DataTypeString(dtype_)));
+    }
     auto frame = ctx->call_frame();
     if (frame) {
       // If 'frame' is non-null, this is an inner function call inside a JIT
@@ -59,8 +63,9 @@ class RetvalOp : public XlaOpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(RetvalOp);
 };
 
-REGISTER_XLA_OP(Name("_Retval").AllowResourceTypes().CompilationOnly(),
-                RetvalOp);
+REGISTER_XLA_OP(
+    Name("_Retval").AllowResourceTypes().AllowVariantTypes().CompilationOnly(),
+    RetvalOp);
 
 }  // anonymous namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/scan_ops.cc b/tensorflow/compiler/tf2xla/kernels/scan_ops.cc
index 4b9e1a578be2445091228953df7e5c5e82b42c28..daefdfc58a4957d9e685d25aa90da6218f2041ad 100644
--- a/tensorflow/compiler/tf2xla/kernels/scan_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/scan_ops.cc
@@ -23,13 +23,13 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc b/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc
index a95e7adacf194ba6eb33cbeb56abe1a5a2479337..a1c18bed3f94008af8038f32324c79aa5b2abded 100644
--- a/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc
@@ -110,10 +110,16 @@ class ScatterNdOp : public XlaOpKernel {
     auto updates = context->Input(1);
     auto result =
         XlaScatter(buffer, updates, indices,
-                   /*indices_are_vectors=*/true, /*combiner=*/{}, builder);
+                   /*indices_are_vectors=*/true, /*combiner=*/Combine, builder);
     OP_REQUIRES_OK(context, result.status());
     context->SetOutput(0, result.ValueOrDie());
   }
+
+ private:
+  static xla::XlaOp Combine(const xla::XlaOp& x, const xla::XlaOp& y,
+                            xla::XlaBuilder* builder) {
+    return xla::Add(x, y);
+  }
 };
 
 REGISTER_XLA_OP(Name("ScatterNd").CompileTimeConstantInput("shape"),
diff --git a/tensorflow/compiler/tf2xla/kernels/select_op.cc b/tensorflow/compiler/tf2xla/kernels/select_op.cc
index 9e4c57c9bf73369662274f6b783418e18ff860c2..aaf8c6075dd292e33e70683774a6c1bf374183e3 100644
--- a/tensorflow/compiler/tf2xla/kernels/select_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/select_op.cc
@@ -20,8 +20,8 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 
 namespace tensorflow {
 namespace {
diff --git a/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc b/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc
index b1fa2915d59e4e5e2f2523e20e9a37898d087117..7a620d2a6518f8686ef570b33aac971d1dccb6c1 100644
--- a/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc
@@ -157,9 +157,11 @@ class LinSpaceOp : public XlaOpKernel {
           flat(0) = start;
         } else {
           const float step = (stop - start) / (num - 1);
-          for (int64 i = 0; i < num; ++i) {
+          for (int64 i = 0; i < num - 1; ++i) {
             flat(i) = start + step * i;
           }
+          // The last value in the sequence must be equal to stop.
+          flat(num - 1) = stop;
         }
         break;
       }
@@ -171,9 +173,11 @@ class LinSpaceOp : public XlaOpKernel {
           flat(0) = start;
         } else {
           const double step = (stop - start) / (num - 1);
-          for (int64 i = 0; i < num; ++i) {
+          for (int64 i = 0; i < num - 1; ++i) {
             flat(i) = start + step * i;
           }
+          // The last value in the sequence must be equal to stop.
+          flat(num - 1) = stop;
         }
         break;
       }
diff --git a/tensorflow/compiler/tf2xla/kernels/shape_op.cc b/tensorflow/compiler/tf2xla/kernels/shape_op.cc
index 12830816ec16c9797f0fe4d8f3f13f5a8176161d..280b68383c28d1b9d88f7b2ac0f8fab47244c05d 100644
--- a/tensorflow/compiler/tf2xla/kernels/shape_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/shape_op.cc
@@ -20,10 +20,11 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 
 namespace tensorflow {
 namespace {
@@ -91,14 +92,20 @@ class SizeOp : public XlaOpKernel {
 
   void Compile(XlaOpKernelContext* ctx) override {
     const TensorShape input_shape = ctx->InputShape(0);
-    const int64 size = input_shape.num_elements();
-    OP_REQUIRES(ctx, FastBoundsCheck(size, std::numeric_limits<int32>::max()),
+    OP_REQUIRES(ctx,
+                FastBoundsCheck(input_shape.num_elements(),
+                                std::numeric_limits<int32>::max()),
                 errors::InvalidArgument("Size does not work for tensors > "
                                         "int32 max."));
     Tensor size_constant(DT_INT32, TensorShape({}));
-    size_constant.scalar<int32>()() = static_cast<int32>(size);
-
-    ctx->SetConstantOutput(0, size_constant);
+    const int rank = input_shape.dims();
+    xla::XlaBuilder* builder = ctx->builder();
+    auto size = xla::One(builder, xla::U32);
+    for (int64 i = 0; i < rank; ++i) {
+      size = xla::Mul(size, xla::GetDimensionSize(ctx->Input(0), i));
+    }
+    size = xla::ConvertElementType(size, ctx->output_xla_type(0));
+    ctx->SetOutput(0, size);
   }
 };
 
diff --git a/tensorflow/compiler/tf2xla/kernels/shape_util.cc b/tensorflow/compiler/tf2xla/kernels/shape_util.cc
index 76ea5f525598f511f295eb5a30f3cf603fbf57aa..b18e3f965c427aec456ce2b188dad79485df23cc 100644
--- a/tensorflow/compiler/tf2xla/kernels/shape_util.cc
+++ b/tensorflow/compiler/tf2xla/kernels/shape_util.cc
@@ -17,7 +17,7 @@ limitations under the License.
 
 #include <limits>
 
-#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/framework/bounds_check.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc b/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc
index 622efac81766fc3ddaf538b58170f34fce06927a..52bed2670b4b8408e3b2f72b64bf370aea5325f6 100644
--- a/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc
@@ -39,7 +39,7 @@ void SpaceToBatch(XlaOpKernelContext* ctx, const xla::XlaOp& input,
 
   OP_REQUIRES(
       ctx,
-      xla::ShapeUtil::Rank(paddings.shape()) == 2 &&
+      paddings.shape().rank() == 2 &&
           block_rank == xla::ShapeUtil::GetDimension(paddings.shape(), 0) &&
           2 == xla::ShapeUtil::GetDimension(paddings.shape(), 1),
       errors::InvalidArgument("paddings should have shape [", block_rank,
diff --git a/tensorflow/compiler/tf2xla/kernels/stack_ops.cc b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc
index 8e9e4daf99d3dd3b8e149e3f3e5f6c27665c0fcb..b6c96b1f582710e1cc39e6e1e0e800ef8170743d 100644
--- a/tensorflow/compiler/tf2xla/kernels/stack_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc
@@ -24,13 +24,13 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
@@ -45,7 +45,7 @@ Status GetStackShape(xla::XlaBuilder* builder, XlaResource* resource,
     return shape_or_status.status();
   }
   xla::Shape shape = shape_or_status.ValueOrDie();
-  TF_RET_CHECK(xla::ShapeUtil::IsTuple(shape));
+  TF_RET_CHECK(shape.IsTuple());
   return XLAShapeToTensorShape(xla::ShapeUtil::GetTupleElementShape(shape, 0),
                                stack_shape);
 }
@@ -146,9 +146,9 @@ class StackPushOp : public XlaOpKernel {
     xla::XlaOp value = ctx->Input(1);
 
     // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0].
-    auto start_indices =
-        xla::Pad(xla::Reshape(index, {1}), xla::ConstantR0<int32>(b, 0),
-                 xla::MakeEdgePaddingConfig({{0, elem_shape.dims()}}));
+    std::vector<xla::XlaOp> start_indices(elem_shape.dims() + 1,
+                                          xla::ConstantR0<int32>(b, 0));
+    start_indices[0] = index;
 
     TensorShape slice_shape = elem_shape;
     slice_shape.InsertDim(0, 1LL);
@@ -202,9 +202,9 @@ class StackPopOp : public XlaOpKernel {
     OP_REQUIRES_OK(ctx, resource->SetValue(xla::Tuple(b, {ta, index})));
 
     // start_indices of the DynamicSlice are [index, 0, 0, ..., 0].
-    auto start_indices =
-        xla::Pad(xla::Reshape(index, {1}), xla::ConstantR0<int32>(b, 0),
-                 xla::MakeEdgePaddingConfig({{0, stack_shape.dims() - 1}}));
+    std::vector<xla::XlaOp> start_indices(stack_shape.dims(),
+                                          xla::ConstantR0<int32>(b, 0));
+    start_indices[0] = index;
 
     auto slice_shape = stack_shape.dim_sizes();
     slice_shape[0] = 1LL;
diff --git a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
index 50653d7b3973b73d580cdeec5d71943b575d7cc9..17f067e0dfcf4f8b360ee6db934df3e373d5fdd1 100644
--- a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
@@ -218,8 +218,8 @@ class StatelessTruncatedNormalOp : public XlaOpKernel {
     OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(DT_FLOAT, shape, &xla_shape));
     auto uniform = xla::StatelessRngUniform(
         {seed0, seed1}, xla_shape,
-        xla::ConstantR0<float>(builder, std::numeric_limits<float>::min()),
-        xla::ConstantR0<float>(builder, 1.0));
+        xla::MinPositiveNormalValue(builder, xla_shape.element_type()),
+        xla::One(builder, xla_shape.element_type()));
     auto output = TruncatedNormal(uniform);
     output = MaybeConvertF32ToBF16(output, dtype_);
     ctx->SetOutput(0, output);
diff --git a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
index 10d990b3213ab882cf44a4df20a977633de3fdab..e8846fbe88fa2a75244398ef0f601fd74e80ec50 100644
--- a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
@@ -288,19 +288,21 @@ class StridedSliceAssignOp : public XlaOpKernel {
     xla::XlaOp rhs = ctx->Input(4);
 
     absl::InlinedVector<int64, 4> dimensions_to_reverse;
-    absl::InlinedVector<int64, 4> slice_begin, slice_dims;
+    absl::InlinedVector<xla::XlaOp, 4> slice_begin;
+    absl::InlinedVector<int64, 4> slice_dims;
     for (int i = 0; i < begin.size(); ++i) {
-      // TODO(phawkins): implement strides != 1
+      // TODO(b/121179231): implement strides != 1
       OP_REQUIRES(
           ctx, strides[i] == 1 || strides[i] == -1,
           errors::Unimplemented("Strides != 1 or -1 are not yet implemented"));
       if (strides[i] > 0) {
-        slice_begin.push_back(begin[i]);
+        slice_begin.push_back(xla::ConstantR0<int64>(ctx->builder(), begin[i]));
         slice_dims.push_back(end[i] - begin[i]);
       } else {
         // Negative stride: swap begin and end, add 1 because the interval
         // is semi-open, and mark the dimension to be reversed.
-        slice_begin.push_back(end[i] + 1);
+        slice_begin.push_back(
+            xla::ConstantR0<int64>(ctx->builder(), end[i] + 1));
         slice_dims.push_back(begin[i] - end[i]);
         dimensions_to_reverse.push_back(i);
       }
@@ -311,14 +313,7 @@ class StridedSliceAssignOp : public XlaOpKernel {
     }
     rhs = xla::Reshape(rhs, slice_dims);
 
-    if (lhs_shape.dims() == 0) {
-      // TODO(b/38323843): DynamicUpdateSlice crashes on rank 0 inputs. Fix
-      // and remove this workaround.
-      lhs = rhs;
-    } else {
-      lhs = xla::DynamicUpdateSlice(
-          lhs, rhs, xla::ConstantR1<int64>(ctx->builder(), slice_begin));
-    }
+    lhs = xla::DynamicUpdateSlice(lhs, rhs, slice_begin);
 
     OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, dtype_, lhs));
   }
diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
index 939d7e19515a1cb41e3e23e9d1fa957ae09ecab7..77a3e5c001e1c715f23ae5148f94dae2faa81acf 100644
--- a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
@@ -27,13 +27,13 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_resource.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
@@ -123,7 +123,8 @@ Status GetTensorArrayShape(const XlaResource* resource,
 xla::XlaOp DynamicAddSlice(xla::XlaBuilder* builder, const xla::XlaOp& operand,
                            const xla::XlaOp& update,
                            absl::Span<const int64> update_dims,
-                           const xla::XlaOp& start_indices, DataType dtype) {
+                           absl::Span<const xla::XlaOp> start_indices,
+                           DataType dtype) {
   xla::XlaOp current = xla::DynamicSlice(operand, start_indices, update_dims);
   xla::XlaOp sum =
       dtype == DT_BOOL ? xla::Or(current, update) : xla::Add(current, update);
@@ -212,9 +213,9 @@ class TensorArrayWriteOp : public XlaOpKernel {
     xla::XlaOp flow = ctx->Input(3);
 
     // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0].
-    auto start_indices =
-        xla::Pad(xla::Reshape(index, {1}), xla::ConstantR0<int32>(b, 0),
-                 xla::MakeEdgePaddingConfig({{0, elem_shape.dims()}}));
+    std::vector<xla::XlaOp> start_indices(elem_shape.dims() + 1,
+                                          xla::ConstantR0<int32>(b, 0));
+    start_indices[0] = index;
 
     TensorShape slice_shape = elem_shape;
     slice_shape.InsertDim(0, 1LL);
@@ -263,9 +264,9 @@ class TensorArrayReadOp : public XlaOpKernel {
     xla::XlaOp index = ctx->Input(1);
 
     // start_indices of the DynamicSlice are [index, 0, 0, ..., 0].
-    auto start_indices =
-        xla::Pad(xla::Reshape(index, {1}), xla::ConstantR0<int32>(b, 0),
-                 xla::MakeEdgePaddingConfig({{0, ta_shape.dims() - 1}}));
+    std::vector<xla::XlaOp> start_indices(ta_shape.dims(),
+                                          xla::ConstantR0<int32>(b, 0));
+    start_indices[0] = index;
 
     auto slice_shape = ta_shape.dim_sizes();
     slice_shape[0] = 1LL;
@@ -419,10 +420,10 @@ class TensorArrayScatterOp : public XlaOpKernel {
         auto slice = xla::Slice(value, value_starts, value_ends, value_strides);
 
         // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0].
-        auto index = xla::Slice(indices, {i}, {i + 1}, {1});
-        auto start_indices =
-            xla::Pad(xla::Reshape(index, {1}), xla::ConstantR0<int32>(b, 0),
-                     xla::MakeEdgePaddingConfig({{0, elem_shape.dims()}}));
+        auto index = xla::Reshape(xla::Slice(indices, {i}, {i + 1}, {1}), {});
+        std::vector<xla::XlaOp> start_indices(elem_shape.dims() + 1,
+                                              xla::ConstantR0<int32>(b, 0));
+        start_indices[0] = index;
         ta = DynamicAddSlice(b, ta, slice, slice_dims, start_indices, dtype_);
       }
     }
diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc
index 64a24703ae1460abfedb6d9298e1e164076a199a..8958a48bc79dce91c41ab7d0a5fc0fbb401112ba 100644
--- a/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc
@@ -14,6 +14,9 @@ limitations under the License.
 ==============================================================================*/
 
 // XLA TensorList operators.
+// Tensor lists are represented as tuple consisting of a pre-allocated list
+// consisting of the tensors (and where dim 0 is the list index), along with a
+// scalar telling us the current number of elements.
 
 #include <limits>
 #include <vector>
@@ -23,15 +26,17 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/concat_lib.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -45,11 +50,64 @@ Status GetTensorListShape(xla::XlaBuilder* builder, xla::XlaOp op,
     return shape_or_status.status();
   }
   xla::Shape shape = shape_or_status.ValueOrDie();
-  TF_RET_CHECK(xla::ShapeUtil::IsTuple(shape));
+  TF_RET_CHECK(shape.IsTuple());
   return XLAShapeToTensorShape(xla::ShapeUtil::GetTupleElementShape(shape, 0),
                                tensor_list_shape);
 }
 
+class TensorListLengthOp : public XlaOpKernel {
+ public:
+  explicit TensorListLengthOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::XlaOp tl = ctx->Input(0);
+    xla::XlaOp index = xla::GetTupleElement(tl, 1);
+    ctx->SetOutput(0, index);
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(TensorListLengthOp);
+};
+
+REGISTER_XLA_OP(Name("TensorListLength"), TensorListLengthOp);
+
+// Creates an empty list with size (leading_dim, *element_shape) if
+// element_shape is known at compile time. Otherwise creates one with size
+// (leading_dim, 0) which gets initialized later in `GetInitializedList`.
+Status CreateZerosList(XlaOpKernelContext* ctx, int element_shape_index,
+                       int64 leading_dim, DataType dtype, xla::XlaOp* list) {
+  TensorShape list_shape;
+  list_shape.AddDim(leading_dim);
+  xla::XlaOp element_shape_handle = ctx->Input(element_shape_index);
+  TF_ASSIGN_OR_RETURN(
+      bool is_element_shape_compile_time_const,
+      element_shape_handle.builder()->IsConstant(element_shape_handle));
+  PartialTensorShape partial_element_shape;
+  if (is_element_shape_compile_time_const) {
+    TF_RETURN_IF_ERROR(ctx->ConstantInputAsPartialShape(
+        element_shape_index, &partial_element_shape));
+  }
+  if (is_element_shape_compile_time_const &&
+      partial_element_shape.IsFullyDefined()) {
+    TensorShape element_shape;
+    partial_element_shape.AsTensorShape(&element_shape);
+    list_shape.AppendShape(element_shape);
+  } else {
+    // If element_shape is not a compile time constant or if it is not fully
+    // defined we will have to wait for the first write call to fully allocate
+    // the array.
+    // TODO(srbs): We are using element_shape of [0] as a proxy to denote an
+    // uninitialized list. A better implementation may be to represent the
+    // list as a 3-tuple containining an explicit "initialized" flag. However,
+    // we would still need to create a dummy tensor for the first tuple
+    // element.
+    list_shape.AddDim(0);
+  }
+  *list = xla::Broadcast(XlaHelpers::Zero(ctx->builder(), dtype),
+                         list_shape.dim_sizes());
+  return Status::OK();
+}
+
 class TensorListReserveOp : public XlaOpKernel {
  public:
   explicit TensorListReserveOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
@@ -57,19 +115,15 @@ class TensorListReserveOp : public XlaOpKernel {
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
-    TensorShape element_shape;
-    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(0, &element_shape));
     int64 num_elements;
     OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(1, &num_elements));
 
-    TensorShape tensor_shape;
-    tensor_shape.AddDim(num_elements);
-    tensor_shape.AppendShape(element_shape);
+    xla::XlaOp list;
+    OP_REQUIRES_OK(ctx, CreateZerosList(ctx, 0, num_elements, dtype_, &list));
 
     xla::XlaBuilder* b = ctx->builder();
-    ctx->SetOutput(0, xla::Tuple(b, {xla::Broadcast(XlaHelpers::Zero(b, dtype_),
-                                                    tensor_shape.dim_sizes()),
-                                     xla::ConstantR0<int32>(b, 0)}));
+    ctx->SetTensorListOutput(
+        0, xla::Tuple(b, {list, xla::ConstantR0<int32>(b, num_elements)}));
   }
 
  private:
@@ -85,19 +139,37 @@ REGISTER_XLA_OP(Name("TensorListReserve")
 
 class EmptyTensorListOp : public XlaOpKernel {
  public:
-  explicit EmptyTensorListOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
+  explicit EmptyTensorListOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("element_dtype", &dtype_));
+  }
 
   void Compile(XlaOpKernelContext* ctx) override {
-    ctx->CtxFailure(
+    int64 max_num_elements;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(1, &max_num_elements));
+    OP_REQUIRES(
+        ctx, max_num_elements >= 0,
         errors::InvalidArgument("XLA compilation requires a fixed tensor list "
-                                "size. Use TensorListReserve instead."));
+                                "size. Set the max number of elements."));
+
+    xla::XlaOp list;
+    OP_REQUIRES_OK(ctx,
+                   CreateZerosList(ctx, 0, max_num_elements, dtype_, &list));
+
+    xla::XlaBuilder* b = ctx->builder();
+    ctx->SetTensorListOutput(
+        0, xla::Tuple(b, {list, xla::ConstantR0<int32>(b, 0)}));
   }
 
  private:
+  DataType dtype_;
+
   TF_DISALLOW_COPY_AND_ASSIGN(EmptyTensorListOp);
 };
 
-REGISTER_XLA_OP(Name("EmptyTensorList"), EmptyTensorListOp);
+REGISTER_XLA_OP(Name("EmptyTensorList")
+                    .CompileTimeConstantInput("element_shape")
+                    .CompileTimeConstantInput("max_num_elements"),
+                EmptyTensorListOp);
 
 class TensorListElementShapeOp : public XlaOpKernel {
  public:
@@ -139,6 +211,168 @@ class TensorListElementShapeOp : public XlaOpKernel {
 
 REGISTER_XLA_OP(Name("TensorListElementShape"), TensorListElementShapeOp);
 
+class TensorListGetItemOp : public XlaOpKernel {
+ public:
+  explicit TensorListGetItemOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("element_dtype", &dtype_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::XlaBuilder* b = ctx->builder();
+    xla::XlaOp state = ctx->Input(0);
+
+    TensorShape shape;
+    OP_REQUIRES_OK(ctx, GetTensorListShape(b, state, &shape));
+
+    xla::XlaOp ta = xla::GetTupleElement(state, 0);
+    xla::XlaOp index = ctx->Input(1);
+
+    // start_indices of the DynamicSlice are [index, 0, 0, ..., 0].
+    std::vector<xla::XlaOp> start_indices(shape.dims(),
+                                          xla::ConstantR0<int32>(b, 0));
+    start_indices[0] = index;
+    auto slice_shape = shape.dim_sizes();
+    slice_shape[0] = 1LL;
+
+    xla::XlaOp read = xla::DynamicSlice(ta, start_indices, slice_shape);
+    // Remove the leading '1' dimension.
+    std::vector<int64> value_shape(slice_shape.begin() + 1, slice_shape.end());
+
+    ctx->SetOutput(0, xla::Reshape(read, value_shape));
+  }
+
+ private:
+  DataType dtype_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(TensorListGetItemOp);
+};
+
+REGISTER_XLA_OP(Name("TensorListGetItem"), TensorListGetItemOp);
+
+class TensorListStackOp : public XlaOpKernel {
+ public:
+  explicit TensorListStackOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("element_dtype", &dtype_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::XlaOp state = ctx->Input(0);
+    xla::XlaOp ta = xla::GetTupleElement(state, 0);
+    ctx->SetOutput(0, ta);
+  }
+
+ private:
+  DataType dtype_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(TensorListStackOp);
+};
+
+REGISTER_XLA_OP(Name("TensorListStack"), TensorListStackOp);
+
+class TensorListFromTensorOp : public XlaOpKernel {
+ public:
+  explicit TensorListFromTensorOp(OpKernelConstruction* ctx)
+      : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("element_dtype", &dtype_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    TensorShape element_shape;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(1, &element_shape));
+
+    const TensorShape tensor_shape = ctx->InputShape(0);
+    OP_REQUIRES(ctx, tensor_shape.dims() > 0,
+                errors::InvalidArgument("Input value must be at least a "
+                                        "vector but received shape: ",
+                                        tensor_shape.DebugString()));
+    const int num_elements = tensor_shape.dim_size(0);
+
+    xla::XlaBuilder* b = ctx->builder();
+    const xla::XlaOp tensor = ctx->Input(0);
+
+    ctx->SetTensorListOutput(
+        0, xla::Tuple(b, {tensor, xla::ConstantR0<int32>(b, num_elements)}));
+  }
+
+ private:
+  DataType dtype_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(TensorListFromTensorOp);
+};
+
+REGISTER_XLA_OP(
+    Name("TensorListFromTensor").CompileTimeConstantInput("element_shape"),
+    TensorListFromTensorOp);
+
+// Returns the 0'th element of `tuple` containing the list tensor if it has been
+// initialized already else creates one lazily. This allows lazy initialization
+// of the list on the first call to SetItem or PushBack.
+Status GetInitializedList(XlaOpKernelContext* ctx, const xla::XlaOp& tuple,
+                          const TensorShape& element_shape, DataType dtype,
+                          xla::XlaOp* list) {
+  *list = xla::GetTupleElement(tuple, 0);
+  TensorShape list_shape;
+  TF_RETURN_IF_ERROR(GetTensorListShape(ctx->builder(), tuple, &list_shape));
+  int64 leading_dim = list_shape.dim_size(0);
+  TensorShape list_element_shape = list_shape;
+  list_element_shape.RemoveDim(0);
+  // This checks for the lazy initialization contract set by CreateEmptyList.
+  // In TensorListReserve if the element_shape is not known at compile time,
+  // it creates a list with shape [leading_dim, 0].
+  if (element_shape != list_element_shape) {
+    if (list_element_shape.num_elements() != 0) {
+      return errors::InvalidArgument(
+          "Invalid shape of value in TensorListSetItem. Expected: ",
+          list_element_shape.DebugString(),
+          " Actual: ", element_shape.DebugString());
+    }
+    list_shape = element_shape;
+    list_shape.InsertDim(0, leading_dim);
+    *list = xla::Broadcast(XlaHelpers::Zero(ctx->builder(), dtype),
+                           list_shape.dim_sizes());
+  }
+  return Status::OK();
+}
+
+class TensorListSetItemOp : public XlaOpKernel {
+ public:
+  explicit TensorListSetItemOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("element_dtype", &dtype_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::XlaBuilder* b = ctx->builder();
+    xla::XlaOp tl = ctx->Input(0);
+    TensorShape elem_shape = ctx->InputShape(2);
+
+    xla::XlaOp list;
+    OP_REQUIRES_OK(ctx, GetInitializedList(ctx, tl, elem_shape, dtype_, &list));
+
+    xla::XlaOp index = ctx->Input(1);
+    xla::XlaOp value = ctx->Input(2);
+
+    // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0].
+    std::vector<xla::XlaOp> start_indices(elem_shape.dims() + 1,
+                                          xla::ConstantR0<int32>(b, 0));
+    start_indices[0] = index;
+
+    TensorShape slice_shape = elem_shape;
+    slice_shape.InsertDim(0, 1LL);
+    auto update = xla::Reshape(value, slice_shape.dim_sizes());
+
+    ctx->SetTensorListOutput(
+        0, xla::Tuple(b, {xla::DynamicUpdateSlice(list, update, start_indices),
+                          xla::GetTupleElement(tl, 1)}));
+  }
+
+ private:
+  DataType dtype_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(TensorListSetItemOp);
+};
+
+REGISTER_XLA_OP(Name("TensorListSetItem"), TensorListSetItemOp);
+
 class TensorListPushBackOp : public XlaOpKernel {
  public:
   explicit TensorListPushBackOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
@@ -147,26 +381,27 @@ class TensorListPushBackOp : public XlaOpKernel {
 
   void Compile(XlaOpKernelContext* ctx) override {
     xla::XlaBuilder* b = ctx->builder();
-    xla::XlaOp list = ctx->Input(0);
+    xla::XlaOp list_tuple = ctx->Input(0);
     TensorShape elem_shape = ctx->InputShape(1);
 
-    xla::XlaOp ta = xla::GetTupleElement(list, 0);
-    xla::XlaOp index = xla::GetTupleElement(list, 1);
+    xla::XlaOp list;
+    OP_REQUIRES_OK(
+        ctx, GetInitializedList(ctx, list_tuple, elem_shape, dtype_, &list));
+
+    xla::XlaOp index = xla::GetTupleElement(list_tuple, 1);
     xla::XlaOp value = ctx->Input(1);
 
     // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0].
-    auto start_indices =
-        xla::Pad(xla::Reshape(index, {1}), xla::ConstantR0<int32>(b, 0),
-                 xla::MakeEdgePaddingConfig({{0, elem_shape.dims()}}));
+    std::vector<xla::XlaOp> start_indices(elem_shape.dims() + 1,
+                                          xla::ConstantR0<int32>(b, 0));
+    start_indices[0] = index;
 
     TensorShape slice_shape = elem_shape;
     slice_shape.InsertDim(0, 1LL);
     auto update = xla::Reshape(value, slice_shape.dim_sizes());
 
-    // TODO(phawkins): We don't check the index is in bounds --- there is no
-    // error mechanism in XLA.
-    ctx->SetOutput(
-        0, xla::Tuple(b, {xla::DynamicUpdateSlice(ta, update, start_indices),
+    ctx->SetTensorListOutput(
+        0, xla::Tuple(b, {xla::DynamicUpdateSlice(list, update, start_indices),
                           index + xla::ConstantR0<int32>(b, 1)}));
   }
 
@@ -197,20 +432,17 @@ class TensorListPopBackOp : public XlaOpKernel {
     index = index - xla::ConstantR0<int32>(b, 1);
 
     // start_indices of the DynamicSlice are [index, 0, 0, ..., 0].
-    auto start_indices =
-        xla::Pad(xla::Reshape(index, {1}), xla::ConstantR0<int32>(b, 0),
-                 xla::MakeEdgePaddingConfig({{0, shape.dims() - 1}}));
-
+    std::vector<xla::XlaOp> start_indices(shape.dims(),
+                                          xla::ConstantR0<int32>(b, 0));
+    start_indices[0] = index;
     auto slice_shape = shape.dim_sizes();
     slice_shape[0] = 1LL;
 
-    // TODO(phawkins): We don't check the index is in bounds --- there is no
-    // error mechanism in XLA.
     xla::XlaOp read = xla::DynamicSlice(ta, start_indices, slice_shape);
     // Remove the leading '1' dimension.
     std::vector<int64> value_shape(slice_shape.begin() + 1, slice_shape.end());
 
-    ctx->SetOutput(0, xla::Tuple(b, {ta, index}));
+    ctx->SetTensorListOutput(0, xla::Tuple(b, {ta, index}));
     ctx->SetOutput(1, xla::Reshape(read, value_shape));
   }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/training_ops.cc b/tensorflow/compiler/tf2xla/kernels/training_ops.cc
index 960c1462ceb8c00a2d6c96564f6c985fd1caef0f..ceb762038009f7a3ff80d9ad4066af43d54a9e34 100644
--- a/tensorflow/compiler/tf2xla/kernels/training_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/training_ops.cc
@@ -172,6 +172,65 @@ class ResourceApplyMomentum : public XlaOpKernel {
 REGISTER_XLA_OP(Name("ResourceApplyMomentum").TypeConstraint("T", kFloatTypes),
                 ResourceApplyMomentum);
 
+class ResourceApplyKerasMomentum : public XlaOpKernel {
+ public:
+  explicit ResourceApplyKerasMomentum(OpKernelConstruction* ctx)
+      : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_nesterov", &use_nesterov_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    DataType type = ctx->input_type(2);
+
+    TensorShape var_shape, accum_shape;
+    xla::XlaOp var, accum;
+    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, type, &var_shape, &var));
+    OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, type, &accum_shape, &accum));
+
+    OP_REQUIRES(ctx, var_shape.IsSameSize(accum_shape),
+                errors::InvalidArgument(
+                    "var and accum do not have the same shape",
+                    var_shape.DebugString(), " ", accum_shape.DebugString()));
+
+    TensorShape lr_shape = ctx->InputShape(2);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr_shape),
+                errors::InvalidArgument("lr is not a scalar: ",
+                                        lr_shape.DebugString()));
+
+    TensorShape grad_shape = ctx->InputShape(3);
+    OP_REQUIRES(ctx, var_shape.IsSameSize(grad_shape),
+                errors::InvalidArgument(
+                    "var and grad do not have the same shape",
+                    var_shape.DebugString(), " ", grad_shape.DebugString()));
+
+    TensorShape momentum_shape = ctx->InputShape(4);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(momentum_shape),
+                errors::InvalidArgument("momentum is not a scalar: ",
+                                        momentum_shape.DebugString()));
+
+    xla::XlaOp lr = ctx->Input(2);
+    xla::XlaOp grad = ctx->Input(3);
+    xla::XlaOp momentum = ctx->Input(4);
+
+    accum = accum * momentum - grad * lr;
+    if (use_nesterov_) {
+      // See https://github.com/tensorflow/tensorflow/pull/2798 for an
+      // explanation of the reparameterization used here.
+      var = var + accum * momentum - grad * lr;
+    } else {
+      var = var + accum;
+    }
+    OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, type, var));
+    OP_REQUIRES_OK(ctx, ctx->AssignVariable(1, type, accum));
+  }
+
+ private:
+  bool use_nesterov_;
+};
+REGISTER_XLA_OP(
+    Name("ResourceApplyKerasMomentum").TypeConstraint("T", kFloatTypes),
+    ResourceApplyKerasMomentum);
+
 class ResourceApplyAdagrad : public XlaOpKernel {
  public:
   explicit ResourceApplyAdagrad(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
@@ -797,15 +856,12 @@ class ResourceApplyAdadelta : public XlaOpKernel {
     xla::XlaOp grad = ctx->Input(6);
 
     xla::XlaBuilder* b = ctx->builder();
-    xla::XlaOp neg_half = XlaHelpers::FloatLiteral(b, dtype_, -0.5);
-    xla::XlaOp half = XlaHelpers::FloatLiteral(b, dtype_, 0.5);
     xla::XlaOp one = XlaHelpers::FloatLiteral(b, dtype_, 1.0);
-    xla::XlaOp two = XlaHelpers::FloatLiteral(b, dtype_, 2.0);
 
-    accum = rho * accum + (one - rho) * xla::Pow(grad, two);
-    xla::XlaOp update = xla::Pow(accum_update + epsilon, half) *
-                        xla::Pow(accum + epsilon, neg_half) * grad;
-    accum_update = rho * accum_update + (one - rho) * xla::Pow(update, two);
+    accum = rho * accum + (one - rho) * xla::Square(grad);
+    xla::XlaOp update =
+        xla::Sqrt(accum_update + epsilon) * xla::Rsqrt(accum + epsilon) * grad;
+    accum_update = rho * accum_update + (one - rho) * xla::Square(update);
     var = var - update * lr;
     OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, dtype_, var));
     OP_REQUIRES_OK(ctx, ctx->AssignVariable(1, dtype_, accum));
diff --git a/tensorflow/compiler/tf2xla/kernels/transpose_op.cc b/tensorflow/compiler/tf2xla/kernels/transpose_op.cc
index c9b324a243e4cc3ec64daa3ca0d285336a0d0154..4ac714306248302242902f20d45d2609ef2c7cd3 100644
--- a/tensorflow/compiler/tf2xla/kernels/transpose_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/transpose_op.cc
@@ -19,14 +19,15 @@ limitations under the License.
 // helper.
 
 #include "tensorflow/core/kernels/transpose_op.h"
+#include "tensorflow/compiler/tf2xla/lib/scatter.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 
 namespace tensorflow {
 namespace {
@@ -128,29 +129,46 @@ class InvertPermutationOp : public XlaOpKernel {
                 errors::InvalidArgument("permutation of nonnegative int32s "
                                         "must have <= int32 max elements"));
 
-    std::vector<int64> perm;
-    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(0, &perm));
-
-    int size = perm.size();
+    auto e = ctx->InputExpression(0);
+    auto tensor_or_status = e.ResolveConstant(ctx->compiler()->client());
+    OP_REQUIRES_OK(ctx, tensor_or_status.status());
+    // If the input is a constant, we also want the output to be a constant.
+    // Some models rely on the result of InvertPermutation being a constant.
+    // TODO(b/32495713): Remove this when we can check whether Scatter is
+    // constant. Right now, we always assume it is non-constant because we don't
+    // check the embedded computation.
+    if (tensor_or_status.ValueOrDie().has_value()) {
+      std::vector<int64> perm;
+      OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(0, &perm));
+
+      int size = perm.size();
+
+      std::vector<int32> output(size);
+      std::fill_n(output.data(), size, -1);
+      for (int i = 0; i < size; ++i) {
+        const int64 d = perm[i];
+        OP_REQUIRES(ctx, FastBoundsCheck(d, size),
+                    errors::InvalidArgument(d, " is not between 0 and ", size));
+        OP_REQUIRES(ctx, output[d] == -1,
+                    errors::InvalidArgument(d, " is duplicated in the input."));
+        output[d] = i;
+      }
 
-    std::vector<int32> output(size);
-    std::fill_n(output.data(), size, -1);
-    for (int i = 0; i < size; ++i) {
-      const int64 d = perm[i];
-      OP_REQUIRES(ctx, FastBoundsCheck(d, size),
-                  errors::InvalidArgument(d, " is not between 0 and ", size));
-      OP_REQUIRES(ctx, output[d] == -1,
-                  errors::InvalidArgument(d, " is duplicated in the input."));
-      output[d] = i;
+      ctx->SetOutput(0, xla::ConstantR1<int32>(ctx->builder(), output));
+    } else {
+      auto indices = ctx->Input(0);
+      int size = ctx->InputShape(0).num_elements();
+      auto iota = xla::Iota(ctx->builder(), xla::S32, size);
+      auto result = XlaScatter(iota, iota, indices,
+                               /*indices_are_vectors=*/false, /*combiner=*/{},
+                               ctx->builder());
+      OP_REQUIRES_OK(ctx, result.status());
+      ctx->SetOutput(0, result.ValueOrDie());
     }
-
-    ctx->SetOutput(0, xla::ConstantR1<int32>(ctx->builder(), output));
   }
 };
 
-REGISTER_XLA_OP(Name("InvertPermutation")
-                    .TypeConstraint("T", DT_INT32)
-                    .CompileTimeConstantInput("x"),
+REGISTER_XLA_OP(Name("InvertPermutation").TypeConstraint("T", DT_INT32),
                 InvertPermutationOp);
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
index a0ea6422d732b00fc1b8cf855d9c9ad603b87c82..62b5cd32da59063f8ce07119fd085f91ec3a1bc4 100644
--- a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
@@ -65,11 +65,8 @@ XLAJIT_MAKE_UNARY(Exp, xla::Exp(x));
 XLAJIT_MAKE_UNARY(Expm1, xla::Expm1(x));
 XLAJIT_MAKE_UNARY(Floor, xla::Floor(x));
 XLAJIT_MAKE_UNARY(IsFinite, xla::IsFinite(x));
-XLAJIT_MAKE_UNARY(
-    IsInf,
-    xla::Eq(xla::Abs(x),
-            xla::ScalarLike(x, std::numeric_limits<double>::infinity())));
-XLAJIT_MAKE_UNARY(IsNan, xla::Ne(x, x));
+XLAJIT_MAKE_UNARY(IsInf, xla::IsInf(x));
+XLAJIT_MAKE_UNARY(IsNan, xla::IsNan(x));
 // Return 1/x
 XLAJIT_MAKE_UNARY(Inv, xla::ScalarLike(x, 1.0) / x);
 XLAJIT_MAKE_UNARY(Reciprocal, xla::ScalarLike(x, 1.0) / x);
@@ -92,8 +89,9 @@ xla::XlaOp Sigmoid(xla::XlaOp x) {
 }
 XLAJIT_MAKE_UNARY(Sigmoid, Sigmoid(x));
 
-// Returns 0 if x is 0, -1 if x < 0 and 1 if x > 0.
-XLAJIT_MAKE_UNARY(Sign, xla::Sign(x));
+// Returns 0 if x is NaN, 0 if x is 0, -1 if x < 0 and 1 if x > 0.
+XLAJIT_MAKE_UNARY(Sign,
+                  xla::Select(xla::Ne(x, x), xla::ZerosLike(x), xla::Sign(x)));
 XLAJIT_MAKE_UNARY(Sinh, xla::Sinh(x));
 
 // softplus(x) = log(1 + exp(x))
@@ -116,37 +114,11 @@ XLAJIT_MAKE_UNARY(Tanh, xla::Tanh(x));
 
 XLAJIT_MAKE_UNARY(Real, xla::Real(x));
 XLAJIT_MAKE_UNARY(Imag, xla::Imag(x));
+XLAJIT_MAKE_UNARY(Erf, xla::Erf(x));
+XLAJIT_MAKE_UNARY(Erfc, xla::Erfc(x));
 
 #undef XLAJIT_MAKE_UNARY
 
-// Erf/Erfc.  For x in (-1, 1), the erf approximation is used; erfc polynomial
-// is used outside of this range.
-class ErfOp : public XlaOpKernel {
- public:
-  explicit ErfOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
-  void Compile(XlaOpKernelContext* ctx) override {
-    xla::XlaOp x = ctx->Input(0);
-    xla::XlaOp one = xla::ScalarLike(x, 1.0);
-    auto y =
-        xla::Select(xla::Gt(xla::Abs(x), one), one - xla::Erfc(x), xla::Erf(x));
-    ctx->SetOutput(0, y);
-  }
-};
-REGISTER_XLA_OP(Name("Erf"), ErfOp);
-
-class ErfcOp : public XlaOpKernel {
- public:
-  explicit ErfcOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
-  void Compile(XlaOpKernelContext* ctx) override {
-    xla::XlaOp x = ctx->Input(0);
-    xla::XlaOp one = xla::ScalarLike(x, 1.0);
-    auto y =
-        xla::Select(xla::Lt(xla::Abs(x), one), one - xla::Erf(x), xla::Erfc(x));
-    ctx->SetOutput(0, y);
-  }
-};
-REGISTER_XLA_OP(Name("Erfc"), ErfcOp);
-
 class LgammaOp : public XlaOpKernel {
  public:
   explicit LgammaOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
diff --git a/tensorflow/compiler/tf2xla/kernels/unpack_op.cc b/tensorflow/compiler/tf2xla/kernels/unpack_op.cc
index 8671632976023fded04c26a9780c1a67638b0916..2fc5619de737b8977e4249e4d2297a0303c339ce 100644
--- a/tensorflow/compiler/tf2xla/kernels/unpack_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/unpack_op.cc
@@ -24,12 +24,12 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/variable_ops.cc b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc
index 2c92a585f5679242d672d0402e617ff199b94f17..dfa09b16081e93ba843a1858e68e6ff756de20c1 100644
--- a/tensorflow/compiler/tf2xla/kernels/variable_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc
@@ -291,5 +291,19 @@ class ResourceScatterNdAddOp : public ResourceScatterOp {
 };
 REGISTER_XLA_OP(Name("ResourceScatterNdAdd"), ResourceScatterNdAddOp);
 
+class ResourceScatterNdSubOp : public ResourceScatterOp {
+ public:
+  explicit ResourceScatterNdSubOp(OpKernelConstruction* context)
+      : ResourceScatterOp(context, /*indices_are_vectors=*/true,
+                          /*combiner=*/Combine) {}
+
+ private:
+  static xla::XlaOp Combine(const xla::XlaOp& x, const xla::XlaOp& y,
+                            xla::XlaBuilder* builder) {
+    return xla::Sub(x, y);
+  }
+};
+REGISTER_XLA_OP(Name("ResourceScatterNdSub"), ResourceScatterNdSubOp);
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/while_op.cc b/tensorflow/compiler/tf2xla/kernels/while_op.cc
index ce007fc04a818869686b9936a1607cee42665e87..f49da9683b3622bdda708cc305306baafa1639df 100644
--- a/tensorflow/compiler/tf2xla/kernels/while_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/while_op.cc
@@ -41,8 +41,7 @@ Status MakeXlaCompilerArgumentsFromInputs(
   *has_uninitialized_vars = false;
   *has_tensor_arrays = false;
   for (int i = 0; i < ctx->num_inputs(); ++i) {
-    VLOG(2) << " Input " << i
-            << " type: " << DataTypeString(ctx->input_type(i))
+    VLOG(2) << " Input " << i << " type: " << DataTypeString(ctx->input_type(i))
             << " shape: " << ctx->InputShape(i).DebugString();
     XlaCompiler::Argument& arg = (*args)[i];
     DataType type = ctx->input_type(i);
@@ -71,13 +70,20 @@ Status MakeXlaCompilerArgumentsFromInputs(
       arg.name = resource->name();
       VLOG(2) << "    resource " << resource->name()
               << " type: " << DataTypeString(arg.type)
-              << " shape: " << arg.shape.DebugString()
+              << " shape: " << arg.ShapeHumanString()
               << " initialized: " << arg.initialized;
 
     } else {
       arg.kind = XlaCompiler::Argument::kParameter;
       arg.type = ctx->input_type(i);
-      arg.shape = ctx->InputShape(i);
+
+      xla::XlaBuilder* builder = ctx->builder();
+      xla::XlaOp handle = ctx->Input(i);
+      auto shape_or_status = builder->GetShape(handle);
+      if (!shape_or_status.ok()) {
+        return shape_or_status.status();
+      }
+      arg.shape = shape_or_status.ValueOrDie();
     }
   }
   return Status::OK();
@@ -207,12 +213,12 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) {
   OP_REQUIRES(ctx, body.xla_input_shapes.size() == 1,
               errors::FailedPrecondition("Expected one input shape"));
   xla::Shape body_input_shape = body.xla_input_shapes[0];
-  OP_REQUIRES(ctx, xla::ShapeUtil::IsTuple(body_input_shape),
+  OP_REQUIRES(ctx, body_input_shape.IsTuple(),
               errors::FailedPrecondition("Expected tuple shape"));
   OP_REQUIRES(ctx, cond.xla_input_shapes.size() == 1,
               errors::FailedPrecondition("Expected one input shape"));
   xla::Shape cond_input_shape = cond.xla_input_shapes[0];
-  OP_REQUIRES(ctx, xla::ShapeUtil::IsTuple(cond_input_shape),
+  OP_REQUIRES(ctx, cond_input_shape.IsTuple(),
               errors::FailedPrecondition("Expected tuple shape"));
 
   VLOG(2) << "Body shape: " << xla::ShapeUtil::HumanString(body_input_shape)
@@ -233,13 +239,22 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) {
           xla::ShapeUtil::HumanString(body_input_shape), " vs. ",
           xla::ShapeUtil::HumanString(body.xla_output_shape)));
 
-  xla::Shape expected_cond_output_shape = xla::ShapeUtil::MakeTupleShape(
-      {xla::ShapeUtil::MakeShape(xla::PRED, {})});
+  xla::Shape expected_cond_output_shape_without_side_effect =
+      xla::ShapeUtil::MakeTupleShape(
+          {xla::ShapeUtil::MakeShape(xla::PRED, {})});
+  xla::Shape expected_cond_output_shape_with_side_effect =
+      xla::ShapeUtil::MakeTupleShape({xla::ShapeUtil::MakeShape(xla::PRED, {}),
+                                      xla::ShapeUtil::MakeTokenShape()});
   OP_REQUIRES(ctx,
-              xla::ShapeUtil::Compatible(cond.xla_output_shape,
-                                         expected_cond_output_shape),
+              xla::ShapeUtil::Compatible(
+                  cond.xla_output_shape,
+                  expected_cond_output_shape_without_side_effect) ||
+                  xla::ShapeUtil::Compatible(
+                      cond.xla_output_shape,
+                      expected_cond_output_shape_with_side_effect),
               errors::InvalidArgument(
-                  "Output shape of loop condition should be (pred[]), got: ",
+                  "Output shape of loop condition should be (pred[]) or "
+                  "(pred[], token[]), got: ",
                   xla::ShapeUtil::HumanString(cond.xla_output_shape)));
 
   int num_inputs = body.input_mapping.size();
@@ -283,11 +298,15 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) {
 
   xla::XlaOp while_result = xla::While(cond_wrapper, *body.computation, init);
 
-  // Sets non-variable outputs.
+  // Sets non-variable outputs and determine when resource variables start.
+  int resource_index = 0;
   for (int i = 0; i < ctx->num_outputs(); ++i) {
     if (ctx->input_type(i) != DT_RESOURCE) {
       ctx->SetOutput(body.input_mapping[i],
                      xla::GetTupleElement(while_result, i));
+      ++resource_index;
+    } else {
+      break;
     }
   }
   if (has_token_input_output_) {
@@ -296,7 +315,7 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) {
         xla::GetTupleElement(while_result, ctx->num_outputs());
     auto shape_or = builder->GetShape(token_output);
     OP_REQUIRES_OK(ctx, shape_or.status());
-    OP_REQUIRES(ctx, xla::ShapeUtil::IsToken(shape_or.ValueOrDie()),
+    OP_REQUIRES(ctx, shape_or.ValueOrDie().IsToken(),
                 errors::FailedPrecondition(
                     "Token output is not token type: ",
                     xla::ShapeUtil::HumanString(shape_or.ValueOrDie())));
@@ -309,7 +328,7 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) {
     XlaResource* resource;
     OP_REQUIRES_OK(ctx, ctx->GetResourceInput(update.input_index, &resource));
     if (update.modified) {
-      int pos = body.outputs.size() + i;
+      int pos = resource_index + i;
       OP_REQUIRES_OK(ctx,
                      resource->SetFromPack(
                          arguments[update.input_index].tensor_array_gradients,
@@ -329,8 +348,11 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) {
   VLOG(1) << "Done building while loop";
 }
 
-REGISTER_XLA_OP(Name("While").AllowResourceTypes(), XlaWhileOp);
-REGISTER_XLA_OP(Name("StatelessWhile").AllowResourceTypes(), XlaWhileOp);
-REGISTER_XLA_OP(Name("XlaWhile").AllowResourceTypes(), XlaWhileOp);
+REGISTER_XLA_OP(Name("While").AllowResourceTypes().AllowVariantTypes(),
+                XlaWhileOp);
+REGISTER_XLA_OP(Name("StatelessWhile").AllowResourceTypes().AllowVariantTypes(),
+                XlaWhileOp);
+REGISTER_XLA_OP(Name("XlaWhile").AllowResourceTypes().AllowVariantTypes(),
+                XlaWhileOp);
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/xla_conv_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_conv_op.cc
index 4612f19971a3ce6994aef303f751748b77ccda9a..b20adc592a0d3d2129c897218ddbfc891b4cd40a 100644
--- a/tensorflow/compiler/tf2xla/kernels/xla_conv_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/xla_conv_op.cc
@@ -78,7 +78,7 @@ class XlaConvOp : public XlaOpKernel {
     xla::XlaOp output = xla::ConvGeneralDilated(
         context->Input(0), context->Input(1), window_strides, padding,
         lhs_dilation, rhs_dilation, dnums_, feature_group_count,
-        &precision_config_);
+        /*batch_group_count=*/1, &precision_config_);
     context->SetOutput(0, output);
   }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/xla_dequantize_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_dequantize_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a30b4861f6b3a964c0c874a3affab7d6198264d7
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/xla_dequantize_op.cc
@@ -0,0 +1,60 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/xla_compiler.h"
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/lib/quantize.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+namespace {
+
+class XlaDequantizeOp : public XlaOpKernel {
+ public:
+  explicit XlaDequantizeOp(OpKernelConstruction* context)
+      : XlaOpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("min_range", &min_range_));
+    OP_REQUIRES_OK(context, context->GetAttr("max_range", &max_range_));
+    OP_REQUIRES_OK(context, context->GetAttr("mode", &mode_));
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("transpose_output", &transpose_output_));
+  }
+
+  void Compile(XlaOpKernelContext* context) override {
+    const xla::XlaOp& input = context->Input(0);
+
+    xla::QuantizedRange range(min_range_, max_range_);
+
+    xla::XlaOp output =
+        xla::Dequantize<uint8>(input, range, mode_, transpose_output_);
+    context->SetOutput(0, output);
+  }
+
+ private:
+  float min_range_;
+  float max_range_;
+  bool transpose_output_;
+  string mode_;
+  TF_DISALLOW_COPY_AND_ASSIGN(XlaDequantizeOp);
+};
+
+REGISTER_XLA_OP(Name("XlaDequantize"), XlaDequantizeOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/xla_self_adjoint_eig_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_self_adjoint_eig_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..233ac8e7b455403f8ee65b95b1403ecefdb92c6b
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/xla_self_adjoint_eig_op.cc
@@ -0,0 +1,66 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/lib/self_adjoint_eig.h"
+#include "tensorflow/core/lib/core/bits.h"
+
+namespace tensorflow {
+namespace {
+
+class XlaSelfAdjointEigOp : public XlaOpKernel {
+ public:
+  explicit XlaSelfAdjointEigOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("lower", &lower_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("max_iter", &max_iter_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("epsilon", &epsilon_));
+  }
+  void Compile(XlaOpKernelContext* ctx) override {
+    auto result =
+        xla::SelfAdjointEig(ctx->Input(0), lower_, max_iter_, epsilon_);
+    ctx->SetOutput(0, result.w);
+    ctx->SetOutput(1, result.v);
+  }
+
+ private:
+  bool lower_;
+  int32 max_iter_;
+  float epsilon_;
+};
+
+class SelfAdjointEigV2Op : public XlaOpKernel {
+ public:
+  explicit SelfAdjointEigV2Op(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
+  void Compile(XlaOpKernelContext* ctx) override {
+    const TensorShape input_shape = ctx->InputShape("input");
+    int n = input_shape.dim_size(input_shape.dims() - 1);
+    // This is based on heuristics that approx log(n) sweep updates are needed.
+    // Note: the heuristics provides no theoretical guarantee, max_iter=100 and
+    // epsilon should be used to determine exit condition.
+    int max_iter = 2 * tensorflow::Log2Ceiling(n);
+    auto result = xla::SelfAdjointEig(ctx->Input(0), true, max_iter, 1e-6);
+    ctx->SetOutput(0, result.w);
+    ctx->SetOutput(1, result.v);
+  }
+};
+
+REGISTER_XLA_OP(Name("XlaSelfAdjointEig").TypeConstraint("T", kFloatTypes),
+                XlaSelfAdjointEigOp);
+REGISTER_XLA_OP(Name("SelfAdjointEigV2").TypeConstraint("T", kFloatTypes),
+                SelfAdjointEigV2Op);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/lib/BUILD b/tensorflow/compiler/tf2xla/lib/BUILD
index 3e7a761120317ff85947559b7b2e52be9232afb7..3d7b0bc959f9dbf3c1b9749379e2ea0d285b302b 100644
--- a/tensorflow/compiler/tf2xla/lib/BUILD
+++ b/tensorflow/compiler/tf2xla/lib/BUILD
@@ -15,8 +15,6 @@ filegroup(
     ]),
 )
 
-load("//tensorflow/compiler/xla/tests:build_defs.bzl", "xla_test")
-
 cc_library(
     name = "broadcast",
     srcs = ["broadcast.cc"],
@@ -33,27 +31,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "cholesky",
-    srcs = ["cholesky.cc"],
-    hdrs = ["cholesky.h"],
-    deps = [
-        ":util",
-        ":while_loop",
-        "//tensorflow/compiler/xla:literal",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:status_macros",
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/compiler/xla/client/lib:constants",
-        "//tensorflow/compiler/xla/client/lib:matrix",
-        "//tensorflow/compiler/xla/client/lib:slicing",
-        "//tensorflow/compiler/xla/client/lib:triangular_solve",
-        "//tensorflow/core:lib",
-    ],
-)
-
 cc_library(
     name = "random",
     srcs = ["random.cc"],
@@ -69,35 +46,12 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "qr",
-    srcs = ["qr.cc"],
-    hdrs = ["qr.h"],
-    deps = [
-        ":util",
-        ":while_loop",
-        "//tensorflow/compiler/xla:literal_util",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:status_macros",
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/compiler/xla/client/lib:arithmetic",
-        "//tensorflow/compiler/xla/client/lib:constants",
-        "//tensorflow/compiler/xla/client/lib:math",
-        "//tensorflow/compiler/xla/client/lib:matrix",
-        "//tensorflow/compiler/xla/client/lib:slicing",
-        "//tensorflow/core:lib",
-    ],
-)
-
 cc_library(
     name = "scatter",
     srcs = ["scatter.cc"],
     hdrs = ["scatter.h"],
     deps = [
         ":util",
-        ":while_loop",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -128,19 +82,3 @@ cc_library(
         "@com_google_absl//absl/types:span",
     ],
 )
-
-cc_library(
-    name = "while_loop",
-    srcs = ["while_loop.cc"],
-    hdrs = ["while_loop.h"],
-    deps = [
-        ":util",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:status_macros",
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/compiler/xla/client:xla_computation",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/types:span",
-    ],
-)
diff --git a/tensorflow/compiler/tf2xla/lib/scatter.cc b/tensorflow/compiler/tf2xla/lib/scatter.cc
index 2b1c2ced925d9fee7392986015a6e716a94d356f..1cd5a79171dccd57fc1b7941cdf16417301ff7f8 100644
--- a/tensorflow/compiler/tf2xla/lib/scatter.cc
+++ b/tensorflow/compiler/tf2xla/lib/scatter.cc
@@ -20,7 +20,6 @@ limitations under the License.
 
 #include "absl/types/span.h"
 #include "tensorflow/compiler/tf2xla/lib/util.h"
-#include "tensorflow/compiler/tf2xla/lib/while_loop.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
@@ -49,7 +48,7 @@ xla::StatusOr<xla::XlaOp> XlaScatter(
   if (indices_are_vectors) {
     TF_RET_CHECK(!indices_dims.empty());
     num_index_dims = indices_dims.back();
-    if (num_index_dims > xla::ShapeUtil::Rank(buffer_shape)) {
+    if (num_index_dims > buffer_shape.rank()) {
       return errors::InvalidArgument(
           "The size of the minor dimension of the indices (shape: ",
           xla::ShapeUtil::HumanString(indices_shape),
@@ -141,8 +140,8 @@ xla::StatusOr<xla::XlaOp> XlaScatter(
                                        ? indices_shape.dimensions_size() - 1
                                        : indices_shape.dimensions_size());
 
-  int64 updates_rank = xla::ShapeUtil::Rank(updates_shape);
-  int64 buffer_rank = xla::ShapeUtil::Rank(buffer_shape);
+  int64 updates_rank = updates_shape.rank();
+  int64 buffer_rank = buffer_shape.rank();
   int64 num_window_dims_in_updates = buffer_rank - num_index_dims;
 
   // If the rank of `updates` is 0 and does not match the expected rank of
@@ -157,7 +156,7 @@ xla::StatusOr<xla::XlaOp> XlaScatter(
   if (updates_rank == 0 && expected_updates_rank != 0) {
     new_updates = xla::Broadcast(updates, expected_updates_dims);
     TF_ASSIGN_OR_RETURN(updates_shape, builder->GetShape(new_updates));
-    updates_rank = xla::ShapeUtil::Rank(updates_shape);
+    updates_rank = updates_shape.rank();
   }
 
   if (updates_rank > 0) {
diff --git a/tensorflow/compiler/tf2xla/lib/util.cc b/tensorflow/compiler/tf2xla/lib/util.cc
index c0bd172d17c192435ba8ee196f9def0491c0bf5c..06eda41611861060a1f1c4d028b96405d288efdb 100644
--- a/tensorflow/compiler/tf2xla/lib/util.cc
+++ b/tensorflow/compiler/tf2xla/lib/util.cc
@@ -54,6 +54,9 @@ xla::XlaOp FloatLiteral(xla::XlaBuilder* builder, xla::PrimitiveType type,
     case xla::C64:
       return xla::ConstantR0<xla::complex64>(builder, value);
       break;
+    case xla::C128:
+      return xla::ConstantR0<xla::complex128>(builder, value);
+      break;
     default:
       LOG(FATAL) << "unhandled element type " << type;
   }
@@ -90,6 +93,9 @@ xla::XlaOp IntegerLiteral(xla::XlaBuilder* builder, xla::PrimitiveType type,
     case xla::C64:
       literal = xla::LiteralUtil::CreateR0<complex64>(value);
       break;
+    case xla::C128:
+      literal = xla::LiteralUtil::CreateR0<complex128>(value);
+      break;
     case xla::PRED:
       LOG(FATAL) << "pred element type is not integral";
     case xla::S16:
diff --git a/tensorflow/compiler/tf2xla/literal_util.cc b/tensorflow/compiler/tf2xla/literal_util.cc
index 67d08290033361f16dfff42b06af9b253e84963a..749a7c3054a65d6ec9f9dc13f6f4a713ac9d3d5a 100644
--- a/tensorflow/compiler/tf2xla/literal_util.cc
+++ b/tensorflow/compiler/tf2xla/literal_util.cc
@@ -77,7 +77,7 @@ Status HostTensorsToBorrowingLiteralTuple(absl::Span<const Tensor> host_tensors,
 
 Status CopyLiteralToHostTensor(const xla::LiteralSlice& literal,
                                Tensor* host_tensor) {
-  TF_RET_CHECK(xla::ShapeUtil::IsArray(literal.shape()) &&
+  TF_RET_CHECK(literal.shape().IsArray() &&
                xla::ShapeUtil::ElementsIn(literal.shape()) ==
                    host_tensor->NumElements());
   xla::PrimitiveType primitive_type;
diff --git a/tensorflow/compiler/tf2xla/literal_util_test.cc b/tensorflow/compiler/tf2xla/literal_util_test.cc
index 15f4c38da29507da9e092c1d5725b5f95a81d1b9..44bccfe6474d175beda392ca17dfbcb08c0b1b11 100644
--- a/tensorflow/compiler/tf2xla/literal_util_test.cc
+++ b/tensorflow/compiler/tf2xla/literal_util_test.cc
@@ -49,7 +49,7 @@ using Types =
                      std::pair<int16, qint16>, std::pair<uint16, quint16>,
                      std::pair<int32, qint32>>;
 
-TYPED_TEST_CASE(LiteralUtilTest, Types);
+TYPED_TEST_SUITE(LiteralUtilTest, Types);
 
 TYPED_TEST(LiteralUtilTest, LiteralToQuantizedHostTensor) {
   using int_type = typename TypeParam::first_type;
diff --git a/tensorflow/compiler/tf2xla/ops/BUILD b/tensorflow/compiler/tf2xla/ops/BUILD
index 4dce0a2102cf9c782850ccc7af4f14b59bd51e53..7140b6a1227a53290c3747892a55886a7f48513b 100644
--- a/tensorflow/compiler/tf2xla/ops/BUILD
+++ b/tensorflow/compiler/tf2xla/ops/BUILD
@@ -4,7 +4,11 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
-load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_custom_op_library",
+    "tf_gen_op_wrapper_py",
+)
 
 cc_library(
     name = "xla_ops",
@@ -24,3 +28,14 @@ tf_gen_op_wrapper_py(
         ":xla_ops",
     ],
 )
+
+tf_custom_op_library(
+    name = "_xla_ops.so",
+    srcs = [
+        "xla_ops.cc",
+    ],
+    deps = [
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/algorithm:container",
+    ],
+)
diff --git a/tensorflow/compiler/tf2xla/ops/xla_ops.cc b/tensorflow/compiler/tf2xla/ops/xla_ops.cc
index bd2c0a5ee88869ba60701c0a7ace05857452eed9..ccd58071d350e605e0e1f0c2b43643a400e32c2c 100644
--- a/tensorflow/compiler/tf2xla/ops/xla_ops.cc
+++ b/tensorflow/compiler/tf2xla/ops/xla_ops.cc
@@ -56,6 +56,41 @@ lhs_output: the broadcasted LHS tensor
 rhs_output: the broadcasted RHS tensor
 )doc");
 
+REGISTER_OP("XlaSelfAdjointEig")
+    .Input("a: T")
+    .Attr("lower: bool")
+    .Attr("max_iter: int")
+    .Attr("epsilon: float")
+    .Output("w: T")
+    .Output("v: T")
+    .SetShapeFn(shape_inference::UnknownShape)
+    .Attr("T: numbertype")
+    .Doc(R"doc(
+Computes the eigen decomposition of a batch of self-adjoint matrices
+(Note: Only real inputs are supported).
+
+Computes the eigenvalues and eigenvectors of the innermost N-by-N matrices in
+tensor such that tensor[...,:,:] * v[..., :,i] = e[..., i] * v[...,:,i], for
+i=0...N-1.
+
+a: the input tensor.
+
+lower: a boolean specifies whether the calculation is done with the lower
+  triangular part or the upper triangular part.
+
+max_iter: maximum number of sweep update, i.e., the whole lower triangular
+  part or upper triangular part based on parameter lower. Heuristically, it has
+  been argued that approximatly logN sweeps are needed in practice (Ref: Golub &
+  van Loan "Matrix Computation").
+
+epsilon: the tolerance ratio.
+
+w: The eigenvalues in ascending order, each repeated according to its
+  multiplicity.
+v: The column v[..., :, i] is the normalized eigenvector corresponding to the
+  eigenvalue w[..., i].
+)doc");
+
 REGISTER_OP("XlaConv")
     .Input("lhs: T")
     .Input("rhs: T")
@@ -369,7 +404,11 @@ REGISTER_OP("XlaKeyValueSort")
     .Output("sorted_values: V")
     .Attr("K: realnumbertype")
     .Attr("V: type")
-    .SetShapeFn(shape_inference::UnchangedShape)
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      c->set_output(0, c->input(0));
+      c->set_output(1, c->input(1));
+      return Status::OK();
+    })
     .Doc(R"doc(
 Wraps the XLA Sort operator, documented at
  https://www.tensorflow.org/performance/xla/operation_semantics#sort
@@ -409,5 +448,29 @@ body: A function that takes a list of tensors and returns another
       list of tensors. Both lists have the same types as specified by T.
 )doc");
 
+REGISTER_OP("XlaDequantize")
+    .Input("input: uint32")
+    .Output("output: bfloat16")
+    .Attr("min_range: float")
+    .Attr("max_range: float")
+    .Attr("mode: string")
+    .Attr("transpose_output: bool")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::UnknownShape)
+    .Doc(R"doc(
+Takes the packed uint32 input and unpacks the input to uint8 to do
+Dequantization on deivce.
+
+input: Input tensors whose types is uint32, shape is [d0, ..., dn].
+output: Output tensors whose types is bloat16. If transpose_output is true,
+     output shape is [dn * 4, dn-1, ..., d1, d0]. If transpose_output
+     is false, output shape is [d0,..., dn * 4].
+min_range: The minimum scalar value possibly produced for the input.
+max_range: The maximum scalar value possibly produced for the input.
+mode: String to determine the dequantize mode in {"MIN_COMBINED", "MIN_FIRST", "SCALED"}.
+transpose_output: Boolean to determine if output is transposed. transpose_output
+     is faster when input is large and rank of input is higher than 1.
+)doc");
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/python/BUILD b/tensorflow/compiler/tf2xla/python/BUILD
index fef97b98c376d9df8bbfd9cb6651216895e46bf4..9abdb04d7736e8ff5225688af4759a522d3e7fc7 100644
--- a/tensorflow/compiler/tf2xla/python/BUILD
+++ b/tensorflow/compiler/tf2xla/python/BUILD
@@ -15,6 +15,7 @@ load(
     "//tensorflow/core:platform/default/build_config.bzl",
     "tf_py_clif_cc",
 )
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
 
 tf_py_clif_cc(
     name = "xla_op_registry",
@@ -27,9 +28,13 @@ tf_py_clif_cc(
     ],
 )
 
-py_library(
+tf_custom_op_py_library(
     name = "xla",
     srcs = ["xla.py"],
+    dso = ["//tensorflow/compiler/tf2xla/ops:_xla_ops.so"],
+    kernels = [
+        "//tensorflow/compiler/tf2xla/ops:xla_ops",
+    ],
     deps = [
         "//tensorflow/compiler/tf2xla/ops:gen_xla_ops",
         "//tensorflow/compiler/xla:xla_data_proto_py",
diff --git a/tensorflow/compiler/tf2xla/python/xla.py b/tensorflow/compiler/tf2xla/python/xla.py
index 147e562658bbfc445f99268812e2c3ae1ee61e30..de4710d03a3e69afb04aa68e37961698f0e3a300 100644
--- a/tensorflow/compiler/tf2xla/python/xla.py
+++ b/tensorflow/compiler/tf2xla/python/xla.py
@@ -291,6 +291,10 @@ def dot_general(lhs, rhs, dimension_numbers, precision_config=None, name=None):
       name=name)
 
 
+def self_adjoint_eig(a, lower, max_iter, epsilon):
+  return gen_xla_ops.xla_self_adjoint_eig(a, lower, max_iter, epsilon)
+
+
 dynamic_slice = gen_xla_ops.xla_dynamic_slice
 dynamic_update_slice = gen_xla_ops.xla_dynamic_update_slice
 
@@ -386,3 +390,4 @@ def slice(x, start_dims, limit_dims, strides):
 sort = gen_xla_ops.xla_sort
 key_value_sort = gen_xla_ops.xla_key_value_sort
 while_loop = gen_xla_ops.xla_while
+dequantize = gen_xla_ops.xla_dequantize
diff --git a/tensorflow/compiler/tf2xla/resource_operation_table.cc b/tensorflow/compiler/tf2xla/resource_operation_table.cc
index 72b240996fb4d9dcb5f5dfd919da618cbae08c16..c20d6a5fd1f3bd7dad30cb3359d13ed4609a2250 100644
--- a/tensorflow/compiler/tf2xla/resource_operation_table.cc
+++ b/tensorflow/compiler/tf2xla/resource_operation_table.cc
@@ -65,6 +65,7 @@ CreateResourceOpInfoMap() {
   add("ResourceApplyFtrlV2"                  , kReadWrite, kVariable);
   add("ResourceApplyGradientDescent"         , kReadWrite, kVariable);
   add("ResourceApplyMomentum"                , kReadWrite, kVariable);
+  add("ResourceApplyKerasMomentum"           , kReadWrite, kVariable);
   add("ResourceApplyPowerSign"               , kReadWrite, kVariable);
   add("ResourceApplyProximalAdagrad"         , kReadWrite, kVariable);
   add("ResourceApplyProximalGradientDescent" , kReadWrite, kVariable);
@@ -76,6 +77,7 @@ CreateResourceOpInfoMap() {
   add("ResourceScatterMin"                   , kReadWrite, kVariable);
   add("ResourceScatterMul"                   , kReadWrite, kVariable);
   add("ResourceScatterNdAdd"                 , kReadWrite, kVariable);
+  add("ResourceScatterNdSub"                 , kReadWrite, kVariable);
   add("ResourceScatterNdUpdate"              , kReadWrite, kVariable);
   add("ResourceScatterSub"                   , kReadWrite, kVariable);
   add("ResourceScatterUpdate"                , kReadWrite, kVariable);
diff --git a/tensorflow/compiler/tf2xla/shape_util.cc b/tensorflow/compiler/tf2xla/shape_util.cc
index b589512dcdfa32050281120aba6a5ae89a980c2f..8997b2f5c68da480e9d4cb1f7ff8776690363392 100644
--- a/tensorflow/compiler/tf2xla/shape_util.cc
+++ b/tensorflow/compiler/tf2xla/shape_util.cc
@@ -18,21 +18,81 @@ limitations under the License.
 #include <numeric>
 
 #include "tensorflow/compiler/tf2xla/type_util.h"
+#include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
+namespace {
+
+Status PopulateInfeedLayoutVector(const xla::Shape& shape,
+                                  std::vector<int>* layouts) {
+  if (shape.IsTuple()) {
+    int64 tuple_elements = xla::ShapeUtil::TupleElementCount(shape);
+    for (int64 i = 0; i < tuple_elements; ++i) {
+      const xla::Shape& subshape =
+          xla::ShapeUtil::GetTupleElementShape(shape, i);
+      TF_RETURN_IF_ERROR(PopulateInfeedLayoutVector(subshape, layouts));
+    }
+  } else if (xla::LayoutUtil::HasLayout(shape)) {
+    for (auto dim : xla::LayoutUtil::MinorToMajor(shape)) {
+      layouts->push_back(dim);
+    }
+  } else {
+    layouts->insert(layouts->end(), shape.rank(), -1);
+  }
+  return Status::OK();
+}
+
+// Populate the output layout unless the minor_to_major array contains all -1
+// value, in which case the layout is considered missing and the API returns
+// false.
+xla::StatusOr<bool> MakeLayout(absl::Span<const int64> minor_to_major,
+                               xla::Layout* layout) {
+  if (std::all_of(minor_to_major.begin(), minor_to_major.end(),
+                  [](int64 dim) { return dim == -1; })) {
+    return false;
+  }
+  std::vector<bool> dim_present(minor_to_major.size(), false);
+  for (auto dim : minor_to_major) {
+    if (dim < 0 || dim >= minor_to_major.size()) {
+      return errors::InvalidArgument("Layout dimension out of range: dim=", dim,
+                                     " rank=", minor_to_major.size());
+    }
+    if (dim_present[dim]) {
+      return errors::InvalidArgument("Repeated layout dimension: dim=", dim);
+    }
+    dim_present[dim] = true;
+  }
+  *layout = xla::LayoutUtil::MakeLayout(minor_to_major);
+  return true;
+}
+
+Status AssignLayout(
+    absl::Span<const int64> minor_to_major,
+    const std::function<xla::Layout(const xla::Shape&)>& layout_func,
+    xla::Shape* shape) {
+  xla::Layout layout;
+  TF_ASSIGN_OR_RETURN(bool has_layout, MakeLayout(minor_to_major, &layout));
+  if (!has_layout && layout_func) {
+    layout = layout_func(*shape);
+  }
+  *shape->mutable_layout() = layout;
+  return Status::OK();
+}
+
+}  // namespace
 
 // Convert an XLA Shape into the equivalent TensorFlow shape.
 Status XLAShapeToTensorShape(const xla::Shape& shape,
                              TensorShape* tensor_shape) {
-  if (xla::ShapeUtil::IsTuple(shape)) {
+  if (shape.IsTuple()) {
     return errors::InvalidArgument("XLA shape ",
                                    xla::ShapeUtil::HumanString(shape),
                                    " cannot be converted to a TensorShape");
   }
   *tensor_shape = TensorShape();
-  for (int i = 0; i < xla::ShapeUtil::Rank(shape); ++i) {
+  for (int i = 0; i < shape.rank(); ++i) {
     tensor_shape->AddDim(shape.dimensions(i));
   }
   return Status::OK();
@@ -61,4 +121,64 @@ xla::Shape TensorShapeToXLAShape(xla::PrimitiveType type,
   return xla::ShapeUtil::MakeShapeWithLayout(type, dimensions, layout);
 }
 
+xla::StatusOr<std::vector<int>> GetShapeLayoutVector(const xla::Shape& shape) {
+  std::vector<int> layouts;
+  TF_RETURN_IF_ERROR(PopulateInfeedLayoutVector(shape, &layouts));
+  return layouts;
+}
+
+Status GetShapeWithLayout(
+    const xla::Shape& input_shape, absl::Span<const int64> minor_to_major,
+    const std::function<xla::Layout(const xla::Shape&)>& layout_func,
+    xla::Shape* output_shape) {
+  if (input_shape.IsTuple()) {
+    int64 tuple_elements = xla::ShapeUtil::TupleElementCount(input_shape);
+    std::vector<xla::Shape> shapes;
+    shapes.reserve(tuple_elements);
+    size_t position = 0;
+    for (int64 i = 0; i < tuple_elements; ++i) {
+      const xla::Shape& shape =
+          xla::ShapeUtil::GetTupleElementShape(input_shape, i);
+      if (shape.IsTuple()) {
+        return errors::InvalidArgument(
+            "Nested tuples not supported: ",
+            xla::ShapeUtil::HumanString(input_shape));
+      }
+      int64 rank = shape.rank();
+      if (position + rank > minor_to_major.size()) {
+        return errors::InvalidArgument(
+            "Not enough layout attribute elements: position=", position,
+            " rank=", rank, " elements=", minor_to_major.size());
+      }
+      shapes.push_back(shape);
+      TF_RETURN_IF_ERROR(AssignLayout(
+          absl::Span<const int64>(minor_to_major).subspan(position, rank),
+          layout_func, &shapes.back()));
+      position += rank;
+
+      VLOG(4) << "Shape[" << i
+              << "] = " << xla::ShapeUtil::HumanStringWithLayout(shapes.back());
+    }
+    if (position != minor_to_major.size()) {
+      return errors::InvalidArgument(
+          "Too many elements passed in the layout attribute: position=",
+          position, " size=", minor_to_major.size());
+    }
+    *output_shape = xla::ShapeUtil::MakeTupleShape(shapes);
+  } else {
+    int64 rank = input_shape.rank();
+    if (rank != minor_to_major.size()) {
+      return errors::InvalidArgument(
+          "Wrong number of layout attribute elements: rank=", rank,
+          " elements=", minor_to_major.size());
+    }
+    *output_shape = input_shape;
+    TF_RETURN_IF_ERROR(AssignLayout(minor_to_major, layout_func, output_shape));
+
+    VLOG(4) << "Shape[] = "
+            << xla::ShapeUtil::HumanStringWithLayout(*output_shape);
+  }
+  return Status::OK();
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/shape_util.h b/tensorflow/compiler/tf2xla/shape_util.h
index 0b231ea8e7a2d8e303e91911e2e0a36fc83e78b4..e775c4462c3dc15cf4b8d9e8d8e7d9a61e024cd0 100644
--- a/tensorflow/compiler/tf2xla/shape_util.h
+++ b/tensorflow/compiler/tf2xla/shape_util.h
@@ -18,7 +18,10 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_TF2XLA_SHAPE_UTIL_H_
 #define TENSORFLOW_COMPILER_TF2XLA_SHAPE_UTIL_H_
 
+#include <vector>
+
 #include "tensorflow/compiler/xla/shape.h"
+#include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.pb.h"
@@ -41,6 +44,25 @@ Status TensorShapeToXLAShape(DataType dtype, const TensorShape& tensor_shape,
 xla::Shape TensorShapeToXLAShape(xla::PrimitiveType type,
                                  const TensorShape& tensor_shape);
 
+// Given an XLA shape with layouts, builds a layout vector in the form able to
+// be fed to ops like InfeedEnqueue/InfeedEnqueueTuple/XRTAllocateV2/....
+// THe returned vector is a linearized sequence of the minor-to-major values of
+// the layouts held within the input shape.
+// In case the input shape is a tuple, the minor-to-major values will be in the
+// order of the tuple elements within the tuple shape.
+// If a shape (or a subshape of a tuple shape) has missing layout, a rank long
+// sequence of -1 values will be emittted.
+xla::StatusOr<std::vector<int>> GetShapeLayoutVector(const xla::Shape& shape);
+
+// Given the input shape and a linearized sequence of the minor-to-major values
+// of the layouts, create the output shape by rewriting the input shape layouts.
+// If a layout is missing (has -1 values) for a matching tuple subshape, the
+// layout_func will be called, if not nullptr.
+Status GetShapeWithLayout(
+    const xla::Shape& input_shape, absl::Span<const int64> minor_to_major,
+    const std::function<xla::Layout(const xla::Shape&)>& layout_func,
+    xla::Shape* output_shape);
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_COMPILER_TF2XLA_SHAPE_UTIL_H_
diff --git a/tensorflow/compiler/tf2xla/side_effect_util.cc b/tensorflow/compiler/tf2xla/side_effect_util.cc
index b233e6b2c28e1968bb74901fc684e808ae45ab60..412f31adbb7df52b2d6933be054cc6d40947dc44 100644
--- a/tensorflow/compiler/tf2xla/side_effect_util.cc
+++ b/tensorflow/compiler/tf2xla/side_effect_util.cc
@@ -24,6 +24,51 @@ const char kXlaTokenInputNodesAttrName[] = "_xla_token_input_nodes";
 
 const char kXlaTokenArgNodeName[] = "_xla_token_arg_node";
 
+const char kXlaHasHostTransferAttrName[] = "_xla_has_host_transfer";
+
+Status SetDeviceOrdinalAttributeForNode(Node* node, int device_ordinal) {
+  if (!HasNodeAttr(node->def(), kXlaHasHostTransferAttrName)) {
+    return errors::InvalidArgument("Node ", node->DebugString(),
+                                   " does not have attribute ",
+                                   kXlaHasHostTransferAttrName);
+  }
+
+  if (node->type_string() == "_XlaRecvAtHost" ||
+      node->type_string() == "_XlaSendFromHost") {
+    node->ClearAttr("device_ordinal");
+    node->AddAttr("device_ordinal", device_ordinal);
+  } else if (node->type_string() == "If") {
+    AttrValue device_ordinal_value;
+    device_ordinal_value.set_i(device_ordinal);
+    for (const string& attr_name :
+         std::vector<string>{"then_branch", "else_branch"}) {
+      NameAttrList branch_func;
+      TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), attr_name, &branch_func));
+      (*branch_func.mutable_attr())["device_ordinal"] = device_ordinal_value;
+      node->ClearAttr(attr_name);
+      node->AddAttr(attr_name, branch_func);
+    }
+  } else if (node->type_string() == "While") {
+    AttrValue device_ordinal_value;
+    device_ordinal_value.set_i(device_ordinal);
+    for (const string& attr_name : std::vector<string>{"cond", "body"}) {
+      NameAttrList branch_func;
+      TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), attr_name, &branch_func));
+      (*branch_func.mutable_attr())["device_ordinal"] = device_ordinal_value;
+      node->ClearAttr(attr_name);
+      node->AddAttr(attr_name, branch_func);
+    }
+  } else if (HasNodeAttr(node->def(), "device_ordinal")) {
+    // Function call node containing outside compilation.
+    node->ClearAttr("device_ordinal");
+    node->AddAttr("device_ordinal", device_ordinal);
+  } else {
+    return errors::Internal("Unknown node type to set 'device_ordinal': ",
+                            node->DebugString());
+  }
+  return Status::OK();
+}
+
 std::set<std::string> CalculateTokenInputsForOutputToken(const Graph& g) {
   std::set<std::string> results;
   Node* first_side_effecting_node_on_path = nullptr;
diff --git a/tensorflow/compiler/tf2xla/side_effect_util.h b/tensorflow/compiler/tf2xla/side_effect_util.h
index f22ddb2f58e1fa5c10ca0fdb956d9136942388b7..75e1f253fb08ae61b0336a8783b7449c69197dd1 100644
--- a/tensorflow/compiler/tf2xla/side_effect_util.h
+++ b/tensorflow/compiler/tf2xla/side_effect_util.h
@@ -35,6 +35,13 @@ extern const char kXlaTokenInputNodesAttrName[];
 // node has side-effect dependency on current graph's token input.
 extern const char kXlaTokenArgNodeName[];
 
+// This node have XlaRecvAtHost/XlaSendFromHost in its associated functions.
+extern const char kXlaHasHostTransferAttrName[];
+
+// Sets device ordinal attribute for nodes with attribute
+// `kXlaHasHostTransferAttrName`.
+Status SetDeviceOrdinalAttributeForNode(Node* node, int device_ordinal);
+
 // Calculates side-effect dependencies for the graph's token output.
 // Returns a set of node names representing these dependencies.
 std::set<std::string> CalculateTokenInputsForOutputToken(const Graph& g);
diff --git a/tensorflow/compiler/tf2xla/tf2xla.cc b/tensorflow/compiler/tf2xla/tf2xla.cc
index 9fac16a9700419b189bf5393c2b8bd7d76c6c1cc..28a4566c9d284fb8410a2d618f368c4dd2c1d893 100644
--- a/tensorflow/compiler/tf2xla/tf2xla.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla.cc
@@ -243,7 +243,9 @@ Status CreateXlaArgs(const Graph& graph,
     XlaCompiler::Argument arg;
     arg.kind = XlaCompiler::Argument::kParameter;
     TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), "T", &arg.type));
-    TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), kShapeAttr, &arg.shape));
+    TensorShape shape;
+    TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), kShapeAttr, &shape));
+    arg.shape = shape;
     TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), kDebugNameAttr, &arg.name));
     xla_args->push_back(arg);
   }
@@ -252,7 +254,8 @@ Status CreateXlaArgs(const Graph& graph,
 
 // Converts the TensorFlow graph into an XLA computation, by executing the
 // graph symbolically, with each op building up the XLA HLO.
-Status ConvertGraphToXla(std::unique_ptr<Graph> graph, xla::Client* client,
+Status ConvertGraphToXla(std::unique_ptr<Graph> graph,
+                         const tf2xla::Config& config, xla::Client* client,
                          xla::XlaComputation* computation) {
   XlaOpRegistry::RegisterCompilationKernels();
   for (Node* node : graph->nodes()) {
@@ -262,6 +265,19 @@ Status ConvertGraphToXla(std::unique_ptr<Graph> graph, xla::Client* client,
   std::vector<XlaCompiler::Argument> xla_args;
   TF_RETURN_IF_ERROR(CreateXlaArgs(*graph, &xla_args));
 
+  // Populate arguments with resource variables from the config. The variables
+  // get turned into inputs and outputs.
+  for (const tf2xla::Variable& variable : config.variable()) {
+    XlaCompiler::Argument arg;
+    arg.type = variable.type();
+    arg.kind = XlaCompiler::Argument::kResource;
+    arg.shape = variable.shape();
+    arg.name = variable.node_name();
+    arg.resource_kind = XlaResource::kVariable;
+    arg.initialized = true;
+    xla_args.push_back(std::move(arg));
+  }
+
   // Compile the graph into an XLA computation.
   XlaCompiler::Options compiler_options;
   compiler_options.client = client;
@@ -359,7 +375,8 @@ Status ConvertGraphDefToXla(const GraphDef& graph_def,
                             xla::XlaComputation* computation) {
   std::unique_ptr<Graph> graph;
   TF_RETURN_IF_ERROR(InitGraph(graph_def, config, &graph));
-  TF_RETURN_IF_ERROR(ConvertGraphToXla(std::move(graph), client, computation));
+  TF_RETURN_IF_ERROR(
+      ConvertGraphToXla(std::move(graph), config, client, computation));
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/tf2xla/tf2xla.proto b/tensorflow/compiler/tf2xla/tf2xla.proto
index 18c9089f5fa0e9792a4763d9bfac4c4e826eb5b2..5627af7452b99da594c1c214d0b556d8d70544d5 100644
--- a/tensorflow/compiler/tf2xla/tf2xla.proto
+++ b/tensorflow/compiler/tf2xla/tf2xla.proto
@@ -39,6 +39,15 @@ message Fetch {
   string name = 2;  // Optional name for generated code.
 };
 
+// Variable represents a resource variable with the given name, shape and type.
+message Variable {
+  string node_name = 1;
+  string name =
+      2;  // Optional name for generated code. If empty, node_name will be used.
+  TensorShapeProto shape = 3;
+  DataType type = 4;
+}
+
 // Config represents configuration information for tf2xla conversion.
 message Config {
   // Each feed is a positional input argument for the generated computation.
@@ -47,4 +56,6 @@ message Config {
   // Each fetch is a positional output argument for the generated computation.
   // The order of each entry matches the order of each output argument.
   repeated Fetch fetch = 2;
+  // Each variable is a named input and output of the generated computation.
+  repeated Variable variable = 3;
 };
diff --git a/tensorflow/compiler/tf2xla/tf2xla_test.cc b/tensorflow/compiler/tf2xla/tf2xla_test.cc
index ab26d939ccba75ce58609ffd71c7ccadbe90cfa8..24afe595b18b823818bd8fe65bc599af8bce040a 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_test.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_test.cc
@@ -91,7 +91,7 @@ TEST(ConvertGraphDefToXla, Sum) {
       client->ExecuteAndTransfer(computation, {x_global.get(), y_global.get()});
   TF_EXPECT_OK(result_or.status());
   xla::Literal result = std::move(result_or.ValueOrDie());
-  EXPECT_EQ("(s32[]) (\n42\n)", result.ToString());
+  EXPECT_EQ("(\ns32[] 42\n)", result.ToString());
 
   config.mutable_feed(0)->mutable_id()->set_output_index(
       123); /* invalid output_index */
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.cc b/tensorflow/compiler/tf2xla/tf2xla_util.cc
index cc81772e8c5da710bc733f7e4f5fe820b2c2d110..88c03a6056ac6484013c3fd32c9889899b5c15c5 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_util.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_util.cc
@@ -122,7 +122,12 @@ Status ReplaceArgUsageWithConstNode(
 
   for (const auto& iter : const_input_index_to_node) {
     int arg_index = iter.first;
-    Node* const_node = g->CopyNode(iter.second);
+    NodeDef const_def = iter.second->def();
+    const_def.set_name(g->NewName(const_def.name()));
+    Status s;
+    Node* const_node = g->AddNode(const_def, &s);
+    TF_RETURN_IF_ERROR(s);
+
     Node* arg_node = arg_nodes[arg_index];
 
     // Collect all usages of the _Arg node.
@@ -265,6 +270,13 @@ Status PropagateConstIntoWhileNode(Graph* g, Node* while_node,
     }
 
     // Check if i-th retval's input comes from i-th arg directly.
+    // For resource variable input of While nodes, TF2XLA convention is to place
+    // them at the end of all inputs (after all data inputs), and *not* return
+    // them. So number of While node inputs might be larger than number of its
+    // outputs.
+    if (i >= body_func->signature().output_arg_size()) {
+      continue;
+    }
     const OpDef_ArgDef& output_arg = body_func->signature().output_arg(i);
     auto output_arg_input = body_func->ret().find(output_arg.name());
     if (output_arg_input == body_func->ret().end()) {
@@ -364,6 +376,7 @@ Status AddPlaceholdersForFeeds(
       GraphDef gd;
       *gd.mutable_versions() = graph_def->versions();
       *gd.add_node() = *existing;
+      MergeDebugInfo(NodeDebugInfo(*existing), gd.mutable_node(0));
       TF_RETURN_IF_ERROR(
           AddDefaultAttrsToGraphDef(&gd, *op_registry, 0 /*node_offset*/));
 
@@ -390,6 +403,7 @@ Status AddPlaceholdersForFeeds(
   // in this code.
   for (auto it = placeholder_info.begin(); it != placeholder_info.end(); ++it) {
     const PlaceholderInfo& info = it->second;
+    // TODO(shikharagarwal): Add original node information.
     NodeDef* d = graph_def->add_node();
     d->set_name(info.placeholder_name);
     d->set_op("PlaceholderV2");
@@ -557,6 +571,12 @@ bool HasAssociatedFunction(const NodeDef& node_def,
     return true;
   }
 
+  if (node_def.op() == "XlaHostCompute") {
+    // XlaHostCompute has "shape_inference_graph" func attr, but that's not
+    // related to graph execution.
+    return false;
+  }
+
   for (const auto& iter : node_def.attr()) {
     if (iter.second.has_func()) {
       return true;
@@ -578,6 +598,9 @@ std::vector<AssociatedFunctionInfo> GetAssociatedFunctions(
     // This is a SymbolicGradient op.
     AttrValueMap attrs(node.attrs().begin(), node.attrs().end());
     results.emplace_back(AssociatedFunctionInfo::SymbolicGradient(op, attrs));
+  } else if (node.type_string() == "XlaHostCompute") {
+    // XlaHostCompute has "shape_inference_graph" func attr, but that's not
+    // related to graph execution.
   } else {
     // Collect all function attrs for the node.
     for (auto& iter : node.attrs()) {
@@ -599,7 +622,9 @@ Status RewriteAssociatedFunction(
   switch (associated_function.type()) {
     case AssociatedFunctionInfo::kFunctionCallNode: {
       // Change this node to call the new function.
-      NodeDefBuilder builder(node->name(), rewritten_function_name, fld);
+      NodeDebugInfo debug_info(*node);
+      NodeDefBuilder builder(node->name(), rewritten_function_name, fld,
+                             &debug_info);
       for (auto attr : node->attrs()) {
         builder.Attr(attr.first, attr.second);
       }
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util_test.cc b/tensorflow/compiler/tf2xla/tf2xla_util_test.cc
index 202e929315cacd4d6cdfc69d50639d8a427ec6c2..28b4744470e7d28863b5f7275f829b9bd59641e1 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_util_test.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_util_test.cc
@@ -21,11 +21,13 @@ limitations under the License.
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/ops/data_flow_ops.h"
 #include "tensorflow/cc/ops/function_ops.h"
+#include "tensorflow/cc/ops/functional_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/compiler/tf2xla/sharding_util.h"
 #include "tensorflow/core/common_runtime/graph_optimizer.h"
 #include "tensorflow/core/common_runtime/process_function_library_runtime.h"
 #include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/graph_to_functiondef.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -329,5 +331,90 @@ TEST(CachedFunctionHandles, Basic) {
   TF_EXPECT_OK(cached_function_handles.ReleaseAllHandles());
 }
 
+TEST(PropagateConstIntoFunctionalNodes, WhileLoopWithResourceInput) {
+  FunctionLibraryDefinition fld(OpRegistry::Global(), {});
+  {
+    // Cond graph & body graph.
+    Scope scope = Scope::NewRootScope().ExitOnError();
+    auto pred = ops::_Arg(scope.WithOpName("pred"), DT_BOOL, 0);
+    auto input = ops::_Arg(scope.WithOpName("input"), DT_RESOURCE, 1);
+    auto ret = ops::_Retval(scope.WithOpName("ret"), pred, 0);
+    Graph graph(OpRegistry::Global());
+    TF_ASSERT_OK(scope.ToGraph(&graph));
+    FunctionDef cond_fdef;
+    TF_ASSERT_OK(GraphToFunctionDef(graph, "cond", &cond_fdef));
+    TF_ASSERT_OK(fld.AddFunctionDef(cond_fdef));
+    FunctionDef body_fdef;
+    TF_ASSERT_OK(GraphToFunctionDef(graph, "body", &body_fdef));
+    TF_ASSERT_OK(fld.AddFunctionDef(body_fdef));
+  }
+  Scope scope = Scope::NewRootScope().ExitOnError();
+  auto pred = ops::Const(scope.WithOpName("pred"), false, TensorShape({}));
+  auto input = ops::Const(scope.WithOpName("input"), 0, TensorShape({}));
+  NameAttrList cond_fn, body_fn;
+  cond_fn.set_name("cond");
+  body_fn.set_name("body");
+  auto while_op =
+      ops::While(scope.WithOpName("while"),
+                 std::initializer_list<Input>{pred, input}, cond_fn, body_fn);
+  Graph graph(OpRegistry::Global());
+  TF_ASSERT_OK(scope.ToGraph(&graph));
+
+  TF_EXPECT_OK(PropagateConstIntoFunctionalNodes(&graph, &fld, &fld));
+}
+
+TEST(PropagateConstIntoFunctionalNodes, CopiedConstNodeHasUniqueName) {
+  FunctionLibraryDefinition fld(OpRegistry::Global(), {});
+  {
+    // Cond graph & body graph.
+    Scope scope = Scope::NewRootScope().ExitOnError();
+    auto pred = ops::_Arg(scope.WithOpName("arg0"), DT_BOOL, 0);
+    auto input = ops::_Arg(scope.WithOpName("arg1"), DT_BOOL, 1);
+    auto duplicate_name = ops::NoOp(scope.WithOpName("duplicate_name"));
+    auto ret = ops::_Retval(scope.WithOpName("ret"), pred, 0);
+    Graph graph(OpRegistry::Global());
+    TF_ASSERT_OK(scope.ToGraph(&graph));
+    FunctionDef cond_fdef;
+    TF_ASSERT_OK(GraphToFunctionDef(graph, "cond", &cond_fdef));
+    TF_ASSERT_OK(fld.AddFunctionDef(cond_fdef));
+    FunctionDef body_fdef;
+    TF_ASSERT_OK(GraphToFunctionDef(graph, "body", &body_fdef));
+    TF_ASSERT_OK(fld.AddFunctionDef(body_fdef));
+  }
+  Scope scope = Scope::NewRootScope().ExitOnError();
+  auto pred =
+      ops::Const(scope.WithOpName("duplicate_name"), false, TensorShape({}));
+  auto input = ops::Const(scope.WithOpName("input"), false, TensorShape({}));
+  NameAttrList cond_fn, body_fn;
+  cond_fn.set_name("cond");
+  body_fn.set_name("body");
+  auto while_op =
+      ops::While(scope.WithOpName("while"),
+                 std::initializer_list<Input>{pred, input}, cond_fn, body_fn);
+  Graph graph(OpRegistry::Global());
+  TF_ASSERT_OK(scope.ToGraph(&graph));
+
+  TF_EXPECT_OK(PropagateConstIntoFunctionalNodes(&graph, &fld, &fld));
+
+  // Check that in rewritten body function, the NoOp node still has name
+  // "duplicate_name", and the copied Const node has name "duplicate_name/_0".
+  auto node_name_index = graph.BuildNodeNameIndex();
+  Node* while_node = node_name_index["while"];
+  ASSERT_NE(while_node, nullptr);
+  TF_ASSERT_OK(GetNodeAttr(while_node->def(), "body", &body_fn));
+  const FunctionDef* rewritten_body_fn = fld.Find(body_fn.name());
+  ASSERT_NE(rewritten_body_fn, nullptr);
+  std::unordered_map<string, NodeDef> nodes;
+  for (const NodeDef& node_def : rewritten_body_fn->node_def()) {
+    nodes[node_def.name()] = node_def;
+  }
+  auto noop_def = nodes.find("duplicate_name");
+  ASSERT_NE(noop_def, nodes.end());
+  EXPECT_EQ(noop_def->second.op(), "NoOp");
+  auto const_def = nodes.find("duplicate_name/_0");
+  ASSERT_NE(const_def, nodes.end());
+  EXPECT_EQ(const_def->second.op(), "Const");
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/type_util.cc b/tensorflow/compiler/tf2xla/type_util.cc
index d00b1376620c0c9d112c7d7426758f6d3f25e86f..732f957d7329c93ad104dacf5190948fbfd7974b 100644
--- a/tensorflow/compiler/tf2xla/type_util.cc
+++ b/tensorflow/compiler/tf2xla/type_util.cc
@@ -69,6 +69,9 @@ Status DataTypeToPrimitiveType(DataType data_type, xla::PrimitiveType* type) {
     case tensorflow::DT_COMPLEX64:
       *type = xla::C64;
       return Status::OK();
+    case tensorflow::DT_COMPLEX128:
+      *type = xla::C128;
+      return Status::OK();
     default:
       return errors::InvalidArgument(
           "Unsupported type in DataTypeToPrimitiveType ",
diff --git a/tensorflow/compiler/tf2xla/xla_compilation_device.cc b/tensorflow/compiler/tf2xla/xla_compilation_device.cc
index ddb284966eeb97cc7c9d3ed77fb313e567975e59..5bd0277c051711f2677b90a2679662899521e94a 100644
--- a/tensorflow/compiler/tf2xla/xla_compilation_device.cc
+++ b/tensorflow/compiler/tf2xla/xla_compilation_device.cc
@@ -60,8 +60,6 @@ class XlaCompilationAllocator : public Allocator {
   // buffers, so they get ids to track.
   bool ShouldAllocateEmptyTensors() override { return true; }
 
-  void GetStats(AllocatorStats* stats) override { stats->Clear(); }
-
  private:
   // Don't run any constructors or destructors for complex objects,
   // since there is no backing store for the tensor to run them
diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
index c7341cf8b9e8d7a06fd304ae8766420d20f0c16e..de2e485a47c18ae8e58a06aba408dbb61a30d00a 100644
--- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
+++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
@@ -59,45 +59,8 @@ class XlaCompiledCpuFunction {
   // AOT this is backed by data compiled into the object file.
   //
   // The contents of StaticData are XLA-internal implementation details and
-  // should not be relied on by clients.
-  //
-  // TODO(sanjoy): Come up with a cleaner way to express the contraint we want
-  // here: generated XlaCompiledCpuFunction subclasses should be able to create
-  // instances of StaticData but only XlaCompiledCpuFunction should be able to
-  // read from StaticData instances.
+  // should not be relied on by clients (and therefore are private).
   class StaticData {
-   public:
-    void set_raw_function(RawFunction raw_function) {
-      raw_function_ = raw_function;
-    }
-    void set_buffer_infos(
-        const cpu_function_runtime::BufferInfo* buffer_infos) {
-      buffer_infos_ = buffer_infos;
-    }
-    void set_num_buffers(size_t num_buffers) { num_buffers_ = num_buffers; }
-    void set_arg_index_table(const int32* arg_index_table) {
-      arg_index_table_ = arg_index_table;
-    }
-    void set_num_args(int64 num_args) { num_args_ = num_args; }
-    void set_result_index(size_t result_index) { result_index_ = result_index; }
-    void set_arg_names(const char** arg_names) { arg_names_ = arg_names; }
-    void set_result_names(const char** result_names) {
-      result_names_ = result_names;
-    }
-    void set_program_shape(const xla::ProgramShapeProto* program_shape) {
-      program_shape_ = program_shape;
-    }
-    const xla::HloProfilePrinterData* hlo_profile_printer_data() const {
-      return hlo_profile_printer_data_;
-    }
-    void set_hlo_profile_printer_data(
-        const xla::HloProfilePrinterData* hlo_profile_printer_data) {
-      hlo_profile_printer_data_ = hlo_profile_printer_data;
-    }
-    void set_profile_counters_size(int64 profile_counters_size) {
-      profile_counters_size_ = profile_counters_size;
-    }
-
    private:
     // The raw function to call.
     RawFunction raw_function_;
@@ -134,7 +97,8 @@ class XlaCompiledCpuFunction {
     // declared so we don't have access to that information here.
     int64 profile_counters_size_ = 0;
 
-    // Only XlaCompiledCpuFunction is allowed to read the above fields.
+    // Only XlaCompiledCpuFunction is allowed to read and write the above
+    // fields.
     friend class XlaCompiledCpuFunction;
   };
 
@@ -148,7 +112,7 @@ class XlaCompiledCpuFunction {
     RESULTS_PROFILES_AND_TEMPS_ONLY,
   };
 
-  XlaCompiledCpuFunction(
+  explicit XlaCompiledCpuFunction(
       const StaticData& static_data,
       AllocMode alloc_mode = AllocMode::ARGS_RESULTS_PROFILES_AND_TEMPS);
   virtual ~XlaCompiledCpuFunction();
@@ -280,6 +244,76 @@ class XlaCompiledCpuFunction {
     return *hlo_profile_printer_data_;
   }
 
+ protected:
+  // ---------------------------------------------------------------------------
+  // Accessors for reading from and writing to instances of `StaticData`.
+  //
+  // Classes generated by tfcompile can call these because the generated classes
+  // inherit from `XlaCompiledCpuFunction`.  `XlaJitCompiledCpuFunction` can
+  // call these because it is explicitly added as a friend.
+
+  static void set_static_data_raw_function(StaticData* static_data,
+                                           RawFunction raw_function) {
+    static_data->raw_function_ = raw_function;
+  }
+
+  static void set_static_data_buffer_infos(
+      StaticData* static_data,
+      const cpu_function_runtime::BufferInfo* buffer_infos) {
+    static_data->buffer_infos_ = buffer_infos;
+  }
+
+  static void set_static_data_num_buffers(StaticData* static_data,
+                                          size_t num_buffers) {
+    static_data->num_buffers_ = num_buffers;
+  }
+
+  static void set_static_data_arg_index_table(StaticData* static_data,
+                                              const int32* arg_index_table) {
+    static_data->arg_index_table_ = arg_index_table;
+  }
+
+  static void set_static_data_num_args(StaticData* static_data,
+                                       int64 num_args) {
+    static_data->num_args_ = num_args;
+  }
+
+  static void set_static_data_result_index(StaticData* static_data,
+                                           size_t result_index) {
+    static_data->result_index_ = result_index;
+  }
+
+  static void set_static_data_arg_names(StaticData* static_data,
+                                        const char** arg_names) {
+    static_data->arg_names_ = arg_names;
+  }
+
+  static void set_static_data_result_names(StaticData* static_data,
+                                           const char** result_names) {
+    static_data->result_names_ = result_names;
+  }
+
+  static void set_static_data_program_shape(
+      StaticData* static_data, const xla::ProgramShapeProto* program_shape) {
+    static_data->program_shape_ = program_shape;
+  }
+
+  static void set_static_data_hlo_profile_printer_data(
+      StaticData* static_data,
+      const xla::HloProfilePrinterData* hlo_profile_printer_data) {
+    static_data->hlo_profile_printer_data_ = hlo_profile_printer_data;
+  }
+
+  static const xla::HloProfilePrinterData*
+  get_static_data_hlo_profile_printer_data(StaticData* static_data) {
+    return static_data->hlo_profile_printer_data_;
+  }
+
+  static void set_static_data_profile_counters_size(
+      StaticData* static_data, int64 profile_counters_size) {
+    static_data->profile_counters_size_ = profile_counters_size;
+  }
+
  private:
   const RawFunction raw_function_;
   const size_t result_index_;
@@ -313,6 +347,10 @@ class XlaCompiledCpuFunction {
   const char** result_names_ = nullptr;
   const xla::ProgramShapeProto* program_shape_ = nullptr;
   const xla::HloProfilePrinterData* hlo_profile_printer_data_ = nullptr;
+
+  // Add `XlaJitCompiledCpuFunction` as a friend so that it can access the
+  // `set_static_data_*` static methods above.
+  friend class XlaJitCompiledCpuFunction;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc
index ee461a3c07d4db514c7697e005a9371be4b54dd0..3221ec5b727de1f792cd61b792ee917588d56cf9 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
+#include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/executor.h"
 #include "tensorflow/core/common_runtime/function.h"
@@ -42,6 +43,8 @@ limitations under the License.
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/lib/core/error_codes.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/platform/logging.h"
@@ -57,7 +60,11 @@ Status CheckSignature(const DataTypeVector& types,
                             " elements while function has ", types.size());
   }
   for (int i = 0; i < types.size(); ++i) {
-    if (types[i] != args[i].type && types[i] != DT_RESOURCE) {
+    // Don't perform type checks on resource variables and tensor
+    // lists (DT_VARIANT) as we have to trick the type system in order to
+    // plumb them through. DT_VARIANTS are wrapped in a DT_UINT8 tensor.
+    if (types[i] != args[i].type && types[i] != DT_RESOURCE &&
+        types[i] != DT_VARIANT) {
       return errors::Internal(
           "Argument ", i, " has declared type ", DataTypeString(args[i].type),
           " but function parameter has type ", DataTypeString(types[i]));
@@ -178,9 +185,10 @@ Status BuildComputation(
   std::vector<xla::XlaOp> elems;
   elems.reserve(retvals.size());
 
-  // Keeps track of which retvals have layout to update. The first element is
-  // the output index, second element is the new layout.
-  std::vector<std::pair<int64, xla::Layout>> retval_to_update_layout;
+  // Keeps track of the layout of each retval. If a retval is not in this list,
+  // a descending layout is used. The first element is the output index, second
+  // element is the new layout.
+  std::vector<std::pair<int64, xla::Layout>> retval_index_and_layout;
   for (int i = 0; i < retvals.size(); ++i) {
     XlaCompiler::OutputDescription& output = (*outputs)[i];
     const XlaExpression& retval = retvals[i];
@@ -192,6 +200,8 @@ Status BuildComputation(
         output.shape = output.constant_value.shape();
         break;
 
+      case XlaExpression::Kind::kTensorList:
+        TF_FALLTHROUGH_INTENDED;
       case XlaExpression::Kind::kXlaOp: {
         output.is_constant = false;
         TF_ASSIGN_OR_RETURN(output.shape, retval.GetShape());
@@ -207,7 +217,7 @@ Status BuildComputation(
           TF_ASSIGN_OR_RETURN(xla::Shape shape, shape_representation_fn(
                                                     output.shape, output.type));
           value = xla::Reshape(value, xla::AsInt64Slice(shape.dimensions()));
-          retval_to_update_layout.emplace_back(elems.size(), shape.layout());
+          retval_index_and_layout.emplace_back(elems.size(), shape.layout());
         } else if (it != retval_cores.end()) {
           // Apply the sharding to the output, if there is a core assignment.
           value = identity_op(value);
@@ -280,6 +290,11 @@ Status BuildComputation(
       // Ensures the correct sharding is applied to the output.
       handle = identity_op(handle);
 
+      // Set layout of the retval to device representation layout.
+      if (resource->representation_shape().has_value()) {
+        retval_index_and_layout.emplace_back(
+            elems.size(), resource->representation_shape()->layout());
+      }
       elems.push_back(handle);
     }
   }
@@ -309,15 +324,15 @@ Status BuildComputation(
                       computation->GetProgramShape());
   *output_shape = program_shape.result();
   // Update the output layout to the layout of retval.
-  for (auto& update : retval_to_update_layout) {
+  for (auto& index_and_layout : retval_index_and_layout) {
     if (!always_return_tuple && elems.size() == 1) {
-      *output_shape->mutable_layout() = update.second;
+      *output_shape->mutable_layout() = index_and_layout.second;
       continue;
     }
 
-    xla::Shape* output_sub_shape =
-        xla::ShapeUtil::GetMutableSubshape(output_shape, {update.first});
-    *output_sub_shape->mutable_layout() = update.second;
+    xla::Shape* output_sub_shape = xla::ShapeUtil::GetMutableSubshape(
+        output_shape, {index_and_layout.first});
+    *output_sub_shape->mutable_layout() = index_and_layout.second;
   }
   return Status::OK();
 }
@@ -333,8 +348,21 @@ bool XlaCompiler::Argument::operator==(
                other.tensor_array_gradients)) {
     return false;
   }
-  if (shape != other.shape) {
-    return false;
+  if (absl::holds_alternative<xla::Shape>(shape)) {
+    if (!absl::holds_alternative<xla::Shape>(other.shape)) {
+      return false;
+    }
+    if (!xla::Shape::Equal()(absl::get<xla::Shape>(shape),
+                             absl::get<xla::Shape>(other.shape))) {
+      return false;
+    }
+  } else {
+    if (!absl::holds_alternative<TensorShape>(other.shape)) {
+      return false;
+    }
+    if (absl::get<TensorShape>(shape) != absl::get<TensorShape>(other.shape)) {
+      return false;
+    }
   }
   if (constant_value.shape() != other.constant_value.shape()) {
     return false;
@@ -348,7 +376,7 @@ string XlaCompiler::Argument::HumanString() const {
     common = absl::StrCat(" name=", name);
   }
   absl::StrAppend(&common, " type=", DataTypeString(type),
-                  " shape=", shape.DebugString());
+                  " shape=", ShapeHumanString());
   switch (kind) {
     case kInvalid:
       return "invalid";
@@ -375,6 +403,23 @@ string XlaCompiler::Argument::HumanString() const {
   }
 }
 
+std::vector<int64> XlaCompiler::Argument::DimensionSizes() const {
+  if (absl::holds_alternative<TensorShape>(shape)) {
+    return xla::InlinedVectorToVector(
+        absl::get<TensorShape>(shape).dim_sizes());
+  } else {
+    return absl::get<xla::Shape>(shape).dimensions();
+  }
+}
+
+string XlaCompiler::Argument::ShapeHumanString() const {
+  if (absl::holds_alternative<TensorShape>(shape)) {
+    return absl::get<TensorShape>(shape).DebugString();
+  } else {
+    return absl::get<xla::Shape>(shape).DebugString();
+  }
+}
+
 XlaCompiler::XlaCompiler(XlaCompiler::Options options)
     : options_(options),
       initialization_status_(Status::OK()),
@@ -462,8 +507,34 @@ std::unique_ptr<Graph> XlaCompiler::GetGraph(const FunctionBody* fbody) {
   opts.set_do_function_inlining(true);
   opts.set_do_constant_folding(true);
   GraphOptimizer optimizer(opts);
+  // Do not constant fold nodes that output DT_VARIANT type tensors.
+  // XLA does not support Const nodes of Variant type since it needs
+  // to know the original ops to be able to compile them to the relevant
+  // XLA form.
+  // TODO(srbs): This filter is a little conservative. E.g. a subgraph of
+  // the form:
+  //                          Const
+  //                            |
+  // EmptyTensorList -> TensorListPushBack -> TensorListPopBack -> Op
+  //                                                  |
+  //                                        (Discard popped list)
+  //
+  // Would have been reduced to "Const -> Op" without this filter.
+  // However since we are only allowed to specify the filter at the "Node"
+  // level there is no good way to allow the above behavior. So we
+  // disallow any sort of constant folding on Variant nodes for now.
+  auto cf_consider_fn = [](const Node* n) {
+    for (const auto& output_arg : n->op_def().output_arg()) {
+      if (output_arg.type() == DT_VARIANT) {
+        return false;
+      }
+    }
+    return true;
+  };
+  GraphOptimizer::Options graph_optimizer_options;
+  graph_optimizer_options.cf_consider_fn = cf_consider_fn;
   optimizer.Optimize(flib_runtime_, flib_runtime_->env(),
-                     /*device=*/nullptr, &graph, /*shape_map=*/nullptr);
+                     /*device=*/nullptr, &graph, graph_optimizer_options);
 
   return graph;
 }
@@ -548,11 +619,22 @@ Status XlaCompiler::XLAShapeForArgument(const XlaCompiler::Argument& arg,
       LOG(FATAL) << "Unreachable case";
     case XlaCompiler::Argument::kParameter: {
       if (is_entry_computation) {
-        TF_ASSIGN_OR_RETURN(
-            *xla_shape, options_.shape_representation_fn(arg.shape, arg.type));
+        TensorShape shape;
+        if (absl::holds_alternative<TensorShape>(arg.shape)) {
+          shape = absl::get<TensorShape>(arg.shape);
+        } else {
+          TF_RETURN_IF_ERROR(
+              XLAShapeToTensorShape(absl::get<xla::Shape>(arg.shape), &shape));
+        }
+        TF_ASSIGN_OR_RETURN(*xla_shape,
+                            options_.shape_representation_fn(shape, arg.type));
       } else {
-        TF_RETURN_IF_ERROR(
-            TensorShapeToXLAShape(arg.type, arg.shape, xla_shape));
+        if (absl::holds_alternative<xla::Shape>(arg.shape)) {
+          *xla_shape = absl::get<xla::Shape>(arg.shape);
+        } else {
+          TF_RETURN_IF_ERROR(TensorShapeToXLAShape(
+              arg.type, absl::get<TensorShape>(arg.shape), xla_shape));
+        }
       }
       return Status::OK();
     }
@@ -561,8 +643,10 @@ Status XlaCompiler::XLAShapeForArgument(const XlaCompiler::Argument& arg,
 
       switch (arg.resource_kind) {
         case XlaResource::kVariable: {
-          TF_ASSIGN_OR_RETURN(*xla_shape, options_.shape_representation_fn(
-                                              arg.shape, arg.type));
+          TF_RET_CHECK(absl::holds_alternative<TensorShape>(arg.shape));
+          TF_ASSIGN_OR_RETURN(*xla_shape,
+                              options_.shape_representation_fn(
+                                  absl::get<TensorShape>(arg.shape), arg.type));
 
           return Status::OK();
         }
@@ -571,9 +655,10 @@ Status XlaCompiler::XLAShapeForArgument(const XlaCompiler::Argument& arg,
             return errors::InvalidArgument(
                 "Negative max_array_size in XLAShapeForArgument");
           }
+          TF_RET_CHECK(absl::holds_alternative<TensorShape>(arg.shape));
           TensorShape shape;
           shape.AddDim(arg.max_array_size);
-          shape.AppendShape(arg.shape);
+          shape.AppendShape(absl::get<TensorShape>(arg.shape));
           TF_RETURN_IF_ERROR(TensorShapeToXLAShape(arg.type, shape, xla_shape));
 
           if (!arg.tensor_array_gradients.empty()) {
@@ -588,9 +673,10 @@ Status XlaCompiler::XLAShapeForArgument(const XlaCompiler::Argument& arg,
             return errors::InvalidArgument(
                 "Negative max_array_size in XLAShapeForArgument");
           }
+          TF_RET_CHECK(absl::holds_alternative<TensorShape>(arg.shape));
           TensorShape shape;
           shape.AddDim(arg.max_array_size);
-          shape.AppendShape(arg.shape);
+          shape.AppendShape(absl::get<TensorShape>(arg.shape));
           xla::Shape buffer_shape;
           TF_RETURN_IF_ERROR(
               TensorShapeToXLAShape(arg.type, shape, &buffer_shape));
@@ -620,14 +706,15 @@ Status XlaCompiler::BuildArguments(
     bool use_tuple_arg, xla::XlaBuilder* builder, XlaContext* context,
     const std::map<int, int>& arg_cores,
     std::vector<XlaExpression>* arg_expressions,
-    std::vector<int>* input_mapping, std::vector<xla::Shape>* input_shapes,
+    std::vector<int>* input_to_args, std::vector<xla::Shape>* input_shapes,
     bool is_entry_computation) {
   arg_expressions->resize(args.size());
 
   // Argument numbers of arguments and resources that are to be passed to the
-  // XLA computation as runtime parameters.
-  input_mapping->clear();
-  input_mapping->reserve(args.size());
+  // XLA computation as runtime parameters. `input_to_args[a] = b` means that
+  // the a'th XLA input corresponds to the b'th original arg indexes.
+  input_to_args->clear();
+  input_to_args->reserve(args.size());
 
   // Fills in constant arguments, and computes non-constant argument order.
   for (std::vector<XlaCompiler::Argument>::size_type i = 0; i < args.size();
@@ -637,24 +724,25 @@ Status XlaCompiler::BuildArguments(
     switch (arg.kind) {
       case XlaCompiler::Argument::kResource: {
         TF_RET_CHECK(arg.resource_kind != XlaResource::kInvalid);
+        TF_RET_CHECK(absl::holds_alternative<TensorShape>(arg.shape));
         // TODO(phawkins): this code assumes that resource arguments do not
         // alias.
         XlaResource* resource =
             context->AddResource(absl::make_unique<XlaResource>(
-                arg.resource_kind, i, arg.name, arg.type, arg.shape,
-                xla::XlaOp(),
+                arg.resource_kind, i, arg.name, arg.type,
+                absl::get<TensorShape>(arg.shape), xla::XlaOp(),
                 /*max_array_size=*/arg.max_array_size,
                 /*tensor_array_gradients=*/arg.tensor_array_gradients,
                 /*tensor_array_multiple_writes_aggregate=*/true));
         arg_expression = XlaExpression::Resource(resource);
         if (arg.initialized) {
-          input_mapping->push_back(i);
+          input_to_args->push_back(i);
         }
         break;
       }
       case XlaCompiler::Argument::kParameter:
       case XlaCompiler::Argument::kToken: {
-        input_mapping->push_back(i);
+        input_to_args->push_back(i);
         break;
       }
       case XlaCompiler::Argument::kConstant:
@@ -666,15 +754,23 @@ Status XlaCompiler::BuildArguments(
     }
   }
 
-  if (input_mapping->empty()) {
+  if (input_to_args->empty()) {
     return Status::OK();
   }
 
-  std::vector<xla::Shape> arg_shapes(input_mapping->size());
-  for (std::vector<int>::size_type i = 0; i < input_mapping->size(); ++i) {
+  // `arg_to_inputs[c] = d` means that the c'th original arg index corresponds
+  // to the d'th XLA input. Note that the value -1 corresponds to constants, or
+  // other args that don't correspond to an input.
+  std::vector<int> arg_to_inputs(args.size(), -1);
+  for (int i = 0; i < input_to_args->size(); i++) {
+    arg_to_inputs[input_to_args->at(i)] = i;
+  }
+
+  std::vector<xla::Shape> arg_shapes(input_to_args->size());
+  for (std::vector<int>::size_type i = 0; i < input_to_args->size(); ++i) {
     // Computes the shapes of non-constant arguments.
     TF_RETURN_IF_ERROR(XLAShapeForArgument(
-        args[(*input_mapping)[i]], is_entry_computation, &arg_shapes[i]));
+        args[(*input_to_args)[i]], is_entry_computation, &arg_shapes[i]));
   }
 
   if (use_tuple_arg) {
@@ -691,13 +787,13 @@ Status XlaCompiler::BuildArguments(
   builder->SetOpMetadata(arg_metadata);
 
   // Build parameter handles for non-constant arguments.
-  std::vector<xla::XlaOp> arg_handles(input_mapping->size());
+  std::vector<xla::XlaOp> arg_handles(input_to_args->size());
   if (use_tuple_arg) {
     xla::XlaOp tuple;
     if (is_entry_computation) {
       xla::OpSharding tuple_sharding;
       tuple_sharding.set_type(xla::OpSharding::Type::OpSharding_Type_TUPLE);
-      for (int64 parameter : *input_mapping) {
+      for (int64 parameter : *input_to_args) {
         auto it = arg_cores.find(parameter);
         const int core = it == arg_cores.end() ? 0 : it->second;
         *tuple_sharding.add_tuple_shardings() =
@@ -709,7 +805,19 @@ Status XlaCompiler::BuildArguments(
     } else {
       tuple = xla::Parameter(builder, 0, (*input_shapes)[0], "arg_tuple");
     }
-    for (std::vector<int>::size_type i = 0; i < input_mapping->size(); ++i) {
+
+    for (int i = 0; i < input_to_args->size(); ++i) {
+      const XlaCompiler::Argument& arg = args[input_to_args->at(i)];
+      for (const auto& dim_and_arg_num : arg.dynamic_dim_to_arg_num_map) {
+        int dynamic_size_param_index = arg_to_inputs.at(dim_and_arg_num.second);
+        TF_RETURN_IF_ERROR(builder->SetDynamicBinding(
+            /*dynamic_size_param_num=*/0, {dynamic_size_param_index},
+            /*target_param_num=*/0, /*target_param_index=*/{i},
+            dim_and_arg_num.first));
+      }
+    }
+
+    for (std::vector<int>::size_type i = 0; i < input_to_args->size(); ++i) {
       auto it = arg_cores.find(i);
       const int core = it == arg_cores.end() ? -1 : it->second;
       xla::XlaScopedShardingAssignment assign_sharding(
@@ -718,7 +826,7 @@ Status XlaCompiler::BuildArguments(
       arg_handles[i] = xla::GetTupleElement(tuple, i);
     }
   } else {
-    for (std::vector<int>::size_type i = 0; i < input_mapping->size(); ++i) {
+    for (std::vector<int>::size_type i = 0; i < input_to_args->size(); ++i) {
       auto it = arg_cores.find(i);
       const int core = it == arg_cores.end() ? -1 : it->second;
       xla::XlaScopedShardingAssignment assign_sharding(
@@ -727,6 +835,17 @@ Status XlaCompiler::BuildArguments(
       arg_handles[i] = xla::Parameter(builder, i, (*input_shapes)[i],
                                       absl::StrCat("arg", i));
     }
+
+    for (int i = 0; i < input_to_args->size(); ++i) {
+      const XlaCompiler::Argument& arg = args[input_to_args->at(i)];
+      for (const auto& dim_and_arg_num : arg.dynamic_dim_to_arg_num_map) {
+        int dynamic_size_param_index = arg_to_inputs.at(dim_and_arg_num.second);
+        TF_RETURN_IF_ERROR(builder->SetDynamicBinding(
+            /*dynamic_size_param_num=*/dynamic_size_param_index, {},
+            /*target_param_num=*/i, /*target_param_index=*/{},
+            dim_and_arg_num.first));
+      }
+    }
   }
 
   builder->ClearOpMetadata();
@@ -734,12 +853,12 @@ Status XlaCompiler::BuildArguments(
   // Fill in the handles in non-constant arguments, and reshape parameters
   // back to their correct shapes.
   VLOG(2) << "XLA computation inputs:";
-  for (std::vector<int>::size_type i = 0; i < input_mapping->size(); ++i) {
-    const XlaCompiler::Argument& arg = args[input_mapping->at(i)];
+  for (std::vector<int>::size_type i = 0; i < input_to_args->size(); ++i) {
+    const XlaCompiler::Argument& arg = args[input_to_args->at(i)];
     VLOG(2) << "  XLA arg " << i
             << " shape: " << xla::ShapeUtil::HumanString(arg_shapes[i])
-            << " name: " << arg.name << " TF arg " << input_mapping->at(i);
-    XlaExpression& arg_expression = (*arg_expressions)[input_mapping->at(i)];
+            << " name: " << arg.name << " TF arg " << input_to_args->at(i);
+    XlaExpression& arg_expression = (*arg_expressions)[input_to_args->at(i)];
     switch (arg.kind) {
       case XlaCompiler::Argument::kResource: {
         TF_RET_CHECK(arg.initialized);
@@ -756,7 +875,7 @@ Status XlaCompiler::BuildArguments(
         // return values of functions, and then reshape unconditionally.
         if (is_entry_computation) {
           arg_expression = XlaExpression::XlaOp(
-              xla::Reshape(arg_handles[i], arg.shape.dim_sizes()), arg.type);
+              xla::Reshape(arg_handles[i], arg.DimensionSizes()), arg.type);
         } else {
           arg_expression = XlaExpression::XlaOp(arg_handles[i], arg.type);
         }
@@ -997,8 +1116,17 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
   result->outputs.resize(context->retvals().size());
   std::vector<XlaExpression> retvals = context->retvals();
   if (options.resolve_compile_time_constants) {
-    TF_RETURN_IF_ERROR(ResolveConstantExpressionsToConstants(
-        client(), absl::Span<XlaExpression>(retvals)));
+    Status status = ResolveConstantExpressionsToConstants(
+        client(), absl::Span<XlaExpression>(retvals));
+
+    // If the HloEvaluator has not implemented an expression, just evaluate it
+    // at runtime.
+    if (status.code() == error::UNIMPLEMENTED) {
+      ConvertConstantsToExpressions(&builder,
+                                    absl::Span<XlaExpression>(retvals));
+    } else {
+      TF_RETURN_IF_ERROR(status);
+    }
   } else {
     ConvertConstantsToExpressions(&builder, absl::Span<XlaExpression>(retvals));
   }
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h
index 0d801b73a8c2651305328384377751254ecaa41d..ad3144b41bdf3fc8b75ab5230e8e128df2962884 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.h
+++ b/tensorflow/compiler/tf2xla/xla_compiler.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include <stack>
 
 #include "absl/types/span.h"
+#include "absl/types/variant.h"
 #include "tensorflow/compiler/tf2xla/host_compute_metadata.pb.h"
 #include "tensorflow/compiler/tf2xla/xla_compilation_device.h"
 #include "tensorflow/compiler/tf2xla/xla_expression.h"
@@ -124,7 +125,8 @@ class XlaCompiler {
     DataType type = DT_INVALID;
 
     // The shape of the argument. For:
-    // * a parameter: the shape of the parameter.
+    // * a parameter: the shape of the parameter. We allow setting the xla shape
+    //   if known. This helps avoid conversions to and from TensorShape.
     // * a constant: ignored; the shape given by constant_value is used
     //     instead.
     // * an uninitialized resource: ignored. We don't yet know the shape of an
@@ -133,7 +135,7 @@ class XlaCompiler {
     // * an initialized TensorArray or Stack resource: the shape of an entry in
     //   the TensorArray/Stack. Note this is the size of a single entry, not the
     //   XLA data structure that represents the complete stack/array.
-    TensorShape shape;
+    absl::variant<TensorShape, xla::Shape> shape;
 
     // The value of the argument, if it is a compile-time constant. Must be a
     // host-memory tensor.
@@ -157,10 +159,20 @@ class XlaCompiler {
     // as `tensor_array_gradients`.
     std::set<string> tensor_array_gradients;
 
+    // dynamic dims to arg number map. Empty if no dynamic shapes.
+    std::map<int32, int32> dynamic_dim_to_arg_num_map;
+    bool is_pad_arg = false;
+
     bool operator==(const Argument& other) const;
 
     // Returns a human-readable summary of the argument.
     string HumanString() const;
+
+    // Returns the dimension sizes for either TensorShape or xla::Shape.
+    std::vector<int64> DimensionSizes() const;
+
+    // Returns the human-readable string for either TensorShape or xla::Shape.
+    string ShapeHumanString() const;
   };
 
   // Options pertaining to an individual call to CompileGraph() or
@@ -420,7 +432,7 @@ class XlaCompiler {
                         XlaContext* context,
                         const std::map<int, int>& arg_cores,
                         std::vector<XlaExpression>* arg_expressions,
-                        std::vector<int>* input_mapping,
+                        std::vector<int>* input_to_args,
                         std::vector<xla::Shape>* input_shapes,
                         bool is_entry_computation);
 
diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
index fe2a5f5b0c9ea6b5f2bb71df836fdcabf9a0cf23..b31137867d738944eaaa73e142ad8538ec6b854a 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
@@ -82,7 +82,7 @@ namespace {
 // compiled kernels.
 class DummyResourceForTest : public ResourceBase {
  public:
-  string DebugString() override { return "dummy"; }
+  string DebugString() const override { return "dummy"; }
   void Increment() { ++value_; }
   int Get() { return value_; }
 
@@ -277,6 +277,97 @@ TEST_F(XlaCompilerTest, OutOfOrderGraph) {
   EXPECT_TRUE(xla::LiteralTestUtil::Equal(param0_literal, actual_literal));
 }
 
+// Tests that the compiler can correctly propagate the layout assigned by
+// shape_representation_fn_ to return types.
+TEST_F(XlaCompilerTest, HonorShapeRepresentationFnForRetVal) {
+  Scope scope = Scope::NewRootScope().ExitOnError();
+  auto a = ops::_Arg(scope.WithOpName("A"), DT_INT32, 0);
+  auto var = ops::_Arg(scope.WithOpName("V"), DT_RESOURCE, 1);
+  // Adds an identity op around the resource to make sure identity ops propagate
+  // resources correctly.
+  auto identity = ops::Identity(scope.WithOpName("VIdentity"), var);
+  auto write = ops::AssignAddVariableOp(scope, identity, a);
+  auto read = ops::ReadVariableOp(
+      scope.WithControlDependencies(std::vector<Operation>{write}), var,
+      DT_INT32);
+  auto read_plus_one = ops::Add(scope, read, ops::Const<int32>(scope, 1));
+  auto d = ops::_Retval(scope.WithOpName("D"), read_plus_one, 0);
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  TF_ASSERT_OK(scope.ToGraph(graph.get()));
+
+  // Builds a description of the arguments.
+  std::vector<XlaCompiler::Argument> args(2);
+  args[0].kind = XlaCompiler::Argument::kParameter;
+  args[0].type = DT_INT32;
+  args[0].shape = TensorShape({2, 3});
+  args[1].kind = XlaCompiler::Argument::kResource;
+  args[1].resource_kind = XlaResource::kVariable;
+  args[1].initialized = true;
+  args[1].type = DT_INT32;
+  args[1].shape = TensorShape({2, 3});
+
+  auto options = DefaultOptions();
+  options.shape_representation_fn =
+      [](const TensorShape& shape, DataType dt) -> xla::StatusOr<xla::Shape> {
+    xla::Shape xla_shape;
+    TF_RETURN_IF_ERROR(TensorShapeToXLAShape(dt, shape, &xla_shape));
+    *xla_shape.mutable_layout() = xla::LayoutUtil::MakeLayout({0, 1});
+    return xla_shape;
+  };
+  // Compiles the graph.
+  XlaCompiler compiler(options);
+
+  XlaCompiler::CompilationResult result;
+  TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "add",
+                                     std::move(graph), args, &result));
+  xla::Shape transposed =
+      xla::ShapeUtil::MakeShapeWithLayout(xla::S32, {2, 3}, {0, 1});
+  // Check that the return shapes are correctly tranposed.
+  EXPECT_EQ(result.xla_output_shape,
+            xla::ShapeUtil::MakeTupleShape({transposed, transposed}));
+}
+
+// The layout of resource variable shouldn't change after transpose
+TEST_F(XlaCompilerTest, TransposeVariables) {
+  Scope scope = Scope::NewRootScope().ExitOnError();
+  auto a = ops::_Arg(scope.WithOpName("A"), DT_INT32, 0);
+  auto var = ops::_Arg(scope.WithOpName("V"), DT_RESOURCE, 1);
+  // Adds an identity op around the resource to make sure identity ops propagate
+  // resources correctly.
+  auto identity = ops::Identity(scope.WithOpName("VIdentity"), var);
+  auto write = ops::AssignAddVariableOp(scope, identity, a);
+  auto read = ops::ReadVariableOp(
+      scope.WithControlDependencies(std::vector<Operation>{write}), var,
+      DT_INT32);
+  auto transposed_read = ops::Transpose(scope, read, {1, 0});
+  auto reshape = ops::Reshape(scope, transposed_read, {2, 3});
+  auto d = ops::_Retval(scope.WithOpName("D"), reshape, 0);
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  TF_ASSERT_OK(scope.ToGraph(graph.get()));
+
+  // Builds a description of the arguments.
+  std::vector<XlaCompiler::Argument> args(2);
+  args[0].kind = XlaCompiler::Argument::kParameter;
+  args[0].type = DT_INT32;
+  args[0].shape = TensorShape({2, 3});
+  args[1].kind = XlaCompiler::Argument::kResource;
+  args[1].resource_kind = XlaResource::kVariable;
+  args[1].initialized = true;
+  args[1].type = DT_INT32;
+  args[1].shape = TensorShape({2, 3});
+  // Compiles the graph.
+  XlaCompiler compiler(DefaultOptions());
+
+  XlaCompiler::CompilationResult result;
+  TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "transpose",
+                                     std::move(graph), args, &result));
+  xla::Shape transposed =
+      xla::ShapeUtil::MakeShapeWithLayout(xla::S32, {2, 3}, {1, 0});
+  // Check that the return shapes are correctly tranposed.
+  EXPECT_EQ(result.xla_output_shape,
+            xla::ShapeUtil::MakeTupleShape({transposed, transposed}));
+}
+
 // Tests that the compiler doesn't reorder the parameters.
 TEST_F(XlaCompilerTest, MixedOrderArguments) {
   for (bool swap_order : {false, true}) {
@@ -1362,7 +1453,7 @@ TEST_F(XlaCompilerTest, TokenInputAndOutput) {
     TF_ASSERT_OK(compiler.CompileGraph(options, "NoOp", std::move(graph_copy),
                                        args, &result));
     EXPECT_EQ(result.xla_input_shapes.size(), 1);
-    EXPECT_TRUE(xla::ShapeUtil::IsTuple(result.xla_output_shape));
+    EXPECT_TRUE(result.xla_output_shape.IsTuple());
     EXPECT_EQ(xla::ShapeUtil::TupleElementCount(result.xla_output_shape), 1);
   }
   {
@@ -1380,11 +1471,11 @@ TEST_F(XlaCompilerTest, TokenInputAndOutput) {
     TF_ASSERT_OK(compiler.CompileGraph(options, "NoOp", std::move(graph_copy),
                                        args, &result));
     EXPECT_EQ(result.xla_input_shapes.size(), 2);
-    EXPECT_TRUE(xla::ShapeUtil::IsToken(result.xla_input_shapes[1]));
-    EXPECT_TRUE(xla::ShapeUtil::IsTuple(result.xla_output_shape));
+    EXPECT_TRUE(result.xla_input_shapes[1].IsToken());
+    EXPECT_TRUE(result.xla_output_shape.IsTuple());
     EXPECT_EQ(xla::ShapeUtil::TupleElementCount(result.xla_output_shape), 2);
-    EXPECT_TRUE(xla::ShapeUtil::IsToken(
-        xla::ShapeUtil::GetTupleElementShape(result.xla_output_shape, 1)));
+    EXPECT_TRUE(xla::ShapeUtil::GetTupleElementShape(result.xla_output_shape, 1)
+                    .IsToken());
   }
 }
 
diff --git a/tensorflow/compiler/tf2xla/xla_context.cc b/tensorflow/compiler/tf2xla/xla_context.cc
index a69af70503376b6c0905deb8980abdc3254a6e47..3f787fd86c9f7366a7728dcf146a3797ba672bc3 100644
--- a/tensorflow/compiler/tf2xla/xla_context.cc
+++ b/tensorflow/compiler/tf2xla/xla_context.cc
@@ -61,7 +61,7 @@ void XlaContext::set_args(std::vector<XlaExpression> args) {
 XlaContext::XlaContext(XlaCompiler* compiler, xla::XlaBuilder* builder)
     : compiler_(compiler), builder_(builder) {}
 
-string XlaContext::DebugString() { return "XLA JIT context"; }
+string XlaContext::DebugString() const { return "XLA JIT context"; }
 
 void XlaContext::SetRetval(int index, const XlaExpression& expression) {
   if (retvals_.size() <= index) {
@@ -76,7 +76,7 @@ XlaResource* XlaContext::AddResource(std::unique_ptr<XlaResource> resource) {
 }
 
 const xla::XlaComputation* XlaContext::GetOrCreateMax(const DataType type) {
-  return LookupOrCreate(type, &max_func_, [this, type] {
+  return LookupOrCreate(type, &max_func_, [type] {
     const string type_string = DataTypeString(type);
     VLOG(1) << "Building Max() for " << type_string;
     xla::XlaBuilder b("max<" + type_string + ">");
@@ -92,7 +92,7 @@ const xla::XlaComputation* XlaContext::GetOrCreateMax(const DataType type) {
 }
 
 const xla::XlaComputation* XlaContext::GetOrCreateMin(const DataType type) {
-  return LookupOrCreate(type, &min_func_, [this, type] {
+  return LookupOrCreate(type, &min_func_, [type] {
     const string type_string = DataTypeString(type);
     VLOG(1) << "Building Min() for " << type_string;
     xla::XlaBuilder b("min<" + type_string + ">");
@@ -108,7 +108,7 @@ const xla::XlaComputation* XlaContext::GetOrCreateMin(const DataType type) {
 }
 
 const xla::XlaComputation* XlaContext::GetOrCreateAdd(const DataType type) {
-  return LookupOrCreate(type, &add_func_, [this, type] {
+  return LookupOrCreate(type, &add_func_, [type] {
     const string type_string = DataTypeString(type);
     VLOG(1) << "Building Add() for " << type_string;
     xla::XlaBuilder b("add<" + type_string + ">");
@@ -124,7 +124,7 @@ const xla::XlaComputation* XlaContext::GetOrCreateAdd(const DataType type) {
 }
 
 const xla::XlaComputation* XlaContext::GetOrCreateMul(const DataType type) {
-  return LookupOrCreate(type, &mul_func_, [this, type] {
+  return LookupOrCreate(type, &mul_func_, [type] {
     const string type_string = DataTypeString(type);
     VLOG(1) << "Building Mul() for " << type_string;
     xla::XlaBuilder b("mul<" + type_string + ">");
diff --git a/tensorflow/compiler/tf2xla/xla_context.h b/tensorflow/compiler/tf2xla/xla_context.h
index 0767d1faac14cedb8666f6cc37175eb7b55f6158..eb4ad3fe6a14b42a4df2c73c71cb6df1331fd796 100644
--- a/tensorflow/compiler/tf2xla/xla_context.h
+++ b/tensorflow/compiler/tf2xla/xla_context.h
@@ -47,7 +47,7 @@ class XlaContext : public ResourceBase {
   XlaContext(XlaCompiler* compiler, xla::XlaBuilder* builder);
 
   // Virtual method defined by ResourceBase.
-  string DebugString() override;
+  string DebugString() const override;
 
   XlaCompiler* compiler() const { return compiler_; }
 
diff --git a/tensorflow/compiler/tf2xla/xla_expression.cc b/tensorflow/compiler/tf2xla/xla_expression.cc
index ca0309166b7c73d1a5a818091e2a30fa112a4de4..3d228c92adcbe3d093a4fe70d157e57ab3e80c80 100644
--- a/tensorflow/compiler/tf2xla/xla_expression.cc
+++ b/tensorflow/compiler/tf2xla/xla_expression.cc
@@ -46,6 +46,14 @@ XlaExpression XlaExpression::XlaOp(xla::XlaOp value, DataType dtype) {
   return e;
 }
 
+XlaExpression XlaExpression::TensorList(xla::XlaOp tensor_list) {
+  XlaExpression e;
+  e.kind_ = Kind::kTensorList;
+  e.dtype_ = DT_VARIANT;
+  e.handle_ = tensor_list;
+  return e;
+}
+
 XlaExpression XlaExpression::Resource(XlaResource* resource) {
   XlaExpression e;
   e.kind_ = Kind::kResource;
@@ -64,6 +72,8 @@ string XlaExpression::HumanString() const {
       return "xla_op";
     case Kind::kResource:
       return "resource";
+    case Kind::kTensorList:
+      return "tensor_list";
   }
 }
 
@@ -76,6 +86,8 @@ xla::XlaOp XlaExpression::AsXlaOp(xla::XlaBuilder* builder) const {
             HostTensorToBorrowingLiteral(constant_value_, &literal));
         return xla::ConstantLiteral(builder, literal);
       }
+      case Kind::kTensorList:
+        TF_FALLTHROUGH_INTENDED;
       case Kind::kXlaOp:
         if (builder != handle_.builder()) {
           return errors::InvalidArgument(
@@ -96,7 +108,10 @@ xla::StatusOr<absl::optional<Tensor>> XlaExpression::ResolveConstant(
       return {constant_value()};
     case Kind::kXlaOp:
       break;
+    case Kind::kTensorList:
+      TF_FALLTHROUGH_INTENDED;
     case Kind::kResource:
+      TF_FALLTHROUGH_INTENDED;
     case Kind::kInvalid:
       return errors::InvalidArgument(
           "ResolveConstant called on XlaExpression: ", HumanString());
@@ -134,6 +149,8 @@ xla::StatusOr<TensorShape> XlaExpression::GetShape() const {
       TF_RETURN_IF_ERROR(XLAShapeToTensorShape(xla_shape, &shape));
       return shape;
     }
+    case Kind::kTensorList:
+      return TensorShape({});
     case Kind::kResource:
       return TensorShape({});
     case Kind::kInvalid:
diff --git a/tensorflow/compiler/tf2xla/xla_expression.h b/tensorflow/compiler/tf2xla/xla_expression.h
index bed6761d362a98d344003c1edea342e68c31ef07..ac0232d8924cf2c9e35ad3f0772a3a2adc18af87 100644
--- a/tensorflow/compiler/tf2xla/xla_expression.h
+++ b/tensorflow/compiler/tf2xla/xla_expression.h
@@ -32,11 +32,16 @@ namespace tensorflow {
 // * a constant tensor.
 // * an xla::XlaOp, representing a symbolic XLA value.
 // * a resource, e.g., a variable, represented as an XlaResource pointer.
+// * a tensor list, represented by a tuple of tensors and the list length.
 //
 // Constant tensors are mostly an optimization to avoid passing large constants
 // to XLA, but are also sometimes used to represent tensors that have no XLA
 // representation, for example, DT_STRING tensors. A canonical use case might be
 // an error message string.
+//
+// Tensor lists are very similar to xla::XlaOp, however they require some
+// specific logic around shape management since the tuples are not supported by
+// TensorFlow.
 class XlaExpression {
  public:
   enum class Kind {
@@ -44,6 +49,7 @@ class XlaExpression {
     kConstant,
     kXlaOp,
     kResource,
+    kTensorList,
   };
 
   XlaExpression();
@@ -62,6 +68,9 @@ class XlaExpression {
   // be derived from the XLA type.
   static XlaExpression XlaOp(xla::XlaOp value, DataType dtype);
 
+  // Builds a tensor list expression.
+  static XlaExpression TensorList(xla::XlaOp tensor_list);
+
   // Builds a resource expression.
   static XlaExpression Resource(XlaResource* resource);
 
@@ -100,7 +109,8 @@ class XlaExpression {
 
   DataType dtype_ = DT_INVALID;
 
-  // The XLA handle of the expression's computation, if kind_ == kXlaOp.
+  // The XLA handle of the expression's computation, if kind_ == kXlaOp or
+  // a tuple expression if kind_ == kTensorList.
   xla::XlaOp handle_;
 
   // The value of the constant, if kind_ == kConstant.
diff --git a/tensorflow/compiler/tf2xla/xla_helpers.cc b/tensorflow/compiler/tf2xla/xla_helpers.cc
index c2c0751211180c3715a19d6c78e34659fd18914e..7bb1ad27467a5b281626de4203169e575288f9ee 100644
--- a/tensorflow/compiler/tf2xla/xla_helpers.cc
+++ b/tensorflow/compiler/tf2xla/xla_helpers.cc
@@ -34,63 +34,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-namespace {
-
-xla::XlaOp ArgMinMax(xla::XlaOp input, xla::PrimitiveType output_type, int axis,
-                     bool is_min) {
-  xla::XlaBuilder* builder = input.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    TF_ASSIGN_OR_RETURN(xla::Shape input_shape, builder->GetShape(input));
-    xla::XlaOp init_value;
-    xla::XlaComputation reducer;
-    if (is_min) {
-      init_value = xla::MaxValue(builder, input_shape.element_type());
-      reducer =
-          xla::CreateScalarMinComputation(input_shape.element_type(), builder);
-    } else {
-      init_value = xla::MinValue(builder, input_shape.element_type());
-      reducer =
-          xla::CreateScalarMaxComputation(input_shape.element_type(), builder);
-    }
-
-    xla::XlaOp input_max = xla::Reduce(input, init_value, reducer,
-                                       /*dimensions_to_reduce=*/{axis});
-    std::vector<int64> broadcast_dims(xla::ShapeUtil::Rank(input_shape) - 1);
-    std::iota(broadcast_dims.begin(), broadcast_dims.begin() + axis, 0);
-    std::iota(broadcast_dims.begin() + axis, broadcast_dims.end(), axis + 1);
-    // Compute a mask that has 1s for elements equal to the maximum.
-    xla::XlaOp partial_mask = xla::ConvertElementType(
-        xla::Eq(input, input_max, broadcast_dims), output_type);
-
-    // In order to make identity elements for a bitwise And, we:
-    //   Left shift the 1 to the leftmost bit, yielding 0x10...0
-    //   Arithmetic right shift the 1 back to the rightmost bit, yielding
-    //   0xFF...F
-    int32 bits_in_type =
-        xla::ShapeUtil::ByteSizeOfPrimitiveType(output_type) * 8 - 1;
-    xla::XlaOp shift_amount =
-        xla::ConstantR0WithType(builder, output_type, bits_in_type);
-    xla::XlaOp full_mask = xla::ShiftRightArithmetic(
-        xla::ShiftLeft(partial_mask, shift_amount), shift_amount);
-
-    // And with the vector [0, 1, 2, ...] to convert each 0xFF...F into its
-    // index.
-
-    const int64 axis_size = xla::ShapeUtil::GetDimension(input_shape, axis);
-    xla::XlaOp iota = xla::Iota(builder, output_type, axis_size);
-    xla::XlaOp product =
-        xla::And(full_mask, iota, /*broadcast_dimensions=*/{axis});
-
-    // If there are multiple maximum elements, choose the one with the highest
-    // index.
-    return xla::Reduce(product, xla::MinValue(builder, output_type),
-                       xla::CreateScalarMaxComputation(output_type, builder),
-                       /*dimensions_to_reduce=*/{axis});
-  });
-}
-
-}  // namespace
-
 xla::XlaOp XlaHelpers::Zero(xla::XlaBuilder* b, DataType data_type) {
   xla::PrimitiveType type;
   TF_CHECK_OK(DataTypeToPrimitiveType(data_type, &type));
@@ -120,7 +63,7 @@ xla::XlaOp XlaHelpers::FloatLiteral(xla::XlaBuilder* b, DataType data_type,
 /* static */ Status XlaHelpers::ReshapeLiteral(
     const xla::Literal& input, absl::Span<const int64> dimensions,
     xla::Literal* output) {
-  if (xla::ShapeUtil::IsTuple(input.shape())) {
+  if (input.shape().IsTuple()) {
     return errors::InvalidArgument("ReshapeLiteral does not support tuples.");
   }
   xla::Shape shape =
@@ -138,71 +81,27 @@ xla::XlaOp XlaHelpers::FloatLiteral(xla::XlaBuilder* b, DataType data_type,
   return Status::OK();
 }
 
-template <typename T>
-static Tensor MakeLinspaceTensor(const TensorShape& shape, int64 depth) {
-  Tensor linspace(DataTypeToEnum<T>::v(), shape);
-  auto linspace_flat = linspace.flat<T>();
-  for (int64 i = 0; i < depth; ++i) {
-    linspace_flat(i) = i;
-  }
-  return linspace;
-}
-
-xla::XlaOp XlaHelpers::ArgMax(xla::XlaOp input, xla::PrimitiveType output_type,
-                              int axis) {
-  return ArgMinMax(input, output_type, axis, /*is_min=*/false);
-}
-
-xla::XlaOp XlaHelpers::ArgMin(xla::XlaOp input, xla::PrimitiveType output_type,
-                              int axis) {
-  return ArgMinMax(input, output_type, axis, /*is_min=*/true);
-}
-
 Status XlaHelpers::OneHot(xla::XlaBuilder* builder, int64 depth, int axis,
                           DataType index_type, const TensorShape& indices_shape,
                           const xla::XlaOp& indices, const xla::XlaOp& on_value,
                           const xla::XlaOp& off_value, xla::XlaOp* one_hot) {
-  const int indices_dims = indices_shape.dims();
-  const int output_dims = indices_dims + 1;
-
-  TensorShape output_shape = indices_shape;
-  output_shape.InsertDim(axis, depth);
-
-  // Build a Tensor populated with values 0, 1, 2, ... depth.
-  std::vector<int64> linspace_dims(output_dims, 1);
-  linspace_dims[axis] = depth;
-  TensorShape linspace_shape(linspace_dims);
-  Tensor linspace;
-  switch (index_type) {
-    case DT_UINT8:
-      linspace = MakeLinspaceTensor<uint8>(linspace_shape, depth);
-      break;
-    case DT_INT32:
-      linspace = MakeLinspaceTensor<int32>(linspace_shape, depth);
-      break;
-    case DT_INT64:
-      linspace = MakeLinspaceTensor<int64>(linspace_shape, depth);
-      break;
-    default:
-      return errors::InvalidArgument("Invalid argument type ",
-                                     DataTypeString(index_type));
-  }
-
-  xla::BorrowingLiteral linspace_literal;
-  TF_RETURN_IF_ERROR(HostTensorToBorrowingLiteral(linspace, &linspace_literal));
-
   // Broadcast the linspace constant across the indices along the new axis,
   // and test equality at each position.
   std::vector<int64> broadcast_dims(indices_shape.dims());
   std::iota(broadcast_dims.begin(), broadcast_dims.begin() + axis, 0);
   std::iota(broadcast_dims.begin() + axis, broadcast_dims.end(), axis + 1);
-  xla::XlaOp one_hot_bool = xla::Eq(
-      indices, xla::ConstantLiteral(builder, linspace_literal), broadcast_dims);
+
+  TensorShape output_shape = indices_shape;
+  output_shape.InsertDim(axis, depth);
+  xla::Shape iota_shape;
+  TF_RETURN_IF_ERROR(
+      TensorShapeToXLAShape(index_type, output_shape, &iota_shape));
 
   // Selects the user-provided off_value and on_value values.
-  *one_hot = xla::Select(one_hot_bool,
-                         xla::Broadcast(on_value, output_shape.dim_sizes()),
-                         xla::Broadcast(off_value, output_shape.dim_sizes()));
+  *one_hot = xla::Select(
+      xla::Eq(indices, xla::Iota(builder, iota_shape, axis), broadcast_dims),
+      xla::Broadcast(on_value, output_shape.dim_sizes()),
+      xla::Broadcast(off_value, output_shape.dim_sizes()));
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/tf2xla/xla_helpers.h b/tensorflow/compiler/tf2xla/xla_helpers.h
index 4858dfee55a393d04cd2af83916eeb40820ee368..490923526bd3acd4b167ccb3faff1d6c9e631131 100644
--- a/tensorflow/compiler/tf2xla/xla_helpers.h
+++ b/tensorflow/compiler/tf2xla/xla_helpers.h
@@ -53,16 +53,6 @@ class XlaHelpers {
                                absl::Span<const int64> shape,
                                xla::Literal* output);
 
-  // Returns the argmax of `input` along `axis`. `output_type` is the type to
-  // use for the output.
-  static xla::XlaOp ArgMax(xla::XlaOp input, xla::PrimitiveType output_type,
-                           int axis);
-
-  // Returns the argmin of `input` along `axis`. `output_type` is the type to
-  // use for the output.
-  static xla::XlaOp ArgMin(xla::XlaOp input, xla::PrimitiveType output_type,
-                           int axis);
-
   // Converts `indices` into a one-hot representation. `depth` is the size
   // of the new axis to add. `axis` is the position at which to add the new
   // axis. `indices_shape` is the shape of `indices`. `on_value` and
diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
index fabbcd04fed96ad814d04c2df9394f43bfe0cf99..884dc45cb11b18ae557c3da3f4192b3805cb7980 100644
--- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
+++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
@@ -135,24 +135,34 @@ XlaJitCompiledCpuFunction::Compile(
   jit->arg_index_table_ = std::move(arg_index_table);
   jit->program_shape_ =
       absl::make_unique<xla::ProgramShapeProto>(program_shape->ToProto());
-  jit->static_data_.set_raw_function(raw_function);
-  jit->static_data_.set_buffer_infos(jit->buffer_infos_.data());
-  jit->static_data_.set_num_buffers(jit->buffer_infos_.size());
-  jit->static_data_.set_arg_index_table(jit->arg_index_table_.data());
-  jit->static_data_.set_num_args(jit->arg_index_table_.size());
-  jit->static_data_.set_result_index(result_index);
+  XlaCompiledCpuFunction::set_static_data_raw_function(&jit->static_data_,
+                                                       raw_function);
+  XlaCompiledCpuFunction::set_static_data_buffer_infos(
+      &jit->static_data_, jit->buffer_infos_.data());
+  XlaCompiledCpuFunction::set_static_data_num_buffers(
+      &jit->static_data_, jit->buffer_infos_.size());
+  XlaCompiledCpuFunction::set_static_data_arg_index_table(
+      &jit->static_data_, jit->arg_index_table_.data());
+  XlaCompiledCpuFunction::set_static_data_num_args(
+      &jit->static_data_, jit->arg_index_table_.size());
+  XlaCompiledCpuFunction::set_static_data_result_index(&jit->static_data_,
+                                                       result_index);
   // Optional metadata is collected and set below.
   CollectNames(config.feed(), &jit->nonempty_arg_names_, &jit->arg_names_);
   CollectNames(config.fetch(), &jit->nonempty_result_names_,
                &jit->result_names_);
-  jit->static_data_.set_arg_names(jit->arg_names_.data());
-  jit->static_data_.set_result_names(jit->result_names_.data());
-  jit->static_data_.set_program_shape(jit->program_shape_.get());
+  XlaCompiledCpuFunction::set_static_data_arg_names(&jit->static_data_,
+                                                    jit->arg_names_.data());
+  XlaCompiledCpuFunction::set_static_data_result_names(
+      &jit->static_data_, jit->result_names_.data());
+  XlaCompiledCpuFunction::set_static_data_program_shape(
+      &jit->static_data_, jit->program_shape_.get());
 
   if (cpu_executable->hlo_profiling_enabled()) {
-    jit->static_data_.set_hlo_profile_printer_data(
-        &cpu_executable->hlo_profile_printer_data());
-    jit->static_data_.set_profile_counters_size(
+    XlaCompiledCpuFunction::set_static_data_hlo_profile_printer_data(
+        &jit->static_data_, &cpu_executable->hlo_profile_printer_data());
+    XlaCompiledCpuFunction::set_static_data_profile_counters_size(
+        &jit->static_data_,
         cpu_executable->hlo_profile_printer_data().profile_counters_size());
   }
 
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
index 58808c76de6330a6b28e21dbdead03dea25847f6..ee11f3a3de658c7e5108605122b84fbc3e1cd963 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
@@ -93,7 +93,7 @@ TensorShape XlaOpKernelContext::InputShape(absl::string_view name) {
 }
 
 DataType XlaOpKernelContext::input_type(int index) const {
-  return context_->input(index).dtype();
+  return context_->input_dtype(index);
 }
 
 DataType XlaOpKernelContext::InputType(absl::string_view name) {
@@ -178,7 +178,7 @@ Status XlaOpKernelContext::ConstantInputReshaped(
 // Converts an int32 or int64 scalar literal to an int64.
 static Status LiteralToInt64Scalar(const xla::LiteralSlice& literal,
                                    int64* out) {
-  if (xla::ShapeUtil::Rank(literal.shape()) != 0) {
+  if (literal.shape().rank() != 0) {
     return errors::InvalidArgument("value is not a scalar");
   }
   if (literal.shape().element_type() == xla::S32) {
@@ -194,7 +194,7 @@ static Status LiteralToInt64Scalar(const xla::LiteralSlice& literal,
 // Converts an float32 or float64 scalar literal to a float64.
 static Status LiteralToFloat64Scalar(const xla::LiteralSlice& literal,
                                      double* out) {
-  if (xla::ShapeUtil::Rank(literal.shape()) != 0) {
+  if (literal.shape().rank() != 0) {
     return errors::InvalidArgument("value is not a scalar");
   }
   if (literal.shape().element_type() == xla::F32) {
@@ -228,8 +228,9 @@ Status XlaOpKernelContext::ConstantInputAsFloatScalar(int index, double* out) {
 // Converts an int32 or int64 1D literal to an int64 vector.
 static Status LiteralToInt64Vector(const xla::LiteralSlice& literal,
                                    std::vector<int64>* out) {
-  if (xla::ShapeUtil::Rank(literal.shape()) != 1) {
-    return errors::InvalidArgument("value is not 1D");
+  if (literal.shape().rank() != 1) {
+    return errors::InvalidArgument("value is not 1D, rank: ",
+                                   literal.shape().rank());
   }
   int64 size = xla::ShapeUtil::ElementsIn(literal.shape());
   if (literal.shape().element_type() == xla::S32) {
@@ -318,6 +319,27 @@ Status XlaOpKernelContext::ConstantInputAsShape(int index, TensorShape* shape) {
   return Status::OK();
 }
 
+Status XlaOpKernelContext::ConstantInputAsPartialShape(
+    int index, PartialTensorShape* shape) {
+  xla::Literal literal;
+  TF_RETURN_IF_ERROR(ConstantInput(index, &literal));
+  // If `literal` is a scalar it's value must be -1.
+  if (literal.shape().rank() == 0) {
+    int64 shape_val;
+    TF_RETURN_IF_ERROR(LiteralToInt64Scalar(literal, &shape_val));
+    if (shape_val != -1) {
+      return errors::InvalidArgument(
+          "Cannot convert value to PartialTensorShape: ", shape_val);
+    }
+    *shape = PartialTensorShape();  // Shape with unknown rank.
+    return Status::OK();
+  }
+  std::vector<int64> dims;
+  TF_RETURN_IF_ERROR(LiteralToInt64Vector(literal, &dims));
+  *shape = PartialTensorShape(dims);
+  return Status::OK();
+}
+
 Status XlaOpKernelContext::InputList(absl::string_view name,
                                      std::vector<xla::XlaOp>* handles,
                                      std::vector<TensorShape>* shapes) {
@@ -353,8 +375,8 @@ Status ReadVariableInputTensor(const Tensor& tensor, DataType type,
   TF_RET_CHECK(variable != nullptr);
   TF_RET_CHECK(variable->kind() == XlaResource::kVariable);
   if (!variable->initialized()) {
-    return errors::InvalidArgument("Read of uninitialized variable ",
-                                   variable->name());
+    return errors::FailedPrecondition("Read of uninitialized variable ",
+                                      variable->name());
   }
   if (variable->type() != type) {
     return errors::InvalidArgument(
@@ -446,6 +468,16 @@ void XlaOpKernelContext::SetOutputExpression(int index,
   }
 }
 
+xla::PrimitiveType XlaOpKernelContext::output_xla_type(int index) {
+  xla::PrimitiveType type;
+  Status status = DataTypeToPrimitiveType(expected_output_dtype(index), &type);
+  if (!status.ok()) {
+    SetStatus(status);
+    return xla::PRIMITIVE_TYPE_INVALID;
+  }
+  return type;
+}
+
 void XlaOpKernelContext::SetOutput(int index, const xla::XlaOp& handle) {
   SetOutputExpression(
       index,
@@ -456,6 +488,11 @@ void XlaOpKernelContext::SetConstantOutput(int index, const Tensor& constant) {
   SetOutputExpression(index, XlaExpression::Constant(constant));
 }
 
+void XlaOpKernelContext::SetTensorListOutput(int index,
+                                             const xla::XlaOp& handle) {
+  SetOutputExpression(index, XlaExpression::TensorList(handle));
+}
+
 void XlaOpKernelContext::SetResourceOutput(int index, XlaResource* resource) {
   SetOutputExpression(index, XlaExpression::Resource(resource));
 }
@@ -497,6 +534,7 @@ Status AssignVariableTensor(const Tensor& tensor, DataType type,
     handle = xla::Reshape(handle,
                           xla::AsInt64Slice(representation_shape.dimensions()));
   }
+  variable->SetRepresentationShape(representation_shape);
   return variable->SetValue(handle);
 }
 
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.h b/tensorflow/compiler/tf2xla/xla_op_kernel.h
index 1858844bc05a6e12abbf07af83cad816590ddd03..cc2d5e8de3eb020ba41dfed7d730b48cd0534b4c 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.h
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.h
@@ -138,6 +138,10 @@ class XlaOpKernelContext {
   // Converts a constant 1D int32 or int64 tensor into a TensorShape.
   Status ConstantInputAsShape(int index, TensorShape* shape);
 
+  // Converts a constant 1D int32 or int64 tensor, or a scalar with value -1
+  // into a PartialTensorShape.
+  Status ConstantInputAsPartialShape(int index, PartialTensorShape* shape);
+
   // Returns the named list-valued immutable input in "list", as
   // defined in the OpDef.  If the named output is not list-valued,
   // returns a one-element list.
@@ -155,6 +159,11 @@ class XlaOpKernelContext {
     return context_->expected_output_dtype(index);
   }
 
+  // Returns the type of output `index` as an xla::PrimitiveType. If the type
+  // is not representable as an XLA type, sets an error status and returns
+  // xla::PRIMITIVE_TYPE_INVALID.
+  xla::PrimitiveType output_xla_type(int index);
+
   // Sets output `index` to the XlaOp `handle`.
   // All outputs should be set using SetOutput and SetConstantOutput, not
   // via the underlying OpKernelContext.
@@ -168,6 +177,9 @@ class XlaOpKernelContext {
   // Returns an XlaExpression describing the value of 'index'.
   void SetOutputExpression(int index, const XlaExpression& expression);
 
+  // Sets output `index` to the Tensor List `handle`.
+  void SetTensorListOutput(int index, const xla::XlaOp& handle);
+
   // Status handling.
   void SetStatus(const Status& status) { context_->SetStatus(status); }
   Status status() { return context_->status(); }
diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.cc b/tensorflow/compiler/tf2xla/xla_op_registry.cc
index 14237df69081016817fbd1a5332f22996e7f264d..26314034a18b2a77a3529f0c1af242e29ec69902 100644
--- a/tensorflow/compiler/tf2xla/xla_op_registry.cc
+++ b/tensorflow/compiler/tf2xla/xla_op_registry.cc
@@ -73,6 +73,11 @@ XlaOpRegistry::~XlaOpRegistry() = default;
                  << " have incompatible allow_resource_types settings.";
     return false;
   }
+  if (x.allow_variant_types != y.allow_variant_types) {
+    LOG(WARNING) << "Registrations of " << x.name
+                 << " have incompatible allow_variant_types settings.";
+    return false;
+  }
   if (!x.has_device_whitelist && !y.has_device_whitelist) {
     LOG(WARNING) << "Duplicate registrations of " << x.name
                  << "with no device whitelists.";
@@ -289,6 +294,9 @@ void XlaOpRegistry::RegisterCompilationKernels() {
           if (op_registration->allow_resource_types) {
             allowed_values->add_type(DT_RESOURCE);
           }
+          if (op_registration->allow_variant_types) {
+            allowed_values->add_type(DT_VARIANT);
+          }
           // Don't build KernelDefs that have unsatisfiable type constraints.
           if (allowed_values->type().empty()) {
             unsatisfiable_type_constraint = true;
@@ -485,6 +493,11 @@ XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::AllowResourceTypes() {
   return *this;
 }
 
+XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::AllowVariantTypes() {
+  registration_->allow_variant_types = true;
+  return *this;
+}
+
 XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::TypeConstraint(
     absl::string_view attr_name, DataType allowed) {
   std::set<DataType>& types =
diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.h b/tensorflow/compiler/tf2xla/xla_op_registry.h
index 0bdd4a1085445420a5147756daac4a54f4725f11..c5e078a02d1ca6fdd8405ae6556a5205e387421e 100644
--- a/tensorflow/compiler/tf2xla/xla_op_registry.h
+++ b/tensorflow/compiler/tf2xla/xla_op_registry.h
@@ -47,13 +47,14 @@ extern const char* const DEVICE_XLA_GPU;
 
 constexpr std::array<DataType, 4> kFloatTypes = {
     {DT_HALF, DT_FLOAT, DT_DOUBLE, DT_BFLOAT16}};
-constexpr std::array<DataType, 11> kNumericTypes = {
+constexpr std::array<DataType, 12> kNumericTypes = {
     {DT_UINT8, DT_UINT32, DT_UINT64, DT_INT8, DT_INT32, DT_INT64, DT_HALF,
-     DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_BFLOAT16}};
+     DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128, DT_BFLOAT16}};
 
-constexpr std::array<DataType, 14> kCpuAllTypes = {
+constexpr std::array<DataType, 15> kCpuAllTypes = {
     {DT_UINT8, DT_QUINT8, DT_UINT32, DT_UINT64, DT_INT8, DT_QINT8, DT_INT32,
-     DT_QINT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_BOOL}};
+     DT_QINT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64,
+     DT_COMPLEX128, DT_BOOL}};
 
 constexpr std::array<DataType, 15> kGpuAllTypes = {
     {DT_UINT8, DT_QUINT8, DT_UINT32, DT_UINT64, DT_INT8, DT_QINT8, DT_INT32,
@@ -211,6 +212,10 @@ class XlaOpRegistry {
     // allow DT_RESOURCE.
     bool allow_resource_types = false;
 
+    // Should we allow variant types for type attributes? Used by While to
+    // allow TensorList which is of type DT_VARIANT.
+    bool allow_variant_types = false;
+
     // Mapping from attribute name to a list of supported types.
     std::unordered_map<string, std::set<DataType>> type_constraints;
 
@@ -232,9 +237,9 @@ class XlaOpRegistry {
 
   // Returns true if registrations x and y can both be added to the registry.
   // This is always the case if they refer to different ops. If they refer to
-  // the same op name, they must: have the same values for compilation_only and
-  // allow_resource_types; use a device_whitelist; and their
-  // whitelists must not intersect.
+  // the same op name, they must: have the same values for compilation_only,
+  // allow_resource_types and allow_variant_types; use a device_whitelist; and
+  // their whitelists must not intersect.
   static bool IsCompatible(const OpRegistration& x, const OpRegistration& y);
 
   static Status CompileTimeConstantInputs(const NodeDef& node_def,
@@ -292,6 +297,9 @@ class XlaOpRegistrationBuilder {
   // Allow DT_RESOURCE types for type parameters.
   XlaOpRegistrationBuilder& AllowResourceTypes();
 
+  // Allow DT_VARIANT types for type parameters.
+  XlaOpRegistrationBuilder& AllowVariantTypes();
+
   // Mark 'input_name' as an argument whose value must be known at compile-time.
   XlaOpRegistrationBuilder& CompileTimeConstantInput(
       absl::string_view input_name);
diff --git a/tensorflow/compiler/tf2xla/xla_resource.h b/tensorflow/compiler/tf2xla/xla_resource.h
index 736588bb8b89ba756cdce77eeebff8d1fcf4774c..ab3a5bdd9bc580c16d65d35c3be3ba8204511f83 100644
--- a/tensorflow/compiler/tf2xla/xla_resource.h
+++ b/tensorflow/compiler/tf2xla/xla_resource.h
@@ -86,6 +86,12 @@ class XlaResource {
   // variables have new values that need to be written back.
   const xla::XlaOp& initial_value() const { return initial_value_; }
 
+  // An xla shape that indicates how this resource variable is represented on
+  // device.
+  const absl::optional<xla::Shape>& representation_shape() const {
+    return representation_shape_;
+  }
+
   // A variable is initialized if it has a value.
   bool initialized() const { return value_.valid(); }
 
@@ -100,6 +106,11 @@ class XlaResource {
   // Sets the current value of the resource to an all-zero value.
   Status SetZeroValue(xla::XlaBuilder* builder);
 
+  // Sets the representational shape of the resource on device.
+  void SetRepresentationShape(const xla::Shape& shape) {
+    representation_shape_ = absl::make_optional(shape);
+  }
+
   // Looks up the gradient for `source`, or creates it if it does not already
   // exist. The call target must be an initialized TensorArray resource. A
   // TensorArray can have multiple named gradients; see the operator
@@ -160,6 +171,10 @@ class XlaResource {
   xla::XlaOp value_;
   xla::XlaOp initial_value_;
 
+  // An xla shape that indicates how this resource variable is represented on
+  // device.
+  absl::optional<xla::Shape> representation_shape_;
+
   int64 max_array_size_ = -1;
   bool tensor_array_multiple_writes_aggregate_ = false;
 
diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD
index 4360e0857964b0ac63fc887e269b04a4b00d854a..ee6f7d5956ede4af99498ca0df5de47150cc5e4d 100644
--- a/tensorflow/compiler/xla/BUILD
+++ b/tensorflow/compiler/xla/BUILD
@@ -109,7 +109,7 @@ cc_library(
     name = "status_macros",
     srcs = ["status_macros.cc"],
     hdrs = ["status_macros.h"],
-    visibility = [":friends"],
+    visibility = ["//visibility:public"],
     deps = [
         ":statusor",
         ":types",
@@ -150,9 +150,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":status",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/stream_executor",
+        "//tensorflow/stream_executor/lib",
     ],
 )
 
@@ -194,7 +192,7 @@ cc_library(
         ":types",
         ":util",
         "//tensorflow/core:lib",
-        "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/time",
     ],
 )
 
@@ -224,6 +222,7 @@ cc_library(
     name = "shape_util",
     srcs = [
         "index_util.cc",
+        "layout.cc",
         "layout_util.cc",
         "primitive_util.cc",
         "shape.cc",
@@ -231,6 +230,7 @@ cc_library(
     ],
     hdrs = [
         "index_util.h",
+        "layout.h",
         "layout_util.h",
         "primitive_util.h",
         "shape.h",
@@ -290,6 +290,22 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "primitive_util_test",
+    srcs = ["primitive_util_test.cc"],
+    deps = [
+        ":shape_util",
+        ":status_macros",
+        ":test",
+        ":test_helpers",
+        ":types",
+        ":util",
+        ":xla_data_proto",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 tf_cc_test(
     name = "layout_util_test",
     srcs = ["layout_util_test.cc"],
@@ -301,6 +317,22 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "layout_test",
+    srcs = ["layout_test.cc"],
+    deps = [
+        ":shape_util",
+        ":status_macros",
+        ":test",
+        ":test_helpers",
+        ":types",
+        ":util",
+        ":xla_data_proto",
+        "//tensorflow/core:test_main",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
 tf_cc_test(
     name = "index_util_test",
     srcs = ["index_util_test.cc"],
@@ -575,6 +607,7 @@ cc_library(
         ":types",
         ":util",
         ":xla_data_proto",
+        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "@com_google_absl//absl/memory",
@@ -682,6 +715,7 @@ cc_library(
         ":types",
         ":xla_data_proto",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
     ],
@@ -705,8 +739,8 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_evaluator",
         "//tensorflow/compiler/xla/service:shape_inference",
-        "//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_matmul",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/types:span",
     ],
@@ -790,13 +824,13 @@ cc_library(
         "debug_options_parsers.h",
     ],
     hdrs = ["debug_options_flags.h"],
+    visibility = [":friends"],
     deps =
         [
             ":parse_flags_from_env",
             "//tensorflow/compiler/xla:xla_proto",
             "//tensorflow/compiler/xla/service:hlo",
             "//tensorflow/core:framework_internal",
-            "//tensorflow/core:lib",
             "@com_google_absl//absl/strings",
         ],
 )
diff --git a/tensorflow/compiler/xla/array.h b/tensorflow/compiler/xla/array.h
index 58cc1575858201b4508d7340cb47e59c4f4c5783..529e7f77cec43f3158fcb59a53efa9a085d7422a 100644
--- a/tensorflow/compiler/xla/array.h
+++ b/tensorflow/compiler/xla/array.h
@@ -272,6 +272,15 @@ class Array {
     std::iota(&values_[0], &values_[0] + num_elements(), value);
   }
 
+  // Fills the array with a repeating sequence:
+  //   [value, value + 1, ..., value + length - 1, value, ... ]
+  void FillRepeatedIota(const T& value, int64 length) {
+    for (int64 i = 0; i < num_elements(); i += length) {
+      std::iota(&values_[i], &values_[std::min(i + length, num_elements())],
+                value);
+    }
+  }
+
   // Fills the array with the sequence i*multiplier for i=0,1,...
   void FillWithMultiples(const T& multiplier) {
     for (int64 i = 0; i < num_elements(); ++i) {
@@ -280,11 +289,11 @@ class Array {
   }
 
   // Fills the array with random normal variables with the specified mean.
-  void FillRandom(const T& value, const double mean = 0.0,
+  void FillRandom(const T& stddev, const double mean = 0.0,
                   const int seed = 12345) {
     std::mt19937 g(seed);
     std::normal_distribution<double> distribution(mean,
-                                                  static_cast<double>(value));
+                                                  static_cast<double>(stddev));
     for (int64 i = 0; i < num_elements(); ++i) {
       values_[i] = static_cast<T>(distribution(g));
     }
diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD
index fe99564d3c671cd7890e1fa26fcd2e3384972983..f5d56e8a9e1f3a05e1039f7cc90194407200f1ab 100644
--- a/tensorflow/compiler/xla/client/BUILD
+++ b/tensorflow/compiler/xla/client/BUILD
@@ -3,7 +3,7 @@
 
 licenses(["notice"])  # Apache 2.0
 
-package(default_visibility = [":friends"])
+package(default_visibility = ["//visibility:public"])
 
 package_group(
     name = "friends",
@@ -170,6 +170,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/types:optional",
     ],
 )
 
@@ -245,6 +246,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_matchers",
diff --git a/tensorflow/compiler/xla/client/client.cc b/tensorflow/compiler/xla/client/client.cc
index 74b76f929949d3300a5d0ff45d5fa4cd9f162642..4f020bcec2756a328755d86ab04154d54f532465 100644
--- a/tensorflow/compiler/xla/client/client.cc
+++ b/tensorflow/compiler/xla/client/client.cc
@@ -186,7 +186,7 @@ StatusOr<Literal> Client::ComputeConstant(const XlaComputation& computation,
   ComputeConstantGraphRequest request;
   *request.mutable_computation() = computation.proto();
   if (output_layout != nullptr) {
-    *request.mutable_output_layout() = *output_layout;
+    *request.mutable_output_layout() = output_layout->ToProto();
   }
 
   ComputeConstantResponse response;
@@ -278,53 +278,51 @@ StatusOr<std::unique_ptr<GlobalData>> Client::Execute(
     const XlaComputation& computation, absl::Span<GlobalData* const> arguments,
     const ExecutionOptions* execution_options,
     ExecutionProfile* execution_profile) {
-  if (execution_options != nullptr &&
-      execution_options->device_handles_size() > 1) {
-    std::vector<XlaComputationInstance> computation_instances = {
-        XlaComputationInstance{
-            computation,
-            std::vector<GlobalData*>(arguments.begin(), arguments.end()),
-            *execution_options, execution_profile}};
-    TF_ASSIGN_OR_RETURN(auto results, ExecuteParallel(computation_instances));
-    // The result selection is a bit hacky, but better than assuming it is
-    // device 0.
-    //
-    // TODO(b/118493728): Allow Execute to return one result per computation.
-    for (int64 i = 0; i < results.size(); i++) {
-      TF_ASSIGN_OR_RETURN(const Shape& shape, GetShape(*results[i]));
-      if (!ShapeUtil::IsEmptyTuple(shape)) {
-        VLOG(3) << "Fetching result from device " << i << ": "
-                << ShapeUtil::HumanString(shape);
-        return std::move(results[i]);
-      }
+  // Create an ExecutionOptions if necessary, or set its DeviceHandles.
+  absl::optional<ExecutionOptions> options_storage;
+  if (!execution_options || execution_options->device_handles().empty()) {
+    if (execution_options) {
+      options_storage.emplace(*execution_options);
+    } else {
+      options_storage.emplace(CreateDefaultExecutionOptions());
     }
-    TF_RET_CHECK(!results.empty());
-    VLOG(1) << "Defaulting to device 0 result";
-    return std::move(results[0]);
-  }
-
-  // The argument shapes affect how the computation is compiled.
-  std::vector<Shape> arg_shapes(arguments.size());
-  for (int i = 0; i < arguments.size(); i++) {
-    TF_ASSIGN_OR_RETURN(arg_shapes[i], GetShape(*arguments[i]));
-  }
-
-  TF_ASSIGN_OR_RETURN(auto handle,
-                      Compile(computation, arg_shapes, execution_options));
-
-  TF_ASSIGN_OR_RETURN(auto result,
-                      Execute(handle, arguments, execution_profile));
-
-  if (execution_profile != nullptr) {
-    if (VLOG_IS_ON(1)) {
-      TF_ASSIGN_OR_RETURN(
-          auto execution_stats,
-          ExecutionStatsAsString(computation, *execution_profile));
-      VLOG(1) << execution_stats;
+    execution_options = &*options_storage;
+
+    TF_ASSIGN_OR_RETURN(auto device_handles,
+                        GetDeviceHandles(/*device_count=*/1));
+    TF_RET_CHECK(!device_handles.empty());
+    *options_storage->add_device_handles() = std::move(device_handles[0]);
+  }
+
+  std::vector<XlaComputationInstance> computation_instances = {
+      XlaComputationInstance{
+          computation,
+          std::vector<GlobalData*>(arguments.begin(), arguments.end()),
+          *execution_options, execution_profile}};
+
+  // Instead of invoking Compile() and Execute(), invoke
+  // Service::ExecuteParallel() to execute our one computation.  Compile()
+  // caches the executable forever, which isn't what we want.
+  VLOG(1) << "Making ExecuteParallel request: "
+          << execution_options->DebugString();
+  TF_ASSIGN_OR_RETURN(auto results, ExecuteParallel(computation_instances));
+  VLOG(1) << "ExecuteParallel request done.";
+
+  // The result selection is a bit hacky, but better than assuming it is
+  // device 0.
+  //
+  // TODO(b/118493728): Allow Execute to return one result per computation.
+  for (int64 i = 0; i < results.size(); i++) {
+    TF_ASSIGN_OR_RETURN(const Shape& shape, GetShape(*results[i]));
+    if (!ShapeUtil::IsEmptyTuple(shape)) {
+      VLOG(3) << "Fetching result from device " << i << ": "
+              << ShapeUtil::HumanString(shape);
+      return std::move(results[i]);
     }
   }
-
-  return std::move(result);
+  TF_RET_CHECK(!results.empty());
+  VLOG(1) << "Defaulting to device 0 result";
+  return std::move(results[0]);
 }
 
 StatusOr<std::vector<std::unique_ptr<GlobalData>>> Client::ExecuteParallel(
diff --git a/tensorflow/compiler/xla/client/client.h b/tensorflow/compiler/xla/client/client.h
index d0ac4703c632e0e01d3c8911594b46fedf28930d..eff8713ac340e82ee7633f1f078334ba73b67b2f 100644
--- a/tensorflow/compiler/xla/client/client.h
+++ b/tensorflow/compiler/xla/client/client.h
@@ -52,6 +52,12 @@ class Client {
   //   need to live beyond this call.)
   // * If execution_options.device_handles should be empty. If you need
   //   non-empty device handles, call 'Execute' instead.
+  //
+  // TODO(b/122731460): This call caches the resulting Executable in the Service
+  // *forever*.  If you're only going to run the computation once, you may want
+  // to call the Execute(const XlaComputation&) overload.  If you're going to
+  // run the computation more than once but you want control over when the
+  // Executable is unloaded, use the LocalClient API.
   StatusOr<ExecutionHandle> Compile(
       const XlaComputation& computation,
       absl::Span<const Shape> argument_shapes,
@@ -76,6 +82,10 @@ class Client {
   //   device is chosen by the service.
   // * If execution_profile is not nullptr then the pointed-to ExecutionProfile
   //   will be filled with profile data from the execution.
+  //
+  // TODO(b/122731460): The given computation is compiled and then thrown away
+  // immediately after it's run.  If you want control over how long the
+  // resulting Executable lives, use the LocalClient API.
   StatusOr<std::unique_ptr<GlobalData>> Execute(
       const XlaComputation& computation,
       absl::Span<GlobalData* const> arguments,
diff --git a/tensorflow/compiler/xla/client/client_library.cc b/tensorflow/compiler/xla/client/client_library.cc
index 27b7fa7b29206affa9f9c2e4becd9e4ea66484ab..42aae026229a49fd801cc90562fa51f604336148 100644
--- a/tensorflow/compiler/xla/client/client_library.cc
+++ b/tensorflow/compiler/xla/client/client_library.cc
@@ -24,12 +24,14 @@ limitations under the License.
 
 namespace xla {
 
-LocalClientOptions::LocalClientOptions(se::Platform* platform,
-                                       int number_of_replicas,
-                                       int intra_op_parallelism_threads)
+LocalClientOptions::LocalClientOptions(
+    se::Platform* platform, int number_of_replicas,
+    int intra_op_parallelism_threads,
+    const absl::optional<std::set<int>>& allowed_devices)
     : platform_(platform),
       number_of_replicas_(number_of_replicas),
-      intra_op_parallelism_threads_(intra_op_parallelism_threads) {}
+      intra_op_parallelism_threads_(intra_op_parallelism_threads),
+      allowed_devices_(allowed_devices) {}
 
 LocalClientOptions& LocalClientOptions::set_platform(se::Platform* platform) {
   platform_ = platform;
@@ -58,6 +60,17 @@ int LocalClientOptions::intra_op_parallelism_threads() const {
   return intra_op_parallelism_threads_;
 }
 
+LocalClientOptions& LocalClientOptions::set_allowed_devices(
+    const absl::optional<std::set<int>>& allowed_devices) {
+  allowed_devices_ = allowed_devices;
+  return *this;
+}
+
+const absl::optional<std::set<int>>& LocalClientOptions::allowed_devices()
+    const {
+  return allowed_devices_;
+}
+
 /* static */ ClientLibrary& ClientLibrary::Singleton() {
   static ClientLibrary* c = new ClientLibrary;
   return *c;
@@ -67,9 +80,10 @@ ClientLibrary::ClientLibrary() = default;
 ClientLibrary::~ClientLibrary() = default;
 
 /* static */ StatusOr<LocalClient*> ClientLibrary::GetOrCreateLocalClient(
-    se::Platform* platform) {
+    se::Platform* platform, const absl::optional<std::set<int>>& device_set) {
   LocalClientOptions default_options;
   default_options.set_platform(platform);
+  default_options.set_allowed_devices(device_set);
   return GetOrCreateLocalClient(default_options);
 }
 
@@ -94,7 +108,7 @@ ClientLibrary::~ClientLibrary() = default;
   service_options.set_number_of_replicas(replica_count);
   service_options.set_intra_op_parallelism_threads(
       options.intra_op_parallelism_threads());
-
+  service_options.set_allowed_devices(options.allowed_devices());
   auto instance = absl::make_unique<LocalInstance>();
   TF_ASSIGN_OR_RETURN(instance->service,
                       LocalService::NewService(service_options));
diff --git a/tensorflow/compiler/xla/client/client_library.h b/tensorflow/compiler/xla/client/client_library.h
index 3ad558fa532931937fab898f7b855f0a3370eaec..62d225c6c298b26bbbd248fc1f4be64fc8efcf6b 100644
--- a/tensorflow/compiler/xla/client/client_library.h
+++ b/tensorflow/compiler/xla/client/client_library.h
@@ -23,9 +23,11 @@ limitations under the License.
 
 #include <functional>
 #include <memory>
+#include <set>
 #include <string>
 #include <vector>
 
+#include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/client/compile_only_client.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/service/compile_only_service.h"
@@ -43,9 +45,10 @@ namespace xla {
 // Options to configure the local client when it is created.
 class LocalClientOptions {
  public:
-  LocalClientOptions(se::Platform* platform = nullptr,
-                     int number_of_replicas = 1,
-                     int intra_op_parallelism_threads = -1);
+  LocalClientOptions(
+      se::Platform* platform = nullptr, int number_of_replicas = 1,
+      int intra_op_parallelism_threads = -1,
+      const absl::optional<std::set<int>>& allowed_devices = absl::nullopt);
 
   // Set the platform backing the service, or nullptr for the default platform.
   LocalClientOptions& set_platform(se::Platform* platform);
@@ -60,10 +63,17 @@ class LocalClientOptions {
   LocalClientOptions& set_intra_op_parallelism_threads(int num_threads);
   int intra_op_parallelism_threads() const;
 
+  // Sets the allowed_devices set for selectively constructing stream executors
+  // on the platform.
+  LocalClientOptions& set_allowed_devices(
+      const absl::optional<std::set<int>>& allowed_devices);
+  const absl::optional<std::set<int>>& allowed_devices() const;
+
  private:
   se::Platform* platform_;
   int number_of_replicas_;
   int intra_op_parallelism_threads_;
+  absl::optional<std::set<int>> allowed_devices_;
 };
 
 class ClientLibrary {
@@ -73,8 +83,11 @@ class ClientLibrary {
   //
   //   platform : The platform the underlying XLA service should target. If
   //     null then default platform is used.
+  //   device_set: Set of device IDs for which the stream executor will be
+  //   created, for the given platform.
   static StatusOr<LocalClient*> GetOrCreateLocalClient(
-      se::Platform* platform = nullptr);
+      se::Platform* platform = nullptr,
+      const absl::optional<std::set<int>>& allowed_devices = absl::nullopt);
   static StatusOr<LocalClient*> GetOrCreateLocalClient(
       const LocalClientOptions& options);
 
diff --git a/tensorflow/compiler/xla/client/executable_build_options.cc b/tensorflow/compiler/xla/client/executable_build_options.cc
index 1f594e551af381d7537e947892cbf7e0b5b3b861..ec0e08975926f36c36c854f83a40b374b12a09a4 100644
--- a/tensorflow/compiler/xla/client/executable_build_options.cc
+++ b/tensorflow/compiler/xla/client/executable_build_options.cc
@@ -58,6 +58,12 @@ const Shape* ExecutableBuildOptions::result_layout() const {
   return result_layout_set_ ? &result_layout_ : nullptr;
 }
 
+ExecutableBuildOptions& ExecutableBuildOptions::set_num_replicas(
+    int num_replicas) {
+  num_replicas_ = num_replicas;
+  return *this;
+}
+
 string ExecutableBuildOptions::ToString() const {
   string result_layout = "nullopt";
   if (result_layout_set_) {
@@ -65,8 +71,9 @@ string ExecutableBuildOptions::ToString() const {
   }
   return absl::StrFormat(
       "ExecutableBuildOptions{device_ordinal=%d, result_layout=%s, "
-      "generate_hlo_graph=%s}",
-      device_ordinal_, result_layout, debug_options().xla_generate_hlo_graph());
+      "generate_hlo_graph=%s, num_replicas=%d}",
+      device_ordinal_, result_layout, debug_options().xla_generate_hlo_graph(),
+      num_replicas_);
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/executable_build_options.h b/tensorflow/compiler/xla/client/executable_build_options.h
index a58090253bfac7779e4b61bc7231a0f0d945cc00..1d85fb34304b95d1fccdb0b0d6a7a65e739fae18 100644
--- a/tensorflow/compiler/xla/client/executable_build_options.h
+++ b/tensorflow/compiler/xla/client/executable_build_options.h
@@ -67,12 +67,18 @@ class ExecutableBuildOptions {
   // debugging.
   string ToString() const;
 
+  // The number of replicas of this computation that are to be executed.
+  // Defaults to 1.
+  int num_replicas() const { return num_replicas_; }
+  ExecutableBuildOptions& set_num_replicas(int num_replicas);
+
  private:
   int device_ordinal_ = -1;
   Shape result_layout_;
   bool result_layout_set_ = false;
   absl::optional<DebugOptions> debug_options_;
   DeviceMemoryAllocator* device_allocator_ = nullptr;
+  int num_replicas_ = 1;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/BUILD b/tensorflow/compiler/xla/client/lib/BUILD
index 41db8de29ff0085a30847ff41db4ffbfc774e2a1..c5dea5f18030f2d226c86e3408ea85b2b5989728 100644
--- a/tensorflow/compiler/xla/client/lib/BUILD
+++ b/tensorflow/compiler/xla/client/lib/BUILD
@@ -1,5 +1,7 @@
 # Common computation builders for XLA.
 
+load("//tensorflow/compiler/xla/tests:build_defs.bzl", "generate_backend_suites", "xla_test")
+
 licenses(["notice"])  # Apache 2.0
 
 package(default_visibility = ["//tensorflow/compiler/xla/client:friends"])
@@ -13,9 +15,6 @@ filegroup(
     ]),
 )
 
-load("//tensorflow/compiler/xla/tests:build_defs.bzl", "xla_test")
-load("//tensorflow/compiler/xla/tests:build_defs.bzl", "generate_backend_suites")
-
 # Generate test_suites for all backends, named "${backend}_tests".
 generate_backend_suites()
 
@@ -35,6 +34,95 @@ cc_library(
     ],
 )
 
+xla_test(
+    name = "arithmetic_test",
+    srcs = ["arithmetic_test.cc"],
+    deps = [
+        ":arithmetic",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+    ],
+)
+
+cc_library(
+    name = "cholesky",
+    srcs = ["cholesky.cc"],
+    hdrs = ["cholesky.h"],
+    deps = [
+        ":math",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client/lib:constants",
+        "//tensorflow/compiler/xla/client/lib:loops",
+        "//tensorflow/compiler/xla/client/lib:matrix",
+        "//tensorflow/compiler/xla/client/lib:slicing",
+        "//tensorflow/core:lib",
+    ],
+)
+
+xla_test(
+    name = "cholesky_test",
+    srcs = ["cholesky_test.cc"],
+    tags = ["optonly"],
+    deps = [
+        ":arithmetic",
+        ":cholesky",
+        ":matrix",
+        "//tensorflow/compiler/xla:array2d",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
+    ],
+)
+
+cc_library(
+    name = "comparators",
+    srcs = ["comparators.cc"],
+    hdrs = ["comparators.h"],
+    deps = [
+        ":constants",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+xla_test(
+    name = "comparators_test",
+    srcs = ["comparators_test.cc"],
+    deps = [
+        ":comparators",
+        ":constants",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "@com_google_absl//absl/container:inlined_vector",
+    ],
+)
+
 cc_library(
     name = "constants",
     srcs = ["constants.cc"],
@@ -52,7 +140,6 @@ cc_library(
 xla_test(
     name = "constants_test",
     srcs = ["constants_test.cc"],
-    tags = ["enable_for_xla_interpreter"],
     deps = [
         ":constants",
         "//tensorflow/compiler/xla:test",
@@ -75,11 +162,28 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "loops",
+    srcs = ["loops.cc"],
+    hdrs = ["loops.h"],
+    deps = [
+        ":constants",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
 cc_library(
     name = "math",
     srcs = ["math.cc"],
     hdrs = ["math.h"],
     deps = [
+        ":arithmetic",
         ":constants",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -90,7 +194,23 @@ cc_library(
 xla_test(
     name = "math_test",
     srcs = ["math_test.cc"],
-    tags = ["enable_for_xla_interpreter"],
+    deps = [
+        ":math",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+    ],
+)
+
+xla_test(
+    name = "math_exhaustive_test",
+    srcs = ["math_exhaustive_test.cc"],
+    shard_count = 16,
     deps = [
         ":math",
         "//tensorflow/compiler/xla:literal_util",
@@ -110,13 +230,18 @@ cc_library(
     deps = [
         ":arithmetic",
         ":constants",
+        ":slicing",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
     ],
 )
@@ -124,16 +249,19 @@ cc_library(
 xla_test(
     name = "matrix_test",
     srcs = ["matrix_test.cc"],
-    tags = ["enable_for_xla_interpreter"],
     deps = [
         ":matrix",
         ":slicing",
+        "//tensorflow/compiler/xla:status",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -172,23 +300,59 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/core:lib",
         "@com_google_absl//absl/base",
     ],
 )
 
 cc_library(
-    name = "slicing",
-    srcs = ["slicing.cc"],
-    hdrs = ["slicing.h"],
+    name = "qr",
+    srcs = ["qr.cc"],
+    hdrs = ["qr.h"],
     deps = [
+        ":arithmetic",
+        ":constants",
+        ":loops",
+        ":math",
+        ":matrix",
+        ":slicing",
+        "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/core:lib",
+    ],
+)
+
+xla_test(
+    name = "qr_test",
+    srcs = ["qr_test.cc"],
+    tags = ["optonly"],
+    deps = [
+        ":matrix",
+        ":qr",
+        "//tensorflow/compiler/xla:array2d",
+        "//tensorflow/compiler/xla:array3d",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
+    ],
+)
+
+cc_library(
+    name = "slicing",
+    srcs = ["slicing.cc"],
+    hdrs = ["slicing.h"],
+    deps = [
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla/client:xla_builder",
         "@com_google_absl//absl/types:span",
     ],
 )
@@ -196,13 +360,11 @@ cc_library(
 xla_test(
     name = "slicing_test",
     srcs = ["slicing_test.cc"],
-    tags = ["enable_for_xla_interpreter"],
     deps = [
         ":slicing",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -214,6 +376,7 @@ cc_library(
     srcs = ["sorting.cc"],
     hdrs = ["sorting.h"],
     deps = [
+        ":comparators",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
@@ -225,13 +388,42 @@ cc_library(
 xla_test(
     name = "sorting_test",
     srcs = ["sorting_test.cc"],
-    tags = ["enable_for_xla_interpreter"],
     deps = [
         ":sorting",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+    ],
+)
+
+cc_library(
+    name = "quantize",
+    hdrs = ["quantize.h"],
+    deps = [
+        ":constants",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/core:lib",
+    ],
+)
+
+xla_test(
+    name = "quantize_test",
+    srcs = ["quantize_test.cc"],
+    # TODO(b/122119490): re-enable TAP after fixing.
+    tags = [
+        "notap",
+    ],
+    deps = [
+        ":quantize",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
     ],
@@ -260,46 +452,52 @@ cc_library(
 )
 
 cc_library(
-    name = "triangular_solve",
-    srcs = ["triangular_solve.cc"],
-    hdrs = ["triangular_solve.h"],
+    name = "self_adjoint_eig",
+    srcs = ["self_adjoint_eig.cc"],
+    hdrs = ["self_adjoint_eig.h"],
     deps = [
-        "//tensorflow/compiler/xla:literal",
+        ":arithmetic",
+        ":comparators",
+        ":constants",
+        ":loops",
+        ":math",
+        ":matrix",
+        ":slicing",
+        "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/compiler/xla/client:xla_computation",
-        "//tensorflow/compiler/xla/client/lib:constants",
-        "//tensorflow/compiler/xla/client/lib:math",
-        "//tensorflow/compiler/xla/client/lib:matrix",
-        "//tensorflow/compiler/xla/client/lib:slicing",
         "//tensorflow/core:lib",
     ],
 )
 
 xla_test(
-    name = "triangular_solve_test",
-    srcs = ["triangular_solve_test.cc"],
-    tags = ["noasan"],  # sometimes times out, http://b/78650012
+    name = "self_adjoint_eig_test",
+    srcs = ["self_adjoint_eig_test.cc"],
+    blacklisted_backends = [
+        "cpu",
+        "gpu",
+    ],
+    real_hardware_only = True,
+    shard_count = 10,
+    tags = ["optonly"],
     deps = [
-        ":triangular_solve",
+        ":arithmetic",
+        ":constants",
+        ":matrix",
+        ":self_adjoint_eig",
         "//tensorflow/compiler/xla:array2d",
+        "//tensorflow/compiler/xla:array3d",
         "//tensorflow/compiler/xla:literal",
-        "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:global_data",
-        "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/core:lib",
         "//tensorflow/core:test",
     ],
 )
diff --git a/tensorflow/compiler/xla/client/lib/arithmetic.cc b/tensorflow/compiler/xla/client/lib/arithmetic.cc
index e86c10f030f3990d67e5a6638100640f73c82307..3b875135af29f142463ffd783bfeaadc61ada1af 100644
--- a/tensorflow/compiler/xla/client/lib/arithmetic.cc
+++ b/tensorflow/compiler/xla/client/lib/arithmetic.cc
@@ -117,10 +117,70 @@ XlaOp Any(XlaOp predicates) {
     XlaComputation logical_or = CreateScalarOrComputation(PRED, builder);
     TF_ASSIGN_OR_RETURN(const Shape& predicates_shape,
                         builder->GetShape(predicates));
-    std::vector<int64> all_dimensions(ShapeUtil::Rank(predicates_shape));
+    std::vector<int64> all_dimensions(predicates_shape.rank());
     std::iota(all_dimensions.begin(), all_dimensions.end(), 0);
     return Reduce(predicates, f, logical_or, all_dimensions);
   });
 }
 
+namespace {
+
+XlaOp ArgMinMax(XlaOp input, PrimitiveType output_type, int axis, bool is_min) {
+  XlaBuilder* builder = input.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape input_shape, builder->GetShape(input));
+    XlaOp init_value;
+    XlaComputation reducer;
+    if (is_min) {
+      init_value = MaxValue(builder, input_shape.element_type());
+      reducer = CreateScalarMinComputation(input_shape.element_type(), builder);
+    } else {
+      init_value = MinValue(builder, input_shape.element_type());
+      reducer = CreateScalarMaxComputation(input_shape.element_type(), builder);
+    }
+
+    XlaOp input_max = Reduce(input, init_value, reducer,
+                             /*dimensions_to_reduce=*/{axis});
+    std::vector<int64> broadcast_dims(input_shape.rank() - 1);
+    std::iota(broadcast_dims.begin(), broadcast_dims.begin() + axis, 0);
+    std::iota(broadcast_dims.begin() + axis, broadcast_dims.end(), axis + 1);
+    // Compute a mask that has 1s for elements equal to the maximum.
+    XlaOp partial_mask =
+        ConvertElementType(Eq(input, input_max, broadcast_dims), output_type);
+
+    // In order to make identity elements for a bitwise And, we:
+    //   Left shift the 1 to the leftmost bit, yielding 0x10...0
+    //   Arithmetic right shift the 1 back to the rightmost bit, yielding
+    //   0xFF...F
+    int32 bits_in_type =
+        ShapeUtil::ByteSizeOfPrimitiveType(output_type) * 8 - 1;
+    XlaOp shift_amount = ConstantR0WithType(builder, output_type, bits_in_type);
+    XlaOp full_mask = ShiftRightArithmetic(
+        ShiftLeft(partial_mask, shift_amount), shift_amount);
+
+    // And with the vector [0, 1, 2, ...] to convert each 0xFF...F into its
+    // index.
+
+    const int64 axis_size = ShapeUtil::GetDimension(input_shape, axis);
+    XlaOp iota = Iota(builder, output_type, axis_size);
+    XlaOp product = And(full_mask, iota, /*broadcast_dimensions=*/{axis});
+
+    // If there are multiple maximum elements, choose the one with the highest
+    // index.
+    return Reduce(product, MinValue(builder, output_type),
+                  CreateScalarMaxComputation(output_type, builder),
+                  /*dimensions_to_reduce=*/{axis});
+  });
+}
+
+}  // namespace
+
+XlaOp ArgMax(XlaOp input, PrimitiveType output_type, int axis) {
+  return ArgMinMax(input, output_type, axis, /*is_min=*/false);
+}
+
+XlaOp ArgMin(XlaOp input, PrimitiveType output_type, int axis) {
+  return ArgMinMax(input, output_type, axis, /*is_min=*/true);
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/arithmetic.h b/tensorflow/compiler/xla/client/lib/arithmetic.h
index 632e8cc8bc64fad236a0226c6e93079aadde7050..d4a7812c441c351b121e5d72faf9642b06728b18 100644
--- a/tensorflow/compiler/xla/client/lib/arithmetic.h
+++ b/tensorflow/compiler/xla/client/lib/arithmetic.h
@@ -57,6 +57,14 @@ XlaComputation CreateScalarOrComputation(PrimitiveType type,
 // Note: if predicates is zero-sized, Any() vacuously returns false.
 XlaOp Any(XlaOp predicates);
 
+// Returns the argmax of `input` along `axis`. `output_type` is the type to
+// use for the output.
+XlaOp ArgMax(XlaOp input, PrimitiveType output_type, int axis);
+
+// Returns the argmin of `input` along `axis`. `output_type` is the type to
+// use for the output.
+XlaOp ArgMin(XlaOp input, PrimitiveType output_type, int axis);
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_ARITHMETIC_H_
diff --git a/tensorflow/compiler/xla/client/lib/arithmetic_test.cc b/tensorflow/compiler/xla/client/lib/arithmetic_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a13839f9db89b9c07f2465867a503ef2193f8160
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/arithmetic_test.cc
@@ -0,0 +1,67 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace xla {
+namespace {
+
+using ArithmeticTest = ClientLibraryTestBase;
+
+XLA_TEST_F(ArithmeticTest, ArgMinR2Axis0) {
+  XlaBuilder builder(TestName());
+  auto x = ConstantR2<int32>(&builder, {{1, 7, 4}, {6, 3, 5}, {8, 3, 3}});
+  ArgMin(x, S32, /*axis=*/0);
+
+  std::vector<int32> expected = {0, 2, 2};
+  ComputeAndCompareR1<int32>(&builder, expected, {});
+}
+
+XLA_TEST_F(ArithmeticTest, ArgMinR2Axis1) {
+  XlaBuilder builder(TestName());
+  auto x = ConstantR2<int32>(&builder, {{1, 7, 4}, {6, 3, 5}, {8, 3, 3}});
+  ArgMin(x, S32, /*axis=*/1);
+
+  std::vector<int32> expected = {0, 1, 2};
+  ComputeAndCompareR1<int32>(&builder, expected, {});
+}
+
+XLA_TEST_F(ArithmeticTest, ArgMaxR2Axis0) {
+  XlaBuilder builder(TestName());
+  auto x = ConstantR2<int32>(&builder, {{1, 7, 4}, {6, 3, 5}, {8, 3, 3}});
+  ArgMax(x, S32, /*axis=*/0);
+
+  std::vector<int32> expected = {2, 0, 1};
+  ComputeAndCompareR1<int32>(&builder, expected, {});
+}
+
+XLA_TEST_F(ArithmeticTest, ArgMaxR2Axis1) {
+  XlaBuilder builder(TestName());
+  auto x = ConstantR2<int32>(&builder, {{1, 7, 4}, {6, 3, 5}, {8, 3, 3}});
+  ArgMax(x, S32, /*axis=*/1);
+
+  std::vector<int32> expected = {1, 0, 0};
+  ComputeAndCompareR1<int32>(&builder, expected, {});
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/tf2xla/lib/cholesky.cc b/tensorflow/compiler/xla/client/lib/cholesky.cc
similarity index 57%
rename from tensorflow/compiler/tf2xla/lib/cholesky.cc
rename to tensorflow/compiler/xla/client/lib/cholesky.cc
index 550ab5b05693b79e60e49577309328ac6846d3f9..bb41f9932d1cc62b62d37fea2c10fbfeaa0bd15e 100644
--- a/tensorflow/compiler/tf2xla/lib/cholesky.cc
+++ b/tensorflow/compiler/xla/client/lib/cholesky.cc
@@ -13,25 +13,25 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/tf2xla/lib/cholesky.h"
+#include "tensorflow/compiler/xla/client/lib/cholesky.h"
 
 #include <memory>
 #include <vector>
 
-#include "tensorflow/compiler/tf2xla/lib/util.h"
-#include "tensorflow/compiler/tf2xla/lib/while_loop.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/lib/loops.h"
+#include "tensorflow/compiler/xla/client/lib/math.h"
 #include "tensorflow/compiler/xla/client/lib/matrix.h"
 #include "tensorflow/compiler/xla/client/lib/slicing.h"
-#include "tensorflow/compiler/xla/client/lib/triangular_solve.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/core/lib/core/errors.h"
 
-namespace tensorflow {
+namespace xla {
 
 namespace {
 
@@ -50,70 +50,63 @@ namespace {
 //     l[..., j+1:, j] = (a[..., j+1:, j] - np.dot(l[..., j+1:, :j], row_t)) /
 //                       l[..., j, j]
 //   return l
-xla::XlaOp CholeskyUnblocked(xla::XlaOp a,
-                             xla::PrecisionConfig::Precision precision) {
-  xla::XlaBuilder* builder = a.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    TF_ASSIGN_OR_RETURN(xla::Shape a_shape, builder->GetShape(a));
-    const int n_dims = xla::ShapeUtil::Rank(a_shape);
-    const int64 n = xla::ShapeUtil::GetDimension(a_shape, -1);
-    auto major_dims = xla::AsInt64Slice(a_shape.dimensions())
+XlaOp CholeskyUnblocked(XlaOp a, PrecisionConfig::Precision precision) {
+  XlaBuilder* builder = a.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a));
+    const int n_dims = a_shape.rank();
+    const int64 n = ShapeUtil::GetDimension(a_shape, -1);
+    auto major_dims = AsInt64Slice(a_shape.dimensions())
                           .subspan(
                               /*pos=*/0,
                               /*len=*/n_dims - 2);
 
-    xla::XlaOp l = xla::ZerosLike(a);
+    XlaOp l = ZerosLike(a);
 
     // Construct the for loop body to iterate over rows.
-    auto body_fn = [&](xla::XlaOp i, absl::Span<const xla::XlaOp> loop_vars,
-                       xla::XlaBuilder* body_builder)
-        -> xla::StatusOr<std::vector<xla::XlaOp>> {
-      xla::Shape col_shape;
-      xla::Shape row_shape;
-      for (int64 d : major_dims) {
-        row_shape.add_dimensions(d);
-        col_shape.add_dimensions(d);
-      }
-      row_shape.add_dimensions(1);
-      row_shape.add_dimensions(n);
-      row_shape.set_element_type(a_shape.element_type());
-      auto mask_zeros_row = xla::Zeros(body_builder, row_shape);
-
-      col_shape.add_dimensions(n);
-      col_shape.add_dimensions(1);
-      col_shape.set_element_type(a_shape.element_type());
-      auto mask_zeros_col = xla::Zeros(body_builder, col_shape);
-
-      std::vector<int32> mask_vector(n);
-      std::iota(mask_vector.begin(), mask_vector.end(), 0);
-      auto mask_range = xla::ConstantR1<int32>(body_builder, mask_vector);
+    auto body_fn =
+        [&](XlaOp i, absl::Span<const XlaOp> loop_vars,
+            XlaBuilder* body_builder) -> StatusOr<std::vector<XlaOp>> {
+      std::vector<int64> row_shape_dims(major_dims.begin(), major_dims.end());
+      std::vector<int64> col_shape_dims(major_dims.begin(), major_dims.end());
+      row_shape_dims.push_back(1);
+      row_shape_dims.push_back(n);
+      auto mask_zeros_row =
+          Zeros(body_builder,
+                ShapeUtil::MakeShape(a_shape.element_type(), row_shape_dims));
+
+      col_shape_dims.push_back(n);
+      col_shape_dims.push_back(1);
+      auto mask_zeros_col =
+          Zeros(body_builder,
+                ShapeUtil::MakeShape(a_shape.element_type(), col_shape_dims));
+
       auto mask_range_row =
-          xla::Broadcast(xla::Reshape(mask_range, {0}, {1, n}), major_dims);
+          Iota(body_builder, ShapeUtil::MakeShape(S32, row_shape_dims),
+               /*iota_dimension=*/n_dims - 1);
       auto mask_range_col =
-          xla::Broadcast(xla::Reshape(mask_range, {0}, {n, 1}), major_dims);
+          Iota(body_builder, ShapeUtil::MakeShape(S32, col_shape_dims),
+               /*iota_dimension=*/n_dims - 2);
       auto body_a = loop_vars[0];
       auto body_l = loop_vars[1];
 
       // row = l[..., i, :i]
       // select the whole i-th row, then mask out all columns past i-1
-      auto zero = xla::ConstantR0<int32>(body_builder, 0);
+      auto zero = ConstantR0<int32>(body_builder, 0);
       auto l_i = DynamicSliceInMinorDims(body_l, {i, zero}, {1, n});
-      auto row = xla::Select(xla::Ge(mask_range_row, i), mask_zeros_row, l_i);
+      auto row = Select(Ge(mask_range_row, i), mask_zeros_row, l_i);
       // a[..., i, i]
       auto a_ii = DynamicSliceInMinorDims(body_a, {i, i}, {1, 1});
       // np.dot(row, np.swapaxes(row, -1, -2))
       auto diag_dot = BatchDot(row, TransposeInMinorDims(row), precision);
       // l[..., i, i] = np.sqrt(a[..., i, i] - np.dot(row,
       //                                              np.swapaxes(row, -1, -2)))
-      auto l_ii =
-          xla::Pow(a_ii - diag_dot,
-                   FloatLiteral(body_builder, a_shape.element_type(), 0.5));
+      auto l_ii = Sqrt(a_ii - diag_dot);
 
       // a[..., i+1:, i]
       // select the whole i-th column, then mask out all rows above i+1
       auto a_0i = DynamicSliceInMinorDims(body_a, {i}, {1});
-      auto a_ip1i =
-          xla::Select(xla::Le(mask_range_col, i), mask_zeros_col, a_0i);
+      auto a_ip1i = Select(Le(mask_range_col, i), mask_zeros_col, a_0i);
 
       // l[..., i+1:, i] = (a[..., i+1:, i] - np.dot(l[..., i+1:, :i], r.T)) /
       //                   l[..., i, i]
@@ -122,8 +115,7 @@ xla::XlaOp CholeskyUnblocked(xla::XlaOp a,
       // r.T)
       auto dot = BatchDot(body_l, TransposeInMinorDims(row), precision);
       // np.dot(l[..., i+1:, :i], r.T)
-      auto dot_ip1 =
-          xla::Select(xla::Le(mask_range_col, i), mask_zeros_col, dot);
+      auto dot_ip1 = Select(Le(mask_range_col, i), mask_zeros_col, dot);
 
       body_l =
           DynamicUpdateSliceInMinorDims(body_l, (a_ip1i - dot_ip1) / l_ii, {i});
@@ -131,12 +123,12 @@ xla::XlaOp CholeskyUnblocked(xla::XlaOp a,
       // column assign will wrap around and overwrite the diagonal assign.
       body_l = DynamicUpdateSliceInMinorDims(body_l, l_ii, {i, i});
 
-      return std::vector<xla::XlaOp>{body_a, body_l};
+      return std::vector<XlaOp>{body_a, body_l};
     };
 
     TF_ASSIGN_OR_RETURN(
         auto cholesky_while,
-        XlaForEachIndex(n, xla::S32, body_fn, {a, l}, "unblocked", builder));
+        ForEachIndex(n, S32, body_fn, {a, l}, "unblocked", builder));
 
     return cholesky_while[1];
   });
@@ -144,34 +136,41 @@ xla::XlaOp CholeskyUnblocked(xla::XlaOp a,
 
 }  // namespace
 
-xla::XlaOp Cholesky(xla::XlaOp a, int64 block_size,
-                    xla::PrecisionConfig::Precision precision) {
-  xla::XlaBuilder* builder = a.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    TF_ASSIGN_OR_RETURN(xla::Shape a_shape, builder->GetShape(a));
-    const int ndims = xla::ShapeUtil::Rank(a_shape);
+XlaOp Cholesky(XlaOp a, int64 block_size,
+               PrecisionConfig::Precision precision) {
+  XlaBuilder* builder = a.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a));
+    const int ndims = a_shape.rank();
     if (ndims < 2) {
-      return errors::InvalidArgument(
-          "Arguments to Cholesky must have rank >= 2: ", ndims);
+      return InvalidArgument(
+          "Argument to Cholesky must have rank >= 2; shape was %s",
+          a_shape.ToString());
+    }
+
+    const int64 n = ShapeUtil::GetDimension(a_shape, -1);
+    if (n != ShapeUtil::GetDimension(a_shape, -2)) {
+      return InvalidArgument(
+          "Argument to Cholesky must be batched square matrices; got shape %s",
+          ShapeUtil::HumanString(a_shape));
     }
 
-    const int64 n = xla::ShapeUtil::GetDimension(a_shape, -1);
-    if (n != xla::ShapeUtil::GetDimension(a_shape, -2)) {
-      return errors::InvalidArgument(
-          "Arguments to Cholesky must be square matrices: ",
-          xla::ShapeUtil::HumanString(a_shape));
+    if (primitive_util::IsComplexType(a_shape.element_type())) {
+      return Unimplemented(
+          "Complex types are not implemented in Cholesky; got shape %s",
+          ShapeUtil::HumanString(a_shape));
     }
 
     if (block_size < 1) {
-      return errors::InvalidArgument(
-          "block_size argument to Cholesky must be >= 1; got ", block_size);
+      return InvalidArgument(
+          "block_size argument to Cholesky must be >= 1; got %d", block_size);
     }
 
     // Blocked left-looking Cholesky factorization.
     // Algorithm 1 from
     // Haidar, Azzam, et al. "High-performance Cholesky factorization for
     // GPU-only execution." Proceedings of General Purpose GPUs. ACM, 2017.
-    xla::XlaOp l = xla::ZerosLike(a);
+    XlaOp l = ZerosLike(a);
     for (int64 i = 0; i < n; i += block_size) {
       int64 k = std::min(block_size, n - i);
       if (i > 0) {
@@ -194,12 +193,12 @@ xla::XlaOp Cholesky(xla::XlaOp a, int64 block_size,
         // l[i+k:, i:i+k] =
         //     trsm_right_transpose(l[i:i+k, i:i+k], a[i+k:, i:i+k])
         auto panel = SliceInMinorDims(a, {i + k, i}, {n, i + k});
-        auto update = TriangularSolve(factorized, panel,
-                                      /*left_side=*/false,
-                                      /*lower=*/true,
-                                      /*transpose_a=*/true,
-                                      /*conjugate_a=*/false,
-                                      /*block_size=*/block_size);
+        auto update =
+            TriangularSolve(factorized, panel,
+                            /*left_side=*/false,
+                            /*lower=*/true,
+                            /*unit_diagonal=*/false,
+                            /*transpose_a=*/TriangularSolveOptions::TRANSPOSE);
         l = UpdateSliceInMinorDims(l, update, {i + k, i});
       }
     }
@@ -207,4 +206,4 @@ xla::XlaOp Cholesky(xla::XlaOp a, int64 block_size,
   });
 }
 
-}  // namespace tensorflow
+}  // namespace xla
diff --git a/tensorflow/compiler/tf2xla/lib/cholesky.h b/tensorflow/compiler/xla/client/lib/cholesky.h
similarity index 87%
rename from tensorflow/compiler/tf2xla/lib/cholesky.h
rename to tensorflow/compiler/xla/client/lib/cholesky.h
index 9a561c34b92ee45059f2a05336e682838f8e36e2..0bae26837c0f14dd0cfab82cf426becc787ec11c 100644
--- a/tensorflow/compiler/tf2xla/lib/cholesky.h
+++ b/tensorflow/compiler/xla/client/lib/cholesky.h
@@ -13,13 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_CHOLESKY_H_
-#define TENSORFLOW_COMPILER_TF2XLA_LIB_CHOLESKY_H_
+#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_CHOLESKY_H_
+#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_CHOLESKY_H_
 
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
-namespace tensorflow {
+namespace xla {
 
 // Computes the Cholesky decompositions of a batch of symmetric positive
 // definite matrices.
@@ -34,6 +34,6 @@ xla::XlaOp Cholesky(
     xla::XlaOp a, int64 block_size = 256,
     xla::PrecisionConfig::Precision precision = xla::PrecisionConfig::HIGHEST);
 
-}  // namespace tensorflow
+}  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_TF2XLA_LIB_CHOLESKY_H_
+#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_CHOLESKY_H_
diff --git a/tensorflow/compiler/xla/client/lib/cholesky_test.cc b/tensorflow/compiler/xla/client/lib/cholesky_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..095dd4fbf8b7c90047c4428b50c626c16e9c1e94
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/cholesky_test.cc
@@ -0,0 +1,166 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/lib/cholesky.h"
+
+#include <memory>
+#include <numeric>
+#include <vector>
+
+#include "tensorflow/compiler/xla/array2d.h"
+#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace {
+
+using xla::int64;
+
+using CholeskyTest = xla::ClientLibraryTestBase;
+
+XLA_TEST_F(CholeskyTest, Simple) {
+  xla::XlaBuilder builder(TestName());
+
+  xla::Array2D<float> a_vals({
+      {4, 6, 8, 10},
+      {6, 45, 54, 63},
+      {8, 54, 146, 166},
+      {10, 63, 166, 310},
+  });
+
+  xla::XlaOp a;
+  auto a_data = CreateR2Parameter<float>(a_vals, 0, "a", &builder, &a);
+  xla::Cholesky(a, /*block_size=*/2);
+
+  xla::Array2D<float> expected({
+      {2, 0, 0, 0},
+      {3, 6, 0, 0},
+      {4, 7, 9, 0},
+      {5, 8, 10, 11},
+  });
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get()},
+                             xla::ErrorSpec(1e-4, 1e-4));
+}
+
+XLA_TEST_F(CholeskyTest, Simple2) {
+  xla::XlaBuilder builder(TestName());
+
+  xla::Array2D<float> a_vals({
+      {16, 24, 8, 12},
+      {24, 61, 82, 48},
+      {8, 82, 456, 106},
+      {12, 48, 106, 62},
+  });
+
+  xla::XlaOp a;
+  auto a_data = CreateR2Parameter<float>(a_vals, 0, "a", &builder, &a);
+  xla::Cholesky(a);
+
+  xla::Array2D<float> expected(
+      {{4, 0, 0, 0}, {6, 5, 0, 0}, {2, 14, 16, 0}, {3, 6, 1, 4}});
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get()},
+                             xla::ErrorSpec(1e-4, 1e-4));
+}
+
+XLA_TEST_F(CholeskyTest, SimpleBatched) {
+  xla::XlaBuilder builder(TestName());
+
+  xla::Array3D<float> a_vals({
+      {
+          {4, 6, 8, 10},
+          {6, 45, 54, 63},
+          {8, 54, 146, 166},
+          {10, 63, 166, 310},
+      },
+      {
+          {16, 24, 8, 12},
+          {24, 61, 82, 48},
+          {8, 82, 456, 106},
+          {12, 48, 106, 62},
+      },
+  });
+
+  xla::XlaOp a;
+  auto a_data = CreateR3Parameter<float>(a_vals, 0, "a", &builder, &a);
+  xla::Cholesky(a);
+
+  xla::Array3D<float> expected({
+      {
+          {2, 0, 0, 0},
+          {3, 6, 0, 0},
+          {4, 7, 9, 0},
+          {5, 8, 10, 11},
+      },
+      {{4, 0, 0, 0}, {6, 5, 0, 0}, {2, 14, 16, 0}, {3, 6, 1, 4}},
+  });
+
+  ComputeAndCompareR3<float>(&builder, expected, {a_data.get()},
+                             xla::ErrorSpec(1e-4, 1e-4));
+}
+
+using CholeskyTestCase = std::tuple<int64, int64>;
+
+class RandomCholeskyTest
+    : public xla::ClientLibraryTestBase,
+      public ::testing::WithParamInterface<CholeskyTestCase> {};
+
+XLA_TEST_P(RandomCholeskyTest, Random) {
+  xla::XlaBuilder builder(TestName());
+
+  auto test_params = GetParam();
+  std::vector<int64> dimensions = {std::get<0>(test_params),
+                                   std::get<1>(test_params),
+                                   std::get<1>(test_params)};
+  xla::Shape shape = xla::ShapeUtil::MakeShape(xla::F32, dimensions);
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto literal,
+      xla::LiteralUtil::CreateRandomLiteral<xla::F32>(shape, 0.0, 1.0));
+
+  auto input = xla::Parameter(&builder, 0, shape, "input");
+  // Form a random positive definite matrix.
+  auto matrix = xla::BatchDot(input, TransposeInMinorDims(input),
+                              xla::PrecisionConfig::HIGHEST);
+
+  auto cholesky = xla::Cholesky(matrix, /*block_size=*/4);
+
+  // Verify that ||matrix - cholesky * cholesky_t||_2 ~= 0
+  auto verification = xla::BatchDot(cholesky, TransposeInMinorDims(cholesky),
+                                    xla::PrecisionConfig::HIGHEST);
+  auto delta = matrix - verification;
+  xla::Reduce(delta * delta, xla::ConstantR0<float>(&builder, 0.0),
+              CreateScalarAddComputation(xla::F32, &builder), {0, 1, 2});
+
+  TF_ASSERT_OK_AND_ASSIGN(auto input_data, client_->TransferToServer(literal));
+  ComputeAndCompareR0<float>(&builder, 0.0, {input_data.get()},
+                             xla::ErrorSpec(1e-4, 1e-4));
+}
+
+INSTANTIATE_TEST_SUITE_P(RandomCholeskyTestInstance, RandomCholeskyTest,
+                         ::testing::Values(CholeskyTestCase{1, 1},
+                                           CholeskyTestCase{1, 2},
+                                           CholeskyTestCase{10, 5},
+                                           CholeskyTestCase{2, 20}));
+
+}  // namespace
diff --git a/tensorflow/compiler/xla/client/lib/comparators.cc b/tensorflow/compiler/xla/client/lib/comparators.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c620c9841a5146618e3a142adeb3fe2da525950a
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/comparators.cc
@@ -0,0 +1,159 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/lib/comparators.h"
+
+#include <limits>
+#include <string>
+#include <vector>
+
+#include "absl/strings/str_cat.h"
+#include "absl/types/span.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
+#include "tensorflow/compiler/xla/primitive_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace xla {
+namespace {
+
+using XlaOpGenerator = XlaOp (*)(const XlaOp&, const XlaOp&,
+                                 absl::Span<const int64>);
+
+XlaOp BitcastConvertFloatingPointToIntegral(const XlaOp& value,
+                                            int64 bit_width) {
+  PrimitiveType signed_type;
+  PrimitiveType unsigned_type;
+  XlaOp max_value;
+  switch (bit_width) {
+    case 16:
+      max_value =
+          ConstantR0(value.builder(),
+                     static_cast<uint16>(std::numeric_limits<int16>::max()));
+      signed_type = S16;
+      unsigned_type = U16;
+      break;
+    case 32:
+      max_value =
+          ConstantR0(value.builder(),
+                     static_cast<uint32>(std::numeric_limits<int32>::max()));
+      signed_type = S32;
+      unsigned_type = U32;
+      break;
+    case 64:
+      max_value =
+          ConstantR0(value.builder(),
+                     static_cast<uint64>(std::numeric_limits<int64>::max()));
+      signed_type = S64;
+      unsigned_type = U64;
+      break;
+    default:
+      return value.builder()->ReportError(
+          InvalidArgument("Invalid bit width %lld for Comparator floating "
+                          "point parameter.",
+                          bit_width));
+  }
+  // Switch from a floating point value to a integer value in such a way that
+  // when using the integer value to compare, we get the same result for normal
+  // values, and -Nan is treated as the smallest value, and Nan is treated as
+  // the largest value.
+  // If f is a float, and
+  // x = bit_cast<int32>(f);
+  // y = x < 0 ? numeric_limits<int32>::max() - x : x;
+  // then y is ordered as an int32 such that finite values have the obvious
+  // order, -0 is ordered before 0, and -NaN and NaN appear at the beginning
+  // and end of the ordering.
+  // Note that in order to avoid -x to overflow, we calculate
+  // numeric_limits<int32>::max() - x as unsigned, and then convert back to
+  // signed.
+  auto signed_value = BitcastConvertType(value, signed_type);
+  auto unsigned_value = BitcastConvertType(value, unsigned_type);
+  auto flipped_value =
+      BitcastConvertType(Sub(max_value, unsigned_value), signed_type);
+  auto is_negative = Lt(signed_value, Zero(value.builder(), signed_type));
+  return Select(is_negative, flipped_value, signed_value);
+}
+
+XlaComputation CreateScalarComparisonComputation(
+    const string& name, const std::vector<PrimitiveType>& operand_types,
+    XlaBuilder* builder, XlaOpGenerator generator) {
+  // Create a default computation where we compare only the first two
+  // parameters of type 'operand_types[0]'.
+  auto b = builder->CreateSubBuilder(name);
+  if (operand_types.empty()) {
+    b->ReportError(InvalidArgument("operand_types should not be empty"));
+    return b->BuildAndNoteError();
+  }
+
+  int64 parameter_count = 0;
+  XlaOp first_lhs_param;
+  XlaOp first_rhs_param;
+
+  // For each type in 'operand_types' we create two parameters of this type. The
+  // idea is that this computation can be used by n-ary Sort, and potentially
+  // should support comparing also the other operands of sort. In this default
+  // computation, however, we will not actually use any parameters except the
+  // first two.
+  for (auto operand_type : operand_types) {
+    auto scalar_shape = ShapeUtil::MakeShape(operand_type, {});
+    auto lhs_param = Parameter(b.get(), parameter_count * 2, scalar_shape,
+                               absl::StrCat("p.", parameter_count, ".lhs"));
+    auto rhs_param = Parameter(b.get(), parameter_count * 2 + 1, scalar_shape,
+                               absl::StrCat("p.", parameter_count, ".rhs"));
+    if (parameter_count == 0) {
+      first_lhs_param = lhs_param;
+      first_rhs_param = rhs_param;
+    }
+    ++parameter_count;
+  }
+  if (primitive_util::IsFloatingPointType(operand_types[0])) {
+    PrimitiveType compare_type = operand_types[0];
+    // Special-case handling for BF16. We currently do not support direct
+    // comparisons with BF16, so we convert to F32 and then use the F32
+    // comparison logic.
+    if (compare_type == BF16) {
+      compare_type = F32;
+      first_lhs_param = ConvertElementType(first_lhs_param, F32);
+      first_rhs_param = ConvertElementType(first_rhs_param, F32);
+    }
+    int64 bit_width = primitive_util::BitWidth(compare_type);
+    first_lhs_param =
+        BitcastConvertFloatingPointToIntegral(first_lhs_param, bit_width);
+    first_rhs_param =
+        BitcastConvertFloatingPointToIntegral(first_rhs_param, bit_width);
+  }
+  generator(first_lhs_param, first_rhs_param, {});
+  return b->BuildAndNoteError();
+}
+}  // namespace
+
+// Creates a scalar less-than computation and returns it.
+XlaComputation CreateScalarLtComputation(
+    const std::vector<PrimitiveType>& operand_types, XlaBuilder* builder) {
+  return CreateScalarComparisonComputation("compare-less-than", operand_types,
+                                           builder, Lt);
+}
+
+// Creates a scalar greater-than computation and returns it.
+XlaComputation CreateScalarGtComputation(
+    const std::vector<PrimitiveType>& operand_types, XlaBuilder* builder) {
+  return CreateScalarComparisonComputation("compare-greater-than",
+                                           operand_types, builder, Gt);
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/comparators.h b/tensorflow/compiler/xla/client/lib/comparators.h
new file mode 100644
index 0000000000000000000000000000000000000000..cbcfc227dd495537f59bf0a9090bad8ade15da62
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/comparators.h
@@ -0,0 +1,47 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_COMPARATORS_H_
+#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_COMPARATORS_H_
+
+#include <vector>
+
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace xla {
+
+// Creates a scalar less-than computation and returns it. The created
+// computation has 2 * 'operand_types.size()' many parameters, where parameters
+// 2 * i and 2 * i + 1 are a scalar with primitive type 'operand_types[i]'. The
+// computation compares the first two parameters. For floating point types, a
+// total order is created where
+// -NaN < -infinity < ... < -0 < 0 < ... < infinity < NaN
+XlaComputation CreateScalarLtComputation(
+    const std::vector<PrimitiveType>& operand_types, XlaBuilder* builder);
+
+// Creates a scalar greater-than computation and returns it. The created
+// computation has 2 * 'operand_types.size()' many parameters, where parameters
+// 2 * i and 2 * i + 1 are a scalar with primitive type 'operand_types[i]'. The
+// computation compares the first two parameters. For floating point types, a
+// total order is created where
+// NaN > infinity > ... > 0 > -0 > ... > -infinity > -NaN
+XlaComputation CreateScalarGtComputation(
+    const std::vector<PrimitiveType>& operand_types, XlaBuilder* builder);
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_COMPARATORS_H_
diff --git a/tensorflow/compiler/xla/client/lib/comparators_test.cc b/tensorflow/compiler/xla/client/lib/comparators_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..598956803b34702b1e095a342648d348fa350b29
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/comparators_test.cc
@@ -0,0 +1,149 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/lib/comparators.h"
+
+#include <limits>
+#include <vector>
+
+#include "absl/container/inlined_vector.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/primitive_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace xla {
+namespace {
+
+class ComparatorsTest : public ClientLibraryTestBase {
+ public:
+  ComparatorsTest() : builder_(TestName()) {}
+  XlaBuilder* builder() { return &builder_; }
+
+ private:
+  XlaBuilder builder_;
+};
+
+template <
+    PrimitiveType type,
+    typename T = typename primitive_util::PrimitiveTypeToNative<type>::type>
+void BuildComparatorAndComparisons(ComparatorsTest* test,
+                                   bool compare_less_than,
+                                   absl::InlinedVector<bool, 10>* expected) {
+  auto compare = compare_less_than
+                     ? CreateScalarLtComputation({type}, test->builder())
+                     : CreateScalarGtComputation({type}, test->builder());
+
+  auto negative_nan = ConstantR0<T>(
+      test->builder(), -T(std::numeric_limits<float>::quiet_NaN()));
+  auto positive_nan = ConstantR0<T>(test->builder(),
+                                    T(std::numeric_limits<float>::quiet_NaN()));
+  auto negative_zero = ConstantR0<T>(test->builder(), T(-0.));
+  auto positive_zero = ConstantR0<T>(test->builder(), T(0.));
+  auto negative_infinity = MinValue(test->builder(), type);
+  auto positive_infinity = MaxValue(test->builder(), type);
+
+  // List the values in the expected sorting order from smallest to largest.
+  std::vector<XlaOp> all_constants{negative_nan,      negative_infinity,
+                                   negative_zero,     positive_zero,
+                                   positive_infinity, positive_nan};
+
+  // Do pairwise comparisons.
+  std::vector<XlaOp> all_comparisons;
+  for (const XlaOp& lhs_constant : all_constants) {
+    for (const XlaOp& rhs_constant : all_constants) {
+      all_comparisons.push_back(Broadcast(
+          Call(test->builder(), compare, {lhs_constant, rhs_constant}), {1}));
+    }
+  }
+
+  // Concantenate the comparison results.
+  ConcatInDim(test->builder(), all_comparisons, 0);
+
+  // If we use less-than comparisons, we expect the comparison to result in true
+  // if the lhs value to be compared appears earlier in 'all_constants' than the
+  // rhs value. Likewise, if we use greater-than comparisons, we expect the
+  // comparison to return true if the rhs value appears earlier in
+  // 'all_constants' than the lhs value.
+  expected->clear();
+  for (int i = 0; i < all_constants.size(); ++i) {
+    for (int j = 0; j < all_constants.size(); ++j) {
+      expected->push_back(compare_less_than ? i < j : i > j);
+    }
+  }
+}
+
+XLA_TEST_F(ComparatorsTest, CompareLtBF16) {
+  absl::InlinedVector<bool, 10> expected;
+  BuildComparatorAndComparisons<BF16>(this, /*compare_less_than=*/true,
+                                      &expected);
+  ComputeAndCompareR1<bool>(builder(), expected, {});
+}
+
+XLA_TEST_F(ComparatorsTest, CompareGtBF16) {
+  absl::InlinedVector<bool, 10> expected;
+  BuildComparatorAndComparisons<BF16>(this, /*compare_less_than=*/false,
+                                      &expected);
+  ComputeAndCompareR1<bool>(builder(), expected, {});
+}
+
+XLA_TEST_F(ComparatorsTest, CompareLtF16) {
+  absl::InlinedVector<bool, 10> expected;
+  BuildComparatorAndComparisons<F16>(this, /*compare_less_than=*/true,
+                                     &expected);
+  ComputeAndCompareR1<bool>(builder(), expected, {});
+}
+
+XLA_TEST_F(ComparatorsTest, CompareGtF16) {
+  absl::InlinedVector<bool, 10> expected;
+  BuildComparatorAndComparisons<F16>(this, /*compare_less_than=*/false,
+                                     &expected);
+  ComputeAndCompareR1<bool>(builder(), expected, {});
+}
+
+XLA_TEST_F(ComparatorsTest, CompareLtF32) {
+  absl::InlinedVector<bool, 10> expected;
+  BuildComparatorAndComparisons<F32>(this, /*compare_less_than=*/true,
+                                     &expected);
+  ComputeAndCompareR1<bool>(builder(), expected, {});
+}
+
+XLA_TEST_F(ComparatorsTest, CompareGtF32) {
+  absl::InlinedVector<bool, 10> expected;
+  BuildComparatorAndComparisons<F32>(this, /*compare_less_than=*/false,
+                                     &expected);
+  ComputeAndCompareR1<bool>(builder(), expected, {});
+}
+
+XLA_TEST_F(ComparatorsTest, CompareLtF64) {
+  absl::InlinedVector<bool, 10> expected;
+  BuildComparatorAndComparisons<F64>(this, /*compare_less_than=*/true,
+                                     &expected);
+  ComputeAndCompareR1<bool>(builder(), expected, {});
+}
+
+XLA_TEST_F(ComparatorsTest, CompareGtF64) {
+  absl::InlinedVector<bool, 10> expected;
+  BuildComparatorAndComparisons<F64>(this, /*compare_less_than=*/false,
+                                     &expected);
+  ComputeAndCompareR1<bool>(builder(), expected, {});
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/constants.cc b/tensorflow/compiler/xla/client/lib/constants.cc
index 1ada7b4a964ccf7ca400b937abbe425bef083468..6bd56a8df0a5d0417f747a158664ed0daa8a7b40 100644
--- a/tensorflow/compiler/xla/client/lib/constants.cc
+++ b/tensorflow/compiler/xla/client/lib/constants.cc
@@ -80,6 +80,24 @@ XlaOp MinFiniteValue(XlaBuilder* builder, PrimitiveType type) {
   }
 }
 
+XlaOp MinPositiveNormalValue(XlaBuilder* builder, PrimitiveType type) {
+  switch (type) {
+    case F16:
+      return ConstantR0<Eigen::half>(builder,
+                                     std::numeric_limits<Eigen::half>::min());
+    case BF16:
+      return ConstantR0<bfloat16>(builder, bfloat16::min_positive_normal());
+    case F32:
+      return ConstantR0<float>(builder, std::numeric_limits<float>::min());
+    case F64:
+      return ConstantR0<double>(builder, std::numeric_limits<double>::min());
+    default:
+      return builder->ReportError(
+          InvalidArgument("Invalid type for MinPositiveNormalValue (%s).",
+                          PrimitiveType_Name(type)));
+  }
+}
+
 XlaOp MaxValue(XlaBuilder* builder, PrimitiveType type) {
   return ConstantLiteral(builder, LiteralUtil::MaxValue(type));
 }
@@ -100,4 +118,28 @@ XlaOp MaxFiniteValue(XlaBuilder* builder, PrimitiveType type) {
   }
 }
 
+XlaOp NanValue(XlaBuilder* builder, PrimitiveType type) {
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    switch (type) {
+      case F16:
+        return ConstantR0<Eigen::half>(
+            builder, Eigen::NumTraits<Eigen::half>::quiet_NaN());
+      case BF16:
+        return ConstantR0<bfloat16>(
+            builder, bfloat16(std::numeric_limits<float>::quiet_NaN()));
+      case F32:
+        return ConstantR0<float>(builder,
+                                 std::numeric_limits<float>::quiet_NaN());
+      case F64:
+        return ConstantR0<double>(builder,
+                                  std::numeric_limits<double>::quiet_NaN());
+      default:
+        return InvalidArgument(
+            "Operand to NanValue was %s, but must be a real-valued "
+            "floating-point type.",
+            PrimitiveType_Name(type));
+    }
+  });
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/constants.h b/tensorflow/compiler/xla/client/lib/constants.h
index 81624614c1e3599dfe116eb61d9e2edcd5230684..47b8f1b44ffa12b2b15be0e865d693a709962e6e 100644
--- a/tensorflow/compiler/xla/client/lib/constants.h
+++ b/tensorflow/compiler/xla/client/lib/constants.h
@@ -56,6 +56,8 @@ XlaOp ConstantR0WithType(XlaBuilder* builder, PrimitiveType type, T value) {
       return ConstantR0<double>(builder, static_cast<double>(value));
     case C64:
       return ConstantR0<complex64>(builder, static_cast<complex64>(value));
+    case C128:
+      return ConstantR0<complex128>(builder, static_cast<complex128>(value));
     case U8:
       return ConstantR0<uint8>(builder, static_cast<uint8>(value));
     case U32:
@@ -88,6 +90,27 @@ XlaOp ScalarLike(XlaOp prototype, T value) {
   });
 }
 
+// Returns an array or scalar containing copies of `value` cast to the same
+// run-type type as `prototype` and broadcast to the same dimensions as
+// `prototype`.
+//
+// If `prototype` is not a scalar or array, returns an error.
+template <typename T>
+XlaOp FullLike(XlaOp prototype, T value) {
+  XlaBuilder* builder = prototype.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(prototype));
+    if (ShapeUtil::IsScalar(shape) || shape.IsArray()) {
+      return Broadcast(ScalarLike(prototype, value), shape.dimensions());
+    } else {
+      return InvalidArgument(
+          "Prototype shape for BroadcastConstantLike must be a scalar or "
+          "array, but was %s",
+          shape.ToString());
+    }
+  });
+}
+
 // Returns a scalar with value '0' of 'type'.
 XlaOp Zero(XlaBuilder* builder, PrimitiveType type);
 
@@ -112,6 +135,9 @@ XlaOp MinValue(XlaBuilder* builder, PrimitiveType type);
 // point type, this is equal to -MaxFiniteValue().
 XlaOp MinFiniteValue(XlaBuilder* builder, PrimitiveType type);
 
+// Returns the minimum positive normal value for floating-point type `type`.
+XlaOp MinPositiveNormalValue(XlaBuilder* builder, PrimitiveType type);
+
 // Returns the maximum representable finite or infinite value for 'type'.
 // Returns 'inf' for floating-point types.
 XlaOp MaxValue(XlaBuilder* builder, PrimitiveType type);
@@ -119,6 +145,9 @@ XlaOp MaxValue(XlaBuilder* builder, PrimitiveType type);
 // Returns the maximum representable finite value for 'type'.
 XlaOp MaxFiniteValue(XlaBuilder* builder, PrimitiveType type);
 
+// Returns a nan for the given type.  Only valid for real-valued fp types.
+XlaOp NanValue(XlaBuilder* builder, PrimitiveType type);
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_CONSTANTS_H_
diff --git a/tensorflow/compiler/xla/client/lib/constants_test.cc b/tensorflow/compiler/xla/client/lib/constants_test.cc
index f4320f65c1f76d4d4c384110b39d6606773aaf01..180175b7495b32250af8ae77c8c7fba804703885 100644
--- a/tensorflow/compiler/xla/client/lib/constants_test.cc
+++ b/tensorflow/compiler/xla/client/lib/constants_test.cc
@@ -155,5 +155,12 @@ XLA_TEST_F(ConstantsTest, MaxValueF32) {
                              {});
 }
 
+XLA_TEST_F(ConstantsTest, NanValueF32) {
+  XlaBuilder builder(TestName());
+  NanValue(&builder, F32);
+  ComputeAndCompareR0<float>(&builder, std::numeric_limits<float>::quiet_NaN(),
+                             {});
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/tf2xla/lib/while_loop.cc b/tensorflow/compiler/xla/client/lib/loops.cc
similarity index 50%
rename from tensorflow/compiler/tf2xla/lib/while_loop.cc
rename to tensorflow/compiler/xla/client/lib/loops.cc
index 594ab1dfd0700f47501712183f6efe62d17e15e7..721f987628a8ac7da3f3f872939c3f0457d6bbe2 100644
--- a/tensorflow/compiler/tf2xla/lib/while_loop.cc
+++ b/tensorflow/compiler/xla/client/lib/loops.cc
@@ -13,44 +13,43 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/tf2xla/lib/while_loop.h"
-#include "tensorflow/compiler/tf2xla/lib/util.h"
+#include "tensorflow/compiler/xla/client/lib/loops.h"
+
+#include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 
-namespace tensorflow {
+namespace xla {
 
-xla::StatusOr<std::vector<xla::XlaOp>> XlaWhileLoop(
-    const LoopConditionFunction& condition_function,
-    const LoopBodyFunction& body_function,
-    absl::Span<const xla::XlaOp> initial_values, absl::string_view name,
-    xla::XlaBuilder* builder) {
+StatusOr<std::vector<XlaOp>> WhileLoopHelper(
+    const WhileLoopHelperConditionFunction& condition_function,
+    const WhileLoopHelperBodyFunction& body_function,
+    absl::Span<const XlaOp> initial_values, absl::string_view name,
+    XlaBuilder* builder) {
   int arity = initial_values.size();
-  std::vector<xla::Shape> var_shapes;
+  std::vector<Shape> var_shapes;
   var_shapes.reserve(arity);
-  for (const xla::XlaOp& input : initial_values) {
+  for (const XlaOp& input : initial_values) {
     TF_ASSIGN_OR_RETURN(auto shape, builder->GetShape(input));
     var_shapes.push_back(std::move(shape));
   }
-  xla::Shape tuple_shape = xla::ShapeUtil::MakeTupleShape(var_shapes);
+  Shape tuple_shape = ShapeUtil::MakeTupleShape(var_shapes);
 
   // Unpacks a tuple into its component parts.
-  auto unpack_tuple = [](xla::XlaOp tuple, int arity,
-                         xla::XlaBuilder* builder) {
-    std::vector<xla::XlaOp> elements(arity);
+  auto unpack_tuple = [](XlaOp tuple, int arity, XlaBuilder* builder) {
+    std::vector<XlaOp> elements(arity);
     for (int i = 0; i < arity; ++i) {
-      elements[i] = xla::GetTupleElement(tuple, i);
+      elements[i] = GetTupleElement(tuple, i);
     }
     return elements;
   };
 
   // Build the condition.
-  std::unique_ptr<xla::XlaBuilder> cond_builder =
+  std::unique_ptr<XlaBuilder> cond_builder =
       builder->CreateSubBuilder(absl::StrCat(name, "_condition"));
   {
-    auto parameter =
-        xla::Parameter(cond_builder.get(), 0, tuple_shape, "parameter");
+    auto parameter = Parameter(cond_builder.get(), 0, tuple_shape, "parameter");
 
     TF_RETURN_IF_ERROR(
         condition_function(unpack_tuple(parameter, arity, cond_builder.get()),
@@ -60,11 +59,10 @@ xla::StatusOr<std::vector<xla::XlaOp>> XlaWhileLoop(
   TF_ASSIGN_OR_RETURN(auto cond, cond_builder->Build());
 
   // Build the body.
-  std::unique_ptr<xla::XlaBuilder> body_builder =
+  std::unique_ptr<XlaBuilder> body_builder =
       builder->CreateSubBuilder(absl::StrCat(name, "_body"));
   {
-    auto parameter =
-        xla::Parameter(body_builder.get(), 0, tuple_shape, "parameter");
+    auto parameter = Parameter(body_builder.get(), 0, tuple_shape, "parameter");
 
     TF_ASSIGN_OR_RETURN(
         auto result,
@@ -72,56 +70,54 @@ xla::StatusOr<std::vector<xla::XlaOp>> XlaWhileLoop(
                       body_builder.get()));
 
     TF_RET_CHECK(result.size() == initial_values.size());
-    xla::Tuple(body_builder.get(), result);
+    Tuple(body_builder.get(), result);
   }
   TF_ASSIGN_OR_RETURN(auto body, body_builder->Build());
 
-  auto outputs = xla::While(cond, body, xla::Tuple(builder, initial_values));
+  auto outputs = While(cond, body, Tuple(builder, initial_values));
 
   return unpack_tuple(outputs, arity, builder);
 }
 
-xla::StatusOr<std::vector<xla::XlaOp>> XlaForEachIndex(
-    int64 num_iterations, xla::PrimitiveType num_iterations_type,
+StatusOr<std::vector<XlaOp>> ForEachIndex(
+    int64 num_iterations, PrimitiveType num_iterations_type,
     const ForEachIndexBodyFunction& body_function,
-    absl::Span<const xla::XlaOp> initial_values, absl::string_view name,
-    xla::XlaBuilder* builder) {
-  auto while_cond_fn =
-      [&](absl::Span<const xla::XlaOp> values,
-          xla::XlaBuilder* cond_builder) -> xla::StatusOr<xla::XlaOp> {
-    return xla::Lt(values[0], IntegerLiteral(cond_builder, num_iterations_type,
-                                             num_iterations));
+    absl::Span<const XlaOp> initial_values, absl::string_view name,
+    XlaBuilder* builder) {
+  auto while_cond_fn = [&](absl::Span<const XlaOp> values,
+                           XlaBuilder* cond_builder) -> StatusOr<XlaOp> {
+    return Lt(values[0], ConstantR0WithType(cond_builder, num_iterations_type,
+                                            num_iterations));
   };
-  auto while_body_fn = [&](absl::Span<const xla::XlaOp> values,
-                           xla::XlaBuilder* body_builder)
-      -> xla::StatusOr<std::vector<xla::XlaOp>> {
-    xla::XlaOp iteration = values[0];
+  auto while_body_fn =
+      [&](absl::Span<const XlaOp> values,
+          XlaBuilder* body_builder) -> StatusOr<std::vector<XlaOp>> {
+    XlaOp iteration = values[0];
 
-    std::vector<xla::XlaOp> updated_values;
+    std::vector<XlaOp> updated_values;
     updated_values.reserve(values.size());
-    updated_values.push_back(xla::Add(
+    updated_values.push_back(Add(
         iteration,
-        xla::ConstantLiteral(body_builder,
-                             xla::LiteralUtil::One(num_iterations_type))));
+        ConstantLiteral(body_builder, LiteralUtil::One(num_iterations_type))));
 
     values.remove_prefix(1);
-    TF_ASSIGN_OR_RETURN(std::vector<xla::XlaOp> body_outputs,
+    TF_ASSIGN_OR_RETURN(std::vector<XlaOp> body_outputs,
                         body_function(iteration, values, body_builder));
     updated_values.insert(updated_values.end(), body_outputs.begin(),
                           body_outputs.end());
     return updated_values;
   };
 
-  std::vector<xla::XlaOp> values;
+  std::vector<XlaOp> values;
   values.reserve(initial_values.size() + 1);
-  values.push_back(xla::ConstantLiteral(
-      builder, xla::LiteralUtil::Zero(num_iterations_type)));
+  values.push_back(
+      ConstantLiteral(builder, LiteralUtil::Zero(num_iterations_type)));
   values.insert(values.end(), initial_values.begin(), initial_values.end());
 
-  TF_ASSIGN_OR_RETURN(values, XlaWhileLoop(while_cond_fn, while_body_fn, values,
-                                           name, builder));
+  TF_ASSIGN_OR_RETURN(values, WhileLoopHelper(while_cond_fn, while_body_fn,
+                                              values, name, builder));
   values.erase(values.begin(), values.begin() + 1);
   return values;
 }
 
-}  // namespace tensorflow
+}  // namespace xla
diff --git a/tensorflow/compiler/tf2xla/lib/while_loop.h b/tensorflow/compiler/xla/client/lib/loops.h
similarity index 62%
rename from tensorflow/compiler/tf2xla/lib/while_loop.h
rename to tensorflow/compiler/xla/client/lib/loops.h
index f2134bb4495a12b8342961d96f70e7737f816c7d..e11de59493e9c1de51fbdb6c45dab6d82b85a62a 100644
--- a/tensorflow/compiler/tf2xla/lib/while_loop.h
+++ b/tensorflow/compiler/xla/client/lib/loops.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_WHILE_LOOP_H_
-#define TENSORFLOW_COMPILER_TF2XLA_LIB_WHILE_LOOP_H_
+#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_LOOPS_H_
+#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_LOOPS_H_
 
 #include <functional>
 #include <vector>
@@ -25,19 +25,18 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/statusor.h"
 
-namespace tensorflow {
+namespace xla {
 
 // Function that builds a loop condition. Takes as input a sequence of input
 // values, and returns a boolean value representing if the condition succeeds.
-typedef std::function<xla::StatusOr<xla::XlaOp>(absl::Span<const xla::XlaOp>,
-                                                xla::XlaBuilder*)>
-    LoopConditionFunction;
+typedef std::function<StatusOr<XlaOp>(absl::Span<const XlaOp>, XlaBuilder*)>
+    WhileLoopHelperConditionFunction;
 
 // Function that builds a loop body. Takes as input a sequence of input values
 // and returns a sequence of output values.
-typedef std::function<xla::StatusOr<std::vector<xla::XlaOp>>(
-    absl::Span<const xla::XlaOp>, xla::XlaBuilder*)>
-    LoopBodyFunction;
+typedef std::function<StatusOr<std::vector<XlaOp>>(absl::Span<const XlaOp>,
+                                                   XlaBuilder*)>
+    WhileLoopHelperBodyFunction;
 
 // Helper function for building an XLA while loop, where the values carried by
 // the loop are a tuple of values, e.g., (a, b, c):
@@ -47,27 +46,27 @@ typedef std::function<xla::StatusOr<std::vector<xla::XlaOp>>(
 //   init: (a, b, c)
 // )
 // 'name' is a descriptive name for the loop.
-xla::StatusOr<std::vector<xla::XlaOp>> XlaWhileLoop(
-    const LoopConditionFunction& condition_function,
-    const LoopBodyFunction& body_function,
-    absl::Span<const xla::XlaOp> initial_values, absl::string_view name,
-    xla::XlaBuilder* builder);
+StatusOr<std::vector<XlaOp>> WhileLoopHelper(
+    const WhileLoopHelperConditionFunction& condition_function,
+    const WhileLoopHelperBodyFunction& body_function,
+    absl::Span<const XlaOp> initial_values, absl::string_view name,
+    XlaBuilder* builder);
 
 // Builds an XLA loop that repeats a computation `num_iterations` times.
 //
 // The body function (ForEachIndexBodyFunction) takes as input a pair of
 // (current iteration number, loop-carried values), and returns an updated
 // vector of the loop-carried values.
-typedef std::function<xla::StatusOr<std::vector<xla::XlaOp>>(
-    xla::XlaOp, absl::Span<const xla::XlaOp>, xla::XlaBuilder*)>
+typedef std::function<StatusOr<std::vector<XlaOp>>(
+    XlaOp, absl::Span<const XlaOp>, XlaBuilder*)>
     ForEachIndexBodyFunction;
 
-xla::StatusOr<std::vector<xla::XlaOp>> XlaForEachIndex(
-    int64 num_iterations, xla::PrimitiveType num_iterations_type,
+StatusOr<std::vector<XlaOp>> ForEachIndex(
+    int64 num_iterations, PrimitiveType num_iterations_type,
     const ForEachIndexBodyFunction& body_function,
-    absl::Span<const xla::XlaOp> initial_values, absl::string_view name,
-    xla::XlaBuilder* builder);
+    absl::Span<const XlaOp> initial_values, absl::string_view name,
+    XlaBuilder* builder);
 
-}  // namespace tensorflow
+}  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_TF2XLA_LIB_WHILE_LOOP_H_
+#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_LOOPS_H_
diff --git a/tensorflow/compiler/xla/client/lib/math.cc b/tensorflow/compiler/xla/client/lib/math.cc
index 36fdda39b4124b9100c6054160f9c17bdf787d6f..f3fe3d0b5ebaabdc762c811027b85444db7b0d56 100644
--- a/tensorflow/compiler/xla/client/lib/math.cc
+++ b/tensorflow/compiler/xla/client/lib/math.cc
@@ -13,59 +13,103 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+// This macro is required to make MSVC defines math constants in math.h
+#define _USE_MATH_DEFINES
+#include <math.h>
+
 #include "tensorflow/compiler/xla/client/lib/math.h"
 
+#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 
 namespace xla {
 
-XlaOp Sqrt(XlaOp operand) { return Pow(operand, ScalarLike(operand, 0.5)); }
+// TODO(jlebar): Use this function in more places in this file to restrict the
+// domain of other functions.
+static Status EnsureOperandIsRealFp(absl::string_view op_name, XlaOp operand) {
+  auto& b = *operand.builder();
+  TF_ASSIGN_OR_RETURN(auto shape, b.GetShape(operand));
+  auto elem_ty = shape.element_type();
+  if (!primitive_util::IsFloatingPointType(elem_ty)) {
+    return InvalidArgument(
+        "Operands to %s must be real-valued floating-point, but got %s",
+        op_name, PrimitiveType_Name(elem_ty));
+  }
+  return Status::OK();
+}
 
-XlaOp Rsqrt(XlaOp operand) { return Pow(operand, ScalarLike(operand, -0.5)); }
+XlaOp IsPosInf(XlaOp operand) {
+  auto& b = *operand.builder();
+  return b.ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_RETURN_IF_ERROR(EnsureOperandIsRealFp("IsPosInf", operand));
+    TF_ASSIGN_OR_RETURN(auto shape, b.GetShape(operand));
+    // Note that this is only correct for floating-point types.  If we wanted it
+    // to be correct for all types, we'd need to Gt(MaxFiniteValue).
+    return Eq(operand, MaxValue(&b, shape.element_type()));
+  });
+}
 
-XlaOp Square(XlaOp operand) { return operand * operand; }
+XlaOp IsNegInf(XlaOp operand) {
+  auto& b = *operand.builder();
+  return b.ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_RETURN_IF_ERROR(EnsureOperandIsRealFp("IsNegInf", operand));
+    TF_ASSIGN_OR_RETURN(auto shape, b.GetShape(operand));
+    // Note that this is only correct for floating-point types.  If we wanted it
+    // to be correct for all types, we'd need to Lt(MinFiniteValue).
+    return Eq(operand, MinValue(&b, shape.element_type()));
+  });
+}
 
-XlaOp Reciprocal(XlaOp operand) { return ScalarLike(operand, 1.0) / operand; }
+XlaOp IsInf(XlaOp operand) {
+  auto& b = *operand.builder();
+  return b.ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_RETURN_IF_ERROR(EnsureOperandIsRealFp("IsInf", operand));
+    return IsPosInf(Abs(operand));
+  });
+}
 
-namespace {
+XlaOp IsNan(XlaOp operand) {
+  auto& b = *operand.builder();
+  return b.ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_RETURN_IF_ERROR(EnsureOperandIsRealFp("IsNan", operand));
+    return Ne(operand, operand);
+  });
+}
 
-// Polynomials for computing erf/erfc.  Originally from cephes.
-// Note we use float for compatibility across devices, at the cost of some
-// precision for 64 bit computations.
-//
-// Coefficients are in descending order.
-std::array<float, 9> kErfcPCoefficient = {
-    2.46196981473530512524E-10, 5.64189564831068821977E-1,
-    7.46321056442269912687E0,   4.86371970985681366614E1,
-    1.96520832956077098242E2,   5.26445194995477358631E2,
-    9.34528527171957607540E2,   1.02755188689515710272E3,
-    5.57535335369399327526E2};
-std::array<float, 9> kErfcQCoefficient = {
-    1.00000000000000000000E0, 1.32281951154744992508E1,
-    8.67072140885989742329E1, 3.54937778887819891062E2,
-    9.75708501743205489753E2, 1.82390916687909736289E3,
-    2.24633760818710981792E3, 1.65666309194161350182E3,
-    5.57535340817727675546E2};
-std::array<float, 6> kErfcRCoefficient = {
-    5.64189583547755073984E-1, 1.27536670759978104416E0,
-    5.01905042251180477414E0,  6.16021097993053585195E0,
-    7.40974269950448939160E0,  2.97886665372100240670E0};
-std::array<float, 7> kErfcSCoefficient = {
-    1.00000000000000000000E0, 2.26052863220117276590E0,
-    9.39603524938001434673E0, 1.20489539808096656605E1,
-    1.70814450747565897222E1, 9.60896809063285878198E0,
-    3.36907645100081516050E0};
-std::array<float, 5> kErfTCoefficient = {
-    9.60497373987051638749E0, 9.00260197203842689217E1,
-    2.23200534594684319226E3, 7.00332514112805075473E3,
-    5.55923013010394962768E4};
-std::array<float, 6> kErfUCoefficient = {
-    1.00000000000000000000E0, 3.35617141647503099647E1,
-    5.21357949780152679795E2, 4.59432382970980127987E3,
-    2.26290000613890934246E4, 4.92673942608635921086E4};
-}  // namespace
+XlaOp IsNegZero(XlaOp operand) {
+  auto& b = *operand.builder();
+  return b.ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_RETURN_IF_ERROR(EnsureOperandIsRealFp("IsNegZero", operand));
+    TF_ASSIGN_OR_RETURN(auto shape, b.GetShape(operand));
+
+    // The bitwise representation of -0 in bfloat16 and IEEE 754 is 0x80...0
+    // (sign bit on, all other bits off).
+    switch (shape.element_type()) {
+      case F64:
+        return Eq(BitcastConvertType(operand, U64),
+                  ConstantR0WithType(&b, U64, uint64{1} << 63));
+      case F32:
+        return Eq(BitcastConvertType(operand, U32),
+                  ConstantR0WithType(&b, U32, uint32{1} << 31));
+      case F16:
+      case BF16:
+        // Not all XLA backends handle U16 well, so we convert to F32/U32.
+        // TODO(jlebar): It would be nice if we could stay in (B)F16/U16 for
+        // backends that *do* support it.
+        return Eq(BitcastConvertType(ConvertElementType(operand, F32), U32),
+                  ConstantR0WithType(&b, U32, uint32{1} << 31));
+      default:
+        LOG(FATAL) << "Expected real fp type.";
+    }
+  });
+}
+
+XlaOp Square(XlaOp operand) { return operand * operand; }
+
+XlaOp Reciprocal(XlaOp operand) { return ScalarLike(operand, 1.0) / operand; }
 
 // Evaluate the polynomial given coefficients and `x`.
 // N.B. Coefficients should be supplied in decreasing order.
@@ -77,27 +121,86 @@ XlaOp EvaluatePolynomial(XlaOp x, absl::Span<const float> coefficients) {
   return poly;
 }
 
-// Compute an approximation of the error function complement (1 - erf(x)).
-XlaOp Erfc(XlaOp x) {
+// Computes an approximation of the error function complement (1 - erf(x)).
+//
+// Precondition: abs(x) >= 1.  Otherwise, use ErfImpl.
+//
+// This follows Cephes's f32 implementation of erfc, and so it may have errors
+// for double precision.
+//
+// See also these alternate implementations of erf and erfc:
+//
+//   https://stackoverflow.com/questions/35148198
+//   https://stackoverflow.com/questions/35966695
+//
+static XlaOp ErfcImpl(XlaOp x) {
+  // Coefficients for erfc(f32), from Cephes.
+  //
+  // erfc(x) = exp(-x^2) P(1/x), 1 < x < 2
+  static std::array<float, 9> kErfcPCoefficient{
+      +2.326819970068386E-2, -1.387039388740657E-1, +3.687424674597105E-1,
+      -5.824733027278666E-1, +6.210004621745983E-1, -4.944515323274145E-1,
+      +3.404879937665872E-1, -2.741127028184656E-1, +5.638259427386472E-1,
+  };
+  // erfc(x) = exp(-x^2) 1/x P(1/x^2), 2 < x < 14
+  static std::array<float, 8> kErfcRCoefficient{
+      -1.047766399936249E+1, +1.297719955372516E+1, -7.495518717768503E+0,
+      +2.921019019210786E+0, -1.015265279202700E+0, +4.218463358204948E-1,
+      -2.820767439740514E-1, +5.641895067754075E-1,
+  };
+
   XlaOp abs_x = Abs(x);
   XlaOp z = Exp(-x * x);
+  XlaOp q = ScalarLike(x, 1) / abs_x;
+  XlaOp y = q * q;
+  XlaOp p = Select(Lt(abs_x, ScalarLike(x, 2.0)),
+                   EvaluatePolynomial(y, kErfcPCoefficient),
+                   EvaluatePolynomial(y, kErfcRCoefficient));
+  y = z * q * p;
+  return Select(Lt(x, ScalarLike(x, 0)), ScalarLike(x, 2.0) - y, y);
+}
 
-  XlaOp pp = EvaluatePolynomial(abs_x, kErfcPCoefficient);
-  XlaOp pq = EvaluatePolynomial(abs_x, kErfcQCoefficient);
-  XlaOp pr = EvaluatePolynomial(abs_x, kErfcRCoefficient);
-  XlaOp ps = EvaluatePolynomial(abs_x, kErfcSCoefficient);
-
-  XlaOp y = Select(Lt(abs_x, ScalarLike(x, 8.0)), z * pp / pq, z * pr / ps);
+// Compute a polynomial approximation of the error function.
+//
+// Precondition: abs(x) <= 1.  Otherwise, use ErfcImpl.
+//
+// This follows Cephes's f32 implementation of erf, so it may have errors for
+// double precision.
+static XlaOp ErfImpl(XlaOp x) {
+  // Coefficients for by erf(f32), from Cephes.
+  //
+  // erf(x) = x P(x^2), 0 < x < 1
+  static std::array<float, 7> kErfTCoefficient{
+      +7.853861353153693E-5, -8.010193625184903E-4, +5.188327685732524E-3,
+      -2.685381193529856E-2, +1.128358514861418E-1, -3.761262582423300E-1,
+      +1.128379165726710E+0,
+  };
+
+  return x * EvaluatePolynomial(x * x, kErfTCoefficient);
+}
 
-  return Select(Lt(x, ScalarLike(x, 0.0)), ScalarLike(x, 2.0) - y, y);
+XlaOp Erfc(XlaOp x) {
+  auto& b = *x.builder();
+  return b.ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_RETURN_IF_ERROR(EnsureOperandIsRealFp("Erfc", x));
+    // erfc(x) =
+    //   erfc_impl(x)           if x > 1
+    //   1 - erf_impl(x)        otherwise
+    return Select(Gt(Abs(x), ScalarLike(x, 1)), ErfcImpl(x),
+                  ScalarLike(x, 1) - ErfImpl(x));
+  });
 }
 
-// Compute a polynomial approximation of the error function.
 XlaOp Erf(XlaOp x) {
-  XlaOp z = x * x;
-  XlaOp pt = EvaluatePolynomial(z, kErfTCoefficient);
-  XlaOp pu = EvaluatePolynomial(z, kErfUCoefficient);
-  return x * pt / pu;
+  auto& b = *x.builder();
+  return b.ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_RETURN_IF_ERROR(EnsureOperandIsRealFp("Erf", x));
+    // erf(x) =
+    //   erf_impl(x)            if x < 1
+    //   1 - erfc_impl(x)       otherwise
+    return Select(Lt(Abs(x), ScalarLike(x, 1)), ErfImpl(x),
+                  ScalarLike(x, 1) - ErfcImpl(x));
+  });
 }
 
 // Approximation for the inverse error function from
@@ -113,37 +216,30 @@ XlaOp Erf(XlaOp x) {
 //   }
 //   return p*x
 XlaOp ErfInv(XlaOp x) {
-  XlaBuilder* b = x.builder();
-  return b->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
-    TF_ASSIGN_OR_RETURN(Shape shape, b->GetShape(x));
-    constexpr int kDegree = 9;
-    constexpr std::array<float, 9> w_less_than_5_constants = {
-        2.81022636e-08f,  3.43273939e-07f, -3.5233877e-06f,
-        -4.39150654e-06f, 0.00021858087f,  -0.00125372503f,
-        -0.00417768164f,  0.246640727f,    1.50140941f};
-    constexpr std::array<float, 9> w_greater_than_5_constants = {
-        -0.000200214257f, 0.000100950558f, 0.00134934322f,
-        -0.00367342844f,  0.00573950773f,  -0.0076224613f,
-        0.00943887047f,   1.00167406f,     2.83297682f};
+  constexpr int kDegree = 9;
+  constexpr std::array<float, 9> w_less_than_5_constants = {
+      2.81022636e-08f,  3.43273939e-07f, -3.5233877e-06f,
+      -4.39150654e-06f, 0.00021858087f,  -0.00125372503f,
+      -0.00417768164f,  0.246640727f,    1.50140941f};
+  constexpr std::array<float, 9> w_greater_than_5_constants = {
+      -0.000200214257f, 0.000100950558f, 0.00134934322f,
+      -0.00367342844f,  0.00573950773f,  -0.0076224613f,
+      0.00943887047f,   1.00167406f,     2.83297682f};
 
-    auto one = ScalarLike(x, 1.0);
-    auto w = -Log((one - x) * (one + x));
-
-    auto lt = Lt(w, ScalarLike(x, 5.0));
-    auto coefficient = [&](int i) {
-      return Select(lt,
-                    Broadcast(ScalarLike(x, w_less_than_5_constants[i]),
-                              AsInt64Slice(shape.dimensions())),
-                    Broadcast(ScalarLike(x, w_greater_than_5_constants[i]),
-                              AsInt64Slice(shape.dimensions())));
-    };
-    w = Select(lt, w - ScalarLike(x, 2.5), Sqrt(w) - ScalarLike(x, 3.0));
-    auto p = coefficient(0);
-    for (int i = 1; i < kDegree; ++i) {
-      p = coefficient(i) + p * w;
-    }
-    return p * x;
-  });
+  auto one = ScalarLike(x, 1.0);
+  auto w = -Log((one - x) * (one + x));
+
+  auto lt = Lt(w, ScalarLike(x, 5.0));
+  auto coefficient = [&](int i) {
+    return Select(lt, FullLike(x, w_less_than_5_constants[i]),
+                  FullLike(x, w_greater_than_5_constants[i]));
+  };
+  w = Select(lt, w - ScalarLike(x, 2.5), Sqrt(w) - ScalarLike(x, 3.0));
+  auto p = coefficient(0);
+  for (int i = 1; i < kDegree; ++i) {
+    p = coefficient(i) + p * w;
+  }
+  return p * x;
 }
 
 namespace {
@@ -170,49 +266,86 @@ static constexpr std::array<double, 8> kLanczosCoefficients = {
 // t(z) = z + kLanczosGamma + 1/2
 // A(z) = kBaseLanczosCoeff + sigma(k = 1, n, kLanczosCoefficients[i] / (z + k))
 XlaOp Lgamma(XlaOp input) {
-  XlaOp one_half = ScalarLike(input, 0.5);
-  XlaOp one = ScalarLike(input, 1);
-
-  XlaOp pi = ScalarLike(input, M_PI);
-  XlaOp log_pi = ScalarLike(input, std::log(M_PI));
-  XlaOp log_sqrt_two_pi = ScalarLike(input, (std::log(2) + std::log(M_PI)) / 2);
-
-  XlaOp lanczos_gamma_plus_one_half = ScalarLike(input, kLanczosGamma + 0.5);
-  XlaOp log_lanczos_gamma_plus_one_half =
-      ScalarLike(input, std::log(kLanczosGamma + 0.5));
-
-  XlaOp base_lanczos_coeff = ScalarLike(input, kBaseLanczosCoeff);
-
-  // If the input is less than 0.5 use Gauss's reflection formula:
-  // gamma(x) = pi / sin(pi * x) * gamma(1 - x)
-  XlaOp need_to_reflect = Lt(Real(input), one_half);
-  XlaOp z = Select(need_to_reflect, -input, input - one);
-
-  XlaOp x = base_lanczos_coeff;
-  for (int i = 0; i < kLanczosCoefficients.size(); ++i) {
-    XlaOp lanczos_coefficient = ScalarLike(input, kLanczosCoefficients[i]);
-    XlaOp index = ScalarLike(input, i);
-    x = x + lanczos_coefficient / (z + index + one);
-  }
+  auto& b = *input.builder();
+  return b.ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_RETURN_IF_ERROR(EnsureOperandIsRealFp("Lgamma", input));
+
+    XlaOp one_half = ScalarLike(input, 0.5);
+    XlaOp one = ScalarLike(input, 1);
+
+    XlaOp pi = ScalarLike(input, M_PI);
+    XlaOp log_pi = ScalarLike(input, std::log(M_PI));
+    XlaOp log_sqrt_two_pi =
+        ScalarLike(input, (std::log(2) + std::log(M_PI)) / 2);
+
+    XlaOp lanczos_gamma_plus_one_half = ScalarLike(input, kLanczosGamma + 0.5);
+    XlaOp log_lanczos_gamma_plus_one_half =
+        ScalarLike(input, std::log(kLanczosGamma + 0.5));
+
+    XlaOp base_lanczos_coeff = ScalarLike(input, kBaseLanczosCoeff);
+
+    // If the input is less than 0.5 use Euler's reflection formula:
+    // gamma(x) = pi / (sin(pi * x) * gamma(1 - x))
+    XlaOp need_to_reflect = Lt(input, one_half);
+    XlaOp z = Select(need_to_reflect, -input, input - one);
+
+    XlaOp x = base_lanczos_coeff;
+    for (int i = 0; i < kLanczosCoefficients.size(); ++i) {
+      XlaOp lanczos_coefficient = ScalarLike(input, kLanczosCoefficients[i]);
+      XlaOp index = ScalarLike(input, i);
+      x = x + lanczos_coefficient / (z + index + one);
+    }
 
-  // To improve accuracy on platforms with less-precise log implementations,
-  // compute log(lanczos_gamma_plus_one_half) at compile time and use log1p on
-  // the device.
-  // log(t) = log(kLanczosGamma + 0.5 + z)
-  //        = log(kLanczosGamma + 0.5) + log1p(z / (kLanczosGamma + 0.5))
-  XlaOp t = lanczos_gamma_plus_one_half + z;
-  XlaOp log_t =
-      log_lanczos_gamma_plus_one_half + Log1p(z / lanczos_gamma_plus_one_half);
-
-  XlaOp log_y = log_sqrt_two_pi + (z + one_half) * log_t - t + Log(x);
-
-  // If z = a + 0j, the analytic continuation of log reduces to taking the
-  // absolute value of the real part.
-  // Re(log(z)) = Re(log|z| + arg(z)j)
-  //            = log|a|
-  XlaOp reflection = log_pi - Log(Abs(Sin(pi * input))) - log_y;
-  XlaOp result = Select(need_to_reflect, reflection, log_y);
-  return result;
+    // To improve accuracy on platforms with less-precise log implementations,
+    // compute log(lanczos_gamma_plus_one_half) at compile time and use log1p on
+    // the device.
+    // log(t) = log(kLanczosGamma + 0.5 + z)
+    //        = log(kLanczosGamma + 0.5) + log1p(z / (kLanczosGamma + 0.5))
+    XlaOp t = lanczos_gamma_plus_one_half + z;
+    XlaOp log_t = log_lanczos_gamma_plus_one_half +
+                  Log1p(z / lanczos_gamma_plus_one_half);
+
+    XlaOp log_y = log_sqrt_two_pi + (z + one_half) * log_t - t + Log(x);
+
+    // Compute the reflected value, used when x < 0.5:
+    //
+    //   lgamma(x) = log(pi) - lgamma(1-x) - log(abs(sin(pi * x))).
+    //
+    // (The abs is because lgamma is the log of the absolute value of the gamma
+    // function.)
+    //
+    // We have to be careful when computing the final term above. gamma(x) goes
+    // to +/-inf at every integer x < 0, and this is controlled by the
+    // sin(pi * x) term.  The slope is large, so precision is particularly
+    // important.
+    //
+    // Because abs(sin(pi * x)) has period 1, we can equivalently use
+    // abs(sin(pi * frac(x))) = sin(pi * frac(x)), where frac(x) is the
+    // fractional part of x.  This is more numerically accurate: It doesn't
+    // overflow to inf like pi * x can, and if x is an integer, it evaluates to
+    // 0 exactly, which is significant because we then take the log of this
+    // value, and log(0) is inf.
+    //
+    // We don't have a frac(x) primitive in XLA and computing it is tricky, but
+    // because abs(sin(pi * x)) = abs(sin(pi * abs(x))), it's good enough for
+    // our purposes to use abs(frac(x)) = abs(x) - floor(abs(x)).
+    //
+    XlaOp abs_input = Abs(input);
+    XlaOp reflection_denom = Log(Sin(pi * (abs_input - Floor(abs_input))));
+
+    // Avoid computing -inf - inf, which is nan.  If reflection_denom is +/-inf,
+    // then it "wins" and the result is +/-inf.
+    XlaOp reflection =
+        Select(IsFinite(reflection_denom), log_pi - reflection_denom - log_y,
+               -reflection_denom);
+    XlaOp result = Select(need_to_reflect, reflection, log_y);
+
+    // lgamma(+/-inf) = +inf.
+    XlaOp inf_bcast = FullLike(input, std::numeric_limits<float>::infinity());
+    return Select(Or(IsFinite(input),                           // is finite, or
+                     Not(Or(Lt(input, one), Ge(input, one)))),  // is nan
+                  result, inf_bcast);
+  });
 }
 
 // Compute the Digamma function using Lanczos' approximation from "A Precision
@@ -223,69 +356,84 @@ XlaOp Lgamma(XlaOp input) {
 // A(z) = kBaseLanczosCoeff + sigma(k = 1, n, kLanczosCoefficients[i] / (z + k))
 // A'(z) = sigma(k = 1, n, kLanczosCoefficients[i] / (z + k) / (z + k))
 XlaOp Digamma(XlaOp input) {
-  XlaOp zero = ScalarLike(input, 0);
-  XlaOp one_half = ScalarLike(input, 0.5);
-  XlaOp one = ScalarLike(input, 1);
-
-  XlaOp pi = ScalarLike(input, M_PI);
-
-  XlaOp lanczos_gamma = ScalarLike(input, kLanczosGamma);
-  XlaOp lanczos_gamma_plus_one_half = ScalarLike(input, kLanczosGamma + 0.5);
-  XlaOp log_lanczos_gamma_plus_one_half =
-      ScalarLike(input, std::log(kLanczosGamma + 0.5));
-
-  XlaOp base_lanczos_coeff = ScalarLike(input, kBaseLanczosCoeff);
-
-  // If the input is less than 0.5 use Gauss's reflection formula:
-  // digamma(x) = digamma(1 - x) - pi * cot(pi * x)
-  XlaOp need_to_reflect = Lt(Real(input), one_half);
-  XlaOp z = Select(need_to_reflect, -input, input - one);
-
-  XlaOp num = zero;
-  XlaOp denom = base_lanczos_coeff;
-  for (int i = 0; i < kLanczosCoefficients.size(); ++i) {
-    XlaOp lanczos_coefficient = ScalarLike(input, kLanczosCoefficients[i]);
-    XlaOp index = ScalarLike(input, i);
-    num = num - lanczos_coefficient / ((z + index + one) * (z + index + one));
-    denom = denom + lanczos_coefficient / (z + index + one);
-  }
+  auto& b = *input.builder();
+  return b.ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_RETURN_IF_ERROR(EnsureOperandIsRealFp("Digamma", input));
+
+    XlaOp zero = ScalarLike(input, 0);
+    XlaOp one_half = ScalarLike(input, 0.5);
+    XlaOp one = ScalarLike(input, 1);
+
+    XlaOp pi = ScalarLike(input, M_PI);
+
+    XlaOp lanczos_gamma = ScalarLike(input, kLanczosGamma);
+    XlaOp lanczos_gamma_plus_one_half = ScalarLike(input, kLanczosGamma + 0.5);
+    XlaOp log_lanczos_gamma_plus_one_half =
+        ScalarLike(input, std::log(kLanczosGamma + 0.5));
+
+    XlaOp base_lanczos_coeff = ScalarLike(input, kBaseLanczosCoeff);
+
+    // If the input is less than 0.5 use Euler's reflection formula:
+    // digamma(x) = digamma(1 - x) - pi * cot(pi * x)
+    XlaOp need_to_reflect = Lt(input, one_half);
+    XlaOp z = Select(need_to_reflect, -input, input - one);
+
+    XlaOp num = zero;
+    XlaOp denom = base_lanczos_coeff;
+    for (int i = 0; i < kLanczosCoefficients.size(); ++i) {
+      XlaOp lanczos_coefficient = ScalarLike(input, kLanczosCoefficients[i]);
+      XlaOp index = ScalarLike(input, i);
+      num = num - lanczos_coefficient / ((z + index + one) * (z + index + one));
+      denom = denom + lanczos_coefficient / (z + index + one);
+    }
 
-  // To improve accuracy on platforms with less-precise log implementations,
-  // compute log(lanczos_gamma_plus_one_half) at compile time and use log1p on
-  // the device.
-  // log(t) = log(kLanczosGamma + 0.5 + z)
-  //        = log(kLanczosGamma + 0.5) + log1p(z / (kLanczosGamma + 0.5))
-  XlaOp t = lanczos_gamma_plus_one_half + z;
-  XlaOp log_t =
-      log_lanczos_gamma_plus_one_half + Log1p(z / lanczos_gamma_plus_one_half);
-
-  XlaOp y = log_t + num / denom - lanczos_gamma / t;
-  XlaOp reflection = y - pi * Cos(pi * input) / Sin(pi * input);
-  XlaOp result = Select(need_to_reflect, reflection, y);
-  return result;
+    // To improve accuracy on platforms with less-precise log implementations,
+    // compute log(lanczos_gamma_plus_one_half) at compile time and use log1p on
+    // the device.
+    // log(t) = log(kLanczosGamma + 0.5 + z)
+    //        = log(kLanczosGamma + 0.5) + log1p(z / (kLanczosGamma + 0.5))
+    XlaOp t = lanczos_gamma_plus_one_half + z;
+    XlaOp log_t = log_lanczos_gamma_plus_one_half +
+                  Log1p(z / lanczos_gamma_plus_one_half);
+
+    XlaOp y = log_t + num / denom - lanczos_gamma / t;
+    XlaOp reflection = y - pi * Cos(pi * input) / Sin(pi * input);
+    return Select(need_to_reflect, reflection, y);
+  });
 }
 
 // Implements Banker's rounding: numbers that are equidistant between two
 // integers are rounded towards even.
 XlaOp RoundToEven(XlaOp x) {
-  auto half = ScalarLike(x, 0.5);
-  auto one = ScalarLike(x, 1.0);
-  auto two = ScalarLike(x, 2.0);
-
-  auto round_val = Floor(x);
-  auto fraction = x - round_val;
-  auto nearest_even_int = round_val - two * Floor(half * x);
-  auto is_odd = Eq(nearest_even_int, one);
-  return Select(Or(Gt(fraction, half), And(Eq(fraction, half), is_odd)),
-                round_val + one, round_val);
+  auto& b = *x.builder();
+  return b.ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    // Reject non-real non-fp inputs (What does it even mean to round a complex
+    // number?  Do you round each component equally?  In that case, you should
+    // just ask for that explicitly.)
+    TF_RETURN_IF_ERROR(EnsureOperandIsRealFp("RoundToEven", x));
+
+    auto half = ScalarLike(x, 0.5);
+    auto one = ScalarLike(x, 1.0);
+    auto two = ScalarLike(x, 2.0);
+
+    auto round_val = Floor(x);
+    auto fraction = x - round_val;
+    auto nearest_even_int = round_val - two * Floor(half * x);
+    auto is_odd = Eq(nearest_even_int, one);
+    return Select(Or(Gt(fraction, half), And(Eq(fraction, half), is_odd)),
+                  round_val + one, round_val);
+  });
 }
 
 // Trigonometric functions.
 
-// acos(x) = 2 * atan(sqrt(1 - x^2) / (1 + x))
+// acos(x) = 2 * atan(sqrt(1 - x^2) / (1 + x)) if x != -1
+//           pi                                if x == -1
 XlaOp Acos(XlaOp x) {
-  return ScalarLike(x, 2.0) *
-         Atan2(Sqrt(ScalarLike(x, 1.0) - x * x), ScalarLike(x, 1.0) + x);
+  return Select(Ne(x, FullLike(x, -1)),
+                ScalarLike(x, 2.0) * Atan2(Sqrt(ScalarLike(x, 1.0) - x * x),
+                                           ScalarLike(x, 1.0) + x),
+                FullLike(x, M_PI));
 }
 
 // asin(x) = 2 * atan(x / (1 + sqrt(1 - x^2)))
@@ -323,9 +471,88 @@ XlaOp MaybeConjugate(XlaOp x, bool conjugate) {
   XlaBuilder* builder = x.builder();
   return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
-    auto perform_conj = shape.element_type() == C64 && conjugate;
+    auto perform_conj =
+        primitive_util::IsComplexType(shape.element_type()) && conjugate;
     return perform_conj ? Conj(x) : x;
   });
 }
 
+XlaOp NextAfter(XlaOp from, XlaOp to) {
+  auto builder = from.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(auto shape, builder->GetShape(from));
+    int bitwidth = primitive_util::BitWidth(shape.element_type());
+    auto int_type = primitive_util::UnsignedIntegralTypeForBitWidth(bitwidth);
+    auto from_as_int = BitcastConvertType(from, int_type);
+    auto to_as_int = BitcastConvertType(to, int_type);
+
+    // The result is NaN if either "from" or "to" are NaN.
+    auto from_is_nan = Ne(from, from);
+    auto to_is_nan = Ne(to, to);
+    auto nan_input = Or(from_is_nan, to_is_nan);
+    auto result_for_nan =
+        Broadcast(ScalarLike(from, std::numeric_limits<double>::quiet_NaN()),
+                  shape.dimensions());
+    result_for_nan = BitcastConvertType(result_for_nan, int_type);
+
+    // The sign bit is the MSB.
+    const int64 sign_mask = int64{1} << (bitwidth - 1);
+    // Discard the sign bit to make the result non-negative.
+    auto from_abs = And(from_as_int, ScalarLike(from_as_int, ~sign_mask));
+    auto to_abs = And(to_as_int, ScalarLike(to_as_int, ~sign_mask));
+
+    // When both "from" and "to" are equal, the result is "to".
+    // N.B. It would not make a difference if we chose the result to be "from".
+    auto from_and_to_are_equal = Eq(from_as_int, to_as_int);
+    auto result_for_equal = to_as_int;
+
+    // When both "from" and "to" are both 0, the result is "to". This ensures we
+    // get a zero signed like "to".
+    auto from_is_zero = Eq(from_abs, ZerosLike(from_abs));
+    auto to_is_zero = Eq(to_abs, ZerosLike(to_abs));
+    auto result_for_both_zero = to_as_int;
+
+    auto from_sign = And(from_as_int, ScalarLike(from_as_int, sign_mask));
+    auto to_sign = And(to_as_int, ScalarLike(to_as_int, sign_mask));
+
+    // If from == 0 && to != 0, we need to return the smallest subnormal number
+    // signed like "to".
+    auto result_for_from_zero_to_non_zero =
+        Or(to_sign, ScalarLike(from_as_int, 1));
+
+    // If the sign of "from" and "to" disagree:
+    // - we need to make the magnitude of "from" smaller so that it is closer to
+    //   zero.
+    //
+    // Otherwise the signs agree:
+    // - "from" with a magnitude larger than "to" means we need to make the
+    //   magnitude smaller.
+    // - "from" with a magnitude smaller than "to" means we need to make the
+    //   magnitude larger.
+    // - "from" with the same magnitude and sign as "to" has already been
+    //   handled.
+    auto signs_disagree = Ne(from_sign, to_sign);
+    auto from_magnitude_larger_than_to = Gt(from_abs, to_abs);
+    auto result_has_smaller_magnitude =
+        Or(from_magnitude_larger_than_to, signs_disagree);
+    auto magnitude_adjustment =
+        Select(result_has_smaller_magnitude,
+               Broadcast(ScalarLike(from_as_int, -1), shape.dimensions()),
+               Broadcast(ScalarLike(from_as_int, 1), shape.dimensions()));
+    auto result = Add(from_as_int, magnitude_adjustment);
+    // Handle from == ±0.
+    result = Select(from_is_zero,
+                    Select(to_is_zero, result_for_both_zero,
+                           result_for_from_zero_to_non_zero),
+                    result);
+    // Handle from == to.
+    result = Select(from_and_to_are_equal, result_for_equal, result);
+    // Handle isnan(from) || isnan(to).
+    result = Select(nan_input, result_for_nan, result);
+
+    // Cast back to the original type.
+    return BitcastConvertType(result, shape.element_type());
+  });
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/math.h b/tensorflow/compiler/xla/client/lib/math.h
index 17612bf9fdc0f1eabb338671c93c025c5b268872..71a3acedcec0a8e65561d4139baeaf532ec8bf46 100644
--- a/tensorflow/compiler/xla/client/lib/math.h
+++ b/tensorflow/compiler/xla/client/lib/math.h
@@ -20,11 +20,22 @@ limitations under the License.
 
 namespace xla {
 
-// Computes the square root of 'operand'.
-XlaOp Sqrt(XlaOp operand);
-
-// Computes the reciprocal of the square root of 'operand'.
-XlaOp Rsqrt(XlaOp operand);
+// Determines whether operand is +/-inf or nan.
+//
+// Raises an error if called on integral or complex values.
+XlaOp IsPosInf(XlaOp operand);
+XlaOp IsNegInf(XlaOp operand);
+XlaOp IsInf(XlaOp operand);
+XlaOp IsNan(XlaOp operand);
+
+// Determines whether operand is equal to -0.
+//
+// Raises an error for integral or complex values.
+XlaOp IsNegZero(XlaOp operand);
+
+// Returns the next number after 'from' in the direction of 'to' the same way
+// std::nextafter(from, to) would.
+XlaOp NextAfter(XlaOp from, XlaOp to);
 
 // Computes the square of 'operand'.
 XlaOp Square(XlaOp operand);
@@ -32,7 +43,7 @@ XlaOp Square(XlaOp operand);
 // Computes the reciprocal of 'operand'.
 XlaOp Reciprocal(XlaOp operand);
 
-// Evaluates a polynomial given coefficients and `x`.
+// Evaluates a polynomial given coefficients and 'x'.
 // N.B. Coefficients should be supplied in decreasing order.
 XlaOp EvaluatePolynomial(XlaOp x, absl::Span<const float> coefficients);
 
@@ -86,7 +97,7 @@ XlaOp Cosh(XlaOp x);
 // Computes the hyperbolic sine of 'x'.
 XlaOp Sinh(XlaOp x);
 
-// Applies a complex conjugation operation if `a` is complex and `conjugate`
+// Applies a complex conjugation operation if 'a' is complex and 'conjugate'
 // is true, otherwise returns its argument.
 xla::XlaOp MaybeConjugate(xla::XlaOp x, bool conjugate);
 
diff --git a/tensorflow/compiler/xla/client/lib/math_exhaustive_test.cc b/tensorflow/compiler/xla/client/lib/math_exhaustive_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7f423d54dbb7ff911398b0137b482ee47f46c5c1
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/math_exhaustive_test.cc
@@ -0,0 +1,188 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/lib/math.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/types.h"
+
+namespace xla {
+namespace {
+
+using Eigen::half;
+
+struct Testcase {
+  Testcase(string name, const std::function<XlaOp(XlaOp)>& op,
+           float (*host_op)(float))
+      : name(name), op(op), host_op(host_op) {}
+
+  Testcase& set_tolerance(float abs_err, float rel_err) {
+    error.abs = abs_err;
+    error.rel = rel_err;
+    return *this;
+  }
+
+  Testcase& set_relaxed_nans() {
+    error.relaxed_nans = true;
+    return *this;
+  }
+
+  Testcase& set_fewer_infs_ok() {
+    error.fewer_infs_ok = true;
+    return *this;
+  }
+
+  Testcase& set_skip_pos_inf() {
+    skip_pos_inf = true;
+    return *this;
+  }
+
+  Testcase& set_skip_neg_inf() {
+    skip_neg_inf = true;
+    return *this;
+  }
+
+  Testcase& set_skip_infs() {
+    skip_pos_inf = true;
+    skip_neg_inf = true;
+    return *this;
+  }
+
+  Testcase& set_skip_neg_zero() {
+    skip_neg_zero = true;
+    return *this;
+  }
+
+  string name;
+  std::function<XlaOp(XlaOp)> op;
+  float (*host_op)(float);
+
+  ErrorSpec error{0.01, 0.01};
+
+  // If true, don't test +/-infinity or negative 0.
+  bool skip_pos_inf = false;
+  bool skip_neg_inf = false;
+  bool skip_neg_zero = false;
+};
+
+void PrintTo(const Testcase& tc, std::ostream* os) { *os << tc.name; }
+
+class MathExhaustiveTest : public ClientLibraryTestBase,
+                           public ::testing::WithParamInterface<Testcase> {
+ public:
+  MathExhaustiveTest() {
+    // Disable fast-math, otherwise we get the wrong results for e.g.
+    // sqrt(-inf).
+    SetFastMathDisabled(true);
+  }
+};
+
+// Checks a function's behavior on all fp16 values.
+//
+// TODO(jlebar): asin and lgamma tests fail on interpreter.
+XLA_TEST_P(MathExhaustiveTest, DISABLED_ON_INTERPRETER(F16)) {
+  const Testcase& tc = GetParam();
+  XlaBuilder b(TestName());
+
+  std::vector<half> input;
+  for (uint32 i = 0; i < 1 << 16; ++i) {
+    half h;
+    h.x = i;
+
+    // If we're not using infinity as an input, use 0 as a placeholder rather
+    // than simply skipping this element.  We do this because when the test
+    // framework reports an incorrect answer, it tells us which index failed.
+    // So long as our inputs are a simple list of all possible float16s, we can
+    // convert an index to a half with e.g. the following Python:
+    //
+    //   np.frombuffer(array('H', [12345]), dtype=np.float16)[0]
+    //
+    // but as soon as our list of inputs has any gaps, this doesn't work.
+    if (std::isinf(static_cast<float>(h)) &&
+        ((tc.skip_pos_inf && h > half{0}) ||
+         (tc.skip_neg_inf && h < half{0}))) {
+      h = half{0};
+    }
+
+    if (h == half{0} && tc.skip_neg_zero &&
+        std::signbit(static_cast<float>(h))) {
+      h = half{0};
+    }
+
+    input.push_back(h);
+  }
+
+  std::vector<half> expected_result;
+  for (const auto& h : input) {
+    expected_result.push_back(
+        static_cast<half>(tc.host_op(static_cast<float>(h))));
+  }
+
+  XlaOp param = AddParam(LiteralUtil::CreateR1<half>(input), &b);
+  tc.op(param);
+  ComputeAndCompareR1<half>(&b, expected_result, {}, tc.error);
+}
+
+// TODO(b/123355973): The following tests from math.cc are missing.
+//
+// - Many failures.
+//
+//   Testcase{"acosh", Acosh, std::acosh}.set_relaxed_nans(),
+//   Testcase{"asinh", Asinh, std::asinh},
+//   Testcase{"sinh", Sinh, std::sinh},
+//   Testcase{"cosh", Cosh, std::cosh}.set_fewer_infs_ok(),
+//   Testcase{"round_to_even", RoundToEven,
+//            [](float x) { return std::nearbyint(x / 2) * 2; }},
+//
+// - No equivalent std function to compare with.
+//
+//   Testcase{"erfinv", ErfInv, std::erfinv},
+//   Testcase{"digamma", Digamma, std::digamma},
+//
+// - Needs a special test (function takes two args, and simply computing in f32
+//   and downcasting to f16 doesn't give the correct answer).
+//
+//   Testcase{"nextafter", NextAfter, std::nextafter},
+//
+// TODO(b/123355973): Test math functions not from math.cc (e.g. log).
+// TODO(b/123355973): Test bf16 and f32.
+// TODO(b/123355973): Get rid of skip_infs / skip_neg_zero below if possible.
+// TODO(b/123355973): Reduce lgamma error if possible; it is very high.
+INSTANTIATE_TEST_CASE_P(
+    MathExhaustiveTest_Instantiation, MathExhaustiveTest,
+    ::testing::ValuesIn(std::vector<Testcase>{
+        Testcase{"sqrt", Sqrt, std::sqrt}.set_skip_neg_inf(),
+        Testcase{"rsqrt", Rsqrt, [](float x) { return 1 / std::sqrt(x); }}
+            .set_tolerance(0.05, 0.05)
+            .set_skip_infs()
+            .set_skip_neg_zero(),
+        Testcase{"square", Square, [](float x) { return x * x; }},
+        Testcase{"reciprocal", Reciprocal, [](float x) { return 1 / x; }},
+        Testcase{"erf", Erf, std::erf}.set_tolerance(0.001, 0.0001),
+        Testcase{"erfc", Erfc, std::erfc}.set_tolerance(0.001, 0.0001),
+        Testcase{"lgamma", Lgamma, std::lgamma}
+            .set_tolerance(0.1, 0.15)
+            .set_fewer_infs_ok(),
+        Testcase{"asin", Asin, std::asin}.set_skip_infs(),
+        Testcase{"acos", Acos, std::acos}.set_skip_infs(),
+        Testcase{"atan", Atan, std::atan},
+        Testcase{"tan", Tan, std::tan}.set_tolerance(0.05, 0.05),
+    }));
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/math_test.cc b/tensorflow/compiler/xla/client/lib/math_test.cc
index ae2ea225d1aadd7b3a794eabeca866c498f34760..bdfb0575f573716b54cf9116d155d8a3a55056e8 100644
--- a/tensorflow/compiler/xla/client/lib/math_test.cc
+++ b/tensorflow/compiler/xla/client/lib/math_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/lib/math.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
@@ -30,6 +31,138 @@ class MathTest : public ClientLibraryTestBase {
   ErrorSpec error_spec_{0.0001};
 };
 
+// Write TYPED_TESTs within the class definition so that we don't have to litter
+// "this->" everywhere.
+template <typename T>
+class MathTypedTest : public MathTest {
+ public:
+  void TestLogEdgeCases() {
+    SetFastMathDisabled(true);
+
+    XlaBuilder b(TestName());
+    Log(AddParam(LiteralUtil::CreateR1<T>({T{0.0}, T{-0.0}}), &b));
+    ComputeAndCompareR1<T>(&b,
+                           {-std::numeric_limits<T>::infinity(),
+                            -std::numeric_limits<T>::infinity()},
+                           {}, error_spec_);
+  }
+
+  void TestLog1pEdgeCases() {
+    SetFastMathDisabled(true);
+
+    XlaBuilder b(TestName());
+    Log1p(AddParam(LiteralUtil::CreateR1<T>({T{0.0}, T{-0.0}, T{-1.0}}), &b));
+    ComputeAndCompareR1<T>(
+        &b, {T{0.0}, T{-0.0}, -std::numeric_limits<T>::infinity()}, {},
+        error_spec_);
+  }
+
+  void TestIsInfOrNan() {
+    SetFastMathDisabled(true);
+
+    XlaBuilder b(TestName());
+    auto x =
+        ConstantR1<T>(&b, {
+                              T{0},
+                              T{100},
+                              T{-1000},
+                              T{std::numeric_limits<T>::max()},
+                              T{std::numeric_limits<T>::lowest()},
+                              T{std::numeric_limits<float>::infinity()},
+                              T{-std::numeric_limits<float>::infinity()},
+                              T{std::numeric_limits<float>::quiet_NaN()},
+                              T{std::numeric_limits<float>::signaling_NaN()},
+                          });
+    Tuple(&b, {IsFinite(x), IsInf(x), IsPosInf(x), IsNegInf(x), IsNan(x)});
+
+    auto expected = LiteralUtil::MakeTupleOwned(
+        LiteralUtil::CreateR1<bool>(
+            {true, true, true, true, true, false, false, false, false}),
+        LiteralUtil::CreateR1<bool>(
+            {false, false, false, false, false, true, true, false, false}),
+        LiteralUtil::CreateR1<bool>(
+            {false, false, false, false, false, true, false, false, false}),
+        LiteralUtil::CreateR1<bool>(
+            {false, false, false, false, false, false, true, false, false}),
+        LiteralUtil::CreateR1<bool>(
+            {false, false, false, false, false, false, false, true, true}));
+    ComputeAndCompareLiteral(&b, expected, {});
+  }
+
+  void TestIsNegZero() {
+    SetFastMathDisabled(true);
+    XlaBuilder b(TestName());
+    T inf(std::numeric_limits<float>::infinity());
+    T nan(std::numeric_limits<float>::quiet_NaN());
+    IsNegZero(AddParam(
+        LiteralUtil::CreateR1<T>({T{-0.0}, T{0}, T{1}, T{-1}, inf, -inf, nan}),
+        &b));
+
+    ComputeAndCompareLiteral(
+        &b,
+        LiteralUtil::CreateR1<bool>(
+            {true, false, false, false, false, false, false}),
+        {}, error_spec_);
+  }
+};
+
+// TODO(b/123355973): Add bfloat16 to TestTypes once it's working.
+#ifdef XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT16
+using TestTypes = ::testing::Types<float>;
+#else
+using TestTypes = ::testing::Types<float, Eigen::half>;
+#endif
+
+TYPED_TEST_CASE(MathTypedTest, TestTypes);
+
+XLA_TYPED_TEST(MathTypedTest, LogEdgeCases) { this->TestLogEdgeCases(); }
+XLA_TYPED_TEST(MathTypedTest, Log1pEdgeCases) { this->TestLog1pEdgeCases(); }
+XLA_TYPED_TEST(MathTypedTest, IsInfOrNan) { this->TestIsInfOrNan(); }
+XLA_TYPED_TEST(MathTypedTest, IsNegZero) { this->TestIsNegZero(); }
+
+// Check that certain ops only support real, floating-point inputs.
+//
+// TODO(jlebar): Expand this test to cover more ops.
+XLA_TEST_F(MathTest, RealFpOnlyOps) {
+  for (int64 i = PrimitiveType_MIN; i <= PrimitiveType_MAX; ++i) {
+    auto ty = static_cast<PrimitiveType>(i);
+    SCOPED_TRACE(PrimitiveType_Name(ty));
+    Shape shape;
+    if (primitive_util::IsArrayType(ty)) {
+      shape = ShapeUtil::MakeShape(ty, {42});
+    } else if (ty == PrimitiveType::TUPLE) {
+      shape = ShapeUtil::MakeTupleShape({});
+    } else if (ty == PrimitiveType::OPAQUE) {
+      shape = ShapeUtil::MakeOpaqueShape();
+    } else if (ty == PrimitiveType::TOKEN) {
+      shape = ShapeUtil::MakeTokenShape();
+    } else {
+      continue;
+    }
+
+    for (const auto& test :
+         std::vector<std::pair<std::function<XlaOp(XlaOp)>, string>>({
+             {IsFinite, "is_finite"},
+             {IsInf, "is_inf"},
+             {IsPosInf, "is_pos_inf"},
+             {IsNegInf, "is_neg_inf"},
+             {IsNan, "is_nan"},
+             {Erf, "erf"},
+             {Erfc, "erfc"},
+             {Lgamma, "lgamma"},
+             {Digamma, "digamma"},
+             {RoundToEven, "round_to_even"},
+         })) {
+      SCOPED_TRACE(test.second);
+      XlaBuilder b(TestName());
+      XlaOp p = Parameter(&b, 0, shape, "p0");
+      test.first(p);
+
+      EXPECT_EQ(b.first_error().ok(), primitive_util::IsFloatingPointType(ty));
+    }
+  }
+}
+
 XLA_TEST_F(MathTest, SqrtF32) {
   XlaBuilder builder(TestName());
   Literal zero_literal = LiteralUtil::Zero(PrimitiveType::F32);
@@ -106,6 +239,27 @@ XLA_TEST_F(MathTest, Lgamma) {
   ComputeAndCompareR1<float>(&builder, expected, {}, error_spec_);
 }
 
+XLA_TEST_F(MathTest, LgammaF16) {
+  SetFastMathDisabled(true);
+
+  XlaBuilder b(TestName());
+
+  // These seemingly arbitrary inputs came from debugging the lgamma
+  // implementation against a test which tried all possible f16 values.
+  auto x = ConstantR1<half>(&b, {
+                                    half(-7360.0),
+                                    half(-4066.0),
+                                    half(-5.9605e-08),
+                                });
+  Lgamma(x);
+  std::vector<half> expected = {
+      std::numeric_limits<half>::infinity(),
+      std::numeric_limits<half>::infinity(),
+      half(16.64),
+  };
+  ComputeAndCompareR1<half>(&b, expected, {}, ErrorSpec{0.1});
+}
+
 XLA_TEST_F(MathTest, Digamma) {
   XlaBuilder builder(TestName());
   auto x = ConstantR1<float>(&builder, {1.0, 0.5, 1 / 3.0, 0.25, 1 / 6.0, 0.125,
@@ -148,5 +302,40 @@ XLA_TEST_F(MathTest, RoundToEven) {
   ComputeAndCompareR1<float>(&builder, expected, {}, error_spec_);
 }
 
+XLA_TEST_F(MathTest, ErfRejectsComplexInputs) {
+  XlaBuilder b(TestName());
+  auto x = ConstantR1<std::complex<float>>(&b, {{0, 0}});
+  Erf(x);
+  EXPECT_FALSE(b.Build().status().ok());
+}
+
+XLA_TEST_F(MathTest, ErfcRejectsComplexInputs) {
+  XlaBuilder b(TestName());
+  auto x = ConstantR1<std::complex<float>>(&b, {{0, 0}});
+  Erfc(x);
+  EXPECT_FALSE(b.Build().status().ok());
+}
+
+XLA_TEST_F(MathTest, LgammaRejectsComplexInputs) {
+  XlaBuilder b(TestName());
+  auto x = ConstantR1<std::complex<float>>(&b, {{0, 0}});
+  Lgamma(x);
+  EXPECT_FALSE(b.Build().status().ok());
+}
+
+XLA_TEST_F(MathTest, DigammaRejectsComplexInputs) {
+  XlaBuilder b(TestName());
+  auto x = ConstantR1<std::complex<float>>(&b, {{0, 0}});
+  Digamma(x);
+  EXPECT_FALSE(b.Build().status().ok());
+}
+
+XLA_TEST_F(MathTest, RoundToEvenRejectsComplexInputs) {
+  XlaBuilder b(TestName());
+  auto x = ConstantR1<std::complex<float>>(&b, {{0, 0}});
+  RoundToEven(x);
+  EXPECT_FALSE(b.Build().status().ok());
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/matrix.cc b/tensorflow/compiler/xla/client/lib/matrix.cc
index ffd744d190885b8e3f4149a48a706498b3787618..a055a8e625c680cf5232896c95cd35b78cb172bc 100644
--- a/tensorflow/compiler/xla/client/lib/matrix.cc
+++ b/tensorflow/compiler/xla/client/lib/matrix.cc
@@ -15,40 +15,52 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/lib/matrix.h"
 
+#include <array>
 #include <numeric>
 #include <vector>
 
+#include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_set.h"
+#include "absl/strings/ascii.h"
+#include "absl/strings/str_split.h"
+#include "absl/strings/string_view.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/lib/slicing.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace xla {
 
 XlaOp IdentityMatrix(XlaBuilder* builder, PrimitiveType type, int64 m,
                      int64 n) {
-  auto a = Iota(builder, type, m);
-  auto b = Iota(builder, type, n);
+  auto a = Iota(builder, U32, m);
+  auto b = Iota(builder, U32, n);
   auto indicator = Eq(a, Broadcast(b, {m}), /*broadcast_dimensions=*/{0});
   return ConvertElementType(indicator, type);
 }
 
-XlaOp GetMatrixDiagonal(XlaOp x) {
+XlaOp GetMatrixDiagonal(XlaOp x, int k) {
   XlaBuilder* builder = x.builder();
   return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
-    const int64 n_dims = ShapeUtil::Rank(shape);
+    const int64 n_dims = shape.rank();
     TF_RET_CHECK(n_dims >= 2);
     const int64 m = shape.dimensions(n_dims - 2);
     const int64 n = shape.dimensions(n_dims - 1);
+
+    auto offset = ConstantR0WithType(builder, S32, k);
+
     absl::Span<const int64> major_dims =
         AsInt64Slice(shape.dimensions()).subspan(/*pos=*/0, /*len=*/n_dims - 2);
-    auto a = Iota(builder, U32, n);
-    auto b = Iota(builder, U32, m);
+    auto a = Iota(builder, S32, n);
+    auto b = Iota(builder, S32, m) + offset;
     auto indicator = Eq(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0});
     auto mask = Broadcast(indicator, major_dims);
 
@@ -58,111 +70,269 @@ XlaOp GetMatrixDiagonal(XlaOp x) {
         primitive_util::IsIntegralType(shape.element_type())
             ? CreateScalarOrComputation(shape.element_type(), builder)
             : CreateScalarAddComputation(shape.element_type(), builder);
-
-    return Reduce(Select(mask, x, Zeros(builder, shape)), ScalarLike(x, 0),
-                  reducer, {m >= n ? n_dims - 2 : n_dims - 1});
+    // k == 0, we can save one slice op.
+    if (k == 0) {
+      return Reduce(Select(mask, x, Zeros(builder, shape)), ScalarLike(x, 0),
+                    reducer, {m >= n ? n_dims - 2 : n_dims - 1});
+    } else if (k > 0) {
+      auto result = Reduce(Select(mask, x, Zeros(builder, shape)),
+                           ScalarLike(x, 0), reducer, {n_dims - 2});
+      return SliceInMinorDims(result, {std::min<int64>(k, n)},
+                              {std::min(m + k, n)});
+    } else {
+      auto result = Reduce(Select(mask, x, Zeros(builder, shape)),
+                           ScalarLike(x, 0), reducer, {n_dims - 1});
+      return SliceInMinorDims(result, {std::min<int64>(-k, m)},
+                              {std::min(m, n - k)});
+    }
   });
 }
 
-XlaOp Triangle(XlaOp x, bool lower) {
+XlaOp TriangleMask(XlaOp x, int diagonal) {
   XlaBuilder* builder = x.builder();
   return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
-    const int64 n_dims = ShapeUtil::Rank(shape);
+    const int64 n_dims = shape.rank();
     TF_RET_CHECK(n_dims >= 2);
     const int64 m = shape.dimensions(n_dims - 2);
     const int64 n = shape.dimensions(n_dims - 1);
     absl::Span<const int64> major_dims =
         AsInt64Slice(shape.dimensions()).subspan(/*pos=*/0, /*len=*/n_dims - 2);
-    auto a = Iota(builder, U32, n);
-    auto b = Iota(builder, U32, m);
+    auto a = Iota(builder, S32, n);
+    auto b = Iota(builder, S32, m) + ConstantR0<int32>(builder, diagonal);
     XlaOp indicator;
-    if (lower) {
-      indicator = Ge(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0});
-    } else {
-      indicator = Le(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0});
-    }
-    auto mask = Broadcast(indicator, major_dims);
-
-    return Select(mask, x, Zeros(builder, shape));
+    indicator = Ge(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0});
+    return Broadcast(indicator, major_dims);
   });
 }
 
+XlaOp Triangle(XlaOp x, bool lower) {
+  return lower ? Select(TriangleMask(x, 0), x, ZerosLike(x))
+               : Select(TriangleMask(x, -1), ZerosLike(x), x);
+}
+
 XlaOp UpperTriangle(XlaOp x) { return Triangle(x, false); }
 
 XlaOp LowerTriangle(XlaOp x) { return Triangle(x, true); }
 
-XlaOp BatchDot(XlaOp x, XlaOp y, PrecisionConfig::Precision precision) {
+Status ValidateEinsumNumericDimensions(absl::Span<const int64> x_config,
+                                       absl::Span<const int64> y_config,
+                                       absl::Span<const int64> output_config) {
+  for (auto dim : output_config) {
+    if (absl::c_linear_search(x_config, dim) ||
+        absl::c_linear_search(y_config, dim)) {
+      if (absl::c_count(output_config, dim) > 1) {
+        return InvalidArgument("Einsum has repeated output dimension.");
+      }
+      continue;
+    }
+    return InvalidArgument(
+        "Einsum has output dimension without corresponding input dimension.");
+  }
+  for (auto dim : x_config) {
+    if (absl::c_linear_search(y_config, dim) ||
+        absl::c_linear_search(output_config, dim)) {
+      if (absl::c_count(x_config, dim) > 1) {
+        return InvalidArgument("Einsum has repeated lhs dimension.");
+      }
+      continue;
+    }
+    return InvalidArgument(
+        "Einsum has lhs dimension without corresponding rhs or output "
+        "dimension.");
+  }
+  for (auto dim : y_config) {
+    if (absl::c_linear_search(x_config, dim) ||
+        absl::c_linear_search(output_config, dim)) {
+      if (absl::c_count(y_config, dim) > 1) {
+        return InvalidArgument("Einsum has repeated rhs dimension.");
+      }
+      continue;
+    }
+    return InvalidArgument(
+        "Einsum has rhs dimension without corresponding lhs or output "
+        "dimension.");
+  }
+  return Status::OK();
+}
+
+xla::XlaOp Einsum(xla::XlaOp x, absl::Span<const int64> x_config, xla::XlaOp y,
+                  absl::Span<const int64> y_config,
+                  absl::Span<const int64> output_config,
+                  xla::PrecisionConfig::Precision precision) {
   XlaBuilder* builder = x.builder();
   return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
-    TF_ASSIGN_OR_RETURN(Shape x_shape, builder->GetShape(x));
-    TF_ASSIGN_OR_RETURN(Shape y_shape, builder->GetShape(y));
+    TF_RETURN_IF_ERROR(
+        ValidateEinsumNumericDimensions(x_config, y_config, output_config));
+    const int64 x_rank = x_config.size();
+    const int64 y_rank = y_config.size();
+    const int64 output_rank = output_config.size();
+    absl::flat_hash_set<int64> x_map;
+    absl::flat_hash_set<int64> y_map;
+    absl::flat_hash_set<int64> output_map;
+
+    auto find = [&](const absl::flat_hash_set<int64>& map, int64 d) {
+      return map.count(d) != 0;
+    };
 
-    // Check that both tensors have the same number of dimensions. There must be
-    // at least two (the batch dimensions can be empty).
-    if (ShapeUtil::Rank(x_shape) != ShapeUtil::Rank(y_shape)) {
-      return InvalidArgument(
-          "Arguments to BatchDot have different ranks: %s vs. %s",
-          ShapeUtil::HumanString(x_shape), ShapeUtil::HumanString(y_shape));
+    auto insert = [&](absl::flat_hash_set<int64>& map, char d) {
+      CHECK(!find(map, d));
+      map.insert(d);
+    };
+
+    for (auto d : x_config) {
+      insert(x_map, d);
     }
-    const int ndims = ShapeUtil::Rank(x_shape);
-    if (ndims < 2) {
-      return InvalidArgument(
-          "Arguments to BatchDot must have rank >= 2: got %d", ndims);
+
+    for (auto d : y_config) {
+      insert(y_map, d);
     }
 
-    // The batch dimensions must be equal and the matrix dimensions must be
-    // valid.
-    std::vector<int64> batch_dimension_numbers;
-    for (int i = 0; i < ndims - 2; ++i) {
-      if (x_shape.dimensions(i) != y_shape.dimensions(i)) {
-        return InvalidArgument(
-            "Dimension %d of inputs to BatchDot must be equal: shapes %s vs %s",
-            i, ShapeUtil::HumanString(x_shape),
-            ShapeUtil::HumanString(y_shape));
-      }
-      batch_dimension_numbers.push_back(i);
+    for (auto d : output_config) {
+      insert(output_map, d);
     }
 
-    int x_inner_dim = ndims - 1;
-    int y_inner_dim = ndims - 2;
-    if (x_shape.dimensions(x_inner_dim) != y_shape.dimensions(y_inner_dim)) {
-      return InvalidArgument(
-          "Dimensions %d and %d of arguments to BatchDot must be equal: "
-          "shapes %s vs %s",
-          x_inner_dim, y_inner_dim, ShapeUtil::HumanString(x_shape),
-          ShapeUtil::HumanString(y_shape));
+    DotDimensionNumbers dnums;
+    std::vector<int64> lhs_outer_dims;
+    auto is_batch_dim = [&](int64 d) {
+      return find(x_map, d) && find(y_map, d) && find(output_map, d);
+    };
+    auto is_contracting = [&](int64 d) {
+      return find(x_map, d) && find(y_map, d);
+    };
+    auto rhs_dimension_number = [&](int64 d) {
+      return absl::c_find(y_config, d) - y_config.begin();
+    };
+    for (int64 i = 0; i < x_rank; ++i) {
+      auto dim_name = x_config[i];
+      if (is_batch_dim(dim_name)) {
+        dnums.add_lhs_batch_dimensions(i);
+        dnums.add_rhs_batch_dimensions(rhs_dimension_number(dim_name));
+      } else if (is_contracting(dim_name)) {
+        dnums.add_lhs_contracting_dimensions(i);
+        dnums.add_rhs_contracting_dimensions(rhs_dimension_number(dim_name));
+      } else {
+        lhs_outer_dims.push_back(i);
+      }
     }
 
-    // Check for zero lhs/rhs dim size.
-    if (ShapeUtil::IsZeroElementArray(x_shape) ||
-        ShapeUtil::IsZeroElementArray(y_shape)) {
-      std::vector<int64> dimensions(batch_dimension_numbers.size());
-      for (int i = 0; i < batch_dimension_numbers.size(); ++i) {
-        dimensions[i] = x_shape.dimensions(batch_dimension_numbers[i]);
+    std::vector<int64> rhs_outer_dims;
+    for (int64 i = 0; i < y_rank; ++i) {
+      auto dim_name = y_config[i];
+      if (!is_batch_dim(dim_name) && !is_contracting(dim_name)) {
+        rhs_outer_dims.push_back(i);
       }
-      int x_outer_dim = ndims - 2;
-      int y_outer_dim = ndims - 1;
-      dimensions.push_back(x_shape.dimensions(x_outer_dim));
-      dimensions.push_back(y_shape.dimensions(y_outer_dim));
-      return Broadcast(
-          ConstantLiteral(builder, LiteralUtil::Zero(x_shape.element_type())),
-          dimensions);
+    }
+
+    auto output_dimension_number = [&](char d) {
+      return absl::c_find(output_config, d) - output_config.begin();
+    };
+
+    std::vector<int64> output_dims;
+    output_dims.reserve(output_rank);
+    for (auto d : dnums.lhs_batch_dimensions()) {
+      output_dims.push_back(output_dimension_number(x_config[d]));
+    }
+    for (auto d : lhs_outer_dims) {
+      output_dims.push_back(output_dimension_number(x_config[d]));
+    }
+    for (auto d : rhs_outer_dims) {
+      output_dims.push_back(output_dimension_number(y_config[d]));
+    }
+
+    std::vector<int64> transpose_dims(output_rank);
+    for (int64 i = 0; i < output_rank; ++i) {
+      transpose_dims[output_dims[i]] = i;
     }
 
     PrecisionConfig precision_proto;
     precision_proto.add_operand_precision(precision);
     precision_proto.add_operand_precision(precision);
+    return Transpose(DotGeneral(x, y, dnums, &precision_proto), transpose_dims);
+  });
+}
+
+XlaOp BatchDot(XlaOp x, XlaOp y, PrecisionConfig::Precision precision) {
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape x_shape, builder->GetShape(x));
+    TF_ASSIGN_OR_RETURN(Shape y_shape, builder->GetShape(y));
 
-    DotDimensionNumbers dot_dnums;
-    dot_dnums.add_lhs_contracting_dimensions(x_inner_dim);
-    dot_dnums.add_rhs_contracting_dimensions(y_inner_dim);
-    for (auto batch_dimension_number : batch_dimension_numbers) {
-      dot_dnums.add_lhs_batch_dimensions(batch_dimension_number);
-      dot_dnums.add_rhs_batch_dimensions(batch_dimension_number);
+    // The batch dimensions must be equal and the matrix dimensions must be
+    // valid.
+    std::vector<int64> batch_dimension_numbers;
+    const int ndims = x_shape.rank();
+    batch_dimension_numbers.reserve(ndims - 2);
+    for (int i = 0; i < ndims - 2; ++i) {
+      batch_dimension_numbers.push_back(i);
+    }
+    std::vector<int64> x_config = batch_dimension_numbers;
+    x_config.push_back(ndims - 2);
+    x_config.push_back(ndims);
+    std::vector<int64> y_config = batch_dimension_numbers;
+    y_config.push_back(ndims);
+    y_config.push_back(ndims - 1);
+    std::vector<int64> output_config = batch_dimension_numbers;
+    output_config.push_back(ndims - 2);
+    output_config.push_back(ndims - 1);
+    return Einsum(x, x_config, y, y_config, output_config, precision);
+  });
+}
+
+StatusOr<std::array<std::vector<int64>, 3>> ParseEinsumString(
+    absl::string_view einsum_config) {
+  std::array<std::vector<int64>, 3> einsum_config_numeric;
+  std::vector<absl::string_view> main_split =
+      absl::StrSplit(einsum_config, ',');
+
+  if (main_split.size() != 2) {
+    return InvalidArgument("Expected one \",\" in einsum_config.");
+  }
+
+  auto maybe_invalid_character = [](char d) {
+    if (absl::ascii_isalpha(d)) {
+      return Status::OK();
     }
+    if (d == '.') {
+      return InvalidArgument("Unsupported \"...\" or \".\" in einsum config.");
+    }
+    return InvalidArgument("Unexpected character in einsum config.");
+  };
+
+  auto& x_config = einsum_config_numeric[0];
+  x_config.reserve(main_split[0].size());
+  for (auto d : main_split[0]) {
+    TF_RETURN_IF_ERROR(maybe_invalid_character(d));
+    x_config.push_back(static_cast<int64>(d));
+  }
+  std::vector<absl::string_view> y_output_split =
+      absl::StrSplit(main_split[1], "->");
+  if (y_output_split.size() != 2) {
+    return InvalidArgument("Expected one \"->\" in einsum_config.");
+  }
+  auto& y_config = einsum_config_numeric[1];
+  y_config.reserve(y_output_split[0].size());
+  for (auto d : y_output_split[0]) {
+    TF_RETURN_IF_ERROR(maybe_invalid_character(d));
+    y_config.push_back(static_cast<int64>(d));
+  }
+  auto& output_config = einsum_config_numeric[2];
+  output_config.reserve(y_output_split[1].size());
+  for (auto d : y_output_split[1]) {
+    TF_RETURN_IF_ERROR(maybe_invalid_character(d));
+    output_config.push_back(static_cast<int64>(d));
+  }
+  return einsum_config_numeric;
+}
 
-    return DotGeneral(x, y, dot_dnums, &precision_proto);
+XlaOp Einsum(XlaOp x, XlaOp y, absl::string_view einsum_config,
+             PrecisionConfig::Precision precision) {
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(auto einsum_config_numeric,
+                        ParseEinsumString(einsum_config));
+    return Einsum(x, einsum_config_numeric[0], y, einsum_config_numeric[1],
+                  einsum_config_numeric[2], precision);
   });
 }
 
@@ -170,7 +340,7 @@ XlaOp TransposeInMinorDims(XlaOp x) {
   XlaBuilder* builder = x.builder();
   return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
-    const int64 n_dims = ShapeUtil::Rank(shape);
+    const int64 n_dims = shape.rank();
     TF_RET_CHECK(n_dims >= 2);
     std::vector<int64> permutation(n_dims);
     std::iota(permutation.begin(), permutation.end(), 0);
@@ -182,4 +352,5 @@ XlaOp TransposeInMinorDims(XlaOp x) {
 XlaOp MaybeTransposeInMinorDims(XlaOp x, bool transpose) {
   return transpose ? TransposeInMinorDims(x) : x;
 }
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/matrix.h b/tensorflow/compiler/xla/client/lib/matrix.h
index 8856f99c7a0fee8f315aac11fab392cf5536f57b..60c41ec45a086726086dac7227fc432a9c62d0c8 100644
--- a/tensorflow/compiler/xla/client/lib/matrix.h
+++ b/tensorflow/compiler/xla/client/lib/matrix.h
@@ -16,7 +16,11 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_MATRIX_H_
 #define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_MATRIX_H_
 
+#include <array>
+#include "absl/strings/string_view.h"
+#include "absl/types/span.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
@@ -26,10 +30,19 @@ namespace xla {
 // else.
 XlaOp IdentityMatrix(XlaBuilder* builder, PrimitiveType type, int64 m, int64 n);
 
-// Get the diagonals of the last two dimensions. If 'x' has shape
-// [..., M, N], then the output has shape [..., min(M, N)], containing the
-// diagonal elements (i.e., with indices [..., i, i]).
-XlaOp GetMatrixDiagonal(XlaOp x);
+// Get the diagonals of the last two dimensions. Use k>0 for diagonals above the
+// main diagonal, and k<0 for diagonals below the main diagonal.
+//
+// If 'x' has shape [..., M, N]
+//  If k >= 0: then the output has shape [..., min(M, N - k)], containing the
+//            diagonal elements (i.e., with indices [..., i, i + k]).
+//  If k < 0: then the output has shape [..., min(M + k, N)], containing the
+//            diagonal elements (i.e., with indices [..., i - k, i]).
+XlaOp GetMatrixDiagonal(XlaOp x, int k = 0);
+
+// Returns a lower-triangular mask, i.e., true below the `diagonal`-th diagonal
+// and false above that diagonal.
+XlaOp TriangleMask(XlaOp x, int diagonal);
 
 // Get the upper or lower triangle part of the last two dimensions
 XlaOp Triangle(XlaOp x, bool lower);
@@ -61,6 +74,40 @@ xla::XlaOp BatchDot(
     xla::XlaOp x, xla::XlaOp y,
     xla::PrecisionConfig::Precision precision = xla::PrecisionConfig::DEFAULT);
 
+// Parse an einsum string into dimension numbers:
+//   "ab,cb->ac"
+// becomes:
+//   {{0, 1},{2, 1},{0, 2}}
+//
+// NOTE: This function is meant for testing, there is no need to call it
+// directly.
+
+StatusOr<std::array<std::vector<int64>, 3>> ParseEinsumString(
+    absl::string_view einsum_config);
+
+// Determine if each dimension label is in at least two inputs.
+//
+// NOTE: This function is meant for testing, there is no need to call it
+// directly.
+Status ValidateEinsumNumericDimensions(absl::Span<const int64> x_config,
+                                       absl::Span<const int64> y_config,
+                                       absl::Span<const int64> output_config);
+
+// Supports two operand einsum notation like "ab,cb->ac".
+xla::XlaOp Einsum(
+    xla::XlaOp x, xla::XlaOp y, absl::string_view einsum_config,
+    xla::PrecisionConfig::Precision precision = xla::PrecisionConfig::DEFAULT);
+
+// Same as above but supporting numeric labels on dimensins. So "ab,cb->ac"
+// becomes:
+//   x_config = {0, 1}
+//   y_config = {2, 1}
+//   output_config = {0, 2}
+xla::XlaOp Einsum(
+    xla::XlaOp x, absl::Span<const int64> x_config, xla::XlaOp y,
+    absl::Span<const int64> y_config, absl::Span<const int64> output_config,
+    xla::PrecisionConfig::Precision precision = xla::PrecisionConfig::DEFAULT);
+
 // Transposes a stack of matrices `x` by swapping the last two dimensions.
 xla::XlaOp TransposeInMinorDims(xla::XlaOp x);
 
diff --git a/tensorflow/compiler/xla/client/lib/matrix_test.cc b/tensorflow/compiler/xla/client/lib/matrix_test.cc
index 0593a7517ac125ca8dc5395cee76f6bc23232cd3..a93fc2ccb92912a10b9b6c2192b81cd73566f2a0 100644
--- a/tensorflow/compiler/xla/client/lib/matrix_test.cc
+++ b/tensorflow/compiler/xla/client/lib/matrix_test.cc
@@ -15,13 +15,15 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/lib/matrix.h"
 
+#include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/client/lib/slicing.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/status.h"
+#include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace xla {
 namespace {
@@ -51,13 +53,24 @@ void MatrixTest::TestMatrixDiagonal() {
   XlaBuilder builder("GetMatrixDiagonal");
   Array3D<T> input(2, 3, 4);
   input.FillIota(0);
-
-  XlaOp a;
-  auto a_data = CreateR3Parameter<T>(input, 0, "a", &builder, &a);
-  GetMatrixDiagonal(a);
-  Array2D<T> expected({{0, 5, 10}, {12, 17, 22}});
-
-  ComputeAndCompareR2<T>(&builder, expected, {a_data.get()});
+  std::map<int, Array2D<T>> k_and_expected = {
+      {0, {{0, 5, 10}, {12, 17, 22}}},
+      {1, {{1, 6, 11}, {13, 18, 23}}},
+      {2, {{2, 7}, {14, 19}}},
+      {3, {{3}, {15}}},
+      {4, {{}, {}}},
+      {-1, {{4, 9}, {16, 21}}},
+      {-2, {{8}, {20}}},
+      {-3, {{}, {}}},
+      {-4, {{}, {}}},
+  };
+  for (const auto& kv : k_and_expected) {
+    XlaOp a;
+    auto a_data = CreateR3Parameter<T>(input, 0, "a", &builder, &a);
+    GetMatrixDiagonal(a, kv.first);
+
+    ComputeAndCompareR2<T>(&builder, kv.second, {a_data.get()});
+  }
 }
 
 XLA_TEST_F(MatrixTest, GetMatrixDiagonal_S32) { TestMatrixDiagonal<int32>(); }
@@ -101,5 +114,78 @@ XLA_TEST_F(MatrixTest, RowBatchDot) {
   ComputeAndCompareR3<float>(&builder, {{{33}}, {{292}}},
                              {a_data.get(), row_data.get(), index_data.get()});
 }
+
+XLA_TEST_F(MatrixTest, Einsum) {
+  XlaBuilder builder(TestName());
+
+  int n = 4;
+
+  XlaOp a, row, index;
+  auto a_data =
+      CreateR3Parameter<float>(BatchedAValsFull(), 0, "a", &builder, &a);
+  auto row_data = CreateR3Parameter<float>({{{9, 1, 0, 0}}, {{2, 4, 0, 0}}}, 1,
+                                           "row", &builder, &row);
+  // Select {{3, 6, 0, 1}, {24, 61,  82,  48}} out of BatchedAValsFull().
+  auto index_data = CreateR0Parameter<int>(1, 2, "index", &builder, &index);
+
+  auto l_index = DynamicSliceInMinorDims(
+      a, {index, ConstantR0<int32>(&builder, 0)}, {1, n});
+  Einsum(l_index, row, "abc,adc->abd");
+
+  ComputeAndCompareR3<float>(&builder, {{{33}}, {{292}}},
+                             {a_data.get(), row_data.get(), index_data.get()});
+}
+
+XLA_TEST_F(MatrixTest, ParseEinsumString) {
+  auto to_vec = [](absl::string_view s) {
+    std::vector<int64> v;
+    v.reserve(s.size());
+    for (auto c : s) {
+      v.push_back(int64{c});
+    }
+    return v;
+  };
+
+  auto to_string = [&](absl::string_view x, absl::string_view y,
+                       absl::string_view o) {
+    return absl::StrCat(x, ",", y, "->", o);
+  };
+
+  std::vector<std::vector<string>> good_test_cases = {{"ab", "bc", "ac"},
+                                                      {"Bab", "Bbc", "Bac"},
+                                                      {"ab", "cd", "dcba"},
+                                                      {"abc", "abd", "cbd"}};
+  for (auto test_case : good_test_cases) {
+    auto parse_result_or_status =
+        ParseEinsumString(to_string(test_case[0], test_case[1], test_case[2]));
+    EXPECT_TRUE(parse_result_or_status.status().ok());
+    auto parse_result = parse_result_or_status.ValueOrDie();
+    for (int i = 0; i < 3; ++i) {
+      EXPECT_EQ(parse_result[i], to_vec(test_case[i]));
+    }
+    EXPECT_TRUE(ValidateEinsumNumericDimensions(
+                    parse_result[0], parse_result[1], parse_result[2])
+                    .ok());
+  }
+
+  std::vector<string> einsum_strings_that_fail_parsing = {
+      "", "a", "ab->ba", "ab,bc,cd->ad", "a...b,bc->a...c"};
+  for (auto test_case : einsum_strings_that_fail_parsing) {
+    auto parse_result_or_status = ParseEinsumString(test_case);
+    EXPECT_FALSE(parse_result_or_status.status().ok());
+  }
+
+  std::vector<string> einsum_strings_that_fail_numeric_validation = {
+      "a,b->c", "ab,bc->acd", "abz,bc->ac", "ab,bcz->ac"};
+  for (auto test_case : einsum_strings_that_fail_numeric_validation) {
+    auto parse_result_or_status = ParseEinsumString(test_case);
+    EXPECT_TRUE(parse_result_or_status.status().ok());
+    auto parse_result = parse_result_or_status.ValueOrDie();
+    EXPECT_FALSE(ValidateEinsumNumericDimensions(
+                     parse_result[0], parse_result[1], parse_result[2])
+                     .ok());
+  }
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/tf2xla/lib/qr.cc b/tensorflow/compiler/xla/client/lib/qr.cc
similarity index 62%
rename from tensorflow/compiler/tf2xla/lib/qr.cc
rename to tensorflow/compiler/xla/client/lib/qr.cc
index d6007748609fdd161cb89692a167eb7ed12fe00c..640412ec8bcffd2565b11ba25b87f6bf6438d848 100644
--- a/tensorflow/compiler/tf2xla/lib/qr.cc
+++ b/tensorflow/compiler/xla/client/lib/qr.cc
@@ -13,15 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/tf2xla/lib/qr.h"
+#include "tensorflow/compiler/xla/client/lib/qr.h"
 
 #include <memory>
 #include <vector>
 
-#include "tensorflow/compiler/tf2xla/lib/util.h"
-#include "tensorflow/compiler/tf2xla/lib/while_loop.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/lib/loops.h"
 #include "tensorflow/compiler/xla/client/lib/math.h"
 #include "tensorflow/compiler/xla/client/lib/matrix.h"
 #include "tensorflow/compiler/xla/client/lib/slicing.h"
@@ -32,10 +31,18 @@ limitations under the License.
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/core/lib/core/errors.h"
 
-namespace tensorflow {
+namespace xla {
 
 namespace {
 
+std::vector<int64> ConcatVectors(absl::Span<const int64> xs,
+                                 absl::Span<const int64> ys) {
+  std::vector<int64> output(xs.size() + ys.size());
+  std::copy(xs.begin(), xs.end(), output.begin());
+  std::copy(ys.begin(), ys.end(), output.begin() + xs.size());
+  return output;
+}
+
 // Computes a Householder reflection of the form:
 // H = I - tau v v.T.
 // such that
@@ -65,52 +72,47 @@ namespace {
 //   return (v, tau, beta)
 // TODO(phawkins): LAPACK's xLARFG implementation has code for handling
 // overflows in the norm/beta calculations. Perhaps do the same here.
-xla::Status House(xla::XlaOp x, xla::XlaOp k,
-                  absl::Span<const int64> batch_dims, const int64 m,
-                  xla::XlaOp* v, xla::XlaOp* tau, xla::XlaOp* beta) {
-  xla::XlaBuilder* const builder = x.builder();
-  TF_ASSIGN_OR_RETURN(xla::Shape x_shape, builder->GetShape(x));
-  const xla::PrimitiveType type = x_shape.element_type();
+Status House(XlaOp x, XlaOp k, absl::Span<const int64> batch_dims,
+             const int64 m, XlaOp* v, XlaOp* tau, XlaOp* beta) {
+  XlaBuilder* const builder = x.builder();
+  TF_ASSIGN_OR_RETURN(Shape x_shape, builder->GetShape(x));
+  const PrimitiveType type = x_shape.element_type();
 
   std::vector<int64> batch_dim_ids(batch_dims.size());
   std::iota(batch_dim_ids.begin(), batch_dim_ids.end(), 0);
   const int64 minor_dim = batch_dims.size();
 
-  xla::XlaOp zero = xla::ScalarLike(x, 0.0);
-  xla::XlaOp one = xla::ScalarLike(x, 1.0);
+  XlaOp zero = ScalarLike(x, 0.0);
+  XlaOp one = ScalarLike(x, 1.0);
 
   // alpha = x[k]
-  xla::XlaOp alpha =
-      xla::Reshape(DynamicSliceInMinorDims(x, {k}, {1}), batch_dims);
+  XlaOp alpha = Reshape(DynamicSliceInMinorDims(x, {k}, {1}), batch_dims);
 
   // Compute x[k+1:] (padded with zeros in elements 0..k)
-  xla::XlaOp iota = xla::Iota(builder, xla::S32, m);
-  xla::XlaOp x_after_k =
-      xla::Mul(x, xla::ConvertElementType(xla::Gt(iota, k), type),
-               /*broadcast_dimensions=*/{minor_dim});
+  XlaOp iota = Iota(builder, S32, m);
+  XlaOp x_after_k = Mul(x, ConvertElementType(Gt(iota, k), type),
+                        /*broadcast_dimensions=*/{minor_dim});
 
   // sigma = np.dot(x[k+1:], x[k+1:])
-  auto sigma =
-      xla::Reduce(x_after_k * x_after_k, zero,
-                  xla::CreateScalarAddComputation(type, builder), {minor_dim});
+  auto sigma = Reduce(x_after_k * x_after_k, zero,
+                      CreateScalarAddComputation(type, builder), {minor_dim});
   // mu = np.sqrt(x[k]*x[k] + sigma)
-  auto mu = xla::Sqrt(xla::Square(alpha) + sigma);
+  auto mu = Sqrt(Square(alpha) + sigma);
 
-  auto sigma_is_zero = xla::Eq(sigma, zero);
+  auto sigma_is_zero = Eq(sigma, zero);
 
-  *beta = xla::Select(sigma_is_zero, alpha, -xla::Sign(alpha) * mu);
-  *tau = xla::Select(sigma_is_zero, xla::Broadcast(zero, batch_dims),
-                     (*beta - alpha) / *beta);
-  auto divisor = xla::Select(sigma_is_zero, xla::Broadcast(one, batch_dims),
-                             alpha - *beta);
+  *beta = Select(sigma_is_zero, alpha, -Sign(alpha) * mu);
+  *tau = Select(sigma_is_zero, Broadcast(zero, batch_dims),
+                (*beta - alpha) / *beta);
+  auto divisor =
+      Select(sigma_is_zero, Broadcast(one, batch_dims), alpha - *beta);
 
-  auto e_k = xla::Broadcast(xla::ConvertElementType(xla::Eq(iota, k), type),
-                            std::vector<int64>(batch_dims.size(), 1));
+  auto e_k = Broadcast(ConvertElementType(Eq(iota, k), type),
+                       std::vector<int64>(batch_dims.size(), 1));
 
   // Form v as [0, 0, ..., 1] ++ x[k+1:] / divisor
   // If sigma is zero, x[k+1:] is zero, so use any non-zero divisor.
-  *v = e_k +
-       xla::Div(x_after_k, divisor, /*broadcast_dimensions=*/batch_dim_ids);
+  *v = e_k + Div(x_after_k, divisor, /*broadcast_dimensions=*/batch_dim_ids);
   return Status::OK();
 }
 
@@ -143,90 +145,86 @@ xla::Status House(xla::XlaOp x, xla::XlaOp k,
 //   return (q, vs, taus)
 struct QRBlockResult {
   // The factored R value
-  xla::XlaOp r;
+  XlaOp r;
 
   // Representation of the Householder matrices I - beta v v.T
-  xla::XlaOp taus;  // Shape: [..., n]
-  xla::XlaOp vs;    // Shape: [..., m, n]
+  XlaOp taus;  // Shape: [..., n]
+  XlaOp vs;    // Shape: [..., m, n]
 };
-xla::StatusOr<QRBlockResult> QRBlock(
-    xla::XlaOp a, xla::PrecisionConfig::Precision precision) {
-  xla::XlaBuilder* builder = a.builder();
-  TF_ASSIGN_OR_RETURN(xla::Shape a_shape, builder->GetShape(a));
-  const int num_dims = xla::ShapeUtil::Rank(a_shape);
+StatusOr<QRBlockResult> QRBlock(XlaOp a, PrecisionConfig::Precision precision) {
+  XlaBuilder* builder = a.builder();
+  TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a));
+  const int num_dims = a_shape.rank();
   if (num_dims < 2) {
-    return errors::InvalidArgument("Arguments to QR must have rank >= 2: ",
-                                   num_dims);
+    return InvalidArgument("Argument to QR must have rank >= 2; got shape %s",
+                           a_shape.ToString());
   }
-  xla::PrimitiveType type = a_shape.element_type();
+  PrimitiveType type = a_shape.element_type();
 
-  const int64 m = xla::ShapeUtil::GetDimension(a_shape, -2);
-  const int64 n = xla::ShapeUtil::GetDimension(a_shape, -1);
+  const int64 m = ShapeUtil::GetDimension(a_shape, -2);
+  const int64 n = ShapeUtil::GetDimension(a_shape, -1);
 
   const int64 num_batch_dims = num_dims - 2;
   std::vector<int64> batch_dims(num_batch_dims);
   for (int i = 0; i < num_batch_dims; ++i) {
-    batch_dims[i] = xla::ShapeUtil::GetDimension(a_shape, i);
+    batch_dims[i] = ShapeUtil::GetDimension(a_shape, i);
   }
 
   std::vector<int64> batch_dim_indices(num_batch_dims);
   std::iota(batch_dim_indices.begin(), batch_dim_indices.end(), 0);
 
-  auto qr_body_fn =
-      [&](xla::XlaOp j, absl::Span<const xla::XlaOp> values,
-          xla::XlaBuilder* builder) -> xla::StatusOr<std::vector<xla::XlaOp>> {
+  auto qr_body_fn = [&](XlaOp j, absl::Span<const XlaOp> values,
+                        XlaBuilder* builder) -> StatusOr<std::vector<XlaOp>> {
     auto a = values[0];
     auto vs = values[1];
     auto taus = values[2];
 
     // v, beta = house(a[:, j], j)
     auto x = DynamicSliceInMinorDims(a, {j}, {1});
-    xla::XlaOp v, tau, beta;
-    TF_RETURN_IF_ERROR(House(xla::Collapse(x, {num_dims - 2, num_dims - 1}), j,
+    XlaOp v, tau, beta;
+    TF_RETURN_IF_ERROR(House(Collapse(x, {num_dims - 2, num_dims - 1}), j,
                              batch_dims, m, &v, &tau, &beta));
 
     std::vector<int64> shape = batch_dims;
     shape.push_back(1);
     shape.push_back(m);
-    auto v_broadcast = xla::Reshape(v, shape);
+    auto v_broadcast = Reshape(v, shape);
     // a[:, :] -= tau * np.dot(v[:, np.newaxis],
     //                          np.dot(v[np.newaxis, :], a[:, :]))
     auto vva = BatchDot(v_broadcast, a, precision);
     vva = BatchDot(TransposeInMinorDims(v_broadcast), vva, precision);
-    a = a - xla::Mul(tau, vva,
-                     /*broadcast_dimensions=*/batch_dim_indices);
+    a = a - Mul(tau, vva,
+                /*broadcast_dimensions=*/batch_dim_indices);
 
     // It is more precise to populate column 'k' explicitly, rather than
     // computing it implicitly by applying the Householder transformation.
     // a[k,k] = beta
     // a[k+1:,k] = np.zeros([m-k-1], dtype=a.dtype)
-    auto iota = xla::Reshape(xla::Iota(a.builder(), xla::S32, m), {m, 1});
-    auto predecessor_mask = xla::ConvertElementType(xla::Lt(iota, j), type);
-    auto mask = xla::Broadcast(xla::ConvertElementType(xla::Eq(iota, j), type),
-                               std::vector<int64>(batch_dims.size(), 1));
-    auto new_x =
-        xla::Mul(x, predecessor_mask,
-                 /*broadcast_dimensions=*/{num_dims - 2, num_dims - 1}) +
-        xla::Mul(beta, mask, /*broadcast_dimensions=*/batch_dim_indices);
+    auto iota = Reshape(Iota(a.builder(), S32, m), {m, 1});
+    auto predecessor_mask = ConvertElementType(Lt(iota, j), type);
+    auto mask = Broadcast(ConvertElementType(Eq(iota, j), type),
+                          std::vector<int64>(batch_dims.size(), 1));
+    auto new_x = Mul(x, predecessor_mask,
+                     /*broadcast_dimensions=*/{num_dims - 2, num_dims - 1}) +
+                 Mul(beta, mask, /*broadcast_dimensions=*/batch_dim_indices);
     a = DynamicUpdateSliceInMinorDims(a, new_x, {j});
 
     // vs[:, j] = v
     vs = DynamicUpdateSliceInMinorDims(
-        vs, xla::Reshape(v, ConcatVectors(batch_dims, {m, 1})), {j});
+        vs, Reshape(v, ConcatVectors(batch_dims, {m, 1})), {j});
     // taus[j] = tau
     taus = DynamicUpdateSliceInMinorDims(
-        taus, xla::Reshape(tau, ConcatVectors(batch_dims, {1})), {j});
-    return std::vector<xla::XlaOp>{a, vs, taus};
+        taus, Reshape(tau, ConcatVectors(batch_dims, {1})), {j});
+    return std::vector<XlaOp>{a, vs, taus};
   };
 
-  auto vs = xla::Zeros(builder, xla::ShapeUtil::MakeShape(
-                                    type, ConcatVectors(batch_dims, {m, n})));
-  auto taus = xla::Zeros(
-      builder, xla::ShapeUtil::MakeShape(type, ConcatVectors(batch_dims, {n})));
+  auto vs = Zeros(
+      builder, ShapeUtil::MakeShape(type, ConcatVectors(batch_dims, {m, n})));
+  auto taus = Zeros(builder,
+                    ShapeUtil::MakeShape(type, ConcatVectors(batch_dims, {n})));
 
-  TF_ASSIGN_OR_RETURN(auto values,
-                      XlaForEachIndex(std::min(m, n), xla::S32, qr_body_fn,
-                                      {a, vs, taus}, "qr", builder));
+  TF_ASSIGN_OR_RETURN(auto values, ForEachIndex(std::min(m, n), S32, qr_body_fn,
+                                                {a, vs, taus}, "qr", builder));
 
   QRBlockResult result;
   result.r = values[0];
@@ -250,24 +248,23 @@ xla::StatusOr<QRBlockResult> QRBlock(
 // return W
 // There is no need to return Y since at termination of the loop it is equal to
 // vs.
-xla::StatusOr<xla::XlaOp> ComputeWYRepresentation(
-    xla::PrimitiveType type, absl::Span<const int64> batch_dims, xla::XlaOp vs,
-    xla::XlaOp taus, int64 m, int64 n,
-    xla::PrecisionConfig::Precision precision) {
+StatusOr<XlaOp> ComputeWYRepresentation(PrimitiveType type,
+                                        absl::Span<const int64> batch_dims,
+                                        XlaOp vs, XlaOp taus, int64 m, int64 n,
+                                        PrecisionConfig::Precision precision) {
   std::vector<int64> batch_dim_indices(batch_dims.size());
   std::iota(batch_dim_indices.begin(), batch_dim_indices.end(), 0);
   int64 n_index = batch_dims.size() + 1;
 
-  auto body_fn =
-      [&](xla::XlaOp j, absl::Span<const xla::XlaOp> values,
-          xla::XlaBuilder* builder) -> xla::StatusOr<std::vector<xla::XlaOp>> {
+  auto body_fn = [&](XlaOp j, absl::Span<const XlaOp> values,
+                     XlaBuilder* builder) -> StatusOr<std::vector<XlaOp>> {
     auto w = values[0];
     auto y = values[1];
     const auto vs = values[2];
     const auto taus = values[3];
 
     // Want j values in range [1, ... n).
-    j = j + xla::ConstantR0<int32>(builder, 1);
+    j = j + ConstantR0<int32>(builder, 1);
     // vs has shape [..., m, 1]
     auto v = DynamicSliceInMinorDims(vs, {j}, {1});
     // beta has shape [..., 1]
@@ -278,31 +275,31 @@ xla::StatusOr<xla::XlaOp> ComputeWYRepresentation(
     // wyv has shape [..., m, 1]
     auto wyv = BatchDot(w, yv, precision);
 
-    auto z = xla::Mul(
+    auto z = Mul(
         -beta, v + wyv,
         /*broadcast_dimensions=*/ConcatVectors(batch_dim_indices, {n_index}));
 
     w = DynamicUpdateSliceInMinorDims(w, z, {j});
     y = DynamicUpdateSliceInMinorDims(y, v, {j});
 
-    return std::vector<xla::XlaOp>{w, y, vs, taus};
+    return std::vector<XlaOp>{w, y, vs, taus};
   };
 
-  xla::XlaBuilder* builder = vs.builder();
-  auto w = xla::Zeros(builder, xla::ShapeUtil::MakeShape(
-                                   type, ConcatVectors(batch_dims, {m, n})));
+  XlaBuilder* builder = vs.builder();
+  auto w = Zeros(builder,
+                 ShapeUtil::MakeShape(type, ConcatVectors(batch_dims, {m, n})));
   auto y = w;
   auto v = SliceInMinorDims(vs, {0}, {1});
   auto beta = SliceInMinorDims(taus, {0}, {1});
   y = UpdateSliceInMinorDims(y, v, {0});
-  auto bv = xla::Mul(
-      -beta, v,
-      /*broadcast_dimensions=*/ConcatVectors(batch_dim_indices, {n_index}));
+  auto bv =
+      Mul(-beta, v,
+          /*broadcast_dimensions=*/ConcatVectors(batch_dim_indices, {n_index}));
   w = UpdateSliceInMinorDims(w, bv, {0});
 
   TF_ASSIGN_OR_RETURN(
-      auto values, XlaForEachIndex(n - 1, xla::S32, body_fn, {w, y, vs, taus},
-                                   "wy", builder));
+      auto values,
+      ForEachIndex(n - 1, S32, body_fn, {w, y, vs, taus}, "wy", builder));
   return values[0];
 }
 
@@ -323,34 +320,34 @@ xla::StatusOr<xla::XlaOp> ComputeWYRepresentation(
 //   return (q, a)
 // TODO(phawkins): consider using UT transformations (in the form I - V U V')
 // rather than WY transformations.
-xla::StatusOr<QRDecompositionResult> QRDecomposition(
-    xla::XlaOp a, bool full_matrices, int64 block_size,
-    xla::PrecisionConfig::Precision precision) {
-  xla::XlaBuilder* builder = a.builder();
-  TF_ASSIGN_OR_RETURN(xla::Shape a_shape, builder->GetShape(a));
-  const int num_dims = xla::ShapeUtil::Rank(a_shape);
+StatusOr<QRDecompositionResult> QRDecomposition(
+    XlaOp a, bool full_matrices, int64 block_size,
+    PrecisionConfig::Precision precision) {
+  XlaBuilder* builder = a.builder();
+  TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a));
+  const int num_dims = a_shape.rank();
   if (num_dims < 2) {
-    return errors::InvalidArgument("Arguments to QR must have rank >= 2: ",
-                                   num_dims);
+    return InvalidArgument("Arguments to QR must have rank >= 2: got shape %s",
+                           a_shape.ToString());
   }
-  xla::PrimitiveType type = a_shape.element_type();
+  PrimitiveType type = a_shape.element_type();
 
-  const int64 m = xla::ShapeUtil::GetDimension(a_shape, -2);
-  const int64 n = xla::ShapeUtil::GetDimension(a_shape, -1);
+  const int64 m = ShapeUtil::GetDimension(a_shape, -2);
+  const int64 n = ShapeUtil::GetDimension(a_shape, -1);
   const int64 p = std::min(m, n);
 
   if (block_size < 1) {
-    return errors::InvalidArgument(
-        "block_size argument to QR must be >= 1; got ", block_size);
+    return InvalidArgument("block_size argument to QR must be >= 1; got %d",
+                           block_size);
   }
 
   const int64 num_batch_dims = num_dims - 2;
   std::vector<int64> batch_dims(num_batch_dims);
   for (int i = 0; i < num_batch_dims; ++i) {
-    batch_dims[i] = xla::ShapeUtil::GetDimension(a_shape, i);
+    batch_dims[i] = ShapeUtil::GetDimension(a_shape, i);
   }
 
-  auto q = xla::Broadcast(xla::IdentityMatrix(builder, type, m, m), batch_dims);
+  auto q = Broadcast(IdentityMatrix(builder, type, m, m), batch_dims);
   for (int64 i = 0; i < p; i += block_size) {
     int64 k = std::min(block_size, p - i);
 
@@ -393,4 +390,4 @@ xla::StatusOr<QRDecompositionResult> QRDecomposition(
   return result;
 }
 
-}  // namespace tensorflow
+}  // namespace xla
diff --git a/tensorflow/compiler/tf2xla/lib/qr.h b/tensorflow/compiler/xla/client/lib/qr.h
similarity index 74%
rename from tensorflow/compiler/tf2xla/lib/qr.h
rename to tensorflow/compiler/xla/client/lib/qr.h
index 24b537ac8b63b93e734c3d0e335ea455f7d51a54..827c8eeca05ef09a0d77363eb3c40961b95813d8 100644
--- a/tensorflow/compiler/tf2xla/lib/qr.h
+++ b/tensorflow/compiler/xla/client/lib/qr.h
@@ -13,13 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_QR_H_
-#define TENSORFLOW_COMPILER_TF2XLA_LIB_QR_H_
+#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_QR_H_
+#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_QR_H_
 
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
-namespace tensorflow {
+namespace xla {
 
 // Computes the QR decompositions of a batch of matrices. That is,
 // given a (batched) matrix a, computes an orthonormal matrix Q and an
@@ -29,14 +29,14 @@ namespace tensorflow {
 // the block size to use.
 // TODO(phawkins): handle the complex case.
 struct QRDecompositionResult {
-  xla::XlaOp q;
-  xla::XlaOp r;
+  XlaOp q;
+  XlaOp r;
 };
 
-xla::StatusOr<QRDecompositionResult> QRDecomposition(
-    xla::XlaOp a, bool full_matrices, int64 block_size = 128,
-    xla::PrecisionConfig::Precision precision = xla::PrecisionConfig::HIGHEST);
+StatusOr<QRDecompositionResult> QRDecomposition(
+    XlaOp a, bool full_matrices, int64 block_size = 128,
+    PrecisionConfig::Precision precision = PrecisionConfig::HIGHEST);
 
-}  // namespace tensorflow
+}  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_TF2XLA_LIB_QR_H_
+#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_QR_H_
diff --git a/tensorflow/compiler/xla/client/lib/qr_test.cc b/tensorflow/compiler/xla/client/lib/qr_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b27d364b62444d6d5fb1278b6e6461affc15b2e6
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/qr_test.cc
@@ -0,0 +1,93 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/lib/qr.h"
+
+#include "tensorflow/compiler/xla/array2d.h"
+#include "tensorflow/compiler/xla/array3d.h"
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace {
+
+using QrTest = xla::ClientLibraryTestBase;
+
+XLA_TEST_F(QrTest, Simple) {
+  xla::XlaBuilder builder(TestName());
+
+  xla::Array2D<float> a_vals({
+      {4, 6, 8, 10},
+      {6, 45, 54, 63},
+      {8, 54, 146, 166},
+      {10, 63, 166, 310},
+  });
+
+  xla::XlaOp a;
+  auto a_data = CreateR2Parameter<float>(a_vals, 0, "a", &builder, &a);
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto result,
+      xla::QRDecomposition(a, /*full_matrices=*/true, /*block_size=*/2));
+
+  // Verifies that the decomposition composes back to the original matrix.
+  //
+  // This isn't a terribly demanding test, (e.g., we should verify that Q is
+  // orthonormal and R is upper-triangular) but it's awkward to write such tests
+  // without more linear algebra libraries. It's easier to test the numerics
+  // from Python, anyway, where we have access to numpy and scipy.
+  xla::BatchDot(result.q, result.r, xla::PrecisionConfig::HIGHEST);
+
+  ComputeAndCompareR2<float>(&builder, a_vals, {a_data.get()},
+                             xla::ErrorSpec(1e-4, 1e-4));
+}
+
+XLA_TEST_F(QrTest, SimpleBatched) {
+  xla::XlaBuilder builder(TestName());
+
+  xla::Array3D<float> a_vals({
+      {
+          {4, 6, 8, 10},
+          {6, 45, 54, 63},
+          {8, 54, 146, 166},
+          {10, 63, 166, 310},
+      },
+      {
+          {16, 24, 8, 12},
+          {24, 61, 82, 48},
+          {8, 82, 456, 106},
+          {12, 48, 106, 62},
+      },
+  });
+
+  xla::XlaOp a;
+  auto a_data = CreateR3Parameter<float>(a_vals, 0, "a", &builder, &a);
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto result,
+      xla::QRDecomposition(a, /*full_matrices=*/true, /*block_size=*/2));
+
+  xla::BatchDot(result.q, result.r, xla::PrecisionConfig::HIGHEST);
+
+  ComputeAndCompareR3<float>(&builder, a_vals, {a_data.get()},
+                             xla::ErrorSpec(1e-4, 1e-4));
+}
+
+}  // namespace
diff --git a/tensorflow/compiler/xla/client/lib/quantize.h b/tensorflow/compiler/xla/client/lib/quantize.h
new file mode 100644
index 0000000000000000000000000000000000000000..26dbbd5b00bd1a29f4047c9a4294fcac7340cf6c
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/quantize.h
@@ -0,0 +1,186 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_QUANTIZE_H_
+#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_QUANTIZE_H_
+
+#include <limits>
+#include <numeric>
+#include <vector>
+
+#include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/bfloat16/bfloat16.h"
+
+namespace xla {
+
+constexpr int64 kBitsOfByte = 8;
+
+// Represents the range used for quantization
+struct QuantizedRange {
+  QuantizedRange() = default;
+  QuantizedRange(float min_in, float max_in) : min(min_in), max(max_in) {}
+
+  bool operator==(const QuantizedRange& rhs) const {
+    return this->min == rhs.min && this->max == rhs.max;
+  }
+
+  bool operator!=(const QuantizedRange& rhs) const { return !(*this == rhs); }
+
+  tensorflow::bfloat16 min = tensorflow::bfloat16(0.0f);
+  tensorflow::bfloat16 max = tensorflow::bfloat16(0.0f);
+};
+
+template <typename T>
+inline std::vector<uint32> PackToUint32(absl::Span<const T> input) {
+  const int64 kElementsPerPack = sizeof(uint32) / sizeof(T);
+  const int64 input_size = input.size();
+  const int64 output_size = CeilOfRatio(input_size, kElementsPerPack);
+
+  std::vector<uint32> output_vec;
+  constexpr int64 kShiftBits = sizeof(T) / sizeof(uint8) * kBitsOfByte;
+
+  for (int64 i = 0; i < output_size; i++) {
+    uint32 result = 0;
+    for (int64 p = 0; p < kElementsPerPack; p++) {
+      int64 index = i * kElementsPerPack + p;
+      if (index < input_size) {
+        int64 total_shift_bits = kShiftBits * (kElementsPerPack - p - 1);
+        result |= (input[index] << total_shift_bits);
+      }
+    }
+    output_vec.push_back(result);
+  }
+
+  return output_vec;
+}
+
+// Dequantize the quantized input of packed uint32 to bfloat16.
+// Only uint8 or uint16 is supported for the original unpacked input.
+// Returns a tensor of shape [d0,..., dn * unpack_size] if
+// input shape is [d0, ..., dn], where unpack_size = sizeof(unit32) / sizeof(T).
+// If transpose_output is true, will return a tensor of shape
+// [dn * unpack_size, dn-1, ..., d1, d0]. transpose_output is faster when
+// input's rank higher than 1. The input needs to be transposed to use
+// transpose_output feature.
+template <typename T>
+inline XlaOp Dequantize(XlaOp input, const QuantizedRange& range,
+                        absl::string_view mode_string = "MIN_COMBINED",
+                        bool transpose_output = false) {
+  XlaBuilder* const builder = input.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    float half_range =
+        !std::is_signed<T>::value
+            ? 0.0f
+            : (static_cast<float>(std::numeric_limits<T>::max()) -
+               std::numeric_limits<T>::min() + 1) /
+                  2.0f;
+    const int64 unpack_size = sizeof(uint32) / sizeof(T);
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(input));
+
+    auto element_type = shape.element_type();
+    if (element_type != U32) {
+      return InvalidArgument(
+          "Only U32 is supported for input type of xla::Dequantize Op.");
+    }
+
+    // Broadcast the input to [unpack_size, d0, ..., dn] if input size is
+    // [d0, ..., dn].
+    auto broadcast_input = Broadcast(input, {unpack_size});
+
+    XlaOp iota_r1 = Iota(builder, U32, unpack_size);
+    // Highest significant bytes needs to shift more bytes than lower
+    // significant bytes.
+    XlaOp shift_bytes =
+        xla::ConstantR0<uint32>(builder, unpack_size - 1) - iota_r1;
+
+    const int bytes_of_type = sizeof(T) / sizeof(uint8);
+    std::vector<uint32> shift_vec(unpack_size, kBitsOfByte * bytes_of_type);
+    XlaOp shift_bits =
+        shift_bytes * xla::ConstantR1<uint32>(builder, shift_vec);
+
+    // Make bit_mask for different data type T.
+    uint32 bit_mask = 0x00000000;
+    for (int i = 0; i < bytes_of_type; i++) {
+      bit_mask <<= kBitsOfByte;
+      bit_mask |= 0x000000ff;
+    }
+
+    std::vector<int64> shift_transpose_dimensions(shape.dimensions_size());
+    std::iota(shift_transpose_dimensions.begin(),
+              shift_transpose_dimensions.end(), 0);
+    shift_transpose_dimensions.insert(shift_transpose_dimensions.begin(), 1,
+                                      shape.dimensions_size());
+
+    // Shift the input by sizeof(T) bytes and apply bit_mask to unpack.
+    XlaOp shifted_input = ShiftRightLogical(
+        broadcast_input, Transpose(Broadcast(shift_bits, shape.dimensions()),
+                                   shift_transpose_dimensions));
+    XlaOp unpack_input =
+        And(shifted_input, xla::ConstantR0<uint32>(builder, bit_mask));
+
+    XlaOp result;
+
+    if (mode_string == "MIN_COMBINED") {
+      const tensorflow::bfloat16 scale_factor =
+          (range.max - range.min) /
+          (static_cast<tensorflow::bfloat16>(std::numeric_limits<T>::max() -
+                                             std::numeric_limits<T>::min()));
+      // result = bfloat16(input + half_range) * scale_factor + range.min
+      XlaOp unpack_input_bf16 = ConvertElementType(unpack_input, BF16);
+      XlaOp half_range_bf16 = xla::ConstantR0<tensorflow::bfloat16>(
+          builder, static_cast<bfloat16>(half_range));
+      XlaOp sum = unpack_input_bf16 + half_range_bf16;
+
+      result =
+          sum * xla::ConstantR0<tensorflow::bfloat16>(builder, scale_factor) +
+          xla::ConstantR0<tensorflow::bfloat16>(builder, range.min);
+    } else {
+      // TODO(wangtao): support other modes.
+      return InvalidArgument(
+          "Only MIN_COMBINED mode is supported in xla::Dequantize Op.");
+    }
+
+    std::vector<int64> transpose_dimensions(shape.dimensions_size());
+    std::iota(transpose_dimensions.begin(), transpose_dimensions.end(), 1);
+    std::reverse(transpose_dimensions.begin(), transpose_dimensions.end());
+    transpose_dimensions.insert(transpose_dimensions.begin() + 1, 1, 0);
+
+    // Transpose the result to be [dn, unpack_size, dn-1, ..., d1, d0].
+    XlaOp transposed_result = Transpose(result, transpose_dimensions);
+
+    // Reshape to be [dn * unpack_size, dn-1, ..., d1, d0].
+    XlaOp reshaped_result = Collapse(transposed_result, {0, 1});
+
+    // Return the transpose result if transpose_output is true.
+    if (transpose_output) {
+      return reshaped_result;
+    }
+
+    // Transpose the result to be [d0, d1, ..., dn-1, dn * unpack_size].
+    std::vector<int64> result_dimensions(shape.dimensions_size());
+    std::iota(result_dimensions.begin(), result_dimensions.end(), 0);
+    std::reverse(result_dimensions.begin(), result_dimensions.end());
+
+    return Transpose(reshaped_result, result_dimensions);
+  });
+}
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_QUANTIZE_H_
diff --git a/tensorflow/compiler/xla/client/lib/quantize_test.cc b/tensorflow/compiler/xla/client/lib/quantize_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..be3603d9e11670913c21a834d2216a999306d582
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/quantize_test.cc
@@ -0,0 +1,337 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/lib/quantize.h"
+
+#include <limits>
+
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+
+namespace xla {
+namespace {
+
+using bfloat16 = tensorflow::bfloat16;
+
+template <typename NativeT>
+std::vector<NativeT> GenerateInput() {
+  std::vector<NativeT> input;
+
+  for (int64 i = std::numeric_limits<NativeT>::min();
+       i < std::numeric_limits<NativeT>::max(); ++i) {
+    input.push_back(static_cast<NativeT>(i));
+  }
+
+  return input;
+}
+
+template <typename NativeT>
+Array2D<NativeT> GenerateLargeSizeInput(int num_columns, int num_rows) {
+  Array2D<NativeT> input(num_columns, num_rows);
+
+  input.FillRandom(6, 128);
+
+  return input;
+}
+
+template <typename NativeT>
+Array2D<uint32> PackLargeInput(Array2D<NativeT> &input) {
+  const int64 size_per_pack = sizeof(uint32) / sizeof(NativeT);
+  int64 width = input.width();
+
+  int64 padded_output_width = CeilOfRatio(width, size_per_pack);
+
+  Array2D<uint32> pack_input(input.height(), padded_output_width);
+
+  for (int h = 0; h < input.height(); h++) {
+    std::vector<NativeT> input_row;
+    for (int w = 0; w < width; w++) {
+      input_row.push_back(input({h, w}));
+    }
+
+    auto pack_input_vec = PackToUint32<uint8>(input_row);
+
+    for (int w = 0; w < padded_output_width; w++) {
+      pack_input(h, w) = pack_input_vec[w];
+    }
+  }
+
+  return pack_input;
+}
+
+template <typename NativeT>
+Array2D<bfloat16> GenerateLargeSizeMinCombinedOutput(
+    Array2D<NativeT> &input, const QuantizedRange &range,
+    bool transpose_output = false) {
+  const int64 size_per_pack = sizeof(uint32) / sizeof(NativeT);
+  int64 width = input.width();
+
+  int64 padded_output_width = CeilOfRatio(width, size_per_pack) * size_per_pack;
+
+  int64 output_height;
+  int64 output_width;
+
+  if (transpose_output) {
+    output_height = padded_output_width;
+    output_width = input.height();
+  } else {
+    output_height = input.height();
+    output_width = padded_output_width;
+  }
+
+  Array2D<bfloat16> output(output_height, output_width, bfloat16(0.0));
+
+  float half_range =
+      !std::is_signed<NativeT>::value
+          ? 0.0f
+          : (static_cast<float>(std::numeric_limits<NativeT>::max() -
+                                std::numeric_limits<NativeT>::min() + 1)) /
+                2.0f;
+  const bfloat16 scale_factor =
+      (range.max - range.min) /
+      (static_cast<bfloat16>(std::numeric_limits<NativeT>::max() -
+                             std::numeric_limits<NativeT>::min()));
+
+  for (int h = 0; h < input.height(); h++) {
+    std::vector<NativeT> input_row;
+    for (int w = 0; w < width; w++) {
+      bfloat16 result =
+          static_cast<bfloat16>(input(h, w) + half_range) * scale_factor +
+          range.min;
+      if (transpose_output) {
+        output(w, h) = result;
+      } else {
+        output(h, w) = result;
+      }
+    }
+  }
+
+  return output;
+}
+
+template <typename NativeT>
+std::vector<bfloat16> GenerateMinCombinedOutput(const QuantizedRange &range) {
+  float half_range =
+      !std::is_signed<NativeT>::value
+          ? 0.0f
+          : (static_cast<float>(std::numeric_limits<NativeT>::max() -
+                                std::numeric_limits<NativeT>::min() + 1)) /
+                2.0f;
+  const bfloat16 scale_factor =
+      (range.max - range.min) /
+      (static_cast<bfloat16>(std::numeric_limits<NativeT>::max() -
+                             std::numeric_limits<NativeT>::min()));
+  std::vector<bfloat16> output;
+  for (int64 i = std::numeric_limits<NativeT>::min();
+       i < std::numeric_limits<NativeT>::max(); ++i) {
+    bfloat16 result =
+        static_cast<bfloat16>(i + half_range) * scale_factor + range.min;
+    output.push_back(result);
+  }
+
+  const int64 pack_size = sizeof(uint32) / sizeof(NativeT);
+  const int64 output_size = output.size();
+
+  int64 num_tailing_zeros =
+      CeilOfRatio(output_size, pack_size) * pack_size - output_size;
+
+  output.insert(output.end(), num_tailing_zeros, bfloat16(0.0));
+  return output;
+}
+
+// TODO(wangtao): add a test to make sure this op is the inverse of the existing
+// TF quantize op defined in: third_party/tensorflow/core/kernels/quantize_op.cc
+
+using DequantizeTest = ClientLibraryTestBase;
+
+TEST(PackTest, PackUint8ToUint32) {
+  std::vector<uint8> input = {0xAB, 0x0B, 0x00, 0xF0, 0x01};
+  auto output = PackToUint32<uint8>(input);
+  EXPECT_THAT(output, ::testing::ElementsAre(0xAB0B00F0, 0x01000000));
+}
+
+TEST(PackTest, PackInt8ToUint32) {
+  std::vector<int8> input = {static_cast<signed char>(0x81), 0x0B, 0x00, 0x20,
+                             0x01};
+  auto output = PackToUint32<int8>(input);
+  EXPECT_THAT(output, ::testing::ElementsAre(0x810B0020, 0x01000000));
+}
+
+TEST(PackTest, PackUint8ToUint32PerfectSize) {
+  std::vector<uint8> input = {3, 2, 1, 0};
+  auto output = PackToUint32<uint8>(input);
+  EXPECT_THAT(output, ::testing::ElementsAre(0x03020100));
+}
+
+XLA_TEST_F(DequantizeTest, MinCombinedUint16R1) {
+  XlaBuilder builder(TestName());
+  auto input = GenerateInput<uint16>();
+  auto x = ConstantR1<uint32>(&builder, PackToUint32<uint16>(input));
+  QuantizedRange range(0, 255.0f);
+  xla::Dequantize<uint16>(x, range, "MIN_COMBINED");
+  auto expected = GenerateMinCombinedOutput<uint16>(range);
+  ComputeAndCompareR1<bfloat16>(&builder, expected, {});
+}
+
+XLA_TEST_F(DequantizeTest, MinCombinedUint8R1) {
+  XlaBuilder builder(TestName());
+  auto input = GenerateInput<uint8>();
+  auto x = ConstantR1<uint32>(&builder, PackToUint32<uint8>(input));
+  QuantizedRange range(0, 127.0f);
+  xla::Dequantize<uint8>(x, range, "MIN_COMBINED");
+  auto expected = GenerateMinCombinedOutput<uint8>(range);
+  ComputeAndCompareR1<bfloat16>(&builder, expected, {});
+}
+
+XLA_TEST_F(DequantizeTest, MinCombinedUint8R2) {
+  XlaBuilder builder(TestName());
+  std::vector<std::vector<uint8>> input = {
+      {0, 1, 2, 3},
+      {4, 5, 6, 7},
+      {8, 9, 10, 11},
+      {12, 13, 16, 15},
+  };
+  auto x = ConstantR2<uint32>(&builder, {{PackToUint32<uint8>(input[0])[0]},
+                                         {PackToUint32<uint8>(input[1])[0]},
+                                         {PackToUint32<uint8>(input[2])[0]},
+                                         {PackToUint32<uint8>(input[3])[0]}});
+  QuantizedRange range(0, 255.0f);
+  xla::Dequantize<uint8>(x, range, "MIN_COMBINED");
+  const Array2D<bfloat16> expected = {
+      {bfloat16(0.0), bfloat16(1.0), bfloat16(2.0), bfloat16(3.0)},
+      {bfloat16(4.0), bfloat16(5.0), bfloat16(6.0), bfloat16(7.0)},
+      {bfloat16(8.0), bfloat16(9.0), bfloat16(10.0), bfloat16(11.0)},
+      {bfloat16(12.0), bfloat16(13.0), bfloat16(16.0), bfloat16(15.0)},
+  };
+  ComputeAndCompareR2<bfloat16>(&builder, expected, {});
+}
+
+XLA_TEST_F(DequantizeTest, MinCombinedUint8R2TransposeOutput) {
+  XlaBuilder builder(TestName());
+  std::vector<std::vector<uint8>> input = {
+      {0, 1, 2, 3},
+      {4, 5, 6, 7},
+      {8, 9, 10, 11},
+      {12, 13, 16, 15},
+  };
+  auto x = ConstantR2<uint32>(&builder, {{PackToUint32<uint8>(input[0])[0]},
+                                         {PackToUint32<uint8>(input[1])[0]},
+                                         {PackToUint32<uint8>(input[2])[0]},
+                                         {PackToUint32<uint8>(input[3])[0]}});
+  QuantizedRange range(0, 255.0f);
+  xla::Dequantize<uint8>(x, range, "MIN_COMBINED", /*transpose_output=*/true);
+  const Array2D<bfloat16> expected = {
+      {bfloat16(0.0), bfloat16(4.0), bfloat16(8.0), bfloat16(12.0)},
+      {bfloat16(1.0), bfloat16(5.0), bfloat16(9.0), bfloat16(13.0)},
+      {bfloat16(2.0), bfloat16(6.0), bfloat16(10.0), bfloat16(16.0)},
+      {bfloat16(3.0), bfloat16(7.0), bfloat16(11.0), bfloat16(15.0)},
+  };
+  ComputeAndCompareR2<bfloat16>(&builder, expected, {});
+}
+
+XLA_TEST_F(DequantizeTest, MinCombinedUint8R2TailingZero) {
+  XlaBuilder builder(TestName());
+  std::vector<std::vector<uint8>> input = {
+      {0, 1, 2, 3, 16},
+      {4, 5, 6, 7, 17},
+      {8, 9, 10, 11, 18},
+      {12, 13, 16, 15, 19},
+  };
+  auto x = ConstantR2<uint32>(
+      &builder,
+      {{PackToUint32<uint8>(input[0])[0], PackToUint32<uint8>(input[0])[1]},
+       {PackToUint32<uint8>(input[1])[0], PackToUint32<uint8>(input[1])[1]},
+       {PackToUint32<uint8>(input[2])[0], PackToUint32<uint8>(input[2])[1]},
+       {PackToUint32<uint8>(input[3])[0], PackToUint32<uint8>(input[3])[1]}});
+  QuantizedRange range(0, 255.0f);
+  xla::Dequantize<uint8>(x, range, "MIN_COMBINED");
+
+  const Array2D<bfloat16> expected = {
+      {bfloat16(0.0), bfloat16(1.0), bfloat16(2.0), bfloat16(3.0),
+       bfloat16(16.0), bfloat16(0.0), bfloat16(0.0), bfloat16(0.0)},
+      {bfloat16(4.0), bfloat16(5.0), bfloat16(6.0), bfloat16(7.0),
+       bfloat16(17.0), bfloat16(0.0), bfloat16(0.0), bfloat16(0.0)},
+      {bfloat16(8.0), bfloat16(9.0), bfloat16(10.0), bfloat16(11.0),
+       bfloat16(18.0), bfloat16(0.0), bfloat16(0.0), bfloat16(0.0)},
+      {bfloat16(12.0), bfloat16(13.0), bfloat16(16.0), bfloat16(15.0),
+       bfloat16(19.0), bfloat16(0.0), bfloat16(0.0), bfloat16(0.0)},
+  };
+  ComputeAndCompareR2<bfloat16>(&builder, expected, {});
+}
+
+XLA_TEST_F(DequantizeTest, MinCombinedUint8R2TailingZeroTransposeOutput) {
+  XlaBuilder builder(TestName());
+  std::vector<std::vector<uint8>> input = {
+      {0, 1, 2, 3, 16},
+      {4, 5, 6, 7, 17},
+      {8, 9, 10, 11, 18},
+      {12, 13, 16, 15, 19},
+  };
+  auto x = ConstantR2<uint32>(
+      &builder,
+      {{PackToUint32<uint8>(input[0])[0], PackToUint32<uint8>(input[0])[1]},
+       {PackToUint32<uint8>(input[1])[0], PackToUint32<uint8>(input[1])[1]},
+       {PackToUint32<uint8>(input[2])[0], PackToUint32<uint8>(input[2])[1]},
+       {PackToUint32<uint8>(input[3])[0], PackToUint32<uint8>(input[3])[1]}});
+  QuantizedRange range(0, 255.0f);
+  xla::Dequantize<uint8>(x, range, "MIN_COMBINED", /*transpose_output=*/true);
+
+  const Array2D<bfloat16> expected = {
+      {bfloat16(0.0), bfloat16(4.0), bfloat16(8.0), bfloat16(12.0)},
+      {bfloat16(1.0), bfloat16(5.0), bfloat16(9.0), bfloat16(13.0)},
+      {bfloat16(2.0), bfloat16(6.0), bfloat16(10.0), bfloat16(16.0)},
+      {bfloat16(3.0), bfloat16(7.0), bfloat16(11.0), bfloat16(15.0)},
+      {bfloat16(16.0), bfloat16(17.0), bfloat16(18.0), bfloat16(19.0)},
+      {bfloat16(0.0), bfloat16(0.0), bfloat16(0.0), bfloat16(0.0)},
+      {bfloat16(0.0), bfloat16(0.0), bfloat16(0.0), bfloat16(0.0)},
+      {bfloat16(0.0), bfloat16(0.0), bfloat16(0.0), bfloat16(0.0)},
+  };
+  ComputeAndCompareR2<bfloat16>(&builder, expected, {});
+}
+
+XLA_TEST_F(DequantizeTest, MinCombinedUint8LargeSizeTest) {
+  XlaBuilder builder(TestName());
+  Array2D<uint8> input = GenerateLargeSizeInput<uint8>(500, 3547);
+  Array2D<uint32> input_packed = PackLargeInput<uint8>(input);
+
+  auto x = ConstantR2FromArray2D<uint32>(&builder, input_packed);
+  QuantizedRange range(0, 255.0f);
+  xla::Dequantize<uint8>(x, range, "MIN_COMBINED");
+
+  const Array2D<bfloat16> expected =
+      GenerateLargeSizeMinCombinedOutput<uint8>(input, range);
+  ComputeAndCompareR2<bfloat16>(&builder, expected, {});
+}
+
+XLA_TEST_F(DequantizeTest, MinCombinedUint8LargeSizeTestTransposeOutput) {
+  XlaBuilder builder(TestName());
+  Array2D<uint8> input = GenerateLargeSizeInput<uint8>(500, 3547);
+  Array2D<uint32> input_packed = PackLargeInput<uint8>(input);
+
+  auto x = ConstantR2FromArray2D<uint32>(&builder, input_packed);
+  QuantizedRange range(0, 255.0f);
+  xla::Dequantize<uint8>(x, range, "MIN_COMBINED", /*transpose_output=*/true);
+
+  const Array2D<bfloat16> expected = GenerateLargeSizeMinCombinedOutput<uint8>(
+      input, range, /*transpose_output=*/true);
+  ComputeAndCompareR2<bfloat16>(&builder, expected, {});
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/self_adjoint_eig.cc b/tensorflow/compiler/xla/client/lib/self_adjoint_eig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..546127e4627f1717913d1039be13fd0c655be1a3
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/self_adjoint_eig.cc
@@ -0,0 +1,471 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/lib/self_adjoint_eig.h"
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/compiler/xla/client/lib/comparators.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/lib/loops.h"
+#include "tensorflow/compiler/xla/client/lib/math.h"
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
+#include "tensorflow/compiler/xla/client/lib/slicing.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace xla {
+
+namespace {
+
+// Jacobi rotation (also known as Givens rotation):
+// G = [[ c, s],
+//      [-s, c]]
+// matmul(G_T, G) = I
+struct SymmetricSchurDecomposition {
+  XlaOp c;          // cosine.
+  XlaOp s;          // sine.
+};
+
+// JacobiUpdate holds the intermediate orthogonal matrix, Jacobi-rotated matrix
+// and the off-diagonal norm of the rotated matrix. After each Jacobi iteration,
+// off-diagonal norm is reduced.
+struct JacobiUpdate {
+  XlaOp v;
+  XlaOp w;
+};
+
+struct FrobeniusNorms {
+  XlaOp off_diagonal_norm;
+  XlaOp total_norm;
+};
+
+// Given an n-by-n symmetric A and integers p and q that satisfy 0 <= p < q < n,
+// it computes a rotation matrix G = [[c, s], [-s, c]], such that
+//                        G_T * A[[p, q], [p, q]] * G
+// is diagonalized.
+//
+//  def sym_schur2x2(A, p, q):
+//      if np.abs(A[p, q]) > 1e-6:
+//          tau = (A[q, q] - A[p, p]) / (2 * A[p, q])
+//          if tau >= 0:
+//              t = 1.0 / (tau + np.sqrt(1 + tau ** 2))
+//          else:
+//              t = -1.0 / (-tau + np.sqrt(1 + tau ** 2))
+//          c = 1.0 / np.sqrt(1.0 + t ** 2)
+//          s = t * c
+//      else:
+//          c = 1.0
+//          s = 0.0
+//      return c, s
+StatusOr<SymmetricSchurDecomposition> SymmetricShurDecomposition2x2(XlaOp a,
+                                                                    XlaOp p,
+                                                                    XlaOp q,
+                                                                    XlaOp tol) {
+  XlaBuilder* builder = a.builder();
+  TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a));
+
+  auto zero = ScalarLike(a, 0.0);
+  auto one = ScalarLike(a, 1.0);
+  auto two = ScalarLike(a, 2.0);
+
+  auto pqs = DynamicSliceInMinorDims(a, {p, q}, {1, 1});
+
+  auto ps = DynamicSliceInMinorDims(a, {p, p}, {1, 1});
+  auto qs = DynamicSliceInMinorDims(a, {q, q}, {1, 1});
+
+  auto tau = (qs - ps) / (pqs * two);
+  auto t_pos = one / (tau + Sqrt(one + Square(tau)));
+  auto t_neg = -one / (-tau + Sqrt(one + Square(tau)));
+  auto t = Select(Ge(tau, zero), t_pos, t_neg);
+
+  auto c_temp = Rsqrt(one + Square(t));
+  auto s_temp = t * c_temp;
+
+  auto c = Select(Ge(Abs(pqs), tol), c_temp, ZerosLike(c_temp) + one);
+  auto s = Select(Ge(Abs(pqs), tol), s_temp, ZerosLike(s_temp));
+  // Renormalize c and s to compensate for low precision arithmetic, this step
+  // is redundant if high precision float is used, like float64.
+  auto rnorm = Rsqrt(Square(c) + Square(s));
+
+  SymmetricSchurDecomposition schur;
+
+  schur.c = c * rnorm;
+  schur.s = s * rnorm;
+
+  return schur;
+}
+
+StatusOr<JacobiUpdate> Update(JacobiUpdate jacobi_update, XlaOp p, XlaOp q,
+                              XlaOp tol, int64 n) {
+  XlaBuilder* builder = jacobi_update.w.builder();
+  TF_ASSIGN_OR_RETURN(
+      SymmetricSchurDecomposition schur,
+      SymmetricShurDecomposition2x2(jacobi_update.w, p, q, tol));
+
+  TF_ASSIGN_OR_RETURN(Shape w_shape, builder->GetShape(jacobi_update.w));
+  const std::vector<int64> batch_dims(w_shape.dimensions().begin(),
+                                      w_shape.dimensions().end() - 2);
+  const int64 num_dims = w_shape.rank();
+
+  auto zero = ScalarLike(p, 0);
+
+  XlaOp c = schur.c;
+  XlaOp s = schur.s;
+
+  auto slice_p = DynamicSliceInMinorDims(jacobi_update.w, {p, zero}, {1, n});
+  auto slice_q = DynamicSliceInMinorDims(jacobi_update.w, {q, zero}, {1, n});
+
+  auto slice_p_new = c * slice_p - s * slice_q;
+  auto slice_q_new = s * slice_p + c * slice_q;
+
+  jacobi_update.w =
+      DynamicUpdateSliceInMinorDims(jacobi_update.w, slice_p_new, {p, zero});
+  jacobi_update.w =
+      DynamicUpdateSliceInMinorDims(jacobi_update.w, slice_q_new, {q, zero});
+
+  slice_p = DynamicSliceInMinorDims(jacobi_update.w, {zero, p}, {n, 1});
+  slice_q = DynamicSliceInMinorDims(jacobi_update.w, {zero, q}, {n, 1});
+
+  slice_p_new = c * slice_p - s * slice_q;
+  slice_q_new = s * slice_p + c * slice_q;
+
+  jacobi_update.w =
+      DynamicUpdateSliceInMinorDims(jacobi_update.w, slice_p_new, {zero, p});
+  jacobi_update.w =
+      DynamicUpdateSliceInMinorDims(jacobi_update.w, slice_q_new, {zero, q});
+
+  // Zero out a_{pq} explicitly.
+  std::vector<int64> pq_dims(batch_dims.begin(), batch_dims.end());
+  pq_dims.push_back(1);
+  pq_dims.push_back(1);
+  auto pq_zero = ScalarLike(jacobi_update.w, 0.0);
+  auto pq_zeros = Broadcast(pq_zero, pq_dims);
+  jacobi_update.w =
+      DynamicUpdateSliceInMinorDims(jacobi_update.w, pq_zeros, {p, q});
+  jacobi_update.w =
+      DynamicUpdateSliceInMinorDims(jacobi_update.w, pq_zeros, {q, p});
+
+  slice_p = DynamicSliceInMinorDims(jacobi_update.v, {zero, p}, {n, 1});
+  slice_q = DynamicSliceInMinorDims(jacobi_update.v, {zero, q}, {n, 1});
+
+  std::vector<int64> broadcast_dims(batch_dims.size());
+  std::iota(broadcast_dims.begin(), broadcast_dims.end(), 0);
+  broadcast_dims.push_back(num_dims - 1);
+
+  // Renormalize the p-th and q-th columns. This step is redundant if high
+  // precision floats are used, like 64-bit float. But for 32-bit float, it
+  // becomes necessary. This step will not increase the overall complexity.
+  slice_p_new = c * slice_p - s * slice_q;
+  slice_p_new = Mul(
+      slice_p_new,
+      Rsqrt(Reduce(Square(slice_p_new), pq_zero,
+                   CreateScalarAddComputation(w_shape.element_type(), builder),
+                   {num_dims - 2})),
+      broadcast_dims);
+  slice_q_new = s * slice_p + c * slice_q;
+  slice_q_new = Mul(
+      slice_q_new,
+      Rsqrt(Reduce(Square(slice_q_new), pq_zero,
+                   CreateScalarAddComputation(w_shape.element_type(), builder),
+                   {num_dims - 2})),
+      broadcast_dims);
+
+  jacobi_update.v =
+      DynamicUpdateSliceInMinorDims(jacobi_update.v, slice_p_new, {zero, p});
+  jacobi_update.v =
+      DynamicUpdateSliceInMinorDims(jacobi_update.v, slice_q_new, {zero, q});
+
+  return jacobi_update;
+}
+
+StatusOr<FrobeniusNorms> ComputeFrobeniusNorms(XlaOp w) {
+  XlaBuilder* builder = w.builder();
+  TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(w));
+  const int64 num_dims = shape.rank();
+  auto frobenius_norm =
+      Sqrt(Reduce(Square(w), ScalarLike(w, 0.0),
+                  CreateScalarAddComputation(shape.element_type(), builder),
+                  {num_dims - 2, num_dims - 1}));
+  auto diag = GetMatrixDiagonal(w);
+  auto diag_square =
+      Reduce(Square(diag), ScalarLike(w, 0.0),
+             CreateScalarAddComputation(shape.element_type(), builder),
+             {num_dims - 2});
+
+  FrobeniusNorms frobenius_norms;
+
+  frobenius_norms.off_diagonal_norm =
+      Sqrt(Max(Square(frobenius_norm) - diag_square, ScalarLike(w, 0.0)));
+  frobenius_norms.total_norm = frobenius_norm;
+
+  return frobenius_norms;
+}
+
+StatusOr<std::vector<XlaOp>> WhileLoopFn(
+    absl::Span<const XlaOp> initial_values,  //
+    int matrix_dimension,                    //
+    int max_sweep_updates,                   //
+    PrimitiveType index_type,                //
+    absl::string_view name,                  //
+    XlaBuilder* builder) {
+  auto while_cond_fn = [&](absl::Span<const XlaOp> values,
+                           XlaBuilder* cond_builder) -> StatusOr<XlaOp> {
+    auto k = values[0];
+    auto max_sweeps = ScalarLike(k, max_sweep_updates);
+    auto sweep_update_cond = Gt(max_sweeps, k);
+
+    auto norms = ComputeFrobeniusNorms(values[2]).ValueOrDie();
+    auto tol = norms.total_norm * values[3];
+    auto tol_cond = ReduceAll(Lt(tol, norms.off_diagonal_norm),
+                              xla::ConstantR0<bool>(cond_builder, false),
+                              CreateScalarOrComputation(PRED, cond_builder));
+
+    return And(sweep_update_cond, tol_cond);
+  };
+
+  auto while_body_fn =
+      [&](absl::Span<const XlaOp> values,
+          XlaBuilder* body_builder) -> StatusOr<std::vector<XlaOp>> {
+    auto while_cond_fn_inner =
+        [&](absl::Span<const XlaOp> values_inner,
+            XlaBuilder* inner_cond_builder) -> StatusOr<XlaOp> {
+      auto p = values_inner[0];
+      return Lt(p, ScalarLike(p, matrix_dimension - 1));
+    };
+
+    auto while_body_fn_inner =
+        [&](absl::Span<const XlaOp> values_inner,
+            XlaBuilder* inner_body_builder) -> StatusOr<std::vector<XlaOp>> {
+      auto while_cond_fn_innermost =
+          [&](absl::Span<const XlaOp> values_innermost,
+              XlaBuilder* innermost_cond_builder) -> StatusOr<XlaOp> {
+        auto q = values_innermost[1];
+        return Lt(q, ScalarLike(q, matrix_dimension));
+      };
+      auto while_body_fn_innermost =
+          [&](absl::Span<const XlaOp> values_innermost,
+              XlaBuilder* innermost_body_builder)
+          -> StatusOr<std::vector<XlaOp>> {
+        auto p = values_innermost[0];
+        auto q = values_innermost[1];
+
+        JacobiUpdate jacobi_update;
+        jacobi_update.v = values_innermost[2];
+        jacobi_update.w = values_innermost[3];
+
+        auto tol = values_innermost[4];
+
+        TF_ASSIGN_OR_RETURN(jacobi_update,
+                            Update(jacobi_update, p, q, tol, matrix_dimension));
+
+        std::vector<XlaOp> updated_values_innermost;
+        updated_values_innermost.reserve(values_innermost.size());
+
+        updated_values_innermost.push_back(p);
+        updated_values_innermost.push_back(q + ScalarLike(q, 1));
+        updated_values_innermost.push_back(jacobi_update.v);
+        updated_values_innermost.push_back(jacobi_update.w);
+        updated_values_innermost.push_back(tol);
+
+        return updated_values_innermost;
+      };
+
+      std::vector<XlaOp> values_innermost(5);
+      auto p = values_inner[0];
+      auto q = p + ScalarLike(p, 1);
+      values_innermost[0] = p;                // index p.
+      values_innermost[1] = q;                // index q.
+      values_innermost[2] = values_inner[1];  // v.
+      values_innermost[3] = values_inner[2];  // w.
+      values_innermost[4] = values_inner[3];  // tol.
+      TF_ASSIGN_OR_RETURN(
+          values_innermost,
+          WhileLoopHelper(while_cond_fn_innermost, while_body_fn_innermost,
+                          values_innermost, absl::StrCat(name, "-Innermost"),
+                          inner_body_builder));
+
+      std::vector<XlaOp> updated_values_inner;
+      updated_values_inner.reserve(values_inner.size());
+
+      updated_values_inner.push_back(p + ScalarLike(p, 1));
+      updated_values_inner.push_back(values_innermost[2]);
+      updated_values_inner.push_back(values_innermost[3]);
+      updated_values_inner.push_back(values_innermost[4]);
+      return updated_values_inner;
+    };
+    // Indexes.
+    XlaOp k = values[0];
+
+    std::vector<XlaOp> values_inner(4);
+    values_inner[0] = ScalarLike(k, 0);  // index p.
+    values_inner[1] = values[1];         // v.
+    values_inner[2] = values[2];         // w.
+    values_inner[3] = values[3];         // tol.
+    TF_ASSIGN_OR_RETURN(
+        values_inner,
+        WhileLoopHelper(while_cond_fn_inner, while_body_fn_inner, values_inner,
+                        absl::StrCat(name, "-Inner"), body_builder));
+
+    std::vector<XlaOp> updated_values;
+    updated_values.reserve(values_inner.size());
+
+    updated_values.push_back(k + ScalarLike(k, 1));
+    updated_values.push_back(values_inner[1]);
+    updated_values.push_back(values_inner[2]);
+    updated_values.push_back(values_inner[3]);
+
+    return updated_values;
+  };
+  std::vector<XlaOp> values;
+  TF_ASSIGN_OR_RETURN(values, WhileLoopHelper(while_cond_fn, while_body_fn,
+                                              initial_values, name, builder));
+
+  return values;
+}
+
+StatusOr<SelfAdjointEigResult> SortByEigenvalues(SelfAdjointEigResult result) {
+  XlaBuilder* builder = result.v.builder();
+  TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(result.v));
+  const int64 num_dims = shape.rank();
+  auto dimensions = shape.dimensions();
+
+  std::vector<int64> broadcast_dims(num_dims - 1);
+  std::iota(broadcast_dims.begin(), broadcast_dims.end(), 0);
+  broadcast_dims[num_dims - 2] = num_dims - 1;
+  result.w = BroadcastInDim(result.w, dimensions, broadcast_dims);
+
+  XlaOp sort_result =
+      Sort({result.w, result.v},
+           CreateScalarLtComputation(
+               {shape.element_type(), shape.element_type()}, builder),
+           num_dims - 1);
+  result.w = GetMatrixDiagonal(GetTupleElement(sort_result, 0));
+  result.v = GetTupleElement(sort_result, 1);
+  return result;
+}
+
+}  // namespace
+
+// This is the cyclic Jacobi iteration. Please note that the eigenvalues are
+// possibly not ordered.
+//
+//  def jacobi(A):
+//      n, _ = A.shape
+//      V = np.eye(n)
+//      frobenius_norm = np.linalg.norm(A)
+//      diag_norm = np.linalg.norm(np.diag(A))
+//      off_diag_norm = np.sqrt(
+//          frobenius_norm - diag_norm) * np.sqrt(frobenius_norm + diag_norm)
+//      while off_diag_norm > 1e-6 * frobenius_norm:
+//          for p in range(n - 1):
+//              for q in range(p + 1, n):
+//                  c, s = sym_schur2x2(A, p, q)
+//                  A[[p, q], :] = np.matmul(np.array([[c, -s], [s, c]]),
+//                                           A[[p, q], :])
+//                  A[:, [p, q]] = np.matmul(A[:, [p, q]],
+//                                           np.array([[c, s], [-s, c]]))
+//                  V[:, [p, q]] = np.matmul(V[:, [p, q]],
+//                                               np.array([[c, s], [-s, c]]))
+//          frobenius_norm_sq = np.linalg.norm(A)
+//          diag_square_sum = np.linalg.norm(np.diag(A))
+//          off_diag_norm = np.sqrt(
+//              frobenius_norm - diag_norm) * np.sqrt(
+//                  frobenius_norm + diag_norm)
+//
+//      return A, V
+//
+// TODO(kuny): Implement parallel order Jacobi.
+//
+SelfAdjointEigResult SelfAdjointEig(XlaOp a, bool lower, int64 max_iter,
+                                    float epsilon) {
+  XlaBuilder* builder = a.builder();
+  auto return_error = [&](const Status& status) {
+    SelfAdjointEigResult result;
+    result.v = builder->ReportError(status);
+    result.w = builder->ReportError(status);
+    return result;
+  };
+  auto shape_with_status = builder->GetShape(a);
+  if (!shape_with_status.status().ok()) {
+    return return_error(shape_with_status.status());
+  }
+  Shape a_shape = shape_with_status.ValueOrDie();
+  const int64 num_dims = a_shape.rank();
+  if (num_dims < 2) {
+    return return_error(InvalidArgument(
+        "Arguments to Eigen decomposition must have rank >= 2: got shape %s.",
+        a_shape.ToString()));
+  }
+  PrimitiveType type = a_shape.element_type();
+  if (!primitive_util::IsFloatingPointType(type)) {
+    return return_error(InvalidArgument(
+        "Type of the input matrix must be float: got %s.", a_shape.ToString()));
+  }
+
+  const int64 m = ShapeUtil::GetDimension(a_shape, -2);
+  const int64 n = ShapeUtil::GetDimension(a_shape, -1);
+
+  if (m != n) {
+    return return_error(InvalidArgument(
+        "Arguments to Eigen decomposition must be square matrices: got shape "
+        "(%d, %d).",
+        m, n));
+  }
+
+  const int64 num_batch_dims = num_dims - 2;
+  std::vector<int64> batch_dims(num_batch_dims);
+  for (int i = 0; i < num_batch_dims; ++i) {
+    batch_dims[i] = ShapeUtil::GetDimension(a_shape, i);
+  }
+
+  auto tol = ScalarLike(a, epsilon);
+
+  auto v_init = Broadcast(IdentityMatrix(builder, type, m, m), batch_dims);
+  auto w_init = Triangle(a, lower);
+  w_init = w_init + TransposeInMinorDims(w_init) - w_init * v_init;
+
+  auto output_with_status = WhileLoopFn(
+      {
+          Zero(builder, S32),  // k
+          v_init,              // v
+          w_init,              // w
+          tol,                 //
+      },                       //
+      n,                       //
+      max_iter,                //
+      S32,                     //
+      "CyclicJacobi",          //
+      builder);
+  if (!output_with_status.status().ok()) {
+    return return_error(output_with_status.status());
+  }
+
+  auto output = output_with_status.ValueOrDie();
+
+  SelfAdjointEigResult result;
+  result.v = output[1];
+  result.w = GetMatrixDiagonal(output[2]);
+
+  return SortByEigenvalues(result).ValueOrDie();
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/self_adjoint_eig.h b/tensorflow/compiler/xla/client/lib/self_adjoint_eig.h
new file mode 100644
index 0000000000000000000000000000000000000000..2a089891d6a2d80c0c265a3310539b4f1c5db4d5
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/self_adjoint_eig.h
@@ -0,0 +1,40 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_SELF_ADJOINT_EIG_H_
+#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_SELF_ADJOINT_EIG_H_
+
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace xla {
+
+// The eigenvalue decomposition of a symmetric matrix, the original matrix is
+// recovered by v * w * v_t.
+struct SelfAdjointEigResult {
+  // The i-th column is the normalized eigenvector corresponding to the
+  // eigenvalue w[i]. Will return a matrix object if a is a matrix object.
+  XlaOp v;
+  // The eigenvalues in ascending order, each repeated according to its
+  // multiplicity.
+  XlaOp w;
+};
+
+SelfAdjointEigResult SelfAdjointEig(XlaOp a, bool lower = true,
+                                    int64 max_iter = 100, float epsilon = 1e-6);
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_SELF_ADJOINT_EIG_H_
diff --git a/tensorflow/compiler/xla/client/lib/self_adjoint_eig_test.cc b/tensorflow/compiler/xla/client/lib/self_adjoint_eig_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c8875dff7bfdbd4e133297cef0a6686bfcd9bb6f
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/self_adjoint_eig_test.cc
@@ -0,0 +1,313 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/lib/self_adjoint_eig.h"
+
+#include "tensorflow/compiler/xla/array2d.h"
+#include "tensorflow/compiler/xla/array3d.h"
+#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+
+class SelfAdjointEigTest : public ClientLibraryTestBase {
+ protected:
+  void SetUp() override {
+    ClientLibraryTestBase::SetUp();
+    batch_3d_4x4_ = Array3D<float>{
+        {
+            {4, 6, 8, 10},
+            {6, 45, 54, 63},
+            {8, 54, 146, 166},
+            {10, 63, 166, 310},
+        },
+        {
+            {16, 24, 8, 12},
+            {24, 61, 82, 48},
+            {8, 82, 100, 6},
+            {12, 48, 6, 62},
+        },
+    };
+    matrix2d_8x8_ = Array2D<float>{
+        {14., 123., 49., 112., 115., 173., 182., 125.},
+        {123., 14., 60., 118., 150., 130., 91., 72.},
+        {49., 60., 138., 111., 106., 101., 115., 142.},
+        {112., 118., 111., 142., 91., 130., 25., 61.},
+        {115., 150., 106., 91., 116., 121., 128., 85.},
+        {173., 130., 101., 130., 121., 70., 151., 132.},
+        {182., 91., 115., 25., 128., 151., 66., 92.},
+        {125., 72., 142., 61., 85., 132., 92., 156.},
+    };
+    low_rank_4x4_ = Array2D<float>{
+        // x = [[1, 2, 3, 4], [1, -1, 1, -1]]
+        // matmul(x.T, x)
+        {2, 1, 4, 3},
+        {1, 5, 5, 9},
+        {4, 5, 10, 11},
+        {3, 9, 11, 17},
+    };
+  }
+  void TearDown() override { ClientLibraryTestBase::TearDown(); }
+
+  Array3D<float> GetUnitMatrix3D(const Array3D<float>& matrix) {
+    Array3D<float> result(matrix.n1(), matrix.n2(), matrix.n3(), 0.0);
+    for (int i = 0; i < matrix.n1(); ++i) {
+      for (int j = 0; j < matrix.n2(); ++j) {
+        result({i, j, j}) = 1.0;
+      }
+    }
+    return result;
+  }
+
+  Array3D<float> ExtractTriangularMatrix(const Array3D<float>& matrix,
+                                         bool lower) {
+    Array3D<float> result(matrix);
+    for (int i = 0; i < result.n1(); ++i) {
+      for (int j = 0; j < result.n2(); ++j) {
+        if (lower) {
+          for (int k = j + 1; k < result.n3(); ++k) {
+            result({i, j, k}) = 0.0;
+          }
+        } else {
+          for (int k = 0; k < j; ++k) {
+            result({i, j, k}) = 0.0;
+          }
+        }
+      }
+    }
+    return result;
+  }
+
+  XlaOp ComputeMatmulVWVt(SelfAdjointEigResult result, XlaBuilder* builder) {
+    Shape shape = builder->GetShape(result.v).ValueOrDie();
+    std::vector<int64> out_dims = shape.dimensions();
+    std::vector<int64> broadcast_dims(shape.rank() - 1);
+    std::iota(broadcast_dims.begin(), broadcast_dims.end(), 0);
+
+    broadcast_dims[shape.rank() - 2] = shape.rank() - 1;
+    auto vw = Mul(result.v, BroadcastInDim(result.w, out_dims, broadcast_dims));
+    return BatchDot(vw, TransposeInMinorDims(result.v),
+                    PrecisionConfig::HIGHEST);
+  }
+
+  XlaOp GetAverageAbsoluteError(XlaOp m1, XlaOp m2, XlaBuilder* builder) {
+    Shape shape = builder->GetShape(m1).ValueOrDie();
+    int64 size = 1;
+    for (auto d : shape.dimensions()) {
+      size *= d;
+    }
+    return ReduceAll(Abs(m1 - m2), ConstantR0WithType(builder, F32, 0),
+                     CreateScalarAddComputation(F32, builder)) /
+           ConstantR0WithType(builder, F32, size);
+  }
+
+  Array2D<float> GenerateRandomSymmetricMatrix(int size) {
+    Array2D<float> result{size, size, 0.0};
+    result.FillRandom(10 /* stddev */, 2 /* mean */);
+    for (int i = 0; i < size; ++i) {
+      for (int j = 0; j < i; ++j) {
+        result({j, i}) = result({i, j});
+      }
+    }
+    return result;
+  }
+
+  Array3D<float> batch_3d_4x4_;
+  Array2D<float> matrix2d_8x8_;
+  Array2D<float> low_rank_4x4_;
+  Array2D<int> wrong_type_4x4_;
+};
+
+XLA_TEST_F(SelfAdjointEigTest, Test_VWVt_EQ_A_2x4x4) {
+  XlaBuilder builder(TestName());
+
+  XlaOp a;
+  auto a_data = CreateR3Parameter<float>(batch_3d_4x4_, 0, "a", &builder, &a);
+  auto result = SelfAdjointEig(a);
+  ComputeMatmulVWVt(result, &builder);
+
+  ComputeAndCompareR3<float>(&builder, batch_3d_4x4_, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SelfAdjointEigTest, Test_VWVt_EQ_A_Lower_2x4x4) {
+  XlaBuilder builder(TestName());
+
+  XlaOp a;
+  auto a_data = CreateR3Parameter<float>(
+      ExtractTriangularMatrix(batch_3d_4x4_, true), 0, "a", &builder, &a);
+  auto result = SelfAdjointEig(a);
+  ComputeMatmulVWVt(result, &builder);
+
+  ComputeAndCompareR3<float>(&builder, batch_3d_4x4_, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SelfAdjointEigTest, Test_VWVt_EQ_A_Upper_2x4x4) {
+  XlaBuilder builder(TestName());
+
+  XlaOp a;
+  auto a_data = CreateR3Parameter<float>(
+      ExtractTriangularMatrix(batch_3d_4x4_, false), 0, "a", &builder, &a);
+  auto result = SelfAdjointEig(a, false);
+  ComputeMatmulVWVt(result, &builder);
+
+  ComputeAndCompareR3<float>(&builder, batch_3d_4x4_, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SelfAdjointEigTest, Test_Orthogonality_2x4x4) {
+  XlaBuilder builder(TestName());
+
+  XlaOp a;
+  auto a_data = CreateR3Parameter<float>(batch_3d_4x4_, 0, "a", &builder, &a);
+  auto result = SelfAdjointEig(a);
+  BatchDot(result.v, TransposeInMinorDims(result.v), PrecisionConfig::HIGHEST);
+
+  ComputeAndCompareR3<float>(&builder, GetUnitMatrix3D(batch_3d_4x4_),
+                             {a_data.get()}, ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SelfAdjointEigTest, Test_VtWV_EQ_A_Rank_Deficient_4x4) {
+  XlaBuilder builder(TestName());
+
+  XlaOp a;
+  auto a_data = CreateR2Parameter<float>(low_rank_4x4_, 0, "a", &builder, &a);
+  auto result = SelfAdjointEig(a);
+  ComputeMatmulVWVt(result, &builder);
+
+  ComputeAndCompareR2<float>(&builder, low_rank_4x4_, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SelfAdjointEigTest, Test_Eigen_8x8) {
+  XlaBuilder builder(TestName());
+
+  // This is computed by numpy.linalg.eigh with float32.
+  std::vector<float> expected{-182.69205, -116.86245, -105.74489, -9.545369,
+                              37.81711,   104.732285, 120.29153,  868.00385};
+
+  XlaOp a;
+  auto a_data = CreateR2Parameter<float>(matrix2d_8x8_, 0, "a", &builder, &a);
+  auto result = SelfAdjointEig(a);
+  Add(result.w, ZerosLike(result.w));
+
+  ComputeAndCompareR1<float>(&builder, expected, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SelfAdjointEigTest, Test_Orthogonality_8x8) {
+  XlaBuilder builder(TestName());
+
+  float expected_vals = 1e-3;
+
+  XlaOp a;
+  auto a_data = CreateR2Parameter<float>(matrix2d_8x8_, 0, "a", &builder, &a);
+  auto result = SelfAdjointEig(a);
+  // np.sum(norm(eye(n) - matmul(conj(T(v)), v)) / n**2
+  GetAverageAbsoluteError(IdentityMatrix(&builder, F32, 8, 8),
+                          BatchDot(TransposeInMinorDims(result.v), result.v),
+                          &builder);
+
+  ComputeAndCompareR0<float>(&builder, expected_vals, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SelfAdjointEigTest, Wrong_Type_Int) {
+  XlaBuilder builder(TestName());
+
+  XlaOp a;
+  auto a_data = CreateR2Parameter<int>(wrong_type_4x4_, 0, "a", &builder, &a);
+  auto result = SelfAdjointEig(a);
+  EXPECT_FALSE(result.v.valid());
+  EXPECT_FALSE(result.w.valid());
+}
+
+XLA_TEST_F(SelfAdjointEigTest, Various_Size_Random_Matrix_8x8) {
+  XlaBuilder builder(TestName());
+  int size = 8;
+  Array2D<float> a_val = GenerateRandomSymmetricMatrix(size);
+  XlaOp a;
+  auto a_data = CreateR2Parameter<float>(a_val, 0, "a", &builder, &a);
+  auto result = SelfAdjointEig(a);
+  GetAverageAbsoluteError(ComputeMatmulVWVt(result, &builder), a, &builder);
+
+  ComputeAndCompareR0<float>(&builder, 1e-3, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SelfAdjointEigTest, Various_Size_Random_Matrix_16x16) {
+  XlaBuilder builder(TestName());
+  int size = 16;
+  Array2D<float> a_val = GenerateRandomSymmetricMatrix(size);
+  XlaOp a;
+  auto a_data = CreateR2Parameter<float>(a_val, 0, "a", &builder, &a);
+  auto result = SelfAdjointEig(a);
+  GetAverageAbsoluteError(ComputeMatmulVWVt(result, &builder), a, &builder);
+
+  ComputeAndCompareR0<float>(&builder, 1e-3, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SelfAdjointEigTest, Various_Size_Random_Matrix_32x32) {
+  XlaBuilder builder(TestName());
+  int size = 32;
+  Array2D<float> a_val = GenerateRandomSymmetricMatrix(size);
+  XlaOp a;
+  auto a_data = CreateR2Parameter<float>(a_val, 0, "a", &builder, &a);
+  auto result = SelfAdjointEig(a);
+  GetAverageAbsoluteError(ComputeMatmulVWVt(result, &builder), a, &builder);
+
+  ComputeAndCompareR0<float>(&builder, 1e-3, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SelfAdjointEigTest, Various_Size_Random_Matrix_256x256) {
+  XlaBuilder builder(TestName());
+  int size = 256;
+  Array2D<float> a_val = GenerateRandomSymmetricMatrix(size);
+  XlaOp a;
+  auto a_data = CreateR2Parameter<float>(a_val, 0, "a", &builder, &a);
+  auto result = SelfAdjointEig(a);
+  GetAverageAbsoluteError(ComputeMatmulVWVt(result, &builder), a, &builder);
+
+  ComputeAndCompareR0<float>(&builder, 1e-3, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+XLA_TEST_F(SelfAdjointEigTest, Various_Size_Random_Matrix_512x512) {
+  XlaBuilder builder(TestName());
+  int size = 512;
+  Array2D<float> a_val = GenerateRandomSymmetricMatrix(size);
+  XlaOp a;
+  auto a_data = CreateR2Parameter<float>(a_val, 0, "a", &builder, &a);
+  auto result = SelfAdjointEig(a);
+  GetAverageAbsoluteError(ComputeMatmulVWVt(result, &builder), a, &builder);
+
+  ComputeAndCompareR0<float>(&builder, 1e-3, {a_data.get()},
+                             ErrorSpec(1e-3, 1e-3));
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/slicing.cc b/tensorflow/compiler/xla/client/lib/slicing.cc
index f8c7df3ff5189c817202eaf39adb572f7e232ec2..d7b33c5af25606c4e7e443027b913f7ca13a013c 100644
--- a/tensorflow/compiler/xla/client/lib/slicing.cc
+++ b/tensorflow/compiler/xla/client/lib/slicing.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/client/lib/slicing.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 
 namespace xla {
 
@@ -26,7 +27,7 @@ XlaOp SliceInMinorDims(XlaOp x, absl::Span<const int64> start,
 
     TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
 
-    const int64 n_dims = ShapeUtil::Rank(shape);
+    const int64 n_dims = shape.rank();
     TF_RET_CHECK(n_minor_dims <= n_dims);
     auto major_dims = AsInt64Slice(shape.dimensions())
                           .subspan(
@@ -51,17 +52,17 @@ XlaOp SliceInMinorDims(XlaOp x, absl::Span<const int64> start,
 XlaOp UpdateSlice(XlaOp x, XlaOp update, absl::Span<const int64> start) {
   XlaBuilder* builder = x.builder();
   return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
+    const int64 n_dims = shape.rank();
+    TF_RET_CHECK(start.size() == n_dims);
+
     // TODO(phawkins): make int64 work on all backends, remove the int32 cast.
     std::vector<int32> start_as_int32(start.begin(), start.end());
-    auto start_constant = ConstantR1<int32>(builder, start_as_int32);
-    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
-    const int64 n_dims = ShapeUtil::Rank(shape);
-    TF_ASSIGN_OR_RETURN(Shape start_constant_shape,
-                        builder->GetShape(start_constant));
-    const int64 start_length =
-        ShapeUtil::GetDimension(start_constant_shape, -1);
-    TF_RET_CHECK(start_length == n_dims);
-    return DynamicUpdateSlice(x, update, start_constant);
+    std::vector<XlaOp> start_ops(start.size());
+    for (int i = 0; i < start.size(); ++i) {
+      start_ops[i] = ConstantR0(builder, start_as_int32[i]);
+    }
+    return DynamicUpdateSlice(x, update, start_ops);
   });
 }
 
@@ -70,7 +71,7 @@ XlaOp UpdateSliceInMinorDims(XlaOp x, XlaOp update,
   XlaBuilder* builder = x.builder();
   return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
-    const int64 n_dims = ShapeUtil::Rank(shape);
+    const int64 n_dims = shape.rank();
     const int64 n_minor_dims = start.size();
     TF_RET_CHECK(n_minor_dims <= n_dims);
     std::vector<int64> padded_start(n_dims, 0);
@@ -90,18 +91,17 @@ std::vector<int64> ConcatVectors(absl::Span<const int64> xs,
   return output;
 }
 
-XlaOp PrependZerosInMajorDims(XlaOp x, absl::Span<const XlaOp> starts) {
+StatusOr<std::vector<XlaOp>> PrependZerosInMajorDims(
+    XlaOp x, absl::Span<const XlaOp> starts) {
   XlaBuilder* builder = x.builder();
-  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
-    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
-    const int64 n_dims = ShapeUtil::Rank(shape);
-    auto zero = Reshape(ConstantR0<int32>(builder, 0), {1});
-    std::vector<XlaOp> padded_starts(n_dims, zero);
-    for (int i = 0; i < starts.size(); ++i) {
-      padded_starts[n_dims - starts.size() + i] = Reshape(starts[i], {1});
-    }
-    return ConcatInDim(builder, padded_starts, 0);
-  });
+  TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
+  const int64 n_dims = shape.rank();
+  auto zero = ConstantR0<int32>(builder, 0);
+  std::vector<XlaOp> padded_starts(n_dims, zero);
+  for (int i = 0; i < starts.size(); ++i) {
+    padded_starts[n_dims - starts.size() + i] = starts[i];
+  }
+  return padded_starts;
 }
 
 }  // namespace
@@ -111,7 +111,7 @@ XlaOp DynamicSliceInMinorDims(XlaOp x, absl::Span<const XlaOp> starts,
   XlaBuilder* builder = x.builder();
   return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
-    const int64 n_dims = ShapeUtil::Rank(shape);
+    const int64 n_dims = shape.rank();
     int64 n_minor_dims = starts.size();
     TF_RET_CHECK(n_minor_dims == sizes.size());
     TF_RET_CHECK(n_minor_dims <= n_dims);
@@ -119,7 +119,7 @@ XlaOp DynamicSliceInMinorDims(XlaOp x, absl::Span<const XlaOp> starts,
                           .subspan(
                               /*pos=*/0,
                               /*len=*/n_dims - sizes.size());
-    auto padded_starts = PrependZerosInMajorDims(x, starts);
+    TF_ASSIGN_OR_RETURN(auto padded_starts, PrependZerosInMajorDims(x, starts));
     auto padded_sizes = ConcatVectors(major_dims, sizes);
     return DynamicSlice(x, padded_starts, padded_sizes);
   });
@@ -127,8 +127,38 @@ XlaOp DynamicSliceInMinorDims(XlaOp x, absl::Span<const XlaOp> starts,
 
 XlaOp DynamicUpdateSliceInMinorDims(XlaOp x, XlaOp update,
                                     absl::Span<const XlaOp> starts) {
-  auto padded_starts = PrependZerosInMajorDims(x, starts);
-  return DynamicUpdateSlice(x, update, padded_starts);
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(auto padded_starts, PrependZerosInMajorDims(x, starts));
+    return DynamicUpdateSlice(x, update, padded_starts);
+  });
+}
+
+XlaOp TorchGather(XlaOp input, XlaOp index, int64 dim) {
+  XlaBuilder* builder = input.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape index_shape, builder->GetShape(index));
+    ShapeUtil::AppendMajorDimension(1, &index_shape);
+    std::vector<XlaOp> to_concat;
+    TF_ASSIGN_OR_RETURN(Shape input_shape, builder->GetShape(input));
+    to_concat.reserve(input_shape.rank());
+    for (int64 i = 0; i < input_shape.rank(); ++i) {
+      if (i == dim) {
+        to_concat.push_back(Reshape(index, index_shape.dimensions()));
+      } else {
+        to_concat.push_back(Iota(builder, index_shape, i));
+      }
+    }
+    XlaOp gather_indices = ConcatInDim(builder, to_concat, input_shape.rank());
+    std::vector<int64> slice_sizes(input_shape.rank(), 1);
+    GatherDimensionNumbers gather_dnums;
+    gather_dnums.set_index_vector_dim(input_shape.rank());
+    for (int64 i = 0; i < input_shape.rank(); ++i) {
+      gather_dnums.add_collapsed_slice_dims(i);
+      gather_dnums.add_start_index_map(i);
+    }
+    return Gather(input, gather_indices, gather_dnums, slice_sizes);
+  });
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/slicing.h b/tensorflow/compiler/xla/client/lib/slicing.h
index 6c482a38b5489c9fb17c3dca9ee3d2a1b8fd1890..69f98a6f43fa167adf6f77b28645a3460b292633 100644
--- a/tensorflow/compiler/xla/client/lib/slicing.h
+++ b/tensorflow/compiler/xla/client/lib/slicing.h
@@ -43,6 +43,20 @@ XlaOp DynamicSliceInMinorDims(XlaOp x, absl::Span<const XlaOp> starts,
 XlaOp DynamicUpdateSliceInMinorDims(XlaOp x, XlaOp update,
                                     absl::Span<const XlaOp> starts);
 
+// Gathers values along an axis specified by dim.
+//
+// For a 3-D tensor the output is specified by:
+//
+// out[i][j][k] = input[index[i][j][k]][j][k]  # if dim == 0
+// out[i][j][k] = input[i][index[i][j][k]][k]  # if dim == 1
+// out[i][j][k] = input[i][j][index[i][j][k]]  # if dim == 2
+//
+// If `input` is an n-dimensional tensor with size
+// [X0,X1,X2,..XN] and dim = i `index` must be an n-dimensional tensor with size
+// [X0,X1,...Y,Xi+1,...,X[N] where y >= 1 and `out` will have the same sizes as
+// `index`.
+XlaOp TorchGather(XlaOp input, XlaOp index, int64 dim);
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_SLICING_H_
diff --git a/tensorflow/compiler/xla/client/lib/slicing_test.cc b/tensorflow/compiler/xla/client/lib/slicing_test.cc
index 8d362119e01006555db0f82d02626175936e1d05..db6ebb9df18372260a64a3e9fd17b0c30b35667d 100644
--- a/tensorflow/compiler/xla/client/lib/slicing_test.cc
+++ b/tensorflow/compiler/xla/client/lib/slicing_test.cc
@@ -102,5 +102,18 @@ XLA_TEST_F(SlicingTest, SimpleSliceUpdate) {
       {a_data.get(), b_data.get(), x_data.get(), y_data.get()});
 }
 
+XLA_TEST_F(SlicingTest, TorchGather) {
+  xla::XlaBuilder builder(TestName());
+
+  xla::XlaOp input, index;
+  auto input_data =
+      CreateR2Parameter<int>({{1, 2}, {3, 4}}, 0, "input", &builder, &input);
+  auto index_data =
+      CreateR2Parameter<int>({{0, 0}, {1, 0}}, 1, "index", &builder, &index);
+  TorchGather(input, index, 1);
+
+  ComputeAndCompareR2<int>(&builder, {{1, 1}, {4, 3}},
+                           {input_data.get(), index_data.get()});
+}
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/sorting.cc b/tensorflow/compiler/xla/client/lib/sorting.cc
index e8553a08bb014e790822a14e128686b60b8d6b7c..ddc39f4d874cd3613a763b969091e7e65ff1c783 100644
--- a/tensorflow/compiler/xla/client/lib/sorting.cc
+++ b/tensorflow/compiler/xla/client/lib/sorting.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/client/lib/sorting.h"
+#include "tensorflow/compiler/xla/client/lib/comparators.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/util.h"
@@ -30,7 +31,13 @@ XlaOp TopK(XlaOp input, int64 k) {
         ShapeUtil::MakeShape(S32, AsInt64Slice(input_shape.dimensions()));
     XlaOp iota_s32 = Iota(builder, iota_shape, last_dim);
     auto input_dims = input_shape.dimensions();
-    XlaOp sort_result = Sort(Neg(input), {iota_s32});
+    // TODO(b/122298745): Get rid of Neg() and use CreateScalarGtComputation
+    // once the TPU backend supports the comparison computations.
+    XlaOp sort_result =
+        Sort({Neg(input), iota_s32},
+             CreateScalarLtComputation({input_shape.element_type(), S32},
+                                       iota_s32.builder()),
+             last_dim, /*is_stable=*/true);
     std::vector<int64> start_indices(input_shape.dimensions_size(), 0);
     std::vector<int64> limit_indices(input_dims.begin(), input_dims.end());
     limit_indices[last_dim] = k;
diff --git a/tensorflow/compiler/xla/client/lib/sorting_test.cc b/tensorflow/compiler/xla/client/lib/sorting_test.cc
index 27ff36c7491ab8397d46f3a49493ff2b904deb2d..0fbd138aca1e86f219d0459086fc09d20844f135 100644
--- a/tensorflow/compiler/xla/client/lib/sorting_test.cc
+++ b/tensorflow/compiler/xla/client/lib/sorting_test.cc
@@ -77,7 +77,7 @@ XLA_TEST_F(SortingTest, TopKFullSort) {
   auto x = ConstantR1<float>(&builder, inputs);
   xla::GetTupleElement(xla::TopK(x, kSize), 0);
 
-  std::sort(inputs.begin(), inputs.end(), std::greater<float>());
+  absl::c_sort(inputs, std::greater<float>());
   ComputeAndCompareR1<float>(&builder, inputs, {});
 }
 
diff --git a/tensorflow/compiler/xla/client/lib/testing.cc b/tensorflow/compiler/xla/client/lib/testing.cc
index a95bbf2c8c860914877d3195b97342097dafc725..9f520bcdadfabc8ca9f9ee82b20804fd2c50d1db 100644
--- a/tensorflow/compiler/xla/client/lib/testing.cc
+++ b/tensorflow/compiler/xla/client/lib/testing.cc
@@ -34,7 +34,7 @@ namespace {
 // specified shape. In case of a (nested) tuple shape this is the total byte
 // size of all sub-shapes within the tuple.
 int64 DataSizeOfShape(const Shape& shape) {
-  if (ShapeUtil::IsArray(shape)) {
+  if (shape.IsArray()) {
     return ShapeUtil::ByteSizeOf(shape);
   }
 
@@ -47,7 +47,7 @@ int64 DataSizeOfShape(const Shape& shape) {
 
 // Creates a XlaOp for an op what generates fake data with the given shape.
 XlaOp BuildFakeDataOpOnDevice(const Shape& shape, XlaBuilder* builder) {
-  if (ShapeUtil::IsArray(shape)) {
+  if (shape.IsArray()) {
     return Broadcast(
         ConstantLiteral(builder, LiteralUtil::One(shape.element_type())),
         AsInt64Slice(shape.dimensions()));
@@ -59,22 +59,25 @@ XlaOp BuildFakeDataOpOnDevice(const Shape& shape, XlaBuilder* builder) {
   return Tuple(builder, parts);
 }
 
-std::unique_ptr<GlobalData> MakeFakeDataViaDeviceOrDie(const Shape& shape,
-                                                       Client* client) {
+std::unique_ptr<GlobalData> MakeFakeDataViaDeviceOrDie(
+    const Shape& shape, Client* client, DebugOptions* debug_opts) {
   XlaBuilder b(absl::StrCat("make_fake_", ShapeUtil::HumanString(shape)));
   BuildFakeDataOpOnDevice(shape, &b);
   XlaComputation computation = b.Build().ConsumeValueOrDie();
 
   auto execution_options = CreateDefaultExecutionOptions();
   *execution_options.mutable_shape_with_output_layout() = shape.ToProto();
+  if (debug_opts) {
+    *execution_options.mutable_debug_options() = *debug_opts;
+  }
   return client->Execute(computation, /*arguments=*/{}, &execution_options)
       .ConsumeValueOrDie();
 }
 
 }  // namespace
 
-std::unique_ptr<GlobalData> MakeFakeDataOrDie(const Shape& shape,
-                                              Client* client) {
+std::unique_ptr<GlobalData> MakeFakeDataOrDie(
+    const Shape& shape, Client* client, DebugOptions* debug_opts /*=nullptr*/) {
   if (DataSizeOfShape(shape) < (1LL << 20)) {
     StatusOr<Literal> literal_status = MakeFakeLiteral(shape);
     if (!literal_status.ok()) {
@@ -82,24 +85,25 @@ std::unique_ptr<GlobalData> MakeFakeDataOrDie(const Shape& shape,
       // an on-device computation.
       CHECK_EQ(literal_status.status().code(),
                tensorflow::error::UNIMPLEMENTED);
-      return MakeFakeDataViaDeviceOrDie(shape, client);
+      return MakeFakeDataViaDeviceOrDie(shape, client, debug_opts);
     }
     return client->TransferToServer(literal_status.ValueOrDie()).ValueOrDie();
   }
 
   // If the data is large, generate it on-device.
-  return MakeFakeDataViaDeviceOrDie(shape, client);
+  return MakeFakeDataViaDeviceOrDie(shape, client, debug_opts);
 }
 
 std::vector<std::unique_ptr<GlobalData>> MakeFakeArgumentsOrDie(
-    const XlaComputation& computation, Client* client) {
+    const XlaComputation& computation, Client* client,
+    DebugOptions* debug_opts /*=nullptr*/) {
   CHECK(computation.proto().has_host_program_shape())
       << "Computation should have progran shape.";
   auto program_shape = computation.proto().host_program_shape();
 
   std::vector<std::unique_ptr<GlobalData>> results;
   for (const ShapeProto& shape : program_shape.parameters()) {
-    results.push_back(MakeFakeDataOrDie(Shape(shape), client));
+    results.push_back(MakeFakeDataOrDie(Shape(shape), client, debug_opts));
   }
   return results;
 }
diff --git a/tensorflow/compiler/xla/client/lib/testing.h b/tensorflow/compiler/xla/client/lib/testing.h
index 03695ce2a339735e3e49522f4fe1bbf2d83a3834..428fa3e93d1b46983aae60176e7c2242d2552fdb 100644
--- a/tensorflow/compiler/xla/client/lib/testing.h
+++ b/tensorflow/compiler/xla/client/lib/testing.h
@@ -29,14 +29,19 @@ namespace xla {
 // Generates fake data of the given shape on the device or dies. The fake data
 // is created by performing a computation on the device rather than transferring
 // data from the host to the device.
-std::unique_ptr<GlobalData> MakeFakeDataOrDie(const Shape& shape,
-                                              Client* client);
+//
+// The optional DebugOptions are used when generating fake data on the device.
+std::unique_ptr<GlobalData> MakeFakeDataOrDie(
+    const Shape& shape, Client* client, DebugOptions* debug_opts = nullptr);
 
 // Returns vector of GlobalData handles of fake data (created using
 // MakeFakeDataOrDie) that are correctly shaped arguments for the given
 // xla computation.
+//
+// The optional DebugOptions are used when generating fake data on the device.
 std::vector<std::unique_ptr<GlobalData>> MakeFakeArgumentsOrDie(
-    const XlaComputation& computation, Client* client);
+    const XlaComputation& computation, Client* client,
+    DebugOptions* debug_opts = nullptr);
 
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/client/lib/triangular_solve.h b/tensorflow/compiler/xla/client/lib/triangular_solve.h
deleted file mode 100644
index 50a3b30ebd1c15eb6d2ace4e351cb41f21db7093..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/client/lib/triangular_solve.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_TRIANGULAR_SOLVE_H_
-#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_TRIANGULAR_SOLVE_H_
-
-#include "tensorflow/compiler/xla/client/xla_builder.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
-
-namespace xla {
-
-// Solves systems of linear equations with lower or upper triangular coefficient
-// matrices by forward- or back-substitution. Broadcasting along leading
-// dimensions, this routine solves one of the matrix systems
-//   `op(a) * x = b`,  or `x * op(a) = b`,
-// for the variable `x` given `a` and `b`, where `op(a)` is either
-//   `op(a) = a`,  or `op(a) = transpose(a)`,  or `op(a) = conj(transpose(a))`.
-// That is, the innermost matrices in the output satisfy a scalar system
-// depending on the value of the value of (left_side, transpose_a, conjugate_a)
-// according to:
-//   (F, F, F) => `output[..., i, k]  a[..., k, j] = b[..., i, j]`,
-//   (F, F, T) => `output[..., i, k] a*[..., k, j] = b[..., i, j]`,
-//   (F, T, F) => `output[..., i, k]  a[..., j, k] = b[..., i, j]`,
-//   (F, T, T) => `output[..., i, k] a*[..., j, k] = b[..., i, j]`,
-//   (T, F, F) => ` a[..., i, k] output[..., k, j] = b[..., i, j]`,
-//   (T, F, T) => `a*[..., i, k] output[..., k, j] = b[..., i, j]`,
-//   (T, T, F) => ` a[..., i, k] output[..., j, k] = b[..., i, j]`,
-//   (T, T, T) => `a*[..., i, k] output[..., j, k] = b[..., i, j]`,
-// where * denotes complex conjugation and where the index `k` is summed over.
-//
-// `a` is a tensor of shape `[..., M, M]` whose innermost 2 dimensions form
-// square matrices. If lower is true (false), then the strictly upper (lower)
-// triangular part of each innermost matrix in `a` is assumed to be zero and is
-// not accessed.
-// `b` is a tensor of shape `[..., M, K]` if left_side is true, otherwise a
-// tensor of shape `[..., K, M]`.
-// `left_side` is a boolean, indicating whether to solve a system of the form
-// op(a) * x = b (true) or x * op(a) = b (false).
-// `lower` is a boolean, indicating whether the argument `a` is lower-triangular
-// (true) or upper-triangular (false).
-// `transpose_a` is a boolean indicating whether the matrix `a` is transposed.
-// `conjugate_a` is a boolean indicating whether the entries of `a` are complex
-// conjugated (independently of whether they are transposed), so that when both
-// transpose_a and conjugate_a are true the effect is a Hermitian adjoint.
-//
-// Uses a blocked algorithm if `block_size` is > 1; if block_size == 1 then no
-// blocking is used.
-XlaOp TriangularSolve(
-    XlaOp a, XlaOp b, bool left_side, bool lower, bool transpose_a,
-    bool conjugate_a, int64 block_size = 128,
-    PrecisionConfig::Precision precision = PrecisionConfig::HIGHEST);
-
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_TRIANGULAR_SOLVE_H_
diff --git a/tensorflow/compiler/xla/client/lib/triangular_solve_test.cc b/tensorflow/compiler/xla/client/lib/triangular_solve_test.cc
deleted file mode 100644
index f6a70d64a788d95a456774ccbbcf67f2e5cac98b..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/client/lib/triangular_solve_test.cc
+++ /dev/null
@@ -1,333 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/client/lib/triangular_solve.h"
-
-#include <memory>
-#include <numeric>
-#include <vector>
-
-#include "tensorflow/compiler/xla/array2d.h"
-#include "tensorflow/compiler/xla/client/xla_builder.h"
-#include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
-#include "tensorflow/compiler/xla/tests/literal_test_util.h"
-#include "tensorflow/compiler/xla/tests/test_macros.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/lib/core/status_test_util.h"
-
-namespace xla {
-namespace {
-
-using TriangularSolveTest = xla::ClientLibraryTestBase;
-using TriangularSolveLeftLookingTest = xla::ClientLibraryTestBase;
-using complex64 = xla::complex64;
-
-xla::Array2D<float> AValsLower() {
-  return {{2, 0, 0, 0}, {3, 6, 0, 0}, {4, 7, 9, 0}, {5, 8, 10, 11}};
-}
-
-xla::Array2D<float> AValsUpper() {
-  return {{2, 3, 4, 5}, {0, 6, 7, 8}, {0, 0, 9, 10}, {0, 0, 0, 11}};
-}
-
-xla::Array2D<float> BValsRight() {
-  return {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}};
-}
-
-xla::Array2D<float> BValsLeft() {
-  return {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}, {10, 11, 12}};
-}
-
-xla::Array2D<complex64> AValsLowerComplex() {
-  return {{2, 0, 0, 0},
-          {complex64(3, 1), 6, 0, 0},
-          {4, complex64(7, 2), 9, 0},
-          {5, 8, complex64(10, 3), 11}};
-}
-
-xla::Array2D<complex64> AValsUpperComplex() {
-  return {{2, 3, complex64(4, 3), 5},
-          {0, 6, complex64(7, 2), 8},
-          {0, 0, complex64(9, 1), 10},
-          {0, 0, 0, 11}};
-}
-
-xla::Array2D<complex64> BValsRightComplex() {
-  return {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}};
-}
-
-xla::Array2D<complex64> BValsLeftComplex() {
-  return {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}, {10, 11, 12}};
-}
-
-xla::Array2D<float> AValsFull() {
-  return {{2, 0, 1, 2}, {3, 6, 0, 1}, {4, 7, 9, 0}, {5, 8, 10, 11}};
-}
-
-XLA_TEST_F(TriangularSolveTest, SimpleRightLowerTranspose) {
-  xla::XlaBuilder builder(TestName());
-
-  xla::XlaOp a, b;
-  auto a_data = CreateR2Parameter<float>(AValsLower(), 0, "a", &builder, &a);
-  auto b_data = CreateR2Parameter<float>(BValsRight(), 1, "b", &builder, &b);
-  TriangularSolve(a, b,
-                  /*left_side=*/false, /*lower=*/true,
-                  /*transpose_a=*/true, /*conjugate_a=*/false,
-                  /*block_size=*/2);
-
-  xla::Array2D<float> expected({
-      {0.5, 0.08333334, 0.04629629, 0.03367003},
-      {2.5, -0.25, -0.1388889, -0.1010101},
-      {4.5, -0.58333331, -0.32407406, -0.23569024},
-  });
-
-  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
-                             xla::ErrorSpec(1e-2, 1e-2));
-}
-
-XLA_TEST_F(TriangularSolveTest, SimpleRightLowerNotranspose) {
-  xla::XlaBuilder builder(TestName());
-
-  xla::XlaOp a, b;
-  auto a_data = CreateR2Parameter<float>(AValsLower(), 0, "a", &builder, &a);
-  auto b_data = CreateR2Parameter<float>(BValsRight(), 1, "b", &builder, &b);
-  TriangularSolve(a, b,
-                  /*left_side=*/false, /*lower=*/true,
-                  /*transpose_a=*/false, /*conjugate_a=*/false,
-                  /*block_size=*/2);
-
-  xla::Array2D<float> expected({
-      {-0.16414141, -0.06902357, -0.07070707, 0.36363636},
-      {0.64393939, 0.06565657, -0.03030303, 0.72727273},
-      {1.4520202, 0.2003367, 0.01010101, 1.09090909},
-  });
-
-  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
-                             xla::ErrorSpec(1e-2, 1e-2));
-}
-
-XLA_TEST_F(TriangularSolveTest, SimpleRightUpperTranspose) {
-  xla::XlaBuilder builder(TestName());
-
-  xla::XlaOp a, b;
-  auto a_data = CreateR2Parameter<float>(AValsUpper(), 0, "a", &builder, &a);
-  auto b_data = CreateR2Parameter<float>(BValsRight(), 1, "b", &builder, &b);
-  TriangularSolve(a, b,
-                  /*left_side=*/false, /*lower=*/false,
-                  /*transpose_a=*/true, /*conjugate_a=*/false,
-                  /*block_size=*/2);
-
-  xla::Array2D<float> expected({
-      {-0.16414141, -0.06902357, -0.07070707, 0.36363636},
-      {0.64393939, 0.06565657, -0.03030303, 0.72727273},
-      {1.4520202, 0.2003367, 0.01010101, 1.09090909},
-  });
-
-  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
-                             xla::ErrorSpec(1e-2, 1e-2));
-}
-
-XLA_TEST_F(TriangularSolveTest, SimpleRightUpperNotranspose) {
-  xla::XlaBuilder builder(TestName());
-
-  xla::XlaOp a, b;
-  auto a_data = CreateR2Parameter<float>(AValsUpper(), 0, "a", &builder, &a);
-  auto b_data = CreateR2Parameter<float>(BValsRight(), 1, "b", &builder, &b);
-  TriangularSolve(a, b,
-                  /*left_side=*/false, /*lower=*/false,
-                  /*transpose_a=*/false, /*conjugate_a=*/false,
-                  /*block_size=*/2);
-
-  xla::Array2D<float> expected({
-      {0.5, 0.08333334, 0.04629629, 0.03367003},
-      {2.5, -0.25, -0.1388889, -0.1010101},
-      {4.5, -0.58333331, -0.32407406, -0.23569024},
-  });
-
-  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
-                             xla::ErrorSpec(1e-2, 1e-2));
-}
-
-XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerTranspose) {
-  xla::XlaBuilder builder(TestName());
-
-  xla::XlaOp a, b;
-  auto a_data = CreateR2Parameter<float>(AValsLower(), 0, "a", &builder, &a);
-  auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
-  TriangularSolve(a, b,
-                  /*left_side=*/true, /*lower=*/true,
-                  /*transpose_a=*/true, /*conjugate_a=*/false,
-                  /*block_size=*/2);
-
-  xla::Array2D<float> expected({
-      {-0.89646465, -0.69444444, -0.49242424},
-      {-0.27441077, -0.24074074, -0.20707071},
-      {-0.23232323, -0.22222222, -0.21212121},
-      {0.90909091, 1., 1.09090909},
-  });
-
-  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
-                             xla::ErrorSpec(1e-2, 1e-2));
-}
-
-XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerNotranspose) {
-  xla::XlaBuilder builder(TestName());
-
-  xla::XlaOp a, b;
-  auto a_data = CreateR2Parameter<float>(AValsLower(), 0, "a", &builder, &a);
-  auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
-  TriangularSolve(a, b,
-                  /*left_side=*/true, /*lower=*/true,
-                  /*transpose_a=*/false, /*conjugate_a=*/false,
-                  /*block_size=*/2);
-
-  xla::Array2D<float> expected({
-      {0.5, 1.0, 1.5},
-      {0.41666667, 0.33333333, 0.25},
-      {0.23148148, 0.18518519, 0.13888889},
-      {0.16835017, 0.13468013, 0.1010101},
-  });
-
-  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
-                             xla::ErrorSpec(1e-2, 1e-2));
-}
-
-XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerNotransposeIrregularblock) {
-  xla::XlaBuilder builder(TestName());
-
-  xla::XlaOp a, b;
-  auto a_data = CreateR2Parameter<float>(AValsLower(), 0, "a", &builder, &a);
-  auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
-  TriangularSolve(a, b,
-                  /*left_side=*/true, /*lower=*/true,
-                  /*transpose_a=*/false, /*conjugate_a=*/false,
-                  /*block_size=*/3);
-
-  xla::Array2D<float> expected({
-      {0.5, 1.0, 1.5},
-      {0.41666667, 0.33333333, 0.25},
-      {0.23148148, 0.18518519, 0.13888889},
-      {0.16835017, 0.13468013, 0.1010101},
-  });
-
-  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
-                             xla::ErrorSpec(1e-2, 1e-2));
-}
-
-XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperTranspose) {
-  xla::XlaBuilder builder(TestName());
-
-  xla::XlaOp a, b;
-  auto a_data = CreateR2Parameter<float>(AValsUpper(), 0, "a", &builder, &a);
-  auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
-  TriangularSolve(a, b,
-                  /*left_side=*/true, /*lower=*/false,
-                  /*transpose_a=*/true, /*conjugate_a=*/false,
-                  /*block_size=*/2);
-
-  xla::Array2D<float> expected({
-      {0.5, 1.0, 1.5},
-      {0.41666667, 0.33333333, 0.25},
-      {0.23148148, 0.18518519, 0.13888889},
-      {0.16835017, 0.13468013, 0.1010101},
-  });
-
-  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
-                             xla::ErrorSpec(1e-2, 1e-2));
-}
-
-XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperNotranspose) {
-  xla::XlaBuilder builder(TestName());
-
-  xla::XlaOp a, b;
-  auto a_data = CreateR2Parameter<float>(AValsUpper(), 0, "a", &builder, &a);
-  auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
-  TriangularSolve(a, b,
-                  /*left_side=*/true, /*lower=*/false,
-                  /*transpose_a=*/false, /*conjugate_a=*/false,
-                  /*block_size=*/2);
-
-  xla::Array2D<float> expected({
-      {-0.89646465, -0.69444444, -0.49242424},
-      {-0.27441077, -0.24074074, -0.20707071},
-      {-0.23232323, -0.22222222, -0.21212121},
-      {0.90909091, 1., 1.09090909},
-  });
-
-  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
-                             xla::ErrorSpec(1e-2, 1e-2));
-}
-
-XLA_TEST_F(TriangularSolveTest, SimpleRightLowerTransposeConjugate) {
-  xla::XlaBuilder builder(TestName());
-
-  xla::XlaOp a, b;
-  auto a_data =
-      CreateR2Parameter<complex64>(AValsLowerComplex(), 0, "a", &builder, &a);
-  auto b_data =
-      CreateR2Parameter<complex64>(BValsRightComplex(), 1, "b", &builder, &b);
-  TriangularSolve(a, b,
-                  /*left_side=*/false, /*lower=*/true,
-                  /*transpose_a=*/true, /*conjugate_a=*/true,
-                  /*block_size=*/2);
-
-  xla::Array2D<complex64> expected({
-      {0.5, complex64(0.08333333, 0.08333333),
-       complex64(0.02777778, -0.0462963), complex64(0.06313131, -0.01094276)},
-      {2.5, complex64(-0.25, 0.41666667), complex64(-0.23148148, -0.37962963),
-       complex64(0.08670034, -0.02104377)},
-      {4.5, complex64(-0.58333333, 0.75), complex64(-0.49074074, -0.71296296),
-       complex64(0.11026936, -0.03114478)},
-  });
-
-  ComputeAndCompareR2<complex64>(&builder, expected,
-                                 {a_data.get(), b_data.get()},
-                                 xla::ErrorSpec(1e-2, 1e-2));
-}
-
-XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperTransposeNoconjugate) {
-  xla::XlaBuilder builder(TestName());
-
-  xla::XlaOp a, b;
-  auto a_data =
-      CreateR2Parameter<complex64>(AValsUpperComplex(), 0, "a", &builder, &a);
-  auto b_data =
-      CreateR2Parameter<complex64>(BValsLeftComplex(), 1, "b", &builder, &b);
-  TriangularSolve(a, b,
-                  /*left_side=*/true, /*lower=*/false,
-                  /*transpose_a=*/true, /*conjugate_a=*/false,
-                  /*block_size=*/2);
-
-  xla::Array2D<complex64> expected({
-      {0.5, 1., 1.5},
-      {0.41666667, 0.33333333, 0.25},
-      {complex64(0.20020325, -2.81504065e-01),
-       complex64(0.13821138, -4.22764228e-01),
-       complex64(0.07621951, -5.64024390e-01)},
-      {complex64(0.19678492, 2.55912786e-01),
-       complex64(0.17738359, 3.84331116e-01),
-       complex64(0.15798226, 5.12749446e-01)},
-  });
-
-  ComputeAndCompareR2<complex64>(&builder, expected,
-                                 {a_data.get(), b_data.get()},
-                                 xla::ErrorSpec(1e-2, 1e-2));
-}
-
-}  // namespace
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc
index 049cd15738a619294b19d5cf74ca514d7b4a00ad..48b5f94538f453785194bc434a91ee0a10c020c2 100644
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@@ -164,9 +164,8 @@ StatusOr<ScopedShapedBuffer> LocalExecutable::Run(
   //    ExecutableRunOptions.eigen_intra_op_thread_pool.
   // *) The thread pool used for XLA CPU ops is from
   //    backend_->eigen_intra_op_thread_pool().
-  ServiceExecutableRunOptions service_options(
-      run_options, backend_->StreamBorrower(),
-      backend_->eigen_intra_op_thread_pool());
+  ServiceExecutableRunOptions service_options(run_options,
+                                              backend_->StreamBorrower());
 
   if (executable_->dumping_snapshot()) {
     return ExecuteAndDump(&service_options, arguments);
diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h
index ddb36680e8b185b053368baffa6f1d5cac50dc07..4f4fc8df31c633749ae9b6dafcdc38d4fd1eba40 100644
--- a/tensorflow/compiler/xla/client/local_client.h
+++ b/tensorflow/compiler/xla/client/local_client.h
@@ -114,7 +114,7 @@ class LocalClient : public Client {
   // Build and return a LocalExecutable object. The executable is compiled using
   // the given XlaComputation, argument layouts and options.
   //
-  // The given ExecutableBuildOptions override any values from TF_XLA_FLAGS
+  // The given ExecutableBuildOptions overrides any values from XLA_FLAGS
   // environment variable.
   StatusOr<std::unique_ptr<LocalExecutable>> Compile(
       const XlaComputation& computation,
diff --git a/tensorflow/compiler/xla/client/sharding_builder.cc b/tensorflow/compiler/xla/client/sharding_builder.cc
index fb9ea6ec3fc41d5e04ca125798a8199350470a44..b9bff06cbdbc3525eb19d5df885952c3971d9d6a 100644
--- a/tensorflow/compiler/xla/client/sharding_builder.cc
+++ b/tensorflow/compiler/xla/client/sharding_builder.cc
@@ -50,7 +50,7 @@ OpSharding Tile1D(const Shape& tile_shape, int64 num_tiles) {
   OpSharding result;
   result.set_type(OpSharding::Type::OpSharding_Type_OTHER);
 
-  CHECK_EQ(ShapeUtil::Rank(tile_shape), 1);
+  CHECK_EQ(tile_shape.rank(), 1);
   std::vector<int64> dimensions(1, num_tiles);
   *result.mutable_tile_shape() = tile_shape.ToProto();
   auto& tile_dimension =
diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc
index 60df2ec3959216b0564846ad47c21c5bcc01ea57..16381155c3f875dcd55853ebbe004ae58af1590d 100644
--- a/tensorflow/compiler/xla/client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_builder.cc
@@ -22,6 +22,8 @@ limitations under the License.
 #include <utility>
 
 #include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/match.h"
 #include "absl/strings/str_cat.h"
@@ -29,6 +31,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/sharding_builder.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/execution_options_util.h"
+#include "tensorflow/compiler/xla/service/hlo_input_output_alias_config.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/shape_inference.h"
@@ -192,9 +195,9 @@ StatusOr<ProgramShape> XlaBuilder::GetProgramShape(XlaOp root) const {
 }
 
 void XlaBuilder::IsConstantVisitor(const int64 op_handle,
-                                   std::set<int64>* visited,
+                                   absl::flat_hash_set<int64>* visited,
                                    bool* is_constant) const {
-  if (visited->count(op_handle) != 0 || !*is_constant) {
+  if (visited->contains(op_handle) || !*is_constant) {
     return;
   }
 
@@ -208,11 +211,21 @@ void XlaBuilder::IsConstantVisitor(const int64 op_handle,
       }
       // TODO(b/32495713): We aren't checking the called computations.
       break;
+    case HloOpcode::kGetDimensionSize: {
+      int64 dimension_number = instr.dimensions(0);
+      const HloInstructionProto& operand =
+          *(LookUpInstructionByHandle(instr.operand_ids(0)).ValueOrDie());
+      Shape operand_shape(operand.shape());
+      if (operand_shape.is_dynamic_dimension(dimension_number)) {
+        *is_constant = false;
+      }
+      break;
+    }
 
     // Non functional ops.
     case HloOpcode::kRng:
-    case HloOpcode::kCrossReplicaSum:
-      // TODO(b/33009255): Implmement constant folding for cross replica sum.
+    case HloOpcode::kAllReduce:
+      // TODO(b/33009255): Implement constant folding for cross replica sum.
     case HloOpcode::kInfeed:
     case HloOpcode::kOutfeed:
     case HloOpcode::kCall:
@@ -244,6 +257,29 @@ Status XlaBuilder::SetDynamicBinding(int64 dynamic_size_param_num,
                                      int64 target_param_num,
                                      ShapeIndex target_param_index,
                                      int64 target_dim_num) {
+  bool param_exists = false;
+  for (HloInstructionProto& instr : instructions_) {
+    if (instr.opcode() == HloOpcodeString(HloOpcode::kParameter) &&
+        instr.parameter_number() == target_param_num) {
+      param_exists = true;
+      Shape param_shape(instr.shape());
+      Shape* param_shape_ptr = &param_shape;
+      for (int64 index : target_param_index) {
+        param_shape_ptr = param_shape_ptr->mutable_tuple_shapes(index);
+      }
+      param_shape_ptr->set_dynamic_dimension(target_dim_num,
+                                             /*is_dynamic=*/true);
+      *instr.mutable_shape() = param_shape.ToProto();
+    }
+  }
+
+  if (!param_exists) {
+    return InvalidArgument(
+        "Asked to mark parameter %lld as dynamic sized parameter, but the "
+        "doesn't exists",
+        target_param_num);
+  }
+
   TF_RETURN_IF_ERROR(dynamic_parameter_binding_.Bind(
       DynamicParameterBinding::DynamicParameter{dynamic_size_param_num,
                                                 dynamic_size_param_index},
@@ -263,27 +299,51 @@ XlaComputation XlaBuilder::BuildAndNoteError() {
   return build_status.ConsumeValueOrDie();
 }
 
-StatusOr<XlaComputation> XlaBuilder::Build() {
+Status XlaBuilder::GetCurrentStatus() const {
   if (!first_error_.ok()) {
     string backtrace;
     first_error_backtrace_.Dump(tensorflow::DebugWriteToString, &backtrace);
     return AppendStatus(first_error_, backtrace);
   }
-  return Build(instructions_.back().id());
+  return Status::OK();
+}
+
+StatusOr<XlaComputation> XlaBuilder::Build(bool remove_dynamic_dimensions) {
+  TF_RETURN_IF_ERROR(GetCurrentStatus());
+  return Build(instructions_.back().id(), remove_dynamic_dimensions);
 }
 
-StatusOr<XlaComputation> XlaBuilder::Build(XlaOp root) {
+StatusOr<XlaComputation> XlaBuilder::Build(XlaOp root,
+                                           bool remove_dynamic_dimensions) {
   if (root.builder_ != this) {
     return InvalidArgument("Given root operation is not in this computation.");
   }
-  return Build(root.handle());
-}
+  return Build(root.handle(), remove_dynamic_dimensions);
+}
+
+StatusOr<XlaComputation> XlaBuilder::Build(int64 root_id,
+                                           bool remove_dynamic_dimensions) {
+  TF_RETURN_IF_ERROR(GetCurrentStatus());
+
+  // TODO(b/121223198): XLA backend cannot handle dynamic dimensions yet, remove
+  // all dynamic dimensions before building xla program until we have support in
+  // the backend.
+  if (remove_dynamic_dimensions) {
+    std::function<void(ShapeProto*)> remove_dynamic_dimension =
+        [&](ShapeProto* shape) {
+          if (shape->tuple_shapes_size() != 0) {
+            for (int64 i = 0; i < shape->tuple_shapes_size(); ++i) {
+              remove_dynamic_dimension(shape->mutable_tuple_shapes(i));
+            }
+          }
+          for (int64 i = 0; i < shape->dimensions_size(); ++i) {
+            shape->set_is_dynamic_dimension(i, false);
+          }
+        };
 
-StatusOr<XlaComputation> XlaBuilder::Build(int64 root_id) {
-  if (!first_error_.ok()) {
-    string backtrace;
-    first_error_backtrace_.Dump(tensorflow::DebugWriteToString, &backtrace);
-    return AppendStatus(first_error_, backtrace);
+    for (auto& instruction : instructions_) {
+      remove_dynamic_dimension(instruction.mutable_shape());
+    }
   }
 
   HloComputationProto entry;
@@ -310,7 +370,10 @@ StatusOr<XlaComputation> XlaBuilder::Build(int64 root_id) {
     module->add_computations()->Swap(&e.second);
   }
   module->add_computations()->Swap(&entry);
-
+  if (!input_output_aliases_.empty()) {
+    TF_RETURN_IF_ERROR(
+        PopulateInputOutputAlias(module, program_shape, input_output_aliases_));
+  }
   *(module->mutable_dynamic_parameter_binding()) =
       dynamic_parameter_binding_.ToProto();
 
@@ -323,6 +386,35 @@ StatusOr<XlaComputation> XlaBuilder::Build(int64 root_id) {
   return std::move(computation);
 }
 
+/* static */ Status XlaBuilder::PopulateInputOutputAlias(
+    HloModuleProto* module, const ProgramShape& program_shape,
+    const std::vector<InputOutputAlias>& input_output_aliases) {
+  HloInputOutputAliasConfig config(program_shape.result());
+  for (auto& alias : input_output_aliases) {
+    // The HloInputOutputAliasConfig does not do parameter validation as it only
+    // carries the result shape. Maybe it should be constructed with a
+    // ProgramShape to allow full validation. We will still get an error when
+    // trying to compile the HLO module, but would be better to have validation
+    // at this stage.
+    if (alias.param_number >= program_shape.parameters_size()) {
+      return InvalidArgument("Invalid parameter number %ld (total %ld)",
+                             alias.param_number,
+                             program_shape.parameters_size());
+    }
+    const Shape& parameter_shape = program_shape.parameters(alias.param_number);
+    if (!ShapeUtil::IndexIsValid(parameter_shape, alias.param_index)) {
+      return InvalidArgument("Invalid parameter %ld index: %s",
+                             alias.param_number,
+                             alias.param_index.ToString().c_str());
+    }
+    TF_RETURN_IF_ERROR(config.SetUpAlias(
+        alias.output_index, alias.param_number, alias.param_index,
+        HloInputOutputAliasConfig::AliasKind::kUserAlias));
+  }
+  *module->mutable_input_output_alias() = config.ToProto();
+  return Status::OK();
+}
+
 StatusOr<XlaOp> XlaBuilder::InDimBroadcast(
     const Shape& shape, const XlaOp& operand,
     absl::Span<const int64> broadcast_dimensions) {
@@ -343,7 +435,7 @@ StatusOr<XlaOp> XlaBuilder::AddBroadcastSequence(const Shape& output_shape,
   TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
 
   CHECK(ShapeUtil::IsScalar(operand_shape) ||
-        ShapeUtil::Rank(operand_shape) == ShapeUtil::Rank(output_shape));
+        operand_shape.rank() == output_shape.rank());
   Shape broadcast_shape =
       ShapeUtil::ChangeElementType(output_shape, operand_shape.element_type());
 
@@ -355,7 +447,7 @@ StatusOr<XlaOp> XlaBuilder::AddBroadcastSequence(const Shape& output_shape,
   // Do explicit broadcast for degenerate broadcast.
   std::vector<int64> broadcast_dimensions;
   std::vector<int64> reshaped_dimensions;
-  for (int i = 0; i < ShapeUtil::Rank(operand_shape); i++) {
+  for (int i = 0; i < operand_shape.rank(); i++) {
     if (operand_shape.dimensions(i) == output_shape.dimensions(i)) {
       broadcast_dimensions.push_back(i);
       reshaped_dimensions.push_back(operand_shape.dimensions(i));
@@ -398,8 +490,8 @@ XlaOp XlaBuilder::BinaryOp(HloOpcode binop, const XlaOp& lhs, const XlaOp& rhs,
                             binop, lhs_shape, rhs_shape, broadcast_dimensions));
     *instr.mutable_shape() = shape.ToProto();
 
-    const int64 lhs_rank = ShapeUtil::Rank(lhs_shape);
-    const int64 rhs_rank = ShapeUtil::Rank(rhs_shape);
+    const int64 lhs_rank = lhs_shape.rank();
+    const int64 rhs_rank = rhs_shape.rank();
 
     XlaOp updated_lhs = lhs;
     XlaOp updated_rhs = rhs;
@@ -410,17 +502,19 @@ XlaOp XlaBuilder::BinaryOp(HloOpcode binop, const XlaOp& lhs, const XlaOp& rhs,
       const Shape& from_shape = should_broadcast_lhs ? lhs_shape : rhs_shape;
 
       std::vector<int64> to_size;
-      for (int64 size : shape.dimensions()) {
-        to_size.push_back(size);
+      std::vector<bool> to_size_is_dynamic;
+      for (int i = 0; i < shape.rank(); i++) {
+        to_size.push_back(shape.dimensions(i));
+        to_size_is_dynamic.push_back(shape.is_dynamic_dimension(i));
       }
-      for (int64 from_dim = 0; from_dim < ShapeUtil::Rank(from_shape);
-           from_dim++) {
+      for (int64 from_dim = 0; from_dim < from_shape.rank(); from_dim++) {
         int64 to_dim = broadcast_dimensions[from_dim];
         to_size[to_dim] = from_shape.dimensions(from_dim);
+        to_size_is_dynamic[to_dim] = from_shape.is_dynamic_dimension(from_dim);
       }
 
-      const Shape& broadcasted_shape =
-          ShapeUtil::MakeShape(from_shape.element_type(), to_size);
+      const Shape& broadcasted_shape = ShapeUtil::MakeShape(
+          from_shape.element_type(), to_size, to_size_is_dynamic);
       TF_ASSIGN_OR_RETURN(
           XlaOp broadcasted_operand,
           InDimBroadcast(broadcasted_shape, from, broadcast_dimensions));
@@ -458,18 +552,18 @@ XlaOp XlaBuilder::TernaryOp(HloOpcode triop, const XlaOp& lhs, const XlaOp& rhs,
     XlaOp updated_lhs = lhs;
     XlaOp updated_rhs = rhs;
     XlaOp updated_ehs = ehs;
-    if (!ShapeUtil::IsTuple(shape)) {
-      if (!ShapeUtil::IsTuple(lhs_shape) &&
+    if (!shape.IsTuple()) {
+      if (!lhs_shape.IsTuple() &&
           !ShapeUtil::SameDimensions(shape, lhs_shape)) {
         // lhs is being implicitly broadcasted. Change to explicit.
         TF_ASSIGN_OR_RETURN(updated_lhs, AddBroadcastSequence(shape, lhs));
       }
-      if (!ShapeUtil::IsTuple(rhs_shape) &&
+      if (!rhs_shape.IsTuple() &&
           !ShapeUtil::SameDimensions(shape, rhs_shape)) {
         // rhs is being implicitly broadcasted. Change to explicit.
         TF_ASSIGN_OR_RETURN(updated_rhs, AddBroadcastSequence(shape, rhs));
       }
-      if (!ShapeUtil::IsTuple(ehs_shape) &&
+      if (!ehs_shape.IsTuple() &&
           !ShapeUtil::SameDimensions(shape, ehs_shape)) {
         // ehs is being implicitly broadcasted. Change to explicit.
         TF_ASSIGN_OR_RETURN(updated_ehs, AddBroadcastSequence(shape, ehs));
@@ -480,16 +574,6 @@ XlaOp XlaBuilder::TernaryOp(HloOpcode triop, const XlaOp& lhs, const XlaOp& rhs,
   });
 }
 
-XlaOp XlaBuilder::Add(const XlaOp& lhs, const XlaOp& rhs,
-                      absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kAdd, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Mul(const XlaOp& lhs, const XlaOp& rhs,
-                      absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kMultiply, lhs, rhs, broadcast_dimensions);
-}
-
 XlaOp XlaBuilder::ConstantLiteral(const LiteralSlice& literal) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
@@ -563,10 +647,10 @@ XlaOp XlaBuilder::Broadcast(const XlaOp& operand,
     // output, so to append dimensions on the left the instruction's dimensions
     // should just be the n highest dimension numbers of the output shape where
     // n is the number of input dimensions.
-    const int64 operand_rank = ShapeUtil::Rank(operand_shape);
+    const int64 operand_rank = operand_shape.rank();
     std::vector<int64> dimensions(operand_rank);
     for (int i = 0; i < operand_rank; ++i) {
-      dimensions[i] = i + ShapeUtil::Rank(shape) - operand_rank;
+      dimensions[i] = i + shape.rank() - operand_rank;
     }
     return InDimBroadcast(shape, operand, dimensions);
   });
@@ -579,8 +663,17 @@ XlaOp XlaBuilder::BroadcastInDim(
     TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
     // Output shape, in the case of degenerate broadcast, the out_dim_size is
     // not necessarily the same as the dimension sizes of the output shape.
-    const auto& output_shape =
+    auto output_shape =
         ShapeUtil::MakeShape(operand_shape.element_type(), out_dim_size);
+    for (int i = 0; i < broadcast_dimensions.size(); i++) {
+      if (broadcast_dimensions[i] < 0 ||
+          broadcast_dimensions[i] > out_dim_size.size()) {
+        return InvalidArgument("Broadcast dimension %lld is out of bound",
+                               broadcast_dimensions[i]);
+      }
+      output_shape.set_dynamic_dimension(broadcast_dimensions[i],
+                                         operand_shape.is_dynamic_dimension(i));
+    }
 
     TF_RETURN_IF_ERROR(ShapeInference::InferBroadcastShape(
                            operand_shape, output_shape, broadcast_dimensions)
@@ -639,10 +732,10 @@ XlaOp XlaBuilder::SliceInDim(const XlaOp& operand, int64 start_index,
                              int64 limit_index, int64 stride, int64 dimno) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape& shape, GetShape(operand));
-    std::vector<int64> starts(ShapeUtil::Rank(shape), 0);
+    std::vector<int64> starts(shape.rank(), 0);
     std::vector<int64> limits(shape.dimensions().begin(),
                               shape.dimensions().end());
-    std::vector<int64> strides(ShapeUtil::Rank(shape), 1);
+    std::vector<int64> strides(shape.rank(), 1);
     starts[dimno] = start_index;
     limits[dimno] = limit_index;
     strides[dimno] = stride;
@@ -660,7 +753,7 @@ XlaOp XlaBuilder::DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
                         GetShape(start_indices));
     TF_ASSIGN_OR_RETURN(Shape shape,
                         ShapeInference::InferDynamicSliceShape(
-                            operand_shape, start_indices_shape, slice_sizes));
+                            operand_shape, {start_indices_shape}, slice_sizes));
     *instr.mutable_shape() = shape.ToProto();
 
     for (int64 size : slice_sizes) {
@@ -672,6 +765,34 @@ XlaOp XlaBuilder::DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
   });
 }
 
+XlaOp XlaBuilder::DynamicSlice(const XlaOp& operand,
+                               absl::Span<const XlaOp> start_indices,
+                               absl::Span<const int64> slice_sizes) {
+  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    std::vector<const Shape*> start_indices_shape_ptrs;
+    TF_ASSIGN_OR_RETURN(const auto& start_indices_shapes,
+                        GetOperandShapes(start_indices));
+    absl::c_transform(start_indices_shapes,
+                      std::back_inserter(start_indices_shape_ptrs),
+                      [](const Shape& shape) { return &shape; });
+    TF_ASSIGN_OR_RETURN(Shape shape,
+                        ShapeInference::InferDynamicSliceShape(
+                            operand_shape, start_indices_shapes, slice_sizes));
+    *instr.mutable_shape() = shape.ToProto();
+
+    for (int64 size : slice_sizes) {
+      instr.add_dynamic_slice_sizes(size);
+    }
+
+    std::vector<XlaOp> operands = {operand};
+    operands.insert(operands.end(), start_indices.begin(), start_indices.end());
+    return AddInstruction(std::move(instr), HloOpcode::kDynamicSlice, operands);
+  });
+}
+
 XlaOp XlaBuilder::DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
                                      const XlaOp& start_indices) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
@@ -681,13 +802,38 @@ XlaOp XlaBuilder::DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
     TF_ASSIGN_OR_RETURN(const Shape& update_shape, GetShape(update));
     TF_ASSIGN_OR_RETURN(const Shape& start_indices_shape,
                         GetShape(start_indices));
+    TF_ASSIGN_OR_RETURN(
+        Shape shape, ShapeInference::InferDynamicUpdateSliceShape(
+                         operand_shape, update_shape, {start_indices_shape}));
+    *instr.mutable_shape() = shape.ToProto();
+
+    return AddInstruction(std::move(instr), HloOpcode::kDynamicUpdateSlice,
+                          {operand, update, start_indices});
+  });
+}
+
+XlaOp XlaBuilder::DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
+                                     absl::Span<const XlaOp> start_indices) {
+  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(const Shape& update_shape, GetShape(update));
+    std::vector<const Shape*> start_indices_shape_ptrs;
+    TF_ASSIGN_OR_RETURN(const auto& start_indices_shapes,
+                        GetOperandShapes(start_indices));
+    absl::c_transform(start_indices_shapes,
+                      std::back_inserter(start_indices_shape_ptrs),
+                      [](const Shape& shape) { return &shape; });
     TF_ASSIGN_OR_RETURN(Shape shape,
                         ShapeInference::InferDynamicUpdateSliceShape(
-                            operand_shape, update_shape, start_indices_shape));
+                            operand_shape, update_shape, start_indices_shapes));
     *instr.mutable_shape() = shape.ToProto();
 
+    std::vector<XlaOp> operands = {operand, update};
+    operands.insert(operands.end(), start_indices.begin(), start_indices.end());
     return AddInstruction(std::move(instr), HloOpcode::kDynamicUpdateSlice,
-                          {operand, update, start_indices});
+                          operands);
   });
 }
 
@@ -780,7 +926,7 @@ XlaOp XlaBuilder::Collapse(const XlaOp& operand,
     VLOG(3) << "dims to collapse: " << absl::StrJoin(dimensions, ",");
 
     std::vector<int64> new_sizes;
-    for (int i = 0; i < ShapeUtil::Rank(original_shape); ++i) {
+    for (int i = 0; i < original_shape.rank(); ++i) {
       if (i <= dimensions.front() || i > dimensions.back()) {
         new_sizes.push_back(original_shape.dimensions(i));
       } else {
@@ -808,10 +954,9 @@ XlaOp XlaBuilder::Select(const XlaOp& pred, const XlaOp& on_true,
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape& true_shape, GetShape(on_true));
     TF_ASSIGN_OR_RETURN(const Shape& false_shape, GetShape(on_false));
-    TF_RET_CHECK(ShapeUtil::IsTuple(true_shape) ==
-                 ShapeUtil::IsTuple(false_shape));
-    HloOpcode opcode = ShapeUtil::IsTuple(true_shape) ? HloOpcode::kTupleSelect
-                                                      : HloOpcode::kSelect;
+    TF_RET_CHECK(true_shape.IsTuple() == false_shape.IsTuple());
+    HloOpcode opcode =
+        true_shape.IsTuple() ? HloOpcode::kTupleSelect : HloOpcode::kSelect;
     return TernaryOp(opcode, pred, on_true, on_false);
   });
 }
@@ -835,7 +980,7 @@ XlaOp XlaBuilder::GetTupleElement(const XlaOp& tuple_data, int64 index) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const Shape& tuple_shape, GetShape(tuple_data));
-    if (!ShapeUtil::IsTuple(tuple_shape)) {
+    if (!tuple_shape.IsTuple()) {
       return InvalidArgument(
           "Operand to GetTupleElement() is not a tuple; got %s",
           ShapeUtil::HumanString(tuple_shape));
@@ -850,36 +995,6 @@ XlaOp XlaBuilder::GetTupleElement(const XlaOp& tuple_data, int64 index) {
   });
 }
 
-XlaOp XlaBuilder::Eq(const XlaOp& lhs, const XlaOp& rhs,
-                     absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kEq, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Ne(const XlaOp& lhs, const XlaOp& rhs,
-                     absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kNe, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Ge(const XlaOp& lhs, const XlaOp& rhs,
-                     absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kGe, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Gt(const XlaOp& lhs, const XlaOp& rhs,
-                     absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kGt, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Le(const XlaOp& lhs, const XlaOp& rhs,
-                     absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kLe, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Lt(const XlaOp& lhs, const XlaOp& rhs,
-                     absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kLt, lhs, rhs, broadcast_dimensions);
-}
-
 XlaOp XlaBuilder::Dot(const XlaOp& lhs, const XlaOp& rhs,
                       const PrecisionConfig* precision_config) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
@@ -900,6 +1015,18 @@ XlaOp XlaBuilder::DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs));
     TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs));
+    // If one operand is a scalar, just multiply the two operands.
+    if (ShapeUtil::IsScalar(lhs_shape) || ShapeUtil::IsScalar(rhs_shape)) {
+      if (dimension_numbers.rhs_batch_dimensions_size() != 0 ||
+          dimension_numbers.lhs_batch_dimensions_size() != 0 ||
+          dimension_numbers.rhs_contracting_dimensions_size() != 0 ||
+          dimension_numbers.lhs_contracting_dimensions_size() != 0) {
+        return InvalidArgument(
+            "Dots with scalar operands must have no contracting or batch "
+            "dimensions");
+      }
+      return xla::Mul(lhs, rhs);
+    }
     TF_ASSIGN_OR_RETURN(Shape shape,
                         ShapeInference::InferDotOpShape(lhs_shape, rhs_shape,
                                                         dimension_numbers));
@@ -915,13 +1042,13 @@ XlaOp XlaBuilder::DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
 Status XlaBuilder::VerifyConvolution(
     const Shape& lhs_shape, const Shape& rhs_shape,
     const ConvolutionDimensionNumbers& dimension_numbers) const {
-  if (ShapeUtil::Rank(lhs_shape) != ShapeUtil::Rank(rhs_shape)) {
+  if (lhs_shape.rank() != rhs_shape.rank()) {
     return InvalidArgument(
         "Convolution arguments must have same number of "
         "dimensions. Got: %s and %s",
         ShapeUtil::HumanString(lhs_shape), ShapeUtil::HumanString(rhs_shape));
   }
-  int num_dims = ShapeUtil::Rank(lhs_shape);
+  int num_dims = lhs_shape.rank();
   if (num_dims < 2) {
     return InvalidArgument(
         "Convolution expects argument arrays with >= 3 dimensions. "
@@ -959,27 +1086,29 @@ Status XlaBuilder::VerifyConvolution(
 
 XlaOp XlaBuilder::Conv(const XlaOp& lhs, const XlaOp& rhs,
                        absl::Span<const int64> window_strides, Padding padding,
-                       int64 feature_group_count,
+                       int64 feature_group_count, int64 batch_group_count,
                        const PrecisionConfig* precision_config) {
   return ConvWithGeneralDimensions(
       lhs, rhs, window_strides, padding,
       CreateDefaultConvDimensionNumbers(window_strides.size()),
-      feature_group_count, precision_config);
+      feature_group_count, batch_group_count, precision_config);
 }
 
 XlaOp XlaBuilder::ConvWithGeneralPadding(
     const XlaOp& lhs, const XlaOp& rhs, absl::Span<const int64> window_strides,
     absl::Span<const std::pair<int64, int64>> padding,
-    int64 feature_group_count, const PrecisionConfig* precision_config) {
+    int64 feature_group_count, int64 batch_group_count,
+    const PrecisionConfig* precision_config) {
   return ConvGeneral(lhs, rhs, window_strides, padding,
                      CreateDefaultConvDimensionNumbers(window_strides.size()),
-                     feature_group_count, precision_config);
+                     feature_group_count, batch_group_count, precision_config);
 }
 
 XlaOp XlaBuilder::ConvWithGeneralDimensions(
     const XlaOp& lhs, const XlaOp& rhs, absl::Span<const int64> window_strides,
     Padding padding, const ConvolutionDimensionNumbers& dimension_numbers,
-    int64 feature_group_count, const PrecisionConfig* precision_config) {
+    int64 feature_group_count, int64 batch_group_count,
+    const PrecisionConfig* precision_config) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs));
     TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs));
@@ -1007,7 +1136,7 @@ XlaOp XlaBuilder::ConvWithGeneralDimensions(
                        MakePadding(base_area_dimensions, window_dimensions,
                                    window_strides, padding),
                        dimension_numbers, feature_group_count,
-                       precision_config);
+                       batch_group_count, precision_config);
   });
 }
 
@@ -1015,10 +1144,11 @@ XlaOp XlaBuilder::ConvGeneral(
     const XlaOp& lhs, const XlaOp& rhs, absl::Span<const int64> window_strides,
     absl::Span<const std::pair<int64, int64>> padding,
     const ConvolutionDimensionNumbers& dimension_numbers,
-    int64 feature_group_count, const PrecisionConfig* precision_config) {
+    int64 feature_group_count, int64 batch_group_count,
+    const PrecisionConfig* precision_config) {
   return ConvGeneralDilated(lhs, rhs, window_strides, padding, {}, {},
                             dimension_numbers, feature_group_count,
-                            precision_config);
+                            batch_group_count, precision_config);
 }
 
 XlaOp XlaBuilder::ConvGeneralDilated(
@@ -1026,7 +1156,8 @@ XlaOp XlaBuilder::ConvGeneralDilated(
     absl::Span<const std::pair<int64, int64>> padding,
     absl::Span<const int64> lhs_dilation, absl::Span<const int64> rhs_dilation,
     const ConvolutionDimensionNumbers& dimension_numbers,
-    int64 feature_group_count, const PrecisionConfig* precision_config) {
+    int64 feature_group_count, int64 batch_group_count,
+    const PrecisionConfig* precision_config) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs));
@@ -1045,14 +1176,15 @@ XlaOp XlaBuilder::ConvGeneralDilated(
                         MakeWindow(window_dimensions, window_strides, padding,
                                    lhs_dilation, rhs_dilation));
 
-    TF_ASSIGN_OR_RETURN(Shape shape,
-                        ShapeInference::InferConvolveShape(
-                            lhs_shape, rhs_shape, feature_group_count,
-                            instr.window(), dimension_numbers));
+    TF_ASSIGN_OR_RETURN(
+        Shape shape, ShapeInference::InferConvolveShape(
+                         lhs_shape, rhs_shape, feature_group_count,
+                         batch_group_count, instr.window(), dimension_numbers));
     *instr.mutable_shape() = shape.ToProto();
 
     *instr.mutable_convolution_dimension_numbers() = dimension_numbers;
     instr.set_feature_group_count(feature_group_count);
+    instr.set_batch_group_count(batch_group_count);
 
     if (precision_config != nullptr) {
       *instr.mutable_precision_config() = *precision_config;
@@ -1145,7 +1277,7 @@ XlaOp XlaBuilder::Infeed(const Shape& shape, const string& config) {
     *instr.mutable_shape() = infeed_instruction_shape.ToProto();
     instr.set_infeed_config(config);
 
-    if (ShapeUtil::IsArray(shape) && sharding() &&
+    if (shape.IsArray() && sharding() &&
         sharding()->type() == OpSharding::Type::OpSharding_Type_OTHER) {
       // TODO(b/110793772): Support tiled array-shaped infeeds.
       return InvalidArgument(
@@ -1221,7 +1353,7 @@ XlaOp XlaBuilder::InfeedWithToken(const XlaOp& token, const Shape& shape,
     *instr.mutable_shape() = infeed_instruction_shape.ToProto();
     instr.set_infeed_config(config);
 
-    if (ShapeUtil::IsArray(shape) && sharding() &&
+    if (shape.IsArray() && sharding() &&
         sharding()->type() == OpSharding::Type::OpSharding_Type_OTHER) {
       // TODO(b/110793772): Support tiled array-shaped infeeds.
       return InvalidArgument(
@@ -1334,7 +1466,7 @@ XlaOp XlaBuilder::AfterAll(absl::Span<const XlaOp> tokens) {
     for (int i = 0; i < tokens.size(); ++i) {
       const XlaOp& operand = tokens[i];
       TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
-      if (!ShapeUtil::IsToken(operand_shape)) {
+      if (!operand_shape.IsToken()) {
         return InvalidArgument(
             "All operands to AfterAll must be tokens; operand %d has shape %s",
             i, ShapeUtil::HumanString(operand_shape));
@@ -1390,147 +1522,6 @@ XlaOp XlaBuilder::CustomCall(
   });
 }
 
-XlaOp XlaBuilder::Complex(const XlaOp& real, const XlaOp& imag,
-                          absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kComplex, real, imag, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Conj(const XlaOp& operand) {
-  return Complex(Real(operand), Neg(Imag(operand)));
-}
-
-XlaOp XlaBuilder::Sub(const XlaOp& lhs, const XlaOp& rhs,
-                      absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kSubtract, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Div(const XlaOp& lhs, const XlaOp& rhs,
-                      absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kDivide, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Rem(const XlaOp& lhs, const XlaOp& rhs,
-                      absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kRemainder, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Max(const XlaOp& lhs, const XlaOp& rhs,
-                      absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kMaximum, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Min(const XlaOp& lhs, const XlaOp& rhs,
-                      absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kMinimum, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::And(const XlaOp& lhs, const XlaOp& rhs,
-                      absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kAnd, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Or(const XlaOp& lhs, const XlaOp& rhs,
-                     absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kOr, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Xor(const XlaOp& lhs, const XlaOp& rhs,
-                      absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kXor, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Not(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kNot, operand);
-}
-
-XlaOp XlaBuilder::ShiftLeft(const XlaOp& lhs, const XlaOp& rhs,
-                            absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kShiftLeft, lhs, rhs, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::ShiftRightArithmetic(
-    const XlaOp& lhs, const XlaOp& rhs,
-    absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kShiftRightArithmetic, lhs, rhs,
-                  broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::ShiftRightLogical(
-    const XlaOp& lhs, const XlaOp& rhs,
-    absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kShiftRightLogical, lhs, rhs,
-                  broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Abs(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kAbs, operand);
-}
-
-XlaOp XlaBuilder::Atan2(const XlaOp& y, const XlaOp& x,
-                        absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kAtan2, y, x, broadcast_dimensions);
-}
-
-XlaOp XlaBuilder::Exp(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kExp, operand);
-}
-
-XlaOp XlaBuilder::Expm1(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kExpm1, operand);
-}
-
-XlaOp XlaBuilder::Floor(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kFloor, operand);
-}
-
-XlaOp XlaBuilder::Ceil(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kCeil, operand);
-}
-
-XlaOp XlaBuilder::Round(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kRoundNearestAfz, operand);
-}
-
-XlaOp XlaBuilder::Log(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kLog, operand);
-}
-
-XlaOp XlaBuilder::Log1p(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kLog1p, operand);
-}
-
-XlaOp XlaBuilder::Sign(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kSign, operand);
-}
-
-XlaOp XlaBuilder::Clz(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kClz, operand);
-}
-
-XlaOp XlaBuilder::Cos(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kCos, operand);
-}
-
-XlaOp XlaBuilder::Sin(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kSin, operand);
-}
-
-XlaOp XlaBuilder::Tanh(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kTanh, operand);
-}
-
-XlaOp XlaBuilder::Real(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kReal, operand);
-}
-
-XlaOp XlaBuilder::Imag(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kImag, operand);
-}
-
-XlaOp XlaBuilder::IsFinite(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kIsFinite, operand);
-}
-
 XlaOp XlaBuilder::Transpose(const XlaOp& operand,
                             absl::Span<const int64> permutation) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
@@ -1561,36 +1552,146 @@ XlaOp XlaBuilder::Rev(const XlaOp& operand,
   });
 }
 
+namespace {
+// Switch from a floating point value to a integer value in such a way that when
+// using the integer value to compare, we get the same result for normal values,
+// and -Nan is treated as the smallest value, and Nan is treated as the largest
+// value.
+// If f is a float, and
+// x = bit_cast<int32>(f);
+// y = x < 0 ? numeric_limits<int32>::max() - x : x;
+// then y is ordered as an int32 such that finite values have the obvious order,
+// -0 is ordered before 0, and -NaN and NaN appear at the beginning and end of
+// the ordering.
+// Note that in order to avoid -x to overflow, we calculate
+// numeric_limits<int32>::max() - x as unsigned, and then convert back to
+// signed.
+XlaOp BitcastConvertFloatingPointToIntegral(const XlaOp& value,
+                                            int64 bit_width) {
+  PrimitiveType signed_type;
+  PrimitiveType unsigned_type;
+  XlaOp max_value;
+  switch (bit_width) {
+    case 16:
+      max_value =
+          ConstantR0(value.builder(),
+                     static_cast<uint16>(std::numeric_limits<int16>::max()));
+      signed_type = S16;
+      unsigned_type = U16;
+      break;
+    case 32:
+      max_value =
+          ConstantR0(value.builder(),
+                     static_cast<uint32>(std::numeric_limits<int32>::max()));
+      signed_type = S32;
+      unsigned_type = U32;
+      break;
+    case 64:
+      max_value =
+          ConstantR0(value.builder(),
+                     static_cast<uint64>(std::numeric_limits<int64>::max()));
+      signed_type = S64;
+      unsigned_type = U64;
+      break;
+    default:
+      return value.builder()->ReportError(
+          InvalidArgument("Invalid bit width %lld for Comparator floating "
+                          "point parameter.",
+                          bit_width));
+  }
+  auto signed_value = BitcastConvertType(value, signed_type);
+  auto unsigned_value = BitcastConvertType(value, unsigned_type);
+  auto flipped_value =
+      BitcastConvertType(Sub(max_value, unsigned_value), signed_type);
+  auto is_negative =
+      Lt(signed_value,
+         ConstantLiteral(value.builder(), LiteralUtil::Zero(signed_type)));
+  return Select(is_negative, flipped_value, signed_value);
+}
+}  // namespace
+
 XlaOp XlaBuilder::Sort(const XlaOp& keys, absl::Span<const XlaOp> values,
                        int64 dimension) {
+  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    std::vector<XlaOp> operands{keys};
+    for (const XlaOp& value : values) {
+      operands.push_back(value);
+    }
+    // Build the default less-than comparator (copied from lib/comparators.cc).
+    // TODO(b/122298745): Remove the deprecated API method so that this code
+    // duplication can be deleted.
+    auto b = this->CreateSubBuilder("comparator");
+    std::vector<PrimitiveType> operand_types;
+    for (const XlaOp& operand : operands) {
+      TF_ASSIGN_OR_RETURN(auto operand_shape, GetShape(operand));
+      operand_types.push_back(operand_shape.element_type());
+    }
+
+    int64 parameter_count = 0;
+    XlaOp first_lhs_param;
+    XlaOp first_rhs_param;
+
+    for (auto operand_type : operand_types) {
+      auto scalar_shape = ShapeUtil::MakeShape(operand_type, {});
+      auto lhs_param =
+          b->Parameter(parameter_count * 2, scalar_shape,
+                       absl::StrCat("p.", parameter_count, ".lhs"));
+      auto rhs_param =
+          b->Parameter(parameter_count * 2 + 1, scalar_shape,
+                       absl::StrCat("p.", parameter_count, ".rhs"));
+      if (parameter_count == 0) {
+        first_lhs_param = lhs_param;
+        first_rhs_param = rhs_param;
+      }
+      ++parameter_count;
+    }
+    if (primitive_util::IsFloatingPointType(operand_types[0])) {
+      PrimitiveType compare_type = operand_types[0];
+      // Special-case handling for BF16. We currently do not support direct
+      // comparisons with BF16, so we convert to F32 and then use the F32
+      // comparison logic.
+      if (compare_type == BF16) {
+        compare_type = F32;
+        first_lhs_param = b->ConvertElementType(first_lhs_param, F32);
+        first_rhs_param = b->ConvertElementType(first_rhs_param, F32);
+      }
+      int64 bit_width = primitive_util::BitWidth(compare_type);
+      first_lhs_param =
+          BitcastConvertFloatingPointToIntegral(first_lhs_param, bit_width);
+      first_rhs_param =
+          BitcastConvertFloatingPointToIntegral(first_rhs_param, bit_width);
+    }
+    Lt(first_lhs_param, first_rhs_param);
+
+    TF_ASSIGN_OR_RETURN(auto comparator, b->Build());
+    return Sort(operands, comparator, dimension, /*is_stable=*/false);
+  });
+}
+
+XlaOp XlaBuilder::Sort(absl::Span<const XlaOp> operands,
+                       const XlaComputation& comparator, int64 dimension,
+                       bool is_stable) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
+    instr.set_is_stable(is_stable);
     std::vector<const Shape*> operand_shape_ptrs;
-    TF_ASSIGN_OR_RETURN(const Shape& keys_shape, GetShape(keys));
-    operand_shape_ptrs.push_back(&keys_shape);
-    TF_ASSIGN_OR_RETURN(std::vector<Shape> values_shapes,
-                        GetOperandShapes(values));
-    absl::c_transform(values_shapes, std::back_inserter(operand_shape_ptrs),
+    TF_ASSIGN_OR_RETURN(std::vector<Shape> operand_shapes,
+                        GetOperandShapes(operands));
+    absl::c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs),
                       [](const Shape& shape) { return &shape; });
     TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferVariadicOpShape(
                                          HloOpcode::kSort, operand_shape_ptrs));
     *instr.mutable_shape() = shape.ToProto();
     if (dimension == -1) {
-      TF_ASSIGN_OR_RETURN(const Shape& keys_shape, GetShape(keys));
-      dimension = ShapeUtil::Rank(keys_shape) - 1;
+      TF_ASSIGN_OR_RETURN(const Shape& keys_shape, GetShape(operands[0]));
+      dimension = keys_shape.rank() - 1;
     }
     instr.add_dimensions(dimension);
-    std::vector<XlaOp> operands{keys};
-    operands.insert(operands.end(), values.begin(), values.end());
+    AddCalledComputation(comparator, &instr);
     return AddInstruction(std::move(instr), HloOpcode::kSort, operands);
   });
 }
 
-XlaOp XlaBuilder::Pow(const XlaOp& lhs, const XlaOp& rhs,
-                      absl::Span<const int64> broadcast_dimensions) {
-  return BinaryOp(HloOpcode::kPower, lhs, rhs, broadcast_dimensions);
-}
-
 XlaOp XlaBuilder::ConvertElementType(const XlaOp& operand,
                                      PrimitiveType new_element_type) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
@@ -1616,10 +1717,6 @@ XlaOp XlaBuilder::BitcastConvertType(const XlaOp& operand,
   });
 }
 
-XlaOp XlaBuilder::Neg(const XlaOp& operand) {
-  return UnaryOp(HloOpcode::kNegate, operand);
-}
-
 XlaOp XlaBuilder::Clamp(const XlaOp& min, const XlaOp& operand,
                         const XlaOp& max) {
   return TernaryOp(HloOpcode::kClamp, min, operand, max);
@@ -1647,12 +1744,12 @@ XlaOp XlaBuilder::Map(absl::Span<const XlaOp> operands,
     *instr.mutable_shape() = shape.ToProto();
 
     Shape output_shape(instr.shape());
-    const int64 output_rank = ShapeUtil::Rank(output_shape);
+    const int64 output_rank = output_shape.rank();
     AddCalledComputation(computation, &instr);
     std::vector<XlaOp> new_operands(operands.begin(), operands.end());
     for (XlaOp& new_operand : new_operands) {
       TF_ASSIGN_OR_RETURN(Shape shape, GetShape(new_operand));
-      const int64 rank = ShapeUtil::Rank(shape);
+      const int64 rank = shape.rank();
       if (rank != output_rank) {
         TF_ASSIGN_OR_RETURN(new_operand,
                             InDimBroadcast(output_shape, new_operand, {}));
@@ -1861,7 +1958,7 @@ XlaOp XlaBuilder::ReduceAll(const XlaOp& operand, const XlaOp& init_value,
                             const XlaComputation& computation) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
-    std::vector<int64> all_dimnos(ShapeUtil::Rank(operand_shape));
+    std::vector<int64> all_dimnos(operand_shape.rank());
     std::iota(all_dimnos.begin(), all_dimnos.end(), 0);
     return Reduce(operand, init_value, computation, all_dimnos);
   });
@@ -2000,8 +2097,8 @@ XlaOp XlaBuilder::CrossReplicaSum(
     TF_ASSIGN_OR_RETURN(const Shape& shape, GetShape(operand));
     const Shape& scalar_shape = ShapeUtil::MakeShape(shape.element_type(), {});
     auto b = CreateSubBuilder("sum");
-    b->Add(b->Parameter(/*parameter_number=*/0, scalar_shape, "x"),
-           b->Parameter(/*parameter_number=*/1, scalar_shape, "y"));
+    Add(b->Parameter(/*parameter_number=*/0, scalar_shape, "x"),
+        b->Parameter(/*parameter_number=*/1, scalar_shape, "y"));
     TF_ASSIGN_OR_RETURN(auto computation, b->Build());
     return CrossReplicaSum(operand, computation, replica_groups,
                            /*channel_id=*/absl::nullopt);
@@ -2015,8 +2112,8 @@ XlaOp XlaBuilder::CrossReplicaSum(
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
-    TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferCrossReplicaSumShape(
-                                         {&operand_shape}));
+    TF_ASSIGN_OR_RETURN(Shape shape,
+                        ShapeInference::InferAllReduceShape({&operand_shape}));
     *instr.mutable_shape() = shape.ToProto();
 
     for (const ReplicaGroup& group : replica_groups) {
@@ -2029,8 +2126,7 @@ XlaOp XlaBuilder::CrossReplicaSum(
 
     AddCalledComputation(computation, &instr);
 
-    return AddInstruction(std::move(instr), HloOpcode::kCrossReplicaSum,
-                          {operand});
+    return AddInstruction(std::move(instr), HloOpcode::kAllReduce, {operand});
   });
 }
 
@@ -2111,6 +2207,14 @@ XlaOp XlaBuilder::CollectivePermute(
   });
 }
 
+XlaOp XlaBuilder::ReplicaId() {
+  return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    *instr.mutable_shape() = ShapeUtil::MakeShape(U32, {}).ToProto();
+    return AddInstruction(std::move(instr), HloOpcode::kReplicaId, {});
+  });
+}
+
 XlaOp XlaBuilder::SelectAndScatter(const XlaOp& operand,
                                    const XlaComputation& select,
                                    absl::Span<const int64> window_dimensions,
@@ -2288,7 +2392,7 @@ XlaOp XlaBuilder::SendToHost(const XlaOp& operand, const XlaOp& token,
           ShapeUtil::HumanStringWithLayout(operand_shape));
     }
     // TODO(b/111544877): Support tuple shapes.
-    if (!ShapeUtil::IsArray(operand_shape)) {
+    if (!operand_shape.IsArray()) {
       return InvalidArgument("SendToHost only supports array shapes, shape: %s",
                              ShapeUtil::HumanString(operand_shape));
     }
@@ -2328,7 +2432,7 @@ XlaOp XlaBuilder::RecvFromHost(const XlaOp& token, const Shape& shape,
     }
 
     // TODO(b/111544877): Support tuple shapes.
-    if (!ShapeUtil::IsArray(shape)) {
+    if (!shape.IsArray()) {
       return InvalidArgument(
           "RecvFromHost only supports array shapes, shape: %s",
           ShapeUtil::HumanString(shape));
@@ -2381,7 +2485,7 @@ StatusOr<bool> XlaBuilder::IsConstant(const XlaOp& operand) const {
   TF_RETURN_IF_ERROR(LookUpInstruction(operand).status());
 
   bool is_constant = true;
-  std::set<int64> visited;
+  absl::flat_hash_set<int64> visited;
   IsConstantVisitor(operand.handle(), &visited, &is_constant);
   return is_constant;
 }
@@ -2428,21 +2532,58 @@ StatusOr<XlaComputation> XlaBuilder::BuildConstantSubGraph(
     worklist.pop();
     TF_ASSIGN_OR_RETURN(const HloInstructionProto* instr_proto,
                         LookUpInstructionByHandle(handle));
-    for (int64 id : instr_proto->operand_ids()) {
-      if (related_ops.insert(id).second) {
-        worklist.push(id);
+
+    if (instr_proto->opcode() ==
+        HloOpcodeString(HloOpcode::kGetDimensionSize)) {
+      // At this point, BuildConstantSubGraph should never encounter a
+      // GetDimensionSize with a dynamic dimension. IsConstant check would have
+      // failed at the beginning of this function.
+      //
+      // Replace GetDimensionSize with a Constant representing the static bound
+      // of the shape.
+      int64 dimension = instr_proto->dimensions(0);
+      int64 operand_handle = instr_proto->operand_ids(0);
+      TF_ASSIGN_OR_RETURN(const HloInstructionProto* operand_proto,
+                          LookUpInstructionByHandle(operand_handle));
+
+      TF_RET_CHECK(!operand_proto->shape().is_dynamic_dimension(dimension));
+      auto constant_dimension_size =
+          static_cast<uint32>(operand_proto->shape().dimensions(dimension));
+
+      Literal literal = LiteralUtil::CreateR0(constant_dimension_size);
+
+      HloInstructionProto const_instr;
+      *const_instr.mutable_shape() = literal.shape().ToProto();
+      *const_instr.mutable_literal() = literal.ToProto();
+      *const_instr.mutable_opcode() = HloOpcodeString(HloOpcode::kConstant);
+
+      const_instr.set_id(handle);
+      *const_instr.mutable_name() =
+          GetFullName(const_instr.opcode(), kNameSeparator, const_instr.id());
+      *entry.add_instructions() =
+          const_instr;  // Add to the result constant graph.
+    } else {
+      for (int64 id : instr_proto->operand_ids()) {
+        if (related_ops.insert(id).second) {
+          worklist.push(id);
+        }
+      }
+      for (int64 called_id : instr_proto->called_computation_ids()) {
+        related_calls.insert(called_id);
       }
-    }
-    for (int64 called_id : instr_proto->called_computation_ids()) {
-      related_calls.insert(called_id);
     }
   }
 
   // Add related ops to the computation.
   for (int64 id : related_ops) {
-    auto* instr = entry.add_instructions();
     TF_ASSIGN_OR_RETURN(const HloInstructionProto* instr_src,
                         LookUpInstructionByHandle(id));
+
+    if (instr_src->opcode() == HloOpcodeString(HloOpcode::kGetDimensionSize)) {
+      continue;
+    }
+    auto* instr = entry.add_instructions();
+
     *instr = *instr_src;
     // Ensures that the instruction names are unique among the graph.
     const string& new_name =
@@ -2715,12 +2856,21 @@ XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
                    absl::Span<const int64> slice_sizes) {
   return operand.builder()->DynamicSlice(operand, start_indices, slice_sizes);
 }
+XlaOp DynamicSlice(const XlaOp& operand, absl::Span<const XlaOp> start_indices,
+                   absl::Span<const int64> slice_sizes) {
+  return operand.builder()->DynamicSlice(operand, start_indices, slice_sizes);
+}
 
 XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
                          const XlaOp& start_indices) {
   return operand.builder()->DynamicUpdateSlice(operand, update, start_indices);
 }
 
+XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
+                         absl::Span<const XlaOp> start_indices) {
+  return operand.builder()->DynamicUpdateSlice(operand, update, start_indices);
+}
+
 XlaOp ConcatInDim(XlaBuilder* builder, absl::Span<const XlaOp> operands,
                   int64 dimension) {
   return builder->ConcatInDim(operands, dimension);
@@ -2744,32 +2894,38 @@ XlaOp GetTupleElement(const XlaOp& tuple_data, int64 index) {
 
 XlaOp Eq(const XlaOp& lhs, const XlaOp& rhs,
          absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Eq(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kEq, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp Ne(const XlaOp& lhs, const XlaOp& rhs,
          absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Ne(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kNe, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp Ge(const XlaOp& lhs, const XlaOp& rhs,
          absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Ge(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kGe, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp Gt(const XlaOp& lhs, const XlaOp& rhs,
          absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Gt(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kGt, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
-XlaOp Lt(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Le(const XlaOp& lhs, const XlaOp& rhs,
          absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Lt(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kLe, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
-XlaOp Le(const XlaOp& lhs, const XlaOp& rhs,
+XlaOp Lt(const XlaOp& lhs, const XlaOp& rhs,
          absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Le(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kLt, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs,
@@ -2786,38 +2942,42 @@ XlaOp DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
 
 XlaOp Conv(const XlaOp& lhs, const XlaOp& rhs,
            absl::Span<const int64> window_strides, Padding padding,
-           int64 feature_group_count, const PrecisionConfig* precision_config) {
+           int64 feature_group_count, int64 batch_group_count,
+           const PrecisionConfig* precision_config) {
   return lhs.builder()->Conv(lhs, rhs, window_strides, padding,
-                             feature_group_count, precision_config);
+                             feature_group_count, batch_group_count,
+                             precision_config);
 }
 
 XlaOp ConvWithGeneralPadding(const XlaOp& lhs, const XlaOp& rhs,
                              absl::Span<const int64> window_strides,
                              absl::Span<const std::pair<int64, int64>> padding,
-                             int64 feature_group_count,
+                             int64 feature_group_count, int64 batch_group_count,
                              const PrecisionConfig* precision_config) {
   return lhs.builder()->ConvWithGeneralPadding(
-      lhs, rhs, window_strides, padding, feature_group_count, precision_config);
+      lhs, rhs, window_strides, padding, feature_group_count, batch_group_count,
+      precision_config);
 }
 
 XlaOp ConvWithGeneralDimensions(
     const XlaOp& lhs, const XlaOp& rhs, absl::Span<const int64> window_strides,
     Padding padding, const ConvolutionDimensionNumbers& dimension_numbers,
-    int64 feature_group_count, const PrecisionConfig* precision_config) {
+    int64 feature_group_count, int64 batch_group_count,
+    const PrecisionConfig* precision_config) {
   return lhs.builder()->ConvWithGeneralDimensions(
       lhs, rhs, window_strides, padding, dimension_numbers, feature_group_count,
-      precision_config);
+      batch_group_count, precision_config);
 }
 
 XlaOp ConvGeneral(const XlaOp& lhs, const XlaOp& rhs,
                   absl::Span<const int64> window_strides,
                   absl::Span<const std::pair<int64, int64>> padding,
                   const ConvolutionDimensionNumbers& dimension_numbers,
-                  int64 feature_group_count,
+                  int64 feature_group_count, int64 batch_group_count,
                   const PrecisionConfig* precision_config) {
   return lhs.builder()->ConvGeneral(lhs, rhs, window_strides, padding,
                                     dimension_numbers, feature_group_count,
-                                    precision_config);
+                                    batch_group_count, precision_config);
 }
 
 XlaOp ConvGeneralDilated(const XlaOp& lhs, const XlaOp& rhs,
@@ -2826,11 +2986,12 @@ XlaOp ConvGeneralDilated(const XlaOp& lhs, const XlaOp& rhs,
                          absl::Span<const int64> lhs_dilation,
                          absl::Span<const int64> rhs_dilation,
                          const ConvolutionDimensionNumbers& dimension_numbers,
-                         int64 feature_group_count,
+                         int64 feature_group_count, int64 batch_group_count,
                          const PrecisionConfig* precision_config) {
   return lhs.builder()->ConvGeneralDilated(
       lhs, rhs, window_strides, padding, lhs_dilation, rhs_dilation,
-      dimension_numbers, feature_group_count, precision_config);
+      dimension_numbers, feature_group_count, batch_group_count,
+      precision_config);
 }
 
 XlaOp Fft(const XlaOp& operand, FftType fft_type,
@@ -2838,6 +2999,29 @@ XlaOp Fft(const XlaOp& operand, FftType fft_type,
   return operand.builder()->Fft(operand, fft_type, fft_length);
 }
 
+XlaOp TriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower,
+                      bool unit_diagonal,
+                      TriangularSolveOptions::Transpose transpose_a) {
+  XlaBuilder* builder = a.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& a_shape, builder->GetShape(a));
+    TF_ASSIGN_OR_RETURN(const Shape& b_shape, builder->GetShape(b));
+    xla::TriangularSolveOptions& options =
+        *instr.mutable_triangular_solve_options();
+    options.set_left_side(left_side);
+    options.set_lower(lower);
+    options.set_unit_diagonal(unit_diagonal);
+    options.set_transpose_a(transpose_a);
+    TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferTriangularSolveShape(
+                                         a_shape, b_shape, options));
+    *instr.mutable_shape() = shape.ToProto();
+
+    return builder->AddInstruction(std::move(instr),
+                                   HloOpcode::kTriangularSolve, {a, b});
+  });
+}
+
 XlaOp Infeed(XlaBuilder* builder, const Shape& shape, const string& config) {
   return builder->Infeed(shape, config);
 }
@@ -2867,78 +3051,96 @@ XlaOp CustomCallWithLayout(XlaBuilder* builder, const string& call_target_name,
                              operand_shapes_with_layout);
 }
 
-XlaOp Complex(const XlaOp& real, const XlaOp& imag,
+XlaOp Complex(const XlaOp& lhs, const XlaOp& rhs,
               absl::Span<const int64> broadcast_dimensions) {
-  return real.builder()->Complex(real, imag, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kComplex, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
-XlaOp Conj(const XlaOp& operand) { return operand.builder()->Conj(operand); }
+XlaOp Conj(const XlaOp& operand) {
+  return Complex(Real(operand), Neg(Imag(operand)));
+}
 
 XlaOp Add(const XlaOp& lhs, const XlaOp& rhs,
           absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Add(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kAdd, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp Sub(const XlaOp& lhs, const XlaOp& rhs,
           absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Sub(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kSubtract, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp Mul(const XlaOp& lhs, const XlaOp& rhs,
           absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Mul(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kMultiply, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp Div(const XlaOp& lhs, const XlaOp& rhs,
           absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Div(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kDivide, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp Rem(const XlaOp& lhs, const XlaOp& rhs,
           absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Rem(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kRemainder, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp Max(const XlaOp& lhs, const XlaOp& rhs,
           absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Max(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kMaximum, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp Min(const XlaOp& lhs, const XlaOp& rhs,
           absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Min(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kMinimum, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp And(const XlaOp& lhs, const XlaOp& rhs,
           absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->And(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kAnd, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp Or(const XlaOp& lhs, const XlaOp& rhs,
          absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Or(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kOr, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp Xor(const XlaOp& lhs, const XlaOp& rhs,
           absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Xor(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kXor, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
-XlaOp Not(const XlaOp& operand) { return operand.builder()->Not(operand); }
+XlaOp Not(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kNot, operand);
+}
 
 XlaOp ShiftLeft(const XlaOp& lhs, const XlaOp& rhs,
                 absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->ShiftLeft(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kShiftLeft, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp ShiftRightArithmetic(const XlaOp& lhs, const XlaOp& rhs,
                            absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->ShiftRightArithmetic(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kShiftRightArithmetic, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp ShiftRightLogical(const XlaOp& lhs, const XlaOp& rhs,
                         absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->ShiftRightLogical(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kShiftRightLogical, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp Reduce(const XlaOp& operand, const XlaOp& init_value,
@@ -3010,6 +3212,8 @@ XlaOp CollectivePermute(
   return operand.builder()->CollectivePermute(operand, source_target_pairs);
 }
 
+XlaOp ReplicaId(XlaBuilder* builder) { return builder->ReplicaId(); }
+
 XlaOp SelectAndScatter(const XlaOp& operand, const XlaComputation& select,
                        absl::Span<const int64> window_dimensions,
                        absl::Span<const int64> window_strides, Padding padding,
@@ -3031,48 +3235,73 @@ XlaOp SelectAndScatterWithGeneralPadding(
       init_value, scatter);
 }
 
-XlaOp Abs(const XlaOp& operand) { return operand.builder()->Abs(operand); }
+XlaOp Abs(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kAbs, operand);
+}
 
-XlaOp Atan2(const XlaOp& y, const XlaOp& x,
+XlaOp Atan2(const XlaOp& lhs, const XlaOp& rhs,
             absl::Span<const int64> broadcast_dimensions) {
-  return y.builder()->Atan2(y, x, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kAtan2, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
-XlaOp Exp(const XlaOp& operand) { return operand.builder()->Exp(operand); }
-
-XlaOp Expm1(const XlaOp& operand) { return operand.builder()->Expm1(operand); }
-
-XlaOp Floor(const XlaOp& operand) { return operand.builder()->Floor(operand); }
-
-XlaOp Ceil(const XlaOp& operand) { return operand.builder()->Ceil(operand); }
-
-XlaOp Round(const XlaOp& operand) { return operand.builder()->Round(operand); }
-
-XlaOp Log(const XlaOp& operand) { return operand.builder()->Log(operand); }
-
-XlaOp Log1p(const XlaOp& operand) { return operand.builder()->Log1p(operand); }
-
-XlaOp Sign(const XlaOp& operand) { return operand.builder()->Sign(operand); }
-
-XlaOp Clz(const XlaOp& operand) { return operand.builder()->Clz(operand); }
-
-XlaOp Cos(const XlaOp& operand) { return operand.builder()->Cos(operand); }
-
-XlaOp Sin(const XlaOp& operand) { return operand.builder()->Sin(operand); }
-
-XlaOp Tanh(const XlaOp& operand) { return operand.builder()->Tanh(operand); }
-
-XlaOp Real(const XlaOp& operand) { return operand.builder()->Real(operand); }
-
-XlaOp Imag(const XlaOp& operand) { return operand.builder()->Imag(operand); }
+XlaOp Exp(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kExp, operand);
+}
+XlaOp Expm1(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kExpm1, operand);
+}
+XlaOp Floor(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kFloor, operand);
+}
+XlaOp Ceil(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kCeil, operand);
+}
+XlaOp Round(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kRoundNearestAfz, operand);
+}
+XlaOp Log(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kLog, operand);
+}
+XlaOp Log1p(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kLog1p, operand);
+}
+XlaOp Sign(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kSign, operand);
+}
+XlaOp Clz(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kClz, operand);
+}
+XlaOp Cos(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kCos, operand);
+}
+XlaOp Sin(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kSin, operand);
+}
+XlaOp Tanh(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kTanh, operand);
+}
+XlaOp Real(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kReal, operand);
+}
+XlaOp Imag(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kImag, operand);
+}
+XlaOp Sqrt(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kSqrt, operand);
+}
+XlaOp Rsqrt(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kRsqrt, operand);
+}
 
 XlaOp Pow(const XlaOp& lhs, const XlaOp& rhs,
           absl::Span<const int64> broadcast_dimensions) {
-  return lhs.builder()->Pow(lhs, rhs, broadcast_dimensions);
+  return lhs.builder()->BinaryOp(HloOpcode::kPower, lhs, rhs,
+                                 broadcast_dimensions);
 }
 
 XlaOp IsFinite(const XlaOp& operand) {
-  return operand.builder()->IsFinite(operand);
+  return operand.builder()->UnaryOp(HloOpcode::kIsFinite, operand);
 }
 
 XlaOp ConvertElementType(const XlaOp& operand, PrimitiveType new_element_type) {
@@ -3083,7 +3312,9 @@ XlaOp BitcastConvertType(const XlaOp& operand, PrimitiveType new_element_type) {
   return operand.builder()->BitcastConvertType(operand, new_element_type);
 }
 
-XlaOp Neg(const XlaOp& operand) { return operand.builder()->Neg(operand); }
+XlaOp Neg(const XlaOp& operand) {
+  return operand.builder()->UnaryOp(HloOpcode::kNegate, operand);
+}
 
 XlaOp Transpose(const XlaOp& operand, absl::Span<const int64> permutation) {
   return operand.builder()->Transpose(operand, permutation);
@@ -3097,6 +3328,12 @@ XlaOp Sort(const XlaOp& keys, absl::Span<const XlaOp> values, int64 dimension) {
   return keys.builder()->Sort(keys, values, dimension);
 }
 
+XlaOp Sort(absl::Span<const XlaOp> operands, const XlaComputation& comparator,
+           int64 dimension, bool is_stable) {
+  return operands[0].builder()->Sort(operands, comparator, dimension,
+                                     is_stable);
+}
+
 XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max) {
   return min.builder()->Clamp(min, operand, max);
 }
diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h
index 098efb60f9bdca8306ff771a505f4a225dea9f7d..129e51674293fe7decd041ed05641519a8e8e444 100644
--- a/tensorflow/compiler/xla/client/xla_builder.h
+++ b/tensorflow/compiler/xla/client/xla_builder.h
@@ -56,6 +56,9 @@ class XlaOp {
   }
   ~XlaOp() = default;
 
+  XlaOp(const XlaOp& other) = default;
+  XlaOp& operator=(const XlaOp& other) = default;
+
   // Precondition: !IsUninitialized().
   //
   // It's very common to do foo.builder()->bar().  Without this precondition, if
@@ -197,11 +200,19 @@ class XlaBuilder {
   // status. Note that all ops that have been enqueued will be moved to the
   // computation being returned. The root of the computation will be the last
   // added operation.
-  StatusOr<XlaComputation> Build();
+  //
+  // `remove_dynamic_dimensions` tells the builder whether to remove the
+  // dyanmic dimensions information in all ops.
+  //
+  // TODO(b/121223198): Delete `remove_dynamic_dimensions` and keeps the
+  // dynamic dimensions information when XLA backend can handle dynamic
+  // dimensions.
+  StatusOr<XlaComputation> Build(bool remove_dynamic_dimensions = true);
 
   // Overload of Build which specifies a particular root instruction for the
   // computation.
-  StatusOr<XlaComputation> Build(XlaOp root);
+  StatusOr<XlaComputation> Build(XlaOp root,
+                                 bool remove_dynamic_dimensions = true);
 
   // Builds the computation with the requested operations, or notes an error in
   // the parent XlaBuilder and returns an empty computation if building failed.
@@ -227,6 +238,10 @@ class XlaBuilder {
   // See also set_die_immediately_on_error().
   Status first_error() const { return first_error_; }
 
+  // Returns the current status of the builder, complete with the stack trace
+  // information.
+  Status GetCurrentStatus() const;
+
   // Returns the shape of the given op.
   StatusOr<Shape> GetShape(const XlaOp& op) const;
 
@@ -269,6 +284,10 @@ class XlaBuilder {
   // and its real dynamic size is represented by `dynamic_param_index` in
   // parameter `dynamic_param_num`.
   //
+  // Note that this should be called before the dynamic parameters are used to
+  // create other operations, otherwise created operations won't have the
+  // dynamic dimensions information.
+  //
   // TODO(b/119520625): Remove this API once we have more dynamic shape infra
   // ready.
   Status SetDynamicBinding(int64 dynamic_size_param_num,
@@ -276,9 +295,24 @@ class XlaBuilder {
                            int64 target_param_num,
                            ShapeIndex target_param_index, int64 target_dim_num);
 
+  // Adds a new input/output alias. Since the input/ouput shape information are
+  // not available until the computation is built, and eventual error in the
+  // arguments of this API will be detected only at computation Build() time.
+  void SetUpAlias(const ShapeIndex& output_index, int64 param_number,
+                  const ShapeIndex& param_index) {
+    input_output_aliases_.push_back({output_index, param_number, param_index});
+  }
+
  private:
+  // Describes an input/output alias as inserted by the SetUpAlias() API.
+  struct InputOutputAlias {
+    ShapeIndex output_index;
+    int64 param_number;
+    ShapeIndex param_index;
+  };
+
   // Build helper which takes the id of the root operation..
-  StatusOr<XlaComputation> Build(int64 root_id);
+  StatusOr<XlaComputation> Build(int64 root_id, bool remove_dynamic_dimensions);
 
   // Description for the methods below can be found in the corresponding public
   // functions section in this file.
@@ -288,38 +322,6 @@ class XlaBuilder {
 
   XlaOp ConstantLiteral(const LiteralSlice& literal);
 
-  template <typename NativeT>
-  XlaOp ConstantR0(NativeT value);
-  template <typename NativeT>
-  XlaOp ConstantR1(absl::Span<const NativeT> values);
-  XlaOp ConstantR1(const tensorflow::core::Bitmap& values);
-  template <typename NativeT>
-  XlaOp ConstantR2(
-      std::initializer_list<std::initializer_list<NativeT>> values);
-  template <typename NativeT>
-  XlaOp ConstantFromArrayWithLayout(const Array<NativeT>& values,
-                                    const Layout& layout);
-  template <typename NativeT>
-  XlaOp ConstantFromArray(const Array<NativeT>& values);
-  template <typename NativeT>
-  XlaOp ConstantR2FromArray2DWithLayout(const Array2D<NativeT>& values,
-                                        const Layout& layout);
-  template <typename NativeT>
-  XlaOp ConstantR2FromArray2D(const Array2D<NativeT>& values);
-  template <typename NativeT>
-  XlaOp ConstantR3FromArray3DWithLayout(const Array3D<NativeT>& values,
-                                        const Layout& layout);
-  template <typename NativeT>
-  XlaOp ConstantR3FromArray3D(const Array3D<NativeT>& values);
-  template <typename NativeT>
-  XlaOp ConstantR4FromArray4DWithLayout(const Array4D<NativeT>& values,
-                                        const Layout& layout);
-  template <typename NativeT>
-  XlaOp ConstantR4FromArray4D(const Array4D<NativeT>& values);
-
-  template <typename NativeT>
-  XlaOp ConstantR1(int64 length, NativeT value);
-
   XlaOp Broadcast(const XlaOp& operand,
                   absl::Span<const int64> broadcast_sizes);
 
@@ -344,11 +346,18 @@ class XlaBuilder {
   XlaOp SliceInDim(const XlaOp& operand, int64 start_index, int64 limit_index,
                    int64 stride, int64 dimno);
 
+  ABSL_DEPRECATED("Use span-of-indices form instead")
   XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
                      absl::Span<const int64> slice_sizes);
+  XlaOp DynamicSlice(const XlaOp& operand,
+                     absl::Span<const XlaOp> start_indices,
+                     absl::Span<const int64> slice_sizes);
 
+  ABSL_DEPRECATED("Use span-of-indices form instead")
   XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
                            const XlaOp& start_indices);
+  XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
+                           absl::Span<const XlaOp> start_indices);
 
   XlaOp ConcatInDim(absl::Span<const XlaOp> operands, int64 dimension);
 
@@ -360,24 +369,6 @@ class XlaBuilder {
 
   XlaOp GetTupleElement(const XlaOp& tuple_data, int64 index);
 
-  XlaOp Eq(const XlaOp& lhs, const XlaOp& rhs,
-           absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Ne(const XlaOp& lhs, const XlaOp& rhs,
-           absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Ge(const XlaOp& lhs, const XlaOp& rhs,
-           absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Gt(const XlaOp& lhs, const XlaOp& rhs,
-           absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Lt(const XlaOp& lhs, const XlaOp& rhs,
-           absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Le(const XlaOp& lhs, const XlaOp& rhs,
-           absl::Span<const int64> broadcast_dimensions = {});
-
   XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs,
             const PrecisionConfig* precision_config = nullptr);
 
@@ -387,28 +378,28 @@ class XlaBuilder {
 
   XlaOp Conv(const XlaOp& lhs, const XlaOp& rhs,
              absl::Span<const int64> window_strides, Padding padding,
-             int64 feature_group_count = 1,
+             int64 feature_group_count = 1, int64 batch_group_count = 1,
              const PrecisionConfig* precision_config = nullptr);
 
   XlaOp ConvWithGeneralPadding(
       const XlaOp& lhs, const XlaOp& rhs,
       absl::Span<const int64> window_strides,
       absl::Span<const std::pair<int64, int64>> padding,
-      int64 feature_group_count = 1,
+      int64 feature_group_count = 1, int64 batch_group_count = 1,
       const PrecisionConfig* precision_config = nullptr);
 
   XlaOp ConvWithGeneralDimensions(
       const XlaOp& lhs, const XlaOp& rhs,
       absl::Span<const int64> window_strides, Padding padding,
       const ConvolutionDimensionNumbers& dimension_numbers,
-      int64 feature_group_count = 1,
+      int64 feature_group_count = 1, int64 batch_group_count = 1,
       const PrecisionConfig* precision_config = nullptr);
 
   XlaOp ConvGeneral(const XlaOp& lhs, const XlaOp& rhs,
                     absl::Span<const int64> window_strides,
                     absl::Span<const std::pair<int64, int64>> padding,
                     const ConvolutionDimensionNumbers& dimension_numbers,
-                    int64 feature_group_count = 1,
+                    int64 feature_group_count = 1, int64 batch_group_count = 1,
                     const PrecisionConfig* precision_config = nullptr);
 
   XlaOp ConvGeneralDilated(const XlaOp& lhs, const XlaOp& rhs,
@@ -418,6 +409,7 @@ class XlaBuilder {
                            absl::Span<const int64> rhs_dilation,
                            const ConvolutionDimensionNumbers& dimension_numbers,
                            int64 feature_group_count = 1,
+                           int64 batch_group_count = 1,
                            const PrecisionConfig* precision_config = nullptr);
 
   XlaOp Fft(const XlaOp& operand, FftType fft_type,
@@ -441,50 +433,6 @@ class XlaBuilder {
       const Shape& shape_with_layout, const string& opaque,
       absl::optional<absl::Span<const Shape>> operand_shapes_with_layout);
 
-  XlaOp Complex(const XlaOp& real, const XlaOp& imag,
-                absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Conj(const XlaOp& operand);
-
-  XlaOp Add(const XlaOp& lhs, const XlaOp& rhs,
-            absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Sub(const XlaOp& lhs, const XlaOp& rhs,
-            absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Mul(const XlaOp& lhs, const XlaOp& rhs,
-            absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Div(const XlaOp& lhs, const XlaOp& rhs,
-            absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Rem(const XlaOp& lhs, const XlaOp& rhs,
-            absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Max(const XlaOp& lhs, const XlaOp& rhs,
-            absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Min(const XlaOp& lhs, const XlaOp& rhs,
-            absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp And(const XlaOp& lhs, const XlaOp& rhs,
-            absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Or(const XlaOp& lhs, const XlaOp& rhs,
-           absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Xor(const XlaOp& lhs, const XlaOp& rhs,
-            absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Not(const XlaOp& operand);
-
-  XlaOp ShiftLeft(const XlaOp& lhs, const XlaOp& rhs,
-                  absl::Span<const int64> broadcast_dimensions = {});
-  XlaOp ShiftRightArithmetic(const XlaOp& lhs, const XlaOp& rhs,
-                             absl::Span<const int64> broadcast_dimensions = {});
-  XlaOp ShiftRightLogical(const XlaOp& lhs, const XlaOp& rhs,
-                          absl::Span<const int64> broadcast_dimensions = {});
-
   XlaOp Reduce(const XlaOp& operand, const XlaOp& init_value,
                const XlaComputation& computation,
                absl::Span<const int64> dimensions_to_reduce);
@@ -527,6 +475,8 @@ class XlaBuilder {
       const XlaOp& operand,
       const std::vector<std::pair<int64, int64>>& source_target_pairs);
 
+  XlaOp ReplicaId();
+
   XlaOp SelectAndScatter(const XlaOp& operand, const XlaComputation& select,
                          absl::Span<const int64> window_dimensions,
                          absl::Span<const int64> window_strides,
@@ -541,44 +491,6 @@ class XlaBuilder {
       absl::Span<const std::pair<int64, int64>> padding, const XlaOp& source,
       const XlaOp& init_value, const XlaComputation& scatter);
 
-  XlaOp Abs(const XlaOp& operand);
-
-  XlaOp Atan2(const XlaOp& y, const XlaOp& x,
-              absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp Exp(const XlaOp& operand);
-
-  XlaOp Expm1(const XlaOp& operand);
-
-  XlaOp Floor(const XlaOp& operand);
-
-  XlaOp Ceil(const XlaOp& operand);
-
-  XlaOp Round(const XlaOp& operand);
-
-  XlaOp Log(const XlaOp& operand);
-
-  XlaOp Log1p(const XlaOp& operand);
-
-  XlaOp Sign(const XlaOp& operand);
-
-  XlaOp Clz(const XlaOp& operand);
-
-  XlaOp Cos(const XlaOp& operand);
-
-  XlaOp Sin(const XlaOp& operand);
-
-  XlaOp Tanh(const XlaOp& operand);
-
-  XlaOp Real(const XlaOp& operand);
-
-  XlaOp Imag(const XlaOp& operand);
-
-  XlaOp Pow(const XlaOp& lhs, const XlaOp& rhs,
-            absl::Span<const int64> broadcast_dimensions = {});
-
-  XlaOp IsFinite(const XlaOp& operand);
-
   XlaOp Iota(const Shape& shape, int64 iota_dimension);
 
   XlaOp Iota(PrimitiveType type, int64 size);
@@ -589,14 +501,15 @@ class XlaBuilder {
   XlaOp BitcastConvertType(const XlaOp& operand,
                            PrimitiveType new_element_type);
 
-  XlaOp Neg(const XlaOp& operand);
-
   XlaOp Transpose(const XlaOp& operand, absl::Span<const int64> permutation);
 
   XlaOp Rev(const XlaOp& operand, absl::Span<const int64> dimensions);
 
+  ABSL_DEPRECATED("Use form with comparator computation instead")
   XlaOp Sort(const XlaOp& keys, absl::Span<const XlaOp> values = {},
              int64 dimension = -1);
+  XlaOp Sort(absl::Span<const XlaOp> operands, const XlaComputation& comparator,
+             int64 dimension = -1, bool is_stable = false);
 
   XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max);
 
@@ -711,7 +624,8 @@ class XlaBuilder {
   // operation such as `RngNormal` or `Infeed`. The visitor walks the
   // computation starting at a given operation and sets is_constant to false iff
   // a parameter or stateful operation is encountered.
-  void IsConstantVisitor(const int64 op_handle, std::set<int64>* visited,
+  void IsConstantVisitor(const int64 op_handle,
+                         absl::flat_hash_set<int64>* visited,
                          bool* is_constant) const;
 
   // Checks bounds for convolution parameters.
@@ -729,6 +643,12 @@ class XlaBuilder {
 
   int64 GetNextId() { return ++next_id_; }
 
+  // Populates the module with the input/output alias information stored within
+  // the input_output_aliases vector.
+  static Status PopulateInputOutputAlias(
+      HloModuleProto* module, const ProgramShape& program_shape,
+      const std::vector<InputOutputAlias>& input_output_aliases);
+
   string name_;  // Name to use for the built computation.
 
   // The next sequential ID for every instruction/computation contained within
@@ -748,6 +668,9 @@ class XlaBuilder {
   // Dynamic parameter configuration of this computation.
   DynamicParameterBinding dynamic_parameter_binding_;
 
+  // Holds the input/output alias information populated by the SetUpAlias() API.
+  std::vector<InputOutputAlias> input_output_aliases_;
+
   // A map from XlaOp::Handle to the index in the instructions_ vector where the
   // instruction is held.
   absl::flat_hash_map<int64, int64> handle_to_index_;
@@ -778,48 +701,6 @@ class XlaBuilder {
                          const Shape& shape, const string& name);
   friend XlaOp ConstantLiteral(XlaBuilder* builder,
                                const LiteralSlice& literal);
-  template <typename NativeT>
-  friend XlaOp ConstantR0(XlaBuilder* builder, NativeT value);
-  template <typename NativeT>
-  friend XlaOp ConstantR1(XlaBuilder* builder,
-                          absl::Span<const NativeT> values);
-  friend XlaOp ConstantR1(XlaBuilder* builder,
-                          const tensorflow::core::Bitmap& values);
-  template <typename NativeT>
-  friend XlaOp ConstantR2(
-      XlaBuilder* builder,
-      std::initializer_list<std::initializer_list<NativeT>> values);
-  template <typename NativeT>
-  friend XlaOp ConstantFromArrayWithLayout(XlaBuilder* builder,
-                                           const Array<NativeT>& values,
-                                           const Layout& layout);
-  template <typename NativeT>
-  friend XlaOp ConstantFromArray(XlaBuilder* builder,
-                                 const Array<NativeT>& values);
-  template <typename NativeT>
-  friend XlaOp ConstantR2FromArray2DWithLayout(XlaBuilder* builder,
-                                               const Array2D<NativeT>& values,
-                                               const Layout& layout);
-  template <typename NativeT>
-  friend XlaOp ConstantR2FromArray2D(XlaBuilder* builder,
-                                     const Array2D<NativeT>& values);
-  template <typename NativeT>
-  friend XlaOp ConstantR3FromArray3DWithLayout(XlaBuilder* builder,
-                                               const Array3D<NativeT>& values,
-                                               const Layout& layout);
-  template <typename NativeT>
-  friend XlaOp ConstantR3FromArray3D(XlaBuilder* builder,
-                                     const Array3D<NativeT>& values);
-  template <typename NativeT>
-  friend XlaOp ConstantR4FromArray4DWithLayout(XlaBuilder* builder,
-                                               const Array4D<NativeT>& values,
-                                               const Layout& layout);
-  template <typename NativeT>
-  friend XlaOp ConstantR4FromArray4D(XlaBuilder* builder,
-                                     const Array4D<NativeT>& values);
-
-  template <typename NativeT>
-  friend XlaOp ConstantR1(XlaBuilder* builder, int64 length, NativeT value);
 
   friend XlaOp Broadcast(const XlaOp& operand,
                          absl::Span<const int64> broadcast_sizes);
@@ -849,9 +730,14 @@ class XlaBuilder {
 
   friend XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
                             absl::Span<const int64> slice_sizes);
+  friend XlaOp DynamicSlice(const XlaOp& operand,
+                            absl::Span<const XlaOp> start_indices,
+                            absl::Span<const int64> slice_sizes);
 
   friend XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
                                   const XlaOp& start_indices);
+  friend XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
+                                  absl::Span<const XlaOp> start_indices);
 
   friend XlaOp ConcatInDim(XlaBuilder* builder,
                            absl::Span<const XlaOp> operands, int64 dimension);
@@ -881,23 +767,25 @@ class XlaBuilder {
                           const PrecisionConfig* precision_config);
   friend XlaOp Conv(const XlaOp& lhs, const XlaOp& rhs,
                     absl::Span<const int64> window_strides, Padding padding,
-                    int64 feature_group_count,
+                    int64 feature_group_count, int64 batch_group_count,
                     const PrecisionConfig* precision_config);
   friend XlaOp ConvWithGeneralPadding(
       const XlaOp& lhs, const XlaOp& rhs,
       absl::Span<const int64> window_strides,
       absl::Span<const std::pair<int64, int64>> padding,
-      int64 feature_group_count, const PrecisionConfig* precision_config);
+      int64 feature_group_count, int64 batch_group_count,
+      const PrecisionConfig* precision_config);
   friend XlaOp ConvWithGeneralDimensions(
       const XlaOp& lhs, const XlaOp& rhs,
       absl::Span<const int64> window_strides, Padding padding,
       const ConvolutionDimensionNumbers& dimension_numbers,
-      int64 feature_group_count, const PrecisionConfig* precision_config);
+      int64 feature_group_count, int64 batch_group_count,
+      const PrecisionConfig* precision_config);
   friend XlaOp ConvGeneral(const XlaOp& lhs, const XlaOp& rhs,
                            absl::Span<const int64> window_strides,
                            absl::Span<const std::pair<int64, int64>> padding,
                            const ConvolutionDimensionNumbers& dimension_numbers,
-                           int64 feature_group_count,
+                           int64 feature_group_count, int64 batch_group_count,
                            const PrecisionConfig* precision_config);
   friend XlaOp ConvGeneralDilated(
       const XlaOp& lhs, const XlaOp& rhs,
@@ -906,9 +794,13 @@ class XlaBuilder {
       absl::Span<const int64> lhs_dilation,
       absl::Span<const int64> rhs_dilation,
       const ConvolutionDimensionNumbers& dimension_numbers,
-      int64 feature_group_count, const PrecisionConfig* precision_config);
+      int64 feature_group_count, int64 batch_group_count,
+      const PrecisionConfig* precision_config);
   friend XlaOp Fft(const XlaOp& operand, FftType fft_type,
                    absl::Span<const int64> fft_length);
+  friend XlaOp TriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower,
+                               bool unit_diagonal,
+                               TriangularSolveOptions::Transpose transpose_a);
   friend XlaOp Infeed(XlaBuilder* builder, const Shape& shape,
                       const string& config);
   friend void Outfeed(const XlaOp& operand, const Shape& shape_with_layout,
@@ -987,6 +879,7 @@ class XlaBuilder {
   friend XlaOp CollectivePermute(
       const XlaOp& operand,
       const std::vector<std::pair<int64, int64>>& source_target_pairs);
+  friend XlaOp ReplicaId(XlaBuilder* builder);
   friend XlaOp SelectAndScatter(const XlaOp& operand,
                                 const XlaComputation& select,
                                 absl::Span<const int64> window_dimensions,
@@ -1017,6 +910,8 @@ class XlaBuilder {
   friend XlaOp Tanh(const XlaOp& operand);
   friend XlaOp Real(const XlaOp& operand);
   friend XlaOp Imag(const XlaOp& operand);
+  friend XlaOp Sqrt(const XlaOp& operand);
+  friend XlaOp Rsqrt(const XlaOp& operand);
   friend XlaOp Pow(const XlaOp& lhs, const XlaOp& rhs,
                    absl::Span<const int64> broadcast_dimensions);
   friend XlaOp IsFinite(const XlaOp& operand);
@@ -1033,6 +928,9 @@ class XlaBuilder {
   friend XlaOp Rev(const XlaOp& operand, absl::Span<const int64> dimensions);
   friend XlaOp Sort(const XlaOp& keys, absl::Span<const XlaOp> values,
                     int64 dimension);
+  friend XlaOp Sort(absl::Span<const XlaOp> operands,
+                    const XlaComputation& comparator, int64 dimension,
+                    bool is_stable);
   friend XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max);
   friend XlaOp Map(XlaBuilder* builder, absl::Span<const XlaOp> operands,
                    const XlaComputation& computation,
@@ -1290,10 +1188,15 @@ XlaOp SliceInDim(const XlaOp& operand, int64 start_index, int64 limit_index,
 // The size of the slice in each dimension is passed in 'slice_sizes',
 // which specify the end point of exclusive slice intervals in each
 // dimension [start, start + size).
-// The shape of 'start_indices' must be rank == 1, with dimension size
-// equal to the rank of the 'operand'.
+// The shape of each element of 'start_indices' must be scalar, with the span
+// size equal to the rank of the 'operand'. All elements of 'start_indices' must
+// have the same shape.
 // Slice index calculations are computed modulo input dimension sizes to
 // prevent dynamic start indices from generating out-of-bound array accesses.
+XlaOp DynamicSlice(const XlaOp& operand, absl::Span<const XlaOp> start_indices,
+                   absl::Span<const int64> slice_sizes);
+
+ABSL_DEPRECATED("Use span-of-indices form instead")
 XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
                    absl::Span<const int64> slice_sizes);
 
@@ -1309,10 +1212,15 @@ XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
 //   [4 5 6]  => DynamicUpdateslice(data, update, start)   => [4 10 11]
 //   [7 8 9]                                                  [7 8  9 ]
 //
-// The shape of 'start_indices' must be rank == 1, with dimension size
-// equal to the rank of the 'operand'.
+// The shape of each element of 'start_indices' must be scalar, with the span
+// size equal to the rank of the 'operand'. All elements of 'start_indices' must
+// have the same shape.
 // Slice index calculations are computed modulo update dimension sizes to
 // prevent dynamic start indices from generating out-of-bound array accesses.
+XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
+                         absl::Span<const XlaOp> start_indices);
+
+ABSL_DEPRECATED("Use span-of-indices form instead")
 XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
                          const XlaOp& start_indices);
 
@@ -1372,7 +1280,7 @@ XlaOp DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
 // default convolution dimension numbers.
 XlaOp Conv(const XlaOp& lhs, const XlaOp& rhs,
            absl::Span<const int64> window_strides, Padding padding,
-           int64 feature_group_count = 1,
+           int64 feature_group_count = 1, int64 batch_group_count = 1,
            const PrecisionConfig* precision_config = nullptr);
 
 // Enqueues a convolution instruction onto the computation, with the caller
@@ -1381,6 +1289,7 @@ XlaOp ConvWithGeneralPadding(const XlaOp& lhs, const XlaOp& rhs,
                              absl::Span<const int64> window_strides,
                              absl::Span<const std::pair<int64, int64>> padding,
                              int64 feature_group_count = 1,
+                             int64 batch_group_count = 1,
                              const PrecisionConfig* precision_config = nullptr);
 
 // Enqueues a convolution instruction onto the computation, with the caller
@@ -1388,7 +1297,7 @@ XlaOp ConvWithGeneralPadding(const XlaOp& lhs, const XlaOp& rhs,
 XlaOp ConvWithGeneralDimensions(
     const XlaOp& lhs, const XlaOp& rhs, absl::Span<const int64> window_strides,
     Padding padding, const ConvolutionDimensionNumbers& dimension_numbers,
-    int64 feature_group_count = 1,
+    int64 feature_group_count = 1, int64 batch_group_count = 1,
     const PrecisionConfig* precision_config = nullptr);
 
 // Enqueues a convolution instruction onto the computation, with the caller
@@ -1397,7 +1306,7 @@ XlaOp ConvGeneral(const XlaOp& lhs, const XlaOp& rhs,
                   absl::Span<const int64> window_strides,
                   absl::Span<const std::pair<int64, int64>> padding,
                   const ConvolutionDimensionNumbers& dimension_numbers,
-                  int64 feature_group_count = 1,
+                  int64 feature_group_count = 1, int64 batch_group_count = 1,
                   const PrecisionConfig* precision_config = nullptr);
 
 // Enqueues a convolution instruction onto the computation, with the caller
@@ -1409,6 +1318,7 @@ XlaOp ConvGeneralDilated(const XlaOp& lhs, const XlaOp& rhs,
                          absl::Span<const int64> rhs_dilation,
                          const ConvolutionDimensionNumbers& dimension_numbers,
                          int64 feature_group_count = 1,
+                         int64 batch_group_count = 1,
                          const PrecisionConfig* precision_config = nullptr);
 
 // Enqueues an FFT instruction onto the computation, of the given type and
@@ -1416,6 +1326,32 @@ XlaOp ConvGeneralDilated(const XlaOp& lhs, const XlaOp& rhs,
 XlaOp Fft(const XlaOp& operand, FftType fft_type,
           absl::Span<const int64> fft_length);
 
+// Solves systems of linear equations with lower or upper triangular coefficient
+// matrices by forward- or back-substitution. Broadcasting along leading
+// dimensions, this routine solves for x in one of the matrix systems
+//   `op(a) * x = b`,  or `x * op(a) = b`,
+// for the variable `x` given `a` and `b`, where `op(a)` is either
+//   `op(a) = a`,  or `op(a) = transpose(a)`,  or `op(a) = conj(transpose(a))`.
+//
+// * `a` is a tensor of shape `[..., M, M]` whose innermost 2 dimensions form
+//   square matrices. If `lower` is true (false), then the strictly upper
+//   (lower) triangular part of each innermost matrix in `a` is assumed to be
+//   zero and is not accessed.
+// * `b` is a tensor of shape `[..., M, K]` if `left_side` is true, otherwise a
+//   tensor of shape `[..., K, M]`.
+// * `left_side` is a boolean, indicating whether to solve a system of the form
+//   op(a) * x = b (true) or x * op(a) = b (false).
+// * `lower` is a boolean, indicating whether the argument `a` is
+// lower-triangular
+//   (true) or upper-triangular (false).
+// * If `unit_diagonal` is true, the diagonal elements of `a` are assumed to be
+//   1 and not accessed.
+// * `transpose_a` indicates which function `op` we use to transform the tensor
+//   `a`: the identity function, transpose(a), or conjugate(transpose(a))
+XlaOp TriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower,
+                      bool unit_diagonal,
+                      TriangularSolveOptions::Transpose transpose_a);
+
 // Enqueues an infeed instruction onto the computation, which writes data of
 // the given shape to the infeed buffer of the device.
 XlaOp Infeed(XlaBuilder* builder, const Shape& shape,
@@ -1515,9 +1451,33 @@ XlaOp Min(const XlaOp& lhs, const XlaOp& rhs,
 XlaOp And(const XlaOp& lhs, const XlaOp& rhs,
           absl::Span<const int64> broadcast_dimensions = {});
 
+// Overload to call And with 3 or more operands.  We need the following somewhat
+// convoluted overload set to disambiguate with the overload that takes the
+// `broadcast_dimensions` optional param.
+inline XlaOp And(const XlaOp& op1, const XlaOp& op2, const XlaOp& op3) {
+  return And(op1, And(op2, op3));
+}
+template <typename... XlaOpTs>
+XlaOp And(const XlaOp& op1, const XlaOp& op2, const XlaOp& op3,
+          const XlaOpTs&... operands) {
+  return And(op1, And(op2, And(op3, operands...)));
+}
+
 XlaOp Or(const XlaOp& lhs, const XlaOp& rhs,
          absl::Span<const int64> broadcast_dimensions = {});
 
+// Overload to call Or with 3 or more operands.  As with `And`, we need the
+// following complicated overload set to handle the default arg in the `Or`
+// overload above.
+inline XlaOp Or(const XlaOp& op1, const XlaOp& op2, const XlaOp& op3) {
+  return Or(op1, Or(op2, op3));
+}
+template <typename... XlaOpTs>
+XlaOp Or(const XlaOp& op1, const XlaOp& op2, const XlaOp& op3,
+         const XlaOpTs&... operands) {
+  return Or(op1, Or(op2, Or(op3, operands...)));
+}
+
 XlaOp Xor(const XlaOp& lhs, const XlaOp& rhs,
           absl::Span<const int64> broadcast_dimensions = {});
 
@@ -1610,6 +1570,9 @@ XlaOp CollectivePermute(
     const XlaOp& operand,
     const std::vector<std::pair<int64, int64>>& source_target_pairs);
 
+// Enqueues an operation that returns the replica ID.
+XlaOp ReplicaId(XlaBuilder* builder);
+
 // Enqueues an operation that scatters the `source` array to the selected
 // indices of each window.
 XlaOp SelectAndScatter(const XlaOp& operand, const XlaComputation& select,
@@ -1677,14 +1640,24 @@ XlaOp Real(const XlaOp& operand);
 // Enqueues an imaginary-part instruction onto the computation.
 XlaOp Imag(const XlaOp& operand);
 
+// Enqueues a sqrt computation onto the computation.
+XlaOp Sqrt(const XlaOp& operand);
+
+// Enqueues a rsqrt computation onto the computation.
+XlaOp Rsqrt(const XlaOp& operand);
+
 // Enqueues a lhs^rhs computation onto the computation.
 XlaOp Pow(const XlaOp& lhs, const XlaOp& rhs,
           absl::Span<const int64> broadcast_dimensions = {});
 
-// Enqueues an operator that tests if the operand's values are finite, i.e.,
-// not Inf or NaN. Defined only for floating-point types. Returns an array of
-// booleans with the same shape where entries are true iff the corresponding
-// entry was NaN.
+// Enqueues an operator that tests if the operand's values are finite, i.e., not
+// +/-Inf or NaN.  Returns an array of booleans with the same shape where
+// entries are true iff the corresponding entry was not infinite or NaN.
+//
+// Defined only for real-valued (i.e. not complex) floating-point types; raises
+// an error for other types.
+//
+// See also IsInf, IsPosInf, IsNegInf, and IsNan in lib/math.h.
 XlaOp IsFinite(const XlaOp& operand);
 
 // Enqueues an iota operation onto the computation.
@@ -1720,7 +1693,7 @@ XlaOp Rev(const XlaOp& operand, absl::Span<const int64> dimensions);
 // of keys, in ascending order.
 // * If the keys have higher rank, the keys are sorted along the provided
 // dimension. For example, for a rank-2 tensor (a matrix) of keys, a dimension
-// value of 0 will indepenently sort every column, and a dimension value of 1
+// value of 0 will independently sort every column, and a dimension value of 1
 // will independently sort each row. If no dimension number is provided, then
 // the last dimension is chosen by default.
 //
@@ -1730,9 +1703,39 @@ XlaOp Rev(const XlaOp& operand, absl::Span<const int64> dimensions);
 // * The result is a tuple that consists of a sorted tensor of keys (along the
 // provided dimension, as above) as the first element, and tensors with their
 // corresponding values as the other elements.
+ABSL_DEPRECATED("Use form with comparator computation instead")
 XlaOp Sort(const XlaOp& keys, absl::Span<const XlaOp> values = {},
            int64 dimension = -1);
 
+// Enqueues a sort instruction onto the computation, using 'comparator' for
+// comparisons. 'comparator' needs to define a strict weak order. 'is_stable'
+// determines whether the stable sorting should be used.
+// If only one operand is provided:
+// * If the operand is a rank-1 tensor (an array), the result is a sorted array.
+//   The resulting sorting order has the property that for all index positions
+//   i, j with i < j, either
+//   comparator(value[i], value[j]) = comparator(value[j], value[i]) = false or
+//   comparator(value[i], value[j]) = true.
+// * If the operand has higher rank, the operand is sorted along the provided
+//   dimension. For example, for a rank-2 tensor (a matrix), a dimension value
+//   of 0 will independently sort every column, and a dimension value of 1 will
+//   independently sort each row. If no dimension number is provided, then the
+//   last dimension is chosen by default. For the dimension which is sorted, the
+//   same sorting order applies as in the rank-1 case.
+//
+// If more than one operand is provided:
+// * All operands must be tensors with the same dimensions. The element types of
+//   the tensors may be different.
+// * The result is a tuple that consists of the operands in sorted order (along
+//   the provided dimension, as above). The same permutation as implied by the
+//   comparison computation is applied to all operand tensors. When comparing
+//   two index positions, 'comparator' is called with 2 * n scalar parameters,
+//   where parameter 2 * i and 2 * i + 1 correspond to the value of operand i at
+//   two index positions.
+// Default comparator computations can be found in lib/comparators.h
+XlaOp Sort(absl::Span<const XlaOp> operands, const XlaComputation& comparator,
+           int64 dimension = -1, bool is_stable = false);
+
 // Enqueues a clamp instruction onto the computation.
 XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max);
 
@@ -1871,81 +1874,6 @@ XlaOp GetDimensionSize(const XlaOp& operand, int64 dimension);
 // Implementation details below this point.
 //
 
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR0(NativeT value) {
-  return ConstantLiteral(LiteralUtil::CreateR0<NativeT>(value));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR1(absl::Span<const NativeT> values) {
-  return ConstantLiteral(LiteralUtil::CreateR1<NativeT>(values));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR1(int64 length, NativeT value) {
-  Literal literal(ShapeUtil::MakeShape(
-      primitive_util::NativeToPrimitiveType<NativeT>(), {length}));
-  literal.PopulateWithValue(value);
-  return ConstantLiteral(literal);
-}
-
-inline XlaOp XlaBuilder::ConstantR1(const tensorflow::core::Bitmap& values) {
-  return ConstantLiteral(LiteralUtil::CreateR1(values));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR2(
-    std::initializer_list<std::initializer_list<NativeT>> values) {
-  return ConstantLiteral(LiteralUtil::CreateR2<NativeT>(values));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantFromArrayWithLayout(const Array<NativeT>& values,
-                                              const Layout& layout) {
-  return ConstantLiteral(
-      LiteralUtil::CreateFromArrayWithLayout<NativeT>(values, layout));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantFromArray(const Array<NativeT>& values) {
-  return ConstantLiteral(LiteralUtil::CreateFromArray<NativeT>(values));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR2FromArray2DWithLayout(
-    const Array2D<NativeT>& values, const Layout& layout) {
-  return ConstantLiteral(
-      LiteralUtil::CreateFromArrayWithLayout<NativeT>(values, layout));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR2FromArray2D(const Array2D<NativeT>& values) {
-  return ConstantLiteral(LiteralUtil::CreateR2FromArray2D<NativeT>(values));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR3FromArray3DWithLayout(
-    const Array3D<NativeT>& values, const Layout& layout) {
-  return ConstantLiteral(
-      LiteralUtil::CreateR3FromArray3DWithLayout<NativeT>(values, layout));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR3FromArray3D(const Array3D<NativeT>& values) {
-  return ConstantFromArray(values);
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR4FromArray4DWithLayout(
-    const Array4D<NativeT>& values, const Layout& layout) {
-  return ConstantFromArrayWithLayout(values, layout);
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR4FromArray4D(const Array4D<NativeT>& values) {
-  return ConstantFromArray(values);
-}
-
 // Free function template implementations.
 
 template <typename NativeT>
diff --git a/tensorflow/compiler/xla/client/xla_builder_test.cc b/tensorflow/compiler/xla/client/xla_builder_test.cc
index b3f5be300d3f15397ad33858a6a9cab5f6029688..c9fa738a19d0928d56ac4b98beb5fc0ed195518b 100644
--- a/tensorflow/compiler/xla/client/xla_builder_test.cc
+++ b/tensorflow/compiler/xla/client/xla_builder_test.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace xla {
@@ -39,7 +40,8 @@ using ::testing::HasSubstr;
 class XlaBuilderTest : public ::testing::Test {
  protected:
   StatusOr<std::unique_ptr<HloModule>> BuildHloModule(XlaBuilder* b) {
-    TF_ASSIGN_OR_RETURN(XlaComputation computation, b->Build());
+    TF_ASSIGN_OR_RETURN(XlaComputation computation,
+                        b->Build(/*remove_dynamic_dimensions=*/false));
     const HloModuleProto& proto = computation.proto();
     TF_ASSIGN_OR_RETURN(const auto& config,
                         HloModule::CreateModuleConfigFromProto(
@@ -50,7 +52,8 @@ class XlaBuilderTest : public ::testing::Test {
   // Overload which explicitly specifies the root instruction.
   StatusOr<std::unique_ptr<HloModule>> BuildHloModule(XlaBuilder* b,
                                                       XlaOp root) {
-    TF_ASSIGN_OR_RETURN(XlaComputation computation, b->Build(root));
+    TF_ASSIGN_OR_RETURN(XlaComputation computation,
+                        b->Build(root, /*remove_dynamic_dimensions=*/false));
     const HloModuleProto& proto = computation.proto();
     TF_ASSIGN_OR_RETURN(const auto& config,
                         HloModule::CreateModuleConfigFromProto(
@@ -132,6 +135,38 @@ TEST_F(XlaBuilderTest, BinaryOperatorsBuildExpectedHLO) {
       op::ShiftRightLogical(op::Constant(), op::Constant()));
 }
 
+TEST_F(XlaBuilderTest, VariadicAnd) {
+  XlaBuilder b(TestName());
+  Shape s = ShapeUtil::MakeShape(PRED, {});
+  And(Parameter(&b, 0, s, "p0"), Parameter(&b, 1, s, "p1"),
+      Parameter(&b, 2, s, "p2"));
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  // Don't specify in the test whether And(x, y, z) is right- or
+  // left-associative; accept either one.
+  EXPECT_THAT(
+      module->entry_computation()->root_instruction(),
+      ::testing::AnyOf(op::And(op::Parameter(0),
+                               op::And(op::Parameter(1), op::Parameter(2))),
+                       op::And(op::And(op::Parameter(0), op::Parameter(1)),
+                               op::Parameter(2))));
+}
+
+TEST_F(XlaBuilderTest, VariadicOr) {
+  XlaBuilder b(TestName());
+  Shape s = ShapeUtil::MakeShape(PRED, {});
+  Or(Parameter(&b, 0, s, "p0"), Parameter(&b, 1, s, "p1"),
+     Parameter(&b, 2, s, "p2"));
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  // Don't specify in the test whether Or(x, y, z) is right- or
+  // left-associative; accept either one.
+  EXPECT_THAT(
+      module->entry_computation()->root_instruction(),
+      ::testing::AnyOf(
+          op::Or(op::Parameter(0), op::Or(op::Parameter(1), op::Parameter(2))),
+          op::Or(op::Or(op::Parameter(0), op::Parameter(1)),
+                 op::Parameter(2))));
+}
+
 TEST_F(XlaBuilderTest, ShiftRightOperatorOnNonIntegerProducesError) {
   XlaBuilder b(TestName());
   ConstantR0<float>(&b, 1) >> ConstantR0<float>(&b, 2);
@@ -446,6 +481,461 @@ TEST_F(XlaBuilderTest, ProtoMatches) {
   EXPECT_EQ(c0_string, c1_string);
 }
 
+TEST_F(XlaBuilderTest, DynamicParameter) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {5}), ShapeUtil::MakeShape(F32, {6})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  Parameter(&b, 1, ShapeUtil::MakeShape(U32, {}), "p1");
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/1,
+                                   /*dynamic_size_param_index=*/{},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{1},
+                                   /*target_dim_num=*/0));
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b, /*root=*/p0));
+  const Shape& param_shape = module->entry_computation()
+                                 ->parameter_instruction(0)
+                                 ->shape()
+                                 .tuple_shapes(1);
+  EXPECT_TRUE(param_shape.is_dynamic_dimension(0));
+}
+
+TEST_F(XlaBuilderTest, DynamicUnary) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {5}), ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{1},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/0));
+  auto gte = GetTupleElement(p0, 0);
+  Neg(gte);
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(result_shape.is_dynamic_dimension(0));
+}
+
+TEST_F(XlaBuilderTest, DynamicBinary) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {5}), ShapeUtil::MakeShape(F32, {5}),
+       ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{2},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/0));
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{2},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{1},
+                                   /*target_dim_num=*/0));
+  auto gte0 = GetTupleElement(p0, 0);
+  auto gte1 = GetTupleElement(p0, 1);
+  Add(gte0, gte1);
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(result_shape.is_dynamic_dimension(0));
+}
+
+TEST_F(XlaBuilderTest, DynamicBinaryHasBroadcast) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {5, 4}), ShapeUtil::MakeShape(F32, {5}),
+       ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{2},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/0));
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{2},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{1},
+                                   /*target_dim_num=*/0));
+  auto gte0 = GetTupleElement(p0, 0);
+  auto gte1 = GetTupleElement(p0, 1);
+  Add(gte0, gte1, {0});
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(ContainersEqual(result_shape.dynamic_dimensions(), {true, false}))
+      << result_shape;
+}
+
+TEST_F(XlaBuilderTest, DynamicBroadcast) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {5, 4}), ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{1},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/0));
+  auto gte = GetTupleElement(p0, 0);
+  BroadcastInDim(gte, /*out_dim_size=*/{3, 5, 4},
+                 /*broadcast_dimensions=*/{1, 2});
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(
+      ContainersEqual(result_shape.dynamic_dimensions(), {false, true, false}))
+      << result_shape;
+}
+
+TEST_F(XlaBuilderTest, DynamicBinaryHasDegenerateBroadcast) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {10}), ShapeUtil::MakeShape(F32, {1, 15}),
+       ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{1},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/0));
+  auto gte0 = GetTupleElement(p0, 0);
+  auto gte1 = GetTupleElement(p0, 1);
+  Add(gte0, gte1, /*broadcast_dimensions=*/{0});  // f32[<=10, 15]
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(ContainersEqual(result_shape.dynamic_dimensions(), {true, false}))
+      << result_shape;
+}
+
+TEST_F(XlaBuilderTest, DynamicSelectOnlyPredDynamic) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(PRED, {10}), ShapeUtil::MakeShape(F32, {10}),
+       ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{1},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/0));
+  auto gte0 = GetTupleElement(p0, 0);
+  auto gte1 = GetTupleElement(p0, 1);
+
+  Select(gte0, gte1, gte1);
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(ContainersEqual(result_shape.dynamic_dimensions(), {true}))
+      << result_shape;
+}
+
+TEST_F(XlaBuilderTest, DynamicPad) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {5, 4}), ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  auto pad_val = ConstantR0<float>(&b, -1);
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{1},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/0));
+  auto gte = GetTupleElement(p0, 0);
+  PaddingConfig padding_config;
+  for (int i = 0; i < 2; i++) {
+    auto dimension = padding_config.add_dimensions();
+    dimension->set_edge_padding_low(0);
+    dimension->set_edge_padding_high(0);
+    dimension->set_interior_padding(0);
+  }
+  Pad(gte, pad_val, padding_config);
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(ContainersEqual(result_shape.dynamic_dimensions(), {true, false}))
+      << result_shape;
+}
+
+TEST_F(XlaBuilderTest, DynamicConvolution) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {1, 2, 2, 128}),
+       ShapeUtil::MakeShape(F32, {2, 2, 128, 8}), ShapeUtil::MakeShape(U32, {}),
+       ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{2},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/0));
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{3},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{1},
+                                   /*target_dim_num=*/2));
+  auto input = GetTupleElement(p0, 0);
+  auto filter = GetTupleElement(p0, 1);
+  ConvolutionDimensionNumbers dnums;
+  dnums.set_input_batch_dimension(0);
+  dnums.set_output_batch_dimension(0);
+  dnums.add_input_spatial_dimensions(1);
+  dnums.add_output_spatial_dimensions(1);
+  dnums.add_input_spatial_dimensions(2);
+  dnums.add_output_spatial_dimensions(2);
+  dnums.set_input_feature_dimension(3);
+  dnums.set_output_feature_dimension(3);
+  dnums.add_kernel_spatial_dimensions(0);
+  dnums.add_kernel_spatial_dimensions(1);
+  dnums.set_kernel_input_feature_dimension(2);
+  dnums.set_kernel_output_feature_dimension(3);
+  ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                            /*feature_group_count=*/1);
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(ContainersEqual(result_shape.dynamic_dimensions(),
+                              {true, false, false, false}))
+      << result_shape;
+}
+
+TEST_F(XlaBuilderTest, DynamicDot) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {2, 3, 4}),
+       ShapeUtil::MakeShape(F32, {2, 4, 5}), ShapeUtil::MakeShape(U32, {}),
+       ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{2},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/0));
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{2},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{1},
+                                   /*target_dim_num=*/0));
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{3},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/1));
+
+  auto lhs = GetTupleElement(p0, 0);
+  auto rhs = GetTupleElement(p0, 1);
+  DotDimensionNumbers dnums;
+  dnums.add_lhs_contracting_dimensions(2);
+  dnums.add_rhs_contracting_dimensions(1);
+  dnums.add_lhs_batch_dimensions(0);
+  dnums.add_rhs_batch_dimensions(0);
+  DotGeneral(lhs, rhs, dnums);
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(
+      ContainersEqual(result_shape.dynamic_dimensions(), {true, true, false}))
+      << result_shape;
+}
+
+TEST_F(XlaBuilderTest, DynamicReduce) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {5, 4, 3}), ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  auto init = ConstantR0<float>(&b, 0);
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{1},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/1));
+  auto gte = GetTupleElement(p0, 0);
+  XlaBuilder bsum(TestName());
+  Add(Parameter(&bsum, 0, ShapeUtil::MakeShape(F32, {}), "x"),
+      Parameter(&bsum, 1, ShapeUtil::MakeShape(F32, {}), "y"));
+  TF_ASSERT_OK_AND_ASSIGN(auto sum, bsum.Build());
+  Reduce(gte, init, sum, {0});
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(ContainersEqual(result_shape.dynamic_dimensions(), {true, false}))
+      << result_shape;
+}
+
+TEST_F(XlaBuilderTest, DynamicReduceWindow) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {2, 4, 8}), ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  auto init = ConstantR0<float>(&b, 0.f);
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{1},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/0));
+  auto gte = GetTupleElement(p0, 0);
+  XlaBuilder bsum(TestName());
+  Add(Parameter(&bsum, 0, ShapeUtil::MakeShape(F32, {}), "x"),
+      Parameter(&bsum, 1, ShapeUtil::MakeShape(F32, {}), "y"));
+  TF_ASSERT_OK_AND_ASSIGN(auto sum, bsum.Build());
+  ReduceWindow(gte, init, sum, /*window_dimensions=*/{1, 2, 4},
+               /*window_strides=*/{1, 1, 1}, Padding::kValid);
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(
+      ContainersEqual(result_shape.dynamic_dimensions(), {true, false, false}))
+      << result_shape;
+}
+
+TEST_F(XlaBuilderTest, DynamicSelectAndScatter) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {2, 4, 8}),
+       ShapeUtil::MakeShape(F32, {2, 2, 2}), ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  auto init = ConstantR0<float>(&b, 0.f);
+  XlaBuilder bsum(TestName());
+  Add(Parameter(&bsum, 0, ShapeUtil::MakeShape(F32, {}), "x"),
+      Parameter(&bsum, 1, ShapeUtil::MakeShape(F32, {}), "y"));
+  TF_ASSERT_OK_AND_ASSIGN(auto sum, bsum.Build());
+  XlaBuilder bge(TestName());
+  Ge(Parameter(&bge, 0, ShapeUtil::MakeShape(F32, {}), "x"),
+     Parameter(&bge, 1, ShapeUtil::MakeShape(F32, {}), "y"));
+  TF_ASSERT_OK_AND_ASSIGN(auto ge, bge.Build());
+
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{2},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/0));
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{2},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{1},
+                                   /*target_dim_num=*/0));
+  auto gte0 = GetTupleElement(p0, 0);
+  auto source = GetTupleElement(p0, 1);
+  SelectAndScatter(gte0, ge, {1, 2, 4}, {1, 2, 4}, Padding::kValid, source,
+                   init, sum);
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(
+      ContainersEqual(result_shape.dynamic_dimensions(), {true, false, false}))
+      << result_shape;
+}
+
+TEST_F(XlaBuilderTest, DynamicReshape) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {2, 3, 4, 5, 6}),
+       ShapeUtil::MakeShape(U32, {}), ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{1},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/2));
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{2},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/3));
+  auto gte = GetTupleElement(p0, 0);  // f32[2, 3, <=4, <=5, 6]
+  Reshape(gte, /*new_sizes=*/{6, 4, 1, 5, 2, 3});
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(result_shape.is_dynamic_dimension(1));
+  EXPECT_TRUE(result_shape.is_dynamic_dimension(3));
+  EXPECT_TRUE(ContainersEqual(result_shape.dynamic_dimensions(),
+                              {false, true, false, true, false, false}))
+      << result_shape;
+}
+
+TEST_F(XlaBuilderTest, DynamicSelect) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {4, 5, 6}),
+       ShapeUtil::MakeShape(F32, {4, 5, 6}), ShapeUtil::MakeShape(U32, {}),
+       ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  auto pred = Parameter(&b, 1, ShapeUtil::MakeShape(PRED, {}), "pred");
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{2},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/1));
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{3},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{1},
+                                   /*target_dim_num=*/1));
+  auto gte0 = GetTupleElement(p0, 0);
+  auto gte1 = GetTupleElement(p0, 1);
+  Select(pred, gte0, gte1);
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(result_shape.is_dynamic_dimension(1));
+  EXPECT_FALSE(result_shape.is_dynamic_dimension(2));
+  EXPECT_TRUE(
+      ContainersEqual(result_shape.dynamic_dimensions(), {false, true, false}))
+      << result_shape;
+}
+
+TEST_F(XlaBuilderTest, DynamicSelectNotCompatible) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {4, 5, 6}),
+       ShapeUtil::MakeShape(F32, {4, 5, 6}), ShapeUtil::MakeShape(U32, {}),
+       ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  auto pred = Parameter(&b, 1, ShapeUtil::MakeShape(PRED, {}), "pred");
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{2},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/1));
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{3},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{1},
+                                   /*target_dim_num=*/2));
+  auto gte0 = GetTupleElement(p0, 0);  // f32[4,<=5,6]
+  auto gte1 = GetTupleElement(p0, 1);  // f32[4,5,<=6]
+  Select(pred, gte0, gte1);
+  Status status = BuildHloModule(&b).status();
+  ASSERT_IS_NOT_OK(status);
+  EXPECT_THAT(status.error_message(),
+              ::testing::HasSubstr("Operands to select must be the same shape; "
+                                   "got f32[4,<=5,6] and f32[4,5,<=6]"));
+}
+
+TEST_F(XlaBuilderTest, DynamicTranspose) {
+  XlaBuilder b(TestName());
+  Shape tuple_param_shape = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {3, 5}), ShapeUtil::MakeShape(U32, {})});
+  auto p0 = Parameter(&b, 0, tuple_param_shape, "p0");
+  ASSERT_IS_OK(b.SetDynamicBinding(/*dynamic_size_param_num=*/0,
+                                   /*dynamic_size_param_index=*/{1},
+                                   /*target_param_num=*/0,
+                                   /*target_param_index=*/{0},
+                                   /*target_dim_num=*/0));
+  auto gte = GetTupleElement(p0, 0);
+  Transpose(gte, /*permutation=*/{1, 0});
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
+  const Shape& result_shape =
+      module->entry_computation()->root_instruction()->shape();
+  EXPECT_TRUE(ContainersEqual(result_shape.dynamic_dimensions(), {false, true}))
+      << result_shape;
+}
+
 TEST_F(XlaBuilderTest, AfterAllWithNonTokenOperands) {
   XlaBuilder b(TestName());
   AfterAll(&b, {CreateToken(&b), ConstantR0<float>(&b, 1.0)});
@@ -455,5 +945,31 @@ TEST_F(XlaBuilderTest, AfterAllWithNonTokenOperands) {
               ::testing::HasSubstr("All operands to AfterAll must be tokens"));
 }
 
+TEST_F(XlaBuilderTest, CheckInputOutputAlias) {
+  XlaBuilder b(TestName());
+  auto p0 = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {8, 4}), "p0");
+  auto p1 = Parameter(&b, 1, ShapeUtil::MakeShape(F32, {8, 4}), "p1");
+  auto add = Add(p0, p1);
+  auto sub = Sub(p0, p1);
+  auto root = Tuple(&b, {add, sub});
+
+  b.SetUpAlias({1}, 0, {});
+  b.SetUpAlias({0}, 1, {});
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b, root));
+
+  const HloInputOutputAliasConfig& config = module->input_output_alias_config();
+  EXPECT_TRUE(config.ParameterHasAlias(0, {}));
+  EXPECT_TRUE(config.ParameterHasAlias(1, {}));
+
+  auto alias_p0 = config.GetAliasedOutput(0, {});
+  ASSERT_TRUE(alias_p0.has_value());
+  EXPECT_EQ(*alias_p0, ShapeIndex({1}));
+
+  auto alias_p1 = config.GetAliasedOutput(1, {});
+  ASSERT_TRUE(alias_p1.has_value());
+  EXPECT_EQ(*alias_p1, ShapeIndex({0}));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/debug_options_flags.cc b/tensorflow/compiler/xla/debug_options_flags.cc
index 20609cad58d920c0c272899c41efeb99d23cd490..43d9ee0d9a5e689676b00e59d7c59bb0f4e37461 100644
--- a/tensorflow/compiler/xla/debug_options_flags.cc
+++ b/tensorflow/compiler/xla/debug_options_flags.cc
@@ -22,49 +22,49 @@ limitations under the License.
 #include "tensorflow/compiler/xla/parse_flags_from_env.h"
 
 namespace xla {
-namespace {
 
-DebugOptions* flag_values;
-std::vector<tensorflow::Flag>* flag_objects;
-std::once_flag flags_init;
-
-void SetDebugOptionsDefaults(DebugOptions* flags) {
-  flags->set_xla_llvm_enable_alias_scope_metadata(true);
-  flags->set_xla_llvm_enable_noalias_metadata(true);
-  flags->set_xla_llvm_enable_invariant_load_metadata(true);
-  flags->set_xla_llvm_disable_expensive_passes(false);
-  flags->set_xla_backend_optimization_level(3);
-  flags->set_xla_cpu_multi_thread_eigen(true);
-  flags->set_xla_gpu_cuda_data_dir("./cuda_sdk_lib");
-  flags->set_xla_eliminate_hlo_implicit_broadcast(true);
+DebugOptions DefaultDebugOptionsIgnoringFlags() {
+  DebugOptions opts;
+  opts.set_xla_llvm_enable_alias_scope_metadata(true);
+  opts.set_xla_llvm_enable_noalias_metadata(true);
+  opts.set_xla_llvm_enable_invariant_load_metadata(true);
+  opts.set_xla_llvm_disable_expensive_passes(false);
+  opts.set_xla_backend_optimization_level(3);
+  opts.set_xla_cpu_multi_thread_eigen(true);
+  opts.set_xla_gpu_cuda_data_dir("./cuda_sdk_lib");
+  opts.set_xla_eliminate_hlo_implicit_broadcast(true);
+  opts.set_xla_hlo_dump_as_html(false);
 #ifdef INTEL_MKL
-  flags->set_xla_cpu_use_mkl_dnn(true);
+  opts.set_xla_cpu_use_mkl_dnn(true);
 #endif  // INTEL_MKL
-  flags->set_xla_gpu_max_kernel_unroll_factor(4);
+  opts.set_xla_gpu_max_kernel_unroll_factor(4);
   // Set cudnn batchnorm off by default; it does not provide a performance win
   // on average.
-  flags->set_xla_gpu_use_cudnn_batchnorm(false);
+  opts.set_xla_gpu_use_cudnn_batchnorm(false);
 
   // Run all GPU work on one stream by default.  Using multiple streams
   // increases memory usage and we lack strong motivating benchmarks for tuning
   // the heuristics needed to decide when to run on multiple streams.  See
   // b/77879207.
-  flags->set_xla_gpu_disable_multi_streaming(true);
+  opts.set_xla_gpu_disable_multi_streaming(true);
 
   // TODO(jlebar): Disable fastmath once doing so is not a performance
   // regression.
-  flags->set_xla_cpu_enable_fast_math(true);
-  flags->set_xla_gpu_enable_fast_min_max(true);
+  opts.set_xla_cpu_enable_fast_math(true);
+  opts.set_xla_gpu_enable_fast_min_max(true);
 
-  flags->set_xla_force_host_platform_device_count(1);
+  opts.set_xla_force_host_platform_device_count(1);
+  return opts;
 }
 
+static DebugOptions* flag_values;
+static std::vector<tensorflow::Flag>* flag_objects;
+static std::once_flag flags_init;
+
 // Allocates flag_values and flag_objects; this function must not be called more
 // than once - its call done via call_once.
-void AllocateFlags() {
-  flag_values = new DebugOptions;
-
-  SetDebugOptionsDefaults(flag_values);
+static void AllocateFlags() {
+  flag_values = new DebugOptions(DefaultDebugOptionsIgnoringFlags());
 
   // Returns a lambda that calls "member_setter" on "flag_values" with the
   // argument passed in to the lambda.
@@ -128,24 +128,17 @@ void AllocateFlags() {
       tensorflow::Flag(
           "xla_hlo_graph_path", flag_values->mutable_xla_hlo_graph_path(),
           "With xla_generate_hlo_graph, dump the graphs into this path."),
-      tensorflow::Flag(
-          "xla_hlo_dump_as_graphdef",
-          bool_setter_for(&DebugOptions::set_xla_hlo_dump_as_graphdef),
-          flag_values->xla_hlo_dump_as_graphdef(),
-          "Dump HLO graphs as TensorFlow GraphDefs."),
+      tensorflow::Flag("xla_hlo_dump_as_html",
+                       bool_setter_for(&DebugOptions::set_xla_hlo_dump_as_html),
+                       flag_values->xla_hlo_dump_as_html(),
+                       "Dump HLO graphs as an HTML (DOT rendered into SVG "
+                       "inlined in HTML)."),
       tensorflow::Flag(
           "xla_hlo_graph_sharding_color",
           bool_setter_for(&DebugOptions::set_xla_hlo_graph_sharding_color),
           flag_values->xla_hlo_graph_sharding_color(),
           "Assign colors based on sharding assignments when generating the "
           "HLO graphs."),
-      tensorflow::Flag(
-          "xla_hlo_tfgraph_device_scopes",
-          bool_setter_for(&DebugOptions::set_xla_hlo_tfgraph_device_scopes),
-          flag_values->xla_hlo_tfgraph_device_scopes(),
-          "When generating TensorFlow HLO graphs, if the HLO instructions "
-          "are assigned to a specific device, prefix the name scope with "
-          "\"devX\" with X being the device ordinal."),
       tensorflow::Flag(
           "xla_log_hlo_text", flag_values->mutable_xla_log_hlo_text(),
           "HLO modules matching this regex will be dumped to LOG(INFO)."),
@@ -202,6 +195,16 @@ void AllocateFlags() {
           "Comma-separated list of hlo passes to be disabled. These names "
           "must exactly match the passes' names; no whitespace around "
           "commas."),
+      tensorflow::Flag(
+          "xla_disable_all_hlo_passes",
+          bool_setter_for(&DebugOptions::set_xla_disable_all_hlo_passes), false,
+          "Disables all HLO passes.  Notes that some passes are necessary for "
+          "correctness and the invariants that must be satisfied by 'fully "
+          "optimized' HLO are different for different devices and may change "
+          "over time.  The only 'guarantee', such as it is, is that if you "
+          "compile XLA and dump the optimized HLO for some graph, you should "
+          "be able to run it again on the same device with the same build of "
+          "XLA."),
       tensorflow::Flag(
           "xla_embed_ir_in_executable",
           bool_setter_for(&DebugOptions::set_xla_embed_ir_in_executable),
@@ -344,8 +347,6 @@ void AllocateFlags() {
   ParseFlagsFromEnvAndDieIfUnknown("XLA_FLAGS", *flag_objects);
 }
 
-}  // namespace
-
 void AppendDebugOptionsFlags(std::vector<tensorflow::Flag>* flag_list) {
   std::call_once(flags_init, &AllocateFlags);
   flag_list->insert(flag_list->end(), flag_objects->begin(),
diff --git a/tensorflow/compiler/xla/debug_options_flags.h b/tensorflow/compiler/xla/debug_options_flags.h
index 60e59abc2a2e0f1cce3de1afc928f9fe36f75b33..dbf86a40f052af09c61da0e1abb3116ef5214357 100644
--- a/tensorflow/compiler/xla/debug_options_flags.h
+++ b/tensorflow/compiler/xla/debug_options_flags.h
@@ -29,7 +29,10 @@ void AppendDebugOptionsFlags(std::vector<tensorflow::Flag>* flag_list);
 // Fetches a DebugOptions proto message from flags provided to the program.
 // Flags must be registered with the flags parser using AppendDebugOptionsFlags
 // first.
-xla::DebugOptions GetDebugOptionsFromFlags();
+DebugOptions GetDebugOptionsFromFlags();
+
+// Gets a DebugOptions proto that reflects the defaults as if no flags were set.
+DebugOptions DefaultDebugOptionsIgnoringFlags();
 
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/error_spec.h b/tensorflow/compiler/xla/error_spec.h
index a1463aa15941b9c265db94e2eb3cc176fab6695b..4359f3b7deb8e585494cb2a9c7115eac6a312c8e 100644
--- a/tensorflow/compiler/xla/error_spec.h
+++ b/tensorflow/compiler/xla/error_spec.h
@@ -30,6 +30,19 @@ struct ErrorSpec {
   // In effect, this allows the tested operation to produce incorrect results
   // for inputs outside its mathematical domain.
   bool relaxed_nans;
+
+  // If this is true, then we treat each +/-inf in the actual result as
+  // equivalent to our choice of either +/-inf or the min/max floating-point
+  // value.
+  //
+  // If the expected result is +/-inf, the actual result must still be +/-inf.
+  //
+  // In effect, this allows the tested operation to overflow, so long as it's
+  // overflowing on "large" values.
+  //
+  // (We could have a symmetric more_infs_ok flag if necessary; right now it
+  // appears not to be.)
+  bool fewer_infs_ok = false;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/executable_run_options.cc b/tensorflow/compiler/xla/executable_run_options.cc
index 0f9b591c70d4fd96147958d18bd5fb7dd78a7f3f..230f3b202a4b531c381665471c3856c3feba5a3a 100644
--- a/tensorflow/compiler/xla/executable_run_options.cc
+++ b/tensorflow/compiler/xla/executable_run_options.cc
@@ -77,7 +77,7 @@ ExecutionProfile* ExecutableRunOptions::execution_profile() const {
 }
 
 ExecutableRunOptions& ExecutableRunOptions::set_device_assignment(
-    DeviceAssignment* device_assignment) {
+    const DeviceAssignment* device_assignment) {
   device_assignment_ = device_assignment;
   return *this;
 }
diff --git a/tensorflow/compiler/xla/executable_run_options.h b/tensorflow/compiler/xla/executable_run_options.h
index ba3217f31b55bd1428f67da6154a46c8bc304053..1e744953bd3be58afba5b81c0e2a8ba26665f9c4 100644
--- a/tensorflow/compiler/xla/executable_run_options.h
+++ b/tensorflow/compiler/xla/executable_run_options.h
@@ -16,9 +16,6 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_EXECUTABLE_RUN_OPTIONS_H_
 #define TENSORFLOW_COMPILER_XLA_EXECUTABLE_RUN_OPTIONS_H_
 
-// Pulls in the ::stream_executor -> ::xla::se namespace alias.
-#include "tensorflow/compiler/xla/types.h"
-
 // These classes are forward declared so that ExecutableRunOptions can be linked
 // into an XLA-compiled binary without having to link all of the pointed-to
 // objects (e.g., for an ahead-of-time compiled CPU binary, the gpu tools don't
@@ -28,12 +25,6 @@ class Stream;
 class Platform;
 }  // namespace stream_executor
 
-namespace tensorflow {
-namespace thread {
-class ThreadPool;
-}  // namespace thread
-}  // namespace tensorflow
-
 namespace Eigen {
 struct ThreadPoolDevice;
 }  // namespace Eigen
@@ -83,7 +74,7 @@ class ExecutableRunOptions {
   ExecutableRunOptions& set_execution_profile(ExecutionProfile* profile);
 
   ExecutableRunOptions& set_device_assignment(
-      DeviceAssignment* device_assignment);
+      const DeviceAssignment* device_assignment);
   const DeviceAssignment* device_assignment() const;
 
   ExecutableRunOptions& set_rng_seed(int rng_seed);
@@ -92,7 +83,7 @@ class ExecutableRunOptions {
  private:
   DeviceMemoryAllocator* allocator_ = nullptr;
   int device_ordinal_ = -1;
-  DeviceAssignment* device_assignment_ = nullptr;
+  const DeviceAssignment* device_assignment_ = nullptr;
   stream_executor::Stream* stream_ = nullptr;
   const Eigen::ThreadPoolDevice* intra_op_thread_pool_ = nullptr;
   ExecutionProfile* execution_profile_ = nullptr;
diff --git a/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py b/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py
index 1fea816a803bfb75b9721393cef8c4dfc249268d..c34e84efc80ba970624d80802841d6ec534b6fd0 100644
--- a/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py
+++ b/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py
@@ -104,9 +104,9 @@ class Sharding(object):
       ValueError: The tensor to split was smaller in the split dimension than
         the number of devices to split over.
     """
-    tensor.shape.assert_is_fully_defined()
     shape = tensor.shape.as_list()
-    if shape[split_dimension] < num_devices:
+    if (shape[split_dimension] is not None and
+        shape[split_dimension] < num_devices):
       raise ValueError('Split dimension was smaller than the required number '
                        'of splits: shape=%r, dimension=%r, num_devices=%r' %
                        (shape, split_dimension, num_devices))
diff --git a/tensorflow/compiler/xla/g3doc/_book.yaml b/tensorflow/compiler/xla/g3doc/_book.yaml
index 267701e9c0e42a21d2cda6238520f6a9692e7e76..d756cd74c98b98a6fda099690d966562bd694e2c 100644
--- a/tensorflow/compiler/xla/g3doc/_book.yaml
+++ b/tensorflow/compiler/xla/g3doc/_book.yaml
@@ -25,6 +25,8 @@ upper_tabs:
         path: /xla/operation_semantics
       - title: Shapes and layout
         path: /xla/shapes
+      - title: Tiled layout
+        path: /xla/tiled_layout
       - title: Using AOT compilation
         path: /xla/tfcompile
       - heading: Tutorials
diff --git a/tensorflow/compiler/xla/g3doc/broadcasting.md b/tensorflow/compiler/xla/g3doc/broadcasting.md
index 2870869a2cef13a9105b9dc9fa4d657834288f86..5c0525c1e9adf9f37d945170d05e7c18fa3d8852 100644
--- a/tensorflow/compiler/xla/g3doc/broadcasting.md
+++ b/tensorflow/compiler/xla/g3doc/broadcasting.md
@@ -168,7 +168,7 @@ consult the
 
 Broadcasting of a lower-rank array to a higher-rank array **and** broadcasting
 using degenerate dimensions can both be performed in the same binary operation.
-For example, a vector of size 4 and an matrix of size 1x2 can be added together
+For example, a vector of size 4 and a matrix of size 1x2 can be added together
 using broadcast dimensions value of (0):
 
     |1 2 3 4| + [5 6]    // [5 6] is a 1x2 matrix, not a vector.
@@ -176,7 +176,7 @@ using broadcast dimensions value of (0):
 First the vector is broadcast up to rank 2 (matrix) using the broadcast
 dimensions. The single value (0) in the broadcast dimensions indicates that
 dimension zero of the vector matches to dimension zero of the matrix. This
-produces an matrix of size 4xM where the value M is chosen to match the
+produces a matrix of size 4xM where the value M is chosen to match the
 corresponding dimension size in the 1x2 array. Therefore, a 4x2 matrix is
 produced:
 
diff --git a/tensorflow/compiler/xla/g3doc/operation_semantics.md b/tensorflow/compiler/xla/g3doc/operation_semantics.md
index d888b1f23f36f33ef94ef0e22374e0c796e47a89..db90d184b5218614ac49363ebf2a7e25fffe44de 100644
--- a/tensorflow/compiler/xla/g3doc/operation_semantics.md
+++ b/tensorflow/compiler/xla/g3doc/operation_semantics.md
@@ -38,25 +38,25 @@ Alltoall is a collective operation that sends data from all cores to all cores.
 It has two phases:
 
 1.  the scatter phase. On each core, the operand is split into `split_count`
-    number of blocks along the `split_dimensions`, and the blocks are scattered
-    to all cores, e.g., the ith block is send to the ith core.
+number of blocks along the `split_dimensions`, and the blocks are scattered
+to all cores, e.g., the ith block is send to the ith core.
 2.  the gather phase. Each core concatenates the received blocks along the
-    `concat_dimension`.
+`concat_dimension`.
 
 The participating cores can be configured by:
 
 -   `replica_groups`: each ReplicaGroup contains a list of replica id. If empty,
-    all replicas belong to one group in the order of 0 - (n-1). Alltoall will be
-    applied within subgroups in the specified order. For example, replica
-    groups = {{1,2,3},{4,5,0}} means, an Alltoall will be applied within replica
-    1, 2, 3, and in the gather phase, the received blocks will be concatenated
-    in the order of 1, 2, 3; another Alltoall will be applied within replica 4,
-    5, 0, and the concatenation order is 4, 5, 0.
+all replicas belong to one group in the order of 0 - (n-1). Alltoall will be
+applied within subgroups in the specified order. For example, replica
+groups = {{1,2,3},{4,5,0}} means, an Alltoall will be applied within replica
+1, 2, 3, and in the gather phase, the received blocks will be concatenated
+in the order of 1, 2, 3; another Alltoall will be applied within replica 4,
+5, 0, and the concatenation order is 4, 5, 0.
 
 Prerequisites:
 
 -   The dimension size of the operand on the split_dimension is divisible by
-    split_count.
+split_count.
 -   The operand's shape is not tuple.
 
 <b> `AllToAll(operand, split_dimension, concat_dimension, split_count,
@@ -93,7 +93,7 @@ AllToAll(x, /*split_dimension=*/1, /*concat_dimension=*/0, /*split_count=*/4);
 ```
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="./images/ops_alltoall.png">
+<img style="width:100%" src="./images/ops_alltoall.png">
 </div>
 
 In this example, there are 4 cores participating the Alltoall. On each core, the
@@ -387,34 +387,34 @@ For example, let v be an array of 24 elements:
 
 ```
 let v = f32[4x2x3] {{{10, 11, 12},  {15, 16, 17}},
-                    {{20, 21, 22},  {25, 26, 27}},
-                    {{30, 31, 32},  {35, 36, 37}},
-                    {{40, 41, 42},  {45, 46, 47}}};
+{{20, 21, 22},  {25, 26, 27}},
+{{30, 31, 32},  {35, 36, 37}},
+{{40, 41, 42},  {45, 46, 47}}};
 
 // Collapse to a single dimension, leaving one dimension.
 let v012 = Collapse(v, {0,1,2});
 then v012 == f32[24] {10, 11, 12, 15, 16, 17,
-                      20, 21, 22, 25, 26, 27,
-                      30, 31, 32, 35, 36, 37,
-                      40, 41, 42, 45, 46, 47};
+20, 21, 22, 25, 26, 27,
+30, 31, 32, 35, 36, 37,
+40, 41, 42, 45, 46, 47};
 
 // Collapse the two lower dimensions, leaving two dimensions.
 let v01 = Collapse(v, {0,1});
 then v01 == f32[4x6] {{10, 11, 12, 15, 16, 17},
-                      {20, 21, 22, 25, 26, 27},
-                      {30, 31, 32, 35, 36, 37},
-                      {40, 41, 42, 45, 46, 47}};
+{20, 21, 22, 25, 26, 27},
+{30, 31, 32, 35, 36, 37},
+{40, 41, 42, 45, 46, 47}};
 
 // Collapse the two higher dimensions, leaving two dimensions.
 let v12 = Collapse(v, {1,2});
 then v12 == f32[8x3] {{10, 11, 12},
-                      {15, 16, 17},
-                      {20, 21, 22},
-                      {25, 26, 27},
-                      {30, 31, 32},
-                      {35, 36, 37},
-                      {40, 41, 42},
-                      {45, 46, 47}};
+{15, 16, 17},
+{20, 21, 22},
+{25, 26, 27},
+{30, 31, 32},
+{35, 36, 37},
+{40, 41, 42},
+{45, 46, 47}};
 
 ```
 
@@ -441,9 +441,9 @@ replicas.
 Note that there are the following restrictions on the `source_target_pair`:
 
 -   Any two pairs should not have the same target replica id, and they should
-    not have the same source replica id.
+not have the same source replica id.
 -   If a replica id is not a target in any pair, then the output on that replica
-    is a tensor consists of 0(s) with the same shape as the input.
+is a tensor consists of 0(s) with the same shape as the input.
 
 ## Concatenate
 
@@ -480,25 +480,25 @@ Concat({{2, 3}, {4, 5}, {6, 7}}, 0)
 
 ```
 let a = {
-  {1, 2},
-  {3, 4},
-  {5, 6},
+{1, 2},
+{3, 4},
+{5, 6},
 };
 let b = {
-  {7, 8},
+{7, 8},
 };
 Concat({a, b}, 0)
 >>> {
-  {1, 2},
-  {3, 4},
-  {5, 6},
-  {7, 8},
+{1, 2},
+{3, 4},
+{5, 6},
+{7, 8},
 }
 ```
 
 Diagram:
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="./images/ops_concatenate.png">
+<img style="width:100%" src="./images/ops_concatenate.png">
 </div>
 
 ## Conditional
@@ -548,17 +548,23 @@ Computes a convolution of the kind used in neural networks. Here, a convolution
 can be thought of as a n-dimensional window moving across a n-dimensional base
 area and a computation is performed for each possible position of the window.
 
-| Arguments             | Type                 | Semantics                     |
-| --------------------- | -------------------- | ----------------------------- |
-| `lhs`                 | `XlaOp`              | rank n+2 array of inputs      |
-| `rhs`                 | `XlaOp`              | rank n+2 array of kernel      |
-:                       :                      : weights                       :
-| `window_strides`      | `ArraySlice<int64>`  | n-d array of kernel strides   |
-| `padding`             | `ArraySlice<         | n-d array of (low, high)      |
-:                       : pair<int64, int64>>` : padding                       :
-| `lhs_dilation`        | `ArraySlice<int64>`  | n-d lhs dilation factor array |
-| `rhs_dilation`        | `ArraySlice<int64>`  | n-d rhs dilation factor array |
-| `feature_group_count` | int64                | the number of feature groups  |
+| Arguments             | Type                     | Semantics                |
+| --------------------- | ------------------------ | ------------------------ |
+| `lhs`                 | `XlaOp`                  | rank n+2 array of inputs |
+| `rhs`                 | `XlaOp`                  | rank n+2 array of kernel |
+:                       :                          : weights                  :
+| `window_strides`      | `ArraySlice<int64>`      | n-d array of kernel      |
+:                       :                          : strides                  :
+| `padding`             | `ArraySlice< pair<int64, | n-d array of (low, high) |
+:                       : int64>>`                 : padding                  :
+| `lhs_dilation`        | `ArraySlice<int64>`      | n-d lhs dilation factor  |
+:                       :                          : array                    :
+| `rhs_dilation`        | `ArraySlice<int64>`      | n-d rhs dilation factor  |
+:                       :                          : array                    :
+| `feature_group_count` | int64                    | the number of feature    |
+:                       :                          : groups                   :
+| `batch_group_count`   | int64                    | the number of batch      |
+:                       :                          : groups                   :
 
 Let n be the number of spatial dimensions. The `lhs` argument is a rank n+2
 array describing the base area. This is called the input, even though of course
@@ -566,20 +572,20 @@ the rhs is also an input. In a neural network, these are the input activations.
 The n+2 dimensions are, in this order:
 
 *   `batch`: Each coordinate in this dimension represents an independent input
-    for which convolution is carried out.
+for which convolution is carried out.
 *   `z/depth/features`: Each (y,x) position in the base area has a vector
-    associated to it, which goes into this dimension.
+associated to it, which goes into this dimension.
 *   `spatial_dims`: Describes the `n` spatial dimensions that define the base
-    area that the window moves across.
+area that the window moves across.
 
 The `rhs` argument is a rank n+2 array describing the convolutional
 filter/kernel/window. The dimensions are, in this order:
 
 *   `output-z`: The `z` dimension of the output.
 *   `input-z`: The size of this dimension times `feature_group_count` should
-    equal the size of the `z` dimension in lhs.
+equal the size of the `z` dimension in lhs.
 *   `spatial_dims`: Describes the `n` spatial dimensions that define the n-d
-    window that moves across the base area.
+window that moves across the base area.
 
 The `window_strides` argument specifies the stride of the convolutional window
 in the spatial dimensions. For example, if the stride in the first spatial
@@ -628,9 +634,22 @@ input feature dimension, and the filter would be reshaped from
 `[filter_height, filter_width, 1, in_channels * channel_multiplier]`. For more
 details, see `tf.nn.depthwise_conv2d`.
 
+The `batch_group_count` (default value 1) argument can be used for depthwise
+filters during backpropagation. `batch_group_count` needs to be a divisor of the
+size of the `lhs` (input) batch dimension. If `batch_group_count` is greater
+than 1, it means that the output batch dimension should be of size
+`batch_group_size` where `batch_group_size = input batch / batch_group_count`.
+For convolutions with `batch_group_count` greater than 1, the input batch size
+must evenly divide into batch_group_size and output feature size, which implies
+that the output feature size must be equal to batch_group_count. Conceptually,
+this can be achieved by performing the usual convolution, and then scraping
+`batch_group_size` number of elements on the diagonal of the matrix formed by
+output batch and output feature.
+
 The output shape has these dimensions, in this order:
 
-*   `batch`: Same size as `batch` on the input (`lhs`).
+*   `batch`: The size of this dimension times `batch_group_count` should equal
+    the size of the `batch` dimension in lhs.
 *   `z`: Same size as `output-z` on the kernel (`rhs`).
 *   `spatial_dims`: One value for each valid placement of the convolutional
     window.
@@ -658,15 +677,15 @@ Here is pseudo-code for a 2d convolution with padding and striding:
 
 ```
 for (b, oz, oy, ox) {  // output coordinates
-  value = 0;
-  for (iz, ky, kx) {  // kernel coordinates and input z
-    iy = oy*stride_y + ky - pad_low_y;
-    ix = ox*stride_x + kx - pad_low_x;
-    if ((iy, ix) inside the base area considered without padding) {
-      value += input(b, iz, iy, ix) * kernel(oz, iz, ky, kx);
-    }
-  }
-  output(b, oz, oy, ox) = value;
+value = 0;
+for (iz, ky, kx) {  // kernel coordinates and input z
+iy = oy*stride_y + ky - pad_low_y;
+ix = ox*stride_x + kx - pad_low_x;
+if ((iy, ix) inside the base area considered without padding) {
+value += input(b, iz, iy, ix) * kernel(oz, iz, ky, kx);
+}
+}
+output(b, oz, oy, ox) = value;
 }
 ```
 
@@ -777,19 +796,19 @@ Here is an example of an implementation of `myfunc`:
 
 ```
 extern "C" void myfunc(void* out, void** in) {
-  float (&x)[2] = *static_cast<float(*)[2]>(in[0]);
-  float (&y)[2][3] = *static_cast<float(*)[2][3]>(in[1]);
-  EXPECT_EQ(1, x[0]);
-  EXPECT_EQ(2, x[1]);
-  EXPECT_EQ(10, y[0][0]);
-  EXPECT_EQ(20, y[0][1]);
-  EXPECT_EQ(30, y[0][2]);
-  EXPECT_EQ(40, y[1][0]);
-  EXPECT_EQ(50, y[1][1]);
-  EXPECT_EQ(60, y[1][2]);
-  float (&z)[3][3] = *static_cast<float(*)[3][3]>(out);
-  z[0][0] = x[1] + y[1][0];
-  // ...
+float (&x)[2] = *static_cast<float(*)[2]>(in[0]);
+float (&y)[2][3] = *static_cast<float(*)[2][3]>(in[1]);
+EXPECT_EQ(1, x[0]);
+EXPECT_EQ(2, x[1]);
+EXPECT_EQ(10, y[0][0]);
+EXPECT_EQ(20, y[0][1]);
+EXPECT_EQ(30, y[0][2]);
+EXPECT_EQ(40, y[1][0]);
+EXPECT_EQ(50, y[1][1]);
+EXPECT_EQ(60, y[1][2]);
+float (&z)[3][3] = *static_cast<float(*)[3][3]>(out);
+z[0][0] = x[1] + y[1][0];
+// ...
 }
 ```
 
@@ -856,44 +875,40 @@ DotGeneral performs the sum of products over contracting dimensions specified
 in 'dimension_numbers'.
 
 Associated contracting dimension numbers from the 'lhs' and 'rhs' do not need
-to be the same, but must be listed in the same order in both
-'lhs/rhs_contracting_dimensions' arrays and have the same dimension sizes.
-There must be exactly one contracting dimension on both 'lhs' and 'rhs'.
+to be the same and but must have the same dimension sizes.
 
 Example with contracting dimension numbers:
 
 ```
 lhs = { {1.0, 2.0, 3.0},
-        {4.0, 5.0, 6.0} }
+{4.0, 5.0, 6.0} }
 
 rhs = { {1.0, 1.0, 1.0},
-        {2.0, 2.0, 2.0} }
+{2.0, 2.0, 2.0} }
 
 DotDimensionNumbers dnums;
 dnums.add_lhs_contracting_dimensions(1);
 dnums.add_rhs_contracting_dimensions(1);
 
 DotGeneral(lhs, rhs, dnums) -> { {6.0, 12.0},
-                                 {15.0, 30.0} }
+{15.0, 30.0} }
 ```
 
-Associated batch dimension numbers from the 'lhs' and 'rhs' must have the same
-dimension number, must be listed in the same order in both arrays, must
-have the same dimension sizes, and must be ordered before contracting and
-non-contracting/non-batch dimension numbers.
+Associated batch dimension numbers from the 'lhs' and 'rhs' must
+have the same dimension sizes.
 
 Example with batch dimension numbers (batch size 2, 2x2 matrices):
 
 ```
 lhs = { { {1.0, 2.0},
-          {3.0, 4.0} },
-        { {5.0, 6.0},
-          {7.0, 8.0} } }
+{3.0, 4.0} },
+{ {5.0, 6.0},
+{7.0, 8.0} } }
 
 rhs = { { {1.0, 0.0},
-          {0.0, 1.0} },
-        { {1.0, 0.0},
-          {0.0, 1.0} } }
+{0.0, 1.0} },
+{ {1.0, 0.0},
+{0.0, 1.0} } }
 
 DotDimensionNumbers dnums;
 dnums.add_lhs_contracting_dimensions(2);
@@ -902,9 +917,9 @@ dnums.add_lhs_batch_dimensions(0);
 dnums.add_rhs_batch_dimensions(0);
 
 DotGeneral(lhs, rhs, dnums) -> { { {1.0, 2.0},
-                                   {3.0, 4.0} },
-                                 { {5.0, 6.0},
-                                   {7.0, 8.0} } }
+{3.0, 4.0} },
+{ {5.0, 6.0},
+{7.0, 8.0} } }
 ```
 
 | Input                               | Output            | Semantics        |
@@ -929,21 +944,21 @@ dimension: [start, start + size). The shape of `start_indices` must be rank ==
 
 <b> `DynamicSlice(operand, start_indices, size_indices)` </b>
 
-| Arguments       | Type                | Semantics                           |
-| --------------- | ------------------- | ----------------------------------- |
-| `operand`       | `XlaOp`             | N dimensional array of type T       |
-| `start_indices` | `XlaOp`             | Rank 1 array of N integers          |
-:                 :                     : containing the starting indices of  :
-:                 :                     : the slice for each dimension. Value :
-:                 :                     : must be greater than or equal to    :
-:                 :                     : zero.                               :
-| `size_indices`  | `ArraySlice<int64>` | List of N integers containing the   |
-:                 :                     : slice size for each dimension. Each :
-:                 :                     : value must be strictly greater than :
-:                 :                     : zero, and start + size must be less :
-:                 :                     : than or equal to the size of the    :
-:                 :                     : dimension to avoid wrapping modulo  :
-:                 :                     : dimension size.                     :
+| Arguments       | Type                  | Semantics                          |
+| --------------- | --------------------- | ---------------------------------- |
+| `operand`       | `XlaOp`               | N dimensional array of type T      |
+| `start_indices` | sequence of N `XlaOp` | List of N scalar integers          |
+:                 :                       : containing the starting indices of :
+:                 :                       : the slice for each dimension.      :
+:                 :                       : Value must be greater than or      :
+:                 :                       : equal to zero.                     :
+| `size_indices`  | `ArraySlice<int64>`   | List of N integers containing the  |
+:                 :                       : slice size for each dimension.     :
+:                 :                       : Each value must be strictly        :
+:                 :                       : greater than zero, and start +     :
+:                 :                       : size must be less than or equal to :
+:                 :                       : the size of the dimension to avoid :
+:                 :                       : wrapping modulo dimension size.    :
 
 The effective slice indices are computed by applying the following
 transformation for each index `i` in `[1, N)` before performing the slice:
@@ -963,22 +978,22 @@ let a = {0.0, 1.0, 2.0, 3.0, 4.0}
 let s = {2}
 
 DynamicSlice(a, s, {2}) produces:
-  {2.0, 3.0}
+{2.0, 3.0}
 ```
 
 2-dimensional example:
 
 ```
 let b =
- { {0.0,  1.0,  2.0},
-   {3.0,  4.0,  5.0},
-   {6.0,  7.0,  8.0},
-   {9.0, 10.0, 11.0} }
+{ {0.0,  1.0,  2.0},
+{3.0,  4.0,  5.0},
+{6.0,  7.0,  8.0},
+{9.0, 10.0, 11.0} }
 let s = {2, 1}
 
 DynamicSlice(b, s, {2, 2}) produces:
-  { { 7.0,  8.0},
-    {10.0, 11.0} }
+{ { 7.0,  8.0},
+{10.0, 11.0} }
 ```
 ## DynamicUpdateSlice
 
@@ -994,19 +1009,22 @@ the rank of `operand`.
 
 <b> `DynamicUpdateSlice(operand, update, start_indices)` </b>
 
-| Arguments       | Type    | Semantics                                        |
-| --------------- | ------- | ------------------------------------------------ |
-| `operand`       | `XlaOp` | N dimensional array of type T                    |
-| `update`        | `XlaOp` | N dimensional array of type T containing the     |
-:                 :         : slice update. Each dimension of update shape     :
-:                 :         : must be strictly greater than zero, and start +  :
-:                 :         : update must be less than or equal to the operand :
-:                 :         : size for each dimension to avoid generating      :
-:                 :         : out-of-bounds update indices.                    :
-| `start_indices` | `XlaOp` | Rank 1 array of N integers containing the        |
-:                 :         : starting indices of the slice for each           :
-:                 :         : dimension. Value must be greater than or equal   :
-:                 :         : to zero.                                         :
+| Arguments       | Type                  | Semantics                          |
+| --------------- | --------------------- | ---------------------------------- |
+| `operand`       | `XlaOp`               | N dimensional array of type T      |
+| `update`        | `XlaOp`               | N dimensional array of type T      |
+:                 :                       : containing the slice update. Each  :
+:                 :                       : dimension of update shape must be  :
+:                 :                       : strictly greater than zero, and    :
+:                 :                       : start + update must be less than   :
+:                 :                       : or equal to the operand size for   :
+:                 :                       : each dimension to avoid generating :
+:                 :                       : out-of-bounds update indices.      :
+| `start_indices` | sequence of N `XlaOp` | List of N scalar integers          |
+:                 :                       : containing the starting indices of :
+:                 :                       : the slice for each dimension.      :
+:                 :                       : Value must be greater than or      :
+:                 :                       : equal to zero.                     :
 
 The effective slice indices are computed by applying the following
 transformation for each index `i` in `[1, N)` before performing the slice:
@@ -1027,29 +1045,29 @@ let u = {5.0, 6.0}
 let s = {2}
 
 DynamicUpdateSlice(a, u, s) produces:
-  {0.0, 1.0, 5.0, 6.0, 4.0}
+{0.0, 1.0, 5.0, 6.0, 4.0}
 ```
 
 2-dimensional example:
 
 ```
 let b =
- { {0.0,  1.0,  2.0},
-   {3.0,  4.0,  5.0},
-   {6.0,  7.0,  8.0},
-   {9.0, 10.0, 11.0} }
+{ {0.0,  1.0,  2.0},
+{3.0,  4.0,  5.0},
+{6.0,  7.0,  8.0},
+{9.0, 10.0, 11.0} }
 let u =
- { {12.0,  13.0},
-   {14.0,  15.0},
-   {16.0,  17.0} }
+{ {12.0,  13.0},
+{14.0,  15.0},
+{16.0,  17.0} }
 
 let s = {1, 1}
 
 DynamicUpdateSlice(b, u, s) produces:
- { {0.0,  1.0,  2.0},
-   {3.0, 12.0, 13.0},
-   {6.0, 14.0, 15.0},
-   {9.0, 16.0, 17.0} }
+{ {0.0,  1.0,  2.0},
+{3.0, 12.0, 13.0},
+{6.0, 14.0, 15.0},
+{9.0, 16.0, 17.0} }
 ```
 
 ## Element-wise binary arithmetic operations
@@ -1080,7 +1098,7 @@ When `Op` is `Rem`, the sign of the result is taken from the dividend, and the
 absolute value of the result is always less than the divisor's absolute value.
 
 Integer division overflow (signed/unsigned division/remainder by zero or signed
-divison/remainder of `INT_SMIN` with `-1`) produces an implementation defined
+division/remainder of `INT_SMIN` with `-1`) produces an implementation defined
 value.
 
 An alternative variant with different-rank broadcasting support exists for these
@@ -1168,7 +1186,7 @@ if and only if the corresponding input element is finite.
 
 <b>`Sign(operand)`</b> Element-wise sign operation `x -> sgn(x)` where
 
-$$\text{sgn}(x) = \begin{cases} -1 & x < 0\\ 0 & x = 0\\ 1 & x > 0 \end{cases}$$
+$$\text{sgn}(x) = \begin{cases} -1 & x < 0\\ -0 & x = -0\\ NaN & x = NaN\\ +0 & x = +0\\ 1 & x > 0 \end{cases}$$
 
 using the comparison operator of the element type of `operand`.
 
@@ -1235,42 +1253,42 @@ shape of `start_indices` to be `[6,7,1]`).
 
 The bounds for the output array along dimension `i` is computed as follows:
 
-  1. If `i` is present in `batch_dims` (i.e. is equal to `batch_dims[k]` for
-     some `k`) then we pick the corresponding dimension bounds out of
-     `start_indices.shape`, skipping `index_vector_dim` (i.e. pick
-     `start_indices.shape.dims`[`k`] if `k` < `index_vector_dim` and
-     `start_indices.shape.dims`[`k`+`1`] otherwise).
+1. If `i` is present in `batch_dims` (i.e. is equal to `batch_dims[k]` for
+some `k`) then we pick the corresponding dimension bounds out of
+`start_indices.shape`, skipping `index_vector_dim` (i.e. pick
+`start_indices.shape.dims`[`k`] if `k` < `index_vector_dim` and
+`start_indices.shape.dims`[`k`+`1`] otherwise).
 
-  2. If `i` is present in `offset_dims` (i.e. equal to `offset_dims`[`k`] for
-     some `k`) then we pick the corresponding bound out of `slice_sizes` after
-     accounting for `collapsed_slice_dims` (i.e. we pick
-     `adjusted_slice_sizes`[`k`] where `adjusted_slice_sizes` is `slice_sizes`
-     with the bounds at indices `collapsed_slice_dims` removed).
+2. If `i` is present in `offset_dims` (i.e. equal to `offset_dims`[`k`] for
+some `k`) then we pick the corresponding bound out of `slice_sizes` after
+accounting for `collapsed_slice_dims` (i.e. we pick
+`adjusted_slice_sizes`[`k`] where `adjusted_slice_sizes` is `slice_sizes`
+with the bounds at indices `collapsed_slice_dims` removed).
 
 Formally, the operand index `In` corresponding to an output index `Out` is
 computed as follows:
 
-  1. Let `G` = { `Out`[`k`] for `k` in `batch_dims` }.  Use `G` to slice out
-     vector `S` such that `S`[`i`] = `start_indices`[Combine(`G`, `i`)] where
-     Combine(A, b) inserts b at position `index_vector_dim` into A.  Note that
-     this is well defined even if `G` is empty -- if `G` is empty then `S` =
-     `start_indices`.
-
-  2. Create a starting index, `S`<sub>`in`</sub>, into `operand` using `S` by
-     scattering `S` using `start_index_map`.  More precisely:
-       1. `S`<sub>`in`</sub>[`start_index_map`[`k`]] = `S`[`k`] if `k` <
-          `start_index_map.size`.
-       2. `S`<sub>`in`</sub>[`_`] = `0` otherwise.
-
-  3. Create an index `O`<sub>`in`</sub> into `operand` by scattering the indices
-     at the offset dimensions in `Out` according to the `collapsed_slice_dims`
-     set.  More precisely:
-       1. `O`<sub>`in`</sub>[`expand_offset_dims`(`k`)] =
-          `Out`[`offset_dims`[`k`]] if `k` < `offset_dims.size`
-          (`expand_offset_dims` is defined below).
-       2. `O`<sub>`in`</sub>[`_`] = `0` otherwise.
-  4. `In` is `O`<sub>`in`</sub> + `S`<sub>`in`</sub> where + is element-wise
-     addition.
+1. Let `G` = { `Out`[`k`] for `k` in `batch_dims` }.  Use `G` to slice out
+vector `S` such that `S`[`i`] = `start_indices`[Combine(`G`, `i`)] where
+Combine(A, b) inserts b at position `index_vector_dim` into A.  Note that
+this is well defined even if `G` is empty -- if `G` is empty then `S` =
+`start_indices`.
+
+2. Create a starting index, `S`<sub>`in`</sub>, into `operand` using `S` by
+scattering `S` using `start_index_map`.  More precisely:
+1. `S`<sub>`in`</sub>[`start_index_map`[`k`]] = `S`[`k`] if `k` <
+`start_index_map.size`.
+2. `S`<sub>`in`</sub>[`_`] = `0` otherwise.
+
+3. Create an index `O`<sub>`in`</sub> into `operand` by scattering the indices
+at the offset dimensions in `Out` according to the `collapsed_slice_dims`
+set.  More precisely:
+1. `O`<sub>`in`</sub>[`expand_offset_dims`(`k`)] =
+`Out`[`offset_dims`[`k`]] if `k` < `offset_dims.size`
+(`expand_offset_dims` is defined below).
+2. `O`<sub>`in`</sub>[`_`] = `0` otherwise.
+4. `In` is `O`<sub>`in`</sub> + `S`<sub>`in`</sub> where + is element-wise
+addition.
 
 `expand_offset_dims` is the monotonic function with domain [`0`, `offset.size`)
 and range [`0`, `operand.rank`) \ `collapsed_slice_dims`.  So if, e.g.,
@@ -1282,21 +1300,21 @@ and range [`0`, `operand.rank`) \ `collapsed_slice_dims`.  So if, e.g.,
 Informally, every index `Out` in the output array corresponds to an element `E`
 in the operand array, computed as follows:
 
-  - We use the batch dimensions in `Out` to look up a starting index from
-    `start_indices`.
+- We use the batch dimensions in `Out` to look up a starting index from
+`start_indices`.
 
-  - We use `start_index_map` to map the starting index (which may have size less
-    than operand.rank) to a "full" starting index into operand.
+- We use `start_index_map` to map the starting index (which may have size less
+than operand.rank) to a "full" starting index into operand.
 
-  - We dynamic-slice out a slice with size `slice_sizes` using the full starting
-    index.
+- We dynamic-slice out a slice with size `slice_sizes` using the full starting
+index.
 
-  - We reshape the slice by collapsing the `collapsed_slice_dims` dimensions.
-    Since all collapsed slice dimensions have to have bound 1 this reshape is
-    always legal.
+- We reshape the slice by collapsing the `collapsed_slice_dims` dimensions.
+Since all collapsed slice dimensions have to have bound 1 this reshape is
+always legal.
 
-  - We use the offset dimensions in `Out` to index into this slice to get the
-    input element, `E`, corresponding to output index `Out`.
+- We use the offset dimensions in `Out` to index into this slice to get the
+input element, `E`, corresponding to output index `Out`.
 
 `index_vector_dim` is set to `start_indices.rank` - `1` in all of the
 examples that follow.  More interesting values for `index_vector_dim` does not
@@ -1315,7 +1333,7 @@ the output shape, and maps it to an element in the input array in the following
 way:
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="./images/ops_xla_gather_0.svg">
+<img style="width:100%" src="./images/ops_xla_gather_0.svg">
 </div>
 
 We first select an (`X`,`Y`) vector from the gather indices array using `G`.
@@ -1334,7 +1352,7 @@ version of the example above using a "gather indices" array of shape `[4,5,2]`
 would translate indices like this:
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="./images/ops_xla_gather_1.svg">
+<img style="width:100%" src="./images/ops_xla_gather_1.svg">
 </div>
 
 Again, this acts as a batch dynamic slice `G`<sub>`0`</sub> and
@@ -1343,27 +1361,27 @@ Again, this acts as a batch dynamic slice `G`<sub>`0`</sub> and
 The gather operation in XLA generalizes the informal semantics outlined above in
 the following ways:
 
- 1. We can configure which dimensions in the output shape are the offset
-    dimensions (dimensions containing `O`<sub>`0`</sub>, `O`<sub>`1`</sub> in
-    the last example).  The output batch dimensions (dimensions containing
-    `G`<sub>`0`</sub>, `G`<sub>`1`</sub> in the last example) are defined to be
-    the output dimensions that are not offset dimensions.
+1. We can configure which dimensions in the output shape are the offset
+dimensions (dimensions containing `O`<sub>`0`</sub>, `O`<sub>`1`</sub> in
+the last example).  The output batch dimensions (dimensions containing
+`G`<sub>`0`</sub>, `G`<sub>`1`</sub> in the last example) are defined to be
+the output dimensions that are not offset dimensions.
 
- 2. The number of output offset dimensions explicitly present in the output
-    shape may be smaller than the input rank.  These "missing" dimensions, which
-    are listed explicitly as `collapsed_slice_dims`, must have a slice size of
-    `1`.  Since they have a slice size of `1` the only valid index for them is
-    `0` and eliding them does not introduce ambiguity.
+2. The number of output offset dimensions explicitly present in the output
+shape may be smaller than the input rank.  These "missing" dimensions, which
+are listed explicitly as `collapsed_slice_dims`, must have a slice size of
+`1`.  Since they have a slice size of `1` the only valid index for them is
+`0` and eliding them does not introduce ambiguity.
 
- 3. The slice extracted from the "Gather Indices" array ((`X`, `Y`) in the last
-    example) may have fewer elements than the input array rank, and an explicit
-    mapping dictates how the index should be expanded to have the same rank as
-    the input.
+3. The slice extracted from the "Gather Indices" array ((`X`, `Y`) in the last
+example) may have fewer elements than the input array rank, and an explicit
+mapping dictates how the index should be expanded to have the same rank as
+the input.
 
 As a final example, we use (2) and (3) to implement `tf.gather_nd`:
 
 <div style="width:95%; margin:auto; margin-bottom:10px; margin-top:20px;">
-  <img style="width:100%" src="./images/ops_xla_gather_2.svg">
+<img style="width:100%" src="./images/ops_xla_gather_2.svg">
 </div>
 
 `G`<sub>`0`</sub> and `G`<sub>`1`</sub> are used to slice out a starting index
@@ -1442,11 +1460,11 @@ dependency between the while loops.
 
 ```
 result1 = while (condition, init = init_value) {
-  Infeed(shape)
+Infeed(shape)
 }
 
 result2 = while (condition, init = result1) {
-  Infeed(shape)
+Infeed(shape)
 }
 ```
 
@@ -1464,7 +1482,9 @@ Infeed of the device.
 
 Builds a constant literal on device rather than a potentially large host
 transfer. Creates a rank 1 array of values starting at zero and incrementing by
-one.
+one. For floating-point types, the produced array is equivalent to
+`ConvertElementType(Iota(...))` where the `Iota` is of integral type and the
+conversion is to the floating-point type.
 
 Arguments        | Type            | Semantics
 ---------------- | --------------- | ------------------------------------
@@ -1853,6 +1873,20 @@ non-deterministic. Therefore, the reduction function should not be overly
 sensitive to reassociation. See the discussion about associativity in the
 context of [`Reduce`](#reduce) for more details.
 
+## ReplicaId
+
+See also
+[`XlaBuilder::ReplicaId`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+Returns the unique ID (U32 scalar) of the replica.
+
+<b> `ReplicaId()` </b>
+
+The unique ID of each replica is an unsigned integer in the interval `[0, N)`,
+where `N` is the number of replicas. Since all the replicas are running the same
+program, a `ReplicaId()` call in the program will return a different value on
+each replica.
+
 ## Reshape
 
 See also
diff --git a/tensorflow/compiler/xla/g3doc/layout_with_tiling.md b/tensorflow/compiler/xla/g3doc/tiled_layout.md
similarity index 96%
rename from tensorflow/compiler/xla/g3doc/layout_with_tiling.md
rename to tensorflow/compiler/xla/g3doc/tiled_layout.md
index 5e990851af7495ebd4417e44f1d955fcc14dadf1..21e88ceab6208cdf940826d769fd93713044d5a0 100644
--- a/tensorflow/compiler/xla/g3doc/layout_with_tiling.md
+++ b/tensorflow/compiler/xla/g3doc/tiled_layout.md
@@ -1,9 +1,7 @@
 # Tiled layout
 
-*Note: This doc describes how tiled layout is intended to work. Tiling is being
-implemented, but this is an early effort and it is currently not even guaranteed
-to get an Unimplemented error if one tries to use tiling - it may be just
-silently ignored.*
+Caution: Tiled layout is *pre-release* and this describes how it's intended to
+work. Errors may be silently ignored.
 
 <center> ![](images/xla_array_layout_figure1.png)
 
diff --git a/tensorflow/compiler/xla/index_util.cc b/tensorflow/compiler/xla/index_util.cc
index 2a0241af3ef359c4d1c6c1ab9319b5b293110f7a..eebd8245abe759b71b3fe732943761325ea04b81 100644
--- a/tensorflow/compiler/xla/index_util.cc
+++ b/tensorflow/compiler/xla/index_util.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
@@ -141,7 +140,7 @@ namespace xla {
 
 /* static */ bool IndexUtil::IndexInBounds(const Shape& shape,
                                            absl::Span<const int64> index) {
-  int64 rank = ShapeUtil::Rank(shape);
+  int64 rank = shape.rank();
   if (rank != index.size()) {
     return false;
   }
diff --git a/tensorflow/compiler/xla/layout.cc b/tensorflow/compiler/xla/layout.cc
new file mode 100644
index 0000000000000000000000000000000000000000..000c4fdc40519214fa9fa721a8987b77b534442b
--- /dev/null
+++ b/tensorflow/compiler/xla/layout.cc
@@ -0,0 +1,128 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/layout.h"
+
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
+#include "tensorflow/compiler/xla/layout_util.h"
+
+namespace xla {
+
+TileProto Tile::ToProto() const {
+  TileProto tile_proto;
+  for (int64 i : dimensions()) {
+    tile_proto.add_dimensions(i);
+  }
+  return tile_proto;
+}
+
+string Tile::ToString() const {
+  std::vector<string> elements;
+  for (auto dim : dimensions()) {
+    if (dim >= 0) {
+      elements.push_back(std::to_string(dim));
+    } else {
+      if (dim == kCombineDimension) {
+        elements.push_back("*");
+      } else {
+        elements.push_back(absl::StrCat("Invalid value ", dim));
+      }
+    }
+  }
+  return absl::StrCat("(", absl::StrJoin(elements, ","), ")");
+}
+
+/* static */ Layout Layout::CreateFromProto(const LayoutProto& proto) {
+  Layout layout;
+  layout.set_format(proto.format());
+  layout.minor_to_major_.reserve(proto.minor_to_major_size());
+  for (const int64 dimension : proto.minor_to_major()) {
+    layout.add_minor_to_major(dimension);
+  }
+  layout.set_max_sparse_elements(proto.max_sparse_elements());
+  for (const TileProto& tile_proto : proto.tiles()) {
+    *layout.add_tiles() = Tile::CreateFromProto(tile_proto);
+  }
+  layout.set_element_size_in_bits(proto.element_size_in_bits());
+  return layout;
+}
+
+LayoutProto Layout::ToProto() const {
+  LayoutProto proto;
+  proto.set_format(format_);
+  proto.mutable_minor_to_major()->Reserve(minor_to_major_size());
+  for (const int64 dimension : minor_to_major()) {
+    proto.add_minor_to_major(dimension);
+  }
+  proto.set_max_sparse_elements(max_sparse_elements_);
+  for (const Tile& tile : tiles()) {
+    *proto.add_tiles() = tile.ToProto();
+  }
+  proto.set_element_size_in_bits(element_size_in_bits());
+  return proto;
+}
+
+string Layout::ToString() const {
+  if (format() == SPARSE) {
+    CHECK_EQ(tiles_size(), 0) << "Sparse layout should not be tiled.";
+    return absl::StrCat("sparse{", max_sparse_elements(), "}");
+  } else if (format() == DENSE) {
+    string colon_string = tiles().empty() ? "" : "T";
+    for (Tile tile : tiles()) {
+      absl::StrAppend(&colon_string, tile.ToString());
+    }
+    if (element_size_in_bits() != 0) {
+      absl::StrAppend(&colon_string, "E(", element_size_in_bits(), ")");
+    }
+    return absl::StrCat("{", absl::StrJoin(minor_to_major(), ","),
+                        colon_string.empty() ? "" : ":", colon_string, "}");
+  } else {
+    CHECK_EQ(format(), INVALID_FORMAT);
+    return "invalid{}";
+  }
+}
+
+bool Layout::Equal::operator()(const Layout& lhs, const Layout& rhs) {
+  if (lhs.format() != rhs.format() ||
+      lhs.minor_to_major() != rhs.minor_to_major() ||
+      lhs.max_sparse_elements() != rhs.max_sparse_elements()) {
+    return false;
+  }
+  if (!ignore_tiles_ && lhs.tiles() != rhs.tiles()) {
+    return false;
+  }
+  if (!ignore_element_size_ &&
+      lhs.element_size_in_bits() != rhs.element_size_in_bits()) {
+    return false;
+  }
+  return true;
+}
+
+bool Layout::operator==(const Layout& other) const {
+  return Equal()(*this, other);
+}
+
+std::ostream& operator<<(std::ostream& out, const Tile& tile) {
+  out << tile.ToString();
+  return out;
+}
+
+std::ostream& operator<<(std::ostream& out, const Layout& layout) {
+  out << layout.ToString();
+  return out;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/layout.h b/tensorflow/compiler/xla/layout.h
new file mode 100644
index 0000000000000000000000000000000000000000..acc449b781b503142b24ed7229e3559230bb1599
--- /dev/null
+++ b/tensorflow/compiler/xla/layout.h
@@ -0,0 +1,234 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_LAYOUT_H_
+#define TENSORFLOW_COMPILER_XLA_LAYOUT_H_
+
+#include <vector>
+
+#include "absl/types/span.h"
+
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+
+// Describes a tile used in tiling-based layout. Refer to
+// g3doc/third_party/tensorflow/compiler/xla/g3doc/layout_with_tiling.md for
+// details.
+class Tile {
+ public:
+  Tile() = default;
+  explicit Tile(absl::Span<const int64> dimensions)
+      : dimensions_(dimensions.begin(), dimensions.end()) {}
+
+  // De/Serialize a Tile to and from a TileProto.
+  static Tile CreateFromProto(const TileProto& tile_proto) {
+    return Tile(AsInt64Slice(tile_proto.dimensions()));
+  }
+  TileProto ToProto() const;
+
+  bool operator==(const Tile& other) const {
+    return dimensions() == other.dimensions();
+  }
+  bool operator!=(const Tile& other) const { return !(*this == other); }
+
+  string ToString() const;
+
+  // Returns the bound of the tile in the given dimension index.
+  int64 dimension(int i) const { return dimensions_.at(i); }
+
+  // Returns the dimensions of the tile.
+  const std::vector<int64>& dimensions() const { return dimensions_; }
+
+  Tile& add_dimensions(int64 value) {
+    dimensions_.push_back(value);
+    return *this;
+  }
+
+  Tile& clear_dimensions() {
+    dimensions_.clear();
+    return *this;
+  }
+
+  // This dimension size means the corresponding dimension in the shape is
+  // combined with the next minor dimension before tiling is applied.
+  static constexpr int64 kCombineDimension = std::numeric_limits<int64>::min();
+
+ private:
+  // The bounds of the tile.
+  std::vector<int64> dimensions_;
+};
+
+class Layout {
+ public:
+  Layout() = default;
+
+  // Constructs a dense layout with the given minor-to-major order.
+  explicit Layout(absl::Span<const int64> minor_to_major)
+      : format_(DENSE),
+        minor_to_major_(minor_to_major.begin(), minor_to_major.end()) {}
+
+  // Constructs a dense tiled layout with the given minor-to-major order and
+  // tiles.
+  Layout(absl::Span<const int64> minor_to_major, absl::Span<const Tile> tiles,
+         int64 element_size_in_bits = 0)
+      : format_(DENSE),
+        minor_to_major_(minor_to_major.begin(), minor_to_major.end()),
+        tiles_(tiles.begin(), tiles.end()),
+        element_size_in_bits_(element_size_in_bits) {}
+
+  // Construct a shape from a LayoutProto.
+  static Layout CreateFromProto(const LayoutProto& proto);
+
+  // Returns a LayoutProto representation of the Layout.
+  LayoutProto ToProto() const;
+
+  // Returns a human-readable string that represents this layout.
+  string ToString() const;
+
+  // Equal is a configurable functor to check the equality of two layouts.
+  //
+  // Examples:
+  //
+  // - Comparing two layouts ignoring their difference in tiles:
+  //   Equal().IgnoreTiles()(layout1, layout2);
+  //
+  // - Comparing two layouts ignoring their difference in tiles and element
+  //   size:
+  //   Equal().IgnoreTiles().IgnoreElementSize()(layout1, layout2);
+  class Equal {
+   public:
+    Equal() = default;
+
+    bool operator()(const Layout& lhs, const Layout& rhs);
+
+    Equal& IgnoreTiles() {
+      ignore_tiles_ = true;
+      return *this;
+    }
+
+    Equal& IgnoreElementSize() {
+      ignore_element_size_ = true;
+      return *this;
+    }
+
+   private:
+    bool ignore_tiles_ = false;
+    bool ignore_element_size_ = false;
+  };
+
+  bool operator==(const Layout& other) const;
+  bool operator!=(const Layout& other) const { return !(*this == other); }
+
+  // The following methods mirror the protobuf generated code interface for the
+  // message LayoutProto. This enabled easy migration of this data structure
+  // from a proto to a proper C++ class.
+  //
+  // TODO(b/29771030): Replace or augment these methods with a more ergonomic
+  // interface.
+
+  // Methods for accessing the format.
+  Format format() const { return format_; }
+  Layout& set_format(Format value) {
+    format_ = value;
+    return *this;
+  }
+
+  // Methods for accessing the minor-to-major array.
+  int minor_to_major_size() const { return minor_to_major_.size(); }
+  int64 minor_to_major(int index) const { return minor_to_major_.at(index); }
+  Layout& set_minor_to_major(int index, int64 value) {
+    minor_to_major_.at(index) = value;
+    return *this;
+  }
+  Layout& add_minor_to_major(int64 value) {
+    minor_to_major_.push_back(value);
+    return *this;
+  }
+  Layout& clear_minor_to_major() {
+    minor_to_major_.clear();
+    return *this;
+  }
+  const std::vector<int64>& minor_to_major() const { return minor_to_major_; }
+  std::vector<int64>* mutable_minor_to_major() { return &minor_to_major_; }
+
+  // Methods for accessing the tile field.
+  int tiles_size() const { return tiles_.size(); }
+  const Tile& tiles(int index) const { return tiles_.at(index); }
+  Tile* mutable_tiles(int index) { return &tiles_.at(index); }
+  Tile* add_tiles() {
+    tiles_.push_back(Tile());
+    return &tiles_.back();
+  }
+  Layout& clear_tiles() {
+    tiles_.clear();
+    return *this;
+  }
+  const std::vector<Tile>& tiles() const { return tiles_; }
+  std::vector<Tile>* mutable_tiles() { return &tiles_; }
+
+  // Methods for accessing the int64 fields.
+  int64 max_sparse_elements() const { return max_sparse_elements_; }
+  Layout& set_max_sparse_elements(int64 value) {
+    max_sparse_elements_ = value;
+    return *this;
+  }
+  int64 element_size_in_bits() const { return element_size_in_bits_; }
+  Layout& set_element_size_in_bits(int64 value) {
+    element_size_in_bits_ = value;
+    return *this;
+  }
+
+  void Swap(Layout* other) {
+    using std::swap;
+    swap(*this, *other);
+  }
+
+  void Clear() {
+    format_ = INVALID_FORMAT;
+    minor_to_major_.clear();
+    max_sparse_elements_ = 0;
+    element_size_in_bits_ = 0;
+  }
+
+ private:
+  // The format of this layout.
+  Format format_ = INVALID_FORMAT;
+
+  // Sequence of dimension numbers, from minor (fastest varying index) to major
+  // (slowest varying index).
+  std::vector<int64> minor_to_major_;
+
+  // The maximum number of elements that can be stored for SPARSE formats.  This
+  // can be used to determine the maximum size in bytes of arrays stored in
+  // memory.  This field must be zero unless the format is SPARSE.
+  int64 max_sparse_elements_ = 0;
+
+  // The tiles used in tiling-based layout.
+  std::vector<Tile> tiles_;
+
+  // The number of bits used to store an individual array element.
+  int64 element_size_in_bits_ = 0;
+};
+
+std::ostream& operator<<(std::ostream& out, const Tile& Tile);
+std::ostream& operator<<(std::ostream& out, const Layout& layout);
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_LAYOUT_H_
diff --git a/tensorflow/compiler/xla/layout_test.cc b/tensorflow/compiler/xla/layout_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f5d71c553ed2e0cfd5d5945144dd476557582b5f
--- /dev/null
+++ b/tensorflow/compiler/xla/layout_test.cc
@@ -0,0 +1,116 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/layout.h"
+
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
+#include "tensorflow/compiler/xla/layout_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace xla {
+namespace {
+
+class LayoutTest : public ::testing::Test {};
+
+TEST_F(LayoutTest, ToString) {
+  EXPECT_EQ(Layout().ToString(), "invalid{}");
+  EXPECT_EQ(Layout({4, 5, 6}).ToString(), "{4,5,6}");
+  EXPECT_EQ(Layout().set_format(SPARSE).set_max_sparse_elements(123).ToString(),
+            "sparse{123}");
+  EXPECT_EQ(Layout({4, 5, 6}).ToString(), "{4,5,6}");
+  EXPECT_EQ(Layout({3, 2, 1, 0}, {Tile({42, 123}), Tile({4, 5})}).ToString(),
+            "{3,2,1,0:T(42,123)(4,5)}");
+  EXPECT_EQ(
+      Layout({1, 0}, {Tile({2, 55})}).set_element_size_in_bits(42).ToString(),
+      "{1,0:T(2,55)E(42)}");
+  EXPECT_EQ(
+      Layout({1, 0}, {Tile({-2, 55})}).set_element_size_in_bits(42).ToString(),
+      "{1,0:T(Invalid value -2,55)E(42)}");
+}
+
+TEST_F(LayoutTest, StreamOut) {
+  {
+    std::ostringstream oss;
+    oss << Tile({7, 8});
+    EXPECT_EQ(oss.str(), "(7,8)");
+  }
+
+  {
+    std::ostringstream oss;
+    oss << Layout({0, 1, 2});
+    EXPECT_EQ(oss.str(), "{0,1,2}");
+  }
+}
+
+TEST_F(LayoutTest, SparseLayoutMaxElements) {
+  EXPECT_EQ(LayoutUtil::MaxSparseElements(LayoutUtil::MakeSparseLayout(101)),
+            101);
+}
+
+TEST_F(LayoutTest, Equality) {
+  EXPECT_EQ(Layout(), Layout());
+  const std::vector<int64> empty_dims;
+  EXPECT_EQ(Layout(empty_dims), Layout(empty_dims));
+  EXPECT_NE(Layout(), Layout(empty_dims));
+  EXPECT_EQ(Layout({0, 1, 2, 3}), Layout({0, 1, 2, 3}));
+  EXPECT_NE(Layout({0, 1, 2, 3}), Layout({0, 1, 2}));
+  EXPECT_EQ(Layout({0, 1, 2}, {Tile({42, 44})}),
+            Layout({0, 1, 2}, {Tile({42, 44})}));
+  EXPECT_NE(Layout({0, 1, 2}, {Tile({42, 44})}),
+            Layout({0, 1, 2}, {Tile({42, 45})}));
+  EXPECT_NE(Layout({0, 1, 2}, {Tile({42, 44})}), Layout({0, 1, 2, 3}));
+  EXPECT_EQ(Layout({0, 1, 2}).set_element_size_in_bits(33),
+            Layout({0, 1, 2}).set_element_size_in_bits(33));
+  EXPECT_NE(Layout({0, 1, 2}).set_element_size_in_bits(33),
+            Layout({0, 1, 2}).set_element_size_in_bits(7));
+  EXPECT_EQ(Layout().set_format(SPARSE), Layout().set_format(SPARSE));
+  EXPECT_EQ(Layout().set_format(SPARSE).set_max_sparse_elements(42),
+            Layout().set_format(SPARSE).set_max_sparse_elements(42));
+  EXPECT_NE(Layout().set_format(SPARSE).set_max_sparse_elements(42),
+            Layout().set_format(SPARSE).set_max_sparse_elements(24));
+
+  EXPECT_FALSE(
+      Layout::Equal()(Layout({0, 1, 2}, {Tile({42, 44})}), Layout({0, 1, 2})));
+  EXPECT_TRUE(Layout::Equal().IgnoreTiles()(Layout({0, 1, 2}, {Tile({42, 44})}),
+                                            Layout({0, 1, 2})));
+  EXPECT_FALSE(
+      Layout::Equal()(Layout({0, 1, 2}, {}, 32), Layout({0, 1, 2}, {}, 1)));
+  EXPECT_TRUE(Layout::Equal().IgnoreElementSize()(Layout({0, 1, 2}, {}, 32),
+                                                  Layout({0, 1, 2}, {}, 1)));
+}
+
+TEST_F(LayoutTest, LayoutToFromProto) {
+  // Round-trips a Layout through proto de/serialization.
+  auto expect_unchanged = [](const Layout& layout) {
+    EXPECT_EQ(layout, Layout::CreateFromProto(layout.ToProto()));
+  };
+
+  expect_unchanged(Layout());
+  expect_unchanged(Layout({1, 3, 2, 0}));
+  expect_unchanged(Layout().set_format(SPARSE));
+  expect_unchanged(Layout().set_format(SPARSE).set_max_sparse_elements(123));
+  expect_unchanged(Layout({0, 1}).set_element_size_in_bits(42));
+  expect_unchanged(Layout({3, 2, 1, 0}, {Tile({42, 123}), Tile({4, 5})}));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/layout_util.cc b/tensorflow/compiler/xla/layout_util.cc
index dbb81381acde645f08639737b6e7b6f6ad971f9b..62314118ca9713a04cb4e3cf6ad261b966d85f15 100644
--- a/tensorflow/compiler/xla/layout_util.cc
+++ b/tensorflow/compiler/xla/layout_util.cc
@@ -41,27 +41,37 @@ namespace {
 
 // Internal helper for GetDefaultLayoutForShape and SetToDefaultLayout. Sets
 // minor_to_major to the value that represents the default layout.
-void SetDefaultLayoutToContainer(
-    tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>*
-        minor_to_major) {
+void SetDefaultLayoutToContainer(std::vector<int64>* minor_to_major) {
   // The default XLA layout is major-to-minor (dim 0 is major).
   // For more information on XLA layouts, see:
   // https://www.tensorflow.org/performance/xla/shapes
   const int64 size = minor_to_major->size();
   for (int64 i = 0; i < size; ++i) {
-    minor_to_major->Set(i, size - 1 - i);
+    (*minor_to_major)[i] = size - 1 - i;
   }
 }
 
 }  // namespace
 
 /* static */ Layout LayoutUtil::MakeLayout(
-    absl::Span<const int64> minor_to_major) {
+    absl::Span<const int64> minor_to_major, absl::Span<const Tile> tiles,
+    int64 element_size_in_bits) {
   Layout layout;
   layout.set_format(DENSE);
   for (int64 dimension_number : minor_to_major) {
     layout.add_minor_to_major(dimension_number);
   }
+  for (Tile tile : tiles) {
+    for (int64 dim : tile.dimensions()) {
+      if (dim < 0 && dim != Tile::kCombineDimension) {
+        LOG(FATAL) << "Tile dimension size needs to be mininum int64 value if "
+                      "it's negative. Value is "
+                   << dim;
+      }
+    }
+    *layout.add_tiles() = tile;
+  }
+  layout.set_element_size_in_bits(element_size_in_bits);
   return layout;
 }
 
@@ -94,9 +104,8 @@ namespace {
 Layout CreateDefaultLayoutForRank(int64 rank) {
   Layout layout;
   layout.set_format(DENSE);
-  tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>*
-      minor_to_major = layout.mutable_minor_to_major();
-  minor_to_major->Resize(rank, 0);
+  std::vector<int64>* minor_to_major = layout.mutable_minor_to_major();
+  minor_to_major->resize(rank, 0);
   SetDefaultLayoutToContainer(minor_to_major);
   return layout;
 }
@@ -104,13 +113,13 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
 }  // namespace
 
 /* static */ Layout LayoutUtil::GetDefaultLayoutForShape(const Shape& shape) {
-  if (ShapeUtil::IsOpaque(shape) || ShapeUtil::IsToken(shape)) {
+  if (shape.IsOpaque() || shape.IsToken()) {
     // Opaque and token types have empty layouts.
     return Layout();
   }
 
   // A Layout proto corresponds to a single array, not a tuple.
-  CHECK(ShapeUtil::IsArray(shape));
+  CHECK(shape.IsArray());
   return CreateDefaultLayoutForRank(shape.dimensions_size());
 }
 
@@ -131,17 +140,16 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
 }
 
 /* static */ void LayoutUtil::SetToDefaultLayout(Shape* shape) {
-  if (ShapeUtil::IsTuple(*shape)) {
+  if (shape->IsTuple()) {
     // Tuple shape.
     for (auto& element_shape : *shape->mutable_tuple_shapes()) {
       SetToDefaultLayout(&element_shape);
     }
     shape->clear_layout();
-  } else if (ShapeUtil::IsArray(*shape)) {
+  } else if (shape->IsArray()) {
     shape->mutable_layout()->set_format(DENSE);
-    tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>*
-        minor_to_major = shape->mutable_layout()->mutable_minor_to_major();
-    minor_to_major->Resize(shape->dimensions_size(), 0);
+    auto* minor_to_major = shape->mutable_layout()->mutable_minor_to_major();
+    minor_to_major->resize(shape->dimensions_size(), 0);
     SetDefaultLayoutToContainer(minor_to_major);
   } else {
     // Opaque, token types etc. have no layout.
@@ -164,7 +172,7 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
 
 /* static */ Status LayoutUtil::ValidateLayoutInShape(
     const Shape& shape, bool allow_missing_layouts) {
-  if (ShapeUtil::IsTuple(shape)) {
+  if (shape.IsTuple()) {
     // Tuple shape.
     if (shape.has_layout()) {
       return InvalidArgument("tuple should not have a layout field");
@@ -174,7 +182,7 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
           ValidateLayoutInShape(element_shape, allow_missing_layouts));
     }
     return Status::OK();
-  } else if (ShapeUtil::IsArray(shape)) {
+  } else if (shape.IsArray()) {
     if (!shape.has_layout()) {
       if (allow_missing_layouts) {
         return Status::OK();
@@ -196,11 +204,11 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
 
 /* static */ Status LayoutUtil::ValidateLayoutForShape(const Layout& layout,
                                                        const Shape& shape) {
-  if (ShapeUtil::IsTuple(shape)) {
+  if (shape.IsTuple()) {
     return InvalidArgument("a single Layout is not valid for tuple shapes");
   }
 
-  if (!ShapeUtil::IsArray(shape)) {
+  if (!shape.IsArray()) {
     if (layout.minor_to_major_size() != 0) {
       return InvalidArgument(
           "shape of primitive type %s should not have a non-trivial layout",
@@ -210,25 +218,24 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
   }
 
   if (layout.format() == INVALID_FORMAT || !Format_IsValid(layout.format())) {
-    return InvalidArgument(
-        "Layout has an invalid format (%d) in layout {%s}, shape {%s}",
-        layout.format(), layout.ShortDebugString(), shape.ShortDebugString());
+    return InvalidArgument("Layout has an invalid format (%d)",
+                           layout.format());
   }
 
   if (layout.format() == DENSE) {
-    if (layout.minor_to_major_size() != ShapeUtil::Rank(shape)) {
+    if (layout.minor_to_major_size() != shape.rank()) {
       return InvalidArgument(
           "layout minor_to_major field contains %d elements, "
           "but shape is rank %d: {%s}; shape: %s",
-          layout.minor_to_major_size(), ShapeUtil::Rank(shape),
+          layout.minor_to_major_size(), shape.rank(),
           absl::StrJoin(layout.minor_to_major(), ", "),
           shape.ShortDebugString());
     }
 
-    std::vector<bool> dimensions_in_layout(ShapeUtil::Rank(shape), false);
-    for (int64 i = 0; i < ShapeUtil::Rank(shape); ++i) {
+    std::vector<bool> dimensions_in_layout(shape.rank(), false);
+    for (int64 i = 0; i < shape.rank(); ++i) {
       int64 dim = layout.minor_to_major(i);
-      if (dim < 0 || dim >= ShapeUtil::Rank(shape)) {
+      if (dim < 0 || dim >= shape.rank()) {
         return InvalidArgument(
             "layout minor_to_major field has out-of-bounds value: %s",
             HumanString(layout));
@@ -240,6 +247,10 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
       }
       dimensions_in_layout[dim] = true;
     }
+  } else {
+    if (layout.tiles_size() != 0) {
+      return InvalidArgument("Only dense layouts can be tiled.");
+    }
   }
 
   return Status::OK();
@@ -260,8 +271,7 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
 }
 
 /* static */ bool LayoutUtil::IsDenseArray(const Shape& shape) {
-  return ShapeUtil::IsArray(shape) && shape.has_layout() &&
-         IsDense(shape.layout());
+  return shape.IsArray() && shape.has_layout() && IsDense(shape.layout());
 }
 
 /* static */ bool LayoutUtil::IsDense(const Layout& layout) {
@@ -281,8 +291,7 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
 }
 
 /* static */ bool LayoutUtil::IsSparseArray(const Shape& shape) {
-  return ShapeUtil::IsArray(shape) && shape.has_layout() &&
-         IsSparse(shape.layout());
+  return shape.IsArray() && shape.has_layout() && IsSparse(shape.layout());
 }
 
 /* static */ bool LayoutUtil::IsSparse(const Layout& layout) {
@@ -295,11 +304,11 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
 }
 
 /* static */ bool LayoutUtil::HasLayout(const Shape& shape) {
-  if (ShapeUtil::IsTuple(shape)) {
+  if (shape.IsTuple()) {
     // Tuple shape: all subshapes must have a layout.
-    return std::all_of(shape.tuple_shapes().begin(), shape.tuple_shapes().end(),
-                       [](const Shape& s) { return HasLayout(s); });
-  } else if (!ShapeUtil::IsArray(shape)) {
+    return absl::c_all_of(shape.tuple_shapes(),
+                          [](const Shape& s) { return HasLayout(s); });
+  } else if (!shape.IsArray()) {
     // Opaque, token types etc. ignore layout.
     return true;
   }
@@ -316,7 +325,7 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
 }
 
 /* static */ bool LayoutUtil::Equal(const Layout& lhs, const Layout& rhs) {
-  return protobuf_util::ProtobufEquals(lhs, rhs);
+  return lhs == rhs;
 }
 
 /* static */ absl::Span<const int64> LayoutUtil::MinorToMajor(
@@ -358,22 +367,18 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
 }
 
 /* static */ string LayoutUtil::HumanString(const Layout& layout) {
-  if (IsSparse(layout)) {
-    return absl::StrCat("sparse{", layout.max_sparse_elements(), "}");
-  }
-  CHECK(IsDense(layout));
-  return absl::StrCat("{", absl::StrJoin(layout.minor_to_major(), ","), "}");
+  return layout.ToString();
 }
 
 namespace {
 
 // Internal helper for recursively copying layouts.
 Status CopyLayoutInternal(const Shape& src, Shape* dst) {
-  if (ShapeUtil::IsTuple(src) != ShapeUtil::IsTuple(*dst)) {
+  if (src.IsTuple() != dst->IsTuple()) {
     return InvalidArgument(
         "cannot copy layout from shape: shape structure differs");
   }
-  if (ShapeUtil::IsTuple(src)) {
+  if (src.IsTuple()) {
     if (ShapeUtil::TupleElementCount(src) !=
         ShapeUtil::TupleElementCount(*dst)) {
       return InvalidArgument(
@@ -385,7 +390,7 @@ Status CopyLayoutInternal(const Shape& src, Shape* dst) {
     }
   } else {
     if (src.has_layout()) {
-      if (ShapeUtil::Rank(src) != ShapeUtil::Rank(*dst)) {
+      if (src.rank() != dst->rank()) {
         return InvalidArgument("cannot copy layout from shape: ranks differs");
       }
       TF_RETURN_IF_ERROR(
@@ -407,9 +412,9 @@ Status LayoutUtil::CopyLayoutBetweenShapes(const Shape& src, Shape* dst) {
 
 /* static */ bool LayoutUtil::LayoutsInShapesEqual(const Shape& lhs,
                                                    const Shape& rhs) {
-  if (ShapeUtil::IsTuple(lhs)) {
-    if (!ShapeUtil::IsTuple(rhs) || ShapeUtil::TupleElementCount(lhs) !=
-                                        ShapeUtil::TupleElementCount(rhs)) {
+  if (lhs.IsTuple()) {
+    if (!rhs.IsTuple() || ShapeUtil::TupleElementCount(lhs) !=
+                              ShapeUtil::TupleElementCount(rhs)) {
       return false;
     }
     for (int i = 0; i < ShapeUtil::TupleElementCount(lhs); ++i) {
@@ -418,8 +423,8 @@ Status LayoutUtil::CopyLayoutBetweenShapes(const Shape& src, Shape* dst) {
       }
     }
     return true;
-  } else if (ShapeUtil::IsArray(lhs)) {
-    return ShapeUtil::Rank(lhs) == ShapeUtil::Rank(rhs) &&
+  } else if (lhs.IsArray()) {
+    return lhs.rank() == rhs.rank() &&
            LayoutUtil::Equal(lhs.layout(), rhs.layout());
   } else {
     // Layouts of non-array and non-tuple shapes is ignored.
@@ -435,7 +440,7 @@ Status LayoutUtil::CopyLayoutBetweenShapes(const Shape& src, Shape* dst) {
     positions_in_layout.push_back(
         PositionInContainer(layout.minor_to_major(), dim));
   }
-  std::sort(positions_in_layout.begin(), positions_in_layout.end());
+  absl::c_sort(positions_in_layout);
   for (size_t i = 1; i < positions_in_layout.size(); ++i) {
     if (1 != positions_in_layout[i] - positions_in_layout[i - 1]) {
       return false;
@@ -444,11 +449,6 @@ Status LayoutUtil::CopyLayoutBetweenShapes(const Shape& src, Shape* dst) {
   return true;
 }
 
-std::ostream& operator<<(std::ostream& out, const Layout& layout) {
-  out << LayoutUtil::HumanString(layout);
-  return out;
-}
-
 /*static*/ size_t LayoutUtil::Hash(const Layout& layout) {
   using tensorflow::hash;
   using tensorflow::Hash64Combine;
diff --git a/tensorflow/compiler/xla/layout_util.h b/tensorflow/compiler/xla/layout_util.h
index 6c298e57252449ce3f1f9055436e918f2d9f17f1..9997aef465daa48ee77050e03d97cde0ea2425cc 100644
--- a/tensorflow/compiler/xla/layout_util.h
+++ b/tensorflow/compiler/xla/layout_util.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include <string>
 
 #include "absl/types/span.h"
+#include "tensorflow/compiler/xla/layout.h"
 #include "tensorflow/compiler/xla/shape.h"
 #include "tensorflow/compiler/xla/status.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -35,7 +36,9 @@ class LayoutUtil {
  public:
   // Creates a layout with the given minor-to-major dimension order. (This is a
   // convenience function for protobuf construction.)
-  static Layout MakeLayout(absl::Span<const int64> minor_to_major);
+  static Layout MakeLayout(absl::Span<const int64> minor_to_major,
+                           absl::Span<const Tile> tiles = {},
+                           int64 element_size_in_bits = 0);
 
   // Similar to MakeLayout, but take indices in reverse order.
   static Layout MakeLayoutFromMajorToMinor(
@@ -195,8 +198,6 @@ class LayoutUtil {
   TF_DISALLOW_COPY_AND_ASSIGN(LayoutUtil);
 };
 
-std::ostream& operator<<(std::ostream& out, const Layout& layout);
-
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_LAYOUT_UTIL_H_
diff --git a/tensorflow/compiler/xla/layout_util_test.cc b/tensorflow/compiler/xla/layout_util_test.cc
index 12ce2d2d7c6fa8c590035f9ff2af50001ccf80d8..12da214063676717aa075e66aa54974f4cc2b31b 100644
--- a/tensorflow/compiler/xla/layout_util_test.cc
+++ b/tensorflow/compiler/xla/layout_util_test.cc
@@ -317,15 +317,79 @@ TEST_F(LayoutUtilTest, DefaultLayoutGettersMajorToMinor) {
                             ShapeUtil::MakeShape(F32, {10, 20, 30, 15, 25}))));
 }
 
-TEST_F(LayoutUtilTest, SparseLayoutMaxElements) {
-  EXPECT_EQ(LayoutUtil::MaxSparseElements(LayoutUtil::MakeSparseLayout(101)),
-            101);
-}
-
-TEST_F(LayoutUtilTest, StreamOut) {
-  std::ostringstream oss;
-  oss << LayoutUtil::MakeLayout({0, 1, 2});
-  EXPECT_EQ(oss.str(), "{0,1,2}");
+TEST_F(LayoutUtilTest, HumanStringWithTiling) {
+  Shape shape = ShapeUtil::MakeShapeWithLayout(F32, {2, 3, 4}, {0, 1, 2});
+  Tile* tile;
+
+  // No tiling.
+  EXPECT_EQ(ShapeUtil::HumanStringWithLayout(shape), "f32[2,3,4]{0,1,2}");
+
+  // 2D tile.
+  tile = shape.mutable_layout()->add_tiles();
+  tile->add_dimensions(512);
+  tile->add_dimensions(1024);
+  EXPECT_EQ(ShapeUtil::HumanStringWithLayout(shape),
+            "f32[2,3,4]{0,1,2:T(512,1024)}");
+
+  // 1D tile.
+  shape.mutable_layout()->clear_tiles();
+  tile = shape.mutable_layout()->add_tiles();
+  tile->add_dimensions(512);
+  EXPECT_EQ(ShapeUtil::HumanStringWithLayout(shape),
+            "f32[2,3,4]{0,1,2:T(512)}");
+
+  // 2 tiles.
+  shape = ShapeUtil::MakeShapeWithLayout(BF16, {2, 3, 4}, {1, 2, 0});
+  tile = shape.mutable_layout()->add_tiles();
+  tile->add_dimensions(16);
+  tile->add_dimensions(256);
+  tile = shape.mutable_layout()->add_tiles();
+  tile->add_dimensions(2);
+  tile->add_dimensions(1);
+  EXPECT_EQ(ShapeUtil::HumanStringWithLayout(shape),
+            "bf16[2,3,4]{1,2,0:T(16,256)(2,1)}");
+
+  // PRED with element size of 8 bits.
+  shape = ShapeUtil::MakeShapeWithLayout(PRED, {8, 8, 8}, {0, 2, 1});
+  tile = shape.mutable_layout()->add_tiles();
+  tile->add_dimensions(8);
+  tile->add_dimensions(128);
+  EXPECT_EQ(ShapeUtil::HumanStringWithLayout(shape),
+            "pred[8,8,8]{0,2,1:T(8,128)}");
+
+  // PRED with element size of 32 bits.
+  shape.mutable_layout()->clear_tiles();
+  tile = shape.mutable_layout()->add_tiles();
+  tile->add_dimensions(8);
+  tile->add_dimensions(128);
+  shape.mutable_layout()->set_element_size_in_bits(32);
+  EXPECT_EQ(ShapeUtil::HumanStringWithLayout(shape),
+            "pred[8,8,8]{0,2,1:T(8,128)E(32)}");
+
+  // No tile. PRED with element size of 32 bits.
+  shape.mutable_layout()->clear_tiles();
+  shape.mutable_layout()->set_element_size_in_bits(32);
+  EXPECT_EQ(ShapeUtil::HumanStringWithLayout(shape),
+            "pred[8,8,8]{0,2,1:E(32)}");
+
+  // Tile with negative dimension size for combining dimensions.
+  shape = ShapeUtil::MakeShapeWithLayout(BF16, {2, 3, 1004}, {2, 1, 0});
+  tile = shape.mutable_layout()->add_tiles();
+  tile->add_dimensions(2);
+  tile->add_dimensions(Tile::kCombineDimension);
+  tile->add_dimensions(128);
+  EXPECT_EQ(ShapeUtil::HumanStringWithLayout(shape),
+            "bf16[2,3,1004]{2,1,0:T(2,*,128)}");
+
+  // Tile with two negative dimensions.
+  shape = ShapeUtil::MakeShapeWithLayout(BF16, {8, 2, 3, 1004}, {3, 2, 1, 0});
+  tile = shape.mutable_layout()->add_tiles();
+  tile->add_dimensions(2);
+  tile->add_dimensions(Tile::kCombineDimension);
+  tile->add_dimensions(Tile::kCombineDimension);
+  tile->add_dimensions(128);
+  EXPECT_EQ(ShapeUtil::HumanStringWithLayout(shape),
+            "bf16[8,2,3,1004]{3,2,1,0:T(2,*,*,128)}");
 }
 
 TEST_F(LayoutUtilTest, ValidateLayout_ValidArrayLayout) {
diff --git a/tensorflow/compiler/xla/literal.cc b/tensorflow/compiler/xla/literal.cc
index 8f480c1f1079b4e1a5be53958ebdf6e004ad9ebe..5cd738d0f7769ceac7eb3bdbc5abd3196d9cf99c 100644
--- a/tensorflow/compiler/xla/literal.cc
+++ b/tensorflow/compiler/xla/literal.cc
@@ -29,10 +29,12 @@ limitations under the License.
 #include "absl/strings/str_join.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/index_util.h"
+#include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/platform/logging.h"
@@ -42,7 +44,6 @@ namespace xla {
 namespace {
 
 using absl::StrCat;
-using absl::StrFormat;
 
 constexpr bool kLittleEndian = __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__;
 
@@ -107,7 +108,7 @@ Literal::Literal(const Shape& shape)
     : Literal(shape, /*allocate_arrays=*/true) {}
 
 void Literal::SetPiece(const Shape& shape, Piece* piece, bool allocate_arrays) {
-  if (ShapeUtil::IsTuple(shape)) {
+  if (shape.IsTuple()) {
     for (int i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) {
       const Shape& subshape = shape.tuple_shapes(i);
 
@@ -118,7 +119,7 @@ void Literal::SetPiece(const Shape& shape, Piece* piece, bool allocate_arrays) {
 
       piece->emplace_back(std::move(child_piece));
     }
-  } else if (ShapeUtil::IsArray(shape)) {
+  } else if (shape.IsArray()) {
     if (allocate_arrays) {
       if (LayoutUtil::IsSparseArray(shape)) {
         // For sparse arrays, the buffer must be of the size of the maximum
@@ -129,7 +130,7 @@ void Literal::SetPiece(const Shape& shape, Piece* piece, bool allocate_arrays) {
             new char[max_sparse_elements *
                      ShapeUtil::ByteSizeOfPrimitiveType(shape.element_type())]);
         piece->set_sparse_indices(
-            new SparseIndexArray(max_sparse_elements, ShapeUtil::Rank(shape)));
+            new SparseIndexArray(max_sparse_elements, shape.rank()));
       } else {
         piece->set_buffer(new char[piece->size_bytes()]);
       }
@@ -187,7 +188,7 @@ Literal LiteralBase::CreateFromShape(const Shape& shape) {
   Literal literal(shape);
   literal.root_piece_->ForEachMutableSubpiece(
       [&](const ShapeIndex& index, Piece* piece) {
-        if (ShapeUtil::IsArray(piece->subshape())) {
+        if (piece->subshape().IsArray()) {
           memset(piece->untyped_data(), 0, piece->size_bytes());
         }
       });
@@ -208,16 +209,15 @@ template <typename NativeT>
 Status MutableLiteralBase::CopySliceFromInternal(
     const LiteralBase& src_literal, absl::Span<const int64> src_base,
     absl::Span<const int64> dest_base, absl::Span<const int64> copy_size) {
-  TF_RET_CHECK(ShapeUtil::Rank(src_literal.shape()) == src_base.size());
-  TF_RET_CHECK(ShapeUtil::Rank(shape()) == dest_base.size());
+  TF_RET_CHECK(src_literal.shape().rank() == src_base.size());
+  TF_RET_CHECK(shape().rank() == dest_base.size());
 
   auto linear_index = [](const Shape& shape,
                          absl::Span<const int64> multi_index) {
     return IndexUtil::MultidimensionalIndexToLinearIndex(shape, multi_index);
   };
 
-  if (ShapeUtil::Rank(src_literal.shape()) == 0 ||
-      ShapeUtil::Rank(shape()) == 0) {
+  if (src_literal.shape().rank() == 0 || shape().rank() == 0) {
     // If any of the two shapes are scalars, we can just call the StridedCopy()
     // directly, and we know we will be copying only one value.
     TF_RET_CHECK(copy_size.empty());
@@ -312,7 +312,7 @@ Status MutableLiteralBase::CopyElementFrom(const LiteralSlice& src_literal,
           proto_element = &proto_element->tuple_literals(i);
         }
 
-        if (ShapeUtil::IsTuple(piece->subshape())) {
+        if (piece->subshape().IsTuple()) {
           if (proto_element->tuple_literals_size() !=
               ShapeUtil::TupleElementCount(piece->subshape())) {
             return InvalidArgument(
@@ -326,7 +326,7 @@ Status MutableLiteralBase::CopyElementFrom(const LiteralSlice& src_literal,
           return Status::OK();
         }
 
-        CHECK(ShapeUtil::IsArray(piece->subshape()));
+        CHECK(piece->subshape().IsArray());
         TF_RETURN_IF_ERROR(piece->CopyFromProto(*proto_element));
 
         return Status::OK();
@@ -336,7 +336,7 @@ Status MutableLiteralBase::CopyElementFrom(const LiteralSlice& src_literal,
 }
 
 std::vector<Literal> Literal::DecomposeTuple() {
-  CHECK(ShapeUtil::IsTuple(shape()));
+  CHECK(shape().IsTuple());
   std::vector<Literal> elements;
   for (int i = 0; i < ShapeUtil::TupleElementCount(shape()); ++i) {
     elements.push_back(Literal(ShapeUtil::GetSubshape(shape(), {i}),
@@ -375,7 +375,7 @@ void CopyElementsBetween(absl::Span<NativeT> dest,
   if (ShapeUtil::IsZeroElementArray(dest_shape)) {
     return;
   }
-  std::vector<int64> index(ShapeUtil::Rank(dest_shape));
+  std::vector<int64> index(dest_shape.rank());
   do {
     dest[IndexUtil::MultidimensionalIndexToLinearIndex(dest_shape, index)] =
         src[IndexUtil::MultidimensionalIndexToLinearIndex(src_shape, index)];
@@ -392,7 +392,7 @@ Status LiteralBase::Piece::CopyFrom(const LiteralBase::Piece& src) {
     memcpy(buffer(), src.buffer(), src.size_bytes());
   } else {
     TF_RET_CHECK(ShapeUtil::Compatible(src.subshape(), subshape()));
-    std::vector<int64> origin(ShapeUtil::Rank(subshape()), 0);
+    std::vector<int64> origin(subshape().rank(), 0);
     switch (subshape().element_type()) {
 #define COPY_ELEMENTS(XLA_T, NATIVE_T)                                    \
   case (XLA_T):                                                           \
@@ -412,6 +412,7 @@ Status LiteralBase::Piece::CopyFrom(const LiteralBase::Piece& src) {
       COPY_ELEMENTS(F32, float);
       COPY_ELEMENTS(F64, double);
       COPY_ELEMENTS(C64, complex64);
+      COPY_ELEMENTS(C128, complex128);
       COPY_ELEMENTS(PRED, bool);
 #undef COPY_ELEMENTS
       default:
@@ -438,7 +439,7 @@ Status MutableLiteralBase::CopyFrom(const LiteralSlice& src_literal,
   }
   return root_piece_->ForEachMutableSubpieceWithStatus(
       [&](const ShapeIndex& index, Piece* piece) {
-        if (!ShapeUtil::IsArray(piece->subshape())) {
+        if (!piece->subshape().IsArray()) {
           return Status::OK();
         }
 
@@ -477,7 +478,7 @@ Status Literal::MoveFrom(Literal&& src_literal,
 
   src_literal.root_piece_->ForEachSubpiece(
       [&](const ShapeIndex& src_index, const Piece& src_piece) {
-        if (!ShapeUtil::IsArray(src_piece.subshape())) {
+        if (!src_piece.subshape().IsArray()) {
           return;
         }
 
@@ -504,8 +505,8 @@ Status MutableLiteralBase::CopySliceFrom(const LiteralSlice& src_literal,
                                          absl::Span<const int64> src_base,
                                          absl::Span<const int64> dest_base,
                                          absl::Span<const int64> copy_size) {
-  TF_RET_CHECK(ShapeUtil::IsArray(shape())) << ShapeUtil::HumanString(shape());
-  TF_RET_CHECK(ShapeUtil::IsArray(src_literal.shape()))
+  TF_RET_CHECK(shape().IsArray()) << ShapeUtil::HumanString(shape());
+  TF_RET_CHECK(src_literal.shape().IsArray())
       << ShapeUtil::HumanString(src_literal.shape());
   TF_RET_CHECK(ShapeUtil::SameElementType(src_literal.shape(), shape()));
 
@@ -549,6 +550,9 @@ Status MutableLiteralBase::CopySliceFrom(const LiteralSlice& src_literal,
     case C64:
       return CopySliceFromInternal<complex64>(src_literal, src_base, dest_base,
                                               copy_size);
+    case C128:
+      return CopySliceFromInternal<complex128>(src_literal, src_base, dest_base,
+                                               copy_size);
     case PRED:
       return CopySliceFromInternal<bool>(src_literal, src_base, dest_base,
                                          copy_size);
@@ -562,8 +566,8 @@ Status MutableLiteralBase::CopySliceFrom(const LiteralSlice& src_literal,
 }
 
 void MutableLiteralBase::PopulateR1(const tensorflow::core::Bitmap& values) {
-  CHECK(ShapeUtil::IsArray(shape()));
-  CHECK_EQ(ShapeUtil::Rank(shape()), 1);
+  CHECK(shape().IsArray());
+  CHECK_EQ(shape().rank(), 1);
   CHECK_EQ(element_count(), values.bits());
   CHECK_EQ(shape().element_type(), PRED);
   for (int64 i = 0; i < static_cast<int64>(values.bits()); ++i) {
@@ -592,7 +596,7 @@ Literal LiteralBase::Relayout(const Shape& shape_with_layout) const {
   ShapeUtil::ForEachSubshape(
       result.shape(),
       [this, &result](const Shape& subshape, const ShapeIndex& index) {
-        if (ShapeUtil::IsArray(subshape)) {
+        if (subshape.IsArray()) {
           TF_CHECK_OK(result.CopyFrom(*this,
                                       /*dest_shape_index=*/index,
                                       /*src_shape_index=*/index));
@@ -603,7 +607,7 @@ Literal LiteralBase::Relayout(const Shape& shape_with_layout) const {
 
 StatusOr<Literal> LiteralBase::Broadcast(
     const Shape& result_shape, absl::Span<const int64> dimensions) const {
-  if (!ShapeUtil::IsArray(shape())) {
+  if (!shape().IsArray()) {
     return InvalidArgument("Broadcast only supports arrays.");
   }
 
@@ -643,13 +647,12 @@ StatusOr<Literal> LiteralBase::Broadcast(
 
 StatusOr<Literal> LiteralBase::Reshape(
     absl::Span<const int64> dimensions) const {
-  if (!ShapeUtil::IsArray(shape())) {
+  if (!shape().IsArray()) {
     return InvalidArgument("Reshape does not support tuples.");
   }
   Literal output;
   if (!LayoutUtil::IsMonotonicWithDim0Major(shape().layout())) {
-    output =
-        Relayout(LayoutUtil::GetDefaultLayoutForRank(ShapeUtil::Rank(shape())));
+    output = Relayout(LayoutUtil::GetDefaultLayoutForRank(shape().rank()));
   } else {
     output = Clone();
   }
@@ -671,8 +674,8 @@ StatusOr<Literal> LiteralBase::Reshape(
 }
 
 Literal LiteralBase::Transpose(absl::Span<const int64> permutation) const {
-  CHECK(ShapeUtil::IsArray(shape())) << "Tuple is not supported for transpose";
-  CHECK(IsPermutation(permutation, ShapeUtil::Rank(shape())))
+  CHECK(shape().IsArray()) << "Tuple is not supported for transpose";
+  CHECK(IsPermutation(permutation, shape().rank()))
       << "Given permutation is not a permutation of dimension numbers";
   // To transpose the array, we just permute the dimensions and layout, and
   // do a straight memory copy of the raw data set.
@@ -711,10 +714,10 @@ template <typename NativeT>
 Literal LiteralBase::SliceInternal(
     const Shape& result_shape, absl::Span<const int64> start_indices) const {
   Literal result_literal(result_shape);
-  DimensionVector new_indices(ShapeUtil::Rank(result_shape));
+  DimensionVector new_indices(result_shape.rank());
   result_literal.EachCell<NativeT>(
       [&](absl::Span<const int64> indices, NativeT /*value*/) {
-        for (int64 i = 0; i < ShapeUtil::Rank(result_shape); ++i) {
+        for (int64 i = 0; i < result_shape.rank(); ++i) {
           new_indices[i] = indices[i] + start_indices[i];
         }
         NativeT value = Get<NativeT>(new_indices);
@@ -725,10 +728,10 @@ Literal LiteralBase::SliceInternal(
 
 Literal LiteralBase::Slice(absl::Span<const int64> start_indices,
                            absl::Span<const int64> limit_indices) const {
-  CHECK(ShapeUtil::IsArray(shape())) << "tuple is not supported for slice";
+  CHECK(shape().IsArray()) << "tuple is not supported for slice";
 
   DimensionVector result_dimensions;
-  for (int64 dnum = 0; dnum < ShapeUtil::Rank(shape()); ++dnum) {
+  for (int64 dnum = 0; dnum < shape().rank(); ++dnum) {
     CHECK_GE(start_indices[dnum], 0);
     CHECK_LE(limit_indices[dnum], shape().dimensions(dnum))
         << "dnum = " << dnum;
@@ -768,6 +771,8 @@ Literal LiteralBase::Slice(absl::Span<const int64> start_indices,
       return SliceInternal<double>(result_shape, start_indices);
     case C64:
       return SliceInternal<complex64>(result_shape, start_indices);
+    case C128:
+      return SliceInternal<complex128>(result_shape, start_indices);
     default:
       LOG(FATAL) << "not yet implemented: "
                  << PrimitiveType_Name(result_shape.element_type());
@@ -816,6 +821,10 @@ string LiteralBase::GetAsString(absl::Span<const int64> multi_index,
       complex64 c = Get<complex64>(multi_index, shape_index);
       return StrCat("(", c.real(), ", ", c.imag(), ")");
     }
+    case C128: {
+      complex128 c = Get<complex128>(multi_index, shape_index);
+      return StrCat("(", c.real(), ", ", c.imag(), ")");
+    }
     default:
       LOG(FATAL) << PrimitiveType_Name(subshape.element_type());
   }
@@ -870,6 +879,11 @@ string LiteralBase::GetSparseElementAsString(
           GetSparseElement<complex64>(sparse_element_number, shape_index);
       return StrCat("(", c.real(), ", ", c.imag(), ")");
     }
+    case C128: {
+      complex128 c =
+          GetSparseElement<complex128>(sparse_element_number, shape_index);
+      return StrCat("(", c.real(), ", ", c.imag(), ")");
+    }
     default:
       LOG(FATAL) << "Invalid element type for sparse arrays: "
                  << PrimitiveType_Name(subshape.element_type());
@@ -906,7 +920,7 @@ size_t LiteralBase::Hash() const {
 
   ShapeUtil::ForEachSubshape(
       shape(), [&](const Shape& subshape, const ShapeIndex& index) {
-        if (!ShapeUtil::IsArray(subshape)) {
+        if (!subshape.IsArray()) {
           return;
         }
 
@@ -998,6 +1012,9 @@ void LiteralBase::Piece::SortSparseElements() {
     case C64:
       SortSparseElementsInternal<complex64>();
       break;
+    case C128:
+      SortSparseElementsInternal<complex128>();
+      break;
     case F16:
       SortSparseElementsInternal<half>();
       break;
@@ -1028,20 +1045,21 @@ string ShapeToString(bool print_layout, const Shape& shape) {
 }
 
 void ToStringHelper(const LiteralBase& literal, const ShapeIndex& shape_index,
-                    bool print_layout, std::vector<string>* pieces);
+                    bool print_shape, bool print_layout,
+                    std::vector<string>* pieces);
 
 void TupleToStringHelper(const LiteralBase& literal,
-                         const ShapeIndex& shape_index, bool print_layout,
-                         std::vector<string>* pieces) {
+                         const ShapeIndex& shape_index, bool print_shape,
+                         bool print_layout, std::vector<string>* pieces) {
   const Shape& subshape = ShapeUtil::GetSubshape(literal.shape(), shape_index);
-  pieces->push_back(ShapeToString(print_layout, subshape));
-  pieces->push_back(" (\n");
+  pieces->push_back("(\n");
   std::vector<string> tuple_pieces;
   for (int i = 0; i < ShapeUtil::TupleElementCount(subshape); ++i) {
     ShapeIndex element_index = shape_index;
     element_index.push_back(i);
     std::vector<string> element_pieces;
-    ToStringHelper(literal, element_index, print_layout, &element_pieces);
+    ToStringHelper(literal, element_index, print_shape, print_layout,
+                   &element_pieces);
     tuple_pieces.push_back(absl::StrJoin(element_pieces, ""));
   }
   pieces->push_back(absl::StrJoin(tuple_pieces, ",\n"));
@@ -1049,11 +1067,13 @@ void TupleToStringHelper(const LiteralBase& literal,
 }
 
 void SparseArrayToStringHelper(const LiteralBase& literal,
-                               const Shape& subshape, bool print_layout,
-                               std::vector<string>* pieces) {
-  pieces->push_back(ShapeToString(print_layout, subshape));
+                               const Shape& subshape, bool print_shape,
+                               bool print_layout, std::vector<string>* pieces) {
+  if (print_shape) {
+    pieces->push_back(ShapeToString(print_layout, subshape));
+  }
   pieces->push_back("{");
-  int64 rank = ShapeUtil::Rank(subshape);
+  int64 rank = subshape.rank();
   int64 num_elements = literal.sparse_element_count();
   for (int64 i = 0; i < num_elements; ++i) {
     if (i > 0) {
@@ -1073,10 +1093,10 @@ void SparseArrayToStringHelper(const LiteralBase& literal,
 }
 
 void DenseArrayToStringHelper(const LiteralBase& literal,
-                              const ShapeIndex& shape_index, bool print_layout,
-                              std::vector<string>* pieces) {
+                              const ShapeIndex& shape_index, bool print_shape,
+                              bool print_layout, std::vector<string>* pieces) {
   const Shape& subshape = ShapeUtil::GetSubshape(literal.shape(), shape_index);
-  int64 rank = ShapeUtil::Rank(subshape);
+  int64 rank = subshape.rank();
 
   std::function<void(absl::Span<const int64> dimensions, std::vector<int64>*)>
       to_string_recursive = [&](absl::Span<const int64> dimensions,
@@ -1135,7 +1155,7 @@ void DenseArrayToStringHelper(const LiteralBase& literal,
         }
       };
 
-  if (rank > 1) {
+  if (print_shape) {
     pieces->push_back(ShapeToString(print_layout, subshape));
     pieces->push_back(" ");
   }
@@ -1146,19 +1166,23 @@ void DenseArrayToStringHelper(const LiteralBase& literal,
 }
 
 void ToStringHelper(const LiteralBase& literal, const ShapeIndex& shape_index,
-                    bool print_layout, std::vector<string>* pieces) {
+                    bool print_shape, bool print_layout,
+                    std::vector<string>* pieces) {
   const Shape& subshape = ShapeUtil::GetSubshape(literal.shape(), shape_index);
   CHECK(LayoutUtil::HasLayout(literal.shape()));
   CHECK(LayoutUtil::HasLayout(subshape));
-  if (ShapeUtil::IsTuple(subshape)) {
-    TupleToStringHelper(literal, shape_index, print_layout, pieces);
-  } else if (ShapeUtil::IsToken(subshape)) {
+  if (subshape.IsTuple()) {
+    TupleToStringHelper(literal, shape_index, print_shape, print_layout,
+                        pieces);
+  } else if (subshape.IsToken()) {
     pieces->push_back("token");
   } else if (LayoutUtil::IsSparseArray(subshape)) {
-    SparseArrayToStringHelper(literal, subshape, print_layout, pieces);
+    SparseArrayToStringHelper(literal, subshape, print_shape, print_layout,
+                              pieces);
   } else {
     CHECK(LayoutUtil::IsDenseArray(subshape));
-    DenseArrayToStringHelper(literal, shape_index, print_layout, pieces);
+    DenseArrayToStringHelper(literal, shape_index, print_shape, print_layout,
+                             pieces);
   }
 }
 
@@ -1169,10 +1193,27 @@ int64 LiteralBase::sparse_element_count() const {
   return sparse_indices()->index_count();
 }
 
-string LiteralBase::ToString(bool print_layout) const {
+string LiteralBase::ToString() const {
   std::vector<string> pieces;
   CHECK(LayoutUtil::HasLayout(this->shape()));
-  ToStringHelper(*this, {}, print_layout, &pieces);
+  ToStringHelper(*this, {}, /*print_shape=*/true,
+                 /*print_layout=*/false, &pieces);
+  return absl::StrJoin(pieces, "");
+}
+
+string LiteralBase::ToStringWithoutShape() const {
+  std::vector<string> pieces;
+  CHECK(LayoutUtil::HasLayout(this->shape()));
+  ToStringHelper(*this, {}, /*print_shape=*/false,
+                 /*print_layout=*/false, &pieces);
+  return absl::StrJoin(pieces, "");
+}
+
+string LiteralBase::ToStringWithLayout() const {
+  std::vector<string> pieces;
+  CHECK(LayoutUtil::HasLayout(this->shape()));
+  ToStringHelper(*this, {}, /*print_shape=*/true,
+                 /*print_layout=*/true, &pieces);
   return absl::StrJoin(pieces, "");
 }
 
@@ -1193,7 +1234,7 @@ namespace {
 template <typename NativeSrcT, typename NativeDestT, typename ConverterType>
 Literal ConvertBetweenNativeTypesWithConverter(const LiteralBase& src_literal,
                                                const ConverterType& converter) {
-  CHECK(ShapeUtil::IsArray(src_literal.shape()));
+  CHECK(src_literal.shape().IsArray());
   Literal result_literal(ShapeUtil::ChangeElementType(
       src_literal.shape(),
       primitive_util::NativeToPrimitiveType<NativeDestT>()));
@@ -1208,7 +1249,24 @@ Literal ConvertBetweenNativeTypesWithConverter(const LiteralBase& src_literal,
 }
 
 template <typename NativeSrcT, typename NativeDestT>
-Literal ConvertBetweenNativeTypes(const LiteralBase& src_literal) {
+typename std::enable_if<(std::is_same<NativeSrcT, Eigen::half>::value) &&
+                            (std::is_same<NativeDestT, complex64>::value ||
+                             std::is_same<NativeDestT, complex128>::value),
+                        Literal>::type
+ConvertBetweenNativeTypes(const LiteralBase& src_literal) {
+  auto converter = [](NativeSrcT src) {
+    return NativeDestT(static_cast<typename NativeDestT::value_type>(src));
+  };
+  return ConvertBetweenNativeTypesWithConverter<NativeSrcT, NativeDestT>(
+      src_literal, converter);
+}
+
+template <typename NativeSrcT, typename NativeDestT>
+typename std::enable_if<(!std::is_same<NativeSrcT, Eigen::half>::value) ||
+                            (!std::is_same<NativeDestT, complex64>::value &&
+                             !std::is_same<NativeDestT, complex128>::value),
+                        Literal>::type
+ConvertBetweenNativeTypes(const LiteralBase& src_literal) {
   auto converter = [](NativeSrcT src) { return static_cast<NativeDestT>(src); };
   return ConvertBetweenNativeTypesWithConverter<NativeSrcT, NativeDestT>(
       src_literal, converter);
@@ -1252,22 +1310,6 @@ BitcastBetweenNativeTypes(const LiteralBase& src_literal) {
   LOG(FATAL) << "Invalid bitcast between types of different sizes.";
 }
 
-template <PrimitiveType primitive_src_type>
-Literal ConvertToC64(const LiteralBase& src_literal) {
-  CHECK(ShapeUtil::IsArray(src_literal.shape()));
-  Literal result_literal(
-      ShapeUtil::ChangeElementType(src_literal.shape(), C64));
-  using NativeSrcT =
-      typename primitive_util::PrimitiveTypeToNative<primitive_src_type>::type;
-  absl::Span<const NativeSrcT> src_data = src_literal.data<NativeSrcT>();
-  absl::Span<complex64> dest_data = result_literal.data<complex64>();
-  int64 num_elements = src_literal.element_count();
-  for (int64 i = 0; i < num_elements; ++i) {
-    dest_data[i] = complex64(static_cast<float>(src_data[i]), 0);
-  }
-  return result_literal;
-}
-
 template <PrimitiveType primitive_src_type, PrimitiveType primitive_dest_type>
 Literal ConvertIfTypesMatch(const LiteralBase& src_literal, bool bitcast) {
   CHECK_EQ(primitive_src_type, src_literal.shape().element_type());
@@ -1297,9 +1339,11 @@ StatusOr<Literal> ConvertIfDestTypeMatches(const LiteralBase& src_literal,
                                                            bitcast);
     CONVERT_IF_TYPES_MATCH(PRED)
     CONVERT_IF_TYPES_MATCH(S8)
+    CONVERT_IF_TYPES_MATCH(S16)
     CONVERT_IF_TYPES_MATCH(S32)
     CONVERT_IF_TYPES_MATCH(S64)
     CONVERT_IF_TYPES_MATCH(U8)
+    CONVERT_IF_TYPES_MATCH(U16)
     CONVERT_IF_TYPES_MATCH(U32)
     CONVERT_IF_TYPES_MATCH(U64)
     CONVERT_IF_TYPES_MATCH(F16)
@@ -1308,10 +1352,15 @@ StatusOr<Literal> ConvertIfDestTypeMatches(const LiteralBase& src_literal,
     CONVERT_IF_TYPES_MATCH(BF16)
 #undef CONVERT_IF_TYPES_MATCH
     case C64:
-      if (!bitcast) {
-        return ConvertToC64<primitive_src_type>(src_literal);
+      if (bitcast) {
+        break;
       }
-      break;
+      return ConvertIfTypesMatch<primitive_src_type, C64>(src_literal, false);
+    case C128:
+      if (bitcast) {
+        break;
+      }
+      return ConvertIfTypesMatch<primitive_src_type, C128>(src_literal, false);
     // Other types are not yet supported.
     default:
       break;
@@ -1324,7 +1373,7 @@ StatusOr<Literal> ConvertIfDestTypeMatches(const LiteralBase& src_literal,
 StatusOr<Literal> ConvertSwitch(const LiteralBase& literal,
                                 PrimitiveType primitive_dest_type,
                                 bool bitcast) {
-  TF_RET_CHECK(ShapeUtil::IsArray(literal.shape()));
+  TF_RET_CHECK(literal.shape().IsArray());
   if (literal.shape().element_type() == primitive_dest_type) {
     return literal.Clone();
   }
@@ -1335,9 +1384,11 @@ StatusOr<Literal> ConvertSwitch(const LiteralBase& literal,
                                             bitcast);
     CONVERT_IF_DEST_TYPE_MATCHES(PRED)
     CONVERT_IF_DEST_TYPE_MATCHES(S8)
+    CONVERT_IF_DEST_TYPE_MATCHES(S16)
     CONVERT_IF_DEST_TYPE_MATCHES(S32)
     CONVERT_IF_DEST_TYPE_MATCHES(S64)
     CONVERT_IF_DEST_TYPE_MATCHES(U8)
+    CONVERT_IF_DEST_TYPE_MATCHES(U16)
     CONVERT_IF_DEST_TYPE_MATCHES(U32)
     CONVERT_IF_DEST_TYPE_MATCHES(U64)
     CONVERT_IF_DEST_TYPE_MATCHES(F16)
@@ -1377,7 +1428,7 @@ StatusOr<Literal> LiteralBase::BitcastConvert(
 }
 
 StatusOr<Literal> LiteralBase::ConvertToShape(const Shape& dest_shape) const {
-  if (!ShapeUtil::IsTuple(dest_shape)) {
+  if (!dest_shape.IsTuple()) {
     return Convert(dest_shape.element_type());
   }
   std::vector<Literal> elements;
@@ -1409,7 +1460,7 @@ StatusOr<Literal> LiteralBase::ConvertToShape(const Shape& dest_shape) const {
 template <typename NativeT>
 bool LiteralBase::Piece::EqualElementsInternal(
     const LiteralBase::Piece& other, std::vector<int64>* multi_index) const {
-  if (multi_index->size() == ShapeUtil::Rank(subshape())) {
+  if (multi_index->size() == subshape().rank()) {
     return (Get<NativeT>(*multi_index) == other.Get<NativeT>(*multi_index));
   }
   for (int64 i = 0; i < subshape().dimensions(multi_index->size()); ++i) {
@@ -1459,6 +1510,8 @@ bool LiteralBase::Piece::EqualElements(const LiteralBase::Piece& other) const {
       return EqualElementsInternal<bfloat16>(other, &multi_index);
     case C64:
       return EqualElementsInternal<complex64>(other, &multi_index);
+    case C128:
+      return EqualElementsInternal<complex128>(other, &multi_index);
     default:
       LOG(FATAL) << "Unimplemented: LiteralBase::Piece::EqualElements for type "
                  << PrimitiveType_Name(subshape().element_type());
@@ -1472,7 +1525,7 @@ bool LiteralBase::operator==(const LiteralBase& other) const {
 
   return root_piece().ForEachSubpieceWithBool(
       [&](const ShapeIndex& index, const Piece& piece) {
-        if (!ShapeUtil::IsArray(piece.subshape())) {
+        if (!piece.subshape().IsArray()) {
           return true;
         }
 
@@ -1502,7 +1555,7 @@ static bool AllElementsEqualValue(absl::Span<const NativeT> data,
 bool LiteralBase::IsAll(int8 value) const {
   return root_piece().ForEachSubpieceWithBool([&](const ShapeIndex& index,
                                                   const Piece& piece) {
-    if (!ShapeUtil::IsArray(piece.subshape())) {
+    if (!piece.subshape().IsArray()) {
       return true;
     }
 
@@ -1570,30 +1623,24 @@ bool LiteralBase::IsAll(int8 value) const {
 bool LiteralBase::IsAllFloat(float value) const {
   return root_piece().ForEachSubpieceWithBool(
       [&](const ShapeIndex& index, const Piece& piece) {
-        if (!ShapeUtil::IsArray(piece.subshape())) {
+        if (!piece.subshape().IsArray()) {
           return true;
         }
 
-        auto piece_is_all = [&]() {
-          switch (shape().element_type()) {
-            case F32:
-              return AllElementsEqualValue<float>(piece.data<float>(), value);
-            case F64:
-              return AllElementsEqualValue<double>(piece.data<double>(), value);
-            case F16:
-              return AllElementsEqualValue<half>(piece.data<half>(),
-                                                 static_cast<half>(value));
-            case BF16:
-              return AllElementsEqualValue<bfloat16>(
-                  piece.data<bfloat16>(), static_cast<bfloat16>(value));
-            default:
-              return false;
-          }
-        };
-        if (!piece_is_all()) {
-          return false;
+        switch (shape().element_type()) {
+          case F32:
+            return AllElementsEqualValue<float>(piece.data<float>(), value);
+          case F64:
+            return AllElementsEqualValue<double>(piece.data<double>(), value);
+          case F16:
+            return AllElementsEqualValue<half>(piece.data<half>(),
+                                               static_cast<half>(value));
+          case BF16:
+            return AllElementsEqualValue<bfloat16>(
+                piece.data<bfloat16>(), static_cast<bfloat16>(value));
+          default:
+            return false;
         }
-        return true;
       });
 }
 
@@ -1602,6 +1649,9 @@ bool LiteralBase::IsAllComplex(complex64 value) const {
     case C64:
       return AllElementsEqualValue<complex64>(root_piece().data<complex64>(),
                                               value);
+    case C128:
+      return AllElementsEqualValue<complex128>(root_piece().data<complex128>(),
+                                               value);
     default:
       return false;
   }
@@ -1610,7 +1660,7 @@ bool LiteralBase::IsAllComplex(complex64 value) const {
 bool LiteralBase::IsAllFirst() const {
   return root_piece().ForEachSubpieceWithBool(
       [&](const ShapeIndex& index, const Piece& piece) {
-        if (!ShapeUtil::IsArray(piece.subshape())) {
+        if (!piece.subshape().IsArray()) {
           return true;
         }
 
@@ -1681,6 +1731,11 @@ bool LiteralBase::IsAllFirst() const {
               auto data = piece.data<uint64>();
               return AllElementsEqualValue<uint64>(data, data[0]);
             }
+
+            case C128: {
+              auto data = piece.data<complex128>();
+              return AllElementsEqualValue<complex128>(data, data[0]);
+            }
             default:
               return false;
           }
@@ -1694,11 +1749,11 @@ bool LiteralBase::IsAllFirst() const {
 }
 
 bool LiteralBase::IsR1Iota() const {
-  if (!ShapeUtil::IsArray(shape())) {
+  if (!shape().IsArray()) {
     return false;
   }
 
-  if (ShapeUtil::Rank(shape()) != 1) {
+  if (shape().rank() != 1) {
     return false;
   }
 
@@ -1730,6 +1785,8 @@ bool LiteralBase::IsR1Iota() const {
         return Get<bfloat16>({idx}) == static_cast<bfloat16>(idx);
       case C64:
         return Get<complex64>({idx}) == complex64(idx, 0.0f);
+      case C128:
+        return Get<complex128>({idx}) == complex128(idx, 0.0f);
       case PRED:
         return Get<bool>({idx}) == idx;
       // token, opaque, tuple, etc. are all not iota.
@@ -1749,7 +1806,7 @@ bool LiteralBase::IsR1Iota() const {
 }
 
 bool LiteralBase::IsZero(absl::Span<const int64> indices) const {
-  CHECK(ShapeUtil::IsArray(shape()));
+  CHECK(shape().IsArray());
   switch (shape().element_type()) {
     case U8:
       return Get<uint8>(indices) == 0;
@@ -1773,6 +1830,8 @@ bool LiteralBase::IsZero(absl::Span<const int64> indices) const {
       return Get<double>(indices) == 0.0;
     case C64:
       return Get<complex64>(indices) == complex64(0.0f, 0.0f);
+    case C128:
+      return Get<complex128>(indices) == complex128(0.0f, 0.0f);
     case F16:
       return Get<half>(indices) == static_cast<half>(0.0f);
     case BF16:
@@ -1860,6 +1919,12 @@ void LiteralBase::Piece::WriteToProto(LiteralProto* proto) const {
         proto->add_c64s(value.imag());
       }
       break;
+    case C128:
+      for (complex128 value : data<complex128>()) {
+        proto->add_c128s(value.real());
+        proto->add_c128s(value.imag());
+      }
+      break;
     case TUPLE:
     case TOKEN:
       // Nothing to do but assign the shape which is done above.
@@ -1872,12 +1937,12 @@ void LiteralBase::Piece::WriteToProto(LiteralProto* proto) const {
 }
 
 const void* LiteralBase::Piece::untyped_data() const {
-  CHECK(ShapeUtil::IsArray(subshape())) << ShapeUtil::HumanString(subshape());
+  CHECK(subshape().IsArray()) << ShapeUtil::HumanString(subshape());
   return buffer();
 }
 
 void* LiteralBase::Piece::untyped_data() {
-  CHECK(ShapeUtil::IsArray(subshape())) << ShapeUtil::HumanString(subshape());
+  CHECK(subshape().IsArray()) << ShapeUtil::HumanString(subshape());
   return buffer();
 }
 
@@ -1908,14 +1973,12 @@ Status LiteralBase::Piece::CopyFromProto(const LiteralProto& proto) {
   if (LayoutUtil::IsSparseArray(subshape())) {
     // Compute the number of elements (indices) in the sparse shape and reserve
     // the necessary space in spare_indices.
-    TF_RET_CHECK(ShapeUtil::Rank(subshape()) != 0)
-        << "Scalar shapes cannot be sparse";
-    TF_RET_CHECK(proto.sparse_indices_size() % ShapeUtil::Rank(subshape()) == 0)
+    TF_RET_CHECK(subshape().rank() != 0) << "Scalar shapes cannot be sparse";
+    TF_RET_CHECK(proto.sparse_indices_size() % subshape().rank() == 0)
         << "Unexpected number of indices in proto ("
         << proto.sparse_indices_size() << ") for shape of rank "
-        << ShapeUtil::Rank(subshape());
-    const int64 index_count =
-        proto.sparse_indices_size() / ShapeUtil::Rank(subshape());
+        << subshape().rank();
+    const int64 index_count = proto.sparse_indices_size() / subshape().rank();
     sparse_indices()->Resize(index_count);
 
     // Copy the indices from the proto into the SparseIndexArray object.
@@ -1994,7 +2057,17 @@ Status LiteralBase::Piece::CopyFromProto(const LiteralProto& proto) {
       for (int64 i = 0; i < complex_data.size(); ++i) {
         complex_data[i] = complex64{proto.c64s(i * 2), proto.c64s(i * 2 + 1)};
       }
-    } break;
+      break;
+    }
+    case C128: {
+      auto complex_data = data<complex128>();
+      TF_RET_CHECK(proto.c128s_size() == complex_data.size() * 2);
+      for (int64 i = 0; i < complex_data.size(); ++i) {
+        complex_data[i] =
+            complex128{proto.c128s(i * 2), proto.c128s(i * 2 + 1)};
+      }
+      break;
+    }
     case TUPLE:
       return InvalidArgument("Should not be called on tuple shapes: %s",
                              ShapeUtil::HumanString(subshape()));
@@ -2040,8 +2113,8 @@ int64 LiteralBase::size_bytes(const ShapeIndex& shape_index) const {
 }
 
 string LiteralBase::GetR1U8AsString() const {
-  CHECK(ShapeUtil::IsArray(shape()));
-  CHECK_EQ(ShapeUtil::Rank(shape()), 1);
+  CHECK(shape().IsArray());
+  CHECK_EQ(shape().rank(), 1);
   CHECK_EQ(shape().element_type(), U8);
   return string(absl::bit_cast<const char*>(data<uint8>().data()),
                 ShapeUtil::ElementsIn(shape()));
@@ -2055,7 +2128,7 @@ void MutableBorrowingLiteral::CopyPieceSubtree(const Shape& shape,
       << ShapeUtil::HumanString(src_piece->subshape())
       << "dest_piece has shape: "
       << ShapeUtil::HumanString(dest_piece->subshape());
-  if (ShapeUtil::IsTuple(shape)) {
+  if (shape.IsTuple()) {
     for (int i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) {
       const Shape& subshape = shape.tuple_shapes(i);
 
@@ -2066,7 +2139,7 @@ void MutableBorrowingLiteral::CopyPieceSubtree(const Shape& shape,
 
       dest_piece->emplace_back(std::move(child_piece));
     }
-  } else if (ShapeUtil::IsArray(shape)) {
+  } else if (shape.IsArray()) {
     dest_piece->set_buffer(src_piece->buffer());
   } else {
     // If the shape is neither an array nor tuple, then it must be
@@ -2142,7 +2215,7 @@ MutableBorrowingLiteral::MutableBorrowingLiteral(const char* src_buf_ptr,
     : MutableLiteralBase() {
   shape_ = absl::make_unique<Shape>(shape);
   CHECK(LayoutUtil::HasLayout(*shape_));
-  CHECK(!ShapeUtil::IsTuple(*shape_));
+  CHECK(!shape_->IsTuple());
 
   root_piece_ = new Piece();
   root_piece_->set_buffer(const_cast<char*>(src_buf_ptr));
@@ -2169,14 +2242,14 @@ LiteralSlice::LiteralSlice(const LiteralBase& literal,
     : LiteralBase(), root_piece_(&literal.piece(view_root)) {}
 
 void BorrowingLiteral::BuildPieceSubtree(const Shape& shape, Piece* piece) {
-  CHECK(ShapeUtil::IsTuple(shape));
+  CHECK(shape.IsTuple());
   for (int i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) {
     const Shape& subshape = shape.tuple_shapes(i);
 
     auto child_piece = Piece();
     child_piece.set_subshape(&subshape);
 
-    if (ShapeUtil::IsTuple(subshape)) {
+    if (subshape.IsTuple()) {
       BuildPieceSubtree(subshape, &child_piece);
     }
 
@@ -2186,7 +2259,7 @@ void BorrowingLiteral::BuildPieceSubtree(const Shape& shape, Piece* piece) {
 
 BorrowingLiteral::BorrowingLiteral(const char* src_buf_ptr, const Shape& shape)
     : LiteralBase(), shape_(absl::make_unique<Shape>(shape)) {
-  CHECK(ShapeUtil::IsArray(*shape_));
+  CHECK(shape_->IsArray());
   CHECK(LayoutUtil::HasLayout(*shape_));
 
   root_piece_ = Piece();
@@ -2197,7 +2270,7 @@ BorrowingLiteral::BorrowingLiteral(const char* src_buf_ptr, const Shape& shape)
 BorrowingLiteral::BorrowingLiteral(absl::Span<const char* const> src_buf_ptrs,
                                    const Shape& shape)
     : LiteralBase(), shape_(absl::make_unique<Shape>(shape)) {
-  CHECK(ShapeUtil::IsTuple(*shape_));
+  CHECK(shape_->IsTuple());
   CHECK(!ShapeUtil::IsNestedTuple(*shape_));
   CHECK_EQ(src_buf_ptrs.size(), ShapeUtil::TupleElementCount(*shape_));
   root_piece_ = Piece();
@@ -2206,7 +2279,7 @@ BorrowingLiteral::BorrowingLiteral(absl::Span<const char* const> src_buf_ptrs,
 
   for (int i = 0; i < src_buf_ptrs.size(); ++i) {
     const auto& src_shape = shape_->tuple_shapes(i);
-    CHECK(ShapeUtil::IsArray(src_shape));
+    CHECK(src_shape.IsArray());
     root_piece_.child(i).set_buffer(const_cast<char*>(src_buf_ptrs[i]));
   }
 }
diff --git a/tensorflow/compiler/xla/literal.h b/tensorflow/compiler/xla/literal.h
index fa9a71af4ceb998a7a289443cbef70eb52cb1a11..c418be895d6c3faa6a85ca2c73c6f42b0a021104 100644
--- a/tensorflow/compiler/xla/literal.h
+++ b/tensorflow/compiler/xla/literal.h
@@ -92,9 +92,20 @@ class LiteralBase {
   // array.
   string GetR1U8AsString() const;
 
-  // Returns a string representation of the literal value.
-  // Warning: this function can take minutes for multi-million element Literals.
-  string ToString(bool print_layout = false) const;
+  // Returns a string representation of the literal value. The Shape of the
+  // literal is a prefix of the literal value in the string.
+
+  // Warning: this function can take minutes for multi-million
+  // element Literals.
+  string ToString() const;
+
+  // Returns a string representation of the literal value which does *not*
+  // include the shape string.
+  string ToStringWithoutShape() const;
+
+  // Returns a string representation of the literal value which includes the
+  // shape string with its layout.does *not* include the shape string.
+  string ToStringWithLayout() const;
 
   // Gets an element in the literal at the given index. The multi_index is
   // CHECKed against the dimension sizes.
@@ -856,7 +867,7 @@ class BorrowingLiteral : public LiteralBase {
 
 template <typename NativeT>
 absl::Span<const NativeT> LiteralBase::Piece::data() const {
-  DCHECK(ShapeUtil::IsArray(subshape())) << ShapeUtil::HumanString(subshape());
+  DCHECK(subshape().IsArray()) << ShapeUtil::HumanString(subshape());
   DCHECK_EQ(subshape().element_type(),
             primitive_util::NativeToPrimitiveType<NativeT>())
       << "Attempting to access "
@@ -869,7 +880,7 @@ absl::Span<const NativeT> LiteralBase::Piece::data() const {
 
 template <typename NativeT>
 absl::Span<NativeT> LiteralBase::Piece::data() {
-  DCHECK(ShapeUtil::IsArray(subshape())) << ShapeUtil::HumanString(subshape());
+  DCHECK(subshape().IsArray()) << ShapeUtil::HumanString(subshape());
   DCHECK_EQ(subshape().element_type(),
             primitive_util::NativeToPrimitiveType<NativeT>())
       << "Attempting to access "
@@ -950,8 +961,12 @@ void MutableLiteralBase::AppendSparseElement(
   Piece& p = piece(shape_index);
   const Shape& subshape = p.subshape();
   CHECK(LayoutUtil::IsSparseArray(subshape));
-  int64 rank = ShapeUtil::Rank(subshape);
+  int64 rank = subshape.rank();
   CHECK_EQ(multi_index.size(), rank);
+  for (int64 i = 0; i < rank; ++i) {
+    CHECK_GE(multi_index[i], 0);
+    CHECK_LT(multi_index[i], subshape.dimensions(i));
+  }
   int64 last_element = p.sparse_indices()->index_count();
   CHECK_LT(last_element, LayoutUtil::MaxSparseElements(subshape.layout()));
   p.sparse_indices()->Append(multi_index);
@@ -966,7 +981,7 @@ void LiteralBase::EachCell(
   if (ShapeUtil::IsZeroElementArray(shape())) {
     return;
   }
-  std::vector<int64> indices(ShapeUtil::Rank(shape()), 0);
+  std::vector<int64> indices(shape().rank(), 0);
   do {
     per_cell(indices, Get<NativeT>(indices));
   } while (IndexUtil::BumpIndices(shape(), absl::MakeSpan(indices)));
@@ -974,8 +989,8 @@ void LiteralBase::EachCell(
 
 template <typename NativeT>
 inline void MutableLiteralBase::PopulateR1(absl::Span<const NativeT> values) {
-  CHECK(ShapeUtil::IsArray(shape()));
-  CHECK_EQ(ShapeUtil::Rank(shape()), 1);
+  CHECK(shape().IsArray());
+  CHECK_EQ(shape().rank(), 1);
   CHECK_EQ(ShapeUtil::ElementsIn(shape()), values.size());
   CHECK_EQ(shape().element_type(),
            primitive_util::NativeToPrimitiveType<NativeT>());
@@ -986,8 +1001,8 @@ inline void MutableLiteralBase::PopulateR1(absl::Span<const NativeT> values) {
 template <typename NativeT>
 void MutableLiteralBase::PopulateR2(
     std::initializer_list<std::initializer_list<NativeT>> values) {
-  CHECK(ShapeUtil::IsArray(shape()));
-  CHECK_EQ(ShapeUtil::Rank(shape()), 2);
+  CHECK(shape().IsArray());
+  CHECK_EQ(shape().rank(), 2);
   CHECK_EQ(shape().element_type(),
            primitive_util::NativeToPrimitiveType<NativeT>());
 
@@ -1010,10 +1025,10 @@ void MutableLiteralBase::PopulateR2(
 
 template <typename NativeT>
 void MutableLiteralBase::PopulateFromArray(const Array<NativeT>& values) {
-  CHECK(ShapeUtil::IsArray(shape()));
+  CHECK(shape().IsArray());
   CHECK_EQ(shape().element_type(),
            primitive_util::NativeToPrimitiveType<NativeT>());
-  CHECK_EQ(ShapeUtil::Rank(shape()), values.num_dimensions());
+  CHECK_EQ(shape().rank(), values.num_dimensions());
   for (int dim = 0; dim < values.num_dimensions(); ++dim) {
     CHECK_EQ(values.dim(dim), shape().dimensions(dim));
   }
@@ -1042,7 +1057,7 @@ void MutableLiteralBase::PopulateSparse(SparseIndexArray indices,
                                         absl::Span<const NativeT> values,
                                         bool sort) {
   CHECK(LayoutUtil::IsSparseArray(shape()));
-  int rank = ShapeUtil::Rank(shape());
+  int rank = shape().rank();
   CHECK_EQ(indices.rank(), rank);
   int64 max_elements = LayoutUtil::MaxSparseElements(shape().layout());
   CHECK_LE(indices.max_indices(), max_elements);
@@ -1066,7 +1081,7 @@ template <typename NativeT, typename FnType>
 Status MutableLiteralBase::PopulateInternal(const FnType& generator,
                                             bool parallel) {
   const Shape& this_shape = shape();
-  const int64 rank = ShapeUtil::Rank(this_shape);
+  const int64 rank = this_shape.rank();
   TF_RET_CHECK(LayoutUtil::IsDenseArray(this_shape));
   TF_RET_CHECK(this_shape.element_type() ==
                primitive_util::NativeToPrimitiveType<NativeT>());
@@ -1118,7 +1133,7 @@ Status MutableLiteralBase::PopulateParallel(const FnType& generator) {
 
 template <typename NativeT>
 void MutableLiteralBase::PopulateWithValue(NativeT value) {
-  CHECK(ShapeUtil::IsArray(shape()));
+  CHECK(shape().IsArray());
   CHECK_EQ(shape().element_type(),
            primitive_util::NativeToPrimitiveType<NativeT>());
   for (NativeT& element : data<NativeT>()) {
diff --git a/tensorflow/compiler/xla/literal_comparison.cc b/tensorflow/compiler/xla/literal_comparison.cc
index b044f0ad73f13a0599e77f1f43888bc974e31f73..9b3de75dd4e9d495778af86fb8fc07909ab4ba81 100644
--- a/tensorflow/compiler/xla/literal_comparison.cc
+++ b/tensorflow/compiler/xla/literal_comparison.cc
@@ -46,68 +46,116 @@ uint16 GetRawValue(Eigen::half val) { return val.x; }
 // between the left-hand-side and right-hand-side, by bit-casting to UnsignedT
 // -- on miscompare, a nice error message is given in the AssertionFailure.
 template <typename FloatT, typename UnsignedT>
-Status CompareFloatsBitwiseEqual(FloatT lhs, FloatT rhs,
-                                 absl::Span<const int64> multi_index) {
+bool CompareFloatsBitwiseEqual(FloatT lhs, FloatT rhs,
+                               absl::Span<const int64> multi_index) {
+  auto ulhs = absl::bit_cast<UnsignedT>(GetRawValue(lhs));
+  auto urhs = absl::bit_cast<UnsignedT>(GetRawValue(rhs));
+  return ulhs == urhs;
+}
+
+// Templated comparator that specializes for float equality comparison with the
+// bitwise helper above (this is the un-specialized fallback, to just use the
+// default gunit implementation).
+template <typename NativeT>
+bool CompareEqual(NativeT lhs, NativeT rhs,
+                  absl::Span<const int64> multi_index) {
+  return lhs == rhs;
+}
+
+// Specializations for floating types that do bitwise comparisons when equality
+// comparison is requested.
+template <>
+bool CompareEqual<bfloat16>(bfloat16 lhs, bfloat16 rhs,
+                            absl::Span<const int64> multi_index) {
+  return CompareFloatsBitwiseEqual<bfloat16, uint16>(lhs, rhs, multi_index);
+}
+template <>
+bool CompareEqual<Eigen::half>(Eigen::half lhs, Eigen::half rhs,
+                               absl::Span<const int64> multi_index) {
+  return CompareFloatsBitwiseEqual<Eigen::half, uint16>(lhs, rhs, multi_index);
+}
+template <>
+bool CompareEqual<float>(float lhs, float rhs,
+                         absl::Span<const int64> multi_index) {
+  return CompareFloatsBitwiseEqual<float, uint32>(lhs, rhs, multi_index);
+}
+template <>
+bool CompareEqual<double>(double lhs, double rhs,
+                          absl::Span<const int64> multi_index) {
+  return CompareFloatsBitwiseEqual<double, uint64>(lhs, rhs, multi_index);
+}
+template <>
+bool CompareEqual<complex64>(complex64 lhs, complex64 rhs,
+                             absl::Span<const int64> multi_index) {
+  return CompareEqual<float>(lhs.real(), rhs.real(), multi_index) &&
+         CompareEqual<float>(lhs.imag(), rhs.imag(), multi_index);
+}
+template <>
+bool CompareEqual<complex128>(complex128 lhs, complex128 rhs,
+                              absl::Span<const int64> multi_index) {
+  return CompareEqual<double>(lhs.real(), rhs.real(), multi_index) &&
+         CompareEqual<double>(lhs.imag(), rhs.imag(), multi_index);
+}
+
+template <typename NativeT, typename UnsignedT>
+Status MakeBitwiseErrorStatus(NativeT lhs, NativeT rhs,
+                              absl::Span<const int64> multi_index) {
   auto ulhs = absl::bit_cast<UnsignedT>(GetRawValue(lhs));
   auto urhs = absl::bit_cast<UnsignedT>(GetRawValue(rhs));
   auto lhs_double = static_cast<double>(lhs);
   auto rhs_double = static_cast<double>(rhs);
-  if (ulhs != urhs) {
     return InvalidArgument(
         "floating values are not bitwise-equal; and equality testing "
         "was requested: %s=%g=%a vs %s=%g=%a at array index %s",
         StrCat(absl::Hex(ulhs)), lhs_double, lhs_double,
         StrCat(absl::Hex(urhs)), rhs_double, rhs_double,
         LiteralUtil::MultiIndexAsString(multi_index));
-  }
-  return Status::OK();
 }
 
-// Templated comparator that specializes for float equality comparison with the
-// bitwise helper above (this is the un-specialized fallback, to just use the
-// default gunit implementation).
 template <typename NativeT>
-Status CompareEqual(NativeT lhs, NativeT rhs,
-                    absl::Span<const int64> multi_index) {
-  if (lhs == rhs) {
-    return Status::OK();
-  }
+Status MakeErrorStatus(NativeT lhs, NativeT rhs,
+                       absl::Span<const int64> multi_index) {
   return InvalidArgument(
       "first mismatch at array index %s:\n  expected value: %s\n  actual "
       "value:   %s",
       LiteralUtil::MultiIndexAsString(multi_index), StrCat(lhs), StrCat(rhs));
 }
 
-// Specializations for floating types that do bitwise comparisons when equality
-// comparison is requested.
 template <>
-Status CompareEqual<bfloat16>(bfloat16 lhs, bfloat16 rhs,
-                              absl::Span<const int64> multi_index) {
-  return CompareFloatsBitwiseEqual<bfloat16, uint16>(lhs, rhs, multi_index);
+Status MakeErrorStatus(bfloat16 lhs, bfloat16 rhs,
+                       absl::Span<const int64> multi_index) {
+  return MakeBitwiseErrorStatus<bfloat16, uint16>(lhs, rhs, multi_index);
 }
 template <>
-Status CompareEqual<Eigen::half>(Eigen::half lhs, Eigen::half rhs,
-                                 absl::Span<const int64> multi_index) {
-  return CompareFloatsBitwiseEqual<Eigen::half, uint16>(lhs, rhs, multi_index);
+Status MakeErrorStatus(Eigen::half lhs, Eigen::half rhs,
+                       absl::Span<const int64> multi_index) {
+  return MakeBitwiseErrorStatus<Eigen::half, uint16>(lhs, rhs, multi_index);
 }
 template <>
-Status CompareEqual<float>(float lhs, float rhs,
-                           absl::Span<const int64> multi_index) {
-  return CompareFloatsBitwiseEqual<float, uint32>(lhs, rhs, multi_index);
+Status MakeErrorStatus(float lhs, float rhs,
+                       absl::Span<const int64> multi_index) {
+  return MakeBitwiseErrorStatus<float, uint32>(lhs, rhs, multi_index);
 }
 template <>
-Status CompareEqual<double>(double lhs, double rhs,
-                            absl::Span<const int64> multi_index) {
-  return CompareFloatsBitwiseEqual<double, uint64>(lhs, rhs, multi_index);
+Status MakeErrorStatus(double lhs, double rhs,
+                       absl::Span<const int64> multi_index) {
+  return MakeBitwiseErrorStatus<double, uint64>(lhs, rhs, multi_index);
 }
 template <>
-Status CompareEqual<complex64>(complex64 lhs, complex64 rhs,
-                               absl::Span<const int64> multi_index) {
-  auto res = CompareEqual<float>(lhs.real(), rhs.real(), multi_index);
-  if (!res.ok()) {
-    return res;
+Status MakeErrorStatus(complex64 lhs, complex64 rhs,
+                       absl::Span<const int64> multi_index) {
+  if (!CompareEqual<float>(lhs.real(), rhs.real(), multi_index)) {
+    return MakeErrorStatus(lhs.real(), rhs.real(), multi_index);
   }
-  return CompareEqual<float>(lhs.imag(), rhs.imag(), multi_index);
+  return MakeErrorStatus(lhs.imag(), rhs.imag(), multi_index);
+}
+template <>
+Status MakeErrorStatus(complex128 lhs, complex128 rhs,
+                       absl::Span<const int64> multi_index) {
+  if (!CompareEqual<double>(lhs.real(), rhs.real(), multi_index)) {
+    return MakeErrorStatus(lhs.real(), rhs.real(), multi_index);
+  }
+  return MakeErrorStatus(lhs.imag(), rhs.imag(), multi_index);
 }
 
 // A recursive function which iterates through every index of expected and
@@ -119,7 +167,11 @@ Status Equal(LiteralSlice expected, LiteralSlice actual,
   if (dimension == expected.shape().dimensions_size()) {
     NativeT expected_value = expected.Get<NativeT>(multi_index);
     NativeT actual_value = actual.Get<NativeT>(multi_index);
-    return CompareEqual<NativeT>(expected_value, actual_value, multi_index);
+    bool result =
+        CompareEqual<NativeT>(expected_value, actual_value, multi_index);
+    return result ? Status::OK()
+                  : MakeErrorStatus<NativeT>(expected_value, actual_value,
+                                             multi_index);
   }
 
   Status result;
@@ -134,53 +186,40 @@ Status Equal(LiteralSlice expected, LiteralSlice actual,
 // Gets the total element count.  For tuples, this is not the count of tuple
 // elements, but the sum of elements of each tuple element.
 int64 RecursiveElementCount(const Shape& shape) {
-  if (ShapeUtil::IsTuple(shape)) {
+  if (shape.IsTuple()) {
     const int64 tuple_elements = ShapeUtil::TupleElementCount(shape);
     int64 total = 0;
     for (int64 i = 0; i < tuple_elements; ++i) {
       total += RecursiveElementCount(ShapeUtil::GetTupleElementShape(shape, i));
     }
     return total;
-  } else if (ShapeUtil::IsArray(shape)) {
+  } else if (shape.IsArray()) {
     return ShapeUtil::ElementsIn(shape);
   } else {
     return 0;
   }
 }
 
-// Returns whether the actual and expected values are mismatched with respect to
-// nans. 'relaxed_nans' is interpreted as in xla::ErrorSpec.
+// Returns whether the given value is infinity.
 template <typename NativeT>
-bool NanMismatch(NativeT expected, NativeT actual, bool relaxed_nans) {
-  if (relaxed_nans) {
-    return !std::isnan(expected) && std::isnan(actual);
-  } else {
-    return std::isnan(expected) != std::isnan(actual);
-  }
-}
-
-template <>
-bool NanMismatch<complex64>(complex64 expected, complex64 actual,
-                            bool relaxed_nans) {
-  return NanMismatch<float>(expected.real(), actual.real(), relaxed_nans) ||
-         NanMismatch<float>(expected.imag(), actual.imag(), relaxed_nans);
+bool IsInf(NativeT val) {
+  return std::isinf(val);
 }
 
 template <>
-bool NanMismatch<half>(half expected, half actual, bool relaxed_nans) {
-  return NanMismatch<float>(static_cast<float>(expected),
-                            static_cast<float>(actual), relaxed_nans);
+bool IsInf<half>(half val) {
+  return std::isinf(static_cast<float>(val));
 }
 
-// Returns whether the given value is infinity.
+// Returns whether the given value is nan.
 template <typename NativeT>
-bool IsInf(NativeT val) {
-  return std::isinf(val);
+float IsNan(NativeT value) {
+  return std::isnan(value);
 }
 
 template <>
-bool IsInf<half>(half val) {
-  return std::isinf(static_cast<float>(val));
+float IsNan(half value) {
+  return IsNan<float>(static_cast<float>(value));
 }
 
 // Converts the given floating-point value to a string.
@@ -194,6 +233,11 @@ string FpValueToString<complex64>(complex64 value) {
   return absl::StrFormat("%8.4g + %8.4fi", value.real(), value.imag());
 }
 
+template <>
+string FpValueToString<complex128>(complex128 value) {
+  return absl::StrFormat("%8.4g + %8.4fi", value.real(), value.imag());
+}
+
 // Returns the absolute value of the given floating point value. This function
 // is used instead of std::abs directly in order to allow type-dependent
 // implementations for NearComparator.
@@ -273,7 +317,7 @@ class NearComparator {
     // If the shapes mismatch, we simply fail the expectation instead of
     // printing out data, as it's a type error rather than a value error.
     TF_RETURN_IF_ERROR(EqualShapes(expected_.shape(), actual_.shape()));
-    if (!ShapeUtil::IsArray(expected_.shape())) {
+    if (!expected_.shape().IsArray()) {
       return InvalidArgument("Expected array shape; got %s.",
                              ShapeUtil::HumanString(expected_.shape()));
     }
@@ -326,35 +370,59 @@ class NearComparator {
   // the given literal_index and keeps track of various mismatch statistics.
   template <typename T>
   void CompareValues(T expected, T actual, int64 linear_index) {
-    const bool is_nan_mismatch =
-        NanMismatch(expected, actual, error_.relaxed_nans);
     float abs_error;
     float rel_error;
-    if (CompareEqual<T>(expected, actual, {linear_index}).ok()) {
+    if (CompareEqual<T>(expected, actual, {linear_index})) {
       abs_error = 0;
       rel_error = 0;
-    } else if (is_nan_mismatch) {
-      num_nan_mismatches_++;
-      // A nan mismatch is considered to have infinite error. rel_error is used
-      // for sorting a std::set of the top mismatchs, and a nan value here will
-      // result in undefined behavior because nan's do not satisfy the strict
-      // weak ordering requirement of std containers.
-      abs_error = std::numeric_limits<float>::infinity();
-      rel_error = std::numeric_limits<float>::infinity();
+    } else if (IsNan(expected) || IsNan(actual)) {
+      if ((!error_.relaxed_nans && IsNan(expected) != IsNan(actual)) ||
+          (error_.relaxed_nans && !IsNan(expected) && IsNan(actual))) {
+        num_nan_mismatches_++;
+        // A nan mismatch is considered to have infinite error. rel_error is
+        // used for sorting a std::set of the top mismatchs, and a nan value
+        // here will result in undefined behavior because nan's do not satisfy
+        // the strict weak ordering requirement of std containers.
+        abs_error = std::numeric_limits<float>::infinity();
+        rel_error = std::numeric_limits<float>::infinity();
+      } else {
+        abs_error = 0;
+        rel_error = 0;
+      }
+    } else if (IsInf(actual) && !IsInf(expected) && error_.fewer_infs_ok) {
+      // `fewer_infs_ok` gives us the option of comparing as though `actual`
+      // were float_max/min rather than inf.
+      T actual_finite = actual > T{0} ? std::numeric_limits<T>::max()
+                                      : std::numeric_limits<T>::lowest();
+      abs_error = FpAbsoluteValue(actual_finite - expected);
+
+      // Avoid division by 0 even though it's well-defined because ubsan can be
+      // configured to treat this as a fatal error.
+      if (expected != T{0}) {
+        rel_error = abs_error / FpAbsoluteValue(expected);
+      } else {
+        rel_error = std::numeric_limits<float>::infinity();
+      }
     } else if (IsInf(expected) || IsInf(actual)) {
       // If either the expected or actual value is infinity but not both,
       // then both absolute and relative error are regarded as inifity.
-      CHECK(!CompareEqual(expected, actual, {linear_index}).ok());
+      CHECK(!CompareEqual(expected, actual, {linear_index}));
       abs_error = std::numeric_limits<float>::infinity();
       rel_error = std::numeric_limits<float>::infinity();
     } else {
       abs_error = FpAbsoluteValue(actual - expected);
-      rel_error = abs_error / FpAbsoluteValue(expected);
+
+      // Avoid division by 0 even though it's well-defined because ubsan can be
+      // configured to treat this as a fatal error.
+      if (expected != T{0}) {
+        rel_error = abs_error / FpAbsoluteValue(expected);
+      } else {
+        rel_error = std::numeric_limits<float>::infinity();
+      }
     }
     const bool is_abs_mismatch = abs_error > error_.abs;
     const bool is_rel_mismatch = rel_error > error_.rel;
-    const bool is_mismatch =
-        is_nan_mismatch || (is_abs_mismatch && is_rel_mismatch);
+    const bool is_mismatch = is_abs_mismatch && is_rel_mismatch;
 
     // Update the error of the relative bucket only if the *absolute* error
     // bound is exceeded and vice versa.
@@ -389,7 +457,7 @@ class NearComparator {
     mismatches_.data<bool>()[linear_index] = true;
   }
 
-  // For complex64 types, we compare real and imaginary parts individually.
+  // For complex types, we compare real and imaginary parts individually.
   void CompareValues(complex64 expected, complex64 actual, int64 linear_index) {
     bool mismatch = false;
     CompareValues<float>(expected.real(), actual.real(), linear_index);
@@ -412,6 +480,29 @@ class NearComparator {
     mismatches_.data<bool>()[linear_index] = mismatch;
   }
 
+  void CompareValues(complex128 expected, complex128 actual,
+                     int64 linear_index) {
+    bool mismatch = false;
+    CompareValues<double>(expected.real(), actual.real(), linear_index);
+    if (mismatches_.data<bool>()[linear_index] == true) {
+      mismatch = true;
+      // Delay the mismatch count increase for real part, instead increase
+      // mismatch by 1 for the entire complex number.
+      num_mismatches_--;
+    }
+    CompareValues<double>(expected.imag(), actual.imag(), linear_index);
+    if (mismatches_.data<bool>()[linear_index] == true) {
+      mismatch = true;
+      // Delay the mismatch count increase for imag part, instead increase
+      // mismatch by 1 for the entire complex number.
+      num_mismatches_--;
+    }
+    if (mismatch == true) {
+      num_mismatches_++;
+    }
+    mismatches_.data<bool>()[linear_index] = mismatch;
+  }
+
   // Compares the two literals elementwise.
   void CompareLiterals() {
     // Fast path optimization for the case were layouts match.
@@ -425,7 +516,7 @@ class NearComparator {
       }
       return;
     }
-    std::vector<int64> multi_index(ShapeUtil::Rank(actual_.shape()), 0);
+    std::vector<int64> multi_index(actual_.shape().rank(), 0);
     CompareLiteralsSlow(0, &multi_index);
   }
 
@@ -620,6 +711,9 @@ Status EqualHelper(const LiteralSlice& expected, const LiteralSlice& actual) {
     case C64:
       result = Equal<complex64>(expected, actual, index, 0);
       break;
+    case C128:
+      result = Equal<complex128>(expected, actual, index, 0);
+      break;
     case TUPLE: {
       for (int i = 0; i < ShapeUtil::TupleElementCount(expected.shape()); ++i) {
         result.Update(EqualHelper(LiteralSlice(expected, {i}),
@@ -642,12 +736,12 @@ Status EqualHelper(const LiteralSlice& expected, const LiteralSlice& actual) {
 // via recursion. shape_index is the ShapeIndex of expected (or actual)
 // currently being compared.
 Status NearHelper(const LiteralSlice& expected, const LiteralSlice& actual,
-                  const ErrorSpec& error, bool detailed_message,
+                  const ErrorSpec& error, absl::optional<bool> detailed_message,
                   const MiscompareCallback& miscompare_callback,
                   const ShapeIndex& shape_index) {
   TF_RETURN_IF_ERROR(EqualShapes(expected.shape(), actual.shape()));
 
-  if (ShapeUtil::IsTuple(expected.shape())) {
+  if (expected.shape().IsTuple()) {
     Status return_status;
     for (int64 i = 0; i < ShapeUtil::TupleElementCount(expected.shape()); ++i) {
       const auto expected_element = LiteralSlice(expected, {i});
@@ -683,26 +777,32 @@ Status NearHelper(const LiteralSlice& expected, const LiteralSlice& actual,
 
   if (ShapeUtil::ElementIsFloating(expected.shape()) ||
       ShapeUtil::ElementIsComplex(expected.shape())) {
+    bool use_detailed_message = detailed_message.value_or(
+        ShapeUtil::ElementsIn(expected.shape()) >= 64);
     switch (expected.shape().element_type()) {
       case BF16:
         return NearComparator<bfloat16>::Compare(
-            expected, actual, error, detailed_message, miscompare_callback);
+            expected, actual, error, use_detailed_message, miscompare_callback);
         break;
       case F16:
         return NearComparator<half>::Compare(
-            expected, actual, error, detailed_message, miscompare_callback);
+            expected, actual, error, use_detailed_message, miscompare_callback);
         break;
       case F32:
         return NearComparator<float>::Compare(
-            expected, actual, error, detailed_message, miscompare_callback);
+            expected, actual, error, use_detailed_message, miscompare_callback);
         break;
       case F64:
         return NearComparator<double>::Compare(
-            expected, actual, error, detailed_message, miscompare_callback);
+            expected, actual, error, use_detailed_message, miscompare_callback);
         break;
       case C64:
         return NearComparator<complex64>::Compare(
-            expected, actual, error, detailed_message, miscompare_callback);
+            expected, actual, error, use_detailed_message, miscompare_callback);
+        break;
+      case C128:
+        return NearComparator<complex128>::Compare(
+            expected, actual, error, use_detailed_message, miscompare_callback);
         break;
       default:
         LOG(FATAL) << "Unsupported primitive type in near comparator: "
@@ -723,7 +823,7 @@ Status EqualShapes(const Shape& expected, const Shape& actual) {
                            ShapeUtil::HumanString(expected),
                            ShapeUtil::HumanString(actual));
   }
-  if (ShapeUtil::IsTuple(expected)) {
+  if (expected.IsTuple()) {
     if (ShapeUtil::TupleElementCount(expected) !=
         ShapeUtil::TupleElementCount(actual)) {
       return InvalidArgument(
@@ -738,8 +838,8 @@ Status EqualShapes(const Shape& expected, const Shape& actual) {
         return AppendStatus(result, StrCat("mismatch in tuple index", i));
       }
     }
-  } else if (ShapeUtil::IsArray(expected)) {
-    if (ShapeUtil::Rank(expected) != ShapeUtil::Rank(actual)) {
+  } else if (expected.IsArray()) {
+    if (expected.rank() != actual.rank()) {
       return InvalidArgument("want rank of %s got rank of %s",
                              ShapeUtil::HumanString(expected),
                              ShapeUtil::HumanString(actual));
@@ -793,7 +893,7 @@ Status Equal(const LiteralSlice& expected, const LiteralSlice& actual) {
 }
 
 Status Near(const LiteralSlice& expected, const LiteralSlice& actual,
-            const ErrorSpec& error, bool detailed_message,
+            const ErrorSpec& error, absl::optional<bool> detailed_message,
             const MiscompareCallback& miscompare_callback) {
   VLOG(1) << "Expected literal:";
   XLA_VLOG_LINES(1, expected.ToString());
diff --git a/tensorflow/compiler/xla/literal_comparison.h b/tensorflow/compiler/xla/literal_comparison.h
index 9e5bf7c1d062ef0f25d07a80d6ded8106df5dacc..23fff3fa348f1652eaec344da4c40ccf3ad1079a 100644
--- a/tensorflow/compiler/xla/literal_comparison.h
+++ b/tensorflow/compiler/xla/literal_comparison.h
@@ -55,9 +55,10 @@ using MiscompareCallback =
 // being compared.
 //
 // If detailed_message is true, then the error message in the assertion result
-// will contain a more detailed breakdown of mismatches.
+// will contain a more detailed breakdown of mismatches.  By default, we display
+// a detailed message only for "large" inputs.
 Status Near(const LiteralSlice& expected, const LiteralSlice& actual,
-            const ErrorSpec& error, bool detailed_message,
+            const ErrorSpec& error, absl::optional<bool> detailed_message,
             const MiscompareCallback& miscompare_callback);
 
 // Calling ToString on a literal with over 100 million elements takes around
diff --git a/tensorflow/compiler/xla/literal_test.cc b/tensorflow/compiler/xla/literal_test.cc
index 49363ad802ddb9520f89b53257216bc7ddaf8ff5..b54a71ae68218ef578535a913f5867d843236e32 100644
--- a/tensorflow/compiler/xla/literal_test.cc
+++ b/tensorflow/compiler/xla/literal_test.cc
@@ -98,42 +98,45 @@ class LiteralUtilTest : public ::testing::Test {
 
 TEST_F(LiteralUtilTest, LiteralScalarToString) {
   auto true_lit = LiteralUtil::CreateR0<bool>(true);
-  EXPECT_EQ("true", true_lit.ToString());
+  EXPECT_EQ("pred[] true", true_lit.ToString());
 
   auto false_lit = LiteralUtil::CreateR0<bool>(false);
-  EXPECT_EQ("false", false_lit.ToString());
+  EXPECT_EQ("pred[] false", false_lit.ToString());
 
   auto u32_lit = LiteralUtil::CreateR0<uint32>(42);
-  EXPECT_EQ("42", u32_lit.ToString());
+  EXPECT_EQ("u32[] 42", u32_lit.ToString());
 
   auto s32_lit = LiteralUtil::CreateR0<int32>(-999);
-  EXPECT_EQ("-999", s32_lit.ToString());
+  EXPECT_EQ("s32[] -999", s32_lit.ToString());
 
   auto f32_lit = LiteralUtil::CreateR0<float>(3.14f);
-  EXPECT_EQ("3.14", f32_lit.ToString());
+  EXPECT_EQ("f32[] 3.14", f32_lit.ToString());
 
   auto f16_lit = LiteralUtil::CreateR0<half>(static_cast<half>(0.5f));
-  EXPECT_EQ("0.5", f16_lit.ToString());
+  EXPECT_EQ("f16[] 0.5", f16_lit.ToString());
 
   auto c64_lit = LiteralUtil::CreateR0<complex64>({3.14f, 2.78f});
-  EXPECT_EQ("(3.14, 2.78)", c64_lit.ToString());
+  EXPECT_EQ("c64[] (3.14, 2.78)", c64_lit.ToString());
+
+  auto c128_lit = LiteralUtil::CreateR0<complex128>({3.14f, 2.78f});
+  EXPECT_EQ("c128[] (3.14, 2.78)", c128_lit.ToString());
 
   auto bf16_lit = LiteralUtil::CreateR0<bfloat16>(static_cast<bfloat16>(0.5f));
-  EXPECT_EQ("0.5", bf16_lit.ToString());
+  EXPECT_EQ("bf16[] 0.5", bf16_lit.ToString());
 
   // 3.14 will be rounded to 3.14062 in bfloat16 format.
   auto bf16_lit_truncated =
       LiteralUtil::CreateR0<bfloat16>(static_cast<bfloat16>(3.14f));
-  ASSERT_EQ("3.14062", bf16_lit_truncated.ToString());
+  ASSERT_EQ("bf16[] 3.14062", bf16_lit_truncated.ToString());
 
   auto bf16_lit_truncated2 =
       LiteralUtil::CreateR0<bfloat16>(static_cast<bfloat16>(9.001f));
-  EXPECT_EQ("9", bf16_lit_truncated2.ToString());
+  EXPECT_EQ("bf16[] 9", bf16_lit_truncated2.ToString());
 }
 
 TEST_F(LiteralUtilTest, LiteralVectorToString) {
   auto pred_vec = LiteralUtil::CreateR1<bool>({true, false, true});
-  EXPECT_EQ("{1, 0, 1}", pred_vec.ToString());
+  EXPECT_EQ("pred[3] {1, 0, 1}", pred_vec.ToString());
 }
 
 TEST_F(LiteralUtilTest, R2ToString) {
@@ -210,8 +213,8 @@ TEST_F(LiteralUtilTest, TupleToString) {
   auto scalar = LiteralUtil::CreateR0<float>(1.0);
   auto matrix = LiteralUtil::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}});
   auto tuple = LiteralUtil::MakeTuple({&scalar, &matrix});
-  const string expected = R"((f32[], f32[2,2]) (
-1,
+  const string expected = R"((
+f32[] 1,
 f32[2,2] {
   { 1, 2 },
   { 3, 4 }
@@ -469,6 +472,21 @@ TEST_F(LiteralUtilTest, C64Equality) {
   EXPECT_NE(vector, vector_reversed);
 }
 
+TEST_F(LiteralUtilTest, C128Equality) {
+  // Test equality with tuples.
+  auto vector = LiteralUtil::CreateR1<complex128>({{1.0, 2.0}, {3.0, 4.0}});
+
+  // Tuple with the same elements. One element is shared with the original
+  // tuple, the other is a clone of the element in the original tuple.
+  auto vector_clone =
+      LiteralUtil::CreateR1<complex128>({{1.0, 2.0}, {3.0, 4.0}});
+  EXPECT_EQ(vector, vector_clone);
+
+  auto vector_reversed =
+      LiteralUtil::CreateR1<complex128>({{3.0, 4.0}, {1.0, 2.0}});
+  EXPECT_NE(vector, vector_reversed);
+}
+
 TEST_F(LiteralUtilTest, IsAllTuple) {
   auto element1 = LiteralUtil::CreateR0<float>(0.0);
   auto element2 = LiteralUtil::CreateR2<float>({{0.0, 0.0}, {0.0, 0.0}});
@@ -623,7 +641,7 @@ template <typename T>
 class LiteralUtilTestTemplated : public ::testing::Test {};
 
 using TestedTypes = ::testing::Types<float, int32, uint32, complex64>;
-TYPED_TEST_CASE(LiteralUtilTestTemplated, TestedTypes);
+TYPED_TEST_SUITE(LiteralUtilTestTemplated, TestedTypes);
 
 TYPED_TEST(LiteralUtilTestTemplated, Relayout2x2) {
   // Make a non-integer for floating point types.
@@ -836,6 +854,13 @@ TEST_F(LiteralUtilTest, PopulateR1C64) {
   EXPECT_EQ(output, expected);
 }
 
+TEST_F(LiteralUtilTest, PopulateR1C128) {
+  Literal output(ShapeUtil::MakeShape(C128, {1}));
+  output.PopulateR1<complex128>({{77, 88}});
+  auto expected = LiteralUtil::CreateR1<complex128>({{77, 88}});
+  EXPECT_EQ(output, expected);
+}
+
 TEST_F(LiteralUtilTest, PopulateR2C64) {
   Literal output(ShapeUtil::MakeShape(C64, {2, 2}));
   output.PopulateR2<complex64>({{{7, 8}, {9, 10}}, {{1, 2}, {3, 4}}});
@@ -897,6 +922,14 @@ TEST_F(LiteralUtilTest, PopulateWithValueR2C64) {
   EXPECT_EQ(output, expected);
 }
 
+TEST_F(LiteralUtilTest, PopulateWithValueR2C128) {
+  Literal output(ShapeUtil::MakeShape(C128, {2, 2}));
+  output.PopulateWithValue<complex128>({4, 2});
+  auto expected =
+      LiteralUtil::CreateR2<complex128>({{{4, 2}, {4, 2}}, {{4, 2}, {4, 2}}});
+  EXPECT_EQ(output, expected);
+}
+
 TEST_F(LiteralUtilTest, PopulateWithValueR0F16) {
   Literal output(ShapeUtil::MakeShape(F16, {}));
   half h(0.25f);
@@ -1237,11 +1270,21 @@ TEST_F(LiteralUtilTest, ConvertIfTypesMatch) {
     {{0, 19, 0, 21}, {22, 0, 24, 0}},
     {{26, 0, 28, 0}, {0, 31, 0, 33}},
   }}, layout_r4_dim0major_);
+  auto s16 = LiteralUtil::CreateR4WithLayout<int16>({{
+    {{10, 0, 12, 0}, {0, 15, 0, 17}},
+    {{0, 19, 0, 21}, {22, 0, 24, 0}},
+    {{26, 0, 28, 0}, {0, 31, 0, 33}},
+  }}, layout_r4_dim0major_);
   auto s32 = LiteralUtil::CreateR4WithLayout<int32>({{
     {{10, 0, 12, 0}, {0, 15, 0, 17}},
     {{0, 19, 0, 21}, {22, 0, 24, 0}},
     {{26, 0, 28, 0}, {0, 31, 0, 33}},
   }}, layout_r4_dim0major_);
+  auto u16 = LiteralUtil::CreateR4WithLayout<uint16>({{
+    {{10, 0, 12, 0}, {0, 15, 0, 17}},
+    {{0, 19, 0, 21}, {22, 0, 24, 0}},
+    {{26, 0, 28, 0}, {0, 31, 0, 33}},
+  }}, layout_r4_dim0major_);
   auto u32 = LiteralUtil::CreateR4WithLayout<uint32>({{
     {{10, 0, 12, 0}, {0, 15, 0, 17}},
     {{0, 19, 0, 21}, {22, 0, 24, 0}},
@@ -1298,9 +1341,19 @@ TEST_F(LiteralUtilTest, ConvertIfTypesMatch) {
     {{0.0f, 19.0f, 0.0f, 21.0f}, {22.0f, 0.0f, 24.0f, 0.0f}},
     {{26.0f, 0.0f, 28.0f, 0.0f}, {0.0f, 31.0f, 0.0f, 33.0f}},
   }}, layout_r4_dim0major_);
-  // clang-format on
+  auto c128 = LiteralUtil::CreateR4WithLayout<complex128>({{
+    {{10.0, 0.0, 12.0, 0.0}, {0.0, 15.0, 0.0, 17.0}},
+    {{0.0, 19.0, 0.0, 21.0}, {22.0, 0.0, 24.0, 0.0}},
+    {{26.0, 0.0, 28.0, 0.0}, {0.0, 31.0, 0.0, 33.0}},
+  }}, layout_r4_dim0major_);  // clang-format on
   Literal conv;
 
+  conv = s8.Convert(U16).ConsumeValueOrDie();
+  EXPECT_EQ(conv, u16);
+
+  conv = s8.Convert(S16).ConsumeValueOrDie();
+  EXPECT_EQ(conv, s16);
+
   conv = s8.Convert(U32).ConsumeValueOrDie();
   EXPECT_EQ(conv, u32);
 
@@ -1352,12 +1405,26 @@ TEST_F(LiteralUtilTest, ConvertIfTypesMatch) {
   conv = f16.Convert(C64).ConsumeValueOrDie();
   EXPECT_EQ(conv, c64);
 
+  conv = s32.Convert(S16).ConsumeValueOrDie();
+  EXPECT_EQ(conv, s16);
+
+  conv = s32.Convert(U16).ConsumeValueOrDie();
+  EXPECT_EQ(conv, u16);
+
+  conv = s32.Convert(C128).ConsumeValueOrDie();
+  EXPECT_EQ(conv, c128);
+
+  conv = f16.Convert(C128).ConsumeValueOrDie();
+  EXPECT_EQ(conv, c128);
+
   EXPECT_EQ(s32.Convert(TUPLE).status().code(),
             tensorflow::error::UNIMPLEMENTED);
-  EXPECT_EQ(s32.Convert(S16).status().code(), tensorflow::error::UNIMPLEMENTED);
-  EXPECT_EQ(s32.Convert(U16).status().code(), tensorflow::error::UNIMPLEMENTED);
   EXPECT_EQ(c64.Convert(F32).status().code(), tensorflow::error::UNIMPLEMENTED);
   EXPECT_EQ(c64.Convert(S32).status().code(), tensorflow::error::UNIMPLEMENTED);
+  EXPECT_EQ(c128.Convert(F32).status().code(),
+            tensorflow::error::UNIMPLEMENTED);
+  EXPECT_EQ(c128.Convert(S32).status().code(),
+            tensorflow::error::UNIMPLEMENTED);
 }
 
 TEST_F(LiteralUtilTest, BitcastConvert) {
@@ -1642,7 +1709,7 @@ TEST_F(LiteralUtilTest, MoveIntoTuple) {
       LiteralUtil::MakeTuple({&inner_elements[0], &inner_elements[1]}));
 
   Literal literal = Literal::MoveIntoTuple(absl::MakeSpan(elements));
-  ASSERT_TRUE(ShapeUtil::IsTuple(literal.shape()));
+  ASSERT_TRUE(literal.shape().IsTuple());
   ASSERT_EQ(ShapeUtil::TupleElementCount(literal.shape()), 3);
 
   EXPECT_EQ(literal.Get<float>({}, /*shape_index=*/{0}), 1.0);
@@ -1659,7 +1726,7 @@ TEST_F(LiteralUtilTest, MoveIntoTuple) {
 
 TEST_F(LiteralUtilTest, MoveIntoEmptyTuple) {
   Literal literal = Literal::MoveIntoTuple({});
-  ASSERT_TRUE(ShapeUtil::IsTuple(literal.shape()));
+  ASSERT_TRUE(literal.shape().IsTuple());
   EXPECT_EQ(ShapeUtil::TupleElementCount(literal.shape()), 0);
 }
 
@@ -1719,7 +1786,8 @@ TEST_F(LiteralUtilTest, CreateFromShapeZeroInitialized) {
 
   Literal tuple = Literal::CreateFromShape(ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShape(F64, {}), ShapeUtil::MakeShape(PRED, {2}),
-       ShapeUtil::MakeShape(U64, {2, 1}), ShapeUtil::MakeShape(C64, {})}));
+       ShapeUtil::MakeShape(U64, {2, 1}), ShapeUtil::MakeShape(C64, {}),
+       ShapeUtil::MakeShape(C128, {})}));
 
   EXPECT_EQ(tuple.Get<double>({}, {0}), 0.0);
   EXPECT_EQ(tuple.Get<bool>({0}, {1}), false);
@@ -1727,6 +1795,7 @@ TEST_F(LiteralUtilTest, CreateFromShapeZeroInitialized) {
   EXPECT_EQ(tuple.Get<uint64>({0, 0}, {2}), 0);
   EXPECT_EQ(tuple.Get<uint64>({1, 0}, {2}), 0);
   EXPECT_EQ(tuple.Get<complex64>({}, {3}), complex64(0.0f, 0.0f));
+  EXPECT_EQ(tuple.Get<complex128>({}, {4}), complex128(0.0, 0.0));
 }
 
 TEST_F(LiteralUtilTest, ProtoRoundTrip) {
@@ -1736,6 +1805,8 @@ TEST_F(LiteralUtilTest, ProtoRoundTrip) {
   auto vector_int8 = LiteralUtil::CreateR1<int8>({-128, 0, 2, 4, 7, 56, 127});
   auto vector_uint8 = LiteralUtil::CreateR1<uint8>({128, 0, 2, 56, 127, 255});
   auto vector_c64 = LiteralUtil::CreateR1<complex64>({{1.0, 2.0}, {3.0, 4.0}});
+  auto vector_c128 =
+      LiteralUtil::CreateR1<complex128>({{1.0, 2.0}, {3.0, 4.0}});
   auto vector_bfloat16 = LiteralUtil::CreateR1<bfloat16>(
       {bfloat16{-1.0}, bfloat16{2.0}, bfloat16{-3.0}});
   auto vector_half =
@@ -1756,6 +1827,7 @@ TEST_F(LiteralUtilTest, ProtoRoundTrip) {
   EXPECT_EQ(vector_int8, to_from_proto(vector_int8));
   EXPECT_EQ(vector_uint8, to_from_proto(vector_uint8));
   EXPECT_EQ(vector_c64, to_from_proto(vector_c64));
+  EXPECT_EQ(vector_c128, to_from_proto(vector_c128));
   EXPECT_EQ(vector_bfloat16, to_from_proto(vector_bfloat16));
   EXPECT_EQ(matrix_pred, to_from_proto(matrix_pred));
   EXPECT_EQ(tuple, to_from_proto(tuple));
@@ -1890,7 +1962,7 @@ TEST_F(LiteralUtilTest, SortSparseElements) {
   literal.AppendSparseElement<float>({3, 4, 5}, 3.0);
   literal.AppendSparseElement<float>({1, 2, 3}, 1.0);
   literal.SortSparseElements();
-  EXPECT_EQ(literal.ToString(false),
+  EXPECT_EQ(literal.ToString(),
             "f32[10,10,10]{[1, 2, 3]: 1, [2, 3, 4]: 2, [3, 4, 5]: 3}");
 }
 
diff --git a/tensorflow/compiler/xla/literal_util.cc b/tensorflow/compiler/xla/literal_util.cc
index bb5e5e61000d0aca6ab052ac87d2fbcd96e55f70..26b029c8d0c52e38510f9279def7c4af2904931d 100644
--- a/tensorflow/compiler/xla/literal_util.cc
+++ b/tensorflow/compiler/xla/literal_util.cc
@@ -62,7 +62,7 @@ Literal ConvertType(LiteralSlice literal) {
   ShapeUtil::ForEachSubshape(
       literal.shape(),
       [&](const Shape& subshape, const ShapeIndex& shape_index) {
-        if (ShapeUtil::IsArray(subshape)) {
+        if (subshape.IsArray()) {
           if (subshape.element_type() ==
               primitive_util::NativeToPrimitiveType<FromNativeT>()) {
             auto src = literal.data<FromNativeT>(shape_index);
@@ -106,12 +106,16 @@ Literal ConvertType(LiteralSlice literal) {
   switch (primitive_type) {
     case U8:
       return LiteralUtil::CreateR0<uint8>(0);
+    case U16:
+      return LiteralUtil::CreateR0<uint16>(0);
     case U32:
       return LiteralUtil::CreateR0<uint32>(0);
     case U64:
       return LiteralUtil::CreateR0<uint64>(0);
     case S8:
       return LiteralUtil::CreateR0<int8>(0);
+    case S16:
+      return LiteralUtil::CreateR0<int16>(0);
     case S32:
       return LiteralUtil::CreateR0<int32>(0);
     case S64:
@@ -126,11 +130,10 @@ Literal ConvertType(LiteralSlice literal) {
       return LiteralUtil::CreateR0<double>(0);
     case C64:
       return LiteralUtil::CreateR0<complex64>(0);
+    case C128:
+      return LiteralUtil::CreateR0<complex128>(0);
     case PRED:
       return LiteralUtil::CreateR0<bool>(false);
-    case S16:
-    case U16:
-      LOG(FATAL) << "u16/s16 literals not yet implemented";
     case TUPLE:
       LOG(FATAL) << "tuple element type cannot take on value of 0";
     case OPAQUE:
@@ -164,6 +167,8 @@ Literal ConvertType(LiteralSlice literal) {
       return LiteralUtil::CreateR0<double>(1);
     case C64:
       return LiteralUtil::CreateR0<complex64>(1);
+    case C128:
+      return LiteralUtil::CreateR0<complex128>(1);
     case PRED:
       return LiteralUtil::CreateR0<bool>(true);
     case S16:
@@ -200,6 +205,8 @@ Literal ConvertType(LiteralSlice literal) {
           -std::numeric_limits<double>::infinity());
     case C64:
       LOG(FATAL) << "C64 element type has no minimum value";
+    case C128:
+      LOG(FATAL) << "C128 element type has no minimum value";
     case PRED:
       return LiteralUtil::CreateR0<bool>(false);
     case S16:
@@ -344,6 +351,10 @@ Literal ConvertType(LiteralSlice literal) {
         new_literal.Set<complex64>(to_multi_index,
                                    literal.Get<complex64>(from_multi_index));
         break;
+      case C128:
+        new_literal.Set<complex128>(to_multi_index,
+                                    literal.Get<complex128>(from_multi_index));
+        break;
       default:
         LOG(FATAL) << "Unhandled primitive element type: "
                    << PrimitiveType_Name(literal.shape().element_type());
@@ -355,7 +366,7 @@ Literal ConvertType(LiteralSlice literal) {
 
 /* static */ Literal LiteralUtil::GetFirstScalarLiteral(
     const LiteralSlice& literal) {
-  CHECK(ShapeUtil::IsArray(literal.shape()));
+  CHECK(literal.shape().IsArray());
   CHECK_GT(ShapeUtil::ElementsIn(literal.shape()), 0);
   switch (literal.shape().element_type()) {
     case PRED:
@@ -392,6 +403,10 @@ Literal ConvertType(LiteralSlice literal) {
       return LiteralUtil::CreateR0<int64>(literal.GetFirstElement<int64>());
     case U64:
       return LiteralUtil::CreateR0<uint64>(literal.GetFirstElement<uint64>());
+
+    case C128:
+      return LiteralUtil::CreateR0<complex128>(
+          literal.GetFirstElement<complex128>());
     default:
       LOG(FATAL) << "Unhandled primitive type "
                  << literal.shape().element_type();
diff --git a/tensorflow/compiler/xla/metric_table_report.cc b/tensorflow/compiler/xla/metric_table_report.cc
index 4eab4fa4290c270697c00be20840cf4e85459183..bad65ac32018fafcc7634b989f1b4b0867aa5c0d 100644
--- a/tensorflow/compiler/xla/metric_table_report.cc
+++ b/tensorflow/compiler/xla/metric_table_report.cc
@@ -15,9 +15,9 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/metric_table_report.h"
 
-#include <cctype>
 #include <unordered_map>
 
+#include "absl/strings/ascii.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
 #include "tensorflow/core/platform/logging.h"
@@ -55,7 +55,7 @@ string MetricTableReport::MakeReport(double expected_metric_sum) {
   const auto metric_greater = [](const Entry& a, const Entry& b) {
     return a.metric > b.metric;
   };
-  std::sort(entries_.begin(), entries_.end(), metric_greater);
+  absl::c_sort(entries_, metric_greater);
 
   // Create the report
   AppendLine();
@@ -117,7 +117,7 @@ std::vector<MetricTableReport::Category> MetricTableReport::MakeCategories(
   auto metric_sum_greater = [](const Category& a, const Category& b) {
     return a.metric_sum > b.metric_sum;
   };
-  std::sort(categories.begin(), categories.end(), metric_sum_greater);
+  absl::c_sort(categories, metric_sum_greater);
 
   return categories;
 }
@@ -249,7 +249,7 @@ string MetricTableReport::MetricString(double metric) {
   string output;
   // Copy leading non-digit characters unconditionally.
   // This picks up the leading sign.
-  while (!sp1.empty() && !isdigit(sp1[0])) {
+  while (!sp1.empty() && !absl::ascii_isdigit(sp1[0])) {
     output.push_back(sp1[0]);
     sp1.remove_prefix(1);
   }
diff --git a/tensorflow/compiler/xla/packed_literal_reader.cc b/tensorflow/compiler/xla/packed_literal_reader.cc
index 0f86f9f35e105713aa3072a9ebf572d33d35d66d..339660cf44fd64fc5859e72255d63762fcf20efe 100644
--- a/tensorflow/compiler/xla/packed_literal_reader.cc
+++ b/tensorflow/compiler/xla/packed_literal_reader.cc
@@ -42,8 +42,7 @@ PackedLiteralReader::~PackedLiteralReader() { delete file_; }
 StatusOr<Literal> PackedLiteralReader::Read(const Shape& shape,
                                             const Layout* layout) {
   VLOG(3) << "reading shape from file: " << ShapeUtil::HumanString(shape)
-          << " layout: "
-          << (layout == nullptr ? "<none>" : layout->ShortDebugString());
+          << " layout: " << (layout == nullptr ? "<none>" : layout->ToString());
   Shape literal_shape = shape;
   if (layout != nullptr) {
     TF_RETURN_IF_ERROR(
diff --git a/tensorflow/compiler/xla/parse_flags_from_env.cc b/tensorflow/compiler/xla/parse_flags_from_env.cc
index 5b568888d14f21c1330556d017eafba6c8dd2228..e1e22f784172b5f3850f0bc510322dfad9e7f1bb 100644
--- a/tensorflow/compiler/xla/parse_flags_from_env.cc
+++ b/tensorflow/compiler/xla/parse_flags_from_env.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
+#include "absl/strings/ascii.h"
 #include "absl/strings/str_format.h"
 #include "absl/strings/str_join.h"
 #include "absl/types/span.h"
@@ -37,7 +38,7 @@ limitations under the License.
 
 namespace xla {
 
-static const char kWS[] = " \t\r\n";           // whitespace
+static const char kWS[] = " \t\r\n";  // whitespace
 
 // The following struct represents an argv[]-style array, parsed
 // from data gleaned from the environment.
@@ -104,7 +105,8 @@ static void ParseArgvFromString(const string& flag_str, EnvArgv* a) {
     // Set e to the index just past the end of the flag.
     size_t e = b;
     while (e != flag_str.size() && isascii(flag_str[e]) &&
-           (strchr("-_", flag_str[e]) != nullptr || isalnum(flag_str[e]))) {
+           (strchr("-_", flag_str[e]) != nullptr ||
+            absl::ascii_isalnum(flag_str[e]))) {
       e++;
     }
     if (e != flag_str.size() && flag_str[e] == '=' &&
@@ -184,6 +186,14 @@ bool ParseFlagsFromEnvAndDieIfUnknown(
   tensorflow::mutex_lock lock(env_argv_mu);
   auto* env_argv = &EnvArgvs()[string(envvar)];
   SetArgvFromEnv(envvar, env_argv);  // a no-op if already initialized
+
+  if (VLOG_IS_ON(1)) {
+    VLOG(1) << "For env var " << envvar << " found arguments:";
+    for (int i = 0; i < env_argv->argc; i++) {
+      VLOG(1) << "  argv[" << i << "] = " << env_argv->argv[i];
+    }
+  }
+
   bool result =
       tensorflow::Flags::Parse(&env_argv->argc, &env_argv->argv[0], flag_list);
 
diff --git a/tensorflow/compiler/xla/primitive_util.cc b/tensorflow/compiler/xla/primitive_util.cc
index b16147e3be71771269d8b7a18528bef3a8c72d99..1eedddf72c1d393cb1b88e589881e24de02ad802 100644
--- a/tensorflow/compiler/xla/primitive_util.cc
+++ b/tensorflow/compiler/xla/primitive_util.cc
@@ -15,16 +15,35 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/primitive_util.h"
 
+#include "absl/strings/ascii.h"
+#include "absl/strings/numbers.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
 namespace primitive_util {
 
+int SignificandWidth(PrimitiveType type) {
+  switch (type) {
+    case F32:
+      return std::numeric_limits<float>::digits;
+    case F64:
+      return std::numeric_limits<double>::digits;
+    case BF16:
+      return kBFloat16MantissaBits + 1;
+    case F16:
+      return 11;
+    default:
+      LOG(FATAL) << "Not a floating data type " << type;
+  }
+}
+
 bool IsFloatingPointType(PrimitiveType type) {
   return type == F16 || type == F32 || type == F64 || type == BF16;
 }
 
-bool IsComplexType(PrimitiveType type) { return type == C64; }
+bool IsComplexType(PrimitiveType type) { return type == C64 || type == C128; }
 
 bool IsSignedIntegralType(PrimitiveType type) {
   return type == S8 || type == S16 || type == S32 || type == S64;
@@ -64,6 +83,9 @@ int BitWidth(PrimitiveType type) {
     case C64:
       return 64;
 
+    case C128:
+      return 128;
+
     case TUPLE:
       LOG(FATAL) << "TUPLE is an invalid type for BitWidth";
 
@@ -75,10 +97,27 @@ int BitWidth(PrimitiveType type) {
   }
 }
 
+xla::PrimitiveType UnsignedIntegralTypeForBitWidth(int64 src_bitwidth) {
+  switch (src_bitwidth) {
+    case 8:
+      return xla::U8;
+    case 16:
+      return xla::U16;
+    case 32:
+      return xla::U32;
+    case 64:
+      return xla::U64;
+    default:
+      return xla::PRIMITIVE_TYPE_INVALID;
+  }
+}
+
 PrimitiveType ComplexComponentType(PrimitiveType complex_type) {
   switch (complex_type) {
     case C64:
       return F32;
+    case C128:
+      return F64;
     default:
       LOG(FATAL) << "Primitive type is not complex: "
                  << PrimitiveType_Name(complex_type);
@@ -90,5 +129,65 @@ bool IsArrayType(PrimitiveType primitive_type) {
          primitive_type != OPAQUE && primitive_type != TOKEN;
 }
 
+// Class to memoize the computation of
+//   absl::AsciiStrToLower(PrimitiveType_Name(p))
+// for all PrimitiveType values "p"
+class PrimitiveTypeNameGenerator {
+ public:
+  PrimitiveTypeNameGenerator() {
+    for (int i = 0; i < PrimitiveType_ARRAYSIZE; i++) {
+      if (PrimitiveType_IsValid(i)) {
+        lowercase_name_[i] = absl::AsciiStrToLower(
+            PrimitiveType_Name(static_cast<PrimitiveType>(i)));
+      }
+    }
+  }
+  const string& LowercaseName(PrimitiveType t) {
+    return lowercase_name_[static_cast<int>(t)];
+  }
+
+ private:
+  string lowercase_name_[PrimitiveType_ARRAYSIZE];
+};
+
+const string& LowercasePrimitiveTypeName(PrimitiveType s) {
+  static auto* gen = new PrimitiveTypeNameGenerator();
+  return gen->LowercaseName(s);
+}
+
+namespace {
+
+// Returns a map from lower-case primitive type name to primitive type.
+const std::unordered_map<string, PrimitiveType>& GetPrimitiveTypeStringMap() {
+  static std::unordered_map<string, PrimitiveType>* name_to_type = [] {
+    static auto* map = new std::unordered_map<string, PrimitiveType>;
+    for (int i = 0; i < PrimitiveType_ARRAYSIZE; i++) {
+      if (PrimitiveType_IsValid(i) && i != PRIMITIVE_TYPE_INVALID) {
+        auto value = static_cast<PrimitiveType>(i);
+        (*map)[LowercasePrimitiveTypeName(value)] = value;
+      }
+    }
+    return map;
+  }();
+  return *name_to_type;
+}
+
+}  // namespace
+
+StatusOr<PrimitiveType> StringToPrimitiveType(absl::string_view name) {
+  const auto& map = GetPrimitiveTypeStringMap();
+  auto found = map.find(string(name));
+  if (found == map.end()) {
+    return InvalidArgument("Invalid element type string: \"%s\".", name);
+  }
+  return found->second;
+}
+
+bool IsPrimitiveTypeName(absl::string_view name) {
+  const auto& map = GetPrimitiveTypeStringMap();
+  auto found = map.find(string(name));
+  return found != map.end();
+}
+
 }  // namespace primitive_util
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/primitive_util.h b/tensorflow/compiler/xla/primitive_util.h
index 889e9a1ceca675689406d255d348c82c398563aa..295d353003276b4c1731f7d6a378fd1ae0288d3c 100644
--- a/tensorflow/compiler/xla/primitive_util.h
+++ b/tensorflow/compiler/xla/primitive_util.h
@@ -20,12 +20,19 @@ limitations under the License.
 
 #include <type_traits>
 
+#include "absl/strings/string_view.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace xla {
 namespace primitive_util {
 
+// Returns the count of significand (mantissa) bits for float datatypes.
+// For non-float datatypes, results in a LOG(FATAL).
+int SignificandWidth(PrimitiveType type);
+
 // The number of exponent bits in a BF16 value.
 const int kBFloat16ExponentBits = 8;
 
@@ -123,6 +130,11 @@ inline PrimitiveType NativeToPrimitiveType<complex64>() {
   return C64;
 }
 
+template <>
+inline PrimitiveType NativeToPrimitiveType<complex128>() {
+  return C128;
+}
+
 bool IsFloatingPointType(PrimitiveType type);
 
 bool IsComplexType(PrimitiveType type);
@@ -139,6 +151,8 @@ bool IsArrayType(PrimitiveType primitive_type);
 // Returns the number of bits in the representation for a given type.
 int BitWidth(PrimitiveType type);
 
+PrimitiveType UnsignedIntegralTypeForBitWidth(int64 src_bitwidth);
+
 // Returns the real, imag component type underlying the given complex type.
 // LOG(FATAL)'s if complex_type is not complex.
 PrimitiveType ComplexComponentType(PrimitiveType complex_type);
@@ -221,6 +235,22 @@ template <>
 struct PrimitiveTypeToNative<C64> {
   using type = complex64;
 };
+
+template <>
+struct PrimitiveTypeToNative<C128> {
+  using type = complex128;
+};
+
+// Returns the lower-case name of the given primitive type.
+const string& LowercasePrimitiveTypeName(PrimitiveType s);
+
+// Returns the PrimitiveType matching the given name. The given name is expected
+// to be lower-case.
+StatusOr<PrimitiveType> StringToPrimitiveType(absl::string_view name);
+
+// Returns true if the given name is a primitive type string (lower-case).
+bool IsPrimitiveTypeName(absl::string_view name);
+
 }  // namespace primitive_util
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/primitive_util_test.cc b/tensorflow/compiler/xla/primitive_util_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1f765d6da9ef65849fe8ede56ced7597d623cb59
--- /dev/null
+++ b/tensorflow/compiler/xla/primitive_util_test.cc
@@ -0,0 +1,46 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/primitive_util.h"
+
+#include <numeric>
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace xla {
+namespace {
+
+TEST(PrimitiveUtilTest, StringToPrimitiveType) {
+  auto expect_ok_and_equal = [](const string& str, PrimitiveType expected) {
+    TF_ASSERT_OK_AND_ASSIGN(PrimitiveType actual,
+                            primitive_util::StringToPrimitiveType(str));
+    EXPECT_EQ(expected, actual);
+  };
+  expect_ok_and_equal("f32", F32);
+  expect_ok_and_equal("tuple", TUPLE);
+  expect_ok_and_equal("pred", PRED);
+  expect_ok_and_equal("s32", S32);
+
+  EXPECT_IS_NOT_OK(primitive_util::StringToPrimitiveType("F32").status());
+  EXPECT_IS_NOT_OK(primitive_util::StringToPrimitiveType("Pred").status());
+  EXPECT_IS_NOT_OK(primitive_util::StringToPrimitiveType("preD").status());
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/protobuf_util.h b/tensorflow/compiler/xla/protobuf_util.h
index f22fc8b8499dd4a5329276040331a2ed9e89bea9..4a88a48f2857a327aba3600ca72191e5c7b28585 100644
--- a/tensorflow/compiler/xla/protobuf_util.h
+++ b/tensorflow/compiler/xla/protobuf_util.h
@@ -16,6 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_PROTOBUF_UTIL_H_
 #define TENSORFLOW_COMPILER_XLA_PROTOBUF_UTIL_H_
 
+#include "google/protobuf/duration.pb.h"
+#include "absl/time/time.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/platform/protobuf.h"
@@ -43,6 +45,20 @@ Status DumpProtoToDirectory(const tensorflow::protobuf::Message& message,
 // dirpath along as-is.
 void RegisterDirectoryExpander(const std::function<string(string)>& expander);
 
+// Converts an absl::Duration to a google::protobuf::Duration.
+inline google::protobuf::Duration ToDurationProto(absl::Duration duration) {
+  google::protobuf::Duration proto;
+  proto.set_seconds(absl::IDivDuration(duration, absl::Seconds(1), &duration));
+  proto.set_nanos(
+      absl::IDivDuration(duration, absl::Nanoseconds(1), &duration));
+  return proto;
+}
+
+// Converts a google::protobuf::Duration to an absl::Duration.
+inline absl::Duration FromDurationProto(google::protobuf::Duration proto) {
+  return absl::Seconds(proto.seconds()) + absl::Nanoseconds(proto.nanos());
+}
+
 }  // namespace protobuf_util
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD
index 63ac1c6649210cbae9e238a74e0a45fb8ee4da63..55eacc1c16a76522215d27ac7cf4e801e69c9740 100644
--- a/tensorflow/compiler/xla/python/BUILD
+++ b/tensorflow/compiler/xla/python/BUILD
@@ -3,7 +3,8 @@ licenses(["notice"])  # Apache 2.0
 package(default_visibility = ["//tensorflow:internal"])
 
 load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc")
-load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured")
+load("//tensorflow/core:platform/default/build_config.bzl", "pyx_library")
+load("//tensorflow/compiler/xla:xla.bzl", "xla_python_default_plugins")
 
 py_library(
     name = "xla_client",
@@ -17,6 +18,12 @@ py_library(
     ],
 )
 
+pyx_library(
+    name = "custom_call_for_test",
+    testonly = True,
+    srcs = ["custom_call_for_test.pyx"],
+)
+
 py_test(
     name = "xla_client_test",
     srcs = ["xla_client_test.py"],
@@ -24,6 +31,7 @@ py_test(
     srcs_version = "PY2AND3",
     tags = ["no_oss"],
     deps = [
+        ":custom_call_for_test",
         ":xla_client",
         "//tensorflow/python:platform_test",
     ],
@@ -51,10 +59,6 @@ cc_library(
     srcs = ["local_computation_builder.cc"],
     hdrs = ["local_computation_builder.h"],
     deps = [
-        "//tensorflow/cc:cc_ops",
-        "//tensorflow/cc:client_session",
-        "//tensorflow/cc:ops",
-        "//tensorflow/cc:scope",
         "//tensorflow/compiler/xla:executable_run_options",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:literal_util",
@@ -66,9 +70,37 @@ cc_library(
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/client:xla_computation",
+        "//tensorflow/compiler/xla/client/lib:cholesky",
         "//tensorflow/compiler/xla/client/lib:math",
+        "//tensorflow/compiler/xla/client/lib:qr",
+        "//tensorflow/compiler/xla/service:computation_placer",
+        "//tensorflow/compiler/xla/service:hlo_graph_dumper",
         "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/compiler/xla/service:shaped_buffer",
+        "//tensorflow/compiler/xla/service/cpu:custom_call_target_registry",
+        "//tensorflow/core:lib",
+        "//third_party/python_runtime:headers",  # buildcleaner: keep
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+cc_library(
+    name = "xrt",
+    srcs = ["xrt.cc"],
+    hdrs = ["xrt.h"],
+    deps = [
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:client_session",
+        "//tensorflow/cc:ops",
+        "//tensorflow/cc:scope",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:hlo_proto",
+        "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/compiler/xrt:xrt_proto",
         "//tensorflow/compiler/xrt/cc:xrt_ops",
         "//tensorflow/core:framework",
@@ -80,11 +112,19 @@ cc_library(
 
 tf_py_wrap_cc(
     name = "pywrap_xla",
-    srcs = ["xla.i"],
+    srcs = [
+        "xla.i",
+    ],
     swig_includes = [
         "local_computation_builder.i",
+        "xla_data.i",
         "//tensorflow/python:platform/base.i",
     ],
+    version_script = select({
+        "//tensorflow:darwin": "pywrap_xla_exported_symbols.lds",
+        "//tensorflow:windows": None,
+        "//conditions:default": "pywrap_xla_version_script.lds",
+    }),
     deps = [
         ":local_computation_builder",
         ":numpy_bridge",
@@ -92,7 +132,29 @@ tf_py_wrap_cc(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:cpu_plugin",
-    ] + if_cuda_is_configured([
-        "//tensorflow/compiler/xla/service:gpu_plugin",
-    ]),
+    ] + xla_python_default_plugins(),
+)
+
+tf_py_wrap_cc(
+    name = "pywrap_xrt",
+    srcs = [
+        "xrt.i",
+    ],
+    swig_includes = [
+        "xla_data.i",
+        "//tensorflow/python:platform/base.i",
+    ],
+    version_script = select({
+        "//tensorflow:darwin": "pywrap_xla_exported_symbols.lds",
+        "//tensorflow:windows": None,
+        "//conditions:default": "pywrap_xla_version_script.lds",
+    }),
+    visibility = ["//visibility:public"],
+    deps = [
+        ":numpy_bridge",
+        ":xrt",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+    ],
 )
diff --git a/tensorflow/compiler/xla/python/custom_call_for_test.pyx b/tensorflow/compiler/xla/python/custom_call_for_test.pyx
new file mode 100644
index 0000000000000000000000000000000000000000..530dffd1755d8438f52569c223525000c97df6ea
--- /dev/null
+++ b/tensorflow/compiler/xla/python/custom_call_for_test.pyx
@@ -0,0 +1,21 @@
+# distutils: language = c++
+
+# Test case for defining a XLA custom call target in Cython, and registering
+# it via the xla_client SWIG API.
+
+from cpython.pycapsule cimport PyCapsule_New
+
+cdef void test_subtract_f32(void* out_ptr, void** data_ptr) nogil:
+  cdef float a = (<float*>(data_ptr[0]))[0]
+  cdef float b = (<float*>(data_ptr[1]))[0]
+  cdef float* out = <float*>(out_ptr)
+  out[0] = a - b
+
+
+cpu_custom_call_targets = {}
+
+cdef register_custom_call_target(fn_name, void* fn):
+  cdef const char* name = "xla._CPU_CUSTOM_CALL_TARGET"
+  cpu_custom_call_targets[fn_name] = PyCapsule_New(fn, name, NULL)
+
+register_custom_call_target(b"test_subtract_f32", <void*>(test_subtract_f32))
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc
index 6e2ee866321a070d55a7221c7c68024ceaa93448..c14a01a858af414fc78a5f727372e8fa64cad4b8 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.cc
+++ b/tensorflow/compiler/xla/python/local_computation_builder.cc
@@ -20,25 +20,22 @@ limitations under the License.
 #include <vector>
 
 #include "absl/memory/memory.h"
-#include "tensorflow/cc/client/client_session.h"
-#include "tensorflow/cc/framework/ops.h"
-#include "tensorflow/cc/framework/scope.h"
-#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/compiler/xla/client/client_library.h"
+#include "tensorflow/compiler/xla/client/lib/cholesky.h"
 #include "tensorflow/compiler/xla/client/lib/math.h"
+#include "tensorflow/compiler/xla/client/lib/qr.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/executable_run_options.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/computation_placer.h"
+#include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h"
+#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/compiler/xrt/cc/ops/xrt_compile_ops.h"
-#include "tensorflow/compiler/xrt/cc/ops/xrt_execute_op.h"
-#include "tensorflow/compiler/xrt/cc/ops/xrt_state_ops.h"
-#include "tensorflow/compiler/xrt/xrt.pb.h"
-#include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/thread_annotations.h"
@@ -47,127 +44,80 @@ limitations under the License.
 namespace xla {
 namespace swig {
 
-// TODO(b/118641336): Factor out XRT parts into a small c++ library of their
-// own.
-
-// TODO(b/34473877) Ideally XLA would support AllReduce among arbitrary sets of
-// device handles instead of needing to set the number of replicas at XLA
-// service initialization time.
-tensorflow::mutex g_local_client_mutex(tensorflow::LINKER_INITIALIZED);
-int g_replica_count GUARDED_BY(g_local_client_mutex) = 1;
-LocalClient* g_local_client GUARDED_BY(g_local_client_mutex) = nullptr;
-
-string* GetPlatformNameString() {
-  static string* platform_name_string PT_GUARDED_BY(g_local_client_mutex) =
-      new string("Host");
-  return platform_name_string;
-}
-
-Status InitializeReplicaCount(int replica_count) {
-  if (replica_count < 1) {
-    return InvalidArgument("Replica count must be >= 1; got %d.",
-                           replica_count);
-  }
-  tensorflow::mutex_lock lock(g_local_client_mutex);
-  if (g_local_client != nullptr) {
-    return FailedPrecondition(
-        "Attempted to set the replica count to %d, but a local XLA service was "
-        "previously created with a replica count of %d.",
-        replica_count, g_replica_count);
-  }
-  g_replica_count = replica_count;
-  return Status::OK();
-}
-
-Status InitializePlatformName(const string& platform_name) {
-  string* g_platform_name = GetPlatformNameString();
-  tensorflow::mutex_lock lock(g_local_client_mutex);
-  if (g_local_client != nullptr) {
-    return FailedPrecondition(
-        "Attempted to set the platform name to %s, but a local XLA service was "
-        "previously created with a platform name of %s.",
-        platform_name, *g_platform_name);
+Status RegisterCpuCustomCallTarget(const string& fn_name, PyObject* capsule) {
+  const char* name = "xla._CPU_CUSTOM_CALL_TARGET";
+  if (!PyCapsule_IsValid(capsule, name)) {
+    return InvalidArgument(
+        "Argument to RegisterCpuCustomCallTargetRegistry was not a "
+        "xla._CPU_CUSTOM_CALL_TARGET capsule.");
   }
-  TF_RETURN_IF_ERROR(PlatformUtil::GetPlatform(platform_name).status());
-  *g_platform_name = platform_name;
+  void* fn_ptr = PyCapsule_GetPointer(capsule, name);
+  CHECK(fn_ptr != nullptr);
+  cpu::CustomCallTargetRegistry::Global()->Register(
+      std::string(fn_name.begin(), fn_name.end()), fn_ptr);
   return Status::OK();
 }
 
-int GetReplicaCount() {
-  tensorflow::mutex_lock lock(g_local_client_mutex);
-  return g_replica_count;
-}
+LocalClient::LocalClient(xla::LocalClient* client) : client_(client) {}
 
-LocalClient* GetOrCreateLocalClient() {
-  string* platform_name = GetPlatformNameString();
-  tensorflow::mutex_lock lock(g_local_client_mutex);
-  if (g_local_client != nullptr) {
-    return g_local_client;
+/* static */ StatusOr<LocalClient> LocalClient::Get(
+    const string& platform_name) {
+  TF_ASSIGN_OR_RETURN(se::Platform * platform,
+                      PlatformUtil::GetPlatform(platform_name));
+  if (platform->VisibleDeviceCount() <= 0) {
+    return InvalidArgument("Platform %s has no visible devices.",
+                           platform_name);
   }
   LocalClientOptions options;
-  options.set_platform(PlatformUtil::GetPlatform(*platform_name).ValueOrDie());
-  options.set_number_of_replicas(g_replica_count);
-  g_local_client = ClientLibrary::GetOrCreateLocalClient(options).ValueOrDie();
-  CHECK(g_local_client != nullptr);
-  return g_local_client;
+  options.set_platform(platform);
+  TF_ASSIGN_OR_RETURN(xla::LocalClient * client,
+                      ClientLibrary::GetOrCreateLocalClient(options));
+  CHECK(client != nullptr);
+  return LocalClient(client);
 }
 
-Status TransferToInfeedLocal(const Literal& literal) {
-  VLOG(1) << "Infeeding literal without replica number; shape: "
-          << literal.shape();
-  LocalClient* client = GetOrCreateLocalClient();
-  return client->TransferToInfeedLocal(literal, /*device_ordinal=*/0);
-}
+// Returns the number of devices known to the XLA client.
+int LocalClient::DeviceCount() const { return client_->device_count(); }
 
-Status TransferToInfeedLocalReplica(const Literal& literal,
-                                    int replica_number) {
-  VLOG(1) << "Infeeding shape " << literal.shape()
-          << " to replica number: " << replica_number;
-  LocalClient* client = GetOrCreateLocalClient();
-  TF_ASSIGN_OR_RETURN(int device_ordinal,
-                      client->ReplicaNumberToDeviceOrdinal(replica_number));
-  return client->TransferToInfeedLocal(literal, device_ordinal);
+Status LocalClient::TransferToInfeed(const Literal& literal,
+                                     int device_ordinal) {
+  VLOG(1) << "Infeeding literal to device " << device_ordinal
+          << "; shape: " << literal.shape();
+  return client_->TransferToInfeed(literal, device_ordinal);
 }
 
-StatusOr<Literal> TransferFromOutfeedLocalReplica(const Shape& shape,
-                                                  int replica_number) {
-  VLOG(1) << "Outfeeding literal from replica number: " << replica_number
-          << " shape: " << shape;
-  LocalClient* client = GetOrCreateLocalClient();
-  TF_ASSIGN_OR_RETURN(int device_ordinal,
-                      client->ReplicaNumberToDeviceOrdinal(replica_number));
-  return client->TransferFromOutfeedLocal(shape, device_ordinal);
-}
-
-static StatusOr<ScopedShapedBuffer> ToBuffer(LocalClient* client,
-                                             int device_ordinal,
-                                             const Literal& arg) {
-  return client->LiteralToShapedBuffer(arg, device_ordinal,
-                                       client->backend().memory_allocator());
+StatusOr<Literal> LocalClient::TransferFromOutfeed(const Shape& shape,
+                                                   int device_ordinal) {
+  VLOG(1) << "Outfeeding literal from device " << device_ordinal
+          << "; shape: " << shape;
+  return client_->TransferFromOutfeed(&shape, device_ordinal);
 }
 
 /* static */
 StatusOr<LocalShapedBuffer*> LocalShapedBuffer::FromLiteral(
     const Literal& argument, const absl::optional<Shape>& shape_with_layout,
-    int replica_number) {
-  LocalClient* client = GetOrCreateLocalClient();
-  TF_ASSIGN_OR_RETURN(int device_ordinal,
-                      client->ReplicaNumberToDeviceOrdinal(replica_number));
-  VLOG(1) << "Creating shaped buffer from literal on replica/ordinal: "
-          << replica_number << "/" << device_ordinal;
+    const LocalClient& client, int device_ordinal) {
+  VLOG(1) << "Creating shaped buffer from literal on device ordinal: "
+          << device_ordinal;
+  auto literal_to_buffer = [&](const Literal& arg) {
+    return client.client()->LiteralToShapedBuffer(
+        arg, device_ordinal, client.client()->backend().memory_allocator());
+  };
+
   StatusOr<ScopedShapedBuffer> buf = [&] {
     if (shape_with_layout) {
       Literal relaid = argument.Relayout(shape_with_layout.value());
-      return ToBuffer(client, device_ordinal, relaid);
+      return literal_to_buffer(relaid);
     }
-    return ToBuffer(client, device_ordinal, argument);
+    return literal_to_buffer(argument);
   }();
   TF_RETURN_IF_ERROR(buf.status());
-  return new LocalShapedBuffer(std::move(buf).ValueOrDie());
+  return new LocalShapedBuffer(std::move(buf).ValueOrDie(), client.client());
 }
 
-LocalShapedBuffer::LocalShapedBuffer(ScopedShapedBuffer shaped_buffer)
-    : shaped_buffer_(std::move(shaped_buffer)) {}
+LocalShapedBuffer::LocalShapedBuffer(ScopedShapedBuffer shaped_buffer,
+                                     xla::LocalClient* client)
+    : shaped_buffer_(std::move(shaped_buffer)), client_(client) {}
 
 const ScopedShapedBuffer* LocalShapedBuffer::shaped_buffer() const {
   return &shaped_buffer_;
@@ -180,8 +130,7 @@ const Shape& LocalShapedBuffer::shape() const {
 }
 
 StatusOr<Literal> LocalShapedBuffer::ToLiteral() const {
-  LocalClient* client = GetOrCreateLocalClient();
-  return client->ShapedBufferToLiteral(*shaped_buffer());
+  return client_->ShapedBufferToLiteral(*shaped_buffer());
 }
 
 LocalShapedBufferTuple::LocalShapedBufferTuple(
@@ -212,141 +161,94 @@ StatusOr<LocalShapedBuffer*> LocalShapedBufferTuple::Release(int i) {
 
 int64 LocalShapedBufferTuple::size() const { return elements_.size(); }
 
-XrtAllocation::XrtAllocation(int64 handle, Shape shape,
-                             const string& session_target)
-    : handle_(handle), shape_(shape), session_target_(session_target) {}
-
-XrtAllocation::~XrtAllocation() {
-  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
-  auto allocation_handle =
-      tensorflow::ops::Placeholder(root, tensorflow::DT_INT64);
-  auto release =
-      tensorflow::ops::XRTReleaseAllocationHandle(root, allocation_handle);
-  if (!root.status().ok()) {
-    LOG(ERROR) << root.status();
-    return;
-  }
+StatusOr<LocalShapedBufferTuple*> LocalShapedBuffer::DestructureTuple() {
+  const Shape tuple_shape = shape();
 
-  tensorflow::ClientSession session(root, session_target_);
-  tensorflow::ClientSession::FeedType inputs;
-  inputs.insert({allocation_handle, handle()});
-  std::vector<tensorflow::Tensor> outputs;
-  auto status = session.Run(inputs, {}, {release}, &outputs);
-  if (!status.ok()) {
-    LOG(ERROR) << status;
-    return;
+  if (!tuple_shape.IsTuple()) {
+    return InvalidArgument(
+        "Attemped to destructure a LocalShapedBuffer that did not have a tuple "
+        "shape; shape: %s",
+        ShapeUtil::HumanString(tuple_shape));
   }
-}
-
-/* static */
-StatusOr<XrtAllocation*> XrtAllocation::FromLiteral(
-    const Literal& argument, const string& session_target) {
-  xrt::XLAAllocation alloc;
-  alloc.set_device_ordinal(0);
-  *alloc.mutable_value() = argument.ToProto();
-
-  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
-  auto literal_string =
-      tensorflow::ops::Placeholder(root, tensorflow::DT_STRING);
-  auto literal_handle = tensorflow::ops::XRTAllocate(root, literal_string);
-  TF_RETURN_IF_ERROR(root.status());
 
-  tensorflow::ClientSession session(root, session_target);
-  tensorflow::ClientSession::FeedType inputs;
-  inputs.insert({literal_string, alloc.SerializeAsString()});
-  std::vector<tensorflow::Tensor> outputs;
-  TF_RETURN_IF_ERROR(session.Run(inputs, {literal_handle}, &outputs));
+  DeviceMemoryAllocator* allocator = shaped_buffer()->memory_allocator();
+  ShapedBuffer tuple_buffer = Release();
 
-  int64 handle = outputs[0].scalar<int64>()();
-  return new XrtAllocation(handle, argument.shape(), session_target);
-}
-
-const int64 XrtAllocation::handle() const { return handle_; }
-
-const Shape& XrtAllocation::shape() const { return shape_; }
-
-StatusOr<Literal> XrtAllocation::ToLiteral() const {
-  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
-  auto allocation_handle =
-      tensorflow::ops::Placeholder(root, tensorflow::DT_INT64);
-  auto read_literal = tensorflow::ops::XRTReadLiteral(root, allocation_handle);
-  TF_RETURN_IF_ERROR(root.status());
+  // Extract some metadata we use to construct scoped buffers.
+  const se::Platform* platform = tuple_buffer.platform();
+  int device_ordinal = tuple_buffer.device_ordinal();
 
-  tensorflow::ClientSession session(root, session_target_);
-  tensorflow::ClientSession::FeedType inputs;
-  inputs.insert({allocation_handle, handle()});
-  std::vector<tensorflow::Tensor> outputs;
-  TF_RETURN_IF_ERROR(session.Run(inputs, {read_literal}, &outputs));
+  ShapeTree<se::DeviceMemoryBase>& shape_tree = tuple_buffer.buffers();
+  std::vector<LocalShapedBuffer*> results;
+  for (int64 i = 0; i < ShapeUtil::TupleElementCount(tuple_shape); ++i) {
+    // Create a shaped buffer for this destructured tuple element.
+    const Shape& subshape = ShapeUtil::GetSubshape(tuple_shape, {i});
+    VLOG(3) << "Starting tuple element " << i << " subshape: " << subshape;
+    ShapedBuffer shaped_buffer(subshape, subshape, platform, device_ordinal);
 
-  xla::LiteralProto response;
-  TF_RET_CHECK(response.ParseFromString(outputs[0].scalar<string>()()));
-  return Literal::CreateFromProto(response);
-}
+    ShapeUtil::ForEachSubshape(
+        subshape, [&](const Shape& s, const ShapeIndex& index) {
+          ShapeIndex original(index);
+          original.push_front(i);
+          se::DeviceMemoryBase* device_memory =
+              shape_tree.mutable_element(original);
+          shaped_buffer.set_buffer(*device_memory, index);
+          *device_memory = se::DeviceMemoryBase();
+        });
 
-XrtAllocationTuple::XrtAllocationTuple(std::vector<XrtAllocation*> elements)
-    : elements_(std::move(elements)) {
-  for (auto* element : elements_) {
-    CHECK(element != nullptr);
+    VLOG(3) << "Completed tuple element: " << i;
+    results.push_back(new LocalShapedBuffer(
+        ScopedShapedBuffer(std::move(shaped_buffer), allocator), client_));
   }
+  // Deallocate the root buffer.
+  se::DeviceMemoryBase root_buffer = tuple_buffer.root_buffer();
+  TF_RETURN_IF_ERROR(allocator->Deallocate(device_ordinal, root_buffer));
+  return new LocalShapedBufferTuple(std::move(results));
 }
 
-XrtAllocationTuple::~XrtAllocationTuple() {
-  for (XrtAllocation* element : elements_) {
-    if (element != nullptr) {
-      delete element;
-    }
-  }
-}
+LocalExecutable::LocalExecutable(
+    std::unique_ptr<xla::LocalExecutable> executable,
+    xla::DeviceAssignment device_assignment, xla::LocalClient* client)
+    : executable_(std::move(executable)),
+      device_assignment_(std::move(device_assignment)),
+      client_(client) {}
 
-StatusOr<XrtAllocation*> XrtAllocationTuple::Release(int i) {
-  XrtAllocation* element = elements_[i];
-  if (element == nullptr) {
-    return InvalidArgument("Attempted to release already-released element %d.",
-                           i);
+std::vector<int> LocalExecutable::DeviceOrdinals() const {
+  int num_replicas = device_assignment_.replica_count();
+  std::vector<int> device_ordinals;
+  device_ordinals.reserve(num_replicas);
+  for (int i = 0; i < num_replicas; ++i) {
+    device_ordinals.push_back(device_assignment_(i, 0));
   }
-  elements_[i] = nullptr;
-  return element;
+  return device_ordinals;
 }
 
-int64 XrtAllocationTuple::size() const { return elements_.size(); }
-
-CompiledLocalComputation::CompiledLocalComputation(
-    std::unique_ptr<LocalExecutable> executable)
-    : executable_(std::move(executable)) {}
-
-StatusOr<LocalShapedBuffer*> CompiledLocalComputation::Execute(
+StatusOr<LocalShapedBuffer*> LocalExecutable::Execute(
     absl::Span<LocalShapedBuffer* const> argument_handles) {
-  LocalClient* client = GetOrCreateLocalClient();
-  StatusOr<int> device_ordinal_status = client->ReplicaNumberToDeviceOrdinal(0);
+  if (num_replicas() != 1) {
+    return InvalidArgument(
+        "Attempted to execute computation with %d replicas using Execute()",
+        num_replicas());
+  }
   StatusOr<ScopedShapedBuffer> result_buffer_status;
-  if (!device_ordinal_status.ok()) {
-    result_buffer_status = device_ordinal_status.status();
-  } else {
-    const int device_ordinal = device_ordinal_status.ValueOrDie();
-    VLOG(3) << "Replica 0 mapped to device ordinal for execution: "
-            << device_ordinal;
+  const int device_ordinal = device_assignment_(0, 0);
+  VLOG(3) << "Replica 0 mapped to device ordinal for execution: "
+          << device_ordinal;
 
-    std::vector<const ShapedBuffer*> argument_buffers;
-    argument_buffers.reserve(argument_handles.size());
-    for (auto& handle : argument_handles) {
-      argument_buffers.push_back(handle->shaped_buffer());
-    }
-
-    DeviceAssignment device_assignment =
-        client->backend()
-            .computation_placer()
-            ->AssignDevices(1, /*computation_count=*/1)
-            .ConsumeValueOrDie();
+  std::vector<const ShapedBuffer*> argument_buffers;
+  argument_buffers.reserve(argument_handles.size());
+  for (auto& handle : argument_handles) {
+    argument_buffers.push_back(handle->shaped_buffer());
+  }
 
-    ExecutableRunOptions options;
-    options.set_device_ordinal(device_ordinal);
-    options.set_allocator(client->backend().memory_allocator());
-    options.set_intra_op_thread_pool(
-        client->backend().eigen_intra_op_thread_pool_device());
-    options.set_device_assignment(&device_assignment);
+  ExecutableRunOptions options;
+  options.set_device_ordinal(device_ordinal);
+  options.set_allocator(client_->backend().memory_allocator());
+  options.set_intra_op_thread_pool(
+      client_->backend().eigen_intra_op_thread_pool_device());
+  options.set_device_assignment(&device_assignment_);
 
-    result_buffer_status = executable_->Run(argument_buffers, options);
-  }
+  result_buffer_status = executable_->Run(argument_buffers, options);
 
   if (!result_buffer_status.ok()) {
     return InternalError(
@@ -354,34 +256,30 @@ StatusOr<LocalShapedBuffer*> CompiledLocalComputation::Execute(
         "%s.",
         result_buffer_status.status().ToString());
   }
-  return new LocalShapedBuffer(std::move(result_buffer_status).ValueOrDie());
+  return new LocalShapedBuffer(std::move(result_buffer_status).ValueOrDie(),
+                               client_);
 }
 
-StatusOr<LocalShapedBufferTuple*> CompiledLocalComputation::ExecutePerReplica(
+StatusOr<LocalShapedBufferTuple*> LocalExecutable::ExecutePerReplica(
     absl::Span<const std::vector<LocalShapedBuffer*>> argument_handles) {
-  LocalClient* client = GetOrCreateLocalClient();
-  const int num_replicas = GetReplicaCount();
+  const int num_devices = client_->device_count();
 
-  if (argument_handles.size() != num_replicas) {
+  if (argument_handles.size() != num_replicas()) {
     return InvalidArgument(
         "Attempted to execute with %d replicas when replica count is %d",
-        argument_handles.size(), num_replicas);
+        argument_handles.size(), num_devices);
+  }
+  if (argument_handles.size() > num_devices) {
+    return InvalidArgument(
+        "Attempted to execute with %d replicas when device count is %d",
+        argument_handles.size(), num_devices);
   }
 
-  VLOG(1) << "Executing with " << num_replicas << " replicas.";
-
-  // Each replica populates a StatusOr result, but only the output value of
-  // replica zero is returned.
-  std::vector<StatusOr<ScopedShapedBuffer>> results(num_replicas);
-  auto execute = [this, client, num_replicas, &argument_handles,
-                  &results](int replica) {
-    StatusOr<int> device_ordinal_status =
-        client->ReplicaNumberToDeviceOrdinal(replica);
-    if (!device_ordinal_status.ok()) {
-      results[replica] = device_ordinal_status.status();
-      return;
-    }
-    const int device_ordinal = device_ordinal_status.ValueOrDie();
+  VLOG(1) << "Executing with " << num_replicas() << " replicas.";
+
+  std::vector<StatusOr<ScopedShapedBuffer>> results(num_replicas());
+  auto execute = [this, &argument_handles, &results](int replica) {
+    const int device_ordinal = device_assignment_(replica, 0);
     VLOG(3) << "Replica " << replica
             << " mapped to device ordinal for execution: " << device_ordinal;
 
@@ -391,41 +289,35 @@ StatusOr<LocalShapedBufferTuple*> CompiledLocalComputation::ExecutePerReplica(
       argument_buffers.push_back(handle->shaped_buffer());
     }
 
-    DeviceAssignment device_assignment =
-        client->backend()
-            .computation_placer()
-            ->AssignDevices(num_replicas, /*computation_count=*/1)
-            .ConsumeValueOrDie();
-
     ExecutableRunOptions options;
     options.set_device_ordinal(device_ordinal);
-    options.set_allocator(client->backend().memory_allocator());
+    options.set_allocator(client_->backend().memory_allocator());
     options.set_intra_op_thread_pool(
-        client->backend().eigen_intra_op_thread_pool_device());
-    options.set_device_assignment(&device_assignment);
+        client_->backend().eigen_intra_op_thread_pool_device());
+    options.set_device_assignment(&device_assignment_);
     StatusOr<ScopedShapedBuffer> result_buffer_status =
         executable_->Run(argument_buffers, options);
 
     results[replica] = std::move(result_buffer_status);
   };
 
-  if (num_replicas == 1) {
+  if (num_replicas() == 1) {
     // Fast-path if there is only one replica — run the computation on the
     // current thread.
     execute(0);
   } else {
     // TODO(phawkins): don't recreate the threadpool for each execution.
     tensorflow::thread::ThreadPool pool(tensorflow::Env::Default(), "xlarun",
-                                        num_replicas - 1);
+                                        num_replicas() - 1);
 
-    for (int replica = 0; replica < num_replicas - 1; ++replica) {
+    for (int replica = 0; replica < num_replicas() - 1; ++replica) {
       pool.Schedule([&execute, replica] { execute(replica); });
     }
-    execute(num_replicas - 1);
+    execute(num_replicas() - 1);
   }
 
-  std::vector<LocalShapedBuffer*> wrapped_results(num_replicas);
-  for (int replica = 0; replica < num_replicas; ++replica) {
+  std::vector<LocalShapedBuffer*> wrapped_results(num_replicas());
+  for (int replica = 0; replica < num_replicas(); ++replica) {
     auto& statusor = results[replica];
     if (!statusor.ok()) {
       return InternalError(
@@ -434,151 +326,43 @@ StatusOr<LocalShapedBufferTuple*> CompiledLocalComputation::ExecutePerReplica(
           replica, statusor.status().ToString());
     }
     wrapped_results[replica] =
-        new LocalShapedBuffer(std::move(statusor).ValueOrDie());
+        new LocalShapedBuffer(std::move(statusor).ValueOrDie(), client_);
   }
 
   return new LocalShapedBufferTuple(std::move(wrapped_results));
 }
 
-static StatusOr<Shape> GetReturnValueShape(const XlaComputation& computation) {
-  TF_ASSIGN_OR_RETURN(ProgramShape program_shape,
-                      computation.GetProgramShape());
-  return std::move(*program_shape.mutable_result());
-}
-
-CompiledXrtComputation::CompiledXrtComputation(
-    const ProgramShape& program_shape, int64 handle,
-    const string& session_target)
-    : program_shape_(program_shape),
-      handle_(handle),
-      session_target_(session_target) {}
-
-CompiledXrtComputation::~CompiledXrtComputation() {
-  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
-  auto computation_handle =
-      tensorflow::ops::Placeholder(root, tensorflow::DT_INT64);
-  auto release =
-      tensorflow::ops::XRTReleaseCompilationHandle(root, computation_handle);
-  if (!root.status().ok()) {
-    LOG(ERROR) << root.status();
-    return;
-  }
-
-  tensorflow::ClientSession session(root, session_target_);
-  tensorflow::ClientSession::FeedType inputs;
-  inputs.insert({computation_handle, handle()});
-  std::vector<tensorflow::Tensor> outputs;
-  auto status = session.Run(inputs, {}, {release}, &outputs);
-  if (!status.ok()) {
-    LOG(ERROR) << status;
-    return;
-  }
-}
-
-StatusOr<XrtAllocation*> CompiledXrtComputation::Execute(
-    absl::Span<XrtAllocation* const> argument_handles) {
-  const int num_expected_arguments = program_shape().parameters().size();
-
-  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
-  std::vector<tensorflow::Output> arguments;
-  arguments.reserve(num_expected_arguments);
-  for (int i = 0; i < num_expected_arguments; ++i) {
-    arguments.push_back(
-        tensorflow::ops::Placeholder(root, tensorflow::DT_INT64));
-  }
-  auto computation_handle =
-      tensorflow::ops::Placeholder(root, tensorflow::DT_INT64);
-  auto execution_config =
-      tensorflow::ops::Placeholder(root, tensorflow::DT_STRING);
-  auto execute = tensorflow::ops::XRTExecute(root, computation_handle,
-                                             execution_config, arguments);
-  TF_RETURN_IF_ERROR(root.status());
-
-  TF_RET_CHECK(argument_handles.size() == arguments.size());
-
-  xrt::XRTExecutionConfig e;
-  e.set_release_input_handles(false);
-  e.set_release_compilation_handle(false);
-
-  tensorflow::ClientSession session(root, session_target_);
-  tensorflow::ClientSession::FeedType inputs;
-  for (int i = 0; i < arguments.size(); ++i) {
-    inputs.insert({arguments[i], argument_handles[i]->handle()});
-  }
-  inputs.insert({computation_handle, handle()});
-  inputs.insert({execution_config, e.SerializeAsString()});
-  std::vector<tensorflow::Tensor> outputs;
-  TF_RETURN_IF_ERROR(session.Run(inputs, {execute}, &outputs));
-
-  int64 output = outputs[0].scalar<int64>()();
-  return new XrtAllocation(output, program_shape().result(), session_target_);
-}
-
-const ProgramShape& CompiledXrtComputation::program_shape() const {
-  return program_shape_;
-}
-
-int64 CompiledXrtComputation::handle() const { return handle_; }
-
-LocalComputation::LocalComputation(XlaComputation computation)
+Computation::Computation(XlaComputation computation)
     : computation_(std::move(computation)) {}
 
-StatusOr<CompiledLocalComputation*> LocalComputation::Compile(
+StatusOr<LocalExecutable*> Computation::Compile(
     const std::vector<Shape>& argument_shapes,
-    const ExecutableBuildOptions* build_options) {
+    const ExecutableBuildOptions* build_options, const LocalClient& client) {
   std::vector<const Shape*> argument_shape_pointers;
   argument_shape_pointers.reserve(argument_shapes.size());
   for (auto& argument_shape : argument_shapes) {
     argument_shape_pointers.push_back(&argument_shape);
   }
 
-  LocalClient* client = GetOrCreateLocalClient();
   ExecutableBuildOptions options;
   if (build_options != nullptr) {
     options = *build_options;
   }
   TF_ASSIGN_OR_RETURN(
       auto local_executable,
-      client->Compile(computation_, argument_shape_pointers, options));
-  return new CompiledLocalComputation(std::move(local_executable));
-}
-
-StatusOr<CompiledXrtComputation*> LocalComputation::CompileForXrt(
-    const std::vector<Shape>& argument_shapes, const string& session_target) {
-  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
-  auto program = tensorflow::ops::Placeholder(root, tensorflow::DT_STRING);
-  auto compile = tensorflow::ops::XRTCompile(root, program);
-  TF_RETURN_IF_ERROR(root.status());
-
-  xrt::XLAComputation c;
-  auto config = c.mutable_config();
-  ProgramShape shapes;
-  for (auto& shape : argument_shapes) {
-    *shapes.add_parameters() = shape;
-  }
-  TF_ASSIGN_OR_RETURN(*shapes.mutable_result(), GetReturnValueShape());
-  LayoutUtil::SetToDefaultLayout(&shapes);
-  *config->mutable_program_shape() = shapes.ToProto();
-  auto snapshot = computation().Snapshot().ValueOrDie();
-  *c.mutable_hlo_snapshot() = *snapshot;
-
-  tensorflow::ClientSession session(root, session_target);
-  tensorflow::ClientSession::FeedType inputs;
-  inputs.insert({program, c.SerializeAsString()});
-  std::vector<tensorflow::Tensor> outputs;
-  TF_RETURN_IF_ERROR(session.Run(inputs, {compile.handle}, &outputs));
+      client.client()->Compile(computation_, argument_shape_pointers, options));
+  TF_ASSIGN_OR_RETURN(
+      DeviceAssignment device_assignment,
+      client.client()->backend().computation_placer()->AssignDevices(
+          options.num_replicas(), /*computation_count=*/1));
 
-  TF_ASSIGN_OR_RETURN(ProgramShape program_shape,
-                      computation().GetProgramShape());
-  int64 handle = outputs[0].scalar<int64>()();
-  return new CompiledXrtComputation(program_shape, handle, session_target);
+  return new LocalExecutable(std::move(local_executable),
+                             std::move(device_assignment), client.client());
 }
 
-const XlaComputation& LocalComputation::computation() const {
-  return computation_;
-}
+const XlaComputation& Computation::computation() const { return computation_; }
 
-string LocalComputation::GetSerializedProto() const {
+string Computation::GetSerializedProto() const {
   string result;
   if (!computation_.proto().SerializeToString(&result)) {
     LOG(ERROR) << "Failed to serialize the HloModuleProto.";
@@ -587,123 +371,171 @@ string LocalComputation::GetSerializedProto() const {
   return result;
 }
 
-StatusOr<Shape> LocalComputation::GetReturnValueShape() const {
-  return swig::GetReturnValueShape(computation_);
+StatusOr<string> Computation::GetHloText() const {
+  TF_ASSIGN_OR_RETURN(const HloModuleConfig module_config,
+                      HloModule::CreateModuleConfigFromProto(
+                          computation_.proto(), GetDebugOptionsFromFlags()));
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<HloModule> hlo_module,
+      HloModule::CreateFromProto(computation_.proto(), module_config));
+  HloPrintOptions options;
+  options = HloPrintOptions::ShortParsable();
+  options.set_print_large_constants(false);
+  return hlo_module->ToString(options);
+}
+
+StatusOr<string> Computation::GetHloDotGraph() const {
+  TF_ASSIGN_OR_RETURN(const HloModuleConfig module_config,
+                      HloModule::CreateModuleConfigFromProto(
+                          computation_.proto(), GetDebugOptionsFromFlags()));
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<HloModule> hlo_module,
+      HloModule::CreateFromProto(computation_.proto(), module_config));
+  hlo_graph_dumper::DotGraphOptions options;
+  options.debug_options = &hlo_module->config().debug_options();
+  return hlo_graph_dumper::HloComputationToDotGraph(
+      *hlo_module->entry_computation(), options);
+}
+
+StatusOr<ProgramShape> Computation::GetProgramShape() const {
+  return computation_.GetProgramShape();
+}
+
+StatusOr<Shape> Computation::GetReturnValueShape() const {
+  TF_ASSIGN_OR_RETURN(ProgramShape shape, computation_.GetProgramShape());
+  return std::move(*shape.mutable_result());
 }
 
 LocalOp::LocalOp(const XlaOp& op) : op_(op) {}
 
 const XlaOp& LocalOp::op() const { return op_; }
 
-LocalComputationBuilder::LocalComputationBuilder(const string& computation_name)
+ComputationBuilder::ComputationBuilder(const string& computation_name)
     : builder_(computation_name) {}
 
-void LocalComputationBuilder::SetOpMetadata(const OpMetadata& metadata) {
+void ComputationBuilder::SetOpMetadata(const OpMetadata& metadata) {
   builder_.SetOpMetadata(metadata);
 }
 
-void LocalComputationBuilder::ClearOpMetadata() { builder_.ClearOpMetadata(); }
+void ComputationBuilder::ClearOpMetadata() { builder_.ClearOpMetadata(); }
 
-StatusOr<LocalComputation*> LocalComputationBuilder::Build() {
+StatusOr<Computation*> ComputationBuilder::Build() {
   TF_ASSIGN_OR_RETURN(XlaComputation computation, builder_.Build());
-  return new LocalComputation(std::move(computation));
+  return new Computation(std::move(computation));
 }
 
-LocalOp LocalComputationBuilder::Parameter(int64 parameter_number,
-                                           const Shape& shape,
-                                           const string& name) {
+LocalOp ComputationBuilder::Parameter(int64 parameter_number,
+                                      const Shape& shape, const string& name) {
   return xla::Parameter(&builder_, parameter_number, shape, name);
 }
 
-StatusOr<LocalComputation*> LocalComputationBuilder::BuildWithRoot(
-    const LocalOp& root) {
+StatusOr<Computation*> ComputationBuilder::BuildWithRoot(const LocalOp& root) {
   TF_ASSIGN_OR_RETURN(XlaComputation computation, builder_.Build(root.op()));
-  return new LocalComputation(std::move(computation));
+  return new Computation(std::move(computation));
 }
 
-StatusOr<Shape> LocalComputationBuilder::GetShape(const LocalOp& operand) {
+StatusOr<Shape> ComputationBuilder::GetShape(const LocalOp& operand) {
   return builder_.GetShape(operand.op());
 }
 
-StatusOr<Shape> LocalComputationBuilder::GetReturnValueShape() {
+StatusOr<Shape> ComputationBuilder::GetReturnValueShape() {
   TF_ASSIGN_OR_RETURN(ProgramShape program_shape, builder_.GetProgramShape());
   return program_shape.result();
 }
 
-LocalOp LocalComputationBuilder::Infeed(const Shape& shape) {
+LocalOp ComputationBuilder::Infeed(const Shape& shape) {
   return xla::Infeed(&builder_, shape);
 }
 
-void LocalComputationBuilder::Outfeed(const LocalOp& operand,
-                                      const Shape& shape,
-                                      const string& outfeed_config) {
+void ComputationBuilder::Outfeed(const LocalOp& operand, const Shape& shape,
+                                 const string& outfeed_config) {
   xla::Outfeed(operand.op(), shape, outfeed_config);
 }
 
-LocalOp LocalComputationBuilder::ConstantLiteral(const Literal& literal) {
+LocalOp ComputationBuilder::ConstantLiteral(const Literal& literal) {
   return xla::ConstantLiteral(&builder_, literal);
 }
 
-LocalOp LocalComputationBuilder::Broadcast(
-    const LocalOp& operand, absl::Span<const int64> broadcast_sizes) {
+LocalOp ComputationBuilder::Iota(PrimitiveType element_type, int64 size) {
+  return xla::Iota(&builder_, element_type, size);
+}
+
+LocalOp ComputationBuilder::BroadcastedIota(const Shape& shape,
+                                            int64 dimension) {
+  return xla::Iota(&builder_, shape, dimension);
+}
+
+LocalOp ComputationBuilder::Broadcast(const LocalOp& operand,
+                                      absl::Span<const int64> broadcast_sizes) {
   return xla::Broadcast(operand.op(), broadcast_sizes);
 }
 
-LocalOp LocalComputationBuilder::BroadcastInDim(
+LocalOp ComputationBuilder::BroadcastInDim(
     const LocalOp& operand, absl::Span<const int64> out_dim_sizes,
     absl::Span<const int64> broadcast_dimensions) {
   return xla::BroadcastInDim(operand.op(), out_dim_sizes, broadcast_dimensions);
 }
 
-LocalOp LocalComputationBuilder::Pad(const LocalOp& operand,
-                                     const LocalOp& padding_value,
-                                     const PaddingConfig& padding_config) {
+LocalOp ComputationBuilder::Pad(const LocalOp& operand,
+                                const LocalOp& padding_value,
+                                const PaddingConfig& padding_config) {
   return xla::Pad(operand.op(), padding_value.op(), padding_config);
 }
 
-LocalOp LocalComputationBuilder::Reshape(const LocalOp& operand,
-                                         absl::Span<const int64> dimensions,
-                                         absl::Span<const int64> new_sizes) {
+LocalOp ComputationBuilder::Reshape(const LocalOp& operand,
+                                    absl::Span<const int64> dimensions,
+                                    absl::Span<const int64> new_sizes) {
   return xla::Reshape(operand.op(), dimensions, new_sizes);
 }
 
-LocalOp LocalComputationBuilder::Collapse(const LocalOp& operand,
-                                          absl::Span<const int64> dimensions) {
+LocalOp ComputationBuilder::Collapse(const LocalOp& operand,
+                                     absl::Span<const int64> dimensions) {
   return xla::Collapse(operand.op(), dimensions);
 }
 
-LocalOp LocalComputationBuilder::CrossReplicaSum(const LocalOp& operand) {
-  return xla::CrossReplicaSum(operand.op());
+LocalOp ComputationBuilder::AllToAll(
+    const LocalOp& operand, int64 split_dimension, int64 concat_dimension,
+    int64 split_count, absl::Span<const ReplicaGroup> replica_groups) {
+  std::vector<ReplicaGroup> rg(replica_groups.size());
+  for (int i = 0; i < replica_groups.size(); ++i) {
+    rg.push_back(replica_groups[i]);
+  }
+  return xla::AllToAll(operand.op(), split_dimension, concat_dimension,
+                       split_count, rg);
+}
+
+LocalOp ComputationBuilder::CrossReplicaSum(
+    const LocalOp& operand, absl::Span<const ReplicaGroup> replica_groups) {
+  return xla::CrossReplicaSum(operand.op(), replica_groups);
 }
 
-LocalOp LocalComputationBuilder::Slice(const LocalOp& operand,
-                                       absl::Span<const int64> start_indices,
-                                       absl::Span<const int64> limit_indices,
-                                       absl::Span<const int64> strides) {
+LocalOp ComputationBuilder::Slice(const LocalOp& operand,
+                                  absl::Span<const int64> start_indices,
+                                  absl::Span<const int64> limit_indices,
+                                  absl::Span<const int64> strides) {
   return xla::Slice(operand.op(), start_indices, limit_indices, strides);
 }
 
-LocalOp LocalComputationBuilder::SliceInDim(const LocalOp& operand,
-                                            int64 start_index,
-                                            int64 limit_index, int64 stride,
-                                            int64 dimno) {
+LocalOp ComputationBuilder::SliceInDim(const LocalOp& operand,
+                                       int64 start_index, int64 limit_index,
+                                       int64 stride, int64 dimno) {
   return xla::SliceInDim(operand.op(), start_index, limit_index, stride, dimno);
 }
 
-LocalOp LocalComputationBuilder::DynamicSlice(
-    const LocalOp& operand, const LocalOp& start_indices,
-    absl::Span<const int64> slice_sizes) {
+LocalOp ComputationBuilder::DynamicSlice(const LocalOp& operand,
+                                         const LocalOp& start_indices,
+                                         absl::Span<const int64> slice_sizes) {
   return xla::DynamicSlice(operand.op(), start_indices.op(), slice_sizes);
 }
 
-LocalOp LocalComputationBuilder::DynamicUpdateSlice(
-    const LocalOp& operand, const LocalOp& update,
-    const LocalOp& start_indices) {
+LocalOp ComputationBuilder::DynamicUpdateSlice(const LocalOp& operand,
+                                               const LocalOp& update,
+                                               const LocalOp& start_indices) {
   return xla::DynamicUpdateSlice(operand.op(), update.op(), start_indices.op());
 }
 
-LocalOp LocalComputationBuilder::ConcatInDim(absl::Span<const LocalOp> operands,
-                                             int64 dimension) {
+LocalOp ComputationBuilder::ConcatInDim(absl::Span<const LocalOp> operands,
+                                        int64 dimension) {
   std::vector<XlaOp> xla_ops;
   xla_ops.reserve(operands.size());
   for (const auto& op : operands) {
@@ -712,18 +544,18 @@ LocalOp LocalComputationBuilder::ConcatInDim(absl::Span<const LocalOp> operands,
   return xla::ConcatInDim(&builder_, xla_ops, dimension);
 }
 
-LocalOp LocalComputationBuilder::SelectAndScatterWithGeneralPadding(
-    const LocalOp& operand, const LocalComputation& select,
+LocalOp ComputationBuilder::SelectAndScatterWithGeneralPadding(
+    const LocalOp& operand, const Computation& select,
     absl::Span<const int64> window_dimensions,
     absl::Span<const int64> window_strides,
     absl::Span<const std::pair<int64, int64>> padding, const LocalOp& source,
-    const LocalOp& init_value, const LocalComputation& scatter) {
+    const LocalOp& init_value, const Computation& scatter) {
   return xla::SelectAndScatterWithGeneralPadding(
       operand.op(), select.computation(), window_dimensions, window_strides,
       padding, source.op(), init_value.op(), scatter.computation());
 }
 
-LocalOp LocalComputationBuilder::Tuple(absl::Span<const LocalOp> elements) {
+LocalOp ComputationBuilder::Tuple(absl::Span<const LocalOp> elements) {
   std::vector<XlaOp> xla_ops;
   xla_ops.reserve(elements.size());
   for (const auto& op : elements) {
@@ -733,22 +565,22 @@ LocalOp LocalComputationBuilder::Tuple(absl::Span<const LocalOp> elements) {
   return xla::Tuple(&builder_, xla_ops);
 }
 
-LocalOp LocalComputationBuilder::GetTupleElement(const LocalOp& tuple_data,
-                                                 int64 index) {
+LocalOp ComputationBuilder::GetTupleElement(const LocalOp& tuple_data,
+                                            int64 index) {
   return xla::GetTupleElement(tuple_data.op(), index);
 }
 
-LocalOp LocalComputationBuilder::Dot(const LocalOp& lhs, const LocalOp& rhs) {
+LocalOp ComputationBuilder::Dot(const LocalOp& lhs, const LocalOp& rhs) {
   return xla::Dot(lhs.op(), rhs.op());
 }
 
-LocalOp LocalComputationBuilder::DotGeneral(
+LocalOp ComputationBuilder::DotGeneral(
     const LocalOp& lhs, const LocalOp& rhs,
     const DotDimensionNumbers& dimension_numbers) {
   return xla::DotGeneral(lhs.op(), rhs.op(), dimension_numbers);
 }
 
-LocalOp LocalComputationBuilder::ConvGeneralDilated(
+LocalOp ComputationBuilder::ConvGeneralDilated(
     const LocalOp& lhs, const LocalOp& rhs,
     absl::Span<const int64> window_strides,
     absl::Span<const std::pair<int64, int64>> padding,
@@ -760,18 +592,18 @@ LocalOp LocalComputationBuilder::ConvGeneralDilated(
                                  feature_group_count);
 }
 
-LocalOp LocalComputationBuilder::ConvertElementType(
-    const LocalOp& operand, PrimitiveType new_element_type) {
+LocalOp ComputationBuilder::ConvertElementType(const LocalOp& operand,
+                                               PrimitiveType new_element_type) {
   return xla::ConvertElementType(operand.op(), new_element_type);
 }
 
-LocalOp LocalComputationBuilder::BitcastConvertType(
-    const LocalOp& operand, PrimitiveType new_element_type) {
+LocalOp ComputationBuilder::BitcastConvertType(const LocalOp& operand,
+                                               PrimitiveType new_element_type) {
   return xla::BitcastConvertType(operand.op(), new_element_type);
 }
 
-LocalOp LocalComputationBuilder::Call(const LocalComputation& local_computation,
-                                      absl::Span<const LocalOp> operands) {
+LocalOp ComputationBuilder::Call(const Computation& local_computation,
+                                 absl::Span<const LocalOp> operands) {
   std::vector<XlaOp> xla_ops;
   xla_ops.reserve(operands.size());
   for (const auto& op : operands) {
@@ -780,19 +612,34 @@ LocalOp LocalComputationBuilder::Call(const LocalComputation& local_computation,
   return xla::Call(&builder_, local_computation.computation(), xla_ops);
 }
 
-LocalOp LocalComputationBuilder::Transpose(
-    const LocalOp& operand, absl::Span<const int64> permutation) {
+LocalOp ComputationBuilder::CustomCall(
+    const string& call_target_name, absl::Span<const LocalOp> operands,
+    const Shape& shape_with_layout,
+    const std::vector<Shape>& operand_shapes_with_layout,
+    const string& opaque) {
+  std::vector<XlaOp> xla_ops;
+  xla_ops.reserve(operands.size());
+  for (const auto& op : operands) {
+    xla_ops.push_back(op.op());
+  }
+  return xla::CustomCallWithLayout(&builder_, call_target_name, xla_ops,
+                                   shape_with_layout,
+                                   operand_shapes_with_layout, opaque);
+}
+
+LocalOp ComputationBuilder::Transpose(const LocalOp& operand,
+                                      absl::Span<const int64> permutation) {
   return xla::Transpose(operand.op(), permutation);
 }
 
-LocalOp LocalComputationBuilder::Rev(const LocalOp& operand,
-                                     absl::Span<const int64> dimensions) {
+LocalOp ComputationBuilder::Rev(const LocalOp& operand,
+                                absl::Span<const int64> dimensions) {
   return xla::Rev(operand.op(), dimensions);
 }
 
-LocalOp LocalComputationBuilder::Map(absl::Span<const LocalOp> operands,
-                                     const LocalComputation& local_computation,
-                                     absl::Span<const int64> dimensions) {
+LocalOp ComputationBuilder::Map(absl::Span<const LocalOp> operands,
+                                const Computation& local_computation,
+                                absl::Span<const int64> dimensions) {
   std::vector<XlaOp> xla_ops;
   xla_ops.reserve(operands.size());
   for (const auto& op : operands) {
@@ -803,17 +650,17 @@ LocalOp LocalComputationBuilder::Map(absl::Span<const LocalOp> operands,
                   dimensions);
 }
 
-LocalOp LocalComputationBuilder::Reduce(
+LocalOp ComputationBuilder::Reduce(
     const LocalOp& operand, const LocalOp& init_value,
-    const LocalComputation& local_computation,
+    const Computation& local_computation,
     absl::Span<const int64> dimensions_to_reduce) {
   return xla::Reduce(operand.op(), init_value.op(),
                      local_computation.computation(), dimensions_to_reduce);
 }
 
-LocalOp LocalComputationBuilder::ReduceWindowWithGeneralPadding(
+LocalOp ComputationBuilder::ReduceWindowWithGeneralPadding(
     const LocalOp& operand, const LocalOp& init_value,
-    const LocalComputation& local_computation,
+    const Computation& local_computation,
     absl::Span<const int64> window_dimensions,
     absl::Span<const int64> window_strides,
     absl::Span<const int64> base_dilations,
@@ -825,56 +672,92 @@ LocalOp LocalComputationBuilder::ReduceWindowWithGeneralPadding(
       padding);
 }
 
-LocalOp LocalComputationBuilder::RngNormal(const LocalOp& mu,
-                                           const LocalOp& sigma,
-                                           const Shape& shape) {
+LocalOp ComputationBuilder::RngNormal(const LocalOp& mu, const LocalOp& sigma,
+                                      const Shape& shape) {
   return xla::RngNormal(mu.op(), sigma.op(), shape);
 }
 
-LocalOp LocalComputationBuilder::RngUniform(const LocalOp& a, const LocalOp& b,
-                                            const Shape& shape) {
+LocalOp ComputationBuilder::RngUniform(const LocalOp& a, const LocalOp& b,
+                                       const Shape& shape) {
   return xla::RngUniform(a.op(), b.op(), shape);
 }
 
-LocalOp LocalComputationBuilder::While(const LocalComputation& condition,
-                                       const LocalComputation& body,
-                                       const LocalOp& init) {
+LocalOp ComputationBuilder::While(const Computation& condition,
+                                  const Computation& body,
+                                  const LocalOp& init) {
   return xla::While(condition.computation(), body.computation(), init.op());
 }
 
-LocalOp LocalComputationBuilder::Conditional(
-    const LocalOp& predicate, const LocalOp& true_operand,
-    const LocalComputation& true_computation, const LocalOp& false_operand,
-    const LocalComputation& false_computation) {
+LocalOp ComputationBuilder::Conditional(const LocalOp& predicate,
+                                        const LocalOp& true_operand,
+                                        const Computation& true_computation,
+                                        const LocalOp& false_operand,
+                                        const Computation& false_computation) {
   return xla::Conditional(predicate.op(), true_operand.op(),
                           true_computation.computation(), false_operand.op(),
                           false_computation.computation());
 }
 
-StatusOr<bool> LocalComputationBuilder::IsConstant(const LocalOp& operand) {
+StatusOr<bool> ComputationBuilder::IsConstant(const LocalOp& operand) {
   return builder_.IsConstant(operand.op());
 }
 
-LocalOp LocalComputationBuilder::Sort(const LocalOp& operand, int64 dimension) {
+LocalOp ComputationBuilder::Sort(const LocalOp& operand, int64 dimension) {
   return xla::Sort(operand.op(), {}, dimension);
 }
 
-LocalOp LocalComputationBuilder::SortKeyVal(const LocalOp& keys,
-                                            const LocalOp& values,
-                                            int64 dimension) {
+LocalOp ComputationBuilder::SortKeyVal(const LocalOp& keys,
+                                       const LocalOp& values, int64 dimension) {
   return xla::Sort(keys.op(), {values.op()}, dimension);
 }
 
-StatusOr<LocalComputation*> LocalComputationBuilder::BuildConstantSubGraph(
+LocalOp ComputationBuilder::Cholesky(const LocalOp& a) {
+  return xla::Cholesky(a.op());
+}
+
+LocalOp ComputationBuilder::QR(const LocalOp& a, bool full_matrices) {
+  XlaBuilder* builder = a.op().builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(auto qr, xla::QRDecomposition(a.op(), full_matrices));
+    return xla::Tuple(builder, {qr.q, qr.r});
+  });
+}
+
+LocalOp ComputationBuilder::TriangularSolve(const LocalOp& a, const LocalOp& b,
+                                            bool left_side, bool lower,
+                                            bool unit_diagonal,
+                                            int transpose_a) {
+  return xla::TriangularSolve(
+      a.op(), b.op(), left_side, lower, unit_diagonal,
+      xla::TriangularSolveOptions::Transpose(transpose_a));
+}
+
+LocalOp ComputationBuilder::Gather(
+    const LocalOp& input, const LocalOp& start_indices,
+    const GatherDimensionNumbers& dimension_numbers,
+    absl::Span<const int64> slice_sizes) {
+  return xla::Gather(input.op(), start_indices.op(), dimension_numbers,
+                     slice_sizes);
+}
+
+LocalOp ComputationBuilder::Scatter(
+    const LocalOp& input, const LocalOp& scatter_indices,
+    const LocalOp& updates, const Computation& update_computation,
+    const ScatterDimensionNumbers& dimension_numbers) {
+  return xla::Scatter(input.op(), scatter_indices.op(), updates.op(),
+                      update_computation.computation(), dimension_numbers);
+}
+
+StatusOr<Computation*> ComputationBuilder::BuildConstantSubGraph(
     const LocalOp& operand) {
   TF_ASSIGN_OR_RETURN(XlaComputation computation,
                       builder_.BuildConstantSubGraph(operand.op()));
-  return new LocalComputation(std::move(computation));
+  return new Computation(std::move(computation));
 }
 
-#define _FORWARD(method_name, return_sig, args_sig, args)    \
-  return_sig LocalComputationBuilder::method_name args_sig { \
-    return xla::method_name args;                            \
+#define _FORWARD(method_name, return_sig, args_sig, args) \
+  return_sig ComputationBuilder::method_name args_sig {   \
+    return xla::method_name args;                         \
   }
 
 #define _FORWARD_UNOP(method_name) \
@@ -916,6 +799,7 @@ _FORWARD_BINOP(Atan2)
 _FORWARD_BINOP(Pow)
 _FORWARD_BINOP(Complex)
 _FORWARD_UNOP(Not)
+_FORWARD_UNOP(Clz)
 _FORWARD_UNOP(Abs)
 _FORWARD_UNOP(Exp)
 _FORWARD_UNOP(Expm1)
@@ -961,108 +845,9 @@ void DeleteLocalShapedBuffer(LocalShapedBuffer* local_shaped_buffer) {
   delete local_shaped_buffer;
 }
 
-void DeleteXrtAllocation(XrtAllocation* allocation) { delete allocation; }
-
-void DeleteCompiledLocalComputation(CompiledLocalComputation* computation) {
-  delete computation;
-}
-
-void DeleteCompiledXrtComputation(CompiledXrtComputation* computation) {
-  delete computation;
-}
-
-void DeleteLocalComputation(LocalComputation* computation) {
-  delete computation;
-}
-
-StatusOr<LocalShapedBufferTuple*> DestructureLocalShapedBufferTuple(
-    LocalShapedBuffer* local_shaped_buffer) {
-  const Shape tuple_shape = local_shaped_buffer->shape();
-
-  if (!ShapeUtil::IsTuple(tuple_shape)) {
-    return InvalidArgument(
-        "Attemped to destructure a LocalShapedBuffer that did not have a tuple "
-        "shape; shape: %s",
-        ShapeUtil::HumanString(tuple_shape));
-  }
+void DeleteLocalExecutable(LocalExecutable* computation) { delete computation; }
 
-  DeviceMemoryAllocator* allocator =
-      local_shaped_buffer->shaped_buffer()->memory_allocator();
-  ShapedBuffer tuple_buffer = local_shaped_buffer->Release();
-
-  // Extract some metadata we use to construct scoped buffers.
-  const se::Platform* platform = tuple_buffer.platform();
-  int device_ordinal = tuple_buffer.device_ordinal();
-
-  ShapeTree<se::DeviceMemoryBase>& shape_tree = tuple_buffer.buffers();
-  std::vector<LocalShapedBuffer*> results;
-  for (int64 i = 0; i < ShapeUtil::TupleElementCount(tuple_shape); ++i) {
-    // Create a shaped buffer for this destructured tuple element.
-    const Shape& subshape = ShapeUtil::GetSubshape(tuple_shape, {i});
-    VLOG(3) << "Starting tuple element " << i << " subshape: " << subshape;
-    ShapedBuffer shaped_buffer(subshape, subshape, platform, device_ordinal);
-
-    ShapeUtil::ForEachSubshape(
-        subshape, [&](const Shape& s, const ShapeIndex& index) {
-          ShapeIndex original(index);
-          original.push_front(i);
-          se::DeviceMemoryBase* device_memory =
-              shape_tree.mutable_element(original);
-          shaped_buffer.set_buffer(*device_memory, index);
-          *device_memory = se::DeviceMemoryBase();
-        });
-
-    VLOG(3) << "Completed tuple element: " << i;
-    results.push_back(new LocalShapedBuffer(
-        ScopedShapedBuffer(std::move(shaped_buffer), allocator)));
-  }
-  // Deallocate the root buffer.
-  se::DeviceMemoryBase root_buffer = tuple_buffer.root_buffer();
-  TF_RETURN_IF_ERROR(allocator->Deallocate(device_ordinal, root_buffer));
-  return new LocalShapedBufferTuple(std::move(results));
-}
-
-StatusOr<XrtAllocationTuple*> DestructureXrtAllocationTuple(
-    XrtAllocation* allocation, const string& session_target) {
-  const Shape& tuple_shape = allocation->shape();
-
-  if (!ShapeUtil::IsTuple(tuple_shape)) {
-    return InvalidArgument(
-        "Attemped to destructure a LocalShapedBuffer that did not have a tuple "
-        "shape; shape: %s",
-        ShapeUtil::HumanString(tuple_shape));
-  }
-
-  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
-  auto base_handle = tensorflow::ops::Placeholder(root, tensorflow::DT_INT64);
-  auto shape_index = tensorflow::ops::Placeholder(root, tensorflow::DT_INT32);
-  auto subtuple = tensorflow::ops::XRTSubTuple(root, base_handle, shape_index);
-  TF_RETURN_IF_ERROR(root.status());
-
-  tensorflow::ClientSession session(root, session_target);
-  tensorflow::ClientSession::FeedType inputs;
-  std::vector<XrtAllocation*> results;
-  for (int32 i = 0; i < ShapeUtil::TupleElementCount(tuple_shape); ++i) {
-    inputs.clear();
-    inputs.insert({base_handle, allocation->handle()});
-    inputs.insert({shape_index, {i}});
-    std::vector<tensorflow::Tensor> outputs;
-    auto status = session.Run(inputs, {subtuple}, &outputs);
-    if (!status.ok()) {
-      // Clean up before returning non-ok status.
-      for (int j = 0; j < results.size(); ++j) {
-        delete results[j];
-      }
-      return status;
-    }
-    const int64 subtuple_handle = outputs[0].scalar<int64>()();
-    const Shape& subtuple_shape =
-        ShapeUtil::GetTupleElementShape(tuple_shape, i);
-    results.push_back(
-        new XrtAllocation(subtuple_handle, subtuple_shape, session_target));
-  }
-  return new XrtAllocationTuple(std::move(results));
-}
+void DeleteComputation(Computation* computation) { delete computation; }
 
 }  // namespace swig
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.h b/tensorflow/compiler/xla/python/local_computation_builder.h
index 149e44570df5c6a3df88bbe2ffa779be47842d82..66b1cce7fb598388af40940ea2ed52ac2f8ee8e1 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.h
+++ b/tensorflow/compiler/xla/python/local_computation_builder.h
@@ -19,10 +19,9 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include <Python.h>
+
 #include "absl/types/span.h"
-#include "tensorflow/cc/framework/ops.h"
-#include "tensorflow/cc/framework/scope.h"
-#include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/executable_build_options.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
@@ -33,37 +32,42 @@ limitations under the License.
 namespace xla {
 namespace swig {
 
-// Initializes the number of replicas that XLA will be initialized with (when
-// first obtaining a handle to the local XLA service). If this is called after
-// the handle to the local XLA service has been established, then an error is
-// returned.
-Status InitializeReplicaCount(int replica_count);
-
-// Initializes the platform name that XLA will be initialized with (when
-// first obtaining a handle to the local XLA service). If this is called after
-// the handle to the local XLA service has been established, then an error is
-// returned.
-Status InitializePlatformName(const string& platform_name);
-
-// Returns the replica count that is currently set, regardless of whether the
-// local XLA service has been instantiated yet or not.
-int GetReplicaCount();
-
-// Wraps the local client's infeed-transfer function.
-//
-// The default device ordinal (0) is used.
-Status TransferToInfeedLocal(const Literal& literal);
-
-// Transfers the given literal to the infeed of the given replica.
-//
-// The replica number is resolved to an appropriate device ordinal.
-Status TransferToInfeedLocalReplica(const Literal& literal, int replica_number);
-
-// Transfers a literal of the given shape from the outfeed of the given replica.
-//
-// The replica number is resolved to an appropriate device ordinal.
-StatusOr<Literal> TransferFromOutfeedLocalReplica(const Shape& shape,
-                                                  int replica_number);
+// Registers a 'fn_capsule' as a CPU custom call target.
+// 'fn_capsule' is a void* pointer encapsulated in a PyCapsule object, with name
+// "xla._CPU_CUSTOM_CALL_TARGET".
+Status RegisterCpuCustomCallTarget(const string& name, PyObject* fn_capsule);
+
+// Wrapper around an xla::LocalClient.
+class LocalClient {
+ public:
+  // Initializes a local XLA client for `platform_name`. Returns an error if no
+  /// such platform exists, or if the platform has no visible devices.
+  static StatusOr<LocalClient> Get(const string& platform_name);
+
+  // Copyable and moveable; the class is just a wrapper around a
+  // xla::LocalClient pointer for convenient SWIG wrapping.
+
+  // Returns the number of devices known to the XLA client.
+  int DeviceCount() const;
+
+  // Wraps the local client's infeed-transfer function.
+  //
+  // The default device ordinal (0) is used.
+  Status TransferToInfeed(const Literal& literal, int device_ordinal);
+
+  // Transfers a literal of the given shape from the outfeed of the given
+  // replica.
+  StatusOr<Literal> TransferFromOutfeed(const Shape& shape, int device_ordinal);
+
+  xla::LocalClient* client() const { return client_; }
+
+ private:
+  LocalClient(xla::LocalClient* client);
+
+  xla::LocalClient* client_;
+};
+
+class LocalShapedBufferTuple;
 
 // Represents a reference to literals that live in a device-allocated buffer via
 // XLA. Specifically, wraps a ScopedShapedBuffer produced by transferring a
@@ -72,9 +76,9 @@ class LocalShapedBuffer {
  public:
   static StatusOr<LocalShapedBuffer*> FromLiteral(
       const Literal& argument, const absl::optional<Shape>& shape_with_layout,
-      int replica_number);
+      const LocalClient& client, int device_ordinal);
 
-  LocalShapedBuffer(ScopedShapedBuffer shaped_buffer);
+  LocalShapedBuffer(ScopedShapedBuffer shaped_buffer, xla::LocalClient* client);
   StatusOr<Literal> ToLiteral() const;
   const Shape& shape() const;
   const ScopedShapedBuffer* shaped_buffer() const;
@@ -83,8 +87,13 @@ class LocalShapedBuffer {
   // analogous to std::unique_ptr::release().
   ShapedBuffer Release();
 
+  // Destructures a tuple-valued LocalShapedBuffer into its constitutent
+  // elements in LocalShapedBufferTuple form.
+  StatusOr<LocalShapedBufferTuple*> DestructureTuple();
+
  private:
   ScopedShapedBuffer shaped_buffer_;
+  xla::LocalClient* client_;
 };
 
 // Result of a tuple destructuring operation on a LocalShapedBuffer -- this
@@ -110,68 +119,20 @@ class LocalShapedBufferTuple {
   std::vector<LocalShapedBuffer*> elements_;
 };
 
-// Destructures a tuple-valued LocalShapedBuffer into its constitutent elements
-// in LocalShapedBufferTuple form.
-StatusOr<LocalShapedBufferTuple*> DestructureLocalShapedBufferTuple(
-    LocalShapedBuffer* local_shaped_buffer);
-
-// Represents a reference to literals that live in a device-allocated buffer via
-// XRT. Specifically, wraps an int64 handle produced by running the allocation
-// graph, and an XLA shape to track the referent's shape.
-class XrtAllocation {
- public:
-  // Accepts a `session_target` argument, used in constructing the
-  // `tensorflow::ClientSession` instance in which allocation and deallocation
-  // graphs are run.
-  static StatusOr<XrtAllocation*> FromLiteral(const Literal& argument,
-                                              const string& session_target);
-
-  XrtAllocation(int64 handle, Shape shape, const string& session_target);
-  ~XrtAllocation();
-  StatusOr<Literal> ToLiteral() const;
-  const Shape& shape() const;
-  const int64 handle() const;
-
- private:
-  const int64 handle_;
-  const Shape shape_;
-  const string session_target_;
-};
-
-// Result of a tuple destructuring operation on an XrtAllocation.
-class XrtAllocationTuple {
- public:
-  // Note: any XrtAllocation elements that are not Release()'d will be
-  // deallocated in the destructor.
-  explicit XrtAllocationTuple(std::vector<XrtAllocation*> elements);
-
-  ~XrtAllocationTuple();
-
-  // Releases the ith element to the caller. Further attempts to release the ith
-  // element will return an invalid argument error.
-  StatusOr<XrtAllocation*> Release(int i);
-
-  // Returns the number of elements in the destructured tuple.
-  int64 size() const;
-
- private:
-  std::vector<XrtAllocation*> elements_;
-};
-
-// Destructures a tuple-valued XrtAllocation into its constitutent elements
-// in XrtAllocationTuple form.
-//
-// Accepts a `session_target` argument, used in constructing the
-// `tensorflow::ClientSession` instance in which the sub-tupling graph is run,
-// and passed along in constructing each constituent XrtAllocation.
-StatusOr<XrtAllocationTuple*> DestructureXrtAllocationTuple(
-    XrtAllocation* allocation, const string& session_target);
-
 // Represents a compiled computation that can be executed given handles to
 // device-allocated literals. Specifically, wraps an XLA LocalExecutable.
-class CompiledLocalComputation {
+class LocalExecutable {
  public:
-  CompiledLocalComputation(std::unique_ptr<LocalExecutable> executable);
+  LocalExecutable(std::unique_ptr<xla::LocalExecutable> executable,
+                  xla::DeviceAssignment device_assignment,
+                  xla::LocalClient* client);
+
+  int num_replicas() const {
+    return executable_->build_options().num_replicas();
+  }
+
+  // Returns the device ordinals to which each replica is assigned.
+  std::vector<int> DeviceOrdinals() const;
 
   StatusOr<LocalShapedBuffer*> Execute(
       absl::Span<LocalShapedBuffer* const> argument_handles);
@@ -183,47 +144,22 @@ class CompiledLocalComputation {
       absl::Span<const std::vector<LocalShapedBuffer*> > argument_handles);
 
  private:
-  std::unique_ptr<LocalExecutable> executable_;
-};
-
-// Represents a compiled computation that can be executed given handles to
-// device-allocated literals. Specifically, wraps an XRT computation handle.
-class CompiledXrtComputation {
- public:
-  // Accepts a `session_target` argument, used in constructing the
-  // `tensorflow::ClientSession` instance in which the execution graph is run.
-  CompiledXrtComputation(const ProgramShape& program_shape, int64 handle,
-                         const string& session_target);
-  ~CompiledXrtComputation();
-
-  StatusOr<XrtAllocation*> Execute(
-      absl::Span<XrtAllocation* const> argument_handles);
-
-  const ProgramShape& program_shape() const;
-  int64 handle() const;
-
- private:
-  const ProgramShape program_shape_;
-  const int64 handle_;
-  const string session_target_;
+  const std::unique_ptr<xla::LocalExecutable> executable_;
+  const xla::DeviceAssignment device_assignment_;
+  xla::LocalClient* const client_;
 };
 
-// Wraps a XlaComputation produced by a LocalComputationBuilder. The
+// Wraps a XlaComputation produced by a ComputationBuilder. The
 // Compile method compiles the computation to a (local) executable via
 // the client library's local client. This class is intended to be
 // made available to Python via SWIG.
-class LocalComputation {
+class Computation {
  public:
-  LocalComputation(XlaComputation computation);
+  Computation(XlaComputation computation);
 
-  StatusOr<CompiledLocalComputation*> Compile(
+  StatusOr<LocalExecutable*> Compile(
       const std::vector<Shape>& argument_shapes,
-      const ExecutableBuildOptions* build_options);
-
-  // Accepts a `session_target` argument, used in constructing the
-  // `tensorflow::ClientSession` instance in which the compilation graph is run.
-  StatusOr<CompiledXrtComputation*> CompileForXrt(
-      const std::vector<Shape>& argument_shapes, const string& session_target);
+      const ExecutableBuildOptions* build_options, const LocalClient& client);
 
   const XlaComputation& computation() const;
 
@@ -232,6 +168,15 @@ class LocalComputation {
   // string on failure.
   string GetSerializedProto() const;
 
+  // Returns the computation in human-readable HLO text format.
+  StatusOr<string> GetHloText() const;
+
+  // Returns the computation in graphviz dot format.
+  StatusOr<string> GetHloDotGraph() const;
+
+  // Returns the program shape for this computation.
+  StatusOr<ProgramShape> GetProgramShape() const;
+
   // Returns the return-value shape for this computation.
   StatusOr<Shape> GetReturnValueShape() const;
 
@@ -239,7 +184,7 @@ class LocalComputation {
   XlaComputation computation_;
 };
 
-// Wraps a XlaOp produced by a LocalComputationBuilder. This class is intended
+// Wraps a XlaOp produced by a ComputationBuilder. This class is intended
 // to be made available to Python via SWIG.
 class LocalOp {
  public:
@@ -256,20 +201,20 @@ class LocalOp {
 //   Python.
 // - Set up the underlying builder to use the client library's
 //   LocalClient.
-// - Wrap Computations in LocalComputations for Python access.
-// - Correspondingly unwrap incoming LocalComputations.
-class LocalComputationBuilder {
+// - Wrap Computations in Computations for Python access.
+// - Correspondingly unwrap incoming Computations.
+class ComputationBuilder {
  public:
-  LocalComputationBuilder(const string& computation_name);
+  ComputationBuilder(const string& computation_name);
 
   void SetOpMetadata(const OpMetadata& metadata);
   void ClearOpMetadata();
 
-  // Returns an owned LocalComputation to the caller on success.
-  StatusOr<LocalComputation*> Build();
+  // Returns an owned Computation to the caller on success.
+  StatusOr<Computation*> Build();
 
-  // Returns an owned LocalComputation to the caller on success with given root.
-  StatusOr<LocalComputation*> BuildWithRoot(const LocalOp& root);
+  // Returns an owned Computation to the caller on success with given root.
+  StatusOr<Computation*> BuildWithRoot(const LocalOp& root);
 
   LocalOp Parameter(int64 parameter_number, const Shape& shape,
                     const string& name);
@@ -286,6 +231,10 @@ class LocalComputationBuilder {
 
   LocalOp ConstantLiteral(const Literal& literal);
 
+  LocalOp Iota(PrimitiveType element_type, int64 size);
+
+  LocalOp BroadcastedIota(const Shape& shape, int64 dimension);
+
   LocalOp Broadcast(const LocalOp& operand,
                     absl::Span<const int64> broadcast_sizes);
 
@@ -301,7 +250,12 @@ class LocalComputationBuilder {
 
   LocalOp Collapse(const LocalOp& operand, absl::Span<const int64> dimensions);
 
-  LocalOp CrossReplicaSum(const LocalOp& operand);
+  LocalOp AllToAll(const LocalOp& operand, int64 split_dimension,
+                   int64 concat_dimension, int64 split_count,
+                   absl::Span<const ReplicaGroup> replica_groups);
+
+  LocalOp CrossReplicaSum(const LocalOp& operand,
+                          absl::Span<const ReplicaGroup> replica_groups);
 
   LocalOp Slice(const LocalOp& operand, absl::Span<const int64> start_indices,
                 absl::Span<const int64> limit_indices,
@@ -319,11 +273,11 @@ class LocalComputationBuilder {
   LocalOp ConcatInDim(absl::Span<const LocalOp> operands, int64 dimension);
 
   LocalOp SelectAndScatterWithGeneralPadding(
-      const LocalOp& operand, const LocalComputation& select,
+      const LocalOp& operand, const Computation& select,
       absl::Span<const int64> window_dimensions,
       absl::Span<const int64> window_strides,
       absl::Span<const std::pair<int64, int64> > padding, const LocalOp& source,
-      const LocalOp& init_value, const LocalComputation& scatter);
+      const LocalOp& init_value, const Computation& scatter);
 
   LocalOp Tuple(absl::Span<const LocalOp> elements);
 
@@ -349,25 +303,31 @@ class LocalComputationBuilder {
   LocalOp BitcastConvertType(const LocalOp& operand,
                              PrimitiveType new_element_type);
 
-  LocalOp Call(const LocalComputation& local_computation,
+  LocalOp Call(const Computation& local_computation,
                absl::Span<const LocalOp> operands);
 
+  LocalOp CustomCall(const string& call_target_name,
+                     absl::Span<const LocalOp> operands,
+                     const Shape& shape_with_layout,
+                     const std::vector<Shape>& operand_shapes_with_layout,
+                     const string& opaque);
+
   LocalOp Transpose(const LocalOp& operand,
                     absl::Span<const int64> permutation);
 
   LocalOp Rev(const LocalOp& operand, absl::Span<const int64> dimensions);
 
   LocalOp Map(absl::Span<const LocalOp> operands,
-              const LocalComputation& local_computation,
+              const Computation& local_computation,
               absl::Span<const int64> dimensions);
 
   LocalOp Reduce(const LocalOp& operand, const LocalOp& init_value,
-                 const LocalComputation& local_computation,
+                 const Computation& local_computation,
                  absl::Span<const int64> dimensions_to_reduce);
 
   LocalOp ReduceWindowWithGeneralPadding(
       const LocalOp& operand, const LocalOp& init_value,
-      const LocalComputation& local_computation,
+      const Computation& local_computation,
       absl::Span<const int64> window_dimensions,
       absl::Span<const int64> window_strides,
       absl::Span<const int64> base_dilations,
@@ -379,13 +339,13 @@ class LocalComputationBuilder {
 
   LocalOp RngUniform(const LocalOp& a, const LocalOp& b, const Shape& shape);
 
-  LocalOp While(const LocalComputation& condition, const LocalComputation& body,
+  LocalOp While(const Computation& condition, const Computation& body,
                 const LocalOp& init);
 
   LocalOp Conditional(const LocalOp& predicate, const LocalOp& true_operand,
-                      const LocalComputation& true_computation,
+                      const Computation& true_computation,
                       const LocalOp& false_operand,
-                      const LocalComputation& false_computation);
+                      const Computation& false_computation);
 
   StatusOr<bool> IsConstant(const LocalOp& operand);
 
@@ -394,7 +354,25 @@ class LocalComputationBuilder {
   LocalOp SortKeyVal(const LocalOp& keys, const LocalOp& values,
                      int64 dimension);
 
-  StatusOr<LocalComputation*> BuildConstantSubGraph(const LocalOp& operand);
+  LocalOp QR(const LocalOp& a, bool full_matrices);
+
+  LocalOp Cholesky(const LocalOp& a);
+
+  // `transpose_a` is the integer value of a TriangularSolveOptions::Transpose
+  // enum. We use an integer here so we don't have to teach SWIG about the
+  // enum.
+  LocalOp TriangularSolve(const LocalOp& a, const LocalOp& b, bool left_side,
+                          bool lower, bool unit_diagonal, int transpose_a);
+
+  LocalOp Gather(const LocalOp& input, const LocalOp& start_indices,
+                 const GatherDimensionNumbers& dimension_numbers,
+                 absl::Span<const int64> slice_sizes);
+
+  LocalOp Scatter(const LocalOp& input, const LocalOp& scatter_indices,
+                  const LocalOp& updates, const Computation& update_computation,
+                  const ScatterDimensionNumbers& dimension_numbers);
+
+  StatusOr<Computation*> BuildConstantSubGraph(const LocalOp& operand);
 
 #define _FORWARD(method_name, return_sig, args_sig) \
   return_sig method_name args_sig;
@@ -436,6 +414,7 @@ class LocalComputationBuilder {
   _FORWARD_BINOP(Pow)
   _FORWARD_BINOP(Complex)
   _FORWARD_UNOP(Not)
+  _FORWARD_UNOP(Clz)
   _FORWARD_UNOP(Abs)
   _FORWARD_UNOP(Exp)
   _FORWARD_UNOP(Expm1)
@@ -483,10 +462,8 @@ class LocalComputationBuilder {
 
 // Functions for freeing resources from the Python side.
 void DeleteLocalShapedBuffer(LocalShapedBuffer* local_shaped_buffer);
-void DeleteXrtAllocation(XrtAllocation* allocation);
-void DeleteCompiledLocalComputation(CompiledLocalComputation* computation);
-void DeleteCompiledXrtComputation(CompiledXrtComputation* computation);
-void DeleteLocalComputation(LocalComputation* computation);
+void DeleteLocalExecutable(LocalExecutable* computation);
+void DeleteComputation(Computation* computation);
 
 }  // namespace swig
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.i b/tensorflow/compiler/xla/python/local_computation_builder.i
index d23d693c1e5bde43b52959e4397aa311268411bb..7d7a860baa03e99cc254b7596fb5f9d41acbef20 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.i
+++ b/tensorflow/compiler/xla/python/local_computation_builder.i
@@ -23,17 +23,22 @@ limitations under the License.
 //    C++                                  Python
 // -------------------------------------+---------------------------------------
 //  Span<int64>                        <-  sequence of int
+//  vector<int>                        ->  sequence of int
 //  Span<LocalOp>                      <-  sequence of LocalOp
 //  Literal                            <-> (nested tuple of) numpy ndarray
 //  std::vector<Literal>               <-  sequence of (nested tuple of) ndarray
 //  Shape                               -> pair holding (dtype, dimensions)
 //                                     <-  object duck-typed as xla_client.Shape
+//  ProgramShape                       ->  pair of ([arg_shapes], ret_shape)
 //  std::vector<Shape>                 <-  sequence of xla_client.Shape objects
 //  PrimitiveType                      <-  int
 //  Span<pair<int64, in64>>            <-  sequence of int pairs
 //  PaddingConfig proto                <-  corresponding Python proto
 //  ConvolutionDimensionNumbers proto  <-  corresponding Python proto
 //  DotDimensionNumbers proto          <-  corresponding Python proto
+//  GatherDimensionNumbers proto       <-  corresponding Python proto
+//  ScatterDimensionNumbers proto      <-  corresponding Python proto
+//  Span<ReplicaGroup proto>           <-  sequence of ReplicaGroup Python proto
 //
 // Arrows indicate whether a conversion only ever occurs in one
 // direction, or whether it is maintained bidirectionally.
@@ -94,7 +99,7 @@ limitations under the License.
 // wrapped in a Python class (xla_client.Shape) so as not to expose
 // the raw pair externally.
 //
-// Other SWIG object wrappers (e.g. of LocalComputation) are further
+// Other SWIG object wrappers (e.g. of Computation) are further
 // wrapped by xla_client in order to set up a custom destructor that
 // triggers memory deallocation on the C++ side.
 
@@ -104,6 +109,7 @@ limitations under the License.
 %nothread;
 
 %include "tensorflow/python/platform/base.i"
+%include "tensorflow/compiler/xla/python/xla_data.i"
 
 %{
 // Must be included first
@@ -121,54 +127,6 @@ limitations under the License.
 using namespace xla;
 using namespace xla::swig;
 
-namespace xla {
-
-namespace swig {
-
-bool GetIntAttr(PyObject* o, const char* field, int64* result) {
-  PyObject* fo = PyObject_GetAttrString(o, field);
-  if (!fo) {
-    return false;
-  }
-  const int64 value = numpy::PyIntOrPyLongToLong(fo);
-  if (value == -1 && PyErr_Occurred()) {
-    Py_DECREF(fo);
-    return false;
-  }
-  Py_DECREF(fo);
-  *result = value;
-  return true;
-}
-
-// Returns "ok"; true if there is no error, false if there was an error.
-bool HandleStringAttribute(PyObject* o,
-                           const char* attr_name,
-                           std::function<void(string s)> f) {
-  if (!PyObject_HasAttrString(o, attr_name)) {
-    return true;  // It's ok for the object to not have the attribute.
-  }
-  PyObject* attr = PyObject_GetAttrString(o, attr_name);
-  if (attr == nullptr) {
-    return false;  // An error occurred getting the attribute.
-  }
-  if (attr == Py_None) {
-    Py_DECREF(attr);
-    return true;  // The attribute is None, which we consider ok.
-  }
-  if (!PyString_Check(attr)) {
-    string message = absl::StrFormat("%s must be a string or none; got %s",
-        attr_name, numpy::PyObjectCppRepr(attr));
-    PyErr_SetString(PyExc_TypeError, message.c_str());
-    Py_DECREF(attr);
-    return false;  // Type error, not ok.
-  }
-  f(PyString_AsString(attr));
-  Py_DECREF(attr);
-  return true;  // Handled string attribute, ok!
-}
-
-}
-}
 %}
 
 // Required to use PyArray_* functions.
@@ -176,57 +134,6 @@ bool HandleStringAttribute(PyObject* o,
 tensorflow::ImportNumpy();
 %}
 
-// Basic types
-
-%typemap(out) StatusOr<bool> {
-  if ($1.ok()) {
-    $result = PyBool_FromLong($1.ConsumeValueOrDie());
-  } else {
-    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
-    SWIG_fail;
-  }
-}
-
-%typemap(out) Status {
-  if (!$1.ok()) {
-    PyErr_SetString(
-        PyExc_RuntimeError, $1.ToString().c_str());
-    SWIG_fail;
-  }
-  Py_INCREF(Py_None);
-  $result = Py_None;
-}
-
-%typemap(in) absl::Span<const int64>
-    (std::vector<int64> temps) {
-  if (!PySequence_Check($input)) {
-    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    SWIG_fail;
-  }
-  const int size = PySequence_Size($input);
-  temps.resize(size);
-  for (int i = 0; i < size; ++i) {
-    PyObject* o = PySequence_GetItem($input, i);
-    PyObject* py_int = numpy::PyNumberToPyInt(o);
-    if (!py_int) {
-      PyErr_SetString(
-          PyExc_TypeError,
-          "Argument sequence element cannot be converted to int");
-      Py_DECREF(o);
-      SWIG_fail;
-    }
-    temps[i] = numpy::PyIntOrPyLongToLong(py_int);
-    if (temps[i] == -1 && PyErr_Occurred()) {
-      Py_DECREF(py_int);
-      Py_DECREF(o);
-      SWIG_fail;
-    }
-    Py_DECREF(py_int);
-    Py_DECREF(o);
-  }
-  $1 = temps;
-}
-
 // Computation builder types
 
 %typemap(in) absl::Span<const xla::swig::LocalOp>(
@@ -251,12 +158,12 @@ tensorflow::ImportNumpy();
 
 // Computation and buffer/allocation types
 
-%typemap(out) StatusOr<xla::swig::CompiledLocalComputation*> {
+%typemap(out) StatusOr<xla::swig::LocalClient> {
   if ($1.ok()) {
-    auto* value = $1.ValueOrDie();
+    xla::swig::LocalClient value = $1.ValueOrDie();
     {
-      auto* $1 = value;
-      $typemap(out, xla::swig::CompiledLocalComputation*)
+      auto $1 = value;
+      $typemap(out, xla::swig::LocalClient)
     }
   } else {
     PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
@@ -264,12 +171,12 @@ tensorflow::ImportNumpy();
   }
 }
 
-%typemap(out) StatusOr<xla::swig::CompiledXrtComputation*> {
+%typemap(out) StatusOr<xla::swig::LocalExecutable*> {
   if ($1.ok()) {
     auto* value = $1.ValueOrDie();
     {
       auto* $1 = value;
-      $typemap(out, xla::swig::CompiledXrtComputation*)
+      $typemap(out, xla::swig::LocalExecutable*)
     }
   } else {
     PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
@@ -303,38 +210,12 @@ tensorflow::ImportNumpy();
   }
 }
 
-%typemap(out) StatusOr<xla::swig::XrtAllocation*> {
-  if ($1.ok()) {
-    auto* value = $1.ValueOrDie();
-    {
-      auto* $1 = value;
-      $typemap(out, xla::swig::XrtAllocation*)
-    }
-  } else {
-    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
-    SWIG_fail;
-  }
-}
-
-%typemap(out) StatusOr<xla::swig::XrtAllocationTuple*> {
-  if ($1.ok()) {
-    auto* value = $1.ValueOrDie();
-    {
-      auto* $1 = value;
-      $typemap(out, xla::swig::XrtAllocationTuple*)
-    }
-  } else {
-    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
-    SWIG_fail;
-  }
-}
-
-%typemap(out) StatusOr<xla::swig::LocalComputation*> {
+%typemap(out) StatusOr<xla::swig::Computation*> {
   if ($1.ok()) {
     auto* value = $1.ValueOrDie();
     {
       auto* $1 = value;
-      $typemap(out, xla::swig::LocalComputation*)
+      $typemap(out, xla::swig::Computation*)
     }
   } else {
     PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
@@ -394,556 +275,6 @@ tensorflow::ImportNumpy();
   $1 = temps;
 }
 
-%typemap(in) absl::Span<xla::swig::XrtAllocation* const>
-    (std::vector<XrtAllocation*> temps) {
-  if (!PySequence_Check($input)) {
-    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    SWIG_fail;
-  }
-  const int size = PySequence_Size($input);
-  temps.reserve(size);
-  for (int i = 0; i < size; ++i) {
-    PyObject* o = PySequence_GetItem($input, i);
-    XrtAllocation* xrta;
-    if ((SWIG_ConvertPtr(o, (void**) &xrta, $descriptor(xla::swig::XrtAllocation*),
-                         SWIG_POINTER_EXCEPTION)) == -1) {
-      SWIG_fail;
-    }
-    temps.push_back(xrta);
-    Py_DECREF(o);
-  }
-  $1 = temps;
-}
-
-// Literal
-
-%typemap(out) StatusOr<Literal> {
-  if ($1.ok()) {
-    Literal value = $1.ConsumeValueOrDie();
-    $result = numpy::PyObjectFromXlaLiteral(*value);
-  } else {
-    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
-    SWIG_fail;
-  }
-}
-
-%typemap(in) const Literal& (StatusOr<Literal> literal_status) {
-  literal_status = numpy::XlaLiteralFromPyObject($input);
-  if (!literal_status.ok()) {
-    PyErr_SetString(PyExc_RuntimeError, literal_status.status().ToString().c_str());
-    SWIG_fail;
-  }
-  $1 = &literal_status.ValueOrDie();
-}
-
-%typemap(out) Literal {
-  $result = numpy::PyObjectFromXlaLiteral(*$1);
-}
-
-%typemap(out) StatusOr<Literal> {
-  if (!$1.ok()) {
-    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
-    SWIG_fail;
-  }
-  $result = numpy::PyObjectFromXlaLiteral($1.ValueOrDie());
-}
-
-%typemap(in) const std::vector<Literal>& (std::vector<Literal> temps) {
-  if (!PySequence_Check($input)) {
-    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    SWIG_fail;
-  }
-  const int size = PySequence_Size($input);
-  for (int i = 0; i < size; ++i) {
-    PyObject* o = PySequence_GetItem($input, i);
-    StatusOr<Literal> literal_status = numpy::XlaLiteralFromPyObject(o);
-    if (!literal_status.ok()) {
-      PyErr_SetString(PyExc_RuntimeError, literal_status.status().ToString().c_str());
-      Py_DECREF(o);
-      SWIG_fail;
-    }
-    temps.push_back(literal_status.ConsumeValueOrDie());
-    Py_DECREF(o);
-  }
-  $1 = &temps;
-}
-
-// OpMetadata
-
-%typemap(in) const OpMetadata& (OpMetadata temp) {
-  StatusOr<OpMetadata> statusor = numpy::OpMetadataFromPyObject($input);
-  if (!statusor.ok()) {
-    PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
-    SWIG_fail;
-  }
-  temp = std::move(statusor).ValueOrDie();
-  $1 = &temp;
-}
-
-// Shape
-
-%typemap(out) const Shape& {
-  $result = numpy::PyShapeInfoFromXlaShape(*$1);
-}
-
-%typemap(out) StatusOr<Shape> {
-  if ($1.ok()) {
-    $result = numpy::PyShapeInfoFromXlaShape($1.ConsumeValueOrDie());
-  } else {
-    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
-    SWIG_fail;
-  }
-}
-
-%typemap(in) const Shape& (Shape temp) {
-  StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape($input);
-  if (!statusor.ok()) {
-    PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
-    SWIG_fail;
-  }
-  temp = std::move(statusor).ValueOrDie();
-  $1 = &temp;
-}
-
-%typemap(in) const absl::optional<Shape>& (
-    absl::optional<Shape> temp) {
-  if ($input == Py_None) {
-    temp = absl::nullopt;
-    $1 = &temp;
-  } else {
-    StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape($input);
-    if (!statusor.ok()) {
-      PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
-      SWIG_fail;
-    }
-    temp = std::move(statusor).ValueOrDie();
-    $1 = &temp;
-  }
-}
-
-%typemap(out) std::unique_ptr<Shape> {
-  $result = numpy::PyShapeInfoFromXlaShape(*$1);
-}
-
-%typemap(in) const std::vector<Shape>& (std::vector<Shape> temps) {
-  if (!PySequence_Check($input)) {
-    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    SWIG_fail;
-  }
-  const int size = PySequence_Size($input);
-  for (int i = 0; i < size; ++i) {
-    PyObject* o = PySequence_GetItem($input, i);
-    StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape(o);
-    Py_DECREF(o);
-    if (!statusor.ok()) {
-      PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
-      SWIG_fail;
-    }
-    temps.push_back(statusor.ConsumeValueOrDie());
-  }
-  $1 = &temps;
-}
-
-%typemap(in) const std::vector<absl::optional<Shape> >& (
-    std::vector<absl::optional<Shape> > temps) {
-  if (!PySequence_Check($input)) {
-    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    SWIG_fail;
-  }
-  const int size = PySequence_Size($input);
-  for (int i = 0; i < size; ++i) {
-    PyObject* o = PySequence_GetItem($input, i);
-    if (o == Py_None) {
-      temps.push_back(absl::nullopt);
-    } else {
-      StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape(o);
-      Py_DECREF(o);
-      if (!statusor.ok()) {
-        PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
-        SWIG_fail;
-      }
-      temps.push_back(statusor.ConsumeValueOrDie());
-    }
-  }
-  $1 = &temps;
-}
-
-// PrimitiveType
-
-%typemap(in) PrimitiveType {
-  PyObject* py_int = numpy::PyNumberToPyInt($input);
-  if (!py_int) {
-    PyErr_SetString(PyExc_TypeError, "Argument cannot be converted to int");
-    SWIG_fail;
-  }
-  const long value = numpy::PyIntOrPyLongToLong(py_int);
-  if (value == -1 && PyErr_Occurred()) {
-    Py_DECREF(py_int);
-    SWIG_fail;
-  }
-  if (!PrimitiveType_IsValid(value)) {
-    PyErr_SetString(
-        PyExc_TypeError, "Argument not valid for PrimitiveType enum");
-    Py_DECREF(py_int);
-    SWIG_fail;
-  }
-  $1 = static_cast<PrimitiveType>(value);
-}
-
-// Span<pair<int64, in64>>
-
-%typemap(in) absl::Span<const std::pair<int64, int64> >
-    (std::vector<std::pair<int64, int64> > temps) {
-  if (!PySequence_Check($input)) {
-    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    SWIG_fail;
-  }
-  const int size = PySequence_Size($input);
-  temps.reserve(size);
-  for (int i = 0; i < size; ++i) {
-    PyObject* o = PySequence_GetItem($input, i);
-    if (!o) {
-      SWIG_fail;
-    }
-    PyObject* first = PyTuple_GetItem(o, 0);
-    if (!first) {
-      Py_DECREF(o);
-      SWIG_fail;
-    }
-    PyObject* first_pyint = numpy::PyNumberToPyInt(first);
-    if (!first_pyint) {
-      PyErr_SetString(
-          PyExc_TypeError,
-          "First pair item cannot be converted to int");
-      Py_DECREF(o);
-      SWIG_fail;
-    }
-    PyObject* second = PyTuple_GetItem(o, 1);
-    if (!second) {
-      Py_DECREF(o);
-      Py_DECREF(first_pyint);
-      SWIG_fail;
-    }
-    PyObject* second_pyint = numpy::PyNumberToPyInt(second);
-    if (!second_pyint) {
-      PyErr_SetString(
-          PyExc_TypeError,
-          "Second pair item cannot be converted to int");
-      Py_DECREF(o);
-      Py_DECREF(first_pyint);
-      SWIG_fail;
-    }
-    const int64 first_value = numpy::PyIntOrPyLongToLong(first_pyint);
-    if (first_value == -1 && PyErr_Occurred()) {
-      Py_DECREF(o);
-      Py_DECREF(first_pyint);
-      Py_DECREF(second_pyint);
-      SWIG_fail;
-    }
-    const int64 second_value = numpy::PyIntOrPyLongToLong(second_pyint);
-    if (second_value == -1 && PyErr_Occurred()) {
-      Py_DECREF(o);
-      Py_DECREF(first_pyint);
-      Py_DECREF(second_pyint);
-      SWIG_fail;
-    }
-    temps.push_back(std::make_pair(first_value, second_value));
-    Py_DECREF(o);
-  }
-  $1 = temps;
-}
-
-// DotDimensionNumbers
-
-%typemap(in) const DotDimensionNumbers&
-    (DotDimensionNumbers dimension_numbers) {
-  int length;
-
-  /* lhs_contracting_dimensions */
-  PyObject* lhs_contracting_dimensions = PyObject_GetAttrString(
-      $input, "lhs_contracting_dimensions");
-  if (!lhs_contracting_dimensions) {
-    SWIG_fail;
-  }
-
-  length = PySequence_Size(lhs_contracting_dimensions);
-  if (length == -1) {
-    Py_DECREF(lhs_contracting_dimensions);
-    SWIG_fail;
-  }
-
-  for (int i = 0; i < length; ++i) {
-    PyObject* item = PySequence_GetItem(lhs_contracting_dimensions, i);
-    if (!item) {
-      Py_DECREF(lhs_contracting_dimensions);
-      SWIG_fail;
-    }
-    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
-    if (dimension == -1 && PyErr_Occurred()) {
-      Py_DECREF(item);
-      Py_DECREF(lhs_contracting_dimensions);
-      SWIG_fail;
-    }
-    dimension_numbers.add_lhs_contracting_dimensions(dimension);
-    Py_DECREF(item);
-  }
-  Py_DECREF(lhs_contracting_dimensions);
-
-  /* rhs_contracting_dimensions */
-  PyObject* rhs_contracting_dimensions = PyObject_GetAttrString(
-      $input, "rhs_contracting_dimensions");
-  if (!lhs_contracting_dimensions) {
-    SWIG_fail;
-  }
-
-  length = PySequence_Size(rhs_contracting_dimensions);
-  if (length == -1) {
-    Py_DECREF(rhs_contracting_dimensions);
-    SWIG_fail;
-  }
-
-  for (int i = 0; i < length; ++i) {
-    PyObject* item = PySequence_GetItem(rhs_contracting_dimensions, i);
-    if (!item) {
-      Py_DECREF(rhs_contracting_dimensions);
-      SWIG_fail;
-    }
-    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
-    if (dimension == -1 && PyErr_Occurred()) {
-      Py_DECREF(item);
-      Py_DECREF(rhs_contracting_dimensions);
-      SWIG_fail;
-    }
-    dimension_numbers.add_rhs_contracting_dimensions(dimension);
-    Py_DECREF(item);
-  }
-  Py_DECREF(rhs_contracting_dimensions);
-
-  /* lhs_batch_dimensions */
-  PyObject* lhs_batch_dimensions = PyObject_GetAttrString(
-      $input, "lhs_batch_dimensions");
-  if (!lhs_batch_dimensions) {
-    SWIG_fail;
-  }
-
-  length = PySequence_Size(lhs_batch_dimensions);
-  if (length == -1) {
-    Py_DECREF(lhs_batch_dimensions);
-    SWIG_fail;
-  }
-
-  for (int i = 0; i < length; ++i) {
-    PyObject* item = PySequence_GetItem(lhs_batch_dimensions, i);
-    if (!item) {
-      Py_DECREF(lhs_batch_dimensions);
-      SWIG_fail;
-    }
-    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
-    if (dimension == -1 && PyErr_Occurred()) {
-      Py_DECREF(item);
-      Py_DECREF(lhs_batch_dimensions);
-      SWIG_fail;
-    }
-    dimension_numbers.add_lhs_batch_dimensions(dimension);
-    Py_DECREF(item);
-  }
-  Py_DECREF(lhs_batch_dimensions);
-
-  /* rhs_batch_dimensions */
-  PyObject* rhs_batch_dimensions = PyObject_GetAttrString(
-      $input, "rhs_batch_dimensions");
-  if (!rhs_batch_dimensions) {
-    SWIG_fail;
-  }
-
-  length = PySequence_Size(rhs_batch_dimensions);
-  if (length == -1) {
-    Py_DECREF(rhs_batch_dimensions);
-    SWIG_fail;
-  }
-
-  for (int i = 0; i < length; ++i) {
-    PyObject* item = PySequence_GetItem(rhs_batch_dimensions, i);
-    if (!item) {
-      Py_DECREF(rhs_batch_dimensions);
-      SWIG_fail;
-    }
-    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
-    if (dimension == -1 && PyErr_Occurred()) {
-      Py_DECREF(item);
-      Py_DECREF(rhs_batch_dimensions);
-      SWIG_fail;
-    }
-    dimension_numbers.add_rhs_batch_dimensions(dimension);
-    Py_DECREF(item);
-  }
-  Py_DECREF(rhs_batch_dimensions);
-
-  $1 = &dimension_numbers;
-}
-
-// PaddingConfig
-
-%typemap(in) const PaddingConfig&
-    (PaddingConfig padding_config) {
-  PyObject* dimensions = PyObject_GetAttrString($input, "dimensions");
-  if (!dimensions) {
-    SWIG_fail;
-  }
-
-  int length = PySequence_Size(dimensions);
-  if (length == -1) {
-    Py_DECREF(dimensions);
-    SWIG_fail;
-  }
-
-  for (int i = 0; i < length; ++i) {
-    PyObject* item = PySequence_GetItem(dimensions, i);
-    if (!item) {
-      Py_DECREF(dimensions);
-      SWIG_fail;
-    }
-    int64 edge_padding_low, edge_padding_high, interior_padding;
-    if (!GetIntAttr(item, "edge_padding_low", &edge_padding_low)
-        || !GetIntAttr(item, "edge_padding_high", &edge_padding_high)
-        || !GetIntAttr(item, "interior_padding", &interior_padding)) {
-      Py_DECREF(item);
-      Py_DECREF(dimensions);
-      SWIG_fail;
-    }
-    Py_DECREF(item);
-
-    PaddingConfig::PaddingConfigDimension* dimension =
-        padding_config.add_dimensions();
-    dimension->set_edge_padding_low(edge_padding_low);
-    dimension->set_edge_padding_high(edge_padding_high);
-    dimension->set_interior_padding(interior_padding);
-  }
-  Py_DECREF(dimensions);
-
-  $1 = &padding_config;
-}
-
-// ConvolutionDimensionNumbers
-
-%typemap(in) const ConvolutionDimensionNumbers&
-    (ConvolutionDimensionNumbers dimension_numbers) {
-  int64 value;
-
-  if (!GetIntAttr($input, "input_batch_dimension", &value)) {
-    SWIG_fail;
-  }
-  dimension_numbers.set_input_batch_dimension(value);
-
-  if (!GetIntAttr($input, "input_feature_dimension", &value)) {
-    SWIG_fail;
-  }
-  dimension_numbers.set_input_feature_dimension(value);
-
-  if (!GetIntAttr($input, "output_batch_dimension", &value)) {
-    SWIG_fail;
-  }
-  dimension_numbers.set_output_batch_dimension(value);
-
-  if (!GetIntAttr($input, "output_feature_dimension", &value)) {
-    SWIG_fail;
-  }
-  dimension_numbers.set_output_feature_dimension(value);
-
-  if (!GetIntAttr($input, "kernel_output_feature_dimension", &value)) {
-    SWIG_fail;
-  }
-  dimension_numbers.set_kernel_output_feature_dimension(value);
-
-  if (!GetIntAttr($input, "kernel_input_feature_dimension", &value)) {
-    SWIG_fail;
-  }
-  dimension_numbers.set_kernel_input_feature_dimension(value);
-
-  PyObject* o;
-  int length;
-
-  o = PyObject_GetAttrString($input, "input_spatial_dimensions");
-  if (!o) {
-    SWIG_fail;
-  }
-  length = PySequence_Size(o);
-  if (length == -1) {
-    Py_DECREF(o);
-    SWIG_fail;
-  }
-  for (int i = 0; i < length; ++i) {
-    PyObject* item = PySequence_GetItem(o, i);
-    if (!item) {
-      Py_DECREF(o);
-      SWIG_fail;
-    }
-    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
-    if (dimension == -1 && PyErr_Occurred()) {
-      Py_DECREF(item);
-      Py_DECREF(o);
-      SWIG_fail;
-    }
-    dimension_numbers.add_input_spatial_dimensions(dimension);
-    Py_DECREF(item);
-  }
-  Py_DECREF(o);
-
-  o = PyObject_GetAttrString($input, "kernel_spatial_dimensions");
-  if (!o) {
-    SWIG_fail;
-  }
-  length = PySequence_Size(o);
-  if (length == -1) {
-    Py_DECREF(o);
-    SWIG_fail;
-  }
-  for (int i = 0; i < length; ++i) {
-    PyObject* item = PySequence_GetItem(o, i);
-    if (!item) {
-      Py_DECREF(o);
-      SWIG_fail;
-    }
-    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
-    if (dimension == -1 && PyErr_Occurred()) {
-      Py_DECREF(item);
-      Py_DECREF(o);
-      SWIG_fail;
-    }
-    dimension_numbers.add_kernel_spatial_dimensions(dimension);
-    Py_DECREF(item);
-  }
-  Py_DECREF(o);
-
-  o = PyObject_GetAttrString($input, "output_spatial_dimensions");
-  if (!o) {
-    SWIG_fail;
-  }
-  length = PySequence_Size(o);
-  if (length == -1) {
-    Py_DECREF(o);
-    SWIG_fail;
-  }
-  for (int i = 0; i < length; ++i) {
-    PyObject* item = PySequence_GetItem(o, i);
-    if (!item) {
-      Py_DECREF(o);
-      SWIG_fail;
-    }
-    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
-    if (dimension == -1 && PyErr_Occurred()) {
-      Py_DECREF(item);
-      Py_DECREF(o);
-      SWIG_fail;
-    }
-    dimension_numbers.add_output_spatial_dimensions(dimension);
-    Py_DECREF(item);
-  }
-  Py_DECREF(o);
-
-  $1 = &dimension_numbers;
-}
-
 // ExecutableBuildOptions
 
 %typemap(in) const ExecutableBuildOptions*
@@ -1000,6 +331,12 @@ tensorflow::ImportNumpy();
     }
     Py_DECREF(o);
 
+    int64 num_replicas;
+    if (!GetIntAttr($input, "num_replicas", &num_replicas)) {
+      SWIG_fail;
+    }
+    build_options.set_num_replicas(num_replicas);
+
     $1 = &build_options;
   }
 }
@@ -1007,150 +344,151 @@ tensorflow::ImportNumpy();
 %ignoreall
 %unignore xla;
 %unignore xla::swig;
-%unignore xla::swig::InitializeReplicaCount;
-%unignore xla::swig::InitializePlatformName;
-%unignore xla::swig::GetReplicaCount;
-%unignore xla::swig::TransferToInfeedLocal;
-%unignore xla::swig::TransferToInfeedLocalReplica;
-%unignore xla::swig::TransferFromOutfeedLocalReplica;
+%unignore xla::swig::RegisterCpuCustomCallTarget;
+%unignore xla::swig::LocalClient;
+%unignore xla::swig::LocalClient::Get;
+%unignore xla::swig::LocalClient::DeviceCount;
+%unignore xla::swig::LocalClient::TransferToInfeed;
+%unignore xla::swig::LocalClient::TransferFromOutfeed;
 %unignore xla::swig::LocalShapedBuffer;
 %unignore xla::swig::LocalShapedBuffer::FromLiteral;
 %unignore xla::swig::LocalShapedBuffer::ToLiteral;
 %unignore xla::swig::LocalShapedBuffer::shape;
+%unignore xla::swig::LocalShapedBuffer::DestructureTuple;
 %unignore xla::swig::LocalShapedBufferTuple;
 %unignore xla::swig::LocalShapedBufferTuple::Release;
 %unignore xla::swig::LocalShapedBufferTuple::size;
-%unignore xla::swig::XrtAllocation;
-%unignore xla::swig::XrtAllocation::FromLiteral;
-%unignore xla::swig::XrtAllocation::ToLiteral;
-%unignore xla::swig::XrtAllocation::shape;
-%unignore xla::swig::XrtAllocationTuple;
-%unignore xla::swig::XrtAllocationTuple::Release;
-%unignore xla::swig::XrtAllocationTuple::size;
-%unignore xla::swig::CompiledLocalComputation;
-%unignore xla::swig::CompiledLocalComputation::Execute;
-%unignore xla::swig::CompiledLocalComputation::ExecutePerReplica;
-%unignore xla::swig::CompiledXrtComputation;
-%unignore xla::swig::CompiledXrtComputation::Execute;
-%unignore xla::swig::LocalComputation;
-%unignore xla::swig::LocalComputation::Compile;
-%unignore xla::swig::LocalComputation::CompileForXrt;
-%unignore xla::swig::LocalComputation::GetReturnValueShape;
-%unignore xla::swig::LocalComputation::GetSerializedProto;
+%unignore xla::swig::LocalExecutable;
+%unignore xla::swig::LocalExecutable::DeviceOrdinals;
+%unignore xla::swig::LocalExecutable::Execute;
+%unignore xla::swig::LocalExecutable::ExecutePerReplica;
+%unignore xla::swig::Computation;
+%unignore xla::swig::Computation::Compile;
+%unignore xla::swig::Computation::GetProgramShape;
+%unignore xla::swig::Computation::GetReturnValueShape;
+%unignore xla::swig::Computation::GetSerializedProto;
+%unignore xla::swig::Computation::GetHloText;
+%unignore xla::swig::Computation::GetHloDotGraph;
 %unignore xla::swig::LocalOp;
-%unignore xla::swig::LocalComputationBuilder;
-%unignore xla::swig::LocalComputationBuilder::LocalComputationBuilder;
-%unignore xla::swig::LocalComputationBuilder::Build;
-%unignore xla::swig::LocalComputationBuilder::BuildWithRoot;
-%unignore xla::swig::LocalComputationBuilder::SetOpMetadata;
-%unignore xla::swig::LocalComputationBuilder::ClearOpMetadata;
-%unignore xla::swig::LocalComputationBuilder::Parameter;
-%unignore xla::swig::LocalComputationBuilder::GetShape;
-%unignore xla::swig::LocalComputationBuilder::GetReturnValueShape;
-%unignore xla::swig::LocalComputationBuilder::Infeed;
-%unignore xla::swig::LocalComputationBuilder::Outfeed;
-%unignore xla::swig::LocalComputationBuilder::ConstantLiteral;
-%unignore xla::swig::LocalComputationBuilder::ConstantR0;
-%unignore xla::swig::LocalComputationBuilder::Broadcast;
-%unignore xla::swig::LocalComputationBuilder::BroadcastInDim;
-%unignore xla::swig::LocalComputationBuilder::Pad;
-%unignore xla::swig::LocalComputationBuilder::Reshape;
-%unignore xla::swig::LocalComputationBuilder::Collapse;
-%unignore xla::swig::LocalComputationBuilder::CrossReplicaSum;
-%unignore xla::swig::LocalComputationBuilder::Slice;
-%unignore xla::swig::LocalComputationBuilder::SliceInDim;
-%unignore xla::swig::LocalComputationBuilder::DynamicSlice;
-%unignore xla::swig::LocalComputationBuilder::DynamicUpdateSlice;
-%unignore xla::swig::LocalComputationBuilder::ConcatInDim;
-%unignore xla::swig::LocalComputationBuilder::SelectAndScatterWithGeneralPadding;
-%unignore xla::swig::LocalComputationBuilder::Select;
-%unignore xla::swig::LocalComputationBuilder::Tuple;
-%unignore xla::swig::LocalComputationBuilder::GetTupleElement;
-%unignore xla::swig::LocalComputationBuilder::ConvertElementType;
-%unignore xla::swig::LocalComputationBuilder::BitcastConvertType;
-%unignore xla::swig::LocalComputationBuilder::Call;
-%unignore xla::swig::LocalComputationBuilder::Transpose;
-%unignore xla::swig::LocalComputationBuilder::Rev;
-%unignore xla::swig::LocalComputationBuilder::Clamp;
-%unignore xla::swig::LocalComputationBuilder::Map;
-%unignore xla::swig::LocalComputationBuilder::Reduce;
-%unignore xla::swig::LocalComputationBuilder::ReduceWindowWithGeneralPadding;
-%unignore xla::swig::LocalComputationBuilder::RngNormal;
-%unignore xla::swig::LocalComputationBuilder::RngUniform;
-%unignore xla::swig::LocalComputationBuilder::RngBernoulli;
-%unignore xla::swig::LocalComputationBuilder::While;
-%unignore xla::swig::LocalComputationBuilder::Conditional;
-%unignore xla::swig::LocalComputationBuilder::IsConstant;
-%unignore xla::swig::LocalComputationBuilder::Eq;
-%unignore xla::swig::LocalComputationBuilder::Ne;
-%unignore xla::swig::LocalComputationBuilder::Ge;
-%unignore xla::swig::LocalComputationBuilder::Gt;
-%unignore xla::swig::LocalComputationBuilder::Lt;
-%unignore xla::swig::LocalComputationBuilder::Le;
-%unignore xla::swig::LocalComputationBuilder::Dot;
-%unignore xla::swig::LocalComputationBuilder::DotGeneral;
-%unignore xla::swig::LocalComputationBuilder::ConvGeneralDilated;
-%unignore xla::swig::LocalComputationBuilder::Add;
-%unignore xla::swig::LocalComputationBuilder::Sub;
-%unignore xla::swig::LocalComputationBuilder::Mul;
-%unignore xla::swig::LocalComputationBuilder::Div;
-%unignore xla::swig::LocalComputationBuilder::Rem;
-%unignore xla::swig::LocalComputationBuilder::Max;
-%unignore xla::swig::LocalComputationBuilder::Min;
-%unignore xla::swig::LocalComputationBuilder::And;
-%unignore xla::swig::LocalComputationBuilder::Or;
-%unignore xla::swig::LocalComputationBuilder::Xor;
-%unignore xla::swig::LocalComputationBuilder::ShiftLeft;
-%unignore xla::swig::LocalComputationBuilder::ShiftRightArithmetic;
-%unignore xla::swig::LocalComputationBuilder::ShiftRightLogical;
-%unignore xla::swig::LocalComputationBuilder::Not;
-%unignore xla::swig::LocalComputationBuilder::Abs;
-%unignore xla::swig::LocalComputationBuilder::Exp;
-%unignore xla::swig::LocalComputationBuilder::Expm1;
-%unignore xla::swig::LocalComputationBuilder::Floor;
-%unignore xla::swig::LocalComputationBuilder::Ceil;
-%unignore xla::swig::LocalComputationBuilder::Round;
-%unignore xla::swig::LocalComputationBuilder::Log;
-%unignore xla::swig::LocalComputationBuilder::Log1p;
-%unignore xla::swig::LocalComputationBuilder::Sign;
-%unignore xla::swig::LocalComputationBuilder::Cos;
-%unignore xla::swig::LocalComputationBuilder::Sin;
-%unignore xla::swig::LocalComputationBuilder::Tanh;
-%unignore xla::swig::LocalComputationBuilder::Atan2;
-%unignore xla::swig::LocalComputationBuilder::IsFinite;
-%unignore xla::swig::LocalComputationBuilder::Pow;
-%unignore xla::swig::LocalComputationBuilder::Neg;
-%unignore xla::swig::LocalComputationBuilder::Sort;
-%unignore xla::swig::LocalComputationBuilder::SortKeyVal;
-%unignore xla::swig::LocalComputationBuilder::Sqrt;
-%unignore xla::swig::LocalComputationBuilder::Rsqrt;
-%unignore xla::swig::LocalComputationBuilder::Square;
-%unignore xla::swig::LocalComputationBuilder::Reciprocal;
-%unignore xla::swig::LocalComputationBuilder::Erfc;
-%unignore xla::swig::LocalComputationBuilder::Erf;
-%unignore xla::swig::LocalComputationBuilder::ErfInv;
-%unignore xla::swig::LocalComputationBuilder::Lgamma;
-%unignore xla::swig::LocalComputationBuilder::Digamma;
-%unignore xla::swig::LocalComputationBuilder::Acos;
-%unignore xla::swig::LocalComputationBuilder::Asin;
-%unignore xla::swig::LocalComputationBuilder::Atan;
-%unignore xla::swig::LocalComputationBuilder::Tan;
-%unignore xla::swig::LocalComputationBuilder::Acosh;
-%unignore xla::swig::LocalComputationBuilder::Asinh;
-%unignore xla::swig::LocalComputationBuilder::Atanh;
-%unignore xla::swig::LocalComputationBuilder::Cosh;
-%unignore xla::swig::LocalComputationBuilder::Sinh;
-%unignore xla::swig::LocalComputationBuilder::Real;
-%unignore xla::swig::LocalComputationBuilder::Imag;
-%unignore xla::swig::LocalComputationBuilder::Conj;
-%unignore xla::swig::LocalComputationBuilder::Complex;
-%unignore xla::swig::DeleteLocalComputation;
-%unignore xla::swig::DestructureLocalShapedBufferTuple;
-%unignore xla::swig::DestructureXrtAllocationTuple;
+%unignore xla::swig::ComputationBuilder;
+%unignore xla::swig::ComputationBuilder::ComputationBuilder;
+%unignore xla::swig::ComputationBuilder::Build;
+%unignore xla::swig::ComputationBuilder::BuildWithRoot;
+%unignore xla::swig::ComputationBuilder::SetOpMetadata;
+%unignore xla::swig::ComputationBuilder::ClearOpMetadata;
+%unignore xla::swig::ComputationBuilder::Parameter;
+%unignore xla::swig::ComputationBuilder::GetShape;
+%unignore xla::swig::ComputationBuilder::GetReturnValueShape;
+%unignore xla::swig::ComputationBuilder::Infeed;
+%unignore xla::swig::ComputationBuilder::Outfeed;
+%unignore xla::swig::ComputationBuilder::ConstantLiteral;
+%unignore xla::swig::ComputationBuilder::ConstantR0;
+%unignore xla::swig::ComputationBuilder::Iota;
+%unignore xla::swig::ComputationBuilder::BroadcastedIota;
+%unignore xla::swig::ComputationBuilder::Broadcast;
+%unignore xla::swig::ComputationBuilder::BroadcastInDim;
+%unignore xla::swig::ComputationBuilder::Pad;
+%unignore xla::swig::ComputationBuilder::Reshape;
+%unignore xla::swig::ComputationBuilder::Collapse;
+%unignore xla::swig::ComputationBuilder::AllToAll;
+%unignore xla::swig::ComputationBuilder::CrossReplicaSum;
+%unignore xla::swig::ComputationBuilder::Slice;
+%unignore xla::swig::ComputationBuilder::SliceInDim;
+%unignore xla::swig::ComputationBuilder::DynamicSlice;
+%unignore xla::swig::ComputationBuilder::DynamicUpdateSlice;
+%unignore xla::swig::ComputationBuilder::ConcatInDim;
+%unignore xla::swig::ComputationBuilder::SelectAndScatterWithGeneralPadding;
+%unignore xla::swig::ComputationBuilder::Select;
+%unignore xla::swig::ComputationBuilder::Tuple;
+%unignore xla::swig::ComputationBuilder::GetTupleElement;
+%unignore xla::swig::ComputationBuilder::ConvertElementType;
+%unignore xla::swig::ComputationBuilder::BitcastConvertType;
+%unignore xla::swig::ComputationBuilder::Call;
+%unignore xla::swig::ComputationBuilder::Transpose;
+%unignore xla::swig::ComputationBuilder::Rev;
+%unignore xla::swig::ComputationBuilder::Clamp;
+%unignore xla::swig::ComputationBuilder::Map;
+%unignore xla::swig::ComputationBuilder::Reduce;
+%unignore xla::swig::ComputationBuilder::ReduceWindowWithGeneralPadding;
+%unignore xla::swig::ComputationBuilder::RngNormal;
+%unignore xla::swig::ComputationBuilder::RngUniform;
+%unignore xla::swig::ComputationBuilder::RngBernoulli;
+%unignore xla::swig::ComputationBuilder::While;
+%unignore xla::swig::ComputationBuilder::Conditional;
+%unignore xla::swig::ComputationBuilder::IsConstant;
+%unignore xla::swig::ComputationBuilder::Eq;
+%unignore xla::swig::ComputationBuilder::Ne;
+%unignore xla::swig::ComputationBuilder::Ge;
+%unignore xla::swig::ComputationBuilder::Gt;
+%unignore xla::swig::ComputationBuilder::Lt;
+%unignore xla::swig::ComputationBuilder::Le;
+%unignore xla::swig::ComputationBuilder::Dot;
+%unignore xla::swig::ComputationBuilder::DotGeneral;
+%unignore xla::swig::ComputationBuilder::ConvGeneralDilated;
+%unignore xla::swig::ComputationBuilder::Add;
+%unignore xla::swig::ComputationBuilder::Sub;
+%unignore xla::swig::ComputationBuilder::Mul;
+%unignore xla::swig::ComputationBuilder::Div;
+%unignore xla::swig::ComputationBuilder::Rem;
+%unignore xla::swig::ComputationBuilder::Max;
+%unignore xla::swig::ComputationBuilder::Min;
+%unignore xla::swig::ComputationBuilder::And;
+%unignore xla::swig::ComputationBuilder::Or;
+%unignore xla::swig::ComputationBuilder::Xor;
+%unignore xla::swig::ComputationBuilder::ShiftLeft;
+%unignore xla::swig::ComputationBuilder::ShiftRightArithmetic;
+%unignore xla::swig::ComputationBuilder::ShiftRightLogical;
+%unignore xla::swig::ComputationBuilder::Not;
+%unignore xla::swig::ComputationBuilder::Clz;
+%unignore xla::swig::ComputationBuilder::Abs;
+%unignore xla::swig::ComputationBuilder::Exp;
+%unignore xla::swig::ComputationBuilder::Expm1;
+%unignore xla::swig::ComputationBuilder::Floor;
+%unignore xla::swig::ComputationBuilder::Ceil;
+%unignore xla::swig::ComputationBuilder::Round;
+%unignore xla::swig::ComputationBuilder::Log;
+%unignore xla::swig::ComputationBuilder::Log1p;
+%unignore xla::swig::ComputationBuilder::Sign;
+%unignore xla::swig::ComputationBuilder::Cos;
+%unignore xla::swig::ComputationBuilder::Sin;
+%unignore xla::swig::ComputationBuilder::Tanh;
+%unignore xla::swig::ComputationBuilder::Atan2;
+%unignore xla::swig::ComputationBuilder::IsFinite;
+%unignore xla::swig::ComputationBuilder::Pow;
+%unignore xla::swig::ComputationBuilder::Neg;
+%unignore xla::swig::ComputationBuilder::Sort;
+%unignore xla::swig::ComputationBuilder::SortKeyVal;
+%unignore xla::swig::ComputationBuilder::Sqrt;
+%unignore xla::swig::ComputationBuilder::Rsqrt;
+%unignore xla::swig::ComputationBuilder::Square;
+%unignore xla::swig::ComputationBuilder::Reciprocal;
+%unignore xla::swig::ComputationBuilder::Erfc;
+%unignore xla::swig::ComputationBuilder::Erf;
+%unignore xla::swig::ComputationBuilder::ErfInv;
+%unignore xla::swig::ComputationBuilder::Lgamma;
+%unignore xla::swig::ComputationBuilder::Digamma;
+%unignore xla::swig::ComputationBuilder::Acos;
+%unignore xla::swig::ComputationBuilder::Asin;
+%unignore xla::swig::ComputationBuilder::Atan;
+%unignore xla::swig::ComputationBuilder::Tan;
+%unignore xla::swig::ComputationBuilder::Acosh;
+%unignore xla::swig::ComputationBuilder::Asinh;
+%unignore xla::swig::ComputationBuilder::Atanh;
+%unignore xla::swig::ComputationBuilder::Cosh;
+%unignore xla::swig::ComputationBuilder::Sinh;
+%unignore xla::swig::ComputationBuilder::Real;
+%unignore xla::swig::ComputationBuilder::Imag;
+%unignore xla::swig::ComputationBuilder::Conj;
+%unignore xla::swig::ComputationBuilder::Complex;
+%unignore xla::swig::ComputationBuilder::Cholesky;
+%unignore xla::swig::ComputationBuilder::QR;
+%unignore xla::swig::ComputationBuilder::TriangularSolve;
+%unignore xla::swig::ComputationBuilder::CustomCall;
+%unignore xla::swig::ComputationBuilder::Gather;
+%unignore xla::swig::ComputationBuilder::Scatter;
+%unignore xla::swig::DeleteComputation;
 %unignore xla::swig::DeleteLocalShapedBuffer;
-%unignore xla::swig::DeleteXrtAllocation;
-%unignore xla::swig::DeleteCompiledLocalComputation;
-%unignore xla::swig::DeleteCompiledXrtComputation;
+%unignore xla::swig::DeleteLocalExecutable;
 
 %thread;
 %include "tensorflow/compiler/xla/python/local_computation_builder.h"
diff --git a/tensorflow/compiler/xla/python/numpy_bridge.cc b/tensorflow/compiler/xla/python/numpy_bridge.cc
index b0aa024c7474cf8e6934432b2f364be464714999..74f45b7cdcfd7d7b10a5832be37ac1fb34057743 100644
--- a/tensorflow/compiler/xla/python/numpy_bridge.cc
+++ b/tensorflow/compiler/xla/python/numpy_bridge.cc
@@ -26,6 +26,10 @@ namespace swig {
 
 namespace numpy {
 
+Safe_PyObjectPtr make_safe(PyObject* object) {
+  return Safe_PyObjectPtr(object);
+}
+
 int PrimitiveTypeToNumpyType(PrimitiveType primitive_type) {
   switch (primitive_type) {
     case PRED:
@@ -54,6 +58,8 @@ int PrimitiveTypeToNumpyType(PrimitiveType primitive_type) {
       return NPY_FLOAT64;
     case C64:
       return NPY_COMPLEX64;
+    case C128:
+      return NPY_COMPLEX128;
     case TUPLE:
       return NPY_OBJECT;
     default:
@@ -89,6 +95,8 @@ PrimitiveType NumpyTypeToPrimitiveType(int np_type) {
       return F64;
     case NPY_COMPLEX64:
       return C64;
+    case NPY_COMPLEX128:
+      return C128;
     case NPY_OBJECT:
       return TUPLE;
     default:
@@ -111,6 +119,7 @@ bool NumpyTypeIsValid(int np_type) {
     case NPY_FLOAT32:
     case NPY_FLOAT64:
     case NPY_COMPLEX64:
+    case NPY_COMPLEX128:
     case NPY_OBJECT:
       return true;
     default:
@@ -118,28 +127,42 @@ bool NumpyTypeIsValid(int np_type) {
   }
 }
 
-PyObject* PyShapeInfoFromXlaShape(const Shape& shape) {
+Safe_PyObjectPtr PyShapeInfoFromXlaShape(const Shape& shape) {
   int np_typenum = PrimitiveTypeToNumpyType(shape.element_type());
   PyArray_Descr* np_dtype = PyArray_DescrFromType(np_typenum);
 
-  PyObject* dimensions;
-  if (ShapeUtil::IsTuple(shape)) {
+  Safe_PyObjectPtr dimensions;
+  if (shape.IsTuple()) {
     int num_elements = ShapeUtil::TupleElementCount(shape);
-    dimensions = PyTuple_New(ShapeUtil::TupleElementCount(shape));
+    dimensions = make_safe(PyTuple_New(ShapeUtil::TupleElementCount(shape)));
     for (int i = 0; i < num_elements; ++i) {
       PyTuple_SET_ITEM(
-          dimensions, i,
-          PyShapeInfoFromXlaShape(ShapeUtil::GetTupleElementShape(shape, i)));
+          dimensions.get(), i,
+          PyShapeInfoFromXlaShape(ShapeUtil::GetTupleElementShape(shape, i))
+              .release());
     }
   } else {
-    int rank = ShapeUtil::Rank(shape);
-    dimensions = PyTuple_New(rank);
+    int rank = shape.rank();
+    dimensions = make_safe(PyTuple_New(rank));
     for (int i = 0; i < rank; ++i) {
-      PyTuple_SET_ITEM(dimensions, i,
+      PyTuple_SET_ITEM(dimensions.get(), i,
                        LongToPyIntOrPyLong(ShapeUtil::GetDimension(shape, i)));
     }
   }
-  return PyTuple_Pack(2, np_dtype, dimensions);
+  return make_safe(PyTuple_Pack(2, np_dtype, dimensions.release()));
+}
+
+Safe_PyObjectPtr PyProgramShapeInfoFromXlaProgramShape(
+    const ProgramShape& shape) {
+  Safe_PyObjectPtr arg_shapes = make_safe(PyTuple_New(shape.parameters_size()));
+  for (int i = 0; i < shape.parameters_size(); ++i) {
+    PyTuple_SET_ITEM(arg_shapes.get(), i,
+                     PyShapeInfoFromXlaShape(shape.parameters(i)).release());
+  }
+
+  Safe_PyObjectPtr result_shape = PyShapeInfoFromXlaShape(shape.result());
+  return make_safe(
+      PyTuple_Pack(2, arg_shapes.release(), result_shape.release()));
 }
 
 // Precondition: o->ob_type == &PyArrayDescr_Type
@@ -344,26 +367,30 @@ StatusOr<OpMetadata> OpMetadataFromPyObject(PyObject* o) {
   return result;
 }
 
-PyObject* PyObjectFromXlaLiteral(const LiteralSlice& literal) {
-  if (ShapeUtil::IsTuple(literal.shape())) {
+StatusOr<Safe_PyObjectPtr> PyObjectFromXlaLiteral(const LiteralSlice& literal) {
+  if (literal.shape().IsTuple()) {
     int num_elements = ShapeUtil::TupleElementCount(literal.shape());
-    PyObject* tuple = PyTuple_New(num_elements);
+    std::vector<Safe_PyObjectPtr> elems(num_elements);
+    for (int i = 0; i < num_elements; i++) {
+      TF_ASSIGN_OR_RETURN(elems[i],
+                          PyObjectFromXlaLiteral(LiteralSlice(literal, {i})));
+    }
+    Safe_PyObjectPtr tuple = make_safe(PyTuple_New(num_elements));
     for (int i = 0; i < num_elements; i++) {
-      PyTuple_SET_ITEM(tuple, i,
-                       PyObjectFromXlaLiteral(LiteralSlice(literal, {i})));
+      PyTuple_SET_ITEM(tuple.get(), i, elems[i].release());
     }
     return tuple;
   } else {
-    int rank = ShapeUtil::Rank(literal.shape());
+    int rank = literal.shape().rank();
     std::vector<long> dimensions(rank);  // NOLINT - PyArray requires a long*
     for (int i = 0; i < rank; i++) {
       dimensions[i] = ShapeUtil::GetDimension(literal.shape(), i);
     }
     int np_type = PrimitiveTypeToNumpyType(literal.shape().element_type());
-    PyObject* array =
-        PyArray_EMPTY(rank, dimensions.data(), np_type, /*fortran=*/0);
-    CopyLiteralToNumpyArray(np_type, literal,
-                            reinterpret_cast<PyArrayObject*>(array));
+    Safe_PyObjectPtr array = make_safe(
+        PyArray_EMPTY(rank, dimensions.data(), np_type, /*fortran=*/0));
+    TF_RETURN_IF_ERROR(CopyLiteralToNumpyArray(
+        np_type, literal, reinterpret_cast<PyArrayObject*>(array.get())));
     return array;
   }
 }
@@ -403,6 +430,12 @@ Status CopyNumpyArrayToLiteral(int np_type, PyArrayObject* py_array,
     case NPY_BOOL:
       CopyNumpyArrayToLiteral<bool>(py_array, literal);
       break;
+    case NPY_INT8:
+      CopyNumpyArrayToLiteral<int8>(py_array, literal);
+      break;
+    case NPY_INT16:
+      CopyNumpyArrayToLiteral<int16>(py_array, literal);
+      break;
     case NPY_INT32:
       CopyNumpyArrayToLiteral<int32>(py_array, literal);
       break;
@@ -412,6 +445,9 @@ Status CopyNumpyArrayToLiteral(int np_type, PyArrayObject* py_array,
     case NPY_UINT8:
       CopyNumpyArrayToLiteral<uint8>(py_array, literal);
       break;
+    case NPY_UINT16:
+      CopyNumpyArrayToLiteral<uint16>(py_array, literal);
+      break;
     case NPY_UINT32:
       CopyNumpyArrayToLiteral<uint32>(py_array, literal);
       break;
@@ -430,6 +466,9 @@ Status CopyNumpyArrayToLiteral(int np_type, PyArrayObject* py_array,
     case NPY_COMPLEX64:
       CopyNumpyArrayToLiteral<complex64>(py_array, literal);
       break;
+    case NPY_COMPLEX128:
+      CopyNumpyArrayToLiteral<complex128>(py_array, literal);
+      break;
     default:
       return InvalidArgument(
           "No XLA literal container for Numpy type number: %d", np_type);
@@ -437,12 +476,18 @@ Status CopyNumpyArrayToLiteral(int np_type, PyArrayObject* py_array,
   return Status::OK();
 }
 
-void CopyLiteralToNumpyArray(int np_type, const LiteralSlice& literal,
-                             PyArrayObject* py_array) {
+Status CopyLiteralToNumpyArray(int np_type, const LiteralSlice& literal,
+                               PyArrayObject* py_array) {
   switch (np_type) {
     case NPY_BOOL:
       CopyLiteralToNumpyArray<bool>(literal, py_array);
       break;
+    case NPY_INT8:
+      CopyLiteralToNumpyArray<int8>(literal, py_array);
+      break;
+    case NPY_INT16:
+      CopyLiteralToNumpyArray<int16>(literal, py_array);
+      break;
     case NPY_INT32:
       CopyLiteralToNumpyArray<int32>(literal, py_array);
       break;
@@ -452,6 +497,9 @@ void CopyLiteralToNumpyArray(int np_type, const LiteralSlice& literal,
     case NPY_UINT8:
       CopyLiteralToNumpyArray<uint8>(literal, py_array);
       break;
+    case NPY_UINT16:
+      CopyLiteralToNumpyArray<uint16>(literal, py_array);
+      break;
     case NPY_UINT32:
       CopyLiteralToNumpyArray<uint32>(literal, py_array);
       break;
@@ -470,9 +518,14 @@ void CopyLiteralToNumpyArray(int np_type, const LiteralSlice& literal,
     case NPY_COMPLEX64:
       CopyLiteralToNumpyArray<complex64>(literal, py_array);
       break;
+    case NPY_COMPLEX128:
+      CopyLiteralToNumpyArray<complex128>(literal, py_array);
+      break;
     default:
-      LOG(FATAL) << "No XLA literal container for Numpy type" << np_type;
+      return InvalidArgument(
+          "No XLA literal container for Numpy type number: %d", np_type);
   }
+  return Status::OK();
 }
 
 PyObject* LongToPyIntOrPyLong(long x) {  // NOLINT
@@ -514,6 +567,92 @@ PyObject* PyNumberToPyInt(PyObject* o) {
 
 }  // namespace numpy
 
+bool GetIntAttr(PyObject* o, const char* field, int64* result) {
+  PyObject* fo = PyObject_GetAttrString(o, field);
+  if (!fo) {
+    return false;
+  }
+  const int64 value = numpy::PyIntOrPyLongToLong(fo);
+  if (value == -1 && PyErr_Occurred()) {
+    Py_DECREF(fo);
+    return false;
+  }
+  Py_DECREF(fo);
+  *result = value;
+  return true;
+}
+
+// Returns "ok"; true if there is no error, false if there was an error.
+bool HandleStringAttribute(PyObject* o, const char* attr_name,
+                           std::function<void(string s)> f) {
+  if (!PyObject_HasAttrString(o, attr_name)) {
+    return true;  // It's ok for the object to not have the attribute.
+  }
+  PyObject* attr = PyObject_GetAttrString(o, attr_name);
+  if (attr == nullptr) {
+    return false;  // An error occurred getting the attribute.
+  }
+  if (attr == Py_None) {
+    Py_DECREF(attr);
+    return true;  // The attribute is None, which we consider ok.
+  }
+#if PY_MAJOR_VERSION < 3
+  if (!PyString_Check(attr)) {
+    string message = absl::StrFormat("%s must be a string or none; got %s",
+                                     attr_name, numpy::PyObjectCppRepr(attr));
+    PyErr_SetString(PyExc_TypeError, message.c_str());
+    Py_DECREF(attr);
+    return false;  // Type error, not ok.
+  }
+  f(PyString_AsString(attr));
+#else
+  if (!PyBytes_Check(attr)) {
+    string message = absl::StrFormat("%s must be a string or none; got %s",
+                                     attr_name, numpy::PyObjectCppRepr(attr));
+    PyErr_SetString(PyExc_TypeError, message.c_str());
+    Py_DECREF(attr);
+    return false;  // Type error, not ok.
+  }
+  f(PyBytes_AsString(attr));
+#endif
+
+  Py_DECREF(attr);
+  return true;  // Handled string attribute, ok!
+}
+
+bool HandleRepeatedInt64Attribute(
+    PyObject* o, const char* attr_name,
+    tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>* field) {
+  PyObject* seq = PyObject_GetAttrString(o, attr_name);
+  if (!seq) {
+    return false;
+  }
+
+  int length = PySequence_Size(seq);
+  if (length == -1) {
+    Py_DECREF(seq);
+    return false;
+  }
+
+  for (int i = 0; i < length; ++i) {
+    PyObject* item = PySequence_GetItem(seq, i);
+    if (!item) {
+      Py_DECREF(seq);
+      return false;
+    }
+    const int64 dimension = numpy::PyIntOrPyLongToLong(item);
+    if (dimension == -1 && PyErr_Occurred()) {
+      Py_DECREF(item);
+      Py_DECREF(seq);
+      return false;
+    }
+    *field->Add() = dimension;
+    Py_DECREF(item);
+  }
+  Py_DECREF(seq);
+  return true;
+}
+
 }  // namespace swig
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/python/numpy_bridge.h b/tensorflow/compiler/xla/python/numpy_bridge.h
index 40ff2d9ad214cc4dcad42234fa296834cbc92882..eff8cda334f00050605febad66a61aa1c518c500 100644
--- a/tensorflow/compiler/xla/python/numpy_bridge.h
+++ b/tensorflow/compiler/xla/python/numpy_bridge.h
@@ -36,6 +36,16 @@ namespace swig {
 
 namespace numpy {
 
+struct PyDecrefDeleter {
+  void operator()(PyObject* p) const { Py_DECREF(p); }
+};
+
+// Safe container for an owned PyObject. On destruction, the reference count of
+// the contained object will be decremented.
+using Safe_PyObjectPtr = std::unique_ptr<PyObject, PyDecrefDeleter>;
+
+Safe_PyObjectPtr make_safe(PyObject* object);
+
 // Maps XLA primitive types (PRED, S8, F32, ..., and TUPLE) to numpy
 // dtypes (NPY_BOOL, NPY_INT8, NPY_FLOAT32, ..., and NPY_OBJECT), and
 // vice versa.
@@ -54,7 +64,13 @@ bool NumpyTypeIsValid(int np_type);
 // providing the array dimensions.
 //
 // The return value is a new reference.
-PyObject* PyShapeInfoFromXlaShape(const Shape& shape);
+Safe_PyObjectPtr PyShapeInfoFromXlaShape(const Shape& shape);
+
+// Returns a pair of (arg_shapes, result_shape), where arg_shapes is a tuple
+// of argument shapes and result_shape is the result shape. Each shape is as
+// described in in PyShapeInfoFromXlaShape's comment.
+Safe_PyObjectPtr PyProgramShapeInfoFromXlaProgramShape(
+    const ProgramShape& shape);
 
 // Converts a Python object with a method interface mathing that of
 // xla_client.Shape into an XLA Shape object.
@@ -74,7 +90,7 @@ StatusOr<OpMetadata> OpMetadataFromPyObject(PyObject* o);
 // array data.
 //
 // The return value is a new reference.
-PyObject* PyObjectFromXlaLiteral(const LiteralSlice& literal);
+StatusOr<Safe_PyObjectPtr> PyObjectFromXlaLiteral(const LiteralSlice& literal);
 
 // Converts a Numpy ndarray or a nested Python tuple thereof to a
 // corresponding XLA literal.
@@ -90,8 +106,8 @@ StatusOr<Literal> XlaLiteralFromPyObject(PyObject* o);
 Status CopyNumpyArrayToLiteral(int np_type, PyArrayObject* py_array,
                                Literal* literal);
 
-void CopyLiteralToNumpyArray(int np_type, const LiteralSlice& literal,
-                             PyArrayObject* py_array);
+Status CopyLiteralToNumpyArray(int np_type, const LiteralSlice& literal,
+                               PyArrayObject* py_array);
 
 template <typename NativeT>
 void CopyNumpyArrayToLiteral(PyArrayObject* py_array, Literal* literal) {
@@ -120,6 +136,18 @@ PyObject* PyNumberToPyInt(PyObject* o);
 
 }  // namespace numpy
 
+// Miscellaneous swig helpers that don't have a better home.
+
+bool GetIntAttr(PyObject* o, const char* field, int64* result);
+
+// Returns "ok"; true if there is no error, false if there was an error.
+bool HandleStringAttribute(PyObject* o, const char* attr_name,
+                           std::function<void(string s)> f);
+
+bool HandleRepeatedInt64Attribute(
+    PyObject* o, const char* attr_name,
+    tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>* field);
+
 }  // namespace swig
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/python/pywrap_xla_exported_symbols.lds b/tensorflow/compiler/xla/python/pywrap_xla_exported_symbols.lds
new file mode 100644
index 0000000000000000000000000000000000000000..ef77ed3d95850fdfc7145e6fe1df4833d20bb7df
--- /dev/null
+++ b/tensorflow/compiler/xla/python/pywrap_xla_exported_symbols.lds
@@ -0,0 +1,2 @@
+_PyInit__pywrap_xla
+_init_pywrap_xla
diff --git a/tensorflow/compiler/xla/python/pywrap_xla_version_script.lds b/tensorflow/compiler/xla/python/pywrap_xla_version_script.lds
new file mode 100644
index 0000000000000000000000000000000000000000..d31cfce7be7b6accf05ef77f3485904099965afc
--- /dev/null
+++ b/tensorflow/compiler/xla/python/pywrap_xla_version_script.lds
@@ -0,0 +1,6 @@
+xla {
+  global:
+    PyInit_*;
+  local:
+    *;
+};
diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py
index c91a2aaf56dfe2127168628c78e0c4b868a28055..9019a979a61c6ebb62adaa5503560c604e2b30f8 100644
--- a/tensorflow/compiler/xla/python/xla_client.py
+++ b/tensorflow/compiler/xla/python/xla_client.py
@@ -12,12 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""An in-process, local XLA client in Python, supporting AOT compilation."""
+"""An XLA client in Python, supporting AOT compilation."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import abc
 import collections
 import enum  # pylint: disable=g-bad-import-order
 import inspect
@@ -33,13 +34,32 @@ from tensorflow.compiler.xla import xla_data_pb2
 from tensorflow.compiler.xla.python import pywrap_xla as c_api
 from tensorflow.compiler.xla.service import hlo_pb2
 
+# Import the XRT backend, if available.
+try:
+  # pylint: disable=g-import-not-at-top
+  from tensorflow.compiler.xla.python import pywrap_xrt as xrt_api
+except ImportError:
+  xrt_api = None
+
 
 # Most functions are snake_case for consistency with other modules, whereas
-# method names of ComputationBuilder and LocalComputation are CamelCase for
+# method names of ComputationBuilder and Computation are CamelCase for
 # consistency with XLA.
 # pylint: disable=invalid-name
 
 
+# Version of the XLA Python client.
+#
+# JAX packages the XLA python plugin as a binary pip module (jaxlib) that is
+# packaged separately from the Python code that consumes it (jax).
+#
+# We occasionally need to make backwards-incompatible changes to jaxlib, in
+# which case we need to be able to detect when incompatible versions are
+# installed.
+def version():
+  return (0, 1, 8)
+
+
 _OP_METADATA_FIELDS = [
     'op_type',
     'op_name',
@@ -49,13 +69,163 @@ _OP_METADATA_FIELDS = [
 OpMetadata = collections.namedtuple('OpMetadata', _OP_METADATA_FIELDS)
 
 
+@six.add_metaclass(abc.ABCMeta)
+class Backend(object):
+  """Abstract base class for XLA backends."""
+
+  @abc.abstractmethod
+  def device_count(self):
+    """Returns the number of devices known to the backend."""
+
+  @abc.abstractmethod
+  def buffer_from_pyval(self, pyval, device=0):
+    """Allocates a fresh buffer and populates it with `pyval`."""
+
+  @abc.abstractmethod
+  def delete_buffer(self, c_buffer):
+    """Deletes buffer `c_buffer`."""
+
+  @abc.abstractmethod
+  def destructure_tuple(self, c_buffer):
+    """Destructures a tuple buffer into a sequence of buffers."""
+
+  @abc.abstractmethod
+  def compile(self, computation, argument_shapes, result_shape,
+              compile_options):
+    """Compiles a computation. Returns an executable."""
+
+  @abc.abstractmethod
+  def delete_executable(self, executable):
+    """Deletes an executable."""
+
+  @abc.abstractmethod
+  def execute(self, executable, args):
+    """Runs an executable without replication."""
+
+  @abc.abstractmethod
+  def execute_replicated(self, executable, per_replica_args):
+    """Runs an executable in a replicated manner."""
+
+
+def _maybe_encode_string(s):
+  if six.PY3:
+    return s.encode('utf-8')
+  else:
+    return s
+
+
+class XlaLocalBackend(Backend):
+  """XLA backend implemented using the in-process xla::LocalClient API."""
+
+  def __init__(self, platform=None):
+    platform = platform or _get_default_platform_name()
+    self.client = c_api.LocalClient.Get(_maybe_encode_string(platform))
+    self._delete_buffer = c_api.DeleteLocalShapedBuffer
+    self._delete_executable = c_api.DeleteLocalExecutable
+
+  def device_count(self):
+    return self.client.DeviceCount()
+
+  def buffer_from_pyval(self, pyval, device=0):
+    return c_api.LocalShapedBuffer.FromLiteral(pyval, None, self.client, device)
+
+  def delete_buffer(self, c_buffer):
+    self._delete_buffer(c_buffer)
+
+  def destructure_tuple(self, c_buffer):
+    result = c_buffer.DestructureTuple()
+    return [result.Release(i) for i in xrange(result.size())]
+
+  def compile(self, c_computation, argument_shapes, result_shape,
+              compile_options):
+    return c_computation.Compile(argument_shapes, compile_options, self.client)
+
+  def delete_executable(self, executable):
+    self._delete_executable(executable)
+
+  def execute(self, executable, args):
+    return executable.Execute(args)
+
+  def execute_replicated(self, executable, per_replica_args):
+    output_buffer_tup = executable.ExecutePerReplica(per_replica_args)
+    size = output_buffer_tup.size()
+    return [output_buffer_tup.Release(i) for i in xrange(size)]
+
+
+class XrtBackend(Backend):
+  """XLA backend implemented using XRT."""
+
+  def __init__(self, target):
+    self.target = target
+    self._delete_buffer = xrt_api.DeleteXrtAllocation
+    self._delete_executable = xrt_api.DeleteXrtExecutable
+
+  def device_count(self):
+    return 1  # Multidevice execution not implemented.
+
+  def buffer_from_pyval(self, pyval, device=0):
+    if device != 0:
+      raise NotImplementedError(
+          'Multi-replica execution is not yet supported via the XRT backend.')
+    return xrt_api.XrtAllocation.FromLiteral(pyval,
+                                             _maybe_encode_string(self.target))
+
+  def delete_buffer(self, c_buffer):
+    self._delete_buffer(c_buffer)
+
+  def destructure_tuple(self, c_buffer):
+    result = xrt_api.DestructureXrtAllocationTuple(
+        c_buffer, _maybe_encode_string(self.target))
+    return [result.Release(i) for i in xrange(result.size())]
+
+  def compile(self, c_computation, argument_shapes, result_shape,
+              compile_options):
+    return xrt_api.XrtExecutable.CompileForXrt(
+        c_computation.GetSerializedProto(), argument_shapes, result_shape,
+        _maybe_encode_string(self.target))
+
+  def delete_executable(self, executable):
+    self._delete_executable(executable)
+
+  def execute(self, executable, args):
+    return executable.Execute(args)
+
+  def execute_replicated(self, executable, per_replica_args):
+    if len(per_replica_args) != 1:
+      raise NotImplementedError(
+          'Multi-replica execution is not yet supported via the XRT backend.')
+    return [executable.Execute(per_replica_args[0])]
+
+
+_default_platform_name = 'Host'
+_default_backend = None
+
+
+def _get_default_platform_name():
+  return _default_platform_name
+
+
+def _get_default_local_backend():
+  global _default_backend
+  global _default_platform_name
+  if _default_backend is None:
+    _default_backend = XlaLocalBackend(_default_platform_name)
+  return _default_backend
+
+
 class BackendType(enum.Enum):
   XLA_LOCAL = 1
   XRT = 2
 
 
-BackendSpec = collections.namedtuple('Backend', ('backend_type', 'target'))
-XLA_LOCAL_BACKEND = BackendSpec(BackendType.XLA_LOCAL, 'local')
+def BackendSpec(backend, target):
+  """Compatibility wrapper to support older clients. Do not use in new code."""
+  if backend == BackendType.XLA_LOCAL:
+    return _get_default_local_backend()
+  elif backend == BackendType.XRT:
+    return XrtBackend(target)
+  else:
+    raise ValueError('Unknown backend {}'.format(backend))
 
 
 def OpMetadataToProto(pyobj):
@@ -78,13 +248,6 @@ def CurrentSourceInfoMetadata(op_type=None, op_name=None, skip_frames=1):
       source_line=lineno)
 
 
-def _maybe_encode_string(s):
-  if six.PY3:
-    return s.encode('utf-8')
-  else:
-    return s
-
-
 class PaddingType(enum.Enum):
   VALID = 1
   SAME = 2
@@ -122,6 +285,7 @@ def _convert_padding_type_to_pad_values(padding_type, lhs_dims, rhs_dims,
 
 _UNARY_OPS = [
     'Not',
+    'Clz',
     'Abs',
     'Exp',
     'Expm1',
@@ -199,6 +363,7 @@ XLA_ELEMENT_TYPE_TO_DTYPE = {
     xla_data_pb2.F32: np.dtype('float32'),
     xla_data_pb2.F64: np.dtype('float64'),
     xla_data_pb2.C64: np.dtype('complex64'),
+    xla_data_pb2.C128: np.dtype('complex128'),
     xla_data_pb2.TUPLE: np.dtype(np.object),
 }
 
@@ -222,33 +387,18 @@ class LocalBuffer(object):
   means the referent is in device memory.
   """
 
-  def __init__(self, c_buffer, backend, replica):
+  def __init__(self, c_buffer, backend, device):
     self.c_buffer = c_buffer
     self._backend = backend
-    self._replica = replica
-    if backend.backend_type == BackendType.XRT:
-      self._delete = c_api.DeleteXrtAllocation
-    else:
-      self._delete = c_api.DeleteLocalShapedBuffer
+    self._device = device
 
   @staticmethod
-  def from_pyval(pyval, replica=0, backend=XLA_LOCAL_BACKEND):
+  def from_pyval(pyval, device=0, backend=None):
     """Allocate and copy to XLA the given python value."""
+    backend = backend or _get_default_local_backend()
     pyval = require_numpy_array_layout(pyval)
-    num_replicas = get_replica_count()
-    if not 0 <= replica < num_replicas:
-      raise ValueError(
-          'Attempt to place buffer on replica {} when the replica count is {}'
-          .format(replica, num_replicas))
-    if backend.backend_type == BackendType.XRT:
-      if replica != 0:
-        raise NotImplementedError(
-            'Multi-replica execution is not yet supported via the XRT backend.')
-      cbuf = c_api.XrtAllocation.FromLiteral(
-          pyval, _maybe_encode_string(backend.target))
-    else:
-      cbuf = c_api.LocalShapedBuffer.FromLiteral(pyval, None, replica)
-    return LocalBuffer(cbuf, backend, replica)
+    cbuf = backend.buffer_from_pyval(pyval, device)
+    return LocalBuffer(cbuf, backend, device)
 
   def to_py(self):
     return self.c_buffer.ToLiteral()
@@ -256,29 +406,22 @@ class LocalBuffer(object):
   def shape(self):
     return _wrap_shape(self.c_buffer.shape())
 
-  def replica(self):
-    return self._replica
+  def device(self):
+    return self._device
 
   def delete(self):
     if self.c_buffer is not None:
-      self._delete(self.c_buffer)
+      self._backend.delete_buffer(self.c_buffer)
       self.c_buffer = None
 
   def destructure(self):
     """Assuming a tuple buffer, unpack it into constituent tuple elements."""
     assert self.c_buffer is not None
-    if self._backend.backend_type == BackendType.XRT:
-      result = c_api.DestructureXrtAllocationTuple(
-          self.c_buffer, _maybe_encode_string(self._backend.target))
-    else:
-      result = c_api.DestructureLocalShapedBufferTuple(self.c_buffer)
+    result = self._backend.destructure_tuple(self.c_buffer)
     self.delete()
-    size = result.size()
-    destructured = tuple(
-        LocalBuffer(
-            result.Release(i), replica=self._replica, backend=self._backend)
-        for i in xrange(size))
-    return destructured
+    return tuple(
+        LocalBuffer(sub_buffer, device=self._device, backend=self._backend)
+        for sub_buffer in result)
 
   def is_deleted(self):
     return self.c_buffer is None
@@ -415,7 +558,7 @@ class Shape(object):
       assert mtm is None, self
     if mtm is not None:
       assert self.rank() == len(mtm), self
-      assert sorted(mtm) == range(len(mtm)), self
+      assert sorted(mtm) == list(range(len(mtm))), self
 
   def update_minor_to_major(self, minor_to_major):
     if not self.is_array():
@@ -427,6 +570,34 @@ class Shape(object):
     updated._check_minor_to_major()  # pylint: disable=protected-access
     return updated
 
+  def with_major_to_minor_layout_if_absent(self):
+    """Returns a copy of a shape with missing layouts set to major-to-minor."""
+
+    def f(a):
+      if a.minor_to_major():
+        return None
+      return a.update_minor_to_major(tuple(xrange(a.rank() - 1, -1, -1)))
+
+    return self.map_leaves(f)
+
+  def serialize(self, proto):
+    """Serializes 'shape' into proto."""
+    if self.is_tuple():
+      proto.element_type = xla_data_pb2.TUPLE
+      for shape in self.tuple_shapes():
+        shape.serialize(proto.tuple_shapes.add())
+    else:
+      proto.element_type = dtype_to_etype(self.element_type())
+      proto.dimensions.extend(self.dimensions())
+      proto.is_dynamic_dimension.extend([False for _ in self.dimensions()])
+      if self.minor_to_major():
+        proto.layout.format = xla_data_pb2.DENSE
+        proto.layout.minor_to_major.extend(self.minor_to_major())
+
+
+ProgramShape = collections.namedtuple('ProgramShape',
+                                      ('parameter_shapes', 'result_shape'))
+
 
 def _wrap_shape(shape_info):
   dtype, dims = shape_info
@@ -438,6 +609,12 @@ def _wrap_shape(shape_info):
     return Shape.array_shape(dtype, dims)
 
 
+def _wrap_program_shape(shape_info):
+  arg_shapes, result_shape = shape_info
+  return ProgramShape([_wrap_shape(arg) for arg in arg_shapes],
+                      _wrap_shape(result_shape))
+
+
 def require_numpy_array_layout(value):
   if isinstance(value, tuple):
     return tuple(require_numpy_array_layout(x) for x in value)
@@ -458,9 +635,10 @@ class CompileOptions(object):
     self.dump_unoptimized_hlo_proto_to = None
     self.dump_per_pass_hlo_proto_to = None
     self.hlo_profile = False
+    self.num_replicas = get_replica_count()
 
 
-def transfer_to_infeed(value, replica_number=None):
+def transfer_to_infeed(value, device_ordinal=0):
   """Transfers the given value into the XLA infeed queue.
 
   XLA's infeed queue is a single queue that feeds the "XLA virtual machine" with
@@ -470,64 +648,50 @@ def transfer_to_infeed(value, replica_number=None):
   Args:
     value: the value that the caller would like to enqueue into the XLA infeed
       queue
-    replica_number: the replica number to infeed the value to -- if not
-      provided, then the default replica (trivially replica 0) is used.
+    device_ordinal: the device to infeed the value to. Each device has a
+      distinct infeed queue.
   """
-  if replica_number is None:
-    c_api.TransferToInfeedLocal(require_numpy_array_layout(value))
-  else:
-    c_api.TransferToInfeedLocalReplica(
-        require_numpy_array_layout(value), replica_number)
+  # TODO(phawkins): support non-default backends.
+  backend = _get_default_local_backend()
+  backend.client.TransferToInfeed(
+      require_numpy_array_layout(value), device_ordinal)
 
 
-def transfer_from_outfeed(shape, replica_number=None):
-  """Transfers a literal of the given shape from replica_number's outfeed.
+def transfer_from_outfeed(shape, device_ordinal=0):
+  """Transfers a literal of the given shape from `device_ordinal`'s outfeed.
 
   Args:
     shape: The shape of the value to transfer from outfeed.
-    replica_number: The replica number ordinal to transfer the outfeed value
-      from. (Each replica has a distinct outfeed queue.)
+    device_ordinal: The device ordinal to transfer the outfeed value from. Each
+      device has a distinct outfeed queue..
 
   Returns:
     The literal value that is produced from the outfeed queue.
   """
-  return c_api.TransferFromOutfeedLocalReplica(shape, replica_number or 0)
+  # TODO(phawkins): support non-default backends.
+  backend = _get_default_local_backend()
+  return backend.client.TransferFromOutfeed(shape, device_ordinal)
 
 
-class LocalComputation(object):
-  """Python wrapper for a local XLA Computation.
+class Computation(object):
+  """Python wrapper for an XLA Computation.
 
-  A LocalComputation can be executed if it is compiled. Otherwise, it
-  can still be used as a Computation where required by the
-  ComputationBuilder methods.
+  A Computation can be compiled to form an Executable, or used as a
+  subcomputation in ComputationBuilder methods.
   """
 
-  def __init__(self, c_computation, is_compiled, backend=XLA_LOCAL_BACKEND):
+  def __init__(self, c_computation, backend=None):
     self._c_computation = c_computation
+    # The backend argument is deprecated. Pass a backend to Compile() instead.
     self._backend = backend
-    self._is_compiled = is_compiled
-
-    # Ensure a reference to C-based destructor for use in __del__.
-    if is_compiled:
-      if backend.backend_type == BackendType.XRT:
-        assert isinstance(c_computation, c_api.CompiledXrtComputation)
-        self._delete = c_api.DeleteCompiledXrtComputation
-      else:
-        assert isinstance(c_computation, c_api.CompiledLocalComputation)
-        self._delete = c_api.DeleteCompiledLocalComputation
-    else:
-      assert isinstance(c_computation, c_api.LocalComputation)
-      self._delete = c_api.DeleteLocalComputation
+    self._delete_computation = c_api.DeleteComputation
 
   @property
   def computation(self):
-    if self._is_compiled:
-      raise ValueError(
-          'Attempt to read the XLA computation of a compiled LocalComputation.')
     return self._c_computation
 
   def GetProto(self):
-    """Get the HloModuleProto proto object in this local computation.
+    """Get the HloModuleProto proto object in this computation.
 
     Returns:
        An HloModuleProto proto object that has the whole-graph information.
@@ -536,30 +700,41 @@ class LocalComputation(object):
     proto = hlo_pb2.HloModuleProto.FromString(serialized)
     return proto
 
-  def Compile(self, argument_shapes=(), compile_options=None, layout_fn=None):
-    """Compiles an un-compiled local computation.
+  def GetHloText(self):
+    """Get the textual HLO representation of this computation.
+
+    Returns:
+       A string containing the textual HLO.
+    """
+    return self.computation.GetHloText()
+
+  def GetHloDotGraph(self):
+    """Get a Graphviz Dot representation of this computation.
+
+    Returns:
+       A string containing the graphviz dot graph.
+    """
+    return self.computation.GetHloDotGraph()
 
-    Local computations are the result of a "LocalComputationBuild'ing" process
-    -- they start in uncompiled form, and via a call to Compile() turn into a
-    compiled local computation.
+  def Compile(self, argument_shapes=(), compile_options=None, layout_fn=None,
+              backend=None):
+    """Compiles a computation.
 
-    Raises:
-      ValueError: if this is already a compiled local computation.
+    Computations are the result of a "ComputationBuild'ing" process.
 
     Arguments:
       argument_shapes: parameter shapes -- they are first laid out by layout_fn
         if layout_fn is provided. Otherwise, the default layout for those shapes
         will be used.
-      compile_options: options to use for compilation, includes an optional
-        laid out result shape for the computation.
+      compile_options: options to use for compilation, includes an optional laid
+        out result shape for the computation.
       layout_fn: lambda that is used to lay out the argument/result shapes.
+      backend: a `Backend` for which an executable should be generated.
 
     Returns:
-      A newly *compiled* local computation instance.
+      A Executable instance.
     """
-    if self._is_compiled:
-      raise ValueError('Attempt to compile a compiled local XLA computation.')
-
+    backend = backend or self._backend or _get_default_local_backend()
     result_shape = _wrap_shape(self.computation.GetReturnValueShape())
 
     if layout_fn:
@@ -572,32 +747,52 @@ class LocalComputation(object):
 
     compile_options = compile_options or CompileOptions()
     compile_options.result_shape = result_shape
-    if self._backend.backend_type == BackendType.XRT:
-      c = self.computation.CompileForXrt(
-          argument_shapes, _maybe_encode_string(self._backend.target))
-    else:
-      c = self.computation.Compile(argument_shapes, compile_options)
-    return LocalComputation(c, is_compiled=True, backend=self._backend)
+    c = backend.compile(self.computation, argument_shapes, result_shape,
+                        compile_options)
+    return Executable(c, backend=backend)
 
   def CompileWithExampleArguments(self,
                                   arguments=(),
                                   compile_options=None,
-                                  layout_fn=None):
+                                  layout_fn=None,
+                                  backend=None):
     return self.Compile(
         argument_shapes=[Shape.from_pyval(arg) for arg in arguments],
         compile_options=compile_options,
-        layout_fn=layout_fn)
+        layout_fn=layout_fn,
+        backend=backend)
+
+  def GetProgramShape(self):
+    return _wrap_program_shape(self._c_computation.GetProgramShape())
 
   def GetReturnValueShape(self):
     return _wrap_shape(self._c_computation.GetReturnValueShape())
 
+  def __del__(self):
+    if self._c_computation:
+      self._delete_computation(self._c_computation)
+
+
+class Executable(object):
+  """Python wrapper for an XLA Executable."""
+
+  def __init__(self, c_executable, backend=None):
+    self._c_executable = c_executable
+    self._device_ordinals = c_executable.DeviceOrdinals()
+    self._backend = backend
+
+  def DeviceOrdinals(self):
+    """Returns a list containing the device ordinals for each replica."""
+    return self._device_ordinals
+
   def Execute(self, arguments=(), check_for_deleted_args=True):
     """Execute on one replica with LocalBuffer arguments and return value."""
     if check_for_deleted_args and any(arg.is_deleted() for arg in arguments):
       raise ValueError('Executing with deleted local buffer argument')
     raw_args = [arg.c_buffer for arg in arguments]
-    output_buffer = self._c_computation.Execute(raw_args)
-    return LocalBuffer(output_buffer, backend=self._backend, replica=0)
+    output_buffer = self._backend.execute(self._c_executable, raw_args)
+    return LocalBuffer(
+        output_buffer, backend=self._backend, device=self._device_ordinals[0])
 
   def ExecutePerReplica(self, arguments=None):
     """Execute on many replicas with LocalBuffer arguments and return value.
@@ -607,14 +802,12 @@ class LocalComputation(object):
         sequence comprises the arguments for execution on the i'th replica.
 
     Returns:
-      A list of the computation's outputs on each replica, as a LocalBuffer. If
+      A list of the computation's outputs for each replica, as a LocalBuffer. If
       a shallow sequence of arguments was passed in for `arguments`, then the
       sole, zero'th replica's output is returned instead, as a LocalBuffer.
     """
-    if not self._is_compiled:
-      raise ValueError('Cannot execute an uncompiled local XLA computation.')
     if arguments is None:
-      arguments = ((),) * get_replica_count()
+      arguments = ((),) * len(self._device_ordinals)
     else:
       arguments = [list(replica_args) for replica_args in arguments]
 
@@ -623,37 +816,35 @@ class LocalComputation(object):
       for arg in replica_args:
         if arg.is_deleted():
           raise ValueError('Executing with deleted local buffer argument')
-        if arg.replica() != replica:
+        if arg.device() != self._device_ordinals[replica]:
           raise ValueError(
-              'Executing on replica {} with argument from replica {}'.format(
-                  replica, arg.replica()))
+              'Executing on device {} with argument from device {}'.format(
+                  self._device_ordinals[replica], arg.device()))
 
     # Pull out argument buffer handles
+    # pylint: disable=g-complex-comprehension
     stripped_args = [
         [arg.c_buffer for arg in replica_args] for replica_args in arguments
     ]
 
     # Execute
-    if self._backend.backend_type == BackendType.XRT:
-      if len(stripped_args) > 1:
-        raise NotImplementedError(
-            'Multi-replica execution is not yet supported via the XRT backend.')
-      output_buffers = [self._c_computation.Execute(stripped_args[0])]
-    else:
-      output_buffer_tup = self._c_computation.ExecutePerReplica(stripped_args)
-      size = output_buffer_tup.size()
-      output_buffers = [output_buffer_tup.Release(i) for i in xrange(size)]
+    output_buffers = self._backend.execute_replicated(self._c_executable,
+                                                      stripped_args)
 
     # Wrap output handles in LocalBuffer instances
     return tuple(
-        LocalBuffer(output_buffer, backend=self._backend, replica=replica)
+        LocalBuffer(
+            output_buffer,
+            backend=self._backend,
+            device=self._device_ordinals[replica])
         for replica, output_buffer in enumerate(output_buffers))
 
   def ExecuteWithPythonValues(self, arguments=()):
     """Execute on one replica with Python values as arguments and output."""
 
     def put(arg):
-      return LocalBuffer.from_pyval(arg, backend=self._backend)
+      return LocalBuffer.from_pyval(
+          arg, device=self._device_ordinals[0], backend=self._backend)
 
     arguments = [put(arg) for arg in arguments]
     return self.Execute(arguments).to_py()
@@ -661,24 +852,33 @@ class LocalComputation(object):
   def ExecuteWithPythonValuesPerReplica(self, arguments):
     """Execute on many replicas with Python values as arguments and output."""
 
-    def put(arg, replica):
-      return LocalBuffer.from_pyval(arg, replica, backend=self._backend)
+    def put(arg, device):
+      return LocalBuffer.from_pyval(arg, device, backend=self._backend)
 
-    arguments = [[put(arg, replica)
-                  for arg in replica_args]
-                 for replica, replica_args in enumerate(arguments)]
+    # pylint: disable=g-complex-comprehension
+    arguments = [[
+        put(arg, self._device_ordinals[replica]) for arg in replica_args
+    ] for replica, replica_args in enumerate(arguments)]
     return [out.to_py() for out in self.ExecutePerReplica(arguments)]
 
   def __del__(self):
-    self._delete(self._c_computation)
+    # Python may have freed c_api first.
+    if c_api and self._c_executable:
+      self._backend.delete_executable(self._c_executable)
+
+
+def _make_replica_group_proto(replica_group):
+  replica_group_proto = xla_data_pb2.ReplicaGroup()
+  replica_group_proto.replica_ids.extend(replica_group)
+  return replica_group_proto
 
 
 class ComputationBuilder(object):
   """XLA computation builder.
 
   Enqueues XLA ops in sequence and in order to build a
-  LocalComputation, which in turn can be compiled into a
-  CompiledLocalComputation, which in turn can be locally executed.
+  Computation, which in turn can be compiled into a
+  LocalExecutable, which in turn can be locally executed.
   """
 
   # The methods of this class map 1-to-1 onto the XLA C++
@@ -689,16 +889,23 @@ class ComputationBuilder(object):
   # pylint: disable=g-doc-args
 
   def __init__(self, name):
-    self._client = c_api.LocalComputationBuilder(name.encode('utf8'))
+    self._client = c_api.ComputationBuilder(name.encode('utf8'))
     self._parameter_numbering = itertools.count()
 
-  def Build(self, root=None, backend=XLA_LOCAL_BACKEND):
+  def Build(self, root=None, backend=None):
+    """Builds a `Computation` from the contents of the builder.
+
+    Args:
+      root: if not None, the operator containing the return value of the
+        computation.
+      backend: deprecated. Pass a `backend` to `Computation.Compile` instead.
+    Returns:
+      A `Computation`.
+    """
     if root is not None:
-      return LocalComputation(
-          self._client.BuildWithRoot(root), is_compiled=False, backend=backend)
+      return Computation(self._client.BuildWithRoot(root), backend=backend)
     else:
-      return LocalComputation(
-          self._client.Build(), is_compiled=False, backend=backend)
+      return Computation(self._client.Build(), backend=backend)
 
   def SetOpMetadata(self, op_metadata):
     """Set metadata for operations that are about to be enqueued."""
@@ -831,6 +1038,33 @@ class ComputationBuilder(object):
     return self.ParameterWithShape(
         Shape.from_pyval(value), name=name, parameter_num=parameter_num)
 
+  def Iota(self, dtype, size):
+    """Enqueues an iota constant onto the computation.
+
+    Args:
+      dtype: expected numpy dtype of the output.
+      size: integer, the number of elements in the array.
+
+    Returns:
+      A LocalOp representing the added iota constant.
+    """
+    element_type = DTYPE_TO_XLA_ELEMENT_TYPE[str(np.dtype(dtype))]
+    return self._client.Iota(element_type, size)
+
+  def BroadcastedIota(self, dtype, shape, dimension):
+    """Enqueues a broadcasted iota constant onto the computation.
+
+    Args:
+      dtype: expected numpy dtype of the output.
+      shape: tuple of integers, the expected output shape (dimensions).
+      dimension: positive integer, dimension along which to increment values.
+
+    Returns:
+      A LocalOp representing the added broadcasted iota constant.
+    """
+    xla_shape = Shape.array_shape(dtype, shape)
+    return self._client.BroadcastedIota(xla_shape, dimension)
+
   def Broadcast(self, operand, sizes):
     """Enqueues a broadcast operation onto the computation.
 
@@ -936,16 +1170,60 @@ class ComputationBuilder(object):
       dimensions = tuple(range(ndim))
     return self._client.Reshape(operand, dimensions, new_sizes)
 
-  def CrossReplicaSum(self, operand):
+  def AllToAll(self,
+               operand,
+               split_dimension,
+               concat_dimension,
+               replica_groups=None):
+    """AllToAll op.
+
+    Args:
+      operand: LocalOp representing the input array
+      split_dimension: the dimension along which the operand is split
+      concat_dimension: the dimension along which the split blocks are
+        concatenated
+      replica_groups: optional, list of lists of ints encoding a partition of
+        the set {0, 1, ..., num_replicas} into equally-sized replica groups
+        within which the all-to-all is performed. If not supplied or None (the
+        default), all replicas belong to the same group.
+
+    Returns:
+      A LocalOp that represents the all-to-all concatenation.
+    """
+    if replica_groups is None:
+      replica_groups_protos = []  # special value for XLA API
+    else:
+      replica_groups = list(replica_groups)
+      replica_groups_protos = [
+          _make_replica_group_proto(group) for group in replica_groups]
+    if not replica_groups:
+      split_count = get_replica_count()
+    else:
+      split_count = len(replica_groups[0])
+      if not all(split_count == len(g) for g in replica_groups):
+        raise ValueError('Replica groups must be equally sized')
+    return self._client.AllToAll(operand, split_dimension, concat_dimension,
+                                 split_count, replica_groups_protos)
+
+  def CrossReplicaSum(self, operand, replica_groups=None):
     """CrossReplicaSum op.
 
     Args:
       operand: the operand to sum across replica instances.
+      replica_groups: optional, list of lists of ints encoding a partition of
+        the set {0, 1, ..., num_replicas} into equally-sized replica groups
+        within which the cross-replica sum is performed. If not supplied or None
+        (the default), all replicas belong to the same group.
 
     Returns:
-      A LocalOp that has the sum of the value among all replicas.
+      A LocalOp that represents on each replica the sum of its group's values.
     """
-    return self._client.CrossReplicaSum(operand)
+    if replica_groups is None:
+      replica_groups = []  # special value for XLA API
+    else:
+      replica_groups = [
+          _make_replica_group_proto(group) for group in replica_groups]
+    return self._client.CrossReplicaSum(operand, replica_groups)
 
   def Collapse(self, operand, dimensions):
     """Collapse op."""
@@ -1102,6 +1380,31 @@ class ComputationBuilder(object):
     """
     return self._client.Call(computation_to_apply.computation, operands)
 
+  def CustomCall(self,
+                 call_target_name,
+                 operands,
+                 shape_with_layout,
+                 operand_shapes_with_layout,
+                 opaque=None):
+    """Enqueues a custom call operation onto the computation.
+
+    Args:
+      call_target_name: the name of the function to call.
+      operands: an iterable of LocalOp. The number and types of operands must
+        match the arity of `operand_shapes_with_layout`.
+      shape_with_layout: the shape of the operator's output, with layout.
+      operand_shapes_with_layout: the shapes of `operands`, including the
+        expected layouts.
+      opaque: an opaque string passed to the backend.
+
+    Returns:
+      A LocalOp representing the added custom call op.
+    """
+    opaque = opaque or b''
+    return self._client.CustomCall(call_target_name, operands,
+                                   shape_with_layout,
+                                   operand_shapes_with_layout, opaque)
+
   def Map(self, operands, computation_to_apply, dimensions):
     """Enqueues a map operation onto the computation.
 
@@ -1254,7 +1557,7 @@ class ComputationBuilder(object):
 
     Args:
       operand: a LocalOp to test.
-    Returns: a LocalComputation that is rooted on the given `operand` which is a
+    Returns: a Computation that is rooted on the given `operand` which is a
       compile-time constant.
     """
     return self._client.BuildConstantSubGraph(operand)
@@ -1411,13 +1714,51 @@ class ComputationBuilder(object):
     """Enqueues a key-value sort operation onto the computation."""
     return self._client.SortKeyVal(keys, values, dimension)
 
+  def Cholesky(self, a):
+    """Enqueues a Cholesky decomposition onto the computation."""
+    return self._client.Cholesky(a)
+
+  def QR(self, a, full_matrices=True):
+    """Enqueues a QR decomposition onto the computation."""
+    return self._client.QR(a, full_matrices)
+
+  def TriangularSolve(self,
+                      a,
+                      b,
+                      left_side=False,
+                      lower=False,
+                      transpose_a=False,
+                      conjugate_a=False,
+                      unit_diagonal=False):
+    """Enqueues a triangular-solve operation onto the computation."""
+    if not transpose_a:
+      transpose = 1
+      if conjugate_a:
+        a = self.Conj(a)
+    else:
+      transpose = 3 if conjugate_a else 2
+    return self._client.TriangularSolve(a, b, left_side, lower, unit_diagonal,
+                                        transpose)
+
+  def Gather(self, a, start_indices, dimension_numbers, slice_sizes):
+    """Enqueues a Gather operation onto the computation."""
+    return self._client.Gather(a, start_indices, dimension_numbers,
+                               slice_sizes)
+
+  def Scatter(self, a, scatter_indices, updates, update_computation,
+              dimension_numbers):
+    """Enqueues a Scatter operation onto the computation."""
+    return self._client.Scatter(
+        a, scatter_indices, updates, update_computation.computation,
+        dimension_numbers,)
+
 
 def _forward_methods_to_local_builder():
   """Forward remaining ComputationBuilder methods to the C API.
 
   Set up methods, corresponding to unary and binary XLA operations,
   whose calls are forwarded in a boilerplate manner to the underlying
-  LocalComputationBuilder C-extension API.
+  ComputationBuilder C-extension API.
   """
 
   def forward_to_local_builder_with_handles(target_method, is_binop=False):
@@ -1437,13 +1778,13 @@ def _forward_methods_to_local_builder():
 
   for method_name in _UNARY_OPS:
     forward = forward_to_local_builder_with_handles(
-        getattr(c_api.LocalComputationBuilder, method_name))
+        getattr(c_api.ComputationBuilder, method_name))
     forward.__name__ = method_name
     setattr(ComputationBuilder, method_name, forward)
 
   for method_name in _BINARY_OPS:
     forward = forward_to_local_builder_with_handles(
-        getattr(c_api.LocalComputationBuilder, method_name), is_binop=True)
+        getattr(c_api.ComputationBuilder, method_name), is_binop=True)
     forward.__name__ = method_name
     setattr(ComputationBuilder, method_name, forward)
 
@@ -1451,8 +1792,14 @@ def _forward_methods_to_local_builder():
 _forward_methods_to_local_builder()
 
 
+_default_replica_count = 1
+
+
 def initialize_replica_count(replica_count):
-  """Initializes the desired replica count to use on XLA service init.
+  """Initializes the default replica count to use.
+
+  Deprecated; pass `num_replicas` as an option to `Computation.Compile()`
+  instead.
 
   Args:
     replica_count: number of replicas that are desired for set up during XLA
@@ -1461,29 +1808,40 @@ def initialize_replica_count(replica_count):
   Raises:
     A runtime exception if the XLA service has already been initialized.
   """
-  c_api.InitializeReplicaCount(replica_count)
+  global _default_replica_count
+  _default_replica_count = replica_count
+
+
+def get_replica_count():
+  """Returns the default replica count.
+
+  Deprecated; pass `num_replicas` as an option to `Computation.Compile()`
+  instead.
+  """
+  return _default_replica_count
 
 
 def initialize_platform_name(platform_name):
-  """Initializes the desired platform name to use on XLA service init.
+  """Initializes the default platform name to use for XLA.
 
   Args:
     platform_name: string name of platform.
-
-  Raises:
-    A runtime exception if the XLA service has already been initialized.
   """
-  platform_name = _maybe_encode_string(platform_name)
-  c_api.InitializePlatformName(platform_name)
+  global _default_platform_name
+  _default_platform_name = platform_name
 
+  # Make sure the platform is valid by trying to instantiate it.
+  _get_default_local_backend()
 
-def get_replica_count():
-  """Returns the current replica count used for the XLA service.
 
-  Note: this will return a value whether the XLA service has been initialized
-  yet or not.
+def register_cpu_custom_call_target(name, fn):
+  """Registers a CPU custom call target.
+
+  Args:
+    name: bytes containing the name of the function.
+    fn: a PyCapsule object containing the function pointer.
   """
-  return c_api.GetReplicaCount()
+  c_api.RegisterCpuCustomCallTarget(name, fn)
 
 
 def GetPaddingConfigFromTriples(triples):
diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py
index 21b5c93b615ec429a5da0b4ffe89e8f75f59ef1b..51ef7d7f3a17f341e955f48615b05a886813430b 100644
--- a/tensorflow/compiler/xla/python/xla_client_test.py
+++ b/tensorflow/compiler/xla/python/xla_client_test.py
@@ -18,16 +18,18 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
 import itertools
 import threading
 
 import numpy as np
 
+from tensorflow.compiler.xla.python import custom_call_for_test
 from tensorflow.compiler.xla.python import xla_client
 import unittest
 
 
-class LocalComputationTest(unittest.TestCase):
+class ComputationTest(unittest.TestCase):
   """Base class for running an XLA Computation through the local client."""
 
   def _NewComputation(self, name=None):
@@ -51,9 +53,11 @@ class LocalComputationTest(unittest.TestCase):
   def _ExecuteAndCompareExact(self, c, arguments=(), expected=None):
     self._ExecuteAndAssertWith(np.testing.assert_equal, c, arguments, expected)
 
-  def _ExecuteAndCompareClose(self, c, arguments=(), expected=None):
-    self._ExecuteAndAssertWith(np.testing.assert_allclose, c, arguments,
-                               expected)
+  def _ExecuteAndCompareClose(self, c, arguments=(), expected=None, rtol=1e-7,
+                              atol=0):
+    self._ExecuteAndAssertWith(
+        functools.partial(np.testing.assert_allclose, rtol=rtol, atol=atol),
+        c, arguments, expected)
 
 
 def NumpyArrayF32(*args, **kwargs):
@@ -81,9 +85,35 @@ def NumpyArrayBool(*args, **kwargs):
   return np.array(*args, dtype=np.bool, **kwargs)
 
 
-class ComputationsWithConstantsTest(LocalComputationTest):
+class ComputationPrinting(unittest.TestCase):
+
+  def ExampleComputation(self):
+    builder = xla_client.ComputationBuilder("acomputation")
+    p0 = builder.ParameterFromNumpy(np.float32(0))
+    p1 = builder.ParameterFromNumpy(np.zeros((4,), np.float32))
+    builder.Mul(p0, p1)
+    return builder.Build()
+
+  def testComputationToHloText(self):
+    computation = self.ExampleComputation()
+    hlo_text = computation.GetHloText()
+    self.assertTrue(hlo_text.startswith("HloModule acomputation"))
+
+  def testComputationToHloGraph(self):
+    computation = self.ExampleComputation()
+    hlo_dot_graph = computation.GetHloDotGraph()
+    self.assertTrue(hlo_dot_graph.startswith("digraph "))
+
+
+class ComputationsWithConstantsTest(ComputationTest):
   """Tests focusing on Constant ops."""
 
+  def testConstantScalarSumS8(self):
+    c = self._NewComputation()
+    root = c.Add(c.Constant(np.int8(1)), c.Constant(np.int8(2)))
+    self.assertEqual(c.GetShape(root), c.GetReturnValueShape())
+    self._ExecuteAndCompareExact(c, expected=np.int8(3))
+
   def testConstantScalarSumF32(self):
     c = self._NewComputation()
     root = c.Add(c.ConstantF32Scalar(1.11), c.ConstantF32Scalar(3.14))
@@ -143,6 +173,17 @@ class ComputationsWithConstantsTest(LocalComputationTest):
     c.Pow(c.Constant(NumpyArrayF64([1.5, 2.5, 3.0])), c.ConstantF64Scalar(2.))
     self._ExecuteAndCompareClose(c, expected=[2.25, 6.25, 9.])
 
+  def testIota(self):
+    c = self._NewComputation()
+    c.Iota(np.float32, 10)
+    self._ExecuteAndCompareExact(c, expected=np.arange(10, dtype=np.float32))
+
+  def testBroadcastedIota(self):
+    c = self._NewComputation()
+    c.BroadcastedIota(np.int64, (2, 3), 1)
+    expected = np.array([[0, 1, 2], [0, 1, 2]], dtype=np.int64)
+    self._ExecuteAndCompareExact(c, expected=expected)
+
   def testBooleanAnd(self):
     c = self._NewComputation()
     c.And(
@@ -268,8 +309,22 @@ class ComputationsWithConstantsTest(LocalComputationTest):
         c.Constant(NumpyArrayF64([100, -100, 200, -200])))
     self._ExecuteAndCompareClose(c, expected=[104.4, -93.4, 208.8, -189])
 
+  def testCustomCall(self):
+    c = self._NewComputation()
+    for name, fn in custom_call_for_test.cpu_custom_call_targets.items():
+      xla_client.register_cpu_custom_call_target(name, fn)
+    c.CustomCall(
+        b"test_subtract_f32",
+        operands=(c.ConstantF32Scalar(1.25), c.ConstantF32Scalar(0.5)),
+        shape_with_layout=xla_client.Shape.array_shape(np.float32, (), ()),
+        operand_shapes_with_layout=(
+            xla_client.Shape.array_shape(np.float32, (), ()),
+            xla_client.Shape.array_shape(np.float32, (), ()),
+        ))
+    self._ExecuteAndCompareClose(c, expected=0.75)
+
 
-class ParametersTest(LocalComputationTest):
+class ParametersTest(ComputationTest):
   """Tests focusing on Parameter ops and argument-passing."""
 
   def setUp(self):
@@ -349,7 +404,7 @@ class ParametersTest(LocalComputationTest):
         expected=[-4.3, 1.3, -6.3, 3.3])
 
 
-class LocalBufferTest(LocalComputationTest):
+class LocalBufferTest(ComputationTest):
   """Tests focusing on execution with LocalBuffers."""
 
   def _Execute(self, c, arguments):
@@ -447,7 +502,7 @@ class LocalBufferTest(LocalComputationTest):
     self.assertEqual(np.dtype(xla_shape.element_type()), np.dtype(np.float32))
 
 
-class SingleOpTest(LocalComputationTest):
+class SingleOpTest(ComputationTest):
   """Tests for single ops.
 
   The goal here is smoke testing - to exercise the most basic functionality of
@@ -524,6 +579,18 @@ class SingleOpTest(LocalComputationTest):
       for src_dtype, dst_dtype in itertools.product(xla_types, xla_types):
         _ConvertAndTest(x, src_dtype, dst_dtype, xla_types[dst_dtype])
 
+  # TODO(b/123523486): re-enable when shape check is resolved
+  def DISABLED_testAllToAllOneReplica(self):
+    samples = [
+        NumpyArrayF32([97.0]),
+        NumpyArrayF32([64.0, 117.0]),
+        NumpyArrayF32([[2.0, 3.0], [4.0, 5.0]]),
+    ]
+    for lhs in samples[:1]:
+      c = self._NewComputation()
+      c.AllToAll(c.Constant(lhs), 0, 0)
+      self._ExecuteAndCompareExact(c, expected=lhs)
+
   def testCrossReplicaSumOneReplica(self):
     samples = [
         NumpyArrayF32(42.0),
@@ -536,6 +603,18 @@ class SingleOpTest(LocalComputationTest):
       c.CrossReplicaSum(c.Constant(lhs))
       self._ExecuteAndCompareExact(c, expected=lhs)
 
+  def testCrossReplicaSumOneReplicaWithSingletonGroup(self):
+    samples = [
+        NumpyArrayF32(42.0),
+        NumpyArrayF32([97.0]),
+        NumpyArrayF32([64.0, 117.0]),
+        NumpyArrayF32([[2.0, 3.0], [4.0, 5.0]]),
+    ]
+    for lhs in samples:
+      c = self._NewComputation()
+      c.CrossReplicaSum(c.Constant(lhs), [[0]])
+      self._ExecuteAndCompareExact(c, expected=lhs)
+
   def testDotMatrixVectorF32(self):
     c = self._NewComputation()
     lhs = NumpyArrayF32([[2.0, 3.0], [4.0, 5.0]])
@@ -698,6 +777,12 @@ class SingleOpTest(LocalComputationTest):
     c.Not(c.Constant(arr))
     self._ExecuteAndCompareClose(c, expected=~arr)
 
+  def testCountLeadingZeros(self):
+    c = self._NewComputation()
+    arr = NumpyArrayS32([0x7FFF, 0x12345678])
+    c.Clz(c.Constant(arr))
+    self._ExecuteAndCompareClose(c, expected=[17, 3])
+
   def testExp(self):
     c = self._NewComputation()
     arr = NumpyArrayF32([3.3, 12.1])
@@ -1057,6 +1142,38 @@ class SingleOpTest(LocalComputationTest):
     self.assertTrue(np.all(lo <= result))
     self.assertTrue(np.all(result < hi))
 
+  def testCholesky(self):
+    l = np.array([[4, 0, 0, 0], [6, 5, 0, 0], [2, 14, 16, 0], [3, 6, 1, 4]],
+                 dtype=np.float32)
+    c = self._NewComputation()
+    c.Cholesky(c.Constant(np.dot(l, l.T)))
+    self._ExecuteAndCompareClose(c, expected=l, rtol=1e-4)
+
+  def testQR(self):
+    a = np.array(
+        [[4, 6, 8, 10], [6, 45, 54, 63], [8, 54, 146, 166], [10, 63, 166, 310]],
+        dtype=np.float32)
+    c = self._NewComputation()
+    c.QR(c.Constant(a), full_matrices=True)
+    q, r = self._Execute(c, ())
+    np.testing.assert_allclose(np.dot(q, r), a, rtol=1e-4)
+
+  def testTriangularSolve(self):
+    a_vals = np.array(
+        [[2, 0, 0, 0], [3, 6, 0, 0], [4, 7, 9, 0], [5, 8, 10, 11]],
+        dtype=np.float32)
+    b_vals = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]],
+                      dtype=np.float32)
+
+    c = self._NewComputation()
+    c.TriangularSolve(c.Constant(a_vals), c.Constant(b_vals), left_side=False,
+                      lower=True, transpose_a=True)
+    self._ExecuteAndCompareClose(c, expected=np.array([
+        [0.5, 0.08333334, 0.04629629, 0.03367003],
+        [2.5, -0.25, -0.1388889, -0.1010101],
+        [4.5, -0.58333331, -0.32407406, -0.23569024],
+    ], dtype=np.float32), rtol=1e-4)
+
   def testIsConstant(self):
     c = self._NewComputation()
     a = c.ConstantS32Scalar(3)
@@ -1068,8 +1185,23 @@ class SingleOpTest(LocalComputationTest):
     self.assertFalse(c.IsConstant(non_const_expr))
     # self.assertTrue(c.IsConstant(c.Sub(c.Add(x, a), x)))  # TODO(b/77245564)
 
+  def testGather(self):
+    a = np.arange(9).astype(np.int32).reshape((3, 3))
+    indices = np.array([[[0, 2], [2, 1]], [[1, 2], [2, 0]]], dtype=np.int32)
+    dnums = xla_client.xla_data_pb2.GatherDimensionNumbers()
+    dnums.offset_dims.append(1)
+    dnums.offset_dims.append(2)
+    dnums.start_index_map.append(0)
+    dnums.start_index_map.append(1)
+    dnums.index_vector_dim = 2
+    c = self._NewComputation()
+    c.Gather(c.Constant(a), c.Constant(indices), dnums, slice_sizes=[1, 1])
+    g = self._Execute(c, ())
+    expected = np.array([[[[2, 7]]], [[[5, 6]]]], dtype=np.int32)
+    np.testing.assert_allclose(g, expected, rtol=1e-4)
 
-class EmbeddedComputationsTest(LocalComputationTest):
+
+class EmbeddedComputationsTest(ComputationTest):
   """Tests for XLA graphs with embedded computations (such as maps)."""
 
   def _CreateConstantS32Computation(self):
@@ -1125,6 +1257,14 @@ class EmbeddedComputationsTest(LocalComputationTest):
     c.Mul(c.ParameterFromNumpy(NumpyArrayF64(0)), c.ConstantF64Scalar(2.0))
     return c.Build()
 
+  def _CreateBinaryAddS32Computation(self):
+    """Computation (s32, s32) -> s32 that adds its two parameters."""
+    c = self._NewComputation("add_param0_by_param1")
+    c.Add(
+        c.ParameterFromNumpy(NumpyArrayS32(0)),
+        c.ParameterFromNumpy(NumpyArrayS32(0)))
+    return c.Build()
+
   def _CreateBinaryAddF32Computation(self):
     """Computation (f32, f32) -> f32 that adds its two parameters."""
     c = self._NewComputation("add_param0_by_param1")
@@ -1507,8 +1647,25 @@ class EmbeddedComputationsTest(LocalComputationTest):
       execution.join()
       self.assertEqual(want, got)
 
+  def testScatter(self):
+    a = np.arange(9).astype(np.int32).reshape((3, 3))
+    scatter_indices = np.array([0, 2], dtype=np.int32)
+    updates = np.array([[10, 20, 30], [70, 80, 90]], dtype=np.int32)
+
+    dnums = xla_client.xla_data_pb2.ScatterDimensionNumbers()
+    dnums.update_window_dims.append(1)
+    dnums.inserted_window_dims.append(0)
+    dnums.scatter_dims_to_operand_dims.append(0)
+    dnums.index_vector_dim = 1
+
+    c = self._NewComputation()
+    c.Scatter(c.Constant(a), c.Constant(scatter_indices), c.Constant(updates),
+              self._CreateBinaryAddS32Computation(), dnums)
+    expected = np.array([[10, 21, 32], [3, 4, 5], [76, 87, 98]], dtype=np.int32)
+    self._ExecuteAndCompareClose(c, expected=expected)
+
 
-class ErrorTest(LocalComputationTest):
+class ErrorTest(ComputationTest):
 
   def setUp(self):
     self.f32_scalar_2 = NumpyArrayF32(2.0)
@@ -1525,7 +1682,7 @@ class ErrorTest(LocalComputationTest):
         lambda: c.Build().CompileWithExampleArguments([self.f32_scalar_2]))
 
 
-class ComputationRootTest(LocalComputationTest):
+class ComputationRootTest(ComputationTest):
   """Tests related to setting the root of the computation."""
 
   def testComputationRootDifferentFromLastOp(self):
diff --git a/tensorflow/compiler/xla/python/xla_data.i b/tensorflow/compiler/xla/python/xla_data.i
new file mode 100644
index 0000000000000000000000000000000000000000..974f314af24f61c0015a8d51c16dff1bfc84c7cc
--- /dev/null
+++ b/tensorflow/compiler/xla/python/xla_data.i
@@ -0,0 +1,654 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// SWIG typemaps and declarations for building, compiling, and
+// executing XLA computations, wrapping most of what is declared in
+// xla_data.h.
+//
+// The typemaps below implement/assert the following correspondences
+// (with elaborations below):
+//
+//    C++                                  Python
+// -------------------------------------+---------------------------------------
+//  Span<int64>                        <-  sequence of int
+//  vector<int>                        ->  sequence of int
+//  Span<LocalOp>                      <-  sequence of LocalOp
+//  Literal                            <-> (nested tuple of) numpy ndarray
+//  std::vector<Literal>               <-  sequence of (nested tuple of) ndarray
+//  Shape                               -> pair holding (dtype, dimensions)
+//                                     <-  object duck-typed as xla_client.Shape
+//  ProgramShape                       ->  pair of ([arg_shapes], ret_shape)
+//  std::vector<Shape>                 <-  sequence of xla_client.Shape objects
+//  PrimitiveType                      <-  int
+//  Span<pair<int64, in64>>            <-  sequence of int pairs
+//  PaddingConfig proto                <-  corresponding Python proto
+//  ConvolutionDimensionNumbers proto  <-  corresponding Python proto
+//  DotDimensionNumbers proto          <-  corresponding Python proto
+//  GatherDimensionNumbers proto       <-  corresponding Python proto
+//  ScatterDimensionNumbers proto      <-  corresponding Python proto
+//  Span<ReplicaGroup proto>           <-  sequence of ReplicaGroup Python proto
+//
+// Arrows indicate whether a conversion only ever occurs in one
+// direction, or whether it is maintained bidirectionally.
+//
+// The Python objects corresponding to C++ Literals have the type:
+//
+//   T = ndarray | (T, ...)
+//
+// where a terminal numpy ndarray translates to a Literal with a
+// non-tuple Shape, an XLA primitive element type corresponding to the
+// ndarray's dtype. Meanwhile, a non-terminal "tuple of T" translates
+// to a tuple-shaped Literal whose tuple components are translated
+// recursively. For example, if x is a numpy ndarray in Python, with
+// shape (2, 3) and dtype of dtype('float32'), then x translates to a
+// Literal with rank 2, dimension 2 and 3, and XLA primitive type
+// F32. Meanwhile,
+//
+//   (x, (x, x), (x,)),
+//
+// translates to a tuple-shaped XLA Literal, whose component subshapes
+// are a 2x3 F32-shaped literal followed by two tuple-shaped literals.
+//
+// Shapes output by C++ become Python objects with the type:
+//
+//   T            = (dtype, S)
+//   S            = DIMENSIONS | TUPLE_SHAPES
+//   DIMENSIONS   = (int, ...)
+//   TUPLE_SHAPES = (T, ...)
+//
+// In the pair described by the T rule, the terminal dtype determines
+// whether S expands as DIMENSIONS or TUPLE_SHAPES. Namely if it is
+// dtype('O'), numpy's object dtype, the structure represents a tuple
+// shape and the expansion of the non-terminal S is
+// TUPLE_SHAPES. Otherwise, dtype describes a primitive element type
+// and S expands into DIMENSIONS giving dimension sizes. For example:
+//
+//   (dtype('float32'), (3, 5, 7))
+//
+// describes a 3x5x7 array of F32s, and
+//
+//   (dtype('O'), ((dtype('float32'), (2, 3)),
+//                 (dtype('float64'), (4, 5))))
+//
+// describes a tuple shape with two subshapes: the first a 2x3 F32,
+// and the other a 4x5 F64.
+//
+// The Python int corresponding to a PrimitiveType enum must be valid
+// per xla_data.proto (e.g. xla_data.PRED, xla_data.F32).
+//
+// The SWIG object wrappers generated by this file are not intended
+// for end use, but rather for internal use in the Python XLA client,
+// xla_client.py.
+//
+// One central reason for the Python-side indirection is that the
+// Python-side objects produced by the typemaps in this file are
+// further packaged up by xla_client before being passed on. For
+// instance, the Python pair produced for a C++ Shape is further
+// wrapped in a Python class (xla_client.Shape) so as not to expose
+// the raw pair externally.
+//
+// Other SWIG object wrappers (e.g. of Computation) are further
+// wrapped by xla_client in order to set up a custom destructor that
+// triggers memory deallocation on the C++ side.
+
+%module(threads="1") xla_data
+
+// Keep the GIL except where explicitly specified.
+%nothread;
+
+%include "tensorflow/python/platform/base.i"
+
+%{
+// Must be included first
+#include "tensorflow/python/lib/core/numpy.h"
+
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "absl/types/span.h"
+#include "tensorflow/compiler/xla/python/numpy_bridge.h"
+
+using namespace xla;
+using namespace xla::swig;
+
+%}
+
+// Basic types
+
+
+%typemap(out) std::vector<int> {
+  PyObject* out = PyList_New($1.size());
+  for (int i = 0; i < $1.size(); ++i) {
+    PyList_SET_ITEM(out, i, PyInt_FromLong($1[i]));
+  }
+  $result = out;
+}
+
+%typemap(out) StatusOr<bool> {
+  if ($1.ok()) {
+    $result = PyBool_FromLong($1.ConsumeValueOrDie());
+  } else {
+    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
+    SWIG_fail;
+  }
+}
+
+%typemap(out) StatusOr<string> {
+  if ($1.ok()) {
+    $result = PyString_FromString($1.ConsumeValueOrDie().c_str());
+  } else {
+    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
+    SWIG_fail;
+  }
+}
+
+%typemap(out) Status {
+  if (!$1.ok()) {
+    PyErr_SetString(
+        PyExc_RuntimeError, $1.ToString().c_str());
+    SWIG_fail;
+  }
+  Py_INCREF(Py_None);
+  $result = Py_None;
+}
+
+%typemap(in) absl::Span<const int64>
+    (std::vector<int64> temps) {
+  if (!PySequence_Check($input)) {
+    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
+    SWIG_fail;
+  }
+  const int size = PySequence_Size($input);
+  temps.resize(size);
+  for (int i = 0; i < size; ++i) {
+    PyObject* o = PySequence_GetItem($input, i);
+    PyObject* py_int = numpy::PyNumberToPyInt(o);
+    if (!py_int) {
+      PyErr_SetString(
+          PyExc_TypeError,
+          "Argument sequence element cannot be converted to int");
+      Py_DECREF(o);
+      SWIG_fail;
+    }
+    temps[i] = numpy::PyIntOrPyLongToLong(py_int);
+    if (temps[i] == -1 && PyErr_Occurred()) {
+      Py_DECREF(py_int);
+      Py_DECREF(o);
+      SWIG_fail;
+    }
+    Py_DECREF(py_int);
+    Py_DECREF(o);
+  }
+  $1 = temps;
+}
+
+// Literal
+
+%typemap(in) const Literal& (StatusOr<Literal> literal_status) {
+  literal_status = numpy::XlaLiteralFromPyObject($input);
+  if (!literal_status.ok()) {
+    PyErr_SetString(PyExc_RuntimeError, literal_status.status().ToString().c_str());
+    SWIG_fail;
+  }
+  $1 = &literal_status.ValueOrDie();
+}
+
+%typemap(out) Literal (StatusOr<numpy::Safe_PyObjectPtr> obj_status) {
+  obj_status = numpy::PyObjectFromXlaLiteral(*$1);
+  if (!obj_status.ok()) {
+    PyErr_SetString(PyExc_RuntimeError, obj_status.status().ToString().c_str());
+    SWIG_fail;
+  }
+  $result = obj_status.ValueOrDie().release();
+}
+
+%typemap(out) StatusOr<Literal> (StatusOr<numpy::Safe_PyObjectPtr> obj_status) {
+  if (!$1.ok()) {
+    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
+    SWIG_fail;
+  }
+  obj_status = numpy::PyObjectFromXlaLiteral($1.ValueOrDie());
+  if (!obj_status.ok()) {
+    PyErr_SetString(PyExc_RuntimeError, obj_status.status().ToString().c_str());
+    SWIG_fail;
+  }
+  $result = obj_status.ValueOrDie().release();
+}
+
+%typemap(in) const std::vector<Literal>& (std::vector<Literal> temps) {
+  if (!PySequence_Check($input)) {
+    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
+    SWIG_fail;
+  }
+  const int size = PySequence_Size($input);
+  for (int i = 0; i < size; ++i) {
+    PyObject* o = PySequence_GetItem($input, i);
+    StatusOr<Literal> literal_status = numpy::XlaLiteralFromPyObject(o);
+    if (!literal_status.ok()) {
+      PyErr_SetString(PyExc_RuntimeError, literal_status.status().ToString().c_str());
+      Py_DECREF(o);
+      SWIG_fail;
+    }
+    temps.push_back(literal_status.ConsumeValueOrDie());
+    Py_DECREF(o);
+  }
+  $1 = &temps;
+}
+
+// OpMetadata
+
+%typemap(in) const OpMetadata& (OpMetadata temp) {
+  StatusOr<OpMetadata> statusor = numpy::OpMetadataFromPyObject($input);
+  if (!statusor.ok()) {
+    PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
+    SWIG_fail;
+  }
+  temp = std::move(statusor).ValueOrDie();
+  $1 = &temp;
+}
+
+// Shape
+
+%typemap(out) const Shape& {
+  $result = numpy::PyShapeInfoFromXlaShape(*$1).release();
+}
+
+%typemap(out) StatusOr<Shape> {
+  if ($1.ok()) {
+    $result = numpy::PyShapeInfoFromXlaShape($1.ConsumeValueOrDie()).release();
+  } else {
+    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
+    SWIG_fail;
+  }
+}
+
+
+%typemap(out) StatusOr<ProgramShape> {
+  if ($1.ok()) {
+    $result = numpy::PyProgramShapeInfoFromXlaProgramShape(
+        $1.ConsumeValueOrDie()).release();
+  } else {
+    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
+    SWIG_fail;
+  }
+}
+
+
+%typemap(in) const Shape& (Shape temp) {
+  StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape($input);
+  if (!statusor.ok()) {
+    PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
+    SWIG_fail;
+  }
+  temp = std::move(statusor).ValueOrDie();
+  $1 = &temp;
+}
+
+%typemap(in) const absl::optional<Shape>& (
+    absl::optional<Shape> temp) {
+  if ($input == Py_None) {
+    temp = absl::nullopt;
+    $1 = &temp;
+  } else {
+    StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape($input);
+    if (!statusor.ok()) {
+      PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
+      SWIG_fail;
+    }
+    temp = std::move(statusor).ValueOrDie();
+    $1 = &temp;
+  }
+}
+
+%typemap(out) std::unique_ptr<Shape> {
+  $result = numpy::PyShapeInfoFromXlaShape(*$1).release();
+}
+
+%typemap(in) const std::vector<Shape>& (std::vector<Shape> temps) {
+  if (!PySequence_Check($input)) {
+    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
+    SWIG_fail;
+  }
+  const int size = PySequence_Size($input);
+  for (int i = 0; i < size; ++i) {
+    PyObject* o = PySequence_GetItem($input, i);
+    StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape(o);
+    Py_DECREF(o);
+    if (!statusor.ok()) {
+      PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
+      SWIG_fail;
+    }
+    temps.push_back(statusor.ConsumeValueOrDie());
+  }
+  $1 = &temps;
+}
+
+%typemap(in) const std::vector<absl::optional<Shape> >& (
+    std::vector<absl::optional<Shape> > temps) {
+  if (!PySequence_Check($input)) {
+    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
+    SWIG_fail;
+  }
+  const int size = PySequence_Size($input);
+  for (int i = 0; i < size; ++i) {
+    PyObject* o = PySequence_GetItem($input, i);
+    if (o == Py_None) {
+      temps.push_back(absl::nullopt);
+    } else {
+      StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape(o);
+      Py_DECREF(o);
+      if (!statusor.ok()) {
+        PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
+        SWIG_fail;
+      }
+      temps.push_back(statusor.ConsumeValueOrDie());
+    }
+  }
+  $1 = &temps;
+}
+
+// PrimitiveType
+
+%typemap(in) PrimitiveType {
+  PyObject* py_int = numpy::PyNumberToPyInt($input);
+  if (!py_int) {
+    PyErr_SetString(PyExc_TypeError, "Argument cannot be converted to int");
+    SWIG_fail;
+  }
+  const long value = numpy::PyIntOrPyLongToLong(py_int);
+  if (value == -1 && PyErr_Occurred()) {
+    Py_DECREF(py_int);
+    SWIG_fail;
+  }
+  if (!PrimitiveType_IsValid(value)) {
+    PyErr_SetString(
+        PyExc_TypeError, "Argument not valid for PrimitiveType enum");
+    Py_DECREF(py_int);
+    SWIG_fail;
+  }
+  $1 = static_cast<PrimitiveType>(value);
+}
+
+// Span<pair<int64, in64>>
+
+%typemap(in) absl::Span<const std::pair<int64, int64> >
+    (std::vector<std::pair<int64, int64> > temps) {
+  if (!PySequence_Check($input)) {
+    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
+    SWIG_fail;
+  }
+  const int size = PySequence_Size($input);
+  temps.reserve(size);
+  for (int i = 0; i < size; ++i) {
+    PyObject* o = PySequence_GetItem($input, i);
+    if (!o) {
+      SWIG_fail;
+    }
+    PyObject* first = PyTuple_GetItem(o, 0);
+    if (!first) {
+      Py_DECREF(o);
+      SWIG_fail;
+    }
+    PyObject* first_pyint = numpy::PyNumberToPyInt(first);
+    if (!first_pyint) {
+      PyErr_SetString(
+          PyExc_TypeError,
+          "First pair item cannot be converted to int");
+      Py_DECREF(o);
+      SWIG_fail;
+    }
+    PyObject* second = PyTuple_GetItem(o, 1);
+    if (!second) {
+      Py_DECREF(o);
+      Py_DECREF(first_pyint);
+      SWIG_fail;
+    }
+    PyObject* second_pyint = numpy::PyNumberToPyInt(second);
+    if (!second_pyint) {
+      PyErr_SetString(
+          PyExc_TypeError,
+          "Second pair item cannot be converted to int");
+      Py_DECREF(o);
+      Py_DECREF(first_pyint);
+      SWIG_fail;
+    }
+    const int64 first_value = numpy::PyIntOrPyLongToLong(first_pyint);
+    if (first_value == -1 && PyErr_Occurred()) {
+      Py_DECREF(o);
+      Py_DECREF(first_pyint);
+      Py_DECREF(second_pyint);
+      SWIG_fail;
+    }
+    const int64 second_value = numpy::PyIntOrPyLongToLong(second_pyint);
+    if (second_value == -1 && PyErr_Occurred()) {
+      Py_DECREF(o);
+      Py_DECREF(first_pyint);
+      Py_DECREF(second_pyint);
+      SWIG_fail;
+    }
+    temps.push_back(std::make_pair(first_value, second_value));
+    Py_DECREF(o);
+  }
+  $1 = temps;
+}
+
+// DotDimensionNumbers
+
+%typemap(in) const DotDimensionNumbers&
+    (DotDimensionNumbers dimension_numbers) {
+  if (!HandleRepeatedInt64Attribute(
+        $input, "lhs_contracting_dimensions",
+        dimension_numbers.mutable_lhs_contracting_dimensions())) {
+    SWIG_fail;
+  }
+  if (!HandleRepeatedInt64Attribute(
+        $input, "rhs_contracting_dimensions",
+        dimension_numbers.mutable_rhs_contracting_dimensions())) {
+    SWIG_fail;
+  }
+  if (!HandleRepeatedInt64Attribute(
+        $input, "lhs_batch_dimensions",
+        dimension_numbers.mutable_lhs_batch_dimensions())) {
+    SWIG_fail;
+  }
+  if (!HandleRepeatedInt64Attribute(
+        $input, "rhs_batch_dimensions",
+        dimension_numbers.mutable_rhs_batch_dimensions())) {
+    SWIG_fail;
+  }
+
+  $1 = &dimension_numbers;
+}
+
+// PaddingConfig
+
+%typemap(in) const PaddingConfig&
+    (PaddingConfig padding_config) {
+  PyObject* dimensions = PyObject_GetAttrString($input, "dimensions");
+  if (!dimensions) {
+    SWIG_fail;
+  }
+
+  int length = PySequence_Size(dimensions);
+  if (length == -1) {
+    Py_DECREF(dimensions);
+    SWIG_fail;
+  }
+
+  for (int i = 0; i < length; ++i) {
+    PyObject* item = PySequence_GetItem(dimensions, i);
+    if (!item) {
+      Py_DECREF(dimensions);
+      SWIG_fail;
+    }
+    int64 edge_padding_low, edge_padding_high, interior_padding;
+    if (!GetIntAttr(item, "edge_padding_low", &edge_padding_low)
+        || !GetIntAttr(item, "edge_padding_high", &edge_padding_high)
+        || !GetIntAttr(item, "interior_padding", &interior_padding)) {
+      Py_DECREF(item);
+      Py_DECREF(dimensions);
+      SWIG_fail;
+    }
+    Py_DECREF(item);
+
+    PaddingConfig::PaddingConfigDimension* dimension =
+        padding_config.add_dimensions();
+    dimension->set_edge_padding_low(edge_padding_low);
+    dimension->set_edge_padding_high(edge_padding_high);
+    dimension->set_interior_padding(interior_padding);
+  }
+  Py_DECREF(dimensions);
+
+  $1 = &padding_config;
+}
+
+// ConvolutionDimensionNumbers
+
+%typemap(in) const ConvolutionDimensionNumbers&
+    (ConvolutionDimensionNumbers dimension_numbers) {
+  int64 value;
+
+  if (!GetIntAttr($input, "input_batch_dimension", &value)) {
+    SWIG_fail;
+  }
+  dimension_numbers.set_input_batch_dimension(value);
+
+  if (!GetIntAttr($input, "input_feature_dimension", &value)) {
+    SWIG_fail;
+  }
+  dimension_numbers.set_input_feature_dimension(value);
+
+  if (!GetIntAttr($input, "output_batch_dimension", &value)) {
+    SWIG_fail;
+  }
+  dimension_numbers.set_output_batch_dimension(value);
+
+  if (!GetIntAttr($input, "output_feature_dimension", &value)) {
+    SWIG_fail;
+  }
+  dimension_numbers.set_output_feature_dimension(value);
+
+  if (!GetIntAttr($input, "kernel_output_feature_dimension", &value)) {
+    SWIG_fail;
+  }
+  dimension_numbers.set_kernel_output_feature_dimension(value);
+
+  if (!GetIntAttr($input, "kernel_input_feature_dimension", &value)) {
+    SWIG_fail;
+  }
+  dimension_numbers.set_kernel_input_feature_dimension(value);
+
+  if (!HandleRepeatedInt64Attribute(
+        $input, "input_spatial_dimensions",
+        dimension_numbers.mutable_input_spatial_dimensions())) {
+    SWIG_fail;
+  }
+  if (!HandleRepeatedInt64Attribute(
+        $input, "kernel_spatial_dimensions",
+        dimension_numbers.mutable_kernel_spatial_dimensions())) {
+    SWIG_fail;
+  }
+  if (!HandleRepeatedInt64Attribute(
+        $input, "output_spatial_dimensions",
+        dimension_numbers.mutable_output_spatial_dimensions())) {
+    SWIG_fail;
+  }
+
+  $1 = &dimension_numbers;
+}
+
+// GatherDimensionNumbers
+
+%typemap(in) const GatherDimensionNumbers&
+    (GatherDimensionNumbers dimension_numbers) {
+  if (!HandleRepeatedInt64Attribute(
+        $input, "offset_dims",
+        dimension_numbers.mutable_offset_dims())) {
+    SWIG_fail;
+  }
+  if (!HandleRepeatedInt64Attribute(
+        $input, "collapsed_slice_dims",
+        dimension_numbers.mutable_collapsed_slice_dims())) {
+    SWIG_fail;
+  }
+  if (!HandleRepeatedInt64Attribute(
+        $input, "start_index_map",
+        dimension_numbers.mutable_start_index_map())) {
+    SWIG_fail;
+  }
+
+  int64 value;
+  if (!GetIntAttr($input, "index_vector_dim", &value)) {
+    SWIG_fail;
+  }
+  dimension_numbers.set_index_vector_dim(value);
+
+  $1 = &dimension_numbers;
+}
+
+// ScatterDimensionNumbers
+
+%typemap(in) const ScatterDimensionNumbers&
+    (ScatterDimensionNumbers dimension_numbers) {
+  if (!HandleRepeatedInt64Attribute(
+        $input, "update_window_dims",
+        dimension_numbers.mutable_update_window_dims())) {
+    SWIG_fail;
+  }
+  if (!HandleRepeatedInt64Attribute(
+        $input, "inserted_window_dims",
+        dimension_numbers.mutable_inserted_window_dims())) {
+    SWIG_fail;
+  }
+  if (!HandleRepeatedInt64Attribute(
+        $input, "scatter_dims_to_operand_dims",
+        dimension_numbers.mutable_scatter_dims_to_operand_dims())) {
+    SWIG_fail;
+  }
+
+  int64 value;
+  if (!GetIntAttr($input, "index_vector_dim", &value)) {
+    SWIG_fail;
+  }
+  dimension_numbers.set_index_vector_dim(value);
+
+  $1 = &dimension_numbers;
+}
+
+// Span<const ReplicaGroup>
+
+%typemap(in) absl::Span<const ReplicaGroup >
+    (std::vector<ReplicaGroup > temps) {
+  if (!PySequence_Check($input)) {
+    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
+    SWIG_fail;
+  }
+  const int size = PySequence_Size($input);
+  temps.reserve(size);
+  for (int i = 0; i < size; ++i) {
+    PyObject* o = PySequence_GetItem($input, i);
+    ReplicaGroup rgrp;
+    if (!HandleRepeatedInt64Attribute(
+            o, "replica_ids",
+            rgrp.mutable_replica_ids())) {
+        SWIG_fail;
+    }
+    temps.push_back(rgrp);
+    Py_DECREF(o);
+  }
+  $1 = temps;
+}
diff --git a/tensorflow/compiler/xla/python/xrt.cc b/tensorflow/compiler/xla/python/xrt.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2c55abc17f87c369e3d5b2140a84014e07921a9a
--- /dev/null
+++ b/tensorflow/compiler/xla/python/xrt.cc
@@ -0,0 +1,297 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/python/xrt.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "absl/memory/memory.h"
+#include "tensorflow/cc/client/client_session.h"
+#include "tensorflow/cc/framework/ops.h"
+#include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo.pb.h"
+#include "tensorflow/compiler/xla/service/platform_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/compiler/xrt/cc/ops/xrt_compile_ops.h"
+#include "tensorflow/compiler/xrt/cc/ops/xrt_execute_op.h"
+#include "tensorflow/compiler/xrt/cc/ops/xrt_state_ops.h"
+#include "tensorflow/compiler/xrt/xrt.pb.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace swig {
+
+XrtAllocation::XrtAllocation(int64 handle, Shape shape,
+                             const string& session_target)
+    : handle_(handle), shape_(shape), session_target_(session_target) {}
+
+XrtAllocation::~XrtAllocation() {
+  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
+  auto allocation_handle =
+      tensorflow::ops::Placeholder(root, tensorflow::DT_INT64);
+  auto release =
+      tensorflow::ops::XRTReleaseAllocationHandle(root, allocation_handle);
+  if (!root.status().ok()) {
+    LOG(ERROR) << root.status();
+    return;
+  }
+
+  tensorflow::ClientSession session(root, session_target_);
+  tensorflow::ClientSession::FeedType inputs;
+  inputs.insert({allocation_handle, handle()});
+  std::vector<tensorflow::Tensor> outputs;
+  auto status = session.Run(inputs, {}, {release}, &outputs);
+  if (!status.ok()) {
+    LOG(ERROR) << status;
+    return;
+  }
+}
+
+/* static */
+StatusOr<XrtAllocation*> XrtAllocation::FromLiteral(
+    const Literal& argument, const string& session_target) {
+  xrt::XLAAllocation alloc;
+  *alloc.mutable_value() = argument.ToProto();
+
+  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
+  auto literal_string =
+      tensorflow::ops::Placeholder(root, tensorflow::DT_STRING);
+  auto literal_handle = tensorflow::ops::XRTAllocate(root, literal_string);
+  TF_RETURN_IF_ERROR(root.status());
+
+  tensorflow::ClientSession session(root, session_target);
+  tensorflow::ClientSession::FeedType inputs;
+  inputs.insert({literal_string, alloc.SerializeAsString()});
+  std::vector<tensorflow::Tensor> outputs;
+  TF_RETURN_IF_ERROR(session.Run(inputs, {literal_handle}, &outputs));
+
+  int64 handle = outputs[0].scalar<int64>()();
+  return new XrtAllocation(handle, argument.shape(), session_target);
+}
+
+const int64 XrtAllocation::handle() const { return handle_; }
+
+const Shape& XrtAllocation::shape() const { return shape_; }
+
+StatusOr<Literal> XrtAllocation::ToLiteral() const {
+  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
+  auto allocation_handle =
+      tensorflow::ops::Placeholder(root, tensorflow::DT_INT64);
+  auto read_literal = tensorflow::ops::XRTReadLiteral(root, allocation_handle);
+  TF_RETURN_IF_ERROR(root.status());
+
+  tensorflow::ClientSession session(root, session_target_);
+  tensorflow::ClientSession::FeedType inputs;
+  inputs.insert({allocation_handle, handle()});
+  std::vector<tensorflow::Tensor> outputs;
+  TF_RETURN_IF_ERROR(session.Run(inputs, {read_literal}, &outputs));
+
+  xla::LiteralProto response;
+  TF_RET_CHECK(response.ParseFromString(outputs[0].scalar<string>()()));
+  return Literal::CreateFromProto(response);
+}
+
+XrtAllocationTuple::XrtAllocationTuple(std::vector<XrtAllocation*> elements)
+    : elements_(std::move(elements)) {
+  for (auto* element : elements_) {
+    CHECK(element != nullptr);
+  }
+}
+
+XrtAllocationTuple::~XrtAllocationTuple() {
+  for (XrtAllocation* element : elements_) {
+    if (element != nullptr) {
+      delete element;
+    }
+  }
+}
+
+StatusOr<XrtAllocation*> XrtAllocationTuple::Release(int i) {
+  XrtAllocation* element = elements_[i];
+  if (element == nullptr) {
+    return InvalidArgument("Attempted to release already-released element %d.",
+                           i);
+  }
+  elements_[i] = nullptr;
+  return element;
+}
+
+int64 XrtAllocationTuple::size() const { return elements_.size(); }
+
+StatusOr<XrtExecutable*> XrtExecutable::CompileForXrt(
+    const string& hlo_module_proto, const std::vector<Shape>& argument_shapes,
+    const Shape& result_shape, const string& session_target) {
+  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
+  auto program = tensorflow::ops::Placeholder(root, tensorflow::DT_STRING);
+  auto compile = tensorflow::ops::XRTCompile(root, program);
+  TF_RETURN_IF_ERROR(root.status());
+
+  xrt::XLAComputation c;
+  auto config = c.mutable_config();
+  ProgramShape program_shape;
+  for (auto& shape : argument_shapes) {
+    *program_shape.add_parameters() = shape;
+  }
+  *program_shape.mutable_result() = result_shape;
+
+  LayoutUtil::SetToDefaultLayout(&program_shape);
+  *config->mutable_program_shape() = program_shape.ToProto();
+  c.mutable_hlo_snapshot()
+      ->mutable_hlo()
+      ->mutable_hlo_module()
+      ->ParsePartialFromString(hlo_module_proto);
+
+  tensorflow::ClientSession session(root, session_target);
+  tensorflow::ClientSession::FeedType inputs;
+  inputs.insert({program, c.SerializeAsString()});
+  std::vector<tensorflow::Tensor> outputs;
+  TF_RETURN_IF_ERROR(session.Run(inputs, {compile.handle}, &outputs));
+
+  int64 handle = outputs[0].scalar<int64>()();
+  return new XrtExecutable(program_shape, handle, session_target);
+}
+
+XrtExecutable::XrtExecutable(const ProgramShape& program_shape, int64 handle,
+                             const string& session_target)
+    : program_shape_(program_shape),
+      handle_(handle),
+      session_target_(session_target) {}
+
+XrtExecutable::~XrtExecutable() {
+  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
+  auto computation_handle =
+      tensorflow::ops::Placeholder(root, tensorflow::DT_INT64);
+  auto release =
+      tensorflow::ops::XRTReleaseCompilationHandle(root, computation_handle);
+  if (!root.status().ok()) {
+    LOG(ERROR) << root.status();
+    return;
+  }
+
+  tensorflow::ClientSession session(root, session_target_);
+  tensorflow::ClientSession::FeedType inputs;
+  inputs.insert({computation_handle, handle()});
+  std::vector<tensorflow::Tensor> outputs;
+  auto status = session.Run(inputs, {}, {release}, &outputs);
+  if (!status.ok()) {
+    LOG(ERROR) << status;
+    return;
+  }
+}
+
+StatusOr<XrtAllocation*> XrtExecutable::Execute(
+    absl::Span<XrtAllocation* const> argument_handles) {
+  const int num_expected_arguments = program_shape().parameters().size();
+
+  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
+  std::vector<tensorflow::Output> arguments;
+  arguments.reserve(num_expected_arguments);
+  for (int i = 0; i < num_expected_arguments; ++i) {
+    arguments.push_back(
+        tensorflow::ops::Placeholder(root, tensorflow::DT_INT64));
+  }
+  auto computation_handle =
+      tensorflow::ops::Placeholder(root, tensorflow::DT_INT64);
+  auto execution_config =
+      tensorflow::ops::Placeholder(root, tensorflow::DT_STRING);
+  auto execute = tensorflow::ops::XRTExecute(root, computation_handle,
+                                             execution_config, arguments);
+  TF_RETURN_IF_ERROR(root.status());
+
+  TF_RET_CHECK(argument_handles.size() == arguments.size());
+
+  xrt::XRTExecutionConfig e;
+  e.set_release_input_handles(false);
+  e.set_release_compilation_handle(false);
+
+  tensorflow::ClientSession session(root, session_target_);
+  tensorflow::ClientSession::FeedType inputs;
+  for (int i = 0; i < arguments.size(); ++i) {
+    inputs.insert({arguments[i], argument_handles[i]->handle()});
+  }
+  inputs.insert({computation_handle, handle()});
+  inputs.insert({execution_config, e.SerializeAsString()});
+  std::vector<tensorflow::Tensor> outputs;
+  TF_RETURN_IF_ERROR(session.Run(inputs, {execute}, &outputs));
+
+  int64 output = outputs[0].scalar<int64>()();
+  return new XrtAllocation(output, program_shape().result(), session_target_);
+}
+
+const ProgramShape& XrtExecutable::program_shape() const {
+  return program_shape_;
+}
+
+int64 XrtExecutable::handle() const { return handle_; }
+
+void DeleteXrtAllocation(XrtAllocation* allocation) { delete allocation; }
+
+void DeleteXrtExecutable(XrtExecutable* computation) { delete computation; }
+
+StatusOr<XrtAllocationTuple*> DestructureXrtAllocationTuple(
+    XrtAllocation* allocation, const string& session_target) {
+  const Shape& tuple_shape = allocation->shape();
+
+  if (!tuple_shape.IsTuple()) {
+    return InvalidArgument(
+        "Attemped to destructure a LocalShapedBuffer that did not have a tuple "
+        "shape; shape: %s",
+        ShapeUtil::HumanString(tuple_shape));
+  }
+
+  tensorflow::Scope root = tensorflow::Scope::NewRootScope();
+  auto base_handle = tensorflow::ops::Placeholder(root, tensorflow::DT_INT64);
+  auto shape_index = tensorflow::ops::Placeholder(root, tensorflow::DT_INT32);
+  auto subtuple = tensorflow::ops::XRTSubTuple(root, base_handle, shape_index);
+  TF_RETURN_IF_ERROR(root.status());
+
+  tensorflow::ClientSession session(root, session_target);
+  tensorflow::ClientSession::FeedType inputs;
+  std::vector<XrtAllocation*> results;
+  for (int32 i = 0; i < ShapeUtil::TupleElementCount(tuple_shape); ++i) {
+    inputs.clear();
+    inputs.insert({base_handle, allocation->handle()});
+    inputs.insert({shape_index, {i}});
+    std::vector<tensorflow::Tensor> outputs;
+    auto status = session.Run(inputs, {subtuple}, &outputs);
+    if (!status.ok()) {
+      // Clean up before returning non-ok status.
+      for (int j = 0; j < results.size(); ++j) {
+        delete results[j];
+      }
+      return status;
+    }
+    const int64 subtuple_handle = outputs[0].scalar<int64>()();
+    const Shape& subtuple_shape =
+        ShapeUtil::GetTupleElementShape(tuple_shape, i);
+    results.push_back(
+        new XrtAllocation(subtuple_handle, subtuple_shape, session_target));
+  }
+  return new XrtAllocationTuple(std::move(results));
+}
+
+}  // namespace swig
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/python/xrt.h b/tensorflow/compiler/xla/python/xrt.h
new file mode 100644
index 0000000000000000000000000000000000000000..dd5bba6d5c9641dadc323f70745e870c14543321
--- /dev/null
+++ b/tensorflow/compiler/xla/python/xrt.h
@@ -0,0 +1,118 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_XRT_H_
+#define TENSORFLOW_COMPILER_XLA_PYTHON_XRT_H_
+
+#include <string>
+#include <vector>
+
+#include "absl/types/span.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/shape.h"
+
+namespace xla {
+namespace swig {
+
+// Represents a reference to literals that live in a device-allocated buffer via
+// XRT. Specifically, wraps an int64 handle produced by running the allocation
+// graph, and an XLA shape to track the referent's shape.
+class XrtAllocation {
+ public:
+  // Accepts a `session_target` argument, used in constructing the
+  // `tensorflow::ClientSession` instance in which allocation and deallocation
+  // graphs are run.
+  static StatusOr<XrtAllocation*> FromLiteral(const Literal& argument,
+                                              const string& session_target);
+
+  XrtAllocation(int64 handle, Shape shape, const string& session_target);
+  ~XrtAllocation();
+  StatusOr<Literal> ToLiteral() const;
+  const Shape& shape() const;
+  const int64 handle() const;
+
+ private:
+  const int64 handle_;
+  const Shape shape_;
+  const string session_target_;
+};
+
+// Result of a tuple destructuring operation on an XrtAllocation.
+class XrtAllocationTuple {
+ public:
+  // Note: any XrtAllocation elements that are not Release()'d will be
+  // deallocated in the destructor.
+  explicit XrtAllocationTuple(std::vector<XrtAllocation*> elements);
+
+  ~XrtAllocationTuple();
+
+  // Releases the ith element to the caller. Further attempts to release the ith
+  // element will return an invalid argument error.
+  StatusOr<XrtAllocation*> Release(int i);
+
+  // Returns the number of elements in the destructured tuple.
+  int64 size() const;
+
+ private:
+  std::vector<XrtAllocation*> elements_;
+};
+
+// Destructures a tuple-valued XrtAllocation into its constitutent elements
+// in XrtAllocationTuple form.
+//
+// Accepts a `session_target` argument, used in constructing the
+// `tensorflow::ClientSession` instance in which the sub-tupling graph is run,
+// and passed along in constructing each constituent XrtAllocation.
+StatusOr<XrtAllocationTuple*> DestructureXrtAllocationTuple(
+    XrtAllocation* allocation, const string& session_target);
+
+// Represents a compiled computation that can be executed given handles to
+// device-allocated literals. Specifically, wraps an XRT computation handle.
+class XrtExecutable {
+ public:
+  // Accepts a `session_target` argument, used in constructing the
+  // `tensorflow::ClientSession` instance in which the compilation graph is run.
+  static StatusOr<XrtExecutable*> CompileForXrt(
+      const string& hlo_module_proto, const std::vector<Shape>& argument_shapes,
+      const Shape& result_shape, const string& session_target);
+
+  // Accepts a `session_target` argument, used in constructing the
+  // `tensorflow::ClientSession` instance in which the execution graph is run.
+  XrtExecutable(const ProgramShape& program_shape, int64 handle,
+                const string& session_target);
+  ~XrtExecutable();
+
+  std::vector<int> DeviceOrdinals() const { return {0}; }
+
+  StatusOr<XrtAllocation*> Execute(
+      absl::Span<XrtAllocation* const> argument_handles);
+
+  const ProgramShape& program_shape() const;
+  int64 handle() const;
+
+ private:
+  const ProgramShape program_shape_;
+  const int64 handle_;
+  const string session_target_;
+};
+
+// Functions for freeing resources from the Python side.
+void DeleteXrtAllocation(XrtAllocation* allocation);
+void DeleteXrtExecutable(XrtExecutable* computation);
+
+}  // namespace swig
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_PYTHON_XRT_H_
diff --git a/tensorflow/compiler/xla/python/xrt.i b/tensorflow/compiler/xla/python/xrt.i
new file mode 100644
index 0000000000000000000000000000000000000000..456dd7be86e479b46815fc16b51a10431fe2060d
--- /dev/null
+++ b/tensorflow/compiler/xla/python/xrt.i
@@ -0,0 +1,124 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Wrappers for XRT ops.
+
+%module(threads="1") xrt
+
+// Keep the GIL except where explicitly specified.
+%nothread;
+
+%include "tensorflow/python/platform/base.i"
+%include "tensorflow/compiler/xla/python/xla_data.i"
+
+%{
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "absl/types/span.h"
+#include "tensorflow/compiler/xla/python/xrt.h"
+
+using namespace xla;
+using namespace xla::swig;
+
+%}
+
+// Computation and buffer/allocation types
+
+%typemap(out) StatusOr<xla::swig::XrtExecutable*> {
+  if ($1.ok()) {
+    auto* value = $1.ValueOrDie();
+    {
+      auto* $1 = value;
+      $typemap(out, xla::swig::XrtExecutable*)
+    }
+  } else {
+    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
+    SWIG_fail;
+  }
+}
+
+%typemap(out) StatusOr<xla::swig::XrtAllocation*> {
+  if ($1.ok()) {
+    auto* value = $1.ValueOrDie();
+    {
+      auto* $1 = value;
+      $typemap(out, xla::swig::XrtAllocation*)
+    }
+  } else {
+    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
+    SWIG_fail;
+  }
+}
+
+%typemap(out) StatusOr<xla::swig::XrtAllocationTuple*> {
+  if ($1.ok()) {
+    auto* value = $1.ValueOrDie();
+    {
+      auto* $1 = value;
+      $typemap(out, xla::swig::XrtAllocationTuple*)
+    }
+  } else {
+    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
+    SWIG_fail;
+  }
+}
+
+
+%typemap(in) absl::Span<xla::swig::XrtAllocation* const>
+    (std::vector<XrtAllocation*> temps) {
+  if (!PySequence_Check($input)) {
+    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
+    SWIG_fail;
+  }
+  const int size = PySequence_Size($input);
+  temps.reserve(size);
+  for (int i = 0; i < size; ++i) {
+    PyObject* o = PySequence_GetItem($input, i);
+    XrtAllocation* xrta;
+    if ((SWIG_ConvertPtr(o, (void**) &xrta, $descriptor(xla::swig::XrtAllocation*),
+                         SWIG_POINTER_EXCEPTION)) == -1) {
+      SWIG_fail;
+    }
+    temps.push_back(xrta);
+    Py_DECREF(o);
+  }
+  $1 = temps;
+}
+
+
+%ignoreall
+%unignore xla;
+%unignore xla::swig;
+%unignore xla::swig::XrtAllocation;
+%unignore xla::swig::XrtAllocation::FromLiteral;
+%unignore xla::swig::XrtAllocation::ToLiteral;
+%unignore xla::swig::XrtAllocation::shape;
+%unignore xla::swig::XrtAllocationTuple;
+%unignore xla::swig::XrtAllocationTuple::Release;
+%unignore xla::swig::XrtAllocationTuple::size;
+%unignore xla::swig::XrtExecutable;
+%unignore xla::swig::XrtExecutable::CompileForXrt;
+%unignore xla::swig::XrtExecutable::DeviceOrdinals;
+%unignore xla::swig::XrtExecutable::Execute;
+%unignore xla::swig::DestructureXrtAllocationTuple;
+%unignore xla::swig::DeleteXrtAllocation;
+%unignore xla::swig::DeleteXrtExecutable;
+
+%thread;
+%include "tensorflow/compiler/xla/python/xrt.h"
+%nothread;
+
+%unignoreall
diff --git a/tensorflow/compiler/xla/python_api/xla_literal.py b/tensorflow/compiler/xla/python_api/xla_literal.py
index 757e41a78ad2b57d2ef6e1f3055160be22c7b3ed..19bd685ab2260485d2a86f0a682d0cdd36712fdb 100644
--- a/tensorflow/compiler/xla/python_api/xla_literal.py
+++ b/tensorflow/compiler/xla/python_api/xla_literal.py
@@ -69,7 +69,7 @@ def _ConvertNumpyArrayToLiteral(ndarray):
 
   if ndarray.ndim == 0:
     getattr(literal, type_record.literal_field_name).append(
-        _np.asscalar(ndarray.astype(type_record.literal_field_type)))
+        ndarray.astype(type_record.literal_field_type).item())
   else:
     # Ndarrays with boolean dtypes need special type conversion with protobufs
     if ndarray.dtype in {_np.bool_, _np.dtype('bool')}:
diff --git a/tensorflow/compiler/xla/python_api/xla_shape.py b/tensorflow/compiler/xla/python_api/xla_shape.py
index 95b2bf300ec67e9f034f77450416544cb088ae55..bdcd4abd6cc708795416b15412f37dde10d7fe97 100644
--- a/tensorflow/compiler/xla/python_api/xla_shape.py
+++ b/tensorflow/compiler/xla/python_api/xla_shape.py
@@ -20,6 +20,8 @@ from __future__ import print_function
 
 import numpy as _np  # Avoids becoming a part of public Tensorflow API.
 
+from six.moves import xrange
+
 from tensorflow.compiler.xla import xla_data_pb2
 from tensorflow.compiler.xla.python_api import types
 
diff --git a/tensorflow/compiler/xla/reference_util.cc b/tensorflow/compiler/xla/reference_util.cc
index ceb5e74db7c3b9305e9d77068df9ae0a3690af8a..08b78ee244844f41d551d7e249cec0cbf157d639 100644
--- a/tensorflow/compiler/xla/reference_util.cc
+++ b/tensorflow/compiler/xla/reference_util.cc
@@ -18,10 +18,10 @@ limitations under the License.
 #include <array>
 #include <utility>
 
+#include "absl/container/flat_hash_set.h"
 #include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
-#include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h"
 #include "tensorflow/compiler/xla/service/hlo_evaluator.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/shape_inference.h"
@@ -32,48 +32,19 @@ limitations under the License.
 
 namespace xla {
 
-namespace {
-
-template <typename T>
-std::unique_ptr<Array2D<T>> MatmulArray2DImpl(
-    const Array2D<T>& lhs, const Array2D<T>& rhs,
-    const std::function<void(
-        const void* run_options_ptr, T* out, T* lhs, T* rhs, int64 m, int64 n,
-        int64 k, int32 transpose_lhs, int32 transpose_rhs)>& impl_fn) {
-  CHECK_EQ(lhs.width(), rhs.height());
-  int m = lhs.height();
-  int n = rhs.width();
-  int k = lhs.width();
-  auto result = absl::make_unique<Array2D<T>>(m, n);
-  // Because Eigen is a header-oriented library, make sure that the Eigen code
-  // is the same as the code used by the CPU backend (otherwise the linker will
-  // randomly pick *some* definition).
-  impl_fn(
-      /*run_options_ptr=*/nullptr, result->data(), rhs.data(), lhs.data(), n, m,
-      k,
-      /*transpose_lhs=*/0,
-      /*transpose_rhs=*/0);
-  return result;
-}
-
-}  // namespace
-
 /* static */ std::unique_ptr<Array2D<Eigen::half>> ReferenceUtil::MatmulArray2D(
     const Array2D<Eigen::half>& lhs, const Array2D<Eigen::half>& rhs) {
-  return MatmulArray2DImpl<Eigen::half>(
-      lhs, rhs, __xla_cpu_runtime_EigenSingleThreadedMatMulF16);
+  return HloEvaluator::MatmulArray2D(lhs, rhs);
 }
 
 /* static */ std::unique_ptr<Array2D<float>> ReferenceUtil::MatmulArray2D(
     const Array2D<float>& lhs, const Array2D<float>& rhs) {
-  return MatmulArray2DImpl<float>(
-      lhs, rhs, __xla_cpu_runtime_EigenSingleThreadedMatMulF32);
+  return HloEvaluator::MatmulArray2D(lhs, rhs);
 }
 
 /* static */ std::unique_ptr<Array2D<double>> ReferenceUtil::MatmulArray2D(
     const Array2D<double>& lhs, const Array2D<double>& rhs) {
-  return MatmulArray2DImpl<double>(
-      lhs, rhs, __xla_cpu_runtime_EigenSingleThreadedMatMulF64);
+  return HloEvaluator::MatmulArray2D(lhs, rhs);
 }
 
 /* static */ std::unique_ptr<Array2D<double>> ReferenceUtil::Array2DF32ToF64(
@@ -557,10 +528,11 @@ ReferenceUtil::ConvArray4DGeneralDimensionsDilated(
   dim2.set_base_dilation(lhs_dilation.second);
   *window.add_dimensions() = dim2;
 
-  const Shape& shape = ShapeInference::InferConvolveShape(
-                           lhs_literal.shape(), rhs_literal.shape(),
-                           /*feature_group_count=*/1, window, dnums)
-                           .ConsumeValueOrDie();
+  const Shape& shape =
+      ShapeInference::InferConvolveShape(
+          lhs_literal.shape(), rhs_literal.shape(),
+          /*feature_group_count=*/1, /*batch_group_count=*/1, window, dnums)
+          .ConsumeValueOrDie();
 
   HloInstruction* lhs_instruction =
       b.AddInstruction(HloInstruction::CreateConstant(std::move(lhs_literal)));
@@ -572,16 +544,16 @@ ReferenceUtil::ConvArray4DGeneralDimensionsDilated(
       /*new_size=*/2, PrecisionConfig::DEFAULT);
   b.AddInstruction(HloInstruction::CreateConvolve(
       shape, lhs_instruction, rhs_instruction, /*feature_group_count=*/1,
-      window, dnums, precision_config));
+      /*batch_group_count=*/1, window, dnums, precision_config));
   HloModuleConfig config;
   HloModule module("ReferenceUtil", config);
   auto computation = module.AddEntryComputation(b.Build());
 
   HloEvaluator evaluator;
   Literal result_literal =
-      evaluator.Evaluate<const Literal*>(*computation, {}).ConsumeValueOrDie();
+      evaluator.Evaluate(*computation, {}).ConsumeValueOrDie();
 
-  CHECK_EQ(ShapeUtil::Rank(result_literal.shape()), 4);
+  CHECK_EQ(result_literal.shape().rank(), 4);
   auto result =
       absl::make_unique<Array4D<float>>(result_literal.shape().dimensions(0),
                                         result_literal.shape().dimensions(1),
@@ -634,24 +606,26 @@ ReferenceUtil::ReduceToRowArray2D(
     const std::function<float(float, float)>& reduce_function) {
   std::vector<float> result;
   CHECK_EQ(dims.size(), 3);
-  const std::set<int64> dim_set(dims.begin(), dims.end());
+  const absl::flat_hash_set<int64> dim_set(dims.begin(), dims.end());
   CHECK_EQ(dim_set.size(), 3);
-  for (int64 a0 = 0; a0 == 0 || (!dim_set.count(0) && a0 < array.n1()); ++a0) {
-    for (int64 a1 = 0; a1 == 0 || (!dim_set.count(1) && a1 < array.n2());
+  for (int64 a0 = 0; a0 == 0 || (!dim_set.contains(0) && a0 < array.n1());
+       ++a0) {
+    for (int64 a1 = 0; a1 == 0 || (!dim_set.contains(1) && a1 < array.n2());
          ++a1) {
-      for (int64 a2 = 0; a2 == 0 || (!dim_set.count(2) && a2 < array.n3());
+      for (int64 a2 = 0; a2 == 0 || (!dim_set.contains(2) && a2 < array.n3());
            ++a2) {
-        for (int64 a3 = 0; a3 == 0 || (!dim_set.count(3) && a3 < array.n4());
+        for (int64 a3 = 0; a3 == 0 || (!dim_set.contains(3) && a3 < array.n4());
              ++a3) {
           float accumulator = init;
-          for (int64 i0 = 0; i0 == 0 || (dim_set.count(0) && i0 < array.n1());
-               ++i0) {
-            for (int64 i1 = 0; i1 == 0 || (dim_set.count(1) && i1 < array.n2());
-                 ++i1) {
+          for (int64 i0 = 0;
+               i0 == 0 || (dim_set.contains(0) && i0 < array.n1()); ++i0) {
+            for (int64 i1 = 0;
+                 i1 == 0 || (dim_set.contains(1) && i1 < array.n2()); ++i1) {
               for (int64 i2 = 0;
-                   i2 == 0 || (dim_set.count(2) && i2 < array.n3()); ++i2) {
+                   i2 == 0 || (dim_set.contains(2) && i2 < array.n3()); ++i2) {
                 for (int64 i3 = 0;
-                     i3 == 0 || (dim_set.count(3) && i3 < array.n4()); ++i3) {
+                     i3 == 0 || (dim_set.contains(3) && i3 < array.n4());
+                     ++i3) {
                   // Handle zero-sized arrays.
                   if (array.n1() > 0 && array.n2() > 0 && array.n3() > 0 &&
                       array.n4() > 0) {
diff --git a/tensorflow/compiler/xla/rpc/grpc_service.cc b/tensorflow/compiler/xla/rpc/grpc_service.cc
index d8123a6de28ca532819ece4a75cd0b725f8c1bbd..22b4218fbd5e9bc59a0de22735eb51db46670f09 100644
--- a/tensorflow/compiler/xla/rpc/grpc_service.cc
+++ b/tensorflow/compiler/xla/rpc/grpc_service.cc
@@ -47,6 +47,14 @@ namespace xla {
   });
 }
 
+::grpc::Status GRPCService::GetDeviceHandles(::grpc::ServerContext* context,
+                                             const GetDeviceHandlesRequest* arg,
+                                             GetDeviceHandlesResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->GetDeviceHandles(arg, result);
+  });
+}
+
 ::grpc::Status GRPCService::Compile(::grpc::ServerContext* /*context*/,
                                     const CompileRequest* arg,
                                     CompileResponse* result) {
@@ -61,6 +69,14 @@ namespace xla {
       [this, arg, result]() { return service_->Execute(arg, result); });
 }
 
+::grpc::Status GRPCService::ExecuteGraphParallel(
+    ::grpc::ServerContext* /*context*/, const ExecuteGraphParallelRequest* arg,
+    ExecuteParallelResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->ExecuteGraphParallel(arg, result);
+  });
+}
+
 ::grpc::Status GRPCService::WaitForExecution(::grpc::ServerContext* context,
                                              const WaitForExecutionRequest* arg,
                                              WaitForExecutionResponse* result) {
diff --git a/tensorflow/compiler/xla/rpc/grpc_service.h b/tensorflow/compiler/xla/rpc/grpc_service.h
index 3e586b288a56a22573d0c3b9ae7b2f25fdbf851a..b546704f73e34941cbf7bc2fe08062aa438039f7 100644
--- a/tensorflow/compiler/xla/rpc/grpc_service.h
+++ b/tensorflow/compiler/xla/rpc/grpc_service.h
@@ -39,6 +39,10 @@ class GRPCService : public grpc::XlaService::Service {
                                   const DeconstructTupleRequest* arg,
                                   DeconstructTupleResponse* result) override;
 
+  ::grpc::Status GetDeviceHandles(::grpc::ServerContext* context,
+                                  const GetDeviceHandlesRequest* arg,
+                                  GetDeviceHandlesResponse* result) override;
+
   ::grpc::Status Compile(::grpc::ServerContext* context,
                          const CompileRequest* arg,
                          CompileResponse* result) override;
@@ -46,6 +50,9 @@ class GRPCService : public grpc::XlaService::Service {
   ::grpc::Status Execute(::grpc::ServerContext* context,
                          const ExecuteRequest* arg,
                          ExecuteResponse* result) override;
+  ::grpc::Status ExecuteGraphParallel(::grpc::ServerContext* context,
+                                      const ExecuteGraphParallelRequest* arg,
+                                      ExecuteParallelResponse* result) override;
 
   ::grpc::Status WaitForExecution(::grpc::ServerContext* context,
                                   const WaitForExecutionRequest* arg,
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 4c21ae2a427477caa86fb4130616c38eb3bcf006..8d8394cb43ee013b9396a54e3a4d037445fcc0e1 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -1,6 +1,14 @@
 # Description:
 #   XLA service implementation.
 
+load("//tensorflow/compiler/xla/tests:build_defs.bzl", "xla_test")
+load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library")
+load(
+    "//tensorflow/core:platform/default/build_config.bzl",
+    "tf_proto_library_py",
+)
+load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test")
+
 licenses(["notice"])  # Apache 2.0
 
 package(default_visibility = [":friends"])
@@ -12,15 +20,6 @@ package_group(
     ],
 )
 
-load("//tensorflow/compiler/xla/tests:build_defs.bzl", "xla_test")
-load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library")
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
-load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
-load(
-    "//tensorflow/core:platform/default/build_config.bzl",
-    "tf_proto_library_py",
-)
-
 xla_proto_library(
     name = "hlo_proto",
     srcs = ["hlo.proto"],
@@ -115,6 +114,7 @@ tf_cc_test(
         ":bfloat16_normalization",
         ":bfloat16_support",
         ":hlo",
+        ":hlo_creation_utils",
         ":hlo_verifier",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -224,23 +224,28 @@ cc_library(
         "hlo_evaluator_typed_visitor.h",
         "hlo_evaluator_typed_visitor_bfloat16.cc",
         "hlo_evaluator_typed_visitor_bool.cc",
+        "hlo_evaluator_typed_visitor_complex128.cc",
         "hlo_evaluator_typed_visitor_complex64.cc",
         "hlo_evaluator_typed_visitor_double.cc",
         "hlo_evaluator_typed_visitor_float.cc",
         "hlo_evaluator_typed_visitor_half.cc",
+        "hlo_evaluator_typed_visitor_int16.cc",
         "hlo_evaluator_typed_visitor_int32.cc",
         "hlo_evaluator_typed_visitor_int64.cc",
         "hlo_evaluator_typed_visitor_int8.cc",
+        "hlo_evaluator_typed_visitor_uint16.cc",
         "hlo_evaluator_typed_visitor_uint32.cc",
         "hlo_evaluator_typed_visitor_uint64.cc",
         "hlo_evaluator_typed_visitor_uint8.cc",
     ],
     hdrs = ["hlo_evaluator.h"],
     deps = [
+        ":dynamic_dimension_inference",
         ":hlo",
         ":hlo_casting_utils",
         ":hlo_query",
         ":shape_inference",
+        "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
@@ -249,12 +254,14 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:window_util",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_matmul",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/container:node_hash_map",
         "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/meta:type_traits",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
@@ -266,6 +273,7 @@ tf_cc_test(
     srcs = ["hlo_evaluator_test.cc"],
     deps = [
         ":hlo",
+        ":hlo_element_type_converter",
         ":hlo_evaluator",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:reference_util",
@@ -278,7 +286,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/compiler/xla/service:hlo_element_type_converter",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
@@ -514,6 +521,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:window_util",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "@com_google_absl//absl/container:flat_hash_map",
     ],
 )
 
@@ -672,10 +680,10 @@ cc_library(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/core:core_cpu_lib",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//third_party/eigen3",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
@@ -694,6 +702,7 @@ cc_library(
         ":compiler",
         ":computation_layout",
         ":device_memory_allocator",
+        ":dynamic_dimension_inference",
         ":executable",
         ":execution_tracker",
         ":hlo",
@@ -1001,6 +1010,7 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
@@ -1012,6 +1022,7 @@ cc_library(
     srcs = ["name_uniquer.cc"],
     hdrs = ["name_uniquer.h"],
     deps = [
+        "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_map",
@@ -1051,7 +1062,6 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
-        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -1089,7 +1099,6 @@ cc_library(
         ":buffer_value_containers",
         ":heap_simulator",
         ":hlo",
-        ":hlo_memory_scheduler",
         ":hlo_proto",
         ":logical_buffer",
         ":tuple_points_to_analysis",
@@ -1134,6 +1143,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
     ],
 )
@@ -1193,7 +1203,6 @@ cc_library(
         ":tuple_points_to_analysis",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
@@ -1228,7 +1237,6 @@ cc_library(
     deps = [
         ":hlo",
         ":hlo_proto",
-        "//tensorflow/compiler/xla:statusor",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
     ],
@@ -1412,6 +1420,7 @@ cc_library(
         "//tensorflow/core:lib",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
     ],
 )
@@ -1451,11 +1460,15 @@ cc_library(
     hdrs = ["hlo_creation_utils.h"],
     deps = [
         ":hlo",
+        ":hlo_module_config",
         ":shape_inference",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
+        "//tensorflow/compiler/xla/client/lib:comparators",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
@@ -1495,12 +1508,25 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
-        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
     ],
 )
 
+cc_library(
+    name = "op_expander_pass",
+    srcs = ["op_expander_pass.cc"],
+    hdrs = ["op_expander_pass.h"],
+    deps = [
+        ":hlo",
+        ":hlo_creation_utils",
+        ":hlo_pass",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:util",
+        "@com_google_absl//absl/algorithm:container",
+    ],
+)
+
 cc_library(
     name = "gather_expander",
     srcs = ["gather_expander.cc"],
@@ -1509,6 +1535,7 @@ cc_library(
         ":hlo",
         ":hlo_creation_utils",
         ":hlo_pass",
+        ":op_expander_pass",
         ":while_util",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:statusor",
@@ -1532,6 +1559,28 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "triangular_solve_expander",
+    srcs = ["triangular_solve_expander.cc"],
+    hdrs = ["triangular_solve_expander.h"],
+    deps = [
+        ":op_expander_pass",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
+        "//tensorflow/compiler/xla/client/lib:constants",
+        "//tensorflow/compiler/xla/client/lib:math",
+        "//tensorflow/compiler/xla/client/lib:matrix",
+        "//tensorflow/compiler/xla/client/lib:slicing",
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
+    ],
+)
+
 tf_cc_test(
     name = "batchnorm_expander_test",
     size = "small",
@@ -1576,6 +1625,9 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:optional",
@@ -1590,7 +1642,7 @@ tf_cc_test(
         ":algebraic_simplifier",
         ":hlo",
         ":hlo_casting_utils",
-        ":hlo_matchers",
+        ":hlo_creation_utils",
         ":hlo_parser",
         ":hlo_pass",
         ":pattern_matcher",
@@ -1695,9 +1747,9 @@ tf_cc_test(
 )
 
 cc_library(
-    name = "convolution_feature_group_converter",
-    srcs = ["convolution_feature_group_converter.cc"],
-    hdrs = ["convolution_feature_group_converter.h"],
+    name = "convolution_group_converter",
+    srcs = ["convolution_group_converter.cc"],
+    hdrs = ["convolution_group_converter.h"],
     deps = [
         ":hlo",
         ":hlo_pass",
@@ -1715,11 +1767,11 @@ cc_library(
 )
 
 tf_cc_test(
-    name = "convolution_feature_group_converter_test",
+    name = "convolution_group_converter_test",
     size = "small",
-    srcs = ["convolution_feature_group_converter_test.cc"],
+    srcs = ["convolution_group_converter_test.cc"],
     deps = [
-        ":convolution_feature_group_converter",
+        ":convolution_group_converter",
         ":hlo",
         ":hlo_matchers",
         ":hlo_parser",
@@ -1782,6 +1834,7 @@ tf_cc_test(
         ":hlo_cse",
         ":hlo_dce",
         ":hlo_matchers",
+        ":hlo_parser",
         ":hlo_pass",
         ":hlo_pass_pipeline",
         ":tuple_simplifier",
@@ -1860,8 +1913,9 @@ cc_library(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -1916,6 +1970,7 @@ cc_library(
     hdrs = ["dynamic_dimension_inference.h"],
     deps = [
         ":hlo",
+        ":while_util",
         "//tensorflow/compiler/xla:status",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
@@ -1925,6 +1980,46 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "dynamic_padder",
+    srcs = ["dynamic_padder.cc"],
+    hdrs = ["dynamic_padder.h"],
+    deps = [
+        ":dynamic_dimension_inference",
+        ":hlo_dce",
+        ":hlo_pass",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
+    ],
+)
+
+tf_cc_test(
+    name = "dynamic_padder_test",
+    srcs = ["dynamic_padder_test.cc"],
+    deps = [
+        ":dynamic_padder",
+        "//tensorflow/compiler/xla:debug_options_flags",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_matchers",
+        "//tensorflow/compiler/xla/service:hlo_runner",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/core:test",
+    ],
+)
+
 tf_cc_test(
     name = "dynamic_dimension_inference_test",
     srcs = ["dynamic_dimension_inference_test.cc"],
@@ -2011,7 +2106,6 @@ cc_library(
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
-        "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
@@ -2052,6 +2146,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
@@ -2108,8 +2203,12 @@ tf_cc_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla/service:hlo_matchers",
+        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
     ],
 )
 
@@ -2249,6 +2348,7 @@ tf_cc_test(
     srcs = ["hlo_dataflow_analysis_test.cc"],
     deps = [
         ":hlo",
+        ":hlo_creation_utils",
         ":hlo_dataflow_analysis",
         ":hlo_graph_dumper",
         ":hlo_matchers",
@@ -2282,6 +2382,7 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
     ],
@@ -2418,6 +2519,7 @@ tf_cc_test(
     srcs = ["tuple_points_to_analysis_test.cc"],
     deps = [
         ":hlo",
+        ":hlo_creation_utils",
         ":hlo_matchers",
         ":instruction_fusion",
         ":tuple_points_to_analysis",
@@ -2542,6 +2644,7 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_set",
     ],
 )
 
@@ -2586,6 +2689,7 @@ tf_cc_test(
     srcs = ["hlo_verifier_test.cc"],
     deps = [
         ":hlo",
+        ":hlo_module_config",
         ":hlo_parser",
         ":hlo_verifier",
         ":layout_assignment",
@@ -2593,6 +2697,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla:xla_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
@@ -2790,7 +2895,6 @@ cc_library(
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/container:inlined_vector",
@@ -2963,15 +3067,11 @@ cc_library(
     srcs = ["hlo_get_dimension_size_rewriter.cc"],
     hdrs = ["hlo_get_dimension_size_rewriter.h"],
     deps = [
+        ":dynamic_dimension_inference",
         ":hlo",
         ":hlo_pass",
         ":shape_inference",
-        "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:literal_util",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/core:lib",
         "@com_google_absl//absl/algorithm:container",
     ],
 )
@@ -3133,43 +3233,17 @@ tf_cc_test(
     ],
 )
 
-cc_library(
-    name = "hlo_tfgraph_builder",
-    srcs = ["hlo_tfgraph_builder.cc"],
-    hdrs = ["hlo_tfgraph_builder.h"],
-    deps = [
-        ":hlo",
-        "//tensorflow/compiler/xla:literal",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:xla_proto",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:protos_all_cc",
-        "@com_google_absl//absl/strings",
-    ],
-)
-
-tf_cc_test(
-    name = "hlo_tfgraph_builder_test",
-    srcs = ["hlo_tfgraph_builder_test.cc"],
-    deps = [
-        ":hlo_tfgraph_builder",
-        "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/core:protos_all_cc",
-    ],
-)
-
 cc_library(
     name = "hlo_graph_dumper",
     srcs = [
         "hlo_graph_dumper.cc",
+        "hlo_graph_html_renderer.cc",
     ],
     hdrs = ["hlo_graph_dumper.h"],
     deps = [
         ":hlo",
         ":hlo_casting_utils",
         ":hlo_execution_profile",
-        ":hlo_tfgraph_builder",
         ":pattern_matcher",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
@@ -3179,6 +3253,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:regexp_internal",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/types:optional",
@@ -3212,7 +3287,6 @@ cc_library(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
     ],
 )
@@ -3339,7 +3413,6 @@ cc_library(
         ":hlo_pass_pipeline",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/core:lib",
-        "@com_google_absl//absl/container:flat_hash_map",
     ],
 )
 
@@ -3396,10 +3469,70 @@ cc_library(
         ":hlo_profile_printer_data",
         ":human_readable_profile_builder",
         "//tensorflow/compiler/xla:types",
+        "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/strings",
     ],
 )
 
+cc_library(
+    name = "sort_simplifier",
+    srcs = ["sort_simplifier.cc"],
+    hdrs = ["sort_simplifier.h"],
+    deps = [
+        ":hlo",
+        ":hlo_pass",
+        "//tensorflow/compiler/xla:statusor",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
+    ],
+)
+
+tf_cc_test(
+    name = "sort_simplifier_test",
+    srcs = ["sort_simplifier_test.cc"],
+    deps = [
+        ":hlo_matchers",
+        ":hlo_parser",
+        ":pattern_matcher",
+        ":pattern_matcher_gmock",
+        ":sort_simplifier",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/core:test",
+    ],
+)
+
+cc_library(
+    name = "stable_sort_expander",
+    srcs = ["stable_sort_expander.cc"],
+    hdrs = ["stable_sort_expander.h"],
+    deps = [
+        ":hlo",
+        ":hlo_casting_utils",
+        ":hlo_pass",
+        ":op_expander_pass",
+        "//tensorflow/compiler/xla:statusor",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
+    ],
+)
+
+tf_cc_test(
+    name = "stable_sort_expander_test",
+    srcs = ["stable_sort_expander_test.cc"],
+    deps = [
+        ":algebraic_simplifier",
+        ":hlo_matchers",
+        ":hlo_parser",
+        ":pattern_matcher",
+        ":pattern_matcher_gmock",
+        ":stable_sort_expander",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/core:test",
+    ],
+)
+
 cc_library(
     name = "tuple_util",
     srcs = ["tuple_util.cc"],
@@ -3496,9 +3629,7 @@ cc_library(
         ":while_util",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/core:lib",
         "@com_google_absl//absl/algorithm:container",
-        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:inlined_vector",
     ],
 )
@@ -3553,7 +3684,6 @@ cc_library(
         ":hlo_evaluator",
         ":hlo_pass",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/core:lib",
         "//tensorflow/core:ptr_util",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_map",
@@ -3567,14 +3697,16 @@ cc_library(
 tf_cc_test(
     name = "indexed_array_analysis_test",
     srcs = ["indexed_array_analysis_test.cc"],
+    extra_copts = ["-Wno-string-plus-int"],
     deps = [
         ":hlo_matchers",
+        ":hlo_parser",
         ":indexed_array_analysis",
         "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -3596,6 +3728,7 @@ cc_library(
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/types:variant",
     ],
 )
 
@@ -3624,7 +3757,6 @@ cc_library(
     srcs = ["hlo_lexer.cc"],
     hdrs = [
         "hlo_lexer.h",
-        "hlo_token.h",
     ],
     deps = [
         "//tensorflow/compiler/xla:shape_util",
@@ -3660,6 +3792,47 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "optimize_input_output_buffer_alias",
+    srcs = ["optimize_input_output_buffer_alias.cc"],
+    hdrs = ["optimize_input_output_buffer_alias.h"],
+    deps = [
+        ":hlo",
+        ":hlo_pass",
+        "//tensorflow/compiler/xla:shape_tree",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings:str_format",
+    ],
+)
+
+tf_cc_test(
+    name = "optimize_input_output_buffer_alias_test",
+    srcs = ["optimize_input_output_buffer_alias_test.cc"],
+    deps = [
+        ":optimize_input_output_buffer_alias",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:test_utils",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings:str_format",
+    ],
+)
+
 cc_library(
     name = "ar_crs_combiner",
     srcs = ["ar_crs_combiner.cc"],
@@ -3669,10 +3842,10 @@ cc_library(
         ":pattern_matcher",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_pass",
         "@com_google_absl//absl/container:flat_hash_map",
@@ -3680,6 +3853,38 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "dynamic_index_splitter",
+    srcs = ["dynamic_index_splitter.cc"],
+    hdrs = ["dynamic_index_splitter.h"],
+    deps = [
+        ":hlo_casting_utils",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_pass",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+tf_cc_test(
+    name = "dynamic_index_splitter_test",
+    srcs = ["dynamic_index_splitter_test.cc"],
+    deps = [
+        ":dynamic_index_splitter",
+        ":hlo",
+        ":hlo_matchers",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+    ],
+)
+
 tf_cc_test(
     name = "ar_crs_combiner_test",
     srcs = ["ar_crs_combiner_test.cc"],
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
index 985c5af1c4d89425dd6693585e42e22510fe21f8..d566062e7401af545bd3a097d3b3735b305eba66 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <algorithm>
 #include <cmath>
+#include <functional>
 #include <iterator>
 #include <memory>
 #include <numeric>
@@ -25,6 +26,9 @@ limitations under the License.
 #include <vector>
 
 #include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
+#include "absl/container/inlined_vector.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
 #include "absl/types/optional.h"
@@ -32,6 +36,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
@@ -41,12 +46,14 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_query.h"
 #include "tensorflow/compiler/xla/service/pattern_matcher.h"
+#include "tensorflow/compiler/xla/shape.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/window_util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/bits.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
@@ -117,23 +124,37 @@ bool TransposeIsBitcast(const HloInstruction* transpose) {
                                        transpose->dimensions());
 }
 
-// Returns true if the given reshape/copy produces a result which is bit-wise
-// identical to its operand and thus may be replaced with a bitcast.
-//
-// This function is conservative -- even if this function returns false, the
-// reshape may still be a bitcast. For example, a reshape from [28x28] to [784].
-bool ReshapeOrCopyIsBitcast(
-    const HloInstruction* instr,
-    const AlgebraicSimplifierOptions::ValidBitcastCallback&
-        valid_bitcast_callback) {
+// Recursive helper for method below.
+HloInstruction* BitcastingOperandOfReshapeOrCopyChainHelper(
+    HloInstruction* instr, HloInstruction* operand,
+    const AlgebraicSimplifierOptions& options) {
+  // Can't replace chain of copies and reshapes with bitcasts if the compiler
+  // used a memory layout which isn't compatible.
+  if (options.ReshapeIsBitcast(operand->shape(), instr->shape())) {
+    return operand;
+  }
+
+  // If the operand is a copy or reshape try to see if the operand's operand
+  // would produce a bitcast with initial instruction.
+  if (HloOpcode::kReshape == operand->opcode() ||
+      HloOpcode::kCopy == operand->opcode()) {
+    return BitcastingOperandOfReshapeOrCopyChainHelper(
+        instr, operand->mutable_operand(0), options);
+  }
+  return nullptr;
+}
+
+// Returns an operand of a chain of reshapes and copies that is bit-wise
+// identical to first reshape or copy in the chain.
+HloInstruction* BitcastingOperandOfReshapeOrCopyChain(
+    HloInstruction* instr, const AlgebraicSimplifierOptions& options) {
+  if (!options.is_layout_sensitive()) {
+    return nullptr;
+  }
   CHECK(HloOpcode::kReshape == instr->opcode() ||
         HloOpcode::kCopy == instr->opcode());
-
-  const HloInstruction* operand = instr->operand(0);
-  // Can't insert bitcasts if the compiler used a memory layout which isn't
-  // compatible.
-  return ShapeUtil::ReshapeIsBitcast(operand->shape(), instr->shape()) &&
-         valid_bitcast_callback(operand->shape(), instr->shape());
+  return BitcastingOperandOfReshapeOrCopyChainHelper(
+      instr, instr->mutable_operand(0), options);
 }
 
 bool IsUnstridedSlice(const HloInstruction* hlo) {
@@ -200,6 +221,8 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
 
   Status HandlePower(HloInstruction* power) override;
 
+  Status HandleRemainder(HloInstruction* remainder) override;
+
   Status HandleReshape(HloInstruction* reshape) override;
 
   Status HandleReduce(HloInstruction* reduce) override;
@@ -239,9 +262,16 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
   // more fusion than leaving the nodes as Dot operations.
   StatusOr<bool> HandleDotStrengthReduction(HloInstruction* dot);
 
+  // Removes dimension dim from hlo.
+  HloInstruction* StripDim(HloInstruction* hlo, int64 dim) {
+    CHECK_EQ(hlo->shape().dimensions(dim), 1);
+    return computation_->AddInstruction(HloInstruction::CreateReshape(
+        ShapeUtil::DeleteDimension(dim, hlo->shape()), hlo));
+  }
+
   // Reshapes an instruction to rank 1 if it is not already rank 1.
   HloInstruction* Flatten(HloInstruction* hlo) {
-    if (ShapeUtil::Rank(hlo->shape()) == 1) {
+    if (hlo->shape().rank() == 1) {
       return hlo;
     }
     return computation_->AddInstruction(HloInstruction::CreateReshape(
@@ -250,19 +280,58 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
         hlo));
   }
 
-  // Helper method to perform and add reduction in a single dimension.
-  HloInstruction* AddReduce(HloInstruction* hlo, int64 dim) {
+  // Converts to primitive type if the input hlo is not that type, otherwise
+  // returns the original hlo.
+  HloInstruction* AsType(HloInstruction* hlo,
+                         const PrimitiveType element_type) {
+    if (hlo->shape().element_type() == element_type) {
+      return hlo;
+    }
+    return computation_->AddInstruction(HloInstruction::CreateConvert(
+        ShapeUtil::ChangeElementType(hlo->shape(), element_type), hlo));
+  }
+
+  // Transposes a dot operand such that the batch dimensions are the msot major,
+  // and the contracting dimensions are most minor.
+  StatusOr<HloInstruction*> NormalizeDotOperandToBatchMajorAndContractingMinor(
+      HloInstruction* dot_operand, absl::Span<const int64> batch_dimensions,
+      absl::Span<const int64> contracting_dimensions) {
+    std::vector<int64> transpose_dimensions(batch_dimensions.begin(),
+                                            batch_dimensions.end());
+    for (int64 i = 0; i < dot_operand->shape().rank(); ++i) {
+      if (!(absl::c_linear_search(batch_dimensions, i) ||
+            absl::c_linear_search(contracting_dimensions, i))) {
+        transpose_dimensions.push_back(i);
+      }
+    }
+    transpose_dimensions.insert(transpose_dimensions.end(),
+                                contracting_dimensions.begin(),
+                                contracting_dimensions.end());
+    return MakeTransposeHlo(dot_operand, transpose_dimensions);
+  }
+
+  // Helper method to perform and add reduction on a list of dimensions.
+  HloInstruction* AddReduce(HloInstruction* hlo, absl::Span<const int64> dims) {
     HloInstruction* zero =
         computation_->AddInstruction(HloInstruction::CreateConstant(
             LiteralUtil::Zero(hlo->shape().element_type()).Clone()));
     HloComputation* AddReduce_computation = GetOrCreateScalarAddComputation();
-    Shape shape = ShapeUtil::DeleteDimension(dim, hlo->shape());
+    Shape shape = ShapeUtil::FilterDimensions(
+        [&](int64 dim) { return !absl::c_linear_search(dims, dim); },
+        hlo->shape());
     return computation_->AddInstruction(HloInstruction::CreateReduce(
-        shape, hlo, zero, {dim}, AddReduce_computation));
+        shape, hlo, zero, dims, AddReduce_computation));
+  }
+
+  HloInstruction* AddReduce(HloInstruction* hlo, int64 dim) {
+    return AddReduce(hlo, std::vector<int64>{dim});
   }
 
-  // Convenience method for replacing an instruction with a bitcast.
-  void ReplaceWithBitcast(HloInstruction* instruction);
+  // Convenience method for replacing an instruction with a bitcast. If operand
+  // is not null, then the bitcast will use the specified operand instead of the
+  // operand of the instruction.
+  void ReplaceWithBitcast(HloInstruction* instruction,
+                          HloInstruction* operand = nullptr);
 
   // Replace old instruction with new instruction if old and new instructions
   // have the same shape. Updates uses and root instruction. Returns whether a
@@ -391,17 +460,19 @@ bool AlgebraicSimplifierVisitor::SameShape(const HloInstruction* lhs,
   }
 }
 
-void AlgebraicSimplifierVisitor::ReplaceWithBitcast(
-    HloInstruction* instruction) {
+void AlgebraicSimplifierVisitor::ReplaceWithBitcast(HloInstruction* instruction,
+                                                    HloInstruction* operand) {
   CHECK_EQ(1, instruction->operand_count());
+  if (operand == nullptr) {
+    operand = instruction->mutable_operand(0);
+  }
   CHECK_EQ(ShapeUtil::ElementsIn(instruction->shape()),
-           ShapeUtil::ElementsIn(instruction->operand(0)->shape()));
+           ShapeUtil::ElementsIn(operand->shape()));
   CHECK_EQ(ShapeUtil::ByteSizeOf(instruction->shape()),
-           ShapeUtil::ByteSizeOf(instruction->operand(0)->shape()));
+           ShapeUtil::ByteSizeOf(operand->shape()));
 
-  auto bitcast = computation_->AddInstruction(
-      HloInstruction::CreateUnary(instruction->shape(), HloOpcode::kBitcast,
-                                  instruction->mutable_operand(0)));
+  auto bitcast = computation_->AddInstruction(HloInstruction::CreateUnary(
+      instruction->shape(), HloOpcode::kBitcast, operand));
   TF_CHECK_OK(ReplaceInstruction(instruction, bitcast));
 }
 
@@ -562,9 +633,9 @@ Status AlgebraicSimplifierVisitor::HandleCopy(HloInstruction* copy) {
     return Status::OK();
   }
 
-  if (options_.is_layout_sensitive() &&
-      ReshapeOrCopyIsBitcast(copy, options_.valid_bitcast_callback())) {
-    ReplaceWithBitcast(copy);
+  if (HloInstruction* bitcast_operand =
+          BitcastingOperandOfReshapeOrCopyChain(copy, options_)) {
+    ReplaceWithBitcast(copy, bitcast_operand);
   }
 
   return Status::OK();
@@ -677,7 +748,7 @@ Status AlgebraicSimplifierVisitor::HandleConcatenate(
       return Status::OK();
     }
     PaddingConfig padding_config;
-    for (int64 dim = 0; dim < ShapeUtil::Rank(operands[0]->shape()); ++dim) {
+    for (int64 dim = 0; dim < operands[0]->shape().rank(); ++dim) {
       auto padding_config_dim = padding_config.add_dimensions();
       padding_config_dim->set_edge_padding_high(0);
       padding_config_dim->set_edge_padding_low(0);
@@ -705,7 +776,7 @@ Status AlgebraicSimplifierVisitor::HandleConcatenate(
 
 static HloInstruction* BuildTupleConstant(HloComputation* computation,
                                           const LiteralSlice& literal) {
-  if (ShapeUtil::IsTuple(literal.shape())) {
+  if (literal.shape().IsTuple()) {
     std::vector<HloInstruction*> elems;
     elems.reserve(ShapeUtil::TupleElementCount(literal.shape()));
     for (int i = 0; i < ShapeUtil::TupleElementCount(literal.shape()); ++i) {
@@ -722,7 +793,7 @@ static HloInstruction* BuildTupleConstant(HloComputation* computation,
 Status AlgebraicSimplifierVisitor::HandleConstant(HloInstruction* constant) {
   // Tuple constants aren't directly supported by any backend. Expand them into
   // explicit Tuple instructions.
-  if (ShapeUtil::IsTuple(constant->shape())) {
+  if (constant->shape().IsTuple()) {
     return ReplaceInstruction(
         constant, BuildTupleConstant(computation_, constant->literal()));
   }
@@ -744,7 +815,7 @@ Status AlgebraicSimplifierVisitor::HandleConstant(HloInstruction* constant) {
   }
 
   // If a literal is an increasing sequence from zero, replace it with an iota.
-  if (ShapeUtil::Rank(constant->shape()) == 1 &&
+  if (constant->shape().rank() == 1 &&
       ShapeUtil::ElementsIn(constant->shape()) > 1 &&
       constant->literal().IsR1Iota()) {
     return ReplaceWithNewInstruction(
@@ -781,10 +852,82 @@ Status InvertConstant(const HloInstruction& constant, Literal* result) {
     return T{1.0} / constant.literal().Get<T>(indices);
   });
 }
+
+template <typename T>
+std::unique_ptr<HloInstruction> TryDivideToShift(HloInstruction* divide,
+                                                 HloComputation* computation) {
+  HloInstruction *a, *b, *c;
+  CHECK(Match(divide, m::Divide(m::Op(&a), m::Op(&b))));
+
+  if (ShapeUtil::ElementIsIntegral(divide->shape()) &&
+      !Match(b, m::ConstantEffectiveScalar(&c)) &&
+      !Match(b, m::Broadcast(m::ConstantEffectiveScalar(&c)))) {
+    return nullptr;
+  }
+
+  if (ShapeUtil::ElementIsSigned(divide->shape())) {
+    int64 b_value = c->literal().GetFirstElement<T>();
+    if (b_value > 0 && IsPowerOfTwo(static_cast<uint64>(b_value))) {
+      // Handle negative dividends by negating the result of the division.
+      HloInstruction* zero_like_a = BroadcastZeros(
+          computation, a->shape().element_type(), a->shape().dimensions());
+
+      auto* dividend_is_negative =
+          computation->AddInstruction(HloInstruction::CreateBinary(
+              ShapeUtil::ChangeElementType(a->shape(), PRED), HloOpcode::kLt, a,
+              zero_like_a));
+
+      auto* negated_dividend = computation->AddInstruction(
+          HloInstruction::CreateUnary(a->shape(), HloOpcode::kNegate, a));
+
+      auto* abs_dividend =
+          computation->AddInstruction(HloInstruction::CreateTernary(
+              a->shape(), HloOpcode::kSelect, dividend_is_negative,
+              negated_dividend, a));
+
+      int log2_abs_b_value = tensorflow::Log2Floor64(b_value);
+
+      auto* shift_amount =
+          computation->AddInstruction(HloInstruction::CreateConstant(
+              LiteralUtil::CreateR0<T>(log2_abs_b_value)));
+      if (!ShapeUtil::IsScalar(b->shape())) {
+        shift_amount = computation->AddInstruction(
+            HloInstruction::CreateBroadcast(b->shape(), shift_amount, {}));
+      }
+
+      auto* quotient = computation->AddInstruction(HloInstruction::CreateBinary(
+          divide->shape(), HloOpcode::kShiftRightLogical, abs_dividend,
+          shift_amount));
+
+      auto* neqated_quotient =
+          computation->AddInstruction(HloInstruction::CreateUnary(
+              quotient->shape(), HloOpcode::kNegate, quotient));
+
+      return HloInstruction::CreateTernary(divide->shape(), HloOpcode::kSelect,
+                                           dividend_is_negative,
+                                           neqated_quotient, quotient);
+    }
+  } else {
+    uint64 b_value = c->literal().GetFirstElement<T>();
+    if (IsPowerOfTwo(b_value)) {
+      int log2_abs_b_value = tensorflow::Log2Floor64(b_value);
+      HloInstruction* shift_amount =
+          computation->AddInstruction(HloInstruction::CreateConstant(
+              LiteralUtil::CreateR0<T>(log2_abs_b_value)));
+      if (!ShapeUtil::IsScalar(b->shape())) {
+        shift_amount = computation->AddInstruction(
+            HloInstruction::CreateBroadcast(b->shape(), shift_amount, {}));
+      }
+      return HloInstruction::CreateBinary(
+          divide->shape(), HloOpcode::kShiftRightLogical, a, shift_amount);
+    }
+  }
+
+  return nullptr;
+}
 }  // namespace
 
 Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide) {
-  Shape* shape;
   HloInstruction *a, *b, *c, *d;
   CHECK(Match(divide, m::Divide(m::Op(&a), m::Op(&b))));
   // A/1 => A
@@ -793,6 +936,61 @@ Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide) {
     return Status::OK();
   }
 
+  // A / B => A >> log2(B) if B is a power of 2.
+  switch (divide->shape().element_type()) {
+    case S8:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryDivideToShift<int8>(divide, computation_)) {
+        return ReplaceWithNewInstruction(divide, std::move(shift));
+      }
+      break;
+    case S16:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryDivideToShift<int16>(divide, computation_)) {
+        return ReplaceWithNewInstruction(divide, std::move(shift));
+      }
+      break;
+    case S32:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryDivideToShift<int32>(divide, computation_)) {
+        return ReplaceWithNewInstruction(divide, std::move(shift));
+      }
+      break;
+    case S64:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryDivideToShift<int64>(divide, computation_)) {
+        return ReplaceWithNewInstruction(divide, std::move(shift));
+      }
+      break;
+    case U8:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryDivideToShift<uint8>(divide, computation_)) {
+        return ReplaceWithNewInstruction(divide, std::move(shift));
+      }
+      break;
+    case U16:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryDivideToShift<uint16>(divide, computation_)) {
+        return ReplaceWithNewInstruction(divide, std::move(shift));
+      }
+      break;
+    case U32:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryDivideToShift<uint32>(divide, computation_)) {
+        return ReplaceWithNewInstruction(divide, std::move(shift));
+      }
+      break;
+    case U64:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryDivideToShift<uint64>(divide, computation_)) {
+        return ReplaceWithNewInstruction(divide, std::move(shift));
+      }
+      break;
+    default:
+      break;
+  }
+
+  Shape* shape;
   // exp(A)/exp(B) => exp(A-B)
   if (Match(divide, m::Divide(m::Exp(m::Op(&a)), m::Exp(m::Op(&b)))
                         .WithShape(m::Shape(&shape)))) {
@@ -833,6 +1031,24 @@ Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide) {
                     divide->shape(), HloOpcode::kMultiply, a, new_power));
   }
 
+  // A/sqrt(B) => A*rsqrt(X).
+  if (Match(divide, m::Divide(m::Op(&a), m::Sqrt(m::Op(&b))))) {
+    auto* rsqrt = computation_->AddInstruction(
+        HloInstruction::CreateUnary(divide->shape(), HloOpcode::kRsqrt, b));
+    return ReplaceWithNewInstruction(
+        divide, HloInstruction::CreateBinary(rsqrt->shape(),
+                                             HloOpcode::kMultiply, a, rsqrt));
+  }
+
+  // A/rsqrt(B) => A*sqrt(B).
+  if (Match(divide, m::Divide(m::Op(&a), m::Rsqrt(m::Op(&b))))) {
+    auto* sqrt = computation_->AddInstruction(
+        HloInstruction::CreateUnary(divide->shape(), HloOpcode::kSqrt, b));
+    return ReplaceWithNewInstruction(
+        divide, HloInstruction::CreateBinary(sqrt->shape(),
+                                             HloOpcode::kMultiply, a, sqrt));
+  }
+
   // Simplifying integral division would produce unexpected results.
   if (ShapeUtil::ElementIsIntegral(divide->shape())) {
     return Status::OK();
@@ -843,8 +1059,9 @@ Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide) {
   // (Backends can do this transformation, but generally only if the constant is
   // a scalar.)
   if (Match(divide, m::Divide(m::NonConstant(&a), m::Constant(&b)))) {
-    Literal new_literal(b->shape());
-    switch (b->shape().element_type()) {
+    Shape result_shape = b->literal().shape();
+    Literal new_literal(result_shape);
+    switch (result_shape.element_type()) {
       case F16:
         TF_RETURN_IF_ERROR(InvertConstant<half>(*b, &new_literal));
         break;
@@ -860,6 +1077,9 @@ Status AlgebraicSimplifierVisitor::HandleDivide(HloInstruction* divide) {
       case C64:
         TF_RETURN_IF_ERROR(InvertConstant<complex64>(*b, &new_literal));
         break;
+      case C128:
+        TF_RETURN_IF_ERROR(InvertConstant<complex128>(*b, &new_literal));
+        break;
       default:
         return Status::OK();
     }
@@ -908,32 +1128,54 @@ StatusOr<bool> AlgebraicSimplifierVisitor::HandleDotStrengthReduction(
     HloInstruction* dot) {
   HloInstruction *lhs, *rhs;
   CHECK(Match(dot, m::Dot(m::Op(&lhs), m::Op(&rhs))));
-  int64 lhs_collapsing_dim =
-      dot->dot_dimension_numbers().lhs_contracting_dimensions(0);
+
+  const auto kept_dim = [](int64 rank, int64 contracting_dimension,
+                           absl::Span<const int64> batch_dimensions) -> int64 {
+    for (int64 i = 0; i < rank; ++i) {
+      if (i != contracting_dimension &&
+          !absl::c_linear_search(batch_dimensions, i)) {
+        return i;
+      }
+    }
+    return -1;
+  };
+
+  const int64 dot_rank = dot->shape().rank();
+  const int64 rhs_rank = rhs->shape().rank();
+  const int64 lhs_rank = lhs->shape().rank();
+  const auto& dnums = dot->dot_dimension_numbers();
+  if (dnums.rhs_contracting_dimensions_size() != 1) {
+    return false;
+  }
+  if (dot_rank > 2 && (lhs_rank != rhs_rank || lhs_rank != dot_rank)) {
+    return false;
+  }
+  int64 lhs_collapsing_dim = dnums.lhs_contracting_dimensions(0);
+  int64 lhs_kept_dim = kept_dim(lhs_rank, lhs_collapsing_dim,
+                                AsInt64Slice(dnums.lhs_batch_dimensions()));
+  // If there is no non-contracting dimension in rank 2, do not strength reduce.
+  if (lhs_kept_dim == -1 && lhs_rank > 1) {
+    return false;
+  }
   if (lhs->IsRank2Transpose()) {
     lhs = lhs->mutable_operand(0);
-    lhs_collapsing_dim = 1 - lhs_collapsing_dim;
+    std::swap(lhs_collapsing_dim, lhs_kept_dim);
   }
-  const int64 lhs_kept_dim = 1 - lhs_collapsing_dim;
 
-  int64 rhs_collapsing_dim =
-      dot->dot_dimension_numbers().rhs_contracting_dimensions(0);
+  int64 rhs_collapsing_dim = dnums.rhs_contracting_dimensions(0);
+  int64 rhs_kept_dim = kept_dim(rhs_rank, rhs_collapsing_dim,
+                                AsInt64Slice(dnums.rhs_batch_dimensions()));
+  // If there is no non-contracting dimension in rank 2, do not strength reduce.
+  if (rhs_kept_dim == -1 && rhs_rank > 1) {
+    return false;
+  }
   if (rhs->IsRank2Transpose()) {
     rhs = rhs->mutable_operand(0);
-    rhs_collapsing_dim = 1 - rhs_collapsing_dim;
+    std::swap(rhs_collapsing_dim, rhs_kept_dim);
   }
-  const int64 rhs_kept_dim = 1 - rhs_collapsing_dim;
-
-  auto as_type = [&](HloInstruction* hlo, const PrimitiveType element_type) {
-    if (hlo->shape().element_type() == element_type) {
-      return hlo;
-    }
-    return computation_->AddInstruction(HloInstruction::CreateConvert(
-        ShapeUtil::ChangeElementType(hlo->shape(), element_type), hlo));
-  };
 
   auto reshape_if_necessary = [&](HloInstruction* hlo) {
-    hlo = as_type(hlo, dot->shape().element_type());
+    hlo = AsType(hlo, dot->shape().element_type());
     if (!ShapeUtil::SameDimensions(hlo->shape(), dot->shape())) {
       hlo = computation_->AddInstruction(
           HloInstruction::CreateReshape(dot->shape(), hlo));
@@ -942,13 +1184,18 @@ StatusOr<bool> AlgebraicSimplifierVisitor::HandleDotStrengthReduction(
   };
 
   auto add_reduce_in_f32 = [&](HloInstruction* hlo, const int64 dim) {
-    return AddReduce(as_type(hlo, F32), dim);
+    return AddReduce(AsType(hlo, F32), dim);
+  };
+
+  auto broadcast = [&](HloInstruction* hlo, const Shape& shape,
+                       absl::Span<const int64> dims) {
+    return computation_->AddInstruction(
+        HloInstruction::CreateBroadcast(shape, hlo, dims));
   };
 
   auto broadcast_to_dim = [&](HloInstruction* hlo, const Shape& shape,
                               int64 dim) {
-    return computation_->AddInstruction(
-        HloInstruction::CreateBroadcast(shape, hlo, {dim}));
+    return broadcast(hlo, shape, {dim});
   };
 
   auto multiply = [&](HloInstruction* local_lhs, HloInstruction* local_rhs) {
@@ -959,11 +1206,9 @@ StatusOr<bool> AlgebraicSimplifierVisitor::HandleDotStrengthReduction(
   // Strength reduce dot(a[K] , b[K]) =
   //  reshape(result.shape,
   //          reduce_sum(multiply(a, b), {0}))
-  if (ShapeUtil::Rank(rhs->shape()) == 1 &&
-      ShapeUtil::Rank(lhs->shape()) == 1) {
-    TF_RETURN_IF_ERROR(
-        ReplaceInstruction(dot, reshape_if_necessary(add_reduce_in_f32(
-                                    multiply(Flatten(lhs), Flatten(rhs)), 0))));
+  if (rhs_rank == 1 && lhs_rank == 1) {
+    TF_RETURN_IF_ERROR(ReplaceInstruction(
+        dot, reshape_if_necessary(add_reduce_in_f32(multiply(lhs, rhs), 0))));
     return true;
   }
 
@@ -977,8 +1222,7 @@ StatusOr<bool> AlgebraicSimplifierVisitor::HandleDotStrengthReduction(
   // Simplify outer product into multiply with implicit broadcasting.
   //
   // A dot(a[M, 1], b[1, N]) = multiply(a [M,1], b [1, N])
-  if (ShapeUtil::Rank(rhs->shape()) == 2 &&
-      rhs->shape().dimensions(rhs_collapsing_dim) == 1) {
+  if (rhs_rank == 2 && rhs->shape().dimensions(rhs_collapsing_dim) == 1) {
     TF_RETURN_IF_ERROR(ReplaceInstruction(
         dot, multiply(broadcast_to_dim(Flatten(lhs), dot->shape(), 0),
                       broadcast_to_dim(Flatten(rhs), dot->shape(), 1))));
@@ -992,10 +1236,9 @@ StatusOr<bool> AlgebraicSimplifierVisitor::HandleDotStrengthReduction(
   //        {0})
   //      )
   //    )
-  if (ShapeUtil::Rank(lhs->shape()) == 1 ||
-      (ShapeUtil::Rank(lhs->shape()) == 2 &&
-       lhs->shape().dimensions(lhs_kept_dim) == 1)) {
-    if (ShapeUtil::Rank(rhs->shape()) == 1) {
+  if (lhs_rank == 1 ||
+      (lhs_rank == 2 && lhs->shape().dimensions(lhs_kept_dim) == 1)) {
+    if (rhs->shape().rank() == 1) {
       TF_RETURN_IF_ERROR(
           ReplaceInstruction(dot, reshape_if_necessary(add_reduce_in_f32(
                                       multiply(Flatten(lhs), rhs), 0))));
@@ -1014,9 +1257,8 @@ StatusOr<bool> AlgebraicSimplifierVisitor::HandleDotStrengthReduction(
   //  reshape(result.shape,
   //    reduce_sum(multiply(a, broadcast(reshape([K],b), {1})), {0})
   //  )
-  if (ShapeUtil::Rank(rhs->shape()) == 1 ||
-      (ShapeUtil::Rank(rhs->shape()) == 2 &&
-       rhs->shape().dimensions(rhs_kept_dim) == 1)) {
+  if (rhs_rank == 1 ||
+      (rhs_rank == 2 && rhs->shape().dimensions(rhs_kept_dim) == 1)) {
     TF_RETURN_IF_ERROR(ReplaceInstruction(
         dot, reshape_if_necessary(add_reduce_in_f32(
                  multiply(lhs, broadcast_to_dim(Flatten(rhs), lhs->shape(),
@@ -1024,6 +1266,97 @@ StatusOr<bool> AlgebraicSimplifierVisitor::HandleDotStrengthReduction(
                  lhs_collapsing_dim))));
     return true;
   }
+
+  // Only consider kDot with batch dimension.
+  if (dot_rank <= 2) {
+    return false;
+  }
+
+  CHECK_EQ(rhs_rank, lhs_rank);
+  CHECK_EQ(dot_rank, lhs_rank);
+  // If there is more than one non-contracting dimension or the batch dimensions
+  // are not equal, bail out since transposes may be required to do a strength
+  // reduction.
+  if (dnums.rhs_batch_dimensions_size() + 2 != dot_rank ||
+      !absl::c_equal(dnums.lhs_batch_dimensions(),
+                     dnums.rhs_batch_dimensions())) {
+    return false;
+  }
+
+  auto broadcast_dims = [](int64 rank, int64 non_broadcast_dim) {
+    absl::InlinedVector<int64, 8> dims;
+    for (int64 i = 0; i < rank; ++i) {
+      if (i != non_broadcast_dim) {
+        dims.push_back(i);
+      }
+    }
+    return dims;
+  };
+
+  // If the contracting dimension is 1, remove the degnerate dimnensions from
+  // the lhs and rhs, broadcast each to the result shape and multiply.
+  if (lhs->shape().dimensions(lhs_collapsing_dim) == 1 &&
+      (rhs_kept_dim == rhs_rank - 1 ||
+       (rhs_collapsing_dim == rhs_rank - 1 && rhs_kept_dim == rhs_rank - 2))) {
+    CHECK_EQ(rhs->shape().dimensions(rhs_collapsing_dim), 1);
+    const int64 lhs_kept_dim_in_output =
+        lhs_kept_dim > lhs_collapsing_dim ? (lhs_kept_dim - 1) : lhs_kept_dim;
+    absl::InlinedVector<int64, 8> lhs_broadcast_dims;
+    for (const int64 dim : dnums.lhs_batch_dimensions()) {
+      lhs_broadcast_dims.push_back(dim > lhs_collapsing_dim ? (dim - 1) : dim);
+    }
+    absl::InlinedVector<int64, 8> rhs_broadcast_dims = lhs_broadcast_dims;
+    lhs_broadcast_dims.push_back(lhs_kept_dim_in_output);
+    absl::c_sort(lhs_broadcast_dims);
+    rhs_broadcast_dims.push_back(dot_rank - 1);
+    absl::c_sort(rhs_broadcast_dims);
+    TF_RETURN_IF_ERROR(ReplaceInstruction(
+        dot, reshape_if_necessary(
+                 multiply(broadcast(StripDim(lhs, lhs_collapsing_dim),
+                                    dot->shape(), lhs_broadcast_dims),
+                          broadcast(StripDim(rhs, rhs_collapsing_dim),
+                                    dot->shape(), rhs_broadcast_dims)))));
+    return true;
+  }
+
+  // If the lhs and rhs non-contracting dimensions are both one, strip each one,
+  // multiply and then reduce the collapsing dimension
+  if (lhs->shape().dimensions(lhs_kept_dim) == 1 &&
+      rhs->shape().dimensions(rhs_kept_dim) == 1 &&
+      lhs_kept_dim == rhs_kept_dim) {
+    auto new_lhs = StripDim(lhs, lhs_kept_dim);
+    auto new_rhs = StripDim(rhs, rhs_kept_dim);
+    const int64 reduce_dim = rhs_kept_dim < rhs_collapsing_dim
+                                 ? (rhs_collapsing_dim - 1)
+                                 : rhs_collapsing_dim;
+    TF_RETURN_IF_ERROR(
+        ReplaceInstruction(dot, reshape_if_necessary(add_reduce_in_f32(
+                                    multiply(new_lhs, new_rhs), reduce_dim))));
+    return true;
+  }
+
+  // If the lhs  non-contracting dimensions is one, strip the one, brodcast to
+  // the rhs shape, multiply and then reduce the collapsing dimension
+  if (lhs->shape().dimensions(lhs_kept_dim) == 1) {
+    auto new_lhs = broadcast(StripDim(lhs, lhs_kept_dim), rhs->shape(),
+                             broadcast_dims(rhs_rank, rhs_kept_dim));
+    TF_RETURN_IF_ERROR(ReplaceInstruction(
+        dot, reshape_if_necessary(add_reduce_in_f32(multiply(new_lhs, rhs),
+                                                    rhs_collapsing_dim))));
+    return true;
+  }
+
+  // If the rhs  non-contracting dimensions is one, strip the one, brodcast to
+  // the lhs shape, multiply and then reduce the collapsing dimension
+  if (rhs->shape().dimensions(rhs_kept_dim) == 1) {
+    auto new_rhs = broadcast(StripDim(rhs, rhs_kept_dim), lhs->shape(),
+                             broadcast_dims(lhs_rank, lhs_kept_dim));
+    TF_RETURN_IF_ERROR(ReplaceInstruction(
+        dot, reshape_if_necessary(add_reduce_in_f32(multiply(lhs, new_rhs),
+                                                    lhs_collapsing_dim))));
+    return true;
+  }
+
   return false;
 }
 
@@ -1242,6 +1575,9 @@ StatusOr<HloInstruction*> AlgebraicSimplifierVisitor::OptimizeDotOfGather(
   // => output dimensions: DS ({M x N}, {0, start}, {M, 1}) => {M x 1}.
 
   bool lhs_is_dynamic_slice = lhs->opcode() == HloOpcode::kDynamicSlice;
+  HloDynamicSliceInstruction* dynamic_slice =
+      lhs_is_dynamic_slice ? Cast<HloDynamicSliceInstruction>(lhs)
+                           : Cast<HloDynamicSliceInstruction>(rhs);
 
   // ctA:
   HloInstruction* left_operand =
@@ -1259,8 +1595,6 @@ StatusOr<HloInstruction*> AlgebraicSimplifierVisitor::OptimizeDotOfGather(
       HloInstruction::CreateDot(memoized_shape, left_operand, right_operand,
                                 dnums, dot->precision_config()));
   // Get pair {start, 0} or {0, start}.
-  HloInstruction* original_start_indices =
-      lhs_is_dynamic_slice ? lhs->mutable_operand(1) : rhs->mutable_operand(1);
   // Position of start:
   int index_of_non_zero_start = lhs_is_dynamic_slice
                                     ? 1 - lhs_contracting_dimension
@@ -1269,23 +1603,19 @@ StatusOr<HloInstruction*> AlgebraicSimplifierVisitor::OptimizeDotOfGather(
   int index_of_zero_start = 1 - index_of_non_zero_start;
 
   // Slice out start and 0 components and reorder if necessary.
-  auto indices_type = original_start_indices->shape().element_type();
+  auto indices_type = dynamic_slice->operand(1)->shape().element_type();
   Shape s_shape = ShapeUtil::MakeShape(indices_type, {1});
   Shape d_shape = ShapeUtil::MakeShape(indices_type, {2});
   HloInstruction* non_zero_start =
-      computation_->AddInstruction(HloInstruction::CreateSlice(
-          s_shape, original_start_indices, {index_of_non_zero_start},
-          {index_of_non_zero_start + 1}, {1}));
+      dynamic_slice->mutable_operand(1 + index_of_non_zero_start);
   HloInstruction* zero_start =
-      computation_->AddInstruction(HloInstruction::CreateSlice(
-          s_shape, original_start_indices, {index_of_zero_start},
-          {index_of_zero_start + 1}, {1}));
-  HloInstruction* new_start_indices =
-      lhs_is_dynamic_slice
-          ? computation_->AddInstruction(HloInstruction::CreateConcatenate(
-                d_shape, {non_zero_start, zero_start}, 0))
-          : computation_->AddInstruction(HloInstruction::CreateConcatenate(
-                d_shape, {zero_start, non_zero_start}, 0));
+      dynamic_slice->mutable_operand(1 + index_of_zero_start);
+  std::vector<HloInstruction*> new_start_indices;
+  if (lhs_is_dynamic_slice) {
+    new_start_indices = {non_zero_start, zero_start};
+  } else {
+    new_start_indices = {zero_start, non_zero_start};
+  }
 
   // Build DynamicSlice(ctA x ctB).
   const int new_slice_m = lhs_is_dynamic_slice ? 1 : m;
@@ -1301,26 +1631,145 @@ StatusOr<HloInstruction*> AlgebraicSimplifierVisitor::OptimizeDotOfGather(
 Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) {
   HloInstruction *lhs, *rhs;
   CHECK(Match(dot, m::Dot(m::Op(&lhs), m::Op(&rhs))));
-
-  // Only optimize F32 or BF16 dot operations where the dot, rhs and lhs are
-  // rank 2 or below.
-  if ((dot->shape().element_type() != F32 &&
-       dot->shape().element_type() != BF16) ||
-      ShapeUtil::Rank(lhs->shape()) > 2 || ShapeUtil::Rank(rhs->shape()) > 2 ||
-      ShapeUtil::Rank(dot->shape()) > 2) {
+  if (options_.is_layout_sensitive()) {
     return Status::OK();
   }
-
   // Replace a zero element dot with a broadcast of the constant 0.
   if (ShapeUtil::IsZeroElementArray(dot->shape()) ||
       ShapeUtil::IsZeroElementArray(lhs->shape()) ||
       ShapeUtil::IsZeroElementArray(rhs->shape())) {
-    auto zero = computation_->AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR0(0.0f)));
+    auto zero = computation_->AddInstruction(HloInstruction::CreateConstant(
+        LiteralUtil::Zero(dot->shape().element_type())));
     return ReplaceWithNewInstruction(
         dot, HloInstruction::CreateBroadcast(dot->shape(), zero, {}));
   }
 
+  // Only optimize F32 or BF16 dot operations where the dot, rhs and lhs are
+  // rank 2 or below.
+  if (dot->shape().element_type() != F32 &&
+      dot->shape().element_type() != BF16) {
+    return Status::OK();
+  }
+
+  // If there are no contracting dimensions, a dot can be rewritten as
+  // mul(broadcast(transpose(x)),broadcast(transpose(y)))
+  if (dot->dot_dimension_numbers().lhs_contracting_dimensions_size() == 0) {
+    TF_ASSIGN_OR_RETURN(
+        HloInstruction * new_lhs,
+        NormalizeDotOperandToBatchMajorAndContractingMinor(
+            lhs,
+            AsInt64Slice(dot->dot_dimension_numbers().lhs_batch_dimensions()),
+            AsInt64Slice(
+                dot->dot_dimension_numbers().lhs_contracting_dimensions())));
+    if (dot->shape().rank() != lhs->shape().rank()) {
+      std::vector<int64> lhs_broadcast_dims(lhs->shape().rank());
+      absl::c_iota(lhs_broadcast_dims, 0);
+      new_lhs = computation_->AddInstruction(HloInstruction::CreateBroadcast(
+          dot->shape(), new_lhs, lhs_broadcast_dims));
+    }
+    TF_ASSIGN_OR_RETURN(
+        HloInstruction * new_rhs,
+        NormalizeDotOperandToBatchMajorAndContractingMinor(
+            rhs,
+            AsInt64Slice(dot->dot_dimension_numbers().rhs_batch_dimensions()),
+            AsInt64Slice(
+                dot->dot_dimension_numbers().rhs_contracting_dimensions())));
+    if (dot->shape().rank() != rhs->shape().rank()) {
+      std::vector<int64> rhs_broadcast_dims(
+          dot->dot_dimension_numbers().lhs_batch_dimensions_size());
+      absl::c_iota(rhs_broadcast_dims, 0);
+      for (int64 i = lhs->shape().rank(); i < dot->shape().rank(); ++i) {
+        rhs_broadcast_dims.push_back(i);
+      }
+      new_rhs = computation_->AddInstruction(HloInstruction::CreateBroadcast(
+          dot->shape(), new_rhs, rhs_broadcast_dims));
+    }
+    return ReplaceWithNewInstruction(
+        dot, HloInstruction::CreateBinary(dot->shape(), HloOpcode::kMultiply,
+                                          new_lhs, new_rhs));
+  }
+
+  // If the lhs or rhs have only batch and contracting dimensions, a dot can be
+  // rewritten as reduce(mul(broadcast(transpose(x)),broadcast(transpose(y))))
+  if ((dot->dot_dimension_numbers().lhs_batch_dimensions_size() +
+           dot->dot_dimension_numbers().lhs_contracting_dimensions_size() ==
+       lhs->shape().rank()) ||
+      (dot->dot_dimension_numbers().rhs_contracting_dimensions_size() +
+           dot->dot_dimension_numbers().rhs_batch_dimensions_size() ==
+       rhs->shape().rank())) {
+    TF_ASSIGN_OR_RETURN(
+        HloInstruction * new_lhs,
+        NormalizeDotOperandToBatchMajorAndContractingMinor(
+            lhs,
+            AsInt64Slice(dot->dot_dimension_numbers().lhs_batch_dimensions()),
+            AsInt64Slice(
+                dot->dot_dimension_numbers().lhs_contracting_dimensions())));
+    TF_ASSIGN_OR_RETURN(
+        HloInstruction * new_rhs,
+        NormalizeDotOperandToBatchMajorAndContractingMinor(
+            rhs,
+            AsInt64Slice(dot->dot_dimension_numbers().rhs_batch_dimensions()),
+            AsInt64Slice(
+                dot->dot_dimension_numbers().rhs_contracting_dimensions())));
+
+    int64 lhs_outer_dims =
+        lhs->shape().rank() -
+        (dot->dot_dimension_numbers().lhs_batch_dimensions_size() +
+         dot->dot_dimension_numbers().lhs_contracting_dimensions_size());
+    int64 rhs_outer_dims =
+        rhs->shape().rank() -
+        (dot->dot_dimension_numbers().rhs_batch_dimensions_size() +
+         dot->dot_dimension_numbers().rhs_contracting_dimensions_size());
+    CHECK(lhs_outer_dims == 0 || rhs_outer_dims == 0);
+    if (rhs_outer_dims > 0) {
+      std::vector<int64> lhs_broadcast_dims(
+          dot->dot_dimension_numbers().lhs_batch_dimensions_size());
+      absl::c_iota(lhs_broadcast_dims, 0);
+      lhs_broadcast_dims.resize(lhs->shape().rank());
+      std::iota(lhs_broadcast_dims.begin() +
+                    dot->dot_dimension_numbers().lhs_batch_dimensions_size(),
+                lhs_broadcast_dims.end(),
+                dot->dot_dimension_numbers().lhs_batch_dimensions_size() +
+                    rhs_outer_dims);
+      new_lhs = computation_->AddInstruction(HloInstruction::CreateBroadcast(
+          new_rhs->shape(), new_lhs, lhs_broadcast_dims));
+    } else if (lhs_outer_dims > 0) {
+      std::vector<int64> rhs_broadcast_dims(
+          dot->dot_dimension_numbers().rhs_batch_dimensions_size());
+      absl::c_iota(rhs_broadcast_dims, 0);
+      rhs_broadcast_dims.resize(rhs->shape().rank());
+      std::iota(rhs_broadcast_dims.begin() +
+                    dot->dot_dimension_numbers().rhs_batch_dimensions_size(),
+                rhs_broadcast_dims.end(),
+                dot->dot_dimension_numbers().rhs_batch_dimensions_size() +
+                    lhs_outer_dims);
+      new_rhs = computation_->AddInstruction(HloInstruction::CreateBroadcast(
+          new_lhs->shape(), new_rhs, rhs_broadcast_dims));
+    }
+
+    TF_ASSIGN_OR_RETURN(HloInstruction * new_dot,
+                        MakeBinaryHlo(HloOpcode::kMultiply, new_lhs, new_rhs));
+    std::vector<int64> reduce_dims(
+        dot->dot_dimension_numbers().lhs_contracting_dimensions_size());
+    new_dot = AsType(new_dot, F32);
+    const int64 outer_dims = std::max(rhs_outer_dims, lhs_outer_dims);
+    absl::c_iota(
+        reduce_dims,
+        outer_dims + dot->dot_dimension_numbers().lhs_batch_dimensions_size());
+    new_dot = AddReduce(new_dot, reduce_dims);
+    new_dot = AsType(new_dot, dot->shape().element_type());
+    return ReplaceInstruction(dot, new_dot);
+  }
+
+  if (lhs->shape().rank() > 2 || rhs->shape().rank() > 2 ||
+      dot->shape().rank() > 2) {
+    if (options_.enable_dot_strength_reduction() &&
+        !options_.is_layout_sensitive()) {
+      TF_RETURN_IF_ERROR(HandleDotStrengthReduction(dot).status());
+    }
+    return Status::OK();
+  }
+
   TF_ASSIGN_OR_RETURN(HloInstruction * dot_of_concat_optimized,
                       OptimizeDotOfConcat(dot));
   if (dot_of_concat_optimized) {
@@ -1350,7 +1799,11 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) {
   }
 
   // Simplify dot(transpose(a), transpose(b)) to transpose(dot(b,a)).
-  if (lhs->IsRank2Transpose() && rhs->IsRank2Transpose()) {
+  if (dot->dot_dimension_numbers().lhs_batch_dimensions_size() == 0 &&
+      dot->dot_dimension_numbers().lhs_contracting_dimensions_size() == 1 &&
+      dot->dot_dimension_numbers().lhs_contracting_dimensions(0) == 1 &&
+      dot->dot_dimension_numbers().rhs_contracting_dimensions(0) == 0 &&
+      lhs->IsRank2Transpose() && rhs->IsRank2Transpose()) {
     DotDimensionNumbers dot_dimension_numbers;
     dot_dimension_numbers.add_lhs_contracting_dimensions(1);
     dot_dimension_numbers.add_rhs_contracting_dimensions(0);
@@ -1549,7 +2002,7 @@ bool OutputIsPermutationOfOperandElements(HloInstruction* instruction,
     case HloOpcode::kTranspose:
       return true;
     case HloOpcode::kSort:
-      return (!ShapeUtil::IsTuple(instruction->shape()));
+      return (!instruction->shape().IsTuple());
     default:
       return false;
   }
@@ -1595,8 +2048,7 @@ Status AlgebraicSimplifierVisitor::HandleBroadcast(HloInstruction* broadcast) {
 
   // A degenerate broadcast that has the same input and output rank can be
   // converted into a transpose.
-  if (ShapeUtil::Rank(broadcast->shape()) ==
-          ShapeUtil::Rank(operand->shape()) &&
+  if (broadcast->shape().rank() == operand->shape().rank() &&
       ShapeUtil::ElementsIn(broadcast->shape()) ==
           ShapeUtil::ElementsIn(operand->shape())) {
     VLOG(10) << "transform broadcast(X) -> transpose(X) where "
@@ -1751,7 +2203,7 @@ Status AlgebraicSimplifierVisitor::HandlePad(HloInstruction* pad) {
   if (HasInteriorPadding(pad->padding_config())) {
     PaddingConfig padding_config = pad->padding_config();
     bool cleared_interior_padding = false;
-    for (int64 i = 0; i < ShapeUtil::Rank(pad->shape()); ++i) {
+    for (int64 i = 0; i < pad->shape().rank(); ++i) {
       if (padding_config.dimensions(i).interior_padding() > 0 &&
           pad->operand(0)->shape().dimensions(i) == 1) {
         cleared_interior_padding = true;
@@ -2002,14 +2454,151 @@ AlgebraicSimplifierVisitor::TryToSinkBroadcastAfterOpWithUniqueNonScalarOperand(
   return changed;
 }
 
+namespace {
+template <typename T>
+std::unique_ptr<HloInstruction> TryRemainderToAnd(HloInstruction* remainder,
+                                                  HloComputation* computation) {
+  HloInstruction *a, *b, *c;
+  CHECK(Match(remainder, m::Remainder(m::Op(&a), m::Op(&b))));
+
+  if (ShapeUtil::ElementIsIntegral(remainder->shape()) &&
+      !Match(b, m::ConstantEffectiveScalar(&c)) &&
+      !Match(b, m::Broadcast(m::ConstantEffectiveScalar(&c)))) {
+    return nullptr;
+  }
+
+  if (ShapeUtil::ElementIsSigned(remainder->shape())) {
+    int64 b_value = c->literal().GetFirstElement<T>();
+    if (b_value > 0 && IsPowerOfTwo(static_cast<uint64>(b_value))) {
+      // Handle negative dividends by negating the result of the division.
+      HloInstruction* zero_like_a = BroadcastZeros(
+          computation, a->shape().element_type(), a->shape().dimensions());
+
+      auto* dividend_is_negative =
+          computation->AddInstruction(HloInstruction::CreateBinary(
+              ShapeUtil::ChangeElementType(a->shape(), PRED), HloOpcode::kLt, a,
+              zero_like_a));
+
+      auto* negated_dividend = computation->AddInstruction(
+          HloInstruction::CreateUnary(a->shape(), HloOpcode::kNegate, a));
+
+      auto* abs_dividend =
+          computation->AddInstruction(HloInstruction::CreateTernary(
+              a->shape(), HloOpcode::kSelect, dividend_is_negative,
+              negated_dividend, a));
+
+      auto* mask_amount =
+          computation->AddInstruction(HloInstruction::CreateConstant(
+              LiteralUtil::CreateR0<T>(b_value - 1)));
+      if (!ShapeUtil::IsScalar(b->shape())) {
+        mask_amount = computation->AddInstruction(
+            HloInstruction::CreateBroadcast(b->shape(), mask_amount, {}));
+      }
+
+      auto* quotient = computation->AddInstruction(HloInstruction::CreateBinary(
+          remainder->shape(), HloOpcode::kAnd, abs_dividend, mask_amount));
+
+      auto* neqated_quotient =
+          computation->AddInstruction(HloInstruction::CreateUnary(
+              quotient->shape(), HloOpcode::kNegate, quotient));
+
+      return HloInstruction::CreateTernary(
+          remainder->shape(), HloOpcode::kSelect, dividend_is_negative,
+          neqated_quotient, quotient);
+    }
+  } else {
+    uint64 b_value = c->literal().GetFirstElement<T>();
+    if (IsPowerOfTwo(b_value)) {
+      HloInstruction* mask_amount =
+          computation->AddInstruction(HloInstruction::CreateConstant(
+              LiteralUtil::CreateR0<T>(b_value - 1)));
+      if (!ShapeUtil::IsScalar(b->shape())) {
+        mask_amount = computation->AddInstruction(
+            HloInstruction::CreateBroadcast(b->shape(), mask_amount, {}));
+      }
+      return HloInstruction::CreateBinary(remainder->shape(), HloOpcode::kAnd,
+                                          a, mask_amount);
+    }
+  }
+  return nullptr;
+}
+}  // namespace
+
+Status AlgebraicSimplifierVisitor::HandleRemainder(HloInstruction* remainder) {
+  HloInstruction *a, *b;
+  CHECK(Match(remainder, m::Remainder(m::Op(&a), m::Op(&b))));
+
+  // A % B => A & (B - 1) if B is a power of 2.
+  switch (remainder->shape().element_type()) {
+    case S8:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryRemainderToAnd<int8>(remainder, computation_)) {
+        return ReplaceWithNewInstruction(remainder, std::move(shift));
+      }
+      break;
+    case S16:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryRemainderToAnd<int16>(remainder, computation_)) {
+        return ReplaceWithNewInstruction(remainder, std::move(shift));
+      }
+      break;
+    case S32:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryRemainderToAnd<int32>(remainder, computation_)) {
+        return ReplaceWithNewInstruction(remainder, std::move(shift));
+      }
+      break;
+    case S64:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryRemainderToAnd<int64>(remainder, computation_)) {
+        return ReplaceWithNewInstruction(remainder, std::move(shift));
+      }
+      break;
+    case U8:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryRemainderToAnd<uint8>(remainder, computation_)) {
+        return ReplaceWithNewInstruction(remainder, std::move(shift));
+      }
+      break;
+    case U16:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryRemainderToAnd<uint16>(remainder, computation_)) {
+        return ReplaceWithNewInstruction(remainder, std::move(shift));
+      }
+      break;
+    case U32:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryRemainderToAnd<uint32>(remainder, computation_)) {
+        return ReplaceWithNewInstruction(remainder, std::move(shift));
+      }
+      break;
+    case U64:
+      if (std::unique_ptr<HloInstruction> shift =
+              TryRemainderToAnd<uint64>(remainder, computation_)) {
+        return ReplaceWithNewInstruction(remainder, std::move(shift));
+      }
+      break;
+    default:
+      break;
+  }
+
+  return Status::OK();
+}
+
 Status AlgebraicSimplifierVisitor::HandleReshape(HloInstruction* reshape) {
   auto operand = reshape->mutable_operand(0);
 
   // Reshape directly to empty constant if the shape contains zero-element
   // dimension.
   if (ShapeUtil::IsZeroElementArray(reshape->shape())) {
+    // If the instruction doesn't have a layout, use a default layout for
+    // the literal result.
+    Shape reshaped_shape = reshape->shape();
+    if (!LayoutUtil::HasLayout(reshaped_shape)) {
+      LayoutUtil::SetToDefaultLayout(&reshaped_shape);
+    }
     auto empty_constant = HloInstruction::CreateConstant(
-        Literal::CreateFromShape(reshape->shape()));
+        Literal::CreateFromShape(reshaped_shape));
 
     return ReplaceWithNewInstruction(reshape, std::move(empty_constant));
   }
@@ -2026,6 +2615,7 @@ Status AlgebraicSimplifierVisitor::HandleReshape(HloInstruction* reshape) {
         reshape, HloInstruction::CreateReshape(reshape->shape(),
                                                operand->mutable_operand(0)));
   }
+
   if (operand->opcode() == HloOpcode::kRng && operand->user_count() == 1) {
     *operand->mutable_shape() = reshape->shape();
     return ReplaceInstruction(reshape, operand);
@@ -2057,12 +2647,10 @@ Status AlgebraicSimplifierVisitor::HandleReshape(HloInstruction* reshape) {
   }
 
   // Make this a bitcast if possible.
-  if (options_.is_layout_sensitive() &&
-      ReshapeOrCopyIsBitcast(reshape, options_.valid_bitcast_callback())) {
-    ReplaceWithBitcast(reshape);
-    return Status::OK();
+  if (HloInstruction* bitcast_operand =
+          BitcastingOperandOfReshapeOrCopyChain(reshape, options_)) {
+    ReplaceWithBitcast(reshape, bitcast_operand);
   }
-
   return Status::OK();
 }
 
@@ -2072,8 +2660,7 @@ Status AlgebraicSimplifierVisitor::HandleReverse(HloInstruction* reverse) {
   auto dim_is_one = [&](int64 i) -> bool {
     return reverse->shape().dimensions(i) == 1;
   };
-  if (std::all_of(reverse->dimensions().begin(), reverse->dimensions().end(),
-                  dim_is_one)) {
+  if (absl::c_all_of(reverse->dimensions(), dim_is_one)) {
     return ReplaceInstruction(reverse, reverse->mutable_operand(0));
   }
   return Status::OK();
@@ -2106,11 +2693,11 @@ StatusOr<bool> AlgebraicSimplifierVisitor::TrySimplifyScalarSlice(
         int64 start = slice->slice_starts(i);
         int64 low = padding_config.dimensions(i).edge_padding_low();
         int64 data = pad->operand(0)->shape().dimensions(i);
-        if (start >= low && start < low + data) {
-          return false;
+        if (start < low || start >= low + data) {
+          return true;
         }
       }
-      return true;
+      return false;
     }();
 
     if (in_padding) {
@@ -2138,7 +2725,7 @@ StatusOr<bool> AlgebraicSimplifierVisitor::TrySimplifyScalarSlice(
   if (slice->operand(0)->opcode() == HloOpcode::kConcatenate) {
     VLOG(10) << "Trying to simplify scalar slice of concat";
     // Only do this for R1, there's no chance of this being useful otherwise.
-    if (ShapeUtil::Rank(slice->shape()) != 1) {
+    if (slice->shape().rank() != 1) {
       VLOG(10) << "Not folding, slice is not rank 1";
       return false;
     }
@@ -2188,7 +2775,7 @@ StatusOr<bool> AlgebraicSimplifierVisitor::TryToReorderSliceAndReshape(
     return false;
   }
   HloInstruction* new_slice_operand = reshape->mutable_operand(0);
-  int64 slice_rank = ShapeUtil::Rank(slice->shape());
+  int64 slice_rank = slice->shape().rank();
   std::vector<int64> sliced_dims;
   for (int64 i = 0; i < slice_rank; ++i) {
     if (slice->slice_starts(i) != 0 ||
@@ -2200,7 +2787,7 @@ StatusOr<bool> AlgebraicSimplifierVisitor::TryToReorderSliceAndReshape(
   if (sliced_dims.size() == 1 && sliced_dims[0] == 0 &&
       slice->slice_starts(0) == 0) {
     const Shape& new_slice_shape = new_slice_operand->shape();
-    const int64 rank = ShapeUtil::Rank(new_slice_shape);
+    const int64 rank = new_slice_shape.rank();
     std::vector<int64> new_slice_starts(rank, 0);
     std::vector<int64> new_slice_stides(rank, 1);
     std::vector<int64> new_slice_limits(new_slice_shape.dimensions().begin(),
@@ -2297,28 +2884,71 @@ Status AlgebraicSimplifierVisitor::HandleDynamicUpdateSlice(
   return Status::OK();
 }
 
-Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) {
-  // TODO(b/112040122): Most of those optimizations can be done for multi-output
-  // reduces.
-  if (ShapeUtil::IsTuple(reduce->shape())) {
-    return Status::OK();
-  }
+Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* hlo) {
+  HloReduceInstruction* reduce = Cast<HloReduceInstruction>(hlo);
+  bool multi_output_reduce = reduce->shape().IsTuple();
+
+  // For tuple reduce, we require all reduce shapes to be the same, up to the
+  // element types, so we can just the first operand and the first result as a
+  // representative.
+  auto arg = reduce->inputs()[0];
+  auto init_value = reduce->init_values()[0];
+  const Shape& reduce_result_shape =
+      multi_output_reduce ? reduce->shape().tuple_shapes(0) : reduce->shape();
 
-  auto arg = reduce->mutable_operand(0);
-  auto init_value = reduce->mutable_operand(1);
   absl::Span<const int64> dimensions(reduce->dimensions());
   HloComputation* function = reduce->to_apply();
   if (ShapeUtil::IsZeroElementArray(arg->shape()) ||
-      ShapeUtil::IsZeroElementArray(reduce->shape())) {
-    return ReplaceWithNewInstruction(
-        reduce,
-        HloInstruction::CreateBroadcast(reduce->shape(), init_value, {}));
+      ShapeUtil::IsZeroElementArray(reduce_result_shape)) {
+    if (multi_output_reduce) {
+      std::vector<HloInstruction*> broadcast_inits;
+      int64 inputs = reduce->input_count();
+      for (int64 i = 0; i < inputs; ++i) {
+        broadcast_inits.push_back(computation_->AddInstruction(
+            HloInstruction::CreateBroadcast(reduce->shape().tuple_shapes(i),
+                                            reduce->init_values()[i], {})));
+      }
+      return ReplaceWithNewInstruction(
+          reduce, HloInstruction::CreateTuple(broadcast_inits));
+    } else {
+      return ReplaceWithNewInstruction(
+          reduce,
+          HloInstruction::CreateBroadcast(reduce_result_shape, init_value, {}));
+    }
+  }
+
+  // If the reduction results in the same number of elements, then the only
+  // possible side effect would be a reshape. Since the init_value is an
+  // identity of the reduction function, we can therefore replace the reduce
+  // with a simple reshape, ignoring the reduction function completely.
+  if (ShapeUtil::ElementsIn(reduce_result_shape) ==
+      ShapeUtil::ElementsIn(arg->shape())) {
+    if (multi_output_reduce) {
+      std::vector<HloInstruction*> reshaped_args;
+      int64 inputs = reduce->input_count();
+      for (int64 i = 0; i < inputs; ++i) {
+        reshaped_args.push_back(
+            computation_->AddInstruction(HloInstruction::CreateReshape(
+                reduce->shape().tuple_shapes(i), reduce->inputs()[i])));
+      }
+      return ReplaceWithNewInstruction(
+          reduce, HloInstruction::CreateTuple(reshaped_args));
+    } else {
+      return ReplaceWithNewInstruction(
+          reduce, HloInstruction::CreateReshape(reduce_result_shape, arg));
+    }
+  }
+
+  // TODO(b/112040122): Most of those optimizations below can be done for
+  // multi-output reduces.
+  if (multi_output_reduce) {
+    return Status::OK();
   }
 
   // A Transpose feeding a reduce can simply permute the reduction dimensions
   // field if the output of the reduce is a vector or scalar. Higher ranked
   // result may require a transpose of the output.
-  if (ShapeUtil::Rank(reduce->shape()) <= 1 &&
+  if (reduce_result_shape.rank() <= 1 &&
       arg->opcode() == HloOpcode::kTranspose) {
     auto transpose_dimensions = arg->dimensions();
     std::vector<int64> new_reduce_dimensions;
@@ -2327,20 +2957,10 @@ Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) {
     }
     return ReplaceWithNewInstruction(
         reduce, HloInstruction::CreateReduce(
-                    reduce->shape(), arg->mutable_operand(0), init_value,
+                    reduce_result_shape, arg->mutable_operand(0), init_value,
                     new_reduce_dimensions, function));
   }
 
-  // If the reduction results in the same number of elements, then the only
-  // possible side effect would be a reshape. Since the init_value is an
-  // identity of the reduction function, we can therefore replace the reduce
-  // with a simple reshape, ignoring the reduction function completely.
-  if (ShapeUtil::ElementsIn(reduce->shape()) ==
-      ShapeUtil::ElementsIn(arg->shape())) {
-    return ReplaceWithNewInstruction(
-        reduce, HloInstruction::CreateReshape(reduce->shape(), arg));
-  }
-
   // If a reduce feeds a reduce with the same computation and initial value,
   // they can be combined into a single reduce.
   if (arg->opcode() == HloOpcode::kReduce &&
@@ -2349,9 +2969,9 @@ Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) {
     // Create a new reduce with the combined reduction dimensions of both
     // reduces.
     std::vector<int64> arg_dims = arg->dimensions();
-    std::sort(arg_dims.begin(), arg_dims.end());
+    absl::c_sort(arg_dims);
     std::vector<int64> reduce_dims = reduce->dimensions();
-    std::sort(reduce_dims.begin(), reduce_dims.end());
+    absl::c_sort(reduce_dims);
     // Transform reduce_dims to the same rank as the operand of the operand.
     for (int64 arg_dim : arg_dims) {
       for (int64& dim : reduce_dims) {
@@ -2366,9 +2986,9 @@ Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) {
     std::merge(arg_dims.begin(), arg_dims.end(), reduce_dims.begin(),
                reduce_dims.end(), std::back_inserter(new_dimensions));
     return ReplaceWithNewInstruction(
-        reduce,
-        HloInstruction::CreateReduce(reduce->shape(), arg->mutable_operand(0),
-                                     init_value, new_dimensions, function));
+        reduce, HloInstruction::CreateReduce(
+                    reduce_result_shape, arg->mutable_operand(0), init_value,
+                    new_dimensions, function));
   }
 
   // A reshape that collapses multiple dimensions into a dimension being
@@ -2378,8 +2998,8 @@ Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) {
     std::vector<std::pair<int64, int64>> unmodified_dims =
         ShapeUtil::DimensionsUnmodifiedByReshape(arg->operand(0)->shape(),
                                                  arg->shape());
-    std::vector<bool> arg_dim_in_output(ShapeUtil::Rank(arg->shape()), true);
-    std::vector<bool> arg_dim_unmodified(ShapeUtil::Rank(arg->shape()), false);
+    std::vector<bool> arg_dim_in_output(arg->shape().rank(), true);
+    std::vector<bool> arg_dim_unmodified(arg->shape().rank(), false);
     for (auto dim : dimensions) {
       arg_dim_in_output[dim] = false;
     }
@@ -2397,21 +3017,21 @@ Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) {
     }
     if (can_move_reshape_into_reduce) {
       changed_ = true;
-      std::unordered_set<int64> dimensions_not_to_reduce;
+      absl::flat_hash_set<int64> dimensions_not_to_reduce;
       for (auto dim_pair : unmodified_dims) {
         if (arg_dim_in_output[dim_pair.second]) {
           dimensions_not_to_reduce.insert(dim_pair.first);
         }
       }
       std::vector<int64> new_reduce_dimensions;
-      for (int64 i = 0; i < ShapeUtil::Rank(arg->operand(0)->shape()); ++i) {
-        if (dimensions_not_to_reduce.count(i) == 0) {
+      for (int64 i = 0; i < arg->operand(0)->shape().rank(); ++i) {
+        if (!dimensions_not_to_reduce.contains(i)) {
           new_reduce_dimensions.push_back(i);
         }
       }
       return ReplaceWithNewInstruction(
           reduce, HloInstruction::CreateReduce(
-                      reduce->shape(), arg->mutable_operand(0), init_value,
+                      reduce_result_shape, arg->mutable_operand(0), init_value,
                       new_reduce_dimensions, function));
     }
   }
@@ -2426,11 +3046,11 @@ Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) {
     HloInstruction* old_reduce = nullptr;
     for (HloInstruction* operand : arg->operands()) {
       HloInstruction* new_reduce = computation_->AddInstruction(
-          HloInstruction::CreateReduce(reduce->shape(), operand, init_value,
+          HloInstruction::CreateReduce(reduce_result_shape, operand, init_value,
                                        reduce->dimensions(), function));
       if (old_reduce != nullptr) {
         new_reduce = computation_->AddInstruction(HloInstruction::CreateMap(
-            reduce->shape(), {old_reduce, new_reduce}, function));
+            reduce_result_shape, {old_reduce, new_reduce}, function));
       }
       old_reduce = new_reduce;
     }
@@ -2459,6 +3079,55 @@ Status AlgebraicSimplifierVisitor::HandleReduceWindow(
                                   function));
   }
 
+  if (options_.enable_window_reduce_to_reduce_replacement()) {
+    // A reduce window can be expressed as a reduce and a reshape if all
+    // dimensions either have a window size of one or the entire dimension. If
+    // there is no stride, dilation, or padding, this is as easy as checking the
+    // size of the output shape and window dimension.
+    //
+    // The reshape is a bitcast since it adds one-sized dimensions. Often these
+    // ones are immediately removed as well with another reshape. The
+    // implementation of reduce tends to be slightly more efficient at reducing
+    // entire dimensions compared to reduce window.
+    auto effective_reduce_dims = [&] {
+      if (window_util::HasStride(window) || window_util::HasDilation(window) ||
+          window_util::HasPadding(window)) {
+        return absl::InlinedVector<int64, 8>{};
+      }
+      absl::InlinedVector<int64, 8> reduce_dims;
+      for (int64 i = 0; i < window.dimensions_size(); ++i) {
+        if (window.dimensions(i).size() == 1) {
+          continue;
+        } else if (reduce_window->shape().dimensions(i) == 1) {
+          reduce_dims.push_back(i);
+        } else {
+          return absl::InlinedVector<int64, 8>{};
+        }
+      }
+      return reduce_dims;
+    }();
+
+    // If a reduce window can be expressed as a reduce, do so and reshape the
+    // output.
+    if (!effective_reduce_dims.empty()) {
+      Shape reduce_shape = ShapeUtil::FilterDimensions(
+          [&](int64 dim) {
+            return !absl::c_linear_search(effective_reduce_dims, dim);
+          },
+          reduce_window->shape());
+      HloInstruction* reduce =
+          computation_->AddInstruction(HloInstruction::CreateReduce(
+              /*shape=*/reduce_shape,
+              /*operand=*/operand,
+              /*init_value=*/reduce_window->mutable_operand(1),
+              /*dimensions_to_reduce=*/effective_reduce_dims,
+              /*reduce_computation=*/function));
+      return ReplaceWithNewInstruction(
+          reduce_window,
+          HloInstruction::CreateReshape(reduce_window->shape(), reduce));
+    }
+  }
+
   // This optimization folds a pad op into reduce_window.
   HloInstruction* pad;
   const HloInstruction* convert = nullptr;
@@ -2594,7 +3263,7 @@ Status AlgebraicSimplifierVisitor::HandleReduceWindow(
   // Carry out the folding of the pad into reduce_window.
   VLOG(10) << "Folding pad into reduce-window.";
   Window new_window = window;
-  const int64 rank = ShapeUtil::Rank(reduce_window->shape());
+  const int64 rank = reduce_window->shape().rank();
   TF_RET_CHECK(pad_config.dimensions_size() == rank);
   TF_RET_CHECK(window.dimensions_size() == rank);
   for (int64 i = 0; i < rank; ++i) {
@@ -2643,110 +3312,24 @@ Status AlgebraicSimplifierVisitor::HandleSort(HloInstruction* sort) {
     return ReplaceWithNewInstruction(
         sort, HloInstruction::CreateTuple(sort->operands()));
   }
-  if (!options_.enable_permutation_sort_replacement()) {
-    return Status::OK();
-  }
-  // Check if we are sorting a permutation. In that case, we know that the keys
-  // will be sorted to the identity permutation, and we can represent the
-  // changes to the 'values' parameter as a scatter.
-  if (sort->operand_count() == 2 &&
-      operand->opcode() == HloOpcode::kGetTupleElement) {
-    const HloInstruction* other_sort = operand->operand(0);
-    // Check whether the 'values' parameter is the result of another sort with
-    // the same sort dimension.
-    if (other_sort->opcode() == HloOpcode::kSort &&
-        other_sort->operand_count() >= 2 &&
-        other_sort->dimensions(0) == dimension_to_sort &&
-        other_sort->operand(operand->tuple_index())->opcode() ==
-            HloOpcode::kIota) {
-      auto* iota =
-          Cast<HloIotaInstruction>(other_sort->operand(operand->tuple_index()));
-      // The sort operand needs to be an integral iota, and the iota dimension
-      // needs to be the dimension that was sorted.
-      if (iota->iota_dimension() == dimension_to_sort &&
-          ShapeUtil::ElementIsIntegral(iota->shape())) {
-        // We use the following construction method for a Scatter that applies
-        // the permutation from 'keys' to the 'values' parameter.
-        // - Take the "keys" parameter of the second sort and reshape it to have
-        //   another "1" dimension at the end.
-        // - Concatenate it with iotas of the same extended shape with all
-        //   different iota_dimensions except the dimension_to_sort in the order
-        //   of iota_dimensions/dimension_to_sort, so e.g. with rank 3 and
-        //   dimension_to_sort = 1, we would have concatenate of (iota with
-        //   iota_dimension=0, keys, iota with iota_dimension = 2)
-        // - Use this as the indices parameter of scatter, and set updates
-        //   of the scatter to be a reshaped 'values' parameter of sort (adding
-        //   'rank' many 1 dimensions at the end).
-        int64 rank = ShapeUtil::Rank(operand->shape());
-        Shape extended_shape = operand->shape();
-        extended_shape.add_dimensions(1);
-        extended_shape.mutable_layout()->add_minor_to_major(rank);
-        auto reshaped_permutation = computation_->AddInstruction(
-            HloInstruction::CreateReshape(extended_shape, operand));
-        std::vector<HloInstruction*> concat_operands;
-        for (int64 i = 0; i < rank; ++i) {
-          if (i == dimension_to_sort) {
-            concat_operands.push_back(reshaped_permutation);
-          } else {
-            concat_operands.push_back(computation_->AddInstruction(
-                HloInstruction::CreateIota(extended_shape, i)));
-          }
-        }
-        Shape concat_shape = operand->shape();
-        concat_shape.add_dimensions(rank);
-        concat_shape.mutable_layout()->add_minor_to_major(rank);
-        auto scatter_indices =
-            rank > 1 ? computation_->AddInstruction(
-                           HloInstruction::CreateConcatenate(
-                               concat_shape, concat_operands, rank))
-                     : reshaped_permutation;
-
-        // We don't care about the operand, it will be completely overridden by
-        // the updates.
-        auto scatter_operand = computation_->AddInstruction(
-            HloInstruction::CreateIota(sort->operand(1)->shape(), 0));
-
-        // Construct the updates operand of scatter.
-        Shape update_shape = sort->operand(1)->shape();
-        for (int64 i = 0; i < rank; ++i) {
-          update_shape.add_dimensions(1);
-          update_shape.mutable_layout()->add_minor_to_major(rank + i);
-        }
-        auto scatter_updates =
-            computation_->AddInstruction(HloInstruction::CreateReshape(
-                update_shape, sort->mutable_operand(1)));
-
-        // Construct the updates computation, which simply replaces the operand
-        // values with the update values.
-        HloComputation::Builder b("update_replace_computation");
-        Shape scalar_shape = ShapeUtil::MakeShape(S32, {});
-        b.AddInstruction(
-            HloInstruction::CreateParameter(0, scalar_shape, "scalar_lhs"));
-        auto scalar_rhs = b.AddInstruction(
-            HloInstruction::CreateParameter(1, scalar_shape, "scalar_rhs"));
-        auto update_replace_computation =
-            computation_->parent()->AddEmbeddedComputation(b.Build(scalar_rhs));
-
-        ScatterDimensionNumbers dim_numbers;
-        dim_numbers.set_index_vector_dim(rank);
-        for (int64 i = 0; i < rank; ++i) {
-          dim_numbers.add_update_window_dims(rank + i);
-          dim_numbers.add_scatter_dims_to_operand_dims(i);
-        }
-        auto scatter =
-            computation_->AddInstruction(HloInstruction::CreateScatter(
-                sort->operand(1)->shape(), scatter_operand, scatter_indices,
-                scatter_updates, update_replace_computation, dim_numbers));
-        return ReplaceWithNewInstruction(
-            sort, HloInstruction::CreateTuple(
-                      {computation_->AddInstruction(HloInstruction::CreateIota(
-                           operand->shape(), dimension_to_sort)),
-                       scatter}));
-      }
+  return Status::OK();
+}
+
+namespace {
+bool OnlyPermutesMoreThanOneDegenerateDim(const Shape& shape,
+                                          absl::Span<const int64> perm) {
+  std::vector<int64> new_permutation;
+  int64 degenerate_count = 0;
+  for (int64 i = 0; i < perm.size(); ++i) {
+    if (shape.dimensions(i) != 1) {
+      new_permutation.push_back(perm[i]);
+    } else {
+      ++degenerate_count;
     }
   }
-  return Status::OK();
+  return degenerate_count > 1 && absl::c_is_sorted(new_permutation);
 }
+}  // namespace
 
 Status AlgebraicSimplifierVisitor::HandleTranspose(HloInstruction* transpose) {
   auto operand = transpose->mutable_operand(0);
@@ -2764,6 +3347,15 @@ Status AlgebraicSimplifierVisitor::HandleTranspose(HloInstruction* transpose) {
                                            transpose->dimensions())));
   }
 
+  // Replace transpose with a reshape if more than one degenerate method is
+  // permuted.
+  if (OnlyPermutesMoreThanOneDegenerateDim(transpose->shape(),
+                                           transpose->dimensions())) {
+    return ReplaceWithNewInstruction(
+        transpose, HloInstruction::CreateReshape(
+                       transpose->shape(), transpose->mutable_operand(0)));
+  }
+
   if (operand->opcode() == HloOpcode::kRng && operand->user_count() == 1) {
     *operand->mutable_shape() = transpose->shape();
     return ReplaceInstruction(transpose, operand);
@@ -3011,15 +3603,6 @@ StatusOr<bool> AlgebraicSimplifierVisitor::SimplifyConvToDot(
   const Shape dot_output_shape = ShapeUtil::MakeShapeWithDescendingLayout(
       convolution_shape.element_type(), {conv_width, output_channels});
 
-  // We cannot insert bitcasts if the layouts will not be compatible.
-  // TODO(b/33178038): Consider inserting a transpose if a bitcast would be
-  // invalid.
-  if (!options_.valid_bitcast_callback()(input_shape, new_input_shape) ||
-      !options_.valid_bitcast_callback()(filter_shape, new_filter_shape) ||
-      !options_.valid_bitcast_callback()(dot_output_shape, convolution_shape)) {
-    return false;
-  }
-
   auto new_lhs = add_bitcast(new_input_shape, lhs);
   auto new_rhs = add_bitcast(new_filter_shape, rhs);
   DotDimensionNumbers dot_dimension_numbers;
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.h b/tensorflow/compiler/xla/service/algebraic_simplifier.h
index d2775b9fafa7e4c625f5d181114e80e7369f9c78..df5a8c2ec141458a95fafb76b1e99e4b04a61b28 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.h
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.h
@@ -25,21 +25,25 @@ namespace xla {
 
 class AlgebraicSimplifierOptions {
  public:
-  // Given shapes 'from_shape' and 'to_shape', determines if it is valid to
-  // bitcast from 'from_shape' to 'to_shape' after considering platform
-  // dependent effects on layout like alignment restrictions. Precondition: the
-  // two shapes have layouts, the same number of elements and
-  // ShapeUtil::ReshapeIsBitcast returns true.
-  using ValidBitcastCallback =
+  AlgebraicSimplifierOptions() {}
+  // Platform dependent callback to determine if a reshape `from_shape` to
+  // `to_shape` is a bitcast.
+  using ReshapeIsBitcastCallback =
       std::function<bool(const Shape& from_shape, const Shape& to_shape)>;
-
   explicit AlgebraicSimplifierOptions(
-      ValidBitcastCallback valid_bitcast_callback)
-      : valid_bitcast_callback_(std::move(valid_bitcast_callback)) {}
-  // If valid_bitcast_callback returns true, then the pass will replace reshapes
-  // and transposes with bitcasts.
-  const ValidBitcastCallback& valid_bitcast_callback() const {
-    return valid_bitcast_callback_;
+      ReshapeIsBitcastCallback reshape_is_bitcast_callback)
+      : reshape_is_bitcast_callback_(std::move(reshape_is_bitcast_callback)) {}
+
+  // Use the platform specific callback if set. It is not sensible to return
+  // true here if the options are not layout sensitive.
+  bool ReshapeIsBitcast(const Shape& from_shape, const Shape& to_shape) const {
+    if (!is_layout_sensitive_) {
+      return false;
+    }
+    if (!reshape_is_bitcast_callback_) {
+      return ShapeUtil::ReshapeIsBitcast(from_shape, to_shape);
+    }
+    return reshape_is_bitcast_callback_(from_shape, to_shape);
   }
 
   // If is_layout_sensitive is true, then the simplifier preserves layout during
@@ -47,12 +51,14 @@ class AlgebraicSimplifierOptions {
   void set_is_layout_sensitive(bool is_layout_sensitive) {
     is_layout_sensitive_ = is_layout_sensitive;
   }
+
   bool is_layout_sensitive() const { return is_layout_sensitive_; }
 
   // Enable dot simplification on platforms where it is profitable.
   void set_enable_dot_strength_reduction(bool enable_dot_strength_reduction) {
     enable_dot_strength_reduction_ = enable_dot_strength_reduction;
   }
+
   bool enable_dot_strength_reduction() const {
     return enable_dot_strength_reduction_;
   }
@@ -65,22 +71,24 @@ class AlgebraicSimplifierOptions {
     return enable_conv_simplification_;
   }
 
-  // If enable_permutation_sort_replacement is true, a sort op that is known to
-  // sort a permutation will be replaced with a scatter op.
-  void set_enable_permutation_sort_replacement(
-      bool enable_permutation_sort_replacement) {
-    enable_permutation_sort_replacement_ = enable_permutation_sort_replacement;
+  // If enable_window_reduce_replacement is true, the kReduceWindow instruction
+  // can be optimized by replacement with simpler operations.
+  void set_enable_window_reduce_to_reduce_replacement(
+      bool enable_window_reduce_to_reduce_replacement) {
+    enable_window_reduce_to_reduce_replacement_ =
+        enable_window_reduce_to_reduce_replacement;
   }
-  bool enable_permutation_sort_replacement() const {
-    return enable_permutation_sort_replacement_;
+
+  bool enable_window_reduce_to_reduce_replacement() const {
+    return enable_window_reduce_to_reduce_replacement_;
   }
 
  private:
-  ValidBitcastCallback valid_bitcast_callback_;
+  ReshapeIsBitcastCallback reshape_is_bitcast_callback_;
   bool is_layout_sensitive_{false};
   bool enable_dot_strength_reduction_{true};
   bool enable_conv_simplification_{true};
-  bool enable_permutation_sort_replacement_{false};
+  bool enable_window_reduce_to_reduce_replacement_{true};
 };
 
 // A pass which performs algebraic simplifications.
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
index 14ce519b6a0fd221070006d336d23bddeb6cd621..06f6206a3b3d0007dc4b6a91395babb510bf023e 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_creation_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_instructions.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -46,17 +47,9 @@ namespace {
 using ::testing::ElementsAre;
 namespace m = match;
 
-AlgebraicSimplifierOptions::ValidBitcastCallback bitcasting_callback() {
-  return [](const Shape&, const Shape&) { return true; };
-}
-
-AlgebraicSimplifierOptions::ValidBitcastCallback non_bitcasting_callback() {
-  return [](const Shape&, const Shape&) { return false; };
-}
-
 class AlgebraicSimplifierTest : public HloTestBase {
  protected:
-  AlgebraicSimplifierOptions default_options_{non_bitcasting_callback()};
+  AlgebraicSimplifierOptions default_options_;
 };
 
 // Test that A + 0 is simplified to A
@@ -202,6 +195,86 @@ TEST_F(AlgebraicSimplifierTest, FactorFpAdditionBfloat16) {
                   m::Broadcast(m::ConstantScalar(0.125)))));
 }
 
+TEST_F(AlgebraicSimplifierTest, UnsignedDivideByPowerOf2) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p = u32[4] parameter(0)
+      c = u32[] constant(8)
+      b = u32[4] broadcast(c), dimensions={}
+      ROOT d = u32[4] divide(p, b)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::ShiftRightLogical(
+                  m::Parameter(0), m::Broadcast(m::ConstantScalar(3)))));
+}
+
+TEST_F(AlgebraicSimplifierTest, SignedDivideByPowerOf2) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p = s32[4] parameter(0)
+      c = s32[] constant(8)
+      b = s32[4] broadcast(c), dimensions={}
+      ROOT d = s32[4] divide(p, b)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+  auto match_dividend_is_negative =
+      m::Lt(m::Parameter(0), m::Broadcast(m::ConstantScalar(0)));
+  auto match_abs = m::Select(match_dividend_is_negative,
+                             m::Negate(m::Parameter(0)), m::Parameter(0));
+  auto match_shift =
+      m::ShiftRightLogical(match_abs, m::Broadcast(m::ConstantScalar(3)));
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::Select(match_dividend_is_negative,
+                                   m::Negate(match_shift), match_shift)));
+}
+
+TEST_F(AlgebraicSimplifierTest, UnsignedRemainderByPowerOf2) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p = u32[4] parameter(0)
+      c = u32[] constant(8)
+      b = u32[4] broadcast(c), dimensions={}
+      ROOT r = u32[4] remainder(p, b)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::AndAnyOrder(m::Parameter(0),
+                                        m::Broadcast(m::ConstantScalar(7)))));
+}
+
+TEST_F(AlgebraicSimplifierTest, SignedRemainderByPowerOf2) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p = s32[4] parameter(0)
+      c = s32[] constant(8)
+      b = s32[4] broadcast(c), dimensions={}
+      ROOT r = s32[4] remainder(p, b)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+  auto match_dividend_is_negative =
+      m::Lt(m::Parameter(0), m::Broadcast(m::ConstantScalar(0)));
+  auto match_abs = m::Select(match_dividend_is_negative,
+                             m::Negate(m::Parameter(0)), m::Parameter(0));
+  auto match_and =
+      m::AndAnyOrder(match_abs, m::Broadcast(m::ConstantScalar(7)));
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::Select(match_dividend_is_negative,
+                                   m::Negate(match_and), match_and)));
+}
+
 // Test that A * 0 is simplified to 0
 TEST_F(AlgebraicSimplifierTest, MulZero) {
   auto m = CreateNewVerifiedModule();
@@ -1273,7 +1346,7 @@ TEST_F(AlgebraicSimplifierTest, ZeroSizedConvolution) {
   // Create add computation.
   builder.AddInstruction(HloInstruction::CreateConvolve(
       ShapeUtil::MakeShape(F32, {3, 3, 3}), lhs, rhs, /*feature_group_count=*/1,
-      window, dnums, DefaultPrecisionConfig(2)));
+      /*batch_group_count=*/1, window, dnums, DefaultPrecisionConfig(2)));
   m->AddEntryComputation(builder.Build());
   HloPassFix<AlgebraicSimplifier> simplifier(default_options_);
   EXPECT_THAT(m->entry_computation()->root_instruction(),
@@ -1283,6 +1356,51 @@ TEST_F(AlgebraicSimplifierTest, ZeroSizedConvolution) {
               GmockMatch(m::Broadcast(m::Constant())));
 }
 
+TEST_F(AlgebraicSimplifierTest, ReduceWindowIsReduceAndReshape) {
+  auto m = CreateNewVerifiedModule();
+  auto builder = HloComputation::Builder(TestName());
+  HloInstruction* param =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          0, ShapeUtil::MakeShape(F32, {1, 2, 3, 4}), "param"));
+  Window window;
+  for (int64 i = 0; i < 4; ++i) {
+    WindowDimension* dim = window.add_dimensions();
+    // Makes 1x2x3x1 window.
+    dim->set_size((i % 3) + 1);
+    dim->set_stride(1);
+    dim->set_padding_low(0);
+    dim->set_padding_high(0);
+    dim->set_window_dilation(1);
+    dim->set_base_dilation(1);
+  }
+  // Create add computation.
+  HloComputation* add_computation = nullptr;
+  {
+    HloComputation::Builder builder(TestName() + ".add");
+    const Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
+    HloInstruction* p0 = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, scalar_shape, "p0"));
+    HloInstruction* p1 = builder.AddInstruction(
+        HloInstruction::CreateParameter(1, scalar_shape, "p1"));
+    builder.AddInstruction(
+        HloInstruction::CreateBinary(scalar_shape, HloOpcode::kAdd, p0, p1));
+    add_computation = m->AddEmbeddedComputation(builder.Build());
+  }
+  builder.AddInstruction(HloInstruction::CreateReduceWindow(
+      ShapeUtil::MakeShape(F32, {1, 1, 1, 4}), param,
+      builder.AddInstruction(
+          HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f))),
+      window, add_computation));
+  m->AddEntryComputation(builder.Build());
+  HloPassFix<AlgebraicSimplifier> simplifier(default_options_);
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::ReduceWindow(m::Parameter(0), m::Constant())));
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
+  EXPECT_THAT(
+      m->entry_computation()->root_instruction(),
+      GmockMatch(m::Reshape(m::Reduce(m::Parameter(0), m::Constant()))));
+}
+
 TEST_F(AlgebraicSimplifierTest, ZeroSizedReduceWindow) {
   auto m = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
@@ -1419,23 +1537,77 @@ TEST_F(AlgebraicSimplifierTest, RemoveCopy) {
   EXPECT_THAT(computation->root_instruction(), param0);
 }
 
-TEST_F(AlgebraicSimplifierTest, CopyEqualsBitcast) {
+TEST_F(AlgebraicSimplifierTest, CopyOfReshapeOfCopyEqualsBitcast) {
   auto m = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
   HloInstruction* param =
       builder.AddInstruction(HloInstruction::CreateParameter(
-          0, ShapeUtil::MakeShape(F32, {1, 14, 14, 64}), "param"));
-  *param->mutable_shape()->mutable_layout() =
-      LayoutUtil::MakeLayout({0, 1, 2, 3});
+          0, ShapeUtil::MakeShapeWithLayout(F32, {1, 14, 14, 64}, {3, 2, 1, 0}),
+          "param"));
   HloInstruction* copy = builder.AddInstruction(HloInstruction::CreateUnary(
-      ShapeUtil::MakeShape(F32, {1, 14, 14, 64}), HloOpcode::kCopy, param));
-  *copy->mutable_shape()->mutable_layout() =
-      LayoutUtil::MakeLayout({1, 2, 0, 3});
+      ShapeUtil::MakeShapeWithLayout(F32, {1, 14, 14, 64}, {0, 1, 2, 3}),
+      HloOpcode::kCopy, param));
+  HloInstruction* reshape =
+      builder.AddInstruction(HloInstruction::CreateReshape(
+          ShapeUtil::MakeShapeWithLayout(F32, {14 * 14, 64}, {0, 1}), copy));
+  builder.AddInstruction(HloInstruction::CreateUnary(
+      ShapeUtil::MakeShapeWithLayout(F32, {14 * 14, 64}, {1, 0}),
+      HloOpcode::kCopy, reshape));
+  auto computation = m->AddEntryComputation(builder.Build());
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Copy(m::Reshape(m::Copy(m::Parameter(0))))));
+
+  AlgebraicSimplifierOptions options;
+  options.set_is_layout_sensitive(true);
+  AlgebraicSimplifier simplifier(options);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
+  // Verify that the copy of reshape of copy is replaced.
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Bitcast(m::Parameter(0))));
+}
+
+TEST_F(AlgebraicSimplifierTest, ReshapeOfCopyEqualsBitcast) {
+  auto m = CreateNewVerifiedModule();
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          0, ShapeUtil::MakeShapeWithLayout(F32, {1, 14, 14, 64}, {3, 2, 1, 0}),
+          "param"));
+  HloInstruction* copy = builder.AddInstruction(HloInstruction::CreateUnary(
+      ShapeUtil::MakeShapeWithLayout(F32, {1, 14, 14, 64}, {0, 1, 2, 3}),
+      HloOpcode::kCopy, param));
+  builder.AddInstruction(HloInstruction::CreateReshape(
+      ShapeUtil::MakeShapeWithLayout(F32, {14 * 14, 64}, {1, 0}), copy));
+
+  auto computation = m->AddEntryComputation(builder.Build());
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Reshape(m::Copy(m::Parameter(0)))));
+
+  AlgebraicSimplifierOptions options;
+  options.set_is_layout_sensitive(true);
+  AlgebraicSimplifier simplifier(options);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
+  // Verify that the copy of reshape of copy is replaced.
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Bitcast(m::Parameter(0))));
+}
+
+TEST_F(AlgebraicSimplifierTest, CopyEqualsBitcast) {
+  auto m = CreateNewVerifiedModule();
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          0, ShapeUtil::MakeShapeWithLayout(F32, {1, 14, 14, 64}, {0, 1, 2, 3}),
+          "param"));
+  builder.AddInstruction(HloInstruction::CreateUnary(
+      ShapeUtil::MakeShapeWithLayout(F32, {1, 14, 14, 64}, {1, 2, 0, 3}),
+      HloOpcode::kCopy, param));
   auto computation = m->AddEntryComputation(builder.Build());
   EXPECT_THAT(computation->root_instruction(),
               GmockMatch(m::Copy(m::Parameter(0))));
 
-  AlgebraicSimplifierOptions options(non_bitcasting_callback());
+  AlgebraicSimplifierOptions options(
+      [](const Shape&, const Shape&) { return false; });
   options.set_is_layout_sensitive(true);
   AlgebraicSimplifier simplifier1(options);
   ASSERT_FALSE(simplifier1.Run(m.get()).ValueOrDie());
@@ -1443,10 +1615,10 @@ TEST_F(AlgebraicSimplifierTest, CopyEqualsBitcast) {
   EXPECT_THAT(computation->root_instruction(),
               GmockMatch(m::Copy(m::Parameter(0))));
 
-  AlgebraicSimplifierOptions options2(bitcasting_callback());
+  AlgebraicSimplifierOptions options2;
   options2.set_is_layout_sensitive(true);
   AlgebraicSimplifier simplifier2(options2);
-  ASSERT_TRUE(simplifier2.Run(m.get()).ValueOrDie());
+  EXPECT_TRUE(simplifier2.Run(m.get()).ValueOrDie());
   // Verify that the copy is replaced.
   EXPECT_THAT(computation->root_instruction(),
               GmockMatch(m::Bitcast(m::Parameter(0))));
@@ -1699,7 +1871,7 @@ TEST_F(AlgebraicSimplifierTest, CopyWithDifferentLayout) {
   EXPECT_THAT(computation->root_instruction(),
               GmockMatch(m::Copy(m::Parameter(0))));
 
-  AlgebraicSimplifierOptions options(non_bitcasting_callback());
+  AlgebraicSimplifierOptions options;
   options.set_is_layout_sensitive(true);
   AlgebraicSimplifier simplifier(options);
   EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie());
@@ -1729,7 +1901,7 @@ TEST_F(AlgebraicSimplifierTest, CopyWithSameLayout) {
   EXPECT_THAT(computation->root_instruction(),
               GmockMatch(m::Copy(m::Parameter(0))));
 
-  AlgebraicSimplifierOptions options(non_bitcasting_callback());
+  AlgebraicSimplifierOptions options;
   options.set_is_layout_sensitive(true);
   AlgebraicSimplifier simplifier(options);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
@@ -1759,7 +1931,8 @@ TEST_F(AlgebraicSimplifierTest, NoBitcastAdded) {
   EXPECT_THAT(computation->root_instruction(),
               GmockMatch(m::Reshape(m::Parameter(0))));
 
-  AlgebraicSimplifierOptions options(non_bitcasting_callback());
+  AlgebraicSimplifierOptions options(
+      [](const Shape&, const Shape&) { return false; });
   options.set_is_layout_sensitive(true);
   AlgebraicSimplifier simplifier(options);
   EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie());
@@ -1790,8 +1963,7 @@ TEST_F(AlgebraicSimplifierTest, ReshapeOfTransposeOfRngToRng) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  AlgebraicSimplifier simplifier(
-      (AlgebraicSimplifierOptions(bitcasting_callback())));
+  AlgebraicSimplifier simplifier(AlgebraicSimplifierOptions{});
   EXPECT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   // Verify that reshape(transpose(rng)) is replace by a single rng of the
@@ -1842,7 +2014,7 @@ TEST_F(AlgebraicSimplifierTest, ReshapeReplacedWithBitcast) {
                                   m::Op().Is(dimensions_wrong_reshape),
                                   m::Op().Is(layout_wrong_reshape))));
 
-  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifierOptions options;
   options.set_is_layout_sensitive(true);
   AlgebraicSimplifier simplifier(options);
   simplifier.Run(m.get()).ValueOrDie();
@@ -1872,8 +2044,7 @@ TEST_F(AlgebraicSimplifierTest, FailureToSinkReshapeDoesntAffectChangedBit) {
   builder.AddInstruction(
       HloInstruction::CreateReshape(ShapeUtil::MakeShape(F32, {4}), add));
 
-  AlgebraicSimplifier simplifier(
-      (AlgebraicSimplifierOptions(bitcasting_callback())));
+  AlgebraicSimplifier simplifier(AlgebraicSimplifierOptions{});
   m->AddEntryComputation(builder.Build());
   EXPECT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 }
@@ -1897,8 +2068,7 @@ TEST_F(AlgebraicSimplifierTest, FailureToSinkBroadcastDoesntAffectChangedBit) {
       HloInstruction::CreateBroadcast(ShapeUtil::MakeShape(F32, {2, 2, 2}), add,
                                       /*broadcast_dimensions=*/{0, 1}));
 
-  AlgebraicSimplifier simplifier(
-      (AlgebraicSimplifierOptions(bitcasting_callback())));
+  AlgebraicSimplifier simplifier(AlgebraicSimplifierOptions{});
   m->AddEntryComputation(builder.Build());
   EXPECT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 }
@@ -1923,7 +2093,7 @@ TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast1) {
   EXPECT_THAT(computation->root_instruction(),
               GmockMatch(m::Transpose(m::Parameter(0))));
 
-  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifierOptions options;
   options.set_is_layout_sensitive(true);
   AlgebraicSimplifier simplifier(options);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
@@ -1953,7 +2123,7 @@ TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast2) {
   EXPECT_THAT(computation->root_instruction(),
               GmockMatch(m::Transpose(m::Parameter(0))));
 
-  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifierOptions options;
   options.set_is_layout_sensitive(true);
   AlgebraicSimplifier simplifier(options);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
@@ -2010,7 +2180,7 @@ TEST_F(AlgebraicSimplifierTest, CopiesMerged) {
   EXPECT_THAT(computation->root_instruction(),
               GmockMatch(m::Copy(m::Copy(m::Parameter(0)))));
 
-  AlgebraicSimplifierOptions options(non_bitcasting_callback());
+  AlgebraicSimplifierOptions options;
   options.set_is_layout_sensitive(true);
   AlgebraicSimplifier simplifier(options);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
@@ -2047,6 +2217,26 @@ TEST_F(AlgebraicSimplifierTest, TransposesMerged) {
             computation->root_instruction()->dimensions());
 }
 
+TEST_F(AlgebraicSimplifierTest, TransposeIsReshape) {
+  const char* hlo_string = R"(
+    HloModule module
+
+    ENTRY test {
+      param = f32[10] parameter(0)
+      reshaped = f32[1,1,10] reshape(f32[10] param)
+      transposed = f32[10,1,1] transpose(f32[1,1,10] reshaped), dimensions={2,1,0}
+      ROOT reshaped_again = f32[10] reshape(f32[10,1,1] transposed)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  HloPassFix<AlgebraicSimplifier> simplifier(default_options_);
+  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::Parameter()));
+}
+
 // Test merging reshape and broadcast.
 TEST_F(AlgebraicSimplifierTest, ReshapeAndBroadcastMerged) {
   auto m = CreateNewVerifiedModule();
@@ -2558,93 +2748,23 @@ TEST_F(AlgebraicSimplifierTest, SliceOfReshapeUnchanged) {
 
 TEST_F(AlgebraicSimplifierTest, RemoveNoopSort) {
   auto builder = HloComputation::Builder(TestName());
+  auto module = CreateNewVerifiedModule();
 
   Shape keys_shape = ShapeUtil::MakeShape(F32, {1});
   auto keys = builder.AddInstruction(
       HloInstruction::CreateParameter(0, keys_shape, "keys"));
-  builder.AddInstruction(HloInstruction::CreateSort(keys_shape, 0, keys));
-  auto module = CreateNewVerifiedModule();
+  TF_ASSERT_OK(MakeSortHlo(keys_shape, {keys}, 0, /*is_stable=*/false, &builder,
+                           module.get())
+                   .status());
   HloComputation* computation = module->AddEntryComputation(builder.Build());
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   EXPECT_THAT(computation->root_instruction(), keys);
 }
 
-TEST_F(AlgebraicSimplifierTest, ReplacePermutationSortWithScatter) {
-  const char* hlo_string = R"(
-    HloModule permutation_sort
-
-    ENTRY sort_computation {
-      keys = f32[64,8732]{1,0} parameter(0)
-      values = s32[64,8732]{1,0} iota(), iota_dimension=1
-      sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values), dimensions={1}
-      gte = s32[64,8732]{1,0} get-tuple-element(sort), index=1
-      ROOT sort2 = (s32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(gte, values), dimensions={1}
-    }
-  )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
-
-  AlgebraicSimplifierOptions options(non_bitcasting_callback());
-  options.set_enable_permutation_sort_replacement(true);
-  AlgebraicSimplifier simplifier(options);
-  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
-  auto root = module->entry_computation()->root_instruction();
-  EXPECT_THAT(root,
-              GmockMatch(m::Tuple(
-                  m::Iota(),
-                  m::Scatter(m::Iota(), m::Concatenate(m::Iota(), m::Reshape()),
-                             m::Reshape()))));
-}
-
-TEST_F(AlgebraicSimplifierTest, DontReplacePermutationSortIfNonIntegral) {
-  // Same as ReplacePermutationSortWithScatter except that the iota has F32
-  // type.
-  const char* hlo_string = R"(
-    HloModule permutation_sort
-
-    ENTRY sort_computation {
-      keys = f32[64,8732]{1,0} parameter(0)
-      values = f32[64,8732]{1,0} iota(), iota_dimension=1
-      sort = (f32[64,8732]{1,0}, f32[64,8732]{1,0}) sort(keys, values), dimensions={1}
-      gte = f32[64,8732]{1,0} get-tuple-element(sort), index=1
-      ROOT sort2 = (f32[64,8732]{1,0}, f32[64,8732]{1,0}) sort(gte, values), dimensions={1}
-    }
-  )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
-
-  AlgebraicSimplifierOptions options(non_bitcasting_callback());
-  options.set_enable_permutation_sort_replacement(true);
-  AlgebraicSimplifier simplifier(options);
-  EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
-}
-
-TEST_F(AlgebraicSimplifierTest, DontReplacePermutationSortWrongDimensions) {
-  // Same as ReplacePermutationSortWithScatter except that the sort dimensions
-  // don't match.
-  const char* hlo_string = R"(
-   HloModule permutation_sort
-
-    ENTRY sort_computation {
-      keys = f32[64,8732]{1,0} parameter(0)
-      values = s32[64,8732]{1,0} iota(), iota_dimension=1
-      sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values), dimensions={1}
-      gte = s32[64,8732]{1,0} get-tuple-element(sort), index=1
-      ROOT sort2 = (s32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(gte, values), dimensions={0}
-    }
-  )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
-
-  AlgebraicSimplifierOptions options(non_bitcasting_callback());
-  options.set_enable_permutation_sort_replacement(true);
-  AlgebraicSimplifier simplifier(options);
-  EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
-}
-
 TEST_F(AlgebraicSimplifierTest, ReplaceEffectiveScalarKeyValueSortWithTuple) {
   auto builder = HloComputation::Builder(TestName());
+  auto module = CreateNewVerifiedModule();
 
   Shape keys_shape = ShapeUtil::MakeShape(F32, {5, 0});
   Shape values_shape = ShapeUtil::MakeShape(S32, {5, 0});
@@ -2654,10 +2774,11 @@ TEST_F(AlgebraicSimplifierTest, ReplaceEffectiveScalarKeyValueSortWithTuple) {
       HloInstruction::CreateParameter(1, values_shape, "values0"));
   auto values1 = builder.AddInstruction(
       HloInstruction::CreateParameter(2, values_shape, "values1"));
-  builder.AddInstruction(HloInstruction::CreateSort(
-      ShapeUtil::MakeTupleShape({keys_shape, values_shape, values_shape}), 0,
-      keys, {values0, values1}));
-  auto module = CreateNewVerifiedModule();
+  TF_ASSERT_OK(MakeSortHlo(ShapeUtil::MakeTupleShape(
+                               {keys_shape, values_shape, values_shape}),
+                           {keys, values0, values1}, 0, /*is_stable=*/false,
+                           &builder, module.get())
+                   .status());
   HloComputation* computation = module->AddEntryComputation(builder.Build());
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
@@ -2879,7 +3000,7 @@ class ConvInputPaddingTest
     : public AlgebraicSimplifierTest,
       public ::testing::WithParamInterface<ConvPaddingTestcase> {};
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     ConvInputPaddingTestCases, ConvInputPaddingTest,
     ::testing::ValuesIn(std::vector<ConvPaddingTestcase>{
         // Merge this edge padding into the conv.
@@ -2950,11 +3071,11 @@ TEST_P(ConvInputPaddingTest, DoTest) {
           .ValueOrDie();
   builder.AddInstruction(HloInstruction::CreateConvolve(
       ShapeInference::InferConvolveShape(lhs_pad->shape(), filter->shape(),
-                                         /*feature_group_count=*/1, window,
-                                         dnums)
+                                         /*feature_group_count=*/1,
+                                         /*batch_group_count=*/1, window, dnums)
           .ValueOrDie(),
-      lhs_pad, filter, /*feature_group_count=*/1, window, dnums,
-      DefaultPrecisionConfig(2)));
+      lhs_pad, filter, /*feature_group_count=*/1, /*batch_group_count=*/1,
+      window, dnums, DefaultPrecisionConfig(2)));
   auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
@@ -2987,7 +3108,7 @@ class ConvFilterPaddingTest
     : public AlgebraicSimplifierTest,
       public ::testing::WithParamInterface<ConvPaddingTestcase> {};
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     ConvFilterPaddingTestCases, ConvFilterPaddingTest,
     ::testing::ValuesIn(std::vector<ConvPaddingTestcase>{
         // Can only merge interior padding on the filter's spatial dimensions;
@@ -3067,11 +3188,11 @@ TEST_P(ConvFilterPaddingTest, DoIt) {
 
   builder.AddInstruction(HloInstruction::CreateConvolve(
       ShapeInference::InferConvolveShape(input->shape(), rhs_pad->shape(),
-                                         /*feature_group_count=*/1, window,
-                                         dnums)
+                                         /*feature_group_count=*/1,
+                                         /*batch_group_count=*/1, window, dnums)
           .ValueOrDie(),
-      input, rhs_pad, /*feature_group_count=*/1, window, dnums,
-      precision_config));
+      input, rhs_pad, /*feature_group_count=*/1, /*batch_group_count=*/1,
+      window, dnums, precision_config));
 
   auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
@@ -3219,13 +3340,14 @@ TEST_F(AlgebraicSimplifierTest, ConvertConvToMatmul) {
 
     b.AddInstruction(HloInstruction::CreateConvolve(
         out_shape, input, filter,
-        /*feature_group_count=*/1, window, dnums, DefaultPrecisionConfig(2)));
+        /*feature_group_count=*/1, /*batch_group_count=*/1, window, dnums,
+        DefaultPrecisionConfig(2)));
 
     // TODO(b/80488902): verify this module.
     auto module = CreateNewUnverifiedModule();
     auto* computation = module->AddEntryComputation(b.Build());
 
-    AlgebraicSimplifierOptions simplifier_options(bitcasting_callback());
+    AlgebraicSimplifierOptions simplifier_options;
     simplifier_options.set_is_layout_sensitive(true);
     AlgebraicSimplifier simplifier(simplifier_options);
     if (!simplifier.Run(module.get()).ValueOrDie()) {
@@ -3431,7 +3553,7 @@ TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) {
 
   // Create the reduce-window.
   Window window;
-  for (int64 i = 0; i < ShapeUtil::Rank(pad->shape()); ++i) {
+  for (int64 i = 0; i < pad->shape().rank(); ++i) {
     auto* dim = window.add_dimensions();
     dim->set_size(1);
     dim->set_padding_low(10);
@@ -3517,7 +3639,7 @@ TEST_F(AlgebraicSimplifierTest, FoldConvertedPadIntoReduceWindow) {
 
   // Create the reduce-window.
   Window window;
-  for (int64 i = 0; i < ShapeUtil::Rank(pad->shape()); ++i) {
+  for (int64 i = 0; i < pad->shape().rank(); ++i) {
     auto* dim = window.add_dimensions();
     dim->set_size(1);
     dim->set_padding_low(10);
@@ -3592,8 +3714,8 @@ TEST_F(AlgebraicSimplifierTest, IteratorInvalidation) {
   HloInstruction* y =
       builder.AddInstruction(HloInstruction::CreateParameter(1, r1f32, "y"));
   DotDimensionNumbers dot_dnums;
-  dot_dnums.add_lhs_contracting_dimensions(1);
-  dot_dnums.add_rhs_contracting_dimensions(0);
+  dot_dnums.add_lhs_batch_dimensions(0);
+  dot_dnums.add_rhs_batch_dimensions(0);
   builder.AddInstruction(HloInstruction::CreateDot(r1f32, x, y, dot_dnums,
                                                    DefaultPrecisionConfig(2)));
   std::unique_ptr<HloComputation> dot_computation(builder.Build());
@@ -3639,12 +3761,16 @@ TEST_F(AlgebraicSimplifierTest, TrivialDynamicSlice) {
   HloComputation::Builder builder(TestName());
 
   Shape shape = ShapeUtil::MakeShape(F32, {10, 100, 1000});
+  std::vector<HloInstruction*> params;
+  for (int i = 0; i < 3; ++i) {
+    params.push_back(builder.AddInstruction(HloInstruction::CreateParameter(
+        i + 1, ShapeUtil::MakeShape(U32, {}), "slice_indices")));
+  }
   builder.AddInstruction(HloInstruction::CreateDynamicSlice(
       shape,
       builder.AddInstruction(
           HloInstruction::CreateParameter(0, shape, "slice_from")),
-      builder.AddInstruction(HloInstruction::CreateParameter(
-          1, ShapeUtil::MakeShape(U32, {3}), "slice_indices")),
+      params,
       /*slice_sizes=*/{10, 100, 1000}));
 
   auto computation = m->AddEntryComputation(builder.Build());
@@ -3663,28 +3789,35 @@ TEST_F(AlgebraicSimplifierTest, TrivialDynamicUpdateSlice) {
   Shape full_shape = ShapeUtil::MakeShape(F32, {10, 100, 1000});
   Shape slice_shape = ShapeUtil::MakeShape(F32, {10, 1, 1000});
 
+  std::vector<HloInstruction*> slice_indices, update_indices;
+  for (int i = 0; i < 3; ++i) {
+    slice_indices.push_back(
+        builder.AddInstruction(HloInstruction::CreateParameter(
+            i + 1, ShapeUtil::MakeShape(U32, {}), "slice_indices")));
+    update_indices.push_back(
+        builder.AddInstruction(HloInstruction::CreateParameter(
+            i + 5, ShapeUtil::MakeShape(U32, {}), "update_indices")));
+  }
   HloInstruction* slice =
       builder.AddInstruction(HloInstruction::CreateDynamicSlice(
           slice_shape,
           builder.AddInstruction(
               HloInstruction::CreateParameter(0, full_shape, "slice_from")),
-          builder.AddInstruction(HloInstruction::CreateParameter(
-              1, ShapeUtil::MakeShape(U32, {3}), "slice_indices")),
+          slice_indices,
           /*slice_sizes=*/{10, 1, 1000}));
 
   builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
       slice_shape,
       builder.AddInstruction(
-          HloInstruction::CreateParameter(2, slice_shape, "to_update")),
-      slice,
-      builder.AddInstruction(HloInstruction::CreateParameter(
-          3, ShapeUtil::MakeShape(U32, {3}), "update_indices"))));
+          HloInstruction::CreateParameter(4, slice_shape, "to_update")),
+      slice, update_indices));
 
   auto computation = m->AddEntryComputation(builder.Build());
   AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   EXPECT_THAT(computation->root_instruction(),
-              GmockMatch(m::DynamicSlice(m::Parameter(), m::Parameter())));
+              GmockMatch(m::DynamicSlice(m::Parameter(), m::Parameter(),
+                                         m::Parameter(), m::Parameter())));
 }
 
 // Test that two consecutive broadcasts can be merged to one.
@@ -3791,7 +3924,7 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadLow) {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(hlo_string));
 
-  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifierOptions options;
   AlgebraicSimplifier simplifier(options);
   EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   auto root = module->entry_computation()->root_instruction();
@@ -3812,7 +3945,7 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadHigh) {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(hlo_string));
 
-  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifierOptions options;
   AlgebraicSimplifier simplifier(options);
   EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   auto root = module->entry_computation()->root_instruction();
@@ -3827,17 +3960,38 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadMidNonScalar) {
       param = f32[3,4] parameter(0)
       constant = f32[] constant(0.0)
       pad = f32[8,10] pad(f32[3,4] param, f32[] constant), padding=3_2x1_5
-      ROOT slice = f32[1,1] slice(f32[8,10] pad), slice={[5:6],[9:10]}
+      ROOT slice = f32[1,1] slice(f32[8,10] pad), slice={[5:6],[4:5]}
     }
   )";
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(hlo_string));
 
-  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifierOptions options;
   AlgebraicSimplifier simplifier(options);
   EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
 }
 
+TEST_F(AlgebraicSimplifierTest, SliceOfPadMidScalarConstant) {
+  const char* hlo_string = R"(
+    HloModule module
+
+    ENTRY test {
+      param = f32[3,4] parameter(0)
+      constant = f32[] constant(0.0)
+      pad = f32[8,10] pad(f32[3,4] param, f32[] constant), padding=3_2x1_5
+      ROOT slice = f32[1,1] slice(f32[8,10] pad), slice={[5:6],[9:10]}
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  AlgebraicSimplifierOptions options;
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::Reshape(m::Constant())));
+}
+
 TEST_F(AlgebraicSimplifierTest, SliceOfPadMidScalar) {
   const char* hlo_string = R"(
     HloModule module
@@ -3852,13 +4006,36 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadMidScalar) {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(hlo_string));
 
-  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifierOptions options;
   AlgebraicSimplifier simplifier(options);
   EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   auto root = module->entry_computation()->root_instruction();
   EXPECT_THAT(root, GmockMatch(m::Parameter()));
 }
 
+TEST_F(AlgebraicSimplifierTest, SliceOfPadSomeDimsInPadding) {
+  const char* hlo_string = R"(
+    HloModule module
+
+    ENTRY entry () -> f32[1]{0} {
+      constant.val = f32[] constant(4)
+      constant.pad = f32[] constant(-7)
+      reshape.1 = f32[1,1,1]{2,1,0} reshape(f32[] constant.val)
+      pad = f32[3,3,3]{2,1,0} pad(f32[1,1,1]{2,1,0} reshape.1, f32[] constant.pad), padding=0_2x0_2x2_0
+      slice = f32[1,1,1]{2,1,0} slice(f32[3,3,3]{2,1,0} pad), slice={[0:1], [0:1], [0:1]}
+      ROOT reshape.2 = f32[1]{0} reshape(f32[1,1,1]{2,1,0} slice)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  AlgebraicSimplifierOptions options;
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::Reshape(m::ConstantScalar(-7.0))));
+}
+
 TEST_F(AlgebraicSimplifierTest, SliceOfConcatScalarInput) {
   const char* hlo_string = R"(
     HloModule module
@@ -3874,7 +4051,7 @@ TEST_F(AlgebraicSimplifierTest, SliceOfConcatScalarInput) {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(hlo_string));
 
-  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifierOptions options;
   AlgebraicSimplifier simplifier(options);
   EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   auto root = module->entry_computation()->root_instruction();
@@ -3896,7 +4073,7 @@ TEST_F(AlgebraicSimplifierTest, SliceOfConcatNonScalarInput) {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(hlo_string));
 
-  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifierOptions options;
   AlgebraicSimplifier simplifier(options);
   EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   auto root = module->entry_computation()->root_instruction();
@@ -3918,7 +4095,7 @@ TEST_F(AlgebraicSimplifierTest, NegateNegate) {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(hlo_string));
 
-  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifierOptions options;
   AlgebraicSimplifier simplifier(options);
   EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   auto root = module->entry_computation()->root_instruction();
@@ -3938,7 +4115,7 @@ TEST_F(AlgebraicSimplifierTest, NotNot) {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(hlo_string));
 
-  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifierOptions options;
   AlgebraicSimplifier simplifier(options);
   EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   auto root = module->entry_computation()->root_instruction();
@@ -4065,9 +4242,6 @@ PadReduceWindowEffectiveBroadcastCases() {
       {/*input_spatials=*/{2, 2}, /*symmetric_pad_amount=*/{6, 6},
        /*reduce_window_spatials=*/{7, 7}, /*prepend_a=*/true,
        /*should_become_broadcast=*/false},  //
-      {/*input_spatials=*/{1, 1}, /*symmetric_pad_amount=*/{2, 2},
-       /*reduce_window_spatials=*/{5, 5}, /*prepend_a=*/true,
-       /*should_become_broadcast=*/true},  //
       {/*input_spatials=*/{1, 1}, /*symmetric_pad_amount=*/{2, 2},
        /*reduce_window_spatials=*/{1, 1}, /*prepend_a=*/true,
        /*should_become_broadcast=*/false},  //
@@ -4078,11 +4252,80 @@ PadReduceWindowEffectiveBroadcastCases() {
   return *cases;
 }
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     PadReduceWindowEffectiveBroadcastInstantiation,
     PadReduceWindowEffectiveBroadcastTest,
     ::testing::ValuesIn(PadReduceWindowEffectiveBroadcastCases()));
 
+class BatchDotStrengthReductionTest
+    : public AlgebraicSimplifierTest,
+      public ::testing::WithParamInterface<
+          ::testing::tuple<int, int, int, PrimitiveType>> {};
+TEST_P(BatchDotStrengthReductionTest, BatchDotStrengthReduction) {
+  auto module = CreateNewVerifiedModule();
+  int m, k, n;
+  PrimitiveType element_type;
+  std::tie(m, k, n, element_type) = GetParam();
+  std::vector<int64> lhs_dims = {1, 3, 5};
+  std::vector<int64> rhs_dims = lhs_dims;
+  std::vector<int64> output_dims = lhs_dims;
+  if (m > 0) {
+    lhs_dims.push_back(m);
+    output_dims.push_back(m);
+  }
+  if (k > 0) {
+    lhs_dims.push_back(k);
+    rhs_dims.push_back(k);
+  }
+  if (n > 0) {
+    rhs_dims.push_back(n);
+    output_dims.push_back(n);
+  }
+  Shape dot_shape = ShapeUtil::MakeShape(element_type, output_dims);
+  Shape lhs_shape = ShapeUtil::MakeShape(element_type, lhs_dims);
+  Shape rhs_shape = ShapeUtil::MakeShape(element_type, rhs_dims);
+  HloComputation::Builder builder(TestName());
+
+  auto lhs = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, lhs_shape, "lhs"));
+  auto rhs = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, rhs_shape, "rhs"));
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_batch_dimensions(0);
+  dot_dnums.add_lhs_batch_dimensions(1);
+  dot_dnums.add_lhs_batch_dimensions(2);
+  dot_dnums.add_rhs_batch_dimensions(0);
+  dot_dnums.add_rhs_batch_dimensions(1);
+  dot_dnums.add_rhs_batch_dimensions(2);
+  if (k > 0) {
+    dot_dnums.add_lhs_contracting_dimensions(m > 0 ? 4 : 3);
+    dot_dnums.add_rhs_contracting_dimensions(3);
+  }
+  builder.AddInstruction(HloInstruction::CreateDot(
+      dot_shape, lhs, rhs, dot_dnums, DefaultPrecisionConfig(2)));
+  auto computation = module->AddEntryComputation(builder.Build());
+  AlgebraicSimplifier simplifier(default_options_);
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, simplifier.Run(module.get()));
+  const bool dot_should_be_transformed =
+      m == 1 || k == 1 || n == 1 || m == -1 || k == -1 || n == -1;
+  EXPECT_EQ(changed, dot_should_be_transformed);
+  bool has_no_dot = true;
+  for (const auto& hlo : computation->instructions()) {
+    if (hlo->opcode() == HloOpcode::kDot) {
+      has_no_dot = false;
+      break;
+    }
+  }
+  EXPECT_EQ(has_no_dot, dot_should_be_transformed);
+}
+
+INSTANTIATE_TEST_SUITE_P(BatchDotStrengthReductionTestInstantiation,
+                         BatchDotStrengthReductionTest,
+                         ::testing::Combine(::testing::Values(-1, 1, 2),
+                                            ::testing::Values(-1, 1, 2),
+                                            ::testing::Values(-1, 1, 2),
+                                            ::testing::Values(F32, BF16)));
+
 class DotStrengthReductionTest
     : public AlgebraicSimplifierTest,
       public ::testing::WithParamInterface<
@@ -4135,7 +4378,7 @@ TEST_P(DotStrengthReductionTest, DotStrengthReduction) {
   EXPECT_EQ(has_no_dot, dot_should_be_transformed);
 }
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     DotStrengthReductionTestInstantiation, DotStrengthReductionTest,
     ::testing::Combine(::testing::Values(1, 2), ::testing::Values(1, 2),
                        ::testing::Values(1, 2), ::testing::Bool(),
@@ -4297,9 +4540,10 @@ TEST_F(AlgebraicSimplifierTest, DynamicUpdateSliceZeroUpdate) {
   HloInstruction* const update = builder.AddInstruction(
       HloInstruction::CreateParameter(1, update_shape, "update"));
   HloInstruction* const start_indices = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<int>({0})));
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int>({})));
   builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
-      dslice_shape, operand, update, start_indices));
+      dslice_shape, operand, update,
+      std::initializer_list<HloInstruction*>({start_indices})));
   const HloComputation* const computation =
       m->AddEntryComputation(builder.Build());
 
@@ -4308,9 +4552,9 @@ TEST_F(AlgebraicSimplifierTest, DynamicUpdateSliceZeroUpdate) {
   EXPECT_THAT(computation->root_instruction(), operand);
 }
 
-INSTANTIATE_TEST_CASE_P(DotOfConcatSimplificationTestInstantiation,
-                        DotOfConcatSimplificationTest,
-                        ::testing::ValuesIn(kDotOfConcatTestSpecs));
+INSTANTIATE_TEST_SUITE_P(DotOfConcatSimplificationTestInstantiation,
+                         DotOfConcatSimplificationTest,
+                         ::testing::ValuesIn(kDotOfConcatTestSpecs));
 
 struct DotOfGatherTestSpec {
   int64 m;
@@ -4352,14 +4596,17 @@ TEST_P(DotOfGatherSimplificationTest, ConstantRHS) {
 
   int32 start_row = (spec.lcd == 0) ? 0 : spec.s;
   int32 start_col = (spec.lcd == 0) ? spec.s : 0;
-  const auto start_indices =
+  std::vector<HloInstruction*> start_indices = {
       builder.AddInstruction(HloInstruction::CreateConstant(
-          LiteralUtil::CreateR1<int32>({start_row, start_col})));
+          LiteralUtil::CreateR0<int32>(start_row))),
+      builder.AddInstruction(HloInstruction::CreateConstant(
+          LiteralUtil::CreateR0<int32>(start_col)))};
   int64 slice_row_size = (spec.lcd == 0) ? spec.k : 1;
   int64 slice_col_size = (spec.lcd == 0) ? 1 : spec.k;
-  Shape ds_shape = ShapeUtil::MakeShape(F32, {slice_row_size, slice_col_size});
+  std::vector<int64> slice_sizes = {slice_row_size, slice_col_size};
+  Shape ds_shape = ShapeUtil::MakeShape(F32, slice_sizes);
   auto* ds = builder.AddInstruction(HloInstruction::CreateDynamicSlice(
-      ds_shape, lhs, start_indices, {slice_row_size, slice_col_size}));
+      ds_shape, lhs, start_indices, slice_sizes));
 
   int64 rhs_rows = (spec.rcd == 0) ? spec.k : spec.n;
   int64 rhs_cols = (spec.rcd == 0) ? spec.n : spec.k;
@@ -4392,7 +4639,7 @@ TEST_P(DotOfGatherSimplificationTest, ConstantRHS) {
   } else {
     EXPECT_THAT(computation->root_instruction(),
                 GmockMatch(m::DynamicSlice(m::Dot(m::Constant(), m::Constant()),
-                                           m::Concatenate())));
+                                           m::Constant(), m::Constant())));
   }
 }
 
@@ -4430,14 +4677,17 @@ TEST_P(DotOfGatherSimplificationTest, ConstantLHS) {
 
   int32 start_row = (spec.rcd == 0) ? 0 : spec.s;
   int32 start_col = (spec.rcd == 0) ? spec.s : 0;
-  const auto start_indices =
+  std::vector<HloInstruction*> start_indices = {
+      builder.AddInstruction(HloInstruction::CreateConstant(
+          LiteralUtil::CreateR0<int32>(start_row))),
       builder.AddInstruction(HloInstruction::CreateConstant(
-          LiteralUtil::CreateR1<int32>({start_row, start_col})));
+          LiteralUtil::CreateR0<int32>(start_col)))};
   int64 slice_row_size = (spec.rcd == 0) ? spec.k : 1;
   int64 slice_col_size = (spec.rcd == 0) ? 1 : spec.k;
-  Shape ds_shape = ShapeUtil::MakeShape(F32, {slice_row_size, slice_col_size});
+  std::vector<int64> slice_sizes = {slice_row_size, slice_col_size};
+  Shape ds_shape = ShapeUtil::MakeShape(F32, slice_sizes);
   auto* ds = builder.AddInstruction(HloInstruction::CreateDynamicSlice(
-      ds_shape, rhs, start_indices, {slice_row_size, slice_col_size}));
+      ds_shape, rhs, start_indices, slice_sizes));
 
   DotDimensionNumbers dot_dnums;
   dot_dnums.add_lhs_contracting_dimensions(spec.lcd);
@@ -4462,7 +4712,7 @@ TEST_P(DotOfGatherSimplificationTest, ConstantLHS) {
   } else {
     EXPECT_THAT(computation->root_instruction(),
                 GmockMatch(m::DynamicSlice(m::Dot(m::Constant(), m::Constant()),
-                                           m::Concatenate())));
+                                           m::Constant(), m::Constant())));
   }
 }
 
@@ -4510,9 +4760,160 @@ std::vector<DotOfGatherTestSpec> DotOfGatherPositiveNegativeTests() {
   return all;
 }
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     DotOfGatherSimplificationTestInstantiation, DotOfGatherSimplificationTest,
     ::testing::ValuesIn(DotOfGatherPositiveNegativeTests()));
 
+TEST_F(AlgebraicSimplifierTest, TupleReduceReshape) {
+  const char* hlo_string = R"(
+HloModule module
+
+reducer {
+  parameter.1 = f32[] parameter(0)
+  parameter.3 = f32[] parameter(2)
+  add.2 = f32[] add(parameter.1, parameter.3)
+  parameter.0 = f32[] parameter(1)
+  parameter.2 = f32[] parameter(3)
+  add.3 = f32[] add(parameter.0, parameter.2)
+  ROOT tuple.4 = (f32[], f32[]) tuple(add.2, add.3)
+}
+
+ENTRY entry {
+  parameter.6 = (f32[], f32[]) parameter(0)
+  get-tuple-element.10 = f32[] get-tuple-element(parameter.6), index=0
+  get-tuple-element.11 = f32[] get-tuple-element(parameter.6), index=1
+  constant = f32[] constant(0)
+  ROOT reduce = (f32[], f32[]) reduce(get-tuple-element.10, get-tuple-element.11, constant, constant), dimensions={}, to_apply=reducer
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  AlgebraicSimplifierOptions options;
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::Tuple(
+                        m::Reshape(m::GetTupleElement(m::Parameter(), 0)),
+                        m::Reshape(m::GetTupleElement(m::Parameter(), 1)))));
+}
+
+TEST_F(AlgebraicSimplifierTest, TupleReduceBroadcast) {
+  const char* hlo_string = R"(
+HloModule module
+
+reducer {
+  parameter.1 = f32[] parameter(0)
+  parameter.3 = f32[] parameter(2)
+  mul.2 = f32[] add(parameter.1, parameter.3)
+  parameter.0 = f32[] parameter(1)
+  parameter.2 = f32[] parameter(3)
+  add.3 = f32[] add(parameter.0, parameter.2)
+  ROOT tuple.4 = (f32[], f32[]) tuple(mul.2, add.3)
+}
+
+ENTRY entry {
+  parameter.6 = (f32[0, 10, 10], f32[0, 10, 10]) parameter(0)
+  get-tuple-element.10 = f32[0, 10, 10] get-tuple-element(parameter.6), index=0
+  get-tuple-element.11 = f32[0, 10, 10] get-tuple-element(parameter.6), index=1
+  constant.0 = f32[] constant(0)
+  constant.1 = f32[] constant(1)
+  ROOT reduce = (f32[10, 10], f32[10, 10]) reduce(get-tuple-element.10, get-tuple-element.11, constant.0, constant.1), dimensions={0}, to_apply=reducer
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  AlgebraicSimplifierOptions options;
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::Tuple(m::Broadcast(m::ConstantScalar(0)),
+                                        m::Broadcast(m::ConstantScalar(1)))));
+}
+
+TEST_F(AlgebraicSimplifierTest, ZeroSizedReshapeWithoutLayout) {
+  auto builder = HloComputation::Builder(TestName());
+  HloInstruction* param =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          0, ShapeUtil::MakeShape(F32, {1}), "param"));
+  HloInstruction* broadcast =
+      builder.AddInstruction(HloInstruction::CreateBroadcast(
+          ShapeUtil::MakeShape(F32, {0, 1}), param, {1}));
+
+  // Create a reshape with zero sized result and without layout.
+  Shape reshaped_shape = ShapeUtil::MakeShape(F32, {0});
+  reshaped_shape.clear_layout();
+  builder.AddInstruction(
+      HloInstruction::CreateReshape(reshaped_shape, broadcast));
+
+  std::unique_ptr<VerifiedHloModule> module = CreateNewVerifiedModule();
+  module->AddEntryComputation(builder.Build());
+
+  AlgebraicSimplifierOptions options;
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  HloInstruction* root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::Constant()));
+}
+
+TEST_F(AlgebraicSimplifierTest, DividedByConstantInstructionWithoutLayout) {
+  Shape shape = ShapeUtil::MakeShape(F32, {});
+  shape.clear_layout();
+  auto builder = HloComputation::Builder(TestName());
+  HloInstruction* param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, shape, "param"));
+
+  HloInstruction* const_value = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(20.0f)));
+  builder.AddInstruction(HloInstruction::CreateBinary(shape, HloOpcode::kDivide,
+                                                      param, const_value));
+
+  std::unique_ptr<VerifiedHloModule> module = CreateNewVerifiedModule();
+  module->AddEntryComputation(builder.Build());
+
+  AlgebraicSimplifierOptions options;
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  HloInstruction* root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::Multiply()));
+}
+
+// Test that 1/sqrt(X) is simplified to rsqrt(X).
+TEST_F(AlgebraicSimplifierTest, RecipSqrt) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = f32[] parameter(0)
+      p1 = f32[] parameter(1)
+      sqrt = f32[] sqrt(p0)
+      ROOT div = f32[] divide(p1, sqrt)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::MultiplyAnyOrder(m::Parameter(1),
+                                             m::Rsqrt(m::Parameter(0)))));
+}
+
+// Test that 1/rsqrt(X) is simplified to sqrt(X).
+TEST_F(AlgebraicSimplifierTest, RecipRsqrt) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = f32[] parameter(0)
+      p1 = f32[] parameter(1)
+      rsqrt = f32[] rsqrt(p0)
+      ROOT div = f32[] divide(p1, rsqrt)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::MultiplyAnyOrder(m::Parameter(1),
+                                             m::Sqrt(m::Parameter(0)))));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/allocation_tracker.cc b/tensorflow/compiler/xla/service/allocation_tracker.cc
index ef5e211646e7b0b66b8e6c09948be58063422943..6cb0e985e57016e5a22fba50c3e3ad6970f1b178 100644
--- a/tensorflow/compiler/xla/service/allocation_tracker.cc
+++ b/tensorflow/compiler/xla/service/allocation_tracker.cc
@@ -142,13 +142,13 @@ StatusOr<std::vector<GlobalDataHandle>> AllocationTracker::DeconstructTuple(
   // We only need to care about replica id 0 here, since the GlobalDataHandle is
   // the same for all buffers across replicas.
   const ShapedBuffer* shaped_buffer = replicated_buffers[0];
-  if (!ShapeUtil::IsTuple(shaped_buffer->on_host_shape())) {
+  if (!shaped_buffer->on_host_shape().IsTuple()) {
     return InvalidArgument("global data handle %d is not a tuple",
                            data.handle());
   }
   // If the on-host representation is a tuple, then the on-device one should be
   // as well.
-  TF_RET_CHECK(ShapeUtil::IsTuple(shaped_buffer->on_device_shape()));
+  TF_RET_CHECK(shaped_buffer->on_device_shape().IsTuple());
 
   if (ShapeUtil::IsNestedTuple(shaped_buffer->on_device_shape())) {
     return Unimplemented("Deconstructing nested tuples is not implemented.");
diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner.cc b/tensorflow/compiler/xla/service/ar_crs_combiner.cc
index 362bc44a1cf377b51c5519c6ab5e0d9628e80e58..52d6982c70f7962ea9f54db0a4b1f2089a122c1c 100644
--- a/tensorflow/compiler/xla/service/ar_crs_combiner.cc
+++ b/tensorflow/compiler/xla/service/ar_crs_combiner.cc
@@ -26,38 +26,72 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/pattern_matcher.h"
+#include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace xla {
 
-namespace {
-
 namespace m = match;
 
-// If the argument instruction is a CRS in the sequence
-// AR -> Convert -> Add -> CRS
-// then return the AR in the sequence.
-// TODO(b/117554291): Rewrite this to recognize more general patterns,
-// not just the specific one of AR -> Add -> Convert -> CRS.
-absl::optional<HloInstruction*> MatchesArCrsPattern(
+// Checks if the argument instruction is an AllReduce, followed by a certain
+// sequence of instructions and then a CRS. It must be possible to move
+// the AR past each instruction in the sequence. Returns the CRS, which is the
+// last instruction in the sequence.
+absl::optional<ArCrsCombiner::ArCrsPair> ArCrsCombiner::MatchesArCrsPattern(
     HloInstruction* instruction) {
-  HloInstruction *ar, *convert, *add, *crs;
-  if (Match(instruction,
-            m::CrossReplicaSum(
-                &crs, m::Add(&add, m::Op(),
-                             m::Convert(&convert,
-                                        m::CrossReplicaSum(&ar, m::Op()))))) &&
-      ar->users().size() == 1 && ar->shape().element_type() == BF16 &&
-      convert->shape().element_type() == F32 && !crs->all_reduce_id()) {
-    return ar;
+  auto can_ar_move_past_instruction = [](HloInstruction* instruction) -> bool {
+    if (instruction->user_count() != 1) {
+      return false;
+    }
+    switch (instruction->opcode()) {
+      case HloOpcode::kBitcast:
+      case HloOpcode::kTranspose:
+      case HloOpcode::kReshape:
+        return true;
+      case HloOpcode::kConvert:
+        // Can be moved across if both input and output is either float or
+        // integer (e.g. S32<->U32 or F32<->BF16)
+        return ShapeUtil::ElementIsFloating(instruction->shape()) ==
+               ShapeUtil::ElementIsFloating(instruction->operand(0)->shape());
+      case HloOpcode::kAdd:
+      case HloOpcode::kSubtract:
+      case HloOpcode::kMultiply:
+        // Only supported for floating point operands.
+        return ShapeUtil::ElementIsFloating(instruction->shape());
+      default:
+        return false;
+    }
+  };
+
+  auto computation_is_addition = [](HloComputation* c) {
+    return c->instruction_count() == 3 &&
+           Match(c->root_instruction(), m::Add(m::Parameter(), m::Parameter()));
+  };
+
+  if (!instruction->IsCrossModuleAllReduce() ||
+      !computation_is_addition(instruction->called_computations()[0]) ||
+      instruction->user_count() != 1) {
+    return absl::nullopt;
+  }
+  auto next = instruction->users()[0];
+  int64 distance = 1;
+  while (!next->IsCrossReplicaAllReduce()) {
+    if (can_ar_move_past_instruction(next)) {
+      next = next->users()[0];
+    } else {
+      return absl::nullopt;
+    }
+    ++distance;
+  }
+  if (!Cast<HloAllReduceInstruction>(next)->IsNoop() &&
+      computation_is_addition(next->called_computations()[0])) {
+    return absl::optional<ArCrsPair>(ArCrsPair(instruction, next, distance));
+  } else {
+    return absl::nullopt;
   }
-  return absl::optional<HloInstruction*>();
 }
 
-}  // namespace
-
 absl::optional<HloInstruction*> ArCrsCombiner::WhileFromBodyParameter(
     HloInstruction* instruction) {
   CHECK_EQ(HloOpcode::kParameter, instruction->opcode());
@@ -69,7 +103,7 @@ absl::optional<HloInstruction*> ArCrsCombiner::WhileFromBodyParameter(
       return caller_instruction;
     }
   }
-  return absl::optional<HloInstruction*>();
+  return absl::nullopt;
 }
 
 std::vector<HloInstruction*> ArCrsCombiner::GetAllTuples(
@@ -160,6 +194,15 @@ bool ArCrsCombiner::InstructionsComputeSameValue(
   if (opcode1 != i2->opcode() || operands1.size() != i2->operands().size()) {
     return false;
   }
+  auto eq_computations = [](const HloComputation* a, const HloComputation* b) {
+    return *a == *b;
+  };
+  if (i1->IsCrossModuleAllReduce()) {
+    return i1->Identical(*i2,
+                         /*eq_operands=*/std::equal_to<const HloInstruction*>(),
+                         eq_computations,
+                         /*layout_sensitive=*/false);
+  }
   visited_pairs->emplace(min_uid, max_uid);
   for (int i = 0; i < operands1.size(); ++i) {
     auto operand1 = operands1[i];
@@ -185,19 +228,61 @@ bool ArCrsCombiner::InstructionsComputeSameValue(
   // InstructionsComputeSameValue earlier.
   auto eq_instructions = [](const HloInstruction* i1,
                             const HloInstruction* i2) -> bool { return true; };
-  auto eq_computations = [](const HloComputation* a, const HloComputation* b) {
-    return *a == *b;
-  };
   return i1->Identical(*i2, eq_instructions, eq_computations,
                        /*layout_sensitive=*/false);
 }
 
 void ArCrsCombiner::GroupAllReducesById(HloModule* module) {
+  // Say that two or more ARs lead to the same CRS: (AR1, CRS), (AR2, CRS),
+  // ... , (ARn, CRS).
+  // If as we traverse the HLO graph we start tracking the pair (AR2, CRS),
+  // and later find that AR1's distance from the CRS is longer, we discard
+  // AR2 and start tracking AR1. We put the discarded ids in this set, in order
+  // to skip processing of short paths when we encounter the other ARs that
+  // have the same id as AR2.
+  absl::flat_hash_set<int64> discarded_ar_ids;
   for (HloComputation* computation : module->MakeNonfusionComputations()) {
     for (HloInstruction* instruction : computation->instructions()) {
-      auto ar = MatchesArCrsPattern(instruction);
-      if (ar) {
-        all_reduce_map_[*((*ar)->all_reduce_id())].push_back(*ar);
+      auto maybe_pair = MatchesArCrsPattern(instruction);
+      if (maybe_pair) {
+        auto pair = *maybe_pair;
+        int64 ar_id = *(instruction->all_reduce_id());
+        if (discarded_ar_ids.find(ar_id) != discarded_ar_ids.end()) {
+          continue;
+        }
+        auto it = crs_reserved_map_.find(pair.crs);
+        if (it != crs_reserved_map_.end()) {
+          auto prev_ar_id = it->second;
+          // Since there is another AR paired with CRS,
+          // all_reduce_map_[prev_ar_id] should exist, but
+          // all_reduce_map_[ar_id] shouldn't.
+          CHECK(all_reduce_map_.find(ar_id) == all_reduce_map_.end());
+          CHECK_NE(prev_ar_id, ar_id);
+          auto prev_pair = all_reduce_map_[prev_ar_id].back();
+          int64 prev_distance = prev_pair.distance;
+          if (prev_distance < pair.distance) {
+            // The current AR's distance to CRS is longer than the previously
+            // tracked AR, so we discard the previous AR.
+            all_reduce_map_.erase(prev_ar_id);
+            discarded_ar_ids.insert(prev_ar_id);
+            all_reduce_map_[ar_id].push_back(pair);
+            crs_reserved_map_[pair.crs] = ar_id;
+          } else {
+            // Discard the current AR id because we are keeping the previously
+            // tracked AR.
+            discarded_ar_ids.insert(ar_id);
+          }
+        } else {
+          if (all_reduce_map_.find(ar_id) != all_reduce_map_.end()) {
+            int64 prev_distance = all_reduce_map_[ar_id].back().distance;
+            CHECK_EQ(prev_distance, pair.distance)
+                << "All ARs with the same AR ID must have the same distance "
+                   "from the corresponding CRSs. Found: "
+                << prev_distance << " and " << pair.distance;
+          }
+          all_reduce_map_[ar_id].push_back(pair);
+          crs_reserved_map_[pair.crs] = ar_id;
+        }
       }
     }
   }
@@ -205,20 +290,25 @@ void ArCrsCombiner::GroupAllReducesById(HloModule* module) {
 
 void ArCrsCombiner::KeepProvablyEqualInstructionGroups() {
   for (auto it : all_reduce_map_) {
-    auto instruction_vec = it.second;
-    CHECK_EQ(instruction_vec.size(), num_spatial_partitions_);
-
-    auto instr_0 = instruction_vec[0];
-    auto add_0 = instr_0->users()[0]->users()[0];
-    CHECK_EQ(HloOpcode::kAdd, add_0->opcode());
-
-    for (int i = 1; i < instruction_vec.size(); ++i) {
-      auto instr_i = instruction_vec[i];
-      auto add_i = instr_i->users()[0]->users()[0];
-      CHECK_EQ(HloOpcode::kAdd, add_i->opcode());
+    auto all_reduce_id = it.first;
+    auto pairs_vec = it.second;
+    CHECK_EQ(pairs_vec.size(), num_spatial_partitions_);
+    auto instr_0 = pairs_vec[0].ar;
+    for (int i = 1; i < pairs_vec.size(); ++i) {
+      auto instr_i = pairs_vec[i].ar;
+      auto next_0 = instr_0->users()[0];
+      auto next_i = instr_i->users()[0];
       absl::flat_hash_map<int64, int64> visited_pairs;
-      if (!InstructionsComputeSameValue(add_0, add_i, &visited_pairs)) {
-        all_reduce_map_.erase(it.first);
+      while (true) {
+        if (!InstructionsComputeSameValue(next_0, next_i, &visited_pairs)) {
+          all_reduce_map_.erase(all_reduce_id);
+          break;
+        }
+        if (next_0->IsCrossReplicaAllReduce()) {
+          break;
+        }
+        next_0 = next_0->users()[0];
+        next_i = next_i->users()[0];
       }
     }
   }
@@ -228,47 +318,59 @@ StatusOr<bool> ArCrsCombiner::RewriteGraph() {
   if (all_reduce_map_.empty()) {
     return false;
   }
-
-  auto computation_is_addition = [](HloComputation* c) {
-    return c->instruction_count() == 3 &&
-           Match(c->root_instruction(), m::Add(m::Parameter(), m::Parameter()));
-  };
-
   for (auto it : all_reduce_map_) {
-    auto instruction_vec = it.second;
-    for (auto all_reduce : instruction_vec) {
+    auto pairs_vec = it.second;
+    for (auto pair : pairs_vec) {
+      auto all_reduce = pair.ar;
       auto parent_computation = all_reduce->parent();
-      auto convert = all_reduce->users()[0];
-      auto add = convert->users()[0];
-      auto crs = add->users()[0];
-
-      if (!computation_is_addition(all_reduce->called_computations()[0]) ||
-          !computation_is_addition(crs->called_computations()[0])) {
-        continue;
+      auto all_reduce_id = all_reduce->all_reduce_id();
+      auto prev = all_reduce->mutable_operand(0);
+      auto next = all_reduce->users()[0];
+      TF_CHECK_OK(all_reduce->ReplaceUseWith(next, prev));
+      TF_CHECK_OK(parent_computation->RemoveInstruction(all_reduce));
+      while (!next->IsCrossReplicaAllReduce()) {
+        switch (next->opcode()) {
+          case HloOpcode::kBitcast:
+          case HloOpcode::kTranspose:
+          case HloOpcode::kReshape:
+          case HloOpcode::kConvert:
+          case HloOpcode::kMultiply:
+            break;
+          case HloOpcode::kAdd:
+          case HloOpcode::kSubtract: {
+            auto other_operand = (next->operands()[0] == prev)
+                                     ? next->operands()[1]
+                                     : next->operands()[0];
+            // To move the AR past the addition/subtraction, we need to divide
+            // other_operand by the number of spatial partitions, except if
+            // other_operand is a cross-module AR, which can be eliminated.
+            if (other_operand->IsCrossModuleAllReduce() &&
+                other_operand->user_count() == 1) {
+              TF_CHECK_OK(other_operand->ReplaceAllUsesWith(
+                  other_operand->mutable_operand(0)));
+            } else {
+              auto shape = other_operand->shape();
+              Literal lit(shape);
+              lit.PopulateWithValue<float>(num_spatial_partitions_);
+              auto divisor = parent_computation->AddInstruction(
+                  HloInstruction::CreateConstant(lit.Clone()));
+              auto division = parent_computation->AddInstruction(
+                  HloInstruction::CreateBinary(shape, HloOpcode::kDivide,
+                                               other_operand, divisor));
+              TF_CHECK_OK(other_operand->ReplaceUseWith(next, division));
+            }
+            break;
+          }
+          default:
+            LOG(FATAL) << "Unexpected instruction: " << next->ToShortString();
+        }
+        prev = next;
+        next = next->users()[0];
       }
-      HloInstruction* other_summand = (add->operands()[0] == convert)
-                                          ? add->operands()[1]
-                                          : add->operands()[0];
-      // To move the AR past the addition, we need to divide other_summand by
-      // the number of spatial partitions.
-      CHECK_EQ(all_reduce->user_count(), 1);
-      TF_CHECK_OK(
-          all_reduce->ReplaceAllUsesWith(all_reduce->mutable_operand(0)));
-      auto shape = other_summand->shape();
-      Literal lit(shape);
-      lit.PopulateWithValue<float>(num_spatial_partitions_);
-      auto divisor = parent_computation->AddInstruction(
-          HloInstruction::CreateConstant(lit.Clone()));
-      auto division =
-          parent_computation->AddInstruction(HloInstruction::CreateBinary(
-              shape, HloOpcode::kDivide, other_summand, divisor));
-      TF_CHECK_OK(other_summand->ReplaceUseWith(add, division));
       // The AllReduce and the CRS are combined to an all-core AllReduce.
-      crs->set_all_reduce_id(all_reduce->all_reduce_id());
-      TF_CHECK_OK(parent_computation->RemoveInstruction(all_reduce));
+      next->set_all_reduce_id(all_reduce_id);
     }
   }
-
   return true;
 }
 
diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner.h b/tensorflow/compiler/xla/service/ar_crs_combiner.h
index f6a7ef76ec3b76972d1b2c7fb548cecfb9423160..f503e1d5f2b519687e40818a61f0c0be9dfd3ab0 100644
--- a/tensorflow/compiler/xla/service/ar_crs_combiner.h
+++ b/tensorflow/compiler/xla/service/ar_crs_combiner.h
@@ -25,9 +25,48 @@ limitations under the License.
 
 namespace xla {
 
-// Combine an AllReduce and a CrossReplicaSum when they are close to each other
-// in the graph, to use an efficient CrossReplicaSum implementation that
-// fully utilizes the interconnect bandwidth.
+// When the HLO graph contains a cross-module AllReduce, followed by some simple
+// linear operations, followed by a cross-replica AllReduce (also known as
+// cross-replica sum, or CRS), we can combine the CMAR and the CRAR, to use an
+// efficient AllReduce implementation that fully utilizes the interconnect
+// bandwidth.
+// Such sequences appear in spatially partitioned models.
+// This pass must run right after spatial partitioning, when the code is still
+// in a single HLO module.
+//
+// The steps are:
+// 1) Find CMARs followed by simple ops followed by CRARs.
+// 2) Group CMARs by all_reduce_id. They must all be rewritten.
+// 3) Prove that the CMAR patterns in each core produce the same result.
+// 4) Eliminate the CMAR, and if it feeds an addition/subtraction, divide the
+//    other operand by the number of spatial partitions.
+// 5) Turn the CRAR into an all-core AllReduce.
+//
+// The pass also handles the case where multiple CMARs lead to the same CRAR,
+// and eliminates all CMARs. This graph:
+//
+//        Y
+//        |
+//  X   CMAR_2   Z
+//  |      \    /
+// CMAR_1     +
+//    \     /
+//       +
+//       |
+//     CRAR
+//
+// gets rewritten to:
+//
+//           Z   num_partitions
+//            \  /
+//       Y    div
+//        \   /
+//    X     +
+//     \   /
+//       +
+//       |
+//  all-core AR
+//
 class ArCrsCombiner : public HloModulePass {
  public:
   ArCrsCombiner(int num_spatial_partitions)
@@ -40,6 +79,28 @@ class ArCrsCombiner : public HloModulePass {
                                                HloInstruction* i2);
 
  private:
+  // We used this struct because multiple ARs could be paired with the same CRS.
+  // In this case, we want to select the AR that is furthest from the CRS,
+  // because it makes it easier to eliminate all ARs during RewriteGraph.
+  struct ArCrsPair {
+    HloInstruction* ar;
+    HloInstruction* crs;
+    // The length of the path from AR to CRS in the HLO graph.
+    int64 distance;
+
+    ArCrsPair(HloInstruction* all_reduce, HloInstruction* cross_replica_sum,
+              int64 dist)
+        : ar(all_reduce), crs(cross_replica_sum), distance(dist) {}
+
+    string ToString() {
+      return absl::StrCat("(AR: ", ar->name(), ", CRS: ", crs->name(),
+                          ", distance: ", distance, ")");
+    }
+  };
+
+  absl::optional<ArCrsCombiner::ArCrsPair> MatchesArCrsPattern(
+      HloInstruction* instruction);
+
   // If the passed instruction is a while parameter, and the while body is only
   // called by a single while instruction, return the while instruction.
   absl::optional<HloInstruction*> WhileFromBodyParameter(
@@ -77,8 +138,13 @@ class ArCrsCombiner : public HloModulePass {
 
   int num_spatial_partitions_;
 
-  // Map from all-reduce ids to the all reduce instructions.
-  absl::flat_hash_map<int64, std::vector<HloInstruction*>> all_reduce_map_;
+  // Map from all-reduce ids to the AR/CRS pairs.
+  absl::flat_hash_map<int64, std::vector<ArCrsPair>> all_reduce_map_;
+
+  // Map from a CRS instruction to the all-reduce ID of the AR paired with the
+  // CRS. Sometimes, several ARs in the code could be paired with the same CRS.
+  // We use this map to pick a single AR/CRS path to rewrite.
+  absl::flat_hash_map<HloInstruction*, int64> crs_reserved_map_;
 
   std::unique_ptr<CallGraph> call_graph_;
 };
diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc b/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc
index 10171835d83c75fef091a34b8fe102d263211307..9c9db74fd2fdab836f91d2f749d08ad93f8879b0 100644
--- a/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc
+++ b/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc
@@ -32,8 +32,8 @@ HloModule foobar
 
 ENTRY %entrycomp (p: f32[2,2]) -> (f32[2,2], f32[2,2]) {
   %p = f32[2,2] parameter(0)
-  %constant.f32.1 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
-  %constant.f32.2 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  %constant.f32.1 = f32[2,2] constant({{1, 2}, {3, 4}})
+  %constant.f32.2 = f32[2,2] constant({{1, 2}, {3, 4}})
   ROOT %tuple = (f32[2,2], f32[2,2]) tuple(%constant.f32.1, %constant.f32.2)
 }
 )";
@@ -91,7 +91,7 @@ HloModule foobar
 
 ENTRY %entrycomp (p: f32[2,2]) -> ((f32[2,2]), (f32[2,2], f32[2,2])) {
   %p = f32[2,2] parameter(0)
-  %constant.f32 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  %constant.f32 = f32[2,2] constant({{1, 2}, {3, 4}})
   %tuple1 = (f32[2,2]) tuple(%constant.f32)
   %tuple2 = (f32[2,2], f32[2,2]) tuple(%constant.f32, %constant.f32)
   ROOT %tuple = ((f32[2,2]), (f32[2,2], f32[2,2])) tuple(%tuple1, %tuple2)
@@ -152,7 +152,7 @@ HloModule foobar
 
 ENTRY %entrycomp (p: f32[2,2]) -> (f32[2,2], f32[2,2]) {
   %p = f32[2,2] parameter(0)
-  %constant.f32 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  %constant.f32 = f32[2,2] constant({{1, 2}, {3, 4}})
   %tuple.1 = (f32[2,2], f32[2,2]) tuple(%constant.f32, %constant.f32)
   %get-tuple-element.1 = f32[2,2] get-tuple-element(%tuple.1), index=0
   %get-tuple-element.2 = f32[2,2] get-tuple-element(%tuple.1), index=0
@@ -174,7 +174,7 @@ HloModule foobar
 
 ENTRY %entrycomp (p: f32[2,2]) -> (f32[2,2], f32[2,2]) {
   %p = f32[2,2] parameter(0)
-  %constant.f32 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  %constant.f32 = f32[2,2] constant({{1, 2}, {3, 4}})
   %tuple.1 = (f32[2,2], f32[2,2]) tuple(%constant.f32, %constant.f32)
   %get-tuple-element.1 = f32[2,2] get-tuple-element(%tuple.1), index=0
   %get-tuple-element.2 = f32[2,2] get-tuple-element(%tuple.1), index=1
@@ -196,8 +196,8 @@ HloModule foobar
 
 ENTRY %entrycomp (p: f32[2,2]) -> (f32[2,2], f32[2,2]) {
   %p = f32[2,2] parameter(0)
-  %constant.f32.1 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
-  %constant.f32.2 = f32[2,2] constant(f32[2,2] {{2, 3}, {4, 5}})
+  %constant.f32.1 = f32[2,2] constant({{1, 2}, {3, 4}})
+  %constant.f32.2 = f32[2,2] constant({{2, 3}, {4, 5}})
   %tuple.1 = (f32[2,2], f32[2,2]) tuple(%constant.f32.1, %constant.f32.2)
   %get-tuple-element.1 = f32[2,2] get-tuple-element(%tuple.1), index=0
   %get-tuple-element.2 = f32[2,2] get-tuple-element(%tuple.1), index=1
@@ -226,7 +226,7 @@ HloModule foobar
 
 %body (x: (f32[2,2], f32[2,2])) -> (f32[2,2], f32[2,2]) {
   %x = (f32[2,2], f32[2,2]) parameter(0)
-  %constant.f32 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  %constant.f32 = f32[2,2] constant({{1, 2}, {3, 4}})
   %get-tuple-element.1 = f32[2,2] get-tuple-element(%x), index=0
   %get-tuple-element.2 = f32[2,2] get-tuple-element(%x), index=1
   %add.1 = f32[2,2] add(%get-tuple-element.1, %constant.f32)
@@ -235,7 +235,7 @@ HloModule foobar
 }
 
 ENTRY %WhileLoop () -> (f32[2,2], f32[2,2]) {
-  %constant.f32 = f32[2,2] constant(f32[2,2] {{3, 4}, {5, 6}})
+  %constant.f32 = f32[2,2] constant({{3, 4}, {5, 6}})
   %init.tuple = (f32[2,2], f32[2,2]) tuple(%constant.f32, %constant.f32)
   ROOT %while = (f32[2,2], f32[2,2]) while(%init.tuple), condition=%condition, body=%body
 }
@@ -263,7 +263,7 @@ HloModule foobar
 
 %body (x: (f32[2,2], f32[2,2])) -> (f32[2,2], f32[2,2]) {
   %x = (f32[2,2], f32[2,2]) parameter(0)
-  %constant.f32 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  %constant.f32 = f32[2,2] constant({{1, 2}, {3, 4}})
   %get-tuple-element.1 = f32[2,2] get-tuple-element(%x), index=0
   %get-tuple-element.2 = f32[2,2] get-tuple-element(%x), index=1
   %add.1 = f32[2,2] add(%get-tuple-element.1, %constant.f32)
@@ -272,8 +272,8 @@ HloModule foobar
 }
 
 ENTRY %WhileLoop () -> (f32[2,2], f32[2,2]) {
-  %constant.f32.1 = f32[2,2] constant(f32[2,2] {{3, 4}, {5, 6}})
-  %constant.f32.2 = f32[2,2] constant(f32[2,2] {{3, 4}, {7, 8}})
+  %constant.f32.1 = f32[2,2] constant({{3, 4}, {5, 6}})
+  %constant.f32.2 = f32[2,2] constant({{3, 4}, {7, 8}})
   %init.tuple = (f32[2,2], f32[2,2]) tuple(%constant.f32.1, %constant.f32.2)
   ROOT %while = (f32[2,2], f32[2,2]) while(%init.tuple), condition=%condition, body=%body
 }
@@ -301,8 +301,8 @@ HloModule foobar
 
 %body (x: (f32[2,2], f32[2,2])) -> (f32[2,2], f32[2,2]) {
   %x = (f32[2,2], f32[2,2]) parameter(0)
-  %constant.f32.1 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
-  %constant.f32.2 = f32[2,2] constant(f32[2,2] {{3, 4}, {1, 2}})
+  %constant.f32.1 = f32[2,2] constant({{1, 2}, {3, 4}})
+  %constant.f32.2 = f32[2,2] constant({{3, 4}, {1, 2}})
   %get-tuple-element.1 = f32[2,2] get-tuple-element(%x), index=0
   %get-tuple-element.2 = f32[2,2] get-tuple-element(%x), index=1
   %add.1 = f32[2,2] add(%get-tuple-element.1, %constant.f32.1)
@@ -311,7 +311,7 @@ HloModule foobar
 }
 
 ENTRY %WhileLoop () -> (f32[2,2], f32[2,2]) {
-  %constant.f32 = f32[2,2] constant(f32[2,2] {{3, 4}, {5, 6}})
+  %constant.f32 = f32[2,2] constant({{3, 4}, {5, 6}})
   %init.tuple = (f32[2,2], f32[2,2]) tuple(%constant.f32, %constant.f32)
   ROOT %while = (f32[2,2], f32[2,2]) while(%init.tuple), condition=%condition, body=%body
 }
@@ -326,11 +326,27 @@ ENTRY %WhileLoop () -> (f32[2,2], f32[2,2]) {
   EXPECT_FALSE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2));
 }
 
-TEST_F(ArCrsCombinerTest, RewritePatternArConvertAddCrs) {
+void CompareReplicaGroups(const std::vector<ReplicaGroup>& groups_before,
+                          const std::vector<ReplicaGroup>& groups_after) {
+  ASSERT_EQ(groups_before.size(), groups_after.size());
+  for (int i = 0; i < groups_before.size(); ++i) {
+    // Somewhat verbose way to compare the replica_ids, because EqualsProto
+    // is not available in the open-source build.
+    auto group_before = groups_before[i];
+    std::vector<int64> ids_before(group_before.replica_ids().begin(),
+                                  group_before.replica_ids().end());
+    auto group_after = groups_after[i];
+    std::vector<int64> ids_after(group_after.replica_ids().begin(),
+                                 group_after.replica_ids().end());
+    EXPECT_EQ(ids_before, ids_after);
+  }
+}
+
+TEST_F(ArCrsCombinerTest, RewriteArConvertCrs) {
   const char* module_str = R"(
 HloModule foobar
 
-%binary_add (a: bf16[], b: bf16[]) -> bf16[] {
+%sum.bf16 (a: bf16[], b: bf16[]) -> bf16[] {
   %a = bf16[] parameter(0)
   %b = bf16[] parameter(1)
   ROOT %add = bf16[] add(%a, %b)
@@ -342,49 +358,258 @@ HloModule foobar
   ROOT %add = f32[] add(%x, %y)
 }
 
-ENTRY %entrycomp (p: f32[2,2]) -> (f32[2,2], f32[2,2]) {
-  %p = f32[2,2] parameter(0)
-  %constant.bf16 = bf16[2,2] constant(bf16[2,2] {{1, 2}, {3, 4}})
-  %constant.f32 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+ENTRY %entrycomp (p: bf16[]) -> (f32[], f32[]) {
+  %p = bf16[] parameter(0)
+  %constant.bf16 = bf16[] constant(1)
+
+  %all-reduce.ar.1 = bf16[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%sum.bf16,
+      sharding={maximal device=0}
+  %convert.1 = f32[]
+      convert(%all-reduce.ar.1),
+      sharding={maximal device=0}
+  %all-reduce.1 = f32[]
+      all-reduce(%convert.1),
+      replica_groups={{0,1}},
+      to_apply=%sum.f32,
+      sharding={maximal device=0}
+
+  %all-reduce.ar.2 = bf16[]
+      all-reduce(%constant.bf16),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%sum.bf16,
+      sharding={maximal device=1}
+  %convert.2 = f32[]
+      convert(%all-reduce.ar.2),
+      sharding={maximal device=1}
+  %all-reduce.2 = f32[]
+      all-reduce(%convert.2),
+      replica_groups={{0,1}},
+      to_apply=%sum.f32,
+      sharding={maximal device=1}
+
+  ROOT %tuple = (f32[], f32[])
+      tuple(%all-reduce.1, %all-reduce.2),
+      sharding={{maximal device=0}, {maximal device=1}}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto crs_before =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_before = crs_before->replica_groups();
+  ArCrsCombiner combiner(2);
+  auto changed = combiner.Run(module.get()).ValueOrDie();
+  EXPECT_TRUE(changed);
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              op::Tuple(op::AllReduce(op::Convert(op::Parameter())),
+                        op::AllReduce(op::Convert(op::Constant()))));
+  auto crs_after =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_after = crs_after->replica_groups();
+  CompareReplicaGroups(replica_groups_before, replica_groups_after);
+}
+
+TEST_F(ArCrsCombinerTest, RewriteArBitcastCrs) {
+  const char* module_str = R"(
+HloModule foobar
+
+%sum.1 (a: f32[2,1], b: f32[2,1]) -> f32[2,1] {
+  %a = f32[2,1] parameter(0)
+  %b = f32[2,1] parameter(1)
+  ROOT %add = f32[2,1] add(%a, %b)
+}
+
+%sum.2 (x: f32[2], y: f32[2]) -> f32[2] {
+  %x = f32[2] parameter(0)
+  %y = f32[2] parameter(1)
+  ROOT %add = f32[2] add(%x, %y)
+}
 
-  %cross-replica-sum.ar.1 = bf16[2,2]
-      cross-replica-sum(%constant.bf16),
+ENTRY %entrycomp (p: f32[2,1]) -> (f32[2], f32[2]) {
+  %p = f32[2,1] parameter(0)
+
+  %all-reduce.ar.1 = f32[2,1]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%sum.1,
+      sharding={maximal device=0}
+  %bitcast.1 = f32[2]{0} bitcast(f32[2,1]{1,0} %all-reduce.ar.1)
+  %all-reduce.1 = f32[2]
+      all-reduce(%bitcast.1),
+      replica_groups={{0,1}},
+      to_apply=%sum.2,
+      sharding={maximal device=0}
+
+  %all-reduce.ar.2 = f32[2,1]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%sum.1,
+      sharding={maximal device=1}
+  %bitcast.2 = f32[2]{0} bitcast(f32[2,1]{1,0} %all-reduce.ar.2)
+  %all-reduce.2 = f32[2]
+      all-reduce(%bitcast.2),
+      replica_groups={{0,1}},
+      to_apply=%sum.2,
+      sharding={maximal device=1}
+
+  ROOT %tuple = (f32[], f32[])
+      tuple(%all-reduce.1, %all-reduce.2),
+      sharding={{maximal device=0}, {maximal device=1}}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto crs_before =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_before = crs_before->replica_groups();
+  ArCrsCombiner combiner(2);
+  auto changed = combiner.Run(module.get()).ValueOrDie();
+  EXPECT_TRUE(changed);
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              op::Tuple(op::AllReduce(op::Bitcast(op::Parameter())),
+                        op::AllReduce(op::Bitcast(op::Parameter()))));
+  auto crs_after =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_after = crs_after->replica_groups();
+  CompareReplicaGroups(replica_groups_before, replica_groups_after);
+}
+
+TEST_F(ArCrsCombinerTest, RewriteArMultiplyCrs) {
+  const char* module_str = R"(
+HloModule foobar
+
+%sum.f32 (x: f32[], y: f32[]) -> f32[] {
+  %x = f32[] parameter(0)
+  %y = f32[] parameter(1)
+  ROOT %add = f32[] add(%x, %y)
+}
+
+ENTRY %entrycomp (p: f32[]) -> (f32[], f32[]) {
+  %p = f32[] parameter(0)
+  %constant.f32 = f32[] constant(123)
+
+  %all-reduce.ar.1 = f32[]
+      all-reduce(%p),
       replica_groups={{0},{1}},
       all_reduce_id=1,
-      to_apply=%binary_add,
+      to_apply=%sum.f32,
       sharding={maximal device=0}
-  %convert.1 = f32[2,2]
-      convert(%cross-replica-sum.ar.1),
+  %multiply.1 = f32[]
+      multiply(%all-reduce.ar.1, %constant.f32),
       sharding={maximal device=0}
-  %add.1 = f32[2,2]
+  %all-reduce.1 = f32[]
+      all-reduce(%multiply.1),
+      replica_groups={{0,1}},
+      to_apply=%sum.f32,
+      sharding={maximal device=0}
+
+  %all-reduce.ar.2 = f32[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%sum.f32,
+      sharding={maximal device=1}
+  %multiply.2 = f32[]
+      multiply(%all-reduce.ar.2, %constant.f32),
+      sharding={maximal device=1}
+  %all-reduce.2 = f32[]
+      all-reduce(%multiply.2),
+      replica_groups={{0,1}},
+      to_apply=%sum.f32,
+      sharding={maximal device=1}
+
+  ROOT %tuple = (f32[], f32[])
+      tuple(%all-reduce.1, %all-reduce.2),
+      sharding={{maximal device=0}, {maximal device=1}}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto crs_before =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_before = crs_before->replica_groups();
+  ArCrsCombiner combiner(2);
+  auto changed = combiner.Run(module.get()).ValueOrDie();
+  EXPECT_TRUE(changed);
+  EXPECT_THAT(
+      module->entry_computation()->root_instruction(),
+      op::Tuple(op::AllReduce(op::Multiply(op::Parameter(), op::Constant())),
+                op::AllReduce(op::Multiply(op::Parameter(), op::Constant()))));
+  auto crs_after =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_after = crs_after->replica_groups();
+  CompareReplicaGroups(replica_groups_before, replica_groups_after);
+}
+
+TEST_F(ArCrsCombinerTest, RewriteArConvertAddCrs) {
+  const char* module_str = R"(
+HloModule foobar
+
+%sum.bf16 (a: bf16[], b: bf16[]) -> bf16[] {
+  %a = bf16[] parameter(0)
+  %b = bf16[] parameter(1)
+  ROOT %add = bf16[] add(%a, %b)
+}
+
+%sum.f32 (x: f32[], y: f32[]) -> f32[] {
+  %x = f32[] parameter(0)
+  %y = f32[] parameter(1)
+  ROOT %add = f32[] add(%x, %y)
+}
+
+ENTRY %entrycomp (p: f32[]) -> (f32[], f32[]) {
+  %p = f32[] parameter(0)
+  %constant.bf16 = bf16[] constant(1)
+  %constant.f32 = f32[] constant(2)
+
+  %all-reduce.ar.1 = bf16[]
+      all-reduce(%constant.bf16),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%sum.bf16,
+      sharding={maximal device=0}
+  %convert.1 = f32[]
+      convert(%all-reduce.ar.1),
+      sharding={maximal device=0}
+  %add.1 = f32[]
       add(%constant.f32, %convert.1),
       sharding={maximal device=0}
-  %cross-replica-sum.1 = f32[2,2]
-      cross-replica-sum(%add.1),
+  %all-reduce.1 = f32[]
+      all-reduce(%add.1),
       replica_groups={{0,1}},
       to_apply=%sum.f32,
       sharding={maximal device=0}
 
-  %cross-replica-sum.ar.2 = bf16[2,2]
-      cross-replica-sum(%constant.bf16),
+  %all-reduce.ar.2 = bf16[]
+      all-reduce(%constant.bf16),
       replica_groups={{0},{1}},
       all_reduce_id=1,
-      to_apply=%binary_add,
+      to_apply=%sum.bf16,
       sharding={maximal device=1}
-  %convert.2 = f32[2,2]
-      convert(%cross-replica-sum.ar.2),
+  %convert.2 = f32[]
+      convert(%all-reduce.ar.2),
       sharding={maximal device=1}
-  %add.2 = f32[2,2]
+  %add.2 = f32[]
       add(%constant.f32, %convert.2),
       sharding={maximal device=1}
-  %cross-replica-sum.2 = f32[2,2]
-      cross-replica-sum(%add.2),
+  %all-reduce.2 = f32[]
+      all-reduce(%add.2),
       replica_groups={{0,1}},
       to_apply=%sum.f32,
       sharding={maximal device=1}
 
-  ROOT %tuple = (f32[2,2], f32[2,2])
-      tuple(%cross-replica-sum.1, %cross-replica-sum.2),
+  ROOT %tuple = (f32[], f32[])
+      tuple(%all-reduce.1, %all-reduce.2),
       sharding={{maximal device=0}, {maximal device=1}}
 }
 )";
@@ -400,32 +625,21 @@ ENTRY %entrycomp (p: f32[2,2]) -> (f32[2,2], f32[2,2]) {
   EXPECT_THAT(
       module->entry_computation()->root_instruction(),
       op::Tuple(
-          op::CrossReplicaSum(op::Add(
-              op::Divide(op::Constant(), op::Constant()), op::Convert())),
-          op::CrossReplicaSum(op::Add(
-              op::Divide(op::Constant(), op::Constant()), op::Convert()))));
+          op::AllReduce(op::Add(op::Divide(op::Constant(), op::Constant()),
+                                op::Convert())),
+          op::AllReduce(op::Add(op::Divide(op::Constant(), op::Constant()),
+                                op::Convert()))));
   auto crs_after =
       module->entry_computation()->root_instruction()->operands()[0];
   auto replica_groups_after = crs_after->replica_groups();
-  ASSERT_EQ(replica_groups_before.size(), replica_groups_after.size());
-  for (int i = 0; i < replica_groups_before.size(); ++i) {
-    // Somewhat verbose way to compare the replica_ids, because EqualsProto
-    // is not available in the open-source build.
-    auto group_before = replica_groups_before[i];
-    std::vector<int64> ids_before(group_before.replica_ids().begin(),
-                                  group_before.replica_ids().end());
-    auto group_after = replica_groups_after[i];
-    std::vector<int64> ids_after(group_after.replica_ids().begin(),
-                                 group_after.replica_ids().end());
-    EXPECT_EQ(ids_before, ids_after);
-  }
+  CompareReplicaGroups(replica_groups_before, replica_groups_after);
 }
 
 TEST_F(ArCrsCombinerTest, OtherSummandNotTheSameDontRewrite) {
   const char* module_str = R"(
 HloModule foobar
 
-%binary_add (a: bf16[], b: bf16[]) -> bf16[] {
+%sum.bf16 (a: bf16[], b: bf16[]) -> bf16[] {
   %a = bf16[] parameter(0)
   %b = bf16[] parameter(1)
   ROOT %add = bf16[] add(%a, %b)
@@ -437,50 +651,517 @@ HloModule foobar
   ROOT %add = f32[] add(%x, %y)
 }
 
-ENTRY %entrycomp (p: f32[2,2]) -> (f32[2,2], f32[2,2]) {
-  %p = f32[2,2] parameter(0)
-  %constant.bf16 = bf16[2,2] constant(bf16[2,2] {{1, 2}, {3, 4}})
-  %constant.f32.1 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
-  %constant.f32.2 = f32[2,2] constant(f32[2,2] {{3, 4}, {5, 6}})
+ENTRY %entrycomp (p: f32[]) -> (f32[], f32[]) {
+  %p = f32[] parameter(0)
+  %constant.bf16 = bf16[] constant(1)
+  %constant.f32.1 = f32[] constant(2)
+  %constant.f32.2 = f32[] constant(3)
 
-  %cross-replica-sum.ar.1 = bf16[2,2]
-      cross-replica-sum(%constant.bf16),
+  %all-reduce.ar.1 = bf16[]
+      all-reduce(%constant.bf16),
       replica_groups={{0},{1}},
       all_reduce_id=1,
-      to_apply=%binary_add,
+      to_apply=%sum.bf16,
       sharding={maximal device=0}
-  %convert.1 = f32[2,2]
-      convert(%cross-replica-sum.ar.1),
+  %convert.1 = f32[]
+      convert(%all-reduce.ar.1),
       sharding={maximal device=0}
-  %add.1 = f32[2,2]
+  %add.1 = f32[]
       add(%constant.f32.1, %convert.1),
       sharding={maximal device=0}
-  %cross-replica-sum.1 = f32[2,2]
-      cross-replica-sum(%add.1),
+  %all-reduce.1 = f32[]
+      all-reduce(%add.1),
       replica_groups={{0,1}},
       to_apply=%sum.f32,
       sharding={maximal device=0}
 
-  %cross-replica-sum.ar.2 = bf16[2,2]
-      cross-replica-sum(%constant.bf16),
+  %all-reduce.ar.2 = bf16[]
+      all-reduce(%constant.bf16),
       replica_groups={{0},{1}},
       all_reduce_id=1,
-      to_apply=%binary_add,
+      to_apply=%sum.bf16,
       sharding={maximal device=1}
-  %convert.2 = f32[2,2]
-      convert(%cross-replica-sum.ar.2),
+  %convert.2 = f32[]
+      convert(%all-reduce.ar.2),
       sharding={maximal device=1}
-  %add.2 = f32[2,2]
+  %add.2 = f32[]
       add(%constant.f32.2, %convert.2),
       sharding={maximal device=1}
-  %cross-replica-sum.2 = f32[2,2]
-      cross-replica-sum(%add.2),
+  %all-reduce.2 = f32[]
+      all-reduce(%add.2),
+      replica_groups={{0,1}},
+      to_apply=%sum.f32,
+      sharding={maximal device=1}
+
+  ROOT %tuple = (f32[], f32[])
+      tuple(%all-reduce.1, %all-reduce.2),
+      sharding={{maximal device=0}, {maximal device=1}}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  ArCrsCombiner combiner(2);
+  auto changed = combiner.Run(module.get()).ValueOrDie();
+  EXPECT_FALSE(changed);
+}
+
+TEST_F(ArCrsCombinerTest, ArThenCrsDontCrash) {
+  const char* module_str = R"(
+HloModule foobar
+
+%sum.1 (a: f32[], b: f32[]) -> f32[] {
+  %a = f32[] parameter(0)
+  %b = f32[] parameter(1)
+  ROOT %add = f32[] add(%a, %b)
+}
+
+ENTRY %entrycomp (p: f32[]) -> (f32[], f32[]) {
+  %p = f32[] parameter(0)
+  %constant.f32 = f32[] constant(123)
+
+  %all-reduce.ar.1 = f32[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%sum.1,
+      sharding={maximal device=0}
+  %all-reduce.1 = f32[]
+      all-reduce(%all-reduce.ar.1),
+      replica_groups={{0,1}},
+      to_apply=%sum.1,
+      sharding={maximal device=0}
+  %multiply.1 = f32[]
+      multiply(%all-reduce.1, %constant.f32),
+      sharding={maximal device=0}
+
+  %all-reduce.ar.2 = f32[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%sum.1,
+      sharding={maximal device=1}
+  %all-reduce.2 = f32[]
+      all-reduce(%all-reduce.ar.2),
+      replica_groups={{0,1}},
+      to_apply=%sum.1,
+      sharding={maximal device=1}
+  %multiply.2 = f32[]
+      multiply(%all-reduce.2, %constant.f32),
+      sharding={maximal device=1}
+
+  ROOT %tuple = (f32[], f32[])
+      tuple(%all-reduce.1, %all-reduce.2),
+      sharding={{maximal device=0}, {maximal device=1}}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto crs_before =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_before = crs_before->replica_groups();
+  ArCrsCombiner combiner(2);
+  auto changed = combiner.Run(module.get()).ValueOrDie();
+  EXPECT_TRUE(changed);
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              op::Tuple(op::AllReduce(op::Parameter()),
+                        op::AllReduce(op::Parameter())));
+  auto crs_after =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_after = crs_after->replica_groups();
+  CompareReplicaGroups(replica_groups_before, replica_groups_after);
+}
+
+TEST_F(ArCrsCombinerTest, RewriteMultipleAdds) {
+  const char* module_str = R"(
+HloModule foobar
+
+%sum (x: f32[], y: f32[]) -> f32[] {
+  %x = f32[] parameter(0)
+  %y = f32[] parameter(1)
+  ROOT %add = f32[] add(%x, %y)
+}
+
+ENTRY %entrycomp (p: f32[]) -> (f32[], f32[]) {
+  %p = f32[] parameter(0)
+  %constant.1 = f32[] constant(1)
+  %constant.2 = f32[] constant(2)
+
+  %all-reduce.ar.1 = f32[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%sum,
+      sharding={maximal device=0}
+  %add.11 = f32[]
+      add(%constant.1, %all-reduce.ar.1),
+      sharding={maximal device=0}
+  %add.12 = f32[]
+      add(%constant.2, %add.11),
+      sharding={maximal device=0}
+  %all-reduce.1 = f32[]
+      all-reduce(%add.12),
+      replica_groups={{0,1}},
+      to_apply=%sum,
+      sharding={maximal device=0}
+
+  %all-reduce.ar.2 = f32[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%sum,
+      sharding={maximal device=0}
+  %add.21 = f32[]
+      add(%constant.1, %all-reduce.ar.2),
+      sharding={maximal device=0}
+  %add.22 = f32[]
+      add(%constant.2, %add.21),
+      sharding={maximal device=0}
+  %all-reduce.2 = f32[]
+      all-reduce(%add.22),
+      replica_groups={{0,1}},
+      to_apply=%sum,
+      sharding={maximal device=0}
+
+  ROOT %tuple = (f32[], f32[])
+      tuple(%all-reduce.1, %all-reduce.2),
+      sharding={{maximal device=0}, {maximal device=1}}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto crs_before =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_before = crs_before->replica_groups();
+  ArCrsCombiner combiner(2);
+  auto changed = combiner.Run(module.get()).ValueOrDie();
+  EXPECT_TRUE(changed);
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              op::Tuple(op::AllReduce(op::Add(
+                            op::Divide(op::Constant(), op::Constant()),
+                            op::Add(op::Divide(op::Constant(), op::Constant()),
+                                    op::Parameter()))),
+                        op::AllReduce(op::Add(
+                            op::Divide(op::Constant(), op::Constant()),
+                            op::Add(op::Divide(op::Constant(), op::Constant()),
+                                    op::Parameter())))));
+  auto crs_after =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_after = crs_after->replica_groups();
+  CompareReplicaGroups(replica_groups_before, replica_groups_after);
+}
+
+TEST_F(ArCrsCombinerTest, RewriteArSubtractCrs) {
+  const char* module_str = R"(
+HloModule foobar
+
+%sum.f32 (x: f32[], y: f32[]) -> f32[] {
+  %x = f32[] parameter(0)
+  %y = f32[] parameter(1)
+  ROOT %add = f32[] add(%x, %y)
+}
+
+ENTRY %entrycomp (p: f32[]) -> (f32[], f32[]) {
+  %p = f32[] parameter(0)
+  %constant.f32 = f32[] constant(123)
+
+  %all-reduce.ar.1 = f32[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%sum.f32,
+      sharding={maximal device=0}
+  %sub.1 = f32[]
+      subtract(%constant.f32, %all-reduce.ar.1),
+      sharding={maximal device=0}
+  %all-reduce.1 = f32[]
+      all-reduce(%sub.1),
+      replica_groups={{0,1}},
+      to_apply=%sum.f32,
+      sharding={maximal device=0}
+
+  %all-reduce.ar.2 = f32[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%sum.f32,
+      sharding={maximal device=1}
+  %sub.2 = f32[]
+      subtract(%constant.f32, %all-reduce.ar.2),
+      sharding={maximal device=1}
+  %all-reduce.2 = f32[]
+      all-reduce(%sub.2),
       replica_groups={{0,1}},
       to_apply=%sum.f32,
       sharding={maximal device=1}
 
-  ROOT %tuple = (f32[2,2], f32[2,2])
-      tuple(%cross-replica-sum.1, %cross-replica-sum.2),
+  ROOT %tuple = (f32[], f32[])
+      tuple(%all-reduce.1, %all-reduce.2),
+      sharding={{maximal device=0}, {maximal device=1}}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto crs_before =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_before = crs_before->replica_groups();
+  ArCrsCombiner combiner(2);
+  auto changed = combiner.Run(module.get()).ValueOrDie();
+  EXPECT_TRUE(changed);
+  EXPECT_THAT(
+      module->entry_computation()->root_instruction(),
+      op::Tuple(
+          op::AllReduce(op::Subtract(op::Divide(op::Constant(), op::Constant()),
+                                     op::Parameter())),
+          op::AllReduce(op::Subtract(op::Divide(op::Constant(), op::Constant()),
+                                     op::Parameter()))));
+  auto crs_after =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_after = crs_after->replica_groups();
+  CompareReplicaGroups(replica_groups_before, replica_groups_after);
+}
+
+TEST_F(ArCrsCombinerTest, RewriteMultipleARsLeft) {
+  const char* module_str = R"(
+HloModule foobar
+
+%sum (x: f32[], y: f32[]) -> f32[] {
+  %x = f32[] parameter(0)
+  %y = f32[] parameter(1)
+  ROOT %add = f32[] add(%x, %y)
+}
+
+ENTRY %entrycomp (p: f32[]) -> (f32[], f32[]) {
+  %p = f32[] parameter(0)
+  %const1 = f32[] constant(1)
+  %const2 = f32[] constant(2)
+
+  %ar11 = f32[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%sum,
+      sharding={maximal device=0}
+  %add11 = f32[]
+      add(%ar11, %const1),
+      sharding={maximal device=0}
+  %ar12 = f32[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=2,
+      to_apply=%sum,
+      sharding={maximal device=0}
+  %add12 = f32[]
+      add(%add11, %ar12),
+      sharding={maximal device=0}
+  %crs1 = f32[]
+      all-reduce(%add12),
+      replica_groups={{0,1}},
+      to_apply=%sum,
+      sharding={maximal device=0}
+
+  %ar21 = f32[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%sum,
+      sharding={maximal device=1}
+  %add21 = f32[]
+      add(%ar21, %const1),
+      sharding={maximal device=1}
+  %ar22 = f32[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=2,
+      to_apply=%sum,
+      sharding={maximal device=1}
+  %add22 = f32[]
+      add(%add21, %ar22),
+      sharding={maximal device=1}
+  %crs2 = f32[]
+      all-reduce(%add22),
+      replica_groups={{0,1}},
+      to_apply=%sum,
+      sharding={maximal device=1}
+
+  ROOT %tuple = (f32[], f32[])
+      tuple(%crs1, %crs2),
+      sharding={{maximal device=0}, {maximal device=1}}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto crs_before =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_before = crs_before->replica_groups();
+  ArCrsCombiner combiner(2);
+  auto changed = combiner.Run(module.get()).ValueOrDie();
+  EXPECT_TRUE(changed);
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              op::Tuple(op::AllReduce(op::Add(
+                            op::Add(op::Parameter(),
+                                    op::Divide(op::Constant(), op::Constant())),
+                            op::Parameter())),
+                        op::AllReduce(op::Add(
+                            op::Add(op::Parameter(),
+                                    op::Divide(op::Constant(), op::Constant())),
+                            op::Parameter()))));
+  auto crs_after =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_after = crs_after->replica_groups();
+  CompareReplicaGroups(replica_groups_before, replica_groups_after);
+}
+
+TEST_F(ArCrsCombinerTest, RewriteMultipleARsRight) {
+  const char* module_str = R"(
+HloModule foobar
+
+%sum (x: f32[], y: f32[]) -> f32[] {
+  %x = f32[] parameter(0)
+  %y = f32[] parameter(1)
+  ROOT %add = f32[] add(%x, %y)
+}
+
+ENTRY %entrycomp (p: f32[]) -> (f32[], f32[]) {
+  %p = f32[] parameter(0)
+  %const1 = f32[] constant(1)
+  %const2 = f32[] constant(2)
+
+  %ar11 = f32[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%sum,
+      sharding={maximal device=0}
+  %ar12 = f32[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=2,
+      to_apply=%sum,
+      sharding={maximal device=0}
+  %add11 = f32[]
+      add(%ar12, %const1),
+      sharding={maximal device=0}
+  %add12 = f32[]
+      add(%ar11, %add11),
+      sharding={maximal device=0}
+  %crs1 = f32[]
+      all-reduce(%add12),
+      replica_groups={{0,1}},
+      to_apply=%sum,
+      sharding={maximal device=0}
+
+  %ar21 = f32[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%sum,
+      sharding={maximal device=1}
+  %ar22 = f32[]
+      all-reduce(%p),
+      replica_groups={{0},{1}},
+      all_reduce_id=2,
+      to_apply=%sum,
+      sharding={maximal device=1}
+  %add21 = f32[]
+      add(%ar22, %const1),
+      sharding={maximal device=1}
+  %add22 = f32[]
+      add(%ar21, %add21),
+      sharding={maximal device=1}
+  %crs2 = f32[]
+      all-reduce(%add22),
+      replica_groups={{0,1}},
+      to_apply=%sum,
+      sharding={maximal device=1}
+
+  ROOT %tuple = (f32[], f32[])
+      tuple(%crs1, %crs2),
+      sharding={{maximal device=0}, {maximal device=1}}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto crs_before =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_before = crs_before->replica_groups();
+  ArCrsCombiner combiner(2);
+  auto changed = combiner.Run(module.get()).ValueOrDie();
+  EXPECT_TRUE(changed);
+  EXPECT_THAT(
+      module->entry_computation()->root_instruction(),
+      op::Tuple(op::AllReduce(op::Add(
+                    op::Parameter(),
+                    op::Add(op::Parameter(),
+                            op::Divide(op::Constant(), op::Constant())))),
+                op::AllReduce(op::Add(
+                    op::Parameter(),
+                    op::Add(op::Parameter(),
+                            op::Divide(op::Constant(), op::Constant()))))));
+
+  auto crs_after =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_after = crs_after->replica_groups();
+  CompareReplicaGroups(replica_groups_before, replica_groups_after);
+}
+
+TEST_F(ArCrsCombinerTest, OneReplicaDontRewrite) {
+  const char* module_str = R"(
+HloModule foobar
+
+%sum.bf16 (a: bf16[], b: bf16[]) -> bf16[] {
+  %a = bf16[] parameter(0)
+  %b = bf16[] parameter(1)
+  ROOT %add = bf16[] add(%a, %b)
+}
+
+%sum.f32 (x: f32[], y: f32[]) -> f32[] {
+  %x = f32[] parameter(0)
+  %y = f32[] parameter(1)
+  ROOT %add = f32[] add(%x, %y)
+}
+
+ENTRY %entrycomp (p: bf16[]) -> (f32[], f32[]) {
+  %p = bf16[] parameter(0)
+  %constant.bf16 = bf16[] constant(1)
+
+  %all-reduce.ar.1 = bf16[]
+      all-reduce(%p),
+      replica_groups={{0}},
+      all_reduce_id=1,
+      to_apply=%sum.bf16,
+      sharding={maximal device=0}
+  %convert.1 = f32[]
+      convert(%all-reduce.ar.1),
+      sharding={maximal device=0}
+  %all-reduce.1 = f32[]
+      all-reduce(%convert.1),
+      replica_groups={{0}},
+      to_apply=%sum.f32,
+      sharding={maximal device=0}
+
+  %all-reduce.ar.2 = bf16[]
+      all-reduce(%constant.bf16),
+      replica_groups={{0}},
+      all_reduce_id=1,
+      to_apply=%sum.bf16,
+      sharding={maximal device=1}
+  %convert.2 = f32[]
+      convert(%all-reduce.ar.2),
+      sharding={maximal device=1}
+  %all-reduce.2 = f32[]
+      all-reduce(%convert.2),
+      replica_groups={{0}},
+      to_apply=%sum.f32,
+      sharding={maximal device=1}
+
+  ROOT %tuple = (f32[], f32[])
+      tuple(%all-reduce.1, %all-reduce.2),
       sharding={{maximal device=0}, {maximal device=1}}
 }
 )";
diff --git a/tensorflow/compiler/xla/service/backend.cc b/tensorflow/compiler/xla/service/backend.cc
index 5c180cbdd492031e133b81149f0f4698619b7788..d016d3e03d5e994841b81cda6214b6ff7cb550be 100644
--- a/tensorflow/compiler/xla/service/backend.cc
+++ b/tensorflow/compiler/xla/service/backend.cc
@@ -29,7 +29,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/common_runtime/eigen_thread_pool.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/platform/byte_order.h"
@@ -57,18 +56,48 @@ int BackendOptions::intra_op_parallelism_threads() const {
   return intra_op_parallelism_threads_;
 }
 
+BackendOptions& BackendOptions::set_allowed_devices(
+    const absl::optional<std::set<int>>& allowed_devices) {
+  allowed_devices_ = allowed_devices;
+  return *this;
+}
+
+const absl::optional<std::set<int>>& BackendOptions::allowed_devices() const {
+  return allowed_devices_;
+}
+
+namespace {
+
+class EigenThreadPoolWrapper : public Eigen::ThreadPoolInterface {
+ public:
+  explicit EigenThreadPoolWrapper(tensorflow::thread::ThreadPool* pool)
+      : pool_(pool) {}
+  ~EigenThreadPoolWrapper() override {}
+
+  void Schedule(std::function<void()> fn) override {
+    pool_->Schedule(std::move(fn));
+  }
+  int NumThreads() const override { return pool_->NumThreads(); }
+  int CurrentThreadId() const override { return pool_->CurrentThreadId(); }
+
+ private:
+  tensorflow::thread::ThreadPool* pool_ = nullptr;
+};
+
+}  // namespace
+
 // Define this in .cc file to avoid having to include eigen or forward declare
 // these types in the header.
-struct Backend::EigenThreadPoolWrapper {
-  explicit EigenThreadPoolWrapper(const int num_threads)
+struct Backend::IntraOpThreadPool {
+  explicit IntraOpThreadPool(const int num_threads)
       : pool(new tensorflow::thread::ThreadPool(tensorflow::Env::Default(),
                                                 "XLAEigen", num_threads)),
-        wrapper(new tensorflow::EigenThreadPoolWrapper(pool.get())),
+        wrapper(new EigenThreadPoolWrapper(pool.get())),
         device(new Eigen::ThreadPoolDevice(wrapper.get(),
                                            wrapper->NumThreads())) {}
 
   std::unique_ptr<tensorflow::thread::ThreadPool> pool;
-  std::unique_ptr<tensorflow::EigenThreadPoolWrapper> wrapper;
+  std::unique_ptr<EigenThreadPoolWrapper> wrapper;
   std::unique_ptr<Eigen::ThreadPoolDevice> device;
 };
 
@@ -76,8 +105,9 @@ struct Backend::EigenThreadPoolWrapper {
     const BackendOptions& options) {
   se::Platform* platform = options.platform();
   TF_ASSIGN_OR_RETURN(auto compiler, Compiler::GetForPlatform(platform));
-  TF_ASSIGN_OR_RETURN(auto stream_executors,
-                      PlatformUtil::GetStreamExecutors(platform));
+  TF_ASSIGN_OR_RETURN(
+      auto stream_executors,
+      PlatformUtil::GetStreamExecutors(platform, options.allowed_devices()));
   TF_ASSIGN_OR_RETURN(auto transfer_manager,
                       TransferManager::GetForPlatform(platform));
   TF_ASSIGN_OR_RETURN(auto computation_placer,
@@ -104,12 +134,10 @@ StatusOr<StreamPool::Ptr> Backend::BorrowStream(int device_ordinal) {
 
 StatusOr<StreamPool::Ptr> Backend::BorrowStream(se::StreamExecutor* executor) {
   tensorflow::mutex_lock l(mu_);
-  if (0 == stream_pools_.count(executor)) {
-    stream_pools_.emplace(std::piecewise_construct,
-                          std::forward_as_tuple(executor),
-                          std::forward_as_tuple());
+  if (!stream_pools_.contains(executor)) {
+    stream_pools_.emplace(executor, absl::make_unique<StreamPool>());
   }
-  return stream_pools_.at(executor).BorrowStream(executor);
+  return stream_pools_.at(executor)->BorrowStream(executor);
 }
 
 Backend::Backend(se::Platform* platform, Compiler* compiler,
@@ -137,8 +165,7 @@ Backend::Backend(se::Platform* platform, Compiler* compiler,
     const int num_threads = intra_op_parallelism_threads > 0
                                 ? intra_op_parallelism_threads
                                 : tensorflow::port::NumSchedulableCPUs();
-    intra_op_thread_pool_wrapper_.reset(
-        new EigenThreadPoolWrapper(num_threads));
+    intra_op_thread_pool_.reset(new IntraOpThreadPool(num_threads));
   }
 }
 
@@ -150,17 +177,17 @@ int Backend::default_device_ordinal() const {
 
 const Eigen::ThreadPoolDevice* Backend::eigen_intra_op_thread_pool_device()
     const {
-  if (intra_op_thread_pool_wrapper_ == nullptr) {
+  if (intra_op_thread_pool_ == nullptr) {
     return nullptr;
   }
-  return intra_op_thread_pool_wrapper_->device.get();
+  return intra_op_thread_pool_->device.get();
 }
 
 tensorflow::thread::ThreadPool* Backend::eigen_intra_op_thread_pool() const {
-  if (intra_op_thread_pool_wrapper_ == nullptr) {
+  if (intra_op_thread_pool_ == nullptr) {
     return nullptr;
   }
-  return intra_op_thread_pool_wrapper_->pool.get();
+  return intra_op_thread_pool_->pool.get();
 }
 
 StatusOr<se::StreamExecutor*> Backend::stream_executor(
diff --git a/tensorflow/compiler/xla/service/backend.h b/tensorflow/compiler/xla/service/backend.h
index a2dafbe803f8bd5f23e4e9f3f6d3e6f744c9fab9..e7f29a044b95015aa7e547373c24971646833280 100644
--- a/tensorflow/compiler/xla/service/backend.h
+++ b/tensorflow/compiler/xla/service/backend.h
@@ -18,9 +18,11 @@ limitations under the License.
 
 #include <map>
 #include <memory>
+#include <set>
 #include <string>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/strings/str_cat.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
@@ -53,9 +55,16 @@ class BackendOptions {
   BackendOptions& set_intra_op_parallelism_threads(int num_threads);
   int intra_op_parallelism_threads() const;
 
+  // Sets the allowed_devices for selectively constructing stream executors
+  // on the platform.
+  BackendOptions& set_allowed_devices(
+      const absl::optional<std::set<int>>& allowed_devices);
+  const absl::optional<std::set<int>>& allowed_devices() const;
+
  private:
   se::Platform* platform_ = nullptr;
   int intra_op_parallelism_threads_ = -1;
+  absl::optional<std::set<int>> allowed_devices_;
 };
 
 // Class which encapsulates an XLA backend. It includes everything necessary
@@ -147,7 +156,6 @@ class Backend {
   Status ResetDevices();
 
  private:
-  struct EigenThreadPoolWrapper;
   Backend(se::Platform* platform, Compiler* compiler,
           absl::Span<se::StreamExecutor* const> stream_executors,
           TransferManager* transfer_manager,
@@ -167,13 +175,15 @@ class Backend {
   tensorflow::mutex mu_;
 
   // Mapping from stream executor to stream pools, used by `BorrowStream` above.
-  std::map<se::StreamExecutor*, StreamPool> stream_pools_ GUARDED_BY(mu_);
+  absl::flat_hash_map<se::StreamExecutor*, std::unique_ptr<StreamPool>>
+      stream_pools_ GUARDED_BY(mu_);
 
   // The default memory allocator to use.
   std::unique_ptr<StreamExecutorMemoryAllocator> memory_allocator_;
 
   // For the CPU backend, an Eigen threadpool device for use by Eigen code.
-  std::unique_ptr<EigenThreadPoolWrapper> intra_op_thread_pool_wrapper_;
+  struct IntraOpThreadPool;
+  std::unique_ptr<IntraOpThreadPool> intra_op_thread_pool_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/batch_dot_simplification.cc b/tensorflow/compiler/xla/service/batch_dot_simplification.cc
index eda026ac5685dc469a6230094eb28b3618e36400..dbabd82dd55465dd4c85a56aea849a3e3702d6bf 100644
--- a/tensorflow/compiler/xla/service/batch_dot_simplification.cc
+++ b/tensorflow/compiler/xla/service/batch_dot_simplification.cc
@@ -28,6 +28,13 @@ BatchDotSimplification::ElideDegenerateBatchDimensionFromBatchDot(
                  *rhs = batch_dot->mutable_operand(1);
   const Shape& lhs_shape = lhs->shape();
 
+  // A dot with no contracting dims will be rewritten into a multiply by
+  // AlgebraicSimplifier. Dots with multiple contracting dims are currently
+  // unsupported.
+  if (dim_numbers.lhs_contracting_dimensions_size() != 1) {
+    return false;
+  }
+
   std::vector<int64> degenerate_dims;
   for (int64 batch_dim : dim_numbers.lhs_batch_dimensions()) {
     if (lhs_shape.dimensions(batch_dim) == 1) {
diff --git a/tensorflow/compiler/xla/service/batch_dot_simplification_test.cc b/tensorflow/compiler/xla/service/batch_dot_simplification_test.cc
index 52ec1a794c5e9f4452a4bf2b648f453d8acfe976..a81f394a38f091b89b7f1e4d26653ff549f35b75 100644
--- a/tensorflow/compiler/xla/service/batch_dot_simplification_test.cc
+++ b/tensorflow/compiler/xla/service/batch_dot_simplification_test.cc
@@ -169,5 +169,47 @@ main {
                   /*lhs_contracting_dim=*/3, /*rhs_contracting_dim=*/2)));
 }
 
+TEST_F(BatchDotSimplificationTest,
+       ElideMultipleDegenerateBatchDotDimsNonContracting) {
+  const char* hlo_text = R"(
+HloModule BatchDot
+
+main {
+  a = f32[1,101] parameter(0)
+  b = f32[1,101] parameter(1)
+  ROOT dot = f32[1,101,101] dot(a,b), lhs_batch_dims={0},
+                                      lhs_contracting_dims={},
+                                      rhs_batch_dims={0},
+                                      rhs_contracting_dims={}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(hlo_text));
+  BatchDotSimplification pass;
+  ASSERT_FALSE(pass.Run(m.get()).ValueOrDie());
+}
+
+TEST_F(BatchDotSimplificationTest,
+       ElideMultipleDegenerateBatchDotDimsMultipleContracting) {
+  const char* hlo_text = R"(
+HloModule BatchDot
+
+main {
+  lhs = f32[1,5,17,10,13] parameter(0)
+  rhs = f32[1,9,10,13,6,5] parameter(1)
+  ROOT dot = f32[10,1,17,9,6] dot(lhs,rhs), lhs_batch_dims={3,0},
+                                            rhs_batch_dims={2,0},
+                                            lhs_contracting_dims={1,4},
+                                            rhs_contracting_dims={5,3}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                          ParseAndReturnVerifiedModule(hlo_text));
+  BatchDotSimplification pass;
+  ASSERT_FALSE(pass.Run(m.get()).ValueOrDie());
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/batchnorm_expander.cc b/tensorflow/compiler/xla/service/batchnorm_expander.cc
index 0e6ca1871b379a2f55b92207133822fc6258b007..620876c264ad446542e3ad8229593c1f56c94604 100644
--- a/tensorflow/compiler/xla/service/batchnorm_expander.cc
+++ b/tensorflow/compiler/xla/service/batchnorm_expander.cc
@@ -95,15 +95,8 @@ class BatchNormExpanderVisitor : public DfsHloVisitorWithDefault {
       HloInstruction* operand,
       const std::function<HloInstruction*(std::unique_ptr<HloInstruction>)>&
           add_instruction) {
-    HloInstruction* exponent = add_instruction(HloInstruction::CreateBroadcast(
-        operand->shape(),
-        add_instruction(HloInstruction::CreateConvert(
-            ShapeUtil::MakeShape(operand->shape().element_type(), {}),
-            add_instruction(HloInstruction::CreateConstant(
-                LiteralUtil::CreateR0<float>(-0.5f))))),
-        {}));
-    return HloInstruction::CreateBinary(operand->shape(), HloOpcode::kPower,
-                                        operand, exponent);
+    return HloInstruction::CreateUnary(operand->shape(), HloOpcode::kRsqrt,
+                                       operand);
   }
 
   std::unique_ptr<HloInstruction> Mean(
@@ -123,7 +116,7 @@ class BatchNormExpanderVisitor : public DfsHloVisitorWithDefault {
     auto elements_per_feature_u32 = add_instruction(
         HloInstruction::CreateConstant(LiteralUtil::CreateR0<uint32>(1)));
 
-    for (int64 i = 0; i < ShapeUtil::Rank(operand->shape()); ++i) {
+    for (int64 i = 0; i < operand->shape().rank(); ++i) {
       if (i == feature_index) {
         continue;
       }
@@ -229,7 +222,7 @@ Status BatchNormExpanderVisitor::HandleBatchNormTraining(
       add(HloInstruction::CreateConstant(std::move(epsilon_literal))), {}));
   std::vector<int64> dimensions_without_feature;
 
-  for (int64 i = 0; i < ShapeUtil::Rank(operand_shape); ++i) {
+  for (int64 i = 0; i < operand_shape.rank(); ++i) {
     if (i != feature_index) {
       dimensions_without_feature.push_back(i);
     }
@@ -357,7 +350,7 @@ Status BatchNormExpanderVisitor::HandleBatchNormInference(
 
   std::vector<int64> dimensions_without_feature;
 
-  for (int64 i = 0; i < ShapeUtil::Rank(operand_shape); ++i) {
+  for (int64 i = 0; i < operand_shape.rank(); ++i) {
     if (i != feature_index) {
       dimensions_without_feature.push_back(i);
     }
@@ -494,7 +487,7 @@ Status BatchNormExpanderVisitor::HandleBatchNormGrad(
 
   std::vector<int64> dimensions_without_feature;
 
-  for (int64 i = 0; i < ShapeUtil::Rank(activation_shape); ++i) {
+  for (int64 i = 0; i < activation_shape.rank(); ++i) {
     if (i != feature_index) {
       dimensions_without_feature.push_back(i);
     }
diff --git a/tensorflow/compiler/xla/service/bfloat16_conversion_folding.cc b/tensorflow/compiler/xla/service/bfloat16_conversion_folding.cc
index e9d30fc03c1c3194de577e6683b36a95641694d9..e62d72b323bd1d113e9d87bf8602bfb434c40d61 100644
--- a/tensorflow/compiler/xla/service/bfloat16_conversion_folding.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_conversion_folding.cc
@@ -34,8 +34,8 @@ class BFloat16ConversionFoldingVisitor : public DfsHloVisitorWithDefault {
 
   Status DefaultAction(HloInstruction* hlo) override;
 
-  // Special handling for cross-replica-sum which can have a tuple output.
-  Status HandleCrossReplicaSum(HloInstruction* crs) override;
+  // Special handling for all-reduce which can have a tuple output.
+  Status HandleAllReduce(HloInstruction* crs) override;
 
   static bool Run(HloComputation* computation,
                   const BFloat16Support* bfloat16_support) {
@@ -176,8 +176,7 @@ Status BFloat16ConversionFoldingVisitor::DefaultAction(HloInstruction* hlo) {
   return TryFoldBF16Conversions(hlo);
 }
 
-Status BFloat16ConversionFoldingVisitor::HandleCrossReplicaSum(
-    HloInstruction* crs) {
+Status BFloat16ConversionFoldingVisitor::HandleAllReduce(HloInstruction* crs) {
   if (crs->IsCrossModuleAllReduce()) {
     // Cross-module all-reduce has side effect.
     return Status::OK();
@@ -191,7 +190,7 @@ Status BFloat16ConversionFoldingVisitor::HandleCrossReplicaSum(
   }
 
   // If the output is not a tuple, we don't need special handling.
-  if (!ShapeUtil::IsTuple(crs->shape())) {
+  if (!crs->shape().IsTuple()) {
     return Status::OK();
   }
 
diff --git a/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc b/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc
index 4ce351acc2c359773e618da70360c96faf5ca379..2232a2cbdfe0cf64dc4fb10d4598c0ad8b51ee5e 100644
--- a/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc
@@ -38,7 +38,7 @@ class TestBFloat16Support : public BFloat16Support {
         hlo.opcode() == HloOpcode::kSubtract ||
         hlo.opcode() == HloOpcode::kTuple ||
         hlo.opcode() == HloOpcode::kGetTupleElement ||
-        hlo.opcode() == HloOpcode::kCrossReplicaSum) {
+        hlo.opcode() == HloOpcode::kAllReduce) {
       return true;
     }
     return false;
@@ -49,7 +49,7 @@ class TestBFloat16Support : public BFloat16Support {
         hlo.opcode() == HloOpcode::kSubtract ||
         hlo.opcode() == HloOpcode::kTuple ||
         hlo.opcode() == HloOpcode::kGetTupleElement ||
-        hlo.opcode() == HloOpcode::kCrossReplicaSum) {
+        hlo.opcode() == HloOpcode::kAllReduce) {
       return true;
     }
     return false;
@@ -58,7 +58,7 @@ class TestBFloat16Support : public BFloat16Support {
   bool SupportsMixedPrecisions(const HloInstruction& hlo) const override {
     if (hlo.opcode() == HloOpcode::kAdd || hlo.opcode() == HloOpcode::kTuple ||
         hlo.opcode() == HloOpcode::kGetTupleElement ||
-        hlo.opcode() == HloOpcode::kCrossReplicaSum) {
+        hlo.opcode() == HloOpcode::kAllReduce) {
       return true;
     }
     return false;
@@ -213,7 +213,7 @@ TEST_F(BFloat16ConversionFoldingTest, DoNotFoldTuple) {
   EXPECT_EQ(tuple->operand(1), convert0);
 }
 
-TEST_F(BFloat16ConversionFoldingTest, FoldCrossReplicaSumTupleOutput) {
+TEST_F(BFloat16ConversionFoldingTest, FoldAllReduceTupleOutput) {
   auto builder = HloComputation::Builder(TestName());
 
   auto module = CreateNewVerifiedModule();
@@ -236,11 +236,10 @@ TEST_F(BFloat16ConversionFoldingTest, FoldCrossReplicaSumTupleOutput) {
   HloInstruction* b = builder.AddInstruction(
       HloInstruction::CreateParameter(1, f32_shape, "b"));
 
-  HloInstruction* crs =
-      builder.AddInstruction(HloInstruction::CreateCrossReplicaSum(
-          ShapeUtil::MakeTupleShape({f32_shape, f32_shape}), {convert_a, b},
-          sum, /*replica_groups=*/{}, /*barrier=*/"",
-          /*all_reduce_id=*/absl::nullopt));
+  HloInstruction* crs = builder.AddInstruction(HloInstruction::CreateAllReduce(
+      ShapeUtil::MakeTupleShape({f32_shape, f32_shape}), {convert_a, b}, sum,
+      /*replica_groups=*/{}, /*barrier=*/"",
+      /*all_reduce_id=*/absl::nullopt));
   HloInstruction* gte_a = builder.AddInstruction(
       HloInstruction::CreateGetTupleElement(f32_shape, crs, 0));
   HloInstruction* gte_b = builder.AddInstruction(
diff --git a/tensorflow/compiler/xla/service/bfloat16_normalization.cc b/tensorflow/compiler/xla/service/bfloat16_normalization.cc
index b8a8f844eff17a95d4073f53495e0027c481f558..d1b14d604f0559b6b18f7d1fba127669c241c8a3 100644
--- a/tensorflow/compiler/xla/service/bfloat16_normalization.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_normalization.cc
@@ -362,8 +362,8 @@ Status BFloat16NormalizationVisitor::DefaultAction(HloInstruction* hlo) {
   }
   // TODO(b/112040122): Correctly normalize variadic reduce.
   if ((hlo->opcode() == HloOpcode::kSort ||
-       hlo->opcode() == HloOpcode::kCrossReplicaSum) &&
-      ShapeUtil::IsTuple(hlo->shape())) {
+       hlo->opcode() == HloOpcode::kAllReduce) &&
+      hlo->shape().IsTuple()) {
     return HandleMultipleOutputs(hlo);
   }
   return HandleInstruction(hlo);
diff --git a/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc b/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc
index 9f97d18c565c7915b9f9346f0c6330cdc3c707e9..2caa979745b3b40817acb1b6951e1de5ffa294a4 100644
--- a/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/bfloat16_normalization.h"
 #include "tensorflow/compiler/xla/service/bfloat16_support.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_creation_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -232,7 +233,7 @@ TEST_F(BFloat16NormalizationTest, ResolveUnsupportedMixedPrecisionReduce) {
   EXPECT_EQ(reduce->operand(1)->shape().element_type(), F32);
 }
 
-TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleCrossReplicaSum) {
+TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleAllReduce) {
   auto module = CreateNewVerifiedModule();
   HloComputation::Builder sum_builder("sum");
   auto x = sum_builder.AddInstruction(HloInstruction::CreateParameter(
@@ -253,11 +254,10 @@ TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleCrossReplicaSum) {
   HloInstruction* b = builder.AddInstruction(
       HloInstruction::CreateParameter(1, bf16_shape, "b"));
 
-  HloInstruction* crs =
-      builder.AddInstruction(HloInstruction::CreateCrossReplicaSum(
-          ShapeUtil::MakeTupleShape({f32_shape, bf16_shape}), {a, b}, reduction,
-          /*replica_groups=*/{}, /*barrier=*/"",
-          /*all_reduce_id=*/absl::nullopt));
+  HloInstruction* crs = builder.AddInstruction(HloInstruction::CreateAllReduce(
+      ShapeUtil::MakeTupleShape({f32_shape, bf16_shape}), {a, b}, reduction,
+      /*replica_groups=*/{}, /*barrier=*/"",
+      /*all_reduce_id=*/absl::nullopt));
   HloInstruction* gte = builder.AddInstruction(
       HloInstruction::CreateGetTupleElement(bf16_shape, crs, 1));
 
@@ -283,8 +283,11 @@ TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleSort) {
   HloInstruction* value = builder.AddInstruction(
       HloInstruction::CreateParameter(1, s32_shape, "value"));
 
-  HloInstruction* sort = builder.AddInstruction(HloInstruction::CreateSort(
-      ShapeUtil::MakeTupleShape({bf16_shape, s32_shape}), 0, key, {value}));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto* sort,
+      MakeSortHlo(ShapeUtil::MakeTupleShape({bf16_shape, s32_shape}),
+                  {key, value}, 0, /*is_stable=*/false, &builder,
+                  module.get()));
   HloInstruction* gte = builder.AddInstruction(
       HloInstruction::CreateGetTupleElement(bf16_shape, sort, 0));
 
@@ -309,8 +312,11 @@ TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleSortRoot) {
   HloInstruction* value = builder.AddInstruction(
       HloInstruction::CreateParameter(1, bf16_shape, "value"));
 
-  HloInstruction* sort = builder.AddInstruction(HloInstruction::CreateSort(
-      ShapeUtil::MakeTupleShape({bf16_shape, bf16_shape}), 0, key, {value}));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto* sort,
+      MakeSortHlo(ShapeUtil::MakeTupleShape({bf16_shape, f32_shape}),
+                  {key, value}, 0, /*is_stable=*/false, &builder,
+                  module.get()));
 
   auto computation = module->AddEntryComputation(builder.Build());
 
diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation.cc b/tensorflow/compiler/xla/service/bfloat16_propagation.cc
index 63d4572f2028c462df1cac9d5e4ee616e407f37b..bab63f66d83b712d756078bef84926eed235f6b5 100644
--- a/tensorflow/compiler/xla/service/bfloat16_propagation.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_propagation.cc
@@ -276,8 +276,8 @@ bool BFloat16Propagation::AllUsersConsumeBF16(const HloInstruction& hlo,
       if (bfloat16_support_->EffectiveOperandPrecisionIsOutputPrecision(
               *use.instruction, use.operand_number)) {
         if (use.instruction->opcode() == HloOpcode::kTuple ||
-            (use.instruction->opcode() == HloOpcode::kCrossReplicaSum &&
-             ShapeUtil::IsTuple(use.instruction->shape()))) {
+            (use.instruction->opcode() == HloOpcode::kAllReduce &&
+             use.instruction->shape().IsTuple())) {
           ShapeIndex use_output_index{use.operand_number};
           for (int64 i : use.operand_index) {
             use_output_index.push_back(i);
diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc
index 5be7141aae423adb4fe2f39262e463ff25ae8234..a9b5d9916e400b39039248098c22a715e44ccfd2 100644
--- a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc
@@ -209,7 +209,7 @@ TEST_F(BFloat16PropagationTest, DoNotChangeAllReduce) {
       rb.AddInstruction(HloInstruction::CreateParameter(1, shape, "p1"))));
   auto reduction = module->AddEmbeddedComputation(rb.Build());
   HloInstruction* all_reduce =
-      builder.AddInstruction(HloInstruction::CreateCrossReplicaSum(
+      builder.AddInstruction(HloInstruction::CreateAllReduce(
           ShapeUtil::MakeTupleShape({shape, shape}), {a, b}, reduction,
           /*replica_groups=*/{}, /*barrier=*/"", /*all_reduce_id=*/1));
   HloInstruction* gte0 = builder.AddInstruction(
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc
index 8d7c62447852fd946440c41389300a92377c471f..cbebbdc8a2d7d0b65f12accbe424bea383ff5355 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment.cc
@@ -86,10 +86,9 @@ std::vector<int64> ColorInterferenceGraph(
   // first, but it would be good to investigate other ordering heuristics too.
   std::vector<int64> nodes(node_count);
   std::iota(nodes.begin(), nodes.end(), 0);
-  std::sort(nodes.begin(), nodes.end(),
-            [&interference_map](const int64 i, const int64 j) {
-              return interference_map[i].size() > interference_map[j].size();
-            });
+  absl::c_sort(nodes, [&interference_map](const int64 i, const int64 j) {
+    return interference_map[i].size() > interference_map[j].size();
+  });
 
   const int64 kColorUnassigned = -1;
   std::vector<int64> assigned_colors(node_count, kColorUnassigned);
@@ -138,8 +137,8 @@ Status GatherComputationsByAllocationType(
     worklist.pop_front();
     const HloComputation* computation = worklist_front.first;
     bool is_thread_local = worklist_front.second;
-    bool in_thread_local_set = thread_local_set.count(computation) > 0;
-    bool in_global_set = global_set.count(computation) > 0;
+    bool in_thread_local_set = thread_local_set.contains(computation);
+    bool in_global_set = global_set.contains(computation);
 
     // If the computation has already been added to the respective set, then
     // nothing to do.
@@ -186,12 +185,13 @@ Status GatherComputationsByAllocationType(
             worklist.push_back(std::make_pair(subcomputation,
                                               false));  // Not thread local.
             break;
-          case HloOpcode::kCrossReplicaSum:
+          case HloOpcode::kAllReduce:
           case HloOpcode::kMap:
           case HloOpcode::kReduce:
           case HloOpcode::kReduceWindow:
           case HloOpcode::kScatter:
           case HloOpcode::kSelectAndScatter:
+          case HloOpcode::kSort:
           case HloOpcode::kFusion:
             // Map/reduce etc computations are always thread-local.
             worklist.push_back(std::make_pair(subcomputation,
@@ -207,9 +207,9 @@ Status GatherComputationsByAllocationType(
 
   // Add the computations to the vectors in post order.
   for (auto* computation : module->MakeComputationPostOrder()) {
-    if (thread_local_set.count(computation) > 0) {
+    if (thread_local_set.contains(computation)) {
       thread_local_computations->push_back(computation);
-    } else if (global_set.count(computation) > 0) {
+    } else if (global_set.contains(computation)) {
       global_computations->push_back(computation);
     }
     // If the computation is not reachable from the entry computation, then it
@@ -219,13 +219,6 @@ Status GatherComputationsByAllocationType(
   return Status::OK();
 }
 
-size_t BufferAllocation::Slice::Hasher::operator()(Slice s) const {
-  uint64 h = std::hash<int64>()(s.index());
-  h = tensorflow::Hash64Combine(h, std::hash<int64>()(s.offset()));
-  h = tensorflow::Hash64Combine(h, std::hash<int64>()(s.size()));
-  return h;
-}
-
 string BufferAllocation::Slice::ToString() const {
   return absl::StrCat("{index:", index(), ", offset:", offset_,
                       ", size:", size_, "}");
@@ -240,7 +233,7 @@ BufferAllocation::Slice BufferAllocation::GetSlice(
 void BufferAllocation::AddAssignment(const LogicalBuffer& buffer, int64 offset,
                                      int64 size) {
   VLOG(4) << "Trying to add " << buffer << " to allocation #" << index();
-  CHECK(assigned_buffers_.count(&buffer) == 0)
+  CHECK(!assigned_buffers_.contains(&buffer))
       << "LogicalBuffer " << buffer << " already assigned to allocation "
       << index_;
   CHECK_LE(offset, size_) << "LogicalBuffer " << buffer
@@ -279,11 +272,12 @@ BufferAllocationProto BufferAllocation::ToProto() const {
     proto_assigned->set_offset(buffer_offset_size.second.offset);
     proto_assigned->set_size(buffer_offset_size.second.size);
   }
-  std::sort(proto.mutable_assigned()->begin(), proto.mutable_assigned()->end(),
-            [](const BufferAllocationProto::Assigned& assign1,
-               const BufferAllocationProto::Assigned& assign2) {
-              return assign1.logical_buffer_id() < assign2.logical_buffer_id();
-            });
+  absl::c_sort(*proto.mutable_assigned(),
+               [](const BufferAllocationProto::Assigned& assign1,
+                  const BufferAllocationProto::Assigned& assign2) {
+                 return assign1.logical_buffer_id() <
+                        assign2.logical_buffer_id();
+               });
   return proto;
 }
 
@@ -315,10 +309,10 @@ string BufferAllocation::ToString() const {
   for (const auto& buffer_offset_size : assigned_buffers_) {
     sorted_buffers.push_back(buffer_offset_size.first);
   }
-  std::sort(sorted_buffers.begin(), sorted_buffers.end(),
-            [](const LogicalBuffer* a, const LogicalBuffer* b) {
-              return a->id() < b->id();
-            });
+  absl::c_sort(sorted_buffers,
+               [](const LogicalBuffer* a, const LogicalBuffer* b) {
+                 return a->id() < b->id();
+               });
   for (const LogicalBuffer* buffer : sorted_buffers) {
     const OffsetSize& offset_size = FindOrDie(assigned_buffers_, buffer);
     StrAppend(&output, absl::StrFormat(
@@ -346,7 +340,7 @@ const PointsToSet& BufferAssignment::GetPointsToSet(
 
 bool BufferAssignment::HasAllocation(const LogicalBuffer& buffer) const {
   TF_CHECK_OK(points_to_analysis().VerifyBuffer(buffer));
-  return allocation_index_for_buffer_.count(&buffer) > 0;
+  return allocation_index_for_buffer_.contains(&buffer);
 }
 
 const BufferAllocation& BufferAssignment::GetAssignedAllocation(
@@ -401,7 +395,7 @@ bool BufferAssignment::HasAllocationAt(const HloInstruction* instruction,
                                        const ShapeIndex& index) const {
   for (const LogicalBuffer* buffer :
        GetPointsToSet(instruction).element(index)) {
-    if (allocation_index_for_buffer_.count(buffer) > 0) {
+    if (allocation_index_for_buffer_.contains(buffer)) {
       return true;
     }
   }
@@ -459,8 +453,7 @@ bool BufferAssignment::SharesSliceAtIndex(
 
 bool BufferAssignment::HaveDisjointSlices(const HloInstruction* hlo_a,
                                           const HloInstruction* hlo_b) const {
-  using SliceSet =
-      flat_hash_set<BufferAllocation::Slice, BufferAllocation::Slice::Hasher>;
+  using SliceSet = flat_hash_set<BufferAllocation::Slice>;
   // Gets the slices all of instr's subshapes.  If any subshape doesn't have an
   // assigned slice, returns the empty set.
   auto collect_slices = [&](const HloInstruction* instr) -> SliceSet {
@@ -487,10 +480,9 @@ bool BufferAssignment::HaveDisjointSlices(const HloInstruction* hlo_a,
   // didn't return the empty set) for both HLOs, and the two resulting sets of
   // slices are disjoint.
   return !slices_a.empty() && !slices_b.empty() &&
-         std::none_of(slices_a.begin(), slices_a.end(),
-                      [&](const BufferAllocation::Slice& slice) {
-                        return slices_b.count(slice) > 0;
-                      });
+         absl::c_none_of(slices_a, [&](const BufferAllocation::Slice& slice) {
+           return slices_b.contains(slice);
+         });
 }
 
 StatusOr<BufferAllocation::Slice>
@@ -519,7 +511,7 @@ BufferAllocation* BufferAssignment::NewAllocation(const LogicalBuffer& buffer,
 void BufferAssignment::AddAssignment(BufferAllocation* allocation,
                                      const LogicalBuffer& buffer, int64 offset,
                                      int64 size) {
-  CHECK_EQ(0, allocation_index_for_buffer_.count(&buffer))
+  CHECK(!allocation_index_for_buffer_.contains(&buffer))
       << "LogicalBuffer " << buffer << " already has an allocation.";
   CHECK(allocation->is_reusable() || allocation->assigned_buffers().empty())
       << "Non-reusable allocation already assigned a buffer: "
@@ -761,7 +753,8 @@ namespace {
 bool MayInterfereAcrossSubcomputations(BufferAssignment* assignment,
                                        const LogicalBuffer& a_buffer,
                                        const LogicalBuffer& b_buffer) {
-  auto call_graph = assignment->liveness().hlo_ordering().call_graph();
+  const CallGraph& call_graph =
+      assignment->liveness().hlo_ordering().call_graph();
   const HloInstruction* a_ancestor;
   const HloInstruction* b_ancestor;
   std::tie(a_ancestor, b_ancestor) =
@@ -960,35 +953,35 @@ Status BufferAssigner::AssignBuffersForComputation(
   // operands (assuming operands are the same/larger size) enabling the
   // important reuse case where an elementwise instruction reuses one of its
   // operand's buffer. This improves locality.
-  std::sort(sorted_buffers.begin(), sorted_buffers.end(),
-            [has_sequential_order, &liveness, &post_order_position, assignment](
-                const LogicalBuffer* a, const LogicalBuffer* b) {
-              // Primary sort is by decreasing buffer size.
-              const int64 a_size = assignment->buffer_size_(*a);
-              const int64 b_size = assignment->buffer_size_(*b);
-              if (a_size != b_size) {
-                return a_size > b_size;  // use ">" for decreasing size.
-              }
-              // Otherwise live out buffers come before others, if the
-              // instructions are sequentially ordered.
-              if (has_sequential_order) {
-                const bool a_live_out = liveness.MaybeLiveOut(*a);
-                const bool b_live_out = liveness.MaybeLiveOut(*b);
-                if (a_live_out != b_live_out) {
-                  return a_live_out;
-                }
-              }
-              // Final tiebreaker is in instruction post order.
-              return post_order_position.at(a->instruction()) <
-                     post_order_position.at(b->instruction());
-            });
+  absl::c_sort(sorted_buffers,
+               [has_sequential_order, &liveness, &post_order_position,
+                assignment](const LogicalBuffer* a, const LogicalBuffer* b) {
+                 // Primary sort is by decreasing buffer size.
+                 const int64 a_size = assignment->buffer_size_(*a);
+                 const int64 b_size = assignment->buffer_size_(*b);
+                 if (a_size != b_size) {
+                   return a_size > b_size;  // use ">" for decreasing size.
+                 }
+                 // Otherwise live out buffers come before others, if the
+                 // instructions are sequentially ordered.
+                 if (has_sequential_order) {
+                   const bool a_live_out = liveness.MaybeLiveOut(*a);
+                   const bool b_live_out = liveness.MaybeLiveOut(*b);
+                   if (a_live_out != b_live_out) {
+                     return a_live_out;
+                   }
+                 }
+                 // Final tiebreaker is in instruction post order.
+                 return post_order_position.at(a->instruction()) <
+                        post_order_position.at(b->instruction());
+               });
 
   // BufferAllocations are necessarily created in decreasing size order. Keep
   // indices of previously created BufferAllocations in allocation_indices.
   std::vector<BufferAllocation::Index> allocation_indices;
   for (const LogicalBuffer* buffer : sorted_buffers) {
     VLOG(3) << "Assigning allocation to: " << *buffer;
-    if (colocated_buffers.count(buffer) > 0) {
+    if (colocated_buffers.contains(buffer)) {
       // Colocated buffers are currently assigned in an earlier pass.
       VLOG(3) << "Skipping colocated buffer: " << *buffer;
       continue;
@@ -1020,10 +1013,14 @@ Status BufferAssigner::AssignBuffersForComputation(
       // callers.
       BufferAllocation* allocation =
           assignment->NewAllocation(*buffer, buffer_size);
+      bool parameter_has_alias =
+          assignment->module().input_output_alias_config().ParameterHasAlias(
+              instruction->parameter_number(), buffer->index());
       allocation->set_entry_computation_parameter(
-          instruction->parameter_number(), buffer->index());
-      VLOG(3) << "New allocation #" << allocation->index()
-              << " for entry computation parameter: " << *buffer;
+          instruction->parameter_number(), buffer->index(),
+          parameter_has_alias);
+      VLOG(3) << "Mark allocation #" << allocation->index()
+              << " as entry computation parameter: " << *buffer;
       continue;
     }
 
@@ -1036,7 +1033,7 @@ Status BufferAssigner::AssignBuffersForComputation(
       continue;
     }
 
-    if (ShapeUtil::IsTuple(buffer->shape())) {
+    if (buffer->shape().IsTuple()) {
       BufferAllocation* allocation =
           assignment->NewAllocation(*buffer, buffer_size);
       allocation->set_is_tuple(true);
@@ -1056,7 +1053,7 @@ Status BufferAssigner::AssignBuffersForComputation(
              assignment->GetAllSlices(operand, /*index=*/{})) {
           BufferAllocation* allocation =
               assignment->GetMutableAllocation(operand_slice.index());
-          if (colocated_allocations.count(allocation->index()) == 0) {
+          if (!colocated_allocations.contains(allocation->index())) {
             // TODO(b/32491382) Colocated buffers are currently assigned in an
             // earlier pass, and so can break the "increasing allocation size"
             // invariant in this function (causing this CHECK to fail). However,
@@ -1087,7 +1084,7 @@ Status BufferAssigner::AssignBuffersForComputation(
         // Instructions are iterated in increasing buffer size, so any
         // previously create allocation must be large enough to hold this
         // instruction's output (with the exception of colocated buffers).
-        if (colocated_allocations.count(allocation->index()) == 0) {
+        if (!colocated_allocations.contains(allocation->index())) {
           // TODO(b/32491382) Colocated buffers are currently assigned in an
           // earlier pass, and so can break the "increasing allocation size"
           // invariant in this function (causing this CHECK to fail). However,
@@ -1313,10 +1310,10 @@ std::vector<const LogicalBuffer*> ComputePeakMemoryLogicalBuffers(
                              live_buffers.end());
 
   // Stabily sort the live buffers.
-  std::sort(live_buffers_vector.begin(), live_buffers_vector.end(),
-            [](const LogicalBuffer* a, const LogicalBuffer* b) {
-              return a->id() < b->id();
-            });
+  absl::c_sort(live_buffers_vector,
+               [](const LogicalBuffer* a, const LogicalBuffer* b) {
+                 return a->id() < b->id();
+               });
   return live_buffers_vector;
 }
 
@@ -1376,7 +1373,7 @@ void BufferAssigner::AddSetToColocatedBufferSets(
   std::vector<size_t> overlap_set_indices;
   for (size_t index = 0; index < colocated_buffer_sets->size(); ++index) {
     for (const LogicalBuffer* buffer : colocated_set) {
-      if ((*colocated_buffer_sets)[index].count(buffer) > 0) {
+      if ((*colocated_buffer_sets)[index].contains(buffer)) {
         VLOG(5) << "Found overlap with existing set on buffer "
                 << buffer->ToString() << "\n"
                 << ColocatedBufferSetsToString((*colocated_buffer_sets)[index],
@@ -1425,12 +1422,14 @@ BufferAssigner::MergeColocatedBufferSets(
           << colocated_buffer_sets.size();
 
   // Returns true if the given buffer is for the entry parameter.
-  auto is_entry_parameter = [](const LogicalBuffer& buffer) {
+  auto is_readonly_entry_parameter = [](const LogicalBuffer& buffer) {
     auto* instruction = buffer.instruction();
     auto* computation = instruction->parent();
     auto* module = computation->parent();
     return instruction->opcode() == HloOpcode::kParameter &&
-           computation == module->entry_computation();
+           computation == module->entry_computation() &&
+           !module->input_output_alias_config().ParameterHasAlias(
+               instruction->parameter_number(), buffer.index());
   };
 
   std::vector<bool> set_can_be_merged(colocated_buffer_sets.size(), true);
@@ -1452,7 +1451,7 @@ BufferAssigner::MergeColocatedBufferSets(
   for (int64 i = 0; i < colocated_buffer_sets.size(); ++i) {
     for (auto& buffer : colocated_buffer_sets[i]) {
       if (buffer_liveness.MaybeLiveOut(*buffer) ||
-          is_entry_parameter(*buffer) ||
+          is_readonly_entry_parameter(*buffer) ||
           buffer->instruction()->opcode() == HloOpcode::kConstant) {
         set_can_be_merged[i] = false;
         break;
@@ -1539,15 +1538,16 @@ void BufferAssigner::BuildColocatedBufferSets(
   VLOG(4) << "Input/Output Alias Config: ";
   VLOG(4) << module->input_output_alias_config();
   module->input_output_alias_config().ForEachAlias(
-      [&](const ShapeIndex& output_index, int64 param_number,
-          const ShapeIndex& param_index) {
+      [&](const ShapeIndex& output_index,
+          const HloInputOutputAliasConfig::Alias& alias) {
         std::vector<const LogicalBuffer*> colocated_set;
         AddBufferToColocatedSet(module->entry_computation()->root_instruction(),
                                 output_index, points_to_analysis,
                                 &colocated_set);
         AddBufferToColocatedSet(
-            module->entry_computation()->parameter_instruction(param_number),
-            param_index, points_to_analysis, &colocated_set);
+            module->entry_computation()->parameter_instruction(
+                alias.parameter_number),
+            alias.parameter_index, points_to_analysis, &colocated_set);
         AddSetToColocatedBufferSets(colocated_set, colocated_buffer_sets);
       });
 
@@ -1741,10 +1741,6 @@ void BufferAssigner::AssignColocatedBufferSets(
         // module-level scope, we can allow buffers to be shared across
         // computations (in some cases).
         allocation = assignment->NewAllocation(*buffer, buffer_size);
-        if (entry_parameter_number >= 0) {
-          allocation->set_entry_computation_parameter(
-              entry_parameter_number, *entry_parameter_shape_idx);
-        }
         if (is_constant) {
           allocation->set_constant(true);
         }
@@ -1758,6 +1754,16 @@ void BufferAssigner::AssignColocatedBufferSets(
       }
       colocated_buffers->insert(buffer);
     }
+
+    // If an allocation contains a parameter, set corresponding fields.
+    if (entry_parameter_number >= 0) {
+      bool parameter_has_alias =
+          assignment->module().input_output_alias_config().ParameterHasAlias(
+              entry_parameter_number, *entry_parameter_shape_idx);
+      allocation->set_entry_computation_parameter(entry_parameter_number,
+                                                  *entry_parameter_shape_idx,
+                                                  parameter_has_alias);
+    }
   }
 }
 
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.h b/tensorflow/compiler/xla/service/buffer_assignment.h
index 0a9fdede803e84ca42472259084615c031b206eb..448dec3b1aa0c0f85e1060a70e965fcf3952c320 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.h
+++ b/tensorflow/compiler/xla/service/buffer_assignment.h
@@ -96,7 +96,11 @@ class BufferAllocation {
   // Whether this allocation is readonly i.e. backed by memory we cannot write
   // to.
   bool is_readonly() const {
-    return is_entry_computation_parameter() || is_constant();
+    // Entry parameters are generally readonly, except when they are aliased
+    // with any output.
+    return (is_entry_computation_parameter() &&
+            !is_parameter_aliased_with_output_) ||
+           is_constant();
   }
 
   bool is_tuple() const { return is_tuple_; }
@@ -186,9 +190,10 @@ class BufferAllocation {
              end > other.offset_;
     }
 
-    struct Hasher {
-      size_t operator()(Slice s) const;
-    };
+    template <typename H>
+    friend H AbslHashValue(H h, const Slice& s) {
+      return H::combine(std::move(h), s.index(), s.offset(), s.size());
+    }
 
     string ToString() const;
 
@@ -273,8 +278,10 @@ class BufferAllocation {
   void AddAssignment(const LogicalBuffer& buffer, int64 offset, int64 size);
 
   void set_entry_computation_parameter(int64 parameter_number,
-                                       ShapeIndex param_shape_index) {
+                                       ShapeIndex param_shape_index,
+                                       bool parameter_aliased_with_output) {
     is_entry_computation_parameter_ = true;
+    is_parameter_aliased_with_output_ = parameter_aliased_with_output;
     parameter_number_ = parameter_number;
     param_shape_index_ = std::move(param_shape_index);
   }
@@ -304,6 +311,9 @@ class BufferAllocation {
   // outlast the computation.
   bool is_entry_computation_parameter_ = false;
 
+  // Whether this entry computation parameter is aliased with output.
+  bool is_parameter_aliased_with_output_ = false;
+
   // If this allocation holds an entry computation parameter, this field
   // indicates the index (starting from 0) of the parameter.
   int64 parameter_number_ = 0;
diff --git a/tensorflow/compiler/xla/service/buffer_assignment_test.cc b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
index 8f482e6ba8c3e71c9980be5e6947ea61f3b4ef29..580bc2f43384006eab8711490689a200fc887d37 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/container/flat_hash_set.h"
 #include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/buffer_value.h"
@@ -309,7 +310,7 @@ class BufferAssignmentTest : public HloTestBase {
 static bool BuffersDistinct(const std::vector<const HloInstruction*>& a,
                             const std::vector<const HloInstruction*>& b,
                             const BufferAssignment& assignment) {
-  std::set<BufferAllocation::Slice> a_slices;
+  absl::flat_hash_set<BufferAllocation::Slice> a_slices;
   for (const HloInstruction* instruction : a) {
     if (assignment.HasTopLevelAllocation(instruction)) {
       a_slices.insert(
@@ -319,8 +320,8 @@ static bool BuffersDistinct(const std::vector<const HloInstruction*>& a,
 
   for (const HloInstruction* instruction : b) {
     if (assignment.HasTopLevelAllocation(instruction)) {
-      if (a_slices.count(assignment.GetUniqueTopLevelSlice(instruction)
-                             .ConsumeValueOrDie())) {
+      if (a_slices.contains(assignment.GetUniqueTopLevelSlice(instruction)
+                                .ConsumeValueOrDie())) {
         return false;
       }
     }
@@ -464,6 +465,40 @@ TEST_F(BufferAssignmentTest, Basic) {
   GetAssignedOutputAllocation(*buffers, sub);
 }
 
+TEST_F(BufferAssignmentTest, AliasedParamCanBeReused) {
+  // If an input buffer and output buffer aliases, the input buffer can be
+  // reused for other intermediate results.
+  //
+  // param0[100] ----- (neg1) -- (neg2)
+  //    |                           |
+  //    + -------- Aliased ---------+
+
+  auto builder = HloComputation::Builder(TestName());
+
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, f32vec100_, "p0"));
+  auto neg_1 = builder.AddInstruction(
+      HloInstruction::CreateUnary(f32vec100_, HloOpcode::kNegate, param));
+  auto neg_2 = builder.AddInstruction(
+      HloInstruction::CreateUnary(f32vec100_, HloOpcode::kNegate, neg_1));
+
+  auto module = CreateNewVerifiedModule();
+  module->AddEntryComputation(builder.Build());
+
+  TF_ASSERT_OK(module->input_output_alias_config().SetUpAlias(
+      {}, 0, {}, HloInputOutputAliasConfig::kUserAlias));
+
+  auto buffers = RunBufferAssignment(module.get());
+
+  BufferAllocation param_buffer = GetAssignedInputAllocation(*buffers, param);
+  BufferAllocation neg_1_buffer = GetAllocation(*buffers, neg_1, {});
+  BufferAllocation neg_2_buffer = GetAllocation(*buffers, neg_2, {});
+
+  // Everything use one buffer.
+  EXPECT_EQ(param_buffer.index(), neg_1_buffer.index());
+  EXPECT_EQ(neg_2_buffer.index(), neg_1_buffer.index());
+}
+
 TEST_F(BufferAssignmentTest, AddCannotReuse) {
   // Pass in a special rule to indicate that "add" cannot reuse any buffer.
   //
@@ -2485,9 +2520,9 @@ while_body {
   get-tuple-element.3 = s32[] get-tuple-element(state), index=0
   constant.2 = s32[] constant(128)
   add.5 = s32[] add(get-tuple-element.3, constant.2)
-  constant.3 = s32[3]{0} constant({0, 0, 0})
-  dynamic-update-slice.5 = f32[1280,1,128]{2,1,0} dynamic-update-slice(get-tuple-element.4, broadcast.6, constant.3)
-  dynamic-update-slice.9 = f32[1280,1,128]{2,1,0} dynamic-update-slice(dynamic-update-slice.5, broadcast.6, constant.3)
+  constant.3 = s32[] constant(0)
+  dynamic-update-slice.5 = f32[1280,1,128]{2,1,0} dynamic-update-slice(get-tuple-element.4, broadcast.6, constant.3, constant.3, constant.3)
+  dynamic-update-slice.9 = f32[1280,1,128]{2,1,0} dynamic-update-slice(dynamic-update-slice.5, broadcast.6, constant.3, constant.3, constant.3)
   ROOT tuple.85 = (s32[], s32[], s32[2]{0}, f32[1280,1,128]{2,1,0}) tuple(add.5, dynamic-update-slice.9)
 }
 
diff --git a/tensorflow/compiler/xla/service/buffer_liveness_test.cc b/tensorflow/compiler/xla/service/buffer_liveness_test.cc
index 40825a78716b1c0b9fb0121787977d275891c0f8..23b9af0281b0d5ee1ef6ca2315f0cc1042285609 100644
--- a/tensorflow/compiler/xla/service/buffer_liveness_test.cc
+++ b/tensorflow/compiler/xla/service/buffer_liveness_test.cc
@@ -52,8 +52,8 @@ class BufferLivenessTest : public HloTestBase {
   // interfere. Precondition: 'a' and 'b' are array-shaped.
   bool InstructionsMayInterfere(const BufferLiveness& liveness,
                                 HloInstruction* a, HloInstruction* b) {
-    EXPECT_FALSE(ShapeUtil::IsTuple(a->shape()));
-    EXPECT_FALSE(ShapeUtil::IsTuple(b->shape()));
+    EXPECT_FALSE(a->shape().IsTuple());
+    EXPECT_FALSE(b->shape().IsTuple());
     return liveness.MayInterfere(
         GetBuffer(liveness, /*instruction=*/a, /*index=*/{}),
         GetBuffer(liveness, /*instruction=*/b, /*index=*/{}));
@@ -66,8 +66,8 @@ class BufferLivenessTest : public HloTestBase {
                                  HloInstruction* a, HloInstruction* b,
                                  const ShapeIndex& index) {
     // Check that top-level shapes are tuple and tuple element shapes are equal.
-    EXPECT_TRUE(ShapeUtil::IsTuple(a->shape()));
-    EXPECT_TRUE(ShapeUtil::IsTuple(b->shape()));
+    EXPECT_TRUE(a->shape().IsTuple());
+    EXPECT_TRUE(b->shape().IsTuple());
     EXPECT_TRUE(
         ShapeUtil::Compatible(ShapeUtil::GetSubshape(a->shape(), index),
                               ShapeUtil::GetSubshape(b->shape(), index)));
@@ -638,10 +638,10 @@ class FusedDynamicUpdateSliceLivenessTest : public BufferLivenessTest {
     }
     // Create a DynamicUpdateSlice instruction of tuple element 1 with 'update'.
     auto starts = builder.AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({2})));
+        HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(2)));
     auto dynamic_update_slice =
         builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
-            data_shape, gte1, update, starts));
+            data_shape, gte1, update, {starts}));
     // Create output tuple.
     builder.AddInstruction(
         HloInstruction::CreateTuple({gte0, dynamic_update_slice}));
@@ -794,10 +794,10 @@ class DynamicUpdateSliceLivenessTest : public BufferLivenessTest {
     }
     // Create a DynamicUpdateSlice instruction of tuple element 1 with 'update'.
     auto starts = builder.AddInstruction(
-        HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({2})));
+        HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(2)));
     auto dynamic_update_slice =
         builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
-            data_shape, gte1, update, starts));
+            data_shape, gte1, update, {starts}));
     // Create output tuple.
     auto tuple_root = builder.AddInstruction(
         HloInstruction::CreateTuple({gte0, dynamic_update_slice}));
diff --git a/tensorflow/compiler/xla/service/buffer_value.cc b/tensorflow/compiler/xla/service/buffer_value.cc
index fdf822c666b15afbc7553ca89d4f92ab08201869..b1abba20689915b03304aacd7a5fcca5443c2c60 100644
--- a/tensorflow/compiler/xla/service/buffer_value.cc
+++ b/tensorflow/compiler/xla/service/buffer_value.cc
@@ -29,8 +29,8 @@ BufferValue::BufferValue(HloInstruction* instruction, const ShapeIndex& index,
                          Id id)
     : id_(id) {
   const Shape& shape = ShapeUtil::GetSubshape(instruction->shape(), index);
-  is_array_ = ShapeUtil::IsArray(shape);
-  is_tuple_ = ShapeUtil::IsTuple(shape);
+  is_array_ = shape.IsArray();
+  is_tuple_ = shape.IsTuple();
 }
 
 BufferValue::~BufferValue() {}
diff --git a/tensorflow/compiler/xla/service/call_graph.cc b/tensorflow/compiler/xla/service/call_graph.cc
index 7987343bfaf1069fd550909d127e4b11f2124701..98304757cae91d22466ed25f8c6e36ce90a848db 100644
--- a/tensorflow/compiler/xla/service/call_graph.cc
+++ b/tensorflow/compiler/xla/service/call_graph.cc
@@ -58,12 +58,13 @@ CallContext GetInstructionCallContext(HloOpcode opcode) {
     case HloOpcode::kConditional:
     case HloOpcode::kWhile:
       return CallContext::kSequential;
-    case HloOpcode::kCrossReplicaSum:
+    case HloOpcode::kAllReduce:
     case HloOpcode::kMap:
     case HloOpcode::kReduce:
     case HloOpcode::kReduceWindow:
     case HloOpcode::kScatter:
     case HloOpcode::kSelectAndScatter:
+    case HloOpcode::kSort:
     case HloOpcode::kFusion:
       return CallContext::kParallel;
     default:
@@ -236,6 +237,41 @@ void CallGraph::SetCallContexts() {
   }
 }
 
+void CallGraph::SetNodeDepths() {
+  std::queue<CallGraphNode*> worklist;
+
+  // Initialize node depths to -1.
+  for (CallGraphNode& node : nodes_) {
+    node.set_depth(-1);
+  }
+
+  // Initialize worklist with all roots of the call graph (computations without
+  // callers).
+  for (const HloComputation* computation : module_->computations()) {
+    CallGraphNode& node = GetNode(computation);
+    if (node.callers().empty()) {
+      node.set_depth(0);
+      worklist.push(&node);
+    }
+  }
+
+  while (!worklist.empty()) {
+    CallGraphNode* node = worklist.front();
+    worklist.pop();
+    for (const HloComputation* callee : node->callees()) {
+      CallGraphNode& callee_node = GetNode(callee);
+      if (callee_node.depth() < node->depth() + 1) {
+        callee_node.set_depth(node->depth() + 1);
+        worklist.push(&callee_node);
+      }
+    }
+  }
+
+  for (CallGraphNode& node : nodes_) {
+    CHECK_NE(node.depth(), -1);
+  }
+}
+
 /* static */
 std::unique_ptr<CallGraph> CallGraph::Build(const HloModule* module) {
   // Constructor for CallGraph is private so absl::make_unique can't be used.
@@ -271,6 +307,8 @@ std::unique_ptr<CallGraph> CallGraph::Build(const HloModule* module) {
   }
 
   call_graph->SetCallContexts();
+  call_graph->SetNodeDepths();
+
   XLA_VLOG_LINES(1, call_graph->ToString());
 
   return call_graph;
@@ -352,15 +390,38 @@ CallGraph::NearestAncestorsInSameComputation(HloInstruction* a,
 
   // Iterate through the callee->caller chains and find the earliest common
   // element.
-  for (HloInstruction* a_ancestor = a; a_ancestor != nullptr;
-       a_ancestor = next_caller(a_ancestor)) {
-    for (HloInstruction* b_ancestor = b; b_ancestor != nullptr;
-         b_ancestor = next_caller(b_ancestor)) {
-      if (a_ancestor->parent() == b_ancestor->parent()) {
-        return {a_ancestor, b_ancestor};
+  HloInstruction* a_ancestor = a;
+  HloInstruction* b_ancestor = b;
+  int a_depth = GetNode(a->parent()).depth();
+  int b_depth = GetNode(b->parent()).depth();
+
+  // Advance a_ancestor (b_ancestor) up the call chain until the call depth of
+  // a_ancestor or b_ancestor are the same. Necessarily each call to next_caller
+  // reduces the depth by exactly one.
+  if (a_depth > b_depth) {
+    for (int i = 0; i < a_depth - b_depth; ++i) {
+      a_ancestor = next_caller(a_ancestor);
+      if (a_ancestor == nullptr) {
+        return {nullptr, nullptr};
+      }
+    }
+  } else if (b_depth > a_depth) {
+    for (int i = 0; i < b_depth - a_depth; ++i) {
+      b_ancestor = next_caller(b_ancestor);
+      if (b_ancestor == nullptr) {
+        return {nullptr, nullptr};
       }
     }
   }
+
+  while ((a_ancestor != nullptr) && (b_ancestor != nullptr)) {
+    if (a_ancestor->parent() == b_ancestor->parent()) {
+      return {a_ancestor, b_ancestor};
+    }
+
+    a_ancestor = next_caller(a_ancestor);
+    b_ancestor = next_caller(b_ancestor);
+  }
   return {nullptr, nullptr};
 }
 
diff --git a/tensorflow/compiler/xla/service/call_graph.h b/tensorflow/compiler/xla/service/call_graph.h
index 05c7c998738f861ee804d1ec87bfa5fb17ddfb74..57a636fd740995d6cce933fe19d5592a64bde5cf 100644
--- a/tensorflow/compiler/xla/service/call_graph.h
+++ b/tensorflow/compiler/xla/service/call_graph.h
@@ -30,7 +30,7 @@ namespace xla {
 
 // The context in which a computation is called by another computation.
 enum class CallContext {
-  // In a parallel contex the computation is applied to each element of the
+  // In a parallel context the computation is applied to each element of the
   // array argument(s). kMap and kReduce instructions call computations in
   // parallel context.
   kParallel,
@@ -121,6 +121,11 @@ class CallGraphNode {
   // Returns the context in which this computation is called.
   CallContext context() const { return context_; }
 
+  // Returns the depth of this node in the call graph. The depth is defined as
+  // the length of the longest call chain from a computation with no callers
+  // (usually the entry computation node) to this node.
+  int depth() const { return depth_; }
+
   string ToString() const;
 
  private:
@@ -130,6 +135,9 @@ class CallGraphNode {
   // Sets the context in which this computation is called.
   void set_context(CallContext value) { context_ = value; }
 
+  // Sets the depth of this node in the graph.
+  void set_depth(int value) { depth_ = value; }
+
   // Adds a callsite which calls this computation. Updates callers to include
   // the calling computation.
   void AddCallerCallSite(const CallSite& caller_callsite);
@@ -164,6 +172,9 @@ class CallGraphNode {
 
   // The context in which this computation is called.
   CallContext context_ = CallContext::kNone;
+
+  // The depth of this node in the call graph.
+  int depth_ = 0;
 };
 
 // The call graph for an HLO module. The graph includes a node for each
@@ -245,9 +256,16 @@ class CallGraph {
  private:
   CallGraph(const HloModule* module);
 
+  // Not copyable.
+  CallGraph(const CallGraph&) = delete;
+  CallGraph& operator=(const CallGraph&) = delete;
+
   // Sets the call contexts for every node in the graph.
   void SetCallContexts();
 
+  // Sets the call node depths for every node in the graph.
+  void SetNodeDepths();
+
   // Helper method for VisitNodes(). Traverses the call graph from 'node' in DFS
   // post order (callee before caller) calling visitor_func on each node. Adds
   // nodes to 'visited' as each node is visited. Skips nodes already in
diff --git a/tensorflow/compiler/xla/service/call_graph_test.cc b/tensorflow/compiler/xla/service/call_graph_test.cc
index a3ac2568b0f3eec8556a42dbe3c2c64bd8564468..5de724f8924b78008ba4c56603b61bf93fbc5e7c 100644
--- a/tensorflow/compiler/xla/service/call_graph_test.cc
+++ b/tensorflow/compiler/xla/service/call_graph_test.cc
@@ -102,6 +102,7 @@ TEST_F(CallGraphTest, SingletonComputation) {
 
   const CallGraphNode& node = call_graph->GetNode(computation);
   EXPECT_EQ(computation, node.computation());
+  EXPECT_EQ(node.depth(), 0);
   EXPECT_TRUE(node.callsites().empty());
   EXPECT_TRUE(node.callees().empty());
   EXPECT_TRUE(node.caller_callsites().empty());
@@ -122,11 +123,13 @@ TEST_F(CallGraphTest, UnreachableComputation) {
   EXPECT_EQ(2, call_graph->nodes().size());
 
   const CallGraphNode& entry_node = call_graph->GetNode(entry_computation);
+  EXPECT_EQ(entry_node.depth(), 0);
   EXPECT_EQ(entry_computation, entry_node.computation());
   EXPECT_EQ(CallContext::kSequential, entry_node.context());
 
   const CallGraphNode& unreachable_node =
       call_graph->GetNode(unreachable_computation);
+  EXPECT_EQ(unreachable_node.depth(), 0);
   EXPECT_EQ(unreachable_computation, unreachable_node.computation());
   EXPECT_EQ(CallContext::kSequential, unreachable_node.context());
 }
@@ -145,6 +148,7 @@ TEST_F(CallGraphTest, ParallelComputation) {
 
   const CallGraphNode& entry_node = call_graph->GetNode(entry_computation);
   EXPECT_EQ(entry_computation, entry_node.computation());
+  EXPECT_EQ(entry_node.depth(), 0);
   EXPECT_EQ(CallContext::kSequential, entry_node.context());
   EXPECT_EQ(5, entry_node.callsites().size());
   EXPECT_EQ(1, entry_node.callees().size());
@@ -153,6 +157,7 @@ TEST_F(CallGraphTest, ParallelComputation) {
 
   const CallGraphNode& map_node = call_graph->GetNode(map_computation);
   EXPECT_EQ(map_computation, map_node.computation());
+  EXPECT_EQ(map_node.depth(), 1);
   EXPECT_EQ(CallContext::kParallel, map_node.context());
   EXPECT_TRUE(map_node.callsites().empty());
   EXPECT_TRUE(map_node.callees().empty());
@@ -234,6 +239,7 @@ TEST_F(CallGraphTest, ContextBothComputations) {
   EXPECT_EQ(entry_node.GetCallSite(map), &map_callsite);
 
   const CallGraphNode& sub_node = call_graph->GetNode(subcomputation);
+  EXPECT_EQ(sub_node.depth(), 1);
   EXPECT_EQ(CallContext::kBoth, sub_node.context());
 }
 
@@ -264,6 +270,7 @@ TEST_F(CallGraphTest, ComputationWithConditional) {
   EXPECT_EQ(3, call_graph->nodes().size());
 
   const CallGraphNode& entry_node = call_graph->GetNode(entry_computation);
+  EXPECT_EQ(entry_node.depth(), 0);
   EXPECT_EQ(entry_computation, entry_node.computation());
   EXPECT_EQ(1, entry_node.callsites().size());
 
@@ -275,11 +282,13 @@ TEST_F(CallGraphTest, ComputationWithConditional) {
   EXPECT_EQ(entry_node.GetCallSite(conditional), &conditional_callsite);
 
   const CallGraphNode& true_node = call_graph->GetNode(true_computation);
+  EXPECT_EQ(true_node.depth(), 1);
   EXPECT_TRUE(true_node.callees().empty());
   EXPECT_EQ(1, true_node.callers().size());
   EXPECT_EQ(entry_computation, true_node.callers()[0]);
 
   const CallGraphNode& false_node = call_graph->GetNode(false_computation);
+  EXPECT_EQ(false_node.depth(), 1);
   EXPECT_TRUE(false_node.callees().empty());
   EXPECT_EQ(1, false_node.callers().size());
   EXPECT_EQ(entry_computation, false_node.callers()[0]);
@@ -332,9 +341,21 @@ TEST_F(CallGraphTest, ComplexGraph) {
   EXPECT_EQ(5, call_graph->nodes().size());
   EXPECT_FALSE(call_graph->IsFlattened());
 
+  const CallGraphNode& entry_node = call_graph->GetNode(entry_computation);
+  const CallGraphNode& a_node = call_graph->GetNode(a_computation);
+  const CallGraphNode& b_node = call_graph->GetNode(b_computation);
+  const CallGraphNode& c_node = call_graph->GetNode(c_computation);
+  const CallGraphNode& cond_node = call_graph->GetNode(cond_computation);
+
+  // Verify depths.
+  EXPECT_EQ(entry_node.depth(), 0);
+  EXPECT_EQ(a_node.depth(), 1);
+  EXPECT_EQ(b_node.depth(), 2);
+  EXPECT_EQ(c_node.depth(), 3);
+  EXPECT_EQ(cond_node.depth(), 2);
+
   // Entry computation has one while instruction calling two computations
   // (cond_computation and a_computation).
-  const CallGraphNode& entry_node = call_graph->GetNode(entry_computation);
   ASSERT_EQ(1, entry_node.callsites().size());
   const std::vector<HloComputation*>& called_computations =
       entry_node.callsites()[0].called_computations();
@@ -342,7 +363,6 @@ TEST_F(CallGraphTest, ComplexGraph) {
               UnorderedElementsAre(cond_computation, a_computation));
   EXPECT_EQ(CallContext::kSequential, entry_node.context());
 
-  const CallGraphNode& c_node = call_graph->GetNode(c_computation);
   EXPECT_TRUE(c_node.callsites().empty());
   EXPECT_THAT(c_node.callers(),
               UnorderedElementsAre(a_computation, b_computation));
@@ -364,7 +384,7 @@ TEST_F(CallGraphTest, ComplexGraph) {
 
   // Verify visitation order of some computations in the graph.
   auto index_of = [&visited](const HloComputation* comp) {
-    auto it = std::find(visited.begin(), visited.end(), comp);
+    auto it = absl::c_find(visited, comp);
     EXPECT_NE(it, visited.end());
     return std::distance(visited.begin(), it);
   };
diff --git a/tensorflow/compiler/xla/service/channel_tracker.cc b/tensorflow/compiler/xla/service/channel_tracker.cc
index 3c2d1ae6d82ebc6c10d52194fd1cec5e291025f7..b517495f2ea0c75679685c67f757ff586f8c79e3 100644
--- a/tensorflow/compiler/xla/service/channel_tracker.cc
+++ b/tensorflow/compiler/xla/service/channel_tracker.cc
@@ -72,7 +72,7 @@ ChannelHandle ChannelTracker::AllocateHandle(ChannelHandle::ChannelType type) {
 }
 
 Status ChannelTracker::RegisterSendInternal(const ChannelHandle& handle) {
-  if (opaque_to_channel_.count(handle.handle()) == 0) {
+  if (!opaque_to_channel_.contains(handle.handle())) {
     return NotFound("channel handle not found: %d", handle.handle());
   }
   Channel& channel = opaque_to_channel_[handle.handle()];
@@ -94,7 +94,7 @@ Status ChannelTracker::RegisterSendInternal(const ChannelHandle& handle) {
 }
 
 Status ChannelTracker::RegisterRecvInternal(const ChannelHandle& handle) {
-  if (opaque_to_channel_.count(handle.handle()) == 0) {
+  if (!opaque_to_channel_.contains(handle.handle())) {
     return NotFound("channel handle not found: %d", handle.handle());
   }
   Channel& channel = opaque_to_channel_[handle.handle()];
diff --git a/tensorflow/compiler/xla/service/channel_tracker.h b/tensorflow/compiler/xla/service/channel_tracker.h
index 52037bf9b52556c6aa2e66dd3209e25cf085cfe3..89e17eba36f23077ce4cf0704e7455b76bee68d1 100644
--- a/tensorflow/compiler/xla/service/channel_tracker.h
+++ b/tensorflow/compiler/xla/service/channel_tracker.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <map>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/status.h"
@@ -83,7 +84,8 @@ class ChannelTracker {
 
   // Mapping from ChannelHandle value to the corresponding registered
   // Channel object.
-  std::map<int64, Channel> opaque_to_channel_ GUARDED_BY(channel_mutex_);
+  absl::flat_hash_map<int64, Channel> opaque_to_channel_
+      GUARDED_BY(channel_mutex_);
 
   TF_DISALLOW_COPY_AND_ASSIGN(ChannelTracker);
 };
diff --git a/tensorflow/compiler/xla/service/compiler.cc b/tensorflow/compiler/xla/service/compiler.cc
index 8f08c244908efb823b3870c19bdc3491fa87d44f..653f4555a77cc82e91fb1cd26206b93826375732 100644
--- a/tensorflow/compiler/xla/service/compiler.cc
+++ b/tensorflow/compiler/xla/service/compiler.cc
@@ -98,10 +98,17 @@ Compiler::GetPlatformCompilers() {
   auto* factories = GetPlatformCompilerFactories();
   auto it = factories->find(platform->id());
   if (it == factories->end()) {
+    string hint;
+    if (platform->Name() == "Host") {
+      hint = " (hint: try linking in tensorflow/compiler/jit:xla_cpu_jit)";
+    } else if (platform->Name() == "CUDA") {
+      hint = " (hint: try linking in tensorflow/compiler/jit:xla_gpu_jit)";
+    }
+
     return NotFound(
         "could not find registered compiler for platform %s -- check "
-        "target linkage",
-        platform->Name());
+        "target linkage%s",
+        platform->Name(), hint);
   }
 
   // And then we invoke the factory, placing the result into the mapping.
diff --git a/tensorflow/compiler/xla/service/computation_layout.cc b/tensorflow/compiler/xla/service/computation_layout.cc
index efc893818d03a20d6bd65b7dc1da72ea5da5ceb0..92d1ca4ba5da802a5f1c544017ac52dda38e9b1d 100644
--- a/tensorflow/compiler/xla/service/computation_layout.cc
+++ b/tensorflow/compiler/xla/service/computation_layout.cc
@@ -42,8 +42,8 @@ void ComputationLayout::SetToDefaultLayout() {
 }
 
 bool ComputationLayout::LayoutIsSet() const {
-  return std::all_of(parameter_layouts_.begin(), parameter_layouts_.end(),
-                     [](const ShapeLayout& s) { return s.LayoutIsSet(); }) &&
+  return absl::c_all_of(parameter_layouts_,
+                        [](const ShapeLayout& s) { return s.LayoutIsSet(); }) &&
          result_layout_.LayoutIsSet();
 }
 
diff --git a/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc b/tensorflow/compiler/xla/service/convolution_group_converter.cc
similarity index 68%
rename from tensorflow/compiler/xla/service/convolution_feature_group_converter.cc
rename to tensorflow/compiler/xla/service/convolution_group_converter.cc
index 95c7724c3c93507ae61a984301ecfc0111bef192..f11f9e5fc2949a92f83ff66506a9b162ffda1c92 100644
--- a/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc
+++ b/tensorflow/compiler/xla/service/convolution_group_converter.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/service/convolution_feature_group_converter.h"
+#include "tensorflow/compiler/xla/service/convolution_group_converter.h"
 
 #include <memory>
 #include <vector>
@@ -50,8 +50,12 @@ class ConvolutionVisitor : public DfsHloVisitorWithDefault {
 
   Status HandleConvolution(HloInstruction* convolution) override;
 
+  Status HandleBatchGroupCount(HloInstruction* convolution);
+
   // Runs the visitor on a computation.
   static bool Run(HloComputation* computation,
+                  std::function<bool(HloInstruction*)> is_cost_viable,
+                  bool convert_batch_groups_only,
                   bool canonicalize_depthwise_filter);
 
   // Returns whether any convolution ops were rewritten.
@@ -60,10 +64,15 @@ class ConvolutionVisitor : public DfsHloVisitorWithDefault {
   ~ConvolutionVisitor() override = default;
 
  private:
-  explicit ConvolutionVisitor(HloComputation* computation,
-                              bool canonicalize_depthwise_filter = false)
+  explicit ConvolutionVisitor(
+      HloComputation* computation,
+      std::function<bool(HloInstruction*)> is_cost_viable,
+      bool convert_batch_groups_only,
+      bool canonicalize_depthwise_filter = false)
       : computation_(computation),
-        filter_expansion_(!canonicalize_depthwise_filter) {}
+        filter_expansion_(!canonicalize_depthwise_filter),
+        convert_batch_groups_only_(convert_batch_groups_only),
+        is_cost_viable_(is_cost_viable) {}
 
   // Current HloComputation instance the ConvolutionVisitor is traversing.
   HloComputation* computation_;
@@ -73,11 +82,21 @@ class ConvolutionVisitor : public DfsHloVisitorWithDefault {
 
   // Whether filter expansion is required.
   bool filter_expansion_;
+
+  // Decides whether to convert batch groups or feature groups.
+  bool convert_batch_groups_only_;
+
+  // std::function<std::vector<LloValue*>(int64, int64)> chunk_fetcher
+  std::function<bool(HloInstruction*)> is_cost_viable_;
 };
 
-bool ConvolutionVisitor::Run(HloComputation* computation,
-                             bool canonicalize_depthwise_filter) {
-  ConvolutionVisitor visitor(computation, canonicalize_depthwise_filter);
+bool ConvolutionVisitor::Run(
+    HloComputation* computation,
+    std::function<bool(HloInstruction*)> is_cost_viable,
+    bool convert_batch_groups_only, bool canonicalize_depthwise_filter) {
+  ConvolutionVisitor visitor(computation, is_cost_viable,
+                             convert_batch_groups_only,
+                             canonicalize_depthwise_filter);
   TF_CHECK_OK(computation->Accept(&visitor));
   return visitor.changed_;
 }
@@ -176,18 +195,143 @@ HloInstruction* GetExpandedFilterMask(
       predicate_shape, HloOpcode::kEq, broadcasted_mask1, broadcasted_mask2));
 }
 
-Status ConvolutionVisitor::HandleConvolution(HloInstruction* convolution) {
-  int64 group_count = convolution->feature_group_count();
-  if (group_count == 1) {
+// This function handles batch_group_counts which are relevant only for
+// depthwise backprop filter convolutions.
+Status ConvolutionVisitor::HandleBatchGroupCount(HloInstruction* convolution) {
+  auto dim_numbers = convolution->convolution_dimension_numbers();
+  auto activation = convolution->mutable_operand(0);
+  auto filter = convolution->mutable_operand(1);
+  int64 batch_group_count = convolution->batch_group_count();
+
+  if (batch_group_count == 1) {
     return Status::OK();
   }
-  auto filter = convolution->mutable_operand(1);
-  changed_ = true;
+
+  VLOG(2) << "Dealing with batch_group_count " << batch_group_count
+          << " for convolution " << convolution->ToString() << "\n";
+
+  auto add = [&](std::unique_ptr<HloInstruction> inst) {
+    return computation_->AddInstruction(std::move(inst));
+  };
+
+  int64 input_batch_dimension = dim_numbers.input_batch_dimension();
+  int64 output_batch_dimension = dim_numbers.output_batch_dimension();
+  int64 output_feature_dimension = dim_numbers.output_feature_dimension();
+
+  int64 input_batch = activation->shape().dimensions(input_batch_dimension);
+
+  // We are not yet supporting batch_group of sizes greater than 1.
+  TF_RET_CHECK(input_batch == batch_group_count);
+
+  if (!is_cost_viable_(convolution) || filter_expansion_) {
+    // We first obtain the expanded the filter (which is the convolution
+    // output). The batch dimension is the expanded one (which originally
+    // represents kernel input feature dimension). We mask the filter to zero
+    // out the expanded regions. Next we reduce the filter in the batch
+    // dimension to obtain the original filter size.
+
+    HloInstruction* filter_mask =
+        GetExpandedFilterMask(convolution->shape(), output_batch_dimension,
+                              output_feature_dimension, batch_group_count, add);
+    auto expanded_filter_shape = ExpandedFilterShape(
+        convolution->shape(), batch_group_count, output_batch_dimension);
+
+    auto new_convolution = add(HloInstruction::CreateConvolve(
+        expanded_filter_shape, activation, filter,
+        /*feature_group_count=*/1, /*batch_group_count=*/1,
+        convolution->window(), dim_numbers, convolution->precision_config()));
+
+    auto zero = add(HloInstruction::CreateConstant(
+        LiteralUtil::Zero(expanded_filter_shape.element_type())));
+    auto zero_filter =
+        add(HloInstruction::CreateBroadcast(expanded_filter_shape, zero, {}));
+
+    auto new_filter = add(HloInstruction::CreateTernary(
+        expanded_filter_shape, HloOpcode::kSelect, filter_mask, new_convolution,
+        zero_filter));
+
+    PrimitiveType reduce_type = new_filter->shape().element_type();
+    auto reduce_window_shape = new_convolution->shape();
+    reduce_window_shape.set_dimensions(output_batch_dimension, 1);
+
+    // Ensure that data input to reduce window uses at least 32 bits.
+    if (primitive_util::BitWidth(reduce_type) < primitive_util::BitWidth(F32)) {
+      reduce_type = F32;
+      reduce_window_shape.set_element_type(F32);
+      Shape convert_shape = new_filter->shape();
+      convert_shape.set_element_type(F32);
+      new_filter =
+          add(HloInstruction::CreateConvert(convert_shape, new_filter));
+    }
+
+    auto zero_literal = LiteralUtil::Zero(reduce_type);
+    auto zero_scalar =
+        add(HloInstruction::CreateConstant(std::move(zero_literal)));
+
+    auto reduce_function = [&]() -> HloComputation* {
+      HloComputation::Builder b("add_computation");
+      Shape shape = ShapeUtil::MakeShape(reduce_type, {});
+      auto lhs =
+          b.AddInstruction(HloInstruction::CreateParameter(0, shape, "lhs"));
+      auto rhs =
+          b.AddInstruction(HloInstruction::CreateParameter(1, shape, "rhs"));
+      auto scalar_op = b.AddInstruction(
+          HloInstruction::CreateBinary(shape, HloOpcode::kAdd, lhs, rhs));
+      return computation_->parent()->AddEmbeddedComputation(b.Build(scalar_op));
+    };
+
+    // Create the reduce window.
+    Window window;
+    for (int64 i = 0; i < new_convolution->shape().dimensions_size(); ++i) {
+      auto* dim = window.add_dimensions();
+      dim->set_padding_low(0);
+      dim->set_padding_high(0);
+      dim->set_window_dilation(1);
+      dim->set_base_dilation(1);
+      if (i == output_batch_dimension) {
+        dim->set_stride(batch_group_count);
+        dim->set_size(batch_group_count);
+      } else {
+        dim->set_stride(1);
+        dim->set_size(1);
+      }
+    }
+    auto reduce_window = add(HloInstruction::CreateReduceWindow(
+        reduce_window_shape, new_filter, zero_scalar, window,
+        reduce_function()));
+
+    Shape convert_back_shape = reduce_window->shape();
+    convert_back_shape.set_element_type(activation->shape().element_type());
+
+    // Convert reduced data back to the original data type.
+    auto reduce_window_converted =
+        HloInstruction::CreateConvert(convert_back_shape, reduce_window);
+
+    TF_RETURN_IF_ERROR(computation_->ReplaceWithNewInstruction(
+        convolution, std::move(reduce_window_converted)));
+    changed_ = true;
+  }
+
+  return Status::OK();
+}
+
+Status ConvolutionVisitor::HandleConvolution(HloInstruction* convolution) {
+  if (convert_batch_groups_only_) {
+    return HandleBatchGroupCount(convolution);
+  }
+
   auto add = [&](std::unique_ptr<HloInstruction> inst) {
     return computation_->AddInstruction(std::move(inst));
   };
 
+  int64 group_count = convolution->feature_group_count();
+  if (group_count == 1) {
+    return Status::OK();
+  }
+
+  changed_ = true;
   auto dim_numbers = convolution->convolution_dimension_numbers();
+  auto filter = convolution->mutable_operand(1);
   int64 kernel_input_feature_dim = dim_numbers.kernel_input_feature_dimension();
   int64 group_size = filter->shape().dimensions(kernel_input_feature_dim);
   int64 kernel_output_feature_dim =
@@ -205,6 +349,7 @@ Status ConvolutionVisitor::HandleConvolution(HloInstruction* convolution) {
     // If the code generator handles depthwise separable convolutions
     // inherently, then no filter expansion is needed.
     if (!filter_expansion_ && depthwise_separable) {
+      changed_ = false;
       return Status::OK();
     }
     // We want to repeat 'filter' in the 'input_feature_dim' dimension
@@ -233,8 +378,8 @@ Status ConvolutionVisitor::HandleConvolution(HloInstruction* convolution) {
 
     auto new_convolution = HloInstruction::CreateConvolve(
         convolution->shape(), convolution->mutable_operand(0), new_filter,
-        /*feature_group_count=*/1, convolution->window(), dim_numbers,
-        convolution->precision_config());
+        /*feature_group_count=*/1, /*batch_group_count=*/1,
+        convolution->window(), dim_numbers, convolution->precision_config());
     TF_RETURN_IF_ERROR(computation_->ReplaceWithNewInstruction(
         convolution, std::move(new_convolution)));
   } else {
@@ -294,8 +439,9 @@ Status ConvolutionVisitor::HandleConvolution(HloInstruction* convolution) {
       dim->set_size(group_size);
 
       auto new_convolution = add(HloInstruction::CreateConvolve(
-          new_output_shape, activation, filter, group_count, new_window,
-          dim_numbers, convolution->precision_config()));
+          new_output_shape, activation, filter, group_count,
+          /*batch_group_count=*/1, new_window, dim_numbers,
+          convolution->precision_config()));
 
       // Delete the extra spatial dimension, and reshape.
       Shape reshaped_convolution_shape =
@@ -372,7 +518,8 @@ Status ConvolutionVisitor::HandleConvolution(HloInstruction* convolution) {
 
         auto new_convolution = add(HloInstruction::CreateConvolve(
             conv_slice_shape, activation_slice, filter_slice,
-            /*feature_group_count=*/1, convolution->window(), dim_numbers,
+            /*feature_group_count=*/1, /*batch_group_count=*/1,
+            convolution->window(), dim_numbers,
             convolution->precision_config()));
 
         sliced_convolutions.push_back(new_convolution);
@@ -390,17 +537,19 @@ Status ConvolutionVisitor::HandleConvolution(HloInstruction* convolution) {
 
 }  // namespace
 
-StatusOr<bool> ConvolutionFeatureGroupConverter::Run(HloModule* module) {
-  XLA_VLOG_LINES(2, "ConvolutionFeatureGroupConverter::Run(), before:\n" +
-                        module->ToString());
+StatusOr<bool> ConvolutionGroupConverter::Run(HloModule* module) {
+  XLA_VLOG_LINES(
+      2, "ConvolutionGroupConverter::Run(), before:\n" + module->ToString());
   bool changed = false;
   for (auto* comp : module->MakeNonfusionComputations()) {
-    if (ConvolutionVisitor::Run(comp, filter_expansion_)) {
+    if (ConvolutionVisitor::Run(comp, is_cost_viable_,
+                                convert_batch_groups_only_,
+                                filter_expansion_)) {
       changed = true;
     }
   }
-  XLA_VLOG_LINES(2, "ConvolutionFeatureGroupConverter::Run(), after:\n" +
-                        module->ToString());
+  XLA_VLOG_LINES(
+      2, "ConvolutionGroupConverter::Run(), after:\n" + module->ToString());
   return changed;
 }
 
diff --git a/tensorflow/compiler/xla/service/convolution_feature_group_converter.h b/tensorflow/compiler/xla/service/convolution_group_converter.h
similarity index 58%
rename from tensorflow/compiler/xla/service/convolution_feature_group_converter.h
rename to tensorflow/compiler/xla/service/convolution_group_converter.h
index cb6bc04c00a2ff10f970da2a07fb540a561dad5a..1caf1841119a965044502435fe0f5b38ca94f6a5 100644
--- a/tensorflow/compiler/xla/service/convolution_feature_group_converter.h
+++ b/tensorflow/compiler/xla/service/convolution_group_converter.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CONVOLUTION_FEATURE_GROUP_CONVERTER_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_CONVOLUTION_FEATURE_GROUP_CONVERTER_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CONVOLUTION_GROUP_CONVERTER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CONVOLUTION_GROUP_CONVERTER_H_
 
 #include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
@@ -25,23 +25,34 @@ namespace xla {
 
 // A pass which rewrites convolutions with feature_group_count > 1 into
 // convolutions with feature_group_count = 1.
-class ConvolutionFeatureGroupConverter : public HloModulePass {
+class ConvolutionGroupConverter : public HloModulePass {
  public:
-  ConvolutionFeatureGroupConverter(bool canonicalize_depthwise_filter = false)
-      : filter_expansion_(canonicalize_depthwise_filter) {}
+  ConvolutionGroupConverter(std::function<bool(HloInstruction*)> is_cost_viable,
+                            bool convert_batch_groups_only,
+                            bool canonicalize_depthwise_filter = false)
+      : is_cost_viable_(is_cost_viable),
+        convert_batch_groups_only_(convert_batch_groups_only),
+        filter_expansion_(canonicalize_depthwise_filter) {}
 
   absl::string_view name() const override {
-    return "convolution-feature-group-converter";
+    return "convolution-group-converter";
   }
 
   // Run convolution rewriting on the given computation. Returns whether the
   // computation was changed.
   StatusOr<bool> Run(HloModule* module) override;
 
+  // Lambda containing cost model that decides whether to expand
+  // batch_group_count.
+  std::function<bool(HloInstruction*)> is_cost_viable_;
+
+  // Decides whether to convert batch groups or feature groups.
+  bool convert_batch_groups_only_;
+
   // Tells whether filter expansion is required.
   bool filter_expansion_;
 };
 
 }  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CONVOLUTION_FEATURE_GROUP_CONVERTER_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CONVOLUTION_GROUP_CONVERTER_H_
diff --git a/tensorflow/compiler/xla/service/convolution_feature_group_converter_test.cc b/tensorflow/compiler/xla/service/convolution_group_converter_test.cc
similarity index 68%
rename from tensorflow/compiler/xla/service/convolution_feature_group_converter_test.cc
rename to tensorflow/compiler/xla/service/convolution_group_converter_test.cc
index e6bf2143a21bd5001d3530fe8727c88504be1d43..9cee3eda95252d6c7d725fbb03030bd58f52e71f 100644
--- a/tensorflow/compiler/xla/service/convolution_feature_group_converter_test.cc
+++ b/tensorflow/compiler/xla/service/convolution_group_converter_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/service/convolution_feature_group_converter.h"
+#include "tensorflow/compiler/xla/service/convolution_group_converter.h"
 
 #include <memory>
 #include <string>
@@ -30,10 +30,10 @@ limitations under the License.
 namespace xla {
 namespace {
 
-using ConvolutionFeatureGroupConverterTest = HloTestBase;
+using ConvolutionGroupConverterTest = HloTestBase;
 namespace op = testing::opcode_matchers;
 
-TEST_F(ConvolutionFeatureGroupConverterTest,
+TEST_F(ConvolutionGroupConverterTest,
        ConvertFeatureGroupCountEqualToInputFeatureDim) {
   string hlo_string = R"(HloModule Convolve1D1Window_0_module
 
@@ -49,7 +49,8 @@ ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,2], filter: f32[1,1,2]) -> f32[1,2
   auto computation = module->entry_computation();
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kConvolution);
-  ConvolutionFeatureGroupConverter converter;
+  ConvolutionGroupConverter converter(nullptr, /*convert_batch_groups_only=*/
+                                      false);
   ASSERT_TRUE(converter.Run(module.get()).ValueOrDie());
   root = computation->root_instruction();
   // Make sure the convolution is converted to one with feature_group_count = 1.
@@ -63,7 +64,7 @@ ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,2], filter: f32[1,1,2]) -> f32[1,2
                          op::Broadcast(op::Constant())));
 }
 
-TEST_F(ConvolutionFeatureGroupConverterTest,
+TEST_F(ConvolutionGroupConverterTest,
        ConvertFeatureGroupCountDivisorOfInputFeatureDim) {
   string hlo_string = R"(HloModule Convolve1D1Window_0_module
 
@@ -79,7 +80,8 @@ ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,4], filter: f32[1,2,2]) -> f32[1,2
   auto computation = module->entry_computation();
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kConvolution);
-  ConvolutionFeatureGroupConverter converter;
+  ConvolutionGroupConverter converter(nullptr, /*convert_batch_groups_only=*/
+                                      false);
   ASSERT_TRUE(converter.Run(module.get()).ValueOrDie());
   root = computation->root_instruction();
   // Make sure the convolution is replaced with a concatenate.
@@ -92,5 +94,32 @@ ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,4], filter: f32[1,2,2]) -> f32[1,2
   EXPECT_EQ(root->operand(1)->feature_group_count(), 1);
 }
 
+TEST_F(ConvolutionGroupConverterTest,
+       ConvertBatchGroupCountEqualToInputBatchDim) {
+  string hlo_string = R"(HloModule Convolve1D1Window_0_module
+
+ENTRY %Convolve1D1Window_0.v3 (input: f32[16,19,19,512]{3,2,1,0}, filter: f32[16,19,19,512]{3,2,1,0}) -> f32[3,3,512,1]{3,2,1,0} {
+  %input = f32[16,19,19,512]{3,2,1,0} parameter(0)
+  %filter = f32[16,19,19,512]{3,2,1,0} parameter(1)
+  ROOT %convolution = f32[3,3,512,1]{3,2,1,0} convolution(f32[16,19,19,512]{3,2,1,0} %input, f32[16,19,19,512]{3,2,1,0} %filter), window={size=19x19 pad=1_1x1_1}, dim_labels=f01b_i01o->01fb, batch_group_count=512
+  })";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(hlo_string));
+
+  auto computation = module->entry_computation();
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kConvolution);
+  auto cost_model = [](HloInstruction* conv) { return false; };
+  ConvolutionGroupConverter converter(cost_model, /*convert_batch_groups_only=*/
+                                      true);
+  ASSERT_TRUE(converter.Run(module.get()).ValueOrDie());
+  root = computation->root_instruction();
+
+  // Verify that the convolution is replaced by a convert.
+  EXPECT_EQ(root->opcode(), HloOpcode::kConvert);
+  // Make sure the convert is being fed by a reduce window.
+  EXPECT_EQ(root->operand(0)->opcode(), HloOpcode::kReduceWindow);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc
index df6059663876dfde71f4c75d3931b3d2de72c1df..5e26a63cebfa9b2e50f4b13335c10c246999d4df 100644
--- a/tensorflow/compiler/xla/service/copy_insertion.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion.cc
@@ -349,11 +349,12 @@ Status AddCopiesForAliasedInputOutputs(HloModule* module) {
     ShapeTree<bool> param_indices_to_copy(param->shape());
 
     module->input_output_alias_config().ForEachAlias(
-        [&](const ShapeIndex& output_index, int64 param_number,
-            const ShapeIndex& param_index) {
-          if (param_number == param->parameter_number()) {
+        [&](const ShapeIndex& output_index,
+            const HloInputOutputAliasConfig::Alias& alias) {
+          if (alias.parameter_number == param->parameter_number()) {
             param_has_alias = true;
-            *(param_indices_to_copy.mutable_element(param_index)) = true;
+            *(param_indices_to_copy.mutable_element(alias.parameter_index)) =
+                true;
             *(output_indices_to_copy.mutable_element(output_index)) = true;
           }
         });
@@ -395,13 +396,14 @@ Status AddCopiesForAliasedInputOutputs(HloModule* module) {
 
   // Add control dependencies between the input/output copies.
   TF_RETURN_IF_ERROR(module->input_output_alias_config().ForEachAliasWithStatus(
-      [&](const ShapeIndex& output_index, int64 param_number,
-          const ShapeIndex& input_index) -> Status {
-        if (!copied_parameters[param_number]) {
+      [&](const ShapeIndex& output_index,
+          const HloInputOutputAliasConfig::Alias& alias) -> Status {
+        if (!copied_parameters[alias.parameter_number]) {
           return Status::OK();
         }
         HloInstruction* from =
-            copied_parameters[param_number]->element(input_index);
+            copied_parameters[alias.parameter_number]->element(
+                alias.parameter_index);
         HloInstruction* to = output_copy_tree.element(output_index);
 
         TF_RET_CHECK(from != nullptr);
@@ -522,7 +524,7 @@ class CopyRemover {
         // between copies added around aliased operations (kWhile) guarantees
         // this strict order.
         for (const HloValue* value_a : buffer.values()) {
-          if (ShapeUtil::IsToken(value_a->shape())) {
+          if (value_a->shape().IsToken()) {
             // Token values have no representation and cannot interfere.
             continue;
           }
@@ -539,10 +541,9 @@ class CopyRemover {
         }
 
         std::vector<const HloValue*> values = buffer.values();
-        std::sort(values.begin(), values.end(),
-                  [this](const HloValue* a, const HloValue* b) {
-                    return ordering_.IsDefinedBefore(*a, *b);
-                  });
+        absl::c_sort(values, [this](const HloValue* a, const HloValue* b) {
+          return ordering_.IsDefinedBefore(*a, *b);
+        });
 
         // Create a list containing all of the values in the buffer.
         AddValueList(values, &value_to_node);
@@ -842,12 +843,11 @@ class CopyRemover {
       copy_value_node->next->prev = operand_node;
 
       // Patch up uses. Remove use of copy from operand_node uses.
-      auto it =
-          std::find_if(operand_node->uses.begin(), operand_node->uses.end(),
-                       [copy_value_node](const HloUse* use) {
-                         return use->instruction ==
-                                copy_value_node->value->defining_instruction();
-                       });
+      auto it = absl::c_find_if(
+          operand_node->uses, [copy_value_node](const HloUse* use) {
+            return use->instruction ==
+                   copy_value_node->value->defining_instruction();
+          });
       CHECK(it != operand_node->uses.end());
       operand_node->uses.erase(it);
 
diff --git a/tensorflow/compiler/xla/service/copy_insertion_test.cc b/tensorflow/compiler/xla/service/copy_insertion_test.cc
index e4e9d7ba05c115be9dd0eb53ebd7de208d514efb..4391bdcba532661a0fde789e2c4ed324c40bcd32 100644
--- a/tensorflow/compiler/xla/service/copy_insertion_test.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion_test.cc
@@ -1376,9 +1376,11 @@ TEST_F(CopyInsertionTest, CrossingParameters) {
   builder.AddInstruction(HloInstruction::CreateTuple({gte1, gte0}));
   module->AddEntryComputation(builder.Build());
   ASSERT_IS_OK(module->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0}));
+      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
   ASSERT_IS_OK(module->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{1}));
+      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{1},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
   InsertCopies(module.get());
 
   EXPECT_EQ(CountCopies(*module), 4);
@@ -1409,9 +1411,11 @@ TEST_F(CopyInsertionTest, ParametersAliasing) {
   builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1}));
   module->AddEntryComputation(builder.Build());
   ASSERT_IS_OK(module->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0}));
+      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
   ASSERT_IS_OK(module->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{1}));
+      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{1},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
   InsertCopies(module.get());
 
   EXPECT_EQ(CountCopies(*module), 0);
@@ -1475,7 +1479,8 @@ TEST_F(CopyInsertionTest, ParameterWithPartialAliasing) {
   builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1}));
   module->AddEntryComputation(builder.Build());
   ASSERT_IS_OK(module->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0}));
+      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
   InsertCopies(module.get());
 
   EXPECT_THAT(module->entry_computation()->root_instruction(),
@@ -1516,7 +1521,8 @@ TEST_F(CopyInsertionTest, ParameterAndParallelOpsWithPartialAliasing) {
   builder.AddInstruction(HloInstruction::CreateTuple({negate0, negate1}));
   module->AddEntryComputation(builder.Build());
   ASSERT_IS_OK(module->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0}));
+      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
   InsertCopies(module.get());
 
   EXPECT_EQ(CountCopies(*module), 0);
@@ -1557,7 +1563,8 @@ TEST_F(CopyInsertionTest, ParameterAndOpsWithPartialAliasing) {
   builder.AddInstruction(HloInstruction::CreateTuple({add, negate1}));
   module->AddEntryComputation(builder.Build());
   ASSERT_IS_OK(module->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0}));
+      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
   InsertCopies(module.get());
 
   EXPECT_EQ(CountCopies(*module), 0);
@@ -1848,8 +1855,7 @@ ENTRY %TokensShouldNotBeCopied () -> s32[] {
 }
 )";
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          HloRunner::CreateModuleFromString(
-                              module_string, GetDebugOptionsForTest()));
+                          ParseAndReturnVerifiedModule(module_string));
   InsertCopies(module.get());
 
   // There should be no copies added because tokens should not be copied.
@@ -2112,8 +2118,7 @@ ENTRY TestComputation {
   ROOT while.3 = (s32[], s32[], s32[], s32[], s32[]) while(arg_tuple.6), condition=cond_wrapper.v3.2, body=_functionalize_body_2__.v25
 }
 )";
-  auto module_or_status =
-      HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest());
+  auto module_or_status = ParseAndReturnVerifiedModule(hlo_string);
   auto module = module_or_status.ConsumeValueOrDie();
   InsertCopies(module.get());
 }
@@ -2213,8 +2218,7 @@ ENTRY TestComputation {
   ROOT while.3 = (s32[], s32[], s32[], s32[], s32[]) while(arg_tuple.6), condition=cond_wrapper.v3.2, body=_functionalize_body_2__.v25
 }
 )";
-  auto module_or_status =
-      HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest());
+  auto module_or_status = ParseAndReturnVerifiedModule(hlo_string);
   auto module = module_or_status.ConsumeValueOrDie();
   InsertCopies(module.get());
 }
@@ -2231,7 +2235,7 @@ cond.inner {
 
 body.inner {
   param.body.inner = pred[] parameter(0)
-  ROOT neg = pred[] negate(param.body.inner)
+  ROOT not = pred[] not(param.body.inner)
 }
 
 cond.outer {
@@ -2248,9 +2252,8 @@ ENTRY TestComputation {
   ROOT while = pred[] while(entry_param), condition=cond.outer, body=body.outer
 }
 )";
-  TF_ASSERT_OK_AND_ASSIGN(
-      std::unique_ptr<HloModule> module,
-      HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest()));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(hlo_string));
   InsertCopies(module.get());
 
   // There should only be a single copy inserted, and it's in the entry
diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index ce4c2a9cc69240b9565b35a3f2504d7fc9373917..42672bc3875af2d732d80691df6bf85b9d8080cd 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -1,6 +1,14 @@
 # Description:
 #    LLVM-based CPU backend for XLA.
 
+load("//tensorflow/compiler/xla:xla.bzl", "ORC_JIT_MEMORY_MAPPER_TARGETS")
+load(
+    "//third_party/mkl:build_defs.bzl",
+    "mkl_deps",
+)
+load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test")
+load(":build_defs.bzl", "runtime_copts")
+
 licenses(["notice"])  # Apache 2.0
 
 package(
@@ -14,15 +22,6 @@ package_group(
     ],
 )
 
-load(":build_defs.bzl", "runtime_copts")
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
-load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
-load("//tensorflow/compiler/xla:xla.bzl", "ORC_JIT_MEMORY_MAPPER_TARGETS")
-load(
-    "//third_party/mkl:build_defs.bzl",
-    "mkl_deps",
-)
-
 # Filegroup used to collect source files for dependency checking.
 filegroup(
     name = "c_srcs",
@@ -95,6 +94,7 @@ cc_library(
         ":target_machine_features",
         "@com_google_absl//absl/types:span",
         "//tensorflow/compiler/tf2xla:cpu_function_runtime",
+        "//tensorflow/compiler/xla/service:hlo_casting_utils",
         "//tensorflow/compiler/xla/service:map_inliner",
         "//tensorflow/compiler/xla/service:hlo_get_dimension_size_rewriter",
         "//tensorflow/compiler/xla/service:scatter_expander",
@@ -112,8 +112,9 @@ cc_library(
         "//tensorflow/compiler/xla/service:buffer_liveness",
         "//tensorflow/compiler/xla/service:call_inliner",
         "//tensorflow/compiler/xla/service:conditional_simplifier",
-        "//tensorflow/compiler/xla/service:convolution_feature_group_converter",
+        "//tensorflow/compiler/xla/service:convolution_group_converter",
         "//tensorflow/compiler/xla/service:dot_decomposer",
+        "//tensorflow/compiler/xla/service:dynamic_index_splitter",
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service:flatten_call_graph",
         "//tensorflow/compiler/xla/service:hlo",
@@ -133,7 +134,9 @@ cc_library(
         "//tensorflow/compiler/xla/service:llvm_compiler",
         "//tensorflow/compiler/xla/service:reduce_precision_insertion",
         "//tensorflow/compiler/xla/service:reshape_mover",
+        "//tensorflow/compiler/xla/service:sort_simplifier",
         "//tensorflow/compiler/xla/service:transpose_folding",
+        "//tensorflow/compiler/xla/service:triangular_solve_expander",
         "//tensorflow/compiler/xla/service:tuple_simplifier",
         "//tensorflow/compiler/xla/service:while_loop_constant_sinking",
         "//tensorflow/compiler/xla/service:while_loop_invariant_code_motion",
@@ -241,6 +244,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:tuple_points_to_analysis",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/stream_executor/host:host_stream",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/types:span",
@@ -364,15 +368,33 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "tiled_dot_emitter",
+    srcs = ["tiled_dot_emitter.cc"],
+    hdrs = ["tiled_dot_emitter.h"],
+    deps = [
+        ":vector_support_library",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service/llvm_ir:kernel_support_library",
+        "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
+        "//tensorflow/core:lib",
+        "@llvm//:core",
+    ],
+)
+
 cc_library(
     name = "dot_op_emitter",
     srcs = ["dot_op_emitter.cc"],
-    hdrs = ["dot_op_emitter.h"],
+    hdrs = [
+        "dot_op_emitter.h",
+    ],
     deps = [
         ":cpu_options",
         ":cpu_runtime",
         ":ir_emission_utils",
         ":target_machine_features",
+        ":tiled_dot_emitter",
         ":vector_support_library",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -380,6 +402,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_casting_utils",
         "//tensorflow/compiler/xla/service:hlo_module_config",
         "//tensorflow/compiler/xla/service/llvm_ir:ir_array",
         "//tensorflow/compiler/xla/service/llvm_ir:kernel_support_library",
@@ -572,6 +595,7 @@ cc_library(
         ":runtime_matvec",
         "//tensorflow/compiler/xla:executable_run_options",
         "//tensorflow/core:framework_lite",
+        "//tensorflow/core/kernels:eigen_contraction_kernel",
         "//third_party/eigen3",
     ],
 )
@@ -630,6 +654,7 @@ cc_library(
     deps = [
         ":runtime_matvec",
         "//tensorflow/core:framework_lite",
+        "//tensorflow/core/kernels:eigen_contraction_kernel",
         "//third_party/eigen3",
     ],
 )
@@ -1005,7 +1030,6 @@ tf_cc_test(
     size = "small",
     srcs = ["cpu_eigen_tensor_alignment_test.cc"],
     deps = [
-        ":dot_op_emitter",
         ":ir_emission_utils",
         ":target_machine_features_fake",
         "//tensorflow/compiler/xla:test",
diff --git a/tensorflow/compiler/xla/service/cpu/build_defs.bzl b/tensorflow/compiler/xla/service/cpu/build_defs.bzl
index e78330b21689fdd818cd97128bbcaaa9e0118602..ffa1cd4ec8e26e7dbe92e7b99cf65e99db5400b9 100644
--- a/tensorflow/compiler/xla/service/cpu/build_defs.bzl
+++ b/tensorflow/compiler/xla/service/cpu/build_defs.bzl
@@ -1,12 +1,11 @@
 """build_defs for service/cpu."""
 
-
 def runtime_copts():
-  """Returns copts used for CPU runtime libraries."""
-  return (["-DEIGEN_AVOID_STL_ARRAY"] + select({
-      "//tensorflow:android_arm": ["-mfpu=neon"],
-      "//conditions:default": []
-  }) + select({
-      "//tensorflow:android": ["-O2"],
-      "//conditions:default": []
-  }))
+    """Returns copts used for CPU runtime libraries."""
+    return (["-DEIGEN_AVOID_STL_ARRAY"] + select({
+        "//tensorflow:android_arm": ["-mfpu=neon"],
+        "//conditions:default": [],
+    }) + select({
+        "//tensorflow:android": ["-O2"],
+        "//conditions:default": [],
+    }))
diff --git a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
index 796a7cf94d02b0ad42366387a9d3f8d589b8840a..414eacddfc7ba3c295c027c64c445a2046235d36 100644
--- a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
+++ b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
@@ -66,9 +66,14 @@ class FilteredPassManager : public llvm::legacy::PassManager {
   explicit FilteredPassManager(bool disable_expensive_passes)
       : disable_expensive_passes_(disable_expensive_passes) {}
   void add(llvm::Pass* p) override {
+    llvm::StringRef PassName = p->getPassName();
+    if (PassName.contains("Warn about non-applied transformations")) {
+      delete p;
+      return;
+    }
     if (disable_expensive_passes_) {
-      llvm::StringRef PassName = p->getPassName();
       if (PassName.contains("Unroll loops")) {
+        delete p;
         return;
       }
     }
diff --git a/tensorflow/compiler/xla/service/cpu/conv_canonicalization.cc b/tensorflow/compiler/xla/service/cpu/conv_canonicalization.cc
index 2d9978404cc9ec1e40fc61aaf794a8f1f06050bb..8e55267a67d330e7e721f9b5fb25451357a49a9d 100644
--- a/tensorflow/compiler/xla/service/cpu/conv_canonicalization.cc
+++ b/tensorflow/compiler/xla/service/cpu/conv_canonicalization.cc
@@ -132,7 +132,8 @@ StatusOr<bool> ConvCanonicalization::Run(HloModule* module) {
       HloInstruction* new_conv = module->entry_computation()->AddInstruction(
           HloInstruction::CreateConvolve(
               new_conv_shape, new_input, new_kernel, hlo->feature_group_count(),
-              hlo->window(), new_dnums, hlo->precision_config()));
+              hlo->batch_group_count(), hlo->window(), new_dnums,
+              hlo->precision_config()));
 
       // Reshape the output back to the shape of the original convolution.
       TF_RETURN_IF_ERROR(module->entry_computation()->ReplaceWithNewInstruction(
diff --git a/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc b/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc
index c58175428fea6a2d38253c35de598b99a4281bf1..02085108a081358cd4f8aed6dc12557cbd8eea85 100644
--- a/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc
@@ -84,8 +84,8 @@ TEST_F(ConvCanonicalizationTest, NonCanonicalToCanonical) {
   builder.AddInstruction(HloInstruction::CreateConvolve(
       ShapeUtil::MakeShape(
           F32, {kOutputFeatureCount, kBatchSize, output_size, output_size}),
-      input, kernel, /*feature_group_count=*/1, conv_window_, dnums,
-      DefaultPrecisionConfig(2)));
+      input, kernel, /*feature_group_count=*/1, /*batch_group_count=*/1,
+      conv_window_, dnums, DefaultPrecisionConfig(2)));
 
   auto module = CreateNewVerifiedModule();
   HloComputation* entry_computation =
@@ -147,8 +147,8 @@ TEST_F(ConvCanonicalizationTest, CanonicalStaysTheSame) {
   builder.AddInstruction(HloInstruction::CreateConvolve(
       ShapeUtil::MakeShape(
           F32, {kBatchSize, output_size, output_size, kOutputFeatureCount}),
-      input, kernel, /*feature_group_count=*/1, conv_window_, dnums,
-      DefaultPrecisionConfig(2)));
+      input, kernel, /*feature_group_count=*/1, /*batch_group_count=*/1,
+      conv_window_, dnums, DefaultPrecisionConfig(2)));
 
   auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index 6374822c81bf42fd12829f57cf93c19457128219..19ab3bddb567afeeddb7c01b9a847b51bea5d957 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -51,7 +51,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/buffer_liveness.h"
 #include "tensorflow/compiler/xla/service/call_inliner.h"
 #include "tensorflow/compiler/xla/service/conditional_simplifier.h"
-#include "tensorflow/compiler/xla/service/convolution_feature_group_converter.h"
+#include "tensorflow/compiler/xla/service/convolution_group_converter.h"
 #include "tensorflow/compiler/xla/service/cpu/buffer_info_util.h"
 #include "tensorflow/compiler/xla/service/cpu/compiler_functor.h"
 #include "tensorflow/compiler/xla/service/cpu/conv_canonicalization.h"
@@ -69,6 +69,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/cpu/simple_orc_jit.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/dot_decomposer.h"
+#include "tensorflow/compiler/xla/service/dynamic_index_splitter.h"
 #include "tensorflow/compiler/xla/service/flatten_call_graph.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
@@ -92,7 +93,9 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/reduce_precision_insertion.h"
 #include "tensorflow/compiler/xla/service/reshape_mover.h"
 #include "tensorflow/compiler/xla/service/scatter_expander.h"
+#include "tensorflow/compiler/xla/service/sort_simplifier.h"
 #include "tensorflow/compiler/xla/service/transpose_folding.h"
+#include "tensorflow/compiler/xla/service/triangular_solve_expander.h"
 #include "tensorflow/compiler/xla/service/tuple_simplifier.h"
 #include "tensorflow/compiler/xla/service/while_loop_constant_sinking.h"
 #include "tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h"
@@ -103,6 +106,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/dynamic_annotations.h"
 
 namespace xla {
 namespace cpu {
@@ -244,6 +248,7 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn(
   HloPassPipeline pipeline("HLO passes through layout assignment");
   pipeline.AddInvariantChecker<HloVerifier>(/*layout_sensitive=*/false,
                                             /*allow_mixed_precision=*/false);
+  pipeline.AddPass<DynamicIndexSplitter>();
   pipeline.AddPass<CpuHloSupportChecker>();
 
   ReducePrecisionInsertion::AddPasses(
@@ -252,12 +257,23 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn(
 
   pipeline.AddPass<MapInliner>();
 
+  pipeline.AddPass<TriangularSolveExpander>();
+
   // TODO(b/65775800): Fix wrong output bug in Call and remove the CallInliner
   // pass.
   pipeline.AddPass<CallInliner>();
   pipeline.AddPass<BatchDotSimplification>();
-  pipeline.AddPass<DotDecomposer>();
-  pipeline.AddPass<ConvolutionFeatureGroupConverter>();
+  pipeline.AddPass<DotDecomposer>(/*decompose_batch_dot=*/false);
+  auto cost_model = [](HloInstruction* conv) {
+    // We need a cost model for CPUs. Currently, do nothing.
+    return false;
+  };
+  pipeline.AddPass<ConvolutionGroupConverter>(
+      cost_model,
+      /*convert_batch_groups_only=*/true);
+  pipeline.AddPass<ConvolutionGroupConverter>(
+      cost_model,
+      /*convert_batch_groups_only=*/false);
   pipeline.AddPass<ConvCanonicalization>(target_machine_features);
   {
     auto& pass =
@@ -270,10 +286,10 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn(
         /*rewrite_inference_op=*/true,
         /*rewrite_grad_op=*/true);
     pipeline.AddPass<HloGetDimensionSizeRewriter>();
-    AlgebraicSimplifierOptions options(
-        [](const Shape&, const Shape&) { return false; });
+    AlgebraicSimplifierOptions options;
     options.set_enable_dot_strength_reduction(false);
     pass.AddPass<AlgebraicSimplifier>(options);
+    pass.AddPass<SortSimplifier>();
     pass.AddPass<HloDCE>();
 
     // BatchNormExpander can create zero-sized ops, so zero-sized HLO
@@ -293,7 +309,8 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn(
   pipeline.AddPass<TransposeFolding>(
       [&](const HloInstruction& dot,
           const TransposeFolding::OperandIndices& candidate_operands) {
-        return PotentiallyImplementedAsEigenDot(dot, *target_machine_features)
+        return DotImplementationCanHandleTranspose(dot,
+                                                   *target_machine_features)
                    ? candidate_operands
                    : TransposeFolding::OperandIndices{};
       },
@@ -336,8 +353,7 @@ Status CpuCompiler::RunHloPassesAfterLayoutAssn(
     pass.AddInvariantChecker<HloVerifier>(
         /*layout_sensitive=*/true,
         /*allow_mixed_precision=*/false);
-    AlgebraicSimplifierOptions options(
-        [](const Shape&, const Shape&) { return true; });
+    AlgebraicSimplifierOptions options;
     options.set_is_layout_sensitive(true);
     options.set_enable_dot_strength_reduction(false);
     pass.AddPass<HloPassFix<AlgebraicSimplifier>>(options);
@@ -497,7 +513,7 @@ Status CreateHloProfilingArtifacts(
 
   auto shape_size_bytes = [](const Shape& shape) {
     // On the cpu, opaques are pointers.
-    if (ShapeUtil::IsOpaque(shape)) {
+    if (shape.IsOpaque()) {
       return static_cast<int64>(sizeof(void*));
     }
     return ShapeUtil::ByteSizeOf(shape, sizeof(void*));
@@ -621,7 +637,13 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
   IrEmitter ir_emitter(*module, *assignment, llvm_module.get(),
                        std::move(instruction_to_profile_idx),
                        std::move(computation_to_profile_idx),
-                       &target_machine_features);
+                       &target_machine_features,
+#ifdef MEMORY_SANITIZER
+                       /*emit_code_for_msan=*/true
+#else
+                       /*emit_code_for_msan=*/false
+#endif
+  );
 
   TF_RETURN_IF_ERROR(ir_emitter.EmitConstantGlobals());
 
@@ -635,18 +657,17 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
             .EmitComputation(
                 embedded_computation, embedded_computation->name(),
                 /*is_top_level_computation=*/false,
-                &schedule.sequence(embedded_computation).instructions())
+                schedule.sequence(embedded_computation).instructions())
             .status());
   }
   string function_name_prefix = entry_computation->name().empty()
                                     ? "__compute"
                                     : entry_computation->name();
-  TF_ASSIGN_OR_RETURN(
-      llvm::Function * entry_function,
-      ir_emitter.EmitComputation(
-          entry_computation, function_name_prefix,
-          /*is_top_level_computation=*/true,
-          &schedule.sequence(entry_computation).instructions()));
+  TF_ASSIGN_OR_RETURN(llvm::Function * entry_function,
+                      ir_emitter.EmitComputation(
+                          entry_computation, function_name_prefix,
+                          /*is_top_level_computation=*/true,
+                          schedule.sequence(entry_computation).instructions()));
 
   string function_name = [&]() {
     llvm::SmallVector<char, 40> function_name_vector;
@@ -659,9 +680,9 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
   if (embed_ir_in_executable) {
     ir_module_string = llvm_ir::DumpModuleToString(*llvm_module);
   }
-  TF_RETURN_IF_ERROR(VerifyLlvmModule(*llvm_module));
 
   XLA_VLOG_LINES(2, "LLVM IR:\n" + llvm_ir::DumpModuleToString(*llvm_module));
+  TF_RETURN_IF_ERROR(VerifyLlvmModule(*llvm_module));
 
   // JIT compile the LLVM IR module to in-memory machine code.
   jit->AddModule(std::move(llvm_module));
@@ -820,7 +841,9 @@ CpuCompiler::CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
     IrEmitter ir_emitter(*module, *assignment, &llvm_module,
                          std::move(instruction_to_profile_idx),
                          std::move(computation_to_profile_idx),
-                         &target_machine_features);
+                         &target_machine_features,
+                         // TODO(b/66051036): Run full msan for AOT.
+                         /*emit_code_for_msan=*/false);
 
     TF_RETURN_IF_ERROR(ir_emitter.EmitConstantGlobals());
 
@@ -835,7 +858,7 @@ CpuCompiler::CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
               .EmitComputation(
                   embedded_computation, embedded_computation->name(),
                   /*is_top_level_computation=*/false,
-                  &schedule.sequence(embedded_computation).instructions())
+                  schedule.sequence(embedded_computation).instructions())
               .status());
     }
     const string& entry_point_name = options.entry_point_name();
@@ -843,7 +866,7 @@ CpuCompiler::CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
                         ir_emitter.EmitComputation(
                             computation, entry_point_name,
                             /*is_top_level_computation=*/true,
-                            &schedule.sequence(computation).instructions()));
+                            schedule.sequence(computation).instructions()));
 
     CHECK(entry_function->getName() == llvm_ir::AsStringRef(entry_point_name));
 
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_eigen_tensor_alignment_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_eigen_tensor_alignment_test.cc
index 8727c72b6e42517b1859e98ecadb41bbceed761c..485769a373acf5ae70c471b1a5dfcfb20ff772ef 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_eigen_tensor_alignment_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_eigen_tensor_alignment_test.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/service/cpu/dot_op_emitter.h"
 #include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/cpu/target_machine_features_fake.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
@@ -28,37 +27,6 @@ namespace {
 
 class CpuEigenTensorAlignmentTest : public ::testing::Test {};
 
-TEST_F(CpuEigenTensorAlignmentTest, EigenDotAlignment) {
-  string hlo_string = R"(
-HloModule DotOperation
-
-ENTRY DotOperation {
-  arg0 = f32[5,256] parameter(0)
-  arg1 = f32[256,1024] parameter(1)
-  ROOT dot = f32[5,1024] dot(arg0, arg1), lhs_contracting_dims={1}, rhs_contracting_dims={0}
-}
-)";
-
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          ParseHloString(hlo_string));
-
-  HloInstruction* dot = module->entry_computation()->root_instruction();
-
-  TargetMachineFeaturesWithFakeAlignmentLogic target_machine_with_no_alignment(
-      [](int64 size) { return 1; });
-
-  EXPECT_FALSE(
-      PotentiallyImplementedAsEigenDot(*dot, target_machine_with_no_alignment));
-
-  TargetMachineFeaturesWithFakeAlignmentLogic
-      target_machine_with_full_alignment([](int64 size) {
-        return TargetMachineFeatures::kEigenExpectedTensorAlignment;
-      });
-
-  EXPECT_TRUE(PotentiallyImplementedAsEigenDot(
-      *dot, target_machine_with_full_alignment));
-}
-
 TEST_F(CpuEigenTensorAlignmentTest, EigenConvAlignment) {
   string hlo_string = R"(
 HloModule ConvOperation
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
index 818b2b0d0db2893e11fa46c7867e6c74bbbb6905..23d0af34233858515af21df5e92346742a5b5dc3 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
@@ -213,6 +213,8 @@ StatusOr<ScopedShapedBuffer> CpuExecutable::CreateResultShapedBuffer(
       /*on_host_shape=*/result_shape(),
       /*on_device_shape=*/result_shape(), run_options->allocator(),
       stream->parent()->device_ordinal());
+  const HloInputOutputAliasConfig& input_output_alias =
+      module().input_output_alias_config();
 
   // Move OwningDeviceMemory values which contain the array(s) of the result
   // into the respective location in ScopedShapedBuffer which is returned to the
@@ -232,12 +234,31 @@ StatusOr<ScopedShapedBuffer> CpuExecutable::CreateResultShapedBuffer(
         TF_ASSIGN_OR_RETURN(
             const BufferAllocation::Slice slice,
             this->assignment_->GetUniqueSlice(src, buffer_source->index()));
-        CHECK(!slice.allocation()->is_entry_computation_parameter());
-
         const BufferAllocation::Index buffer_index = slice.index();
         OwningDeviceMemory& buffer = buffers[buffer_index];
-        CHECK(!buffer.is_null() || buffer.size() == 0);
-        *device_memory = buffer.Forget();
+        if (!slice.allocation()->is_entry_computation_parameter()) {
+          // If the buffer coming out of the result is from a parameter, the
+          // owning buffer will be null, and that means the caller aliased some
+          // parameter buffer to an output one (via the
+          // HloInputOutputAliasConfig API). If that is the case, the caller
+          // will receive a partially complete scoped shaped buffer, which they
+          // will have to fill up on return. Unfortunately the interface to the
+          // execute APIs are ShapedBuffer pointer based, which assumes caller
+          // ownership, and hence a buffer coming from there cannot be part of
+          // the new ScopedShapedBuffer we create for the result (which assumes
+          // ownership).
+          *device_memory = buffer.Forget();
+        } else {
+          auto output_alias = input_output_alias.GetAliasedOutput(
+              slice.allocation()->parameter_number(),
+              slice.allocation()->param_shape_index());
+          CHECK(output_alias)
+              << "Ouput buffer is coming from parameter "
+              << slice.allocation()->parameter_number() << " at index "
+              << slice.allocation()->param_shape_index()
+              << ", but no alias exists";
+          CHECK_EQ(*output_alias, index);
+        }
         return Status::OK();
       }));
   return std::move(result_buffer);
@@ -326,7 +347,7 @@ StatusOr<ScopedShapedBuffer> CpuExecutable::ExecuteAsyncOnStreamImpl(
 
 /*static*/ int64 CpuExecutable::ShapeSizeBytes(const Shape& shape) {
   // On the cpu, opaques are pointers.
-  if (ShapeUtil::IsOpaque(shape)) {
+  if (shape.IsOpaque()) {
     return sizeof(void*);
   }
   return ShapeUtil::ByteSizeOf(shape, sizeof(void*));
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.cc b/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.cc
index 7fbe0fa157c57eb0c274662a1de95cf5328ccfa8..4ac61f44d9f38425da2d1fc6b9495cb4deba5047 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_hlo_support_checker.cc
@@ -17,7 +17,6 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
index 527df0bd1c23bba74f32226e5622fed32f7dcf84..c4bde837e57e82584c2a007858ed8d55608acd3c 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
@@ -332,7 +332,7 @@ TEST_F(OpcodeFusionTest, Exponential_Reshape_Negate) {
 TEST_F(OpcodeFusionTest, Broadcast_Reshape_DynamicSlice_Tanh) {
   HloComputation::Builder builder(TestName());
   Shape param_shape = ShapeUtil::MakeShape(F32, {8});
-  Shape starts_shape = ShapeUtil::MakeShape(F32, {2});
+  Shape starts_shape = ShapeUtil::MakeShape(F32, {});
   Shape broadcast_shape = ShapeUtil::MakeShape(F32, {1, 8, 8});
   Shape reshape_shape = ShapeUtil::MakeShape(F32, {8, 8});
   Shape dynamic_slice_shape = ShapeUtil::MakeShape(F32, {4, 4});
@@ -340,13 +340,15 @@ TEST_F(OpcodeFusionTest, Broadcast_Reshape_DynamicSlice_Tanh) {
       HloInstruction::CreateParameter(0, param_shape, "param"));
   HloInstruction* param1 = builder.AddInstruction(
       HloInstruction::CreateParameter(1, starts_shape, "starts"));
+  HloInstruction* param2 = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, starts_shape, "starts"));
   HloInstruction* broadcast2 = builder.AddInstruction(
       HloInstruction::CreateBroadcast(broadcast_shape, param0, {1}));
   HloInstruction* reshape3 = builder.AddInstruction(
       HloInstruction::CreateReshape(reshape_shape, broadcast2));
   HloInstruction* dynamic_slice4 =
       builder.AddInstruction(HloInstruction::CreateDynamicSlice(
-          dynamic_slice_shape, reshape3, param1, {4, 4}));
+          dynamic_slice_shape, reshape3, {param1, param2}, {4, 4}));
   builder.AddInstruction(HloInstruction::CreateUnary(
       dynamic_slice_shape, HloOpcode::kTanh, dynamic_slice4));
 
@@ -356,7 +358,8 @@ TEST_F(OpcodeFusionTest, Broadcast_Reshape_DynamicSlice_Tanh) {
   RunFusionAndCheckOpcodesWereFused(
       module.get(),
       {HloOpcode::kTanh, HloOpcode::kDynamicSlice, HloOpcode::kReshape,
-       HloOpcode::kBroadcast, HloOpcode::kParameter, HloOpcode::kParameter});
+       HloOpcode::kBroadcast, HloOpcode::kParameter, HloOpcode::kParameter,
+       HloOpcode::kParameter});
 }
 
 TEST_F(OpcodeFusionTest, Broadcast_Negate) {
@@ -381,14 +384,14 @@ TEST_F(OpcodeFusionTest, Broadcast_Negate) {
 TEST_F(OpcodeFusionTest, DynamicSlice_Negate) {
   HloComputation::Builder builder(TestName());
   Shape param_shape = ShapeUtil::MakeShape(F32, {4});
-  Shape slice_shape = ShapeUtil::MakeShape(F32, {1});
+  Shape slice_shape = ShapeUtil::MakeShape(F32, {});
   Shape result_shape = ShapeUtil::MakeShape(F32, {2});
   HloInstruction* param0 = builder.AddInstruction(
       HloInstruction::CreateParameter(0, param_shape, "param"));
   HloInstruction* param1 = builder.AddInstruction(
       HloInstruction::CreateParameter(1, slice_shape, "starts"));
   HloInstruction* dynamic_slice2 = builder.AddInstruction(
-      HloInstruction::CreateDynamicSlice(result_shape, param0, param1, {2}));
+      HloInstruction::CreateDynamicSlice(result_shape, param0, {param1}, {2}));
   builder.AddInstruction(HloInstruction::CreateUnary(
       result_shape, HloOpcode::kNegate, dynamic_slice2));
 
@@ -548,28 +551,36 @@ TEST_F(OpcodeFusionTest, DynamicSliceWithDynamicUpdateSlice) {
   Shape full_shape = ShapeUtil::MakeShape(F32, {10, 100, 1000});
   Shape slice_shape = ShapeUtil::MakeShape(F32, {10, 1, 1000});
 
+  std::vector<HloInstruction*> slice_indices, update_indices;
+  for (int i = 0; i < 3; ++i) {
+    slice_indices.push_back(
+        builder.AddInstruction(HloInstruction::CreateParameter(
+            1 + i, ShapeUtil::MakeShape(U32, {}), "slice_indices")));
+    update_indices.push_back(
+        builder.AddInstruction(HloInstruction::CreateParameter(
+            5 + i, ShapeUtil::MakeShape(U32, {}), "update_indices")));
+  }
   HloInstruction* slice =
       builder.AddInstruction(HloInstruction::CreateDynamicSlice(
           slice_shape,
           builder.AddInstruction(
               HloInstruction::CreateParameter(0, full_shape, "slice_from")),
-          builder.AddInstruction(HloInstruction::CreateParameter(
-              1, ShapeUtil::MakeShape(U32, {3}), "slice_indices")),
+          slice_indices,
           /*slice_sizes=*/{10, 1, 1000}));
 
   builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
       full_shape,
       builder.AddInstruction(
-          HloInstruction::CreateParameter(2, full_shape, "to_update")),
-      slice,
-      builder.AddInstruction(HloInstruction::CreateParameter(
-          3, ShapeUtil::MakeShape(U32, {3}), "update_indices"))));
+          HloInstruction::CreateParameter(4, full_shape, "to_update")),
+      slice, update_indices));
 
   module->AddEntryComputation(builder.Build());
   RunFusionAndCheckOpcodesWereFused(
-      module.get(), {HloOpcode::kDynamicSlice, HloOpcode::kDynamicUpdateSlice,
-                     HloOpcode::kParameter, HloOpcode::kParameter,
-                     HloOpcode::kParameter, HloOpcode::kParameter});
+      module.get(),
+      {HloOpcode::kDynamicSlice, HloOpcode::kDynamicUpdateSlice,
+       HloOpcode::kParameter, HloOpcode::kParameter, HloOpcode::kParameter,
+       HloOpcode::kParameter, HloOpcode::kParameter, HloOpcode::kParameter,
+       HloOpcode::kParameter, HloOpcode::kParameter});
 }
 
 TEST_F(OpcodeFusionTest, MessOfFusibleNodes) {
@@ -578,49 +589,40 @@ TEST_F(OpcodeFusionTest, MessOfFusibleNodes) {
 
   Shape full_shape = ShapeUtil::MakeShape(F32, {4, 100, 10, 100, 50});
 
-  auto loop_idx = builder.AddInstruction(HloInstruction::CreateReshape(
-      ShapeUtil::MakeShape(S32, {1}),
-      builder.AddInstruction(HloInstruction::CreateParameter(
-          0, ShapeUtil::MakeShape(S32, {}), "param0"))));
-
+  auto loop_idx = builder.AddInstruction(HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeShape(S32, {}), "param0"));
   auto param1 = builder.AddInstruction(HloInstruction::CreateParameter(
-      1, ShapeUtil::MakeShape(S32, {1}), "param1"));
-  auto concat = builder.AddInstruction(HloInstruction::CreateConcatenate(
-      ShapeUtil::MakeShape(S32, {5}),
-      {loop_idx, param1, param1, param1, param1}, /*dimension=*/0));
+      1, ShapeUtil::MakeShape(S32, {}), "param1"));
 
-  auto idx_choice = builder.AddInstruction(HloInstruction::CreateDynamicSlice(
-      ShapeUtil::MakeShape(S32, {1}),
-      builder.AddInstruction(HloInstruction::CreateParameter(
-          2, ShapeUtil::MakeShape(S32, {4}), "param2")),
-      loop_idx,
-      /*slice_sizes=*/{1}));
-
-  PaddingConfig padding_config;
-  padding_config.add_dimensions()->set_edge_padding_high(4);
-  auto pad = builder.AddInstruction(HloInstruction::CreatePad(
-      ShapeUtil::MakeShape(S32, {5}), idx_choice,
-      builder.AddInstruction(
-          HloInstruction::CreateConstant(LiteralUtil::CreateR0(0))),
-      padding_config));
+  auto idx_choice = builder.AddInstruction(HloInstruction::CreateReshape(
+      ShapeUtil::MakeShape(S32, {}),
+      builder.AddInstruction(HloInstruction::CreateDynamicSlice(
+          ShapeUtil::MakeShape(S32, {1}),
+          builder.AddInstruction(HloInstruction::CreateParameter(
+              2, ShapeUtil::MakeShape(S32, {4}), "param2")),
+          {loop_idx},
+          /*slice_sizes=*/{1}))));
+  auto zero = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0(0)));
 
   auto slice = builder.AddInstruction(HloInstruction::CreateDynamicSlice(
       ShapeUtil::MakeShape(F32, {1, 100, 10, 100, 50}),
       builder.AddInstruction(HloInstruction::CreateParameter(
           3, ShapeUtil::MakeShape(F32, {100, 100, 10, 100, 50}), "param3")),
-      pad, /*slice_sizes=*/{1, 100, 10, 100, 50}));
+      {idx_choice, zero, zero, zero, zero},
+      /*slice_sizes=*/{1, 100, 10, 100, 50}));
 
   builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
       full_shape,
       builder.AddInstruction(
           HloInstruction::CreateParameter(4, full_shape, "param4")),
-      slice, concat));
+      slice, {loop_idx, param1, param1, param1, param1}));
 
   module->AddEntryComputation(builder.Build());
   RunFusionAndCheckOpcodesWereFused(
       module.get(),
-      {HloOpcode::kConcatenate, HloOpcode::kPad, HloOpcode::kDynamicSlice,
-       HloOpcode::kDynamicSlice, HloOpcode::kDynamicUpdateSlice,
+      {HloOpcode::kDynamicSlice, HloOpcode::kDynamicSlice,
+       HloOpcode::kDynamicUpdateSlice, HloOpcode::kReshape,
        HloOpcode::kParameter, HloOpcode::kParameter, HloOpcode::kParameter,
        HloOpcode::kParameter, HloOpcode::kParameter, HloOpcode::kParameter});
 }
@@ -930,9 +932,10 @@ ENTRY main {
   return result;
 }
 
-INSTANTIATE_TEST_CASE_P(GatherLoopFusionTestInstantiation, GatherLoopFusionTest,
-                        ::testing::ValuesIn(GetGatherLoopFusionTestSpecs()),
-                        GatherLoopFusionTestSpec::Name);
+INSTANTIATE_TEST_SUITE_P(GatherLoopFusionTestInstantiation,
+                         GatherLoopFusionTest,
+                         ::testing::ValuesIn(GetGatherLoopFusionTestSpecs()),
+                         GatherLoopFusionTestSpec::Name);
 }  // namespace
 }  // namespace cpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc
index c291bf2d1ba2eaff4192051840768c037bece86f..95b8025f873c56bea063ff258d4abd6614257d85 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc
@@ -46,8 +46,7 @@ static bool ShouldMakeAllUsersColMajor(const HloInstruction* instruction) {
   for (auto* user : instruction->users()) {
     optional<int64> operand_idx = ProfitableToMakeDotOperandColumnMajor(*user);
     if (!operand_idx || user->operand(*operand_idx) != instruction ||
-        std::count(user->operands().begin(), user->operands().end(),
-                   instruction) != 1) {
+        absl::c_count(user->operands(), instruction) != 1) {
       return false;
     }
   }
@@ -94,60 +93,38 @@ static Shape ColMajorShape(const Shape& old_shape) {
   return new_shape;
 }
 
+static bool OperandsAndResultMustHaveRowMajorLayout(
+    const HloInstruction& instr,
+    const TargetMachineFeatures& target_machine_features) {
+  if (instr.opcode() == HloOpcode::kConvolution) {
+    return PotentiallyImplementedAsEigenConvolution(instr,
+                                                    target_machine_features);
+  } else if (instr.opcode() == HloOpcode::kDot) {
+    return DotOperandsAndResultMustHaveRowMajorLayout(instr,
+                                                      target_machine_features);
+  }
+  return false;
+}
+
 Status CpuLayoutAssignment::AddBackendConstraints(
     LayoutConstraints* constraints) {
   ShouldMakeOperandColMajorCache cache;
 
   const HloComputation* computation = constraints->computation();
   for (auto* instruction : computation->instructions()) {
-    if (instruction->opcode() == HloOpcode::kConvolution &&
-        PotentiallyImplementedAsEigenConvolution(*instruction,
-                                                 target_machine_features_)) {
-      const HloInstruction* convolution = instruction;
-      const HloInstruction* lhs_instruction = convolution->operand(0);
-      const HloInstruction* rhs_instruction = convolution->operand(1);
-
-      // In order to implement `convolution` with Eigen convolution, the layouts
-      // of the input, filter, and output need to be row-major.
-      //
-      // These constraints are not hard constraints. Ideally, we should decide
-      // which layouts to choose according to some cost model.
-      Shape output_shape(RowMajorShape(convolution->shape()));
-      Shape input_shape(RowMajorShape(lhs_instruction->shape()));
-      Shape filter_shape(RowMajorShape(rhs_instruction->shape()));
-
-      // Set layouts of the instructions' shapes.
-      TF_RETURN_IF_ERROR(
-          constraints->SetOperandLayout(input_shape, convolution, 0));
-      TF_RETURN_IF_ERROR(
-          constraints->SetOperandLayout(filter_shape, convolution, 1));
-      TF_RETURN_IF_ERROR(
-          constraints->SetInstructionLayout(output_shape, convolution));
+    if (OperandsAndResultMustHaveRowMajorLayout(*instruction,
+                                                target_machine_features_)) {
+      TF_RETURN_IF_ERROR(constraints->SetInstructionLayout(
+          RowMajorShape(instruction->shape()), instruction));
+      for (int i = 0; i < instruction->operand_count(); i++) {
+        TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
+            RowMajorShape(instruction->operand(i)->shape()), instruction, i));
+      }
     } else if (optional<int64> op_idx =
                    ShouldMakeOperandColumnMajor(&cache, *instruction)) {
       const HloInstruction* op = instruction->operand(*op_idx);
       TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
           ColMajorShape(op->shape()), instruction, *op_idx));
-    } else if (PotentiallyImplementedAsEigenDot(*instruction,
-                                                target_machine_features_)) {
-      const HloInstruction* dot = instruction;
-      // In order to implement `dot` with Eigen dot, the layouts of the lhs,
-      // rhs, and output need to be row-major.
-      //
-      // These constraints are not hard constraints. Ideally, we should decide
-      // which layouts to choose according to some cost model.
-      Shape output_shape(RowMajorShape(dot->shape()));
-
-      const HloInstruction* lhs_instruction = dot->operand(0);
-      Shape lhs_shape(RowMajorShape(lhs_instruction->shape()));
-      TF_RETURN_IF_ERROR(constraints->SetOperandLayout(lhs_shape, dot, 0));
-
-      const HloInstruction* rhs_instruction = dot->operand(1);
-      Shape rhs_shape(RowMajorShape(rhs_instruction->shape()));
-      TF_RETURN_IF_ERROR(constraints->SetOperandLayout(rhs_shape, dot, 1));
-
-      // Set layouts of the instructions' shapes.
-      TF_RETURN_IF_ERROR(constraints->SetInstructionLayout(output_shape, dot));
     } else {
       for (int64 operand_no = 0; operand_no < instruction->operand_count();
            ++operand_no) {
@@ -160,7 +137,7 @@ Status CpuLayoutAssignment::AddBackendConstraints(
           continue;
         }
         // Skip operands with non-array shapes.
-        if (!ShapeUtil::IsArray(instruction->operand(operand_no)->shape())) {
+        if (!instruction->operand(operand_no)->shape().IsArray()) {
           continue;
         }
         Shape operand_shape(
@@ -175,7 +152,7 @@ Status CpuLayoutAssignment::AddBackendConstraints(
       }
       // Skip instructions which don't produce array shapes (tuples, opaque,
       // etc.).
-      if (!ShapeUtil::IsArray(instruction->shape())) {
+      if (!instruction->shape().IsArray()) {
         continue;
       }
     }
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_options.cc b/tensorflow/compiler/xla/service/cpu/cpu_options.cc
index 92debb83e33b1400a59e5eef0f90971392ab7b22..ff654c83d61e7cc09ac7839feccaf2bc9cb3c63c 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_options.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_options.cc
@@ -23,8 +23,8 @@ namespace {
 
 const char* const kXlaOptimizeForSizeCpuOption = "xla_cpu_optimize_for_size";
 const char* const kLlvmIrDotTilingFactor = "xla_llvm_dot_tiling_factor";
-const char* const kXlaEnableExperimentalLlvmIrGemm =
-    "xla_enable_experimental_llvm_ir_gemm";
+const char* const kXlaForceEnableExperimentalLlvmIrGemm =
+    "xla_force_enable_experimental_llvm_ir_gemm";
 const char* const kLlvmIrGemmTileSize = "xla_llvm_ir_gemm_tile_size";
 
 }  // namespace
@@ -57,10 +57,10 @@ absl::optional<int64> LlvmIrGemvTilingFactor(const HloModuleConfig& config) {
   return absl::nullopt;
 }
 
-bool EnableExperimentalLlvmIrGemm(const HloModuleConfig& config) {
+bool ForceEnableExperimentalLlvmIrGemm(const HloModuleConfig& config) {
   const auto& extra_options_map =
       config.debug_options().xla_backend_extra_options();
-  return extra_options_map.count(kXlaEnableExperimentalLlvmIrGemm) > 0;
+  return extra_options_map.count(kXlaForceEnableExperimentalLlvmIrGemm) > 0;
 }
 
 static absl::string_view RemoveSuffix(absl::string_view str,
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_options.h b/tensorflow/compiler/xla/service/cpu/cpu_options.h
index 47c7eb13b6e4cc05a23f82b8d2a25249f4b82ac0..99e6702d14aed8ffb148adec2bdd02dbc7c3c7e3 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_options.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_options.h
@@ -26,7 +26,7 @@ namespace options {
 
 bool OptimizeForSizeRequested(const HloModuleConfig& config);
 bool VectorizedReduceDisabled(const HloModuleConfig& config);
-bool EnableExperimentalLlvmIrGemm(const HloModuleConfig& config);
+bool ForceEnableExperimentalLlvmIrGemm(const HloModuleConfig& config);
 absl::optional<int64> LlvmIrGemvTilingFactor(const HloModuleConfig& config);
 absl::optional<std::tuple<int64, int64, int64>> LlvmIrGemmTileSize(
     const HloModuleConfig& config);
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
index a9febe891b5e9d1eb9e6b297952b50d1d26a3396..d8878e622c0500fc5328aa6c295a9e24a3a037f7 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
@@ -84,31 +84,8 @@ extern const char* const kReleaseOutfeedBufferAfterPopulationSymbolName =
     "__xla_cpu_runtime_ReleaseOutfeedBufferAfterPopulation";
 extern const char* const kParallelForkJoinSymbolName =
     "__xla_cpu_runtime_ParallelForkJoin";
-extern const char* const kKeyValueSortPREDSymbolName =
-    "__xla_cpu_runtime_KeyValueSortPRED";
-extern const char* const kKeyValueSortS8SymbolName =
-    "__xla_cpu_runtime_KeyValueSortS8";
-extern const char* const kKeyValueSortU8SymbolName =
-    "__xla_cpu_runtime_KeyValueSortU8";
-extern const char* const kKeyValueSortS16SymbolName =
-    "__xla_cpu_runtime_KeyValueSortS16";
-extern const char* const kKeyValueSortU16SymbolName =
-    "__xla_cpu_runtime_KeyValueSortU16";
-extern const char* const kKeyValueSortF16SymbolName =
-    "__xla_cpu_runtime_KeyValueSortF16";
-extern const char* const kKeyValueSortS32SymbolName =
-    "__xla_cpu_runtime_KeyValueSortS32";
-extern const char* const kKeyValueSortU32SymbolName =
-    "__xla_cpu_runtime_KeyValueSortU32";
-extern const char* const kKeyValueSortF32SymbolName =
-    "__xla_cpu_runtime_KeyValueSortF32";
-extern const char* const kKeyValueSortS64SymbolName =
-    "__xla_cpu_runtime_KeyValueSortS64";
-extern const char* const kKeyValueSortU64SymbolName =
-    "__xla_cpu_runtime_KeyValueSortU64";
-extern const char* const kKeyValueSortF64SymbolName =
-    "__xla_cpu_runtime_KeyValueSortF64";
-
+extern const char* const kKeyValueSortSymbolName =
+    "__xla_cpu_runtime_KeyValueSort";
 extern const char* const kXlaCpuRuntimeSymbolNamePrefix = "__xla_cpu_runtime_";
 }  // namespace runtime
 }  // namespace cpu
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h
index b2e760a224ad8eaa61dae57b0f9cece04a7e54ae..3a2b44d8c1a80128d3577c374e751e73a89e9d59 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h
@@ -64,18 +64,7 @@ extern const char* const kReleaseInfeedBufferAfterDequeueSymbolName;
 extern const char* const kAcquireOutfeedBufferForPopulationSymbolName;
 extern const char* const kReleaseOutfeedBufferAfterPopulationSymbolName;
 extern const char* const kParallelForkJoinSymbolName;
-extern const char* const kKeyValueSortPREDSymbolName;
-extern const char* const kKeyValueSortS8SymbolName;
-extern const char* const kKeyValueSortU8SymbolName;
-extern const char* const kKeyValueSortS16SymbolName;
-extern const char* const kKeyValueSortU16SymbolName;
-extern const char* const kKeyValueSortF16SymbolName;
-extern const char* const kKeyValueSortS32SymbolName;
-extern const char* const kKeyValueSortU32SymbolName;
-extern const char* const kKeyValueSortF32SymbolName;
-extern const char* const kKeyValueSortS64SymbolName;
-extern const char* const kKeyValueSortU64SymbolName;
-extern const char* const kKeyValueSortF64SymbolName;
+extern const char* const kKeyValueSortSymbolName;
 
 // All symbol names for XLA CPU runtime functions need to start with this
 // prefix.
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc
index 1ae3aa57111e3a3b7ac18b4907c5c282edf89b7e..4e8c98678309fa4d573f1aac1290c9afc87643a4 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc
@@ -162,11 +162,12 @@ TEST_P(EigenMatMulTest, DoIt) {
   CheckMatrixMultiply(*a, *b, *c);
 }
 
-INSTANTIATE_TEST_CASE_P(EigenMatMulTestInstantiaion, EigenMatMulTest,
-                        ::testing::Combine(::testing::ValuesIn(MatMulShapes),
-                                           ::testing::Bool(), ::testing::Bool(),
-                                           ::testing::Bool()),
-                        EigenMatMulTest::Name);
+INSTANTIATE_TEST_SUITE_P(EigenMatMulTestInstantiaion, EigenMatMulTest,
+                         ::testing::Combine(::testing::ValuesIn(MatMulShapes),
+                                            ::testing::Bool(),
+                                            ::testing::Bool(),
+                                            ::testing::Bool()),
+                         EigenMatMulTest::Name);
 
 #ifdef INTEL_MKL
 class MKLMatMulTest : public CpuRuntimeTest,
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc
index 1457582ac19c27e5c3150b4667e6af505345a6bd..fae9670051a654f38f09856368ffb700b0c7a085 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc
@@ -29,7 +29,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/platform/logging.h"
@@ -97,7 +96,7 @@ Status CpuTransferManager::TransferLiteralToInfeed(
   VLOG(2) << "Transferring literal to infeed with shape: "
           << ShapeUtil::HumanString(shape);
 
-  if (!ShapeUtil::IsTuple(shape)) {
+  if (!shape.IsTuple()) {
     int64 size = GetByteSizeRequirement(shape);
     return TransferBufferToInfeed(executor, size, literal.untyped_data());
   }
@@ -178,7 +177,7 @@ CpuTransferManager::TransferBufferToInfeedInternal(se::StreamExecutor* executor,
 Status CpuTransferManager::TransferLiteralFromOutfeed(
     se::StreamExecutor* executor, const Shape& literal_shape,
     MutableBorrowingLiteral literal) {
-  if (!ShapeUtil::IsTuple(literal_shape)) {
+  if (!literal_shape.IsTuple()) {
     int64 size = GetByteSizeRequirement(literal_shape);
     // Note: OSS build didn't like implicit conversion from
     // literal_shape.dimensions() to the array slice on 2017-07-10.
diff --git a/tensorflow/compiler/xla/service/cpu/disassembler.cc b/tensorflow/compiler/xla/service/cpu/disassembler.cc
index 3ae64142cd7e32d3aa8d50870efaf94698c06440..c3c6847b7b77e2fb0470630815de9f5d7a6c5b9c 100644
--- a/tensorflow/compiler/xla/service/cpu/disassembler.cc
+++ b/tensorflow/compiler/xla/service/cpu/disassembler.cc
@@ -77,17 +77,16 @@ StatusOr<DisassemblerResult> Disassembler::DisassembleObjectFile(
     }
 
     // Sort the symbols in increasing address order.
-    std::sort(
-        symbols.begin(), symbols.end(),
-        [](const llvm::object::SymbolRef& a, const llvm::object::SymbolRef& b) {
-          // getAddress returns a Expected object. Assert there is no error
-          // before extracting the address.
-          llvm::Expected<uint64_t> a_address_or_error = a.getAddress();
-          CHECK(a_address_or_error);
-          llvm::Expected<uint64_t> b_address_or_error = b.getAddress();
-          CHECK(b_address_or_error);
-          return a_address_or_error.get() < b_address_or_error.get();
-        });
+    absl::c_sort(symbols, [](const llvm::object::SymbolRef& a,
+                             const llvm::object::SymbolRef& b) {
+      // getAddress returns a Expected object. Assert there is no error
+      // before extracting the address.
+      llvm::Expected<uint64_t> a_address_or_error = a.getAddress();
+      CHECK(a_address_or_error);
+      llvm::Expected<uint64_t> b_address_or_error = b.getAddress();
+      CHECK(b_address_or_error);
+      return a_address_or_error.get() < b_address_or_error.get();
+    });
 
     // Construct ArrayRef pointing to section contents.
     llvm::StringRef section_content_string;
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
index 97f9b85a606e140fd7f3b1e3ecfb0dd5ba289f03..2bf22ec6e43ea9944935a4d0d5dcd22c5d190c17 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
@@ -26,7 +26,10 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h"
 #include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/cpu/target_machine_features.h"
+#include "tensorflow/compiler/xla/service/cpu/tiled_dot_emitter.h"
 #include "tensorflow/compiler/xla/service/cpu/vector_support_library.h"
+#include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
+#include "tensorflow/compiler/xla/service/hlo_instructions.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
@@ -41,932 +44,165 @@ namespace xla {
 using llvm_ir::SetToFirstInsertPoint;
 
 namespace cpu {
-
 namespace {
-// Provides tiled access to an in-memory rank 2 array.
-class MemoryTile {
- public:
-  // Constructs a MemoryTile that can operate on tiles consisting of
-  // `tile_size_along_major_dim` vectors from the matrix `matrix`, starting at
-  // `major_dim_offset` in the major dimension.  The tile size along the minor
-  // dimension is the vector size, and that is implicitly determined by `vsl`.
-  MemoryTile(VectorSupportLibrary* vsl, llvm::IRBuilder<>* b,
-             llvm::Value* matrix, int64 matrix_size_along_minor_dim,
-             llvm::Value* major_dim_offset, int64 tile_size_along_major_dim)
-      : vsl_(vsl), b_(b) {
-    pointers_.reserve(tile_size_along_major_dim);
-    for (int64 i = 0; i < tile_size_along_major_dim; i++) {
-      llvm::Value* total_offset =
-          b->CreateMul(b->getInt64(matrix_size_along_minor_dim),
-                       b->CreateAdd(b->getInt64(i), major_dim_offset));
-      pointers_.push_back(vsl_->ComputeOffsetPointer(matrix, total_offset));
-    }
-  }
-
-  // Load a tile consisting of `tile_size_along_major_dim` vectors from position
-  // {major: `major_dim_offset`, minor: `minor_dim_offset`}.
-  //
-  // Note: `major_dim_offset` is a parameter to the constructor.
-  std::vector<llvm::Value*> LoadTile(llvm::Value* minor_dim_offset) const {
-    std::vector<llvm::Value*> result;
-    result.reserve(pointers_.size());
-    for (const auto& pointer : pointers_) {
-      result.push_back(vsl_->LoadVector(pointer, minor_dim_offset));
-    }
-    return result;
-  }
-
-  // Stores `tile` to position {major: `major_dim_offset`, minor:
-  // `minor_dim_offset`}.
-  //
-  // Note: `major_dim_offset` is a parameter to the constructor.
-  void StoreTile(absl::Span<llvm::Value* const> tile,
-                 llvm::Value* minor_dim_offset) const {
-    CHECK_EQ(tile.size(), pointers_.size());
-    for (int64 i = 0; i < pointers_.size(); i++) {
-      vsl_->StoreVector(tile[i], pointers_[i], minor_dim_offset);
-    }
-  }
-
-  // Loads a tile of size [`tile_size_along_major_dim`,
-  // `tile_size_along_middle_dim`] from position {major: `major_dim_offset`,
-  // minor: `minor_dim_offset`} and then broadcasts each element into a vector
-  // of size vsl_.vector_size().  The (i,j)'th element of the return value is
-  // the (i,j)'th element in the tile broadcasted into an LLVM vector.
-  //
-  // Note: `major_dim_offset` is a parameter to the constructor.
-  std::vector<std::vector<llvm::Value*>> LoadBroadcastTile(
-      llvm::Value* minor_dim_offset, int64 tile_size_along_middle_dim) const {
-    std::vector<std::vector<llvm::Value*>> result;
-    result.resize(pointers_.size());
-    for (int64 i = 0; i < pointers_.size(); i++) {
-      for (int64 j = 0; j < tile_size_along_middle_dim; j++) {
-        result[i].push_back(vsl_->LoadBroadcast(
-            pointers_[i], b_->CreateAdd(minor_dim_offset, b_->getInt64(j))));
-      }
-    }
-    return result;
-  }
-
- private:
-  VectorSupportLibrary* vsl_;
-  llvm::IRBuilder<>* b_;
-  std::vector<llvm::Value*> pointers_;
-};
-
-// The base class for the classes representing the GEMV emitter configurations.
-//
-// The IR emitted (modulo the LLVM values representing the input and output
-// buffers) by the row major and column major GEMV emitters should be a function
-// of their configuration.  This is important because their configuration is
-// used as a key to cache the generated IR.
-class GemvConfig {
- public:
-  // Mixin for convenience.
-  template <typename T>
-  struct User {
-   public:
-    PrimitiveType scalar_type() const {
-      return derived().config().scalar_type();
-    }
-    int64 tile_rows() const { return derived().config().tile_rows(); }
-    int64 tile_cols() const { return derived().config().tile_cols(); }
-    int64 m() const { return derived().config().m(); }
-    int64 k() const { return derived().config().k(); }
-    int64 has_addend() const { return derived().config().has_addend(); }
-
-   private:
-    const T& derived() const { return *static_cast<const T*>(this); }
-  };
+// Returns true if we should call into multi-threaded Eigen routines.
+bool ShouldUseMultiThreadedEigen(const HloModuleConfig& config) {
+  return config.debug_options().xla_cpu_multi_thread_eigen();
+}
 
-  PrimitiveType scalar_type() const { return scalar_type_; }
-  int64 tile_rows() const { return tile_rows_; }
-  int64 tile_cols() const { return tile_cols_; }
-  int64 m() const { return m_; }
-  int64 k() const { return k_; }
-  bool has_addend() const { return has_addend_; }
-
-  string GetCacheKey() const {
-    return absl::StrCat(name_, "_", PrimitiveType_Name(scalar_type()), "_",
-                        tile_rows(), "_", tile_cols(), "_", m(), "_", k(),
-                        has_addend() ? "_with_addend" : "");
+// Represents a dot operation.  We use this in lieu of an `HloInstruction`
+// because we want to be able to create this for the "inner" dot operation in a
+// batch dot, for which there is no separate HLO instruction.
+struct DotInfo {
+  Shape lhs_shape;
+  Shape rhs_shape;
+  Shape result_shape;
+  DotDimensionNumbers dim_nums;
+
+  DotInfo() = default;
+
+  explicit DotInfo(const HloInstruction& instr) {
+    CHECK_EQ(instr.opcode(), HloOpcode::kDot);
+    lhs_shape = instr.operand(0)->shape();
+    rhs_shape = instr.operand(1)->shape();
+    result_shape = instr.shape();
+    dim_nums = instr.dot_dimension_numbers();
   }
-
- protected:
-  explicit GemvConfig(string name, PrimitiveType scalar_type, int64 tile_rows,
-                      int64 tile_cols, int64 m, int64 k, bool has_addend)
-      : name_(std::move(name)),
-        scalar_type_(scalar_type),
-        tile_rows_(tile_rows),
-        tile_cols_(tile_cols),
-        m_(m),
-        k_(k),
-        has_addend_(has_addend) {}
-
- private:
-  string name_;
-  PrimitiveType scalar_type_;
-  int64 tile_rows_;
-  int64 tile_cols_;
-  int64 m_;
-  int64 k_;
-  bool has_addend_;
 };
 
-// Computes a dot product between "[M,K]{0,1} lhs" with a [K,1] vector (the
-// layout of the vector does not matter).  This implementation uses a tiling
-// scheme to improve performance.
-//
-// We logically separate the LHS matrix into four segments:
-//
-//   +----------------------+---+
-//   |                      |   |
-//   |                      |   |
-//   |         A            | B |
-//   |                      |   |
-//   |                      |   |
-//   |                      |   |
-//   +----------------------+---+
-//   |         C            | D |
-//   +----------------------+---+
-//
-// where A is the largest submatrix of the LHS that can be evenly dividied into
-// tiles.  For each tile in A, assuming tile_rows_ == tile_cols_ == 4, we have:
-//
-//   +---+---+---+---+       +--+--+--+--+
-//   |M00|M10|M20|M30|       |V0|V1|V2|V3|
-//   +---+---+---+---+       +--+--+--+--+
-//   |M01|M11|M21|M31| and   |V0|V1|V2|V3|
-//   +---+---+---+---+       +--+--+--+--+
-//   |M02|M12|M22|M32|       |V0|V1|V2|V3|
-//   +---+---+---+---+       +--+--+--+--+
-//   |M03|M13|M23|M33|       |V0|V1|V2|V3|
-//   +---+---+---+---+       +--+--+--+--+
-//
-// (Legend: rows are horizontal and columns are vertical; and each column is one
-// llvm::Value of a vector type)
-//
-// where:
-//
-//   a. The left tile is from the column major left matrix.
-//   b. The right tile is an elementwise broadcast of a [V0, V1, V2, V3]
-//      vector loaded from the RHS vector.
-//
-// As we iterate through the column dimension, we compute the change to the
-// result vector by an elementwise multiplication between the two tiles above
-// followed by a reduction along the major dimension:
-//
-//                     +-----------------------------------+
-//                     | M00*V0 + M10*V1 + M20*V2 + M30*V3 |
-//                     +-----------------------------------+
-//                     | M01*V0 + M11*V1 + M21*V2 + M31*V3 |
-// Result[R:R+4] +=    +-----------------------------------+
-//                     | M02*V0 + M12*V1 + M22*V2 + M32*V3 |
-//                     +-----------------------------------+
-//                     | M03*V0 + M13*V1 + M23*V2 + M33*V3 |
-//                     +-----------------------------------+
-//
-// Where R is the starting row for the tile.
-//
-// We have an inner epilogue loop to deal with the "C" submatrix and an outer
-// epilogue loop to deal with the B,D submarix.
-//
-// TODO(sanjoy): We should investigate if using gather loads and scatter stores
-// can be used here have the same inner loop for both column-major and row-major
-// matrix-vector products.
-class ColumnMajorMatrixVectorProductEmitter
-    : public GemvConfig::User<ColumnMajorMatrixVectorProductEmitter> {
- public:
-  class Config : public GemvConfig {
-   public:
-    explicit Config(PrimitiveType scalar_type, int64 tile_rows, int64 tile_cols,
-                    int64 m, int64 k, bool has_addend)
-        : GemvConfig(/*name=*/"col_major_gemv", scalar_type,
-                     /*tile_rows=*/tile_rows, /*tile_cols=*/tile_cols, /*m=*/m,
-                     /*k=*/k, /*has_addend=*/has_addend) {}
-  };
-
-  ColumnMajorMatrixVectorProductEmitter(const Config& config, llvm::Value* lhs,
-                                        llvm::Value* rhs, llvm::Value* addend,
-                                        llvm::Value* result,
-                                        llvm::IRBuilder<>* b)
-      : config_(config),
-        lhs_(lhs),
-        rhs_(rhs),
-        addend_(addend),
-        result_(result),
-        b_(b),
-        ksl_(b_),
-        vsl_(config.scalar_type(), /*vector_size=*/config.tile_rows(), b_, "") {
-    CHECK(tile_rows() > 0 && IsPowerOfTwo(static_cast<uint64>(tile_rows())));
-    CHECK(!has_addend() || addend != nullptr);
-  }
-
-  void Emit();
-
-  const Config& config() const { return config_; }
-
- private:
-  void EmitOuterLoopBody(llvm::Value* column, int64 column_count,
-                         bool is_first_column);
-
-  MemoryTile GetLhsMemoryTile(llvm::Value* column_start, int64 column_count) {
-    return MemoryTile(&vsl_, b_, /*matrix=*/lhs_,
-                      /*matrix_size_along_minor_dim=*/m(),
-                      /*major_dim_offset=*/column_start,
-                      /*tile_size_along_major_dim=*/column_count);
-  }
-
-  // Load a tile of values from the RHS.  For the RHS a "tile" is a contiguous
-  // sequence of `count` values, each one broadcasted to the vector width.
-  std::vector<llvm::Value*> LoadRhsTile(llvm::Value* offset, int64 count) {
-    llvm::Value* base_pointer = vsl_.ComputeOffsetPointer(rhs_, offset);
-    std::vector<llvm::Value*> result;
-    result.reserve(count);
-    for (int64 i = 0; i < count; i++) {
-      result.push_back(vsl_.LoadBroadcast(base_pointer, i));
-    }
-    return result;
-  }
-
-  void EmitInnerLoopTiled(MemoryTile* lhs_memory_tile,
-                          const std::vector<llvm::Value*>& rhs_tile,
-                          int64 columns, bool is_first_column);
-
-  void EmitInnerLoopEpilogue(llvm::Value* current_tile_col, int64 columns,
-                             bool is_first_tiled_column);
-
-  Config config_;
-  llvm::Value* lhs_;
-  llvm::Value* rhs_;
-  llvm::Value* addend_;
-  llvm::Value* result_;
-  llvm::IRBuilder<>* b_;
-  KernelSupportLibrary ksl_;
-  VectorSupportLibrary vsl_;
+// Dictates how a dot operation is implemented.
+enum class DotImplementationStrategy {
+  // The dot operation is lowered into LLVM IR that implements a naive nested
+  // loop that computes the result one element at a time.  This is our
+  // "fallback"; we don't really want this to kick in for any non-trival dot
+  // operation.
+  kNaiveLlvmIr,
+
+  // The dot operation is lowered into LLVM IR that implements a tiled
+  // Matrix*Vector operation.  This strategy also allows fusing in a bias add
+  // into the dot.  The matrix can be row major or column major, both are
+  // supported.
+  kTiledLlvmIrGemv,
+
+  // The dot operation is lowered into LLVM IR that implemetns a tiled
+  // Matrix*Matrix operation.  No fusions are supported.  The two inputs
+  // and the output have to be row major.
+  kTiledLlvmIrGemm,
+
+  // The dot operation is lowered into a call into an Eigen routine.  No fusions
+  // are supported today.  The two inputs and the output have to be row major.
+  // However, we do allow transposing either the LHS or the RHS as part of the
+  // GEMM -- we expose this flexibility as flexibility in the contraction
+  // dimensions, but we can also see this as flexibility in the input layouts.
+  kEigen,
 };
 
-void ColumnMajorMatrixVectorProductEmitter::EmitOuterLoopBody(
-    llvm::Value* column, int64 column_count, bool is_first_column) {
-  MemoryTile lhs_memory_tile = GetLhsMemoryTile(/*column_start=*/column,
-                                                /*column_count=*/column_count);
-
-  std::vector<llvm::Value*> rhs_tile =
-      LoadRhsTile(column, /*count=*/column_count);
-  EmitInnerLoopTiled(&lhs_memory_tile, rhs_tile,
-                     /*columns=*/column_count, is_first_column);
-  EmitInnerLoopEpilogue(column, /*columns=*/column_count, is_first_column);
-}
-
-void ColumnMajorMatrixVectorProductEmitter::Emit() {
-  // See the comment on the class declaration for the algorithm used here.
-  int64 column_remainder = k() % tile_cols();
-  int64 column_limit = k() - column_remainder;
-
-  ksl_.ForReturnVoid("dot.outer.tiled",
-                     /*start=*/0, /*end=*/column_limit, /*step=*/tile_cols(),
-                     [&](llvm::Value* column, bool is_first_column) {
-                       EmitOuterLoopBody(column, tile_cols(), is_first_column);
-                     });
-
-  if (column_remainder != 0) {
-    EmitOuterLoopBody(b_->getInt64(column_limit), column_remainder,
-                      column_limit == 0);
-  }
-}
-
-void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopTiled(
-    MemoryTile* lhs_memory_tile, const std::vector<llvm::Value*>& rhs_tile,
-    int64 columns, bool is_first_column) {
-  int64 row_limit = m() - (m() % tile_rows());
-
-  ksl_.ForReturnVoid(
-      "dot.inner.tiled", /*start=*/0, /*end=*/row_limit,
-      /*step=*/tile_rows(), [&](llvm::Value* row) {
-        std::vector<llvm::Value*> lhs_tile =
-            lhs_memory_tile->LoadTile(/*minor_dim_offset=*/row);
-        llvm::Value* accumulator =
-            is_first_column ? (addend_ ? vsl_.LoadVector(addend_, row)
-                                       : vsl_.GetZeroVector())
-                            : vsl_.LoadVector(result_, row);
-        for (int i = 0; i < columns; i++) {
-          accumulator = vsl_.MulAdd(lhs_tile[i], rhs_tile[i], accumulator);
-        }
-        vsl_.StoreVector(accumulator, result_, row);
-      });
-}
-
-void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue(
-    llvm::Value* current_tile_col, int64 columns, bool is_first_tiled_column) {
-  int64 row_start = m() - (m() % tile_rows());
-  if (row_start == m()) {
-    return;
-  }
-
-  llvm::Value* columns_llvm = b_->getInt64(columns);
-
-  // for (col = current_tile_col; col < (columns + current_tile_col); col++)
-  //   for (row = row_start, row < m_; row++) {
-  //     result[row] += lhs[row, col] * rhs[col]
-  //     // Also take into account that if col is 0 then result[row] is not
-  //     // initialized.
-  //   }
-
-  ksl_.ForReturnVoid(
-      "dot.inner.epilg.outer", /*start=*/current_tile_col,
-      /*end=*/b_->CreateAdd(columns_llvm, current_tile_col),
-      /*step=*/1, /*peel_first_iteration=*/false,
-      [&](llvm::Value* col, llvm::Value* is_first_scalar_col) {
-        llvm::Value* rhs_element = vsl_.LoadScalar(rhs_, col);
-        llvm::Value* total_offset = b_->CreateMul(col, b_->getInt64(m()));
-        llvm::Value* lhs_base_pointer =
-            vsl_.ComputeOffsetPointer(lhs_, total_offset);
-        ksl_.ForReturnVoid(
-            "dot.inner.epilg.inner", /*start=*/row_start, /*end=*/m(),
-            /*step=*/1, [&](llvm::Value* scalar_row) {
-              llvm::Value* product = vsl_.Mul(
-                  vsl_.LoadScalar(lhs_base_pointer, scalar_row), rhs_element);
-              llvm::Value* setting_result_first_time = b_->CreateAnd(
-                  is_first_scalar_col, b_->getInt1(is_first_tiled_column));
-              ksl_.IfReturnVoid(
-                  setting_result_first_time,
-                  /*true_block_generator=*/
-                  [&]() {
-                    if (addend_) {
-                      vsl_.StoreScalar(
-                          vsl_.Add(vsl_.LoadScalar(addend_, scalar_row),
-                                   product),
-                          result_, scalar_row);
-                    } else {
-                      vsl_.StoreScalar(product, result_, scalar_row);
-                    }
-                  },
-                  /*false_block_generator=*/
-                  [&]() {
-                    vsl_.StoreScalar(
-                        vsl_.Add(vsl_.LoadScalar(result_, scalar_row), product),
-                        result_, scalar_row);
-                  });
-            });
-      });
-}
+// Returns the implementation strategy for a dot with the configuration
+// `dot_info`.
+DotImplementationStrategy GetDotImplementationStrategy(
+    const HloModuleConfig& config, const DotInfo& dot_info,
+    const TargetMachineFeatures& target_machine_features);
 
-// Computes a dot product between "[M,K]{1,0} lhs" with a [K,1] vector (the
-// layout of the vector does not matter).  This implementation uses a tiling
-// scheme to improve performance.
-//
-// We logically separate the LHS matrix into four segments:
-//
-//   +----------------------+---+
-//   |                      |   |
-//   |                      |   |
-//   |         A            | B |
-//   |                      |   |
-//   |                      |   |
-//   |                      |   |
-//   +----------------------+---+
-//   |         C            | D |
-//   +----------------------+---+
-//
-// where A is the largest submatrix of the LHS that can be evenly dividied into
-// tiles.  For each tile in A, assuming tile_rows_ == tile_cols_ == 4, we have:
-//
-//   +---+---+---+---+
-//   |M00|M10|M20|M30|
-//   +---+---+---+---+       +--+--+--+--+
-//   |M01|M11|M21|M31| and   |V0|V1|V2|V3|
-//   +---+---+---+---+       +--+--+--+--+
-//   |M02|M12|M22|M32|
-//   +---+---+---+---+
-//   |M03|M13|M23|M33|
-//   +---+---+---+---+
-//
-// (Legend: rows are horizontal and columns are vertical; and each row is one
-// llvm::Value of a vector type)
-//
-// where:
-//
-//   a. The left tile is loaded from the row major left matrix.
-//   b. The right vector is loaded from the RHS vector.
-//
-// We keep 4 vector accumulators accumulating the following four vector
-// expressions as we iterate over the row dimension:
-//
-//   +------+------+------+------+
-//   |M0I*V0|M1I*V1|M2I*V2|M3I*V3|  for I in [0,4)
-//   +------+------+------+------+
-//
-// In the end we do a horizontal reduction over these 4 vector accumulators to
-// get 4 values in the result vector.
-//
-// We have an inner epilogue loop to deal with the "B" sub-matrix and an outer
-// epilogue loop to deal with the C,D submatrix.
-class RowMajorMatrixVectorProductEmitter
-    : public GemvConfig::User<RowMajorMatrixVectorProductEmitter> {
+// Helper class for emitting LLVM IR to perform the dot operation.
+class DotOpEmitter {
  public:
-  class Config : public GemvConfig {
-   public:
-    explicit Config(PrimitiveType scalar_type, int64 tile_rows, int64 tile_cols,
-                    int64 m, int64 k, bool has_addend)
-        : GemvConfig(/*name=*/"row_major_gemv", scalar_type,
-                     /*tile_rows=*/tile_rows, /*tile_cols=*/tile_cols, /*m=*/m,
-                     /*k=*/k, /*has_addend=*/has_addend) {}
-  };
-
-  RowMajorMatrixVectorProductEmitter(const Config& config, llvm::Value* lhs,
-                                     llvm::Value* rhs, llvm::Value* addend,
-                                     llvm::Value* result, llvm::IRBuilder<>* b)
-      : config_(config),
-        lhs_(lhs),
-        rhs_(rhs),
-        addend_(addend),
-        result_(result),
-        b_(b),
-        ksl_(b_),
-        vsl_(scalar_type(), /*vector_size=*/tile_cols(), b_, "") {
-    CHECK(tile_cols() > 0 && IsPowerOfTwo(static_cast<uint64>(tile_cols())));
-    CHECK(!has_addend() || addend != nullptr);
-  }
-
-  void Emit();
-
-  const Config& config() const { return config_; }
+  explicit DotOpEmitter(DotInfo dot_info, string dot_hlo_name,
+                        const llvm_ir::IrArray& target_array,
+                        const llvm_ir::IrArray& lhs_array,
+                        const llvm_ir::IrArray& rhs_array,
+                        const llvm_ir::IrArray* addend_array,
+                        llvm::Value* executable_run_options_value,
+                        llvm::IRBuilder<>* b,
+                        const HloModuleConfig& hlo_module_config,
+                        const TargetMachineFeatures& target_machine_features);
+
+  // Emits the IR to perform the dot operation.
+  Status Emit();
 
  private:
-  MemoryTile GetLhsMemoryTile(llvm::Value* row_start, int64 row_count) {
-    return MemoryTile(&vsl_, b_, /*matrix=*/lhs_,
-                      /*matrix_size_along_minor_dim=*/k(),
-                      /*major_dim_offset=*/row_start,
-                      /*tile_size_along_major_dim=*/row_count);
-  }
-
-  void EmitOuterLoopBody(llvm::Value* row, int64 row_count);
-
-  void EmitInnerLoopTiled(MemoryTile* lhs_memory_tile, int64 rows,
-                          std::vector<VectorVariable>* vector_accumulators);
-
-  void EmitInnerLoopEpilogue(llvm::Value* current_tile_row, int64 rows,
-                             std::vector<ScalarVariable>* scalar_accumulators);
-
-  Config config_;
-  llvm::Value* lhs_;
-  llvm::Value* rhs_;
-  llvm::Value* addend_;
-  llvm::Value* result_;
-  llvm::IRBuilder<>* b_;
-  KernelSupportLibrary ksl_;
-  VectorSupportLibrary vsl_;
-};
-
-void RowMajorMatrixVectorProductEmitter::EmitOuterLoopBody(llvm::Value* row,
-                                                           int64 row_count) {
-  MemoryTile lhs_memory_tile = GetLhsMemoryTile(/*row_start=*/row,
-                                                /*row_count=*/row_count);
-  std::vector<VectorVariable> vector_accumulators;
-  std::vector<ScalarVariable> scalar_accumulators;
-  for (int i = 0; i < row_count; i++) {
-    vector_accumulators.emplace_back(&vsl_, vsl_.GetZeroVector());
-    scalar_accumulators.emplace_back(&vsl_, vsl_.GetZeroScalar());
-  }
-  EmitInnerLoopTiled(&lhs_memory_tile, /*rows=*/row_count,
-                     &vector_accumulators);
-  EmitInnerLoopEpilogue(/*current_tile_row=*/row, /*rows=*/row_count,
-                        &scalar_accumulators);
-
-  std::vector<llvm::Value*> accumulator_values;
-  std::transform(
-      vector_accumulators.begin(), vector_accumulators.end(),
-      std::back_inserter(accumulator_values),
-      [](const VectorVariable& vector_var) { return vector_var.Get(); });
-
-  std::vector<llvm::Value*> horizontal_sums;
-  if (row_count == vsl_.vector_size()) {
-    if (addend_) {
-      horizontal_sums = vsl_.ComputeHorizontalSums(
-          std::move(accumulator_values), vsl_.LoadVector(addend_, row));
-    } else {
-      horizontal_sums =
-          vsl_.ComputeHorizontalSums(std::move(accumulator_values));
-    }
-  } else {
-    horizontal_sums = vsl_.ComputeHorizontalSums(std::move(accumulator_values));
-  }
-
-  for (int i = 0; i < row_count; i++) {
-    llvm::Value* result_value =
-        vsl_.Add(horizontal_sums[i], scalar_accumulators[i].Get());
-    llvm::Value* offset = b_->CreateAdd(b_->getInt64(i), row);
-    if (addend_ && row_count != vsl_.vector_size()) {
-      result_value = vsl_.Add(vsl_.LoadScalar(addend_, offset), result_value);
-    }
-    vsl_.StoreScalar(result_value, result_, offset);
-  }
-}
+  // Emits instructions to perform a scalar dot product (a multiply of the
+  // LHS and RHS) and store the results in the target.
+  Status EmitScalarDot();
 
-void RowMajorMatrixVectorProductEmitter::Emit() {
-  // See the comment on the class declaration for the algorithm used here.
-  int64 row_remainder = m() % tile_rows();
-  int64 row_limit = m() - row_remainder;
+  // Emits a call to the CPU runtime to perform the matrix multiply.
+  Status EmitCallToRuntime();
 
-  ksl_.ForReturnVoid(
-      "dot.outer.tiled",
-      /*start=*/0, /*end=*/row_limit, /*step=*/tile_rows(),
-      [&](llvm::Value* row) { EmitOuterLoopBody(row, tile_rows()); });
-
-  if (row_remainder != 0) {
-    EmitOuterLoopBody(b_->getInt64(row_limit), row_remainder);
-  }
-}
+  // Represents the dimensions of a matrix-matrix multiply operation.
+  struct MatMultDims {
+    // The number of rows in the LHS.
+    int64 m;
 
-void RowMajorMatrixVectorProductEmitter::EmitInnerLoopTiled(
-    MemoryTile* lhs_memory_tile, int64 rows,
-    std::vector<VectorVariable>* vector_accumulators) {
-  int64 column_limit = k() - (k() % tile_cols());
-
-  ksl_.ForReturnVoid("dot.inner.tiled", /*start=*/0, /*end=*/column_limit,
-                     /*step=*/tile_cols(), [&](llvm::Value* col) {
-                       std::vector<llvm::Value*> lhs_tile =
-                           lhs_memory_tile->LoadTile(/*minor_dim_offset=*/col);
-                       llvm::Value* rhs_value = vsl_.LoadVector(rhs_, col);
-                       for (int i = 0; i < rows; i++) {
-                         llvm::Value* old_sum = (*vector_accumulators)[i].Get();
-                         (*vector_accumulators)[i].Set(vsl_.Add(
-                             old_sum, vsl_.Mul(rhs_value, lhs_tile[i])));
-                       }
-                     });
-}
+    // The number of columns in the LHS, which is also must be equal to the
+    // number of rows in the RHS.
+    int64 k;
 
-void RowMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue(
-    llvm::Value* current_tile_row, int64 rows,
-    std::vector<ScalarVariable>* scalar_accumulators) {
-  int64 column_start = k() - (k() % tile_cols());
-  if (column_start == k()) {
-    return;
-  }
+    // The number of columns on the RHS.
+    int64 n;
 
-  for (int r = 0; r < rows; r++) {
-    llvm::Value* total_offset = b_->CreateMul(
-        b_->CreateAdd(b_->getInt64(r), current_tile_row), b_->getInt64(k()));
-    llvm::Value* lhs_base_pointer =
-        vsl_.ComputeOffsetPointer(lhs_, total_offset);
-    ksl_.ForReturnVoid(
-        "dot.inner.epilg.inner", /*start=*/column_start, /*end=*/k(),
-        /*step=*/1, [&](llvm::Value* scalar_col) {
-          llvm::Value* product =
-              vsl_.Mul(vsl_.LoadScalar(lhs_base_pointer, scalar_col),
-                       vsl_.LoadScalar(rhs_, scalar_col));
-          llvm::Value* old_value = (*scalar_accumulators)[r].Get();
-          (*scalar_accumulators)[r].Set(vsl_.Add(old_value, product));
-        });
-  }
-}
+    // True if the LHS matrix is column major.
+    bool lhs_column_major;
 
-// This class implements a tiled matrix multiplication algorithm, intended for
-// multiplying small matrices that don't need cache tiling.
-//
-// In the future this can be used as the innermost GEBP loop in a GEMM kernel as
-// described in "Goto, Kazushige, and Robert A. Geijn. "Anatomy of
-// high-performance matrix multiplication." ACM Transactions on Mathematical
-// Software (TOMS) 34.3 (2008): 12.".
-//
-// This only supports canonical dot operations (i.e. where the lhs contraction
-// dimension is 1 and the rhs contraction dimension is 0) over row major
-// matrices.
-class TiledSmallGemmEmitter {
- public:
-  // Describe the dimensions of the kernel.
-  class Dimensions {
-   public:
-    explicit Dimensions(int64 m, int64 k, int64 n) : m_(m), k_(k), n_(n) {}
+    // True if the LHS contraction dimension is not 1.
+    bool lhs_non_canonical;
 
-    int64 m() const { return m_; }
-    int64 k() const { return k_; }
-    int64 n() const { return n_; }
+    // True if the RHS matrix is column major.
+    bool rhs_column_major;
 
-    string ToString() const { return absl::StrCat(m(), "x", k(), "x", n()); }
+    // True if the RHS contraction dimension is not 0.
+    bool rhs_non_canonical;
 
-   private:
-    const int64 m_;
-    const int64 k_;
-    const int64 n_;
+    // True if the result matrix is column major.
+    bool target_column_major;
   };
 
-  // Represents the configuration of the emitter.  The LLVM IR emitted by the
-  // emitter, modulo the LLVM values holding the input and output buffers, must
-  // be a function of the instance of `Config` passed to it.
-  //
-  // `dims` holds the matrix multiplication dimensions.
-  //
-  // `max_vectorization_width` is the maximum vector width (i.e. the width of
-  // the largest vector register we will use).  This can be larger than the
-  // largest vector register supported by the machine -- LLVM will legalize
-  // these large vector widths into legally sized vectors.
-  //
-  // `max_vector_count` is the maximum number of vectors of size
-  // `max_vectorization_width` that we will attempt to process at once.
-  //
-  // `min_vectorization_width` is the smallest vector width the emitter will use
-  // -- below that it will devolve to using a scalar loop.
-  //
-  // The innermost reduction loop executes the matrix multiply in tiles of size
-  // [`tile_size_m`, `tile_size_k`] from the LHS and [`tile_size_k`,
-  // <vectorization width>] in the RHS.
-  class Config {
-   public:
-    explicit Config(PrimitiveType scalar_type, Dimensions dims,
-                    int64 max_vectorization_width, int64 max_vector_count,
-                    int64 min_vectorization_width, int64 tile_size_m,
-                    int64 tile_size_k)
-        : scalar_type_(scalar_type),
-          dims_(dims),
-          max_vectorization_width_(max_vectorization_width),
-          max_vector_count_(max_vector_count),
-          min_vectorization_width_(min_vectorization_width),
-          tile_size_m_(tile_size_m),
-          tile_size_k_(tile_size_k) {}
-
-    string GetCacheKey() const {
-      return absl::StrCat("gemm_", PrimitiveType_Name(scalar_type()), "_",
-                          dims().ToString(), "_", max_vectorization_width(),
-                          "_", min_vectorization_width(), "_", tile_size_m(),
-                          "_", tile_size_k());
-    }
+  // Get the MatMultDims instance for the dot product this DotOpEmitter
+  // represents.  Precondition: the dot is of rank 2 (and thus its operands are
+  // of rank 2 as well).
+  MatMultDims GetMatMultDims() const;
 
-    PrimitiveType scalar_type() const { return scalar_type_; }
-    Dimensions dims() const { return dims_; }
-    int64 max_vectorization_width() const { return max_vectorization_width_; }
-    int64 max_vector_count() const { return max_vector_count_; }
-    int64 min_vectorization_width() const { return min_vectorization_width_; }
-
-    int64 tile_size_m() const { return tile_size_m_; }
-    int64 tile_size_k() const { return tile_size_k_; }
-
-   private:
-    PrimitiveType scalar_type_;
-    Dimensions dims_;
-    int64 max_vectorization_width_;
-    int64 max_vector_count_;
-    int64 min_vectorization_width_;
-    int64 tile_size_m_;
-    int64 tile_size_k_;
-  };
+  // Lowers the dot operation as a tiled Matrix*Vector loop.
+  void EmitTiledLlvmIrGemv();
 
-  // Creates an instance of TiledSmallGemmEmitter that matrix-multiplies
-  // `lhs` with `rhs` and stores the result in `result`.
-  explicit TiledSmallGemmEmitter(Config config, llvm::Value* lhs,
-                                 llvm::Value* rhs, llvm::Value* result,
-                                 llvm::IRBuilder<>* b)
-      : lhs_(lhs),
-        rhs_(rhs),
-        result_(result),
-        config_(config),
-        b_(b),
-        ksl_(b_) {
-    CHECK(max_vectorization_width() > 0 &&
-          IsPowerOfTwo(static_cast<uint64>(max_vectorization_width())));
-    CHECK_GT(max_vector_count(), 0);
-    CHECK(min_vectorization_width() > 0 &&
-          IsPowerOfTwo(static_cast<uint64>(min_vectorization_width())));
-    CHECK_GE(max_vectorization_width(), min_vectorization_width());
-    CHECK_GT(tile_size_k(), 0);
-  }
+  // Lowers the dot operation as a tiled Matrix*Matrix loop.
+  void EmitTiledLlvmIrGemm();
 
-  void Emit();
+  // Lowers the dot operation as a naive nested loop that computes the result
+  // one element at a time.
+  void EmitNaiveLlvmIrGemm();
 
- private:
-  // The HandleResiduesOnX helpers split the iteration space for dimension X
-  // into a multiple of the tile size on dimension X and an epilogue.  These
-  // helpers ultimately call into `EmitTiledGemm` for emitting the
-  // tiled GEMM kernel.
-
-  void HandleResiduesOnN();
-  void HandleResiduesOnK(VectorSupportLibrary* vsl, llvm::Value* n_start,
-                         llvm::Value* n_end);
-  void HandleResiduesOnM(VectorSupportLibrary* vsl, int64 tile_size_k,
-                         llvm::Value* k_start, llvm::Value* k_end,
-                         llvm::Value* n_start, llvm::Value* n_end);
-
-  // This emits a tiled GEMM kernel.  For a detailed description see the comment
-  // on the implementation.
-  void EmitTiledGemm(VectorSupportLibrary* vsl, int64 tile_size_k,
-                     llvm::Value* k_start, llvm::Value* k_end,
-                     llvm::Value* n_start, llvm::Value* n_end,
-                     int64 tile_size_m, llvm::Value* m_start,
-                     llvm::Value* m_end);
-
-  llvm::Value* GetInt64(int64 value) { return b_->getInt64(value); }
-
-  Config config() const { return config_; }
-  Dimensions dims() const { return config().dims(); }
-
-  int64 max_vectorization_width() const {
-    return config().max_vectorization_width();
+  // When doing a tiled GEMV in LLVM IR, a "tile" consists of this many vector
+  // registers.
+  int64 GetGemvTilingFactor() const {
+    const int64 kDefaultTilingFactor = 8;
+    return options::LlvmIrGemvTilingFactor(hlo_module_config_)
+        .value_or(kDefaultTilingFactor);
   }
-  int64 max_vector_count() const { return config().max_vector_count(); }
-  int64 min_vectorization_width() const {
-    return config().min_vectorization_width();
-  }
-  int64 tile_size_m() const { return config().tile_size_m(); }
-  int64 tile_size_k() const { return config().tile_size_k(); }
-  PrimitiveType scalar_type() const { return config().scalar_type(); }
 
-  llvm::Value* lhs_;
-  llvm::Value* rhs_;
-  llvm::Value* result_;
-  Config config_;
+  std::tuple<int64, int64, int64> GetGemmTileSize() const {
+    // Tuned for broadwell - Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz
+    //
+    // TODO(b/80093688): Tune for other architectures and centralize this
+    // information in one place.
+    const std::tuple<int64, int64, int64> kDefaultTileSize =
+        std::tuple<int64, int64, int64>(11, 9, 1);
+    return options::LlvmIrGemmTileSize(hlo_module_config_)
+        .value_or(kDefaultTileSize);
+  }
 
+  DotInfo dot_info_;
+  string dot_hlo_name_;
+  const llvm_ir::IrArray& target_array_;
+  const llvm_ir::IrArray& lhs_array_;
+  const llvm_ir::IrArray& rhs_array_;
+  const llvm_ir::IrArray* addend_array_;
+  llvm::Value* executable_run_options_value_;
   llvm::IRBuilder<>* b_;
-  KernelSupportLibrary ksl_;
+  const HloModuleConfig& hlo_module_config_;
+  const TargetMachineFeatures& target_machine_features_;
 };
-
-void TiledSmallGemmEmitter::Emit() { HandleResiduesOnN(); }
-
-void TiledSmallGemmEmitter::HandleResiduesOnN() {
-  // We can only iterate the `n` dimension for an extent that is divisible by
-  // the vectorization width.  So we emit an outer loop that first processes the
-  // largest extent in `n` that is divisible by max_vectorization_width, then
-  // the largest remaining extent that is divisible by max_vectorization_width /
-  // 2 etc.
-
-  int64 current_vectorization_width =
-      max_vector_count() * max_vectorization_width();
-  int64 current_vector_count = max_vector_count();
-
-  int64 n_start = 0;
-  while (n_start != dims().n() &&
-         current_vectorization_width >= min_vectorization_width()) {
-    int64 n_end = dims().n() - (dims().n() % current_vectorization_width);
-    if (n_start != n_end) {
-      VectorSupportLibrary vsl(scalar_type(), current_vectorization_width, b_,
-                               "gemm");
-      HandleResiduesOnK(&vsl, GetInt64(n_start), GetInt64(n_end));
-      n_start = n_end;
-    }
-    if (current_vector_count == 1) {
-      current_vectorization_width /= 2;
-    } else {
-      current_vector_count--;
-      current_vectorization_width =
-          current_vector_count * max_vectorization_width();
-    }
-  }
-
-  if (n_start != dims().n()) {
-    VectorSupportLibrary vsl(scalar_type(), 1, b_, "gemm");
-    ksl_.ForReturnVoid("epi.n", n_start, dims().n(), 1, [&](llvm::Value* n_i) {
-      llvm::Value* n_i_next = b_->CreateAdd(n_i, b_->getInt64(1));
-      HandleResiduesOnK(&vsl, n_i, n_i_next);
-    });
-  }
-}
-
-void TiledSmallGemmEmitter::HandleResiduesOnK(VectorSupportLibrary* vsl,
-                                              llvm::Value* n_start,
-                                              llvm::Value* n_end) {
-  int64 k_start = 0;
-  int64 k_end = dims().k() - (dims().k() % tile_size_k());
-  if (k_end != k_start) {
-    HandleResiduesOnM(vsl, tile_size_k(), GetInt64(k_start), GetInt64(k_end),
-                      n_start, n_end);
-    k_start = k_end;
-  }
-
-  if (k_start != dims().k()) {
-    HandleResiduesOnM(vsl, dims().k() - k_start, GetInt64(k_start),
-                      GetInt64(dims().k()), n_start, n_end);
-  }
-}
-
-void TiledSmallGemmEmitter::HandleResiduesOnM(
-    VectorSupportLibrary* vsl, int64 tile_size_k, llvm::Value* k_start,
-    llvm::Value* k_end, llvm::Value* n_start, llvm::Value* n_end) {
-  const int64 m_end = dims().m() - dims().m() % tile_size_m();
-  EmitTiledGemm(vsl, tile_size_k, k_start, k_end, n_start, n_end, tile_size_m(),
-                GetInt64(0), GetInt64(m_end));
-
-  if (m_end != dims().m()) {
-    EmitTiledGemm(vsl, tile_size_k, k_start, k_end, n_start, n_end,
-                  dims().m() - m_end, GetInt64(m_end), GetInt64(dims().m()));
-  }
-}
-
-// The loop structure is:
-//
-// Iterate over dimension M as m:
-//   Iterate over dimension N as n:
-//     Iterate over dimension K as k:
-//       OutputTile[m,n] += Dot(LhsTile[m,k], RhsTile[k,n])
-//
-// I.e. a just a tiled version of a "naive" GEMM.
-//
-// The tiling scheme is as follows:
-//
-// Let the LHS be:
-//
-//   +----+----+----+
-//   | a0 | b0 | c0 | .
-//   +----+----+----+ .
-//   | a1 | b1 | c1 | .
-//   +----+----+----+
-//     ..     ..
-//
-// and the RHS be:
-//
-//   +----+----+----+----+
-//   | p0 | p1 | p2 | p3 | .
-//   +----+----+----+----+ .
-//   | q0 | q1 | q2 | q3 | .
-//   +----+----+----+----+
-//   | r0 | r1 | r2 | r3 | .
-//   +----+----+----+----+ .
-//     ......    ......
-//
-// and let tile_size_m=2, tile_size_k=3 and the vector width (implicitly denoted
-// by `vsl`) be 4.  Then we want to matrix multiply this tile to get a [2,4]
-// matrix that we can increment the result matrix by.
-//
-// First broadcast the rows row in LHS to 3 vectors of width 4, giving us a rank
-// 3 array, L, of dimension [2,3,4]:
-//
-//       L[0,_,_]           *      L[1,_,_]
-//                          *
-//   +----+----+----+----+  *  +----+----+----+----+
-//   | a0 | a0 | a0 | a0 |  *  | a1 | a1 | a1 | a1 |
-//   +----+----+----+----+  *  +----+----+----+----+
-//   | b0 | b0 | b0 | b0 |  *  | b1 | b1 | b1 | b1 |
-//   +----+----+----+----+  *  +----+----+----+----+
-//   | c0 | c0 | c0 | c0 |  *  | c1 | c1 | c1 | c1 |
-//   +----+----+----+----+  *  +----+----+----+----+
-//
-//
-// Then we FMA L[0,_,_] with the RHS to get the first row of the result and
-// L[1,_,_] with the RHS to get the second row of the result.  For example,
-// L[0,_,_] is computed as:
-//
-//   +----+----+----+----+   +----+----+----+----+
-//   | a0 | a0 | a0 | a0 | * | p0 | p1 | p2 | p3 |   +
-//   +----+----+----+----+   +----+----+----+----+
-//
-//   +----+----+----+----+   +----+----+----+----+
-//   | b0 | b0 | b0 | b0 | * | q0 | q1 | q2 | q3 |   +
-//   +----+----+----+----+   +----+----+----+----+
-//
-//   +----+----+----+----+   +----+----+----+----+
-//   | c0 | c0 | c0 | c0 | * | r0 | r1 | r2 | r3 |
-//   +----+----+----+----+   +----+----+----+----+
-//
-// to get:
-//
-//   +-------------------+-------------------+-------------------+---------
-//   | a0*p0+b0*q0+c0*r0 | a0*p1+b0*q1+c0*r1 | a0*p2+b0*q2+c0*r2 |  ...
-//   +-------------------+-------------------+-------------------+---------
-void TiledSmallGemmEmitter::EmitTiledGemm(
-    VectorSupportLibrary* vsl, int64 tile_size_k, llvm::Value* k_start,
-    llvm::Value* k_end, llvm::Value* n_start, llvm::Value* n_end,
-    int64 tile_size_m, llvm::Value* m_start, llvm::Value* m_end) {
-  ksl_.ForReturnVoid(
-      "dot.m", m_start, m_end, tile_size_m, [&](llvm::Value* m_i) {
-        MemoryTile result_memory_tile(
-            vsl, b_, /*matrix=*/result_,
-            /*matrix_size_along_minor_dim=*/dims().n(),
-            /*major_dim_offset=*/m_i,
-            /*tile_size_along_major_dim=*/tile_size_m);
-        MemoryTile lhs_memory_tile(vsl, b_, /*matrix=*/lhs_,
-                                   /*matrix_size_along_minor_dim=*/dims().k(),
-                                   /*major_dim_offset=*/m_i,
-                                   /*tile_size_along_major_dim=*/tile_size_m);
-        ksl_.ForReturnVoid(
-            "dot.n", n_start, n_end, vsl->vector_size(), [&](llvm::Value* n_i) {
-              TileVariable result_tile_var(vsl,
-                                           result_memory_tile.LoadTile(n_i));
-              ksl_.ForReturnVoid(
-                  "dot.k", k_start, k_end, tile_size_k, [&](llvm::Value* k_i) {
-                    MemoryTile rhs_memory_tile(vsl, b_, rhs_, dims().n(), k_i,
-                                               tile_size_k);
-                    std::vector<std::vector<llvm::Value*>> lhs_tile =
-                        lhs_memory_tile.LoadBroadcastTile(k_i, tile_size_k);
-                    std::vector<llvm::Value*> rhs_tile =
-                        rhs_memory_tile.LoadTile(n_i);
-                    std::vector<llvm::Value*> result_tile =
-                        result_tile_var.Get();
-                    for (int64 r_m_i = 0; r_m_i < tile_size_m; r_m_i++) {
-                      for (int64 r_k_i = 0; r_k_i < tile_size_k; r_k_i++) {
-                        result_tile[r_m_i] =
-                            vsl->MulAdd(lhs_tile[r_m_i][r_k_i], rhs_tile[r_k_i],
-                                        result_tile[r_m_i]);
-                      }
-                    }
-                    result_tile_var.Set(result_tile);
-                  });
-
-              result_memory_tile.StoreTile(result_tile_var.Get(), n_i);
-            });
-      });
-}
-
 }  // namespace
 
-DotOpEmitter::DotOpEmitter(const HloInstruction& dot,
+DotOpEmitter::DotOpEmitter(DotInfo dot_info, string dot_hlo_name,
                            const llvm_ir::IrArray& target_array,
                            const llvm_ir::IrArray& lhs_array,
                            const llvm_ir::IrArray& rhs_array,
@@ -975,7 +211,8 @@ DotOpEmitter::DotOpEmitter(const HloInstruction& dot,
                            llvm::IRBuilder<>* b,
                            const HloModuleConfig& hlo_module_config,
                            const TargetMachineFeatures& target_machine_features)
-    : dot_(dot),
+    : dot_info_(std::move(dot_info)),
+      dot_hlo_name_(std::move(dot_hlo_name)),
       target_array_(target_array),
       lhs_array_(lhs_array),
       rhs_array_(rhs_array),
@@ -985,58 +222,9 @@ DotOpEmitter::DotOpEmitter(const HloInstruction& dot,
       hlo_module_config_(hlo_module_config),
       target_machine_features_(target_machine_features) {}
 
-/* static */ Status DotOpEmitter::EmitDotOperation(
-    const HloInstruction& dot, const llvm_ir::IrArray& target_array,
-    const llvm_ir::IrArray& lhs_array, const llvm_ir::IrArray& rhs_array,
-    const llvm_ir::IrArray* addend_array,
-    llvm::Value* executable_run_options_value, llvm::IRBuilder<>* b,
-    const HloModuleConfig& hlo_module_config,
-    const TargetMachineFeatures& target_machine_features) {
-  PrimitiveType type = target_array.GetShape().element_type();
-  TF_RET_CHECK(F16 == type || F32 == type || F64 == type || C64 == type);
-  DotOpEmitter dot_emitter(dot, target_array, lhs_array, rhs_array,
-                           addend_array, executable_run_options_value, b,
-                           hlo_module_config, target_machine_features);
-  return dot_emitter.Emit();
-}
-
-bool DotOpEmitter::EmitSmallGemmIfProfitable(
-    const DotOpEmitter::MatMultDims& mat_mult_dims) {
-  if (ShouldUseMultiThreadedEigen()) {
-    return false;
-  }
-
-  if (!EnableExperimentalLlvmIrGemm()) {
-    // TODO(sanjoy):  We should make these numbers micro-arch specific.
-    bool small_gemm = mat_mult_dims.k <= 128 &&
-                      ((mat_mult_dims.m <= 32 && mat_mult_dims.n <= 128) ||
-                       (mat_mult_dims.m <= 128 && mat_mult_dims.n <= 32));
-    if (!small_gemm) {
-      return false;
-    }
-  }
-
-  if (mat_mult_dims.lhs_non_canonical || mat_mult_dims.rhs_non_canonical) {
-    return false;
-  }
-
-  PrimitiveType primitive_type = dot_.shape().element_type();
-
-  switch (primitive_type) {
-    default:
-      return false;
-
-    case F32:
-    case F64:
-    case S32:
-    case S64:
-      break;
-  }
-
-  if (!(mat_mult_dims.lhs_column_major == mat_mult_dims.rhs_column_major &&
-        mat_mult_dims.rhs_column_major == mat_mult_dims.target_column_major)) {
-    return false;
-  }
+void DotOpEmitter::EmitTiledLlvmIrGemm() {
+  PrimitiveType primitive_type = dot_info_.result_shape.element_type();
+  MatMultDims mat_mult_dims = GetMatMultDims();
 
   llvm::Value* lhs = lhs_array_.GetBasePointer();
   llvm::Value* rhs = rhs_array_.GetBasePointer();
@@ -1051,9 +239,8 @@ bool DotOpEmitter::EmitSmallGemmIfProfitable(
   }
 
   int64 size_bytes = m * n * ShapeUtil::ByteSizeOfPrimitiveType(primitive_type);
-  b_->CreateMemSet(
-      target, b_->getInt8(0), size_bytes,
-      target_machine_features_.minimum_alignment_for_allocation(size_bytes));
+  b_->CreateMemSet(target, b_->getInt8(0), /*Size=*/size_bytes,
+                   /*Align=*/1);
 
   int64 max_target_vector_width =
       target_machine_features_.vector_register_num_elements(
@@ -1063,47 +250,28 @@ bool DotOpEmitter::EmitSmallGemmIfProfitable(
   std::tie(tile_size_m, tile_size_k, tile_size_n_in_vector_width) =
       GetGemmTileSize();
 
-  TiledSmallGemmEmitter::Config config(
-      /*scalar_type=*/primitive_type,
-      TiledSmallGemmEmitter::Dimensions{/*m=*/m, /*k=*/k, /*n=*/n},
-      /*max_vectorization_width=*/max_target_vector_width,
-      /*max_vector_count=*/tile_size_n_in_vector_width,
-      /*min_vectorization_width=*/std::min<int64>(4, max_target_vector_width),
-      /*tile_size_m=*/tile_size_m, /*tile_size_k=*/tile_size_k);
-
-  VLOG(2) << "Emitting GEMM kernel in LLVM IR with config "
-          << config.GetCacheKey();
-
   const bool enable_fast_math =
       hlo_module_config_.debug_options().xla_cpu_enable_fast_math();
   const bool optimize_for_size =
       options::OptimizeForSizeRequested(hlo_module_config_);
 
-  KernelSupportLibrary::EmitAndCallOutlinedKernel(
+  EmitSmallGemm(
+      /*scalar_type=*/primitive_type,
+      /*m=*/m, /*k=*/k, /*n=*/n,
+      /*max_vectorization_width=*/max_target_vector_width,
+      /*max_vector_count=*/tile_size_n_in_vector_width,
+      /*min_vectorization_width=*/std::min<int64>(4, max_target_vector_width),
+      /*tile_size_m=*/tile_size_m, /*tile_size_k=*/tile_size_k, /*lhs=*/lhs,
+      /*rhs=*/rhs, /*result=*/target, b_,
       /*enable_fast_math=*/enable_fast_math,
-      /*optimize_for_size=*/optimize_for_size, b_, config.GetCacheKey(), lhs,
-      rhs, target,
-      [this, config](llvm::Value* lhs, llvm::Value* rhs, llvm::Value* target) {
-        TiledSmallGemmEmitter small_gemm_emitter(config, /*lhs=*/lhs,
-                                                 /*rhs=*/rhs,
-                                                 /*result=*/target, b_);
-        small_gemm_emitter.Emit();
-      });
-
-  return true;
+      /*optimize_for_size=*/optimize_for_size);
 }
 
-bool DotOpEmitter::EmitLlvmIrDotIfProfitable() {
-  if (dot_.shape().dimensions_size() != 2) {
-    return false;
-  }
-
-  PrimitiveType primitive_type = dot_.shape().element_type();
+void DotOpEmitter::EmitTiledLlvmIrGemv() {
+  PrimitiveType primitive_type = dot_info_.result_shape.element_type();
 
-  if (!primitive_util::IsFloatingPointType(primitive_type) &&
-      !primitive_util::IsIntegralType(primitive_type)) {
-    return false;
-  }
+  CHECK(primitive_util::IsFloatingPointType(primitive_type) ||
+        primitive_util::IsIntegralType(primitive_type));
 
   MatMultDims mat_mult_dims = GetMatMultDims();
   bool is_column_major_matrix_vector = false;
@@ -1144,9 +312,7 @@ bool DotOpEmitter::EmitLlvmIrDotIfProfitable() {
     }
   }
 
-  if (!is_column_major_matrix_vector && !is_row_major_matrix_vector) {
-    return EmitSmallGemmIfProfitable(mat_mult_dims);
-  }
+  CHECK(is_column_major_matrix_vector || is_row_major_matrix_vector);
 
   int64 tiling_factor = GetGemvTilingFactor();
   CHECK_GT(tiling_factor, 0);
@@ -1178,44 +344,27 @@ bool DotOpEmitter::EmitLlvmIrDotIfProfitable() {
   if (is_column_major_matrix_vector) {
     VLOG(2) << "Emitting column major matrix-vector multiply with m = " << m
             << " and k = " << k;
-    ColumnMajorMatrixVectorProductEmitter::Config config(
+    EmitColumnMajorGemv(
         /*scalar_type=*/primitive_type,
         /*tile_rows=*/vector_register_element_size, /*tile_cols=*/tiling_factor,
-        /*m=*/m, /*k=*/k, /*has_addend=*/addend_array_ != nullptr);
-
-    KernelSupportLibrary::EmitAndCallOutlinedKernel(
+        /*m=*/m, /*k=*/k, /*lhs=*/lhs_op, /*rhs=*/rhs_op,
+        /*addend=*/addend_array_ ? addend_array_->GetBasePointer() : nullptr,
+        /*result=*/result_op, b_,
         /*enable_fast_math=*/enable_fast_math,
-        /*optimize_for_size=*/optimize_for_size, b_, config.GetCacheKey(),
-        lhs_op, rhs_op,
-        addend_array_ ? addend_array_->GetBasePointer() : nullptr, result_op,
-        [this, config](llvm::Value* lhs_op, llvm::Value* rhs_op,
-                       llvm::Value* addend_op, llvm::Value* result_op) {
-          ColumnMajorMatrixVectorProductEmitter emitter(
-              config, lhs_op, rhs_op, addend_op, result_op, b_);
-          emitter.Emit();
-        });
+        /*optimize_for_size=*/optimize_for_size);
   } else {
     VLOG(2) << "Emitting row major matrix-vector multiply with m = " << m
             << " and k = " << k;
-    RowMajorMatrixVectorProductEmitter::Config config(
+    EmitRowMajorGemv(
         /*scalar_type=*/primitive_type,
-        /*tile_rows=*/tiling_factor, /*tile_cols=*/vector_register_element_size,
-        /*m=*/m, /*k=*/k, /*has_addend=*/addend_array_ != nullptr);
-
-    KernelSupportLibrary::EmitAndCallOutlinedKernel(
+        /*tile_rows=*/tiling_factor,
+        /*tile_cols=*/vector_register_element_size,
+        /*m=*/m, /*k=*/k, /*lhs=*/lhs_op, /*rhs=*/rhs_op,
+        /*addend=*/addend_array_ ? addend_array_->GetBasePointer() : nullptr,
+        /*result=*/result_op, b_,
         /*enable_fast_math=*/enable_fast_math,
-        /*optimize_for_size=*/optimize_for_size, b_, config.GetCacheKey(),
-        lhs_op, rhs_op,
-        addend_array_ ? addend_array_->GetBasePointer() : nullptr, result_op,
-        [this, config](llvm::Value* lhs_op, llvm::Value* rhs_op,
-                       llvm::Value* addend_op, llvm::Value* result_op) {
-          RowMajorMatrixVectorProductEmitter emitter(config, lhs_op, rhs_op,
-                                                     addend_op, result_op, b_);
-          emitter.Emit();
-        });
+        /*optimize_for_size=*/optimize_for_size);
   }
-
-  return true;
 }
 
 Status DotOpEmitter::Emit() {
@@ -1241,11 +390,6 @@ Status DotOpEmitter::Emit() {
   // which performs the sum-of-products (the reduction loop) before storing
   // the result in the output buffer.
 
-  // This routine assumes that the dot operation is not in a parallelized
-  // enclosing computation.
-  CHECK(
-      dot_.parent()->root_instruction()->outer_dimension_partitions().empty());
-
   const Shape& lhs_shape = lhs_array_.GetShape();
   const Shape& rhs_shape = rhs_array_.GetShape();
 
@@ -1256,27 +400,41 @@ Status DotOpEmitter::Emit() {
     return EmitScalarDot();
   }
 
-  if (EmitLlvmIrDotIfProfitable()) {
-    return Status::OK();
+  switch (GetDotImplementationStrategy(hlo_module_config_, dot_info_,
+                                       target_machine_features_)) {
+    case DotImplementationStrategy::kNaiveLlvmIr:
+      EmitNaiveLlvmIrGemm();
+      return Status::OK();
+
+    case DotImplementationStrategy::kTiledLlvmIrGemv:
+      EmitTiledLlvmIrGemv();
+      return Status::OK();
+
+    case DotImplementationStrategy::kTiledLlvmIrGemm:
+      EmitTiledLlvmIrGemm();
+      return Status::OK();
+
+    case DotImplementationStrategy::kEigen:
+      return EmitCallToRuntime();
   }
+}
 
+void DotOpEmitter::EmitNaiveLlvmIrGemm() {
   CHECK_EQ(addend_array_, nullptr);
 
-  if (PotentiallyImplementedAsEigenDot(dot_, target_machine_features_)) {
-    return EmitCallToRuntime();
-  }
+  const Shape& lhs_shape = lhs_array_.GetShape();
+  const Shape& rhs_shape = rhs_array_.GetShape();
+  const DotDimensionNumbers& dim_nums = dot_info_.dim_nums;
 
   // Reduce along dimension 0 of the LHS and 1 of the RHS. Vectors are a special
   // case where the reduction dimension is 0 for both LHS and RHS. This results
   // in a vector dot product producing a scalar.
-  int64 lhs_reduction_dimension =
-      dot_.dot_dimension_numbers().lhs_contracting_dimensions(0);
-  int64 rhs_reduction_dimension =
-      dot_.dot_dimension_numbers().rhs_contracting_dimensions(0);
+  int64 lhs_reduction_dimension = dim_nums.lhs_contracting_dimensions(0);
+  int64 rhs_reduction_dimension = dim_nums.rhs_contracting_dimensions(0);
 
   // Verify the reduction dimension in the two operands are the same size.
-  TF_RET_CHECK(lhs_shape.dimensions(lhs_reduction_dimension) ==
-               rhs_shape.dimensions(rhs_reduction_dimension));
+  CHECK_EQ(lhs_shape.dimensions(lhs_reduction_dimension),
+           rhs_shape.dimensions(rhs_reduction_dimension));
 
   bool lhs_reduction_along_minor_dimension =
       lhs_reduction_dimension == LayoutUtil::Minor(lhs_shape.layout(), 0);
@@ -1286,7 +444,7 @@ Status DotOpEmitter::Emit() {
   // Create loop nests which loop through the LHS operand dimensions and the RHS
   // operand dimensions. The reduction dimension of the LHS and RHS are handled
   // in a separate innermost loop which performs the sum of products.
-  llvm_ir::ForLoopNest loop_nest(llvm_ir::IrName(&dot_), b_);
+  llvm_ir::ForLoopNest loop_nest(llvm_ir::IrName(dot_hlo_name_), b_);
   llvm_ir::IrArray::Index lhs_index = loop_nest.EmitOperandArrayLoopNest(
       lhs_array_, /*dimension_to_skip=*/lhs_reduction_dimension, "lhs");
   llvm_ir::IrArray::Index rhs_index = loop_nest.EmitOperandArrayLoopNest(
@@ -1391,8 +549,6 @@ Status DotOpEmitter::Emit() {
   // Set the IR builder insert point to the exit basic block of the outer most
   // loop.
   b_->SetInsertPoint(loop_nest.GetOuterLoopExitBasicBlock());
-
-  return Status::OK();
 }
 
 Status DotOpEmitter::EmitScalarDot() {
@@ -1406,16 +562,20 @@ Status DotOpEmitter::EmitScalarDot() {
   llvm::Value* rhs_value =
       rhs_array_.EmitReadArrayElement(/*index=*/element_index, b_);
   if (ShapeUtil::ElementIsComplex(lhs_array_.GetShape())) {
-#define REAL(x) b_->CreateExtractValue(x, {0})
-#define IMAG(x) b_->CreateExtractValue(x, {1})
-    llvm::Value* real =
-        b_->CreateFSub(b_->CreateFMul(REAL(lhs_value), REAL(rhs_value)),
-                       b_->CreateFMul(IMAG(lhs_value), IMAG(rhs_value)));
-    llvm::Value* imag =
-        b_->CreateFAdd(b_->CreateFMul(REAL(lhs_value), IMAG(rhs_value)),
-                       b_->CreateFMul(IMAG(lhs_value), REAL(rhs_value)));
-#undef IMAG
-#undef REAL
+    auto get_real = [&](llvm::Value* x) {
+      return b_->CreateExtractValue(x, {0});
+    };
+
+    auto get_imag = [&](llvm::Value* x) {
+      return b_->CreateExtractValue(x, {1});
+    };
+
+    llvm::Value* real = b_->CreateFSub(
+        b_->CreateFMul(get_real(lhs_value), get_real(rhs_value)),
+        b_->CreateFMul(get_imag(lhs_value), get_imag(rhs_value)));
+    llvm::Value* imag = b_->CreateFAdd(
+        b_->CreateFMul(get_real(lhs_value), get_imag(rhs_value)),
+        b_->CreateFMul(get_imag(lhs_value), get_real(rhs_value)));
     result = llvm::ConstantAggregateZero::get(lhs_array_.GetElementLlvmType());
     result = b_->CreateInsertValue(result, real, {0});
     result = b_->CreateInsertValue(result, imag, {1});
@@ -1435,7 +595,7 @@ Status DotOpEmitter::EmitCallToRuntime() {
   // The two transpose_... parameters are actually booleans, but we use int32
   // to avoid target-dependent calling convention details.
 
-  bool multi_threaded = ShouldUseMultiThreadedEigen();
+  bool multi_threaded = ShouldUseMultiThreadedEigen(hlo_module_config_);
   bool use_mkl_dnn = hlo_module_config_.debug_options().xla_cpu_use_mkl_dnn();
   PrimitiveType type = target_array_.GetShape().element_type();
   llvm::Type* float_type;
@@ -1483,11 +643,13 @@ Status DotOpEmitter::EmitCallToRuntime() {
   llvm::Function* function = b_->GetInsertBlock()->getParent();
   llvm::Module* module = function->getParent();
 
-  llvm::Function* matmul_func = llvm::cast<llvm::Function>(
-      module->getOrInsertFunction(fn_name, matmul_type));
-  matmul_func->setCallingConv(llvm::CallingConv::C);
-  matmul_func->setDoesNotThrow();
-  matmul_func->setOnlyAccessesArgMemory();
+  llvm::FunctionCallee matmul_func =
+      module->getOrInsertFunction(fn_name, matmul_type);
+  if (auto* fn = llvm::dyn_cast<llvm::Function>(matmul_func.getCallee())) {
+    fn->setCallingConv(llvm::CallingConv::C);
+    fn->setDoesNotThrow();
+    fn->setOnlyAccessesArgMemory();
+  }
 
   // The Eigen runtime function expects column-major layout. If the matrices are
   // row major, then use the following identity to compute the product:
@@ -1528,11 +690,11 @@ Status DotOpEmitter::EmitCallToRuntime() {
 }
 
 DotOpEmitter::MatMultDims DotOpEmitter::GetMatMultDims() const {
-  CHECK_EQ(dot_.shape().dimensions_size(), 2);
+  CHECK_EQ(dot_info_.result_shape.dimensions_size(), 2);
 
   const Shape& lhs_shape = lhs_array_.GetShape();
   const Shape& rhs_shape = rhs_array_.GetShape();
-  const DotDimensionNumbers& dim_nums = dot_.dot_dimension_numbers();
+  const DotDimensionNumbers& dim_nums = dot_info_.dim_nums;
 
   return {
       /*m=*/lhs_shape.dimensions(1 - dim_nums.lhs_contracting_dimensions(0)),
@@ -1546,74 +708,6 @@ DotOpEmitter::MatMultDims DotOpEmitter::GetMatMultDims() const {
       LayoutUtil::Minor(target_array_.GetShape().layout(), 0) == 0};
 }
 
-// Return whether the given shape is rank 2.
-static bool IsRank2(const Shape& shape) { return ShapeUtil::Rank(shape) == 2; }
-
-// In a gemm operation where output = lhs * rhs, check whether the given shapes
-// are valid for the operation.
-static bool AreValidGemmShapes(
-    const Shape& lhs_shape, const Shape& rhs_shape, const Shape& output_shape,
-    const TargetMachineFeatures& target_machine_features) {
-  // The inputs and the output must
-  // 1) be matrices with no padding, and
-  // 2) have an allowed element type.
-  PrimitiveType output_primitive_type = output_shape.element_type();
-  if (!(output_primitive_type == F64 || output_primitive_type == F32 ||
-        output_primitive_type == F16)) {
-    return false;
-  }
-
-  if (!(IsRank2(lhs_shape) && IsRank2(rhs_shape) && IsRank2(output_shape))) {
-    return false;
-  }
-
-  auto is_aligned = [&](const Shape& shape) {
-    return GetMinimumAlignmentForArray(shape, target_machine_features) >=
-           TargetMachineFeatures::kEigenExpectedTensorAlignment;
-  };
-
-  if (!is_aligned(lhs_shape) || !is_aligned(rhs_shape) ||
-      !is_aligned(output_shape)) {
-    return false;
-  }
-
-  return true;
-}
-
-bool PotentiallyImplementedAsEigenDot(
-    const HloInstruction& hlo,
-    const TargetMachineFeatures& target_machine_features) {
-  // For certain types of Dot, we can call Eigen
-  if (hlo.opcode() == HloOpcode::kDot) {
-    const Shape& lhs_shape = hlo.operand(0)->shape();
-    const Shape& rhs_shape = hlo.operand(1)->shape();
-
-    if (ShapeUtil::IsZeroElementArray(lhs_shape) ||
-        ShapeUtil::IsZeroElementArray(rhs_shape)) {
-      return false;
-    }
-
-    if (ProfitableToImplementDotInTiledLlvmIr(hlo)) {
-      return false;
-    }
-
-    // If gemm can accept the operand shapes, use it rather than a custom
-    // kernel.
-    if (AreValidGemmShapes(lhs_shape, rhs_shape, hlo.shape(),
-                           target_machine_features)) {
-      const DotDimensionNumbers& dim_numbers = hlo.dot_dimension_numbers();
-      // The size of the reduction dimension should match. The shape inference
-      // guarantees this invariant, so the check here is for programming
-      // errors.
-      CHECK_EQ(lhs_shape.dimensions(dim_numbers.lhs_contracting_dimensions(0)),
-               rhs_shape.dimensions(dim_numbers.rhs_contracting_dimensions(0)));
-      return true;
-    }
-  }
-
-  return false;
-}
-
 // For vector-matrix dot products, it is always profitable to make the Rhs
 // column major.
 absl::optional<int64> ProfitableToMakeDotOperandColumnMajor(
@@ -1652,16 +746,319 @@ absl::optional<int64> ProfitableToMakeDotOperandColumnMajor(
   return {};
 }
 
-bool ProfitableToImplementDotInTiledLlvmIr(const HloInstruction& dot) {
+namespace {
+// Return whether the given shape is rank 2.
+bool IsRank2(const Shape& shape) { return shape.rank() == 2; }
+
+bool IsSimpleLayout(const Layout& layout) {
+  return layout.tiles().empty() && layout.format() == DENSE;
+}
+
+// In a gemm operation where output = lhs * rhs, check whether the given shapes
+// are valid for the operation.
+bool AreGemmShapes(const Shape& lhs_shape, const Shape& rhs_shape,
+                   const Shape& output_shape,
+                   const TargetMachineFeatures& target_machine_features) {
+  CHECK(!lhs_shape.has_layout() || IsSimpleLayout(lhs_shape.layout()))
+      << lhs_shape.DebugString();
+  CHECK(!rhs_shape.has_layout() || IsSimpleLayout(rhs_shape.layout()))
+      << rhs_shape.DebugString();
+  CHECK(!output_shape.has_layout() || IsSimpleLayout(output_shape.layout()))
+      << output_shape.DebugString();
+
+  switch (output_shape.element_type()) {
+    case F64:
+    case F32:
+    case F16:
+      return IsRank2(lhs_shape) && IsRank2(rhs_shape) && IsRank2(output_shape);
+    default:
+      return false;
+  }
+}
+
+bool IsAlignedGemm(const DotInfo& dot_info,
+                   const TargetMachineFeatures& target_machine_features) {
+  if (ShapeUtil::IsZeroElementArray(dot_info.lhs_shape) ||
+      ShapeUtil::IsZeroElementArray(dot_info.rhs_shape)) {
+    return false;
+  }
+
+  return AreGemmShapes(dot_info.lhs_shape, dot_info.rhs_shape,
+                       dot_info.result_shape, target_machine_features);
+}
+
+bool CanEmitTiledLlvmIrGemm(
+    const HloModuleConfig& config, const DotInfo& dot_info,
+    const TargetMachineFeatures& target_machine_features) {
+  CHECK(IsAlignedGemm(dot_info, target_machine_features));
+
+  if (ShouldUseMultiThreadedEigen(config)) {
+    return false;
+  }
+
+  int m = dot_info.result_shape.dimensions(0);
+  int k = dot_info.lhs_shape.dimensions(
+      dot_info.dim_nums.lhs_contracting_dimensions(0));
+  int n = dot_info.result_shape.dimensions(1);
+
+  if (!options::ForceEnableExperimentalLlvmIrGemm(config)) {
+    // TODO(sanjoy):  We should make these numbers micro-arch specific.
+    bool small_gemm =
+        k <= 128 && ((m <= 32 && n <= 128) || (m <= 128 && n <= 32));
+    if (!small_gemm) {
+      return false;
+    }
+  }
+
+  bool lhs_non_canonical = dot_info.dim_nums.lhs_contracting_dimensions(0) == 0;
+  bool rhs_non_canonical = dot_info.dim_nums.rhs_contracting_dimensions(0) == 1;
+
+  if (lhs_non_canonical || rhs_non_canonical) {
+    return false;
+  }
+
+  if (dot_info.result_shape.element_type() == F16) {
+    // TODO(sanjoy): This is probably easy to fix, but I want to keep the CL
+    // adding this comment NFC.
+    return false;
+  }
+
+  return true;
+}
+
+DotImplementationStrategy GetDotImplementationStrategy(
+    const HloModuleConfig& config, const DotInfo& dot_info,
+    const TargetMachineFeatures& target_machine_features) {
+  PrimitiveType element_type = dot_info.result_shape.element_type();
   // Any Matrix-Vector product of floating point or integral type, or
   // a transpose-dot fusion of the same can be lowered to a tiled LLVM
   // IR implementation.
-  const Shape& shape = dot.shape();
-  return shape.dimensions_size() == 2 &&
-         (shape.dimensions(0) == 1 || shape.dimensions(1) == 1) &&
-         (primitive_util::IsFloatingPointType(shape.element_type()) ||
-          primitive_util::IsIntegralType(shape.element_type()));
+  if (dot_info.result_shape.dimensions_size() == 2 &&
+      (dot_info.result_shape.dimensions(0) == 1 ||
+       dot_info.result_shape.dimensions(1) == 1) &&
+      (primitive_util::IsFloatingPointType(element_type) ||
+       primitive_util::IsIntegralType(element_type))) {
+    return DotImplementationStrategy::kTiledLlvmIrGemv;
+  }
+
+  if (IsAlignedGemm(dot_info, target_machine_features)) {
+    return CanEmitTiledLlvmIrGemm(config, dot_info, target_machine_features)
+               ? DotImplementationStrategy::kTiledLlvmIrGemm
+               : DotImplementationStrategy::kEigen;
+  }
+
+  return DotImplementationStrategy::kNaiveLlvmIr;
+}
+
+Status EmitNonBatchDotOperation(
+    DotInfo dot_info, string hlo_name, const llvm_ir::IrArray& target_array,
+    const llvm_ir::IrArray& lhs_array, const llvm_ir::IrArray& rhs_array,
+    const llvm_ir::IrArray* addend_array,
+    llvm::Value* executable_run_options_value, llvm::IRBuilder<>* b,
+    const HloModuleConfig& hlo_module_config,
+    const TargetMachineFeatures& target_machine_features) {
+  PrimitiveType type = target_array.GetShape().element_type();
+  TF_RET_CHECK(F16 == type || F32 == type || F64 == type || C64 == type ||
+               C128 == type);
+  DotOpEmitter dot_emitter(std::move(dot_info), std::move(hlo_name),
+                           target_array, lhs_array, rhs_array, addend_array,
+                           executable_run_options_value, b, hlo_module_config,
+                           target_machine_features);
+  return dot_emitter.Emit();
+}
+
+Shape DropFirstDim(const Shape& shape) {
+  absl::Span<int64 const> array_shape_dims(shape.dimensions());
+  array_shape_dims.remove_prefix(1);
+  return ShapeUtil::MakeShapeWithDescendingLayout(shape.element_type(),
+                                                  array_shape_dims);
+}
+
+Shape CollapseFirstNDims(const Shape& shape, int64 n) {
+  absl::Span<int64 const> input_shape_dims(shape.dimensions());
+  int64 prefix_dim =
+      std::accumulate(input_shape_dims.begin(), input_shape_dims.begin() + n,
+                      1ll, std::multiplies<int64>());
+  DimensionVector result_dims;
+  result_dims.push_back(prefix_dim);
+  std::copy(input_shape_dims.begin() + n, input_shape_dims.end(),
+            std::back_inserter(result_dims));
+  return ShapeUtil::MakeShapeWithDescendingLayout(shape.element_type(),
+                                                  result_dims);
+}
+
+llvm_ir::IrArray CollapseFirstNDims(llvm::IRBuilder<>* b,
+                                    const llvm_ir::IrArray& array, int64 n) {
+  llvm::Module* module = b->GetInsertBlock()->getParent()->getParent();
+  const Shape& shape = array.GetShape();
+  CHECK(shape.has_layout() &&
+        LayoutUtil::IsMonotonicWithDim0Major(shape.layout()));
+  CHECK_GE(shape.dimensions_size(), n);
+  Shape new_shape = CollapseFirstNDims(shape, n);
+  llvm::Value* new_value = b->CreateBitCast(
+      array.GetBasePointer(),
+      llvm_ir::ShapeToIrType(new_shape, module)->getPointerTo());
+  return llvm_ir::IrArray(new_value, std::move(new_shape));
+}
+
+Status ValidateDotDimensionNumbers(const DotDimensionNumbers& dim_numbers) {
+  // Checks some invariants that do not hold in general, but DotDecomposer
+  // should have established for us.  This is just a debugging aid.
+  TF_RET_CHECK(dim_numbers.lhs_contracting_dimensions_size() == 1);
+  std::vector<int64> batch_dim_numbers(dim_numbers.lhs_batch_dimensions_size());
+  absl::c_iota(batch_dim_numbers, 0);
+  TF_RET_CHECK(
+      absl::c_equal(batch_dim_numbers, dim_numbers.lhs_batch_dimensions()));
+  TF_RET_CHECK(
+      absl::c_equal(batch_dim_numbers, dim_numbers.rhs_batch_dimensions()));
+  return Status::OK();
+}
+
+// Slice out the inner array at batch index `batch_index` from `outer_array`.
+llvm_ir::IrArray SliceOutInnerArray(llvm_ir::IrArray outer_array,
+                                    llvm::Value* batch_index,
+                                    llvm::IRBuilder<>* b) {
+  llvm::Module* module = b->GetInsertBlock()->getParent()->getParent();
+
+  Shape inner_shape = DropFirstDim(outer_array.GetShape());
+  llvm_ir::IrArray::Index slice_index(b->getInt64Ty());
+  slice_index.push_back(batch_index);
+  slice_index.InsertAt(
+      /*index=*/1, outer_array.GetShape().dimensions_size() - 1,
+      b->getInt64(0));
+  llvm::Value* slice_ptr = outer_array.EmitArrayElementAddress(slice_index, b);
+  llvm::Type* slice_ptr_type =
+      llvm_ir::ShapeToIrType(inner_shape, module)->getPointerTo();
+  return llvm_ir::IrArray(b->CreateBitCast(slice_ptr, slice_ptr_type),
+                          std::move(inner_shape));
+}
+
+Status EmitBatchDotOperation(
+    const HloInstruction& dot, const llvm_ir::IrArray& target_array,
+    const llvm_ir::IrArray& lhs_array, const llvm_ir::IrArray& rhs_array,
+    llvm::Value* executable_run_options_value, llvm::IRBuilder<>* b,
+    const HloModuleConfig& hlo_module_config,
+    const TargetMachineFeatures& target_machine_features) {
+  TF_RETURN_IF_ERROR(ValidateDotDimensionNumbers(dot.dot_dimension_numbers()));
+
+  // Lower a batch dot into a sequence of non-batch dot operations.
+
+  int64 num_batch_dims =
+      dot.dot_dimension_numbers().lhs_batch_dimensions_size();
+
+  // First reshape the inputs to make sure we only have one batch dimension.
+  // This is a no-op bitcast because the operands have to be in row-major layout
+  // (enforced in CpuLayoutAssignment), and the batch dimensions are the leading
+  // dimensions (established by DotDecomposer and checked by
+  // ValidateDotDimensionNumbers above).
+  llvm_ir::IrArray lhs_array_reshaped =
+      CollapseFirstNDims(b, lhs_array, num_batch_dims);
+  llvm_ir::IrArray rhs_array_reshaped =
+      CollapseFirstNDims(b, rhs_array, num_batch_dims);
+  llvm_ir::IrArray target_array_reshaped =
+      CollapseFirstNDims(b, target_array, num_batch_dims);
+
+  int64 batch_count = lhs_array_reshaped.GetShape().dimensions(0);
+
+  KernelSupportLibrary ksl(b);
+
+  return ksl.ForWithStatus(
+      llvm_ir::IrName(&dot, "bdot"), /*start=*/0, /*end=*/batch_count,
+      /*step=*/1, [&](llvm::Value* indvar) {
+        DotDimensionNumbers adjusted_dim_numbers = dot.dot_dimension_numbers();
+        adjusted_dim_numbers.clear_lhs_batch_dimensions();
+        adjusted_dim_numbers.clear_rhs_batch_dimensions();
+
+        // Create a DotInfo representing the "inner" non-batch dot operation.
+        DotInfo dot_info;
+        dot_info.lhs_shape = DropFirstDim(lhs_array_reshaped.GetShape());
+        dot_info.rhs_shape = DropFirstDim(rhs_array_reshaped.GetShape());
+        dot_info.result_shape = DropFirstDim(target_array_reshaped.GetShape());
+        dot_info.dim_nums = dot.dot_dimension_numbers();
+        dot_info.dim_nums.clear_lhs_batch_dimensions();
+        dot_info.dim_nums.clear_rhs_batch_dimensions();
+
+        dot_info.dim_nums.set_lhs_contracting_dimensions(
+            0,
+            dot_info.dim_nums.lhs_contracting_dimensions(0) - num_batch_dims);
+        dot_info.dim_nums.set_rhs_contracting_dimensions(
+            0,
+            dot_info.dim_nums.rhs_contracting_dimensions(0) - num_batch_dims);
+
+        llvm_ir::IrArray lhs_slice =
+            SliceOutInnerArray(lhs_array_reshaped, /*batch_index=*/indvar, b);
+        llvm_ir::IrArray rhs_slice =
+            SliceOutInnerArray(rhs_array_reshaped, /*batch_index=*/indvar, b);
+        llvm_ir::IrArray target_slice = SliceOutInnerArray(
+            target_array_reshaped, /*batch_index=*/indvar, b);
+
+        // Emit the inner non-batch dot operation.
+        return EmitNonBatchDotOperation(
+            dot_info, dot.name(), target_slice, lhs_slice, rhs_slice, nullptr,
+            executable_run_options_value, b, hlo_module_config,
+            target_machine_features);
+      });
+}
+
+bool IsBatchDot(const HloInstruction& instr) {
+  if (auto* dot_instr = DynCast<HloDotInstruction>(&instr)) {
+    return dot_instr->dot_dimension_numbers().lhs_batch_dimensions_size() > 0;
+  }
+
+  return false;
+}
+}  // namespace
+
+bool DotImplementationCanHandleTranspose(
+    const HloInstruction& dot_instr,
+    const TargetMachineFeatures& target_machine_features) {
+  DotImplementationStrategy impl_strategy =
+      GetDotImplementationStrategy(dot_instr.parent()->parent()->config(),
+                                   DotInfo(dot_instr), target_machine_features);
+
+  // TODO(sanjoy): This is not quite right, it should be `impl_strategy ==
+  // kEigen || impl_strategy == kTiledLlvmIrGemv || impl_strategy ==
+  // kNaiveLlvmIr` but I'll fix this in a later CL in the interest of keeping
+  // the CL adding this comment NFC.
+  return impl_strategy == DotImplementationStrategy::kTiledLlvmIrGemm ||
+         impl_strategy == DotImplementationStrategy::kEigen;
 }
 
+bool DotOperandsAndResultMustHaveRowMajorLayout(
+    const HloInstruction& dot_instr,
+    const TargetMachineFeatures& target_machine_features) {
+  DotImplementationStrategy impl_strategy =
+      GetDotImplementationStrategy(dot_instr.parent()->parent()->config(),
+                                   DotInfo(dot_instr), target_machine_features);
+
+  return impl_strategy == DotImplementationStrategy::kTiledLlvmIrGemm ||
+         impl_strategy == DotImplementationStrategy::kEigen;
+}
+
+Status EmitDotOperation(const HloInstruction& dot,
+                        const llvm_ir::IrArray& target_array,
+                        const llvm_ir::IrArray& lhs_array,
+                        const llvm_ir::IrArray& rhs_array,
+                        const llvm_ir::IrArray* addend_array,
+                        llvm::Value* executable_run_options_value,
+                        llvm::IRBuilder<>* b,
+                        const HloModuleConfig& hlo_module_config,
+                        const TargetMachineFeatures& target_machine_features) {
+  // This routine assumes that the dot operation is not in a parallelized
+  // enclosing computation.
+  CHECK(dot.parent()->root_instruction()->outer_dimension_partitions().empty());
+
+  if (IsBatchDot(dot)) {
+    TF_RET_CHECK(addend_array == nullptr);
+    return EmitBatchDotOperation(dot, target_array, lhs_array, rhs_array,
+                                 executable_run_options_value, b,
+                                 hlo_module_config, target_machine_features);
+  }
+
+  return EmitNonBatchDotOperation(DotInfo(dot), dot.name(), target_array,
+                                  lhs_array, rhs_array, addend_array,
+                                  executable_run_options_value, b,
+                                  hlo_module_config, target_machine_features);
+}
 }  // namespace cpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
index 4c2041b556aa8bf8fe8fb8e0674c0f4f04f0acae..105bd3005c86d87443b2528eba7b0106ad70590e 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
@@ -30,9 +30,16 @@ limitations under the License.
 
 namespace xla {
 namespace cpu {
+// Returns true if the two operands and the output of `dot_instr` must have row
+// major layout.
+bool DotOperandsAndResultMustHaveRowMajorLayout(
+    const HloInstruction& dot_instr,
+    const TargetMachineFeatures& target_machine_features);
 
-bool PotentiallyImplementedAsEigenDot(
-    const HloInstruction& hlo,
+// Returns true our lowering strategy for `dot_instr` can fold in transposes to
+// the either of the inputs.
+bool DotImplementationCanHandleTranspose(
+    const HloInstruction& dot_instr,
     const TargetMachineFeatures& target_machine_features);
 
 // Returns the index for an operand to `hlo` that should ideally be column
@@ -41,129 +48,24 @@ bool PotentiallyImplementedAsEigenDot(
 absl::optional<int64> ProfitableToMakeDotOperandColumnMajor(
     const HloInstruction& hlo);
 
-// Returns true to indicate that we can generate a tiled LLVM IR implementation
-// for |dot|.
-bool ProfitableToImplementDotInTiledLlvmIr(const HloInstruction& dot);
-
-// Helper class for emitting LLVM IR to perform the dot operation.
-class DotOpEmitter {
- public:
-  // Emit LLVM IR to perform the dot operation on lhs_array and rhs_array and
-  // place the result in target_array. IR is emitted at current insert point of
-  // the builder. Upon completion of the method, the insert point is set to the
-  // end of all instructions emitted for this operation.
-  //
-  // If `addend_array` is not nullptr then it must be an array of the same
-  // dimensions as the result, and the result is computed as `addend_array` +
-  // dot(`lhs_array`, `rhs_array`).  A non-null `addend_array` is only supported
-  // for Matrix-vector products.
-  static Status EmitDotOperation(
-      const HloInstruction& dot, const llvm_ir::IrArray& target_array,
-      const llvm_ir::IrArray& lhs_array, const llvm_ir::IrArray& rhs_array,
-      const llvm_ir::IrArray* addend_array,
-      llvm::Value* executable_run_options_value, llvm::IRBuilder<>* b,
-      const HloModuleConfig& hlo_module_config,
-      const TargetMachineFeatures& target_machine_features);
-
- private:
-  DotOpEmitter(const HloInstruction& dot, const llvm_ir::IrArray& target_array,
-               const llvm_ir::IrArray& lhs_array,
-               const llvm_ir::IrArray& rhs_array,
-               const llvm_ir::IrArray* addend_array,
-               llvm::Value* executable_run_options_value, llvm::IRBuilder<>* b,
-               const HloModuleConfig& hlo_module_config,
-               const TargetMachineFeatures& target_machine_features);
-
-  // Emits the IR to perform the dot operation.
-  Status Emit();
-
-  // Emits instructions to perform a scalar dot product (a multiply of the
-  // LHS and RHS) and store the results in the target.
-  Status EmitScalarDot();
-
-  // Emit an LLVM IR implementation of the dot operation if we can.  Returns
-  // true if an LLVM IR implementation was emitted.
-  bool EmitLlvmIrDotIfProfitable();
-
-  // Emits a call to the CPU runtime to perform the matrix multiply.
-  Status EmitCallToRuntime();
-
-  // Represents the dimensions of a matrix-matrix multiply operation.
-  struct MatMultDims {
-    // The number of rows in the LHS.
-    int64 m;
-
-    // The number of columns in the LHS, which is also must be equal to the
-    // number of rows in the RHS.
-    int64 k;
-
-    // The number of columns on the RHS.
-    int64 n;
-
-    // True if the LHS matrix is column major.
-    bool lhs_column_major;
-
-    // True if the LHS contraction dimension is not 1.
-    bool lhs_non_canonical;
-
-    // True if the RHS matrix is column major.
-    bool rhs_column_major;
-
-    // True if the RHS contraction dimension is not 0.
-    bool rhs_non_canonical;
-
-    // True if the result matrix is column major.
-    bool target_column_major;
-  };
-
-  // Get the MatMultDims instance for the dot product this DotOpEmitter
-  // represents.  Precondition: the dot is of rank 2 (and thus its operands are
-  // of rank 2 as well).
-  MatMultDims GetMatMultDims() const;
-
-  bool EmitSmallGemmIfProfitable(const MatMultDims& mat_mult_dims);
-
-  // When doing a tiled GEMV in LLVM IR, a "tile" consists of this many vector
-  // registers.
-  int64 GetGemvTilingFactor() const {
-    const int64 kDefaultTilingFactor = 8;
-    return options::LlvmIrGemvTilingFactor(hlo_module_config_)
-        .value_or(kDefaultTilingFactor);
-  }
-
-  std::tuple<int64, int64, int64> GetGemmTileSize() const {
-    // Tuned for broadwell - Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz
-    //
-    // TODO(b/80093688): Tune for other architectures and centralize this
-    // information in one place.
-    const std::tuple<int64, int64, int64> kDefaultTileSize =
-        std::tuple<int64, int64, int64>(11, 9, 1);
-    return options::LlvmIrGemmTileSize(hlo_module_config_)
-        .value_or(kDefaultTileSize);
-  }
-
-  // Returns true if we should use an experimental implementation of GEMM
-  // (general matrix matrix multiplication) if possible.
-  bool EnableExperimentalLlvmIrGemm() const {
-    return options::EnableExperimentalLlvmIrGemm(hlo_module_config_);
-  }
-
-  // Returns true if we should call into multi-threaded Eigen routines.
-  bool ShouldUseMultiThreadedEigen() {
-    return hlo_module_config_.debug_options().xla_cpu_multi_thread_eigen();
-  }
-
-  const HloInstruction& dot_;
-  const llvm_ir::IrArray& target_array_;
-  const llvm_ir::IrArray& lhs_array_;
-  const llvm_ir::IrArray& rhs_array_;
-  const llvm_ir::IrArray* addend_array_;
-  llvm::Value* executable_run_options_value_;
-  llvm::IRBuilder<>* b_;
-  const HloModuleConfig& hlo_module_config_;
-  const TargetMachineFeatures& target_machine_features_;
-};
-
+// Emit LLVM IR to perform the dot operation on lhs_array and rhs_array and
+// place the result in target_array. IR is emitted at current insert point of
+// the builder. Upon completion of the method, the insert point is set to the
+// end of all instructions emitted for this operation.
+//
+// If `addend_array` is not nullptr then it must be an array of the same
+// dimensions as the result, and the result is computed as `addend_array` +
+// dot(`lhs_array`, `rhs_array`).  A non-null `addend_array` is only supported
+// for Matrix-vector products.
+Status EmitDotOperation(const HloInstruction& dot,
+                        const llvm_ir::IrArray& target_array,
+                        const llvm_ir::IrArray& lhs_array,
+                        const llvm_ir::IrArray& rhs_array,
+                        const llvm_ir::IrArray* addend_array,
+                        llvm::Value* executable_run_options_value,
+                        llvm::IRBuilder<>* b,
+                        const HloModuleConfig& hlo_module_config,
+                        const TargetMachineFeatures& target_machine_features);
 }  // namespace cpu
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter_internal.h b/tensorflow/compiler/xla/service/cpu/dot_op_emitter_internal.h
new file mode 100644
index 0000000000000000000000000000000000000000..cc28918ed60a8086135846e2b9b1b9d75ec31ef6
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter_internal.h
@@ -0,0 +1,88 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_DOT_OP_EMITTER_INTERNAL_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_DOT_OP_EMITTER_INTERNAL_H_
+
+#include "tensorflow/compiler/xla/service/cpu/target_machine_features.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+
+// -----------------------------------------------------------------------------
+// INTERNAL HEADER.
+//
+// This file exposes internal implementation details from dot_op_emitter.cc for
+// unit tests.  Please do not depend on this!
+//
+// -----------------------------------------------------------------------------
+
+namespace xla {
+namespace cpu {
+namespace internal {
+
+// Represents a dot operation.  We use this in lieu of an `HloInstruction`
+// because we want to be able to create this for the "inner" dot operation in a
+// batch dot, for which there is no separate HLO instruction.
+struct DotInfo {
+  Shape lhs_shape;
+  Shape rhs_shape;
+  Shape result_shape;
+  DotDimensionNumbers dim_nums;
+
+  explicit DotInfo(const HloInstruction& instr) {
+    CHECK_EQ(instr.opcode(), HloOpcode::kDot);
+    lhs_shape = instr.operand(0)->shape();
+    rhs_shape = instr.operand(1)->shape();
+    result_shape = instr.shape();
+    dim_nums = instr.dot_dimension_numbers();
+  }
+};
+
+// Dictates how a dot operation is implemented.
+enum class DotImplementationStrategy {
+  // The dot operation is lowered into LLVM IR that implements a naive nested
+  // loop that computes the result one element at a time.  This is our
+  // "fallback"; we don't really want this to kick in for any non-trival dot
+  // operation.
+  kNaiveLlvmIr,
+
+  // The dot operation is lowered into LLVM IR that implements a tiled
+  // Matrix*Vector operation.  This strategy also allows fusing in a bias add
+  // into the dot.  The matrix can be row major or column major, both are
+  // supported.
+  kTiledLlvmIrGemv,
+
+  // The dot operation is lowered into LLVM IR that implemetns a tiled
+  // Matrix*Matrix operation.  No fusions are supported.  The two inputs
+  // and the output have to be row major.
+  kTiledLlvmIrGemm,
+
+  // The dot operation is lowered into a call into an Eigen routine.  No fusions
+  // are supported today.  The two inputs and the output have to be row major.
+  // However, we do allow transposing either the LHS or the RHS as part of the
+  // GEMM -- we expose this flexibility as flexibility in the contraction
+  // dimensions, but we can also see this as flexibility in the input layouts.
+  kEigen,
+};
+
+// Returns the implementation strategy for a dot with the configuration
+// `dot_info`.
+DotImplementationStrategy GetDotImplementationStrategy(
+    const HloModuleConfig& config, const DotInfo& dot_info,
+    const TargetMachineFeatures& target_machine_features);
+}  // namespace internal
+}  // namespace cpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_DOT_OP_EMITTER_INTERNAL_H_
diff --git a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc
index c8312d80bd5012e5bcb42a410db18a7fa77a2eb6..0028fbaed895becad8da496aa8acdf7dc173a2a0 100644
--- a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc
@@ -51,10 +51,11 @@ StatusOr<llvm::Value*> CpuElementalIrEmitter::EmitAtan2(PrimitiveType prim_type,
       return Unimplemented("atan2");
   }
   // Create a function declaration.
-  llvm::Function* function =
-      llvm::cast<llvm::Function>(module_->getOrInsertFunction(
-          llvm_ir::AsStringRef(function_name), lhs->getType(), lhs->getType(),
-          rhs->getType()));
+  llvm::Function* function = llvm::dyn_cast<llvm::Function>(
+      module_
+          ->getOrInsertFunction(llvm_ir::AsStringRef(function_name),
+                                lhs->getType(), lhs->getType(), rhs->getType())
+          .getCallee());
   function->setCallingConv(llvm::CallingConv::C);
   function->setDoesNotThrow();
   function->setDoesNotAccessMemory();
@@ -85,9 +86,11 @@ StatusOr<llvm::Value*> CpuElementalIrEmitter::EmitTanh(PrimitiveType prim_type,
       return Unimplemented("tanh");
   }
   // Create a function declaration.
-  llvm::Function* function = llvm::cast<llvm::Function>(
-      module_->getOrInsertFunction(llvm_ir::AsStringRef(function_name),
-                                   value->getType(), value->getType()));
+  llvm::Function* function = llvm::dyn_cast<llvm::Function>(
+      module_
+          ->getOrInsertFunction(llvm_ir::AsStringRef(function_name),
+                                value->getType(), value->getType())
+          .getCallee());
   function->setCallingConv(llvm::CallingConv::C);
   function->setDoesNotThrow();
   function->setDoesNotAccessMemory();
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc
index 1a8bedfe6afb4f096ddd4703c312b84d521a7ba5..a8b139aec9e96b6bb580baf74789df7c998cebf8 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc
@@ -26,7 +26,7 @@ namespace cpu {
 
 int64 GetMinimumAlignmentForArray(
     const Shape& shape, const TargetMachineFeatures& target_machine_features) {
-  CHECK(ShapeUtil::IsArray(shape));
+  CHECK(shape.IsArray());
   CHECK(!LayoutUtil::HasLayout(shape) || LayoutUtil::IsDense(shape.layout()));
 
   // We don't require a layout to be set on `shape`.  This only works on CPU
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index 4032c2da2f33ee61da8771ae6225a14172cbe6e8..2418d96440f9994842a54769cf6d561610ccfa18 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -24,11 +24,9 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+// IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc"
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
-#include "tensorflow/core/lib/math/math_util.h"
-#include "tensorflow/core/platform/logging.h"
-// IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
 #include "absl/types/span.h"
@@ -70,6 +68,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/window_util.h"
 #include "tensorflow/core/lib/core/bits.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/math/math_util.h"
+#include "tensorflow/core/platform/logging.h"
 
 namespace xla {
 
@@ -77,7 +77,6 @@ namespace {
 using llvm_ir::AsStringRef;
 using llvm_ir::IrName;
 using llvm_ir::SetToFirstInsertPoint;
-namespace gtl = tensorflow::gtl;
 }  // namespace
 
 namespace cpu {
@@ -87,7 +86,8 @@ IrEmitter::IrEmitter(
     llvm::Module* llvm_module,
     std::unordered_map<const HloInstruction*, int64> instruction_to_profile_idx,
     std::unordered_map<const HloComputation*, int64> computation_to_profile_idx,
-    const TargetMachineFeatures* target_machine_features)
+    const TargetMachineFeatures* target_machine_features,
+    bool emit_code_for_msan)
     : assignment_(assignment),
       module_(llvm_module),
       arch_type_(llvm::Triple(llvm_module->getTargetTriple()).getArch()),
@@ -97,7 +97,8 @@ IrEmitter::IrEmitter(
       alias_analysis_(hlo_module, assignment, &llvm_module->getContext()),
       hlo_module_config_(hlo_module.config()),
       is_top_level_computation_(false),
-      target_machine_features_(*target_machine_features) {
+      target_machine_features_(*target_machine_features),
+      emit_code_for_msan_(emit_code_for_msan) {
   b_.setFastMathFlags(llvm_ir::GetFastMathFlags(
       /*fast_math_enabled=*/hlo_module_config_.debug_options()
           .xla_cpu_enable_fast_math()));
@@ -111,10 +112,9 @@ IrEmitter::IrEmitter(
 StatusOr<llvm::Function*> IrEmitter::EmitComputation(
     HloComputation* computation, const string& function_name_prefix,
     bool is_top_level_computation,
-    const std::vector<HloInstruction*>* instruction_order) {
+    absl::Span<HloInstruction* const> instruction_order) {
   string function_name = name_uniquer_.GetUniqueName(function_name_prefix);
-  VLOG(2) << "Emitting IR for CPU function [" << function_name_prefix
-          << "]; ordered? " << (instruction_order != nullptr);
+  VLOG(2) << "Emitting IR for CPU function [" << function_name_prefix << "]";
   is_top_level_computation_ = is_top_level_computation;
   num_dynamic_loop_bounds_ = 0;
   if (!computation->root_instruction()->outer_dimension_partitions().empty()) {
@@ -141,11 +141,7 @@ StatusOr<llvm::Function*> IrEmitter::EmitComputation(
   bool use_rdtscp = arch_type_ == llvm::Triple::ArchType::x86 ||
                     arch_type_ == llvm::Triple::ArchType::x86_64;
   profiling_state_ = ProfilingState(use_rdtscp);
-  if (instruction_order == nullptr) {
-    TF_RETURN_IF_ERROR(computation->Accept(this));
-  } else {
-    TF_RETURN_IF_ERROR(computation->AcceptOrdered(this, *instruction_order));
-  }
+  TF_RETURN_IF_ERROR(computation->AcceptOrdered(this, instruction_order));
   llvm::Function* ir_function = compute_function_->function();
   InsertOrDie(&emitted_functions_, computation, ir_function);
   // Delete 'compute_function', finalizing 'ir_function' and restoring caller
@@ -228,11 +224,11 @@ Status IrEmitter::HandleConstant(HloInstruction* constant) {
 }
 
 Status IrEmitter::HandleCopy(HloInstruction* copy) {
-  if (ShapeUtil::IsTuple(copy->shape())) {
+  if (copy->shape().IsTuple()) {
     // kCopy shallow copies a tuple so just memcpy the top-level buffer.
     TF_RETURN_IF_ERROR(EmitTargetAddressForOp(copy));
     return EmitMemcpy(*(copy->operand(0)), *copy);
-  } else if (ShapeUtil::IsArray(copy->shape())) {
+  } else if (copy->shape().IsArray()) {
     // Use the elemental emitter for array shapes.
     return DefaultAction(copy);
   }
@@ -244,10 +240,12 @@ Status IrEmitter::HandleCopy(HloInstruction* copy) {
 int IrEmitter::MinimumAlignmentForPrimitiveType(PrimitiveType primitive_type) {
   int64 byte_size = ShapeUtil::ByteSizeOfPrimitiveType(primitive_type);
   DCHECK_GE(byte_size, 0);
-  // Largest scalar is a complex64 so we don't need to worry about the
+  // Largest scalar is a complex128 so we don't need to worry about the
   // int64->int truncation here.
-  DCHECK_LE(byte_size, 8);
-  return byte_size;
+  DCHECK_LE(byte_size, 16);
+
+  // Allocations may be 8-byte aligned if part of a small block.
+  return std::min(8LL, byte_size);
 }
 
 int64 IrEmitter::ByteSizeOf(const Shape& shape) const {
@@ -321,7 +319,7 @@ Status IrEmitter::HandleTupleSelect(HloInstruction* tuple_select) {
   auto on_false = tuple_select->operand(2);
   TF_RET_CHECK(pred->shape().element_type() == PRED);
   TF_RET_CHECK(ShapeUtil::IsScalar(pred->shape()));
-  TF_RET_CHECK(ShapeUtil::IsTuple(tuple_select->shape()));
+  TF_RET_CHECK(tuple_select->shape().IsTuple());
   TF_RETURN_IF_ERROR(EmitTargetAddressForOp(tuple_select));
   llvm_ir::EmitTupleSelect(GetIrArrayFor(tuple_select), GetIrArrayFor(pred),
                            GetEmittedValueFor(on_true),
@@ -351,7 +349,7 @@ Status IrEmitter::HandleInfeed(HloInstruction* instruction) {
   llvm_ir::EmitTuple(GetIrArrayFor(infeed), {data_address, token_address}, &b_,
                      module_);
 
-  if (ShapeUtil::IsTuple(data_shape)) {
+  if (data_shape.IsTuple()) {
     TF_RET_CHECK(!ShapeUtil::IsNestedTuple(data_shape));
 
     // For a tuple, we first copy each of the internal elements to
@@ -415,11 +413,18 @@ Status IrEmitter::EmitXfeedTransfer(XfeedKind kind, const Shape& shape,
 
   llvm::Function* acquire_func;
   if (kind == XfeedKind::kInfeed) {
-    acquire_func = llvm::cast<llvm::Function>(module_->getOrInsertFunction(
-        runtime::kAcquireInfeedBufferForDequeueSymbolName, acquire_type));
+    acquire_func = llvm::dyn_cast<llvm::Function>(
+        module_
+            ->getOrInsertFunction(
+                runtime::kAcquireInfeedBufferForDequeueSymbolName, acquire_type)
+            .getCallee());
   } else {
-    acquire_func = llvm::cast<llvm::Function>(module_->getOrInsertFunction(
-        runtime::kAcquireOutfeedBufferForPopulationSymbolName, acquire_type));
+    acquire_func = llvm::dyn_cast<llvm::Function>(
+        module_
+            ->getOrInsertFunction(
+                runtime::kAcquireOutfeedBufferForPopulationSymbolName,
+                acquire_type)
+            .getCallee());
   }
   acquire_func->setCallingConv(llvm::CallingConv::C);
 
@@ -432,11 +437,19 @@ Status IrEmitter::EmitXfeedTransfer(XfeedKind kind, const Shape& shape,
 
   llvm::Function* release_func;
   if (kind == XfeedKind::kInfeed) {
-    release_func = llvm::cast<llvm::Function>(module_->getOrInsertFunction(
-        runtime::kReleaseInfeedBufferAfterDequeueSymbolName, release_type));
+    release_func = llvm::dyn_cast<llvm::Function>(
+        module_
+            ->getOrInsertFunction(
+                runtime::kReleaseInfeedBufferAfterDequeueSymbolName,
+                release_type)
+            .getCallee());
   } else {
-    release_func = llvm::cast<llvm::Function>(module_->getOrInsertFunction(
-        runtime::kReleaseOutfeedBufferAfterPopulationSymbolName, release_type));
+    release_func = llvm::dyn_cast<llvm::Function>(
+        module_
+            ->getOrInsertFunction(
+                runtime::kReleaseOutfeedBufferAfterPopulationSymbolName,
+                release_type)
+            .getCallee());
   }
   release_func->setCallingConv(llvm::CallingConv::C);
 
@@ -475,7 +488,7 @@ Status IrEmitter::HandleOutfeed(HloInstruction* outfeed) {
   const Shape& operand_shape = operand->shape();
 
   llvm::Value* value = GetEmittedValueFor(operand);
-  if (!ShapeUtil::IsTuple(operand_shape)) {
+  if (!operand_shape.IsTuple()) {
     return EmitXfeedTransfer(XfeedKind::kOutfeed, operand_shape, value);
   }
 
@@ -498,6 +511,27 @@ Status IrEmitter::HandleSort(HloInstruction* hlo) {
   const HloSortInstruction* sort = Cast<HloSortInstruction>(hlo);
   TF_RETURN_IF_ERROR(EmitTargetAddressForOp(sort));
   Shape keys_shape = sort->keys()->shape();
+  PrimitiveType keys_type = keys_shape.element_type();
+  switch (keys_type) {
+    case PRED:
+    case S8:
+    case U8:
+    case S16:
+    case U16:
+    case BF16:
+    case F16:
+    case S32:
+    case U32:
+    case F32:
+    case S64:
+    case U64:
+    case F64:
+      break;
+    default:
+      return Unimplemented(
+          "Element type %s not supported in the Sort op on CPU.",
+          PrimitiveType_Name(keys_type));
+  }
   std::vector<llvm::Value*> destination_addresses(sort->operand_count());
   for (int64 i = 0; i < sort->operand_count(); ++i) {
     ShapeIndex shape_index =
@@ -540,110 +574,52 @@ Status IrEmitter::HandleSort(HloInstruction* hlo) {
     higher_dimensions *= normalized_keys_shape.dimensions(i);
   }
   int64 lower_dimensions = 1;
-  for (int64 i = ShapeUtil::Rank(normalized_keys_shape) - 1;
+  for (int64 i = normalized_keys_shape.rank() - 1;
        i > physical_dimension_to_sort; --i) {
     lower_dimensions *= normalized_keys_shape.dimensions(i);
   }
 
-  PrimitiveType keys_type = keys_shape.element_type();
-  const char* fn_name = nullptr;
-  llvm::Type* keys_native_type = nullptr;
-  switch (keys_type) {
-    case PRED:
-      fn_name = runtime::kKeyValueSortPREDSymbolName;
-      keys_native_type = b_.getInt8PtrTy();
-      break;
-    case S8:
-      fn_name = runtime::kKeyValueSortS8SymbolName;
-      keys_native_type = b_.getInt8PtrTy();
-      break;
-    case U8:
-      fn_name = runtime::kKeyValueSortU8SymbolName;
-      keys_native_type = b_.getInt8PtrTy();
-      break;
-    case S16:
-      fn_name = runtime::kKeyValueSortS16SymbolName;
-      keys_native_type = b_.getInt16Ty()->getPointerTo();
-      break;
-    case U16:
-      fn_name = runtime::kKeyValueSortU16SymbolName;
-      keys_native_type = b_.getInt16Ty()->getPointerTo();
-      break;
-    case F16:
-      fn_name = runtime::kKeyValueSortF16SymbolName;
-      keys_native_type = b_.getHalfTy()->getPointerTo();
-      break;
-    case S32:
-      fn_name = runtime::kKeyValueSortS32SymbolName;
-      keys_native_type = b_.getInt32Ty()->getPointerTo();
-      break;
-    case U32:
-      fn_name = runtime::kKeyValueSortU32SymbolName;
-      keys_native_type = b_.getInt32Ty()->getPointerTo();
-      break;
-    case F32:
-      fn_name = runtime::kKeyValueSortF32SymbolName;
-      keys_native_type = b_.getFloatTy()->getPointerTo();
-      break;
-    case S64:
-      fn_name = runtime::kKeyValueSortS64SymbolName;
-      keys_native_type = b_.getInt64Ty()->getPointerTo();
-      break;
-    case U64:
-      fn_name = runtime::kKeyValueSortU64SymbolName;
-      keys_native_type = b_.getInt64Ty()->getPointerTo();
-      break;
-    case F64:
-      fn_name = runtime::kKeyValueSortF64SymbolName;
-      keys_native_type = b_.getDoubleTy()->getPointerTo();
-      break;
-    default:
-      return Unimplemented(
-          "Element type %s not supported in the Sort op on CPU.",
-          PrimitiveType_Name(keys_type));
-  }
-
+  auto less_than_function = FindOrDie(emitted_functions_, sort->to_apply());
+  CHECK(absl::c_binary_search(thread_local_computations_, sort->to_apply()));
   llvm::FunctionType* key_value_sort_type = llvm::FunctionType::get(
       b_.getVoidTy(),
-      {keys_native_type, b_.getInt64Ty(), b_.getInt64Ty(), b_.getInt64Ty(),
+      {b_.getInt64Ty(), b_.getInt64Ty(), b_.getInt64Ty(),
        b_.getInt8PtrTy()->getPointerTo(), b_.getInt32Ty(),
-       b_.getInt32Ty()->getPointerTo()},
+       b_.getInt32Ty()->getPointerTo(), b_.getInt1Ty(), b_.getInt8PtrTy(),
+       b_.getInt64Ty()->getPointerTo(), less_than_function->getType()},
       /*isVarArg=*/false);
-  auto* key_value_sort_func = llvm::cast<llvm::Function>(
-      module_->getOrInsertFunction(fn_name, key_value_sort_type));
+  auto* key_value_sort_func = llvm::dyn_cast<llvm::Function>(
+      module_
+          ->getOrInsertFunction(runtime::kKeyValueSortSymbolName,
+                                key_value_sort_type)
+          .getCallee());
   key_value_sort_func->setCallingConv(llvm::CallingConv::C);
   key_value_sort_func->setDoesNotThrow();
-  llvm::Value* values;
-  llvm::Value* sizes;
-  if (sort->values_count() == 0) {
-    values = llvm::Constant::getNullValue(b_.getInt8PtrTy()->getPointerTo());
-    sizes = llvm::Constant::getNullValue(b_.getInt32Ty()->getPointerTo());
-  } else {
-    values = llvm_ir::EmitAllocaAtFunctionEntryWithCount(
-        b_.getInt8PtrTy(), b_.getInt32(sort->values_count()),
-        "cc_values_alloca", &b_);
-    sizes = llvm_ir::EmitAllocaAtFunctionEntryWithCount(
-        b_.getInt32Ty(), b_.getInt32(sort->values_count()), "cc_sizes_alloca",
-        &b_);
-    for (int64 i = 0; i < sort->values_count(); ++i) {
-      llvm::Value* value_as_i8ptr =
-          PointerCast(destination_addresses[i + 1], b_.getInt8PtrTy());
-      llvm::Value* slot_in_values_alloca =
-          ConstInBoundsGEP1_32(b_.getInt8PtrTy(), values, i);
-      Store(value_as_i8ptr, slot_in_values_alloca);
-      llvm::Value* slot_in_sizes_alloca =
-          ConstInBoundsGEP1_32(b_.getInt32Ty(), sizes, i);
-      llvm::Value* size = b_.getInt32(ShapeUtil::ByteSizeOfPrimitiveType(
-          sort->operand(i + 1)->shape().element_type()));
-      Store(size, slot_in_sizes_alloca);
-    }
+  llvm::Value* values = llvm_ir::EmitAllocaAtFunctionEntryWithCount(
+      b_.getInt8PtrTy(), b_.getInt32(sort->operand_count()), "cc_values_alloca",
+      &b_);
+  llvm::Value* sizes = llvm_ir::EmitAllocaAtFunctionEntryWithCount(
+      b_.getInt32Ty(), b_.getInt32(sort->operand_count()), "cc_sizes_alloca",
+      &b_);
+  for (int64 i = 0; i < sort->operand_count(); ++i) {
+    llvm::Value* value_as_i8ptr =
+        PointerCast(destination_addresses[i], b_.getInt8PtrTy());
+    llvm::Value* slot_in_values_alloca =
+        ConstInBoundsGEP1_32(b_.getInt8PtrTy(), values, i);
+    Store(value_as_i8ptr, slot_in_values_alloca);
+    llvm::Value* slot_in_sizes_alloca =
+        ConstInBoundsGEP1_32(b_.getInt32Ty(), sizes, i);
+    llvm::Value* size = b_.getInt32(ShapeUtil::ByteSizeOfPrimitiveType(
+        sort->operand(i)->shape().element_type()));
+    Store(size, slot_in_sizes_alloca);
   }
 
   Call(key_value_sort_func,
-       {PointerCast(destination_addresses[0], keys_native_type),
-        b_.getInt64(higher_dimensions), b_.getInt64(sort_dimension_elements),
+       {b_.getInt64(higher_dimensions), b_.getInt64(sort_dimension_elements),
         b_.getInt64(lower_dimensions), values,
-        b_.getInt32(sort->values_count()), sizes});
+        b_.getInt32(sort->operand_count()), sizes,
+        b_.getInt1(sort->is_stable()), GetExecutableRunOptionsArgument(),
+        GetProfileCountersArgument(), less_than_function});
 
   if (sort->values_count() > 0) {
     llvm_ir::EmitTuple(GetIrArrayFor(sort), destination_addresses, &b_,
@@ -752,11 +728,6 @@ StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForReduceWindow(
 }
 
 Status IrEmitter::HandleReduceWindow(HloInstruction* reduce_window) {
-  TF_RETURN_IF_ERROR(ElementTypesSameAndSupported(
-      /*instruction=*/*reduce_window,
-      /*operands=*/{reduce_window->operand(0)},
-      /*supported_types=*/{F32, BF16, S32, F16}));
-
   // Pseudo code for reduce window:
   //
   //   for (coordinates O in the output)
@@ -784,8 +755,8 @@ Status IrEmitter::HandleSelectAndScatter(HloInstruction* select_and_scatter) {
   const auto init_value = select_and_scatter->operand(2);
   const Window& window = select_and_scatter->window();
   PrimitiveType operand_element_type = operand->shape().element_type();
-  const int64 rank = ShapeUtil::Rank(operand->shape());
-  CHECK_EQ(rank, ShapeUtil::Rank(source->shape()));
+  const int64 rank = operand->shape().rank();
+  CHECK_EQ(rank, source->shape().rank());
   CHECK_EQ(rank, window.dimensions_size());
 
   // TODO(b/31410564): Implement dilation for select-and-scatter.
@@ -947,12 +918,8 @@ Status IrEmitter::HandleDot(HloInstruction* dot) {
   auto rhs = dot->operand(1);
   TF_RETURN_IF_ERROR(ElementTypesSameAndSupported(
       /*instruction=*/*dot, /*operands=*/{lhs, rhs},
-      /*supported_types=*/{F16, F32, F64, C64}));
+      /*supported_types=*/{F16, F32, F64, C64, C128}));
   const DotDimensionNumbers& dnums = dot->dot_dimension_numbers();
-  if (dnums.lhs_batch_dimensions_size() > 0 ||
-      dnums.rhs_batch_dimensions_size() > 0) {
-    return Unimplemented("Dot with batch dimensions not implemented.");
-  }
 
   if (dnums.lhs_contracting_dimensions_size() != 1) {
     // This is disallowed by ShapeInference today.
@@ -975,10 +942,10 @@ Status IrEmitter::HandleDot(HloInstruction* dot) {
           << llvm_ir::DumpToString(*target_array.GetBasePointer());
 
   // Dot operation is complicated so we delegate to a helper class.
-  return DotOpEmitter::EmitDotOperation(
-      *dot, target_array, lhs_array, rhs_array, /*addend_array=*/nullptr,
-      GetExecutableRunOptionsArgument(), &b_, hlo_module_config_,
-      target_machine_features_);
+  return EmitDotOperation(*dot, target_array, lhs_array, rhs_array,
+                          /*addend_array=*/nullptr,
+                          GetExecutableRunOptionsArgument(), &b_,
+                          hlo_module_config_, target_machine_features_);
 }
 
 StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForConvolution(
@@ -1123,7 +1090,7 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) {
   auto rhs = convolution->operand(1);
   TF_RETURN_IF_ERROR(ElementTypesSameAndSupported(
       /*instruction=*/*convolution, /*operands=*/{lhs, rhs},
-      /*supported_types=*/{F16, F32, C64}));
+      /*supported_types=*/{F16, F32, C64, C128}));
 
   // TODO(tonywy): Add PotentiallyImplementedAsMKLCovolution to support
   // different data layouts.
@@ -1236,8 +1203,8 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) {
         LOG(WARNING) << "Using Eigen instead of MKL-DNN for single-threaded "
                         "conv2d function.";
       }
-      llvm::Function* conv_func = llvm::cast<llvm::Function>(
-          module_->getOrInsertFunction(fn_name, conv_type));
+      llvm::Function* conv_func = llvm::dyn_cast<llvm::Function>(
+          module_->getOrInsertFunction(fn_name, conv_type).getCallee());
       conv_func->setCallingConv(llvm::CallingConv::C);
       conv_func->setDoesNotThrow();
       conv_func->setOnlyAccessesArgMemory();
@@ -1320,8 +1287,8 @@ Status IrEmitter::HandleFft(HloInstruction* fft) {
                             ? runtime::kEigenFftSymbolName
                             : runtime::kEigenSingleThreadedFftSymbolName;
 
-  llvm::Function* fft_func = llvm::cast<llvm::Function>(
-      module_->getOrInsertFunction(fn_name, fft_type));
+  llvm::Function* fft_func = llvm::dyn_cast<llvm::Function>(
+      module_->getOrInsertFunction(fn_name, fft_type).getCallee());
   fft_func->setCallingConv(llvm::CallingConv::C);
   fft_func->setDoesNotThrow();
   fft_func->setOnlyAccessesInaccessibleMemOrArgMem();
@@ -1338,11 +1305,11 @@ Status IrEmitter::HandleFft(HloInstruction* fft) {
   return Status::OK();
 }
 
-Status IrEmitter::HandleCrossReplicaSum(HloInstruction* crs) {
+Status IrEmitter::HandleAllReduce(HloInstruction* crs) {
   if (hlo_module_config_.replica_count() != 1) {
     // TODO(b/33011107): Support nontrivial cross replica sum on CPU.
     return Unimplemented(
-        "CrossReplicaSum with >1 replica is not implemented on CPU.");
+        "AllReduce with >1 replica is not implemented on CPU.");
   }
 
   // When there is a single replica, a cross replica sum is the identity
@@ -1367,8 +1334,8 @@ Status IrEmitter::HandleCrossReplicaSum(HloInstruction* crs) {
                         assignment_.GetUniqueSlice(crs, {i}));
 
     const Shape& operand_shape = crs->operand(i)->shape();
-    CHECK(ShapeUtil::IsArray(operand_shape))
-        << "Operands to cross-replica-sum must be arrays: " << crs->ToString();
+    CHECK(operand_shape.IsArray())
+        << "Operands to all-reduce must be arrays: " << crs->ToString();
     operand_ptrs.push_back(EmitBufferPointer(out_slice, operand_shape));
 
     // TODO(b/63762267): Be more aggressive about specifying alignment.
@@ -1404,7 +1371,7 @@ static bool ReductionPreservesLayout(const HloInstruction& reduce) {
 
   int64 delta = 0;
   for (int64 i = 0; i < operand_shape.dimensions_size(); i++) {
-    if (reduced_dims.count(i)) {
+    if (reduced_dims.contains(i)) {
       delta++;
     } else {
       InsertOrDie(&unreduced_dim_map, i, i - delta);
@@ -1417,7 +1384,7 @@ static bool ReductionPreservesLayout(const HloInstruction& reduce) {
   for (int64 operand_dim_idx = 0;
        operand_dim_idx < operand_shape.dimensions_size(); operand_dim_idx++) {
     int64 operand_dim = operand_shape.layout().minor_to_major(operand_dim_idx);
-    if (!reduced_dims.count(operand_dim)) {
+    if (!reduced_dims.contains(operand_dim)) {
       if (FindOrDie(unreduced_dim_map, operand_dim) !=
           result_shape.layout().minor_to_major(result_dim_idx++)) {
         return false;
@@ -1714,10 +1681,8 @@ StatusOr<bool> IrEmitter::EmitVectorizedReduce(
       vectorization_factor_in_bytes /
       ShapeUtil::ByteSizeOfPrimitiveType(reduce->shape().element_type());
 
-  bool is_reduction_over_minor_dimension =
-      std::find(dimensions.begin(), dimensions.end(),
-                LayoutUtil::Minor(arg->shape().layout(), 0)) !=
-      dimensions.end();
+  bool is_reduction_over_minor_dimension = absl::c_linear_search(
+      dimensions, LayoutUtil::Minor(arg->shape().layout(), 0));
 
   unsigned element_alignment = tensorflow::MathUtil::GCD<unsigned>(
       ShapeUtil::ByteSizeOfPrimitiveType(reduce->shape().element_type()),
@@ -1729,7 +1694,7 @@ StatusOr<bool> IrEmitter::EmitVectorizedReduce(
     return false;
   }
 
-  CHECK(!ShapeUtil::IsTuple(reduce->shape()));
+  CHECK(!reduce->shape().IsTuple());
   TF_RETURN_IF_ERROR(EmitTargetAddressForOp(reduce));
 
   // We know we're not reducing over the most minor dimension, which means we
@@ -1895,8 +1860,8 @@ StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForReduce(
 }
 
 Status IrEmitter::HandleReduce(HloInstruction* reduce) {
-  // TODO(b/112040122): Support variadic reduce.
-  if (!ShapeUtil::IsArray(reduce->shape())) {
+  // TODO(b/118333695): Support variadic reduce.
+  if (!reduce->shape().IsArray()) {
     return Unimplemented("Variadic reduce is not supported on CPU");
   }
   auto arg = reduce->mutable_operand(0);
@@ -1995,7 +1960,7 @@ Status IrEmitter::HandleSlice(HloInstruction* slice) {
   // The memcpy will copy elements that are logically this shape (allowed to be
   // scalar).
   const Shape logical_element_shape = ShapeUtil::FilterDimensions(
-      [&inner_dims](int64 dim) -> bool { return inner_dims.count(dim); },
+      [&inner_dims](int64 dim) { return inner_dims.contains(dim); },
       operand->shape());
 
   const int64 primitive_elements_per_logical_element =
@@ -2210,10 +2175,10 @@ Status IrEmitter::HandleFusion(HloInstruction* fusion) {
     llvm_ir::IrArray addend_array(
         GetIrArrayFor(fusion->operand(addend_param_number)));
 
-    TF_RETURN_IF_ERROR(DotOpEmitter::EmitDotOperation(
-        *dot, target_array, lhs_array, rhs_array, &addend_array,
-        GetExecutableRunOptionsArgument(), &b_, hlo_module_config_,
-        target_machine_features_));
+    TF_RETURN_IF_ERROR(
+        EmitDotOperation(*dot, target_array, lhs_array, rhs_array,
+                         &addend_array, GetExecutableRunOptionsArgument(), &b_,
+                         hlo_module_config_, target_machine_features_));
     return Status::OK();
   } else {
     return Unimplemented("Fusion kind not implemented on CPU");
@@ -2262,15 +2227,51 @@ Status IrEmitter::HandleCustomCall(HloInstruction* custom_call) {
         InBoundsGEP(operands_alloca, {b_.getInt64(i)});
     Store(operand_as_i8ptr, slot_in_operands_alloca);
   }
-  auto* custom_call_ir_function =
-      llvm::cast<llvm::Function>(module_->getOrInsertFunction(
-          AsStringRef(custom_call_target),
-          llvm::FunctionType::get(
-              /*Result=*/b_.getVoidTy(),
-              /*Params=*/{i8_ptr_type, operands_alloca->getType()},
-              /*isVarArg=*/false)));
+  if (emit_code_for_msan_) {
+    // Mark the alloca as initialized for msan. The buffer gets read by the
+    // custom callee, which might be msan-instrumented.
+    // TODO(b/66051036): Run the msan instrumentation pass instead.
+    const llvm::DataLayout& dl = module_->getDataLayout();
+    llvm::Type* intptr_type = b_.getIntPtrTy(dl);
+    auto* msan_unpoison_ir_function = llvm::cast<llvm::Function>(
+        module_
+            ->getOrInsertFunction(
+                "__msan_unpoison",
+                llvm::FunctionType::get(
+                    /*Result=*/b_.getVoidTy(),
+                    /*Params=*/{i8_ptr_type, intptr_type}, /*isVarArg=*/false))
+            .getCallee());
+    Call(msan_unpoison_ir_function,
+         {PointerCast(operands_alloca, i8_ptr_type),
+          llvm::ConstantInt::get(
+              intptr_type, *operands_alloca->getAllocationSizeInBits(dl) / 8)});
+  }
+  auto* custom_call_ir_function = llvm::dyn_cast<llvm::Function>(
+      module_
+          ->getOrInsertFunction(
+              AsStringRef(custom_call_target),
+              llvm::FunctionType::get(
+                  /*Result=*/b_.getVoidTy(),
+                  /*Params=*/{i8_ptr_type, operands_alloca->getType()},
+                  /*isVarArg=*/false))
+          .getCallee());
 
   TF_RETURN_IF_ERROR(EmitTargetAddressForOp(custom_call));
+  // Write the tuple table if the output is a tuple.
+  if (custom_call->shape().IsTuple()) {
+    std::vector<llvm::Value*> base_ptrs;
+    for (int i = 0; i < ShapeUtil::TupleElementCount(custom_call->shape());
+         ++i) {
+      const Shape& elem_shape =
+          ShapeUtil::GetTupleElementShape(custom_call->shape(), i);
+      TF_RET_CHECK(!elem_shape.IsTuple()) << "Nested tuples not implemented";
+      TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice slice,
+                          assignment_.GetUniqueSlice(custom_call, {i}));
+      llvm::Value* addr = EmitBufferPointer(slice, elem_shape);
+      base_ptrs.push_back(addr);
+    }
+    llvm_ir::EmitTuple(GetIrArrayFor(custom_call), base_ptrs, &b_, module_);
+  }
   auto* output_address_arg =
       PointerCast(GetEmittedValueFor(custom_call), i8_ptr_type);
 
@@ -2391,8 +2392,7 @@ StatusOr<bool> IrEmitter::EmitFastConcatenate(
   int64 concat_dim = concatenate->dimensions(0);
   const Layout& output_layout = output_shape.layout();
   auto output_min2maj = LayoutUtil::MinorToMajor(output_layout);
-  auto concat_dim_layout_itr =
-      std::find(output_min2maj.begin(), output_min2maj.end(), concat_dim);
+  auto concat_dim_layout_itr = absl::c_find(output_min2maj, concat_dim);
 
   std::vector<int64> inner_dims(output_min2maj.begin(), concat_dim_layout_itr);
   std::vector<int64> outer_dims(std::next(concat_dim_layout_itr),
@@ -2792,7 +2792,7 @@ llvm::Value* IrEmitter::EmitThreadLocalBufferPointer(
           llvm_ir::EmitBufferIndexingGEP(params, param_number, &b_);
       llvm::LoadInst* param_address_untyped = Load(param_address_offset);
 
-      if (!ShapeUtil::IsOpaque(target_shape)) {
+      if (!target_shape.IsOpaque()) {
         AttachAlignmentMetadataForLoad(param_address_untyped, target_shape);
         AttachDereferenceableMetadataForLoad(param_address_untyped,
                                              target_shape);
@@ -2851,7 +2851,9 @@ llvm::Value* IrEmitter::EmitBufferPointer(const BufferAllocation::Slice& slice,
   if (slice.allocation()->is_thread_local()) {
     return EmitThreadLocalBufferPointer(slice, target_shape);
   } else if (slice.allocation()->is_constant()) {
-    return FindOrDie(constant_buffer_to_global_, slice.allocation()->index());
+    return BitCast(
+        FindOrDie(constant_buffer_to_global_, slice.allocation()->index()),
+        IrShapeType(target_shape)->getPointerTo());
   } else {
     return EmitGlobalBufferPointer(slice, target_shape);
   }
@@ -2944,8 +2946,7 @@ Status IrEmitter::ElementTypesSameAndSupported(
 
   TF_RET_CHECK(!operands.empty());
   PrimitiveType primitive_type = operands[0]->shape().element_type();
-  if (std::find(supported_types.begin(), supported_types.end(),
-                primitive_type) == supported_types.end()) {
+  if (!absl::c_linear_search(supported_types, primitive_type)) {
     return Unimplemented("unsupported operand type %s in op %s",
                          PrimitiveType_Name(primitive_type),
                          HloOpcodeString(instruction.opcode()));
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
index 559a8162a2d53f28ea6817653503c216af90a610..0e372335f3aae919f9a9c559f86d4d61ab799b70 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
@@ -72,13 +72,15 @@ class IrEmitter : public DfsHloVisitorWithDefault,
   //              index in the profiling array.
   // computation_to_profile_idx: the mapping from HLO computations to their
   //              index in the profiling array.
+  // emit_code_for_msan: whether emitted code should be compatible with msan.
   IrEmitter(const HloModule& hlo_module, const BufferAssignment& assignment,
             llvm::Module* llvm_module,
             std::unordered_map<const HloInstruction*, int64>
                 instruction_to_profile_idx,
             std::unordered_map<const HloComputation*, int64>
                 computation_to_profile_idx,
-            const TargetMachineFeatures* target_machine);
+            const TargetMachineFeatures* target_machine,
+            bool emit_code_for_msan);
   ~IrEmitter() override;
 
   // Emit and return the given HLO computation as an LLVM IR
@@ -101,7 +103,7 @@ class IrEmitter : public DfsHloVisitorWithDefault,
   StatusOr<llvm::Function*> EmitComputation(
       HloComputation* computation, const string& function_name_prefix,
       bool is_top_level_computation,
-      const std::vector<HloInstruction*>* instruction_order);
+      absl::Span<HloInstruction* const> instruction_order);
 
   llvm::IRBuilder<>* b() { return &b_; }
 
@@ -134,7 +136,7 @@ class IrEmitter : public DfsHloVisitorWithDefault,
   Status HandleDot(HloInstruction* dot) override;
   Status HandleConvolution(HloInstruction* convolution) override;
   Status HandleFft(HloInstruction* fft) override;
-  Status HandleCrossReplicaSum(HloInstruction* crs) override;
+  Status HandleAllReduce(HloInstruction* crs) override;
   Status HandleInfeed(HloInstruction* infeed) override;
   Status HandleOutfeed(HloInstruction* outfeed) override;
   Status HandleSort(HloInstruction* sort) override;
@@ -250,14 +252,6 @@ class IrEmitter : public DfsHloVisitorWithDefault,
   llvm::Value* EmitBufferPointer(const BufferAllocation::Slice& slice,
                                  const Shape& target_shape);
 
-  // Emits a function into the current module. This can be used for
-  // computations embedded inside other computations, such as the
-  // function that a map operation applies.
-  StatusOr<llvm::Function*> EmitFunction(
-      HloComputation* function,  // The function to emit.
-      absl::string_view
-          function_name_suffix);  // Used for LLVM IR register names.
-
   // Emits a call to a thread local function (e.g. to the computation nested
   // within a reduce or a map).  Thread local callees (by definition) only write
   // to and read from thread local allocations.
@@ -448,7 +442,7 @@ class IrEmitter : public DfsHloVisitorWithDefault,
       computation_to_profile_idx_;
 
   // Maps HLOs to Values emitted for them.
-  std::unordered_map<const HloInstruction*, llvm::Value*> emitted_value_;
+  absl::flat_hash_map<const HloInstruction*, llvm::Value*> emitted_value_;
 
   llvm_ir::AliasAnalysis alias_analysis_;
 
@@ -582,6 +576,8 @@ class IrEmitter : public DfsHloVisitorWithDefault,
   std::vector<const HloComputation*> thread_local_computations_;
   std::vector<const HloComputation*> global_computations_;
 
+  bool emit_code_for_msan_;
+
   TF_DISALLOW_COPY_AND_ASSIGN(IrEmitter);
 };
 
diff --git a/tensorflow/compiler/xla/service/cpu/ir_function.cc b/tensorflow/compiler/xla/service/cpu/ir_function.cc
index adfb8392bf6fa356f0a5cdab3ff74036eca8918e..84a5b058cfb11c899eb6ae03478ed550b84dc819 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_function.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_function.cc
@@ -266,9 +266,11 @@ Status EmitCallToParallelForkJoin(
       /*Params=*/compute_function_params,
       /*isVarArg=*/false);
 
-  llvm::Function* fork_join_func =
-      llvm::cast<llvm::Function>(module->getOrInsertFunction(
-          runtime::kParallelForkJoinSymbolName, fork_join_type));
+  llvm::Function* fork_join_func = llvm::dyn_cast<llvm::Function>(
+      module
+          ->getOrInsertFunction(runtime::kParallelForkJoinSymbolName,
+                                fork_join_type)
+          .getCallee());
   fork_join_func->setCallingConv(llvm::CallingConv::C);
   fork_join_func->setDoesNotThrow();
 
diff --git a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc
index f9722ffadac801521ddcbb568dd4435fd02e951b..93ef51754d21ad3ff4e24298c89649ef4c2742fb 100644
--- a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc
+++ b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc
@@ -36,57 +36,88 @@ const char* const kLogV4F32SymbolName = "__xla_cpu_runtime_LogV4F32AVX";
 const char* const kLogV8F32SymbolName = "__xla_cpu_runtime_LogV8F32AVX";
 
 namespace {
-llvm::Function* EmitVectorF32TanhIfNeeded(llvm::Module* module,
-                                          llvm::StringRef function_name,
-                                          int vector_width,
-                                          bool enable_fast_math) {
-  llvm::Function* vector_tanh_function = module->getFunction(function_name);
-  if (vector_tanh_function == nullptr) {
+
+// Replaces calls to the function `fn_name` with the code generated by
+// fn_body_generator.
+//
+// We assume that fn_name accepts either a scalar f32 or a vector of
+// vector_width f32s, and that fn_body_generator generates a function body with
+// the same inputs/outputs as fn_name.
+void RewriteCalls(
+    llvm::Module* module, const char* fn_name,
+    std::function<llvm::Value*(llvm::IRBuilder<>* b, llvm::Value* input,
+                               int32 vector_width)>
+        fn_body_generator,
+    int32 vector_width, bool enable_fast_math) {
+  llvm::Function* fn = module->getFunction(fn_name);
+  if (fn == nullptr) {
     // If the function declaration is not present in the module, there can't be
     // any calls to resolve.  Don't emit the function in this case.
-    return nullptr;
+    return;
   }
 
-  llvm::LLVMContext* context = &module->getContext();
+  // Our task is to generate a function body for `fn`, but we can't generate a
+  // function body for an LLVM intrinsic. So if fn is an intrinsic, replace it
+  // with a new function.
+  if (fn->isIntrinsic()) {
+    llvm::Function* new_fn = llvm::Function::Create(
+        fn->getFunctionType(), llvm::GlobalValue::InternalLinkage,
+        llvm::Twine("xla_impl.") + fn_name, module);
+    fn->replaceAllUsesWith(new_fn);
+    fn->eraseFromParent();
+    fn = new_fn;
+  }
 
-  llvm::BasicBlock* vector_tanh_body =
-      llvm::BasicBlock::Create(*context, "body", vector_tanh_function);
+  llvm::LLVMContext* context = &module->getContext();
 
-  llvm::IRBuilder<> b(vector_tanh_body);
+  llvm::BasicBlock* fn_body = llvm::BasicBlock::Create(*context, "body", fn);
+  llvm::IRBuilder<> b(fn_body);
   llvm::FastMathFlags fast_math_flags;
   fast_math_flags.setFast(enable_fast_math);
   b.setFastMathFlags(fast_math_flags);
 
-  llvm::Value* input = &*vector_tanh_function->arg_begin();
-  CHECK_EQ(vector_width, input->getType()->getVectorNumElements());
-  b.CreateRet(llvm_ir::EmitFastTanh(&b, input));
-
-  DCHECK(!llvm::verifyFunction(*vector_tanh_function));
-  return vector_tanh_function;
-}
+  llvm::Value* input = &*fn->arg_begin();
 
-llvm::Function* EmitVectorF32ExpIfNeeded(llvm::Module* module,
-                                         llvm::StringRef function_name,
-                                         int vector_width,
-                                         bool enable_fast_math) {
-  llvm::Function* vector_exp_function = module->getFunction(function_name);
-  if (vector_exp_function == nullptr) {
-    // If the function declaration is not present in the module, there can't be
-    // any calls to resolve.  Don't emit the function in this case.
-    return nullptr;
+  // Upcast to vector type if input is a scalar.
+  if (vector_width == 1) {
+    llvm::Type* v1_type = llvm::VectorType::get(input->getType(), 1);
+    input = b.CreateInsertElement(llvm::UndefValue::get(v1_type), input,
+                                  uint64_t{0});
   }
 
-  llvm::LLVMContext* context = &module->getContext();
+  // Generate the vectorized code.
+  CHECK_EQ(vector_width, input->getType()->getVectorNumElements());
+  llvm::Value* result = fn_body_generator(&b, input, vector_width);
+
+  // Downcast result to scalar type if necessary.
+  if (vector_width == 1) {
+    result = b.CreateExtractElement(result, uint64_t{0});
+  }
+  b.CreateRet(result);
+  DCHECK(!llvm::verifyFunction(*fn));
 
-  llvm::BasicBlock* vector_exp_body =
-      llvm::BasicBlock::Create(*context, "body", vector_exp_function);
+  // Force-inline `fn` into all of its callers and then delete `fn`.
+  //
+  // TODO(b/73081976): Should we avoid inlining these in some cases?
+  std::vector<llvm::CallInst*> calls_to_inline;
+  for (auto* user : fn->users()) {
+    calls_to_inline.push_back(llvm::cast<llvm::CallInst>(user));
+  }
+  for (auto* call_to_inline : calls_to_inline) {
+    llvm::InlineFunctionInfo inline_function_info;
+    CHECK(llvm::InlineFunction(call_to_inline, inline_function_info));
+  }
+  fn->eraseFromParent();
+}
 
-  llvm::IRBuilder<> b(vector_exp_body);
-  llvm::FastMathFlags fast_math_flags;
-  fast_math_flags.setFast();
-  b.setFastMathFlags(fast_math_flags);
+llvm::Value* GenerateVF32Tanh(llvm::IRBuilder<>* b, llvm::Value* input,
+                              int32 /*vector_width*/) {
+  return llvm_ir::EmitFastTanh(b, input);
+}
 
-  VectorSupportLibrary vsl(F32, vector_width, &b, "exp_f32");
+llvm::Value* GenerateVF32Exp(llvm::IRBuilder<>* b, llvm::Value* input,
+                             int32 vector_width) {
+  VectorSupportLibrary vsl(F32, vector_width, b, "exp_f32");
 
   // This implements the same polynomial approximation as implemented in Eigen3.
 
@@ -107,7 +138,6 @@ llvm::Function* EmitVectorF32ExpIfNeeded(llvm::Module* module,
   const llvm::APFloat cephes_exp_p4 = GetIeeeF32(1.6666665459E-1);
   const llvm::APFloat cephes_exp_p5 = GetIeeeF32(5.0000001201E-1);
 
-  llvm::Value* input = &*vector_exp_function->arg_begin();
   llvm::Value* input_clamped =
       vsl.Clamp(input, /*low=*/exp_lo, /*high=*/exp_hi);
   llvm::Value* fx = vsl.Floor(vsl.MulAdd(input_clamped, cephes_LOG2EF, half));
@@ -128,49 +158,24 @@ llvm::Function* EmitVectorF32ExpIfNeeded(llvm::Module* module,
   // VectorSupportLibrary (intentionally) can't juggle more than one type at a
   // time so drop down to IRBuilder for this bit.
   llvm::Value* vector_constant_0x7f =
-      b.CreateVectorSplat(vector_width, b.getInt32(0x7f));
+      b->CreateVectorSplat(vector_width, b->getInt32(0x7f));
   llvm::Value* vector_constant_23 =
-      b.CreateVectorSplat(vector_width, b.getInt32(23));
+      b->CreateVectorSplat(vector_width, b->getInt32(23));
   llvm::Type* i32_vector_type =
-      llvm::VectorType::get(b.getInt32Ty(), vector_width);
+      llvm::VectorType::get(b->getInt32Ty(), vector_width);
   // fx is clamped so we don't have to worry about it being out of range for
   // i32.
-  llvm::Value* emm0 = b.CreateFPToSI(fx, i32_vector_type);
-  emm0 = b.CreateAdd(emm0, vector_constant_0x7f);
-  emm0 = b.CreateShl(emm0, vector_constant_23);
-  llvm::Value* emm0_f32 = b.CreateBitCast(emm0, vsl.vector_type());
-
-  llvm::Value* result = vsl.Max(vsl.Mul(y, emm0_f32), input);
+  llvm::Value* emm0 = b->CreateFPToSI(fx, i32_vector_type);
+  emm0 = b->CreateAdd(emm0, vector_constant_0x7f);
+  emm0 = b->CreateShl(emm0, vector_constant_23);
+  llvm::Value* emm0_f32 = b->CreateBitCast(emm0, vsl.vector_type());
 
-  b.CreateRet(result);
-
-  DCHECK(!llvm::verifyFunction(*vector_exp_function));
-  return vector_exp_function;
+  return vsl.Max(vsl.Mul(y, emm0_f32), input);
 }
 
-llvm::Function* EmitVectorF32LogIfNeeded(llvm::Module* module,
-                                         llvm::StringRef function_name,
-                                         int vector_width,
-                                         bool enable_fast_math) {
-  llvm::Function* vector_log_function = module->getFunction(function_name);
-  if (vector_log_function == nullptr) {
-    // If the function declaration is not present in the module, there can't be
-    // any calls to resolve.  Don't emit the function in this case.
-    return nullptr;
-  }
-
-  llvm::LLVMContext* context = &module->getContext();
-
-  llvm::BasicBlock* vector_log_body =
-      llvm::BasicBlock::Create(*context, "body", vector_log_function);
-
-  llvm::IRBuilder<> b(vector_log_body);
-  llvm::FastMathFlags fast_math_flags;
-  fast_math_flags.setFast();
-  b.setFastMathFlags(fast_math_flags);
-
-  llvm::Value* input = &*vector_log_function->arg_begin();
-  VectorSupportLibrary vsl(F32, vector_width, &b, "log_f32");
+llvm::Value* GenerateVF32Log(llvm::IRBuilder<>* b, llvm::Value* input,
+                             int32 vector_width) {
+  VectorSupportLibrary vsl(F32, vector_width, b, "log_f32");
 
   const llvm::APFloat half = GetIeeeF32(0.5);
   const llvm::APFloat one = GetIeeeF32(1.0);
@@ -193,129 +198,107 @@ llvm::Function* EmitVectorF32LogIfNeeded(llvm::Module* module,
   // The smallest non denormalized float number.
   const llvm::APFloat min_norm_pos = GetIeeeF32FromBitwiseRep(0x00800000);
   const llvm::APFloat minus_inf = GetIeeeF32FromBitwiseRep(0xff800000);
+  const llvm::APFloat pos_inf = GetIeeeF32FromBitwiseRep(0x7f800000);
   const llvm::APFloat inv_mant_mask = GetIeeeF32FromBitwiseRep(~0x7f800000);
 
   // invalid_mask is set if x is negative or NaN (and therefore output
   // must be NaN).
   llvm::Value* invalid_mask = vsl.FCmpULEMask(input, vsl.GetZeroVector());
-  llvm::Value* iszero_mask = vsl.FCmpEQMask(input, vsl.GetZeroVector());
+  llvm::Value* is_zero_mask = vsl.FCmpEQMask(input, vsl.GetZeroVector());
+  llvm::Value* is_pos_inf_mask = vsl.FCmpEQMask(input, pos_inf);
 
   // Cut off denormalized stuff.
-  input = vsl.Max(min_norm_pos, input);
+  llvm::Value* tmp0 = vsl.Max(min_norm_pos, input);
 
   // VectorSupportLibrary (intentionally) can't juggle more than one type at a
   // time so drop down to IRBuilder for this bit.
   llvm::Value* vector_constant_0x7f =
-      b.CreateVectorSplat(vector_width, b.getInt32(0x7f));
+      b->CreateVectorSplat(vector_width, b->getInt32(0x7f));
   llvm::Value* vector_constant_23 =
-      b.CreateVectorSplat(vector_width, b.getInt32(23));
+      b->CreateVectorSplat(vector_width, b->getInt32(23));
   llvm::Type* i32_vector_type =
-      llvm::VectorType::get(b.getInt32Ty(), vector_width);
+      llvm::VectorType::get(b->getInt32Ty(), vector_width);
 
-  llvm::Value* emm0 =
-      b.CreateLShr(b.CreateBitCast(input, i32_vector_type), vector_constant_23);
+  llvm::Value* emm0 = b->CreateLShr(b->CreateBitCast(tmp0, i32_vector_type),
+                                    vector_constant_23);
 
   // Keep only the fractional part.
-  input = vsl.FloatAnd(input, inv_mant_mask);
-  input = vsl.FloatOr(input, half);
+  tmp0 = vsl.FloatAnd(tmp0, inv_mant_mask);
+  tmp0 = vsl.FloatOr(tmp0, half);
 
-  emm0 = b.CreateSub(emm0, vector_constant_0x7f);
-  llvm::Value* e = vsl.Add(one, b.CreateSIToFP(emm0, vsl.vector_type()));
+  emm0 = b->CreateSub(emm0, vector_constant_0x7f);
+  llvm::Value* e = vsl.Add(one, b->CreateSIToFP(emm0, vsl.vector_type()));
 
   // part2:
   //   if( x < SQRTHF ) {
   //     e -= 1;
   //     x = x + x - 1.0;
   //   } else { x = x - 1.0; }
-  llvm::Value* mask = vsl.FCmpOLTMask(input, cephes_SQRTHF);
-  llvm::Value* tmp = vsl.FloatAnd(input, mask);
-  input = vsl.Sub(input, one);
+  llvm::Value* mask = vsl.FCmpOLTMask(tmp0, cephes_SQRTHF);
+  llvm::Value* tmp1 = vsl.FloatAnd(tmp0, mask);
+  tmp0 = vsl.Sub(tmp0, one);
   e = vsl.Sub(e, vsl.FloatAnd(mask, one));
-  input = vsl.Add(input, tmp);
+  tmp0 = vsl.Add(tmp0, tmp1);
 
-  llvm::Value* x2 = vsl.Mul(input, input);
-  llvm::Value* x3 = vsl.Mul(x2, input);
+  llvm::Value* x2 = vsl.Mul(tmp0, tmp0);
+  llvm::Value* x3 = vsl.Mul(x2, tmp0);
 
   llvm::Value *y, *y1, *y2;
-  y = vsl.MulAdd(input, cephes_log_p0, cephes_log_p1);
-  y1 = vsl.MulAdd(input, cephes_log_p3, cephes_log_p4);
-  y2 = vsl.MulAdd(input, cephes_log_p6, cephes_log_p7);
-  y = vsl.MulAdd(y, input, cephes_log_p2);
-  y1 = vsl.MulAdd(y1, input, cephes_log_p5);
-  y2 = vsl.MulAdd(y2, input, cephes_log_p8);
+  y = vsl.MulAdd(tmp0, cephes_log_p0, cephes_log_p1);
+  y1 = vsl.MulAdd(tmp0, cephes_log_p3, cephes_log_p4);
+  y2 = vsl.MulAdd(tmp0, cephes_log_p6, cephes_log_p7);
+  y = vsl.MulAdd(y, tmp0, cephes_log_p2);
+  y1 = vsl.MulAdd(y1, tmp0, cephes_log_p5);
+  y2 = vsl.MulAdd(y2, tmp0, cephes_log_p8);
   y = vsl.MulAdd(y, x3, y1);
   y = vsl.MulAdd(y, x3, y2);
   y = vsl.Mul(y, x3);
 
   y1 = vsl.Mul(cephes_log_q1, e);
-  tmp = vsl.Mul(half, x2);
+  llvm::Value* tmp2 = vsl.Mul(half, x2);
   y = vsl.Add(y, y1);
-  input = vsl.Sub(input, tmp);
+  tmp0 = vsl.Sub(tmp0, tmp2);
   y2 = vsl.Mul(cephes_log_q2, e);
-  input = vsl.Add(input, y);
-  input = vsl.Add(input, y2);
+  tmp0 = vsl.Add(tmp0, y);
+  tmp0 = vsl.Add(tmp0, y2);
 
-  // Negative arg will be NAN, 0 will be -INF.
-  llvm::Value* or_lhs =
-      vsl.FloatAndNot(iszero_mask, vsl.FloatOr(input, invalid_mask));
-  llvm::Value* or_rhs = vsl.FloatAnd(iszero_mask, minus_inf);
-  llvm::Value* result = vsl.FloatOr(or_lhs, or_rhs);
+  // Contains +/-inf where +/-inf is the correct answer, otherwise 0.
+  llvm::Value* result_inf = vsl.FloatOr(vsl.FloatAnd(is_zero_mask, minus_inf),
+                                        vsl.FloatAnd(is_pos_inf_mask, pos_inf));
 
-  b.CreateRet(result);
+  // Contains a finite result or nan.  This is the correct answer only if both
+  // result_minus_inf and result_pos_inf are both 0.
+  //
+  // (This implementation works because 0xffffffff is a nan.)
+  llvm::Value* result_finite_or_nan = vsl.FloatOr(tmp0, invalid_mask);
 
-  DCHECK(!llvm::verifyFunction(*vector_log_function));
-  return vector_log_function;
+  // Combine the above into a final result.
+  return vsl.FloatOr(result_inf,
+                     vsl.FloatAndNot(vsl.FloatOr(is_zero_mask, is_pos_inf_mask),
+                                     result_finite_or_nan));
 }
 }  // namespace
 
 void RewriteIRRuntimeFunctions(llvm::Module* module, bool enable_fast_math) {
-  auto* tanh_v4f32 =
-      EmitVectorF32TanhIfNeeded(module, kTanhV4F32SymbolName,
-                                /*vector_width=*/4, enable_fast_math);
-  auto* tanh_v8f32 =
-      EmitVectorF32TanhIfNeeded(module, kTanhV8F32SymbolName,
-                                /*vector_width=*/8, enable_fast_math);
-
-  auto* exp_v4f32 =
-      EmitVectorF32ExpIfNeeded(module, kExpV4F32SymbolName,
-                               /*vector_width=*/4, enable_fast_math);
-  auto* exp_v8f32 =
-      EmitVectorF32ExpIfNeeded(module, kExpV8F32SymbolName,
-                               /*vector_width=*/8, enable_fast_math);
-
-  auto* log_v4f32 =
-      EmitVectorF32LogIfNeeded(module, kLogV4F32SymbolName,
-                               /*vector_width=*/4, enable_fast_math);
-  auto* log_v8f32 =
-      EmitVectorF32LogIfNeeded(module, kLogV8F32SymbolName,
-                               /*vector_width=*/8, enable_fast_math);
-
-  // Gather all the call sites, force inline them and then delete the vector
-  // function bodies.
-  //
-  // TODO(b/73081976): Should we avoid inlining these intrinsics in some cases?
-
-  std::vector<llvm::CallInst*> calls_to_inline;
-  for (auto* function :
-       {tanh_v4f32, tanh_v8f32, exp_v4f32, exp_v8f32, log_v4f32, log_v8f32}) {
-    if (function != nullptr) {
-      for (auto* user : function->users()) {
-        calls_to_inline.push_back(llvm::cast<llvm::CallInst>(user));
-      }
-    }
-  }
-
-  for (auto* call_to_inline : calls_to_inline) {
-    llvm::InlineFunctionInfo inline_function_info;
-    CHECK(llvm::InlineFunction(call_to_inline, inline_function_info));
-  }
-
-  for (auto* function :
-       {tanh_v4f32, tanh_v8f32, exp_v4f32, exp_v8f32, log_v4f32, log_v8f32}) {
-    if (function != nullptr) {
-      function->eraseFromParent();
-    }
-  }
+  // Curry some params to RewriteCalls.
+  auto rewrite_calls =
+      std::bind(RewriteCalls, module, std::placeholders::_1,
+                std::placeholders::_2, std::placeholders::_3, enable_fast_math);
+
+  rewrite_calls("tanhf", GenerateVF32Tanh, /*vector_width=*/1);
+  rewrite_calls("llvm.tanh.f32", GenerateVF32Tanh, /*vector_width=*/1);
+  rewrite_calls(kTanhV4F32SymbolName, GenerateVF32Tanh, /*vector_width=*/4);
+  rewrite_calls(kTanhV8F32SymbolName, GenerateVF32Tanh, /*vector_width=*/8);
+
+  rewrite_calls("expf", GenerateVF32Exp, /*vector_width=*/1);
+  rewrite_calls("llvm.exp.f32", GenerateVF32Exp, /*vector_width=*/1);
+  rewrite_calls(kExpV4F32SymbolName, GenerateVF32Exp, /*vector_width=*/4);
+  rewrite_calls(kExpV8F32SymbolName, GenerateVF32Exp, /*vector_width=*/8);
+
+  rewrite_calls("logf", GenerateVF32Log, /*vector_width=*/1);
+  rewrite_calls("llvm.log.f32", GenerateVF32Log, /*vector_width=*/1);
+  rewrite_calls(kLogV4F32SymbolName, GenerateVF32Log, /*vector_width=*/4);
+  rewrite_calls(kLogV8F32SymbolName, GenerateVF32Log, /*vector_width=*/8);
 }
 
 }  // namespace runtime
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc
index f8441c3e345504616485c6b34b4302acd5cc23a3..a6f4273a5a70aab0bc88383283d2a55b1ecb1681 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc
@@ -34,7 +34,7 @@ ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(absl::string_view loop_name,
                                                    llvm::Type* index_type) {
   CHECK_NE(index_type, nullptr);
 
-  CHECK(!ShapeUtil::IsTuple(shape_));
+  CHECK(!shape_.IsTuple());
   CHECK(!ShapeUtil::IsScalar(shape_));
 
   llvm_ir::ForLoopNest loop_nest(loop_name, b_);
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc
index ede7f433ca6b2cc5629115f800348be9dfb2b93b..6121d1ca9a5c785cedd947200d3e7e320aa06bc2 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc
@@ -146,11 +146,9 @@ int64 ParallelTaskAssignment::GetTargetParallelTaskCount(
       (opcode == HloOpcode::kConvolution &&
        PotentiallyImplementedAsEigenConvolution(*instruction,
                                                 target_machine_features_)) ||
-      PotentiallyImplementedAsEigenDot(*instruction,
-                                       target_machine_features_) ||
       (opcode == HloOpcode::kFusion &&
        instruction->fusion_kind() != HloInstruction::FusionKind::kLoop) ||
-      ShapeUtil::IsTuple(instruction->shape())) {
+      instruction->shape().IsTuple()) {
     return 1;
   }
 
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc
index f0b65046c14ccec5336abf7c4d05d1d755f783bd..35ae62b42dfa768c6abd0508097d6b235b2ebf54 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc
@@ -112,10 +112,10 @@ TEST_F(ParallelTaskAssignmentTest, InfeedOutfeedOperationNotParallelized) {
   const string hlo_string = R"(
     HloModule TestTaskParallel_infeed_outfeed
     ENTRY InfeedOutfeed {
-      token = token[] after-all()
-      infeed0 = (u32[12345678,2]{1,0}, token[]) infeed(token)
+      token0 = token[] after-all()
+      infeed0 = (u32[12345678,2]{1,0}, token[]) infeed(token0)
       infeed0.data = u32[12345678,2]{1,0} get-tuple-element((u32[12345678,2]{1,0}, token[]) infeed0), index=0
-      ROOT outfeed0 = token[] outfeed(infeed0.data, token)
+      ROOT outfeed0 = token[] outfeed(infeed0.data, token0)
     }
   )";
 
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_fork_join.cc b/tensorflow/compiler/xla/service/cpu/runtime_fork_join.cc
index 2d9492eacfea34bec3b0f1115e171a5328b7cdc3..6f72ddadf94d4c5b9add2ee66e0f4ac9a8ae9099 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_fork_join.cc
+++ b/tensorflow/compiler/xla/service/cpu/runtime_fork_join.cc
@@ -69,8 +69,13 @@ TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_ParallelForkJoin(
   CHECK_EQ(params, nullptr);
   CHECK_GT(num_partitions, 1);
   CHECK_GT(num_partitioned_dims, 0);
+  CHECK_NE(function_ptr, nullptr);
+  CHECK_NE(partitions, nullptr);
   const xla::ExecutableRunOptions* run_options =
       static_cast<const xla::ExecutableRunOptions*>(run_options_ptr);
+  CHECK_NE(run_options, nullptr);
+  CHECK_NE(run_options->intra_op_thread_pool(), nullptr);
+
   ComputeFunctionType function =
       reinterpret_cast<ComputeFunctionType>(function_ptr);
   // Compute partition stride in 'partitions' array.
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc b/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc
index 722aa3120ef4d8c957873ac58c361f19632dde1f..70a6d0af02c0c2db7208db561cf29e35a74707b2 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc
+++ b/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc
@@ -15,12 +15,10 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.h"
 
 #include <algorithm>
-#include <cmath>
 #include <cstring>
-#include <limits>
 #include <memory>
+#include <numeric>
 #include <string>
-#include <utility>
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/platform/dynamic_annotations.h"
@@ -28,80 +26,15 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 
 namespace {
-using tensorflow::int16;
 using tensorflow::int32;
 using tensorflow::int64;
-using tensorflow::int8;
-using tensorflow::uint16;
-using tensorflow::uint32;
-using tensorflow::uint64;
-using tensorflow::uint8;
-
-template <typename KeyType>
-void KeyValueSort(std::pair<KeyType, int64>* row_to_sort, int64 num_elements) {
-  std::sort(row_to_sort, row_to_sort + num_elements);
-}
-
-// We would like a total order of floating point numbers so that the
-// sort has a predictable behavior in the presence of NaNs. Rather
-// than using floating point comparison, we use the following trick:
-// If f is a float, and
-// x = bit_cast<int32>(f);
-// y = x < 0 ? 0x7FFFFFFF - x : x;
-// then y is ordered as an int32 such that finite values have the
-// obvious order, -0 is ordered before 0, and -NaN and NaN appear at
-// the beginning and end of the ordering.
-template <typename CastType, typename UnsignedCastType, typename KeyType>
-CastType Convert(KeyType value) {
-  CastType casted_value;
-  memcpy(&casted_value, &value, sizeof(CastType));
-  if (casted_value < 0) {
-    return static_cast<UnsignedCastType>(std::numeric_limits<CastType>::max()) -
-           casted_value;
-  }
-  return casted_value;
-}
-
-template <typename CastType, typename UnsignedCastType, typename KeyType>
-bool LessThan(KeyType lhs, KeyType rhs) {
-  return Convert<CastType, UnsignedCastType>(lhs) <
-         Convert<CastType, UnsignedCastType>(rhs);
-}
-
-template <>
-void KeyValueSort(std::pair<double, int64>* row_to_sort, int64 num_elements) {
-  std::stable_sort(row_to_sort, row_to_sort + num_elements,
-                   [](const std::pair<double, int64>& lhs,
-                      const std::pair<double, int64>& rhs) -> bool {
-                     return LessThan<int64, uint64>(lhs.first, rhs.first);
-                   });
-}
-
-template <>
-void KeyValueSort(std::pair<float, int64>* row_to_sort, int64 num_elements) {
-  std::stable_sort(row_to_sort, row_to_sort + num_elements,
-                   [](const std::pair<float, int64>& lhs,
-                      const std::pair<float, int64>& rhs) -> bool {
-                     return LessThan<int32, uint32>(lhs.first, rhs.first);
-                   });
-}
-
-template <>
-void KeyValueSort(std::pair<Eigen::half, int64>* row_to_sort,
-                  int64 num_elements) {
-  std::stable_sort(row_to_sort, row_to_sort + num_elements,
-                   [](const std::pair<Eigen::half, int64>& lhs,
-                      const std::pair<Eigen::half, int64>& rhs) -> bool {
-                     return LessThan<int32, uint32>(
-                         Eigen::half_impl::half_to_float(lhs.first),
-                         Eigen::half_impl::half_to_float(rhs.first));
-                   });
-}
+}  // namespace
 
-template <typename KeyType>
-void KeyValueSortImpl(KeyType* keys, int64 a, int64 b, int64 c, char** values,
-                      int32 values_count,
-                      int32* values_primitive_type_size_in_bytes) {
+TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSort(
+    int64 a, int64 b, int64 c, char** values, int32 values_count,
+    int32* values_primitive_type_size_in_bytes, bool is_stable,
+    char* run_options, int64* prof_counters,
+    void (*less_than)(char*, char*, char**, char**, tensorflow::int64*)) {
   // 'values' and 'values_primitive_type_size_in_bytes' are managed by the JIT
   // code, so msan can't tell they are initialized.
   TF_ANNOTATE_MEMORY_IS_INITIALIZED(values, values_count * sizeof(char*));
@@ -121,8 +54,9 @@ void KeyValueSortImpl(KeyType* keys, int64 a, int64 b, int64 c, char** values,
   int64 num_iteration_elements = a * c;
   int64 sort_dimension_offset = c;
 
-  std::unique_ptr<std::pair<KeyType, int64>[]> row_to_sort(
-      new std::pair<KeyType, int64>[sort_dimension_elements]);
+  std::unique_ptr<int64[]> indices(new int64[sort_dimension_elements]);
+  std::unique_ptr<char*[]> comparison_values(new char*[2 * values_count]);
+  std::iota(indices.get(), indices.get() + sort_dimension_elements, 0);
   std::unique_ptr<std::string[]> reordered_values(
       new std::string[sort_dimension_elements]);
   for (int64 index = 0; index < num_iteration_elements; ++index) {
@@ -135,24 +69,33 @@ void KeyValueSortImpl(KeyType* keys, int64 a, int64 b, int64 c, char** values,
     int64 base_offset =
         index % sort_dimension_offset +
         (index - index % sort_dimension_offset) * sort_dimension_elements;
-    // TODO(b/26783907): We could define a custom iterator class that references
-    // all arrays. Then we could avoid the intermediate copy. However this
-    // would become more complicated, and it is not clear if the benefit is high
-    // enough.
-    for (int64 i = 0; i < sort_dimension_elements; ++i) {
-      row_to_sort[i] =
-          std::make_pair(keys[base_offset + i * sort_dimension_offset], i);
-    }
-    KeyValueSort(row_to_sort.get(), sort_dimension_elements);
-    for (int64 i = 0; i < sort_dimension_elements; ++i) {
-      keys[base_offset + i * sort_dimension_offset] = row_to_sort[i].first;
+    auto compare_function = [&](int64 a, int64 b) -> bool {
+      int64 memory_index_lhs = (base_offset + a * sort_dimension_offset) *
+                               values_primitive_type_size_in_bytes[0];
+      int64 memory_index_rhs = (base_offset + b * sort_dimension_offset) *
+                               values_primitive_type_size_in_bytes[0];
+      for (int32 i = 0; i < values_count; ++i) {
+        comparison_values[i * 2] = values[i] + memory_index_lhs;
+        comparison_values[i * 2 + 1] = values[i] + memory_index_rhs;
+      }
+      char result = 0;  // Overwritten by less_than.
+      less_than(&result, run_options, comparison_values.get(), nullptr,
+                prof_counters);
+      return result != 0u;
+    };
+    if (is_stable) {
+      std::stable_sort(indices.get(), indices.get() + sort_dimension_elements,
+                       compare_function);
+    } else {
+      std::sort(indices.get(), indices.get() + sort_dimension_elements,
+                compare_function);
     }
 
-    // Reorder the values according to the order defined by the keys.
+    // Reorder the values according to the order defined by 'indices'.
     for (int32 idx = 0; idx < values_count; ++idx) {
       for (int64 i = 0; i < sort_dimension_elements; ++i) {
         int64 memory_index =
-            (base_offset + row_to_sort[i].second * sort_dimension_offset) *
+            (base_offset + indices[i] * sort_dimension_offset) *
             values_primitive_type_size_in_bytes[idx];
 
         reordered_values[i] =
@@ -168,88 +111,3 @@ void KeyValueSortImpl(KeyType* keys, int64 a, int64 b, int64 c, char** values,
     }
   }
 }
-}  // namespace
-
-TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortPRED(
-    bool* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
-    int32* values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_count,
-                   values_primitive_type_size_in_bytes);
-}
-
-TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortS8(
-    int8* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
-    int32* values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_count,
-                   values_primitive_type_size_in_bytes);
-}
-
-TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortU8(
-    uint8* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
-    int32* values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_count,
-                   values_primitive_type_size_in_bytes);
-}
-
-TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortS16(
-    int16* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
-    int32* values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_count,
-                   values_primitive_type_size_in_bytes);
-}
-
-TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortU16(
-    uint16* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
-    int32* values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_count,
-                   values_primitive_type_size_in_bytes);
-}
-
-TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortF16(
-    Eigen::half* keys, int64 a, int64 b, int64 c, char** values,
-    int32 values_count, int32* values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_count,
-                   values_primitive_type_size_in_bytes);
-}
-
-TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortS32(
-    int32* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
-    int32* values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_count,
-                   values_primitive_type_size_in_bytes);
-}
-
-TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortU32(
-    uint32* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
-    int32* values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_count,
-                   values_primitive_type_size_in_bytes);
-}
-
-TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortF32(
-    float* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
-    int32* values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_count,
-                   values_primitive_type_size_in_bytes);
-}
-
-TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortS64(
-    int64* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
-    int32* values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_count,
-                   values_primitive_type_size_in_bytes);
-}
-
-TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortU64(
-    uint64* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
-    int32* values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_count,
-                   values_primitive_type_size_in_bytes);
-}
-
-TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSortF64(
-    double* keys, int64 a, int64 b, int64 c, char** values, int32 values_count,
-    int32* values_primitive_type_size_in_bytes) {
-  KeyValueSortImpl(keys, a, b, c, values, values_count,
-                   values_primitive_type_size_in_bytes);
-}
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.h b/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.h
index 7821099386969e855ea1737cf53ef49c15c6e93b..50c2911c3bd392b6df12717c34d250ce86ad26e0 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.h
+++ b/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.h
@@ -21,76 +21,26 @@ limitations under the License.
 
 extern "C" {
 
-// 'keys' represents a 3-dimensional shape with dimensions [a, b, c]. The 'b'
-// dimension of 'keys' is sorted into ascending order. If 'values_count' is <=
-// 0, 'values' and 'values_primitive_type_size_in_bytes' can be nullptr.
-// If 'values_count' > 0, they contain exactly 'values_count' many elements.
-// Each element of 'values' also represents a 3-dimensional shape with
-// dimensions [a, b, c], and the size of the primitive type of the i-th shape
-// has exactly 'values_primitive_type_size_in_bytes[i]' bytes. The elements in
-// each 'values' shape are reordered in such a way that if the element at index
-// 'i' in 'keys' was moved to index 'j', the element at index 'i' in a 'values'
-// shape is also moved to index 'j' (which means that the same elements
-// correspond to each other as before).
-extern void __xla_cpu_runtime_KeyValueSortPRED(
-    bool* keys, tensorflow::int64 a, tensorflow::int64 b, tensorflow::int64 c,
+// Each entry in 'values' represents a 3-dimensional shape with dimensions
+// [a, b, c]. The 'b' dimension of each shape is sorted into ascending order
+// according to the results of comparisons using the provided 'less_than'
+// function. 'values_count' must be > 0 and specifies the number of entries in
+// 'values' and 'values_primitive_type_size_in_bytes'. The size of the primitive
+// type of the i-th shape has exactly 'values_primitive_type_size_in_bytes[i]'
+// bytes. 'is_stable' specifies whether the sorting should be stable.
+// 'run_options' and 'prof_counters' are passed through to the less-than
+// function, which expects the following arguments:
+// - pointer to the return value buffer (char*)
+// - xla::ExecutableRunOptions = 'run_options' (char*)
+// - pointers to the parameter buffers (char**)
+// - pointers to the buffer tables = nullptr for thread local functions (char**)
+// - profile counters = 'prof_counters' (int64*)
+extern void __xla_cpu_runtime_KeyValueSort(
+    tensorflow::int64 a, tensorflow::int64 b, tensorflow::int64 c,
     char** values, tensorflow::int32 values_count,
-    tensorflow::int32* values_primitive_type_size_in_bytes);
-
-extern void __xla_cpu_runtime_KeyValueSortS8(
-    tensorflow::int8* keys, tensorflow::int64 a, tensorflow::int64 b,
-    tensorflow::int64 c, char** values, tensorflow::int32 values_count,
-    tensorflow::int32* values_primitive_type_size_in_bytes);
-
-extern void __xla_cpu_runtime_KeyValueSortU8(
-    tensorflow::uint8* keys, tensorflow::int64 a, tensorflow::int64 b,
-    tensorflow::int64 c, char** values, tensorflow::int32 values_count,
-    tensorflow::int32* values_primitive_type_size_in_bytes);
-
-extern void __xla_cpu_runtime_KeyValueSortS16(
-    tensorflow::int16* keys, tensorflow::int64 a, tensorflow::int64 b,
-    tensorflow::int64 c, char** values, tensorflow::int32 values_count,
-    tensorflow::int32* values_primitive_type_size_in_bytes);
-
-extern void __xla_cpu_runtime_KeyValueSortU16(
-    tensorflow::uint16* keys, tensorflow::int64 a, tensorflow::int64 b,
-    tensorflow::int64 c, char** values, tensorflow::int32 values_count,
-    tensorflow::int32* values_primitive_type_size_in_bytes);
-
-extern void __xla_cpu_runtime_KeyValueSortF16(
-    Eigen::half* keys, tensorflow::int64 a, tensorflow::int64 b,
-    tensorflow::int64 c, char** values, tensorflow::int32 values_count,
-    tensorflow::int32* values_primitive_type_size_in_bytes);
-
-extern void __xla_cpu_runtime_KeyValueSortS32(
-    tensorflow::int32* keys, tensorflow::int64 a, tensorflow::int64 b,
-    tensorflow::int64 c, char** values, tensorflow::int32 values_count,
-    tensorflow::int32* values_primitive_type_size_in_bytes);
-
-extern void __xla_cpu_runtime_KeyValueSortU32(
-    tensorflow::uint32* keys, tensorflow::int64 a, tensorflow::int64 b,
-    tensorflow::int64 c, char** values, tensorflow::int32 values_count,
-    tensorflow::int32* values_primitive_type_size_in_bytes);
-
-extern void __xla_cpu_runtime_KeyValueSortF32(
-    float* keys, tensorflow::int64 a, tensorflow::int64 b, tensorflow::int64 c,
-    char** values, tensorflow::int32 values_count,
-    tensorflow::int32* values_primitive_type_size_in_bytes);
-
-extern void __xla_cpu_runtime_KeyValueSortS64(
-    tensorflow::int64* keys, tensorflow::int64 a, tensorflow::int64 b,
-    tensorflow::int64 c, char** values, tensorflow::int32 values_count,
-    tensorflow::int32* values_primitive_type_size_in_bytes);
-
-extern void __xla_cpu_runtime_KeyValueSortU64(
-    tensorflow::uint64* keys, tensorflow::int64 a, tensorflow::int64 b,
-    tensorflow::int64 c, char** values, tensorflow::int32 values_count,
-    tensorflow::int32* values_primitive_type_size_in_bytes);
-
-extern void __xla_cpu_runtime_KeyValueSortF64(
-    double* keys, tensorflow::int64 a, tensorflow::int64 b, tensorflow::int64 c,
-    char** values, tensorflow::int32 values_count,
-    tensorflow::int32* values_primitive_type_size_in_bytes);
+    tensorflow::int32* values_primitive_type_size_in_bytes, bool is_stable,
+    char* run_options, tensorflow::int64* prof_counters,
+    void (*less_than)(char*, char*, char**, char**, tensorflow::int64*));
 }
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_KEY_VALUE_SORT_H_
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc b/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc
index a71a85913cfef271bc2a226cb0cf2dd4204499a4..fe7e87a197b6cf571195537eaea2898659cd5e2e 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc
+++ b/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc
@@ -23,12 +23,20 @@ limitations under the License.
 #include "tensorflow/core/platform/dynamic_annotations.h"
 #include "tensorflow/core/platform/types.h"
 
+#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
+#include "tensorflow/core/kernels/eigen_contraction_kernel.h"
+#endif
+
 using tensorflow::int32;
 using tensorflow::int64;
 
 namespace {
 
-template <typename T>
+bool Is16BytesAligned(void* ptr) {
+  return reinterpret_cast<uintptr_t>(ptr) % 16 == 0;
+}
+
+template <typename T, Eigen::AlignmentType Alignment>
 void MatMul(const void* run_options_ptr, T* out, T* lhs, T* rhs, int64 m,
             int64 n, int64 k, int32 transpose_lhs, int32 transpose_rhs) {
   const xla::ExecutableRunOptions* run_options =
@@ -46,11 +54,11 @@ void MatMul(const void* run_options_ptr, T* out, T* lhs, T* rhs, int64 m,
     std::swap(rhs_rows, rhs_cols);
   }
 
-  const Eigen::TensorMap<Eigen::Tensor<const T, 2>, Eigen::Aligned> A(
-      lhs, lhs_rows, lhs_cols);
-  const Eigen::TensorMap<Eigen::Tensor<const T, 2>, Eigen::Aligned> B(
-      rhs, rhs_rows, rhs_cols);
-  Eigen::TensorMap<Eigen::Tensor<T, 2>, Eigen::Aligned> C(out, m, n);
+  const Eigen::TensorMap<Eigen::Tensor<const T, 2>, Alignment> A(lhs, lhs_rows,
+                                                                 lhs_cols);
+  const Eigen::TensorMap<Eigen::Tensor<const T, 2>, Alignment> B(rhs, rhs_rows,
+                                                                 rhs_cols);
+  Eigen::TensorMap<Eigen::Tensor<T, 2>, Alignment> C(out, m, n);
 
   typedef typename Eigen::Tensor<T, 2>::DimensionPair DimPair;
   int lhs_contract_dim = transpose_lhs ? 0 : 1;
@@ -65,14 +73,24 @@ void MatMul(const void* run_options_ptr, T* out, T* lhs, T* rhs, int64 m,
 }
 
 template <typename T>
-void MatMulImpl(const void* run_options_ptr, T* out, T* lhs, T* rhs, int64 m,
-                int64 n, int64 k, int32 transpose_lhs, int32 transpose_rhs) {
+void MatMulDispatch(const void* run_options_ptr, T* out, T* lhs, T* rhs,
+                    int64 m, int64 n, int64 k, int32 transpose_lhs,
+                    int32 transpose_rhs) {
+  bool all_buffers_16b_aligned =
+      Is16BytesAligned(out) && Is16BytesAligned(lhs) && Is16BytesAligned(rhs);
+
+  if (!all_buffers_16b_aligned) {
+    MatMul<T, Eigen::Unaligned>(run_options_ptr, out, lhs, rhs, m, n, k,
+                                transpose_lhs, transpose_rhs);
+    return;
+  }
+
   if (m == 1 || n == 1) {
     // Despite being single threaded, this version of matrix * vector is faster.
     xla::EigenMatVec<T>(out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs);
   } else {
-    MatMul<T>(run_options_ptr, out, lhs, rhs, m, n, k, transpose_lhs,
-              transpose_rhs);
+    MatMul<T, Eigen::Aligned16>(run_options_ptr, out, lhs, rhs, m, n, k,
+                                transpose_lhs, transpose_rhs);
   }
 }
 
@@ -82,20 +100,20 @@ TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenMatMulF16(
     const void* run_options_ptr, Eigen::half* out, Eigen::half* lhs,
     Eigen::half* rhs, int64 m, int64 n, int64 k, int32 transpose_lhs,
     int32 transpose_rhs) {
-  MatMulImpl<Eigen::half>(run_options_ptr, out, lhs, rhs, m, n, k,
-                          transpose_lhs, transpose_rhs);
+  MatMulDispatch<Eigen::half>(run_options_ptr, out, lhs, rhs, m, n, k,
+                              transpose_lhs, transpose_rhs);
 }
 
 TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenMatMulF32(
     const void* run_options_ptr, float* out, float* lhs, float* rhs, int64 m,
     int64 n, int64 k, int32 transpose_lhs, int32 transpose_rhs) {
-  MatMulImpl<float>(run_options_ptr, out, lhs, rhs, m, n, k, transpose_lhs,
-                    transpose_rhs);
+  MatMulDispatch<float>(run_options_ptr, out, lhs, rhs, m, n, k, transpose_lhs,
+                        transpose_rhs);
 }
 
 TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenMatMulF64(
     const void* run_options_ptr, double* out, double* lhs, double* rhs, int64 m,
     int64 n, int64 k, int32 transpose_lhs, int32 transpose_rhs) {
-  MatMulImpl<double>(run_options_ptr, out, lhs, rhs, m, n, k, transpose_lhs,
-                     transpose_rhs);
+  MatMulDispatch<double>(run_options_ptr, out, lhs, rhs, m, n, k, transpose_lhs,
+                         transpose_rhs);
 }
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc
index 16692e7f2e6145b2649b67987eef47916e958be2..1f7204e67a413efabd34cd7d88ced4c82ee7a5df 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc
+++ b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc
@@ -20,12 +20,20 @@ limitations under the License.
 #include "tensorflow/core/platform/dynamic_annotations.h"
 #include "tensorflow/core/platform/types.h"
 
+#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
+#include "tensorflow/core/kernels/eigen_contraction_kernel.h"
+#endif
+
 using tensorflow::int32;
 using tensorflow::int64;
 
 namespace {
 
-template <typename T>
+bool Is16BytesAligned(void* ptr) {
+  return reinterpret_cast<uintptr_t>(ptr) % 16 == 0;
+}
+
+template <typename T, Eigen::AlignmentType Alignment>
 void MatMul(const void* run_options_ptr, T* out, T* lhs, T* rhs, int64 m,
             int64 n, int64 k, int32 transpose_lhs, int32 transpose_rhs) {
   int64 lhs_rows = m;
@@ -40,11 +48,11 @@ void MatMul(const void* run_options_ptr, T* out, T* lhs, T* rhs, int64 m,
     std::swap(rhs_rows, rhs_cols);
   }
 
-  const Eigen::TensorMap<Eigen::Tensor<const T, 2>, Eigen::Aligned> A(
-      lhs, lhs_rows, lhs_cols);
-  const Eigen::TensorMap<Eigen::Tensor<const T, 2>, Eigen::Aligned> B(
-      rhs, rhs_rows, rhs_cols);
-  Eigen::TensorMap<Eigen::Tensor<T, 2>, Eigen::Aligned> C(out, m, n);
+  const Eigen::TensorMap<Eigen::Tensor<const T, 2>, Alignment> A(lhs, lhs_rows,
+                                                                 lhs_cols);
+  const Eigen::TensorMap<Eigen::Tensor<const T, 2>, Alignment> B(rhs, rhs_rows,
+                                                                 rhs_cols);
+  Eigen::TensorMap<Eigen::Tensor<T, 2>, Alignment> C(out, m, n);
 
   typedef typename Eigen::Tensor<T, 2>::DimensionPair DimPair;
   int lhs_contract_dim = transpose_lhs ? 0 : 1;
@@ -59,14 +67,22 @@ void MatMul(const void* run_options_ptr, T* out, T* lhs, T* rhs, int64 m,
 }
 
 template <typename T>
-void SingleThreadedMatMul(const void* run_options_ptr, T* out, T* lhs, T* rhs,
-                          int64 m, int64 n, int64 k, int32 transpose_lhs,
-                          int32 transpose_rhs) {
+void SingleThreadedMatMulDispatch(const void* run_options_ptr, T* out, T* lhs,
+                                  T* rhs, int64 m, int64 n, int64 k,
+                                  int32 transpose_lhs, int32 transpose_rhs) {
+  bool all_buffers_16b_aligned =
+      Is16BytesAligned(out) && Is16BytesAligned(lhs) && Is16BytesAligned(rhs);
+
+  if (!all_buffers_16b_aligned) {
+    MatMul<T, Eigen::Unaligned>(run_options_ptr, out, lhs, rhs, m, n, k,
+                                transpose_lhs, transpose_rhs);
+  }
+
   if (m == 1 || n == 1) {
     xla::EigenMatVec<T>(out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs);
   } else {
-    MatMul<T>(run_options_ptr, out, lhs, rhs, m, n, k, transpose_lhs,
-              transpose_rhs);
+    MatMul<T, Eigen::Aligned16>(run_options_ptr, out, lhs, rhs, m, n, k,
+                                transpose_lhs, transpose_rhs);
   }
 }
 
@@ -77,8 +93,8 @@ __xla_cpu_runtime_EigenSingleThreadedMatMulF16(
     const void* run_options_ptr, Eigen::half* out, Eigen::half* lhs,
     Eigen::half* rhs, int64 m, int64 n, int64 k, int32 transpose_lhs,
     int32 transpose_rhs) {
-  SingleThreadedMatMul<Eigen::half>(run_options_ptr, out, lhs, rhs, m, n, k,
-                                    transpose_lhs, transpose_rhs);
+  SingleThreadedMatMulDispatch<Eigen::half>(run_options_ptr, out, lhs, rhs, m,
+                                            n, k, transpose_lhs, transpose_rhs);
 }
 
 TF_ATTRIBUTE_NO_SANITIZE_MEMORY void
@@ -87,8 +103,8 @@ __xla_cpu_runtime_EigenSingleThreadedMatMulF32(const void* run_options_ptr,
                                                float* rhs, int64 m, int64 n,
                                                int64 k, int32 transpose_lhs,
                                                int32 transpose_rhs) {
-  SingleThreadedMatMul<float>(run_options_ptr, out, lhs, rhs, m, n, k,
-                              transpose_lhs, transpose_rhs);
+  SingleThreadedMatMulDispatch<float>(run_options_ptr, out, lhs, rhs, m, n, k,
+                                      transpose_lhs, transpose_rhs);
 }
 
 TF_ATTRIBUTE_NO_SANITIZE_MEMORY void
@@ -97,6 +113,6 @@ __xla_cpu_runtime_EigenSingleThreadedMatMulF64(const void* run_options_ptr,
                                                double* rhs, int64 m, int64 n,
                                                int64 k, int32 transpose_lhs,
                                                int32 transpose_rhs) {
-  SingleThreadedMatMul<double>(run_options_ptr, out, lhs, rhs, m, n, k,
-                               transpose_lhs, transpose_rhs);
+  SingleThreadedMatMulDispatch<double>(run_options_ptr, out, lhs, rhs, m, n, k,
+                                       transpose_lhs, transpose_rhs);
 }
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
index efccadedf27181a4cddf4f1dc3610f7c6db1d821..f7b64738b7b314b56f4ae60336d9c85c90287219 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
@@ -116,13 +116,26 @@ SimpleOrcJIT::SimpleOrcJIT(const llvm::TargetOptions& target_options,
                 orc_jit_memory_mapper::GetInstance());
             result.Resolver = symbol_resolver_;
             return result;
+          },
+          /*NotifyLoaded=*/
+          llvm::orc::LegacyRTDyldObjectLinkingLayer::NotifyLoadedFtor(),
+          /*NotifyFinalized=*/
+          [this](VModuleKeyT, const llvm::object::ObjectFile& object,
+                 const llvm::RuntimeDyld::LoadedObjectInfo& object_info) {
+            this->NotifyObjectFinalized(object, object_info);
+          },
+          /*NotifyFreed=*/
+          [this](VModuleKeyT, const llvm::object::ObjectFile& object) {
+            this->NotifyObjectFreed(object);
           }),
       compile_layer_(object_layer_,
                      CompilerFunctor(target_machine_.get(), &disassembler_,
                                      opt_level, optimize_for_size,
                                      enable_fast_math, disable_expensive_passes,
                                      std::move(pre_optimization_hook),
-                                     std::move(post_optimization_hook))) {
+                                     std::move(post_optimization_hook))),
+      gdb_jit_event_listener_(
+          llvm::JITEventListener::createGDBRegistrationListener()) {
   VLOG(1) << "CPU target: " << target_machine_->getTargetCPU().str()
           << " features: " << target_machine_->getTargetFeatureString().str();
 }
@@ -139,7 +152,7 @@ llvm::JITSymbol SimpleOrcJIT::ResolveRuntimeSymbol(const std::string& name) {
   }
 
   if (func_addr == nullptr) {
-    VLOG(2) << "Unable to resolve runtime symbol: " << name;
+    LOG(ERROR) << "Unable to resolve runtime symbol: " << name;
     return nullptr;
   }
   llvm::JITEvaluatedSymbol symbol_info(reinterpret_cast<uint64_t>(func_addr),
@@ -147,6 +160,20 @@ llvm::JITSymbol SimpleOrcJIT::ResolveRuntimeSymbol(const std::string& name) {
   return symbol_info;
 }
 
+void SimpleOrcJIT::NotifyObjectFinalized(
+    const llvm::object::ObjectFile& object,
+    const llvm::RuntimeDyld::LoadedObjectInfo& object_info) {
+  uint64_t key = static_cast<uint64_t>(
+      reinterpret_cast<uintptr_t>(object.getData().data()));
+  gdb_jit_event_listener_->notifyObjectLoaded(key, object, object_info);
+}
+
+void SimpleOrcJIT::NotifyObjectFreed(const llvm::object::ObjectFile& object) {
+  uint64_t key = static_cast<uint64_t>(
+      reinterpret_cast<uintptr_t>(object.getData().data()));
+  gdb_jit_event_listener_->notifyFreeingObject(key);
+}
+
 SimpleOrcJIT::VModuleKeyT SimpleOrcJIT::AddModule(
     std::unique_ptr<llvm::Module> module) {
   auto key = execution_session_.allocateVModule();
@@ -213,18 +240,7 @@ bool RegisterKnownJITSymbols() {
   REGISTER_CPU_RUNTIME_SYMBOL(ParallelForkJoin);
   REGISTER_CPU_RUNTIME_SYMBOL(ReleaseInfeedBufferAfterDequeue);
   REGISTER_CPU_RUNTIME_SYMBOL(ReleaseOutfeedBufferAfterPopulation);
-  REGISTER_CPU_RUNTIME_SYMBOL(KeyValueSortPRED);
-  REGISTER_CPU_RUNTIME_SYMBOL(KeyValueSortS8);
-  REGISTER_CPU_RUNTIME_SYMBOL(KeyValueSortU8);
-  REGISTER_CPU_RUNTIME_SYMBOL(KeyValueSortS16);
-  REGISTER_CPU_RUNTIME_SYMBOL(KeyValueSortU16);
-  REGISTER_CPU_RUNTIME_SYMBOL(KeyValueSortF16);
-  REGISTER_CPU_RUNTIME_SYMBOL(KeyValueSortS32);
-  REGISTER_CPU_RUNTIME_SYMBOL(KeyValueSortU32);
-  REGISTER_CPU_RUNTIME_SYMBOL(KeyValueSortF32);
-  REGISTER_CPU_RUNTIME_SYMBOL(KeyValueSortS64);
-  REGISTER_CPU_RUNTIME_SYMBOL(KeyValueSortU64);
-  REGISTER_CPU_RUNTIME_SYMBOL(KeyValueSortF64);
+  REGISTER_CPU_RUNTIME_SYMBOL(KeyValueSort);
 
   registry->Register("__gnu_f2h_ieee", reinterpret_cast<void*>(__gnu_f2h_ieee));
   registry->Register("__gnu_h2f_ieee", reinterpret_cast<void*>(__gnu_h2f_ieee));
@@ -296,6 +312,9 @@ bool RegisterKnownJITSymbols() {
   REGISTER_LIBM_SYMBOL(sin, double (*)(double));
 #ifdef __APPLE__
   REGISTER_LIBM_SYMBOL(__sincos, void (*)(double, double*, double*));
+  registry->Register("__sincosf_stret",
+                     reinterpret_cast<void*>(__sincosf_stret));
+  registry->Register("__sincos_stret", reinterpret_cast<void*>(__sincos_stret));
 #else
   REGISTER_LIBM_SYMBOL(sincos, void (*)(double, double*, double*));
 #endif
@@ -311,6 +330,18 @@ bool RegisterKnownJITSymbols() {
   registry->Register("memcpy", reinterpret_cast<void*>(memcpy));
   registry->Register("memmove", reinterpret_cast<void*>(memmove));
   registry->Register("memset", reinterpret_cast<void*>(memset));
+
+#ifdef __APPLE__
+  registry->Register("__bzero", reinterpret_cast<void*>(bzero));
+  registry->Register("memset_pattern16",
+                     reinterpret_cast<void*>(memset_pattern16));
+#endif
+
+#ifdef MEMORY_SANITIZER
+  registry->Register("__msan_unpoison",
+                     reinterpret_cast<void*>(__msan_unpoison));
+#endif
+
   return true;
 }
 
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
index 78406ba143570183aea09d79db3f9b708c21bf70..3307c2f93d796bbdcd49af7f68e9f6c388e402ca 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include <vector>
 
 #include "llvm/ADT/Triple.h"
+#include "llvm/ExecutionEngine/JITEventListener.h"
 #include "llvm/ExecutionEngine/Orc/Core.h"
 #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
 #include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
@@ -99,6 +100,11 @@ class SimpleOrcJIT {
  private:
   llvm::JITSymbol ResolveRuntimeSymbol(const std::string& name);
 
+  void NotifyObjectFinalized(
+      const llvm::object::ObjectFile& object,
+      const llvm::RuntimeDyld::LoadedObjectInfo& object_info);
+  void NotifyObjectFreed(const llvm::object::ObjectFile& object);
+
   std::vector<VModuleKeyT> module_keys_;
   std::unique_ptr<llvm::TargetMachine> target_machine_;
   const Disassembler disassembler_;
@@ -107,6 +113,15 @@ class SimpleOrcJIT {
   std::shared_ptr<llvm::orc::SymbolResolver> symbol_resolver_;
   ObjLayerT object_layer_;
   CompileLayerT compile_layer_;
+
+  // Non owning pointer to a JIT event listener that registers the JIT events
+  // with an attached GDB.
+  //
+  // Note: we get a pointer to this event listener using
+  // `createGDBRegistrationListener` which makes it look like we're supposed to
+  // free this, but the function is poorly named and really just returns a
+  // pointer to a static object.
+  llvm::JITEventListener* gdb_jit_event_listener_;
 };
 
 }  // namespace cpu
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc
index f8f5f392da8ab3348e63185aecf7b639daacaa42..8b7f843582b697058fe328fe69990122d868ada4 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc
@@ -16,7 +16,6 @@ limitations under the License.
 // Tests that we call into Eigen for dot operations as needed.
 
 #include <algorithm>
-#include <cctype>
 #include <string>
 
 #include "absl/strings/str_cat.h"
@@ -102,10 +101,10 @@ std::vector<DotTestSpec> GetDotTestCases() {
   return result;
 }
 
-INSTANTIATE_TEST_CASE_P(CpuEigenDotOperationTestInstantiation,
-                        CpuEigenDotOperationTest,
-                        ::testing::ValuesIn(GetDotTestCases()),
-                        DotTestSpecToString);
+INSTANTIATE_TEST_SUITE_P(CpuEigenDotOperationTestInstantiation,
+                         CpuEigenDotOperationTest,
+                         ::testing::ValuesIn(GetDotTestCases()),
+                         DotTestSpecToString);
 
 }  // namespace
 }  // namespace cpu
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_infeed_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_infeed_test.cc
index 5cc6d01c0f15d4209cbc1fb259a0078fb9957f6e..f0f897e9635600b22e0c389ba056899e4d6ab3d4 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_infeed_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_infeed_test.cc
@@ -48,7 +48,7 @@ class InfeedTest : public ClientLibraryTestBase {
     ASSERT_IS_OK(client_->TransferToInfeed(literal));
     XlaBuilder builder(TestName());
     Infeed(&builder, literal.shape());
-    if (ShapeUtil::IsTuple(literal.shape())) {
+    if (literal.shape().IsTuple()) {
       // TODO(b/30609564): Use ComputeAndCompareLiteral instead.
       ComputeAndCompareTuple(&builder, literal, {});
     } else {
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc
index 9b10c49f4f547edfb2164f98c49cceb031148bdc..9078b8fd1ff6cb0ddac89d5fcd13a9ccfae07763 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc
@@ -14,9 +14,9 @@ limitations under the License.
 ==============================================================================*/
 
 #include <algorithm>
-#include <cctype>
 #include <string>
 
+#include "absl/strings/ascii.h"
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h"
 #include "tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h"
@@ -59,8 +59,9 @@ class CpuUnaryIntrinsicTest
 
     string features{spec.features.data(), spec.features.size()};
     if (!features.empty()) {
-      std::replace_if(features.begin(), features.end(),
-                      [](char c) { return c != '_' && !isalnum(c); }, '_');
+      std::replace_if(
+          features.begin(), features.end(),
+          [](char c) { return c != '_' && !absl::ascii_isalnum(c); }, '_');
     } else {
       features = "";
     }
@@ -140,10 +141,10 @@ IntrinsicTestSpec CpuUnaryIntrinsicTestCases[] = {
         HloOpcode::kLog, kTriple_android_arm, "",
         R"(CHECK: fadd fast <4 x float> <float 0x3FBDE4A340000000, float 0x3FBDE4A340000000, float 0x3FBDE4A340000000, float 0x3FBDE4A340000000>)"}};
 
-INSTANTIATE_TEST_CASE_P(CpuUnaryIntrinsicTestInstantiation,
-                        CpuUnaryIntrinsicTest,
-                        ::testing::ValuesIn(CpuUnaryIntrinsicTestCases),
-                        CpuUnaryIntrinsicTest::Name);
+INSTANTIATE_TEST_SUITE_P(CpuUnaryIntrinsicTestInstantiation,
+                         CpuUnaryIntrinsicTest,
+                         ::testing::ValuesIn(CpuUnaryIntrinsicTestCases),
+                         CpuUnaryIntrinsicTest::Name);
 
 }  // namespace
 }  // namespace cpu
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_key_value_sort_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_key_value_sort_test.cc
index 3934c03a04c978009282b3cd0d39bacf9b12a356..762ee67db9a1b2a753c6ec5538dee1d13282942e 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_key_value_sort_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_key_value_sort_test.cc
@@ -26,10 +26,16 @@ TEST_F(CpuKeyValueSortTest, SortR1) {
   const string hlo_text = R"(
 HloModule KeyValueSort
 
+compare {
+  p.0.lhs = f32[] parameter(0)
+  p.0.rhs = f32[] parameter(1)
+  ROOT lt = pred[] less-than(p.0.lhs, p.0.rhs)
+}
+
 ENTRY main {
   a = f32[10] parameter(0)
 
-  ROOT result = f32[10] sort(f32[10] a), dimensions={0}
+  ROOT result = f32[10] sort(f32[10] a), dimensions={0}, to_apply=compare
 }
 )";
 
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc
index fa0e09ff6b5694c0e97963b83c6e541b858a1376..0584c0484f810a03ccccd522163f54535440ef8b 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc
@@ -31,29 +31,27 @@ HloModule RepeatedConstants
 while_body {
   arg_body = f32[2,3,2] parameter(0)
   ROOT const = f32[2,3,2] constant(
-  f32[2,3,2]
     {{{1, 2}, {1001, 1002}, {2001, 2002}},
      {{2, 1}, {2001, 3002}, {2001, 2002}}})
 }
 
 while_cond {
   arg_cond = f32[2,3,2] parameter(0)
-  token = token[] after-all()
-  infeed = (pred[], token[]) infeed(token)
+  token0 = token[] after-all()
+  infeed = (pred[], token[]) infeed(token0)
   ROOT unknown = pred[] get-tuple-element((pred[], token[]) infeed), index=0
 }
 
 ENTRY main {
   param = f32[2,3,2] parameter(0)
   const_a = f32[2,3,2] constant(
-  f32[2,3,2]
     {{{1, 2}, {1001, 1002}, {2001, 2002}},
      {{2, 1}, {2001, 3002}, {2001, 2002}}})
   const_b = f32[2,3,2] while(f32[2,3,2] const_a), condition=while_cond, body=while_body
 
-  token = token[] after-all()
-  out0 = token[] outfeed(f32[2,3,2] const_a, token[] token)
-  ROOT out1 = token[] outfeed(f32[2,3,2] const_b, token[] token)
+  token0 = token[] after-all()
+  out0 = token[] outfeed(f32[2,3,2] const_a, token[] token0)
+  ROOT out1 = token[] outfeed(f32[2,3,2] const_b, token[] token0)
 }
 )";
 
@@ -82,24 +80,24 @@ HloModule RepeatedConstants
 
 while_body {
   arg_body = (f32[2,1]{1,0}, f32[1]{0}) parameter(0)
-  ROOT const = (f32[2,1]{1,0}, f32[1]{0}) constant((f32[2,1], f32[1]) ( f32[2,1] { { 1 }, { 2 } }, {2} ))
+  ROOT const = (f32[2,1]{1,0}, f32[1]{0}) constant(({ { 1 }, { 2 } }, {2} ))
 }
 
 while_cond {
   arg_cond = (f32[2,1]{1,0}, f32[1]{0}) parameter(0)
-  token = token[] after-all()
-  infeed = (pred[], token[]) infeed(token)
+  token0 = token[] after-all()
+  infeed = (pred[], token[]) infeed(token0)
   ROOT unknown = pred[] get-tuple-element((pred[], token[]) infeed), index=0
 }
 
 ENTRY main {
   param = f32[2,3,2] parameter(0)
-  const_a = (f32[2,1]{1,0}, f32[1]{0}) constant((f32[2,1], f32[1]) ( f32[2,1] { { 1 }, { 2 } }, {2} ))
+  const_a = (f32[2,1]{1,0}, f32[1]{0}) constant(( { { 1 }, { 2 } }, {2} ))
   const_b = (f32[2,1]{1,0}, f32[1]{0}) while((f32[2,1]{1,0}, f32[1]{0}) const_a), condition=while_cond, body=while_body
 
-  token = token[] after-all()
-  out0 = () outfeed((f32[2,1]{1,0}, f32[1]{0}) const_a, token[] token)
-  ROOT out1 = () outfeed((f32[2,1]{1,0}, f32[1]{0}) const_b, token[] token)
+  token0 = token[] after-all()
+  out0 = () outfeed((f32[2,1]{1,0}, f32[1]{0}) const_a, token[] token0)
+  ROOT out1 = () outfeed((f32[2,1]{1,0}, f32[1]{0}) const_b, token[] token0)
 }
 )";
 
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc
index a7702c2aeeaff8a46a2c4f2785ccb873ea2c08e5..030bd41c2fc73eac41fe43c1acdf862d5dc97f98 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc
@@ -75,8 +75,9 @@ TEST_F(CpuNoAliasTest, Concat) {
   // the buffers in the HLO module.  We'll inspect these loads to ensure that
   // they have the expected alias information.
   llvm::Module ir_module("test", context);
-  llvm::Function* func = llvm::cast<llvm::Function>(
-      ir_module.getOrInsertFunction("test_fn", llvm::Type::getVoidTy(context)));
+  llvm::Function* func = llvm::dyn_cast<llvm::Function>(
+      ir_module.getOrInsertFunction("test_fn", llvm::Type::getVoidTy(context))
+          .getCallee());
   llvm::BasicBlock* bb = llvm::BasicBlock::Create(context, "body", func);
   llvm::IRBuilder<> b(bb);
   auto* zero = llvm::ConstantInt::get(llvm::Type::getInt32Ty(context), 0);
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc
index e2c7af541eede5265f274c72f55305549f059839..aab7f0b393881642437f1891256bd138823a3b87 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc
@@ -28,12 +28,11 @@ HloModule Outfeed
 
 ENTRY main {
   const_a = f32[2,3,2] constant(
-  f32[2,3,2]
     {{{1, 2}, {1001, 1002}, {2001, 2002}},
      {{2, 1}, {2001, 3002}, {2001, 2002}}})
 
-  token = token[] after-all()
-  outfeed = token[] outfeed(f32[2,3,2] const_a, token)
+  token0 = token[] after-all()
+  outfeed = token[] outfeed(f32[2,3,2] const_a, token0)
   ROOT root = () tuple()
 }
 )";
diff --git a/tensorflow/compiler/xla/service/cpu/tiled_dot_emitter.cc b/tensorflow/compiler/xla/service/cpu/tiled_dot_emitter.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9fc472ff767441e60cf618ac9022e5c50ea20023
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/tiled_dot_emitter.cc
@@ -0,0 +1,1073 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/cpu/tiled_dot_emitter.h"
+
+#include "tensorflow/compiler/xla/service/cpu/vector_support_library.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+
+namespace xla {
+namespace cpu {
+namespace {
+
+using tensorflow::int64;
+
+// Provides tiled access to an in-memory rank 2 array.
+class MemoryTile {
+ public:
+  // Constructs a MemoryTile that can operate on tiles consisting of
+  // `tile_size_along_major_dim` vectors from the matrix `matrix`, starting at
+  // `major_dim_offset` in the major dimension.  The tile size along the minor
+  // dimension is the vector size, and that is implicitly determined by `vsl`.
+  MemoryTile(VectorSupportLibrary* vsl, llvm::IRBuilder<>* b,
+             llvm::Value* matrix, int64 matrix_size_along_minor_dim,
+             llvm::Value* major_dim_offset, int64 tile_size_along_major_dim)
+      : vsl_(vsl), b_(b) {
+    pointers_.reserve(tile_size_along_major_dim);
+    for (int64 i = 0; i < tile_size_along_major_dim; i++) {
+      llvm::Value* total_offset =
+          b->CreateMul(b->getInt64(matrix_size_along_minor_dim),
+                       b->CreateAdd(b->getInt64(i), major_dim_offset));
+      pointers_.push_back(vsl_->ComputeOffsetPointer(matrix, total_offset));
+    }
+  }
+
+  // Load a tile consisting of `tile_size_along_major_dim` vectors from position
+  // {major: `major_dim_offset`, minor: `minor_dim_offset`}.
+  //
+  // Note: `major_dim_offset` is a parameter to the constructor.
+  std::vector<llvm::Value*> LoadTile(llvm::Value* minor_dim_offset) const {
+    std::vector<llvm::Value*> result;
+    result.reserve(pointers_.size());
+    for (const auto& pointer : pointers_) {
+      result.push_back(vsl_->LoadVector(pointer, minor_dim_offset));
+    }
+    return result;
+  }
+
+  // Stores `tile` to position {major: `major_dim_offset`, minor:
+  // `minor_dim_offset`}.
+  //
+  // Note: `major_dim_offset` is a parameter to the constructor.
+  void StoreTile(absl::Span<llvm::Value* const> tile,
+                 llvm::Value* minor_dim_offset) const {
+    CHECK_EQ(tile.size(), pointers_.size());
+    for (int64 i = 0; i < pointers_.size(); i++) {
+      vsl_->StoreVector(tile[i], pointers_[i], minor_dim_offset);
+    }
+  }
+
+  // Loads a tile of size [`tile_size_along_major_dim`,
+  // `tile_size_along_middle_dim`] from position {major: `major_dim_offset`,
+  // minor: `minor_dim_offset`} and then broadcasts each element into a vector
+  // of size vsl_.vector_size().  The (i,j)'th element of the return value is
+  // the (i,j)'th element in the tile broadcasted into an LLVM vector.
+  //
+  // Note: `major_dim_offset` is a parameter to the constructor.
+  std::vector<std::vector<llvm::Value*>> LoadBroadcastTile(
+      llvm::Value* minor_dim_offset, int64 tile_size_along_middle_dim) const {
+    std::vector<std::vector<llvm::Value*>> result;
+    result.resize(pointers_.size());
+    for (int64 i = 0; i < pointers_.size(); i++) {
+      for (int64 j = 0; j < tile_size_along_middle_dim; j++) {
+        result[i].push_back(vsl_->LoadBroadcast(
+            pointers_[i], b_->CreateAdd(minor_dim_offset, b_->getInt64(j))));
+      }
+    }
+    return result;
+  }
+
+ private:
+  VectorSupportLibrary* vsl_;
+  llvm::IRBuilder<>* b_;
+  std::vector<llvm::Value*> pointers_;
+};
+
+// The base class for the classes representing the GEMV emitter configurations.
+//
+// The IR emitted (modulo the LLVM values representing the input and output
+// buffers) by the row major and column major GEMV emitters should be a function
+// of their configuration.  This is important because their configuration is
+// used as a key to cache the generated IR.
+class GemvConfig {
+ public:
+  // Mixin for convenience.
+  template <typename T>
+  struct User {
+   public:
+    PrimitiveType scalar_type() const {
+      return derived().config().scalar_type();
+    }
+    int64 tile_rows() const { return derived().config().tile_rows(); }
+    int64 tile_cols() const { return derived().config().tile_cols(); }
+    int64 m() const { return derived().config().m(); }
+    int64 k() const { return derived().config().k(); }
+    int64 has_addend() const { return derived().config().has_addend(); }
+
+   private:
+    const T& derived() const { return *static_cast<const T*>(this); }
+  };
+
+  PrimitiveType scalar_type() const { return scalar_type_; }
+  int64 tile_rows() const { return tile_rows_; }
+  int64 tile_cols() const { return tile_cols_; }
+  int64 m() const { return m_; }
+  int64 k() const { return k_; }
+  bool has_addend() const { return has_addend_; }
+
+  string GetCacheKey() const {
+    return absl::StrCat(name_, "_", PrimitiveType_Name(scalar_type()), "_",
+                        tile_rows(), "_", tile_cols(), "_", m(), "_", k(),
+                        has_addend() ? "_with_addend" : "");
+  }
+
+ protected:
+  explicit GemvConfig(string name, PrimitiveType scalar_type, int64 tile_rows,
+                      int64 tile_cols, int64 m, int64 k, bool has_addend)
+      : name_(std::move(name)),
+        scalar_type_(scalar_type),
+        tile_rows_(tile_rows),
+        tile_cols_(tile_cols),
+        m_(m),
+        k_(k),
+        has_addend_(has_addend) {}
+
+ private:
+  string name_;
+  PrimitiveType scalar_type_;
+  int64 tile_rows_;
+  int64 tile_cols_;
+  int64 m_;
+  int64 k_;
+  bool has_addend_;
+};
+
+// Computes a dot product between "[M,K]{0,1} lhs" with a [K,1] vector (the
+// layout of the vector does not matter).  This implementation uses a tiling
+// scheme to improve performance.
+//
+// We logically separate the LHS matrix into four segments:
+//
+//   +----------------------+---+
+//   |                      |   |
+//   |                      |   |
+//   |         A            | B |
+//   |                      |   |
+//   |                      |   |
+//   |                      |   |
+//   +----------------------+---+
+//   |         C            | D |
+//   +----------------------+---+
+//
+// where A is the largest submatrix of the LHS that can be evenly dividied into
+// tiles.  For each tile in A, assuming tile_rows_ == tile_cols_ == 4, we have:
+//
+//   +---+---+---+---+       +--+--+--+--+
+//   |M00|M10|M20|M30|       |V0|V1|V2|V3|
+//   +---+---+---+---+       +--+--+--+--+
+//   |M01|M11|M21|M31| and   |V0|V1|V2|V3|
+//   +---+---+---+---+       +--+--+--+--+
+//   |M02|M12|M22|M32|       |V0|V1|V2|V3|
+//   +---+---+---+---+       +--+--+--+--+
+//   |M03|M13|M23|M33|       |V0|V1|V2|V3|
+//   +---+---+---+---+       +--+--+--+--+
+//
+// (Legend: rows are horizontal and columns are vertical; and each column is one
+// llvm::Value of a vector type)
+//
+// where:
+//
+//   a. The left tile is from the column major left matrix.
+//   b. The right tile is an elementwise broadcast of a [V0, V1, V2, V3]
+//      vector loaded from the RHS vector.
+//
+// As we iterate through the column dimension, we compute the change to the
+// result vector by an elementwise multiplication between the two tiles above
+// followed by a reduction along the major dimension:
+//
+//                     +-----------------------------------+
+//                     | M00*V0 + M10*V1 + M20*V2 + M30*V3 |
+//                     +-----------------------------------+
+//                     | M01*V0 + M11*V1 + M21*V2 + M31*V3 |
+// Result[R:R+4] +=    +-----------------------------------+
+//                     | M02*V0 + M12*V1 + M22*V2 + M32*V3 |
+//                     +-----------------------------------+
+//                     | M03*V0 + M13*V1 + M23*V2 + M33*V3 |
+//                     +-----------------------------------+
+//
+// Where R is the starting row for the tile.
+//
+// We have an inner epilogue loop to deal with the "C" submatrix and an outer
+// epilogue loop to deal with the B,D submarix.
+//
+// TODO(sanjoy): We should investigate if using gather loads and scatter stores
+// can be used here have the same inner loop for both column-major and row-major
+// matrix-vector products.
+class ColumnMajorMatrixVectorProductEmitter
+    : public GemvConfig::User<ColumnMajorMatrixVectorProductEmitter> {
+ public:
+  class Config : public GemvConfig {
+   public:
+    explicit Config(PrimitiveType scalar_type, int64 tile_rows, int64 tile_cols,
+                    int64 m, int64 k, bool has_addend)
+        : GemvConfig(/*name=*/"col_major_gemv", scalar_type,
+                     /*tile_rows=*/tile_rows, /*tile_cols=*/tile_cols, /*m=*/m,
+                     /*k=*/k, /*has_addend=*/has_addend) {}
+  };
+
+  ColumnMajorMatrixVectorProductEmitter(const Config& config, llvm::Value* lhs,
+                                        llvm::Value* rhs, llvm::Value* addend,
+                                        llvm::Value* result,
+                                        llvm::IRBuilder<>* b)
+      : config_(config),
+        lhs_(lhs),
+        rhs_(rhs),
+        addend_(addend),
+        result_(result),
+        b_(b),
+        ksl_(b_),
+        vsl_(config.scalar_type(), /*vector_size=*/config.tile_rows(), b_, "") {
+    CHECK(tile_rows() > 0 && IsPowerOfTwo(static_cast<uint64>(tile_rows())));
+    CHECK(!has_addend() || addend != nullptr);
+  }
+
+  void Emit();
+
+  const Config& config() const { return config_; }
+
+ private:
+  void EmitOuterLoopBody(llvm::Value* column, int64 column_count,
+                         bool is_first_column);
+
+  MemoryTile GetLhsMemoryTile(llvm::Value* column_start, int64 column_count) {
+    return MemoryTile(&vsl_, b_, /*matrix=*/lhs_,
+                      /*matrix_size_along_minor_dim=*/m(),
+                      /*major_dim_offset=*/column_start,
+                      /*tile_size_along_major_dim=*/column_count);
+  }
+
+  // Load a tile of values from the RHS.  For the RHS a "tile" is a contiguous
+  // sequence of `count` values, each one broadcasted to the vector width.
+  std::vector<llvm::Value*> LoadRhsTile(llvm::Value* offset, int64 count) {
+    llvm::Value* base_pointer = vsl_.ComputeOffsetPointer(rhs_, offset);
+    std::vector<llvm::Value*> result;
+    result.reserve(count);
+    for (int64 i = 0; i < count; i++) {
+      result.push_back(vsl_.LoadBroadcast(base_pointer, i));
+    }
+    return result;
+  }
+
+  void EmitInnerLoopTiled(MemoryTile* lhs_memory_tile,
+                          const std::vector<llvm::Value*>& rhs_tile,
+                          int64 columns, bool is_first_column);
+
+  void EmitInnerLoopEpilogue(llvm::Value* current_tile_col, int64 columns,
+                             bool is_first_tiled_column);
+
+  Config config_;
+  llvm::Value* lhs_;
+  llvm::Value* rhs_;
+  llvm::Value* addend_;
+  llvm::Value* result_;
+  llvm::IRBuilder<>* b_;
+  KernelSupportLibrary ksl_;
+  VectorSupportLibrary vsl_;
+};
+
+void ColumnMajorMatrixVectorProductEmitter::EmitOuterLoopBody(
+    llvm::Value* column, int64 column_count, bool is_first_column) {
+  MemoryTile lhs_memory_tile = GetLhsMemoryTile(/*column_start=*/column,
+                                                /*column_count=*/column_count);
+
+  std::vector<llvm::Value*> rhs_tile =
+      LoadRhsTile(column, /*count=*/column_count);
+  EmitInnerLoopTiled(&lhs_memory_tile, rhs_tile,
+                     /*columns=*/column_count, is_first_column);
+  EmitInnerLoopEpilogue(column, /*columns=*/column_count, is_first_column);
+}
+
+void ColumnMajorMatrixVectorProductEmitter::Emit() {
+  // See the comment on the class declaration for the algorithm used here.
+  int64 column_remainder = k() % tile_cols();
+  int64 column_limit = k() - column_remainder;
+
+  ksl_.For("dot.outer.tiled",
+           /*start=*/0, /*end=*/column_limit, /*step=*/tile_cols(),
+           [&](llvm::Value* column, bool is_first_column) {
+             EmitOuterLoopBody(column, tile_cols(), is_first_column);
+           });
+
+  if (column_remainder != 0) {
+    EmitOuterLoopBody(b_->getInt64(column_limit), column_remainder,
+                      column_limit == 0);
+  }
+}
+
+void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopTiled(
+    MemoryTile* lhs_memory_tile, const std::vector<llvm::Value*>& rhs_tile,
+    int64 columns, bool is_first_column) {
+  int64 row_limit = m() - (m() % tile_rows());
+
+  ksl_.For("dot.inner.tiled", /*start=*/0, /*end=*/row_limit,
+           /*step=*/tile_rows(), [&](llvm::Value* row) {
+             std::vector<llvm::Value*> lhs_tile =
+                 lhs_memory_tile->LoadTile(/*minor_dim_offset=*/row);
+             llvm::Value* accumulator =
+                 is_first_column ? (addend_ ? vsl_.LoadVector(addend_, row)
+                                            : vsl_.GetZeroVector())
+                                 : vsl_.LoadVector(result_, row);
+             for (int i = 0; i < columns; i++) {
+               accumulator = vsl_.MulAdd(lhs_tile[i], rhs_tile[i], accumulator);
+             }
+             vsl_.StoreVector(accumulator, result_, row);
+           });
+}
+
+void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue(
+    llvm::Value* current_tile_col, int64 columns, bool is_first_tiled_column) {
+  int64 row_start = m() - (m() % tile_rows());
+  if (row_start == m()) {
+    return;
+  }
+
+  llvm::Value* columns_llvm = b_->getInt64(columns);
+
+  // for (col = current_tile_col; col < (columns + current_tile_col); col++)
+  //   for (row = row_start, row < m_; row++) {
+  //     result[row] += lhs[row, col] * rhs[col]
+  //     // Also take into account that if col is 0 then result[row] is not
+  //     // initialized.
+  //   }
+
+  ksl_.For(
+      "dot.inner.epilg.outer", /*start=*/current_tile_col,
+      /*end=*/b_->CreateAdd(columns_llvm, current_tile_col),
+      /*step=*/1, /*peel_first_iteration=*/false,
+      [&](llvm::Value* col, llvm::Value* is_first_scalar_col) {
+        llvm::Value* rhs_element = vsl_.LoadScalar(rhs_, col);
+        llvm::Value* total_offset = b_->CreateMul(col, b_->getInt64(m()));
+        llvm::Value* lhs_base_pointer =
+            vsl_.ComputeOffsetPointer(lhs_, total_offset);
+        ksl_.For(
+            "dot.inner.epilg.inner", /*start=*/row_start, /*end=*/m(),
+            /*step=*/1, [&](llvm::Value* scalar_row) {
+              llvm::Value* product = vsl_.Mul(
+                  vsl_.LoadScalar(lhs_base_pointer, scalar_row), rhs_element);
+              llvm::Value* setting_result_first_time = b_->CreateAnd(
+                  is_first_scalar_col, b_->getInt1(is_first_tiled_column));
+              ksl_.If(
+                  setting_result_first_time,
+                  /*true_block_generator=*/
+                  [&]() {
+                    if (addend_) {
+                      vsl_.StoreScalar(
+                          vsl_.Add(vsl_.LoadScalar(addend_, scalar_row),
+                                   product),
+                          result_, scalar_row);
+                    } else {
+                      vsl_.StoreScalar(product, result_, scalar_row);
+                    }
+                  },
+                  /*false_block_generator=*/
+                  [&]() {
+                    vsl_.StoreScalar(
+                        vsl_.Add(vsl_.LoadScalar(result_, scalar_row), product),
+                        result_, scalar_row);
+                  });
+            });
+      });
+}
+
+// Computes a dot product between "[M,K]{1,0} lhs" with a [K,1] vector (the
+// layout of the vector does not matter).  This implementation uses a tiling
+// scheme to improve performance.
+//
+// We logically separate the LHS matrix into four segments:
+//
+//   +----------------------+---+
+//   |                      |   |
+//   |                      |   |
+//   |         A            | B |
+//   |                      |   |
+//   |                      |   |
+//   |                      |   |
+//   +----------------------+---+
+//   |         C            | D |
+//   +----------------------+---+
+//
+// where A is the largest submatrix of the LHS that can be evenly dividied into
+// tiles.  For each tile in A, assuming tile_rows_ == tile_cols_ == 4, we have:
+//
+//   +---+---+---+---+
+//   |M00|M10|M20|M30|
+//   +---+---+---+---+       +--+--+--+--+
+//   |M01|M11|M21|M31| and   |V0|V1|V2|V3|
+//   +---+---+---+---+       +--+--+--+--+
+//   |M02|M12|M22|M32|
+//   +---+---+---+---+
+//   |M03|M13|M23|M33|
+//   +---+---+---+---+
+//
+// (Legend: rows are horizontal and columns are vertical; and each row is one
+// llvm::Value of a vector type)
+//
+// where:
+//
+//   a. The left tile is loaded from the row major left matrix.
+//   b. The right vector is loaded from the RHS vector.
+//
+// We keep 4 vector accumulators accumulating the following four vector
+// expressions as we iterate over the row dimension:
+//
+//   +------+------+------+------+
+//   |M0I*V0|M1I*V1|M2I*V2|M3I*V3|  for I in [0,4)
+//   +------+------+------+------+
+//
+// In the end we do a horizontal reduction over these 4 vector accumulators to
+// get 4 values in the result vector.
+//
+// We have an inner epilogue loop to deal with the "B" sub-matrix and an outer
+// epilogue loop to deal with the C,D submatrix.
+class RowMajorMatrixVectorProductEmitter
+    : public GemvConfig::User<RowMajorMatrixVectorProductEmitter> {
+ public:
+  class Config : public GemvConfig {
+   public:
+    explicit Config(PrimitiveType scalar_type, int64 tile_rows, int64 tile_cols,
+                    int64 m, int64 k, bool has_addend)
+        : GemvConfig(/*name=*/"row_major_gemv", scalar_type,
+                     /*tile_rows=*/tile_rows, /*tile_cols=*/tile_cols, /*m=*/m,
+                     /*k=*/k, /*has_addend=*/has_addend) {}
+  };
+
+  RowMajorMatrixVectorProductEmitter(const Config& config, llvm::Value* lhs,
+                                     llvm::Value* rhs, llvm::Value* addend,
+                                     llvm::Value* result, llvm::IRBuilder<>* b)
+      : config_(config),
+        lhs_(lhs),
+        rhs_(rhs),
+        addend_(addend),
+        result_(result),
+        b_(b),
+        ksl_(b_),
+        vsl_(scalar_type(), /*vector_size=*/tile_cols(), b_, "") {
+    CHECK(tile_cols() > 0 && IsPowerOfTwo(static_cast<uint64>(tile_cols())));
+    CHECK(!has_addend() || addend != nullptr);
+  }
+
+  void Emit();
+
+  const Config& config() const { return config_; }
+
+ private:
+  MemoryTile GetLhsMemoryTile(llvm::Value* row_start, int64 row_count) {
+    return MemoryTile(&vsl_, b_, /*matrix=*/lhs_,
+                      /*matrix_size_along_minor_dim=*/k(),
+                      /*major_dim_offset=*/row_start,
+                      /*tile_size_along_major_dim=*/row_count);
+  }
+
+  void EmitOuterLoopBody(llvm::Value* row, int64 row_count);
+
+  void EmitInnerLoopTiled(MemoryTile* lhs_memory_tile, int64 rows,
+                          std::vector<VectorVariable>* vector_accumulators);
+
+  void EmitInnerLoopEpilogue(llvm::Value* current_tile_row, int64 rows,
+                             std::vector<ScalarVariable>* scalar_accumulators);
+
+  Config config_;
+  llvm::Value* lhs_;
+  llvm::Value* rhs_;
+  llvm::Value* addend_;
+  llvm::Value* result_;
+  llvm::IRBuilder<>* b_;
+  KernelSupportLibrary ksl_;
+  VectorSupportLibrary vsl_;
+};
+
+void RowMajorMatrixVectorProductEmitter::EmitOuterLoopBody(llvm::Value* row,
+                                                           int64 row_count) {
+  MemoryTile lhs_memory_tile = GetLhsMemoryTile(/*row_start=*/row,
+                                                /*row_count=*/row_count);
+  std::vector<VectorVariable> vector_accumulators;
+  std::vector<ScalarVariable> scalar_accumulators;
+  for (int i = 0; i < row_count; i++) {
+    vector_accumulators.emplace_back(&vsl_, vsl_.GetZeroVector());
+    scalar_accumulators.emplace_back(&vsl_, vsl_.GetZeroScalar());
+  }
+  EmitInnerLoopTiled(&lhs_memory_tile, /*rows=*/row_count,
+                     &vector_accumulators);
+  EmitInnerLoopEpilogue(/*current_tile_row=*/row, /*rows=*/row_count,
+                        &scalar_accumulators);
+
+  std::vector<llvm::Value*> accumulator_values;
+  std::transform(
+      vector_accumulators.begin(), vector_accumulators.end(),
+      std::back_inserter(accumulator_values),
+      [](const VectorVariable& vector_var) { return vector_var.Get(); });
+
+  std::vector<llvm::Value*> horizontal_sums;
+  if (row_count == vsl_.vector_size()) {
+    if (addend_) {
+      horizontal_sums = vsl_.ComputeHorizontalSums(
+          std::move(accumulator_values), vsl_.LoadVector(addend_, row));
+    } else {
+      horizontal_sums =
+          vsl_.ComputeHorizontalSums(std::move(accumulator_values));
+    }
+  } else {
+    horizontal_sums = vsl_.ComputeHorizontalSums(std::move(accumulator_values));
+  }
+
+  for (int i = 0; i < row_count; i++) {
+    llvm::Value* result_value =
+        vsl_.Add(horizontal_sums[i], scalar_accumulators[i].Get());
+    llvm::Value* offset = b_->CreateAdd(b_->getInt64(i), row);
+    if (addend_ && row_count != vsl_.vector_size()) {
+      result_value = vsl_.Add(vsl_.LoadScalar(addend_, offset), result_value);
+    }
+    vsl_.StoreScalar(result_value, result_, offset);
+  }
+}
+
+void RowMajorMatrixVectorProductEmitter::Emit() {
+  // See the comment on the class declaration for the algorithm used here.
+  int64 row_remainder = m() % tile_rows();
+  int64 row_limit = m() - row_remainder;
+
+  ksl_.For("dot.outer.tiled",
+           /*start=*/0, /*end=*/row_limit, /*step=*/tile_rows(),
+           [&](llvm::Value* row) { EmitOuterLoopBody(row, tile_rows()); });
+
+  if (row_remainder != 0) {
+    EmitOuterLoopBody(b_->getInt64(row_limit), row_remainder);
+  }
+}
+
+void RowMajorMatrixVectorProductEmitter::EmitInnerLoopTiled(
+    MemoryTile* lhs_memory_tile, int64 rows,
+    std::vector<VectorVariable>* vector_accumulators) {
+  int64 column_limit = k() - (k() % tile_cols());
+
+  ksl_.For("dot.inner.tiled", /*start=*/0, /*end=*/column_limit,
+           /*step=*/tile_cols(), [&](llvm::Value* col) {
+             std::vector<llvm::Value*> lhs_tile =
+                 lhs_memory_tile->LoadTile(/*minor_dim_offset=*/col);
+             llvm::Value* rhs_value = vsl_.LoadVector(rhs_, col);
+             for (int i = 0; i < rows; i++) {
+               llvm::Value* old_sum = (*vector_accumulators)[i].Get();
+               (*vector_accumulators)[i].Set(
+                   vsl_.Add(old_sum, vsl_.Mul(rhs_value, lhs_tile[i])));
+             }
+           });
+}
+
+void RowMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue(
+    llvm::Value* current_tile_row, int64 rows,
+    std::vector<ScalarVariable>* scalar_accumulators) {
+  int64 column_start = k() - (k() % tile_cols());
+  if (column_start == k()) {
+    return;
+  }
+
+  for (int r = 0; r < rows; r++) {
+    llvm::Value* total_offset = b_->CreateMul(
+        b_->CreateAdd(b_->getInt64(r), current_tile_row), b_->getInt64(k()));
+    llvm::Value* lhs_base_pointer =
+        vsl_.ComputeOffsetPointer(lhs_, total_offset);
+    ksl_.For("dot.inner.epilg.inner", /*start=*/column_start, /*end=*/k(),
+             /*step=*/1, [&](llvm::Value* scalar_col) {
+               llvm::Value* product =
+                   vsl_.Mul(vsl_.LoadScalar(lhs_base_pointer, scalar_col),
+                            vsl_.LoadScalar(rhs_, scalar_col));
+               llvm::Value* old_value = (*scalar_accumulators)[r].Get();
+               (*scalar_accumulators)[r].Set(vsl_.Add(old_value, product));
+             });
+  }
+}
+
+// This class implements a tiled matrix multiplication algorithm, intended for
+// multiplying small matrices that don't need cache tiling.
+//
+// In the future this can be used as the innermost GEBP loop in a GEMM kernel as
+// described in "Goto, Kazushige, and Robert A. Geijn. "Anatomy of
+// high-performance matrix multiplication." ACM Transactions on Mathematical
+// Software (TOMS) 34.3 (2008): 12.".
+//
+// This only supports canonical dot operations (i.e. where the lhs contraction
+// dimension is 1 and the rhs contraction dimension is 0) over row major
+// matrices.
+class TiledSmallGemmEmitter {
+ public:
+  // Describe the dimensions of the kernel.
+  class Dimensions {
+   public:
+    explicit Dimensions(int64 m, int64 k, int64 n) : m_(m), k_(k), n_(n) {}
+
+    int64 m() const { return m_; }
+    int64 k() const { return k_; }
+    int64 n() const { return n_; }
+
+    string ToString() const { return absl::StrCat(m(), "x", k(), "x", n()); }
+
+   private:
+    const int64 m_;
+    const int64 k_;
+    const int64 n_;
+  };
+
+  // Represents the configuration of the emitter.  The LLVM IR emitted by the
+  // emitter, modulo the LLVM values holding the input and output buffers, must
+  // be a function of the instance of `Config` passed to it.
+  //
+  // `dims` holds the matrix multiplication dimensions.
+  //
+  // `max_vectorization_width` is the maximum vector width (i.e. the width of
+  // the largest vector register we will use).  This can be larger than the
+  // largest vector register supported by the machine -- LLVM will legalize
+  // these large vector widths into legally sized vectors.
+  //
+  // `max_vector_count` is the maximum number of vectors of size
+  // `max_vectorization_width` that we will attempt to process at once.
+  //
+  // `min_vectorization_width` is the smallest vector width the emitter will use
+  // -- below that it will devolve to using a scalar loop.
+  //
+  // The innermost reduction loop executes the matrix multiply in tiles of size
+  // [`tile_size_m`, `tile_size_k`] from the LHS and [`tile_size_k`,
+  // <vectorization width>] in the RHS.
+  class Config {
+   public:
+    explicit Config(PrimitiveType scalar_type, Dimensions dims,
+                    int64 max_vectorization_width, int64 max_vector_count,
+                    int64 min_vectorization_width, int64 tile_size_m,
+                    int64 tile_size_k)
+        : scalar_type_(scalar_type),
+          dims_(dims),
+          max_vectorization_width_(max_vectorization_width),
+          max_vector_count_(max_vector_count),
+          min_vectorization_width_(min_vectorization_width),
+          tile_size_m_(tile_size_m),
+          tile_size_k_(tile_size_k) {}
+
+    string GetCacheKey() const {
+      return absl::StrCat("gemm_", PrimitiveType_Name(scalar_type()), "_",
+                          dims().ToString(), "_", max_vectorization_width(),
+                          "_", min_vectorization_width(), "_", tile_size_m(),
+                          "_", tile_size_k());
+    }
+
+    PrimitiveType scalar_type() const { return scalar_type_; }
+    Dimensions dims() const { return dims_; }
+    int64 max_vectorization_width() const { return max_vectorization_width_; }
+    int64 max_vector_count() const { return max_vector_count_; }
+    int64 min_vectorization_width() const { return min_vectorization_width_; }
+
+    int64 tile_size_m() const { return tile_size_m_; }
+    int64 tile_size_k() const { return tile_size_k_; }
+
+   private:
+    PrimitiveType scalar_type_;
+    Dimensions dims_;
+    int64 max_vectorization_width_;
+    int64 max_vector_count_;
+    int64 min_vectorization_width_;
+    int64 tile_size_m_;
+    int64 tile_size_k_;
+  };
+
+  // Creates an instance of TiledSmallGemmEmitter that matrix-multiplies
+  // `lhs` with `rhs` and stores the result in `result`.
+  explicit TiledSmallGemmEmitter(Config config, llvm::Value* lhs,
+                                 llvm::Value* rhs, llvm::Value* result,
+                                 llvm::IRBuilder<>* b)
+      : lhs_(lhs),
+        rhs_(rhs),
+        result_(result),
+        config_(config),
+        b_(b),
+        ksl_(b_) {
+    CHECK(max_vectorization_width() > 0 &&
+          IsPowerOfTwo(static_cast<uint64>(max_vectorization_width())));
+    CHECK_GT(max_vector_count(), 0);
+    CHECK(min_vectorization_width() > 0 &&
+          IsPowerOfTwo(static_cast<uint64>(min_vectorization_width())));
+    CHECK_GE(max_vectorization_width(), min_vectorization_width());
+    CHECK_GT(tile_size_k(), 0);
+  }
+
+  void Emit();
+
+ private:
+  // The HandleResiduesOnX helpers split the iteration space for dimension X
+  // into a multiple of the tile size on dimension X and an epilogue.  These
+  // helpers ultimately call into `EmitTiledGemm` for emitting the
+  // tiled GEMM kernel.
+
+  void HandleResiduesOnN();
+  void HandleResiduesOnK(VectorSupportLibrary* vsl, llvm::Value* n_start,
+                         llvm::Value* n_end);
+  void HandleResiduesOnM(VectorSupportLibrary* vsl, int64 tile_size_k,
+                         llvm::Value* k_start, llvm::Value* k_end,
+                         llvm::Value* n_start, llvm::Value* n_end);
+
+  // This emits a tiled GEMM kernel.  For a detailed description see the comment
+  // on the implementation.
+  void EmitTiledGemm(VectorSupportLibrary* vsl, int64 tile_size_k,
+                     llvm::Value* k_start, llvm::Value* k_end,
+                     llvm::Value* n_start, llvm::Value* n_end,
+                     int64 tile_size_m, llvm::Value* m_start,
+                     llvm::Value* m_end);
+
+  llvm::Value* GetInt64(int64 value) { return b_->getInt64(value); }
+
+  Config config() const { return config_; }
+  Dimensions dims() const { return config().dims(); }
+
+  int64 max_vectorization_width() const {
+    return config().max_vectorization_width();
+  }
+  int64 max_vector_count() const { return config().max_vector_count(); }
+  int64 min_vectorization_width() const {
+    return config().min_vectorization_width();
+  }
+  int64 tile_size_m() const { return config().tile_size_m(); }
+  int64 tile_size_k() const { return config().tile_size_k(); }
+  PrimitiveType scalar_type() const { return config().scalar_type(); }
+
+  llvm::Value* lhs_;
+  llvm::Value* rhs_;
+  llvm::Value* result_;
+  Config config_;
+
+  llvm::IRBuilder<>* b_;
+  KernelSupportLibrary ksl_;
+};
+
+void TiledSmallGemmEmitter::Emit() { HandleResiduesOnN(); }
+
+void TiledSmallGemmEmitter::HandleResiduesOnN() {
+  // We can only iterate the `n` dimension for an extent that is divisible by
+  // the vectorization width.  So we emit an outer loop that first processes the
+  // largest extent in `n` that is divisible by max_vectorization_width, then
+  // the largest remaining extent that is divisible by max_vectorization_width /
+  // 2 etc.
+
+  int64 current_vectorization_width =
+      max_vector_count() * max_vectorization_width();
+  int64 current_vector_count = max_vector_count();
+
+  int64 n_start = 0;
+  while (n_start != dims().n() &&
+         current_vectorization_width >= min_vectorization_width()) {
+    int64 n_end = dims().n() - (dims().n() % current_vectorization_width);
+    if (n_start != n_end) {
+      VectorSupportLibrary vsl(scalar_type(), current_vectorization_width, b_,
+                               "gemm");
+      HandleResiduesOnK(&vsl, GetInt64(n_start), GetInt64(n_end));
+      n_start = n_end;
+    }
+    if (current_vector_count == 1) {
+      current_vectorization_width /= 2;
+    } else {
+      current_vector_count--;
+      current_vectorization_width =
+          current_vector_count * max_vectorization_width();
+    }
+  }
+
+  if (n_start != dims().n()) {
+    VectorSupportLibrary vsl(scalar_type(), 1, b_, "gemm");
+    ksl_.For("epi.n", n_start, dims().n(), 1, [&](llvm::Value* n_i) {
+      llvm::Value* n_i_next = b_->CreateAdd(n_i, b_->getInt64(1));
+      HandleResiduesOnK(&vsl, n_i, n_i_next);
+    });
+  }
+}
+
+void TiledSmallGemmEmitter::HandleResiduesOnK(VectorSupportLibrary* vsl,
+                                              llvm::Value* n_start,
+                                              llvm::Value* n_end) {
+  int64 k_start = 0;
+  int64 k_end = dims().k() - (dims().k() % tile_size_k());
+  if (k_end != k_start) {
+    HandleResiduesOnM(vsl, tile_size_k(), GetInt64(k_start), GetInt64(k_end),
+                      n_start, n_end);
+    k_start = k_end;
+  }
+
+  if (k_start != dims().k()) {
+    HandleResiduesOnM(vsl, dims().k() - k_start, GetInt64(k_start),
+                      GetInt64(dims().k()), n_start, n_end);
+  }
+}
+
+void TiledSmallGemmEmitter::HandleResiduesOnM(
+    VectorSupportLibrary* vsl, int64 tile_size_k, llvm::Value* k_start,
+    llvm::Value* k_end, llvm::Value* n_start, llvm::Value* n_end) {
+  const int64 m_end = dims().m() - dims().m() % tile_size_m();
+  EmitTiledGemm(vsl, tile_size_k, k_start, k_end, n_start, n_end, tile_size_m(),
+                GetInt64(0), GetInt64(m_end));
+
+  if (m_end != dims().m()) {
+    EmitTiledGemm(vsl, tile_size_k, k_start, k_end, n_start, n_end,
+                  dims().m() - m_end, GetInt64(m_end), GetInt64(dims().m()));
+  }
+}
+
+// The loop structure is:
+//
+// Iterate over dimension M as m:
+//   Iterate over dimension N as n:
+//     Iterate over dimension K as k:
+//       OutputTile[m,n] += Dot(LhsTile[m,k], RhsTile[k,n])
+//
+// I.e. a just a tiled version of a "naive" GEMM.
+//
+// The tiling scheme is as follows:
+//
+// Let the LHS be:
+//
+//   +----+----+----+
+//   | a0 | b0 | c0 | .
+//   +----+----+----+ .
+//   | a1 | b1 | c1 | .
+//   +----+----+----+
+//     ..     ..
+//
+// and the RHS be:
+//
+//   +----+----+----+----+
+//   | p0 | p1 | p2 | p3 | .
+//   +----+----+----+----+ .
+//   | q0 | q1 | q2 | q3 | .
+//   +----+----+----+----+
+//   | r0 | r1 | r2 | r3 | .
+//   +----+----+----+----+ .
+//     ......    ......
+//
+// and let tile_size_m=2, tile_size_k=3 and the vector width (implicitly denoted
+// by `vsl`) be 4.  Then we want to matrix multiply this tile to get a [2,4]
+// matrix that we can increment the result matrix by.
+//
+// First broadcast the rows row in LHS to 3 vectors of width 4, giving us a rank
+// 3 array, L, of dimension [2,3,4]:
+//
+//       L[0,_,_]           *      L[1,_,_]
+//                          *
+//   +----+----+----+----+  *  +----+----+----+----+
+//   | a0 | a0 | a0 | a0 |  *  | a1 | a1 | a1 | a1 |
+//   +----+----+----+----+  *  +----+----+----+----+
+//   | b0 | b0 | b0 | b0 |  *  | b1 | b1 | b1 | b1 |
+//   +----+----+----+----+  *  +----+----+----+----+
+//   | c0 | c0 | c0 | c0 |  *  | c1 | c1 | c1 | c1 |
+//   +----+----+----+----+  *  +----+----+----+----+
+//
+//
+// Then we FMA L[0,_,_] with the RHS to get the first row of the result and
+// L[1,_,_] with the RHS to get the second row of the result.  For example,
+// L[0,_,_] is computed as:
+//
+//   +----+----+----+----+   +----+----+----+----+
+//   | a0 | a0 | a0 | a0 | * | p0 | p1 | p2 | p3 |   +
+//   +----+----+----+----+   +----+----+----+----+
+//
+//   +----+----+----+----+   +----+----+----+----+
+//   | b0 | b0 | b0 | b0 | * | q0 | q1 | q2 | q3 |   +
+//   +----+----+----+----+   +----+----+----+----+
+//
+//   +----+----+----+----+   +----+----+----+----+
+//   | c0 | c0 | c0 | c0 | * | r0 | r1 | r2 | r3 |
+//   +----+----+----+----+   +----+----+----+----+
+//
+// to get:
+//
+//   +-------------------+-------------------+-------------------+---------
+//   | a0*p0+b0*q0+c0*r0 | a0*p1+b0*q1+c0*r1 | a0*p2+b0*q2+c0*r2 |  ...
+//   +-------------------+-------------------+-------------------+---------
+void TiledSmallGemmEmitter::EmitTiledGemm(
+    VectorSupportLibrary* vsl, int64 tile_size_k, llvm::Value* k_start,
+    llvm::Value* k_end, llvm::Value* n_start, llvm::Value* n_end,
+    int64 tile_size_m, llvm::Value* m_start, llvm::Value* m_end) {
+  ksl_.For("dot.m", m_start, m_end, tile_size_m, [&](llvm::Value* m_i) {
+    MemoryTile result_memory_tile(vsl, b_, /*matrix=*/result_,
+                                  /*matrix_size_along_minor_dim=*/dims().n(),
+                                  /*major_dim_offset=*/m_i,
+                                  /*tile_size_along_major_dim=*/tile_size_m);
+    MemoryTile lhs_memory_tile(vsl, b_, /*matrix=*/lhs_,
+                               /*matrix_size_along_minor_dim=*/dims().k(),
+                               /*major_dim_offset=*/m_i,
+                               /*tile_size_along_major_dim=*/tile_size_m);
+    ksl_.For(
+        "dot.n", n_start, n_end, vsl->vector_size(), [&](llvm::Value* n_i) {
+          TileVariable result_tile_var(vsl, result_memory_tile.LoadTile(n_i));
+          ksl_.For("dot.k", k_start, k_end, tile_size_k, [&](llvm::Value* k_i) {
+            MemoryTile rhs_memory_tile(vsl, b_, rhs_, dims().n(), k_i,
+                                       tile_size_k);
+            std::vector<std::vector<llvm::Value*>> lhs_tile =
+                lhs_memory_tile.LoadBroadcastTile(k_i, tile_size_k);
+            std::vector<llvm::Value*> rhs_tile = rhs_memory_tile.LoadTile(n_i);
+            std::vector<llvm::Value*> result_tile = result_tile_var.Get();
+            for (int64 r_m_i = 0; r_m_i < tile_size_m; r_m_i++) {
+              for (int64 r_k_i = 0; r_k_i < tile_size_k; r_k_i++) {
+                result_tile[r_m_i] =
+                    vsl->MulAdd(lhs_tile[r_m_i][r_k_i], rhs_tile[r_k_i],
+                                result_tile[r_m_i]);
+              }
+            }
+            result_tile_var.Set(result_tile);
+          });
+
+          result_memory_tile.StoreTile(result_tile_var.Get(), n_i);
+        });
+  });
+}
+
+llvm::Type* GetPointerToElementType(llvm::Type* pointer_type) {
+  llvm::Type* type =
+      llvm::cast<llvm::PointerType>(pointer_type)->getElementType();
+  while (auto* array_type = llvm::dyn_cast<llvm::ArrayType>(type)) {
+    type = array_type->getElementType();
+  }
+
+  return type->getPointerTo();
+}
+
+struct GemvBuffersWithCanonicalType {
+  llvm::Value* lhs_canonicalized;
+  llvm::Value* rhs_canonicalized;
+  llvm::Value* addend_canonicalized;
+  llvm::Value* result_canonicalized;
+};
+
+GemvBuffersWithCanonicalType GetGemvBuffersWithCanonicalType(
+    llvm::Value* lhs, llvm::Value* rhs, llvm::Value* addend,
+    llvm::Value* result, llvm::IRBuilder<>* b) {
+  // We characterize a GEMV operation via M and K, since N is implicitly 1.
+  // This means the GEMV that multiplies (say) [5,6] with [6,1] is implemented
+  // by the same GEMV that multiplies [5,6] with [1,6].  However, the
+  // `llvm::Types` for the inputs to the two GEMVs don't match (in a trivial
+  // sense -- the in memory representations are the same) since they're computed
+  // from the `xla::Shape`s.  Since we want to be able to call the same
+  // `llvm::Function` for the two GEMVs we canonicalize the types of the GEMV
+  // inputs here into the same type.
+  GemvBuffersWithCanonicalType buffers_with_canonical_type;
+  llvm::Type* lhs_type = lhs->getType();
+  llvm::Type* rhs_type = rhs->getType();
+  llvm::Type* addend_type = addend ? addend->getType() : nullptr;
+  llvm::Type* result_type = result->getType();
+
+  buffers_with_canonical_type.lhs_canonicalized =
+      b->CreateBitCast(lhs, GetPointerToElementType(lhs_type));
+  buffers_with_canonical_type.rhs_canonicalized =
+      b->CreateBitCast(rhs, GetPointerToElementType(rhs_type));
+  buffers_with_canonical_type.addend_canonicalized =
+      addend ? b->CreateBitCast(addend, GetPointerToElementType(addend_type))
+             : nullptr;
+  buffers_with_canonical_type.result_canonicalized =
+      b->CreateBitCast(result, GetPointerToElementType(result_type));
+
+  return buffers_with_canonical_type;
+}
+
+}  // namespace
+
+void EmitRowMajorGemv(PrimitiveType scalar_type, int64 tile_rows,
+                      int64 tile_cols, int64 m, int64 k, llvm::Value* lhs,
+                      llvm::Value* rhs, llvm::Value* addend,
+                      llvm::Value* result, llvm::IRBuilder<>* b,
+                      bool enable_fast_math, bool optimize_for_size) {
+  RowMajorMatrixVectorProductEmitter::Config config(
+      /*scalar_type=*/scalar_type,
+      /*tile_rows=*/tile_rows, /*tile_cols=*/tile_cols,
+      /*m=*/m, /*k=*/k, /*has_addend=*/addend != nullptr);
+
+  GemvBuffersWithCanonicalType canonical_inputs =
+      GetGemvBuffersWithCanonicalType(lhs, rhs, addend, result, b);
+
+  KernelSupportLibrary::EmitAndCallOutlinedKernel(
+      /*enable_fast_math=*/enable_fast_math,
+      /*optimize_for_size=*/optimize_for_size, b, config.GetCacheKey(),
+      canonical_inputs.lhs_canonicalized, canonical_inputs.rhs_canonicalized,
+      canonical_inputs.addend_canonicalized,
+      canonical_inputs.result_canonicalized,
+      [&config, b, &canonical_inputs](llvm::Value* lhs, llvm::Value* rhs,
+                                      llvm::Value* addend,
+                                      llvm::Value* result) {
+        RowMajorMatrixVectorProductEmitter emitter(config, lhs, rhs, addend,
+                                                   result, b);
+        emitter.Emit();
+      });
+}
+
+void EmitColumnMajorGemv(PrimitiveType scalar_type, int64 tile_rows,
+                         int64 tile_cols, int64 m, int64 k, llvm::Value* lhs,
+                         llvm::Value* rhs, llvm::Value* addend,
+                         llvm::Value* result, llvm::IRBuilder<>* b,
+                         bool enable_fast_math, bool optimize_for_size) {
+  ColumnMajorMatrixVectorProductEmitter::Config config(
+      /*scalar_type=*/scalar_type,
+      /*tile_rows=*/tile_rows, /*tile_cols=*/tile_cols,
+      /*m=*/m, /*k=*/k, /*has_addend=*/addend != nullptr);
+
+  GemvBuffersWithCanonicalType canonical_inputs =
+      GetGemvBuffersWithCanonicalType(lhs, rhs, addend, result, b);
+
+  KernelSupportLibrary::EmitAndCallOutlinedKernel(
+      /*enable_fast_math=*/enable_fast_math,
+      /*optimize_for_size=*/optimize_for_size, b, config.GetCacheKey(),
+      canonical_inputs.lhs_canonicalized, canonical_inputs.rhs_canonicalized,
+      canonical_inputs.addend_canonicalized,
+      canonical_inputs.result_canonicalized,
+      [&config, b, &canonical_inputs](llvm::Value* lhs, llvm::Value* rhs,
+                                      llvm::Value* addend,
+                                      llvm::Value* result) {
+        ColumnMajorMatrixVectorProductEmitter emitter(config, lhs, rhs, addend,
+                                                      result, b);
+        emitter.Emit();
+      });
+}
+
+void EmitSmallGemm(PrimitiveType scalar_type, int64 m, int64 k, int64 n,
+                   int64 max_vectorization_width, int64 max_vector_count,
+                   int64 min_vectorization_width, int64 tile_size_m,
+                   int64 tile_size_k, llvm::Value* lhs, llvm::Value* rhs,
+                   llvm::Value* result, llvm::IRBuilder<>* b,
+                   bool enable_fast_math, bool optimize_for_size) {
+  TiledSmallGemmEmitter::Config config(
+      /*scalar_type=*/scalar_type,
+      TiledSmallGemmEmitter::Dimensions{/*m=*/m, /*k=*/k, /*n=*/n},
+      /*max_vectorization_width=*/max_vectorization_width,
+      /*max_vector_count=*/max_vector_count,
+      /*min_vectorization_width=*/min_vectorization_width,
+      /*tile_size_m=*/tile_size_m, /*tile_size_k=*/tile_size_k);
+
+  KernelSupportLibrary::EmitAndCallOutlinedKernel(
+      /*enable_fast_math=*/enable_fast_math,
+      /*optimize_for_size=*/optimize_for_size, b, config.GetCacheKey(), lhs,
+      rhs, result,
+      [&](llvm::Value* lhs, llvm::Value* rhs, llvm::Value* result) {
+        TiledSmallGemmEmitter small_gemm_emitter(config, /*lhs=*/lhs,
+                                                 /*rhs=*/rhs,
+                                                 /*result=*/result, b);
+        small_gemm_emitter.Emit();
+      });
+}
+
+}  // namespace cpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/tiled_dot_emitter.h b/tensorflow/compiler/xla/service/cpu/tiled_dot_emitter.h
new file mode 100644
index 0000000000000000000000000000000000000000..0a82326cc3704bce8c122261383249c60eda1f3a
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/tiled_dot_emitter.h
@@ -0,0 +1,55 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_TILED_DOT_EMITTER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_TILED_DOT_EMITTER_H_
+
+#include "llvm/IR/IRBuilder.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace cpu {
+
+// These routines emit LLVM IR implementing tiled GEMM and GEMV routines.
+
+void EmitRowMajorGemv(PrimitiveType scalar_type, tensorflow::int64 tile_rows,
+                      tensorflow::int64 tile_cols, tensorflow::int64 m,
+                      tensorflow::int64 k, llvm::Value* lhs, llvm::Value* rhs,
+                      llvm::Value* addend, llvm::Value* result,
+                      llvm::IRBuilder<>* b, bool enable_fast_math,
+                      bool optimize_for_size);
+
+void EmitColumnMajorGemv(PrimitiveType scalar_type, tensorflow::int64 tile_rows,
+                         tensorflow::int64 tile_cols, tensorflow::int64 m,
+                         tensorflow::int64 k, llvm::Value* lhs,
+                         llvm::Value* rhs, llvm::Value* addend,
+                         llvm::Value* result, llvm::IRBuilder<>* b,
+                         bool enable_fast_math, bool optimize_for_size);
+
+void EmitSmallGemm(PrimitiveType scalar_type, tensorflow::int64 m,
+                   tensorflow::int64 k, tensorflow::int64 n,
+                   tensorflow::int64 max_vectorization_width,
+                   tensorflow::int64 max_vector_count,
+                   tensorflow::int64 min_vectorization_width,
+                   tensorflow::int64 tile_size_m, tensorflow::int64 tile_size_k,
+                   llvm::Value* lhs, llvm::Value* rhs, llvm::Value* result,
+                   llvm::IRBuilder<>* b, bool enable_fast_math,
+                   bool optimize_for_size);
+
+}  // namespace cpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_TILED_DOT_EMITTER_H_
diff --git a/tensorflow/compiler/xla/service/cpu/vector_support_library.h b/tensorflow/compiler/xla/service/cpu/vector_support_library.h
index 5690d2be2fe3e21c96b51a5226e0b29148217fd1..c444fd7d4aa88fa21b1aa2b2f058bd689b234b15 100644
--- a/tensorflow/compiler/xla/service/cpu/vector_support_library.h
+++ b/tensorflow/compiler/xla/service/cpu/vector_support_library.h
@@ -114,6 +114,9 @@ class VectorSupportLibrary {
   // raison d'etre) less cluttered.
 
   llvm::Value* FCmpEQMask(llvm::Value* lhs, llvm::Value* rhs);
+  llvm::Value* FCmpEQMask(llvm::Value* lhs, const llvm::APFloat& rhs) {
+    return FCmpEQMask(lhs, GetConstantFloat(lhs->getType(), rhs));
+  }
   llvm::Value* FCmpULEMask(llvm::Value* lhs, llvm::Value* rhs);
   llvm::Value* FCmpOLTMask(llvm::Value* lhs, llvm::Value* rhs);
   llvm::Value* FCmpOLTMask(llvm::Value* lhs, const llvm::APFloat& rhs) {
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
index e84bf00153aa28df29d8df486b92654feab4afbf..2f7fddb96da2dbb4e3f824daa483d5bcd027460f 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
@@ -103,11 +103,19 @@ class DfsHloVisitorBase {
   virtual Status HandlePower(HloInstructionPtr hlo) {
     return HandleElementwiseBinary(hlo);
   }
+  virtual Status HandleSqrt(HloInstructionPtr hlo) {
+    return HandleElementwiseUnary(hlo);
+  }
+  virtual Status HandleRsqrt(HloInstructionPtr hlo) {
+    return HandleElementwiseUnary(hlo);
+  }
   virtual Status HandleConvolution(HloInstructionPtr hlo) = 0;
   virtual Status HandleFft(HloInstructionPtr fft) = 0;
-  virtual Status HandleCrossReplicaSum(HloInstructionPtr hlo) = 0;
+  virtual Status HandleTriangularSolve(HloInstructionPtr hlo) = 0;
+  virtual Status HandleAllReduce(HloInstructionPtr hlo) = 0;
   virtual Status HandleAllToAll(HloInstructionPtr hlo) = 0;
   virtual Status HandleCollectivePermute(HloInstructionPtr hlo) = 0;
+  virtual Status HandleReplicaId(HloInstructionPtr hlo) = 0;
   virtual Status HandleGetDimensionSize(HloInstructionPtr hlo) = 0;
   virtual Status HandleCompare(HloInstructionPtr hlo) {
     return HandleElementwiseBinary(hlo);
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
index 80ea5be298aea44a0f424398da74c4e478f10346..341bb37b8355e9987a0331d0a66bb8fe87f019cf 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
@@ -91,7 +91,10 @@ class DfsHloVisitorWithDefaultBase
   Status HandleFft(HloInstructionPtr fft) override {
     return DefaultAction(fft);
   }
-  Status HandleCrossReplicaSum(HloInstructionPtr crs) override {
+  Status HandleTriangularSolve(HloInstructionPtr hlo) override {
+    return DefaultAction(hlo);
+  }
+  Status HandleAllReduce(HloInstructionPtr crs) override {
     return DefaultAction(crs);
   }
   Status HandleAllToAll(HloInstructionPtr hlo) override {
@@ -100,6 +103,9 @@ class DfsHloVisitorWithDefaultBase
   Status HandleCollectivePermute(HloInstructionPtr hlo) override {
     return DefaultAction(hlo);
   }
+  Status HandleReplicaId(HloInstructionPtr hlo) override {
+    return DefaultAction(hlo);
+  }
   Status HandleRng(HloInstructionPtr random) override {
     return DefaultAction(random);
   }
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default_test.cc b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default_test.cc
index 825e1436f0ec6d49b555e5e3e9c2c7a19fb7b062..70173d43d79e931b75f131ad380ad98359cc78b8 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default_test.cc
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default_test.cc
@@ -73,15 +73,14 @@ ENTRY TestComputation {
   abs = f32[] abs(arg)
   add = f32[] add(arg, gte)
   broadcast = f32[42] broadcast(add), dimensions={}
-  slice = f32[0] slice(broadcast), slice={[1:2]}
+  slice = f32[1] slice(broadcast), slice={[1:2]}
   copy = f32[] copy(arg)
   eq = pred[] equal-to(arg, gte)
   neg = f32[] negate(arg)
   ROOT convert = f64[] convert(f32[] arg)
 })";
   std::unique_ptr<HloModule> module =
-      HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest())
-          .ConsumeValueOrDie();
+      ParseAndReturnVerifiedModule(hlo_string).ConsumeValueOrDie();
   ElementwiseTestVisitor visitor;
   TF_EXPECT_OK(module->entry_computation()->Accept(&visitor));
 }
diff --git a/tensorflow/compiler/xla/service/dot_decomposer.cc b/tensorflow/compiler/xla/service/dot_decomposer.cc
index b2ba2617902104bfea06713332fa1c2aedea536d..559b9c1f2c9f341293ca89adc61e3312fd9f313c 100644
--- a/tensorflow/compiler/xla/service/dot_decomposer.cc
+++ b/tensorflow/compiler/xla/service/dot_decomposer.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/dot_decomposer.h"
 
+#include "absl/algorithm/container.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -156,29 +158,192 @@ Status DecomposeBatchDot(HloInstruction* dot) {
   return computation->ReplaceInstruction(dot, new_dot);
 }
 
+// Convert a dot into a canonical form where non-contracting and contracting
+// dimensions are reshaped together and batch dimensions are the most major
+// dimensions. The requires transposing and reshapes the lhs and rhs and
+// reshaping the output batch to the original shape.
+Status CanonicalizeDot(HloInstruction* original_dot) {
+  auto computation = original_dot->parent();
+  const auto& original_dnums = original_dot->dot_dimension_numbers();
+  const int64 num_batch_dims = original_dnums.lhs_batch_dimensions_size();
+  const int64 num_contracting_dims =
+      original_dnums.lhs_contracting_dimensions_size();
+
+  const auto& lhs_shape = original_dot->operand(0)->shape();
+  const int64 lhs_rank = lhs_shape.rank();
+  const int64 num_lhs_non_contracting_dims =
+      lhs_rank - num_batch_dims - num_contracting_dims;
+
+  std::vector<int64> lhs_non_contracting_dims;
+  lhs_non_contracting_dims.reserve(num_lhs_non_contracting_dims);
+  int64 lhs_contracting_size = 1;
+  int64 lhs_non_contracting_size = 1;
+  std::vector<int64> batch_dim_sizes;
+  batch_dim_sizes.reserve(num_batch_dims);
+  for (int64 i = 0; i < lhs_rank; ++i) {
+    if (absl::c_linear_search(original_dnums.lhs_contracting_dimensions(), i)) {
+      lhs_contracting_size *= lhs_shape.dimensions(i);
+    } else if (absl::c_linear_search(original_dnums.lhs_batch_dimensions(),
+                                     i)) {
+      batch_dim_sizes.push_back(lhs_shape.dimensions(i));
+    } else {
+      lhs_non_contracting_dims.push_back(i);
+      lhs_non_contracting_size *= lhs_shape.dimensions(i);
+    }
+  }
+  // The canonical form of the lhs is
+  // [BatchDims, NonContractingDims, ContractingsDims]
+  std::vector<int64> lhs_transpose;
+  lhs_transpose.reserve(lhs_rank);
+  lhs_transpose.insert(lhs_transpose.end(),
+                       original_dnums.lhs_batch_dimensions().begin(),
+                       original_dnums.lhs_batch_dimensions().end());
+  lhs_transpose.insert(lhs_transpose.end(), lhs_non_contracting_dims.begin(),
+                       lhs_non_contracting_dims.end());
+  lhs_transpose.insert(lhs_transpose.end(),
+                       original_dnums.lhs_contracting_dimensions().begin(),
+                       original_dnums.lhs_contracting_dimensions().end());
+  HloInstruction* transposed_lhs =
+      computation->AddInstruction(HloInstruction::CreateTranspose(
+          ShapeUtil::PermuteDimensions(InversePermutation(lhs_transpose),
+                                       lhs_shape),
+          original_dot->mutable_operand(0), lhs_transpose));
+  std::vector<int64> lhs_reshape_dims = batch_dim_sizes;
+  lhs_reshape_dims.push_back(lhs_non_contracting_size);
+  lhs_reshape_dims.push_back(lhs_contracting_size);
+  // Reshape the contracting and non-contracting dimensions together.
+  HloInstruction* reshaped_lhs =
+      computation->AddInstruction(HloInstruction::CreateReshape(
+          ShapeUtil::MakeShape(lhs_shape.element_type(), lhs_reshape_dims),
+          transposed_lhs));
+
+  const auto& rhs_shape = original_dot->operand(1)->shape();
+  const int64 rhs_rank = rhs_shape.rank();
+  const int64 num_rhs_non_contracting_dims =
+      rhs_rank - num_batch_dims - num_contracting_dims;
+  std::vector<int64> rhs_non_contracting_dims;
+  rhs_non_contracting_dims.reserve(num_rhs_non_contracting_dims);
+  int64 rhs_non_contracting_size = 1;
+  int64 rhs_contracting_size = 1;
+  for (int64 i = 0; i < rhs_rank; ++i) {
+    if (absl::c_linear_search(original_dnums.rhs_contracting_dimensions(), i)) {
+      rhs_contracting_size *= rhs_shape.dimensions(i);
+    } else if (!absl::c_linear_search(original_dnums.rhs_batch_dimensions(),
+                                      i)) {
+      rhs_non_contracting_dims.push_back(i);
+      rhs_non_contracting_size *= rhs_shape.dimensions(i);
+    }
+  }
+
+  // The canonical form of the rhs is
+  // [BatchDims, ContractingsDims, NonContractingDims]
+  std::vector<int64> rhs_transpose;
+  rhs_transpose.reserve(rhs_rank);
+  rhs_transpose.insert(rhs_transpose.end(),
+                       original_dnums.rhs_batch_dimensions().begin(),
+                       original_dnums.rhs_batch_dimensions().end());
+  rhs_transpose.insert(rhs_transpose.end(),
+                       original_dnums.rhs_contracting_dimensions().begin(),
+                       original_dnums.rhs_contracting_dimensions().end());
+  rhs_transpose.insert(rhs_transpose.end(), rhs_non_contracting_dims.begin(),
+                       rhs_non_contracting_dims.end());
+  HloInstruction* transposed_rhs =
+      computation->AddInstruction(HloInstruction::CreateTranspose(
+          ShapeUtil::PermuteDimensions(InversePermutation(rhs_transpose),
+                                       rhs_shape),
+          original_dot->mutable_operand(1), rhs_transpose));
+
+  std::vector<int64> rhs_reshape_dims = batch_dim_sizes;
+  rhs_reshape_dims.push_back(rhs_contracting_size);
+  rhs_reshape_dims.push_back(rhs_non_contracting_size);
+  // Reshape the contracting and non-contracting dimensions together.
+  HloInstruction* reshaped_rhs =
+      computation->AddInstruction(HloInstruction::CreateReshape(
+          ShapeUtil::MakeShape(rhs_shape.element_type(), rhs_reshape_dims),
+          transposed_rhs));
+
+  std::vector<int64> dot_dims = batch_dim_sizes;
+  dot_dims.push_back(lhs_non_contracting_size);
+  dot_dims.push_back(rhs_non_contracting_size);
+
+  DotDimensionNumbers dot_dnums;
+  for (int64 i = 0; i < num_batch_dims; ++i) {
+    dot_dnums.add_lhs_batch_dimensions(i);
+    dot_dnums.add_rhs_batch_dimensions(i);
+  }
+  dot_dnums.add_lhs_contracting_dimensions(num_batch_dims + 1);
+  dot_dnums.add_rhs_contracting_dimensions(num_batch_dims);
+
+  HloInstruction* dot = computation->AddInstruction(HloInstruction::CreateDot(
+      ShapeUtil::MakeShape(original_dot->shape().element_type(), dot_dims),
+      reshaped_lhs, reshaped_rhs, dot_dnums, original_dot->precision_config()));
+
+  return computation->ReplaceInstruction(
+      original_dot, computation->AddInstruction(HloInstruction::CreateReshape(
+                        original_dot->shape(), dot)));
+}
+
 }  // namespace
 
 StatusOr<bool> DotDecomposer::Run(HloModule* module) {
   XLA_VLOG_LINES(2, "DotDecomposer ENTRY\n" + module->ToString());
-  // Gather all batch Dot operations.
-  std::vector<HloInstruction*> batch_dots;
+  // Gather all Non-canonical Dot operations.
+  std::vector<HloInstruction*> non_canonical_dots;
   for (auto* computation : module->MakeNonfusionComputations()) {
     for (auto* instruction : computation->instructions()) {
       if (instruction->opcode() != HloOpcode::kDot) {
         continue;
       }
       const DotDimensionNumbers& dnums = instruction->dot_dimension_numbers();
-      if (dnums.lhs_batch_dimensions_size() > 0 && decompose_batch_dot_) {
-        batch_dots.push_back(instruction);
+      // A dot it not canonical if there are more than one contracting
+      // dimension.
+      if (dnums.lhs_contracting_dimensions_size() != 1) {
+        non_canonical_dots.push_back(instruction);
+        continue;
+      }
+      if (dnums.lhs_batch_dimensions().empty() &&
+          dnums.lhs_contracting_dimensions().empty()) {
+        non_canonical_dots.push_back(instruction);
+        continue;
+      }
+      if (dnums.lhs_batch_dimensions().empty()) {
+        continue;
+      }
+      std::vector<int64> canonical_batch_dims(
+          dnums.lhs_batch_dimensions_size());
+      absl::c_iota(canonical_batch_dims, 0);
+      if (!absl::c_equal(dnums.lhs_batch_dimensions(), canonical_batch_dims) ||
+          !absl::c_equal(dnums.rhs_batch_dimensions(), canonical_batch_dims)) {
+        non_canonical_dots.push_back(instruction);
       }
     }
   }
-  // Decompose each batch Dot in 'batch_dots'.
   bool changed = false;
-  for (auto* dot : batch_dots) {
-    TF_RETURN_IF_ERROR(DecomposeBatchDot(dot));
+  for (auto* dot : non_canonical_dots) {
+    TF_RETURN_IF_ERROR(CanonicalizeDot(dot));
     changed = true;
   }
+
+  if (decompose_batch_dot_) {
+    std::vector<HloInstruction*> batch_dots;
+    for (auto* computation : module->MakeNonfusionComputations()) {
+      for (auto* instruction : computation->instructions()) {
+        if (instruction->opcode() != HloOpcode::kDot) {
+          continue;
+        }
+        const DotDimensionNumbers& dnums = instruction->dot_dimension_numbers();
+        if (!dnums.lhs_batch_dimensions().empty()) {
+          batch_dots.push_back(instruction);
+        }
+      }
+    }
+    // Decompose each batch Dot in 'batch_dots'.
+
+    for (auto* dot : batch_dots) {
+      TF_RETURN_IF_ERROR(DecomposeBatchDot(dot));
+      changed = true;
+    }
+  }
   XLA_VLOG_LINES(2, "DotDecompose EXIT\n" + module->ToString());
   return changed;
 }
diff --git a/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc b/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc
index 6d0472689bf48092ceef2e9792c1358687d707ec..de3b508064bfadd88396f050142e682de2294434 100644
--- a/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc
+++ b/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/while_util.h"
 
 namespace xla {
 
@@ -53,6 +54,8 @@ class DynamicDimensionInferenceVisitor : public DfsHloVisitorWithDefault {
 
   Status HandleDot(HloInstruction* hlo) override;
 
+  Status HandleTuple(HloInstruction* hlo) override;
+
   Status HandleTranspose(HloInstruction* hlo) override;
 
   Status HandleReshape(HloInstruction* hlo) override;
@@ -77,6 +80,8 @@ class DynamicDimensionInferenceVisitor : public DfsHloVisitorWithDefault {
 
   Status HandleElementwiseBinary(HloInstruction* hlo) override;
 
+  Status HandleWhile(HloInstruction* hlo) override;
+
  private:
   using OperandDynamicDimensionFn = std::function<Status(
       HloInstruction* operand, ShapeIndex index, int64 dimension,
@@ -122,6 +127,16 @@ Status DynamicDimensionInferenceVisitor::HandleGetTupleElement(
       });
 }
 
+Status DynamicDimensionInferenceVisitor::HandleTuple(HloInstruction* hlo) {
+  return ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction*, ShapeIndex index, int64 dimension,
+               int64 operand_index, HloInstruction* dynamic_size) {
+        index.push_front(operand_index);
+        parent_->SetDynamicSize(hlo, index, dimension, dynamic_size);
+        return Status::OK();
+      });
+}
+
 Status DynamicDimensionInferenceVisitor::HandleBroadcast(HloInstruction* hlo) {
   return ForEachOperandDynamicDimension(
       hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension,
@@ -173,7 +188,7 @@ Status DynamicDimensionInferenceVisitor::HandleReduce(HloInstruction* hlo) {
 
         // Find out the new dynamic dimension after reduce.
         int64 dimensions_not_reduced_count = 0;
-        for (int i = 0; i < ShapeUtil::Rank(operand->shape()); ++i) {
+        for (int i = 0; i < operand->shape().rank(); ++i) {
           if (dimension == i) {
             parent_->SetDynamicSize(reduce, {}, dimensions_not_reduced_count,
                                     dynamic_size);
@@ -207,7 +222,7 @@ Status DynamicDimensionInferenceVisitor::HandleDot(HloInstruction* hlo) {
           result_dim_mapping[i] = current_result_dims++;
         }
 
-        for (int64 i = 0; i < ShapeUtil::Rank(dot->operand(0)->shape()); i++) {
+        for (int64 i = 0; i < dot->operand(0)->shape().rank(); i++) {
           if (!absl::c_linear_search(
                   dimension_numbers.lhs_contracting_dimensions(), i)) {
             if (operand_index == 0) {
@@ -217,7 +232,7 @@ Status DynamicDimensionInferenceVisitor::HandleDot(HloInstruction* hlo) {
           }
         }
 
-        for (int64 i = 0; i < ShapeUtil::Rank(dot->operand(1)->shape()); i++) {
+        for (int64 i = 0; i < dot->operand(1)->shape().rank(); i++) {
           if (!absl::c_linear_search(
                   dimension_numbers.rhs_contracting_dimensions(), i) &&
               !absl::c_linear_search(dimension_numbers.rhs_batch_dimensions(),
@@ -383,6 +398,120 @@ Status DynamicDimensionInferenceVisitor::HandleSelectAndScatter(
       });
 }
 
+Status DynamicDimensionInferenceVisitor::HandleWhile(HloInstruction* hlo) {
+  // While loop is handled by passing dynamic size hlos as parameters into the
+  // hlo while loop. This is done by replacing the original while with a new
+  // one.
+  //
+  // Before:
+  //
+  // op1 = ...
+  // op2 = ...
+  // op1_x = ... // dynamic dimension size of op1
+  // while = while(op1, op2)
+  //
+  //
+  // After:
+  //
+  // op1 = ...
+  // op2 = ...
+  // op1_x = ... // dynamic dimension size of op1
+  // while = while(op1, op2, op1_x)
+  //
+  // In the above graph, op_x is the bound of the dynamic dimension size of op1
+  // and is wired into the while loop as new parameter.
+  //
+  // TODO(b/119843103): Once we implement dynamic bounds in XLA backend, dynamic
+  // bound can be propagated through native xla values instead of relying on
+  // additional parameter.
+
+  // dynamic_size_to_operand_id_index_map keeps track of dynamic size operations
+  // to their operand ids in the new while loop.
+  absl::flat_hash_map<HloInstruction*, int64>
+      dynamic_size_to_operand_id_index_map;
+
+  // operands_to_add collects dynamic sizes that need to be added to the while
+  // loop as parameters. Note that a dynamic size is ignored if it is already
+  // part of the parameter. i.e.:
+  //
+  // We don't do:
+  //
+  // op1 = ...
+  // op2 = ...
+  // op_x = ... // dynamic dimension size of both op1 and op2
+  // while = while(op1, op2, op_x, op_x) // 4 parameters
+  //
+  // But we do:
+  //
+  // op1 = ...
+  // op2 = ...
+  // op_x = ... // dynamic dimension size of both op1 and op2
+  // while = while(op1, op2, op_x)
+  //
+  // An alternative is to do this in a while loop CSE pass.
+  //
+  std::vector<HloInstruction*> operands_to_add;
+  int64 operand_count = hlo->shape().tuple_shapes_size();
+  TF_RETURN_IF_ERROR(ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction*, ShapeIndex, int64, int64,
+               HloInstruction* dynamic_size) {
+        const HloInstruction* tuple_operand = hlo->operand(0);
+        for (int64 i = 0; i < tuple_operand->operand_count(); ++i) {
+          if (dynamic_size == tuple_operand->operand(i)) {
+            dynamic_size_to_operand_id_index_map[dynamic_size] = i;
+            return Status::OK();
+          }
+        }
+        auto iter = dynamic_size_to_operand_id_index_map.find(dynamic_size);
+        if (iter == dynamic_size_to_operand_id_index_map.end()) {
+          operands_to_add.push_back(dynamic_size);
+          dynamic_size_to_operand_id_index_map[dynamic_size] = operand_count++;
+        }
+        return Status::OK();
+      }));
+
+  if (!operands_to_add.empty()) {
+    // Only replace the while loop if there are new parameters to add.
+    HloInstruction* old_tuple_operand = hlo->mutable_operand(0);
+    TF_ASSIGN_OR_RETURN(
+        WhileUtil::MakeInstructionsLiveInResult result,
+        WhileUtil::MakeInstructionsLiveIn(hlo, operands_to_add));
+    // WhileUtil creates a new while hlo and tuple. Update the dynamic size
+    // mapping for the newly created tuple.
+    HloInstruction* new_tuple_operand =
+        result.new_while_instr->mutable_operand(0);
+    parent_->CopyMapping(/*from=*/old_tuple_operand, /*to=*/new_tuple_operand);
+    hlo = result.new_while_instr;
+  }
+
+  // We have replaced the while loop, now set the dynamic dimensions for the
+  // newly created while loop so that the hlos that consumes the while loop can
+  // see the dynamic dimensions. Also sets the dynamic parameter binding for
+  // running inference in the while loop.
+  DynamicParameterBinding binding_for_while;
+  TF_RETURN_IF_ERROR(ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction*, ShapeIndex index, int64 dimension,
+               int64 operand_index, HloInstruction* dynamic_size) {
+        DynamicParameterBinding::DynamicParameter dynamic_parameter{
+            operand_index,
+            {dynamic_size_to_operand_id_index_map[dynamic_size]}};
+        DynamicParameterBinding::DynamicDimension dynamic_dimension{
+            operand_index, index, dimension};
+        TF_RETURN_IF_ERROR(
+            binding_for_while.Bind(dynamic_parameter, dynamic_dimension));
+        parent_->SetDynamicSize(hlo, index, dimension, dynamic_size);
+        return Status::OK();
+      }));
+
+  // Run inference in while body and condition.
+  TF_RETURN_IF_ERROR(DynamicDimensionInferenceVisitor::Run(
+      hlo->while_body(), binding_for_while, parent_));
+  TF_RETURN_IF_ERROR(DynamicDimensionInferenceVisitor::Run(
+      hlo->while_condition(), binding_for_while, parent_));
+
+  return Status::OK();
+}
+
 Status DynamicDimensionInferenceVisitor::HandleParameter(HloInstruction* hlo) {
   return param_bindings_.ForEachBinding(
       [&](const DynamicParameterBinding::DynamicParameter& dynamic_parameter,
@@ -430,15 +559,43 @@ Status DynamicDimensionInferenceVisitor::ForEachOperandDynamicDimension(
   return Status::OK();
 }
 
+void DynamicDimensionInference::CopyMapping(HloInstruction* from,
+                                            HloInstruction* to) {
+  auto iter = per_hlo_dynamic_dimensions_.find(from);
+  if (iter != per_hlo_dynamic_dimensions_.end()) {
+    for (auto& dynamic_dimension : iter->second) {
+      HloInstruction* dynamic_size =
+          GetDynamicSize(dynamic_dimension.inst, dynamic_dimension.index,
+                         dynamic_dimension.dim);
+      SetDynamicSize(to, dynamic_dimension.index, dynamic_dimension.dim,
+                     dynamic_size);
+    }
+  }
+}
+
 /* static */
 StatusOr<DynamicDimensionInference> DynamicDimensionInference::Run(
     HloModule* module) {
-  VLOG(0) << "Param Config " << module->dynamic_parameter_binding().ToString();
+  VLOG(2) << "Param Config " << module->dynamic_parameter_binding().ToString();
   DynamicDimensionInference inference(module);
   TF_RETURN_IF_ERROR(inference.AnalyzeDynamicDimensions());
   return inference;
 }
 
+string DynamicDimensionInference::ToString() const {
+  std::vector<string> pieces;
+  pieces.push_back("DynamicDimensionInference: ");
+  for (const auto& mapping : dynamic_mapping_) {
+    const DynamicDimension& dynamic_dimension = mapping.first;
+    pieces.push_back(absl::StrFormat(
+        " -- instruction %s at %s has dim %lld as dynamic"
+        " dimension, which is represented by instruction %s",
+        dynamic_dimension.inst->ToString(), dynamic_dimension.index.ToString(),
+        dynamic_dimension.dim, mapping.second->ToString()));
+  }
+  return absl::StrJoin(pieces, "\n");
+}
+
 DynamicDimensionInference::DynamicDimensionInference(HloModule* module)
     : module_(module) {}
 
diff --git a/tensorflow/compiler/xla/service/dynamic_dimension_inference.h b/tensorflow/compiler/xla/service/dynamic_dimension_inference.h
index 164d15bf111a92e3da957f609b54ee0662ef18b1..d0f2998328f3028ccbd5b33690a514371a03b5a1 100644
--- a/tensorflow/compiler/xla/service/dynamic_dimension_inference.h
+++ b/tensorflow/compiler/xla/service/dynamic_dimension_inference.h
@@ -88,6 +88,11 @@ class DynamicDimensionInference {
     iter.first->second.emplace(DynamicDimension{inst, index, dim});
   }
 
+  // Copies the internal mapping from instruction `from` to instruction `to`.
+  // This is useful when an instruction is replaced by the other during the
+  // inferencing process.
+  void CopyMapping(HloInstruction* from, HloInstruction* to);
+
   // AnalyzeDynamicDimensions starts the analysis of the dynamic dimensions in
   // module_.
   Status AnalyzeDynamicDimensions();
@@ -101,6 +106,8 @@ class DynamicDimensionInference {
   using DynamicMapping = absl::flat_hash_map<DynamicDimension, HloInstruction*>;
   DynamicMapping dynamic_mapping_;
 
+  // A convenient mapping from an hlo to the set of dynamic dimensions that it
+  // holds.
   using PerHloDynamicDimensions =
       absl::flat_hash_map<HloInstruction*,
                           absl::flat_hash_set<DynamicDimension>>;
diff --git a/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc b/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc
index ea9ebed45d99797ce4f80376ec3d0b758da3ca17..597cdf27c3318b3cf8bd5bb5f9b3239cf23a4c73 100644
--- a/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc
+++ b/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc
@@ -62,6 +62,17 @@ class DynamicDimensionInferenceTest : public HloTestBase {
     return module_->AddEmbeddedComputation(embedded_builder.Build());
   }
 
+  HloComputation* GetGe() {
+    auto embedded_builder = HloComputation::Builder("ge");
+    auto lhs = embedded_builder.AddInstruction(HloInstruction::CreateParameter(
+        0, ShapeUtil::MakeShape(F32, {}), "lhs"));
+    auto rhs = embedded_builder.AddInstruction(HloInstruction::CreateParameter(
+        1, ShapeUtil::MakeShape(F32, {}), "rhs"));
+    embedded_builder.AddInstruction(HloInstruction::CreateBinary(
+        ShapeUtil::MakeShape(PRED, {}), HloOpcode::kGe, lhs, rhs));
+    return module_->AddEmbeddedComputation(embedded_builder.Build());
+  }
+
   std::unique_ptr<HloModule> module_;
   std::unique_ptr<DynamicDimensionInference> inference_;
   const Shape scalar_shape_ = ShapeUtil::MakeShape(S32, {});
@@ -292,7 +303,8 @@ TEST_F(DynamicDimensionInferenceTest, ConvolutionTest) {
   Window window;
 
   auto* conv = builder.AddInstruction(HloInstruction::CreateConvolve(
-      zx_shape, a_param, b_param, /*feature_group_count=*/1, window, dnums,
+      zx_shape, a_param, b_param, /*feature_group_count=*/1,
+      /*batch_group_count=*/1, window, dnums,
       HloTestBase::DefaultPrecisionConfig(2)));
 
   module_->AddEntryComputation(builder.Build());
@@ -433,6 +445,96 @@ TEST_F(DynamicDimensionInferenceTest, BroadcastTest) {
   EXPECT_EQ(inference_->GetDynamicSize(broadcast, {}, 2), nullptr);
 }
 
+TEST_F(DynamicDimensionInferenceTest, WhileTest) {
+  // Test the ability to trace into while loops.
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {2, 4, 4});
+  auto output_shape = ShapeUtil::MakeShape(F32, {2, 2, 2});
+  auto tuple_shape = ShapeUtil::MakeTupleShape({input_shape, input_shape});
+
+  // Body:
+  //
+  //   Param
+  //   |  |
+  // GTE1 GTE2
+  //   |  |
+  //    ADD
+  auto body_builder = HloComputation::Builder("body");
+  auto body_param = body_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "param"));
+  auto gte_0 = body_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(input_shape, body_param, 0));
+  auto gte_1 = body_builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(input_shape, body_param, 1));
+  auto add = body_builder.AddInstruction(
+      HloInstruction::CreateBinary(input_shape, HloOpcode::kAdd, gte_0, gte_1));
+  body_builder.AddInstruction(HloInstruction::CreateTuple({add, add}));
+
+  HloComputation* body = module_->AddEmbeddedComputation(body_builder.Build());
+
+  auto cond_builder = HloComputation::Builder("condition");
+  cond_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, tuple_shape, "param"));
+  cond_builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<bool>(false)));
+  HloComputation* condition =
+      module_->AddEmbeddedComputation(cond_builder.Build());
+
+  // Entry:
+  //
+  //  Param
+  //   |
+  //  While
+  auto* a_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/0, tuple_shape, "A"));
+  auto* size_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/1, scalar_shape_, "size_param"));
+  builder.AddInstruction(
+      HloInstruction::CreateWhile(tuple_shape, condition, body, a_param));
+
+  module_->AddEntryComputation(builder.Build());
+
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{0, {0}, 0}));
+
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{0, {1}, 0}));
+
+  // Test that dynamic dimension inference does the right thing. A lambda is
+  // used here since we want to test twice by running inference again
+  // (idempotency).
+  auto test_dynamic_dimension = [&]() {
+    HloInstruction* while_hlo = nullptr;
+    // The while hlo has been replaced, find the new one.
+    for (HloInstruction* inst : module_->entry_computation()->instructions()) {
+      if (inst->opcode() == HloOpcode::kWhile) {
+        while_hlo = inst;
+      }
+    }
+    ASSERT_NE(while_hlo, nullptr);
+    // The original while shape has 2 parameters. With dynamic size passed in
+    // as an extra parameter, the tuple should have 3 elements.
+    EXPECT_EQ(while_hlo->shape().tuple_shapes_size(), 3);
+    HloInstruction* add = nullptr;
+    for (HloInstruction* inst : while_hlo->while_body()->instructions()) {
+      if (inst->opcode() == HloOpcode::kAdd) {
+        add = inst;
+      }
+    }
+    EXPECT_NE(add, nullptr);
+    EXPECT_NE(inference_->GetDynamicSize(add, {}, 0), nullptr);
+    EXPECT_EQ(inference_->GetDynamicSize(while_hlo, {0}, 0), size_param);
+    EXPECT_EQ(inference_->GetDynamicSize(while_hlo, {1}, 0), size_param);
+  };
+
+  TF_ASSERT_OK(RunInference());
+  test_dynamic_dimension();
+  TF_ASSERT_OK(RunInference());
+  test_dynamic_dimension();
+}
+
 TEST_F(DynamicDimensionInferenceTest, ReduceWindowBatchTest) {
   // Test the ability to trace reduce window batch dimensions.
   auto builder = HloComputation::Builder(TestName());
@@ -486,7 +588,7 @@ TEST_F(DynamicDimensionInferenceTest, SelectAndScatterTest) {
   // Test the ability to trace select and scatter batch dimensions.
   auto builder = HloComputation::Builder(TestName());
   auto input_shape = ShapeUtil::MakeShape(F32, {2, 4, 4});
-  auto output_shape = ShapeUtil::MakeShape(F32, {2, 2, 2});
+  auto source_shape = ShapeUtil::MakeShape(F32, {2, 2, 2});
 
   Window window;
   // First dimension is unchanged.
@@ -513,22 +615,26 @@ TEST_F(DynamicDimensionInferenceTest, SelectAndScatterTest) {
       /*parameter_number=*/0, input_shape, "A"));
   auto* size_param = builder.AddInstruction(HloInstruction::CreateParameter(
       /*parameter_number=*/1, scalar_shape_, "size_param"));
+  auto* source = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/2, source_shape, "B"));
 
   auto init = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0)));
 
-  auto* reduce_window =
-      builder.AddInstruction(HloInstruction::CreateReduceWindow(
-          output_shape, a_param, init, window, GetAdd()));
+  auto* sns = builder.AddInstruction(HloInstruction::CreateSelectAndScatter(
+      input_shape, a_param, GetGe(), window, source, init, GetAdd()));
 
   module_->AddEntryComputation(builder.Build());
 
   TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
       DynamicParameterBinding::DynamicParameter{1, {}},
       DynamicParameterBinding::DynamicDimension{0, {}, 0}));
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{2, {}, 0}));
 
   TF_ASSERT_OK(RunInference());
-  EXPECT_EQ(inference_->GetDynamicSize(reduce_window, {}, 0), size_param);
+  EXPECT_EQ(inference_->GetDynamicSize(sns, {}, 0), size_param);
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/dynamic_index_splitter.cc b/tensorflow/compiler/xla/service/dynamic_index_splitter.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e34adfd2d2bbb7214cfa2da28291b133538845e5
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dynamic_index_splitter.cc
@@ -0,0 +1,99 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/dynamic_index_splitter.h"
+
+#include <map>
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
+#include "absl/container/inlined_vector.h"
+#include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_instructions.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+
+namespace xla {
+
+StatusOr<bool> DynamicIndexSplitter::Run(HloModule* module) {
+  bool changed = false;
+
+  std::vector<HloComputation*> computations =
+      module->MakeNonfusionComputations();
+  for (HloComputation* computation : computations) {
+    for (HloInstruction* dynamic_op : computation->MakeInstructionPostOrder()) {
+      switch (dynamic_op->opcode()) {
+        case HloOpcode::kDynamicSlice:
+        case HloOpcode::kDynamicUpdateSlice:
+          break;
+        default:
+          continue;
+      }
+      auto parent = dynamic_op->parent();
+      bool is_update = dynamic_op->opcode() == HloOpcode::kDynamicUpdateSlice;
+      int64 num_indices = dynamic_op->operand(0)->shape().rank();
+
+      if (num_indices == 0) {
+        // If the operand rank is 0, directly replace R0 DS/DUS with the
+        // operand (for DS) or update (for DUS).
+        if (is_update) {
+          TF_CHECK_OK(parent->ReplaceInstruction(
+              dynamic_op, dynamic_op->mutable_operand(1)));
+        } else {
+          TF_CHECK_OK(parent->ReplaceInstruction(
+              dynamic_op, dynamic_op->mutable_operand(0)));
+        }
+        changed = true;
+        continue;
+      }
+
+      int64 index_operand_number = Cast<HloDynamicIndexInstruction>(dynamic_op)
+                                       ->first_index_operand_number();
+      auto index_operand = dynamic_op->mutable_operand(index_operand_number);
+      if (ShapeUtil::IsScalar(index_operand->shape())) {
+        // This DS/DUS already uses scalar indices.
+        continue;
+      }
+      TF_RET_CHECK(index_operand->shape().rank() == 1);
+      auto index_element_type = index_operand->shape().element_type();
+      std::vector<HloInstruction*> index_array;
+      for (int64 dim = 0; dim < num_indices; ++dim) {
+        auto slice = parent->AddInstruction(HloInstruction::CreateSlice(
+            ShapeUtil::MakeShape(index_element_type, {1}), index_operand, {dim},
+            {dim + 1}, {1}));
+        auto bitcast = parent->AddInstruction(HloInstruction::CreateReshape(
+            ShapeUtil::MakeShape(index_element_type, {}), slice));
+        index_array.push_back(bitcast);
+      }
+      auto new_dynamic_op =
+          is_update
+              ? HloInstruction::CreateDynamicUpdateSlice(
+                    dynamic_op->shape(), dynamic_op->mutable_operand(0),
+                    dynamic_op->mutable_operand(1), absl::MakeSpan(index_array))
+              : HloInstruction::CreateDynamicSlice(
+                    dynamic_op->shape(), dynamic_op->mutable_operand(0),
+                    absl::MakeSpan(index_array),
+                    dynamic_op->dynamic_slice_sizes());
+      TF_CHECK_OK(parent->ReplaceWithNewInstruction(dynamic_op,
+                                                    std::move(new_dynamic_op)));
+      changed = true;
+    }
+  }
+  return changed;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/dynamic_index_splitter.h b/tensorflow/compiler/xla/service/dynamic_index_splitter.h
new file mode 100644
index 0000000000000000000000000000000000000000..3c12e3a4af287ad2272a08ba54cd99c2cad9d451
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dynamic_index_splitter.h
@@ -0,0 +1,37 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_INDEX_SPLITTER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_INDEX_SPLITTER_H_
+
+#include "absl/strings/string_view.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/compiler/xla/statusor.h"
+
+namespace xla {
+
+// Convert R1 index operands to DynamicSlice and DynamicUpdateSlice ops into
+// separate scalars.
+class DynamicIndexSplitter : public HloModulePass {
+ public:
+  DynamicIndexSplitter() = default;
+  absl::string_view name() const override { return "dynamic-index-splitter"; }
+  StatusOr<bool> Run(HloModule* module) override;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_INDEX_SPLITTER_H_
diff --git a/tensorflow/compiler/xla/service/dynamic_index_splitter_test.cc b/tensorflow/compiler/xla/service/dynamic_index_splitter_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..98029d1faff7d669730f6b66e38fcefece70f0eb
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dynamic_index_splitter_test.cc
@@ -0,0 +1,134 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/dynamic_index_splitter.h"
+
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+
+namespace xla {
+namespace {
+
+namespace op = xla::testing::opcode_matchers;
+class DynamicIndexSplitterTest : public HloTestBase {};
+
+TEST_F(DynamicIndexSplitterTest, DynamicSlice) {
+  const char* const kDynamicSlice = R"(
+    HloModule DynamicSlice_module
+
+    ENTRY entry (operand: s32[4,5,6], indices: s32[3]) -> s32[1,1,1] {
+      operand = s32[4,5,6] parameter(0)
+      indices = s32[3] parameter(1)
+      ROOT dynamic-slice = s32[1,1,1] dynamic-slice(operand, indices), dynamic_slice_sizes={1,1,1}
+    }
+  )";
+
+  HloModuleConfig config;
+  DebugOptions debug_options = config.debug_options();
+  debug_options.set_xla_allow_scalar_index_dynamic_ops(true);
+  config.set_debug_options(debug_options);
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(kDynamicSlice, config));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed,
+                          DynamicIndexSplitter().Run(module.get()));
+  EXPECT_TRUE(changed);
+  ASSERT_THAT(module->entry_computation()->root_instruction(),
+              op::DynamicSlice(op::Parameter(0),
+                               op::Reshape(op::Slice(op::Parameter(1))),
+                               op::Reshape(op::Slice(op::Parameter(1))),
+                               op::Reshape(op::Slice(op::Parameter(1)))));
+
+  for (int i = 0; i < 3; ++i) {
+    const HloInstruction* slice = module->entry_computation()
+                                      ->root_instruction()
+                                      ->operand(i + 1)
+                                      ->operand(0);
+    EXPECT_EQ(slice->slice_starts(0), i);
+    EXPECT_EQ(slice->slice_limits(0), i + 1);
+  }
+}
+
+TEST_F(DynamicIndexSplitterTest, DynamicUpdateSlice) {
+  const char* const kDynamicUpdateSlice = R"(
+    HloModule DynamicUpdatedSlice_module
+
+    ENTRY entry (operand: s32[4,5,6], indices: s32[3], update: s32[1,1,1]) -> s32[4,5,6] {
+      operand = s32[4,5,6] parameter(0)
+      indices = s32[3] parameter(1)
+      update = s32[1,1,1] parameter(2)
+      ROOT dynamic-update-slice = s32[4,5,6] dynamic-update-slice(operand, update, indices)
+    }
+  )";
+
+  HloModuleConfig config;
+  DebugOptions debug_options = config.debug_options();
+  debug_options.set_xla_allow_scalar_index_dynamic_ops(true);
+  config.set_debug_options(debug_options);
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseHloString(kDynamicUpdateSlice, config));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed,
+                          DynamicIndexSplitter().Run(module.get()));
+  EXPECT_TRUE(changed);
+  ASSERT_THAT(module->entry_computation()->root_instruction(),
+              op::DynamicUpdateSlice(op::Parameter(0), op::Parameter(2),
+                                     op::Reshape(op::Slice(op::Parameter(1))),
+                                     op::Reshape(op::Slice(op::Parameter(1))),
+                                     op::Reshape(op::Slice(op::Parameter(1)))));
+
+  for (int i = 0; i < 3; ++i) {
+    const HloInstruction* slice = module->entry_computation()
+                                      ->root_instruction()
+                                      ->operand(i + 2)
+                                      ->operand(0);
+    EXPECT_EQ(slice->slice_starts(0), i);
+    EXPECT_EQ(slice->slice_limits(0), i + 1);
+  }
+}
+
+TEST_F(DynamicIndexSplitterTest, AlreadyScalar) {
+  const char* const kDynamicSlice = R"(
+    HloModule DynamicSlice_module
+
+    ENTRY entry (operand: s32[4,5,6], index.0: s32[], index.1: s32[], index.2: s32[]) -> s32[1,1,1] {
+      operand = s32[4,5,6] parameter(0)
+      index.0 = s32[] parameter(1)
+      index.1 = s32[] parameter(2)
+      index.2 = s32[] parameter(3)
+      ROOT dynamic-slice = s32[1,1,1] dynamic-slice(operand, index.0, index.1, index.2), dynamic_slice_sizes={1,1,1}
+    }
+  )";
+
+  HloModuleConfig config;
+  DebugOptions debug_options = config.debug_options();
+  debug_options.set_xla_allow_scalar_index_dynamic_ops(true);
+  config.set_debug_options(debug_options);
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(kDynamicSlice, config));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed,
+                          DynamicIndexSplitter().Run(module.get()));
+  EXPECT_FALSE(changed);
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              op::DynamicSlice(op::Parameter(0), op::Parameter(1),
+                               op::Parameter(2), op::Parameter(3)));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/dynamic_padder.cc b/tensorflow/compiler/xla/service/dynamic_padder.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4db280f817141bd52e3a5b9564600a618f81aeac
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dynamic_padder.cc
@@ -0,0 +1,161 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/compiler/xla/service/dynamic_padder.h"
+
+#include <algorithm>
+#include <vector>
+
+#include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_map.h"
+
+#include "absl/container/flat_hash_set.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/dynamic_dimension_inference.h"
+#include "tensorflow/compiler/xla/service/hlo_dce.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/util.h"
+
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace xla {
+
+namespace {
+
+// ChooseIdentityValue looks at the instruction and returns a identity value
+// which, when padded, doesn't change the result of the instruction.
+//
+// nullopt is returned if padding doesn't need to be reset.
+StatusOr<HloInstruction*> ChooseIdentityValue(HloInstruction* inst) {
+  HloComputation* comp = inst->parent();
+  // Padding on elementwise operation doesn't affect the result of the effective
+  // data.
+  if (inst->IsElementwise()) {
+    return nullptr;
+  }
+
+  switch (inst->opcode()) {
+    case HloOpcode::kReduce:
+    case HloOpcode::kReduceWindow: {
+      // Because of the way we do reduce, we already require the `init` operand
+      // of hlo reduce instruction to be identity value. Here we reuse the
+      // operand.
+      return inst->mutable_operand(1);
+    }
+
+    case HloOpcode::kConvolution:
+    case HloOpcode::kDot: {
+      // Use 0 as padding value for convolution and dot.
+      PrimitiveType ptype = inst->shape().element_type();
+      return comp->AddInstruction(
+          HloInstruction::CreateConstant(LiteralUtil::Zero(ptype)));
+    }
+
+    case HloOpcode::kPad: {
+      return inst->mutable_operand(1);
+    }
+    case HloOpcode::kParameter:
+    case HloOpcode::kGetDimensionSize:
+    case HloOpcode::kReshape:
+    case HloOpcode::kTuple:
+    case HloOpcode::kAllReduce:
+    case HloOpcode::kBroadcast:
+      return nullptr;
+    default:
+      return UnimplementedStrCat("Unimplimented padding for instruction: ",
+                                 inst->ToString());
+  }
+}
+
+}  // namespace
+
+StatusOr<bool> DynamicPadder::Run(HloModule* module) {
+  bool changed = false;
+  VLOG(2) << "Pre DynamicPadder HLO:";
+  XLA_VLOG_LINES(2, module->ToString());
+  TF_ASSIGN_OR_RETURN(DynamicDimensionInference dynamic_dimension_inference,
+                      DynamicDimensionInference::Run(module));
+
+  for (HloComputation* computation : module->computations()) {
+    for (HloInstruction* inst : computation->instructions()) {
+      for (int64 operand_num = 0; operand_num < inst->operand_count();
+           ++operand_num) {
+        HloInstruction* operand = inst->mutable_operand(operand_num);
+        if (!operand->shape().IsArray()) {
+          continue;
+        }
+        for (int64 dim = 0; dim < operand->shape().rank(); ++dim) {
+          HloInstruction* dynamic_size =
+              dynamic_dimension_inference.GetDynamicSize(operand, {}, dim);
+          if (dynamic_size == nullptr) {
+            continue;
+          }
+          VLOG(1) << "Has dynamic dimension of operand" << operand_num << " @"
+                  << dim;
+          TF_ASSIGN_OR_RETURN(HloInstruction * identity_value,
+                              ChooseIdentityValue(inst));
+          if (identity_value == nullptr) {
+            continue;
+          }
+
+          // For each dimension, first generates a mask representing the
+          // effective area of data and padded area of data using iota and
+          // dynamic_size. For example, given a dimension of 7 elements and 5
+          // effective elements:
+          //
+          // iota = [0, 1, 2, 3, 4, 5, 6]
+          // broadcast_dynamic_size = [5, 5, 5, 5, 5, 5, 5]
+          // mask = lt(iota, broadcast_dynamic_size) = [t, t, t, t, t, f, f]
+          //
+          // Once the mask is generated, the input data is then padded using the
+          // mask and pad value.
+          //
+          const Shape mask_shape =
+              ShapeUtil::ChangeElementType(operand->shape(), xla::U32);
+          const Shape pred_shape =
+              ShapeUtil::ChangeElementType(operand->shape(), xla::PRED);
+          HloInstruction* iota = computation->AddInstruction(
+              HloInstruction::CreateIota(mask_shape, dim));
+
+          HloInstruction* broadcasted_effective_size =
+              computation->AddInstruction(HloInstruction::CreateBroadcast(
+                  mask_shape, dynamic_size, {}));
+          HloInstruction* pred = computation->AddInstruction(
+              HloInstruction::CreateBinary(pred_shape, HloOpcode::kLt, iota,
+                                           broadcasted_effective_size));
+
+          HloInstruction* broadcasted_identity_value =
+              computation->AddInstruction(HloInstruction::CreateBroadcast(
+                  operand->shape(), identity_value, {}));
+          HloInstruction* padded =
+              computation->AddInstruction(HloInstruction::CreateTernary(
+                  operand->shape(), HloOpcode::kSelect, pred, operand,
+                  broadcasted_identity_value));
+          TF_RETURN_IF_ERROR(inst->ReplaceOperandWith(operand_num, padded));
+          operand = inst->mutable_operand(operand_num);
+          changed = true;
+        }
+      }
+    }
+  }
+  HloDCE dce;
+  TF_ASSIGN_OR_RETURN(changed, dce.Run(module));
+  VLOG(2) << "Post DynamicPadder HLO:";
+  XLA_VLOG_LINES(2, module->ToString());
+  return changed;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/dynamic_padder.h b/tensorflow/compiler/xla/service/dynamic_padder.h
new file mode 100644
index 0000000000000000000000000000000000000000..509269f7f56746fa5516ad917a04221587c6dcca
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dynamic_padder.h
@@ -0,0 +1,44 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_PADDER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_PADDER_H_
+
+#include "tensorflow/compiler/xla/service/dynamic_dimension_inference.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+
+// With bounded shapes, only part of the shape contains effective data and the
+// rest contains padded data, whose value can be anything depending on the
+// source of the data. When a bounded shape is directly consumed by an
+// instruction that collapses dimensions (reduce for example), the padding data
+// would affect result of the instruction.
+//
+// DynamicPadder uses DynamicDimensionInference to detect bounded shapes in a
+// hlo module, it then inserts certain instructions to reset the padding into an
+// identity value so that in doesn't affect the result of subsequent
+// instruction. For example, it'd reset the padding to 0 before a bounded shape
+// is consumed by a reduce-sum.
+class DynamicPadder : public HloModulePass {
+ public:
+  absl::string_view name() const override { return "dynamic_padder"; }
+
+  StatusOr<bool> Run(HloModule* module) override;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_PADDER_H_
diff --git a/tensorflow/compiler/xla/service/dynamic_padder_test.cc b/tensorflow/compiler/xla/service/dynamic_padder_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..55a11286e4596d87c330315322cae704fc5cd707
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dynamic_padder_test.cc
@@ -0,0 +1,152 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/dynamic_padder.h"
+
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_runner.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace op = xla::testing::opcode_matchers;
+
+namespace xla {
+namespace {
+
+class DynamicPadderTest : public HloTestBase {
+ protected:
+  DynamicPadderTest() : HloTestBase() { module_ = CreateNewVerifiedModule(); }
+
+  StatusOr<bool> RunPadder() {
+    hlo_graph_dumper::MaybeDumpHloModule(*module_, "Before padder");
+
+    DynamicPadder padder;
+
+    return padder.Run(module_.get());
+  }
+
+  void ExpectPadded(const HloInstruction* inst) {
+    EXPECT_THAT(inst,
+                op::Select(op::Lt(op::Iota(), op::Broadcast(op::Parameter())),
+                           ::testing::_, op::Broadcast()));
+  }
+
+  HloComputation* GetScalarAddComputation() {
+    auto embedded_builder = HloComputation::Builder("add");
+    auto lhs = embedded_builder.AddInstruction(HloInstruction::CreateParameter(
+        0, ShapeUtil::MakeShape(F32, {}), "lhs"));
+    auto rhs = embedded_builder.AddInstruction(HloInstruction::CreateParameter(
+        1, ShapeUtil::MakeShape(F32, {}), "rhs"));
+    embedded_builder.AddInstruction(
+        HloInstruction::CreateBinary(lhs->shape(), HloOpcode::kAdd, lhs, rhs));
+    return module_->AddEmbeddedComputation(embedded_builder.Build());
+  }
+
+  std::unique_ptr<HloModule> module_;
+  const Shape scalar_shape_ = ShapeUtil::MakeShape(U32, {});
+};
+
+TEST_F(DynamicPadderTest, ReduceTest) {
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {1, 2, 2});
+  auto reduce_shape = ShapeUtil::MakeShape(F32, {2});
+
+  auto data_param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, input_shape, "data_param"));
+  builder.AddInstruction(
+      HloInstruction::CreateParameter(1, scalar_shape_, "size_param"));
+
+  auto negate = builder.AddInstruction(
+      HloInstruction::CreateUnary(input_shape, HloOpcode::kNegate, data_param));
+
+  auto init = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0)));
+
+  auto reduce = builder.AddInstruction(HloInstruction::CreateReduce(
+      reduce_shape, negate, init, {0, 2}, GetScalarAddComputation()));
+
+  module_->AddEntryComputation(builder.Build());
+
+  // Set up dynamic parameter binding.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 1}));
+
+  TF_ASSERT_OK(RunPadder().status());
+
+  ExpectPadded(reduce->operand(0));
+}
+
+TEST_F(DynamicPadderTest, ConvolutionTest) {
+  auto builder = HloComputation::Builder(TestName());
+  constexpr int xdim = 3;
+  constexpr int ydim = 2;
+  constexpr int zdim = 1;
+  auto xy_shape = ShapeUtil::MakeShape(F32, {xdim, ydim});
+  auto yz_shape = ShapeUtil::MakeShape(F32, {ydim, zdim});
+  auto zx_shape = ShapeUtil::MakeShape(F32, {zdim, xdim});
+
+  auto* a_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/0, xy_shape, "A"));
+  auto* b_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/1, yz_shape, "B"));
+  builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/2, scalar_shape_, "size_param"));
+
+  auto dnums = XlaBuilder::CreateDefaultConvDimensionNumbers(0);
+
+  dnums.set_kernel_input_feature_dimension(0);
+  dnums.set_kernel_output_feature_dimension(1);
+  dnums.set_input_batch_dimension(0);
+  dnums.set_output_batch_dimension(1);
+  dnums.set_output_feature_dimension(0);
+
+  Window window;
+
+  auto* conv = builder.AddInstruction(HloInstruction::CreateConvolve(
+      zx_shape, a_param, b_param, /*feature_group_count=*/1,
+      /*batch_group_count=*/1, window, dnums,
+      HloTestBase::DefaultPrecisionConfig(2)));
+
+  module_->AddEntryComputation(builder.Build());
+
+  // Set up dynamic parameter binding for non-contracting dimension.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{2, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 0}));
+
+  // Set up binding for contracting dimensions.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{2, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 1}));
+
+  TF_ASSERT_OK(RunPadder().status());
+
+  ExpectPadded(conv->operand(0));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/dynamic_parameter_binding.cc b/tensorflow/compiler/xla/service/dynamic_parameter_binding.cc
index c8bfc8905064bcd7b68fe259fbcc1546ff083dbd..7f0ae692f7414dbdcccda8b287c9059bcf920df1 100644
--- a/tensorflow/compiler/xla/service/dynamic_parameter_binding.cc
+++ b/tensorflow/compiler/xla/service/dynamic_parameter_binding.cc
@@ -29,7 +29,8 @@ Status DynamicParameterBinding::Bind(
 }
 
 absl::optional<DynamicParameterBinding::DynamicParameter>
-DynamicParameterBinding::GetBinding(const DynamicDimension& dynamic_dimension) {
+DynamicParameterBinding::GetBinding(
+    const DynamicDimension& dynamic_dimension) const {
   auto param_iter = bindings_.find(dynamic_dimension);
   if (param_iter == bindings_.end()) {
     return absl::nullopt;
@@ -70,7 +71,7 @@ StatusOr<DynamicParameterBinding> DynamicParameterBinding::CreateFromProto(
     int64 target_param_num = binding.target_param_num();
     ShapeIndex target_param_index(binding.target_param_index().begin(),
                                   binding.target_param_index().end());
-    int64 target_dim_num = binding.target_param_num();
+    int64 target_dim_num = binding.target_param_dim_num();
 
     TF_RETURN_IF_ERROR(
         result.Bind(DynamicParameter{dynamic_param_num, dynamic_param_index},
@@ -111,7 +112,8 @@ Status DynamicParameterBinding::Verify(const HloModule& module) const {
   return ForEachBinding([&](const DynamicParameter& dynamic_parameter,
                             const DynamicDimension& dynamic_dimension)
                             -> Status {
-    TF_RET_CHECK(dynamic_parameter.parameter_num < entry->num_parameters());
+    TF_RET_CHECK(dynamic_parameter.parameter_num >= 0 &&
+                 dynamic_parameter.parameter_num < entry->num_parameters());
     TF_RET_CHECK(dynamic_dimension.parameter_num < entry->num_parameters());
     TF_RET_CHECK(ShapeUtil::IndexIsValid(
         entry->parameter_instruction(dynamic_parameter.parameter_num)->shape(),
@@ -121,10 +123,11 @@ Status DynamicParameterBinding::Verify(const HloModule& module) const {
         dynamic_dimension.parameter_index));
     TF_RET_CHECK(
         dynamic_dimension.dimension <
-        ShapeUtil::Rank(ShapeUtil::GetSubshape(
+        ShapeUtil::GetSubshape(
             entry->parameter_instruction(dynamic_dimension.parameter_num)
                 ->shape(),
-            dynamic_dimension.parameter_index)));
+            dynamic_dimension.parameter_index)
+            .rank());
     return Status::OK();
   });
 }
diff --git a/tensorflow/compiler/xla/service/dynamic_parameter_binding.h b/tensorflow/compiler/xla/service/dynamic_parameter_binding.h
index dd474d8eed1b2c30ddb8f624a864198c74eacaba..57af2c43d3c65f7340e6a9f04e5abbf052ebceea 100644
--- a/tensorflow/compiler/xla/service/dynamic_parameter_binding.h
+++ b/tensorflow/compiler/xla/service/dynamic_parameter_binding.h
@@ -89,7 +89,7 @@ class DynamicParameterBinding {
   //
   // Returns nullopt if the binding is not set.
   absl::optional<DynamicParameter> GetBinding(
-      const DynamicDimension& dynamic_dimension);
+      const DynamicDimension& dynamic_dimension) const;
 
   using BindingFn =
       std::function<Status(const DynamicParameter& dynamic_parameter,
diff --git a/tensorflow/compiler/xla/service/dynamic_parameter_binding_test.cc b/tensorflow/compiler/xla/service/dynamic_parameter_binding_test.cc
index 83a6d83dffde7995bd8e43917d13c5fd2705ba6f..b5d57cda4f469a384dc0affdae9e5f93a70ac418 100644
--- a/tensorflow/compiler/xla/service/dynamic_parameter_binding_test.cc
+++ b/tensorflow/compiler/xla/service/dynamic_parameter_binding_test.cc
@@ -33,7 +33,15 @@ limitations under the License.
 
 namespace xla {
 namespace {
-class DynamicParameterBindingTest : public HloTestBase {};
+class DynamicParameterBindingTest : public HloTestBase {
+ protected:
+  // Serialize and then deserialize a binding.
+  void SerializeAndDeserialize(DynamicParameterBinding* binding) {
+    DynamicParameterBindingProto proto = binding->ToProto();
+    TF_ASSERT_OK_AND_ASSIGN(*binding,
+                            DynamicParameterBinding::CreateFromProto(proto));
+  }
+};
 
 TEST_F(DynamicParameterBindingTest, SimpleBinding) {
   // 'b' is a dynamic shape; 'a' represents the real size of b's first
@@ -56,15 +64,20 @@ ENTRY main {
       binding.Bind(DynamicParameterBinding::DynamicParameter{0, {}},
                    DynamicParameterBinding::DynamicDimension{1, {}, 0}));
 
-  absl::optional<DynamicParameterBinding::DynamicParameter> param =
-      binding.GetBinding(
-          DynamicParameterBinding::DynamicDimension{/*parameter_num=*/1,
-                                                    /*parameter_index=*/{},
-                                                    /*dimension=*/0});
-  EXPECT_TRUE(param);
-  EXPECT_EQ(param->parameter_num, 0);
-  EXPECT_EQ(param->parameter_index, ShapeIndex({}));
-  TF_EXPECT_OK(binding.Verify(*module));
+  auto test = [&](const DynamicParameterBinding& binding) {
+    absl::optional<DynamicParameterBinding::DynamicParameter> param =
+        binding.GetBinding(
+            DynamicParameterBinding::DynamicDimension{/*parameter_num=*/1,
+                                                      /*parameter_index=*/{},
+                                                      /*dimension=*/0});
+    EXPECT_TRUE(param);
+    EXPECT_EQ(param->parameter_num, 0);
+    EXPECT_EQ(param->parameter_index, ShapeIndex({}));
+    TF_EXPECT_OK(binding.Verify(*module));
+  };
+  test(binding);
+  SerializeAndDeserialize(&binding);
+  test(binding);
 }
 
 TEST_F(DynamicParameterBindingTest, TupleBinding) {
@@ -89,16 +102,21 @@ ENTRY main {
       binding.Bind(DynamicParameterBinding::DynamicParameter{0, {0}},
                    DynamicParameterBinding::DynamicDimension{0, {1}, 0}));
 
-  absl::optional<DynamicParameterBinding::DynamicParameter> param =
-      binding.GetBinding(
-          DynamicParameterBinding::DynamicDimension{/*parameter_num=*/0,
-                                                    /*parameter_index=*/{1},
-                                                    /*dimension=*/0});
-
-  EXPECT_TRUE(param);
-  EXPECT_EQ(param->parameter_num, 0);
-  EXPECT_EQ(param->parameter_index, ShapeIndex({0}));
-  TF_EXPECT_OK(binding.Verify(*module));
+  auto test = [&](const DynamicParameterBinding& binding) {
+    absl::optional<DynamicParameterBinding::DynamicParameter> param =
+        binding.GetBinding(
+            DynamicParameterBinding::DynamicDimension{/*parameter_num=*/0,
+                                                      /*parameter_index=*/{1},
+                                                      /*dimension=*/0});
+
+    EXPECT_TRUE(param);
+    EXPECT_EQ(param->parameter_num, 0);
+    EXPECT_EQ(param->parameter_index, ShapeIndex({0}));
+    TF_EXPECT_OK(binding.Verify(*module));
+  };
+  test(binding);
+  SerializeAndDeserialize(&binding);
+  test(binding);
 }
 
 TEST_F(DynamicParameterBindingTest, TupleBindingWithMultiDimension) {
@@ -127,26 +145,35 @@ ENTRY main {
       binding.Bind(DynamicParameterBinding::DynamicParameter{0, {0}},
                    DynamicParameterBinding::DynamicDimension{0, {1}, 1}));
 
-  absl::optional<DynamicParameterBinding::DynamicParameter> param =
-      binding.GetBinding(
-          DynamicParameterBinding::DynamicDimension{/*parameter_num=*/0,
-                                                    /*parameter_index=*/{1},
-                                                    /*dimension=*/0});
-
-  EXPECT_TRUE(param);
-  EXPECT_EQ(param->parameter_num, 0);
-  EXPECT_EQ(param->parameter_index, ShapeIndex({0}));
-
-  absl::optional<DynamicParameterBinding::DynamicParameter> param2 =
-      binding.GetBinding(
-          DynamicParameterBinding::DynamicDimension{/*parameter_num=*/0,
-                                                    /*parameter_index=*/{1},
-                                                    /*dimension=*/0});
-  EXPECT_TRUE(param2);
-  EXPECT_EQ(param2->parameter_num, 0);
-  EXPECT_EQ(param2->parameter_index, ShapeIndex({0}));
-
-  TF_EXPECT_OK(binding.Verify(*module));
+  auto test = [&](const DynamicParameterBinding& binding) {
+    absl::optional<DynamicParameterBinding::DynamicParameter> param =
+        binding.GetBinding(
+            DynamicParameterBinding::DynamicDimension{/*parameter_num=*/0,
+                                                      /*parameter_index=*/{1},
+                                                      /*dimension=*/0});
+
+    EXPECT_TRUE(param);
+    EXPECT_EQ(param->parameter_num, 0);
+    EXPECT_EQ(param->parameter_index, ShapeIndex({0}));
+
+    absl::optional<DynamicParameterBinding::DynamicParameter> param2 =
+
+        binding.GetBinding(
+            DynamicParameterBinding::DynamicDimension{/*parameter_num=*/0,
+                                                      /*parameter_index=*/{1},
+                                                      /*dimension=*/0});
+    EXPECT_TRUE(param2);
+    EXPECT_EQ(param2->parameter_num, 0);
+    EXPECT_EQ(param2->parameter_index, ShapeIndex({0}));
+    TF_EXPECT_OK(binding.Verify(*module));
+  };
+
+  test(binding);
+
+  SerializeAndDeserialize(&binding);
+
+  // Test the binding again after deserialization.
+  test(binding);
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
index 6f1f95f2e9082649b6ca9cc0da5c238e15b77c10..a62a743802456d0239438a12884f5a594aa05798 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
@@ -423,6 +423,10 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatUnaryOp(
       return EmitSin(op->shape().element_type(), operand_value);
     case HloOpcode::kTanh:
       return EmitTanh(op->shape().element_type(), operand_value);
+    case HloOpcode::kSqrt:
+      return EmitSqrt(op->shape().element_type(), operand_value);
+    case HloOpcode::kRsqrt:
+      return EmitRsqrt(op->shape().element_type(), operand_value);
     case HloOpcode::kFloor:
       return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::floor,
                                           {operand_value},
@@ -440,14 +444,16 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatUnaryOp(
                                           {operand_value},
                                           {operand_value->getType()}, b_);
     case HloOpcode::kSign: {
-      // TODO(b/32151903): Ensure consistent sign behavior for -0.0.
       auto type = operand_value->getType();
       auto zero = llvm::ConstantFP::get(type, 0.0);
-      auto oeq = FCmpOEQ(operand_value, zero);
-      auto olt = FCmpOLT(operand_value, zero);
-      return Select(oeq, zero,
-                    Select(olt, llvm::ConstantFP::get(type, -1.0),
-                           llvm::ConstantFP::get(type, 1.0)));
+      auto ne0_i1 = FCmpONE(operand_value, zero);
+      auto ne0_float = UIToFP(ne0_i1, type);
+      llvm::Value* result = llvm_ir::EmitCallToIntrinsic(
+          llvm::Intrinsic::copysign, {ne0_float, operand_value},
+          {operand_value->getType()}, b_);
+      auto is_nan = FCmpUNO(operand_value, operand_value);
+      result = Select(is_nan, operand_value, result);
+      return result;
     }
     case HloOpcode::kIsFinite: {
       // abs(x) o!= inf, this works because the comparison returns false if
@@ -653,6 +659,20 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
           EmitComposeComplex(op, FDiv(EmitExtractReal(operand_value), cplx_abs),
                              FDiv(EmitExtractImag(operand_value), cplx_abs)));
     }
+    case HloOpcode::kSqrt: {
+      auto a = EmitExtractReal(operand_value);
+      auto b = EmitExtractImag(operand_value);
+      auto c = llvm::ConstantFP::get(a->getType(), 0.5);
+      auto d = llvm::ConstantFP::get(b->getType(), 0.0);
+      return EmitComplexPower(op, a, b, c, d);
+    }
+    case HloOpcode::kRsqrt: {
+      auto a = EmitExtractReal(operand_value);
+      auto b = EmitExtractImag(operand_value);
+      auto c = llvm::ConstantFP::get(a->getType(), -0.5);
+      auto d = llvm::ConstantFP::get(b->getType(), 0.0);
+      return EmitComplexPower(op, a, b, c, d);
+    }
     case HloOpcode::kNegate:
       return EmitComposeComplex(op, FNeg(EmitExtractReal(operand_value)),
                                 FNeg(EmitExtractImag(operand_value)));
@@ -736,6 +756,43 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatBinaryOp(
   }
 }
 
+// (a+bi)^(c+di) =
+//    (a*a+b*b)^(0.5c) * exp(-d*atan2(b,a)) * (cos(q) + i*sin(q)),
+//    where q = c*atan2(b,a)+0.5d*ln(a*a+b*b)
+StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexPower(
+    const HloInstruction* op, llvm::Value* a, llvm::Value* b, llvm::Value* c,
+    llvm::Value* d) {
+  PrimitiveType component_type =
+      primitive_util::ComplexComponentType(op->shape().element_type());
+  auto aa_p_bb = FAdd(FMul(a, a), FMul(b, b));
+  auto zero = llvm::ConstantFP::get(a->getType(), 0);
+  auto one_half = llvm::ConstantFP::get(a->getType(), 0.5);
+  auto one = llvm::ConstantFP::get(a->getType(), 1);
+  auto half_c = FMul(one_half, c);
+
+  TF_ASSIGN_OR_RETURN(auto aa_p_bb_to_half_c,
+                      EmitPow(component_type, aa_p_bb, half_c));
+
+  auto neg_d = FNeg(d);
+  TF_ASSIGN_OR_RETURN(auto arg_lhs, EmitAtan2(component_type, b, a));
+  auto neg_d_arg_lhs = FMul(neg_d, arg_lhs);
+  TF_ASSIGN_OR_RETURN(auto e_to_neg_d_arg_lhs,
+                      EmitExp(component_type, neg_d_arg_lhs));
+  auto coeff = FMul(aa_p_bb_to_half_c, e_to_neg_d_arg_lhs);
+  TF_ASSIGN_OR_RETURN(auto ln_aa_p_bb, EmitLog(component_type, aa_p_bb));
+  auto half_d = FMul(one_half, d);
+  auto q = FAdd(FMul(c, arg_lhs), FMul(half_d, ln_aa_p_bb));
+  TF_ASSIGN_OR_RETURN(auto cos_q, EmitCos(component_type, q));
+  TF_ASSIGN_OR_RETURN(auto sin_q, EmitSin(component_type, q));
+  // 0^c is 0 if d is 0 and c > 0. 0^0 is defined to be 1.0, see
+  // Branch Cuts for Complex Elementary Functions or Much Ado About
+  // Nothing's Sign Bit, W. Kahan, Section 10.
+  return Select(
+      And(And(FCmpOEQ(aa_p_bb, zero), FCmpOEQ(d, zero)), FCmpOLE(zero, c)),
+      EmitComposeComplex(op, Select(FCmpOEQ(zero, c), one, zero), zero),
+      EmitComposeComplex(op, FMul(coeff, cos_q), FMul(coeff, sin_q)));
+}
+
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexBinaryOp(
     const HloInstruction* op, llvm::Value* lhs_value, llvm::Value* rhs_value) {
   switch (op->opcode()) {
@@ -802,33 +859,11 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexBinaryOp(
                                         EmitExtractImag(rhs_value), b_));
 
     case HloOpcode::kPower: {
-      // (a+bi)^(c+di) =
-      //    (a*a+b*b)^(0.5c) * exp(-d*atan2(b,a)) * (cos(q) + i*sin(q)),
-      //    where q = c*atan2(b,a)+0.5d*ln(a*a+b*b)
-      PrimitiveType component_type =
-          primitive_util::ComplexComponentType(op->shape().element_type());
       auto a = EmitExtractReal(lhs_value);
       auto b = EmitExtractImag(lhs_value);
       auto c = EmitExtractReal(rhs_value);
       auto d = EmitExtractImag(rhs_value);
-      auto aa_p_bb = FAdd(FMul(a, a), FMul(b, b));
-      auto one_half = llvm::ConstantFP::get(a->getType(), 0.5);
-      auto half_c = FMul(one_half, c);
-
-      TF_ASSIGN_OR_RETURN(auto aa_p_bb_to_half_c,
-                          EmitPow(component_type, aa_p_bb, half_c));
-      auto neg_d = FNeg(d);
-      TF_ASSIGN_OR_RETURN(auto arg_lhs, EmitAtan2(component_type, b, a));
-      auto neg_d_arg_lhs = FMul(neg_d, arg_lhs);
-      TF_ASSIGN_OR_RETURN(auto e_to_neg_d_arg_lhs,
-                          EmitExp(component_type, neg_d_arg_lhs));
-      auto coeff = FMul(aa_p_bb_to_half_c, e_to_neg_d_arg_lhs);
-      TF_ASSIGN_OR_RETURN(auto ln_aa_p_bb, EmitLog(component_type, aa_p_bb));
-      auto half_d = FMul(one_half, d);
-      auto q = FAdd(FMul(c, arg_lhs), FMul(half_d, ln_aa_p_bb));
-      TF_ASSIGN_OR_RETURN(auto cos_q, EmitCos(component_type, q));
-      TF_ASSIGN_OR_RETURN(auto sin_q, EmitSin(component_type, q));
-      return EmitComposeComplex(op, FMul(coeff, cos_q), FMul(coeff, sin_q));
+      return EmitComplexPower(op, a, b, c, d);
     }
     default:
       return Unimplemented("binary complex op '%s'",
@@ -846,6 +881,9 @@ llvm::Value* ElementalIrEmitter::EmitFloatMin(llvm::Value* lhs_value,
   return llvm_ir::EmitFloatMin(lhs_value, rhs_value, b_);
 }
 
+// TODO(b/123355973): We have an implementation of erfinv in math.cc.  We
+// shouldn't have two implementations, especially since this one isn't testable
+// (it's only observable via a normally-distributed RNG).
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitErfInv(PrimitiveType prim_type,
                                                       llvm::Value* x) {
   if (prim_type != F16 && prim_type != F32 && prim_type != F64) {
@@ -1038,6 +1076,18 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitLog1p(PrimitiveType prim_type,
   return Select(x_is_small, for_small_x, for_large_x);
 }
 
+StatusOr<llvm::Value*> ElementalIrEmitter::EmitSqrt(PrimitiveType prim_type,
+                                                    llvm::Value* value) {
+  return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::sqrt, {value},
+                                      {value->getType()}, b_);
+}
+
+StatusOr<llvm::Value*> ElementalIrEmitter::EmitRsqrt(PrimitiveType prim_type,
+                                                     llvm::Value* value) {
+  TF_ASSIGN_OR_RETURN(auto sqrt, EmitSqrt(prim_type, value));
+  return FDiv(llvm::ConstantFP::get(sqrt->getType(), 1.0), sqrt);
+}
+
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitSin(PrimitiveType prim_type,
                                                    llvm::Value* value) {
   return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::sin, {value},
@@ -1327,9 +1377,9 @@ llvm_ir::IrArray::Index ElementalIrEmitter::ElementwiseSourceIndex(
 
   // If implicit broadcast is needed, the source dimensions that are broadcast
   // have index 0.
-  CHECK_EQ(ShapeUtil::Rank(operand_shape), ShapeUtil::Rank(hlo.shape()));
+  CHECK_EQ(operand_shape.rank(), hlo.shape().rank());
   llvm_ir::IrArray::Index source_index(target_index.GetType());
-  for (int64 i = 0; i < ShapeUtil::Rank(hlo.shape()); ++i) {
+  for (int64 i = 0; i < hlo.shape().rank(); ++i) {
     if (hlo.shape().dimensions(i) == operand_shape.dimensions(i)) {
       source_index.push_back(target_index[i]);
     } else {
@@ -1353,26 +1403,69 @@ StatusOr<llvm::Value*> ElementalIrEmitter::ConvertValueForDistribution(
       llvm_ir::PrimitiveTypeToIrType(elem_prim_ty, module_);
   llvm::Type* raw_value_ty = raw_value->getType();
 
-  // Convert raw integer to float in range [0, 1) if the element is a float.
+  // If we're generating a floating-point value, convert the raw integer R (i.e.
+  // `raw_value`) to a float in the range [0, 1).
+  //
+  // The basic approach is to choose a significand and exponent such that the
+  // significand is uniformly distributed and the exponent is distributed, well,
+  // exponentially (it's more likely to be close to 0 than far from 0).
+  //
+  // An easy way to do this is to say that the significand is the first S bits
+  // of R, and the exponent is determined by the number of trailing zeroes in R,
+  // exp = 2^-(cttz(R) + 1).  (+1 because the largest exponent should be -1;
+  // this way the largest value we can return is 1.999... * 2^-1 = 1-ε.)
+  //
+  // This results in a small bias.  Namely, if R has enough trailing zeroes, the
+  // significand and exponent will "overlap".  As a concrete example, consider
+  //
+  //         20 X's                 12 zeroes
+  //   R = 0bXXXXXXXXXXXXXXXXXXXX000000000000
+  //
+  // Here the exponent is 2^-13 because R has 12 trailing zeroes.  The
+  // significand is made up of the first 23 most-significant bits of R, which we
+  // observe contain 3 zeroes.  This is biased because any random value with
+  // exponent 2^-12 will have a significand which ends in `000`.
+  //
+  // For f32s, this problem occurs only when there are more than 32-23 = 9
+  // trailing zeros, which happens with probability 0.5^10 = ~0.1%. Moreover the
+  // probability of a large bias (i.e. many trailing 0s in the significand) is
+  // exponentially low.  So we deem this acceptable.
   llvm::Value* elem_value = raw_value;
   if (elem_ir_ty->isFloatingPointTy()) {
-    unsigned raw_value_size_in_bits = raw_value_ty->getPrimitiveSizeInBits();
-    CHECK(raw_value_size_in_bits == 32 || raw_value_size_in_bits == 64);
-    // Perform the division using the float type with the same number of bits
-    // as the raw value to avoid overflow.
-    if (raw_value_size_in_bits == 32) {
-      elem_value = UIToFP(elem_value, b_->getFloatTy());
-      elem_value = FDiv(elem_value,
-                        llvm::ConstantFP::get(b_->getFloatTy(), std::exp2(32)));
-    } else {
-      elem_value = UIToFP(elem_value, b_->getDoubleTy());
-      elem_value = FDiv(
-          elem_value, llvm::ConstantFP::get(b_->getDoubleTy(), std::exp2(64)));
-    }
-
-    if (elem_ir_ty != elem_value->getType()) {
-      elem_value = FPTrunc(elem_value, elem_ir_ty);
-    }
+    const auto& dest_flt_semantics = elem_ir_ty->getFltSemantics();
+    const int bits = raw_value_ty->getPrimitiveSizeInBits();
+    CHECK_GE(bits, llvm::APFloat::semanticsSizeInBits(dest_flt_semantics));
+
+    // Subtract 1 because semanticsPrecision includes the "hidden bit", i.e. the
+    // implicit "1." at the beginning of the significand.
+    const int significand_bits =
+        llvm::APFloat::semanticsPrecision(dest_flt_semantics) - 1;
+
+    llvm::Value* cttz = llvm_ir::EmitCallToIntrinsic(
+        llvm::Intrinsic::cttz, {raw_value, /*is_zero_undef=*/b_->getFalse()},
+        {raw_value->getType()}, b_);
+    llvm::Value* significand = LShr(raw_value, bits - significand_bits);
+
+    // Exponent bias is -127 for f32, meaning that if the exponent is E and the
+    // significand is S, then the value of the number is 2^(E - 127) * (1.S).
+    //
+    // We want cttz == 0 to correspond to 2^-1, so our exponent is computed as
+    // E = 126 - cttz.
+    //
+    // For f64, this is all the same, except the bias is -1023.
+    //
+    // In IEEE floating point, the absolute value of the exponent bias equals
+    // the value of the largest possible exponent.
+    const int bias = -llvm::APFloat::semanticsMaxExponent(dest_flt_semantics);
+    llvm::Value* exponent =
+        Sub(llvm::ConstantInt::get(cttz->getType(), -bias - 1), cttz);
+
+    // Now just slot everything into place!  The `Trunc` is here because
+    // raw_value may be larger than our float destination.
+    elem_value =
+        BitCast(Trunc(Or(Shl(exponent, significand_bits), significand),
+                      b_->getIntNTy(elem_ir_ty->getPrimitiveSizeInBits())),
+                elem_ir_ty);
   }
 
   // Convert the value for the requested distribution.
@@ -1750,7 +1843,7 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDynamicSlice(
     const llvm_ir::IrArray::Index& index) {
   // Emit IR to read dynamic start indices from hlo->operand(1).
   const HloInstruction* input_hlo = hlo->operand(0);
-  const int64 rank = ShapeUtil::Rank(input_hlo->shape());
+  const int64 rank = input_hlo->shape().rank();
   // Use the same index type for all tensor accesses in the same kernel.
   llvm::Type* index_type = index.GetType();
   llvm_ir::IrArray::Index slice_start_index(index_type, rank);
@@ -1758,9 +1851,10 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDynamicSlice(
     auto index_typed_const = [&](uint64 c) -> llvm::Constant* {
       return llvm::ConstantInt::get(index_type, c);
     };
-    llvm_ir::IrArray::Index dim_index(1, index_typed_const(i));
-    TF_ASSIGN_OR_RETURN(llvm::Value * start_index_value,
-                        operand_to_generator.at(hlo->operand(1))(dim_index));
+    llvm_ir::IrArray::Index zero_index(index_type);
+    TF_ASSIGN_OR_RETURN(
+        llvm::Value * start_index_value,
+        operand_to_generator.at(hlo->operand(1 + i))(zero_index));
 
     // Clamp the start index so that the sliced portion fits in the operand:
     // start_index = clamp(start_index, 0, operand_dim_size - output_dim_size)
@@ -1893,7 +1987,7 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDynamicUpdateSlice(
   const HloInstruction* update_hlo = hlo->operand(1);
   const HloInstruction* start_hlo = hlo->operand(2);
   // Calculate slice start/end indices.
-  const int64 rank = ShapeUtil::Rank(input_hlo->shape());
+  const int64 rank = input_hlo->shape().rank();
   llvm_ir::IrArray::Index slice_start_index(index.GetType(), rank);
   llvm_ir::IrArray::Index slice_limit_index(index.GetType(), rank);
   // Slice intersection gathers (ANDs) conditions on all ranks for which
@@ -1905,9 +1999,11 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDynamicUpdateSlice(
     auto index_typed_const = [&](uint64 c) -> llvm::Constant* {
       return llvm::ConstantInt::get(index_type, c);
     };
-    llvm_ir::IrArray::Index dim_index(1, index_typed_const(i));
-    TF_ASSIGN_OR_RETURN(llvm::Value * start_index_value,
-                        operand_to_generator.at(start_hlo)(dim_index));
+
+    llvm_ir::IrArray::Index zero_index(index_type);
+    TF_ASSIGN_OR_RETURN(
+        llvm::Value * start_index_value,
+        operand_to_generator.at(hlo->operand(2 + i))(zero_index));
 
     // Clamp the start index so that the update region fits in the operand.
     // start_index = clamp(start_index, 0, input_dim_size - update_dim_size)
@@ -2128,8 +2224,10 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
     case HloOpcode::kNegate:
     case HloOpcode::kNot:
     case HloOpcode::kReal:
+    case HloOpcode::kRsqrt:
     case HloOpcode::kSign:
     case HloOpcode::kSin:
+    case HloOpcode::kSqrt:
     case HloOpcode::kTanh:
       return [this, hlo, &operand_to_generator](
                  const IrArray::Index& index) -> StatusOr<llvm::Value*> {
@@ -2225,7 +2323,7 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
         auto* iota = Cast<HloIotaInstruction>(hlo);
         PrimitiveType element_type = iota->shape().element_type();
         IrArray::Index elem_index =
-            ShapeUtil::Rank(iota->shape()) > 1
+            iota->shape().rank() > 1
                 ? target_index.SourceIndexOfBroadcast(
                       iota->shape(),
                       ShapeUtil::MakeShapeWithDescendingLayout(
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/elemental_ir_emitter.h
index d3e2acaabd4f602171def70ccd3d4fd5adce0d0d..819465f1e5d633a0652b09005a3d9a08874759bd 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.h
@@ -119,6 +119,12 @@ class ElementalIrEmitter : public IrBuilderMixin<ElementalIrEmitter> {
   virtual StatusOr<llvm::Value*> EmitLog(PrimitiveType prim_type,
                                          llvm::Value* value);
 
+  virtual StatusOr<llvm::Value*> EmitSqrt(PrimitiveType prim_type,
+                                          llvm::Value* value);
+
+  virtual StatusOr<llvm::Value*> EmitRsqrt(PrimitiveType prim_type,
+                                           llvm::Value* value);
+
   virtual StatusOr<llvm::Value*> EmitLog1p(PrimitiveType prim_type,
                                            llvm::Value* value);
 
@@ -211,13 +217,21 @@ class ElementalIrEmitter : public IrBuilderMixin<ElementalIrEmitter> {
   const HloModuleConfig& hlo_module_config_;
 
  private:
+  // Computes the complex power function, returns (a + i*b)^(c + i*d).
+  StatusOr<llvm::Value*> EmitComplexPower(const HloInstruction* op,
+                                          llvm::Value* a, llvm::Value* b,
+                                          llvm::Value* c, llvm::Value* d);
+
   // Returns a ElementGenerator for an RNG HloInstruction using the Philox
   // random number generation algorithm.
   llvm_ir::ElementGenerator MakePhiloxRngElementGenerator(
       const HloInstruction* hlo,
       const HloToElementGeneratorMap& operand_to_generator);
+
   // Converts the raw value generated by a random number generation algorithm
   // to the distribution requested by the RNG HloInstruction.
+  //
+  // Precondition: raw_value has at least as many bits as hlo's element type.
   StatusOr<llvm::Value*> ConvertValueForDistribution(
       const HloInstruction* hlo,
       const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator,
diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc
index 10b8c01ff1383658fcfb2271c177ba54347f985a..1518d83083b3b0ce876da9344c483a23cd5b073c 100644
--- a/tensorflow/compiler/xla/service/executable.cc
+++ b/tensorflow/compiler/xla/service/executable.cc
@@ -26,7 +26,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/proto_serialization.h"
 #include "tensorflow/core/platform/env.h"
 
-
 namespace xla {
 
 StatusOr<std::vector<ScopedShapedBuffer>> Executable::ExecuteOnStreams(
@@ -173,11 +172,13 @@ Status Executable::DumpHloSnapshot() {
   }
   filename = SanitizeFileName(std::move(filename));
   string file_path = tensorflow::io::JoinPath(directory_path, filename);
-  string result;
-  TF_RET_CHECK(
-      tensorflow::SerializeToStringDeterministic(hlo_session, &result));
-  return tensorflow::WriteStringToFile(tensorflow::Env::Default(), file_path,
-                                       result);
+  const size_t size = hlo_session.ByteSizeLong();
+  auto serialized = absl::make_unique<char[]>(size);
+  TF_RET_CHECK(tensorflow::SerializeToBufferDeterministic(
+      hlo_session, serialized.get(), size));
+  return tensorflow::WriteStringToFile(
+      tensorflow::Env::Default(), file_path,
+      absl::string_view(serialized.get(), size));
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gather_expander.cc b/tensorflow/compiler/xla/service/gather_expander.cc
index 01cef499665c050d4453382289168276028e1d26..a58ac39dffad56315308f784b08e6b6087b8e30a 100644
--- a/tensorflow/compiler/xla/service/gather_expander.cc
+++ b/tensorflow/compiler/xla/service/gather_expander.cc
@@ -153,10 +153,9 @@ static StatusOr<std::vector<HloInstruction*>> GatherLoopBody(
            dim_numbers.index_vector_dim() ==
                gather.operand(1)->shape().dimensions_size());
 
-  TF_ASSIGN_OR_RETURN(
-      HloInstruction * induction_var_as_vector,
+  HloInstruction* induction_var_as_vector =
       MakeBroadcastHlo(induction_var, /*broadcast_dimensions=*/{},
-                       /*result_shape_bounds=*/{1}));
+                       /*result_shape_bounds=*/{1});
 
   HloInstruction* index_vector;
 
@@ -222,7 +221,7 @@ static StatusOr<std::vector<HloInstruction*>> GatherLoopBody(
       {operand, start_indices, updated_accumulator}};
 }
 
-static StatusOr<HloInstruction*> CreateGatherLoopAccumulatorInitValue(
+static HloInstruction* CreateGatherLoopAccumulatorInitValue(
     HloComputation* computation, PrimitiveType element_type,
     absl::Span<const int64> slice_sizes, int64 gather_loop_trip_count,
     const GatherDimensionNumbers& dim_numbers) {
@@ -297,7 +296,7 @@ static StatusOr<HloInstruction*> PermuteBatchAndOffsetDims(
 // [3,1] out of operand into an accumulator of shape [4,3,1].  We then
 // reshape this result to [2,2,3] and finally transpose it to [2,3,2].
 
-StatusOr<HloInstruction*> GatherExpander::ExpandGather(
+StatusOr<HloInstruction*> GatherExpander::ExpandInstruction(
     HloInstruction* gather_instr) {
   CHECK(!ShapeUtil::IsZeroElementArray(gather_instr->shape()));
 
@@ -332,12 +331,10 @@ StatusOr<HloInstruction*> GatherExpander::ExpandGather(
   CHECK_EQ(gather_loop_trip_count,
            canonical_start_indices->shape().dimensions(0));
 
-  TF_ASSIGN_OR_RETURN(
-      HloInstruction * accumulator_init,
-      CreateGatherLoopAccumulatorInitValue(
-          computation, output_shape.element_type(),
-          gather_instr->gather_slice_sizes(), gather_loop_trip_count,
-          gather_instr->gather_dimension_numbers()));
+  HloInstruction* accumulator_init = CreateGatherLoopAccumulatorInitValue(
+      computation, output_shape.element_type(),
+      gather_instr->gather_slice_sizes(), gather_loop_trip_count,
+      gather_instr->gather_dimension_numbers());
 
   StatusOr<std::vector<HloInstruction*>> gather_loop_result_or_error =
       WhileUtil::MakeCountedLoop(
@@ -364,25 +361,11 @@ StatusOr<HloInstruction*> GatherExpander::ExpandGather(
                                    output_rank);
 }
 
-StatusOr<bool> GatherExpander::Run(HloModule* module) {
-  auto is_nontrivial_gather = [](HloInstruction* inst) {
-    return inst->opcode() == HloOpcode::kGather &&
-           // Avoid expanding gather ops that produce zero sized tensors,
-           // instead punt these to ZeroSizedHloElimination.
-           !ShapeUtil::IsZeroElementArray(inst->shape());
-  };
-
-  std::vector<HloInstruction*> gather_instrs;
-  for (HloComputation* computation : module->MakeNonfusionComputations()) {
-    absl::c_copy_if(computation->instructions(),
-                    std::back_inserter(gather_instrs), is_nontrivial_gather);
-  }
-
-  for (HloInstruction* inst : gather_instrs) {
-    TF_ASSIGN_OR_RETURN(HloInstruction * expanded_root, ExpandGather(inst));
-    TF_RETURN_IF_ERROR(inst->parent()->ReplaceInstruction(inst, expanded_root));
-  }
-
-  return !gather_instrs.empty();
+bool GatherExpander::InstructionMatchesPattern(HloInstruction* inst) {
+  return inst->opcode() == HloOpcode::kGather &&
+         // Avoid expanding gather ops that produce zero sized tensors,
+         // instead punt these to ZeroSizedHloElimination.
+         !ShapeUtil::IsZeroElementArray(inst->shape());
 }
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gather_expander.h b/tensorflow/compiler/xla/service/gather_expander.h
index 8af9c6b71fbc391bf7c0e9809e979b65135a6df3..5625a37cb46ca5b70f69d86bc424f6512bfb293f 100644
--- a/tensorflow/compiler/xla/service/gather_expander.h
+++ b/tensorflow/compiler/xla/service/gather_expander.h
@@ -16,20 +16,22 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GATHER_EXPANDER_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_GATHER_EXPANDER_H_
 
-#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/compiler/xla/service/op_expander_pass.h"
 
 namespace xla {
 
 // This pass rewrites gather operations into (roughly) while loops of dynamic
 // slices.  This lets backends that don't support gather directly to
 // nevertheless have a minimum level of support.
-class GatherExpander : public HloModulePass {
+class GatherExpander : public OpExpanderPass {
  public:
   absl::string_view name() const override { return "gather_expander"; }
-  StatusOr<bool> Run(HloModule* module) override;
 
  protected:
-  StatusOr<HloInstruction*> ExpandGather(HloInstruction* gather_instr);
+  bool InstructionMatchesPattern(HloInstruction* instruction) override;
+
+  StatusOr<HloInstruction*> ExpandInstruction(
+      HloInstruction* gather_inst) override;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gather_expander_test.cc b/tensorflow/compiler/xla/service/gather_expander_test.cc
index a3102368cb1dba15da7422337666d278cef775ab..e1ea5c39d58b6d23b076740626ca0ad63dc341ee 100644
--- a/tensorflow/compiler/xla/service/gather_expander_test.cc
+++ b/tensorflow/compiler/xla/service/gather_expander_test.cc
@@ -89,7 +89,7 @@ ENTRY main {
   // an implementation detail from WhileUtil::MakeCountedLoop).
 
   const Shape& while_shape = while_instr->shape();
-  ASSERT_TRUE(ShapeUtil::IsTuple(while_shape));
+  ASSERT_TRUE(while_shape.IsTuple());
   ASSERT_EQ(ShapeUtil::TupleElementCount(while_shape), 4);
 
   EXPECT_TRUE(ShapeUtil::SameDimensions(
diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.cc b/tensorflow/compiler/xla/service/generic_transfer_manager.cc
index bec02e14f951c6d905b7329be5c02896984279d0..cb43c27be961262bf29d4a3958de62cfada19aed 100644
--- a/tensorflow/compiler/xla/service/generic_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/generic_transfer_manager.cc
@@ -26,7 +26,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
@@ -83,7 +82,7 @@ Status GenericTransferManager::TransferLiteralFromDeviceInternal(
   TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
       device_buffer.on_host_shape(),
       [&](const Shape& subshape, const ShapeIndex& index) -> Status {
-        if (ShapeUtil::IsArray(subshape)) {
+        if (subshape.IsArray()) {
           TF_RETURN_IF_ERROR(executor->SynchronousMemcpyD2H(
               /*source=*/device_buffer.buffer(index),
               /*size=*/GetByteSizeRequirement(subshape),
@@ -120,7 +119,7 @@ Status GenericTransferManager::TransferLiteralToDeviceAsync(
       device_buffer.on_host_shape(),
       [&](const Shape& device_subshape, const ShapeIndex& index) -> Status {
         se::DeviceMemoryBase device_memory = device_buffer.buffer(index);
-        if (ShapeUtil::IsArray(device_subshape)) {
+        if (device_subshape.IsArray()) {
           TF_RET_CHECK(GetByteSizeRequirement(device_subshape) ==
                        device_memory.size());
           // Element is array-shaped: transfer array data to device buffer.
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index bfd1b6cb1492f5cb709e2ecefe73782094e26f5e..25c4f70d89b4ebc483a61f1e28c7a55eb31f4bdf 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -3,6 +3,11 @@
 
 load("//tensorflow/compiler/xla/tests:build_defs.bzl", "xla_test")
 load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library")
+load(
+    "//tensorflow/core:platform/default/build_config_root.bzl",
+    "tf_cuda_tests_tags",
+)
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 
 licenses(["notice"])  # Apache 2.0
 
@@ -24,12 +29,6 @@ filegroup(
     ]),
 )
 
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
-load(
-    "//tensorflow/core:platform/default/build_config_root.bzl",
-    "tf_cuda_tests_tags",
-)
-
 xla_proto_library(
     name = "backend_configs",
     srcs = ["backend_configs.proto"],
@@ -94,8 +93,8 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_reachability",
-        "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
     ],
 )
@@ -135,6 +134,8 @@ cc_library(
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
         "//tensorflow/compiler/xla/service/llvm_ir:tuple_ops",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
         "@llvm//:core",
@@ -263,7 +264,9 @@ cc_library(
         "//tensorflow/compiler/xla/service:buffer_assignment",
         "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
         "//tensorflow/core:stream_executor_no_cuda",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/types:span",
     ],
@@ -302,6 +305,7 @@ cc_library(
         "sequential_thunk.cc",
         "thunk.cc",
         "thunk_schedule.cc",
+        "triangular_solve_thunk.cc",
         "tuple_thunk.cc",
         "while_thunk.cc",
     ],
@@ -321,6 +325,7 @@ cc_library(
         "sequential_thunk.h",
         "thunk.h",
         "thunk_schedule.h",
+        "triangular_solve_thunk.h",
         "tuple_thunk.h",
         "while_thunk.h",
     ],
@@ -361,7 +366,10 @@ cc_library(
         "//tensorflow/core/platform/default/build_config:cufft_plugin",
         "//tensorflow/core/platform/default/build_config:stream_executor_cuda",  # build_cleaner: keep
         "//tensorflow/stream_executor",
+        "//tensorflow/stream_executor:blas",
+        "//tensorflow/stream_executor:device_memory",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -392,18 +400,21 @@ cc_library(
     srcs = ["cudnn_conv_algorithm_picker.cc"],
     hdrs = ["cudnn_conv_algorithm_picker.h"],
     deps = [
+        ":autotuning_proto",
         ":backend_configs",
         ":buffer_comparator",
         ":cudnn_conv_runner",
         ":gpu_executable",
         ":ir_emission_utils",
         "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:protobuf_util",
         "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_casting_utils",
         "//tensorflow/compiler/xla/service:hlo_pass",
         "//tensorflow/core:lib",
+        "//tensorflow/core:logger",
         "//tensorflow/core:stream_executor_no_cuda",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -551,6 +562,44 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "gpu_sanitize_constant_names",
+    srcs = ["gpu_sanitize_constant_names.cc"],
+    hdrs = ["gpu_sanitize_constant_names.h"],
+    deps = [
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_pass",
+        "//tensorflow/compiler/xla/service/llvm_ir:buffer_assignment_util",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "gpu_sanitize_constant_names_test",
+    srcs = ["gpu_sanitize_constant_names_test.cc"],
+    tags = tf_cuda_tests_tags(),
+    deps = [
+        ":gpu_sanitize_constant_names",
+        ":ir_emission_utils",
+        "//tensorflow/compiler/xla:shape_layout",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:computation_layout",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_matchers",
+        "//tensorflow/compiler/xla/service:hlo_module_config",
+        "//tensorflow/compiler/xla/service:hlo_parser",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:test_utils",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
 cc_library(
     name = "fusion_merger",
     srcs = ["fusion_merger.cc"],
@@ -675,6 +724,7 @@ cc_library(
         ":gpu_hlo_schedule",
         ":gpu_hlo_support_checker",
         ":gpu_layout_assignment",
+        ":gpu_sanitize_constant_names",
         ":instruction_fusion",
         ":ir_emission_utils",
         ":ir_emitter",
@@ -694,6 +744,9 @@ cc_library(
         "//tensorflow/compiler/xla/service:buffer_liveness",
         "//tensorflow/compiler/xla/service:call_inliner",
         "//tensorflow/compiler/xla/service:conditional_simplifier",
+        "//tensorflow/compiler/xla/service:convolution_group_converter",
+        "//tensorflow/compiler/xla/service:dot_decomposer",
+        "//tensorflow/compiler/xla/service:dynamic_index_splitter",
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service:flatten_call_graph",
         "//tensorflow/compiler/xla/service:hlo",
@@ -711,6 +764,8 @@ cc_library(
         "//tensorflow/compiler/xla/service:llvm_compiler",
         "//tensorflow/compiler/xla/service:reduce_precision_insertion",
         "//tensorflow/compiler/xla/service:reshape_mover",
+        "//tensorflow/compiler/xla/service:sort_simplifier",
+        "//tensorflow/compiler/xla/service:stable_sort_expander",
         "//tensorflow/compiler/xla/service:transpose_folding",
         "//tensorflow/compiler/xla/service:tuple_simplifier",
         "//tensorflow/compiler/xla/service:while_loop_constant_sinking",
@@ -724,6 +779,7 @@ cc_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:regexp_internal",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/stream_executor/cuda:cuda_diagnostics",
         "@com_google_absl//absl/container:node_hash_map",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
@@ -1004,14 +1060,10 @@ cc_library(
     srcs = ["variadic_op_splitter.cc"],
     hdrs = ["variadic_op_splitter.h"],
     deps = [
-        ":ir_emission_utils",
-        "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla:window_util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:hlo",
-        "//tensorflow/compiler/xla/service:hlo_casting_utils",
         "//tensorflow/compiler/xla/service:hlo_pass",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/strings",
@@ -1037,3 +1089,12 @@ tf_cc_test(
         "//tensorflow/compiler/xla/tests:hlo_test_base",
     ],
 )
+
+xla_proto_library(
+    name = "autotuning_proto",
+    srcs = ["autotuning.proto"],
+    deps = [
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:hlo_proto",
+    ],
+)
diff --git a/tensorflow/compiler/xla/service/gpu/autotuning.proto b/tensorflow/compiler/xla/service/gpu/autotuning.proto
new file mode 100644
index 0000000000000000000000000000000000000000..b4a08963b4f2ebc55c89ed57325093536f343bd1
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/autotuning.proto
@@ -0,0 +1,81 @@
+// This file defines protos that store the results of autotuning XLA:GPU
+// operations.
+//
+// They are in proto format because we want to log them structured. They offer
+// tremendous statistical, testing, and debugging value.
+syntax = "proto3";
+
+package xla.gpu;
+
+import "google/protobuf/duration.proto";
+import "tensorflow/compiler/xla/xla_data.proto";
+import "tensorflow/compiler/xla/service/hlo.proto";
+
+message CudnnVersion {
+  int32 major = 1;
+  int32 minor = 2;
+  int32 patch = 3;
+}
+
+message ComputeCapability {
+  int32 major = 1;
+  int32 minor = 2;
+}
+
+message AutotuneResult {
+  message SuccessResult {
+    int64 scratch_bytes = 1;
+    google.protobuf.Duration run_time = 2;
+  }
+
+  message ConvKey {
+    int64 algorithm = 1;
+    bool tensor_ops_enabled = 2;
+  }
+
+  // If the conv runs successfully, success will be populated with the
+  // autotuning result. Otherwise, the error message is propagated.
+  oneof result {
+    SuccessResult success = 3;
+    string error_string = 4;
+  }
+
+  oneof key {
+    ConvKey conv = 5;
+  }
+
+  // Sometimes we run a correctness checker during autotuning. It compares the
+  // result buffer content between two algorithms, say, "reference" and "test"
+  // algorithms. The "test" algorithm is the one associated with this
+  // AutotuneResult.
+  //
+  // This field records the reference algorithm used. Notice that naming it
+  // "reference" doesn't mean it's always correct. However, empirically it's
+  // more correct, as it's "algo 0", less fancy than the compared one.
+  //
+  // Notice that the checker_failure may exist even in the success case.
+  // This is because the error string in `result` comes from the underlying
+  // implementation like cuDNN, which isn't aware that it produced an incorrect
+  // result. And even if the checker detects an incorrect result, we can still
+  // retrieve scratch_bytes and runtime_ms.
+  oneof checker_failure {
+    ConvKey reference_conv = 6;
+  }
+}
+
+message AutotuneLog {
+  message Instruction {
+    xla.HloInstructionProto instruction = 1;
+    repeated xla.ShapeProto operand_shapes = 2;
+  }
+
+  oneof instr_oneof {
+    Instruction instr = 1;
+  }
+
+  // Records all auto-tuning results per algorithm.
+  repeated AutotuneResult results = 3;
+
+  CudnnVersion cudnn_version = 4;
+  ComputeCapability compute_capability = 5;
+}
diff --git a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc
index 528209abc75777440163c2e1512658b8ad36315b..eb59ee5a1d47b6b706ef3f53a76069b3538eb6b7 100644
--- a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc
+++ b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
@@ -57,16 +58,16 @@ StatusOr<std::unique_ptr<BufferAllocations>> BufferAllocations::Builder::Build(
 
     // If buffer #i's address is already registered (e.g. external arguments or
     // result buffers), use that registered buffer.
-    if (registered_buffers_.count(i)) {
-      se::DeviceMemoryBase address = FindOrDie(registered_buffers_, i);
-      if (reinterpret_cast<uintptr_t>(address.opaque()) % expected_alignment !=
+    if (se::DeviceMemoryBase* address =
+            tensorflow::gtl::FindOrNull(registered_buffers_, i)) {
+      if (reinterpret_cast<uintptr_t>(address->opaque()) % expected_alignment !=
           0) {
         return InternalError(
             "Address of registered buffer %d must be a multiple of %x, but "
             "was %p",
-            i, kEntryParameterAlignBytes, address.opaque());
+            i, kEntryParameterAlignBytes, address->opaque());
       }
-      buffer_allocations->SetBuffer(i, FindOrDie(registered_buffers_, i));
+      buffer_allocations->SetBuffer(i, *address);
       continue;
     }
 
diff --git a/tensorflow/compiler/xla/service/gpu/buffer_allocations.h b/tensorflow/compiler/xla/service/gpu/buffer_allocations.h
index 14186b8faa68ad8492ea4863fcd7bd746e2eae48..9413ac2cff7c8d3ec4be6662569c580060bf1173 100644
--- a/tensorflow/compiler/xla/service/gpu/buffer_allocations.h
+++ b/tensorflow/compiler/xla/service/gpu/buffer_allocations.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <set>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
@@ -52,7 +53,8 @@ class BufferAllocations {
         DeviceMemoryAllocator* memory_allocator);
 
    private:
-    std::map<BufferAllocation::Index, se::DeviceMemoryBase> registered_buffers_;
+    absl::flat_hash_map<BufferAllocation::Index, se::DeviceMemoryBase>
+        registered_buffers_;
   };
 
   ~BufferAllocations();
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.cc
index 60289506524759580dbb9b82147c78c4ce1cb25e..2cceb0422d08ff7951308b0727941f5437785447 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.cc
@@ -188,13 +188,8 @@ Status Visitor::HandleBatchNormGrad(HloInstruction* batch_norm) {
           computation_->AddInstruction(HloInstruction::CreateBroadcast(
               batch_norm->operand(3)->shape(), epsilon, {}))));
   HloInstruction* inverse_stddev =
-      computation_->AddInstruction(HloInstruction::CreateBinary(
-          var_plus_epsilon->shape(), HloOpcode::kPower, var_plus_epsilon,
-          computation_->AddInstruction(HloInstruction::CreateBroadcast(
-              var_plus_epsilon->shape(),
-              computation_->AddInstruction(HloInstruction::CreateConstant(
-                  LiteralUtil::CreateR0<float>(-.5))),
-              {}))));
+      computation_->AddInstruction(HloInstruction::CreateUnary(
+          var_plus_epsilon->shape(), HloOpcode::kRsqrt, var_plus_epsilon));
 
   std::vector<HloInstruction*> operands(batch_norm->operands().begin(),
                                         batch_norm->operands().end());
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc
index 6d6780fa1c7b0c636eb771c40e74f074cd8c4c4b..603af5a654589e0b02c762b57d70a8b7628b1d0f 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc
@@ -16,14 +16,17 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
+#include "absl/time/time.h"
 #include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/protobuf_util.h"
 #include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h"
 #include "tensorflow/compiler/xla/service/gpu/buffer_comparator.h"
 #include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/platform/logger.h"
 #include "tensorflow/core/platform/mutex.h"
 
 namespace xla {
@@ -32,7 +35,6 @@ namespace {
 
 using absl::optional;
 using se::DeviceMemoryBase;
-using se::dnn::AlgorithmConfig;
 using se::dnn::AlgorithmDesc;
 
 class ScratchAllocator : public se::ScratchAllocator {
@@ -132,6 +134,31 @@ tensorflow::mutex_lock LockGpu(const se::StreamExecutor* stream_exec) {
   return tensorflow::mutex_lock{it->second};
 }
 
+xla::gpu::CudnnVersion GetCudnnVersion(se::StreamExecutor* stream_executor) {
+  xla::gpu::CudnnVersion cudnn_version;
+  if (auto* dnn = stream_executor->AsDnn()) {
+    StatusOr<se::dnn::VersionInfo> version_or = dnn->GetVersion();
+    if (version_or.ok()) {
+      const auto& version = version_or.ValueOrDie();
+      cudnn_version.set_major(version.major_version());
+      cudnn_version.set_minor(version.minor_version());
+      cudnn_version.set_patch(version.patch());
+    }
+  }
+  return cudnn_version;
+}
+
+xla::gpu::ComputeCapability GetComputeCapability(
+    se::StreamExecutor* stream_executor) {
+  xla::gpu::ComputeCapability cc;
+  int cc_major, cc_minor;
+  stream_executor->GetDeviceDescription().cuda_compute_capability(&cc_major,
+                                                                  &cc_minor);
+  cc.set_major(cc_major);
+  cc.set_minor(cc_minor);
+  return cc;
+}
+
 }  // anonymous namespace
 
 // We could have caching here so that we don't redo this work for two identical
@@ -145,8 +172,8 @@ tensorflow::mutex_lock LockGpu(const se::StreamExecutor* stream_exec) {
 // cache misses and doing extra work.  Overall, caching doesn't seem worth the
 // trouble, but we may want to revisit this if we ever find a model where
 // caching would speed up compilation a lot.
-StatusOr<CudnnConvAlgorithmPicker::AutotuneResult>
-CudnnConvAlgorithmPicker::PickBestAlgorithm(HloCustomCallInstruction* instr) {
+StatusOr<AutotuneResult> CudnnConvAlgorithmPicker::PickBestAlgorithm(
+    const HloCustomCallInstruction* instr) {
   // TODO(timshen): for now only check fp16. It can be expanded to other types,
   // with some work on the HLO routines.
   const bool cross_check_enabled =
@@ -232,8 +259,6 @@ CudnnConvAlgorithmPicker::PickBestAlgorithm(HloCustomCallInstruction* instr) {
           &stream, ShapeUtil::ByteSizeOf(instr->shape().tuple_shapes(0))));
   initialize_buffer(result_buffer);
 
-  se::dnn::ProfileResult best_result;
-  int64 best_result_bytes_used = 0;
   TF_ASSIGN_OR_RETURN(auto backend_config,
                       instr->backend_config<CudnnConvBackendConfig>());
 
@@ -243,82 +268,119 @@ CudnnConvAlgorithmPicker::PickBestAlgorithm(HloCustomCallInstruction* instr) {
   // this algorithm considered correct, though.
   optional<AlgorithmDesc> first_algorithm;
   TF_ASSIGN_OR_RETURN(CudnnConvKind kind, GetCudnnConvKind(instr));
+  std::vector<AutotuneResult> profile_results;
   for (const AlgorithmDesc& alg : GetAlgorithms(kind, stream_exec_)) {
     ScratchAllocator scratch_allocator(device_ordinal, allocator);
     se::dnn::ProfileResult profile_result;
     VLOG(3) << "Trying algorithm " << AlgorithmToString(alg) << " for "
             << instr->ToString();
 
-    backend_config.set_algorithm(alg.algo_id());
-    backend_config.set_tensor_ops_enabled(alg.tensor_ops_enabled());
-    TF_RETURN_IF_ERROR(instr->set_backend_config(backend_config));
-    bool launch_ok =
+    // Use assignment instead of brace-list to make GCC 4.9 happy.
+    RunConvOptions options;
+    options.profile_result = &profile_result;
+    options.algo_override = alg;
+    Status launch_status =
         RunCudnnConv(instr, absl::MakeSpan(operand_buffers), result_buffer,
-                     &scratch_allocator, &stream, &profile_result)
-            .ok();
-
-    if (launch_ok && profile_result.is_valid()) {
-      const bool crash_on_checking_failure =
-          instr->GetModule()
-              ->config()
-              .debug_options()
-              .xla_gpu_crash_on_verification_failures();
-      if (comparator.has_value()) {
-        StatusOr<bool> result = comparator->CompareEqual(
-            se::DeviceMemory<Eigen::half>(result_buffer));
-        if (!result.ok()) {
-          LOG(ERROR) << "Unable to compare "
-                     << AlgorithmToString(*first_algorithm) << " against "
-                     << AlgorithmToString(alg) << " for " << instr->ToString()
-                     << ": " << result.status();
-          CHECK(!crash_on_checking_failure);
-        } else if (!result.ValueOrDie()) {
-          LOG(ERROR) << "Results mismatch between different convolution "
-                        "algorithms. This is likely a bug in convolution, or "
-                        "an excessive loss of precision in convolution. "
-                     << instr->ToString() << " for "
-                     << AlgorithmToString(*first_algorithm) << " vs "
-                     << AlgorithmToString(alg);
-          CHECK(!crash_on_checking_failure);
-        }
-      } else if (cross_check_enabled) {
-        auto comp = F16BufferComparator::Create(
-            se::DeviceMemory<Eigen::half>(result_buffer), compiler_, allocator,
-            &stream);
-        if (comp.ok()) {
-          comparator.emplace(comp.ConsumeValueOrDie());
-          first_algorithm.emplace(alg);
-        } else {
-          LOG(ERROR) << "Fail to initialize buffer comparator: "
-                     << comp.status() << ", instruction: " << instr->ToString();
-          CHECK(!crash_on_checking_failure);
-        }
+                     &scratch_allocator, &stream, options);
+
+    profile_results.emplace_back();
+    AutotuneResult& result = profile_results.back();
+    result.mutable_conv()->set_algorithm(alg.algo_id());
+    result.mutable_conv()->set_tensor_ops_enabled(alg.tensor_ops_enabled());
+
+    if (!launch_status.ok()) {
+      result.set_error_string(launch_status.error_message());
+      continue;
+    }
+
+    if (!profile_result.is_valid()) {
+      result.set_error_string("Invalid profile result");
+      continue;
+    }
+
+    int64 scratch_bytes_used = scratch_allocator.TotalAllocatedBytes();
+    result.mutable_success()->set_scratch_bytes(scratch_bytes_used);
+    *result.mutable_success()->mutable_run_time() =
+        protobuf_util::ToDurationProto(
+            absl::Milliseconds(profile_result.elapsed_time_in_ms()));
+
+    const bool crash_on_checking_failure =
+        instr->GetModule()
+            ->config()
+            .debug_options()
+            .xla_gpu_crash_on_verification_failures();
+
+    if (comparator.has_value()) {
+      StatusOr<bool> compare_result = comparator->CompareEqual(
+          se::DeviceMemory<Eigen::half>(result_buffer));
+      if (!compare_result.ok()) {
+        LOG(ERROR) << "Unable to compare "
+                   << AlgorithmToString(*first_algorithm) << " against "
+                   << AlgorithmToString(alg) << " for " << instr->ToString()
+                   << ": " << compare_result.status();
+        CHECK(!crash_on_checking_failure);
+      } else if (!compare_result.ValueOrDie()) {
+        LOG(ERROR) << "Results mismatch between different convolution "
+                      "algorithms. This is likely a bug in convolution, or "
+                      "an excessive loss of precision in convolution. "
+                   << instr->ToString() << " for "
+                   << AlgorithmToString(*first_algorithm) << " vs "
+                   << AlgorithmToString(alg);
+        CHECK(!crash_on_checking_failure);
+        auto* failure = result.mutable_reference_conv();
+        failure->set_algorithm(first_algorithm->algo_id());
+        failure->set_tensor_ops_enabled(first_algorithm->tensor_ops_enabled());
       }
-      int64 scratch_bytes_used = scratch_allocator.TotalAllocatedBytes();
-      VLOG(3) << "Run of algorithm " << AlgorithmToString(alg)
-              << " succeeded, taking " << profile_result.elapsed_time_in_ms()
-              << "ms and using " << NumBytesToString(scratch_bytes_used)
-              << " of scratch (Best result: "
-              << best_result.elapsed_time_in_ms() << "ms, "
-              << NumBytesToString(best_result_bytes_used) << " of scratch)";
-      if (profile_result.elapsed_time_in_ms() <
-          best_result.elapsed_time_in_ms()) {
-        best_result = profile_result;
-        best_result_bytes_used = scratch_bytes_used;
+    } else if (cross_check_enabled) {
+      auto comp = F16BufferComparator::Create(
+          se::DeviceMemory<Eigen::half>(result_buffer), compiler_, allocator,
+          &stream);
+      if (comp.ok()) {
+        comparator.emplace(comp.ConsumeValueOrDie());
+        first_algorithm.emplace(alg);
+      } else {
+        LOG(ERROR) << "Fail to initialize buffer comparator: " << comp.status()
+                   << ", instruction: " << instr->ToString();
+        CHECK(!crash_on_checking_failure);
       }
-    } else {
-      VLOG(3) << "Run of algorithm " << AlgorithmToString(alg) << " failed.";
     }
   }
-  if (best_result.is_valid()) {
-    VLOG(2) << "Best algorithm for " << instr->ToString() << ": "
-            << AlgorithmToString(best_result.algorithm()) << ", takes "
-            << best_result.elapsed_time_in_ms() << "ms, and uses "
-            << best_result_bytes_used << "B of scratch memory.";
-    return AutotuneResult{best_result.algorithm().algo_id(),
-                          best_result.algorithm().tensor_ops_enabled(),
-                          best_result_bytes_used,
-                          absl::Milliseconds(best_result.elapsed_time_in_ms())};
+
+  // Log the autotuning result.
+  {
+    AutotuneLog log;
+    *log.mutable_instr()->mutable_instruction() = instr->ToProto();
+    for (const auto* op : instr->operands()) {
+      *log.mutable_instr()->add_operand_shapes() = op->shape().ToProto();
+    }
+    for (const auto& profile : profile_results) {
+      *log.add_results() = profile;
+    }
+    *log.mutable_compute_capability() = GetComputeCapability(stream_exec_);
+    *log.mutable_cudnn_version() = GetCudnnVersion(stream_exec_);
+    VLOG(2) << "Autotuning result:\n" << log.DebugString();
+    tensorflow::Logger::Singleton()->LogProto(log);
+  }
+
+  auto* profile_results_end = profile_results.data() + profile_results.size();
+
+  const AutotuneResult* best_result = std::min_element(
+      profile_results.data(), profile_results_end,
+      [](const AutotuneResult& lhs, const AutotuneResult& rhs) {
+        // The successful one should have a smaller key, since we are doing
+        // min_element. If they are both unsuccessful, keep the earlier one in
+        // the vector by comparing pointers.
+        return std::make_tuple(
+                   !lhs.has_success(),
+                   protobuf_util::FromDurationProto(lhs.success().run_time()),
+                   &lhs) < std::make_tuple(!rhs.has_success(),
+                                           protobuf_util::FromDurationProto(
+                                               rhs.success().run_time()),
+                                           &rhs);
+      });
+
+  if (best_result != profile_results_end && best_result->has_success()) {
+    return *best_result;
   }
 
   return InternalError(
@@ -339,22 +401,23 @@ StatusOr<bool> CudnnConvAlgorithmPicker::RunOnInstruction(
   }
 
   auto best_algo = std::move(best_algo_or).ValueOrDie();
-  VLOG(1) << "Setting cudnn conv to use algorithm " << best_algo.algorithm
-          << " and " << NumBytesToString(best_algo.scratch_bytes)
+  VLOG(1) << "Setting cudnn conv to use algorithm "
+          << best_algo.conv().algorithm() << " and "
+          << NumBytesToString(best_algo.success().scratch_bytes())
           << " of scratch memory: " << instr->ToString()
-          << " tensor_ops_enabled: " << best_algo.tensor_ops_enabled;
+          << " tensor_ops_enabled: " << best_algo.conv().tensor_ops_enabled();
 
   // Replace instr with a new CustomCall which has the correct algorithm, and
   // whose output shape has the appropriate amount of scratch memory.
   HloComputation* computation = instr->parent();
   Shape new_call_shape = ShapeUtil::MakeTupleShape(
       {instr->shape().tuple_shapes(0),
-       ShapeUtil::MakeShape(U8, {best_algo.scratch_bytes})});
+       ShapeUtil::MakeShape(U8, {best_algo.success().scratch_bytes()})});
 
   TF_ASSIGN_OR_RETURN(CudnnConvBackendConfig backend_config,
                       instr->backend_config<CudnnConvBackendConfig>());
-  backend_config.set_algorithm(best_algo.algorithm);
-  backend_config.set_tensor_ops_enabled(best_algo.tensor_ops_enabled);
+  backend_config.set_algorithm(best_algo.conv().algorithm());
+  backend_config.set_tensor_ops_enabled(best_algo.conv().tensor_ops_enabled());
 
   HloInstruction* new_call = computation->AddInstruction(
       instr->CloneWithNewOperands(new_call_shape, instr->operands()));
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h
index 642af787afc71586d722ecc7e529ed8b3fa64d33..2e34ba9672314a62290b8a557960a605a98996c7 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
+#include "tensorflow/compiler/xla/service/gpu/autotuning.pb.h"
 #include "tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.h"
 #include "tensorflow/compiler/xla/service/hlo_instructions.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
@@ -47,16 +48,10 @@ class CudnnConvAlgorithmPicker : public HloModulePass {
   StatusOr<bool> Run(HloModule* module) override;
 
  private:
-  struct AutotuneResult {
-    int64 algorithm;
-    bool tensor_ops_enabled;
-    int64 scratch_bytes;
-    absl::Duration runtime;
-  };
-
   StatusOr<bool> RunOnComputation(HloComputation* computation);
   StatusOr<bool> RunOnInstruction(HloInstruction* instr);
-  StatusOr<AutotuneResult> PickBestAlgorithm(HloCustomCallInstruction* instr);
+  StatusOr<AutotuneResult> PickBestAlgorithm(
+      const HloCustomCallInstruction* instr);
 
   se::StreamExecutor* stream_exec_;                   // never null
   DeviceMemoryAllocator* allocator_;                  // may be null
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_pad_for_tensor_cores.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_pad_for_tensor_cores.cc
index 5aa4f839f4be5f1060480fea98775f8ffada0bdd..958e0b9c6e7b7885f87b90d61ee5b3bbf6ab2702 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_pad_for_tensor_cores.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_pad_for_tensor_cores.cc
@@ -50,10 +50,10 @@ static HloInstruction* PadInstruction(HloInstruction* instr,
   auto* zero = comp->AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::Zero(shape.element_type())));
 
-  PaddingConfig pad_config = MakeNoPaddingConfig(ShapeUtil::Rank(shape));
+  PaddingConfig pad_config = MakeNoPaddingConfig(shape.rank());
 
   bool added_padding = false;
-  for (int64 dim = 0; dim < ShapeUtil::Rank(shape); ++dim) {
+  for (int64 dim = 0; dim < shape.rank(); ++dim) {
     if (shape.dimensions(dim) == new_shape.dimensions(dim)) {
       continue;
     }
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_padding_legalization.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_padding_legalization.cc
index 3a09d4d4716950a09d65dd093272482d55ac5c27..17d0f7aa7bf6031148aae79f74f7878d6fca9574 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_padding_legalization.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_padding_legalization.cc
@@ -219,7 +219,7 @@ bool CudnnConvPaddingLegalization::CanonicalizeBackwardFilterConvolution(
   Window new_backward_conv_window = backward_conv->window();
   // input_padding_config is the config of the kPad to be inserted.
   PaddingConfig input_padding_config =
-      MakeNoPaddingConfig(ShapeUtil::Rank(input->shape()));
+      MakeNoPaddingConfig(input->shape().rank());
   ConvolutionDimensionNumbers backward_conv_dnums =
       backward_conv->convolution_dimension_numbers();
   for (size_t i = 0; i < backward_conv->window().dimensions_size(); ++i) {
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter_test.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter_test.cc
index 443883a89f66a747def1049bc5afb53fec3c2409..dbcdc2b075bc72f3194af8e555faabb1511376e0 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter_test.cc
@@ -109,9 +109,11 @@ TEST_F(CudnnConvRewriterTest, BackwardFilterConvolve) {
   auto* conv = builder.AddInstruction(HloInstruction::CreateConvolve(
       ShapeInference::InferConvolveShape(
           activations->shape(), gradients->shape(), /*feature_group_count=*/1,
-          conv_window, tf_default_dnums_for_backward_filter_)
+          /*batch_group_count=*/1, conv_window,
+          tf_default_dnums_for_backward_filter_)
           .ConsumeValueOrDie(),
-      activations, gradients, /*feature_group_count=*/1, conv_window,
+      activations, gradients, /*feature_group_count=*/1,
+      /*batch_group_count=*/1, conv_window,
       tf_default_dnums_for_backward_filter_, DefaultPrecisionConfig(2)));
 
   OpMetadata metadata;
@@ -147,9 +149,11 @@ TEST_F(CudnnConvRewriterTest,
   builder.AddInstruction(HloInstruction::CreateConvolve(
       ShapeInference::InferConvolveShape(
           activations->shape(), gradients->shape(), /*feature_group_count=*/1,
-          conv_window, tf_default_dnums_for_backward_filter_)
+          /*batch_group_count=*/1, conv_window,
+          tf_default_dnums_for_backward_filter_)
           .ConsumeValueOrDie(),
-      activations, gradients, /*feature_group_count=*/1, conv_window,
+      activations, gradients, /*feature_group_count=*/1,
+      /*batch_group_count=*/1, conv_window,
       tf_default_dnums_for_backward_filter_, DefaultPrecisionConfig(2)));
 
   auto module = CreateNewVerifiedModule();
@@ -179,7 +183,7 @@ TEST_F(CudnnConvRewriterTest, BackwardFilterConvolveWithPaddedActivations) {
   }
   builder.AddInstruction(HloInstruction::CreateConvolve(
       ShapeUtil::MakeShape(F32, {32, 3, 3, 32}), activations, gradients,
-      /*feature_group_count=*/1, conv_window,
+      /*feature_group_count=*/1, /*batch_group_count=*/1, conv_window,
       tf_default_dnums_for_backward_filter_, DefaultPrecisionConfig(2)));
 
   auto module = CreateNewVerifiedModule();
@@ -209,7 +213,7 @@ TEST_F(CudnnConvRewriterTest, BackwardFilterConvolveWithPaddedGradients) {
   }
   builder.AddInstruction(HloInstruction::CreateConvolve(
       ShapeUtil::MakeShape(F32, {320, 3, 3, 192}), activations, gradients,
-      /*feature_group_count=*/1, conv_window,
+      /*feature_group_count=*/1, /*batch_group_count=*/1, conv_window,
       tf_default_dnums_for_backward_filter_, DefaultPrecisionConfig(2)));
 
   auto module = CreateNewVerifiedModule();
@@ -238,7 +242,7 @@ TEST_F(CudnnConvRewriterTest, BackwardFilterConvolveWithUnevenPadding) {
   }
   builder.AddInstruction(HloInstruction::CreateConvolve(
       ShapeUtil::MakeShape(F32, {32, 2, 2, 32}), activations, gradients,
-      /*feature_group_count=*/1, conv_window,
+      /*feature_group_count=*/1, /*batch_group_count=*/1, conv_window,
       tf_default_dnums_for_backward_filter_, DefaultPrecisionConfig(2)));
 
   auto module = CreateNewVerifiedModule();
@@ -283,13 +287,15 @@ TEST_F(CudnnConvRewriterTest, BackwardInputConvolveEvenPadding) {
 
   HloInstruction* conv = builder.AddInstruction(HloInstruction::CreateConvolve(
       ShapeUtil::MakeShape(F32, {4, 3, 16, 16}), /*lhs=*/output,
-      /*rhs=*/reverse_kernel, /*feature_group_count=*/1, conv_window,
-      conv_dnums, DefaultPrecisionConfig(2)));
+      /*rhs=*/reverse_kernel, /*feature_group_count=*/1,
+      /*batch_group_count=*/1, conv_window, conv_dnums,
+      DefaultPrecisionConfig(2)));
   // Verify the convolution's shape is consistent with ShapeInference.
   CHECK(ShapeUtil::Compatible(
       conv->shape(), ShapeInference::InferConvolveShape(
                          output->shape(), reverse_kernel->shape(),
-                         /*feature_group_count=*/1, conv_window, conv_dnums)
+                         /*feature_group_count=*/1, /*batch_group_count=*/1,
+                         conv_window, conv_dnums)
                          .ValueOrDie()));
 
   auto module = CreateNewVerifiedModule();
@@ -332,10 +338,12 @@ TEST_F(CudnnConvRewriterTest, BackwardInputConvolve1x1Filter) {
 
   builder.AddInstruction(HloInstruction::CreateConvolve(
       ShapeInference::InferConvolveShape(output->shape(), kernel->shape(),
-                                         /*feature_group_count=*/1, conv_window,
+                                         /*feature_group_count=*/1,
+                                         /*batch_group_count=*/1, conv_window,
                                          tf_default_dnums_for_backward_input_)
           .ConsumeValueOrDie(),
-      /*lhs=*/output, /*rhs=*/kernel, /*feature_group_count=*/1, conv_window,
+      /*lhs=*/output, /*rhs=*/kernel, /*feature_group_count=*/1,
+      /*batch_group_count=*/1, conv_window,
       tf_default_dnums_for_backward_input_, DefaultPrecisionConfig(2)));
 
   auto module = CreateNewVerifiedModule();
@@ -365,11 +373,12 @@ TEST_F(CudnnConvRewriterTest,
   builder.AddInstruction(HloInstruction::CreateConvolve(
       ShapeInference::InferConvolveShape(
           output->shape(), kernel->shape(), /*feature_group_count=*/1,
-          default_conv_window_, tf_default_dnums_for_backward_input_)
+          /*batch_group_count=*/1, default_conv_window_,
+          tf_default_dnums_for_backward_input_)
           .ConsumeValueOrDie(),
       /*lhs=*/output, /*rhs=*/kernel, /*feature_group_count=*/1,
-      default_conv_window_, tf_default_dnums_for_backward_input_,
-      DefaultPrecisionConfig(2)));
+      /*batch_group_count=*/1, default_conv_window_,
+      tf_default_dnums_for_backward_input_, DefaultPrecisionConfig(2)));
 
   auto module = CreateNewVerifiedModule();
   HloComputation* entry_computation =
@@ -415,15 +424,15 @@ TEST_F(CudnnConvRewriterTest, BackwardInputConvolveUnevenPaddingOnGradients) {
   }
   HloInstruction* conv = builder.AddInstruction(HloInstruction::CreateConvolve(
       ShapeUtil::MakeShape(F32, {20, 10, 10, 192}), output, reverse_kernel,
-      /*feature_group_count=*/1, conv_window,
+      /*feature_group_count=*/1, /*batch_group_count=*/1, conv_window,
       tf_default_dnums_for_backward_input_, DefaultPrecisionConfig(2)));
   // Verify the convolution's shape is consistent with ShapeInference.
   CHECK(ShapeUtil::Compatible(
-      conv->shape(),
-      ShapeInference::InferConvolveShape(
-          output->shape(), reverse_kernel->shape(), /*feature_group_count=*/1,
-          conv_window, tf_default_dnums_for_backward_input_)
-          .ValueOrDie()));
+      conv->shape(), ShapeInference::InferConvolveShape(
+                         output->shape(), reverse_kernel->shape(),
+                         /*feature_group_count=*/1, /*batch_group_count=*/1,
+                         conv_window, tf_default_dnums_for_backward_input_)
+                         .ValueOrDie()));
 
   auto module = CreateNewVerifiedModule();
   HloComputation* entry_computation =
@@ -465,15 +474,15 @@ TEST_F(CudnnConvRewriterTest, BackwardInputConvolveLowPaddingTooLarge) {
   }
   HloInstruction* conv = builder.AddInstruction(HloInstruction::CreateConvolve(
       ShapeUtil::MakeShape(F32, {20, 10, 10, 192}), output, reverse_kernel,
-      /*feature_group_count=*/1, conv_window,
+      /*feature_group_count=*/1, /*batch_group_count=*/1, conv_window,
       tf_default_dnums_for_backward_input_, DefaultPrecisionConfig(2)));
   // Verify the convolution's shape is consistent with ShapeInference.
   CHECK(ShapeUtil::Compatible(
-      conv->shape(),
-      ShapeInference::InferConvolveShape(
-          output->shape(), reverse_kernel->shape(), /*feature_group_count=*/1,
-          conv_window, tf_default_dnums_for_backward_input_)
-          .ValueOrDie()));
+      conv->shape(), ShapeInference::InferConvolveShape(
+                         output->shape(), reverse_kernel->shape(),
+                         /*feature_group_count=*/1, /*batch_group_count=*/1,
+                         conv_window, tf_default_dnums_for_backward_input_)
+                         .ValueOrDie()));
 
   auto module = CreateNewVerifiedModule();
   HloComputation* entry_computation =
@@ -519,15 +528,15 @@ TEST_F(CudnnConvRewriterTest, BackwardInputConvolveUnevenPaddingOnActivations) {
   forward_conv_col_dim->set_base_dilation(2);
   HloInstruction* conv = builder.AddInstruction(HloInstruction::CreateConvolve(
       ShapeUtil::MakeShape(F32, {1, 1, 14, 1}), output, reverse_kernel,
-      /*feature_group_count=*/1, conv_window,
+      /*feature_group_count=*/1, /*batch_group_count=*/1, conv_window,
       tf_default_dnums_for_backward_input_, DefaultPrecisionConfig(2)));
   // Verify the convolution's shape is consistent with ShapeInference.
   CHECK(ShapeUtil::Compatible(
-      conv->shape(),
-      ShapeInference::InferConvolveShape(
-          output->shape(), reverse_kernel->shape(), /*feature_group_count=*/1,
-          conv_window, tf_default_dnums_for_backward_input_)
-          .ValueOrDie()));
+      conv->shape(), ShapeInference::InferConvolveShape(
+                         output->shape(), reverse_kernel->shape(),
+                         /*feature_group_count=*/1, /*batch_group_count=*/1,
+                         conv_window, tf_default_dnums_for_backward_input_)
+                         .ValueOrDie()));
 
   auto module = CreateNewVerifiedModule();
   const HloComputation* entry_computation =
@@ -574,15 +583,15 @@ TEST_F(CudnnConvRewriterTest,
   forward_conv_col_dim->set_padding_high(2);
   HloInstruction* conv = builder.AddInstruction(HloInstruction::CreateConvolve(
       ShapeUtil::MakeShape(F32, {1, 1, 4, 1}), output, reverse_kernel,
-      /*feature_group_count=*/1, conv_window,
+      /*feature_group_count=*/1, /*batch_group_count=*/1, conv_window,
       tf_default_dnums_for_backward_input_, DefaultPrecisionConfig(2)));
   // Verify the convolution's shape is consistent with ShapeInference.
   CHECK(ShapeUtil::Compatible(
-      conv->shape(),
-      ShapeInference::InferConvolveShape(
-          output->shape(), reverse_kernel->shape(), /*feature_group_count=*/1,
-          conv_window, tf_default_dnums_for_backward_input_)
-          .ValueOrDie()));
+      conv->shape(), ShapeInference::InferConvolveShape(
+                         output->shape(), reverse_kernel->shape(),
+                         /*feature_group_count=*/1, /*batch_group_count=*/1,
+                         conv_window, tf_default_dnums_for_backward_input_)
+                         .ValueOrDie()));
 
   auto module = CreateNewVerifiedModule();
   HloComputation* entry_computation =
@@ -599,7 +608,7 @@ TEST_F(CudnnConvRewriterTest, BackwardInputConvolveConstantFilter) {
   Array4D<float> constant_arr(4, 4, 2, 2);
   constant_arr.FillIota(0);
   string constant_str =
-      LiteralUtil::CreateR4FromArray4D(constant_arr).ToString();
+      LiteralUtil::CreateR4FromArray4D(constant_arr).ToStringWithoutShape();
 
   const string module_str = absl::StrFormat(R"(
     HloModule test
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc
index 3425e1b4942aaf1011ba1bf1c50dd7e79c1f9807..b628f27f4b2ba8ccf17fd531d8a0c25cb99d9396 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc
@@ -395,32 +395,36 @@ Status RunCudnnConv(const HloCustomCallInstruction* conv,
                     absl::Span<se::DeviceMemoryBase> operand_buffers,
                     se::DeviceMemoryBase result_buffer,
                     se::DeviceMemoryBase scratch_buf, se::Stream* stream,
-                    se::dnn::ProfileResult* profile_result) {
+                    RunConvOptions options) {
   ScratchBufAllocator scratch_allocator(scratch_buf);
   return RunCudnnConv(conv, operand_buffers, result_buffer, &scratch_allocator,
-                      stream, profile_result);
+                      stream, options);
 }
 
 Status RunCudnnConv(const HloCustomCallInstruction* conv,
                     absl::Span<se::DeviceMemoryBase> operand_buffers,
                     se::DeviceMemoryBase result_buffer,
                     se::ScratchAllocator* scratch_allocator, se::Stream* stream,
-                    se::dnn::ProfileResult* profile_result) {
+                    RunConvOptions options) {
   TF_ASSIGN_OR_RETURN(CudnnConvParams params,
                       GetCudnnConvParams(conv, operand_buffers, result_buffer));
 
+  if (options.algo_override) {
+    params.algorithm = AlgorithmConfig(*options.algo_override);
+  }
+
   PrimitiveType output_primitive_type =
       conv->shape().tuple_shapes(0).element_type();
   switch (output_primitive_type) {
     case F16:
       return RunCudnnConvImpl<Eigen::half>(params, scratch_allocator, stream,
-                                           profile_result);
+                                           options.profile_result);
     case F32:
       return RunCudnnConvImpl<float>(params, scratch_allocator, stream,
-                                     profile_result);
+                                     options.profile_result);
     case F64:
       return RunCudnnConvImpl<double>(params, scratch_allocator, stream,
-                                      profile_result);
+                                      options.profile_result);
     default:
       LOG(FATAL) << ShapeUtil::HumanString(*params.output_shape);
   }
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.h b/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.h
index edbc75a94a1238540390b93f0fa5217852c7781f..25b2461ca61251c6cb7b89b1f91da0f1636a3647 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.h
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.h
@@ -28,6 +28,14 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
+struct RunConvOptions {
+  // Nullable output-parameter pointer for profiling results.
+  se::dnn::ProfileResult* profile_result = nullptr;
+
+  // Use this algorithm, instead of the one from the instruction.
+  absl::optional<se::dnn::AlgorithmDesc> algo_override;
+};
+
 // This file contains low-level routines for running cudnn convolutions.
 
 // Calls into cudnn to run the specified convolution.
@@ -46,13 +54,13 @@ Status RunCudnnConv(const HloCustomCallInstruction* conv,
                     absl::Span<se::DeviceMemoryBase> operand_buffers,
                     se::DeviceMemoryBase result_buffer,
                     se::DeviceMemoryBase scratch_buf, se::Stream* stream,
-                    se::dnn::ProfileResult* profile_result = nullptr);
+                    RunConvOptions = {});
 
 Status RunCudnnConv(const HloCustomCallInstruction* conv,
                     absl::Span<se::DeviceMemoryBase> operand_buffers,
                     se::DeviceMemoryBase result_buffer,
                     se::ScratchAllocator* scratch_allocator, se::Stream* stream,
-                    se::dnn::ProfileResult* profile_result = nullptr);
+                    RunConvOptions = {});
 
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
index 2ab754a471070d5f90a3eaebd0600ff180d2fe5d..dd74788a0e2940e88dfca1ffa4a4cdad7c1997e2 100644
--- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
@@ -270,6 +270,16 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitPow(PrimitiveType prim_type,
                                prim_type);
 }
 
+StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitSqrt(PrimitiveType prim_type,
+                                                       llvm::Value* value) {
+  return EmitLibdeviceMathCall("__nv_sqrt", {value}, {prim_type}, prim_type);
+}
+
+StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitRsqrt(PrimitiveType prim_type,
+                                                        llvm::Value* value) {
+  return EmitLibdeviceMathCall("__nv_rsqrt", {value}, {prim_type}, prim_type);
+}
+
 StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitAtan2(PrimitiveType prim_type,
                                                         llvm::Value* lhs,
                                                         llvm::Value* rhs) {
@@ -308,9 +318,11 @@ llvm::Value* GpuElementalIrEmitter::EmitDeviceFunctionCall(
       false);  // No variadic arguments.
 
   // Declares the callee if it is not declared already.
-  llvm::Function* callee = llvm::cast<llvm::Function>(
-      b_->GetInsertBlock()->getModule()->getOrInsertFunction(
-          llvm_ir::AsStringRef(callee_name), callee_type));
+  llvm::Function* callee = llvm::dyn_cast<llvm::Function>(
+      b_->GetInsertBlock()
+          ->getModule()
+          ->getOrInsertFunction(llvm_ir::AsStringRef(callee_name), callee_type)
+          .getCallee());
 
   for (auto attribute : attributes) {
     callee->addFnAttr(attribute);
@@ -446,7 +458,7 @@ llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator(
         return Load(accum_ptr);
       };
     case HloOpcode::kReduce:
-      // TODO(b/112040122): This should be supported.
+      // TODO(b/118332391): This should be supported.
       CHECK_EQ(hlo->operand_count(), 2) << "Did not expect variadic reduce";
       return [=, &operand_to_generator](
                  const IrArray::Index& output_index) -> StatusOr<llvm::Value*> {
diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h
index e8b56a39ce58b6aab35c1c977553c7ff7e753273..2aedbf05abb31c88b9988dc1d90e921e9473d25b 100644
--- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h
@@ -76,6 +76,12 @@ class GpuElementalIrEmitter : public ElementalIrEmitter {
   StatusOr<llvm::Value*> EmitExpm1(PrimitiveType prim_type,
                                    llvm::Value* value) override;
 
+  StatusOr<llvm::Value*> EmitSqrt(PrimitiveType prim_type,
+                                  llvm::Value* value) override;
+
+  StatusOr<llvm::Value*> EmitRsqrt(PrimitiveType prim_type,
+                                   llvm::Value* value) override;
+
   StatusOr<llvm::Value*> EmitPow(PrimitiveType prim_type, llvm::Value* lhs,
                                  llvm::Value* rhs) override;
 
diff --git a/tensorflow/compiler/xla/service/gpu/fusion_merger.cc b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc
index 470457935acacb8940af241dadb393d770786939..91930eccdff94bb2fc85636f3a4b2d661c618d87 100644
--- a/tensorflow/compiler/xla/service/gpu/fusion_merger.cc
+++ b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc
@@ -35,7 +35,7 @@ namespace {
 // Traverses users of tuple shape, adding leaf instructions to 'instructions'.
 void MaybeResolveTupleElements(HloInstruction* instruction,
                                std::vector<HloInstruction*>* instructions) {
-  if (ShapeUtil::IsTuple(instruction->shape())) {
+  if (instruction->shape().IsTuple()) {
     for (auto tuple_user : instruction->users()) {
       MaybeResolveTupleElements(tuple_user, instructions);
     }
diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc
index 27f07b1d58125092c1ed6734b238e4ae0f11c4aa..a7053e6a013be3ccf5725cbe003558be77104af1 100644
--- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc
@@ -206,6 +206,8 @@ auto GetGemmFn(PrimitiveType type) -> decltype(&DoGemm<float>) {
       return &DoGemm<double>;
     case C64:
       return &DoGemm<std::complex<float>>;
+    case C128:
+      return &DoGemm<std::complex<double>>;
     default:
       LOG(FATAL) << "Unsupported type.";
   }
@@ -221,6 +223,8 @@ auto GetGemmWithAlgorithmFn(PrimitiveType type)
       return &DoGemmWithAlgorithm<double>;
     case C64:
       return &DoGemmWithAlgorithm<std::complex<float>>;
+    case C128:
+      return &DoGemmWithAlgorithm<std::complex<double>>;
     default:
       LOG(FATAL) << "Unsupported type.";
   }
@@ -235,6 +239,8 @@ auto GetGemmAutotuneFn(PrimitiveType type) -> decltype(&DoGemmAutotune<float>) {
       return &DoGemmAutotune<double>;
     case C64:
       return &DoGemmAutotune<std::complex<float>>;
+    case C128:
+      return &DoGemmAutotune<std::complex<double>>;
     default:
       LOG(FATAL) << "Unsupported type.";
   }
@@ -255,6 +261,8 @@ se::blas::ComputationType GetBlasComputationType(PrimitiveType type) {
       return se::blas::ComputationType::kF64;
     case C64:
       return se::blas::ComputationType::kComplexF32;
+    case C128:
+      return se::blas::ComputationType::kComplexF64;
     default:
       LOG(FATAL) << "Unsupported type.";
   }
@@ -315,8 +323,7 @@ Status GemmThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
   DotDimensionNumbers dim_nums = GetDimensionNumbers(*hlo_instruction());
   CHECK_EQ(dim_nums.lhs_batch_dimensions_size(),
            dim_nums.rhs_batch_dimensions_size());
-  CHECK_EQ(dim_nums.lhs_batch_dimensions_size() + 2,
-           ShapeUtil::Rank(output_shape_));
+  CHECK_EQ(dim_nums.lhs_batch_dimensions_size() + 2, output_shape_.rank());
 
   int64 row_dim = dim_nums.lhs_batch_dimensions_size();
   int64 col_dim = dim_nums.lhs_batch_dimensions_size() + 1;
@@ -421,7 +428,8 @@ Status GemmThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
         scratch_data = scratch_mem->device_memory();
       }
       const MatrixDescriptor scratch_descriptor(
-          scratch_data, false, output_num_cols, output_num_rows, batch_size);
+          scratch_data, false, output_matrix.num_rows, output_matrix.num_cols,
+          batch_size);
 
       StatusOr<se::blas::AlgorithmType> best_algorithm = GetGemmAutotuneFn(
           element_type)(lhs_matrix, rhs_matrix, scratch_descriptor, alpha_,
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index ae2e718db29803a085401969a7d9b09abf690a6c..434060ad89dac7ad65c790c8c0a7f3d6ad62a25a 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -218,7 +218,7 @@ GpuExecutable::ResolveConstantGlobals(se::StreamExecutor* executor) {
 
       const Literal& literal =
           llvm_ir::LiteralForConstantAllocation(allocation);
-      CHECK(ShapeUtil::IsArray(literal.shape()));
+      CHECK(literal.shape().IsArray());
       if (!ShouldEmitLiteralInLlvmIr(literal)) {
         VLOG(3) << "H2D memcpy for constant with shape "
                 << ShapeUtil::HumanString(literal.shape());
@@ -310,12 +310,34 @@ StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteOnStream(
         TF_ASSIGN_OR_RETURN(
             const BufferAllocation::Slice slice,
             this->assignment_->GetUniqueSlice(src_hlo, sources[0]->index()));
-        CHECK(!slice.allocation()->is_entry_computation_parameter());
 
         se::DeviceMemoryBase src_base =
             buffer_allocations->GetDeviceAddress(slice.index());
         CHECK(!src_base.is_null() || src_base.size() == 0);
-        *device_memory = src_base;
+        if (!slice.allocation()->is_entry_computation_parameter()) {
+          // If the buffer coming out of the result is from a parameter, it
+          // means the caller aliased some parameter buffer to an output one
+          // (via the HloInputOutputAliasConfig API). If that is the case, the
+          // caller will receive a partially complete scoped shaped buffer,
+          // which they will have to fill up on return.
+          // Unfortunately the interface to the execute APIs are ShapedBuffer
+          // pointer based, which assumes caller ownership, and hence a buffer
+          // coming from there cannot be part of the new ScopedShapedBuffer we
+          // create for the result (which assumes ownership).
+          *device_memory = src_base;
+        } else {
+          const HloInputOutputAliasConfig& input_output_alias =
+              module().input_output_alias_config();
+          auto output_alias = input_output_alias.GetAliasedOutput(
+              slice.allocation()->parameter_number(),
+              slice.allocation()->param_shape_index());
+          CHECK(output_alias)
+              << "Ouput buffer is coming from parameter "
+              << slice.allocation()->parameter_number() << " at index "
+              << slice.allocation()->param_shape_index()
+              << ", but no alias exists";
+          CHECK_EQ(*output_alias, index);
+        }
         buffers_in_result.insert(src_base);
         return Status::OK();
       }));
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc
index 452e763a8eaadc805cd3a3859a68e2a31598fd36..842ba2fdcd31a451cec1be543e102e0a46077f38 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc
@@ -42,15 +42,13 @@ bool LayoutsAreReduceInputFusionFriendly(const HloInstruction& producer,
   int64 max_rank = -1;
   const Layout* max_rank_layout;
   for (HloInstruction* param : params) {
-    if (ShapeUtil::IsArray(param->shape()) &&
-        ShapeUtil::Rank(param->shape()) > max_rank) {
-      max_rank = ShapeUtil::Rank(param->shape());
+    if (param->shape().IsArray() && param->shape().rank() > max_rank) {
+      max_rank = param->shape().rank();
       max_rank_layout = &param->shape().layout();
     }
   }
   return absl::c_all_of(params, [&](HloInstruction* param) {
-    return (!ShapeUtil::IsArray(param->shape())) ||
-           (ShapeUtil::Rank(param->shape()) < max_rank) ||
+    return (!param->shape().IsArray()) || (param->shape().rank() < max_rank) ||
            (LayoutUtil::Equal(param->shape().layout(), *max_rank_layout));
   });
 }
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.h b/tensorflow/compiler/xla/service/gpu/gpu_fusible.h
index e9d7ba1c4cfa865532a0d06c2ed883a2fea4e2cd..9f0de3f794decb7b878b67c96030f8e11b0555fe 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_fusible.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.h
@@ -48,7 +48,7 @@ bool IsInputFusibleReduction(const HloInstruction& instr);
 
 // Whether instruction shapes are compatible for multi-output fusion, i.e.
 // whether the emitters support lowering the resulting fusion.
-// This function works for both, sibling and producer-conumser multi-output
+// This function works for both, sibling and producer-consumer multi-output
 // fusion.
 // So far, multi-output fusion is supported for loop fusions and reduce
 // input fusions only. It is up to the caller to ensure the instructions
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.cc b/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.cc
index 4268fb2c7a813b3b53e4cd48746028a7b369f28e..4765f67c4b17e97419182e341573f75ad3d6ac30 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.cc
@@ -17,7 +17,6 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
index f59da2caa18646676297e66dd329c66fb5fddf1b..a6d80f0b6dddb3d8d0fd00c639e11c71da6a9f09 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
@@ -196,9 +196,9 @@ Status GpuLayoutAssignment::AddBackendConstraints(
       CHECK_EQ(dim_nums.lhs_batch_dimensions_size(),
                dim_nums.rhs_batch_dimensions_size());
       CHECK_EQ(dim_nums.lhs_batch_dimensions_size() + 2,
-               ShapeUtil::Rank(instruction->shape()));
+               instruction->shape().rank());
       for (int64 batch_dim : dim_nums.lhs_batch_dimensions()) {
-        CHECK_LT(batch_dim, ShapeUtil::Rank(instruction->shape()) - 2);
+        CHECK_LT(batch_dim, instruction->shape().rank() - 2);
       }
 
       // Set both inputs and the output to default layout.
@@ -215,18 +215,18 @@ Status GpuLayoutAssignment::AddBackendConstraints(
       TF_RETURN_IF_ERROR(
           constraints->SetInstructionLayout(output_shape, instruction));
     } else if (instruction->opcode() == HloOpcode::kSort &&
-               ShapeUtil::Rank(instruction->operand(0)->shape()) > 1) {
+               instruction->operand(0)->shape().rank() > 1) {
       // Make sure that all the operands and the output(s) have the same layout.
       Shape keys_shape = instruction->operand(0)->shape();
       Layout keys_layout =
-          LayoutUtil::GetDefaultLayoutForRank(ShapeUtil::Rank(keys_shape));
+          LayoutUtil::GetDefaultLayoutForRank(keys_shape.rank());
       for (int64 i = 0; i < instruction->operand_count(); ++i) {
         Shape shape = instruction->operand(i)->shape();
         *shape.mutable_layout() = keys_layout;
         TF_RETURN_IF_ERROR(
             constraints->SetOperandLayout(shape, instruction, i));
         const LogicalBuffer* output_buffer;
-        if (ShapeUtil::IsArray(instruction->shape())) {
+        if (instruction->shape().IsArray()) {
           TF_ASSIGN_OR_RETURN(
               output_buffer,
               constraints->points_to_analysis().GetBufferDefinedAt(instruction,
@@ -240,6 +240,32 @@ Status GpuLayoutAssignment::AddBackendConstraints(
         TF_RETURN_IF_ERROR(
             constraints->SetBufferLayout(keys_layout, *output_buffer));
       }
+    } else if (instruction->opcode() == HloOpcode::kTriangularSolve) {
+      // TODO(phawkins): Ideally we would relax this constraint. What we
+      // actually want is that:
+      // a) the batch dimensions are major, in no particular order.
+      // b) the two minor dimensions are in fortran (column-major) order,
+      // although for the 'a' argument we could potentially accept row-major
+      // order and fold the transpose into the operator.
+      auto set_fortran_layout = [](Shape* shape) {
+        LayoutUtil::SetToDefaultLayout(shape);
+        int n = shape->mutable_layout()->minor_to_major_size();
+        CHECK_GE(n, 2);
+        std::swap(shape->mutable_layout()->mutable_minor_to_major()->at(0),
+                  shape->mutable_layout()->mutable_minor_to_major()->at(1));
+      };
+      Shape op0_shape = instruction->operand(0)->shape();
+      Shape op1_shape = instruction->operand(1)->shape();
+      Shape output_shape = instruction->shape();
+      set_fortran_layout(&op0_shape);
+      set_fortran_layout(&op1_shape);
+      set_fortran_layout(&output_shape);
+      TF_RETURN_IF_ERROR(
+          constraints->SetOperandLayout(op0_shape, instruction, 0));
+      TF_RETURN_IF_ERROR(
+          constraints->SetOperandLayout(op1_shape, instruction, 1));
+      TF_RETURN_IF_ERROR(
+          constraints->SetInstructionLayout(output_shape, instruction));
     }
   }
   return Status::OK();
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc
index 2ffc8bfb49b205dced0d540ba72426e72d95e596..391029e574622925b2a7e801a7d41d95e49a1cfb 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc
@@ -368,12 +368,21 @@ TEST_F(LayoutAssignmentTest, DotLayout) {
 TEST_F(LayoutAssignmentTest, SortLayout) {
   const char* hlo_text = R"(
   HloModule SortLayout
+
+  compare {
+    p.0.lhs = f32[] parameter(0)
+    p.0.rhs = f32[] parameter(1)
+    p.1.lhs = f32[] parameter(2)
+    p.1.rhs = f32[] parameter(3)
+    ROOT lt = pred[] less-than(p.0.lhs, p.0.rhs)
+  }
+
   ENTRY sort {
-    keys = f32[3,2]{0,1} constant(f32[3,2]{0,1}{{0,1},{0,1},{0,1}})
+    keys = f32[3,2]{0,1} constant({{0,1},{0,1},{0,1}})
     values = f32[2,3]{1,0} parameter(0)
     transpose = f32[3,2]{1,0} transpose(values), dimensions={1,0}
     ROOT sort = (f32[3,2]{1,0}, f32[3,2]{1,0}) sort(keys, transpose),
-      dimensions={1}
+      dimensions={1}, to_apply=compare
   })";
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_sanitize_constant_names.cc b/tensorflow/compiler/xla/service/gpu/gpu_sanitize_constant_names.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5e38ceca18de30e0e1fa75a7a4bd865e000b7d22
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/gpu_sanitize_constant_names.cc
@@ -0,0 +1,70 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/gpu_sanitize_constant_names.h"
+
+#include <memory>
+#include <set>
+#include <vector>
+
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace xla {
+
+namespace gpu {
+
+StatusOr<bool> GpuSanitizeConstantNames::Run(HloModule* module) {
+  bool changed = false;
+
+  NameUniquer instr_name_uniquer(/*separator=*/"_");
+  // Collect the names used for the non-constant HLO instructions.+
+  for (HloComputation* computation : module->computations()) {
+    for (HloInstruction* instr : computation->instructions()) {
+      if (instr->opcode() == HloOpcode::kConstant) {
+        continue;
+      }
+
+      const string& old_name = instr->name();
+      instr->UniquifyName(&instr_name_uniquer);
+      CHECK_EQ(old_name, instr->name());
+    }
+  }
+
+  // Sanitize the names for the constant HLO instructions and make them unique.
+  // This is not merged into the above loop because we don't want this pass to
+  // change the names of non-constant instructions, that is, if a constant HLO
+  // conflicts with a non-constant HLO, we change the name of the constant HLO
+  // even though the non-constant HLO comes after in the HLO module.
+  for (HloComputation* computation : module->computations()) {
+    for (HloInstruction* instr : computation->instructions()) {
+      if (instr->opcode() != HloOpcode::kConstant) {
+        continue;
+      }
+      string sanitized_name = llvm_ir::SanitizeConstantName(*instr);
+      instr->SetAndSanitizeName(sanitized_name);
+      instr->UniquifyName(&instr_name_uniquer);
+      changed = true;
+    }
+  }
+
+  return changed;
+}  // namespace gpu
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_sanitize_constant_names.h b/tensorflow/compiler/xla/service/gpu/gpu_sanitize_constant_names.h
new file mode 100644
index 0000000000000000000000000000000000000000..8d583d047e25698e86032020b7fc20df87f5ab68
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/gpu_sanitize_constant_names.h
@@ -0,0 +1,38 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_SANITIZE_CONSTANT_NAMES_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_SANITIZE_CONSTANT_NAMES_H_
+
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+namespace gpu {
+
+// Sanitizes HLO instruction names for the GPU backend. Currently, it only
+// replaces . and - in the HLO constant instruction names with _ to please the
+// LLVM PTX backend.
+class GpuSanitizeConstantNames : public HloModulePass {
+ public:
+  absl::string_view name() const override { return "sanitize-constant-names"; }
+
+  StatusOr<bool> Run(HloModule* module) override;
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_SANITIZE_CONSTANT_NAMES_H_
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_sanitize_constant_names_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_sanitize_constant_names_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f5adee8cc61f18f356406d8c089dd43565957739
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/gpu_sanitize_constant_names_test.cc
@@ -0,0 +1,82 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <utility>
+
+#include "tensorflow/compiler/xla/service/gpu/gpu_sanitize_constant_names.h"
+
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_module_config.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+namespace op = xla::testing::opcode_matchers;
+using SanitizeConstantNamesTest = HloTestBase;
+
+TEST_F(SanitizeConstantNamesTest, InstructionNameWithHyphenSanitized) {
+  const char *const kHloString = R"(
+    HloModule HyphenInInstructionName
+      ENTRY kernelEntry {
+        ROOT equal-to = s32[2]{0} constant({42, 73})
+    })";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(kHloString));
+
+  EXPECT_TRUE(GpuSanitizeConstantNames().Run(module.get()).ValueOrDie());
+  HloInstruction *root = module->entry_computation()->root_instruction();
+  EXPECT_EQ(root->name(), "equal_to");
+}
+
+TEST_F(SanitizeConstantNamesTest, InstructionNameWithDotSanitized) {
+  const char *const kHloString = R"(
+    HloModule HyphenInInstructionName
+      ENTRY kernelEntry {
+        ROOT equal.to = s32[2]{0} constant({42, 73})
+    })";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(kHloString));
+
+  EXPECT_TRUE(GpuSanitizeConstantNames().Run(module.get()).ValueOrDie());
+  HloInstruction *root = module->entry_computation()->root_instruction();
+  EXPECT_EQ(root->name(), "equal_to");
+}
+
+TEST_F(SanitizeConstantNamesTest, BufferSanitizedNameCollisionResolved) {
+  const char *const kHloString = R"(
+    HloModule BufferSanitizedName
+      ENTRY kernelEntry {
+      equal.to = s32[2]{0} constant({42, 73})
+      equal-to = s32[2]{0} constant({67, 3})
+      ROOT equal_to = s32[2]{0} add(equal.to, equal-to)
+    })";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(kHloString));
+
+  EXPECT_TRUE(GpuSanitizeConstantNames().Run(module.get()).ValueOrDie());
+  EXPECT_THAT(FindInstruction(module.get(), "equal_to_1"), op::Constant());
+  EXPECT_THAT(FindInstruction(module.get(), "equal_to_2"), op::Constant());
+}
+
+}  // namespace
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
index f3c274429242d5c989146d14ea523b5910408cff..e593f535642e15f28a4a1c1f321881ba3c694548 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc
@@ -30,7 +30,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/platform/logging.h"
@@ -59,7 +58,7 @@ Status GpuTransferManager::TransferLiteralToInfeed(
 
   TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
       shape, [&](const Shape& literal_subshape, const ShapeIndex& index) {
-        if (ShapeUtil::IsArray(literal_subshape)) {
+        if (literal_subshape.IsArray()) {
           int64 tuple_element_size = GetByteSizeRequirement(literal_subshape);
           TF_ASSIGN_OR_RETURN(
               *buffer_tree.mutable_element(index),
@@ -126,13 +125,12 @@ static void ShapeTreeToLiteral(
         ShapeTree<std::unique_ptr<gpu::OutfeedBuffer>>* shape_tree,
         ShapeIndex* index) {
       const Shape& shape = ShapeUtil::GetSubshape(shape_tree->shape(), *index);
-      if (ShapeUtil::IsArray(shape)) {
+      if (shape.IsArray()) {
         (*shape_tree->mutable_element(*index))->WaitUntilAvailable();
         return;
       }
 
-      CHECK(ShapeUtil::IsTuple(shape))
-          << ShapeUtil::HumanStringWithLayout(shape);
+      CHECK(shape.IsTuple()) << ShapeUtil::HumanStringWithLayout(shape);
       const int64 tuple_element_count = ShapeUtil::TupleElementCount(shape);
       index->push_back(0);
       for (int64 i = 0; i < tuple_element_count; ++i) {
@@ -158,7 +156,7 @@ Status GpuTransferManager::TransferLiteralFromOutfeed(
           std::unique_ptr<gpu::OutfeedBuffer>* buffer) {
         const Shape& shape = ShapeUtil::GetSubshape(literal_shape, index);
         // Do not transfer tuple index buffers.
-        if (ShapeUtil::IsTuple(shape)) {
+        if (shape.IsTuple()) {
           return;
         }
         *buffer = absl::make_unique<gpu::OutfeedBuffer>(
diff --git a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc
index 51627402b45f594dab3480129ba182d54d01b811..69aaaceca112364a4fd562f6a5eff1629fd3fc54 100644
--- a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc
+++ b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h"
 
+#include "absl/container/flat_hash_set.h"
 #include "absl/strings/str_cat.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Function.h"
@@ -45,10 +46,10 @@ void HloToIrBindings::EmitBasePointersForHlos(
 
   // An HLO can have duplicated operands. This data structure remembers which
   // operand HLOs are already bound to avoid rebinding the same HLO.
-  std::set<const HloInstruction*> already_bound_for_this_function;
+  absl::flat_hash_set<const HloInstruction*> already_bound_for_this_function;
   auto arg_iter = function->arg_begin();
   for (const HloInstruction* io_hlo : io_hlos) {
-    if (!already_bound_for_this_function.count(io_hlo)) {
+    if (!already_bound_for_this_function.contains(io_hlo)) {
       if (!is_nested_ && io_hlo->opcode() == HloOpcode::kGetTupleElement) {
         BindHloToIrValue(*io_hlo, EmitGetTupleElement(io_hlo, &*arg_iter));
       } else {
@@ -63,7 +64,7 @@ void HloToIrBindings::EmitBasePointersForHlos(
   temp_buffer_base_->setName("temp_buffer");
 
   for (const HloInstruction* non_io_hlo : non_io_hlos) {
-    if (already_bound_for_this_function.count(non_io_hlo)) {
+    if (already_bound_for_this_function.contains(non_io_hlo)) {
       continue;
     }
     already_bound_for_this_function.insert(non_io_hlo);
@@ -280,7 +281,7 @@ string HloToIrBindings::ToString() const {
       StrAppend(&s, "    ", instr->ToString());
 
       const ShapeTree<llvm::Value*>& shape_tree = it->second;
-      if (!ShapeUtil::IsTuple(instr->shape())) {
+      if (!instr->shape().IsTuple()) {
         const llvm::Value* val = shape_tree.begin()->second;
         StrAppend(&s, " -> ", llvm_ir::DumpToString(*val), "\n");
         continue;
diff --git a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h
index c0edae530cedba45c897b07b7b9cc72eaaab397c..f57b594e9c18078a3bbbf4d2b4db7e989c4edfdd 100644
--- a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h
+++ b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <unordered_map>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/types/span.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Value.h"
@@ -61,7 +62,7 @@ class HloToIrBindings {
 
   // Returns whether `hlo` is bound to an LLVM IR value.
   bool BoundToIrValue(const HloInstruction& hlo) const {
-    return base_ptrs_.count(&hlo);
+    return base_ptrs_.contains(&hlo);
   }
 
   llvm::Value* GetTempBufferBase() const { return temp_buffer_base_; }
@@ -110,7 +111,8 @@ class HloToIrBindings {
   // For an instruction that generates multiple outputs, the root will be a
   // tuple shape. The IrArray for each element output is stored in the subnode
   // in the ShapeTree.
-  std::unordered_map<const HloInstruction*, ShapeTree<llvm::Value*>> base_ptrs_;
+  absl::flat_hash_map<const HloInstruction*, ShapeTree<llvm::Value*>>
+      base_ptrs_;
 
   // The address of the memory block that contains all temporary buffers.
   llvm::Value* temp_buffer_base_ = nullptr;
diff --git a/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc b/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc
index 8c3a026740851767855beae59d6a3c92f7a0d6bd..676380c3b10f9a20c641eea0d9a948a26becaddc 100644
--- a/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc
@@ -36,6 +36,21 @@ Status InfeedThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
   ShapeTree<InfeedBuffer> infeed_buffers =
       GetOrCreateInfeedManager()->BlockingGetNextDestination();
 
+  // infeed_slices_'s shape should be a tuple of shape (buffers, token).
+  const auto& infeed_shape = infeed_slices_.shape();
+  TF_RET_CHECK(infeed_shape.IsTuple())
+      << ShapeUtil::HumanStringWithLayout(infeed_shape);
+  TF_RET_CHECK(infeed_shape.tuple_shapes().size() == 2)
+      << ShapeUtil::HumanStringWithLayout(infeed_shape);
+  TF_RET_CHECK(infeed_shape.tuple_shapes(1).IsToken())
+      << ShapeUtil::HumanStringWithLayout(infeed_shape);
+  TF_RET_CHECK(
+      ShapeUtil::Equal(infeed_buffers.shape(), infeed_shape.tuple_shapes(0)))
+      << "Expected infeed of shape "
+      << ShapeUtil::HumanStringWithLayout(infeed_shape.tuple_shapes(0))
+      << " but was "
+      << ShapeUtil::HumanStringWithLayout(infeed_buffers.shape());
+
   {
     // The infeed buffer has an extra outer tuple with a token. Adjust the index
     // accordingly.
@@ -45,7 +60,7 @@ Status InfeedThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
           const Shape& shape = ShapeUtil::GetSubshape(infeed_buffers.shape(),
                                                       ShapeIndexView(index, 1));
           // For the leaf buffers of the tuple copy the elements directly.
-          if (ShapeUtil::IsArray(shape)) {
+          if (shape.IsArray()) {
             const BufferAllocation::Slice& tuple_element_buffer =
                 infeed_slices_.element(index);
             se::DeviceMemoryBase tuple_element_address =
diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
index 6151dd8ff4c92bb81bd756c68cc9377633c8c9d5..f07141029cbf8b034b74548f6fca8f1628589f0c 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
@@ -282,22 +282,7 @@ bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer,
 
 bool GpuInstructionFusion::ShouldFuseIntoMultiOutput(HloInstruction* consumer,
                                                      int64 operand_index) {
-  const HloInstruction* producer = consumer->operand(operand_index);
-  // The IR emitter has limited support for non-loop fusions with multi output
-  // at present.
-  // TODO(tjoerg): Relax this constraint to allow for arbitraty kinds of fusion.
-  if (consumer->opcode() == HloOpcode::kFusion &&
-      consumer->fusion_kind() != HloInstruction::FusionKind::kLoop) {
-    return false;
-  }
-  // Multi-output fusion requires instructions with compatible shapes.
-  if (!ShapeUtil::Compatible(producer->shape(), consumer->shape())) {
-    return false;
-  }
-  // TODO(tjoerg): Stop calling `ShouldFuse` to relax the criteria for
-  // multi-output fusion. In particular, do not check whether an instruction is
-  // expensive to duplicate, since this doesn't matter here.
-  return GpuInstructionFusion::ShouldFuse(consumer, operand_index);
+  return false;
 }
 
 HloInstruction::FusionKind GpuInstructionFusion::ChooseKind(
diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
index 688604cd36e5a45debf855aacd29d05ecda92341..a05ab86cf77a134a1fc387d93cb482aa1ff5345b 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
@@ -506,202 +506,11 @@ TEST_F(InstructionFusionTest, MultiOutputFusion) {
     })")
                     .ValueOrDie();
 
-  ASSERT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true)
-                  .Run(module.get())
-                  .ValueOrDie());
-  SCOPED_TRACE(module->ToString());
-
-  // Expect that there is one multi-output fusion and subtract has not been
-  // duplicated.
-  EXPECT_EQ(Count(*module, HloOpcode::kFusion), 1);
-  EXPECT_EQ(Count(*module, HloOpcode::kSubtract), 1);
-  TF_ASSERT_OK_AND_ASSIGN(
-      const HloInstruction* fusion,
-      FindHloInstruction(*module->entry_computation(), HloOpcode::kFusion));
-  EXPECT_THAT(
-      fusion->fused_expression_root(),
-      op::Tuple(op::Add(op::Subtract(), op::Parameter()), op::Subtract()));
-}
-
-TEST_F(InstructionFusionTest, MultiOutputFusionExpensiveOp) {
-  // tanh --> add --> tuple
-  //  \---------------/
-  auto module = ParseHloString(R"(
-    HloModule test_module
-    ENTRY OutputFusion {
-     p0 = f32[4,3]{1,0} parameter(0)
-     p1 = f32[4,3]{1,0} parameter(1)
-     tanh = f32[4,3]{1,0} tanh(p0)
-     add = f32[4,3]{1,0} add(tanh, p1)
-     ROOT tuple = (f32[4,3]{1,0}, f32[4,3]{1,0}) tuple(tanh, add)
-    })")
-                    .ValueOrDie();
-
-  // TODO(tjoerg): Allow multi-output fusion for expensive operations like tanh.
+  // Multi-output fusion is disabled here and performed in the
+  // GpuMultiOutputFusion pass instead.
   ASSERT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true)
                    .Run(module.get())
-                   .ValueOrDie())
-      << module->ToString();
-}
-
-TEST_F(InstructionFusionTest, MultiOutputFusion2) {
-  // sub --> add1 --\--------\
-  //  \----------> add2 --> tuple
-  auto module = ParseHloString(R"(
-    HloModule test_module
-    ENTRY OutputFusion {
-     p0 = f32[4,3]{1,0} parameter(0)
-     p1 = f32[4,3]{1,0} parameter(1)
-     p2 = f32[4,3]{1,0} parameter(2)
-     sub = f32[4,3]{1,0} subtract(p0, p2)
-     add1 = f32[4,3]{1,0} add(sub, p1)
-     add2 = f32[4,3]{1,0} add(sub, add1)
-     ROOT tuple = (f32[4,3]{1,0}) tuple(add1, add2)
-    })")
-                    .ValueOrDie();
-
-  ASSERT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true)
-                  .Run(module.get())
-                  .ValueOrDie());
-  SCOPED_TRACE(module->ToString());
-
-  // Expect that there is one multi-output fusion and subtract has not been
-  // duplicated.
-  EXPECT_EQ(Count(*module, HloOpcode::kFusion), 1);
-  EXPECT_EQ(Count(*module, HloOpcode::kSubtract), 1);
-  TF_ASSERT_OK_AND_ASSIGN(
-      const HloInstruction* fusion,
-      FindHloInstruction(*module->entry_computation(), HloOpcode::kFusion));
-  EXPECT_THAT(fusion->fused_expression_root(),
-              op::Tuple(op::Add(op::Subtract(), op::Add()),
-                        op::Add(op::Subtract(), op::Parameter())));
-}
-
-TEST_F(InstructionFusionTest, MultiOutputFusion3) {
-  // sub --> add1 ----\--------\
-  //  \ --> add2 --> add3 --> tuple
-  auto module = ParseHloString(R"(
-    HloModule test_module
-    ENTRY OutputFusion {
-     p0 = f32[4,3]{1,0} parameter(0)
-     p1 = f32[4,3]{1,0} parameter(1)
-     p2 = f32[4,3]{1,0} parameter(2)
-     p3 = f32[4,3]{1,0} parameter(3)
-     sub = f32[4,3]{1,0} subtract(p0, p2)
-     add1 = f32[4,3]{1,0} add(sub, p1)
-     add2 = f32[4,3]{1,0} add(p2, sub)
-     add3 = f32[4,3]{1,0} add(add1, add2)
-     ROOT tuple = (f32[4,3]{1,0}) tuple(add3, add2)
-    })")
-                    .ValueOrDie();
-
-  ASSERT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true)
-                  .Run(module.get())
-                  .ValueOrDie());
-  SCOPED_TRACE(module->ToString());
-
-  // Expect that there is one multi-output fusion and subtract has not been
-  // duplicated.
-  EXPECT_EQ(Count(*module, HloOpcode::kFusion), 1);
-  EXPECT_EQ(Count(*module, HloOpcode::kSubtract), 1);
-  TF_ASSERT_OK_AND_ASSIGN(
-      const HloInstruction* fusion,
-      FindHloInstruction(*module->entry_computation(), HloOpcode::kFusion));
-  EXPECT_THAT(fusion->fused_expression_root(),
-              op::Tuple(op::Add(op::Add(), op::Add()),
-                        op::Add(op::Parameter(), op::Subtract())));
-}
-
-TEST_F(InstructionFusionTest, NoCyclesDueToMultiOutputFusion) {
-  // sub --> mul ---\
-  //  \--> call --> add --> tuple
-  auto module = ParseHloString(R"(
-  HloModule test_module
-  ENTRY OutputFusion {
-    c = f32[] constant(42)
-    p0 = f32[4,3]{1,0} parameter(0)
-    p1 = f32[4,3]{1,0} parameter(1)
-    sub = f32[4,3]{1,0} subtract(p0, p1)
-    mul = f32[4,3]{1,0} multiply(sub, c)
-    call = f32[4,3]{1,0} custom-call(sub), custom_call_target="foo"
-    add = f32[4,3]{1,0} add(mul, call)
-    ROOT tuple = (f32[4,3]{1,0}) tuple(add)
-  })")
-                    .ValueOrDie();
-
-  ASSERT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true)
-                  .Run(module.get())
-                  .ValueOrDie());
-  // Visit instructions in post order to detect cycles.
-  // TODO(tjoerg): Add cycle detection to the HloVerifier.
-  class DummyVisitor : public DfsHloVisitorWithDefault {
-   public:
-    DummyVisitor() {}
-    Status DefaultAction(HloInstruction* /*hlo_instruction*/) override {
-      return Status::OK();
-    }
-  } visitor;
-  for (const HloComputation* computation : module->MakeComputationPostOrder()) {
-    // Accept will return a FailedPrecondition when a cycle is detected.
-    EXPECT_TRUE(computation->root_instruction()->Accept(&visitor).ok());
-  }
-}
-
-TEST_F(InstructionFusionTest, NoMultiOutputFusionWithIncompatibleShapes) {
-  // sub[2,3] --> add[4,3] --> tuple([2,3], [4,3])
-  //  \-------------------------/
-  auto module = ParseHloString(R"(
-    HloModule test_module
-    ENTRY OutputFusion {
-     p0 = f32[2,3]{1,0} parameter(0)
-     p1 = f32[4,3]{1,0} parameter(1)
-     p2 = f32[2,3]{1,0} parameter(2)
-     sub = f32[2,3]{1,0} subtract(p0, p2)
-     add = f32[4,3]{1,0} add(sub, p1)
-     ROOT tuple = (f32[2,3]{1,0}, f32[4,3]{1,0}) tuple(sub, add)
-    })")
-                    .ValueOrDie();
-
-  // Multi-output fusion requires shapes to be compatible. Since `sub` and `add`
-  // have incompatible shapes, expect that no multi-output fusion happens.
-  ASSERT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true)
-                   .Run(module.get())
-                   .ValueOrDie())
-      << module->ToString();
-}
-
-TEST_F(InstructionFusionTest, FuseIntoInputFusionInstruction) {
-  auto module = ParseHloString(R"(
-  HloModule test_module
-
-  add_computation {
-    add_lhs = f32[] parameter(0)
-    add_rhs = f32[] parameter(1)
-    ROOT add_root = f32[] add(add_lhs, add_rhs)
-  }
-
-  fused_computation {
-    p1 = f32[10] parameter(0)
-    zero = f32[] constant(0)
-    ROOT f2_root = f32[] reduce(p1, zero), dimensions={0},
-           to_apply=add_computation
-  }
-
-  ENTRY entry {
-    p0 = f32[10] parameter(0)
-    mul = f32[10] multiply(p0, p0)
-    fusion = f32[] fusion(mul), kind=kInput, calls=fused_computation
-    ROOT tuple = (f32[10], f32[]) tuple(fusion, mul)
-  })")
-                    .ValueOrDie();
-
-  // Multi-output fusion is not supported for non-loop fusions at present. Since
-  // `fused_computation` is a input fusion, expect no multi-output fusion to
-  // happen.
-  ASSERT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true)
-                   .Run(module.get())
-                   .ValueOrDie())
-      << module->ToString();
+                   .ValueOrDie());
 }
 
 TEST_F(InstructionFusionTest, FuseScalarConstant) {
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
index 33e41a2782b5932430eea621d3cea2c6634f292f..3ed6553f9205803cfa17772b890c449cfb457c89 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
@@ -20,7 +20,6 @@ limitations under the License.
 
 #include "llvm/IR/Module.h"
 #include "tensorflow/compiler/xla/layout_util.h"
-#include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
@@ -40,7 +39,7 @@ namespace {
 
 // Return whether the given shape is rank 2 excluding the batch dimensions.
 bool IsRank2(const Shape& shape, int64 batch_dimensions_size) {
-  return ShapeUtil::Rank(shape) == batch_dimensions_size + 2;
+  return shape.rank() == batch_dimensions_size + 2;
 }
 
 // In a gemm operation where output = lhs * rhs, check whether the given shapes
@@ -54,7 +53,8 @@ bool AreValidGemmShapes(const Shape& lhs_shape, const Shape& rhs_shape,
   PrimitiveType output_primitive_type = output_shape.element_type();
   bool type_is_allowed =
       (output_primitive_type == F16 || output_primitive_type == F32 ||
-       output_primitive_type == F64 || output_primitive_type == C64);
+       output_primitive_type == F64 || output_primitive_type == C64 ||
+       output_primitive_type == C128);
   return type_is_allowed && IsRank2(lhs_shape, batch_dimensions_size) &&
          IsRank2(rhs_shape, batch_dimensions_size) &&
          IsRank2(output_shape, batch_dimensions_size) &&
@@ -154,20 +154,17 @@ bool IsReductionToVector(const HloInstruction& reduce) {
   const HloInstruction* input = reduce.operand(0);
   std::vector<int64> dims_to_keep;
   for (int64 dim = 0; dim < input->shape().dimensions().size(); ++dim) {
-    if (!std::count(reduce.dimensions().begin(), reduce.dimensions().end(),
-                    dim)) {
+    if (!absl::c_linear_search(reduce.dimensions(), dim)) {
       dims_to_keep.push_back(dim);
     }
   }
   return LayoutUtil::AreDimensionsConsecutive(input->shape().layout(),
                                               dims_to_keep) &&
-         ShapeUtil::Equal(reduce.shape(), ShapeUtil::FilterDimensions(
-                                              [&dims_to_keep](int64 dim) {
-                                                return std::count(
-                                                    dims_to_keep.begin(),
-                                                    dims_to_keep.end(), dim);
-                                              },
-                                              input->shape()));
+         ShapeUtil::Equal(
+             reduce.shape(),
+             ShapeUtil::FilterDimensions(
+                 [&](int64 dim) { return absl::c_count(dims_to_keep, dim); },
+                 input->shape()));
 }
 
 // This emits a device-side call to
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
index 6693f66d62d8b04d1b78e001fdb515b34539c67f..8f010ab27a6c99b97e7808218de908ce558b0fe7 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
@@ -430,7 +430,7 @@ Status IrEmitter::HandleTupleSelect(HloInstruction* tuple_select) {
   auto on_false = tuple_select->operand(2);
   TF_RET_CHECK(pred->shape().element_type() == PRED);
   TF_RET_CHECK(ShapeUtil::IsScalar(pred->shape()));
-  TF_RET_CHECK(ShapeUtil::IsTuple(tuple_select->shape()));
+  TF_RET_CHECK(tuple_select->shape().IsTuple());
   llvm_ir::EmitTupleSelect(GetIrArray(*tuple_select, *tuple_select),
                            GetIrArray(*pred, *tuple_select),
                            GetBasePointer(*on_true), GetBasePointer(*on_false),
@@ -492,8 +492,11 @@ Status IrEmitter::HandleDot(HloInstruction* dot) {
       result = llvm::ConstantAggregateZero::get(lhs_array.GetElementLlvmType());
       result = InsertValue(result, value.first, {0});
       result = InsertValue(result, value.second, {1});
-    } else {
+    } else if (ShapeUtil::ElementIsFloating(lhs_shape)) {
       result = FMul(lhs_value, rhs_value);
+    } else {
+      TF_RET_CHECK(ShapeUtil::ElementIsIntegral(lhs_shape));
+      result = Mul(lhs_value, rhs_value);
     }
     target_array.EmitWriteArrayElement(/*index=*/element_index, result, &b_);
     return Status::OK();
@@ -583,9 +586,13 @@ Status IrEmitter::HandleDot(HloInstruction* dot) {
     llvm::Value* accum_imag = Imag(accum, &b_);
     llvm::Value* imag_sum = FAdd(accum_imag, value.second);
     updated_accum = InsertValue(updated_accum, imag_sum, {1});
-  } else {
+  } else if (ShapeUtil::ElementIsFloating(lhs_shape)) {
     llvm::Value* product = FMul(lhs_element, rhs_element);
     updated_accum = FAdd(accum, product);
+  } else {
+    TF_RET_CHECK(ShapeUtil::ElementIsIntegral(lhs_shape));
+    llvm::Value* product = Mul(lhs_element, rhs_element);
+    updated_accum = Add(accum, product);
   }
   Store(updated_accum, accum_address);
 
@@ -637,9 +644,9 @@ Status IrEmitter::HandleFft(HloInstruction* fft) {
   return Unimplemented("Hit a case for fft that is not implemented on GPU.");
 }
 
-Status IrEmitter::HandleCrossReplicaSum(HloInstruction* crs) {
+Status IrEmitter::HandleAllReduce(HloInstruction* crs) {
   // TODO(b/33011107): Support cross replica sum on GPU.
-  return Unimplemented("CrossReplicaSum is not implemented on GPU.");
+  return Unimplemented("AllReduce is not implemented on GPU.");
 }
 
 Status IrEmitter::HandleParameter(HloInstruction* parameter) {
@@ -647,8 +654,8 @@ Status IrEmitter::HandleParameter(HloInstruction* parameter) {
 }
 
 Status IrEmitter::HandleReduce(HloInstruction* reduce) {
-  // TODO(b/112040122): Support variadic reduce.
-  if (!ShapeUtil::IsArray(reduce->shape())) {
+  // TODO(b/118332391): Support variadic reduce.
+  if (!reduce->shape().IsArray()) {
     return Unimplemented("Variadic reduce is not supported on GPU");
   }
   auto arg = reduce->operand(0);
@@ -783,7 +790,7 @@ StatusOr<llvm::Value*> IrEmitter::ComputeNestedElement(
 std::vector<llvm_ir::IrArray> IrEmitter::ConstructIrArrayForOutputs(
     const HloInstruction& hlo) {
   std::vector<llvm_ir::IrArray> output_arrays;
-  if (ShapeUtil::IsTuple(hlo.shape())) {
+  if (hlo.shape().IsTuple()) {
     int64 num_outputs = ShapeUtil::TupleElementCount(hlo.shape());
     output_arrays.reserve(num_outputs);
     for (int64 i = 0; i < num_outputs; ++i) {
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.h b/tensorflow/compiler/xla/service/gpu/ir_emitter.h
index 2da46c016935d0e927879bbfb0d05cfc4899d818..f380aee9d3c06a29b503c81c7bd3846dbccf6ce5 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.h
@@ -81,7 +81,7 @@ class IrEmitter : public DfsHloVisitorWithDefault,
   Status HandleDot(HloInstruction* dot) override;
   Status HandleConvolution(HloInstruction* convolution) override;
   Status HandleFft(HloInstruction* fft) override;
-  Status HandleCrossReplicaSum(HloInstruction* crs) override;
+  Status HandleAllReduce(HloInstruction* crs) override;
   Status HandleInfeed(HloInstruction* infeed) override;
   Status HandleOutfeed(HloInstruction* outfeed) override;
   Status HandleSend(HloInstruction* send) override;
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index fb040aff30d48bf5817946ce53d37bc6685941e4..0cc65ebb52737aa9bb8866eb07278a2319aa797b 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include <algorithm>
 #include <cstring>
+#include <iterator>
 #include <memory>
 #include <string>
 #include <vector>
@@ -22,7 +23,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h"
 
 #include "absl/algorithm/container.h"
-#include "absl/container/inlined_vector.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
 #include "absl/types/optional.h"
@@ -38,7 +38,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor.h"
-#include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h"
 #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
 #include "tensorflow/compiler/xla/service/gpu/conditional_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h"
@@ -60,6 +59,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/gpu/partition_assignment.h"
 #include "tensorflow/compiler/xla/service/gpu/sequential_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/thunk.h"
+#include "tensorflow/compiler/xla/service/gpu/triangular_solve_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/tuple_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/while_thunk.h"
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
@@ -89,6 +89,9 @@ namespace xla {
 namespace gpu {
 
 using llvm_ir::KernelMappingScheme;
+using EmitElementFunction =
+    std::function<void(const llvm_ir::IrArray::Index& index, llvm::Value* y_loc,
+                       llvm::Value* x_loc, int64 x_iter_num)>;
 
 namespace {
 
@@ -293,13 +296,12 @@ llvm::Type* GetIndexTypeForKernel(const HloInstruction* hlo, int64 launch_size,
 
   auto shape_in_range = [&](const Shape& s) {
     bool in_range = true;
-    ShapeUtil::ForEachSubshape(
-        s, [&](const Shape& sub_shape, const ShapeIndex& /*index*/) {
-          if (ShapeUtil::IsArray(sub_shape) &&
-              !IsInt32(ShapeUtil::ElementsIn(sub_shape))) {
-            in_range = false;
-          }
-        });
+    ShapeUtil::ForEachSubshape(s, [&](const Shape& sub_shape,
+                                      const ShapeIndex& /*index*/) {
+      if (sub_shape.IsArray() && !IsInt32(ShapeUtil::ElementsIn(sub_shape))) {
+        in_range = false;
+      }
+    });
 
     return in_range;
   };
@@ -485,6 +487,41 @@ Status IrEmitterUnnested::HandleFft(HloInstruction* fft) {
   return Status::OK();
 }
 
+Status IrEmitterUnnested::HandleTriangularSolve(HloInstruction* hlo) {
+  auto has_fortran_layout = [](const Layout& layout) {
+    int n = layout.minor_to_major_size();
+    return layout.minor_to_major(0) == n - 2 &&
+           layout.minor_to_major(1) == n - 1;
+  };
+  TF_RET_CHECK(has_fortran_layout(hlo->operand(0)->shape().layout()));
+  TF_RET_CHECK(has_fortran_layout(hlo->operand(1)->shape().layout()));
+  TF_RET_CHECK(has_fortran_layout(hlo->shape().layout()));
+
+  std::vector<std::unique_ptr<Thunk>> thunks;
+
+  // Triangular solve is in-place on 'b', so copy 'b' to the output if they
+  // aren't the same buffer.
+  auto operand_buffer = GetAllocationSlice(*hlo->operand(1));
+  auto destination_buffer = GetAllocationSlice(*hlo);
+  if (operand_buffer != destination_buffer) {
+    thunks.push_back(absl::make_unique<DeviceToDeviceCopyThunk>(
+        /*source_address=*/operand_buffer,
+        /*destination_buffer=*/destination_buffer,
+        /*mem_size=*/ShapeUtil::ByteSizeOf(hlo->operand(1)->shape()), hlo));
+  }
+
+  thunks.push_back(BuildTriangularSolveThunk(hlo));
+
+  // Elide the sequential thunk if there's no copy.
+  if (thunks.size() == 1) {
+    AddThunkToThunkSequence(std::move(thunks[0]));
+  } else {
+    AddThunkToThunkSequence(
+        absl::make_unique<SequentialThunk>(std::move(thunks), hlo));
+  }
+  return Status::OK();
+}
+
 Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) {
   HloInstruction* root = fusion->fused_expression_root();
   if (HloInstruction::FusionKind::kInput == fusion->fusion_kind()) {
@@ -543,96 +580,11 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) {
         // HandleFusion specializes reduction from a multi-dimensional array to
         // a 1D array. The specialized version requires a initializer thunk that
         // initializes the output array to the initial value of the reduce.
-        if (root->opcode() == HloOpcode::kReduce &&
-            ShapeUtil::IsTuple(root->shape())) {
-          // TODO(b/112040122): Support variadic reduce.
+        if (root->opcode() == HloOpcode::kReduce && root->shape().IsTuple()) {
+          // TODO(b/118332391): Support variadic reduce.
           return Unimplemented("Variadic reduce is not supported on GPU");
         }
-        VLOG(3) << "Emitting fused reduction to vector: " << fusion->ToString();
-        std::vector<std::unique_ptr<Thunk>> thunks;
-        absl::Span<HloInstruction* const> output_instructions =
-            root->opcode() == HloOpcode::kTuple
-                ? root->operands()
-                : absl::Span<HloInstruction* const>(&root, 1);
-
-        // For multi-output fusion emit an initializer for each tuple element.
-        // Otherwise it's sufficient to just initialize the single output.
-        HloInstruction* first_reduce = nullptr;
-        for (int i = 0, e = output_instructions.size(); i != e; ++i) {
-          if (output_instructions[i]->opcode() == HloOpcode::kReduce) {
-            TF_ASSIGN_OR_RETURN(
-                std::unique_ptr<Thunk> initializer_thunk,
-                BuildInitializerThunk(fusion, output_instructions[i] == root
-                                                  ? ShapeIndex()
-                                                  : ShapeIndex({i})));
-            thunks.push_back(std::move(initializer_thunk));
-            first_reduce =
-                first_reduce == nullptr ? output_instructions[i] : first_reduce;
-          }
-        }
-        CHECK(first_reduce != nullptr);
-        std::unique_ptr<KernelThunk> kernel_thunk =
-            BuildKernelThunk(fusion, /*implements_whole_instruction=*/false);
-        GpuElementalIrEmitter elemental_emitter(
-            hlo_module_config_, ir_emitter_context_->llvm_module(), &b_,
-            GetNestedComputer());
-        FusedIrEmitter fused_emitter(GetGeneratorForOperandIrArrays(fusion),
-                                     &elemental_emitter);
-        TF_RETURN_IF_ERROR(root->Accept(&fused_emitter));
-
-        // For multi-output fusion CHECK the constraints and feed all the
-        // reduces into a single loop code generator. Single-output reduce
-        // fusion is a special case of that.
-        InlinedVector<llvm_ir::ElementGenerator, 1> input_gens;
-        InlinedVector<llvm_ir::ElementGenerator, 1> init_value_gens;
-        std::vector<std::pair<llvm_ir::ElementGenerator, ShapeIndex>>
-            extra_output_gens;
-        InlinedVector<HloComputation*, 1> reducers;
-        InlinedVector<ShapeIndex, 1> reduce_output_shapes;
-        for (int i = 0, e = output_instructions.size(); i != e; ++i) {
-          const HloInstruction* inst = output_instructions[i];
-          ShapeIndex output_shape_index;
-          if (root->opcode() == HloOpcode::kTuple) {
-            output_shape_index = {i};
-          }
-          if (inst->opcode() == HloOpcode::kReduce) {
-            CHECK(IsReductionToVector(*inst))
-                << "Only reductions to vector are supported";
-            // Shapes, layouts and dimensions must be the same for all reduces
-            // inside of this fusion.
-            CHECK(ShapeUtil::Equal(first_reduce->shape(), inst->shape()));
-            CHECK(ShapeUtil::Equal(first_reduce->operand(0)->shape(),
-                                   inst->operand(0)->shape()));
-            CHECK(ShapeUtil::Equal(first_reduce->operand(1)->shape(),
-                                   inst->operand(1)->shape()));
-            CHECK(first_reduce->dimensions() == inst->dimensions());
-            input_gens.push_back(fused_emitter.GetGenerator(inst->operand(0)));
-            init_value_gens.push_back(
-                fused_emitter.GetGenerator(inst->operand(1)));
-            reducers.push_back(inst->to_apply());
-            reduce_output_shapes.push_back(std::move(output_shape_index));
-          } else {
-            // For extra outputs we can relax shape equality to allow different
-            // types (with the same number of elements). Layouts still have to
-            // match.
-            CHECK(ShapeUtil::CompatibleIgnoringElementType(
-                first_reduce->operand(0)->shape(), inst->shape()));
-            CHECK(LayoutUtil::Equal(first_reduce->operand(0)->shape().layout(),
-                                    inst->shape().layout()));
-            extra_output_gens.emplace_back(fused_emitter.GetGenerator(inst),
-                                           std::move(output_shape_index));
-          }
-        }
-        const Shape& input_shape = first_reduce->operand(0)->shape();
-        TF_CHECK_OK(EmitReductionToVector(
-            kernel_thunk.get(), first_reduce, input_shape, input_gens,
-            init_value_gens, first_reduce->dimensions(), reducers,
-            reduce_output_shapes, extra_output_gens));
-        thunks.push_back(std::move(kernel_thunk));
-        std::unique_ptr<SequentialThunk> sequential_thunk =
-            absl::make_unique<SequentialThunk>(std::move(thunks), fusion);
-        AddThunkToThunkSequence(std::move(sequential_thunk));
-        return Status::OK();
+        return EmitReductionToVector(fusion);
       }
       default:
         LOG(FATAL) << "Bad opcode for input fusion: "
@@ -702,13 +654,12 @@ Status IrEmitterUnnested::HandleCopy(HloInstruction* copy) {
 }
 
 Status IrEmitterUnnested::EmitExtraOutputsForReduce(
-    const HloInstruction* reduce, const IrArray::Index& index,
+    const HloInstruction* unnested_hlo, const IrArray::Index& index,
     absl::Span<const std::pair<llvm_ir::ElementGenerator, ShapeIndex>>
         extra_output_gens) {
   for (int i = 0; i != extra_output_gens.size(); ++i) {
-    const HloInstruction* output = reduce->parent()->FusionInstruction();
     llvm::Value* extra_output_address =
-        GetIrArray(*output, *output, extra_output_gens[i].second)
+        GetIrArray(*unnested_hlo, *unnested_hlo, extra_output_gens[i].second)
             .EmitArrayElementAddress(index, &b_,
                                      "extra_output_element_address");
     TF_ASSIGN_OR_RETURN(llvm::Value* const extra_output_ir_value,
@@ -718,984 +669,13 @@ Status IrEmitterUnnested::EmitExtraOutputsForReduce(
   return Status::OK();
 }
 
-Status IrEmitterUnnested::EmitReductionToScalar(
-    KernelThunk* kernel_thunk, HloInstruction* reduce, const Shape& input_shape,
-    absl::Span<const llvm_ir::ElementGenerator> input_gens,
-    absl::Span<const llvm_ir::ElementGenerator> init_value_gens,
-    absl::Span<HloComputation* const> reducers,
-    absl::Span<const ShapeIndex> reduce_output_shapes,
-    absl::Span<const std::pair<llvm_ir::ElementGenerator, ShapeIndex>>
-        extra_output_gens) {
-  // Number of elements processed by a single thread.
-  constexpr int64 kTileSize = 16;
-  int64 num_elems = ShapeUtil::ElementsIn(input_shape);
-
-  // Round up the number of tiles to a multiple of the warp size.  This is
-  // necessary for correctness.  We launch one thread per tile, and if the
-  // number of threads isn't a multiple of the number of the warp size, our
-  // shuffles will read from inactive threads, producing undefined values.
-  int64 num_tiles =
-      RoundUpToNearest(CeilOfRatio(num_elems, kTileSize), kWarpSize);
-
-  Shape tiled_input_shape = ShapeUtil::MakeShapeWithLayout(
-      reduce->shape().element_type(), {num_tiles}, {0});
-  LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
-      tiled_input_shape, ir_emitter_context_->device_description());
-
-  llvm::Type* index_ty =
-      GetIndexTypeForKernel(reduce, launch_dimensions.launch_bound(), &b_);
-
-  auto index_typed_constant = [&](uint64 c) -> llvm::Constant* {
-    return llvm::ConstantInt::get(index_ty, c);
-  };
-
-  // Check whether every thread will process a full tile's worth of elements
-  // without reading outside the bounds of the input.  If this is true, we can
-  // skip some bounds checks in the final algorithm.
-  bool all_threads_in_bounds = num_tiles * kTileSize == num_elems;
-
-  // __global__ void full_reduce_kernel() {
-  //   x_in_tiles = threadIdx.x + blockIdx.x * blockDim.x;
-  //   x = x_in_tiles * kTileSize;
-  //
-  //   partial_result = init_value;
-  //   if (all_threads_in_bounds || x + kTileSize <= num_elems) {
-  //     for (i = 0; i < kTileSize; ++i) {
-  //       partial_result = Reducer(partial_result, input[x + i]);
-  //     }
-  //   } else {
-  //     for (i = 0; i < kTileSize; ++i) {
-  //       if (x + i < num_elems) {
-  //         partial_result = Reducer(partial_result, input[x + i]);
-  //       }
-  //     }
-  //   }
-  //   for (i = warpSize / 2; i > 0; i /= 2) {
-  //     partial_result = Reducer(partial_result,
-  //                              __shfl_down(partial_result, i));
-  //   }
-  //   if (lane_id == 0) {
-  //     AtomicReducer(&output[y], partial_result);
-  //   }
-  // }
-  //
-  // // Choose num_blocks and threads_per_block such that:
-  // //
-  // //   num_blocks * threads_per_block =
-  // //     RoundUpToNextMultipleOf(Ceil(num_elems / kTileSize), warpSize),
-  // //
-  // // and threads_per_block is a multiple of warpSize.
-  // reduce_kernel  //
-  auto loop_body_emitter = [=](const IrArray::Index& tile_index) -> Status {
-    const int num_reduces = reducers.size();
-    llvm::Type* element_ir_type =
-        llvm_ir::PrimitiveTypeToIrType(input_shape.element_type(), module_);
-    std::vector<llvm::Value*> partial_reduction_result_addresses;
-    for (int i = 0; i != num_reduces; ++i) {
-      llvm::Value* partial_reduction_result_address =
-          Alloca(element_ir_type, /*ArraySize=*/nullptr,
-                 "partial_reduction_result." + llvm::Twine(i));
-      TF_ASSIGN_OR_RETURN(llvm::Value* const init_ir_value,
-                          init_value_gens[i](IrArray::Index(index_ty)));
-      Store(init_ir_value, partial_reduction_result_address);
-      partial_reduction_result_addresses.push_back(
-          partial_reduction_result_address);
-    }
-
-    llvm::Value* x_in_tiles = tile_index[0];
-    x_in_tiles = ZExtOrTrunc(x_in_tiles, index_ty);
-
-    // Emit an inner for-loop that reduces the elements in the tile.
-    auto emit_tile_element_loop = [=](bool tile_in_bounds) -> Status {
-      std::unique_ptr<llvm_ir::ForLoop> tile_element_loop =
-          llvm_ir::ForLoop::EmitForLoop(
-              "element_id_in_tile", index_typed_constant(0),
-              index_typed_constant(kTileSize), index_typed_constant(1), &b_);
-
-      // Emit the body of the partial reduction loop.
-      llvm_ir::SetToFirstInsertPoint(tile_element_loop->GetBodyBasicBlock(),
-                                     &b_);
-      llvm::Value* x =
-          NSWAdd(NSWMul(x_in_tiles, index_typed_constant(kTileSize)),
-                 tile_element_loop->GetIndVarValue());
-      // Unless we know the tile is entirely in bounds, we have to emit a
-      // x-in-bounds check before reading from the input.
-      if (!tile_in_bounds) {
-        llvm_ir::LlvmIfData if_data = llvm_ir::EmitIfThenElse(
-            ICmpULT(x, index_typed_constant(num_elems)), "x_in_bounds", &b_);
-
-        // Emit code that reads the input element and accumulates it to
-        // the partial reduction result.
-        llvm_ir::SetToFirstInsertPoint(if_data.true_block, &b_);
-      }
-
-      IrArray::Index input_index(
-          /*linear=*/x, input_shape, &b_);
-      llvm::Value* input_address = Alloca(element_ir_type);
-      for (int i = 0; i != num_reduces; ++i) {
-        TF_ASSIGN_OR_RETURN(llvm::Value* const input_ir_value,
-                            input_gens[i](input_index));
-        Store(input_ir_value, input_address);
-        TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
-            *reducers[i],
-            {partial_reduction_result_addresses[i], input_address},
-            partial_reduction_result_addresses[i]));
-      }
-      return EmitExtraOutputsForReduce(reduce, input_index, extra_output_gens);
-    };
-
-    // x_end = kTileSize + x_in_tiles * kTileSize, i.e., the location that's
-    // immediately beyond the tile.
-    llvm::Value* x_end =
-        NSWAdd(index_typed_constant(kTileSize),
-               NSWMul(x_in_tiles, index_typed_constant(kTileSize)));
-    // The tile is entirely in bound if all_threads_in_bounds or
-    // x_end <= num_elems.
-    llvm::Value* tile_in_bounds =
-        Or(ICmpULE(x_end, index_typed_constant(num_elems)),
-           b_.getInt1(all_threads_in_bounds));
-    llvm_ir::LlvmIfData if_tile_in_bounds_data =
-        llvm_ir::EmitIfThenElse(tile_in_bounds, "tile_in_bounds", &b_);
-    llvm_ir::SetToFirstInsertPoint(if_tile_in_bounds_data.true_block, &b_);
-    TF_RETURN_IF_ERROR(emit_tile_element_loop(/*tile_in_bounds=*/true));
-    llvm_ir::SetToFirstInsertPoint(if_tile_in_bounds_data.false_block, &b_);
-    TF_RETURN_IF_ERROR(emit_tile_element_loop(/*tile_in_bounds=*/false));
-
-    // After the if-then-else statement on tile_in_bounds, emit calls to
-    // shfl_down that accumulate the partial reduction results of all threads
-    // from the warp.
-    llvm_ir::SetToFirstInsertPoint(if_tile_in_bounds_data.after_block, &b_);
-    int bit_width = llvm_ir::GetSizeInBits(element_ir_type);
-    // bitcast cannot be applied to aggregate types (even packed ones), so we
-    // instead bitcast addresses of load/store to intN* of the same bit-width.
-    llvm::Type* shuffle_ir_type = element_ir_type->isStructTy()
-                                      ? b_.getIntNTy(bit_width)
-                                      : element_ir_type;
-    for (int shuffle_distance = kWarpSize / 2; shuffle_distance >= 1;
-         shuffle_distance /= 2) {
-      llvm::Value* result_from_other_lane =
-          Alloca(element_ir_type, nullptr, "result_from_other_lane");
-      for (int i = 0; i != num_reduces; ++i) {
-        llvm::Value* partial_reduction_result =
-            Load(BitCast(partial_reduction_result_addresses[i],
-                         shuffle_ir_type->getPointerTo()),
-                 "partial_reduction_result");
-        CHECK_EQ(launch_dimensions.threads_per_block() % kWarpSize, 0)
-            << "Requires block size a multiple of the warp size, otherwise we "
-               "will read undefined elements.";
-        Store(EmitFullWarpShuffleDown(partial_reduction_result,
-                                      b_.getInt32(shuffle_distance), &b_),
-              BitCast(result_from_other_lane, shuffle_ir_type->getPointerTo()));
-        TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
-            *reducers[i],
-            {partial_reduction_result_addresses[i], result_from_other_lane},
-            partial_reduction_result_addresses[i]));
-      }
-    }
-
-    const HloInstruction* output =
-        reduce->IsFused() ? reduce->parent()->FusionInstruction() : reduce;
-
-    // Emit an atomic operation that accumulates the partial reduction result of
-    // lane 0 (which holds the partially accumulated result for its warp) to the
-    // output element.
-    llvm::Value* lane_id =
-        URem(x_in_tiles, index_typed_constant(kWarpSize), "lane_id");
-    llvm_ir::LlvmIfData if_lane_id_is_zero_data = llvm_ir::EmitIfThenElse(
-        ICmpEQ(lane_id, index_typed_constant(0)), "lane_id_is_zero", &b_);
-    llvm_ir::SetToFirstInsertPoint(if_lane_id_is_zero_data.true_block, &b_);
-
-    for (int i = 0; i != num_reduces; ++i) {
-      llvm::Value* output_address =
-          GetIrArray(*output, *output, reduce_output_shapes[i])
-              .EmitArrayElementAddress(
-                  IrArray::Index(
-                      /*linear=*/b_.getInt64(0),
-                      ShapeUtil::GetSubshape(output->shape(),
-                                             reduce_output_shapes[i]),
-                      &b_),
-                  &b_, "output_element_address");
-      TF_RETURN_IF_ERROR(EmitAtomicOperationForNestedComputation(
-          *reducers[i], output_address, partial_reduction_result_addresses[i]));
-    }
-    return Status::OK();
-  };
-
-  // Emit a parallel loop that iterates through all input tiles, one per thread.
-  UpdateLaunchDimensions(launch_dimensions, kernel_thunk,
-                         ir_emitter_context_->llvm_module());
-  return ParallelLoopEmitter(loop_body_emitter, tiled_input_shape,
-                             launch_dimensions, &b_)
-      .EmitLoop(IrName(reduce), index_ty);
-}
-
-Status IrEmitterUnnested::EmitColumnReduction(
-    KernelThunk* kernel_thunk, int64 height, int64 width,
-    HloInstruction* reduce, const Shape& input_shape,
-    absl::Span<const llvm_ir::ElementGenerator> input_gens,
-    absl::Span<const llvm_ir::ElementGenerator> init_value_gens,
-    absl::Span<HloComputation* const> reducers,
-    absl::Span<const ShapeIndex> reduce_output_shapes,
-    absl::Span<const std::pair<llvm_ir::ElementGenerator, ShapeIndex>>
-        extra_output_gens) {
-  // Divide the input matrix into tiles of size KxL. For example, when the
-  // input matrix is 4x4, K=2, and L=1 the tiled matrix looks like
-  //
-  //   0123
-  //   0123
-  //   4567
-  //   4567  // Numbers indicate tile IDs.
-  //
-  // Each tile is first partially reduced to a scalar by a thread, and then the
-  // scalar is accumulated to the output vector using atomic operations.
-  //
-  // We choose 128 as the tile size based on empirical evidence. It's big enough
-  // to reduce the amount of atomic adds in the end, maximizing the memory
-  // bandwidth. A tile width of 2 allows for high memory bandwidth utilization
-  // on 16b input data.
-  constexpr int64 kTileHeight = 128;
-  constexpr int64 kTileWidth = 2;
-
-  // If the height is not a multiple of kTileHeight, we pad the bottom of the
-  // input matrix.
-  const int64 height_in_tiles = CeilOfRatio(height, kTileHeight);
-  // If width is not a multiple of kTileWidth the rightmost thread will process
-  // fewer input elements.
-  const int64 width_in_tiles = CeilOfRatio(width, kTileWidth);
-  Shape tiled_input_shape =
-      ShapeUtil::MakeShapeWithLayout(reduce->shape().element_type(),
-                                     {height_in_tiles, width_in_tiles}, {1, 0});
-  LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
-      tiled_input_shape, ir_emitter_context_->device_description());
-
-  // TODO(b/110211620): Convert to use i32 index_type when it is possible.
-  llvm::Type* index_ty = b_.getInt64Ty();
-
-  auto index_typed_constant = [&](uint64 c) -> llvm::Constant* {
-    return llvm::ConstantInt::get(index_ty, c);
-  };
-
-  // for (linear_index = threadIdx.x + blockIdx.x * blockDim.x;
-  //      linear_index < height_in_tiles * width_in_tiles;
-  //      linear_index += blockDim.x * gridDim.x) {
-  //   y_in_tiles = linear_index / width_in_tiles;
-  //   x_in_tiles = linear_index % width_in_tiles;
-  //
-  //   partial_results[kTileWidth] = init_values;
-  //   tile_in_y_bounds = height % kTileHeight == 0 ||
-  //       y_in_tiles * kTileHeight + kTileHeight <= height;
-  //   tile_in_x_bounds = width % kTileWidth == 0 ||
-  //       x_in_tiles * kTileWidth + kTileWidth <= width;
-  //   // The implementation handles y and x bound checks separately.
-  //   if (tile_in_y_bounds && tile_in_x_bounds) {
-  //     for (y_offset : range(kTileHeight)) {
-  //       y = y_in_tiles * kTileHeight + y_offset;
-  //       for (x_offset : range(kTileWidth)) {
-  //         x = x_in_tiles * kTileWidth + x_offset;
-  //         partial_result = Reducer(partial_result[x_offset], input[y][x]);
-  //       }
-  //     }
-  //   } else {
-  //     for (y_offset : range(kTileHeight)) {
-  //       y = y_in_tiles * kTileHeight + y_offset;
-  //       for (y_offset : range(kTileHeight)) {
-  //         x = x_in_tiles * kTileWidth + x_offset;
-  //         if (y < height && x < width) {
-  //           partial_result = Reducer(partial_result, input[y][x]);
-  //         }
-  //       }
-  //     }
-  //   }
-  //   for (x_offset : range(kTileWidth)) {
-  //     AtomicReducer(&output[x + x_offset], partial_result[x_offset]);
-  //   }
-  // }
-  auto loop_body_emitter = [=](const IrArray::Index& tile_index) -> Status {
-    const int num_reduces = reducers.size();
-    // Emit the loop body that reduces one tile.
-    llvm::Type* element_ir_type =
-        llvm_ir::PrimitiveTypeToIrType(input_shape.element_type(), module_);
-    std::vector<llvm::Value*> partial_reduction_result_addresses;
-    for (int i = 0; i != num_reduces; ++i) {
-      for (int x_offset = 0; x_offset < kTileWidth; ++x_offset) {
-        llvm::Value* partial_reduction_result_address =
-            Alloca(element_ir_type, /*ArraySize=*/nullptr,
-                   "partial_reduction_result." +
-                       llvm::Twine(i * kTileWidth + x_offset));
-        TF_ASSIGN_OR_RETURN(llvm::Value* const init_ir_value,
-                            init_value_gens[i](IrArray::Index(index_ty)));
-        Store(init_ir_value, partial_reduction_result_address);
-        partial_reduction_result_addresses.push_back(
-            partial_reduction_result_address);
-      }
-    }
-
-    // Emit an inner for-loop that partially reduces the elements in the given
-    // tile.
-    llvm::Value* y_in_tiles = tile_index[0];
-    llvm::Value* x_in_tiles = tile_index[1];
-
-    y_in_tiles = ZExtOrTrunc(y_in_tiles, index_ty);
-    x_in_tiles = ZExtOrTrunc(x_in_tiles, index_ty);
-
-    auto emit_tile_element_loop = [=](bool tile_in_y_bounds,
-                                      bool tile_in_x_bounds) -> Status {
-      std::unique_ptr<llvm_ir::ForLoop> tile_element_loop =
-          llvm_ir::ForLoop::EmitForLoop(
-              "element_id_in_tile", index_typed_constant(0),
-              index_typed_constant(kTileHeight), index_typed_constant(1), &b_);
-
-      // Emit the body of the partial reduction loop.
-      llvm_ir::SetToFirstInsertPoint(tile_element_loop->GetBodyBasicBlock(),
-                                     &b_);
-      llvm::Value* y =
-          NSWAdd(NSWMul(y_in_tiles, index_typed_constant(kTileHeight)),
-                 tile_element_loop->GetIndVarValue());
-
-      // Unless we know that y is in bounds, we have to emit a check before
-      // reading from the input.
-      if (!tile_in_y_bounds) {
-        llvm_ir::LlvmIfData if_data = llvm_ir::EmitIfThenElse(
-            ICmpULT(y, index_typed_constant(height)), "y_in_bounds", &b_);
-
-        // Emit code that reads the input element and accumulates it to
-        // the partial reduction result.
-        llvm_ir::SetToFirstInsertPoint(if_data.true_block, &b_);
-      }
-      for (int x_offset = 0; x_offset < kTileWidth; ++x_offset) {
-        llvm::Value* x =
-            NSWAdd(NSWMul(x_in_tiles, index_typed_constant(kTileWidth)),
-                   index_typed_constant(x_offset));
-        // Unless we know that x is in bounds, we have to emit a check before
-        // reading from the input.
-        if (!tile_in_x_bounds) {
-          llvm_ir::LlvmIfData if_data = llvm_ir::EmitIfThenElse(
-              ICmpULT(x, index_typed_constant(width)), "x_in_bounds", &b_);
-          llvm_ir::SetToFirstInsertPoint(if_data.true_block, &b_);
-        }
-        llvm::Value* input_address = Alloca(element_ir_type);
-        // {y,x} is an index to input_matrix_shape [height,width]. We need to
-        // convert that to an index to input_shape (the shape of the operand of
-        // "reduce"). This conversion is composed of a transposition from
-        // input_shape to normalized_input_shape and a reshape from
-        // normalized_input_shape to input_matrix_shape.
-        const Shape normalized_input_shape =
-            ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
-                input_shape);
-        auto input_shape_min2maj = LayoutUtil::MinorToMajor(input_shape);
-        const std::vector<int64> transpose_dimension_mapping(
-            input_shape_min2maj.rbegin(), input_shape_min2maj.rend());
-
-        const Shape input_matrix_shape =
-            ShapeUtil::MakeShapeWithDescendingLayout(input_shape.element_type(),
-                                                     {height, width});
-        const IrArray::Index input_matrix_index({y, x}, input_matrix_shape,
-                                                &b_);
-        const IrArray::Index input_index =
-            input_matrix_index
-                .SourceIndexOfReshape(input_matrix_shape,
-                                      normalized_input_shape, &b_)
-                .SourceIndexOfTranspose(normalized_input_shape, input_shape,
-                                        transpose_dimension_mapping, &b_);
-        for (int i = 0; i != num_reduces; ++i) {
-          TF_ASSIGN_OR_RETURN(llvm::Value* const input_ir_value,
-                              input_gens[i](input_index));
-          Store(input_ir_value, input_address);
-          TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
-              *reducers[i],
-              {partial_reduction_result_addresses[i * kTileWidth + x_offset],
-               input_address},
-              partial_reduction_result_addresses[i * kTileWidth + x_offset]));
-          TF_RETURN_IF_ERROR(EmitExtraOutputsForReduce(reduce, input_index,
-                                                       extra_output_gens));
-        }
-      }
-      return Status::OK();
-    };
-
-    // y_end = kTileHeight + y_in_tiles * kTileHeight, i.e., the y location
-    // that's immediately beyond the tile.
-    llvm::Value* y_end =
-        NSWAdd(index_typed_constant(kTileHeight),
-               NSWMul(y_in_tiles, index_typed_constant(kTileHeight)));
-    // x_end = kTileWidth + x_in_tiles * kTileWidth, i.e., the x location
-    // that's immediately beyond the tile.
-    llvm::Value* x_end =
-        NSWAdd(index_typed_constant(kTileWidth),
-               NSWMul(x_in_tiles, index_typed_constant(kTileWidth)));
-    llvm::Value* tile_in_y_bounds =
-        Or(ICmpULE(y_end, index_typed_constant(height)),
-           b_.getInt1(height % kTileHeight == 0));
-    llvm::Value* tile_in_x_bounds =
-        Or(ICmpULE(x_end, index_typed_constant(width)),
-           b_.getInt1(width % kTileWidth == 0));
-    // The tile is in y bounds if "height" is a multiple of kTileHeight or
-    // y_end <= height.
-    llvm_ir::LlvmIfData if_tile_in_y_bounds_data =
-        llvm_ir::EmitIfThenElse(tile_in_y_bounds, "tile_in_y_bounds", &b_);
-    llvm_ir::SetToFirstInsertPoint(if_tile_in_y_bounds_data.true_block, &b_);
-    // The tile is in x bounds if "width" is a multiple of kTileWidth or
-    // x_end <= width.
-    llvm_ir::LlvmIfData if_tile_in_x_bounds_data =
-        llvm_ir::EmitIfThenElse(tile_in_x_bounds, "tile_in_x_bounds", &b_);
-    llvm_ir::SetToFirstInsertPoint(if_tile_in_x_bounds_data.true_block, &b_);
-    TF_RETURN_IF_ERROR(emit_tile_element_loop(/*tile_in_y_bounds=*/true,
-                                              /*tile_in_x_bounds=*/true));
-    llvm_ir::SetToFirstInsertPoint(if_tile_in_x_bounds_data.false_block, &b_);
-    TF_RETURN_IF_ERROR(emit_tile_element_loop(/*tile_in_y_bounds=*/true,
-                                              /*tile_in_x_bounds=*/false));
-    llvm_ir::SetToFirstInsertPoint(if_tile_in_y_bounds_data.false_block, &b_);
-    if_tile_in_x_bounds_data =
-        llvm_ir::EmitIfThenElse(tile_in_x_bounds, "tile_in_x_bounds", &b_);
-    llvm_ir::SetToFirstInsertPoint(if_tile_in_x_bounds_data.true_block, &b_);
-    TF_RETURN_IF_ERROR(emit_tile_element_loop(/*tile_in_y_bounds=*/false,
-                                              /*tile_in_x_bounds=*/true));
-    llvm_ir::SetToFirstInsertPoint(if_tile_in_x_bounds_data.false_block, &b_);
-    TF_RETURN_IF_ERROR(emit_tile_element_loop(/*tile_in_y_bounds=*/false,
-                                              /*tile_in_x_bounds=*/false));
-
-    // After the nested if-then-else statement on tile_in_y_bounds and
-    // tile_in_x_bounds, emit atomic operations to accumulate the partial
-    // reduction result to the output element.
-    llvm_ir::SetToFirstInsertPoint(if_tile_in_y_bounds_data.after_block, &b_);
-    const HloInstruction* output =
-        reduce->IsFused() ? reduce->parent()->FusionInstruction() : reduce;
-    for (int i = 0; i != num_reduces; ++i) {
-      for (int x_offset = 0; x_offset < kTileWidth; ++x_offset) {
-        llvm::Value* x =
-            NSWAdd(NSWMul(x_in_tiles, index_typed_constant(kTileWidth)),
-                   index_typed_constant(x_offset));
-        llvm::Value* output_address =
-            GetIrArray(*output, *output, reduce_output_shapes[i])
-                .EmitArrayElementAddress(
-                    IrArray::Index(
-                        x,
-                        ShapeUtil::GetSubshape(output->shape(),
-                                               reduce_output_shapes[i]),
-                        &b_),
-                    &b_, "output_element_address");
-        TF_RETURN_IF_ERROR(EmitAtomicOperationForNestedComputation(
-            *reducers[i], output_address,
-            partial_reduction_result_addresses[i * kTileWidth + x_offset]));
-      }
-    }
-    return Status::OK();
-  };
-
-  // Emit a parallel loop that iterate through all input tiles.
-  UpdateLaunchDimensions(launch_dimensions, kernel_thunk,
-                         ir_emitter_context_->llvm_module());
-  return ParallelLoopEmitter(loop_body_emitter, tiled_input_shape,
-                             launch_dimensions, &b_)
-      .EmitLoop(IrName(reduce), index_ty);
-}
-
-static std::pair<int64, int64> ComputeKernelMappingSchemeForReduction(
-    int64 depth, int64 width, int64 kWarpSize) {
-  constexpr int64 kTargetNumElementsPerThread = 64;
-  int64 x_tile_size = kTargetNumElementsPerThread;
-  int64 z_tile_size = 1;
-
-  // Only tile along the x dimension with tile size kTargetNumElementsPerThread
-  // if doing so doesn't require a slow version of loop with bound check on each
-  // dimension. A more sophisticated heuristics is to enable tile along the
-  // x dimension with tile size kTargetNumElementsPerThread when either width is
-  // a factor of (kWarpSize * kTargetNumElementsPerThread) or width is big
-  // enough so that only a small fraction of the threads execute the slow
-  // version of loop with bound check.
-  if (width % (kWarpSize * kTargetNumElementsPerThread) != 0) {
-    x_tile_size = 8;
-    z_tile_size = 8;
-    while (depth % z_tile_size != 0) {
-      z_tile_size -= 1;
-    }
-  }
-
-  return std::pair<int64, int64>(x_tile_size, z_tile_size);
-}
-
-Status IrEmitterUnnested::EmitRowReduction(
-    KernelThunk* kernel_thunk, int64 depth, int64 height, int64 width,
-    HloInstruction* reduce, const Shape& input_shape,
-    absl::Span<const llvm_ir::ElementGenerator> input_gens,
-    absl::Span<const llvm_ir::ElementGenerator> init_value_gens,
-    absl::Span<HloComputation* const> reducers,
-    absl::Span<const ShapeIndex> reduce_output_shapes,
-    absl::Span<const std::pair<llvm_ir::ElementGenerator, ShapeIndex>>
-        extra_output_gens) {
-  // A naive algorithm is:
-  // 1. Divide the x dimension of the input tensor into tiles of size 1x1xX.
-  // 2. Partially reduces each tile to a scalar using one thread.
-  // 3. Accumulates that scalar to the output vector using atomic operations.
-  //
-  // for (linear_index = threadIdx.x + blockIdx.x * blockDim.x;
-  //      linear_index < depth * height * width_in_tiles;
-  //      linear_index += blockDim.x * gridDim.x) {
-  //   int x_in_tiles = linear_index % width_in_tiles;
-  //   int y = linear_index / width_in_tiles % height;
-  //   int z = linear_index / (height * width_in_tiles);
-  //   float partial_result = 0;
-  //   for (element_id_in_tile : range(x_tile_size)) {
-  //     int x = x_in_tiles * x_tile_size + element_id_in_tile;
-  //     if (x < width)
-  //       partial_result = reducer(partial_result, input[z][y][x]);
-  //   }
-  //   AtomicReducer(&output[y], partial_result);
-  // }
-  //
-  // Four optimizations are performed.
-  //
-  // 1. To coalesce global memory accesses, dilate the tile with a factor of 32
-  // (i.e. the warp size). For example, suppose the width is 8x32=256. Instead
-  // of making each tile consecutive, we let make tile 0 column
-  // [0,32,64,...,224], tile 1 column [1,33,65,...,225], and so on. This ensures
-  // that threads in a warp access consecutive memory in one iteration (i.e.
-  // coalesced). In the above example, the warp that contains thread 0-31
-  // accesses column 0-31 in the first iteration, and 32-63 in the second
-  // iteration, and so on.
-  //
-  // 2. Partially accumulate partial reduced results computed by threads in the
-  // same warp using shfl_down. Using shfl_down is faster than directly using
-  // atomic operations because shfl_down transfers the data between threads
-  // using shared memory and threads in the same warp run in lock step (thus no
-  // extra synchronization needed). See
-  // https://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
-  // for details. The downside is, to produce correct results when using
-  // shfl_down, we need to guarantee threads in the same warp work on input
-  // elements with the same y, so the number of tiles in each row must be a
-  // multiple of 32.
-  //
-  // 3. Specialize the case that the entire tile is in bounds. When that is
-  // true, we don't need to emit "if(x<width)" inside the loop on
-  // element_id_in_tile, which makes the code more friendly to optimizations
-  // such as LICM.
-  //
-  // 4. When the width is too small and x_tile_size is less than the target
-  //    number of elements per thread and use a small factor of depth as
-  //    z_tile_size to increase the number of elements calculated by each
-  //    partial sum. This can reduce the needed number of dynamic shfl_down and
-  //    atomic operations.
-  //
-  // for (linear_index = threadIdx.x + blockIdx.x * blockDim.x;
-  //      linear_index < depth * height * width_in_tiles;
-  //      linear_index += blockDim.x * gridDim.x) {
-  //   int x_in_tiles = linear_index % width_in_tiles;
-  //   int y = linear_index / width_in_tiles % height;
-  //   int z_in_tiles = linear_index / (height * width_in_tiles);
-  //   int warp_id = x_in_tiles / warpSize;
-  //   int lane_id = x_in_tiles % warpSize;
-  //   float partial_result = 0;
-  //   int x = warp_id * kTileSize * warpSize + lane_id;
-  //   if (width % (x_tile_size * warpSize) == 0 ||
-  //       x + (x_tile_size - 1) * warpSize < width) {
-  //     // The entire x_tile is in bounds.
-  //     for (int element_id_in_z_tile = 0; element_id_in_z_tile < z_tile_size;
-  //          ++element_id_in_z_tile) {
-  //       z = z_in_tiles * z_tile_size + element_id_in_z_tile;
-  //       int tx = x;
-  //       for (int element_id_in_x_tile = 0;
-  //            element_id_in_x_tile < x_tile_size;
-  //            ++element_id_in_x_tile, tx += warpSize) {
-  //         partial_result = Reducer(partial_result, input[z][y][tx]);
-  //       }
-  //     }
-  //   } else {
-  //     // The tile is partially in bounds.
-  //     for (int element_id_in_z_tile = 0; element_id_in_z_tile < z_tile_size;
-  //          ++element_id_in_z_tile) {
-  //       z = z_in_tiles * z_tile_size + element_id_in_z_tile;
-  //       int tx = x;
-  //       for (int element_id_in_x_tile = 0; element_id_in_x_tile <
-  //            x_tile_size; ++element_id_in_tile, tx += warpSize) {
-  //         if (tx < width)
-  //           partial_result = Reducer(partial_result, input[z][y][tx]);
-  //       }
-  //     }
-  //   }
-  //   for (shuffle_distance = 16; shuffle_distance > 0; shuffle_distance /= 2)
-  //     partial_result = Reducer(
-  //         partial_result,
-  //         __shfl_down_sync(CUDA_WARP_ALL, partial_result, shuffle_distance));
-  //   if (lane_id == 0)
-  //     AtomicReducer(&output[y], partial_result);
-  // }
-  //
-
-  int64 x_tile_size;
-  int64 z_tile_size;
-  std::tie(x_tile_size, z_tile_size) =
-      ComputeKernelMappingSchemeForReduction(depth, width, kWarpSize);
-
-  // Round the width in tiles up to the nearest multiple of kWarpSize, so that
-  // the use of shfl_down is valid.
-  const int64 width_in_tiles =
-      RoundUpToNearest(CeilOfRatio(width, x_tile_size), kWarpSize);
-  Shape tiled_input_shape = ShapeUtil::MakeShapeWithLayout(
-      reduce->shape().element_type(),
-      {depth / z_tile_size, height, width_in_tiles}, {2, 1, 0});
-  LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
-      tiled_input_shape, ir_emitter_context_->device_description());
-  llvm::Type* index_ty =
-      GetIndexTypeForKernel(reduce, launch_dimensions.launch_bound(), &b_);
-
-  auto index_typed_constant = [&](uint64 c) -> llvm::Constant* {
-    return llvm::ConstantInt::get(index_ty, c);
-  };
-
-  auto loop_body_emitter = [=](const IrArray::Index& tile_index) {
-    const int num_reduces = reducers.size();
-    llvm::Type* element_ir_type = llvm_ir::PrimitiveTypeToIrType(
-        input_shape.element_type(), ir_emitter_context_->llvm_module());
-    std::vector<llvm::Value*> partial_reduction_result_addresses;
-    for (int i = 0; i != num_reduces; ++i) {
-      llvm::Value* partial_reduction_result_address =
-          Alloca(element_ir_type, /*ArraySize=*/nullptr,
-                 "partial_reduction_result." + llvm::Twine(i));
-      TF_ASSIGN_OR_RETURN(llvm::Value* const init_ir_value,
-                          init_value_gens[i](IrArray::Index(index_ty)));
-      Store(init_ir_value, partial_reduction_result_address);
-      partial_reduction_result_addresses.push_back(
-          partial_reduction_result_address);
-    }
-
-    llvm::Value* z_tile = tile_index[0];
-    llvm::Value* y = tile_index[1];
-    llvm::Value* x_tile = tile_index[2];
-
-    x_tile = ZExtOrTrunc(x_tile, index_ty);
-
-    llvm::Value* warp_id =
-        UDiv(x_tile, index_typed_constant(kWarpSize), "warp_id");
-    llvm::Value* lane_id =
-        URem(x_tile, index_typed_constant(kWarpSize), "lane_id");
-
-    // The x-location of the last element in this z-x-tile.
-    // last_x = lane_id + warpSize * (x_tile_size - 1 + warp_id * x_tile_size);
-    llvm::Value* last_x = NSWAdd(
-        lane_id,
-        NSWMul(index_typed_constant(kWarpSize),
-               NSWAdd(index_typed_constant(x_tile_size - 1),
-                      NSWMul(warp_id, index_typed_constant(x_tile_size)))));
-
-    KernelSupportLibrary ksl(
-        &b_,
-        /*unroll_mode=*/xla::llvm_ir::UnrollMode::kFullyUnroll,
-        /*prevent_vectorization=*/false);
-
-    // Emit a for-loop that partially reduces the elements in the given
-    // z-x-tile.
-    auto emit_z_x_tile_element_loop = [&](bool x_tile_in_bounds,
-                                          int64 x_tile_loop_bound) -> Status {
-      auto emit_z_tile_element_loop = [&](llvm::Value* z_indvar) -> Status {
-        llvm::Value* z =
-            NSWAdd(z_indvar, NSWMul(index_typed_constant(z_tile_size), z_tile));
-        TF_RETURN_IF_ERROR(ksl.For(
-            "x_tile",
-            /*start=*/index_typed_constant(0),
-            /*end=*/index_typed_constant(x_tile_loop_bound),
-            /*step=*/1, [&](llvm::Value* x_indvar) -> Status {
-              // x = lane_id +
-              //     warpSize * (element_id_in_x_tile + warp_id * x_tile_size);
-              llvm::Value* x = NSWAdd(
-                  lane_id,
-                  NSWMul(index_typed_constant(kWarpSize),
-                         NSWAdd(x_indvar,
-                                NSWMul(warp_id, llvm::ConstantInt::get(
-                                                    index_ty, x_tile_size)))));
-
-              // Unless we know the x-tile is entirely in bounds, we have to
-              // emit a x-in-bounds check before reading from the input.
-              if (!x_tile_in_bounds) {
-                llvm_ir::LlvmIfData if_x_in_bounds_data =
-                    llvm_ir::EmitIfThenElse(
-                        ICmpULT(x, index_typed_constant(width)), "x_in_bounds",
-                        &b_);
-                // Points b_ to the then-block.
-                llvm_ir::SetToFirstInsertPoint(if_x_in_bounds_data.true_block,
-                                               &b_);
-              }
-
-              // Emit code that reads the input element and accumulates it
-              // to the partial reduction result.
-              llvm::Value* input_address = Alloca(element_ir_type);
-              {
-                // {z,y,x} is an index to input_3d_tensor_shape
-                // [depth,height,width]. We need to convert that to an index
-                // to input_shape (the shape of the operand of "reduce").
-                // This conversion is composed of a transposition from
-                // input_shape to normalized_input_shape and a reshape from
-                // normalized_input_shape to input_3d_tensor_shape.
-                const Shape normalized_input_shape = ShapeUtil::
-                    MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
-                        input_shape);
-                auto input_shape_min2maj =
-                    LayoutUtil::MinorToMajor(input_shape);
-                const std::vector<int64> transpose_dimension_mapping(
-                    input_shape_min2maj.rbegin(), input_shape_min2maj.rend());
-                const Shape input_3d_tensor_shape =
-                    ShapeUtil::MakeShapeWithDescendingLayout(
-                        input_shape.element_type(), {depth, height, width});
-                const IrArray::Index input_3d_tensor_index(
-                    {z, y, x}, input_3d_tensor_shape, &b_);
-                const IrArray::Index input_index =
-                    input_3d_tensor_index
-                        .SourceIndexOfReshape(input_3d_tensor_shape,
-                                              normalized_input_shape, &b_)
-                        .SourceIndexOfTranspose(
-                            normalized_input_shape, input_shape,
-                            transpose_dimension_mapping, &b_);
-
-                for (int i = 0; i != num_reduces; ++i) {
-                  TF_ASSIGN_OR_RETURN(llvm::Value* const input_ir_value,
-                                      input_gens[i](input_index));
-                  Store(input_ir_value, input_address);
-                  TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
-                      *reducers[i],
-                      {partial_reduction_result_addresses[i], input_address},
-                      partial_reduction_result_addresses[i]));
-                }
-                return EmitExtraOutputsForReduce(reduce, input_index,
-                                                 extra_output_gens);
-              }
-            }));
-        return Status::OK();
-      };
-
-      return ksl.For("z_tile",
-                     /*start=*/index_typed_constant(0),
-                     /*end=*/index_typed_constant(z_tile_size),
-                     /*step=*/1, emit_z_tile_element_loop);
-    };
-
-    llvm::Value* tile_in_bounds =
-        Or(b_.getInt1(width % (x_tile_size * kWarpSize) == 0),
-           ICmpULT(last_x, index_typed_constant(width)));
-
-    TF_RETURN_IF_ERROR(
-        ksl.If(tile_in_bounds,
-               /*true_block_generator=*/
-               [&]() -> Status {
-                 return emit_z_x_tile_element_loop(/*x_tile_in_bounds=*/true,
-                                                   x_tile_size);
-               },
-               /*false_block_generator=*/
-               [&]() -> Status {
-                 return emit_z_x_tile_element_loop(
-                     /*x_tile_in_bounds=*/false,
-                     CeilOfRatio(width % (x_tile_size * kWarpSize), kWarpSize));
-               }));
-
-    // After accumulating the elements of the z_x_tile, emit calls to
-    // shfl_down that accumulate the partial reduction results of all
-    // threads in a warp.
-    int bit_width = llvm_ir::GetSizeInBits(element_ir_type);
-    // bitcast cannot be applied to aggregate types (even packed ones), so we
-    // instead bitcast addresses of load/store to intN* of the same bit-width.
-    llvm::Type* shuffle_ir_type = element_ir_type->isStructTy()
-                                      ? b_.getIntNTy(bit_width)
-                                      : element_ir_type;
-    for (int shuffle_distance = 16; shuffle_distance >= 1;
-         shuffle_distance /= 2) {
-      llvm::Value* result_from_other_lane =
-          Alloca(element_ir_type, nullptr, "result_from_other_lane");
-      for (int i = 0; i != num_reduces; ++i) {
-        llvm::Value* partial_reduction_result =
-            Load(BitCast(partial_reduction_result_addresses[i],
-                         shuffle_ir_type->getPointerTo()),
-                 "partial_reduction_result");
-        CHECK_EQ(launch_dimensions.threads_per_block() % kWarpSize, 0)
-            << "Requires block size a multiple of the warp size, otherwise we "
-               "will read undefined elements.";
-        Store(EmitFullWarpShuffleDown(partial_reduction_result,
-                                      b_.getInt32(shuffle_distance), &b_),
-              BitCast(result_from_other_lane, shuffle_ir_type->getPointerTo()));
-        TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
-            *reducers[i],
-            {partial_reduction_result_addresses[i], result_from_other_lane},
-            partial_reduction_result_addresses[i]));
-      }
-    }
-
-    const HloInstruction* output =
-        reduce->IsFused() ? reduce->parent()->FusionInstruction() : reduce;
-
-    // Emit an atomic operation that accumulates the partial reduction result of
-    // lane 0 (which holds the partially accumulated result for its warp) to the
-    // output element.
-    llvm_ir::LlvmIfData if_lane_id_is_zero_data = llvm_ir::EmitIfThenElse(
-        ICmpEQ(lane_id, index_typed_constant(0)), "lane_id_is_zero", &b_);
-    llvm_ir::SetToFirstInsertPoint(if_lane_id_is_zero_data.true_block, &b_);
-    for (int i = 0; i != num_reduces; ++i) {
-      llvm::Value* output_address =
-          GetIrArray(*output, *output, reduce_output_shapes[i])
-              .EmitArrayElementAddress(
-                  IrArray::Index(y,
-                                 ShapeUtil::GetSubshape(
-                                     output->shape(), reduce_output_shapes[i]),
-                                 &b_),
-                  &b_, "output_element_address");
-      // We don't need to emit atomic operations if there is only one tile of
-      // results. 'depth' is the z dimension, 'width' is the x dimension.
-      if (z_tile_size >= depth && x_tile_size >= width) {
-        TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
-            *reducers[i],
-            {output_address, partial_reduction_result_addresses[i]},
-            output_address));
-      } else {
-        TF_RETURN_IF_ERROR(EmitAtomicOperationForNestedComputation(
-            *reducers[i], output_address,
-            partial_reduction_result_addresses[i]));
-      }
-    }
-    return Status::OK();
-  };
-
-  // Emit a parallel loop that iterates through every input tiles.
-  UpdateLaunchDimensions(launch_dimensions, kernel_thunk,
-                         ir_emitter_context_->llvm_module());
-  return ParallelLoopEmitter(loop_body_emitter, tiled_input_shape,
-                             launch_dimensions, &b_)
-      .EmitLoop(IrName(reduce), index_ty);
-}
-
-// Figures out whether `reduce` is a row or column reduction, and which
-// dimensions to reduce, and calls either `EmitRowReduction` or
-// `EmitColumnReduction` as appropriate.
-// Prerequisite: all the dimensions to keep are contiguous in the input layout
-//               and, if `reduce` is fused, the fused subgraph is pure
-//               elementwise.
-Status IrEmitterUnnested::EmitReductionToVector(
-    KernelThunk* kernel_thunk, HloInstruction* reduce, const Shape& input_shape,
-    absl::Span<const llvm_ir::ElementGenerator> input_gens,
-    absl::Span<const llvm_ir::ElementGenerator> init_value_gens,
-    absl::Span<const int64> dimensions_to_reduce,
-    absl::Span<HloComputation* const> reducers,
-    absl::Span<const ShapeIndex> reduce_output_shapes,
-    absl::Span<const std::pair<llvm_ir::ElementGenerator, ShapeIndex>>
-        extra_output_gens) {
-  // This emission requires "reduce" to have an input layout. It is either set
-  // by LayoutAssignment (for a top-level kReduce) or by InstructionFusion (for
-  // a fused kReduce).
-  CHECK(input_shape.has_layout()) << "LayoutAssignment or InstructionFusion "
-                                     "doesn't set the input layout of "
-                                  << reduce->ToString();
-
-  // Specialize multi-dimensional-array-to-vector reduction.
-  std::vector<int64> input_dims_to_keep;
-  for (int64 input_dim = 0; input_dim < ShapeUtil::Rank(input_shape);
-       ++input_dim) {
-    if (std::find(dimensions_to_reduce.begin(), dimensions_to_reduce.end(),
-                  input_dim) == dimensions_to_reduce.end()) {
-      input_dims_to_keep.push_back(input_dim);
-    }
-  }
-
-  // Sort the dimensions to keep from minor to major, to facilitate checking
-  // whether another dimension is major or minor of them.
-  std::sort(input_dims_to_keep.begin(), input_dims_to_keep.end(),
-            [&input_shape](int64 dim_a, int64 dim_b) {
-              return PositionInContainer(LayoutUtil::MinorToMajor(input_shape),
-                                         dim_a) <
-                     PositionInContainer(LayoutUtil::MinorToMajor(input_shape),
-                                         dim_b);
-            });
-  // Now, if output rank is at least 1, `input_dims_to_keep.front()` is
-  // minormost and `input_dims_to_keep.back()` is majormost.
-
-  // If the dimensions to keep are minormost, emit a column reduction. As all
-  // the dimensions to keep are contiguous, by prerequisite of
-  // `EmitReductionToVector`, we only need to check whether the minormost
-  // dimension of the input is to keep.
-  if (ShapeUtil::IsEffectiveScalar(reduce->shape())) {
-    return EmitReductionToScalar(kernel_thunk, reduce, input_shape, input_gens,
-                                 init_value_gens, reducers,
-                                 reduce_output_shapes, extra_output_gens);
-  } else if (input_dims_to_keep.front() ==
-             LayoutUtil::Minor(input_shape.layout(), 0)) {
-    // Column reduction. Treat the result of "input" as a matrix whose width
-    // is the most minor dimension and height the product of other dimensions,
-    // and treat "reduce" as a column reduction of the input matrix.
-    const int64 width = ShapeUtil::ElementsIn(reduce->shape());
-    // "width" can be zero, so don't do
-    //   height = ShapeUtil::ElementsIn(input_shape) / width;
-    int64 height = 1;
-    for (int64 input_dim = 0; input_dim < ShapeUtil::Rank(input_shape);
-         ++input_dim) {
-      if (!std::count(input_dims_to_keep.begin(), input_dims_to_keep.end(),
-                      input_dim)) {
-        height *= input_shape.dimensions(input_dim);
-      }
-    }
-    return EmitColumnReduction(kernel_thunk, height, width, reduce, input_shape,
-                               input_gens, init_value_gens, reducers,
-                               reduce_output_shapes, extra_output_gens);
-  } else {
-    // Reduce the row dimension of a matrix or reduce dimension 0 and 2 in a
-    // 3D tensor. The size of dimension 1 (the height) is the size of the
-    // dimension to keep, the size of dimension 0 (the depth) is the product
-    // of dimensions that are more major than the dimension to keep, and the
-    // size of dimension 2 (the width) is the product of more minor
-    // dimensions.
-    int64 depth = 1;
-    int64 width = 1;
-    for (int64 input_dim = 0; input_dim < ShapeUtil::Rank(input_shape);
-         ++input_dim) {
-      if (PositionInContainer(LayoutUtil::MinorToMajor(input_shape),
-                              input_dim) >
-          PositionInContainer(LayoutUtil::MinorToMajor(input_shape),
-                              input_dims_to_keep.back())) {
-        depth *= input_shape.dimensions(input_dim);
-      } else if (PositionInContainer(LayoutUtil::MinorToMajor(input_shape),
-                                     input_dim) <
-                 PositionInContainer(LayoutUtil::MinorToMajor(input_shape),
-                                     input_dims_to_keep.front())) {
-        width *= input_shape.dimensions(input_dim);
-      }
-    }
-    const int64 height = ShapeUtil::ElementsIn(reduce->shape());
-    return EmitRowReduction(kernel_thunk, depth, height, width, reduce,
-                            input_shape, input_gens, init_value_gens, reducers,
-                            reduce_output_shapes, extra_output_gens);
-  }
-}
-
 Status IrEmitterUnnested::HandleReduce(HloInstruction* reduce) {
-  // TODO(b/112040122): Support multi-output reduce.
-  if (!ShapeUtil::IsArray(reduce->shape())) {
+  // TODO(b/118332391): Support multi-output reduce.
+  if (!reduce->shape().IsArray()) {
     return Unimplemented("Multi-output reduce is not supported on GPU");
   }
-  auto input = reduce->operand(0);
-  auto init_value = reduce->operand(1);
-  absl::Span<const int64> dimensions_to_reduce(reduce->dimensions());
-  HloComputation* reducer = reduce->to_apply();
-  // HandleReduce specializes reduction from a multi-dimensional array to a 1D
-  // array. The specialized version requires an initializer thunk that
-  // initializes the output array to the initial value of the reduce.
   if (IsReductionToVector(*reduce)) {
-    TF_ASSIGN_OR_RETURN(std::unique_ptr<Thunk> initializer_thunk,
-                        BuildInitializerThunk(reduce));
-    std::vector<std::unique_ptr<Thunk>> thunks;
-    thunks.push_back(std::move(initializer_thunk));
-    std::unique_ptr<KernelThunk> kernel_thunk =
-        BuildKernelThunk(reduce, /*implements_whole_instruction=*/false);
-
-    TF_CHECK_OK(EmitReductionToVector(
-        kernel_thunk.get(), reduce, input->shape(),
-        {[&](const IrArray::Index& index) {
-          return GetIrArray(*input, *reduce).EmitReadArrayElement(index, &b_);
-        }},
-        {[&](const IrArray::Index& index) {
-          return GetIrArray(*init_value, *reduce)
-              .EmitReadArrayElement(index, &b_);
-        }},
-        dimensions_to_reduce, {reducer}, {{}}, {}));
-
-    thunks.push_back(std::move(kernel_thunk));
-
-    std::unique_ptr<SequentialThunk> sequential_thunk =
-        absl::make_unique<SequentialThunk>(std::move(thunks), reduce);
-    AddThunkToThunkSequence(std::move(sequential_thunk));
-    return Status::OK();
+    return EmitReductionToVector(reduce);
   }
 
   return IrEmitter::HandleReduce(reduce);
@@ -1755,8 +735,8 @@ Status IrEmitterUnnested::HandleSelectAndScatter(
   const auto* source = select_and_scatter->operand(1);
   const Window& window = select_and_scatter->window();
   PrimitiveType operand_element_type = operand->shape().element_type();
-  const int64 rank = ShapeUtil::Rank(operand->shape());
-  CHECK_EQ(rank, ShapeUtil::Rank(source->shape()));
+  const int64 rank = operand->shape().rank();
+  CHECK_EQ(rank, source->shape().rank());
   CHECK_EQ(rank, window.dimensions_size());
 
   TF_ASSIGN_OR_RETURN(std::unique_ptr<Thunk> initializer_thunk,
@@ -1820,7 +800,7 @@ Status IrEmitterUnnested::HandleSelectAndScatter(
     // Create the inner loop to iterate over the window.
     llvm_ir::ForLoopNest window_loops(IrName(select_and_scatter, "inner"), &b_,
                                       index_type);
-    std::vector<int64> window_size;
+    DimensionVector window_size;
     for (const auto& dim : window.dimensions()) {
       window_size.push_back(dim.size());
       CHECK_GT(dim.size(), 0);
@@ -2014,18 +994,18 @@ Status IrEmitterUnnested::HandleScatter(HloInstruction* scatter) {
       BuildKernelThunk(scatter,
                        /*implements_whole_instruction=*/thunks.empty()));
 
-  TF_RETURN_IF_ERROR(
-      EmitScatter(thunks.back().get(), scatter,
-                  /*scatter_indices_gen=*/
-                  [=](const IrArray::Index& index) {
-                    return GetIrArray(*scatter_indices, *scatter)
-                        .EmitReadArrayElement(index, &b_, "scatter_index");
-                  },
-                  /*updates_gen=*/
-                  [=](const IrArray::Index& index) {
-                    return GetIrArray(*updates, *scatter)
-                        .EmitReadArrayElement(index, &b_, "update");
-                  }));
+  TF_RETURN_IF_ERROR(EmitScatter(
+      thunks.back().get(), scatter,
+      /*scatter_indices_gen=*/
+      [=](const IrArray::Index& index) {
+        return GetIrArray(*scatter_indices, *scatter)
+            .EmitReadArrayElement(index, &b_, "scatter_index");
+      },
+      /*updates_gen=*/
+      [=](const IrArray::Index& index) {
+        return GetIrArray(*updates, *scatter)
+            .EmitReadArrayElement(index, &b_, "update");
+      }));
 
   // Elide the sequential thunk if there's no copy.
   if (thunks.size() == 1) {
@@ -2072,7 +1052,7 @@ Status IrEmitterUnnested::EmitScatter(
     int64 raw_window_multidim_idx = 0;
     std::vector<llvm::Value*> input_window_multidim;
     std::vector<int64> input_window_bounds;
-    for (int64 i = 0, e = ShapeUtil::Rank(operand->shape()); i != e; ++i) {
+    for (int64 i = 0, e = operand->shape().rank(); i != e; ++i) {
       if (absl::c_binary_search(dim_numbers.inserted_window_dims(), i)) {
         input_window_bounds.push_back(1);  // Trivial dimension.
         input_window_multidim.push_back(index.GetConstantWithIndexType(0));
@@ -2084,12 +1064,11 @@ Status IrEmitterUnnested::EmitScatter(
         ++raw_window_multidim_idx;
       }
     }
-    DCHECK_EQ(input_window_multidim.size(), ShapeUtil::Rank(operand->shape()));
+    DCHECK_EQ(input_window_multidim.size(), operand->shape().rank());
 
     // Insert a 1 dimension at the end if index_vector_dim requests one.
     Shape scatter_indices_shape = scatter_indices->shape();
-    if (dim_numbers.index_vector_dim() ==
-        ShapeUtil::Rank(scatter_indices_shape)) {
+    if (dim_numbers.index_vector_dim() == scatter_indices_shape.rank()) {
       scatter_indices_shape.add_dimensions(1);
       scatter_indices_shape.mutable_layout()->add_minor_to_major(
           dim_numbers.index_vector_dim());
@@ -2174,17 +1153,7 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) {
   std::vector<std::unique_ptr<Thunk>> thunks;
   Shape keys_shape = sort->operand(0)->shape();
   int64 dimension_to_sort = sort->dimensions(0);
-  // In case there is a 'values' parameter that is a iota, we take note and use
-  // it later to ensure a stable sort. Otherwise, we don't guarantee a stable
-  // sort.
-  int64 iota_values_parameter_index = -1;
   for (int64 i = 0; i < sort->operand_count(); ++i) {
-    if (i > 0 && sort->operand(i)->opcode() == HloOpcode::kIota &&
-        ShapeUtil::ElementIsIntegral(sort->operand(i)->shape()) &&
-        Cast<HloIotaInstruction>(sort->operand(i))->iota_dimension() ==
-            dimension_to_sort) {
-      iota_values_parameter_index = i;
-    }
     ShapeIndex shape_index =
         sort->operand_count() > 1 ? ShapeIndex({i}) : ShapeIndex({});
     // We assume that the layout of all involved operands and outputs is the
@@ -2297,25 +1266,23 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) {
                                              : standard_launch_dimensions;
     UpdateLaunchDimensions(launch_dimensions, thunks.back().get(),
                            ir_emitter_context_->llvm_module());
-    IrArray keys_array;
     std::vector<IrArray> values_arrays;
-    values_arrays.reserve(sort->operand_count() - 1);
+    values_arrays.reserve(sort->operand_count());
     for (int64 i = 0; i < sort->operand_count(); ++i) {
       ShapeIndex shape_index =
           sort->operand_count() > 1 ? ShapeIndex({i}) : ShapeIndex({});
-      if (i == 0) {
-        keys_array = GetIrArray(*sort, *sort, shape_index);
-      } else {
-        values_arrays.push_back(GetIrArray(*sort, *sort, shape_index));
-      }
+      values_arrays.push_back(GetIrArray(*sort, *sort, shape_index));
     }
     return llvm_ir::EmitSortInPlace(
-        dimension_to_sort, keys_array, values_arrays,
-        iota_values_parameter_index, IrName(sort), xor_masks, &b_,
+        dimension_to_sort, values_arrays, IrName(sort), xor_masks, &b_,
         launch_dimensions,
         xor_masks.size() > 1 ? num_iterations_in_sort_dim
                              : standard_num_iterations_in_sort_dim,
-        kTileSize);
+        kTileSize,
+        [&](absl::Span<llvm::Value* const> operands, llvm::Value* output) {
+          return EmitCallToNestedComputation(*sort->to_apply(), operands,
+                                             output);
+        });
   };
   std::vector<int64> xor_masks;
   for (int64 stage = 0; stage < num_stages; ++stage) {
@@ -2352,11 +1319,11 @@ Status IrEmitterUnnested::HandleTupleSelect(HloInstruction* tuple_select) {
   return IrEmitter::HandleTupleSelect(tuple_select);
 }
 
-Status IrEmitterUnnested::HandleCrossReplicaSum(HloInstruction* crs) {
+Status IrEmitterUnnested::HandleAllReduce(HloInstruction* crs) {
   if (hlo_module_config_.replica_count() != 1) {
     // TODO(b/33011107): Support nontrivial cross replica sum on GPU.
     return Unimplemented(
-        "CrossReplicaSum with >1 replica is not implemented on GPU.");
+        "AllReduce with >1 replica is not implemented on GPU.");
   }
 
   // CRS with one operand and one replica is simply the identity function.
@@ -2367,8 +1334,8 @@ Status IrEmitterUnnested::HandleCrossReplicaSum(HloInstruction* crs) {
   // HloModuleConfig::num_replicas changes between when the module is compiled
   // and when it's run.
   if (crs->operand_count() == 1) {
-    CHECK(ShapeUtil::IsArray(crs->operand(0)->shape()))
-        << "Operands to cross-replica-sum must be arrays: " << crs->ToString();
+    CHECK(crs->operand(0)->shape().IsArray())
+        << "Operands to all-reduce must be arrays: " << crs->ToString();
     AddThunkToThunkSequence(absl::make_unique<DeviceToDeviceCopyThunk>(
         /*source_address=*/GetAllocationSlice(*crs->operand(0)),
         /*destination_buffer=*/GetAllocationSlice(*crs),
@@ -2566,10 +1533,10 @@ std::unique_ptr<KernelThunk> IrEmitterUnnested::BuildKernelThunk(
                     return !allocation->is_constant();
                   });
 
-  std::sort(non_constant_buffers.begin(), non_constant_buffers.end(),
-            [](const BufferAllocation* a, const BufferAllocation* b) {
-              return a->index() < b->index();
-            });
+  absl::c_sort(non_constant_buffers,
+               [](const BufferAllocation* a, const BufferAllocation* b) {
+                 return a->index() < b->index();
+               });
 
   llvm::Function* kernel = BuildKernelPrototype(*inst, non_constant_buffers);
 
@@ -2814,6 +1781,29 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildFftThunk(
       /*output_shape=*/inst->shape(), inst);
 }
 
+std::unique_ptr<Thunk> IrEmitterUnnested::BuildTriangularSolveThunk(
+    const HloInstruction* inst) {
+  const HloInstruction* a = inst->operand(0);
+  const HloInstruction* b = inst->operand(1);
+  int64 m = b->shape().dimensions(b->shape().rank() - 2);
+  int64 n = b->shape().dimensions(b->shape().rank() - 1);
+  int64 batch_size = std::accumulate(
+      b->shape().dimensions().begin(), b->shape().dimensions().end() - 2,
+      int64{1}, [](int64 a, int64 b) { return a * b; });
+  int64 elem_size =
+      ShapeUtil::ByteSizeOfPrimitiveType(inst->shape().element_type());
+  int64 a_batch_stride = inst->triangular_solve_options().left_side()
+                             ? m * m * elem_size
+                             : n * n * elem_size;
+  int64 b_batch_stride = m * n * elem_size;
+  return absl::make_unique<TriangularSolveThunk>(
+      inst->triangular_solve_options(),
+      /*a_input_buffer=*/GetAllocationSlice(*a),
+      /*b_input_buffer=*/GetAllocationSlice(*inst),
+      inst->shape().element_type(), batch_size, m, n, a_batch_stride,
+      b_batch_stride, inst);
+}
+
 StatusOr<std::unique_ptr<Thunk>> IrEmitterUnnested::BuildInitializerThunk(
     HloInstruction* hlo, const ShapeIndex& index) {
   bool fused = HloOpcode::kFusion == hlo->opcode();
@@ -3121,11 +2111,9 @@ Status IrEmitterUnnested::EmitTargetElementLoopInThunk(
   // pressure, since we touch threadIdx.x and blockIdx.x at the beginning of the
   // kernel *anyway*.
   std::vector<IrArray> output_arrays = ConstructIrArrayForOutputs(hlo);
-  TF_RETURN_IF_ERROR(
-      KernelSupportLibrary(&b_).If("emit_mof_tuple", IsBlock0Thread0(&b_), [&] {
-        llvm_ir::EmitTuple(GetIrArray(hlo, hlo), output_arrays, &b_, module_);
-        return Status::OK();
-      }));
+  KernelSupportLibrary{&b_}.If("emit_mof_tuple", IsBlock0Thread0(&b_), [&] {
+    llvm_ir::EmitTuple(GetIrArray(hlo, hlo), output_arrays, &b_, module_);
+  });
 
   // For multioutput fusion, we need to emit each operand and the root.
   TF_RETURN_IF_ERROR(
@@ -3139,12 +2127,36 @@ Status IrEmitterUnnested::EmitTargetElementLoopInThunk(
   return Status::OK();
 }
 
+namespace {
+
+// Returns true if the fusion contains any instruction that is likely
+// translated to complex LLVM IR, such as loops, and prevent vectorization.
+bool MayPreventVectorization(const HloInstruction& fusion_hlo) {
+  CHECK_EQ(fusion_hlo.opcode(), HloOpcode::kFusion);
+  return absl::c_any_of(
+      fusion_hlo.fused_instructions_computation()->instructions(),
+      [&](const HloInstruction* instr) {
+        switch (instr->opcode()) {
+          case HloOpcode::kReduce:
+          case HloOpcode::kReduceWindow:
+          case HloOpcode::kSort:
+          case HloOpcode::kDot:
+            return true;
+          default:
+            return false;
+        }
+      });
+}
+
+}  // namespace
+
 Status IrEmitterUnnested::EmitTargetElementLoop(
     const HloInstruction& hlo,
     const llvm_ir::ElementGenerator& element_generator) {
   int unroll_factor = 1;
   // Unfused elementwise operations are usually memory bound, unroll them.
-  if (hlo.IsElementwise() || hlo.opcode() == HloOpcode::kFusion) {
+  if (hlo.IsElementwise() ||
+      (hlo.opcode() == HloOpcode::kFusion && !MayPreventVectorization(hlo))) {
     unroll_factor = ComputeMaxUnrollFactor(&hlo);
   }
 
@@ -3167,7 +2179,6 @@ std::vector<IrArray> IrEmitterUnnested::ConstructIrArrayForInputs(
   return param_arrays;
 }
 
-
 int IrEmitterUnnested::ConstructInputReducedShapeAndCastInputIrArrayToShape(
     const HloInstruction& hlo, const std::vector<IrArray>& param_arrays,
     const std::vector<llvm::Value*>& param_buffers,
@@ -3195,54 +2206,90 @@ int IrEmitterUnnested::ConstructInputReducedShapeAndCastInputIrArrayToShape(
 
 namespace {
 
-void EmitFullTile(const KernelMappingScheme* mapping_scheme,
-                  const IrArray::Index& tile_origin_index,
-                  llvm::IRBuilder<>* builder, llvm::Value* y, llvm::Value* x,
-                  llvm::Type* index_ty,
-                  const std::function<void(const IrArray::Index&, llvm::Value*,
-                                           llvm::Value*)>& emit_elem_function) {
+std::tuple<llvm::Value*, int64> GetStartOffsetAndStepForX(
+    int64 tile_size_x, int64 num_threads_x,
+    const KernelMappingScheme* mapping_scheme, llvm::IRBuilder<>* builder,
+    llvm::Value* x, llvm::Type* index_ty) {
+  llvm::Value* start_offset_x;
+  int64 step_x;
+  if (mapping_scheme->DilatedX()) {
+    start_offset_x = x;
+    step_x = num_threads_x;
+  } else {
+    start_offset_x = builder->CreateMul(
+        x, llvm::ConstantInt::get(index_ty, tile_size_x / num_threads_x));
+    step_x = 1;
+  }
+  return std::make_tuple(start_offset_x, step_x);
+}
+
+void EmitFullElementalTile(const KernelMappingScheme* mapping_scheme,
+                           const IrArray::Index& tile_origin_index,
+                           const string& loop_name, KernelSupportLibrary* ksl,
+                           llvm::IRBuilder<>* builder, llvm::Value* y,
+                           llvm::Value* x, llvm::Type* index_ty,
+                           const EmitElementFunction& emit_elem_function) {
   int64 num_threads_x = mapping_scheme->GetNumberOfThreadsForDimensionX();
   int64 num_threads_y = mapping_scheme->GetNumberOfThreadsForDimensionY();
   int64 tile_size_x = mapping_scheme->GetTileSizeForDimensionX();
   int64 tile_size_y = mapping_scheme->GetTileSizeForDimensionY();
-  for (int64 i = 0; i < tile_size_y; i += num_threads_y) {
-    IrArray::Index source_idx_y =
-        tile_origin_index.AddOffsetToDim(llvm::ConstantInt::get(index_ty, i),
-                                         KernelMappingScheme::DimY, builder);
-    llvm::Value* y_loc =
-        builder->CreateAdd(llvm::ConstantInt::get(index_ty, i), y);
-    for (int64 j = 0; j < tile_size_x; j += num_threads_x) {
-      IrArray::Index source_idx =
-          source_idx_y.AddOffsetToDim(llvm::ConstantInt::get(index_ty, j),
-                                      KernelMappingScheme::DimX, builder);
-      llvm::Value* x_loc =
-          builder->CreateAdd(llvm::ConstantInt::get(index_ty, j), x);
-      emit_elem_function(source_idx, y_loc, x_loc);
-    }
-  }
-}
 
-void EmitPartialTile(
-    const KernelMappingScheme* mapping_scheme,
-    const IrArray::Index& tile_origin_index, const string& loop_name,
-    KernelSupportLibrary* ksl, llvm::IRBuilder<>* builder, llvm::Value* y,
-    llvm::Value* x, llvm::Value* tile_height, llvm::Value* tile_width,
-    llvm::Type* index_ty,
-    const std::function<void(const IrArray::Index&, llvm::Value*,
-                             llvm::Value*)>& emit_elem_function) {
+  llvm::Value* start_offset_x;
+  int64 step_x;
+  std::tie(start_offset_x, step_x) = GetStartOffsetAndStepForX(
+      tile_size_x, num_threads_x, mapping_scheme, builder, x, index_ty);
+  IrArray::Index source_idx =
+      tile_origin_index.AddOffsetToDim(y, KernelMappingScheme::DimY, builder)
+          .AddOffsetToDim(start_offset_x, KernelMappingScheme::DimX, builder);
+  ksl->For(loop_name + "_y", /*start=*/llvm::ConstantInt::get(index_ty, 0),
+           /*end=*/llvm::ConstantInt::get(index_ty, tile_size_y),
+           /*step=*/llvm::ConstantInt::get(index_ty, num_threads_y),
+           [&](llvm::Value* y_indvar) {
+             IrArray::Index source_idx_y = source_idx.AddOffsetToDim(
+                 y_indvar, KernelMappingScheme::DimY, builder);
+             llvm::Value* y_loc = builder->CreateAdd(y_indvar, y);
+
+             for (int64 j = 0; j < tile_size_x / num_threads_x; j++) {
+               IrArray::Index source_idx_y_x = source_idx_y.AddOffsetToDim(
+                   llvm::ConstantInt::get(index_ty, j * step_x),
+                   KernelMappingScheme::DimX, builder);
+               llvm::Value* x_loc = builder->CreateAdd(
+                   llvm::ConstantInt::get(index_ty, j * step_x),
+                   start_offset_x);
+               emit_elem_function(source_idx_y_x, y_loc, x_loc, j);
+             }
+           });
+}
+
+void EmitPartialElementalTile(const KernelMappingScheme* mapping_scheme,
+                              const IrArray::Index& tile_origin_index,
+                              const string& loop_name,
+                              KernelSupportLibrary* ksl,
+                              llvm::IRBuilder<>* builder, llvm::Value* y,
+                              llvm::Value* x, llvm::Value* tile_height,
+                              llvm::Value* tile_width, llvm::Type* index_ty,
+                              const EmitElementFunction& emit_elem_function) {
   int64 num_threads_x = mapping_scheme->GetNumberOfThreadsForDimensionX();
   int64 num_threads_y = mapping_scheme->GetNumberOfThreadsForDimensionY();
   int64 tile_size_x = mapping_scheme->GetTileSizeForDimensionX();
 
-  for (int64 j = 0; j < tile_size_x; j += num_threads_x) {
-    IrArray::Index source_idx =
-        tile_origin_index.AddOffsetToDim(llvm::ConstantInt::get(index_ty, j),
-                                         KernelMappingScheme::DimX, builder);
-    llvm::Value* x_loc =
-        builder->CreateAdd(llvm::ConstantInt::get(index_ty, j), x);
-
-    ksl->IfReturnVoid(
-        "x_in_tile", builder->CreateICmpULT(x_loc, tile_width), [&] {
+  llvm::Value* start_offset_x;
+  int64 step_x;
+  std::tie(start_offset_x, step_x) = GetStartOffsetAndStepForX(
+      tile_size_x, num_threads_x, mapping_scheme, builder, x, index_ty);
+  IrArray::Index source_idx =
+      tile_origin_index.AddOffsetToDim(y, KernelMappingScheme::DimY, builder)
+          .AddOffsetToDim(start_offset_x, KernelMappingScheme::DimX, builder);
+  for (int64 j = 0; j < tile_size_x / num_threads_x; j++) {
+    IrArray::Index source_idx_x =
+        source_idx.AddOffsetToDim(llvm::ConstantInt::get(index_ty, j * step_x),
+                                  KernelMappingScheme::DimX, builder);
+    llvm::Value* x_loc = builder->CreateAdd(
+        llvm::ConstantInt::get(index_ty, j * step_x), start_offset_x);
+
+    ksl->If(
+        loop_name + "_x_in_tile", builder->CreateICmpULT(x_loc, tile_width),
+        [&] {
           // tile_height_bound =
           //   ceil(tile_height / num_threads_y) * num_threads_y
           llvm::Value* ceiling_of_ratio = builder->CreateUDiv(
@@ -3252,20 +2299,19 @@ void EmitPartialTile(
           llvm::Value* tile_height_bound = builder->CreateMul(
               ceiling_of_ratio,
               llvm::ConstantInt::get(index_ty, num_threads_y));
-          ksl->ForReturnVoid(
+          ksl->For(
               loop_name, /*start=*/llvm::ConstantInt::get(index_ty, 0),
               /*end=*/tile_height_bound,
               /*step=*/llvm::ConstantInt::get(index_ty, num_threads_y),
               [&](llvm::Value* y_indvar) {
                 llvm::Value* y_loc = builder->CreateAdd(y_indvar, y);
-                ksl->IfReturnVoid(
-                    "y_in_tile", builder->CreateICmpULT(y_loc, tile_height),
-                    [&] {
-                      emit_elem_function(
-                          source_idx.AddOffsetToDim(
-                              y_indvar, KernelMappingScheme::DimY, builder),
-                          y_loc, x_loc);
-                    });
+                ksl->If(loop_name + "_y_in_tile",
+                        builder->CreateICmpULT(y_loc, tile_height), [&] {
+                          emit_elem_function(
+                              source_idx_x.AddOffsetToDim(
+                                  y_indvar, KernelMappingScheme::DimY, builder),
+                              y_loc, x_loc, j);
+                        });
               });
         });
   }
@@ -3284,27 +2330,26 @@ void EmitTiledElementalCodeWithBoundsCheck(
     const IrArray::Index& tile_origin_index, const string& loop_name,
     KernelSupportLibrary* ksl, llvm::IRBuilder<>* builder, llvm::Value* y,
     llvm::Value* x, llvm::Value* tile_height, llvm::Value* tile_width,
-    const std::function<void(const IrArray::Index&, llvm::Value*,
-                             llvm::Value*)>& emit_elem_function) {
+    const EmitElementFunction& emit_elem_function) {
   int64 tile_size_x = mapping_scheme->GetTileSizeForDimensionX();
   int64 tile_size_y = mapping_scheme->GetTileSizeForDimensionY();
   llvm::Type* index_ty = tile_width->getType();
 
-  ksl->IfReturnVoid(
-      "full_tile",
+  ksl->If(
+      loop_name + "_full_tile",
       builder->CreateAnd(
           builder->CreateICmpEQ(llvm::ConstantInt::get(index_ty, tile_size_x),
                                 tile_width),
           builder->CreateICmpEQ(llvm::ConstantInt::get(index_ty, tile_size_y),
                                 tile_height)),
       [&] {
-        EmitFullTile(mapping_scheme, tile_origin_index, builder, y, x, index_ty,
-                     emit_elem_function);
+        EmitFullElementalTile(mapping_scheme, tile_origin_index, loop_name, ksl,
+                              builder, y, x, index_ty, emit_elem_function);
       },
       [&] {
-        EmitPartialTile(mapping_scheme, tile_origin_index, loop_name, ksl,
-                        builder, y, x, tile_height, tile_width, index_ty,
-                        emit_elem_function);
+        EmitPartialElementalTile(mapping_scheme, tile_origin_index, loop_name,
+                                 ksl, builder, y, x, tile_height, tile_width,
+                                 index_ty, emit_elem_function);
       });
 }
 }  // namespace
@@ -3321,7 +2366,7 @@ void EmitTiledElementalCodeWithBoundsCheck(
 void IrEmitterUnnested::EmitTileElementForCopy(
     HloInstruction* hlo, const llvm_ir::IrArray::Index& index,
     const KernelCodegenInfo* kernel_info, llvm::Value* y_loc,
-    llvm::Value* x_loc) {
+    llvm::Value* x_loc, int64 /*x_iter_num*/) {
   llvm_ir::TiledParameterInfo* tiled_param_info =
       kernel_info->GetTiledParameterInfo();
   // TODO(jlebar): Add AA metadata to this load.
@@ -3351,7 +2396,7 @@ void IrEmitterUnnested::EmitTileElementForCopy(
 void IrEmitterUnnested::EmitTileElementForFusion(
     HloInstruction* hlo, const llvm_ir::IrArray::Index& index,
     const KernelCodegenInfo* kernel_info, llvm::Value* y_loc,
-    llvm::Value* x_loc) {
+    llvm::Value* x_loc, int64 /*x_iter_num*/) {
   llvm_ir::TiledParameterInfo* tiled_param_info =
       kernel_info->GetTiledParameterInfo();
   std::vector<IrArray> output_arrays = ConstructIrArrayForOutputs(*hlo);
@@ -3382,10 +2427,443 @@ void IrEmitterUnnested::EmitTileElementForFusion(
   }
 }
 
-// Emits a block of tiles, given a function object to emit one tile.
+// Information to support the code generation for a tiled reduction kernel.
+using AddressVector = InlinedVector<llvm::AllocaInst*, 1>;
+class ReductionCodegenInfo : public IrEmitterUnnested::KernelCodegenInfo {
+ public:
+  explicit ReductionCodegenInfo(llvm_ir::KernelMappingScheme* mapping_scheme,
+                                bool is_row_reduction)
+      : KernelCodegenInfo(mapping_scheme),
+        current_output_linear_index_address_(nullptr),
+        current_output_inbound_address_(nullptr),
+        is_row_reduction_(is_row_reduction) {}
+
+  void SetCurrentOutputLinearIndexAddress(llvm::AllocaInst* a) {
+    current_output_linear_index_address_ = a;
+  }
+  // Returns the address of the memory that stores the linear index of the
+  // current output. Since we are processing reduction to contiguous physical
+  // dimensions, this linear index is the linear index of the 1D output array.
+  llvm::AllocaInst* GetCurrentOutputLinearIndexAddress() const {
+    return current_output_linear_index_address_;
+  }
+
+  void SetCurrentOutputInboundAddress(llvm::AllocaInst* a) {
+    current_output_inbound_address_ = a;
+  }
+
+  llvm::AllocaInst* GetCurrentOutputInboundAddress() const {
+    return current_output_inbound_address_;
+  }
+
+  AddressVector* GetMutablePartialResultAddresses() {
+    return &partial_result_addresses_;
+  }
+  absl::Span<llvm::AllocaInst* const> GetPartialResultAddresses() const {
+    return partial_result_addresses_;
+  }
+
+  AddressVector* GetMutableReductionInputAddresses() {
+    return &reduction_input_addresses_;
+  }
+  absl::Span<llvm::AllocaInst* const> GetReductionInputAddresses() const {
+    return reduction_input_addresses_;
+  }
+
+  InlinedVector<HloComputation*, 1>* GetMutableReducers() { return &reducers_; }
+  const InlinedVector<HloComputation*, 1>& GetReducers() const {
+    return reducers_;
+  }
+  int GetNumberOfReduces() const { return reducers_.size(); }
+
+  InlinedVector<ShapeIndex, 1>* GetMutableReductionOutputShapeIndices() {
+    return &reduction_output_shape_indices_;
+  }
+  absl::Span<const ShapeIndex> GetReductionOutputShapeIndices() const {
+    return reduction_output_shape_indices_;
+  }
+
+  bool IsRowReduction() const { return is_row_reduction_; }
+
+  // Return the dimension that is being reduced between DimX and DimY.
+  int GetReducedDimensionEnum() const {
+    return IsRowReduction() ? llvm_ir::KernelMappingScheme::DimX
+                            : llvm_ir::KernelMappingScheme::DimY;
+  }
+
+  // Return the dimension that is being ketp between DimX and DimY.
+  int GetKeptDimensionEnum() const {
+    return IsRowReduction() ? llvm_ir::KernelMappingScheme::DimY
+                            : llvm_ir::KernelMappingScheme::DimX;
+  }
+
+  int GetNumberOfPartialResults() const {
+    if (IsRowReduction()) {
+      return 1;
+    }
+    int64 num_thread = mapping_scheme_->GetNumberOfThreadsForDimensionX();
+    int64 tile_size = mapping_scheme_->GetTileSizeForDimensionX();
+    CHECK_EQ(tile_size % num_thread, 0);
+    return tile_size / num_thread;
+  }
+
+  int GetPartialResultIndex(int64 x_iter_num) const {
+    if (IsRowReduction()) {
+      return 0;
+    }
+    return x_iter_num;
+  }
+
+ private:
+  AddressVector partial_result_addresses_;
+  AddressVector reduction_input_addresses_;
+  InlinedVector<HloComputation*, 1> reducers_;
+  InlinedVector<ShapeIndex, 1> reduction_output_shape_indices_;
+  llvm::AllocaInst* current_output_linear_index_address_;
+  llvm::AllocaInst* current_output_inbound_address_;
+  bool is_row_reduction_;
+};
+
+namespace {
+// Returns a group of instructions that generate the output for the kernel
+// containing the given HLO instruction. The result may be an unnested kReduce
+// HLO, a nested kReduce HLO of a kInput fusion, or the operands of the tuple
+// for a multiple output fusion.
+absl::Span<HloInstruction* const> GetOutputInstructions(
+    HloInstruction* const* reduce_or_tuple_pointer) {
+  HloOpcode opcode = (*reduce_or_tuple_pointer)->opcode();
+  CHECK(opcode == HloOpcode::kReduce || opcode == HloOpcode::kTuple);
+  return opcode == HloOpcode::kTuple
+             ? (*reduce_or_tuple_pointer)->operands()
+             : absl::Span<HloInstruction* const>(reduce_or_tuple_pointer, 1);
+}
+
+const HloInstruction* GetFirstReduceInstruction(
+    absl::Span<HloInstruction* const> instructions) {
+  auto first_reduce_iter =
+      absl::c_find_if(instructions, [](const HloInstruction* inst) {
+        return inst->opcode() == HloOpcode::kReduce;
+      });
+  CHECK_NE(first_reduce_iter, instructions.end());
+  return *first_reduce_iter;
+}
+
+};  // namespace
+
+void IrEmitterUnnested::EmitPrologueForOneReduction(
+    HloInstruction* unnested_hlo, HloInstruction* reduce_inst, int reduce_idx,
+    KernelCodegenInfo* kernel_info, GpuElementalIrEmitter* elemental_emitter,
+    ShapeIndex output_shape_index) {
+  ReductionCodegenInfo* reduction_info =
+      static_cast<ReductionCodegenInfo*>(kernel_info);
+
+  InlinedVector<HloComputation*, 1>* reducers =
+      reduction_info->GetMutableReducers();
+  CHECK(IsReductionToVector(*reduce_inst));
+  reducers->push_back(reduce_inst->to_apply());
+
+  InlinedVector<ShapeIndex, 1>* reduction_output_shape_indices =
+      reduction_info->GetMutableReductionOutputShapeIndices();
+  reduction_output_shape_indices->push_back(std::move(output_shape_index));
+
+  AddressVector* reduction_input_addresses =
+      reduction_info->GetMutableReductionInputAddresses();
+  llvm::Type* element_type = llvm_ir::PrimitiveTypeToIrType(
+      reduce_inst->shape().element_type(), ir_emitter_context_->llvm_module());
+  llvm::AllocaInst* reduction_input_address = Alloca(element_type);
+  reduction_input_addresses->push_back(reduction_input_address);
+
+  int num_partial_results = reduction_info->GetNumberOfPartialResults();
+  AddressVector* partial_result_addresses =
+      reduction_info->GetMutablePartialResultAddresses();
+  llvm::AllocaInst* partial_result_address =
+      Alloca(element_type, /*ArraySize=*/b_.getInt32(num_partial_results),
+             "partial_reduction_result." + llvm::Twine(reduce_idx));
+  partial_result_addresses->push_back(partial_result_address);
+
+  // Initialize the partial result with the initial value of the reduction.
+  llvm::Value* init_ir_value;
+  if (unnested_hlo->opcode() == HloOpcode::kFusion) {
+    HloInstruction* init_value_operand = reduce_inst->mutable_operand(1);
+    FusedIrEmitter fused_emitter(GetGeneratorForOperandIrArrays(unnested_hlo),
+                                 elemental_emitter);
+
+    TF_CHECK_OK(init_value_operand->Accept(&fused_emitter));
+    init_ir_value =
+        fused_emitter
+            .GetGenerator(init_value_operand)(IrArray::Index(b_.getInt32Ty()))
+            .ValueOrDie();
+  } else {
+    const HloInstruction* init_value = unnested_hlo->operand(1);
+    init_ir_value =
+        GetIrArray(*init_value, *unnested_hlo)
+            .EmitReadArrayElement(IrArray::Index(b_.getInt32Ty()), &b_);
+  }
+
+  for (int i = 0; i < num_partial_results; ++i) {
+    Store(init_ir_value, InBoundsGEP(partial_result_address, {b_.getInt32(i)}));
+  }
+}
+
+void IrEmitterUnnested::EmitPrologueForReduction(
+    HloInstruction* unnested_hlo, KernelCodegenInfo* kernel_info) {
+  VLOG(10) << "Emit prologue for reduction " << unnested_hlo->ToString();
+  // Find the unnested kReduce or the tuple that contains a list of kReduce.
+  HloInstruction* reduce_or_tuple = unnested_hlo->opcode() == HloOpcode::kFusion
+                                        ? unnested_hlo->fused_expression_root()
+                                        : unnested_hlo;
+  absl::Span<HloInstruction* const> output_instructions =
+      GetOutputInstructions(&reduce_or_tuple);
+  ReductionCodegenInfo* reduction_info =
+      static_cast<ReductionCodegenInfo*>(kernel_info);
+  GpuElementalIrEmitter elemental_emitter(hlo_module_config_,
+                                          ir_emitter_context_->llvm_module(),
+                                          &b_, GetNestedComputer());
+  const HloInstruction* first_reduce = nullptr;
+  for (int i = 0, e = output_instructions.size(); i != e; ++i) {
+    if (output_instructions[i]->opcode() != HloOpcode::kReduce) {
+      continue;
+    }
+    HloInstruction* reduce_inst = output_instructions[i];
+    if (first_reduce == nullptr) {
+      first_reduce = reduce_inst;
+    } else {
+      CHECK(first_reduce->dimensions() == reduce_inst->dimensions());
+    }
+    ShapeIndex output_shape_index;
+    if (reduce_or_tuple->opcode() == HloOpcode::kTuple) {
+      output_shape_index = {i};
+    }
+
+    EmitPrologueForOneReduction(unnested_hlo, reduce_inst, i, kernel_info,
+                                &elemental_emitter,
+                                std::move(output_shape_index));
+  }
+
+  int num_partial_results = reduction_info->GetNumberOfPartialResults();
+
+  // Allocate stack storage to store the linear indices for the current output,
+  // and record the address of the storage.
+  reduction_info->SetCurrentOutputLinearIndexAddress(
+      Alloca(reduction_info->GetIndexType(),
+             /*ArraySize=*/b_.getInt32(num_partial_results),
+             "current_output_linear_index_address"));
+
+  if (!reduction_info->IsRowReduction()) {
+    llvm::Type* bool_ty = b_.getInt1Ty();
+    llvm::AllocaInst* output_inbound_addr = Alloca(bool_ty);
+    Store(llvm::ConstantInt::get(bool_ty, 0), output_inbound_addr);
+    reduction_info->SetCurrentOutputInboundAddress(output_inbound_addr);
+  }
+}
+
+void IrEmitterUnnested::EmitFullWarpShuffleDownLoopForAllReduces(
+    absl::Span<HloComputation* const> reducers,
+    absl::Span<llvm::AllocaInst* const> partial_result_addresses) {
+  for (int distance = 16; distance >= 1; distance /= 2) {
+    for (int i = 0; i != reducers.size(); ++i) {
+      llvm::Type* element_type =
+          partial_result_addresses[i]->getType()->getElementType();
+      int bit_width = llvm_ir::GetSizeInBits(element_type);
+      llvm::Value* result_from_other_lane = Alloca(
+          element_type, nullptr, "result_from_other_lane" + llvm::Twine(i));
+      // Bitcast cannot be applied to aggregate types (even packed ones), so
+      // we bitcast addresses of load/store to intN* of the same bit-width.
+      llvm::Type* shuffled_value_type =
+          element_type->isStructTy() ? b_.getIntNTy(bit_width) : element_type;
+      auto convert_pointer_for_shuffle = [&](llvm::Value* ptr) {
+        return BitCast(ptr, shuffled_value_type->getPointerTo());
+      };
+      llvm::Value* partial_result =
+          Load(convert_pointer_for_shuffle(partial_result_addresses[i]),
+               "partial_reduction_result");
+      Store(EmitFullWarpShuffleDown(partial_result, b_.getInt32(distance), &b_),
+            convert_pointer_for_shuffle(result_from_other_lane));
+      TF_CHECK_OK(EmitCallToNestedComputation(
+          *reducers[i], {partial_result_addresses[i], result_from_other_lane},
+          partial_result_addresses[i]));
+    }
+  }
+}
+
+void IrEmitterUnnested::EmitEpilogueForReduction(
+    HloInstruction* unnested_hlo, KernelCodegenInfo* kernel_info) {
+  ReductionCodegenInfo* reduction_info =
+      static_cast<ReductionCodegenInfo*>(kernel_info);
+  int num_reduces = reduction_info->GetNumberOfReduces();
+  absl::Span<llvm::AllocaInst* const> partial_result_addresses =
+      reduction_info->GetPartialResultAddresses();
+  const InlinedVector<HloComputation*, 1>& reducers =
+      reduction_info->GetReducers();
+  absl::Span<const ShapeIndex> reduction_output_shape_indices =
+      reduction_info->GetReductionOutputShapeIndices();
+
+  if (reduction_info->IsRowReduction()) {
+    EmitFullWarpShuffleDownLoopForAllReduces(reducers,
+                                             partial_result_addresses);
+    llvm::Value* lane_id = reduction_info->GetLaneId();
+    llvm_ir::LlvmIfData if_lane_id_is_zero_data = llvm_ir::EmitIfThenElse(
+        ICmpEQ(lane_id, llvm::ConstantInt::get(lane_id->getType(), 0)),
+        "lane_id_is_zero", &b_);
+    llvm_ir::SetToFirstInsertPoint(if_lane_id_is_zero_data.true_block, &b_);
+  } else {
+    llvm::Value* output_inbound_addr =
+        reduction_info->GetCurrentOutputInboundAddress();
+    llvm::Value* output_inbound = Load(output_inbound_addr);
+    llvm_ir::LlvmIfData if_output_inbound_data = llvm_ir::EmitIfThenElse(
+        ICmpEQ(output_inbound,
+               llvm::ConstantInt::get(output_inbound->getType(), 1)),
+        "output_inbound", &b_);
+    llvm_ir::SetToFirstInsertPoint(if_output_inbound_data.true_block, &b_);
+  }
+
+  int num_partial_results = reduction_info->GetNumberOfPartialResults();
+
+  // Emit an atomic operation that accumulates the partial reduction to the
+  // output element. For row reduction, this is only for lane 0 due to the
+  // if-statement emitted above.
+  for (int i = 0; i != num_reduces; ++i) {
+    for (int j = 0; j < num_partial_results; ++j) {
+      IrArray::Index element_index(
+          /*linear=*/Load(
+              InBoundsGEP(reduction_info->GetCurrentOutputLinearIndexAddress(),
+                          {b_.getInt32(j)}),
+              "output_linear_addr"),
+          ShapeUtil::GetSubshape(unnested_hlo->shape(),
+                                 reduction_output_shape_indices[i]),
+          &b_);
+      llvm::Value* output_address =
+          GetIrArray(*unnested_hlo, *unnested_hlo,
+                     reduction_output_shape_indices[i])
+              .EmitArrayElementAddress(element_index, &b_,
+                                       "output_element_address");
+      // Do not emit atomic operations if each element in the reduction result
+      // is computed by one block, that is the dimension being reduced has only
+      // one block.
+      const llvm_ir::KernelMappingScheme* mapping_scheme =
+          reduction_info->GetKernelMappingScheme();
+      if (mapping_scheme->GetTileBlockSizeForDimension(
+              llvm_ir::KernelMappingScheme::DimZ) == 1 &&
+          mapping_scheme->GetTileBlockSizeForDimension(
+              reduction_info->GetReducedDimensionEnum()) == 1) {
+        TF_CHECK_OK(EmitCallToNestedComputation(
+            *reducers[i],
+            {output_address,
+             InBoundsGEP(partial_result_addresses[i], {b_.getInt32(j)})},
+            output_address));
+      } else {
+        TF_CHECK_OK(EmitAtomicOperationForNestedComputation(
+            *reducers[i], output_address,
+            InBoundsGEP(partial_result_addresses[i], {b_.getInt32(j)})));
+      }
+    }
+  }
+}
+
+void IrEmitterUnnested::EmitTileElementForReduction(
+    HloInstruction* unnested_hlo, const llvm_ir::IrArray::Index& index,
+    const KernelCodegenInfo* kernel_info, llvm::Value* y_loc,
+    llvm::Value* x_loc, int64 x_iter_num) {
+  VLOG(10) << "Emit tile element for reduce " << unnested_hlo->ToString();
+  HloInstruction* reduce_or_tuple = unnested_hlo->opcode() == HloOpcode::kFusion
+                                        ? unnested_hlo->fused_expression_root()
+                                        : unnested_hlo;
+  llvm_ir::TiledParameterInfo* tiled_param_info =
+      kernel_info->GetTiledParameterInfo();
+  tiled_param_info->set_y(y_loc);
+  tiled_param_info->set_x(x_loc);
+
+  // Record the linear address for the current reduction.
+  const ReductionCodegenInfo* reduction_info =
+      dynamic_cast<const ReductionCodegenInfo*>(kernel_info);
+  int partial_result_index = reduction_info->IsRowReduction() ? 0 : x_iter_num;
+
+  Store(index[reduction_info->GetKeptDimensionEnum()],
+        InBoundsGEP(reduction_info->GetCurrentOutputLinearIndexAddress(),
+                    {b_.getInt32(partial_result_index)}));
+  if (!reduction_info->IsRowReduction()) {
+    llvm::Type* bool_ty = b_.getInt1Ty();
+    llvm::AllocaInst* output_inbound_addr =
+        reduction_info->GetCurrentOutputInboundAddress();
+    Store(llvm::ConstantInt::get(bool_ty, 1), output_inbound_addr);
+  }
+
+  InlinedVector<llvm_ir::ElementGenerator, 1> input_gens;
+  std::vector<std::pair<llvm_ir::ElementGenerator, ShapeIndex>>
+      extra_output_gens;
+  GpuElementalIrEmitter elem_emitter(hlo_module_config_, module_, &b_,
+                                     GetNestedComputer());
+  FusedIrEmitter fused_emitter(GetGeneratorForOperandIrArrays(unnested_hlo),
+                               &elem_emitter);
+  absl::Span<HloInstruction* const> output_instructions =
+      GetOutputInstructions(&reduce_or_tuple);
+  // Construct the ElementGenerator for each reduction and extra output in the
+  // the group of output instructions.
+  if (unnested_hlo->opcode() == HloOpcode::kFusion) {
+    fused_emitter.SetTiledParameterInfo(tiled_param_info);
+    TF_CHECK_OK(unnested_hlo->fused_expression_root()->Accept(&fused_emitter));
+
+    for (int i = 0, e = output_instructions.size(); i != e; ++i) {
+      const HloInstruction* inst = output_instructions[i];
+      ShapeIndex output_shape_index;
+      if (reduce_or_tuple->opcode() == HloOpcode::kTuple) {
+        output_shape_index = {i};
+      }
+      if (inst->opcode() == HloOpcode::kReduce) {
+        input_gens.push_back(fused_emitter.GetGenerator(inst->operand(0)));
+      } else {
+        extra_output_gens.emplace_back(fused_emitter.GetGenerator(inst),
+                                       std::move(output_shape_index));
+      }
+    }
+  } else {
+    input_gens.push_back([&](const IrArray::Index& index) {
+      return GetIrArray(*unnested_hlo->operand(0), *unnested_hlo)
+          .EmitReadArrayElement(index, &b_);
+    });
+  }
+
+  IrArray::Index input_index =
+      reduction_info->GetKernelMappingScheme()->GetUnnormalizedIndex(
+          index,
+          GetFirstReduceInstruction(output_instructions)->operand(0)->shape());
+  int num_partial_results = reduction_info->GetNumberOfPartialResults();
+  if (num_partial_results > 1) {
+    // Clear the linear index field of the IrArray::Index to enable the use of
+    // GetElementPointer with array types. This enables the vectorization of
+    // the computation for different partial results.
+    input_index.ClearLinearIndex();
+  }
+  absl::Span<llvm::AllocaInst* const> partial_reduction_result_addresses =
+      reduction_info->GetPartialResultAddresses();
+  absl::Span<llvm::AllocaInst* const> reduction_input_addresses =
+      reduction_info->GetReductionInputAddresses();
+  const InlinedVector<HloComputation*, 1>& reducers =
+      reduction_info->GetReducers();
+
+  // Emit code to generate the input and perform the reduction computation for
+  // each reduction instruction.
+  for (int i = 0; i != reducers.size(); ++i) {
+    llvm::Value* const input_ir_value = input_gens[i](input_index).ValueOrDie();
+    Store(input_ir_value, reduction_input_addresses[i]);
+    llvm::Value* partial_result_address =
+        InBoundsGEP(partial_reduction_result_addresses[i],
+                    {b_.getInt32(partial_result_index)});
+    TF_CHECK_OK(EmitCallToNestedComputation(
+        *reducers[i], {partial_result_address, reduction_input_addresses[i]},
+        partial_result_address));
+  }
+
+  // Emit code to generate the output for the non-reduction instructions in the
+  // fusion, if any.
+  TF_CHECK_OK(
+      EmitExtraOutputsForReduce(unnested_hlo, input_index, extra_output_gens));
+}
+
+// Emits a kernel for the hlo instruction using the given tiling scheme.
 void IrEmitterUnnested::EmitBlock(const TileGenerator& emit_one_tile,
-                                  const KernelCodegenInfo* kernel_info,
-                                  KernelSupportLibrary& ksl,
+                                  KernelCodegenInfo* kernel_info,
+                                  KernelSupportLibrary* ksl,
                                   llvm::Type* index_ty) {
   KernelMappingScheme* mapping_scheme = kernel_info->GetKernelMappingScheme();
   absl::Span<const int64> dims_in_tile = mapping_scheme->GetDimensionsInTiles();
@@ -3418,16 +2896,14 @@ void IrEmitterUnnested::EmitBlock(const TileGenerator& emit_one_tile,
           llvm::Value* num_tiles_in_block =
               Select(ICmpEQ(last_block_for_dim, block_id_for_dim),
                      last_block_size_for_dim, block_size_for_dim);
-
-          ksl.ForReturnVoid(
-              loop_name,
-              /*start=*/index_typed_constant(0),
-              /*end=*/num_tiles_in_block,
-              /*step=*/1, [&](llvm::Value* block_dim_induction_var) {
-                IrArray::Index tile_index = starting_tile.AddOffsetToDim(
-                    block_dim_induction_var, dim_id, &b_);
-                emit_next_block_dim(tile_index);
-              });
+          ksl->For(loop_name,
+                   /*start=*/index_typed_constant(0),
+                   /*end=*/num_tiles_in_block,
+                   /*step=*/1, [&](llvm::Value* block_dim_induction_var) {
+                     IrArray::Index tile_index = starting_tile.AddOffsetToDim(
+                         block_dim_induction_var, dim_id, &b_);
+                     emit_next_block_dim(tile_index);
+                   });
         }
       };
 
@@ -3482,7 +2958,8 @@ void IrEmitterUnnested::EmitBlock(const TileGenerator& emit_one_tile,
 // unnested_hlo: The unnested hlo instruction for which the kernel is generated.
 //   Currently, these hlo instructions are supported: kLoop fusion, kCopy.
 // tiled_param_ids: The IDs for the parameters that are 0-2-1 transpose of
-//   other tensors with the same dimensions and need to be tiled and tranposed.
+//   other tensors with the same dimensions and are safe to be tranposed via
+//   the shared memory tranpose implementation.
 // mapping_scheme: The tiling scheme to use.
 // kernel_generator: Contains function objects for code generation, such as
 //   element generator, block prologue and epilogue generators.
@@ -3509,11 +2986,22 @@ LaunchDimensions IrEmitterUnnested::EmitKernel(
             << llvm_ir::DumpToString(*param_shmem_buffers[id]);
   }
 
-  CHECK_EQ(mapping_scheme->GetThreadsPerTile() % kWarpSize, 0);
-  LaunchDimensions launch_dimensions = LaunchDimensions(
-      mapping_scheme->GetNumberOfBlocks(), mapping_scheme->GetThreadsPerTile());
-  llvm::Type* index_ty = GetIndexTypeForKernel(
-      unnested_hlo, launch_dimensions.launch_bound(), &b_);
+  const ReductionCodegenInfo* reduction_info =
+      dynamic_cast<const ReductionCodegenInfo*>(kernel_info);
+  bool is_column_reduction =
+      (reduction_info && !reduction_info->IsRowReduction());
+
+  LaunchDimensions launch_dimensions =
+      LaunchDimensions(mapping_scheme->GetNumberOfBlocks(),
+                       mapping_scheme->GetThreadsPerBlock());
+
+  // TODO(b/110211620): Enable int32 index type for column reduction.
+  llvm::Type* index_ty =
+      is_column_reduction
+          ? b_.getInt64Ty()
+          : GetIndexTypeForKernel(unnested_hlo,
+                                  launch_dimensions.launch_bound(), &b_);
+
   auto index_typed_constant = [&](uint64 c) -> llvm::Constant* {
     return llvm::ConstantInt::get(index_ty, c);
   };
@@ -3523,14 +3011,12 @@ LaunchDimensions IrEmitterUnnested::EmitKernel(
   // but we do it at the beginning in the hopes of reducing register pressure,
   // since we touch threadIdx.x and blockIdx.x at the beginning of the kernel
   // *anyway*.
-  if (unnested_hlo->IsMultiOutputFusion()) {
-    TF_CHECK_OK(KernelSupportLibrary(&b_).If(
-        "emit_mof_tuple", IsBlock0Thread0(&b_), [&] {
-          llvm_ir::EmitTuple(GetIrArray(*unnested_hlo, *unnested_hlo),
-                             ConstructIrArrayForOutputs(*unnested_hlo), &b_,
-                             module_);
-          return Status::OK();
-        }));
+  if (!reduction_info && unnested_hlo->IsMultiOutputFusion()) {
+    KernelSupportLibrary{&b_}.If("emit_mof_tuple", IsBlock0Thread0(&b_), [&] {
+      llvm_ir::EmitTuple(GetIrArray(*unnested_hlo, *unnested_hlo),
+                         ConstructIrArrayForOutputs(*unnested_hlo), &b_,
+                         module_);
+    });
   }
 
   // For each tiled parameter, cast its input IrArray to the corresponding
@@ -3553,14 +3039,14 @@ LaunchDimensions IrEmitterUnnested::EmitKernel(
   kernel_info->SetLaneId(
       mapping_scheme->GetNumberOfThreadsForDimensionX() == kWarpSize ? x
                                                                      : nullptr);
+  kernel_info->SetIndexType(index_ty);
 
   KernelSupportLibrary ksl(&b_, llvm_ir::UnrollMode::kDefaultUnroll);
   // Curry a few parameters to EmitTiledElementalCodeWithBoundsCheck.
   auto emit_tiled_elemental_code_with_bounds_check =
       [&](const IrArray::Index& index, const string& loop_name,
           llvm::Value* tile_height, llvm::Value* tile_width,
-          const std::function<void(const IrArray::Index&, llvm::Value*,
-                                   llvm::Value*)>& emit_elem_function) {
+          const EmitElementFunction& emit_elem_function) {
         EmitTiledElementalCodeWithBoundsCheck(mapping_scheme, index, loop_name,
                                               &ksl, &b_, y, x, tile_height,
                                               tile_width, emit_elem_function);
@@ -3573,52 +3059,49 @@ LaunchDimensions IrEmitterUnnested::EmitKernel(
     const IrArray::Index input_tile_origin(
         Permute({0, 2, 1}, output_tile_origin.multidim()));
 
-    const IrArray::Index input_index =
-        input_tile_origin.AddOffsetToDim(x, KernelMappingScheme::DimX, &b_)
-            .AddOffsetToDim(y, KernelMappingScheme::DimY, &b_);
-
-    // Copy input parameter values to shared memory buffers:
-    // tile[y, x] = input[index]
-    // Note that tile_width and tile_height are flipped here because we are
-    // reading a transposed tile.
-    emit_tiled_elemental_code_with_bounds_check(
-        input_index, "input", output_tile_bounds[2], output_tile_bounds[1],
-        [&](const IrArray::Index& index, llvm::Value* y_loc,
-            llvm::Value* x_loc) {
-          for (int64 id : tiled_param_ids) {
-            IrArray& input_in_logical_shape = param_in_reduced_shape_arrays[id];
-            llvm::Value* shmem_buffer = param_shmem_buffers[id];
-            // TODO(jlebar): Add AA metadata to this store.  Tile buffers are
-            // global variables, so LLVM can't infer much about it.
-            Store(input_in_logical_shape.EmitReadArrayElement(index, &b_,
-                                                              "input_element"),
-                  GEP(shmem_buffer, {index_typed_constant(0), y_loc, x_loc}));
-          }
-        });
-
     // If shared memory transpose is needed, wait for all threads to reach this
     // point, lest we copy a value from tile to output before the other thread
     // copies it from input to tile. This is `__syncthreads` in CUDA.
     if (!tiled_param_ids.empty()) {
+      // Copy input parameter values to shared memory buffers:
+      // tile[y, x] = input[index]
+      // Note that tile_width and tile_height are flipped here because we are
+      // reading a transposed tile.
+      emit_tiled_elemental_code_with_bounds_check(
+          input_tile_origin, "input", output_tile_bounds[2],
+          output_tile_bounds[1],
+          [&](const IrArray::Index& index, llvm::Value* y_loc,
+              llvm::Value* x_loc, int64 /*x_iter_num*/) {
+            for (int64 id : tiled_param_ids) {
+              IrArray& input_in_logical_shape =
+                  param_in_reduced_shape_arrays[id];
+              llvm::Value* shmem_buffer = param_shmem_buffers[id];
+              // TODO(jlebar): Add AA metadata to this store.  Tile buffers are
+              // global variables, so LLVM can't infer much about it.
+              Store(input_in_logical_shape.EmitReadArrayElement(
+                        index, &b_, "input_element"),
+                    GEP(shmem_buffer, {index_typed_constant(0), y_loc, x_loc}));
+            }
+          });
+
+      // Wait for all threads to reach this point using `__syncthreads` in CUDA.
       llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_barrier0, {}, {}, &b_);
     }
 
     llvm_ir::TiledParameterInfo tiled_param_info(param_shmem_buffers, y, x);
     kernel_info->SetTiledParamInfo(&tiled_param_info);
 
-    const IrArray::Index output_index =
-        output_tile_origin.AddOffsetToDim(x, KernelMappingScheme::DimX, &b_)
-            .AddOffsetToDim(y, KernelMappingScheme::DimY, &b_);
-
     // Write to output[index] by emitting code like normal, except that values
     // for the tiled parameters are read from the shmem buffers.
     emit_tiled_elemental_code_with_bounds_check(
-        output_index, "output", output_tile_bounds[1], output_tile_bounds[2],
-        [&](const IrArray::Index& index, llvm::Value* y_loc,
-            llvm::Value* x_loc) {
-          kernel_generator.GetTileElementGenerator()(unnested_hlo, index,
-                                                     kernel_info, y_loc, x_loc);
+        output_tile_origin, "output", output_tile_bounds[1],
+        output_tile_bounds[2],
+        [&](const IrArray::Index& index, llvm::Value* y_loc, llvm::Value* x_loc,
+            int64 x_iter_num) {
+          kernel_generator.GetTileElementGenerator()(
+              unnested_hlo, index, kernel_info, y_loc, x_loc, x_iter_num);
         });
+
     // If a tile block contains multiple tiles and shared memory buffers are
     // used, we need to wait for all threads to finish using the shared memory
     // buffer for the current tile before we move on to process the next tile
@@ -3634,7 +3117,7 @@ LaunchDimensions IrEmitterUnnested::EmitKernel(
     block_prologue_generator(unnested_hlo, kernel_info);
   }
 
-  EmitBlock(std::move(emit_one_tile), kernel_info, ksl, index_ty);
+  EmitBlock(std::move(emit_one_tile), kernel_info, &ksl, index_ty);
 
   const BlockEpilogueGenerator& block_epilogue_generator =
       kernel_generator.GetBlockEpilogueGenerator();
@@ -3647,7 +3130,10 @@ LaunchDimensions IrEmitterUnnested::EmitKernel(
 
 // Emits a kernel for the given hlo instruction using a tiled 0-2-1 transpose
 // algorithm to improve the memory access patterns for the input parameters
-// with a shape that is a 0-2-1 transpose of the output tensor shape.
+// with a shape that is a 0-2-1 transpose of the output tensor shape. The caller
+// is responsible for making sure that it is safe to apply the shared memory
+// tranpose on the input parameters.
+//
 //
 // For the purpose of tiling, the output tensors have a logical shape of three
 // components 0-2-1 while the relevant input parameters have a logical shape
@@ -3680,17 +3166,19 @@ LaunchDimensions IrEmitterUnnested::EmitHlo021Tile(
     element_generator = [&](HloInstruction* hlo,
                             const llvm_ir::IrArray::Index& index,
                             const KernelCodegenInfo* kernel_info,
-                            llvm::Value* y_loc, llvm::Value* x_loc) {
-      EmitTileElementForCopy(hlo, index, kernel_info, y_loc, x_loc);
+                            llvm::Value* y_loc, llvm::Value* x_loc,
+                            int64 x_iter_num) {
+      EmitTileElementForCopy(hlo, index, kernel_info, y_loc, x_loc, x_iter_num);
     };
   } else {
     DCHECK_EQ(hlo->opcode(), HloOpcode::kFusion);
-    element_generator = [&](HloInstruction* hlo,
-                            const llvm_ir::IrArray::Index& index,
-                            const KernelCodegenInfo* kernel_info,
-                            llvm::Value* y_loc, llvm::Value* x_loc) {
-      EmitTileElementForFusion(hlo, index, kernel_info, y_loc, x_loc);
-    };
+    element_generator =
+        [&](HloInstruction* hlo, const llvm_ir::IrArray::Index& index,
+            const KernelCodegenInfo* kernel_info, llvm::Value* y_loc,
+            llvm::Value* x_loc, int64 x_iter_num) {
+          EmitTileElementForFusion(hlo, index, kernel_info, y_loc, x_loc,
+                                   x_iter_num);
+        };
   }
   KernelCodegenInfo kernel_info(&mapping_scheme);
   KernelCodeGenerator kernel_generator(std::move(element_generator));
@@ -3698,26 +3186,99 @@ LaunchDimensions IrEmitterUnnested::EmitHlo021Tile(
 }
 
 namespace {
-// Returns true to indicate it is safe to use the tile based shared memory
-// transpose implementation to implement the kernel for the instruction.
+// A recursive function to inspect the users of a parameter to determine
+// whether it's safe for a parameter to participate in a shared-memory
+// transpose.
 //
-// An instruction is not safe for such an implementation if it can change the
-// element order of a tensor without changing the dimension of the tensor, and
-// the instruction has a corresponding elemental_ir_emitter.
-bool IsInstructionSafeForTileBasedTranspose(const HloInstruction* hlo) {
-  auto is_safe_for_tile_based_transpose = [&](const HloInstruction* instr) {
-    HloOpcode opcode = instr->opcode();
-    CHECK_NE(opcode, HloOpcode::kFusion);
-    return (opcode != HloOpcode::kReverse && opcode != HloOpcode::kGather);
-  };
+// Consider a fusion parameter P for which we might want to use a shmem
+// transpose.  If we do, we use a GPU thread block to preload a tile of P with
+// indices [z, y..y+31, x..x+31] to compute an output tile with the same indices
+// cooperatively, where z, y, x are the indices for the normalized input/output
+// tensor (see the document for FindTranspose021 for the definition of
+// normalized tensor for 0-2-1 transpose). This shmem transpose implementation
+// requires that the computation of the output tile only read elements within
+// the preload tile. If this is not true, we can't use a shmem transpose for P.
+//
+// If the computation of output element [z, y, x] only requires the element of
+// P with the same indices, the shmem tranpose implementation can be applied
+// to P safely. This is a sufficient but not necessary condition. We check all
+// the transitive users of P to see if we can find a user that may cause an
+// exception to the situation. If such a user is not found, we conclude that P
+// is safe for shmem transpose.
+//
+// This is trivially true for elementwise operations and some "data-movement"
+// ops like kTuple. However, it's not true for operations that can change the
+// dimensions of the inputs (e.g. pad, slice) and bitcast operation.
+// For example:
+//
+// fused_computation {
+//   param_0 = f32[64,64]{1,0} parameter(0)
+//   ROOT bitcast = f32[64,64]{0,1} bitcast(param_0)
+// }
+// The output element at logical address [0, 63] depends on the input element
+// at logical address [63, 0], which would not be within the shared-memory
+// block.
+//
+// TODO(bixia): In order to extend this for kInput fusion, that is reduction
+// with tranpose, we only need to end the use-chain checking with the input of
+// a reduce operations. In this case, the above description on "output" apply
+// to the result of such a use-chain, which provides the input to the reduce
+// operation.
+bool IsInstructionSafeForShmemTranspose(const HloInstruction* hlo) {
+  if (hlo->IsElementwise()) {
+    return absl::c_all_of(hlo->users(), [&](const HloInstruction* user) {
+      return IsInstructionSafeForShmemTranspose(user);
+    });
+  }
+
+  switch (hlo->opcode()) {
+    // Non-elementwise instructions that don't cause the shmem transpose
+    // to be unsafe, including the instructions that don't currently fuse.
+    case HloOpcode::kGetDimensionSize:
+      // The result of the operation doesn't rely on the content of the
+      // tensor. As such, there is no need to further inspect its users.
+      return true;
+    case HloOpcode::kGetTupleElement:
+    case HloOpcode::kMap:
+    case HloOpcode::kParameter:
+    case HloOpcode::kTuple:
+    case HloOpcode::kTupleSelect:
+      return absl::c_all_of(hlo->users(), [&](const HloInstruction* user) {
+        return IsInstructionSafeForShmemTranspose(user);
+      });
 
-  if (hlo->opcode() == HloOpcode::kFusion) {
-    return absl::c_all_of(hlo->fused_instructions_computation()->instructions(),
-                          is_safe_for_tile_based_transpose);
+    default:
+      return false;
   }
+}
 
-  return is_safe_for_tile_based_transpose(hlo);
+// Given a group of input parameters that are 0-2-1 tranpose of the outputs of
+// a fusion kernel, returns the input parameters that are safe for the shared
+// memory tranpose implementation.
+//
+// When a tile based shared memory transpose is used to implement an input with
+// 0-2-1 transpose, we preload a tile of the input elements
+// [z, y..y+31, x..x+31] to compute the output tile elements of the same
+// indices. Preloading the input tile this way is only safe when the computation
+// of the output tile elements do not need any input element outside the
+// preloaded tile. We inspect all the transitive users of the input parameter
+// up to the fusion root instruction to see if we can find any instruction
+// that can make preloading the input tile unsafe.
+std::vector<int64> FilterInputsForShmemTranspose(const HloInstruction* fusion,
+                                                 std::vector<int64> input_ids) {
+  std::vector<int64> filtered_input_ids;
+  for (int64 i = 0; i < input_ids.size(); ++i) {
+    const HloInstruction* input = fusion->fused_parameter(input_ids[i]);
+    if (IsInstructionSafeForShmemTranspose(input)) {
+      filtered_input_ids.push_back(input_ids[i]);
+    } else {
+      VLOG(10) << "Input not safe for shmem transpose " << input->ToString()
+               << "\n";
+    }
+  }
+  return filtered_input_ids;
 }
+
 }  // namespace
 
 bool IrEmitterUnnested::CheckAndEmitHloWithTile021(HloInstruction* hlo) {
@@ -3764,8 +3325,11 @@ bool IrEmitterUnnested::CheckAndEmitHloWithTile021(HloInstruction* hlo) {
     return false;
   }
 
-  if (!IsInstructionSafeForTileBasedTranspose(hlo)) {
-    return false;
+  if (opcode == HloOpcode::kFusion) {
+    params_012 = FilterInputsForShmemTranspose(hlo, params_012);
+    if (params_012.empty()) {
+      return false;
+    }
   }
 
   // Each of our shared memory tiles has 32*33 elements (so ~4kb, if the
@@ -3814,6 +3378,350 @@ bool IrEmitterUnnested::CheckAndEmitHloWithTile021(HloInstruction* hlo) {
   return true;
 }
 
+namespace {
+// Checks that the outputs of a fusion with reduction are consistent.
+Status AreFusedReductionOutputsConsistent(
+    absl::Span<HloInstruction* const> output_instructions,
+    const HloInstruction* first_reduce) {
+  for (const HloInstruction* inst : output_instructions) {
+    if (inst->opcode() == HloOpcode::kReduce) {
+      // Shapes, layouts and dimensions must be the same for all reduces
+      // inside of this fusion.
+      TF_RET_CHECK(ShapeUtil::Equal(first_reduce->shape(), inst->shape()));
+      TF_RET_CHECK(ShapeUtil::Equal(first_reduce->operand(0)->shape(),
+                                    inst->operand(0)->shape()));
+      TF_RET_CHECK(ShapeUtil::Equal(first_reduce->operand(1)->shape(),
+                                    inst->operand(1)->shape()));
+      TF_RET_CHECK(first_reduce->dimensions() == inst->dimensions());
+    } else {
+      // For extra outputs we can relax shape equality to allow different
+      // types (with the same number of elements). Layouts still have to
+      // match.
+      TF_RET_CHECK(ShapeUtil::CompatibleIgnoringElementType(
+          first_reduce->operand(0)->shape(), inst->shape()));
+      TF_RET_CHECK(LayoutUtil::Equal(first_reduce->operand(0)->shape().layout(),
+                                     inst->shape().layout()));
+    }
+  }
+  return Status::OK();
+}
+
+// Finds the dimensions to keep for the reduction, sorts and returns the
+// dimensions from minor to major.
+DimensionVector GetDimensionsToKeepMinorToMajor(
+    const Shape& input_shape, absl::Span<const int64> dims_to_reduce) {
+  DimensionVector input_dims(input_shape.rank(), 0);
+  absl::c_iota(input_dims, 0);
+  DimensionVector input_dims_to_keep;
+  for (int input_dim : input_dims) {
+    auto it = absl::c_find_if(dims_to_reduce, [&](int64 dim_to_reduce) {
+      return dim_to_reduce == input_dim;
+    });
+    if (it == dims_to_reduce.end()) {
+      input_dims_to_keep.push_back(input_dim);
+    }
+  }
+
+  // Sort the dimensions to keep from minor to major.
+  absl::c_sort(input_dims_to_keep, [&input_shape](int64 dim_a, int64 dim_b) {
+    return PositionInContainer(LayoutUtil::MinorToMajor(input_shape), dim_a) <
+           PositionInContainer(LayoutUtil::MinorToMajor(input_shape), dim_b);
+  });
+
+  VLOG(10) << "dims to keep minor to major"
+           << absl::StrJoin(input_dims_to_keep, ",");
+  return input_dims_to_keep;
+}
+
+// Given the input shape and dimensions to reduce for the reduction to vector,
+// returns <num_reduced_major, num_kept, num_reduced_minor>:
+// num_kept: the number of elements in the contiguous dimensions to keep.
+// num_reduced_major: the number of elements in the dimensions to reduce that
+//   are more major than the dimensions to keep.
+// num_reduced_minor: the number of elements in the dimensions to reduce that
+//   are more minor than the dimensions to kept.
+std::tuple<int64, int64, int64> GetReductionToVectorDimensions(
+    const Shape& input_shape, absl::Span<const int64> dims_to_reduce) {
+  DimensionVector input_dims_to_keep_minor_to_major =
+      GetDimensionsToKeepMinorToMajor(input_shape, dims_to_reduce);
+  CHECK(LayoutUtil::AreDimensionsConsecutive(
+      input_shape.layout(), input_dims_to_keep_minor_to_major));
+  int num_reduced_major = 1, num_kept = 1, num_reduced_minor = 1;
+  if (input_dims_to_keep_minor_to_major.empty()) {
+    return std::make_tuple(num_reduced_major, num_kept, num_reduced_minor);
+  }
+  DimensionVector input_dims(input_shape.rank(), 0);
+  absl::c_iota(input_dims, 0);
+  absl::Span<const int64> minor_to_major =
+      LayoutUtil::MinorToMajor(input_shape);
+  for (int input_dim : input_dims) {
+    int64 curr_dim_size = input_shape.dimensions(input_dim);
+    if (PositionInContainer(minor_to_major, input_dim) >
+        PositionInContainer(minor_to_major,
+                            input_dims_to_keep_minor_to_major.back())) {
+      num_reduced_major *= curr_dim_size;
+    } else if (PositionInContainer(minor_to_major, input_dim) <
+               PositionInContainer(minor_to_major,
+                                   input_dims_to_keep_minor_to_major.front())) {
+      num_reduced_minor *= curr_dim_size;
+    } else {
+      num_kept *= curr_dim_size;
+    }
+  }
+
+  return std::make_tuple(num_reduced_major, num_kept, num_reduced_minor);
+}
+
+// Returns true if all the transitive users of hlo before hitting users in
+// use_chain_endings are elementwise operations.
+bool AreUsersElementwise(const HloInstruction* hlo,
+                         const ConstHloInstructionSet& use_chain_endings) {
+  return absl::c_all_of(hlo->users(), [&](const HloInstruction* user) {
+    return use_chain_endings.count(user) ||
+           (user->IsElementwise() &&
+            AreUsersElementwise(user, use_chain_endings));
+  });
+}
+
+// Returns the number of fusion inputs that have the same dimension as the
+// given shape, and involve in only elementwise operations.
+int64 NumInputsInvolveInOnlyElementwiseOps(
+    const HloInstruction* unnested_hlo, const Shape& op_shape,
+    const ConstHloInstructionSet& use_chain_endings) {
+  return absl::c_count_if(
+      unnested_hlo->fused_parameters(), [&](const HloInstruction* parameter) {
+        const Shape& parameter_shape = parameter->shape();
+        return ShapeUtil::SameDimensions(op_shape, parameter_shape) &&
+               AreUsersElementwise(parameter, use_chain_endings);
+      });
+}
+
+// Returns the number of fusion inputs that have more elements than the given
+// shape.
+int64 NumInputsWithMoreElementsThan(const HloInstruction* unnested_hlo,
+                                    const Shape& shape) {
+  int64 num_elements = ShapeUtil::ElementsIn(shape);
+  return absl::c_count_if(
+      unnested_hlo->fused_parameters(), [&](const HloInstruction* parameter) {
+        return ShapeUtil::ElementsIn(parameter->shape()) > num_elements;
+      });
+}
+
+// The benefit of unrolling a kInput fusion that is a column reduction comes
+// from the vectorization of non-reduction fusion outputs and fusion inputs.
+// On the other hand, unrolling can also introduce factors that can cause
+// the kernel to run slower. This routine uses a simple heuristic to estimate
+// the benefit as well as the overhead of unrolling in order to decide whether
+// unrolling is beneficial for the given kInput fusion.
+bool IsUnrollingColumnReductionBeneficial(const HloInstruction* unnested_hlo,
+                                          const Shape& input_shape,
+                                          int64 num_kept) {
+  // TODO(b/122468062): Need further investigate to see whether we can
+  // remove the constraint on IsPowerOfTwo.
+  if (!IsPowerOfTwo(static_cast<uint64>(num_kept))) {
+    return false;
+  }
+
+  if (unnested_hlo->opcode() == HloOpcode::kReduce) {
+    return true;
+  }
+
+  CHECK_EQ(unnested_hlo->opcode(), HloOpcode::kFusion);
+  int64 can_be_vectorized = 0;
+  int64 cannot_be_vectorized = 0;
+  const HloInstruction* fused_root = unnested_hlo->fused_expression_root();
+  ConstHloInstructionSet use_chain_endings;
+  if (fused_root->opcode() == HloOpcode::kReduce) {
+    use_chain_endings.insert(fused_root);
+    // Atomic.add of the reduction result can't be vectorized.
+    cannot_be_vectorized++;
+  } else {
+    CHECK_EQ(fused_root->opcode(), HloOpcode::kTuple);
+    for (const HloInstruction* instr : fused_root->operands()) {
+      if (instr->opcode() == HloOpcode::kReduce) {
+        // Atomic.add of the reduction result can't be vectorized.
+        cannot_be_vectorized++;
+      } else {
+        // Write of the non-reduction result can be vectorized.
+        can_be_vectorized++;
+      }
+      use_chain_endings.insert(instr);
+    }
+  }
+  // Fusion inputs that have the same dimension as the reduce input and
+  // only involve in elementwise operations can be vectorized.
+  can_be_vectorized += NumInputsInvolveInOnlyElementwiseOps(
+      unnested_hlo, input_shape, use_chain_endings);
+  // Fusion inputs with more elements than the reduce op input must participate
+  // in non-elementwise operations and we assume that they are not vectorizable
+  // for the purpose of estimating the benefit of unrolling. If the kernel is
+  // unrolled even with such an assumption,  and the accesses to those inputs
+  // turn out to be vectorizable, the compiler will still vectorize them.
+  cannot_be_vectorized +=
+      NumInputsWithMoreElementsThan(unnested_hlo, input_shape);
+  return can_be_vectorized >= cannot_be_vectorized;
+}
+
+}  // namespace
+
+std::tuple<KernelMappingScheme, bool>
+IrEmitterUnnested::ComputeMappingSchemeAndReductionKind(
+    const HloInstruction* unnested_hlo, const HloInstruction* first_reduce) {
+  int64 depth = 1;
+  int64 height = 1;
+  int64 width = 1;
+  bool is_row_reduction = true;
+  int64 tile_size_x = 1;
+  int64 tile_size_y = 1;
+  int64 block_size_z = 1;
+  int64 num_threads_x = 1;
+  int64 num_threads_y = 1;
+  const Shape& input_shape = first_reduce->operand(0)->shape();
+  int64 num_input_elems = ShapeUtil::ElementsIn(input_shape);
+  int64 num_output_elems = ShapeUtil::ElementsIn(first_reduce->shape());
+  int64 num_reduced_major, num_kept, num_reduced_minor;
+  std::tie(num_reduced_major, num_kept, num_reduced_minor) =
+      GetReductionToVectorDimensions(input_shape, first_reduce->dimensions());
+  CHECK_EQ(num_output_elems, num_kept);
+  bool dilated_x = true;
+
+  if (num_kept == 1) {
+    // Scalar reduction is a special row reduction with depth = height = 1.
+    width = num_input_elems;
+    tile_size_x = kWarpSize * 16;
+    num_threads_x = kWarpSize;
+  } else if (num_reduced_minor == 1) {
+    // Column reduction reduces inputs with dimension [height, width], where
+    // width is the minor dimension, to dimension [width].
+    height = num_reduced_major;
+    width = num_kept;
+    is_row_reduction = false;
+    // Column reduction without transpose doesn't require communication among
+    // threads processing elements in the same tile. The current implementation
+    // only support the use of one hardware thread block to process one block of
+    // tiles in the KernelMappingScheme. We try to use one thread to compute
+    // the partial results for two tensor elements and to maximize the values of
+    // num_threads_x and tile_size_x to allow a bigger hardware thread block.
+    int64 hw_threads_per_block_limit =
+        ThreadsPerBlockLimit(ir_emitter_context_->device_description());
+    if (IsUnrollingColumnReductionBeneficial(unnested_hlo, input_shape,
+                                             num_kept)) {
+      tile_size_x = std::min(2 * hw_threads_per_block_limit, num_kept);
+      num_threads_x = tile_size_x / 2;
+      dilated_x = false;
+    } else {
+      tile_size_x = std::min(hw_threads_per_block_limit, num_kept);
+      num_threads_x = tile_size_x;
+    }
+    int64 kNumElementsPerPartialSum = 128;
+    tile_size_y = kNumElementsPerPartialSum;
+  } else {
+    // Row reduction reduces inputs with dimension [depth, height, width],
+    // where width is the most minor dimension, to dimension [height] .
+    depth = num_reduced_major;
+    height = num_kept;
+    width = num_reduced_minor;
+    num_threads_x = kWarpSize;
+    if (width % (kWarpSize * 64) == 0) {
+      tile_size_x = kWarpSize * 64;
+    } else {
+      tile_size_x = kWarpSize * 8;
+      block_size_z = 8;
+      while (depth % block_size_z != 0) {
+        block_size_z -= 1;
+      }
+    }
+  }
+  DCHECK_EQ(depth * height * width, num_input_elems);
+  VLOG(10) << "is_row_reduction " << is_row_reduction << depth << " " << height
+           << " " << width;
+
+  DimensionVector dims_in_elem{depth, height, width};
+  DimensionVector req_block_sizes{block_size_z, 1, 1};
+  llvm_ir::KernelMappingScheme mapping_scheme(
+      dims_in_elem, tile_size_y, tile_size_x, req_block_sizes, num_threads_y,
+      num_threads_x, &b_);
+  mapping_scheme.SetDilatedX(dilated_x);
+  return std::make_tuple(mapping_scheme, is_row_reduction);
+}
+
+Status IrEmitterUnnested::EmitReductionToVector(HloInstruction* unnested_hlo) {
+  VLOG(10) << "Emitting reduction to vector " << unnested_hlo->ToString();
+
+  HloInstruction* reduce_or_tuple = unnested_hlo->opcode() == HloOpcode::kFusion
+                                        ? unnested_hlo->fused_expression_root()
+                                        : unnested_hlo;
+  absl::Span<HloInstruction* const> output_instructions =
+      GetOutputInstructions(&reduce_or_tuple);
+  const HloInstruction* first_reduce =
+      GetFirstReduceInstruction(output_instructions);
+
+  if (output_instructions.size() > 1) {
+    TF_RETURN_IF_ERROR(
+        AreFusedReductionOutputsConsistent(output_instructions, first_reduce));
+  }
+
+  // Build an initializer thunk to initialize each reduction output.
+  std::vector<std::unique_ptr<Thunk>> thunks;
+  for (int i = 0, e = output_instructions.size(); i != e; ++i) {
+    if (output_instructions[i]->opcode() != HloOpcode::kReduce) {
+      continue;
+    }
+    TF_ASSIGN_OR_RETURN(
+        std::unique_ptr<Thunk> initializer_thunk,
+        BuildInitializerThunk(unnested_hlo,
+                              (output_instructions[i] == reduce_or_tuple)
+                                  ? ShapeIndex()
+                                  : ShapeIndex({i})));
+    thunks.push_back(std::move(initializer_thunk));
+  }
+
+  // Build a kernel thunk to compute all the outputs.
+  std::unique_ptr<KernelThunk> kernel_thunk =
+      BuildKernelThunk(unnested_hlo, /*implements_whole_instruction=*/false);
+
+  const Shape& input_shape = first_reduce->operand(0)->shape();
+  // The layout of a reduction input is either set by LayoutAssignment for
+  // unnested kReduce or by InstructionFusion for fused kReduce.
+  CHECK(input_shape.has_layout()) << "LayoutAssignment or InstructionFusion "
+                                     "doesn't set the input layout of "
+                                  << first_reduce->ToString();
+
+  bool is_row_reduction;
+  llvm_ir::KernelMappingScheme mapping_scheme;
+  std::tie(mapping_scheme, is_row_reduction) =
+      ComputeMappingSchemeAndReductionKind(unnested_hlo, first_reduce);
+  ReductionCodegenInfo reduction_info(&mapping_scheme, is_row_reduction);
+  KernelCodeGenerator kernel_generator(
+      /*tile_element_generator=*/
+      [&](HloInstruction* hlo, const llvm_ir::IrArray::Index& index,
+          const KernelCodegenInfo* kernel_info, llvm::Value* y_loc,
+          llvm::Value* x_loc, int64 x_iter_num) {
+        EmitTileElementForReduction(hlo, index, kernel_info, y_loc, x_loc,
+                                    x_iter_num);
+      },
+      /*block_prologue_generator=*/
+      [&](HloInstruction* hlo, KernelCodegenInfo* kernel_info) {
+        EmitPrologueForReduction(hlo, kernel_info);
+      },
+      /*block_epilogue_generator*/
+      [&](HloInstruction* hlo, KernelCodegenInfo* kernel_info) {
+        EmitEpilogueForReduction(hlo, kernel_info);
+      });
+
+  LaunchDimensions launch_dimensions =
+      EmitKernel(unnested_hlo, {}, kernel_generator, &reduction_info);
+  UpdateLaunchDimensions(launch_dimensions, kernel_thunk.get(),
+                         ir_emitter_context_->llvm_module());
+
+  thunks.push_back(std::move(kernel_thunk));
+  std::unique_ptr<SequentialThunk> sequential_thunk =
+      absl::make_unique<SequentialThunk>(std::move(thunks), unnested_hlo);
+  AddThunkToThunkSequence(std::move(sequential_thunk));
+
+  return Status::OK();
+}
+
 Status IrEmitterUnnested::EmitConstantGlobals() {
   for (const BufferAllocation& allocation :
        ir_emitter_context_->buffer_assignment().Allocations()) {
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
index e09ed657a812be6ab4859a0e365a51c45a37bfed..f85e18bbf0798ef3d5b87e81d287d8aed691dfc4 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_EMITTER_UNNESTED_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_EMITTER_UNNESTED_H_
 
+#include "absl/container/inlined_vector.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emitter.h"
 #include "tensorflow/compiler/xla/service/gpu/sequential_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/thunk.h"
@@ -68,11 +69,13 @@ class IrEmitterUnnested : public IrEmitter {
     explicit KernelCodegenInfo(llvm_ir::KernelMappingScheme* mapping_scheme)
         : mapping_scheme_(mapping_scheme),
           tiled_param_info_(nullptr),
-          lane_id_(nullptr) {}
+          lane_id_(nullptr),
+          index_ty_(nullptr) {}
+    virtual ~KernelCodegenInfo() {}
 
     void SetLaneId(llvm::Value* v) { lane_id_ = v; }
+    void SetIndexType(llvm::Type* t) { index_ty_ = t; }
     void SetTiledParamInfo(llvm_ir::TiledParameterInfo* tiled_param_info) {
-      CHECK_EQ(tiled_param_info_, nullptr);
       tiled_param_info_ = tiled_param_info;
     }
 
@@ -83,11 +86,13 @@ class IrEmitterUnnested : public IrEmitter {
     llvm_ir::TiledParameterInfo* GetTiledParameterInfo() const {
       return tiled_param_info_;
     }
+    llvm::Type* GetIndexType() const { return index_ty_; }
 
-   private:
+   protected:
     llvm_ir::KernelMappingScheme* mapping_scheme_;
     llvm_ir::TiledParameterInfo* tiled_param_info_;
     llvm::Value* lane_id_;
+    llvm::Type* index_ty_;
   };
 
   // A function object to prepare for the code generation for a tile block.
@@ -103,10 +108,12 @@ class IrEmitterUnnested : public IrEmitter {
   // y_loc: The y coordinate within a tile.
   // x_loc: The x coordinate within a tile.
   // kernel_info: Other information to support the kernel code generation.
+  // x_iter_num: When a thread process N elements in the X dimension, x_iter_num
+  //             has a value of 0..N-1 to identify the element being process.
   using TileElementGenerator = std::function<void(
       HloInstruction* hlo, const llvm_ir::IrArray::Index& index,
       const KernelCodegenInfo* kernel_info, llvm::Value* y_loc,
-      llvm::Value* x_loc)>;
+      llvm::Value* x_loc, int64 x_iter_num)>;
 
   // KernelCodeGenerator records the code generator objects that generate code
   // for tile elements or tile block prologue/epilogue.
@@ -169,8 +176,9 @@ class IrEmitterUnnested : public IrEmitter {
   Status HandleScatter(HloInstruction* scatter) override;
   Status HandleSelect(HloInstruction* select) override;
   Status HandleSort(HloInstruction* sort) override;
+  Status HandleTriangularSolve(HloInstruction* hlo) override;
   Status HandleTupleSelect(HloInstruction* tuple_select) override;
-  Status HandleCrossReplicaSum(HloInstruction* crs) override;
+  Status HandleAllReduce(HloInstruction* crs) override;
   Status HandleAfterAll(HloInstruction* after_all) override;
 
   Status EmitTargetElementLoop(
@@ -200,82 +208,23 @@ class IrEmitterUnnested : public IrEmitter {
 
   // Helper for writing extra outputs from inside a reduce kernel.
   Status EmitExtraOutputsForReduce(
-      const HloInstruction* reduce, const llvm_ir::IrArray::Index& index,
+      const HloInstruction* unnested_hlo, const llvm_ir::IrArray::Index& index,
       absl::Span<const std::pair<llvm_ir::ElementGenerator, ShapeIndex>>
           extra_output_gens);
 
-  // EmitColumnReduction and EmitRowReduction emit code for column and row
-  // reduction of a matrix and/or 3D tensor. Row and column reduction have
-  // different memory access pattern, so for performance their implementations
-  // are significantly different.
+  // Generates code for reduction to contiguous dimensions.
   //
-  // Emits code that reduces a matrix of shape [height x width] to a vector of
-  // [width]. Other parameters have the same meaning as those of
-  // `EmitReductionToVector`. Note that input shape might not be
-  // [height x width], but can be bitcast to [height x width] with "height"
-  // being the major dimension.
-  Status EmitColumnReduction(
-      KernelThunk* kernel_thunk, int64 height, int64 width,
-      HloInstruction* reduce, const Shape& input_shape,
-      absl::Span<const llvm_ir::ElementGenerator> input_gens,
-      absl::Span<const llvm_ir::ElementGenerator> init_value_gens,
-      absl::Span<HloComputation* const> reducers,
-      absl::Span<const ShapeIndex> reduce_output_shapes,
-      absl::Span<const std::pair<llvm_ir::ElementGenerator, ShapeIndex>>
-          extra_output_gens);
-
-  // Emits code that reduces a 3D tensor of shape [depth x height x width] to a
-  // vector of shape [height]. Other parameters have the same meaning as those
-  // of `EmitReductionToVector`. Note that input shape might not be
-  // [depth x height x width], but can be bitcast to [depth x height x width]
-  // with "depth" being the most major dimension.
-  Status EmitRowReduction(
-      KernelThunk* kernel_thunk, int64 depth, int64 height, int64 width,
-      HloInstruction* reduce, const Shape& input_shape,
-      absl::Span<const llvm_ir::ElementGenerator> input_gens,
-      absl::Span<const llvm_ir::ElementGenerator> init_value_gens,
-      absl::Span<HloComputation* const> reducers,
-      absl::Span<const ShapeIndex> reduce_output_shapes,
-      absl::Span<const std::pair<llvm_ir::ElementGenerator, ShapeIndex>>
-          extra_output_gens);
-
-  // Emits code that reduces a tensor of arbitrary rank to a scalar.
-  Status EmitReductionToScalar(
-      KernelThunk* kernel_thunk, HloInstruction* reduce,
-      const Shape& input_shape,
-      absl::Span<const llvm_ir::ElementGenerator> input_gens,
-      absl::Span<const llvm_ir::ElementGenerator> init_value_gens,
-      absl::Span<HloComputation* const> reducers,
-      absl::Span<const ShapeIndex> reduce_output_shapes,
-      absl::Span<const std::pair<llvm_ir::ElementGenerator, ShapeIndex>>
-          extra_output_gens);
-
-  // Figures out whether `reduce` is a row or column reduction, and which
-  // dimensions to reduce, and calls either `EmitRowReduction` or
-  // `EmitColumnReduction` as appropriate. `input_shape` is the shape of the
-  // input array, which is the operand of the Reduce instruction if unfused or
-  // of the Fusion instruction if fused. `input_gen` and `init_value_gen`
-  // generate elements of the input and the initial value. Other parameters mean
-  // the same as for `HandleReduce`.
-  //
-  // Multiple reduces can be emitted in the same loop, assuming they have the
-  // same input and output shapes, and the same reduce dimensions.
-  //
-  // extra_output_gens can contain extra generators for intermediate outputs.
-  // These must have the same shape as the reduce input as they are computed
-  // when the reduce inputs are being read.
-  //
-  // Prerequisite: `IsReductionToVector(*reduce)`
-  Status EmitReductionToVector(
-      KernelThunk* kernel_thunk, HloInstruction* reduce,
-      const Shape& input_shape,
-      absl::Span<const llvm_ir::ElementGenerator> input_gens,
-      absl::Span<const llvm_ir::ElementGenerator> init_value_gens,
-      absl::Span<const int64> dimensions_to_reduce,
-      absl::Span<HloComputation* const> reducers,
-      absl::Span<const ShapeIndex> reduce_output_shapes,
-      absl::Span<const std::pair<llvm_ir::ElementGenerator, ShapeIndex>>
-          extra_output_gens);
+  // Prerequisite: `IsReductionToVector(*unnested_hlo)`
+  Status EmitReductionToVector(HloInstruction* unnested_hlo);
+
+  // Computes the KernelMappingScheme for the reduce HLO and indicates whether
+  // the reduction is a row reduction. For an un-fused reduce op, unnested_hlo
+  // and first_reduce are the same instruction. For a kInput fusion,
+  // unnested_hlo is the fusion instruction while first_reduce is the first
+  // reduce op.
+  std::tuple<llvm_ir::KernelMappingScheme, bool>
+  ComputeMappingSchemeAndReductionKind(const HloInstruction* unnested_hlo,
+                                       const HloInstruction* first_reduce);
 
   // Emits code for an in-place scatter, modifying `thunk`s launch dimensions in
   // the process. `scatter` may be fused, scatter indices are taken from
@@ -300,20 +249,45 @@ class IrEmitterUnnested : public IrEmitter {
                               const KernelCodeGenerator& kernel_generator,
                               KernelCodegenInfo* kernel_info);
   void EmitBlock(const TileGenerator& emit_one_tile,
-                 const KernelCodegenInfo* kernel_info,
-                 KernelSupportLibrary& ksl, llvm::Type* index_ty);
+                 KernelCodegenInfo* kernel_info, KernelSupportLibrary* ksl,
+                 llvm::Type* index_ty);
   // Emits code to process a tensor element in a tile for the given kCopy HLO
   // that performs a 0-2-1 transpose.
   void EmitTileElementForCopy(HloInstruction* hlo,
                               const llvm_ir::IrArray::Index& index,
                               const KernelCodegenInfo* kernel_info,
-                              llvm::Value* y_loc, llvm::Value* x_loc);
+                              llvm::Value* y_loc, llvm::Value* x_loc,
+                              int64 x_iter_num);
   // Emits code to process a tensor element in a tile for the given kLoop fusion
   // HLO containing parameters that are 0-2-1 transpose of its outputs.
   void EmitTileElementForFusion(HloInstruction* hlo,
                                 const llvm_ir::IrArray::Index& index,
                                 const KernelCodegenInfo* kernel_info,
-                                llvm::Value* y_loc, llvm::Value* x_loc);
+                                llvm::Value* y_loc, llvm::Value* x_loc,
+                                int64 x_iter_num);
+  // Emits code to process a tensor element in a tile for the given input hlo
+  // that is either a unnested kReduce or a kInput fusion.
+  void EmitTileElementForReduction(HloInstruction* unnested_hlo,
+                                   const llvm_ir::IrArray::Index& index,
+                                   const KernelCodegenInfo* kernel_info,
+                                   llvm::Value* y_loc, llvm::Value* x_loc,
+                                   int64 x_iter_num);
+  // Prepares for the code generation for a tile block of a reduction kernel.
+  void EmitPrologueForReduction(HloInstruction* unnested_hlo,
+                                KernelCodegenInfo* kernel_info);
+  void EmitPrologueForOneReduction(HloInstruction* unnested_hlo,
+                                   HloInstruction* reduce_inst, int reduce_idx,
+                                   KernelCodegenInfo* kernel_info,
+                                   GpuElementalIrEmitter* elemental_emitter,
+                                   ShapeIndex output_shape_index);
+  // Wraps up the code generation for a tile block of a reduction kernel.
+  void EmitEpilogueForReduction(HloInstruction* unnested_hlo,
+                                KernelCodegenInfo* kernel_info);
+  // For each reducer, emits the shuffle-down loop to accumulate the partial
+  // result to the global result.
+  void EmitFullWarpShuffleDownLoopForAllReduces(
+      absl::Span<HloComputation* const> reducers,
+      absl::Span<llvm::AllocaInst* const> partial_result_addresses);
 
   // Generates the IrArray for each input of an hlo and returns a vector that
   // constains such IrArrays.
@@ -346,6 +320,9 @@ class IrEmitterUnnested : public IrEmitter {
   // Returns a FftThunk that calls cuFFT to implement `inst`.
   std::unique_ptr<Thunk> BuildFftThunk(const HloInstruction* inst);
 
+  // Returns a TriangularSolveThunk that calls cuBlas to implement `inst`.
+  std::unique_ptr<Thunk> BuildTriangularSolveThunk(const HloInstruction* inst);
+
   // Returns a GemmThunk that calls gemm to implement `inst`. The caller needs
   // to make sure `inst` outlives the lifetime of the returned Thunk object.
   std::unique_ptr<Thunk> BuildGemmThunk(const HloInstruction* inst);
diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc
index bd53b90b42d8e657a3ee58e7ca03fb60522aae28..153aab97d9eb971734c5ea95564895631bc2a9fa 100644
--- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc
+++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc
@@ -110,11 +110,9 @@ static string GetLibdeviceFilename(const string& libdevice_dir_path,
 }
 
 // Gets the GPU name as it's known to LLVM for a given compute capability.  If
-// we see an unrecognized compute capability, we return "sm_30".
+// we see an unrecognized compute capability, we return "sm_35".
 static string GetSmName(std::pair<int, int> compute_capability) {
   static auto* m = new std::map<std::pair<int, int>, int>({
-      {{3, 0}, 30},
-      {{3, 2}, 32},
       {{3, 5}, 35},
       {{3, 7}, 37},
       {{5, 0}, 50},
@@ -125,8 +123,9 @@ static string GetSmName(std::pair<int, int> compute_capability) {
       {{6, 2}, 62},
       {{7, 0}, 70},
       {{7, 2}, 72},
+      {{7, 5}, 75},
   });
-  int sm_version = 30;
+  int sm_version = 35;
   auto it = m->find(compute_capability);
   if (it != m->end()) {
     sm_version = it->second;
diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
index 01fddcede64d1bb02ab89db5fc9524893c2d47a4..02e1207f377b8c28bf2566bee8cf3bcbc66794fb 100644
--- a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
+++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
@@ -67,7 +67,7 @@ int64 GpuMultiOutputFusion::GetProfit(HloInstruction* instr1,
   }
   int64 profit = 0;
   for (auto instr : instr2->operands()) {
-    if (!IsProfitableOperand(instr) || in_list.count(instr) == 0) {
+    if (!IsProfitableOperand(instr) || !in_list.contains(instr)) {
       continue;
     }
     profit += ShapeUtil::ByteSizeOf(instr->shape());
diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
index d16c87ba5c63aa582753fe949e9e39ee2d8b81e5..40b87b16a195564c9b98497f79a70f1db0539d87 100644
--- a/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
@@ -628,8 +628,7 @@ TEST_F(MultiOutputFusionTest, MultiOutputFusionDUS) {
       p.1 = s32[1]{0} parameter(1)
       p.2 = f16[1,96,1024]{2,1,0} parameter(2)
       c.0 = s32[] constant(0)
-      pad = s32[3]{0} pad(p.1, c.0), padding=0_2
-      ROOT %dynamic-update-slice = f16[50,96,1024]{2,1,0} dynamic-update-slice(p.0, p.2, pad)
+      ROOT %dynamic-update-slice = f16[50,96,1024]{2,1,0} dynamic-update-slice(p.0, p.2, p.1, c.0, c.0)
     }
 
     fusion.2 {
@@ -638,7 +637,7 @@ TEST_F(MultiOutputFusionTest, MultiOutputFusionDUS) {
       p.2 = f16[1,96,1024]{2,1,0} parameter(2)
       c.0 = s32[] constant(0)
       pad = s32[3]{0} pad(p.1, c.0), padding=0_2
-      ROOT %dynamic-update-slice = f16[50,96,1024]{2,1,0} dynamic-update-slice(p.0, p.2, pad)
+      ROOT %dynamic-update-slice = f16[50,96,1024]{2,1,0} dynamic-update-slice(p.0, p.2, p.1, c.0, c.0)
     }
 
     ENTRY entry {
diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
index f3e17d888242a36c268dcbfa0d6530f80cedceb0..6e00e4b4ff8c493f00fae3355215fb13fb5f4f10 100644
--- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
@@ -36,6 +36,9 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/buffer_liveness.h"
 #include "tensorflow/compiler/xla/service/call_inliner.h"
 #include "tensorflow/compiler/xla/service/conditional_simplifier.h"
+#include "tensorflow/compiler/xla/service/convolution_group_converter.h"
+#include "tensorflow/compiler/xla/service/dot_decomposer.h"
+#include "tensorflow/compiler/xla/service/dynamic_index_splitter.h"
 #include "tensorflow/compiler/xla/service/flatten_call_graph.h"
 #include "tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_rewriter.h"
 #include "tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h"
@@ -50,6 +53,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_hlo_support_checker.h"
 #include "tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h"
+#include "tensorflow/compiler/xla/service/gpu/gpu_sanitize_constant_names.h"
 #include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emitter_context.h"
@@ -77,6 +81,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 #include "tensorflow/compiler/xla/service/reduce_precision_insertion.h"
 #include "tensorflow/compiler/xla/service/reshape_mover.h"
+#include "tensorflow/compiler/xla/service/sort_simplifier.h"
+#include "tensorflow/compiler/xla/service/stable_sort_expander.h"
 #include "tensorflow/compiler/xla/service/transpose_folding.h"
 #include "tensorflow/compiler/xla/service/tuple_simplifier.h"
 #include "tensorflow/compiler/xla/service/while_loop_constant_sinking.h"
@@ -108,29 +114,58 @@ namespace {
 
 namespace tracing = tensorflow::tracing;
 
-// Returns the directory containing nvvm libdevice files.  config_cuda_data_dir
-// should be equal to config().debug_options().xla_gpu_cuda_data_dir() of the
-// HloModule being compiled.
-string GetLibdeviceDir(const string& config_cuda_data_dir) {
-  std::vector<string> potential_libdevice_dirs;
-  if (!config_cuda_data_dir.empty()) {
-    potential_libdevice_dirs.push_back(config_cuda_data_dir);
-  }
-  potential_libdevice_dirs.push_back(tensorflow::LibdeviceRoot());
-
-  // Tries all potential libdevice directories in the order they are inserted.
-  // Returns the first directory that exists in the file system.
-  for (const string& potential_libdevice_dir : potential_libdevice_dirs) {
-    if (tensorflow::Env::Default()->IsDirectory(potential_libdevice_dir).ok()) {
-      VLOG(2) << "Found libdevice dir " << potential_libdevice_dir;
-      return potential_libdevice_dir;
+// Returns a vector of potential locations of the CUDA root directory.
+std::vector<string> GetCudaRootCandidates(
+    const HloModuleConfig& hlo_module_config) {
+  std::vector<string> potential_cuda_roots = tensorflow::CandidateCudaRoots();
+
+  // "." is our last resort, even though it probably won't work.
+  potential_cuda_roots.push_back(".");
+
+  // CUDA location explicitly specified by user via --xla_gpu_cuda_data_dir has
+  // highest priority.
+  string xla_gpu_cuda_data_dir =
+      hlo_module_config.debug_options().xla_gpu_cuda_data_dir();
+  if (!xla_gpu_cuda_data_dir.empty()) {
+    potential_cuda_roots.insert(potential_cuda_roots.begin(),
+                                xla_gpu_cuda_data_dir);
+  }
+  return potential_cuda_roots;
+}
+
+void PrintCantFindCudaMessage(absl::string_view msg,
+                              const HloModuleConfig& hlo_module_config) {
+  LOG(WARNING) << msg;
+  LOG(WARNING) << "Searched in the following directories:";
+  for (const auto& dir : GetCudaRootCandidates(hlo_module_config)) {
+    LOG(WARNING) << "  " << dir;
+  }
+  LOG(WARNING)
+      << "You can choose the search directory by setting xla_gpu_cuda_data_dir "
+         "in HloModule's DebugOptions.  For most apps, setting the environment "
+         "variable XLA_FLAGS=--xla_gpu_cuda_data_dir=/path/to/cuda will work.";
+}
+
+// Returns the directory containing nvvm libdevice files.
+string GetLibdeviceDir(const HloModuleConfig& hlo_module_config) {
+  const auto& candidate_dirs = GetCudaRootCandidates(hlo_module_config);
+  for (const string& cuda_root : candidate_dirs) {
+    string libdevice_dir =
+        tensorflow::io::JoinPath(cuda_root, "nvvm", "libdevice");
+    VLOG(2) << "Looking for libdevice at " << libdevice_dir;
+    if (tensorflow::Env::Default()->IsDirectory(libdevice_dir).ok()) {
+      VLOG(2) << "Found libdevice dir " << libdevice_dir;
+      return libdevice_dir;
     }
-    VLOG(2) << "Unable to find potential libdevice dir "
-            << potential_libdevice_dir;
   }
+  PrintCantFindCudaMessage(
+      "Can't find directory containing CUDA libevice.  This may result in "
+      "compilation or runtime failures, if the program we try to run uses "
+      "routines from libdevice.",
+      hlo_module_config);
 
-  LOG(WARNING) << "Unable to find libdevice dir. Using '.'";
-  // Last resort: maybe in the current folder.
+  // GetCudaRotCandidates always inclues ".", but but if everything fails, we
+  // return it anyway.  Better than returning the empty string.
   return ".";
 }
 
@@ -145,6 +180,7 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
     HloPassPipeline pipeline("optimization");
     pipeline.AddInvariantChecker<HloVerifier>(/*layout_sensitive=*/false,
                                               /*allow_mixed_precision=*/false);
+    pipeline.AddPass<DynamicIndexSplitter>();
     pipeline.AddPass<GpuHloSupportChecker>();
     ReducePrecisionInsertion::AddPasses(
         &pipeline, hlo_module->config().debug_options(),
@@ -152,6 +188,16 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
 
     // TODO(b/64094172): make Call work on GPU instead of inlining.
     pipeline.AddPass<CallInliner>();
+    auto cost_model = [](HloInstruction* conv) {
+      // We need a cost model for GPUs. Currently, do nothing.
+      return false;
+    };
+    pipeline.AddPass<DotDecomposer>(false);
+    pipeline.AddPass<ConvolutionGroupConverter>(
+        cost_model,
+        /*convert_batch_groups_only=*/true);
+    // Expand the sort op to support stable sorting if required.
+    pipeline.AddPass<StableSortExpander>();
     // Convert BF16 operations to F32 operations so that the GPU backend can
     // support BF16 operations without directly implementing a BF16 lowering for
     // most ops.
@@ -180,10 +226,9 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
       // elimination has to come after that pass.
       pipeline.AddPass<ZeroSizedHloElimination>();
 
-      AlgebraicSimplifierOptions options(
-          [](const Shape&, const Shape&) { return false; });
-      options.set_enable_permutation_sort_replacement(true);
+      AlgebraicSimplifierOptions options;
       pass.AddPass<AlgebraicSimplifier>(options);
+      pass.AddPass<SortSimplifier>();
       pass.AddPass<TupleSimplifier>();
       pass.AddPass<WhileLoopConstantSinking>();
       pass.AddPass<WhileLoopSimplifier>();
@@ -252,12 +297,8 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
 
     // The LayoutAssignment pass may leave behind kCopy instructions which are
     // duplicate or NOPs, so remove them with algebraic simplification and CSE.
-    AlgebraicSimplifierOptions options(
-        /*valid_bitcast_callback=*/[](const Shape&, const Shape&) {
-          return true;
-        });
+    AlgebraicSimplifierOptions options;
     options.set_is_layout_sensitive(true);
-    options.set_enable_permutation_sort_replacement(true);
     pipeline.AddPass<HloPassFix<AlgebraicSimplifier>>(options);
 
     // Choose the fastest algorithm for each conv.
@@ -361,6 +402,7 @@ Status PrepareHloModuleForIrEmitting(HloModule* hlo_module) {
   pipeline.AddPass<HloDCE>();
   pipeline.AddPass<FlattenCallGraph>();
   pipeline.AddPass<GpuCopyInsertion>();
+  pipeline.AddPass<GpuSanitizeConstantNames>();
   return pipeline.Run(hlo_module).status();
 }
 
@@ -478,14 +520,19 @@ void WarnIfBadDriverJITVersion() {
 
 // Compiles the given PTX string using ptxas and returns the resulting machine
 // code (i.e. a cubin) as a byte array.
-StatusOr<std::vector<uint8>> CompilePtx(const string& ptx, int cc_major,
-                                        int cc_minor,
-                                        bool disable_ptx_optimizations) {
+StatusOr<std::vector<uint8>> CompilePtx(
+    const string& ptx, int cc_major, int cc_minor,
+    const HloModuleConfig& hlo_module_config) {
   tracing::ScopedActivity activity("Compile PTX", /*is_expensive=*/true);
-  const string ptxas_path =
-      tensorflow::io::JoinPath(tensorflow::CudaRoot(), "bin", "ptxas");
-  VLOG(2) << "Checking ptxas at " << ptxas_path;
   auto env = tensorflow::Env::Default();
+  string ptxas_path;
+  for (const string& cuda_root : GetCudaRootCandidates(hlo_module_config)) {
+    ptxas_path = tensorflow::io::JoinPath(cuda_root, "bin", "ptxas");
+    VLOG(2) << "Looking for ptxas at " << ptxas_path;
+    if (env->FileExists(ptxas_path).ok()) {
+      break;
+    }
+  }
   TF_RETURN_IF_ERROR(env->FileExists(ptxas_path));
   VLOG(2) << "Using ptxas at " << ptxas_path;
 
@@ -520,7 +567,7 @@ StatusOr<std::vector<uint8>> CompilePtx(const string& ptx, int cc_major,
   if (VLOG_IS_ON(2)) {
     ptxas_args.push_back("-v");
   }
-  if (disable_ptx_optimizations) {
+  if (hlo_module_config.debug_options().xla_gpu_disable_ptxas_optimizations()) {
     ptxas_args.push_back("-O0");
   }
   ptxas_info_dumper.SetProgram(ptxas_path, ptxas_args);
@@ -685,12 +732,8 @@ StatusOr<std::unique_ptr<Executable>> NVPTXCompiler::RunBackend(
     // Find the directory containing libdevice.  To avoid searching for it every
     // time, we have a one-element cache, keyed on the module's config's
     // cuda_data_dir.
-    const auto& config_cuda_data_dir =
-        module->config().debug_options().xla_gpu_cuda_data_dir();
-    if (cached_libdevice_dir_.empty() ||
-        cached_cuda_data_dir_ != config_cuda_data_dir) {
-      cached_cuda_data_dir_ = config_cuda_data_dir;
-      cached_libdevice_dir_ = GetLibdeviceDir(config_cuda_data_dir);
+    if (cached_libdevice_dir_.empty()) {
+      cached_libdevice_dir_ = GetLibdeviceDir(module->config());
     }
     libdevice_dir = cached_libdevice_dir_;
   }
@@ -743,9 +786,8 @@ StatusOr<std::unique_ptr<Executable>> NVPTXCompiler::RunBackend(
     }
   }
 
-  const std::vector<uint8> cubin = CompilePtxOrGetCachedResult(
-      ptx, cc_major, cc_minor,
-      module->config().debug_options().xla_gpu_disable_ptxas_optimizations());
+  const std::vector<uint8> cubin =
+      CompilePtxOrGetCachedResult(ptx, cc_major, cc_minor, module->config());
 
   auto thunk_schedule = absl::make_unique<ThunkSchedule>(
       ir_emitter.ConsumeThunkSequence(), std::move(stream_assignment),
@@ -756,14 +798,19 @@ StatusOr<std::unique_ptr<Executable>> NVPTXCompiler::RunBackend(
   std::unique_ptr<HloProfileIndexMap> profile_index_map;
   std::unique_ptr<HloProfilePrinterData> profile_printer;
 
-  if (module->config().hlo_profiling_enabled()) {
+  if (module->config().hlo_profiling_enabled() || VLOG_IS_ON(1)) {
     HloCostAnalysis cost_analysis(ShapeSizeBytesFunction());
     cost_analysis.set_bytes_per_second(
         stream_exec->GetDeviceDescription().memory_bandwidth());
     TF_RETURN_IF_ERROR(module->entry_computation()->Accept(&cost_analysis));
-    profile_index_map = absl::make_unique<HloProfileIndexMap>(*module);
-    profile_printer = CreateHloProfilePrinterData(
-        *profile_index_map, cost_analysis, entry_computation->name());
+    VLOG(1) << "HLO memory read+written: "
+            << tensorflow::strings::HumanReadableNumBytes(
+                   cost_analysis.bytes_accessed());
+    if (module->config().hlo_profiling_enabled()) {
+      profile_index_map = absl::make_unique<HloProfileIndexMap>(*module);
+      profile_printer = CreateHloProfilePrinterData(
+          *profile_index_map, cost_analysis, entry_computation->name());
+    }
   }
 
   auto* gpu_executable = new GpuExecutable(
@@ -779,7 +826,7 @@ StatusOr<std::unique_ptr<Executable>> NVPTXCompiler::RunBackend(
 
 std::vector<uint8> NVPTXCompiler::CompilePtxOrGetCachedResult(
     const string& ptx, int cc_major, int cc_minor,
-    bool disable_ptx_optimizations) {
+    const HloModuleConfig& hlo_module_config) {
   XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::CompilePtxOrGetCachedResult");
   tracing::ScopedActivity activity("PTX->CUBIN", /*is_expensive=*/true);
   bool inserted;
@@ -807,8 +854,8 @@ std::vector<uint8> NVPTXCompiler::CompilePtxOrGetCachedResult(
     if (inserted) {
       CHECK(!cache_value->compilation_done);
       if (!ptx.empty()) {
-        StatusOr<std::vector<uint8>> maybe_cubin = CompilePtx(
-            *cache_ptx, cc_major, cc_minor, disable_ptx_optimizations);
+        StatusOr<std::vector<uint8>> maybe_cubin =
+            CompilePtx(*cache_ptx, cc_major, cc_minor, hlo_module_config);
         if (maybe_cubin.ok()) {
           cache_value->cubin_data = std::move(maybe_cubin).ValueOrDie();
           VLOG(2) << "Compiled PTX size:" << ptx.size()
@@ -827,10 +874,11 @@ std::vector<uint8> NVPTXCompiler::CompilePtxOrGetCachedResult(
             log_warning = !warning_done.exchange(true);
           }
           if (log_warning) {
-            LOG(WARNING)
-                << "Failed to compile ptx to cubin.  Will attempt to let "
-                   "GPU driver compile the ptx. "
-                << maybe_cubin.status();
+            PrintCantFindCudaMessage(
+                "Can't find ptxas binary.  Will back to the GPU driver "
+                "for PTX -> sass compilation.  This is OK so long as you don't "
+                "see a warning below about an out-of-date driver version.",
+                hlo_module_config);
           }
 
           // We're going to use the driver to JIT our PTX->SASS, so warn if
diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h
index be5e31a50112686841e6f18b76f382a56e61bafc..b2077f42fd097330703fde063d80a20704fa48e2 100644
--- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h
+++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h
@@ -99,7 +99,7 @@ class NVPTXCompiler : public LLVMCompiler {
   // compiled cubin.  If compilation was unsuccessful, returns an empty vector.
   std::vector<uint8> CompilePtxOrGetCachedResult(
       const string& ptx, int cc_major, int cc_minor,
-      bool disable_ptx_optimizations);
+      const HloModuleConfig& hlo_module_config);
 
   // The compilation_cache_ map is a cache from {ptx string, cc_major, cc_minor}
   // -> cubin so we don't recompile the same ptx twice.  This is important for
diff --git a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc
index 8154d75d23a6d49153ccb6824402aff73f365617..cb012649200c6386d3ae25d088aa3b16bd40be82 100644
--- a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace xla {
 namespace gpu {
diff --git a/tensorflow/compiler/xla/service/gpu/partition_assignment.cc b/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
index 375f68a15957936151aee068582a714b62694af2..bfed4f5230dfe37bca48560ce83a2dd82c8950a4 100644
--- a/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
+++ b/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
@@ -39,6 +39,25 @@ std::ostream& operator<<(std::ostream& out,
   return out;
 }
 
+int64 ThreadsPerBlockLimit(const se::DeviceDescription& device_desc) {
+  int64 threads_per_block = device_desc.threads_per_block_limit();
+  if (threads_per_block == 0) {
+    static std::atomic<int64> log_count{0};
+    if (log_count.fetch_add(1) < 8) {
+      LOG(WARNING) << "Attempting to calculate launch dimensions for GPU "
+                      "without full information about its capabilities.  "
+                      "StreamExecutor's PopulateDeviceDescription should be "
+                      "updated for this device.";
+    }
+    threads_per_block = device_desc.threads_per_warp();
+    if (threads_per_block == 0) {
+      // Fall back to *something* if we can't even get num threads per warp.
+      threads_per_block = 32;
+    }
+  }
+  return threads_per_block;
+}
+
 // Calculates the launch dimensions used to invoke `hlo`.
 LaunchDimensions CalculateLaunchDimensions(
     const Shape& shape, const se::DeviceDescription& device_desc,
@@ -62,21 +81,7 @@ LaunchDimensions CalculateLaunchDimensions(
   //
   //   <num threads per block> * <max blocks per core> = <max threads per core>
 
-  int64 threads_per_block = device_desc.threads_per_block_limit();
-  if (threads_per_block == 0) {
-    static std::atomic<int64> log_count{0};
-    if (log_count.fetch_add(1) < 8) {
-      LOG(WARNING) << "Attempting to calculate launch dimensions for GPU "
-                      "without full information about its capabilities.  "
-                      "StreamExecutor's PopulateDeviceDescription should be "
-                      "updated for this device.";
-    }
-    threads_per_block = device_desc.threads_per_warp();
-    if (threads_per_block == 0) {
-      // Fall back to *something* if we can't even get num threads per warp.
-      threads_per_block = 32;
-    }
-  }
+  int64 threads_per_block = ThreadsPerBlockLimit(device_desc);
 
   if (num_elements < threads_per_block) {
     threads_per_block = num_elements;
diff --git a/tensorflow/compiler/xla/service/gpu/partition_assignment.h b/tensorflow/compiler/xla/service/gpu/partition_assignment.h
index 02471129e004b4876ce20a62cade34060c65b478..eb41dcccb938ccc088c2371def96ca73276771ab 100644
--- a/tensorflow/compiler/xla/service/gpu/partition_assignment.h
+++ b/tensorflow/compiler/xla/service/gpu/partition_assignment.h
@@ -57,6 +57,9 @@ class LaunchDimensions {
 std::ostream& operator<<(std::ostream& out,
                          const LaunchDimensions& launch_dims);
 
+// Returns the maximum number of threads per block allowed by the device.
+int64 ThreadsPerBlockLimit(const se::DeviceDescription& device_desc);
+
 LaunchDimensions CalculateLaunchDimensions(
     const Shape& shape, const se::DeviceDescription& device_desc,
     int unroll_factor = 1);
diff --git a/tensorflow/compiler/xla/service/gpu/stream_assignment.cc b/tensorflow/compiler/xla/service/gpu/stream_assignment.cc
index 4775baf44aecfe6adaf2bf0d2791595436635b16..1dedbd3befce6e2ceb06126d83a061207a90dd8f 100644
--- a/tensorflow/compiler/xla/service/gpu/stream_assignment.cc
+++ b/tensorflow/compiler/xla/service/gpu/stream_assignment.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/gpu/stream_assignment.h"
 
+#include "absl/container/flat_hash_set.h"
 #include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
@@ -25,7 +26,7 @@ namespace xla {
 namespace gpu {
 
 bool StreamAssignment::HasStreamAssigned(const HloInstruction& hlo) const {
-  return hlo_to_stream_number_.count(&hlo);
+  return hlo_to_stream_number_.contains(&hlo);
 }
 
 int StreamAssignment::StreamNumberForHlo(const HloInstruction& hlo) const {
@@ -98,10 +99,10 @@ int ComputeStreamToAssign(
   // greedy approach. First, we compute as forbidden_stream_numbers the
   // streams assigned to GEMMs that are concurrent with `hlo`. Then, we assign
   // `hlo` a different stream.
-  std::set<int> forbidden_stream_numbers;
+  absl::flat_hash_set<int> forbidden_stream_numbers;
   for (const auto* seen_gemm : seen_gemms) {
     int stream_num = stream_assignment.StreamNumberForHlo(*seen_gemm);
-    if (!forbidden_stream_numbers.count(stream_num) &&
+    if (!forbidden_stream_numbers.contains(stream_num) &&
         CanRunConcurrently(*seen_gemm, hlo, reachability)) {
       forbidden_stream_numbers.insert(stream_num);
     }
@@ -109,7 +110,7 @@ int ComputeStreamToAssign(
 
   for (int stream_num = 0; stream_num < stream_assignment.StreamCount();
        ++stream_num) {
-    if (!forbidden_stream_numbers.count(stream_num)) {
+    if (!forbidden_stream_numbers.contains(stream_num)) {
       return stream_num;
     }
   }
diff --git a/tensorflow/compiler/xla/service/gpu/stream_executor_util.h b/tensorflow/compiler/xla/service/gpu/stream_executor_util.h
index 1fc46bafa10e7ba6c896f081d5c836bd400886c9..92e4d6dbbc1bd564657f8a5de09d23d5ae81a93e 100644
--- a/tensorflow/compiler/xla/service/gpu/stream_executor_util.h
+++ b/tensorflow/compiler/xla/service/gpu/stream_executor_util.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_STREAM_EXECUTOR_UTIL_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_STREAM_EXECUTOR_UTIL_H_
 
+#include "tensorflow/compiler/xla/layout.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc
index a1ed8499040359fe7265a7317b0577a990a2234c..d33e9cf714ee3810b1fb2fa8c05c3ed399d27bfb 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc
@@ -24,7 +24,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc
index a302b582ede3723acd118d2e4a4bb3efdf7a4d0b..869724db601b2d5e4ed6d3c7bf3e10a748433146 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc
@@ -65,7 +65,7 @@ TEST_F(GpuKernelTilingTest, UnnestedTransposeWithProperDimensionsTiled) {
   CompileAndVerifyIr(std::move(hlo_module),
                      R"(
 ; CHECK-LABEL: define void @copy
-; CHECK: tail call void @llvm.nvvm.barrier0()
+; CHECK: call void @llvm.nvvm.barrier0()
 ; CHECK: }
 )",
                      /*match_optimized_ir=*/true);
@@ -91,7 +91,7 @@ TEST_F(GpuKernelTilingTest, UnnestedTransposeWithSmallDimensionsNotTiled) {
   CompileAndVerifyIr(std::move(hlo_module),
                      R"(
 ; CHECK-LABEL: define void @copy
-; CHECK-NOT: tail call void @llvm.nvvm.barrier0()
+; CHECK-NOT: call void @llvm.nvvm.barrier0()
 ; CHECK: }
 )",
                      /*match_optimized_ir=*/true);
@@ -118,7 +118,7 @@ TEST_F(GpuKernelTilingTest, SimpleFusionWithTransposeTiled) {
   CompileAndVerifyIr(std::move(hlo_module),
                      R"(
 ; CHECK-LABEL: define void @fusion
-; CHECK: tail call void @llvm.nvvm.barrier0()
+; CHECK: call void @llvm.nvvm.barrier0()
 ; CHECK: }
 )",
                      /*match_optimized_ir=*/true);
@@ -152,7 +152,7 @@ TEST_F(GpuKernelTilingTest, MultipleOutputFusionWithOnePossibleTransposeTiled) {
   CompileAndVerifyIr(std::move(hlo_module),
                      R"(
 ; CHECK-LABEL: define void @fusion
-; CHECK: tail call void @llvm.nvvm.barrier0()
+; CHECK: call void @llvm.nvvm.barrier0()
 ; CHECK: }
 )",
                      /*match_optimized_ir=*/true);
@@ -187,13 +187,13 @@ TEST_F(GpuKernelTilingTest,
   CompileAndVerifyIr(std::move(hlo_module),
                      R"(
 ; CHECK-LABEL: define void @fusion
-; CHECK-NOT: tail call void @llvm.nvvm.barrier0()
+; CHECK-NOT: call void @llvm.nvvm.barrier0()
 ; CHECK: }
 )",
                      /*match_optimized_ir=*/true);
 }
 
-TEST_F(GpuKernelTilingTest, FusionTransposeWithReverseNotTiled) {
+TEST_F(GpuKernelTilingTest, TransposedInputWithUserReverseNotTiled) {
   const char *const kHloString = R"(
     HloModule FusionTransposeWithReverseNotTiled
     fused_computation.1 {
@@ -214,12 +214,203 @@ TEST_F(GpuKernelTilingTest, FusionTransposeWithReverseNotTiled) {
   CompileAndVerifyIr(std::move(hlo_module),
                      R"(
 ; CHECK-LABEL: define void @fusion
-; CHECK-NOT: tail call void @llvm.nvvm.barrier0()
+; CHECK-NOT: call void @llvm.nvvm.barrier0()
 ; CHECK: }
 )",
                      /*match_optimized_ir=*/true);
 }
 
+TEST_F(GpuKernelTilingTest, TransposedInputWithUserBitcastNotTiled) {
+  const char *const kHloString = R"(
+    HloModule TransposedInputWithUserBitcast
+
+    fused_computation {
+      param_0 = f32[20,20]{1,0} parameter(0)
+      ROOT bitcast = f32[20,20]{0,1} bitcast(param_0)
+    }
+
+    ENTRY kernel_entry {
+      parameter.0 = f32[20,20]{1,0} parameter(0)
+      ROOT fusion = f32[20,20]{0,1} fusion(parameter.0),
+        kind=kLoop, calls=fused_computation
+    })";
+
+  // Check that a call to llvm.nvvm.barrier0 is not generated.
+  auto hlo_module =
+      ParseHloString(kHloString, ConfigWithoutLayoutAssignment()).ValueOrDie();
+  CompileAndVerifyIr(std::move(hlo_module),
+                     R"(
+; CHECK-LABEL: define void @fusion
+; CHECK-NOT: call void @llvm.nvvm.barrier0()
+; CHECK: }
+)",
+                     /*match_optimized_ir=*/true);
+
+  // Check that the kernel runs correctly.
+  EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0.0}));
+}
+
+TEST_F(GpuKernelTilingTest, TransposedInputWithoutUnsafeUseTiled) {
+  const char *const kHloString = R"(
+    HloModule TwoTransposedInputs
+
+    fused_computation {
+      param_0 = f32[64,64]{1,0} parameter(0)
+      param_1 = f32[64,64]{1,0} parameter(1)
+      bitcast = f32[64,64]{0,1} bitcast(param_0)
+      copy = f32[64,64]{0,1} copy(param_1)
+      ROOT tuple = (f32[64,64]{0,1}, f32[64,64]{0,1}) tuple(bitcast, copy)
+    }
+
+    ENTRY kernel_entry {
+      parameter.0 = f32[64,64]{1,0} parameter(0)
+      parameter.1 = f32[64,64]{1,0} parameter(1)
+      ROOT fusion = (f32[64,64]{0,1}, f32[64,64]{0,1})
+        fusion(parameter.0, parameter.1),
+        kind=kLoop, calls=fused_computation
+    })";
+
+  // Check that a call to llvm.nvvm.barrier0 is generated.
+  auto hlo_module =
+      ParseHloString(kHloString, ConfigWithoutLayoutAssignment()).ValueOrDie();
+  CompileAndVerifyIr(std::move(hlo_module),
+                     R"(
+; CHECK-LABEL: define void @fusion
+; CHECK: call void @llvm.nvvm.barrier0()
+; CHECK: }
+)",
+                     /*match_optimized_ir=*/true);
+  // Check that the kernel runs correctly.
+  EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0.0}));
+}
+
+TEST_F(GpuKernelTilingTest, ColumnReductionWithPowerOf2OutputElementsUnrolled) {
+  const char *const kHloString = R"(
+  HloModule column_reduce_powerof2
+
+  reduction {
+    x = f32[] parameter(0)
+    y = f32[] parameter(1)
+    ROOT add = f32[] add(x, y)
+  }
+
+  ENTRY kernel_entry {
+    constant0 = f32[] constant(0)
+    arg1 = f16[1024,512]{1,0} parameter(0)
+    arg1_conv = f32[1024,512]{1,0} convert(arg1)
+    ROOT reduce = f32[512]{0} reduce(arg1_conv, constant0), dimensions={0}, to_apply=reduction
+  })";
+
+  // Check that two calls to llvm.nvvm.atomic are generated.
+  auto hlo_module =
+      ParseHloString(kHloString, ConfigWithoutLayoutAssignment()).ValueOrDie();
+  CompileAndVerifyIr(std::move(hlo_module),
+                     R"(
+; CHECK-LABEL: define void @fusion
+; CHECK: call float @llvm.nvvm.atomic.load.add.f32.p0f32
+; CHECK: call float @llvm.nvvm.atomic.load.add.f32.p0f32
+; CHECK-NOT: call float @llvm.nvvm.atomic.load.add.f32.p0f32
+; CHECK: }
+)",
+                     /*match_optimized_ir=*/true);
+  // Check that the kernel runs correctly.
+  EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1.0e-5, 1.0e-5}));
+}
+
+TEST_F(GpuKernelTilingTest,
+       ColumnReductionWithInputLargerThenReduceInputNotUnrolled) {
+  const char *const kHloString = R"(
+  HloModule larger_than_reduce_input_parameter
+
+  reduction22 {
+    x = f32[] parameter(0)
+    y = f32[] parameter(1)
+    ROOT add = f32[] add(x, y)
+  }
+
+  fused_computation {
+    constant0 = f32[] constant(0)
+    arg.1 = f16[1024,512]{1,0} parameter(0)
+    arg.2 = f16[1027,513]{1,0} parameter(1)
+    arg1.conv = f32[1024,512]{1,0} convert(arg.1)
+    arg2.conv = f32[1027,513]{1,0} convert(arg.2)
+    slice2 = f32[1024,512]{1,0} slice(arg2.conv), slice={[2:1026], [1:513]}
+    add2 = f32[1024,512]{1,0} add(arg1.conv, slice2)
+    ROOT reduce = f32[512]{0} reduce(add2, constant0), dimensions={0},
+      to_apply=reduction22
+  }
+
+  ENTRY kernel_entry {
+    arg1 = f16[1024,512]{1,0} parameter(0)
+    arg2 = f16[1027,513]{1,0} parameter(1)
+    ROOT fusion = f32[512]{0} fusion(arg1, arg2), kind=kInput,
+      calls=fused_computation
+  })";
+
+  // Check that one call to llvm.nvvm.atomic is generated.
+  auto hlo_module =
+      ParseHloString(kHloString, ConfigWithoutLayoutAssignment()).ValueOrDie();
+  CompileAndVerifyIr(std::move(hlo_module),
+                     R"(
+; CHECK-LABEL: define void @fusion
+; CHECK: call float @llvm.nvvm.atomic.load.add.f32.p0f32
+; CHECK-NOT: call float @llvm.nvvm.atomic.load.add.f32.p0f32
+; CHECK: }
+)",
+                     /*match_optimized_ir=*/true);
+  // Check that the kernel runs correctly.
+  EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1.0e-5, 1.0e-5}));
+}
+
+TEST_F(GpuKernelTilingTest, ColumnReductionMOFUnrolled) {
+  const char *const kHloString = R"(
+  HloModule column_reduce_powerof2_mof
+
+  reduction22 {
+    x = f32[] parameter(0)
+    y = f32[] parameter(1)
+    ROOT add = f32[] add(x, y)
+  }
+
+  fused_computation {
+    constant0 = f32[] constant(0)
+    arg.1 = f16[1024,512]{1,0} parameter(0)
+    arg.2 = f16[1024,512]{1,0} parameter(1)
+    arg1.conv = f32[1024,512]{1,0} convert(arg.1)
+    arg2.conv = f32[1024,512]{1,0} convert(arg.2)
+    reduce1 = f32[512]{0} reduce(arg1.conv, constant0), dimensions={0},
+      to_apply=reduction22
+    reduce2 = f32[512]{0} reduce(arg2.conv, constant0), dimensions={0},
+      to_apply=reduction22
+    add = f32[1024,512]{1,0} add(arg1.conv, arg2.conv)
+    ROOT tuple = (f32[512]{0}, f32[512]{0}, f32[1024,512]{1,0})
+      tuple(reduce1, reduce2, add)
+  }
+
+  ENTRY kernel_entry {
+    arg1 = f16[1024,512]{1,0} parameter(0)
+    arg2 = f16[1024,512]{1,0} parameter(1)
+    ROOT fusion = (f32[512]{0}, f32[512]{0}, f32[1024,512]{1,0})
+      fusion(arg1, arg2), kind=kInput, calls=fused_computation
+  })";
+
+  // Check that four calls to llvm.nvvm.atomic are generated.
+  auto hlo_module =
+      ParseHloString(kHloString, ConfigWithoutLayoutAssignment()).ValueOrDie();
+  CompileAndVerifyIr(std::move(hlo_module),
+                     R"(
+; CHECK-LABEL: define void @fusion
+; CHECK: call float @llvm.nvvm.atomic.load.add.f32.p0f32
+; CHECK: call float @llvm.nvvm.atomic.load.add.f32.p0f32
+; CHECK: call float @llvm.nvvm.atomic.load.add.f32.p0f32
+; CHECK: call float @llvm.nvvm.atomic.load.add.f32.p0f32
+; CHECK-NOT: call float @llvm.nvvm.atomic.load.add.f32.p0f32
+; CHECK: }
+)",
+                     /*match_optimized_ir=*/true);
+  // Check that the kernel runs correctly.
+  EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{1.0e-5, 1.0e-5}));
+}
 }  // namespace
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/tests/infeed_test.cc b/tensorflow/compiler/xla/service/gpu/tests/infeed_test.cc
index f8120a5fa00ce38644cd85c54d5ef65701be1eda..06b06a5b1ee1fb9996be3ebe326893c4160a7e29 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/infeed_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/infeed_test.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/math/math_util.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/types.h"
@@ -43,7 +42,7 @@ class InfeedTest : public ClientLibraryTestBase {
     ASSERT_IS_OK(client_->TransferToInfeed(literal));
     XlaBuilder builder(TestName());
     Infeed(&builder, literal.shape());
-    if (ShapeUtil::IsTuple(literal.shape())) {
+    if (literal.shape().IsTuple()) {
       // TODO(b/30609564): Use ComputeAndCompareLiteral instead.
       ComputeAndCompareTuple(&builder, literal, {});
     } else {
diff --git a/tensorflow/compiler/xla/service/gpu/thunk.cc b/tensorflow/compiler/xla/service/gpu/thunk.cc
index c78605cebbc671272b8df9faf0e0cc54be2f5b1c..a677617727c04811584cbaa295d164ed27273bb2 100644
--- a/tensorflow/compiler/xla/service/gpu/thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/thunk.cc
@@ -48,6 +48,8 @@ std::ostream& operator<<(std::ostream& os, Thunk::Kind kind) {
       return os << "kOutfeed";
     case Thunk::kSequential:
       return os << "kSequential";
+    case Thunk::kTriangularSolve:
+      return os << "kTriangularSolve";
     case Thunk::kTuple:
       return os << "kTuple";
     case Thunk::kWhile:
diff --git a/tensorflow/compiler/xla/service/gpu/thunk.h b/tensorflow/compiler/xla/service/gpu/thunk.h
index e68bee035a029178844282995429eaa960cc4817..bc69af897a01775d2d33d46067464b10e049f3e1 100644
--- a/tensorflow/compiler/xla/service/gpu/thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/thunk.h
@@ -56,6 +56,7 @@ class Thunk {
     kMemzero,
     kOutfeed,
     kSequential,
+    kTriangularSolve,
     kTuple,
     kWhile,
   };
diff --git a/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc b/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc
index 6b2d76764a077dc6cfa3f9ddc6e525ab330323be..25bad67bab9375559c431466571c62acd0452b01 100644
--- a/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc
+++ b/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc
@@ -14,17 +14,19 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/gpu/thunk_schedule.h"
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
 
 namespace xla {
 namespace gpu {
 
 void ThunkSchedule::AddDependenciesOnTransitiveOperands(
     const Thunk& thunk, const HloInstruction& operand,
-    const std::unordered_map<const HloInstruction*, Thunk*>& hlo_to_thunk) {
-  if (hlo_to_thunk.count(&operand)) {
+    const absl::flat_hash_map<const HloInstruction*, Thunk*>& hlo_to_thunk) {
+  if (hlo_to_thunk.contains(&operand)) {
     // If `operand` is mapped to a thunk, adds `operand` to `thunk`'s dependency
     // list if `operand` is assigned to a different stream. As an optimization,
     // we skip `operand`'s operands because `operand` depends on them already.
@@ -48,14 +50,14 @@ ThunkSchedule::ThunkSchedule(
     const std::vector<HloInstruction*>& hlo_total_order)
     : thunks_(std::move(thunks)),
       stream_assignment_(std::move(stream_assignment)) {
-  std::unordered_map<const HloInstruction*, Thunk*> hlo_to_thunk;
+  absl::flat_hash_map<const HloInstruction*, Thunk*> hlo_to_thunk;
   for (const auto& thunk : *thunks_) {
     InsertOrDie(&hlo_to_thunk, thunk->hlo_instruction(), thunk.get());
   }
 
   for (HloInstruction* hlo : hlo_total_order) {
-    if (hlo_to_thunk.count(hlo)) {
-      thunk_total_order_.push_back(FindOrDie(hlo_to_thunk, hlo));
+    if (Thunk** thunk = tensorflow::gtl::FindOrNull(hlo_to_thunk, hlo)) {
+      thunk_total_order_.push_back(*thunk);
     }
   }
 
@@ -106,7 +108,7 @@ void ThunkSchedule::RemoveRedundantDependencyEdges() {
   // redundant dependency edge.
   Array2D<int> last_dependency(stream_count, stream_count, -1);
   for (const Thunk* dst : thunk_total_order_) {
-    if (!depends_on_.count(dst)) {
+    if (!depends_on_.contains(dst)) {
       continue;
     }
 
@@ -134,7 +136,7 @@ void ThunkSchedule::RemoveRedundantDependencyEdges() {
 
 const std::list<const Thunk*>& ThunkSchedule::DependsOn(
     const Thunk* thunk) const {
-  if (depends_on_.count(thunk)) {
+  if (depends_on_.contains(thunk)) {
     return FindOrDie(depends_on_, thunk);
   } else {
     return empty_thunk_list_;
diff --git a/tensorflow/compiler/xla/service/gpu/thunk_schedule.h b/tensorflow/compiler/xla/service/gpu/thunk_schedule.h
index 43b628a1baf0e79a3197f3cfad3547991642eaed..549378debd52417252724a5d8a6f4d24f2ad0369 100644
--- a/tensorflow/compiler/xla/service/gpu/thunk_schedule.h
+++ b/tensorflow/compiler/xla/service/gpu/thunk_schedule.h
@@ -21,6 +21,8 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/compiler/xla/service/gpu/stream_assignment.h"
 #include "tensorflow/compiler/xla/service/gpu/thunk.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
@@ -54,7 +56,9 @@ class ThunkSchedule {
   // Thunks that `thunk` depends on.
   const std::list<const Thunk*>& DependsOn(const Thunk* thunk) const;
   // Whether `thunk` is depended by another thunk.
-  bool Depended(const Thunk* thunk) const { return depended_by_.count(thunk); }
+  bool Depended(const Thunk* thunk) const {
+    return depended_by_.contains(thunk);
+  }
 
   // Delegates to StreamAssignment.
   int StreamCount() const { return stream_assignment_->StreamCount(); }
@@ -75,13 +79,13 @@ class ThunkSchedule {
   // thunk.hlo_instruction().
   void AddDependenciesOnTransitiveOperands(
       const Thunk& thunk, const HloInstruction& operand,
-      const std::unordered_map<const HloInstruction*, Thunk*>& hlo_to_thunk);
+      const absl::flat_hash_map<const HloInstruction*, Thunk*>& hlo_to_thunk);
 
   std::unique_ptr<ThunkSequence> thunks_;
   std::vector<Thunk*> thunk_total_order_;
 
-  std::unordered_map<const Thunk*, std::list<const Thunk*>> depends_on_;
-  std::set<const Thunk*> depended_by_;
+  absl::flat_hash_map<const Thunk*, std::list<const Thunk*>> depends_on_;
+  absl::flat_hash_set<const Thunk*> depended_by_;
   std::list<const Thunk*> empty_thunk_list_;
 
   std::unique_ptr<StreamAssignment> stream_assignment_;
diff --git a/tensorflow/compiler/xla/service/gpu/triangular_solve_thunk.cc b/tensorflow/compiler/xla/service/gpu/triangular_solve_thunk.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5200a2af412979c7e38d95c5a9bd5bc2ab64f086
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/triangular_solve_thunk.cc
@@ -0,0 +1,149 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/triangular_solve_thunk.h"
+
+#include <string>
+
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
+#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/stream_executor/blas.h"
+#include "tensorflow/stream_executor/device_memory.h"
+
+namespace xla {
+namespace gpu {
+
+TriangularSolveThunk::TriangularSolveThunk(
+    const TriangularSolveOptions& options,
+    const BufferAllocation::Slice& a_buffer,
+    const BufferAllocation::Slice& b_buffer, PrimitiveType type,
+    int64 batch_size, int64 m, int64 n, int64 a_batch_stride,
+    int64 b_batch_stride, const HloInstruction* hlo)
+    : Thunk(Kind::kTriangularSolve, hlo),
+      uplo_(options.lower() ? se::blas::UpperLower::kLower
+                            : se::blas::UpperLower::kUpper),
+      side_(options.left_side() ? se::blas::Side::kLeft
+                                : se::blas::Side::kRight),
+      unit_diagonal_(options.unit_diagonal() ? se::blas::Diagonal::kUnit
+                                             : se::blas::Diagonal::kNonUnit),
+      a_buffer_(a_buffer),
+      b_buffer_(b_buffer),
+      type_(type),
+      batch_size_(batch_size),
+      m_(m),
+      n_(n),
+      a_batch_stride_(a_batch_stride),
+      b_batch_stride_(b_batch_stride) {
+  transpose_a_ = [&] {
+    switch (options.transpose_a()) {
+      case TriangularSolveOptions::NO_TRANSPOSE:
+        return se::blas::Transpose::kNoTranspose;
+      case TriangularSolveOptions::TRANSPOSE:
+        return se::blas::Transpose::kTranspose;
+      case TriangularSolveOptions::ADJOINT:
+        return se::blas::Transpose::kConjugateTranspose;
+      default:
+        LOG(ERROR) << "Invalid triangular solve transpose value "
+                   << options.transpose_a();
+        return se::blas::Transpose::kNoTranspose;
+    }
+  }();
+}
+
+Status TriangularSolveThunk::ExecuteOnStream(
+    const BufferAllocations& buffer_allocations, se::Stream* stream,
+    HloExecutionProfiler* profiler) {
+  VLOG(3) << "uplo=" << se::blas::UpperLowerString(uplo_)
+          << " side=" << se::blas::SideString(side_)
+          << " diagonal=" << se::blas::DiagonalString(unit_diagonal_)
+          << " batch_size=" << batch_size_ << " m=" << m_ << " n=" << n_
+          << " a_batch_stride=" << a_batch_stride_
+          << " b_batch_stride=" << b_batch_stride_;
+
+  const int lda = side_ == se::blas::Side::kLeft ? m_ : n_;
+  const int ldb = m_;
+
+  char* a_base = static_cast<char*>(
+      buffer_allocations.GetDeviceAddress(a_buffer_).opaque());
+  char* b_base = static_cast<char*>(
+      buffer_allocations.GetDeviceAddress(b_buffer_).opaque());
+  for (int64 i = 0; i < batch_size_; ++i) {
+    bool launch_ok;
+    se::DeviceMemoryBase a_data =
+        se::DeviceMemoryBase(a_base + i * a_batch_stride_, a_batch_stride_);
+    se::DeviceMemoryBase b_data =
+        se::DeviceMemoryBase(b_base + i * b_batch_stride_, b_batch_stride_);
+    switch (type_) {
+      case F32: {
+        se::DeviceMemory<float> b_data_typed(b_data);
+        launch_ok = stream
+                        ->ThenBlasTrsm(side_, uplo_, transpose_a_,
+                                       unit_diagonal_, m_, n_, /*alpha=*/1.0f,
+                                       se::DeviceMemory<float>(a_data), lda,
+                                       &b_data_typed, ldb)
+                        .ok();
+        break;
+      }
+      case F64: {
+        se::DeviceMemory<double> b_data_typed(b_data);
+        launch_ok = stream
+                        ->ThenBlasTrsm(side_, uplo_, transpose_a_,
+                                       unit_diagonal_, m_, n_, /*alpha=*/1.0,
+                                       se::DeviceMemory<double>(a_data), lda,
+                                       &b_data_typed, ldb)
+                        .ok();
+        break;
+      }
+      case C64: {
+        se::DeviceMemory<std::complex<float>> b_data_typed(b_data);
+        launch_ok =
+            stream
+                ->ThenBlasTrsm(side_, uplo_, transpose_a_, unit_diagonal_, m_,
+                               n_, /*alpha=*/1.0f,
+                               se::DeviceMemory<std::complex<float>>(a_data),
+                               lda, &b_data_typed, ldb)
+                .ok();
+        break;
+      }
+      case C128: {
+        se::DeviceMemory<std::complex<double>> b_data_typed(b_data);
+        launch_ok =
+            stream
+                ->ThenBlasTrsm(side_, uplo_, transpose_a_, unit_diagonal_, m_,
+                               n_, /*alpha=*/1.0,
+                               se::DeviceMemory<std::complex<double>>(a_data),
+                               lda, &b_data_typed, ldb)
+                .ok();
+        break;
+      }
+      default:
+        return InvalidArgument("Invalid type for triangular solve %d", type_);
+    }
+    if (!launch_ok) {
+      return InternalError("Unable to launch triangular solve for thunk %p",
+                           this);
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/triangular_solve_thunk.h b/tensorflow/compiler/xla/service/gpu/triangular_solve_thunk.h
new file mode 100644
index 0000000000000000000000000000000000000000..c947162ea32f197f808d099859eadbbc55a65ab1
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/triangular_solve_thunk.h
@@ -0,0 +1,75 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_TRIANGULAR_SOLVE_THUNK_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_TRIANGULAR_SOLVE_THUNK_H_
+
+#include "absl/types/optional.h"
+#include "tensorflow/compiler/xla/service/buffer_assignment.h"
+#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
+#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
+#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h"
+#include "tensorflow/compiler/xla/service/gpu/thunk.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/stream_executor/blas.h"
+
+namespace xla {
+namespace gpu {
+
+// This class stores everything that StreamExecutor needs to launch a triangular
+// solve (BlasTrsm). It is generated by IrEmitter.
+//
+// Thread-compatible.
+class TriangularSolveThunk : public Thunk {
+ public:
+  TriangularSolveThunk(const TriangularSolveOptions& options,
+                       const BufferAllocation::Slice& a_buffer,
+                       const BufferAllocation::Slice& b_buffer,
+                       PrimitiveType type, int64 batch_size, int64 m, int64 n,
+                       int64 a_batch_stride, int64 b_batch_stride,
+                       const HloInstruction* hlo);
+
+  TriangularSolveThunk(const TriangularSolveThunk&) = delete;
+  TriangularSolveThunk& operator=(const TriangularSolveThunk&) = delete;
+
+  Status ExecuteOnStream(const BufferAllocations& buffer_allocations,
+                         se::Stream* stream,
+                         HloExecutionProfiler* profiler) override;
+
+ private:
+  const se::blas::UpperLower uplo_;
+  const se::blas::Side side_;
+  const se::blas::Diagonal unit_diagonal_;
+  se::blas::Transpose transpose_a_;
+
+  const BufferAllocation::Slice a_buffer_;
+  const BufferAllocation::Slice b_buffer_;
+
+  const PrimitiveType type_;
+  const int64 batch_size_;
+  const int64 m_;
+  const int64 n_;
+  const int64 a_batch_stride_;
+  const int64 b_batch_stride_;
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_TRIANGULAR_SOLVE_THUNK_H_
diff --git a/tensorflow/compiler/xla/service/gpu/variadic_op_splitter.cc b/tensorflow/compiler/xla/service/gpu/variadic_op_splitter.cc
index c552c2925497f1c4808d74a615d35cdbeeba1858..bbbcc2dbb0f71d08462a1aad6d97e7fd07b2a1fb 100644
--- a/tensorflow/compiler/xla/service/gpu/variadic_op_splitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/variadic_op_splitter.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/service/gpu/xfeed_queue.h b/tensorflow/compiler/xla/service/gpu/xfeed_queue.h
index dd46ff433ba0ad6bfa3999b96845fdaebe148aca..167c038420a64d9fa29746ed3fe349620e08e6ff 100644
--- a/tensorflow/compiler/xla/service/gpu/xfeed_queue.h
+++ b/tensorflow/compiler/xla/service/gpu/xfeed_queue.h
@@ -47,6 +47,10 @@ class XfeedQueue {
   // Blocks until the queue is non-empty, then returns the buffer at the head of
   // the queue.
   BufferType BlockingGetNextDestination() {
+    for (const auto& callback : before_get_next_dest_callbacks_) {
+      callback();
+    }
+
     bool became_empty;
     BufferType current_buffer;
     {
@@ -69,6 +73,10 @@ class XfeedQueue {
   void RegisterOnEmptyCallback(std::function<void()> callback) {
     on_empty_callbacks_.push_back(std::move(callback));
   }
+  void RegisterBeforeGetNextDestinationCallback(
+      std::function<void()> callback) {
+    before_get_next_dest_callbacks_.push_back(std::move(callback));
+  }
 
  private:
   tensorflow::mutex mu_;
@@ -82,6 +90,11 @@ class XfeedQueue {
   // List of callbacks which will be called when 'enqueued_buffers_' becomes
   // empty.
   std::vector<std::function<void()>> on_empty_callbacks_;
+
+  // List of callbacks which will be called before BlockingGetNextDestination()
+  // is called. This lets you e.g. call EnqueueDestination() for each call to
+  // BlockingGetNextDestination().
+  std::vector<std::function<void()>> before_get_next_dest_callbacks_;
 };
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/heap_simulator.cc b/tensorflow/compiler/xla/service/heap_simulator.cc
index 9220865867b770eebfb1ada8f31a5d24693a4b8d..4fca981c6a59cdb91a997e6a887fd26472c1a10a 100644
--- a/tensorflow/compiler/xla/service/heap_simulator.cc
+++ b/tensorflow/compiler/xla/service/heap_simulator.cc
@@ -199,7 +199,7 @@ Status HeapSimulator::RunComputation(
 
       // If the buffer has no users and isn't an entry parameter or output, it
       // must be a dead value.
-      if (live_buffers.count(buffer) == 0) {
+      if (!live_buffers.contains(buffer)) {
         dead_buffers_to_free.push_back(buffer);
       }
     }
@@ -225,10 +225,10 @@ Status HeapSimulator::RunComputation(
       }
     }
     // Sort to get a deterministic iteration order.
-    std::sort(operand_buffers_to_free.begin(), operand_buffers_to_free.end(),
-              [](const BufferValue* x, const BufferValue* y) {
-                return x->id() < y->id();
-              });
+    absl::c_sort(operand_buffers_to_free,
+                 [](const BufferValue* x, const BufferValue* y) {
+                   return x->id() < y->id();
+                 });
 
     // Allocate buffers defined by this instruction.  This is the latest point
     // that we can allocate; right before the buffer is first used.  This must
@@ -253,7 +253,7 @@ Status HeapSimulator::RunComputation(
       bool shared = false;
       if (options_.may_reuse_operand_buffers) {
         for (const BufferValue* operand_buffer : operand_buffers_to_free) {
-          if (reused_buffers.count(operand_buffer) != 0) {
+          if (reused_buffers.contains(operand_buffer)) {
             continue;
           }
           if (buffer->instruction()->IsUserOf(operand_buffer->instruction()) &&
@@ -335,10 +335,9 @@ Status HeapSimulator::RunComputation(
     to_free.push_back(buffer);
   }
 
-  std::sort(to_free.begin(), to_free.end(),
-            [](const BufferValue* x, const BufferValue* y) {
-              return x->id() < y->id();
-            });
+  absl::c_sort(to_free, [](const BufferValue* x, const BufferValue* y) {
+    return x->id() < y->id();
+  });
   for (const BufferValue* buffer : to_free) {
     VLOG(3) << "Freeing pending: " << buffer->ToString();
     Free(buffer, root);
@@ -374,15 +373,15 @@ bool HeapSimulator::IgnoreBuffer(const BufferValue* buffer) const {
     return true;
   }
   return options_.buffers_to_assign != nullptr &&
-         options_.buffers_to_assign->count(buffer) == 0;
+         !options_.buffers_to_assign->contains(buffer);
 }
 
 // Alloc always calls the underlying heap algorithm.
 void HeapSimulator::Alloc(const BufferValue* buffer,
                           const HloInstruction* instruction) {
-  CHECK(allocated_buffers_.count(buffer) == 0)
+  CHECK(!allocated_buffers_.contains(buffer))
       << "Alloc called on allocated buffer: " << *buffer;
-  CHECK(freed_buffers_.count(buffer) == 0)
+  CHECK(!freed_buffers_.contains(buffer))
       << "Alloc called on freed buffer: " << *buffer;
 
   allocated_buffers_.insert(buffer);
@@ -411,9 +410,9 @@ void HeapSimulator::Free(const BufferValue* buffer,
     buffer = group->canonical;
   }
 
-  CHECK(allocated_buffers_.count(buffer) > 0)
+  CHECK(allocated_buffers_.contains(buffer))
       << "Free called on non-allocated buffer: " << *buffer;
-  CHECK(freed_buffers_.count(buffer) == 0)
+  CHECK(!freed_buffers_.contains(buffer))
       << "Free called on freed buffer: " << *buffer;
 
   freed_buffers_.insert(buffer);
@@ -433,11 +432,11 @@ void HeapSimulator::ShareBuffer(const BufferValue* buffer,
                                 const HloInstruction* instruction) {
   CHECK_LE(size_fn_(*buffer), size_fn_(*shared))
       << "ShareBuffer oversized buffer" << *buffer << " shared: " << *shared;
-  CHECK(allocated_buffers_.count(buffer) == 0)
+  CHECK(!allocated_buffers_.contains(buffer))
       << "ShareBuffer called on allocated buffer: " << *buffer;
-  CHECK(freed_buffers_.count(buffer) == 0)
+  CHECK(!freed_buffers_.contains(buffer))
       << "ShareBuffer called on freed buffer: " << *buffer;
-  CHECK(freed_buffers_.count(shared) == 0)
+  CHECK(!freed_buffers_.contains(shared))
       << "ShareBuffer called on freed shared buffer: " << *shared;
 
   const BufferValue* canonical = nullptr;
@@ -452,7 +451,7 @@ void HeapSimulator::ShareBuffer(const BufferValue* buffer,
   } else {
     // The 'shared' buffer doesn't have a group; it must be the canonical.  Add
     // both 'buffer' and 'shared' to a new group.
-    CHECK(allocated_buffers_.count(shared) > 0)
+    CHECK(allocated_buffers_.contains(shared))
         << "ShareBuffer called on non-allocated shared buffer: " << *shared;
     auto group = std::make_shared<SharedGroup>();
     canonical = shared;
@@ -596,7 +595,7 @@ void DecreasingSizeRunsHeap::CallAndDrainRun() {
   }
 
   // Call ops in the run sorted by decreasing size, breaking ties by buffer id.
-  std::sort(run_.begin(), run_.end(), [](const Op& a, const Op& b) {
+  absl::c_sort(run_, [](const Op& a, const Op& b) {
     if (a.size != b.size) {
       return a.size > b.size;
     }
@@ -866,23 +865,23 @@ HeapSimulator::Result GlobalDecreasingSizeBestFitHeap::Finish() {
   for (auto& entry : buffer_intervals_) {
     sorted_buffer_intervals.push_back(entry.second);
   }
-  std::sort(sorted_buffer_intervals.begin(), sorted_buffer_intervals.end(),
-            [](const BufferInterval& x, const BufferInterval& y) {
-              if (x.size != y.size) {
-                return x.size > y.size;
-              }
-              if (x.end - x.start != y.end - y.start) {
-                return x.end - x.start > y.end - y.start;
-              }
-              return x.buffer->id() < y.buffer->id();
-            });
+  absl::c_sort(sorted_buffer_intervals,
+               [](const BufferInterval& x, const BufferInterval& y) {
+                 if (x.size != y.size) {
+                   return x.size > y.size;
+                 }
+                 if (x.end - x.start != y.end - y.start) {
+                   return x.end - x.start > y.end - y.start;
+                 }
+                 return x.buffer->id() < y.buffer->id();
+               });
 
   BufferIntervalTree interval_tree(sorted_buffer_intervals.size());
   for (auto& buffer_interval : sorted_buffer_intervals) {
     auto chunks_overlapping_in_time = interval_tree.ChunksOverlappingInTime(
         buffer_interval.start, buffer_interval.end);
-    std::sort(
-        chunks_overlapping_in_time.begin(), chunks_overlapping_in_time.end(),
+    absl::c_sort(
+        chunks_overlapping_in_time,
         [](const Chunk& x, const Chunk& y) { return x.offset < y.offset; });
 
     // Find the minimum free chunk that can hold this buffer.
diff --git a/tensorflow/compiler/xla/service/heap_simulator.h b/tensorflow/compiler/xla/service/heap_simulator.h
index dbbf43082f2c1d21f5ef42f53804bf0969903a58..3e0631aeb4aa374cb5748650e1c7529e26e10b34 100644
--- a/tensorflow/compiler/xla/service/heap_simulator.h
+++ b/tensorflow/compiler/xla/service/heap_simulator.h
@@ -158,7 +158,7 @@ class HeapSimulator {
   void FillDebugTrace(HeapSimulatorTrace::Event::Kind kind,
                       const BufferValue* buffer,
                       const HloInstruction* instruction,
-                      const BufferValue* shared_with_canonical);
+                      const BufferValue* share_with_canonical);
 
   // Counterintuitive: the algorithm_ itself can be a NoFragmentationStatsHeap,
   // in which case we are calculating the same allocs/frees twice in the
diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto
index 414c63271245315f037d04924c9291a9cd5b7a77..ae9e3169fd9b7a4655ab91ffb1589b845402ba8d 100644
--- a/tensorflow/compiler/xla/service/hlo.proto
+++ b/tensorflow/compiler/xla/service/hlo.proto
@@ -34,7 +34,7 @@ import "tensorflow/compiler/xla/xla_data.proto";
 option cc_enable_arenas = true;
 
 // Serialization of HloInstruction.
-// Next ID: 58
+// Next ID: 62
 message HloInstructionProto {
   reserved 10;
   reserved "parameter_name";
@@ -82,6 +82,8 @@ message HloInstructionProto {
   // it will use a default value of 1.
   int64 feature_group_count = 50;
 
+  int64 batch_group_count = 58;
+
   // Describes the [begin, end) index range and stride for slices.
   message SliceDimensions {
     int64 start = 1;
@@ -166,13 +168,16 @@ message HloInstructionProto {
   // Cross replica op fields.
   repeated ReplicaGroup replica_groups = 49;
   int64 all_reduce_id = 45;
-  string cross_replica_sum_barrier = 46;
+  string all_reduce_barrier = 46;
 
   // Whether this Send/Recv instruction transfers data to/from the host. Only
   // present for Send and Recv instructions and their SendDone and RecvDone
   // partners.
   bool is_host_transfer = 47;
 
+  // Whether this Sort instruction should be stable.
+  bool is_stable = 60;
+
   xla.ScatterDimensionNumbers scatter_dimension_numbers = 48;
 
   // Precision configuration for the instruction. Has backend-specific meaning.
@@ -191,6 +196,12 @@ message HloInstructionProto {
   // operand.
   bool constrain_layout = 56;
   repeated xla.ShapeProto operand_shapes_with_layout = 57;
+
+  // Options for TriangularSolve
+  xla.TriangularSolveOptions triangular_solve_options = 59;
+
+  // Describes how parameters behave with regards to replicas.
+  xla.ParameterReplication parameter_replication = 61;
 }
 
 // Serialization of HloComputation.
@@ -227,6 +238,18 @@ message HloScheduleProto {
 }
 
 message HloInputOutputAliasProto {
+  enum Kind {
+    // Define a UNDEFINED_ALIAS equal to zero to get around the default-0 proto3
+    // behavior and missing has_*() APIs.
+    UNDEFINED_ALIAS = 0;
+    // An alias setup by the user as must alias. A use setting USER_ALIAS is
+    // expecting the designed output to be dropped over the given input
+    // parameter number+index.
+    USER_ALIAS = 1;
+    // An alias setup by the compiler as part of its optimizations.
+    SYSTEM_ALIAS = 2;
+  }
+
   // The following proto describes a pair of aliased an input
   // (described by parameter number and a ShapeIndex of the parameter)
   // and an output (described by a ShapeIndex of the root
@@ -247,6 +270,8 @@ message HloInputOutputAliasProto {
     int64 parameter_number = 2;
     // ShapeIndex of the parameter instruction.
     repeated int64 parameter_shape_index = 3;
+    // The kind of alias to be setup.
+    Kind kind = 4;
   }
 
   repeated AliasEntryProto entries = 1;
diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
index cf8e6594cbe5ffd28ca75dd5006e8817f1e8581c..e511f1951c5dd07ebb64fa38fd5b7f6a0e87b429 100644
--- a/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc
@@ -117,7 +117,7 @@ class BufferValueMap {
     for (const auto& pair : buffers_) {
       buffer_numbers.push_back(pair.first);
     }
-    std::sort(buffer_numbers.begin(), buffer_numbers.end());
+    absl::c_sort(buffer_numbers);
     return buffer_numbers;
   }
 
@@ -176,13 +176,12 @@ class BufferValueMap {
       const HloValue& value, std::vector<BufferNumber>* aliased_buffers) {
     // Get parameter value from an aliased_input object.
     const auto get_parameter_value =
-        [this](const std::pair<int64, ShapeIndex>& aliased_input)
+        [this](const HloInputOutputAliasConfig::Alias& aliased_input)
         -> const HloValue& {
-      int64 param_number = aliased_input.first;
-      const ShapeIndex& param_index = aliased_input.second;
       return dataflow_.GetUniqueValueAt(
-          module_->entry_computation()->parameter_instruction(param_number),
-          param_index);
+          module_->entry_computation()->parameter_instruction(
+              aliased_input.parameter_number),
+          aliased_input.parameter_index);
     };
 
     // If the value shows up in a root instruction, alias it with parameter
@@ -319,7 +318,7 @@ class BufferValueMap {
     ComputeWhileAliasedBuffers(value, &aliased_buffers);
     ComputeConditionalAliasedBuffers(value, &aliased_buffers);
     // Uniquify aliased buffers.
-    std::sort(aliased_buffers.begin(), aliased_buffers.end());
+    absl::c_sort(aliased_buffers);
     aliased_buffers.erase(
         std::unique(aliased_buffers.begin(), aliased_buffers.end()),
         aliased_buffers.end());
@@ -367,7 +366,7 @@ std::vector<const HloBuffer*> HloAliasAnalysis::ComputeBuffersAt(
   }
 
   // Sort and uniquify vector before returning.
-  std::sort(buffers.begin(), buffers.end(), HloBuffer::IdLessThan);
+  absl::c_sort(buffers, HloBuffer::IdLessThan);
   buffers.erase(std::unique(buffers.begin(), buffers.end()), buffers.end());
 
   return buffers;
@@ -430,8 +429,7 @@ Status HloAliasAnalysis::Verify() const {
   for (const auto& pair : value_to_buffer_) {
     const HloValue* value = pair.first;
     const HloBuffer& buffer = *pair.second;
-    TF_RET_CHECK(std::find(buffer.values().begin(), buffer.values().end(),
-                           value) != buffer.values().end());
+    TF_RET_CHECK(absl::c_linear_search(buffer.values(), value));
   }
 
   for (HloBuffer::Id id = 0; id < buffers_.size(); ++id) {
@@ -457,7 +455,7 @@ string HloAliasAnalysis::ToString() const {
   for (const HloComputation* computation : module_->computations()) {
     for (const HloInstruction* instruction : computation->instructions()) {
       StrAppend(&out, "    ", instruction->name(), ":\n");
-      if (ShapeUtil::IsTuple(instruction->shape())) {
+      if (instruction->shape().IsTuple()) {
         ShapeUtil::ForEachSubshape(
             instruction->shape(),
             [&out, &instruction, this](const Shape&, const ShapeIndex& index) {
@@ -515,7 +513,7 @@ StatusOr<std::unique_ptr<HloAliasAnalysis>> HloAliasAnalysis::Run(
     auto& value_set = buffer_map.GetValuesInBuffer(buffer_number);
     std::vector<const HloValue*> sorted_values(value_set.begin(),
                                                value_set.end());
-    std::sort(sorted_values.begin(), sorted_values.end(), HloValue::IdLessThan);
+    absl::c_sort(sorted_values, HloValue::IdLessThan);
     alias_analysis->buffers_.emplace_back(next_id++, sorted_values);
     for (const HloValue* value : sorted_values) {
       alias_analysis->value_to_buffer_[value] =
@@ -533,11 +531,11 @@ bool HloAliasAnalysis::HasLiveRangeInterference(
     const HloOrdering& ordering) const {
   for (const HloBuffer& buffer : buffers()) {
     CHECK(!buffer.values().empty());
-    if (ShapeUtil::IsToken(buffer.values().front()->shape())) {
+    if (buffer.values().front()->shape().IsToken()) {
       // Tokens have no on-device representation and cannot interfere.
       for (const HloValue* value : buffer.values()) {
         // If one of the values is a token, all values must be a token.
-        DCHECK(ShapeUtil::IsToken(value->shape()));
+        DCHECK(value->shape().IsToken());
       }
       continue;
     }
@@ -547,16 +545,15 @@ bool HloAliasAnalysis::HasLiveRangeInterference(
     // tie-break using value ID. The tie-break is necessary because we need a
     // strict weak order for std::sort.
     std::vector<const HloValue*> values = buffer.values();
-    std::sort(values.begin(), values.end(),
-              [&ordering](const HloValue* a, const HloValue* b) {
-                if (ordering.IsDefinedBefore(*a, *b)) {
-                  return true;
-                } else if (ordering.IsDefinedBefore(*b, *a)) {
-                  return false;
-                } else {
-                  return a->id() < b->id();
-                }
-              });
+    absl::c_sort(values, [&ordering](const HloValue* a, const HloValue* b) {
+      if (ordering.IsDefinedBefore(*a, *b)) {
+        return true;
+      } else if (ordering.IsDefinedBefore(*b, *a)) {
+        return false;
+      } else {
+        return a->id() < b->id();
+      }
+    });
 
     // Walk through the ordered vector of values. First verify that the values
     // are totally ordered with respect to 'ordering', then check that no
diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc
index 7e6150e94153cd15463725e862ce1b8593f2c991..b6dbf07959c541bceaa8eda5a0101503970ee832 100644
--- a/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc
@@ -238,13 +238,16 @@ TEST_F(HloAliasAnalysisTest, ParametersWithAliasing) {
       builder.AddInstruction(HloInstruction::CreateTuple({negate0, negate1}));
   module_->AddEntryComputation(builder.Build());
   TF_ASSERT_OK(module_->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0}));
+      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
   TF_ASSERT_OK(module_->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{1}));
+      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{1},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
 
   // Cannot alias an output twice.
   ASSERT_IS_NOT_OK(module_->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{0}));
+      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{0},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
@@ -279,13 +282,16 @@ TEST_F(HloAliasAnalysisTest, ParametersWithCrossAliasing) {
       builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1}));
   module_->AddEntryComputation(builder.Build());
   TF_ASSERT_OK(module_->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{1}));
+      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{1},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
   TF_ASSERT_OK(module_->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{0}));
+      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{0},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
 
   // Cannot alias an output twice.
   ASSERT_IS_NOT_OK(module_->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{1}));
+      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{1},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
@@ -365,9 +371,11 @@ TEST_F(HloAliasAnalysisTest, InputOutputAliasingWithWhile) {
       builder.AddInstruction(HloInstruction::CreateTuple({negate_1, negate_2}));
   module_->AddEntryComputation(builder.Build());
   TF_ASSERT_OK(module_->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0}));
+      /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
   TF_ASSERT_OK(module_->input_output_alias_config().SetUpAlias(
-      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{1}));
+      /*output_index=*/{1}, /*param_number=*/0, /*param_index=*/{1},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
 
   const HloAliasAnalysis& analysis = RunAnalysis();
 
diff --git a/tensorflow/compiler/xla/service/hlo_buffer.cc b/tensorflow/compiler/xla/service/hlo_buffer.cc
index 9c3aa0e64d119c2560f4955d0bcb492519fa52a2..32e48651b30bace4723169935d1f10dd7d7bfec3 100644
--- a/tensorflow/compiler/xla/service/hlo_buffer.cc
+++ b/tensorflow/compiler/xla/service/hlo_buffer.cc
@@ -49,7 +49,7 @@ std::vector<HloPosition> HloBuffer::ComputePositions() const {
                      value->positions().end());
   }
   // Remove duplicates and sort positions.
-  std::sort(positions.begin(), positions.end());
+  absl::c_sort(positions);
   positions.erase(std::unique(positions.begin(), positions.end()),
                   positions.end());
   return positions;
diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc
index ff122b529bdcdcc69d2245136e19101902dbf957..817e15f9ff10a9b7e1a502265c85f70fdd681dd9 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation.cc
@@ -15,8 +15,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 
-#include <stddef.h>
 #include <algorithm>
+#include <cstddef>
 #include <functional>
 #include <list>
 #include <queue>
@@ -207,14 +207,14 @@ Status HloComputation::RemoveInstructionAndUnusedOperands(
   TF_RET_CHECK(instruction->user_count() == 0);
   TF_RET_CHECK(IsRemovable(instruction))
       << "Cannot remove instruction: " << instruction->ToString();
-  std::unordered_set<HloInstruction*> removed;
+  absl::flat_hash_set<HloInstruction*> removed;
   std::queue<HloInstruction*> worklist;
   worklist.push(instruction);
   while (!worklist.empty()) {
     HloInstruction* item = worklist.front();
     worklist.pop();
 
-    if (removed.count(item) != 0 || item->user_count() != 0 ||
+    if (removed.contains(item) || item->user_count() != 0 ||
         item == root_instruction() || !IsRemovable(item) ||
         (item->HasSideEffect() && item != instruction)) {
       continue;
@@ -296,7 +296,7 @@ void ComputeComputationPostOrder(HloComputation* computation,
 }  // namespace
 
 void HloComputation::ComputeInstructionPostOrder(
-    const HloComputation::ChannelDependencyMap& channel_dependency_map,
+    const HloComputation::ChannelDependencyGroup& channel_dependency_group,
     std::vector<HloInstruction*>* post_order, HloInstruction* root,
     absl::flat_hash_map<HloInstruction*, VisitState>* visited) const {
   std::vector<HloInstruction*> dfs_stack;
@@ -320,66 +320,75 @@ void HloComputation::ComputeInstructionPostOrder(
 
     visited->insert({current, kVisiting});
 
-    // Add the operands to the stack in reverse order so the first operand is
-    // processed first. This will produce a more natural ordering and a nicer
-    // result for things like HLO stringification.
-    const auto& operands = current->operands();
-    for (int64 i = operands.size() - 1; i >= 0; --i) {
-      dfs_stack.emplace_back(operands[i]);
-    }
-
-    for (HloInstruction* op : current->control_predecessors()) {
-      dfs_stack.emplace_back(op);
-    }
-
-    // Add inputs for send->recv_done dependencies and cross-replica-sum
-    // dependencies.
-    switch (current->opcode()) {
-      case HloOpcode::kRecvDone: {
-        auto it = channel_dependency_map.find(current->channel_id());
-        if (it != channel_dependency_map.end()) {
-          for (HloInstruction* op : it->second) {
-            dfs_stack.emplace_back(op);
-          }
-        }
-        break;
+    const auto get_channel_id =
+        [](HloInstruction* inst) -> absl::optional<int64> {
+      switch (inst->opcode()) {
+        case HloOpcode::kRecvDone:
+          return inst->channel_id();
+        case HloOpcode::kAllReduce:
+          return inst->all_reduce_id();
+        default:
+          return absl::nullopt;
       }
-      case HloOpcode::kCrossReplicaSum: {
-        auto all_reduce_id = current->all_reduce_id();
-        if (all_reduce_id) {
-          auto it = channel_dependency_map.find(all_reduce_id.value());
-          if (it != channel_dependency_map.end()) {
-            for (HloInstruction* op : it->second) {
-              dfs_stack.emplace_back(op);
-            }
-          }
+    };
+
+    // When adding a predecessor to the dfs_stack, we need to also add its
+    // associated channel dependencies.
+    const auto add_dfs_stack = [&](HloInstruction* inst) {
+      auto channel_id = get_channel_id(inst);
+      if (channel_id && channel_dependency_group.count(*channel_id)) {
+        auto it = channel_dependency_group.find(*channel_id);
+        for (HloInstruction* cinst : it->second) {
+          dfs_stack.emplace_back(cinst);
         }
-        break;
+      } else {
+        dfs_stack.emplace_back(inst);
       }
-      default:
-        break;
+    };
+
+    const auto add_predecessors = [&](HloInstruction* inst) {
+      // Add the operands to the stack in reverse order so the first operand is
+      // processed first. This will produce a more natural ordering and a nicer
+      // result for things like HLO stringification.
+      const auto& operands = inst->operands();
+      for (int64 i = operands.size() - 1; i >= 0; --i) {
+        add_dfs_stack(operands[i]);
+      }
+
+      for (HloInstruction* op : inst->control_predecessors()) {
+        add_dfs_stack(op);
+      }
+    };
+
+    // If the current instruction is a channel instruction, add the dependencies
+    // from all associated instructions of the channel.
+    auto channel_id = get_channel_id(current);
+    if (channel_id && channel_dependency_group.count(*channel_id)) {
+      auto it = channel_dependency_group.find(*channel_id);
+      for (HloInstruction* cinst : it->second) {
+        add_predecessors(cinst);
+      }
+    } else {
+      add_predecessors(current);
     }
   }
 }
 
-HloComputation::ChannelDependencyMap
+HloComputation::ChannelDependencyGroup
 HloComputation::ComputeChannelDependencies() const {
-  ChannelDependencyMap channel_dependency_map;
+  ChannelDependencyGroup channel_dependency_group;
   for (const auto& instruction : instructions_) {
     switch (instruction->opcode()) {
-      case HloOpcode::kSend: {
-        channel_dependency_map[instruction->channel_id()].push_back(
+      case HloOpcode::kSend:
+      case HloOpcode::kRecvDone:
+        channel_dependency_group[instruction->channel_id()].push_back(
             instruction.get());
         break;
-      }
-      case HloOpcode::kCrossReplicaSum: {
+      case HloOpcode::kAllReduce: {
         auto all_reduce_id = instruction->all_reduce_id();
         if (all_reduce_id) {
-          auto& dependencies = channel_dependency_map[all_reduce_id.value()];
-          absl::c_copy(instruction->operands(),
-                       std::back_inserter(dependencies));
-          absl::c_copy(instruction->control_predecessors(),
-                       std::back_inserter(dependencies));
+          channel_dependency_group[all_reduce_id.value()].push_back(
+              instruction.get());
         }
         break;
       }
@@ -387,15 +396,16 @@ HloComputation::ComputeChannelDependencies() const {
         break;
     }
   }
-  return channel_dependency_map;
+  return channel_dependency_group;
 }
 
 std::vector<HloInstruction*> HloComputation::MakeInstructionPostOrder() const {
-  auto channel_dependency_map = ComputeChannelDependencies();
+  auto channel_dependency_group = ComputeChannelDependencies();
   std::vector<HloInstruction*> post_order;
   post_order.reserve(instruction_count());
   std::vector<HloInstruction*> trace_instructions;
   absl::flat_hash_map<HloInstruction*, VisitState> visited;
+  visited.reserve(instruction_count());
   for (auto& instruction : instructions_) {
     if (instruction->opcode() == HloOpcode::kTrace) {
       // Trace instructions aren't handled by the DFS visitor. Add trace
@@ -403,7 +413,7 @@ std::vector<HloInstruction*> HloComputation::MakeInstructionPostOrder() const {
       // users).
       trace_instructions.push_back(instruction.get());
     } else if (instruction->users().empty()) {
-      ComputeInstructionPostOrder(channel_dependency_map, &post_order,
+      ComputeInstructionPostOrder(channel_dependency_group, &post_order,
                                   instruction.get(), &visited);
     }
   }
@@ -530,11 +540,10 @@ HloComputation::CreateFromProto(
   HloInstruction* root = instruction_map.at(proto.root_id());
 
   // Sort the instructions in the proto id's order.
-  std::sort(instructions.begin(), instructions.end(),
-            [&](const std::unique_ptr<HloInstruction>& a,
-                const std::unique_ptr<HloInstruction>& b) {
-              return to_proto_id[a.get()] < to_proto_id[b.get()];
-            });
+  absl::c_sort(instructions, [&](const std::unique_ptr<HloInstruction>& a,
+                                 const std::unique_ptr<HloInstruction>& b) {
+    return to_proto_id[a.get()] < to_proto_id[b.get()];
+  });
 
   TF_RETURN_IF_ERROR([&]() -> Status {
     std::vector<bool> parameters_seen(parameter_count);
@@ -599,7 +608,7 @@ StatusOr<HloInstruction*> HloComputation::DeepCopyHelper(
     const std::function<
         HloInstruction*(HloInstruction* leaf, const ShapeIndex& leaf_index,
                         HloComputation* computation)>& copy_leaf) {
-  if (ShapeUtil::IsTuple(instruction->shape())) {
+  if (instruction->shape().IsTuple()) {
     std::vector<HloInstruction*> elements;
     for (int64 i = 0; i < ShapeUtil::TupleElementCount(instruction->shape());
          i++) {
@@ -616,14 +625,14 @@ StatusOr<HloInstruction*> HloComputation::DeepCopyHelper(
     }
     return AddInstruction(HloInstruction::CreateTuple(elements));
   }
-  if (ShapeUtil::IsToken(instruction->shape())) {
+  if (instruction->shape().IsToken()) {
     // Tokens have no on-device representation and cannot be copied. Pass
     // through transparently.
     return instruction;
   }
 
   // Array shape.
-  TF_RET_CHECK(ShapeUtil::IsArray(instruction->shape()));
+  TF_RET_CHECK(instruction->shape().IsArray());
   return copy_leaf(instruction, *index, this);
 }
 
@@ -693,25 +702,37 @@ bool HloComputation::operator==(const HloComputation& other) const {
   if (this == &other) {
     return true;
   }
-  std::set<std::pair<const HloInstruction*, const HloInstruction*>> visited;
-  std::function<bool(const HloInstruction*, const HloInstruction*)> eq =
-      [&visited, &eq](const HloInstruction* a, const HloInstruction* b) {
-        // If <a,b> are visited but not identical, the recursion should have
-        // been aborted. So, if <a,b> are visited at this point, they must be
-        // identical.
-        if (visited.count(std::make_pair(a, b)) > 0) {
-          return true;
-        }
-        visited.emplace(a, b);
-        return a->Identical(
-            *b, eq, [](const HloComputation* a, const HloComputation* b) {
-              return *a == *b;
-            });
-      };
-  return eq(root_instruction(), other.root_instruction());
-}
+  absl::flat_hash_set<std::pair<const HloInstruction*, const HloInstruction*>>
+      visited;
+  std::vector<std::pair<const HloInstruction*, const HloInstruction*>> worklist;
+
+  worklist.push_back({root_instruction(), other.root_instruction()});
 
-uint64 HloComputation::Hash() const { return root_instruction()->Hash(); }
+  while (!worklist.empty()) {
+    auto pair = worklist.back();
+    worklist.pop_back();
+
+    if (visited.contains(pair)) {
+      continue;
+    }
+    visited.emplace(pair);
+    // TODO(b/123082518): Avoid recursively invoking == becasue it may
+    // cause a stack overflow with deeply nested subcomputations.
+    bool identical_ignoring_operands = pair.first->Identical(
+        *pair.second,
+        [](const HloInstruction*, const HloInstruction*) { return true; },
+        [](const HloComputation* a, const HloComputation* b) {
+          return *a == *b;
+        });
+    if (!identical_ignoring_operands) {
+      return false;
+    }
+    for (size_t i = 0; i < pair.first->operands().size(); ++i) {
+      worklist.push_back({pair.first->operand(i), pair.second->operand(i)});
+    }
+  }
+  return true;
+}
 
 Status HloComputation::ReplaceWithNewInstruction(
     HloInstruction* old_instruction,
@@ -797,20 +818,19 @@ Status HloComputation::AcceptWithOperandOrder(
 template <typename HloInstructionPtr>
 Status HloComputation::AcceptOrdered(
     DfsHloVisitorBase<HloInstructionPtr>* visitor,
-    const std::vector<HloInstruction*>& order) const {
+    absl::Span<HloInstruction* const> order) const {
   VLOG(3) << "Accepting visitor with order.";
   for (HloInstruction* root : CollectUnreachableRoots()) {
-    TF_RET_CHECK(std::find(order.begin(), order.end(), root) != order.end())
-        << root->ToString();
+    TF_RET_CHECK(absl::c_linear_search(order, root)) << root->ToString();
   }
   TF_RET_CHECK(order.size() == instruction_count());
-  std::unordered_set<const HloInstruction*> visited;
+  absl::flat_hash_set<const HloInstruction*> visited;
   for (const HloInstruction* instruction : order) {
     VLOG(3) << "Visiting ordered: " << instruction->ToString();
-    TF_RET_CHECK(instruction_iterators_.count(instruction) == 1)
+    TF_RET_CHECK(instruction_iterators_.contains(instruction))
         << "Instruction " << instruction->name() << " is not in computation "
         << name();
-    TF_RET_CHECK(visited.count(instruction) == 0)
+    TF_RET_CHECK(!visited.contains(instruction))
         << "Instruction " << instruction->name()
         << " appears more than once in order";
     HloInstruction* mutable_instruction =
@@ -827,9 +847,9 @@ Status HloComputation::AcceptOrdered(
 
 // Explicit instantiations.
 template Status HloComputation::AcceptOrdered(
-    DfsHloVisitor*, const std::vector<HloInstruction*>&) const;
+    DfsHloVisitor*, absl::Span<HloInstruction* const>) const;
 template Status HloComputation::AcceptOrdered(
-    ConstDfsHloVisitor*, const std::vector<HloInstruction*>&) const;
+    ConstDfsHloVisitor*, absl::Span<HloInstruction* const>) const;
 
 Status HloComputation::Accept(
     const std::function<Status(HloInstruction*)>& visitor_func) {
@@ -846,29 +866,31 @@ Status HloComputation::Accept(
 std::unique_ptr<HloComputation> HloComputation::Clone(
     const string& suffix, HloCloneContext* context) {
   return CloneWithReplacements(
-      /*replacements=*/std::unordered_map<const HloInstruction*,
-                                          std::unique_ptr<HloInstruction>>(),
-      context, suffix);
+      /*replacements=*/absl::flat_hash_map<const HloInstruction*,
+                                           std::unique_ptr<HloInstruction>>(),
+      /*extra_parameters=*/{}, context, suffix);
 }
 
 std::unique_ptr<HloComputation> HloComputation::CloneWithReplacementPairs(
     std::pair<const HloInstruction*, std::unique_ptr<HloInstruction>> r1,
     HloCloneContext* context, const string& suffix) {
-  std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+  absl::flat_hash_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
       replacements;
   replacements.emplace(std::move(r1));
-  return CloneWithReplacements(std::move(replacements), context, suffix);
+  return CloneWithReplacements(std::move(replacements), /*extra_parameters=*/{},
+                               context, suffix);
 }
 
 std::unique_ptr<HloComputation> HloComputation::CloneWithReplacementPairs(
     std::pair<const HloInstruction*, std::unique_ptr<HloInstruction>> r1,
     std::pair<const HloInstruction*, std::unique_ptr<HloInstruction>> r2,
     HloCloneContext* context, const string& suffix) {
-  std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+  absl::flat_hash_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
       replacements;
   replacements.emplace(std::move(r1));
   replacements.emplace(std::move(r2));
-  return CloneWithReplacements(std::move(replacements), context, suffix);
+  return CloneWithReplacements(std::move(replacements), /*extra_parameters=*/{},
+                               context, suffix);
 }
 
 std::unique_ptr<HloComputation> HloComputation::CloneWithReplacementPairs(
@@ -876,17 +898,19 @@ std::unique_ptr<HloComputation> HloComputation::CloneWithReplacementPairs(
     std::pair<const HloInstruction*, std::unique_ptr<HloInstruction>> r2,
     std::pair<const HloInstruction*, std::unique_ptr<HloInstruction>> r3,
     HloCloneContext* context, const string& suffix) {
-  std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+  absl::flat_hash_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
       replacements;
   replacements.emplace(std::move(r1));
   replacements.emplace(std::move(r2));
   replacements.emplace(std::move(r3));
-  return CloneWithReplacements(std::move(replacements), context, suffix);
+  return CloneWithReplacements(std::move(replacements), /*extra_parameters=*/{},
+                               context, suffix);
 }
 
 std::unique_ptr<HloComputation> HloComputation::CloneWithReplacements(
-    std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+    absl::flat_hash_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
         replacements,
+    absl::Span<const HloInstruction* const> extra_parameters,
     HloCloneContext* context, const string& suffix) {
   std::unique_ptr<HloCloneContext> context_ptr;
   if (context == nullptr) {
@@ -952,6 +976,12 @@ std::unique_ptr<HloComputation> HloComputation::CloneWithReplacements(
   }
 
   std::vector<std::unique_ptr<HloInstruction>> instructions;
+  // First add the extra parameters to 'instructions'.
+  for (const auto& instr : extra_parameters) {
+    CHECK_EQ(instr->opcode(), HloOpcode::kParameter)
+        << "Only parameter instructions are allowed in 'extra_parameters'";
+    instructions.emplace_back(instr->Clone());
+  }
   for (auto instr : postorder) {
     std::vector<HloInstruction*> new_operands;
     for (auto operand : instr->operands()) {
diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h
index c584e4c7ca5770533f28352b0df9dadd9dbe1860..212dfa15a13185f1050103739fad8b560270d401 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.h
+++ b/tensorflow/compiler/xla/service/hlo_computation.h
@@ -20,7 +20,6 @@ limitations under the License.
 #include <list>
 #include <memory>
 #include <string>
-#include <unordered_map>
 #include <unordered_set>
 #include <utility>
 #include <vector>
@@ -264,12 +263,6 @@ class HloComputation {
   // Return whether `*this` and `other` are functionally equivalent.
   bool operator==(const HloComputation& other) const;
 
-  // Generates a hash value of an HLO computation. Hash considers
-  // information on opcode, shape, operands, and typically a root instruction.
-  // This function returns the same hash value for equivalent HLO computations,
-  // with respect to HloInstruction::Identical() method.
-  uint64 Hash() const;
-
   // Replaces old instruction with newly created instruction. Removes old
   // instruction from computation. Updates uses and root instruction.
   Status ReplaceWithNewInstruction(
@@ -307,7 +300,7 @@ class HloComputation {
   // be a topological sort of all instructions in the computation.
   template <typename HloInstructionPtr>
   Status AcceptOrdered(DfsHloVisitorBase<HloInstructionPtr>* visitor,
-                       const std::vector<HloInstruction*>& order) const;
+                       absl::Span<HloInstruction* const> order) const;
 
   // Same as Accept() above, but the visitor is given as a function.
   Status Accept(const std::function<Status(HloInstruction*)>& visitor_func);
@@ -329,11 +322,16 @@ class HloComputation {
   // that's not already in the computation, it's cloned and added to the new
   // computation.
   //
+  // 'extra_parameters' allows to specify additional parameters that should be
+  // added to the computation.
+  //
   // All relevant instructions are cloned, *including* unique_ptr in the
   // `replacements` map.
   std::unique_ptr<HloComputation> CloneWithReplacements(
-      std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+      absl::flat_hash_map<const HloInstruction*,
+                          std::unique_ptr<HloInstruction>>
           replacements,
+      absl::Span<const HloInstruction* const> extra_parameters = {},
       HloCloneContext* context = nullptr, const string& suffix = "clone");
 
   // Convenience overloads for CloneWithReplacements.  You want to do
@@ -371,13 +369,13 @@ class HloComputation {
   // channel complete).
   bool IsRemovable(const HloInstruction* instruction);
 
-  // Returns a map from channel-id to directed dependencies of the channel
-  // instructions. For send&recv pairs it means the send instruction and for
-  // cross-replica-sum the union of the dependencies for all participating
-  // instructions.
-  using ChannelDependencyMap =
+  // Returns a map from channel-id to the group of instructions associated with
+  // the channel. These instructions will be considered as a single node for
+  // dependency purposes. Send and RecvDone are in the group, and AllReduces
+  // with the same channel id are in the group.
+  using ChannelDependencyGroup =
       absl::flat_hash_map<int64, absl::InlinedVector<HloInstruction*, 1>>;
-  ChannelDependencyMap ComputeChannelDependencies() const;
+  ChannelDependencyGroup ComputeChannelDependencies() const;
 
   // Returns true if this computation has a side effect. A computation has a
   // side effect if it contains one or more instructions with a side effect.
@@ -393,6 +391,10 @@ class HloComputation {
     fusion_instruction_ = fusion_instruction;
   }
 
+  // Clear the unique ID of the computation so that it can be re-assigned, such
+  // as for the purpose of compacting the unique IDs.
+  void ClearUniqueIdInternal() { unique_id_ = -1; }
+
   // The id of this computation should be unique within the module.
   void SetUniqueId(int64 id) {
     CHECK_EQ(unique_id_, -1);
@@ -436,7 +438,7 @@ class HloComputation {
 
   enum VisitState { kVisiting, kVisited };
   void ComputeInstructionPostOrder(
-      const HloComputation::ChannelDependencyMap& channel_dependency_map,
+      const HloComputation::ChannelDependencyGroup& channel_dependency_map,
       std::vector<HloInstruction*>* post_order, HloInstruction* root,
       absl::flat_hash_map<HloInstruction*, VisitState>* visited) const;
 
diff --git a/tensorflow/compiler/xla/service/hlo_computation_test.cc b/tensorflow/compiler/xla/service/hlo_computation_test.cc
index 0361c87428f6e4c031d95492a5bc782ad388e5b5..fe37ca6b3963430c765f27aede4f506366fc5d97 100644
--- a/tensorflow/compiler/xla/service/hlo_computation_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation_test.cc
@@ -15,12 +15,18 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 
+#include <memory>
 #include <set>
+#include <vector>
 
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/service/pattern_matcher.h"
 #include "tensorflow/compiler/xla/service/pattern_matcher_gmock.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -33,6 +39,7 @@ namespace xla {
 namespace {
 
 namespace m = match;
+namespace op = xla::testing::opcode_matchers;
 using ::testing::ElementsAre;
 using ::testing::UnorderedElementsAre;
 
@@ -226,7 +233,7 @@ TEST_F(HloComputationTest, VisitWithMultipleRoots) {
         : computation_(computation) {}
 
     Status DefaultAction(HloInstruction* hlo_instruction) override {
-      EXPECT_EQ(0, visited_set_.count(hlo_instruction));
+      EXPECT_FALSE(visited_set_.contains(hlo_instruction));
       visited_set_.insert(hlo_instruction);
       last_visited_ = hlo_instruction;
       return Status::OK();
@@ -239,7 +246,7 @@ TEST_F(HloComputationTest, VisitWithMultipleRoots) {
     }
 
     HloComputation* computation_;
-    std::set<HloInstruction*> visited_set_;
+    absl::flat_hash_set<HloInstruction*> visited_set_;
     int64 finish_visit_calls_ = 0;
     HloInstruction* last_visited_ = nullptr;
   };
@@ -491,6 +498,41 @@ TEST_F(HloComputationTest, CloneWithControlDependency) {
   EXPECT_THAT(successors, ::testing::ElementsAre(cloned_add));
 }
 
+TEST_F(HloComputationTest, CloneWithReplacements) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape r0s64 = ShapeUtil::MakeShape(S64, {});
+  Shape r0s32 = ShapeUtil::MakeShape(S32, {});
+  Shape r0u32 = ShapeUtil::MakeShape(U32, {});
+  auto param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r0f32_, "p.0.lhs"));
+  auto param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, r0f32_, "p.0.rhs"));
+  auto param2 =
+      builder.AddInstruction(HloInstruction::CreateParameter(2, r0s64, "p.1"));
+  auto lt = builder.AddInstruction(HloInstruction::CreateBinary(
+      ShapeUtil::MakeShape(PRED, {}), HloOpcode::kLt, param0, param1));
+  auto module = CreateNewVerifiedModule();
+  auto computation =
+      module->AddEntryComputation(builder.Build(/*root_instruction=*/lt));
+  absl::flat_hash_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+      replacements;
+  replacements.emplace(param2,
+                       HloInstruction::CreateParameter(2, r0s32, "p.1"));
+  auto param3 = HloInstruction::CreateParameter(3, r0u32, "p.2");
+  std::vector<const HloInstruction*> extra_parameters{param3.get()};
+  auto clone = computation->CloneWithReplacements(std::move(replacements),
+                                                  extra_parameters);
+  ASSERT_EQ(clone->num_parameters(), 4);
+  EXPECT_TRUE(
+      ShapeUtil::Equal(clone->parameter_instruction(0)->shape(), r0f32_));
+  EXPECT_TRUE(
+      ShapeUtil::Equal(clone->parameter_instruction(1)->shape(), r0f32_));
+  EXPECT_TRUE(
+      ShapeUtil::Equal(clone->parameter_instruction(2)->shape(), r0s32));
+  EXPECT_TRUE(
+      ShapeUtil::Equal(clone->parameter_instruction(3)->shape(), r0u32));
+}
+
 TEST_F(HloComputationTest, Stringification) {
   const Shape s1 = ShapeUtil::MakeShape(F32, {5, 10});
   const Shape s2 = ShapeUtil::MakeShape(F32, {20, 10});
@@ -606,5 +648,57 @@ TEST_F(HloComputationTest, StringificationCanonical) {
   EXPECT_EQ(computation->ToString(options), expected_computation2);
 }
 
+std::unique_ptr<HloComputation> MakeAddNComputation(int n) {
+  auto builder = HloComputation::Builder("add_n");
+  auto result = builder.AddInstruction(HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeShape(F32, {}), "x_value"));
+  auto one = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+  for (int i = 0; i < n; ++i) {
+    result = builder.AddInstruction(HloInstruction::CreateBinary(
+        one->shape(), HloOpcode::kAdd, result, one));
+  }
+  return builder.Build();
+}
+
+TEST_F(HloComputationTest, DeepEquality) {
+  auto computation_a = MakeAddNComputation(200000);
+  auto computation_b = MakeAddNComputation(200000);
+  EXPECT_TRUE(*computation_a == *computation_b);
+
+  auto computation_c = MakeAddNComputation(199999);
+  EXPECT_FALSE(*computation_a == *computation_c);
+  EXPECT_FALSE(*computation_c == *computation_b);
+}
+
+// Tests that cross-module AllReduce instructions are ordered before all their
+// predecessors and after all their successors.
+TEST_F(HloComputationTest, InstructionPostOrderWithAllReduce) {
+  const char* const hlo_string = R"(
+HloModule Module
+
+add {
+  lhs = f32[] parameter(0)
+  rhs = f32[] parameter(1)
+  ROOT add = f32[] add(lhs, rhs)
+}
+
+ENTRY entry {
+  param = f32[128] parameter(0), sharding={maximal device=0}
+  crs0 = f32[128] all-reduce(param),
+    replica_groups={{0}}, all_reduce_id=1, barrier="", to_apply=add,
+    sharding={maximal device=0}
+  crs1 = f32[128] all-reduce(param),
+    replica_groups={{0}}, all_reduce_id=1, barrier="", to_apply=add,
+    sharding={maximal device=1}
+  add = f32[128] add(crs0, crs0), sharding={maximal device=0}
+  ROOT t = (f32[128], f32[128]) tuple(add, crs1)
+})";
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(hlo_string));
+  EXPECT_THAT(module->entry_computation()->MakeInstructionPostOrder(),
+              ElementsAre(op::Parameter(), op::AllReduce(), op::AllReduce(),
+                          op::Add(), op::Tuple()));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding.cc b/tensorflow/compiler/xla/service/hlo_constant_folding.cc
index 5e37883d3d8d5067bab873ac6b5f732e7360c5fa..e7ed858e8c5af83d08863d64a0aba162c75ed5cb 100644
--- a/tensorflow/compiler/xla/service/hlo_constant_folding.cc
+++ b/tensorflow/compiler/xla/service/hlo_constant_folding.cc
@@ -35,6 +35,34 @@ limitations under the License.
 
 namespace xla {
 
+// Checks whether instr is or transitively contains an instruction that we
+// shouldn't fold.
+//
+// Specifically, we don't fold kRng or kAfterAll instructions:
+//
+//  - kRng is already marked as side-effecting and so is skipped elsewhere, but
+//    we check for it here.  Even kRng weren't side-effecting and took an
+//    explicit seed, we *still* wouldn't want to constant-fold it, because the
+//    evaluator's handling of rng is not guaranteed to be identical to any
+//    particular backend's rng.
+//
+//  - kAfterAll needs to be skipped because a kAfterAll op with no args can
+//    currently materialize a token "out of thin air".  TODO(b/110532604):
+//    Remove this check once AfterAll requires at least one operand, in which
+//    case constant folding will be impossible.
+static bool IsOrContainsIllegalInstr(const HloInstruction* instr) {
+  if (instr->opcode() == HloOpcode::kAfterAll ||
+      instr->opcode() == HloOpcode::kRng) {
+    return true;
+  }
+  for (const HloComputation* c : instr->called_computations()) {
+    if (absl::c_any_of(c->instructions(), IsOrContainsIllegalInstr)) {
+      return true;
+    }
+  }
+  return false;
+}
+
 StatusOr<bool> HloConstantFolding::Run(HloModule* module) {
   // Limit the constant folding to 0 iterations to skip folding loops. This
   // retains the behavior from before while loop support in HloEvaluator and may
@@ -52,25 +80,24 @@ StatusOr<bool> HloConstantFolding::Run(HloModule* module) {
           computation->root_instruction() != instruction) {
         continue;
       }
-      // Skip Constant, Parameter, Tuple, AfterAll operation.
-      // Tuple constants are not directly supported by any backends, hence
-      // folding Tuple is not useful and would in fact be expanded back into
-      // kTuple by Algebraic Simplifier.
-      // TODO(b/110532604): Enable AfterAll once AfterAll requires at least one
-      // operand in which case constant folding will be impossible and this
-      // special case is not necessary.
-      if (instruction->opcode() == HloOpcode::kParameter ||
-          instruction->opcode() == HloOpcode::kConstant ||
-          instruction->opcode() == HloOpcode::kTuple ||
-          instruction->opcode() == HloOpcode::kAfterAll) {
-        continue;
-      }
 
       // Skip instructions with non-constant operands.
       if (!hlo_query::AllOperandsAreConstants(*instruction)) {
         continue;
       }
 
+      // Don't fold Constant, Parameter, and Tuple instructions.  Tuple
+      // constants are not directly supported by any backends, hence folding
+      // Tuple is not useful and would in fact be expanded back into kTuple by
+      // Algebraic Simplifier.
+      //
+      // (We do allow folding subcomputations that contain these instructions.)
+      if (instruction->opcode() == HloOpcode::kParameter ||
+          instruction->opcode() == HloOpcode::kConstant ||
+          instruction->opcode() == HloOpcode::kTuple) {
+        continue;
+      }
+
       // Broadcasts dramatically increase the size of constants, which is often
       // detrimental to performance and memory capacity, so do not fold
       // broadcasts.
@@ -79,12 +106,23 @@ StatusOr<bool> HloConstantFolding::Run(HloModule* module) {
         continue;
       }
 
+      // Check for instructions that we can't fold even if they appear inside of
+      // a subcomputation (e.g. a kCall).
+      if (IsOrContainsIllegalInstr(instruction)) {
+        continue;
+      }
+
+      // Don't constant-fold side-effecting instructions or instructions which
+      // contain side-effecting instructions.
+      if (instruction->HasSideEffect()) {
+        continue;
+      }
+
       // Don't constant fold unless it's a net positive or the output is small.
-      if (ShapeUtil::IsArray(instruction->shape())) {
+      if (instruction->shape().IsArray()) {
         int64 elements_in_removed_operands = 0;
         for (HloInstruction* operand : instruction->operands()) {
-          if (operand->user_count() == 1 &&
-              ShapeUtil::IsArray(operand->shape())) {
+          if (operand->user_count() == 1 && operand->shape().IsArray()) {
             elements_in_removed_operands +=
                 ShapeUtil::ElementsIn(operand->shape());
           }
diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
index 4f81dc94e577a63c09ae4019e5e8158252c712ce..4bdc980c9ac4fb79cde0242f407aea7057474b27 100644
--- a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
@@ -252,7 +252,7 @@ const char* const kConstantFoldLargePad = R"(
   HloModule ConstantFoldLargePad
 
   ENTRY r {
-    a = f32[1,1,1] constant(f32[1,1,1]{{{7}}})
+    a = f32[1,1,1] constant({{{7}}})
     b = f32[] constant(42)
     ROOT pad = f32[2048,2048,128] pad(a, b), padding=1024_1023x1024_1023x64_63
   })";
@@ -268,5 +268,51 @@ TEST_F(HloConstantFoldingTest, DoesNotFoldLargePad) {
               GmockMatch(m::Pad(m::Constant(), m::Constant())));
 }
 
+TEST_F(HloConstantFoldingTest, DontFoldSubcomputationContainingAfterAll) {
+  const char* const kModuleStr = R"(
+  HloModule test
+
+  Fn {
+    tok = token[] after-all()
+    ROOT root = f32[10] iota(), iota_dimension=0
+  }
+
+  ENTRY entry {
+    ROOT call = f32[10] call(), to_apply=Fn
+  })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(kModuleStr));
+  HloConstantFolding constant_folding;
+  TF_ASSERT_OK_AND_ASSIGN(bool result,
+                          RunHloPass(&constant_folding, module.get()));
+  EXPECT_FALSE(result);
+}
+
+TEST_F(HloConstantFoldingTest,
+       DontFoldSubcomputationTransitivelyContainingRng) {
+  const char* const kModuleStr = R"(
+  HloModule test
+
+  InnerFn {
+    c0 = f32[] constant(0)
+    c1 = f32[] constant(1)
+    ROOT rng = f32[10] rng(c0, c1), distribution=rng_uniform
+  }
+
+  Fn {
+    ROOT fusion = f32[10] fusion(), kind=kLoop, calls=InnerFn
+  }
+
+  ENTRY entry {
+    ROOT call = f32[10] call(), to_apply=Fn
+  })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(kModuleStr));
+  HloConstantFolding constant_folding;
+  TF_ASSERT_OK_AND_ASSIGN(bool result,
+                          RunHloPass(&constant_folding, module.get()));
+  EXPECT_FALSE(result);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
index df7d3826dbad1f264a5dc53312c062900155b0f6..6d9e01e3a77b1cdb5d9bad69bb2754e3ce3380c0 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
@@ -91,9 +91,10 @@ Status HloCostAnalysis::HandleElementwiseOp(
   auto opcode = hlo_instruction->opcode();
   // We treat transcendental operations separately since one transcendental
   // operation can correspond to several floating point ops.
-  if (opcode == HloOpcode::kExp || opcode == HloOpcode::kPower ||
-      opcode == HloOpcode::kTanh || opcode == HloOpcode::kSin ||
-      opcode == HloOpcode::kCos) {
+  if (opcode == HloOpcode::kExp || opcode == HloOpcode::kLog ||
+      opcode == HloOpcode::kPower || opcode == HloOpcode::kSqrt ||
+      opcode == HloOpcode::kRsqrt || opcode == HloOpcode::kTanh ||
+      opcode == HloOpcode::kSin || opcode == HloOpcode::kCos) {
     current_properties_[kTranscendentalsKey] = computation_count;
   } else {
     // Note: transcendental operations are considered a separate category from
@@ -237,24 +238,17 @@ Status HloCostAnalysis::HandleDomain(const HloInstruction* domain) {
 
 Status HloCostAnalysis::HandleDot(const HloInstruction* dot) {
   const Shape& lhs_shape = dot->operand(0)->shape();
-  const Shape& rhs_shape = dot->operand(1)->shape();
+  const Shape& dot_shape = dot->shape();
   const DotDimensionNumbers& dnums = dot->dot_dimension_numbers();
   // Count of elements along the reduction dimension (last dimension for the
   // rhs).
-  int64 reduction_width =
-      lhs_shape.dimensions(dnums.lhs_contracting_dimensions(0));
-  // First divide by reduction width before multiplying by rhs elements to avoid
-  // overflow.
-  int64 fma_count;
-  if (reduction_width == 0) {
-    fma_count = 0;
-  } else {
-    fma_count = (ShapeUtil::ElementsIn(lhs_shape) / reduction_width) *
-                ShapeUtil::ElementsIn(rhs_shape);
+  int64 reduction_width = 1;
+  for (auto dim : dnums.lhs_contracting_dimensions()) {
+    reduction_width *= lhs_shape.dimensions(dim);
   }
-
-  // We count an FMA operation as 2 floating point operations.
-  current_properties_[kFlopsKey] = kFmaFlops * fma_count;
+  // Each output elment requires reduction_width FMA operations.
+  current_properties_[kFlopsKey] =
+      kFmaFlops * ShapeUtil::ElementsIn(dot_shape) * reduction_width;
   return Status::OK();
 }
 
@@ -292,7 +286,7 @@ Status HloCostAnalysis::HandleReduce(const HloInstruction* reduce) {
   // does not need to be multiplied by the number of input tensors - that's
   // already "priced in" by the sub-computation doing more work.
   auto arg = reduce->operand(0);
-  auto output_shape = ShapeUtil::IsArray(reduce->shape())
+  auto output_shape = reduce->shape().IsArray()
                           ? reduce->shape()
                           : reduce->shape().tuple_shapes(0);
   int64 reduction_count =
@@ -531,7 +525,8 @@ Status HloCostAnalysis::HandleConvolution(const HloInstruction* convolution) {
   }
 
   const int64 fma_count = (input_feature / convolution->feature_group_count()) *
-                          output_feature * batch *
+                          output_feature *
+                          (batch / convolution->batch_group_count()) *
                           Product(valid_position_counts);
   current_properties_[kFlopsKey] = fma_count * kFmaFlops;
   return Status::OK();
@@ -539,7 +534,7 @@ Status HloCostAnalysis::HandleConvolution(const HloInstruction* convolution) {
 
 Status HloCostAnalysis::HandleFft(const HloInstruction* fft) {
   auto real_shape =
-      ShapeUtil::IsTuple(fft->operand(0)->shape())
+      fft->operand(0)->shape().IsTuple()
           ? ShapeUtil::GetTupleElementShape(fft->operand(0)->shape(), 0)
           : fft->operand(0)->shape();
   constexpr int kFmaPerComplexMul = 4;
@@ -552,7 +547,22 @@ Status HloCostAnalysis::HandleFft(const HloInstruction* fft) {
   return Status::OK();
 }
 
-Status HloCostAnalysis::HandleCrossReplicaSum(const HloInstruction* crs) {
+Status HloCostAnalysis::HandleTriangularSolve(const HloInstruction* hlo) {
+  float bytes_accessed = GetShapeSize(hlo->operand(0)->shape()) / 2.0f;
+  bytes_accessed += GetShapeSize(hlo->operand(1)->shape());
+  current_properties_[kBytesAccessedKey] = bytes_accessed;
+
+  const Shape& a_shape = hlo->operand(0)->shape();
+  const Shape& b_shape = hlo->operand(1)->shape();
+  // Estimate as batch * mn^2 / 2 flops.
+  int64 elems = a_shape.dimensions(a_shape.dimensions_size() - 1);
+  elems *= ShapeUtil::ElementsIn(b_shape);
+  // Each output elment requires reduction_widht FMA operations.
+  current_properties_[kFlopsKey] = kFmaFlops * elems;
+  return Status::OK();
+}
+
+Status HloCostAnalysis::HandleAllReduce(const HloInstruction* crs) {
   // We assume 2 replicas, so that each output element is the sum of two input
   // elements.
   //
@@ -561,7 +571,7 @@ Status HloCostAnalysis::HandleCrossReplicaSum(const HloInstruction* crs) {
   double flops = 0.0;
   ShapeUtil::ForEachSubshape(crs->shape(),
                              [&](const Shape& subshape, const ShapeIndex&) {
-                               if (ShapeUtil::IsArray(subshape)) {
+                               if (subshape.IsArray()) {
                                  flops += ShapeUtil::ElementsIn(subshape);
                                }
                              });
@@ -577,6 +587,10 @@ Status HloCostAnalysis::HandleCollectivePermute(const HloInstruction* /*hlo*/) {
   return Status::OK();
 }
 
+Status HloCostAnalysis::HandleReplicaId(const HloInstruction* /*hlo*/) {
+  return Status::OK();
+}
+
 Status HloCostAnalysis::HandleRng(const HloInstruction* random) {
   // TODO(b/26346211): Implement better estimates for the RNG cost, since the
   // cost changes with the implementation and the distribution. For now, assume
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
index 33983119c9b00a248c0e8dcc5815c6367192dca3..96357dec68e390251c43c2c3fc6f5a5612063fbd 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.h
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
@@ -71,9 +71,11 @@ class HloCostAnalysis : public ConstDfsHloVisitor {
   Status HandleDot(const HloInstruction* dot) override;
   Status HandleConvolution(const HloInstruction* convolution) override;
   Status HandleFft(const HloInstruction* fft) override;
-  Status HandleCrossReplicaSum(const HloInstruction* crs) override;
+  Status HandleTriangularSolve(const HloInstruction* hlo) override;
+  Status HandleAllReduce(const HloInstruction* crs) override;
   Status HandleAllToAll(const HloInstruction* hlo) override;
   Status HandleCollectivePermute(const HloInstruction* hlo) override;
+  Status HandleReplicaId(const HloInstruction* hlo) override;
   Status HandleInfeed(const HloInstruction* infeed) override;
   Status HandleOutfeed(const HloInstruction* outfeed) override;
   Status HandleRng(const HloInstruction* random) override;
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
index ff32faf298dd1f04c5b769f2a88f76a7a1e18ae7..4d42770ba784ba15fae9518b40a75d8a2f038e66 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/service.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/logging.h"
 
 #include "tensorflow/compiler/xla/statusor.h"
@@ -157,6 +158,87 @@ TEST_F(HloCostAnalysisTest, MatrixMultiply) {
             sizeof(float) * (10 * 5 + 5 * 30 + 10 * 30));
 }
 
+TEST_F(HloCostAnalysisTest, DotGeneral) {
+  XlaBuilder builder("matrix_multiply");
+  auto lhs =
+      Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {10, 5, 5}), "lhs");
+  auto rhs =
+      Parameter(&builder, 1, ShapeUtil::MakeShape(F32, {5, 5, 30}), "rhs");
+  DotDimensionNumbers dnums;
+  dnums.add_lhs_contracting_dimensions(1);
+  dnums.add_lhs_contracting_dimensions(2);
+  dnums.add_rhs_contracting_dimensions(0);
+  dnums.add_rhs_contracting_dimensions(1);
+  DotGeneral(lhs, rhs, dnums);
+
+  // Run HLO cost analysis.
+  auto hlo_module = BuildHloGraph(&builder);
+  HloCostAnalysis analysis(ShapeSize);
+  ASSERT_IS_OK(
+      hlo_module->entry_computation()->root_instruction()->Accept(&analysis));
+
+  // Check the number of computations returned from the analysis (1500 FMAs).
+  EXPECT_EQ(analysis.flop_count(), 2 * 10 * 30 * 5 * 5);
+
+  EXPECT_EQ(analysis.transcendental_count(), 0);
+
+  // Bytes accessed is sum of inputs and output.
+  EXPECT_EQ(analysis.bytes_accessed(),
+            sizeof(float) * (10 * 5 * 5 + 5 * 5 * 30 + 10 * 30));
+}
+
+TEST_F(HloCostAnalysisTest, DotGeneral2) {
+  XlaBuilder builder("matrix_multiply");
+  auto lhs =
+      Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {10, 5, 5}), "lhs");
+  auto rhs =
+      Parameter(&builder, 1, ShapeUtil::MakeShape(F32, {5, 5, 30}), "rhs");
+  DotDimensionNumbers dnums;
+  dnums.add_lhs_contracting_dimensions(1);
+  dnums.add_lhs_batch_dimensions(2);
+  dnums.add_rhs_contracting_dimensions(0);
+  dnums.add_rhs_batch_dimensions(1);
+  DotGeneral(lhs, rhs, dnums);
+
+  // Run HLO cost analysis.
+  auto hlo_module = BuildHloGraph(&builder);
+  HloCostAnalysis analysis(ShapeSize);
+  ASSERT_IS_OK(
+      hlo_module->entry_computation()->root_instruction()->Accept(&analysis));
+
+  // Check the number of computations returned from the analysis (1500 FMAs).
+  EXPECT_EQ(analysis.flop_count(), 2 * 10 * 30 * 5 * 5);
+
+  EXPECT_EQ(analysis.transcendental_count(), 0);
+
+  // Bytes accessed is sum of inputs and output.
+  EXPECT_EQ(analysis.bytes_accessed(),
+            sizeof(float) * (10 * 5 * 5 + 5 * 5 * 30 + 5 * 10 * 30));
+}
+
+TEST_F(HloCostAnalysisTest, DotGeneral3) {
+  XlaBuilder builder("matrix_multiply");
+  auto lhs = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {10, 5}), "lhs");
+  auto rhs = Parameter(&builder, 1, ShapeUtil::MakeShape(F32, {5, 30}), "rhs");
+  DotDimensionNumbers dnums;
+  DotGeneral(lhs, rhs, dnums);
+
+  // Run HLO cost analysis.
+  auto hlo_module = BuildHloGraph(&builder);
+  HloCostAnalysis analysis(ShapeSize);
+  ASSERT_IS_OK(
+      hlo_module->entry_computation()->root_instruction()->Accept(&analysis));
+
+  // Check the number of computations returned from the analysis (1500 FMAs).
+  EXPECT_EQ(analysis.flop_count(), 2 * 10 * 30 * 5 * 5);
+
+  EXPECT_EQ(analysis.transcendental_count(), 0);
+
+  // Bytes accessed is sum of inputs and output.
+  EXPECT_EQ(analysis.bytes_accessed(),
+            sizeof(float) * (10 * 5 + 5 * 30 + 5 * 5 * 10 * 30));
+}
+
 TEST_F(HloCostAnalysisTest, Map) {
   XlaBuilder builder("map");
   auto input = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {10}), "in");
@@ -529,7 +611,8 @@ TEST_F(HloCostAnalysisTest, DynamicSlice) {
   // Test the analysis on a slice.
   XlaBuilder builder("dynamic-slice");
   auto x = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {2}), "x");
-  DynamicSlice(x, ConstantR1<int32>(&builder, {1}), {1});
+  DynamicSlice(x, absl::Span<const XlaOp>({ConstantR0<int32>(&builder, 1)}),
+               {1});
   auto hlo_module = BuildHloGraph(&builder);
 
   // Run HLO cost analysis.
@@ -545,7 +628,7 @@ TEST_F(HloCostAnalysisTest, DynamicUpdateSlice) {
   XlaBuilder builder("dynamic-update-slice");
   auto x = Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {2}), "x");
   DynamicUpdateSlice(x, ConstantR1<float>(&builder, {1.0}),
-                     ConstantR1<int32>(&builder, {1}));
+                     absl::Span<const XlaOp>({ConstantR0<int32>(&builder, 1)}));
   auto hlo_module = BuildHloGraph(&builder);
 
   // Run HLO cost analysis.
diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils.cc b/tensorflow/compiler/xla/service/hlo_creation_utils.cc
index b2005d3c210d4ae7e3702cb9624c3ad98056984c..b5d9e8e7f1a703d5d914a12d5226d53821071be6 100644
--- a/tensorflow/compiler/xla/service/hlo_creation_utils.cc
+++ b/tensorflow/compiler/xla/service/hlo_creation_utils.cc
@@ -17,9 +17,15 @@ limitations under the License.
 #include "absl/algorithm/container.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
+#include "tensorflow/compiler/xla/client/lib/comparators.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_clone_context.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_module_config.h"
 #include "tensorflow/compiler/xla/service/shape_inference.h"
 #include "tensorflow/compiler/xla/util.h"
 
@@ -69,11 +75,11 @@ StatusOr<HloInstruction*> MakeConvolveHlo(
   CHECK_EQ(computation, rhs->parent());
   TF_ASSIGN_OR_RETURN(Shape convolve_shape,
                       ShapeInference::InferConvolveShape(
-                          lhs->shape(), rhs->shape(), feature_group_count,
+                          lhs->shape(), rhs->shape(), feature_group_count, 1,
                           window, dimension_numbers));
   return computation->AddInstruction(HloInstruction::CreateConvolve(
-      convolve_shape, lhs, rhs, feature_group_count, window, dimension_numbers,
-      precision_config));
+      convolve_shape, lhs, rhs, feature_group_count, 1, window,
+      dimension_numbers, precision_config));
 }
 
 StatusOr<HloInstruction*> MakeTransposeHlo(HloInstruction* operand,
@@ -105,12 +111,26 @@ StatusOr<HloInstruction*> MakeDynamicSliceHlo(
     absl::Span<const int64> slice_sizes) {
   HloComputation* computation = operand->parent();
   CHECK_EQ(computation, start_indices->parent());
+  int64 rank = start_indices->shape().dimensions(0);
+  std::vector<HloInstruction*> scalar_start_indices;
+  for (int i = 0; i < rank; ++i) {
+    // TODO(b/118437727): Update callers to provide scalars directly.
+    auto slice = computation->AddInstruction(HloInstruction::CreateSlice(
+        ShapeUtil::MakeShape(start_indices->shape().element_type(), {1}),
+        start_indices, {i}, {i + 1}, {1}));
+    scalar_start_indices.push_back(
+        computation->AddInstruction(HloInstruction::CreateReshape(
+            ShapeUtil::MakeShape(start_indices->shape().element_type(), {}),
+            slice)));
+  }
+  std::vector<Shape> scalar_start_indices_shapes(
+      rank, ShapeUtil::MakeShape(start_indices->shape().element_type(), {}));
   TF_ASSIGN_OR_RETURN(
       Shape dynamic_slice_shape,
       ShapeInference::InferDynamicSliceShape(
-          operand->shape(), start_indices->shape(), slice_sizes));
+          operand->shape(), scalar_start_indices_shapes, slice_sizes));
   return computation->AddInstruction(HloInstruction::CreateDynamicSlice(
-      dynamic_slice_shape, operand, start_indices, slice_sizes));
+      dynamic_slice_shape, operand, scalar_start_indices, slice_sizes));
 }
 
 StatusOr<HloInstruction*> MakeDynamicUpdateSliceHlo(
@@ -119,17 +139,31 @@ StatusOr<HloInstruction*> MakeDynamicUpdateSliceHlo(
   HloComputation* computation = operand->parent();
   CHECK_EQ(computation, update->parent());
   CHECK_EQ(computation, start_indices->parent());
+  int64 rank = start_indices->shape().dimensions(0);
+  std::vector<HloInstruction*> scalar_start_indices;
+  for (int i = 0; i < rank; ++i) {
+    // TODO(b/118437727): Update callers to provide scalars directly.
+    auto slice = computation->AddInstruction(HloInstruction::CreateSlice(
+        ShapeUtil::MakeShape(start_indices->shape().element_type(), {1}),
+        start_indices, {i}, {i + 1}, {1}));
+    scalar_start_indices.push_back(
+        computation->AddInstruction(HloInstruction::CreateReshape(
+            ShapeUtil::MakeShape(start_indices->shape().element_type(), {}),
+            slice)));
+  }
+  std::vector<Shape> scalar_start_indices_shapes(
+      rank, ShapeUtil::MakeShape(start_indices->shape().element_type(), {}));
   TF_ASSIGN_OR_RETURN(
       Shape dynamic_update_slice_shape,
       ShapeInference::InferDynamicUpdateSliceShape(
-          operand->shape(), update->shape(), start_indices->shape()));
+          operand->shape(), update->shape(), scalar_start_indices_shapes));
   return computation->AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
-      dynamic_update_slice_shape, operand, update, start_indices));
+      dynamic_update_slice_shape, operand, update, scalar_start_indices));
 }
 
-StatusOr<HloInstruction*> MakeBroadcastHlo(
-    HloInstruction* operand, absl::Span<const int64> broadcast_dimensions,
-    absl::Span<const int64> result_shape_bounds) {
+HloInstruction* MakeBroadcastHlo(HloInstruction* operand,
+                                 absl::Span<const int64> broadcast_dimensions,
+                                 absl::Span<const int64> result_shape_bounds) {
   HloComputation* computation = operand->parent();
   Shape broadcast_shape = ShapeUtil::MakeShape(operand->shape().element_type(),
                                                result_shape_bounds);
@@ -189,8 +223,7 @@ StatusOr<HloInstruction*> MakeMapHlo(absl::Span<HloInstruction* const> operands,
   for (const HloInstruction* operand : operands) {
     CHECK_EQ(computation, operand->parent());
     operand_shapes.push_back(&operand->shape());
-    max_operand_rank =
-        std::max(max_operand_rank, ShapeUtil::Rank(operand->shape()));
+    max_operand_rank = std::max(max_operand_rank, operand->shape().rank());
   }
   std::vector<int64> map_dims(max_operand_rank);
   std::iota(map_dims.begin(), map_dims.end(), 0);
@@ -207,7 +240,7 @@ StatusOr<HloInstruction*> MakeReduceHlo(HloInstruction* operand,
                                         HloOpcode binary_opcode,
                                         HloModule* module) {
   DCHECK_NE(nullptr, module);
-  std::vector<int64> all_dims(ShapeUtil::Rank(operand->shape()));
+  std::vector<int64> all_dims(operand->shape().rank());
   std::iota(all_dims.begin(), all_dims.end(), 0);
 
   auto scalar_shape = ShapeUtil::MakeShape(operand->shape().element_type(), {});
@@ -240,6 +273,29 @@ StatusOr<HloInstruction*> MakeSelectHlo(HloInstruction* pred,
       select_shape, HloOpcode::kSelect, pred, on_true, on_false));
 }
 
+StatusOr<HloInstruction*> MakeSortHlo(
+    const Shape& sort_shape, absl::Span<HloInstruction* const> operands,
+    int64 dimension_to_sort, bool is_stable, HloComputation::Builder* builder,
+    HloModule* module) {
+  CHECK(!operands.empty()) << "Sort Hlo requires at least one operand.";
+  HloComputation* compare_computation;
+  XlaBuilder b("Sort.Compare");
+  std::vector<PrimitiveType> operand_types(operands.size());
+  for (int64 i = 0; i < operands.size(); ++i) {
+    operand_types[i] = operands[i]->shape().element_type();
+  }
+  XlaComputation comparator = CreateScalarLtComputation(operand_types, &b);
+  TF_ASSIGN_OR_RETURN(ProgramShape program_shape, comparator.GetProgramShape());
+  HloModuleConfig config(program_shape);
+  TF_ASSIGN_OR_RETURN(auto new_module,
+                      HloModule::CreateFromProto(comparator.proto(), config));
+  HloCloneContext context(module);
+  compare_computation =
+      module->DeepCloneComputation(new_module->entry_computation(), &context);
+  return builder->AddInstruction(HloInstruction::CreateSort(
+      sort_shape, dimension_to_sort, operands, compare_computation, is_stable));
+}
+
 StatusOr<HloInstruction*> CollapseFirstNDims(HloInstruction* operand, int64 n) {
   CHECK_GT(n, 0);
 
@@ -366,9 +422,9 @@ StatusOr<HloInstruction*> PadVectorWithZeros(HloInstruction* operand,
   return MakePadHlo(operand, zero, padding_config);
 }
 
-StatusOr<HloInstruction*> BroadcastZeros(
-    HloComputation* computation, PrimitiveType element_type,
-    absl::Span<const int64> broadcast_dimensions) {
+HloInstruction* BroadcastZeros(HloComputation* computation,
+                               PrimitiveType element_type,
+                               absl::Span<const int64> broadcast_dimensions) {
   HloInstruction* zero = computation->AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::Zero(element_type)));
   return MakeBroadcastHlo(zero, /*broadcast_dimensions=*/{},
diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils.h b/tensorflow/compiler/xla/service/hlo_creation_utils.h
index 8e5ddbbd503a501bd493aec43a2ccd4db883ef0c..17b7a2da6a9da994ea2d496b549eec79278b56b5 100644
--- a/tensorflow/compiler/xla/service/hlo_creation_utils.h
+++ b/tensorflow/compiler/xla/service/hlo_creation_utils.h
@@ -82,9 +82,9 @@ StatusOr<HloInstruction*> MakeDynamicUpdateSliceHlo(
 
 // Creates a broadcast HLO instruction and adds it to the computation containing
 // `operand`.
-StatusOr<HloInstruction*> MakeBroadcastHlo(
-    HloInstruction* operand, absl::Span<const int64> broadcast_dimensions,
-    absl::Span<const int64> result_shape_bounds);
+HloInstruction* MakeBroadcastHlo(HloInstruction* operand,
+                                 absl::Span<const int64> broadcast_dimensions,
+                                 absl::Span<const int64> result_shape_bounds);
 
 // Creates a GetTupleElement HLO instruction and adds it to the computation
 // containing `operand`.
@@ -123,6 +123,15 @@ StatusOr<HloInstruction*> MakeSelectHlo(HloInstruction* pred,
                                         HloInstruction* on_true,
                                         HloInstruction* on_false);
 
+// Creates a Sort HLO instruction and adds it to the computation containing the
+// operands. All operands must be in the same computation. Also creates a
+// default compare sub-computation which sorts the first operand into ascending
+// order. 'is_stable' specifies whether the sorting should be stable.
+StatusOr<HloInstruction*> MakeSortHlo(
+    const Shape& sort_shape, absl::Span<HloInstruction* const> operands,
+    int64 dimension_to_sort, bool is_stable, HloComputation::Builder* builder,
+    HloModule* module);
+
 // Creates an R1 Constant HLO instruction of the given PrimitiveType with the
 // given values and adds it to the given computation.
 template <typename NativeT>
@@ -198,9 +207,9 @@ StatusOr<HloInstruction*> PadVectorWithZeros(HloInstruction* operand,
 // Broadcasts a zero value of type `element_type` into a tensor with element
 // type `element_type` and dimension bounds `broadcast_dimensions`.  The
 // broadcast instruction is emitted into `computation`.
-StatusOr<HloInstruction*> BroadcastZeros(
-    HloComputation* computation, PrimitiveType element_type,
-    absl::Span<const int64> broadcast_dimensions);
+HloInstruction* BroadcastZeros(HloComputation* computation,
+                               PrimitiveType element_type,
+                               absl::Span<const int64> broadcast_dimensions);
 
 // Creates a HLO computation that takes arguments of type `domain` and produces
 // a value of type `range`.
diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc b/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc
index aaa9ec60eb3c4e0159ed40b37d772e0973d306ec..6025e6a77941369f75ebaa98bdf0979669b3a03c 100644
--- a/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc
@@ -56,9 +56,9 @@ TEST_F(HloCreationUtilsTest, CollapseFirst1Dim) {
   entry_computation->set_root_instruction(first_1_dims_collapsed);
 
   HloEvaluator evaluator;
-  TF_ASSERT_OK_AND_ASSIGN(Literal result_literal,
-                          evaluator.Evaluate<Literal>(
-                              *module, {LiteralUtil::CreateR1<int32>({3, 4})}));
+  TF_ASSERT_OK_AND_ASSIGN(
+      Literal result_literal,
+      evaluator.Evaluate(*module, {LiteralUtil::CreateR1<int32>({3, 4})}));
   CHECK_EQ(result_literal, LiteralUtil::CreateR1<int32>({3, 4}));
 }
 
@@ -77,10 +77,9 @@ TEST_F(HloCreationUtilsTest, CollapseFirst2Dims) {
   HloEvaluator evaluator;
   TF_ASSERT_OK_AND_ASSIGN(
       Literal result_literal,
-      evaluator.Evaluate<Literal>(
-          *module,
-          {LiteralUtil::CreateR3<int32>(
-              {{{1, 2}, {3, 4}, {5, 6}}, {{-1, -2}, {-3, -4}, {-5, -6}}})}));
+      evaluator.Evaluate(*module, {LiteralUtil::CreateR3<int32>(
+                                      {{{1, 2}, {3, 4}, {5, 6}},
+                                       {{-1, -2}, {-3, -4}, {-5, -6}}})}));
   CHECK_EQ(result_literal,
            LiteralUtil::CreateR2<int32>(
                {{1, 2}, {3, 4}, {5, 6}, {-1, -2}, {-3, -4}, {-5, -6}}));
@@ -101,8 +100,7 @@ TEST_F(HloCreationUtilsTest, Prepend1DegenerateDim) {
   HloEvaluator evaluator;
   TF_ASSERT_OK_AND_ASSIGN(
       Literal result_literal,
-      evaluator.Evaluate<Literal>(*module,
-                                  {LiteralUtil::CreateR1<int32>({9, 10})}));
+      evaluator.Evaluate(*module, {LiteralUtil::CreateR1<int32>({9, 10})}));
   CHECK_EQ(result_literal, LiteralUtil::CreateR2<int32>({{9, 10}}));
 }
 
@@ -121,8 +119,7 @@ TEST_F(HloCreationUtilsTest, Prepend2DegenerateDims) {
   HloEvaluator evaluator;
   TF_ASSERT_OK_AND_ASSIGN(
       Literal result_literal,
-      evaluator.Evaluate<Literal>(*module,
-                                  {LiteralUtil::CreateR1<int32>({9, 10})}));
+      evaluator.Evaluate(*module, {LiteralUtil::CreateR1<int32>({9, 10})}));
   CHECK_EQ(result_literal, LiteralUtil::CreateR3<int32>({{{9, 10}}}));
 }
 
@@ -141,7 +138,7 @@ TEST_F(HloCreationUtilsTest, Prepend2DegenerateDimsToScalar) {
   HloEvaluator evaluator;
   TF_ASSERT_OK_AND_ASSIGN(
       Literal result_literal,
-      evaluator.Evaluate<Literal>(*module, {LiteralUtil::CreateR0<int32>(9)}));
+      evaluator.Evaluate(*module, {LiteralUtil::CreateR0<int32>(9)}));
   CHECK_EQ(result_literal, LiteralUtil::CreateR2<int32>({{9}}));
 }
 
@@ -160,8 +157,8 @@ TEST_F(HloCreationUtilsTest, ExpandFirstDimInto3Dims) {
   HloEvaluator evaluator;
   TF_ASSERT_OK_AND_ASSIGN(
       Literal result_literal,
-      evaluator.Evaluate<Literal>(
-          *module, {LiteralUtil::CreateR1<int32>({1, 2, 3, 4, 5, 6})}));
+      evaluator.Evaluate(*module,
+                         {LiteralUtil::CreateR1<int32>({1, 2, 3, 4, 5, 6})}));
   CHECK_EQ(result_literal,
            LiteralUtil::CreateR3<int32>({{{1, 2}}, {{3, 4}}, {{5, 6}}}));
 }
@@ -180,9 +177,9 @@ TEST_F(HloCreationUtilsTest, PadVectorWithZeros) {
   entry_computation->set_root_instruction(zero_padded_param);
 
   HloEvaluator evaluator;
-  TF_ASSERT_OK_AND_ASSIGN(Literal result_literal,
-                          evaluator.Evaluate<Literal>(
-                              *module, {LiteralUtil::CreateR1<int32>({3, 4})}));
+  TF_ASSERT_OK_AND_ASSIGN(
+      Literal result_literal,
+      evaluator.Evaluate(*module, {LiteralUtil::CreateR1<int32>({3, 4})}));
   CHECK_EQ(result_literal, LiteralUtil::CreateR1<int32>({0, 0, 0, 3, 4, 0}));
 }
 
@@ -194,15 +191,14 @@ TEST_F(HloCreationUtilsTest, BroadcastZeros_S32) {
                                              /*output_shape_dims=*/{2, 2},
                                              &param, &entry_computation);
 
-  TF_ASSERT_OK_AND_ASSIGN(
-      HloInstruction * zeros,
-      BroadcastZeros(module->entry_computation(), S32, {2, 2}));
+  HloInstruction* zeros =
+      BroadcastZeros(module->entry_computation(), S32, {2, 2});
   entry_computation->set_root_instruction(zeros);
 
   HloEvaluator evaluator;
   TF_ASSERT_OK_AND_ASSIGN(
       Literal result_literal,
-      evaluator.Evaluate<Literal>(*module, {LiteralUtil::CreateR0<int32>(0)}));
+      evaluator.Evaluate(*module, {LiteralUtil::CreateR0<int32>(0)}));
   CHECK_EQ(result_literal, LiteralUtil::CreateR2<int32>({{0, 0}, {0, 0}}));
 }
 
@@ -214,15 +210,14 @@ TEST_F(HloCreationUtilsTest, BroadcastZeros_F32) {
                                              /*output_shape_dims=*/{2, 2},
                                              &param, &entry_computation);
 
-  TF_ASSERT_OK_AND_ASSIGN(
-      HloInstruction * zeros,
-      BroadcastZeros(module->entry_computation(), F32, {2, 2}));
+  HloInstruction* zeros =
+      BroadcastZeros(module->entry_computation(), F32, {2, 2});
   entry_computation->set_root_instruction(zeros);
 
   HloEvaluator evaluator;
-  TF_ASSERT_OK_AND_ASSIGN(Literal result_literal,
-                          evaluator.Evaluate<Literal>(
-                              *module, {LiteralUtil::CreateR0<float>(0.0f)}));
+  TF_ASSERT_OK_AND_ASSIGN(
+      Literal result_literal,
+      evaluator.Evaluate(*module, {LiteralUtil::CreateR0<float>(0.0f)}));
   CHECK_EQ(result_literal,
            LiteralUtil::CreateR2<float>({{0.0f, 0.0f}, {0.0f, 0.0f}}));
 }
diff --git a/tensorflow/compiler/xla/service/hlo_cse.cc b/tensorflow/compiler/xla/service/hlo_cse.cc
index e602107cbe64320a8e8e740168cb294ec6be9667..849cac278ee379122ba1ff9fade3bf003969b8a7 100644
--- a/tensorflow/compiler/xla/service/hlo_cse.cc
+++ b/tensorflow/compiler/xla/service/hlo_cse.cc
@@ -33,7 +33,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/hash/hash.h"
 
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
index 3ed3d3c11c71dc534f193ba3ffb556b0eb0c80e4..3144a84805454488f417391f40ed6b9e9facc752 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
@@ -107,7 +107,7 @@ bool HloDataflowAnalysis::AreTransitiveUsesElementwiseOrTuple(
           return false;
         }
       }
-      if (!visited.count(user)) {
+      if (!visited.contains(user)) {
         stack.push_back(user);
       }
     }
@@ -190,7 +190,7 @@ string HloDataflowAnalysis::ToString() const {
   for (const HloComputation* computation : module_.computations()) {
     for (const HloInstruction* instruction : computation->instructions()) {
       StrAppend(&out, "    ", instruction->name(), ":\n");
-      if (ShapeUtil::IsTuple(instruction->shape())) {
+      if (instruction->shape().IsTuple()) {
         GetInstructionValueSet(instruction)
             .ForEachElement([this, &instruction, &out](
                                 const ShapeIndex& index,
@@ -256,7 +256,7 @@ bool HloDataflowAnalysis::Phi(
         input_value_ids.push_back(value->id());
       }
     }
-    std::sort(input_value_ids.begin(), input_value_ids.end());
+    absl::c_sort(input_value_ids);
     input_value_ids.erase(
         std::unique(input_value_ids.begin(), input_value_ids.end()),
         input_value_ids.end());
@@ -271,8 +271,7 @@ bool HloDataflowAnalysis::Phi(
     if (current_value_defined_here) {
       VLOG(5) << "current_value_defined_here: " << current_value->ToString();
       CHECK(current_value->is_phi());
-      auto it = std::find(input_value_ids.begin(), input_value_ids.end(),
-                          current_value->id());
+      auto it = absl::c_find(input_value_ids, current_value->id());
       if (it != input_value_ids.end()) {
         input_value_ids.erase(it);
       }
@@ -921,8 +920,7 @@ StatusOr<std::unique_ptr<HloDataflowAnalysis>> HloDataflowAnalysis::Run(
   for (auto& pair : dataflow_analysis->values_) {
     dataflow_analysis->values_vector_.push_back(&pair.second);
   }
-  std::sort(dataflow_analysis->values_vector_.begin(),
-            dataflow_analysis->values_vector_.end(), HloValue::IdLessThan);
+  absl::c_sort(dataflow_analysis->values_vector_, HloValue::IdLessThan);
 
   TF_DCHECK_OK(dataflow_analysis->Verify());
 
@@ -937,9 +935,7 @@ Status HloDataflowAnalysis::Verify() const {
   for (const HloValue* value : values()) {
     for (const HloPosition& position : value->positions()) {
       const HloValueSet& value_set = GetValueSet(position);
-      TF_RET_CHECK(std::find(value_set.values().begin(),
-                             value_set.values().end(),
-                             value) != value_set.values().end())
+      TF_RET_CHECK(absl::c_linear_search(value_set.values(), value))
           << "Value set at position " << position << " does not contain value "
           << value->ToShortString();
     }
@@ -954,9 +950,7 @@ Status HloDataflowAnalysis::Verify() const {
         const HloValueSet& value_set = pair.second;
         const HloPosition position{instruction, index};
         for (const HloValue* value : value_set.values()) {
-          TF_RET_CHECK(std::find(value->positions().begin(),
-                                 value->positions().end(),
-                                 position) != value->positions().end())
+          TF_RET_CHECK(absl::c_linear_search(value->positions(), position))
               << "Value set at position " << position
               << " unexpectedly contains value " << value->ToShortString();
         }
@@ -1041,11 +1035,10 @@ bool HloDataflowAnalysis::CanShareOperandBufferWithUser(
       // Check if one operand of kAdd fused root is kDot or kConvolution.
       auto* add = user->fused_expression_root();
       auto add_operand_it =
-          std::find_if(add->operands().begin(), add->operands().end(),
-                       [&](HloInstruction* operand) {
-                         return operand->opcode() == HloOpcode::kConvolution ||
-                                operand->opcode() == HloOpcode::kDot;
-                       });
+          absl::c_find_if(add->operands(), [&](HloInstruction* operand) {
+            return operand->opcode() == HloOpcode::kConvolution ||
+                   operand->opcode() == HloOpcode::kDot;
+          });
       if (add_operand_it == add->operands().end()) {
         return false;
       }
@@ -1100,16 +1093,15 @@ bool HloDataflowAnalysis::CanShareOperandBufferWithUser(
     // *) The root instruction of the called computation is element-wise on
     //    'operand'.
     const bool found_caller_use =
-        std::find_if(uses.begin(), uses.end(), [user](const HloUse& use) {
+        absl::c_find_if(uses, [user](const HloUse& use) {
           return use.instruction == user;
         }) != uses.end();
     auto* callee_root = user->to_apply()->root_instruction();
     const bool found_elementwise_callee_use =
-        std::find_if(
-            uses.begin(), uses.end(), [callee_root](const HloUse& use) {
-              return use.instruction == callee_root &&
-                     callee_root->IsElementwiseOnOperand(use.operand_number);
-            }) != uses.end();
+        absl::c_find_if(uses, [callee_root](const HloUse& use) {
+          return use.instruction == callee_root &&
+                 callee_root->IsElementwiseOnOperand(use.operand_number);
+        }) != uses.end();
     return uses.size() == 2 && found_caller_use && found_elementwise_callee_use;
   }
 
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
index f7a1f19a6f52befd58a405d0e406d7d0d37a8e57..768e3afb3b80698061b62c4aadef09c20e2f286c 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_creation_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -73,8 +74,8 @@ class HloDataflowAnalysisTest : public HloTestBase,
   bool InstructionsMayInterfere(const HloOrdering& ordering,
                                 const HloInstruction* a,
                                 const HloInstruction* b) {
-    EXPECT_FALSE(ShapeUtil::IsTuple(a->shape()));
-    EXPECT_FALSE(ShapeUtil::IsTuple(b->shape()));
+    EXPECT_FALSE(a->shape().IsTuple());
+    EXPECT_FALSE(b->shape().IsTuple());
     return ordering.MayInterfere(analysis_->GetValueDefinedAt(a),
                                  analysis_->GetValueDefinedAt(b), *analysis_);
   }
@@ -1882,8 +1883,8 @@ TEST_P(HloDataflowAnalysisTest, AddDependency) {
 HloModule AddDependency
 ENTRY %AddDependency (p: f32[3]) -> f32[3] {
   %p = f32[3] parameter(0)
-  %token = token[] after-all()
-  ROOT %add_dep = f32[3] add-dependency(f32[3] %p, token[] %token)
+  %token0 = token[] after-all()
+  ROOT %add_dep = f32[3] add-dependency(f32[3] %p, token[] %token0)
 }
 )";
   TF_ASSERT_OK_AND_ASSIGN(
@@ -1901,9 +1902,9 @@ ENTRY %AddDependency (p: f32[3]) -> f32[3] {
   EXPECT_FALSE(analysis->ValueIsDefinedAt(root));
 }
 
-INSTANTIATE_TEST_CASE_P(HloDataflowAnalysisInstantiation,
-                        HloDataflowAnalysisTest,
-                        ::testing::Values(false, true));
+INSTANTIATE_TEST_SUITE_P(HloDataflowAnalysisInstantiation,
+                         HloDataflowAnalysisTest,
+                         ::testing::Values(false, true));
 
 class HloDataflowAnalysisTestBase : public HloTestBase {
  protected:
@@ -1970,12 +1971,13 @@ TEST_F(DoesNotUseOperandBufferTest, FusedDynamicUpdateSlice) {
 
   // Create a DynamicUpdateSlice instruction of tuple element 1.
   auto starts = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({2})));
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(2)));
   auto update = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<float>({2.f, 2.f, 2.f})));
   auto dynamic_update_slice =
       builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
-          data_shape, gte1, update, starts));
+          data_shape, gte1, update,
+          std::initializer_list<HloInstruction*>({starts})));
   builder.AddInstruction(
       HloInstruction::CreateTuple({gte0, dynamic_update_slice}));
 
@@ -2012,12 +2014,13 @@ TEST_F(DoesNotUseOperandBufferTest, IndirectUses) {
 
   // Create a DynamicUpdateSlice instruction of tuple element 1.
   auto starts = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({2})));
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(2)));
   auto update = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<float>({2.f, 2.f, 2.f})));
   auto dynamic_update_slice =
       builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
-          data_shape, gte1, update, starts));
+          data_shape, gte1, update,
+          std::initializer_list<HloInstruction*>({starts})));
   builder.AddInstruction(
       HloInstruction::CreateTuple({gte0, dynamic_update_slice}));
 
@@ -2150,17 +2153,17 @@ TEST_F(CanShareOperandBufferWithUserTest,
 
   auto param = builder.AddInstruction(
       HloInstruction::CreateParameter(0, data_shape, "param0"));
-  auto index = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<int64>({0, 0})));
-  auto ds = builder.AddInstruction(
-      HloInstruction::CreateDynamicSlice(slice_shape, param, index, {1, 2, 2}));
+  auto zero = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int64>(0)));
+  auto ds = builder.AddInstruction(HloInstruction::CreateDynamicSlice(
+      slice_shape, param, {zero, zero}, {1, 2, 2}));
 
-  auto dus = builder.AddInstruction(
-      HloInstruction::CreateDynamicUpdateSlice(data_shape, param, ds, index));
+  auto dus = builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
+      data_shape, param, ds, {zero, zero}));
 
   BuildModule(builder.Build());
   auto fusion = computation_->CreateFusionInstruction(
-      {dus, ds, index}, HloInstruction::FusionKind::kLoop);
+      {dus, ds, zero}, HloInstruction::FusionKind::kLoop);
   RunAnalysis();
 
   EXPECT_TRUE(
@@ -2219,12 +2222,13 @@ TEST_F(CanShareOperandBufferWithUserTest, FusedDynamicUpdateSlice) {
 
   // Create a DynamicUpdateSlice instruction of tuple element 1.
   auto starts = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({2})));
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(2)));
   auto update = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<float>({2.f, 2.f, 2.f})));
   auto dynamic_update_slice =
       builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
-          data_shape, gte1, update, starts));
+          data_shape, gte1, update,
+          std::initializer_list<HloInstruction*>({starts})));
   builder.AddInstruction(
       HloInstruction::CreateTuple({gte0, dynamic_update_slice}));
 
@@ -2259,12 +2263,13 @@ TEST_F(CanShareOperandBufferWithUserTest,
 
   // Create a DynamicUpdateSlice instruction of tuple element 1.
   auto starts = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({2})));
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(2)));
   auto update = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<float>({2.f, 2.f, 2.f})));
   auto dynamic_update_slice =
       builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
-          data_shape_bf16, convert1, update, starts));
+          data_shape_bf16, convert1, update,
+          std::initializer_list<HloInstruction*>({starts})));
 
   auto convert2 = builder.AddInstruction(
       HloInstruction::CreateConvert(data_shape, dynamic_update_slice));
@@ -2290,10 +2295,13 @@ TEST_F(CanShareOperandBufferWithUserTest, DynamicUpdateSliceCanShare) {
       HloInstruction::CreateParameter(0, data_shape, "data"));
   auto update = builder.AddInstruction(
       HloInstruction::CreateParameter(1, update_shape, "update"));
-  auto starts = builder.AddInstruction(
-      HloInstruction::CreateParameter(2, starts_shape, "starts"));
+  auto start0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, starts_shape, "start0"));
+  auto start1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(3, starts_shape, "start1"));
+
   auto dus = builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
-      data_shape, data, update, starts));
+      data_shape, data, update, {start0, start1}));
 
   BuildModuleAndRunAnalysis(builder.Build());
 
@@ -2304,7 +2312,9 @@ TEST_F(CanShareOperandBufferWithUserTest, DynamicUpdateSliceCanShare) {
   EXPECT_FALSE(
       dataflow_analysis_->CanShareOperandBufferWithUser(update, {}, dus, {}));
   EXPECT_FALSE(
-      dataflow_analysis_->CanShareOperandBufferWithUser(starts, {}, dus, {}));
+      dataflow_analysis_->CanShareOperandBufferWithUser(start0, {}, dus, {}));
+  EXPECT_FALSE(
+      dataflow_analysis_->CanShareOperandBufferWithUser(start1, {}, dus, {}));
 }
 
 TEST_F(CanShareOperandBufferWithUserTest, ScatterCanShare) {
@@ -2347,14 +2357,17 @@ TEST_F(CanShareOperandBufferWithUserTest, ScatterCanShare) {
 
 TEST_F(CanShareOperandBufferWithUserTest, SortCanShare) {
   auto builder = HloComputation::Builder(TestName());
+  module_ = CreateNewVerifiedModule();
 
   Shape keys_shape = ShapeUtil::MakeShape(F32, {8});
   auto keys = builder.AddInstruction(
       HloInstruction::CreateParameter(0, keys_shape, "keys"));
-  auto sort =
-      builder.AddInstruction(HloInstruction::CreateSort(keys_shape, 0, keys));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto* sort, MakeSortHlo(keys_shape, {keys}, -1, /*is_stable=*/false,
+                              &builder, module_.get()));
 
-  BuildModuleAndRunAnalysis(builder.Build());
+  computation_ = module_->AddEntryComputation(builder.Build());
+  RunAnalysis();
 
   EXPECT_TRUE(
       dataflow_analysis_->CanShareOperandBufferWithUser(keys, {}, sort, {}));
@@ -2362,6 +2375,7 @@ TEST_F(CanShareOperandBufferWithUserTest, SortCanShare) {
 
 TEST_F(CanShareOperandBufferWithUserTest, SortCanShareWithTupleUser) {
   auto builder = HloComputation::Builder(TestName());
+  module_ = CreateNewVerifiedModule();
 
   Shape keys_shape = ShapeUtil::MakeShape(F32, {8});
   Shape values_shape = ShapeUtil::MakeShape(F32, {8});
@@ -2369,11 +2383,14 @@ TEST_F(CanShareOperandBufferWithUserTest, SortCanShareWithTupleUser) {
       HloInstruction::CreateParameter(0, keys_shape, "keys"));
   auto values = builder.AddInstruction(
       HloInstruction::CreateParameter(1, values_shape, "values"));
-  auto sort = builder.AddInstruction(HloInstruction::CreateSort(
-      ShapeUtil::MakeTupleShape({keys_shape, values_shape}), 0, keys,
-      {values}));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto* sort,
+      MakeSortHlo(ShapeUtil::MakeTupleShape({keys_shape, values_shape}),
+                  {keys, values}, 0, /*is_stable=*/false, &builder,
+                  module_.get()));
 
-  BuildModuleAndRunAnalysis(builder.Build());
+  computation_ = module_->AddEntryComputation(builder.Build());
+  RunAnalysis();
 
   // The buffer for the keys can be shared with the first tuple entry.
   EXPECT_TRUE(
diff --git a/tensorflow/compiler/xla/service/hlo_dce.cc b/tensorflow/compiler/xla/service/hlo_dce.cc
index 7d35e251ca21951036336ff1a1eb4aabc87bc5ca..a5a11f09cf4f857b992e5ede3a9dbc5a937ce722 100644
--- a/tensorflow/compiler/xla/service/hlo_dce.cc
+++ b/tensorflow/compiler/xla/service/hlo_dce.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
@@ -65,7 +66,7 @@ StatusOr<bool> HloDCE::Run(HloModule* module) {
 
   // Now DCE HloComputations.  First, collect the computations that are
   // referenced by some remaining instruction.
-  std::unordered_set<HloComputation*> live_computations;
+  absl::flat_hash_set<HloComputation*> live_computations;
   if (HloComputation* entry_computation = module->entry_computation()) {
     live_computations.insert(entry_computation);
   }
@@ -79,7 +80,7 @@ StatusOr<bool> HloDCE::Run(HloModule* module) {
 
   // Remove dead computations.
   for (auto* computation : module->MakeComputationPostOrder()) {
-    if (live_computations.count(computation) == 0) {
+    if (!live_computations.contains(computation)) {
       TF_RETURN_IF_ERROR(module->RemoveEmbeddedComputation(computation));
       changed = true;
     }
diff --git a/tensorflow/compiler/xla/service/hlo_dce_test.cc b/tensorflow/compiler/xla/service/hlo_dce_test.cc
index 1fa4259a3e42286cbc911907eea563e6ca6f8611..b5d72b386f89568cc3066b2e497be98428d1ed0c 100644
--- a/tensorflow/compiler/xla/service/hlo_dce_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_dce_test.cc
@@ -43,9 +43,7 @@ class HloDceTest : public HloTestBase {
   // Returns whether the given instruction exists in the given computation.
   bool HasInstruction(const HloComputation& computation,
                       const HloInstruction* instruction) {
-    return std::find(computation.instructions().begin(),
-                     computation.instructions().end(),
-                     instruction) != computation.instructions().end();
+    return absl::c_linear_search(computation.instructions(), instruction);
   }
 };
 
diff --git a/tensorflow/compiler/xla/service/hlo_domain_map.cc b/tensorflow/compiler/xla/service/hlo_domain_map.cc
index c6d02f9f67bb599e496d20fc2acf2e627ed54438..7cdb7f6bdf26241cda4fabbb5ccaf6e6f7de39ce 100644
--- a/tensorflow/compiler/xla/service/hlo_domain_map.cc
+++ b/tensorflow/compiler/xla/service/hlo_domain_map.cc
@@ -230,10 +230,10 @@ HloDomainMap::MakeNonDomainInstructions(
     }
   }
   // sort instructions according to instructions_order
-  std::sort(instructions.begin(), instructions.end(),
-            [&instructions_order](HloInstruction* a, HloInstruction* b) {
-              return instructions_order.at(a) < instructions_order.at(b);
-            });
+  absl::c_sort(instructions,
+               [&instructions_order](HloInstruction* a, HloInstruction* b) {
+                 return instructions_order.at(a) < instructions_order.at(b);
+               });
   return instructions;
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_domain_test.cc b/tensorflow/compiler/xla/service/hlo_domain_test.cc
index acdb42128e3d9a1fb912a466c9c2c3cbbe3d3f83..fd4fb0246d8d42ab7329c05dc23e386303cdce3c 100644
--- a/tensorflow/compiler/xla/service/hlo_domain_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_domain_test.cc
@@ -195,10 +195,10 @@ HloModule Module
 ENTRY entry {
   p0 = (f32[4]) parameter(0)
   a = f32[4] get-tuple-element(p0), index=0
-  token = token[] after-all()
-  b = (f32[4], u32[], token[]) send(a, token), channel_id=1, sharding={maximal device=0}
+  token0 = token[] after-all()
+  b = (f32[4], u32[], token[]) send(a, token0), channel_id=1, sharding={maximal device=0}
   c = token[] send-done(b), channel_id=1, sharding={maximal device=0}
-  d = (f32[4], u32[], token[]) recv(token), channel_id=2, sharding={maximal device=0}
+  d = (f32[4], u32[], token[]) recv(token0), channel_id=2, sharding={maximal device=0}
   e = (f32[4], token[]) recv-done(d), channel_id=2, sharding={maximal device=0}
   e_element = f32[4] get-tuple-element(e), index=0, sharding={maximal device=0}
   f = f32[4] add(a, e_element)
@@ -235,12 +235,12 @@ TEST_F(HloDomainTest, CheckNoDomainAddedOnPureIOComputation) {
 HloModule Module
 
 ENTRY entry {
-  token = token[] after-all(), sharding={maximal device=-1}
-  a = (f32[4], u32[], token[]) recv(token), channel_id=1, sharding={maximal device=-1}
+  token0 = token[] after-all(), sharding={maximal device=-1}
+  a = (f32[4], u32[], token[]) recv(token0), channel_id=1, sharding={maximal device=-1}
   b = (f32[4], token[]) recv-done(a), channel_id=1, sharding={maximal device=-1}
   b_element = f32[4] get-tuple-element(b), index=0, sharding={maximal device=-1}
   c = f32[4] add(b_element, b_element), sharding={maximal device=-1}
-  d = (f32[4], u32[], token[]) send(c, token), channel_id=2, sharding={maximal device=-1}
+  d = (f32[4], u32[], token[]) send(c, token0), channel_id=2, sharding={maximal device=-1}
   ROOT e = token[] send-done(d), channel_id=2, sharding={maximal device=-1}
 }
 )";
@@ -259,12 +259,12 @@ TEST_F(HloDomainTest, CheckNormalizationOnPureIOComputation) {
 HloModule Module
 
 ENTRY entry {
-  token = token[] after-all(), sharding={maximal device=0}
-  a = (f32[4], u32[], token[]) recv(token), channel_id=1, sharding={maximal device=0}
+  token0 = token[] after-all(), sharding={maximal device=0}
+  a = (f32[4], u32[], token[]) recv(token0), channel_id=1, sharding={maximal device=0}
   b = (f32[4], token[]) recv-done(a), channel_id=1, sharding={maximal device=0}
   b_element = f32[4] get-tuple-element(b), index=0, sharding={maximal device=0}
   c = f32[4] add(b_element, b_element)
-  d = (f32[4], u32[], token[]) send(c, token), channel_id=2, sharding={maximal device=0}
+  d = (f32[4], u32[], token[]) send(c, token0), channel_id=2, sharding={maximal device=0}
   ROOT e = token[] send-done(d), channel_id=2, sharding={maximal device=0}
 }
 )";
@@ -344,8 +344,8 @@ TEST_F(HloDomainTest, CheckNormalizationOnInfeedTuple) {
 HloModule Module
 
 ENTRY entry {
-  token = token[] after-all()
-  infeed = ((f32[4], f32[4]), token[]) infeed(token),
+  token0 = token[] after-all()
+  infeed = ((f32[4], f32[4]), token[]) infeed(token0),
     sharding={{maximal device=1}, {maximal device=0}, {maximal device=0}}
   infeed.data = (f32[4], f32[4]) get-tuple-element(infeed), index=0,
     sharding={{maximal device=1}, {maximal device=0}}
diff --git a/tensorflow/compiler/xla/service/hlo_element_type_converter.cc b/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
index 72006e17e7e7ec09b62e88d05b695ec9f4c49647..7d6b86056af3fc2128fe1642bbfa0ca6f9ef1da0 100644
--- a/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
+++ b/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
@@ -68,7 +68,7 @@ Shape GetConvertedTupleShape(const Shape& shape, PrimitiveType from_type,
   std::vector<Shape> new_tuple_subshapes;
   for (int64 i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) {
     Shape subshape = ShapeUtil::GetTupleElementShape(shape, i);
-    CHECK(!ShapeUtil::IsTuple(subshape));
+    CHECK(!subshape.IsTuple());
     if (subshape.element_type() == from_type) {
       subshape = ShapeUtil::ChangeElementType(subshape, to_type);
     }
@@ -92,7 +92,7 @@ HloInstruction* ConvertTupleElements(HloInstruction* hlo,
     HloInstruction* element = computation->AddInstruction(
         HloInstruction::CreateGetTupleElement(ele_shape, hlo, i));
     const Shape& to_ele_shape = ShapeUtil::GetTupleElementShape(to_shape, i);
-    CHECK(!ShapeUtil::IsTuple(ele_shape));
+    CHECK(!ele_shape.IsTuple());
     if (ele_shape.element_type() != to_ele_shape.element_type()) {
       element = computation->AddInstruction(
           HloInstruction::CreateConvert(to_ele_shape, element));
@@ -127,6 +127,7 @@ StatusOr<bool> HloElementTypeConverter::Run(HloModule* module) {
       // These are ops where it does not make sense to convert them.
       if (opcode == HloOpcode::kParameter || opcode == HloOpcode::kConstant ||
           opcode == HloOpcode::kTuple || opcode == HloOpcode::kConvert ||
+          opcode == HloOpcode::kBitcastConvert ||
           opcode == HloOpcode::kGetTupleElement ||
           opcode == HloOpcode::kInfeed || opcode == HloOpcode::kOutfeed) {
         continue;
@@ -141,12 +142,11 @@ StatusOr<bool> HloElementTypeConverter::Run(HloModule* module) {
       // These are ops with embedded computations where it suffices to convert
       // the embedded computations instead of converting the ops themselves.
       if (opcode == HloOpcode::kWhile || opcode == HloOpcode::kCall ||
-          opcode == HloOpcode::kCrossReplicaSum ||
-          opcode == HloOpcode::kFusion || opcode == HloOpcode::kMap ||
-          opcode == HloOpcode::kReduce || opcode == HloOpcode::kReduceWindow ||
-          opcode == HloOpcode::kScatter ||
+          opcode == HloOpcode::kAllReduce || opcode == HloOpcode::kFusion ||
+          opcode == HloOpcode::kMap || opcode == HloOpcode::kReduce ||
+          opcode == HloOpcode::kReduceWindow || opcode == HloOpcode::kScatter ||
           opcode == HloOpcode::kSelectAndScatter ||
-          opcode == HloOpcode::kConditional) {
+          opcode == HloOpcode::kSort || opcode == HloOpcode::kConditional) {
         continue;
       }
       TF_RET_CHECK(hlo->called_computations().empty()) << hlo->ToString();
@@ -191,7 +191,7 @@ StatusOr<bool> HloElementTypeConverter::Run(HloModule* module) {
         TF_RETURN_IF_ERROR(new_hlo->CopyAllControlDepsFrom(hlo));
 
         new_hlo = ToElementType(new_hlo, eliminate_type_);
-      } else if (ShapeUtil::IsTuple(hlo->shape())) {
+      } else if (hlo->shape().IsTuple()) {
         Shape old_shape = hlo->shape();
         Shape new_shape = GetConvertedTupleShape(hlo->shape(), eliminate_type_,
                                                  replace_with_type_);
diff --git a/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc b/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc
index c170e36c73ad2bef830e528de3ec72d38683d888..4171f738620dbf545e5883b8c26169fae4b93643 100644
--- a/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_element_type_converter_test.cc
@@ -28,15 +28,7 @@ using ::testing::Eq;
 using ::testing::Not;
 using ::testing::ResultOf;
 
-class HloElementTypeConverterTest : public HloTestBase {
- public:
-  std::unique_ptr<HloModule> CreateModuleFromHloString(
-      const string& hlo_string) {
-    return HloRunner::CreateModuleFromString(hlo_string,
-                                             GetDebugOptionsForTest())
-        .ValueOrDie();
-  }
-};
+using HloElementTypeConverterTest = HloTestBase;
 
 TEST_F(HloElementTypeConverterTest, CustomCallsNotConverted) {
   const string& hlo_string = R"(
@@ -47,7 +39,7 @@ TEST_F(HloElementTypeConverterTest, CustomCallsNotConverted) {
            custom_call_target="foo"
     }
   )";
-  auto module = CreateModuleFromHloString(hlo_string);
+  auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
   HloElementTypeConverter type_converter(BF16, F32);
   TF_ASSERT_OK_AND_ASSIGN(bool converted, type_converter.Run(module.get()));
   EXPECT_FALSE(converted);
@@ -57,13 +49,13 @@ TEST_F(HloElementTypeConverterTest, InfeedsOutfeedsNotConverted) {
   const string& hlo_string = R"(
     HloModule InfeedOutfeed
     ENTRY RoundTrip16MiBR1.v2 {
-      token = token[] after-all()
-      infeed = (bf16[4]{0}, token[]) infeed(token)
+      token0 = token[] after-all()
+      infeed = (bf16[4]{0}, token[]) infeed(token0)
       ROOT infeed.data = bf16[4]{0} get-tuple-element(infeed), index=0
-      outfeed = token[] outfeed(infeed.data, token)
+      outfeed = token[] outfeed(infeed.data, token0)
     }
   )";
-  auto module = CreateModuleFromHloString(hlo_string);
+  auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
   HloElementTypeConverter type_converter(BF16, F32);
   TF_ASSERT_OK_AND_ASSIGN(bool converted, type_converter.Run(module.get()));
   EXPECT_FALSE(converted);
@@ -73,17 +65,16 @@ TEST_F(HloElementTypeConverterTest, OperationsInNestedTuplesConverted) {
   const string& hlo_string = R"(
     HloModule NestedTuples
     ENTRY NestedTuples.v5 {
-      constant.4 = bf16[] constant(42)
       constant.2 = f32[2]{0} constant({1, 2})
-      constant.3 = bf16[] constant(42)
-      add = bf16[] add(constant.2, constant.3)
-      tuple = (f32[2]{0}, bf16[]) tuple(constant.2, add)
+      constant.3 = bf16[2]{0} constant({42, 42})
+      add = bf16[2]{0} add(constant.2, constant.3)
+      tuple = (f32[2]{0}, bf16[2]{0}) tuple(constant.2, add)
       constant.5 = bf16[2]{0} constant({22, 44})
-      ROOT tuple.1 = ((f32[2]{0}, bf16[]), bf16[2]{0}) tuple(tuple, constant.5)
+      ROOT tuple.1 = ((f32[2]{0}, bf16[2]{0}), bf16[2]{0}) tuple(tuple, constant.5)
     }
   )";
 
-  auto module = CreateModuleFromHloString(hlo_string);
+  auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
   HloElementTypeConverter type_converter(BF16, F32);
   TF_ASSERT_OK_AND_ASSIGN(bool converted, type_converter.Run(module.get()));
   EXPECT_TRUE(converted);
@@ -96,13 +87,13 @@ TEST_F(HloElementTypeConverterTest, BatchNormGradBF16Converted) {
   const string& hlo_string = R"(
     HloModule BatchNormGrad
     ENTRY BatchNormGrad.v6 {
-      constant.4 = bf16[2,2,2,1]{3,2,1,0} constant(bf16[2,2,2,1] { { /*i0=0*/ 
+      constant.4 = bf16[2,2,2,1]{3,2,1,0} constant({ { /*i0=0*/
       { /*i1=0*/ {0}, {0} }, { /*i1=1*/ {0}, {0} } }, { /*i0=1*/ { /*i1=0*/ {0},
       {0} }, { /*i1=1*/ {0}, {0} } } })
       constant.5 = bf16[2]{0} constant({1, 1})
       constant.6 = bf16[2]{0} constant({0, 0})
       constant.7 = bf16[2]{0} constant({1, 1})
-      constant.8 = bf16[2,2,2,1]{3,2,1,0} constant(bf16[2,2,2,1] { { /*i0=0*/
+      constant.8 = bf16[2,2,2,1]{3,2,1,0} constant({ { /*i0=0*/
       { /*i1=0*/ {1}, {2} }, { /*i1=1*/ {3}, {4} } }, { /*i0=1*/ { /*i1=0*/
       {5}, {6} }, { /*i1=1*/ {7}, {8} } } })
       ROOT batch-norm-grad = (bf16[2,2,2,1]{3,2,1,0}, bf16[2]{0}, bf16[2]{0})
@@ -111,7 +102,7 @@ TEST_F(HloElementTypeConverterTest, BatchNormGradBF16Converted) {
     }
   )";
 
-  auto module = CreateModuleFromHloString(hlo_string);
+  auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
   HloElementTypeConverter type_converter(BF16, F32);
   TF_ASSERT_OK_AND_ASSIGN(bool converted, type_converter.Run(module.get()));
   EXPECT_TRUE(converted);
@@ -135,7 +126,7 @@ ENTRY main {
   ROOT rng = bf16[1,1000,20]{2,1,0} rng(constant.3, constant.4), distribution=rng_uniform
 }
   )";
-  auto module = CreateModuleFromHloString(hlo_string);
+  auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
   HloElementTypeConverter type_converter(BF16, F32);
   TF_ASSERT_OK_AND_ASSIGN(bool converted, type_converter.Run(module.get()));
   EXPECT_TRUE(converted);
@@ -161,7 +152,7 @@ ENTRY main {
   ROOT rng1 = bf16[1,1000,20]{2,1,0} rng(constant.3, constant.4), control-predecessors={%rng0}, distribution=rng_uniform
 }
   )";
-  auto module = CreateModuleFromHloString(hlo_string);
+  auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
 
   HloElementTypeConverter type_converter(BF16, F32);
   TF_ASSERT_OK_AND_ASSIGN(bool converted, type_converter.Run(module.get()));
@@ -185,5 +176,19 @@ ENTRY main {
   EXPECT_THAT(rng1->control_predecessors(), ElementsAre(rng0));
 }
 
+TEST_F(HloElementTypeConverterTest, BitcastConvertIsUnmodified) {
+  const string& hlo_string = R"(
+  HloModule test
+
+  ENTRY test {
+    p = bf16[] parameter(0)
+    ROOT c = u16[] bitcast-convert(p)
+  })";
+  auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
+  HloElementTypeConverter converter(BF16, F32);
+  TF_ASSERT_OK_AND_ASSIGN(bool converted, RunHloPass(&converter, module.get()));
+  EXPECT_FALSE(converted);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index 3a7652a8dc856b23c8988c4676916c8199e78860..4d6487700b24cfd3b89aece58e5ad6d7bb43a800 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -18,9 +18,9 @@ limitations under the License.
 #include <cmath>
 #include <cstdlib>
 #include <functional>
+#include <iterator>
 #include <string>
 #include <type_traits>
-#include <utility>
 #include <vector>
 
 #include "absl/algorithm/container.h"
@@ -29,10 +29,11 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/index_util.h"
 #include "tensorflow/compiler/xla/layout_util.h"
-#include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
+#include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h"
+#include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -135,8 +136,44 @@ StatusOr<Literal> Compare<complex64>(const Shape& shape, HloOpcode opcode,
   return std::move(result);
 }
 
+template <>
+StatusOr<Literal> Compare<complex128>(const Shape& shape, HloOpcode opcode,
+                                      LiteralSlice lhs_literal,
+                                      LiteralSlice rhs_literal) {
+  std::function<bool(complex128, complex128)> compare_op;
+  switch (opcode) {
+    case HloOpcode::kEq:
+      compare_op = [](complex128 lhs_el, complex128 rhs_el) {
+        return lhs_el == rhs_el;
+      };
+      break;
+    case HloOpcode::kNe:
+      compare_op = [](complex128 lhs_el, complex128 rhs_el) {
+        return lhs_el != rhs_el;
+      };
+      break;
+    default:
+      LOG(FATAL) << "unhandled HLO opcode for conversion to Comparison: "
+                 << HloOpcodeString(opcode);
+  }
+
+  Literal result(shape);
+  TF_RETURN_IF_ERROR(
+      result.Populate<bool>([&](absl::Span<const int64> multi_index) {
+        return compare_op(lhs_literal.Get<complex128>(multi_index),
+                          rhs_literal.Get<complex128>(multi_index));
+      }));
+
+  return std::move(result);
+}
+
 }  // namespace
 
+// Note that unsupported types by the typed visitor does not necessarily imply
+// the non-typed HloEvaluator (parent evaluator) would not support them either
+// in the type-agnostic handler. For e.g., HandleGetTupleElement in the parent
+// type-agnostic evaluator will be able to accept Tuple primitive type, whereas
+// HloEvaluatorTypedVisitor cannot.
 HloEvaluator::HloEvaluator(int64 max_loop_iterations)
     : max_loop_iterations_(max_loop_iterations) {
   typed_visitors_[PRED] =
@@ -144,22 +181,14 @@ HloEvaluator::HloEvaluator(int64 max_loop_iterations)
   typed_visitors_[U8] =
       absl::make_unique<HloEvaluatorTypedVisitor<uint8>>(this);
   typed_visitors_[U16] =
-      absl::make_unique<FunctionVisitor>([](HloInstruction*) {
-        return Unimplemented(
-            "HloEvaluator::HloEvaluatorTypedVisitor: unhandled primitive type: "
-            "U16.");
-      });
+      absl::make_unique<HloEvaluatorTypedVisitor<uint16>>(this);
   typed_visitors_[U32] =
       absl::make_unique<HloEvaluatorTypedVisitor<uint32>>(this);
   typed_visitors_[U64] =
       absl::make_unique<HloEvaluatorTypedVisitor<uint64>>(this);
   typed_visitors_[S8] = absl::make_unique<HloEvaluatorTypedVisitor<int8>>(this);
   typed_visitors_[S16] =
-      absl::make_unique<FunctionVisitor>([](HloInstruction*) {
-        return Unimplemented(
-            "HloEvaluator::HloEvaluatorTypedVisitor: unhandled primitive type: "
-            "S16.");
-      });
+      absl::make_unique<HloEvaluatorTypedVisitor<int16>>(this);
   typed_visitors_[S32] =
       absl::make_unique<HloEvaluatorTypedVisitor<int32>>(this);
   typed_visitors_[S64] =
@@ -172,6 +201,8 @@ HloEvaluator::HloEvaluator(int64 max_loop_iterations)
       absl::make_unique<HloEvaluatorTypedVisitor<double>>(this);
   typed_visitors_[C64] =
       absl::make_unique<HloEvaluatorTypedVisitor<complex64>>(this);
+  typed_visitors_[C128] =
+      absl::make_unique<HloEvaluatorTypedVisitor<complex128>>(this);
 
   // Most of the evaluator computations we use don't support BF16 (e.g.,
   // std::ceil, std::tanh). To make evaluator work with BF16, we set all
@@ -197,65 +228,30 @@ HloEvaluator::HloEvaluator(int64 max_loop_iterations)
       });
 }
 
-template <typename LiteralPtr>
-StatusOr<Literal> HloEvaluator::Evaluate(
-    const HloModule& module, absl::Span<const LiteralPtr> arg_literals) {
-  XLA_VLOG_LINES(2, "HloEvaluator::Evaluate module:\n" + module.ToString());
-
-  evaluated_.clear();
-  arg_literals_.clear();
-  for (const auto& literal_ptr : arg_literals) {
-    arg_literals_.push_back(&*literal_ptr);
-  }
-
-  TF_RETURN_IF_ERROR(module.entry_computation()->Accept(this));
-
-  return GetEvaluatedLiteralFor(module.entry_computation()->root_instruction())
-      .Clone();
-}
-
-template <>
-StatusOr<Literal> HloEvaluator::Evaluate<Literal>(
-    const HloModule& module, absl::Span<const Literal> arg_literals) {
-  std::vector<const Literal*> arg_literal_ptrs;
-  for (const auto& literal_ptr : arg_literals) {
-    arg_literal_ptrs.push_back(&literal_ptr);
-  }
-  return Evaluate<const Literal*>(module, arg_literal_ptrs);
-}
-
-template <typename LiteralPtr>
 StatusOr<Literal> HloEvaluator::Evaluate(
     const HloComputation& computation,
-    absl::Span<const LiteralPtr> arg_literals) {
+    absl::Span<const Literal* const> arg_literals) {
   CHECK(computation.parent() != nullptr);
   XLA_VLOG_LINES(
       2, "HloEvaluator::Evaluate computation:\n" + computation.ToString());
 
-  evaluated_.clear();
-  arg_literals_.clear();
-  for (const auto& literal_ptr : arg_literals) {
-    arg_literals_.push_back(&*literal_ptr);
+  if (arg_literals.size() != computation.num_parameters()) {
+    return InvalidArgument(
+        "Expected %d argument%s, but got %d.", computation.num_parameters(),
+        computation.num_parameters() == 1 ? "" : "s", arg_literals.size());
   }
-
-  TF_RETURN_IF_ERROR(computation.Accept(this));
-  return GetEvaluatedLiteralFor(computation.root_instruction()).Clone();
-}
-
-template <>
-StatusOr<Literal> HloEvaluator::Evaluate<Literal>(
-    const HloComputation& computation, absl::Span<const Literal> arg_literals) {
-  std::vector<const Literal*> arg_literal_ptrs;
-  for (const auto& literal_ptr : arg_literals) {
-    arg_literal_ptrs.push_back(&literal_ptr);
+  for (int64 i = 0; i < arg_literals.size(); ++i) {
+    const auto& computation_shape =
+        computation.parameter_instruction(i)->shape();
+    const auto& arg_shape = arg_literals[i]->shape();
+    if (!ShapeUtil::Equal(computation_shape, arg_shape)) {
+      return InvalidArgument(
+          "Shape mismatch at parameter %d. Computation expected %s, but arg "
+          "was %s.",
+          i, ShapeUtil::HumanStringWithLayout(computation_shape),
+          ShapeUtil::HumanString(arg_shape));
+    }
   }
-  return Evaluate<const Literal*>(computation, arg_literal_ptrs);
-}
-
-template <typename LiteralPtr>
-StatusOr<Literal> HloEvaluator::Evaluate(
-    HloInstruction* instruction, absl::Span<const LiteralPtr> arg_literals) {
-  TF_RET_CHECK(hlo_query::AllOperandsAreParametersOrConstants(*instruction));
 
   evaluated_.clear();
   arg_literals_.clear();
@@ -263,33 +259,20 @@ StatusOr<Literal> HloEvaluator::Evaluate(
     arg_literals_.push_back(&*literal_ptr);
   }
 
-  // Evaluate operands of Parameter type against the input literals which
-  // caches the evaluated literal results.
-  for (const auto operand : instruction->operands()) {
-    if (operand->opcode() == HloOpcode::kParameter) {
-      const Literal* input_literal = arg_literals_[operand->parameter_number()];
-      VLOG(2) << "Parameter operand evaluated to: "
-              << input_literal->ToString();
-      TF_RET_CHECK(ShapeUtil::Equal(operand->shape(), input_literal->shape()));
-
-      evaluated_[operand] = input_literal->Clone();
-    }
+  // Re-seed RNG, either from the configuration's seed or a monotonic
+  // per-evaluator seed (which prevents two evaluators from returning the same
+  // random sequence).
+  if (computation.parent()->config().seed()) {
+    seed_ = computation.parent()->config().seed();
+  } else {
+    // Start global_seed at a (true) random value.
+    static std::atomic<uint64> global_seed{std::random_device()()};
+    seed_ = global_seed.fetch_add(1);
   }
+  engine_.seed(seed_);
 
-  TF_RETURN_IF_ERROR(Preprocess(instruction));
-  TF_RETURN_IF_ERROR(instruction->Visit(this));
-  TF_RETURN_IF_ERROR(Postprocess(instruction));
-  return GetEvaluatedLiteralFor(instruction).Clone();
-}
-
-template <>
-StatusOr<Literal> HloEvaluator::Evaluate<Literal>(
-    HloInstruction* instruction, absl::Span<const Literal> arg_literals) {
-  std::vector<const Literal*> arg_literal_ptrs;
-  for (const auto& literal : arg_literals) {
-    arg_literal_ptrs.push_back(&literal);
-  }
-  return Evaluate<const Literal*>(instruction, arg_literal_ptrs);
+  TF_RETURN_IF_ERROR(computation.Accept(this));
+  return GetEvaluatedLiteralFor(computation.root_instruction()).Clone();
 }
 
 StatusOr<Literal> HloEvaluator::Evaluate(HloInstruction* instruction) {
@@ -407,16 +390,45 @@ Status HloEvaluator::HandleBitcast(HloInstruction* bitcast) {
   return Status::OK();
 }
 
+Status HloEvaluator::HandleGetDimensionSize(
+    HloInstruction* get_dimension_size) {
+  HloInstruction* operand = get_dimension_size->mutable_operand(0);
+  int64 dim = get_dimension_size->dimension();
+  if (dynamic_dimension_inference_ == nullptr) {
+    return InvalidArgument(
+        "Evaluator cannot evaluate get_dimension_size without "
+        "set_dynamic_dimension_inference.");
+  }
+  HloInstruction* dynamic_size =
+      dynamic_dimension_inference_->GetDynamicSize(operand, {}, dim);
+  if (dynamic_size != nullptr) {
+    evaluated_[get_dimension_size] =
+        GetEvaluatedLiteralFor(dynamic_size).Clone();
+    return Status::OK();
+  }
+
+  const Shape& shape = get_dimension_size->operand(0)->shape();
+  Literal output(ShapeUtil::MakeShape(U32, {}));
+  output.PopulateWithValue(
+      static_cast<uint32>(shape.dimensions(get_dimension_size->dimension())));
+  evaluated_[get_dimension_size] = std::move(output);
+  return Status::OK();
+}
+
 Status HloEvaluator::HandleParameter(HloInstruction* parameter) {
+  // Nothing to do other than sanity checks. Parameters' values are stored in
+  // arg_literals_.
   CHECK_LT(parameter->parameter_number(), arg_literals_.size());
+
+#ifndef NDEBUG
   const Literal* input_literal = arg_literals_[parameter->parameter_number()];
   VLOG(2) << "Parameter evaluated to: " << input_literal->ToString();
   DCHECK(ShapeUtil::Equal(parameter->shape(), input_literal->shape()))
       << "parameter shape is: " << ShapeUtil::HumanString(parameter->shape())
       << ", but input literal shape is: "
       << ShapeUtil::HumanString(input_literal->shape());
+#endif
 
-  evaluated_[parameter] = input_literal->Clone();
   return Status::OK();
 }
 
@@ -441,8 +453,8 @@ Status HloEvaluator::HandleConcatenate(HloInstruction* concatenate) {
   // The result concatenate dimension is going to be the sum of all
   // concatenate dimensions of the operands taking part of the operation.
   const Shape& reference_shape = operands[0]->shape();
-  CHECK(ShapeUtil::IsArray(reference_shape));
-  const int64 rank = ShapeUtil::Rank(reference_shape);
+  CHECK(reference_shape.IsArray());
+  const int64 rank = reference_shape.rank();
   const int64 concat_dim = concatenate->dimensions()[0];
   CHECK_GE(concat_dim, 0);
   CHECK_LT(concat_dim, rank);
@@ -452,7 +464,7 @@ Status HloEvaluator::HandleConcatenate(HloInstruction* concatenate) {
 
   for (int64 i = 1; i < operands.size(); ++i) {
     const Shape& operand_shape = operands[i]->shape();
-    CHECK(ShapeUtil::IsArray(operand_shape));
+    CHECK(operand_shape.IsArray());
     // Accumulate the concat dimension from all tensors taking part to the
     // operation.
     concat_dimensions[concat_dim] +=
@@ -479,15 +491,52 @@ Status HloEvaluator::HandleConcatenate(HloInstruction* concatenate) {
 
 Status HloEvaluator::HandleIsFinite(HloInstruction* is_finite) {
   auto operand = is_finite->operand(0);
-  if (!ShapeUtil::ElementIsFloating(operand->shape())) {
-    return InvalidArgument(
-        "expected element type in shape to be float for IsFinite op, got: %s",
-        PrimitiveType_Name(operand->shape().element_type()));
-  }
+  auto elem_ty = operand->shape().element_type();
+  switch (elem_ty) {
+    case PRED:
+    case TUPLE:
+    case OPAQUE:
+    case TOKEN:
+    case S8:
+    case S16:
+    case S32:
+    case S64:
+    case U8:
+    case U16:
+    case U32:
+    case U64:
+    case C64:
+    case C128:
+    // Explicitly enumerate all types in this switch so that when we add a new
+    // type, we'll get a compile error here.
+    case PRIMITIVE_TYPE_INVALID:
+    case PrimitiveType_INT_MIN_SENTINEL_DO_NOT_USE_:
+    case PrimitiveType_INT_MAX_SENTINEL_DO_NOT_USE_:
+      return InvalidArgument(
+          "expected element type in shape to be floating point, but "
+          "got: %s",
+          PrimitiveType_Name(elem_ty));
 
-  switch (operand->shape().element_type()) {
-    case F16:
-      return Unimplemented("unhandled primitive type: F16.");
+    case F16: {
+      auto result_or = ElementWiseUnaryOpImpl<bool, Eigen::half>(
+          is_finite,
+          [](Eigen::half elem_operand) {
+            return std::isfinite(static_cast<float>(elem_operand));
+          },
+          GetEvaluatedLiteralFor(operand));
+      TF_ASSIGN_OR_RETURN(evaluated_[is_finite], std::move(result_or));
+      break;
+    }
+    case BF16: {
+      auto result_or = ElementWiseUnaryOpImpl<bool, bfloat16>(
+          is_finite,
+          [](bfloat16 elem_operand) {
+            return std::isfinite(static_cast<float>(elem_operand));
+          },
+          GetEvaluatedLiteralFor(operand));
+      TF_ASSIGN_OR_RETURN(evaluated_[is_finite], std::move(result_or));
+      break;
+    }
     case F32: {
       auto result_or = ElementWiseUnaryOpImpl<bool, float>(
           is_finite,
@@ -504,9 +553,6 @@ Status HloEvaluator::HandleIsFinite(HloInstruction* is_finite) {
       TF_ASSIGN_OR_RETURN(evaluated_[is_finite], std::move(result_or));
       break;
     }
-    default:
-      LOG(FATAL) << "HandleIsFinite: unknown/unhandled primitive type: "
-                 << PrimitiveType_Name(operand->shape().element_type());
   }
 
   return Status::OK();
@@ -529,6 +575,13 @@ Status HloEvaluator::HandleReal(HloInstruction* real) {
       TF_ASSIGN_OR_RETURN(evaluated_[real], std::move(result_or));
       break;
     }
+    case C128: {
+      auto result_or = ElementWiseUnaryOpImpl<double, complex128>(
+          real, [](complex128 elem_operand) { return std::real(elem_operand); },
+          GetEvaluatedLiteralFor(operand));
+      TF_ASSIGN_OR_RETURN(evaluated_[real], std::move(result_or));
+      break;
+    }
     case F16: {
       auto result_or = ElementWiseUnaryOpImpl<Eigen::half, Eigen::half>(
           real, [](Eigen::half elem_operand) { return elem_operand; },
@@ -559,11 +612,61 @@ Status HloEvaluator::HandleReal(HloInstruction* real) {
 }
 
 Status HloEvaluator::HandleImag(HloInstruction* imag) {
-  auto result_or = ElementWiseUnaryOpImpl<float, complex64>(
-      imag, [](complex64 elem_operand) { return std::imag(elem_operand); },
-      GetEvaluatedLiteralFor(imag->operand(0)));
+  auto operand = imag->operand(0);
+  switch (operand->shape().element_type()) {
+    case C64: {
+      auto result_or = ElementWiseUnaryOpImpl<float, complex64>(
+          imag, [](complex64 elem_operand) { return std::imag(elem_operand); },
+          GetEvaluatedLiteralFor(imag->operand(0)));
+
+      TF_ASSIGN_OR_RETURN(evaluated_[imag], std::move(result_or));
+      break;
+    }
+    case C128: {
+      auto result_or = ElementWiseUnaryOpImpl<double, complex128>(
+          imag, [](complex128 elem_operand) { return std::imag(elem_operand); },
+          GetEvaluatedLiteralFor(imag->operand(0)));
 
-  TF_ASSIGN_OR_RETURN(evaluated_[imag], std::move(result_or));
+      TF_ASSIGN_OR_RETURN(evaluated_[imag], std::move(result_or));
+      break;
+    }
+    default:
+      LOG(FATAL) << "HandleImag: unknown/unhandled primitive type: "
+                 << PrimitiveType_Name(operand->shape().element_type());
+  }
+
+  return Status::OK();
+}
+
+Status HloEvaluator::HandleComplex(HloInstruction* complex) {
+  const Literal& real = GetEvaluatedLiteralFor(complex->operand(0));
+  const Literal& imag = GetEvaluatedLiteralFor(complex->operand(1));
+  TF_RET_CHECK(ShapeUtil::Compatible(real.shape(), imag.shape()));
+
+  Literal result(complex->shape());
+  switch (complex->shape().element_type()) {
+    case C64: {
+      TF_RETURN_IF_ERROR(
+          result.Populate<complex64>([&](absl::Span<const int64> multi_index) {
+            return std::complex<float>(real.Get<float>(multi_index),
+                                       imag.Get<float>(multi_index));
+          }));
+      break;
+    }
+    case C128: {
+      TF_RETURN_IF_ERROR(
+          result.Populate<complex128>([&](absl::Span<const int64> multi_index) {
+            return std::complex<float>(real.Get<double>(multi_index),
+                                       imag.Get<double>(multi_index));
+          }));
+      break;
+    }
+    default:
+      LOG(FATAL) << "HandleComplex: unknown/unhandled primitive type: "
+                 << PrimitiveType_Name(complex->shape().element_type());
+  }
+
+  evaluated_[complex] = std::move(result);
   return Status::OK();
 }
 
@@ -600,8 +703,11 @@ Status HloEvaluator::HandleCompare(HloInstruction* compare) {
           evaluated_[compare],
           Compare<uint8>(compare->shape(), opcode, lhs_literal, rhs_literal));
     } break;
-    case U16:
-      return Unimplemented("unhandled primitive type: U16.");
+    case U16: {
+      TF_ASSIGN_OR_RETURN(
+          evaluated_[compare],
+          Compare<uint16>(compare->shape(), opcode, lhs_literal, rhs_literal));
+    } break;
     case U32: {
       TF_ASSIGN_OR_RETURN(
           evaluated_[compare],
@@ -617,8 +723,11 @@ Status HloEvaluator::HandleCompare(HloInstruction* compare) {
           evaluated_[compare],
           Compare<int8>(compare->shape(), opcode, lhs_literal, rhs_literal));
     } break;
-    case S16:
-      return Unimplemented("unhandled primitive type: S16.");
+    case S16: {
+      TF_ASSIGN_OR_RETURN(
+          evaluated_[compare],
+          Compare<int16>(compare->shape(), opcode, lhs_literal, rhs_literal));
+    } break;
     case S32: {
       TF_ASSIGN_OR_RETURN(
           evaluated_[compare],
@@ -629,8 +738,11 @@ Status HloEvaluator::HandleCompare(HloInstruction* compare) {
           evaluated_[compare],
           Compare<int64>(compare->shape(), opcode, lhs_literal, rhs_literal));
     } break;
-    case F16:
-      return Unimplemented("unhandled primitive type: F16.");
+    case F16: {
+      TF_ASSIGN_OR_RETURN(
+          evaluated_[compare],
+          Compare<half>(compare->shape(), opcode, lhs_literal, rhs_literal));
+    } break;
     case BF16: {
       TF_ASSIGN_OR_RETURN(evaluated_[compare],
                           Compare<bfloat16>(compare->shape(), opcode,
@@ -651,6 +763,11 @@ Status HloEvaluator::HandleCompare(HloInstruction* compare) {
                           Compare<complex64>(compare->shape(), opcode,
                                              lhs_literal, rhs_literal));
     } break;
+    case C128: {
+      TF_ASSIGN_OR_RETURN(evaluated_[compare],
+                          Compare<complex128>(compare->shape(), opcode,
+                                              lhs_literal, rhs_literal));
+    } break;
     default:
       LOG(FATAL) << "HandleCompare: unknown primitive type: "
                  << PrimitiveType_Name(lhs->shape().element_type());
@@ -1032,11 +1149,9 @@ Status HloEvaluator::HandleGather(HloInstruction* gather) {
 Status HloEvaluator::HandleBroadcast(HloInstruction* broadcast) {
   const Literal& operand = GetEvaluatedLiteralFor(broadcast->operand(0));
 
-  TF_RET_CHECK(broadcast->dimensions().size() ==
-               ShapeUtil::Rank(operand.shape()))
+  TF_RET_CHECK(broadcast->dimensions().size() == operand.shape().rank())
       << "broadcast dimensions is of size: " << broadcast->dimensions().size()
-      << " and rank of operand_to_broadcast is: "
-      << ShapeUtil::Rank(operand.shape());
+      << " and rank of operand_to_broadcast is: " << operand.shape().rank();
   // Checks that operand's dimensions are the same as the broadcast's
   // dimensions along the dimensions to be broadcasted.
   for (int64 i = 0; i < broadcast->dimensions().size(); ++i) {
@@ -1109,9 +1224,10 @@ Status HloEvaluator::HandleCall(HloInstruction* call) {
   }
 
   HloEvaluator embedded_evaluator;
-  Literal result =
-      embedded_evaluator.Evaluate<const Literal*>(*computation, arg_literals)
-          .ConsumeValueOrDie();
+  embedded_evaluator.set_dynamic_dimension_inference(
+      dynamic_dimension_inference_);
+  Literal result = embedded_evaluator.Evaluate(*computation, arg_literals)
+                       .ConsumeValueOrDie();
 
   evaluated_[call] = std::move(result);
   return Status::OK();
@@ -1127,7 +1243,9 @@ Status HloEvaluator::HandleFusion(HloInstruction* fusion) {
       fusion->fused_instructions_computation()->Clone(
           /*suffix=*/"clone_with_layout", &context);
   for (auto* instruction : cloned_fused_computation->instructions()) {
-    LayoutUtil::SetToDefaultLayout(instruction->mutable_shape());
+    if (!LayoutUtil::HasLayout(instruction->shape())) {
+      LayoutUtil::SetToDefaultLayout(instruction->mutable_shape());
+    }
   }
   auto readded_computation =
       empty_hlo_module.AddEntryComputation(std::move(cloned_fused_computation));
@@ -1141,9 +1259,10 @@ Status HloEvaluator::HandleFusion(HloInstruction* fusion) {
   }
 
   HloEvaluator embedded_evaluator;
+  embedded_evaluator.set_dynamic_dimension_inference(
+      dynamic_dimension_inference_);
   Literal result =
-      embedded_evaluator
-          .Evaluate<const Literal*>(*readded_computation, arg_literals)
+      embedded_evaluator.Evaluate(*readded_computation, arg_literals)
           .ConsumeValueOrDie();
 
   evaluated_[fusion] = std::move(result);
@@ -1161,16 +1280,16 @@ Status HloEvaluator::HandleConditional(HloInstruction* conditional) {
   auto* false_computation = conditional->false_computation();
 
   HloEvaluator embedded_evaluator;
+  embedded_evaluator.set_dynamic_dimension_inference(
+      dynamic_dimension_inference_);
   Literal result;
   if (pred.Get<bool>({})) {
-    result = embedded_evaluator
-                 .Evaluate<const Literal*>(*true_computation,
-                                           {&true_computation_arg})
-                 .ConsumeValueOrDie();
+    result =
+        embedded_evaluator.Evaluate(*true_computation, {&true_computation_arg})
+            .ConsumeValueOrDie();
   } else {
     result = embedded_evaluator
-                 .Evaluate<const Literal*>(*false_computation,
-                                           {&false_computation_arg})
+                 .Evaluate(*false_computation, {&false_computation_arg})
                  .ConsumeValueOrDie();
   }
 
@@ -1217,18 +1336,21 @@ Status HloEvaluator::HandleWhile(HloInstruction* while_hlo) {
   bool keep_going = true;
   int64 iteration_count = 0;
   HloEvaluator cond_evaluator(max_loop_iterations_);
+  cond_evaluator.set_dynamic_dimension_inference(dynamic_dimension_inference_);
   HloEvaluator loop_body_evaluator(max_loop_iterations_);
+  loop_body_evaluator.set_dynamic_dimension_inference(
+      dynamic_dimension_inference_);
   while (keep_going) {
     if (max_loop_iterations_ >= 0 && iteration_count++ > max_loop_iterations_) {
       return InvalidArgument("Loop %s exceeded loop iteration limit (%d).",
                              while_hlo->name(), max_loop_iterations_);
     }
     TF_ASSIGN_OR_RETURN(auto cond_val,
-                        cond_evaluator.Evaluate<Literal*>(*cond_comp, {&lcv}));
+                        cond_evaluator.Evaluate(*cond_comp, {&lcv}));
     keep_going = cond_val.GetFirstElement<bool>();
     if (keep_going) {
-      TF_ASSIGN_OR_RETURN(auto body_val, loop_body_evaluator.Evaluate<Literal*>(
-                                             *body_comp, {&lcv}));
+      TF_ASSIGN_OR_RETURN(auto body_val,
+                          loop_body_evaluator.Evaluate(*body_comp, {&lcv}));
       VLOG(3) << "Loop iteration result: " << body_val.ToString();
       lcv = std::move(body_val);
       cond_evaluator.ResetVisitStates();
@@ -1239,173 +1361,216 @@ Status HloEvaluator::HandleWhile(HloInstruction* while_hlo) {
   return Status::OK();
 }
 
-// Key-value sort is a special snowflake: it's templated on two different
-// element types, one for the keys, and one for the values. Jump through some
-// hoops to make this work.
 namespace {
-template <typename KeyType, typename ValueType>
-StatusOr<Literal> EvaluateSortInternal(HloInstruction* sort,
-                                       const Literal& keys_literal,
-                                       const Literal& values_literal) {
-  auto rank = ShapeUtil::Rank(keys_literal.shape());
-  TF_RET_CHECK(
-      ShapeUtil::SameDimensions(keys_literal.shape(), values_literal.shape()))
-      << "Sort keys and values must have the same dimensions";
-  TF_RET_CHECK(sort->operand_count() >= 2) << "Expected key-value sort";
-  // We need to sort an array of keys and an array of values, where the
-  // sorted order of the values is determined by the keys. The simplest(?)
-  // way to do this is to go to an array-of-pairs representation, sort the
-  // array using the keys, and then go back to pair-of-arrays.
-  VLOG(3) << "HandleSort keys_literal: " << keys_literal.ToString();
-  VLOG(3) << "HandleSort values_literal: " << values_literal.ToString();
-
-  if (rank == 0) {
-    // Nothing to sort.
-    return LiteralUtil::MakeTuple({&keys_literal, &values_literal});
+template <typename NativeT>
+Literal ExtractLiteralFromIndexPositions(const Literal& from,
+                                         absl::Span<int64 const> indices,
+                                         bool extract_as_scalar) {
+  if (extract_as_scalar) {
+    return LiteralUtil::CreateR0<NativeT>(from.Get<NativeT>({indices[0]}));
   }
+  // We use a InlinedVector here because we need to convert it to an
+  // absl::Span later, and this would not work with std::vector<bool>.
+  absl::InlinedVector<NativeT, 10> values;
+  for (int64 index : indices) {
+    values.push_back(from.Get<NativeT>({index}));
+  }
+  return LiteralUtil::CreateR1<NativeT>(values);
+}
 
-  Literal keys_result_literal(keys_literal.shape());
-  Literal values_result_literal(values_literal.shape());
+StatusOr<Literal> ExtractFromIndexPositions(const Literal& from,
+                                            absl::Span<int64 const> indices,
+                                            bool extract_as_scalar = false) {
+  if (extract_as_scalar) {
+    CHECK_EQ(indices.size(), 1);
+  }
+  PrimitiveType type = from.shape().element_type();
+  switch (type) {
+    case PRED: {
+      return ExtractLiteralFromIndexPositions<bool>(from, indices,
+                                                    extract_as_scalar);
+    }
+    case U8: {
+      return ExtractLiteralFromIndexPositions<uint8>(from, indices,
+                                                     extract_as_scalar);
+    }
+    case S8: {
+      return ExtractLiteralFromIndexPositions<int8>(from, indices,
+                                                    extract_as_scalar);
+    }
+    case BF16: {
+      return ExtractLiteralFromIndexPositions<bfloat16>(from, indices,
+                                                        extract_as_scalar);
+    }
+    case F16: {
+      return ExtractLiteralFromIndexPositions<Eigen::half>(from, indices,
+                                                           extract_as_scalar);
+    }
+    case U16: {
+      return ExtractLiteralFromIndexPositions<uint16>(from, indices,
+                                                      extract_as_scalar);
+    }
+    case S16: {
+      return ExtractLiteralFromIndexPositions<int16>(from, indices,
+                                                     extract_as_scalar);
+    }
+    case F32: {
+      return ExtractLiteralFromIndexPositions<float>(from, indices,
+                                                     extract_as_scalar);
+    }
+    case U32: {
+      return ExtractLiteralFromIndexPositions<uint32>(from, indices,
+                                                      extract_as_scalar);
+    }
+    case S32: {
+      return ExtractLiteralFromIndexPositions<int32>(from, indices,
+                                                     extract_as_scalar);
+    }
+    case F64: {
+      return ExtractLiteralFromIndexPositions<double>(from, indices,
+                                                      extract_as_scalar);
+    }
+    case U64: {
+      return ExtractLiteralFromIndexPositions<uint64>(from, indices,
+                                                      extract_as_scalar);
+    }
+    case S64: {
+      return ExtractLiteralFromIndexPositions<int64>(from, indices,
+                                                     extract_as_scalar);
+    }
+    default:
+      return InvalidArgument("Unsupported type for Sort: %s",
+                             PrimitiveType_Name(type));
+  }
+}
+}  // namespace
+
+Status HloEvaluator::HandleSort(HloInstruction* sort) {
+  TF_RET_CHECK(sort->operand_count() >= 1)
+      << "Expected at least 1 operand for sort";
+  for (int64 i = 1; i < sort->operand_count(); ++i) {
+    TF_RET_CHECK(ShapeUtil::SameDimensions(sort->operand(0)->shape(),
+                                           sort->operand(i)->shape()))
+        << "All Sort operands must have the same dimensions";
+  }
+
+  if (VLOG_IS_ON(3)) {
+    for (int64 i = 0; i < sort->operand_count(); ++i) {
+      VLOG(3) << "HandleSort operand " << i << " literal: "
+              << GetEvaluatedLiteralFor(sort->operand(i)).ToString();
+    }
+  }
+  Shape key_shape = sort->operand(0)->shape();
+  auto rank = key_shape.rank();
+  std::vector<Literal> result_literals;
+  result_literals.reserve(sort->operand_count());
+  for (int64 i = 0; i < sort->operand_count(); ++i) {
+    result_literals.emplace_back(sort->operand(i)->shape());
+  }
   std::vector<int64> zero_base(rank, 0);
   std::vector<int64> increment(rank, 1);
   int64 sort_dim = sort->dimensions(0);
-  int64 sort_dim_elements = keys_literal.shape().dimensions(sort_dim);
+  int64 sort_dim_elements = key_shape.dimensions(sort_dim);
   increment[sort_dim] = sort_dim_elements;
+  HloEvaluator embedded_evaluator(max_loop_iterations_);
   // Iterate through each dimension except 'sort_dim'.
   TF_RETURN_IF_ERROR(ShapeUtil::ForEachIndexWithStatus(
-      keys_literal.shape(), zero_base,
-      AsInt64Slice(keys_literal.shape().dimensions()), increment,
+      key_shape, zero_base, AsInt64Slice(key_shape.dimensions()), increment,
       [&](absl::Span<const int64> indices) -> StatusOr<bool> {
-        // Extract a slice from the keys and values literals that correspond to
+        // Extract a slice from each operand literal that corresponds to
         // exactly the row in dimension 'sort_dim'.
         std::vector<int64> limit_indices(indices.begin(), indices.end());
-        std::for_each(limit_indices.begin(), limit_indices.end(),
-                      [](int64& index) { ++index; });
+        absl::c_for_each(limit_indices, [](int64& index) { ++index; });
         limit_indices[sort_dim] = sort_dim_elements;
-        TF_ASSIGN_OR_RETURN(auto keys_to_sort,
-                            keys_literal.Slice(indices, limit_indices)
-                                .Reshape({sort_dim_elements}));
-        const auto& keys_data = keys_to_sort.data<KeyType>();
-        TF_ASSIGN_OR_RETURN(auto values_to_sort,
-                            values_literal.Slice(indices, limit_indices)
-                                .Reshape({sort_dim_elements}));
-        const auto& values_data = values_to_sort.data<ValueType>();
-        using kv_pair = std::pair<KeyType, ValueType>;
-        std::vector<kv_pair> key_value_vector;
-        key_value_vector.reserve(keys_data.size());
-        for (int i = 0; i < keys_data.size(); ++i) {
-          key_value_vector.push_back(
-              std::make_pair(keys_data[i], values_data[i]));
+        std::vector<Literal> literals_to_sort;
+        literals_to_sort.reserve(sort->operand_count());
+        for (int64 i = 0; i < sort->operand_count(); ++i) {
+          TF_ASSIGN_OR_RETURN(auto literal_to_sort,
+                              GetEvaluatedLiteralFor(sort->operand(i))
+                                  .Slice(indices, limit_indices)
+                                  .Reshape({sort_dim_elements}));
+          literals_to_sort.push_back(std::move(literal_to_sort));
+        }
+        std::vector<int64> indices_to_sort(sort_dim_elements);
+        std::iota(indices_to_sort.begin(), indices_to_sort.end(), 0);
+        Status compare_status = Status::OK();
+        auto comparator = [sort, &compare_status, &embedded_evaluator,
+                           &literals_to_sort](int64 a, int64 b) {
+          std::vector<Literal> literals;
+          literals.reserve(2 * sort->operand_count());
+          for (int64 i = 0; i < sort->operand_count(); ++i) {
+            auto lhs = ExtractFromIndexPositions(literals_to_sort[i], {a},
+                                                 /*extract_as_scalar=*/true);
+            if (!lhs.ok()) {
+              compare_status = lhs.status();
+              return false;
+            }
+            literals.push_back(std::move(lhs.ValueOrDie()));
+            auto rhs = ExtractFromIndexPositions(literals_to_sort[i], {b},
+                                                 /*extract_as_scalar=*/true);
+            if (!rhs.ok()) {
+              compare_status = rhs.status();
+              return false;
+            }
+            literals.push_back(std::move(rhs.ValueOrDie()));
+          }
+          std::vector<const Literal*> literal_ptrs;
+          absl::c_transform(literals, std::back_inserter(literal_ptrs),
+                            [](const Literal& literal) { return &literal; });
+
+          auto computed_result =
+              embedded_evaluator.Evaluate(*sort->to_apply(), literal_ptrs);
+          // Clear visit states so that we can use the evaluator again
+          // on the same computation.
+          embedded_evaluator.ResetVisitStates();
+          if (!computed_result.ok()) {
+            compare_status = computed_result.status();
+            return false;
+          }
+          return computed_result.ValueOrDie().Get<bool>({});
+        };
+        if (Cast<HloSortInstruction>(sort)->is_stable()) {
+          std::stable_sort(indices_to_sort.begin(), indices_to_sort.end(),
+                           comparator);
+        } else {
+          std::sort(indices_to_sort.begin(), indices_to_sort.end(), comparator);
         }
-        std::stable_sort(key_value_vector.begin(), key_value_vector.end(),
-                         [](const kv_pair& a, const kv_pair& b) {
-                           return SafeLess<KeyType>(a.first, b.first);
-                         });
-        std::vector<KeyType> result_keys;
-        // We use a InlinedVector here because we need to convert it to an
-        // absl::Span later, and this would not work with std::vector<bool>.
-        absl::InlinedVector<ValueType, 10> result_values;
-        for (const auto& key_value : key_value_vector) {
-          result_keys.push_back(key_value.first);
-          result_values.push_back(key_value.second);
+        if (!compare_status.ok()) {
+          return compare_status;
         }
-        Literal sorted_keys(ShapeUtil::MakeShape(
-            keys_literal.shape().element_type(), {sort_dim_elements}));
-        sorted_keys.PopulateR1(absl::Span<const KeyType>(result_keys));
-        Literal sorted_values(ShapeUtil::MakeShape(
-            values_literal.shape().element_type(), {sort_dim_elements}));
-        sorted_values.PopulateR1(absl::Span<const ValueType>(result_values));
         std::vector<int64> slice_dimensions(rank, 1);
         slice_dimensions[sort_dim] = sort_dim_elements;
         std::vector<int64> start_indices(rank, 0);
-        TF_ASSIGN_OR_RETURN(auto sorted_keys_reshaped,
-                            sorted_keys.Reshape(slice_dimensions));
-        TF_RETURN_IF_ERROR(keys_result_literal.CopySliceFrom(
-            sorted_keys_reshaped, start_indices, indices, slice_dimensions));
-        TF_ASSIGN_OR_RETURN(auto sorted_values_reshaped,
-                            sorted_values.Reshape(slice_dimensions));
-        TF_RETURN_IF_ERROR(values_result_literal.CopySliceFrom(
-            sorted_values_reshaped, start_indices, indices, slice_dimensions));
+        for (int64 i = 0; i < sort->operand_count(); ++i) {
+          TF_ASSIGN_OR_RETURN(
+              Literal sorted_literal,
+              ExtractFromIndexPositions(literals_to_sort[i], indices_to_sort));
+          TF_ASSIGN_OR_RETURN(auto sorted_literal_reshaped,
+                              sorted_literal.Reshape(slice_dimensions));
+          TF_RETURN_IF_ERROR(result_literals[i].CopySliceFrom(
+              sorted_literal_reshaped, start_indices, indices,
+              slice_dimensions));
+        }
         return true;
       }));
 
-  Literal result_tuple;
-  result_tuple =
-      LiteralUtil::MakeTuple({&keys_result_literal, &values_result_literal});
-  VLOG(3) << "HandleSort result_tuple: " << result_tuple.ToString();
-  return std::move(result_tuple);
-}
-
-template <typename KeyType>
-StatusOr<Literal> EvaluateSortCurried(HloInstruction* sort,
-                                      const Literal& keys_literal,
-                                      const Literal& values_literal) {
-  switch (values_literal.shape().element_type()) {
-    case PRED:
-      return EvaluateSortInternal<KeyType, bool>(sort, keys_literal,
-                                                 values_literal);
-    case F32:
-      return EvaluateSortInternal<KeyType, float>(sort, keys_literal,
-                                                  values_literal);
-    case U32:
-      return EvaluateSortInternal<KeyType, uint32>(sort, keys_literal,
-                                                   values_literal);
-    case S32:
-      return EvaluateSortInternal<KeyType, int32>(sort, keys_literal,
-                                                  values_literal);
-    case BF16:
-      return EvaluateSortInternal<KeyType, bfloat16>(sort, keys_literal,
-                                                     values_literal);
-    default:
-      return InvalidArgument("Unsupported type for Sort");
-  }
-}
-
-StatusOr<Literal> EvaluateSort(HloInstruction* sort,
-                               const Literal& keys_literal,
-                               const Literal& values_literal) {
-  switch (sort->operand(0)->shape().element_type()) {
-    case F32:
-      return EvaluateSortCurried<float>(sort, keys_literal, values_literal);
-    case U32:
-      return EvaluateSortCurried<uint32>(sort, keys_literal, values_literal);
-    case S32:
-      return EvaluateSortCurried<int32>(sort, keys_literal, values_literal);
-    case BF16:
-      return EvaluateSortCurried<bfloat16>(sort, keys_literal, values_literal);
-    default:
-      return InvalidArgument("Unsupported type for Sort");
-  }
-}
-}  // namespace
-
-Status HloEvaluator::HandleSort(HloInstruction* sort) {
-  if (!ShapeUtil::IsTuple(sort->shape())) {
-    return DefaultAction(sort);
+  if (sort->operand_count() == 1) {
+    evaluated_[sort] = std::move(result_literals[0]);
   } else {
-    // This is a really stupid work-around for the fact it's hard to support a
-    // multi-value sort directly, due to the fact we need to template the
-    // evaluation function on all of the value types.
-    std::vector<Literal> sort_results_backing;
-    for (int64 i = 0; i < sort->operand_count(); ++i) {
-      auto result = EvaluateSort(sort, GetEvaluatedLiteralFor(sort->operand(0)),
-                                 GetEvaluatedLiteralFor(sort->operand(i)));
-      if (!result.ok()) {
-        return result.status();
-      }
-      sort_results_backing.push_back(
-          std::move(result.ValueOrDie().DecomposeTuple()[1]));
-    }
-    std::vector<const Literal*> sort_results;
-    absl::c_transform(sort_results_backing, std::back_inserter(sort_results),
+    std::vector<const Literal*> literal_ptrs;
+    absl::c_transform(result_literals, std::back_inserter(literal_ptrs),
                       [](const Literal& literal) { return &literal; });
-    evaluated_[sort] = LiteralUtil::MakeTuple(sort_results);
-    return Status::OK();
+
+    Literal result_tuple = LiteralUtil::MakeTuple(literal_ptrs);
+    VLOG(3) << "HandleSort result_tuple: " << result_tuple.ToString();
+
+    evaluated_[sort] = std::move(result_tuple);
   }
+  return Status::OK();
 }
 
 Status HloEvaluator::HandleReduce(HloInstruction* reduce) {
-  if (!ShapeUtil::IsTuple(reduce->shape())) {
+  if (!reduce->shape().IsTuple()) {
     return DefaultAction(reduce);
   } else {
     auto first_element_type = reduce->shape().tuple_shapes(0).element_type();
@@ -1420,6 +1585,27 @@ Status HloEvaluator::HandleReduce(HloInstruction* reduce) {
   }
 }
 
+Status HloEvaluator::HandleCustomCall(HloInstruction* custom_call) {
+  if (!custom_call_handler_) {
+    // No handler is registered; this means custom-calls are not allowed.
+    return DefaultAction(custom_call);
+  }
+
+  // Evaluate input operands so the handler has access to the operand data.
+  std::vector<const Literal*> operands;
+  operands.reserve(custom_call->operand_count());
+  for (const HloInstruction* operand : custom_call->operands()) {
+    operands.push_back(&GetEvaluatedLiteralFor(operand));
+  }
+
+  // Synchronously issue the handler to populate the instruction output literal.
+  TF_ASSIGN_OR_RETURN(
+      auto output, custom_call_handler_(custom_call, absl::MakeSpan(operands)));
+
+  evaluated_[custom_call] = std::move(output);
+  return Status::OK();
+}
+
 Status HloEvaluator::Preprocess(HloInstruction* hlo) {
   VLOG(2) << "About to visit HLO: " << hlo->ToString();
   return ShapeUtil::ValidateShape(hlo->shape());
@@ -1437,16 +1623,46 @@ Status HloEvaluator::Postprocess(HloInstruction* hlo) {
   return Status::OK();
 }
 
-// Explicit instantiation of templatized Evaluate* methods.
-//
-template StatusOr<Literal> HloEvaluator::Evaluate<const Literal*>(
-    const HloModule& module, absl::Span<const Literal* const> arg_literals);
+namespace {
+template <typename T>
+std::unique_ptr<Array2D<T>> MatmulArray2DImpl(
+    const Array2D<T>& lhs, const Array2D<T>& rhs,
+    const std::function<void(
+        const void* run_options_ptr, T* out, T* lhs, T* rhs, int64 m, int64 n,
+        int64 k, int32 transpose_lhs, int32 transpose_rhs)>& impl_fn) {
+  CHECK_EQ(lhs.width(), rhs.height());
+  int m = lhs.height();
+  int n = rhs.width();
+  int k = lhs.width();
+  auto result = absl::make_unique<Array2D<T>>(m, n);
+  // Because Eigen is a header-oriented library, make sure that the Eigen code
+  // is the same as the code used by the CPU backend (otherwise the linker will
+  // randomly pick *some* definition).
+  impl_fn(
+      /*run_options_ptr=*/nullptr, result->data(), rhs.data(), lhs.data(), n, m,
+      k,
+      /*transpose_lhs=*/0,
+      /*transpose_rhs=*/0);
+  return result;
+}
+}  // namespace
+
+std::unique_ptr<Array2D<Eigen::half>> HloEvaluator::MatmulArray2D(
+    const Array2D<Eigen::half>& lhs, const Array2D<Eigen::half>& rhs) {
+  return MatmulArray2DImpl<Eigen::half>(
+      lhs, rhs, __xla_cpu_runtime_EigenSingleThreadedMatMulF16);
+}
 
-template StatusOr<Literal> HloEvaluator::Evaluate<const Literal*>(
-    const HloComputation& computation,
-    absl::Span<const Literal* const> arg_literals);
+std::unique_ptr<Array2D<float>> HloEvaluator::MatmulArray2D(
+    const Array2D<float>& lhs, const Array2D<float>& rhs) {
+  return MatmulArray2DImpl<float>(
+      lhs, rhs, __xla_cpu_runtime_EigenSingleThreadedMatMulF32);
+}
 
-template StatusOr<Literal> HloEvaluator::Evaluate<const Literal*>(
-    HloInstruction* instruction, absl::Span<const Literal* const> arg_literals);
+std::unique_ptr<Array2D<double>> HloEvaluator::MatmulArray2D(
+    const Array2D<double>& lhs, const Array2D<double>& rhs) {
+  return MatmulArray2DImpl<double>(
+      lhs, rhs, __xla_cpu_runtime_EigenSingleThreadedMatMulF64);
+}
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.h b/tensorflow/compiler/xla/service/hlo_evaluator.h
index 45ed8131dc6b71f706fce45d65b206363dd79ac3..357975a131d0c7e63c06e96852468b43d97a37f2 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.h
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.h
@@ -16,12 +16,16 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EVALUATOR_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EVALUATOR_H_
 
+#include <functional>
 #include <memory>
 
 #include "absl/container/node_hash_map.h"
 #include "absl/memory/memory.h"
 #include "absl/types/span.h"
+#include "tensorflow/compiler/xla/array2d.h"
+#include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
+#include "tensorflow/compiler/xla/service/dynamic_dimension_inference.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
@@ -42,16 +46,24 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
   // specified.
   explicit HloEvaluator(int64 max_loop_iterations = -1);
 
-  // Evaluates an HLO module and an array of pointers to literals.
-  // Returns the evaluated result as a literal if successful.
+  // Evaluates an HLO module and an array of pointers to literals.  Returns the
+  // evaluated result as a literal if successful.
+  //
   // Precondition: The indices of arg_literals correspond to the parameter
   // numbers of the HLO parameters in the computation. See comment below for an
   // example.
-  // `LiteralPtr` accepts either Literal or const Literal*
-  // type.
-  template <typename LiteralPtr>
+  //
+  // (Dummy template arg is to reduce the overloading priority of one overload
+  // so that Evaluate(module, {}) resolves unambiguously.)
+  StatusOr<Literal> Evaluate(const HloModule& module,
+                             absl::Span<const Literal* const> arg_literals) {
+    return Evaluate(*module.entry_computation(), arg_literals);
+  }
+  template <typename Dummy = void>
   StatusOr<Literal> Evaluate(const HloModule& module,
-                             absl::Span<const LiteralPtr> arg_literals);
+                             absl::Span<const Literal> arg_literals) {
+    return Evaluate(*module.entry_computation(), arg_literals);
+  }
 
   // Evaluates an HLO computation and an array of pointers to literals.
   // Returns the evaluated result as a literal if successful.
@@ -69,29 +81,24 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
   // where Parameter0 has parameter_number 0 and Parameter1 has parameter_number
   // 1 in this computation. The input literals array will then have its first
   // literal map to Parameter0 and the second map to Parameter1.
-  // `LiteralPtr` accepts either Literal or const Literal*
-  // type.
-  template <typename LiteralPtr>
+  //
+  // (Dummy template arg is to reduce the overloading priority of one overload
+  // so that Evaluate(module, {}) resolves unambiguously.)
+  StatusOr<Literal> Evaluate(const HloComputation& computation,
+                             absl::Span<const Literal* const> arg_literals);
+  template <typename Dummy = void>
   StatusOr<Literal> Evaluate(const HloComputation& computation,
-                             absl::Span<const LiteralPtr> arg_literals);
-
-  // Evaluates a single HLO instruction and an array of pointers to literals.
-  // Return the evaluated result as literal if successful.
-  // Precondition:
-  // 1. argument literals correspond to the input instruction's parameters in
-  // their post-ordering.
-  // 2. the instruction's operands must be of either Parameter or Constant type.
-  // `LiteralPtr` accepts either Literal or const Literal*
-  // type.
-  template <typename LiteralPtr>
-  StatusOr<Literal> Evaluate(HloInstruction* instruction,
-                             absl::Span<const LiteralPtr> arg_literals);
-
-  // Evaluates a single HLO instruction with constant operands.
-  // Returns the evaluated result as literal if successful.
-  // Precondition:
-  // 1. all operands of the input instruction are constants.
-  // 2. the instruction is not a Parameter operation.
+                             absl::Span<const Literal> arg_literals) {
+    std::vector<const Literal*> arg_literal_ptrs;
+    for (const auto& l : arg_literals) {
+      arg_literal_ptrs.push_back(&l);
+    }
+    return Evaluate(computation, arg_literal_ptrs);
+  }
+
+  // Gets the value of running a single HLO instruction.
+  //
+  // All of the operands to this instruction must be constants.
   StatusOr<Literal> Evaluate(HloInstruction* instruction);
 
   // Same as Evaluate, except returning false on error and accepts an output
@@ -119,6 +126,39 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
                                   const PrecisionConfig& precision_config,
                                   const Literal& lhs, const Literal& rhs);
 
+  void set_dynamic_dimension_inference(
+      DynamicDimensionInference* dynamic_dimension_inference) {
+    dynamic_dimension_inference_ = dynamic_dimension_inference;
+  }
+
+  // Enable the fast path for certain operations like dot or convolution.
+  void set_use_fast_path(bool value) { use_fast_path_ = value; }
+
+  // Handles evaluation of a custom-call op.
+  // Operand literals are provided in |operands| and implementations must
+  // populate |output| before returning.
+  using CustomCallHandler = std::function<StatusOr<Literal>(
+      HloInstruction* custom_call, absl::Span<const Literal*> operands)>;
+
+  // Sets a handler that is called during evaluation for custom-call ops.
+  // If no handler is defined the default error behavior will occur. The handler
+  // will be provided evaluated literals for all operands and is expected to
+  // return an output literal of the appropriate shape.
+  void set_custom_call_handler(
+      std::function<StatusOr<Literal>(HloInstruction* custom_call,
+                                      absl::Span<const Literal*> operands)>
+          handler) {
+    custom_call_handler_ = std::move(handler);
+  }
+
+  // Returns the result of a matrix multiply `lhs x rhs`.
+  static std::unique_ptr<Array2D<Eigen::half>> MatmulArray2D(
+      const Array2D<Eigen::half>& lhs, const Array2D<Eigen::half>& rhs);
+  static std::unique_ptr<Array2D<float>> MatmulArray2D(
+      const Array2D<float>& lhs, const Array2D<float>& rhs);
+  static std::unique_ptr<Array2D<double>> MatmulArray2D(
+      const Array2D<double>& lhs, const Array2D<double>& rhs);
+
  protected:
   // Make HloEvaluatorTypedVisitor a friend because it is logically part of this
   // class.
@@ -146,6 +186,8 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
   //
   Status HandleBitcast(HloInstruction* bitcast) override;
 
+  Status HandleGetDimensionSize(HloInstruction* get_dimension_size) override;
+
   Status HandleParameter(HloInstruction* parameter) override;
 
   Status HandleConstant(HloInstruction* constant) override;
@@ -192,16 +234,51 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
 
   Status HandleImag(HloInstruction* imag) override;
 
+  Status HandleComplex(HloInstruction* complex) override;
+
   Status HandleReduce(HloInstruction* reduce) override;
 
+  Status HandleCustomCall(HloInstruction* custom_call) override;
+
+  // Unsupported HLOs, note some of them (such as BatchNorm*) are typically
+  // expanded in a semantic-preserving way into other HLOs by adding exanpsion
+  // HLO pass to the HLO optimization pass during compilation, which can then be
+  // handled by the evaluator.
+  Status HandleBatchNormGrad(HloInstruction* batch_norm_grad) override {
+    return Unimplemented("BatchNormGrad HLO is unsupported by the evaluator.");
+  };
+  Status HandleBatchNormInference(
+      HloInstruction* batch_norm_inference) override {
+    return Unimplemented(
+        "BatchNormInference HLO is unsupported by the evaluator.");
+  };
+  Status HandleBatchNormTraining(HloInstruction* batch_norm_training) override {
+    return Unimplemented(
+        "BatchNormTraining HLO is unsupported by the evaluator.");
+  };
+  Status HandleInfeed(HloInstruction* infeed) override {
+    return Unimplemented("Infeed HLO is unsupported by the evaluator.");
+  };
+  Status HandleOutfeed(HloInstruction* outfeed) override {
+    return Unimplemented("Outfeed HLO is unsupported by the evaluator.");
+  };
+
   // Returns the already-evaluated literal result for the instruction.
+  //
   // A Constant instruction is considered evaluated and its literal will be
   // returned directly without looking up the cache.
+  //
+  // Similarly, a Parameter instruction is considered evaluated and its literal
+  // is looked up in arg_literals.
+  //
   // Crash with log if the given instruction has not been evaluated previously.
   const Literal& GetEvaluatedLiteralFor(const HloInstruction* hlo) {
     if (hlo->IsConstant()) {
       return hlo->literal();
     }
+    if (hlo->opcode() == HloOpcode::kParameter) {
+      return *arg_literals_.at(hlo->parameter_number());
+    }
     auto it = evaluated_.find(hlo);
     CHECK(it != evaluated_.end())
         << "could not find evaluated value for: " << hlo->ToString();
@@ -209,14 +286,23 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
   }
 
   // Tracks the HLO instruction and its evaluated literal result.
+  //
+  // Parameters and constants aren't stored here, see implementation of
+  // GetEvaluatedLiteralFor.
+  //
   // TODO(b/35950897): have better memory management here to free instructions
   // that are no longer a parent for any other subsequent instruction in
   // post-orderring.
+  //
   // Must be cleared for each evaluation.
-  // Storing Literal in place require the container to have pointer stability so
-  // we cannot use flat_hash_map any more.
+  //
+  // Storing Literal in place requires the container to have pointer stability
+  // so we cannot use flat_hash_map any more.
   absl::node_hash_map<const HloInstruction*, Literal> evaluated_;
 
+  // Use fast path that uses eigen in the evaluator.
+  bool use_fast_path_ = false;
+
  private:
   template <typename ReturnT, typename NativeT>
   static StatusOr<Literal> ElementWiseUnaryOpImpl(
@@ -245,11 +331,27 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
   std::vector<const Literal*> arg_literals_;
 
   // Max loop iterations to execute with no maximum if negative.
-  int64 max_loop_iterations_;
+  int64 max_loop_iterations_ = 0;
+
+  // Module-level seed handle.
+  uint64 seed_ = 0;
+  // RNG engine.
+  std::minstd_rand0 engine_;
+
+  // DynamicDimensionInference is used to evaluate GetDimensionSize, which
+  // returns the dynamic dimension size of its operand.
+  DynamicDimensionInference* dynamic_dimension_inference_ = nullptr;
+
+  // Optional handler for custom_call ops.
+  std::function<StatusOr<Literal>(HloInstruction* custom_call,
+                                  absl::Span<const Literal*> operands)>
+      custom_call_handler_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(HloEvaluator);
 };
 
+std::unique_ptr<Array2D<float>> MatmulArray2D(const Array2D<float>& lhs,
+                                              const Array2D<float>& rhs);
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EVALUATOR_H_
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
index 4eaaab20ea0add17d9b49b1b2b97991af0438dcc..383921fde22242b6ede95a6554f2348ab6fd4277 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
@@ -51,20 +51,18 @@ namespace {
 
 static std::array<bool, 2> use_bf16_params{true, false};
 
-class HloEvaluatorTest : public ::testing::WithParamInterface<bool>,
-                         public HloTestBase {
- protected:
-  HloEvaluatorTest() : HloTestBase(), use_bfloat16_(GetParam()) {
-    evaluator_ = absl::make_unique<HloEvaluator>();
-  }
+// Test fixture for the HloEvaluator.
+//
+// In bf16 mode, all f32 shapes are converted to bf16 before running.
+class HloEvaluatorTest : public HloTestBase {
+ public:
+  HloEvaluatorTest() : use_bfloat16_(false) {}
 
   Literal Evaluate(absl::Span<const Literal* const> arg_literals = {}) {
     if (use_bfloat16_) {
-      // In BF16 mode, we convert all F32 type to BF16 and evaluate the module.
-      auto type_converter = HloElementTypeConverter(F32, BF16);
-      type_converter.Run(m_.get()).ValueOrDie();
+      HloElementTypeConverter(F32, BF16).Run(m_.get()).ValueOrDie();
     }
-    return evaluator_->Evaluate(*m_->entry_computation(), arg_literals)
+    return evaluator_.Evaluate(*m_->entry_computation(), arg_literals)
         .ConsumeValueOrDie();
   }
 
@@ -74,16 +72,12 @@ class HloEvaluatorTest : public ::testing::WithParamInterface<bool>,
   Literal EvaluateWithModule(
       HloModule* module, absl::Span<const Literal* const> arg_literals = {}) {
     if (use_bfloat16_) {
-      // In BF16 mode, we convert all F32 type to BF16 and evaluate the module.
-      auto type_converter = HloElementTypeConverter(F32, BF16);
-      type_converter.Run(module).ValueOrDie();
+      HloElementTypeConverter(F32, BF16).Run(m_.get()).ValueOrDie();
     }
-    return evaluator_->Evaluate(*module->entry_computation(), arg_literals)
+    return evaluator_.Evaluate(*module->entry_computation(), arg_literals)
         .ConsumeValueOrDie();
   }
 
-  std::unique_ptr<HloEvaluator> evaluator_;
-
   void TestUnaryOp(HloOpcode opcode, Literal expected, Literal input,
                    float aabs = 0) {
     HloComputation::Builder b(TestName());
@@ -117,16 +111,45 @@ class HloEvaluatorTest : public ::testing::WithParamInterface<bool>,
     EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
   }
 
-  bool use_bfloat16_;
+  void TestTernaryOp(HloOpcode opcode, Literal expected, Literal src0,
+                     Literal src1, Literal src2) {
+    HloComputation::Builder b(TestName());
+    auto operand0 =
+        b.AddInstruction(HloInstruction::CreateConstant(std::move(src0)));
+    auto operand1 =
+        b.AddInstruction(HloInstruction::CreateConstant(std::move(src1)));
+    auto operand2 =
+        b.AddInstruction(HloInstruction::CreateConstant(std::move(src2)));
+    b.AddInstruction(HloInstruction::CreateTernary(
+        expected.shape(), opcode, operand0, operand1, operand2));
+    m_->AddEntryComputation(b.Build());
+
+    Literal result = Evaluate();
+
+    EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
+  }
+
+ protected:
+  explicit HloEvaluatorTest(bool use_bfloat16) : use_bfloat16_(use_bfloat16) {}
+  HloEvaluator evaluator_;
+
+  const bool use_bfloat16_;
   std::unique_ptr<HloModule> m_ = CreateNewVerifiedModule();
 };
 
-#define XLA_TYPED_TEST_P(test_case_name, test_name, test_type1) \
-  TEST_P(test_case_name, test_name)
+// Lets you write TEST_Ps that run twice, once with and once without bf16.
+class HloEvaluatorBf16Test : public ::testing::WithParamInterface<bool>,
+                             public HloEvaluatorTest {
+ protected:
+  HloEvaluatorBf16Test() : HloEvaluatorTest(/*use_bfloat16=*/GetParam()) {}
+};
+
+INSTANTIATE_TEST_SUITE_P(HloEvaluatorTest_Instantiation, HloEvaluatorBf16Test,
+                         ::testing::ValuesIn(use_bf16_params));
 
 // Verifies that HloEvaluator evaluates a HLO instruction that performs clamp
 // with 3 operands.
-TEST_P(HloEvaluatorTest, DoesClamp) {
+TEST_P(HloEvaluatorBf16Test, DoesClamp) {
   auto low = LiteralUtil::CreateR2<float>({{0.f, 2.f}, {2.f, 4.f}});
   auto value = LiteralUtil::CreateR2<float>({{0.f, 5.f}, {0.f, 4.f}});
   auto high = LiteralUtil::CreateR2<float>({{2.f, 4.f}, {4.f, 4.f}});
@@ -147,7 +170,34 @@ TEST_P(HloEvaluatorTest, DoesClamp) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, DISABLED_DoesClampSpecialBroadcast) {
+// Verifies that clamping of int64 does not cause loss of precision
+TEST_P(HloEvaluatorBf16Test, DoesClampInt64) {
+  auto ones = [](int bits) { return (int64{1} << bits) - 1; };
+
+  auto low =
+      LiteralUtil::CreateR2<int64>({{0, ones(54)}, {ones(54), ones(58)}});
+  auto value = LiteralUtil::CreateR2<int64>({{0, ones(56)}, {0, ones(58)}});
+  auto high = LiteralUtil::CreateR2<int64>(
+      {{ones(54), ones(55)}, {ones(56), ones(58)}});
+
+  Shape shape = low.shape();
+  HloComputation::Builder b(TestName());
+  auto c1 = b.AddInstruction(HloInstruction::CreateConstant(std::move(low)));
+  auto c2 = b.AddInstruction(HloInstruction::CreateConstant(std::move(value)));
+  auto c3 = b.AddInstruction(HloInstruction::CreateConstant(std::move(high)));
+  b.AddInstruction(
+      HloInstruction::CreateTernary(shape, HloOpcode::kClamp, c1, c2, c3));
+  m_->AddEntryComputation(b.Build());
+
+  Literal result = Evaluate();
+
+  auto expected =
+      LiteralUtil::CreateR2<int64>({{0, ones(55)}, {ones(54), ones(58)}});
+
+  EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
+}
+
+TEST_P(HloEvaluatorBf16Test, DISABLED_DoesClampSpecialBroadcast) {
   auto low = LiteralUtil::CreateR0<float>(0.f);
   auto value = LiteralUtil::CreateR2<float>({{-1.f, 0.f}, {1.f, 2.f}});
   auto high = LiteralUtil::CreateR0<float>(1.f);
@@ -170,7 +220,7 @@ TEST_P(HloEvaluatorTest, DISABLED_DoesClampSpecialBroadcast) {
 
 // Verifies that HloEvaluator evaluates a HLO instruction that performs select
 // with 3 operands.
-TEST_P(HloEvaluatorTest, DoesSelect) {
+TEST_P(HloEvaluatorBf16Test, DoesSelect) {
   auto pred = LiteralUtil::CreateR2<bool>({{true, false}, {false, true}});
   auto on_true = LiteralUtil::CreateR2<float>({{2.f, 4.f}, {4.f, 4.f}});
   auto on_false = LiteralUtil::CreateR2<float>({{0.f, 5.f}, {0.f, 4.f}});
@@ -195,7 +245,7 @@ TEST_P(HloEvaluatorTest, DoesSelect) {
 
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise addition with 2 operands.
-TEST_P(HloEvaluatorTest, DoesAdd) {
+TEST_F(HloEvaluatorTest, DoesAdd) {
   auto lhs = LiteralUtil::CreateR2<int64>({{1, 0}, {-100, 4}});
   auto rhs = LiteralUtil::CreateR2<int64>({{2, 4}, {4, 4}});
   auto expected = LiteralUtil::CreateR2<int64>({{3, 4}, {-96, 8}});
@@ -204,7 +254,7 @@ TEST_P(HloEvaluatorTest, DoesAdd) {
 }
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise and with 2 operands.
-TEST_P(HloEvaluatorTest, DoesAnd) {
+TEST_P(HloEvaluatorBf16Test, DoesAnd) {
   auto lhs = LiteralUtil::CreateR2<int64>({{1, 0}, {-100, 4}});
   auto rhs = LiteralUtil::CreateR2<int64>({{2, 4}, {4, 4}});
   auto expected = LiteralUtil::CreateR2<int64>({{0, 0}, {4, 4}});
@@ -213,7 +263,7 @@ TEST_P(HloEvaluatorTest, DoesAnd) {
 }
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise or with 2 operands.
-TEST_P(HloEvaluatorTest, DoesOr) {
+TEST_F(HloEvaluatorTest, DoesOr) {
   auto lhs = LiteralUtil::CreateR2<int64>({{1, 0}, {-100, 4}});
   auto rhs = LiteralUtil::CreateR2<int64>({{2, 4}, {4, 4}});
   auto expected = LiteralUtil::CreateR2<int64>({{3, 4}, {-100, 4}});
@@ -222,7 +272,7 @@ TEST_P(HloEvaluatorTest, DoesOr) {
 }
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise or with 2 operands.
-TEST_P(HloEvaluatorTest, DoesXor) {
+TEST_F(HloEvaluatorTest, DoesXor) {
   auto lhs = LiteralUtil::CreateR2<int64>({{1, 0}, {-100, 4}});
   auto rhs = LiteralUtil::CreateR2<int64>({{2, 4}, {4, 4}});
   auto expected = LiteralUtil::CreateR2<int64>({{3, 4}, {-104, 0}});
@@ -231,7 +281,7 @@ TEST_P(HloEvaluatorTest, DoesXor) {
 }
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise multiply with 2 operands.
-TEST_P(HloEvaluatorTest, DoesMultiply) {
+TEST_F(HloEvaluatorTest, DoesMultiply) {
   auto lhs = LiteralUtil::CreateR2<int32>({{-1, 0}, {-100, 4}});
   auto rhs = LiteralUtil::CreateR2<int32>(
       {{std::numeric_limits<int32>::min(), 4}, {4, 4}});
@@ -242,14 +292,28 @@ TEST_P(HloEvaluatorTest, DoesMultiply) {
 }
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise divide with 2 operands.
-TEST_P(HloEvaluatorTest, DoesDivideInt64) {
+TEST_F(HloEvaluatorTest, DoesDivideInt64) {
   auto lhs = LiteralUtil::CreateR2<int64>({{1, 0}, {-100, 4}});
   auto rhs = LiteralUtil::CreateR2<int64>({{2, 4}, {4, 4}});
   auto expected = LiteralUtil::CreateR2<int64>({{0, 0}, {-25, 1}});
   TestBinaryOp(HloOpcode::kDivide, std::move(expected), std::move(lhs),
                std::move(rhs));
 }
-TEST_P(HloEvaluatorTest, DoesDivideDouble) {
+
+TEST_F(HloEvaluatorTest, DoesClampS64) {
+  auto low = LiteralUtil::CreateR1<int64>(
+      {-8616761059752331528LL, 6780561065411491190LL, -8616761059752331528LL});
+  auto value = LiteralUtil::CreateR1<int64>(
+      {-6780561065411491190LL, 6780561065411491180LL, 4241131823772864090LL});
+  auto high = LiteralUtil::CreateR1<int64>(
+      {-6780561065411491180LL, 8616761059752331528LL, 3832151243857508051LL});
+  auto expected = LiteralUtil::CreateR1<int64>(
+      {-6780561065411491190LL, 6780561065411491190LL, 3832151243857508051LL});
+  TestTernaryOp(HloOpcode::kClamp, std::move(expected), std::move(low),
+                std::move(value), std::move(high));
+}
+
+TEST_P(HloEvaluatorBf16Test, DoesDivideDouble) {
   auto lhs = LiteralUtil::CreateR2<double>({{1.0, 0.0}, {-100.0, 4.0}});
   auto rhs = LiteralUtil::CreateR2<double>({{2.2, 4.0}, {4.0, 4.0}});
   auto expected =
@@ -260,41 +324,41 @@ TEST_P(HloEvaluatorTest, DoesDivideDouble) {
 
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise abs op with 1 operand.
-TEST_P(HloEvaluatorTest, DoesAbsR2) {
+TEST_F(HloEvaluatorTest, DoesAbsR2) {
   auto operand = LiteralUtil::CreateR2<int64>({{1, -20}, {-100, 4}});
   auto expected = LiteralUtil::CreateR2<int64>({{1, 20}, {100, 4}});
   TestUnaryOp(HloOpcode::kAbs, std::move(expected), std::move(operand));
 }
-TEST_P(HloEvaluatorTest, DoesAbsR0) {
+TEST_P(HloEvaluatorBf16Test, DoesAbsR0) {
   auto operand = LiteralUtil::CreateR0<float>(-1.0f);
   auto expected = LiteralUtil::CreateR0<float>(1.0f);
   TestUnaryOp(HloOpcode::kAbs, std::move(expected), std::move(operand));
 }
-TEST_P(HloEvaluatorTest, DoesAbsR1WithZeroSize) {
+TEST_P(HloEvaluatorBf16Test, DoesAbsR1WithZeroSize) {
   auto operand = LiteralUtil::CreateR1<float>({});
   auto expected = LiteralUtil::CreateR1<float>({});
   TestUnaryOp(HloOpcode::kAbs, std::move(expected), std::move(operand));
 }
-TEST_P(HloEvaluatorTest, DoesNegateR2) {
+TEST_F(HloEvaluatorTest, DoesNegateR2) {
   auto operand = LiteralUtil::CreateR2<int32>(
       {{0, std::numeric_limits<int32>::min()}, {-1, 4}});
   auto expected = LiteralUtil::CreateR2<int32>(
       {{0, std::numeric_limits<int>::min()}, {1, -4}});
   TestUnaryOp(HloOpcode::kNegate, std::move(expected), std::move(operand));
 }
-TEST_P(HloEvaluatorTest, DoesCosR2) {
+TEST_P(HloEvaluatorBf16Test, DoesCosR2) {
   auto operand = LiteralUtil::CreateR2<float>({{0, M_PI}, {-M_PI, 2 * M_PI}});
   auto expected = LiteralUtil::CreateR2<float>({{1, -1}, {-1, 1}});
   TestUnaryOp(HloOpcode::kCos, std::move(expected), std::move(operand),
               use_bfloat16_ ? 0.031250 : 9.5367431640625E-7);
 }
-TEST_P(HloEvaluatorTest, DoesSinR2) {
+TEST_P(HloEvaluatorBf16Test, DoesSinR2) {
   auto operand = LiteralUtil::CreateR2<float>({{0, M_PI}, {-M_PI, 2 * M_PI}});
   auto expected = LiteralUtil::CreateR2<float>({{0, 0}, {0, 0}});
   TestUnaryOp(HloOpcode::kSin, std::move(expected), std::move(operand),
               use_bfloat16_ ? 0.031250 : 9.5367431640625E-7);
 }
-TEST_P(HloEvaluatorTest, DoesNotR2) {
+TEST_F(HloEvaluatorTest, DoesNotR2) {
   auto operand =
       LiteralUtil::CreateR2<int32>({{0, std::numeric_limits<int>::min()},
                                     {-1, std::numeric_limits<int>::max()}});
@@ -303,9 +367,22 @@ TEST_P(HloEvaluatorTest, DoesNotR2) {
                                     {0, std::numeric_limits<int>::min()}});
   TestUnaryOp(HloOpcode::kNot, std::move(expected), std::move(operand));
 }
+
+TEST_F(HloEvaluatorTest, DoesRealC128) {
+  auto x = LiteralUtil::CreateR1<complex128>({{1, 0}, {-100, 4}});
+  auto expected_real = LiteralUtil::CreateR1<double>({1, -100});
+  TestUnaryOp(HloOpcode::kReal, std::move(expected_real), std::move(x));
+}
+
+TEST_F(HloEvaluatorTest, DoesImagC128) {
+  auto x = LiteralUtil::CreateR1<complex128>({{1, 0}, {-100, 4}});
+  auto expected_imag = LiteralUtil::CreateR1<double>({0, 4});
+  TestUnaryOp(HloOpcode::kImag, std::move(expected_imag), std::move(x));
+}
+
 // Verifies that HloEvaluator evaluates a HLO Computation with non-parameter nor
 // constant operands.
-TEST_P(HloEvaluatorTest, DoesTraverseInstructions) {
+TEST_F(HloEvaluatorTest, DoesTraverseInstructions) {
   auto lhs = LiteralUtil::CreateR2<int64>({{1, 0}, {-100, 4}});
   auto rhs = LiteralUtil::CreateR2<int64>({{2, 4}, {4, 4}});
   auto rhs2 = LiteralUtil::CreateR2<int64>({{1, -20}, {-100, 4}});
@@ -335,7 +412,7 @@ TEST_P(HloEvaluatorTest, DoesTraverseInstructions) {
 }
 
 // Verifies Reshape operation is correctly evaluated.
-TEST_P(HloEvaluatorTest, DoesReshape) {
+TEST_F(HloEvaluatorTest, DoesReshape) {
   HloComputation::Builder b(TestName());
   const int64 dimensions[] = {11, 8, 7, 5, 9};
   TF_ASSERT_OK_AND_ASSIGN(auto literal,
@@ -361,7 +438,7 @@ TEST_P(HloEvaluatorTest, DoesReshape) {
 }
 
 // Verifies Broadcast operation is correctly evaluated.
-TEST_P(HloEvaluatorTest, DoesBroadcast) {
+TEST_F(HloEvaluatorTest, DoesBroadcast) {
   HloComputation::Builder b(TestName());
   auto input_literal = LiteralUtil::CreateR2<int32>({{1, 2}, {3, 4}, {5, 6}});
   auto output_literal = LiteralUtil::CreateR3<int32>(
@@ -377,7 +454,7 @@ TEST_P(HloEvaluatorTest, DoesBroadcast) {
   EXPECT_TRUE(LiteralTestUtil::Equal(result, output_literal));
 }
 
-TEST_P(HloEvaluatorTest, DoesBroadcastScalar) {
+TEST_F(HloEvaluatorTest, DoesBroadcastScalar) {
   HloComputation::Builder b(TestName());
   auto input_literal = LiteralUtil::CreateR0<int32>(111);
   auto output_literal = LiteralUtil::CreateR2<int32>(
@@ -396,7 +473,7 @@ TEST_P(HloEvaluatorTest, DoesBroadcastScalar) {
   EXPECT_TRUE(LiteralTestUtil::Equal(result, output_literal));
 }
 
-TEST_P(HloEvaluatorTest, DoesConcatenateSimple) {
+TEST_F(HloEvaluatorTest, DoesConcatenateSimple) {
   HloComputation::Builder b(TestName());
 
   HloInstruction* operand1 = b.AddInstruction(HloInstruction::CreateConstant(
@@ -418,7 +495,7 @@ TEST_P(HloEvaluatorTest, DoesConcatenateSimple) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, ConcatenateHandlesShapeWithZeroElement) {
+TEST_F(HloEvaluatorTest, ConcatenateHandlesShapeWithZeroElement) {
   HloComputation::Builder b(TestName());
 
   HloInstruction* operand1 = b.AddInstruction(
@@ -439,7 +516,7 @@ TEST_P(HloEvaluatorTest, ConcatenateHandlesShapeWithZeroElement) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, ConvertWithSameLayout) {
+TEST_P(HloEvaluatorBf16Test, ConvertWithSameLayout) {
   HloComputation::Builder b(TestName());
 
   auto input_literal = LiteralUtil::CreateR2<int32>({{1, 2}, {3, 4}, {5, 6}});
@@ -458,7 +535,7 @@ TEST_P(HloEvaluatorTest, ConvertWithSameLayout) {
   EXPECT_TRUE(LiteralTestUtil::Equal(result, expected));
 }
 
-TEST_P(HloEvaluatorTest, ConvertWithDifferentLayout) {
+TEST_P(HloEvaluatorBf16Test, ConvertWithDifferentLayout) {
   HloComputation::Builder b(TestName());
 
   auto input_literal = LiteralUtil::CreateR2WithLayout<int32>(
@@ -491,7 +568,7 @@ PaddingConfig CreatePaddingConfig(
   return padding_config;
 }
 
-TEST_P(HloEvaluatorTest, Pad2DIntegerArrayWithZeroDimension) {
+TEST_F(HloEvaluatorTest, Pad2DIntegerArrayWithZeroDimension) {
   auto operand = LiteralUtil::CreateR2<int32>({{}, {}});
   HloComputation::Builder b(TestName());
   auto operand_instruction =
@@ -516,7 +593,7 @@ TEST_P(HloEvaluatorTest, Pad2DIntegerArrayWithZeroDimension) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, Pad4DFloatArrayWithInteriorPadding) {
+TEST_P(HloEvaluatorBf16Test, Pad4DFloatArrayWithInteriorPadding) {
   HloComputation::Builder b(TestName());
 
   Array4D<float> input_array(3, 2, 1, 1, {1, 2, 3, 4, 5, 6});
@@ -551,7 +628,7 @@ TEST_P(HloEvaluatorTest, Pad4DFloatArrayWithInteriorPadding) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, NegativePadding2D) {
+TEST_P(HloEvaluatorBf16Test, NegativePadding2D) {
   HloComputation::Builder b(TestName());
 
   // input_array:
@@ -593,7 +670,7 @@ TEST_P(HloEvaluatorTest, NegativePadding2D) {
   EXPECT_TRUE(LiteralTestUtil::Near(expected, result, ErrorSpec(0.031250)));
 }
 
-TEST_P(HloEvaluatorTest, NegativeAndInteriorPadding2D) {
+TEST_P(HloEvaluatorBf16Test, NegativeAndInteriorPadding2D) {
   HloComputation::Builder b(TestName());
 
   // f32[4,3] {
@@ -632,7 +709,7 @@ TEST_P(HloEvaluatorTest, NegativeAndInteriorPadding2D) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, DotRank2AndRank1) {
+TEST_P(HloEvaluatorBf16Test, DotRank2AndRank1) {
   HloComputation::Builder b(TestName());
 
   // lhs:
@@ -678,7 +755,7 @@ TEST_P(HloEvaluatorTest, DotRank2AndRank1) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, DotRank1AndRank2) {
+TEST_P(HloEvaluatorBf16Test, DotRank1AndRank2) {
   HloComputation::Builder b(TestName());
 
   // lhs:
@@ -716,7 +793,7 @@ TEST_P(HloEvaluatorTest, DotRank1AndRank2) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, DotRank2AndRank2) {
+TEST_P(HloEvaluatorBf16Test, DotRank2AndRank2) {
   HloComputation::Builder b(TestName());
 
   // lhs:
@@ -766,7 +843,51 @@ TEST_P(HloEvaluatorTest, DotRank2AndRank2) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, SimpleConv1D) {
+TEST_P(HloEvaluatorBf16Test, DotRank4AndRank4) {
+  HloComputation::Builder b(TestName());
+
+  auto lhs_array = absl::make_unique<Array4D<float>>(2, 2, 3, 1);
+  lhs_array->FillIota(1.0f);
+  auto lhs_literal = LiteralUtil::CreateR4FromArray4D<float>(*lhs_array);
+  HloInstruction* lhs_instruction =
+      b.AddInstruction(HloInstruction::CreateConstant(std::move(lhs_literal)));
+
+  auto rhs_array = absl::make_unique<Array4D<float>>(2, 2, 3, 1);
+  rhs_array->FillIota(2.0f);
+  auto rhs_literal = LiteralUtil::CreateR4FromArray4D<float>(*rhs_array);
+  HloInstruction* rhs_instruction =
+      b.AddInstruction(HloInstruction::CreateConstant(std::move(rhs_literal)));
+
+  Shape shape = ShapeUtil::MakeShape(F32, {2, 1, 1});
+  DotDimensionNumbers dot_dnums;
+
+  dot_dnums.add_lhs_batch_dimensions(0);
+  dot_dnums.add_rhs_batch_dimensions(0);
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_lhs_contracting_dimensions(2);
+  dot_dnums.add_rhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(2);
+  b.AddInstruction(HloInstruction::CreateDot(shape, lhs_instruction,
+                                             rhs_instruction, dot_dnums,
+                                             DefaultPrecisionConfig(2)));
+  m_->AddEntryComputation(b.Build());
+
+  Literal result = Evaluate();
+  float expected_1 = 0;
+  for (float i = 1.0f; i < 7.0f; ++i) {
+    expected_1 += i * i + i;
+  }
+  float expected_2 = 0;
+  for (float i = 7.0f; i < 13.0f; ++i) {
+    expected_2 += i * i + i;
+  }
+  auto expected_array = Array3D<float>({{{expected_1}}, {{expected_2}}});
+  auto expected = LiteralUtil::CreateR3FromArray3D<float>(expected_array);
+
+  EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
+}
+
+TEST_P(HloEvaluatorBf16Test, SimpleConv1D) {
   HloComputation::Builder b(TestName());
 
   Array3D<float> lhs_array = {{{1, 2, 3}}};
@@ -804,7 +925,7 @@ TEST_P(HloEvaluatorTest, SimpleConv1D) {
   Shape shape = ShapeUtil::MakeShape(F32, {1, 1, 3});
   b.AddInstruction(HloInstruction::CreateConvolve(
       shape, lhs_instruction, rhs_instruction, /*feature_group_count=*/1,
-      window, dnums, DefaultPrecisionConfig(2)));
+      /*batch_group_count=*/1, window, dnums, DefaultPrecisionConfig(2)));
   m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
@@ -815,7 +936,7 @@ TEST_P(HloEvaluatorTest, SimpleConv1D) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, Simple4x4Conv2DWith2x2Kernel) {
+TEST_P(HloEvaluatorBf16Test, Simple4x4Conv2DWith2x2Kernel) {
   HloComputation::Builder b(TestName());
 
   Array4D<float> lhs_array(1, 1, 4, 4);
@@ -859,7 +980,7 @@ TEST_P(HloEvaluatorTest, Simple4x4Conv2DWith2x2Kernel) {
   Shape shape = ShapeUtil::MakeShape(F32, {1, 1, 4, 4});
   b.AddInstruction(HloInstruction::CreateConvolve(
       shape, lhs_instruction, rhs_instruction, /*feature_group_count=*/1,
-      window, dnums, DefaultPrecisionConfig(2)));
+      /*batch_group_count=*/1, window, dnums, DefaultPrecisionConfig(2)));
   m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
@@ -878,7 +999,7 @@ TEST_P(HloEvaluatorTest, Simple4x4Conv2DWith2x2Kernel) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, Conv2DGeneralDimensionsReversed) {
+TEST_P(HloEvaluatorBf16Test, Conv2DGeneralDimensionsReversed) {
   HloComputation::Builder b(TestName());
 
   // clang-format off
@@ -943,7 +1064,7 @@ TEST_P(HloEvaluatorTest, Conv2DGeneralDimensionsReversed) {
   Shape shape = ShapeUtil::MakeShape(F32, {1, 1, 1, 2});
   b.AddInstruction(HloInstruction::CreateConvolve(
       shape, lhs_instruction, rhs_instruction, /*feature_group_count=*/1,
-      window, dnums, DefaultPrecisionConfig(2)));
+      /*batch_group_count=*/1, window, dnums, DefaultPrecisionConfig(2)));
   m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
@@ -959,7 +1080,7 @@ TEST_P(HloEvaluatorTest, Conv2DGeneralDimensionsReversed) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, Conv2DGeneralDimensions) {
+TEST_P(HloEvaluatorBf16Test, Conv2DGeneralDimensions) {
   HloComputation::Builder b(TestName());
 
   // clang-format off
@@ -1021,7 +1142,7 @@ TEST_P(HloEvaluatorTest, Conv2DGeneralDimensions) {
   Shape shape = ShapeUtil::MakeShape(F32, {1, 1, 1, 2});
   b.AddInstruction(HloInstruction::CreateConvolve(
       shape, lhs_instruction, rhs_instruction, /*feature_group_count=*/1,
-      window, dnums, DefaultPrecisionConfig(2)));
+      /*batch_group_count=*/1, window, dnums, DefaultPrecisionConfig(2)));
   m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
@@ -1037,7 +1158,7 @@ TEST_P(HloEvaluatorTest, Conv2DGeneralDimensions) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, DilatedBaseConv2DWithHighPadding) {
+TEST_P(HloEvaluatorBf16Test, DilatedBaseConv2DWithHighPadding) {
   HloComputation::Builder b(TestName());
 
   Array4D<float> lhs_array(1, 1, 4, 4);
@@ -1081,7 +1202,7 @@ TEST_P(HloEvaluatorTest, DilatedBaseConv2DWithHighPadding) {
   Shape shape = ShapeUtil::MakeShape(F32, {1, 1, 7, 7});
   b.AddInstruction(HloInstruction::CreateConvolve(
       shape, lhs_instruction, rhs_instruction, /*feature_group_count=*/1,
-      window, dnums, DefaultPrecisionConfig(2)));
+      /*batch_group_count=*/1, window, dnums, DefaultPrecisionConfig(2)));
   m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
@@ -1101,7 +1222,7 @@ TEST_P(HloEvaluatorTest, DilatedBaseConv2DWithHighPadding) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, DilatedBaseConv2DWithLowAndHighPadding) {
+TEST_P(HloEvaluatorBf16Test, DilatedBaseConv2DWithLowAndHighPadding) {
   HloComputation::Builder b(TestName());
 
   Array4D<float> lhs_array(1, 1, 4, 4);
@@ -1145,7 +1266,7 @@ TEST_P(HloEvaluatorTest, DilatedBaseConv2DWithLowAndHighPadding) {
   Shape shape = ShapeUtil::MakeShape(F32, {1, 1, 8, 8});
   b.AddInstruction(HloInstruction::CreateConvolve(
       shape, lhs_instruction, rhs_instruction, /*feature_group_count=*/1,
-      window, dnums, DefaultPrecisionConfig(2)));
+      /*batch_group_count=*/1, window, dnums, DefaultPrecisionConfig(2)));
   m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
@@ -1166,7 +1287,7 @@ TEST_P(HloEvaluatorTest, DilatedBaseConv2DWithLowAndHighPadding) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest,
+TEST_P(HloEvaluatorBf16Test,
        DilatedWindowAndBaseConv2DWithDifferentLowAndHighPaddingAndStrides) {
   HloComputation::Builder b(TestName());
 
@@ -1217,7 +1338,7 @@ TEST_P(HloEvaluatorTest,
   Shape shape = ShapeUtil::MakeShape(F32, {1, 1, 9, 3});
   b.AddInstruction(HloInstruction::CreateConvolve(
       shape, lhs_instruction, rhs_instruction, /*feature_group_count=*/1,
-      window, dnums, DefaultPrecisionConfig(2)));
+      /*batch_group_count=*/1, window, dnums, DefaultPrecisionConfig(2)));
   m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
@@ -1239,7 +1360,7 @@ TEST_P(HloEvaluatorTest,
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, Conv2DGroupedConvolution) {
+TEST_P(HloEvaluatorBf16Test, Conv2DGroupedConvolution) {
   HloComputation::Builder b(TestName());
   std::vector<int64> input_dims = {1, 2, 2, 4};
   std::vector<int64> filter_dims = {2, 2, 2, 8};
@@ -1288,7 +1409,8 @@ TEST_P(HloEvaluatorTest, Conv2DGroupedConvolution) {
   Shape shape = ShapeUtil::MakeShape(F32, {1, 1, 1, 8});
   b.AddInstruction(HloInstruction::CreateConvolve(
       shape, lhs_instruction, rhs_instruction,
-      /*feature_group_count=*/2, window, dnums, DefaultPrecisionConfig(2)));
+      /*feature_group_count=*/2, /*batch_group_count=*/1, window, dnums,
+      DefaultPrecisionConfig(2)));
   m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
@@ -1374,7 +1496,7 @@ void BM_ReducePrecisely(int num_iters) {
 
 BENCHMARK(BM_ReducePrecisely);
 
-TEST_P(HloEvaluatorTest, ReduceAdd) {
+TEST_P(HloEvaluatorBf16Test, ReduceAdd) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1416,7 +1538,7 @@ TEST_P(HloEvaluatorTest, ReduceAdd) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, ReduceWindowMax) {
+TEST_P(HloEvaluatorBf16Test, ReduceWindowMax) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1467,7 +1589,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowMax) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, ReduceWindowMaxWindowDilation) {
+TEST_P(HloEvaluatorBf16Test, ReduceWindowMaxWindowDilation) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1519,7 +1641,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowMaxWindowDilation) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, ReduceWindowAdd) {
+TEST_P(HloEvaluatorBf16Test, ReduceWindowAdd) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1576,7 +1698,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowAdd) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, ReduceWindowAdd6D) {
+TEST_P(HloEvaluatorBf16Test, ReduceWindowAdd6D) {
   HloComputation::Builder b(TestName());
 
   // arg: f32[4,4,4,4,4,4] full of ones. Using small dims to limit run-time.
@@ -1639,7 +1761,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowAdd6D) {
   EXPECT_TRUE(LiteralTestUtil::Equal(result_literal, result));
 }
 
-TEST_P(HloEvaluatorTest, StridedSlice) {
+TEST_P(HloEvaluatorBf16Test, StridedSlice) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1673,7 +1795,7 @@ TEST_P(HloEvaluatorTest, StridedSlice) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, DynamicSlice) {
+TEST_P(HloEvaluatorBf16Test, DynamicSlice) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1689,12 +1811,14 @@ TEST_P(HloEvaluatorTest, DynamicSlice) {
   HloInstruction* operand = b.AddInstruction(
       HloInstruction::CreateConstant(std::move(operand_literal)));
 
-  auto start_indices = b.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({0, 1})));
+  auto zero = b.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(0)));
+  auto one = b.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(1)));
 
   Shape shape = ShapeUtil::MakeShape(F32, {2, 3});
-  b.AddInstruction(HloInstruction::CreateDynamicSlice(shape, operand,
-                                                      start_indices, {2, 3}));
+  b.AddInstruction(
+      HloInstruction::CreateDynamicSlice(shape, operand, {zero, one}, {2, 3}));
   m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
@@ -1709,7 +1833,7 @@ TEST_P(HloEvaluatorTest, DynamicSlice) {
 
 // Verifies that the HloEvaluator's implementation goes along with existing
 // backends' behavior, although this is not required by the spec.
-TEST_P(HloEvaluatorTest, DynamicSliceModSlice) {
+TEST_P(HloEvaluatorBf16Test, DynamicSliceModSlice) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1725,12 +1849,14 @@ TEST_P(HloEvaluatorTest, DynamicSliceModSlice) {
   HloInstruction* operand = b.AddInstruction(
       HloInstruction::CreateConstant(std::move(operand_literal)));
 
-  auto start_indices = b.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({2, 1})));
+  auto two = b.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(2)));
+  auto one = b.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(1)));
 
   Shape shape = ShapeUtil::MakeShape(F32, {2, 3});
-  b.AddInstruction(HloInstruction::CreateDynamicSlice(shape, operand,
-                                                      start_indices, {2, 3}));
+  b.AddInstruction(
+      HloInstruction::CreateDynamicSlice(shape, operand, {two, one}, {2, 3}));
   m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
@@ -1743,7 +1869,7 @@ TEST_P(HloEvaluatorTest, DynamicSliceModSlice) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, DynamicSliceUpdate) {
+TEST_P(HloEvaluatorBf16Test, DynamicSliceUpdate) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1759,15 +1885,17 @@ TEST_P(HloEvaluatorTest, DynamicSliceUpdate) {
   HloInstruction* operand = b.AddInstruction(
       HloInstruction::CreateConstant(std::move(operand_literal)));
 
-  auto start_indices = b.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<int64>({0, 1})));
+  auto zero = b.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(0)));
+  auto one = b.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(1)));
 
   auto update = b.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR2<double>({{-2.0, -3.0}, {-6.0, -7.0}})));
 
   Shape shape = ShapeUtil::MakeShape(F64, {2, 3});
   b.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
-      shape, operand, update, start_indices));
+      shape, operand, update, {zero, one}));
   m_->AddEntryComputation(b.Build());
 
   Literal result = Evaluate();
@@ -1780,7 +1908,7 @@ TEST_P(HloEvaluatorTest, DynamicSliceUpdate) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, SetAndGetTuples) {
+TEST_P(HloEvaluatorBf16Test, SetAndGetTuples) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1816,7 +1944,7 @@ TEST_P(HloEvaluatorTest, SetAndGetTuples) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, SetAndGetNestedTuples) {
+TEST_P(HloEvaluatorBf16Test, SetAndGetNestedTuples) {
   HloComputation::Builder b(TestName());
 
   // arg:
@@ -1855,7 +1983,7 @@ TEST_P(HloEvaluatorTest, SetAndGetNestedTuples) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, Reverse) {
+TEST_P(HloEvaluatorBf16Test, Reverse) {
   HloComputation::Builder b(TestName());
 
   // Input shape is float[4x3x2x1].
@@ -1908,7 +2036,7 @@ TEST_P(HloEvaluatorTest, Reverse) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, result));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateWithSubstitutions) {
+TEST_P(HloEvaluatorBf16Test, EvaluateWithSubstitutions) {
   HloComputation::Builder b(TestName());
   Shape shape = ShapeUtil::MakeShape(F32, {4});
 
@@ -1932,7 +2060,7 @@ TEST_P(HloEvaluatorTest, EvaluateWithSubstitutions) {
 
 // Check that EvaluateWithSubstitutions works if one of the operands to the op
 // we're evaluating is a constant.
-TEST_P(HloEvaluatorTest, EvaluateWithSubstitutionsWithConstantOperand) {
+TEST_P(HloEvaluatorBf16Test, EvaluateWithSubstitutionsWithConstantOperand) {
   HloComputation::Builder b(TestName());
   Shape shape = ShapeUtil::MakeShape(F32, {4});
 
@@ -1955,7 +2083,7 @@ TEST_P(HloEvaluatorTest, EvaluateWithSubstitutionsWithConstantOperand) {
       LiteralUtil::CreateR1<float>({11, 22, 33, 44}), result.ValueOrDie()));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateGather_TensorFlowGatherV1) {
+TEST_F(HloEvaluatorTest, EvaluateGather_TensorFlowGatherV1) {
   const char* hlo_text = R"(
 HloModule TensorFlowGatherV1
 
@@ -1979,7 +2107,7 @@ ENTRY main {
       Evaluate({&operand, &start_indices})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateGather_TensorFlowGatherV2) {
+TEST_F(HloEvaluatorTest, EvaluateGather_TensorFlowGatherV2) {
   const char* hlo_text = R"(
 HloModule TensorFlowGatherV2
 
@@ -2003,7 +2131,7 @@ ENTRY main {
       Evaluate({&operand, &start_indices})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateGather_TensorFlowGatherMultipleBatchDims) {
+TEST_F(HloEvaluatorTest, EvaluateGather_TensorFlowGatherMultipleBatchDims) {
   const char* hlo_text = R"(
 HloModule TensorFlowGatherMultipleBatchDims
 
@@ -2028,7 +2156,7 @@ ENTRY main {
       Evaluate({&operand, &start_indices})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateGather_TensorFlowGatherNd) {
+TEST_F(HloEvaluatorTest, EvaluateGather_TensorFlowGatherNd) {
   const char* hlo_text = R"(
 HloModule TensorFlowGatherNd
 
@@ -2054,7 +2182,7 @@ ENTRY main {
                              Evaluate({&operand, &start_indices})));
 }
 
-TEST_P(HloEvaluatorTest,
+TEST_F(HloEvaluatorTest,
        EvaluateGather_TensorFlowGatherNdNonDefaultIndexVectorDim) {
   const char* hlo_text = R"(
 HloModule TensorFlowGatherNd
@@ -2081,7 +2209,7 @@ ENTRY main {
                              Evaluate({&operand, &start_indices})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateGather_DynamicSlice) {
+TEST_F(HloEvaluatorTest, EvaluateGather_DynamicSlice) {
   const char* hlo_text = R"(
 HloModule DynamicSlice
 
@@ -2104,7 +2232,7 @@ ENTRY main {
                                      Evaluate({&operand, &start_indices})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateGather_BatchDynamicSlice) {
+TEST_F(HloEvaluatorTest, EvaluateGather_BatchDynamicSlice) {
   const char* hlo_text = R"(
 HloModule BatchDynamicSlice
 
@@ -2128,7 +2256,7 @@ ENTRY main {
                              Evaluate({&operand, &start_indices})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateGather_ZeroDimBounds) {
+TEST_F(HloEvaluatorTest, EvaluateGather_ZeroDimBounds) {
   const char* hlo_text = R"(
 HloModule TensorFlowGatherV1
 
@@ -2150,7 +2278,7 @@ ENTRY main {
                                      Evaluate({&operand, &start_indices})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateGather_NoOutputWindowDims) {
+TEST_F(HloEvaluatorTest, EvaluateGather_NoOutputWindowDims) {
   const string hlo_text = R"(
 HloModule GatherXd
 
@@ -2175,7 +2303,7 @@ ENTRY main {
                              Evaluate({&operand, &start_indices})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatterV1_Update) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_TensorFlowScatterV1_Update) {
   const char* hlo_text = R"(
 HloModule TensorFlowScatterV1
 
@@ -2206,7 +2334,7 @@ ENTRY main {
       Evaluate({&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatterV2_Update) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_TensorFlowScatterV2_Update) {
   const char* hlo_text = R"(
 HloModule TensorFlowScatterV2
 
@@ -2238,7 +2366,7 @@ ENTRY main {
       Evaluate({&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_Add) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_Add) {
   const char* hlo_text = R"(
 HloModule TensorFlowScatter
 
@@ -2270,7 +2398,7 @@ ENTRY main {
       Evaluate({&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_Mul) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_Mul) {
   const char* hlo_text = R"(
 HloModule TensorFlowScatter
 
@@ -2302,7 +2430,7 @@ ENTRY main {
       Evaluate({&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_F32) {
+TEST_P(HloEvaluatorBf16Test, EvaluateScatter_TensorFlowScatter_F32) {
   const char* hlo_text = R"(
 HloModule TensorFlowScatter
 
@@ -2336,7 +2464,7 @@ ENTRY main {
       Evaluate({&operand, &scatter_indices, &updates}), ErrorSpec{0.1, 0.01}));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_RepeatedIndices) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_RepeatedIndices) {
   const char* hlo_text = R"(
 HloModule TensorFlowScatter
 
@@ -2368,7 +2496,7 @@ ENTRY main {
       Evaluate({&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_MultipleBatchDims) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_TensorFlowScatter_MultipleBatchDims) {
   const char* hlo_text = R"(
 HloModule TensorFlowScatterMultipleBatchDims
 
@@ -2401,7 +2529,7 @@ ENTRY main {
       Evaluate({&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_TensorFlowScatterNd) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_TensorFlowScatterNd) {
   const char* hlo_text = R"(
 HloModule TensorFlowScatterNd
 
@@ -2437,7 +2565,7 @@ ENTRY main {
       expected, Evaluate({&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest,
+TEST_F(HloEvaluatorTest,
        EvaluateScatter_TensorFlowScatterNd_NonDefaultIndexVectorDim) {
   const char* hlo_text = R"(
 HloModule TensorFlowScatterNdNonDefaultIndexVectorDim
@@ -2474,7 +2602,7 @@ ENTRY main {
       expected, Evaluate({&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_DynamicUpdateSlice) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_DynamicUpdateSlice) {
   const char* hlo_text = R"(
 HloModule DynamicUpdateSlice
 
@@ -2506,7 +2634,7 @@ ENTRY main {
       expected, Evaluate({&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_BatchDynamicUpdateSlice) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_BatchDynamicUpdateSlice) {
   const char* hlo_text = R"(
 HloModule BatchDynamicUpdateSlice
 
@@ -2538,7 +2666,7 @@ ENTRY main {
       expected, Evaluate({&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_ZeroDimBounds) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_ZeroDimBounds) {
   const char* hlo_text = R"(
 HloModule TensorFlowScatter_ZeroDimBounds
 
@@ -2567,7 +2695,7 @@ ENTRY main {
       operand, Evaluate({&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_NoUpdateWindowDims) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_NoUpdateWindowDims) {
   const string hlo_text = R"(
 HloModule Scatter_NoUpdateWindowDims
 
@@ -2600,7 +2728,7 @@ ENTRY main {
       expected, Evaluate({&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_NegativeIndices) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_NegativeIndices) {
   const char* hlo_text = R"(
 HloModule TensorFlowScatter_NegativeIndices
 
@@ -2635,7 +2763,7 @@ ENTRY main {
                          {&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_OobIndices) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_OobIndices) {
   const string hlo_text = R"(
 HloModule BatchDynamicUpdateSlice
 
@@ -2671,7 +2799,7 @@ ENTRY main {
                          {&operand, &scatter_indices, &updates})));
 }
 
-TEST_P(HloEvaluatorTest, EvaluateScatter_OobUpdateWindow) {
+TEST_F(HloEvaluatorTest, EvaluateScatter_OobUpdateWindow) {
   const char* hlo_text = R"(
 HloModule TensorFlowScatterNd_OobUpdateWindow
 
@@ -2710,7 +2838,7 @@ ENTRY main {
 
 // Verifies that HloEvaluator evaluates a HLO instruction that performs
 // element-wise comparison with 2 bfloat16 operands.
-TEST_P(HloEvaluatorTest, DoesCompareBF16) {
+TEST_F(HloEvaluatorTest, DoesCompareBF16) {
   // lhs >= rhs
   auto lhs = LiteralUtil::CreateR2<bfloat16>(
       {{bfloat16(0.25), bfloat16(0.35), bfloat16(0.125)},
@@ -2724,7 +2852,7 @@ TEST_P(HloEvaluatorTest, DoesCompareBF16) {
                std::move(rhs));
 }
 
-TEST_P(HloEvaluatorTest, Bf16Reduction) {
+TEST_P(HloEvaluatorBf16Test, Bf16Reduction) {
   const string hlo_text = R"(
 HloModule Bf16Reduction
 
@@ -2748,7 +2876,7 @@ ENTRY main {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected, Evaluate({&arg})));
 }
 
-TEST_P(HloEvaluatorTest, SliceWithDifferentLayout) {
+TEST_P(HloEvaluatorBf16Test, SliceWithDifferentLayout) {
   // Regression test for b/114735354.
   const string hlo_text = R"(
 HloModule SliceWithDifferentLayout
@@ -2767,7 +2895,7 @@ ENTRY main {
   EXPECT_TRUE(LiteralTestUtil::Equal(arg, actual));
 }
 
-TEST_P(HloEvaluatorTest, Bitcast) {
+TEST_P(HloEvaluatorBf16Test, Bitcast) {
   // Regression test for b/114735354.
   constexpr absl::string_view hlo_text_base = R"(
 HloModule Bitcast
@@ -2794,8 +2922,295 @@ ENTRY main {
   }
 }
 
-INSTANTIATE_TEST_CASE_P(HloEvaluatorTest_Instantiation, HloEvaluatorTest,
-                        ::testing::ValuesIn(use_bf16_params));
+// Check that s32 under/overflow doesn't trigger a ubsan failure.
+TEST_F(HloEvaluatorTest, Int32Overflow) {
+  constexpr absl::string_view hlo_text = R"(
+HloModule Test
+
+ENTRY main {
+  c1 = s32[] constant(1073741824)  // 2^30
+  sum = s32[] add(c1, c1)  // 2^31, i.e. INT_MIN
+
+  c2 = s32[] constant(-2147483648)  // -2^31
+  sub = s32[] subtract(c2, c1)  // -2^31 - 2^30, underflows
+
+  mul = s32[] multiply(c1, c1)
+  ROOT tuple = (s32[], s32[], s32[]) tuple(sum, sub, mul)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  std::vector<Literal> actual = Evaluate({}).DecomposeTuple();
+  ASSERT_EQ(actual.size(), 3);
+
+  uint32 pow30 = uint32{1} << 30;
+  uint32 pow31 = uint32{1} << 31;
+  EXPECT_EQ(actual[0].GetFirstElement<int32>(), static_cast<int32>(pow31));
+  EXPECT_EQ(actual[1].GetFirstElement<int32>(),
+            static_cast<int32>(-(pow31 + pow30)));
+  EXPECT_EQ(actual[2].GetFirstElement<int32>(),
+            static_cast<int32>(pow31 * pow31));
+}
+
+TEST_F(HloEvaluatorTest, GetDimensionSize) {
+  constexpr absl::string_view hlo_text = R"(
+HloModule Test
+
+ENTRY main {
+  size = u32[] parameter(0)
+
+  data = s32[4] parameter(1)
+
+  sum = s32[4] add(data, data)
+
+  ROOT dynamic_size = u32[] get-dimension-size(sum), dimensions={0}
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+
+  // Set up dynamic parameter binding.
+  TF_CHECK_OK(m_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{0, {}},
+      DynamicParameterBinding::DynamicDimension{1, {}, 0}));
+
+  TF_ASSERT_OK_AND_ASSIGN(DynamicDimensionInference dynamic_dimension_inference,
+                          DynamicDimensionInference::Run(m_.get()));
+
+  evaluator_.set_dynamic_dimension_inference(&dynamic_dimension_inference);
+  Literal size_arg = LiteralUtil::CreateR0<uint32>(3);
+  Literal data_arg = LiteralUtil::CreateR1<int32>({1, 2, 3, 4});
+
+  Literal actual = Evaluate({&size_arg, &data_arg});
+
+  EXPECT_EQ(actual.GetFirstElement<uint32>(), static_cast<uint32>(3));
+}
+
+// Check that we get a useful error if we pass inputs of the wrong shape.
+TEST_F(HloEvaluatorTest, EvaluateWithWrongInputShapes) {
+  constexpr absl::string_view hlo_text = R"(
+HloModule Test
+
+ENTRY main {
+  p0 = s32[1] parameter(0)
+  ROOT sum = s32[1] add(p0, p0)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  Literal input_wrong_shape = LiteralUtil::CreateR1<int32>({0, 1});
+
+  EXPECT_EQ(HloEvaluator()
+                .Evaluate(*m_, {&input_wrong_shape})
+                .status()
+                .error_message(),
+            "Shape mismatch at parameter 0. Computation expected s32[1]{0}, "
+            "but arg was s32[2].");
+  EXPECT_EQ(HloEvaluator()
+                .Evaluate(*m_->entry_computation(), {&input_wrong_shape})
+                .status()
+                .error_message(),
+            "Shape mismatch at parameter 0. Computation expected s32[1]{0}, "
+            "but arg was s32[2].");
+}
+
+// Check that we get a useful error if we pass too many or too few inputs.
+TEST_F(HloEvaluatorTest, EvaluateWithWrongNumberOfInputs) {
+  constexpr absl::string_view hlo_text = R"(
+HloModule Test
+
+ENTRY main {
+  p0 = s32[1] parameter(0)
+  ROOT sum = s32[1] add(p0, p0)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  Literal input = LiteralUtil::CreateR1<int32>({0});
+
+  EXPECT_EQ(
+      HloEvaluator().Evaluate(*m_, {&input, &input}).status().error_message(),
+      "Expected 1 argument, but got 2.");
+  EXPECT_EQ(HloEvaluator()
+                .Evaluate(*m_->entry_computation(), {&input, &input})
+                .status()
+                .error_message(),
+            "Expected 1 argument, but got 2.");
+}
+
+TEST_F(HloEvaluatorTest, PreserveFusionInputLayout) {
+  constexpr absl::string_view hlo_text = R"(
+    HloModule FusionInputLayout
+
+    fused_computation {
+      param_0 = f32[20,20]{0,1} parameter(0)
+      ROOT bitcast = f32[20,20]{1,0} bitcast(param_0)
+    }
+
+    ENTRY kernel_entry {
+      parameter.0 = f32[20,20]{0,1} parameter(0)
+      ROOT fusion = f32[20,20]{1,0} fusion(parameter.0),
+        kind=kLoop, calls=fused_computation
+    })";
+
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  auto args = MakeFakeArguments(m_.get()).ConsumeValueOrDie();
+  Literal actual = Evaluate({&args[0]});
+  EXPECT_TRUE(absl::c_equal(args[0].data<float>(), actual.data<float>()));
+}
+
+TEST_F(HloEvaluatorTest, PreserveFusionOutputLayout) {
+  constexpr absl::string_view hlo_text = R"(
+    HloModule FusionOutputLayout
+
+    fused_computation {
+      param_0 = f32[20,20]{1,0} parameter(0)
+      ROOT bitcast = f32[20,20]{0,1} bitcast(param_0)
+    }
+
+    ENTRY kernel_entry {
+      parameter.0 = f32[20,20]{1,0} parameter(0)
+      ROOT fusion = f32[20,20]{0,1} fusion(parameter.0),
+        kind=kLoop, calls=fused_computation
+    })";
+
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  auto args = MakeFakeArguments(m_.get()).ConsumeValueOrDie();
+  Literal actual = Evaluate({&args[0]});
+  EXPECT_TRUE(absl::c_equal(args[0].data<float>(), actual.data<float>()));
+}
+
+TEST_F(HloEvaluatorTest, PreserveMOFusionOutputLayout) {
+  constexpr absl::string_view hlo_text = R"(
+    HloModule MOFusionOutputLayout
+
+    fused_computation {
+      param_0 = f32[20,20]{1,0} parameter(0)
+      bitcast = f32[20,20]{0,1} bitcast(param_0)
+      ROOT tuple = (f32[20,20]{0,1}) tuple(bitcast)
+    }
+
+    ENTRY kernel_entry {
+      parameter.0 = f32[20,20]{1,0} parameter(0)
+      ROOT fusion = (f32[20,20]{0,1}) fusion(parameter.0),
+        kind=kLoop, calls=fused_computation
+    })";
+
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  auto args = MakeFakeArguments(m_.get()).ConsumeValueOrDie();
+  Literal actual_tuple = Evaluate({&args[0]});
+  std::vector<Literal> actual_literals = actual_tuple.DecomposeTuple();
+  EXPECT_TRUE(
+      absl::c_equal(args[0].data<float>(), actual_literals[0].data<float>()));
+}
+
+// Tests that custom_calls fail to evaluate when no handler is specified.
+TEST_F(HloEvaluatorTest, EvaluateCustomCall_NoHandler) {
+  constexpr absl::string_view hlo_text = R"(
+    HloModule EvaluateCustomCall_NoHandler
+    ENTRY kernel_entry {
+      parameter.0 = u32[2,2]{1,0} parameter(0)
+      ROOT test_root = (u32[2,2]{1,0}) custom-call(parameter.0),
+          custom_call_target="_my_custom_call"
+    }
+  )";
+
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  auto args = MakeFakeArguments(m_.get()).ConsumeValueOrDie();
+  EXPECT_EQ(HloEvaluator().Evaluate(*m_, {&args[0]}).status().code(),
+            ::tensorflow::error::UNIMPLEMENTED);
+}
+
+// Tests when a custom_call handler returns an error.
+TEST_F(HloEvaluatorTest, EvaluateCustomCall_HandlerError) {
+  constexpr absl::string_view hlo_text = R"(
+    HloModule EvaluateCustomCall_HandlerError
+    ENTRY kernel_entry {
+      parameter.0 = u32[2,2]{1,0} parameter(0)
+      ROOT test_root = (u32[2,2]{1,0}) custom-call(parameter.0),
+          custom_call_target="_my_custom_call"
+    }
+  )";
+
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  auto args = MakeFakeArguments(m_.get()).ConsumeValueOrDie();
+  HloEvaluator evaluator;
+  evaluator.set_custom_call_handler(
+      [](HloInstruction* custom_call, absl::Span<const Literal*> operands) {
+        return InternalError("Test error");
+      });
+  EXPECT_EQ(evaluator.Evaluate(*m_, {&args[0]}).status().code(),
+            ::tensorflow::error::INTERNAL);
+}
+
+// Tests the custom_call handler on calls with many inputs.
+// We sum the operands so that we can verify the operand and output literals
+// are properly mapped for access.
+TEST_F(HloEvaluatorTest, EvaluateCustomCall_ManyInputs) {
+  constexpr absl::string_view hlo_text = R"(
+    HloModule EvaluateCustomCall_ManyInputs
+    ENTRY kernel_entry {
+      parameter.0 = u32[1]{0} parameter(0)
+      parameter.1 = u32[1]{0} parameter(1)
+      ROOT test_root = u32[1]{0} custom-call(parameter.0, parameter.1),
+          custom_call_target="_my_custom_call"
+    }
+  )";
+
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  auto args = MakeFakeArguments(m_.get()).ConsumeValueOrDie();
+  HloEvaluator evaluator;
+  evaluator.set_custom_call_handler(
+      [](HloInstruction* custom_call, absl::Span<const Literal*> operands) {
+        EXPECT_EQ(HloOpcode::kCustomCall, custom_call->opcode());
+        EXPECT_EQ("_my_custom_call", custom_call->custom_call_target());
+        EXPECT_EQ(2, custom_call->operand_count());
+        EXPECT_EQ(2, operands.size());
+        auto output = Literal::CreateFromShape(custom_call->shape());
+        auto operand0_data = operands[0]->data<uint32>();
+        auto operand1_data = operands[1]->data<uint32>();
+        auto output_data = output.data<uint32>();
+        output_data[0] = operand0_data[0] + operand1_data[0];
+        return output;
+      });
+  TF_ASSERT_OK_AND_ASSIGN(
+      Literal actual_literal,
+      evaluator.Evaluate(*m_->entry_computation(), {&args[0], &args[1]}));
+  auto arg0_data = args[0].data<uint32>();
+  auto arg1_data = args[1].data<uint32>();
+  std::vector<uint32> expected_data = {arg0_data[0] + arg1_data[0]};
+  EXPECT_TRUE(absl::c_equal(expected_data, actual_literal.data<uint32>()));
+}
+
+TEST_F(HloEvaluatorTest, IsFiniteF16) {
+  constexpr absl::string_view hlo_text = R"(
+  HloModule test
+
+  ENTRY IsFiniteTest {
+    c = f16[6] constant({nan, 7, nan, -1, inf, -inf})
+    ROOT is-finite = pred[6] is-finite(c)
+  })";
+
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(
+      Literal actual_literal,
+      HloEvaluator().Evaluate(*m_->entry_computation(), {}));
+  EXPECT_THAT(actual_literal.data<bool>(),
+              ::testing::ElementsAre(false, true, false, true, false, false));
+}
+
+TEST_F(HloEvaluatorTest, IsFiniteBf16) {
+  constexpr absl::string_view hlo_text = R"(
+  HloModule test
+
+  ENTRY IsFiniteTest {
+    c = bf16[6] constant({nan, 7, nan, -1, inf, -inf})
+    ROOT is-finite = pred[6] is-finite(c)
+  })";
+
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(
+      Literal actual_literal,
+      HloEvaluator().Evaluate(*m_->entry_computation(), {}));
+  EXPECT_THAT(actual_literal.data<bool>(),
+              ::testing::ElementsAre(false, true, false, true, false, false));
+}
 
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
index b87fc3e34012e75ee07bff6c1e113dce404f83cb..2d8a578985e8f603d4056bee8619725095ebc7bb 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
@@ -17,12 +17,15 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EVALUATOR_TYPED_VISITOR_H_
 
 #include <cmath>
+#include <type_traits>
 
 #include "absl/algorithm/container.h"
 #include "absl/base/casts.h"
 #include "absl/container/inlined_vector.h"
 #include "absl/memory/memory.h"
+#include "absl/meta/type_traits.h"
 #include "absl/types/optional.h"
+#include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_evaluator.h"
@@ -38,48 +41,27 @@ namespace xla {
 // Anyway this is relatively safe as-is because hlo_evaluator_typed_visitor.h is
 // a "private" header that's not exposed outside of hlo_evaluator.cc.
 template <typename T>
-using is_complex_t = std::is_same<T, complex64>;
-template <typename T>
-using is_complex64_t = std::is_same<T, complex64>;
-
-// It's UB to use std::sort with std::less<float>, because of NaNs. Define
-// "safe" less functions which are actually strict weak orders. -NaN and NaN
-// should appear at the beginning and end of the ordering, and -0.0 should
-// appear before 0.0.
-template <
-    typename NativeT,
-    typename std::enable_if<std::is_integral<NativeT>::value>::type* = nullptr>
-bool SafeLess(const NativeT& a, const NativeT& b) {
-  return a < b;
-}
+using is_complex_t =
+    absl::disjunction<std::is_same<T, complex64>, std::is_same<T, complex128>>;
 
-template <typename NativeT, typename std::enable_if<std::is_floating_point<
-                                NativeT>::value>::type* = nullptr>
-bool SafeLess(const NativeT& a, const NativeT& b) {
-  bool lhs_is_negative = std::signbit(a);
-  bool rhs_is_negative = std::signbit(b);
-  // If the signs are different, we can just compare the signs.
-  if (lhs_is_negative != rhs_is_negative) {
-    return lhs_is_negative && !rhs_is_negative;
-  }
-  bool lhs_nan = std::isnan(a);
-  bool rhs_nan = std::isnan(b);
-  // Exactly one number is nan?
-  if (lhs_nan != rhs_nan) {
-    if (lhs_nan) {
-      return lhs_is_negative;
-    }
-    return !rhs_is_negative;
-  }
-  return a < b;
+// ToArithmeticSafeType(T t):
+//  - converts `t` to the bitwise-equivalent `unsigned T` if T is a signed
+//    integer, and
+//  - otherwise returns `t` unchanged.
+//
+// It's UB in C++ to under/overflow a signed integer, so we wrap all arithmetic
+// in this type to force 2's complement behavior.
+template <typename T,
+          typename std::enable_if<std::is_integral<T>::value &&
+                                  std::is_signed<T>::value>::type* = nullptr>
+typename std::make_unsigned<T>::type ToArithmeticSafeType(T t) {
+  return static_cast<typename std::make_unsigned<T>::type>(t);
 }
-
-template <typename NativeT,
-          typename std::enable_if<
-              std::is_same<NativeT, bfloat16>::value ||
-              std::is_same<NativeT, Eigen::half>::value>::type* = nullptr>
-bool SafeLess(const NativeT& a, const NativeT& b) {
-  return SafeLess(static_cast<float>(a), static_cast<float>(b));
+template <typename T,
+          typename std::enable_if<!std::is_integral<T>::value ||
+                                  !std::is_signed<T>::value>::type* = nullptr>
+T ToArithmeticSafeType(T t) {
+  return std::move(t);
 }
 
 // Templated DfsHloVisitor for use by HloEvaluator.
@@ -105,6 +87,12 @@ bool SafeLess(const NativeT& a, const NativeT& b) {
 template <typename ReturnT, typename ElementwiseT = ReturnT>
 class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
  private:
+  Status UnsupportedTypeError(HloInstruction* instruction) {
+    return InvalidArgument(
+        "Unsupported type for %s: %s", HloOpcodeString(instruction->opcode()),
+        PrimitiveType_Name(instruction->shape().element_type()));
+  }
+
   // Get the value in the given literal static_cast as a double.
   template <
       typename NativeT,
@@ -185,7 +173,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
 
   template <
       typename NativeT,
-      typename std::enable_if<is_complex64_t<NativeT>::value>::type* = nullptr>
+      typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleAbs(HloInstruction* abs) {
     const Literal& operand_literal =
         parent_->GetEvaluatedLiteralFor(abs->operand(0));
@@ -204,6 +192,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     // specifying the ElementwiseT explicitly as C64 is needed below.
     if (abs->operand(0)->shape().element_type() == C64) {
       return HandleAbs<complex64>(abs);
+    } else if (abs->operand(0)->shape().element_type() == C128) {
+      return HandleAbs<complex128>(abs);
     }
     return HandleAbs<ElementwiseT>(abs);
   }
@@ -224,7 +214,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
       typename NativeT,
       typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleRound(HloInstruction* round) {
-    return InvalidArgument("Unsupported type for Round");
+    return UnsupportedTypeError(round);
   }
 
   Status HandleRound(HloInstruction* round) override {
@@ -246,7 +236,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
       typename NativeT,
       typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleCeil(HloInstruction* ceil) {
-    return InvalidArgument("Unsupported type for Ceil");
+    return UnsupportedTypeError(ceil);
   }
 
   Status HandleCeil(HloInstruction* ceil) override {
@@ -297,8 +287,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   template <
       typename NativeT,
       typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
-  Status HandleExpm1(HloInstruction* floor) {
-    return InvalidArgument("Unsupported type for Expm1");
+  Status HandleExpm1(HloInstruction* expm1) {
+    return UnsupportedTypeError(expm1);
   }
 
   Status HandleExpm1(HloInstruction* floor) override {
@@ -321,7 +311,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
       typename NativeT,
       typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleFloor(HloInstruction* floor) {
-    return InvalidArgument("Unsupported type for Floor");
+    return UnsupportedTypeError(floor);
   }
 
   Status HandleFloor(HloInstruction* floor) override {
@@ -339,10 +329,10 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   template <
       typename NativeT,
       typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
-  Status HandleLog1p(HloInstruction* expm1) {
+  Status HandleLog1p(HloInstruction* log1p) {
     TF_ASSIGN_OR_RETURN(
-        parent_->evaluated_[expm1],
-        ElementWiseUnaryOp(expm1, [](ElementwiseT elem_operand) {
+        parent_->evaluated_[log1p],
+        ElementWiseUnaryOp(log1p, [](ElementwiseT elem_operand) {
           return std::log1p(elem_operand);
         }));
     return Status::OK();
@@ -351,12 +341,12 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   template <
       typename NativeT,
       typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
-  Status HandleLog1p(HloInstruction* floor) {
-    return InvalidArgument("Unsupported type for Log1p");
+  Status HandleLog1p(HloInstruction* log1p) {
+    return UnsupportedTypeError(log1p);
   }
 
-  Status HandleLog1p(HloInstruction* floor) override {
-    return HandleLog1p<ReturnT>(floor);
+  Status HandleLog1p(HloInstruction* log1p) override {
+    return HandleLog1p<ReturnT>(log1p);
   }
 
   template <typename NativeT,
@@ -396,7 +386,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
       typename NativeT,
       typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleNot(HloInstruction* not_) {
-    return InvalidArgument("Unsupported type for Not");
+    return UnsupportedTypeError(not_);
   }
 
   Status HandleNot(HloInstruction* not_) override {
@@ -433,9 +423,9 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     return HandleNegate<ReturnT>(negate);
   }
 
-  template <
-      typename NativeT,
-      typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
+  template <typename NativeT,
+            typename std::enable_if<std::is_integral<NativeT>::value>::type* =
+                nullptr>
   Status HandleSign(HloInstruction* sign) {
     TF_ASSIGN_OR_RETURN(parent_->evaluated_[sign],
                         ElementWiseUnaryOp(sign, [](ElementwiseT elem_operand) {
@@ -445,6 +435,23 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
+  template <typename NativeT,
+            typename std::enable_if<
+                std::is_same<NativeT, bfloat16>::value ||
+                std::is_same<NativeT, Eigen::half>::value ||
+                std::is_floating_point<NativeT>::value>::type* = nullptr>
+  Status HandleSign(HloInstruction* sign) {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[sign],
+                        ElementWiseUnaryOp(sign, [](ElementwiseT elem_operand) {
+                          return std::isnan(elem_operand)
+                                     ? elem_operand
+                                     : std::copysign(
+                                           elem_operand != ElementwiseT(0),
+                                           elem_operand);
+                        }));
+    return Status::OK();
+  }
+
   template <
       typename NativeT,
       typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
@@ -476,7 +483,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   template <typename NativeT, typename std::enable_if<!std::is_floating_point<
                                   NativeT>::value>::type* = nullptr>
   Status HandleAtan2(HloInstruction* atan2) {
-    return InvalidArgument("Unsupported type for Atan2");
+    return UnsupportedTypeError(atan2);
   }
 
   Status HandleAtan2(HloInstruction* atan2) override {
@@ -491,47 +498,25 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
-  template <typename NativeT,
-            typename std::enable_if<
-                std::is_signed<NativeT>::value &&
-                !std::is_floating_point<NativeT>::value>::type* = nullptr>
-  Status HandleMultiply(HloInstruction* multiply) {
-    using type = typename std::make_unsigned<NativeT>::type;
-    TF_ASSIGN_OR_RETURN(
-        parent_->evaluated_[multiply],
-        ElementWiseBinaryOp(multiply,
-                            [](ElementwiseT lhs_elem, ElementwiseT rhs_elem) {
-                              return NativeT(type(lhs_elem) * type(rhs_elem));
-                            }));
-    return Status::OK();
-  }
-
-  template <
-      typename NativeT,
-      typename std::enable_if<std::is_unsigned<NativeT>::value ||
-                              std::is_floating_point<NativeT>::value ||
-                              is_complex_t<NativeT>::value>::type* = nullptr>
-  Status HandleMultiply(HloInstruction* multiply) {
+  Status HandleMultiply(HloInstruction* multiply) override {
     TF_ASSIGN_OR_RETURN(
         parent_->evaluated_[multiply],
-        ElementWiseBinaryOp(multiply,
-                            [](ElementwiseT lhs_elem, ElementwiseT rhs_elem) {
-                              return lhs_elem * rhs_elem;
-                            }));
+        ElementWiseBinaryOp(
+            multiply, [](ElementwiseT lhs_elem, ElementwiseT rhs_elem) {
+              return ElementwiseT(ToArithmeticSafeType(lhs_elem) *
+                                  ToArithmeticSafeType(rhs_elem));
+            }));
     return Status::OK();
   }
 
-  Status HandleMultiply(HloInstruction* multiply) override {
-    return HandleMultiply<ElementwiseT>(multiply);
-  }
-
   Status HandleSubtract(HloInstruction* subtract) override {
     TF_ASSIGN_OR_RETURN(
         parent_->evaluated_[subtract],
-        ElementWiseBinaryOp(subtract,
-                            [](ElementwiseT lhs_elem, ElementwiseT rhs_elem) {
-                              return lhs_elem - rhs_elem;
-                            }));
+        ElementWiseBinaryOp(
+            subtract, [](ElementwiseT lhs_elem, ElementwiseT rhs_elem) {
+              return ElementwiseT(ToArithmeticSafeType(lhs_elem) -
+                                  ToArithmeticSafeType(rhs_elem));
+            }));
     return Status::OK();
   }
 
@@ -539,7 +524,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     TF_ASSIGN_OR_RETURN(parent_->evaluated_[add],
                         ElementWiseBinaryOp(add, [](ElementwiseT lhs_elem,
                                                     ElementwiseT rhs_elem) {
-                          return lhs_elem + rhs_elem;
+                          return ElementwiseT(ToArithmeticSafeType(lhs_elem) +
+                                              ToArithmeticSafeType(rhs_elem));
                         }));
     return Status::OK();
   }
@@ -624,7 +610,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
       typename NativeT,
       typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleMaximum(HloInstruction* maximum) {
-    return InvalidArgument("Unsupported type for Maximum");
+    return UnsupportedTypeError(maximum);
   }
 
   Status HandleMaximum(HloInstruction* maximum) override {
@@ -659,7 +645,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
       typename NativeT,
       typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleMinimum(HloInstruction* minimum) {
-    return InvalidArgument("Unsupported type for Minimum");
+    return UnsupportedTypeError(minimum);
   }
 
   Status HandleMinimum(HloInstruction* minimum) override {
@@ -667,14 +653,34 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   }
 
   Status HandlePower(HloInstruction* power) override {
-    TF_ASSIGN_OR_RETURN(parent_->evaluated_[power],
-                        ElementWiseBinaryOp(power, [](ElementwiseT lhs_el,
-                                                      ElementwiseT rhs_el) {
-                          return std::pow(lhs_el, rhs_el);
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[power],
+        ElementWiseBinaryOp(
+            power, [](ElementwiseT lhs_el, ElementwiseT rhs_el) {
+              return lhs_el == ElementwiseT(0) && rhs_el == ElementwiseT(0)
+                         ? static_cast<ElementwiseT>(1)
+                         : std::pow(lhs_el, rhs_el);
+            }));
+    return Status::OK();
+  }
+
+  Status HandleSqrt(HloInstruction* sqrt) override {
+    TF_ASSIGN_OR_RETURN(parent_->evaluated_[sqrt],
+                        ElementWiseUnaryOp(sqrt, [](ElementwiseT elem_operand) {
+                          return std::sqrt(elem_operand);
                         }));
     return Status::OK();
   }
 
+  Status HandleRsqrt(HloInstruction* rsqrt) override {
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[rsqrt],
+        ElementWiseUnaryOp(rsqrt, [](ElementwiseT elem_operand) {
+          return static_cast<ElementwiseT>(1) / std::sqrt(elem_operand);
+        }));
+    return Status::OK();
+  }
+
   template <typename NativeT, typename std::enable_if<std::is_floating_point<
                                   NativeT>::value>::type* = nullptr>
   Status HandleRemainder(HloInstruction* remainder) {
@@ -724,7 +730,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
       typename NativeT,
       typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleRemainder(HloInstruction* remainder) {
-    return InvalidArgument("Unsupported type for Remainder");
+    return UnsupportedTypeError(remainder);
   }
 
   Status HandleRemainder(HloInstruction* remainder) override {
@@ -746,14 +752,14 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   template <typename NativeT, typename std::enable_if<std::is_floating_point<
                                   NativeT>::value>::type* = nullptr>
   Status HandleAnd(HloInstruction* and_) {
-    return InvalidArgument("Unsupported type for And");
+    return UnsupportedTypeError(and_);
   }
 
   template <
       typename NativeT,
       typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleAnd(HloInstruction* and_) {
-    return InvalidArgument("Unsupported type for And");
+    return UnsupportedTypeError(and_);
   }
 
   Status HandleAnd(HloInstruction* and_) override {
@@ -775,7 +781,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   template <typename NativeT, typename std::enable_if<std::is_floating_point<
                                   NativeT>::value>::type* = nullptr>
   Status HandleOr(HloInstruction* or_) {
-    return InvalidArgument("Unsupported type for Or");
+    return UnsupportedTypeError(or_);
   }
 
   template <
@@ -804,14 +810,14 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   template <typename NativeT, typename std::enable_if<std::is_floating_point<
                                   NativeT>::value>::type* = nullptr>
   Status HandleXor(HloInstruction* xor_) {
-    return InvalidArgument("Unsupported type for Xor");
+    return UnsupportedTypeError(xor_);
   }
 
   template <
       typename NativeT,
       typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleXor(HloInstruction* xor_) {
-    return InvalidArgument("Unsupported type for Xor");
+    return UnsupportedTypeError(xor_);
   }
 
   Status HandleXor(HloInstruction* xor_) override {
@@ -836,8 +842,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
             typename std::enable_if<!std::is_integral<NativeT>::value ||
                                     std::is_same<NativeT, bool>::value>::type* =
                 nullptr>
-  Status HandleShiftLeft(HloInstruction*) {
-    return InvalidArgument("Unsupported type for ShiftLeft");
+  Status HandleShiftLeft(HloInstruction* shift) {
+    return UnsupportedTypeError(shift);
   }
 
   Status HandleShiftLeft(HloInstruction* shl) override {
@@ -866,8 +872,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
             typename std::enable_if<!std::is_integral<NativeT>::value ||
                                     std::is_same<NativeT, bool>::value>::type* =
                 nullptr>
-  Status HandleShiftRightArithmetic(HloInstruction*) {
-    return InvalidArgument("Unsupported type for ShiftRightArithmetic");
+  Status HandleShiftRightArithmetic(HloInstruction* shift) {
+    return UnsupportedTypeError(shift);
   }
 
   Status HandleShiftRightArithmetic(HloInstruction* shra) override {
@@ -897,21 +903,45 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
             typename std::enable_if<!std::is_integral<NativeT>::value ||
                                     std::is_same<NativeT, bool>::value>::type* =
                 nullptr>
-  Status HandleShiftRightLogical(HloInstruction*) {
-    return InvalidArgument("Unsupported type for ShiftRightLogical");
+  Status HandleShiftRightLogical(HloInstruction* shift) {
+    return UnsupportedTypeError(shift);
   }
 
   Status HandleShiftRightLogical(HloInstruction* shrl) override {
     return HandleShiftRightLogical<ElementwiseT>(shrl);
   }
 
-  template <
-      typename NativeT,
-      typename std::enable_if<!is_complex_t<NativeT>::value>::type* = nullptr>
+  // Special case for integral type due to MSVC's std::isnan being unable to
+  // handle integral type.
+  template <typename NativeT,
+            typename std::enable_if<!is_complex_t<NativeT>::value &&
+                                    std::is_integral<NativeT>::value>::type* =
+                nullptr>
+  Status HandleClamp(HloInstruction* clamp) {
+    std::function<ElementwiseT(ElementwiseT, ElementwiseT, ElementwiseT)>
+        clamp_op = [](ElementwiseT low, ElementwiseT value, ElementwiseT high) {
+          return static_cast<ElementwiseT>(
+              std::min(high, std::max(value, low)));
+        };
+    TF_ASSIGN_OR_RETURN(
+        parent_->evaluated_[clamp],
+        ElementwiseTernaryOp(clamp,
+                             std::move(ConvertTernaryFunction(clamp_op))));
+    return Status::OK();
+  }
+
+  template <typename NativeT,
+            typename std::enable_if<!is_complex_t<NativeT>::value &&
+                                    !std::is_integral<NativeT>::value>::type* =
+                nullptr>
   Status HandleClamp(HloInstruction* clamp) {
     std::function<ElementwiseT(ElementwiseT, ElementwiseT, ElementwiseT)>
         clamp_op = [](ElementwiseT low, ElementwiseT value, ElementwiseT high) {
-          return std::fmin(high, std::fmax(value, low));
+          if (std::isnan(low) || std::isnan(high)) {
+            return static_cast<ElementwiseT>(NAN);
+          }
+          return static_cast<ElementwiseT>(
+              std::min<NativeT>(high, std::max<NativeT>(value, low)));
         };
     TF_ASSIGN_OR_RETURN(
         parent_->evaluated_[clamp],
@@ -923,8 +953,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   template <
       typename NativeT,
       typename std::enable_if<is_complex_t<NativeT>::value>::type* = nullptr>
-  Status HandleClamp(HloInstruction*) {
-    return InvalidArgument("Unsupported type for Clamp");
+  Status HandleClamp(HloInstruction* clamp) {
+    return UnsupportedTypeError(clamp);
   }
 
   Status HandleClamp(HloInstruction* clamp) override {
@@ -933,7 +963,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
 
   Status HandleSelect(HloInstruction* select) override {
     CHECK(!ShapeUtil::IsScalar(select->operand(0)->shape()));
-    CHECK(ShapeUtil::IsArray(select->shape()));
+    CHECK(select->shape().IsArray());
     std::function<ReturnT(bool, ReturnT, ReturnT)> select_op =
         [](bool pred, ReturnT on_true, ReturnT on_false) {
           if (pred) {
@@ -986,8 +1016,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
 
     TF_CHECK_OK(ShapeUtil::ValidateShape(lhs_shape));
     TF_CHECK_OK(ShapeUtil::ValidateShape(rhs_shape));
-    CHECK(ShapeUtil::IsArray(lhs_shape));
-    CHECK(ShapeUtil::IsArray(rhs_shape));
+    CHECK(lhs_shape.IsArray());
+    CHECK(rhs_shape.IsArray());
     CHECK(ShapeUtil::SameElementType(lhs_shape, rhs_shape));
     CHECK(ShapeUtil::SameElementType(lhs_shape, result_shape));
 
@@ -998,16 +1028,16 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     CHECK_GE(num_spatial_dims, 0);
     CHECK_EQ(window.dimensions_size(), num_spatial_dims);
 
-    const auto lhs_rank = ShapeUtil::Rank(lhs_shape);
-    const auto rhs_rank = ShapeUtil::Rank(rhs_shape);
+    const auto lhs_rank = lhs_shape.rank();
+    const auto rhs_rank = rhs_shape.rank();
 
     CHECK_EQ(num_spatial_dims + 2, lhs_rank);
     CHECK_EQ(num_spatial_dims + 2, rhs_rank);
 
-    TF_ASSIGN_OR_RETURN(
-        auto inferred_return_shape,
-        ShapeInference::InferConvolveShape(
-            lhs_shape, rhs_shape, conv->feature_group_count(), window, dnums));
+    TF_ASSIGN_OR_RETURN(auto inferred_return_shape,
+                        ShapeInference::InferConvolveShape(
+                            lhs_shape, rhs_shape, conv->feature_group_count(),
+                            conv->batch_group_count(), window, dnums));
     CHECK(ShapeUtil::Compatible(result_shape, inferred_return_shape))
         << "return shape set to: " << ShapeUtil::HumanString(result_shape)
         << " but is inferred to be: "
@@ -1030,12 +1060,13 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     auto lhs_literal_data = lhs_literal.data<ReturnT>();
     auto rhs_literal_data = rhs_literal.data<ReturnT>();
 
-    int64 feature_group_count = conv->feature_group_count();
+    const int64 feature_group_count = conv->feature_group_count();
+    const int64 batch_group_count = conv->batch_group_count();
 
     auto func = [&window_shape, &dnums, &lhs_shape, &rhs_shape, &window,
                  &lhs_dim_multipliers, &rhs_dim_multipliers, lhs_literal_data,
-                 rhs_literal_data,
-                 feature_group_count](const absl::Span<const int64> out_index) {
+                 rhs_literal_data, feature_group_count,
+                 batch_group_count](const absl::Span<const int64> out_index) {
       // Dimension number applicable for input (lhs).
       const int64 input_batch_dim = dnums.input_batch_dimension();
       const int64 input_z_dim = dnums.input_feature_dimension();
@@ -1048,6 +1079,12 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
 
       const int64 input_z_size =
           ShapeUtil::GetDimension(lhs_shape, input_z_dim);
+
+      const int64 input_batch_size =
+          ShapeUtil::GetDimension(lhs_shape, input_batch_dim);
+
+      const int64 batch_group_size = input_batch_size / batch_group_count;
+
       // The size of an input feature group.
       const int64 input_feature_group_size = input_z_size / feature_group_count;
 
@@ -1063,11 +1100,15 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
       const int64 feature_group_index =
           out_index[output_z_dim] / output_feature_group_size;
 
+      const int64 batch_group_index = out_index[output_z_dim];
+
       ElementwiseT result_val = static_cast<ElementwiseT>(0);
       DimensionVector rhs_spatial_index(dnums.kernel_spatial_dimensions_size(),
                                         0);
 
       // Convolve input feature with kernel.
+      // The mechanism indexes into the correct LHS (input) and RHS (kernel)
+      // locations and accumulates multiplications for a given output index.
       do {
         // Find corresponding spatial dimension index for input (lhs).
         int64 lhs_linear_spatial_index = 0;
@@ -1120,11 +1161,24 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
               feature_group_index * input_feature_group_size + rhs_iz;
 
           int64 lhs_linear_index = lhs_linear_spatial_index;
+
           lhs_linear_index += out_index[output_batch_dim] *
                               lhs_dim_multipliers[input_batch_dim];
+
+          // We are scraping only the diagonal elements in the resultant
+          // convolution output when batch_group_count is greater than 1,
+          // where 1 is the default. No scraping is done in that case.
+          // This approach works out automatically for 'groups' in batches
+          // with group_size > 1, because we already descend down the batch
+          // dimension for the 'output_batch_dim' above.
+          lhs_linear_index +=
+              ((batch_group_index * batch_group_size) % input_batch_size) *
+              lhs_dim_multipliers[input_batch_dim];
+
           lhs_linear_index += iz * lhs_dim_multipliers[input_z_dim];
 
           int64 rhs_linear_index = rhs_linear_spatial_index;
+
           rhs_linear_index += out_index[output_z_dim] *
                               rhs_dim_multipliers[kernel_output_z_dim];
           rhs_linear_index += rhs_iz * rhs_dim_multipliers[kernel_input_z_dim];
@@ -1148,23 +1202,31 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   }
 
   Status HandleDot(HloInstruction* dot) override {
-    auto lhs = dot->operand(0);
-    auto rhs = dot->operand(1);
-    CHECK(ShapeUtil::IsArray(dot->shape()));
-    CHECK(ShapeUtil::IsArray(lhs->shape()));
-    CHECK(ShapeUtil::IsArray(rhs->shape()));
+    if (dot->dot_dimension_numbers().rhs_contracting_dimensions_size() == 1 &&
+        parent_->use_fast_path_) {
+      return HandleDot<ReturnT>(dot);
+    }
+    return HandleDotSlowPath(dot);
+  }
+
+  template <typename NativeT, typename std::enable_if<std::is_same<
+                                  NativeT, float>::value>::type* = nullptr>
+  Status HandleDot(HloInstruction* dot) {
+    const HloInstruction* lhs = dot->operand(0);
+    const HloInstruction* rhs = dot->operand(1);
+    CHECK(dot->shape().IsArray());
+    CHECK(lhs->shape().IsArray());
+    CHECK(rhs->shape().IsArray());
 
     const auto& dnums = dot->dot_dimension_numbers();
 
-    const auto lhs_rank = ShapeUtil::Rank(lhs->shape());
-    const auto rhs_rank = ShapeUtil::Rank(rhs->shape());
+    const int64 lhs_rank = lhs->shape().rank();
+    const int64 rhs_rank = rhs->shape().rank();
 
     CHECK(ShapeUtil::SameElementType(lhs->shape(), rhs->shape()));
     CHECK(ShapeUtil::SameElementType(lhs->shape(), dot->shape()));
 
     // There must be 1 and only 1 Contracting dimension for lhs and rhs.
-    CHECK_EQ(dnums.lhs_contracting_dimensions_size(), 1);
-    CHECK_EQ(dnums.rhs_contracting_dimensions_size(), 1);
     const int64 lhs_contracting_dimension = dnums.lhs_contracting_dimensions(0);
     const int64 rhs_contracting_dimension = dnums.rhs_contracting_dimensions(0);
     // Contracted dimension sizes must be the same.
@@ -1174,8 +1236,56 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
         << lhs->shape().dimensions(lhs_contracting_dimension)
         << " rhs contracted dimension: "
         << rhs->shape().dimensions(rhs_contracting_dimension);
-    const int64 contracted_dimension_size =
-        lhs->shape().dimensions(lhs_contracting_dimension);
+
+    // The fast path is for a simple rank 2 dot with default layout operands.
+    if (lhs_rank == 2 && rhs_rank == 2 && lhs_contracting_dimension == 1 &&
+        rhs_contracting_dimension == 0 &&
+        LayoutUtil::Equal(lhs->shape().layout(),
+                          LayoutUtil::GetDefaultLayoutForR2()) &&
+        LayoutUtil::Equal(rhs->shape().layout(),
+                          LayoutUtil::GetDefaultLayoutForR2()) &&
+        LayoutUtil::Equal(dot->shape().layout(),
+                          LayoutUtil::GetDefaultLayoutForR2())) {
+      const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs);
+      const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs);
+      const int64 contracted_dimension_size =
+          lhs->shape().dimensions(lhs_contracting_dimension);
+      Array2D<NativeT> lhs_array(lhs->shape().dimensions(0),
+                                 contracted_dimension_size);
+      lhs_array.SetValues(lhs_literal.data<NativeT>());
+      Array2D<NativeT> rhs_array(contracted_dimension_size,
+                                 rhs->shape().dimensions(1));
+      rhs_array.SetValues(rhs_literal.data<NativeT>());
+      std::unique_ptr<Array2D<NativeT>> result_array =
+          HloEvaluator::MatmulArray2D(lhs_array, rhs_array);
+      Literal result(dot->shape());
+      result.PopulateR2FromArray2D(*result_array);
+      parent_->evaluated_[dot] = std::move(result);
+      return Status::OK();
+    }
+    return HandleDotSlowPath(dot);
+  }
+
+  template <typename NativeT, typename std::enable_if<!std::is_same<
+                                  NativeT, float>::value>::type* = nullptr>
+  Status HandleDot(HloInstruction* dot) {
+    return HandleDotSlowPath(dot);
+  }
+
+  Status HandleDotSlowPath(HloInstruction* dot) {
+    auto lhs = dot->operand(0);
+    auto rhs = dot->operand(1);
+    CHECK(dot->shape().IsArray());
+    CHECK(lhs->shape().IsArray());
+    CHECK(rhs->shape().IsArray());
+
+    const auto& dnums = dot->dot_dimension_numbers();
+
+    const auto lhs_rank = lhs->shape().rank();
+    const auto rhs_rank = rhs->shape().rank();
+
+    CHECK(ShapeUtil::SameElementType(lhs->shape(), rhs->shape()));
+    CHECK(ShapeUtil::SameElementType(lhs->shape(), dot->shape()));
 
     const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs);
     const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs);
@@ -1190,7 +1300,9 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     // in lhs_index or rhs_index where the i'th result index should go.
     absl::InlinedVector<std::pair<int64*, int64*>, kInlineRank>
         result_index_locations;
-    result_index_locations.reserve(lhs_rank + rhs_rank - 2);
+    result_index_locations.reserve(
+        (lhs_rank - dnums.lhs_contracting_dimensions_size()) +
+        (rhs_rank - dnums.rhs_contracting_dimensions_size()));
 
     // The first components in the output shape are the LHS and RHS batch
     // dimensions:
@@ -1202,18 +1314,32 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
 
     // Then we have the LHS and RHS non-contracting dimensions, if any:
     for (int64 i = 0; i < lhs_rank; i++) {
-      if (i != lhs_contracting_dimension &&
+      if (!absl::c_linear_search(dnums.lhs_contracting_dimensions(), i) &&
           !absl::c_linear_search(dnums.lhs_batch_dimensions(), i)) {
         result_index_locations.push_back({&lhs_index[i], nullptr});
       }
     }
     for (int64 i = 0; i < rhs_rank; i++) {
-      if (i != rhs_contracting_dimension &&
+      if (!absl::c_linear_search(dnums.rhs_contracting_dimensions(), i) &&
           !absl::c_linear_search(dnums.rhs_batch_dimensions(), i)) {
         result_index_locations.push_back({&rhs_index[i], nullptr});
       }
     }
 
+    absl::InlinedVector<int64, kInlineRank> accumulate_index_sizes;
+    accumulate_index_sizes.reserve(dnums.lhs_contracting_dimensions_size());
+    absl::InlinedVector<std::pair<int64*, int64*>, kInlineRank>
+        accumulate_index_locations;
+    accumulate_index_locations.reserve(dnums.lhs_contracting_dimensions_size());
+    for (int64 i = 0; i < dnums.lhs_contracting_dimensions_size(); ++i) {
+      const int64 lhs_dnum = dnums.lhs_contracting_dimensions(i);
+      const int64 rhs_dnum = dnums.rhs_contracting_dimensions(i);
+      accumulate_index_locations.push_back(
+          {&lhs_index[lhs_dnum], &rhs_index[rhs_dnum]});
+      const int64 dim_size = lhs->shape().dimensions(lhs_dnum);
+      accumulate_index_sizes.push_back(dim_size);
+    }
+    const int64 total_contraction_size = Product(accumulate_index_sizes);
     Literal result(dot->shape());
     TF_RETURN_IF_ERROR(
         result.Populate<ReturnT>([&](absl::Span<const int64> result_index) {
@@ -1227,13 +1353,30 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
           }
 
           // Accumulates resulting product along the contracted dimension.
-          for (int64 i = 0; i < contracted_dimension_size; ++i) {
-            lhs_index[lhs_contracting_dimension] = i;
-            rhs_index[rhs_contracting_dimension] = i;
+          absl::InlinedVector<int64, kInlineRank> accumulate_index(
+              accumulate_index_sizes.size(), 0);
+          for (int64 k = 0; k < total_contraction_size; k++) {
+            for (int64 i = 0; i < accumulate_index_sizes.size(); ++i) {
+              *(accumulate_index_locations[i].first) = accumulate_index[i];
+              *(accumulate_index_locations[i].second) = accumulate_index[i];
+            }
 
             result_val +=
                 static_cast<ElementwiseT>(lhs_literal.Get<ReturnT>(lhs_index)) *
                 static_cast<ElementwiseT>(rhs_literal.Get<ReturnT>(rhs_index));
+
+            // If there are no contracting dimension accumulate_index_sizes is
+            // empty, do not try to count down from -1 to 0 since it is and
+            // infinite loop.
+            if (!accumulate_index_sizes.empty()) {
+              for (int64 i = accumulate_index_sizes.size() - 1; i >= 0; --i) {
+                int64 value = ++accumulate_index[i];
+                if (value != accumulate_index_sizes[i]) {
+                  break;
+                }
+                accumulate_index[i] = 0;
+              }
+            }
           }
 
           return static_cast<ReturnT>(result_val);
@@ -1244,10 +1387,10 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   }
 
   Status HandlePad(HloInstruction* pad) override {
-    CHECK(ShapeUtil::IsArray(pad->operand(0)->shape()));
+    CHECK(pad->operand(0)->shape().IsArray());
     // Padding value must be scalar.
     CHECK(ShapeUtil::IsScalar(pad->operand(1)->shape()));
-    CHECK_EQ(ShapeUtil::Rank(pad->operand(0)->shape()),
+    CHECK_EQ(pad->operand(0)->shape().rank(),
              pad->padding_config().dimensions_size());
 
     TF_ASSIGN_OR_RETURN(auto inferred_return_shape,
@@ -1270,9 +1413,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     const Literal& evaluated_operand =
         parent_->GetEvaluatedLiteralFor(pad->operand(0));
 
-    std::vector<int64> input_index(ShapeUtil::Rank(evaluated_operand.shape()),
-                                   0);
-    std::vector<int64> target_index(ShapeUtil::Rank(result.shape()), 0);
+    std::vector<int64> input_index(evaluated_operand.shape().rank(), 0);
+    std::vector<int64> target_index(result.shape().rank(), 0);
 
     // Loop through each element of the operand, assign them to the
     // corresponding index of the resulting padded literal.
@@ -1315,10 +1457,12 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     auto operand = dynamic_slice->operand(0);
     auto start_indices = dynamic_slice->operand(1);
     auto result_shape = dynamic_slice->shape();
-    TF_ASSIGN_OR_RETURN(auto inferred_return_shape,
-                        ShapeInference::InferDynamicSliceShape(
-                            operand->shape(), start_indices->shape(),
-                            dynamic_slice->dynamic_slice_sizes()));
+    TF_ASSIGN_OR_RETURN(
+        auto inferred_return_shape,
+        ShapeInference::InferDynamicSliceShape(
+            operand->shape(),
+            Cast<HloDynamicSliceInstruction>(dynamic_slice)->index_shapes(),
+            dynamic_slice->dynamic_slice_sizes()));
     TF_RET_CHECK(ShapeUtil::Compatible(result_shape, inferred_return_shape))
         << "return shape is set to: " << ShapeUtil::HumanString(result_shape)
         << " but is inferred to be: "
@@ -1327,33 +1471,39 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
         primitive_util::IsIntegralType(start_indices->shape().element_type()));
 
     const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand);
-    const Literal& start_indices_literal =
-        parent_->GetEvaluatedLiteralFor(start_indices);
 
     switch (start_indices->shape().element_type()) {
       case S32: {
         TF_ASSIGN_OR_RETURN(
             parent_->evaluated_[dynamic_slice],
-            DynamicSlice<int32>(operand_literal, start_indices_literal,
-                                result_shape));
+            DynamicSlice<int32>(
+                operand_literal,
+                absl::MakeConstSpan(dynamic_slice->operands()).subspan(1),
+                result_shape));
       } break;
       case S64: {
         TF_ASSIGN_OR_RETURN(
             parent_->evaluated_[dynamic_slice],
-            DynamicSlice<int64>(operand_literal, start_indices_literal,
-                                result_shape));
+            DynamicSlice<int64>(
+                operand_literal,
+                absl::MakeConstSpan(dynamic_slice->operands()).subspan(1),
+                result_shape));
       } break;
       case U32: {
         TF_ASSIGN_OR_RETURN(
             parent_->evaluated_[dynamic_slice],
-            DynamicSlice<uint32>(operand_literal, start_indices_literal,
-                                 result_shape));
+            DynamicSlice<uint32>(
+                operand_literal,
+                absl::MakeConstSpan(dynamic_slice->operands()).subspan(1),
+                result_shape));
       } break;
       case U64: {
         TF_ASSIGN_OR_RETURN(
             parent_->evaluated_[dynamic_slice],
-            DynamicSlice<uint64>(operand_literal, start_indices_literal,
-                                 result_shape));
+            DynamicSlice<uint64>(
+                operand_literal,
+                absl::MakeConstSpan(dynamic_slice->operands()).subspan(1),
+                result_shape));
       } break;
       default:
         LOG(FATAL) << "HandleDynamicSlice: unhandled primitive type for "
@@ -1373,7 +1523,9 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     TF_ASSIGN_OR_RETURN(
         auto inferred_return_shape,
         ShapeInference::InferDynamicUpdateSliceShape(
-            operand->shape(), update->shape(), start_indices->shape()));
+            operand->shape(), update->shape(),
+            Cast<HloDynamicUpdateSliceInstruction>(dynamic_update_slice)
+                ->index_shapes()));
     TF_RET_CHECK(ShapeUtil::Compatible(result_shape, inferred_return_shape))
         << "return shape is set to: " << ShapeUtil::HumanString(result_shape)
         << " but is inferred to be: "
@@ -1384,33 +1536,39 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
 
     const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand);
     const Literal& update_literal = parent_->GetEvaluatedLiteralFor(update);
-    const Literal& start_indices_literal =
-        parent_->GetEvaluatedLiteralFor(start_indices);
 
     switch (start_indices->shape().element_type()) {
       case S32: {
         TF_ASSIGN_OR_RETURN(
             parent_->evaluated_[dynamic_update_slice],
-            DynamicUpdateSlice<int32>(operand_literal, update_literal,
-                                      start_indices_literal));
+            DynamicUpdateSlice<int32>(
+                operand_literal, update_literal,
+                absl::MakeConstSpan(dynamic_update_slice->operands())
+                    .subspan(2)));
       } break;
       case S64: {
         TF_ASSIGN_OR_RETURN(
             parent_->evaluated_[dynamic_update_slice],
-            DynamicUpdateSlice<int64>(operand_literal, update_literal,
-                                      start_indices_literal));
+            DynamicUpdateSlice<int64>(
+                operand_literal, update_literal,
+                absl::MakeConstSpan(dynamic_update_slice->operands())
+                    .subspan(2)));
       } break;
       case U32: {
         TF_ASSIGN_OR_RETURN(
             parent_->evaluated_[dynamic_update_slice],
-            DynamicUpdateSlice<uint32>(operand_literal, update_literal,
-                                       start_indices_literal));
+            DynamicUpdateSlice<uint32>(
+                operand_literal, update_literal,
+                absl::MakeConstSpan(dynamic_update_slice->operands())
+                    .subspan(2)));
       } break;
       case U64: {
         TF_ASSIGN_OR_RETURN(
             parent_->evaluated_[dynamic_update_slice],
-            DynamicUpdateSlice<uint64>(operand_literal, update_literal,
-                                       start_indices_literal));
+            DynamicUpdateSlice<uint64>(
+                operand_literal, update_literal,
+                absl::MakeConstSpan(dynamic_update_slice->operands())
+                    .subspan(2)));
       } break;
       default:
         LOG(FATAL) << "HandleDynamicUpdateSlice: unhandled primitive type for "
@@ -1447,7 +1605,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
           }
 
           Literal computed_result =
-              embedded_evaluator.Evaluate<Literal>(*computation, arg_literals)
+              embedded_evaluator.Evaluate(*computation, arg_literals)
                   .ConsumeValueOrDie();
           // Clear visit states so that the we can use the evaluate again on
           // the same computation.
@@ -1505,6 +1663,10 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
         TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<complex64>(map));
         break;
       }
+      case C128: {
+        TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl<complex128>(map));
+        break;
+      }
       default:
         LOG(FATAL) << "HandleMap: unhandled primitive type for "
                       "input operand: "
@@ -1515,80 +1677,14 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
-  template <typename NativeT,
-            typename std::enable_if<
-                !is_complex_t<NativeT>::value &&
-                !std::is_same<NativeT, bool>::value>::type* = nullptr>
-  Status HandleSort(HloInstruction* sort) {
-    auto keys = sort->operand(0);
-    TF_RET_CHECK(sort->operand_count() == 1)
-        << "Typed visitor does not support key-value sort";
-
-    const Literal& keys_literal = parent_->GetEvaluatedLiteralFor(keys);
-    int64 sort_dim = sort->dimensions(0);
-    int64 sort_dim_elements = keys->shape().dimensions(sort_dim);
-    int64 rank = ShapeUtil::Rank(keys->shape());
-    if (rank == 0) {
-      // Nothing to sort.
-      parent_->evaluated_[sort] = keys_literal.Clone();
-      return Status::OK();
-    }
-    Literal result_literal(keys_literal.shape());
-    std::vector<int64> zero_base(rank, 0);
-    std::vector<int64> increment(rank, 1);
-    increment[sort_dim] = sort_dim_elements;
-    // Iterate through each dimension except 'sort_dim'.
-    TF_RETURN_IF_ERROR(ShapeUtil::ForEachIndexWithStatus(
-        keys->shape(), zero_base, AsInt64Slice(keys->shape().dimensions()),
-        increment, [&](absl::Span<const int64> indices) -> StatusOr<bool> {
-          // Extract a slice from the literal that corresponds to exactly the
-          // row in dimension 'sort_dim'.
-          std::vector<int64> limit_indices(indices.begin(), indices.end());
-          std::for_each(limit_indices.begin(), limit_indices.end(),
-                        [](int64& index) { ++index; });
-          limit_indices[sort_dim] = sort_dim_elements;
-          TF_ASSIGN_OR_RETURN(auto row_to_sort,
-                              keys_literal.Slice(indices, limit_indices)
-                                  .Reshape({sort_dim_elements}));
-          const auto& row_data = row_to_sort.data<NativeT>();
-
-          std::vector<NativeT> result_data(row_data.begin(), row_data.end());
-          std::stable_sort(result_data.begin(), result_data.end(),
-                           [](const NativeT& a, const NativeT& b) {
-                             return SafeLess<NativeT>(a, b);
-                           });
-          Literal sorted_row(ShapeUtil::MakeShape(keys->shape().element_type(),
-                                                  {sort_dim_elements}));
-          sorted_row.PopulateR1(absl::Span<const NativeT>(result_data));
-          std::vector<int64> slice_dimensions(rank, 1);
-          slice_dimensions[sort_dim] = sort_dim_elements;
-          TF_ASSIGN_OR_RETURN(auto sorted_row_reshaped,
-                              sorted_row.Reshape(slice_dimensions));
-          std::vector<int64> start_indices(rank, 0);
-          TF_RETURN_IF_ERROR(result_literal.CopySliceFrom(
-              sorted_row_reshaped, start_indices, indices, slice_dimensions));
-          return true;
-        }));
-    parent_->evaluated_[sort] = std::move(result_literal);
-    return Status::OK();
-  }
-
-  template <typename NativeT,
-            typename std::enable_if<is_complex_t<NativeT>::value ||
-                                    std::is_same<NativeT, bool>::value>::type* =
-                nullptr>
-  Status HandleSort(HloInstruction* sort) {
-    return InvalidArgument("Unsupported type for Sort");
-  }
-
   Status HandleSort(HloInstruction* sort) override {
-    return HandleSort<ReturnT>(sort);
+    return UnsupportedTypeError(sort);
   }
 
   Status HandleReduce(HloInstruction* hlo) override {
     HloReduceInstruction* reduce = Cast<HloReduceInstruction>(hlo);
     int64 num_args = reduce->inputs().size();
-    bool has_tuple_output = ShapeUtil::IsTuple(reduce->shape());
+    bool has_tuple_output = reduce->shape().IsTuple();
     absl::Span<const int64> dimensions(reduce->dimensions());
     HloComputation* function = reduce->to_apply();
 
@@ -1619,7 +1715,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
 
     // All args and results have the same dimensions, so pick an arbitrary one.
     const Shape& arg_shape = arg_literals[0]->shape();
-    const Shape& result_shape = ShapeUtil::IsTuple(reduce->shape())
+    const Shape& result_shape = reduce->shape().IsTuple()
                                     ? reduce->shape().tuple_shapes(0)
                                     : reduce->shape();
     const auto arg_dimensions = AsInt64Slice(arg_shape.dimensions());
@@ -1708,7 +1804,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
                              [](Literal& literal) { return &literal; });
 
               TF_ASSIGN_OR_RETURN(Literal computed_result,
-                                  embedded_evaluator.Evaluate<const Literal*>(
+                                  embedded_evaluator.Evaluate(
                                       *function, embedded_operands_ptrs));
               // Clear visit states so that we can use the evaluator again on
               // the same computation.
@@ -1786,7 +1882,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand);
     const Literal& source_literal = parent_->GetEvaluatedLiteralFor(source);
 
-    int64 rank = ShapeUtil::Rank(operand_literal.shape());
+    int64 rank = operand_literal.shape().rank();
 
     HloEvaluator embedded_evaluator(parent_->max_loop_iterations_);
     DimensionVector source_index(rank, 0);
@@ -1824,8 +1920,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
             selected_val_literal.Set({}, *selected_val);
             Literal computed_result =
                 embedded_evaluator
-                    .Evaluate<const Literal*>(
-                        *select, {&selected_val_literal, &curr_val_literal})
+                    .Evaluate(*select,
+                              {&selected_val_literal, &curr_val_literal})
                     .ConsumeValueOrDie();
             bool selected = !computed_result.Get<bool>({});
             if (selected) {
@@ -1846,9 +1942,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
               scattered_literal.Set({}, scattered);
               Literal computed_result =
                   embedded_evaluator
-                      .Evaluate<const Literal*>(
-                          *scatter,
-                          {&source_literal_scatter, &scattered_literal})
+                      .Evaluate(*scatter,
+                                {&source_literal_scatter, &scattered_literal})
                       .ConsumeValueOrDie();
               result.Set(operand_index, computed_result.Get<ReturnT>({}));
               // Clear visit states so that the we can use the evaluator again
@@ -1898,7 +1993,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
         operand->shape().element_type(), window_dimension_sizes);
 
     DimensionVector window_index(window.dimensions_size());
-    DimensionVector operand_index(ShapeUtil::Rank(operand_literal.shape()));
+    DimensionVector operand_index(operand_literal.shape().rank());
 
     HloEvaluator embedded_evaluator(parent_->max_loop_iterations_);
     Literal result(reduce_window->shape());
@@ -1922,8 +2017,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
                     LiteralUtil::CreateR0<ReturnT>(result_val);
                 Literal computed_result =
                     embedded_evaluator
-                        .Evaluate<const Literal*>(
-                            *function, {&result_val_literal, &curr_val_literal})
+                        .Evaluate(*function,
+                                  {&result_val_literal, &curr_val_literal})
                         .ConsumeValueOrDie();
 
                 // Clear visit states so that the we can use the evaluate again
@@ -2285,9 +2380,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
           LiteralUtil::CreateR0<ReturnT>(updates.Get<ReturnT>(update_index));
       Literal updated_result =
           embedded_evaluator
-              .Evaluate<const Literal*>(
-                  *scatter->to_apply(),
-                  {&result_value_literal, &update_value_literal})
+              .Evaluate(*scatter->to_apply(),
+                        {&result_value_literal, &update_value_literal})
               .ConsumeValueOrDie();
       // Clear visit states so that the we can use the evaluate again on the
       // same computation.
@@ -2329,7 +2423,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
         << " but is inferred to be: "
         << ShapeUtil::HumanString(inferred_return_shape);
 
-    const int64 rank = ShapeUtil::Rank(operand->shape());
+    const int64 rank = operand->shape().rank();
     const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand);
     auto func = [&](absl::Span<const int64> out_index) {
       DimensionVector operand_index(rank);
@@ -2357,7 +2451,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
             std::is_same<NativeT, int64>::value ||
             std::is_same<NativeT, uint64>::value)>::type* = nullptr>
   Status HandleClz(HloInstruction* clz) {
-    return InvalidArgument("Unsupported type for Clz");
+    return UnsupportedTypeError(clz);
   }
 
   template <typename NativeT,
@@ -2403,7 +2497,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
       typename std::enable_if<std::is_integral<NativeT>::value ||
                               is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleSin(HloInstruction* sin) {
-    return InvalidArgument("Unsupported type for Sin");
+    return UnsupportedTypeError(sin);
   }
 
   Status HandleSin(HloInstruction* sin) override {
@@ -2425,7 +2519,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
       typename std::enable_if<std::is_integral<NativeT>::value ||
                               is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleCos(HloInstruction* cos) {
-    return InvalidArgument("Unsupported type for Cos");
+    return UnsupportedTypeError(cos);
   }
 
   Status HandleCos(HloInstruction* cos) override {
@@ -2526,7 +2620,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   template <typename NativeT, typename std::enable_if<std::is_same<
                                   double, NativeT>::value>::type* = nullptr>
   Status HandleReducePrecision(HloInstruction* reduce_precision) {
-    return InvalidArgument("Double not supported for reduce precision");
+    return InvalidArgument("Double is not supported for reduce precision");
   }
 
   template <
@@ -2534,46 +2628,172 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
       typename std::enable_if<std::is_integral<NativeT>::value ||
                               is_complex_t<NativeT>::value>::type* = nullptr>
   Status HandleReducePrecision(HloInstruction* reduce_precision) {
-    return InvalidArgument("Unsupported type for reduce precision");
+    return UnsupportedTypeError(reduce_precision);
   }
 
   Status HandleReducePrecision(HloInstruction* reduce_precision) override {
     return HandleReducePrecision<ElementwiseT>(reduce_precision);
   }
 
-  template <typename NativeT,
-            typename std::enable_if<
-                std::is_integral<NativeT>::value ||
-                std::is_floating_point<NativeT>::value>::type* = nullptr>
+  template <
+      typename NativeT,
+      typename std::enable_if<
+          std::is_same<NativeT, bfloat16>::value ||
+          std::is_same<NativeT, Eigen::half>::value ||
+          std::is_integral<NativeT>::value || is_complex_t<NativeT>::value ||
+          std::is_floating_point<NativeT>::value>::type* = nullptr>
   Status HandleIota(HloInstruction* instruction) {
     auto* iota = Cast<HloIotaInstruction>(instruction);
+    const int64 iota_size = iota->shape().dimensions(iota->iota_dimension());
     // Avoid using std::vector since std::vector<bool> does not convert to
     // absl::Span<bool>.
-    absl::InlinedVector<NativeT, 1> data(
-        iota->shape().dimensions(iota->iota_dimension()));
-    std::iota(data.begin(), data.end(), 0);
+    absl::InlinedVector<NativeT, 1> data(iota_size);
+    // We don't use std::iota for two reasons:
+    //
+    // (1) std:iota does not support bfloat16 and float16.
+    //
+    // (2) std::iota saturates for floating point types when the value is not
+    //     representable, but the definition of HLO iota is the value as a
+    //     64-bit integer cast to the native type.
+    for (int64 i = 0; i < iota_size; ++i) {
+      // static_cast is required for Eigen::half (F16).
+      data[i] = static_cast<NativeT>(i);
+    }
     auto result = LiteralUtil::CreateR1<NativeT>(data);
 
-    if (ShapeUtil::Rank(iota->shape()) > 1) {
+    if (iota->shape().rank() > 1) {
       TF_ASSIGN_OR_RETURN(
           parent_->evaluated_[iota],
           result.Broadcast(iota->shape(), {iota->iota_dimension()}));
     } else {
-      TF_RET_CHECK(ShapeUtil::Rank(iota->shape()) == 1);
+      TF_RET_CHECK(iota->shape().rank() == 1);
       parent_->evaluated_[iota] = std::move(result);
     }
 
     return Status::OK();
   }
+  template <
+      typename NativeT,
+      typename std::enable_if<
+          !(std::is_same<NativeT, bfloat16>::value ||
+            std::is_same<NativeT, Eigen::half>::value ||
+            std::is_integral<NativeT>::value || is_complex_t<NativeT>::value ||
+            std::is_floating_point<NativeT>::value)>::type* = nullptr>
+  Status HandleIota(HloInstruction* iota) {
+    return UnsupportedTypeError(iota);
+  }
+  Status HandleIota(HloInstruction* iota) override {
+    return HandleIota<ReturnT>(iota);
+  }
+
   template <typename NativeT,
             typename std::enable_if<
                 !(std::is_integral<NativeT>::value ||
                   std::is_floating_point<NativeT>::value)>::type* = nullptr>
-  Status HandleIota(HloInstruction* iota) {
-    return InvalidArgument("Unsupported type for iota");
+  Status HandleRng(HloInstruction* random) {
+    return UnsupportedTypeError(random);
   }
-  Status HandleIota(HloInstruction* iota) override {
-    return HandleIota<ReturnT>(iota);
+  template <typename NativeT,
+            typename std::enable_if<
+                (std::is_floating_point<NativeT>::value)>::type* = nullptr>
+  Status HandleRng(HloInstruction* random) {
+    RandomDistribution distribution = random->random_distribution();
+    const auto result_shape = random->shape();
+    Literal result(result_shape);
+
+    switch (distribution) {
+      case RNG_UNIFORM: {
+        const Literal& low =
+            parent_->GetEvaluatedLiteralFor(random->operand(0));
+        const Literal& high =
+            parent_->GetEvaluatedLiteralFor(random->operand(1));
+
+        // std::uniform_real_distribution(a, b) can sometimes return a value
+        // equal to b.  Unclear if this is a spec bug or an implementation bug
+        // or WAI [0] [1] [2].  Anyway for our purposes we want a half-open
+        // interval, so we have to re-sample if we get `b` out.
+        //
+        // [0] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63176
+        // [1] https://bugs.llvm.org/show_bug.cgi?id=18767
+        // [2] http://open-std.org/JTC1/SC22/WG21/docs/lwg-active.html#2524
+        auto low_val = low.Get<NativeT>({});
+        auto high_val = high.Get<NativeT>({});
+        std::uniform_real_distribution<NativeT> generator(low_val, high_val);
+        TF_RETURN_IF_ERROR(
+            result.Populate<NativeT>([&](absl::Span<const int64> /*indexes*/) {
+              while (true) {
+                NativeT v = generator(parent_->engine_);
+                if (v != high_val) {
+                  return v;
+                }
+              }
+            }));
+        break;
+      }
+      case RNG_NORMAL: {
+        const Literal& mean =
+            parent_->GetEvaluatedLiteralFor(random->operand(0));
+        const Literal& stddev =
+            parent_->GetEvaluatedLiteralFor(random->operand(1));
+
+        std::normal_distribution<NativeT> generator(mean.Get<NativeT>({}),
+                                                    stddev.Get<NativeT>({}));
+
+        TF_RETURN_IF_ERROR(
+            result.Populate<NativeT>([&](absl::Span<const int64> /*indexes*/) {
+              return generator(parent_->engine_);
+            }));
+        break;
+      }
+      default:
+        return UnimplementedStrCat("The distribution ",
+                                   RandomDistribution_Name(distribution),
+                                   " is not implemented.");
+    }
+    parent_->evaluated_[random] = std::move(result);
+    return Status::OK();
+  }
+  template <typename NativeT,
+            typename std::enable_if<(std::is_integral<NativeT>::value)>::type* =
+                nullptr>
+  Status HandleRng(HloInstruction* random) {
+    RandomDistribution distribution = random->random_distribution();
+    const auto result_shape = random->shape();
+    Literal result(result_shape);
+
+    switch (distribution) {
+      case RNG_UNIFORM: {
+        const Literal& low =
+            parent_->GetEvaluatedLiteralFor(random->operand(0));
+        const Literal& high =
+            parent_->GetEvaluatedLiteralFor(random->operand(1));
+
+        // Note std::uniform_int_distribution assumes interval is closed, i.e.,
+        // [low, high], but we want [low, high) instead. Hence high-1 is used as
+        // the upper range.
+        std::uniform_int_distribution<int64> generator(
+            low.Get<NativeT>({}), high.Get<NativeT>({}) - 1);
+
+        TF_RETURN_IF_ERROR(
+            result.Populate<NativeT>([&](absl::Span<const int64> /*indexes*/) {
+              return static_cast<NativeT>(generator(parent_->engine_));
+            }));
+        break;
+      }
+      case RNG_NORMAL: {
+        return Unimplemented(
+            "Normal distribution is not supported for integral types.");
+      }
+      default:
+        return UnimplementedStrCat("The distribution ",
+                                   RandomDistribution_Name(distribution),
+                                   " is not implemented.");
+    }
+    parent_->evaluated_[random] = std::move(result);
+    return Status::OK();
+  }
+  Status HandleRng(HloInstruction* random) override {
+    return HandleRng<ReturnT>(random);
   }
 
  private:
@@ -2587,7 +2807,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   //
   // This lets you calculate LI given the multidimensional indices in any order.
   static DimensionVector MakeDimMultipliers(const Shape& shape) {
-    DimensionVector v(ShapeUtil::Rank(shape));
+    DimensionVector v(shape.rank());
     int64 scale = 1;
     for (auto dim : LayoutUtil::MinorToMajor(shape)) {
       v[dim] = scale;
@@ -2604,7 +2824,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
       const Shape& window_shape, const Window& window, const Shape& base_shape,
       const absl::Span<const int64>& window_count_index,
       const std::function<void(const std::vector<int64>&)>& f) {
-    const int64 rank = ShapeUtil::Rank(base_shape);
+    const int64 rank = base_shape.rank();
     DimensionVector window_index(rank);
     std::fill(window_index.begin(), window_index.end(), 0);
     do {
@@ -2635,12 +2855,16 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   }
 
   template <typename IndexT>
-  StatusOr<Literal> DynamicSlice(const Literal& operand_literal,
-                                 const Literal& start_indices_literal,
-                                 const Shape& result_shape) {
-    auto start_indices_typed = start_indices_literal.data<IndexT>();
-    std::vector<int64> start(start_indices_typed.begin(),
-                             start_indices_typed.end());
+  StatusOr<Literal> DynamicSlice(
+      const Literal& operand_literal,
+      absl::Span<HloInstruction* const> start_indices,
+      const Shape& result_shape) {
+    std::vector<int64> start;
+
+    for (HloInstruction* index : start_indices) {
+      start.push_back(
+          parent_->GetEvaluatedLiteralFor(index).GetFirstElement<IndexT>());
+    }
 
     // Clamp the start indices so the slice is in-bounds w.r.t the operand.
     for (int64 i = 0; i < start.size(); ++i) {
@@ -2666,14 +2890,17 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   }
 
   template <typename IndexT>
-  StatusOr<Literal> DynamicUpdateSlice(const Literal& operand_literal,
-                                       const Literal& update_literal,
-                                       const Literal& start_indices_literal) {
+  StatusOr<Literal> DynamicUpdateSlice(
+      const Literal& operand_literal, const Literal& update_literal,
+      absl::Span<HloInstruction* const> start_indices) {
     auto result = operand_literal.Clone();
-    auto start_indices_typed = start_indices_literal.data<IndexT>();
-    const auto rank = ShapeUtil::Rank(result.shape());
-    std::vector<int64> start(start_indices_typed.begin(),
-                             start_indices_typed.end());
+    const auto rank = result.shape().rank();
+    std::vector<int64> start;
+    for (HloInstruction* index : start_indices) {
+      start.push_back(
+          parent_->GetEvaluatedLiteralFor(index).GetFirstElement<IndexT>());
+    }
+
     // Clamp the update start indices so the slice is in-bounds w.r.t the
     // operand.
     for (int64 i = 0; i < rank; ++i) {
@@ -2790,6 +3017,7 @@ extern template class HloEvaluatorTypedVisitor<Eigen::half, float>;
 extern template class HloEvaluatorTypedVisitor<float>;
 extern template class HloEvaluatorTypedVisitor<double>;
 extern template class HloEvaluatorTypedVisitor<complex64>;
+extern template class HloEvaluatorTypedVisitor<complex128>;
 extern template class HloEvaluatorTypedVisitor<bfloat16, float>;
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_complex128.cc b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_complex128.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1f48140ee4f6ca9415bef80c83664213109dbf9f
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_complex128.cc
@@ -0,0 +1,22 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h"
+
+#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
+
+namespace xla {
+template class HloEvaluatorTypedVisitor<complex128>;
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_int16.cc b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_int16.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e54285a1577a3f3c97fba5ba6c2f969299ab599e
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_int16.cc
@@ -0,0 +1,22 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h"
+
+#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
+
+namespace xla {
+template class HloEvaluatorTypedVisitor<int16>;
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_uint16.cc b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_uint16.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cc708952d20a00429944c8388a84a0e610c2f38f
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_uint16.cc
@@ -0,0 +1,22 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h"
+
+#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
+
+namespace xla {
+template class HloEvaluatorTypedVisitor<uint16>;
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc b/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc
index 5be9dba3aa49d63c580cd6f5800f608667826b6a..df06cf8c53ec8407f8b44c9126ed4fb5409f8ef3 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc
@@ -45,7 +45,7 @@ TEST_F(HloExecutionProfileTest, Basic) {
 
   auto shape_size_function = [&](const Shape& shape) {
     const int64 pointer_size = 8;
-    if (ShapeUtil::IsOpaque(shape)) {
+    if (shape.IsOpaque()) {
       return pointer_size;
     }
     return ShapeUtil::ByteSizeOf(shape, pointer_size);
diff --git a/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc
index c919dbd82d3668c477bf37074f1d56f8cb7d9506..862b2029718bbd802b69d789b66683a4edfa2367 100644
--- a/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc
+++ b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "absl/algorithm/container.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/dynamic_dimension_inference.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/shape_inference.h"
@@ -25,7 +26,9 @@ namespace xla {
 
 namespace {
 
-StatusOr<bool> ReplaceGetSize(HloInstruction* instr) {
+StatusOr<bool> ReplaceGetSize(
+    HloInstruction* instr,
+    const DynamicDimensionInference* dynamic_dimension_inference) {
   if (instr->opcode() != HloOpcode::kGetDimensionSize) {
     return false;
   }
@@ -36,10 +39,18 @@ StatusOr<bool> ReplaceGetSize(HloInstruction* instr) {
                           instr->operand(0)->shape(), instr->dimension()));
   TF_RET_CHECK(ShapeUtil::Equal(instr->shape(), legal_shape));
   TF_RET_CHECK(ShapeUtil::HasPrimitiveType(instr->shape(), U32));
-  uint32 size = instr->operand(0)->shape().dimensions(instr->dimension());
-  HloInstruction* new_instr = computation->AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<uint32>(size)));
-  TF_RETURN_IF_ERROR(instr->ReplaceAllUsesWith(new_instr));
+  HloInstruction* operand = instr->mutable_operand(0);
+  int64 dim = instr->dimension();
+  HloInstruction* dynamic_size =
+      dynamic_dimension_inference->GetDynamicSize(operand, {}, dim);
+  if (dynamic_size != nullptr) {
+    TF_RETURN_IF_ERROR(instr->ReplaceAllUsesWith(dynamic_size));
+  } else {
+    uint32 size = instr->operand(0)->shape().dimensions(dim);
+    HloInstruction* new_instr = computation->AddInstruction(
+        HloInstruction::CreateConstant(LiteralUtil::CreateR0<uint32>(size)));
+    TF_RETURN_IF_ERROR(instr->ReplaceAllUsesWith(new_instr));
+  }
   return true;
 }
 
@@ -48,10 +59,13 @@ StatusOr<bool> ReplaceGetSize(HloInstruction* instr) {
 StatusOr<bool> HloGetDimensionSizeRewriter::Run(HloModule* module) {
   bool changed = false;
   HloProto proto;
+  TF_ASSIGN_OR_RETURN(DynamicDimensionInference inference,
+                      DynamicDimensionInference::Run(module));
   *proto.mutable_hlo_module() = module->ToProto();
   for (auto* computation : module->computations()) {
     for (auto instruction : computation->instructions()) {
-      TF_ASSIGN_OR_RETURN(bool replaced, ReplaceGetSize(instruction));
+      TF_ASSIGN_OR_RETURN(bool replaced,
+                          ReplaceGetSize(instruction, &inference));
       changed = changed || replaced;
     }
   }
diff --git a/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h
index 30f44c23a835b3bcc935caaa917e040e07c4e703..9aa79fe66b665c48ec871c4188e44ba2056de3ad 100644
--- a/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h
+++ b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h
@@ -21,7 +21,9 @@ limitations under the License.
 
 namespace xla {
 
-// Pass to replace a kGetDimensionSize instruction with a constant instruction.
+// Pass to replace a kGetDimensionSize instruction with a hlo instruction
+// representing the dynamic size if the dimension is dynamic, otherwise a
+// constant instruction representing the static size.
 class HloGetDimensionSizeRewriter : public HloModulePass {
  public:
   absl::string_view name() const override {
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index 302eca656be53a3cec86ddbf05a7fa3925c5185b..254f66021d70622bfd1c0b2623767ca7ff803e0d 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -24,9 +24,9 @@ limitations under the License.
 #include <queue>
 #include <string>
 #include <tuple>
-#include <unordered_map>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/strings/match.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
@@ -38,7 +38,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_instructions.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
-#include "tensorflow/compiler/xla/service/hlo_tfgraph_builder.h"
 #include "tensorflow/compiler/xla/service/pattern_matcher.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -380,7 +379,7 @@ class HloDotDumper {
   // Each HloInstruction dumped gets a monotically-increasing node ID.  This
   // must start at 1, because that's where graphviz's accounting starts.
   int64 next_node_id_ = 1;
-  std::unordered_map<const HloInstruction*, int64> node_ids_;
+  absl::flat_hash_map<const HloInstruction*, int64> node_ids_;
 
   // The "root" tag doesn't have an associated HloInstruction pointer, so we
   // need to store it outside the map.
@@ -397,7 +396,7 @@ class HloDotDumper {
 
   // Each HloComputation that's emitted gets a monotonically-increasing ID.
   int64 next_cluster_id_ = 1;
-  std::unordered_map<const HloComputation*, int64> cluster_ids_;
+  absl::flat_hash_map<const HloComputation*, int64> cluster_ids_;
 
   // Edges to print from Footer().  Edges come at the end because graphviz is
   // unhappy if an edge from a subcomputation to a node in the outer computation
@@ -407,7 +406,7 @@ class HloDotDumper {
 
   // When coloring by sharding information, we track the sharding string
   // representation to color association, by round-robin the color schemes.
-  std::unordered_map<HloSharding, ColorScheme, HloSharding::Hasher>
+  absl::flat_hash_map<HloSharding, ColorScheme, HloSharding::Hasher>
       sharding_colors_;
   int64 next_shard_color_ = 0;
 };
@@ -536,7 +535,12 @@ stylesheet=<
     }
   }
 
-  return StrFormat(fmt, graph_label, StrJoin(edge_css_rules, "\n"));
+  // Browsers require that we URI-encode the contents of our data URI.  (It
+  // seems this was a relatively recent change?) In practice, this means that we
+  // need to escape '#'.
+  return StrFormat(
+      fmt, graph_label,
+      absl::StrReplaceAll(StrJoin(edge_css_rules, "\n"), {{"#", "%23"}}));
 }
 
 string HloDotDumper::Footer() { return StrCat(StrJoin(edges_, "\n"), "\n}"); }
@@ -561,8 +565,8 @@ bool HloDotDumper::ShouldShowSubcomputation(const HloComputation* subcomp) {
   }
 
   // Show the subcomputation if we're showing any of its members.
-  return std::any_of(
-      subcomp->instructions().begin(), subcomp->instructions().end(),
+  return absl::c_any_of(
+      subcomp->instructions(),
       [&](const HloInstruction* instr) { return filter_.Show(instr); });
 }
 
@@ -733,17 +737,16 @@ bool HloDotDumper::ShouldMergeIntoUsers(const HloInstruction* instr) const {
     return true;
   }
   const int kMinUsersToOmit = 3;
-  return instr->opcode() == HloOpcode::kParameter &&
-         ShapeUtil::IsTuple(instr->shape()) && !instr->IsFused() &&
-         std::count_if(instr->users().begin(), instr->users().end(),
-                       [&](const HloInstruction* user) {
-                         return filter_.Show(user);
-                       }) > kMinUsersToOmit &&
-         std::all_of(instr->users().begin(), instr->users().end(),
-                     [&](const HloInstruction* user) {
-                       return !filter_.Show(user) ||
-                              user->opcode() == HloOpcode::kGetTupleElement;
-                     });
+  return instr->opcode() == HloOpcode::kParameter && instr->shape().IsTuple() &&
+         !instr->IsFused() &&
+         absl::c_count_if(instr->users(),
+                          [&](const HloInstruction* user) {
+                            return filter_.Show(user);
+                          }) > kMinUsersToOmit &&
+         absl::c_all_of(instr->users(), [&](const HloInstruction* user) {
+           return !filter_.Show(user) ||
+                  user->opcode() == HloOpcode::kGetTupleElement;
+         });
 }
 
 string HloDotDumper::DumpInstruction(const HloInstruction* instr) {
@@ -816,7 +819,7 @@ string HloDotDumper::GetInstructionNodeInlinedOperands(
 
     // Print the literal value of constants with <= K elements.
     optional<int64> elem_count;
-    if (ShapeUtil::IsArray(shape)) {
+    if (shape.IsArray()) {
       elem_count = 1;
       for (int64 dim : shape.dimensions()) {
         *elem_count *= dim;
@@ -900,12 +903,11 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
   // the same color as a parameter.  Unless the merged-in parameter is a
   // parameter to a fusion node that is bound to a constant -- these aren't
   // "real" parameters from the user's perspective.
-  if (std::any_of(instr->operands().begin(), instr->operands().end(),
-                  [&](const HloInstruction* operand) {
-                    return operand->opcode() == HloOpcode::kParameter &&
-                           ShouldMergeIntoUsers(operand) &&
-                           TryGetFusionParameterConstant(operand) == nullptr;
-                  })) {
+  if (absl::c_any_of(instr->operands(), [&](const HloInstruction* operand) {
+        return operand->opcode() == HloOpcode::kParameter &&
+               ShouldMergeIntoUsers(operand) &&
+               TryGetFusionParameterConstant(operand) == nullptr;
+      })) {
     return parameter_color;
   }
 
@@ -951,6 +953,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
     case HloOpcode::kRemainder:
     case HloOpcode::kRng:
     case HloOpcode::kRoundNearestAfz:
+    case HloOpcode::kRsqrt:
     case HloOpcode::kSelect:
     case HloOpcode::kShiftLeft:
     case HloOpcode::kShiftRightArithmetic:
@@ -959,6 +962,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
     case HloOpcode::kSin:
     case HloOpcode::kSlice:
     case HloOpcode::kSort:
+    case HloOpcode::kSqrt:
     case HloOpcode::kSubtract:
     case HloOpcode::kTanh:
       // De-emphasize scalar-shaped elementwise ops -- they're generally
@@ -1013,6 +1017,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
     case HloOpcode::kConvolution:
     case HloOpcode::kDot:
     case HloOpcode::kFft:
+    case HloOpcode::kTriangularSolve:
       return kDarkBlue;
     case HloOpcode::kReducePrecision:
       return kRed;
@@ -1030,7 +1035,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
     case HloOpcode::kMap:
     case HloOpcode::kGetDimensionSize:
       return kGray;
-    case HloOpcode::kCrossReplicaSum:
+    case HloOpcode::kAllReduce:
     case HloOpcode::kAllToAll:
     case HloOpcode::kCollectivePermute:
     case HloOpcode::kInfeed:
@@ -1039,6 +1044,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
     case HloOpcode::kRecvDone:
     case HloOpcode::kSend:
     case HloOpcode::kSendDone:
+    case HloOpcode::kReplicaId:
       return kBrown;
     case HloOpcode::kCall:
     case HloOpcode::kConditional:
@@ -1282,11 +1288,12 @@ namespace {
 
 // Gets a NodeFilter that includes roughly all instructions whose distance from
 // root is <= radius.
-NodeFilter MakeNodeRadiusAroundFilter(const HloInstruction* root,
-                                      int64 radius) {
+NodeFilter MakeNodeRadiusAroundFilter(
+    const HloInstruction* root, int64 radius,
+    const absl::flat_hash_set<const HloInstruction*>& boundary) {
   // First, find the neighborhood of nodes with distance from root <= radius.
   // These nodes are our initial set of "normal" nodes.
-  std::unordered_map<const HloInstruction*, NodeFilterResult> nodes;
+  absl::flat_hash_map<const HloInstruction*, NodeFilterResult> nodes;
   std::deque<std::pair<const HloInstruction*, /*depth*/ int64>> worklist;
   worklist.push_back({root, 0});
   while (!worklist.empty()) {
@@ -1299,6 +1306,9 @@ NodeFilter MakeNodeRadiusAroundFilter(const HloInstruction* root,
     if (depth == radius) {
       continue;
     }
+    if (boundary.contains(instr)) {
+      continue;
+    }
 
     // Traverse into instr's operands.
     //
@@ -1307,7 +1317,7 @@ NodeFilter MakeNodeRadiusAroundFilter(const HloInstruction* root,
     // are not interesting to the graph at hand.
     if (instr == root || instr->opcode() != HloOpcode::kTuple) {
       for (const HloInstruction* operand : instr->operands()) {
-        if (!nodes.count(operand)) {
+        if (!nodes.contains(operand)) {
           worklist.push_back({operand, depth + 1});
         }
       }
@@ -1335,7 +1345,7 @@ NodeFilter MakeNodeRadiusAroundFilter(const HloInstruction* root,
       continue;
     }
     for (const HloInstruction* user : instr->users()) {
-      if (!nodes.count(user)) {
+      if (!nodes.contains(user)) {
         worklist.push_back({user, depth + 1});
       }
     }
@@ -1344,7 +1354,7 @@ NodeFilter MakeNodeRadiusAroundFilter(const HloInstruction* root,
   auto is_displayed = [&](const HloInstruction* instr) {
     // Constants are displayed inline with their users; they're never omitted.
     // Nodes in subcomputations are always shown.
-    return nodes.count(instr) > 0 || instr->opcode() == HloOpcode::kConstant ||
+    return nodes.contains(instr) || instr->opcode() == HloOpcode::kConstant ||
            instr->parent() != root->parent();
   };
 
@@ -1355,12 +1365,11 @@ NodeFilter MakeNodeRadiusAroundFilter(const HloInstruction* root,
     NodeFilterResult& filter_result = kv.second;
     const auto& operands = instr->operands();
 
-    if (std::any_of(operands.begin(), operands.end(), is_displayed) &&
-        !std::all_of(operands.begin(), operands.end(), is_displayed)) {
+    if (absl::c_any_of(operands, is_displayed) &&
+        !absl::c_all_of(operands, is_displayed)) {
       // Mark nodes with some operands omitted appropriately.
       filter_result = kSomeOperandsOmitted;
-    } else if (!operands.empty() &&
-               std::none_of(operands.begin(), operands.end(), is_displayed)) {
+    } else if (!operands.empty() && absl::c_none_of(operands, is_displayed)) {
       // Mark nodes with *all* operands omitted appropriately.
       filter_result = kOmitNodeOperands;
     }
@@ -1368,8 +1377,7 @@ NodeFilter MakeNodeRadiusAroundFilter(const HloInstruction* root,
     // Promote nodes with type kSomeUsersOmitted to kNormalNode if all of their
     // users made it into the graph.
     if (filter_result == kSomeUsersOmitted &&
-        std::all_of(instr->users().begin(), instr->users().end(),
-                    is_displayed)) {
+        absl::c_all_of(instr->users(), is_displayed)) {
       filter_result = kNormalNode;
     }
   }
@@ -1449,9 +1457,6 @@ string SaveGraph(const string& graph,
     case GraphRendererInterface::DOT_GRAPH:
       file_extension = ".dot";
       break;
-    case GraphRendererInterface::TF_GRAPHDEF:
-      file_extension = ".pbtxt";
-      break;
   }
   string path = JoinPath(dest_path, StrCat("hlo_graph_", output_num++, "."));
   auto status = Status::OK();
@@ -1474,39 +1479,42 @@ string ExportGraph(const string& graph,
                    GraphRendererInterface::GraphKind graph_kind,
                    const DebugOptions& debug_options) {
   string path = debug_options.xla_hlo_graph_path();
-  if (!path.empty()) {
+  if (!path.empty() && !debug_options.xla_hlo_dump_as_html()) {
     return SaveGraph(graph, graph_kind, path);
   } else {
     auto graph_renderer =
         GraphRendererRegistry::Default()->GetDefaultRenderer();
     CHECK(graph_renderer != nullptr)
         << "No registered renderer for the HLO graph. "
-           "Use --xla_hlo_graph_path=PATH to export to local file system";
+           "Use --xla_hlo_graph_path=PATH --xla_hlo_dump_as_html=false to "
+           "export to local file system";
     return graph_renderer->RenderGraph(graph, graph_kind, debug_options);
   }
 }
 
 }  // namespace
 
+string HloComputationToDotGraph(const HloComputation& computation,
+                                const DotGraphOptions& options) {
+  DebugOptions default_debug_options;
+  return HloDotDumper(&computation, options.label,
+                      options.debug_options ? *options.debug_options
+                                            : default_debug_options,
+                      options.show_backend_config, options.profile,
+                      NodeFilter())
+      .Dump();
+}
+
 string DumpGraph(const HloComputation& computation, const string& label,
                  const DebugOptions& debug_options,
                  const HloExecutionProfile* hlo_execution_profile,
                  bool show_backend_config) {
   GraphRendererInterface::GraphKind graph_kind;
-  string graph;
-  if (debug_options.xla_hlo_dump_as_graphdef()) {
-    HloTfGraphBuilder builder(debug_options);
-    TF_CHECK_OK(builder.AddComputation(computation));
-    CHECK(tensorflow::protobuf::TextFormat::PrintToString(builder.GetGraphDef(),
-                                                          &graph));
-    graph_kind = GraphRendererInterface::TF_GRAPHDEF;
-  } else {
-    graph =
-        HloDotDumper(&computation, label, debug_options, show_backend_config,
-                     hlo_execution_profile, NodeFilter())
-            .Dump();
-    graph_kind = GraphRendererInterface::DOT_GRAPH;
-  }
+  string graph =
+      HloDotDumper(&computation, label, debug_options, show_backend_config,
+                   hlo_execution_profile, NodeFilter())
+          .Dump();
+  graph_kind = GraphRendererInterface::DOT_GRAPH;
 
   string graph_url = ExportGraph(graph, graph_kind, debug_options);
   LOG(INFO) << "computation " << computation.name() << " [" << label
@@ -1514,12 +1522,13 @@ string DumpGraph(const HloComputation& computation, const string& label,
   return graph_url;
 }
 
-string DumpNeighborhoodAround(const HloInstruction& node, int radius,
-                              bool show_backend_config) {
+string DumpNeighborhoodAround(
+    const HloInstruction& node, int radius, bool show_backend_config,
+    const absl::flat_hash_set<const HloInstruction*>& boundary) {
   auto debug_options = node.GetModule()->config().debug_options();
   string label =
       StrCat("Neighborhood of ", radius, " nodes around ", node.name());
-  NodeFilter filter = MakeNodeRadiusAroundFilter(&node, radius);
+  NodeFilter filter = MakeNodeRadiusAroundFilter(&node, radius, boundary);
   string graph =
       HloDotDumper(node.parent(), label, debug_options, show_backend_config,
                    /*profile=*/nullptr, filter)
@@ -1589,5 +1598,145 @@ string MaybeDumpHloModule(const HloModule& module, const string& label,
   return graph_url;
 }
 
+string WrapDotInHTML(const string& dot) {
+  static const char html_prefix[] = R"html(
+<!DOCTYPE html>
+<html>
+<head>
+  <meta charset="utf-8">
+  <style type="text/css">
+    body {
+      height: 100vh;
+      margin: 0;
+    }
+  </style>
+</head>
+<body>
+  <!-- Integrity hash is generated by https://www.srihash.org/ -->
+  <script src="https://cdn.jsdelivr.net/npm/viz.js@2.1.1/viz.js"
+     integrity="sha384-aD1MJYb0WKIUT+CtwJp5LTuV3U4pLAS6B/nUxL7ECimC2pN9N8vjlMr/yQCAkzxE"
+     crossorigin="anonymous"></script>
+  <script src="https://cdn.jsdelivr.net/npm/viz.js@2.1.1/full.render.js"
+     integrity="sha384-bAixY275aIpCj6Te19y0MILZ4V+VEC8CVFujFEH+Lf7W+4XYYeYLwW5IBI6yQmMT"
+     crossorigin="anonymous"></script>
+  <script src="https://cdn.jsdelivr.net/npm/svg-pan-zoom@3.6.0/dist/svg-pan-zoom.min.js"
+     integrity="sha384-3008WpYB2pOBvE7lwkrKf+qTmbTPGGPYxA9C1YVhvbPukns4ZFj7E98QPLkNW9dS"
+     crossorigin="anonymous"></script>
+  <div id="container" style="height:95vh; border:1px solid black; "></div>
+  <script>
+    var data = `
+)html";
+
+  static const char html_suffix[] = R"html(
+`;
+    var cssregex = new RegExp('stylesheet=<([^]*)\n>\n', 'gm');
+    var results = cssregex.exec(data)
+    // graphviz has problem dealing with large stylesheets.
+    // https://github.com/tensorflow/tensorflow/issues/17220#issuecomment-369228492
+    // In order to avoid the problem, remove the stylesheet from the dot and
+    // insert it directly info the rendered SVG.
+    var dot_data = data;
+    var css_data = ''
+    if (results !== null) {
+        css_data = results[1].replace(/\s*data:.*\s*,/,''); // Strip content-type field.
+        dot_data = data.replace(cssregex, ''); // Remove the stylesheet
+    }
+
+    var render_start = performance.now()
+    function add_controls(svg) {
+        var htmlblob = new Blob([document.documentElement.innerHTML],
+                                {type: 'text/html'});
+        var savehtml = document.createElement('a');
+        savehtml.setAttribute('href', URL.createObjectURL(htmlblob));
+        savehtml.setAttribute('download', 'graph.html');
+        savehtml.innerHTML = " [Save HTML+SVG] ";
+        document.body.append(savehtml);
+        var svgblob = new Blob([svg.outerHTML], {type: 'image/svg'});
+        var savesvg = document.createElement('a');
+        savesvg.setAttribute('href', URL.createObjectURL(svgblob));
+        savesvg.setAttribute('download', 'graph.svg');
+        savesvg.innerHTML = " [Save SVG] ";
+        document.body.append(savesvg);
+        var dotblob =  new Blob([data], {type: 'text/dot'});
+        var savedot = document.createElement('a');
+        savedot.setAttribute('href', URL.createObjectURL(dotblob));
+        savedot.setAttribute('download', 'graph.dot');
+        savedot.innerHTML = " [Save DOT] ";
+        document.body.append(savedot);
+        // Will get called after embed element was loaded
+        var panzoom = svgPanZoom(svg, {
+            zoomEnabled: true,
+            controlIconsEnabled: true,
+        });
+        document.getElementsByTagName("BODY")[0].onresize = function() {
+            panzoom.resize();
+            panzoom.fit();
+            panzoom.center();
+        };
+        var render_end = performance.now();
+        var render_note = document.createElement('div')
+        render_note.innerHTML = 'Rendering took '
+                                + (render_end - render_start).toFixed(2) + "ms."
+        document.body.append(render_note);
+    }
+    var svg = document.getElementById('graph')
+    if (svg == null) {
+        // Need to render SVG first.
+        var viz = new Viz();
+        viz.renderSVGElement(dot_data)
+            .then(function(svg){
+                var container = document.getElementById('container')
+                var style = document.createElementNS('http://www.w3.org/2000/svg', 'style');
+                var node = document.createTextNode(css_data);
+                style.appendChild(node);
+                svg.setAttribute('width', '100%');
+                svg.setAttribute('height', '100%');
+                svg.setAttribute('id', 'graph');
+                svg.appendChild(style);
+                container.appendChild(svg);
+                add_controls(svg);
+            })
+    } else {
+        // HTML already has rendered SVG embedded, so we just need to add
+        // controls.
+        add_controls(svg);
+    }
+  </script>
+</body>
+</html>
+)html";
+
+  return html_prefix + dot + html_suffix;
+}
+
+string RenderDotAsHTMLFile(const string& dot,
+                           const DebugOptions& debug_options) {
+  string html = WrapDotInHTML(dot);
+
+  auto env = tensorflow::Env::Default();
+  std::vector<string> dirs;
+  string output_dir = debug_options.xla_hlo_graph_path();
+  if (output_dir.empty()) {
+    env->GetLocalTempDirectories(&dirs);
+  } else {
+    dirs.push_back(output_dir);
+  }
+  // Try each directory, as they might be full, have inappropriate
+  // permissions or have different problems at times.
+  string output;
+  for (const string& dir : dirs) {
+    string filename = tensorflow::io::JoinPath(dir, "graph-");
+    if (env->CreateUniqueFileName(&filename, ".html")) {
+      output = filename;
+      break;
+    }
+  }
+  if (output.empty()) {
+    LOG(FATAL) << "Failed to create unique output file name.";
+  }
+  TF_CHECK_OK(tensorflow::WriteStringToFile(env, output, html));
+  return "file://" + output;
+}
+
 }  // namespace hlo_graph_dumper
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.h b/tensorflow/compiler/xla/service/hlo_graph_dumper.h
index de1eefab776f9c3d2c73959a5cd267e938a78a32..563cea42371d370b4c9ea739418692fd74dca799 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.h
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.h
@@ -26,13 +26,23 @@ limitations under the License.
 namespace xla {
 namespace hlo_graph_dumper {
 
+// Converts a HLO module to a DOT (graphviz) graph. Returns the dot graph as
+// a string.
+struct DotGraphOptions {
+  absl::string_view label;
+  const DebugOptions* debug_options = nullptr;
+  const HloExecutionProfile* profile = nullptr;
+  bool show_backend_config = false;
+};
+string HloComputationToDotGraph(const HloComputation& computation,
+                                const DotGraphOptions& options);
+
 // Abstract interface for classes that render HLO graphs (e.g. DOT graph,
-// tensorflow GraphDef).
+// tensorflow GraphDef) to files or services.
 class GraphRendererInterface {
  public:
   enum GraphKind {
     DOT_GRAPH,
-    TF_GRAPHDEF,
   };
 
   virtual ~GraphRendererInterface() = default;
@@ -63,8 +73,12 @@ string DumpGraph(const HloComputation& computation, const string& label,
 // The number of nodes dumped is controlled by the radius parameter, which
 // (roughly) corresponds to the max distance a node may be from the primary node
 // before it's omitted from the graph.
-string DumpNeighborhoodAround(const HloInstruction& node, int radius,
-                              bool show_backend_config = false);
+//
+// The optional boundary specifies a set of boundary nodes, beyond which nodes
+// will be omitted even if they are within the radius.
+string DumpNeighborhoodAround(
+    const HloInstruction& node, int radius, bool show_backend_config = false,
+    const absl::flat_hash_set<const HloInstruction*>& boundary = {});
 
 // Dumps nodes on any of the paths from `from` to `to`.  If there are more than
 // max_nodes on all paths, restricts to the max_nodes nodes on the shortest
@@ -81,6 +95,12 @@ string DumpAllPathsFromTo(const HloInstruction& from, const HloInstruction& to,
 void DumpText(const HloModule& module, const string& label,
               const string& directory_path, bool do_prefix = true);
 
+// Renders DOT graph as inline SVG and saves it in an HTML file in a temprary
+// directory or directory specified via --xla_hlo_graph_path. Returns the file
+// URI pointing to the file.
+string RenderDotAsHTMLFile(const string& dot,
+                           const DebugOptions& debug_options);
+
 // Graph renderers may be added using a registration mechanism, e.g.:
 // XLA_REGISTER_GRAPH_RENDERER(AGraphRendererClass, 100)
 // The renderer with the highest numeric priority value is used.
diff --git a/tensorflow/compiler/xla/service/hlo_graph_html_renderer.cc b/tensorflow/compiler/xla/service/hlo_graph_html_renderer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..84c4cf18df69816c611f4eb159ba247320ebc20e
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_graph_html_renderer.cc
@@ -0,0 +1,43 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Implementation of an DOT graph renderer that uses Javascript to render DOT to
+// SVG in a browser.
+
+#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+
+namespace xla {
+namespace hlo_graph_dumper {
+namespace {
+
+class GraphHtmlRenderer : public GraphRendererInterface {
+ public:
+  string RenderGraph(const string& graph, GraphKind graph_kind,
+                     const DebugOptions& debug_options) override {
+    switch (graph_kind) {
+      case DOT_GRAPH:
+        return RenderDotAsHTMLFile(graph, debug_options);
+      default:
+        LOG(FATAL) << "Only DOT graphs can be rendered";
+    }
+  }
+};
+
+XLA_REGISTER_GRAPH_RENDERER(GraphHtmlRenderer);
+
+}  // namespace
+}  // namespace hlo_graph_dumper
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_input_output_alias_config.cc b/tensorflow/compiler/xla/service/hlo_input_output_alias_config.cc
index 6e1597fd03db0a78aa560340b7b9b64fe500df0c..b01c00121b3363630b83a1e49d0027a66f3a9e1a 100644
--- a/tensorflow/compiler/xla/service/hlo_input_output_alias_config.cc
+++ b/tensorflow/compiler/xla/service/hlo_input_output_alias_config.cc
@@ -17,22 +17,34 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 
 namespace xla {
+
+bool HloInputOutputAliasConfig::OutputHasAlias(
+    const ShapeIndex& output_index) const {
+  return alias_.element(output_index).has_value();
+}
+
 Status HloInputOutputAliasConfig::SetUpAlias(const ShapeIndex& output_index,
                                              int64 param_number,
-                                             const ShapeIndex& param_index) {
+                                             const ShapeIndex& param_index,
+                                             AliasKind kind) {
+  TF_RET_CHECK(kind == AliasKind::kUserAlias || kind == AliasKind::kSystemAlias)
+      << kind;
   TF_RET_CHECK(ShapeUtil::IndexIsValid(alias_.shape(), output_index))
       << absl::StrCat("Tring to set up alias at ", output_index.ToString(),
                       " which is an invalid index for shape ",
                       ShapeUtil::HumanString(alias_.shape()));
+  TF_RET_CHECK(param_number >= 0) << param_number;
+  TF_RET_CHECK(!OutputHasAlias(output_index))
+      << "Output index " << output_index << " already has an alias setup";
   // Output can't be aliased with multiple parameters.
   TF_RET_CHECK(!alias_.element(output_index)) << absl::StrFormat(
       "Trying to set up output alias for param %lld at %s but failed: output "
       "index %s is already aliased with param %lld at %s",
       param_number, param_index.ToString(), output_index.ToString(),
-      alias_.element(output_index)->first,
-      alias_.element(output_index)->second.ToString());
+      alias_.element(output_index)->parameter_number,
+      alias_.element(output_index)->parameter_index.ToString());
   (*alias_.mutable_element(output_index)) =
-      std::make_pair(param_number, param_index);
+      Alias(kind, param_number, param_index);
   VLOG(4) << "Set up alias between output index " << output_index.ToString()
           << " and parameter " << param_index << " at index "
           << param_index.ToString();
@@ -42,15 +54,24 @@ Status HloInputOutputAliasConfig::SetUpAlias(const ShapeIndex& output_index,
 HloInputOutputAliasProto HloInputOutputAliasConfig::ToProto() const {
   HloInputOutputAliasProto result;
   alias_.ForEachElement(
-      [&](const ShapeIndex& index,
-          const absl::optional<std::pair<int64, ShapeIndex>>& data) {
+      [&](const ShapeIndex& index, const absl::optional<Alias>& data) {
         if (data) {
           HloInputOutputAliasProto::AliasEntryProto entry;
+          switch (data->kind) {
+            case AliasKind::kUserAlias:
+              entry.set_kind(HloInputOutputAliasProto::USER_ALIAS);
+              break;
+            case AliasKind::kSystemAlias:
+              entry.set_kind(HloInputOutputAliasProto::SYSTEM_ALIAS);
+              break;
+            default:
+              LOG(FATAL) << "Unknown alias kind " << data->kind;
+          }
           for (int64 i : index) {
             entry.add_output_shape_index(i);
           }
-          entry.set_parameter_number(data->first);
-          for (int64 i : data->second) {
+          entry.set_parameter_number(data->parameter_number);
+          for (int64 i : data->parameter_index) {
             entry.add_parameter_shape_index(i);
           }
           result.add_entries()->Swap(&entry);
@@ -66,14 +87,18 @@ StatusOr<HloInputOutputAliasConfig> HloInputOutputAliasConfig::CreateFromProto(
        proto.entries()) {
     ShapeIndex output_index(entry.output_shape_index().begin(),
                             entry.output_shape_index().end());
-
     int64 param_number = entry.parameter_number();
     ShapeIndex param_index(entry.parameter_shape_index().begin(),
                            entry.parameter_shape_index().end());
+    // Handle backward compatibility with existing protos, which only knew of
+    // system aliases.
+    AliasKind kind = AliasKind::kSystemAlias;
+    if (entry.kind() == HloInputOutputAliasProto::USER_ALIAS) {
+      kind = AliasKind::kUserAlias;
+    }
     TF_RETURN_IF_ERROR(
-        result.SetUpAlias(output_index, param_number, param_index));
+        result.SetUpAlias(output_index, param_number, param_index, kind));
   }
-
   return result;
 }
 
@@ -81,45 +106,44 @@ string HloInputOutputAliasConfig::ToString() const {
   std::vector<string> pieces;
   pieces.push_back("HloInputOutputAliasConfig");
 
-  ForEachAlias([&](const ShapeIndex& output_index, int64 param_number,
-                   const ShapeIndex& param_index) {
+  ForEachAlias([&](const ShapeIndex& output_index, const Alias& alias) {
+    const char* kind = alias.kind == AliasKind::kUserAlias ? "USER" : "SYSTEM";
     pieces.push_back(absl::StrFormat(
-        "  OutputIndex %s is aliased with parameter %lld at %s:",
-        output_index.ToString(), param_number, param_index.ToString()));
+        "  OutputIndex %s is aliased (kind=%s) with parameter %lld at %s:",
+        output_index.ToString(), kind, alias.parameter_number,
+        alias.parameter_index.ToString()));
   });
-
   return absl::StrJoin(pieces, "\n");
 }
 
-bool HloInputOutputAliasConfig::ParameterHasAlias(
+HloInputOutputAliasConfig::AliasKind
+HloInputOutputAliasConfig::ParameterAliasKind(
     int64 param_number, const ShapeIndex& param_index) const {
-  bool output = false;
+  AliasKind kind = AliasKind::kNoAlias;
   alias_.ForEachElement(
-      [&](const xla::ShapeIndex&,
-          absl::optional<std::pair<int64, ShapeIndex>> alias) {
-        if (alias && alias->first == param_number &&
-            alias->second == param_index) {
-          output = true;
+      [&](const xla::ShapeIndex&, absl::optional<Alias> alias) {
+        if (alias && alias->parameter_number == param_number &&
+            alias->parameter_index == param_index) {
+          kind = alias->kind;
         }
       });
-  return output;
+  return kind;
 }
 
 absl::optional<ShapeIndex> HloInputOutputAliasConfig::GetAliasedOutput(
     int64 param_number, const ShapeIndex& param_index) const {
   absl::optional<ShapeIndex> output;
   alias_.ForEachElement(
-      [&](const xla::ShapeIndex& output_index,
-          absl::optional<std::pair<int64, ShapeIndex>> alias) {
-        if (alias && alias->first == param_number &&
-            alias->second == param_index) {
+      [&](const xla::ShapeIndex& output_index, absl::optional<Alias> alias) {
+        if (alias && alias->parameter_number == param_number &&
+            alias->parameter_index == param_index) {
           output = output_index;
         }
       });
   return output;
 }
 
-absl::optional<std::pair<int64, ShapeIndex>>
+absl::optional<HloInputOutputAliasConfig::Alias>
 HloInputOutputAliasConfig::GetAliasedParameter(
     const ShapeIndex& output_index) const {
   CHECK(ShapeUtil::IndexIsValid(alias_.shape(), output_index));
@@ -128,10 +152,9 @@ HloInputOutputAliasConfig::GetAliasedParameter(
 
 void HloInputOutputAliasConfig::ForEachAlias(AliasFn fn) const {
   alias_.ForEachElement(
-      [&](const ShapeIndex& output_index,
-          absl::optional<std::pair<int64, ShapeIndex>> aliased) {
+      [&](const ShapeIndex& output_index, absl::optional<Alias> aliased) {
         if (aliased) {
-          fn(output_index, aliased->first, aliased->second);
+          fn(output_index, *aliased);
         }
       });
 }
@@ -139,10 +162,9 @@ void HloInputOutputAliasConfig::ForEachAlias(AliasFn fn) const {
 Status HloInputOutputAliasConfig::ForEachAliasWithStatus(
     AliasFnWithStatus fn) const {
   return alias_.ForEachElementWithStatus(
-      [&](const ShapeIndex& output_index,
-          absl::optional<std::pair<int64, ShapeIndex>> aliased) {
+      [&](const ShapeIndex& output_index, absl::optional<Alias> aliased) {
         if (aliased) {
-          TF_RETURN_IF_ERROR(fn(output_index, aliased->first, aliased->second));
+          TF_RETURN_IF_ERROR(fn(output_index, *aliased));
         }
         return Status::OK();
       });
@@ -158,20 +180,19 @@ Status HloInputOutputAliasConfig::Verify(
     param_has_seen.emplace_back(param->shape());
   }
   return ForEachAliasWithStatus([&](const ShapeIndex& output_index,
-                                    int64 param_number,
-                                    const ShapeIndex& param_index) -> Status {
+                                    const Alias& alias) -> Status {
     const HloInstruction* root = entry->root_instruction();
 
-    TF_RET_CHECK(0 <= param_number);
-    TF_RET_CHECK(entry->num_parameters() > param_number);
+    TF_RET_CHECK(0 <= alias.parameter_number);
+    TF_RET_CHECK(entry->num_parameters() > alias.parameter_number);
     const Shape& param_shape =
-        entry->parameter_instruction(param_number)->shape();
+        entry->parameter_instruction(alias.parameter_number)->shape();
     const Shape& output_shape = root->shape();
-    TF_RET_CHECK(ShapeUtil::IndexIsValid(param_shape, param_index));
+    TF_RET_CHECK(ShapeUtil::IndexIsValid(param_shape, alias.parameter_index));
     TF_RET_CHECK(ShapeUtil::IndexIsValid(output_shape, output_index));
 
     const Shape& param_subshape =
-        ShapeUtil::GetSubshape(param_shape, param_index);
+        ShapeUtil::GetSubshape(param_shape, alias.parameter_index);
     const Shape& output_subshape =
         ShapeUtil::GetSubshape(output_shape, output_index);
     TF_RET_CHECK(LayoutUtil::IsDenseArray(param_subshape));
@@ -182,19 +203,20 @@ Status HloInputOutputAliasConfig::Verify(
           "Expected aliased input %lld at index %s and output at index %s to "
           "have the same size. Input sub-shape is %s with size %lld, output "
           "sub-shape is %s with size %lld",
-          param_number, param_index.ToString(), output_index.ToString(),
+          alias.parameter_number, alias.parameter_index.ToString(),
+          output_index.ToString(),
           ShapeUtil::HumanStringWithLayout(param_subshape),
           size_func(param_subshape),
           ShapeUtil::HumanStringWithLayout(output_subshape),
           size_func(output_subshape));
     }
 
-    // Check each param_number and param_index pair only show up once. No
-    // input can be aliased with output buffers.
-    TF_RET_CHECK(param_has_seen[param_number].element(param_index) == false);
-
-    *(param_has_seen[param_number].mutable_element(param_index)) = true;
-
+    // Check each alias.parameter_number and alias.parameter_index pair only
+    // show up once. No input can be aliased with output buffers.
+    TF_RET_CHECK(param_has_seen[alias.parameter_number].element(
+                     alias.parameter_index) == false);
+    *(param_has_seen[alias.parameter_number].mutable_element(
+        alias.parameter_index)) = true;
     return Status::OK();
   });
 }
diff --git a/tensorflow/compiler/xla/service/hlo_input_output_alias_config.h b/tensorflow/compiler/xla/service/hlo_input_output_alias_config.h
index 439676b1546c4af7f781fb80bccffd5248309b0f..cd13c7a3ac7afe03fb99ed3114bdc6ac0f8ad6a7 100644
--- a/tensorflow/compiler/xla/service/hlo_input_output_alias_config.h
+++ b/tensorflow/compiler/xla/service/hlo_input_output_alias_config.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <utility>
 
+#include "absl/container/flat_hash_set.h"
 #include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/shape_tree.h"
@@ -31,21 +32,54 @@ class HloModule;
 // parameter index in the entry computation.
 class HloInputOutputAliasConfig {
  public:
+  // The kind of aliases which can be set. A kUserAlias is one setup at
+  // compilation time by the user, and has to be respected. A kSystemAlias one
+  // might be setup by the compiler, if it decides it is convenient to do so.
+  enum AliasKind {
+    kNoAlias,
+    kUserAlias,
+    kSystemAlias,
+  };
+
+  // Defines the alias information for a given output buffer. A given output
+  // buffer shape index can refer only to one parameter+index.
+  struct Alias {
+    Alias(AliasKind kind, int64 parameter_number, ShapeIndex parameter_index)
+        : kind(kind),
+          parameter_number(parameter_number),
+          parameter_index(std::move(parameter_index)) {}
+
+    AliasKind kind;
+    int64 parameter_number;
+    ShapeIndex parameter_index;
+  };
+
   HloInputOutputAliasConfig() = default;
 
-  explicit HloInputOutputAliasConfig(Shape shape) : alias_(shape) {}
+  explicit HloInputOutputAliasConfig(Shape output_shape)
+      : alias_(output_shape) {}
 
   virtual ~HloInputOutputAliasConfig() = default;
 
   // Sets up alias config from `output_index` to `param_index` at
   // `param_number`.
   Status SetUpAlias(const ShapeIndex& output_index, int64 param_number,
-                    const ShapeIndex& param_index);
+                    const ShapeIndex& param_index, AliasKind kind);
+
+  // Returns the kind of alias for the given parameter number and parameter
+  // index. If no alias exists, AliasKind::kNoAlias is returned.
+  AliasKind ParameterAliasKind(int64 param_number,
+                               const ShapeIndex& param_index) const;
 
   // Returns true if the given parameter is aliased with one of the output
   // buffers.
   bool ParameterHasAlias(int64 param_number,
-                         const ShapeIndex& param_index) const;
+                         const ShapeIndex& param_index) const {
+    return ParameterAliasKind(param_number, param_index) != AliasKind::kNoAlias;
+  }
+
+  // Checks whether the provided output index has already been aliased.
+  bool OutputHasAlias(const ShapeIndex& output_index) const;
 
   // (De)Serializes an HloInputOutoutAliasConfig to/from an
   // HloInputOutoutAliasProto.
@@ -63,19 +97,17 @@ class HloInputOutputAliasConfig {
   // Returns the number of parameter and index of the parameter buffer that the
   // given output buffer index is aliased with. A nullopt is returned if there
   // is no parameter is aliased with the specific output.
-  absl::optional<std::pair<int64, ShapeIndex>> GetAliasedParameter(
+  absl::optional<Alias> GetAliasedParameter(
       const ShapeIndex& output_index) const;
 
   using AliasFn =
-      std::function<void(const ShapeIndex& output_index, int64 param_number,
-                         const ShapeIndex& param_index)>;
+      std::function<void(const ShapeIndex& output_index, const Alias&)>;
 
   // Iterates through each aliased output and input.
   void ForEachAlias(AliasFn fn) const;
 
   using AliasFnWithStatus =
-      std::function<Status(const ShapeIndex& output_index, int64 param_number,
-                           const ShapeIndex& param_index)>;
+      std::function<Status(const ShapeIndex& output_index, const Alias&)>;
 
   // Verifies that the given config is valid for the given module.
   // Specifically, the config's input and output should be in-bound and size of
@@ -90,9 +122,10 @@ class HloInputOutputAliasConfig {
  private:
   // A ShapeTree which indicates the list of buffers that's expected to be
   // aliased. The key on this shape tree represents the output index. The value
-  // is a pair of parameter number and index into the buffer. If the value is
-  // nullopt, it means there is no parameter aliasing for this output.
-  ShapeTree<absl::optional<std::pair<int64, ShapeIndex>>> alias_;
+  // is an Alias data structure which defines the input parameter coordinates.
+  // If the value is nullopt, it means there is no parameter aliasing for this
+  // output.
+  ShapeTree<absl::optional<Alias>> alias_;
 };
 
 std::ostream& operator<<(std::ostream& out,
diff --git a/tensorflow/compiler/xla/service/hlo_input_output_alias_config_test.cc b/tensorflow/compiler/xla/service/hlo_input_output_alias_config_test.cc
index aeb9b0fdc8b6cca87731a2d4aae25120af6c3215..265bfdf7f989b0821a98c1f774cb408b78f348fe 100644
--- a/tensorflow/compiler/xla/service/hlo_input_output_alias_config_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_input_output_alias_config_test.cc
@@ -29,7 +29,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 
 namespace xla {
@@ -45,11 +44,12 @@ class HloInputOutputAliasConfigTest : public HloTestBase {
     EXPECT_TRUE(aliased_output);
     EXPECT_EQ(aliased_output.value(), output_index);
 
-    absl::optional<std::pair<int64, ShapeIndex>> aliased_param =
+    absl::optional<HloInputOutputAliasConfig::Alias> aliased_param =
         config.GetAliasedParameter(output_index);
 
     EXPECT_TRUE(aliased_param);
-    EXPECT_EQ(aliased_param.value(), std::make_pair(param_number, param_index));
+    EXPECT_EQ(aliased_param->parameter_number, param_number);
+    EXPECT_EQ(aliased_param->parameter_index, param_index);
   }
 
   void expect_not_aliased(const ShapeIndex& output_index, int64 param_number,
@@ -60,11 +60,12 @@ class HloInputOutputAliasConfigTest : public HloTestBase {
 
     EXPECT_FALSE(aliased_output && aliased_output == output_index);
 
-    absl::optional<std::pair<int64, ShapeIndex>> aliased_param =
+    absl::optional<HloInputOutputAliasConfig::Alias> aliased_param =
         config.GetAliasedParameter(output_index);
 
-    EXPECT_FALSE(aliased_param && aliased_param->first == param_number &&
-                 aliased_param->second == param_index);
+    EXPECT_FALSE(aliased_param &&
+                 aliased_param->parameter_number == param_number &&
+                 aliased_param->parameter_index == param_index);
   }
 };
 
@@ -84,8 +85,10 @@ ENTRY main {
   HloInputOutputAliasConfig config(
       module->entry_computation()->root_instruction()->shape());
 
-  TF_ASSERT_OK(config.SetUpAlias(/*output_index=*/{0}, /*param_number=*/1,
-                                 /*param_index=*/{}));
+  TF_ASSERT_OK(config.SetUpAlias(
+      /*output_index=*/{0}, /*param_number=*/1,
+      /*param_index=*/{},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
 
   expect_aliased(/*output_index=*/{0}, /*param_number=*/1,
                  /*param_index=*/{}, config);
@@ -114,11 +117,15 @@ ENTRY main {
   HloInputOutputAliasConfig config(
       module->entry_computation()->root_instruction()->shape());
 
-  TF_ASSERT_OK(config.SetUpAlias(/*output_index=*/{0}, /*param_number=*/0,
-                                 /*param_index=*/{0}));
+  TF_ASSERT_OK(config.SetUpAlias(
+      /*output_index=*/{0}, /*param_number=*/0,
+      /*param_index=*/{0},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
 
-  TF_ASSERT_OK(config.SetUpAlias(/*output_index=*/{1}, /*param_number=*/0,
-                                 /*param_index=*/{1}));
+  TF_ASSERT_OK(config.SetUpAlias(
+      /*output_index=*/{1}, /*param_number=*/0,
+      /*param_index=*/{1},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
 
   expect_aliased(/*output_index=*/{0}, /*param_number=*/0,
                  /*param_index=*/{0}, config);
@@ -149,11 +156,15 @@ ENTRY main {
   HloInputOutputAliasConfig config(
       module->entry_computation()->root_instruction()->shape());
 
-  TF_ASSERT_OK(config.SetUpAlias(/*output_index=*/{0}, /*param_number=*/0,
-                                 /*param_index=*/{}));
+  TF_ASSERT_OK(config.SetUpAlias(
+      /*output_index=*/{0}, /*param_number=*/0,
+      /*param_index=*/{},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
 
-  TF_ASSERT_OK(config.SetUpAlias(/*output_index=*/{1}, /*param_number=*/0,
-                                 /*param_index=*/{}));
+  TF_ASSERT_OK(config.SetUpAlias(
+      /*output_index=*/{1}, /*param_number=*/0,
+      /*param_index=*/{},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
 
   ASSERT_IS_NOT_OK(config.Verify(*module, [](const Shape& shape) {
     return ShapeUtil::ByteSizeOf(shape);
@@ -176,8 +187,10 @@ ENTRY main {
   HloInputOutputAliasConfig config(
       module->entry_computation()->root_instruction()->shape());
 
-  TF_ASSERT_OK(config.SetUpAlias(/*output_index=*/{1}, /*param_number=*/0,
-                                 /*param_index=*/{}));
+  TF_ASSERT_OK(config.SetUpAlias(
+      /*output_index=*/{1}, /*param_number=*/0,
+      /*param_index=*/{},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
 
   ASSERT_IS_NOT_OK(config.Verify(*module, [](const Shape& shape) {
     return ShapeUtil::ByteSizeOf(shape);
@@ -200,11 +213,15 @@ ENTRY main {
   HloInputOutputAliasConfig config(
       module->entry_computation()->root_instruction()->shape());
 
-  TF_ASSERT_OK(config.SetUpAlias(/*output_index=*/{0}, /*param_number=*/0,
-                                 /*param_index=*/{}));
+  TF_ASSERT_OK(config.SetUpAlias(
+      /*output_index=*/{0}, /*param_number=*/0,
+      /*param_index=*/{},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
 
-  ASSERT_IS_NOT_OK(config.SetUpAlias(/*output_index=*/{0}, /*param_number=*/1,
-                                     /*param_index=*/{}));
+  ASSERT_IS_NOT_OK(config.SetUpAlias(
+      /*output_index=*/{0}, /*param_number=*/1,
+      /*param_index=*/{},
+      /*kind=*/HloInputOutputAliasConfig::AliasKind::kUserAlias));
 }
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index 21b1dbc1676cccd2fe5b331a1f9d6ff5e3a73fcd..6c47bb8935a471743829ae3539c806d0465362c6 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "absl/strings/numbers.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
+#include "absl/types/span.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/protobuf_util.h"
@@ -82,86 +83,70 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
     return computation_map.at(proto.called_computation_ids(index));
   };
 
-  TF_RET_CHECK(std::all_of(
-      proto.operand_ids().begin(), proto.operand_ids().end(),
-      [&instruction_map](int64 id) { return instruction_map.contains(id); }))
+  TF_RET_CHECK(
+      absl::c_all_of(proto.operand_ids(),
+                     [&](int64 id) { return instruction_map.contains(id); }))
       << proto.name() << " instruction contains invalid operand id(s)";
 
-  TF_RET_CHECK(std::all_of(
-      proto.called_computation_ids().begin(),
-      proto.called_computation_ids().end(),
-      [&computation_map](int64 id) { return computation_map.contains(id); }))
+  TF_RET_CHECK(
+      absl::c_all_of(proto.called_computation_ids(),
+                     [&](int64 id) { return computation_map.contains(id); }))
       << proto.name() << " instruction references invalid computation id(s)";
 
   Shape shape(proto.shape());
   TF_RETURN_IF_ERROR(ShapeUtil::ValidateShapeWithOptionalLayout(shape));
 
+  absl::optional<int> arity = HloOpcodeArity(opcode);
+  if (arity) {
+    TF_RET_CHECK(proto.operand_ids_size() == *arity)
+        << proto.opcode() << " instruction should have " << *arity
+        << " operands but sees " << proto.operand_ids_size();
+  }
+
   switch (opcode) {
     // Ops migrated to subclasses.
     case HloOpcode::kBatchNormTraining:
-      TF_RET_CHECK(proto.operand_ids_size() == 3)
-          << "BatchNormTraining instruction should have 3 operands but sees "
-          << proto.operand_ids_size();
       instruction =
           CreateBatchNormTraining(shape, operands(0), operands(1), operands(2),
                                   proto.epsilon(), proto.feature_index());
       break;
     case HloOpcode::kBatchNormInference:
-      TF_RET_CHECK(proto.operand_ids_size() == 5)
-          << "BatchNormInference instruction should have 5 operands but sees "
-          << proto.operand_ids_size();
       instruction = CreateBatchNormInference(
           shape, operands(0), operands(1), operands(2), operands(3),
           operands(4), proto.epsilon(), proto.feature_index());
       break;
     case HloOpcode::kBatchNormGrad:
-      TF_RET_CHECK(proto.operand_ids_size() == 5)
-          << "BatchNormGrad instruction should have 5 operands but sees "
-          << proto.operand_ids_size();
       instruction = CreateBatchNormGrad(shape, operands(0), operands(1),
                                         operands(2), operands(3), operands(4),
                                         proto.epsilon(), proto.feature_index());
       break;
     case HloOpcode::kFft: {
-      TF_RET_CHECK(proto.operand_ids_size() == 1)
-          << "Fft instruction should have 1 operand but sees "
-          << proto.operand_ids_size();
       std::vector<int64> fft_length(proto.fft_length().begin(),
                                     proto.fft_length().end());
       instruction = CreateFft(shape, operands(0), proto.fft_type(),
                               absl::Span<const int64>(fft_length));
       break;
     }
+    case HloOpcode::kTriangularSolve: {
+      instruction = CreateTriangularSolve(shape, operands(0), operands(1),
+                                          proto.triangular_solve_options());
+      break;
+    }
     case HloOpcode::kSend:
-      TF_RET_CHECK(proto.operand_ids_size() == 2)
-          << "Send instruction should have 2 operand but sees "
-          << proto.operand_ids_size();
       instruction = CreateSend(operands(0), operands(1), proto.channel_id(),
                                proto.is_host_transfer());
       break;
     case HloOpcode::kSendDone:
-      TF_RET_CHECK(proto.operand_ids_size() == 1)
-          << "SendDone instruction should have 1 operand but sees "
-          << proto.operand_ids_size();
       instruction = CreateSendDone(operands(0), proto.is_host_transfer());
       break;
     case HloOpcode::kRecv:
-      TF_RET_CHECK(proto.operand_ids_size() == 1)
-          << "Recv instruction should have 1 operand but sees "
-          << proto.operand_ids_size();
       instruction = CreateRecv(shape.tuple_shapes(0), operands(0),
                                proto.channel_id(), proto.is_host_transfer());
       break;
     case HloOpcode::kRecvDone:
-      TF_RET_CHECK(proto.operand_ids_size() == 1)
-          << "RecvDone instruction should have 1 operand but sees "
-          << proto.operand_ids_size();
       instruction = CreateRecvDone(operands(0), proto.is_host_transfer());
       break;
     case HloOpcode::kReverse:
-      TF_RET_CHECK(proto.operand_ids_size() == 1)
-          << "Reverse instruction should have 1 operand but sees "
-          << proto.operand_ids_size();
       instruction = CreateReverse(shape, operands(0),
                                   std::vector<int64>(proto.dimensions().begin(),
                                                      proto.dimensions().end()));
@@ -201,26 +186,21 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
           << proto.operand_ids_size();
       TF_RET_CHECK(proto.dimensions().size() == 1)
           << "Sort instruction should have 1 dimension";
+      TF_RET_CHECK(proto.called_computation_ids_size() == 1)
+          << "Sort instruction should one called computation but sees "
+          << proto.called_computation_ids_size();
       auto sort_operands = all_operands();
-      HloInstruction* keys = sort_operands[0];
-      instruction = CreateSort(
-          shape, proto.dimensions(0), keys,
-          absl::Span<HloInstruction* const>(sort_operands).subspan(1));
+      instruction = CreateSort(shape, proto.dimensions(0), all_operands(),
+                               computations(0), proto.is_stable());
       break;
     }
     case HloOpcode::kTranspose:
-      TF_RET_CHECK(proto.operand_ids_size() == 1)
-          << "Transpose instruction should have 1 operand but sees "
-          << proto.operand_ids_size();
       instruction =
           CreateTranspose(shape, operands(0),
                           std::vector<int64>(proto.dimensions().begin(),
                                              proto.dimensions().end()));
       break;
     case HloOpcode::kBroadcast:
-      TF_RET_CHECK(proto.operand_ids_size() == 1)
-          << "Broadcast instruction should have 1 operand but sees "
-          << proto.operand_ids_size();
       instruction =
           CreateBroadcast(shape, operands(0),
                           std::vector<int64>(proto.dimensions().begin(),
@@ -233,9 +213,6 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       instruction = CreateMap(shape, all_operands(), computations(0));
       break;
     case HloOpcode::kSlice: {
-      TF_RET_CHECK(proto.operand_ids_size() == 1)
-          << "Slice instruction should have 1 operand but sees "
-          << proto.operand_ids_size();
       std::vector<int64> slice_starts, slice_limits, slice_strides;
       for (const HloInstructionProto::SliceDimensions& slice_dimensions :
            proto.slice_dimensions()) {
@@ -259,9 +236,6 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       break;
     }
     case HloOpcode::kTrace: {
-      TF_RET_CHECK(proto.operand_ids_size() == 1)
-          << "Trace instruction should have 1 operand but sees "
-          << proto.operand_ids_size();
       TF_RET_CHECK(proto.has_literal());
       TF_ASSIGN_OR_RETURN(auto literal,
                           Literal::CreateFromProto(proto.literal()));
@@ -295,37 +269,29 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
     case HloOpcode::kParameter:
       instruction =
           CreateParameter(proto.parameter_number(), shape, proto.name());
+      if (!proto.parameter_replication().replicated_at_leaf_buffers().empty()) {
+        instruction->set_parameter_replicated_at_leaf_buffers(
+            proto.parameter_replication().replicated_at_leaf_buffers());
+      }
       break;
     case HloOpcode::kGetTupleElement:
-      TF_RET_CHECK(proto.operand_ids_size() == 1)
-          << "GetTupleElement instruction should have 1 operand but sees "
-          << proto.operand_ids_size();
       instruction =
           CreateGetTupleElement(shape, operands(0), proto.tuple_index());
       break;
     case HloOpcode::kReducePrecision:
-      TF_RET_CHECK(proto.operand_ids_size() == 1)
-          << "ReducePrecision instruction should have 1 operand but sees "
-          << proto.operand_ids_size();
       instruction = CreateReducePrecision(
           shape, operands(0), proto.exponent_bits(), proto.mantissa_bits());
       break;
     case HloOpcode::kInfeed: {
-      TF_RET_CHECK(ShapeUtil::IsTuple(shape) &&
+      TF_RET_CHECK(shape.IsTuple() &&
                    (ShapeUtil::TupleElementCount(shape) == 2))
           << "Infeed should have a tuple shape with 2 operands, but has: "
           << shape;
       const Shape& data_shape = ShapeUtil::GetTupleElementShape(shape, 0);
-      TF_RET_CHECK(proto.operand_ids_size() == 1)
-          << "Infeed instruction should have 1 operand but sees "
-          << proto.operand_ids_size();
       instruction =
           CreateInfeed(data_shape, operands(0), proto.infeed_config());
     } break;
     case HloOpcode::kOutfeed: {
-      TF_RET_CHECK(proto.operand_ids_size() == 2)
-          << "Outfeed instruction should have 2 operands but sees "
-          << proto.operand_ids_size();
       Shape outfeed_shape(proto.outfeed_shape());
       TF_RETURN_IF_ERROR(
           ShapeUtil::ValidateShapeWithOptionalLayout(outfeed_shape));
@@ -333,20 +299,20 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
                                   proto.outfeed_config());
       break;
     }
-    case HloOpcode::kCrossReplicaSum: {
+    case HloOpcode::kAllReduce: {
       TF_RET_CHECK(proto.called_computation_ids_size() == 1)
-          << "CrossReplicaSum should have 1 called computation but sees "
+          << "AllReduce should have 1 called computation but sees "
           << proto.called_computation_ids_size();
       absl::optional<int64> all_reduce_id;
       if (proto.all_reduce_id() > 0) {
         all_reduce_id = proto.all_reduce_id();
       }
-      instruction = CreateCrossReplicaSum(
+      instruction = CreateAllReduce(
           shape, all_operands(), computations(0),
           /*replica_groups=*/
           std::vector<ReplicaGroup>(proto.replica_groups().begin(),
                                     proto.replica_groups().end()),
-          /*barrier=*/proto.cross_replica_sum_barrier(),
+          /*barrier=*/proto.all_reduce_barrier(),
           /*all_reduce_id=*/all_reduce_id);
       break;
     }
@@ -359,9 +325,6 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       break;
     }
     case HloOpcode::kCollectivePermute: {
-      TF_RET_CHECK(proto.operand_ids_size() == 1)
-          << "CollectivePermute instruction should have 1 operand but sees "
-          << proto.operand_ids_size();
       std::vector<std::pair<int64, int64>> source_target_pairs(
           proto.source_target_pairs_size());
       for (int i = 0; i < source_target_pairs.size(); i++) {
@@ -372,10 +335,11 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
           CreateCollectivePermute(shape, operands(0), source_target_pairs);
       break;
     }
+    case HloOpcode::kReplicaId: {
+      instruction = CreateReplicaId();
+      break;
+    }
     case HloOpcode::kConvolution: {
-      TF_RET_CHECK(proto.operand_ids_size() == 2)
-          << "Convolution instruction should have 2 operands but sees "
-          << proto.operand_ids_size();
       TF_RET_CHECK(proto.has_window());
       TF_RET_CHECK(proto.has_convolution_dimension_numbers());
       PrecisionConfig precision_config = proto.precision_config();
@@ -383,14 +347,12 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
           proto.operand_ids_size(), PrecisionConfig::DEFAULT);
       instruction = CreateConvolve(
           shape, operands(0), operands(1),
-          std::max<int64>(proto.feature_group_count(), 1), proto.window(),
+          std::max<int64>(proto.feature_group_count(), 1),
+          std::max<int64>(proto.batch_group_count(), 1), proto.window(),
           proto.convolution_dimension_numbers(), precision_config);
       break;
     }
     case HloOpcode::kReduceWindow:
-      TF_RET_CHECK(proto.operand_ids_size() == 2)
-          << "ReduceWindow instruction should have 2 operands but sees "
-          << proto.operand_ids_size();
       TF_RET_CHECK(proto.called_computation_ids_size() == 1)
           << "ReduceWindow should have 1 called computation but sees "
           << proto.called_computation_ids_size();
@@ -398,9 +360,6 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
                                        proto.window(), computations(0));
       break;
     case HloOpcode::kSelectAndScatter:
-      TF_RET_CHECK(proto.operand_ids_size() == 3)
-          << "SelectAndScatter instruction should have 3 operands but sees "
-          << proto.operand_ids_size();
       TF_RET_CHECK(proto.called_computation_ids_size() == 2)
           << "SelectAndScatter should have 2 called computations but sees "
           << proto.called_computation_ids_size();
@@ -438,29 +397,56 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       static_cast<HloCustomCallInstruction*>(instruction.get())
           ->set_feature_group_count(
               std::max(static_cast<int64>(proto.feature_group_count()), 1LL));
+      static_cast<HloCustomCallInstruction*>(instruction.get())
+          ->set_batch_group_count(
+              std::max(static_cast<int64>(proto.batch_group_count()), 1LL));
       break;
     case HloOpcode::kPad:
-      TF_RET_CHECK(proto.operand_ids_size() == 2)
-          << "Pad instruction should have 2 operands but sees "
-          << proto.operand_ids_size();
       TF_RET_CHECK(proto.has_padding_config());
       instruction =
           CreatePad(shape, operands(0), operands(1), proto.padding_config());
       break;
     case HloOpcode::kDynamicSlice: {
-      TF_RET_CHECK(proto.operand_ids_size() == 2)
-          << "DynamicSlice instruction should have 2 operands but sees "
-          << proto.operand_ids_size();
       std::vector<int64> slice_sizes(proto.dynamic_slice_sizes_size());
       absl::c_copy(proto.dynamic_slice_sizes(), slice_sizes.begin());
+      TF_RET_CHECK(proto.operand_ids_size() >= 1)
+          << "DynamicSlice instruction should have at least 1 operands but "
+             "sees "
+          << proto.operand_ids_size();
+      // TODO(b/118437727): Old form, make the check unconditional.
+      if (proto.operand_ids_size() != 2 || operands(1)->shape().rank() != 1) {
+        auto expected_operands = 1 + operands(0)->shape().rank();
+        TF_RET_CHECK(proto.operand_ids_size() == expected_operands)
+            << "DynamicSlice instruction should have " << expected_operands
+            << " operands, but has " << proto.operand_ids_size();
+      }
+      const auto& operand_vector = all_operands();
+      instruction = CreateDynamicSlice(
+          shape, operands(0), absl::MakeSpan(operand_vector).subspan(1),
+          slice_sizes);
+      break;
+    }
+    case HloOpcode::kDynamicUpdateSlice: {
+      TF_RET_CHECK(proto.operand_ids_size() >= 2)
+          << "DynamicUpdateSlice instruction should have at least 2 operands "
+             "but sees "
+          << proto.operand_ids_size();
+      // TODO(b/118437727): Old form, make the check unconditional.
+      if (proto.operand_ids_size() != 3 || operands(2)->shape().rank() != 1) {
+        auto expected_operands = 2 + operands(0)->shape().rank();
+        TF_RET_CHECK(proto.operand_ids_size() == expected_operands)
+            << "DynamicUpdateSlice instruction should have "
+            << expected_operands << " operands, but has "
+            << proto.operand_ids_size();
+      }
+      const auto& operand_vector = all_operands();
       instruction =
-          CreateDynamicSlice(shape, operands(0), operands(1), slice_sizes);
+          CreateDynamicUpdateSlice(shape, operands(0), operands(1),
+                                   absl::MakeSpan(operand_vector).subspan(2));
+
       break;
     }
     case HloOpcode::kGather: {
-      TF_RET_CHECK(proto.operand_ids_size() == 2)
-          << "Gather instruction should have 2 operands but sees "
-          << proto.operand_ids_size();
       TF_RET_CHECK(proto.has_gather_dimension_numbers())
           << "Gather instruction should have GatherDimensionNumbers set.";
       std::unique_ptr<GatherDimensionNumbers> gather_dimension_numbers =
@@ -475,9 +461,6 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       break;
     }
     case HloOpcode::kScatter: {
-      TF_RET_CHECK(proto.operand_ids_size() == 3)
-          << "Scatter instruction should have 3 operands but sees "
-          << proto.operand_ids_size();
       TF_RET_CHECK(proto.has_scatter_dimension_numbers())
           << "Scatter instruction should have ScatterDimensionNumbers set.";
       TF_RET_CHECK(proto.called_computation_ids_size() == 1)
@@ -499,9 +482,6 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
     case HloOpcode::kDot: {
       TF_RET_CHECK(proto.has_dot_dimension_numbers())
           << "Dot instruction should have dot_dimension_numbers.";
-      TF_RET_CHECK(proto.operand_ids_size() == 2)
-          << "Dot instruction should have 2 operands but sees "
-          << proto.operand_ids_size();
       PrecisionConfig precision_config = proto.precision_config();
       precision_config.mutable_operand_precision()->Resize(
           proto.operand_ids_size(), PrecisionConfig::DEFAULT);
@@ -511,9 +491,6 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       break;
     }
     case HloOpcode::kDomain: {
-      TF_RET_CHECK(proto.operand_ids_size() == 1)
-          << "Domain instruction should have 1 operands but sees "
-          << proto.operand_ids_size();
       std::shared_ptr<const HloSharding> entry_hlo_sharding;
       std::shared_ptr<const HloSharding> exit_hlo_sharding;
       if (proto.has_domain_entry_sharding()) {
@@ -535,7 +512,6 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       break;
     }
     case HloOpcode::kGetDimensionSize:
-      TF_RET_CHECK(proto.operand_ids_size() == 1);
       TF_RET_CHECK(proto.dimensions_size() == 1);
       instruction =
           CreateGetDimensionSize(shape, operands(0), proto.dimensions(0));
@@ -569,6 +545,11 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
   instruction->SetAndSanitizeName(proto.name());
   instruction->metadata_ = proto.metadata();
   instruction->backend_config_ = proto.backend_config();
+
+  TF_RET_CHECK(proto.id() >= 0)
+      << "Instruction with negative id: " << proto.id();
+  TF_RET_CHECK(proto.id() <= INT_MAX)
+      << "Instruction with id > INT_MAX: " << proto.id();
   instruction->unique_id_ = proto.id();
 
   if (proto.has_sharding()) {
@@ -619,7 +600,7 @@ HloInstruction::CreateGetTupleElement(const Shape& shape,
     absl::Span<HloInstruction* const> operands) {
   if (opcode == HloOpcode::kCopy) {
     // It is impossible to copy an opaque shape, we don't know how big it is.
-    CHECK(!ShapeUtil::IsOpaque(shape));
+    CHECK(!shape.IsOpaque());
   }
   auto instruction = absl::WrapUnique(new HloInstruction(opcode, shape));
   for (auto operand : operands) {
@@ -650,8 +631,10 @@ HloInstruction::CreateGetTupleElement(const Shape& shape,
     case HloOpcode::kNot:
     case HloOpcode::kNegate:
     case HloOpcode::kReal:
+    case HloOpcode::kRsqrt:
     case HloOpcode::kSign:
     case HloOpcode::kSin:
+    case HloOpcode::kSqrt:
     case HloOpcode::kTanh:
       break;
     default:
@@ -729,12 +712,12 @@ HloInstruction::CreateGetTupleElement(const Shape& shape,
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateConvolve(
     const Shape& shape, HloInstruction* lhs, HloInstruction* rhs,
-    int64 feature_group_count, const Window& window,
+    int64 feature_group_count, int64 batch_group_count, const Window& window,
     const ConvolutionDimensionNumbers& dimension_numbers,
     const PrecisionConfig& precision_config) {
   return absl::make_unique<HloConvolutionInstruction>(
-      shape, lhs, rhs, feature_group_count, window, dimension_numbers,
-      precision_config);
+      shape, lhs, rhs, feature_group_count, batch_group_count, window,
+      dimension_numbers, precision_config);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateFft(
@@ -744,6 +727,13 @@ HloInstruction::CreateGetTupleElement(const Shape& shape,
                                               fft_length);
 }
 
+/* static */ std::unique_ptr<HloInstruction>
+HloInstruction::CreateTriangularSolve(const Shape& shape, HloInstruction* a,
+                                      HloInstruction* b,
+                                      const TriangularSolveOptions& options) {
+  return absl::make_unique<HloTriangularSolveInstruction>(shape, a, b, options);
+}
+
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateDot(
     const Shape& shape, HloInstruction* lhs, HloInstruction* rhs,
     const DotDimensionNumbers& dimension_numbers,
@@ -761,8 +751,7 @@ HloInstruction::CreateReducePrecision(const Shape& shape,
       shape, operand, exponent_bits, mantissa_bits);
 }
 
-/* static */ std::unique_ptr<HloInstruction>
-HloInstruction::CreateCrossReplicaSum(
+/* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateAllReduce(
     const Shape& shape, absl::Span<HloInstruction* const> operands,
     HloComputation* reduce_computation,
     const std::vector<ReplicaGroup>& replica_groups, absl::string_view barrier,
@@ -787,6 +776,11 @@ HloInstruction::CreateCollectivePermute(
       shape, operand, source_target_pairs);
 }
 
+/* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateReplicaId() {
+  return absl::WrapUnique(
+      new HloInstruction(HloOpcode::kReplicaId, ShapeUtil::MakeShape(U32, {})));
+}
+
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateInfeed(
     const Shape& infeed_shape, HloInstruction* token_operand,
     const string& config) {
@@ -903,23 +897,19 @@ HloInstruction::CreateAddDependency(HloInstruction* data_operand,
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateDynamicSlice(
-    const Shape& shape, HloInstruction* operand, HloInstruction* start_indices,
+    const Shape& shape, HloInstruction* operand,
+    absl::Span<HloInstruction* const> start_indices,
     absl::Span<const int64> slice_sizes) {
   return absl::make_unique<HloDynamicSliceInstruction>(
       shape, operand, start_indices, slice_sizes);
 }
 
 /* static */ std::unique_ptr<HloInstruction>
-HloInstruction::CreateDynamicUpdateSlice(const Shape& shape,
-                                         HloInstruction* operand,
-                                         HloInstruction* update,
-                                         HloInstruction* start_indices) {
-  auto instruction = absl::WrapUnique(
-      new HloInstruction(HloOpcode::kDynamicUpdateSlice, shape));
-  instruction->AppendOperand(operand);
-  instruction->AppendOperand(update);
-  instruction->AppendOperand(start_indices);
-  return instruction;
+HloInstruction::CreateDynamicUpdateSlice(
+    const Shape& shape, HloInstruction* operand, HloInstruction* update,
+    absl::Span<HloInstruction* const> start_indices) {
+  return absl::make_unique<HloDynamicUpdateSliceInstruction>(
+      shape, operand, update, start_indices);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateConcatenate(
@@ -1035,7 +1025,7 @@ HloInstruction::CreateBroadcastSequence(
     const std::function<HloInstruction*(std::unique_ptr<HloInstruction>)>&
         adder) {
   CHECK(ShapeUtil::IsScalar(operand->shape()) ||
-        ShapeUtil::Rank(operand->shape()) == ShapeUtil::Rank(output_shape));
+        operand->shape().rank() == output_shape.rank());
   Shape broadcast_shape = ShapeUtil::ChangeElementType(
       output_shape, operand->shape().element_type());
   // Do explicit broadcast for scalar.
@@ -1051,7 +1041,7 @@ HloInstruction::CreateBroadcastSequence(
   // Do explicit broadcast for degenerate broadcast.
   std::vector<int64> broadcast_dimensions;
   std::vector<int64> reshaped_dimensions;
-  for (int i = 0; i < ShapeUtil::Rank(operand->shape()); i++) {
+  for (int i = 0; i < operand->shape().rank(); i++) {
     if (operand->shape().dimensions(i) == output_shape.dimensions(i)) {
       broadcast_dimensions.push_back(i);
       reshaped_dimensions.push_back(operand->shape().dimensions(i));
@@ -1107,9 +1097,11 @@ HloInstruction::CreateBroadcastSequence(
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateSort(
-    const Shape& shape, int64 dimension, HloInstruction* keys,
-    absl::Span<HloInstruction* const> values) {
-  return absl::make_unique<HloSortInstruction>(shape, dimension, keys, values);
+    const Shape& shape, int64 dimension,
+    absl::Span<HloInstruction* const> operands, HloComputation* compare,
+    bool is_stable) {
+  return absl::make_unique<HloSortInstruction>(shape, dimension, operands,
+                                               compare, is_stable);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateFusion(
@@ -1128,7 +1120,7 @@ HloInstruction::CreateBroadcastSequence(
 
 void HloInstruction::set_single_sharding(const HloSharding& sharding) {
   CHECK(!sharding.IsTuple()) << sharding;
-  if (ShapeUtil::IsTuple(shape())) {
+  if (shape().IsTuple()) {
     set_sharding(HloSharding::Tuple(sharding.GetAsShapeTree(shape())));
   } else {
     set_sharding(sharding);
@@ -1160,7 +1152,7 @@ bool HloInstruction::HasSideEffectNoRecurse() const {
     case HloOpcode::kOutfeed:
     case HloOpcode::kTrace:
       return true;
-    case HloOpcode::kCrossReplicaSum:
+    case HloOpcode::kAllReduce:
       return all_reduce_id().has_value();
     default:
       return false;
@@ -1283,7 +1275,7 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
     case HloOpcode::kParameter:
     case HloOpcode::kGetTupleElement:
     case HloOpcode::kReducePrecision:
-    case HloOpcode::kCrossReplicaSum:
+    case HloOpcode::kAllReduce:
     case HloOpcode::kAllToAll:
     case HloOpcode::kCollectivePermute:
     case HloOpcode::kInfeed:
@@ -1301,6 +1293,7 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
     case HloOpcode::kDot:
     case HloOpcode::kDomain:
     case HloOpcode::kGetDimensionSize:
+    case HloOpcode::kTriangularSolve:
       clone = CloneWithNewOperandsImpl(shape, new_operands, context);
       break;
     // Unary ops.
@@ -1321,8 +1314,10 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
     case HloOpcode::kNot:
     case HloOpcode::kNegate:
     case HloOpcode::kReal:
+    case HloOpcode::kRsqrt:
     case HloOpcode::kSign:
     case HloOpcode::kSin:
+    case HloOpcode::kSqrt:
     case HloOpcode::kTanh:
       CHECK_EQ(new_operands.size(), 1);
       clone = CreateUnary(shape, opcode_, new_operands[0]);
@@ -1378,9 +1373,8 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
       clone = CreateReshape(shape, new_operands[0]);
       break;
     case HloOpcode::kDynamicUpdateSlice:
-      CHECK_EQ(new_operands.size(), 3);
       clone = CreateDynamicUpdateSlice(shape, new_operands[0], new_operands[1],
-                                       new_operands[2]);
+                                       new_operands.subspan(2));
       break;
     case HloOpcode::kTuple:
       clone = CreateTuple(new_operands);
@@ -1408,6 +1402,10 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
       CHECK_EQ(new_operands.size(), 2);
       clone = CreateAddDependency(new_operands[0], new_operands[1]);
       break;
+    case HloOpcode::kReplicaId:
+      CHECK_EQ(new_operands.size(), 0);
+      clone = CreateReplicaId();
+      break;
   }
   // SetupDerivedInstruction will setup the precision_config_ field.
   SetupDerivedInstruction(clone.get());
@@ -1542,12 +1540,10 @@ HloInstruction::InstructionVector HloInstruction::unique_operands() const {
 
 Status HloInstruction::AddControlDependencyTo(HloInstruction* instruction) {
   TF_RET_CHECK(instruction->parent() == parent());
-  if (std::find(control_successors_.begin(), control_successors_.end(),
-                instruction) == control_successors_.end()) {
+  if (!absl::c_linear_search(control_successors_, instruction)) {
     control_successors_.push_back(instruction);
-    TF_RET_CHECK(std::find(instruction->control_predecessors_.begin(),
-                           instruction->control_predecessors_.end(),
-                           this) == instruction->control_predecessors_.end());
+    TF_RET_CHECK(
+        !absl::c_linear_search(instruction->control_predecessors_, this));
     instruction->control_predecessors_.push_back(this);
   }
   return Status::OK();
@@ -1679,13 +1675,16 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kReal:
     case HloOpcode::kRemainder:
     case HloOpcode::kReshape:
+    case HloOpcode::kReplicaId:
     case HloOpcode::kRoundNearestAfz:
+    case HloOpcode::kRsqrt:
     case HloOpcode::kSelect:
     case HloOpcode::kShiftLeft:
     case HloOpcode::kShiftRightArithmetic:
     case HloOpcode::kShiftRightLogical:
     case HloOpcode::kSign:
     case HloOpcode::kSin:
+    case HloOpcode::kSqrt:
     case HloOpcode::kSubtract:
     case HloOpcode::kTanh:
     case HloOpcode::kTuple:
@@ -1740,7 +1739,7 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kReducePrecision:
     case HloOpcode::kInfeed:
     case HloOpcode::kOutfeed:
-    case HloOpcode::kCrossReplicaSum:
+    case HloOpcode::kAllReduce:
     case HloOpcode::kAllToAll:
     case HloOpcode::kCollectivePermute:
     case HloOpcode::kConvolution:
@@ -1754,13 +1753,19 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kDot:
     case HloOpcode::kDomain:
     case HloOpcode::kGetDimensionSize:
+    case HloOpcode::kTriangularSolve:
       LOG(FATAL) << "Base class impl called for opcode with subclass: "
                  << opcode();
   }
   return false;
 }
 
-uint64 HloInstruction::Hash() const {
+static uint64 HashOperand(const HloInstruction* hlo) {
+  return ShapeUtil::Hash(hlo->shape());
+}
+
+uint64 HloInstruction::Hash(
+    const std::function<uint64(const HloInstruction*)>& hash_operand) const {
   using tensorflow::Hash64Combine;
 
   uint64 hash_value = Hash64Combine(0, static_cast<uint64>(opcode()));
@@ -1769,7 +1774,7 @@ uint64 HloInstruction::Hash() const {
   if (!IsCrossModuleAllReduce()) {
     if (!operands().empty()) {
       for (size_t i = 0; i < operands().size(); ++i) {
-        hash_value = Hash64Combine(hash_value, operand(i)->Hash());
+        hash_value = Hash64Combine(hash_value, hash_operand(operand(i)));
       }
     }
   }
@@ -1778,6 +1783,11 @@ uint64 HloInstruction::Hash() const {
   return hash_value;
 }
 
+uint64 HloInstruction::Hash() const {
+  // Use HashOperand as an argument to prevent non-termination.
+  return Hash(HashOperand);
+}
+
 uint64 HloInstruction::InnerHash() const { return 13; }
 
 void HloInstruction::RemoveUser(HloInstruction* user) {
@@ -1786,7 +1796,7 @@ void HloInstruction::RemoveUser(HloInstruction* user) {
   user_set_.erase(set_it);
   // This is linear in the number of the users, but a vector provides a stable
   // iteration order and much faster traversal.
-  auto vec_it = std::find(users_.begin(), users_.end(), user);
+  auto vec_it = absl::c_find(users_, user);
   CHECK(vec_it != users_.end());
   users_.erase(vec_it);
 }
@@ -1798,14 +1808,17 @@ Status HloInstruction::ReplaceUseWith(HloInstruction* user,
       << "this shape: " << ShapeUtil::HumanString(shape())
       << ", replacement shape: "
       << ShapeUtil::HumanString(new_producer->shape());
+  return ReplaceUseWithDifferentShape(user, new_producer);
+}
 
+Status HloInstruction::ReplaceUseWithDifferentShape(
+    HloInstruction* user, HloInstruction* new_producer) {
   VLOG(3) << "Replacing uses of " << name() << " in " << user->name()
           << " with " << new_producer->name();
 
   RemoveUser(user);
 
-  TF_RET_CHECK(
-      std::count(user->operands_.begin(), user->operands_.end(), this) >= 0);
+  TF_RET_CHECK(absl::c_count(user->operands_, this) >= 0);
   std::replace(user->operands_.begin(), user->operands_.end(), this,
                new_producer);
   new_producer->AddUser(user);
@@ -1818,6 +1831,16 @@ Status HloInstruction::ReplaceUseWith(HloInstruction* user,
 
 Status HloInstruction::ReplaceOperandWith(int64 operand_num,
                                           HloInstruction* new_operand) {
+  auto old_operand = operand(operand_num);
+  TF_RET_CHECK(ShapeUtil::CompatibleIgnoringFpPrecision(old_operand->shape(),
+                                                        new_operand->shape()))
+      << old_operand->shape() << " is not compatible with "
+      << new_operand->shape();
+  return ReplaceOperandWithDifferentShape(operand_num, new_operand);
+}
+
+Status HloInstruction::ReplaceOperandWithDifferentShape(
+    int64 operand_num, HloInstruction* new_operand) {
   TF_RET_CHECK(operand_num >= 0);
   TF_RET_CHECK(operand_num < operand_count());
   HloInstruction* old_operand = mutable_operand(operand_num);
@@ -1825,17 +1848,12 @@ Status HloInstruction::ReplaceOperandWith(int64 operand_num,
     return Status::OK();
   }
 
-  TF_RET_CHECK(ShapeUtil::CompatibleIgnoringFpPrecision(old_operand->shape(),
-                                                        new_operand->shape()))
-      << old_operand->shape() << " is not compatible with "
-      << new_operand->shape();
   operands_[operand_num] = new_operand;
 
   VLOG(3) << "Replacing operand " << operand_num << " of " << name() << " with "
           << new_operand->name() << ", was " << old_operand->name();
 
-  if (std::find(operands_.begin(), operands_.end(), old_operand) ==
-      operands_.end()) {
+  if (!absl::c_linear_search(operands_, old_operand)) {
     old_operand->RemoveUser(this);
   }
   new_operand->AddUser(this);
@@ -1843,6 +1861,14 @@ Status HloInstruction::ReplaceOperandWith(int64 operand_num,
 }
 
 Status HloInstruction::ReplaceAllUsesWith(HloInstruction* new_producer) {
+  TF_RET_CHECK(
+      ShapeUtil::CompatibleIgnoringFpPrecision(shape(), new_producer->shape()))
+      << shape() << " is not compatible with " << new_producer->shape();
+  return ReplaceAllUsesWithDifferentShape(new_producer);
+}
+
+Status HloInstruction::ReplaceAllUsesWithDifferentShape(
+    HloInstruction* new_producer) {
   bool new_producer_is_user = false;
   for (HloInstruction* user : users()) {
     if (user == new_producer) {
@@ -1867,7 +1893,8 @@ Status HloInstruction::ReplaceAllUsesWith(HloInstruction* new_producer) {
     AddUser(new_producer);
   }
   if (parent_ && parent_->root_instruction() == this) {
-    parent_->set_root_instruction(new_producer);
+    parent_->set_root_instruction(new_producer,
+                                  /*accept_different_shape=*/true);
   }
 
   return Status::OK();
@@ -1879,8 +1906,9 @@ HloComputation* HloInstruction::to_apply() const {
     case HloOpcode::kMap:
     case HloOpcode::kReduceWindow:
     case HloOpcode::kReduce:
-    case HloOpcode::kCrossReplicaSum:
+    case HloOpcode::kAllReduce:
     case HloOpcode::kScatter:
+    case HloOpcode::kSort:
       CHECK_EQ(called_computations_.size(), 1);
       return called_computations_[0];
     default:
@@ -1898,8 +1926,9 @@ void HloInstruction::set_to_apply(HloComputation* computation) {
     case HloOpcode::kMap:
     case HloOpcode::kReduceWindow:
     case HloOpcode::kReduce:
-    case HloOpcode::kCrossReplicaSum:
+    case HloOpcode::kAllReduce:
     case HloOpcode::kScatter:
+    case HloOpcode::kSort:
       CHECK_EQ(called_computations_.size(), 1);
       called_computations_[0] = computation;
       break;
@@ -2010,8 +2039,10 @@ bool HloInstruction::IsElementwiseImpl(
     case HloOpcode::kNegate:
     case HloOpcode::kReal:
     case HloOpcode::kReducePrecision:
+    case HloOpcode::kRsqrt:
     case HloOpcode::kSign:
     case HloOpcode::kSin:
+    case HloOpcode::kSqrt:
     case HloOpcode::kTanh:
       CHECK_EQ(1, operand_count());
       return true;
@@ -2056,7 +2087,11 @@ bool HloInstruction::IsElementwiseImpl(
 }
 
 bool HloInstruction::IsCrossModuleAllReduce() const {
-  return opcode() == HloOpcode::kCrossReplicaSum && all_reduce_id();
+  return opcode() == HloOpcode::kAllReduce && all_reduce_id();
+}
+
+bool HloInstruction::IsCrossReplicaAllReduce() const {
+  return opcode() == HloOpcode::kAllReduce && !all_reduce_id();
 }
 
 string HloInstruction::ToStringWithCanonicalNameMap(
@@ -2167,8 +2202,9 @@ std::vector<string> HloInstruction::ExtraAttributesToString(
     } else if (opcode() == HloOpcode::kCall || opcode() == HloOpcode::kMap ||
                opcode() == HloOpcode::kReduceWindow ||
                opcode() == HloOpcode::kReduce ||
-               opcode() == HloOpcode::kCrossReplicaSum ||
-               opcode() == HloOpcode::kScatter) {
+               opcode() == HloOpcode::kAllReduce ||
+               opcode() == HloOpcode::kScatter ||
+               opcode() == HloOpcode::kSort) {
       extra.push_back(
           StrCat("to_apply=", PrintName(to_apply()->name(), options)));
     } else if (!called_computations().empty()) {
@@ -2203,8 +2239,9 @@ std::vector<string> HloInstruction::ExtraAttributesToString(
       case HloOpcode::kMap:
       case HloOpcode::kReduceWindow:
       case HloOpcode::kReduce:
-      case HloOpcode::kCrossReplicaSum:
+      case HloOpcode::kAllReduce:
       case HloOpcode::kScatter:
+      case HloOpcode::kSort:
         extra.push_back(
             StrCat("to_apply=\n", to_apply()->ToString(new_options)));
         break;
@@ -2400,12 +2437,14 @@ Status HloInstruction::Visit(DfsHloVisitorBase<HloInstructionPtr>* visitor) {
       return visitor->HandleConvolution(this);
     case HloOpcode::kFft:
       return visitor->HandleFft(this);
-    case HloOpcode::kCrossReplicaSum:
-      return visitor->HandleCrossReplicaSum(this);
+    case HloOpcode::kAllReduce:
+      return visitor->HandleAllReduce(this);
     case HloOpcode::kAllToAll:
       return visitor->HandleAllToAll(this);
     case HloOpcode::kCollectivePermute:
       return visitor->HandleCollectivePermute(this);
+    case HloOpcode::kReplicaId:
+      return visitor->HandleReplicaId(this);
     case HloOpcode::kTuple:
       return visitor->HandleTuple(this);
     case HloOpcode::kMap:
@@ -2440,6 +2479,10 @@ Status HloInstruction::Visit(DfsHloVisitorBase<HloInstructionPtr>* visitor) {
       return visitor->HandleCos(this);
     case HloOpcode::kSin:
       return visitor->HandleSin(this);
+    case HloOpcode::kSqrt:
+      return visitor->HandleSqrt(this);
+    case HloOpcode::kRsqrt:
+      return visitor->HandleRsqrt(this);
     case HloOpcode::kReal:
       return visitor->HandleReal(this);
     case HloOpcode::kImag:
@@ -2508,6 +2551,8 @@ Status HloInstruction::Visit(DfsHloVisitorBase<HloInstructionPtr>* visitor) {
       return visitor->HandleIota(this);
     case HloOpcode::kGetDimensionSize:
       return visitor->HandleGetDimensionSize(this);
+    case HloOpcode::kTriangularSolve:
+      return visitor->HandleTriangularSolve(this);
 
     // These opcodes are not handled here.
     case HloOpcode::kTrace:
@@ -2806,7 +2851,7 @@ HloInstruction::UseKind HloInstruction::OperandElementUse(int64 i) const {
       }
       return UseKind::kReuse;
     case HloOpcode::kDynamicUpdateSlice:
-      // Dynamic-update-slice reuses only operand 2 (start_indices).
+      // Dynamic-update-slice reuses only start_indices.
       if (i == 0 || i == 1) {
         return UseKind::kUse;
       }
@@ -2859,10 +2904,10 @@ StatusOr<HloInstruction::FusionKind> StringToFusionKind(
 
 string PaddingConfigToString(const PaddingConfig& padding) {
   bool has_interior_padding =
-      std::any_of(padding.dimensions().begin(), padding.dimensions().end(),
-                  [](const PaddingConfig::PaddingConfigDimension& dim) {
-                    return dim.interior_padding() != 0;
-                  });
+      absl::c_any_of(padding.dimensions(),
+                     [](const PaddingConfig::PaddingConfigDimension& dim) {
+                       return dim.interior_padding() != 0;
+                     });
   return StrJoin(
       padding.dimensions(), "x",
       [&](string* out, const PaddingConfig::PaddingConfigDimension& dim) {
@@ -3219,6 +3264,19 @@ int64 HloInstruction::parameter_number() const {
   return Cast<HloParameterInstruction>(this)->parameter_number();
 }
 
+void HloInstruction::set_parameter_replicated_at_leaf_buffers(
+    absl::Span<const bool> parameter_replicated_at_leaf_buffers) {
+  return Cast<HloParameterInstruction>(this)
+      ->set_parameter_replicated_at_leaf_buffers(
+          parameter_replicated_at_leaf_buffers);
+}
+
+const absl::optional<std::vector<bool>>&
+HloInstruction::parameter_replicated_at_leaf_buffers() const {
+  return Cast<HloParameterInstruction>(this)
+      ->parameter_replicated_at_leaf_buffers();
+}
+
 int64 HloInstruction::tuple_index() const {
   return Cast<HloGetTupleElementInstruction>(this)->tuple_index();
 }
@@ -3256,13 +3314,12 @@ HloInstruction::source_target_pairs() const {
   return Cast<HloCollectivePermuteInstruction>(this)->source_target_pairs();
 }
 
-string HloInstruction::cross_replica_sum_barrier() const {
-  return Cast<HloAllReduceInstruction>(this)->cross_replica_sum_barrier();
+string HloInstruction::all_reduce_barrier() const {
+  return Cast<HloAllReduceInstruction>(this)->all_reduce_barrier();
 }
 
-void HloInstruction::set_cross_replica_sum_barrier(const string& barrier) {
-  return Cast<HloAllReduceInstruction>(this)->set_cross_replica_sum_barrier(
-      barrier);
+void HloInstruction::set_all_reduce_barrier(const string& barrier) {
+  return Cast<HloAllReduceInstruction>(this)->set_all_reduce_barrier(barrier);
 }
 
 absl::optional<int64> HloInstruction::all_reduce_id() const {
@@ -3308,6 +3365,18 @@ void HloInstruction::set_feature_group_count(int64 feature_group_count) {
       feature_group_count);
 }
 
+int64 HloInstruction::batch_group_count() const {
+  if (auto convolution = DynCast<HloConvolutionInstruction>(this)) {
+    return convolution->batch_group_count();
+  }
+  return Cast<HloCustomCallInstruction>(this)->batch_group_count();
+}
+
+void HloInstruction::set_batch_group_count(int64 batch_group_count) {
+  Cast<HloCustomCallInstruction>(this)->set_batch_group_count(
+      batch_group_count);
+}
+
 HloComputation* HloInstruction::select() const {
   return Cast<HloSelectAndScatterInstruction>(this)->select();
 }
@@ -3364,4 +3433,8 @@ const DomainMetadata& HloInstruction::operand_side_metadata() const {
 const DomainMetadata& HloInstruction::user_side_metadata() const {
   return Cast<HloDomainInstruction>(this)->user_side_metadata();
 }
+
+const TriangularSolveOptions& HloInstruction::triangular_solve_options() const {
+  return Cast<HloTriangularSolveInstruction>(this)->triangular_solve_options();
+}
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index a54716217d6bbc5c0601f5d9ff7bf4072a6b30f5..33cbb9a41bab838e02813e75e2ca6327f785b007 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -47,6 +47,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_sharding.h"
 #include "tensorflow/compiler/xla/service/name_uniquer.h"
+#include "tensorflow/compiler/xla/shape_tree.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -384,6 +385,14 @@ class HloInstruction {
 
   // Creates a random number generation instruction that fills a shape with
   // random numbers from a given distribution.
+  //
+  // The parameters to the instruction are interpreted as follows:
+  //
+  //  - If `distribution` is RNG_UNIFORM, generates a number in range
+  //    [param0, param1).
+  //
+  //  - If `distribution` is RNG_NORMAL, generates a normally-distributed value
+  //    with mean `param0` and standard deviation `param1`.
   static std::unique_ptr<HloInstruction> CreateRng(
       const Shape& shape, RandomDistribution distribution,
       absl::Span<HloInstruction* const> parameters);
@@ -426,7 +435,7 @@ class HloInstruction {
   // and window describes how the filter is applied to lhs.
   static std::unique_ptr<HloInstruction> CreateConvolve(
       const Shape& shape, HloInstruction* lhs, HloInstruction* rhs,
-      int64 feature_group_count, const Window& window,
+      int64 feature_group_count, int64 batch_group_count, const Window& window,
       const ConvolutionDimensionNumbers& dimension_numbers,
       const PrecisionConfig& precision_config);
 
@@ -435,6 +444,10 @@ class HloInstruction {
       const Shape& shape, HloInstruction* operand, FftType fft_type,
       absl::Span<const int64> fft_length);
 
+  static std::unique_ptr<HloInstruction> CreateTriangularSolve(
+      const Shape& shape, HloInstruction* a, HloInstruction* b,
+      const TriangularSolveOptions& options);
+
   // Creates a dot op with operands 'lhs' and 'rhs' with contracting and batch
   // dimensions specified in 'dimension_numbers'.
   static std::unique_ptr<HloInstruction> CreateDot(
@@ -462,9 +475,7 @@ class HloInstruction {
   // `all_reduce_id`: for Allreduce nodes from different modules, if they have
   // the same all_reduce_id, they will be 'Allreduce'd. If empty, Allreduce will
   // not be applied cross modules.
-  //
-  // TODO(b/117564385): Rename this to AllReduce.
-  static std::unique_ptr<HloInstruction> CreateCrossReplicaSum(
+  static std::unique_ptr<HloInstruction> CreateAllReduce(
       const Shape& shape, absl::Span<HloInstruction* const> operands,
       HloComputation* reduce_computation,
       const std::vector<ReplicaGroup>& replica_groups,
@@ -491,11 +502,14 @@ class HloInstruction {
   // Data is sent/received according to the (source_replica_id,
   // target_replica_id) pairs in `source_target_pairs`. If a replica id is not a
   // target_replica_id in any pair, the output on that replica is a tensor
-  // conssits of 0(s) in `shape`.
+  // consists of 0(s) in `shape`.
   static std::unique_ptr<HloInstruction> CreateCollectivePermute(
       const Shape& shape, HloInstruction* operand,
       const std::vector<std::pair<int64, int64>>& source_target_pairs);
 
+  // Creates an instruction that returns a U32 replica ID.
+  static std::unique_ptr<HloInstruction> CreateReplicaId();
+
   // Creates a conversion instruction, where operand is the data to convert and
   // shape is the target shape for the conversion.
   static std::unique_ptr<HloInstruction> CreateConvert(const Shape& shape,
@@ -560,13 +574,14 @@ class HloInstruction {
   // 'slice_sizes'.
   static std::unique_ptr<HloInstruction> CreateDynamicSlice(
       const Shape& shape, HloInstruction* operand,
-      HloInstruction* start_indices, absl::Span<const int64> slice_sizes);
+      absl::Span<HloInstruction* const> start_indices,
+      absl::Span<const int64> slice_sizes);
 
   // Creates a dynamic update slice instruction, which updates a slice
   // of 'operand' with 'update' and 'start_indices'.
   static std::unique_ptr<HloInstruction> CreateDynamicUpdateSlice(
       const Shape& shape, HloInstruction* operand, HloInstruction* update,
-      HloInstruction* start_indices);
+      absl::Span<HloInstruction* const> start_indices);
 
   // Creates a concatenate instruction, where the operands are concatenated on
   // the provided dimension.
@@ -596,7 +611,6 @@ class HloInstruction {
   // f_2 = f(f_1.tuple_element(0), ..., f_1.tuple_element(N), input0.value1,
   // ..., inputN.value1)
   // ...
-  // TODO(b/112040122): Add support to this in HLO passes and in backends.
   static std::unique_ptr<HloInstruction> CreateReduce(
       const Shape& shape, absl::Span<HloInstruction* const> operands,
       absl::Span<HloInstruction* const> init_values,
@@ -669,10 +683,15 @@ class HloInstruction {
       const Shape& shape, HloInstruction* operand,
       absl::Span<const int64> dimensions);
 
-  // Creates a sort op, with a keys operand, and optional values operands.
+  // Creates a n-ary sort op with a 'compare' computation which is used for
+  // comparisons in the sorting algorithm. 'compare' gets 2 * n parameters,
+  // where parameters 2 * i and 2 * i + 1 are the values of the i-th operand at
+  // specific index positions which should be compared, and should return a
+  // PRED. 'is_stable' specifies whether stable sorting is required.
   static std::unique_ptr<HloInstruction> CreateSort(
-      const Shape& shape, int64 dimension, HloInstruction* keys,
-      absl::Span<HloInstruction* const> values = {});
+      const Shape& shape, int64 dimension,
+      absl::Span<HloInstruction* const> operands, HloComputation* compare,
+      bool is_stable);
 
   // Creates a while instruction, given a condition computation, a body
   // computation, and the initial value for the input of the computations. For
@@ -909,6 +928,14 @@ class HloInstruction {
   // information on opcode, shape, operands, and typically a root instruction.
   // This function returns the same hash value for equivalent HLO instructions,
   // with respect to HloInstruction::Identical() method.
+  //
+  // Uses hash_operand function to compute hash values of its operands.
+  // At the very top level, hash_operand should be non-recursive to prevent
+  // non-termination.
+  uint64 Hash(
+      const std::function<uint64(const HloInstruction*)>& hash_operand) const;
+
+  // Calls the above method with non-recursive hash_operand function.
   uint64 Hash() const;
 
   // Returns whether the instruction has a constant operand.
@@ -922,11 +949,20 @@ class HloInstruction {
   // operands of it which could be created due to this replacement.
   Status ReplaceUseWith(HloInstruction* user, HloInstruction* new_producer);
 
-  // Replaces the specified operand with new_operand.
+  // Same as ReplaceUseWith(), but new_producer can have a different shape.
+  Status ReplaceUseWithDifferentShape(HloInstruction* user,
+                                      HloInstruction* new_producer);
+
+  // Replaces the specified operand with new_operand. The old and new operands
+  // must have compatible shapes ignoring floating-point precision.
   //
   // This function does NOT remove duplicated operands even if this instruction
   // is a fusion, so that the existing operand numbers do not change.
-  Status ReplaceOperandWith(int64 operand_no, HloInstruction* new_operand);
+  Status ReplaceOperandWith(int64 operand_num, HloInstruction* new_operand);
+
+  // Same as ReplaceOperandWith(), but new_operand can have a different shape.
+  Status ReplaceOperandWithDifferentShape(int64 operand_num,
+                                          HloInstruction* new_operand);
 
   // Replaces all uses of this instruction with the new producer. If
   // new_producer is a user of this instruction then new_producer remains a use
@@ -935,10 +971,16 @@ class HloInstruction {
   // If this instruction is the root of its computation, sets the computation's
   // root to new_producer.
   //
+  // The new producer must have a compatible shape ignoring floating-point
+  // precision.
+  //
   // If a user is a fusion instruction, this function will remove any duplicated
   // operands of it which could be created due to this replacement.
   Status ReplaceAllUsesWith(HloInstruction* new_producer);
 
+  // Same as ReplaceAllUsesWith, but new_producer can have a different shape.
+  Status ReplaceAllUsesWithDifferentShape(HloInstruction* new_producer);
+
   // Performs a postorder DFS visit using this node as the root. If
   // call_finish_visit is true, then DfsHloVisitor::FinishVisit is called when
   // complete. If ignore_control_predecessors is true, instructions only
@@ -1174,9 +1216,12 @@ class HloInstruction {
   // Returns true if this instruction is elementwise on all its operands.
   bool IsElementwise() const;
 
-  // Returns true if this is an cross module all-reduce instrucion.
+  // Returns true if this is a cross module all-reduce instruction.
   bool IsCrossModuleAllReduce() const;
 
+  // Returns true if this is a cross-replica all-reduce instruction.
+  bool IsCrossReplicaAllReduce() const;
+
   // Returns true if this elementwise instruction implicitly broadcasts operand
   // `operand_idx`.
   //
@@ -1218,6 +1263,10 @@ class HloInstruction {
   // on the instruction's existing name.
   void UniquifyName(NameUniquer* name_uniquer);
 
+  // Clear the unique ID of the instruction so that it can be re-assigned, such
+  // as for the purpose of compacting the instruction unique IDs.
+  void ClearUniqueIdInternal() { unique_id_ = -1; }
+
   // Set the unique id for this instruction to "id"
   void SetUniqueId(int id) {
     CHECK_EQ(unique_id_, -1);  // Should not be assigned already
@@ -1251,6 +1300,9 @@ class HloInstruction {
     backend_config_ = std::move(config_str);
   }
 
+  bool is_default_config() const { return is_default_config_; }
+  void set_default_config() { is_default_config_ = true; }
+
   // Returns a string representation of a proto in the format used by
   // raw_backend_config_string.
   //
@@ -1421,6 +1473,15 @@ class HloInstruction {
   // Delegates to HloParameterInstruction::parameter_number.
   int64 parameter_number() const;
 
+  // Delegates to
+  // HloParameterInstruction::set_parameter_replicated_at_leaf_buffers.
+  void set_parameter_replicated_at_leaf_buffers(
+      absl::Span<const bool> parameter_replicated_at_leaf_buffers);
+
+  // Delegates to HloParameterInstruction::parameter_replicated_at_leaf_buffers.
+  const absl::optional<std::vector<bool>>&
+  parameter_replicated_at_leaf_buffers() const;
+
   // Delegates to HloGetTupleElementInstruction::tuple_index.
   int64 tuple_index() const;
 
@@ -1448,9 +1509,9 @@ class HloInstruction {
   // Delegates to HloCollectivePermuteInstruction::source_target_pairs.
   const std::vector<std::pair<int64, int64>>& source_target_pairs() const;
 
-  // Delegates to HloAllReduceInstruction::cross_replica_sum_barrier.
-  string cross_replica_sum_barrier() const;
-  void set_cross_replica_sum_barrier(const string& barrier);
+  // Delegates to HloAllReduceInstruction::all_reduce_barrier.
+  string all_reduce_barrier() const;
+  void set_all_reduce_barrier(const string& barrier);
 
   // Delegates to HloAllReduceInstruction::all_reduce_id.
   absl::optional<int64> all_reduce_id() const;
@@ -1484,6 +1545,11 @@ class HloInstruction {
 
   void set_feature_group_count(int64 feature_group_count);
 
+  // The number of batch groups. Must be a divisor of the input batch dimension
+  int64 batch_group_count() const;
+
+  void set_batch_group_count(int64 batch_group_count);
+
   // Delegates to HloSelectAndScatterInstruction::select.
   HloComputation* select() const;
 
@@ -1525,6 +1591,9 @@ class HloInstruction {
   // Delegates to HloDomainInstruction::user_side_metadata().
   const DomainMetadata& user_side_metadata() const;
 
+  // Delegates to HloTriangularSolveInstruction::triangular_solve_options().
+  const TriangularSolveOptions& triangular_solve_options() const;
+
   // Old methods kept for smooth subclassing transition END.
 
  protected:
@@ -1691,6 +1760,10 @@ class HloInstruction {
   // HLO. See the documentation on backend_config().
   string backend_config_;
 
+  // This field is assigned to true when backend_config_ is assigned to
+  // a default configuration.
+  bool is_default_config_ = false;
+
   // String identifier for instruction.
   string name_;
 
diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
index 8048e332cb57747286758b75773b29ba154aa888..35f031f29a7aca8db7ebe2fbcfdcebb7a778d703 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/protobuf_util.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
@@ -55,13 +56,13 @@ class OpAndUserCollectingVisitor : public DfsHloVisitorWithDefault {
   }
 
   Status HandleParameter(HloInstruction* parameter) override {
-    EXPECT_EQ(0, count_.count(parameter));
+    EXPECT_FALSE(count_.contains(parameter));
     count_[parameter] = GetCountsForNode(parameter);
     return Status::OK();
   }
 
   Status HandleConstant(HloInstruction* constant) override {
-    EXPECT_EQ(0, count_.count(constant));
+    EXPECT_FALSE(count_.contains(constant));
     count_[constant] = GetCountsForNode(constant);
     return Status::OK();
   }
@@ -69,25 +70,25 @@ class OpAndUserCollectingVisitor : public DfsHloVisitorWithDefault {
   Status HandleAdd(HloInstruction* add) override {
     auto lhs = add->operand(0);
     auto rhs = add->operand(1);
-    EXPECT_EQ(0, count_.count(add));
-    EXPECT_GT(count_.count(lhs), 0);
-    EXPECT_GT(count_.count(rhs), 0);
+    EXPECT_FALSE(count_.contains(add));
+    EXPECT_TRUE(count_.contains(lhs));
+    EXPECT_TRUE(count_.contains(rhs));
     count_[add] = GetCountsForNode(add);
     return Status::OK();
   }
 
   Status HandleNegate(HloInstruction* negate) override {
     auto operand = negate->operand(0);
-    EXPECT_EQ(0, count_.count(negate));
-    EXPECT_GT(count_.count(operand), 0);
+    EXPECT_FALSE(count_.contains(negate));
+    EXPECT_TRUE(count_.contains(operand));
     count_[negate] = GetCountsForNode(negate);
     return Status::OK();
   }
 
   Status HandleMap(HloInstruction* map) override {
-    EXPECT_EQ(0, count_.count(map));
+    EXPECT_FALSE(count_.contains(map));
     for (HloInstruction* arg : map->operands()) {
-      EXPECT_GT(count_.count(arg), 0);
+      EXPECT_TRUE(count_.contains(arg));
     }
     count_[map] = GetCountsForNode(map);
     return Status::OK();
@@ -96,9 +97,9 @@ class OpAndUserCollectingVisitor : public DfsHloVisitorWithDefault {
   Status HandleReduce(HloInstruction* reduce) override {
     auto arg = reduce->operand(0);
     auto init_value = reduce->operand(1);
-    EXPECT_EQ(0, count_.count(reduce));
-    EXPECT_GT(count_.count(arg), 0);
-    EXPECT_GT(count_.count(init_value), 0);
+    EXPECT_FALSE(count_.contains(reduce));
+    EXPECT_TRUE(count_.contains(arg));
+    EXPECT_TRUE(count_.contains(init_value));
     count_[reduce] = GetCountsForNode(reduce);
     return Status::OK();
   }
@@ -128,7 +129,7 @@ class OpAndUserCollectingVisitor : public DfsHloVisitorWithDefault {
   }
 
   // Counters for HLOs. Maps HLO to a NumOpsAndUsers.
-  std::unordered_map<const HloInstruction*, NumOpsAndUsers> count_;
+  absl::flat_hash_map<const HloInstruction*, NumOpsAndUsers> count_;
 };
 
 TEST_F(HloInstructionTest, BasicProperties) {
@@ -137,7 +138,7 @@ TEST_F(HloInstructionTest, BasicProperties) {
   EXPECT_EQ(HloOpcode::kParameter, parameter->opcode());
   EXPECT_TRUE(ShapeUtil::IsScalarWithElementType(parameter->shape(), F32));
   EXPECT_FALSE(ShapeUtil::IsScalarWithElementType(parameter->shape(), S32));
-  EXPECT_EQ(0, parameter->operand_count());
+  EXPECT_FALSE(parameter->operand_count());
 }
 
 TEST_F(HloInstructionTest, UserWithTwoOperands) {
@@ -981,9 +982,9 @@ TEST_F(HloInstructionTest, FunctionVisitor) {
   module->AddEntryComputation(builder.Build());
 
   int visit_num = 0;
-  std::unordered_map<HloInstruction*, int> visit_order;
+  absl::flat_hash_map<HloInstruction*, int> visit_order;
   EXPECT_IS_OK(add->Accept([&visit_num, &visit_order](HloInstruction* inst) {
-    EXPECT_EQ(0, visit_order.count(inst));
+    EXPECT_FALSE(visit_order.contains(inst));
     visit_order[inst] = visit_num;
     visit_num++;
     return Status::OK();
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc
index 1ea02cf9c03866a598bec0e5356f0eb31ad27755..905a6fe08b4430ad862edf0886a57c9f7e9f7977 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.cc
+++ b/tensorflow/compiler/xla/service/hlo_instructions.cc
@@ -42,11 +42,9 @@ using absl::StrJoin;
 bool IsInstructionElementwiseOnOperand(const HloInstruction* instruction,
                                        const HloInstruction* operand) {
   std::vector<int64> operand_indices = instruction->OperandIndices(operand);
-  return std::all_of(
-      operand_indices.begin(), operand_indices.end(),
-      [instruction](int64 operand_index) {
-        return instruction->IsElementwiseOnOperand(operand_index);
-      });
+  return absl::c_all_of(operand_indices, [instruction](int64 operand_index) {
+    return instruction->IsElementwiseOnOperand(operand_index);
+  });
 }
 
 string PrecisionConfigToString(const PrecisionConfig& precision_config) {
@@ -203,6 +201,57 @@ std::unique_ptr<HloInstruction> HloFftInstruction::CloneWithNewOperandsImpl(
                                               fft_length_);
 }
 
+HloTriangularSolveInstruction::HloTriangularSolveInstruction(
+    const Shape& shape, HloInstruction* a, HloInstruction* b,
+    const TriangularSolveOptions& options)
+    : HloInstruction(HloOpcode::kTriangularSolve, shape),
+      triangular_solve_options_(options) {
+  AppendOperand(a);
+  AppendOperand(b);
+}
+
+HloInstructionProto HloTriangularSolveInstruction::ToProto() const {
+  HloInstructionProto proto = HloInstruction::ToProto();
+  *proto.mutable_triangular_solve_options() = triangular_solve_options_;
+  return proto;
+}
+
+std::vector<string> HloTriangularSolveInstruction::ExtraAttributesToStringImpl(
+    const HloPrintOptions& options) const {
+  return {
+      StrCat("left_side=",
+             triangular_solve_options_.left_side() ? "true" : "false"),
+      StrCat("lower=", triangular_solve_options_.lower() ? "true" : "false"),
+      StrCat("unit_diagonal=",
+             triangular_solve_options_.unit_diagonal() ? "true" : "false"),
+      StrCat("transpose_a=", TriangularSolveOptions_Transpose_Name(
+                                 triangular_solve_options_.transpose_a()))};
+}
+
+bool HloTriangularSolveInstruction::IdenticalSlowPath(
+    const HloInstruction& other,
+    const std::function<bool(const HloComputation*, const HloComputation*)>&
+        eq_computations) const {
+  const auto& casted_other =
+      static_cast<const HloTriangularSolveInstruction&>(other);
+  const auto& options = triangular_solve_options();
+  const auto& other_options = casted_other.triangular_solve_options();
+
+  return options.left_side() == other_options.left_side() &&
+         options.lower() == other_options.lower() &&
+         options.unit_diagonal() == other_options.unit_diagonal() &&
+         options.transpose_a() == other_options.transpose_a();
+}
+
+std::unique_ptr<HloInstruction>
+HloTriangularSolveInstruction::CloneWithNewOperandsImpl(
+    const Shape& shape, absl::Span<HloInstruction* const> new_operands,
+    HloCloneContext* context) const {
+  CHECK_EQ(new_operands.size(), 2);
+  return absl::make_unique<HloTriangularSolveInstruction>(
+      shape, new_operands[0], new_operands[1], triangular_solve_options());
+}
+
 HloSendRecvInstruction::HloSendRecvInstruction(HloOpcode opcode,
                                                const Shape& shape,
                                                int64 channel_id,
@@ -363,9 +412,9 @@ HloAllReduceInstruction::HloAllReduceInstruction(
     HloComputation* reduce_computation,
     const std::vector<ReplicaGroup>& replica_groups, absl::string_view barrier,
     const absl::optional<int64>& all_reduce_id)
-    : HloCollectiveInstruction(HloOpcode::kCrossReplicaSum, shape, operands,
+    : HloCollectiveInstruction(HloOpcode::kAllReduce, shape, operands,
                                replica_groups),
-      cross_replica_sum_barrier_(barrier),
+      all_reduce_barrier_(barrier),
       all_reduce_id_(all_reduce_id) {
   AppendComputation(reduce_computation);
 }
@@ -381,16 +430,25 @@ HloInstructionProto HloAllReduceInstruction::ToProto() const {
   if (all_reduce_id_) {
     proto.set_all_reduce_id(*all_reduce_id_);
   }
-  proto.set_cross_replica_sum_barrier(cross_replica_sum_barrier_);
+  proto.set_all_reduce_barrier(all_reduce_barrier_);
   return proto;
 }
 
+bool HloAllReduceInstruction::IsNoop() const {
+  for (auto replica_group : replica_groups()) {
+    if (replica_group.replica_ids().size() != 1) {
+      return false;
+    }
+  }
+  return !all_reduce_id();
+}
+
 std::vector<string> HloAllReduceInstruction::ExtraAttributesToStringImpl(
     const HloPrintOptions& options) const {
   std::vector<string> result =
       HloCollectiveInstruction::ExtraAttributesToStringImpl(options);
-  if (!cross_replica_sum_barrier().empty()) {
-    result.push_back(StrCat("barrier=\"", cross_replica_sum_barrier(), "\""));
+  if (!all_reduce_barrier().empty()) {
+    result.push_back(StrCat("barrier=\"", all_reduce_barrier(), "\""));
   }
   if (all_reduce_id_) {
     result.push_back(StrCat("all_reduce_id=", *all_reduce_id_));
@@ -405,8 +463,7 @@ bool HloAllReduceInstruction::IdenticalSlowPath(
   const auto& casted_other = static_cast<const HloAllReduceInstruction&>(other);
   return HloCollectiveInstruction::IdenticalSlowPath(other, eq_computations) &&
          eq_computations(to_apply(), casted_other.to_apply()) &&
-         cross_replica_sum_barrier() ==
-             casted_other.cross_replica_sum_barrier() &&
+         all_reduce_barrier() == casted_other.all_reduce_barrier() &&
          all_reduce_id() == casted_other.all_reduce_id();
 }
 
@@ -415,8 +472,8 @@ HloAllReduceInstruction::CloneWithNewOperandsImpl(
     const Shape& shape, absl::Span<HloInstruction* const> new_operands,
     HloCloneContext* /*context*/) const {
   return absl::make_unique<HloAllReduceInstruction>(
-      shape, new_operands, to_apply(), replica_groups(),
-      cross_replica_sum_barrier(), all_reduce_id());
+      shape, new_operands, to_apply(), replica_groups(), all_reduce_barrier(),
+      all_reduce_id());
 }
 
 HloAllToAllInstruction::HloAllToAllInstruction(
@@ -603,14 +660,17 @@ std::unique_ptr<HloInstruction> HloReduceInstruction::CloneWithNewOperandsImpl(
                                                  dimensions(), to_apply());
 }
 
-HloSortInstruction::HloSortInstruction(const Shape& shape, int64 dimension,
-                                       HloInstruction* keys,
-                                       absl::Span<HloInstruction* const> values)
-    : HloInstruction(HloOpcode::kSort, shape), dimensions_({dimension}) {
-  AppendOperand(keys);
-  for (auto* value : values) {
+HloSortInstruction::HloSortInstruction(
+    const Shape& shape, int64 dimension,
+    absl::Span<HloInstruction* const> operands, HloComputation* compare,
+    bool is_stable)
+    : HloInstruction(HloOpcode::kSort, shape),
+      dimensions_({dimension}),
+      is_stable_(is_stable) {
+  for (auto* value : operands) {
     AppendOperand(value);
   }
+  AppendComputation(compare);
 }
 
 HloInstructionProto HloSortInstruction::ToProto() const {
@@ -618,12 +678,18 @@ HloInstructionProto HloSortInstruction::ToProto() const {
   for (int64 dimension : dimensions_) {
     proto.add_dimensions(dimension);
   }
+  proto.set_is_stable(is_stable());
   return proto;
 }
 
 std::vector<string> HloSortInstruction::ExtraAttributesToStringImpl(
     const HloPrintOptions& options) const {
-  return {StrCat("dimensions={", StrJoin(dimensions(), ","), "}")};
+  std::vector<string> attrs;
+  attrs.push_back(StrCat("dimensions={", StrJoin(dimensions(), ","), "}"));
+  if (is_stable()) {
+    attrs.push_back("is_stable=true");
+  }
+  return attrs;
 }
 
 bool HloSortInstruction::IdenticalSlowPath(
@@ -631,15 +697,20 @@ bool HloSortInstruction::IdenticalSlowPath(
     const std::function<bool(const HloComputation*, const HloComputation*)>&
         eq_computations) const {
   const auto& casted_other = static_cast<const HloSortInstruction&>(other);
-  return dimensions() == casted_other.dimensions();
+  if (dimensions() != casted_other.dimensions()) {
+    return false;
+  }
+  if (is_stable() != casted_other.is_stable()) {
+    return false;
+  }
+  return eq_computations(to_apply(), other.to_apply());
 }
 
 std::unique_ptr<HloInstruction> HloSortInstruction::CloneWithNewOperandsImpl(
     const Shape& shape, absl::Span<HloInstruction* const> new_operands,
     HloCloneContext* context) const {
-  HloInstruction* keys = new_operands[0];
-  return absl::make_unique<HloSortInstruction>(shape, dimensions(0), keys,
-                                               new_operands.subspan(1));
+  return absl::make_unique<HloSortInstruction>(
+      shape, dimensions(0), new_operands, to_apply(), is_stable());
 }
 
 HloTransposeInstruction::HloTransposeInstruction(
@@ -735,7 +806,7 @@ HloMapInstruction::HloMapInstruction(const Shape& shape,
   AppendComputation(map_computation);
   // TODO(b/65689298) Remove code below once Map is generalized to accept
   // arbitrary map dimensions.
-  dimensions_.resize(ShapeUtil::Rank(shape));
+  dimensions_.resize(shape.rank());
   std::iota(dimensions_.begin(), dimensions_.end(), 0);
 }
 
@@ -815,8 +886,7 @@ std::vector<string> HloSliceInstruction::ExtraAttributesToStringImpl(
   std::vector<string> bounds;
   bounds.reserve(slice_starts_.size());
   const bool omit_stride =
-      std::all_of(slice_strides_.begin(), slice_strides_.end(),
-                  [](int64 stride) { return stride == 1; });
+      absl::c_all_of(slice_strides_, [](int64 stride) { return stride == 1; });
   for (int i = 0; i < slice_starts_.size(); ++i) {
     string stride_str = omit_stride ? "" : StrCat(":", slice_strides_[i]);
     bounds.push_back(
@@ -867,7 +937,7 @@ void HloConstantInstruction::RelayoutConstant(const Layout& new_layout,
                                               const ShapeIndex& shape_index) {
   Shape* mutable_array_subshape =
       ShapeUtil::GetMutableSubshape(mutable_shape(), shape_index);
-  CHECK(ShapeUtil::IsArray(*mutable_array_subshape));
+  CHECK(mutable_array_subshape->IsArray());
 
   // Normally array_subshape will always have a layout, but this invariant is
   // temporarily broken in LayoutAssignment::AssignLayouts.
@@ -901,11 +971,11 @@ string HloConstantInstruction::OperandsToStringWithCanonicalNameMap(
   string operands;
   // For constants, show the actual value in place of an empty operand list.
   if (literal_.has_value() &&
-      ((ShapeUtil::IsArray(shape()) && ShapeUtil::ElementsIn(shape()) <= 10) ||
+      ((shape().IsArray() && ShapeUtil::ElementsIn(shape()) <= 10) ||
        options.print_large_constants())) {
     // Literal::ToString emits multidimensional arrays over multiple
     // lines. Compact this into one line by stripping out white space.
-    string tmp = literal().ToString();
+    string tmp = literal().ToStringWithoutShape();
     std::replace(tmp.begin(), tmp.end(), '\n', ' ');
     std::vector<string> v = absl::StrSplit(tmp, ' ');
     bool first = true;
@@ -1052,8 +1122,7 @@ HloInstruction* HloFusionInstruction::AddFusionOperand(
 
 void HloFusionInstruction::MergeFusionInstruction(
     HloFusionInstruction* instruction_to_merge) {
-  CHECK(std::find(operands().begin(), operands().end(), instruction_to_merge) !=
-        operands().end());
+  CHECK(absl::c_linear_search(operands(), instruction_to_merge));
   // Clone the instruction from which to merge fused instructions.
   std::unique_ptr<HloInstruction> cloned = instruction_to_merge->Clone();
   HloFusionInstruction* cloned_fusion =
@@ -1220,8 +1289,8 @@ HloInstruction* HloFusionInstruction::CloneAndFuseInternal(
     // corresponding fused parameter instruction. Renumber parameters as
     // necessary to make parameter numbers consistent with their index in the
     // fused_parameter_ vector.
-    bool in_operand_list = std::find(operands().begin(), operands().end(),
-                                     instruction_to_fuse) != operands().end();
+    bool in_operand_list =
+        absl::c_linear_search(operands(), instruction_to_fuse);
     CHECK(add_output || in_operand_list);
     if (instruction_to_fuse->opcode() == HloOpcode::kTuple) {
       // We assume all uses of a kTuple operation are GTE ops, not another
@@ -1325,7 +1394,7 @@ HloInstruction* HloFusionInstruction::CloneAndFuseInternal(
     if (newly_created_tuple_instr) {
       HloInstruction* new_instr = parent()->AddInstruction(
           HloInstruction::CreateGetTupleElement(fused_root->shape(), this, 0));
-      TF_CHECK_OK(ReplaceAllUsesWith(new_instr));
+      TF_CHECK_OK(ReplaceAllUsesWithDifferentShape(new_instr));
     }
     int64 index = tuple_elements.size();
     if (instruction_to_fuse->opcode() == HloOpcode::kTuple) {
@@ -1372,8 +1441,14 @@ bool HloFusionInstruction::IdenticalSlowPath(
                          other.fused_instructions_computation());
 }
 
+static uint64 HashOperandRecursive(const HloInstruction* hlo) {
+  return hlo->Hash(HashOperandRecursive);
+}
+
 uint64 HloFusionInstruction::InnerHash() const {
-  return fused_instructions_computation()->Hash();
+  // Use HashOperandRecursive to recursively compute hash on inner operands.
+  return fused_instructions_computation()->root_instruction()->Hash(
+      HashOperandRecursive);
 }
 
 std::unique_ptr<HloInstruction> HloFusionInstruction::CloneWithNewOperandsImpl(
@@ -1463,9 +1538,30 @@ HloParameterInstruction::HloParameterInstruction(int64 parameter_number,
 HloInstructionProto HloParameterInstruction::ToProto() const {
   HloInstructionProto proto = HloInstruction::ToProto();
   proto.set_parameter_number(parameter_number_);
+  if (parameter_replicated_at_leaf_buffers_) {
+    for (bool replicated : *parameter_replicated_at_leaf_buffers_) {
+      proto.mutable_parameter_replication()->add_replicated_at_leaf_buffers(
+          replicated);
+    }
+  }
   return proto;
 }
 
+std::vector<string> HloParameterInstruction::ExtraAttributesToStringImpl(
+    const HloPrintOptions& /*options*/) const {
+  std::vector<string> result;
+  if (!parameter_replicated_at_leaf_buffers_) {
+    return result;
+  }
+  std::vector<string> buffers_replicated_strs;
+  for (bool replicated : *parameter_replicated_at_leaf_buffers_) {
+    buffers_replicated_strs.push_back(replicated ? "true" : "false");
+  }
+  result.push_back(StrCat("parameter_replication={",
+                          StrJoin(buffers_replicated_strs, ","), "}"));
+  return result;
+}
+
 string HloParameterInstruction::OperandsToStringWithCanonicalNameMap(
     const HloPrintOptions& options,
     CanonicalNameMap* canonical_name_map) const {
@@ -1649,11 +1745,12 @@ std::unique_ptr<HloInstruction> HloOutfeedInstruction::CloneWithNewOperandsImpl(
 
 HloConvolutionInstruction::HloConvolutionInstruction(
     const Shape& shape, HloInstruction* lhs, HloInstruction* rhs,
-    int64 feature_group_count, const Window& window,
+    int64 feature_group_count, int64 batch_group_count, const Window& window,
     const ConvolutionDimensionNumbers& dimension_numbers,
     const PrecisionConfig& precision_config)
     : HloInstruction(HloOpcode::kConvolution, shape),
       feature_group_count_(feature_group_count),
+      batch_group_count_(batch_group_count),
       window_(window),
       convolution_dimension_numbers_(dimension_numbers),
       precision_config_(precision_config) {
@@ -1684,6 +1781,7 @@ HloInstructionProto HloConvolutionInstruction::ToProto() const {
   *proto.mutable_convolution_dimension_numbers() =
       convolution_dimension_numbers_;
   proto.set_feature_group_count(feature_group_count_);
+  proto.set_batch_group_count(batch_group_count_);
   *proto.mutable_precision_config() = precision_config_;
   return proto;
 }
@@ -1700,6 +1798,10 @@ std::vector<string> HloConvolutionInstruction::ExtraAttributesToStringImpl(
     extra.push_back(StrCat("feature_group_count=", feature_group_count_));
   }
 
+  if (batch_group_count_ != 1) {
+    extra.push_back(StrCat("batch_group_count=", batch_group_count_));
+  }
+
   string precision_config_string = PrecisionConfigToString(precision_config_);
   if (!precision_config_string.empty()) {
     extra.push_back(precision_config_string);
@@ -1717,6 +1819,9 @@ bool HloConvolutionInstruction::IdenticalSlowPath(
   if (feature_group_count_ != other.feature_group_count()) {
     return false;
   }
+  if (batch_group_count_ != other.batch_group_count()) {
+    return false;
+  }
   return protobuf_util::ProtobufEquals(window(), casted_other.window()) &&
          protobuf_util::ProtobufEquals(
              convolution_dimension_numbers(),
@@ -1731,8 +1836,9 @@ HloConvolutionInstruction::CloneWithNewOperandsImpl(
     HloCloneContext* context) const {
   CHECK_EQ(new_operands.size(), 2);
   return absl::make_unique<HloConvolutionInstruction>(
-      shape, new_operands[0], new_operands[1], feature_group_count_, window(),
-      convolution_dimension_numbers_, precision_config_);
+      shape, new_operands[0], new_operands[1], feature_group_count_,
+      batch_group_count_, window(), convolution_dimension_numbers_,
+      precision_config_);
 }
 
 HloReduceWindowInstruction::HloReduceWindowInstruction(
@@ -1834,6 +1940,7 @@ HloCustomCallInstruction::HloCustomCallInstruction(
       custom_call_target_(custom_call_target.begin(), custom_call_target.end()),
       opaque_(opaque.begin(), opaque.end()),
       feature_group_count_(1),
+      batch_group_count_(1),
       layout_constrained_(false) {
   for (auto operand : operands) {
     AppendOperand(operand);
@@ -1848,6 +1955,7 @@ HloCustomCallInstruction::HloCustomCallInstruction(
       custom_call_target_(custom_call_target.begin(), custom_call_target.end()),
       opaque_(opaque.begin(), opaque.end()),
       feature_group_count_(1),
+      batch_group_count_(1),
       layout_constrained_(true),
       operand_shapes_with_layout_(operand_shapes_with_layout.begin(),
                                   operand_shapes_with_layout.end()) {
@@ -1868,6 +1976,7 @@ HloInstructionProto HloCustomCallInstruction::ToProto() const {
   proto.set_custom_call_target(custom_call_target_);
   proto.set_custom_call_opaque(opaque_);
   proto.set_feature_group_count(feature_group_count_);
+  proto.set_batch_group_count(batch_group_count_);
   if (layout_constrained()) {
     proto.set_constrain_layout(true);
     for (const Shape& shape : operand_shapes_with_layout_) {
@@ -1891,6 +2000,9 @@ std::vector<string> HloCustomCallInstruction::ExtraAttributesToStringImpl(
   if (feature_group_count_ != 1) {
     extra.push_back(StrCat("feature_group_count=", feature_group_count_));
   }
+  if (batch_group_count_ != 1) {
+    extra.push_back(StrCat("batch_group_count=", batch_group_count_));
+  }
   // By contract, we print the custom call target even if
   // options.print_subcomputation_mode() == kOff, because the call target is not
   // an HloComputation.
@@ -1934,6 +2046,20 @@ bool HloCustomCallInstruction::IdenticalSlowPath(
   if (feature_group_count_ != casted_other.feature_group_count_) {
     return false;
   }
+  if (batch_group_count_ != casted_other.batch_group_count_) {
+    return false;
+  }
+  if (layout_constrained() != casted_other.layout_constrained()) {
+    return false;
+  }
+  if (layout_constrained()) {
+    for (int64 i = 0; i < operand_shapes_with_layout_.size(); ++i) {
+      if (!ShapeUtil::Equal(operand_shapes_with_layout_[i],
+                            casted_other.operand_shapes_with_layout_[i])) {
+        return false;
+      }
+    }
+  }
   return custom_call_target_ == casted_other.custom_call_target_ &&
          opaque_ == casted_other.opaque_;
 }
@@ -1944,6 +2070,10 @@ HloCustomCallInstruction::CloneWithNewOperandsImpl(
     HloCloneContext* context) const {
   auto cloned = absl::make_unique<HloCustomCallInstruction>(
       shape, new_operands, custom_call_target(), opaque());
+  if (layout_constrained()) {
+    cloned->layout_constrained_ = true;
+    cloned->operand_shapes_with_layout_ = operand_shapes_with_layout();
+  }
   if (window_ != nullptr) {
     cloned->set_window(*window_);
   }
@@ -1951,6 +2081,7 @@ HloCustomCallInstruction::CloneWithNewOperandsImpl(
     cloned->set_convolution_dimension_numbers(*convolution_dimension_numbers_);
   }
   cloned->set_feature_group_count(feature_group_count_);
+  cloned->set_batch_group_count(batch_group_count_);
   return std::move(cloned);
 }
 
@@ -1994,12 +2125,44 @@ std::unique_ptr<HloInstruction> HloPadInstruction::CloneWithNewOperandsImpl(
 HloDynamicSliceInstruction::HloDynamicSliceInstruction(
     const Shape& shape, HloInstruction* operand, HloInstruction* start_indices,
     absl::Span<const int64> slice_sizes)
-    : HloInstruction(HloOpcode::kDynamicSlice, shape),
+    : HloDynamicIndexInstruction(HloOpcode::kDynamicSlice, shape),
       dynamic_slice_sizes_(slice_sizes.begin(), slice_sizes.end()) {
   AppendOperand(operand);
   AppendOperand(start_indices);
 }
 
+HloDynamicSliceInstruction::HloDynamicSliceInstruction(
+    const Shape& shape, HloInstruction* operand,
+    absl::Span<HloInstruction* const> start_indices,
+    absl::Span<const int64> slice_sizes)
+    : HloDynamicIndexInstruction(HloOpcode::kDynamicSlice, shape),
+      dynamic_slice_sizes_(slice_sizes.begin(), slice_sizes.end()) {
+  AppendOperand(operand);
+  for (HloInstruction* index : start_indices) {
+    AppendOperand(index);
+  }
+}
+
+HloDynamicUpdateSliceInstruction::HloDynamicUpdateSliceInstruction(
+    const Shape& shape, HloInstruction* operand, HloInstruction* update,
+    HloInstruction* start_indices)
+    : HloDynamicIndexInstruction(HloOpcode::kDynamicUpdateSlice, shape) {
+  AppendOperand(operand);
+  AppendOperand(update);
+  AppendOperand(start_indices);
+}
+
+HloDynamicUpdateSliceInstruction::HloDynamicUpdateSliceInstruction(
+    const Shape& shape, HloInstruction* operand, HloInstruction* update,
+    absl::Span<HloInstruction* const> start_indices)
+    : HloDynamicIndexInstruction(HloOpcode::kDynamicUpdateSlice, shape) {
+  AppendOperand(operand);
+  AppendOperand(update);
+  for (HloInstruction* index : start_indices) {
+    AppendOperand(index);
+  }
+}
+
 HloInstructionProto HloDynamicSliceInstruction::ToProto() const {
   HloInstructionProto proto = HloInstruction::ToProto();
   for (int64 slice_size : dynamic_slice_sizes_) {
@@ -2025,9 +2188,14 @@ std::unique_ptr<HloInstruction>
 HloDynamicSliceInstruction::CloneWithNewOperandsImpl(
     const Shape& shape, absl::Span<HloInstruction* const> new_operands,
     HloCloneContext* context) const {
-  CHECK_EQ(new_operands.size(), 2);
-  return absl::make_unique<HloDynamicSliceInstruction>(
-      shape, new_operands[0], new_operands[1], dynamic_slice_sizes_);
+  if (new_operands.size() == 2 && new_operands[1]->shape().rank() == 1) {
+    // TODO(b/118437727): Old form, remove this path.
+    return absl::make_unique<HloDynamicSliceInstruction>(
+        shape, new_operands[0], new_operands[1], dynamic_slice_sizes_);
+  } else {
+    return absl::make_unique<HloDynamicSliceInstruction>(
+        shape, new_operands[0], new_operands.subspan(1), dynamic_slice_sizes_);
+  }
 }
 
 HloGatherInstruction::HloGatherInstruction(
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h
index b5c28137a145667a977d39c9d3c40c6d36a8436e..4d23cb671f24623f56faa9b69015cef21752a799 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.h
+++ b/tensorflow/compiler/xla/service/hlo_instructions.h
@@ -131,6 +131,34 @@ class HloFftInstruction : public HloInstruction {
   std::vector<int64> fft_length_;
 };
 
+class HloTriangularSolveInstruction : public HloInstruction {
+ public:
+  explicit HloTriangularSolveInstruction(const Shape& shape, HloInstruction* a,
+                                         HloInstruction* b,
+                                         const TriangularSolveOptions& options);
+  const TriangularSolveOptions& triangular_solve_options() const {
+    return triangular_solve_options_;
+  }
+
+  // Returns a serialized representation of this instruction.
+  HloInstructionProto ToProto() const override;
+
+ private:
+  std::vector<string> ExtraAttributesToStringImpl(
+      const HloPrintOptions& options) const override;
+  bool IdenticalSlowPath(
+      const HloInstruction& other,
+      const std::function<bool(const HloComputation*, const HloComputation*)>&
+          eq_computations) const override;
+
+  // Implementation for non-common logic of CloneWithNewOperands.
+  std::unique_ptr<HloInstruction> CloneWithNewOperandsImpl(
+      const Shape& shape, absl::Span<HloInstruction* const> new_operands,
+      HloCloneContext* context) const override;
+
+  TriangularSolveOptions triangular_solve_options_;
+};
+
 class HloSendRecvInstruction : public HloInstruction {
  public:
   // Returns the channel id associated with the instruction. The id is
@@ -242,14 +270,10 @@ class HloAllReduceInstruction : public HloCollectiveInstruction {
       const std::vector<ReplicaGroup>& replica_groups,
       absl::string_view barrier, const absl::optional<int64>& all_reduce_id);
 
-  // Returns the barrier config used for the CrossReplicaSum implementation of
+  // Returns the barrier config used for the AllReduce implementation of
   // each backend.
-  string cross_replica_sum_barrier() const {
-    return cross_replica_sum_barrier_;
-  }
-  void set_cross_replica_sum_barrier(string barrier) {
-    cross_replica_sum_barrier_ = barrier;
-  }
+  string all_reduce_barrier() const { return all_reduce_barrier_; }
+  void set_all_reduce_barrier(string barrier) { all_reduce_barrier_ = barrier; }
 
   absl::optional<int64> all_reduce_id() const { return all_reduce_id_; }
   void set_all_reduce_id(const absl::optional<int64>& all_reduce_id);
@@ -257,6 +281,10 @@ class HloAllReduceInstruction : public HloCollectiveInstruction {
   // Returns a serialized representation of this instruction.
   HloInstructionProto ToProto() const override;
 
+  // Returns true if the AllReduce does no communication, so it's equivalent
+  // to a mem copy.
+  bool IsNoop() const;
+
  private:
   std::vector<string> ExtraAttributesToStringImpl(
       const HloPrintOptions& options) const override;
@@ -270,8 +298,8 @@ class HloAllReduceInstruction : public HloCollectiveInstruction {
       const Shape& shape, absl::Span<HloInstruction* const> new_operands,
       HloCloneContext* context) const override;
 
-  // The string representation of the barrier config used for CrossReplicaSum.
-  string cross_replica_sum_barrier_;
+  // The string representation of the barrier config used for AllReduce.
+  string all_reduce_barrier_;
 
   // For Allreduce nodes from different modules, if they have the same
   // all_reduce_id, they will be 'Allreduce'd. If empty, Allreduce will not be
@@ -418,8 +446,8 @@ class HloReduceInstruction : public HloInstruction {
 class HloSortInstruction : public HloInstruction {
  public:
   explicit HloSortInstruction(const Shape& shape, int64 dimension,
-                              HloInstruction* keys,
-                              absl::Span<HloInstruction* const> values = {});
+                              absl::Span<HloInstruction* const> operands,
+                              HloComputation* compare, bool is_stable);
   // Returns the dimension sizes or numbers associated with this instruction.
   const std::vector<int64>& dimensions() const override { return dimensions_; }
   int64 dimensions(int64 index) const override { return dimensions()[index]; }
@@ -432,6 +460,7 @@ class HloSortInstruction : public HloInstruction {
   HloInstruction* mutable_keys() { return mutable_operand(0); }
   // Returns the number of value operands.
   int64 values_count() const { return operand_count() - 1; }
+  bool is_stable() const { return is_stable_; }
 
  private:
   std::vector<string> ExtraAttributesToStringImpl(
@@ -446,6 +475,7 @@ class HloSortInstruction : public HloInstruction {
       HloCloneContext* context) const override;
 
   std::vector<int64> dimensions_;
+  bool is_stable_;
 };
 
 class HloTransposeInstruction : public HloInstruction {
@@ -787,10 +817,28 @@ class HloParameterInstruction : public HloInstruction {
   explicit HloParameterInstruction(int64 parameter_number, const Shape& shape,
                                    const string& name);
   int64 parameter_number() const { return parameter_number_; }
+
+  // Sets and gets the whether all replicas will receive the same parameter data
+  // for each leaf buffer in data parallelism.
+  void set_parameter_replicated_at_leaf_buffers(
+      absl::Span<const bool> parameter_replicated_at_leaf_buffers) {
+    CHECK_EQ(ShapeUtil::GetLeafCount(shape()),
+             parameter_replicated_at_leaf_buffers.size());
+    parameter_replicated_at_leaf_buffers_.emplace(
+        parameter_replicated_at_leaf_buffers.begin(),
+        parameter_replicated_at_leaf_buffers.end());
+  }
+  const absl::optional<std::vector<bool>>&
+  parameter_replicated_at_leaf_buffers() const {
+    return parameter_replicated_at_leaf_buffers_;
+  }
+
   // Returns a serialized representation of this instruction.
   HloInstructionProto ToProto() const override;
 
  private:
+  std::vector<string> ExtraAttributesToStringImpl(
+      const HloPrintOptions& options) const override;
   bool IdenticalSlowPath(
       const HloInstruction& other,
       const std::function<bool(const HloComputation*, const HloComputation*)>&
@@ -804,6 +852,10 @@ class HloParameterInstruction : public HloInstruction {
       HloCloneContext* context) const override;
 
   int64 parameter_number_ = 0;
+
+  // Specifies whether each buffer has the same parameter value on all replicas
+  // in data parallelism.
+  absl::optional<std::vector<bool>> parameter_replicated_at_leaf_buffers_;
 };
 
 class HloGetTupleElementInstruction : public HloInstruction {
@@ -903,9 +955,7 @@ class HloOutfeedInstruction : public HloInstruction {
                                  HloInstruction* token_operand,
                                  absl::string_view outfeed_config);
   // Returns the shape for the Outfeed instruction.
-  const Shape& outfeed_shape() const {
-    return outfeed_shape_;
-  }
+  const Shape& outfeed_shape() const { return outfeed_shape_; }
   // Returns the config for the Outfeed instruction.
   const string& outfeed_config() const { return outfeed_config_; }
   // Returns a serialized representation of this instruction.
@@ -933,7 +983,7 @@ class HloConvolutionInstruction : public HloInstruction {
  public:
   explicit HloConvolutionInstruction(
       const Shape& shape, HloInstruction* lhs, HloInstruction* rhs,
-      int64 feature_group_count, const Window& window,
+      int64 feature_group_count, int64 batch_group_count, const Window& window,
       const ConvolutionDimensionNumbers& dimension_numbers,
       const PrecisionConfig& precision_config);
   const Window& window() const override { return window_; }
@@ -949,6 +999,10 @@ class HloConvolutionInstruction : public HloInstruction {
   // dimension and output feature dimension.
   int64 feature_group_count() const { return feature_group_count_; }
 
+  // The number of feature groups. Must be a divisor of the input batch
+  // dimension.
+  int64 batch_group_count() const { return batch_group_count_; }
+
   // Returns the information used to tell the implementation information about
   // what sort of precision is requested. The meaning of the field is backend
   // specific. At the moment, it is only supported for kConvolution and kDot.
@@ -977,6 +1031,9 @@ class HloConvolutionInstruction : public HloInstruction {
   // The number of feature groups. Must be a divisor of the input feature
   // dimension and output feature dimension.
   int64 feature_group_count_;
+  // The number of feature groups. Must be a divisor of the input batch
+  // dimension.
+  int64 batch_group_count_;
   // Describes the window used for a convolution.
   Window window_;
   // Describes the dimension numbers used for a convolution.
@@ -1099,7 +1156,11 @@ class HloCustomCallInstruction : public HloInstruction {
   void set_feature_group_count(int64 feature_group_count) {
     feature_group_count_ = feature_group_count;
   }
+  void set_batch_group_count(int64 batch_group_count) {
+    batch_group_count_ = batch_group_count;
+  }
   int64 feature_group_count() const { return feature_group_count_; }
+  int64 batch_group_count() const { return batch_group_count_; }
   // Returns a serialized representation of this instruction.
   HloInstructionProto ToProto() const override;
 
@@ -1134,6 +1195,7 @@ class HloCustomCallInstruction : public HloInstruction {
   std::unique_ptr<ConvolutionDimensionNumbers> convolution_dimension_numbers_;
   // The number of feature groups. This is used for grouped convolutions.
   int64 feature_group_count_;
+  int64 batch_group_count_;
   // Whether the result and operand layouts are constrained.
   bool layout_constrained_;
   // For layout-constrained custom calls, this vector holds the shape with
@@ -1171,12 +1233,38 @@ class HloPadInstruction : public HloInstruction {
   PaddingConfig padding_config_;
 };
 
-class HloDynamicSliceInstruction : public HloInstruction {
+class HloDynamicIndexInstruction : public HloInstruction {
+ public:
+  explicit HloDynamicIndexInstruction(HloOpcode opcode, const Shape& shape)
+      : HloInstruction(opcode, shape) {}
+  virtual int64 first_index_operand_number() const = 0;
+
+  // Returns a subspan of operands which represent the start indices.
+  absl::Span<HloInstruction* const> index_operands() const {
+    return absl::MakeSpan(operands()).subspan(first_index_operand_number());
+  }
+
+  // Returns the shapes of the index operands.
+  std::vector<Shape> index_shapes() const {
+    std::vector<Shape> shapes;
+    auto indices = index_operands();
+    for (const HloInstruction* index : indices) {
+      shapes.push_back(index->shape());
+    }
+    return shapes;
+  }
+};
+
+class HloDynamicSliceInstruction : public HloDynamicIndexInstruction {
  public:
   explicit HloDynamicSliceInstruction(const Shape& shape,
                                       HloInstruction* operand,
                                       HloInstruction* start_indices,
                                       absl::Span<const int64> slice_sizes);
+  explicit HloDynamicSliceInstruction(
+      const Shape& shape, HloInstruction* operand,
+      absl::Span<HloInstruction* const> start_indices,
+      absl::Span<const int64> slice_sizes);
   // Old methods kept for smooth subclassing transition END.
   // Returns the size of the slice in the given dimension for a dynamic
   // slice node.
@@ -1189,6 +1277,8 @@ class HloDynamicSliceInstruction : public HloInstruction {
   // Returns a serialized representation of this instruction.
   HloInstructionProto ToProto() const override;
 
+  int64 first_index_operand_number() const override { return 1; }
+
  private:
   std::vector<string> ExtraAttributesToStringImpl(
       const HloPrintOptions& options) const override;
@@ -1206,6 +1296,19 @@ class HloDynamicSliceInstruction : public HloInstruction {
   std::vector<int64> dynamic_slice_sizes_;
 };
 
+class HloDynamicUpdateSliceInstruction : public HloDynamicIndexInstruction {
+ public:
+  explicit HloDynamicUpdateSliceInstruction(const Shape& shape,
+                                            HloInstruction* operand,
+                                            HloInstruction* update,
+                                            HloInstruction* start_indices);
+  explicit HloDynamicUpdateSliceInstruction(
+      const Shape& shape, HloInstruction* operand, HloInstruction* update,
+      absl::Span<HloInstruction* const> start_indices);
+
+  int64 first_index_operand_number() const override { return 2; }
+};
+
 class HloGatherInstruction : public HloInstruction {
  public:
   explicit HloGatherInstruction(
diff --git a/tensorflow/compiler/xla/service/hlo_lexer.cc b/tensorflow/compiler/xla/service/hlo_lexer.cc
index 1390537101e95a08e4ba4eef7ae8d6059a40e916..2255383322873a39c7076e0f4f0dd541bc79014d 100644
--- a/tensorflow/compiler/xla/service/hlo_lexer.cc
+++ b/tensorflow/compiler/xla/service/hlo_lexer.cc
@@ -17,8 +17,10 @@ limitations under the License.
 
 #include <unordered_map>
 
+#include "absl/strings/ascii.h"
 #include "absl/strings/escaping.h"
 #include "absl/strings/numbers.h"
+#include "absl/strings/str_split.h"
 #include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -36,8 +38,8 @@ constexpr int kError = -2;
 
 // [a-zA-Z0-9_.-]
 bool IsIdentifierChar(char c) {
-  return isalnum(static_cast<unsigned char>(c)) || c == '-' || c == '.' ||
-         c == '_';
+  return absl::ascii_isalnum(static_cast<unsigned char>(c)) || c == '-' ||
+         c == '.' || c == '_';
 }
 
 }  // namespace
@@ -82,15 +84,29 @@ tensorflow::RegexpStringPiece HloLexer::RegexpStringPieceFromPointers(
   return tensorflow::RegexpStringPiece(begin, end - begin);
 }
 
+TokKind HloLexer::LookAhead() {
+  if (GetKind() == TokKind::kEof || GetKind() == TokKind::kError) {
+    return GetKind();
+  }
+
+  const char* old_current_ptr = current_ptr_;
+  TokenState old_token_state = token_state_;
+  Lex();
+  TokKind kind = GetKind();
+  token_state_ = old_token_state;
+  current_ptr_ = old_current_ptr;
+  return kind;
+}
+
 TokKind HloLexer::LexToken() {
   while (true) {
-    token_start_ = current_ptr_;
+    token_state_.token_start = current_ptr_;
 
     int current_char = GetNextChar();
     switch (current_char) {
       default:
         // [a-zA-Z_]
-        if (isalpha(static_cast<unsigned char>(current_char)) ||
+        if (absl::ascii_isalpha(static_cast<unsigned char>(current_char)) ||
             current_char == '_') {
           return LexIdentifier();
         }
@@ -125,12 +141,20 @@ TokKind HloLexer::LexToken() {
         return LexNumberOrPattern();
       case '=':
         return TokKind::kEqual;
+      case '<':
+        if (current_char == '<' && PeekCurrentChar() == '=') {
+          current_ptr_++;
+          return TokKind::kLeq;
+        }
+        return TokKind::kError;
       case ',':
         return TokKind::kComma;
       case '%':
         return LexPercent();
       case ':':
         return TokKind::kColon;
+      case '*':
+        return TokKind::kAsterisk;
       case '[':
         return TokKind::kLsquare;
       case ']':
@@ -190,6 +214,15 @@ TokKind HloLexer::LexToken() {
         // A lone '/' is an error.
         return TokKind::kError;
       }
+      case '.':
+        if (PeekCurrentChar() == '.') {
+          current_ptr_++;
+          if (PeekCurrentChar() == '.') {
+            current_ptr_++;
+            return TokKind::kDots;
+          }
+        }
+        return TokKind::kError;
       case '"':
         return LexString();
     }
@@ -206,43 +239,37 @@ TokKind HloLexer::LexToken() {
 // dim_labels_pattern ::= [0-9bf]{2,}_[0-9io]{2,}->[0-9bf]{2,}
 // identifiers ::= other cases that match [a-zA-Z_][a-zA-Z0-9_.-]*
 TokKind HloLexer::LexIdentifier() {
-  {
-    auto consumable = RegexpStringPieceFromPointers(token_start_, buf_.end());
-    // 'consumable' will be advanced iff its prefix matches the pattern.
-    static LazyRE2 shape_pattern = {
-        R"(^(\w*\d*)\[([\d,\s]*)\](?:(dense|sparse)?{([\d,\s]+)})?)"};
-    if (RE2::Consume(&consumable, *shape_pattern)) {
-      auto status_or_shape = ShapeUtil::ParseShapeString(
-          StringPieceFromPointers(token_start_, consumable.begin()));
-      if (status_or_shape.ok()) {
-        // This is a shape string.
-        shape_val_ = status_or_shape.ValueOrDie();
-        current_ptr_ = consumable.begin();
-        return TokKind::kShape;
-      }
-    }
-  }
-
   while (IsIdentifierChar(PeekCurrentChar())) {
     current_ptr_++;
   }
 
   // If followed by ':', it's a name.
   if (PeekCurrentChar() == ':') {
-    str_val_.assign(token_start_, current_ptr_);
+    token_state_.str_val.assign(token_state_.token_start, current_ptr_);
     current_ptr_++;  // skip ':'
     return TokKind::kName;
   }
 
   // If followed by '=', it's a attribute name.
   if (PeekCurrentChar() == '=') {
-    str_val_.assign(token_start_, current_ptr_);
+    token_state_.str_val.assign(token_state_.token_start, current_ptr_);
     current_ptr_++;  // skip '='
     return TokKind::kAttributeName;
   }
 
   absl::string_view identifier =
-      StringPieceFromPointers(token_start_, current_ptr_);
+      StringPieceFromPointers(token_state_.token_start, current_ptr_);
+
+  // Primitive type strings are reserved words. The exception is 'tuple' whose
+  // type is represented using nested parentheses without the string 'tuple'.
+  if (primitive_util::IsPrimitiveTypeName(identifier)) {
+    PrimitiveType primitive_type =
+        primitive_util::StringToPrimitiveType(identifier).ValueOrDie();
+    if (primitive_type != TUPLE) {
+      token_state_.primitive_type_val = primitive_type;
+      return TokKind::kPrimitiveType;
+    }
+  }
 
   // See if this is a keyword.
 #define KEYWORD(STR)            \
@@ -261,21 +288,23 @@ TokKind HloLexer::LexIdentifier() {
   KEYWORD(ROOT);
   KEYWORD(maximal);
   KEYWORD(replicated);
+  KEYWORD(sparse);
 
 #undef KEYWORD
 
   {
-    auto consumable = RegexpStringPieceFromPointers(token_start_, buf_.end());
+    auto consumable =
+        RegexpStringPieceFromPointers(token_state_.token_start, buf_.end());
     static LazyRE2 dim_labels_pattern = {
         R"([0-9bf]{2,}_[0-9io]{2,}->[0-9bf]{2,})"};
     if (RE2::Consume(&consumable, *dim_labels_pattern)) {
       current_ptr_ = consumable.begin();
-      str_val_.assign(token_start_, current_ptr_);
+      token_state_.str_val.assign(token_state_.token_start, current_ptr_);
       return TokKind::kDimLabels;
     }
   }
 
-  str_val_ = string(identifier);
+  token_state_.str_val = string(identifier);
   return TokKind::kIdent;
 }
 
@@ -283,13 +312,13 @@ TokKind HloLexer::LexIdentifier() {
 // name ::= [a-zA-Z_][a-zA-Z0-9_.-]*
 TokKind HloLexer::LexPercent() {
   const char* name_start = current_ptr_;
-  if (isalpha(static_cast<unsigned char>(PeekCurrentChar())) ||
+  if (absl::ascii_isalpha(static_cast<unsigned char>(PeekCurrentChar())) ||
       PeekCurrentChar() == '_') {
     current_ptr_++;
     while (IsIdentifierChar(PeekCurrentChar())) {
       current_ptr_++;
     }
-    str_val_.assign(name_start, current_ptr_);
+    token_state_.str_val.assign(name_start, current_ptr_);
     return TokKind::kName;
   }
   return TokKind::kError;
@@ -307,12 +336,14 @@ TokKind HloLexer::LexPercent() {
 // int ::=  [-]?[0-9]+
 // negative inf ::= '-inf'
 TokKind HloLexer::LexNumberOrPattern() {
-  auto consumable = RegexpStringPieceFromPointers(token_start_, buf_.end());
+  auto consumable =
+      RegexpStringPieceFromPointers(token_state_.token_start, buf_.end());
   static LazyRE2 float_pattern = {
       R"([-]?((\d+|\d+[.]\d*|\d*[.]\d+)([eE][+-]?\d+))|[-]?(\d+[.]\d*|\d*[.]\d+))"};
   if (RE2::Consume(&consumable, *float_pattern)) {
     current_ptr_ = consumable.begin();
-    CHECK(absl::SimpleAtod(string(token_start_, current_ptr_), &decimal_val_));
+    CHECK(absl::SimpleAtod(string(token_state_.token_start, current_ptr_),
+                           &token_state_.decimal_val));
     return TokKind::kDecimal;
   }
 
@@ -324,27 +355,28 @@ TokKind HloLexer::LexNumberOrPattern() {
 
   if (RE2::Consume(&consumable, *dim_labels_pattern)) {
     current_ptr_ = consumable.begin();
-    str_val_.assign(token_start_, current_ptr_);
+    token_state_.str_val.assign(token_state_.token_start, current_ptr_);
     return TokKind::kDimLabels;
   }
 
   if (RE2::Consume(&consumable, *dxd_pattern)) {
     current_ptr_ = consumable.begin();
-    str_val_.assign(token_start_, current_ptr_);
+    token_state_.str_val.assign(token_state_.token_start, current_ptr_);
     return TokKind::kDxD;
   }
 
   if (RE2::Consume(&consumable, *pad_pattern)) {
     current_ptr_ = consumable.begin();
-    str_val_.assign(token_start_, current_ptr_);
+    token_state_.str_val.assign(token_state_.token_start, current_ptr_);
     return TokKind::kPad;
   }
 
   static LazyRE2 int_pattern = {R"([-]?\d+)"};
   if (RE2::Consume(&consumable, *int_pattern)) {
     current_ptr_ = consumable.begin();
-    auto slice = StringPieceFromPointers(token_start_, current_ptr_);
-    if (absl::SimpleAtoi(slice, &int64_val_)) {
+    auto slice =
+        StringPieceFromPointers(token_state_.token_start, current_ptr_);
+    if (absl::SimpleAtoi(slice, &token_state_.int64_val)) {
       return TokKind::kInt;
     }
     LOG(ERROR) << "Failed to parse int literal: " << slice;
@@ -403,16 +435,17 @@ absl::string_view HloLexer::GetLine(LocTy loc) const {
 }
 
 // Lexes quoted string with escaping characters. If matched, the quoted string
-// will be unescaped and stored to str_val_.
+// will be unescaped and stored to token_state_.str_val.
 TokKind HloLexer::LexString() {
-  auto consumable = RegexpStringPieceFromPointers(token_start_, buf_.end());
+  auto consumable =
+      RegexpStringPieceFromPointers(token_state_.token_start, buf_.end());
   static LazyRE2 escaping_pattern = {R"("([^"\\]|\\.)*")"};
   if (RE2::Consume(&consumable, *escaping_pattern)) {
     current_ptr_ = consumable.begin();
     absl::string_view raw =
-        StringPieceFromPointers(token_start_ + 1, current_ptr_ - 1);
+        StringPieceFromPointers(token_state_.token_start + 1, current_ptr_ - 1);
     string error;
-    if (!absl::CUnescape(raw, &str_val_, &error)) {
+    if (!absl::CUnescape(raw, &token_state_.str_val, &error)) {
       LOG(ERROR) << "Failed unescaping string: " << raw << ". error: " << error;
       return TokKind::kError;
     }
@@ -433,6 +466,8 @@ string TokKindToString(TokKind kind) {
       return "kComma";
     case TokKind::kColon:
       return "kColon";
+    case TokKind::kAsterisk:
+      return "kAsterisk";
     case TokKind::kLsquare:
       return "kLsquare";
     case TokKind::kRsquare:
@@ -447,6 +482,8 @@ string TokKindToString(TokKind kind) {
       return "kRparen";
     case TokKind::kArrow:
       return "kArrow";
+    case TokKind::kLeq:
+      return "kLeq";
     case TokKind::kw_HloModule:
       return "kw_HloModule";
     case TokKind::kw_ENTRY:
@@ -467,6 +504,10 @@ string TokKindToString(TokKind kind) {
       return "kw_inf";
     case TokKind::kNegInf:
       return "kNegInf";
+    case TokKind::kw_sparse:
+      return "kw_sparse";
+    case TokKind::kPrimitiveType:
+      return "kPrimitiveType";
     case TokKind::kName:
       return "kName";
     case TokKind::kAttributeName:
@@ -481,12 +522,12 @@ string TokKindToString(TokKind kind) {
       return "kIdent";
     case TokKind::kString:
       return "kString";
-    case TokKind::kShape:
-      return "kShape";
     case TokKind::kInt:
       return "kInt";
     case TokKind::kDecimal:
       return "kDecimal";
+    case TokKind::kDots:
+      return "kDots";
   }
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_lexer.h b/tensorflow/compiler/xla/service/hlo_lexer.h
index d6a2b292a3916b2ff85f278cf5cb9f1567df88fa..383fb4e862b8e32771879d055e663dc821a5c839 100644
--- a/tensorflow/compiler/xla/service/hlo_lexer.h
+++ b/tensorflow/compiler/xla/service/hlo_lexer.h
@@ -19,7 +19,6 @@ limitations under the License.
 #include <string>
 
 #include "absl/strings/string_view.h"
-#include "tensorflow/compiler/xla/service/hlo_token.h"
 #include "tensorflow/compiler/xla/shape.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -29,6 +28,60 @@ limitations under the License.
 
 namespace xla {
 
+// Defines different kinds of tokens used by the HLO lexer.
+//
+// You shouldn't need to use this directly unless you're using HloLexer
+// directly, and you probably don't need to do that.  Use hlo_parser instead.
+enum class TokKind {
+  // Markers
+  kEof,
+  kError,
+
+  // Tokens with no info.
+  kEqual,     // =
+  kComma,     // ,
+  kColon,     // :
+  kAsterisk,  // *
+  kLsquare,
+  kRsquare,  // [  ]
+  kLbrace,
+  kRbrace,  // {  }
+  kLparen,
+  kRparen,  // (  )
+  kDots,    // ...
+
+  kArrow,  // ->
+  kLeq,    // <=
+
+  // Keywords
+  kw_HloModule,
+  kw_ENTRY,
+  kw_ROOT,
+  kw_true,
+  kw_false,
+  kw_maximal,
+  kw_replicated,
+  kw_nan,
+  kw_inf,
+  kw_sparse,
+
+  kNegInf,  // -inf
+
+  // Typed tokens.
+  kPrimitiveType,  // F32, PRED, etc.
+  kName,           // %foo
+  kAttributeName,  // dimensions=
+  kDimLabels,      // [0-9bf]{2,}_[0-9io]{2,}->[0-9bf]{2,}
+  kDxD,            // [0-9]+(x[0-9]+)+
+  kPad,            // [0-9]+_[0-9]+(_[0-9]+)?(x[0-9]+_[0-9]+(_[0-9]+)?)*
+  kIdent,          // other identifiers
+  kString,         // "abcd\"\n"
+  kInt,            // 42
+  kDecimal,        // 4.2
+};
+
+string TokKindToString(TokKind kind);
+
 // Lexer for the HloModule::ToString() format text.
 //
 // This class is meant to be used by hlo_parser.cc.  You shouldn't need to use
@@ -39,9 +92,9 @@ class HloLexer {
     current_ptr_ = buf_.begin();
   }
 
-  TokKind Lex() { return current_kind_ = LexToken(); }
+  TokKind Lex() { return token_state_.current_kind = LexToken(); }
 
-  TokKind GetKind() const { return current_kind_; }
+  TokKind GetKind() const { return token_state_.current_kind; }
   string GetStrVal() const {
     switch (GetKind()) {
       case TokKind::kName:
@@ -51,28 +104,28 @@ class HloLexer {
       case TokKind::kPad:
       case TokKind::kString:
       case TokKind::kIdent:
-        return str_val_;
+        return token_state_.str_val;
       default:
         LOG(FATAL) << "This token does not have string value";
     }
   }
-  Shape GetShapeVal() const {
-    CHECK(GetKind() == TokKind::kShape);
-    return shape_val_;
-  }
-  tensorflow::int64 GetInt64Val() const {
+  int64 GetInt64Val() const {
     CHECK(GetKind() == TokKind::kInt);
-    return int64_val_;
+    return token_state_.int64_val;
   }
   double GetDecimalVal() const {
     CHECK(GetKind() == TokKind::kDecimal);
-    return decimal_val_;
+    return token_state_.decimal_val;
+  }
+  PrimitiveType GetPrimitiveTypeVal() const {
+    CHECK(GetKind() == TokKind::kPrimitiveType);
+    return token_state_.primitive_type_val;
   }
 
   typedef const char* LocTy;
 
   // Returns the location of the current token.
-  LocTy GetLoc() const { return token_start_; }
+  LocTy GetLoc() const { return token_state_.token_start; }
 
   // Returns the line and column of a location in the buffer.
   std::pair<unsigned, unsigned> GetLineAndColumn(LocTy location) const;
@@ -80,6 +133,9 @@ class HloLexer {
   // Returns the whole line given the location.
   absl::string_view GetLine(LocTy loc) const;
 
+  // Looks ahead one token and returns it. Lexer state is unchanged.
+  TokKind LookAhead();
+
  private:
   // Returns the current character. If it's neither the end of input buffer nor
   // an invalid character, moves the pointer forward.
@@ -112,12 +168,15 @@ class HloLexer {
   const char* current_ptr_;
 
   // Information about the current token.
-  const char* token_start_ = nullptr;
-  TokKind current_kind_;
-  string str_val_;
-  Shape shape_val_;
-  tensorflow::int64 int64_val_;
-  double decimal_val_;
+  struct TokenState {
+    const char* token_start = nullptr;
+    TokKind current_kind;
+    string str_val;
+    int64 int64_val;
+    double decimal_val;
+    PrimitiveType primitive_type_val;
+  };
+  TokenState token_state_;
 
   struct LineNoCacheTy {
     const char* last_query;
diff --git a/tensorflow/compiler/xla/service/hlo_liveness_analysis.cc b/tensorflow/compiler/xla/service/hlo_liveness_analysis.cc
index 5bf055f3c012fef687cdc275d62efdf2d4cd5e5c..e14bcfa7f67e736a4d04f5b236fb2df02cf150e0 100644
--- a/tensorflow/compiler/xla/service/hlo_liveness_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_liveness_analysis.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <deque>
 
+#include "absl/container/flat_hash_set.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/map_util.h"
@@ -36,11 +37,11 @@ namespace xla {
 namespace {
 
 using Worklist = std::deque<const HloInstruction*>;
-using Workset = std::unordered_set<const HloInstruction*>;
+using Workset = absl::flat_hash_set<const HloInstruction*>;
 
 void AddToWorklist(const HloInstruction* instruction, Worklist* worklist,
                    Workset* workset) {
-  if (workset->count(instruction) == 0) {
+  if (!workset->contains(instruction)) {
     worklist->push_back(instruction);
     workset->insert(instruction);
     VLOG(3) << "ADD instruction: " << instruction->name();
diff --git a/tensorflow/compiler/xla/service/hlo_liveness_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_liveness_analysis_test.cc
index e0ae1173c6114f0bc6ef18b2cfff9d54ccfe2faf..436cccb1fb9ecf6f4efad772c700c611b28ce628 100644
--- a/tensorflow/compiler/xla/service/hlo_liveness_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_liveness_analysis_test.cc
@@ -403,9 +403,9 @@ TEST_F(HloLivenessAnalysisTest, WhileWithOutfeed) {
   HloModule OutfeedLoop
   WhileBody {
     body_param = (s32[]) parameter(0)
-    token = token[] after-all()
+    token0 = token[] after-all()
     constant.2 = s32[] constant(2)
-    outfeed_tuple = (s32[]) outfeed(constant.2, token)
+    outfeed_tuple = (s32[]) outfeed(constant.2, token0)
     get-tuple-element.1 = s32[] get-tuple-element(body_param), index=0
     constant.1 = s32[] constant(1)
     add = s32[] add(get-tuple-element.1, constant.1)
@@ -436,9 +436,9 @@ TEST_F(HloLivenessAnalysisTest, NestedWhileWithOutfeed) {
   HloModule OutfeedLoop
   InnerWhileBody {
     body_param = (s32[]) parameter(0)
-    token = token[] after-all()
+    token0 = token[] after-all()
     constant.2 = s32[] constant(2)
-    outfeed_tuple = (s32[]) outfeed(constant.2, token)
+    outfeed_tuple = (s32[]) outfeed(constant.2, token0)
     get-tuple-element.1 = s32[] get-tuple-element(body_param), index=0
     constant.1 = s32[] constant(1)
     add = s32[] add(get-tuple-element.1, constant.1)
diff --git a/tensorflow/compiler/xla/service/hlo_matchers.h b/tensorflow/compiler/xla/service/hlo_matchers.h
index 235efb19ce4ed28a5cd9fe5ca52ae5d8e9e5ba3d..67488a6a9a0c9cba7f576f9036c3a0cbe1900fff 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers.h
+++ b/tensorflow/compiler/xla/service/hlo_matchers.h
@@ -178,7 +178,7 @@ HLO_MATCHER(Constant);
 HLO_MATCHER(Convert);
 HLO_MATCHER(Convolution);
 HLO_MATCHER(Copy);
-HLO_MATCHER(CrossReplicaSum);
+HLO_MATCHER(AllReduce);
 HLO_MATCHER(CollectivePermute);
 HLO_MATCHER(Divide);
 HLO_MATCHER(Domain);
@@ -312,8 +312,8 @@ inline ::testing::Matcher<const ::xla::HloInstruction*> Shape(
 }
 inline ::testing::Matcher<const ::xla::HloInstruction*> Shape(
     absl::string_view shape) {
-  return ::testing::MakeMatcher(new ::xla::testing::HloShapeMatcher(
-      ShapeUtil::ParseShapeString(shape).ValueOrDie()));
+  return ::testing::MakeMatcher(
+      new ::xla::testing::HloShapeMatcher(ParseShape(shape).ValueOrDie()));
 }
 inline ::testing::Matcher<const ::xla::HloInstruction*> ShapeWithLayout(
     const class Shape& shape) {
@@ -323,7 +323,7 @@ inline ::testing::Matcher<const ::xla::HloInstruction*> ShapeWithLayout(
 inline ::testing::Matcher<const ::xla::HloInstruction*> ShapeWithLayout(
     absl::string_view shape) {
   return ::testing::MakeMatcher(new ::xla::testing::HloShapeAndLayoutMatcher(
-      ShapeUtil::ParseShapeString(shape).ValueOrDie()));
+      ParseShape(shape).ValueOrDie()));
 }
 
 // Verifies the value of the HloSharing against the provided sharding object.
diff --git a/tensorflow/compiler/xla/service/hlo_memory_scheduler.h b/tensorflow/compiler/xla/service/hlo_memory_scheduler.h
index 7227bfb27c74758d2b79e404afc9eb97a1ca894d..76cc29cbb7848eb424d07abf11a95ffd59e9eed6 100644
--- a/tensorflow/compiler/xla/service/hlo_memory_scheduler.h
+++ b/tensorflow/compiler/xla/service/hlo_memory_scheduler.h
@@ -118,7 +118,7 @@ class HloTrivialScheduler : public HloModulePass {
 };
 
 // A trivial pass which clears the schedule currently set on the
-// HloModule. After this pass runs HloModudle::has_schedule will return false.
+// HloModule. After this pass runs HloModule::has_schedule will return false.
 class HloDescheduler : public HloModulePass {
  public:
   HloDescheduler() = default;
diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc
index fe8371384c0fa3900a9022f101ff0b296439cf16..8322870cfd6a89fc6f863da8fd4a3576e8845cd7 100644
--- a/tensorflow/compiler/xla/service/hlo_module.cc
+++ b/tensorflow/compiler/xla/service/hlo_module.cc
@@ -107,11 +107,10 @@ HloComputation* HloModule::AddEntryComputation(
 }
 
 Status HloModule::RemoveEmbeddedComputation(HloComputation* to_remove) {
-  auto it =
-      std::find_if(computations_.begin(), computations_.end(),
-                   [&to_remove](const std::unique_ptr<HloComputation>& comp) {
-                     return comp.get() == to_remove;
-                   });
+  auto it = absl::c_find_if(
+      computations_, [&to_remove](const std::unique_ptr<HloComputation>& comp) {
+        return comp.get() == to_remove;
+      });
   TF_RET_CHECK(it->get() == to_remove);
   computations_.erase(it);
   return Status::OK();
@@ -247,11 +246,39 @@ HloModuleProto HloModule::ToProto() const {
   return proto;
 }
 
+Status HloModule::CheckUniqueNamesAndIdsForComputationsAndInstructions() const {
+  absl::flat_hash_set<string> computation_names;
+  absl::flat_hash_set<int> computation_ids;
+  absl::flat_hash_set<string> instruction_names;
+  absl::flat_hash_set<int> instruction_ids;
+
+  for (const HloComputation* computation : computations()) {
+    TF_RET_CHECK(!ContainsKey(computation_names, computation->name()))
+        << "Computation name is not unique: " << computation->name();
+    computation_names.insert(computation->name());
+
+    TF_RET_CHECK(!ContainsKey(computation_ids, computation->unique_id()))
+        << "Computation id is not unique: " << computation->unique_id();
+    computation_ids.insert(computation->unique_id());
+
+    for (const HloInstruction* instruction : computation->instructions()) {
+      TF_RET_CHECK(!ContainsKey(instruction_names, instruction->name()))
+          << "Instruction name is not unique: " << instruction->name();
+      instruction_names.insert(instruction->name());
+
+      TF_RET_CHECK(!ContainsKey(instruction_ids, instruction->unique_id()))
+          << "Instruction id is not unique: " << instruction->unique_id();
+      instruction_ids.insert(instruction->unique_id());
+    }
+  }
+  return Status::OK();
+}
+
 /* static */
 StatusOr<std::unique_ptr<HloModule>> HloModule::CreateFromProto(
     const HloModuleProto& proto, const HloModuleConfig& module_config) {
   VLOG(2) << "CreateFromProto()";
-  XLA_VLOG_LINES(2, proto.DebugString());
+  XLA_VLOG_LINES(3, proto.DebugString());
 
   // The ProgramShape in the passed in module config must match the shapes of
   // the entry parameters and root.
@@ -304,11 +331,10 @@ StatusOr<std::unique_ptr<HloModule>> HloModule::CreateFromProto(
   auto module = absl::make_unique<HloModule>(proto.name(), module_config);
 
   // Sort the computations in the proto id's order.
-  std::sort(computations.begin(), computations.end(),
-            [&](const std::unique_ptr<HloComputation>& a,
-                const std::unique_ptr<HloComputation>& b) {
-              return to_proto_id[a.get()] < to_proto_id[b.get()];
-            });
+  absl::c_sort(computations, [&](const std::unique_ptr<HloComputation>& a,
+                                 const std::unique_ptr<HloComputation>& b) {
+    return to_proto_id[a.get()] < to_proto_id[b.get()];
+  });
 
   // Add sorted computations to the module.
   for (auto& computation : computations) {
@@ -331,28 +357,8 @@ StatusOr<std::unique_ptr<HloModule>> HloModule::CreateFromProto(
                       DynamicParameterBinding::CreateFromProto(
                           proto.dynamic_parameter_binding()));
 
-  absl::flat_hash_set<string> computation_names;
-  absl::flat_hash_set<string> instruction_names;
-  absl::flat_hash_set<int> computation_ids;
-  absl::flat_hash_set<int> instruction_ids;
-  for (HloComputation* computation : module->computations()) {
-    TF_RET_CHECK(!ContainsKey(computation_names, computation->name()))
-        << "Computation name is not unique: " << computation->name();
-    computation_names.insert(computation->name());
-
-    TF_RET_CHECK(!ContainsKey(computation_ids, computation->unique_id()))
-        << "Computation id is not unique: " << computation->unique_id();
-    computation_ids.insert(computation->unique_id());
-    for (HloInstruction* instruction : computation->instructions()) {
-      TF_RET_CHECK(!ContainsKey(instruction_names, instruction->name()))
-          << "Instruction name is not unique: " << instruction->name();
-      instruction_names.insert(instruction->name());
-
-      TF_RET_CHECK(!ContainsKey(instruction_ids, instruction->unique_id()))
-          << "Instruction id is not unique: " << instruction->unique_id();
-      instruction_ids.insert(instruction->unique_id());
-    }
-  }
+  TF_RETURN_IF_ERROR(
+      module->CheckUniqueNamesAndIdsForComputationsAndInstructions());
 
   if (proto.has_schedule()) {
     TF_ASSIGN_OR_RETURN(
@@ -392,15 +398,12 @@ namespace {
 // Returns whether `hlo` is used outside the given subcomputation.
 // `instructions_in_subcomputation` is the instruction set of the given
 // subcomputation.
-bool IsUsedOutsideSubcomputation(
-    const HloInstruction& hlo,
-    const std::unordered_set<HloInstruction*>& instructions_in_subcomputation) {
-  for (HloInstruction* user : hlo.users()) {
-    if (!instructions_in_subcomputation.count(user)) {
-      return true;
-    }
-  }
-  return false;
+bool IsUsedOutsideSubcomputation(const HloInstruction& hlo,
+                                 const absl::flat_hash_set<HloInstruction*>&
+                                     instructions_in_subcomputation) {
+  return absl::c_any_of(hlo.users(), [&](HloInstruction* user) {
+    return !instructions_in_subcomputation.contains(user);
+  });
 }
 }  // anonymous namespace
 
@@ -411,9 +414,9 @@ HloInstruction* HloModule::OutlineExpressionFromComputation(
 
   // A map from original instructions to their counterparts in the new outlined
   // function.
-  std::unordered_map<HloInstruction*, HloInstruction*> outlined_instructions;
+  absl::flat_hash_map<HloInstruction*, HloInstruction*> outlined_instructions;
   // A set that contains all instructions to be outlined.
-  std::unordered_set<HloInstruction*> instruction_set_to_outline(
+  absl::flat_hash_set<HloInstruction*> instruction_set_to_outline(
       instructions_to_outline.begin(), instructions_to_outline.end());
   std::vector<HloInstruction*> arguments;
   std::vector<HloInstruction*> outputs;
@@ -502,7 +505,7 @@ std::vector<HloComputation*> HloModule::MakeComputationPostOrder() const {
   // First determine all root computations by building a set of nonroot
   // computations (computations which are called by an instruction in the
   // module).
-  std::set<HloComputation*> nonroot_computations;
+  absl::flat_hash_set<HloComputation*> nonroot_computations;
   for (auto& computation : computations_) {
     for (auto* instruction : computation->instructions()) {
       for (HloComputation* called_computation :
@@ -515,19 +518,19 @@ std::vector<HloComputation*> HloModule::MakeComputationPostOrder() const {
   // Keep track of computations which have already been added to the post
   // order. This prevents duplication as an embedded computation may be called
   // from two different root computations.
-  std::set<HloComputation*> added_computations;
+  absl::flat_hash_set<HloComputation*> added_computations;
   std::vector<HloComputation*> post_order;
   for (auto& computation : computations_) {
-    if (nonroot_computations.count(computation.get()) == 0) {
+    if (!nonroot_computations.contains(computation.get())) {
       for (HloComputation* embedded_computation :
            computation->MakeEmbeddedComputationsList()) {
-        if (added_computations.count(embedded_computation) == 0) {
+        if (!added_computations.contains(embedded_computation)) {
           post_order.push_back(embedded_computation);
           added_computations.insert(embedded_computation);
         }
       }
       // Root computations should only be encountered once.
-      CHECK_EQ(0, added_computations.count(computation.get()));
+      CHECK(!added_computations.contains(computation.get()));
       post_order.push_back(computation.get());
       added_computations.insert(computation.get());
     }
diff --git a/tensorflow/compiler/xla/service/hlo_module.h b/tensorflow/compiler/xla/service/hlo_module.h
index 7b9cbf9a53a2201b1312405bbd7ed2b88f65c9be..b6fe6a5cdbd0934014f1152acd48c7a5973bead3 100644
--- a/tensorflow/compiler/xla/service/hlo_module.h
+++ b/tensorflow/compiler/xla/service/hlo_module.h
@@ -136,7 +136,9 @@ class HloModule {
   // information on opcode, shape, operands, and typically a root instruction.
   // This function returns the same hash value for equivalent HLO modules,
   // with respect to HloInstruction::Identical() method.
-  uint64 Hash() const { return entry_computation()->Hash(); }
+  uint64 Hash() const {
+    return entry_computation()->root_instruction()->Hash();
+  }
 
   // Gets the computations in this module.
   //
@@ -185,6 +187,7 @@ class HloModule {
   std::vector<HloComputation*> MakeNonfusionComputations() const;
 
   const HloModuleConfig& config() const { return config_; }
+  void set_config(HloModuleConfig& config) { config_ = config; }
 
   // Return a string representation of the module.
   //
@@ -262,6 +265,18 @@ class HloModule {
   const HloSchedule& schedule() const { return *schedule_; }
   HloSchedule& schedule() { return *schedule_; }
 
+  HloComputation* AddComputationAndUnifyNamesAndIds(
+      std::unique_ptr<HloComputation> computation, bool is_entry) {
+    computation->ClearUniqueIdInternal();
+    for (auto* instruction : computation->instructions()) {
+      instruction->ClearUniqueIdInternal();
+    }
+    return AddComputationInternal(std::move(computation), is_entry,
+                                  /*uniquify_identifiers=*/true);
+  }
+
+  Status CheckUniqueNamesAndIdsForComputationsAndInstructions() const;
+
  private:
   HloComputation* AddComputationInternal(
       std::unique_ptr<HloComputation> computation, bool is_entry,
diff --git a/tensorflow/compiler/xla/service/hlo_module_dce.cc b/tensorflow/compiler/xla/service/hlo_module_dce.cc
index 31d26cc51e8217234526bbfeb83510aadf2c27b5..6b72ba128664d27c51aa8dcfa61fe959a0160c73 100644
--- a/tensorflow/compiler/xla/service/hlo_module_dce.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_dce.cc
@@ -49,7 +49,7 @@ StatusOr<bool> RunWhileDCE(HloModule* module, HloLivenessAnalysis* liveness) {
       auto* while_body_param = while_body_comp->parameter_instruction(0);
       auto* while_body_root = while_body_comp->root_instruction();
 
-      if (!ShapeUtil::IsTuple(xla_while->shape()) ||
+      if (!xla_while->shape().IsTuple() ||
           while_body_root->opcode() != HloOpcode::kTuple) {
         // Only run DCE on tuple-shaped while loops where body root is Tuple,
         // with no I/O instructions.
diff --git a/tensorflow/compiler/xla/service/hlo_module_dce_test.cc b/tensorflow/compiler/xla/service/hlo_module_dce_test.cc
index bf66cc6bc37a5e11c9ecfc07a62ba0ea5ca11a03..f6e2866204955ac024c2b6f972de449cc3df4c15 100644
--- a/tensorflow/compiler/xla/service/hlo_module_dce_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_dce_test.cc
@@ -38,9 +38,7 @@ class HloModuleDceTest : public HloTestBase {
   // Returns whether the given instruction exists in the given computation.
   bool HasInstruction(const HloComputation& computation,
                       const HloInstruction* instruction) {
-    return std::find(computation.instructions().begin(),
-                     computation.instructions().end(),
-                     instruction) != computation.instructions().end();
+    return absl::c_linear_search(computation.instructions(), instruction);
   }
 
   // Returns whether the while instruction with name 'while_name' in
@@ -373,9 +371,9 @@ TEST_F(HloModuleDceTest, WhileWithOutfeed) {
   HloModule OutfeedLoop
   WhileBody {
     body_param = (s32[]) parameter(0)
-    token = token[] after-all()
+    token0 = token[] after-all()
     constant.2 = s32[] constant(2)
-    outfeed_tuple = (s32[]) outfeed(constant.2, token)
+    outfeed_tuple = (s32[]) outfeed(constant.2, token0)
     get-tuple-element.1 = s32[] get-tuple-element(body_param), index=0
     constant.1 = s32[] constant(1)
     add = s32[] add(get-tuple-element.1, constant.1)
diff --git a/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
index b4aac4c8076cb69647d42c6243bc969d06d0709e..b877081be5775bf6c75a69ffeba28d0f2cc17f90 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
@@ -79,36 +79,36 @@ Status HloModuleGroupMetadata::Build() {
       return Status::OK();
     }
 
-    std::vector<HloComputation*> peers;
-    if (IsChannelInstruction(hlo)) {
-      peers.push_back(PeerComputation(hlo));
-    } else if (hlo->IsCrossModuleAllReduce()) {
-      for (HloInstruction* instr : GetAllReduceGroup(*hlo->all_reduce_id())) {
-        if (instr == hlo) {
-          continue;
+    if (IsChannelInstruction(hlo) || hlo->IsCrossModuleAllReduce()) {
+      std::vector<HloComputation*> peers;
+      if (IsChannelInstruction(hlo)) {
+        peers.push_back(PeerComputation(hlo));
+      } else if (hlo->IsCrossModuleAllReduce()) {
+        for (HloInstruction* instr : GetAllReduceGroup(*hlo->all_reduce_id())) {
+          if (instr == hlo) {
+            continue;
+          }
+          peers.push_back(instr->parent());
         }
-        peers.push_back(instr->parent());
       }
-    }
-
-    // Add the parent computation of this channel (or all-reduce) instruction
-    // and its peer computation(s) (both must be while computations) as
-    // companions.
-    for (HloComputation* peer_computation : peers) {
-      const TrackedInstruction* peer_tracked =
-          GetTrackedInstruction(peer_computation);
-      TF_RET_CHECK(peer_tracked != nullptr)
-          << "Peer instruction is not a possible companion";
-      TF_RET_CHECK(*tracked == *peer_tracked)
-          << "Peer instruction does not match the computation kind";
-      TF_RETURN_IF_ERROR(
-          AddCompanion(tracked->instruction(), peer_tracked->instruction()));
-      tracked_instructions_comms_[tracked->instruction()].push_back(hlo);
-    }
 
-    // Add the parents of companion instructions (they must be all of the same
-    // kind of instructions, opcode wise) as companions.
-    if (IsCompanionInstruction(hlo)) {
+      // Add the parent computation of this channel (or all-reduce) instruction
+      // and its peer computation(s) (both must be while computations) as
+      // companions.
+      for (HloComputation* peer_computation : peers) {
+        const TrackedInstruction* peer_tracked =
+            GetTrackedInstruction(peer_computation);
+        TF_RET_CHECK(peer_tracked != nullptr)
+            << "Peer instruction is not a possible companion";
+        TF_RET_CHECK(*tracked == *peer_tracked)
+            << "Peer instruction does not match the computation kind";
+        TF_RETURN_IF_ERROR(
+            AddCompanion(tracked->instruction(), peer_tracked->instruction()));
+        tracked_instructions_comms_[tracked->instruction()].push_back(hlo);
+      }
+    } else if (IsCompanionInstruction(hlo)) {
+      // Add the parents of companion instructions (they must be all of the same
+      // kind of instructions, opcode wise) as companions.
       for (HloInstruction* companion : Companions(hlo)) {
         const TrackedInstruction* companion_tracked =
             GetTrackedInstruction(companion->parent());
@@ -118,6 +118,7 @@ Status HloModuleGroupMetadata::Build() {
                                         companion_tracked->instruction()));
       }
     }
+
     return Status::OK();
   };
 
@@ -198,7 +199,7 @@ bool HloModuleGroupMetadata::IsChannelInstruction(
 }
 
 bool HloModuleGroupMetadata::IsCompanionInstruction(HloInstruction* hlo) const {
-  return companion_set_index_.count(hlo) > 0;
+  return companion_set_index_.contains(hlo);
 }
 
 bool HloModuleGroupMetadata::InstructionCommunicates(
@@ -388,9 +389,10 @@ Status HloModuleGroupMetadata::AddCompanion(HloInstruction* instruction1,
                instruction1->opcode() == HloOpcode::kCall);
   VLOG(2) << "adding as companions:" << instruction1->ToString() << " and "
           << instruction2->ToString();
-
-  if (!ContainsKey(companion_set_index_, instruction1) &&
-      !ContainsKey(companion_set_index_, instruction2)) {
+  if (instruction1 == instruction2) {
+    return Status::OK();
+  } else if (!ContainsKey(companion_set_index_, instruction1) &&
+             !ContainsKey(companion_set_index_, instruction2)) {
     companion_sets_.push_back(
         absl::make_unique<std::vector<HloInstruction*>>());
     auto companion_set = companion_sets_.back().get();
@@ -418,7 +420,10 @@ Status HloModuleGroupMetadata::AddCompanion(HloInstruction* instruction1,
     for (HloInstruction* hlo : Companions(instruction2)) {
       companion_set_index_[hlo] = companion_set_index_[instruction1];
     }
-    companion_sets_.erase(companion_sets_.begin() + index_to_remove);
+    // We can't remove the set from the vector because companion_set_index_
+    // references sets by their index in this vector, so we reset to nullptr
+    // instead.
+    companion_sets_[index_to_remove].reset(nullptr);
   }
   return Status::OK();
 }
@@ -509,7 +514,7 @@ Status HloModuleGroupMetadata::CheckCommunicatingInstruction(
   HloComputation* computation = instruction->parent();
   const HloModule* module = computation->parent();
   if (module->entry_computation() == computation ||
-      tracked_instructions_.count(computation) > 0) {
+      tracked_instructions_.contains(computation)) {
     return Status::OK();
   }
   return FailedPrecondition("channel is used in disallowed computation");
diff --git a/tensorflow/compiler/xla/service/hlo_module_group_metadata.h b/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
index 928df0f5a7444ad877961a5de970c752e1d024da..84f7f2f31339ae9e98ea2301b6e6d94fcf4dedbb 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
+++ b/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
@@ -38,7 +38,7 @@ namespace xla {
 // Class for bookkeeping the information on the given modules, in particular on
 // the interaction between computations.
 //
-// Companion instructions are one of the information collected as we build the
+// Companion instructions are one piece of information collected as we build the
 // metadata. For example, for each While instruction, companion instructions
 // refer to a set of While instructions in other computations that communicate
 // with each other.
@@ -51,6 +51,13 @@ namespace xla {
 // }                          While_4() { Recv(0) }
 //                          }
 //
+// Each instruction can belong to at most one companion set: While_0 and While_5
+// are in the same set even though they don't communicate with each other,
+// because they both communicate with While_2.
+//
+// A send and the matching recv must both have the same level of nesting of
+// companion instructions.
+//
 // Companion instructions are used to detect cycles in the graph and also for
 // global scheduling.
 class HloModuleGroupMetadata {
@@ -166,12 +173,13 @@ class HloModuleGroupMetadata {
   // Returns the number of modules for devices (excluding the host module).
   int64 GetDeviceModulesCount() const;
 
-  // Returns the companion instructions for the given instruction.
+  // Returns the companion set for the given instruction, including the
+  // instruction itself.
   //
   // Precondition: IsCompanionWhile(instruction) is true.
   const std::vector<HloInstruction*>& Companions(
       const HloInstruction* instruction) const {
-    CHECK_EQ(companion_set_index_.count(instruction), 1);
+    CHECK(companion_set_index_.contains(instruction));
     return companion_set(companion_set_index_.at(instruction));
   }
 
@@ -215,11 +223,8 @@ class HloModuleGroupMetadata {
   // * Each channel has all 4 instructions (Send, Recv, SendDone, RecvDone).
   // * The shape of channel instructions match.
   // * The nest level of channel instructions match.
-  // * Channel instructions are used in allowed computations; i.e., in the
+  // * Channel instructions are used in allowed computations, i.e., in the
   //   entry computation of the module or condition/body of While computations.
-  //
-  // TODO(b/62064342): Currently, HloModuleGroupScheduler checks if there is a
-  // cycle in the graph, but it would be good to verify here.
   Status VerifyChannelInstructions();
 
   // Adds metadata that the given two instructions are companions.
@@ -231,8 +236,8 @@ class HloModuleGroupMetadata {
   Status CheckCommunicatingInstruction(HloInstruction* instruction) const;
 
   // Performs a consistency check on the companion sets built for the input
-  // modules. Check that a companion set does not include instructions from the
-  // same module/device.
+  // modules. Checks that each instruction in a companion set is in a different
+  // module/device.
   Status VerifyCompanionSets() const;
 
   // Retrieves a pointer to the stored TrackedInstruction associated with a
diff --git a/tensorflow/compiler/xla/service/hlo_module_group_util.cc b/tensorflow/compiler/xla/service/hlo_module_group_util.cc
index fddeb5f0a27a43ff9ca8b2b5d314bcfe91aaf0e6..91417bd2d9a6ca8a5192a37302e6a91e49a94d77 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_util.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_group_util.cc
@@ -198,6 +198,8 @@ std::vector<HloInstruction*> HloModuleGroupUtil::RootInstructions(
   for (HloComputation* computation : computations) {
     for (HloInstruction* instruction : computation->instructions()) {
       if (GlobalSuccessors(instruction).empty()) {
+        // An instruction that has no successors, e.g., an unused instruction,
+        // is in roots, even though it's not the ROOT of its computation.
         roots.push_back(instruction);
       }
     }
diff --git a/tensorflow/compiler/xla/service/hlo_module_group_util.h b/tensorflow/compiler/xla/service/hlo_module_group_util.h
index f21b44bcd98d77b831de5d8a6afa4f9ddd91d15d..862666b48c9aa423ba4eeea3052c17fcc1064fd2 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_util.h
+++ b/tensorflow/compiler/xla/service/hlo_module_group_util.h
@@ -49,7 +49,7 @@ class HloModuleGroupUtil {
   // Returns all unique successors of the instruction. This includes:
   // * successors in the same computation: users and control successors
   // * Send is a successor of Recv
-  // * RecvDone is a predecessor of Send
+  // * RecvDone is a successor of Send
   // * successors of companions (if the instruction is a companion while)
   // * successors' companions (for any successor that is a companion while)
   std::vector<HloInstruction*> GlobalSuccessors(HloInstruction* instruction);
diff --git a/tensorflow/compiler/xla/service/hlo_opcode.cc b/tensorflow/compiler/xla/service/hlo_opcode.cc
index 4551a1c2e259b06818f913cb6a9e782436b7e594..548fbb873aa646e061fb990454bb555d098607d8 100644
--- a/tensorflow/compiler/xla/service/hlo_opcode.cc
+++ b/tensorflow/compiler/xla/service/hlo_opcode.cc
@@ -53,8 +53,8 @@ StatusOr<HloOpcode> StringToHloOpcode(const string& opcode_name) {
 
 bool HloOpcodeIsComparison(HloOpcode opcode) {
   switch (opcode) {
-#define CASE_IS_COMPARISON(enum_name, ...) \
-  case HloOpcode::enum_name:               \
+#define CASE_IS_COMPARISON(enum_name, opcode_name, ...) \
+  case HloOpcode::enum_name:                            \
     return HAS_PROPERTY(kHloOpcodeIsComparison, __VA_ARGS__);
     HLO_OPCODE_LIST(CASE_IS_COMPARISON)
 #undef CASE_IS_COMPARISON
@@ -63,14 +63,25 @@ bool HloOpcodeIsComparison(HloOpcode opcode) {
 
 bool HloOpcodeIsVariadic(HloOpcode opcode) {
   switch (opcode) {
-#define CASE_IS_VARIADIC(enum_name, ...) \
-  case HloOpcode::enum_name:             \
-    return HAS_PROPERTY(kHloOpcodeIsVariadic, __VA_ARGS__);
+#define CASE_IS_VARIADIC(enum_name, opcode_name, arity, ...) \
+  case HloOpcode::enum_name:                                 \
+    return arity == kHloOpcodeIsVariadic;
     HLO_OPCODE_LIST(CASE_IS_VARIADIC)
 #undef CASE_IS_VARIADIC
   }
 }
 
+absl::optional<int> HloOpcodeArity(HloOpcode opcode) {
+  switch (opcode) {
+#define CASE_ARITY(enum_name, opcode_name, arity, ...)   \
+  case HloOpcode::enum_name:                             \
+    return arity == kHloOpcodeIsVariadic ? absl::nullopt \
+                                         : absl::make_optional(arity);
+    HLO_OPCODE_LIST(CASE_ARITY)
+#undef CASE_ARITY
+  }
+}
+
 #undef HAS_PROPERTY
 #undef RESOLVE
 #undef CHECK_DEFAULT
diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h
index 127cfd165a5d8229cac3035f56a66f1bcfa734f3..c571664c81256e8dc319c97ddffa4e0f10609db2 100644
--- a/tensorflow/compiler/xla/service/hlo_opcode.h
+++ b/tensorflow/compiler/xla/service/hlo_opcode.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <iosfwd>
 #include <string>
+#include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 
@@ -30,9 +31,9 @@ namespace xla {
 // See the XLA documentation for the semantics of each opcode.
 //
 // Each entry has the format:
-// (enum_name, opcode_name)
+// (enum_name, opcode_name, arity)
 // or
-// (enum_name, opcode_name, p1 | p2 | ...)
+// (enum_name, opcode_name, arity, p1 | p2 | ...)
 //
 // with p1, p2, ... are members of HloOpcodeProperty. They are combined
 // using bitwise-or.
@@ -44,102 +45,106 @@ namespace xla {
 // - In fully qualified names (HloInstruction::FullyQualifiedName()), to
 //   separate the qualifiers (name of the computation and potentially the
 //   fusion instruction) from the name
-#define HLO_OPCODE_LIST(V)                                   \
-  V(kAbs, "abs")                                             \
-  V(kAdd, "add")                                             \
-  V(kAddDependency, "add-dependency")                        \
-  V(kAfterAll, "after-all", kHloOpcodeIsVariadic)            \
-  V(kAllToAll, "all-to-all")                                 \
-  V(kAtan2, "atan2")                                         \
-  V(kBatchNormGrad, "batch-norm-grad")                       \
-  V(kBatchNormInference, "batch-norm-inference")             \
-  V(kBatchNormTraining, "batch-norm-training")               \
-  V(kBitcast, "bitcast")                                     \
-  V(kBitcastConvert, "bitcast-convert")                      \
-  V(kBroadcast, "broadcast")                                 \
-  V(kCall, "call", kHloOpcodeIsVariadic)                     \
-  V(kCeil, "ceil")                                           \
-  V(kClamp, "clamp")                                         \
-  V(kCollectivePermute, "collective-permute")                \
-  V(kClz, "count-leading-zeros")                             \
-  V(kComplex, "complex")                                     \
-  V(kConcatenate, "concatenate", kHloOpcodeIsVariadic)       \
-  V(kConditional, "conditional")                             \
-  V(kConstant, "constant")                                   \
-  V(kConvert, "convert")                                     \
-  V(kConvolution, "convolution")                             \
-  V(kCopy, "copy")                                           \
-  V(kCos, "cosine")                                          \
-  V(kCrossReplicaSum, "cross-replica-sum")                   \
-  V(kCustomCall, "custom-call")                              \
-  V(kDivide, "divide")                                       \
-  V(kDomain, "domain")                                       \
-  V(kDot, "dot")                                             \
-  V(kDynamicSlice, "dynamic-slice")                          \
-  V(kDynamicUpdateSlice, "dynamic-update-slice")             \
-  V(kEq, "equal-to", kHloOpcodeIsComparison)                 \
-  V(kExp, "exponential")                                     \
-  V(kExpm1, "exponential-minus-one")                         \
-  V(kFft, "fft")                                             \
-  V(kFloor, "floor")                                         \
-  V(kFusion, "fusion", kHloOpcodeIsVariadic)                 \
-  V(kGather, "gather")                                       \
-  V(kGe, "greater-than-or-equal-to", kHloOpcodeIsComparison) \
-  V(kGetDimensionSize, "get-dimension-size")                 \
-  V(kGetTupleElement, "get-tuple-element")                   \
-  V(kGt, "greater-than", kHloOpcodeIsComparison)             \
-  V(kImag, "imag")                                           \
-  V(kInfeed, "infeed")                                       \
-  V(kIota, "iota")                                           \
-  V(kIsFinite, "is-finite")                                  \
-  V(kLe, "less-than-or-equal-to", kHloOpcodeIsComparison)    \
-  V(kLog, "log")                                             \
-  V(kLog1p, "log-plus-one")                                  \
-  V(kAnd, "and")                                             \
-  V(kNot, "not")                                             \
-  V(kOr, "or")                                               \
-  V(kXor, "xor")                                             \
-  V(kLt, "less-than", kHloOpcodeIsComparison)                \
-  V(kMap, "map", kHloOpcodeIsVariadic)                       \
-  V(kMaximum, "maximum")                                     \
-  V(kMinimum, "minimum")                                     \
-  V(kMultiply, "multiply")                                   \
-  V(kNe, "not-equal-to", kHloOpcodeIsComparison)             \
-  V(kNegate, "negate")                                       \
-  V(kOutfeed, "outfeed")                                     \
-  V(kPad, "pad")                                             \
-  V(kParameter, "parameter")                                 \
-  V(kPower, "power")                                         \
-  V(kReal, "real")                                           \
-  V(kRecv, "recv")                                           \
-  V(kRecvDone, "recv-done")                                  \
-  V(kReduce, "reduce")                                       \
-  V(kReducePrecision, "reduce-precision")                    \
-  V(kReduceWindow, "reduce-window")                          \
-  V(kRemainder, "remainder")                                 \
-  V(kReshape, "reshape")                                     \
-  V(kReverse, "reverse")                                     \
-  V(kRng, "rng")                                             \
-  V(kRoundNearestAfz, "round-nearest-afz")                   \
-  V(kScatter, "scatter")                                     \
-  V(kSelect, "select")                                       \
-  V(kSelectAndScatter, "select-and-scatter")                 \
-  V(kSend, "send")                                           \
-  V(kSendDone, "send-done")                                  \
-  V(kShiftLeft, "shift-left")                                \
-  V(kShiftRightArithmetic, "shift-right-arithmetic")         \
-  V(kShiftRightLogical, "shift-right-logical")               \
-  V(kSign, "sign")                                           \
-  V(kSin, "sine")                                            \
-  V(kSlice, "slice")                                         \
-  V(kSort, "sort")                                           \
-  V(kSubtract, "subtract")                                   \
-  V(kTanh, "tanh")                                           \
-  V(kTrace, "trace")                                         \
-  V(kTranspose, "transpose")                                 \
-  V(kTuple, "tuple", kHloOpcodeIsVariadic)                   \
-  V(kTupleSelect, "tuple-select")                            \
-  V(kWhile, "while")
+#define HLO_OPCODE_LIST(V)                                             \
+  V(kAbs, "abs", 1)                                                    \
+  V(kAdd, "add", 2)                                                    \
+  V(kAddDependency, "add-dependency", 2)                               \
+  V(kAfterAll, "after-all", kHloOpcodeIsVariadic)                      \
+  V(kAllReduce, "all-reduce", kHloOpcodeIsVariadic)                    \
+  V(kAllToAll, "all-to-all", kHloOpcodeIsVariadic)                     \
+  V(kAtan2, "atan2", 2)                                                \
+  V(kBatchNormGrad, "batch-norm-grad", 5)                              \
+  V(kBatchNormInference, "batch-norm-inference", 5)                    \
+  V(kBatchNormTraining, "batch-norm-training", 3)                      \
+  V(kBitcast, "bitcast", 1)                                            \
+  V(kBitcastConvert, "bitcast-convert", 1)                             \
+  V(kBroadcast, "broadcast", 1)                                        \
+  V(kCall, "call", kHloOpcodeIsVariadic)                               \
+  V(kCeil, "ceil", 1)                                                  \
+  V(kClamp, "clamp", 3)                                                \
+  V(kCollectivePermute, "collective-permute", 1)                       \
+  V(kClz, "count-leading-zeros", 1)                                    \
+  V(kComplex, "complex", 2)                                            \
+  V(kConcatenate, "concatenate", kHloOpcodeIsVariadic)                 \
+  V(kConditional, "conditional", 3)                                    \
+  V(kConstant, "constant", 0)                                          \
+  V(kConvert, "convert", 1)                                            \
+  V(kConvolution, "convolution", 2)                                    \
+  V(kCopy, "copy", 1)                                                  \
+  V(kCos, "cosine", 1)                                                 \
+  V(kCustomCall, "custom-call", kHloOpcodeIsVariadic)                  \
+  V(kDivide, "divide", 2)                                              \
+  V(kDomain, "domain", 1)                                              \
+  V(kDot, "dot", 2)                                                    \
+  V(kDynamicSlice, "dynamic-slice", kHloOpcodeIsVariadic)              \
+  V(kDynamicUpdateSlice, "dynamic-update-slice", kHloOpcodeIsVariadic) \
+  V(kEq, "equal-to", 2, kHloOpcodeIsComparison)                        \
+  V(kExp, "exponential", 1)                                            \
+  V(kExpm1, "exponential-minus-one", 1)                                \
+  V(kFft, "fft", 1)                                                    \
+  V(kFloor, "floor", 1)                                                \
+  V(kFusion, "fusion", kHloOpcodeIsVariadic)                           \
+  V(kGather, "gather", 2)                                              \
+  V(kGe, "greater-than-or-equal-to", 2, kHloOpcodeIsComparison)        \
+  V(kGetDimensionSize, "get-dimension-size", 1)                        \
+  V(kGetTupleElement, "get-tuple-element", 1)                          \
+  V(kGt, "greater-than", 2, kHloOpcodeIsComparison)                    \
+  V(kImag, "imag", 1)                                                  \
+  V(kInfeed, "infeed", 1)                                              \
+  V(kIota, "iota", 0)                                                  \
+  V(kIsFinite, "is-finite", 1)                                         \
+  V(kLe, "less-than-or-equal-to", 2, kHloOpcodeIsComparison)           \
+  V(kLog, "log", 1)                                                    \
+  V(kLog1p, "log-plus-one", 1)                                         \
+  V(kAnd, "and", 2)                                                    \
+  V(kNot, "not", 1)                                                    \
+  V(kOr, "or", 2)                                                      \
+  V(kXor, "xor", 2)                                                    \
+  V(kLt, "less-than", 2, kHloOpcodeIsComparison)                       \
+  V(kMap, "map", kHloOpcodeIsVariadic)                                 \
+  V(kMaximum, "maximum", 2)                                            \
+  V(kMinimum, "minimum", 2)                                            \
+  V(kMultiply, "multiply", 2)                                          \
+  V(kNe, "not-equal-to", 2, kHloOpcodeIsComparison)                    \
+  V(kNegate, "negate", 1)                                              \
+  V(kOutfeed, "outfeed", 2)                                            \
+  V(kPad, "pad", 2)                                                    \
+  V(kParameter, "parameter", 0)                                        \
+  V(kPower, "power", 2)                                                \
+  V(kReal, "real", 1)                                                  \
+  V(kRecv, "recv", 1)                                                  \
+  V(kRecvDone, "recv-done", 1)                                         \
+  V(kReduce, "reduce", kHloOpcodeIsVariadic)                           \
+  V(kReducePrecision, "reduce-precision", 1)                           \
+  V(kReduceWindow, "reduce-window", 2)                                 \
+  V(kRemainder, "remainder", 2)                                        \
+  V(kReplicaId, "replica-id", 0)                                       \
+  V(kReshape, "reshape", 1)                                            \
+  V(kReverse, "reverse", 1)                                            \
+  V(kRng, "rng", kHloOpcodeIsVariadic)                                 \
+  V(kRoundNearestAfz, "round-nearest-afz", 1)                          \
+  V(kRsqrt, "rsqrt", 1)                                                \
+  V(kScatter, "scatter", 3)                                            \
+  V(kSelect, "select", 3)                                              \
+  V(kSelectAndScatter, "select-and-scatter", 3)                        \
+  V(kSend, "send", 2)                                                  \
+  V(kSendDone, "send-done", 1)                                         \
+  V(kShiftLeft, "shift-left", 2)                                       \
+  V(kShiftRightArithmetic, "shift-right-arithmetic", 2)                \
+  V(kShiftRightLogical, "shift-right-logical", 2)                      \
+  V(kSign, "sign", 1)                                                  \
+  V(kSin, "sine", 1)                                                   \
+  V(kSlice, "slice", 1)                                                \
+  V(kSort, "sort", kHloOpcodeIsVariadic)                               \
+  V(kSqrt, "sqrt", 1)                                                  \
+  V(kSubtract, "subtract", 2)                                          \
+  V(kTanh, "tanh", 1)                                                  \
+  V(kTrace, "trace", 1)                                                \
+  V(kTranspose, "transpose", 1)                                        \
+  V(kTriangularSolve, "triangular-solve", 2)                           \
+  V(kTuple, "tuple", kHloOpcodeIsVariadic)                             \
+  V(kTupleSelect, "tuple-select", 3)                                   \
+  V(kWhile, "while", 1)
 
 enum class HloOpcode {
 #define DECLARE_ENUM(enum_name, opcode_name, ...) enum_name,
@@ -147,12 +152,16 @@ enum class HloOpcode {
 #undef DECLARE_ENUM
 };
 
+// Arity value that denotes that an operator is variadic.
+enum {
+  kHloOpcodeIsVariadic = -1,
+};
+
 // List of properties associated with opcodes.
 // Properties are defined as increasing powers of two, so that we can use
 // bitwise-or to combine properties, and bitwise-and to test for them.
 enum HloOpcodeProperty {
   kHloOpcodeIsComparison = 1 << 0,
-  kHloOpcodeIsVariadic = 1 << 1,
 };
 
 // Returns a string representation of the opcode.
@@ -171,6 +180,10 @@ bool HloOpcodeIsComparison(HloOpcode opcode);
 // Returns true iff the given opcode has variadic operands.
 bool HloOpcodeIsVariadic(HloOpcode opcode);
 
+// Returns the arity of opcode. If the opcode is variadic,
+// returns nullopt.
+absl::optional<int> HloOpcodeArity(HloOpcode opcode);
+
 // Returns the number of HloOpcode values.
 inline const uint32_t HloOpcodeCount() {
 #define HLO_COUNT_ONE(...) +1
diff --git a/tensorflow/compiler/xla/service/hlo_opcode_test.cc b/tensorflow/compiler/xla/service/hlo_opcode_test.cc
index 6f3f83f63a05fafaa3f3ddcff8a7cac7cb7b06d5..c599690f44e4eb2713c287e9f3d89a658771032f 100644
--- a/tensorflow/compiler/xla/service/hlo_opcode_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_opcode_test.cc
@@ -54,11 +54,19 @@ TEST(HloOpcodeTest, OpcodeProperties) {
         EXPECT_FALSE(HloOpcodeIsComparison(opcode));
     }
     switch (opcode) {
+      case HloOpcode::kAfterAll:
+      case HloOpcode::kAllReduce:
+      case HloOpcode::kAllToAll:
       case HloOpcode::kCall:
       case HloOpcode::kConcatenate:
+      case HloOpcode::kCustomCall:
+      case HloOpcode::kDynamicSlice:
+      case HloOpcode::kDynamicUpdateSlice:
       case HloOpcode::kFusion:
       case HloOpcode::kMap:
-      case HloOpcode::kAfterAll:
+      case HloOpcode::kReduce:
+      case HloOpcode::kRng:
+      case HloOpcode::kSort:
       case HloOpcode::kTuple:
         EXPECT_TRUE(HloOpcodeIsVariadic(opcode));
         break;
diff --git a/tensorflow/compiler/xla/service/hlo_ordering.cc b/tensorflow/compiler/xla/service/hlo_ordering.cc
index ca6a154809be46d6a0305c29e2b89219de408019..0cec61c257bb84e467290fb52ec9063a32ed558d 100644
--- a/tensorflow/compiler/xla/service/hlo_ordering.cc
+++ b/tensorflow/compiler/xla/service/hlo_ordering.cc
@@ -367,7 +367,7 @@ bool SequentialHloOrdering::ExecutesBeforeInSameComputation(
     const HloInstruction* a, const HloInstruction* b) const {
   CHECK_EQ(a->parent(), b->parent());
   // If either instruction is not in the order, then 'a' and 'b' are unordered.
-  if (order_position_.count(a) == 0 || order_position_.count(b) == 0) {
+  if (!order_position_.contains(a) || !order_position_.contains(b)) {
     return false;
   }
   return order_position_.at(a) < order_position_.at(b);
diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc
index 9b5bb5d0bd6af104ef62eaa5d3e53cedbe0213d3..4aa1090f48af0d674eb816cf0823395f08cc3836 100644
--- a/tensorflow/compiler/xla/service/hlo_parser.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include <type_traits>
 
 #include "absl/algorithm/container.h"
 #include "absl/memory/memory.h"
@@ -21,10 +22,13 @@ limitations under the License.
 #include "absl/strings/str_format.h"
 #include "absl/strings/str_join.h"
 #include "absl/strings/str_split.h"
+#include "absl/types/variant.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/service/hlo_domain_metadata.h"
 #include "tensorflow/compiler/xla/service/hlo_instructions.h"
+#include "tensorflow/compiler/xla/service/hlo_lexer.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_schedule.h"
 #include "tensorflow/compiler/xla/service/hlo_sharding_metadata.h"
@@ -43,8 +47,6 @@ using absl::StrCat;
 using absl::StrFormat;
 using absl::StrJoin;
 
-const double kF16max = 65504;
-
 // Creates and returns a schedule created using the order of the instructions in
 // the HloComputation::instructions() vectors in the module.
 HloSchedule ScheduleFromInstructionOrder(HloModule* module) {
@@ -59,6 +61,10 @@ HloSchedule ScheduleFromInstructionOrder(HloModule* module) {
   return schedule;
 }
 
+// Some functions accept either a linear index or a multi-dimensional index
+// (used for indexing into sparse literals).
+using LinearOrMultiIndex = absl::variant<int64, absl::Span<const int64>>;
+
 // Parser for the HloModule::ToString() format text.
 class HloParser {
  public:
@@ -74,7 +80,9 @@ class HloParser {
   string GetError() const { return StrJoin(error_, "\n"); }
 
   // Stand alone parsing utils for various aggregate data types.
+  StatusOr<Shape> ParseShapeOnly();
   StatusOr<HloSharding> ParseShardingOnly();
+  StatusOr<std::vector<bool>> ParseParameterReplicationOnly();
   StatusOr<Window> ParseWindowOnly();
   StatusOr<ConvolutionDimensionNumbers> ParseConvolutionDimensionNumbersOnly();
   StatusOr<PaddingConfig> ParsePaddingConfigOnly();
@@ -100,7 +108,7 @@ class HloParser {
   // Parse a single instruction worth of text.
   bool ParseSingleInstruction(HloModule* module);
 
-  // ParseXXX returns false if an error occurred.
+  // Parses a module, returning false if an error occurred.
   bool ParseHloModule(HloModule* module);
 
   bool ParseComputations(HloModule* module);
@@ -116,21 +124,30 @@ class HloParser {
   bool ParseNonTupleLiteral(Literal* literal, const Shape& shape);
   bool ParseDenseLiteral(Literal* literal, const Shape& shape);
   bool ParseSparseLiteral(Literal* literal, const Shape& shape);
-  template <typename LiteralNativeT>
-  bool ParseSparseLiteralHelper(Literal* literal, const Shape& shape);
 
-  // Sets the sub-value of literal at the given index to the given value. The
-  // literal's shape must have the default layout.
-  bool SetValueInLiteral(tensorflow::int64 value,
-                         tensorflow::int64 linear_index, Literal* literal);
-  bool SetValueInLiteral(double value, tensorflow::int64 linear_index,
+  // Sets the sub-value of literal at the given linear or sparse index to the
+  // given value. If the literal is dense, it myst have the default layout.
+  //
+  // `loc` should be the source location of the value.
+  bool SetValueInLiteral(LocTy loc, int64 value, LinearOrMultiIndex index,
+                         Literal* literal);
+  bool SetValueInLiteral(LocTy loc, double value, LinearOrMultiIndex index,
                          Literal* literal);
-  bool SetValueInLiteral(bool value, tensorflow::int64 linear_index,
+  bool SetValueInLiteral(LocTy loc, bool value, LinearOrMultiIndex index,
                          Literal* literal);
+  bool SetValueInLiteral(LocTy loc, std::complex<double> value,
+                         LinearOrMultiIndex index, Literal* literal);
+  // `loc` should be the source location of the value.
+  template <typename LiteralNativeT, typename ParsedElemT>
+  bool SetValueInLiteralHelper(LocTy loc, ParsedElemT value,
+                               LinearOrMultiIndex index, Literal* literal);
+
+  // Checks whether the given value is within the range of LiteralNativeT.
+  // `loc` should be the source location of the value.
   template <typename LiteralNativeT, typename ParsedElemT>
-  bool SetValueInLiteralHelper(ParsedElemT value,
-                               tensorflow::int64 linear_index,
-                               Literal* literal);
+  bool CheckParsedValueIsInRange(LocTy loc, ParsedElemT value);
+  template <typename LiteralNativeT>
+  bool CheckParsedValueIsInRange(LocTy loc, std::complex<double> value);
 
   bool ParseOperands(std::vector<HloInstruction*>* operands);
   // Fills parsed operands into 'operands' and expects a certain number of
@@ -141,9 +158,9 @@ class HloParser {
   // Describes the start, limit, and stride on every dimension of the operand
   // being sliced.
   struct SliceRanges {
-    std::vector<tensorflow::int64> starts;
-    std::vector<tensorflow::int64> limits;
-    std::vector<tensorflow::int64> strides;
+    std::vector<int64> starts;
+    std::vector<int64> limits;
+    std::vector<int64> strides;
   };
 
   // The data parsed for the kDomain instruction.
@@ -163,9 +180,11 @@ class HloParser {
     kBracedInt64ListList,
     kHloComputation,
     kFftType,
+    kTriangularSolveTranspose,
     kWindow,
     kConvolutionDimensionNumbers,
     kSharding,
+    kParameterReplication,
     kInstructionList,
     kSliceRanges,
     kPaddingConfig,
@@ -230,21 +249,21 @@ class HloParser {
   bool ParseMetadata(OpMetadata* metadata);
   bool ParseSharding(OpSharding* sharding);
   bool ParseSingleSharding(OpSharding* sharding, bool lbrace_pre_lexed);
+  bool ParseParameterReplication(ParameterReplication* parameter_replication);
 
   // Parses the metadata behind a kDOmain instruction.
   bool ParseDomain(DomainData* domain);
 
   // Parses a sub-attribute of the window attribute, e.g.,size=1x2x3.
-  bool ParseDxD(const string& name, std::vector<tensorflow::int64>* result);
+  bool ParseDxD(const string& name, std::vector<int64>* result);
   // Parses window's pad sub-attriute, e.g., pad=0_0x3x3.
-  bool ParseWindowPad(std::vector<std::vector<tensorflow::int64>>* pad);
+  bool ParseWindowPad(std::vector<std::vector<int64>>* pad);
 
   bool ParseSliceRanges(SliceRanges* result);
   bool ParsePrecisionList(std::vector<PrecisionConfig::Precision>* result);
   bool ParseShapeList(std::vector<Shape>* result);
   bool ParseInt64List(const TokKind start, const TokKind end,
-                      const TokKind delim,
-                      std::vector<tensorflow::int64>* result);
+                      const TokKind delim, std::vector<int64>* result);
   // 'parse_and_add_item' is an lambda to parse an element in the list and add
   // the parsed element to the result. It's supposed to capture the result.
   bool ParseList(const TokKind start, const TokKind end, const TokKind delim,
@@ -255,14 +274,20 @@ class HloParser {
   bool ParseName(string* result);
   bool ParseAttributeName(string* result);
   bool ParseString(string* result);
+  bool ParseDimensionSizes(std::vector<int64>* dimension_sizes,
+                           std::vector<bool>* dynamic_dimensions);
   bool ParseShape(Shape* result);
+  bool ParseLayout(Layout* layout);
+  bool ParseTiles(std::vector<Tile>* tiles);
   bool ParseOpcode(HloOpcode* result);
   bool ParseFftType(FftType* result);
+  bool ParseTriangularSolveTranspose(TriangularSolveOptions::Transpose* result);
   bool ParseFusionKind(HloInstruction::FusionKind* result);
   bool ParseRandomDistribution(RandomDistribution* result);
   bool ParsePrecision(PrecisionConfig::Precision* result);
-  bool ParseInt64(tensorflow::int64* result);
+  bool ParseInt64(int64* result);
   bool ParseDouble(double* result);
+  bool ParseComplex(std::complex<double>* result);
   bool ParseBool(bool* result);
   bool ParseToken(TokKind kind, const string& msg);
 
@@ -279,9 +304,6 @@ class HloParser {
   // If the current token is 'kind', eats it (i.e. lexes the next token) and
   // returns true.
   bool EatIfPresent(TokKind kind);
-  // Parses a shape, and returns true if the result is compatible with the given
-  // shape.
-  bool EatShapeAndCheckCompatible(const Shape& shape);
 
   // Adds the instruction to the pool. Returns false and emits an error if the
   // instruction already exists.
@@ -625,6 +647,10 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
   std::unordered_map<string, AttrConfig> attrs;
   optional<OpSharding> sharding;
   attrs["sharding"] = {/*required=*/false, AttrTy::kSharding, &sharding};
+  optional<ParameterReplication> parameter_replication;
+  attrs["parameter_replication"] = {/*required=*/false,
+                                    AttrTy::kParameterReplication,
+                                    &parameter_replication};
   optional<std::vector<HloInstruction*>> predecessors;
   attrs["control-predecessors"] = {/*required=*/false, AttrTy::kInstructionList,
                                    &predecessors};
@@ -638,11 +664,17 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
   HloInstruction* instruction;
   switch (opcode) {
     case HloOpcode::kParameter: {
-      tensorflow::int64 parameter_number;
+      int64 parameter_number;
       if (!ParseToken(TokKind::kLparen,
                       "expects '(' before parameter number") ||
-          !ParseInt64(&parameter_number) ||
-          !ParseToken(TokKind::kRparen, "expects ')' after parameter number") ||
+          !ParseInt64(&parameter_number)) {
+        return false;
+      }
+      if (parameter_number < 0) {
+        Error(lexer_.GetLoc(), "parameter number must be >= 0");
+        return false;
+      }
+      if (!ParseToken(TokKind::kRparen, "expects ')' after parameter number") ||
           !ParseAttributes(attrs)) {
         return false;
       }
@@ -664,7 +696,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kIota: {
-      optional<tensorflow::int64> iota_dimension;
+      optional<int64> iota_dimension;
       attrs["iota_dimension"] = {/*required=*/true, AttrTy::kInt64,
                                  &iota_dimension};
       if (!ParseOperands(&operands, /*expected_size=*/0) ||
@@ -693,8 +725,10 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
     case HloOpcode::kNot:
     case HloOpcode::kNegate:
     case HloOpcode::kReal:
+    case HloOpcode::kRsqrt:
     case HloOpcode::kSign:
     case HloOpcode::kSin:
+    case HloOpcode::kSqrt:
     case HloOpcode::kTanh: {
       if (!ParseOperands(&operands, /*expected_size=*/1) ||
           !ParseAttributes(attrs)) {
@@ -766,7 +800,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
           HloInstruction::CreateBitcastConvert(shape, operands[0]));
       break;
     }
-    case HloOpcode::kCrossReplicaSum: {
+    case HloOpcode::kAllReduce: {
       optional<std::vector<std::vector<int64>>> tmp_groups;
       optional<HloComputation*> to_apply;
       optional<std::vector<int64>> replica_group_ids;
@@ -786,10 +820,9 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       if (tmp_groups) {
         replica_groups = CreateReplicaGroups(*tmp_groups);
       }
-      instruction =
-          builder->AddInstruction(HloInstruction::CreateCrossReplicaSum(
-              shape, operands, *to_apply, replica_groups,
-              barrier ? *barrier : "", all_reduce_id));
+      instruction = builder->AddInstruction(HloInstruction::CreateAllReduce(
+          shape, operands, *to_apply, replica_groups, barrier ? *barrier : "",
+          all_reduce_id));
       break;
     }
     case HloOpcode::kAllToAll: {
@@ -829,6 +862,14 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
           HloInstruction::CreateCollectivePermute(shape, operands[0], pairs));
       break;
     }
+    case HloOpcode::kReplicaId: {
+      if (!ParseOperands(&operands, /*expected_size=*/0) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction = builder->AddInstruction(HloInstruction::CreateReplicaId());
+      break;
+    }
     case HloOpcode::kReshape: {
       if (!ParseOperands(&operands, /*expected_size=*/1) ||
           !ParseAttributes(attrs)) {
@@ -860,17 +901,21 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kSort: {
-      optional<std::vector<tensorflow::int64>> dimensions;
+      optional<std::vector<int64>> dimensions;
       attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
                              &dimensions};
+      optional<bool> is_stable = false;
+      attrs["is_stable"] = {/*required=*/false, AttrTy::kBool, &is_stable};
+      optional<HloComputation*> to_apply;
+      attrs["to_apply"] = {/*required=*/true, AttrTy::kHloComputation,
+                           &to_apply};
       if (!ParseOperands(&operands) || !ParseAttributes(attrs) ||
           dimensions->size() != 1) {
         return false;
       }
-      instruction = builder->AddInstruction(HloInstruction::CreateSort(
-          shape, dimensions->at(0),
-          /*keys=*/operands[0],
-          /*values=*/absl::Span<HloInstruction* const>(operands).subspan(1)));
+      instruction = builder->AddInstruction(
+          HloInstruction::CreateSort(shape, dimensions->at(0), operands,
+                                     to_apply.value(), is_stable.value()));
       break;
     }
     case HloOpcode::kTuple: {
@@ -896,7 +941,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kRecv: {
-      optional<tensorflow::int64> channel_id;
+      optional<int64> channel_id;
       // If the is_host_transfer attribute is not present then default to false.
       optional<bool> is_host_transfer = false;
       attrs["channel_id"] = {/*required=*/true, AttrTy::kInt64, &channel_id};
@@ -912,7 +957,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kRecvDone: {
-      optional<tensorflow::int64> channel_id;
+      optional<int64> channel_id;
       // If the is_host_transfer attribute is not present then default to false.
       optional<bool> is_host_transfer = false;
       attrs["channel_id"] = {/*required=*/true, AttrTy::kInt64, &channel_id};
@@ -930,7 +975,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kSend: {
-      optional<tensorflow::int64> channel_id;
+      optional<int64> channel_id;
       // If the is_host_transfer attribute is not present then default to false.
       optional<bool> is_host_transfer = false;
       attrs["channel_id"] = {/*required=*/true, AttrTy::kInt64, &channel_id};
@@ -945,7 +990,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kSendDone: {
-      optional<tensorflow::int64> channel_id;
+      optional<int64> channel_id;
       // If the is_host_transfer attribute is not present then default to false.
       optional<bool> is_host_transfer = false;
       attrs["channel_id"] = {/*required=*/true, AttrTy::kInt64, &channel_id};
@@ -963,7 +1008,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kGetTupleElement: {
-      optional<tensorflow::int64> index;
+      optional<int64> index;
       attrs["index"] = {/*required=*/true, AttrTy::kInt64, &index};
       if (!ParseOperands(&operands, /*expected_size=*/1) ||
           !ParseAttributes(attrs)) {
@@ -1006,11 +1051,14 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       optional<Window> window;
       optional<ConvolutionDimensionNumbers> dnums;
       optional<int64> feature_group_count;
+      optional<int64> batch_group_count;
       attrs["window"] = {/*required=*/false, AttrTy::kWindow, &window};
       attrs["dim_labels"] = {/*required=*/true,
                              AttrTy::kConvolutionDimensionNumbers, &dnums};
       attrs["feature_group_count"] = {/*required=*/false, AttrTy::kInt64,
                                       &feature_group_count};
+      attrs["batch_group_count"] = {/*required=*/false, AttrTy::kInt64,
+                                    &batch_group_count};
       optional<std::vector<PrecisionConfig::Precision>> operand_precision;
       attrs["operand_precision"] = {/*required=*/false, AttrTy::kPrecisionList,
                                     &operand_precision};
@@ -1024,6 +1072,9 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       if (!feature_group_count) {
         feature_group_count = 1;
       }
+      if (!batch_group_count) {
+        batch_group_count = 1;
+      }
       PrecisionConfig precision_config;
       if (operand_precision) {
         *precision_config.mutable_operand_precision() = {
@@ -1034,12 +1085,13 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       }
       instruction = builder->AddInstruction(HloInstruction::CreateConvolve(
           shape, /*lhs=*/operands[0], /*rhs=*/operands[1],
-          feature_group_count.value(), *window, *dnums, precision_config));
+          feature_group_count.value(), batch_group_count.value(), *window,
+          *dnums, precision_config));
       break;
     }
     case HloOpcode::kFft: {
       optional<FftType> fft_type;
-      optional<std::vector<tensorflow::int64>> fft_length;
+      optional<std::vector<int64>> fft_length;
       attrs["fft_type"] = {/*required=*/true, AttrTy::kFftType, &fft_type};
       attrs["fft_length"] = {/*required=*/true, AttrTy::kBracedInt64List,
                              &fft_length};
@@ -1051,8 +1103,40 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
           shape, operands[0], *fft_type, *fft_length));
       break;
     }
+    case HloOpcode::kTriangularSolve: {
+      optional<bool> left_side;
+      optional<bool> lower;
+      optional<bool> unit_diagonal;
+      optional<TriangularSolveOptions::Transpose> transpose_a;
+      attrs["left_side"] = {/*required=*/false, AttrTy::kBool, &left_side};
+      attrs["lower"] = {/*required=*/false, AttrTy::kBool, &lower};
+      attrs["unit_diagonal"] = {/*required=*/false, AttrTy::kBool,
+                                &unit_diagonal};
+      attrs["transpose_a"] = {/*required=*/false,
+                              AttrTy::kTriangularSolveTranspose, &transpose_a};
+      if (!ParseOperands(&operands, /*expected_size=*/2) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+      TriangularSolveOptions options;
+      if (left_side) {
+        options.set_left_side(*left_side);
+      }
+      if (lower) {
+        options.set_lower(*lower);
+      }
+      if (unit_diagonal) {
+        options.set_unit_diagonal(*unit_diagonal);
+      }
+      options.set_transpose_a(
+          transpose_a ? *transpose_a : TriangularSolveOptions::NO_TRANSPOSE);
+      instruction =
+          builder->AddInstruction(HloInstruction::CreateTriangularSolve(
+              shape, operands[0], operands[1], options));
+      break;
+    }
     case HloOpcode::kBroadcast: {
-      optional<std::vector<tensorflow::int64>> broadcast_dimensions;
+      optional<std::vector<int64>> broadcast_dimensions;
       attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
                              &broadcast_dimensions};
       if (!ParseOperands(&operands, /*expected_size=*/1) ||
@@ -1064,7 +1148,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kConcatenate: {
-      optional<std::vector<tensorflow::int64>> dimensions;
+      optional<std::vector<int64>> dimensions;
       attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
                              &dimensions};
       if (!ParseOperands(&operands) || !ParseAttributes(attrs) ||
@@ -1079,7 +1163,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       optional<HloComputation*> to_apply;
       attrs["to_apply"] = {/*required=*/true, AttrTy::kHloComputation,
                            &to_apply};
-      optional<std::vector<tensorflow::int64>> dimensions;
+      optional<std::vector<int64>> dimensions;
       attrs["dimensions"] = {/*required=*/false, AttrTy::kBracedInt64List,
                              &dimensions};
       if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
@@ -1095,7 +1179,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       optional<HloComputation*> reduce_computation;
       attrs["to_apply"] = {/*required=*/true, AttrTy::kHloComputation,
                            &reduce_computation};
-      optional<std::vector<tensorflow::int64>> dimensions_to_reduce;
+      optional<std::vector<int64>> dimensions_to_reduce;
       attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
                              &dimensions_to_reduce};
       if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
@@ -1116,7 +1200,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kReverse: {
-      optional<std::vector<tensorflow::int64>> dimensions;
+      optional<std::vector<int64>> dimensions;
       attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
                              &dimensions};
       if (!ParseOperands(&operands, /*expected_size=*/1) ||
@@ -1160,31 +1244,46 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kDynamicSlice: {
-      optional<std::vector<tensorflow::int64>> dynamic_slice_sizes;
+      optional<std::vector<int64>> dynamic_slice_sizes;
       attrs["dynamic_slice_sizes"] = {
           /*required=*/true, AttrTy::kBracedInt64List, &dynamic_slice_sizes};
-      if (!ParseOperands(&operands, /*expected_size=*/2) ||
-          !ParseAttributes(attrs)) {
+      LocTy loc = lexer_.GetLoc();
+      if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
         return false;
       }
+      if (operands.empty()) {
+        return Error(loc, "Expected at least one operand.");
+      }
+      if (!(operands.size() == 2 && operands[1]->shape().rank() == 1) &&
+          operands.size() != 1 + operands[0]->shape().rank()) {
+        return Error(loc, "Wrong number of operands.");
+      }
       instruction = builder->AddInstruction(HloInstruction::CreateDynamicSlice(
-          shape, /*operand=*/operands[0], /*start_indices=*/operands[1],
+          shape, /*operand=*/operands[0],
+          /*start_indices=*/absl::MakeSpan(operands).subspan(1),
           *dynamic_slice_sizes));
       break;
     }
     case HloOpcode::kDynamicUpdateSlice: {
-      if (!ParseOperands(&operands, /*expected_size=*/3) ||
-          !ParseAttributes(attrs)) {
+      LocTy loc = lexer_.GetLoc();
+      if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
         return false;
       }
+      if (operands.size() < 2) {
+        return Error(loc, "Expected at least two operands.");
+      }
+      if (!(operands.size() == 3 && operands[2]->shape().rank() == 1) &&
+          operands.size() != 2 + operands[0]->shape().rank()) {
+        return Error(loc, "Wrong number of operands.");
+      }
       instruction =
           builder->AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
               shape, /*operand=*/operands[0], /*update=*/operands[1],
-              /*start_indices=*/operands[2]));
+              /*start_indices=*/absl::MakeSpan(operands).subspan(2)));
       break;
     }
     case HloOpcode::kTranspose: {
-      optional<std::vector<tensorflow::int64>> dimensions;
+      optional<std::vector<int64>> dimensions;
       attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
                              &dimensions};
       if (!ParseOperands(&operands, /*expected_size=*/1) ||
@@ -1198,7 +1297,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
     case HloOpcode::kBatchNormTraining: {
       optional<float> epsilon;
       attrs["epsilon"] = {/*required=*/true, AttrTy::kFloat, &epsilon};
-      optional<tensorflow::int64> feature_index;
+      optional<int64> feature_index;
       attrs["feature_index"] = {/*required=*/true, AttrTy::kInt64,
                                 &feature_index};
       if (!ParseOperands(&operands, /*expected_size=*/3) ||
@@ -1214,7 +1313,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
     case HloOpcode::kBatchNormInference: {
       optional<float> epsilon;
       attrs["epsilon"] = {/*required=*/true, AttrTy::kFloat, &epsilon};
-      optional<tensorflow::int64> feature_index;
+      optional<int64> feature_index;
       attrs["feature_index"] = {/*required=*/true, AttrTy::kInt64,
                                 &feature_index};
       if (!ParseOperands(&operands, /*expected_size=*/5) ||
@@ -1231,7 +1330,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
     case HloOpcode::kBatchNormGrad: {
       optional<float> epsilon;
       attrs["epsilon"] = {/*required=*/true, AttrTy::kFloat, &epsilon};
-      optional<tensorflow::int64> feature_index;
+      optional<int64> feature_index;
       attrs["feature_index"] = {/*required=*/true, AttrTy::kInt64,
                                 &feature_index};
       if (!ParseOperands(&operands, /*expected_size=*/5) ||
@@ -1280,7 +1379,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       // the infeed instruction. ShapeUtil::GetTupleElementShape will check fail
       // if the shape is not a non-empty tuple, so add guard so an error message
       // can be emitted instead of a check fail
-      if (!ShapeUtil::IsTuple(shape) && !ShapeUtil::IsEmptyTuple(shape)) {
+      if (!shape.IsTuple() && !ShapeUtil::IsEmptyTuple(shape)) {
         return Error(lexer_.GetLoc(),
                      "infeed must have a non-empty tuple shape");
       }
@@ -1313,8 +1412,8 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kReducePrecision: {
-      optional<tensorflow::int64> exponent_bits;
-      optional<tensorflow::int64> mantissa_bits;
+      optional<int64> exponent_bits;
+      optional<int64> mantissa_bits;
       attrs["exponent_bits"] = {/*required=*/true, AttrTy::kInt64,
                                 &exponent_bits};
       attrs["mantissa_bits"] = {/*required=*/true, AttrTy::kInt64,
@@ -1352,6 +1451,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       optional<Window> window;
       optional<ConvolutionDimensionNumbers> dnums;
       optional<int64> feature_group_count;
+      optional<int64> batch_group_count;
       optional<std::vector<Shape>> operand_layout_constraints;
       attrs["custom_call_target"] = {/*required=*/true, AttrTy::kString,
                                      &custom_call_target};
@@ -1361,6 +1461,8 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
                              AttrTy::kConvolutionDimensionNumbers, &dnums};
       attrs["feature_group_count"] = {/*required=*/false, AttrTy::kInt64,
                                       &feature_group_count};
+      attrs["batch_group_count"] = {/*required=*/false, AttrTy::kInt64,
+                                    &batch_group_count};
       attrs["operand_layout_constraints"] = {
           /*required=*/false, AttrTy::kShapeList, &operand_layout_constraints};
       if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
@@ -1416,19 +1518,22 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       if (feature_group_count.has_value()) {
         instruction->set_feature_group_count(*feature_group_count);
       }
+      if (batch_group_count.has_value()) {
+        instruction->set_batch_group_count(*batch_group_count);
+      }
       break;
     }
     case HloOpcode::kDot: {
-      optional<std::vector<tensorflow::int64>> lhs_contracting_dims;
+      optional<std::vector<int64>> lhs_contracting_dims;
       attrs["lhs_contracting_dims"] = {
           /*required=*/false, AttrTy::kBracedInt64List, &lhs_contracting_dims};
-      optional<std::vector<tensorflow::int64>> rhs_contracting_dims;
+      optional<std::vector<int64>> rhs_contracting_dims;
       attrs["rhs_contracting_dims"] = {
           /*required=*/false, AttrTy::kBracedInt64List, &rhs_contracting_dims};
-      optional<std::vector<tensorflow::int64>> lhs_batch_dims;
+      optional<std::vector<int64>> lhs_batch_dims;
       attrs["lhs_batch_dims"] = {/*required=*/false, AttrTy::kBracedInt64List,
                                  &lhs_batch_dims};
-      optional<std::vector<tensorflow::int64>> rhs_batch_dims;
+      optional<std::vector<int64>> rhs_batch_dims;
       attrs["rhs_batch_dims"] = {/*required=*/false, AttrTy::kBracedInt64List,
                                  &rhs_batch_dims};
       optional<std::vector<PrecisionConfig::Precision>> operand_precision;
@@ -1472,19 +1577,19 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kGather: {
-      optional<std::vector<tensorflow::int64>> offset_dims;
+      optional<std::vector<int64>> offset_dims;
       attrs["offset_dims"] = {/*required=*/true, AttrTy::kBracedInt64List,
                               &offset_dims};
-      optional<std::vector<tensorflow::int64>> collapsed_slice_dims;
+      optional<std::vector<int64>> collapsed_slice_dims;
       attrs["collapsed_slice_dims"] = {
           /*required=*/true, AttrTy::kBracedInt64List, &collapsed_slice_dims};
-      optional<std::vector<tensorflow::int64>> start_index_map;
+      optional<std::vector<int64>> start_index_map;
       attrs["start_index_map"] = {/*required=*/true, AttrTy::kBracedInt64List,
                                   &start_index_map};
-      optional<tensorflow::int64> index_vector_dim;
+      optional<int64> index_vector_dim;
       attrs["index_vector_dim"] = {/*required=*/true, AttrTy::kInt64,
                                    &index_vector_dim};
-      optional<std::vector<tensorflow::int64>> slice_sizes;
+      optional<std::vector<int64>> slice_sizes;
       attrs["slice_sizes"] = {/*required=*/true, AttrTy::kBracedInt64List,
                               &slice_sizes};
 
@@ -1506,17 +1611,17 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kScatter: {
-      optional<std::vector<tensorflow::int64>> update_window_dims;
+      optional<std::vector<int64>> update_window_dims;
       attrs["update_window_dims"] = {
           /*required=*/true, AttrTy::kBracedInt64List, &update_window_dims};
-      optional<std::vector<tensorflow::int64>> inserted_window_dims;
+      optional<std::vector<int64>> inserted_window_dims;
       attrs["inserted_window_dims"] = {
           /*required=*/true, AttrTy::kBracedInt64List, &inserted_window_dims};
-      optional<std::vector<tensorflow::int64>> scatter_dims_to_operand_dims;
+      optional<std::vector<int64>> scatter_dims_to_operand_dims;
       attrs["scatter_dims_to_operand_dims"] = {/*required=*/true,
                                                AttrTy::kBracedInt64List,
                                                &scatter_dims_to_operand_dims};
-      optional<tensorflow::int64> index_vector_dim;
+      optional<int64> index_vector_dim;
       attrs["index_vector_dim"] = {/*required=*/true, AttrTy::kInt64,
                                    &index_vector_dim};
 
@@ -1557,7 +1662,7 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       return TokenError(StrCat("parsing not yet implemented for op: ",
                                HloOpcodeString(opcode)));
     case HloOpcode::kGetDimensionSize:
-      optional<std::vector<tensorflow::int64>> dimensions;
+      optional<std::vector<int64>> dimensions;
       attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
                              &dimensions};
       if (!ParseOperands(&operands, /*expected_size=*/1) ||
@@ -1582,6 +1687,18 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
     instruction->set_sharding(
         HloSharding::FromProto(sharding.value()).ValueOrDie());
   }
+  if (parameter_replication) {
+    int leaf_count = ShapeUtil::GetLeafCount(instruction->shape());
+    const auto& replicated =
+        parameter_replication->replicated_at_leaf_buffers();
+    if (leaf_count != replicated.size()) {
+      return Error(lexer_.GetLoc(),
+                   StrCat("parameter has ", leaf_count,
+                          " leaf buffers, but parameter_replication has ",
+                          replicated.size(), " elements."));
+    }
+    instruction->set_parameter_replicated_at_leaf_buffers(replicated);
+  }
   if (predecessors) {
     for (auto* pre : *predecessors) {
       Status status = pre->AddControlDependencyTo(instruction);
@@ -1646,8 +1763,8 @@ bool HloParser::ParseSingleSharding(OpSharding* sharding,
   LocTy loc = lexer_.GetLoc();
   bool maximal = false;
   bool replicated = false;
-  std::vector<tensorflow::int64> devices;
-  std::vector<tensorflow::int64> tile_assignment_dimensions;
+  std::vector<int64> devices;
+  std::vector<int64> tile_assignment_dimensions;
   while (lexer_.GetKind() != TokKind::kRbrace) {
     switch (lexer_.GetKind()) {
       case TokKind::kw_maximal:
@@ -1673,7 +1790,7 @@ bool HloParser::ParseSingleSharding(OpSharding* sharding,
           }
 
           do {
-            tensorflow::int64 dim;
+            int64 dim;
             if (!ParseInt64(&dim)) {
               return false;
             }
@@ -1685,7 +1802,7 @@ bool HloParser::ParseSingleSharding(OpSharding* sharding,
             return false;
           }
           do {
-            tensorflow::int64 device;
+            int64 device;
             if (!ParseInt64(&device)) {
               return false;
             }
@@ -1697,11 +1814,6 @@ bool HloParser::ParseSingleSharding(OpSharding* sharding,
         }
         break;
       }
-      case TokKind::kShape:
-        // TODO(b/112302613): Left here for backward compatibility to ignore the
-        // removed tile shape data.
-        lexer_.Lex();
-        break;
       case TokKind::kRbrace:
         break;
       default:
@@ -1734,10 +1846,10 @@ bool HloParser::ParseSingleSharding(OpSharding* sharding,
           "dimensions");
     }
     sharding->set_type(OpSharding::Type::OpSharding_Type_OTHER);
-    for (tensorflow::int64 dim : tile_assignment_dimensions) {
+    for (int64 dim : tile_assignment_dimensions) {
       sharding->add_tile_assignment_dimensions(dim);
     }
-    for (tensorflow::int64 device : devices) {
+    for (int64 device : devices) {
       sharding->add_tile_assignment_devices(device);
     }
   }
@@ -1746,6 +1858,32 @@ bool HloParser::ParseSingleSharding(OpSharding* sharding,
   return true;
 }
 
+// parameter_replication ::=
+//   '{' ('true' | 'false')* (',' ('true' | 'false'))*  '}'
+bool HloParser::ParseParameterReplication(
+    ParameterReplication* parameter_replication) {
+  if (!ParseToken(TokKind::kLbrace,
+                  "expected '{' to start parameter_replication attribute")) {
+    return false;
+  }
+
+  if (lexer_.GetKind() != TokKind::kRbrace) {
+    do {
+      if (lexer_.GetKind() == TokKind::kw_true) {
+        parameter_replication->add_replicated_at_leaf_buffers(true);
+      } else if (lexer_.GetKind() == TokKind::kw_false) {
+        parameter_replication->add_replicated_at_leaf_buffers(false);
+      } else {
+        return false;
+      }
+      lexer_.Lex();
+    } while (EatIfPresent(TokKind::kComma));
+  }
+
+  return ParseToken(TokKind::kRbrace,
+                    "expected '}' to end parameter_replication attribute");
+}
+
 // domain ::= '{' 'kind=' domain_kind ',' 'entry=' entry_sharding ','
 //            'exit=' exit_sharding '}'
 bool HloParser::ParseDomain(DomainData* domain) {
@@ -1798,142 +1936,145 @@ bool HloParser::ParseInstructionNames(
                     "expects '}' at the end of instruction name list");
 }
 
-bool HloParser::SetValueInLiteral(tensorflow::int64 value,
-                                  tensorflow::int64 linear_index,
-                                  Literal* literal) {
+bool HloParser::SetValueInLiteral(LocTy loc, int64 value,
+                                  LinearOrMultiIndex index, Literal* literal) {
   const Shape& shape = literal->shape();
   switch (shape.element_type()) {
     case S8:
-      return SetValueInLiteralHelper<tensorflow::int8>(value, linear_index,
-                                                       literal);
+      return SetValueInLiteralHelper<int8>(loc, value, index, literal);
     case S16:
-      return SetValueInLiteralHelper<tensorflow::int16>(value, linear_index,
-                                                        literal);
+      return SetValueInLiteralHelper<int16>(loc, value, index, literal);
     case S32:
-      return SetValueInLiteralHelper<tensorflow::int32>(value, linear_index,
-                                                        literal);
+      return SetValueInLiteralHelper<int32>(loc, value, index, literal);
     case S64:
-      return SetValueInLiteralHelper<tensorflow::int64>(value, linear_index,
-                                                        literal);
+      return SetValueInLiteralHelper<int64>(loc, value, index, literal);
     case U8:
-      return SetValueInLiteralHelper<tensorflow::uint8>(value, linear_index,
+      return SetValueInLiteralHelper<tensorflow::uint8>(loc, value, index,
                                                         literal);
     case U16:
-      return SetValueInLiteralHelper<tensorflow::uint16>(value, linear_index,
+      return SetValueInLiteralHelper<tensorflow::uint16>(loc, value, index,
                                                          literal);
     case U32:
-      return SetValueInLiteralHelper<tensorflow::uint32>(value, linear_index,
+      return SetValueInLiteralHelper<tensorflow::uint32>(loc, value, index,
                                                          literal);
     case U64:
-      return SetValueInLiteralHelper<tensorflow::uint64>(value, linear_index,
+      return SetValueInLiteralHelper<tensorflow::uint64>(loc, value, index,
                                                          literal);
     case PRED:
       // Bool type literals with rank >= 1 are printed in 0s and 1s.
-      return SetValueInLiteralHelper<bool>(static_cast<bool>(value),
-                                           linear_index, literal);
+      return SetValueInLiteralHelper<bool>(loc, static_cast<bool>(value), index,
+                                           literal);
     default:
       LOG(FATAL) << "unknown integral primitive type "
                  << PrimitiveType_Name(shape.element_type());
   }
 }
 
-bool HloParser::SetValueInLiteral(double value, tensorflow::int64 linear_index,
-                                  Literal* literal) {
+bool HloParser::SetValueInLiteral(LocTy loc, double value,
+                                  LinearOrMultiIndex index, Literal* literal) {
   const Shape& shape = literal->shape();
   switch (shape.element_type()) {
     case F16:
-      return SetValueInLiteralHelper<Eigen::half>(value, linear_index, literal);
+      return SetValueInLiteralHelper<Eigen::half>(loc, value, index, literal);
     case BF16:
-      return SetValueInLiteralHelper<tensorflow::bfloat16>(value, linear_index,
+      return SetValueInLiteralHelper<tensorflow::bfloat16>(loc, value, index,
                                                            literal);
     case F32:
-      return SetValueInLiteralHelper<float>(value, linear_index, literal);
+      return SetValueInLiteralHelper<float>(loc, value, index, literal);
     case F64:
-      return SetValueInLiteralHelper<double>(value, linear_index, literal);
+      return SetValueInLiteralHelper<double>(loc, value, index, literal);
     default:
       LOG(FATAL) << "unknown floating point primitive type "
                  << PrimitiveType_Name(shape.element_type());
   }
 }
 
-bool HloParser::SetValueInLiteral(bool value, tensorflow::int64 linear_index,
-                                  Literal* literal) {
+bool HloParser::SetValueInLiteral(LocTy loc, bool value,
+                                  LinearOrMultiIndex index, Literal* literal) {
   const Shape& shape = literal->shape();
   switch (shape.element_type()) {
     case PRED:
-      return SetValueInLiteralHelper<bool>(value, linear_index, literal);
+      return SetValueInLiteralHelper<bool>(loc, value, index, literal);
     default:
       LOG(FATAL) << PrimitiveType_Name(shape.element_type())
                  << " is not PRED type";
   }
 }
 
+bool HloParser::SetValueInLiteral(LocTy loc, std::complex<double> value,
+                                  LinearOrMultiIndex index, Literal* literal) {
+  const Shape& shape = literal->shape();
+  switch (shape.element_type()) {
+    case C64:
+      return SetValueInLiteralHelper<std::complex<float>>(loc, value, index,
+                                                          literal);
+    case C128:
+      return SetValueInLiteralHelper<std::complex<double>>(loc, value, index,
+                                                           literal);
+    default:
+      LOG(FATAL) << PrimitiveType_Name(shape.element_type())
+                 << " is not a complex type type";
+  }
+}
+
+template <typename T>
+string StringifyValue(T val) {
+  return StrCat(val);
+}
+template <>
+string StringifyValue(std::complex<double> val) {
+  return StrFormat("(%f, %f)", std::real(val), std::imag(val));
+}
+
 template <typename LiteralNativeT, typename ParsedElemT>
-bool HloParser::SetValueInLiteralHelper(ParsedElemT value,
-                                        tensorflow::int64 linear_index,
+bool HloParser::SetValueInLiteralHelper(LocTy loc, ParsedElemT value,
+                                        LinearOrMultiIndex index,
                                         Literal* literal) {
-  // Check that linear_index is in range.
-  if (linear_index >= ShapeUtil::ElementsIn(literal->shape())) {
-    return TokenError(
-        StrCat("trys to set value ", value, " to a literal in shape ",
-               ShapeUtil::HumanString(literal->shape()), " at linear index ",
-               linear_index, ", but the index is out of range"));
+  if (!CheckParsedValueIsInRange<LiteralNativeT>(loc, value)) {
+    return false;
   }
 
-  if (std::isnan(value) ||
-      (std::numeric_limits<ParsedElemT>::has_infinity &&
-       (std::numeric_limits<ParsedElemT>::infinity() == value ||
-        -std::numeric_limits<ParsedElemT>::infinity() == value))) {
-    // Skip range checking for non-finite value.
-  } else if (literal->shape().element_type() == F16 ||
-             literal->shape().element_type() == BF16) {
-    if (value > kF16max || value < -kF16max) {
-      return TokenError(StrCat(
-          "value ", value, " is out of range for literal's primitive type ",
-          PrimitiveType_Name(literal->shape().element_type())));
+  // Check that the index is in range and assign into the literal
+  if (auto* linear_index = absl::get_if<int64>(&index)) {
+    if (*linear_index >= ShapeUtil::ElementsIn(literal->shape())) {
+      return Error(loc, StrCat("trys to set value ", StringifyValue(value),
+                               " to a literal in shape ",
+                               ShapeUtil::HumanString(literal->shape()),
+                               " at linear index ", *linear_index,
+                               ", but the index is out of range"));
     }
-  } else if (std::is_unsigned<LiteralNativeT>::value) {
-    CHECK((std::is_same<ParsedElemT, tensorflow::int64>::value ||
-           std::is_same<ParsedElemT, bool>::value))
-        << "Unimplemented checking for ParsedElemT";
-
-    ParsedElemT upper_bound;
-    if (sizeof(LiteralNativeT) >= sizeof(ParsedElemT)) {
-      upper_bound = std::numeric_limits<ParsedElemT>::max();
-    } else {
-      upper_bound =
-          static_cast<ParsedElemT>(std::numeric_limits<LiteralNativeT>::max());
-    }
-    if (value > upper_bound || value < 0) {
-      // Value is out of range for LiteralNativeT.
-      return TokenError(StrCat(
-          "value ", value, " is out of range for literal's primitive type ",
-          PrimitiveType_Name(literal->shape().element_type())));
-    }
-  } else if (value > static_cast<ParsedElemT>(
-                         std::numeric_limits<LiteralNativeT>::max()) ||
-             value < static_cast<ParsedElemT>(
-                         std::numeric_limits<LiteralNativeT>::lowest())) {
-    // Value is out of range for LiteralNativeT.
-    return TokenError(StrCat(
-        "value ", value, " is out of range for literal's primitive type ",
-        PrimitiveType_Name(literal->shape().element_type())));
-  }
+    literal->data<LiteralNativeT>().at(*linear_index) =
+        static_cast<LiteralNativeT>(value);
+  } else {
+    auto* multi_index = absl::get_if<absl::Span<const int64>>(&index);
+    CHECK(multi_index != nullptr);
 
-  literal->data<LiteralNativeT>().at(linear_index) =
-      static_cast<LiteralNativeT>(value);
-  return true;
-}
+    auto invalid_idx = [&](string msg) {
+      return Error(loc, StrFormat("Invalid sparse index [%s]. %s",
+                                  absl::StrJoin(*multi_index, ", "), msg));
+    };
 
-bool HloParser::EatShapeAndCheckCompatible(const Shape& shape) {
-  Shape new_shape;
-  if (!ParseShape(&new_shape)) {
-    return TokenError(StrCat("expects shape ", ShapeUtil::HumanString(shape)));
-  }
-  if (!ShapeUtil::Compatible(shape, new_shape)) {
-    return TokenError(StrCat(
-        "expects shape ", ShapeUtil::HumanString(shape),
-        ", but sees a different shape: ", ShapeUtil::HumanString(new_shape)));
+    const auto& shape = literal->shape();
+    if (shape.rank() != multi_index->size()) {
+      return invalid_idx(
+          StrFormat("Has rank %d, but constant has shape %s, which has rank %d",
+                    multi_index->size(), shape.ToString(), shape.rank()));
+    }
+    for (int64 i = 0; i < shape.rank(); ++i) {
+      auto idx = (*multi_index)[i];
+      if (idx < 0) {
+        return invalid_idx(StrFormat(
+            "Sub-index value at %d, namely %d, cannot be negative.", i, idx));
+      }
+      if (idx >= shape.dimensions(i)) {
+        return invalid_idx(
+            StrFormat("Sub-index at %d, namely %d, doesn't fit within shape "
+                      "dimension %d in %s",
+                      i, idx, shape.dimensions(i), shape.ToString()));
+      }
+    }
+    literal->AppendSparseElement(*multi_index,
+                                 static_cast<LiteralNativeT>(value));
   }
   return true;
 }
@@ -1942,8 +2083,8 @@ bool HloParser::EatShapeAndCheckCompatible(const Shape& shape) {
 //  ::= tuple
 //  ::= non_tuple
 bool HloParser::ParseLiteral(Literal* literal, const Shape& shape) {
-  return ShapeUtil::IsTuple(shape) ? ParseTupleLiteral(literal, shape)
-                                   : ParseNonTupleLiteral(literal, shape);
+  return shape.IsTuple() ? ParseTupleLiteral(literal, shape)
+                         : ParseNonTupleLiteral(literal, shape);
 }
 
 // tuple
@@ -1952,10 +2093,6 @@ bool HloParser::ParseLiteral(Literal* literal, const Shape& shape) {
 //  ::= /*empty*/
 //  ::= literal (',' literal)*
 bool HloParser::ParseTupleLiteral(Literal* literal, const Shape& shape) {
-  if (!EatShapeAndCheckCompatible(shape)) {
-    return TokenError(StrCat("expects tuple constant in shape ",
-                             ShapeUtil::HumanString(shape)));
-  }
   if (!ParseToken(TokKind::kLparen, "expects '(' in front of tuple elements")) {
     return false;
   }
@@ -1990,21 +2127,21 @@ bool HloParser::ParseNonTupleLiteral(Literal* literal, const Shape& shape) {
     return ParseSparseLiteral(literal, shape);
   }
 
-  CHECK(LayoutUtil::IsDenseArray(shape));
+  CHECK(LayoutUtil::IsDenseArray(shape)) << shape.ToString(true);
   return ParseDenseLiteral(literal, shape);
 }
 
 bool HloParser::ParseDenseLiteral(Literal* literal, const Shape& shape) {
-  const tensorflow::int64 rank = ShapeUtil::Rank(shape);
-  if (rank > 1 && !EatShapeAndCheckCompatible(shape)) {
-    return false;
-  }
+  // Cast `rank` to int because we call shape.dimensions(int rank) below, and if
+  // `rank` is an int64, that's an implicit narrowing conversion, which is
+  // implementation-defined behavior.
+  const int rank = static_cast<int>(shape.rank());
 
   // Create a literal with the given shape in default layout.
   *literal = LiteralUtil::CreateFromDimensions(
       shape.element_type(), AsInt64Slice(shape.dimensions()));
-  tensorflow::int64 nest_level = 0;
-  tensorflow::int64 linear_index = 0;
+  int64 nest_level = 0;
+  int64 linear_index = 0;
   // elems_seen_per_dim[i] is how many elements or sub-arrays we have seen for
   // the dimension i. For example, to parse f32[2,3] {{1, 2, 3}, {4, 5, 6}},
   // when we are parsing the 2nd '{' (right before '1'), we are seeing a
@@ -2012,17 +2149,35 @@ bool HloParser::ParseDenseLiteral(Literal* literal, const Shape& shape) {
   // the first '}' (right after '3'), it means the sub-array ends, and the
   // sub-array is supposed to contain exactly 3 elements, so check if
   // elems_seen_per_dim[1] is 3.
-  std::vector<tensorflow::int64> elems_seen_per_dim(rank);
+  std::vector<int64> elems_seen_per_dim(rank);
   auto get_index_str = [&elems_seen_per_dim](int dim) -> string {
-    std::vector<tensorflow::int64> elems_seen_until_dim(
-        elems_seen_per_dim.begin(), elems_seen_per_dim.begin() + dim);
+    std::vector<int64> elems_seen_until_dim(elems_seen_per_dim.begin(),
+                                            elems_seen_per_dim.begin() + dim);
     return StrCat("[",
                   StrJoin(elems_seen_until_dim, ",",
-                          [](string* out, const tensorflow::int64& num_elems) {
+                          [](string* out, const int64& num_elems) {
                             StrAppend(out, num_elems - 1);
                           }),
                   "]");
   };
+
+  auto add_one_elem_seen = [&] {
+    if (rank > 0) {
+      if (nest_level != rank) {
+        return TokenError(absl::StrFormat(
+            "expects nested array in rank %d, but sees %d", rank, nest_level));
+      }
+      elems_seen_per_dim[rank - 1]++;
+      if (elems_seen_per_dim[rank - 1] > shape.dimensions(rank - 1)) {
+        return TokenError(absl::StrFormat(
+            "expects %d elements on the minor-most dimension, but "
+            "sees more",
+            shape.dimensions(rank - 1)));
+      }
+    }
+    return true;
+  };
+
   do {
     switch (lexer_.GetKind()) {
       default:
@@ -2058,6 +2213,31 @@ bool HloParser::ParseDenseLiteral(Literal* literal, const Shape& shape) {
         lexer_.Lex();
         break;
       }
+      case TokKind::kLparen: {
+        if (!primitive_util::IsComplexType(shape.element_type())) {
+          return TokenError(
+              absl::StrFormat("unexpected '(' in literal.  Parens are only "
+                              "valid for complex literals"));
+        }
+
+        std::complex<double> value;
+        LocTy loc = lexer_.GetLoc();
+        if (!add_one_elem_seen() || !ParseComplex(&value) ||
+            !SetValueInLiteral(loc, value, linear_index++, literal)) {
+          return false;
+        }
+        break;
+      }
+      case TokKind::kDots: {
+        if (nest_level != 1) {
+          return TokenError(absl::StrFormat(
+              "expects `...` at nest level 1, but sees it at nest level %d",
+              nest_level));
+        }
+        elems_seen_per_dim[0] = shape.dimensions(0);
+        lexer_.Lex();
+        break;
+      }
       case TokKind::kComma:
         // Skip.
         lexer_.Lex();
@@ -2069,23 +2249,11 @@ bool HloParser::ParseDenseLiteral(Literal* literal, const Shape& shape) {
       case TokKind::kw_nan:
       case TokKind::kw_inf:
       case TokKind::kNegInf: {
-        if (rank > 0) {
-          if (nest_level != rank) {
-            return TokenError(
-                absl::StrFormat("expects nested array in rank %d, but sees %d",
-                                rank, nest_level));
-          }
-          elems_seen_per_dim[rank - 1]++;
-          if (elems_seen_per_dim[rank - 1] > shape.dimensions(rank - 1)) {
-            return TokenError(absl::StrFormat(
-                "expects %d elements on the minor-most dimension, but "
-                "sees more",
-                shape.dimensions(rank - 1)));
-          }
-        }
+        add_one_elem_seen();
         if (lexer_.GetKind() == TokKind::kw_true ||
             lexer_.GetKind() == TokKind::kw_false) {
-          if (!SetValueInLiteral(lexer_.GetKind() == TokKind::kw_true,
+          if (!SetValueInLiteral(lexer_.GetLoc(),
+                                 lexer_.GetKind() == TokKind::kw_true,
                                  linear_index++, literal)) {
             return false;
           }
@@ -2093,12 +2261,12 @@ bool HloParser::ParseDenseLiteral(Literal* literal, const Shape& shape) {
         } else if (primitive_util::IsIntegralType(shape.element_type()) ||
                    shape.element_type() == PRED) {
           LocTy loc = lexer_.GetLoc();
-          tensorflow::int64 value;
+          int64 value;
           if (!ParseInt64(&value)) {
             return Error(loc, StrCat("expects integer for primitive type: ",
                                      PrimitiveType_Name(shape.element_type())));
           }
-          if (!SetValueInLiteral(value, linear_index++, literal)) {
+          if (!SetValueInLiteral(loc, value, linear_index++, literal)) {
             return false;
           }
         } else if (primitive_util::IsFloatingPointType(shape.element_type())) {
@@ -2109,7 +2277,7 @@ bool HloParser::ParseDenseLiteral(Literal* literal, const Shape& shape) {
                 loc, StrCat("expect floating point value for primitive type: ",
                             PrimitiveType_Name(shape.element_type())));
           }
-          if (!SetValueInLiteral(value, linear_index++, literal)) {
+          if (!SetValueInLiteral(loc, value, linear_index++, literal)) {
             return false;
           }
         } else {
@@ -2126,52 +2294,7 @@ bool HloParser::ParseDenseLiteral(Literal* literal, const Shape& shape) {
 }
 
 bool HloParser::ParseSparseLiteral(Literal* literal, const Shape& shape) {
-  if (!EatShapeAndCheckCompatible(shape)) {
-    return false;
-  }
-
-  switch (shape.element_type()) {
-    case PRED:
-      return ParseSparseLiteralHelper<tensorflow::uint8>(literal, shape);
-    case S8:
-      return ParseSparseLiteralHelper<tensorflow::int8>(literal, shape);
-    case S16:
-      return ParseSparseLiteralHelper<tensorflow::int16>(literal, shape);
-    case S32:
-      return ParseSparseLiteralHelper<tensorflow::int32>(literal, shape);
-    case S64:
-      return ParseSparseLiteralHelper<tensorflow::int64>(literal, shape);
-    case U8:
-      return ParseSparseLiteralHelper<tensorflow::uint8>(literal, shape);
-    case U16:
-      return ParseSparseLiteralHelper<tensorflow::uint16>(literal, shape);
-    case U32:
-      return ParseSparseLiteralHelper<tensorflow::uint32>(literal, shape);
-    case U64:
-      return ParseSparseLiteralHelper<tensorflow::uint64>(literal, shape);
-    case F16:
-      return ParseSparseLiteralHelper<Eigen::half>(literal, shape);
-    case F32:
-      return ParseSparseLiteralHelper<float>(literal, shape);
-    case BF16:
-      return ParseSparseLiteralHelper<tensorflow::bfloat16>(literal, shape);
-    case F64:
-      return ParseSparseLiteralHelper<double>(literal, shape);
-    default:
-      return Error(lexer_.GetLoc(),
-                   StrCat("invalid primitive type for sparse literal: ",
-                          PrimitiveType_Name(shape.element_type())));
-  }
-}
-
-template <typename LiteralNativeT>
-bool HloParser::ParseSparseLiteralHelper(Literal* literal, const Shape& shape) {
-  std::vector<tensorflow::int64> index;
-
-  tensorflow::int64 rank = ShapeUtil::Rank(shape);
-
   *literal = Literal(shape);
-
   if (!ParseToken(TokKind::kLbrace,
                   "expects '{' at the beginning of a sparse literal")) {
     return false;
@@ -2183,61 +2306,66 @@ bool HloParser::ParseSparseLiteralHelper(Literal* literal, const Shape& shape) {
       break;
     }
 
-    LocTy index_loc = lexer_.GetLoc();
-    index.clear();
+    std::vector<int64> index;
     if (lexer_.GetKind() == TokKind::kInt) {
-      tensorflow::int64 single_index = lexer_.GetInt64Val();
+      int64 single_index = lexer_.GetInt64Val();
       lexer_.Lex();
-      if (rank != 1) {
-        return Error(
-            index_loc,
-            StrCat("invalid single-dimensional index for shape with rank ",
-                   rank, ": ", single_index));
-      }
       index.push_back(single_index);
     } else {
       if (!ParseInt64List(TokKind::kLsquare, TokKind::kRsquare, TokKind::kComma,
                           &index)) {
         return false;
       }
-      if (index.size() != rank) {
-        return Error(
-            index_loc,
-            StrCat("invalid multi-dimension index for shape with rank ", rank,
-                   ": [", StrJoin(index, ", "), "]"));
-      }
     }
     if (!ParseToken(TokKind::kColon,
                     "expects ':' after after the sparse array index and before "
                     "the sparse array value")) {
       return false;
     }
+
     LocTy value_loc = lexer_.GetLoc();
-    LiteralNativeT value;
     if (lexer_.GetKind() == TokKind::kw_true ||
         lexer_.GetKind() == TokKind::kw_false) {
-      value = static_cast<LiteralNativeT>(lexer_.GetKind() == TokKind::kw_true);
+      bool value = lexer_.GetKind() == TokKind::kw_true;
+      if (!SetValueInLiteral(lexer_.GetLoc(), value, index, literal)) {
+        return false;
+      }
       lexer_.Lex();
     } else if (primitive_util::IsIntegralType(shape.element_type())) {
-      tensorflow::int64 value_s64;
-      if (!ParseInt64(&value_s64)) {
+      int64 value;
+      if (!ParseInt64(&value)) {
         return Error(value_loc,
                      StrCat("expects integer for primitive type: ",
                             PrimitiveType_Name(shape.element_type())));
       }
-      value = static_cast<LiteralNativeT>(value_s64);
+      if (!SetValueInLiteral(value_loc, value, index, literal)) {
+        return false;
+      }
     } else if (primitive_util::IsFloatingPointType(shape.element_type())) {
-      double value_f64;
-      if (!ParseDouble(&value_f64)) {
+      double value;
+      if (!ParseDouble(&value)) {
         return Error(value_loc,
                      StrCat("expects floating point value for primitive type: ",
                             PrimitiveType_Name(shape.element_type())));
       }
-      value = static_cast<LiteralNativeT>(value_f64);
+      if (!SetValueInLiteral(value_loc, value, index, literal)) {
+        return false;
+      }
+    } else if (primitive_util::IsComplexType(shape.element_type())) {
+      std::complex<double> value;
+      if (!ParseComplex(&value)) {
+        return Error(value_loc,
+                     StrCat("expects complex value for primitive type: ",
+                            PrimitiveType_Name(shape.element_type())));
+      }
+      if (!SetValueInLiteral(value_loc, value, index, literal)) {
+        return false;
+      }
     } else {
       LOG(FATAL) << "Unexpected element type: "
                  << PrimitiveType_Name(shape.element_type());
     }
+
     if (lexer_.GetKind() != TokKind::kRbrace &&
         !ParseToken(TokKind::kComma,
                     "expects ',' separator between sparse array elements")) {
@@ -2251,14 +2379,114 @@ bool HloParser::ParseSparseLiteralHelper(Literal* literal, const Shape& shape) {
           StrCat("number of sparse elements exceeds maximum for layout: ",
                  ShapeUtil::HumanStringWithLayout(shape)));
     }
-
-    literal->AppendSparseElement(index, value);
   }
 
   literal->SortSparseElements();
   return true;
 }
 
+// MaxFiniteValue is a type-traits helper used by
+// HloParser::CheckParsedValueIsInRange.
+template <typename T>
+struct MinMaxFiniteValue {
+  static T max() { return std::numeric_limits<T>::max(); }
+  static T min() { return std::numeric_limits<T>::lowest(); }
+};
+
+template <>
+struct MinMaxFiniteValue<Eigen::half> {
+  static double max() {
+    // Sadly this is not constexpr, so this forces `value` to be a method.
+    return static_cast<double>(Eigen::NumTraits<Eigen::half>::highest());
+  }
+  static double min() { return -max(); }
+};
+
+template <>
+struct MinMaxFiniteValue<bfloat16> {
+  static double max() { return static_cast<double>(bfloat16::highest()); }
+  static double min() { return -max(); }
+};
+
+template <typename LiteralNativeT, typename ParsedElemT>
+bool HloParser::CheckParsedValueIsInRange(LocTy loc, ParsedElemT value) {
+  PrimitiveType literal_ty =
+      primitive_util::NativeToPrimitiveType<LiteralNativeT>();
+  if (std::isnan(value) ||
+      (std::numeric_limits<ParsedElemT>::has_infinity &&
+       (std::numeric_limits<ParsedElemT>::infinity() == value ||
+        -std::numeric_limits<ParsedElemT>::infinity() == value))) {
+    // Skip range checking for non-finite value.
+  } else if (std::is_unsigned<LiteralNativeT>::value) {
+    CHECK((std::is_same<ParsedElemT, int64>::value ||
+           std::is_same<ParsedElemT, bool>::value))
+        << "Unimplemented checking for ParsedElemT";
+
+    ParsedElemT upper_bound;
+    if (sizeof(LiteralNativeT) >= sizeof(ParsedElemT)) {
+      upper_bound = std::numeric_limits<ParsedElemT>::max();
+    } else {
+      upper_bound =
+          static_cast<ParsedElemT>(std::numeric_limits<LiteralNativeT>::max());
+    }
+    if (value > upper_bound || value < 0) {
+      // Value is out of range for LiteralNativeT.
+      return Error(loc, StrCat("value ", value,
+                               " is out of range for literal's primitive type ",
+                               PrimitiveType_Name(literal_ty), " namely [0, ",
+                               upper_bound, "]."));
+    }
+  } else if (value > MinMaxFiniteValue<LiteralNativeT>::max() ||
+             value < MinMaxFiniteValue<LiteralNativeT>::min()) {
+    // Value is out of range for LiteralNativeT.
+    return Error(loc, StrCat("value ", value,
+                             " is out of range for literal's primitive type ",
+                             PrimitiveType_Name(literal_ty), " namely [",
+                             MinMaxFiniteValue<LiteralNativeT>::min(), ", ",
+                             MinMaxFiniteValue<LiteralNativeT>::max(), "]."));
+  }
+  return true;
+}
+
+template <typename LiteralNativeT>
+bool HloParser::CheckParsedValueIsInRange(LocTy loc,
+                                          std::complex<double> value) {
+  // e.g. `float` for std::complex<float>
+  using LiteralComplexComponentT =
+      decltype(std::real(std::declval<LiteralNativeT>()));
+
+  // We could do simply
+  //
+  //   return CheckParsedValueIsInRange<LiteralNativeT>(std::real(value)) &&
+  //          CheckParsedValueIsInRange<LiteralNativeT>(std::imag(value));
+  //
+  // but this would give bad error messages on failure.
+
+  auto check_component = [&](absl::string_view name, double v) {
+    if (std::isnan(v) || v == std::numeric_limits<double>::infinity() ||
+        v == -std::numeric_limits<double>::infinity()) {
+      // Skip range-checking for non-finite values.
+      return true;
+    }
+
+    double min = MinMaxFiniteValue<LiteralComplexComponentT>::min();
+    double max = MinMaxFiniteValue<LiteralComplexComponentT>::max();
+    if (v < min || v > max) {
+      // Value is out of range for LitearlComplexComponentT.
+      return Error(
+          loc,
+          StrCat(name, " part ", v,
+                 " is out of range for literal's primitive type ",
+                 PrimitiveType_Name(
+                     primitive_util::NativeToPrimitiveType<LiteralNativeT>()),
+                 ", namely [", min, ", ", max, "]."));
+    }
+    return true;
+  };
+  return check_component("real", std::real(value)) &&
+         check_component("imaginary", std::imag(value));
+}
+
 // operands ::= '(' operands1 ')'
 // operands1
 //   ::= /*empty*/
@@ -2416,24 +2644,23 @@ bool HloParser::ParseAttributeHelper(
         return true;
       }
       case AttrTy::kInt64: {
-        tensorflow::int64 result;
+        int64 result;
         if (!ParseInt64(&result)) {
           return false;
         }
-        static_cast<optional<tensorflow::int64>*>(attr_out_ptr)
-            ->emplace(result);
+        static_cast<optional<int64>*>(attr_out_ptr)->emplace(result);
         return true;
       }
       case AttrTy::kInt32: {
-        tensorflow::int64 result;
+        int64 result;
         if (!ParseInt64(&result)) {
           return false;
         }
-        if (result != static_cast<tensorflow::int32>(result)) {
+        if (result != static_cast<int32>(result)) {
           return Error(attr_loc, "value out of range for int32");
         }
-        static_cast<optional<tensorflow::int32>*>(attr_out_ptr)
-            ->emplace(static_cast<tensorflow::int32>(result));
+        static_cast<optional<int32>*>(attr_out_ptr)
+            ->emplace(static_cast<int32>(result));
         return true;
       }
       case AttrTy::kFloat: {
@@ -2473,6 +2700,15 @@ bool HloParser::ParseAttributeHelper(
         static_cast<optional<FftType>*>(attr_out_ptr)->emplace(result);
         return true;
       }
+      case AttrTy::kTriangularSolveTranspose: {
+        TriangularSolveOptions::Transpose result;
+        if (!ParseTriangularSolveTranspose(&result)) {
+          return false;
+        }
+        static_cast<optional<TriangularSolveOptions::Transpose>*>(attr_out_ptr)
+            ->emplace(result);
+        return true;
+      }
       case AttrTy::kWindow: {
         Window result;
         if (!ParseWindow(&result, /*expect_outer_curlies=*/true)) {
@@ -2498,6 +2734,15 @@ bool HloParser::ParseAttributeHelper(
         static_cast<optional<OpSharding>*>(attr_out_ptr)->emplace(sharding);
         return true;
       }
+      case AttrTy::kParameterReplication: {
+        ParameterReplication parameter_replication;
+        if (!ParseParameterReplication(&parameter_replication)) {
+          return false;
+        }
+        static_cast<optional<ParameterReplication>*>(attr_out_ptr)
+            ->emplace(parameter_replication);
+        return true;
+      }
       case AttrTy::kInstructionList: {
         std::vector<HloInstruction*> result;
         if (!ParseInstructionNames(&result)) {
@@ -2517,19 +2762,19 @@ bool HloParser::ParseAttributeHelper(
         return true;
       }
       case AttrTy::kBracedInt64List: {
-        std::vector<tensorflow::int64> result;
+        std::vector<int64> result;
         if (!ParseInt64List(TokKind::kLbrace, TokKind::kRbrace, TokKind::kComma,
                             &result)) {
           return false;
         }
-        static_cast<optional<std::vector<tensorflow::int64>>*>(attr_out_ptr)
+        static_cast<optional<std::vector<int64>>*>(attr_out_ptr)
             ->emplace(result);
         return true;
       }
       case AttrTy::kBracedInt64ListList: {
-        std::vector<std::vector<tensorflow::int64>> result;
+        std::vector<std::vector<int64>> result;
         auto parse_and_add_item = [&]() {
-          std::vector<tensorflow::int64> item;
+          std::vector<int64> item;
           if (!ParseInt64List(TokKind::kLbrace, TokKind::kRbrace,
                               TokKind::kComma, &item)) {
             return false;
@@ -2541,8 +2786,7 @@ bool HloParser::ParseAttributeHelper(
                        parse_and_add_item)) {
           return false;
         }
-        static_cast<optional<std::vector<std::vector<tensorflow::int64>>>*>(
-            attr_out_ptr)
+        static_cast<optional<std::vector<std::vector<int64>>>*>(attr_out_ptr)
             ->emplace(result);
         return true;
       }
@@ -2743,7 +2987,7 @@ bool HloParser::ParseConvolutionDimensionNumbers(
   absl::string_view rhs = split2[0];
   absl::string_view out = split2[1];
 
-  const tensorflow::int64 rank = lhs.length();
+  const int64 rank = lhs.length();
   if (rank != rhs.length() || rank != out.length()) {
     return TokenError(
         "convolution lhs, rhs, and output must have the same rank");
@@ -2753,7 +2997,7 @@ bool HloParser::ParseConvolutionDimensionNumbers(
   }
 
   auto is_unique = [](string str) -> bool {
-    std::sort(str.begin(), str.end());
+    absl::c_sort(str);
     return std::unique(str.begin(), str.end()) == str.end();
   };
 
@@ -2854,7 +3098,7 @@ bool HloParser::ParseSliceRanges(SliceRanges* result) {
   if (!ParseToken(TokKind::kLbrace, "expects '{' to start ranges")) {
     return false;
   }
-  std::vector<std::vector<tensorflow::int64>> ranges;
+  std::vector<std::vector<int64>> ranges;
   if (lexer_.GetKind() == TokKind::kRbrace) {
     // empty
     return ParseToken(TokKind::kRbrace, "expects '}' to end ranges");
@@ -2924,9 +3168,9 @@ bool HloParser::ParseShapeList(std::vector<Shape>* result) {
 //   ::= int64_val (delim int64_val)*
 bool HloParser::ParseInt64List(const TokKind start, const TokKind end,
                                const TokKind delim,
-                               std::vector<tensorflow::int64>* result) {
+                               std::vector<int64>* result) {
   auto parse_and_add_item = [&]() {
-    tensorflow::int64 i;
+    int64 i;
     if (!ParseInt64(&i)) {
       return false;
     }
@@ -2994,6 +3238,136 @@ bool HloParser::ParseParamList() {
   return ParseToken(TokKind::kRparen, "expects ')' at the end of param list");
 }
 
+// dimension_sizes ::= '[' dimension_list ']'
+// dimension_list
+//   ::= /*empty*/
+//   ::= <=? int64 (',' param)*
+// param ::= name shape
+bool HloParser::ParseDimensionSizes(std::vector<int64>* dimension_sizes,
+                                    std::vector<bool>* dynamic_dimensions) {
+  auto parse_and_add_item = [&]() {
+    int64 i;
+    bool is_dynamic = false;
+    if (lexer_.GetKind() == TokKind::kLeq) {
+      is_dynamic = true;
+      lexer_.Lex();
+    }
+    if (!ParseInt64(&i)) {
+      return false;
+    }
+    dimension_sizes->push_back(i);
+    dynamic_dimensions->push_back(is_dynamic);
+    return true;
+  };
+  return ParseList(TokKind::kLsquare, TokKind::kRsquare, TokKind::kComma,
+                   parse_and_add_item);
+}
+
+// tiles
+//   ::= /*empty*/
+//   ::= 'T' '(' dim_list ')'
+// dim_list
+//   ::= /*empty*/
+//   ::= (int64 | '*') (',' (int64 | '*'))*
+bool HloParser::ParseTiles(std::vector<Tile>* tiles) {
+  auto parse_and_add_tile_dimension = [&]() {
+    tensorflow::int64 i;
+    if (ParseInt64(&i)) {
+      tiles->back().add_dimensions(i);
+      return true;
+    }
+    if (lexer_.GetKind() == TokKind::kAsterisk) {
+      tiles->back().add_dimensions(Tile::kCombineDimension);
+      lexer_.Lex();
+      return true;
+    }
+    return false;
+  };
+
+  do {
+    tiles->push_back(Tile());
+    if (!ParseList(TokKind::kLparen, TokKind::kRparen, TokKind::kComma,
+                   parse_and_add_tile_dimension)) {
+      return false;
+    }
+  } while (lexer_.GetKind() == TokKind::kLparen);
+  return true;
+}
+
+// layout ::= '{' int64_list (':' tiles element_size_in_bits)? '}'
+// element_size_in_bits
+//   ::= /*empty*/
+//   ::= 'E' '(' int64 ')'
+bool HloParser::ParseLayout(Layout* layout) {
+  std::vector<int64> minor_to_major;
+  std::vector<Tile> tiles;
+  tensorflow::int64 element_size_in_bits = 0;
+
+  auto parse_and_add_item = [&]() {
+    int64 i;
+    if (!ParseInt64(&i)) {
+      return false;
+    }
+    minor_to_major.push_back(i);
+    return true;
+  };
+
+  if (!ParseToken(TokKind::kLbrace,
+                  StrCat("expects layout to start with ",
+                         TokKindToString(TokKind::kLbrace)))) {
+    return false;
+  }
+  if (lexer_.GetKind() != TokKind::kRbrace) {
+    if (lexer_.GetKind() == TokKind::kInt) {
+      // Parse minor to major.
+      do {
+        if (!parse_and_add_item()) {
+          return false;
+        }
+      } while (EatIfPresent(TokKind::kComma));
+    }
+
+    if (lexer_.GetKind() == TokKind::kColon) {
+      lexer_.Lex();
+      if (lexer_.GetKind() == TokKind::kIdent && lexer_.GetStrVal() == "T") {
+        lexer_.Lex();
+        ParseTiles(&tiles);
+      }
+
+      if (lexer_.GetKind() == TokKind::kIdent && lexer_.GetStrVal() == "E") {
+        // Parse element size in bits.
+        lexer_.Lex();
+        if (!ParseToken(TokKind::kLparen,
+                        StrCat("expects element size in bits to start with ",
+                               TokKindToString(TokKind::kLparen)))) {
+          return false;
+        }
+        if (!ParseInt64(&element_size_in_bits)) {
+          return false;
+        }
+        if (!ParseToken(TokKind::kRparen,
+                        StrCat("expects element size in bits to end with ",
+                               TokKindToString(TokKind::kRparen)))) {
+          return false;
+        }
+      }
+    }
+  }
+  if (!ParseToken(TokKind::kRbrace,
+                  StrCat("expects layout to end with ",
+                         TokKindToString(TokKind::kRbrace)))) {
+    return false;
+  }
+
+  std::vector<Tile> vec_tiles(tiles.size());
+  for (int i = 0; i < tiles.size(); i++) {
+    vec_tiles[i] = Tile(tiles[i]);
+  }
+  *layout =
+      LayoutUtil::MakeLayout(minor_to_major, vec_tiles, element_size_in_bits);
+  return true;
+}
+
 // shape ::= shape_val_
 // shape ::= '(' tuple_elements ')'
 // tuple_elements
@@ -3017,19 +3391,74 @@ bool HloParser::ParseShape(Shape* result) {
     return ParseToken(TokKind::kRparen, "expects ')' at the end of tuple.");
   }
 
-  if (lexer_.GetKind() != TokKind::kShape) {
-    return TokenError(absl::StrCat("expected shape, saw ",
+  if (lexer_.GetKind() != TokKind::kPrimitiveType) {
+    return TokenError(absl::StrCat("expected primitive type, saw ",
                                    TokKindToString(lexer_.GetKind())));
   }
-  *result = lexer_.GetShapeVal();
+  PrimitiveType primitive_type = lexer_.GetPrimitiveTypeVal();
   lexer_.Lex();
+
+  // Each element contains a dimension size and a bool indicating whether this
+  // is a dynamic dimension.
+  std::vector<int64> dimension_sizes;
+  std::vector<bool> dynamic_dimensions;
+  if (!ParseDimensionSizes(&dimension_sizes, &dynamic_dimensions)) {
+    return false;
+  }
+  result->set_element_type(primitive_type);
+  for (int i = 0; i < dimension_sizes.size(); ++i) {
+    result->add_dimensions(dimension_sizes[i]);
+    result->set_dynamic_dimension(i, dynamic_dimensions[i]);
+  }
+  LayoutUtil::SetToDefaultLayout(result);
+
+  if (lexer_.GetKind() == TokKind::kw_sparse) {
+    lexer_.Lex();
+    const string message =
+        "expects a brace-bracketed integer for sparse layout";
+    int64 max_sparse_elements;
+    if (!ParseToken(TokKind::kLbrace, message) ||
+        !ParseInt64(&max_sparse_elements) ||
+        !ParseToken(TokKind::kRbrace, message)) {
+      return false;
+    }
+    *result->mutable_layout() =
+        LayoutUtil::MakeSparseLayout(max_sparse_elements);
+    return true;
+  }
+
+  // We need to lookahead to see if a following open brace is the start of a
+  // layout. The specific problematic case is:
+  //
+  // ENTRY %foo (x: f32[42]) -> f32[123] {
+  //  ...
+  // }
+  //
+  // The open brace could either be the start of a computation or the start of a
+  // layout for the f32[123] shape. We consider it the start of a layout if the
+  // next token after the open brace is an integer or a colon.
+  if (lexer_.GetKind() == TokKind::kLbrace &&
+      (lexer_.LookAhead() == TokKind::kInt ||
+       lexer_.LookAhead() == TokKind::kColon)) {
+    Layout layout;
+    if (!ParseLayout(&layout)) {
+      return false;
+    }
+    if (layout.minor_to_major_size() != result->rank()) {
+      return Error(
+          lexer_.GetLoc(),
+          StrFormat("Dimensions size is %ld, but minor to major size is %ld.",
+                    result->rank(), layout.minor_to_major_size()));
+    }
+    *result->mutable_layout() = layout;
+  }
   return true;
 }
 
 bool HloParser::CanBeShape() {
-  // A non-tuple shape starts with a kShape token; a tuple shape starts with
-  // '('.
-  return lexer_.GetKind() == TokKind::kShape ||
+  // A non-tuple shape starts with a kPrimitiveType token; a tuple shape starts
+  // with '('.
+  return lexer_.GetKind() == TokKind::kPrimitiveType ||
          lexer_.GetKind() == TokKind::kLparen;
 }
 
@@ -3063,15 +3492,14 @@ bool HloParser::ParseString(string* result) {
   return true;
 }
 
-bool HloParser::ParseDxD(const string& name,
-                         std::vector<tensorflow::int64>* result) {
+bool HloParser::ParseDxD(const string& name, std::vector<int64>* result) {
   LocTy loc = lexer_.GetLoc();
   if (!result->empty()) {
     return Error(loc, StrFormat("sub-attribute '%s=' already exists", name));
   }
   // 1D
   if (lexer_.GetKind() == TokKind::kInt) {
-    tensorflow::int64 number;
+    int64 number;
     if (!ParseInt64(&number)) {
       return Error(loc, StrFormat("expects sub-attribute '%s=i'", name));
     }
@@ -3090,8 +3518,7 @@ bool HloParser::ParseDxD(const string& name,
   return TokenError("expects token type kInt or kDxD");
 }
 
-bool HloParser::ParseWindowPad(
-    std::vector<std::vector<tensorflow::int64>>* pad) {
+bool HloParser::ParseWindowPad(std::vector<std::vector<int64>>* pad) {
   LocTy loc = lexer_.GetLoc();
   if (!pad->empty()) {
     return Error(loc, "sub-attribute 'pad=' already exists");
@@ -3101,7 +3528,7 @@ bool HloParser::ParseWindowPad(
   }
   string str = lexer_.GetStrVal();
   for (const auto& padding_dim_str : absl::StrSplit(str, 'x')) {
-    std::vector<tensorflow::int64> low_high;
+    std::vector<int64> low_high;
     if (!SplitToInt64s(padding_dim_str, '_', &low_high) ||
         low_high.size() != 2) {
       return Error(loc,
@@ -3124,7 +3551,7 @@ bool HloParser::ParsePaddingConfig(PaddingConfig* padding) {
   LocTy loc = lexer_.GetLoc();
   string str = lexer_.GetStrVal();
   for (const auto& padding_dim_str : absl::StrSplit(str, 'x')) {
-    std::vector<tensorflow::int64> padding_dim;
+    std::vector<int64> padding_dim;
     if (!SplitToInt64s(padding_dim_str, '_', &padding_dim) ||
         (padding_dim.size() != 2 && padding_dim.size() != 3)) {
       return Error(loc,
@@ -3146,7 +3573,7 @@ bool HloParser::ParseMetadata(OpMetadata* metadata) {
   optional<string> op_type;
   optional<string> op_name;
   optional<string> source_file;
-  optional<tensorflow::int32> source_line;
+  optional<int32> source_line;
   attrs["op_type"] = {/*required=*/false, AttrTy::kString, &op_type};
   attrs["op_name"] = {/*required=*/false, AttrTy::kString, &op_name};
   attrs["source_file"] = {/*required=*/false, AttrTy::kString, &source_file};
@@ -3198,6 +3625,22 @@ bool HloParser::ParseFftType(FftType* result) {
   return true;
 }
 
+bool HloParser::ParseTriangularSolveTranspose(
+    TriangularSolveOptions::Transpose* result) {
+  VLOG(1) << "ParseTriangularSolveTranspose";
+  if (lexer_.GetKind() != TokKind::kIdent) {
+    return TokenError("expects triangular solve transpose type");
+  }
+  string val = lexer_.GetStrVal();
+  if (!TriangularSolveOptions_Transpose_Parse(val, result) ||
+      !TriangularSolveOptions_Transpose_IsValid(*result)) {
+    return TokenError(
+        StrFormat("expects triangular solve transpose type but sees: %s", val));
+  }
+  lexer_.Lex();
+  return true;
+}
+
 bool HloParser::ParseFusionKind(HloInstruction::FusionKind* result) {
   VLOG(1) << "ParseFusionKind";
   if (lexer_.GetKind() != TokKind::kIdent) {
@@ -3249,7 +3692,7 @@ bool HloParser::ParsePrecision(PrecisionConfig::Precision* result) {
   return true;
 }
 
-bool HloParser::ParseInt64(tensorflow::int64* result) {
+bool HloParser::ParseInt64(int64* result) {
   VLOG(1) << "ParseInt64";
   if (lexer_.GetKind() != TokKind::kInt) {
     return TokenError("expects integer");
@@ -3261,9 +3704,18 @@ bool HloParser::ParseInt64(tensorflow::int64* result) {
 
 bool HloParser::ParseDouble(double* result) {
   switch (lexer_.GetKind()) {
-    case TokKind::kDecimal:
-      *result = lexer_.GetDecimalVal();
+    case TokKind::kDecimal: {
+      double val = lexer_.GetDecimalVal();
+      // If GetDecimalVal returns +/-inf, that means that we overflowed
+      // `double`.
+      if (std::isinf(val)) {
+        return TokenError(StrCat("Constant is out of range for double (+/-",
+                                 std::numeric_limits<double>::max(),
+                                 ") and so is unparsable."));
+      }
+      *result = val;
       break;
+    }
     case TokKind::kInt:
       *result = static_cast<double>(lexer_.GetInt64Val());
       break;
@@ -3283,6 +3735,42 @@ bool HloParser::ParseDouble(double* result) {
   return true;
 }
 
+bool HloParser::ParseComplex(std::complex<double>* result) {
+  if (lexer_.GetKind() != TokKind::kLparen) {
+    return TokenError("expects '(' before complex number");
+  }
+  lexer_.Lex();
+
+  double real;
+  LocTy loc = lexer_.GetLoc();
+  if (!ParseDouble(&real)) {
+    return Error(loc,
+                 "expect floating-point value for real part of complex number");
+  }
+
+  if (lexer_.GetKind() != TokKind::kComma) {
+    return TokenError(
+        absl::StrFormat("expect comma after real part of complex literal"));
+  }
+  lexer_.Lex();
+
+  double imag;
+  loc = lexer_.GetLoc();
+  if (!ParseDouble(&imag)) {
+    return Error(
+        loc,
+        "expect floating-point value for imaginary part of complex number");
+  }
+
+  if (lexer_.GetKind() != TokKind::kRparen) {
+    return TokenError(absl::StrFormat("expect ')' after complex number"));
+  }
+
+  *result = std::complex<double>(real, imag);
+  lexer_.Lex();
+  return true;
+}
+
 bool HloParser::ParseBool(bool* result) {
   if (lexer_.GetKind() != TokKind::kw_true &&
       lexer_.GetKind() != TokKind::kw_false) {
@@ -3332,6 +3820,18 @@ bool HloParser::AddComputation(const string& name, HloComputation* computation,
   return true;
 }
 
+StatusOr<Shape> HloParser::ParseShapeOnly() {
+  lexer_.Lex();
+  Shape shape;
+  if (!ParseShape(&shape)) {
+    return InvalidArgument("Syntax error:\n%s", GetError());
+  }
+  if (lexer_.GetKind() != TokKind::kEof) {
+    return InvalidArgument("Syntax error:\nExtra content after shape");
+  }
+  return shape;
+}
+
 StatusOr<HloSharding> HloParser::ParseShardingOnly() {
   lexer_.Lex();
   OpSharding op_sharding;
@@ -3344,6 +3844,21 @@ StatusOr<HloSharding> HloParser::ParseShardingOnly() {
   return HloSharding::FromProto(op_sharding);
 }
 
+StatusOr<std::vector<bool>> HloParser::ParseParameterReplicationOnly() {
+  lexer_.Lex();
+  ParameterReplication parameter_replication;
+  if (!ParseParameterReplication(&parameter_replication)) {
+    return InvalidArgument("Syntax error:\n%s", GetError());
+  }
+  if (lexer_.GetKind() != TokKind::kEof) {
+    return InvalidArgument(
+        "Syntax error:\nExtra content after parameter replication");
+  }
+  return std::vector<bool>(
+      parameter_replication.replicated_at_leaf_buffers().begin(),
+      parameter_replication.replicated_at_leaf_buffers().end());
+}
+
 StatusOr<Window> HloParser::ParseWindowOnly() {
   lexer_.Lex();
   Window window;
@@ -3459,6 +3974,11 @@ StatusOr<HloSharding> ParseSharding(absl::string_view str) {
   return parser.ParseShardingOnly();
 }
 
+StatusOr<std::vector<bool>> ParseParameterReplication(absl::string_view str) {
+  HloParser parser(str);
+  return parser.ParseParameterReplicationOnly();
+}
+
 StatusOr<Window> ParseWindow(absl::string_view str) {
   HloParser parser(str);
   return parser.ParseWindowOnly();
@@ -3475,4 +3995,9 @@ StatusOr<PaddingConfig> ParsePaddingConfig(absl::string_view str) {
   return parser.ParsePaddingConfigOnly();
 }
 
+StatusOr<Shape> ParseShape(absl::string_view str) {
+  HloParser parser(str);
+  return parser.ParseShapeOnly();
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_parser.h b/tensorflow/compiler/xla/service/hlo_parser.h
index d830fa61438239005875f785f85cf2486123ebc9..a96260b4d75e515a4cb23d315444142cae1b9587 100644
--- a/tensorflow/compiler/xla/service/hlo_parser.h
+++ b/tensorflow/compiler/xla/service/hlo_parser.h
@@ -44,11 +44,16 @@ Status ParseHloString(absl::string_view str, HloModule* module);
 // creates a HloModule with default config.
 StatusOr<std::unique_ptr<HloModule>> ParseHloString(absl::string_view str);
 
-// ParseHloString sharding from str. str is supposed to contain the body of the
-// sharding, i.e. just the rhs of the "sharding={...}" attribute string,
-// e.g., "{replicated}".
+// Parses sharding from str. str is supposed to contain the body of the
+// sharding, i.e. just the rhs of the "sharding={...}" attribute string, e.g.,
+// "{replicated}".
 StatusOr<HloSharding> ParseSharding(absl::string_view str);
 
+// Parses parameter replication from str. str is supposed to contain the body of
+// the parameter replication, i.e. just the rhs of the
+// "parameter_replication={...}" attribute string, e.g., "{true, false}".
+StatusOr<std::vector<bool>> ParseParameterReplication(absl::string_view str);
+
 // Parses the result of window_util::ToString(const Window&).
 StatusOr<Window> ParseWindow(absl::string_view str);
 
@@ -60,6 +65,9 @@ StatusOr<ConvolutionDimensionNumbers> ParseConvolutionDimensionNumbers(
 // Parses the result of PaddingConfigToString(), e.g. "0_0x1_1".
 StatusOr<PaddingConfig> ParsePaddingConfig(absl::string_view str);
 
+// Parses and returns a Shape::ToString-format string.
+StatusOr<Shape> ParseShape(absl::string_view str);
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PARSER_H_
diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc
index ab71f011ac9d77d00ddfb41aca7a224d26d416b7..8e3f1e44b9562334130aa565ed447a78899fad53 100644
--- a/tensorflow/compiler/xla/service/hlo_parser_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc
@@ -63,6 +63,19 @@ ENTRY %axpy.v5 (alpha: f32[], x: f32[2,4], y: f32[2,4]) -> f32[2,4] {
   ROOT %add = f32[2,4]{1,0} add(f32[2,4]{1,0} %multiply, f32[2,4]{1,0} %y)
 }
 
+)"
+},
+// parameter replication
+{
+"ParamReplication",
+R"(HloModule param_replication_module
+
+ENTRY %param_replication (a: f32[], b: (f32[2,4], (f32[2,4]))) -> (f32[], (f32[2,4], (f32[2,4]))) {
+  %a = f32[] parameter(0), parameter_replication={true}
+  %b = (f32[2,4]{1,0}, (f32[2,4]{1,0})) parameter(1), parameter_replication={false,true}
+  ROOT %tuple = (f32[], (f32[2,4]{1,0}, (f32[2,4]{1,0}))) tuple(f32[] %a, (f32[2,4]{1,0}, (f32[2,4]{1,0})) %b)
+}
+
 )"
 },
 // pred constant
@@ -82,7 +95,7 @@ ENTRY %constant_pred () -> pred[] {
 R"(HloModule module
 
 ENTRY %constant_pred_array () -> pred[2,3] {
-  ROOT %constant = pred[2,3]{1,0} constant(pred[2,3] { { 0, 1, 0 }, { 1, 0, 1 } })
+  ROOT %constant = pred[2,3]{1,0} constant({ { 0, 1, 0 }, { 1, 0, 1 } })
 }
 
 )"
@@ -128,7 +141,7 @@ ENTRY %ConstantF32Empty.v4 () -> f32[0] {
 R"(HloModule ConstantF32R4Empty_module
 
 ENTRY %ConstantF32R4Empty.v4 () -> f32[2,0,4,3] {
-  ROOT %constant = f32[2,0,4,3]{3,2,1,0} constant(f32[2,0,4,3] { { /*i0=0*/ }, { /*i0=1*/ } })
+  ROOT %constant = f32[2,0,4,3]{3,2,1,0} constant({ { /*i0=0*/ }, { /*i0=1*/ } })
 }
 
 )"
@@ -139,7 +152,7 @@ ENTRY %ConstantF32R4Empty.v4 () -> f32[2,0,4,3] {
 R"(HloModule Small_3x2x1x1_module
 
 ENTRY %Small_3x2x1x1.v1 () -> f32[3,2,1,1] {
-  ROOT %constant = f32[3,2,1,1]{3,2,1,0} constant(f32[3,2,1,1] { { /*i0=0*/ { /*i1=0*/ {-1} }, { /*i1=1*/ {4.1} } }, { /*i0=1*/ { /*i1=0*/ {2} }, { /*i1=1*/ {4.1} } }, { /*i0=2*/ { /*i1=0*/ {5} }, { /*i1=1*/ {4.4} } } })
+  ROOT %constant = f32[3,2,1,1]{3,2,1,0} constant({ { /*i0=0*/ { /*i1=0*/ {-1} }, { /*i1=1*/ {4.1} } }, { /*i0=1*/ { /*i1=0*/ {2} }, { /*i1=1*/ {4.1} } }, { /*i0=2*/ { /*i1=0*/ {5} }, { /*i1=1*/ {4.4} } } })
 }
 
 )"
@@ -196,7 +209,7 @@ ENTRY %add_constants () -> f32[] {
 R"(HloModule TupleConstant_module
 
 ENTRY %TupleConstant.v1 () -> (f32[2,1], f32[2]) {
-  ROOT %constant = (f32[2,1]{1,0}, f32[2]{0}) constant((f32[2,1], f32[2]) ( f32[2,1] { {1}, {2} }, {2, 42} ))
+  ROOT %constant = (f32[2,1]{1,0}, f32[2]{0}) constant(( { {1}, {2} }, {2, 42} ))
 }
 
 )"
@@ -295,11 +308,11 @@ ENTRY %WhileWithScalarS32Result.v2 () -> s32[] {
 R"(HloModule TwoSendRecvBothWayRecvFist_module
 
 ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> (f32[], token[]) {
-  %token = token[] after-all()
-  %recv = (f32[], u32[], token[]) recv(token[] %token), channel_id=15, sharding={maximal device=1}
+  %token0 = token[] after-all()
+  %recv = (f32[], u32[], token[]) recv(token[] %token0), channel_id=15, sharding={maximal device=1}
   ROOT %recv-done = (f32[], token[]) recv-done((f32[], u32[], token[]) %recv), channel_id=15, sharding={maximal device=1}
   %constant = f32[] constant(2.1), sharding={maximal device=0}
-  %send = (f32[], u32[], token[]) send(f32[] %constant, token[] %token), channel_id=16, sharding={maximal device=0}, control-predecessors={%recv}
+  %send = (f32[], u32[], token[]) send(f32[] %constant, token[] %token0), channel_id=16, sharding={maximal device=0}, control-predecessors={%recv}
   %send-done = token[] send-done((f32[], u32[], token[]) %send), channel_id=16, sharding={maximal device=0}
 }
 
@@ -310,11 +323,11 @@ ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> (f32[], token[]) {
 R"(HloModule HostTransferSendRecv_module
 
 ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> (f32[], token[]) {
-  %token = token[] after-all()
-  %recv = (f32[], u32[], token[]) recv(token[] %token), channel_id=15, is_host_transfer=true
+  %token0 = token[] after-all()
+  %recv = (f32[], u32[], token[]) recv(token[] %token0), channel_id=15, is_host_transfer=true
   ROOT %recv-done = (f32[], token[]) recv-done((f32[], u32[], token[]) %recv), channel_id=15, is_host_transfer=true
   %constant = f32[] constant(2.1), sharding={maximal device=0}
-  %send = (f32[], u32[], token[]) send(f32[] %constant, token[] %token), channel_id=16, is_host_transfer=true
+  %send = (f32[], u32[], token[]) send(f32[] %constant, token[] %token0), channel_id=16, is_host_transfer=true
   %send-done = token[] send-done((f32[], u32[], token[]) %send), channel_id=16, is_host_transfer=true
 }
 
@@ -327,7 +340,7 @@ R"(HloModule GetTupleElement_module
 
 ENTRY %GetTupleElement.v4 () -> s32[2,3] {
   %constant = f32[3]{0} constant({1, 2, 3})
-  %constant.1 = s32[2,3]{1,0} constant(s32[2,3] { { 1, 2, 3 }, { 4, 5, 6 } })
+  %constant.1 = s32[2,3]{1,0} constant({ { 1, 2, 3 }, { 4, 5, 6 } })
   %tuple = (f32[3]{0}, s32[2,3]{1,0}) tuple(f32[3]{0} %constant, s32[2,3]{1,0} %constant.1)
   ROOT %get-tuple-element = s32[2,3]{1,0} get-tuple-element((f32[3]{0}, s32[2,3]{1,0}) %tuple), index=1, sharding={maximal device=0}
 }
@@ -434,7 +447,7 @@ ENTRY %ConvolveBackward (input: f32[128,7,7,512], filter: f32[3,3,512,512]) -> f
 R"(HloModule Reverse4DFloatArrayOnDim01_module
 
 ENTRY %Reverse4DFloatArrayOnDim01.v2 () -> f32[4,3,2,1] {
-  %constant = f32[4,3,2,1]{0,1,2,3} constant(f32[4,3,2,1] { { /*i0=0*/ { /*i1=0*/ {1}, {2} }, { /*i1=1*/ {3}, {4} }, { /*i1=2*/ {5}, {6} } }, { /*i0=1*/ { /*i1=0*/ {7}, {8} }, { /*i1=1*/ {9}, {10} }, { /*i1=2*/ {11}, {12} } }, { /*i0=2*/ { /*i1=0*/ {13}, {14} }, { /*i1=1*/ {15}, {16} }, { /*i1=2*/ {17}, {18} } }, { /*i0=3*/ { /*i1=0*/ {19}, {20} }, { /*i1=1*/ {21}, {22} }, { /*i1=2*/ {23}, {24} } } })
+  %constant = f32[4,3,2,1]{0,1,2,3} constant({ { /*i0=0*/ { /*i1=0*/ {1}, {2} }, { /*i1=1*/ {3}, {4} }, { /*i1=2*/ {5}, {6} } }, { /*i0=1*/ { /*i1=0*/ {7}, {8} }, { /*i1=1*/ {9}, {10} }, { /*i1=2*/ {11}, {12} } }, { /*i0=2*/ { /*i1=0*/ {13}, {14} }, { /*i1=1*/ {15}, {16} }, { /*i1=2*/ {17}, {18} } }, { /*i0=3*/ { /*i1=0*/ {19}, {20} }, { /*i1=1*/ {21}, {22} }, { /*i1=2*/ {23}, {24} } } })
   ROOT %reverse = f32[4,3,2,1]{0,1,2,3} reverse(f32[4,3,2,1]{0,1,2,3} %constant), dimensions={0,1}
 }
 
@@ -446,8 +459,8 @@ ENTRY %Reverse4DFloatArrayOnDim01.v2 () -> f32[4,3,2,1] {
 R"(HloModule Concat2x3With2x5_module
 
 ENTRY %Concat2x3With2x5.v3 () -> f32[2,8] {
-  %constant = f32[2,3]{1,0} constant(f32[2,3] { { 0, 1, 2 }, { 1000, 1001, 1002 } })
-  %constant.1 = f32[2,5]{1,0} constant(f32[2,5] { { 64, 65, 66, 67, 68 }, { 1064, 1065, 1066, 1067, 1068 } })
+  %constant = f32[2,3]{1,0} constant({ { 0, 1, 2 }, { 1000, 1001, 1002 } })
+  %constant.1 = f32[2,5]{1,0} constant({ { 64, 65, 66, 67, 68 }, { 1064, 1065, 1066, 1067, 1068 } })
   ROOT %concatenate = f32[2,8]{1,0} concatenate(f32[2,3]{1,0} %constant, f32[2,5]{1,0} %constant.1), dimensions={1}
 }
 
@@ -471,8 +484,8 @@ R"(HloModule R4F32OverlapSmall_module
 }
 
 ENTRY %R4F32OverlapSmall.v4 () -> f32[4,5,1,1] {
-  %constant = f32[4,5,1,1]{3,2,1,0} constant(f32[4,5,1,1] { { /*i0=0*/ { /*i1=0*/ {7} }, { /*i1=1*/ {2} }, { /*i1=2*/ {5} }, { /*i1=3*/ {3} }, { /*i1=4*/ {8} } }, { /*i0=1*/ { /*i1=0*/ {3} }, { /*i1=1*/ {8} }, { /*i1=2*/ {9} }, { /*i1=3*/ {3} }, { /*i1=4*/ {4} } }, { /*i0=2*/ { /*i1=0*/ {1} }, { /*i1=1*/ {5} }, { /*i1=2*/ {7} }, { /*i1=3*/ {5} }, { /*i1=4*/ {6} } }, { /*i0=3*/ { /*i1=0*/ {0} }, { /*i1=1*/ {6} }, { /*i1=2*/ {2} }, { /*i1=3*/ {10} }, { /*i1=4*/ {2} } } })
-  %constant.1 = f32[2,2,1,1]{3,2,1,0} constant(f32[2,2,1,1] { { /*i0=0*/ { /*i1=0*/ {2} }, { /*i1=1*/ {6} } }, { /*i0=1*/ { /*i1=0*/ {3} }, { /*i1=1*/ {1} } } })
+  %constant = f32[4,5,1,1]{3,2,1,0} constant({ { /*i0=0*/ { /*i1=0*/ {7} }, { /*i1=1*/ {2} }, { /*i1=2*/ {5} }, { /*i1=3*/ {3} }, { /*i1=4*/ {8} } }, { /*i0=1*/ { /*i1=0*/ {3} }, { /*i1=1*/ {8} }, { /*i1=2*/ {9} }, { /*i1=3*/ {3} }, { /*i1=4*/ {4} } }, { /*i0=2*/ { /*i1=0*/ {1} }, { /*i1=1*/ {5} }, { /*i1=2*/ {7} }, { /*i1=3*/ {5} }, { /*i1=4*/ {6} } }, { /*i0=3*/ { /*i1=0*/ {0} }, { /*i1=1*/ {6} }, { /*i1=2*/ {2} }, { /*i1=3*/ {10} }, { /*i1=4*/ {2} } } })
+  %constant.1 = f32[2,2,1,1]{3,2,1,0} constant({ { /*i0=0*/ { /*i1=0*/ {2} }, { /*i1=1*/ {6} } }, { /*i0=1*/ { /*i1=0*/ {3} }, { /*i1=1*/ {1} } } })
   %constant.2 = f32[] constant(0)
   ROOT %select-and-scatter = f32[4,5,1,1]{3,2,1,0} select-and-scatter(f32[4,5,1,1]{3,2,1,0} %constant, f32[2,2,1,1]{3,2,1,0} %constant.1, f32[] %constant.2), window={size=2x3x1x1 stride=2x2x1x1}, select=%ge_F32.v3, scatter=%add_F32.v3
 }
@@ -523,7 +536,7 @@ ENTRY %slice.v2 (p0: f32[3,3,4,4]) -> f32[3,3,2,4] {
 R"(HloModule Slice3x3x3_To_1x3x3_F32_module
 
 ENTRY %Slice3x3x3_To_1x3x3_F32.v2 () -> f32[1,3,3] {
-  %constant = f32[3,3,3]{2,1,0} constant(f32[3,3,3] { { { 0, 1, 2 }, { 3, 4, 5 }, { 6, 7, 8 } }, { { 9, 10, 11 }, { 12, 13, 14 }, { 15, 16, 17 } }, { { 18, 19, 20 }, { 21, 22, 23 }, { 24, 25, 26 } } })
+  %constant = f32[3,3,3]{2,1,0} constant({ { { 0, 1, 2 }, { 3, 4, 5 }, { 6, 7, 8 } }, { { 9, 10, 11 }, { 12, 13, 14 }, { 15, 16, 17 } }, { { 18, 19, 20 }, { 21, 22, 23 }, { 24, 25, 26 } } })
   ROOT %slice = f32[1,3,3]{2,1,0} slice(f32[3,3,3]{2,1,0} %constant), slice={[0:1], [0:3], [0:3]}
 }
 
@@ -547,10 +560,21 @@ ENTRY %SliceR0.v2 () -> s32[] {
 R"(HloModule Transpose_module
 
 ENTRY %Transpose.v2 () -> s32[1,2,3] {
-  %constant = s32[1,2,3]{2,1,0} constant(s32[1,2,3] { { { 1, 2, 3 }, { 4, 5, 6 } } })
+  %constant = s32[1,2,3]{2,1,0} constant({ { { 1, 2, 3 }, { 4, 5, 6 } } })
   ROOT %transpose = s32[1,2,3]{2,1,0} transpose(s32[1,2,3]{2,1,0} %constant), dimensions={0,1,2}
 }
 
+)"
+},
+{
+"TransposeC128",
+R"(HloModule TransposeC128_module
+
+ENTRY %Transpose.v3 (input: c128[1,2,3]) -> c128[1,2,3] {
+  %input = c128[1,2,3]{2,1,0} parameter(0)
+  ROOT %transpose = c128[1,2,3]{2,1,0} transpose(c128[1,2,3]{2,1,0} %input), dimensions={0,1,2}
+}
+
 )"
 },
 // Dynamic slice
@@ -566,12 +590,26 @@ ENTRY %DynamicSlice.v5 (original_parameter: s32[2,2,258], start_index: s32[1]) -
   ROOT %dynamic-slice = s32[2,2,258]{2,1,0} dynamic-slice(s32[2,2,258]{2,1,0} %original_parameter, s32[3]{0} %concatenate), dynamic_slice_sizes={2,2,258}
 }
 
+)"
+},
+// Dynamic slice with scalar indices
+{
+"DynamicSliceScalarIndices",
+R"(HloModule DynamicSlice_module
+
+ENTRY %DynamicSlice.v5 (original_parameter: s32[2,2,258], start_index: s32[]) -> s32[2,2,258] {
+  %original_parameter = s32[2,2,258]{2,1,0} parameter(0)
+  %constant = s32[] constant(0)
+  %start_index = s32[] parameter(1)
+  ROOT %dynamic-slice = s32[2,2,258]{2,1,0} dynamic-slice(s32[2,2,258]{2,1,0} %original_parameter, s32[] %constant, s32[] %constant, s32[] %start_index), dynamic_slice_sizes={2,2,258}
+}
+
 )"
 },
 // Dynamic update slice
 {
 "DynamicUpdateSlice",
-R"(HloModule DynamicUpdateSlice_module
+R"(HloModule DynamicSlice_module
 
 ENTRY %DynamicUpdateSlice.v4 (input: s32[1,1,25,1], update: s32[1,1,2,1], start_indices: s32[4]) -> s32[1,1,25,1] {
   %input = s32[1,1,25,1]{3,2,1,0} parameter(0)
@@ -580,6 +618,23 @@ ENTRY %DynamicUpdateSlice.v4 (input: s32[1,1,25,1], update: s32[1,1,2,1], start_
   ROOT %dynamic-update-slice = s32[1,1,25,1]{3,2,1,0} dynamic-update-slice(s32[1,1,25,1]{3,2,1,0} %input, s32[1,1,2,1]{3,2,1,0} %update, s32[4]{0} %start_indices)
 }
 
+)"
+},
+// Dynamic update slice with scalar indices
+{
+"DynamicUpdateSliceScalarIndex",
+R"(HloModule DynamicUpdateSlice_module
+
+ENTRY %DynamicUpdateSlice.v4 (input: s32[1,1,25,1], update: s32[1,1,2,1], start_index.0: s32[], start_index.1: s32[], start_index.2: s32[], start_index.3: s32[]) -> s32[1,1,25,1] {
+  %input = s32[1,1,25,1]{3,2,1,0} parameter(0)
+  %update = s32[1,1,2,1]{3,2,1,0} parameter(1)
+  %start_index.0 = s32[] parameter(2)
+  %start_index.1 = s32[] parameter(3)
+  %start_index.2 = s32[] parameter(4)
+  %start_index.3 = s32[] parameter(5)
+  ROOT %dynamic-update-slice = s32[1,1,25,1]{3,2,1,0} dynamic-update-slice(s32[1,1,25,1]{3,2,1,0} %input, s32[1,1,2,1]{3,2,1,0} %update, s32[] %start_index.0, s32[] %start_index.1, s32[] %start_index.2, s32[] %start_index.3)
+}
+
 )"
 },
 // batch norm training
@@ -588,7 +643,7 @@ ENTRY %DynamicUpdateSlice.v4 (input: s32[1,1,25,1], update: s32[1,1,2,1], start_
 R"(HloModule BasicTraining_module
 
 ENTRY %BasicTraining.v4 () -> (f32[2,2,1,2], f32[2], f32[2]) {
-  %constant = f32[2,2,1,2]{3,2,1,0} constant(f32[2,2,1,2] { { /*i0=0*/ { /*i1=0*/ { 1, 2 } }, { /*i1=1*/ { 3, 4 } } }, { /*i0=1*/ { /*i1=0*/ { 5, 6 } }, { /*i1=1*/ { 7, 8 } } } })
+  %constant = f32[2,2,1,2]{3,2,1,0} constant({ { /*i0=0*/ { /*i1=0*/ { 1, 2 } }, { /*i1=1*/ { 3, 4 } } }, { /*i0=1*/ { /*i1=0*/ { 5, 6 } }, { /*i1=1*/ { 7, 8 } } } })
   %constant.1 = f32[2]{0} constant({2, 3})
   %constant.2 = f32[2]{0} constant({1, 2})
   ROOT %batch-norm-training = (f32[2,2,1,2]{3,2,1,0}, f32[2]{0}, f32[2]{0}) batch-norm-training(f32[2,2,1,2]{3,2,1,0} %constant, f32[2]{0} %constant.1, f32[2]{0} %constant.2), epsilon=0.001, feature_index=3
@@ -728,7 +783,7 @@ R"(HloModule fusion_module
 }
 
 ENTRY %fusion.v3 () -> f32[3,2,1,1] {
-  %constant = f32[3,2,1,1]{3,2,1,0} constant(f32[3,2,1,1] { { /*i0=0*/ { /*i1=0*/ {-1} }, { /*i1=1*/ {4.1} } }, { /*i0=1*/ { /*i1=0*/ {2} }, { /*i1=1*/ {4.1} } }, { /*i0=2*/ { /*i1=0*/ {5} }, { /*i1=1*/ {4.4} } } })
+  %constant = f32[3,2,1,1]{3,2,1,0} constant({ { /*i0=0*/ { /*i1=0*/ {-1} }, { /*i1=1*/ {4.1} } }, { /*i0=1*/ { /*i1=0*/ {2} }, { /*i1=1*/ {4.1} } }, { /*i0=2*/ { /*i1=0*/ {5} }, { /*i1=1*/ {4.4} } } })
   %constant.1 = f32[2]{0} constant({3.14, 4.25})
   ROOT %fusion = f32[3,2,1,1]{3,2,1,0} fusion(f32[3,2,1,1]{3,2,1,0} %constant, f32[2]{0} %constant.1), kind=kLoop, calls=%fused_computation
 }
@@ -740,7 +795,17 @@ ENTRY %fusion.v3 () -> f32[3,2,1,1] {
 R"(HloModule sparse_f32
 
 ENTRY %sparse () -> f32[2,3,4] {
-  ROOT %foo = f32[2,3,4]sparse{10} constant(f32[2,3,4]{[0, 1, 2]: 1, [1, 2, 3]: 2, [2, 3, 4]: 3})
+  ROOT %foo = f32[2,3,4]sparse{10} constant({[0, 1, 2]: 1, [1, 2, 2]: 2, [1, 2, 3]: 3})
+}
+
+)"
+},
+{
+"SparseC128",
+R"(HloModule sparse_c128
+
+ENTRY %sparse () -> c128[2,3,4] {
+  ROOT %foo = c128[2,3,4]sparse{10} constant({[0, 1, 2]: (1, 0), [1, 2, 2]: (2, 5), [1, 2, 3]: (3, 10)})
 }
 
 )"
@@ -750,7 +815,7 @@ ENTRY %sparse () -> f32[2,3,4] {
 R"(HloModule sparse_f32_empty
 
 ENTRY %sparse_f32_empty () -> f32[2,3,4] {
-  ROOT %foo = f32[2,3,4]sparse{10} constant(f32[2,3,4]{})
+  ROOT %foo = f32[2,3,4]sparse{10} constant({})
 }
 
 )"
@@ -760,7 +825,7 @@ ENTRY %sparse_f32_empty () -> f32[2,3,4] {
 R"(HloModule sparse_f32_r1
 
 ENTRY %sparse_f32_r1 () -> f32[9] {
-  ROOT %foo = f32[9]sparse{10} constant(f32[9]{1: 2, 3: 4, 5: 6})
+  ROOT %foo = f32[9]sparse{10} constant({1: 2, 3: 4, 5: 6})
 }
 
 )"
@@ -852,6 +917,28 @@ ENTRY %CustomCallWithLayoutConstraints (p0: (f32[2,2], f32[42,2,3]), p1: f32[123
   ROOT %custom-call = (f32[1,2,3]{0,2,1}, f32[1,2,3]{1,2,0}) custom-call((f32[2,2]{0,1}, f32[42,2,3]{0,1,2}) %p0, f32[123,4]{0,1} %p1), custom_call_target="baz", operand_layout_constraints={(f32[2,2]{1,0}, f32[42,2,3]{2,0,1}), f32[123,4]{1,0}}
 }
 
+)"
+},
+// Parse c64 literal
+{
+"ParseC64Literal",
+R"(HloModule ParseC64Literal
+
+ENTRY %ParseC64Literal () -> c64[2] {
+  ROOT %c = c64[2]{0} constant({(1, 2), (-inf, nan)})
+}
+
+)"
+},
+// Parse c128 literal
+{
+"ParseC128Literal",
+R"(HloModule ParseC128Literal
+
+ENTRY %ParseC128Literal () -> c128[2] {
+  ROOT %c = c128[2]{0} constant({(1, 2), (-inf, nan)})
+}
+
 )"
 },
   });
@@ -931,11 +1018,11 @@ ENTRY reduce_entry {
 R"(HloModule outfeed_module
 
 ENTRY InfeedToOutfeed {
-  token = token[] after-all()
-  infeed = ((u32[3]{0}, pred[]), token[]) infeed(token)
+  token0 = token[] after-all()
+  infeed = ((u32[3]{0}, pred[]), token[]) infeed(token0)
   infeed.data = (u32[3]{0}, pred[]) get-tuple-element(infeed), index=0
-  outfeed = token[] outfeed(infeed.data, token)
-  ROOT infeed.1 = ((u32[3]{0}, pred[]), token[]) infeed(token)
+  outfeed = token[] outfeed(infeed.data, token0)
+  ROOT infeed.1 = ((u32[3]{0}, pred[]), token[]) infeed(token0)
   infeed.1.data = (u32[3]{0}, pred[]) get-tuple-element(infeed.1), index=0
   infeed.1.token = token[] get-tuple-element(infeed.1), index=1
   outfeed.1 = token[] outfeed(infeed.1.data, infeed.1.token)
@@ -973,9 +1060,15 @@ ENTRY ReducePrecision {
 "SortKey",
 R"(HloModule sort
 
+compare {
+  p.0.lhs = f32[] parameter(0)
+  p.0.rhs = f32[] parameter(1)
+  ROOT lt = pred[] less-than(p.0.lhs, p.0.rhs)
+}
+
 ENTRY Sort {
   x = f32[1024]{0} parameter(0)
-  ROOT sorted = f32[1024]{0} sort(x), dimensions={0}
+  ROOT sorted = f32[1024]{0} sort(x), dimensions={0}, to_apply=compare
 }
 
 )"
@@ -985,10 +1078,18 @@ ENTRY Sort {
 "SortKeyValue",
 R"(HloModule sort
 
+compare {
+  p.1.lhs = s32[] parameter(2)
+  p.1.rhs = s32[] parameter(3)
+  p.0.lhs = f32[] parameter(0)
+  p.0.rhs = f32[] parameter(1)
+  ROOT lt = pred[] less-than(p.0.lhs, p.0.rhs)
+}
+
 ENTRY Sort {
   keys = f32[1024]{0} parameter(0)
   values = s32[1024]{0} parameter(1)
-  ROOT sorted = (f32[1024]{0}, s32[1024]{0}) sort(keys, values), dimensions={0}
+  ROOT sorted = (f32[1024]{0}, s32[1024]{0}) sort(keys, values), dimensions={0}, to_apply=compare
 }
 
 )"
@@ -998,9 +1099,15 @@ ENTRY Sort {
 "SortKeyR2",
 R"(HloModule sort
 
+compare {
+  p.0.lhs = f32[] parameter(0)
+  p.0.rhs = f32[] parameter(1)
+  ROOT lt = pred[] less-than(p.0.lhs, p.0.rhs)
+}
+
 ENTRY Sort {
   x = f32[1024,16]{0,1} parameter(0)
-  ROOT sorted = f32[1024,16]{0,1} sort(x), dimensions={0}
+  ROOT sorted = f32[1024,16]{0,1} sort(x), dimensions={0}, to_apply=compare
 }
 
 )"
@@ -1010,10 +1117,18 @@ ENTRY Sort {
 "SortKeyValueR2",
 R"(HloModule sort
 
+compare {
+  p.1.lhs = s32[] parameter(2)
+  p.1.rhs = s32[] parameter(3)
+  p.0.lhs = f32[] parameter(0)
+  p.0.rhs = f32[] parameter(1)
+  ROOT lt = pred[] less-than(p.0.lhs, p.0.rhs)
+}
+
 ENTRY Sort {
   keys = f32[1024,16]{0,1} parameter(0)
   values = s32[1024,16]{0,1} parameter(1)
-  ROOT sorted = (f32[1024,16]{0,1}, s32[1024,16]{0,1}) sort(keys, values), dimensions={0}
+  ROOT sorted = (f32[1024,16]{0,1}, s32[1024,16]{0,1}) sort(keys, values), dimensions={0}, to_apply=compare
 }
 
 )"
@@ -1023,12 +1138,42 @@ ENTRY Sort {
 "SortManyValues",
 R"(HloModule sort
 
+compare {
+  p.1.lhs = s32[] parameter(2)
+  p.1.rhs = s32[] parameter(3)
+  p.2.lhs = u32[] parameter(4)
+  p.2.rhs = u32[] parameter(5)
+  p.3.lhs = f32[] parameter(6)
+  p.3.rhs = f32[] parameter(7)
+  p.0.lhs = f32[] parameter(0)
+  p.0.rhs = f32[] parameter(1)
+  ROOT lt = pred[] less-than(p.0.lhs, p.0.rhs)
+}
+
 ENTRY Sort {
   keys = f32[1024,16]{0,1} parameter(0)
   values.0 = s32[1024,16]{0,1} parameter(1)
   values.1 = u32[1024,16]{0,1} parameter(2)
   values.2 = f32[1024,16]{0,1} parameter(3)
-  ROOT sorted = (f32[1024,16]{0,1}, s32[1024,16]{0,1}, u32[1024,16]{0,1}, f32[1024,16]{0,1}) sort(keys, values.0, values.1, values.2), dimensions={0}
+  ROOT sorted = (f32[1024,16]{0,1}, s32[1024,16]{0,1}, u32[1024,16]{0,1}, f32[1024,16]{0,1}) sort(keys, values.0, values.1, values.2), dimensions={0}, to_apply=compare
+}
+
+)"
+},
+// Sort (Key) is_stable=true
+{
+"SortKeyStable",
+R"(HloModule sort
+
+compare {
+  p.0.lhs = f32[] parameter(0)
+  p.0.rhs = f32[] parameter(1)
+  ROOT lt = pred[] less-than(p.0.lhs, p.0.rhs)
+}
+
+ENTRY Sort {
+  x = f32[1024]{0} parameter(0)
+  ROOT sorted = f32[1024]{0} sort(x), dimensions={0}, is_stable=true, to_apply=compare
 }
 
 )"
@@ -1117,9 +1262,9 @@ ENTRY Gather {
 
 )"
 },
-// cross-replica-sum
+// all-reduce
 {
-"CrossReplicaSum",
+"AllReduce",
 R"(HloModule CRS
 
 add {
@@ -1130,14 +1275,14 @@ add {
 
 ENTRY CRS {
   input = f32[8]{0} parameter(0)
-  ROOT crs = f32[8]{0} cross-replica-sum(input), replica_groups={}, to_apply=add
+  ROOT crs = f32[8]{0} all-reduce(input), replica_groups={}, to_apply=add
 }
 
 )"
 },
-// cross-replica-sum with subgroups
+// all-reduce with subgroups
 {
-"CrossReplicaSumWithSubgroups",
+"AllReduceWithSubgroups",
 R"(HloModule CRS_Subgroups
 
 add {
@@ -1146,16 +1291,16 @@ add {
   ROOT add = f32[] add(lhs, rhs)
 }
 
-ENTRY CrossReplicaSumWithSubgroups {
+ENTRY AllReduceWithSubgroups {
   input = f32[128,32]{0,1} parameter(0)
-  ROOT cross-replica-sum = f32[128,32]{0,1} cross-replica-sum(input), replica_groups={{0,1},{2,3}}, barrier="abc", to_apply=add
+  ROOT all-reduce = f32[128,32]{0,1} all-reduce(input), replica_groups={{0,1},{2,3}}, barrier="abc", to_apply=add
 }
 
 )"
 },
-// cross-replica-sum with all-reduce-id
+// all-reduce with all-reduce-id
 {
-"CrossReplicaSumAllReduce",
+"AllReduceAllReduce",
 R"(HloModule CRS
 
 add {
@@ -1166,8 +1311,8 @@ add {
 
 ENTRY CRS {
   input = f32[8]{0} parameter(0)
-  crs.1 = f32[8]{0} cross-replica-sum(input), replica_groups={{0}}, all_reduce_id=1, to_apply=add
-  ROOT crs.0 = f32[8]{0} cross-replica-sum(input), replica_groups={{0}}, all_reduce_id=1, to_apply=add
+  crs.1 = f32[8]{0} all-reduce(input), replica_groups={{0}}, all_reduce_id=1, to_apply=add
+  ROOT crs.0 = f32[8]{0} all-reduce(input), replica_groups={{0}}, all_reduce_id=1, to_apply=add
 }
 
 )"
@@ -1206,6 +1351,17 @@ ENTRY CollectivePermute {
   ROOT root = f32[128,32]{0,1} collective-permute(input), source_target_pairs={{0,1},{1,2},{2,3}}
 }
 
+)"
+},
+// replica-id
+{
+"ReplicaId",
+R"(HloModule replica-id
+
+ENTRY Replica-id {
+  ROOT replica-id = u32[] replica-id()
+}
+
 )"
 },
 // Iota
@@ -1235,10 +1391,18 @@ ENTRY Computation {
 "ScheduledModule",
 R"(HloModule scheduled_module, is_scheduled=true
 
+compare {
+  p.1.lhs = s32[] parameter(2)
+  p.1.rhs = s32[] parameter(3)
+  p.0.lhs = f32[] parameter(0)
+  p.0.rhs = f32[] parameter(1)
+  ROOT lhs = pred[] less-than(p.0.lhs, p.0.rhs)
+}
+
 ENTRY Sort {
   keys = f32[1024]{0} parameter(0)
   values = s32[1024]{0} parameter(1)
-  ROOT sorted = (f32[1024]{0}, s32[1024]{0}) sort(keys, values), dimensions={0}
+  ROOT sorted = (f32[1024]{0}, s32[1024]{0}) sort(keys, values), dimensions={0}, to_apply=compare
 }
 
 )"
@@ -1266,12 +1430,36 @@ R"(HloModule AddDependency
 ENTRY AddDependency {
   p = f32[] parameter(0)
   neg = f32[] negate(p)
-  token = token[] after-all(neg)
-  p_after_token = f32[] add-dependency(p, token)
+  token0 = token[] after-all(neg)
+  p_after_token = f32[] add-dependency(p, token0)
   exp = f32[] exponential(p_after_token)
   ROOT sum = f32[] add(neg, exp)
 }
 
+)"
+},
+
+// A module containing constants equal to the min/max values of various data
+// types.
+{
+"MinMaxValues",
+R"(HloModule MinMaxValues
+
+ENTRY MinMaxValues {
+  x.s8 = s8[2]{0} constant({-128, 127})
+  x.s16 = s16[2]{0} constant({-32768, 32767})
+  x.s32 = s32[2]{0} constant({-2147483648, 2147483647})
+  x.u8 = u8[2]{0} constant({0, 255})
+  x.u16 = u16[2]{0} constant({0, 65535})
+  x.u32 = u32[2]{0} constant({0, 4294967295})
+  x.f16 = f16[2]{0} constant({-65504, 65504})
+  x.bf16 = bf16[2]{0} constant({-3.38953e+38, 3.38953e+38})
+  x.f32 = f32[2]{0} constant({-3.40282e+38, 3.40282e+38})
+  x.f64 = f64[2]{0} constant({-1.79769e+308, 1.79769e+308})
+  x.c64 = c64[2]{0} constant({(-3.40282e+38, 3.40282e+38), (3.40282e+38, -3.40282e+38)})
+  ROOT c.c128 = c128[2]{0} constant({(-1.79769e+308, 1.79769e+308), (1.79769e+308, -1.79769e+308)})
+}
+
 )"
 },
 });
@@ -1298,7 +1486,7 @@ class HloParameterizedParserTest
  protected:
   // Expects "ToString(ParseHloString(string)) == string", that is, parses the
   // string, asserts that it succeeded, stringifies the parsed module, and
-  // checks that the it equals the original string.
+  // checks that it equals the original string.
   void ExpectEqual() {
     const string& original = GetParam().module_string;
     TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
@@ -1329,20 +1517,20 @@ TEST_P(HloParserTestLongProto, Run) { ExpectEqual(); }
 TEST_P(HloParserTestShort, Run) { ExpectEqual(); }
 TEST_P(HloParserTestShortProto, Run) { ExpectEqual(); }
 
-INSTANTIATE_TEST_CASE_P(HloParserTestSuccessInstantiation, HloParserTestLong,
-                        ::testing::ValuesIn(CreateTestCases()),
-                        TestDataToString);
-INSTANTIATE_TEST_CASE_P(HloParserTestSuccessInstantiation,
-                        HloParserTestLongProto,
-                        ::testing::ValuesIn(CreateTestCases()),
-                        TestDataToString);
-INSTANTIATE_TEST_CASE_P(HloParserTestSuccessInstantiation, HloParserTestShort,
-                        ::testing::ValuesIn(CreateShortTestCases()),
-                        TestDataToString);
-INSTANTIATE_TEST_CASE_P(HloParserTestSuccessInstantiation,
-                        HloParserTestShortProto,
-                        ::testing::ValuesIn(CreateShortTestCases()),
-                        TestDataToString);
+INSTANTIATE_TEST_SUITE_P(HloParserTestSuccessInstantiation, HloParserTestLong,
+                         ::testing::ValuesIn(CreateTestCases()),
+                         TestDataToString);
+INSTANTIATE_TEST_SUITE_P(HloParserTestSuccessInstantiation,
+                         HloParserTestLongProto,
+                         ::testing::ValuesIn(CreateTestCases()),
+                         TestDataToString);
+INSTANTIATE_TEST_SUITE_P(HloParserTestSuccessInstantiation, HloParserTestShort,
+                         ::testing::ValuesIn(CreateShortTestCases()),
+                         TestDataToString);
+INSTANTIATE_TEST_SUITE_P(HloParserTestSuccessInstantiation,
+                         HloParserTestShortProto,
+                         ::testing::ValuesIn(CreateShortTestCases()),
+                         TestDataToString);
 
 class HloParserTest : public ::testing::Test {
  protected:
@@ -1419,7 +1607,7 @@ TEST_F(HloParserTest, MoreConstants) {
 
 ENTRY %SelectScalarS32True.v4 () -> s32[] {
   %constant.2 = pred[] constant(true)
-  %constant.1 = s32[] constant(-42), sharding={s32[5,6] devices=[2,2]1,2,3,4}
+  %constant.1 = s32[] constant(-42), sharding={devices=[2,2]1,2,3,4}
   %constant = s32[] constant(42)
   %select = s32[] select(pred[] %constant.2, s32[] %constant.1, s32[] %constant)
 }
@@ -1462,7 +1650,7 @@ TEST_F(HloParserTest, LiteralDimensionsMismatch_2) {
   const string original = R"(HloModule some_2x3_module
 
 ENTRY %some_2x3 () -> f32[2,3] {
-  ROOT %constant = f32[2,3]{1,0} constant(f32[2,3] {1, 2, 3, 4, 5, 6})
+  ROOT %constant = f32[2,3]{1,0} constant({1, 2, 3, 4, 5, 6})
 }
 
 )";
@@ -1476,7 +1664,7 @@ TEST_F(HloParserTest, LiteralDimensionsMismatch_3) {
   const string original = R"(HloModule some_2x3x2_module
 
 ENTRY %some_2x3x2 () -> f32[2,3,2] {
-  ROOT %constant = f32[2,3,2]{2,1,0} constant(f32[2,3,2] {{{1, 2}, {3, 4}, {5, 6}, {7, 8}, {9, 10}, {11, 12}}})
+  ROOT %constant = f32[2,3,2]{2,1,0} constant({{{1, 2}, {3, 4}, {5, 6}, {7, 8}, {9, 10}, {11, 12}}})
 }
 
 )";
@@ -1501,6 +1689,37 @@ ENTRY %ConstantF16Overflow.v4 () -> f16[] {
                   "is out of range for literal's primitive type F16");
 }
 
+TEST_F(HloParserTest, ConstantBf16NoOverflow) {
+  // 65505 is in range for bf16.
+  const string original = R"(
+  HloModule test_module
+  ENTRY test {
+    ROOT c = bf16[] constant(-65505)
+  })";
+  EXPECT_EQ(Status::OK(), ParseHloString(original).status());
+}
+
+TEST_F(HloParserTest, ConstantBf16Overflow) {
+  // 1e100 is out of range for bf16.
+  const string original = R"(
+  HloModule test_module
+  ENTRY test {
+    ROOT c = bf16[] constant(1e100)
+  })";
+  ExpectHasSubstr(ParseHloString(original).status().error_message(),
+                  "out of range");
+}
+
+TEST_F(HloParserTest, ConstantF16OverflowInSparseArray) {
+  const string original = R"(
+    HloModule test_module
+    ENTRY test {
+      ROOT c = f16[5]sparse{10} constant({[0]: 0, [1]: -65505})
+    })";
+  ExpectHasSubstr(ParseHloString(original).status().error_message(),
+                  "is out of range for literal's primitive type F16");
+}
+
 TEST_F(HloParserTest, ConstantUnsignedUnderflow) {
   const string original = R"(
       HloModule ConstantUnsignedUnderflow_module
@@ -1535,6 +1754,46 @@ TEST_F(HloParserTest, ConstantUnsignedInt64Overflow) {
   EXPECT_NE(Status::OK(), result.status());
 }
 
+TEST_F(HloParserTest, ConstantC64Overflow) {
+  const string original = R"(
+      HloModule test_module
+      ENTRY test () -> c64[] {
+        ROOT c = c64[] constant((1e100, 0))
+      })";
+  auto result = ParseHloString(original);
+  EXPECT_NE(Status::OK(), result.status());
+}
+
+TEST_F(HloParserTest, ConstantC64Underflow) {
+  const string original = R"(
+      HloModule test_module
+      ENTRY test () -> c64[] {
+        ROOT c = c64[] constant((0, -1e100))
+      })";
+  auto result = ParseHloString(original);
+  EXPECT_NE(Status::OK(), result.status());
+}
+
+TEST_F(HloParserTest, ConstantF64Overflow) {
+  const string original = R"(
+      HloModule test_module
+      ENTRY test {
+        ROOT c = f64[] constant(1.8e308)
+      })";
+  auto result = ParseHloString(original);
+  EXPECT_NE(Status::OK(), result.status());
+}
+
+TEST_F(HloParserTest, ConstantF64Underflow) {
+  const string original = R"(
+      HloModule test_module
+      ENTRY test {
+        ROOT c = f64[] constant(-1.8e308)
+      })";
+  auto result = ParseHloString(original);
+  EXPECT_NE(Status::OK(), result.status());
+}
+
 TEST_F(HloParserTest, ConstantWithExp) {
   const string original = R"(HloModule ConstantWithExp_module
 
@@ -1550,6 +1809,19 @@ ENTRY %ConstantWithExp.v4 () -> f32[] {
   // printed as "300".
 }
 
+TEST_F(HloParserTest, ShortConstant) {
+  const string original = R"(HloModule ShortCOnstant_module
+
+ENTRY %ShortConstant.v4 () -> f32[67,89] {
+  ROOT %constant.1 = f32[67,89]{1,0} constant({...})
+}
+
+)";
+  auto result = ParseHloString(original);
+  TF_EXPECT_OK(result.status());
+  EXPECT_EQ(result.ValueOrDie()->ToString(HloPrintOptions()), original);
+}
+
 TEST_F(HloParserTest, AttibutesAnyOrder) {
   const string original = R"(HloModule any_order_module
 
@@ -1594,11 +1866,11 @@ TEST_F(HloParserTest, UnexpectedAttribute) {
   const string original = R"(HloModule unexpected_attr_module
 
 ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] {
-  %token = token[] after-all()
-  %recv = (f32[], u32[], token[]) recv(token[] %token), channel_id=15
+  %token0 = token[] after-all()
+  %recv = (f32[], u32[], token[]) recv(token[] %token0), channel_id=15
   %recv-done = (f32[], token[]) recv-done((f32[], u32[], token[]) %recv), channel_id=15
   ROOT %constant = f32[] constant(2.1)
-  %send = (f32[], u32[], token[]) send(f32[] %constant, token[] %token), channel_id=16, calls=%recv
+  %send = (f32[], u32[], token[]) send(f32[] %constant, token[] %token0), channel_id=16, calls=%recv
   %send-done = token[] send-done((f32[], u32[], token[]) %send), channel_id=16
 }
 
@@ -1611,11 +1883,11 @@ TEST_F(HloParserTest, MissingAttribute) {
   const string original = R"(HloModule missing_attr_module
 
 ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] {
-  %token = token[] after-all()
-  %recv = (f32[], u32[], token[]) recv(token[] %token), channel_id=15
+  %token0 = token[] after-all()
+  %recv = (f32[], u32[], token[]) recv(token[] %token0), channel_id=15
   %recv-done = (f32[], token[]) recv-done((f32[], u32[], token[]) %recv), channel_id=15
   ROOT %constant = f32[] constant(-2.1)
-  %send = (f32[], u32[], token[]) send(f32[] %constant, token[] %token)
+  %send = (f32[], u32[], token[]) send(f32[] %constant, token[] %token0)
   %send-done = token[] send-done((f32[], u32[], token[]) %send), channel_id=16
 }
 
@@ -1628,11 +1900,11 @@ TEST_F(HloParserTest, PredecessorUndefined) {
   const string original = R"(HloModule pre_not_found_module
 
 ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] {
-  %token = token[] after-all()
-  %recv = (f32[], u32[], token[]) recv(token[] %token), channel_id=15
+  %token0 = token[] after-all()
+  %recv = (f32[], u32[], token[]) recv(token[] %token0), channel_id=15
   %recv-done = (f32[], token[]) recv-done((f32[], u32[], token[]) %recv), channel_id=15
   ROOT %constant = f32[] constant(2.1)
-  %send = (f32[], u32[], token[]) send(f32[] %constant, token[] %token), channel_id=16, control-predecessors={%done}
+  %send = (f32[], u32[], token[]) send(f32[] %constant, token[] %token0), channel_id=16, control-predecessors={%done}
   %send-done = token[] send-done((f32[], u32[], token[]) %send), channel_id=16
 }
 
@@ -1940,8 +2212,8 @@ TEST_F(HloParserTest, ParsePaddingConfigInteriorPaddingImplicitZeroDim) {
 TEST_F(HloParserTest, NontupleInfeed) {
   const string original = R"(HloModule nontuple_infeed:
 ENTRY nontuple_infeed {
-  token = token[] after-all()
-  ROOT infeed = pred[] infeed(token)
+  token0 = token[] after-all()
+  ROOT infeed = pred[] infeed(token0)
 })";
   ExpectHasSubstr(ParseHloString(original).status().error_message(),
                   "infeed must have a non-empty tuple shape");
@@ -2239,7 +2511,7 @@ HloModule foobar
 
 ENTRY %entrycomp (p: f32[2,2]) -> f32[2,2] {
   %p = f32[2,2] parameter(0)
-  %constant.1 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  %constant.1 = f32[2,2] constant({{1, 2}, {3, 4}})
   ROOT %add.1 = f32[2,2] add(f32[2,2] %p, f32[2,5] %constant.1)
 }
 )";
@@ -2249,7 +2521,218 @@ ENTRY %entrycomp (p: f32[2,2]) -> f32[2,2] {
                   " with the shape of the operand instruction f32[2,2]{1,0}.");
 }
 
-// custom call incompatible shape.
+TEST_F(HloParserTest, OutOfRangeSparseIndex) {
+  const string original = R"(
+    HloModule test_module
+    ENTRY test {
+      ROOT c = f16[5]sparse{10} constant({[100]: 0})
+    })";
+  ExpectHasSubstr(ParseHloString(original).status().error_message(),
+                  "Invalid sparse index");
+}
+
+TEST_F(HloParserTest, NegativeSparseIndex) {
+  const string original = R"(
+    HloModule test_module
+    ENTRY test {
+      ROOT c = f16[5]sparse{10} constant({-1: 0})
+    })";
+  ExpectHasSubstr(ParseHloString(original).status().error_message(),
+                  "Invalid sparse index");
+}
+
+TEST_F(HloParserTest, SparseIndexWithRankTooLarge) {
+  const string original = R"(
+    HloModule test_module
+    ENTRY test {
+      ROOT c = f16[5]sparse{10} constant({[0, 0]: 0})
+    })";
+  ExpectHasSubstr(ParseHloString(original).status().error_message(),
+                  "Invalid sparse index");
+}
+
+TEST_F(HloParserTest, SparseIndexWithRankTooSmall) {
+  const string original = R"(
+    HloModule test_module
+    ENTRY test {
+      ROOT c = f16[5, 5]sparse{10} constant({[0]: 0})
+    })";
+  ExpectHasSubstr(ParseHloString(original).status().error_message(),
+                  "Invalid sparse index");
+}
+
+TEST_F(HloParserTest, ParseShapeStringR2F32) {
+  string shape_string = "f32[123,456]";
+  TF_ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
+  Shape expected = ShapeUtil::MakeShape(F32, {123, 456});
+  ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
+      << "expected: " << ShapeUtil::HumanString(expected)
+      << "actual:   " << ShapeUtil::HumanString(actual);
+}
+
+TEST_F(HloParserTest, ParseShapeStringTupleOfArrays) {
+  string shape_string = "(f32[1572864],s8[5120,1024])";
+  TF_ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
+  Shape expected =
+      ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {1572864}),
+                                 ShapeUtil::MakeShape(S8, {5120, 1024})});
+  ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
+      << "expected: " << ShapeUtil::HumanString(expected)
+      << "actual:   " << ShapeUtil::HumanString(actual);
+}
+
+TEST_F(HloParserTest, ParseShapeStringNestedTuple) {
+  string shape_string = "(f32[1],(f32[2], token[]), opaque[], f32[3])";
+  TF_ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
+  Shape expected = ShapeUtil::MakeTupleShape({
+      ShapeUtil::MakeShape(F32, {1}),
+      ShapeUtil::MakeTupleShape(
+          {ShapeUtil::MakeShape(F32, {2}), ShapeUtil::MakeTokenShape()}),
+      ShapeUtil::MakeOpaqueShape(),
+      ShapeUtil::MakeShape(F32, {3}),
+  });
+  ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
+      << "expected: " << ShapeUtil::HumanString(expected)
+      << "actual:   " << ShapeUtil::HumanString(actual);
+}
+
+TEST_F(HloParserTest, ParseShapeStringWithLayout) {
+  string shape_string = "f32[123,456]{0,1}";
+  TF_ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
+  Shape expected = ShapeUtil::MakeShapeWithLayout(F32, {123, 456}, {0, 1});
+  ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
+      << "expected: " << ShapeUtil::HumanString(expected)
+      << "actual:   " << ShapeUtil::HumanString(actual);
+}
+
+TEST_F(HloParserTest, ParseShapeStringWithTilingLayout) {
+  // One tile.
+  string shape_string = "f32[123,456]{0,1:T(2,128)}";
+  TF_ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
+  Shape expected =
+      ShapeUtil::MakeShapeWithLayout(F32, {123, 456}, {0, 1}, {Tile({2, 128})});
+  EXPECT_EQ(expected, actual)
+      << "expected: " << ShapeUtil::HumanStringWithLayout(expected)
+      << "actual:   " << ShapeUtil::HumanStringWithLayout(actual);
+
+  // Tile with negative dimension size for combining dimensions.
+  shape_string = "f32[123,456,789]{0,1,2:T(2, * , 128)}";
+  TF_ASSERT_OK_AND_ASSIGN(actual, ParseShape(shape_string));
+  expected =
+      ShapeUtil::MakeShapeWithLayout(F32, {123, 456, 789}, {0, 1, 2},
+                                     {Tile({2, Tile::kCombineDimension, 128})});
+  EXPECT_EQ(expected, actual)
+      << "expected: " << ShapeUtil::HumanStringWithLayout(expected)
+      << "actual:   " << ShapeUtil::HumanStringWithLayout(actual);
+
+  // Two tiles.
+  shape_string = "bf16[123,456,789]{2,1,0:T(2,*,128)(2,1)}";
+  TF_ASSERT_OK_AND_ASSIGN(actual, ParseShape(shape_string));
+  expected = ShapeUtil::MakeShapeWithLayout(
+      BF16, {123, 456, 789}, {2, 1, 0},
+      {Tile({2, Tile::kCombineDimension, 128}), Tile({2, 1})});
+  EXPECT_EQ(expected, actual)
+      << "expected: " << ShapeUtil::HumanStringWithLayout(expected)
+      << "actual:   " << ShapeUtil::HumanStringWithLayout(actual);
+
+  // Tile with element size in bits.
+  shape_string = "pred[123,456]{1,0:T(2,128)E(1)}";
+  TF_ASSERT_OK_AND_ASSIGN(actual, ParseShape(shape_string));
+  expected = ShapeUtil::MakeShapeWithLayout(PRED, {123, 456}, {1, 0},
+                                            {Tile({2, 128})}, 1);
+  EXPECT_EQ(expected, actual)
+      << "expected: " << ShapeUtil::HumanStringWithLayout(expected)
+      << "actual:   " << ShapeUtil::HumanStringWithLayout(actual);
+
+  // Element size in bits without tile.
+  shape_string = "pred[123,456]{1,0:E(1)}";
+  TF_ASSERT_OK_AND_ASSIGN(actual, ParseShape(shape_string));
+  expected = ShapeUtil::MakeShapeWithLayout(PRED, {123, 456}, {1, 0}, {}, 1);
+  EXPECT_EQ(expected, actual)
+      << "expected: " << ShapeUtil::HumanStringWithLayout(expected)
+      << "actual:   " << ShapeUtil::HumanStringWithLayout(actual);
+
+  // Wrong minor_to_major.
+  shape_string = "f32[123,456,789]{1:T(2, * , 128)}";
+  auto result = ParseShape(shape_string);
+  ExpectHasSubstr(result.status().error_message(),
+                  "Dimensions size is 3, but minor to major size is 1.");
+}
+
+TEST_F(HloParserTest, ParseShapeStringWithSparseLayout) {
+  string shape_string = "f32[123,456]sparse{10}";
+  TF_ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
+  Shape expected = ShapeUtil::MakeShapeWithSparseLayout(F32, {123, 456}, 10);
+  ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
+      << "expected: " << ShapeUtil::HumanString(expected)
+      << "actual: " << ShapeUtil::HumanString(actual);
+}
+
+TEST_F(HloParserTest, ParseOpaqueType) {
+  TF_ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape("opaque[]"));
+  Shape expected = ShapeUtil::MakeOpaqueShape();
+  ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
+      << "expected: " << ShapeUtil::HumanString(expected)
+      << "actual:   " << ShapeUtil::HumanString(actual);
+}
+
+TEST_F(HloParserTest, ParseTokenType) {
+  TF_ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape("token[]"));
+  Shape expected = ShapeUtil::MakeTokenShape();
+  ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
+      << "expected: " << ShapeUtil::HumanString(expected)
+      << "actual:   " << ShapeUtil::HumanString(actual);
+}
+
+TEST_F(HloParserTest, ParseInvalidShapeString) {
+  string shape_strings[] = {
+      "f32[123,456]foobar{0,1}", "f32[123,456]sparse{0,1}", "f32[123,456]{foo}",
+      "f32[123,456]dense{foo}",  "f32[123,456]sparse{foo}",
+  };
+  for (const string& shape_string : shape_strings) {
+    StatusOr<Shape> result = ParseShape(shape_string);
+    ASSERT_FALSE(result.ok()) << "shape: " << shape_string;
+  }
+}
+
+TEST_F(HloParserTest, ParseDynamicArray) {
+  string shape_string = "f32[123,<=456]";
+  TF_ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
+  Shape expected = ShapeUtil::MakeShape(F32, {123, 456}, {false, true});
+  ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
+      << "expected: " << ShapeUtil::HumanString(expected)
+      << "actual:   " << ShapeUtil::HumanString(actual);
+}
+
+TEST_F(HloParserTest, ParseDynamicTuple) {
+  string shape_string = "(f32[42], u32[<=123,<=456])";
+  TF_ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
+  Shape expected = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(F32, {42}),
+       ShapeUtil::MakeShape(U32, {123, 456}, {true, true})});
+  ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
+      << "expected: " << ShapeUtil::HumanString(expected)
+      << "actual:   " << ShapeUtil::HumanString(actual);
+}
+
+TEST_F(HloParserTest, NegativeParameterNumber) {
+  const string hlo_string = "par0 = f32[3,5] parameter(-1)";
+  auto result = ParseHloString(hlo_string);
+  ASSERT_FALSE(result.status().ok());
+  EXPECT_THAT(result.status().error_message(),
+              ::testing::HasSubstr("parameter number must be >= 0"));
+}
+
+TEST_F(HloParserTest, WrongNumberOfParameterLeafBuffersInReplication) {
+  const string hlo_string =
+      "par0 = (f32[3,5], f32[]) parameter(0), "
+      "parameter_replication={true,false,true}";
+  auto result = ParseHloString(hlo_string);
+  ASSERT_FALSE(result.status().ok());
+  EXPECT_THAT(result.status().error_message(),
+              ::testing::HasSubstr("parameter has 2 leaf buffers, but "
+                                   "parameter_replication has 3 elements"));
+}
 
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_pass_fix.h b/tensorflow/compiler/xla/service/hlo_pass_fix.h
index 791b1a97b0b82edf19ff1588fd8d5d996ac0fef4..35dc9c0029f9871334cb500c6b71f0c86ab136d7 100644
--- a/tensorflow/compiler/xla/service/hlo_pass_fix.h
+++ b/tensorflow/compiler/xla/service/hlo_pass_fix.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include <algorithm>
 
 #include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_module_group.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -39,9 +40,36 @@ class HloPassFix : public Pass {
     int64 iteration_count = 0;
     int64 limit =
         std::max(static_cast<int64>(1000), module->instruction_count());
+    VLOG(3) << "Running HloPassFix.";
     while (changed_this_iteration) {
       TF_ASSIGN_OR_RETURN(changed_this_iteration, Pass::Run(module));
       changed |= changed_this_iteration;
+      VLOG(3) << "changed_this_iteration: " << changed_this_iteration;
+      ++iteration_count;
+      if (iteration_count == limit) {
+        LOG(ERROR)
+            << "Unexpectedly high number of iterations in HLO passes ("
+            << iteration_count
+            << ")\nIf compilation hangs here, please file a bug with XLA.";
+      }
+    }
+    return changed;
+  }
+
+  StatusOr<bool> RunOnModuleGroup(HloModuleGroup* module_group) override {
+    bool changed = false;
+    bool changed_this_iteration = true;
+    int64 iteration_count = 0;
+    int64 limit = 1000;
+    for (const HloModule* module : module_group->modules()) {
+      limit = std::max<int64>(limit, module->instruction_count());
+    }
+    VLOG(3) << "Running HloPassFix.";
+    while (changed_this_iteration) {
+      TF_ASSIGN_OR_RETURN(changed_this_iteration,
+                          Pass::RunOnModuleGroup(module_group));
+      changed |= changed_this_iteration;
+      VLOG(3) << "changed_this_iteration: " << changed_this_iteration;
       ++iteration_count;
       if (iteration_count == limit) {
         LOG(ERROR)
diff --git a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
index 51177f24f5ee702be96fc8b4530ed38a5798109f..ae8c08cf1d16ad6738962f3be7c1b5512110b1d1 100644
--- a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
+++ b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
@@ -77,6 +77,11 @@ std::vector<HloPassInterface*> HloPassPipeline::GetEnabledPasses(
   auto repeated_field = debug_options.xla_disable_hlo_passes();
   absl::flat_hash_set<string> disabled_pass_names(repeated_field.begin(),
                                                   repeated_field.end());
+  if (debug_options.xla_disable_all_hlo_passes()) {
+    VLOG(1) << "*All* passes disabled by --xla_disable_all_hlo_passes.";
+    return {};
+  }
+
   if (!disabled_pass_names.empty()) {
     VLOG(1) << "Passes disabled by --xla_disable_hlo_passes: "
             << absl::StrJoin(disabled_pass_names, ", ");
@@ -84,7 +89,7 @@ std::vector<HloPassInterface*> HloPassPipeline::GetEnabledPasses(
 
   std::vector<HloPassInterface*> enabled_passes;
   for (auto& pass : passes_) {
-    if (disabled_pass_names.count(string(pass->name())) == 0) {
+    if (!disabled_pass_names.contains(pass->name())) {
       enabled_passes.push_back(pass.get());
     }
   }
diff --git a/tensorflow/compiler/xla/service/hlo_profile_printer.cc b/tensorflow/compiler/xla/service/hlo_profile_printer.cc
index 5eb707a957e49d86cdb2f72b72ce750bf29b8fd2..9cc202aa9f5fe5a20a9da05251ea811137ccaadb 100644
--- a/tensorflow/compiler/xla/service/hlo_profile_printer.cc
+++ b/tensorflow/compiler/xla/service/hlo_profile_printer.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_profile_printer.h"
 
+#include "absl/algorithm/container.h"
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/service/human_readable_profile_builder.h"
 
@@ -34,11 +35,10 @@ string PrintHloProfile(const HloProfilePrinterData& hlo_profile_printer_data,
   for (const HloComputationInfo& computation_info :
        hlo_profile_printer_data.computation_infos()) {
     const auto& instruction_infos = computation_info.instruction_infos();
-    bool any_instruction_profiled =
-        std::any_of(instruction_infos.begin(), instruction_infos.end(),
-                    [&](const HloInstructionInfo& instruction_info) {
-                      return counters[instruction_info.profile_index()] != 0;
-                    });
+    bool any_instruction_profiled = absl::c_any_of(
+        instruction_infos, [&](const HloInstructionInfo& instruction_info) {
+          return counters[instruction_info.profile_index()] != 0;
+        });
 
     if (!any_instruction_profiled) {
       continue;
diff --git a/tensorflow/compiler/xla/service/hlo_proto_util.cc b/tensorflow/compiler/xla/service/hlo_proto_util.cc
index 981d06ce101644ecce587c4bd2f7a12c8edf6548..3a9ee57e5551ae5b608f02d9f8bd0428ff16db13 100644
--- a/tensorflow/compiler/xla/service/hlo_proto_util.cc
+++ b/tensorflow/compiler/xla/service/hlo_proto_util.cc
@@ -39,6 +39,7 @@ HloProto MakeHloProto(const HloModule& module) {
 
 StatusOr<std::unique_ptr<HloModule>> CreateModuleFromProto(
     const HloModuleProto& proto, const HloModuleConfig& module_config) {
+  VLOG(4) << proto.ShortDebugString();
   TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
                       HloModule::CreateFromProto(proto, module_config));
   TF_RETURN_IF_ERROR(
diff --git a/tensorflow/compiler/xla/service/hlo_reachability.cc b/tensorflow/compiler/xla/service/hlo_reachability.cc
index 4aa8067752481ffab29e1a573ffa49d4aa046f1f..b7f507b1184dbe021effc1102a68040286480ed2 100644
--- a/tensorflow/compiler/xla/service/hlo_reachability.cc
+++ b/tensorflow/compiler/xla/service/hlo_reachability.cc
@@ -49,7 +49,7 @@ void HloReachabilityMap::SetReachabilityToUnionHelper(
     absl::Span<const HloInstruction* const> inputs,
     const HloInstruction* instruction, BitVector* bit_vector) {
   // If instruction is part of inputs, don't reset the bit_vector.
-  if (std::find(inputs.begin(), inputs.end(), instruction) == inputs.end()) {
+  if (!absl::c_linear_search(inputs, instruction)) {
     bit_vector->SetToZero();
   }
   bit_vector->Set(GetIndex(instruction));
@@ -77,28 +77,51 @@ std::unique_ptr<HloReachabilityMap> HloReachabilityMap::Build(
     const HloComputation* computation) {
   const auto& all = computation->MakeInstructionPostOrder();
   auto result = absl::make_unique<HloReachabilityMap>(all);
-  auto channel_dependency_map = computation->ComputeChannelDependencies();
+  auto channel_group = computation->ComputeChannelDependencies();
 
-  std::vector<HloInstruction*> inputs;
   for (const HloInstruction* hlo : all) {
-    inputs.assign(hlo->operands().begin(), hlo->operands().end());
-    inputs.insert(inputs.end(), hlo->control_predecessors().begin(),
-                  hlo->control_predecessors().end());
+    std::vector<HloInstruction*> inputs;
+    const auto add_input = [&channel_group, &inputs](HloInstruction* input) {
+      inputs.push_back(input);
+      if (input->opcode() == HloOpcode::kAllReduce && input->all_reduce_id()) {
+        auto it = channel_group.find(*input->all_reduce_id());
+        if (it != channel_group.end()) {
+          inputs.insert(inputs.end(), it->second.begin(), it->second.end());
+        }
+      }
+    };
+
+    const auto add_dependencies = [&add_input](const HloInstruction* hlo) {
+      for (HloInstruction* operand : hlo->operands()) {
+        add_input(operand);
+      }
+      for (HloInstruction* predecessor : hlo->control_predecessors()) {
+        add_input(predecessor);
+      }
+    };
+
+    add_dependencies(hlo);
 
     switch (hlo->opcode()) {
       case HloOpcode::kRecvDone: {
-        auto it = channel_dependency_map.find(hlo->channel_id());
-        if (it != channel_dependency_map.end()) {
-          absl::c_copy(it->second, std::back_inserter(inputs));
+        auto it = channel_group.find(hlo->channel_id());
+        if (it != channel_group.end()) {
+          for (HloInstruction* channel : it->second) {
+            if (channel->opcode() == HloOpcode::kSend) {
+              add_input(channel);
+            }
+          }
         }
         break;
       }
-      case HloOpcode::kCrossReplicaSum: {
+      case HloOpcode::kAllReduce: {
         auto all_reduce_id = hlo->all_reduce_id();
         if (all_reduce_id) {
-          auto it = channel_dependency_map.find(all_reduce_id.value());
-          if (it != channel_dependency_map.end()) {
-            absl::c_copy(it->second, std::back_inserter(inputs));
+          auto it = channel_group.find(all_reduce_id.value());
+          if (it != channel_group.end()) {
+            for (HloInstruction* all_reduce : it->second) {
+              add_dependencies(all_reduce);
+            }
           }
         }
         break;
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
index 48add75523f02005c70bc6baf69a6b7d5aa4f7ef..a175e4643de2ac6ce07ac00da914d7ab7acca541 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization.cc
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
@@ -57,13 +57,22 @@ using ::tensorflow::strings::HumanReadableNumBytes;
 
 // Returns true if the given instruction is rematerializable.
 bool IsRematerializable(const HloInstruction* instruction) {
+  if (instruction->opcode() == HloOpcode::kCopy) {
+    if (LayoutUtil::Equal(instruction->shape().layout(),
+                          instruction->operand(0)->shape().layout())) {
+      // Don't rematerialize copies added by copy insertion (layout doesn't
+      // change).
+      return false;
+    }
+  }
+
   // Don't rematerialize instructions with side effects or instructions which
   // cannot be cloned safely.
   switch (instruction->opcode()) {
     case HloOpcode::kCall:
     case HloOpcode::kConstant:
     case HloOpcode::kConditional:
-    case HloOpcode::kCrossReplicaSum:
+    case HloOpcode::kAllReduce:
     case HloOpcode::kCustomCall:
     case HloOpcode::kParameter:
     case HloOpcode::kWhile:
@@ -179,7 +188,8 @@ class InstructionList {
   Item* CreateItem(HloInstruction* inst) {
     Item* item = new Item;
     item->instruction = inst;
-    CHECK(item_map_.insert({inst, item}).second) << "inserting inst twice";
+    CHECK(item_map_.insert({inst, item}).second)
+        << "inserting inst twice " << inst->name();
     return item;
   }
 
@@ -235,8 +245,7 @@ class InstructionList {
     }
 
     // Now scan forwards until we find one of the before_instructions.
-    while (std::find(before_instructions.begin(), before_instructions.end(),
-                     min_position_item) == before_instructions.end()) {
+    while (!absl::c_linear_search(before_instructions, min_position_item)) {
       min_position_item = min_position_item->next;
     }
     return InsertBefore(to_insert, min_position_item);
@@ -302,7 +311,7 @@ ItemList GetUsers(const InstructionList& instruction_list,
       // A buffer may be used by the instruction via more than one alias. For
       // example, a buffer which appears in more than one element of a tuple.
       Item* user_item = instruction_list.GetItem(user);
-      if (std::find(users.begin(), users.end(), user_item) == users.end()) {
+      if (!absl::c_linear_search(users, user_item)) {
         users.push_back(user_item);
       }
     }
@@ -418,11 +427,12 @@ class MemoryUsageTracker {
   // the given uses.
   Buffer& RematerializeBuffer(const Buffer& original_buffer, Item* remat_item,
                               ItemList&& rematerialized_uses) {
-    CHECK(original_buffer.defining_instruction->placed);
-    CHECK(!original_buffer.has_indirect_uses);
-    CHECK(!original_buffer.live_out);
+    CHECK(original_buffer.defining_instruction->placed)
+        << original_buffer.defining_instruction->instruction->name();
+    CHECK(!original_buffer.has_indirect_uses) << original_buffer.ToString();
+    CHECK(!original_buffer.live_out) << original_buffer.ToString();
     for (Item* use : rematerialized_uses) {
-      CHECK(!use->placed);
+      CHECK(!use->placed) << use->instruction->name();
     }
     return NewBuffer(remat_item, original_buffer.size,
                      std::move(rematerialized_uses), /*live_out=*/false,
@@ -456,8 +466,7 @@ class MemoryUsageTracker {
       return false;
     }
     const BufferIdList& in_progress_uses = in_progress_item_->buffers_used;
-    return std::find(in_progress_uses.begin(), in_progress_uses.end(),
-                     buffer_id) != in_progress_uses.end();
+    return absl::c_linear_search(in_progress_uses, buffer_id);
   }
 
   // Returns whether the given instruction is live at the current program
@@ -535,8 +544,7 @@ MemoryUsageTracker::MemoryUsageTracker(
         bool unused;
         for (Item* user_item : GetUsers(instruction_list_, logical_buffer,
                                         points_to_analysis, &unused)) {
-          if (std::find(buffer->users.begin(), buffer->users.end(),
-                        user_item) == buffer->users.end()) {
+          if (!absl::c_linear_search(buffer->users, user_item)) {
             buffer->users.push_back(user_item);
             buffer->unfinished_user_count++;
             user_item->buffers_used.push_back(buffer->id);
@@ -677,8 +685,8 @@ Status MemoryUsageTracker::AddRematerializedInstruction(Item* original_item,
           << ", remat_instruction = " << remat_item->instruction->name();
 
   TF_RET_CHECK(in_progress_item_ != nullptr);
-  TF_RET_CHECK(original_item->placed);
-  TF_RET_CHECK(!remat_item->placed);
+  TF_RET_CHECK(original_item->placed) << original_item->instruction->name();
+  TF_RET_CHECK(!remat_item->placed) << remat_item->instruction->name();
 
   // Construct the list of buffers used and defined by the rematerialization.
   remat_item->buffers_used = original_item->buffers_used;
@@ -707,7 +715,7 @@ Status MemoryUsageTracker::AddRematerializedInstruction(Item* original_item,
     ItemList unplaced_users;
     for (Item* user : old_buffer.users) {
       if (user->placed) {
-        CHECK(IsFinished(user));
+        CHECK(IsFinished(user)) << user->instruction->name();
         placed_users.push_back(user);
       } else {
         unplaced_users.push_back(user);
@@ -784,8 +792,7 @@ bool MemoryUsageTracker::Check() const {
 
     for (const Buffer& buffer : buffers_) {
       if (buffer.defining_instruction->instruction == instruction) {
-        CHECK(std::find(defined_buffers.begin(), defined_buffers.end(),
-                        buffer.id) != defined_buffers.end())
+        CHECK(absl::c_linear_search(defined_buffers, buffer.id))
             << "Instruction " << instruction->name()
             << " defined buffers is missing: " << buffer.ToString();
       }
@@ -808,8 +815,7 @@ bool MemoryUsageTracker::Check() const {
     int64 unfinished_uses = 0;
     for (Item* user : buffer.users) {
       const BufferIdList& used_buffers = user->buffers_used;
-      CHECK(std::find(used_buffers.begin(), used_buffers.end(), buffer.id) !=
-            used_buffers.end())
+      CHECK(absl::c_linear_search(used_buffers, buffer.id))
           << "Instruction " << user->instruction->name()
           << " used buffers is missing " << buffer.ToString();
       if (!IsFinished(user)) {
@@ -836,10 +842,10 @@ int64 RematerializationCost(const HloInstruction* instruction,
   // If none of the users of 'instruction' have been placed in the sequence (as
   // tracked by memory_tracker), then rematerialization of 'instruction' is a
   // zero-cost move of 'instruction' in the sequence.
-  if (!std::any_of(instruction->users().begin(), instruction->users().end(),
-                   [&memory_tracker](const HloInstruction* inst) {
-                     return memory_tracker.IsPlaced(inst);
-                   })) {
+  if (!absl::c_any_of(instruction->users(),
+                      [&memory_tracker](const HloInstruction* inst) {
+                        return memory_tracker.IsPlaced(inst);
+                      })) {
     return 0;
   }
 
@@ -1094,7 +1100,7 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
         Item* successor_item = instruction_list.GetItem(successor);
         // Assert to make sure we never remat an operation with control
         // successor already placed.
-        CHECK(!successor_item->placed);
+        CHECK(!successor_item->placed) << successor_item->instruction->name();
         place_before.push_back(successor_item);
       }
       instruction_list.InsertBeforeInstructions(remat_item, place_before);
@@ -1164,7 +1170,7 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
   // Verify some invariants on the memory tracker.
   CHECK_EQ(memory_tracker.memory_usage(), 0);
   for (auto* instruction : computation->instructions()) {
-    CHECK(memory_tracker.IsPlaced(instruction));
+    CHECK(memory_tracker.IsPlaced(instruction)) << instruction->name();
   }
 
   VLOG(1) << "In computation " << computation->name() << " rematerialized "
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
index 22c3c40a93a1ddcd36659483fcc79fede32dd2c3..102a360ad8116d8781baf9cb7627a920f4a687c4 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
@@ -499,6 +499,52 @@ TEST_F(HloRematerializationTest, InstructionRematerializedMultipleTimes) {
   EXPECT_THAT(add_4->operand(0), op::Broadcast(param));
 }
 
+TEST_F(HloRematerializationTest, CopyNotRematerialized) {
+  // Test that copies are not rematerialized.
+  auto module = CreateNewVerifiedModule();
+
+  auto builder = HloComputation::Builder(TestName());
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, vec1024_shape_, "param"));
+
+  auto copy = builder.AddInstruction(
+      HloInstruction::CreateUnary(vec1024_shape_, HloOpcode::kCopy, param));
+
+  auto negate_a_1 = builder.AddInstruction(
+      HloInstruction::CreateUnary(vec1024_shape_, HloOpcode::kNegate, copy));
+
+  auto negate_a_2 = builder.AddInstruction(HloInstruction::CreateUnary(
+      vec1024_shape_, HloOpcode::kNegate, negate_a_1));
+
+  auto negate_b_1 = builder.AddInstruction(
+      HloInstruction::CreateUnary(vec1024_shape_, HloOpcode::kNegate, copy));
+
+  auto negate_b_2 = builder.AddInstruction(HloInstruction::CreateUnary(
+      vec1024_shape_, HloOpcode::kNegate, negate_b_1));
+
+  builder.AddInstruction(HloInstruction::CreateTuple({negate_a_2, negate_b_2}));
+
+  HloComputation* entry_computation =
+      module->AddEntryComputation(builder.Build());
+
+  TF_ASSERT_OK_AND_ASSIGN(bool changed,
+                          RunHloRematerialization(
+                              /*memory_limit_bytes=*/1 * 1024, module.get()));
+
+  auto count_copies = [](const HloComputation* computation) {
+    int64 copy_count = 0;
+    for (auto* instruction : computation->instructions()) {
+      if (instruction->opcode() == HloOpcode::kCopy) {
+        copy_count++;
+      }
+    }
+    return copy_count;
+  };
+  EXPECT_TRUE(changed);
+
+  EXPECT_EQ(count_copies(entry_computation), 1);
+}
+
 class IndirectUseTest : public HloRematerializationTest,
                         public ::testing::WithParamInterface<bool> {};
 
@@ -588,8 +634,8 @@ TEST_P(IndirectUseTest, IndirectUseNotRematerialized) {
   }
 }
 
-INSTANTIATE_TEST_CASE_P(IndirectUseTestInstantiation, IndirectUseTest,
-                        ::testing::Values(true, false));
+INSTANTIATE_TEST_SUITE_P(IndirectUseTestInstantiation, IndirectUseTest,
+                         ::testing::Values(true, false));
 
 }  // namespace
 
diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc
index 5a9b820a9d7f58695383b21c9e2126cf98970c83..5a5401e351384867016a3a9addfd43d57091848c 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.cc
+++ b/tensorflow/compiler/xla/service/hlo_runner.cc
@@ -168,6 +168,35 @@ StatusOr<Literal> HloRunner::Execute(std::unique_ptr<HloModule> module,
       /*profile=*/profile);
 }
 
+StatusOr<Literal> HloRunner::Execute(
+    std::unique_ptr<Executable> executable,
+    const absl::Span<const Literal* const> arguments,
+    ExecutionProfile* profile) {
+  TF_ASSIGN_OR_RETURN(std::vector<ScopedShapedBuffer> argument_buffers,
+                      TransferLiteralsToDevice(arguments));
+  TF_ASSIGN_OR_RETURN(ScopedShapedBuffer result,
+                      ExecuteWithDeviceBuffers(
+                          /*executable=*/executable.get(),
+                          /*arguments=*/argument_buffers,
+                          /*profile=*/profile));
+  return TransferLiteralFromDevice(result);
+}
+
+StatusOr<Literal> HloRunner::Execute(std::unique_ptr<Executable> executable,
+                                     const absl::Span<const Literal> arguments,
+                                     ExecutionProfile* profile) {
+  // Construct a vector of plain pointers for the arguments.
+  std::vector<const Literal*> argument_pointers;
+  argument_pointers.reserve(arguments.size());
+  for (const auto& argument : arguments) {
+    argument_pointers.push_back(&argument);
+  }
+  return Execute(
+      /*module=*/std::move(executable),
+      /*arguments=*/argument_pointers,
+      /*profile=*/profile);
+}
+
 StatusOr<ScopedShapedBuffer> HloRunner::ExecuteWithDeviceBuffers(
     std::unique_ptr<HloModule> module,
     const absl::Span<const ShapedBuffer* const> arguments, bool run_hlo_passes,
@@ -206,7 +235,7 @@ StatusOr<ScopedShapedBuffer> HloRunner::ExecuteWithDeviceBuffers(
 }
 
 StatusOr<ScopedShapedBuffer> HloRunner::ExecuteWithDeviceBuffers(
-    std::unique_ptr<Executable> executable,
+    Executable* executable,
     const absl::Span<const ShapedBuffer* const> arguments,
     ExecutionProfile* profile) {
   // Get service run options.
@@ -225,7 +254,7 @@ StatusOr<ScopedShapedBuffer> HloRunner::ExecuteWithDeviceBuffers(
 }
 
 StatusOr<ScopedShapedBuffer> HloRunner::ExecuteWithDeviceBuffers(
-    std::unique_ptr<Executable> executable,
+    Executable* executable,
     const absl::Span<const ScopedShapedBuffer> arguments,
     ExecutionProfile* profile) {
   std::vector<const ShapedBuffer*> argument_pointers;
@@ -383,9 +412,7 @@ ServiceExecutableRunOptions HloRunner::GetServiceRunOptionsForDevice(
   if (device_assignment != nullptr) {
     run_options.set_device_assignment(device_assignment);
   }
-  return ServiceExecutableRunOptions(
-      run_options, backend().StreamBorrower(),
-      /*xla_intra_op_thread_pool=*/backend().eigen_intra_op_thread_pool());
+  return ServiceExecutableRunOptions(run_options, backend().StreamBorrower());
 }
 
 Backend& HloRunner::backend() {
diff --git a/tensorflow/compiler/xla/service/hlo_runner.h b/tensorflow/compiler/xla/service/hlo_runner.h
index bb792cf8c9825ff67ca33bbcf2c3c32b1a0ecb85..098989cd4c78fb5ad57cd6700fbf99c50064f225 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.h
+++ b/tensorflow/compiler/xla/service/hlo_runner.h
@@ -60,7 +60,7 @@ class HloRunner {
     // The number of times the infeed literal should be fed to the HLO module.
     // For a clean exit, this should match the iterations-per-loop parameter
     // used when generating the HLO module proto (that is usually the main
-    // while bounary counter). A value higher then iterations-per-loop would
+    // while boundary counter). A value higher then iterations-per-loop would
     // lead to infeed threads feeding to a gone computation, while a lower
     // value would trigger a stuck ExecuteReplicated() call (the computation
     // will be trying to infeed data which will never come).
@@ -124,6 +124,14 @@ class HloRunner {
                             bool run_hlo_passes = true,
                             ExecutionProfile* profile = nullptr);
 
+  StatusOr<Literal> Execute(std::unique_ptr<Executable> executable,
+                            const absl::Span<const Literal* const> arguments,
+                            ExecutionProfile* profile = nullptr);
+
+  StatusOr<Literal> Execute(std::unique_ptr<Executable> executable,
+                            const absl::Span<const Literal> arguments,
+                            ExecutionProfile* profile = nullptr);
+
   // As Execute(), but accepts and returns device buffers instead of host
   // buffers.
   StatusOr<ScopedShapedBuffer> ExecuteWithDeviceBuffers(
@@ -136,13 +144,16 @@ class HloRunner {
       const absl::Span<const ScopedShapedBuffer> arguments,
       bool run_hlo_passes = true, ExecutionProfile* profile = nullptr);
 
+  // In the following two calls, "executable" is not a unique_ptr to allow
+  // reuse of the Executable.  This call may update the profile information in
+  // *executable.
   StatusOr<ScopedShapedBuffer> ExecuteWithDeviceBuffers(
-      std::unique_ptr<Executable> executable,
+      Executable* executable,
       const absl::Span<const ShapedBuffer* const> arguments,
       ExecutionProfile* profile = nullptr);
 
   StatusOr<ScopedShapedBuffer> ExecuteWithDeviceBuffers(
-      std::unique_ptr<Executable> executable,
+      Executable* executable,
       const absl::Span<const ScopedShapedBuffer> arguments,
       ExecutionProfile* profile = nullptr);
 
diff --git a/tensorflow/compiler/xla/service/hlo_schedule.cc b/tensorflow/compiler/xla/service/hlo_schedule.cc
index 8f6eb974c5179b420c8f961393ca923e0a3b3530..e75373501cffac6a736be89e9f6139b6ff2cdbc1 100644
--- a/tensorflow/compiler/xla/service/hlo_schedule.cc
+++ b/tensorflow/compiler/xla/service/hlo_schedule.cc
@@ -140,7 +140,7 @@ Status HloSchedule::UpdateComputationSchedule(
   std::queue<HloInstruction*> worklist;
 
   for (HloInstruction* instruction : computation->instructions()) {
-    if (ids_in_schedule.count(instruction->unique_id()) == 0) {
+    if (!ids_in_schedule.contains(instruction->unique_id())) {
       // This is a newly added instruction which is not in the schedule.
       if (instruction->operands().empty()) {
         worklist.push(instruction);
@@ -204,7 +204,7 @@ Status HloSchedule::Update() {
   std::vector<HloComputation*> nonfusion_computations =
       module_->MakeNonfusionComputations();
   for (const HloComputation* computation : nonfusion_computations) {
-    TF_RET_CHECK(sequences_.count(computation->unique_id()) == 1)
+    TF_RET_CHECK(sequences_.contains(computation->unique_id()))
         << "Computation " << computation->name() << " not in HloSchedule.";
   }
   if (sequences_.size() > nonfusion_computations.size()) {
@@ -215,7 +215,7 @@ Status HloSchedule::Update() {
       nonfusion_computations_ids.insert(computation->unique_id());
     }
     for (auto it = sequences_.begin(); it != sequences_.end();) {
-      if (nonfusion_computations_ids.count(it->first) == 0) {
+      if (!nonfusion_computations_ids.contains(it->first)) {
         sequences_.erase(it++);
       } else {
         ++it;
@@ -244,7 +244,7 @@ Status HloSchedule::Verify() const {
       << "Schedule has " << sequences_.size() << " sequences, but module has "
       << nonfusion_computations.size() << " non-fusion computations";
   for (const HloComputation* computation : nonfusion_computations) {
-    TF_RET_CHECK(sequences_.count(computation->unique_id()) == 1)
+    TF_RET_CHECK(sequences_.contains(computation->unique_id()))
         << "Computation " << computation->name()
         << " missing from HLO schedule.";
   }
@@ -268,7 +268,7 @@ Status HloSchedule::Verify() const {
         << instruction_position.size() << " instructions, expected "
         << computation->instruction_count();
     for (const HloInstruction* instruction : computation->instructions()) {
-      TF_RET_CHECK(instruction_position.count(instruction) == 1)
+      TF_RET_CHECK(instruction_position.contains(instruction))
           << "Instruction " << instruction->name() << " is not in schedule";
     }
 
diff --git a/tensorflow/compiler/xla/service/hlo_schedule.h b/tensorflow/compiler/xla/service/hlo_schedule.h
index 486ddbf499de80c634bc497158cd79ca066cc866..a5f54ae2c33259d080631061dff9ae40b41495dc 100644
--- a/tensorflow/compiler/xla/service/hlo_schedule.h
+++ b/tensorflow/compiler/xla/service/hlo_schedule.h
@@ -110,7 +110,7 @@ class HloSchedule {
 
   // Returns true if the schedule has a sequence for the given computation.
   bool is_computation_scheduled(const HloComputation* computation) const {
-    return sequences_.count(computation->unique_id()) == 1;
+    return sequences_.contains(computation->unique_id());
   }
 
   // Updates the schedule such that it is (again) a valid schedule for the
diff --git a/tensorflow/compiler/xla/service/hlo_sharding.cc b/tensorflow/compiler/xla/service/hlo_sharding.cc
index 70a860c356ca2fb1c4c973ea3d96c50fabc2c7c2..f1d7e60f2b5a68408f6d428a0ec47fba3c9c4f12 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_sharding.h"
 
+#include "absl/container/flat_hash_set.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
 #include "tensorflow/compiler/xla/overflow_util.h"
@@ -30,7 +31,7 @@ HloSharding HloSharding::AssignDevice(int64 device_id) {
 }
 
 HloSharding HloSharding::Tile1D(const Shape& input_shape, int64 num_tiles) {
-  CHECK_EQ(1, ShapeUtil::Rank(input_shape));
+  CHECK_EQ(1, input_shape.rank());
   CHECK_GT(num_tiles, 1);
   std::vector<int64> dimensions(1, num_tiles);
   Array<int64> assignment(dimensions);
@@ -57,7 +58,7 @@ HloSharding HloSharding::Tuple(const ShapeTree<HloSharding>& sub_shardings) {
 
 HloSharding HloSharding::Tuple(const Shape& tuple_shape,
                                absl::Span<const HloSharding> shardings) {
-  CHECK(ShapeUtil::IsTuple(tuple_shape)) << ShapeUtil::HumanString(tuple_shape);
+  CHECK(tuple_shape.IsTuple()) << ShapeUtil::HumanString(tuple_shape);
   for (auto& sharding : shardings) {
     CHECK(!sharding.IsTuple()) << sharding.ToString();
   }
@@ -70,7 +71,7 @@ HloSharding HloSharding::Tuple(const Shape& tuple_shape,
 
 HloSharding HloSharding::SingleTuple(const Shape& tuple_shape,
                                      const HloSharding& sharding) {
-  CHECK(ShapeUtil::IsTuple(tuple_shape)) << ShapeUtil::HumanString(tuple_shape);
+  CHECK(tuple_shape.IsTuple()) << ShapeUtil::HumanString(tuple_shape);
   CHECK(!sharding.IsTuple()) << sharding.ToString();
   int64 leaf_count = RequiredLeaves(tuple_shape);
   std::vector<HloSharding> flattened_list;
@@ -80,7 +81,7 @@ HloSharding HloSharding::SingleTuple(const Shape& tuple_shape,
 
 HloSharding HloSharding::Single(const Shape& shape,
                                 const HloSharding& sharding) {
-  return ShapeUtil::IsTuple(shape) ? SingleTuple(shape, sharding) : sharding;
+  return shape.IsTuple() ? SingleTuple(shape, sharding) : sharding;
 }
 
 string HloSharding::ToString() const {
@@ -95,24 +96,23 @@ string HloSharding::ToString() const {
 
   if (replicated_) {
     return "{replicated}";
-  } else if (maximal_) {
+  }
+  if (maximal_) {
     return StrCat(
         "{maximal device=", static_cast<int64>(*tile_assignment_.begin()), "}");
-  } else {
-    return StrCat("{devices=[", StrJoin(tile_assignment_.dimensions(), ","),
-                  "]", StrJoin(tile_assignment_, ","), "}");
   }
+  return StrCat("{devices=[", StrJoin(tile_assignment_.dimensions(), ","), "]",
+                StrJoin(tile_assignment_, ","), "}");
 }
 
 bool HloSharding::UsesDevice(int64 device) const {
   if (IsTuple()) {
-    return std::any_of(
-        tuple_elements_.begin(), tuple_elements_.end(),
-        [&](const HloSharding& s) { return s.UsesDevice(device); });
+    return absl::c_any_of(tuple_elements_, [&](const HloSharding& s) {
+      return s.UsesDevice(device);
+    });
   }
   const auto& devices = tile_assignment_;
-  return replicated_ ||
-         std::find(devices.begin(), devices.end(), device) != devices.end();
+  return replicated_ || absl::c_linear_search(devices, device);
 }
 
 std::map<int64, int64> HloSharding::UsedDevices(int64* count) const {
@@ -269,7 +269,7 @@ int64 HloSharding::GetUniqueDevice() const {
 }
 
 Status HloSharding::ValidateTuple(const Shape& shape, int64 num_devices) const {
-  if (!ShapeUtil::IsTuple(shape)) {
+  if (!shape.IsTuple()) {
     return tensorflow::errors::InvalidArgument(
         StrCat("Sharding is tuple-shaped but validation shape is not."));
   }
@@ -305,7 +305,7 @@ Status HloSharding::Validate(const Shape& shape, int64 num_devices) const {
 
 Status HloSharding::ValidateNonTuple(const Shape& shape,
                                      int64 num_devices) const {
-  if (ShapeUtil::IsTuple(shape)) {
+  if (shape.IsTuple()) {
     return tensorflow::errors::InvalidArgument(
         StrCat("Validation shape is a tuple but sharding is not."));
   }
@@ -316,7 +316,7 @@ Status HloSharding::ValidateNonTuple(const Shape& shape,
   // All tile assignments must be less than the number of available cores and
   // unique.
   Status status = Status::OK();
-  std::set<int64> seen_cores;
+  absl::flat_hash_set<int64> seen_cores;
   tile_assignment_.Each(
       [&](absl::Span<const int64> indices, int32 core) {
         // Don't overwrite a bad status, so we report the first error.
@@ -324,12 +324,12 @@ Status HloSharding::ValidateNonTuple(const Shape& shape,
           if (core >= num_devices) {
             status = tensorflow::errors::InvalidArgument(StrCat(
                 "core ", core, " > ", num_devices, " in tile assignment"));
-          } else if (seen_cores.count(core) != 0) {
+          } else if (seen_cores.contains(core)) {
             status = tensorflow::errors::InvalidArgument(
                 StrCat("core ", core, " is not unique in tile assignment"));
           }
+          seen_cores.insert(core);
         }
-        seen_cores.insert(core);
       });
   if (!status.ok()) {
     return status;
@@ -340,14 +340,14 @@ Status HloSharding::ValidateNonTuple(const Shape& shape,
   }
 
   // The tile assignment tensor must have the same rank as the input.
-  if (ShapeUtil::Rank(shape) != tile_assignment_.num_dimensions()) {
+  if (shape.rank() != tile_assignment_.num_dimensions()) {
     return tensorflow::errors::InvalidArgument(
         "Number of tile assignment dimensions is different to the input rank. "
         "sharding=",
         ToString(), ", input_shape=", ShapeUtil::HumanString(shape));
   }
 
-  // The correct constructor have to be used to create tile maximal shardings.
+  // The correct constructor has to be used to create tile maximal shardings.
   if (tile_assignment_.num_elements() == 1) {
     return tensorflow::errors::InvalidArgument(
         "Tile assignment only contains a single device. If a replicated "
@@ -437,8 +437,8 @@ Shape HloSharding::TileShape(const Shape& shape) const {
   }
   Shape result_shape = shape;
   for (int64 i = 0; i < shape.dimensions_size(); ++i) {
-    (*result_shape.mutable_dimensions())[i] =
-        CeilOfRatio<int64>(shape.dimensions(i), tile_assignment_.dim(i));
+    result_shape.set_dimensions(
+        i, CeilOfRatio<int64>(shape.dimensions(i), tile_assignment_.dim(i)));
   }
   return result_shape;
 }
@@ -455,7 +455,7 @@ HloSharding HloSharding::GetSubSharding(const Shape& shape,
     }
     sub_shape = &ShapeUtil::GetSubshape(*sub_shape, {idx});
   }
-  if (ShapeUtil::IsTuple(*sub_shape)) {
+  if (sub_shape->IsTuple()) {
     auto begin_it = tuple_elements_.begin() + sharding_index;
     std::vector<HloSharding> sub_shardings(
         begin_it, begin_it + ShapeUtil::GetLeafCount(*sub_shape));
diff --git a/tensorflow/compiler/xla/service/hlo_sharding.h b/tensorflow/compiler/xla/service/hlo_sharding.h
index 9775505f8608ced3e33abe376f4922cc6a972726..dd57ea83f1cb33aa052facb607bc040d2e708633 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.h
+++ b/tensorflow/compiler/xla/service/hlo_sharding.h
@@ -101,8 +101,8 @@ class HloSharding {
     if (!IsTuple()) {
       return replicated_;
     }
-    return std::all_of(tuple_elements_.begin(), tuple_elements_.end(),
-                       [](const HloSharding& s) { return s.IsReplicated(); });
+    return absl::c_all_of(
+        tuple_elements_, [](const HloSharding& s) { return s.IsReplicated(); });
   }
 
   // Returns true if the tile size is the same as the input size.
@@ -110,14 +110,15 @@ class HloSharding {
     if (!IsTuple()) {
       return maximal_;
     }
-    return std::all_of(tuple_elements_.begin(), tuple_elements_.end(),
-                       [](const HloSharding& s) { return s.IsTileMaximal(); });
+    return absl::c_all_of(tuple_elements_, [](const HloSharding& s) {
+      return s.IsTileMaximal();
+    });
   }
 
   // Returns true if the sharding defines an operation on the given device.
   bool UsesDevice(int64 device) const;
 
-  // Retrieves an histogram of the devices used by the sharding. The returned
+  // Retrieves a histogram of the devices used by the sharding. The returned
   // map has the device number as key, and the occurrence count as value.
   // If a sharding does not have a device, it will not be incuded in the
   // histogram. The count argument, if not nullptr, will receive the total
@@ -259,6 +260,19 @@ class HloSharding {
   bool replicated_;
   bool maximal_;
   bool tuple_;
+  // This field is only used if replicated_ is false. If maximal_ is true, then
+  // the field contains a rank 1 array with a single element, which is the
+  // device the HLO is assigned to. If maximal_ is false, the field contains an
+  // array with the same rank as the corresponding HLO. The dimension sizes of
+  // the array describe the number of ways the HLO is partitioned along each
+  // dimension. The values of the array specify which device each tile of
+  // the HLO is assigned to. The index of each value determines which tile it
+  // takes.
+  // For example, {{{2, 3}}, {{5, 7}}} (whose ToString representation is
+  // "{devices=[2,1,2]2,3,5,7}"), means that dimension 1 is split two way and
+  // dimension 3 is split 2 way. Core 5, whose index is [2,1,1] will take the
+  // tile that contains the 2nd half of dimension 1 and the 1st half of
+  // dimension 3.
   Array<int64> tile_assignment_;
   // Only non-empty when tuple_ is true. If a tuple is empty then one entry is
   // present for the root. This is a flattened list of all the leaf shardings in
diff --git a/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc b/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc
index f5061304456e04ab40448861343ef201c9450dcf..094d98bc6e54028557f6d38cd165bf34e1fb8c46 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc
@@ -99,7 +99,7 @@ std::vector<PassThrough> LocatePassThroughDomainLinks(
         << "Instruction is not a kDomain: " << instruction->ToString();
     for (HloInstruction* user : instruction->users()) {
       if (user->opcode() == HloOpcode::kDomain &&
-          domain.exit_domains.count(user) != 0) {
+          domain.exit_domains.contains(user)) {
         pass_through.emplace_back(user, instruction);
         VLOG(2) << "Found passthrough domain link:";
         VLOG(2) << "  " << user->ToString();
@@ -234,7 +234,7 @@ StatusOr<bool> ApplyShardingFromUsers(HloInstruction* instruction,
   if (instruction->users().empty()) {
     // No sharding from users, use domain_sharding, after checking
     // compatibility.
-    TF_RET_CHECK(ShapeUtil::IsTuple(instruction->shape()) &&
+    TF_RET_CHECK(instruction->shape().IsTuple() &&
                  ShapeUtil::GetLeafCount(instruction->shape()) ==
                      domain_sharding.tuple_elements().size());
     instruction->set_sharding(domain_sharding);
@@ -253,7 +253,7 @@ StatusOr<bool> ApplyShardingFromUsers(HloInstruction* instruction,
       instruction->shape(), HloSharding::AssignDevice(kUnassignedDevice));
   for (HloInstruction* user : instruction->users()) {
     if (user->opcode() == HloOpcode::kDomain &&
-        domain.exit_domains.count(user) > 0) {
+        domain.exit_domains.contains(user)) {
       // If a user is a domain and it is registered in the domain exits, then
       // the instruction sharding is taken directly from the domain, and no
       // further users need to be visited.
@@ -266,7 +266,7 @@ StatusOr<bool> ApplyShardingFromUsers(HloInstruction* instruction,
     AssignmentKind sub_assigned = AssignmentKind::kUnassigned;
     TF_ASSIGN_OR_RETURN(ShapeTree<HloSharding> user_sharding_tree,
                         GetShardingTreeFromUser(*instruction, *user));
-    if (ShapeUtil::IsTuple(instruction->shape())) {
+    if (instruction->shape().IsTuple()) {
       // For tuple-shaped instructions collect individual tuple subshardings
       // from the uses, and then combine them into the tuple sharding.
       // If the user is a GTE its sharding concerns only the subtree of
@@ -298,7 +298,7 @@ StatusOr<bool> ApplyShardingFromUsers(HloInstruction* instruction,
   }
 
   if (assigned == AssignmentKind::kAssigned) {
-    if (ShapeUtil::IsTuple(instruction->shape())) {
+    if (instruction->shape().IsTuple()) {
       instruction->set_sharding(HloSharding::Tuple(sharding_tree));
     } else {
       TF_RET_CHECK(sharding_tree.leaf_count() == 1);
@@ -361,7 +361,7 @@ Status ApplyDomainSharding(const DomainMetadata::Domain& domain,
       // kUnassignedDevice. Indeed in case of doubt it is better to leave the
       // entire tuple unassigned, and let the device placer decide for it.
       if (instruction->sharding().UsesDevice(kUnassignedDevice)) {
-        TF_RET_CHECK(ShapeUtil::IsTuple(instruction->shape()))
+        TF_RET_CHECK(instruction->shape().IsTuple())
             << "Only tuples can have kUnassignedDevice sub shardings";
         instruction->clear_sharding();
       }
diff --git a/tensorflow/compiler/xla/service/hlo_sharding_test.cc b/tensorflow/compiler/xla/service/hlo_sharding_test.cc
index 80634677e78e4a35dcb9bf7de018a88122c3c030..9e234e025586ff14f99da73afc5610c627303a36 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding_test.cc
@@ -84,7 +84,7 @@ TEST_F(HloShardingTest, Tile) {
   }
 
   {
-    // Test should fail because of more devices used then `num_device`.
+    // Test should fail because of more devices used than `num_device`.
     HloSharding sharding = HloSharding::Tile(MakeArray({2, 2}, {0, 1, 2, 3}));
     EXPECT_IS_NOT_OK(sharding.Validate(ShapeUtil::MakeShape(U32, {4, 6}),
                                        /*num_devices=*/2));
diff --git a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc
deleted file mode 100644
index 487653344976a10e18ba667085525ba1ecbb8612..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.cc
+++ /dev/null
@@ -1,243 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-LIcensed under the Apache License, Version 2.0 (the "License");
-You may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/hlo_tfgraph_builder.h"
-#include "absl/strings/str_cat.h"
-#include "absl/strings/str_join.h"
-#include "tensorflow/compiler/xla/layout_util.h"
-#include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/service/hlo_opcode.h"
-#include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/core/framework/attr_value.pb.h"
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/tensor_shape.pb.h"
-
-namespace xla {
-namespace hlo_graph_dumper {
-namespace {
-
-using absl::StrAppend;
-using absl::StrCat;
-using tensorflow::GraphDef;
-using tensorflow::NodeDef;
-using tensorflow::TensorShapeProto;
-
-string GetOpDefName(const HloInstruction* instruction) {
-  string name = StrCat("hlo-", HloOpcodeString(instruction->opcode()));
-  tensorflow::str_util::TitlecaseString(&name, "-");  // non-absl ok
-  name.erase(std::remove(name.begin(), name.end(), '-'), name.end());
-
-  if (instruction->opcode() == HloOpcode::kFusion) {
-    string fusion_name = ToString(instruction->fusion_kind());
-    StrAppend(&name, absl::string_view(fusion_name).substr(1));
-  }
-  return name;
-}
-
-TensorShapeProto GetTensorShape(const HloInstruction* instruction) {
-  TensorShapeProto tensor_shape;
-  const Shape& shape = instruction->shape();
-  for (auto dim : shape.dimensions()) {
-    tensor_shape.add_dim()->set_size(dim);
-  }
-  return tensor_shape;
-}
-
-string GetDeviceName(int device) { return StrCat("/device/XLA:", device); }
-
-void CleanNodeName(string* name) {
-  name->erase(std::remove(name->begin(), name->end(), '%'), name->end());
-  const string chars_to_replace = "<>[]";
-  auto pred = [&](char c) {
-    return std::find(chars_to_replace.begin(), chars_to_replace.end(), c) !=
-           chars_to_replace.end();
-  };
-  std::replace_if(name->begin(), name->end(), pred, '_');
-}
-
-}  // namespace
-
-HloTfGraphBuilder::HloTfGraphBuilder(const DebugOptions& debug_options)
-    : debug_options_(debug_options) {}
-
-Status HloTfGraphBuilder::AddComputation(const HloComputation& computation) {
-  VLOG(2) << "Adding computation " << computation.name();
-  for (auto embedded : computation.MakeEmbeddedComputationsList()) {
-    for (auto* instruction : embedded->instructions()) {
-      TF_RETURN_IF_ERROR(AddInstruction(instruction));
-    }
-  }
-  for (auto* instruction : computation.instructions()) {
-    TF_RETURN_IF_ERROR(AddInstruction(instruction));
-  }
-  return Status::OK();
-}
-
-const GraphDef& HloTfGraphBuilder::GetGraphDef() const { return graph_def_; }
-
-const string& HloTfGraphBuilder::GetNodeNameForInstruction(
-    const HloInstruction* instruction) {
-  if (ContainsKey(instruction_to_node_name_, instruction)) {
-    return instruction_to_node_name_[instruction];
-  }
-  auto append = [](string* str, const string& other) {
-    if (str->empty()) {
-      *str = other;
-    } else if (!other.empty()) {
-      StrAppend(str, "/", other);
-    }
-  };
-  string node_name;
-  if (debug_options_.xla_hlo_tfgraph_device_scopes()) {
-    auto device = instruction->sharding_unique_device();
-    if (device) {
-      node_name = StrCat("dev", *device);
-    }
-  }
-  // If an instruction is fused, put it in the subgraph of the fusion;
-  // otherwise, put it in the computation subgraph.
-  const HloComputation* computation = instruction->parent();
-  if (computation->IsFusionComputation()) {
-    append(&node_name,
-           GetNodeNameForInstruction(computation->FusionInstruction()));
-  } else {
-    append(&node_name, computation->name());
-    if (!instruction->metadata().op_name().empty()) {
-      // Always make computations contain TF ops but not the other way around.
-      append(&node_name, instruction->metadata().op_name());
-    }
-  }
-  string instruction_name = instruction->name();
-  if (instruction->opcode() == HloOpcode::kParameter) {
-    StrAppend(&instruction_name, ".", instruction->parameter_number());
-  }
-  append(&node_name, instruction_name);
-  CleanNodeName(&node_name);
-  auto ret =
-      instruction_to_node_name_.insert(std::make_pair(instruction, node_name));
-  CHECK(ret.second);
-  return ret.first->second;
-}
-
-void HloTfGraphBuilder::SetNodeAttrs(const HloInstruction* instruction,
-                                     NodeDef* node_def) const {
-  auto& attrs = *node_def->mutable_attr();
-
-  // Set the number of arguments for instructions that have variadic operands.
-  if (HloOpcodeIsVariadic(instruction->opcode())) {
-    tensorflow::AttrValue attr_value;
-    attr_value.set_i(instruction->operands().size());
-    attrs["arg_num"] = attr_value;
-  }
-
-  // Set the node type.
-  attrs["type"].set_s(
-      xla::PrimitiveType_Name(instruction->shape().element_type()));
-
-  // Set the framework op (e.g. Tensorflow op) that generated this XLA op.
-  attrs["tf_op_type"].set_s(instruction->metadata().op_type());
-  attrs["tf_op_name"].set_s(instruction->metadata().op_name());
-
-  // Set the shape of the output tensor. "_output_shapes" is a special attribute
-  // name used by Tensorboard for shapes of output tensors.
-  tensorflow::AttrValue shapes;
-  *shapes.mutable_list()->add_shape() = GetTensorShape(instruction);
-  attrs["_output_shapes"] = shapes;
-
-  // Set the layout.
-  if (LayoutUtil::HasLayout(instruction->shape())) {
-    string layout_string;
-    if (ShapeUtil::IsTuple(instruction->shape())) {
-      // For tuples, emit the full shape because the layout of a tuple is not
-      // represented in a single Layout field.
-      layout_string = ShapeUtil::HumanStringWithLayout(instruction->shape());
-    } else {
-      layout_string = StrCat(
-          "{",
-          absl::StrJoin(LayoutUtil::MinorToMajor(instruction->shape()), ","),
-          "}");
-    }
-    attrs["layout"].set_s(layout_string);
-  }
-
-  // Set op-specific attributes.
-  switch (instruction->opcode()) {
-    case HloOpcode::kConcatenate:
-    case HloOpcode::kBroadcast:
-    case HloOpcode::kReduce:
-    case HloOpcode::kReverse:
-    case HloOpcode::kTranspose:
-      for (auto dim : instruction->dimensions()) {
-        attrs["dims"].mutable_list()->add_i(dim);
-      }
-      break;
-    case HloOpcode::kGetTupleElement:
-      attrs["index"].set_i(instruction->tuple_index());
-      break;
-    case HloOpcode::kRng:
-      attrs["dist"].set_s(
-          RandomDistribution_Name(instruction->random_distribution()));
-      break;
-    case HloOpcode::kConstant:
-      if (ShapeUtil::IsScalar(instruction->shape())) {
-        attrs["value"].set_s(instruction->literal().GetAsString({}));
-      }
-      break;
-    case HloOpcode::kCustomCall:
-      attrs["custom_call_target"].set_s(instruction->custom_call_target());
-      break;
-    case HloOpcode::kSend:
-    case HloOpcode::kRecv:
-      attrs["channel_id"].set_i(instruction->channel_id());
-      break;
-    default:
-      break;
-  }
-}
-
-Status HloTfGraphBuilder::AddInstruction(const HloInstruction* instruction) {
-  if (!visited_instructions_.insert(instruction).second) {
-    // Skip instructions that have already been added.
-    return Status::OK();
-  }
-
-  NodeDef* node_def = graph_def_.add_node();
-  node_def->set_name(GetNodeNameForInstruction(instruction));
-  node_def->set_op(GetOpDefName(instruction));
-
-  auto device = instruction->sharding_unique_device();
-  if (device) {
-    node_def->set_device(GetDeviceName(*device));
-  }
-  SetNodeAttrs(instruction, node_def);
-  if (instruction->opcode() == HloOpcode::kFusion) {
-    for (auto* fused_instruction : instruction->fused_instructions()) {
-      TF_RETURN_IF_ERROR(AddInstruction(fused_instruction));
-    }
-  }
-  // Add all edges including control edges.
-  for (unsigned i = 0; i < instruction->operands().size(); ++i) {
-    *node_def->add_input() = GetNodeNameForInstruction(instruction->operand(i));
-  }
-  // Called computations are control dependencies.
-  for (const auto* called_computation : instruction->called_computations()) {
-    *node_def->add_input() = StrCat(
-        "^", GetNodeNameForInstruction(called_computation->root_instruction()));
-  }
-  return Status::OK();
-}
-
-}  // namespace hlo_graph_dumper
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.h b/tensorflow/compiler/xla/service/hlo_tfgraph_builder.h
deleted file mode 100644
index c4876b852e32d34693202f4023aa20ad2b301ffd..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/hlo_tfgraph_builder.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TFGRAPH_BUILDER_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TFGRAPH_BUILDER_H_
-
-#include "tensorflow/compiler/xla/service/hlo_computation.h"
-#include "tensorflow/compiler/xla/xla.pb.h"
-#include "tensorflow/core/framework/graph.pb.h"
-#include "tensorflow/core/framework/node_def.pb.h"
-
-namespace xla {
-namespace hlo_graph_dumper {
-
-// This constructs a tensorflow graph for HLO computations.
-class HloTfGraphBuilder {
- public:
-  HloTfGraphBuilder(const DebugOptions& debug_options = DebugOptions());
-
-  // Adds a computation to the graph.
-  Status AddComputation(const HloComputation& computation);
-
-  const tensorflow::GraphDef& GetGraphDef() const;
-
- private:
-  // Gets the node name of an instruction. The node name is hierarchical. For
-  // example, if an instruction is fused, it will be put in a subgraph of the
-  // fusion instruction.
-  const string& GetNodeNameForInstruction(const HloInstruction* instruction);
-
-  void SetNodeAttrs(const HloInstruction* instruction,
-                    tensorflow::NodeDef* node_def) const;
-
-  Status AddInstruction(const HloInstruction* instruction);
-
-  DebugOptions debug_options_;
-  tensorflow::GraphDef graph_def_;
-  // This records instructions that have been visited.
-  std::unordered_set<const HloInstruction*> visited_instructions_;
-  // A cache that maps instruction to the node name.
-  std::unordered_map<const HloInstruction*, string> instruction_to_node_name_;
-};
-
-}  // namespace hlo_graph_dumper
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TFGRAPH_BUILDER_H_
diff --git a/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc b/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc
deleted file mode 100644
index 1e2b31a1f2bb4865faafc3d14e2b194e3aa171a1..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc
+++ /dev/null
@@ -1,183 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/hlo_tfgraph_builder.h"
-#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
-#include "tensorflow/core/framework/attr_value.pb.h"
-#include "tensorflow/core/framework/tensor_shape.pb.h"
-
-namespace xla {
-namespace hlo_graph_dumper {
-namespace {
-
-using ::tensorflow::GraphDef;
-
-class HloTfGraphBuilderTest : public HloTestBase {
- protected:
-  HloTfGraphBuilderTest() {}
-  HloTfGraphBuilder generator_;
-
-  // Create a computation which takes a scalar and returns its negation.
-  std::unique_ptr<HloComputation> CreateNegateComputation() {
-    auto builder = HloComputation::Builder("Negate");
-    auto param = builder.AddInstruction(
-        HloInstruction::CreateParameter(0, r0f32_, "param0"));
-    builder.AddInstruction(
-        HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, param));
-    return builder.Build();
-  }
-
-  // Creates a computation which calls map with the given computation.
-  std::unique_ptr<HloComputation> CreateMapComputation(
-      HloComputation *map_computation) {
-    auto builder = HloComputation::Builder("Map");
-    auto param = builder.AddInstruction(
-        HloInstruction::CreateParameter(0, r0f32_, "param0"));
-    builder.AddInstruction(
-        HloInstruction::CreateMap(r0f32_, {param}, map_computation));
-    return builder.Build();
-  }
-  Shape r0f32_ = ShapeUtil::MakeShape(PrimitiveType::F32, {});
-};
-
-static const tensorflow::AttrValue &GetNodeAttr(const tensorflow::NodeDef &node,
-                                                const string &attr_name) {
-  auto attr = node.attr().find(attr_name);
-  CHECK(attr != node.attr().end());
-  return attr->second;
-}
-
-TEST_F(HloTfGraphBuilderTest, CheckConcatenateDimsAndShapes) {
-  auto builder = HloComputation::Builder("Concatenate");
-  Shape shape = ShapeUtil::MakeShape(PrimitiveType::F32, {2, 2});
-  auto param_1 = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, shape, "param0"));
-  auto param_2 = builder.AddInstruction(
-      HloInstruction::CreateParameter(1, shape, "param1"));
-  builder.AddInstruction(HloInstruction::CreateConcatenate(
-      ShapeUtil::MakeShape(PrimitiveType::F32, {2, 4}), {param_1, param_2}, 1));
-  TF_CHECK_OK(generator_.AddComputation(*builder.Build()));
-  GraphDef graph_def = generator_.GetGraphDef();
-  EXPECT_EQ(graph_def.node_size(), 3);
-  const auto &node = graph_def.node(2);
-  EXPECT_EQ(node.name(), "Concatenate/concatenate");
-
-  // Check dimensions.
-  auto dims_value = GetNodeAttr(node, "dims");
-  EXPECT_EQ(dims_value.list().i_size(), 1);
-  EXPECT_EQ(dims_value.list().i(0), 1);
-
-  // Check shapes.
-  auto shape_value = GetNodeAttr(node, "_output_shapes");
-  EXPECT_EQ(shape_value.list().shape_size(), 1);
-  EXPECT_EQ(shape_value.list().shape(0).dim_size(), 2);
-  EXPECT_EQ(shape_value.list().shape(0).dim(0).size(), 2);
-  EXPECT_EQ(shape_value.list().shape(0).dim(1).size(), 4);
-}
-
-TEST_F(HloTfGraphBuilderTest, CheckScalarValue) {
-  auto builder = HloComputation::Builder("Const");
-  HloInstruction *instruction = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0(123)));
-  OpMetadata metadata;
-  metadata.set_op_name("x");
-  metadata.set_op_type("y");
-  instruction->set_metadata(metadata);
-  TF_CHECK_OK(generator_.AddComputation(*builder.Build()));
-  GraphDef graph_def = generator_.GetGraphDef();
-  EXPECT_EQ(graph_def.node_size(), 1);
-  const auto &node = graph_def.node(0);
-  EXPECT_EQ(GetNodeAttr(node, "value").s(), "123");
-  EXPECT_EQ(GetNodeAttr(node, "type").s(), "S32");
-  EXPECT_EQ(GetNodeAttr(node, "tf_op_name").s(), "x");
-  EXPECT_EQ(GetNodeAttr(node, "tf_op_type").s(), "y");
-}
-
-TEST_F(HloTfGraphBuilderTest, SimpleNegateComputation) {
-  auto negate_computation = CreateNegateComputation();
-  TF_CHECK_OK(generator_.AddComputation(*negate_computation));
-  GraphDef graph_def = generator_.GetGraphDef();
-  EXPECT_EQ(graph_def.node_size(), 2);
-  EXPECT_EQ(graph_def.node(0).name(), "Negate/param0.0");
-  EXPECT_EQ(graph_def.node(0).op(), "HloParameter");
-  EXPECT_EQ(graph_def.node(1).name(), "Negate/negate");
-  EXPECT_EQ(graph_def.node(1).op(), "HloNegate");
-  EXPECT_EQ(graph_def.node(1).input_size(), 1);
-  EXPECT_EQ(graph_def.node(1).input(0), "Negate/param0.0");
-}
-
-TEST_F(HloTfGraphBuilderTest, GreaterThanOrEqualTo) {
-  auto builder = HloComputation::Builder("GE");
-  auto param_1 = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, r0f32_, "param0"));
-  auto param_2 = builder.AddInstruction(
-      HloInstruction::CreateParameter(1, r0f32_, "param1"));
-  builder.AddInstruction(
-      HloInstruction::CreateBinary(r0f32_, HloOpcode::kGe, param_1, param_2));
-  TF_CHECK_OK(generator_.AddComputation(*builder.Build()));
-  GraphDef graph_def = generator_.GetGraphDef();
-  EXPECT_EQ(graph_def.node_size(), 3);
-  EXPECT_EQ(graph_def.node(0).name(), "GE/param0.0");
-  EXPECT_EQ(graph_def.node(1).name(), "GE/param1.1");
-  EXPECT_EQ(graph_def.node(2).input_size(), 2);
-  EXPECT_EQ(graph_def.node(2).name(), "GE/greater-than-or-equal-to");
-  EXPECT_EQ(graph_def.node(2).op(), "HloGreaterThanOrEqualTo");
-}
-
-TEST_F(HloTfGraphBuilderTest, IncorparateTfOpsStructure) {
-  auto builder = HloComputation::Builder("GE");
-  auto param_1 = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, r0f32_, "param0"));
-  auto param_2 = builder.AddInstruction(
-      HloInstruction::CreateParameter(1, r0f32_, "param1"));
-  auto ge = builder.AddInstruction(
-      HloInstruction::CreateBinary(r0f32_, HloOpcode::kGe, param_1, param_2));
-  OpMetadata metadata;
-  metadata.set_op_name("x/y");
-  metadata.set_op_type("Y");
-  ge->set_metadata(metadata);
-  TF_CHECK_OK(generator_.AddComputation(*builder.Build()));
-  GraphDef graph_def = generator_.GetGraphDef();
-  EXPECT_EQ(graph_def.node_size(), 3);
-  EXPECT_EQ(graph_def.node(0).name(), "GE/param0.0");
-  EXPECT_EQ(graph_def.node(1).name(), "GE/param1.1");
-  EXPECT_EQ(graph_def.node(2).input_size(), 2);
-  EXPECT_EQ(graph_def.node(2).name(), "GE/x/y/greater-than-or-equal-to");
-  EXPECT_EQ(graph_def.node(2).op(), "HloGreaterThanOrEqualTo");
-}
-
-TEST_F(HloTfGraphBuilderTest, EmbeddedComputationsDiamond) {
-  // Create computations with a diamond-shaped callgraph.
-  auto negate_computation = CreateNegateComputation();
-  auto map1_computation = CreateMapComputation(negate_computation.get());
-  auto map2_computation = CreateMapComputation(negate_computation.get());
-
-  auto builder = HloComputation::Builder(TestName());
-  auto param = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, r0f32_, "param0"));
-  auto map1 = builder.AddInstruction(
-      HloInstruction::CreateMap(r0f32_, {param}, map1_computation.get()));
-  auto map2 = builder.AddInstruction(
-      HloInstruction::CreateMap(r0f32_, {param}, map2_computation.get()));
-  builder.AddInstruction(
-      HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, map1, map2));
-  auto computation = builder.Build();
-  TF_CHECK_OK(generator_.AddComputation(*computation));
-  EXPECT_GT(generator_.GetGraphDef().node_size(), 0);
-}
-
-}  // namespace
-}  // namespace hlo_graph_dumper
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_token.h b/tensorflow/compiler/xla/service/hlo_token.h
deleted file mode 100644
index 4458c251dee4af365e39027dd4289925c8890efd..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/hlo_token.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TOKEN_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TOKEN_H_
-
-#include <string>
-
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/platform/types.h"
-
-namespace xla {
-
-// Defines different kinds of tokens in a hlo module string.
-//
-// You shouldn't need to use this directly unless you're using HloLexer
-// directly, and you probably don't need to do that.  Use hlo_parser instead.
-enum class TokKind {
-  // Markers
-  kEof,
-  kError,
-
-  // Tokens with no info.
-  kEqual,  // =
-  kComma,  // ,
-  kColon,  // :
-  kLsquare,
-  kRsquare,  // [  ]
-  kLbrace,
-  kRbrace,  // {  }
-  kLparen,
-  kRparen,  // (  )
-
-  kArrow,    // ->
-
-  // Keywords
-  kw_HloModule,
-  kw_ENTRY,
-  kw_ROOT,
-  kw_true,
-  kw_false,
-  kw_maximal,
-  kw_replicated,
-  kw_nan,
-  kw_inf,
-
-  kNegInf,  // -inf
-
-  // Typed tokens.
-  kName,           // %foo
-  kAttributeName,  // dimensions=
-  kDimLabels,      // [0-9bf]{2,}_[0-9io]{2,}->[0-9bf]{2,}
-  kDxD,            // [0-9]+(x[0-9]+)+
-  kPad,            // [0-9]+_[0-9]+(_[0-9]+)?(x[0-9]+_[0-9]+(_[0-9]+)?)*
-  kIdent,          // other identifiers
-  kString,         // "abcd\"\n"
-  kShape,          // f32[2,3]{1,0}
-  kInt,            // 42
-  kDecimal,        // 4.2
-};
-
-string TokKindToString(TokKind kind);
-
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TOKEN_H_
diff --git a/tensorflow/compiler/xla/service/hlo_value.cc b/tensorflow/compiler/xla/service/hlo_value.cc
index 59594ab2f0f70a206c73e998dbfa69c2c5c7ba43..218b33b2ac2b86edc30b2f014ba206c71da37682 100644
--- a/tensorflow/compiler/xla/service/hlo_value.cc
+++ b/tensorflow/compiler/xla/service/hlo_value.cc
@@ -46,7 +46,7 @@ const Shape& HloPosition::shape() const {
 
 string HloPosition::ToString() const {
   string index_str =
-      ShapeUtil::IsTuple(instruction->shape()) ? (" " + index.ToString()) : "";
+      instruction->shape().IsTuple() ? (" " + index.ToString()) : "";
   return StrCat(instruction->name(), index_str);
 }
 
@@ -56,10 +56,9 @@ std::ostream& operator<<(std::ostream& out, const HloPosition& position) {
 }
 
 string HloUse::ToString() const {
-  string index_str =
-      ShapeUtil::IsTuple(instruction->operand(operand_number)->shape())
-          ? (" " + operand_index.ToString())
-          : "";
+  string index_str = instruction->operand(operand_number)->shape().IsTuple()
+                         ? (" " + operand_index.ToString())
+                         : "";
   return StrCat(instruction->name(), ", operand ", operand_number, index_str);
 }
 
@@ -88,7 +87,7 @@ bool HloValue::operator!=(const HloValue& other) const {
 }
 
 string HloValue::ToShortString() const {
-  string index_str = ShapeUtil::IsTuple(defining_instruction()->shape())
+  string index_str = defining_instruction()->shape().IsTuple()
                          ? defining_index().ToString()
                          : "";
   return StrCat(id(), " ", is_phi_ ? "PHI " : "",
@@ -210,7 +209,7 @@ std::ostream& operator<<(std::ostream& out, const HloValue& value) {
 }
 
 void HloValueSet::SortAndUniquifyValues() {
-  std::sort(values_.begin(), values_.end(), HloValue::IdLessThan);
+  absl::c_sort(values_, HloValue::IdLessThan);
   values_.erase(std::unique(values_.begin(), values_.end(), HloValue::IdEqual),
                 values_.end());
 }
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index 77db7b098a38ff4efdcc7447935fae61561c9ff4..56a06a182a236070340075848d301be54c0d9ebd 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -44,12 +44,13 @@ bool IsCallerInstruction(HloInstruction* hlo) {
     case HloOpcode::kCall:
     case HloOpcode::kConditional:
     case HloOpcode::kWhile:
-    case HloOpcode::kCrossReplicaSum:
+    case HloOpcode::kAllReduce:
     case HloOpcode::kMap:
     case HloOpcode::kReduce:
     case HloOpcode::kReduceWindow:
     case HloOpcode::kScatter:
     case HloOpcode::kSelectAndScatter:
+    case HloOpcode::kSort:
     case HloOpcode::kFusion:
       return true;
     default:
@@ -57,15 +58,6 @@ bool IsCallerInstruction(HloInstruction* hlo) {
   }
 }
 
-Status ShapeVerifier::Preprocess(HloInstruction* hlo) {
-  if (!hlo->called_computations().empty() && !IsCallerInstruction(hlo)) {
-    return InternalError(
-        "Called computations specified for non-caller instruction  %s",
-        hlo->ToString());
-  }
-  return VerifyNotSparse(hlo->shape());
-}
-
 namespace {
 
 Status CheckOperandCount(const HloInstruction* hlo, int expected) {
@@ -90,6 +82,21 @@ Status CheckParameterCount(const HloInstruction* calling_instruction,
 
 }  // namespace
 
+Status ShapeVerifier::Preprocess(HloInstruction* hlo) {
+  if (!hlo->called_computations().empty() && !IsCallerInstruction(hlo)) {
+    return InternalError(
+        "Called computations specified for non-caller instruction  %s",
+        hlo->ToString());
+  }
+  TF_RETURN_IF_ERROR(VerifyNotSparse(hlo->shape()));
+
+  absl::optional<int> arity = HloOpcodeArity(hlo->opcode());
+  if (arity) {
+    TF_RETURN_IF_ERROR(CheckOperandCount(hlo, *arity));
+  }
+  return Status::OK();
+}
+
 Status ShapeVerifier::HandleElementwiseUnary(HloInstruction* hlo) {
   return CheckUnaryShape(hlo);
 }
@@ -121,14 +128,12 @@ Status ShapeVerifier::HandleConcatenate(HloInstruction* concatenate) {
 }
 
 Status ShapeVerifier::HandleConvert(HloInstruction* convert) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(convert, 1));
   return CheckShape(convert, ShapeInference::InferConvertShape(
                                  convert->operand(0)->shape(),
                                  convert->shape().element_type()));
 }
 
 Status ShapeVerifier::HandleBitcastConvert(HloInstruction* convert) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(convert, 1));
   return CheckShape(convert, ShapeInference::InferBitcastConvertShape(
                                  convert->operand(0)->shape(),
                                  convert->shape().element_type()));
@@ -139,7 +144,6 @@ Status ShapeVerifier::HandleCopy(HloInstruction* copy) {
 }
 
 Status ShapeVerifier::HandleDot(HloInstruction* dot) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(dot, 2));
   TF_ASSIGN_OR_RETURN(const Shape expected,
                       ShapeInference::InferDotOpShape(
                           dot->operand(0)->shape(), dot->operand(1)->shape(),
@@ -148,18 +152,16 @@ Status ShapeVerifier::HandleDot(HloInstruction* dot) {
 }
 
 Status ShapeVerifier::HandleConvolution(HloInstruction* convolution) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(convolution, 2));
   TF_ASSIGN_OR_RETURN(
       const Shape expected,
       ShapeInference::InferConvolveShape(
           convolution->operand(0)->shape(), convolution->operand(1)->shape(),
-          convolution->feature_group_count(), convolution->window(),
-          convolution->convolution_dimension_numbers()));
+          convolution->feature_group_count(), convolution->batch_group_count(),
+          convolution->window(), convolution->convolution_dimension_numbers()));
   return CheckShape(convolution, expected);
 }
 
 Status ShapeVerifier::HandleFft(HloInstruction* fft) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(fft, 1));
   TF_ASSIGN_OR_RETURN(
       const Shape expected,
       ShapeInference::InferFftShape(fft->operand(0)->shape(), fft->fft_type(),
@@ -167,13 +169,20 @@ Status ShapeVerifier::HandleFft(HloInstruction* fft) {
   return CheckShape(fft, expected);
 }
 
-Status ShapeVerifier::HandleCrossReplicaSum(HloInstruction* crs) {
+Status ShapeVerifier::HandleTriangularSolve(HloInstruction* hlo) {
+  TF_ASSIGN_OR_RETURN(const Shape expected,
+                      ShapeInference::InferTriangularSolveShape(
+                          hlo->operand(0)->shape(), hlo->operand(1)->shape(),
+                          hlo->triangular_solve_options()));
+  return CheckShape(hlo, expected);
+}
+
+Status ShapeVerifier::HandleAllReduce(HloInstruction* crs) {
   std::vector<const Shape*> operand_shapes;
   for (const HloInstruction* operand : crs->operands()) {
     operand_shapes.push_back(&operand->shape());
   }
-  return CheckShape(crs,
-                    ShapeInference::InferCrossReplicaSumShape(operand_shapes));
+  return CheckShape(crs, ShapeInference::InferAllReduceShape(operand_shapes));
 }
 
 Status ShapeVerifier::HandleAllToAll(HloInstruction* hlo) {
@@ -185,14 +194,16 @@ Status ShapeVerifier::HandleAllToAll(HloInstruction* hlo) {
                     ShapeInference::InferAllToAllTupleShape(operand_shapes));
 }
 
+Status ShapeVerifier::HandleReplicaId(HloInstruction* hlo) {
+  return CheckShape(hlo, ShapeUtil::MakeShape(U32, {}));
+}
+
 Status ShapeVerifier::HandleCollectivePermute(HloInstruction* hlo) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(hlo, 1));
   return CheckShape(hlo, ShapeInference::InferCollectivePermuteShape(
                              hlo->operand(0)->shape()));
 }
 
 Status ShapeVerifier::HandleReducePrecision(HloInstruction* reduce_precision) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(reduce_precision, 1));
   return CheckShape(reduce_precision, ShapeInference::InferReducePrecisionShape(
                                           reduce_precision->operand(0)->shape(),
                                           reduce_precision->exponent_bits(),
@@ -226,7 +237,6 @@ Status ShapeVerifier::CheckOperandAndParameter(
 }
 
 Status ShapeVerifier::HandleInfeed(HloInstruction* instruction) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(instruction, 1));
   HloInfeedInstruction* infeed = Cast<HloInfeedInstruction>(instruction);
   TF_RETURN_IF_ERROR(CheckIsTokenOperand(instruction, 0));
 
@@ -237,7 +247,6 @@ Status ShapeVerifier::HandleInfeed(HloInstruction* instruction) {
 }
 
 Status ShapeVerifier::HandleOutfeed(HloInstruction* instruction) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(instruction, 2));
   HloOutfeedInstruction* outfeed = Cast<HloOutfeedInstruction>(instruction);
   TF_RETURN_IF_ERROR(CheckIsTokenOperand(instruction, 1));
 
@@ -313,7 +322,6 @@ Status ShapeVerifier::HandleRng(HloInstruction* instruction) {
 }
 
 Status ShapeVerifier::HandleReverse(HloInstruction* reverse) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(reverse, 1));
   return CheckShape(
       reverse, ShapeInference::InferReverseShape(reverse->operand(0)->shape(),
                                                  reverse->dimensions()));
@@ -324,13 +332,48 @@ Status ShapeVerifier::HandleSort(HloInstruction* sort) {
     return InternalError("Expected at least 1 operand for %s instruction: %s",
                          HloOpcodeString(sort->opcode()), sort->ToString());
   }
+  HloComputation* compare = sort->to_apply();
+
+  // Check that the 'compare' computation returns a PRED.
+  Shape compare_shape = compare->root_instruction()->shape();
+  if (!ShapesSame(compare_shape, ShapeUtil::MakeShape(PRED, {}))) {
+    return InternalError(
+        "The Sort compare computation shape does not lead to a scalar "
+        "predicate shape: %s",
+        StringifyShape(compare_shape));
+  }
+
+  // Check that the number of parameters of the 'compare' computation is
+  // correct.
+  TF_RETURN_IF_ERROR(
+      CheckParameterCount(sort, compare, sort->operand_count() * 2));
+
+  // Verify that the operands of the compare computation have the correct scalar
+  // shapes.
+  for (int64 parameter_idx = 0; parameter_idx < compare->num_parameters();
+       ++parameter_idx) {
+    int64 operand_idx = parameter_idx / 2;
+    Shape expected_scalar_shape = ShapeUtil::MakeShape(
+        sort->operand(operand_idx)->shape().element_type(), {});
+    Shape actual_parameter_shape =
+        compare->parameter_instruction(parameter_idx)->shape();
+    if (!ShapeUtil::CompatibleIgnoringFpPrecision(expected_scalar_shape,
+                                                  actual_parameter_shape)) {
+      return InternalError(
+          "Expected the %lld-th parameter of the compare computation of sort "
+          "to have shape %s, but got %s",
+          parameter_idx, StringifyShape(expected_scalar_shape),
+          StringifyShape(actual_parameter_shape));
+    }
+  }
+
+  // Verify that all operand shapes have the same dimensions.
   for (int64 operand = 1; operand < sort->operand_count(); ++operand) {
     if (!ShapeUtil::SameDimensions(sort->operand(0)->shape(),
                                    sort->operand(operand)->shape())) {
       return InternalError(
-          "Expected sort to have to have the same dimensions for the keys "
-          "and the values. Keys shape is: %s\n, Values shape (operand index "
-          "%lld) is: %s",
+          "Expected sort to have to have the same dimensions for all operands. "
+          "First operand shape is: %s\n, shape (operand index %lld) is: %s",
           StringifyShape(sort->operand(0)->shape()), operand,
           StringifyShape(sort->operand(operand)->shape()));
     }
@@ -339,7 +382,6 @@ Status ShapeVerifier::HandleSort(HloInstruction* sort) {
 }
 
 Status ShapeVerifier::HandleConstant(HloInstruction* constant) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(constant, 0));
   if (!Cast<HloConstantInstruction>(constant)->HasLiteral()) {
     return InternalError("Constant is required to have a valid literal: %s",
                          constant->ToString());
@@ -348,9 +390,11 @@ Status ShapeVerifier::HandleConstant(HloInstruction* constant) {
 }
 
 Status ShapeVerifier::HandleIota(HloInstruction* instruction) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(instruction, 0));
   auto* iota = Cast<HloIotaInstruction>(instruction);
-  const int64 rank = ShapeUtil::Rank(iota->shape());
+  if (!iota->shape().IsArray()) {
+    return InternalError("Iota does not support non-array result.");
+  }
+  const int64 rank = iota->shape().rank();
   if (rank == 0) {
     return InternalError("Iota does not support scalars.");
   }
@@ -363,13 +407,30 @@ Status ShapeVerifier::HandleIota(HloInstruction* instruction) {
 }
 
 Status ShapeVerifier::HandleGetTupleElement(HloInstruction* get_tuple_element) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(get_tuple_element, 1));
   return CheckShape(get_tuple_element,
                     ShapeInference::InferGetTupleElementShape(
                         get_tuple_element->operand(0)->shape(),
                         get_tuple_element->tuple_index()));
 }
 
+namespace {
+Status SameElementTypesForOperandsAndToApplyParameters(
+    const HloInstruction& instruction, int64 num_operands_to_check) {
+  const ProgramShape& to_apply = instruction.to_apply()->ComputeProgramShape();
+  for (int i = 0; i < num_operands_to_check; ++i) {
+    const Shape& parameter_shape = to_apply.parameters(i);
+    const Shape& operand_shape = instruction.operands()[i]->shape();
+    if (!ShapeUtil::SameElementType(parameter_shape, operand_shape)) {
+      return InvalidArgument(
+          "Shape mismatch between to_apply computation"
+          " parameter and operand %d in %s.",
+          i, instruction.ToString().c_str());
+    }
+  }
+  return Status::OK();
+}
+}  // namespace
+
 Status ShapeVerifier::HandleReduce(HloInstruction* reduce) {
   if (reduce->operand_count() % 2 != 0) {
     return InternalError(
@@ -381,30 +442,40 @@ Status ShapeVerifier::HandleReduce(HloInstruction* reduce) {
   for (const HloInstruction* operand : reduce->operands()) {
     operand_shapes.push_back(&operand->shape());
   }
-  return CheckShape(reduce, ShapeInference::InferReduceShape(
-                                operand_shapes, reduce->dimensions(),
-                                reduce->to_apply()->ComputeProgramShape()));
+  TF_RETURN_IF_ERROR(
+      CheckShape(reduce, ShapeInference::InferReduceShape(
+                             operand_shapes, reduce->dimensions(),
+                             reduce->to_apply()->ComputeProgramShape())));
+
+  return allow_mixed_precision_
+             ? Status::OK()
+             : SameElementTypesForOperandsAndToApplyParameters(
+                   *reduce, reduce->operands().size() - 1);
 }
 
 Status ShapeVerifier::HandleBitcast(HloInstruction* bitcast) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(bitcast, 1));
+  // Bitcasts are not allowed to change the element type.
+  if (bitcast->operand(0)->shape().element_type() !=
+      bitcast->shape().element_type()) {
+    return InternalError(
+        "Bitcast can not change the element type from %s to %s",
+        PrimitiveType_Name(bitcast->operand(0)->shape().element_type()),
+        PrimitiveType_Name(bitcast->shape().element_type()));
+  }
   return Status::OK();
 }
 
 Status ShapeVerifier::HandleBroadcast(HloInstruction* broadcast) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(broadcast, 1));
   // HLO broadcast has no exact analog at the proto level so there is no
   // ShapeInference method. Check the output shape explicitly.
   const Shape& operand_shape = broadcast->operand(0)->shape();
   // Check for mixed precision.
   TF_RET_CHECK(SameElementType(broadcast->shape(), operand_shape));
-  TF_RET_CHECK(ShapeUtil::Rank(operand_shape) ==
-               broadcast->dimensions().size());
-  for (int64 operand_dimension = 0;
-       operand_dimension < ShapeUtil::Rank(operand_shape);
+  TF_RET_CHECK(operand_shape.rank() == broadcast->dimensions().size());
+  for (int64 operand_dimension = 0; operand_dimension < operand_shape.rank();
        ++operand_dimension) {
     int64 output_dimension = broadcast->dimensions()[operand_dimension];
-    TF_RET_CHECK((output_dimension < ShapeUtil::Rank(broadcast->shape())) &&
+    TF_RET_CHECK((output_dimension < broadcast->shape().rank()) &&
                  output_dimension >= 0 &&
                  (broadcast->shape().dimensions(output_dimension) ==
                   operand_shape.dimensions(operand_dimension)))
@@ -414,7 +485,6 @@ Status ShapeVerifier::HandleBroadcast(HloInstruction* broadcast) {
 }
 
 Status ShapeVerifier::HandleReshape(HloInstruction* reshape) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(reshape, 1));
   // Check for mixed precision.
   const Shape& operand_shape = reshape->operand(0)->shape();
   TF_RET_CHECK(SameElementType(reshape->shape(), operand_shape));
@@ -424,14 +494,12 @@ Status ShapeVerifier::HandleReshape(HloInstruction* reshape) {
 }
 
 Status ShapeVerifier::HandleTranspose(HloInstruction* transpose) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(transpose, 1));
   return CheckShape(
       transpose, ShapeInference::InferTransposeShape(
                      transpose->operand(0)->shape(), transpose->dimensions()));
 }
 
 Status ShapeVerifier::HandleParameter(HloInstruction* hlo) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(hlo, 0));
   return Status::OK();
 }
 
@@ -481,7 +549,9 @@ Status ShapeVerifier::HandleCustomCall(HloInstruction* instruction) {
       const Shape& operand_shape_with_layout =
           custom_call->operand_shapes_with_layout()[i];
       TF_RET_CHECK(ShapeUtil::Compatible(custom_call->operand(i)->shape(),
-                                         operand_shape_with_layout));
+                                         operand_shape_with_layout))
+          << custom_call->operand(i)->shape().ToString() << " operand "
+          << operand_shape_with_layout.ToString();
       TF_RET_CHECK(LayoutUtil::HasLayout(operand_shape_with_layout));
     }
   }
@@ -489,7 +559,6 @@ Status ShapeVerifier::HandleCustomCall(HloInstruction* instruction) {
 }
 
 Status ShapeVerifier::HandleSlice(HloInstruction* slice) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(slice, 1));
   return CheckShape(slice,
                     ShapeInference::InferSliceShape(
                         slice->operand(0)->shape(), slice->slice_starts(),
@@ -497,21 +566,23 @@ Status ShapeVerifier::HandleSlice(HloInstruction* slice) {
 }
 
 Status ShapeVerifier::HandleDynamicSlice(HloInstruction* dynamic_slice) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(dynamic_slice, 2));
-  return CheckShape(dynamic_slice, ShapeInference::InferDynamicSliceShape(
-                                       dynamic_slice->operand(0)->shape(),
-                                       dynamic_slice->operand(1)->shape(),
-                                       dynamic_slice->dynamic_slice_sizes()));
+  return CheckShape(
+      dynamic_slice,
+      ShapeInference::InferDynamicSliceShape(
+          dynamic_slice->operand(0)->shape(),
+          Cast<HloDynamicSliceInstruction>(dynamic_slice)->index_shapes(),
+          dynamic_slice->dynamic_slice_sizes()));
 }
 
 Status ShapeVerifier::HandleDynamicUpdateSlice(
     HloInstruction* dynamic_update_slice) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(dynamic_update_slice, 3));
-  return CheckShape(dynamic_update_slice,
-                    ShapeInference::InferDynamicUpdateSliceShape(
-                        dynamic_update_slice->operand(0)->shape(),
-                        dynamic_update_slice->operand(1)->shape(),
-                        dynamic_update_slice->operand(2)->shape()));
+  return CheckShape(
+      dynamic_update_slice,
+      ShapeInference::InferDynamicUpdateSliceShape(
+          dynamic_update_slice->operand(0)->shape(),
+          dynamic_update_slice->operand(1)->shape(),
+          Cast<HloDynamicUpdateSliceInstruction>(dynamic_update_slice)
+              ->index_shapes()));
 }
 
 Status ShapeVerifier::HandleTuple(HloInstruction* tuple) {
@@ -523,30 +594,39 @@ Status ShapeVerifier::HandleMap(HloInstruction* map) {
   int64 max_operand_rank = 0;
   for (const HloInstruction* operand : map->operands()) {
     operand_shapes.push_back(&operand->shape());
-    max_operand_rank =
-        std::max(max_operand_rank, ShapeUtil::Rank(operand->shape()));
+    max_operand_rank = std::max(max_operand_rank, operand->shape().rank());
   }
   // TODO(b/65689298) Remove code below once Map is generalized to accept
   // arbitrary map dimensions.
   std::vector<int64> map_dims(max_operand_rank);
   std::iota(map_dims.begin(), map_dims.end(), 0);
-  return CheckShape(map, ShapeInference::InferMapShape(
-                             operand_shapes,
-                             map->to_apply()->ComputeProgramShape(), map_dims));
+
+  TF_RETURN_IF_ERROR(CheckShape(
+      map,
+      ShapeInference::InferMapShape(
+          operand_shapes, map->to_apply()->ComputeProgramShape(), map_dims)));
+
+  return allow_mixed_precision_
+             ? Status::OK()
+             : SameElementTypesForOperandsAndToApplyParameters(
+                   *map, map->operands().size());
 }
 
 Status ShapeVerifier::HandleReduceWindow(HloInstruction* reduce_window) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(reduce_window, 2));
-  return CheckShape(
+  TF_RETURN_IF_ERROR(CheckShape(
       reduce_window,
       ShapeInference::InferReduceWindowShape(
           reduce_window->operand(0)->shape(),
           reduce_window->operand(1)->shape(), reduce_window->window(),
-          reduce_window->to_apply()->ComputeProgramShape()));
+          reduce_window->to_apply()->ComputeProgramShape())));
+
+  return allow_mixed_precision_
+             ? Status::OK()
+             : SameElementTypesForOperandsAndToApplyParameters(*reduce_window,
+                                                               1);
 }
 
 Status ShapeVerifier::HandleSelectAndScatter(HloInstruction* instruction) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(instruction, 3));
   return CheckShape(
       instruction,
       ShapeInference::InferSelectAndScatterShape(
@@ -557,7 +637,6 @@ Status ShapeVerifier::HandleSelectAndScatter(HloInstruction* instruction) {
 }
 
 Status ShapeVerifier::HandleWhile(HloInstruction* xla_while) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(xla_while, 1));
   TF_RETURN_IF_ERROR(
       CheckParameterCount(xla_while, xla_while->while_body(), 1));
   TF_RETURN_IF_ERROR(
@@ -581,7 +660,6 @@ Status ShapeVerifier::HandleWhile(HloInstruction* xla_while) {
 }
 
 Status ShapeVerifier::HandleConditional(HloInstruction* conditional) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(conditional, 3));
   TF_RETURN_IF_ERROR(
       CheckParameterCount(conditional, conditional->true_computation(), 1));
   TF_RETURN_IF_ERROR(
@@ -600,14 +678,12 @@ Status ShapeVerifier::HandleConditional(HloInstruction* conditional) {
 }
 
 Status ShapeVerifier::HandlePad(HloInstruction* pad) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(pad, 2));
   return CheckShape(pad, ShapeInference::InferPadShape(pad->operand(0)->shape(),
                                                        pad->operand(1)->shape(),
                                                        pad->padding_config()));
 }
 
 Status ShapeVerifier::HandleSend(HloInstruction* send) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(send, 2));
   return CheckShape(send,
                     ShapeUtil::MakeTupleShape({send->operand(0)->shape(),
                                                ShapeUtil::MakeShape(U32, {}),
@@ -615,12 +691,10 @@ Status ShapeVerifier::HandleSend(HloInstruction* send) {
 }
 
 Status ShapeVerifier::HandleSendDone(HloInstruction* send_done) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(send_done, 1));
   return CheckShape(send_done, ShapeUtil::MakeTokenShape());
 }
 
 Status ShapeVerifier::HandleRecv(HloInstruction* recv) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(recv, 1));
   return CheckShape(
       recv, ShapeUtil::MakeTupleShape(
                 {ShapeUtil::GetTupleElementShape(recv->shape(), 0),
@@ -628,7 +702,6 @@ Status ShapeVerifier::HandleRecv(HloInstruction* recv) {
 }
 
 Status ShapeVerifier::HandleRecvDone(HloInstruction* recv_done) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(recv_done, 1));
   return CheckShape(
       recv_done,
       ShapeUtil::MakeTupleShape(
@@ -638,7 +711,6 @@ Status ShapeVerifier::HandleRecvDone(HloInstruction* recv_done) {
 
 Status ShapeVerifier::HandleBatchNormTraining(
     HloInstruction* batch_norm_training) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(batch_norm_training, 3));
   return CheckShape(batch_norm_training,
                     ShapeInference::InferBatchNormTrainingShape(
                         batch_norm_training->operand(0)->shape(),
@@ -649,7 +721,6 @@ Status ShapeVerifier::HandleBatchNormTraining(
 
 Status ShapeVerifier::HandleBatchNormInference(
     HloInstruction* batch_norm_inference) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(batch_norm_inference, 5));
   return CheckShape(batch_norm_inference,
                     ShapeInference::InferBatchNormInferenceShape(
                         batch_norm_inference->operand(0)->shape(),
@@ -661,7 +732,6 @@ Status ShapeVerifier::HandleBatchNormInference(
 }
 
 Status ShapeVerifier::HandleBatchNormGrad(HloInstruction* batch_norm_grad) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(batch_norm_grad, 5));
   return CheckShape(batch_norm_grad, ShapeInference::InferBatchNormGradShape(
                                          batch_norm_grad->operand(0)->shape(),
                                          batch_norm_grad->operand(1)->shape(),
@@ -683,7 +753,7 @@ Status CheckMixedPrecisionOperands(const HloInstruction* instruction) {
     case HloOpcode::kCall:
     case HloOpcode::kConditional:
     case HloOpcode::kConstant:
-    case HloOpcode::kCrossReplicaSum:
+    case HloOpcode::kAllReduce:
     case HloOpcode::kCustomCall:
     case HloOpcode::kDomain:
     case HloOpcode::kFusion:
@@ -694,7 +764,6 @@ Status CheckMixedPrecisionOperands(const HloInstruction* instruction) {
     case HloOpcode::kRecv:
     case HloOpcode::kRecvDone:
     case HloOpcode::kReducePrecision:
-    case HloOpcode::kSelect:
     case HloOpcode::kTupleSelect:
     case HloOpcode::kSend:
     case HloOpcode::kSendDone:
@@ -730,7 +799,6 @@ Status CheckMixedPrecisionOperands(const HloInstruction* instruction) {
 }  // namespace
 
 Status ShapeVerifier::HandleGather(HloInstruction* gather) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(gather, 2));
   return CheckShape(
       gather,
       ShapeInference::InferGatherShape(
@@ -739,7 +807,6 @@ Status ShapeVerifier::HandleGather(HloInstruction* gather) {
 }
 
 Status ShapeVerifier::HandleScatter(HloInstruction* scatter) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(scatter, 3));
   return CheckShape(
       scatter, ShapeInference::InferScatterShape(
                    scatter->operand(0)->shape(), scatter->operand(1)->shape(),
@@ -757,7 +824,6 @@ Status ShapeVerifier::HandleAfterAll(HloInstruction* token) {
 }
 
 Status ShapeVerifier::HandleAddDependency(HloInstruction* add_dependency) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(add_dependency, 2));
   TF_RETURN_IF_ERROR(CheckIsTokenOperand(add_dependency, 1));
   return CheckShape(add_dependency, add_dependency->operand(0)->shape());
 }
@@ -839,14 +905,12 @@ Status ShapeVerifier::CheckShape(const HloInstruction* instruction,
 }
 
 Status ShapeVerifier::CheckUnaryShape(const HloInstruction* instruction) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(instruction, 1));
   return CheckShape(instruction,
                     ShapeInference::InferUnaryOpShape(instruction->opcode(),
                                                       instruction->operand(0)));
 }
 
 Status ShapeVerifier::CheckBinaryShape(const HloInstruction* instruction) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(instruction, 2));
   return CheckShape(
       instruction, ShapeInference::InferBinaryOpShape(instruction->opcode(),
                                                       instruction->operand(0),
@@ -854,7 +918,6 @@ Status ShapeVerifier::CheckBinaryShape(const HloInstruction* instruction) {
 }
 
 Status ShapeVerifier::CheckTernaryShape(const HloInstruction* instruction) {
-  TF_RETURN_IF_ERROR(CheckOperandCount(instruction, 3));
   return CheckShape(instruction,
                     ShapeInference::InferTernaryOpShape(
                         instruction->opcode(), instruction->operand(0),
@@ -982,7 +1045,7 @@ bool ShapeContainsToken(const Shape& shape) {
   bool contains_token = false;
   ShapeUtil::ForEachSubshape(
       shape, [&contains_token](const Shape& subshape, const ShapeIndex&) {
-        if (ShapeUtil::IsToken(subshape)) {
+        if (subshape.IsToken()) {
           contains_token = true;
         }
       });
@@ -1230,8 +1293,8 @@ Status CheckFusionInstruction(HloInstruction* fusion) {
   return Status::OK();
 }
 
-// Checks that the non-scalar operand shapes are compatible to the output
-// shape, i.e., that there are no implicit broadcasts of size-one dimensions.
+// Checks that the operand shapes are compatible to the output shape, i.e.,
+// that there are no implicit broadcasts.
 Status CheckElementwiseInstruction(HloInstruction* instruction) {
   const Shape& out_shape = instruction->shape();
   for (HloInstruction* operand : instruction->operands()) {
@@ -1270,11 +1333,11 @@ class InstructionVerifier : public DfsHloVisitorWithDefault {
     // op. See https://groups.google.com/forum/#!topic/xla-dev/9LqijHmTt_I
     // or ComputationLowerer::Visit()
     TF_RET_CHECK(broadcast->dimensions().size() ==
-                 ShapeUtil::Rank(broadcast->operand(0)->shape()))
+                 broadcast->operand(0)->shape().rank())
         << "Broadcast HLO (" << broadcast->ToShortString()
         << ") has invalid number of dimensions: "
         << broadcast->dimensions().size()
-        << " != " << ShapeUtil::Rank(broadcast->operand(0)->shape());
+        << " != " << broadcast->operand(0)->shape().rank();
     return Status::OK();
   }
 
@@ -1324,7 +1387,7 @@ class InstructionVerifier : public DfsHloVisitorWithDefault {
   }
 
   Status HandleGetTupleElement(HloInstruction* gte) override {
-    TF_RET_CHECK(ShapeUtil::IsTuple(gte->operand(0)->shape()));
+    TF_RET_CHECK(gte->operand(0)->shape().IsTuple());
     return Status::OK();
   }
 
@@ -1344,7 +1407,7 @@ class InstructionVerifier : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
-  Status HandleCrossReplicaSum(HloInstruction* crs) override {
+  Status HandleAllReduce(HloInstruction* crs) override {
     if (crs->all_reduce_id().has_value()) {
       TF_RET_CHECK(crs->all_reduce_id().value() > 0)
           << "All reduce id must be greater than 0 for "
@@ -1375,7 +1438,7 @@ class InstructionVerifier : public DfsHloVisitorWithDefault {
       for (HloInstruction* operand : instruction->operands()) {
         const Shape& operand_shape = operand->shape();
         if (LayoutUtil::IsDenseArray(operand_shape) &&
-            ShapeUtil::Rank(operand_shape) == ShapeUtil::Rank(result_shape)) {
+            operand_shape.rank() == result_shape.rank()) {
           const Layout& operand_layout = operand_shape.layout();
           TF_RET_CHECK(LayoutUtil::Equal(result_layout, operand_layout))
               << "Instruction shouldn't change layouts "
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h
index e4d0c3d6957885f1d719fedb5a900de601e397f8..a9b5e9a3e6eec19e125188a192694fcaadfe2322 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.h
+++ b/tensorflow/compiler/xla/service/hlo_verifier.h
@@ -52,9 +52,11 @@ class ShapeVerifier : public DfsHloVisitor {
   Status HandleDot(HloInstruction* dot) override;
   Status HandleConvolution(HloInstruction* convolution) override;
   Status HandleFft(HloInstruction* fft) override;
-  Status HandleCrossReplicaSum(HloInstruction* crs) override;
+  Status HandleTriangularSolve(HloInstruction* hlo) override;
+  Status HandleAllReduce(HloInstruction* crs) override;
   Status HandleAllToAll(HloInstruction* hlo) override;
   Status HandleCollectivePermute(HloInstruction* hlo) override;
+  Status HandleReplicaId(HloInstruction* hlo) override;
   Status HandleReducePrecision(HloInstruction* reduce_precision) override;
   Status HandleInfeed(HloInstruction*) override;
   Status HandleOutfeed(HloInstruction*) override;
@@ -168,8 +170,13 @@ class ShapeVerifier : public DfsHloVisitor {
 // An interface used to encapsulate target-specific verification quirks.
 class TargetVerifierMetadata {
  public:
+  TargetVerifierMetadata(std::function<int64(const Shape&)> shape_size_function)
+      : shape_size_function_(shape_size_function) {}
+
   // Returns a target-specific shape size.
-  virtual int64 ShapeSize(const Shape& shape) const = 0;
+  int64 ShapeSize(const Shape& shape) const {
+    return shape_size_function_(shape);
+  }
 
   virtual std::unique_ptr<ShapeVerifier> GetVerifier() const = 0;
 
@@ -178,20 +185,23 @@ class TargetVerifierMetadata {
 
   TargetVerifierMetadata(const TargetVerifierMetadata&) = delete;
   TargetVerifierMetadata& operator=(const TargetVerifierMetadata&) = delete;
+
+ private:
+  // Returns a target-specific shape size.
+  std::function<int64(const Shape&)> shape_size_function_;
 };
 
 // The default implementation of TargetVerifierMetadata, used unless the target
 // needs to override it.
 class DefaultVerifierMetadata : public TargetVerifierMetadata {
  public:
-  DefaultVerifierMetadata(bool layout_sensitive, bool allow_mixed_precision)
-      : layout_sensitive_(layout_sensitive),
+  DefaultVerifierMetadata(
+      bool layout_sensitive, bool allow_mixed_precision,
+      std::function<int64(const Shape&)> shape_size_function)
+      : TargetVerifierMetadata(shape_size_function),
+        layout_sensitive_(layout_sensitive),
         allow_mixed_precision_(allow_mixed_precision) {}
 
-  int64 ShapeSize(const Shape& shape) const override {
-    return ShapeUtil::ByteSizeOf(shape);
-  }
-
   // Creates a ShapeVerifier that checks that shapes match inferred
   // expectations. This creates a new verifier every time because ShapeVerifier,
   // being a DfsHloVisitor, is stateful. We want a clean object for each run of
@@ -210,11 +220,14 @@ class DefaultVerifierMetadata : public TargetVerifierMetadata {
 // the module.
 class HloVerifier : public HloModulePass {
  public:
-  explicit HloVerifier(bool layout_sensitive, bool allow_mixed_precision,
-                       std::function<bool(const HloInstruction*)>
-                           instruction_can_change_layout_func = {})
+  explicit HloVerifier(
+      bool layout_sensitive, bool allow_mixed_precision,
+      std::function<bool(const HloInstruction*)>
+          instruction_can_change_layout_func = {},
+      std::function<int64(const Shape&)> shape_size_func =
+          [](const Shape& shape) { return ShapeUtil::ByteSizeOf(shape); })
       : target_metadata_(absl::make_unique<DefaultVerifierMetadata>(
-            layout_sensitive, allow_mixed_precision)),
+            layout_sensitive, allow_mixed_precision, shape_size_func)),
         instruction_can_change_layout_func_(
             std::move(instruction_can_change_layout_func)) {
     CHECK(instruction_can_change_layout_func_ == nullptr || layout_sensitive);
diff --git a/tensorflow/compiler/xla/service/hlo_verifier_test.cc b/tensorflow/compiler/xla/service/hlo_verifier_test.cc
index 4bc557e4e62e7df4e25fda86fe417e84129b464c..523890b3c7268c06cdb6aaa67749f26a1cb62855 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module_config.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/service/layout_assignment.h"
@@ -27,6 +28,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla.pb.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 
@@ -386,6 +388,55 @@ TEST_F(HloVerifierTest, AddWithLayoutChange) {
   ASSERT_TRUE(status.ok());
 }
 
+TEST_F(HloVerifierTest, ScalarIndexDynamicSlice) {
+  const char* const kScalarIndexDynamicSlice = R"(
+    HloModule DynamicSlice_module
+
+    ENTRY %DynamicSlice.v5 (original_parameter: s32[2,2,258], start_index: s32[]) -> s32[2,2,258] {
+      %original_parameter = s32[2,2,258] parameter(0)
+      %constant = s32[] constant(0)
+      %start_index = s32[] parameter(1)
+      ROOT %dynamic-slice = s32[2,2,258] dynamic-slice(s32[2,2,258] %original_parameter, s32[] %constant, s32[] %constant, s32[] %start_index), dynamic_slice_sizes={2,2,258}
+    }
+  )";
+
+  HloModuleConfig config;
+  DebugOptions debug_options = config.debug_options();
+  debug_options.set_xla_allow_scalar_index_dynamic_ops(true);
+  config.set_debug_options(debug_options);
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseHloString(kScalarIndexDynamicSlice, config));
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_TRUE(status.ok());
+}
+
+TEST_F(HloVerifierTest, ScalarIndexDynamicUpdateSlice) {
+  const char* const kScalarIndexDynamicSlice = R"(
+    HloModule DynamicUpdateSlice_module
+
+    ENTRY %DynamicUpdateSlice.v4 (input: s32[1,1,25,1], update: s32[1,1,2,1], start_index.0: s32[], start_index.1: s32[], start_index.2: s32[], start_index.3: s32[]) -> s32[1,1,25,1] {
+      %input = s32[1,1,25,1]{3,2,1,0} parameter(0)
+      %update = s32[1,1,2,1]{3,2,1,0} parameter(1)
+      %start_index.0 = s32[] parameter(2)
+      %start_index.1 = s32[] parameter(3)
+      %start_index.2 = s32[] parameter(4)
+      %start_index.3 = s32[] parameter(5)
+      ROOT %dynamic-update-slice = s32[1,1,25,1]{3,2,1,0} dynamic-update-slice(s32[1,1,25,1]{3,2,1,0} %input, s32[1,1,2,1]{3,2,1,0} %update, s32[] %start_index.0, s32[] %start_index.1, s32[] %start_index.2, s32[] %start_index.3)
+    }
+  )";
+
+  HloModuleConfig config;
+  DebugOptions debug_options = config.debug_options();
+  debug_options.set_xla_allow_scalar_index_dynamic_ops(true);
+  config.set_debug_options(debug_options);
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseHloString(kScalarIndexDynamicSlice, config));
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_TRUE(status.ok());
+}
+
 TEST_F(HloVerifierTestLayoutSensitive, AddWithLayoutChangeNotAllowed) {
   TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(kAddWithLayoutChangeHlo));
   auto status = verifier().Run(module.get()).status();
@@ -399,8 +450,9 @@ TEST_F(HloVerifierTestLayoutSensitive, SliceWithLayoutChangeNotAllowed) {
    HloModule SliceWithLayoutChange
     ENTRY SliceWithLayoutChange {
       par0 = f32[4,5]{0,1} parameter(0)
-      par1 = s32[2] parameter(1)
-      ROOT dslice0 = f32[3,4]{1,0} dynamic-slice(par0, par1),
+      par1 = s32[] parameter(1)
+      par2 = s32[] parameter(2)
+      ROOT dslice0 = f32[3,4]{1,0} dynamic-slice(par0, par1, par2),
         dynamic_slice_sizes={3,4}
     }
   )";
@@ -429,5 +481,138 @@ TEST_F(HloVerifierTestLayoutSensitive, ConcatWithLayoutChangeNotAllowed) {
   EXPECT_THAT(status.error_message(),
               HasSubstr("Instruction shouldn't change layouts"));
 }
+
+TEST_F(HloVerifierTest, BitcastCanNotChangeElementType) {
+  const char* const hlo_string = R"(
+  HloModule Module
+
+  ENTRY BitcastCanNotChangeElementType {
+   constant.0 = f32[2] constant({0.0, 0.0})
+   ROOT bitcast = s32[2] bitcast(constant.0)
+  }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(hlo_string));
+
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_FALSE(status.ok());
+  EXPECT_THAT(status.error_message(),
+              HasSubstr("Bitcast can not change the element type"));
+}
+
+TEST_F(HloVerifierTest, SelectMixedPrecisionNotAllowed) {
+  const char* const hlo_string = R"(
+  HloModule Module
+
+  ENTRY SelectMixedPrecisionNotAllowed {
+   p0 = pred[] parameter(0)
+   p1 = f32[32] parameter(1)
+   p2 = bf16[32] parameter(2)
+   ROOT select = f32[32] select(p0, p1, p2)
+  }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(hlo_string));
+
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_FALSE(status.ok());
+  EXPECT_THAT(status.error_message(),
+              HasSubstr("Seen floating point types of different precisions"));
+}
+
+TEST_F(HloVerifierTestAllowMixedPrecision, SelectMixedPrecisionAllowed) {
+  const char* const hlo_string = R"(
+  HloModule Module
+
+  ENTRY SelectMixedPrecisionAllowed {
+   p0 = pred[] parameter(0)
+   p1 = f32[32] parameter(1)
+   p2 = bf16[32] parameter(2)
+   ROOT select = f32[32] select(p0, p1, p2)
+  }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(hlo_string));
+
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_TRUE(status.ok());
+}
+
+TEST_F(HloVerifierTest, IotaNonArrayResult) {
+  const char* const hlo_string = R"(
+  HloModule IotaTupleResult
+
+  ENTRY  kernelEntry {
+    ROOT iota = () iota(), iota_dimension=24
+  }
+  )";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(hlo_string));
+
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_FALSE(status.ok());
+  EXPECT_THAT(status.error_message(),
+              HasSubstr("does not support non-array result"));
+}
+
+static const char* const kMapOperandComputationMismatchHlo = R"(
+  HloModule MapOperandComputationMismatch
+
+  Computation {
+    param0 = f32[] parameter(0)
+    constant = f32[] constant(1)
+    ROOT add = f32[] add(param0, constant)
+  }
+
+  ENTRY kernelEntry {
+  param = f64[] parameter(0)
+  ROOT map = f32[] map(param), dimensions={}, to_apply=Computation
+})";
+
+TEST_F(HloVerifierTest, MapOperandComputationMismatch) {
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseHloString(kMapOperandComputationMismatchHlo));
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_FALSE(status.ok());
+  EXPECT_THAT(
+      status.error_message(),
+      HasSubstr(
+          "Shape mismatch between to_apply computation parameter and operand"));
+}
+
+TEST_F(HloVerifierTestAllowMixedPrecision, MapOperandComputationMismatch) {
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseHloString(kMapOperandComputationMismatchHlo));
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_TRUE(status.ok());
+}
+
+static const char* const kReduceOperandComputationMismatchHlo = R"(
+  HloModule ReduceOperandComputationMismatch
+  computation {
+    x = f32[] parameter(0)
+    y = f32[] parameter(1)
+    ROOT add = f32[] add(x, y)
+  }
+
+  ENTRY kernelEntry {
+    arg0 = f16[64,64,224,224]{3,2,1,0} parameter(0)
+    constant = f16[] constant(0)
+    reduce = f16[64]{0} reduce(arg0, constant), dimensions={0,2,3}, to_apply=computation
+  })";
+
+TEST_F(HloVerifierTest, ReduceOperandComputationMismatch) {
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseHloString(kReduceOperandComputationMismatchHlo));
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_FALSE(status.ok());
+  EXPECT_THAT(status.error_message(),
+              HasSubstr("Expected instruction to have shape equal to f32[64]"));
+}
+
+TEST_F(HloVerifierTestAllowMixedPrecision, ReduceOperandComputationMismatch) {
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseHloString(kReduceOperandComputationMismatchHlo));
+  auto status = verifier().Run(module.get()).status();
+  ASSERT_TRUE(status.ok());
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc
index 90904ac00110457bcc3b8974816a7080c4ab89fc..88fc62bd1e2a7830b3f61738a8642308ef4225a7 100644
--- a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc
+++ b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc
@@ -128,9 +128,9 @@ string HumanReadableProfileBuilder::ToString() const {
 
   // Sort ops in decreasing order of cycles, and print them.
   std::vector<OpInfo> sorted_ops(op_infos_);
-  std::sort(
-      sorted_ops.begin(), sorted_ops.end(),
-      [](const OpInfo& a, const OpInfo& b) { return a.cycles > b.cycles; });
+  absl::c_sort(sorted_ops, [](const OpInfo& a, const OpInfo& b) {
+    return a.cycles > b.cycles;
+  });
   for (const auto& op : sorted_ops) {
     print_op(op);
   }
diff --git a/tensorflow/compiler/xla/service/indexed_array_analysis.cc b/tensorflow/compiler/xla/service/indexed_array_analysis.cc
index 1ebb3319779c00fd4afe90606bf336e16349429d..c5d32a4b9ad8c708ec0870173fa72320238e8464 100644
--- a/tensorflow/compiler/xla/service/indexed_array_analysis.cc
+++ b/tensorflow/compiler/xla/service/indexed_array_analysis.cc
@@ -27,7 +27,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/util.h"
 
 namespace xla {
-namespace gtl = ::tensorflow::gtl;
 
 namespace {
 using Analysis = IndexedArrayAnalysis;
@@ -103,7 +102,7 @@ Status IndexedArrayAnalysis::TraverseAndPopulateCache(
 
   do {
     const HloInstruction* instr = stack.back();
-    if (cache_.count(instr)) {
+    if (cache_.contains(instr)) {
       stack.pop_back();
       continue;
     }
@@ -111,9 +110,9 @@ Status IndexedArrayAnalysis::TraverseAndPopulateCache(
     switch (FindOrDie(dfs_state_map, instr)) {
       case kDiscovered: {
         for (const HloInstruction* operand : instr->operands()) {
-          if (!cache_.count(operand)) {
+          if (!cache_.contains(operand)) {
             stack.push_back(operand);
-            CHECK(!dfs_state_map.count(operand) ||
+            CHECK(!dfs_state_map.contains(operand) ||
                   dfs_state_map[operand] == kDiscovered);
             dfs_state_map[operand] = kDiscovered;
           }
@@ -1002,7 +1001,7 @@ bool CanFoldDotIntoIndexedArray(
     absl::Span<const int64> contracting_dims,
     absl::Span<const int64> batch_dims) {
   absl::optional<int64> non_contracting_non_batch_dim =
-      GetOnlyNonContractingNonBatchDim(ShapeUtil::Rank(indexed_array->shape()),
+      GetOnlyNonContractingNonBatchDim(indexed_array->shape().rank(),
                                        contracting_dims, batch_dims);
   if (!non_contracting_non_batch_dim.has_value()) {
     VLOG(3) << tag << ": multiple or no non-contracting non-batch dimensions";
@@ -1015,7 +1014,7 @@ bool CanFoldDotIntoIndexedArray(
     return false;
   }
 
-  int64 indexed_array_rank = ShapeUtil::Rank(indexed_array->shape());
+  int64 indexed_array_rank = indexed_array->shape().rank();
   if (indexed_array->source_dim() < (indexed_array_rank - 2)) {
     // This restriction can be lifted by inserting reshape nodes.
     VLOG(3) << tag
@@ -1043,7 +1042,7 @@ IndexedArrayAnalysis::ComputeArrayForDotWithIndexedLhs(
     return nullptr;
   }
 
-  int64 lhs_rank = ShapeUtil::Rank(lhs->shape());
+  int64 lhs_rank = lhs->shape().rank();
   DotDimensionNumbers new_dim_numbers = dim_numbers;
   new_dim_numbers.set_lhs_contracting_dimensions(
       0, lhs->source_dim() == (lhs_rank - 1) ? (lhs_rank - 2) : (lhs_rank - 1));
@@ -1078,7 +1077,7 @@ IndexedArrayAnalysis::ComputeArrayForDotWithIndexedRhs(
     return nullptr;
   }
 
-  int64 rhs_rank = ShapeUtil::Rank(rhs->shape());
+  int64 rhs_rank = rhs->shape().rank();
 
   DotDimensionNumbers new_dim_numbers = dim_numbers;
   new_dim_numbers.set_rhs_contracting_dimensions(
diff --git a/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc b/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc
index 98246d5403e4aebc2f4d81e52145706355ddd9a9..62107b5a88d4e37552fa5a6384700a9291a9c655 100644
--- a/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc
@@ -13,9 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include <ctype.h>
-
 #include "tensorflow/compiler/xla/service/indexed_array_analysis.h"
+#include "absl/strings/ascii.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 
@@ -43,7 +42,7 @@ class IndexedArrayAnalysisTest : public HloTestBase {
     string result;
 
     for (char c : text) {
-      if (!isspace(c)) {
+      if (!absl::ascii_isspace(c)) {
         result.push_back(c);
       } else if (!result.empty() && result.back() != ' ') {
         result.push_back(' ');
@@ -99,7 +98,7 @@ TEST_F(IndexedArrayAnalysisTest, SimpleOneToOneConstantGather) {
 HloModule SimpleGather
 
 ENTRY main {
-  operand = s32[3,3] constant(s32[3,3]{{1,2,3},{1,2,3},{1,2,3}})
+  operand = s32[3,3] constant({{1,2,3},{1,2,3},{1,2,3}})
   indices = s32[5] parameter(0)
   ROOT gather = s32[5,3] gather(operand, indices),
       offset_dims={1},
@@ -119,7 +118,7 @@ TEST_F(IndexedArrayAnalysisTest, GatherIsNotScalarIndexed0) {
 HloModule SimpleGather
 
 ENTRY main {
-  operand = s32[3,3] constant(s32[3,3]{{1,2,3},{1,2,3},{1,2,3}})
+  operand = s32[3,3] constant({{1,2,3},{1,2,3},{1,2,3}})
   indices = s32[5,2] parameter(0)
   ROOT gather = s32[5] gather(operand, indices),
       offset_dims={},
@@ -195,7 +194,7 @@ TEST_F(IndexedArrayAnalysisTest, GatherOfGather_OneToOne) {
 HloModule SimpleGather
 
 ENTRY main {
-  operand = s32[3,3] constant(s32[3,3]{{1,2,3},{1,2,3},{1,2,3}})
+  operand = s32[3,3] constant({{1,2,3},{1,2,3},{1,2,3}})
   indices_a = s32[5] parameter(0)
   indices_b = s32[2] parameter(1)
   gather_a = s32[5,3] gather(operand, indices_a),
@@ -309,7 +308,7 @@ TEST_F(IndexedArrayAnalysisTest, ReshapeOfGather0) {
 HloModule ReshapeOfGather
 
 ENTRY main {
-  operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{1,2,3,4},{1,2,3,4}})
+  operand = s32[3,4] constant({{1,2,3,4},{1,2,3,4},{1,2,3,4}})
   indices = s32[5] parameter(0)
   gather = s32[5,4] gather(operand, indices),
       offset_dims={1},
@@ -330,7 +329,7 @@ TEST_F(IndexedArrayAnalysisTest, ReshapeOfGather1) {
 HloModule ReshapeOfGather
 
 ENTRY main {
-  operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{1,2,3,4},{1,2,3,4}})
+  operand = s32[3,4] constant({{1,2,3,4},{1,2,3,4},{1,2,3,4}})
   indices = s32[5,7] parameter(0)
   gather = s32[5,4,7] gather(operand, indices),
       offset_dims={1},
@@ -352,7 +351,7 @@ TEST_F(IndexedArrayAnalysisTest, ReshapeOfGather2) {
 HloModule ReshapeOfGather
 
 ENTRY main {
-  operand = s32[3,2,6] constant(s32[3,2,6]{
+  operand = s32[3,2,6] constant({
       {{1,2,3,4,5,6},{1,2,3,4,5,6}},
       {{1,2,3,4,5,6},{1,2,3,4,5,6}},
       {{1,2,3,4,5,6},{1,2,3,4,5,6}}})
@@ -377,7 +376,7 @@ TEST_F(IndexedArrayAnalysisTest, ReshapeOfGather3) {
 HloModule ReshapeOfGather
 
 ENTRY main {
-  operand = s32[2,6] constant(s32[2,6]{
+  operand = s32[2,6] constant({
       {1,2,3,4,5,6},{1,2,3,4,5,6}})
   indices = s32[1] parameter(0)
   gather = s32[1,6] gather(operand, indices),
@@ -405,7 +404,7 @@ TEST_F(IndexedArrayAnalysisTest, ReshapeOfGather4) {
 HloModule ReshapeOfGather
 
 ENTRY main {
-  operand = s32[2,3]{1,0} constant(s32[2,3] { { 1, 2, 3 }, { 1, 2, 3 } })
+  operand = s32[2,3]{1,0} constant({ { 1, 2, 3 }, { 1, 2, 3 } })
 
   i.0 = s64[1,3]{1,0} parameter(0)
   g.0 = s32[1,3,3]{2,1,0} gather(operand, i.0), offset_dims={2},
@@ -438,7 +437,7 @@ TEST_F(IndexedArrayAnalysisTest, ReshapeOfGather5) {
 HloModule ReshapeOfGather
 
 ENTRY main {
-  operand = s32[1,6] constant(s32[1,6]{{1,2,3,4,5,6}})
+  operand = s32[1,6] constant({{1,2,3,4,5,6}})
   indices = s32[1] parameter(0)
   gather = s32[1,6] gather(operand, indices),
       offset_dims={1},
@@ -465,7 +464,7 @@ TEST_F(IndexedArrayAnalysisTest, ReshapeOfGather6) {
 HloModule ReshapeOfGather
 
 ENTRY main {
-  operand = s32[1,2,6] constant(s32[1,2,6]{{
+  operand = s32[1,2,6] constant({{
       {1,2,3,4,5,6},{1,2,3,4,5,6}}})
   indices = s32[1] parameter(0)
   gather = s32[1,1,6] gather(operand, indices),
@@ -496,7 +495,7 @@ TEST_F(IndexedArrayAnalysisTest, ReshapeOfGather7) {
 HloModule ReshapeOfGather
 
 ENTRY main {
-  operand = s32[2,6] constant(s32[2,6]{
+  operand = s32[2,6] constant({
       {1,2,3,4,5,6},{1,2,3,4,5,6}})
   indices = s32[1,5] parameter(0)
   gather = s32[1,5,6] gather(operand, indices),
@@ -527,7 +526,7 @@ TEST_F(IndexedArrayAnalysisTest, ReshapeOfGatherNoFold0) {
 HloModule ReshapeOfGather
 
 ENTRY main {
-  operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{1,2,3,4},{1,2,3,4}})
+  operand = s32[3,4] constant({{1,2,3,4},{1,2,3,4},{1,2,3,4}})
   indices = s32[5,6] parameter(0)
   gather = s32[5,4,6] gather(operand, indices),
       offset_dims={1},
@@ -556,7 +555,7 @@ TEST_F(IndexedArrayAnalysisTest, ReshapeOfGatherNoFold1) {
 HloModule ReshapeOfGather
 
 ENTRY main {
-  operand = s32[3,5,2] constant(s32[3,5,2]{
+  operand = s32[3,5,2] constant({
       {{1,2},{3,4},{5,6},{7,8},{9,10}},
       {{1,2},{3,4},{5,6},{7,8},{9,10}},
       {{1,2},{3,4},{5,6},{7,8},{9,10}}})
@@ -588,7 +587,7 @@ TEST_F(IndexedArrayAnalysisTest, ReshapeOfGatherNoFold2) {
 HloModule ReshapeOfGather
 
 ENTRY main {
-  operand = s32[3,4,1] constant(s32[3,4,1]{
+  operand = s32[3,4,1] constant({
     {{1},{2},{3},{4}},
     {{1},{2},{3},{4}},
     {{1},{2},{3},{4}}})
@@ -620,7 +619,7 @@ TEST_F(IndexedArrayAnalysisTest, UnaryOpOfGather) {
 HloModule UnaryOpOfGather
 
 ENTRY main {
-  operand = f32[3,4] constant(f32[3,4]{{1,2,3,4},{1,3,2,4},{4,3,2,1}})
+  operand = f32[3,4] constant({{1,2,3,4},{1,3,2,4},{4,3,2,1}})
   indices = s32[5] parameter(0)
   gather = f32[5,4] gather(operand, indices),
       offset_dims={1},
@@ -645,7 +644,7 @@ TEST_F(IndexedArrayAnalysisTest, AddBroadcastedScalarWithGather) {
 HloModule AddBroadcastedScalarWithGather
 
 ENTRY main {
-  gather_operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{1,3,2,4},{4,3,2,1}})
+  gather_operand = s32[3,4] constant({{1,2,3,4},{1,3,2,4},{4,3,2,1}})
   constant = s32[] constant(5)
   constant_broadcasted = s32[5,4] broadcast(constant), dimensions={}
   indices = s32[5] parameter(0)
@@ -673,7 +672,7 @@ TEST_F(IndexedArrayAnalysisTest,
 HloModule SubtractBroadcastedScalarWithGather
 
 ENTRY main {
-  gather_operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{1,3,2,4},{4,3,2,1}})
+  gather_operand = s32[3,4] constant({{1,2,3,4},{1,3,2,4},{4,3,2,1}})
   constant = s32[] constant(5)
   constant_broadcasted = s32[5,4] broadcast(constant), dimensions={}
   indices = s32[5] parameter(0)
@@ -701,7 +700,7 @@ TEST_F(IndexedArrayAnalysisTest,
 HloModule SubtractBroadcastedScalarWithGather
 
 ENTRY main {
-  gather_operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{1,3,2,4},{4,3,2,1}})
+  gather_operand = s32[3,4] constant({{1,2,3,4},{1,3,2,4},{4,3,2,1}})
   constant = s32[] constant(5)
   constant_broadcasted = s32[5,4] broadcast(constant), dimensions={}
   indices = s32[5] parameter(0)
@@ -728,7 +727,7 @@ TEST_F(IndexedArrayAnalysisTest, AddBroadcastedVectorWithGather) {
 HloModule AddBroadcastedVectorWithGather
 
 ENTRY main {
-  gather_operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{1,3,2,4},{4,3,2,1}})
+  gather_operand = s32[3,4] constant({{1,2,3,4},{1,3,2,4},{4,3,2,1}})
   constant_vect = s32[4] constant({10,11,12,13})
   constant_broadcasted = s32[5,4] broadcast(constant_vect), dimensions={1}
   indices = s32[5] parameter(0)
@@ -755,7 +754,7 @@ TEST_F(IndexedArrayAnalysisTest, AddBroadcastedVectorWithGather_Negative) {
 HloModule AddBroadcastedVectorWithGather
 
 ENTRY main {
-  gather_operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{1,3,2,4},{4,3,2,1}})
+  gather_operand = s32[3,4] constant({{1,2,3,4},{1,3,2,4},{4,3,2,1}})
   constant_vect = s32[5] constant({10,11,12,13,14})
   constant_broadcasted = s32[5,4] broadcast(constant_vect), dimensions={0}
   indices = s32[5] parameter(0)
@@ -804,8 +803,8 @@ TEST_F(IndexedArrayAnalysisTest, DotOpBasic_0) {
 HloModule DotOp
 
 ENTRY main {
-  gather_operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{5,6,7,8},{9,10,11,12}})
-  dot_rhs_constant = s32[4,3] constant(s32[4,3]{{1,2,3},{4,5,6},{7,8,9},{10,11,12}})
+  gather_operand = s32[3,4] constant({{1,2,3,4},{5,6,7,8},{9,10,11,12}})
+  dot_rhs_constant = s32[4,3] constant({{1,2,3},{4,5,6},{7,8,9},{10,11,12}})
   indices = s32[5] parameter(0)
   dot_lhs = s32[5,4] gather(gather_operand, indices),
       offset_dims={1},
@@ -831,8 +830,8 @@ TEST_F(IndexedArrayAnalysisTest, DotOpBasic_1) {
 HloModule DotOp
 
 ENTRY main {
-  gather_operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{5,6,7,8},{9,10,11,12}})
-  dot_rhs_constant = s32[3,3] constant(s32[3,3]{{1,2,3},{4,5,6},{7,8,9}})
+  gather_operand = s32[3,4] constant({{1,2,3,4},{5,6,7,8},{9,10,11,12}})
+  dot_rhs_constant = s32[3,3] constant({{1,2,3},{4,5,6},{7,8,9}})
   indices = s32[5] parameter(0)
   dot_lhs = s32[3,5] gather(gather_operand, indices),
       offset_dims={0},
@@ -859,8 +858,8 @@ TEST_F(IndexedArrayAnalysisTest, DotOpBasic_2) {
 HloModule DotOp
 
 ENTRY main {
-  gather_operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{5,6,7,8},{9,10,11,12}})
-  dot_lhs_constant = s32[4,3] constant(s32[4,3]{{1,2,3},{4,5,6},{7,8,9},{10,11,12}})
+  gather_operand = s32[3,4] constant({{1,2,3,4},{5,6,7,8},{9,10,11,12}})
+  dot_lhs_constant = s32[4,3] constant({{1,2,3},{4,5,6},{7,8,9},{10,11,12}})
   indices = s32[5] parameter(0)
   dot_rhs = s32[3,5] gather(gather_operand, indices),
       offset_dims={0},
@@ -888,8 +887,8 @@ TEST_F(IndexedArrayAnalysisTest, DotOpBasic_3) {
 HloModule DotOp
 
 ENTRY main {
-  gather_operand = s32[4,3] constant(s32[4,3]{{1,2,3},{4,5,6},{7,8,9},{10,11,12}})
-  dot_lhs_constant = s32[4,3] constant(s32[4,3]{{1,2,3},{4,5,6},{7,8,9},{10,11,12}})
+  gather_operand = s32[4,3] constant({{1,2,3},{4,5,6},{7,8,9},{10,11,12}})
+  dot_lhs_constant = s32[4,3] constant({{1,2,3},{4,5,6},{7,8,9},{10,11,12}})
   indices = s32[5] parameter(0)
   dot_rhs = s32[5,3] gather(gather_operand, indices),
       offset_dims={1},
@@ -917,8 +916,8 @@ TEST_F(IndexedArrayAnalysisTest, DotOpWithBatch) {
 HloModule DotOp
 
 ENTRY main {
-  gather_operand = s32[2,3,2] constant(s32[2,3,2]{{{1,2},{3,4},{5,6}},{{7,8},{9,10},{11,12}}})
-  dot_lhs_constant = s32[2,2,3] constant(s32[2,2,3]{{{1,2,3},{4,5,6}},{{7,8,9},{10,11,12}}})
+  gather_operand = s32[2,3,2] constant({{{1,2},{3,4},{5,6}},{{7,8},{9,10},{11,12}}})
+  dot_lhs_constant = s32[2,2,3] constant({{{1,2,3},{4,5,6}},{{7,8,9},{10,11,12}}})
   indices = s32[4] parameter(0)
   dot_rhs = s32[2,3,4] gather(gather_operand, indices),
       offset_dims={0,1},
@@ -948,8 +947,8 @@ TEST_F(IndexedArrayAnalysisTest, DotOpNegative) {
 HloModule DotOp
 
 ENTRY main {
-  gather_operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{5,6,7,8},{9,10,11,12}})
-  dot_rhs_constant = s32[2,3] constant(s32[2,3]{{1,2,3},{4,5,6}})
+  gather_operand = s32[3,4] constant({{1,2,3,4},{5,6,7,8},{9,10,11,12}})
+  dot_rhs_constant = s32[2,3] constant({{1,2,3},{4,5,6}})
   indices = s32[2] parameter(0)
   dot_lhs = s32[3,2] gather(gather_operand, indices),
       offset_dims={0},
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc
index 7559ed1bab84b21a4d51bc38db999900befcfad7..f5770eee2250511c0e29e434f224b4ff347142ba 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion.cc
@@ -23,6 +23,7 @@ limitations under the License.
 
 #include "absl/algorithm/container.h"
 #include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/service/fusion_queue.h"
@@ -94,6 +95,7 @@ bool IsAlwaysDuplicable(const HloInstruction& instruction) {
     case HloOpcode::kPad:
     case HloOpcode::kReal:
     case HloOpcode::kReducePrecision:
+    case HloOpcode::kReplicaId:
     case HloOpcode::kReshape:
     case HloOpcode::kReverse:
     case HloOpcode::kRoundNearestAfz:
@@ -126,7 +128,7 @@ bool IsAlwaysDuplicable(const HloInstruction& instruction) {
     case HloOpcode::kCall:
     case HloOpcode::kConditional:
     case HloOpcode::kConvolution:
-    case HloOpcode::kCrossReplicaSum:
+    case HloOpcode::kAllReduce:
     case HloOpcode::kAllToAll:
     case HloOpcode::kCollectivePermute:
     case HloOpcode::kCustomCall:
@@ -149,13 +151,16 @@ bool IsAlwaysDuplicable(const HloInstruction& instruction) {
     case HloOpcode::kReduceWindow:
     case HloOpcode::kRemainder:
     case HloOpcode::kRng:
+    case HloOpcode::kRsqrt:
     case HloOpcode::kScatter:
     case HloOpcode::kSelectAndScatter:
     case HloOpcode::kSend:
     case HloOpcode::kSendDone:
     case HloOpcode::kSort:
+    case HloOpcode::kSqrt:
     case HloOpcode::kTanh:
     case HloOpcode::kTrace:
+    case HloOpcode::kTriangularSolve:
     case HloOpcode::kWhile:
     case HloOpcode::kGetDimensionSize:
       return true;
@@ -173,23 +178,22 @@ bool InstructionFusion::EffectivelyAtMostUnary(HloInstruction* hlo) {
   ShapeUtil::ForEachSubshape(
       hlo->shape(),
       [&output_rank](const Shape& subshape, const ShapeIndex& shape_index) {
-        if (ShapeUtil::IsArray(subshape)) {
+        if (subshape.IsArray()) {
           output_rank = std::max(output_rank, ShapeUtil::TrueRank(subshape));
         }
       });
-  return std::count_if(hlo->operands().begin(), hlo->operands().end(),
-                       [output_rank](HloInstruction* operand) {
-                         if (operand->opcode() == HloOpcode::kBroadcast ||
-                             operand->opcode() == HloOpcode::kIota) {
-                           return false;
-                         }
-                         if (operand->opcode() == HloOpcode::kConstant &&
-                             ShapeUtil::IsEffectiveScalar(operand->shape())) {
-                           return false;
-                         }
-                         return ShapeUtil::TrueRank(operand->shape()) >=
-                                output_rank;
-                       }) <= 1;
+  return absl::c_count_if(
+             hlo->operands(), [output_rank](HloInstruction* operand) {
+               if (operand->opcode() == HloOpcode::kBroadcast ||
+                   operand->opcode() == HloOpcode::kIota) {
+                 return false;
+               }
+               if (operand->opcode() == HloOpcode::kConstant &&
+                   ShapeUtil::IsEffectiveScalar(operand->shape())) {
+                 return false;
+               }
+               return ShapeUtil::TrueRank(operand->shape()) >= output_rank;
+             }) <= 1;
 }
 
 bool InstructionFusion::CanFuseOnAllPaths(
@@ -273,7 +277,7 @@ InstructionFusion::ComputeGloballyUnfusible(
         ShapeUtil::ForEachSubshape(
             shape,
             [&size](const Shape& subshape, const ShapeIndex& shape_index) {
-              if (ShapeUtil::IsArray(subshape)) {
+              if (subshape.IsArray()) {
                 size += ShapeUtil::ElementsIn(subshape);
               }
             });
@@ -408,9 +412,8 @@ class ReversePostOrderFusionQueue : public FusionQueue {
       }
       sorted_operand_numbers.push_back(i);
     }
-    std::sort(
-        sorted_operand_numbers.begin(), sorted_operand_numbers.end(),
-        [&](int64 i, int64 j) {
+    absl::c_sort(
+        sorted_operand_numbers, [&](int64 i, int64 j) {
           // Instructions with higher priority in the queue come first.
           return (
               FindOrDie(post_order_index_, instruction->mutable_operand(i)) >
@@ -570,19 +573,42 @@ HloInstruction* InstructionFusion::FuseIntoMultiOutput(
 
 bool InstructionFusion::MultiOutputFusionCreatesCycle(
     HloInstruction* producer, HloInstruction* consumer) {
-  auto is_reachable = [&](const HloInstruction* a, const HloInstruction* b) {
-    // A consumer operand may have been multi-output fused into a parallel
-    // consumer and thus be missing from the original reachability map.
-    if (!reachability_->IsPresent(a) || !reachability_->IsPresent(b)) {
-      reachability_ = HloReachabilityMap::Build(consumer->parent());
+  absl::flat_hash_set<int> operands;
+  for (const HloInstruction* operand : consumer->operands()) {
+    if (operand == producer) {
+      continue;
+    }
+
+    // If the reachability map already contains the producer and the operand of
+    // the consumer, and the producer can reach the operand, then we know for
+    // sure MultiOutputFusion would create a cycle. If not, we need to do a DFS
+    // traversal of the computation to verify that this multioutput fusion would
+    // not create a cycle.
+    if (reachability_->IsPresent(producer) &&
+        reachability_->IsPresent(operand) &&
+        reachability_->IsReachable(producer, operand)) {
+      return true;
     }
-    return reachability_->IsReachable(a, b);
-  };
-  return absl::c_any_of(consumer->operands(),
-                        [&](const HloInstruction* consumer_operand) {
-                          return consumer_operand != producer &&
-                                 is_reachable(producer, consumer_operand);
-                        });
+    operands.insert(operand->unique_id());
+  }
+
+  // Do a DFS on the producer to see if any of the other consumer operands are
+  // reachable in the current state of the graph.
+  std::vector<HloInstruction*> worklist = producer->users();
+  absl::flat_hash_set<int> visits;
+  while (!worklist.empty()) {
+    const HloInstruction* user = worklist.back();
+    worklist.pop_back();
+    if (operands.count(user->unique_id()) != 0) {
+      return true;
+    }
+    if (visits.count(user->unique_id()) == 0) {
+      visits.insert(user->unique_id());
+      worklist.insert(worklist.end(), user->users().begin(),
+                      user->users().end());
+    }
+  }
+  return false;
 }
 
 bool InstructionFusion::ShouldFuse(HloInstruction* consumer,
diff --git a/tensorflow/compiler/xla/service/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/instruction_fusion_test.cc
index 58b7135cea7419f13d60ed510ecf7a88126aee48..611cfd404d7622f561f0acc86fc9b05e16eea22e 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion_test.cc
@@ -259,8 +259,8 @@ TEST_F(InstructionFusionTest, AvoidDuplicationIfNotAllFusibleRecursively) {
     add = f32[4,3]{1,0} add(p0, p0)
     abs1 = f32[4,3]{1,0} abs(add)
     log = f32[4,3]{1,0} log(abs1)
-    token = token[] after-all()
-    send = f32[4,3]{1,0} send(log, token), channel_id=0
+    token0 = token[] after-all()
+    send = f32[4,3]{1,0} send(log, token0), channel_id=0
     abs2 = f32[4,3]{1,0} abs(log)
     ROOT root = f32[4,3]{1,0} subtract(abs2, add)
   })")
@@ -290,8 +290,8 @@ TEST_F(InstructionFusionTest, AvoidDuplicationIfNotAllFusibleRecursively) {
     p0 = f32[4,3]{1,0} parameter(0)
     add1 = f32[4,3]{1,0} add(p0, p0)
     log = f32[4,3]{1,0} log(p0)
-    token = token[] after-all()
-    send = f32[4,3]{1,0} send(log, token), channel_id=0
+    token0 = token[] after-all()
+    send = f32[4,3]{1,0} send(log, token0), channel_id=0
     add2 = f32[4,3]{1,0} add(log, add1)
     ROOT root = f32[4,3]{1,0} subtract(add1, add2)
   })")
@@ -324,8 +324,8 @@ TEST_F(InstructionFusionTest, AvoidDuplicationIfNotAllFusibleRecursively) {
     add1 = f32[4,3]{1,0} add(p0, p0)
     add2 = f32[4,3]{1,0} add(add1, add1)
     log = f32[4,3]{1,0} log(add2)
-    token = token[] after-all()
-    send = f32[4,3]{1,0} send(log, token), channel_id=0
+    token0 = token[] after-all()
+    send = f32[4,3]{1,0} send(log, token0), channel_id=0
     sub1 = f32[4,3]{1,0} subtract(log, add2)
     sub2 = f32[4,3]{1,0} subtract(add2, add1)
     ROOT root = (f32[4,3]{1,0}, f32[4,3]{1,0}) tuple(sub1, sub2)
diff --git a/tensorflow/compiler/xla/service/interpreter/BUILD b/tensorflow/compiler/xla/service/interpreter/BUILD
index a981d94a999e3d322986bc2bfd56a5b0b5d175fc..8cd936268994c2a25c2c0debe0a003d1d05cbd0b 100644
--- a/tensorflow/compiler/xla/service/interpreter/BUILD
+++ b/tensorflow/compiler/xla/service/interpreter/BUILD
@@ -1,12 +1,12 @@
-licenses(["notice"])  # Apache 2.0
-
-package(default_visibility = ["//visibility:public"])
-
 load(
     "//tensorflow/core:platform/default/build_config_root.bzl",
     "if_static",
 )
 
+licenses(["notice"])  # Apache 2.0
+
+package(default_visibility = ["//visibility:public"])
+
 cc_library(
     name = "interpreter_transfer_manager",
     srcs = ["interpreter_transfer_manager.cc"],
@@ -34,6 +34,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:algebraic_simplifier",
         "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/compiler/xla/service:computation_placer",
+        "//tensorflow/compiler/xla/service:dynamic_index_splitter",
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service:flatten_call_graph",
         "//tensorflow/compiler/xla/service:hlo",
@@ -47,8 +48,11 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo_subcomputation_unification",
         "//tensorflow/compiler/xla/service:layout_assignment",
         "//tensorflow/compiler/xla/service:map_inliner",
+        "//tensorflow/compiler/xla/service:reduce_precision_insertion",
         "//tensorflow/compiler/xla/service:reshape_mover",
+        "//tensorflow/compiler/xla/service:triangular_solve_expander",
         "//tensorflow/compiler/xla/service:while_loop_simplifier",
+        "//tensorflow/compiler/xla/service/cpu:custom_call_target_registry",
         "//tensorflow/core:lib",
         "//tensorflow/stream_executor",
         "@com_google_absl//absl/memory",
@@ -115,6 +119,8 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_headers_lib",
+        "//tensorflow/stream_executor/host:host_stream",
+        "//tensorflow/stream_executor/host:host_timer",
         "@com_google_absl//absl/types:span",
     ],
 )
diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.cc b/tensorflow/compiler/xla/service/interpreter/compiler.cc
index 3a5177c418e3af8253df228a51f2fc0901d10041..792773c676984aa280c1b20cb7fd0fc7c9425f6c 100644
--- a/tensorflow/compiler/xla/service/interpreter/compiler.cc
+++ b/tensorflow/compiler/xla/service/interpreter/compiler.cc
@@ -21,6 +21,8 @@ limitations under the License.
 #include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/service/algebraic_simplifier.h"
 #include "tensorflow/compiler/xla/service/computation_placer.h"
+#include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h"
+#include "tensorflow/compiler/xla/service/dynamic_index_splitter.h"
 #include "tensorflow/compiler/xla/service/flatten_call_graph.h"
 #include "tensorflow/compiler/xla/service/hlo_constant_folding.h"
 #include "tensorflow/compiler/xla/service/hlo_cse.h"
@@ -31,7 +33,9 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/interpreter/executable.h"
 #include "tensorflow/compiler/xla/service/layout_assignment.h"
 #include "tensorflow/compiler/xla/service/map_inliner.h"
+#include "tensorflow/compiler/xla/service/reduce_precision_insertion.h"
 #include "tensorflow/compiler/xla/service/reshape_mover.h"
+#include "tensorflow/compiler/xla/service/triangular_solve_expander.h"
 #include "tensorflow/compiler/xla/service/while_loop_simplifier.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -40,12 +44,51 @@ limitations under the License.
 namespace xla {
 namespace interpreter {
 
+namespace {
+
+// Handles custom_call ops during evaluation by routing them through the global
+// CPU registry used by other CPU-based backends.
+StatusOr<Literal> HandleEvaluatorCustomCall(
+    HloInstruction* custom_call, absl::Span<const Literal*> operands) {
+  // Find the target C function in the global registry.
+  auto* registry = xla::cpu::CustomCallTargetRegistry::Global();
+  void* target_fn = registry->Lookup(custom_call->custom_call_target());
+  if (!target_fn) {
+    return NotFound("Custom call target '%s' was not registered",
+                    custom_call->custom_call_target());
+  }
+
+  // Populate pointers to operand and output literal data.
+  std::vector<const void*> operand_data;
+  operand_data.reserve(operands.size());
+  for (const auto* literal : operands) {
+    operand_data.push_back(literal->untyped_data());
+  }
+  auto output = Literal::CreateFromShape(custom_call->shape());
+  void* output_data = output.untyped_data();
+
+  // Call the target function matching the C ABI used by the CPU backends.
+  auto* typed_fn = reinterpret_cast<void (*)(void*, const void**)>(target_fn);
+  (*typed_fn)(output_data, operand_data.data());
+
+  return std::move(output);
+}
+
+}  // namespace
+
 Status InterpreterCompiler::RunHloOptimization(HloModule* hlo_module) {
   HloPassPipeline pipeline("Interpreter");
 
+  pipeline.AddPass<DynamicIndexSplitter>();
+  pipeline.AddPass<TriangularSolveExpander>();
   pipeline.AddPass<LayoutAssignment>(
       hlo_module->mutable_entry_computation_layout(),
       LayoutAssignment::InstructionCanChangeLayout);
+
+  ReducePrecisionInsertion::AddPasses(
+      &pipeline, hlo_module->config().debug_options(),
+      ReducePrecisionInsertion::PassTiming::BEFORE_OPTIMIZATION);
+
   return pipeline.Run(hlo_module).status();
 }
 
@@ -75,10 +118,15 @@ StatusOr<std::unique_ptr<Executable>> InterpreterCompiler::RunBackend(
   // In this case we are using an HloEvaluator at execution time, so we don't
   // need to compile anything
 
+  auto evaluator = absl::make_unique<HloEvaluator>();
+  evaluator->set_use_fast_path(
+      hlo_module->config().debug_options().xla_hlo_evaluator_use_fast_path());
+  evaluator->set_custom_call_handler(HandleEvaluatorCustomCall);
+
   // Create executable from only the Hlo module.
   std::unique_ptr<Executable> executable =
-      absl::make_unique<InterpreterExecutable>(
-          std::move(hlo_module), absl::make_unique<HloEvaluator>());
+      absl::make_unique<InterpreterExecutable>(std::move(hlo_module),
+                                               std::move(evaluator));
 
   return std::move(executable);
 }
diff --git a/tensorflow/compiler/xla/service/interpreter/executable.cc b/tensorflow/compiler/xla/service/interpreter/executable.cc
index de9204011ce5ba8a9fc2871c6bd7120b6ed371b5..7a6ebdef708bcc3a92fbd8618db0c42c35e6ce8b 100644
--- a/tensorflow/compiler/xla/service/interpreter/executable.cc
+++ b/tensorflow/compiler/xla/service/interpreter/executable.cc
@@ -68,6 +68,18 @@ StatusOr<ScopedShapedBuffer> InterpreterExecutable::ExecuteOnStream(
         "Mismatch between argument count and graph parameter count.");
   }
 
+  // Check that the args have the right shape.
+  for (int64 i = 0; i < computation->num_parameters(); ++i) {
+    const auto& expected_shape = computation->parameter_instruction(i)->shape();
+    const auto& actual_shape = arguments[i]->on_device_shape();
+    if (!ShapeUtil::Equal(expected_shape, actual_shape)) {
+      return InvalidArgument(
+          "Shape mismatch on parameter %d.  Expected %s, but was %s.", i,
+          ShapeUtil::HumanString(expected_shape),
+          ShapeUtil::HumanString(actual_shape));
+    }
+  }
+
   TF_ASSIGN_OR_RETURN(TransferManager * transfer_manager,
                       TransferManager::GetForPlatform(platform));
 
@@ -86,8 +98,8 @@ StatusOr<ScopedShapedBuffer> InterpreterExecutable::ExecuteOnStream(
   {
     tensorflow::mutex_lock lock(evaluator_lock_);
     evaluator_->ResetVisitStates();
-    TF_ASSIGN_OR_RETURN(result_literal, evaluator_->Evaluate<Literal>(
-                                            *computation, arg_literals));
+    TF_ASSIGN_OR_RETURN(result_literal,
+                        evaluator_->Evaluate(*computation, arg_literals));
   }
 
   // Transform the result literal back into a ShapedBuffer.
@@ -117,7 +129,7 @@ StatusOr<ScopedShapedBuffer> InterpreterExecutable::ExecuteAsyncOnStream(
 }
 
 /*static*/ int64 InterpreterExecutable::ShapeSizeBytes(const Shape& shape) {
-  if (ShapeUtil::IsOpaque(shape)) {
+  if (shape.IsOpaque()) {
     return sizeof(void*);
   }
   return ShapeUtil::ByteSizeOf(shape, sizeof(void*));
diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc
index eddef850cf5250b85b564c1e6c92d1cc8ecd1a43..aa791ea195e7a88fd8ad28fd0b60c88dea8a6928 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment.cc
@@ -147,12 +147,9 @@ bool LayoutConstraints::OperandBufferForwarded(
   PointsToSet::BufferSet* output_buffers = GetBufferSet(instruction);
   PointsToSet::BufferSet* operand_buffers =
       GetBufferSet(instruction->operand(operand_no));
-  for (const LogicalBuffer* output_buffer : *output_buffers) {
-    if (operand_buffers->count(output_buffer) > 0) {
-      return true;
-    }
-  }
-  return false;
+  return absl::c_any_of(*output_buffers, [&](const LogicalBuffer* b) {
+    return operand_buffers->count(b) > 0;
+  });
 }
 
 Status LayoutConstraints::SetBufferLayout(const Layout& layout,
@@ -256,7 +253,7 @@ Status LayoutConstraints::SetArrayOperandLayout(
     const Layout& layout, const HloInstruction* instruction, int64 operand_no,
     bool mandatory, bool dfs) {
   const HloInstruction* operand = instruction->operand(operand_no);
-  TF_RET_CHECK(ShapeUtil::IsArray(operand->shape()));
+  TF_RET_CHECK(operand->shape().IsArray());
   Shape shape(operand->shape());
   *shape.mutable_layout() = layout;
   TF_RETURN_IF_ERROR(LayoutUtil::ValidateLayoutInShape(shape));
@@ -314,7 +311,7 @@ Status LayoutConstraints::SetInstructionLayout(
         CHECK_EQ(1, buffers.size());
         CHECK_EQ(buffers[0]->instruction(), instruction);
 
-        if (ShapeUtil::IsArray(subshape)) {
+        if (subshape.IsArray()) {
           return SetBufferLayout(subshape.layout(), *buffers[0], mandatory);
         } else {
           return Status::OK();
@@ -406,7 +403,7 @@ Status LayoutAssignment::BuildHostChannelConstraints(
         instruction->opcode() == HloOpcode::kRecv) {
       const Shape& data_shape =
           ShapeUtil::GetTupleElementShape(send_recv_instr->shape(), 0);
-      TF_RET_CHECK(ShapeUtil::IsArray(data_shape));
+      TF_RET_CHECK(data_shape.IsArray());
       TF_RET_CHECK(LayoutUtil::HasLayout(data_shape));
       const Layout* prev_layout = host_channel_constraints_.ConstrainChannel(
           send_recv_instr->channel_id(), data_shape.layout());
@@ -489,7 +486,7 @@ Status LayoutAssignment::AddMandatoryConstraints(
       if (instruction->opcode() == HloOpcode::kSend) {
         // TODO(b/68493863): Change to use SetOperandLayout().
         const Shape send_buffer_shape = instruction->operand(0)->shape();
-        TF_RET_CHECK(ShapeUtil::IsArray(send_buffer_shape));
+        TF_RET_CHECK(send_buffer_shape.IsArray());
         Shape new_buffer_shape =
             get_channel_constraints(instruction)
                 ->LayoutShapeForChannel(send_buffer_shape,
@@ -499,7 +496,7 @@ Status LayoutAssignment::AddMandatoryConstraints(
       } else {
         const Shape recv_buffer_shape =
             ShapeUtil::GetTupleElementShape(instruction->shape(), 0);
-        TF_RET_CHECK(ShapeUtil::IsArray(recv_buffer_shape));
+        TF_RET_CHECK(recv_buffer_shape.IsArray());
         TF_ASSIGN_OR_RETURN(
             const LogicalBuffer* buffer,
             constraints->points_to_analysis().GetBufferDefinedAt(instruction,
@@ -520,7 +517,7 @@ Status LayoutAssignment::AddMandatoryConstraints(
       }
       // TODO(b/68493863): Change to use SetOperandLayout().
       const Shape& buffer_shape = instruction->operand(0)->shape();
-      TF_RET_CHECK(ShapeUtil::IsArray(buffer_shape));
+      TF_RET_CHECK(buffer_shape.IsArray());
       Shape new_buffer_shape =
           get_channel_constraints(instruction)
               ->LayoutShapeForChannel(buffer_shape, all_reduce_id);
@@ -780,7 +777,7 @@ StatusOr<HloInstruction*> LayoutAssignment::CreateCopyWithNewLayout(
       << ShapeUtil::HumanString(instruction->shape())
       << " instruction: " << instruction->ToString();
 
-  if (ShapeUtil::IsTuple(instruction->shape())) {
+  if (instruction->shape().IsTuple()) {
     // Copy tuple elements which have differing layouts.
     std::vector<HloInstruction*> element_copies;
     for (int64 i = 0; i < ShapeUtil::TupleElementCount(instruction->shape());
@@ -811,7 +808,7 @@ StatusOr<HloInstruction*> LayoutAssignment::CreateCopyWithNewLayout(
     TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes(
         shape_with_layout, tuple_copy->mutable_shape()));
     return tuple_copy;
-  } else if (ShapeUtil::IsArray(instruction->shape())) {
+  } else if (instruction->shape().IsArray()) {
     HloInstruction* copy =
         instruction->parent()->AddInstruction(HloInstruction::CreateUnary(
             instruction->shape(), HloOpcode::kCopy, instruction));
@@ -988,11 +985,10 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOperandLayoutFromOutputLayout(
     const Layout& output_layout, const HloInstruction* instruction,
     int64 operand_no) {
   const HloInstruction* operand = instruction->operand(operand_no);
-  CHECK(ShapeUtil::IsArray(instruction->shape()));
-  CHECK(ShapeUtil::IsArray(operand->shape()));
+  CHECK(instruction->shape().IsArray());
+  CHECK(operand->shape().IsArray());
   if (!ShapeUtil::IsScalar(operand->shape()) &&
-      ShapeUtil::Rank(operand->shape()) ==
-          ShapeUtil::Rank(instruction->shape()) &&
+      operand->shape().rank() == instruction->shape().rank() &&
       !instruction_can_change_layout_func_(instruction)) {
     // Propagate the result layout to the operand layout if the instruction
     // requires the same layout out for the result and the operand.
@@ -1012,7 +1008,7 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOperandLayoutFromOutputLayout(
     // operations. For similar reasons, if the operand and output have the same
     // rank, try to match the operand's layout to the output.
     if (ShapeUtil::TrueRank(operand->shape()) == 1 &&
-        ShapeUtil::Rank(instruction->shape()) == 1) {
+        instruction->shape().rank() == 1) {
       // Don't assign a layout in case of R1 -> effective R1 reshape.
       return nullptr;
     }
@@ -1026,7 +1022,7 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOperandLayoutFromOutputLayout(
     if (ShapeUtil::ReshapeIsBitcast(operand_shape, output_shape_with_layout)) {
       return absl::make_unique<Layout>(operand_shape.layout());
     }
-    if (ShapeUtil::Rank(operand_shape) == ShapeUtil::Rank(output_shape)) {
+    if (operand_shape.rank() == output_shape.rank()) {
       *operand_shape.mutable_layout() = output_layout;
       if (ShapeUtil::ReshapeIsBitcast(operand_shape,
                                       output_shape_with_layout)) {
@@ -1045,7 +1041,7 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOperandLayoutFromOutputLayout(
 
   if (instruction->opcode() == HloOpcode::kTranspose) {
     // Pick the operand layout that makes the transpose a bitcast.
-    int64 rank = ShapeUtil::Rank(instruction->shape());
+    int64 rank = instruction->shape().rank();
     std::vector<int64> new_minor_to_major(rank);
     for (int64 i = 0; i < rank; ++i) {
       int64 output_dim = LayoutUtil::Minor(output_layout, i);
@@ -1066,11 +1062,10 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOutputLayoutFromOperandLayout(
     int64 operand_no) {
   const HloInstruction* operand = user->operand(operand_no);
 
-  CHECK(ShapeUtil::IsArray(user->shape()) &&
-        ShapeUtil::IsArray(operand->shape()));
+  CHECK(user->shape().IsArray() && operand->shape().IsArray());
 
   if (!ShapeUtil::IsScalar(operand->shape()) &&
-      ShapeUtil::Rank(operand->shape()) == ShapeUtil::Rank(user->shape()) &&
+      operand->shape().rank() == user->shape().rank() &&
       !instruction_can_change_layout_func_(user)) {
     // Assign users the same layout as the operand.
     return absl::make_unique<Layout>(operand_layout);
@@ -1083,7 +1078,7 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOutputLayoutFromOperandLayout(
     // reshape is a bitcast when using the same layout. This may avoid copy
     // operations. For similar reasons, if the operand and output have the same
     // rank, try to match the outputs's layout to the operand.
-    if (ShapeUtil::Rank(operand->shape()) == 1 &&
+    if (operand->shape().rank() == 1 &&
         ShapeUtil::TrueRank(user->shape()) == 1) {
       // Don't assign a layout in case of R1 -> effective R1 reshape.
       return nullptr;
@@ -1098,7 +1093,7 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOutputLayoutFromOperandLayout(
     if (ShapeUtil::ReshapeIsBitcast(output_shape, operand_shape_with_layout)) {
       return absl::make_unique<Layout>(output_shape.layout());
     }
-    if (ShapeUtil::Rank(operand->shape()) == ShapeUtil::Rank(output_shape)) {
+    if (operand->shape().rank() == output_shape.rank()) {
       *output_shape.mutable_layout() = operand_layout;
       if (ShapeUtil::ReshapeIsBitcast(output_shape,
                                       operand_shape_with_layout)) {
@@ -1117,7 +1112,7 @@ std::unique_ptr<Layout> LayoutAssignment::ChooseOutputLayoutFromOperandLayout(
 
   if (user->opcode() == HloOpcode::kTranspose) {
     // Pick the user layout that makes the transpose a bitcast.
-    int64 rank = ShapeUtil::Rank(user->shape());
+    int64 rank = user->shape().rank();
     std::vector<int64> new_minor_to_major(rank);
     auto inverse_dimensions = InversePermutation(user->dimensions());
     for (int64 i = 0; i < rank; ++i) {
@@ -1193,7 +1188,7 @@ std::vector<std::pair<const HloInstruction*, int64>> GetArrayUsesOfBuffer(
   CHECK(buffer.IsArray());
   std::vector<std::pair<const HloInstruction*, int64>> uses;
   for (const auto& buffer_alias : points_to_analysis.GetBufferAliases(buffer)) {
-    if (!ShapeUtil::IsArray(buffer_alias.instruction()->shape())) {
+    if (!buffer_alias.instruction()->shape().IsArray()) {
       continue;
     }
     // This alias must be the top-level (index == {}) of the instruction's
@@ -1227,7 +1222,7 @@ Status LayoutAssignment::PropagateUseConstraintToDefs(
         if (ShapeUtil::IsLeafIndex(shape_layout.shape(), index)) {
           for (const LogicalBuffer* buffer : buffers) {
             if (constraints->BufferLayout(*buffer) == nullptr &&
-                ShapeUtil::IsArray(buffer->shape())) {
+                buffer->shape().IsArray()) {
               TF_RETURN_IF_ERROR(constraints->SetBufferLayout(
                   ShapeUtil::GetSubshape(shape_layout.shape(), index).layout(),
                   *buffer, /*mandatory=*/true));
@@ -1238,6 +1233,23 @@ Status LayoutAssignment::PropagateUseConstraintToDefs(
       });
 }
 
+namespace {
+// A transpose or a reshape that only changes trivial dimensions have meaningful
+// layouts that are valuable to propagate in a depthfirst manner to avoid
+// unassigned layouts in the graph.
+bool InstructionShouldPropagateDepthFirst(const HloInstruction& hlo) {
+  switch (hlo.opcode()) {
+    case HloOpcode::kReshape:
+      return std::get<0>(hlo.ReshapeMerelyInsertsOrDeletes1SizedDimensions());
+    case HloOpcode::kTranspose:
+      return true;
+    default:
+      return false;
+  }
+}
+
+}  // namespace
+
 Status LayoutAssignment::PropagateOperandConstraint(
     const OperandLayoutConstraint& operand_constraint,
     LayoutConstraints* constraints) {
@@ -1258,11 +1270,10 @@ Status LayoutAssignment::PropagateOperandConstraint(
   // layout for the operands with the same ranks.
   const HloInstruction* operand = operand_constraint.operand();
   const HloInstruction* user = operand_constraint.instruction();
-  if (!ShapeUtil::IsArray(operand->shape())) {
+  if (!operand->shape().IsArray()) {
     return Status::OK();
   }
-  if (instruction_can_change_layout_func_(user) &&
-      !ShapeUtil::IsArray(user->shape())) {
+  if (instruction_can_change_layout_func_(user) && !user->shape().IsArray()) {
     return Status::OK();
   }
 
@@ -1273,7 +1284,7 @@ Status LayoutAssignment::PropagateOperandConstraint(
     return Status::OK();
   }
 
-  int64 operand_rank = ShapeUtil::Rank(operand->shape());
+  int64 operand_rank = operand->shape().rank();
   if (operand_rank <= 1) {
     return Status::OK();
   }
@@ -1288,7 +1299,7 @@ Status LayoutAssignment::PropagateOperandConstraint(
         continue;
       }
       const HloInstruction* sibling = user->operand(operand_no);
-      const int64 sibling_rank = ShapeUtil::Rank(sibling->shape());
+      const int64 sibling_rank = sibling->shape().rank();
       if (sibling_rank <= 1) {
         continue;
       }
@@ -1317,16 +1328,16 @@ Status LayoutAssignment::PropagateOperandConstraint(
     TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
         user->shape(),
         [&](const Shape& subshape, const ShapeIndex& shape_index) {
-          if (ShapeUtil::IsTuple(subshape)) {
+          if (subshape.IsTuple()) {
             return Status::OK();
           }
-          if (ShapeUtil::Rank(subshape) <= 1) {
+          if (subshape.rank() <= 1) {
             return Status::OK();
           }
 
           // Assign the right layout to input fusion of higher rank reduce
           // operations.
-          if (ShapeUtil::Rank(subshape) != ShapeUtil::Rank(operand->shape())) {
+          if (subshape.rank() != operand->shape().rank()) {
             return Status::OK();
           }
           // TODO(b/67641796): Are there cases except fusion that use this code
@@ -1354,10 +1365,10 @@ Status LayoutAssignment::PropagateOperandConstraint(
   }
   TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
       user->shape(), [&](const Shape& subshape, const ShapeIndex& shape_index) {
-        if (ShapeUtil::IsTuple(subshape)) {
+        if (subshape.IsTuple()) {
           return Status::OK();
         }
-        if (ShapeUtil::Rank(subshape) <= 1) {
+        if (subshape.rank() <= 1) {
           return Status::OK();
         }
         TF_ASSIGN_OR_RETURN(
@@ -1373,7 +1384,7 @@ Status LayoutAssignment::PropagateOperandConstraint(
             TF_RETURN_IF_ERROR(constraints->SetBufferLayout(
                 *layout, *buffer,
                 /*mandatory=*/user->opcode() == HloOpcode::kReduce,
-                /*dfs=*/false));
+                /*dfs=*/InstructionShouldPropagateDepthFirst(*user)));
           }
         }
         return Status::OK();
@@ -1401,8 +1412,8 @@ Status LayoutAssignment::PropagateBufferConstraintToOperands(
     }
     if (!instruction_can_change_layout_func_(instruction)) {
       // Copy the layout to the operand.
-      if (buffer.IsArray() && ShapeUtil::IsArray(operand->shape()) &&
-          ShapeUtil::Rank(operand->shape()) ==
+      if (buffer.IsArray() && operand->shape().IsArray() &&
+          operand->shape().rank() ==
               LayoutUtil::MinorToMajor(buffer_constraint.layout()).size()) {
         TF_RETURN_IF_ERROR(constraints->SetArrayOperandLayout(
             buffer_constraint.layout(), instruction, operand_no,
@@ -1410,7 +1421,7 @@ Status LayoutAssignment::PropagateBufferConstraintToOperands(
       }
     } else {
       if (!buffer.IsTopLevel() ||
-          !ShapeUtil::IsArray(instruction->operand(operand_no)->shape())) {
+          !instruction->operand(operand_no)->shape().IsArray()) {
         continue;  // Don't touch buffers that are internal to a tuple.
       }
       VLOG(6) << "Propagating constraint to operand " << operand_no << " of "
@@ -1423,11 +1434,9 @@ Status LayoutAssignment::PropagateBufferConstraintToOperands(
             ChooseOperandLayoutFromOutputLayout(buffer_constraint.layout(),
                                                 instruction, operand_no);
         if (operand_layout != nullptr) {
-          // Do not propagate operand constraints of transposes and reshapes, it
-          // tends to create really bad layouts.
           TF_RETURN_IF_ERROR(constraints->SetArrayOperandLayout(
               *operand_layout, instruction, operand_no, /*mandatory=*/false,
-              /*dfs=*/false));
+              /*dfs=*/InstructionShouldPropagateDepthFirst(*instruction)));
         }
       } else {
         VLOG(6) << "Operand already has a constraint "
@@ -1497,7 +1506,7 @@ StatusOr<Layout> InferArrayLayout(
   // This function should only be called for array shapes which don't yet have
   // layouts.
   const Shape& subshape = ShapeUtil::GetSubshape(instruction->shape(), index);
-  TF_RET_CHECK(ShapeUtil::IsArray(subshape));
+  TF_RET_CHECK(subshape.IsArray());
   TF_RET_CHECK(!subshape.has_layout());
 
   // The instruction should not define the buffer at this index.
@@ -1576,8 +1585,9 @@ Status SetFusionLayouts(HloInstruction* fusion) {
           fused_instruction->mutable_shape()));
     } else if (fused_instruction->opcode() == HloOpcode::kInfeed) {
       // Nop; leave the infeed layout alone.
-    } else {
+    } else if (fusion->fusion_kind() != HloInstruction::FusionKind::kCustom) {
       // Other instructions don't have layouts inside of fusion nodes.
+      // But do not clear layouts for other instructions in custom fusion nodes.
       LayoutUtil::ClearLayout(fused_instruction->mutable_shape());
     }
   }
@@ -1615,7 +1625,7 @@ Status LayoutAssignment::AssignLayouts(const LayoutConstraints& constraints,
     for (const LogicalBuffer* buffer :
          constraints.points_to_analysis().GetBuffersDefinedByInstruction(
              instruction)) {
-      if (!ShapeUtil::IsArray(buffer->shape())) {
+      if (!buffer->shape().IsArray()) {
         continue;
       }
 
@@ -1639,7 +1649,7 @@ Status LayoutAssignment::AssignLayouts(const LayoutConstraints& constraints,
     TF_RETURN_IF_ERROR(ShapeUtil::ForEachMutableSubshapeWithStatus(
         instruction->mutable_shape(),
         [instruction, &constraints](Shape* subshape, const ShapeIndex& index) {
-          if (subshape->has_layout() || !ShapeUtil::IsArray(*subshape)) {
+          if (subshape->has_layout() || !subshape->IsArray()) {
             return Status::OK();
           }
           // Set Layout of subshape to match layout of LogicalBuffer which
@@ -2012,7 +2022,7 @@ bool LayoutAssignment::InstructionCanChangeLayout(
     case HloOpcode::kConditional:
     case HloOpcode::kConvert:
     case HloOpcode::kCos:
-    case HloOpcode::kCrossReplicaSum:
+    case HloOpcode::kAllReduce:
     case HloOpcode::kAllToAll:
     case HloOpcode::kCollectivePermute:
     case HloOpcode::kDivide:
@@ -2048,6 +2058,7 @@ bool LayoutAssignment::InstructionCanChangeLayout(
     case HloOpcode::kRemainder:
     case HloOpcode::kReverse:
     case HloOpcode::kRoundNearestAfz:
+    case HloOpcode::kRsqrt:
     case HloOpcode::kScatter:
     case HloOpcode::kSelect:
     case HloOpcode::kSelectAndScatter:
@@ -2058,8 +2069,10 @@ bool LayoutAssignment::InstructionCanChangeLayout(
     case HloOpcode::kSin:
     case HloOpcode::kSlice:
     case HloOpcode::kSort:
+    case HloOpcode::kSqrt:
     case HloOpcode::kSubtract:
     case HloOpcode::kTanh:
+    case HloOpcode::kTriangularSolve:
     case HloOpcode::kTupleSelect:
     case HloOpcode::kWhile:
       return false;
@@ -2085,6 +2098,7 @@ bool LayoutAssignment::InstructionCanChangeLayout(
     case HloOpcode::kRecv:
     case HloOpcode::kRecvDone:
     case HloOpcode::kReduce:
+    case HloOpcode::kReplicaId:
     case HloOpcode::kReshape:
     case HloOpcode::kRng:
     case HloOpcode::kSend:
@@ -2100,8 +2114,8 @@ bool LayoutAssignment::InstructionCanChangeLayout(
 
 /* static */
 bool LayoutAssignment::IsAtMostRank1(const Shape& shape) {
-  if (ShapeUtil::IsArray(shape)) {
-    return ShapeUtil::Rank(shape) <= 1;
+  if (shape.IsArray()) {
+    return shape.rank() <= 1;
   }
   return absl::c_all_of(shape.tuple_shapes(), [](const Shape& subshape) {
     return IsAtMostRank1(subshape);
@@ -2123,7 +2137,7 @@ Status LayoutAssignment::ClearPreviousPassSideEffects(HloModule* module) {
     for (HloInstruction* instruction :
          computation->MakeInstructionPostOrder()) {
       if (instruction->opcode() == HloOpcode::kCopy &&
-          added_copies_.count(instruction) > 0) {
+          added_copies_.contains(instruction)) {
         VLOG(5) << "Removing added copy: " << instruction->ToString();
         TF_RETURN_IF_ERROR(
             instruction->ReplaceAllUsesWith(instruction->mutable_operand(0)));
diff --git a/tensorflow/compiler/xla/service/layout_assignment.h b/tensorflow/compiler/xla/service/layout_assignment.h
index 3b081de3c7826c3c11a7d87d542835d0ecce1b7e..5701cb5b025e563247d46d0d24f81a5f886fc23b 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.h
+++ b/tensorflow/compiler/xla/service/layout_assignment.h
@@ -243,7 +243,7 @@ class ChannelLayoutConstraints {
 
   // Returns true if channel_id has a layout constraint.
   bool IsChannelConstrained(int64 channel_id) const {
-    return constraints_.count(channel_id) > 0;
+    return constraints_.contains(channel_id);
   }
 
   // Given `shape`, apply the layout for `channel_id`. `channel_id` must already
@@ -276,7 +276,7 @@ class ChannelLayoutConstraints {
   }
 
  private:
-  std::unordered_map<int64, Layout> constraints_;
+  absl::flat_hash_map<int64, Layout> constraints_;
 };
 
 // HLO pass which assigns layouts to all instructions in the HLO module while
diff --git a/tensorflow/compiler/xla/service/layout_assignment_test.cc b/tensorflow/compiler/xla/service/layout_assignment_test.cc
index 5c661bfacb08fe27f3cbdc1fb9db083315166008..c8cf3c47d380012fdb0206c0d20d67e6a13017ae 100644
--- a/tensorflow/compiler/xla/service/layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment_test.cc
@@ -528,8 +528,7 @@ class OperandsMustBeTheSameLayoutAssignment : public LayoutAssignment {
     for (int64 operand_no = 0; operand_no < instruction->operand_count();
          ++operand_no) {
       const HloInstruction* operand = instruction->operand(operand_no);
-      if (ShapeUtil::Rank(instruction->shape()) !=
-          ShapeUtil::Rank(operand->shape())) {
+      if (instruction->shape().rank() != operand->shape().rank()) {
         continue;
       }
       TF_RETURN_IF_ERROR(constraints->SetArrayOperandLayout(
@@ -847,12 +846,12 @@ TEST_F(LayoutAssignmentTest, ChannelLayoutMismatch) {
     ENTRY entry_computation {
       param = (f32[2,2]) parameter(0)
       gte = f32[2,2] get-tuple-element(param), index=0
-      token = token[] after-all()
-      recv = (f32[2,2], u32[], token[]) recv(token), channel_id=1, sharding={maximal device=1}
+      token0 = token[] after-all()
+      recv = (f32[2,2], u32[], token[]) recv(token0), channel_id=1, sharding={maximal device=1}
       recv-done = (f32[2,2], token[]) recv-done(recv), channel_id=1,
         sharding={maximal device=1}
       ROOT root = f32[2,2] get-tuple-element(recv-done), index=0
-      send = (f32[2,2], u32[], token[]) send(gte, token), channel_id=1,
+      send = (f32[2,2], u32[], token[]) send(gte, token0), channel_id=1,
         sharding={maximal device=0}
       send-done = token[] send-done(send), channel_id=1, sharding={maximal device=0}
     }
@@ -894,11 +893,11 @@ TEST_F(LayoutAssignmentTest, AllReduceLayoutMissmatch) {
     ENTRY entry_computation {
       param = (f32[2,2]) parameter(0)
       gte = f32[2,2] get-tuple-element(param), index=0
-      ar.0 = f32[2,2] cross-replica-sum(gte),
+      ar.0 = f32[2,2] all-reduce(gte),
         all_reduce_id=1, replica_groups={{0}}, to_apply=add,
         sharding={maximal device=0}
-      const = f32[2,2] constant(f32[2,2]{{0,1},{2,3}})
-      ROOT ar.1 = f32[2,2] cross-replica-sum(const),
+      const = f32[2,2] constant({{0,1},{2,3}})
+      ROOT ar.1 = f32[2,2] all-reduce(const),
         all_reduce_id=1, replica_groups={{0}}, to_apply=add,
         sharding={maximal device=1}
     })";
@@ -961,8 +960,9 @@ TEST_F(LayoutAssignmentTest, CopyDSliceOperandToAvoidImplicitLayoutChange) {
     ENTRY CopyDSliceOperandToAvoidImplicitLayoutChange {
       par0 = f32[3,4]{1,0} parameter(0)
       par1 = f32[4,5]{0,1} parameter(1)
-      par2 = s32[2] parameter(2)
-      dslice0 = f32[3,4] dynamic-slice(par1, par2), dynamic_slice_sizes={3,4}
+      par2 = s32[] parameter(2)
+      par3 = s32[] parameter(3)
+      dslice0 = f32[3,4] dynamic-slice(par1, par2, par3), dynamic_slice_sizes={3,4}
       ROOT add0 = f32[3,4]{1,0} add(par0,dslice0)
     }
   )";
@@ -983,7 +983,7 @@ TEST_F(LayoutAssignmentTest, CopyDSliceOperandToAvoidImplicitLayoutChange) {
                   m::Parameter(),
                   m::DynamicSlice(
                       m::Copy(m::Parameter(1)).WithShapeEqualTo(&shape_copy),
-                      m::Parameter(2)))));
+                      m::Parameter(2), m::Parameter(3)))));
 }
 
 TEST_F(LayoutAssignmentTest, CopyConcatOperandToAvoidImplicitLayoutChange) {
diff --git a/tensorflow/compiler/xla/service/llvm_ir/BUILD b/tensorflow/compiler/xla/service/llvm_ir/BUILD
index 728a66b388f0f9af480ff88b5e96990a26e36af5..c5d59fb28e02ce229967fb3856012d608fb83c5d 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/BUILD
+++ b/tensorflow/compiler/xla/service/llvm_ir/BUILD
@@ -39,7 +39,6 @@ cc_library(
         "//tensorflow/compiler/xla/service:logical_buffer",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_map",
-        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
         "@llvm//:core",
     ],
@@ -169,6 +168,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:elemental_ir_emitter",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
         "@llvm//:core",
diff --git a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc
index 643ecd0fbaa546c551097b29e74ccd49418e1466..ce3d922ca7a9bdea3a520959a8b8d284bc3e0d64 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc
@@ -81,9 +81,7 @@ void AliasAnalysis::AddAliasingInformationToIrArray(const HloInstruction& hlo,
     if (hlo.opcode() == HloOpcode::kParameter) {
       const std::vector<HloInstruction*>& parameter_instructions =
           module_.entry_computation()->parameter_instructions();
-      if (std::find(parameter_instructions.begin(),
-                    parameter_instructions.end(),
-                    &hlo) != parameter_instructions.end()) {
+      if (absl::c_linear_search(parameter_instructions, &hlo)) {
         array->MarkInvariantOverWholeProgram(context_);
       }
     }
diff --git a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h
index 2b46b3c3964b15548dbacc8b0ada0047a0fa85b6..12e2f449e23ac2511aac576fed893f5a9ef510c0 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h
@@ -76,15 +76,12 @@ class AliasAnalysis {
   // A map from a buffer slice to metadata corresponding to its alias.scope
   // metadata.  The index kParameterAliasSet is used to hold aliasing
   // information for parameters.
-  absl::flat_hash_map<BufferAllocation::Slice, llvm::MDNode*,
-                      BufferAllocation::Slice::Hasher>
+  absl::flat_hash_map<BufferAllocation::Slice, llvm::MDNode*>
       alias_scope_metadata_;
 
   // A map from a buffer slice to metadata corresponding to its noalias
   // metadata.
-  absl::flat_hash_map<BufferAllocation::Slice, llvm::MDNode*,
-                      BufferAllocation::Slice::Hasher>
-      noalias_metadata_;
+  absl::flat_hash_map<BufferAllocation::Slice, llvm::MDNode*> noalias_metadata_;
 };
 
 }  // namespace llvm_ir
diff --git a/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.cc b/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.cc
index bdce4a171b8a58f617f1d56e6cf6db5354846703..1ea5a42b0b398818b0946eaa9e214100007bada4 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.cc
@@ -1,4 +1,4 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -41,14 +41,26 @@ static const HloInstruction& InstrForConstantBufferAllocation(
   return *const_instr;
 }
 
-string ConstantBufferAllocationToGlobalName(
-    const BufferAllocation& allocation) {
-  string instr_name = InstrForConstantBufferAllocation(allocation).name();
+string SanitizeConstantName(const HloInstruction& instr) {
+  CHECK_EQ(instr.opcode(), HloOpcode::kConstant);
+  string instr_name = instr.name();
   for (char& c : instr_name) {
-    if (c == '.') {
+    // Having a hyphen or a dot in a global variable name can crash the LLVM PTX
+    // backend.
+    if (c == '.' || c == '-') {
       c = '_';
     }
   }
+  return instr_name;
+}
+
+string ConstantBufferAllocationToGlobalName(
+    const BufferAllocation& allocation) {
+  const HloInstruction& instr = InstrForConstantBufferAllocation(allocation);
+  string instr_name = instr.name();
+  // Check that names are sanitized and stored in the HLO instructions
+  // before constant buffer allocation.
+  DCHECK_EQ(instr_name, SanitizeConstantName(instr));
   return absl::StrCat("buffer_for_", instr_name);
 }
 
diff --git a/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h b/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h
index bfb6eecb87f6a1b756b3a8da3377f608dd7f0be7..03e98a66900095889292cbff9d9924a9abe83ab0 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h
@@ -20,6 +20,10 @@ limitations under the License.
 
 namespace xla {
 namespace llvm_ir {
+// Sanitizes the HLO constant instruction name so that it can be used for the
+// name of the corresponding constant buffer. In particular, it replaces . and
+// - with _.
+string SanitizeConstantName(const HloInstruction& instr);
 // In XLA:GPU we map constant buffer allocations to globals in the generated
 // LLVM IR.  This function gives us the name of the global variable a constant
 // buffer is mapped to.  Not used on XLA:CPU.
diff --git a/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc b/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc
index 4d7f36d9f8b565a819edf0631efc5c7a58c4f87f..3acceccfa556103c15fe229c41e96e618ac59c80 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc
@@ -36,19 +36,20 @@ bool CanUpdateDynamicSliceInPlace(HloInstruction* dynamic_update_slice,
 // EmitFusedDynamicUpdateSliceInPlace.
 //
 // Emits a sequential loop if launch_dimensions is null.
+using IndexGenerator = std::function<StatusOr<llvm::Value*>(int64)>;
+
 static Status EmitDynamicUpdateSliceInPlaceImpl(
-    const Shape& update_shape, const ElementGenerator& start_indices_generator,
+    const Shape& update_shape, const IndexGenerator& start_indices_generator,
     bool is_signed, ElementGenerator update_array_generator,
     const IrArray& output_array, const gpu::LaunchDimensions* launch_dimensions,
     absl::string_view name, llvm::IRBuilder<>* b) {
   const Shape& output_shape = output_array.GetShape();
 
   // Read start indices from start_indices_generator.
-  const int64 rank = ShapeUtil::Rank(output_shape);
+  const int64 rank = output_shape.rank();
   IrArray::Index start_index(b->getInt64Ty(), rank);
   for (int64 i = 0; i < rank; ++i) {
-    IrArray::Index dim_index({b->getInt64(i)});
-    TF_ASSIGN_OR_RETURN(start_index[i], start_indices_generator(dim_index));
+    TF_ASSIGN_OR_RETURN(start_index[i], start_indices_generator(i));
     llvm::Value* output_dim_size = llvm::ConstantInt::get(
         start_index[i]->getType(), output_shape.dimensions(i));
     llvm::Value* update_dim_size = llvm::ConstantInt::get(
@@ -112,8 +113,9 @@ Status EmitDynamicUpdateSliceInPlace(absl::Span<const IrArray> operand_arrays,
   Shape output_shape = output_array.GetShape();
   Shape update_shape = update_array.GetShape();
 
-  ElementGenerator start_indices_generator = [&](const IrArray::Index& index) {
-    return start_indices_array.EmitReadArrayElement(index, b);
+  IndexGenerator start_indices_generator = [&](int64 index) {
+    return operand_arrays[2 + index].EmitReadArrayElement(
+        IrArray::Index(b->getInt64Ty()), b);
   };
   ElementGenerator update_array_generator = [&](const IrArray::Index& index) {
     return update_array.EmitReadArrayElement(index, b);
@@ -165,9 +167,12 @@ static Status EmitFusedDynamicUpdateSliceInPlaceImpl(
                                elemental_emitter);
   TF_RETURN_IF_ERROR(dynamic_update_slice->Accept(&fused_emitter));
   ElementGenerator update_array_generator = fused_emitter.GetGenerator(update);
-  ElementGenerator start_indices_generator =
-      fused_emitter.GetGenerator(start_indices);
 
+  IndexGenerator start_indices_generator = [&](int64 index) {
+    ElementGenerator element_generator =
+        fused_emitter.GetGenerator(dynamic_update_slice->operand(2 + index));
+    return element_generator(IrArray::Index(b->getInt64Ty()));
+  };
   bool is_signed = ShapeUtil::ElementIsSigned(start_indices->shape());
   return EmitDynamicUpdateSliceInPlaceImpl(
       update_shape, start_indices_generator, is_signed, update_array_generator,
diff --git a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc
index 38f2b5da23a7b92e4547dceaba011ce654977da3..e440f05e2b2f0d4a2a4c7b326b4881183de4d235 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc
@@ -35,7 +35,7 @@ using llvm_ir::IrArray;
 Status FusedIrEmitter::DefaultAction(HloInstruction* hlo) {
   indexed_generators_[hlo] =
       [=](const IrArray::Index& index) -> StatusOr<llvm::Value*> {
-    if (generated_value_cache_[hlo].count(index.multidim()) > 0) {
+    if (generated_value_cache_[hlo].contains(index.multidim())) {
       llvm::Value* generated_value =
           generated_value_cache_[hlo][index.multidim()];
       llvm::BasicBlock* generated_value_bb = nullptr;
@@ -115,7 +115,7 @@ Status FusedIrEmitter::HandleGetTupleElement(
         /*alignment=*/1, tuple_ptr, b_, module_);
   };
 
-  if (!ShapeUtil::IsTuple(get_tuple_element->shape())) {
+  if (!get_tuple_element->shape().IsTuple()) {
     indexed_generators_[get_tuple_element] =
         [=](const IrArray::Index& index) -> StatusOr<llvm::Value*> {
       // TODO(b/34080002) Add aliasing information to tuple element IrArray.
diff --git a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h
index 1b9c61f6700e2a1309b21e499f4a9e2439ed3702..e6d52a580c04a920d3f0e8ed6f39c1cae587cf1b 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include <map>
 #include <unordered_map>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/types/optional.h"
 #include "absl/types/span.h"
 #include "llvm/IR/IRBuilder.h"
@@ -134,8 +135,9 @@ class FusedIrEmitter : public DfsHloVisitorWithDefault {
 
   // Cache of generated values, lest we regenerate an element of a node with
   // multiple outgoing edges
-  std::unordered_map<const HloInstruction*,
-                     std::map<std::vector<llvm::Value*>, llvm::Value*>>
+  absl::flat_hash_map<
+      const HloInstruction*,
+      absl::flat_hash_map<std::vector<llvm::Value*>, llvm::Value*>>
       generated_value_cache_;
 };
 
diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc b/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
index 67f7423121177e2ca1e3384341dad2644c8f5e34..8ee07ae8331e986f9d271be5e39065f0d87853b1 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
@@ -61,7 +61,7 @@ void IrArray::Index::Delinearize(std::vector<llvm::Value*>* multidim,
 
 IrArray::Index::Index(llvm::Value* linear, const Shape& shape,
                       llvm::IRBuilder<>* b)
-    : multidim_(ShapeUtil::Rank(shape)),
+    : multidim_(shape.rank()),
       linear_(linear),
       layout_(shape.layout()),
       dims_(shape.dimensions().begin(), shape.dimensions().end()) {
@@ -104,8 +104,8 @@ IrArray::Index::Index(absl::Span<llvm::Value* const> multidim,
   CHECK(LayoutUtil::HasLayout(shape));
 }
 
-IrArray::IrArray(llvm::Value* base_ptr, const Shape& shape)
-    : base_ptr_(base_ptr), shape_(&shape) {
+IrArray::IrArray(llvm::Value* base_ptr, Shape shape)
+    : base_ptr_(base_ptr), shape_(std::move(shape)) {
   TF_CHECK_OK(ShapeUtil::ValidateShape(shape));
   CHECK(base_ptr_->getType()->isPointerTy());
   int depth = 0;
@@ -117,10 +117,10 @@ IrArray::IrArray(llvm::Value* base_ptr, const Shape& shape)
     ++depth;
   }
 
-  if (!ShapeUtil::IsArray(*shape_) || ShapeUtil::IsScalar(*shape_)) {
+  if (!shape_->IsArray() || ShapeUtil::IsScalar(*shape_)) {
     DCHECK(depth == 1 || depth == 0) << depth;
   } else {
-    DCHECK_EQ(depth, ShapeUtil::Rank(*shape_)) << shape.ShortDebugString();
+    DCHECK_EQ(depth, shape_->rank()) << shape.ShortDebugString();
   }
 }
 
@@ -137,12 +137,12 @@ IrArray::Index IrArray::Index::SourceIndexOfReshape(
     const Shape& output_shape, const Shape& input_shape,
     llvm::IRBuilder<>* builder) const {
   const auto& target_index = *this;
-  CHECK_EQ(target_index.size(), ShapeUtil::Rank(output_shape));
+  CHECK_EQ(target_index.size(), output_shape.rank());
   std::vector<std::pair<int64, int64>> common_factors =
       CommonFactors(AsInt64Slice(input_shape.dimensions()),
                     AsInt64Slice(output_shape.dimensions()));
   std::vector<llvm::Value*> source_multidim_index(
-      ShapeUtil::Rank(input_shape), llvm::UndefValue::get(index_type_));
+      input_shape.rank(), llvm::UndefValue::get(index_type_));
   // We compute the source indices in each common factor from only the target
   // indices in the same common factor.
   for (ssize_t k = common_factors.size() - 2; k >= 0; --k) {
@@ -257,7 +257,7 @@ IrArray::Index IrArray::Index::SourceIndexOfBroadcast(
     const Shape& shape, const Shape& operand_shape,
     absl::Span<const int64> dimension_mapping,
     llvm::IRBuilder<>* builder) const {
-  int64 rank = ShapeUtil::Rank(operand_shape);
+  int64 rank = operand_shape.rank();
   std::vector<llvm::Value*> source_index(rank);
   for (int64 i = 0; i < rank; ++i) {
     source_index[i] = multidim_[dimension_mapping[i]];
@@ -271,7 +271,7 @@ IrArray::Index IrArray::Index::SourceIndexOfBroadcast(
   // The other dimensions can be masked out with a div and a mod operation.
   std::vector<int64> logical_to_physical =
       LayoutUtil::MakeLogicalToPhysical(shape.layout());
-  int64 output_rank = ShapeUtil::Rank(shape);
+  int64 output_rank = shape.rank();
   // The minimum physical dimension that is broadcasted.
   int64 min_broadcasted_dimension = output_rank;
   // The maximum physical dimension that is broadcasted.
@@ -348,7 +348,7 @@ llvm::Value* IrArray::EmitArrayElementAddress(const IrArray::Index& index,
     // over higher-rank arrays.
     return base_ptr_;
   }
-  CHECK_EQ(index.size(), ShapeUtil::Rank(*shape_));
+  CHECK_EQ(index.size(), shape_->rank());
 
   if (index.LinearValidOnShape(*shape_)) {
     llvm::Module* module = b->GetInsertBlock()->getParent()->getParent();
diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_array.h b/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
index d6d84994ee147f4b8c1a333b0eaccdf6e0a2219b..b706ebd311cbb706e7e4698b93319e37e664d10a 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
@@ -130,6 +130,11 @@ class IrArray {
       CHECK_LE(index, size());
       mutable_multidim().insert(mutable_multidim().begin() + index, value);
     }
+    void InsertAt(int64 index, int64 count, llvm::Value* value) {
+      CHECK_LE(index, size());
+      mutable_multidim().insert(mutable_multidim().begin() + index, count,
+                                value);
+    }
 
     using iterator = std::vector<llvm::Value*>::iterator;
     using const_iterator = std::vector<llvm::Value*>::const_iterator;
@@ -189,6 +194,8 @@ class IrArray {
       return llvm::ConstantInt::get(index_type_, c);
     }
 
+    void ClearLinearIndex() { linear_ = nullptr; }
+
    private:
     // Changing the multi-dimensional index invalidates the linear index.
     std::vector<llvm::Value*>& mutable_multidim() {
@@ -220,11 +227,11 @@ class IrArray {
   };
 
   // Default constructor. Constructs an IrArray in a null status.
-  IrArray() : base_ptr_(nullptr), shape_(nullptr) {}
+  IrArray() : base_ptr_(nullptr) {}
 
   // Construct an IrArray with the given base pointer and shape. base_ptr is a
   // pointer type pointing to the first element(lowest address) of the array.
-  IrArray(llvm::Value* base_ptr, const Shape& shape);
+  IrArray(llvm::Value* base_ptr, Shape shape);
 
   // Default implementations of copying and moving.
   IrArray(IrArray&& other) = default;
@@ -236,7 +243,6 @@ class IrArray {
   llvm::Type* GetElementLlvmType() const { return element_type_; }
 
   const Shape& GetShape() const {
-    CHECK(shape_ != nullptr);
     return *shape_;
   }
 
@@ -331,7 +337,7 @@ class IrArray {
   llvm::Type* element_type_;
 
   // Shape of the XLA array.
-  const Shape* shape_;
+  absl::optional<Shape> shape_;
 
   // The list of key/value pairs used when attaching metadata to emitted
   // loads/stores for this array.  They keys are the metadata kinds and the
diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h b/tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h
index abc06fb7b4245294df2dc20d25a22ac4fdaeb4cf..02c719502ee7b0a732ae74acec364f89d51ae0c1 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h
@@ -254,6 +254,11 @@ class IrBuilderMixin {
     return mixin_builder()->CreateFCmpOLT(std::forward<Args>(args)...);
   }
 
+  template <class... Args>
+  llvm::Value* FCmpOLE(Args&&... args) {
+    return mixin_builder()->CreateFCmpOLE(std::forward<Args>(args)...);
+  }
+
   template <class... Args>
   llvm::Value* FCmpONE(Args&&... args) {
     return mixin_builder()->CreateFCmpONE(std::forward<Args>(args)...);
@@ -264,6 +269,11 @@ class IrBuilderMixin {
     return mixin_builder()->CreateFCmpUNE(std::forward<Args>(args)...);
   }
 
+  template <class... Args>
+  llvm::Value* FCmpUNO(Args&&... args) {
+    return mixin_builder()->CreateFCmpUNO(std::forward<Args>(args)...);
+  }
+
   template <class... Args>
   llvm::Value* FDiv(Args&&... args) {
     return mixin_builder()->CreateFDiv(std::forward<Args>(args)...);
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc
index bd0139f85b6a5c5dc23dad962263038451921e65..5eeb29c478a371dae83251771f2dc4844672d3e9 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc
@@ -18,28 +18,29 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 
 namespace xla {
-Status KernelSupportLibrary::For(
+Status KernelSupportLibrary::ForWithStatus(
     absl::string_view name, llvm::Value* start, llvm::Value* end,
     llvm::Value* step,
     const std::function<Status(llvm::Value*, bool)>& for_body_generator) {
-  return If(b_->CreateICmpSLT(start, end), [&]() -> Status {
+  return IfWithStatus(b_->CreateICmpSLT(start, end), [&]() -> Status {
     TF_RETURN_IF_ERROR(for_body_generator(start, /*is_first_iteration=*/true));
-    return For(name, b_->CreateAdd(start, step), end, step,
-               [&](llvm::Value* iv) { return for_body_generator(iv, false); });
+    return ForWithStatus(
+        name, b_->CreateAdd(start, step), end, step,
+        [&](llvm::Value* iv) { return for_body_generator(iv, false); });
   });
 }
 
-Status KernelSupportLibrary::For(
+Status KernelSupportLibrary::ForWithStatus(
     absl::string_view name, llvm::Value* start, llvm::Value* end,
     llvm::Value* step, bool peel_first_iteration,
     const std::function<Status(llvm::Value*, llvm::Value*)>&
         for_body_generator) {
   if (peel_first_iteration) {
-    return For(name, start, end, step, true,
-               [&](llvm::Value* indvar, bool is_first_iteration) -> Status {
-                 return for_body_generator(indvar,
-                                           b_->getInt1(is_first_iteration));
-               });
+    return ForWithStatus(
+        name, start, end, step, true,
+        [&](llvm::Value* indvar, bool is_first_iteration) -> Status {
+          return for_body_generator(indvar, b_->getInt1(is_first_iteration));
+        });
   } else {
     std::unique_ptr<llvm_ir::ForLoop> loop = llvm_ir::ForLoop::EmitForLoop(
         name, start, end, step, b_,
@@ -55,7 +56,7 @@ Status KernelSupportLibrary::For(
   }
 }
 
-Status KernelSupportLibrary::If(
+Status KernelSupportLibrary::IfWithStatus(
     absl::string_view name, llvm::Value* condition,
     const std::function<Status()>& true_block_generator,
     const std::function<Status()>& false_block_generator) {
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
index 43fec311f150d6054f6ad24f99db332f90ff94a3..612b839cfa15711061e1ae53358a72d5220e1801 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
@@ -48,41 +48,42 @@ class KernelSupportLibrary {
   //     for (i64 i = `start` + `step`; i s< `end`; i += `step`)
   //       `for_body_generator(/*ind_var=*/,i, /*is_first_iteration=*/false)`;
   //   }
-  Status For(
+  Status ForWithStatus(
       absl::string_view name, llvm::Value* start, llvm::Value* end,
       llvm::Value* step,
       const std::function<Status(llvm::Value* ind_var,
                                  bool is_first_iteration)>& for_body_generator);
 
-  void ForReturnVoid(
+  void For(
       absl::string_view name, llvm::Value* start, llvm::Value* end,
       llvm::Value* step,
       const std::function<void(llvm::Value* ind_var, bool is_first_iteration)>&
           for_body_generator) {
     CHECK_EQ(Status::OK(),
-             For(name, start, end, step,
+             ForWithStatus(
+                 name, start, end, step,
                  [&](llvm::Value* ind_var, bool is_first_iteration) -> Status {
                    for_body_generator(ind_var, is_first_iteration);
                    return Status::OK();
                  }));
   }
 
-  Status For(absl::string_view name, int64 start, int64 end, int64 step,
-             const std::function<Status(llvm::Value* ind_var,
-                                        bool is_first_iteration)>&
-                 for_body_generator) {
-    return For(name, /*start=*/b_->getInt64(start),
-               /*end=*/b_->getInt64(end),
-               /*step=*/b_->getInt64(step), for_body_generator);
+  Status ForWithStatus(
+      absl::string_view name, int64 start, int64 end, int64 step,
+      const std::function<Status(
+          llvm::Value* ind_var, bool is_first_iteration)>& for_body_generator) {
+    return ForWithStatus(name, /*start=*/b_->getInt64(start),
+                         /*end=*/b_->getInt64(end),
+                         /*step=*/b_->getInt64(step), for_body_generator);
   }
 
-  void ForReturnVoid(
+  void For(
       absl::string_view name, int64 start, int64 end, int64 step,
       const std::function<void(llvm::Value* ind_var, bool is_first_iteration)>&
           for_body_generator) {
-    ForReturnVoid(name, /*start=*/b_->getInt64(start),
-                  /*end=*/b_->getInt64(end),
-                  /*step=*/b_->getInt64(step), for_body_generator);
+    For(name, /*start=*/b_->getInt64(start),
+        /*end=*/b_->getInt64(end),
+        /*step=*/b_->getInt64(step), for_body_generator);
   }
 
   // Generates the following control flow structure if `peel_first_iteration` is
@@ -99,19 +100,19 @@ class KernelSupportLibrary {
   //   for (i64 i = `start`; i s< `end`; i += `step`)
   //     `for_body_generator(/*ind_var=*/,i,
   //                         /*is_first_iteration=*/,(i != `start`))`;
-  Status For(absl::string_view name, llvm::Value* start, llvm::Value* end,
-             llvm::Value* step, bool peel_first_iteration,
-             const std::function<Status(llvm::Value* ind_var,
-                                        llvm::Value* is_first_iteration)>&
-                 for_body_generator);
+  Status ForWithStatus(
+      absl::string_view name, llvm::Value* start, llvm::Value* end,
+      llvm::Value* step, bool peel_first_iteration,
+      const std::function<Status(llvm::Value* ind_var,
+                                 llvm::Value* is_first_iteration)>&
+          for_body_generator);
 
-  void ForReturnVoid(absl::string_view name, llvm::Value* start,
-                     llvm::Value* end, llvm::Value* step,
-                     bool peel_first_iteration,
-                     const std::function<void(llvm::Value* ind_var,
-                                              llvm::Value* is_first_iteration)>&
-                         for_body_generator) {
-    TF_CHECK_OK(For(
+  void For(absl::string_view name, llvm::Value* start, llvm::Value* end,
+           llvm::Value* step, bool peel_first_iteration,
+           const std::function<void(llvm::Value* ind_var,
+                                    llvm::Value* is_first_iteration)>&
+               for_body_generator) {
+    TF_CHECK_OK(ForWithStatus(
         name, start, end, step, peel_first_iteration,
         [&](llvm::Value* ind_var, llvm::Value* is_first_iteration) -> Status {
           for_body_generator(ind_var, is_first_iteration);
@@ -119,80 +120,81 @@ class KernelSupportLibrary {
         }));
   }
 
-  Status For(absl::string_view name, llvm::Value* start, llvm::Value* end,
-             int64 step, bool peel_first_iteration,
-             const std::function<Status(llvm::Value* ind_var,
-                                        llvm::Value* is_first_iteration)>&
-                 for_body_generator) {
-    return For(name, /*start=*/start, /*end=*/end,
-               /*step=*/llvm::ConstantInt::get(start->getType(), step),
-               peel_first_iteration, for_body_generator);
+  Status ForWithStatus(
+      absl::string_view name, llvm::Value* start, llvm::Value* end, int64 step,
+      bool peel_first_iteration,
+      const std::function<Status(llvm::Value* ind_var,
+                                 llvm::Value* is_first_iteration)>&
+          for_body_generator) {
+    return ForWithStatus(
+        name, /*start=*/start, /*end=*/end,
+        /*step=*/llvm::ConstantInt::get(start->getType(), step),
+        peel_first_iteration, for_body_generator);
   }
 
-  void ForReturnVoid(absl::string_view name, llvm::Value* start,
-                     llvm::Value* end, int64 step, bool peel_first_iteration,
-                     const std::function<void(llvm::Value* ind_var,
-                                              llvm::Value* is_first_iteration)>&
-                         for_body_generator) {
-    ForReturnVoid(name, /*start=*/start, /*end=*/end,
-                  /*step=*/llvm::ConstantInt::get(start->getType(), step),
-                  peel_first_iteration, for_body_generator);
+  void For(absl::string_view name, llvm::Value* start, llvm::Value* end,
+           int64 step, bool peel_first_iteration,
+           const std::function<void(llvm::Value* ind_var,
+                                    llvm::Value* is_first_iteration)>&
+               for_body_generator) {
+    For(name, /*start=*/start, /*end=*/end,
+        /*step=*/llvm::ConstantInt::get(start->getType(), step),
+        peel_first_iteration, for_body_generator);
   }
 
-  Status For(
+  Status ForWithStatus(
       absl::string_view name, llvm::Value* start, llvm::Value* end,
       llvm::Value* step,
       const std::function<Status(llvm::Value* ind_var)>& for_body_generator) {
-    return For(name, start, end, step,
-               /*peel_first_iteration=*/false,
-               [&](llvm::Value* indvar, llvm::Value*) -> Status {
-                 return for_body_generator(indvar);
-               });
+    return ForWithStatus(name, start, end, step,
+                         /*peel_first_iteration=*/false,
+                         [&](llvm::Value* indvar, llvm::Value*) -> Status {
+                           return for_body_generator(indvar);
+                         });
   }
 
-  void ForReturnVoid(
+  void For(
       absl::string_view name, llvm::Value* start, llvm::Value* end,
       llvm::Value* step,
       const std::function<void(llvm::Value* ind_var)>& for_body_generator) {
-    ForReturnVoid(name, start, end, step,
-                  /*peel_first_iteration=*/false,
-                  [&](llvm::Value* indvar, llvm::Value*) {
-                    return for_body_generator(indvar);
-                  });
+    For(name, start, end, step,
+        /*peel_first_iteration=*/false, [&](llvm::Value* indvar, llvm::Value*) {
+          return for_body_generator(indvar);
+        });
   }
 
-  Status For(
+  Status ForWithStatus(
       absl::string_view name, llvm::Value* start, llvm::Value* end, int64 step,
       const std::function<Status(llvm::Value* ind_var)>& for_body_generator) {
-    return For(name, start, end, llvm::ConstantInt::get(start->getType(), step),
-               /*peel_first_iteration=*/false,
-               [&](llvm::Value* indvar, llvm::Value*) -> Status {
-                 return for_body_generator(indvar);
-               });
+    return ForWithStatus(name, start, end,
+                         llvm::ConstantInt::get(start->getType(), step),
+                         /*peel_first_iteration=*/false,
+                         [&](llvm::Value* indvar, llvm::Value*) -> Status {
+                           return for_body_generator(indvar);
+                         });
   }
 
-  void ForReturnVoid(
+  void For(
       absl::string_view name, llvm::Value* start, llvm::Value* end, int64 step,
       const std::function<void(llvm::Value* ind_var)>& for_body_generator) {
-    ForReturnVoid(name, start, end,
-                  llvm::ConstantInt::get(start->getType(), step),
-                  for_body_generator);
+    For(name, start, end, llvm::ConstantInt::get(start->getType(), step),
+        for_body_generator);
   }
 
-  Status For(
+  Status ForWithStatus(
       absl::string_view name, int64 start, int64 end, int64 step,
       const std::function<Status(llvm::Value* ind_var)>& for_body_generator) {
-    return For(name, /*start=*/b_->getInt64(start),
-               /*end=*/b_->getInt64(end),
-               /*step=*/b_->getInt64(step), for_body_generator);
+    return ForWithStatus(name, /*start=*/b_->getInt64(start),
+                         /*end=*/b_->getInt64(end),
+                         /*step=*/b_->getInt64(step), for_body_generator);
   }
 
-  void ForReturnVoid(
+  void For(
       absl::string_view name, int64 start, int64 end, int64 step,
       const std::function<void(llvm::Value* ind_var)>& for_body_generator) {
-    ForReturnVoid(name, /*start=*/b_->getInt64(start),
-                  /*end=*/b_->getInt64(end),
-                  /*step=*/b_->getInt64(step), for_body_generator);
+    For(name, /*start=*/b_->getInt64(start),
+        /*end=*/b_->getInt64(end),
+        /*step=*/b_->getInt64(step), for_body_generator);
   }
 
   // Generates the following control flow structure:
@@ -201,38 +203,43 @@ class KernelSupportLibrary {
   //     `true_block_generator()`;
   //   else
   //      `false_block_generator()`;
-  Status If(absl::string_view name, llvm::Value* condition,
-            const std::function<Status()>& true_block_generator,
-            const std::function<Status()>& false_block_generator =
-                []() -> Status { return Status::OK(); });
+  Status IfWithStatus(
+      absl::string_view name, llvm::Value* condition,
+      const std::function<Status()>& true_block_generator,
+      const std::function<Status()>& false_block_generator = []() -> Status {
+        return Status::OK();
+      });
 
-  Status If(llvm::Value* condition,
-            const std::function<Status()>& true_block_generator,
-            const std::function<Status()>& false_block_generator =
-                []() -> Status { return Status::OK(); }) {
-    return If("", condition, true_block_generator, false_block_generator);
+  Status IfWithStatus(
+      llvm::Value* condition,
+      const std::function<Status()>& true_block_generator,
+      const std::function<Status()>& false_block_generator = []() -> Status {
+        return Status::OK();
+      }) {
+    return IfWithStatus("", condition, true_block_generator,
+                        false_block_generator);
   }
 
-  void IfReturnVoid(llvm::Value* condition,
-                    const std::function<void()>& true_block_generator,
-                    const std::function<void()>& false_block_generator = []() {
-                    }) {
-    IfReturnVoid("", condition, true_block_generator, false_block_generator);
+  void If(
+      llvm::Value* condition, const std::function<void()>& true_block_generator,
+      const std::function<void()>& false_block_generator = []() {}) {
+    If("", condition, true_block_generator, false_block_generator);
   }
 
-  void IfReturnVoid(absl::string_view name, llvm::Value* condition,
-                    const std::function<void()>& true_block_generator,
-                    const std::function<void()>& false_block_generator = []() {
-                    }) {
-    TF_CHECK_OK(If(name, condition,
-                   [&]() {
-                     true_block_generator();
-                     return Status::OK();
-                   },
-                   [&]() {
-                     false_block_generator();
-                     return Status::OK();
-                   }));
+  void If(
+      absl::string_view name, llvm::Value* condition,
+      const std::function<void()>& true_block_generator,
+      const std::function<void()>& false_block_generator = []() {}) {
+    TF_CHECK_OK(IfWithStatus(
+        name, condition,
+        [&]() {
+          true_block_generator();
+          return Status::OK();
+        },
+        [&]() {
+          false_block_generator();
+          return Status::OK();
+        }));
   }
 
   using ArgumentVector = absl::Span<llvm::Value* const>;
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc
index c26711e526c9b89cdedcb6aed9f93d41dd25dc83..cd8dd72cd775d5e0b52f96a2326367da0775e7eb 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc
@@ -120,10 +120,11 @@ KernelMappingScheme::KernelMappingScheme(
     absl::Span<const int64> req_block_sizes, int64 num_threads_y,
     int64 num_threads_x, llvm::IRBuilder<>* b)
     : b_(b),
-      dims_in_elems_(dims_in_elems),
+      dims_in_elems_(dims_in_elems.begin(), dims_in_elems.end()),
       tile_sizes_{1, tile_size_y, tile_size_x},
       num_threads_x_(num_threads_x),
-      num_threads_y_(num_threads_y) {
+      num_threads_y_(num_threads_y),
+      dilated_x_(true) {
   DCHECK_EQ(dims_in_elems_.size(), 3);
   DCHECK_EQ(req_block_sizes.size(), 3);
 
@@ -170,14 +171,16 @@ IrArray::Index KernelMappingScheme::EmitBlockIndex(llvm::Type* index_ty) {
 
 IrArray::Index KernelMappingScheme::GetTileIndexForBlockOrigin(
     const IrArray::Index& block_index) {
-  IrArray::Index tile_index = block_index;
+  DCHECK_EQ(block_index.size(), block_sizes_.size());
+  std::vector<llvm::Value*> multidim;
+  multidim.reserve(block_sizes_.size());
   for (int i = 0; i < block_sizes_.size(); ++i) {
-    tile_index[i] = b_->CreateMul(
+    multidim.push_back(b_->CreateMul(
         block_index[i],
         llvm::ConstantInt::get(block_index[i]->getType(), block_sizes_[i]),
-        "block_origin." + std::to_string(i));
+        "block_origin." + std::to_string(i)));
   }
-  return tile_index;
+  return IrArray::Index(multidim, block_index[0]->getType());
 }
 
 IrArray::Index KernelMappingScheme::GetElementIndexForTileOrigin(
@@ -217,14 +220,14 @@ KernelMappingScheme::EmitThreadYXCoordinate(llvm::Type* index_ty) {
   // defined by (num_thread_y, num_thread_x) from thread_id.
   llvm::CallInst* thread_id_raw = llvm_ir::EmitCallToIntrinsic(
       llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x, {}, {}, b_);
-  llvm_ir::AddRangeMetadata(0, GetThreadsPerTile(), thread_id_raw);
+  llvm_ir::AddRangeMetadata(0, GetThreadsPerBlock(), thread_id_raw);
   llvm::Value* thread_id_int =
       b_->CreateIntCast(thread_id_raw, index_ty,
                         /*isSigned=*/true, "thread.id.x");
   llvm::Value* num_thread_x =
       llvm::ConstantInt::get(index_ty, GetNumberOfThreadsForDimensionX());
-  llvm::Value* x = b_->CreateURem(thread_id_int, num_thread_x);
-  llvm::Value* y = b_->CreateUDiv(thread_id_int, num_thread_x);
+  llvm::Value* x = b_->CreateURem(thread_id_int, num_thread_x, "thread.x");
+  llvm::Value* y = b_->CreateUDiv(thread_id_int, num_thread_x, "thread.y");
   return std::make_tuple(y, x);
 }
 
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h
index 06002d57b0d7daa07f903feebe67a60a083c0e7c..f802cc27d519e621262f328903697373aa8c284c 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h
@@ -90,15 +90,16 @@ class KernelMappingScheme {
   enum { DimZ = 0, DimY, DimX, DimTot };
 
  public:
+  KernelMappingScheme() {}
   // dims_in_elems: the normalized tensor dimensions.
   // req_block_sizes: the requested block size in number of tiles for each
   //   dimension. The actual block size is set to min(req_block_size,
   //   dims_in_number_of_blocks).
-  explicit KernelMappingScheme(absl::Span<const int64> dims_in_elems,
-                               int64 tile_size_y, int64 tile_size_x,
-                               absl::Span<const int64> req_block_sizes,
-                               int64 num_threads_y, int64 num_threads_x,
-                               llvm::IRBuilder<>* b);
+  KernelMappingScheme(absl::Span<const int64> dims_in_elems, int64 tile_size_y,
+                      int64 tile_size_x,
+                      absl::Span<const int64> req_block_sizes,
+                      int64 num_threads_y, int64 num_threads_x,
+                      llvm::IRBuilder<>* b);
 
   absl::Span<const int64> GetDimensionsInElements() const {
     return dims_in_elems_;
@@ -116,7 +117,10 @@ class KernelMappingScheme {
   int64 GetNumberOfTilesInOneBlock() const {
     return absl::c_accumulate(block_sizes_, 1, std::multiplies<int64>());
   }
-
+  int64 GetNumberOfTilesInOneBlockForDimension(int d) const {
+    DCHECK(d >= DimZ && d <= DimX);
+    return block_sizes_[d];
+  }
   int64 GetNumberOfBlocks() const {
     return absl::c_accumulate(dims_in_blocks_, 1, std::multiplies<int64>());
   }
@@ -133,15 +137,29 @@ class KernelMappingScheme {
   }
 
   absl::Span<const int64> GetBlockSizes() const { return block_sizes_; }
+  int64 GetTileBlockSizeForDimension(int d) const {
+    DCHECK(d >= DimZ && d <= DimX);
+    return dims_in_blocks_[d];
+  }
 
   int64 GetNumberOfThreadsForDimensionX() const { return num_threads_x_; }
   int64 GetNumberOfThreadsForDimensionY() const { return num_threads_y_; }
 
-  int64 GetThreadsPerTile() const {
+  int64 GetThreadsPerBlock() const {
     return GetNumberOfThreadsForDimensionX() *
            GetNumberOfThreadsForDimensionY();
   }
 
+  bool DilatedX() const { return dilated_x_; }
+  void SetDilatedX(bool v) {
+    dilated_x_ = v;
+    if (!dilated_x_) {
+      // dilated_x_=false is for the purpose of vectorization, which requires
+      // GetTileSizeForDimension(DimX) to be a multiplier of num_threads_x_.
+      CHECK_EQ(GetTileSizeForDimension(DimX) % num_threads_x_, 0);
+    }
+  }
+
   IrArray::Index EmitBlockIndex(llvm::Type* index_ty);
   // Returns the index for the first tile in the block with the given block
   // index.
@@ -163,7 +181,7 @@ class KernelMappingScheme {
  private:
   llvm::IRBuilder<>* b_;
   // The number of elements in each dimension.
-  absl::Span<const int64> dims_in_elems_;
+  std::vector<int64> dims_in_elems_;
 
   // The number of elements for each dimension of a tile.
   std::vector<int64> tile_sizes_;
@@ -181,6 +199,13 @@ class KernelMappingScheme {
   int64 num_threads_x_;
   // Number of threads used to process elements in the Y direction of a tile.
   int64 num_threads_y_;
+
+  // When num_threads_x threads process a total of tile_size_x elements in the
+  // X dimension of a tile, each threads process n=tile_size_x/num_threads_x
+  // elements. When dilated_x=false, the n elements processed by a thread are
+  // contiguous. On the other hand, when dilated_x=true the n elements are
+  // dilated by a factor of num_threads_x.
+  bool dilated_x_;
 };
 
 // A class to represent information for tiled parameters to support IR emission
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
index 219a9f221fbd116cdfbaf17985e21d82aefd079d..3a35405a2da0af386e01bb48bed56ad194048543 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
@@ -235,7 +234,7 @@ std::unique_ptr<ForLoop> ForLoopNest::AddLoop(int64 start_index,
 
 IrArray::Index ForLoopNest::AddLoopsForShape(const Shape& shape,
                                              absl::string_view suffix) {
-  std::vector<int64> dimensions(ShapeUtil::Rank(shape));
+  std::vector<int64> dimensions(shape.rank());
   std::iota(dimensions.begin(), dimensions.end(), 0);
   return AddLoopsForShapeOnDimensions(shape, dimensions, suffix);
 }
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
index ceea24685af566e02340664f0a40c398c62b5ab0..807296329c07b8e4ac630486a1e1f59e4fdfa009 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
@@ -188,7 +188,16 @@ llvm::Type* PrimitiveTypeToIrType(PrimitiveType element_type,
       }
       return cplx_t;
     }
-    // A Tuple contains an array of pointers. Use i8*.
+    case C128: {
+      auto cplx_t = module->getTypeByName("complex128");
+      if (cplx_t == nullptr) {
+        return llvm::StructType::create(
+            {llvm::Type::getDoubleTy(module->getContext()),
+             llvm::Type::getDoubleTy(module->getContext())},
+            "complex128", /*isPacked=*/true);
+      }
+      return cplx_t;
+    }  // A Tuple contains an array of pointers. Use i8*.
     case TUPLE:
     // An Opaque is like a void*, use i8*.
     case OPAQUE:
@@ -219,10 +228,10 @@ int GetSizeInBits(llvm::Type* type) {
 
 llvm::Type* ShapeToIrType(const Shape& shape, llvm::Module* module) {
   llvm::Type* result_type = PrimitiveTypeToIrType(shape.element_type(), module);
-  if (ShapeUtil::IsTuple(shape)) {
+  if (shape.IsTuple()) {
     // A tuple buffer is an array of pointers.
     result_type = llvm::ArrayType::get(result_type, shape.tuple_shapes_size());
-  } else if (ShapeUtil::IsArray(shape)) {
+  } else if (shape.IsArray()) {
     for (int64 dimension : LayoutUtil::MinorToMajor(shape)) {
       result_type =
           llvm::ArrayType::get(result_type, shape.dimensions(dimension));
@@ -621,6 +630,10 @@ llvm::Function* CreateFunction(llvm::FunctionType* function_type,
   function->setCallingConv(llvm::CallingConv::C);
   function->addFnAttr("no-frame-pointer-elim", "false");
 
+  // Generate unwind information so that GDB can crawl through the stack frames
+  // created by the JIT compiled code.
+  function->setHasUWTable();
+
   if (enable_fast_math) {
     function->addFnAttr("unsafe-fp-math", "true");
     function->addFnAttr("no-infs-fp-math", "true");
diff --git a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
index 0dc120e0b0df47f261435f490a8459b49d989b53..a689881e65ec3a7ddf606c36bdd64b749cfe358e 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
diff --git a/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc b/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc
index e22c2173c271fc9571be1ddb0759d2b31562dc98..d71addec9b7317dfe16e9d7e5380c3cfda0b8c06 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "llvm/ADT/APInt.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Value.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
@@ -45,13 +46,14 @@ namespace llvm_ir {
 namespace {
 
 // Adds the inner comparison loop body where we compare elements.
-void EmitCompareLoopBody(
-    int64 iteration_bound, PrimitiveType key_type, int64 num_values,
-    int64 iota_values_parameter_index, llvm::Value* element_pair_index,
+Status EmitCompareLoopBody(
+    int64 iteration_bound, int64 num_values, llvm::Value* element_pair_index,
     int64 xor_mask, llvm::Type* index_type,
-    std::function<llvm::Value*(int64 operand, llvm::Value* index)> read_element,
+    std::function<llvm::Value*(int64 operand, llvm::Value* index)>
+        element_address,
     std::function<void(int64 operand, llvm::Value* index, llvm::Value* value)>
         write_element,
+    const EmitCallToNestedComputationCallback& emit_compare_callback,
     llvm::IRBuilder<>* b, bool needs_bounds_checks = true) {
   auto index_typed_constant = [&](int64 value) {
     return llvm::ConstantInt::get(index_type, value);
@@ -108,74 +110,44 @@ void EmitCompareLoopBody(
 
   // if (is_smaller_index && index_is_inbounds)
   KernelSupportLibrary ksl(b);
-  ksl.IfReturnVoid("smaller_comparison_index", do_comparison, [&]() {
-    auto key1 = read_element(0, current_keys_index);
-    auto key2 = read_element(0, compare_keys_index);
-    auto compare_key1 = key1;
-    auto compare_key2 = key2;
-    bool is_signed_comparison = true;
-    if (primitive_util::IsFloatingPointType(key_type)) {
-      // We would like a total order of floating point numbers so that the
-      // sort has a predictable behavior in the presence of NaNs. Rather
-      // than using floating point comparison, we use the following trick:
-      // If f is a float, and
-      // x = bit_cast<int32>(f);
-      // y = x < 0 ? 0x7FFFFFFF - x : x;
-      // then y is ordered as an int32 such that finite values have the
-      // obvious order, -0 is ordered before 0, and -NaN and NaN appear at
-      // the beginning and end of the ordering.
-      auto k = b->getInt(llvm::APInt::getSignedMaxValue(
-          key1->getType()->getPrimitiveSizeInBits()));
-      auto comparison_type = k->getType();
-      auto zero = llvm::ConstantInt::get(comparison_type, 0);
-      auto maybe_flip = [&](llvm::Value* v) {
-        return b->CreateSelect(b->CreateICmp(llvm::ICmpInst::ICMP_SLT, v, zero),
-                               b->CreateSub(k, v), v);
-      };
-      compare_key1 = b->CreateBitCast(key1, comparison_type);
-      compare_key2 = b->CreateBitCast(key2, comparison_type);
-      compare_key1 = maybe_flip(compare_key1);
-      compare_key2 = maybe_flip(compare_key2);
-    } else if (!primitive_util::IsSignedIntegralType(key_type)) {
-      is_signed_comparison = false;
-    }
-    // If key2 < key1
-    auto is_smaller_than =
-        b->CreateICmp(is_signed_comparison ? llvm::ICmpInst::ICMP_SLT
-                                           : llvm::ICmpInst::ICMP_ULT,
-                      compare_key2, compare_key1);
-    if (iota_values_parameter_index >= 0) {
-      auto keys_equal = b->CreateICmpEQ(compare_key1, compare_key2);
-      auto key_index1 =
-          read_element(iota_values_parameter_index, current_keys_index);
-      auto key_index2 =
-          read_element(iota_values_parameter_index, compare_keys_index);
-      auto index_is_smaller_than =
-          b->CreateICmp(llvm::ICmpInst::ICMP_ULT, key_index2, key_index1);
-      is_smaller_than = b->CreateOr(
-          is_smaller_than, b->CreateAnd(keys_equal, index_is_smaller_than));
+  return ksl.IfWithStatus("smaller_comparison_index", do_comparison, [&]() {
+    std::vector<llvm::Value*> values_to_compare;
+    for (int i = 0; i < num_values; ++i) {
+      values_to_compare.push_back(element_address(i, compare_keys_index));
+      values_to_compare.push_back(element_address(i, current_keys_index));
     }
-    ksl.IfReturnVoid("is_smaller_than", is_smaller_than, [&]() {
-      // Swap key1 with key2.
-      write_element(0, current_keys_index, key2);
-      write_element(0, compare_keys_index, key1);
-      for (int64 i = 1; i <= num_values; ++i) {
-        // Also swap the values.
-        auto value1 = read_element(i, current_keys_index);
-        auto value2 = read_element(i, compare_keys_index);
-        write_element(i, current_keys_index, value2);
-        write_element(i, compare_keys_index, value1);
+    llvm::Module* module = b->GetInsertBlock()->getParent()->getParent();
+    llvm::Value* compare_return_buffer = llvm_ir::EmitAllocaAtFunctionEntry(
+        llvm_ir::PrimitiveTypeToIrType(PRED, module), "compare_return_buffer",
+        b);
+    TF_RETURN_IF_ERROR(
+        emit_compare_callback(values_to_compare, compare_return_buffer));
+    llvm::Value* result = b->CreateLoad(compare_return_buffer);
+
+    // Check if the 'compare' function returns true.
+    llvm::Value* is_smaller_than =
+        b->CreateICmpNE(result, llvm::ConstantInt::get(result->getType(), 0),
+                        "boolean_predicate");
+    ksl.If("is_smaller_than", is_smaller_than, [&]() {
+      for (int64 i = 0; i < num_values; ++i) {
+        // Swap the values.
+        auto value1 = b->CreateLoad(values_to_compare[i * 2]);
+        auto value2 = b->CreateLoad(values_to_compare[i * 2 + 1]);
+        write_element(i, current_keys_index, value1);
+        write_element(i, compare_keys_index, value2);
       }
     });
+    return Status::OK();
   });
 }
 
-void EmitTiledCompareLoop(
+Status EmitTiledCompareLoop(
     const IrArray::Index& tiled_keys_index, int64 dimension_to_sort,
-    int64 dimension_to_sort_bound, PrimitiveType keys_type,
-    absl::Span<const int64> xor_masks, const std::vector<IrArray>& params,
-    const std::vector<llvm::Value*>& param_shmem_buffers,
-    int64 iota_values_parameter_index, int64 tile_size, llvm::IRBuilder<>* b) {
+    int64 dimension_to_sort_bound, absl::Span<const int64> xor_masks,
+    const std::vector<IrArray>& params,
+    const std::vector<llvm::Value*>& param_shmem_buffers, int64 tile_size,
+    const EmitCallToNestedComputationCallback& emit_compare_callback,
+    llvm::IRBuilder<>* b) {
   KernelSupportLibrary ksl(b);
   llvm::Value* thread_id = llvm_ir::EmitCallToIntrinsic(
       llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x, {}, {}, b);
@@ -192,7 +164,7 @@ void EmitTiledCompareLoop(
             b->CreateShl(tiled_keys_index[dimension_to_sort], value_one);
         // We want to copy two adjacent elements. We first check whether the
         // first index position is within bounds.
-        ksl.IfReturnVoid(
+        ksl.If(
             "smaller_keys_index",
             b->CreateICmpSLT(current_keys_index,
                              tiled_keys_index.GetConstantWithIndexType(
@@ -200,18 +172,17 @@ void EmitTiledCompareLoop(
             [&]() {
               auto cache_index = b->CreateShl(thread_id, value_one);
               read_or_write(cache_index, current_keys_index);
-              // Increment to go the next index position.
+              // Increment to go to the next index position.
               current_keys_index = b->CreateAdd(current_keys_index, value_one);
               // Here we check whether the next index position is within bounds.
-              ksl.IfReturnVoid(
-                  "inner_smaller_keys_index",
-                  b->CreateICmpSLT(current_keys_index,
-                                   tiled_keys_index.GetConstantWithIndexType(
-                                       dimension_to_sort_bound)),
-                  [&]() {
-                    cache_index = b->CreateAdd(cache_index, value_one);
-                    read_or_write(cache_index, current_keys_index);
-                  });
+              ksl.If("inner_smaller_keys_index",
+                     b->CreateICmpSLT(current_keys_index,
+                                      tiled_keys_index.GetConstantWithIndexType(
+                                          dimension_to_sort_bound)),
+                     [&]() {
+                       cache_index = b->CreateAdd(cache_index, value_one);
+                       read_or_write(cache_index, current_keys_index);
+                     });
             });
       };
 
@@ -231,10 +202,18 @@ void EmitTiledCompareLoop(
   llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_barrier0, {}, {}, b);
 
   // Now emit the bodies of the comparison loops.
-  auto read_element = [&](int64 operand, llvm::Value* index) {
-    return b->CreateLoad(
+  auto element_address = [&](int64 operand, llvm::Value* index) {
+    auto shared_memory_address =
         b->CreateGEP(param_shmem_buffers[operand],
-                     {tiled_keys_index.GetConstantWithIndexType(0), index}));
+                     {tiled_keys_index.GetConstantWithIndexType(0), index});
+    auto ptr_type = shared_memory_address->getType();
+    // We need a generic pointer with address space 0 instead of a pointer to
+    // shared memory (address space 3) so that we can pass it to the comparison
+    // computation.
+    return b->CreateAddrSpaceCast(
+        shared_memory_address,
+        llvm::PointerType::get(ptr_type->getPointerElementType(),
+                               /*AddressSpace=*/0));
   };
   auto write_element = [&](int64 operand, llvm::Value* index,
                            llvm::Value* value) {
@@ -253,7 +232,7 @@ void EmitTiledCompareLoop(
     if (dimension_to_sort_bound % tile_size) {
       // Otherwise we need a bounds check for the last tile. The last tile has
       // size 'dimension_to_sort_bound' % 'tile_size'.
-      ksl.IfReturnVoid(
+      TF_RETURN_IF_ERROR(ksl.IfWithStatus(
           "is_last_tile",
           b->CreateICmpUGE(
               b->CreateMul(tiled_keys_index[dimension_to_sort],
@@ -261,24 +240,24 @@ void EmitTiledCompareLoop(
               tiled_keys_index.GetConstantWithIndexType(
                   RoundDownToNearest(dimension_to_sort_bound, tile_size))),
           [&]() {
-            EmitCompareLoopBody(dimension_to_sort_bound % tile_size, keys_type,
-                                params.size() - 1, iota_values_parameter_index,
-                                element_pair_index, xor_mask,
-                                tiled_keys_index.GetType(), read_element,
-                                write_element, b);
+            return EmitCompareLoopBody(
+                dimension_to_sort_bound % tile_size, params.size(),
+                element_pair_index, xor_mask, tiled_keys_index.GetType(),
+                element_address, write_element, emit_compare_callback, b);
           },
           [&]() {
-            EmitCompareLoopBody(tile_size, keys_type, params.size() - 1,
-                                iota_values_parameter_index, element_pair_index,
-                                xor_mask, tiled_keys_index.GetType(),
-                                read_element, write_element, b,
-                                /*needs_bounds_checks=*/false);
-          });
+            return EmitCompareLoopBody(
+                tile_size, params.size(), element_pair_index, xor_mask,
+                tiled_keys_index.GetType(), element_address, write_element,
+                emit_compare_callback, b,
+                /*needs_bounds_checks=*/false);
+          }));
     } else {
-      EmitCompareLoopBody(tile_size, keys_type, params.size() - 1,
-                          iota_values_parameter_index, element_pair_index,
-                          xor_mask, tiled_keys_index.GetType(), read_element,
-                          write_element, b, /*needs_bounds_checks=*/false);
+      TF_RETURN_IF_ERROR(EmitCompareLoopBody(
+          tile_size, params.size(), element_pair_index, xor_mask,
+          tiled_keys_index.GetType(), element_address, write_element,
+          emit_compare_callback, b,
+          /*needs_bounds_checks=*/false));
     }
     // Wait until all comparisons have happened.
     llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_barrier0, {}, {}, b);
@@ -302,17 +281,16 @@ void EmitTiledCompareLoop(
   // same location in shared memory because we have exactly tile_size / 2 many
   // threads, and the linear index calculated by ParallelLoopEmitter uses
   // linear_index = blockIdx.x * blockDim.x + threadIdx.x;
+  return Status::OK();
 }
 }  // namespace
 
-Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array,
-                       const std::vector<IrArray>& values_arrays,
-                       int64 iota_values_parameter_index,
-                       absl::string_view name,
-                       absl::Span<const int64> xor_masks, llvm::IRBuilder<>* b,
-                       const gpu::LaunchDimensions& launch_dimensions,
-                       int64 num_iterations_in_sort_dim,
-                       const int64 tile_size) {
+Status EmitSortInPlace(
+    int64 dimension_to_sort, const std::vector<IrArray>& values_arrays,
+    absl::string_view name, absl::Span<const int64> xor_masks,
+    llvm::IRBuilder<>* b, const gpu::LaunchDimensions& launch_dimensions,
+    int64 num_iterations_in_sort_dim, const int64 tile_size,
+    const EmitCallToNestedComputationCallback& emit_compare_callback) {
   // Iterate through the keys shape in physical order, but skip the dimension to
   // sort and make it the innermost loop which is the loop where the comparisons
   // happen. In the dimension to sort, if we use tiling, we iterate through it
@@ -322,8 +300,8 @@ Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array,
   // within those 64 elements and are therefore independent of the other
   // comparisons).
 
-  const Shape& keys_shape = keys_array.GetShape();
-  int64 rank = ShapeUtil::Rank(keys_shape);
+  const Shape& keys_shape = values_arrays[0].GetShape();
+  int64 rank = keys_shape.rank();
   int64 dimension_to_sort_bound = keys_shape.dimensions(dimension_to_sort);
   std::vector<int64> dimensions_in_iteration_order(rank);
   std::vector<int64> iteration_order_to_logical_order(rank);
@@ -339,18 +317,16 @@ Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array,
 
   Shape iteration_shape = ShapeUtil::MakeShape(keys_shape.element_type(),
                                                dimensions_in_iteration_order);
-  std::vector<IrArray> params(1, keys_array);
-  params.insert(params.end(), values_arrays.begin(), values_arrays.end());
 
   // Allocate shared memory for the tiled compare loop.
-  std::vector<llvm::Value*> param_shmem_buffers(params.size(), nullptr);
+  std::vector<llvm::Value*> param_shmem_buffers(values_arrays.size(), nullptr);
   if (xor_masks.size() > 1) {
     llvm::Module* module = b->GetInsertBlock()->getParent()->getParent();
-    for (int64 i = 0; i < params.size(); ++i) {
-      llvm::Type* tile_type =
-          llvm::ArrayType::get(llvm_ir::PrimitiveTypeToIrType(
-                                   params[i].GetShape().element_type(), module),
-                               tile_size);
+    for (int64 i = 0; i < values_arrays.size(); ++i) {
+      llvm::Type* tile_type = llvm::ArrayType::get(
+          llvm_ir::PrimitiveTypeToIrType(
+              values_arrays[i].GetShape().element_type(), module),
+          tile_size);
       param_shmem_buffers[i] = llvm_ir::AllocateSharedMemoryTile(
           module, tile_type, absl::StrCat(name, "_tile_param_", i));
     }
@@ -377,25 +353,24 @@ Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array,
       keys_index[iteration_order_to_logical_order[i]] = tiles_index[i];
     }
     if (xor_masks.size() > 1) {
-      EmitTiledCompareLoop(keys_index, dimension_to_sort,
-                           dimension_to_sort_bound, keys_shape.element_type(),
-                           xor_masks, params, param_shmem_buffers,
-                           iota_values_parameter_index, tile_size, b);
+      TF_RETURN_IF_ERROR(EmitTiledCompareLoop(
+          keys_index, dimension_to_sort, dimension_to_sort_bound, xor_masks,
+          values_arrays, param_shmem_buffers, tile_size, emit_compare_callback,
+          b));
     } else {
-      auto read_element = [&](int64 operand, llvm::Value* index) {
+      auto element_address = [&](int64 operand, llvm::Value* index) {
         keys_index[dimension_to_sort] = index;
-        return params[operand].EmitReadArrayElement(keys_index, b);
+        return values_arrays[operand].EmitArrayElementAddress(keys_index, b);
       };
       auto write_element = [&](int64 operand, llvm::Value* index,
                                llvm::Value* value) {
         keys_index[dimension_to_sort] = index;
-        params[operand].EmitWriteArrayElement(keys_index, value, b);
+        values_arrays[operand].EmitWriteArrayElement(keys_index, value, b);
       };
-      EmitCompareLoopBody(dimension_to_sort_bound, keys_shape.element_type(),
-                          values_arrays.size(), iota_values_parameter_index,
-                          tiles_index[rank - 1], xor_masks[0],
-                          tiles_index.GetType(), read_element, write_element,
-                          b);
+      TF_RETURN_IF_ERROR(EmitCompareLoopBody(
+          dimension_to_sort_bound, values_arrays.size(), tiles_index[rank - 1],
+          xor_masks[0], tiles_index.GetType(), element_address, write_element,
+          emit_compare_callback, b));
     }
     return Status::OK();
   };
diff --git a/tensorflow/compiler/xla/service/llvm_ir/sort_util.h b/tensorflow/compiler/xla/service/llvm_ir/sort_util.h
index 685f9383acba416f51681270e4037d56abb4b6ea..b9341a34d1f2203db6e02c3df5d607174b6d0f74 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/sort_util.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/sort_util.h
@@ -28,19 +28,18 @@ limitations under the License.
 
 namespace xla {
 namespace llvm_ir {
+using EmitCallToNestedComputationCallback =
+    std::function<Status(absl::Span<llvm::Value* const>, llvm::Value*)>;
 // Emits llvm IR to do pairwise comparisons/swaps in the 'dimension_to_sort'
-// dimension of 'keys_array'. All other dimensions are kept as-is. This
-// implements the inner loop of BitonicSort. It is assumed that 'xor_masks'
-// contains only powers of 2, or values 2^k - 1 (k > 0). If
-// 'iota_values_parameter_index' is >= 0, it points at a 'values_arrays' operand
-// that is a iota and can be used to make the sorting stable.
-Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array,
-                       const std::vector<IrArray>& values_arrays,
-                       int64 iota_values_parameter_index,
-                       absl::string_view name,
-                       absl::Span<const int64> xor_masks, llvm::IRBuilder<>* b,
-                       const gpu::LaunchDimensions& launch_dimensions,
-                       int64 num_iterations_in_sort_dim, int64 tile_size);
+// dimension of each array in 'values_arrays'. All other dimensions are kept
+// as-is. This implements the inner loop of BitonicSort. It is assumed that
+// 'xor_masks' contains only powers of 2, or values 2^k - 1 (k > 0).
+Status EmitSortInPlace(
+    int64 dimension_to_sort, const std::vector<IrArray>& values_arrays,
+    absl::string_view name, absl::Span<const int64> xor_masks,
+    llvm::IRBuilder<>* b, const gpu::LaunchDimensions& launch_dimensions,
+    int64 num_iterations_in_sort_dim, int64 tile_size,
+    const EmitCallToNestedComputationCallback& emit_compare_callback);
 }  // namespace llvm_ir
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc b/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc
index a60643bc754f896d096b3ca4e1216e77d7e384c6..d8d2700e1934fd202d44a1dc60e71a99913d4537 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc
@@ -93,7 +93,7 @@ llvm::Value* EmitGetTupleElement(const Shape& target_shape, int64 index,
   llvm::LoadInst* src_buffer = b->CreateLoad(element_ptr);
 
   // Mark the loaded pointer as dereferenceable if we know its shape.
-  if (!ShapeUtil::IsOpaque(target_shape)) {
+  if (!target_shape.IsOpaque()) {
     SetDereferenceableMetadataForLoad(
         src_buffer,
         ByteSizeOf(target_shape, src_buffer->getModule()->getDataLayout()));
diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc
index 6c89700983363fec46c41b5430c6eab6b366a1b6..3470fe5b2c34bf832207ed546fad176319446f31 100644
--- a/tensorflow/compiler/xla/service/local_service.cc
+++ b/tensorflow/compiler/xla/service/local_service.cc
@@ -52,8 +52,10 @@ namespace xla {
   }
 
   BackendOptions backend_options;
-  backend_options.set_platform(platform).set_intra_op_parallelism_threads(
-      options.intra_op_parallelism_threads());
+  backend_options.set_platform(platform)
+      .set_intra_op_parallelism_threads(options.intra_op_parallelism_threads())
+      .set_allowed_devices(options.allowed_devices());
+
   TF_ASSIGN_OR_RETURN(std::unique_ptr<Backend> backend,
                       Backend::CreateBackend(backend_options));
 
@@ -108,6 +110,7 @@ ExecutionOptions CreateExecutionOptions(
     *execution_options.mutable_shape_with_output_layout() =
         result_shape.ToProto();
   }
+  execution_options.set_num_replicas(build_options.num_replicas());
   return execution_options;
 }
 
diff --git a/tensorflow/compiler/xla/service/multi_output_fusion.cc b/tensorflow/compiler/xla/service/multi_output_fusion.cc
index 9ccdd7d8d818b9fa3aa77cdd10d37ca18928b448..53d52d9a3d918fa6dee093668923fcfff963d084 100644
--- a/tensorflow/compiler/xla/service/multi_output_fusion.cc
+++ b/tensorflow/compiler/xla/service/multi_output_fusion.cc
@@ -198,7 +198,7 @@ void MultiOutputFusion::Update(HloInstruction* instr1, HloInstruction* instr2) {
     if (instr == fusion || is_fused(instr) || is_connected(fusion, instr)) {
       continue;
     }
-    if (in_list.count(instr) > 0) {
+    if (in_list.contains(instr)) {
       continue;
     }
     int64 profit = GetProfit(instr, fusion);
diff --git a/tensorflow/compiler/xla/service/name_uniquer.cc b/tensorflow/compiler/xla/service/name_uniquer.cc
index ac2f79674feceff436c0e9c65338967f498e4473..e55b83d17e90bc2ca0053a0421cf80ef6edd5bca 100644
--- a/tensorflow/compiler/xla/service/name_uniquer.cc
+++ b/tensorflow/compiler/xla/service/name_uniquer.cc
@@ -15,8 +15,10 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/name_uniquer.h"
 
+#include "absl/strings/ascii.h"
 #include "absl/strings/numbers.h"
 #include "absl/strings/str_cat.h"
+#include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
@@ -27,13 +29,13 @@ namespace {
 
 bool IsAllowed(char character) {
   auto c = static_cast<unsigned char>(character);
-  return (isalnum(c) != 0) || c == '_' || c == '.' || c == '-';
+  return (absl::ascii_isalnum(c) != 0) || c == '_' || c == '.' || c == '-';
 }
 
 }  // namespace
 
 NameUniquer::NameUniquer(const string& separator) {
-  CHECK(std::all_of(separator.begin(), separator.end(), IsAllowed))
+  CHECK(absl::c_all_of(separator, IsAllowed))
       << "separator should comprises allowed characters only";
   separator_ = separator;
 }
@@ -42,9 +44,10 @@ NameUniquer::NameUniquer(const string& separator) {
   if (name.empty()) {
     return "";
   }
+
   string result = name;
   char c = static_cast<unsigned char>(result[0]);
-  if (!isalpha(c) && c != '_') {
+  if (!absl::ascii_isalpha(c) && c != '_') {
     result[0] = '_';
   }
   for (int i = 1; i < result.length(); i++) {
@@ -52,6 +55,13 @@ NameUniquer::NameUniquer(const string& separator) {
       result[i] = '_';
     }
   }
+
+  // HLO primitive type names (with the exception of 'tuple') are keywords in
+  // the HLO text representation and cannot be names, so append an underscore if
+  // the name is a primitive type.
+  if (primitive_util::IsPrimitiveTypeName(result) && result != "tuple") {
+    result += "_";
+  }
   return result;
 }
 
diff --git a/tensorflow/compiler/xla/service/name_uniquer_test.cc b/tensorflow/compiler/xla/service/name_uniquer_test.cc
index 3e2592c6ac626143f1421e545a31d9be91e376bc..d0d04147e0c29c66cba447550c0a9c703f35573a 100644
--- a/tensorflow/compiler/xla/service/name_uniquer_test.cc
+++ b/tensorflow/compiler/xla/service/name_uniquer_test.cc
@@ -104,5 +104,21 @@ TEST_F(NameUniquerTest, KeepNamesInRandomOrder) {
   EXPECT_EQ("foo.3", uniquer.GetUniqueName("foo.3"));
 }
 
+TEST_F(NameUniquerTest, AvoidKeywords) {
+  NameUniquer uniquer(".");
+
+  EXPECT_EQ("f32_", uniquer.GetUniqueName("f32"));
+  EXPECT_EQ("s64_", uniquer.GetUniqueName("s64"));
+  EXPECT_EQ("pred_", uniquer.GetUniqueName("pred"));
+
+  // Though a primitive type, "tuple" is not a keyword.
+  EXPECT_EQ("tuple", uniquer.GetUniqueName("tuple"));
+
+  // Keywords are not capitalized.
+  EXPECT_EQ("F32", uniquer.GetUniqueName("F32"));
+  EXPECT_EQ("S32", uniquer.GetUniqueName("S32"));
+  EXPECT_EQ("Pred", uniquer.GetUniqueName("Pred"));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/op_expander_pass.cc b/tensorflow/compiler/xla/service/op_expander_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..02c9d4b387b112be39c204d35fe4fa1013ed064c
--- /dev/null
+++ b/tensorflow/compiler/xla/service/op_expander_pass.cc
@@ -0,0 +1,47 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/op_expander_pass.h"
+
+#include <utility>
+
+#include "absl/algorithm/container.h"
+#include "tensorflow/compiler/xla/service/hlo_creation_utils.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/util.h"
+
+namespace xla {
+
+StatusOr<bool> OpExpanderPass::Run(HloModule* module) {
+  std::vector<HloInstruction*> matching_instructions;
+  for (HloComputation* computation : module->MakeNonfusionComputations()) {
+    absl::c_copy_if(
+        computation->instructions(), std::back_inserter(matching_instructions),
+        [&](HloInstruction* inst) { return InstructionMatchesPattern(inst); });
+  }
+
+  for (HloInstruction* inst : matching_instructions) {
+    TF_ASSIGN_OR_RETURN(HloInstruction * expanded_root,
+                        ExpandInstruction(inst));
+    if (expanded_root == nullptr) {
+      continue;
+    }
+    TF_RETURN_IF_ERROR(inst->parent()->ReplaceInstruction(inst, expanded_root));
+  }
+
+  return !matching_instructions.empty();
+}
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/op_expander_pass.h b/tensorflow/compiler/xla/service/op_expander_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..276e3d70b8ecd8742e0b277698765063198fe872
--- /dev/null
+++ b/tensorflow/compiler/xla/service/op_expander_pass.h
@@ -0,0 +1,45 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_OP_EXPANDER_PASS_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_OP_EXPANDER_PASS_H_
+
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+
+// This pass is an abstract superclass for passes that replace operations that
+// match a pattern. It is intended to be subclassed, not used directly.
+//
+// This pass is useful for legalizing HLO instructions that a particular backend
+// does not support into other HLO instructions.
+class OpExpanderPass : public HloModulePass {
+ public:
+  StatusOr<bool> Run(HloModule* module) override;
+
+ protected:
+  // Returns `true` if `instruction` should be expanded by this pass.
+  virtual bool InstructionMatchesPattern(HloInstruction* instruction) = 0;
+
+  // Returns a replacement for `instruction`, or nullptr if no replacement is
+  // neeeded (e.g. only the to_apply subcomputation of the instruction was
+  // modified).
+  virtual StatusOr<HloInstruction*> ExpandInstruction(
+      HloInstruction* instruction) = 0;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_OP_EXPANDER_PASS_H_
diff --git a/tensorflow/compiler/xla/service/optimize_input_output_buffer_alias.cc b/tensorflow/compiler/xla/service/optimize_input_output_buffer_alias.cc
new file mode 100644
index 0000000000000000000000000000000000000000..701c629add52a217f16877a085b9ef2d096623d9
--- /dev/null
+++ b/tensorflow/compiler/xla/service/optimize_input_output_buffer_alias.cc
@@ -0,0 +1,106 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/compiler/xla/service/optimize_input_output_buffer_alias.h"
+
+#include <queue>
+#include <string>
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/memory/memory.h"
+#include "absl/strings/str_format.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace {
+
+// Returns true if the given shape is a non-nested tuple.
+bool IsNonNestedTuple(const Shape& shape) {
+  return shape.IsTuple() && !ShapeUtil::IsNestedTuple(shape);
+}
+
+}  // namespace
+
+StatusOr<bool> OptimizeInputOutputBufferAlias::Build(
+    const Shape& input_shape, const Shape& output_shape,
+    HloInputOutputAliasConfig* alias_config) {
+  bool changed = false;
+  TF_RET_CHECK(LayoutUtil::HasLayout(input_shape));
+  TF_RET_CHECK(LayoutUtil::HasLayout(output_shape));
+  VLOG(1) << "input_shape:" << input_shape.ToString();
+  VLOG(1) << "output_shape:" << output_shape.ToString();
+
+  // For all buffers defined by the parameter, build a map from the byte
+  // size to the list of the buffers of that size.
+  absl::flat_hash_map<int64, std::queue<ShapeIndex>> size_to_input_index;
+  ShapeUtil::ForEachSubshape(
+      input_shape, [&](const Shape& subshape, const ShapeIndex& index) {
+        if (subshape.IsTuple()) {
+          return;
+        }
+        int64 bytes = size_func_(subshape);
+        size_to_input_index[bytes].push(index);
+      });
+
+  // For each result buffer shape index, take the first unused parameter
+  // buffer that matches the size.
+  TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
+      output_shape, [&](const Shape& subshape, const ShapeIndex& index) {
+        if (subshape.IsTuple()) {
+          return Status::OK();
+        }
+        int64 bytes = size_func_(subshape);
+
+        auto it = size_to_input_index.find(bytes);
+        if (it != size_to_input_index.end() && !it->second.empty()) {
+          changed = true;
+          const ShapeIndex& input_index = it->second.front();
+          const ShapeIndex& output_index = index;
+          if (!alias_config->ParameterHasAlias(0, input_index) &&
+              !alias_config->OutputHasAlias(output_index)) {
+            TF_RETURN_IF_ERROR(alias_config->SetUpAlias(
+                output_index, 0, input_index,
+                HloInputOutputAliasConfig::AliasKind::kSystemAlias));
+          }
+          VLOG(3) << "Set up alias from with param index "
+                  << it->second.front().ToString() << ", shape size " << bytes
+                  << " and result subshape "
+                  << ShapeUtil::HumanStringWithLayout(subshape) << " at index "
+                  << index.ToString();
+          it->second.pop();
+        }
+        return Status::OK();
+      }));
+  return changed;
+}
+
+StatusOr<bool> OptimizeInputOutputBufferAlias::Run(HloModule* module) {
+  // User buffer alias only work for modules with 1 parameter.
+  if (module->entry_computation()->num_parameters() != 1) {
+    return false;
+  }
+
+  HloInputOutputAliasConfig* alias_config =
+      &module->input_output_alias_config();
+
+  return Build(module->entry_computation()->parameter_instruction(0)->shape(),
+               module->entry_computation()->root_instruction()->shape(),
+               alias_config);
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/optimize_input_output_buffer_alias.h b/tensorflow/compiler/xla/service/optimize_input_output_buffer_alias.h
new file mode 100644
index 0000000000000000000000000000000000000000..79ce468e975300ed703ae0fd780f4b9d5328a4b3
--- /dev/null
+++ b/tensorflow/compiler/xla/service/optimize_input_output_buffer_alias.h
@@ -0,0 +1,71 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_OPTIMIZE_INPUT_OUTPUT_BUFFER_ALIAS_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_OPTIMIZE_INPUT_OUTPUT_BUFFER_ALIAS_H_
+
+#include <memory>
+
+#include "absl/container/flat_hash_map.h"
+#include "tensorflow/compiler/xla/service/hlo_input_output_alias_config.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/compiler/xla/shape_tree.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+
+// This pass opportunistically finds input and output buffers that can be
+// aliased, and writes the alias config into the HloModule.
+//
+// The input and the output buffers can be in any shape, and each output buffer
+// can alias with an input buffer with the same size. Each input buffer may only
+// alias with a single output buffer. For example, for the following parameter
+// and the output buffers,
+//
+//  Parameters : { P1(2MiB), P2(4MiB), P3(8MiB), P4(4MiB), P5(4MiB), ... }
+//  Outputs    : { O1(4MiB), O2(2MiB), O3(4MiB), O4(6MiB), O5(4MiB), ... }
+//
+// one potential aliasing would be (O1, P2), (O2, P1), (O3, P4), (O5, P5), ..
+class OptimizeInputOutputBufferAlias : public HloModulePass {
+  using ShapeSizeFunction = std::function<int64(const Shape&)>;
+
+ public:
+  OptimizeInputOutputBufferAlias(ShapeSizeFunction size_func)
+      : size_func_(size_func) {}
+  ~OptimizeInputOutputBufferAlias() override = default;
+
+  absl::string_view name() const override {
+    return "optimize_input_output_buffer_alias.h";
+  }
+
+  StatusOr<bool> Run(HloModule* module) override;
+
+ private:
+  friend class OptimizeInputOutputBufferAliasTest;
+
+  StatusOr<bool> Build(const Shape& input_shape, const Shape& output_shape,
+                       HloInputOutputAliasConfig* alias_config);
+  ShapeSizeFunction size_func_ = nullptr;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_OPTIMIZE_INPUT_OUTPUT_BUFFER_ALIAS_H_
diff --git a/tensorflow/compiler/xla/service/optimize_input_output_buffer_alias_test.cc b/tensorflow/compiler/xla/service/optimize_input_output_buffer_alias_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..41e90f9b6931619fd9824e2eda25e12e4c7197b0
--- /dev/null
+++ b/tensorflow/compiler/xla/service/optimize_input_output_buffer_alias_test.cc
@@ -0,0 +1,145 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/optimize_input_output_buffer_alias.h"
+
+#include <memory>
+
+#include "absl/memory/memory.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_utils.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+
+// Tests that UserBufferAlias properly maps input and output buffer indices of
+// various shapes for aliasing.
+class OptimizeInputOutputBufferAliasTest : public HloTestBase {
+ protected:
+  OptimizeInputOutputBufferAliasTest() {
+    r1f32_ = ShapeUtil::MakeShape(F32, {4});
+    r2f32_ = ShapeUtil::MakeShape(F32, {4, 5});
+    r3f32_ = ShapeUtil::MakeShape(F32, {4, 5, 6});
+    r4f32_ = ShapeUtil::MakeShape(F32, {4, 5, 6, 7});
+
+    auto size_func = [](const Shape& shape) {
+      return ShapeUtil::ByteSizeOf(shape);
+    };
+
+    optimize_pass_ =
+        absl::make_unique<OptimizeInputOutputBufferAlias>(size_func);
+  }
+
+  // Returns the number of output indices that aliases with the input.
+  int64 AliasCount() {
+    int64 count = 0;
+
+    config_.ForEachAlias(
+        [&](const ShapeIndex&, const HloInputOutputAliasConfig::Alias&) {
+          count++;
+        });
+    return count;
+  }
+
+  bool BuildAliasConfig(const Shape& input_shape, const Shape& output_shape) {
+    config_ = HloInputOutputAliasConfig(output_shape);
+    auto changed = optimize_pass_->Build(input_shape, output_shape, &config_);
+    TF_CHECK_OK(changed.status());
+
+    return changed.ValueOrDie();
+  }
+
+  std::unique_ptr<OptimizeInputOutputBufferAlias> optimize_pass_;
+
+  HloInputOutputAliasConfig config_;
+
+  Shape r1f32_;
+  Shape r2f32_;
+  Shape r3f32_;
+  Shape r4f32_;
+};
+
+// All shapes are different, so no aliasing is available.
+TEST_F(OptimizeInputOutputBufferAliasTest, AllDifferentBufferSizes) {
+  Shape input = ShapeUtil::MakeTupleShape({r1f32_, r2f32_});
+  Shape output = ShapeUtil::MakeTupleShape({r3f32_, r4f32_});
+  bool changed = BuildAliasConfig(input, output);
+  EXPECT_FALSE(changed);
+  EXPECT_EQ(AliasCount(), 0);
+}
+
+// Input and output shapes are equal, so buffers can alias at the same index.
+TEST_F(OptimizeInputOutputBufferAliasTest, OrderedNonNestedTuple) {
+  Shape input = ShapeUtil::MakeTupleShape({r1f32_, r2f32_, r3f32_, r4f32_});
+  Shape output = ShapeUtil::MakeTupleShape({r1f32_, r2f32_, r3f32_, r4f32_});
+  bool changed = BuildAliasConfig(input, output);
+  EXPECT_TRUE(changed);
+  EXPECT_EQ(AliasCount(), 4);
+
+  EXPECT_EQ(config_.GetAliasedOutput(0, {0}), ShapeIndex{0});
+  EXPECT_EQ(config_.GetAliasedOutput(0, {1}), ShapeIndex{1});
+  EXPECT_EQ(config_.GetAliasedOutput(0, {2}), ShapeIndex{2});
+  EXPECT_EQ(config_.GetAliasedOutput(0, {3}), ShapeIndex{3});
+}
+
+// Only a subset of the tuple element shapes match between the input and the
+// output.
+TEST_F(OptimizeInputOutputBufferAliasTest, PartialReuseNonNestedTuple) {
+  Shape input = ShapeUtil::MakeTupleShape({r1f32_, r1f32_, r2f32_, r2f32_});
+  Shape output = ShapeUtil::MakeTupleShape({r1f32_, r2f32_, r3f32_, r4f32_});
+  bool changed = BuildAliasConfig(input, output);
+  EXPECT_TRUE(changed);
+
+  EXPECT_EQ(AliasCount(), 2);
+
+  EXPECT_EQ(config_.GetAliasedOutput(0, {0}), ShapeIndex{0});
+  EXPECT_EQ(config_.GetAliasedOutput(0, {2}), ShapeIndex{1});
+}
+
+// The output shape is reverse of the input shape, but we can still reuse all
+// the buffers.
+TEST_F(OptimizeInputOutputBufferAliasTest, UnorderedNonNestedTuple) {
+  Shape input = ShapeUtil::MakeTupleShape({r1f32_, r2f32_, r3f32_, r4f32_});
+  Shape output = ShapeUtil::MakeTupleShape({r4f32_, r3f32_, r2f32_, r1f32_});
+  bool changed = BuildAliasConfig(input, output);
+  EXPECT_TRUE(changed);
+
+  EXPECT_EQ(AliasCount(), 4);
+
+  EXPECT_EQ(config_.GetAliasedOutput(0, {0}), ShapeIndex{3});
+  EXPECT_EQ(config_.GetAliasedOutput(0, {1}), ShapeIndex{2});
+  EXPECT_EQ(config_.GetAliasedOutput(0, {2}), ShapeIndex{1});
+  EXPECT_EQ(config_.GetAliasedOutput(0, {3}), ShapeIndex{0});
+}
+
+TEST_F(OptimizeInputOutputBufferAliasTest, UnorderedNestedTuple) {
+  Shape input = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeTupleShape({r1f32_}), r2f32_, r3f32_, r4f32_});
+  Shape output = ShapeUtil::MakeTupleShape(
+      {r1f32_, ShapeUtil::MakeTupleShape({r3f32_, r2f32_}), r2f32_});
+  bool changed = BuildAliasConfig(input, output);
+  EXPECT_TRUE(changed);
+
+  EXPECT_EQ(AliasCount(), 3);
+
+  EXPECT_EQ(config_.GetAliasedOutput(0, {0, 0}), ShapeIndex{0});
+  EXPECT_EQ(config_.GetAliasedOutput(0, {1}), ShapeIndex({1, 1}));
+  EXPECT_EQ(config_.GetAliasedOutput(0, {2}), ShapeIndex({1, 0}));
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/pattern_matcher.h b/tensorflow/compiler/xla/service/pattern_matcher.h
index c35f72699bfe90f7b8021916c0f81d5e1926ff4c..7164bfc4cd48ea945519dadece92d8df2e88d02a 100644
--- a/tensorflow/compiler/xla/service/pattern_matcher.h
+++ b/tensorflow/compiler/xla/service/pattern_matcher.h
@@ -775,7 +775,7 @@ class ShapePatternIsArrayImpl {
   explicit constexpr ShapePatternIsArrayImpl() {}
 
   bool Match(const ::xla::Shape* shape, MatchOption option) const {
-    if (!ShapeUtil::IsArray(*shape)) {
+    if (!shape->IsArray()) {
       EXPLAIN << "Shape is not an array";
       return false;
     }
@@ -793,7 +793,7 @@ class ShapePatternIsTupleImpl {
   explicit constexpr ShapePatternIsTupleImpl() {}
 
   bool Match(const ::xla::Shape* shape, MatchOption option) const {
-    if (!ShapeUtil::IsTuple(*shape)) {
+    if (!shape->IsTuple()) {
       EXPLAIN << "Shape is not a tuple";
       return false;
     }
@@ -831,7 +831,7 @@ class ShapePatternRankImpl {
   explicit constexpr ShapePatternRankImpl(int64 rank) : rank_(rank) {}
 
   bool Match(const ::xla::Shape* shape, MatchOption option) const {
-    if (ShapeUtil::Rank(*shape) != rank_) {
+    if (shape->rank() != rank_) {
       if (rank_ == 0) {
         EXPLAIN << "Shape is not a scalar";
       } else {
@@ -1737,7 +1737,8 @@ class HloConstantScalarImpl {
               literal_r0_as_val_ty_or.ValueOrDie() == val_literal &&
               literal_r0 == val_as_literal_ty;
     if (!rv) {
-      EXPLAIN << "HloInstruction's constant value " << literal_r0.ToString()
+      EXPLAIN << "HloInstruction's constant value "
+              << literal_r0.ToStringWithoutShape()
               << " did not match expected value " << *val_;
     }
     return rv;
@@ -1877,7 +1878,7 @@ class HloInstructionPattern {
   // Make this a templated function to work around gcc 4.9.4 template infinite
   // recursion bug.
   template <typename Dummy = void>
-  constexpr auto WithShapeEqualTo(const ::xla::Shape* shape)
+  constexpr auto WithShapeEqualTo(const ::xla::Shape* shape) const
       -> decltype(this->WithShape(Shape().EqualTo(shape))) {
     return WithShape(Shape().EqualTo(shape));
   }
@@ -1885,7 +1886,7 @@ class HloInstructionPattern {
   // Make this a templated function to work around gcc 4.9.4 template infinite
   // recursion bug.
   template <typename Dummy = void>
-  constexpr auto WithShapeCompatibleTo(const ::xla::Shape* shape)
+  constexpr auto WithShapeCompatibleTo(const ::xla::Shape* shape) const
       -> decltype(this->WithShape(Shape().CompatibleTo(shape))) {
     return WithShape(Shape().CompatibleTo(shape));
   }
@@ -2035,7 +2036,7 @@ XLA_UNOP_PATTERN(Ceil)
 XLA_UNOP_PATTERN(Convert)
 XLA_UNOP_PATTERN(Copy)
 XLA_UNOP_PATTERN(Cos)
-XLA_UNOP_PATTERN(CrossReplicaSum)
+XLA_UNOP_PATTERN(AllReduce)
 XLA_UNOP_PATTERN(Exp)
 XLA_UNOP_PATTERN(Fft)
 XLA_UNOP_PATTERN(Floor)
@@ -2052,11 +2053,12 @@ XLA_UNOP_PATTERN(RecvDone)
 XLA_UNOP_PATTERN(ReducePrecision)
 XLA_UNOP_PATTERN(Reshape)
 XLA_UNOP_PATTERN(Reverse)
+XLA_UNOP_PATTERN(Rsqrt)
 XLA_UNOP_PATTERN(SendDone)
 XLA_UNOP_PATTERN(Sign)
 XLA_UNOP_PATTERN(Sin)
 XLA_UNOP_PATTERN(Slice)
-XLA_UNOP_PATTERN(Sort)
+XLA_UNOP_PATTERN(Sqrt)
 XLA_UNOP_PATTERN(Tanh)
 XLA_UNOP_PATTERN(Transpose)
 #undef XLA_UNOP_PATTERN
@@ -2118,7 +2120,6 @@ XLA_BINOP_PATTERN(Divide)
 XLA_BINOP_PATTERN(Complex)
 XLA_BINOP_PATTERN(Convolution)
 XLA_BINOP_PATTERN(Dot)
-XLA_BINOP_PATTERN(DynamicSlice)
 XLA_COMMUTATIVE_BINOP_PATTERN(Eq)
 XLA_BINOP_PATTERN(Gather)
 XLA_BINOP_PATTERN(Ge)
@@ -2235,8 +2236,10 @@ inline auto WithOperands(Matcher&& m, int64 operand_num, FirstArg&& first_arg,
 XLA_VARIADIC_OP_PATTERN(AfterAll);
 XLA_VARIADIC_OP_PATTERN(Concatenate);
 XLA_VARIADIC_OP_PATTERN(CustomCall);
+XLA_VARIADIC_OP_PATTERN(DynamicSlice)
 XLA_VARIADIC_OP_PATTERN(Map)
 XLA_VARIADIC_OP_PATTERN(Reduce);
+XLA_VARIADIC_OP_PATTERN(Sort);
 XLA_VARIADIC_OP_PATTERN(Tuple);
 
 // Helpers for matching non-constant instructions.
diff --git a/tensorflow/compiler/xla/service/pattern_matcher_gmock_test.cc b/tensorflow/compiler/xla/service/pattern_matcher_gmock_test.cc
index 9ca2fb05c1f7ef093c58237cf21fbc7c813a592a..f51a18b13894d75300c46835fabd82a4ce0699af 100644
--- a/tensorflow/compiler/xla/service/pattern_matcher_gmock_test.cc
+++ b/tensorflow/compiler/xla/service/pattern_matcher_gmock_test.cc
@@ -23,7 +23,6 @@ namespace xla {
 namespace {
 
 namespace m = ::xla::match;
-using ::testing::Eq;
 using ::testing::Not;
 
 template <typename MatchedTy>
diff --git a/tensorflow/compiler/xla/service/pattern_matcher_test.cc b/tensorflow/compiler/xla/service/pattern_matcher_test.cc
index 186ef0c7911a2724df810780e018f52586e3e6a8..5c3c009a68bffbda8642fceedfb724879fbf1530 100644
--- a/tensorflow/compiler/xla/service/pattern_matcher_test.cc
+++ b/tensorflow/compiler/xla/service/pattern_matcher_test.cc
@@ -242,8 +242,8 @@ TEST(PatternMatcherTest, ConstantScalar) {
     HloModule test_module
     ENTRY test {
       a = s32[] constant(1)
-      b = s32[1,1] constant(s32[1,1]{{2}})
-      c = s32[1,2] constant(s32[1,2]{{2,2}})
+      b = s32[1,1] constant({{2}})
+      c = s32[1,2] constant({{2,2}})
       d = f32[] constant(1)
       e = f32[] constant(1.25)
       ROOT tuple = (s32[], s32[1,1], s32[1,2], f32[], f32[]) tuple(a,b,c,d,e)
diff --git a/tensorflow/compiler/xla/service/platform_util.cc b/tensorflow/compiler/xla/service/platform_util.cc
index c227106511c2c17b44569d3b696cd7d764226e81..886a0545624927fa77528141f61d8ecb6bec180a 100644
--- a/tensorflow/compiler/xla/service/platform_util.cc
+++ b/tensorflow/compiler/xla/service/platform_util.cc
@@ -70,6 +70,9 @@ PlatformUtil::GetSupportedPlatforms() {
   for (se::Platform* platform : all_platforms) {
     auto compiler_status = Compiler::GetForPlatform(platform);
     if (compiler_status.ok()) {
+      if (!platform->Initialized()) {
+        TF_RETURN_IF_ERROR(platform->Initialize({}));
+      }
       platforms.push_back(platform);
     } else {
       LOG(INFO) << "platform " << platform->Name() << " present but no "
@@ -205,7 +208,9 @@ static bool IsDeviceSupported(se::StreamExecutor* executor) {
 }
 
 /* static */ StatusOr<std::vector<se::StreamExecutor*>>
-PlatformUtil::GetStreamExecutors(se::Platform* platform) {
+PlatformUtil::GetStreamExecutors(
+    se::Platform* platform,
+    const absl::optional<std::set<int>>& allowed_devices) {
   int device_count = platform->VisibleDeviceCount();
   if (device_count <= 0) {
     return NotFound("no %s devices found", platform->Name());
@@ -226,6 +231,17 @@ PlatformUtil::GetStreamExecutors(se::Platform* platform) {
     tensorflow::thread::ThreadPool thread_pool(
         tensorflow::Env::Default(), "device_initialization", device_count);
     for (int i = 0; i < device_count; ++i) {
+      // Once a stream executor is instantiated it will cause allocations on
+      // the device, for example for GPUs cuda context, cudnn handles etc. will
+      // be constructed. By constructing stream executors only on the
+      // allowed_devices, we don't make any allocations on other devices.
+      // This helps in multi-process executions on the same host like horovod or
+      // shared hosts.
+      if (allowed_devices && allowed_devices->count(i) == 0) {
+        VLOG(1) << "Not initializing StreamExecutor for device " << i
+                << " since it is not in the visible device list";
+        continue;
+      }
       thread_pool.Schedule([platform, i, &stream_executors]() {
         VLOG(1) << "Started device init " << i;
         se::StreamExecutorConfig config;
@@ -247,8 +263,8 @@ PlatformUtil::GetStreamExecutors(se::Platform* platform) {
     // Block here in thread_pool destructor until all devices are initialized.
   }
   VLOG(1) << "Device initialization complete";
-  if (std::all_of(stream_executors.begin(), stream_executors.end(),
-                  [](se::StreamExecutor* s) { return s == nullptr; })) {
+  if (absl::c_all_of(stream_executors,
+                     [](se::StreamExecutor* s) { return s == nullptr; })) {
     return InternalError("no supported devices found for platform %s",
                          platform->Name());
   }
diff --git a/tensorflow/compiler/xla/service/platform_util.h b/tensorflow/compiler/xla/service/platform_util.h
index 571451ba43a81d19b70e4954e45d3447f15dcedc..592b20282f334e12e0d7a7f683c9a6ab59d21fea 100644
--- a/tensorflow/compiler/xla/service/platform_util.h
+++ b/tensorflow/compiler/xla/service/platform_util.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_PLATFORM_UTIL_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_PLATFORM_UTIL_H_
 
+#include <set>
 #include <string>
 #include <vector>
 
@@ -60,10 +61,14 @@ class PlatformUtil {
   // Returns a vector of StreamExecutors for the given platform. The vector is
   // indexed by device ordinal (device numbering used by StreamExecutor). If an
   // element is nullptr, then the device is present by not supported by XLA.
+  // If populated, only the devices in allowed_devices will have
+  // their StreamExecutors initialized, otherwise all StreamExecutors will be
+  // initialized and returned.
   //
   // If the platform has no visible devices, a not-found error is returned.
   static StatusOr<std::vector<se::StreamExecutor*>> GetStreamExecutors(
-      se::Platform* platform);
+      se::Platform* platform,
+      const absl::optional<std::set<int>>& allowed_devices = absl::nullopt);
 
  private:
   TF_DISALLOW_COPY_AND_ASSIGN(PlatformUtil);
diff --git a/tensorflow/compiler/xla/service/reshape_mover.cc b/tensorflow/compiler/xla/service/reshape_mover.cc
index 4df746fca9f8320eed72911726f33bb01f06fed5..a62118df157edf67114ff41befbdce3da129fe93 100644
--- a/tensorflow/compiler/xla/service/reshape_mover.cc
+++ b/tensorflow/compiler/xla/service/reshape_mover.cc
@@ -226,7 +226,10 @@ StatusOr<bool> PerformSinkReshapeOrTranspose(
     // changes, so all the fused instructions have the same dimensions.
     for (const auto& fused_instruction : instruction->fused_instructions()) {
       Shape* shape = fused_instruction->mutable_shape();
-      *shape->mutable_dimensions() = new_operand_shape.dimensions();
+      shape->clear_dimensions();
+      for (int64 i : new_operand_shape.dimensions()) {
+        shape->add_dimensions(i);
+      }
       *shape->mutable_layout() = new_operand_shape.layout();
     }
   }
diff --git a/tensorflow/compiler/xla/service/scatter_expander.cc b/tensorflow/compiler/xla/service/scatter_expander.cc
index 11c2f8392d285095816dd5d61f7029c1bfd158d4..acad871c4d427b174ffce3a462a0a3918a1e0c33 100644
--- a/tensorflow/compiler/xla/service/scatter_expander.cc
+++ b/tensorflow/compiler/xla/service/scatter_expander.cc
@@ -26,7 +26,6 @@ limitations under the License.
 
 namespace xla {
 
-
 // Transposes the given scatter_indices such that the index_vector_dim becomes
 // the most-minor dimension.
 static StatusOr<HloInstruction*> TransposeIndexVectorDimToLast(
@@ -60,6 +59,13 @@ static StatusOr<HloInstruction*> CanonicalizeScatterIndices(
   TF_ASSIGN_OR_RETURN(
       HloInstruction * transposed_scatter_indices,
       TransposeIndexVectorDimToLast(scatter_indices, index_vector_dim));
+  if (scatter_indices->shape().rank() == index_vector_dim + 1 &&
+      scatter_indices->shape().dimensions(index_vector_dim) == 1) {
+    auto new_shape =
+        ShapeUtil::DeleteDimension(index_vector_dim, scatter_indices->shape());
+    TF_ASSIGN_OR_RETURN(scatter_indices,
+                        MakeReshapeHlo(new_shape, scatter_indices));
+  }
   bool indices_are_scalar =
       index_vector_dim == scatter_indices->shape().dimensions_size();
 
@@ -88,7 +94,7 @@ static StatusOr<HloInstruction*> CanonicalizeScatterIndices(
 static StatusOr<HloInstruction*> PermuteScatterAndWindowDims(
     HloInstruction* updates, absl::Span<const int64> update_window_dims) {
   std::vector<int64> permutation;
-  const int64 updates_rank = ShapeUtil::Rank(updates->shape());
+  const int64 updates_rank = updates->shape().rank();
   permutation.reserve(updates_rank);
 
   for (int64 i = 0; i < updates_rank; ++i) {
@@ -165,10 +171,9 @@ static StatusOr<HloInstruction*> CheckIndexValidity(
   // Valid range for the index: [0, operand_dims - window_sizes]
 
   // Check if the index has any negative values.
-  TF_ASSIGN_OR_RETURN(
-      HloInstruction * zero_index,
+  HloInstruction* zero_index =
       BroadcastZeros(computation, index->shape().element_type(),
-                     AsInt64Slice(index->shape().dimensions())));
+                     AsInt64Slice(index->shape().dimensions()));
   TF_ASSIGN_OR_RETURN(HloInstruction * negative_index_check,
                       MakeBinaryHlo(HloOpcode::kLe, zero_index, index));
 
@@ -214,15 +219,11 @@ static StatusOr<std::vector<HloInstruction*>> ScatterLoopBody(
   HloInstruction* updates = loop_state[2];
 
   bool has_scalar_indices = scatter_indices->shape().dimensions_size() == 1;
-  CHECK_EQ(has_scalar_indices,
-           dim_numbers.index_vector_dim() ==
-               scatter->operand(1)->shape().dimensions_size());
 
   // Build a vector form of the induction variable of the while loop.
-  TF_ASSIGN_OR_RETURN(
-      HloInstruction * induction_var_as_vector,
+  HloInstruction* induction_var_as_vector =
       MakeBroadcastHlo(induction_var, /*broadcast_dimensions=*/{},
-                       /*result_shape_bounds=*/{1}));
+                       /*result_shape_bounds=*/{1});
 
   // Pick the index to scatter from scatter_indices based on the induction_var
   // and transform that to an index into the `operand` space.
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index 5ec7fe2adedac2fc3d8a7588e853dba90e99006f..9bda6fba3aabfed78ae724545387e86bad36c886 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
+#include "tensorflow/compiler/xla/service/dynamic_dimension_inference.h"
 #include "tensorflow/compiler/xla/service/executable.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_cost_analysis.h"
@@ -113,6 +114,16 @@ int ServiceOptions::intra_op_parallelism_threads() const {
   return intra_op_parallelism_threads_;
 }
 
+ServiceOptions& ServiceOptions::set_allowed_devices(
+    const absl::optional<std::set<int>>& allowed_devices) {
+  allowed_devices_ = allowed_devices;
+  return *this;
+}
+
+const absl::optional<std::set<int>>& ServiceOptions::allowed_devices() const {
+  return allowed_devices_;
+}
+
 /* static */ StatusOr<std::unique_ptr<Service>> Service::NewService(
     se::Platform* platform) {
   ServiceOptions default_options;
@@ -129,6 +140,7 @@ int ServiceOptions::intra_op_parallelism_threads() const {
   }
   BackendOptions backend_options;
   backend_options.set_platform(platform);
+  backend_options.set_allowed_devices(options.allowed_devices());
   TF_ASSIGN_OR_RETURN(execute_backend, Backend::CreateBackend(backend_options));
 
   std::unique_ptr<Service> service(
@@ -150,17 +162,13 @@ Service::Service(const ServiceOptions& options,
     LOG(INFO) << StrFormat(
         "XLA service %p executing computations on platform %s. Devices:", this,
         execute_backend_->platform()->Name());
+    auto stream_executors = execute_backend_->stream_executors();
     for (int i = 0; i < execute_backend_->device_count(); ++i) {
-      if (execute_backend_->device_ordinal_supported(i)) {
-        se::StreamExecutor* executor =
-            execute_backend_->stream_executor(i).ValueOrDie();
-        const auto& description = executor->GetDeviceDescription();
-        LOG(INFO) << StrFormat("  StreamExecutor device (%d): %s, %s", i,
-                               description.name(),
-                               description.platform_version());
-      } else {
-        LOG(INFO) << StrFormat("  StreamExecutor device (%d) not supported", i);
-      }
+      se::StreamExecutor* executor = stream_executors.at(i);
+      const auto& description = executor->GetDeviceDescription();
+      LOG(INFO) << StrFormat("  StreamExecutor device (%d): %s, %s", i,
+                             description.name(),
+                             description.platform_version());
     }
   } else {
     VLOG(1) << "XLA compile-only service constructed";
@@ -288,11 +296,16 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
     computation_layout->mutable_result_layout()->SetToDefaultLayout();
   }
 
-  config->set_replica_count(options_.number_of_replicas());
   if (execution_options != nullptr) {
+    if (execution_options->num_replicas() > 0) {
+      config->set_replica_count(execution_options->num_replicas());
+    } else {
+      config->set_replica_count(options_.number_of_replicas());
+    }
     config->set_seed(execution_options->seed());
     config->set_debug_options(execution_options->debug_options());
   } else {
+    config->set_replica_count(options_.number_of_replicas());
     config->set_debug_options(GetDebugOptionsFromFlags());
   }
 
@@ -355,6 +368,7 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
     const HloModuleProto* proto = module_protos[i];
     const HloModuleConfig& config = *module_configs[i];
     TF_ASSIGN_OR_RETURN(auto module, CreateModuleFromProto(*proto, config));
+    TF_RETURN_IF_ERROR(MaybeDumpUnoptimizedHloModule(*module));
     module_group->push_back(std::move(module));
   }
 
@@ -516,13 +530,13 @@ Service::ExecuteParallelAndRegisterResult(
 
 StatusOr<GlobalDataHandle> Service::ExecuteAndRegisterResult(
     Executable* executable,
-    const absl::Span<const std::vector<const ShapedBuffer*>> arguments,
-    Backend* backend, const string& result_tag, ExecutionProfile* profile) {
+    absl::Span<const std::vector<const ShapedBuffer*>> arguments,
+    Backend* backend, const DeviceHandle& device_handle,
+    const string& result_tag, ExecutionProfile* profile) {
   // Set up streams.
   std::vector<StreamPool::Ptr> streams;
 
-  TF_ASSIGN_OR_RETURN(auto replicas,
-                      Replicas(*backend, SingleComputationDeviceHandle()));
+  TF_ASSIGN_OR_RETURN(auto replicas, Replicas(*backend, device_handle));
   TF_RET_CHECK(!replicas.empty());
   for (se::StreamExecutor* executor : replicas) {
     TF_ASSIGN_OR_RETURN(StreamPool::Ptr stream,
@@ -530,10 +544,11 @@ StatusOr<GlobalDataHandle> Service::ExecuteAndRegisterResult(
     streams.push_back(std::move(stream));
   }
 
-  TF_ASSIGN_OR_RETURN(DeviceAssignment device_assignment,
-                      backend->computation_placer()->AssignDevices(
-                          options_.number_of_replicas(),
-                          /*computation_count=*/1));
+  DeviceAssignment device_assignment(options_.number_of_replicas(),
+                                     /*computation_count=*/1);
+  for (int64 replica = 0; replica < replicas.size(); ++replica) {
+    device_assignment(replica, 0) = replicas[replica]->device_ordinal();
+  }
 
   // Set up run options.
   std::vector<ServiceExecutableRunOptions> run_options;
@@ -545,9 +560,7 @@ StatusOr<GlobalDataHandle> Service::ExecuteAndRegisterResult(
     options.set_intra_op_thread_pool(
         backend->eigen_intra_op_thread_pool_device());
     options.set_device_assignment(&device_assignment);
-    run_options.emplace_back(
-        options, backend->StreamBorrower(),
-        /*xla_intra_op_thread_pool=*/backend->eigen_intra_op_thread_pool());
+    run_options.emplace_back(options, backend->StreamBorrower());
   }
 
   if (options_.number_of_replicas() == 1) {
@@ -704,14 +717,33 @@ Status Service::ExecuteGraphParallel(const ExecuteGraphParallelRequest* arg,
     }
   }
 
-  // Execute the generated executables in parallel and return the device
-  // handles for each computation's output.
+  // If we have multiple executables to run, execute them all in parallel.  But
+  // if we only have one executable, execute it using the vanilla, non-parallel
+  // call.
+  //
+  // We do this because the Client API uses ExecuteGraphParallel when it wants
+  // to compile and run one computation without caching the executable, but not
+  // all backends support the async StreamExecutor API required by
+  // ExecuteParallelAndRegisterResult.
+  //
+  // TODO(b/122731460): Consolidate Execute{,Parallel}AndRegisterResult; they do
+  // basically the same thing.
   ExecutionProfile profile;
-  TF_ASSIGN_OR_RETURN(
-      std::vector<GlobalDataHandle> outputs,
-      ExecuteParallelAndRegisterResult(executable_ptrs, all_arguments,
-                                       execute_backend_.get(), device_handles,
-                                       computation_names, &profile));
+  std::vector<GlobalDataHandle> outputs;
+  if (executable_ptrs.size() == 1) {
+    TF_ASSIGN_OR_RETURN(
+        auto output,
+        ExecuteAndRegisterResult(executable_ptrs[0], all_arguments[0],
+                                 execute_backend_.get(), device_handles[0],
+                                 computation_names[0], &profile));
+    outputs.push_back(std::move(output));
+  } else {
+    TF_ASSIGN_OR_RETURN(
+        outputs, ExecuteParallelAndRegisterResult(
+                     executable_ptrs, all_arguments, execute_backend_.get(),
+                     device_handles, computation_names, &profile));
+  }
+
   for (const GlobalDataHandle& output : outputs) {
     ExecuteResponse response;
     *response.mutable_output() = output;
@@ -897,6 +929,7 @@ Status Service::Execute(const ExecuteRequest* arg, ExecuteResponse* result) {
       *result->mutable_output(),
       ExecuteAndRegisterResult(executable.get(), replicated_arguments,
                                execute_backend_.get(),
+                               SingleComputationDeviceHandle(),
                                "result of " + executable->module().name(),
                                result->mutable_profile()));
 
@@ -1078,9 +1111,11 @@ Status Service::ComputeConstantGraph(const ComputeConstantGraphRequest* arg,
 
   ProgramShape program_shape(arg->computation().host_program_shape());
   TF_DCHECK_OK(ShapeUtil::ValidateShape(program_shape.result()));
+  absl::optional<Layout> output_layout;
   if (arg->has_output_layout()) {
+    output_layout = Layout::CreateFromProto(arg->output_layout());
     TF_RETURN_IF_ERROR(LayoutUtil::ValidateLayoutForShape(
-        arg->output_layout(), program_shape.result()));
+        *output_layout, program_shape.result()));
   }
 
   HloModuleConfig config(program_shape);
@@ -1088,16 +1123,19 @@ Status Service::ComputeConstantGraph(const ComputeConstantGraphRequest* arg,
   TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
                       CreateModuleFromProto(arg->computation(), config));
 
+  TF_ASSIGN_OR_RETURN(DynamicDimensionInference dynamic_dimension_inference,
+                      DynamicDimensionInference::Run(module.get()));
+
   HloEvaluator evaluator;
-  TF_ASSIGN_OR_RETURN(auto result_literal, evaluator.Evaluate<Literal>(
-                                               *module, /*arg_literals=*/{}));
+  evaluator.set_dynamic_dimension_inference(&dynamic_dimension_inference);
+  TF_ASSIGN_OR_RETURN(auto result_literal, evaluator.Evaluate(*module, {}));
 
   // Since the result layout is non-effective to the Evaluator results, explicit
   // relayout here.
   //
   // TODO(b/77824332): Make HloEvaluator take care of the re-layout.
-  if (arg->has_output_layout()) {
-    result_literal = result_literal.Relayout(arg->output_layout());
+  if (output_layout.has_value()) {
+    result_literal = result_literal.Relayout(*output_layout);
   }
   *result->mutable_literal() = result_literal.ToProto();
 
diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h
index 11e1a79552fbd944ab28da129b08cfe676fb08e9..fd907d07daef9e8337aeed198ef4fd23d069df21 100644
--- a/tensorflow/compiler/xla/service/service.h
+++ b/tensorflow/compiler/xla/service/service.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <functional>
 #include <memory>
+#include <set>
 #include <string>
 #include <vector>
 
@@ -52,7 +53,7 @@ class ServiceOptions {
   ServiceOptions& set_platform(se::Platform* platform);
   se::Platform* platform() const;
 
-  // Set the number of replicas to use when compiling replicated
+  // Set the default number of replicas to use when compiling replicated
   // programs.
   ServiceOptions& set_number_of_replicas(int number_of_replicas);
   int number_of_replicas() const;
@@ -61,10 +62,17 @@ class ServiceOptions {
   ServiceOptions& set_intra_op_parallelism_threads(int num_threads);
   int intra_op_parallelism_threads() const;
 
+  // Sets the allowed_devices set for selectively constructing stream executors
+  // on the platform.
+  ServiceOptions& set_allowed_devices(
+      const absl::optional<std::set<int>>& allowed_devices);
+  const absl::optional<std::set<int>>& allowed_devices() const;
+
  private:
   se::Platform* platform_ = nullptr;
   int number_of_replicas_ = 1;
   int intra_op_parallelism_threads_ = -1;
+  absl::optional<std::set<int>> allowed_devices_;
 };
 
 // The XLA service object, which is the same across all platforms. It maintains
@@ -242,8 +250,9 @@ class Service : public ServiceInterface {
   // ExecutionProfile object which will be filled in with profile data.
   StatusOr<GlobalDataHandle> ExecuteAndRegisterResult(
       Executable* executable,
-      const absl::Span<const std::vector<const ShapedBuffer*>> arguments,
-      Backend* backend, const string& result_tag, ExecutionProfile* profile);
+      absl::Span<const std::vector<const ShapedBuffer*>> arguments,
+      Backend* backend, const DeviceHandle& device_handle,
+      const string& result_tag, ExecutionProfile* profile);
 
   // Runs the given executables with the given arguments and register the result
   // from each executable in the allocation tracker. The handles of the result
diff --git a/tensorflow/compiler/xla/service/service_executable_run_options.h b/tensorflow/compiler/xla/service/service_executable_run_options.h
index dbfed628bfcabffe66bef41a82e0e2430897d80d..6bee671056552b83014367889320b748659bbfdf 100644
--- a/tensorflow/compiler/xla/service/service_executable_run_options.h
+++ b/tensorflow/compiler/xla/service/service_executable_run_options.h
@@ -32,12 +32,10 @@ class ServiceExecutableRunOptions {
   ServiceExecutableRunOptions()
       : ServiceExecutableRunOptions(ExecutableRunOptions()) {}
 
-  explicit ServiceExecutableRunOptions(
-      ExecutableRunOptions run_options, StreamBorrower borrow_stream = nullptr,
-      tensorflow::thread::ThreadPool* xla_intra_op_thread_pool = nullptr)
+  explicit ServiceExecutableRunOptions(ExecutableRunOptions run_options,
+                                       StreamBorrower borrow_stream = nullptr)
       : run_options_(std::move(run_options)),
-        borrow_stream_(std::move(borrow_stream)),
-        xla_intra_op_thread_pool_(xla_intra_op_thread_pool) {}
+        borrow_stream_(std::move(borrow_stream)) {}
 
   // Returns reference or pointer to `ExecutableRunOptions` member.
   const ExecutableRunOptions& run_options() const { return run_options_; }
@@ -56,15 +54,9 @@ class ServiceExecutableRunOptions {
                : Status(tensorflow::error::UNIMPLEMENTED, "No stream cache");
   }
 
-  // Returns reference to thread pool for execution of XLA ops on CPU backend.
-  tensorflow::thread::ThreadPool* xla_intra_op_thread_pool() const {
-    return xla_intra_op_thread_pool_;
-  }
-
  private:
   ExecutableRunOptions run_options_;
   StreamBorrower borrow_stream_;
-  tensorflow::thread::ThreadPool* xla_intra_op_thread_pool_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc
index 7e7282a737041458aed39b0054f901c23aa87d7a..431c2e3a5e0dac3093ba39640f3451bec6911f9f 100644
--- a/tensorflow/compiler/xla/service/shape_inference.cc
+++ b/tensorflow/compiler/xla/service/shape_inference.cc
@@ -15,8 +15,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/shape_inference.h"
 
-#include <stddef.h>
 #include <algorithm>
+#include <cstddef>
 #include <numeric>
 #include <set>
 #include <string>
@@ -50,7 +50,7 @@ bool AllUnique(absl::Span<const int64> slice) {
 }
 
 Status ExpectArray(const Shape& shape, absl::string_view op_type) {
-  if (!ShapeUtil::IsArray(shape)) {
+  if (!shape.IsArray()) {
     return InvalidArgument("Expected array argument for %s, but got %s.",
                            string(op_type), ShapeUtil::HumanString(shape));
   }
@@ -70,7 +70,7 @@ Status VerifyReducerShape(const ProgramShape& reducer_shape,
 
   const Shape& accumulator_shape = reducer_shape.result();
   std::vector<const Shape*> accumulator_subshapes;
-  if (ShapeUtil::IsArray(accumulator_shape)) {
+  if (accumulator_shape.IsArray()) {
     if (inputs != 1) {
       return InvalidArgument(
           "Reduction function must produce a tuple with %d elements, but "
@@ -78,7 +78,7 @@ Status VerifyReducerShape(const ProgramShape& reducer_shape,
           inputs);
     }
     accumulator_subshapes.push_back(&accumulator_shape);
-  } else if (ShapeUtil::IsTuple(accumulator_shape)) {
+  } else if (accumulator_shape.IsTuple()) {
     if (ShapeUtil::TupleElementCount(accumulator_shape) != inputs) {
       return InvalidArgument(
           "Reduction function must produce a tuple with %d elements, but has "
@@ -96,7 +96,7 @@ Status VerifyReducerShape(const ProgramShape& reducer_shape,
   }
 
   for (const Shape* element_shape : accumulator_subshapes) {
-    if (ShapeUtil::Rank(*element_shape) != 0) {
+    if (element_shape->rank() != 0) {
       return InvalidArgument(
           "Reduction function must return a scalar or tuple of scalars but "
           "returns shape: %s",
@@ -156,17 +156,26 @@ Status VerifyReducerShape(const ProgramShape& reducer_shape,
   return Status::OK();
 }
 
+bool IsTrivialWindowDimension(const WindowDimension& window_dimension) {
+  return window_dimension.size() == 1 && window_dimension.stride() == 1 &&
+         window_dimension.padding_low() == 0 &&
+         window_dimension.padding_high() == 0 &&
+         window_dimension.window_dilation() == 1 &&
+         window_dimension.base_dilation() == 1;
+}
+
 StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
                                        const Window& window,
                                        PrimitiveType element_type,
                                        bool allow_negative_padding) {
-  if (window.dimensions_size() != ShapeUtil::Rank(base_shape)) {
+  if (window.dimensions_size() != base_shape.rank()) {
     return InvalidArgument(
         "Window has dimension %d but base shape has dimension %d.",
-        window.dimensions_size(), ShapeUtil::Rank(base_shape));
+        window.dimensions_size(), base_shape.rank());
   }
 
   std::vector<int64> output_dimensions(window.dimensions_size());
+  std::vector<bool> output_is_dynamic(window.dimensions_size());
   for (int64 i = 0; i < window.dimensions_size(); ++i) {
     const auto& dim = window.dimensions(i);
     if (dim.size() <= 0) {
@@ -196,6 +205,12 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
           window.DebugString());
     }
 
+    if (base_shape.is_dynamic_dimension(i) && !IsTrivialWindowDimension(dim)) {
+      return Unimplemented(
+          "Dynamic shape is not supported for non trivial window: %s",
+          window_util::ToString(window));
+    }
+
     const int64 dilated_base = window_util::DilatedBound(
         ShapeUtil::GetDimension(base_shape, i), dim.base_dilation());
     const int64 padded_dilated_base =
@@ -205,9 +220,11 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
 
     output_dimensions[i] = window_util::StridedBound(
         padded_dilated_base, dilated_window, dim.stride());
+    output_is_dynamic[i] = base_shape.is_dynamic_dimension(i);
   }
 
-  return ShapeUtil::MakeValidatedShape(element_type, output_dimensions);
+  return ShapeUtil::MakeValidatedShape(element_type, output_dimensions,
+                                       output_is_dynamic);
 }
 
 }  // namespace
@@ -245,6 +262,8 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
     case HloOpcode::kExpm1:
     case HloOpcode::kLog:
     case HloOpcode::kLog1p:
+    case HloOpcode::kRsqrt:
+    case HloOpcode::kSqrt:
     case HloOpcode::kTanh:
       if (!ShapeUtil::ElementIsFloating(shape) &&
           !ShapeUtil::ElementIsComplex(shape)) {
@@ -338,7 +357,7 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
   if (arg_shapes.empty()) {
     return InvalidArgument("Concatenate expects at least one argument.");
   }
-  if (dimension < 0 || dimension >= ShapeUtil::Rank(*arg_shapes[0])) {
+  if (dimension < 0 || dimension >= arg_shapes[0]->rank()) {
     return InvalidArgument("Concatenate dimension out of bounds: %d.",
                            dimension);
   }
@@ -351,12 +370,12 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
       element_type = arg_shape->element_type();
       continue;
     }
-    if (ShapeUtil::Rank(*arg_shape) != ShapeUtil::Rank(*shape)) {
+    if (arg_shape->rank() != shape->rank()) {
       return InvalidArgument(
           "Cannot concatenate arrays with different ranks: %d (%s) vs %d "
           "(%s).",
-          ShapeUtil::Rank(*arg_shape), ShapeUtil::HumanString(*arg_shape),
-          ShapeUtil::Rank(*shape), ShapeUtil::HumanString(*shape));
+          arg_shape->rank(), ShapeUtil::HumanString(*arg_shape), shape->rank(),
+          ShapeUtil::HumanString(*shape));
     }
     if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(*arg_shape, *shape)) {
       return InvalidArgument(
@@ -364,8 +383,8 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
           PrimitiveType_Name(arg_shape->element_type()),
           PrimitiveType_Name(shape->element_type()));
     }
-    for (int64 dimension_number = 0;
-         dimension_number < ShapeUtil::Rank(*arg_shape); ++dimension_number) {
+    for (int64 dimension_number = 0; dimension_number < arg_shape->rank();
+         ++dimension_number) {
       if (arg_shape->dimensions(dimension_number) !=
           shape->dimensions(dimension_number)) {
         if (dimension_number == dimension) {
@@ -401,7 +420,7 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
         ShapeUtil::HumanString(operand_shape),
         PrimitiveType_Name(new_element_type));
   }
-  if (!ShapeUtil::IsArray(operand_shape) ||
+  if (!operand_shape.IsArray() ||
       !primitive_util::IsArrayType(new_element_type)) {
     // Note: we may want to support tuple conversions via this operation in the
     // future, by recursing into the tuple elements to check all sub-conversions
@@ -424,7 +443,7 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
                            ShapeUtil::HumanString(operand_shape),
                            PrimitiveType_Name(new_element_type));
   }
-  if (!ShapeUtil::IsArray(operand_shape) ||
+  if (!operand_shape.IsArray() ||
       !primitive_util::IsArrayType(new_element_type)) {
     // Note: we may want to support tuple conversions via this operation in the
     // future, by recursing into the tuple elements to check all sub-conversions
@@ -472,7 +491,7 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
 /* static */ StatusOr<Shape> ShapeInference::InferPadShape(
     const Shape& operand_shape, const Shape& padding_value_shape,
     const PaddingConfig& padding_config) {
-  if (!ShapeUtil::IsArray(operand_shape)) {
+  if (!operand_shape.IsArray()) {
     return InvalidArgument(
         "Pad operation does not support tuple-shape operands.");
   }
@@ -480,7 +499,7 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
     return InvalidArgument(
         "Pad operation does not support non-scalar padding values.");
   }
-  if (ShapeUtil::Rank(operand_shape) != padding_config.dimensions_size()) {
+  if (operand_shape.rank() != padding_config.dimensions_size()) {
     return InvalidArgument(
         "The rank of the operand and the padding configuration do not match: "
         "%s vs %s.",
@@ -500,35 +519,44 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
                            padding_config.ShortDebugString());
   }
 
-  std::vector<int64> dimensions(ShapeUtil::Rank(operand_shape));
+  if (!padding_value_shape.is_static()) {
+    return InvalidArgument("Dynamic padding value is not supported");
+  }
+
+  std::vector<int64> dimensions(operand_shape.rank());
+  std::vector<bool> is_dynamic(operand_shape.rank());
   for (int64 i = 0; i < operand_shape.dimensions_size(); ++i) {
     const auto& p = padding_config.dimensions(i);
+    if (operand_shape.is_dynamic_dimension(i) && p.edge_padding_high() != 0 &&
+        p.edge_padding_low() != 0 && p.interior_padding() != 0) {
+      return InvalidArgument(
+          "Dynamic dimension on padding dimension is not supported.");
+    }
     dimensions[i] = operand_shape.dimensions(i) + p.edge_padding_low() +
                     p.edge_padding_high() +
                     std::max<int64>(operand_shape.dimensions(i) - 1, 0LL) *
                         p.interior_padding();
+    if (dimensions[i] < 0) {
+      return InvalidArgument("Padding result in negative size for dimension %d",
+                             i);
+    }
+    is_dynamic[i] = operand_shape.is_dynamic_dimension(i);
   }
+
   return ShapeUtil::MakeShape(
       ShapeUtil::HigherPrecisionElementType(operand_shape, padding_value_shape),
-      dimensions);
+      dimensions, is_dynamic);
 }
 
 // Current DotDimensionNumbers Requirements:
 //
 // Contracting Dimensions:
-// *) Exactly one contracting dimension on both lhs and rhs.
+// *) Same number of contracting dimensions on both lhs and rhs.
 // *) Contracting dimension size must be the same on both lhs and rhs.
-// *) Contracting dimension numbers do not need to be the same (i.e. transposes
-//    are passed on to emitter implementations).
 //
 // Batch Dimensions:
 // *) Same number of batch dimensions on both lhs and rhs.
-// *) Same batch dimension numbers (and sizes) on both lhs and rhs.
-// *) Batch dimension numbers must be ordered before contracting and
-//    non-contracting/non-batch dimension numbers.
-//
-// Non-Contracting-Non-Batch Dimensions:
-// *) Can be 0 (matrix-vector) or 1 (matrix-matrix).
+// *) Same batch dimension sizes on both lhs and rhs.
 //
 
 namespace {
@@ -541,9 +569,8 @@ Status ValidateDotDimensionNumbers(
                           absl::Span<const int64> contracting_dims,
                           absl::Span<const int64> batch_dims) -> bool {
     auto in_range = [&rank](int64 i) -> bool { return 0 <= i && i < rank; };
-    return std::all_of(contracting_dims.begin(), contracting_dims.end(),
-                       in_range) &&
-           std::all_of(batch_dims.begin(), batch_dims.end(), in_range);
+    return absl::c_all_of(contracting_dims, in_range) &&
+           absl::c_all_of(batch_dims, in_range);
   };
 
   absl::Span<const int64> lhs_contracting_dimensions =
@@ -555,9 +582,9 @@ Status ValidateDotDimensionNumbers(
   absl::Span<const int64> rhs_batch_dimensions =
       AsInt64Slice(dimension_numbers.rhs_batch_dimensions());
 
-  if (!dims_in_range(ShapeUtil::Rank(lhs), lhs_contracting_dimensions,
+  if (!dims_in_range(lhs.rank(), lhs_contracting_dimensions,
                      lhs_batch_dimensions) ||
-      !dims_in_range(ShapeUtil::Rank(rhs), rhs_contracting_dimensions,
+      !dims_in_range(rhs.rank(), rhs_contracting_dimensions,
                      rhs_batch_dimensions)) {
     return InvalidArgument("A dimension number is out of range in Dot: %s.",
                            dimension_numbers.DebugString());
@@ -570,9 +597,8 @@ Status ValidateDotDimensionNumbers(
     auto is_unique = [&dim_set](int64 i) -> bool {
       return dim_set.insert(i).second;
     };
-    return std::all_of(contracting_dims.begin(), contracting_dims.end(),
-                       is_unique) &&
-           std::all_of(batch_dims.begin(), batch_dims.end(), is_unique);
+    return absl::c_all_of(contracting_dims, is_unique) &&
+           absl::c_all_of(batch_dims, is_unique);
   };
 
   if (!dims_unique(lhs_contracting_dimensions, lhs_batch_dimensions) ||
@@ -581,36 +607,6 @@ Status ValidateDotDimensionNumbers(
                            dimension_numbers.DebugString());
   }
 
-  // Check that the count of non-contracting-non-batch dimensions is in {0, 1}.
-  const int64 lhs_non_contracting_non_batch_dims =
-      ShapeUtil::Rank(lhs) -
-      dimension_numbers.lhs_contracting_dimensions_size() -
-      dimension_numbers.lhs_batch_dimensions_size();
-  const int64 rhs_non_contracting_non_batch_dims =
-      ShapeUtil::Rank(rhs) -
-      dimension_numbers.rhs_contracting_dimensions_size() -
-      dimension_numbers.rhs_batch_dimensions_size();
-  if (lhs_non_contracting_non_batch_dims < 0 ||
-      lhs_non_contracting_non_batch_dims > 1 ||
-      rhs_non_contracting_non_batch_dims < 0 ||
-      rhs_non_contracting_non_batch_dims > 1) {
-    return InvalidArgument(
-        "Batch and contracting dimension number mismatch with rank.");
-  }
-
-  // Check that batch dimension numbers are ordered before all others, and
-  // that they are monotonically increasing.
-  std::vector<int64> batch_dim_numbers(lhs_batch_dimensions.size());
-  std::iota(batch_dim_numbers.begin(), batch_dim_numbers.end(), 0);
-  if (!std::equal(batch_dim_numbers.begin(), batch_dim_numbers.end(),
-                  lhs_batch_dimensions.begin()) ||
-      !std::equal(batch_dim_numbers.begin(), batch_dim_numbers.end(),
-                  rhs_batch_dimensions.begin())) {
-    return InvalidArgument(
-        "Batch dimension numbers must precede non-batch dimensions and be"
-        "monotonically increasing.");
-  }
-
   return Status::OK();
 }
 
@@ -637,28 +633,33 @@ Status ValidateDotDimensionNumbers(
     return fail("Element types do not match.");
   }
 
-  if ((ShapeUtil::Rank(lhs) < 1) || (ShapeUtil::Rank(rhs) < 1)) {
+  if ((lhs.rank() < 1) || (rhs.rank() < 1)) {
     return fail("Dot only supports rank 1 or above.");
   }
 
   // Validate basic properties of dot dimension numbers.
   TF_RETURN_IF_ERROR(ValidateDotDimensionNumbers(lhs, rhs, dimension_numbers));
 
-  // Check that there is only one contracting dimension for both lhs and rhs.
+  // Check that number of contracting dimensions match.
   if (dimension_numbers.lhs_contracting_dimensions_size() !=
-          dimension_numbers.rhs_contracting_dimensions_size() ||
-      dimension_numbers.lhs_contracting_dimensions_size() != 1) {
-    return fail("Must specify one contracting dimension for both lhs and rhs.");
+      dimension_numbers.rhs_contracting_dimensions_size()) {
+    return fail(
+        "Must specify the same number of contracting dimensions for lhs and "
+        "rhs.");
   }
-
   // Check that contracting dimension sizes match.
-  const int64 lhs_contracting_dimension =
-      dimension_numbers.lhs_contracting_dimensions(0);
-  const int64 rhs_contracting_dimension =
-      dimension_numbers.rhs_contracting_dimensions(0);
-  if (lhs.dimensions(lhs_contracting_dimension) !=
-      rhs.dimensions(rhs_contracting_dimension)) {
-    return fail("Contracting dimension sizes do not match.");
+  for (int64 i = 0; i < dimension_numbers.lhs_contracting_dimensions_size();
+       ++i) {
+    const int64 lhs_contracting_dimension =
+        dimension_numbers.lhs_contracting_dimensions(i);
+    const int64 rhs_contracting_dimension =
+        dimension_numbers.rhs_contracting_dimensions(i);
+    if (lhs.dimensions(lhs_contracting_dimension) !=
+            rhs.dimensions(rhs_contracting_dimension) ||
+        lhs.is_dynamic_dimension(lhs_contracting_dimension) !=
+            rhs.is_dynamic_dimension(rhs_contracting_dimension)) {
+      return fail("Contracting dimension sizes do not match.");
+    }
   }
 
   // Check that number of batch dimensions match.
@@ -669,11 +670,12 @@ Status ValidateDotDimensionNumbers(
 
   // Check that batch dimension numbers and sizes match.
   for (int64 i = 0; i < dimension_numbers.lhs_batch_dimensions_size(); ++i) {
-    if (dimension_numbers.lhs_batch_dimensions(i) !=
-            dimension_numbers.rhs_batch_dimensions(i) ||
-        lhs.dimensions(dimension_numbers.lhs_batch_dimensions(i)) !=
-            rhs.dimensions(dimension_numbers.rhs_batch_dimensions(i))) {
-      return fail("Batch dimension numbers and sizes must match for lhs/rhs.");
+    if (lhs.dimensions(dimension_numbers.lhs_batch_dimensions(i)) !=
+            rhs.dimensions(dimension_numbers.rhs_batch_dimensions(i)) ||
+        lhs.is_dynamic_dimension(dimension_numbers.lhs_batch_dimensions(i)) !=
+            rhs.is_dynamic_dimension(
+                dimension_numbers.rhs_batch_dimensions(i))) {
+      return fail("Batch dimension sizes must match for lhs/rhs.");
     }
   }
 
@@ -683,21 +685,29 @@ Status ValidateDotDimensionNumbers(
   // Generate the result dimensions in order, rhs dimensions followed by lhs
   // dimensions except the contracted and batch dimensions.
   std::vector<int64> dimensions;
-  std::unordered_set<int64> rhs_batch_dims(
-      dimension_numbers.rhs_batch_dimensions().begin(),
-      dimension_numbers.rhs_batch_dimensions().end());
-  for (int64 i = 0; i < ShapeUtil::Rank(lhs); i++) {
-    if (i != lhs_contracting_dimension) {
+  std::vector<bool> is_dynamic;
+  for (int64 lhs_dim : dimension_numbers.lhs_batch_dimensions()) {
+    dimensions.push_back(lhs.dimensions(lhs_dim));
+    is_dynamic.push_back(lhs.is_dynamic_dimension(lhs_dim));
+  }
+  for (int64 i = 0; i < lhs.rank(); i++) {
+    if (!absl::c_linear_search(dimension_numbers.lhs_contracting_dimensions(),
+                               i) &&
+        !absl::c_linear_search(dimension_numbers.lhs_batch_dimensions(), i)) {
       dimensions.push_back(lhs.dimensions(i));
+      is_dynamic.push_back(lhs.is_dynamic_dimension(i));
     }
   }
-  for (int64 i = 0; i < ShapeUtil::Rank(rhs); i++) {
-    if (i != rhs_contracting_dimension && rhs_batch_dims.count(i) == 0) {
+  for (int64 i = 0; i < rhs.rank(); i++) {
+    if (!absl::c_linear_search(dimension_numbers.rhs_contracting_dimensions(),
+                               i) &&
+        !absl::c_linear_search(dimension_numbers.rhs_batch_dimensions(), i)) {
       dimensions.push_back(rhs.dimensions(i));
+      is_dynamic.push_back(rhs.is_dynamic_dimension(i));
     }
   }
   Shape result = ShapeUtil::MakeShape(
-      ShapeUtil::HigherPrecisionElementType(lhs, rhs), dimensions);
+      ShapeUtil::HigherPrecisionElementType(lhs, rhs), dimensions, is_dynamic);
 
   TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(result));
   VLOG(2) << "inferred dot shape: " << ShapeUtil::HumanString(result);
@@ -708,20 +718,24 @@ Status ValidateDotDimensionNumbers(
 ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
                                                        const Shape& lhs,
                                                        const Shape& rhs) {
-  TF_RET_CHECK(ShapeUtil::Rank(lhs) == ShapeUtil::Rank(rhs));
+  TF_RET_CHECK(lhs.rank() == rhs.rank());
 
   // The shapes have to be compatible. That is, if some dimension d has a
   // different size in the two shapes, one of them has to be 1 (a "degenerate"
   // dimension). In that case, the output shape has the non-1 dimension size
   // from the lhs/rhs pair in every index.
-  std::vector<int64> output_dimensions(ShapeUtil::Rank(lhs));
-  for (int64 i = 0; i < ShapeUtil::Rank(lhs); ++i) {
+  std::vector<int64> output_dimensions(lhs.rank());
+  std::vector<bool> output_dimensions_is_dynamic(lhs.rank());
+  for (int64 i = 0; i < lhs.rank(); ++i) {
     if (lhs.dimensions(i) == rhs.dimensions(i)) {
       output_dimensions[i] = lhs.dimensions(i);
+      output_dimensions_is_dynamic[i] = lhs.is_dynamic_dimension(i);
     } else if (lhs.dimensions(i) == 1) {
       output_dimensions[i] = rhs.dimensions(i);
+      output_dimensions_is_dynamic[i] = rhs.is_dynamic_dimension(i);
     } else if (rhs.dimensions(i) == 1) {
       output_dimensions[i] = lhs.dimensions(i);
+      output_dimensions_is_dynamic[i] = lhs.is_dynamic_dimension(i);
     } else {
       return InvalidArgument(
           "Binary op %s with incompatible shapes: %s and %s.",
@@ -730,7 +744,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
     }
   }
   return ShapeUtil::MakeShape(ShapeUtil::HigherPrecisionElementType(lhs, rhs),
-                              output_dimensions);
+                              output_dimensions, output_dimensions_is_dynamic);
 }
 
 /* static */ StatusOr<Shape> ShapeInference::InferInDimBroadcastShape(
@@ -743,13 +757,13 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
     return InvalidArgument("Automatic shape inference not supported: %s and %s",
                            ShapeUtil::HumanString(smaller_shape),
                            ShapeUtil::HumanString(larger_shape));
-  } else if (broadcast_dimensions.size() != ShapeUtil::Rank(smaller_shape)) {
+  } else if (broadcast_dimensions.size() != smaller_shape.rank()) {
     return InvalidArgument(
         "Size of broadcast_dimensions has to match lower-rank operand's "
         "rank; "
         " lower-rank operand's rank is %d, size of broadcast_dimensions is "
         "%u.",
-        ShapeUtil::Rank(smaller_shape), broadcast_dimensions.size());
+        smaller_shape.rank(), broadcast_dimensions.size());
   }
 
   // broadcast_dimensions is a sequence of dimensions; its length is equal to
@@ -809,6 +823,9 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
     }
     int64 small_dimension_size = smaller_shape.dimensions(i);
     int64 large_dimension_size = larger_shape.dimensions(dimension_to_match);
+    bool small_is_dynamic = smaller_shape.is_dynamic_dimension(i);
+    bool large_is_dynamic =
+        larger_shape.is_dynamic_dimension(dimension_to_match);
     // Dimension sizes must be compatible: match or be degenerate (degenerate
     // case is handled by degenerate dimension broadcasting which occurs after
     // InDim broadcasting).
@@ -820,6 +837,18 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
           ShapeUtil::HumanString(smaller_shape),
           ShapeUtil::HumanString(larger_shape));
     }
+    if (small_is_dynamic != large_is_dynamic) {
+      if (small_dimension_size == large_dimension_size ||
+          (small_dimension_size == 1 && !small_is_dynamic) ||
+          (large_dimension_size == 1 && !large_is_dynamic)) {
+        // Do nothing. It's OK when the size-1 dimension is not static.
+      } else {
+        return InvalidArgument(
+            "Broadcast dimension %d dynamism mismatch: %s and %s.", i,
+            ShapeUtil::HumanString(smaller_shape),
+            ShapeUtil::HumanString(larger_shape));
+      }
+    }
     // Make sure the broadcast dimensions are listed in a strictly increasing
     // order.
     if (i > 0 && broadcast_dimensions.at(i - 1) >= dimension_to_match) {
@@ -829,6 +858,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
     }
 
     output_shape.set_dimensions(dimension_to_match, small_dimension_size);
+    output_shape.set_dynamic_dimension(dimension_to_match, small_is_dynamic);
   }
 
   return output_shape;
@@ -847,8 +877,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
         ShapeUtil::HumanString(rhs));
   }
 
-  if (ShapeUtil::Rank(lhs) == ShapeUtil::Rank(rhs)) {
-    std::vector<int64> identity_dims(ShapeUtil::Rank(lhs));
+  if (lhs.rank() == rhs.rank()) {
+    std::vector<int64> identity_dims(lhs.rank());
     std::iota(identity_dims.begin(), identity_dims.end(), 0);
     if (!broadcast_dimensions.empty() &&
         broadcast_dimensions != identity_dims) {
@@ -865,15 +895,13 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
         lhs, ShapeUtil::HigherPrecisionElementType(lhs, rhs));
   }
 
-  if (ShapeUtil::Rank(lhs) == ShapeUtil::Rank(rhs)) {
+  if (lhs.rank() == rhs.rank()) {
     return InferDegenerateDimensionBroadcastShape(operation, lhs, rhs);
   } else {
     // Ranks do not match, so perform InDim broadcasting using
     // broadcast_dimensions. Scalar broadcasting is a special case of this.
-    const Shape& larger_shape =
-        ShapeUtil::Rank(lhs) > ShapeUtil::Rank(rhs) ? lhs : rhs;
-    const Shape& smaller_shape =
-        ShapeUtil::Rank(lhs) > ShapeUtil::Rank(rhs) ? rhs : lhs;
+    const Shape& larger_shape = lhs.rank() > rhs.rank() ? lhs : rhs;
+    const Shape& smaller_shape = lhs.rank() > rhs.rank() ? rhs : lhs;
 
     // After InDim broadcasting, perform degenerate dimensions broadcasting.
     TF_ASSIGN_OR_RETURN(Shape indim_broadcast_shape,
@@ -942,6 +970,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
                                                         broadcast_dimensions));
       if (lhs.element_type() == F32 && rhs.element_type() == F32) {
         return ShapeUtil::ChangeElementType(shape, C64);
+      } else if (lhs.element_type() == F64 && rhs.element_type() == F64) {
+        return ShapeUtil::ChangeElementType(shape, C128);
       } else {
         return Unimplemented("Complex component type is not implemented.");
       }
@@ -1162,12 +1192,12 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
   TF_RET_CHECK(ShapeUtil::ValidateShapeWithOptionalLayout(scale_shape) ==
                Status::OK());
 
-  if (feature_index >= ShapeUtil::Rank(operand_shape)) {
+  if (feature_index >= operand_shape.rank()) {
     return InvalidArgument(
         "Expected feature_index of batch-norm-training to be "
         "smaller than the rank of operand_shape; "
         "got feature_index %d, and rank %d.",
-        feature_index, ShapeUtil::Rank(operand_shape));
+        feature_index, operand_shape.rank());
   }
 
   if (feature_index < 0) {
@@ -1177,25 +1207,25 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
         feature_index);
   }
 
-  if (ShapeUtil::Rank(operand_shape) < 1) {
+  if (operand_shape.rank() < 1) {
     return InvalidArgument(
         "Expected the rank of operand to "
         "batch-norm-training to be at least 1; got %d.",
-        ShapeUtil::Rank(operand_shape));
+        operand_shape.rank());
   }
 
-  if (ShapeUtil::Rank(offset_shape) != 1) {
+  if (offset_shape.rank() != 1) {
     return InvalidArgument(
         "Offset input of batch-norm-training must have"
         " rank 1, but has rank %d.",
-        ShapeUtil::Rank(offset_shape));
+        offset_shape.rank());
   }
 
-  if (ShapeUtil::Rank(scale_shape) != 1) {
+  if (scale_shape.rank() != 1) {
     return InvalidArgument(
         "Scale input of batch-norm-training must have"
         " rank 1, but has rank %d.",
-        ShapeUtil::Rank(scale_shape));
+        scale_shape.rank());
   }
 
   if (!ShapeUtil::ElementIsFloating(operand_shape)) {
@@ -1272,12 +1302,12 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
   TF_RET_CHECK(ShapeUtil::ValidateShapeWithOptionalLayout(variance_shape) ==
                Status::OK());
 
-  if (feature_index >= ShapeUtil::Rank(operand_shape)) {
+  if (feature_index >= operand_shape.rank()) {
     return InvalidArgument(
         "Expected feature_index of batch-norm-inference to be "
         "smaller than the rank of operand_shape; "
         "got feature_index %d, and rank %d.",
-        feature_index, ShapeUtil::Rank(operand_shape));
+        feature_index, operand_shape.rank());
   }
 
   if (feature_index < 0) {
@@ -1287,25 +1317,25 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
         feature_index);
   }
 
-  if (ShapeUtil::Rank(operand_shape) < 1) {
+  if (operand_shape.rank() < 1) {
     return InvalidArgument(
         "Expected the rank of operand to "
         "batch-norm-inference to be at least 1; got %d.",
-        ShapeUtil::Rank(operand_shape));
+        operand_shape.rank());
   }
 
-  if (ShapeUtil::Rank(offset_shape) != 1) {
+  if (offset_shape.rank() != 1) {
     return InvalidArgument(
         "Offset input of batch-norm-inference must have"
         " rank 1, but has rank %d.",
-        ShapeUtil::Rank(offset_shape));
+        offset_shape.rank());
   }
 
-  if (ShapeUtil::Rank(scale_shape) != 1) {
+  if (scale_shape.rank() != 1) {
     return InvalidArgument(
         "Scale input of batch-norm-inference must have"
         " rank 1, but has rank %d.",
-        ShapeUtil::Rank(scale_shape));
+        scale_shape.rank());
   }
 
   if (!ShapeUtil::ElementIsFloating(operand_shape)) {
@@ -1417,41 +1447,41 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
   TF_RETURN_IF_ERROR(
       ShapeUtil::ValidateShapeWithOptionalLayout(output_grad_shape));
 
-  if (feature_index >= ShapeUtil::Rank(operand_shape)) {
+  if (feature_index >= operand_shape.rank()) {
     return InvalidArgument(
         "Expected feature_index of batch-norm-grad to be "
         "smaller than the rank of operand_shape; "
         "got feature_index %d, and rank %d.",
-        feature_index, ShapeUtil::Rank(operand_shape));
+        feature_index, operand_shape.rank());
   }
 
-  if (ShapeUtil::Rank(operand_shape) != ShapeUtil::Rank(output_grad_shape)) {
+  if (operand_shape.rank() != output_grad_shape.rank()) {
     return InvalidArgument(
         "Expected operand_shape of batch-norm-grad to have the same rank as"
         " output_grad_shape; got rank(oprand_shape) %d, and"
         " rank(output_grad_shape) %d.",
-        ShapeUtil::Rank(operand_shape), ShapeUtil::Rank(output_grad_shape));
+        operand_shape.rank(), output_grad_shape.rank());
   }
 
-  if (ShapeUtil::Rank(mean_shape) != 1) {
+  if (mean_shape.rank() != 1) {
     return InvalidArgument(
         "Mean input of batch-norm-grad must have"
         " rank 1, but has rank %d.",
-        ShapeUtil::Rank(mean_shape));
+        mean_shape.rank());
   }
 
-  if (ShapeUtil::Rank(scale_shape) != 1) {
+  if (scale_shape.rank() != 1) {
     return InvalidArgument(
         "Scale input of batch-norm-grad must have"
         " rank 1, but has rank %d.",
-        ShapeUtil::Rank(scale_shape));
+        scale_shape.rank());
   }
 
-  if (ShapeUtil::Rank(var_shape) != 1) {
+  if (var_shape.rank() != 1) {
     return InvalidArgument(
         "Var input of batch-norm-grad must have"
         " rank 1, but has rank %d.",
-        ShapeUtil::Rank(var_shape));
+        var_shape.rank());
   }
 
   if (!ShapeUtil::ElementIsFloating(operand_shape)) {
@@ -1538,7 +1568,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
   }
 
   // Verify operand_shape and output_grad_shape have same bounds.
-  for (int64 i = 0; i < ShapeUtil::Rank(operand_shape); ++i) {
+  for (int64 i = 0; i < operand_shape.rank(); ++i) {
     if (ShapeUtil::GetDimension(operand_shape, i) !=
         ShapeUtil::GetDimension(output_grad_shape, i)) {
       return InvalidArgument(
@@ -1556,7 +1586,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
 
 /* static */ StatusOr<Shape> ShapeInference::InferConvolveShape(
     const Shape& lhs, const Shape& rhs, int64 feature_group_count,
-    const Window& window, const ConvolutionDimensionNumbers& dnums) {
+    int64 batch_group_count, const Window& window,
+    const ConvolutionDimensionNumbers& dnums) {
   TF_RETURN_IF_ERROR(ExpectArray(lhs, "lhs of convolution"));
   TF_RETURN_IF_ERROR(ExpectArray(rhs, "rhs of convolution"));
 
@@ -1565,6 +1596,20 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
         "feature_group_count must be a positive number, got %d",
         feature_group_count);
   }
+
+  if (batch_group_count <= 0) {
+    return InvalidArgument(
+        "batch_group_count must be a positive number, got %d",
+        batch_group_count);
+  }
+
+  if (batch_group_count > 1 && feature_group_count > 1) {
+    return InvalidArgument(
+        "both batch_group_count %d and feature_group_count %d cannot be "
+        "greater than 1",
+        batch_group_count, feature_group_count);
+  }
+
   if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(lhs, rhs)) {
     return InvalidArgument(
         "Convolution with different element types: %s and %s.",
@@ -1595,12 +1640,12 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
   }
 
   const int num_dims = num_spatial_dims + 2;
-  if (ShapeUtil::Rank(lhs) != num_dims) {
+  if (lhs.rank() != num_dims) {
     return InvalidArgument(
         "The LHS argument to a convolution should have rank %d; lhs: %s.",
         num_dims, ShapeUtil::HumanString(lhs));
   }
-  if (ShapeUtil::Rank(rhs) != num_dims) {
+  if (rhs.rank() != num_dims) {
     return InvalidArgument(
         "The RHS argument to a convolution should have rank %d; rhs: %s.",
         num_dims, ShapeUtil::HumanString(rhs));
@@ -1615,29 +1660,29 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
   input_dnums[1] = dnums.input_feature_dimension();
   std::copy(dnums.input_spatial_dimensions().begin(),
             dnums.input_spatial_dimensions().end(), input_dnums.begin() + 2);
-  std::sort(input_dnums.begin(), input_dnums.end());
+  absl::c_sort(input_dnums);
 
   std::vector<int64> window_dnums(num_dims);
   window_dnums[0] = dnums.kernel_input_feature_dimension();
   window_dnums[1] = dnums.kernel_output_feature_dimension();
   std::copy(dnums.kernel_spatial_dimensions().begin(),
             dnums.kernel_spatial_dimensions().end(), window_dnums.begin() + 2);
-  std::sort(window_dnums.begin(), window_dnums.end());
+  absl::c_sort(window_dnums);
 
   std::vector<int64> output_dnums(num_dims);
   output_dnums[0] = dnums.output_batch_dimension();
   output_dnums[1] = dnums.output_feature_dimension();
   std::copy(dnums.output_spatial_dimensions().begin(),
             dnums.output_spatial_dimensions().end(), output_dnums.begin() + 2);
-  std::sort(output_dnums.begin(), output_dnums.end());
+  absl::c_sort(output_dnums);
 
   std::vector<int64> expected_dnums(num_dims);
   std::iota(expected_dnums.begin(), expected_dnums.end(), 0);
 
   const auto in_range = [num_dims](int64 i) { return 0 <= i && i < num_dims; };
-  if (!std::all_of(input_dnums.begin(), input_dnums.end(), in_range) ||
-      !std::all_of(window_dnums.begin(), window_dnums.end(), in_range) ||
-      !std::all_of(output_dnums.begin(), output_dnums.end(), in_range)) {
+  if (!absl::c_all_of(input_dnums, in_range) ||
+      !absl::c_all_of(window_dnums, in_range) ||
+      !absl::c_all_of(output_dnums, in_range)) {
     return InvalidArgument(
         "A dimension number is out of range in convolution: %s.",
         dnums.DebugString());
@@ -1678,6 +1723,17 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
   const int64 kernel_output_features =
       rhs.dimensions(dnums.kernel_output_feature_dimension());
 
+  if (batch_group_count > 1 && input_batch % kernel_output_features != 0) {
+    return InvalidArgument(
+        "Expected output feature dimension (value %d) to be divisible by "
+        "input_batch (value %d) for batch group count %d; "
+        "got <conv>(%s, %s)\n"
+        "Dimension numbers: {%s}.",
+        kernel_output_features, input_batch, batch_group_count,
+        ShapeUtil::HumanString(lhs), ShapeUtil::HumanString(rhs),
+        dnums.DebugString());
+  }
+
   if (input_features % feature_group_count != 0 ||
       input_features / feature_group_count != kernel_input_features) {
     return InvalidArgument(
@@ -1700,6 +1756,17 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
         ShapeUtil::HumanString(lhs), ShapeUtil::HumanString(rhs),
         dnums.DebugString());
   }
+
+  if (input_batch % batch_group_count > 0) {
+    return InvalidArgument(
+        "Expected input batch dimension (value %d) to be divisible by "
+        "batch_group_count (value %d); "
+        "got <conv>(%s, %s)\n"
+        "Dimension numbers: {%s}.",
+        input_batch, batch_group_count, ShapeUtil::HumanString(lhs),
+        ShapeUtil::HumanString(rhs), dnums.DebugString());
+  }
+
   std::vector<int64> window_dims(num_spatial_dims);
   for (int i = 0; i < num_spatial_dims; ++i) {
     window_dims[i] = window.dimensions(i).size();
@@ -1722,14 +1789,39 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
                              /*allow_negative_padding=*/true));
 
   std::vector<int64> dimensions(num_dims);
-  dimensions[dnums.output_batch_dimension()] = input_batch;
+  dimensions[dnums.output_batch_dimension()] = input_batch / batch_group_count;
   dimensions[dnums.output_feature_dimension()] = kernel_output_features;
   for (int i = 0; i < num_spatial_dims; ++i) {
     dimensions[dnums.output_spatial_dimensions(i)] =
         window_output_shape.dimensions(i);
   }
+  std::vector<bool> is_dynamic(num_dims);
+  for (int i = 0; i < num_dims; i++) {
+    if (lhs.is_dynamic_dimension(i)) {
+      if (i == dnums.input_batch_dimension()) {
+        is_dynamic[dnums.output_batch_dimension()] = true;
+      } else if (i == dnums.input_feature_dimension()) {
+        // Input feature dimension is a contracting dimension, which does not
+        // affect the output dimension size. So we need to do nothing.
+      } else {
+        return InvalidArgument(
+            "Dynamic Spatial Convolution is not supported: lhs shape is %s ",
+            lhs.ToString());
+      }
+    }
+    if (rhs.is_dynamic_dimension(i)) {
+      if (i == dnums.kernel_input_feature_dimension()) {
+        // Kernel feature dimension does not affect the output dimension size.
+        // So we need to do nothing.
+      } else {
+        return InvalidArgument(
+            "Dynamic Spatial Convolution is not supported: rhs shape is %s ",
+            rhs.ToString());
+      }
+    }
+  }
   return ShapeUtil::MakeShape(ShapeUtil::HigherPrecisionElementType(lhs, rhs),
-                              dimensions);
+                              dimensions, is_dynamic);
 }
 
 /* static */ StatusOr<Shape> ShapeInference::InferFftShape(
@@ -1750,7 +1842,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
     case FFT:
     case IFFT:
       if (in.element_type() != C64) {
-        return InvalidArgument("%s requires C64 input type, found %s.",
+        return InvalidArgument("%s requires complex input type, found %s.",
                                FftType_Name(fft_type),
                                PrimitiveType_Name(in.element_type()));
       }
@@ -1773,6 +1865,9 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
               fft_length[i]);
         }
       }
+      if (ShapeUtil::IsZeroElementArray(in)) {
+        return in;
+      }
       Shape result = ShapeUtil::ChangeElementType(in, C64);
       result.set_dimensions(result.dimensions_size() - 1,
                             fft_length[fft_rank - 1] / 2 + 1);
@@ -1814,7 +1909,50 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
 #undef RET_CHECK_RANK
 }
 
-/* static */ StatusOr<Shape> ShapeInference::InferCrossReplicaSumShape(
+/* static */ StatusOr<Shape> ShapeInference::InferTriangularSolveShape(
+    const Shape& a, const Shape& b, const TriangularSolveOptions& options) {
+  if (a.rank() < 2) {
+    return InvalidArgument(
+        "The 'a' argument to TriangularSolve must have rank >= 2, got shape %s",
+        a.ToString());
+  }
+  if (b.rank() != a.rank()) {
+    return InvalidArgument(
+        "Arguments to triangular solve must have equal rank; got %s and %s.",
+        b.ToString(), a.ToString());
+  }
+  if (a.dimensions(a.rank() - 2) != a.dimensions(a.rank() - 1)) {
+    return InvalidArgument(
+        "The two minor dimensions of 'a' must have equal size, got %s.",
+        a.ToString());
+  }
+  if (a.dimensions(a.rank() - 1) !=
+      b.dimensions(b.rank() - (options.left_side() ? 2 : 1))) {
+    return InvalidArgument(
+        "The shared dimension of 'a' and 'b' does not match, got shapes %s and "
+        "%s",
+        a.ToString(), b.ToString());
+  }
+  absl::Span<const int64> a_batch_dims(a.dimensions());
+  absl::Span<const int64> b_batch_dims(b.dimensions());
+  a_batch_dims.remove_suffix(2);
+  b_batch_dims.remove_suffix(2);
+  if (a_batch_dims != b_batch_dims) {
+    return InvalidArgument(
+        "The leading batch dimensions of the arguments to triangular solve "
+        "must be equal; got %s and %s.",
+        b.ToString(), a.ToString());
+  }
+  if (!TriangularSolveOptions_Transpose_IsValid(options.transpose_a()) ||
+      options.transpose_a() == TriangularSolveOptions::TRANSPOSE_INVALID) {
+    return InvalidArgument(
+        "Invalid transpose option value for triangular solve (%d).\n",
+        options.transpose_a());
+  }
+  return b;
+}
+
+/* static */ StatusOr<Shape> ShapeInference::InferAllReduceShape(
     absl::Span<const Shape* const> operand_shapes) {
   for (const Shape* operand_shape : operand_shapes) {
     TF_RETURN_IF_ERROR(
@@ -1834,12 +1972,12 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
     const Shape& shape, int64 split_dimension, int64 concat_dimension,
     int64 split_count) {
   TF_RET_CHECK(split_count > 0);
-  if (split_dimension >= ShapeUtil::Rank(shape) || split_dimension < 0) {
+  if (split_dimension >= shape.rank() || split_dimension < 0) {
     return InvalidArgument(
         "AllToAll split_dimension %d is out-of-bounds in shape %s.",
         split_dimension, ShapeUtil::HumanString(shape));
   }
-  if (concat_dimension >= ShapeUtil::Rank(shape) || concat_dimension < 0) {
+  if (concat_dimension >= shape.rank() || concat_dimension < 0) {
     return InvalidArgument(
         "AllToAll concat_dimension %d is out-of-bounds in shape %s.",
         concat_dimension, ShapeUtil::HumanString(shape));
@@ -1877,7 +2015,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
 
 /* static */ StatusOr<Shape> ShapeInference::InferCollectivePermuteShape(
     const Shape& shape) {
-  TF_RET_CHECK(ShapeUtil::IsArray(shape));
+  TF_RET_CHECK(shape.IsArray());
   return shape;
 }
 
@@ -1901,7 +2039,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
   for (int64 i = 1; i < num_reduced_args; ++i) {
     if (!ShapeUtil::SameDimensions(*reduced_args[0], *reduced_args[i])) {
       return InvalidArgument(
-          "All reduced tensors must have the sime dimension. Tensor 0 has "
+          "All reduced tensors must have the same dimension. Tensor 0 has "
           "shape %s, Tensor %d has shape %s",
           ShapeUtil::HumanString(*reduced_args[0]), i,
           ShapeUtil::HumanString(*reduced_args[i]));
@@ -1913,7 +2051,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
   // doesn't matter which one we choose.
   const Shape& arg = *reduced_args[0];
   for (int64 dimension : dimensions_to_reduce) {
-    if (dimension >= ShapeUtil::Rank(arg) || dimension < 0) {
+    if (dimension >= arg.rank() || dimension < 0) {
       return InvalidArgument("Reducing out-of-bounds dimension %d in shape %s.",
                              dimension, ShapeUtil::HumanString(arg));
     }
@@ -1930,20 +2068,22 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
   std::set<int64> dimensions_to_reduce_set(dimensions_to_reduce.begin(),
                                            dimensions_to_reduce.end());
   std::vector<int64> new_dimensions;
-  for (int i = 0; i < ShapeUtil::Rank(arg); ++i) {
+  std::vector<bool> new_is_dynamic;
+  for (int i = 0; i < arg.rank(); ++i) {
     if (dimensions_to_reduce_set.find(i) == dimensions_to_reduce_set.end()) {
       new_dimensions.push_back(arg.dimensions(i));
+      new_is_dynamic.push_back(arg.is_dynamic_dimension(i));
     }
   }
 
   if (ShapeUtil::IsScalar(to_apply.result())) {
     return ShapeUtil::MakeShape(to_apply.result().element_type(),
-                                new_dimensions);
+                                new_dimensions, new_is_dynamic);
   } else {
     std::vector<Shape> result_subshapes;
     for (const Shape& subshape : to_apply.result().tuple_shapes()) {
-      result_subshapes.push_back(
-          ShapeUtil::MakeShape(subshape.element_type(), new_dimensions));
+      result_subshapes.push_back(ShapeUtil::MakeShape(
+          subshape.element_type(), new_dimensions, new_is_dynamic));
     }
     return ShapeUtil::MakeTupleShape(result_subshapes);
   }
@@ -2017,12 +2157,13 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
         ShapeUtil::HumanString(source_shape),
         ShapeUtil::HumanString(window_result_shape));
   }
+
   return operand_shape;
 }
 
 /* static */ StatusOr<Shape> ShapeInference::InferGetDimensionSizeShape(
     const Shape& shape, int64 dimension) {
-  if (dimension < 0 || dimension >= ShapeUtil::Rank(shape)) {
+  if (dimension < 0 || dimension >= shape.rank()) {
     return InvalidArgument("GetDimensionSize dimension out of bounds: %d.",
                            dimension);
   }
@@ -2064,10 +2205,10 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
                            starts.size(), strides.size()));
   }
 
-  if (starts.size() != ShapeUtil::Rank(arg)) {
+  if (starts.size() != arg.rank()) {
     return InvalidArgument(
         "Slice index count does not match argument rank: %u vs %d.",
-        starts.size(), ShapeUtil::Rank(arg));
+        starts.size(), arg.rank());
   }
 
   std::vector<int64> sizes;
@@ -2102,41 +2243,87 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
 }
 
 /* static */ StatusOr<Shape> ShapeInference::InferDynamicSliceShape(
-    const Shape& operand_shape, const Shape& start_indices_shape,
-    absl::Span<const int64> slice_sizes) {
+    const Shape& operand_shape, absl::Span<const Shape> start_index_shapes,
+    absl::Span<const int64> slice_sizes, bool allow_scalar_indices) {
   TF_RETURN_IF_ERROR(ExpectArray(operand_shape, "operand of dynamic slice"));
-  TF_RETURN_IF_ERROR(
-      ExpectArray(start_indices_shape, "start indices of dynamic slice"));
+  auto number_of_indices = start_index_shapes.size();
+  // TODO(b/118437727): Remove this path.
+  if (!allow_scalar_indices ||
+      (number_of_indices >= 1 && start_index_shapes[0].rank() == 1)) {
+    if (number_of_indices != 1) {
+      return InvalidArgument(
+          "Dynamic slice should have exactly 1 index operand, has %d.",
+          number_of_indices);
+    }
 
-  VLOG(2) << StrFormat(
-      "slicing shape %s at dynamic start_indices %s with slice_sizes={%s}",
-      ShapeUtil::HumanString(operand_shape),
-      ShapeUtil::HumanString(start_indices_shape), StrJoin(slice_sizes, ", "));
+    const Shape& start_indices_shape = start_index_shapes[0];
+    VLOG(2) << StrFormat(
+        "slicing shape %s at dynamic start_indices %s with slice_sizes={%s}",
+        ShapeUtil::HumanString(operand_shape),
+        ShapeUtil::HumanString(start_indices_shape),
+        StrJoin(slice_sizes, ", "));
 
-  if (ShapeUtil::Rank(start_indices_shape) != 1) {
-    return InvalidArgument(
-        "Dynamic slice start indices of rank %d must be rank1.",
-        ShapeUtil::Rank(start_indices_shape));
-  }
+    TF_RETURN_IF_ERROR(
+        ExpectArray(start_indices_shape, "start indices of dynamic slice"));
 
-  if (!ShapeUtil::ElementIsIntegral(start_indices_shape)) {
-    return InvalidArgument(
-        "Dynamic slice start indices must be of integral type.");
-  }
+    if (start_indices_shape.rank() != 1) {
+      return InvalidArgument(
+          "Dynamic slice start indices of rank %d must be rank1.",
+          start_indices_shape.rank());
+    }
 
-  const int64 start_num_dims = start_indices_shape.dimensions(0);
-  if (ShapeUtil::Rank(operand_shape) != start_num_dims) {
-    return InvalidArgument(
-        "Dynamic slice start number of dimensions %d (%s) must match rank "
-        "%d of slice input (%s).",
-        start_num_dims, ShapeUtil::HumanString(start_indices_shape),
-        ShapeUtil::Rank(operand_shape), ShapeUtil::HumanString(operand_shape));
+    if (!ShapeUtil::ElementIsIntegral(start_indices_shape)) {
+      return InvalidArgument(
+          "Dynamic slice start indices must be of integral type.");
+    }
+
+    const int64 start_num_dims = start_indices_shape.dimensions(0);
+    if (operand_shape.rank() != start_num_dims) {
+      return InvalidArgument(
+          "Dynamic slice start number of dimensions %d (%s) must match rank "
+          "%d of slice input (%s).",
+          start_num_dims, ShapeUtil::HumanString(start_indices_shape),
+          operand_shape.rank(), ShapeUtil::HumanString(operand_shape));
+    }
+  } else {
+    VLOG(2) << StrFormat("slicing shape %s a with slice_sizes={%s}",
+                         ShapeUtil::HumanString(operand_shape),
+                         StrJoin(slice_sizes, ", "));
+
+    if (operand_shape.rank() != number_of_indices) {
+      return InvalidArgument(
+          "Dynamic slice start number of dimensions %d must match rank "
+          "%d of slice input (%s).",
+          number_of_indices, operand_shape.rank(),
+          ShapeUtil::HumanString(operand_shape));
+    }
+
+    if (number_of_indices > 0) {
+      const Shape& first_index_shape = start_index_shapes[0];
+      if (!ShapeUtil::IsScalar(first_index_shape)) {
+        return InvalidArgument("Dynamic slice indices must be scalar, not %s.",
+                               ShapeUtil::HumanString(first_index_shape));
+      }
+      if (!ShapeUtil::ElementIsIntegral(first_index_shape)) {
+        return InvalidArgument(
+            "Dynamic slice start indices must be of integral type.");
+      }
+      for (const Shape& index_shape : start_index_shapes) {
+        if (!ShapeUtil::Compatible(first_index_shape, index_shape)) {
+          return InvalidArgument(
+              "Dynamic slice start indices must all have the same shape, got "
+              "mismatching indices with shapes %s and %s.",
+              ShapeUtil::HumanString(first_index_shape),
+              ShapeUtil::HumanString(index_shape));
+        }
+      }
+    }
   }
 
-  if (slice_sizes.size() != ShapeUtil::Rank(operand_shape)) {
+  if (slice_sizes.size() != operand_shape.rank()) {
     return InvalidArgument(
         "Dynamic slice index count does not match argument rank: %u vs %d.",
-        slice_sizes.size(), ShapeUtil::Rank(operand_shape));
+        slice_sizes.size(), operand_shape.rank());
   }
 
   for (int64 dim = 0; dim < slice_sizes.size(); ++dim) {
@@ -2159,46 +2346,92 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
 
 /* static */ StatusOr<Shape> ShapeInference::InferDynamicUpdateSliceShape(
     const Shape& operand_shape, const Shape& update_shape,
-    const Shape& start_indices_shape) {
+    absl::Span<const Shape> start_index_shapes, bool allow_scalar_indices) {
   TF_RETURN_IF_ERROR(
       ExpectArray(operand_shape, "operand of dynamic update slice"));
   TF_RETURN_IF_ERROR(
       ExpectArray(update_shape, "update of dynamic update slice"));
-  TF_RETURN_IF_ERROR(ExpectArray(start_indices_shape,
-                                 "start indices of dynamic update slice"));
 
-  VLOG(2) << StrFormat(
-      "updating slice of shape %s at dynamic start_indices %s with update "
-      "shape %s",
-      ShapeUtil::HumanString(operand_shape),
-      ShapeUtil::HumanString(start_indices_shape),
-      ShapeUtil::HumanString(update_shape));
+  auto number_of_indices = start_index_shapes.size();
+  // TODO(b/118437727): Remove this path.
+  if (!allow_scalar_indices ||
+      (number_of_indices >= 1 && start_index_shapes[0].rank() == 1)) {
+    if (number_of_indices != 1) {
+      return InvalidArgument(
+          "Dynamic update slice should have exactly 1 index operand, has %d.",
+          number_of_indices);
+    }
+    const Shape& start_indices_shape = start_index_shapes[0];
+    TF_RETURN_IF_ERROR(ExpectArray(start_indices_shape,
+                                   "start indices of dynamic update slice"));
 
-  if (ShapeUtil::Rank(start_indices_shape) != 1) {
-    return InvalidArgument(
-        "Dynamic update slice start indices of rank %d must be rank1.",
-        ShapeUtil::Rank(start_indices_shape));
-  }
+    VLOG(2) << StrFormat(
+        "updating slice of shape %s at dynamic start_indices %s with update "
+        "shape %s",
+        ShapeUtil::HumanString(operand_shape),
+        ShapeUtil::HumanString(start_indices_shape),
+        ShapeUtil::HumanString(update_shape));
 
-  if (!ShapeUtil::ElementIsIntegral(start_indices_shape)) {
-    return InvalidArgument(
-        "Dynamic update slice start indices must be of integral type.");
-  }
+    if (start_indices_shape.rank() != 1) {
+      return InvalidArgument(
+          "Dynamic update slice start indices of rank %d must be rank1.",
+          start_indices_shape.rank());
+    }
 
-  const int64 start_num_dims = start_indices_shape.dimensions(0);
-  if (ShapeUtil::Rank(operand_shape) != start_num_dims) {
-    return InvalidArgument(
-        "Dynamic update slice start number of dimensions %d (%s) must match "
-        "rank %d of slice input (%s).",
-        start_num_dims, ShapeUtil::HumanString(start_indices_shape),
-        ShapeUtil::Rank(operand_shape), ShapeUtil::HumanString(operand_shape));
+    if (!ShapeUtil::ElementIsIntegral(start_indices_shape)) {
+      return InvalidArgument(
+          "Dynamic update slice start indices must be of integral type.");
+    }
+
+    const int64 start_num_dims = start_indices_shape.dimensions(0);
+    if (operand_shape.rank() != start_num_dims) {
+      return InvalidArgument(
+          "Dynamic update slice start number of dimensions %d (%s) must match "
+          "rank %d of slice input (%s).",
+          start_num_dims, ShapeUtil::HumanString(start_indices_shape),
+          operand_shape.rank(), ShapeUtil::HumanString(operand_shape));
+    }
+  } else {
+    VLOG(2) << StrFormat("updating slice of shape %s with update shape %s",
+                         ShapeUtil::HumanString(operand_shape),
+                         ShapeUtil::HumanString(update_shape));
+
+    if (operand_shape.rank() != number_of_indices) {
+      return InvalidArgument(
+          "Dynamic update slice start number of dimensions %d must match "
+          "rank %d of slice input (%s).",
+          number_of_indices, operand_shape.rank(),
+          ShapeUtil::HumanString(operand_shape));
+    }
+
+    if (number_of_indices > 0) {
+      const Shape& first_index_shape = start_index_shapes[0];
+      if (!ShapeUtil::IsScalar(first_index_shape)) {
+        return InvalidArgument(
+            "Dynamic update slice indices must be scalar, not %s.",
+            ShapeUtil::HumanString(first_index_shape));
+      }
+      if (!ShapeUtil::ElementIsIntegral(first_index_shape)) {
+        return InvalidArgument(
+            "Dynamic update slice start indices must be of integral type.");
+      }
+      for (const Shape& index_shape : start_index_shapes) {
+        if (!ShapeUtil::Compatible(first_index_shape, index_shape)) {
+          return InvalidArgument(
+              "Dynamic update slice start indices must all have the same "
+              "shape, got mismatching indices with shapes %s and %s.",
+              ShapeUtil::HumanString(first_index_shape),
+              ShapeUtil::HumanString(index_shape));
+        }
+      }
+    }
   }
 
-  if (ShapeUtil::Rank(update_shape) != ShapeUtil::Rank(operand_shape)) {
+  if (update_shape.rank() != operand_shape.rank()) {
     return InvalidArgument(
         "Dynamic update slice update rank does not match argument rank: "
         "%d vs %d.",
-        ShapeUtil::Rank(update_shape), ShapeUtil::Rank(operand_shape));
+        update_shape.rank(), operand_shape.rank());
   }
 
   if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(operand_shape,
@@ -2210,7 +2443,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
         PrimitiveType_Name(update_shape.element_type()));
   }
 
-  for (int64 dim = 0; dim < ShapeUtil::Rank(operand_shape); ++dim) {
+  for (int64 dim = 0; dim < operand_shape.rank(); ++dim) {
     const int64 input_dim_size = operand_shape.dimensions(dim);
     const int64 update_dim_size = update_shape.dimensions(dim);
     if (update_dim_size < 0) {
@@ -2236,7 +2469,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
     return InvalidArgument("a dimension number is duplicated in reverse");
   }
   for (int64 dimension : dimensions) {
-    if (dimension >= ShapeUtil::Rank(operand_shape) || dimension < 0) {
+    if (dimension >= operand_shape.rank() || dimension < 0) {
       return InvalidArgument(
           "One of the reverse dimensions (%d) is out-of-bounds in shape %s.",
           dimension, ShapeUtil::HumanString(operand_shape));
@@ -2247,13 +2480,13 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
 
 /* static */ StatusOr<Shape> ShapeInference::InferGetTupleElementShape(
     const Shape& arg, int64 index) {
-  if (!ShapeUtil::IsTuple(arg)) {
+  if (!arg.IsTuple()) {
     return InvalidArgument(
         "Cannot infer shape: attempting to index into non-tuple: %s.",
         ShapeUtil::HumanString(arg));
   }
 
-  if (index >= arg.tuple_shapes_size()) {
+  if (index < 0 || index >= arg.tuple_shapes_size()) {
     return InvalidArgument(
         "Cannot infer shape: attempt to index out of tuple bounds: %d "
         ">= %d in shape %s.",
@@ -2283,7 +2516,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
   };
 
   // Check the shapes of computation parameters and return types.
-  if (!ShapeUtil::ShapeIs(condition.result(), PRED, {})) {
+  if (!ShapeUtil::Equal(condition.result(), ShapeUtil::MakeShape(PRED, {}))) {
     return InvalidArgument("Condition must return a boolean; got %s.",
                            shape_string());
   }
@@ -2303,7 +2536,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
     const Shape& predicate, const Shape& true_operand,
     const Shape& false_operand, const ProgramShape& true_computation,
     const ProgramShape& false_computation) {
-  if (!ShapeUtil::ShapeIs(predicate, PRED, {})) {
+  if (!ShapeUtil::Equal(predicate, ShapeUtil::MakeShape(PRED, {}))) {
     return InvalidArgument("Predicate must be a boolean; got %s.",
                            ShapeUtil::HumanString(predicate));
   }
@@ -2378,8 +2611,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
     absl::Span<const int64> broadcast_dimensions) {
   TF_RETURN_IF_ERROR(ExpectArray(operand_shape, "operand of broadcast"));
   TF_RETURN_IF_ERROR(ExpectArray(output_shape, "operand of broadcast"));
-  const int64 operand_rank = ShapeUtil::Rank(operand_shape);
-  const int64 output_rank = ShapeUtil::Rank(output_shape);
+  const int64 operand_rank = operand_shape.rank();
+  const int64 output_rank = output_shape.rank();
   if (operand_rank > output_rank) {
     return InvalidArgument(
         "InDim style broadcast must be to an equal or higher ranked shape; "
@@ -2402,11 +2635,17 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
         operand_shape.dimensions(i) != 1) {
       return InvalidArgument(
           "Input dimension should be either 1 or equal to the output dimension "
-          "it's broadcasting into; the %lldth operand dimension is %lld, the "
+          "it is broadcasting into; the %lldth operand dimension is %lld, the "
           "%lldth output dimension is %lld.",
           i, operand_shape.dimensions(i), broadcast_dimensions[i],
           output_shape.dimensions(broadcast_dimensions[i]));
     }
+    if (operand_shape.is_dynamic_dimension(i) !=
+        output_shape.is_dynamic_dimension(broadcast_dimensions[i])) {
+      return InvalidArgument(
+          "Broadcast input and output dynamism mismatch: %s and %s",
+          operand_shape.ToString(), output_shape.ToString());
+    }
     // Make sure the broadcast dimensions are listed in a strictly increasing
     // order.
     if (i > 0 && broadcast_dimensions[i - 1] >= broadcast_dimensions[i]) {
@@ -2438,9 +2677,9 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
         ShapeUtil::HumanString(inferred_shape));
   }
 
-  std::vector<int64> indices(ShapeUtil::Rank(operand));
+  std::vector<int64> indices(operand.rank());
   std::iota(indices.begin(), indices.end(), 0);
-  if (dimensions.size() != ShapeUtil::Rank(operand) ||
+  if (dimensions.size() != operand.rank() ||
       !std::is_permutation(dimensions.begin(), dimensions.end(),
                            indices.begin())) {
     return InvalidArgument(
@@ -2449,6 +2688,14 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
         StrJoin(dimensions, ","), ShapeUtil::HumanString(operand));
   }
 
+  std::vector<std::pair<int64, int64>> unmodified_dims =
+      ShapeUtil::DimensionsUnmodifiedByReshape(operand, inferred_shape);
+  for (auto& unmodified : unmodified_dims) {
+    if (operand.is_dynamic_dimension(unmodified.first)) {
+      inferred_shape.set_dynamic_dimension(unmodified.second, true);
+    }
+  }
+
   return inferred_shape;
 }
 
@@ -2456,11 +2703,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
     const Shape& operand, absl::Span<const int64> dimensions) {
   TF_RETURN_IF_ERROR(ExpectArray(operand, "transpose"));
 
-  std::vector<int64> indices(ShapeUtil::Rank(operand));
-  std::iota(indices.begin(), indices.end(), 0);
-  if (dimensions.size() != ShapeUtil::Rank(operand) ||
-      !std::is_permutation(dimensions.begin(), dimensions.end(),
-                           indices.begin())) {
+  if (!IsPermutation(dimensions, operand.rank())) {
     return InvalidArgument(
         "Transpose dimensions [%s] are not a permutation of the operand "
         "dimensions (operand shape is %s).",
@@ -2522,19 +2765,31 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
         "Select's pred operand must have PRED element type; got %s.",
         ShapeUtil::HumanString(pred));
   }
-  if (ShapeUtil::CompatibleIgnoringElementType(pred, on_true) ||
+  if (Shape::Equal()
+          .IgnoreElementType()
+          .IgnoreLayout()
+          .IgnoreDynamicDimension()(pred, on_true) ||
       ShapeUtil::IsScalar(pred)) {
     // By this stage we know that pred's element type is PRED. Therefore, this
     // check restricts pred to be a PRED scalar, or a PRED array with the same
     // dimensions as on_true and on_false.
-    return ShapeUtil::ChangeElementType(
+    Shape inferred_shape = ShapeUtil::ChangeElementType(
         on_true, ShapeUtil::HigherPrecisionElementType(on_true, on_false));
-  } else {
-    return InvalidArgument(
-        "Select operation with non-scalar predicate with dimensionality "
-        " different from the other operands: %s.",
-        ShapeUtil::HumanString(pred));
+
+    // Propagate dynamic dimensions if pred is not a scalar.
+    if (!ShapeUtil::IsScalar(pred)) {
+      for (int i = 0; i < inferred_shape.rank(); i++) {
+        if (pred.is_dynamic_dimension(i)) {
+          inferred_shape.set_dynamic_dimension(i, true);
+        }
+      }
+    }
+    return inferred_shape;
   }
+  return InvalidArgument(
+      "Select operation with non-scalar predicate with dimensionality "
+      "different from the other operands: %s.",
+      ShapeUtil::HumanString(pred));
 }
 
 /* static */ StatusOr<Shape> ShapeInference::InferTupleSelectShape(
@@ -2810,7 +3065,7 @@ Status ValidateScatterDimensionNumbers(
         "update_window_dims in scatter op must not repeat; got: %s.",
         StrJoin(dim_numbers.update_window_dims(), ", "));
   }
-  const int64 updates_rank = ShapeUtil::Rank(updates_shape);
+  const int64 updates_rank = updates_shape.rank();
   for (int64 window_dim : dim_numbers.update_window_dims()) {
     if (window_dim < 0 || window_dim >= updates_rank) {
       return InvalidArgument(
@@ -2844,10 +3099,10 @@ Status ValidateScatterDimensionNumbers(
   // Validate window size.
   auto window_size = dim_numbers.update_window_dims_size() +
                      dim_numbers.inserted_window_dims_size();
-  if (window_size != ShapeUtil::Rank(operand_shape)) {
+  if (window_size != operand_shape.rank()) {
     return InvalidArgument(
         "Scatter op has window of size %d; doesn't match operand of rank %d.",
-        window_size, ShapeUtil::Rank(operand_shape));
+        window_size, operand_shape.rank());
   }
 
   // Validate scatter_dims_to_operand_dims in ScatterDimensionNumbers.
@@ -2932,10 +3187,9 @@ Status ValidateScatterDimensionNumbers(
 
   int64 expected_updates_rank = expanded_scatter_indices_shape.size() - 1 +
                                 scatter_dim_numbers.update_window_dims_size();
-  if (ShapeUtil::Rank(updates_shape) != expected_updates_rank) {
+  if (updates_shape.rank() != expected_updates_rank) {
     return InvalidArgument("Updates tensor must be of rank %d; got %d.",
-                           expected_updates_rank,
-                           ShapeUtil::Rank(updates_shape));
+                           expected_updates_rank, updates_shape.rank());
   }
 
   TF_RETURN_IF_ERROR(ValidateScatterDimensionNumbers(
@@ -2966,7 +3220,7 @@ Status ValidateScatterDimensionNumbers(
   }
 
   int64 scatter_dims_seen = 0;
-  for (int64 i = 0; i < ShapeUtil::Rank(updates_shape); ++i) {
+  for (int64 i = 0; i < updates_shape.rank(); ++i) {
     bool is_update_window_dim =
         absl::c_binary_search(scatter_dim_numbers.update_window_dims(), i);
     if (is_update_window_dim) {
diff --git a/tensorflow/compiler/xla/service/shape_inference.h b/tensorflow/compiler/xla/service/shape_inference.h
index d94385a04d50baff8156570a09620fd458547936..acb071ab18824472153fc608b812ad2d9c52651e 100644
--- a/tensorflow/compiler/xla/service/shape_inference.h
+++ b/tensorflow/compiler/xla/service/shape_inference.h
@@ -109,16 +109,20 @@ class ShapeInference {
   // filter (rhs) to lhs in the way specified by the fields on window.
   static StatusOr<Shape> InferConvolveShape(
       const Shape& lhs, const Shape& rhs, int64 feature_group_count,
-      const Window& window,
+      int64 batch_group_count, const Window& window,
       const ConvolutionDimensionNumbers& dimension_numbers);
 
   // Infers the shape produced by the given FFT type on the given operand.
   static StatusOr<Shape> InferFftShape(const Shape& in, FftType fft_type,
                                        absl::Span<const int64> fft_length);
 
+  // Infers the shape produced by the given triangular solve operation.
+  static StatusOr<Shape> InferTriangularSolveShape(
+      const Shape& a, const Shape& b, const TriangularSolveOptions& options);
+
   // Infers the shape produced by a cross replica sum with the given operand
   // shapes.
-  static StatusOr<Shape> InferCrossReplicaSumShape(
+  static StatusOr<Shape> InferAllReduceShape(
       absl::Span<const Shape* const> operand_shapes);
 
   // Infers final shape of an Alltoall operation that is created by the xla
@@ -176,14 +180,15 @@ class ShapeInference {
   // Infers the shape produced by a dynamic slice operation of size specified
   // in 'slice_sizes', with dynamic start indices shape 'start_indices_shape'.
   static StatusOr<Shape> InferDynamicSliceShape(
-      const Shape& operand_shape, const Shape& start_indices_shape,
-      absl::Span<const int64> slice_sizes);
+      const Shape& operand_shape, absl::Span<const Shape> start_index_shapes,
+      absl::Span<const int64> slice_sizes, bool allow_scalar_indices = true);
 
   // Infers the shape produced by a dynamic update slice operation based
   // on the shape of operand and update.
   static StatusOr<Shape> InferDynamicUpdateSliceShape(
       const Shape& operand_shape, const Shape& update_shape,
-      const Shape& start_indices_shape);
+      absl::Span<const Shape> start_index_shapes,
+      bool allow_scalar_indices = true);
 
   // Infers the shape produced by doing a compile-time-constant indexing into
   // the given input shape. This is essential for operations on tuples, because
diff --git a/tensorflow/compiler/xla/service/shape_inference_test.cc b/tensorflow/compiler/xla/service/shape_inference_test.cc
index 4639e32db4d59080a9e85e46983fac61d9e76be9..f400ef51f07b006eef2ea674feff1dd72f836e77 100644
--- a/tensorflow/compiler/xla/service/shape_inference_test.cc
+++ b/tensorflow/compiler/xla/service/shape_inference_test.cc
@@ -35,6 +35,7 @@ class ShapeInferenceTest : public ::testing::Test {
  protected:
   // Some handy scalar shapes.
   const Shape s32_ = ShapeUtil::MakeShape(S32, {});
+  const Shape f16_ = ShapeUtil::MakeShape(F16, {});
   const Shape f32_ = ShapeUtil::MakeShape(F32, {});
   const Shape f64_ = ShapeUtil::MakeShape(F64, {});
   const Shape pred_ = ShapeUtil::MakeShape(PRED, {});
@@ -251,7 +252,7 @@ TEST_F(ShapeInferenceTest, ClampBadShapes) {
 
 TEST_F(ShapeInferenceTest, Complex) {
   auto complex_shape = [&](const Shape& lhs, const Shape& rhs,
-                           const absl::Span<const int64>& bcast) {
+                           absl::Span<const int64> bcast) {
     return ShapeInference::InferBinaryOpShape(HloOpcode::kComplex, lhs, rhs,
                                               bcast);
   };
@@ -260,8 +261,8 @@ TEST_F(ShapeInferenceTest, Complex) {
   ASSERT_FALSE(complex_shape(pred_, pred_, {}).ok());
   // Component types must match.
   ASSERT_FALSE(complex_shape(f32_, f64_, {}).ok());
-  // Only F32->C64 supported.
-  ASSERT_FALSE(complex_shape(f64_, f64_, {}).ok());
+  // Only F32->C64 and F64->C128 supported.
+  ASSERT_FALSE(complex_shape(f16_, f16_, {}).ok());
   // Validate correct uses.
   Shape c64_32 = ShapeUtil::MakeShape(C64, {32});
   TF_ASSERT_OK_AND_ASSIGN(Shape result, complex_shape(f32_, f32_, {}));
@@ -285,6 +286,9 @@ TEST_F(ShapeInferenceTest, Complex) {
   ASSERT_TRUE(ShapeUtil::Equal(result, c64_32_64));
   TF_ASSERT_OK_AND_ASSIGN(result, complex_shape(matrix_32_64_, f32_, {}));
   ASSERT_TRUE(ShapeUtil::Equal(result, c64_32_64));
+
+  TF_ASSERT_OK_AND_ASSIGN(result, complex_shape(f64_, f64_, {}));
+  ASSERT_TRUE(ShapeUtil::Equal(result, ShapeUtil::MakeShape(C128, {})));
 }
 
 TEST_F(ShapeInferenceTest, VariadicOpTuplify) {
@@ -420,7 +424,8 @@ TEST_F(ShapeInferenceTest, Convolve) {
   dim1->set_window_dilation(1);
   dim1->set_base_dilation(1);
   auto inferred_status = ShapeInference::InferConvolveShape(
-      lhs_shape, rhs_shape, /*feature_group_count=*/1, window, dnums);
+      lhs_shape, rhs_shape, /*feature_group_count=*/1, /*batch_group_count=*/1,
+      window, dnums);
   ASSERT_IS_OK(inferred_status.status());
   Shape inferred_shape = inferred_status.ValueOrDie();
   ASSERT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {10, 12, 2, 3}),
@@ -465,7 +470,8 @@ TEST_F(ShapeInferenceTest, ConvolveWithWindowDilation) {
   dim1->set_window_dilation(2);
   dim1->set_base_dilation(1);
   auto inferred_status = ShapeInference::InferConvolveShape(
-      lhs_shape, rhs_shape, /*feature_group_count=*/1, window, dnums);
+      lhs_shape, rhs_shape, /*feature_group_count=*/1, /*batch_group_count=*/1,
+      window, dnums);
   ASSERT_IS_OK(inferred_status.status());
   Shape inferred_shape = inferred_status.ValueOrDie();
   ASSERT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {10, 12, 31, 5}),
@@ -510,7 +516,8 @@ TEST_F(ShapeInferenceTest, ConvolveWithBaseDilation) {
   dim1->set_window_dilation(1);
   dim1->set_base_dilation(2);
   auto inferred_status = ShapeInference::InferConvolveShape(
-      lhs_shape, rhs_shape, /*feature_group_count=*/1, window, dnums);
+      lhs_shape, rhs_shape, /*feature_group_count=*/1, /*batch_group_count=*/1,
+      window, dnums);
   ASSERT_IS_OK(inferred_status.status());
   Shape inferred_shape = inferred_status.ValueOrDie();
   ASSERT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {10, 12, 4, 9}),
@@ -548,7 +555,8 @@ TEST_F(ShapeInferenceTest, ConvolveDimensionNumbersOverlapError) {
   dim1->set_padding_low(1);
   dim1->set_padding_high(1);
   auto inferred_status = ShapeInference::InferConvolveShape(
-      lhs_shape, rhs_shape, /*feature_group_count=*/1, window, dnums);
+      lhs_shape, rhs_shape, /*feature_group_count=*/1, /*batch_group_count=*/1,
+      window, dnums);
   ASSERT_FALSE(inferred_status.ok());
   ASSERT_THAT(inferred_status.status().error_message(),
               HasSubstr("each dimension exactly once"));
@@ -888,6 +896,20 @@ TEST_F(ShapeInferenceTest, InferConstIndexShape) {
   ASSERT_TRUE(ShapeUtil::Equal(s32_, inferred1_status.ValueOrDie()));
 }
 
+TEST_F(ShapeInferenceTest, InferTupleElementShapeOutOfBound) {
+  Shape tuple_shape = ShapeUtil::MakeTupleShape({f32_, s32_});
+  auto inferredNegative_status =
+      ShapeInference::InferGetTupleElementShape(tuple_shape, -1);
+  auto inferred2_status =
+      ShapeInference::InferGetTupleElementShape(tuple_shape, 2);
+  ASSERT_FALSE(inferredNegative_status.ok());
+  ASSERT_FALSE(inferred2_status.ok());
+  EXPECT_THAT(inferredNegative_status.status().error_message(),
+              HasSubstr("attempt to index out of tuple bounds"));
+  EXPECT_THAT(inferred2_status.status().error_message(),
+              HasSubstr("attempt to index out of tuple bounds"));
+}
+
 TEST_F(ShapeInferenceTest, InferPowShape) {
   auto ten_floats = ShapeUtil::MakeShape(F32, {10});
   auto inferred_status = ShapeInference::InferBinaryOpShape(
@@ -1002,9 +1024,9 @@ TEST_F(ShapeInferenceTest, DotWithRankHigherThanTwo) {
   dot_dnums.add_rhs_contracting_dimensions(0);
   auto inferred_status = ShapeInference::InferDotOpShape(
       ShapeUtil::MakeShape(F32, {32, 32, 32}), matrix_32_64_, dot_dnums);
-  ASSERT_FALSE(inferred_status.ok());
-  ASSERT_THAT(inferred_status.status().error_message(),
-              HasSubstr("Batch and contracting dimension number mismatch"));
+  EXPECT_TRUE(inferred_status.ok());
+  EXPECT_TRUE(ShapeUtil::Equal(inferred_status.ValueOrDie(),
+                               ShapeUtil::MakeShape(F32, {32, 32, 64})));
 }
 
 // vector <dot> vector -> scalar
@@ -1096,7 +1118,6 @@ TEST_F(ShapeInferenceTest, DotGeneral) {
 TEST_F(ShapeInferenceTest, DotWithTwoContractingDimsFails) {
   Shape lhs_shape = ShapeUtil::MakeShape(F32, {2, 11, 3, 2});
   Shape rhs_shape = ShapeUtil::MakeShape(F32, {2, 3, 14});
-  Shape output_shape = ShapeUtil::MakeShape(F32, {2, 11, 14});
 
   DotDimensionNumbers dot_dnums;
   dot_dnums.add_lhs_contracting_dimensions(2);
@@ -1110,8 +1131,28 @@ TEST_F(ShapeInferenceTest, DotWithTwoContractingDimsFails) {
       ShapeInference::InferDotOpShape(lhs_shape, rhs_shape, dot_dnums);
   ASSERT_FALSE(inferred_status.ok());
   ASSERT_THAT(inferred_status.status().error_message(),
-              HasSubstr("Must specify one contracting dimension for both "
-                        "lhs and rhs"));
+              HasSubstr("Must specify the same number of contracting "
+                        "dimensions for lhs and rhs."));
+}
+
+TEST_F(ShapeInferenceTest, DotWithTwoContractingDimsPasses) {
+  Shape lhs_shape = ShapeUtil::MakeShape(F32, {2, 11, 3, 2});
+  Shape rhs_shape = ShapeUtil::MakeShape(F32, {2, 3, 2, 14});
+  Shape output_shape = ShapeUtil::MakeShape(F32, {2, 11, 14});
+
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(2);
+  dot_dnums.add_lhs_contracting_dimensions(3);
+  dot_dnums.add_lhs_batch_dimensions(0);
+
+  dot_dnums.add_rhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(2);
+  dot_dnums.add_rhs_batch_dimensions(0);
+
+  auto inferred_status =
+      ShapeInference::InferDotOpShape(lhs_shape, rhs_shape, dot_dnums);
+  EXPECT_TRUE(inferred_status.ok());
+  EXPECT_TRUE(ShapeUtil::Equal(inferred_status.ValueOrDie(), output_shape));
 }
 
 // BatchMatMul with different batch dimension sizes fails.
@@ -1130,11 +1171,11 @@ TEST_F(ShapeInferenceTest, DotWithMisatchedBatchDimSizesFails) {
       ShapeInference::InferDotOpShape(lhs_shape, rhs_shape, dot_dnums);
   ASSERT_FALSE(inferred_status.ok());
   ASSERT_THAT(inferred_status.status().error_message(),
-              HasSubstr("Batch dimension numbers and sizes must match"));
+              HasSubstr("Batch dimension sizes must match"));
 }
 
-// BatchMatMul with different batch dimension numbers fails.
-TEST_F(ShapeInferenceTest, DotWithMisatchedBatchDimNumbersFails) {
+// BatchMatMul with different batch dimension numbers passes
+TEST_F(ShapeInferenceTest, DotWithMisatchedBatchDimNumbersPasses) {
   Shape lhs_shape = ShapeUtil::MakeShape(F32, {2, 11, 3});
   Shape rhs_shape = ShapeUtil::MakeShape(F32, {3, 2, 14});
 
@@ -1147,9 +1188,9 @@ TEST_F(ShapeInferenceTest, DotWithMisatchedBatchDimNumbersFails) {
 
   auto inferred_status =
       ShapeInference::InferDotOpShape(lhs_shape, rhs_shape, dot_dnums);
-  ASSERT_FALSE(inferred_status.ok());
-  ASSERT_THAT(inferred_status.status().error_message(),
-              HasSubstr("Batch dimension numbers must precede non-batch"));
+  ASSERT_TRUE(inferred_status.ok());
+  ASSERT_TRUE(ShapeUtil::Equal(inferred_status.ValueOrDie(),
+                               ShapeUtil::MakeShape(F32, {2, 11, 14})));
 }
 
 // BatchMatMul with out-of-range dimension numbers fails.
@@ -1440,6 +1481,14 @@ TEST_F(ShapeInferenceTest, Pad) {
   Shape inferred_shape = inferred_status.ValueOrDie();
   ASSERT_TRUE(
       ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {39, 31}), inferred_shape));
+
+  dimension1->set_edge_padding_low(-20);
+  dimension1->set_edge_padding_high(-10);
+  auto negative_dimension_size = ShapeInference::InferPadShape(
+      input_shape, padding_value_shape, padding_config);
+  ASSERT_FALSE(negative_dimension_size.ok());
+  ASSERT_THAT(negative_dimension_size.status().error_message(),
+              HasSubstr("negative size for dimension 1"));
 }
 
 TEST_F(ShapeInferenceTest, Reverse) {
@@ -1523,6 +1572,16 @@ TEST_F(ShapeInferenceTest, Transpose) {
                                     ShapeUtil::MakeShape(F32, {3, 4, 5, 2})));
 }
 
+TEST_F(ShapeInferenceTest, Rank1Transpose) {
+  Shape a_shape = ShapeUtil::MakeShape(F32, {5});
+  auto inferred_shape_and_status =
+      ShapeInference::InferTransposeShape(a_shape, {0});
+  EXPECT_IS_OK(inferred_shape_and_status);
+  Shape inferred_shape = inferred_shape_and_status.ValueOrDie();
+  EXPECT_TRUE(
+      ShapeUtil::Compatible(inferred_shape, ShapeUtil::MakeShape(F32, {5})));
+}
+
 TEST_F(ShapeInferenceTest, Conditional) {
   auto inferred_status0 = ShapeInference::InferConditionalShape(
       pred_, vector_32_, vector_64_,
diff --git a/tensorflow/compiler/xla/service/shaped_buffer.cc b/tensorflow/compiler/xla/service/shaped_buffer.cc
index 28a30b5ee2dbcb5012804578d4d037c241045309..d90dde3b13d3aa9e1de10dd9e1d11a8e6da170de 100644
--- a/tensorflow/compiler/xla/service/shaped_buffer.cc
+++ b/tensorflow/compiler/xla/service/shaped_buffer.cc
@@ -85,7 +85,7 @@ string ShapedBuffer::ToString() const {
       on_device_shape(),
       [this, &s](const Shape& subshape, const ShapeIndex& index) {
         string shape_str;
-        if (ShapeUtil::IsTuple(subshape)) {
+        if (subshape.IsTuple()) {
           shape_str = "tuple";
         } else {
           shape_str = ShapeUtil::HumanStringWithLayout(subshape);
diff --git a/tensorflow/compiler/xla/service/sort_simplifier.cc b/tensorflow/compiler/xla/service/sort_simplifier.cc
new file mode 100644
index 0000000000000000000000000000000000000000..122366a0f322a66963b364e1b19629cbd2d9aabe
--- /dev/null
+++ b/tensorflow/compiler/xla/service/sort_simplifier.cc
@@ -0,0 +1,165 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/sort_simplifier.h"
+
+#include <memory>
+#include <vector>
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/statusor.h"
+
+namespace xla {
+namespace {
+
+// If the sort instruction has a tuple shape then looks for unused output
+// values and removes them from the sort instruction. Returns true if the
+// graph has been modified.
+StatusOr<bool> RemoveUnusedOperandFromSort(HloInstruction* sort) {
+  if (!sort->shape().IsTuple()) {
+    return false;
+  }
+
+  HloComputation* computation = sort->parent();
+
+  if (computation->root_instruction() == sort) {
+    // Can't analyse users of the root instruction.
+    return false;
+  }
+
+  absl::flat_hash_set<int64> used_indices;
+  for (const HloInstruction* user : sort->users()) {
+    if (user->opcode() != HloOpcode::kGetTupleElement) {
+      // Can't analyse users other then get-tuple-element.
+      return false;
+    }
+    used_indices.insert(user->tuple_index());
+  }
+
+  // Also note which parameters are used by the comparator computation.
+  auto comparator = sort->to_apply();
+  for (int64 i = 0; i < sort->operand_count() * 2; ++i) {
+    if (comparator->parameter_instruction(i)->user_count() > 0) {
+      // operand i corresponds to parameters 2 * i and 2 * i + 1 of the
+      // computation.
+      used_indices.insert(i / 2);
+    }
+  }
+
+  if (used_indices.size() == sort->operand_count()) {
+    // All operands are used.
+    return false;
+  }
+
+  std::vector<HloInstruction*> operands;
+  std::vector<Shape> new_shapes;
+  for (int64 i = 0; i < sort->operand_count(); ++i) {
+    if (used_indices.contains(i)) {
+      operands.push_back(sort->mutable_operand(i));
+      new_shapes.push_back(sort->operand(i)->shape());
+    }
+  }
+
+  Shape new_sort_shape = new_shapes.size() == 1
+                             ? new_shapes[0]
+                             : ShapeUtil::MakeTupleShape(new_shapes);
+  HloInstruction* new_sort = computation->AddInstruction(
+      sort->CloneWithNewOperands(new_sort_shape, operands));
+  absl::flat_hash_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+      replacements;
+  int64 parameter_number = 0;
+  for (int64 i = 0; i < sort->operand_count(); ++i) {
+    auto* old_lhs_parameter = comparator->parameter_instruction(i * 2);
+    auto* old_rhs_parameter = comparator->parameter_instruction(i * 2 + 1);
+    if (used_indices.contains(i)) {
+      Shape scalar_shape =
+          ShapeUtil::MakeShape(sort->operand(i)->shape().element_type(), {});
+      replacements[old_lhs_parameter] = HloInstruction::CreateParameter(
+          parameter_number, scalar_shape,
+          absl::StrCat("p.", parameter_number / 2, ".lhs"));
+      ++parameter_number;
+      replacements[old_rhs_parameter] = HloInstruction::CreateParameter(
+          parameter_number, scalar_shape,
+          absl::StrCat("p.", parameter_number / 2, ".rhs"));
+      ++parameter_number;
+    } else {
+      replacements[old_lhs_parameter] = nullptr;
+      replacements[old_rhs_parameter] = nullptr;
+    }
+  }
+  HloModule* module = sort->GetModule();
+  HloComputation* new_compare = module->AddEmbeddedComputation(
+      comparator->CloneWithReplacements(std::move(replacements)));
+  new_sort->set_to_apply(new_compare);
+
+  // Map from original get-tuple-element tuple index to new HLO instruction
+  absl::flat_hash_map<int64, HloInstruction*> result_map;
+  if (new_sort->shape().IsTuple()) {
+    // Old sort key maps to new sort key.
+    int64 new_index = 0;
+    for (int64 i = 0; i < sort->operand_count(); ++i) {
+      if (used_indices.count(i)) {
+        result_map[i] =
+            computation->AddInstruction(HloInstruction::CreateGetTupleElement(
+                new_shapes[new_index], new_sort, new_index));
+        ++new_index;
+      }
+    }
+  } else {
+    CHECK_EQ(used_indices.size(), 1);
+    result_map[*used_indices.begin()] = new_sort;
+  }
+  std::vector<HloInstruction*> users(sort->users().begin(),
+                                     sort->users().end());
+  for (HloInstruction* user : users) {
+    TF_RETURN_IF_ERROR(
+        user->ReplaceAllUsesWith(result_map.at(user->tuple_index())));
+    TF_RETURN_IF_ERROR(computation->RemoveInstructionAndUnusedOperands(user));
+  }
+  return true;
+}
+}  // namespace
+
+StatusOr<bool> SortSimplifier::Run(HloModule* module) {
+  VLOG(2) << "HLO module before SortSimplifier:";
+  XLA_VLOG_LINES(2, module->ToString());
+
+  bool changed = false;
+  std::vector<HloInstruction*> sort_instrs;
+  for (auto* comp : module->MakeNonfusionComputations()) {
+    absl::c_copy_if(comp->instructions(), std::back_inserter(sort_instrs),
+                    [](const HloInstruction* instr) {
+                      return instr->opcode() == HloOpcode::kSort;
+                    });
+  }
+
+  for (HloInstruction* sort_instr : sort_instrs) {
+    TF_ASSIGN_OR_RETURN(bool result, RemoveUnusedOperandFromSort(sort_instr));
+    changed |= result;
+  }
+
+  if (changed) {
+    VLOG(2) << "HLO module after SortSimplifier:";
+    XLA_VLOG_LINES(2, module->ToString());
+  } else {
+    VLOG(2) << "HLO module unchanged after SortSimplifier";
+  }
+
+  return changed;
+}
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/sort_simplifier.h b/tensorflow/compiler/xla/service/sort_simplifier.h
new file mode 100644
index 0000000000000000000000000000000000000000..8c6f313aa04f51e14a14450bc72fc622d74133a4
--- /dev/null
+++ b/tensorflow/compiler/xla/service/sort_simplifier.h
@@ -0,0 +1,35 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_SORT_SIMPLIFIER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_SORT_SIMPLIFIER_H_
+
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/compiler/xla/statusor.h"
+
+namespace xla {
+
+// HLO pass which removes unused operands from sort, where an unused operand is
+// defined as an operand at some index 'x' at which the output is not used.
+class SortSimplifier : public HloModulePass {
+ public:
+  absl::string_view name() const override { return "simplify-sorts"; }
+  StatusOr<bool> Run(HloModule* module) override;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_SORT_SIMPLIFIER_H_
diff --git a/tensorflow/compiler/xla/service/sort_simplifier_test.cc b/tensorflow/compiler/xla/service/sort_simplifier_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..696ac1b465848894f8dcb1c88bc48c6a5b268ef4
--- /dev/null
+++ b/tensorflow/compiler/xla/service/sort_simplifier_test.cc
@@ -0,0 +1,160 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/sort_simplifier.h"
+
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher_gmock.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+namespace {
+
+namespace m = match;
+
+using SortSimplifierTest = HloTestBase;
+
+TEST_F(SortSimplifierTest, RemoveUnusedSortOperandArrayResult) {
+  const char* hlo_string = R"(
+   HloModule permutation_sort
+
+   compare {
+     p.0.lhs = f32[] parameter(0)
+     p.0.rhs = f32[] parameter(1)
+     p.1.lhs = s32[] parameter(2)
+     p.1.rhs = s32[] parameter(3)
+     ROOT lt = pred[] less-than(p.0.lhs, p.0.rhs)
+   }
+
+   ENTRY sort_computation {
+     keys = f32[64,8732]{1,0} parameter(0)
+     values = s32[64,8732]{1,0} parameter(1)
+     sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values),
+       dimensions={1}, to_apply=compare
+     ROOT gte = f32[64,8732]{1,0} get-tuple-element(sort), index=0
+   })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  SortSimplifier simplifier;
+  uint64 num_executions = 0;
+  do {
+    num_executions++;
+  } while (simplifier.Run(module.get()).ValueOrDie());
+  EXPECT_EQ(num_executions, 2);
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::Sort(m::Parameter(0))));
+}
+
+TEST_F(SortSimplifierTest, RemoveUnusedSortOperandTuple) {
+  const char* hlo_string = R"(
+   HloModule permutation_sort
+
+   compare {
+     p.0.lhs = f32[] parameter(0)
+     p.0.rhs = f32[] parameter(1)
+     p.1.lhs = s32[] parameter(2)
+     p.1.rhs = s32[] parameter(3)
+     p.2.lhs = u32[] parameter(4)
+     p.2.rhs = u32[] parameter(5)
+     ROOT lt = pred[] less-than(p.0.lhs, p.0.rhs)
+   }
+
+   ENTRY sort_computation {
+     keys = f32[64,87] parameter(0)
+     values.0 = s32[64,87] parameter(1)
+     values.1 = u32[64,87] parameter(2)
+     sort = (f32[64,87], s32[64,87], u32[64,87]) sort(
+         keys, values.0, values.1),
+       dimensions={1}, to_apply=compare
+     gte.0 = f32[64,87] get-tuple-element(sort), index=0
+     gte.1 = u32[64,87] get-tuple-element(sort), index=2
+     ROOT tuple = (f32[64,87], u32[64,87]) tuple(gte.0, gte.1)
+   })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  SortSimplifier simplifier;
+  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(
+      root,
+      GmockMatch(m::Tuple(
+          m::GetTupleElement(m::Sort(m::Parameter(0), m::Parameter(2)), 0),
+          m::GetTupleElement(m::Sort(m::Parameter(0), m::Parameter(2)), 1))));
+}
+
+TEST_F(SortSimplifierTest, DontRemoveUnusedSortKey) {
+  const char* hlo_string = R"(
+   HloModule permutation_sort
+
+   compare {
+     p.0.lhs = f32[] parameter(0)
+     p.0.rhs = f32[] parameter(1)
+     p.1.lhs = s32[] parameter(2)
+     p.1.rhs = s32[] parameter(3)
+     ROOT lt = pred[] less-than(p.0.lhs, p.0.rhs)
+   }
+
+   ENTRY sort_computation {
+     keys = f32[64,8732]{1,0} parameter(0)
+     values = s32[64,8732]{1,0} parameter(1)
+     sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values), dimensions={1}, to_apply=compare
+     ROOT gte = s32[64,8732]{1,0} get-tuple-element(sort), index=1
+   })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  SortSimplifier simplifier;
+  EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
+}
+
+TEST_F(SortSimplifierTest, RemoveUnusedFirstOperand) {
+  const char* hlo_string = R"(
+   HloModule permutation_sort
+
+   compare {
+     p.0.lhs = f32[] parameter(0)
+     p.0.rhs = f32[] parameter(1)
+     p.1.lhs = s32[] parameter(2)
+     p.1.rhs = s32[] parameter(3)
+     ROOT lt = pred[] less-than(p.1.lhs, p.1.rhs)
+   }
+
+   ENTRY sort_computation {
+     keys = f32[64,8732]{1,0} parameter(0)
+     values = s32[64,8732]{1,0} parameter(1)
+     sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values),
+       dimensions={1}, to_apply=compare
+     ROOT gte = s32[64,8732]{1,0} get-tuple-element(sort), index=1
+   })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  SortSimplifier simplifier;
+  uint64 num_executions = 0;
+  do {
+    num_executions++;
+  } while (simplifier.Run(module.get()).ValueOrDie());
+  EXPECT_EQ(num_executions, 2);
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::Sort(m::Parameter(1))));
+}
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/stable_sort_expander.cc b/tensorflow/compiler/xla/service/stable_sort_expander.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1aa7e5fe7c0d57ee3303480e4727c456727f64c8
--- /dev/null
+++ b/tensorflow/compiler/xla/service/stable_sort_expander.cc
@@ -0,0 +1,204 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/stable_sort_expander.h"
+
+#include <limits>
+#include <memory>
+#include <vector>
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
+#include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_instructions.h"
+#include "tensorflow/compiler/xla/service/op_expander_pass.h"
+#include "tensorflow/compiler/xla/statusor.h"
+
+namespace xla {
+
+// Looks for a iota operand that can be used as tie breaker in the computation.
+// If no matching iota operand is found, a iota operand is added to Sort. The
+// comparison computation is adjusted to break ties using the values from the
+// iota operand.
+StatusOr<HloInstruction*> StableSortExpander::ExpandInstruction(
+    HloInstruction* instruction) {
+  auto* sort = Cast<HloSortInstruction>(instruction);
+  HloComputation* computation = sort->parent();
+
+  HloInstruction* expanded_sort = nullptr;
+  absl::flat_hash_set<int64> used_indices;
+  int64 iota_index = -1;
+  for (const HloInstruction* operand : sort->operands()) {
+    // We can only use the iota operand if it has an iota dimension which is the
+    // same as the dimension to sort. Also it should have an integral type that
+    // is large enough for the number of elements in the sort dimension. For
+    // now, we only allow S32, because we expect to find a S32 iota operand for
+    // all Sort ops which are created by TopK.
+    // TODO(b/122298745): Also support other types.
+    if (operand->opcode() == HloOpcode::kIota &&
+        Cast<HloIotaInstruction>(operand)->iota_dimension() ==
+            sort->sort_dimension() &&
+        operand->shape().element_type() == S32) {
+      iota_index = sort->operand_index(operand);
+      break;
+    }
+  }
+
+  // If there is currently no iota operand which we could use for making the
+  // sort stable, we will have to add a new such operand.
+  if (iota_index == -1) {
+    Shape iota_shape = sort->operand(0)->shape();
+    // We might need to use S64 if the number of elements in the sort dimension
+    // is bigger than 2^31 - 1.
+    // TODO(b/122298745): Handle Sort ops where S32 is too small for the number
+    // of elements in the sort dimension.
+    if (iota_shape.dimensions(sort->sort_dimension()) >
+        std::numeric_limits<int32>::max()) {
+      return Unimplemented(
+          "Stable sorting of more than 2^31-1 elements is not implemented");
+    }
+    iota_shape.set_element_type(S32);
+    auto iota = computation->AddInstruction(
+        HloInstruction::CreateIota(iota_shape, sort->sort_dimension()));
+
+    // Create a new comparator.
+    auto comparator = sort->to_apply();
+    absl::flat_hash_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+        replacements;
+    std::vector<std::unique_ptr<HloInstruction>> extra_parameters;
+    std::vector<HloInstruction*> extra_parameter_ptrs;
+    Shape scalar_shape = ShapeUtil::MakeShape(S32, {});
+    extra_parameters.push_back(HloInstruction::CreateParameter(
+        sort->operand_count() * 2, scalar_shape,
+        absl::StrCat("p.", sort->operand_count(), ".lhs")));
+    extra_parameter_ptrs.push_back(extra_parameters.back().get());
+    extra_parameters.push_back(HloInstruction::CreateParameter(
+        sort->operand_count() * 2 + 1, scalar_shape,
+        absl::StrCat("p.", sort->operand_count(), ".rhs")));
+    extra_parameter_ptrs.push_back(extra_parameters.back().get());
+    sort->set_to_apply(sort->GetModule()->AddEmbeddedComputation(
+        comparator->CloneWithReplacements(std::move(replacements),
+                                          extra_parameter_ptrs)));
+
+    // Replace the original sort op.
+    std::vector<HloInstruction*> new_operands(sort->operands().begin(),
+                                              sort->operands().end());
+    new_operands.push_back(iota);
+    std::vector<Shape> new_shapes = sort->operand_count() == 1
+                                        ? std::vector<Shape>{sort->shape()}
+                                        : sort->shape().tuple_shapes();
+    new_shapes.push_back(iota_shape);
+    Shape new_sort_shape = ShapeUtil::MakeTupleShape(new_shapes);
+    HloInstruction* new_sort = computation->AddInstruction(
+        sort->CloneWithNewOperands(new_sort_shape, new_operands));
+
+    // Add a "wrapper" around the new sort op to make sure we have the same
+    // shape as before. For the rank 1 case, we only need a GetTupleElement,
+    // otherwise we create a Tuple consisting of GetTupleElements of the new
+    // sort.
+    std::vector<HloInstruction*> tuple_elements;
+    tuple_elements.reserve(sort->operand_count());
+    for (int64 i = 0; i < sort->operand_count(); ++i) {
+      tuple_elements.push_back(
+          computation->AddInstruction(HloInstruction::CreateGetTupleElement(
+              sort->operand(i)->shape(), new_sort, i)));
+    }
+    expanded_sort = tuple_elements[0];
+    if (tuple_elements.size() > 1) {
+      expanded_sort = computation->AddInstruction(
+          HloInstruction::CreateTuple(tuple_elements));
+    }
+    sort = Cast<HloSortInstruction>(new_sort);
+    iota_index = sort->operand_count() - 1;
+  }
+
+  // Modify the computation to break ties using the iota operand.
+  auto comparator = sort->to_apply();
+  std::vector<HloInstruction*> instructions_postorder =
+      comparator->MakeInstructionPostOrder();
+  absl::flat_hash_map<HloInstruction*, HloInstruction*> replacements;
+  // Look up instr in the replacements map, and return either the replacement,
+  // or instr, if the replacement isn't present.
+  auto replace = [&](HloInstruction* instr) {
+    auto it = replacements.find(instr);
+    if (it == replacements.end()) {
+      return instr;
+    }
+    return it->second;
+  };
+  HloInstruction* old_root = comparator->root_instruction();
+  // The comparison computation gets 2 * n parameters (n being the number of
+  // operands of Sort), where parameters 2 * i and 2 * i + 1 correspond to two
+  // different scalars of operand i of Sort which are to be compared. The
+  // comparison computation should induce a strict weak order, so if
+  // to_apply(p1.lhs, p1.rhs, ..., pn.lhs, pn.rhs) is equal to
+  // to_apply(p1.rhs, p1.lhs, ..., pn.rhs, pn.lhs), we can conclude that the
+  // values to be compared are equivalent, and perform a tie-breaker comparison.
+  //
+  // We clone each instruction with at least one operand, but use as new
+  // operands of the instruction the replacements of the original operands.
+  // Parameter 2 * i is replaced by parameter 2 * i + 1 and vice versa. This
+  // should make sure that the cloned root instruction gives the result of the
+  // comparison computation when being called with each scalar pair reversed.
+  // parameters corresponding to the iota operand.
+  for (int64 i = 0; i < comparator->num_parameters(); ++i) {
+    replacements[comparator->parameter_instruction(i)] =
+        comparator->parameter_instruction(i ^ 1);
+  }
+  HloInstruction* cloned_root = nullptr;
+  for (HloInstruction* inst : instructions_postorder) {
+    if (inst->operand_count() == 0) {
+      continue;
+    }
+    std::vector<HloInstruction*> new_operands;
+    new_operands.reserve(inst->operand_count());
+    for (HloInstruction* operand : inst->operands()) {
+      new_operands.push_back(replace(operand));
+    }
+    auto new_instruction =
+        inst->CloneWithNewOperands(inst->shape(), new_operands);
+    replacements[inst] = new_instruction.get();
+    if (inst == old_root) {
+      cloned_root = new_instruction.get();
+    }
+    comparator->AddInstruction(std::move(new_instruction));
+  }
+  CHECK_NE(cloned_root, nullptr);
+  Shape scalar_pred = ShapeUtil::MakeShape(PRED, {});
+  HloInstruction* same =
+      comparator->AddInstruction(HloInstruction::CreateBinary(
+          scalar_pred, HloOpcode::kEq, old_root, cloned_root));
+  HloInstruction* tie_breaker =
+      comparator->AddInstruction(HloInstruction::CreateBinary(
+          scalar_pred, HloOpcode::kLt,
+          comparator->parameter_instruction(2 * iota_index),
+          comparator->parameter_instruction(2 * iota_index + 1)));
+  HloInstruction* new_root =
+      comparator->AddInstruction(HloInstruction::CreateTernary(
+          ShapeUtil::MakeShape(PRED, {}), HloOpcode::kSelect, same, tie_breaker,
+          old_root));
+  comparator->set_root_instruction(new_root);
+
+  return expanded_sort;
+}
+
+bool StableSortExpander::InstructionMatchesPattern(
+    HloInstruction* instruction) {
+  return instruction->opcode() == HloOpcode::kSort &&
+         Cast<HloSortInstruction>(instruction)->is_stable();
+}
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/stable_sort_expander.h b/tensorflow/compiler/xla/service/stable_sort_expander.h
new file mode 100644
index 0000000000000000000000000000000000000000..31b6fd92d25370218017c58072f1aa5e64df00c3
--- /dev/null
+++ b/tensorflow/compiler/xla/service/stable_sort_expander.h
@@ -0,0 +1,42 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_STABLE_SORT_EXPANDER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_STABLE_SORT_EXPANDER_H_
+
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/compiler/xla/service/op_expander_pass.h"
+#include "tensorflow/compiler/xla/statusor.h"
+
+namespace xla {
+
+// HLO pass which expands Sort ops that have the is_stable field set to true
+// into equivalent Sort ops which guarantee stable sorting without relying on
+// the is_stable field.
+class StableSortExpander : public OpExpanderPass {
+ public:
+  absl::string_view name() const override { return "stable-sort-expander"; }
+
+ private:
+  bool InstructionMatchesPattern(HloInstruction* instruction) override;
+  StatusOr<HloInstruction*> ExpandInstruction(
+      HloInstruction* instruction) override;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_STABLE_SORT_EXPANDER_H_
diff --git a/tensorflow/compiler/xla/service/stable_sort_expander_test.cc b/tensorflow/compiler/xla/service/stable_sort_expander_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a62d953e6e8fa2f3c1ecfd9e4a7900eee74f9dca
--- /dev/null
+++ b/tensorflow/compiler/xla/service/stable_sort_expander_test.cc
@@ -0,0 +1,358 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/stable_sort_expander.h"
+
+#include "tensorflow/compiler/xla/service/algebraic_simplifier.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher_gmock.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+namespace {
+
+namespace m = match;
+
+using StableSortExpanderTest = HloTestBase;
+
+// Checks whether 'a' and 'b' are roots of equivalent computations, except that
+// parameters 2 * i and 2 * i + 1 are switched.
+bool IsSameComputationExceptParams(const HloInstruction* a,
+                                   const HloInstruction* b) {
+  if (a->opcode() != b->opcode() || a->operand_count() != b->operand_count()) {
+    return false;
+  }
+  if (a->opcode() == HloOpcode::kParameter) {
+    // Check that parameters were switched.
+    return a->parameter_number() == (b->parameter_number() ^ 1);
+  }
+  // If the operation has no operands, it should actually be the same.
+  if (a->operand_count() == 0) {
+    return a == b;
+  }
+  // Otherwise recursively compare all operands.
+  for (int64 i = 0; i < a->operand_count(); ++i) {
+    if (!IsSameComputationExceptParams(a->operand(i), b->operand(i))) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Check that the comparison computation has been modified to add a tie breaker
+// using 'iota_parameter'.
+void CheckComputationHasTieBreaker(const HloInstruction* root,
+                                   int64 iota_parameter) {
+  // With the tie breaker, the root instruction should be
+  //   Select(Eq(Comp(), CompReverse()), Lt(), Comp())
+  // with Comp() being the original comparison function, and CompReverse() being
+  // the copied comparison function where the parameters are reversed. Lt() is
+  // the tie breaker comparison using the Iota operand.
+  ASSERT_EQ(root->opcode(), HloOpcode::kSelect);
+  ASSERT_EQ(root->operand(0)->opcode(), HloOpcode::kEq);
+
+  // Check that the tie breaker instruction is correct.
+  EXPECT_THAT(root->operand(1),
+              GmockMatch(m::Lt(m::Parameter(iota_parameter * 2),
+                               m::Parameter(iota_parameter * 2 + 1))));
+  EXPECT_EQ(root->operand(2), root->operand(0)->operand(0));
+
+  // Check that Comp() and CompReverse() are equivalent except that
+  // CompReverse() has reversed parameters.
+  EXPECT_TRUE(IsSameComputationExceptParams(root->operand(0)->operand(0),
+                                            root->operand(0)->operand(1)));
+}
+
+TEST_F(StableSortExpanderTest, StabilizeSortReuseIotaOperand) {
+  const char* hlo_string = R"(
+   HloModule permutation_sort
+
+   compare {
+     p.0.lhs = f32[] parameter(0)
+     p.0.rhs = f32[] parameter(1)
+     p.1.lhs = s32[] parameter(2)
+     p.1.rhs = s32[] parameter(3)
+     ROOT lt = pred[] less-than(p.0.lhs, p.0.rhs)
+   }
+
+   ENTRY sort_computation {
+     keys = f32[64,8732]{1,0} parameter(0)
+     values = s32[64,8732]{1,0} iota(), iota_dimension=1
+     sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values),
+       dimensions={1}, to_apply=compare, is_stable=true
+     ROOT gte = f32[64,8732]{1,0} get-tuple-element(sort), index=0
+   })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  StableSortExpander stabilizer;
+  EXPECT_TRUE(stabilizer.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::GetTupleElement(
+                        m::Sort(m::Parameter(0), m::Iota()), 0)));
+  CheckComputationHasTieBreaker(
+      root->operand(0)->to_apply()->root_instruction(), /*iota_parameter=*/1);
+}
+
+TEST_F(StableSortExpanderTest,
+       StabilizeSortReuseIotaOperandComplicatedComparison) {
+  const char* hlo_string = R"(
+   HloModule permutation_sort
+
+   compare {
+     p.0.lhs = f32[] parameter(0)
+     p.0.rhs = f32[] parameter(1)
+     p.1.lhs = s32[] parameter(2)
+     p.1.rhs = s32[] parameter(3)
+     max = u32[] constant(2147483647)
+     zero = s32[] constant(0)
+     lhs.signed = s32[] bitcast-convert(p.0.lhs)
+     lhs.unsigned = u32[] bitcast-convert(p.0.lhs)
+     lhs.flipped = u32[] subtract(max, lhs.unsigned)
+     lhs.flipped.signed = s32[] bitcast-convert(lhs.flipped)
+     lhs.is_negative = pred[] less-than(lhs.flipped.signed, zero)
+     lhs.converted = s32[] select(lhs.is_negative, lhs.flipped.signed, lhs.signed)
+     rhs.signed = s32[] bitcast-convert(p.0.rhs)
+     rhs.unsigned = u32[] bitcast-convert(p.0.rhs)
+     rhs.flipped = u32[] subtract(max, rhs.unsigned)
+     rhs.flipped.signed = s32[] bitcast-convert(rhs.flipped)
+     rhs.is_negative = pred[] less-than(rhs.flipped.signed, zero)
+     rhs.converted = s32[] select(rhs.is_negative, rhs.flipped.signed, rhs.signed)
+     ROOT lt = pred[] less-than(lhs.converted, rhs.converted)
+   }
+
+   ENTRY sort_computation {
+     keys = f32[64,8732]{1,0} parameter(0)
+     values = s32[64,8732]{1,0} iota(), iota_dimension=1
+     sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values),
+       dimensions={1}, to_apply=compare, is_stable=true
+     ROOT gte = f32[64,8732]{1,0} get-tuple-element(sort), index=0
+   })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  StableSortExpander stabilizer;
+  EXPECT_TRUE(stabilizer.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::GetTupleElement(
+                        m::Sort(m::Parameter(0), m::Iota()), 0)));
+  CheckComputationHasTieBreaker(
+      root->operand(0)->to_apply()->root_instruction(), /*iota_parameter=*/1);
+}
+
+TEST_F(StableSortExpanderTest, StabilizeSortAddIotaOperandAndChangeRoot) {
+  const char* hlo_string = R"(
+   HloModule permutation_sort
+
+   compare {
+     p.0.lhs = f32[] parameter(0)
+     p.0.rhs = f32[] parameter(1)
+     p.1.lhs = s32[] parameter(2)
+     p.1.rhs = s32[] parameter(3)
+     ROOT lt = pred[] less-than(p.0.lhs, p.0.rhs)
+   }
+
+   ENTRY sort_computation {
+     keys = f32[64,8732]{1,0} parameter(0)
+     values = s32[64,8732]{1,0} parameter(1)
+     ROOT sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values),
+       dimensions={1}, to_apply=compare, is_stable=true
+   })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  StableSortExpander stabilizer;
+  EXPECT_TRUE(stabilizer.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(
+      root, GmockMatch(m::Tuple(
+                m::GetTupleElement(
+                    m::Sort(m::Parameter(0), m::Parameter(1), m::Iota()), 0),
+                m::GetTupleElement(
+                    m::Sort(m::Parameter(0), m::Parameter(1), m::Iota()), 1))));
+  CheckComputationHasTieBreaker(
+      root->operand(0)->operand(0)->to_apply()->root_instruction(),
+      /*iota_parameter=*/2);
+}
+
+TEST_F(StableSortExpanderTest, HonorIsStableFlag) {
+  const char* hlo_string = R"(
+   HloModule permutation_sort
+
+   compare {
+     p.0.lhs = f32[] parameter(0)
+     p.0.rhs = f32[] parameter(1)
+     p.1.lhs = s32[] parameter(2)
+     p.1.rhs = s32[] parameter(3)
+     ROOT lt = pred[] less-than(p.0.lhs, p.0.rhs)
+   }
+
+   ENTRY sort_computation {
+     keys = f32[64,8732]{1,0} parameter(0)
+     values = s32[64,8732]{1,0} iota(), iota_dimension=1
+     sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values),
+       dimensions={1}, to_apply=compare, is_stable=false
+     ROOT gte = f32[64,8732]{1,0} get-tuple-element(sort), index=0
+   })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  StableSortExpander stabilizer;
+  EXPECT_FALSE(stabilizer.Run(module.get()).ValueOrDie());
+}
+
+TEST_F(StableSortExpanderTest,
+       StabilizeSortDontReuseIotaOperandWrongDimension) {
+  const char* hlo_string = R"(
+   HloModule permutation_sort
+
+   compare {
+     p.0.lhs = f32[] parameter(0)
+     p.0.rhs = f32[] parameter(1)
+     p.1.lhs = s32[] parameter(2)
+     p.1.rhs = s32[] parameter(3)
+     ROOT lt = pred[] less-than(p.0.lhs, p.0.rhs)
+   }
+
+   ENTRY sort_computation {
+     keys = f32[64,8732]{1,0} parameter(0)
+     values = s32[64,8732]{1,0} iota(), iota_dimension=0
+     sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values),
+       dimensions={1}, to_apply=compare, is_stable=true
+     ROOT gte = f32[64,8732]{1,0} get-tuple-element(sort), index=0
+   })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  StableSortExpander stabilizer;
+  EXPECT_TRUE(stabilizer.Run(module.get()).ValueOrDie());
+  // Simplify away the "wrapper" tuple around the new sort.
+  AlgebraicSimplifier simplifier(AlgebraicSimplifierOptions(
+      [](const Shape&, const Shape&) { return false; }));
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::GetTupleElement(
+                        m::Sort(m::Parameter(0), m::Iota(), m::Iota()), 0)));
+  CheckComputationHasTieBreaker(
+      root->operand(0)->to_apply()->root_instruction(),
+      /*iota_parameter=*/2);
+}
+
+TEST_F(StableSortExpanderTest, StabilizeSortDontReuseIotaOperandWrongType) {
+  const char* hlo_string = R"(
+   HloModule permutation_sort
+
+   compare {
+     p.0.lhs = f32[] parameter(0)
+     p.0.rhs = f32[] parameter(1)
+     p.1.lhs = f32[] parameter(2)
+     p.1.rhs = f32[] parameter(3)
+     ROOT lt = pred[] less-than(p.0.lhs, p.0.rhs)
+   }
+
+   ENTRY sort_computation {
+     keys = f32[64,8732]{1,0} parameter(0)
+     values = f32[64,8732]{1,0} iota(), iota_dimension=1
+     sort = (f32[64,8732]{1,0}, f32[64,8732]{1,0}) sort(keys, values),
+       dimensions={1}, to_apply=compare, is_stable=true
+     ROOT gte = f32[64,8732]{1,0} get-tuple-element(sort), index=0
+   })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  StableSortExpander stabilizer;
+  EXPECT_TRUE(stabilizer.Run(module.get()).ValueOrDie());
+  // Simplify away the "wrapper" tuple around the new sort.
+  AlgebraicSimplifier simplifier(AlgebraicSimplifierOptions(
+      [](const Shape&, const Shape&) { return false; }));
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::GetTupleElement(
+                        m::Sort(m::Parameter(0), m::Iota(), m::Iota()), 0)));
+  CheckComputationHasTieBreaker(
+      root->operand(0)->to_apply()->root_instruction(),
+      /*iota_parameter=*/2);
+}
+
+TEST_F(StableSortExpanderTest, StabilizeSortR1) {
+  const char* hlo_string = R"(
+   HloModule permutation_sort
+
+   compare {
+     p.0.lhs = s32[] parameter(0)
+     p.0.rhs = s32[] parameter(1)
+     mask = s32[] constant(65535)
+     lhs = s32[] and(p.0.lhs, mask)
+     rhs = s32[] and(p.0.rhs, mask)
+     ROOT lt = pred[] less-than(lhs, rhs)
+   }
+
+   ENTRY sort_computation {
+     keys = s32[64,8732]{1,0} parameter(0)
+     ROOT sort = s32[64,8732]{1,0} sort(keys), dimensions={0}, to_apply=compare,
+       is_stable=true
+   })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  StableSortExpander stabilizer;
+  EXPECT_TRUE(stabilizer.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::GetTupleElement(
+                        m::Sort(m::Parameter(0), m::Iota()), 0)));
+  CheckComputationHasTieBreaker(
+      root->operand(0)->to_apply()->root_instruction(), /*iota_parameter=*/1);
+}
+
+TEST_F(StableSortExpanderTest, StabilizeSortR1NoRoot) {
+  const char* hlo_string = R"(
+   HloModule permutation_sort
+
+   compare {
+     p.0.lhs = s32[] parameter(0)
+     p.0.rhs = s32[] parameter(1)
+     mask = s32[] constant(65535)
+     lhs = s32[] and(p.0.lhs, mask)
+     rhs = s32[] and(p.0.rhs, mask)
+     ROOT lt = pred[] less-than(lhs, rhs)
+   }
+
+   ENTRY sort_computation {
+     keys = s32[64,8732]{1,0} parameter(0)
+     sort = s32[64,8732]{1,0} sort(keys), dimensions={0}, to_apply=compare,
+       is_stable=true
+     ROOT neg = s32[64,8732]{1,0} negate(sort)
+   })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  StableSortExpander stabilizer;
+  EXPECT_TRUE(stabilizer.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::Negate(m::GetTupleElement(
+                        m::Sort(m::Parameter(0), m::Iota()), 0))));
+  CheckComputationHasTieBreaker(
+      root->operand(0)->operand(0)->to_apply()->root_instruction(),
+      /*iota_parameter=*/1);
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/transfer_manager.cc b/tensorflow/compiler/xla/service/transfer_manager.cc
index a21e586efadb85d18e88e44999283b28f7f65eac..15ef623cc7b2dbc31e9cba5c4783c39b8805a5aa 100644
--- a/tensorflow/compiler/xla/service/transfer_manager.cc
+++ b/tensorflow/compiler/xla/service/transfer_manager.cc
@@ -142,7 +142,7 @@ Status TransferManager::TransferArrayToDeviceAsync(
     se::Stream* stream, const LiteralSlice& literal,
     const se::DeviceMemoryBase& dest) {
   const Shape on_device_shape = HostShapeToDeviceShape(literal.shape());
-  TF_RET_CHECK(ShapeUtil::IsArray(on_device_shape))
+  TF_RET_CHECK(on_device_shape.IsArray())
       << "On-device representation of "
       << ShapeUtil::HumanString(literal.shape())
       << " is not an array: " << ShapeUtil::HumanString(on_device_shape);
@@ -227,7 +227,7 @@ Status TransferManager::WriteTupleIndexTablesAsync(
   return ShapeUtil::ForEachSubshapeWithStatus(
       device_buffer.on_device_shape(),
       [&](const Shape& device_subshape, const ShapeIndex& index) -> Status {
-        if (ShapeUtil::IsTuple(device_subshape)) {
+        if (device_subshape.IsTuple()) {
           se::DeviceMemoryBase device_memory = device_buffer.buffer(index);
           TF_RET_CHECK(GetByteSizeRequirement(device_subshape) ==
                        device_memory.size());
@@ -248,6 +248,22 @@ Status TransferManager::WriteTupleIndexTablesAsync(
       });
 }
 
+Status TransferManager::WriteRootTupleIndexTable(
+    se::Stream* stream, const ShapedBuffer& device_buffer) {
+  TF_RET_CHECK(device_buffer.on_device_shape().IsTuple());
+  se::DeviceMemoryBase device_memory = device_buffer.buffer({});
+  TF_RET_CHECK(GetByteSizeRequirement(device_buffer.on_device_shape()) ==
+               device_memory.size());
+
+  std::vector<se::DeviceMemoryBase> elements;
+  for (int64 i = 0;
+       i < ShapeUtil::TupleElementCount(device_buffer.on_device_shape()); ++i) {
+    elements.push_back(device_buffer.buffer({i}));
+  }
+  return WriteSingleTupleIndexTable(
+      stream, elements, device_buffer.on_device_shape(), &device_memory);
+}
+
 Status TransferManager::TransferBufferFromDevice(
     se::Stream* stream, const se::DeviceMemoryBase& source, int64 size,
     void* destination) {
diff --git a/tensorflow/compiler/xla/service/transfer_manager.h b/tensorflow/compiler/xla/service/transfer_manager.h
index 49f0b8f8b72001f07200d3e94828f60fcb0fa8fb..43a50487c636da75224547286a31625db3f91330 100644
--- a/tensorflow/compiler/xla/service/transfer_manager.h
+++ b/tensorflow/compiler/xla/service/transfer_manager.h
@@ -146,6 +146,12 @@ class TransferManager {
   Status WriteTupleIndexTablesAsync(se::Stream* stream,
                                     const ShapedBuffer& device_buffer);
 
+  // Writes a tuple index buffer for the root of 'device_buffer', which must
+  // be a tuple. Unlike WriteTupleIndexTables, only writes the root buffer,
+  // rather than writing all subbuffers. This method is always asynchronous.
+  Status WriteRootTupleIndexTable(se::Stream* stream,
+                                  const ShapedBuffer& device_buffer);
+
   // Determines the byte size requirement for the given shape on the underlying
   // architecture. This will be used to allocate an appropriately sized memory
   // region for a host-to-device transfer.
diff --git a/tensorflow/compiler/xla/service/transpose_folding.cc b/tensorflow/compiler/xla/service/transpose_folding.cc
index 7c1f4b5cc67dd2a84271b4f2b8015fdb2ff6e846..a95ca2bf2a8fcd700eb9234cafbfce9b62f2370c 100644
--- a/tensorflow/compiler/xla/service/transpose_folding.cc
+++ b/tensorflow/compiler/xla/service/transpose_folding.cc
@@ -45,7 +45,7 @@ TransposeFolding::OperandIndices CanFoldOperandsIntoDot(
     auto& operand = *dot.operand(i);
     if (operand.IsRank2Transpose()) {
       operand_set.push_back(i);
-    } else if (ShapeUtil::Rank(operand.shape()) != 2) {
+    } else if (operand.shape().rank() != 2) {
       return {};
     }
   }
@@ -130,8 +130,7 @@ bool FoldTransposeIntoConvolution(InstructionOperandsPair pair) {
 
   HloInstruction* new_lhs;
   const int64 kLhsIdx = 0;
-  if (std::find(operand_indices.begin(), operand_indices.end(), kLhsIdx) !=
-      operand_indices.end()) {
+  if (absl::c_linear_search(operand_indices, kLhsIdx)) {
     HloInstruction& transpose = *convolution.mutable_operand(kLhsIdx);
     const auto& transpose_dimensions = transpose.dimensions();
     HloInstruction& transpose_operand = *transpose.mutable_operand(0);
@@ -154,8 +153,7 @@ bool FoldTransposeIntoConvolution(InstructionOperandsPair pair) {
 
   HloInstruction* new_rhs;
   const int64 kRhsIdx = 1;
-  if (std::find(operand_indices.begin(), operand_indices.end(), kRhsIdx) !=
-      operand_indices.end()) {
+  if (absl::c_linear_search(operand_indices, kRhsIdx)) {
     HloInstruction& transpose = *convolution.mutable_operand(kRhsIdx);
     const auto& transpose_dimensions = transpose.dimensions();
     HloInstruction& transpose_operand = *transpose.mutable_operand(0);
@@ -178,7 +176,8 @@ bool FoldTransposeIntoConvolution(InstructionOperandsPair pair) {
 
   auto new_conv = HloInstruction::CreateConvolve(
       convolution.shape(), new_lhs, new_rhs, convolution.feature_group_count(),
-      convolution.window(), new_dnums, convolution.precision_config());
+      convolution.batch_group_count(), convolution.window(), new_dnums,
+      convolution.precision_config());
   TF_CHECK_OK(convolution.parent()->ReplaceWithNewInstruction(
       &convolution, std::move(new_conv)));
 
diff --git a/tensorflow/compiler/xla/service/transpose_folding_test.cc b/tensorflow/compiler/xla/service/transpose_folding_test.cc
index 17cdaa74fc328d156292f5af828d4222a9a01f1f..f8a5fa0215007310d6bec35d20fc643afc824dda 100644
--- a/tensorflow/compiler/xla/service/transpose_folding_test.cc
+++ b/tensorflow/compiler/xla/service/transpose_folding_test.cc
@@ -139,9 +139,9 @@ TEST_F(TransposeFoldingTest, FoldDotTransposeConstant) {
 HloModule FoldDotTransposeConstant
 
 ENTRY entry_computation {
-  constant = f32[2,1]{1,0} constant(f32[2,1] { { 1 }, { 2 } })
+  constant = f32[2,1]{1,0} constant({ { 1 }, { 2 } })
   transpose = f32[1,2]{1,0} transpose(constant), dimensions={1,0}
-  constant.1 = f32[3,2]{1,0} constant(f32[3,2] { { 1, 2 }, { 3, 4 }, { 5, 6 } })
+  constant.1 = f32[3,2]{1,0} constant({ { 1, 2 }, { 3, 4 }, { 5, 6 } })
   transpose.1 = f32[2,3]{1,0} transpose(constant.1), dimensions={1,0}
   ROOT dot = f32[1,3]{1,0} dot(transpose, transpose.1), lhs_contracting_dims={1}, rhs_contracting_dims={0}
 }
@@ -240,12 +240,13 @@ TEST_F(TransposeFoldingTest, FoldConvDimSwapTransposeRhs) {
         transpose_y->shape().dimensions(dnums.kernel_spatial_dimensions(i)));
   }
   StatusOr<Shape> conv_shape = ShapeInference::InferConvolveShape(
-      x->shape(), transpose_y->shape(), /*feature_group_count=*/1, window,
-      dnums);
+      x->shape(), transpose_y->shape(), /*feature_group_count=*/1,
+      /*batch_group_count=*/1, window, dnums);
   EXPECT_IS_OK(conv_shape);
   HloInstruction* conv = builder.AddInstruction(HloInstruction::CreateConvolve(
       conv_shape.ValueOrDie(), x, transpose_y,
-      /*feature_group_count=*/1, window, dnums, DefaultPrecisionConfig(2)));
+      /*feature_group_count=*/1, /*batch_group_count=*/1, window, dnums,
+      DefaultPrecisionConfig(2)));
 
   auto module = CreateNewVerifiedModule("test_module");
   HloComputation* entry_computation =
@@ -295,12 +296,13 @@ TEST_F(TransposeFoldingTest, FoldConvComplexTransposeRhs) {
         transpose_y->shape().dimensions(dnums.kernel_spatial_dimensions(i)));
   }
   StatusOr<Shape> conv_shape = ShapeInference::InferConvolveShape(
-      x->shape(), transpose_y->shape(), /*feature_group_count=*/1, window,
-      dnums);
+      x->shape(), transpose_y->shape(), /*feature_group_count=*/1,
+      /*batch_group_count=*/1, window, dnums);
   EXPECT_IS_OK(conv_shape);
   HloInstruction* conv = builder.AddInstruction(HloInstruction::CreateConvolve(
       conv_shape.ValueOrDie(), x, transpose_y,
-      /*feature_group_count=*/1, window, dnums, DefaultPrecisionConfig(2)));
+      /*feature_group_count=*/1, /*batch_group_count=*/1, window, dnums,
+      DefaultPrecisionConfig(2)));
 
   auto module = CreateNewVerifiedModule("test_module");
   HloComputation* entry_computation =
@@ -355,12 +357,13 @@ TEST_F(TransposeFoldingTest, FoldConvTransposeLhs) {
     dim->set_size(y->shape().dimensions(dnums.kernel_spatial_dimensions(i)));
   }
   StatusOr<Shape> conv_shape = ShapeInference::InferConvolveShape(
-      transpose_x->shape(), y->shape(), /*feature_group_count=*/1, window,
-      dnums);
+      transpose_x->shape(), y->shape(), /*feature_group_count=*/1,
+      /*batch_group_count=*/1, window, dnums);
   EXPECT_IS_OK(conv_shape);
   HloInstruction* conv = builder.AddInstruction(HloInstruction::CreateConvolve(
       conv_shape.ValueOrDie(), transpose_x, y,
-      /*feature_group_count=*/1, window, dnums, DefaultPrecisionConfig(2)));
+      /*feature_group_count=*/1, /*batch_group_count=*/1, window, dnums,
+      DefaultPrecisionConfig(2)));
 
   auto module = CreateNewVerifiedModule("test_module");
   HloComputation* entry_computation =
@@ -421,12 +424,13 @@ TEST_F(TransposeFoldingTest, FoldConvComplexTransposeLhs) {
     dim->set_size(y->shape().dimensions(dnums.kernel_spatial_dimensions(i)));
   }
   StatusOr<Shape> conv_shape = ShapeInference::InferConvolveShape(
-      transpose_x->shape(), y->shape(), /*feature_group_count=*/1, window,
-      dnums);
+      transpose_x->shape(), y->shape(), /*feature_group_count=*/1,
+      /*batch_group_count=*/1, window, dnums);
   EXPECT_IS_OK(conv_shape);
   HloInstruction* conv = builder.AddInstruction(HloInstruction::CreateConvolve(
       conv_shape.ValueOrDie(), transpose_x, y,
-      /*feature_group_count=*/1, window, dnums, DefaultPrecisionConfig(2)));
+      /*feature_group_count=*/1, /*batch_group_count=*/1, window, dnums,
+      DefaultPrecisionConfig(2)));
 
   auto module = CreateNewVerifiedModule("test_module");
   HloComputation* entry_computation =
diff --git a/tensorflow/compiler/xla/client/lib/triangular_solve.cc b/tensorflow/compiler/xla/service/triangular_solve_expander.cc
similarity index 75%
rename from tensorflow/compiler/xla/client/lib/triangular_solve.cc
rename to tensorflow/compiler/xla/service/triangular_solve_expander.cc
index c5a1d34cc66e6f8c1a832f8a8437163b846a5431..b26cdc1db59b30d82b9ac58a8a2ac762220086be 100644
--- a/tensorflow/compiler/xla/client/lib/triangular_solve.cc
+++ b/tensorflow/compiler/xla/service/triangular_solve_expander.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/client/lib/triangular_solve.h"
+#include "tensorflow/compiler/xla/service/triangular_solve_expander.h"
 
 #include <memory>
 #include <vector>
@@ -33,12 +33,14 @@ limitations under the License.
 
 namespace xla {
 
+namespace {
+
 // Get the diagonal blocks of the coefficient matrix
 XlaOp DiagonalBlocks(XlaOp a, int64 block_size) {
   XlaBuilder* builder = a.builder();
   return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(a));
-    int ndims = ShapeUtil::Rank(shape);
+    int ndims = shape.rank();
     int64 n = ShapeUtil::GetDimension(shape, -1);
     int64 num_blocks = n / block_size;
 
@@ -62,15 +64,26 @@ XlaOp DiagonalBlocks(XlaOp a, int64 block_size) {
                               /*broadcast_sizes=*/{2}),
                     /*permutation=*/{1, 0});
 
+      PaddingConfig padding_config =
+          MakeEdgePaddingConfig({{0, 0}, {ndims - 2, 0}});
+      start_indices =
+          Pad(start_indices, ConstantR0<int32>(builder, 0), padding_config);
+
       // Gather the diagonal blocks
+      std::vector<int64> slice_sizes(ndims);
       GatherDimensionNumbers dim_numbers;
+      for (int i = 0; i < ndims - 2; ++i) {
+        dim_numbers.add_offset_dims(i);
+        dim_numbers.add_start_index_map(i);
+        slice_sizes[i] = ShapeUtil::GetDimension(shape, i);
+      }
+      slice_sizes[ndims - 2] = slice_sizes[ndims - 1] = block_size;
       dim_numbers.add_offset_dims(ndims - 1);
       dim_numbers.add_offset_dims(ndims);
       dim_numbers.add_start_index_map(ndims - 2);
       dim_numbers.add_start_index_map(ndims - 1);
       dim_numbers.set_index_vector_dim(1);
-      diag_blocks = Gather(a, start_indices, dim_numbers,
-                           /*slice_sizes=*/{block_size, block_size});
+      diag_blocks = Gather(a, start_indices, dim_numbers, slice_sizes);
     }
 
     // The last block might be smaller than the block size,
@@ -129,9 +142,7 @@ XlaOp InvertDiagonalBlocks(XlaOp diag_blocks, bool lower, bool transpose_a,
     // zero (which can happen if the last block was padded) otherwise it will
     // introduce nans which will propagate
     auto diags = GetMatrixDiagonal(diag_blocks);
-    TF_ASSIGN_OR_RETURN(Shape diags_shape, builder->GetShape(diags));
-    auto one = ScalarLike(diags, 1);
-    auto ones = Broadcast(one, AsInt64Slice(diags_shape.dimensions()));
+    auto ones = FullLike(diags, 1);
     diags = Select(Eq(diags, Zero(builder, shape.element_type())), ones, diags);
     auto scaled_diag_blocks = Div(diag_blocks, diags, {0, 2});
 
@@ -154,10 +165,10 @@ XlaOp InvertDiagonalBlocks(XlaOp diag_blocks, bool lower, bool transpose_a,
     // The first or last  diagonal element should be set to 1 instead of -1
     // though, since we never update it
     auto pos_one = Reshape(One(builder, shape.element_type()), {1, 1});
-    auto start_index = (lower) ? 0 : block_size - 1;
-    auto output_block = DynamicUpdateSlice(
-        neg_identity, pos_one,
-        /*start_indices=*/ConstantR1<int>(builder, 2, start_index));
+    auto start_index = ConstantR0<int>(builder, (lower) ? 0 : block_size - 1);
+    auto output_block =
+        DynamicUpdateSlice(neg_identity, pos_one,
+                           /*start_indices=*/{start_index, start_index});
 
     // Broadcast diag([1, -1, -1, ...]) to every block
     XlaOp output = Broadcast(output_block,
@@ -200,12 +211,10 @@ XlaOp InvertDiagonalBlocks(XlaOp diag_blocks, bool lower, bool transpose_a,
       auto body_out = GetTupleElement(input_tuple, 1);
       auto body_input = GetTupleElement(input_tuple, 2);
 
-      auto zero = ConstantR1<int32>(bodyb.get(), 1, 0);
+      auto zero = ConstantR0<int32>(bodyb.get(), 0);
       auto j = (lower) ? i : ScalarLike(i, block_size - 1) - i;
-      auto start_indices =
-          ConcatInDim(bodyb.get(), {zero, Reshape(j, {1}), zero}, 0);
       auto input_row =
-          DynamicSlice(body_input, start_indices,
+          DynamicSlice(body_input, {zero, j, zero},
                        /*slice_sizes=*/{num_blocks, 1, block_size});
 
       // We want -L21 L11^{-1}
@@ -219,7 +228,7 @@ XlaOp InvertDiagonalBlocks(XlaOp diag_blocks, bool lower, bool transpose_a,
       precision_proto.add_operand_precision(precision);
       auto update = -DotGeneral(input_row, body_out, dnums, &precision_proto);
 
-      body_out = DynamicUpdateSlice(body_out, update, start_indices);
+      body_out = DynamicUpdateSlice(body_out, update, {zero, j, zero});
 
       auto next_i = i + ScalarLike(i, 1);
       Tuple(bodyb.get(), {next_i, body_out, body_input});
@@ -251,7 +260,7 @@ XlaOp SolveWithInvertedDiagonalBlocks(XlaOp a, XlaOp b, XlaOp inv_diag_blocks,
     int64 block_size = ShapeUtil::GetDimension(blocks_shape, -1);
 
     TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a));
-    int64 ndims = ShapeUtil::Rank(a_shape);
+    int64 ndims = a_shape.rank();
     int64 n = ShapeUtil::GetDimension(a_shape, -1);
     int64 num_blocks = n / block_size + (n % block_size != 0);
     int64 m_dim = (left_side) ? -1 : -2;
@@ -338,20 +347,21 @@ XlaOp SolveWithInvertedDiagonalBlocks(XlaOp a, XlaOp b, XlaOp inv_diag_blocks,
   });
 }
 
-XlaOp TriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower,
-                      bool transpose_a, bool conjugate_a, int64 block_size,
-                      PrecisionConfig::Precision precision) {
+XlaOp BuildTriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower,
+                           bool transpose_a, bool conjugate_a,
+                           bool unit_diagonal, int64 block_size,
+                           PrecisionConfig::Precision precision) {
   XlaBuilder* builder = a.builder();
   return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a));
     TF_ASSIGN_OR_RETURN(Shape b_shape, builder->GetShape(b));
-    if (ShapeUtil::Rank(a_shape) != ShapeUtil::Rank(b_shape)) {
+    if (a_shape.rank() != b_shape.rank()) {
       return InvalidArgument(
           "Arguments to TriangularSolve have shapes with different ranks: "
           "%s vs. %s",
           ShapeUtil::HumanString(a_shape), ShapeUtil::HumanString(b_shape));
     }
-    const int64 ndims = ShapeUtil::Rank(a_shape);
+    const int64 ndims = a_shape.rank();
     if (ndims < 2) {
       return InvalidArgument(
           "Arguments to TriangularSolve was rank %d but must have rank >= 2.",
@@ -393,6 +403,26 @@ XlaOp TriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower,
           block_size);
     }
 
+    if (ShapeUtil::IsZeroElementArray(b_shape)) {
+      // The output has the same shape as 'b', and since the output has zero
+      // elements, any such array will do.
+      return b;
+    }
+
+    // TODO(phawkins): consider pushing triangle masking into
+    // InvertDiagonalBlocks.
+    if (unit_diagonal) {
+      // Mask everything but the subdiagonal/superdiagonal elements.
+      a = lower ? Select(TriangleMask(a, -1), a, ZerosLike(a))
+                : Select(TriangleMask(a, 0), ZerosLike(a), a);
+      int64 k = ShapeUtil::GetDimension(a_shape, -1);
+      a = xla::Add(a, IdentityMatrix(builder, a_shape.element_type(), k, k),
+                   /*broadcast_dimensions=*/{ndims - 2, ndims - 1});
+    } else {
+      // Mask off the ignored elements of the triangular matrix a.
+      a = Triangle(a, lower);
+    }
+
     // We find the diagonal blocks of the coefficient matrix
     auto diag_blocks = DiagonalBlocks(a, block_size);
 
@@ -409,4 +439,66 @@ XlaOp TriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower,
   });
 }
 
+}  // namespace
+
+bool TriangularSolveExpander::InstructionMatchesPattern(
+    HloInstruction* instruction) {
+  return instruction->opcode() == HloOpcode::kTriangularSolve;
+}
+
+StatusOr<HloInstruction*> TriangularSolveExpander::ExpandInstruction(
+    HloInstruction* instruction) {
+  const TriangularSolveOptions& options =
+      instruction->triangular_solve_options();
+  const string name = absl::StrFormat(
+      "xla.triangular_solve_%s_%s_%s_%s_%s_%s",
+      instruction->operand(0)->shape().ToString(),
+      instruction->operand(1)->shape().ToString(),
+      options.left_side() ? "left" : "right",
+      options.lower() ? "lower" : "upper",
+      TriangularSolveOptions_Transpose_Name(options.transpose_a()),
+      options.unit_diagonal() ? "unit" : "nonunit");
+
+  HloModule* module = instruction->parent()->parent();
+
+  HloComputation*& computation =
+      computation_cache_.emplace(name, nullptr).first->second;
+  if (!computation) {
+    // Builds a new expansion.
+    //
+    // We do something unusual here: we build the computation using the
+    // XlaBuilder API, which is nominally an XLA client API. We do this because
+    // the external APIs for building complicated computations (XlaBuilder)
+    // are much more ergonomic than the internal ones. As it turns out,
+    // XlaBuilder isn't really a client API—what it does is build a
+    // HloModuleProto protocol buffer, that we can then deserialize and clone
+    // into our HloModule. Ideally we would avoid the protocol buffer step;
+    // that is left as an exercise for future work.
+    XlaBuilder builder(name);
+    XlaOp a = Parameter(&builder, 0, instruction->operand(0)->shape(), "a");
+    XlaOp b = Parameter(&builder, 1, instruction->operand(1)->shape(), "b");
+    bool transpose_a =
+        options.transpose_a() != TriangularSolveOptions::NO_TRANSPOSE;
+    bool conjugate_a = options.transpose_a() == TriangularSolveOptions::ADJOINT;
+
+    BuildTriangularSolve(a, b, options.left_side(), options.lower(),
+                         transpose_a, conjugate_a, options.unit_diagonal(),
+                         /*block_size=*/128,
+                         /*precision=*/PrecisionConfig::HIGHEST);
+    TF_ASSIGN_OR_RETURN(XlaComputation xla_computation, builder.Build());
+
+    TF_ASSIGN_OR_RETURN(ProgramShape program_shape,
+                        xla_computation.GetProgramShape());
+    HloModuleConfig config(program_shape);
+    TF_ASSIGN_OR_RETURN(auto new_module, HloModule::CreateFromProto(
+                                             xla_computation.proto(), config));
+    HloCloneContext context(module);
+    computation =
+        module->DeepCloneComputation(new_module->entry_computation(), &context);
+  }
+
+  return instruction->parent()->AddInstruction(HloInstruction::CreateCall(
+      instruction->shape(), instruction->operands(), computation));
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/triangular_solve_expander.h b/tensorflow/compiler/xla/service/triangular_solve_expander.h
new file mode 100644
index 0000000000000000000000000000000000000000..be2374ef8c86254d8db5ac1acac385aa0de7d3a5
--- /dev/null
+++ b/tensorflow/compiler/xla/service/triangular_solve_expander.h
@@ -0,0 +1,43 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_TRIANGULAR_SOLVE_EXPANDER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_TRIANGULAR_SOLVE_EXPANDER_H_
+
+#include "absl/container/flat_hash_map.h"
+#include "tensorflow/compiler/xla/service/op_expander_pass.h"
+
+namespace xla {
+
+class TriangularSolveExpander : public OpExpanderPass {
+ public:
+  absl::string_view name() const override {
+    return "triangular_solve_expander";
+  }
+
+ protected:
+  bool InstructionMatchesPattern(HloInstruction* instruction) override;
+
+  StatusOr<HloInstruction*> ExpandInstruction(
+      HloInstruction* instruction) override;
+
+ private:
+  // Mapping from op signatures to existing computations.
+  absl::flat_hash_map<string, HloComputation*> computation_cache_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_TRIANGULAR_SOLVE_EXPANDER_H_
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
index 50d51eaeb762e208004c1dae3dcc27503f3f94e9..cc82e9bb0287b5a586fb21fee35d3124a6d6f121 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/container/flat_hash_set.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
@@ -55,11 +56,10 @@ bool PointsToSet::IsAmbiguous() const {
 
 bool PointsToSet::IsDistinct() const {
   bool distinct = true;
-  std::set<const LogicalBuffer*> all_points_to;
-  ForEachElement([&distinct, &all_points_to](const ShapeIndex& /*index*/,
-                                             const BufferList& points_to) {
+  absl::flat_hash_set<const LogicalBuffer*> all_points_to;
+  ForEachElement([&](const ShapeIndex& /*index*/, const BufferList& points_to) {
     for (auto& buffer : points_to) {
-      if (all_points_to.count(buffer) != 0) {
+      if (all_points_to.contains(buffer)) {
         distinct = false;
       }
       all_points_to.insert(buffer);
@@ -87,9 +87,7 @@ bool PointsToSet::ContainsBuffer(const LogicalBuffer& buffer) const {
   bool found = false;
   ForEachElement([&found, &buffer](const ShapeIndex& /*index*/,
                                    const BufferList& pointed_to_buffers) {
-    if (!found &&
-        std::find(pointed_to_buffers.begin(), pointed_to_buffers.end(),
-                  &buffer) != pointed_to_buffers.end()) {
+    if (!found && absl::c_linear_search(pointed_to_buffers, &buffer)) {
       found = true;
     }
   });
@@ -99,8 +97,7 @@ bool PointsToSet::ContainsBuffer(const LogicalBuffer& buffer) const {
 bool PointsToSet::ContainsBufferAtIndex(const LogicalBuffer& buffer,
                                         const ShapeIndex& index) const {
   const auto& pointed_to_buffers = element(index);
-  return std::find(pointed_to_buffers.begin(), pointed_to_buffers.end(),
-                   &buffer) != pointed_to_buffers.end();
+  return absl::c_linear_search(pointed_to_buffers, &buffer);
 }
 
 void PointsToSet::AddPointedToBuffer(const LogicalBuffer& buffer,
@@ -210,7 +207,7 @@ Status TuplePointsToAnalysis::DefaultAction(HloInstruction* hlo_instruction) {
             &logical_buffer_analysis_->GetBuffer(hlo_instruction, index));
       });
 
-  if (ShapeUtil::IsTuple(hlo_instruction->shape())) {
+  if (hlo_instruction->shape().IsTuple()) {
     // If the hlo instruction is a tuple-shaped, then trivially the instruction
     // itself is the source of the tuple.
     points_to_set.add_tuple_source({}, hlo_instruction);
@@ -604,9 +601,8 @@ bool TuplePointsToAnalysis::DoesNotUseOperandBuffer(
   } else if (user->opcode() == HloOpcode::kFusion &&
              user->fusion_kind() == HloInstruction::FusionKind::kLoop) {
     // Find fusion parameter associated with 'operand'.
-    auto it = std::find_if(
-        user->fused_parameters().begin(), user->fused_parameters().end(),
-        [=](HloInstruction* fused_param) {
+    auto it = absl::c_find_if(
+        user->fused_parameters(), [&](HloInstruction* fused_param) {
           return user->operand(fused_param->parameter_number()) == operand;
         });
     CHECK(it != user->fused_parameters().end());
@@ -672,9 +668,8 @@ bool TuplePointsToAnalysis::HasUniqueFusedUseOfOperandAt(
   }
   // Find fusion parameter associated with 'operand'.
   const auto& fused_params = fusion->fused_parameters();
-  auto fused_param_it = std::find_if(
-      fused_params.begin(), fused_params.end(),
-      [&](HloInstruction* fused_param) {
+  auto fused_param_it =
+      absl::c_find_if(fused_params, [&](HloInstruction* fused_param) {
         return fusion->operand(fused_param->parameter_number()) == operand;
       });
   if (fused_param_it == fused_params.end()) {
@@ -704,6 +699,8 @@ bool TuplePointsToAnalysis::HasUniqueFusedUseOfOperandAt(
 // (4) The 'user' of 'operand' is DynamicUpdateSlice or While at operand index
 //     0.
 // (5) The 'user' of 'operand' is Sort, and it is the only user.
+// (6) The 'user' of 'operand' is TriangularSolve, it is the second operand,
+//     and it is the only user.
 //
 // (2) and (3) can only be determined if points-to analysis is available.
 bool TuplePointsToAnalysis::CanShareOperandBufferWithUser(
@@ -743,11 +740,10 @@ bool TuplePointsToAnalysis::CanShareOperandBufferWithUser(
       // Check if one operand of kAdd fused root is kDot or kConvolution.
       auto* add = user->fused_expression_root();
       auto add_operand_it =
-          std::find_if(add->operands().begin(), add->operands().end(),
-                       [&](HloInstruction* operand) {
-                         return operand->opcode() == HloOpcode::kConvolution ||
-                                operand->opcode() == HloOpcode::kDot;
-                       });
+          absl::c_find_if(add->operands(), [&](HloInstruction* operand) {
+            return operand->opcode() == HloOpcode::kConvolution ||
+                   operand->opcode() == HloOpcode::kDot;
+          });
       if (add_operand_it == add->operands().end()) {
         return false;
       }
@@ -785,6 +781,14 @@ bool TuplePointsToAnalysis::CanShareOperandBufferWithUser(
     std::vector<int64> operand_indices = user->OperandIndices(operand);
     return operand_indices.size() == 1 && user_index[0] == operand_indices[0];
   }
+  if (user->opcode() == HloOpcode::kTriangularSolve) {
+    // Only valid if there are no other users.
+    if (operand->users().size() != 1) {
+      return false;
+    }
+    std::vector<int64> operand_indices = user->OperandIndices(operand);
+    return operand_indices.size() == 1 && operand_indices[0] == 1;
+  }
   if (user->opcode() == HloOpcode::kCall) {
     // TODO(b/62548313): Remove when buffer assignment is module scoped and
     // does not assign buffers to calls.
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
index 561762b5d424ed5f537665be9d67a81dc8bdd56e..6f61fc44166298e86a88dfc4f0ce8526d65ffd02 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_creation_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/instruction_fusion.h"
@@ -623,7 +624,7 @@ class FusionPointsToAnalysisTest : public TuplePointsToAnalysisTest {
   void Run(const bool add_additional_gte0_user) {
     Shape input_shape = ShapeUtil::MakeShape(F32, {8});
     Shape update_shape = ShapeUtil::MakeShape(F32, {3});
-    Shape starts_shape = ShapeUtil::MakeShape(S32, {1});
+    Shape starts_shape = ShapeUtil::MakeShape(S32, {});
     Shape tuple_shape =
         ShapeUtil::MakeTupleShape({input_shape, update_shape, starts_shape});
 
@@ -657,7 +658,7 @@ class FusionPointsToAnalysisTest : public TuplePointsToAnalysisTest {
         HloInstruction::CreateGetTupleElement(starts_shape, tuple_param0, 2));
     // Update 'input' with 'update' at dynamic 'starts' indices.
     builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
-        input_shape, input, update, starts));
+        input_shape, input, update, {starts}));
 
     // Build computation and add it to module as entry computation.
     BuildModule(builder.Build());
@@ -721,9 +722,8 @@ class FusionPointsToAnalysisTest : public TuplePointsToAnalysisTest {
   // to fusion 'operand'.
   HloInstruction* GetFusionParameterForOperand(HloInstruction* fusion,
                                                HloInstruction* operand) {
-    auto it = std::find_if(
-        fusion->fused_instructions().begin(),
-        fusion->fused_instructions().end(), [=](const HloInstruction* fused) {
+    auto it = absl::c_find_if(
+        fusion->fused_instructions(), [&](const HloInstruction* fused) {
           return fused->opcode() == HloOpcode::kParameter &&
                  fusion->operand(fused->parameter_number()) == operand;
         });
@@ -734,7 +734,7 @@ class FusionPointsToAnalysisTest : public TuplePointsToAnalysisTest {
   // Returns all users of 'fusion_paran' at 'tuple_index'.
   std::vector<HloInstruction*> GetFusionParameterUsersAt(
       HloInstruction* fusion_param, int64 tuple_index) {
-    CHECK(ShapeUtil::IsTuple(fusion_param->shape()));
+    CHECK(fusion_param->shape().IsTuple());
     std::vector<HloInstruction*> users_at_tuple_index;
     for (auto user : fusion_param->users()) {
       CHECK_EQ(HloOpcode::kGetTupleElement, user->opcode());
@@ -883,12 +883,12 @@ TEST_F(DoesNotUseOperandBufferTest, FusedDynamicUpdateSlice) {
 
   // Create a DynamicUpdateSlice instruction of tuple element 1.
   auto starts = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({2})));
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(2)));
   auto update = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<float>({2.f, 2.f, 2.f})));
   auto dynamic_update_slice =
       builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
-          data_shape, gte1, update, starts));
+          data_shape, gte1, update, {starts}));
   builder.AddInstruction(
       HloInstruction::CreateTuple({gte0, dynamic_update_slice}));
 
@@ -977,12 +977,12 @@ TEST_F(CanShareOperandBufferWithUserTest, FusedDynamicUpdateSlice) {
 
   // Create a DynamicUpdateSlice instruction of tuple element 1.
   auto starts = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({2})));
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(2)));
   auto update = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<float>({2.f, 2.f, 2.f})));
   auto dynamic_update_slice =
       builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
-          data_shape, gte1, update, starts));
+          data_shape, gte1, update, {starts}));
   builder.AddInstruction(
       HloInstruction::CreateTuple({gte0, dynamic_update_slice}));
 
@@ -1004,7 +1004,7 @@ TEST_F(CanShareOperandBufferWithUserTest, DynamicUpdateSliceCanShare) {
 
   Shape data_shape = ShapeUtil::MakeShape(F32, {8});
   Shape update_shape = ShapeUtil::MakeShape(F32, {4});
-  Shape starts_shape = ShapeUtil::MakeShape(S32, {1});
+  Shape starts_shape = ShapeUtil::MakeShape(S32, {});
   auto data = builder.AddInstruction(
       HloInstruction::CreateParameter(0, data_shape, "data"));
   auto update = builder.AddInstruction(
@@ -1012,7 +1012,7 @@ TEST_F(CanShareOperandBufferWithUserTest, DynamicUpdateSliceCanShare) {
   auto starts = builder.AddInstruction(
       HloInstruction::CreateParameter(2, starts_shape, "starts"));
   auto dus = builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
-      data_shape, data, update, starts));
+      data_shape, data, update, {starts}));
 
   BuildModuleAndRunAnalysis(builder.Build());
 
@@ -1066,14 +1066,17 @@ TEST_F(CanShareOperandBufferWithUserTest, ScatterCanShare) {
 
 TEST_F(CanShareOperandBufferWithUserTest, SortCanShare) {
   auto builder = HloComputation::Builder(TestName());
+  module_ = CreateNewVerifiedModule();
 
   Shape keys_shape = ShapeUtil::MakeShape(F32, {8});
   auto keys = builder.AddInstruction(
       HloInstruction::CreateParameter(0, keys_shape, "keys"));
-  auto sort =
-      builder.AddInstruction(HloInstruction::CreateSort(keys_shape, 0, keys));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto* sort, MakeSortHlo(keys_shape, {keys}, 0, /*is_stable=*/false,
+                              &builder, module_.get()));
 
-  BuildModuleAndRunAnalysis(builder.Build());
+  computation_ = module_->AddEntryComputation(builder.Build());
+  RunAnalysis();
 
   EXPECT_TRUE(
       points_to_analysis_->CanShareOperandBufferWithUser(keys, {}, sort, {}));
@@ -1081,6 +1084,7 @@ TEST_F(CanShareOperandBufferWithUserTest, SortCanShare) {
 
 TEST_F(CanShareOperandBufferWithUserTest, SortCanShareWithTupleUser) {
   auto builder = HloComputation::Builder(TestName());
+  module_ = CreateNewVerifiedModule();
 
   Shape keys_shape = ShapeUtil::MakeShape(F32, {8});
   Shape values_shape = ShapeUtil::MakeShape(F32, {8});
@@ -1088,11 +1092,14 @@ TEST_F(CanShareOperandBufferWithUserTest, SortCanShareWithTupleUser) {
       HloInstruction::CreateParameter(0, keys_shape, "keys"));
   auto values = builder.AddInstruction(
       HloInstruction::CreateParameter(1, values_shape, "values"));
-  auto sort = builder.AddInstruction(HloInstruction::CreateSort(
-      ShapeUtil::MakeTupleShape({keys_shape, values_shape}), 0, keys,
-      {values}));
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto* sort,
+      MakeSortHlo(ShapeUtil::MakeTupleShape({keys_shape, values_shape}),
+                  {keys, values}, 0, /*is_stable=*/false, &builder,
+                  module_.get()));
 
-  BuildModuleAndRunAnalysis(builder.Build());
+  computation_ = module_->AddEntryComputation(builder.Build());
+  RunAnalysis();
 
   // The buffer for the keys can be shared with the first tuple entry.
   EXPECT_TRUE(
diff --git a/tensorflow/compiler/xla/service/tuple_util.cc b/tensorflow/compiler/xla/service/tuple_util.cc
index cfb0c787d09557fd1aec3517eb9698cfec323369..90ea79ec263a038556ccbd2cd345b337c5a5dcf3 100644
--- a/tensorflow/compiler/xla/service/tuple_util.cc
+++ b/tensorflow/compiler/xla/service/tuple_util.cc
@@ -21,7 +21,7 @@ namespace xla {
 
 /*static*/ HloInstruction* TupleUtil::ExtractPrefix(HloInstruction* input_tuple,
                                                     int64 elements) {
-  CHECK(ShapeUtil::IsTuple(input_tuple->shape()));
+  CHECK(input_tuple->shape().IsTuple());
 
   HloComputation* computation = input_tuple->parent();
   const Shape& input_shape = input_tuple->shape();
@@ -41,7 +41,7 @@ namespace xla {
 /*static*/ HloInstruction* TupleUtil::AppendSuffix(
     HloInstruction* input_tuple,
     absl::Span<HloInstruction* const> trailing_values) {
-  CHECK(ShapeUtil::IsTuple(input_tuple->shape()));
+  CHECK(input_tuple->shape().IsTuple());
 
   HloComputation* computation = input_tuple->parent();
   const Shape& input_shape = input_tuple->shape();
diff --git a/tensorflow/compiler/xla/service/while_loop_analysis.cc b/tensorflow/compiler/xla/service/while_loop_analysis.cc
index 68e2569f66bea9ec1223e454d1ead0efc7b9498e..c93a9ba3176002a34fe84a29e62075de4d19168f 100644
--- a/tensorflow/compiler/xla/service/while_loop_analysis.cc
+++ b/tensorflow/compiler/xla/service/while_loop_analysis.cc
@@ -301,7 +301,7 @@ optional<int64> ComputeWhileLoopTripCountUpperBound(HloInstruction* while_op) {
                                   /*dest_shape_index=*/{indvar_index},
                                   /*src_shape_index=*/{}));
   StatusOr<Literal> eval_result =
-      evaluator.Evaluate<Literal>(*while_cond, {std::move(fake_input)});
+      evaluator.Evaluate(*while_cond, {std::move(fake_input)});
 
   if (!eval_result.ok()) {
     VLOG(2) << "Couldn't evaluate while loop condition.";
diff --git a/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc b/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc
index 75d406435b6f58faecc86b82c33e9e2dd6bccbea..3bcf5c38309a86e9e3cab3268f3f065005f7a923 100644
--- a/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc
+++ b/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc
@@ -129,7 +129,7 @@ condition {
 
 ENTRY entry {
   const_0 = f32[2] constant({1, 2})
-  const_1 = (f32[2], f32[2]) constant((f32[2], f32[2]) ({2, 1},{3,1}))
+  const_1 = (f32[2], f32[2]) constant(({2, 1},{3,1}))
   while_init = (f32[2],(f32[2],f32[2])) tuple(const_0, const_1)
   ROOT while = (f32[2],(f32[2],f32[2])) while(while_init), condition=condition, body=body
 }
@@ -206,8 +206,8 @@ body {
   p_body.0 = f32[2] get-tuple-element((f32[2],f32[2]) p_body), index=0
   p_body.1 = f32[2] get-tuple-element((f32[2],f32[2]) p_body), index=1
 
-  token = token[] after-all()
-  outfeed = token[] outfeed(p_body.0, token)
+  token0 = token[] after-all()
+  outfeed = token[] outfeed(p_body.0, token0)
   ROOT root = (f32[2],f32[2],f32[2]) tuple(p_body.0, p_body.1, p_body.1)
 }
 
@@ -305,7 +305,7 @@ condition {
 
 ENTRY entry {
   const_0 = f32[] constant(0)
-  const_1 = (f32[], f32[]) constant((f32[], f32[]) (1, 10))
+  const_1 = (f32[], f32[]) constant((1, 10))
   while_init = (f32[],(f32[],f32[])) tuple(const_0, const_1)
   ROOT while = (f32[],(f32[],f32[])) while(while_init), condition=condition, body=body
 }
diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc
index 41011176ffa91e885bc58364d1fb19617d3518ad..69cc8feb3f31ad782b9d3437d81d0ab8ce10aadb 100644
--- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc
+++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc
@@ -89,7 +89,7 @@ static void CreateLoopInvariantCopy(
 
     HloInstruction* next_operand =
         frame->instruction->mutable_operand(frame->operand_index++);
-    if (hoisted_instructions->count(next_operand) ||
+    if (hoisted_instructions->contains(next_operand) ||
         next_operand == while_body_param) {
       continue;
     }
@@ -127,7 +127,7 @@ WhileLoopInvariantCodeMotion::TryHoistingInvariantInstructionsFromWhileBody(
     HloInstruction* while_instr) {
   auto print_no_metadata = HloPrintOptions{}.set_print_metadata(false);
 
-  if (!ShapeUtil::IsTuple(while_instr->shape())) {
+  if (!while_instr->shape().IsTuple()) {
     // This restriction leaves one interesting pattern on the table:
     //
     //  while_body(f32[1024, 1024] %param) {
@@ -168,7 +168,7 @@ WhileLoopInvariantCodeMotion::TryHoistingInvariantInstructionsFromWhileBody(
   // is no benefit to hoisting them unless something that uses it is also
   // hoisted.
   for (auto* instr : WhileUtil::GetInvariantGTEsForWhileBody(*while_body)) {
-    if (ShapeUtil::IsArray(instr->shape())) {
+    if (instr->shape().IsArray()) {
       // TODO(b/79147885): We should try to generalize this to tuples for
       // uniformity's sake, if nothing else.
       InsertOrDie(&unhoisted_invariant_instructions, instr);
@@ -221,7 +221,7 @@ WhileLoopInvariantCodeMotion::TryHoistingInvariantInstructionsFromWhileBody(
         ShapeUtil::ForEachSubshape(
             operand->shape(),
             [&input_size](const Shape& subshape, const ShapeIndex& /*index*/) {
-              if (ShapeUtil::IsArray(subshape)) {
+              if (subshape.IsArray()) {
                 input_size += ShapeUtil::ByteSizeOfElements(subshape);
               }
             });
@@ -229,7 +229,7 @@ WhileLoopInvariantCodeMotion::TryHoistingInvariantInstructionsFromWhileBody(
       ShapeUtil::ForEachSubshape(
           instruction->shape(),
           [&output_size](const Shape& subshape, const ShapeIndex& /*index*/) {
-            if (ShapeUtil::IsArray(subshape)) {
+            if (subshape.IsArray()) {
               output_size += ShapeUtil::ByteSizeOfElements(subshape);
             }
           });
@@ -241,7 +241,7 @@ WhileLoopInvariantCodeMotion::TryHoistingInvariantInstructionsFromWhileBody(
 
     auto is_invariant = [&](HloInstruction* op) {
       return hoisted_instructions.find(op) != hoisted_instructions.end() ||
-             unhoisted_invariant_instructions.count(op) ||
+             unhoisted_invariant_instructions.contains(op) ||
              op->opcode() == HloOpcode::kConstant;
     };
 
diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc
index 8e7c4bc8828552e197b41f874c070d496b85a382..3587c016b4420163a607422b1acc838646fab83a 100644
--- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc
+++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc
@@ -299,7 +299,7 @@ TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistBitcastAlone) {
   // bitcast either.
   auto m = CreateNewVerifiedModule();
   auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
-  auto scalar_f32 = ShapeUtil::MakeShape(F32, {});
+  auto effective_scalar_s32 = ShapeUtil::MakeShape(S32, {1});
   auto token_shape = ShapeUtil::MakeTokenShape();
   Shape while_shape =
       ShapeUtil::MakeTupleShape({scalar_s32, scalar_s32, token_shape});
@@ -314,10 +314,12 @@ TEST_F(WhileLoopInvariantCodeMotionTest, DontHoistBitcastAlone) {
         HloInstruction::CreateGetTupleElement(scalar_s32, param, 1));
     HloInstruction* in_token = builder.AddInstruction(
         HloInstruction::CreateGetTupleElement(token_shape, param, 2));
-    HloInstruction* bitcast_inst = builder.AddInstruction(
-        HloInstruction::CreateUnary(scalar_f32, HloOpcode::kBitcast, gte_0));
-    HloInstruction* out_token = builder.AddInstruction(
-        HloInstruction::CreateOutfeed(scalar_f32, bitcast_inst, in_token, ""));
+    HloInstruction* bitcast_inst =
+        builder.AddInstruction(HloInstruction::CreateUnary(
+            effective_scalar_s32, HloOpcode::kBitcast, gte_0));
+    HloInstruction* out_token =
+        builder.AddInstruction(HloInstruction::CreateOutfeed(
+            effective_scalar_s32, bitcast_inst, in_token, ""));
     builder.AddInstruction(
         HloInstruction::CreateTuple({gte_0, gte_1, out_token}));
 
@@ -352,9 +354,9 @@ TEST_F(WhileLoopInvariantCodeMotionTest, HoistBitcastIfNeeded) {
   // The bitcast's user can be hoisted, so hoist the bitcast too.
   auto m = CreateNewVerifiedModule();
   auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
-  auto scalar_f32 = ShapeUtil::MakeShape(F32, {});
-  Shape while_shape =
-      ShapeUtil::MakeTupleShape({scalar_s32, scalar_f32, scalar_f32});
+  auto effective_scalar_s32 = ShapeUtil::MakeShape(S32, {1});
+  Shape while_shape = ShapeUtil::MakeTupleShape(
+      {scalar_s32, effective_scalar_s32, effective_scalar_s32});
 
   HloComputation* while_body = [&]() {
     HloComputation::Builder builder(TestName() + ".while_body");
@@ -363,12 +365,13 @@ TEST_F(WhileLoopInvariantCodeMotionTest, HoistBitcastIfNeeded) {
     HloInstruction* gte_0 = builder.AddInstruction(
         HloInstruction::CreateGetTupleElement(scalar_s32, param, 0));
     HloInstruction* gte_1 = builder.AddInstruction(
-        HloInstruction::CreateGetTupleElement(scalar_f32, param, 1));
-    HloInstruction* bitcast_inst = builder.AddInstruction(
-        HloInstruction::CreateUnary(scalar_f32, HloOpcode::kBitcast, gte_0));
+        HloInstruction::CreateGetTupleElement(effective_scalar_s32, param, 1));
+    HloInstruction* bitcast_inst =
+        builder.AddInstruction(HloInstruction::CreateUnary(
+            effective_scalar_s32, HloOpcode::kBitcast, gte_0));
     HloInstruction* add_inst =
         builder.AddInstruction(HloInstruction::CreateBinary(
-            scalar_f32, HloOpcode::kAdd, bitcast_inst, gte_1));
+            effective_scalar_s32, HloOpcode::kAdd, bitcast_inst, gte_1));
     builder.AddInstruction(
         HloInstruction::CreateTuple({gte_0, gte_1, add_inst}));
 
diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier.cc b/tensorflow/compiler/xla/service/while_loop_simplifier.cc
index d30f67dd8110b88166fe807762fb653190ec00bc..386ffb995477ff1b4aef73080b6a6fd988dd1980 100644
--- a/tensorflow/compiler/xla/service/while_loop_simplifier.cc
+++ b/tensorflow/compiler/xla/service/while_loop_simplifier.cc
@@ -58,7 +58,7 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
   HloComputation* while_body = while_op->while_body();
   HloInstruction* while_body_root = while_body->root_instruction();
 
-  if (!ShapeUtil::IsTuple(while_init->shape())) {
+  if (!while_init->shape().IsTuple()) {
     VLOG(2) << "While op's carried value isn't tuple shaped.";
     return false;
   }
@@ -109,8 +109,7 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
       // operand appears in, but it may appear more than once!
       if (user->user_count() == 1 && user->users().front() == while_body_root &&
           while_body_root->operand_index(user) == user->tuple_index() &&
-          std::count(while_body_root->operands().begin(),
-                     while_body_root->operands().end(), user) == 1) {
+          absl::c_count(while_body_root->operands(), user) == 1) {
         continue;
       }
 
@@ -127,7 +126,7 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
   // through to the while body's root, count that element as "used", since
   // removing that element would be observable.
   for (int64 i = 0; i < while_body_root->operand_count(); ++i) {
-    if (used_tuple_indices.count(i)) {
+    if (used_tuple_indices.contains(i)) {
       continue;
     }
 
@@ -158,7 +157,7 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
   // Build up maps from the old/new to the new/old tuple indices.
   std::vector<int64> new_to_old_tuple_idx(used_tuple_indices.begin(),
                                           used_tuple_indices.end());
-  std::sort(new_to_old_tuple_idx.begin(), new_to_old_tuple_idx.end());
+  absl::c_sort(new_to_old_tuple_idx);
 
   absl::flat_hash_map<int64, int64> old_to_new_tuple_idx;
   for (int64 new_idx = 0; new_idx < new_to_old_tuple_idx.size(); ++new_idx) {
@@ -181,7 +180,7 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
   // replace the old instructions after we remove unused elements from the while
   // tuple.
   auto make_while_computation_replacements = [&](const HloComputation* comp) {
-    std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+    absl::flat_hash_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
         replacements;
 
     auto* param = comp->parameter_instruction(0);
@@ -233,7 +232,7 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
       while_cond->CloneWithReplacements(
           make_while_computation_replacements(while_cond));
 
-  std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
+  absl::flat_hash_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
       while_body_replacements = make_while_computation_replacements(while_body);
   std::vector<HloInstruction*> new_while_body_root_elems;
   new_while_body_root_elems.reserve(new_to_old_tuple_idx.size());
@@ -583,8 +582,7 @@ static StatusOr<bool> TryPropagateConstant(HloInstruction* while_op) {
 static std::unique_ptr<HloInstruction> UnflattenTupleInstr(
     absl::Span<HloInstruction*> instrs, const Shape& desired_shape,
     std::vector<std::unique_ptr<HloInstruction>>* new_instrs) {
-  CHECK(ShapeUtil::IsTuple(desired_shape))
-      << ShapeUtil::HumanString(desired_shape);
+  CHECK(desired_shape.IsTuple()) << ShapeUtil::HumanString(desired_shape);
 
   // For each child shape in `desired_shape`, slice out the correct number of
   // `instrs` and call UnflattenTupleInstr recursively.  At each step we remove
@@ -593,7 +591,7 @@ static std::unique_ptr<HloInstruction> UnflattenTupleInstr(
   std::vector<HloInstruction*> elems;
   for (int64 i = 0; i < desired_shape.tuple_shapes_size(); ++i) {
     const Shape& subshape = desired_shape.tuple_shapes(i);
-    if (!ShapeUtil::IsTuple(subshape)) {
+    if (!subshape.IsTuple()) {
       elems.push_back(instrs[0]);
       instrs.remove_prefix(1);
       continue;
@@ -603,7 +601,7 @@ static std::unique_ptr<HloInstruction> UnflattenTupleInstr(
     int64 num_leaves = 0;
     ShapeUtil::ForEachSubshape(
         subshape, [&](const Shape& s, const ShapeIndex& /*index*/) {
-          if (!ShapeUtil::IsTuple(s)) {
+          if (!s.IsTuple()) {
             ++num_leaves;
           }
         });
@@ -625,7 +623,7 @@ static std::vector<HloInstruction*> GetFlatTupleElems(
     HloInstruction* instr,
     std::vector<std::unique_ptr<HloInstruction>>* new_instrs) {
   const auto& shape = instr->shape();
-  if (!ShapeUtil::IsTuple(shape)) {
+  if (!shape.IsTuple()) {
     return {instr};
   }
   std::vector<HloInstruction*> elems;
@@ -665,7 +663,7 @@ static StatusOr<bool> TryFlattenNestedTuples(HloInstruction* while_op) {
   std::vector<Shape> flattened_shape_elems;
   ShapeUtil::ForEachSubshape(while_shape,
                              [&](const Shape& s, const ShapeIndex& /*index*/) {
-                               if (!ShapeUtil::IsTuple(s)) {
+                               if (!s.IsTuple()) {
                                  flattened_shape_elems.push_back(s);
                                }
                              });
diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc
index 4950e8269e9cf0723d717bd1734518d104c0c9f2..ecca76b1e86d833c73fbb9bad6a341660a7d2669 100644
--- a/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_dce.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/service/tuple_simplifier.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
@@ -406,13 +407,12 @@ TEST_F(WhileLoopSimplifierTest, RemoveUnusedLoopOperands) {
   // The original while instruction is still left in the module as a dead
   // instruction, find a while instruction with a different name as the new
   // while instruction.
+  const auto& instrs = m->entry_computation()->instructions();
   HloInstruction* new_while_op =
-      *std::find_if(m->entry_computation()->instructions().begin(),
-                    m->entry_computation()->instructions().end(),
-                    [&](const HloInstruction* instr) {
-                      return (instr->opcode() == HloOpcode::kWhile &&
-                              instr->name() != "while");
-                    });
+      *absl::c_find_if(instrs, [&](const HloInstruction* instr) {
+        return (instr->opcode() == HloOpcode::kWhile &&
+                instr->name() != "while");
+      });
 
   auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
   EXPECT_TRUE(
@@ -554,8 +554,7 @@ TEST_F(WhileLoopSimplifierTest, FlattenNestedTuple) {
 
   HloInstruction* new_while = FindFirstWhile(m.get());
   Shape flat_tuple =
-      ShapeUtil::ParseShapeString("(s32[1], s32[2], s32[3], s32[4])")
-          .ValueOrDie();
+      ParseShape("(s32[1], s32[2], s32[3], s32[4])").ValueOrDie();
   SCOPED_TRACE(m->ToString());
   EXPECT_TRUE(ShapeUtil::Equal(new_while->shape(), flat_tuple));
   EXPECT_TRUE(ShapeUtil::Equal(
@@ -567,8 +566,7 @@ TEST_F(WhileLoopSimplifierTest, FlattenNestedTuple) {
       flat_tuple));
   EXPECT_TRUE(ShapeUtil::Equal(
       m->entry_computation()->root_instruction()->shape(),
-      ShapeUtil::ParseShapeString("((s32[1]), (s32[2], s32[3], (s32[4])))")
-          .ValueOrDie()));
+      ParseShape("((s32[1]), (s32[2], s32[3], (s32[4])))").ValueOrDie()));
 }
 
 // Edge-case: All elements of the loop carry are constants which can be removed,
@@ -641,8 +639,7 @@ TEST_F(WhileLoopSimplifierTest, RemoveConstantFromLoopCarry) {
   EXPECT_TRUE(TupleSimplifier().Run(m.get()).ok());
 
   HloInstruction* new_while = FindFirstWhile(m.get());
-  Shape new_while_shape =
-      ShapeUtil::ParseShapeString("(s32[1], s32[3])").ValueOrDie();
+  Shape new_while_shape = ParseShape("(s32[1], s32[3])").ValueOrDie();
   EXPECT_TRUE(ShapeUtil::Equal(new_while->shape(), new_while_shape));
   EXPECT_TRUE(ShapeUtil::Equal(
       new_while->while_body()->root_instruction()->shape(), new_while_shape));
@@ -652,9 +649,9 @@ TEST_F(WhileLoopSimplifierTest, RemoveConstantFromLoopCarry) {
   EXPECT_TRUE(ShapeUtil::Equal(
       new_while->while_condition()->parameter_instruction(0)->shape(),
       new_while_shape));
-  EXPECT_TRUE(ShapeUtil::Equal(
-      m->entry_computation()->root_instruction()->shape(),
-      ShapeUtil::ParseShapeString("(s32[1], s32[2], s32[3])").ValueOrDie()));
+  EXPECT_TRUE(
+      ShapeUtil::Equal(m->entry_computation()->root_instruction()->shape(),
+                       ParseShape("(s32[1], s32[2], s32[3])").ValueOrDie()));
   EXPECT_THAT(m->entry_computation()->root_instruction(),
               op::Tuple(_, op::Constant(), _));
 }
@@ -712,7 +709,7 @@ TEST_F(WhileLoopSimplifierTest, MergeInductionVariables_Simple) {
   // We should have added a new loop counter for s32[] to the end of the tuple.
   SCOPED_TRACE(m->ToString());
   Shape new_while_shape =
-      ShapeUtil::ParseShapeString("(s32[], s32[], s32[], s32[])").ValueOrDie();
+      ParseShape("(s32[], s32[], s32[], s32[])").ValueOrDie();
   EXPECT_TRUE(ShapeUtil::Equal(new_while->shape(), new_while_shape));
   EXPECT_TRUE(ShapeUtil::Equal(
       new_while->while_body()->root_instruction()->shape(), new_while_shape));
diff --git a/tensorflow/compiler/xla/service/while_util.cc b/tensorflow/compiler/xla/service/while_util.cc
index 039ccda7322f5efda6a827efbeda1225c3596cc0..d77386497a14b3e52be2ea7f655fa330f60e4a97 100644
--- a/tensorflow/compiler/xla/service/while_util.cc
+++ b/tensorflow/compiler/xla/service/while_util.cc
@@ -97,7 +97,7 @@ WidenWhileBody(HloComputation* narrow_body, const Shape& wide_shape) {
 WhileUtil::MakeInstructionsLiveIn(
     HloInstruction* while_instr,
     absl::Span<HloInstruction* const> instructions) {
-  CHECK(ShapeUtil::IsTuple(while_instr->shape()));
+  CHECK(while_instr->shape().IsTuple());
 
   int64 elements_in_old_while_shape = while_instr->shape().tuple_shapes_size();
   Shape new_while_shape = while_instr->shape();
diff --git a/tensorflow/compiler/xla/service/while_util_test.cc b/tensorflow/compiler/xla/service/while_util_test.cc
index 5e6941933330fde29bc9c779aae4bb3c36914660..d92b9870f373564ae8fd904c8bf9f0d1afbff9c4 100644
--- a/tensorflow/compiler/xla/service/while_util_test.cc
+++ b/tensorflow/compiler/xla/service/while_util_test.cc
@@ -180,8 +180,8 @@ body {
 
 cond {
   param.c = (s32[], s32[]) parameter(0)
-  token = token[] after-all()
-  infeed = (pred[], token[]) infeed(token)
+  token0 = token[] after-all()
+  infeed = (pred[], token[]) infeed(token0)
   ROOT condition = pred[] get-tuple-element(infeed), index=0
 }
 
diff --git a/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.cc b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.cc
index 83d696fe0915086c3c98b6d7cbdaeaeb4d9d0bdb..661b7aa7d99ca549da6a509812760a1665d60919 100644
--- a/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.cc
+++ b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination.cc
@@ -31,16 +31,21 @@ StatusOr<bool> ZeroSizedHloElimination::Run(HloModule* module) {
   bool changed = false;
   for (HloComputation* comp : module->MakeNonfusionComputations()) {
     for (HloInstruction* instruction : comp->MakeInstructionPostOrder()) {
-      if (instruction->HasSideEffect() ||
-          !ShapeUtil::IsArray(instruction->shape()) ||
+      if (instruction->HasSideEffect() || !instruction->shape().IsArray() ||
           instruction->opcode() == HloOpcode::kConstant) {
         continue;
       }
       if (comp->IsRemovable(instruction) &&
           ShapeUtil::IsZeroElementArray(instruction->shape())) {
+        // If the instruction doesn't have a layout, use a default layout for
+        // the literal.
+        Shape shape = instruction->shape();
+        if (!LayoutUtil::HasLayout(shape)) {
+          LayoutUtil::SetToDefaultLayout(&shape);
+        }
         TF_RETURN_IF_ERROR(comp->ReplaceWithNewInstruction(
-            instruction, HloInstruction::CreateConstant(
-                             Literal::CreateFromShape(instruction->shape()))));
+            instruction,
+            HloInstruction::CreateConstant(Literal::CreateFromShape(shape))));
         changed = true;
       }
     }
diff --git a/tensorflow/compiler/xla/service/zero_sized_hlo_elimination_test.cc b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination_test.cc
index a546a6d39cc55d1f327b8449c7d26cd4c95dbf98..572a79609e7a912277af0fd2ba43f9a1e14a6f52 100644
--- a/tensorflow/compiler/xla/service/zero_sized_hlo_elimination_test.cc
+++ b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination_test.cc
@@ -82,5 +82,18 @@ TEST_F(ZeroSizedHloEliminationTest, DoesNotEliminateConstant) {
   EXPECT_FALSE(changed);
 }
 
+TEST_F(ZeroSizedHloEliminationTest, ZeroSizedInstructionWithoutLayoutFolded) {
+  Shape op_shape = ShapeUtil::MakeShape(F32, {4, 0});
+  op_shape.clear_layout();
+  HloInstruction* param1 = builder_.AddInstruction(
+      HloInstruction::CreateParameter(1, op_shape, "zero sized param 1"));
+  HloInstruction* param2 = builder_.AddInstruction(
+      HloInstruction::CreateParameter(2, op_shape, "zero sized param 2"));
+  builder_.AddInstruction(
+      HloInstruction::CreateBinary(op_shape, HloOpcode::kAdd, param1, param2));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunZeroSizedElimination());
+  EXPECT_TRUE(changed);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/shape.cc b/tensorflow/compiler/xla/shape.cc
index 746ab9e9977b1b10cdb0cb57197027d65bd50f55..94854047e530babe2234381a615aeb805f0d5933 100644
--- a/tensorflow/compiler/xla/shape.cc
+++ b/tensorflow/compiler/xla/shape.cc
@@ -27,12 +27,31 @@ Shape::Shape(const ShapeProto& shape_proto) {
   for (const int64 dimension : shape_proto.dimensions()) {
     add_dimensions(dimension);
   }
+  // A malformed proto may have different is_dynamic_dimension_size and
+  // dimensions_size. Since C++ is evil, and we have no good way of bailing out
+  // in a constructor, conservatively trim the is_dynamic_dimension size.
+  // TODO(b/120111794): Make this a hard error when we have a factory method
+  // instead of a constructor.
+  if (shape_proto.dimensions_size() !=
+      shape_proto.is_dynamic_dimension_size()) {
+    if (shape_proto.is_dynamic_dimension_size() != 0) {
+      LOG(ERROR) << "Malformed shape proto: number of is_dynamic_dimension "
+                    "fields does not match number of dimension fields";
+    } else {
+      LOG(WARNING) << "Malformed shape proto: is_dynamic_dimension is empty";
+    }
+  }
+  int64 num_dynamic_dimension_fields = std::min(
+      shape_proto.dimensions_size(), shape_proto.is_dynamic_dimension_size());
+  for (int i = 0; i < num_dynamic_dimension_fields; i++) {
+    dynamic_dimensions_[i] = shape_proto.is_dynamic_dimension(i);
+  }
   tuple_shapes_.reserve(shape_proto.tuple_shapes_size());
   for (const ShapeProto& element_shape : shape_proto.tuple_shapes()) {
     *add_tuple_shapes() = Shape(element_shape);
   }
   if (shape_proto.has_layout()) {
-    *mutable_layout() = shape_proto.layout();
+    *mutable_layout() = Layout::CreateFromProto(shape_proto.layout());
   }
 }
 
@@ -43,12 +62,15 @@ ShapeProto Shape::ToProto() const {
   for (const int64 dimension : dimensions()) {
     proto.add_dimensions(dimension);
   }
+  for (const bool dynamic : dynamic_dimensions_) {
+    proto.add_is_dynamic_dimension(dynamic);
+  }
   proto.mutable_tuple_shapes()->Reserve(tuple_shapes_size());
   for (const Shape& shape : tuple_shapes()) {
     *proto.add_tuple_shapes() = shape.ToProto();
   }
   if (has_layout()) {
-    *proto.mutable_layout() = layout();
+    *proto.mutable_layout() = layout().ToProto();
   }
   return proto;
 }
@@ -61,6 +83,101 @@ string Shape::ToString(bool print_layout) const {
   }
 }
 
+bool Shape::is_static() const {
+  if (IsTuple()) {
+    for (const Shape& subshape : tuple_shapes_) {
+      if (!subshape.is_static()) {
+        return false;
+      }
+    }
+  }
+  return !absl::c_any_of(dynamic_dimensions_, [](bool b) { return b; });
+}
+
+void Shape::DeleteDimension(int64 dim_to_delete) {
+  CHECK(IsArray());
+  CHECK_GE(dim_to_delete, 0);
+  CHECK_LT(dim_to_delete, dimensions_.size());
+  dimensions_.erase(dimensions_.begin() + dim_to_delete);
+  dynamic_dimensions_.erase(dynamic_dimensions_.begin() + dim_to_delete);
+  if (LayoutUtil::HasLayout(*this)) {
+    layout_.set_format(DENSE);
+    for (int64 i = 0; i < layout_.minor_to_major().size();) {
+      if (layout_.minor_to_major(i) == dim_to_delete) {
+        layout_.mutable_minor_to_major()->erase(
+            layout_.mutable_minor_to_major()->begin() + i);
+        continue;
+      }
+      if (layout_.minor_to_major(i) > dim_to_delete) {
+        (*layout_.mutable_minor_to_major())[i] -= 1;
+      }
+      ++i;
+    }
+  }
+}
+
+bool Shape::Equal::operator()(const Shape& lhs, const Shape& rhs) {
+  if (lhs.IsTuple()) {
+    return rhs.IsTuple() &&
+           absl::c_equal(
+               lhs.tuple_shapes(), rhs.tuple_shapes(),
+               [=](const Shape& l, const Shape& r) { return (*this)(l, r); });
+  } else if (!lhs.IsArray()) {
+    // Non-tuple, non-array tupes such as opaque and token types are trivially
+    // the same.
+    return lhs.element_type() == rhs.element_type();
+  }
+
+  if (!rhs.IsArray()) {
+    return false;
+  }
+
+  if (!ignore_element_type_) {
+    if ((ignore_fp_precision_ &&
+         !ShapeUtil::SameElementTypeIgnoringFpPrecision(lhs, rhs)) ||
+        (!ignore_fp_precision_ && !ShapeUtil::SameElementType(lhs, rhs))) {
+      VLOG(3) << "CompareShapes: lhs element type != rhs element type";
+      return false;
+    }
+  }
+
+  if (!ignore_layout_) {
+    if (lhs.layout().format() != rhs.layout().format()) {
+      VLOG(3) << "CompareShapes: lhs layout format != rhs layout format";
+      return false;
+    }
+    if (LayoutUtil::IsDenseArray(lhs)) {
+      Layout::Equal equal;
+      if (ignore_tiles_in_layout_) {
+        equal.IgnoreTiles();
+      }
+      if (ignore_element_size_in_layout_) {
+        equal.IgnoreElementSize();
+      }
+      if (!equal(lhs.layout(), rhs.layout())) {
+        VLOG(3) << "CompareShapes: lhs layout != rhs layout";
+        return false;
+      }
+    }
+  }
+
+  if (!ShapeUtil::SameDimensions(lhs, rhs)) {
+    VLOG(3) << "CompareShapes: lhs dimensions != rhs dimensions";
+    return false;
+  }
+
+  if (!ignore_dynamic_dimension_) {
+    for (int i = 0; i < lhs.rank(); ++i) {
+      if (lhs.is_dynamic_dimension(i) != rhs.is_dynamic_dimension(i)) {
+        VLOG(3)
+            << "CompareShapes: lhs and rhs have different dynamic dimensions.";
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
 std::ostream& operator<<(std::ostream& out, const Shape& shape) {
   out << shape.ToString(/*print_layout=*/true);
   return out;
diff --git a/tensorflow/compiler/xla/shape.h b/tensorflow/compiler/xla/shape.h
index 7f6b14ab4286c696dce64d2250a3fe8a57e4865b..78cea83c6d71e5965f10cd3a917ffccabd630462 100644
--- a/tensorflow/compiler/xla/shape.h
+++ b/tensorflow/compiler/xla/shape.h
@@ -20,6 +20,8 @@ limitations under the License.
 #include <vector>
 
 #include "absl/types/optional.h"
+#include "tensorflow/compiler/xla/layout.h"
+#include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/types.h"
@@ -43,6 +45,43 @@ class Shape {
   // without layout. e.g. "F32[42,12] {0, 1}" or "F32[64]".
   string ToString(bool print_layout = false) const;
 
+  // Returns the rank (number of dimensions) of the given shape. Shape must be
+  // an array.
+  int64 rank() const {
+    CHECK(IsArray()) << "Non-arrays do not have a rank, shape: " << ToString();
+    return dimensions_.size();
+  }
+
+  // Returns whether the shape is of the specified type (array, tuple, etc).
+  bool IsArray() const { return primitive_util::IsArrayType(element_type()); }
+  bool IsTuple() const { return element_type() == TUPLE; }
+  bool IsToken() const { return element_type() == TOKEN; }
+  bool IsOpaque() const { return element_type() == OPAQUE; }
+
+  // Returns true if no array dimension in the shape is dynamically sized. Tuple
+  // shapes are traversed recursively.
+  bool is_static() const;
+
+  // Returns true if the given dimension is dynamically-sized.
+  bool is_dynamic_dimension(int dimension) const {
+    return dynamic_dimensions_.at(dimension);
+  }
+
+  // Sets whether or not the given dimension is dynamically-sized.
+  void set_dynamic_dimension(int dimension, bool is_dynamic) {
+    dynamic_dimensions_[dimension] = is_dynamic;
+  }
+
+  const std::vector<bool>& dynamic_dimensions() const {
+    return dynamic_dimensions_;
+  }
+
+  // Add dimension_upper_bound().
+
+  // Removes the given dimension form the shape. Layout, if it exists, is
+  // adjusted to match the modified shape.
+  void DeleteDimension(int64 dim_to_delete);
+
   // The following methods mirror the protobuf generated code interface for the
   // message ShapeProto. This enabled easy migration of this data structure
   // from a proto to a proper C++ class.
@@ -57,10 +96,16 @@ class Shape {
   int dimensions_size() const { return dimensions_.size(); }
   int64 dimensions(int index) const { return dimensions_.at(index); }
   void set_dimensions(int index, int64 value) { dimensions_.at(index) = value; }
-  void add_dimensions(int64 value) { dimensions_.push_back(value); }
-  void clear_dimensions() { dimensions_.clear(); }
+  void add_dimensions(int64 value) {
+    dimensions_.push_back(value);
+    dynamic_dimensions_.push_back(false);
+  }
+  void clear_dimensions() {
+    dimensions_.clear();
+    dynamic_dimensions_.clear();
+  }
   const std::vector<int64>& dimensions() const { return dimensions_; }
-  std::vector<int64>* mutable_dimensions() { return &dimensions_; }
+  absl::Span<int64> mutable_dimensions() { return absl::MakeSpan(dimensions_); }
 
   // Methods for accessing the tuple subshapes. This field only non-empty for
   // tuple shapes.
@@ -76,21 +121,10 @@ class Shape {
   std::vector<Shape>* mutable_tuple_shapes() { return &tuple_shapes_; }
 
   // Methods for accessing the layout field.
-  bool has_layout() const { return layout_.has_value(); }
-  const Layout& layout() const {
-    if (layout_.has_value()) {
-      return *layout_;
-    } else {
-      return Layout::default_instance();
-    }
-  }
-  Layout* mutable_layout() {
-    if (!layout_.has_value()) {
-      layout_ = Layout();
-    }
-    return &layout_.value();
-  }
-  void clear_layout() { layout_.reset(); }
+  bool has_layout() const { return layout_.format() != INVALID_FORMAT; }
+  const Layout& layout() const { return layout_; }
+  Layout* mutable_layout() { return &layout_; }
+  void clear_layout() { layout_.Clear(); }
 
   void Swap(Shape* other) {
     using std::swap;
@@ -101,25 +135,84 @@ class Shape {
     element_type_ = PRIMITIVE_TYPE_INVALID;
     dimensions_.clear();
     tuple_shapes_.clear();
-    layout_.reset();
+    clear_layout();
   }
 
   string SerializeAsString() const { return ToProto().SerializeAsString(); }
   string ShortDebugString() const { return ToProto().ShortDebugString(); }
   string DebugString() const { return ToProto().DebugString(); }
 
- public:
+  // Equal is a configurable functor to check the equality of two shapes.
+  //
+  // Examples:
+  //
+  // - Comparing two shapes ignoring their layout difference:
+  //   Equal().IgnoreLayout()(shape1, shape2);
+  //
+  // - Comparing two shapes ignoring their layout and element type difference:
+  //   Equal().IgnoreLayout().IgnoreElementType()(shape1, shape2);
+  class Equal {
+   public:
+    Equal() = default;
+
+    bool operator()(const Shape& lhs, const Shape& rhs);
+
+    Equal& IgnoreLayout() {
+      ignore_layout_ = true;
+      return *this;
+    }
+    Equal& IgnoreTilesInLayout() {
+      ignore_tiles_in_layout_ = true;
+      return *this;
+    }
+    Equal& IgnoreElementSizeInLayout() {
+      ignore_element_size_in_layout_ = true;
+      return *this;
+    }
+    Equal& IgnoreElementType() {
+      ignore_element_type_ = true;
+      return *this;
+    }
+    Equal& IgnoreFpPrecision() {
+      ignore_fp_precision_ = true;
+      return *this;
+    }
+    Equal& IgnoreDynamicDimension() {
+      ignore_dynamic_dimension_ = true;
+      return *this;
+    }
+
+   private:
+    bool ignore_layout_ = false;
+    bool ignore_tiles_in_layout_ = false;
+    bool ignore_element_size_in_layout_ = false;
+    bool ignore_element_type_ = false;
+    bool ignore_fp_precision_ = false;
+    bool ignore_dynamic_dimension_ = false;
+  };
+
+  // Test that all fields of the shape are the same, equivalent to Equal().
+  bool operator==(const Shape& other) const { return Equal()(*this, other); }
+  bool operator!=(const Shape& other) const { return !(*this == other); }
+
+ private:
   // The element type of this shape (tuple, array, etc).
   PrimitiveType element_type_ = PRIMITIVE_TYPE_INVALID;
 
-  // The array bounds of the dimensions. This is nonempty only for array shapes.
+  // The array bounds of the dimensions. This is nonempty only for array
+  // shapes. For a dynamically-sized dimension, the respective value in this
+  // vector is an inclusive upper limit of the array bound.
   std::vector<int64> dimensions_;
 
+  // This vector is the same size as 'dimensions_' and indicates whether the
+  // respective dimension is dynamically sized.
+  std::vector<bool> dynamic_dimensions_;
+
   // The tuple element subshapes. This is nonempty only for tuple shapes.
   std::vector<Shape> tuple_shapes_;
 
-  // The array layout of the shape. This is present only for array shapes.
-  absl::optional<Layout> layout_;
+  // The layout of the shape. Only relevant for arrays.
+  Layout layout_;
 };
 
 // Shape of the parameters and output of an XLA computation. This is analogous
diff --git a/tensorflow/compiler/xla/shape_layout.cc b/tensorflow/compiler/xla/shape_layout.cc
index d44db89d571891ecef554cd45c050017833982bb..a000886d60d06a4a598910c901accb6dfd0a8f1a 100644
--- a/tensorflow/compiler/xla/shape_layout.cc
+++ b/tensorflow/compiler/xla/shape_layout.cc
@@ -52,7 +52,7 @@ bool ShapeLayout::MatchesLayoutInShape(const Shape& shape) const {
 
 const Layout& ShapeLayout::layout() const {
   CHECK(LayoutIsSet());
-  CHECK(!ShapeUtil::IsTuple(shape_));
+  CHECK(!shape_.IsTuple());
   return shape_.layout();
 }
 
@@ -61,15 +61,15 @@ void ShapeLayout::Clear() { LayoutUtil::ClearLayout(&shape_); }
 bool ShapeLayout::LayoutIsSet() const { return LayoutUtil::HasLayout(shape_); }
 
 void ShapeLayout::ResetLayout(const Layout& layout) {
-  CHECK(!ShapeUtil::IsTuple(shape_));
-  CHECK(!ShapeUtil::IsOpaque(shape_));
+  CHECK(!shape_.IsTuple());
+  CHECK(!shape_.IsOpaque());
   *shape_.mutable_layout() = layout;
   TF_CHECK_OK(ShapeUtil::ValidateShape(shape_));
 }
 
 void ShapeLayout::ResetLayout(const Layout& layout,
                               ShapeIndexView shape_index) {
-  CHECK(ShapeUtil::IsTuple(shape_));
+  CHECK(shape_.IsTuple());
   *ShapeUtil::GetMutableSubshape(&shape_, shape_index)->mutable_layout() =
       layout;
   TF_CHECK_OK(ShapeUtil::ValidateShape(shape_));
diff --git a/tensorflow/compiler/xla/shape_test.cc b/tensorflow/compiler/xla/shape_test.cc
index e396897eeebc2e7bdc2dc49300c8906710608b05..526abafea5cc244418a4ec05db7da6203716b483 100644
--- a/tensorflow/compiler/xla/shape_test.cc
+++ b/tensorflow/compiler/xla/shape_test.cc
@@ -41,11 +41,13 @@ class ShapeTest : public ::testing::Test {
       ShapeUtil::MakeTupleShape({opaque_, scalar_, matrix_, matrix2_});
   const Shape nested_tuple_ =
       ShapeUtil::MakeTupleShape({tuple_, matrix_, token_});
+  const Shape dyanmic_matrix_ =
+      ShapeUtil::MakeShape(S32, {5, 2}, {true, false});
 };
 
 TEST_F(ShapeTest, ShapeToFromProto) {
-  for (const Shape& shape :
-       {opaque_, token_, scalar_, matrix_, matrix2_, tuple_, nested_tuple_}) {
+  for (const Shape& shape : {opaque_, token_, scalar_, matrix_, matrix2_,
+                             tuple_, nested_tuple_, dyanmic_matrix_}) {
     Shape shape_copy(shape.ToProto());
     EXPECT_TRUE(ShapeUtil::Equal(shape, shape_copy))
         << shape << " != " << shape_copy;
@@ -74,6 +76,65 @@ TEST_F(ShapeTest, ShapeToString) {
       nested_tuple_.ToString(/*print_layout=*/true));
 }
 
+TEST_F(ShapeTest, DynamicShapeToString) {
+  Shape array_shape =
+      ShapeUtil::MakeShape(F32, {23, 44, 55}, {true, false, true});
+  EXPECT_EQ("f32[<=23,44,<=55]", array_shape.ToString());
+
+  array_shape.set_dynamic_dimension(2, false);
+  EXPECT_EQ("f32[<=23,44,55]", array_shape.ToString());
+}
+
+TEST_F(ShapeTest, EqualityTest) {
+  // Different layouts.
+  EXPECT_NE(ShapeUtil::MakeShapeWithLayout(F32, {23, 44}, {1, 0}),
+            ShapeUtil::MakeShapeWithLayout(F32, {23, 44}, {0, 1}));
+
+  // Different dims.
+  EXPECT_NE(ShapeUtil::MakeShapeWithLayout(F32, {44, 23}, {1, 0}),
+            ShapeUtil::MakeShapeWithLayout(F32, {23, 44}, {1, 0}));
+
+  // Different elements.
+  EXPECT_NE(ShapeUtil::MakeShapeWithLayout(S32, {44, 23}, {1, 0}),
+            ShapeUtil::MakeShapeWithLayout(F32, {23, 44}, {1, 0}));
+
+  // Equal shapes.
+  EXPECT_EQ(ShapeUtil::MakeShapeWithLayout(F32, {23, 44}, {1, 0}),
+            ShapeUtil::MakeShapeWithLayout(F32, {23, 44}, {1, 0}));
+}
+
+TEST_F(ShapeTest, IsStatic) {
+  EXPECT_TRUE(opaque_.is_static());
+  EXPECT_TRUE(token_.is_static());
+  EXPECT_TRUE(matrix_.is_static());
+  EXPECT_TRUE(tuple_.is_static());
+  EXPECT_TRUE(nested_tuple_.is_static());
+
+  Shape dynamic_matrix = matrix_;
+  EXPECT_TRUE(dynamic_matrix.is_static());
+  dynamic_matrix.set_dynamic_dimension(1, true);
+  EXPECT_FALSE(dynamic_matrix.is_static());
+
+  Shape dynamic_tuple = tuple_;
+  EXPECT_TRUE(dynamic_tuple.is_static());
+  ShapeUtil::GetMutableSubshape(&dynamic_tuple, {2})
+      ->set_dynamic_dimension(1, true);
+  EXPECT_FALSE(dynamic_tuple.is_static());
+}
+
+TEST_F(ShapeTest, IsDynamicDimension) {
+  Shape dynamic_matrix = matrix_;
+  dynamic_matrix.set_dynamic_dimension(1, true);
+  EXPECT_FALSE(dynamic_matrix.is_dynamic_dimension(0));
+  EXPECT_TRUE(dynamic_matrix.is_dynamic_dimension(1));
+
+  Shape dynamic_tuple = tuple_;
+  EXPECT_TRUE(dynamic_tuple.is_static());
+  ShapeUtil::GetMutableSubshape(&dynamic_tuple, {2})
+      ->set_dynamic_dimension(1, true);
+  EXPECT_FALSE(dynamic_tuple.is_static());
+}
+
 TEST_F(ShapeTest, ProgramShapeToFromProto) {
   ProgramShape program_shape;
   *program_shape.add_parameters() = ShapeUtil::MakeShape(F32, {1, 2, 3});
diff --git a/tensorflow/compiler/xla/shape_tree.h b/tensorflow/compiler/xla/shape_tree.h
index 7bf97729165bef98fabc29040e02203eee68a53c..089120179e2a77518eb5b18c11a35670b03e9b77 100644
--- a/tensorflow/compiler/xla/shape_tree.h
+++ b/tensorflow/compiler/xla/shape_tree.h
@@ -395,7 +395,7 @@ class ShapeTreeIterator
 template <typename T>
 int64 ShapeTree<T>::CountSubshapes(const Shape& shape) {
   int64 current_count = 1;
-  if (ShapeUtil::IsTuple(shape)) {
+  if (shape.IsTuple()) {
     int64 count = ShapeUtil::TupleElementCount(shape);
     for (int i = 0; i < count; ++i) {
       current_count += CountSubshapes(shape.tuple_shapes(i));
@@ -407,7 +407,7 @@ int64 ShapeTree<T>::CountSubshapes(const Shape& shape) {
 template <typename T>
 void ShapeTree<T>::InitChildren(const Shape& shape, const T& init_value,
                                 Node* node, Index* index) {
-  if (ShapeUtil::IsTuple(shape)) {
+  if (shape.IsTuple()) {
     const int64 size = ShapeUtil::TupleElementCount(shape);
 #ifndef NDEBUG
     index->children_count = size;
@@ -443,7 +443,7 @@ void ShapeTree<T>::InitChildren(const Shape& shape, const T& init_value,
 
 template <typename T>
 void ShapeTree<T>::InitChildren(const Shape& shape, Node* node, Index* index) {
-  if (ShapeUtil::IsTuple(shape)) {
+  if (shape.IsTuple()) {
     const int64 size = ShapeUtil::TupleElementCount(shape);
 #ifndef NDEBUG
     index->children_count = size;
diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc
index a4d4e1e53e727bdf7822cacaa4559fcae59d4eae..d045fc7a9e291258640eca75166e116cf7390a7b 100644
--- a/tensorflow/compiler/xla/shape_util.cc
+++ b/tensorflow/compiler/xla/shape_util.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/container/inlined_vector.h"
 #include "absl/strings/ascii.h"
 #include "absl/strings/numbers.h"
 #include "absl/strings/str_cat.h"
@@ -81,78 +82,16 @@ bool ShapeIndexView::StartsWith(ShapeIndexView prefix) const {
 
 /* static */ bool ShapeUtil::IsArrayPrimitiveType(
     PrimitiveType primitive_type) {
-  return primitive_type != PRIMITIVE_TYPE_INVALID && primitive_type != TUPLE &&
-         primitive_type != OPAQUE && primitive_type != TOKEN;
+  return primitive_util::IsArrayType(primitive_type);
 }
 
 namespace {
-
-// Recursive helper for comparing the equality of two shapes. Returns true if
-// the shapes are the same. If compare_layouts is true, then layouts must also
-// match.
-bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts,
-                   bool ignore_fp_precision) {
-  if ((ignore_fp_precision &&
-       !ShapeUtil::SameElementTypeIgnoringFpPrecision(lhs, rhs)) ||
-      (!ignore_fp_precision && !ShapeUtil::SameElementType(lhs, rhs))) {
-    VLOG(3) << "CompareShapes: lhs element type != rhs element type";
-    return false;
-  }
-
-  if (ShapeUtil::IsTuple(lhs)) {
-    return absl::c_equal(lhs.tuple_shapes(), rhs.tuple_shapes(),
-                         [=](const Shape& l, const Shape& r) {
-                           return CompareShapes(l, r, compare_layouts,
-                                                ignore_fp_precision);
-                         });
-  } else if (!ShapeUtil::IsArray(lhs)) {
-    // Non-tuple, non-array tupes such as opaque and token types are trivially
-    // the same.
-    return true;
-  }
-
-  if (compare_layouts) {
-    if (lhs.layout().format() != rhs.layout().format()) {
-      return false;
-    }
-    if (LayoutUtil::IsDenseArray(lhs)) {
-      if (!absl::c_equal(LayoutUtil::MinorToMajor(lhs),
-                         LayoutUtil::MinorToMajor(rhs))) {
-        VLOG(3) << "CompareShapes: lhs layout != rhs layout";
-        return false;
-      }
-
-      const auto& lhs_tiles = lhs.layout().tiles();
-      const auto& rhs_tiles = rhs.layout().tiles();
-      if (lhs_tiles.size() != rhs_tiles.size()) {
-        return false;
-      }
-      for (int64 i = 0; i < lhs_tiles.size(); i++) {
-        if (!absl::c_equal(lhs_tiles[i].dimensions(),
-                           rhs_tiles[i].dimensions())) {
-          return false;
-        }
-      }
-
-      if (lhs.layout().element_size_in_bits() !=
-          rhs.layout().element_size_in_bits()) {
-        return false;
-      }
-    }
-  }
-
-  if (!ShapeUtil::SameDimensions(lhs, rhs)) {
-    VLOG(3) << "CompareShapes: lhs dimensions != rhs dimensions";
-    return false;
-  }
-  return true;
-}
-
 // Constructs and returns the new shape with the given minor_to_major order in
 // its Layout.
 StatusOr<Shape> MakeShapeWithLayoutInternal(
     PrimitiveType element_type, absl::Span<const int64> dimensions,
-    absl::Span<const int64> minor_to_major) {
+    absl::Span<const int64> minor_to_major, absl::Span<const Tile> tiles,
+    int64 element_size_in_bits) {
   if (dimensions.size() != minor_to_major.size()) {
     return InvalidArgument("Dimensions size is %ld, but layout size is %ld.",
                            dimensions.size(), minor_to_major.size());
@@ -163,23 +102,19 @@ StatusOr<Shape> MakeShapeWithLayoutInternal(
   }
   TF_ASSIGN_OR_RETURN(Shape shape,
                       ShapeUtil::MakeValidatedShape(element_type, dimensions));
-  auto min2maj = shape.mutable_layout()->mutable_minor_to_major();
-  min2maj->Clear();
-  for (int64 value : minor_to_major) {
-    min2maj->Add(value);
-  }
+  *shape.mutable_layout() =
+      LayoutUtil::MakeLayout(minor_to_major, tiles, element_size_in_bits);
   if (!shape.has_layout()) {
     return InvalidArgument("Shape has no layout.");
   }
   TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(shape));
   return shape;
 }
-
 }  // namespace
 
 /* static */ bool ShapeUtil::Equal(const Shape& lhs, const Shape& rhs) {
-  bool equal = CompareShapes(lhs, rhs, /*compare_layouts=*/true,
-                             /*ignore_fp_precision=*/false);
+  bool equal = Shape::Equal()(lhs, rhs);
+
   if (!equal && VLOG_IS_ON(3)) {
     VLOG(3) << "ShapeUtil::Equal differ: lhs = " << lhs.ShortDebugString()
             << ", rhs = " << rhs.ShortDebugString();
@@ -190,8 +125,7 @@ StatusOr<Shape> MakeShapeWithLayoutInternal(
 
 /* static */ bool ShapeUtil::EqualIgnoringFpPrecision(const Shape& lhs,
                                                       const Shape& rhs) {
-  bool equal = CompareShapes(lhs, rhs, /*compare_layouts=*/true,
-                             /*ignore_fp_precision=*/true);
+  bool equal = Shape::Equal().IgnoreFpPrecision()(lhs, rhs);
   if (!equal && VLOG_IS_ON(3)) {
     VLOG(3) << "ShapeUtil::EqualIgnoringFpPrecision differ: lhs = "
             << lhs.ShortDebugString() << ", rhs = " << rhs.ShortDebugString();
@@ -200,12 +134,6 @@ StatusOr<Shape> MakeShapeWithLayoutInternal(
   return equal;
 }
 
-/* static */ int64 ShapeUtil::Rank(const Shape& shape) {
-  CHECK(ShapeUtil::IsArray(shape))
-      << "Non-arrays do not have a rank, shape: " << shape;
-  return shape.dimensions_size();
-}
-
 /* static */ int64 ShapeUtil::TrueRank(const Shape& shape) {
   int64 accum = 0;
   for (int64 dimension : shape.dimensions()) {
@@ -232,18 +160,38 @@ StatusOr<Shape> MakeShapeWithLayoutInternal(
   return MakeValidatedShape(element_type, dimensions).ValueOrDie();
 }
 
+/* static */ Shape ShapeUtil::MakeShape(
+    PrimitiveType element_type, absl::Span<const int64> dimensions,
+    const std::vector<bool>& dynamic_dimensions) {
+  return MakeValidatedShape(element_type, dimensions, dynamic_dimensions)
+      .ValueOrDie();
+}
+
 /* static */ StatusOr<Shape> ShapeUtil::MakeValidatedShape(
     PrimitiveType element_type, absl::Span<const int64> dimensions) {
-  CHECK(IsArrayPrimitiveType(element_type));
+  CHECK(IsArrayPrimitiveType(element_type)) << element_type;
   Shape result;
   TF_RETURN_IF_ERROR(PopulateShape(element_type, dimensions, &result));
   return result;
 }
 
+/* static */ StatusOr<Shape> ShapeUtil::MakeValidatedShape(
+    PrimitiveType element_type, absl::Span<const int64> dimensions,
+    const std::vector<bool>& dynamic_dimensions) {
+  TF_ASSIGN_OR_RETURN(Shape shape,
+                      MakeValidatedShape(element_type, dimensions));
+  for (int i = 0; i < dynamic_dimensions.size(); ++i) {
+    shape.set_dynamic_dimension(i, dynamic_dimensions[i]);
+  }
+  return shape;
+}
+
 /* static */ Shape ShapeUtil::MakeShapeWithLayout(
     PrimitiveType element_type, absl::Span<const int64> dimensions,
-    absl::Span<const int64> minor_to_major) {
-  return MakeShapeWithLayoutInternal(element_type, dimensions, minor_to_major)
+    absl::Span<const int64> minor_to_major, absl::Span<const Tile> tiles,
+    int64 element_size_in_bits) {
+  return MakeShapeWithLayoutInternal(element_type, dimensions, minor_to_major,
+                                     tiles, element_size_in_bits)
       .ValueOrDie();
 }
 
@@ -319,7 +267,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
 
 /* static */ void ShapeUtil::AppendMajorDimension(int bound, Shape* shape) {
   CHECK(LayoutUtil::IsDenseArray(*shape));
-  shape->mutable_layout()->add_minor_to_major(Rank(*shape));
+  shape->mutable_layout()->add_minor_to_major(shape->rank());
   shape->add_dimensions(bound);
   TF_DCHECK_OK(ValidateShape(*shape));
 }
@@ -334,7 +282,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
 }
 
 /* static */ bool ShapeUtil::ElementHasBitWidth(const Shape& shape, int bits) {
-  if (!IsArray(shape)) {
+  if (!shape.IsArray()) {
     return false;
   }
   return primitive_util::BitWidth(shape.element_type()) == bits;
@@ -358,6 +306,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
     case U32:
     case U64:
     case C64:
+    case C128:
     case TUPLE:
     case OPAQUE:
     case TOKEN:
@@ -376,27 +325,24 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
   return primitive_util::IsFloatingPointType(shape.element_type());
 }
 
-/* static */ bool ShapeUtil::IsArray(const Shape& shape) {
-  return IsArrayPrimitiveType(shape.element_type());
-}
-
 /* static */ bool ShapeUtil::IsNestedTuple(const Shape& shape) {
-  return IsTuple(shape) && std::any_of(shape.tuple_shapes().begin(),
-                                       shape.tuple_shapes().end(), IsTuple);
+  return shape.IsTuple() &&
+         absl::c_any_of(shape.tuple_shapes(),
+                        [](const Shape& s) { return s.IsTuple(); });
 }
 
 /* static */ bool ShapeUtil::IsEmptyTuple(const Shape& shape) {
-  return IsTuple(shape) && TupleElementCount(shape) == 0;
+  return shape.IsTuple() && TupleElementCount(shape) == 0;
 }
 
 /* static */ int64 ShapeUtil::TupleElementCount(const Shape& shape) {
-  CHECK(IsTuple(shape)) << HumanString(shape);
+  CHECK(shape.IsTuple()) << HumanString(shape);
   return shape.tuple_shapes_size();
 }
 
 /* static */ const Shape& ShapeUtil::GetTupleElementShape(const Shape& shape,
                                                           int64 index) {
-  CHECK(IsTuple(shape));
+  CHECK(shape.IsTuple());
   CHECK_GT(TupleElementCount(shape), index);
   TF_DCHECK_OK(ValidateShapeWithOptionalLayout(shape.tuple_shapes(index)));
   return shape.tuple_shapes(index);
@@ -412,7 +358,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
 /* static */ Shape ShapeUtil::SliceTuple(const Shape& tuple, int64 start,
                                          int64 limit) {
   TF_DCHECK_OK(ValidateShapeWithOptionalLayout(tuple));
-  CHECK(IsTuple(tuple));
+  CHECK(tuple.IsTuple());
   CHECK_LE(start, TupleElementCount(tuple));
   CHECK_LE(limit, TupleElementCount(tuple));
 
@@ -429,15 +375,9 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
                                               complex_shape.element_type()));
 }
 
-/* static */ bool ShapeUtil::ShapeIs(const Shape& shape,
-                                     PrimitiveType element_type,
-                                     std::initializer_list<int64> dimensions) {
-  return Equal(shape, MakeShape(element_type, dimensions));
-}
-
 /* static */ int64 ShapeUtil::ElementsIn(const Shape& shape) {
-  DCHECK(IsArray(shape)) << ShapeUtil::HumanString(shape);
-  DCHECK_EQ(shape.dimensions_size(), Rank(shape));
+  DCHECK(shape.IsArray()) << ShapeUtil::HumanString(shape);
+  DCHECK_EQ(shape.dimensions_size(), shape.rank());
   if (shape.dimensions().size() == 1) {
     return shape.dimensions()[0];
   }
@@ -447,8 +387,8 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
 }
 
 /* static */ int64 ShapeUtil::ElementsInRecursive(const Shape& shape) {
-  CHECK(IsArray(shape) || IsTuple(shape));
-  if (IsArray(shape)) {
+  CHECK(shape.IsArray() || shape.IsTuple());
+  if (shape.IsArray()) {
     return ElementsIn(shape);
   }
   int64 count = 0;
@@ -472,7 +412,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
 }
 
 /* static */ bool ShapeUtil::IsZeroElementArray(const Shape& shape) {
-  return ShapeUtil::IsArray(shape) && ElementsIn(shape) == 0;
+  return shape.IsArray() && ElementsIn(shape) == 0;
 }
 
 /* static */ bool ShapeUtil::IsScalarWithElementType(
@@ -480,56 +420,8 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
   return IsScalar(shape) && shape.element_type() == element_type;
 }
 
-namespace {
-
-// Class to memoize the computation of
-//   absl::AsciiStrToLower(PrimitiveType_Name(p))
-// for all PrimitiveType values "p"
-class PrimitiveTypeNameGenerator {
- public:
-  PrimitiveTypeNameGenerator() {
-    for (int i = 0; i < PrimitiveType_ARRAYSIZE; i++) {
-      if (PrimitiveType_IsValid(i)) {
-        lowercase_name_[i] = absl::AsciiStrToLower(
-            PrimitiveType_Name(static_cast<PrimitiveType>(i)));
-      }
-    }
-  }
-  const string& LowercaseName(PrimitiveType t) {
-    return lowercase_name_[static_cast<int>(t)];
-  }
-
- private:
-  string lowercase_name_[PrimitiveType_ARRAYSIZE];
-};
-
-const string& LowercasePrimitiveTypeName(PrimitiveType s) {
-  static PrimitiveTypeNameGenerator* gen = new PrimitiveTypeNameGenerator();
-  return gen->LowercaseName(s);
-}
-
-StatusOr<PrimitiveType> StringToPrimitiveType(const string& name) {
-  static std::unordered_map<string, PrimitiveType>* name_to_type = [] {
-    static auto* map = new std::unordered_map<string, PrimitiveType>;
-    for (int i = 0; i < PrimitiveType_ARRAYSIZE; i++) {
-      if (PrimitiveType_IsValid(i)) {
-        auto value = static_cast<PrimitiveType>(i);
-        (*map)[LowercasePrimitiveTypeName(value)] = value;
-      }
-    }
-    return map;
-  }();
-  auto found = name_to_type->find(name);
-  if (found == name_to_type->end()) {
-    return InvalidArgument("Invalid element type string: \"%s\".", name);
-  }
-  return found->second;
-}
-
-}  // namespace
-
 /* static */ string ShapeUtil::HumanString(const Shape& shape) {
-  if (IsTuple(shape)) {
+  if (shape.IsTuple()) {
     string text = "(";
     const char* prefix = "";
     for (const Shape& elem_shape : shape.tuple_shapes()) {
@@ -539,12 +431,21 @@ StatusOr<PrimitiveType> StringToPrimitiveType(const string& name) {
     text += ")";
     return text;
   }
-  return StrCat(LowercasePrimitiveTypeName(shape.element_type()), "[",
-                absl::StrJoin(shape.dimensions(), ","), "]");
+  std::vector<string> dim_elements;
+  for (int i = 0; i < shape.dimensions_size(); ++i) {
+    if (shape.is_dynamic_dimension(i)) {
+      dim_elements.push_back(StrCat("<=", shape.dimensions(i)));
+    } else {
+      dim_elements.push_back(StrCat(shape.dimensions(i)));
+    }
+  }
+  return StrCat(
+      primitive_util::LowercasePrimitiveTypeName(shape.element_type()), "[",
+      absl::StrJoin(dim_elements, ","), "]");
 }
 
 /* static */ string ShapeUtil::HumanStringWithLayout(const Shape& shape) {
-  if (IsTuple(shape)) {
+  if (shape.IsTuple()) {
     string text = "(";
     const char* prefix = "";
     for (const Shape& elem_shape : shape.tuple_shapes()) {
@@ -554,12 +455,14 @@ StatusOr<PrimitiveType> StringToPrimitiveType(const string& name) {
     text += ")";
     return text;
   }
-  string result = StrCat(LowercasePrimitiveTypeName(shape.element_type()), "[");
+  string result = StrCat(
+      primitive_util::LowercasePrimitiveTypeName(shape.element_type()), "[");
   for (int i = 0; i < shape.dimensions().size(); i++) {
-    StrAppend(&result, (i > 0) ? "," : "", shape.dimensions(i));
+    StrAppend(&result, (i > 0) ? "," : "",
+              shape.is_dynamic_dimension(i) ? "<=" : "", shape.dimensions(i));
   }
   result += "]";
-  if (!IsScalar(shape) && IsArray(shape)) {
+  if (!IsScalar(shape) && shape.IsArray()) {
     if (LayoutUtil::HasLayout(shape)) {
       StrAppend(&result, LayoutUtil::HumanString(shape.layout()));
     }
@@ -580,155 +483,25 @@ StatusOr<PrimitiveType> StringToPrimitiveType(const string& name) {
                 HumanString(program_shape.result()));
 }
 
-namespace {
-// Parses shapes with simple recursive descent structure -- consumes from the
-// front of s and passes that view recursively as required.
-StatusOr<Shape> ParseShapeStringInternal(absl::string_view* s) {
-  *s = absl::StripLeadingAsciiWhitespace(*s);
-
-  if (absl::ConsumePrefix(s, "(")) {  // Tuple.
-    std::vector<Shape> shapes;
-    bool must_end = false;
-    while (true) {
-      if (absl::ConsumePrefix(s, ")")) {
-        break;
-      } else if (must_end) {
-        return InvalidArgument("Expected end of tuple; got: \"%s\"", *s);
-      }
-      shapes.emplace_back();
-      TF_ASSIGN_OR_RETURN(shapes.back(), ParseShapeStringInternal(s));
-      *s = absl::StripLeadingAsciiWhitespace(*s);
-      must_end = !absl::ConsumePrefix(s, ",");
-    }
-    return ShapeUtil::MakeTupleShape(shapes);
-  }
-
-  string element_type_string;
-  string dimensions_string;
-  string format_string;
-  string layout_string;
-  // absl::string_view is not compatible with internal RE2 StringPiece, so
-  // we convert in to the RE2-consumable type and then consume the corresponding
-  // amount from our string_view type.
-  static LazyRE2 shape_pattern = {
-      "^(\\w*\\d*)\\[([\\d,\\s]*)\\](?:\\s*(dense|sparse)?\\s*{([\\d,\\s]+)})"
-      "?"};
-  tensorflow::RegexpStringPiece s_consumable(s->data(), s->size());
-  if (RE2::Consume(&s_consumable, *shape_pattern, &element_type_string,
-                   &dimensions_string, &format_string, &layout_string)) {
-    size_t consumed = s->size() - s_consumable.size();
-    s->remove_prefix(consumed);
-    auto string_to_int64 = [&s](absl::string_view input) -> StatusOr<int64> {
-      int64 element;
-      if (!absl::SimpleAtoi(input, &element)) {
-        return InvalidArgument(
-            "Invalid s64 value in parsed shape string: \"%s\" in \"%s\"", input,
-            *s);
-      }
-      return element;
-    };
-
-    auto comma_list_to_int64s =
-        [string_to_int64](const string& input) -> StatusOr<std::vector<int64>> {
-      std::vector<int64> results;
-      for (const auto& piece : absl::StrSplit(input, ',', absl::SkipEmpty())) {
-        TF_ASSIGN_OR_RETURN(int64 element, string_to_int64(piece));
-        results.push_back(element);
-      }
-      return results;
-    };
-
-    // Extract the dimensions.
-    TF_ASSIGN_OR_RETURN(std::vector<int64> dimensions,
-                        comma_list_to_int64s(dimensions_string));
-
-    // Extract the primitive element type.
-    TF_ASSIGN_OR_RETURN(const PrimitiveType primitive_type,
-                        StringToPrimitiveType(element_type_string));
-    if (primitive_type == PRIMITIVE_TYPE_INVALID || primitive_type == TUPLE) {
-      return InvalidArgument("Invalid element type string: \"%s\".",
-                             element_type_string);
-    }
-
-    Shape result;
-    if (primitive_type == OPAQUE) {
-      result = ShapeUtil::MakeOpaqueShape();
-    } else if (primitive_type == TOKEN) {
-      result = ShapeUtil::MakeTokenShape();
-    } else if (format_string.empty() && layout_string.empty()) {
-      // Create a shape without a layout set.
-      TF_ASSIGN_OR_RETURN(
-          result, ShapeUtil::MakeValidatedShape(primitive_type, dimensions));
-    } else if (format_string == "sparse") {
-      TF_ASSIGN_OR_RETURN(int64 max_elements, string_to_int64(layout_string));
-      result = ShapeUtil::MakeShapeWithSparseLayout(primitive_type, dimensions,
-                                                    max_elements);
-    } else if (format_string.empty() || format_string == "dense") {
-      // Extract the layout minor-to-major and set it.
-      TF_ASSIGN_OR_RETURN(std::vector<int64> min2maj,
-                          comma_list_to_int64s(layout_string));
-      TF_ASSIGN_OR_RETURN(result, MakeShapeWithLayoutInternal(
-                                      primitive_type, dimensions, min2maj));
-    } else {
-      // This should not be reached.
-      LOG(FATAL) << "Unhandled condition when parsing shape; format: \""
-                 << format_string << "\", layout: \"" << layout_string << "\"";
-    }
-    TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(result));
-    return std::move(result);
-  }
-
-  return InvalidArgument("Invalid shape string to parse: \"%s\"", *s);
-}
-}  // namespace
-
-/* static */ StatusOr<Shape> ShapeUtil::ParseShapeString(absl::string_view s) {
-  TF_ASSIGN_OR_RETURN(Shape shape, ParseShapeStringInternal(&s));
-  if (!s.empty()) {
-    return InvalidArgument("Invalid shape string to parse: \"%s\"", s);
-  }
-  return shape;
-}
-
 /* static */ bool ShapeUtil::SameDimensions(const Shape& lhs,
                                             const Shape& rhs) {
-  CHECK(ShapeUtil::IsArray(lhs));
-  CHECK(ShapeUtil::IsArray(rhs));
+  CHECK(lhs.IsArray());
+  CHECK(rhs.IsArray());
   return absl::c_equal(lhs.dimensions(), rhs.dimensions());
 }
 
 /* static */ bool ShapeUtil::Compatible(const Shape& lhs, const Shape& rhs) {
-  return CompareShapes(lhs, rhs, /*compare_layouts=*/false,
-                       /*ignore_fp_precision=*/false);
+  return Shape::Equal().IgnoreLayout()(lhs, rhs);
 }
 
 /* static */ bool ShapeUtil::CompatibleIgnoringElementType(const Shape& lhs,
                                                            const Shape& rhs) {
-  if (IsArray(lhs)) {
-    return IsArray(rhs) && SameDimensions(lhs, rhs);
-  } else if (lhs.element_type() == TUPLE) {
-    return rhs.element_type() == TUPLE &&
-           absl::c_equal(lhs.tuple_shapes(), rhs.tuple_shapes(),
-                         CompatibleIgnoringElementType);
-  } else {
-    // Opaque, token, etc types are vacuously compatible.
-    return lhs.element_type() == rhs.element_type();
-  }
+  return Shape::Equal().IgnoreElementType().IgnoreLayout()(lhs, rhs);
 }
 
 /* static */ bool ShapeUtil::CompatibleIgnoringFpPrecision(const Shape& lhs,
                                                            const Shape& rhs) {
-  if (IsArray(lhs)) {
-    return IsArray(rhs) && SameElementTypeIgnoringFpPrecision(lhs, rhs) &&
-           CompatibleIgnoringElementType(lhs, rhs);
-  } else if (lhs.element_type() == TUPLE) {
-    return rhs.element_type() == TUPLE &&
-           absl::c_equal(lhs.tuple_shapes(), rhs.tuple_shapes(),
-                         CompatibleIgnoringFpPrecision);
-  } else {
-    // Opaque, token, etc types are vacuously compatible.
-    return lhs.element_type() == rhs.element_type();
-  }
+  return Shape::Equal().IgnoreFpPrecision().IgnoreLayout()(lhs, rhs);
 }
 
 /* static */ int64 ShapeUtil::GetDimension(const Shape& shape,
@@ -739,7 +512,7 @@ StatusOr<Shape> ParseShapeStringInternal(absl::string_view* s) {
 /* static */ int64 ShapeUtil::GetDimensionNumber(const Shape& shape,
                                                  int64 dimension_number) {
   if (dimension_number < 0) {
-    dimension_number += Rank(shape);
+    dimension_number += shape.rank();
   }
   CHECK_GE(dimension_number, 0);
   return dimension_number;
@@ -776,6 +549,8 @@ StatusOr<Shape> ParseShapeStringInternal(absl::string_view* s) {
       return sizeof(double);
     case C64:
       return sizeof(complex64);
+    case C128:
+      return sizeof(complex128);
     case TOKEN:
       // Tokens require no space.
       return 0;
@@ -793,7 +568,7 @@ StatusOr<Shape> ParseShapeStringInternal(absl::string_view* s) {
   TF_DCHECK_OK(ValidateShape(shape));
   if (shape.element_type() == TUPLE) {
     return ByteSizeOfTupleIndexTable(shape, pointer_size);
-  } else if (IsArray(shape)) {
+  } else if (shape.IsArray()) {
     int64 byte_size = ByteSizeOfElements(shape);
     if (LayoutUtil::IsSparseArray(shape)) {
       byte_size += ByteSizeOfSparseIndices(shape);
@@ -819,7 +594,7 @@ StatusOr<Shape> ParseShapeStringInternal(absl::string_view* s) {
 
 /* static */ int64 ShapeUtil::ByteSizeOfElements(const Shape& shape) {
   TF_DCHECK_OK(ValidateShape(shape));
-  CHECK(ShapeUtil::IsArray(shape));
+  CHECK(shape.IsArray());
   int64 allocated_element_count;
 
   if (LayoutUtil::IsSparseArray(shape)) {
@@ -835,8 +610,8 @@ StatusOr<Shape> ParseShapeStringInternal(absl::string_view* s) {
 /* static */ int64 ShapeUtil::ByteSizeOfSparseIndices(const Shape& shape) {
   TF_DCHECK_OK(ValidateShape(shape));
   CHECK(LayoutUtil::IsSparseArray(shape));
-  return LayoutUtil::MaxSparseElements(shape.layout()) *
-         ShapeUtil::Rank(shape) * sizeof(int64);
+  return LayoutUtil::MaxSparseElements(shape.layout()) * shape.rank() *
+         sizeof(int64);
 }
 
 /* static */ Status ShapeUtil::ValidateShapeWithOptionalLayoutInternal(
@@ -867,22 +642,22 @@ StatusOr<Shape> ParseShapeStringInternal(absl::string_view* s) {
     if (shape.dimensions_size() != 0) {
       return InvalidArgument(
           "shape has %s element type, but has dimensions field: %s",
-          LowercasePrimitiveTypeName(shape.element_type()),
+          primitive_util::LowercasePrimitiveTypeName(shape.element_type()),
           shape.ShortDebugString());
     }
     if (shape.has_layout()) {
       return InvalidArgument(
           "shape has %s element type, but has layout field: %s",
-          LowercasePrimitiveTypeName(shape.element_type()),
+          primitive_util::LowercasePrimitiveTypeName(shape.element_type()),
           shape.ShortDebugString());
     }
     return Status::OK();
   }
 
-  if (LayoutUtil::IsSparseArray(shape) && Rank(shape) == 0) {
+  if (LayoutUtil::IsSparseArray(shape) && shape.rank() == 0) {
     return InvalidArgument("sparse arrays must have rank > 0");
   }
-  for (int64 i = 0; i < Rank(shape); ++i) {
+  for (int64 i = 0; i < shape.rank(); ++i) {
     int64 dimension = shape.dimensions(i);
     if (dimension < 0) {
       return InvalidArgument(
@@ -898,7 +673,7 @@ StatusOr<Shape> ParseShapeStringInternal(absl::string_view* s) {
 /* static */ Status ShapeUtil::ValidateShapeSize(const Shape& shape) {
   VLOG(3) << "Validating shape size: " << ShapeUtil::HumanString(shape);
 
-  if (!IsArray(shape)) {
+  if (!shape.IsArray()) {
     return Status::OK();
   }
 
@@ -919,7 +694,7 @@ StatusOr<Shape> ParseShapeStringInternal(absl::string_view* s) {
         return sparse_elements_size;
       }
       int64 sparse_indices_size =
-          MultiplyWithoutOverflow(max_sparse_elements, ShapeUtil::Rank(shape));
+          MultiplyWithoutOverflow(max_sparse_elements, shape.rank());
       if (sparse_indices_size < 0) {
         return sparse_indices_size;
       }
@@ -991,7 +766,7 @@ StatusOr<Shape> ParseShapeStringInternal(absl::string_view* s) {
                                           ShapeIndexView index) {
   const Shape* subshape = &shape;
   for (auto i : index) {
-    if (!IsTuple(*subshape) || i >= subshape->tuple_shapes_size() || i < 0) {
+    if (!subshape->IsTuple() || i >= subshape->tuple_shapes_size() || i < 0) {
       return false;
     }
     subshape = &subshape->tuple_shapes(i);
@@ -1003,7 +778,7 @@ StatusOr<Shape> ParseShapeStringInternal(absl::string_view* s) {
                                                  ShapeIndexView index) {
   const Shape* return_shape = &shape;
   for (auto i : index) {
-    CHECK(IsTuple(*return_shape))
+    CHECK(return_shape->IsTuple())
         << "Invalid index " << index << " for shape " << shape;
     return_shape = &return_shape->tuple_shapes(i);
   }
@@ -1014,7 +789,7 @@ StatusOr<Shape> ParseShapeStringInternal(absl::string_view* s) {
     const Shape& shape, ShapeIndexView index) {
   const Shape* return_shape = &shape;
   for (auto i : index) {
-    if (!IsTuple(*return_shape) || i < 0 ||
+    if (!return_shape->IsTuple() || i < 0 ||
         i >= return_shape->tuple_shapes_size()) {
       return InvalidArgument(
           "Shape index %s not a valid subshape index for tuple with shape %s",
@@ -1029,7 +804,7 @@ StatusOr<Shape> ParseShapeStringInternal(absl::string_view* s) {
                                                   ShapeIndexView index) {
   Shape* return_shape = shape;
   for (auto i : index) {
-    CHECK(IsTuple(*return_shape));
+    CHECK(return_shape->IsTuple());
     return_shape = return_shape->mutable_tuple_shapes(i);
   }
   return return_shape;
@@ -1037,11 +812,11 @@ StatusOr<Shape> ParseShapeStringInternal(absl::string_view* s) {
 
 /* static */
 bool ShapeUtil::IsLeafIndex(const Shape& shape, const ShapeIndex& index) {
-  return !IsTuple(GetSubshape(shape, index));
+  return !GetSubshape(shape, index).IsTuple();
 }
 
 /* static */ int64 ShapeUtil::GetLeafCount(const Shape& shape) {
-  if (!IsTuple(shape)) {
+  if (!shape.IsTuple()) {
     return 1;
   }
   int64 count = 0;
@@ -1063,10 +838,15 @@ bool ShapeUtil::IsLeafIndex(const Shape& shape, const ShapeIndex& index) {
 }
 
 /* static */ bool ShapeUtil::HasDegenerateDimensions(const Shape& shape) {
-  CHECK(ShapeUtil::IsArray(shape));
+  CHECK(shape.IsArray());
   return absl::c_linear_search(shape.dimensions(), 1);
 }
 
+/* static */ Shape ShapeUtil::DropDegenerateDimensions(const Shape& shape) {
+  return FilterDimensions(
+      [&](int64 dim) -> bool { return shape.dimensions()[dim] != 1; }, shape);
+}
+
 namespace {
 
 // Helper for ForEachSubshape which visits the subshapes of the given shape in
@@ -1075,7 +855,7 @@ Status ForEachSubshapeHelper(const Shape& shape,
                              const ShapeUtil::StatusVisitorFunction& func,
                              ShapeIndex* index) {
   TF_RETURN_IF_ERROR(func(shape, *index));
-  if (ShapeUtil::IsTuple(shape)) {
+  if (shape.IsTuple()) {
     for (int64 i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) {
       index->push_back(i);
       TF_RETURN_IF_ERROR(ForEachSubshapeHelper(
@@ -1092,7 +872,7 @@ Status ForEachMutableSubshapeHelper(
     Shape* shape, const ShapeUtil::MutatingStatusVisitorFunction& func,
     ShapeIndex* index) {
   TF_RETURN_IF_ERROR(func(shape, *index));
-  if (ShapeUtil::IsTuple(*shape)) {
+  if (shape->IsTuple()) {
     for (int64 i = 0; i < ShapeUtil::TupleElementCount(*shape); ++i) {
       index->push_back(i);
       TF_RETURN_IF_ERROR(ForEachMutableSubshapeHelper(
@@ -1150,6 +930,10 @@ Status ForEachMutableSubshapeHelper(
   for (auto dim : Permute(permutation, shape.dimensions())) {
     new_shape.add_dimensions(dim);
   }
+  for (int64 i = 0; i < shape.rank(); i++) {
+    new_shape.set_dynamic_dimension(permutation[i],
+                                    shape.is_dynamic_dimension(i));
+  }
 
   // If `shape` has a layout, by contract we choose a new layout such that the
   // transpose defined by this permutation is a bitcast.
@@ -1200,8 +984,8 @@ Status ForEachMutableSubshapeHelper(
 /* static */ std::tuple<bool, std::vector<int64>, std::vector<int64>>
 ShapeUtil::InsertedOrDeleted1SizedDimensions(const Shape& shape_pre,
                                              const Shape& shape_post) {
-  CHECK(IsArray(shape_pre));
-  CHECK(IsArray(shape_post));
+  CHECK(shape_pre.IsArray());
+  CHECK(shape_post.IsArray());
 
   auto nil = std::make_tuple(false, std::vector<int64>(), std::vector<int64>());
 
@@ -1248,7 +1032,7 @@ ShapeUtil::InsertedOrDeleted1SizedDimensions(const Shape& shape_pre,
     auto unmodified_dim_pair =
         i < unmodified_dims.size()
             ? unmodified_dims[i]
-            : std::make_pair(Rank(shape_pre), Rank(shape_post));
+            : std::make_pair(shape_pre.rank(), shape_post.rank());
     if (!check_modified_dims(prior_unmodified_dim_pair, unmodified_dim_pair)) {
       return nil;
     }
@@ -1260,8 +1044,8 @@ ShapeUtil::InsertedOrDeleted1SizedDimensions(const Shape& shape_pre,
 /* static */ std::vector<std::pair<int64, int64>>
 ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
                                          const Shape& output_shape) {
-  CHECK(IsArray(input_shape));
-  CHECK(IsArray(output_shape));
+  CHECK(input_shape.IsArray());
+  CHECK(output_shape.IsArray());
 
   // Unmodified dimensions are merely common factors of rank 1.
   auto common_factors = CommonFactors(AsInt64Slice(input_shape.dimensions()),
@@ -1311,8 +1095,8 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
 
 /* static */ bool ShapeUtil::ReshapeIsBitcast(const Shape& input_shape,
                                               const Shape& output_shape) {
-  CHECK(IsArray(input_shape));
-  CHECK(IsArray(output_shape));
+  CHECK(input_shape.IsArray());
+  CHECK(output_shape.IsArray());
   CHECK(LayoutUtil::HasLayout(input_shape));
   CHECK(LayoutUtil::HasLayout(output_shape));
 
@@ -1440,12 +1224,12 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
     Shape output_shape_dim0_major = MakeShapeWithDescendingLayout(
         output_shape.element_type(), AsInt64Slice(output_shape.dimensions()));
 
-    for (int64 input_dim = 0; input_dim < Rank(input_shape); ++input_dim) {
+    for (int64 input_dim = 0; input_dim < input_shape.rank(); ++input_dim) {
       if (input_shape.dimensions(input_dim) <= 1) {
         continue;
       }
 
-      std::vector<int64> input_unit_index(Rank(input_shape), 0);
+      std::vector<int64> input_unit_index(input_shape.rank(), 0);
       input_unit_index[input_dim] = 1;
       int64 logical_linear_index =
           IndexUtil::MultidimensionalIndexToLinearIndex(input_shape_dim0_major,
@@ -1471,11 +1255,48 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
 
 /* static */ absl::optional<Shape> ShapeUtil::AlignLayouts(
     const Shape& input_shape, const Shape& output_shape) {
-  CHECK(IsArray(input_shape));
-  CHECK(IsArray(output_shape));
+  CHECK(input_shape.IsArray());
+  CHECK(output_shape.IsArray());
+  // Removing trivial dimensions from the shape simplifies the alignment
+  // algorithm since ones can go in any position.
+  if (HasDegenerateDimensions(input_shape) ||
+      HasDegenerateDimensions(output_shape)) {
+    auto simple_output_shape =
+        AlignLayouts(DropDegenerateDimensions(input_shape),
+                     DropDegenerateDimensions(output_shape));
+    if (!simple_output_shape) {
+      return absl::nullopt;
+    }
+
+    auto layout = simple_output_shape->layout().minor_to_major();
+    // For each one sized dimension in the output, increment the dimension
+    // numbers in layout that are more minor than the one.
+    absl::InlinedVector<int64, 8> dim_map;
+    dim_map.reserve(simple_output_shape->rank());
+    for (int64 i = 0; i < output_shape.rank(); ++i) {
+      if (output_shape.dimensions(i) != 1) {
+        dim_map.push_back(i);
+      }
+    }
+    for (int64& d : layout) {
+      d = dim_map[d];
+    }
 
-  int64 input_rank = Rank(input_shape);
-  int64 output_rank = Rank(output_shape);
+    // Add the ones in descending order to the layout. Descending layouts tend
+    // to reduce the number of copies inserted in layout assignment.
+    for (int64 i = output_shape.rank() - 1; i >= 0; --i) {
+      if (output_shape.dimensions(i) == 1) {
+        layout.push_back(i);
+      }
+    }
+    Shape output_shape_with_layout = output_shape;
+    *output_shape_with_layout.mutable_layout()->mutable_minor_to_major() =
+        layout;
+    return output_shape_with_layout;
+  }
+
+  int64 input_rank = input_shape.rank();
+  int64 output_rank = output_shape.rank();
 
   // First, calculate an alignment of the dimensions. A consecutive sequence of
   // input dimensions and output dimensions belong to the same alignment part if
@@ -1521,10 +1342,10 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
   if (input_dimension_product != output_dimension_product) {
     return absl::nullopt;
   }
+
   // We also need to store an end element so that we know where the last
   // alignment part ends.
   alignment.push_back({input_rank, output_rank});
-
   // Now check if the physical layout can potentially be aligned to the output
   // shape by changing the physical layout of the output shape. We need to check
   // that all dimension numbers that belong to the same alignment part appear
@@ -1536,40 +1357,23 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
   for (int64 i = 0; i < input_rank;) {
     int64 current_dimension_number = input_dimension_numbers[i];
 
-    // Skip trivial dimensions with a bound of 1.
-    if (input_shape.dimensions(current_dimension_number) == 1) {
-      ++i;
-      continue;
-    }
-
-    // Calculate the number of non-trivial dimension bounds in the input shape
-    // belonging to the current alignment part.
+    // Trivial dimensions are stripped.
+    CHECK_NE(input_shape.dimensions(current_dimension_number), 1);
     const int64 current_alignment_index =
         dimension_to_alignment_index[current_dimension_number];
     // Because of the special end element that we added, we can be sure that
     // 'current_alignment_index' is < alignment.size() - 1.
     CHECK_LT(current_alignment_index, alignment.size() - 1);
-    int64 num_non_trivial_dimensions_in_alignment_part = 0;
-    for (int64 j = alignment[current_alignment_index].first;
-         j < alignment[current_alignment_index + 1].first; ++j) {
-      if (input_shape.dimensions(j) != 1) {
-        ++num_non_trivial_dimensions_in_alignment_part;
-      }
-    }
 
     // Check that the following 'num_non_trivial_dimensions_in_alignment_part'
     // dimension numbers (ignoring dimension numbers with dimension bound 1) are
     // in descending order and belong to the current alignment part.
-    for (int64 j = 0; j < num_non_trivial_dimensions_in_alignment_part;
+    for (int64 j = 0; j < alignment[current_alignment_index + 1].first -
+                              alignment[current_alignment_index].first;
          ++i, ++j) {
       if (i == input_rank) {
         return absl::nullopt;
       }
-      // Skip trivial dimensions with a bound of 1.
-      if (input_shape.dimensions(input_dimension_numbers[i]) == 1) {
-        --j;
-        continue;
-      }
       // If the current dimension number belongs to a different alignment part,
       // or the dimension numbers are not in descending order, we can return
       // early.
@@ -1580,22 +1384,11 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
       }
       current_dimension_number = input_dimension_numbers[i];
     }
-
     // The output dimension numbers that belong to the current alignment part
-    // need to appear in the same descending order as in the input. Again, we
-    // can skip dimensions with a bound of 1.
+    // need to appear in the same descending order as in the input.
     for (int64 j = alignment[current_alignment_index + 1].second - 1;
          j >= alignment[current_alignment_index].second; --j) {
-      if (output_shape.dimensions(j) != 1) {
-        output_layout.push_back(j);
-      }
-    }
-  }
-  // Now add all the dimensions with dimension bound 1 at the end of
-  // 'output_layout'.
-  for (int64 i = 0; i < output_rank; ++i) {
-    if (output_shape.dimensions(i) == 1) {
-      output_layout.push_back(i);
+      output_layout.push_back(j);
     }
   }
   CHECK_EQ(output_layout.size(), output_rank);
@@ -1612,30 +1405,14 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
 
 /* static */ Shape ShapeUtil::DeleteDimension(int64 dim_to_delete,
                                               Shape shape) {
-  CHECK(IsArray(shape));
-  shape.mutable_dimensions()->erase(shape.mutable_dimensions()->begin() +
-                                    dim_to_delete);
-  if (LayoutUtil::HasLayout(shape)) {
-    Layout* layout = shape.mutable_layout();
-    layout->set_format(DENSE);
-    for (size_t i = 0; i < layout->minor_to_major().size();) {
-      if (layout->minor_to_major(i) == dim_to_delete) {
-        layout->mutable_minor_to_major()->erase(
-            layout->minor_to_major().begin() + i);
-        continue;
-      }
-      if (layout->minor_to_major(i) > dim_to_delete) {
-        (*layout->mutable_minor_to_major())[i] -= 1;
-      }
-      ++i;
-    }
-  }
+  CHECK(shape.IsArray());
+  shape.DeleteDimension(dim_to_delete);
   return shape;
 }
 
 /* static */ Shape ShapeUtil::FilterDimensions(
     const std::function<bool(int64)>& p, Shape shape) {
-  CHECK(IsArray(shape));
+  CHECK(shape.IsArray());
   std::vector<int64> dims_to_delete;
   for (int64 i = shape.dimensions().size() - 1; i >= 0; --i) {
     if (!p(i)) {
@@ -1655,8 +1432,11 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
   size_t hash_value = hash<PrimitiveType>()(shape.element_type());
 
   if (shape.tuple_shapes().empty()) {
-    for (int64 dim : shape.dimensions()) {
-      hash_value = Hash64Combine(hash_value, hash<int64>()(dim));
+    for (int i = 0; i < shape.dimensions_size(); ++i) {
+      hash_value =
+          Hash64Combine(hash_value, hash<int64>()(shape.dimensions(i)));
+      hash_value = Hash64Combine(hash_value,
+                                 hash<bool>()(shape.is_dynamic_dimension(i)));
     }
 
     hash_value = Hash64Combine(hash_value, LayoutUtil::Hash(shape.layout()));
diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h
index 84a27f662a57ba274562e2e9be57b7e971c9b477..7f610a6085d6fbe3d3143d5027cdc43d4b07bcbf 100644
--- a/tensorflow/compiler/xla/shape_util.h
+++ b/tensorflow/compiler/xla/shape_util.h
@@ -185,7 +185,7 @@ class ShapeUtil {
   // may not actually be able to store this number of elements. See
   // LayoutUtil::MaxSparseElements(shape) to obtain the maximum number of
   // elements that can be stored in a sparse shape.
-  // Precondition: IsArray(shape)
+  // Precondition: shape.IsArray()
   static int64 ElementsIn(const Shape& shape);
 
   // As ElementsIn(), but recurses through tuples.
@@ -207,7 +207,7 @@ class ShapeUtil {
 
   // Returns the number of bytes used to store the primitive_type.
   //
-  // Precondition: ShapeUtil::IsArray(shape)
+  // Precondition: shape.IsArray()
   static int64 ByteSizeOfPrimitiveType(PrimitiveType primitive_type);
 
   // Returns the number of bytes required to store the tuple member pointers for
@@ -241,10 +241,6 @@ class ShapeUtil {
   // (param_name: f32[42x12], ...) -> f32[24x42]
   static string HumanString(const ProgramShape& program_shape);
 
-  // Parses a ShapeUtil::HumanString-format shape string back into a shape
-  // object.
-  static StatusOr<Shape> ParseShapeString(absl::string_view s);
-
   // Returns whether the LHS and RHS shapes have the same dimensions; note: does
   // not check element type.
   // Precondition: IsArray(lhs) && IsArray(rhs)
@@ -266,7 +262,7 @@ class ShapeUtil {
   }
 
   // Returns the higher-precision element type if a and b are both floating
-  // point types; otherwise, checks that that they have the same element type
+  // point types; otherwise, checks that they have the same element type
   // and returns it.
   static PrimitiveType HigherPrecisionElementType(const Shape& a,
                                                   const Shape& b) {
@@ -294,16 +290,12 @@ class ShapeUtil {
   // being F32. Tuple elements are compared recursively for compatibility.
   static bool CompatibleIgnoringFpPrecision(const Shape& lhs, const Shape& rhs);
 
-  // Returns whether the lhs and rhs shapes are identical protobufs.
+  // Returns whether the lhs and rhs shapes are identical.
   static bool Equal(const Shape& lhs, const Shape& rhs);
 
   // As Equal, but allow one of lhs and rhs to be F16 while the other is F32.
   static bool EqualIgnoringFpPrecision(const Shape& lhs, const Shape& rhs);
 
-  // Returns the rank (number of dimensions) of the given shape.
-  // Precondition: !IsTuple(shape)
-  static int64 Rank(const Shape& shape);
-
   // Returns the number of dimensions for which the dimension is not (trivially)
   // 1. e.g., f32[2x1x1] has a true rank of 1D, the other dimensions are just
   // fluff. Note that zero dimensions are included in the true rank, e.g.,
@@ -317,10 +309,10 @@ class ShapeUtil {
   // Scalar-specific
 
   static bool IsScalar(const Shape& shape) {
-    return IsArray(shape) && Rank(shape) == 0;
+    return shape.IsArray() && shape.rank() == 0;
   }
   static bool IsEffectiveScalar(const Shape& shape) {
-    return IsArray(shape) && TrueRank(shape) == 0;
+    return shape.IsArray() && TrueRank(shape) == 0;
   }
 
   // Returns whether "shape" is a scalar (array) with the given element_type.
@@ -375,11 +367,24 @@ class ShapeUtil {
   static Shape MakeShape(PrimitiveType element_type,
                          absl::Span<const int64> dimensions);
 
+  // Constructs a new shape with the given element type and sequence of
+  // potentially dynamic dimensions. The argument 'dynamic_dimensions' indicates
+  // with a true value that the respective dimension is dynamic. If the
+  // dimension is dynamic then the respective value in 'dimension' is an upper
+  // bound on the dimension size. 'dimensions' and 'dynamic_dimensions' must be
+  // the same size.
+  static Shape MakeShape(PrimitiveType element_type,
+                         absl::Span<const int64> dimensions,
+                         const std::vector<bool>& dynamic_dimensions);
+
   // Constructs a new shape with the given element type and sequence of
   // dimensions. Method checks if the element type is valid and the shape's
   // size fits in std::numeric_limits<int64>::max().
   static StatusOr<Shape> MakeValidatedShape(PrimitiveType element_type,
                                             absl::Span<const int64> dimensions);
+  static StatusOr<Shape> MakeValidatedShape(
+      PrimitiveType element_type, absl::Span<const int64> dimensions,
+      const std::vector<bool>& dynamic_dimensions);
 
   // Creates a Shape with element type corresponding to T and the given
   // dimensions
@@ -393,7 +398,9 @@ class ShapeUtil {
   // Returns a value shape such that shape.has_layout().
   static Shape MakeShapeWithLayout(PrimitiveType element_type,
                                    absl::Span<const int64> dimensions,
-                                   absl::Span<const int64> minor_to_major);
+                                   absl::Span<const int64> minor_to_major,
+                                   absl::Span<const Tile> tiles = {},
+                                   int64 element_size_in_bits = 0);
 
   static Shape MakeShapeWithSparseLayout(PrimitiveType element_type,
                                          absl::Span<const int64> dimensions,
@@ -447,27 +454,6 @@ class ShapeUtil {
   // that floating point numbers are signed.
   static bool ElementIsSigned(const Shape& shape);
 
-  // Returns whether the shape is a tuple.
-  static bool IsTuple(const Shape& shape) {
-    return shape.element_type() == TUPLE;
-  }
-
-  // Returns whether the shape is an opaque value (i.e. an 'existential' typed
-  // value that is passed to CustomCall operations).
-  static bool IsOpaque(const Shape& shape) {
-    return shape.element_type() == OPAQUE;
-  }
-
-  // Returns whether the shape is an token value used for ordering
-  // side-effecting operations.
-  static bool IsToken(const Shape& shape) {
-    return shape.element_type() == TOKEN;
-  }
-
-  // Returns whether the shape is an array.  Note that scalars are considered
-  // arrays.
-  static bool IsArray(const Shape& shape);
-
   // Returns whether the given primitive type corresponds to an array shape.
   static bool IsArrayPrimitiveType(PrimitiveType primitive_type);
 
@@ -497,12 +483,6 @@ class ShapeUtil {
   // shape.
   static Shape ComplexComponentShape(const Shape& complex_shape);
 
-  // Shorthand for testing whether a shape is of a given element type and
-  // sequence of dimensions.
-  ABSL_DEPRECATED("Use Equal() instead.")
-  static bool ShapeIs(const Shape& shape, PrimitiveType element_type,
-                      std::initializer_list<int64> dimensions);
-
   // Returns true if the given shape has a subshape at the given index.
   static bool IndexIsValid(const Shape& shape, ShapeIndexView index);
 
@@ -551,6 +531,9 @@ class ShapeUtil {
   // (dimensions with bound 1).
   static bool HasDegenerateDimensions(const Shape& shape);
 
+  // Drops any degenerate dimensions (i.e. dimensions of size 1)
+  static Shape DropDegenerateDimensions(const Shape& shape);
+
   // Permutes the dimensions by the given permutation, so
   // return_value.dimensions[permutation[i]] = argument.dimensions[i].
   //
@@ -694,11 +677,9 @@ class ShapeUtil {
 
   template <typename FnType>
   static void ForEachIndex(const Shape& shape, const FnType& visitor_function) {
-    ForEachIndexWithStatus(shape,
-                           [&](absl::Span<const int64> indices) {
-                             return StatusOr<bool>(visitor_function(indices));
-                           })
-        .IgnoreError();
+    ForEachIndexWithStatus(shape, [&](absl::Span<const int64> indices) {
+      return StatusOr<bool>(visitor_function(indices));
+    }).IgnoreError();
   }
 
   // A parallel version of ForEachIndex(WithStatus). This can only be used if
@@ -747,7 +728,7 @@ class ShapeUtil {
     if (ShapeUtil::IsZeroElementArray(shape)) {
       return Status::OK();
     }
-    CHECK_EQ(Rank(shape), base.size());
+    CHECK_EQ(shape.rank(), base.size());
     CHECK_EQ(incr.size(), base.size());
     CHECK_EQ(count.size(), base.size());
     const int64 rank = LayoutUtil::MinorToMajor(shape).size();
diff --git a/tensorflow/compiler/xla/shape_util_test.cc b/tensorflow/compiler/xla/shape_util_test.cc
index 60bdbe302045e6f3b4bae500c50bc68fb217525d..020b062f6b1b032bab958772d3a6a1e35daee38b 100644
--- a/tensorflow/compiler/xla/shape_util_test.cc
+++ b/tensorflow/compiler/xla/shape_util_test.cc
@@ -82,102 +82,6 @@ TEST(ShapeUtilTest, Rank4DimensionIndexing) {
   ASSERT_EQ(3, shape.dimensions(0));
 }
 
-TEST(ShapeUtilTest, ParseShapeStringR2F32) {
-  string shape_string = "f32[123,456]";
-  TF_ASSERT_OK_AND_ASSIGN(Shape actual,
-                          ShapeUtil::ParseShapeString(shape_string));
-  Shape expected = ShapeUtil::MakeShape(F32, {123, 456});
-  ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
-      << "expected: " << ShapeUtil::HumanString(expected)
-      << "actual:   " << ShapeUtil::HumanString(actual);
-}
-
-TEST(ShapeUtilTest, ParseShapeStringTupleOfArrays) {
-  string shape_string = "(f32[1572864],s8[5120,1024])";
-  TF_ASSERT_OK_AND_ASSIGN(Shape actual,
-                          ShapeUtil::ParseShapeString(shape_string));
-  Shape expected =
-      ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {1572864}),
-                                 ShapeUtil::MakeShape(S8, {5120, 1024})});
-  ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
-      << "expected: " << ShapeUtil::HumanString(expected)
-      << "actual:   " << ShapeUtil::HumanString(actual);
-}
-
-TEST(ShapeUtilTest, ParseShapeStringNestedTuple) {
-  string shape_string = "(f32[1],(f32[2], token[]), opaque[], f32[3])";
-  TF_ASSERT_OK_AND_ASSIGN(Shape actual,
-                          ShapeUtil::ParseShapeString(shape_string));
-  Shape expected = ShapeUtil::MakeTupleShape({
-      ShapeUtil::MakeShape(F32, {1}),
-      ShapeUtil::MakeTupleShape(
-          {ShapeUtil::MakeShape(F32, {2}), ShapeUtil::MakeTokenShape()}),
-      ShapeUtil::MakeOpaqueShape(),
-      ShapeUtil::MakeShape(F32, {3}),
-  });
-  ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
-      << "expected: " << ShapeUtil::HumanString(expected)
-      << "actual:   " << ShapeUtil::HumanString(actual);
-}
-
-TEST(ShapeUtilTest, ParseShapeStringWithLayout) {
-  string shape_string = "f32[123,456]{0,1}";
-  TF_ASSERT_OK_AND_ASSIGN(Shape actual,
-                          ShapeUtil::ParseShapeString(shape_string));
-  Shape expected = ShapeUtil::MakeShapeWithLayout(F32, {123, 456}, {0, 1});
-  ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
-      << "expected: " << ShapeUtil::HumanString(expected)
-      << "actual:   " << ShapeUtil::HumanString(actual);
-}
-
-TEST(ShapeUtilTest, ParseShapeStringWithExplicitDenseLayout) {
-  string shape_string = "f32[123,456]dense{0,1}";
-  TF_ASSERT_OK_AND_ASSIGN(Shape actual,
-                          ShapeUtil::ParseShapeString(shape_string));
-  Shape expected = ShapeUtil::MakeShapeWithLayout(F32, {123, 456}, {0, 1});
-  ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
-      << "expected: " << ShapeUtil::HumanString(expected)
-      << "actual:   " << ShapeUtil::HumanString(actual);
-}
-
-TEST(ShapeUtilTest, ParseShapeStringWithSparseLayout) {
-  string shape_string = "f32[123,456]sparse{10}";
-  TF_ASSERT_OK_AND_ASSIGN(Shape actual,
-                          ShapeUtil::ParseShapeString(shape_string));
-  Shape expected = ShapeUtil::MakeShapeWithSparseLayout(F32, {123, 456}, 10);
-  ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
-      << "expected: " << ShapeUtil::HumanString(expected)
-      << "actual: " << ShapeUtil::HumanString(actual);
-}
-
-TEST(ShapeUtilTest, ParseOpaqueType) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape actual,
-                          ShapeUtil::ParseShapeString("opaque[]"));
-  Shape expected = ShapeUtil::MakeOpaqueShape();
-  ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
-      << "expected: " << ShapeUtil::HumanString(expected)
-      << "actual:   " << ShapeUtil::HumanString(actual);
-}
-
-TEST(ShapeUtilTest, ParseTokenType) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape actual, ShapeUtil::ParseShapeString("token[]"));
-  Shape expected = ShapeUtil::MakeTokenShape();
-  ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
-      << "expected: " << ShapeUtil::HumanString(expected)
-      << "actual:   " << ShapeUtil::HumanString(actual);
-}
-
-TEST(ShapeUtilTest, ParseInvalidShapeString) {
-  string shape_strings[] = {
-      "f32[123,456]foobar{0,1}", "f32[123,456]sparse{0,1}", "f32[123,456]{foo}",
-      "f32[123,456]dense{foo}",  "f32[123,456]sparse{foo}",
-  };
-  for (const string& shape_string : shape_strings) {
-    StatusOr<Shape> result = ShapeUtil::ParseShapeString(shape_string);
-    ASSERT_FALSE(result.ok()) << "shape: " << shape_string;
-  }
-}
-
 TEST(ShapeUtilTest, CompatibleIdenticalShapes) {
   Shape shape1 = ShapeUtil::MakeShape(F32, {3, 2});
   Shape shape2 = ShapeUtil::MakeShape(F32, {3, 2});
@@ -272,6 +176,28 @@ TEST(ShapeUtilTest, UnequalIgnoringFpPrecision) {
       ShapeUtil::MakeShapeWithLayout(PRED, {4, 3}, {0, 1})));
 }
 
+TEST(ShapeUtilTest, EqualDynamicShapes) {
+  EXPECT_TRUE(
+      ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {4, 3}, {true, false}),
+                       ShapeUtil::MakeShape(F32, {4, 3}, {true, false})));
+  EXPECT_FALSE(
+      ShapeUtil::Equal(ShapeUtil::MakeShape(F32, {4, 3}, {true, false}),
+                       ShapeUtil::MakeShape(F32, {4, 3}, {false, false})));
+}
+
+TEST(ShapeUtilTest, CompatibleDynamicShapes) {
+  Shape shape_a = ShapeUtil::MakeShape(F32, {4, 3}, {true, false});
+  *shape_a.mutable_layout() = Layout({1, 0});
+  Shape shape_b = ShapeUtil::MakeShape(F32, {4, 3}, {true, false});
+  *shape_b.mutable_layout() = Layout({0, 1});
+  Shape shape_c = ShapeUtil::MakeShape(F32, {4, 3}, {false, true});
+  *shape_c.mutable_layout() = Layout({0, 1});
+
+  EXPECT_TRUE(ShapeUtil::Compatible(shape_a, shape_a));
+  EXPECT_TRUE(ShapeUtil::Compatible(shape_a, shape_b));
+  EXPECT_FALSE(ShapeUtil::Compatible(shape_a, shape_c));
+}
+
 TEST(ShapeUtilTest, CompatibleTuples) {
   Shape tuple1 = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShape(F32, {3, 2}), ShapeUtil::MakeShape(PRED, {4, 5})});
@@ -612,10 +538,6 @@ TEST(ShapeUtilTest, InsertedOrDeleted1SizedDimensions) {
       ShapeUtil::InsertedOrDeleted1SizedDimensions(shape0, shape2)));
 }
 
-TEST(ShapeUtilTest, ShapeIs) {
-  EXPECT_FALSE(ShapeUtil::ShapeIs(ShapeUtil::MakeShape(PRED, {2}), PRED, {}));
-}
-
 TEST(ShapeUtilTest, ForEachIndex) {
   struct ShapeDimensionAndNumberInvocations {
     std::vector<int64> dimensions;
@@ -788,6 +710,26 @@ TEST(ShapeUtilTest, PermuteDimensionsLayout) {
   } while (std::next_permutation(layout.begin(), layout.end()));
 }
 
+TEST(ShapeUtilTest, PermuteDynamicDimensions) {
+  Shape shape =
+      ShapeUtil::MakeShape(F32, {10, 100, 1000},
+                           /*dynamic_dimensions*/ {false, true, true});
+  SCOPED_TRACE(absl::StrCat("shape=", shape.ToString()));
+
+  std::vector<int64> permutation(3);
+  std::iota(permutation.begin(), permutation.end(), 0);
+  do {
+    SCOPED_TRACE(absl::StrCat("permutation=", absl::StrJoin(permutation, ",")));
+
+    auto permuted = ShapeUtil::PermuteDimensions(permutation, shape);
+    for (int i = 0; i < shape.rank(); i++) {
+      EXPECT_EQ(permuted.dimensions(permutation[i]), shape.dimensions(i));
+      EXPECT_EQ(permuted.is_dynamic_dimension(permutation[i]),
+                shape.is_dynamic_dimension(i));
+    }
+  } while (std::next_permutation(permutation.begin(), permutation.end()));
+}
+
 TEST(AlgebraicSimplifierTest, ReshapeIsBitcast_3x2x2_6x2_Dim0IsMostMinor) {
   EXPECT_FALSE(ShapeUtil::ReshapeIsBitcast(
       ShapeUtil::MakeShapeWithLayout(F32, {3, 2, 2}, {0, 1, 2}),
@@ -819,8 +761,15 @@ TEST(AlignmentTest, AlignLayoutsWithTrivialDimensions) {
   auto aligned_shape = ShapeUtil::AlignLayouts(
       input, ShapeUtil::MakeShape(xla::F32, {1, 4, 1, 3, 2, 7, 5, 11, 1}));
   EXPECT_TRUE(aligned_shape);
-  EXPECT_THAT(aligned_shape.value().layout().minor_to_major(),
-              ElementsAre(6, 5, 4, 3, 1, 7, 0, 2, 8));
+  EXPECT_TRUE(ShapeUtil::ReshapeIsBitcast(input, aligned_shape.value()));
+}
+
+TEST(AlignmentTest, AlignLayoutsWithAllTrivialDimensions) {
+  Shape input =
+      ShapeUtil::MakeShapeWithLayout(xla::F32, {1, 1, 1, 1}, {0, 1, 3, 2});
+  auto aligned_shape = ShapeUtil::AlignLayouts(
+      input, ShapeUtil::MakeShape(xla::F32, {1, 1, 1, 1, 1}));
+  EXPECT_TRUE(aligned_shape);
   EXPECT_TRUE(ShapeUtil::ReshapeIsBitcast(input, aligned_shape.value()));
 }
 
diff --git a/tensorflow/compiler/xla/sparse_index_array.cc b/tensorflow/compiler/xla/sparse_index_array.cc
index a40bb7875e7ea53a8959a9a67ec09ec260ba9c37..82091bdee65c709bb6020f40acc15f13d8599c1d 100644
--- a/tensorflow/compiler/xla/sparse_index_array.cc
+++ b/tensorflow/compiler/xla/sparse_index_array.cc
@@ -79,7 +79,7 @@ void SparseIndexArray::Resize(int64 num_indices) {
 }
 
 bool SparseIndexArray::Validate(const Shape& shape) const {
-  if (rank_ == 0 || rank_ != ShapeUtil::Rank(shape)) {
+  if (rank_ == 0 || rank_ != shape.rank()) {
     return false;
   }
   int64 num_indices = index_count();
diff --git a/tensorflow/compiler/xla/sparse_index_array.h b/tensorflow/compiler/xla/sparse_index_array.h
index a96d483462efd77ae4761541e8c79b2c84fa49f3..0c25355467da3fd346d80db790d78252869975ef 100644
--- a/tensorflow/compiler/xla/sparse_index_array.h
+++ b/tensorflow/compiler/xla/sparse_index_array.h
@@ -135,7 +135,7 @@ void SparseIndexArray::SortWithValues(absl::Span<NativeT> values) {
   auto sort_order_less = [this](int64 lhs, int64 rhs) {
     return IndexUtil::CompareIndices(At(lhs), At(rhs)) < 0;
   };
-  std::sort(sort_order.begin(), sort_order.end(), sort_order_less);
+  absl::c_sort(sort_order, sort_order_less);
 
   // Reorder the array elements according to sort_order.  Work through the array
   // and follow cycles so we can do the reorder in-place.
diff --git a/tensorflow/compiler/xla/status_macros.cc b/tensorflow/compiler/xla/status_macros.cc
index b88fe367d7416a26c1147fd5e10fb20772814fe5..aa7238f07d432aabb44d2cbed66786217e6a846c 100644
--- a/tensorflow/compiler/xla/status_macros.cc
+++ b/tensorflow/compiler/xla/status_macros.cc
@@ -25,6 +25,13 @@ limitations under the License.
 namespace xla {
 namespace status_macros {
 
+ABSL_CONST_INIT const char kPossibleAutoJitAlternative[] =
+    "This error might be occurring with the use of xla.compile. If it is not "
+    "necessary that every Op be compiled with XLA, an alternative is to use "
+    "auto_jit with OptimizerOptions.global_jit_level = ON_2 or the environment "
+    "variable TF_XLA_FLAGS=\"tf_xla_auto_jit=2\" which will attempt to use xla "
+    "to compile as much of the graph as the compiler is able to.";
+
 static Status MakeStatus(tensorflow::error::Code code, const string& message) {
   return Status(code, message);
 }
diff --git a/tensorflow/compiler/xla/status_macros.h b/tensorflow/compiler/xla/status_macros.h
index e51dd64e2a3dc7c359918cb08c6c94b2b4d9e91b..315136acc71670fa3ad48da4dc064e384ddadaa9 100644
--- a/tensorflow/compiler/xla/status_macros.h
+++ b/tensorflow/compiler/xla/status_macros.h
@@ -30,6 +30,10 @@ limitations under the License.
 namespace xla {
 namespace status_macros {
 
+// This is a useful error message when encountering XLA Compiler errors that
+// could be handled with the non-strict AutoJit mode.
+extern const char kPossibleAutoJitAlternative[];
+
 // Stream object used to collect error messages in MAKE_ERROR macros
 // or append error messages with APPEND_ERROR.  It accepts any
 // arguments with operator<< to build an error string, and then has an
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 5a7a4faa7e89b27fb537f20d94c21cb4a76e000d..562854756628df64fbf92d40af859f8b218b0cc2 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -1,6 +1,13 @@
 # Description:
 #   Base testing infrastructure for XLA.
 
+load("//tensorflow/compiler/xla/tests:build_defs.bzl", "generate_backend_suites", "generate_backend_test_macros", "xla_test", "xla_test_library")
+load(
+    "//tensorflow/core:platform/default/build_config_root.bzl",
+    "tf_cuda_tests_tags",
+)
+load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test")
+
 licenses(["notice"])  # Apache 2.0
 
 package(
@@ -23,17 +30,6 @@ filegroup(
     ]),
 )
 
-load("//tensorflow/compiler/xla/tests:build_defs.bzl", "xla_test")
-load("//tensorflow/compiler/xla/tests:build_defs.bzl", "xla_test_library")
-load("//tensorflow/compiler/xla/tests:build_defs.bzl", "generate_backend_suites")
-load("//tensorflow/compiler/xla/tests:build_defs.bzl", "generate_backend_test_macros")
-load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
-load(
-    "//tensorflow/core:platform/default/build_config_root.bzl",
-    "tf_cuda_tests_tags",
-)
-
 # Generate test_suites for all backends, named "${backend}_tests".
 generate_backend_suites()
 
@@ -75,6 +71,7 @@ cc_library(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_casting_utils",
         "//tensorflow/compiler/xla/service:hlo_dataflow_analysis",
         "//tensorflow/compiler/xla/service:hlo_verifier",
         "//tensorflow/compiler/xla/service:transfer_manager",
@@ -280,9 +277,6 @@ cc_library(
 xla_test(
     name = "bad_rng_shape_validation_test",
     srcs = ["bad_rng_shape_validation_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:test",
@@ -319,6 +313,31 @@ xla_test(
     ],
 )
 
+xla_test(
+    name = "conv_depthwise_backprop_filter_test",
+    timeout = "long",
+    srcs = ["conv_depthwise_backprop_filter_test.cc"],
+    # these backends do not natively handle batch group counts.
+    blacklisted_backends = [
+        "gpu",
+        "cpu",
+    ],
+    shard_count = 6,
+    deps = [
+        "//tensorflow/compiler/xla:execution_options_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/client:xla_computation",
+        "//tensorflow/compiler/xla/service:bfloat16_normalization",
+        "//tensorflow/compiler/xla/service:despecializer",
+        "//tensorflow/compiler/xla/service:hlo_parser",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "@com_google_absl//absl/types:optional",
+    ],
+)
+
 xla_test(
     name = "grouped_convolution_test",
     timeout = "long",
@@ -348,9 +367,6 @@ xla_test(
 xla_test(
     name = "check_execution_arity_test",
     srcs = ["check_execution_arity_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
@@ -371,9 +387,6 @@ xla_test(
 xla_test(
     name = "query_inferred_shape_test",
     srcs = ["query_inferred_shape_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
@@ -391,9 +404,6 @@ xla_test(
 xla_test(
     name = "while_test",
     srcs = ["while_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
@@ -417,6 +427,10 @@ xla_test(
 xla_test(
     name = "xla_hlo_profile_test",
     srcs = ["xla_hlo_profile_test.cc"],
+    blacklisted_backends = [
+        # Hlo profiles are not supported on the interpreter backend.
+        "interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:shape_util",
@@ -440,9 +454,6 @@ xla_test(
 xla_test(
     name = "axpy_simple_test",
     srcs = ["axpy_simple_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
@@ -457,7 +468,6 @@ xla_test(
 xla_test(
     name = "map_test",
     srcs = ["map_test.cc"],
-    tags = ["enable_for_xla_interpreter"],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:literal",
@@ -510,9 +520,6 @@ xla_test(
 xla_test(
     name = "pred_test",
     srcs = ["pred_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla/client:local_client",
@@ -528,9 +535,6 @@ xla_test(
 xla_test(
     name = "select_test",
     srcs = ["select_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:global_data",
@@ -548,7 +552,6 @@ xla_test(
 xla_test(
     name = "conditional_test",
     srcs = ["conditional_test.cc"],
-    tags = ["enable_for_xla_interpreter"],
     deps = [
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:global_data",
@@ -566,7 +569,6 @@ xla_test(
 xla_test(
     name = "unary_op_test",
     srcs = ["unary_op_test.cc"],
-    tags = ["enable_for_xla_interpreter"],
     deps = [
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:global_data",
@@ -627,9 +629,6 @@ xla_test(
 xla_test(
     name = "deconstruct_tuple_test",
     srcs = ["deconstruct_tuple_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
@@ -652,7 +651,6 @@ xla_test(
     name = "array_elementwise_ops_test",
     srcs = ["array_elementwise_ops_test.cc"],
     shard_count = 25,
-    tags = ["enable_for_xla_interpreter"],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array3d",
@@ -677,22 +675,19 @@ xla_test(
 
 xla_test(
     name = "exhaustive_f32_elementwise_op_test",
-    size = "enormous",
     srcs = ["exhaustive_f32_elementwise_op_test.cc"],
-    backends = [
-        "cpu",
-        "gpu",
-    ],
+    real_hardware_only = True,  # Very slow on the interpreter.
     shard_count = 48,
     tags = [
-        "broken",
-        "manual",
-        "notap",
+        "optonly",
+        # This is a big test that we skip for capacity reasons in OSS testing.
+        "no_oss",
     ],
     deps = [
         ":client_library_test_base",
         ":literal_test_util",
         "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client/lib:math",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/base",
@@ -702,7 +697,6 @@ xla_test(
 xla_test(
     name = "reduce_precision_test",
     srcs = ["reduce_precision_test.cc"],
-    tags = ["enable_for_xla_interpreter"],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:literal",
@@ -729,7 +723,6 @@ xla_test(
     srcs = ["dot_operation_test.cc"],
     shard_count = 20,
     tags = [
-        "enable_for_xla_interpreter",
         "optonly",
     ],
     deps = [
@@ -739,7 +732,9 @@ xla_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client/lib:matrix",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -796,7 +791,9 @@ xla_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client/lib:matrix",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -810,9 +807,6 @@ xla_test(
 xla_test(
     name = "transpose_test",
     srcs = ["transpose_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:reference_util",
@@ -832,9 +826,6 @@ xla_test(
 xla_test(
     name = "constants_test",
     srcs = ["constants_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array3d",
@@ -845,7 +836,9 @@ xla_test(
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/client:xla_computation",
+        "//tensorflow/compiler/xla/client/lib:constants",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
@@ -954,6 +947,11 @@ xla_test(
 xla_test(
     name = "batch_normalization_test",
     srcs = ["batch_normalization_test.cc"],
+    blacklisted_backends = [
+        # BatchNorm HLOs are not handled by the interpreter backend, and the
+        # BatchNorm expander is not run on the interpreter.
+        "interpreter",
+    ],
     shard_count = 40,
     deps = [
         ":test_utils",
@@ -1045,9 +1043,6 @@ xla_test(
     name = "slice_test",
     srcs = ["slice_test.cc"],
     shard_count = 40,
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:reference_util",
@@ -1068,9 +1063,6 @@ xla_test(
 xla_test(
     name = "multidimensional_slice_test",
     srcs = ["multidimensional_slice_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array3d",
@@ -1088,9 +1080,6 @@ xla_test(
     name = "dynamic_ops_test",
     timeout = "moderate",
     srcs = ["dynamic_ops_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:reference_util",
@@ -1116,9 +1105,6 @@ xla_test(
 xla_test(
     name = "tuple_test",
     srcs = ["tuple_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:literal",
@@ -1142,9 +1128,6 @@ xla_test(
 xla_test(
     name = "vector_ops_reduce_test",
     srcs = ["vector_ops_reduce_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array3d",
@@ -1163,9 +1146,8 @@ xla_test(
 xla_test(
     name = "reduce_test",
     srcs = ["reduce_test.cc"],
-    shard_count = 40,
+    shard_count = 31,
     tags = [
-        "enable_for_xla_interpreter",
         "optonly",
     ],
     deps = [
@@ -1232,7 +1214,6 @@ xla_test(
     srcs = [],
     shard_count = 20,
     tags = [
-        "enable_for_xla_interpreter",
         "optonly",
     ],
     xla_test_library_deps = [":reduce_window_test_library"],
@@ -1244,7 +1225,6 @@ xla_test(
     timeout = "long",
     srcs = ["select_and_scatter_test.cc"],
     tags = [
-        "enable_for_xla_interpreter",
         "optonly",
     ],
     deps = [
@@ -1270,9 +1250,6 @@ xla_test(
 xla_test(
     name = "copy_test",
     srcs = ["copy_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         ":client_library_test_base",
         "//tensorflow/compiler/xla:array2d",
@@ -1293,9 +1270,6 @@ xla_test(
 xla_test(
     name = "reduce_hlo_test",
     srcs = ["reduce_hlo_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
@@ -1309,9 +1283,6 @@ xla_test(
 xla_test(
     name = "token_hlo_test",
     srcs = ["token_hlo_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/service:hlo_verifier",
@@ -1326,9 +1297,6 @@ xla_test(
 xla_test(
     name = "call_test",
     srcs = ["call_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:literal_util",
@@ -1348,6 +1316,7 @@ xla_test(
 xla_test(
     name = "custom_call_test",
     srcs = ["custom_call_test.cc"],
+    backends = ["cpu"],
     deps = [
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:literal_util",
@@ -1370,9 +1339,6 @@ xla_test(
 xla_test(
     name = "binop_scaling_test",
     srcs = ["binop_scaling_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array4d",
@@ -1390,9 +1356,6 @@ xla_test(
 xla_test(
     name = "broadcast_simple_test",
     srcs = ["broadcast_simple_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array4d",
@@ -1412,9 +1375,6 @@ xla_test(
 xla_test(
     name = "pad_test",
     srcs = ["pad_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array4d",
@@ -1434,11 +1394,8 @@ xla_test(
 )
 
 xla_test(
-    name = "fmax_test",
-    srcs = ["fmax_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
+    name = "fmax_fmin_test",
+    srcs = ["fmax_fmin_test.cc"],
     deps = [
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
@@ -1453,9 +1410,6 @@ xla_test(
 xla_test(
     name = "log_test",
     srcs = ["log_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
@@ -1470,9 +1424,6 @@ xla_test(
 xla_test(
     name = "matrix_ops_simple_test",
     srcs = ["matrix_ops_simple_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:literal",
@@ -1519,9 +1470,6 @@ xla_test(
     name = "reshape_test",
     srcs = ["reshape_test.cc"],
     shard_count = 30,
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array4d",
@@ -1547,9 +1495,6 @@ xla_test(
 xla_test(
     name = "reverse_test",
     srcs = ["reverse_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array4d",
@@ -1568,9 +1513,6 @@ xla_test(
 xla_test(
     name = "vector_ops_simple_test",
     srcs = ["vector_ops_simple_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array4d",
         "//tensorflow/compiler/xla:shape_util",
@@ -1594,9 +1536,6 @@ xla_test(
 xla_test(
     name = "concat_test",
     srcs = ["concat_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array3d",
@@ -1617,9 +1556,6 @@ xla_test(
 xla_test(
     name = "convert_test",
     srcs = ["convert_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:xla_data_proto",
@@ -1637,8 +1573,12 @@ xla_test(
 )
 
 xla_test(
-    name = "cross_replica_sum_test",
-    srcs = ["cross_replica_sum_test.cc"],
+    name = "all_reduce_test",
+    srcs = ["all_reduce_test.cc"],
+    blacklisted_backends = [
+        # All reduce is not supported on the interpreter backend.
+        "interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
@@ -1663,9 +1603,6 @@ xla_test(
 xla_test(
     name = "bitcast_convert_test",
     srcs = ["bitcast_convert_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:xla_data_proto",
@@ -1705,9 +1642,6 @@ xla_test(
 xla_test(
     name = "floor_ceil_test",
     srcs = ["floor_ceil_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
@@ -1769,6 +1703,10 @@ xla_test(
 xla_test(
     name = "execution_profile_test",
     srcs = ["execution_profile_test.cc"],
+    blacklisted_backends = [
+        # Execution profiles are not supported on the interpreter backend.
+        "interpreter",
+    ],
     deps = [
         ":client_library_test_base",
         "//tensorflow/compiler/xla/client:global_data",
@@ -1783,6 +1721,10 @@ xla_test(
     name = "execution_profile_test_with_xla_hlo_profile",
     srcs = ["execution_profile_test.cc"],
     args = ["--xla_hlo_profile"],
+    blacklisted_backends = [
+        # Hlo profiles are not supported on the interpreter backend.
+        "interpreter",
+    ],
     deps = [
         ":client_library_test_base",
         "//tensorflow/compiler/xla/client:global_data",
@@ -1796,9 +1738,6 @@ xla_test(
 xla_test(
     name = "replay_test",
     srcs = ["replay_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:protobuf_util",
@@ -1821,9 +1760,6 @@ xla_test(
 xla_test(
     name = "broadcast_test",
     srcs = ["broadcast_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
@@ -1885,9 +1821,6 @@ xla_test(
 xla_test(
     name = "fusion_test",
     srcs = ["fusion_test.cc"],
-    tags = [
-        "enable_for_xla_interpreter",
-    ],
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:literal",
@@ -2005,6 +1938,10 @@ xla_test(
 xla_test(
     name = "outfeed_in_nested_computation_test",
     srcs = ["outfeed_in_nested_computation_test.cc"],
+    blacklisted_backends = [
+        # Outfeed ops are not supported on the interpreter backend.
+        "interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla/tests:local_client_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -2181,7 +2118,6 @@ xla_test(
     srcs = ["iota_test.cc"],
     shard_count = 30,
     tags = [
-        "enable_for_xla_interpreter",
         # Require optimized builds, iota_test_cpu is very slow in fastbuild.
         "optonly",
     ],
@@ -2209,3 +2145,41 @@ tf_cc_test(
         "@com_google_absl//absl/synchronization",
     ],
 )
+
+xla_test(
+    name = "ptxas_bug_120501638",
+    srcs = ["ptxas_bug_120501638.cc"],
+    tags = [
+        # Disabled in OSS until nvidia publicly releases a fixed ptxas.
+        "no_oss",
+    ],
+    deps = [
+        ":hlo_test_base",
+        ":xla_internal_test_main",  # fixdeps: keep
+        "//tensorflow/compiler/xla:debug_options_flags",
+        "//tensorflow/compiler/xla:test",
+    ],
+)
+
+xla_test(
+    name = "triangular_solve_test",
+    srcs = ["triangular_solve_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+        "noasan",  # sometimes times out, http://b/78650012
+    ],
+    deps = [
+        "//tensorflow/compiler/xla:array2d",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client/lib:math",
+        "//tensorflow/compiler/xla/client/lib:matrix",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
+    ],
+)
diff --git a/tensorflow/compiler/xla/tests/cross_replica_sum_test.cc b/tensorflow/compiler/xla/tests/all_reduce_test.cc
similarity index 94%
rename from tensorflow/compiler/xla/tests/cross_replica_sum_test.cc
rename to tensorflow/compiler/xla/tests/all_reduce_test.cc
index 410732c07b7b6d3ece33ab11f4778241dc53ca50..7e695f829e39831e2c8558cb07d0689e560bbafa 100644
--- a/tensorflow/compiler/xla/tests/cross_replica_sum_test.cc
+++ b/tensorflow/compiler/xla/tests/all_reduce_test.cc
@@ -41,7 +41,7 @@ XLA_TEST_F(TrivialCrossReplicaSumTest, OneOperand) {
 
   ENTRY test_computation {
     p = f32[3] parameter(0)
-    ROOT crs = f32[3] cross-replica-sum(p), to_apply=add
+    ROOT crs = f32[3] all-reduce(p), to_apply=add
   })";
   auto module =
       ParseHloString(module_str, GetModuleConfigForTest()).ValueOrDie();
@@ -62,7 +62,7 @@ XLA_TEST_F(TrivialCrossReplicaSumTest, MultipleOperands) {
   ENTRY test_computation {
     p0 = f32[3] parameter(0)
     p1 = f32[2] parameter(1)
-    ROOT crs = (f32[3], f32[2]) cross-replica-sum(p0, p1), to_apply=add
+    ROOT crs = (f32[3], f32[2]) all-reduce(p0, p1), to_apply=add
   })";
   auto module =
       ParseHloString(module_str, GetModuleConfigForTest()).ValueOrDie();
@@ -88,7 +88,7 @@ XLA_TEST_F(TrivialCrossReplicaSumTest, ConstantOperand) {
   ENTRY test_computation {
     p0 = f32[3] parameter(0)
     p1 = f32[2] constant({10, 20})
-    ROOT crs = (f32[3], f32[2]) cross-replica-sum(p0, p1), to_apply=add
+    ROOT crs = (f32[3], f32[2]) all-reduce(p0, p1), to_apply=add
   })";
   auto module =
       ParseHloString(module_str, GetModuleConfigForTest()).ValueOrDie();
diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
index 915b456b52215f8d6a9eb6c5b933f3502f1d3d2c..acdd3c9da92efe8fae1336eaa861c01d5bb9b158 100644
--- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
@@ -35,7 +35,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
@@ -1443,6 +1442,27 @@ XLA_TEST_F(ArrayElementwiseOpTest, PowNonIntegerF32s) {
                              error_spec_);
 }
 
+XLA_TEST_F(ArrayElementwiseOpTest, PowC64s) {
+  SetFastMathDisabled(true);
+  XlaBuilder builder(TestName());
+  auto lhs =
+      ConstantR1<complex64>(&builder, {-2.0f, -0.6f, -0.6f, 0.0f, 0.0f, 0.0f});
+  auto rhs =
+      ConstantR1<complex64>(&builder, {0.5f, 0.6f, -0.6f, 0.5f, 0.6f, 0.0f});
+  Pow(lhs, rhs);
+
+  ComputeAndCompareR1<complex64>(&builder,
+                                 {
+                                     {0, 1.41421356},
+                                     {-2.27443288e-01, 0.69999846},
+                                     {-4.19847531e-01, -1.29215783},
+                                     {0, 0},
+                                     {0, 0},
+                                     {1, 0},
+                                 },
+                                 {}, error_spec_);
+}
+
 XLA_TEST_F(ArrayElementwiseOpTest, PowZeroElementF32s) {
   XlaBuilder builder(TestName());
   auto lhs = ConstantR1<float>(&builder, {});
@@ -2047,6 +2067,19 @@ XLA_TEST_F(ArrayElementwiseOpTest, NonNanClampF32) {
                              error_spec_);
 }
 
+XLA_TEST_F(ArrayElementwiseOpTest, ClampF32) {
+  SetFastMathDisabled(true);
+  XlaBuilder builder(TestName());
+  auto minimum = ConstantR1<float>(&builder, {1.0f, -6.5f, 1.0f, 2.25f, NAN});
+  auto argument =
+      ConstantR1<float>(&builder, {2.0f, 10.0f, -5.0f, 1.0f, 10.0f});
+  auto maximum = ConstantR1<float>(&builder, {3.0f, 0.5f, 25.5f, NAN, 123.0f});
+  Clamp(minimum, argument, maximum);
+
+  ComputeAndCompareR1<float>(&builder, {2.0f, 0.5f, 1.0f, NAN, NAN}, {},
+                             error_spec_);
+}
+
 XLA_TEST_F(ArrayElementwiseOpTest, ClampF32Scalar) {
   XlaBuilder builder(TestName());
   auto minimum = ConstantR0<float>(&builder, 0.0f);
diff --git a/tensorflow/compiler/xla/tests/bfloat16_test.cc b/tensorflow/compiler/xla/tests/bfloat16_test.cc
index e9728e636f0ee032416b2da17a3ea83c5bb18083..63e48117056dec4af603cbc85e478fcb15ad0cec 100644
--- a/tensorflow/compiler/xla/tests/bfloat16_test.cc
+++ b/tensorflow/compiler/xla/tests/bfloat16_test.cc
@@ -76,7 +76,9 @@ XLA_TEST_F(Bfloat16Test, NegateScalarF16) {
                                 error_spec_);
 }
 
-XLA_TEST_F(Bfloat16Test, BatchNormTraining) {
+// Disabled on interpreter since BatchNormExanper is not run by default on the
+// intepreter backend.
+XLA_TEST_F(Bfloat16Test, DISABLED_ON_INTERPRETER(BatchNormTraining)) {
   const int kFeatureIndex = 2;
   XlaBuilder builder(TestName());
 
@@ -110,7 +112,9 @@ XLA_TEST_F(Bfloat16Test, BatchNormTraining) {
   ComputeAndCompareTuple(&builder, expected, {}, ErrorSpec(0.01, 0.02));
 }
 
-XLA_TEST_F(Bfloat16Test, BatchNormGrad) {
+// Disabled on interpreter since BatchNormExanper is not run by default on the
+// intepreter backend.
+XLA_TEST_F(Bfloat16Test, DISABLED_ON_INTERPRETER(BatchNormGrad)) {
   const int kFeatureIndex = 2;
   XlaBuilder builder(TestName());
 
diff --git a/tensorflow/compiler/xla/tests/build_defs.bzl b/tensorflow/compiler/xla/tests/build_defs.bzl
index 05d4d04034bf50c8bb840e59b28a590fce048c19..c14d279ac560db33066ae4fc68b6290f7499bb39 100644
--- a/tensorflow/compiler/xla/tests/build_defs.bzl
+++ b/tensorflow/compiler/xla/tests/build_defs.bzl
@@ -34,6 +34,7 @@ def xla_test(
         xla_test_library_deps = [],
         backends = [],
         blacklisted_backends = [],
+        real_hardware_only = False,
         args = [],
         tags = [],
         copts = [],
@@ -108,6 +109,10 @@ def xla_test(
         use for that target.
       **kwargs: Additional keyword arguments to pass to native.cc_test.
     """
+
+    # All of the backends in all_backends are real hardware.
+    _ignore = [real_hardware_only]
+
     test_names = []
     if not backends:
         backends = all_backends
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.cc b/tensorflow/compiler/xla/tests/client_library_test_base.cc
index 12c029983336cc9aed0fde4ce6881c9a00a9869e..0e99ede5d01fcfa88c54c9cbc5a6a85bf8f15ddf 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.cc
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 
+#include <memory>
 #include <string>
 
 #include "absl/memory/memory.h"
@@ -40,8 +41,9 @@ constexpr char kInterpreter[] = "interpreter";
 
 // Wrapper function that creates a nicer error message (than a bare
 // ValueOrDie()) if the platform we intend to test is not available.
-Client* GetOrCreateLocalClientOrDie(const LocalClientOptions& client_options) {
-  StatusOr<Client*> result =
+LocalClient* GetOrCreateLocalClientOrDie(
+    const LocalClientOptions& client_options) {
+  StatusOr<LocalClient*> result =
       ClientLibrary::GetOrCreateLocalClient(client_options);
   TF_CHECK_OK(result.status()) << " could not create local client for testing";
   return result.ValueOrDie();
@@ -74,6 +76,9 @@ ClientLibraryTestBase::ClientLibraryTestBase(
   // default.
   execution_options_.mutable_debug_options()->add_xla_disable_hlo_passes(
       "constant_folding");
+
+  execution_options_.mutable_debug_options()
+      ->set_xla_hlo_evaluator_use_fast_path(true);
 }
 
 ClientLibraryTestBase::ClientLibraryTestBase(se::Platform* platform)
@@ -88,6 +93,9 @@ ClientLibraryTestBase::ClientLibraryTestBase(se::Platform* platform)
 
   execution_options_.mutable_debug_options()->add_xla_disable_hlo_passes(
       "constant_folding");
+
+  execution_options_.mutable_debug_options()
+      ->set_xla_hlo_evaluator_use_fast_path(true);
 }
 
 string ClientLibraryTestBase::TestName() const {
@@ -184,7 +192,7 @@ Status ClientLibraryTestBase::ComputeAndCompareLiteralWithAllOutputLayouts(
   verify_output(actual, "");
 
   // Try with all output layouts.
-  std::vector<int64> minor_to_major(ShapeUtil::Rank(expected.shape()));
+  std::vector<int64> minor_to_major(expected.shape().rank());
   std::iota(minor_to_major.begin(), minor_to_major.end(), 0);
   do {
     auto layout = ShapeUtil::MakeShapeWithLayout(
@@ -217,7 +225,7 @@ Status ClientLibraryTestBase::ComputeAndCompareLiteralWithAllInputLayouts(
       TF_ASSIGN_OR_RETURN(auto literal,
                           client_->Transfer(*arguments[index], nullptr));
       // Skip tuples because they don't have a rank.
-      if (ShapeUtil::IsTuple(literal.shape())) {
+      if (literal.shape().IsTuple()) {
         layout_strings.push_back(
             ShapeUtil::HumanStringWithLayout(literal.shape()));
         arguments_with_layout.push_back(arguments[index]);
@@ -227,7 +235,7 @@ Status ClientLibraryTestBase::ComputeAndCompareLiteralWithAllInputLayouts(
         return Status::OK();
       }
 
-      std::vector<int64> minor_to_major(ShapeUtil::Rank(literal.shape()));
+      std::vector<int64> minor_to_major(literal.shape().rank());
       std::iota(minor_to_major.begin(), minor_to_major.end(), 0);
       do {
         auto literal_relayout =
@@ -273,9 +281,10 @@ StatusOr<Literal> ClientLibraryTestBase::ComputeAndTransfer(
   if (!arguments_.empty()) {
     CHECK(arguments.empty());
     for (const auto& argument : arguments_) {
-      owning_arguments.push_back(
-          client_->TransferToServer(MaybeConvertLiteralToBfloat16(argument))
-              .ValueOrDie());
+      TF_ASSIGN_OR_RETURN(
+          std::unique_ptr<GlobalData> owned_argument,
+          client_->TransferToServer(MaybeConvertLiteralToBfloat16(argument)));
+      owning_arguments.push_back(std::move(owned_argument));
       arguments.push_back(owning_arguments.back().get());
     }
   }
@@ -296,9 +305,10 @@ Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus(
   if (!arguments_.empty()) {
     CHECK(arguments.empty());
     for (const auto& argument : arguments_) {
-      owning_arguments.push_back(
-          client_->TransferToServer(MaybeConvertLiteralToBfloat16(argument))
-              .ValueOrDie());
+      TF_ASSIGN_OR_RETURN(
+          std::unique_ptr<GlobalData> owned_argument,
+          client_->TransferToServer(MaybeConvertLiteralToBfloat16(argument)));
+      owning_arguments.push_back(std::move(owned_argument));
       arguments.push_back(owning_arguments.back().get());
     }
   }
@@ -356,9 +366,10 @@ Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus(
   if (!arguments_.empty()) {
     CHECK(arguments.empty());
     for (const auto& argument : arguments_) {
-      owning_arguments.push_back(
-          client_->TransferToServer(MaybeConvertLiteralToBfloat16(argument))
-              .ValueOrDie());
+      TF_ASSIGN_OR_RETURN(
+          std::unique_ptr<GlobalData> owned_argument,
+          client_->TransferToServer(MaybeConvertLiteralToBfloat16(argument)));
+      owning_arguments.push_back(std::move(owned_argument));
       arguments.push_back(owning_arguments.back().get());
     }
   }
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h
index 65a23dd883594b9bf9c37494a37e9be39b197788..d700437ed355c144639f76d683055e211975fde9 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.h
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.h
@@ -385,8 +385,8 @@ class ClientLibraryTestBase : public ::testing::Test {
   StatusOr<std::pair<Literal, Literal>> ComputeValueAndReference(
       XlaBuilder* builder, absl::Span<const Literal> arguments);
 
-  Client* client_;
-  Client* ref_client_;  // To compute reference result.
+  LocalClient* client_;
+  LocalClient* ref_client_;  // To compute reference result.
   ExecutionOptions execution_options_;
 
  private:
@@ -431,7 +431,8 @@ void ClientLibraryTestBase::ComputeAndCompareR0(
                     std::is_same<NativeT, double>::value ||
                     std::is_same<NativeT, bfloat16>::value ||
                     std::is_same<NativeT, half>::value ||
-                    std::is_same<NativeT, complex64>::value,
+                    std::is_same<NativeT, complex64>::value ||
+                    std::is_same<NativeT, complex128>::value,
                 "Float or complex type required when specifying an ErrorSpec");
   Literal expected_literal = LiteralUtil::CreateR0<NativeT>(expected);
   ClientLibraryTestBase::ComputeAndCompareLiteral(builder, expected_literal,
@@ -455,7 +456,8 @@ void ClientLibraryTestBase::ComputeAndCompareR1(
                     std::is_same<NativeT, double>::value ||
                     std::is_same<NativeT, bfloat16>::value ||
                     std::is_same<NativeT, half>::value ||
-                    std::is_same<NativeT, complex64>::value,
+                    std::is_same<NativeT, complex64>::value ||
+                    std::is_same<NativeT, complex128>::value,
                 "Float or complex type required when specifying an ErrorSpec");
   Literal expected_literal = LiteralUtil::CreateR1<NativeT>(expected);
   ClientLibraryTestBase::ComputeAndCompareLiteral(builder, expected_literal,
@@ -480,7 +482,8 @@ void ClientLibraryTestBase::ComputeAndCompareR2(
                     std::is_same<NativeT, double>::value ||
                     std::is_same<NativeT, bfloat16>::value ||
                     std::is_same<NativeT, half>::value ||
-                    std::is_same<NativeT, complex64>::value,
+                    std::is_same<NativeT, complex64>::value ||
+                    std::is_same<NativeT, complex128>::value,
                 "Float or complex type required when specifying an ErrorSpec");
   Literal expected_literal =
       LiteralUtil::CreateR2FromArray2D<NativeT>(expected);
@@ -506,7 +509,8 @@ void ClientLibraryTestBase::ComputeAndCompareR3(
                     std::is_same<NativeT, double>::value ||
                     std::is_same<NativeT, bfloat16>::value ||
                     std::is_same<NativeT, half>::value ||
-                    std::is_same<NativeT, complex64>::value,
+                    std::is_same<NativeT, complex64>::value ||
+                    std::is_same<NativeT, complex128>::value,
                 "Float or complex type required when specifying an ErrorSpec");
   Literal expected_literal =
       LiteralUtil::CreateR3FromArray3D<NativeT>(expected);
@@ -532,7 +536,8 @@ void ClientLibraryTestBase::ComputeAndCompareR4(
                     std::is_same<NativeT, double>::value ||
                     std::is_same<NativeT, bfloat16>::value ||
                     std::is_same<NativeT, half>::value ||
-                    std::is_same<NativeT, complex64>::value,
+                    std::is_same<NativeT, complex64>::value ||
+                    std::is_same<NativeT, complex128>::value,
                 "Float or complex type required when specifying an ErrorSpec");
   Literal expected_literal =
       LiteralUtil::CreateR4FromArray4D<NativeT>(expected);
diff --git a/tensorflow/compiler/xla/tests/client_test.cc b/tensorflow/compiler/xla/tests/client_test.cc
index 363dee74b2755a6bdc3c5a5164a85378581c21d2..247328b730f3af936d933f824da491b593b27c90 100644
--- a/tensorflow/compiler/xla/tests/client_test.cc
+++ b/tensorflow/compiler/xla/tests/client_test.cc
@@ -96,7 +96,7 @@ XLA_TEST_F(ClientTest, ExecuteWithTupleLayout) {
   LiteralTestUtil::ExpectR2Equal<int32>({{10, 20}, {30, 40}},
                                         LiteralSlice(result, {1}));
 
-  EXPECT_TRUE(ShapeUtil::IsTuple(result.shape()));
+  EXPECT_TRUE(result.shape().IsTuple());
   EXPECT_EQ(2, ShapeUtil::TupleElementCount(result.shape()));
 
   EXPECT_TRUE(ShapeUtil::Equal(
@@ -109,7 +109,10 @@ XLA_TEST_F(ClientTest, ExecuteWithTupleLayout) {
                                      /*minor_to_major=*/{1, 0})));
 }
 
-XLA_TEST_F(ClientTest, DISABLED_ON_GPU(ExecuteParallel)) {
+// Disabled for interpreter since ExecuteAsyncOnStream is not implemented on
+// interpreter backend.
+XLA_TEST_F(ClientTest,
+           DISABLED_ON_INTERPRETER(DISABLED_ON_GPU(ExecuteParallel))) {
   XlaComputation add_with_one_arg, mul_with_two_args, dot_with_one_arg;
   Shape shape = ShapeUtil::MakeShape(S32, {2, 2});
 
diff --git a/tensorflow/compiler/xla/tests/compute_constant_test.cc b/tensorflow/compiler/xla/tests/compute_constant_test.cc
index 3b0414a6045a7c5f4f75948d8ccf2775c575626e..ef800b8ef624bf1020ff1e6857c13b0387482cd3 100644
--- a/tensorflow/compiler/xla/tests/compute_constant_test.cc
+++ b/tensorflow/compiler/xla/tests/compute_constant_test.cc
@@ -151,19 +151,35 @@ TEST_F(ComputeConstantTest, DirectParamMissing) {
   }
 }
 
-TEST_F(ComputeConstantTest, IndirectParamMissing) {
+TEST_F(ComputeConstantTest, GetDimensionSize) {
   for (ClientType client_type : client_types) {
     Client* client = ClientOrDie(platform_, client_type);
     XlaBuilder b(TestName());
-    auto computation =
-        Add(ConstantR0<float>(&b, 1.0f),
-            Parameter(&b, 0, ShapeUtil::MakeShape(F32, {}), "param"));
-    EXPECT_FALSE(IsConstant(computation, &b));
+    auto add =
+        Add(ConstantR1<float>(&b, {1.0f}), ConstantR1<float>(&b, {1.0f}));
+    auto get_dimension_size = GetDimensionSize(add, 0);
+    EXPECT_TRUE(IsConstant(get_dimension_size, &b));
+
+    TF_ASSERT_OK_AND_ASSIGN(auto value, ComputeConstantScalar<uint32>(
+                                            client, get_dimension_size, &b));
+    EXPECT_EQ(value, 1);
+  }
+}
 
-    auto value = ComputeConstantScalar<float>(client, computation, &b);
-    EXPECT_TRUE(
-        absl::StrContains(value.status().ToString(), "depends on a parameter"))
-        << value.status();
+TEST_F(ComputeConstantTest, MultipleGetDimensionSize) {
+  for (ClientType client_type : client_types) {
+    Client* client = ClientOrDie(platform_, client_type);
+    XlaBuilder b(TestName());
+    auto add =
+        Add(ConstantR2<float>(&b, {{1.0f}}), ConstantR2<float>(&b, {{1.0f}}));
+    auto get_dimension_size = GetDimensionSize(add, 0);
+    auto get_dimension_size_2 = GetDimensionSize(add, 0);
+    auto add_2 = Add(get_dimension_size, get_dimension_size_2);
+    EXPECT_TRUE(IsConstant(add_2, &b));
+
+    TF_ASSERT_OK_AND_ASSIGN(auto value,
+                            ComputeConstantScalar<uint32>(client, add_2, &b));
+    EXPECT_EQ(value, 2);
   }
 }
 
diff --git a/tensorflow/compiler/xla/tests/constants_test.cc b/tensorflow/compiler/xla/tests/constants_test.cc
index 72ff1e74a47c8584cb5336c86a1c978c4637a902..6530007871ced1d0bbffe2b44ccc8cf9bddd79e1 100644
--- a/tensorflow/compiler/xla/tests/constants_test.cc
+++ b/tensorflow/compiler/xla/tests/constants_test.cc
@@ -21,11 +21,14 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array3d.h"
 #include "tensorflow/compiler/xla/array4d.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
@@ -178,5 +181,54 @@ TEST_F(ConstantsTest, Token) {
   TF_ASSERT_OK(Execute(&builder, {}).status());
 }
 
+TEST_F(ConstantsTest, FullLike) {
+  XlaBuilder b(TestName());
+  auto val1 = Iota(&b, F32, 3);
+  auto val2 = FullLike(val1, 10);
+  val1 + val2;
+  ComputeAndCompareR1<float>(&b, {10, 11, 12}, {}, error_spec_);
+}
+
+TEST_F(ConstantsTest, IllegalFullLikeOnTuple) {
+  XlaBuilder b(TestName());
+  auto tuple = Tuple(&b, {Iota(&b, F32, 3), Iota(&b, F32, 1)});
+  FullLike(tuple, 10);  // Illegal; can't do FullLike on a tuple.
+  EXPECT_FALSE(b.Build().ok());
+}
+
+TEST_F(ConstantsTest, FullLikeScalar) {
+  XlaBuilder b(TestName());
+  auto scalar1 = ConstantR0WithType(&b, F32, 1);
+  auto scalar2 = FullLike(scalar1, 2);
+  scalar1 - scalar2;
+  ComputeAndCompareR0<float>(&b, -1, {}, error_spec_);
+}
+
+class ConstantsHloTest : public HloTestBase {};
+
+// TODO(b/121147351): Fails on GPU. Not clear if this is expected behavior.
+XLA_TEST_F(ConstantsHloTest, DISABLED_ON_GPU(BitcastOfConstant)) {
+  const char* testcase = R"(
+    HloModule module, is_scheduled=true
+
+    func {
+      lhs = s32[] parameter(0)
+      rhs = s32[] parameter(1)
+      ROOT mul = s32[] add(lhs, rhs)
+    }
+
+    ENTRY test {
+      constant.0 = s32[1]{0} constant({0})
+      parameter.0 = s32[] parameter(0)
+      constant-as-scalar = s32[] bitcast(constant.0)
+      ROOT result = s32[] call(parameter.0, constant-as-scalar), to_apply=func
+    }
+  )";
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
+  auto param = LiteralUtil::CreateR0<int32>(1);
+  auto result = ExecuteNoHloPasses(std::move(module), {&param});
+  EXPECT_TRUE(LiteralTestUtil::Equal(param, result));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/conv_depthwise_backprop_filter_test.cc b/tensorflow/compiler/xla/tests/conv_depthwise_backprop_filter_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dfbf0478e62713635446d11557367cfac6ab0dce
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/conv_depthwise_backprop_filter_test.cc
@@ -0,0 +1,178 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "absl/types/optional.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
+#include "tensorflow/compiler/xla/execution_options_util.h"
+#include "tensorflow/compiler/xla/service/bfloat16_normalization.h"
+#include "tensorflow/compiler/xla/service/despecializer.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+
+namespace xla {
+namespace {
+
+string GetFloatDataType(bool use_bfloat16) {
+  return use_bfloat16 ? "bf16" : "f32";
+}
+
+struct BatchGroupedConvolution2DSpec {
+  int64 output_batch, window, window_dilation;
+  std::vector<int64> activation_dims;
+  std::vector<int64> kernel_dims;
+  std::vector<int64> output_dims;
+  std::vector<int64> activation_and_kernel_layout;
+  std::vector<int64> output_layout;
+};
+
+class BatchGroupedConvolution2DTest
+    : public HloTestBase,
+      public ::testing::WithParamInterface<
+          ::testing::tuple<BatchGroupedConvolution2DSpec, bool>> {};
+
+static std::vector<BatchGroupedConvolution2DSpec> GetConv2DTestCases() {
+  std::vector<BatchGroupedConvolution2DSpec> config_set;
+  std::vector<std::vector<int64>> config_options = {
+      {8, 5, 3, 2},      {4, 5, 5, 2},    {8, 7, 4, 128},
+      {16, 20, 20, 256}, {256, 7, 5, 4},  {256, 6, 6, 4},
+      {256, 8, 8, 512},  {64, 7, 7, 960}, {64, 14, 14, 576}};
+
+  for (auto option : config_options) {
+    int64 feature = option[3];
+    int64 activation_size = option[1];
+    int64 kernel_size = option[2];
+    int64 batch = option[0];
+
+    BatchGroupedConvolution2DSpec config;
+    config.window_dilation = 1;
+    config.output_batch = feature;
+    config.window = kernel_size;
+
+    config.activation_dims = {batch, activation_size, activation_size, feature};
+
+    config.kernel_dims = {batch, kernel_size, kernel_size, feature};
+
+    int64 output_space_size = 3 + activation_size - kernel_size;
+    config.output_dims = {output_space_size, output_space_size, feature, 1};
+
+    config.activation_and_kernel_layout = {0, 3, 1, 2};
+    config.output_layout = {2, 3, 0, 1};
+    config_set.push_back(config);
+
+    BatchGroupedConvolution2DSpec different_layout_config = config;
+    different_layout_config.activation_and_kernel_layout = {3, 0, 1, 2};
+    config_set.push_back(different_layout_config);
+
+    // Add configurations for window dilation cases.
+    if (activation_size % 2 == 0 && activation_size == kernel_size) {
+      BatchGroupedConvolution2DSpec config;
+      config.window_dilation = 2;
+      config.output_batch = feature;
+      config.window = kernel_size / 2;
+      config.activation_dims = {batch, activation_size, activation_size,
+                                feature};
+      config.kernel_dims = {batch, kernel_size / 2, kernel_size / 2, feature};
+      config.activation_and_kernel_layout = {0, 3, 1, 2};
+      config.output_layout = {2, 3, 0, 1};
+
+      int64 output_space_size = 5;
+      config.output_dims = {output_space_size, output_space_size, feature, 1};
+
+      config_set.push_back(config);
+
+      BatchGroupedConvolution2DSpec different_layout_config = config;
+      different_layout_config.activation_and_kernel_layout = {3, 0, 1, 2};
+      config_set.push_back(different_layout_config);
+    }
+  }
+
+  return config_set;
+}
+
+string BatchGroupedConvolution2DTestDataToString(
+    const ::testing::TestParamInfo<
+        ::testing::tuple<BatchGroupedConvolution2DSpec, bool>>& data) {
+  const auto& spec = ::testing::get<0>(data.param);
+  const string data_type = GetFloatDataType(::testing::get<1>(data.param));
+  string str = absl::StrCat(
+      "activation_dims_", absl::StrJoin(spec.activation_dims, "x"),
+      "_kernel_dims_", absl::StrJoin(spec.kernel_dims, "x"),
+      "_activation_layout_",
+      absl::StrJoin(spec.activation_and_kernel_layout, "_"), "_output_dims_",
+      absl::StrJoin(spec.output_dims, "x"), data_type, "_output_layout_",
+      absl::StrJoin(spec.output_layout, "_"));
+
+  // Test names are not allowed to contain the '-' character.
+  absl::c_replace(str, '-', 'n');
+  return str;
+}
+
+string BuildHloTextBatchGroupedConvolution2D(
+    const BatchGroupedConvolution2DSpec& spec, bool use_bfloat16) {
+  const string data_type = GetFloatDataType(use_bfloat16);
+  return absl::StrFormat(
+      R"(
+    HloModule TensorFlowDepthwiseConv, is_scheduled=true
+
+    ENTRY main {
+      activation = %s[%s]{%s} parameter(0)
+      kernel = %s[%s]{%s} parameter(1)
+      ROOT conv = %s[%s]{%s} convolution(%s[%s]{%s} activation, %s[%s]{%s} kernel),
+          window={size=%dx%d pad=1_%dx1_%d rhs_dilate=%dx%d}, dim_labels=f01b_i01o->01fb,
+          batch_group_count=%d
+    }
+    )",
+      data_type, absl::StrJoin(spec.activation_dims, ","),
+      absl::StrJoin(spec.activation_and_kernel_layout, ","), data_type,
+      absl::StrJoin(spec.kernel_dims, ","),
+      absl::StrJoin(spec.activation_and_kernel_layout, ","), data_type,
+      absl::StrJoin(spec.output_dims, ","),
+      absl::StrJoin(spec.output_layout, ","), data_type,
+      absl::StrJoin(spec.activation_dims, ","),
+      absl::StrJoin(spec.activation_and_kernel_layout, ","), data_type,
+      absl::StrJoin(spec.kernel_dims, ","),
+      absl::StrJoin(spec.activation_and_kernel_layout, ","), spec.window,
+      spec.window, spec.window_dilation, spec.window_dilation,
+      spec.window_dilation, spec.window_dilation, spec.output_batch);
+}
+
+XLA_TEST_P(BatchGroupedConvolution2DTest, DoIt) {
+  const BatchGroupedConvolution2DSpec& spec = ::testing::get<0>(GetParam());
+  bool use_bfloat16 = ::testing::get<1>(GetParam());
+  const string hlo_text =
+      BuildHloTextBatchGroupedConvolution2D(spec, use_bfloat16);
+
+  EXPECT_TRUE(RunAndCompareNoHloPasses(
+      hlo_text, ErrorSpec{0.01, 0.01}, [](HloModule* module) -> Status {
+        BFloat16MixedPrecisionRemoval remover;
+        TF_RETURN_IF_ERROR(remover.Run(module).status());
+        Despecializer despecializer;
+        return despecializer.Run(module).status();
+      }));
+}
+
+INSTANTIATE_TEST_CASE_P(
+    BatchGroupedConvolution2DTestWithRandomIndices,
+    BatchGroupedConvolution2DTest,
+    ::testing::Combine(::testing::ValuesIn(GetConv2DTestCases()),
+                       ::testing::Bool()),
+    BatchGroupedConvolution2DTestDataToString);
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/convolution_test.cc b/tensorflow/compiler/xla/tests/convolution_test.cc
index 4a58a1ed66c438d1dd9561f4eb029b38d8c6cbdd..9db9f2563b636c4f929585eb13a9c7f809833eda 100644
--- a/tensorflow/compiler/xla/tests/convolution_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_test.cc
@@ -98,7 +98,7 @@ class ForwardPassConvolution_3x3x256_256_OutputZ_Iota : public ConvolutionTest {
     precision.add_operand_precision(PrecisionConfig::HIGHEST);
     precision.add_operand_precision(PrecisionConfig::DEFAULT);
     Conv(lhs, rhs, {1, 1}, Padding::kValid, /*feature_group_count=*/1,
-         &precision);
+         /*batch_group_count=*/1, &precision);
 
     ComputeAndCompare(&builder, {}, error_spec_);
   }
@@ -467,8 +467,8 @@ XLA_TEST_F(ConvolutionTest, Convolve3D_1x4x2x3x3_2x2x2x3x3_Valid) {
 // servers. The error message is missing the operator ++.
 template <typename T>
 void iota_int_init_value(std::vector<T>& values, int init_value) {
-  std::for_each(values.begin(), values.end(),
-                [&](T& value) { value = static_cast<T>(init_value++); });
+  absl::c_for_each(values,
+                   [&](T& value) { value = static_cast<T>(init_value++); });
 }
 
 template <typename T>
diff --git a/tensorflow/compiler/xla/tests/copy_test.cc b/tensorflow/compiler/xla/tests/copy_test.cc
index 3622f2c1e84639baed13059b21b20609d1347da6..df005a67097bb8aaf070c57d1c51acd1909fee12 100644
--- a/tensorflow/compiler/xla/tests/copy_test.cc
+++ b/tensorflow/compiler/xla/tests/copy_test.cc
@@ -133,7 +133,9 @@ XLA_TEST_F(CopyOpTest, CopyConstantR2DifferentLayouts) {
   // Reverse the minor-to-major order of the literal.
   Layout* literal_layout = literal.mutable_shape_do_not_use()->mutable_layout();
   ASSERT_EQ(2, literal_layout->minor_to_major_size());
-  literal_layout->mutable_minor_to_major()->SwapElements(0, 1);
+  // Swap the first and second elements.
+  *literal_layout->mutable_minor_to_major() = {
+      literal_layout->minor_to_major(1), literal_layout->minor_to_major(0)};
 
   HloInstruction* constant = builder.AddInstruction(
       HloInstruction::CreateConstant(std::move(literal)));
diff --git a/tensorflow/compiler/xla/tests/custom_call_test.cc b/tensorflow/compiler/xla/tests/custom_call_test.cc
index 738b6442354b01364278e3e3c713aa2cdb5cf47d..4687ed61a7de91bc1bce0efeadf1965ad7d52d55 100644
--- a/tensorflow/compiler/xla/tests/custom_call_test.cc
+++ b/tensorflow/compiler/xla/tests/custom_call_test.cc
@@ -54,11 +54,20 @@ void Add1ToValues(float* out, float** in) {
   out[2] = array[2] + 1;
   out[3] = array[3] + 1;
 }
+
+void F32TupleSwap(float** out, float** in) {
+  TF_ANNOTATE_MEMORY_IS_INITIALIZED(in[0], sizeof(float));
+  TF_ANNOTATE_MEMORY_IS_INITIALIZED(in[1], sizeof(float));
+  *out[0] = *in[1];
+  *out[1] = *in[0];
+}
+
 }  // namespace
 
 REGISTER_CUSTOM_CALL_TARGET(R0F32Add2);
 REGISTER_CUSTOM_CALL_TARGET(R2F32ReduceSum);
 REGISTER_CUSTOM_CALL_TARGET(Add1ToValues);
+REGISTER_CUSTOM_CALL_TARGET(F32TupleSwap);
 
 namespace xla {
 namespace {
@@ -69,7 +78,7 @@ class CustomCallTest : public HloTestBase {
   Shape r2f32_ = ShapeUtil::MakeShape(F32, {2, 2});
 };
 
-XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(CustomCallR0F32Add2)) {
+XLA_TEST_F(CustomCallTest, CustomCallR0F32Add2) {
   auto module = CreateNewUnverifiedModule();
   auto builder = HloComputation::Builder(TestName());
 
@@ -84,7 +93,7 @@ XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(CustomCallR0F32Add2)) {
   LiteralTestUtil::ExpectR0Near<float>(44.0f, result, error_spec_);
 }
 
-XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(CustomCallR2F32Reduce)) {
+XLA_TEST_F(CustomCallTest, CustomCallR2F32Reduce) {
   auto module = CreateNewUnverifiedModule();
   auto builder = HloComputation::Builder(TestName());
 
@@ -105,7 +114,7 @@ XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(CustomCallR2F32Reduce)) {
   LiteralTestUtil::ExpectR0Near<float>(10.0f, result, error_spec_);
 }
 
-XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(UsedInOtherComputations)) {
+XLA_TEST_F(CustomCallTest, UsedInOtherComputations) {
   auto module = CreateNewUnverifiedModule();
   auto b = HloComputation::Builder(TestName());
 
@@ -129,7 +138,7 @@ XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(UsedInOtherComputations)) {
       Array3D<float>{{{2, 3}, {4, 5}}, {{3, 4}, {5, 6}}}, result);
 }
 
-XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(InputAndOutputLayoutDiffer)) {
+XLA_TEST_F(CustomCallTest, InputAndOutputLayoutDiffer) {
   auto module = CreateNewUnverifiedModule();
   auto b = HloComputation::Builder(TestName());
 
@@ -151,7 +160,7 @@ XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(InputAndOutputLayoutDiffer)) {
   LiteralTestUtil::ExpectR2Equal<float>({{2.f, 4.f}, {3.f, 5.f}}, result);
 }
 
-XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(LayoutConstrained)) {
+XLA_TEST_F(CustomCallTest, LayoutConstrained) {
   // The argument and result of the computation are set to different layouts,
   // but the custom call is layout constrained to a fixed operand and result
   // layout, so the correct result should be produced.
@@ -163,8 +172,10 @@ XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(LayoutConstrained)) {
 
   const Shape& r2f32_dim0_major =
       ShapeUtil::MakeShapeWithLayout(F32, {2, 2}, {1, 0});
-  b.AddInstruction(HloInstruction::CreateCustomCall(
+  auto custom_call = b.AddInstruction(HloInstruction::CreateCustomCall(
       r2f32_dim0_major, {input}, "Add1ToValues", {r2f32_dim0_major}));
+  b.AddInstruction(
+      custom_call->CloneWithNewOperands(r2f32_dim0_major, {custom_call}));
 
   module->AddEntryComputation(b.Build());
   ForceParameterLayout(module.get(), 0, LayoutUtil::MakeLayout({1, 0}));
@@ -173,7 +184,27 @@ XLA_TEST_F(CustomCallTest, DISABLED_ON_GPU(LayoutConstrained)) {
   Literal argument = LiteralUtil::CreateR2<float>({{1.f, 2.f}, {3.f, 4.f}});
 
   Literal result = ExecuteAndTransfer(std::move(module), {&argument});
-  LiteralTestUtil::ExpectR2Equal<float>({{2.f, 3.f}, {4.f, 5.f}}, result);
+  LiteralTestUtil::ExpectR2Equal<float>({{3.f, 4.f}, {5.f, 6.f}}, result);
+}
+
+XLA_TEST_F(CustomCallTest, TupleOutput) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = f32[] parameter(0)
+      p1 = f32[] parameter(1)
+      ROOT %custom-call = (f32[], f32[]) custom-call(f32[] %p0, f32[] %p1), custom_call_target="F32TupleSwap", operand_layout_constraints={f32[], f32[]}
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(kModuleStr));
+
+  Literal arg0 = LiteralUtil::CreateR0<float>(7.f);
+  Literal arg1 = LiteralUtil::CreateR0<float>(42.f);
+
+  Literal expected = LiteralUtil::MakeTuple({&arg1, &arg0});
+  Literal result = ExecuteAndTransfer(std::move(module), {&arg0, &arg1});
+  EXPECT_EQ(result, expected);
 }
 
 class CustomCallClientAPITest : public ClientLibraryTestBase {};
diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc
index c5d8b663f4abe77e05ec213d2e4e075c260a8655..6ee2178a227a12b7baa933f036a44db8ec630a4c 100644
--- a/tensorflow/compiler/xla/tests/dot_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc
@@ -19,12 +19,14 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array3d.h"
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
@@ -918,8 +920,9 @@ XLA_TEST_F(DotOperationTest, DotOfGatherOptimizationWithConstRHSClassicMM) {
   XlaBuilder builder(TestName());
   auto lhs_constant = ConstantR2FromArray2D(&builder, *constant_lhs_array);
   auto rhs_constant = ConstantR2FromArray2D(&builder, *constant_rhs_array);
-  auto start_constant = ConstantR1<int32>(&builder, {1, 0});
-  auto dynamic_slice = DynamicSlice(lhs_constant, start_constant, {1, 6});
+  auto one = ConstantR0<int32>(&builder, 1);
+  auto zero = ConstantR0<int32>(&builder, 0);
+  auto dynamic_slice = DynamicSlice(lhs_constant, {one, zero}, {1, 6});
 
   DotDimensionNumbers dot_dnums;
   dot_dnums.add_lhs_contracting_dimensions(1);
@@ -945,8 +948,9 @@ XLA_TEST_F(DotOperationTest, DotOfGatherOptimizationWithConstLHSClassicMM) {
   XlaBuilder builder(TestName());
   auto lhs_constant = ConstantR2FromArray2D(&builder, *constant_lhs_array);
   auto rhs_constant = ConstantR2FromArray2D(&builder, *constant_rhs_array);
-  auto start_constant = ConstantR1<int32>(&builder, {0, 1});
-  auto dynamic_slice = DynamicSlice(rhs_constant, start_constant, {6, 1});
+  auto zero = ConstantR0<int32>(&builder, 0);
+  auto one = ConstantR0<int32>(&builder, 1);
+  auto dynamic_slice = DynamicSlice(rhs_constant, {zero, one}, {6, 1});
 
   DotDimensionNumbers dot_dnums;
   dot_dnums.add_lhs_contracting_dimensions(1);
@@ -974,8 +978,9 @@ XLA_TEST_F(DotOperationTest,
   XlaBuilder builder(TestName());
   auto lhs_constant = ConstantR2FromArray2D(&builder, *constant_lhs_array);
   auto rhs_constant = ConstantR2FromArray2D(&builder, *constant_rhs_array);
-  auto start_constant = ConstantR1<int32>(&builder, {0, 1});
-  auto dynamic_slice = DynamicSlice(lhs_constant, start_constant, {6, 1});
+  auto zero = ConstantR0<int32>(&builder, 0);
+  auto one = ConstantR0<int32>(&builder, 1);
+  auto dynamic_slice = DynamicSlice(lhs_constant, {zero, one}, {6, 1});
 
   DotDimensionNumbers dot_dnums;
   dot_dnums.add_lhs_contracting_dimensions(0);
@@ -1001,8 +1006,9 @@ XLA_TEST_F(DotOperationTest, DotOfGatherOptimizationWithConstLHSReverseMM) {
   XlaBuilder builder(TestName());
   auto lhs_constant = ConstantR2FromArray2D(&builder, *constant_lhs_array);
   auto rhs_constant = ConstantR2FromArray2D(&builder, *constant_rhs_array);
-  auto start_constant = ConstantR1<int32>(&builder, {1, 0});
-  auto dynamic_slice = DynamicSlice(rhs_constant, start_constant, {1, 6});
+  auto zero = ConstantR0<int32>(&builder, 0);
+  auto one = ConstantR0<int32>(&builder, 1);
+  auto dynamic_slice = DynamicSlice(rhs_constant, {one, zero}, {1, 6});
 
   DotDimensionNumbers dot_dnums;
   dot_dnums.add_lhs_contracting_dimensions(0);
@@ -1033,8 +1039,9 @@ XLA_TEST_F(DotOperationTest, DotOfGatherOptimizationWithConstRHSRows) {
   XlaBuilder builder(TestName());
   auto lhs_constant = ConstantR2FromArray2D(&builder, *constant_lhs_array);
   auto rhs_constant = ConstantR2FromArray2D(&builder, *constant_rhs_array);
-  auto start_constant = ConstantR1<int32>(&builder, {0, 1});
-  auto dynamic_slice = DynamicSlice(lhs_constant, start_constant, {6, 1});
+  auto zero = ConstantR0<int32>(&builder, 0);
+  auto one = ConstantR0<int32>(&builder, 1);
+  auto dynamic_slice = DynamicSlice(lhs_constant, {zero, one}, {6, 1});
 
   DotDimensionNumbers dot_dnums;
   dot_dnums.add_lhs_contracting_dimensions(0);
@@ -1065,8 +1072,9 @@ XLA_TEST_F(DotOperationTest, DotOfGatherOptimizationWithConstLHSRows) {
   XlaBuilder builder(TestName());
   auto lhs_constant = ConstantR2FromArray2D(&builder, *constant_lhs_array);
   auto rhs_constant = ConstantR2FromArray2D(&builder, *constant_rhs_array);
-  auto start_constant = ConstantR1<int32>(&builder, {0, 1});
-  auto dynamic_slice = DynamicSlice(rhs_constant, start_constant, {6, 1});
+  auto zero = ConstantR0<int32>(&builder, 0);
+  auto one = ConstantR0<int32>(&builder, 1);
+  auto dynamic_slice = DynamicSlice(rhs_constant, {zero, one}, {6, 1});
 
   DotDimensionNumbers dot_dnums;
   dot_dnums.add_lhs_contracting_dimensions(0);
@@ -1089,8 +1097,9 @@ XLA_TEST_F(DotOperationTest, DotOfGatherOptimizationWithConstRHSCols) {
   XlaBuilder builder(TestName());
   auto lhs_constant = ConstantR2FromArray2D(&builder, *constant_lhs_array);
   auto rhs_constant = ConstantR2FromArray2D(&builder, *constant_rhs_array);
-  auto start_constant = ConstantR1<int32>(&builder, {1, 0});
-  auto dynamic_slice = DynamicSlice(lhs_constant, start_constant, {1, 6});
+  auto zero = ConstantR0<int32>(&builder, 0);
+  auto one = ConstantR0<int32>(&builder, 1);
+  auto dynamic_slice = DynamicSlice(lhs_constant, {one, zero}, {1, 6});
 
   DotDimensionNumbers dot_dnums;
   dot_dnums.add_lhs_contracting_dimensions(1);
@@ -1113,8 +1122,9 @@ XLA_TEST_F(DotOperationTest, DotOfGatherOptimizationWithConstLHSCols) {
   XlaBuilder builder(TestName());
   auto lhs_constant = ConstantR2FromArray2D(&builder, *constant_lhs_array);
   auto rhs_constant = ConstantR2FromArray2D(&builder, *constant_rhs_array);
-  auto start_constant = ConstantR1<int32>(&builder, {1, 0});
-  auto dynamic_slice = DynamicSlice(rhs_constant, start_constant, {1, 6});
+  auto zero = ConstantR0<int32>(&builder, 0);
+  auto one = ConstantR0<int32>(&builder, 1);
+  auto dynamic_slice = DynamicSlice(rhs_constant, {one, zero}, {1, 6});
 
   DotDimensionNumbers dot_dnums;
   dot_dnums.add_lhs_contracting_dimensions(1);
@@ -1147,5 +1157,192 @@ XLA_TEST_F(DotOperationTest, DotRank2AndRank2NonDefaultContractionDims) {
 
   ComputeAndCompareR2<float>(&builder, expected, {}, error_spec_);
 }
+
+using EinsumParamType =
+    std::tuple<std::vector<int64>, std::vector<int64>, string>;
+class EinsumTest : public DotOperationTest,
+                   public ::testing::WithParamInterface<EinsumParamType> {};
+XLA_TEST_P(EinsumTest, SimpleEinsumTest) {
+  XlaBuilder builder(TestName());
+  auto x = AddParam(
+      MakeFakeLiteral(ShapeUtil::MakeShape(F32, std::get<0>(GetParam())))
+          .ValueOrDie(),
+      &builder);
+  auto y = AddParam(
+      MakeFakeLiteral(ShapeUtil::MakeShape(F32, std::get<1>(GetParam())))
+          .ValueOrDie(),
+      &builder);
+  Einsum(x, y, std::get<2>(GetParam()));
+  ComputeAndCompare(&builder, {}, ErrorSpec{1e-3, 1e-3});
+}
+
+std::vector<EinsumParamType> GetEinsumTestCases() {
+  using v = std::vector<int64>;
+  using p = EinsumParamType;
+  std::vector<p> test_cases = {
+      p{v{5, 6}, v{6, 7}, "mk,kn->mn"},
+      p{v{5, 6}, v{6, 7}, "mk,kn->nm"},
+      p{v{5, 6, 11}, v{6, 11, 7}, "mkB,kBn->nmB"},
+      p{v{31, 55, 11}, v{55, 11, 29}, "mkB,kBn->nmB"},
+      p{v{31, 55, 11}, v{55, 11, 29}, "mkB,kBn->Bnm"},
+      p{v{8, 55, 11, 3}, v{55, 11, 3, 29}, "mkBC,kBCn->BCnm"},
+      p{v{5, 6}, v{6, 7}, "ab,cd->dcba"},
+      p{v{6}, v{6, 7}, "b,bc->c"},
+      p{v{5, 6, 7}, v{5, 6, 7}, "abc,abc->ab"},
+      p{v{5, 6, 7}, v{7, 6, 5}, "abc,cba->ca"},
+      p{v{77}, v{77}, "a,a->a"},
+      p{v{77}, v{77, 55}, "a,ab->ba"},
+      p{v{2, 3, 77}, v{77, 2, 3, 55}, "ija,aijb->baij"},
+      p{v{55}, v{}, "a,->a"},
+      p{v{11, 111}, v{11}, "ab,a->ab"},
+      p{v{16, 34}, v{16, 34}, "ab,ab->ab"},
+      p{v{16, 3, 34}, v{3, 16, 34}, "abc,bac->abc"},
+      p{v{5, 19}, v{}, "ab,->ab"},
+  };
+  return test_cases;
+}
+
+INSTANTIATE_TEST_CASE_P(Einsum, EinsumTest,
+                        ::testing::ValuesIn(GetEinsumTestCases()));
+
+class DotOperationTextTest : public HloTestBase {};
+
+XLA_TEST_F(DotOperationTextTest, DotReorderedDotDims) {
+  absl::string_view hlo_string =
+      R"(
+HloModule ComplexDotMultipleNonContracting
+
+ENTRY %test {
+  %lhs = f32[7,17,10,13]{3,2,1,0} parameter(0)
+  %rhs = f32[7,9,10,13,6]{4,3,2,1,0} parameter(1)
+  ROOT %dot = f32[10,7,17,9,6]{4,3,2,1,0} dot(%lhs, %rhs), lhs_batch_dims={2,0}, rhs_batch_dims={2,0}, lhs_contracting_dims={3}, rhs_contracting_dims={3}
+}
+)";
+
+  EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{1e-3, 1e-3}));
+}
+
+XLA_TEST_F(DotOperationTextTest, DotReorderedDotDimsAndMultipleContracting) {
+  absl::string_view hlo_string =
+      R"(
+HloModule ComplexDotMultipleNonContracting
+
+ENTRY %test {
+  %lhs = f32[7,5,17,10,13]{4,3,2,1,0} parameter(0)
+  %rhs = f32[7,9,10,13,6,5]{5,4,3,2,1,0} parameter(1)
+  ROOT %dot = f32[10,7,17,9,6]{4,3,2,1,0} dot(%lhs, %rhs), lhs_batch_dims={3,0}, rhs_batch_dims={2,0}, lhs_contracting_dims={1,4}, rhs_contracting_dims={5,3}
+}
+)";
+
+  EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{1e-3, 1e-3}));
+}
+
+XLA_TEST_F(DotOperationTextTest, DotWithNoDnums) {
+  absl::string_view hlo_string =
+      R"(
+HloModule DotWithNoDnums
+
+ENTRY %test {
+  %lhs = f32[2,3]{1,0} parameter(0)
+  %rhs = f32[4,5]{1,0} parameter(1)
+  ROOT %dot = f32[2,3,4,5]{3,2,1,0} dot(%lhs, %rhs)
+}
+)";
+
+  EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{1e-3, 1e-3}));
+}
+
+XLA_TEST_F(DotOperationTextTest, Einsum) {
+  absl::string_view hlo_string =
+      R"(
+HloModule Einsum
+
+ENTRY %test {
+  %lhs = f32[8,64,96]{2,1,0} parameter(0)
+  %rhs = f32[96,32,4]{2,1,0} parameter(1)
+  ROOT %dot = f32[8,64,32,4]{3,2,1,0}  dot(%lhs, %rhs), lhs_contracting_dims={2}, rhs_contracting_dims={0}
+}
+)";
+
+  EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{4e-3, 4e-3}));
+}
+
+XLA_TEST_F(DotOperationTextTest, CpuTiledDotEmitterCachingBug_1) {
+  // Tests for a caching bug in the XLA CPU backend.
+  absl::string_view hlo_string =
+      R"(
+HloModule CpuTiledDotEmitterCachingBug
+
+ENTRY main {
+  lhs = f32[20,40] parameter(0)
+  rhs_0 = f32[40,1] parameter(2)
+  rhs_1 = f32[1,40] parameter(1)
+
+  dot_0 = f32[20,1] dot(lhs, rhs_0), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  dot_1 = f32[20,1] dot(lhs, rhs_1), lhs_contracting_dims={1}, rhs_contracting_dims={1}
+
+  ROOT result = f32[20,1] divide(dot_0, dot_1)
+}
+)";
+
+  EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{4e-3, 4e-3}));
+}
+
+XLA_TEST_F(DotOperationTextTest, CpuTiledDotEmitterCachingBug_2) {
+  // Tests for a caching bug in the XLA CPU backend.
+  absl::string_view hlo_string =
+      R"(
+HloModule CpuTiledDotEmitterCachingBug
+
+ENTRY main {
+  lhs_0 = f32[20,40] parameter(0)
+  rhs_0 = f32[40,1] parameter(1)
+  lhs_1 = f32[1,40] parameter(2)
+  rhs_1 = f32[20,40] parameter(3)
+
+  dot_0 = f32[20,1] dot(lhs_0, rhs_0), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  dot_1 = f32[1,20] dot(lhs_1, rhs_1), lhs_contracting_dims={1}, rhs_contracting_dims={1}
+
+  dot_0_reshaped = f32[20] reshape(dot_0)
+  dot_1_reshaped = f32[20] reshape(dot_1)
+
+  ROOT result = f32[20] divide(dot_0_reshaped, dot_1_reshaped)
+}
+)";
+
+  EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{4e-3, 4e-3}));
+}
+
+XLA_TEST_F(DotOperationTextTest, DISABLED_ON_CPU(GpuIntegerDotCodegen)) {
+  absl::string_view hlo_string =
+      R"(
+HloModule SmallIntegerDot
+
+ENTRY SmallIntegerDot {
+  arg0 = s32[1,2,2] parameter(0)
+  arg1 = s32[1,2,1] parameter(1)
+  ROOT dot = s32[1,2,1] dot(arg0, arg1), lhs_batch_dims={0}, lhs_contracting_dims={2}, rhs_batch_dims={0}, rhs_contracting_dims={1}
+}
+)";
+
+  EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{4e-3, 4e-3}));
+}
+
+XLA_TEST_F(DotOperationTextTest, DISABLED_ON_CPU(GpuTransposeOutput)) {
+  absl::string_view hlo_string =
+      R"(
+HloModule TransposeOutput
+
+ENTRY TransposeOutput {
+  p0 = f32[32,32] parameter(0)
+  p1 = f32[32,64] parameter(1)
+  dot = f32[32,64] dot(p0, p1), lhs_contracting_dims={0}, rhs_contracting_dims={0}
+  ROOT tr = f32[64,32] transpose(dot), dimensions={1,0}
+}
+)";
+
+  EXPECT_TRUE(RunAndCompare(hlo_string, ErrorSpec{4e-3, 4e-3}));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
index 7501c6d957e7afe99b8c530e5f0d575f818367da..82e2db36143b2552472fedae701f32389a9be108 100644
--- a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
@@ -135,11 +135,11 @@ class DynamicSliceTest : public ClientLibraryTestBase {
     XlaBuilder builder(TestName());
     // Initialize and transfer dynamic slice start indices parameter.
     XlaOp starts;
-    std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
-        slice_starts, 0, "slice_starts", &builder, &starts);
+    std::unique_ptr<GlobalData> start_data = CreateR0Parameter<IndexT>(
+        slice_starts[0], 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
     auto input = ConstantLiteral(&builder, input_values);
-    DynamicSlice(input, starts, slice_sizes);
+    DynamicSlice(input, absl::Span<const XlaOp>({starts}), slice_sizes);
     // Run computation and compare against expected values.
     ComputeAndCompareLiteral(&builder, expected_values, {start_data.get()});
   }
@@ -160,14 +160,23 @@ class DynamicSliceTest : public ClientLibraryTestBase {
 
     XlaBuilder builder(TestName());
     // Initialize and transfer dynamic slice start indices parameter.
-    XlaOp starts;
-    std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
-        slice_starts, 0, "slice_starts", &builder, &starts);
+    std::vector<XlaOp> starts(2);
+    std::vector<std::unique_ptr<GlobalData>> start_data(2);
+    for (int i = 0; i < 2; ++i) {
+      start_data[i] = CreateR0Parameter<IndexT>(
+          slice_starts[i], i, "slice_starts", &builder, &starts[i]);
+    }
+
     // Build dynamic slice computation.
     auto input = ConstantLiteral(&builder, input_values);
     DynamicSlice(input, starts, slice_sizes);
     // Run computation and compare against expected values.
-    ComputeAndCompareLiteral(&builder, expected_values, {start_data.get()});
+    std::vector<GlobalData*> argument_ptrs;
+    absl::c_transform(start_data, std::back_inserter(argument_ptrs),
+                      [](const std::unique_ptr<GlobalData>& argument) {
+                        return argument.get();
+                      });
+    ComputeAndCompareLiteral(&builder, expected_values, argument_ptrs);
   }
 
   template <typename IndexT, typename DataT>
@@ -186,14 +195,22 @@ class DynamicSliceTest : public ClientLibraryTestBase {
 
     XlaBuilder builder(TestName());
     // Initialize and transfer dynamic slice start indices parameter.
-    XlaOp starts;
-    std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
-        slice_starts, 0, "slice_starts", &builder, &starts);
+    std::vector<XlaOp> starts(3);
+    std::vector<std::unique_ptr<GlobalData>> start_data(3);
+    for (int i = 0; i < 3; ++i) {
+      start_data[i] = CreateR0Parameter<IndexT>(
+          slice_starts[i], i, "slice_starts", &builder, &starts[i]);
+    }
     // Build dynamic slice computation.
     auto input = ConstantLiteral(&builder, input_values);
     DynamicSlice(input, starts, slice_sizes);
     // Run computation and compare against expected values.
-    ComputeAndCompareLiteral(&builder, expected_values, {start_data.get()});
+    std::vector<GlobalData*> argument_ptrs;
+    absl::c_transform(start_data, std::back_inserter(argument_ptrs),
+                      [](const std::unique_ptr<GlobalData>& argument) {
+                        return argument.get();
+                      });
+    ComputeAndCompareLiteral(&builder, expected_values, argument_ptrs);
   }
 };
 
@@ -372,16 +389,12 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
                       .ValueOrDie());
 
     XlaBuilder builder(TestName());
-    // Initialize and transfer dynamic slice start indices parameter.
-    XlaOp starts;
-    std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
-        slice_starts, 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
     auto input = ConstantLiteral(&builder, input_value);
     auto update = ConstantLiteral(&builder, update_value);
-    DynamicUpdateSlice(input, update, starts);
+    DynamicUpdateSlice(input, update, absl::Span<const XlaOp>({}));
     // Run computation and compare against expected values.
-    ComputeAndCompareLiteral(&builder, expected_value, {start_data.get()});
+    ComputeAndCompareLiteral(&builder, expected_value, {});
   }
 
   template <typename IndexT, typename DataT>
@@ -405,12 +418,12 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
     XlaBuilder builder(TestName());
     // Initialize and transfer dynamic slice start indices parameter.
     XlaOp starts;
-    std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
-        slice_starts, 0, "slice_starts", &builder, &starts);
+    std::unique_ptr<GlobalData> start_data = CreateR0Parameter<IndexT>(
+        slice_starts[0], 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
     auto input = ConstantLiteral(&builder, input_values);
     auto update = ConstantLiteral(&builder, update_values);
-    DynamicUpdateSlice(input, update, starts);
+    DynamicUpdateSlice(input, update, absl::Span<const XlaOp>({starts}));
     // Run computation and compare against expected values.
     ComputeAndCompareLiteral(&builder, expected_values, {start_data.get()});
   }
@@ -435,15 +448,23 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
 
     XlaBuilder builder(TestName());
     // Initialize and transfer dynamic slice start indices parameter.
-    XlaOp starts;
-    std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
-        slice_starts, 0, "slice_starts", &builder, &starts);
+    std::vector<XlaOp> starts(2);
+    std::vector<std::unique_ptr<GlobalData>> start_data(2);
+    for (int i = 0; i < 2; ++i) {
+      start_data[i] = CreateR0Parameter<IndexT>(
+          slice_starts[i], i, "slice_starts", &builder, &starts[i]);
+    }
     // Build dynamic slice computation.
     auto input = ConstantLiteral(&builder, input_values);
     auto update = ConstantLiteral(&builder, update_values);
     DynamicUpdateSlice(input, update, starts);
     // Run computation and compare against expected values.
-    ComputeAndCompareLiteral(&builder, expected_values, {start_data.get()});
+    std::vector<GlobalData*> argument_ptrs;
+    absl::c_transform(start_data, std::back_inserter(argument_ptrs),
+                      [](const std::unique_ptr<GlobalData>& argument) {
+                        return argument.get();
+                      });
+    ComputeAndCompareLiteral(&builder, expected_values, argument_ptrs);
   }
 
   template <typename IndexT, typename DataT>
@@ -466,15 +487,24 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
 
     XlaBuilder builder(TestName());
     // Initialize and transfer dynamic slice start indices parameter.
-    XlaOp starts;
-    std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
-        slice_starts, 0, "slice_starts", &builder, &starts);
+    std::vector<XlaOp> starts(3);
+    std::vector<std::unique_ptr<GlobalData>> start_data(3);
+    for (int i = 0; i < 3; ++i) {
+      start_data[i] = CreateR0Parameter<IndexT>(
+          slice_starts[i], i, "slice_starts", &builder, &starts[i]);
+    }
+
     // Build dynamic slice computation.
     auto input = ConstantLiteral(&builder, input_values);
     auto update = ConstantLiteral(&builder, update_values);
     DynamicUpdateSlice(input, update, starts);
     // Run computation and compare against expected values.
-    ComputeAndCompareLiteral(&builder, expected_values, {start_data.get()});
+    std::vector<GlobalData*> argument_ptrs;
+    absl::c_transform(start_data, std::back_inserter(argument_ptrs),
+                      [](const std::unique_ptr<GlobalData>& argument) {
+                        return argument.get();
+                      });
+    ComputeAndCompareLiteral(&builder, expected_values, argument_ptrs);
   }
 
   template <class T>
@@ -518,8 +548,9 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
     XlaOp update;
     std::unique_ptr<GlobalData> update_data = CreateR3Parameter<T>(
         update_values, 1, "update_values", &builder, &update);
-    auto starts = ConstantR1<int32>(&builder, {index, 0, 0});
-    DynamicUpdateSlice(input, update, starts);
+    auto constant_index = ConstantR0<int32>(&builder, index);
+    auto zero = ConstantR0<int32>(&builder, 0);
+    DynamicUpdateSlice(input, update, {constant_index, zero, zero});
 
     // Run computation and compare against expected values.
     ComputeAndCompareR3<T>(&builder, expected_values,
@@ -720,46 +751,55 @@ void BM_DynamicSlice(int num_iters) {
         {{13, 14, 15, 16}, {17, 18, 19, 20}, {21, 22, 23, 24}}}});
   auto input = ConstantLiteral(&builder, input_literal);
 
+  auto stream =
+      client->mutable_backend()->BorrowStream(device_ordinal).ValueOrDie();
+
   // Create dynamic slice start indices as a parameter: shape [4]
-  auto start_indices_shape = ShapeUtil::MakeShape(S32, {4});
-  auto start_indices =
-      Parameter(&builder, 0, start_indices_shape, "start_indices");
+  auto start_indices_shape = ShapeUtil::MakeShape(S32, {});
+  std::vector<XlaOp> start_indices(4);
+  std::vector<ScopedShapedBuffer> shaped_buffers;
+  std::vector<const Shape*> host_shapes(4);
+  for (int i = 0; i < 4; ++i) {
+    start_indices[i] =
+        Parameter(&builder, i, start_indices_shape, "start_indices");
+    auto start_index_literal = LiteralUtil::CreateR0<int32>(i + 1);
+    // Initialize and transfer parameter buffer.
+    shaped_buffers.emplace_back(
+        client->backend()
+            .transfer_manager()
+            ->AllocateScopedShapedBuffer(start_indices_shape, &allocator,
+                                         /*device_ordinal=*/0)
+            .ConsumeValueOrDie());
+    host_shapes[i] = &shaped_buffers[i].on_host_shape();
+    ASSERT_IS_OK(transfer_manager->TransferLiteralToDevice(
+        stream.get(), start_index_literal, shaped_buffers[i]));
+  }
+
   // Add DynamicSlice op to the computatation.
   DynamicSlice(input, start_indices, {1, 1, 1, 1});
   auto computation = builder.Build().ConsumeValueOrDie();
 
-  // Initialize and transfer parameter buffer.
-  auto buffer = client->backend()
-                    .transfer_manager()
-                    ->AllocateScopedShapedBuffer(
-                        start_indices_shape, &allocator, /*device_ordinal=*/0)
-                    .ConsumeValueOrDie();
-
-  auto start_indices_literal = LiteralUtil::CreateR1<int32>({0, 1, 2, 3});
-  auto stream =
-      client->mutable_backend()->BorrowStream(device_ordinal).ValueOrDie();
-  ASSERT_IS_OK(transfer_manager->TransferLiteralToDevice(
-      stream.get(), start_indices_literal, buffer));
-
   std::unique_ptr<LocalExecutable> executable =
-      client
-          ->Compile(computation, {&buffer.on_host_shape()},
-                    ExecutableBuildOptions())
+      client->Compile(computation, host_shapes, ExecutableBuildOptions())
           .ConsumeValueOrDie();
 
   // Run some warm-up executions.
   ExecutableRunOptions options;
   options.set_allocator(&allocator);
   const int kWarmups = 2;
+  std::vector<const ShapedBuffer*> shaped_buffer_ptrs;
+  absl::c_transform(shaped_buffers, std::back_inserter(shaped_buffer_ptrs),
+                    [](const ScopedShapedBuffer& buffer) { return &buffer; });
+
   for (int i = 0; i < kWarmups; ++i) {
-    auto result = executable->Run({&buffer}, options);
+    auto result = executable->Run(shaped_buffer_ptrs, options);
     ASSERT_TRUE(result.ok());
   }
 
   // Run benchmark.
   tensorflow::testing::StartTiming();
   for (int i = 0; i < num_iters; ++i) {
-    auto result = executable->Run({&buffer}, options);
+    auto result = executable->Run(shaped_buffer_ptrs, options);
     ASSERT_TRUE(result.ok());
   }
 }
diff --git a/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc b/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc
index c84973e17b234c24c84f02a369ce0185f5772cca..b961e6102692cb3b90976d621c62cb4cf18a9b6b 100644
--- a/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc
+++ b/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <cmath>
 #include "absl/base/casts.h"
+#include "tensorflow/compiler/xla/client/lib/math.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
@@ -21,66 +23,166 @@ limitations under the License.
 
 namespace xla {
 namespace {
+
 class ExhaustiveF32ElementwiseOpTest
     : public ClientLibraryTestBase,
       public ::testing::WithParamInterface<std::pair<int64, int64>> {
  protected:
-  ErrorSpec error_spec_{0.0001, 0.0001, /*relaxed_nans=*/true};
+  ErrorSpec error_spec_{0.0001, 0.0001};
+
+  bool IsClose(float expected, float actual) {
+    float abs_err = std::abs(expected - actual);
+    float rel_err = abs_err / std::abs(expected);
+    return abs_err < error_spec_.abs || rel_err < error_spec_.rel ||
+           (std::isnan(expected) && std::isnan(actual)) ||
+           (std::isinf(expected) && std::isinf(actual) &&
+            (expected > 0) == (actual > 0));
+  }
 
   template <typename EnqueueOpTy>
   void ExhaustivelyTestF32Op(EnqueueOpTy enqueue_op,
                              float (*evaluate_op)(float),
                              std::pair<int64, int64> known_incorrect_range) {
+    SetFastMathDisabled(true);
+
     int64 begin, end;
     std::tie(begin, end) = GetParam();
     int64 input_size = end - begin;
+
+    if (begin >= known_incorrect_range.first &&
+        end <= known_incorrect_range.second) {
+      LOG(INFO) << absl::StreamFormat(
+          "Skipping this shard, as the range under test, [%d, %d), falls "
+          "entirely within the known-incorrect range [%d, %d).",
+          begin, end, known_incorrect_range.first,
+          known_incorrect_range.second);
+      return;
+    }
+
     LOG(INFO) << "Checking range [" << begin << ", " << end << ")";
 
     XlaBuilder builder(TestName());
 
-    Literal input_literal =
-        LiteralUtil::CreateFromDimensions(F32, {input_size});
-    for (int64 i = begin; i < end; i++) {
+    auto ith_input_elem = [&](int64 i) -> float {
+      i += begin;
+      // If the operation is known to be buggy on a specific input clamp that
+      // input to 0 under the assumption that the op is at least correct on 0.
       if (i >= known_incorrect_range.first &&
           i < known_incorrect_range.second) {
-        // If the operation is known to be buggy on a specific input clamp that
-        // input to 0 under the assumption that the op is at least correct on 0.
-        input_literal.Set({i - begin}, 0.0f);
-      } else {
-        input_literal.Set({i - begin}, absl::bit_cast<float, int>(i));
+        return 0;
       }
-    }
-
-    TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GlobalData> input_data,
-                            client_->TransferToServer(input_literal));
+      return absl::bit_cast<float, int32>(i);
+    };
 
+    Literal input_literal =
+        LiteralUtil::CreateFromDimensions(F32, {input_size});
+    absl::Span<float> input_arr = input_literal.data<float>();
+    for (int64 i = 0; i < input_size; i++) {
+      input_arr[i] = ith_input_elem(i);
+    }
     auto input = Parameter(&builder, 0, input_literal.shape(), "input");
     enqueue_op(&builder, input);
+    TF_ASSERT_OK_AND_ASSIGN(XlaComputation comp, builder.Build());
+
+    // Build and run the computation using the LocalClient API, rather than the
+    // plain Client API, which is used by ClientLibraryTestBase.  This is
+    // because the plain Client API results does more memcpys to/from Literals,
+    // and that's slow given that we're touching a lot of data here.
+    //
+    // Copy debug options from ClientLibraryTestBase.  In particular, we're
+    // interested in disabling constant folding.
+    ExecutableBuildOptions build_opts;
+    *build_opts.mutable_debug_options() = *mutable_debug_options();
+    TF_ASSERT_OK_AND_ASSIGN(
+        auto executable,
+        client_->Compile(comp, {&input_literal.shape()}, build_opts));
+
+    TF_ASSERT_OK_AND_ASSIGN(
+        ScopedShapedBuffer input_data,
+        client_->LiteralToShapedBuffer(input_literal, /*device_ordinal=*/0));
+
+    ExecutableRunOptions run_opts;
+    run_opts.set_allocator(client_->backend().memory_allocator());
+    run_opts.set_intra_op_thread_pool(
+        client_->backend().eigen_intra_op_thread_pool_device());
+    TF_ASSERT_OK_AND_ASSIGN(ScopedShapedBuffer result,
+                            executable->Run({&input_data}, run_opts));
+
+    TF_ASSERT_OK_AND_ASSIGN(Literal result_literal,
+                            client_->ShapedBufferToLiteral(result));
+
+    // We essentially reimplement LiteralTestUtil::Near here because
+    //  a) this streamlined implementation is much faster, and
+    //  b) we can print out better error messages (namely, we can print out
+    //     which floating-point value input failed, while LiteralTestUtil::Near
+    //     can only print out the input index that failed).
+    //  c) we need special handling of certain inputs.  For example, we say that
+    //     a denormal input has multiple correct outputs (namely, f(x) and f(0))
+    //     and just needs to be close to one of them.
+    absl::Span<float> result_arr = result_literal.data<float>();
+    ASSERT_EQ(result_arr.size(), input_arr.size());
+    int64 mismatches = 0;
+    // Hoisting this out of the loop is a nice speedup on shards that have many
+    // denormals.
+    const float expected_at_zero = evaluate_op(0);
+    for (int64 i = 0; i < input_arr.size(); ++i) {
+      float input = ith_input_elem(i);
+      float actual = result_arr[i];
+      float expected = evaluate_op(input);
+      if (IsClose(expected, actual)) {
+        continue;
+      }
 
-    std::vector<float> expected_result;
-    expected_result.reserve(input_size);
-    for (int64 i = 0; i < input_size; i++) {
-      expected_result.push_back(evaluate_op(input_literal.Get<float>({i})));
-    }
+      constexpr int64 kMaxMismatchesPrinted = 1000;
+      if (std::fpclassify(input) == FP_SUBNORMAL) {
+        // For denormal inputs, we accept answers that are close to either
+        //   - evaluate_op(input) OR
+        //   - evaluate_op(0).
+        if (IsClose(expected_at_zero, actual)) {
+          continue;
+        }
+        ++mismatches;
+        if (mismatches < kMaxMismatchesPrinted || VLOG_IS_ON(2)) {
+          // Use %0.9g because that's guaranteed to print an f32 to full
+          // precision.
+          LOG(ERROR) << absl::StreamFormat(
+              "Mismatch on denormal value %0.9g (0x%08x). Expected either "
+              "%0.9g (0x%08x) (evaluated at true value) or %0.9g (0x%08x) "
+              "(evaluated at zero), but got %0.9g (0x%08x).",
+              input, absl::bit_cast<uint32>(input),        //
+              expected, absl::bit_cast<uint32>(expected),  //
+              expected_at_zero, absl::bit_cast<uint32>(expected_at_zero),
+              actual, absl::bit_cast<uint32>(actual));
+        }
+      } else {
+        mismatches++;
+        if (mismatches < kMaxMismatchesPrinted || VLOG_IS_ON(2)) {
+          LOG(ERROR) << absl::StreamFormat(
+              "Mismatch on %0.9g (0x%08x). Expected %0.9g (0x%08x), but got "
+              "%0.9g (0x%08x).",
+              input, absl::bit_cast<uint32>(input),        //
+              expected, absl::bit_cast<uint32>(expected),  //
+              actual, absl::bit_cast<uint32>(actual));
+        }
+      }
 
-    ComputeAndCompareR1<float>(&builder, expected_result, {input_data.get()},
-                               error_spec_);
+      if (mismatches == kMaxMismatchesPrinted && !VLOG_IS_ON(2)) {
+        LOG(ERROR) << "Not printing any more mismatches; pass "
+                      "--vmodule=exhaustive_f32_elementwise_op_test=2 to see "
+                      "all of them.";
+      }
+    }
+    EXPECT_EQ(mismatches, 0);
   }
 };
 
 XLA_TEST_P(ExhaustiveF32ElementwiseOpTest, LogF32) {
-#ifdef XLA_TEST_BACKEND_CPU
-  // TODO(b/73141998): The vectorized Log implementation gives results outside
-  // our error spec in this range (these numbers are bitwise representations of
-  // floats expressed as a zero extended int64).
-  std::pair<int64, int64> known_incorrect_range = {1, 8388608};
-#else
-  std::pair<int64, int64> known_incorrect_range = {0, 0};
+#if !defined(XLA_TEST_BACKEND_CPU) && !defined(XLA_TEST_BACKEND_GPU)
+  error_spec_ = ErrorSpec{0.001, 0.001};
 #endif
-
   ExhaustivelyTestF32Op(
       [](XlaBuilder* builder, const XlaOp& input) { Log(input); }, std::log,
-      known_incorrect_range);
+      /*known_incorrect_range=*/{0, 0});
 }
 
 XLA_TEST_P(ExhaustiveF32ElementwiseOpTest, ExpF32) {
@@ -105,6 +207,18 @@ XLA_TEST_P(ExhaustiveF32ElementwiseOpTest, TanhF32) {
       /*known_incorrect_range=*/{0, 0});
 }
 
+XLA_TEST_P(ExhaustiveF32ElementwiseOpTest, ErfF32) {
+  ExhaustivelyTestF32Op(
+      [](XlaBuilder* builder, const XlaOp& input) { Erf(input); }, std::erf,
+      /*known_incorrect_range=*/{0, 0});
+}
+
+XLA_TEST_P(ExhaustiveF32ElementwiseOpTest, ErfcF32) {
+  ExhaustivelyTestF32Op(
+      [](XlaBuilder* builder, const XlaOp& input) { Erfc(input); }, std::erfc,
+      /*known_incorrect_range=*/{0, 0});
+}
+
 std::vector<std::pair<int64, int64>> CreateExhaustiveParameters() {
   // We break up the 2^32-element space into small'ish chunks to keep peak
   // memory usage low.
diff --git a/tensorflow/compiler/xla/tests/filecheck.cc b/tensorflow/compiler/xla/tests/filecheck.cc
index dcb469087e0064d17ce3b04fdeaf0b6136069a55..1b0bebe2d03a9a153cd0c80329ed0c49c91333a3 100644
--- a/tensorflow/compiler/xla/tests/filecheck.cc
+++ b/tensorflow/compiler/xla/tests/filecheck.cc
@@ -48,7 +48,7 @@ StatusOr<bool> RunFileCheck(const string& input, const string& pattern) {
 
   tensorflow::SubProcess file_check_process;
   file_check_process.SetProgram(file_check_path,
-                                {file_check_path, pattern_path});
+                                {file_check_path, "-v", pattern_path});
   file_check_process.SetChannelAction(tensorflow::CHAN_STDIN,
                                       tensorflow::ACTION_PIPE);
   file_check_process.SetChannelAction(tensorflow::CHAN_STDERR,
@@ -71,9 +71,7 @@ StatusOr<bool> RunFileCheck(const string& input, const string& pattern) {
       LOG(WARNING) << "NOTE: FileCheck binary does not exist!";
     }
 
-    LOG(WARNING) << "FileCheck error: " << standard_error;
-    LOG(WARNING) << "FileCheck input was:";
-    XLA_LOG_LINES(tensorflow::WARNING, input);
+    LOG(WARNING) << "FileCheck error:\n" << standard_error;
     LOG(WARNING) << "FileCheck pattern was:";
     XLA_LOG_LINES(tensorflow::WARNING, pattern);
   } else if (!standard_error.empty()) {
diff --git a/tensorflow/compiler/xla/tests/fmax_fmin_test.cc b/tensorflow/compiler/xla/tests/fmax_fmin_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7423ac0bcdb0bc305ee384fb98bd17413404ecef
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/fmax_fmin_test.cc
@@ -0,0 +1,88 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <vector>
+
+#include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace {
+
+class FmaxSimpleTest : public ClientLibraryTestBase {};
+
+TEST_F(FmaxSimpleTest, FmaxTenValues) {
+  SetFastMathDisabled(true);
+  XlaBuilder builder(TestName());
+  auto x = ConstantR1<float>(
+      &builder, {-0.0, 1.0, 2.0, -3.0, -4.0, 5.0, 6.0, -7.0, -8.0, 9.0});
+  auto y = ConstantR1<float>(
+      &builder, {-0.0, -1.0, -2.0, 3.0, 4.0, -5.0, -6.0, 7.0, 8.0, -9.0});
+  Max(x, y);
+
+  std::vector<float> expected = {-0.0, 1.0, 2.0, 3.0, 4.0,
+                                 5.0,  6.0, 7.0, 8.0, 9.0};
+  ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
+}
+
+TEST_F(FmaxSimpleTest, FmaxEdgeCases) {
+  SetFastMathDisabled(true);
+  XlaBuilder builder(TestName());
+  XlaOp param0, param1;
+  std::unique_ptr<GlobalData> param0_data = CreateR1Parameter<float>(
+      {INFINITY, INFINITY, INFINITY, -INFINITY, INFINITY, -INFINITY, NAN,
+       INFINITY, -INFINITY, NAN},
+      /*parameter_number=*/0, /*name=*/"param0",
+      /*builder=*/&builder, /*data_handle=*/&param0);
+  std::unique_ptr<GlobalData> param1_data = CreateR1Parameter<float>(
+      {INFINITY, -INFINITY, NAN, NAN, -4.0, -5.0, -6.0, 7.0, 8.0, 9.0},
+      /*parameter_number=*/1, /*name=*/"param1",
+      /*builder=*/&builder, /*data_handle=*/&param1);
+
+  Max(param0, param1);
+  std::vector<float> expected = {INFINITY, INFINITY, NAN,      NAN, INFINITY,
+                                 -5,       NAN,      INFINITY, 8,   NAN};
+  ComputeAndCompareR1<float>(&builder, expected,
+                             {param0_data.get(), param1_data.get()},
+                             ErrorSpec(0.0001));
+}
+
+TEST_F(FmaxSimpleTest, FminEdgeCases) {
+  SetFastMathDisabled(true);
+  XlaBuilder builder(TestName());
+  XlaOp param0, param1;
+  std::unique_ptr<GlobalData> param0_data = CreateR1Parameter<float>(
+      {INFINITY, INFINITY, INFINITY, -INFINITY, INFINITY, -INFINITY, NAN,
+       INFINITY, -INFINITY, NAN},
+      /*parameter_number=*/0, /*name=*/"param0",
+      /*builder=*/&builder, /*data_handle=*/&param0);
+  std::unique_ptr<GlobalData> param1_data = CreateR1Parameter<float>(
+      {INFINITY, -INFINITY, NAN, NAN, -4.0, -5.0, -6.0, 7.0, 8.0, 9.0},
+      /*parameter_number=*/1, /*name=*/"param1",
+      /*builder=*/&builder, /*data_handle=*/&param1);
+
+  Min(param0, param1);
+  std::vector<float> expected = {INFINITY,  -INFINITY, NAN, NAN,       -4,
+                                 -INFINITY, NAN,       7,   -INFINITY, NAN};
+  ComputeAndCompareR1<float>(&builder, expected,
+                             {param0_data.get(), param1_data.get()},
+                             ErrorSpec(0.0001));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/fmax_test.cc b/tensorflow/compiler/xla/tests/fmax_test.cc
deleted file mode 100644
index c5bbbe778df15d63a2586bd6291a7a33fc82aa52..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/tests/fmax_test.cc
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <vector>
-
-#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_builder.h"
-#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
-#include "tensorflow/compiler/xla/tests/literal_test_util.h"
-#include "tensorflow/core/platform/test.h"
-
-namespace xla {
-namespace {
-
-class FmaxSimpleTest : public ClientLibraryTestBase {};
-
-TEST_F(FmaxSimpleTest, FmaxTenValues) {
-  XlaBuilder builder(TestName());
-  auto x = ConstantR1<float>(
-      &builder, {-0.0, 1.0, 2.0, -3.0, -4.0, 5.0, 6.0, -7.0, -8.0, 9.0});
-  auto y = ConstantR1<float>(
-      &builder, {-0.0, -1.0, -2.0, 3.0, 4.0, -5.0, -6.0, 7.0, 8.0, -9.0});
-  Max(x, y);
-
-  std::vector<float> expected = {-0.0, 1.0, 2.0, 3.0, 4.0,
-                                 5.0,  6.0, 7.0, 8.0, 9.0};
-  ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
-}
-
-}  // namespace
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/fusion_test.cc b/tensorflow/compiler/xla/tests/fusion_test.cc
index d1fddf9d6b494a822610e41307fa103dc90bdef3..2178c9b3f3d39ac034c59585c6836d2bc59162c1 100644
--- a/tensorflow/compiler/xla/tests/fusion_test.cc
+++ b/tensorflow/compiler/xla/tests/fusion_test.cc
@@ -523,10 +523,10 @@ XLA_TEST_F(FusionTest, DynamicSliceNegate) {
   auto const0 = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<int32>({1, 2, 3, 4})));
   auto const1 = builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR1<int32>({1})));
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(1)));
   auto dynamic_slice2 =
       builder.AddInstruction(HloInstruction::CreateDynamicSlice(
-          ShapeUtil::MakeShape(S32, {2}), const0, const1, {2}));
+          ShapeUtil::MakeShape(S32, {2}), const0, {const1}, {2}));
   auto negate3 = builder.AddInstruction(HloInstruction::CreateUnary(
       ShapeUtil::MakeShape(S32, {2}), HloOpcode::kNegate, dynamic_slice2));
   hlo_module->AddEntryComputation(builder.Build())
diff --git a/tensorflow/compiler/xla/tests/gather_operation_test.cc b/tensorflow/compiler/xla/tests/gather_operation_test.cc
index daa89398a697af9149797d621c3bdca80a00aedd..d65b67a535d43553a3a94f76482ad4618f9b8aab 100644
--- a/tensorflow/compiler/xla/tests/gather_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/gather_operation_test.cc
@@ -600,7 +600,9 @@ ENTRY main {
 
 class GatherClientLibraryTest : public ClientLibraryTestBase {};
 
-XLA_TEST_F(GatherClientLibraryTest, DISABLED_ON_GPU(Basic)) {
+// Disabled on interpreter since ExectuteAsyncOnStream is not supported.
+XLA_TEST_F(GatherClientLibraryTest,
+           DISABLED_ON_INTERPRETER(DISABLED_ON_GPU(Basic))) {
   // We create this HLO, but using the XlaBuilder API.
   //
   // ENTRY main {
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc
index 989a7c705a8254f99e5cc0e97dfde5942f146964..0151981ef16aabe9e363bc4d7f9ba96d4a1f170f 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc
@@ -139,7 +139,8 @@ std::unique_ptr<VerifiedHloModule> HloTestBase::CreateNewVerifiedModule(
     const string& name) {
   return absl::make_unique<VerifiedHloModule>(
       name, GetModuleConfigForTest(), verifier_layout_sensitive_,
-      allow_mixed_precision_in_hlo_verifier_);
+      allow_mixed_precision_in_hlo_verifier_,
+      backend().compiler()->ShapeSizeBytesFunction());
 }
 
 StatusOr<std::unique_ptr<VerifiedHloModule>>
@@ -147,7 +148,8 @@ HloTestBase::ParseAndReturnVerifiedModule(absl::string_view hlo_text,
                                           const HloModuleConfig& config) {
   auto module = absl::make_unique<VerifiedHloModule>(
       TestName(), config, verifier_layout_sensitive_,
-      allow_mixed_precision_in_hlo_verifier_);
+      allow_mixed_precision_in_hlo_verifier_,
+      backend().compiler()->ShapeSizeBytesFunction());
   TF_RETURN_IF_ERROR(ParseHloString(hlo_text, module.get()));
   TF_RETURN_IF_ERROR(module->Verify());
   return std::move(module);
@@ -181,6 +183,7 @@ DebugOptions HloTestBase::GetDebugOptionsForTest() {
   // TODO(b/38354253): Change tests to use Parameters instead of Constants.
   debug_options.add_xla_disable_hlo_passes("constant_folding");
   debug_options.set_xla_gpu_max_kernel_unroll_factor(1);
+  debug_options.set_xla_hlo_evaluator_use_fast_path(true);
   return debug_options;
 }
 
@@ -202,6 +205,17 @@ Literal HloTestBase::ExecuteAndTransfer(std::unique_ptr<HloModule> module,
   return test_runner_.Execute(std::move(module), arguments).ValueOrDie();
 }
 
+StatusOr<std::vector<Literal>> HloTestBase::ExecuteReplicated(
+    std::unique_ptr<HloModule> module, absl::Span<Literal* const> arguments,
+    int64 num_replicas) {
+  HloRunner::ReplicatedExecuteOptions options;
+  options.num_replicas = num_replicas;
+  for (auto argument : arguments) {
+    options.arguments.push_back(argument);
+  }
+  return test_runner_.ExecuteReplicated(std::move(module), options);
+}
+
 StatusOr<std::unique_ptr<HloModule>> HloTestBase::MakeReferenceModule(
     const HloModule& test_module,
     const std::function<void(HloModule*)>& reference_preprocessor) {
@@ -310,7 +324,10 @@ StatusOr<::testing::AssertionResult> HloTestBase::RunAndCompareInternal(
                        reference_preprocessor);
 }
 
-::testing::AssertionResult HloTestBase::Run(string_view hlo_string) {
+::testing::AssertionResult HloTestBase::Run(string_view hlo_string,
+                                            bool run_hlo_passes,
+                                            ExecutionProfile* profile,
+                                            string backend_config) {
   auto module_or_status =
       HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest());
   if (!module_or_status.ok()) {
@@ -318,19 +335,108 @@ StatusOr<::testing::AssertionResult> HloTestBase::RunAndCompareInternal(
            << "Error while parsing HLO text format: "
            << module_or_status.status().ToString();
   }
+
+  std::unique_ptr<HloModule> module = std::move(module_or_status.ValueOrDie());
   const auto& fake_arguments =
-      MakeFakeArguments(module_or_status.ValueOrDie().get())
-          .ConsumeValueOrDie();
+      MakeFakeArguments(module.get()).ConsumeValueOrDie();
   std::vector<Literal*> fake_argument_ptrs;
   absl::c_transform(
       fake_arguments, std::back_inserter(fake_argument_ptrs),
       [](const Literal& literal) { return const_cast<Literal*>(&literal); });
-  return test_runner_
-                 .Execute(std::move(module_or_status.ValueOrDie()),
-                          fake_argument_ptrs, /*run_hlo_passes=*/true)
-                 .ok()
+
+  if (profile != nullptr) {
+    // We have to enable HLO profiling since otherwise currently the
+    // ExecutionProfile is not correct.
+    //
+    // TODO(b/119432044): Fix collection of the ExecutionProfile
+    // so that this is not necessary.
+    HloModuleConfig config = module->config();
+    DebugOptions debug_options = config.debug_options();
+    debug_options.set_xla_hlo_profile(true);
+    config.set_debug_options(debug_options);
+    module->set_config(config);
+  }
+
+  if (!backend_config.empty()) {
+    // Set backend configuration if it is given.
+    HloInstruction* instruction =
+        module->entry_computation()->root_instruction();
+    instruction->set_raw_backend_config_string(backend_config);
+  }
+
+  // return ::testing::AssertionSuccess();
+  auto output = test_runner_.Execute(std::move(module), fake_argument_ptrs,
+                                     /*run_hlo_passes=*/run_hlo_passes,
+                                     /*profile=*/profile);
+
+  return output.ok()
              ? ::testing::AssertionSuccess()
-             : ::testing::AssertionFailure();
+             : ::testing::AssertionFailure() << output.status().error_message();
+}
+
+::testing::AssertionResult HloTestBase::RunMultipleTimes(
+    string_view hlo_string, bool run_hlo_passes,
+    std::vector<ExecutionProfile>* profiles, string backend_config) {
+  int n = profiles->size();
+  std::vector<std::vector<Literal*>> fake_argument_ptrs(n);
+  std::vector<std::vector<Literal>> fake_arguments(n);
+  std::vector<std::unique_ptr<Executable>> executables(n);
+
+  for (int i = 0; i < n; ++i) {
+    auto module_or_status =
+        HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest());
+    if (!module_or_status.ok()) {
+      return ::testing::AssertionFailure()
+             << "Error while parsing HLO text format: "
+             << module_or_status.status().ToString();
+    }
+    std::unique_ptr<HloModule> module =
+        std::move(module_or_status.ValueOrDie());
+
+    fake_arguments[i] = MakeFakeArguments(module.get()).ConsumeValueOrDie();
+    absl::c_transform(
+        fake_arguments[i], std::back_inserter(fake_argument_ptrs[i]),
+        [](const Literal& literal) { return const_cast<Literal*>(&literal); });
+
+    if (profiles != nullptr) {
+      // We have to enable HLO profiling since otherwise currently the
+      // ExecutionProfile is not correct.
+      //
+      // TODO(b/119432044): Fix collection of the ExecutionProfile
+      // so that this is not necessary.
+      HloModuleConfig config = module->config();
+      DebugOptions debug_options = config.debug_options();
+      debug_options.set_xla_hlo_profile(true);
+      config.set_debug_options(debug_options);
+      module->set_config(config);
+    }
+
+    if (!backend_config.empty()) {
+      // Set backend configuration if it is given.
+      HloInstruction* instruction =
+          module->entry_computation()->root_instruction();
+      instruction->set_raw_backend_config_string(backend_config);
+    }
+
+    auto executable =
+        test_runner_.CreateExecutable(std::move(module), run_hlo_passes);
+    if (!executable.ok()) {
+      return ::testing::AssertionFailure()
+             << executable.status().error_message();
+    }
+    executables[i] = std::move(executable.ValueOrDie());
+  }
+
+  for (int i = 0; i < n; ++i) {
+    auto output =
+        test_runner_.Execute(std::move(executables[i]), fake_argument_ptrs[i],
+                             /*profile=*/&((*profiles)[i]));
+    if (!output.ok()) {
+      return ::testing::AssertionFailure() << output.status().error_message();
+    }
+  }
+
+  return ::testing::AssertionSuccess();
 }
 
 ::testing::AssertionResult HloTestBase::RunAndCompareFromFile(
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h
index 1d1e7f437296a7493ef7da07039fcf6d273f35bc..3c2bcbb5df5ce94dd37f63d0c0e609f3ad2b60aa 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.h
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.h
@@ -46,10 +46,12 @@ class VerifiedHloModule : public HloModule {
  public:
   VerifiedHloModule(const string& name, const HloModuleConfig& config,
                     bool verifier_layout_sensitive,
-                    bool allow_mixed_precision_in_hlo_verifier)
+                    bool allow_mixed_precision_in_hlo_verifier,
+                    std::function<int64(const Shape&)> shape_size_function)
       : HloModule(name, config),
-        verifier_(verifier_layout_sensitive,
-                  allow_mixed_precision_in_hlo_verifier) {}
+        verifier_(
+            verifier_layout_sensitive, allow_mixed_precision_in_hlo_verifier,
+            /*instruction_can_change_layout_func=*/{}, shape_size_function) {}
 
   ~VerifiedHloModule() override { VerifyOrAddFailure("in destructor"); }
 
@@ -171,6 +173,11 @@ class HloTestBase : public ::testing::Test {
   Literal ExecuteAndTransfer(std::unique_ptr<HloModule> module,
                              absl::Span<Literal* const> arguments);
 
+  // Executes the given module on multiple replicas.
+  StatusOr<std::vector<Literal>> ExecuteReplicated(
+      std::unique_ptr<HloModule> module, absl::Span<Literal* const> arguments,
+      int64 num_replicas);
+
   // Executes the given hlo module on two backends and compares results.
   //
   // 'arguments': the input of the hlo module.
@@ -219,8 +226,14 @@ class HloTestBase : public ::testing::Test {
       const absl::optional<ErrorSpec>& error,
       const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
       TF_MUST_USE_RESULT;
-  ::testing::AssertionResult Run(const absl::string_view hlo_string)
-      TF_MUST_USE_RESULT;
+  ::testing::AssertionResult Run(const absl::string_view hlo_string,
+                                 bool run_hlo_passes = true,
+                                 ExecutionProfile* profile = nullptr,
+                                 string backend_config = "") TF_MUST_USE_RESULT;
+  ::testing::AssertionResult RunMultipleTimes(
+      const absl::string_view hlo_string, bool run_hlo_passes,
+      std::vector<ExecutionProfile>* profiles,
+      string backend_config = "") TF_MUST_USE_RESULT;
   ::testing::AssertionResult RunAndCompareFromFile(
       const string& filename, const absl::optional<ErrorSpec>& error,
       const std::function<void(HloModule*)>& reference_preprocessor = nullptr)
diff --git a/tensorflow/compiler/xla/tests/iota_test.cc b/tensorflow/compiler/xla/tests/iota_test.cc
index 65205f53ddc582ae477d67705f161fef1e31b857..37b2c635eebe57590e1ba73c62f015ccf399b548 100644
--- a/tensorflow/compiler/xla/tests/iota_test.cc
+++ b/tensorflow/compiler/xla/tests/iota_test.cc
@@ -80,7 +80,7 @@ TEST_P(IotaR2Test, DoIt) {
 }
 
 INSTANTIATE_TEST_CASE_P(IotaR2TestInstantiation, IotaR2Test,
-                        ::testing::Combine(::testing::Values(F32, S32),
+                        ::testing::Combine(::testing::Values(F32, S32, BF16),
                                            ::testing::Range(/*start=*/10,
                                                             /*end=*/1001,
                                                             /*step=*/10),
diff --git a/tensorflow/compiler/xla/tests/literal_test_util.cc b/tensorflow/compiler/xla/tests/literal_test_util.cc
index 554eb24d44168caa7d7252015e3d99f2d567df9b..a2fd6070731943f15c773265f428b16f520d02ee 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util.cc
+++ b/tensorflow/compiler/xla/tests/literal_test_util.cc
@@ -86,7 +86,7 @@ void OnMiscompare(const LiteralSlice& expected, const LiteralSlice& actual,
 
 /* static */ ::testing::AssertionResult LiteralTestUtil::Near(
     const LiteralSlice& expected, const LiteralSlice& actual,
-    const ErrorSpec& error_spec, bool detailed_message) {
+    const ErrorSpec& error_spec, absl::optional<bool> detailed_message) {
   return StatusToAssertion(literal_comparison::Near(
       expected, actual, error_spec, detailed_message, &OnMiscompare));
 }
@@ -97,7 +97,8 @@ void OnMiscompare(const LiteralSlice& expected, const LiteralSlice& actual,
   if (error.has_value()) {
     VLOG(1) << "Expects near";
     return StatusToAssertion(literal_comparison::Near(
-        expected, actual, *error, /*detailed_message=*/false, &OnMiscompare));
+        expected, actual, *error, /*detailed_message=*/absl::nullopt,
+        &OnMiscompare));
   }
   VLOG(1) << "Expects equal";
   return StatusToAssertion(literal_comparison::Equal(expected, actual));
diff --git a/tensorflow/compiler/xla/tests/literal_test_util.h b/tensorflow/compiler/xla/tests/literal_test_util.h
index 43cca91f64b2c0fbfde5054a361cf0f95302c23d..d7cf9bed98a3eb7479b6deb6838dc388a0869360 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util.h
+++ b/tensorflow/compiler/xla/tests/literal_test_util.h
@@ -93,7 +93,7 @@ class LiteralTestUtil {
   static ::testing::AssertionResult Near(
       const LiteralSlice& expected, const LiteralSlice& actual,
       const ErrorSpec& error_spec,
-      bool detailed_message = false) TF_MUST_USE_RESULT;
+      absl::optional<bool> detailed_message = absl::nullopt) TF_MUST_USE_RESULT;
 
   // Asserts the given literal are within the given error bound of the given
   // expected values. Only supported for floating point values.
diff --git a/tensorflow/compiler/xla/tests/literal_test_util_test.cc b/tensorflow/compiler/xla/tests/literal_test_util_test.cc
index b6f9b8156b51144e4f74d285b1e4111d098f13c2..ea9b3037cf482e41238413179888f125822d161c 100644
--- a/tensorflow/compiler/xla/tests/literal_test_util_test.cc
+++ b/tensorflow/compiler/xla/tests/literal_test_util_test.cc
@@ -89,11 +89,11 @@ TEST(LiteralTestUtilTest, ExpectNearFailurePlacesResultsInTemporaryDirectory) {
     Literal literal =
         Literal::CreateFromProto(literal_proto).ConsumeValueOrDie();
     if (result.find("expected") != string::npos) {
-      EXPECT_EQ("2", literal.ToString());
+      EXPECT_EQ("f32[] 2", literal.ToString());
     } else if (result.find("actual") != string::npos) {
-      EXPECT_EQ("4", literal.ToString());
+      EXPECT_EQ("f32[] 4", literal.ToString());
     } else if (result.find("mismatches") != string::npos) {
-      EXPECT_EQ("true", literal.ToString());
+      EXPECT_EQ("pred[] true", literal.ToString());
     } else {
       FAIL() << "unknown file in temporary directory: " << result;
     }
@@ -105,9 +105,9 @@ TEST(LiteralTestUtilTest, NotEqualHasValuesInMessage) {
   auto actual = LiteralUtil::CreateR1<int32>({4, 5, 6});
   ::testing::AssertionResult result = LiteralTestUtil::Equal(expected, actual);
   EXPECT_THAT(result.message(),
-              ::testing::HasSubstr("Expected literal:\n{1, 2, 3}"));
+              ::testing::HasSubstr("Expected literal:\ns32[3] {1, 2, 3}"));
   EXPECT_THAT(result.message(),
-              ::testing::HasSubstr("Actual literal:\n{4, 5, 6}"));
+              ::testing::HasSubstr("Actual literal:\ns32[3] {4, 5, 6}"));
 }
 
 TEST(LiteralTestUtilTest, NearComparatorR1) {
diff --git a/tensorflow/compiler/xla/tests/local_client_execute_test.cc b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
index a99b43f4690b3063f76e2cda1e58c9b4ba9a1df4..96527886b718bc1ea4ce8cc2d7dbeb2e3ef1d1eb 100644
--- a/tensorflow/compiler/xla/tests/local_client_execute_test.cc
+++ b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
@@ -205,7 +205,7 @@ XLA_TEST_F(LocalClientExecuteTest, TupleResult) {
   ScopedShapedBuffer result =
       ExecuteLocallyOrDie(computation, {&x_array, &y_array});
 
-  EXPECT_TRUE(ShapeUtil::IsTuple(result.on_host_shape()));
+  EXPECT_TRUE(result.on_host_shape().IsTuple());
   EXPECT_EQ(3, ShapeUtil::TupleElementCount(result.on_host_shape()));
 
   Literal result_literal = ShapedBufferToLiteral(result);
@@ -233,7 +233,7 @@ XLA_TEST_F(LocalClientExecuteTest, NestedTupleResult) {
   ScopedShapedBuffer result =
       ExecuteLocallyOrDie(computation, {&x_array, &y_array});
 
-  EXPECT_TRUE(ShapeUtil::IsTuple(result.on_host_shape()));
+  EXPECT_TRUE(result.on_host_shape().IsTuple());
   EXPECT_EQ(2, ShapeUtil::TupleElementCount(result.on_host_shape()));
 
   Literal result_literal = ShapedBufferToLiteral(result);
@@ -311,7 +311,7 @@ XLA_TEST_F(LocalClientExecuteTest, TupleArguments) {
   ScopedShapedBuffer result =
       ExecuteLocallyOrDie(computation, {&x_buffer, &y_buffer});
 
-  EXPECT_TRUE(ShapeUtil::IsTuple(result.on_host_shape()));
+  EXPECT_TRUE(result.on_host_shape().IsTuple());
   EXPECT_EQ(2, ShapeUtil::TupleElementCount(result.on_host_shape()));
 
   Literal result_literal = ShapedBufferToLiteral(result);
@@ -842,7 +842,8 @@ XLA_TEST_F(LocalClientExecuteTest, ShapeBufferToLiteralConversion64bit) {
        LiteralUtil::CreateR0<int64>(123456789000LL)}));
 }
 
-XLA_TEST_F(LocalClientExecuteTest, InfeedTest) {
+// Disabled on interpreter backend since infeed HLO is unsupported.
+XLA_TEST_F(LocalClientExecuteTest, DISABLED_ON_INTERPRETER(InfeedTest)) {
   XlaBuilder builder(TestName());
   const Shape shape = ShapeUtil::MakeShape(F32, {3});
   auto in = Infeed(&builder, shape);
@@ -867,7 +868,8 @@ XLA_TEST_F(LocalClientExecuteTest, InfeedTest) {
   LiteralTestUtil::ExpectR1Equal<float>({-4.0, 125.0, 45.0}, result);
 }
 
-XLA_TEST_F(LocalClientExecuteTest, InfeedOutfeedTest) {
+// Disabled on interpreter backend since infeed/outfeed HLOs are unsupported.
+XLA_TEST_F(LocalClientExecuteTest, DISABLED_ON_INTERPRETER(InfeedOutfeedTest)) {
   XlaBuilder builder(TestName());
   const Shape shape = ShapeUtil::MakeShape(F32, {3});
   auto in = Infeed(&builder, shape);
diff --git a/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc b/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc
index 3f5135438fc59bea98527b1be30ee49339edd455..1fd9cb055c0bebc0f31496eb82f53a7b7a6cbfba 100644
--- a/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc
+++ b/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc
@@ -208,9 +208,7 @@ XLA_TEST_F(MultiOutputFusionTest, FusionNodeIsRoot) {
       ROOT fusion = (s32[]) fusion(x), kind=kLoop, calls=fused_computation
     }
   )";
-  auto module =
-      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
-          .ValueOrDie();
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
   auto param = LiteralUtil::MakeTupleOwned(
       LiteralUtil::MakeTupleOwned(
           LiteralUtil::MakeTupleOwned(LiteralUtil::CreateR0<int32>(42)),
@@ -241,9 +239,7 @@ XLA_TEST_F(MultiOutputFusionTest, MultiOutputLoopFusion) {
       const = f32[4] constant({0, 0, 0, 0})
       ROOT select = f32[4] select(gte0, gte1, const)
     })";
-  auto module =
-      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
-          .ValueOrDie();
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
   auto param = LiteralUtil::CreateR1<float>({1.0, 2.0, 3.0, -1.0});
   Literal result = ExecuteNoHloPasses(std::move(module), {&param});
   LiteralTestUtil::ExpectR1Equal<float>({0.0, 4.0, 9.0, 1.0}, result);
@@ -273,9 +269,7 @@ XLA_TEST_F(MultiOutputFusionTest, MultiOutputLoopFeedingMap) {
       p1 = f32[3] parameter(0)
       ROOT map = f32[3] map(p1), to_apply=map_computation
     })";
-  auto module =
-      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
-          .ValueOrDie();
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
   auto param = LiteralUtil::CreateR1<float>({1.0, 2.0, 3.0});
   Literal result = ExecuteNoHloPasses(std::move(module), {&param});
   LiteralTestUtil::ExpectR1Equal<float>({0.0, 4.0, 9.0}, result);
@@ -315,9 +309,7 @@ XLA_TEST_F(MultiOutputFusionTest,
       ROOT fusion = (f32[2,2]{1,0}, f32[2,2]{1,0}) fusion(p), kind=kInput,
                                                         calls=fused_reduce
     })");
-  auto module =
-      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
-          .ValueOrDie();
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
   auto param =
       LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}});
   Literal result = ExecuteNoHloPasses(std::move(module), {&param});
@@ -346,9 +338,7 @@ XLA_TEST_F(MultiOutputFusionTest,
       ROOT fusion = (f32[2,2]{1,0}, f32[2,2]{1,0}) fusion(p), kind=kInput,
                                                         calls=fused_reduce
     })");
-  auto module =
-      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
-          .ValueOrDie();
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
   auto param =
       LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}});
   Literal result = ExecuteNoHloPasses(std::move(module), {&param});
@@ -378,9 +368,7 @@ XLA_TEST_F(MultiOutputFusionTest,
       ROOT fusion = (f32[2]{0}, f32[2]{0}, f32[2]{0}) fusion(p), kind=kInput,
                                                         calls=fused_reduce
     })");
-  auto module =
-      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
-          .ValueOrDie();
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
   auto param =
       LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}});
   Literal result = ExecuteNoHloPasses(std::move(module), {&param});
@@ -410,9 +398,7 @@ XLA_TEST_F(MultiOutputFusionTest,
       ROOT fusion = (f32[2,2,2]{2,1,0}, f32[2,2]{1,0}, f32[2,2]{1,0}) fusion(p),
                                                  kind=kInput, calls=fused_reduce
     })");
-  auto module =
-      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
-          .ValueOrDie();
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
   auto param =
       LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}});
   Literal result = ExecuteNoHloPasses(std::move(module), {&param});
@@ -443,9 +429,7 @@ XLA_TEST_F(MultiOutputFusionTest,
       ROOT fusion = (f32[2,2]{1,0}, f32[2,2,2]{2,1,0}, f32[2,2]{1,0}) fusion(p),
                                                  kind=kInput, calls=fused_reduce
     })");
-  auto module =
-      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
-          .ValueOrDie();
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
   auto param =
       LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}});
   Literal result = ExecuteNoHloPasses(std::move(module), {&param});
@@ -478,9 +462,7 @@ XLA_TEST_F(MultiOutputFusionTest,
       ROOT fusion = (f32[2]{0}, f32[2,2,2]{2,1,0}, f32[2,2,2]{2,1,0}) fusion(p),
                                                  kind=kInput, calls=fused_reduce
     })");
-  auto module =
-      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
-          .ValueOrDie();
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
   auto param =
       LiteralUtil::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}});
   Literal result = ExecuteNoHloPasses(std::move(module), {&param});
@@ -513,9 +495,7 @@ XLA_TEST_F(MultiOutputFusionTest,
       ROOT fusion = (f32[2,2]{1,0}, f32[2,2]{1,0}) fusion(p, i, j), kind=kInput,
                                                               calls=fused_reduce
     })");
-  auto module =
-      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
-          .ValueOrDie();
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
   auto param =
       LiteralUtil::CreateR3<float>({{{0, 2}, {3, 4}}, {{5, 6}, {7, 8}}});
   auto init1 = LiteralUtil::CreateR0<float>(5);
@@ -549,9 +529,7 @@ XLA_TEST_F(MultiOutputFusionTest,
       ROOT fusion = (f32[2,2]{1,0}, f32[2,2]{1,0}, f16[2,2,2]{2,1,0}) fusion(p),
                     kind=kInput, calls=fused_reduce
     })");
-  auto module =
-      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
-          .ValueOrDie();
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
   auto param = LiteralUtil::CreateR3<Eigen::half>(
       {{{Eigen::half(1), Eigen::half(2)}, {Eigen::half(3), Eigen::half(4)}},
        {{Eigen::half(5), Eigen::half(6)}, {Eigen::half(7), Eigen::half(8)}}});
diff --git a/tensorflow/compiler/xla/tests/plugin.bzl b/tensorflow/compiler/xla/tests/plugin.bzl
index 8a5d91363b619c6b214a96ad96e92742e3052541..107869fe59d43d0a9a3e2b14af2c09e4906d9f15 100644
--- a/tensorflow/compiler/xla/tests/plugin.bzl
+++ b/tensorflow/compiler/xla/tests/plugin.bzl
@@ -33,4 +33,3 @@
 # }
 
 plugins = {}
-
diff --git a/tensorflow/compiler/xla/tests/prng_test.cc b/tensorflow/compiler/xla/tests/prng_test.cc
index 8f2c26f0eea9c7a3b33cd77e5977924c1659535a..e49bcf26bd6e50f8fb36c86f217907b5d4901eae 100644
--- a/tensorflow/compiler/xla/tests/prng_test.cc
+++ b/tensorflow/compiler/xla/tests/prng_test.cc
@@ -80,7 +80,9 @@ XLA_TEST_F(PrngTest, LargeU01) { UniformTest<float>(0, 1, {0x100, 0x100}); }
 XLA_TEST_F(PrngTest, TwelveValuesU524) { UniformTest<int32>(5, 24, {12}); }
 
 // TODO(b/71543667): Fix Rng ops on LLVM backends.
-XLA_TEST_F(PrngTest, DISABLED_ON_GPU(DISABLED_ON_CPU(ScalarBF16Tests))) {
+// TODO(b/122047800): Interpreter does not support BF16 for RNG ops.
+XLA_TEST_F(PrngTest, DISABLED_ON_INTERPRETER(
+                         DISABLED_ON_GPU(DISABLED_ON_CPU(ScalarBF16Tests)))) {
   for (int64 seed = 0; seed < 100; ++seed) {
     // The largest negative number smaller than zero in bf16 that's not
     // denormalized.
@@ -103,7 +105,9 @@ XLA_TEST_F(PrngTest, DISABLED_ON_GPU(DISABLED_ON_CPU(ScalarBF16Tests))) {
 }
 
 // TODO(b/71543667): Fix Rng ops on LLVM backends.
-XLA_TEST_F(PrngTest, DISABLED_ON_GPU(DISABLED_ON_CPU(ScalarBF16CountTests))) {
+// TODO(b/122047800): Interpreter does not support BF16 for RNG ops.
+XLA_TEST_F(PrngTest, DISABLED_ON_INTERPRETER(DISABLED_ON_GPU(
+                         DISABLED_ON_CPU(ScalarBF16CountTests)))) {
   // There are 3 BF16 values in the range of [32.25, 33): 32.25, 32.5, 32.75,
   // they should get similar counts.
   bfloat16 low = static_cast<bfloat16>(32.25);
@@ -276,6 +280,39 @@ XLA_TEST_F(PrngTest, PassInGlobalRngSeed) {
   EXPECT_FALSE(LiteralTestUtil::Equal(result5, result6));
 }
 
+// This test verifies that the two RNG instructions with the same parameters in
+// the same HloComputation produces different values.
+XLA_TEST_F(PrngTest, DifferentValuesForIdenticalRngNodesInSameComputation) {
+  // Build a U[0,1) computation.
+  auto build_computation = [this]() {
+    XlaBuilder builder(TestName());
+    auto a = RngUniform(ConstantR0<int32>(&builder, 0),
+                        ConstantR0<int32>(&builder, 100),
+                        ShapeUtil::MakeShape(S32, {10}));
+    auto b = RngUniform(ConstantR0<int32>(&builder, 0),
+                        ConstantR0<int32>(&builder, 100),
+                        ShapeUtil::MakeShape(S32, {10}));
+    Tuple(&builder, {a, b});
+    return builder.Build();
+  };
+
+  ExecutionOptions execution_options = execution_options_;
+  execution_options.set_seed(42);
+
+  Literal result_tuple;
+  {
+    TF_ASSERT_OK_AND_ASSIGN(auto computation, build_computation());
+    TF_ASSERT_OK_AND_ASSIGN(
+        result_tuple, client_->ExecuteAndTransfer(computation, /*arguments=*/{},
+                                                  &execution_options));
+  }
+
+  auto results = result_tuple.DecomposeTuple();
+  ASSERT_EQ(results.size(), 2);
+
+  EXPECT_FALSE(LiteralTestUtil::Equal(results[0], results[1]));
+}
+
 XLA_TEST_F(PrngTest, TenValuesN01) {
   XlaBuilder builder(TestName());
   RngNormal(ConstantR0<float>(&builder, 0), ConstantR0<float>(&builder, 1),
diff --git a/tensorflow/compiler/xla/tests/ptxas_bug_120501638.cc b/tensorflow/compiler/xla/tests/ptxas_bug_120501638.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0e5d7db97e88936e7336ed02a5c7a1171254b0cf
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/ptxas_bug_120501638.cc
@@ -0,0 +1,82 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/debug_options_flags.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+
+namespace xla {
+namespace {
+
+class PtxasBugTest : public HloTestBase {};
+
+// Checks for a bug in ptxas, tracked as Google bug 120501638, and nvidia bug
+// 2459377.  We never received an explanation of what exactly was going wrong
+// here in ptxas.  Known-bad in ptxas 10.0.145, known-good in ptxas 10.0.249.
+TEST_F(PtxasBugTest, DoIt) {
+  const char* const kModuleStr = R"(
+HloModule test
+
+add_F32.14 {
+  lhs.15 = f32[] parameter(0)
+  rhs.16 = f32[] parameter(1)
+  ROOT add.17 = f32[] add(lhs.15, rhs.16)
+}
+
+ENTRY testcase {
+  arg0.1 = f32[2,5,2]{2,1,0} parameter(0)
+  reshape.2 = f32[2,5,2]{2,1,0} reshape(arg0.1)
+  constant.3 = f32[] constant(0)
+  pad.4 = f32[2,6,2]{2,1,0} pad(reshape.2, constant.3), padding=0_0x0_1x0_0
+  reshape.5 = f32[2,3,2,2]{3,2,1,0} reshape(pad.4)
+  transpose.6 = f32[2,2,3,2]{3,0,2,1} transpose(reshape.5), dimensions={2,0,1,3}
+  reshape.7 = f32[4,3,2]{2,1,0} reshape(transpose.6)
+  reshape.8 = f32[4,1,3,2]{3,2,1,0} reshape(reshape.7)
+  transpose.9 = f32[4,2,1,3]{1,3,2,0} transpose(reshape.8), dimensions={0,3,1,2}
+  convert.10 = f32[4,2,1,3]{1,3,2,0} convert(transpose.9)
+  constant.12 = f32[] constant(0)
+  pad.13 = f32[4,2,1,3]{3,2,1,0} pad(convert.10, constant.12), padding=0_0x0_0x0_0x0_0
+  constant.11 = f32[] constant(0)
+  reduce-window.18 = f32[4,2,1,3]{3,2,1,0} reduce-window(pad.13, constant.11),
+    window={size=1x1x1x1}, to_apply=add_F32.14
+  constant.19 = f32[] constant(1)
+  broadcast.20 = f32[4,2,1,3]{3,2,1,0} broadcast(constant.19), dimensions={}
+  divide.21 = f32[4,2,1,3]{3,2,1,0} divide(reduce-window.18, broadcast.20)
+  convert.22 = f32[4,2,1,3]{3,2,1,0} convert(divide.21)
+  transpose.23 = f32[4,1,3,2]{2,1,3,0} transpose(convert.22), dimensions={0,2,3,1}
+  reshape.24 = f32[4,3,2]{2,1,0} reshape(transpose.23)
+  reshape.25 = f32[2,2,3,2]{3,2,1,0} reshape(reshape.24)
+  transpose.26 = f32[2,3,2,2]{3,1,0,2} transpose(reshape.25), dimensions={1,2,0,3}
+  reshape.27 = f32[2,6,2]{2,1,0} reshape(transpose.26)
+  slice.28 = f32[2,5,2]{2,1,0} slice(reshape.27), slice={[0:2], [0:5], [0:2]}
+  reshape.29 = f32[2,5,2]{2,1,0} reshape(slice.28)
+  tuple.30 = (f32[2,5,2]{2,1,0}) tuple(reshape.29)
+  ROOT get-tuple-element.31 = f32[2,5,2]{2,1,0} get-tuple-element(tuple.30), index=0
+})";
+
+  // Create a module with the true-default flags, not the default-for-testing
+  // flags.  In particular, true-default flags enable unrolling, whereas for
+  // testing we disable unrolling, and this bug doesn't trigger without
+  // unrolling.
+  HloModuleConfig config;
+  config.set_debug_options(DefaultDebugOptionsIgnoringFlags());
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(kModuleStr, config));
+  EXPECT_TRUE(RunAndCompare(std::move(module), ErrorSpec{0.01, 0.01}));
+}
+
+}  // anonymous namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/reduce_precision_test.cc b/tensorflow/compiler/xla/tests/reduce_precision_test.cc
index f80d29b9de440b11c36e8c9bc65d4a93353a6267..e2cf4c0be289b52d5cc581ea07752ed6e98da76f 100644
--- a/tensorflow/compiler/xla/tests/reduce_precision_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_precision_test.cc
@@ -34,7 +34,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc
index 22fe4a2670e2e0e1fedc45036a1ceec19f44e42e..30e2d24184a5d399e5e058a9c4a382f57e82866f 100644
--- a/tensorflow/compiler/xla/tests/reduce_window_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc
@@ -607,7 +607,10 @@ class R4ReduceWindowTest : public ReduceWindowTestBase,
 
     Array4D<float> input(param.base_bounds[0], param.base_bounds[1],
                          param.base_bounds[2], param.base_bounds[3]);
-    input.FillRandom(0.1f, 0.1f);
+    // Choose a prime iota length so that each window sees a unique set of
+    // values. (Technically, the requirement is that the iota length is
+    // relatively prime to all of the dimensions involved in the reduce-window.)
+    input.FillRepeatedIota(0, 137);
     Literal input_literal = LiteralUtil::CreateR4FromArray4DWithLayout(
         input, LayoutUtil::MakeLayout(param.layout));
     XlaOp parameter;
@@ -623,9 +626,9 @@ class R4ReduceWindowTest : public ReduceWindowTestBase,
         CreateConstantFromLiteral(LiteralUtil::CreateR0(kInitValue), &b);
     CHECK(param.reducer == kAdd || param.reducer == kMax);
     auto reducer = param.reducer;
-    if (use_bfloat16() && Product(param.window_bounds) > 128) {
-      // To avoid numerical issues, force the reducer to be kMax for large bf16
-      // windows.
+    if (use_bfloat16()) {
+      // To avoid numerical issues, force the reducer to be kMax for bf16
+      // inputs.
       reducer = kMax;
     }
 
@@ -949,16 +952,16 @@ struct R3ReduceWindowTestData {
      /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
     {/*base_bounds=*/{95, 202, 251}, /*window_bounds=*/{95, 202, 251},
      /*strides=*/{1, 1, 1}, /*layout=*/{2, 1, 0},
-     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
+     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kMax},
     {/*base_bounds=*/{999, 57, 3}, /*window_bounds=*/{999, 57, 3},
      /*strides=*/{1, 1, 1}, /*layout=*/{2, 1, 0},
      /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
     {/*base_bounds=*/{178, 302, 64}, /*window_bounds=*/{178, 302, 64},
      /*strides=*/{1, 1, 1}, /*layout=*/{2, 1, 0},
-     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
+     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kMax},
     {/*base_bounds=*/{63, 261, 257}, /*window_bounds=*/{63, 261, 257},
      /*strides=*/{1, 1, 1}, /*layout=*/{2, 1, 0},
-     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
+     /*padding=*/Padding::kValid, /*reducer=*/Reducer::kMax},
     {/*base_bounds=*/{10003, 10, 5}, /*window_bounds=*/{9999, 7, 3},
      /*strides=*/{1, 1, 1}, /*layout=*/{2, 1, 0},
      /*padding=*/Padding::kValid, /*reducer=*/Reducer::kAdd},
@@ -1001,17 +1004,19 @@ TEST_P(R3ReduceWindowTest, DoIt) {
   const float kInitValue = 0.0f;
   Array3D<float> input(param.base_bounds[0], param.base_bounds[1],
                        param.base_bounds[2]);
-  input.FillRandom(0.1f, 0.1f);
+  // Choose a prime iota length so that each window sees a unique set of values.
+  // (Technically, the requirement is that the iota length is relatively prime
+  // to all of the dimensions involved in the reduce-window.)
+  input.FillRepeatedIota(0, 137);
   Literal input_literal = LiteralUtil::CreateR3FromArray3DWithLayout(
       input, LayoutUtil::MakeLayout(param.layout));
   auto reducer = param.reducer;
   if (use_bfloat16()) {
     input_literal = LiteralUtil::ConvertF32ToBF16(input_literal);
-    if (Product(param.window_bounds) > 128) {
-      // To avoid numerical issues, force the reducer to be kMax for large bf16
-      // windows.
-      reducer = kMax;
-    }
+
+    // To avoid numerical issues, force the reducer to be kMax for bf16
+    // inputs.
+    reducer = kMax;
   }
 
   XlaOp parameter = Parameter(&b, 0, input_literal.shape(), "input");
@@ -1527,6 +1532,25 @@ ENTRY %reduce-window (parameter.0: s32[81,8], parameter.1: s32[]) -> s32[82,8] {
   EXPECT_TRUE(RunAndCompare(hlo_string, absl::nullopt));
 }
 
+XLA_TEST_F(HloTestBase, ReduceWindowS64) {
+  const string hlo_string = R"(
+HloModule reduce-window
+
+%identity.pad_to_reduce_window (param0: s64[], param1: s64[]) -> s64[] {
+  %param0 = s64[] parameter(0)
+  ROOT %param1 = s64[] parameter(1)
+}
+
+ENTRY %reduce-window (parameter.0: s64[81,8], parameter.1: s64[]) -> s64[82,8] {
+  %parameter.0 = s64[81,8]{1,0} parameter(0)
+  %parameter.1 = s64[] parameter(1)
+  ROOT %reduce-window = s64[82,8]{1,0} reduce-window(s64[81,8]{1,0} %parameter.0, s64[] %parameter.1), window={size=1x1 pad=0_1x0_0}, to_apply=%identity.pad_to_reduce_window
+}
+
+)";
+  EXPECT_TRUE(RunAndCompare(hlo_string, absl::nullopt));
+}
+
 XLA_TEST_F(HloTestBase, ReduceWindowF16) {
   const string hlo_string = R"(
 HloModule reduce-window
diff --git a/tensorflow/compiler/xla/tests/test_macros.h b/tensorflow/compiler/xla/tests/test_macros.h
index 7ca99a91635e85cd0888e59ecde31e47fec21844..80a6868485c9162d1cb0de24f0adf3f1c1d2503a 100644
--- a/tensorflow/compiler/xla/tests/test_macros.h
+++ b/tensorflow/compiler/xla/tests/test_macros.h
@@ -79,30 +79,28 @@ string PrependDisabledIfIndicated(const string& test_case_name,
 // heuristic to decide whether the test case should be disabled, and we
 // determine whether the test case should be disabled by resolving the (test
 // case name, test name) in a manifest file.
-#define XLA_GTEST_TEST_(test_case_name, test_name, parent_class, parent_id)   \
-  class GTEST_TEST_CLASS_NAME_(test_case_name, test_name)                     \
-      : public parent_class {                                                 \
-   public:                                                                    \
-    GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {}                    \
-                                                                              \
-   private:                                                                   \
-    virtual void TestBody();                                                  \
-    static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_;     \
-    GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_case_name,    \
-                                                           test_name));       \
-  };                                                                          \
-                                                                              \
-  ::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_case_name,           \
-                                                    test_name)::test_info_ =  \
-      ::testing::internal::MakeAndRegisterTestInfo(                           \
-          #test_case_name,                                                    \
-          ::xla::PrependDisabledIfIndicated(#test_case_name, #test_name)      \
-              .c_str(),                                                       \
-          nullptr, nullptr,                                                   \
-          ::testing::internal::CodeLocation(__FILE__, __LINE__), (parent_id), \
-          parent_class::SetUpTestCase, parent_class::TearDownTestCase,        \
-          new ::testing::internal::TestFactoryImpl<GTEST_TEST_CLASS_NAME_(    \
-              test_case_name, test_name)>);                                   \
+#define XLA_GTEST_TEST_(test_case_name, test_name, parent_class)             \
+  class GTEST_TEST_CLASS_NAME_(test_case_name, test_name)                    \
+      : public parent_class {                                                \
+   public:                                                                   \
+    GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {}                   \
+                                                                             \
+   private:                                                                  \
+    virtual void TestBody();                                                 \
+    static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_;    \
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_case_name,   \
+                                                           test_name));      \
+  };                                                                         \
+                                                                             \
+  ::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_case_name,          \
+                                                    test_name)::test_info_ = \
+      ::testing::RegisterTest(                                               \
+          #test_case_name,                                                   \
+          ::xla::PrependDisabledIfIndicated(#test_case_name, #test_name)     \
+              .c_str(),                                                      \
+          nullptr, nullptr, __FILE__, __LINE__, []() -> parent_class* {      \
+            return new GTEST_TEST_CLASS_NAME_(test_case_name, test_name)();  \
+          });                                                                \
   void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody()
 
 // This is identical to the TEST_F macro from "gtest", but it potentially
@@ -111,9 +109,8 @@ string PrependDisabledIfIndicated(const string& test_case_name,
 // Per usual, you can see what tests are available via --gunit_list_tests and
 // choose to run tests that have been disabled via the manifest via
 // --gunit_also_run_disabled_tests.
-#define XLA_TEST_F(test_fixture, test_name)              \
-  XLA_GTEST_TEST_(test_fixture, test_name, test_fixture, \
-                  ::testing::internal::GetTypeId<test_fixture>())
+#define XLA_TEST_F(test_fixture, test_name) \
+  XLA_GTEST_TEST_(test_fixture, test_name, test_fixture)
 
 // Likewise, this is identical to the TEST_P macro from "gtest", but
 // potentially disables the test based on the DISABLED_MANIFEST file.
diff --git a/tensorflow/compiler/xla/tests/test_utils.cc b/tensorflow/compiler/xla/tests/test_utils.cc
index eafa48ed7b8cf2bd67fe767ad36082661dbbd66e..67d2258928f75c078588c9425359f9468f4463ed 100644
--- a/tensorflow/compiler/xla/tests/test_utils.cc
+++ b/tensorflow/compiler/xla/tests/test_utils.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
+#include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h"
 #include "tensorflow/compiler/xla/service/hlo_verifier.h"
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
@@ -168,7 +169,7 @@ void PopulateWithRandomIntegralData(Literal* literal, std::minstd_rand0* engine,
 StatusOr<Literal> MakeFakeLiteralInternal(const Shape& shape,
                                           std::minstd_rand0* engine,
                                           bool no_duplicates) {
-  if (ShapeUtil::IsTuple(shape)) {
+  if (shape.IsTuple()) {
     std::vector<Literal> elements;
     for (const Shape& element_shape : shape.tuple_shapes()) {
       TF_ASSIGN_OR_RETURN(
@@ -237,6 +238,79 @@ StatusOr<Literal> MakeFakeLiteralInternal(const Shape& shape,
   return std::move(literal);
 }
 
+template <typename IntT>
+void PopulateWithRandomIntegralDataWithBounds(Literal* literal,
+                                              std::minstd_rand0* engine,
+                                              IntT min, IntT max) {
+  CHECK(engine != nullptr);
+  CHECK_EQ(literal->shape().element_type(),
+           primitive_util::NativeToPrimitiveType<IntT>());
+  std::uniform_int_distribution<IntT> generator(min, max);
+  for (IntT& value : literal->data<IntT>()) {
+    value = generator(*engine);
+  }
+}
+
+// Same as MakeFakeLiteralInternal but generates random numbers in the given
+// range [min, max]. Currently this works only for INT types.
+StatusOr<Literal> MakeFakeLiteralInternalWithBounds(const Shape& shape,
+                                                    std::minstd_rand0* engine,
+                                                    int64 min, int64 max) {
+  if (shape.IsTuple()) {
+    std::vector<Literal> elements;
+    for (const Shape& element_shape : shape.tuple_shapes()) {
+      TF_ASSIGN_OR_RETURN(
+          Literal element,
+          MakeFakeLiteralInternalWithBounds(element_shape, engine, min, max));
+      elements.push_back(std::move(element));
+    }
+    return LiteralUtil::MakeTupleOwned(std::move(elements));
+  }
+  if (engine == nullptr) {
+    return Literal::CreateFromShape(shape);
+  }
+  Literal literal(shape);
+  switch (shape.element_type()) {
+    case S8:
+      PopulateWithRandomIntegralDataWithBounds<int8>(
+          &literal, engine, static_cast<int8>(min), static_cast<int8>(max));
+      break;
+    case U8:
+      PopulateWithRandomIntegralDataWithBounds<uint8>(
+          &literal, engine, static_cast<uint8>(min), static_cast<uint8>(max));
+      break;
+    case S16:
+      PopulateWithRandomIntegralDataWithBounds<int16>(
+          &literal, engine, static_cast<int16>(min), static_cast<int16>(max));
+      break;
+    case U16:
+      PopulateWithRandomIntegralDataWithBounds<uint16>(
+          &literal, engine, static_cast<uint16>(min), static_cast<uint16>(max));
+      break;
+    case S32:
+      PopulateWithRandomIntegralDataWithBounds<int32>(
+          &literal, engine, static_cast<int32>(min), static_cast<int32>(max));
+      break;
+    case U32:
+      PopulateWithRandomIntegralDataWithBounds<uint32>(
+          &literal, engine, static_cast<uint32>(min), static_cast<uint32>(max));
+      break;
+    case S64:
+      PopulateWithRandomIntegralDataWithBounds<int64>(
+          &literal, engine, static_cast<int64>(min), static_cast<int64>(max));
+      break;
+    case U64:
+      PopulateWithRandomIntegralDataWithBounds<uint64>(
+          &literal, engine, static_cast<uint64>(min), static_cast<uint64>(max));
+      break;
+    default:
+      return Unimplemented(
+          "Unsupported type for fake random literal generation with bounds: %s",
+          ShapeUtil::HumanString(shape));
+  }
+  return std::move(literal);
+}
+
 enum class ConstantType { kUnknown, kZero, kOne };
 
 // Return the constant type required by this computation, if known.
@@ -274,16 +348,9 @@ bool NeedsInitValue(const HloUse& use) {
 
 // Generate random values that are constrained to the input_shape minus the
 // output_shape so as not to produce wrapping slices, for instance.
-Literal MakeRandomIndex(absl::Span<const int64> index_space,
-                        std::minstd_rand0* engine) {
-  std::vector<int32> start_indices(index_space.size());
-  if (engine != nullptr) {
-    for (int i = 0; i < index_space.size(); ++i) {
-      std::uniform_int_distribution<int32> generator(0, index_space[i]);
-      start_indices[i] = generator(*engine);
-    }
-  }
-  return LiteralUtil::CreateR1<int32>(start_indices);
+Literal MakeRandomIndex(int64 index_bound, std::minstd_rand0* engine) {
+  std::uniform_int_distribution<int32> generator(0, index_bound);
+  return LiteralUtil::CreateR0<int32>(generator(*engine));
 }
 
 // Use dataflow analysis on each parameter to see if there are uses that would
@@ -300,8 +367,12 @@ std::vector<HloInstruction*> FindConstrainedUses(
       HloInstruction* instruction = use.instruction;
       const HloOpcode opcode = instruction->opcode();
       const int64 op_num = use.operand_number;
-      if ((opcode == HloOpcode::kDynamicSlice && op_num == 1) ||
-          (opcode == HloOpcode::kDynamicUpdateSlice && op_num == 2)) {
+      if ((opcode == HloOpcode::kDynamicSlice && op_num >= 1) ||
+          (opcode == HloOpcode::kDynamicUpdateSlice && op_num >= 2)) {
+        constrained_uses.push_back(instruction);
+      } else if ((opcode == HloOpcode::kGather ||
+                  opcode == HloOpcode::kScatter) &&
+                 op_num == 1) {
         constrained_uses.push_back(instruction);
       } else if (opcode == HloOpcode::kFusion) {
         const HloInstruction* const to_analyze =
@@ -336,7 +407,7 @@ std::vector<HloInstruction*> FindConstrainedUses(
 StatusOr<Literal> CreateLiteralForConstrainedUses(
     const absl::Span<HloInstruction* const> constrained_uses,
     const HloInstruction& param, std::minstd_rand0* engine) {
-  std::vector<int64> index_space;
+  int64 index_bound = INT64_MAX;
   bool no_duplicates = false;
   bool needs_constant = false;
   ConstantType constant_type = ConstantType::kUnknown;
@@ -348,19 +419,32 @@ StatusOr<Literal> CreateLiteralForConstrainedUses(
         const Shape& slice_shape = use->opcode() == HloOpcode::kDynamicSlice
                                        ? use->shape()
                                        : use->operand(1)->shape();
-        const int64 rank = ShapeUtil::Rank(indexed_shape);
-        if (!index_space.empty()) {
-          TF_RET_CHECK(rank == index_space.size());
-          for (int64 i = 0; i < rank; ++i) {
-            index_space[i] = std::min(
-                index_space[i], ShapeUtil::GetDimension(indexed_shape, i) -
-                                    ShapeUtil::GetDimension(slice_shape, i));
+        const int64 first_index =
+            Cast<HloDynamicIndexInstruction>(use)->first_index_operand_number();
+        for (int64 operand = first_index; operand < use->operand_count();
+             ++operand) {
+          if (use->operand(operand) == &param) {
+            index_bound = std::min(
+                index_bound,
+                ShapeUtil::GetDimension(indexed_shape, operand - first_index) -
+                    ShapeUtil::GetDimension(slice_shape,
+                                            operand - first_index));
           }
-        } else {
-          index_space.resize(rank);
-          for (int64 i = 0; i < rank; ++i) {
-            index_space[i] = ShapeUtil::GetDimension(indexed_shape, i) -
-                             ShapeUtil::GetDimension(slice_shape, i);
+        }
+        break;
+      }
+      case HloOpcode::kGather:
+      case HloOpcode::kScatter: {
+        const Shape& operand_shape = use->operand(0)->shape();
+        if (use->operand(1) == &param) {
+          auto index_map =
+              use->opcode() == HloOpcode::kGather
+                  ? use->gather_dimension_numbers().start_index_map()
+                  : use->scatter_dimension_numbers()
+                        .scatter_dims_to_operand_dims();
+          for (const auto dim_in_operand : index_map) {
+            index_bound =
+                std::min(index_bound, operand_shape.dimensions(dim_in_operand));
           }
         }
         break;
@@ -388,13 +472,14 @@ StatusOr<Literal> CreateLiteralForConstrainedUses(
   }
   int constraint_count = 0;
   constraint_count += no_duplicates ? 1 : 0;
-  constraint_count += !index_space.empty() ? 1 : 0;
+  constraint_count += (index_bound != INT64_MAX) ? 1 : 0;
   constraint_count += needs_constant ? 1 : 0;
   if (constraint_count > 1) {
     return Unimplemented("Conflicting operand generation constraints.");
   }
-  if (!index_space.empty()) {
-    return MakeRandomIndex(index_space, engine);
+  if (index_bound != INT64_MAX) {
+    return MakeFakeLiteralInternalWithBounds(param.shape(), engine, -1,
+                                             index_bound);
   } else if (needs_constant) {
     switch (constant_type) {
       case ConstantType::kZero:
@@ -459,8 +544,8 @@ Status VerifyHloModule(HloModule* const module, bool layout_sensitive,
 std::unique_ptr<HloDotInstruction> CreateCanonicalDot(const Shape& shape,
                                                       HloInstruction* lhs,
                                                       HloInstruction* rhs) {
-  CHECK_EQ(ShapeUtil::Rank(lhs->shape()), 2);
-  CHECK_EQ(ShapeUtil::Rank(rhs->shape()), 2);
+  CHECK_EQ(lhs->shape().rank(), 2);
+  CHECK_EQ(rhs->shape().rank(), 2);
   PrecisionConfig precision_config;
   precision_config.mutable_operand_precision()->Resize(
       2, PrecisionConfig::DEFAULT);
diff --git a/tensorflow/compiler/xla/tests/test_utils_test.cc b/tensorflow/compiler/xla/tests/test_utils_test.cc
index e8f5d7a9a79ebddea3cb989dbe8eab90b630d5e7..f68ee04565f3898bd3db455e3e102bc2edb6255a 100644
--- a/tensorflow/compiler/xla/tests/test_utils_test.cc
+++ b/tensorflow/compiler/xla/tests/test_utils_test.cc
@@ -61,11 +61,11 @@ XLA_TEST_F(TestUtilsTest, Token) {
                     R"(HloModule outfeed_module
 
     ENTRY InfeedToOutfeed {
-      token = token[] parameter(0)
-      infeed = ((u32[3]{0}, pred[]), token[]) infeed(token)
+      token0 = token[] parameter(0)
+      infeed = ((u32[3]{0}, pred[]), token[]) infeed(token0)
       infeed.data = (u32[3]{0}, pred[]) get-tuple-element(infeed), index=0
-      outfeed = token[] outfeed(infeed.data, token)
-      ROOT infeed.1 = ((u32[3]{0}, pred[]), token[]) infeed(token)
+      outfeed = token[] outfeed(infeed.data, token0)
+      ROOT infeed.1 = ((u32[3]{0}, pred[]), token[]) infeed(token0)
       infeed.1.data = (u32[3]{0}, pred[]) get-tuple-element(infeed.1), index=0
       infeed.1.token = token[] get-tuple-element(infeed.1), index=1
       outfeed.1 = token[] outfeed(infeed.1.data, infeed.1.token)
@@ -79,25 +79,27 @@ XLA_TEST_F(TestUtilsTest, MultipleIndexSpacesForDynamicSlices) {
                     R"(HloModule index_space_module
 
     ENTRY IndexSpace {
-      index_param = s32[3]{0} parameter(0)
-      array_param.1 = f32[123,4,789]{0,1,2} parameter(1)
-      array_param.2 = f32[3,3000,5]{0,1,2} parameter(2)
-      dynamic-slice.1 = f32[1,2,3] dynamic-slice(array_param.1, index_param), dynamic_slice_sizes={1,2,3}
-      ROOT dynamic-slice.2 = f32[3,2,2] dynamic-slice(array_param.2, index_param), dynamic_slice_sizes={3,2,2}
+      index_param.0 = s32[] parameter(0)
+      index_param.1 = s32[] parameter(1)
+      index_param.2 = s32[] parameter(2)
+      array_param.1 = f32[123,4,789]{0,1,2} parameter(3)
+      array_param.2 = f32[3,3000,5]{0,1,2} parameter(4)
+      dynamic-slice.1 = f32[1,2,3] dynamic-slice(array_param.1, index_param.0, index_param.1, index_param.2), dynamic_slice_sizes={1,2,3}
+      ROOT dynamic-slice.2 = f32[3,2,2] dynamic-slice(array_param.2, index_param.0, index_param.1, index_param.2), dynamic_slice_sizes={3,2,2}
     })")
                     .ValueOrDie();
   TF_ASSERT_OK_AND_ASSIGN(std::vector<Literal> args,
                           MakeFakeArguments(module.get()));
-  ASSERT_EQ(args.size(), 3);
-  const Literal& index_arg = args[0];
+  ASSERT_EQ(args.size(), 5);
 
-  EXPECT_EQ(index_arg.Get<int32>({0}), 0);
+  EXPECT_GE(args[0].Get<int32>({}), -1);
+  EXPECT_LE(args[0].Get<int32>({}), 1);
 
-  EXPECT_GE(index_arg.Get<int32>({1}), 0);
-  EXPECT_LE(index_arg.Get<int32>({1}), 2);
+  EXPECT_GE(args[1].Get<int32>({}), -1);
+  EXPECT_LE(args[1].Get<int32>({}), 2);
 
-  EXPECT_GE(index_arg.Get<int32>({2}), 0);
-  EXPECT_LE(index_arg.Get<int32>({2}), 3);
+  EXPECT_GE(args[2].Get<int32>({}), -1);
+  EXPECT_LE(args[2].Get<int32>({}), 3);
 }
 
 XLA_TEST_F(TestUtilsTest, MultipleIndexSpacesForDynamicUpdateSlices) {
@@ -105,28 +107,30 @@ XLA_TEST_F(TestUtilsTest, MultipleIndexSpacesForDynamicUpdateSlices) {
                     R"(HloModule index_space_module
 
     ENTRY IndexSpace {
-      index_param = s32[3]{0} parameter(0)
-      array_param.1 = f32[123,4,789]{0,1,2} parameter(1)
-      array_param.2 = f32[3,3000,5]{0,1,2} parameter(2)
-      update_param.1 = f32[1,2,3]{0,1,2} parameter(3)
-      update_param.2 = f32[3,2,2]{0,1,2} parameter(4)
-
-      dynamic-update-slice.1 = f32[123,4,789] dynamic-update-slice(array_param.1, update_param.1, index_param)
-      ROOT dynamic-update-slice.2 = f32[3,3000,5] dynamic-update-slice(array_param.2, update_param.2, index_param)
+      index_param.0 = s32[] parameter(0)
+      index_param.1 = s32[] parameter(1)
+      index_param.2 = s32[] parameter(2)
+      array_param.1 = f32[123,4,789]{0,1,2} parameter(3)
+      array_param.2 = f32[3,3000,5]{0,1,2} parameter(4)
+      update_param.1 = f32[1,2,3]{0,1,2} parameter(5)
+      update_param.2 = f32[3,2,2]{0,1,2} parameter(6)
+
+      dynamic-update-slice.1 = f32[123,4,789] dynamic-update-slice(array_param.1, update_param.1, index_param.0, index_param.1, index_param.2)
+      ROOT dynamic-update-slice.2 = f32[3,3000,5] dynamic-update-slice(array_param.2, update_param.2, index_param.0, index_param.1, index_param.2)
     })")
                     .ValueOrDie();
   TF_ASSERT_OK_AND_ASSIGN(std::vector<Literal> args,
                           MakeFakeArguments(module.get()));
-  ASSERT_EQ(args.size(), 5);
-  const Literal& index_arg = args[0];
+  ASSERT_EQ(args.size(), 7);
 
-  EXPECT_EQ(index_arg.Get<int32>({0}), 0);
+  EXPECT_GE(args[0].Get<int32>({}), -1);
+  EXPECT_LE(args[0].Get<int32>({}), 1);
 
-  EXPECT_GE(index_arg.Get<int32>({1}), 0);
-  EXPECT_LE(index_arg.Get<int32>({1}), 2);
+  EXPECT_GE(args[1].Get<int32>({}), -1);
+  EXPECT_LE(args[1].Get<int32>({}), 2);
 
-  EXPECT_GE(index_arg.Get<int32>({2}), 0);
-  EXPECT_LE(index_arg.Get<int32>({2}), 3);
+  EXPECT_GE(args[2].Get<int32>({}), -1);
+  EXPECT_LE(args[2].Get<int32>({}), 3);
 }
 
 XLA_TEST_F(TestUtilsTest, NoDuplicatesFloats) {
@@ -134,10 +138,18 @@ XLA_TEST_F(TestUtilsTest, NoDuplicatesFloats) {
   auto module = ParseHloString(R"(
 HloModule sort.148.1589
 
+compare {
+  p.0.lhs = f32[] parameter(0)
+  p.0.rhs = f32[] parameter(1)
+  p.1.lhs = s32[] parameter(2)
+  p.1.rhs = s32[] parameter(3)
+  ROOT lt = pred[] less-than(p.0.lhs, p.0.rhs)
+}
+
 ENTRY %sort.148.1589 (parameter.0: f32[1048576], parameter.1: s32[1048576]) -> (f32[1048576], s32[1048576]) {
   %parameter.0 = f32[1048576]{0} parameter(0)
   %parameter.1 = s32[1048576]{0} parameter(1)
-  ROOT %sort.148.1589 = (f32[1048576]{0}, s32[1048576]{0}) sort(f32[1048576]{0} %parameter.0, s32[1048576]{0} %parameter.1), dimensions={0}
+  ROOT %sort.148.1589 = (f32[1048576]{0}, s32[1048576]{0}) sort(f32[1048576]{0} %parameter.0, s32[1048576]{0} %parameter.1), dimensions={0}, to_apply=compare
 }
 )")
                     .ValueOrDie();
@@ -157,10 +169,18 @@ XLA_TEST_F(TestUtilsTest, NoDuplicatesInt32) {
   auto module = ParseHloString(R"(
 HloModule sort.148.1589
 
+compare {
+  p.0.lhs = s32[] parameter(0)
+  p.0.rhs = s32[] parameter(1)
+  p.1.lhs = s32[] parameter(2)
+  p.1.rhs = s32[] parameter(3)
+  ROOT lt = pred[] less-than(p.0.lhs, p.0.rhs)
+}
+
 ENTRY %sort.148.1589 (parameter.0: s32[1048576], parameter.1: s32[1048576]) -> (s32[1048576], s32[1048576]) {
   %parameter.0 = s32[1048576]{0} parameter(0)
   %parameter.1 = s32[1048576]{0} parameter(1)
-  ROOT %sort.148.1589 = (s32[1048576]{0}, s32[1048576]{0}) sort(s32[1048576]{0} %parameter.0, s32[1048576]{0} %parameter.1), dimensions={0}
+  ROOT %sort.148.1589 = (s32[1048576]{0}, s32[1048576]{0}) sort(s32[1048576]{0} %parameter.0, s32[1048576]{0} %parameter.1), dimensions={0}, to_apply=compare
 }
 )")
                     .ValueOrDie();
@@ -180,10 +200,18 @@ XLA_TEST_F(TestUtilsTest, NoDuplicatesBfloat16) {
   auto module = ParseHloString(R"(
 HloModule sort, is_scheduled=true
 
+compare {
+  p.0.lhs = bf16[] parameter(0)
+  p.0.rhs = bf16[] parameter(1)
+  p.1.lhs = s32[] parameter(2)
+  p.1.rhs = s32[] parameter(3)
+  ROOT lt = pred[] less-than(p.0.lhs, p.0.rhs)
+}
+
 ENTRY %sort. (parameter.0: bf16[2,1452], parameter.1: s32[2,1452]) -> (bf16[2,1452], s32[2,1452]) {
   %parameter.0 = bf16[2,1452]{1,0} parameter(0)
   %parameter.1 = s32[2,1452]{1,0} parameter(1)
-  ROOT %sort = (bf16[2,1452]{1,0}, s32[2,1452]{1,0}) sort(bf16[2,1452]{1,0} %parameter.0, s32[2,1452]{1,0} %parameter.1), dimensions={1}
+  ROOT %sort = (bf16[2,1452]{1,0}, s32[2,1452]{1,0}) sort(bf16[2,1452]{1,0} %parameter.0, s32[2,1452]{1,0} %parameter.1), dimensions={1}, to_apply=compare
 }
 )")
                     .ValueOrDie();
@@ -198,5 +226,105 @@ ENTRY %sort. (parameter.0: bf16[2,1452], parameter.1: s32[2,1452]) -> (bf16[2,14
   }
 }
 
+XLA_TEST_F(TestUtilsTest, MakeFakeArgumentsR0InputToDynamicSlice) {
+  auto module = ParseHloString(R"(
+HloModule Test
+
+ENTRY %module (parameter.0: s32[], parameter.1: f32[20,20]) -> f32[] {
+  %parameter.1 = f32[20,20]{1,0} parameter(1)
+  %constant.1 = s32[1]{0} constant({0})
+  %parameter.0 = s32[] parameter(0)
+  %bitcast.3 = s32[1]{0} bitcast(s32[] %parameter.0)
+  %concatenate.1 = s32[2]{0} concatenate(s32[1]{0} %constant.1, s32[1]{0} %bitcast.3), dimensions={0}
+  %dynamic-slice.2 = f32[20,1]{1,0} dynamic-slice(f32[20,20]{1,0} %parameter.1, s32[2]{0} %concatenate.1), dynamic_slice_sizes={20,1}
+  %bitcast.4 = f32[20]{0} bitcast(f32[20,1]{1,0} %dynamic-slice.2)
+  %dynamic-slice.3 = f32[1]{0} dynamic-slice(f32[20]{0} %bitcast.4, s32[1]{0} %bitcast.3), dynamic_slice_sizes={1}
+  ROOT %bitcast.5 = f32[] bitcast(f32[1]{0} %dynamic-slice.3)
+}
+)")
+                    .ValueOrDie();
+
+  TF_ASSERT_OK_AND_ASSIGN(std::vector<Literal> args,
+                          MakeFakeArguments(module.get()));
+  ASSERT_EQ(args.size(), 2);
+  EXPECT_TRUE(ShapeUtil::Equal(args[0].shape(), ShapeUtil::MakeShape(S32, {})))
+      << ShapeUtil::HumanString(args[0].shape());
+  EXPECT_TRUE(
+      ShapeUtil::Equal(args[1].shape(), ShapeUtil::MakeShape(F32, {20, 20})))
+      << ShapeUtil::HumanString(args[1].shape());
+}
+
+XLA_TEST_F(TestUtilsTest, MakeFakeArgumentsForGather) {
+  auto module = ParseHloString(R"(
+  HloModule Test
+
+ENTRY %module(paramater.0: f32[200,100,300], parameter.1: s32[10,2]) ->
+                                                          f32[10,300] {
+  %parameter.0 = f32[200,100,300] parameter(0)
+  %parameter.1 = s32[10,2] parameter(1)
+  ROOT gather = f32[10,300] gather(f32[200,100,300] %parameter.0,
+                                   s32[10,2] %parameter.1),
+      offset_dims={1},
+      collapsed_slice_dims={0,1},
+      start_index_map={0,1},
+      index_vector_dim=1,
+      slice_sizes={1,1,300}
+}
+)")
+                    .ValueOrDie();
+
+  TF_ASSERT_OK_AND_ASSIGN(std::vector<Literal> args,
+                          MakeFakeArguments(module.get()));
+  ASSERT_EQ(args.size(), 2);
+
+  const Shape& indices_shape = args[1].shape();
+  EXPECT_TRUE(
+      ShapeUtil::Equal(indices_shape, ShapeUtil::MakeShape(S32, {10, 2})))
+      << ShapeUtil::HumanString(indices_shape);
+  auto indices = args[1].data<int32>();
+  for (const auto index : indices) {
+    EXPECT_GE(index, -1);
+    EXPECT_LE(index, 100);
+  }
+}
+
+XLA_TEST_F(TestUtilsTest, MakeFakeArgumentsForScatter) {
+  auto module = ParseHloString(R"(
+  HloModule Test
+
+scatter_update (lhs: f32[], rhs: f32[]) -> f32[] {
+  lhs = f32[] parameter(0)
+  ROOT rhs = f32[] parameter(1)
+}
+
+ENTRY main {
+  operand = f32[200,100,300] parameter(0)
+  indices = s32[10,2] parameter(1)
+  updates = f32[10,300] parameter(2)
+  ROOT scatter = f32[200,100,300] scatter(operand, indices, updates),
+    to_apply=scatter_update,
+    update_window_dims={1},
+    inserted_window_dims={0,1},
+    scatter_dims_to_operand_dims={0,1},
+    index_vector_dim=1
+  }
+)")
+                    .ValueOrDie();
+
+  TF_ASSERT_OK_AND_ASSIGN(std::vector<Literal> args,
+                          MakeFakeArguments(module.get()));
+  ASSERT_EQ(args.size(), 3);
+
+  const Shape& indices_shape = args[1].shape();
+  EXPECT_TRUE(
+      ShapeUtil::Equal(indices_shape, ShapeUtil::MakeShape(S32, {10, 2})))
+      << ShapeUtil::HumanString(indices_shape);
+  auto indices = args[1].data<int32>();
+  for (const auto index : indices) {
+    EXPECT_GE(index, -1);
+    EXPECT_LE(index, 100);
+  }
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/token_hlo_test.cc b/tensorflow/compiler/xla/tests/token_hlo_test.cc
index 601c6b06938fef1f1ae809b33209ae59b24c70a2..b77cf38ed8e29973985406015c0a3936916ad5e6 100644
--- a/tensorflow/compiler/xla/tests/token_hlo_test.cc
+++ b/tensorflow/compiler/xla/tests/token_hlo_test.cc
@@ -214,8 +214,8 @@ ENTRY %AddDependency (p0: f32[], p1: f32[]) -> f32[] {
 
   %forty_two = f32[] constant(42.0)
   %add = f32[] add(f32[] %p0, f32[] %forty_two)
-  %token = token[] after-all(f32[] %add)
-  %p1_after_token = f32[] add-dependency(f32[] %p1, token[] %token)
+  %token0 = token[] after-all(f32[] %add)
+  %p1_after_token = f32[] add-dependency(f32[] %p1, token[] %token0)
   %neg = f32[] negate(f32[] %p1_after_token)
   ROOT %product = f32[] multiply(f32[] %add, f32[] %neg)
 }
@@ -236,8 +236,8 @@ HloModule AddDependencyOfConstant, is_scheduled=true
 ENTRY %AddDependency (p0: f32[]) -> f32[] {
   %p0 = f32[] parameter(0)
   %forty_two = f32[] constant(42.0)
-  %token = token[] after-all(f32[] %p0)
-  %forty_two_after_token = f32[] add-dependency(f32[] %forty_two, token[] %token)
+  %token0 = token[] after-all(f32[] %p0)
+  %forty_two_after_token = f32[] add-dependency(f32[] %forty_two, token[] %token0)
   ROOT %product = f32[] multiply(f32[] %p0, f32[] %forty_two_after_token)
 }
 )";
@@ -255,8 +255,8 @@ HloModule AddDependencyAsRoot, is_scheduled=true
 ENTRY %AddDependency (p: f32[3]) -> f32[3] {
   %p = f32[3] parameter(0)
   %neg = f32[3] negate(f32[3] %p)
-  %token = token[] after-all()
-  ROOT %add_dep = f32[3] add-dependency(f32[3] %neg, token[] %token)
+  %token0 = token[] after-all()
+  ROOT %add_dep = f32[3] add-dependency(f32[3] %neg, token[] %token0)
 }
 )";
   TF_ASSERT_OK_AND_ASSIGN(
@@ -274,9 +274,9 @@ ENTRY %TupleShapedAddDependency (p0: f32[3], p1: f32[3]) -> f32[3] {
   %p0 = f32[3] parameter(0)
   %p1 = f32[3] parameter(1)
   %forty_two = f32[] constant(42.0)
-  %token = token[] after-all()
-  %tuple = (f32[3], token[], f32[3], f32[]) tuple(f32[3] %p0, token[] %token, f32[3] %p1, f32[] %forty_two)
-  %add_dep = (f32[3], token[], f32[3], f32[]) add-dependency((f32[3], token[], f32[3], f32[]) %tuple, token[] %token)
+  %token0 = token[] after-all()
+  %tuple = (f32[3], token[], f32[3], f32[]) tuple(f32[3] %p0, token[] %token0, f32[3] %p1, f32[] %forty_two)
+  %add_dep = (f32[3], token[], f32[3], f32[]) add-dependency((f32[3], token[], f32[3], f32[]) %tuple, token[] %token0)
   %elem0 = f32[3] get-tuple-element((f32[3], token[], f32[3], f32[]) %add_dep), index=0
   %elem2 = f32[3] get-tuple-element((f32[3], token[], f32[3], f32[]) %add_dep), index=2
   ROOT %diff = f32[3] subtract(f32[3] %elem0, f32[3] %elem2)
diff --git a/tensorflow/compiler/xla/tests/triangular_solve_test.cc b/tensorflow/compiler/xla/tests/triangular_solve_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..24ab12136ff396bd9ac37bb058311b0d2d6f2515
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/triangular_solve_test.cc
@@ -0,0 +1,502 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <numeric>
+#include <vector>
+
+#include "tensorflow/compiler/xla/array2d.h"
+#include "tensorflow/compiler/xla/client/lib/math.h"
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+namespace {
+
+using TriangularSolveTest = ClientLibraryTestBase;
+using TriangularSolveLeftLookingTest = ClientLibraryTestBase;
+
+static constexpr float kNan = std::numeric_limits<float>::quiet_NaN();
+
+Array2D<float> AValsLower() {
+  return {{2, kNan, kNan, kNan},
+          {3, 6, kNan, kNan},
+          {4, 7, 9, kNan},
+          {5, 8, 10, 11}};
+}
+
+Array2D<float> AValsUpper() {
+  return {{2, 3, 4, 5},
+          {kNan, 6, 7, 8},
+          {kNan, kNan, 9, 10},
+          {kNan, kNan, kNan, 11}};
+}
+
+Array2D<float> AValsLowerUnitDiagonal() {
+  return {{kNan, kNan, kNan, kNan},
+          {3, kNan, kNan, kNan},
+          {4, 7, kNan, kNan},
+          {5, 8, 10, kNan}};
+}
+
+Array2D<float> AValsUpperUnitDiagonal() {
+  return {{kNan, 3, 4, 5},
+          {kNan, kNan, 7, 8},
+          {kNan, kNan, kNan, 10},
+          {kNan, kNan, kNan, kNan}};
+}
+
+Array2D<float> BValsRight() {
+  return {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}};
+}
+
+Array2D<float> BValsLeft() {
+  return {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}, {10, 11, 12}};
+}
+
+static constexpr complex64 kNanC64 = complex64(kNan, kNan);
+
+Array2D<complex64> AValsLowerComplex() {
+  return {{2, kNanC64, kNanC64, kNanC64},
+          {complex64(3, 1), 6, kNanC64, kNanC64},
+          {4, complex64(7, 2), 9, kNanC64},
+          {5, 8, complex64(10, 3), 11}};
+}
+
+Array2D<complex64> AValsUpperComplex() {
+  return {{2, 3, complex64(4, 3), 5},
+          {kNanC64, 6, complex64(7, 2), 8},
+          {kNanC64, kNanC64, complex64(9, 1), 10},
+          {kNanC64, kNanC64, kNanC64, 11}};
+}
+
+Array2D<complex64> BValsRightComplex() {
+  return {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}};
+}
+
+Array2D<complex64> BValsLeftComplex() {
+  return {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}, {10, 11, 12}};
+}
+
+XLA_TEST_F(TriangularSolveTest, EmptyArrays) {
+  XlaBuilder builder(TestName());
+
+  XlaOp a, b;
+  auto a_data =
+      CreateR2Parameter<float>(Array2D<float>(0, 0), 0, "a", &builder, &a);
+  auto b_data =
+      CreateR2Parameter<float>(Array2D<float>(0, 10), 1, "b", &builder, &b);
+  TriangularSolve(a, b,
+                  /*left_side=*/true, /*lower=*/true,
+                  /*unit_diagonal=*/false,
+                  /*transpose_a=*/TriangularSolveOptions::TRANSPOSE);
+
+  ComputeAndCompareR2<float>(&builder, Array2D<float>(0, 10),
+                             {a_data.get(), b_data.get()});
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleRightLowerTranspose) {
+  XlaBuilder builder(TestName());
+
+  XlaOp a, b;
+  auto a_data = CreateR2Parameter<float>(AValsLower(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsRight(), 1, "b", &builder, &b);
+  TriangularSolve(a, b,
+                  /*left_side=*/false, /*lower=*/true,
+                  /*unit_diagonal=*/false,
+                  /*transpose_a=*/TriangularSolveOptions::TRANSPOSE);
+
+  Array2D<float> expected({
+      {0.5, 0.08333334, 0.04629629, 0.03367003},
+      {2.5, -0.25, -0.1388889, -0.1010101},
+      {4.5, -0.58333331, -0.32407406, -0.23569024},
+  });
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleRightLowerNotranspose) {
+  XlaBuilder builder(TestName());
+
+  XlaOp a, b;
+  auto a_data = CreateR2Parameter<float>(AValsLower(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsRight(), 1, "b", &builder, &b);
+  TriangularSolve(a, b,
+                  /*left_side=*/false, /*lower=*/true,
+                  /*unit_diagonal=*/false,
+                  /*transpose_a=*/TriangularSolveOptions::NO_TRANSPOSE);
+
+  Array2D<float> expected({
+      {-0.16414141, -0.06902357, -0.07070707, 0.36363636},
+      {0.64393939, 0.06565657, -0.03030303, 0.72727273},
+      {1.4520202, 0.2003367, 0.01010101, 1.09090909},
+  });
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleRightUpperTranspose) {
+  XlaBuilder builder(TestName());
+
+  XlaOp a, b;
+  auto a_data = CreateR2Parameter<float>(AValsUpper(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsRight(), 1, "b", &builder, &b);
+  TriangularSolve(a, b,
+                  /*left_side=*/false, /*lower=*/false,
+                  /*unit_diagonal=*/false,
+                  /*transpose_a=*/TriangularSolveOptions::TRANSPOSE);
+
+  Array2D<float> expected({
+      {-0.16414141, -0.06902357, -0.07070707, 0.36363636},
+      {0.64393939, 0.06565657, -0.03030303, 0.72727273},
+      {1.4520202, 0.2003367, 0.01010101, 1.09090909},
+  });
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleRightUpperNotranspose) {
+  XlaBuilder builder(TestName());
+
+  XlaOp a, b;
+  auto a_data = CreateR2Parameter<float>(AValsUpper(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsRight(), 1, "b", &builder, &b);
+  TriangularSolve(a, b,
+                  /*left_side=*/false, /*lower=*/false,
+                  /*unit_diagonal=*/false,
+                  /*transpose_a=*/TriangularSolveOptions::NO_TRANSPOSE);
+
+  Array2D<float> expected({
+      {0.5, 0.08333334, 0.04629629, 0.03367003},
+      {2.5, -0.25, -0.1388889, -0.1010101},
+      {4.5, -0.58333331, -0.32407406, -0.23569024},
+  });
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerTranspose) {
+  XlaBuilder builder(TestName());
+
+  XlaOp a, b;
+  auto a_data = CreateR2Parameter<float>(AValsLower(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
+  TriangularSolve(a, b,
+                  /*left_side=*/true, /*lower=*/true,
+                  /*unit_diagonal=*/false,
+                  /*transpose_a=*/TriangularSolveOptions::TRANSPOSE);
+
+  Array2D<float> expected({
+      {-0.89646465, -0.69444444, -0.49242424},
+      {-0.27441077, -0.24074074, -0.20707071},
+      {-0.23232323, -0.22222222, -0.21212121},
+      {0.90909091, 1., 1.09090909},
+  });
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerNotranspose) {
+  XlaBuilder builder(TestName());
+
+  XlaOp a, b;
+  auto a_data = CreateR2Parameter<float>(AValsLower(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
+  TriangularSolve(a, b,
+                  /*left_side=*/true, /*lower=*/true,
+                  /*unit_diagonal=*/false,
+                  /*transpose_a=*/TriangularSolveOptions::NO_TRANSPOSE);
+
+  Array2D<float> expected({
+      {0.5, 1.0, 1.5},
+      {0.41666667, 0.33333333, 0.25},
+      {0.23148148, 0.18518519, 0.13888889},
+      {0.16835017, 0.13468013, 0.1010101},
+  });
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerNoTransposeUnitDiagonal) {
+  XlaBuilder builder(TestName());
+
+  XlaOp a, b;
+  auto a_data =
+      CreateR2Parameter<float>(AValsLowerUnitDiagonal(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
+  TriangularSolve(a, b,
+                  /*left_side=*/true, /*lower=*/true,
+                  /*unit_diagonal=*/true,
+                  /*transpose_a=*/TriangularSolveOptions::NO_TRANSPOSE);
+
+  Array2D<float> expected(
+      {{1., 2., 3.}, {1., -1., -3.}, {-4., 7., 18.}, {37., -61., -159.}});
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerNotransposeIrregularblock) {
+  XlaBuilder builder(TestName());
+
+  XlaOp a, b;
+  auto a_data = CreateR2Parameter<float>(AValsLower(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
+  TriangularSolve(a, b,
+                  /*left_side=*/true, /*lower=*/true,
+                  /*unit_diagonal=*/false,
+                  /*transpose_a=*/TriangularSolveOptions::NO_TRANSPOSE);
+
+  Array2D<float> expected({
+      {0.5, 1.0, 1.5},
+      {0.41666667, 0.33333333, 0.25},
+      {0.23148148, 0.18518519, 0.13888889},
+      {0.16835017, 0.13468013, 0.1010101},
+  });
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperTranspose) {
+  XlaBuilder builder(TestName());
+
+  XlaOp a, b;
+  auto a_data = CreateR2Parameter<float>(AValsUpper(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
+  TriangularSolve(a, b,
+                  /*left_side=*/true, /*lower=*/false,
+                  /*unit_diagonal=*/false,
+                  /*transpose_a=*/TriangularSolveOptions::TRANSPOSE);
+
+  Array2D<float> expected({
+      {0.5, 1.0, 1.5},
+      {0.41666667, 0.33333333, 0.25},
+      {0.23148148, 0.18518519, 0.13888889},
+      {0.16835017, 0.13468013, 0.1010101},
+  });
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperNotranspose) {
+  XlaBuilder builder(TestName());
+
+  XlaOp a, b;
+  auto a_data = CreateR2Parameter<float>(AValsUpper(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
+  TriangularSolve(a, b,
+                  /*left_side=*/true, /*lower=*/false,
+                  /*unit_diagonal=*/false,
+                  /*transpose_a=*/TriangularSolveOptions::NO_TRANSPOSE);
+
+  Array2D<float> expected({
+      {-0.89646465, -0.69444444, -0.49242424},
+      {-0.27441077, -0.24074074, -0.20707071},
+      {-0.23232323, -0.22222222, -0.21212121},
+      {0.90909091, 1., 1.09090909},
+  });
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperNotransposeUnitDiagonal) {
+  XlaBuilder builder(TestName());
+
+  XlaOp a, b;
+  auto a_data =
+      CreateR2Parameter<float>(AValsUpperUnitDiagonal(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
+  TriangularSolve(a, b,
+                  /*left_side=*/true, /*lower=*/false,
+                  /*unit_diagonal=*/true,
+                  /*transpose_a=*/TriangularSolveOptions::NO_TRANSPOSE);
+
+  Array2D<float> expected({{-1402., -1538., -1674.},
+                           {575., 631., 687.},
+                           {-93., -102., -111.},
+                           {10., 11., 12.}});
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleRightLowerTransposeConjugate) {
+  XlaBuilder builder(TestName());
+
+  XlaOp a, b;
+  auto a_data =
+      CreateR2Parameter<complex64>(AValsLowerComplex(), 0, "a", &builder, &a);
+  auto b_data =
+      CreateR2Parameter<complex64>(BValsRightComplex(), 1, "b", &builder, &b);
+  TriangularSolve(a, b,
+                  /*left_side=*/false, /*lower=*/true,
+                  /*unit_diagonal=*/false,
+                  /*transpose_a=*/TriangularSolveOptions::ADJOINT);
+
+  Array2D<complex64> expected({
+      {0.5, complex64(0.08333333, 0.08333333),
+       complex64(0.02777778, -0.0462963), complex64(0.06313131, -0.01094276)},
+      {2.5, complex64(-0.25, 0.41666667), complex64(-0.23148148, -0.37962963),
+       complex64(0.08670034, -0.02104377)},
+      {4.5, complex64(-0.58333333, 0.75), complex64(-0.49074074, -0.71296296),
+       complex64(0.11026936, -0.03114478)},
+  });
+
+  ComputeAndCompareR2<complex64>(
+      &builder, expected, {a_data.get(), b_data.get()}, ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperTransposeNoconjugate) {
+  XlaBuilder builder(TestName());
+
+  XlaOp a, b;
+  auto a_data =
+      CreateR2Parameter<complex64>(AValsUpperComplex(), 0, "a", &builder, &a);
+  auto b_data =
+      CreateR2Parameter<complex64>(BValsLeftComplex(), 1, "b", &builder, &b);
+  TriangularSolve(a, b,
+                  /*left_side=*/true, /*lower=*/false,
+                  /*unit_diagonal=*/false,
+                  /*transpose_a=*/TriangularSolveOptions::TRANSPOSE);
+
+  Array2D<complex64> expected({
+      {0.5, 1., 1.5},
+      {0.41666667, 0.33333333, 0.25},
+      {complex64(0.20020325, -2.81504065e-01),
+       complex64(0.13821138, -4.22764228e-01),
+       complex64(0.07621951, -5.64024390e-01)},
+      {complex64(0.19678492, 2.55912786e-01),
+       complex64(0.17738359, 3.84331116e-01),
+       complex64(0.15798226, 5.12749446e-01)},
+  });
+
+  ComputeAndCompareR2<complex64>(
+      &builder, expected, {a_data.get(), b_data.get()}, ErrorSpec(1e-2, 1e-2));
+}
+
+XLA_TEST_F(TriangularSolveTest, BatchedLeftUpper) {
+  XlaBuilder builder(TestName());
+
+  Array3D<float> bvals(7, 5, 5);
+  bvals.FillIota(1.);
+
+  // Set avals to the upper triangle of bvals.
+  Array3D<float> avals = bvals;
+  avals.Each([](absl::Span<const int64> indices, float* value) {
+    if (indices[1] > indices[2]) {
+      *value = 0;
+    }
+  });
+
+  XlaOp a, b;
+  auto a_data = CreateR3Parameter<float>(avals, 0, "a", &builder, &a);
+  auto b_data = CreateR3Parameter<float>(bvals, 1, "b", &builder, &b);
+  BatchDot(
+      ConstantR3FromArray3D(&builder, avals),
+      TriangularSolve(a, b,
+                      /*left_side=*/true, /*lower=*/false,
+                      /*unit_diagonal=*/false,
+                      /*transpose_a=*/TriangularSolveOptions::NO_TRANSPOSE));
+
+  ComputeAndCompareR3<float>(&builder, bvals, {a_data.get(), b_data.get()},
+                             ErrorSpec(1e-2, 1e-2));
+}
+
+struct TriangularSolveTestSpec {
+  int m, n;  // A is mxm, B is mxn
+  bool left_side;
+  bool lower;
+  TriangularSolveOptions::Transpose transpose_a;
+};
+
+class TriangularSolveParametricTest
+    : public ClientLibraryTestBase,
+      public ::testing::WithParamInterface<TriangularSolveTestSpec> {};
+
+XLA_TEST_P(TriangularSolveParametricTest, Random) {
+  TriangularSolveTestSpec spec = GetParam();
+
+  XlaBuilder builder(TestName());
+
+  Array2D<float> avals(spec.m, spec.m);
+  avals.FillRandom(1.0);
+  for (int i = 0; i < spec.m; ++i) {
+    avals(i, i) += 10;
+  }
+
+  std::pair<int, int> bdims = spec.left_side ? std::make_pair(spec.m, spec.n)
+                                             : std::make_pair(spec.n, spec.m);
+  Array2D<float> bvals(bdims.first, bdims.second);
+  bvals.FillRandom(1.0);
+
+  XlaOp a, b;
+  auto a_data = CreateR2Parameter<float>(avals, 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(bvals, 1, "b", &builder, &b);
+  auto x = TriangularSolve(a, b, spec.left_side, spec.lower,
+                           /*unit_diagonal=*/false, spec.transpose_a);
+  auto a_tri = Triangle(a, spec.lower);
+  a_tri = MaybeTransposeInMinorDims(
+      a_tri, spec.transpose_a != TriangularSolveOptions::NO_TRANSPOSE);
+  if (spec.left_side) {
+    BatchDot(a_tri, x);
+  } else {
+    BatchDot(x, a_tri);
+  }
+
+  ComputeAndCompareR2<float>(&builder, bvals, {a_data.get(), b_data.get()},
+                             ErrorSpec(1e-2, 1e-2));
+}
+
+std::vector<TriangularSolveTestSpec> TriangularSolveTests() {
+  std::vector<TriangularSolveTestSpec> specs;
+  for (int m : {5, 10}) {
+    for (int n : {5, 10}) {
+      for (bool left_side : {false, true}) {
+        for (bool lower : {false, true}) {
+          for (TriangularSolveOptions::Transpose transpose_a :
+               {TriangularSolveOptions::NO_TRANSPOSE,
+                TriangularSolveOptions::TRANSPOSE}) {
+            specs.push_back({m, n, left_side, lower, transpose_a});
+          }
+        }
+      }
+    }
+  }
+  return specs;
+}
+
+INSTANTIATE_TEST_SUITE_P(TriangularSolveParametricTestInstantiation,
+                         TriangularSolveParametricTest,
+                         ::testing::ValuesIn(TriangularSolveTests()));
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/tuple_test.cc b/tensorflow/compiler/xla/tests/tuple_test.cc
index 27ce243e9bd4afbdcc1fdc5b6873d4968086e459..cdf2c34fcc3cc005e84626c39c8ab301a9040529 100644
--- a/tensorflow/compiler/xla/tests/tuple_test.cc
+++ b/tensorflow/compiler/xla/tests/tuple_test.cc
@@ -176,8 +176,9 @@ XLA_TEST_F(TupleTest, AddTupleElements) {
       {2.f, 4.f, 6.f},  // row 0
       {5.f, 7.f, 9.f},  // row 1
   });
-  ASSERT_TRUE(ShapeUtil::ShapeIs(vector_shape, F32, {3}));
-  ASSERT_TRUE(ShapeUtil::ShapeIs(matrix_shape, F32, {/*y=*/2, /*x=*/3}));
+  ASSERT_TRUE(ShapeUtil::Equal(vector_shape, ShapeUtil::MakeShape(F32, {3})));
+  ASSERT_TRUE(ShapeUtil::Equal(matrix_shape,
+                               ShapeUtil::MakeShape(F32, {/*y=*/2, /*x=*/3})));
   ComputeAndCompareR2<float>(&builder, expected, {}, error_spec_);
 }
 
@@ -512,8 +513,7 @@ XLA_TEST_F(TupleTest, ComplexTuples) {
 
 class TupleHloTest : public HloTestBase {};
 
-// Disabled on the interpreter because bitcast doesn't exist on the interpreter.
-XLA_TEST_F(TupleHloTest, DISABLED_ON_INTERPRETER(BitcastAfterGTE)) {
+XLA_TEST_F(TupleHloTest, BitcastAfterGTE) {
   const char* testcase = R"(
     HloModule m, is_scheduled=true
 
@@ -525,9 +525,7 @@ XLA_TEST_F(TupleHloTest, DISABLED_ON_INTERPRETER(BitcastAfterGTE)) {
       ROOT tuple.4 = (f32[1,3]{1,0}) tuple(copy)
     }
   )";
-  auto module =
-      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
-          .ValueOrDie();
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
   auto param =
       LiteralUtil::MakeTupleOwned(LiteralUtil::CreateR1<float>({1, 2, 3}));
   auto result = ExecuteNoHloPasses(std::move(module), {&param});
@@ -555,13 +553,11 @@ XLA_TEST_F(TupleHloTest,
       s = (f32[2],f32[2]) tuple-select(cond, tup0, tup1)
       gte = f32[2] get-tuple-element(s), index=0
       tuple = (f32[2]) tuple(gte)
-      token = token[] after-all()
-      ROOT outfeed = token[] outfeed(tuple, token)
+      token0 = token[] after-all()
+      ROOT outfeed = token[] outfeed(tuple, token0)
     }
   )";
-  auto module =
-      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
-          .ValueOrDie();
+  auto module = ParseAndReturnVerifiedModule(testcase).ValueOrDie();
   auto param0 = LiteralUtil::CreateR1<float>({1, 2});
   auto param1 = LiteralUtil::CreateR1<float>({2, 3});
   auto param4 = LiteralUtil::CreateR0<bool>(false);
diff --git a/tensorflow/compiler/xla/tests/unary_op_test.cc b/tensorflow/compiler/xla/tests/unary_op_test.cc
index 4fbd7f2fb174ac899c1e3b23801986cb52db96a2..c51f30f3b5db95962a719ec226dd03f41142a782 100644
--- a/tensorflow/compiler/xla/tests/unary_op_test.cc
+++ b/tensorflow/compiler/xla/tests/unary_op_test.cc
@@ -64,7 +64,9 @@ class UnaryOpTest : public ClientLibraryTestBase {
         &builder, {-2, 25, 0, static_cast<T>(-0.0), -123, inf<T>(), -inf<T>()});
     Sign(arg);
 
-    ComputeAndCompareR1<T>(&builder, {-1, 1, 0, 0, -1, 1, -1}, {});
+    ComputeAndCompareR1<T>(
+        &builder,
+        {-1, 1, static_cast<T>(+0.0), static_cast<T>(-0.0), -1, 1, -1}, {});
   }
 
   template <typename T>
diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc
index 6d5f276e82087cedc356691b0ff08df24cec8d20..85212fa56d71088156d2f3edda17f71cdab56da2 100644
--- a/tensorflow/compiler/xla/tests/while_test.cc
+++ b/tensorflow/compiler/xla/tests/while_test.cc
@@ -861,7 +861,7 @@ XLA_TEST_F(WhileTest, WhileWithDynamicUpdateSlice) {
     // Update.
     auto update = ConvertElementType(Broadcast(out0, {2}), F32);
     // Starts = iteration * 2;
-    auto starts = Reshape(Mul(iteration, ConstantR0<int32>(&builder, 2)), {1});
+    auto starts = Mul(iteration, ConstantR0<int32>(&builder, 2));
     // UpdateSlice.
     auto out1 = DynamicUpdateSlice(input, update, starts);
 
@@ -901,7 +901,7 @@ XLA_TEST_F(WhileTest, WhileWithDynamicUpdateSlice) {
 // Per backend the values generated can be different as the different backends
 // use different random number generators.
 // TODO(b/32240857): Extend test to verify outputs.
-XLA_TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileWithPrngScalarResult)) {
+XLA_TEST_F(WhileTest, WhileWithPrngScalarResult) {
   auto v6s32 = ShapeUtil::MakeShape(S32, {6});
 
   // Create a computation for the condition: repeat for count iterations.
@@ -1146,7 +1146,7 @@ XLA_TEST_F(WhileTest, NestedWhileWithScalarResult) {
 // while (f(result).get<0>()) {
 //   result = result + 1;
 // }
-XLA_TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileWithCallInsideCondition)) {
+XLA_TEST_F(WhileTest, WhileWithCallInsideCondition) {
   auto result_shape = ShapeUtil::MakeShape(S32, {});
 
   // Create a computation for the condition: repeat for 5 iterations.
@@ -1299,9 +1299,9 @@ void BM_WhileLoop(int num_iters) {
     auto one = ConstantR0<float>(&builder, 1.0);
     auto update = Broadcast(one, {1, 1024, 1024});
     // Starts = iteration * 2;
-    auto starts = ConstantR1<int32>(&builder, {0, 0, 0});
+    auto zero = ConstantR0<int32>(&builder, 0);
     // UpdateSlice.
-    auto out1 = DynamicUpdateSlice(input, update, starts);
+    auto out1 = DynamicUpdateSlice(input, update, {zero, zero, zero});
     Tuple(&builder, {out0, out1});
     body = builder.Build().ConsumeValueOrDie();
   }
diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
index e57d072a0632b492b8b6e34439f4e80332b843b6..7b7b8f5d02dc99607b30f898e18c5b448d421e07 100644
--- a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
+++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
@@ -40,8 +40,6 @@ limitations under the License.
 namespace xla {
 namespace {
 
-namespace gtl = ::tensorflow::gtl;
-
 class HloProfileTest : public ClientLibraryTestBase {};
 
 struct ParsedProfileOutputLine {
@@ -174,9 +172,8 @@ void ExecuteAndFetchProfile(string* profile_output, LocalClient* client,
   exec_run_options.set_allocator(backend->memory_allocator());
   exec_run_options.set_intra_op_thread_pool(
       backend->eigen_intra_op_thread_pool_device());
-  ServiceExecutableRunOptions run_options(
-      exec_run_options, /*borrow_stream=*/nullptr,
-      backend->eigen_intra_op_thread_pool());
+  ServiceExecutableRunOptions run_options(exec_run_options,
+                                          /*borrow_stream=*/nullptr);
   std::vector<const ShapedBuffer*> args = {&lhs_arg, &rhs_arg};
   TF_ASSERT_OK_AND_ASSIGN(
       auto execution_result,
@@ -225,14 +222,17 @@ XLA_TEST_F(HloProfileTest, ProfileSingleComputation) {
 
   line_no++;  // Skip 'Execution profile for ....'
 
+  ASSERT_LT(line_no, profile_output_lines.size());
   TF_ASSERT_OK(ParseOneProfileOutputLine(profile_output_lines[line_no++],
                                          /*expect_hlo=*/false,
                                          &parsed_profile_lines));
 
+  ASSERT_LT(line_no, profile_output_lines.size());
   TF_ASSERT_OK(ParseOneProfileOutputLine(profile_output_lines[line_no++],
                                          /*expect_hlo=*/true,
                                          &parsed_profile_lines));
 
+  ASSERT_LT(line_no, profile_output_lines.size());
   TF_ASSERT_OK(ParseOneProfileOutputLine(profile_output_lines[line_no++],
                                          /*expect_hlo=*/true,
                                          &parsed_profile_lines));
diff --git a/tensorflow/compiler/xla/text_literal_reader.cc b/tensorflow/compiler/xla/text_literal_reader.cc
index cdde88c1359416d423685f330e9cbdf77948034f..c78ec522aa5f13556c6d4602267544694887f250 100644
--- a/tensorflow/compiler/xla/text_literal_reader.cc
+++ b/tensorflow/compiler/xla/text_literal_reader.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "absl/strings/strip.h"
 #include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -66,7 +67,7 @@ StatusOr<Literal> TextLiteralReader::ReadAllLines() {
   }
 
   absl::StripAsciiWhitespace(&shape_string);
-  TF_ASSIGN_OR_RETURN(Shape shape, ShapeUtil::ParseShapeString(shape_string));
+  TF_ASSIGN_OR_RETURN(Shape shape, ParseShape(shape_string));
   if (shape.element_type() != F32) {
     return Unimplemented(
         "unsupported element type for text literal reading: %s",
diff --git a/tensorflow/compiler/xla/text_literal_writer.cc b/tensorflow/compiler/xla/text_literal_writer.cc
index 7289ae7df65e56652eeeb67e536e4c721d97d999..fc7949d889dc8ed9fac425982cc555a6c42a7f1d 100644
--- a/tensorflow/compiler/xla/text_literal_writer.cc
+++ b/tensorflow/compiler/xla/text_literal_writer.cc
@@ -24,7 +24,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/types.h"
 
diff --git a/tensorflow/compiler/xla/tools/BUILD b/tensorflow/compiler/xla/tools/BUILD
index 8926bbed2b54fceaaf0e6e991f0e881d35731ef4..ebd4bb1e42c9d1dc1f72a75514e916a2d900c30e 100644
--- a/tensorflow/compiler/xla/tools/BUILD
+++ b/tensorflow/compiler/xla/tools/BUILD
@@ -14,7 +14,7 @@ filegroup(
     visibility = ["//tensorflow/compiler/xla:internal"],
 )
 
-load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
+load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test")
 
 tf_cc_binary(
     name = "hex_floats_to_packed_literal",
@@ -29,33 +29,6 @@ tf_cc_binary(
     ],
 )
 
-cc_library(
-    name = "dumped_computation_to_graphviz_library",
-    srcs = ["dumped_computation_to_graphviz.cc"],
-    deps = [
-        "//tensorflow/compiler/xla:debug_options_flags",
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client",
-        "//tensorflow/compiler/xla/client:client_library",
-        "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client:xla_computation",
-        "//tensorflow/compiler/xla/service",
-        "//tensorflow/compiler/xla/service:hlo_proto",
-        "//tensorflow/core:lib",
-        "@com_google_absl//absl/types:span",
-    ],
-)
-
-tf_cc_binary(
-    name = "dumped_computation_to_graphviz",
-    deps = [
-        ":dumped_computation_to_graphviz_library",
-        "//tensorflow/compiler/xla/service:interpreter_plugin",
-    ],
-)
-
 tf_cc_binary(
     name = "show_signature",
     srcs = ["show_signature.cc"],
@@ -95,6 +68,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/service:hlo_proto",
         "//tensorflow/compiler/xla/service/gpu:infeed_manager",
+        "//tensorflow/compiler/xla/service/gpu:outfeed_manager",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
@@ -204,33 +178,66 @@ tf_cc_binary(
 )
 
 tf_cc_binary(
-    name = "dumped_computation_to_tf_graphdef",
-    srcs = ["dumped_computation_to_tf_graphdef.cc"],
+    name = "hlo_proto_to_json",
+    srcs = ["hlo_proto_to_json.cc"],
     deps = [
-        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla/client",
-        "//tensorflow/compiler/xla/client:client_library",
-        "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client:xla_computation",
-        "//tensorflow/compiler/xla/service",
-        "//tensorflow/compiler/xla/service:hlo_graph_dumper",
+        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/service:hlo_proto",
-        "//tensorflow/compiler/xla/service:interpreter_plugin",
+        "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
-        "@com_google_absl//absl/types:span",
+    ],
+)
+
+tf_cc_test(
+    name = "hlo_extractor_test",
+    srcs = ["hlo_extractor_test.cc"],
+    deps = [
+        ":hlo_extractor",
+        "//tensorflow/compiler/xla/service:hlo_matchers",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/core:test",
+    ],
+)
+
+cc_library(
+    name = "hlo_extractor",
+    srcs = ["hlo_extractor.cc"],
+    hdrs = ["hlo_extractor.h"],
+    deps = [
+        "//tensorflow/compiler/xla:status",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_verifier",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/memory",
     ],
 )
 
 tf_cc_binary(
-    name = "hlo_proto_to_json",
-    srcs = ["hlo_proto_to_json.cc"],
+    name = "interactive_graphviz",
+    srcs = ["interactive_graphviz.cc"],
     deps = [
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:util",
+        ":hlo_extractor",
+        "//tensorflow/compiler/xla/client:client_library",
+        "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/service:compiler",
+        "//tensorflow/compiler/xla/service:cpu_plugin",
+        "//tensorflow/compiler/xla/service:gpu_plugin",
+        "//tensorflow/compiler/xla/service:hlo_graph_dumper",
         "//tensorflow/compiler/xla/service:hlo_proto",
+        "//tensorflow/compiler/xla/service:hlo_runner",
+        "//tensorflow/compiler/xla/service:local_service",
+        "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/strings",
     ],
 )
+
+sh_test(
+    name = "interactive_graphviz_build_only_test",
+    srcs = ["interactive_graphviz_test.sh"],
+    data = [":interactive_graphviz"],
+)
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc
deleted file mode 100644
index b623556468fb4a5d96be614b6c067d5a1df51a6f..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Usage: dumped_computation_to_graphviz some_binary_snapshot_proto*
-//
-// Dumps a graphviz URL for a snapshot computation to the command line.
-//
-// some_binary_snapshot_proto is obtained by serializing the HloSnapshot from
-// ServiceInterface::SnapshotComputation to disk.
-//
-// The GraphViz URL is placed into the log stderr, whereas computation
-// statistics are printed on stdout (implementation note: getting computation
-// statistics is how we trigger compilation to split out a GraphViz URL).
-
-#include <stdio.h>
-#include <memory>
-#include <string>
-
-#include "absl/types/span.h"
-#include "tensorflow/compiler/xla/client/client.h"
-#include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_computation.h"
-#include "tensorflow/compiler/xla/debug_options_flags.h"
-#include "tensorflow/compiler/xla/service/hlo.pb.h"
-#include "tensorflow/compiler/xla/service/service.h"
-#include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/init_main.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace xla {
-namespace tools {
-
-void RealMain(absl::Span<char* const> args) {
-  Client* client = ClientLibrary::LocalClientOrDie();
-  for (char* arg : args) {
-    HloSnapshot module;
-    TF_CHECK_OK(
-        tensorflow::ReadBinaryProto(tensorflow::Env::Default(), arg, &module));
-    XlaComputation computation =
-        client->LoadSnapshot(module).ConsumeValueOrDie();
-    DebugOptions debug_options = GetDebugOptionsFromFlags();
-    debug_options.set_xla_generate_hlo_graph(".*");
-    ComputationStats stats =
-        client->GetComputationStats(computation, debug_options)
-            .ConsumeValueOrDie();
-    fprintf(stdout, ">>> %s :: %s\n", arg, stats.DebugString().c_str());
-  }
-}
-
-}  // namespace tools
-}  // namespace xla
-
-int main(int argc, char** argv) {
-  std::vector<tensorflow::Flag> flag_list;
-  xla::AppendDebugOptionsFlags(&flag_list);
-  xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
-  const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
-  if (!parse_result) {
-    LOG(ERROR) << "\n" << usage;
-    return 2;
-  }
-  tensorflow::port::InitMain(argv[0], &argc, &argv);
-
-  absl::Span<char* const> args(argv, argc);
-  args.remove_prefix(1);  // Pop off the binary name, argv[0]
-  xla::tools::RealMain(args);
-  return 0;
-}
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
index 4375e7c138c9e8d193feaa7a39d63946c4ea3086..df2d3d18b9ff86c0dd2047c2415527aeb1c1f154 100644
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
@@ -31,7 +31,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/service.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
index 723569862c7550387e95003e3a673743464b67b8..35bb82ca22f46d2cdeaac3b9a87b253efe9a07d9 100644
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
@@ -26,7 +26,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/service.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc
deleted file mode 100644
index f8bb9a6b1e217fc4e6e15c8a3302be61ed339c82..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc
+++ /dev/null
@@ -1,85 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Usage: dumped_computation_to_tf_graph some_binary_snapshot_proto*
-//
-// Dumps a tensorflow GraphDef in text format for a snapshot computation. The
-// dumped graph is an HLO computation with HLO instructions as nodes and can be
-// visualized on Tensorboard. Upload the dumped files on Tensorboard.
-//
-// some_binary_snapshot_proto is obtained by serializing the SessionModule from
-// ServiceInterface::SnapshotComputation to disk.
-
-#include <stdio.h>
-#include <memory>
-#include <string>
-
-#include "absl/types/span.h"
-#include "tensorflow/compiler/xla/client/client.h"
-#include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_computation.h"
-#include "tensorflow/compiler/xla/debug_options_flags.h"
-#include "tensorflow/compiler/xla/service/hlo.pb.h"
-#include "tensorflow/compiler/xla/service/service.h"
-#include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/init_main.h"
-#include "tensorflow/core/platform/logging.h"
-
-using tensorflow::Env;
-
-namespace xla {
-namespace tools {
-
-void RealMain(absl::Span<char* const> args) {
-  Client* client = ClientLibrary::LocalClientOrDie();
-  for (char* arg : args) {
-    HloSnapshot module;
-    TF_CHECK_OK(
-        tensorflow::ReadBinaryProto(tensorflow::Env::Default(), arg, &module));
-    XlaComputation computation =
-        client->LoadSnapshot(module).ConsumeValueOrDie();
-    DebugOptions debug_options = GetDebugOptionsFromFlags();
-    debug_options.set_xla_generate_hlo_graph(".*");
-    debug_options.set_xla_hlo_dump_as_graphdef(true);
-    ComputationStats stats =
-        client->GetComputationStats(computation, debug_options)
-            .ConsumeValueOrDie();
-    fprintf(stdout, ">>> %s :: %s\n", arg, stats.DebugString().c_str());
-  }
-}
-
-}  // namespace tools
-}  // namespace xla
-
-int main(int argc, char** argv) {
-  std::vector<tensorflow::Flag> flag_list;
-  xla::AppendDebugOptionsFlags(&flag_list);
-  xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
-  const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
-  if (!parse_result) {
-    LOG(ERROR) << "\n" << usage;
-    return 2;
-  }
-
-  tensorflow::port::InitMain(argv[0], &argc, &argv);
-
-  absl::Span<char* const> args(argv, argc);
-  args.remove_prefix(1);  // Pop off the binary name, argv[0]
-  xla::tools::RealMain(args);
-  return 0;
-}
diff --git a/tensorflow/compiler/xla/tools/hlo_extractor.cc b/tensorflow/compiler/xla/tools/hlo_extractor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f3ce5f99b0c2a8e9ae5446f4bedc34b678c95b96
--- /dev/null
+++ b/tensorflow/compiler/xla/tools/hlo_extractor.cc
@@ -0,0 +1,159 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/tools/hlo_extractor.h"
+
+#include <stdio.h>
+#include <unistd.h>
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
+#include "absl/memory/memory.h"
+#include "tensorflow/compiler/xla/service/hlo_clone_context.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_verifier.h"
+#include "tensorflow/compiler/xla/status.h"
+
+namespace xla {
+namespace {
+
+// Visitor that build a new HLO module with an entry computation and a root that
+// is provided to the visit function. Only HLOs that are reachable from the new
+// root instruction are included in the new module.
+//
+// The constructor allows specifying a set of boundary HLOs to prune the HLO
+// graph. HLOs at the boundary are replaced with parameters. Can be nullptr
+// which means no boundary, i.e. no HLOs are replaced with parameters.
+class ExtractionVisitor : public ConstDfsHloVisitorWithDefault {
+ public:
+  explicit ExtractionVisitor(
+      const HloModule& old_module,
+      absl::flat_hash_set<const HloInstruction*>* boundary)
+      : old_module_(old_module),
+        module_(absl::make_unique<HloModule>("extracted", config_)),
+        clone_context_(module_.get()),
+        builder_("entry_computation"),
+        boundary_(boundary) {}
+
+  Status HandleParameter(const HloInstruction* parameter) override {
+    // Entry parameters need renumbering.
+    auto new_parameter = HloInstruction::CreateParameter(
+        parameter_number_++, parameter->shape(), parameter->name());
+    clone_context_.MapInstruction(parameter, new_parameter.get());
+    builder_.AddInstruction(std::move(new_parameter));
+    return Status::OK();
+  }
+
+  Status DefaultAction(const HloInstruction* hlo) override {
+    // Replace instructions at the boundary with parameters, but leave constants
+    // untouched.
+    if (boundary_ != nullptr && boundary_->count(hlo) > 0) {
+      auto new_parameter = HloInstruction::CreateParameter(
+          parameter_number_, hlo->shape(), hlo->name());
+      parameter_number_++;
+      clone_context_.MapInstruction(hlo, new_parameter.get());
+      builder_.AddInstruction(std::move(new_parameter));
+      return Status::OK();
+    }
+    std::vector<HloInstruction*> new_operands;
+    for (auto operand : hlo->operands()) {
+      new_operands.push_back(clone_context_.GetInstruction(operand));
+    }
+    auto instruction =
+        hlo->CloneWithNewOperands(hlo->shape(), new_operands, &clone_context_);
+    builder_.AddInstruction(std::move(instruction));
+    return Status::OK();
+  }
+
+  Status FinishVisit(const HloInstruction* /*root*/) override {
+    module_->AddEntryComputation(builder_.Build());
+    // Rename HLOs so that their name matches the original. By default,
+    // HLOs get new unique names when adding a new entry computation to
+    // a module.
+    for (auto computation : old_module_.MakeComputationPostOrder()) {
+      for (auto old_instruction : computation->MakeInstructionPostOrder()) {
+        if (auto new_instruction =
+                clone_context_.FindInstruction(old_instruction)) {
+          new_instruction->SetAndSanitizeName(old_instruction->name());
+        }
+      }
+    }
+    return Status::OK();
+  }
+
+  HloModule* module() { return module_.get(); }
+
+  std::unique_ptr<HloModule> ConsumeModule() { return std::move(module_); }
+
+ private:
+  const HloModule& old_module_;
+  HloModuleConfig config_;
+  std::unique_ptr<HloModule> module_;
+  HloCloneContext clone_context_;
+  HloComputation::Builder builder_;
+  absl::flat_hash_set<const HloInstruction*>* boundary_;
+  int64 parameter_number_ = 0;
+};
+
+void ComputeBoundary(const HloInstruction* root, int64 limit,
+                     absl::flat_hash_set<const HloInstruction*>* boundary) {
+  std::deque<const HloInstruction*> worklist;
+  absl::flat_hash_map<const HloInstruction*, int64> visited;
+  worklist.push_back(root);
+  visited.emplace(root, 0);
+  while (!worklist.empty()) {
+    auto hlo = worklist.front();
+    worklist.pop_front();
+    int64 hops = visited[hlo];
+    if (hops > limit) {
+      boundary->insert(hlo);
+      continue;
+    }
+    for (const HloInstruction* operand : hlo->operands()) {
+      if (visited.count(operand)) {
+        continue;
+      }
+      worklist.push_back(operand);
+      visited.emplace(operand, hops + 1);
+    }
+  }
+}
+
+}  // namespace
+
+std::unique_ptr<HloModule> ExtractModule(HloInstruction* instruction,
+                                         int64 height) {
+  absl::flat_hash_set<const HloInstruction*> boundary;
+  if (height != -1) {
+    ComputeBoundary(instruction, height, &boundary);
+  }
+  ExtractionVisitor visitor(*instruction->GetModule(), &boundary);
+  CHECK(instruction->Accept(&visitor).ok());
+
+  // The first pass may leave unused parameter instructions. Do another
+  // extraction pass to remove unused parameters. This is done because
+  // HloComputation does not allow removing parameters after the computation has
+  // been built.
+  ExtractionVisitor cleanup_visitor(*visitor.module(), /*boundary=*/nullptr);
+  TF_CHECK_OK(visitor.module()->entry_computation()->root_instruction()->Accept(
+      &cleanup_visitor));
+
+  HloVerifier verifier(/*layout_sensitive=*/false,
+                       /*allow_mixed_precision=*/true);
+  TF_CHECK_OK(verifier.Run(cleanup_visitor.module()).status());
+  return cleanup_visitor.ConsumeModule();
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tools/hlo_extractor.h b/tensorflow/compiler/xla/tools/hlo_extractor.h
new file mode 100644
index 0000000000000000000000000000000000000000..bc13dc7e438fe0e64312746150af02df805e746a
--- /dev/null
+++ b/tensorflow/compiler/xla/tools/hlo_extractor.h
@@ -0,0 +1,36 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_TOOLS_HLO_EXTRACTOR_H_
+#define TENSORFLOW_COMPILER_XLA_TOOLS_HLO_EXTRACTOR_H_
+
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+
+namespace xla {
+
+// Creates a new HLO module rooted with an entry computation rooted at the given
+// instruction.
+//
+//  By default (height == -1), the new computation includes all transitive
+//  operands of `root`.  If you specify a different height, the new computation
+//  will include all instructions <= `height` hops away from `root`.
+//  Instructions at the boundary are replaced by parameters.
+std::unique_ptr<HloModule> ExtractModule(HloInstruction* instruction,
+                                         int64 height = -1);
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_TOOLS_HLO_EXTRACTOR_H_
diff --git a/tensorflow/compiler/xla/tools/hlo_extractor_test.cc b/tensorflow/compiler/xla/tools/hlo_extractor_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4beb099b330cadf4540944979f38681bae07103c
--- /dev/null
+++ b/tensorflow/compiler/xla/tools/hlo_extractor_test.cc
@@ -0,0 +1,139 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/tools/hlo_extractor.h"
+
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+
+namespace xla {
+namespace {
+
+namespace op = testing::opcode_matchers;
+
+using HloExtractorTest = HloTestBase;
+
+TEST_F(HloExtractorTest, ExtractTopLevel) {
+  const string& hlo_string = R"(
+HloModule test
+
+ENTRY %entry {
+  param.0 = f32[4]{0} parameter(0)
+  negate = f32[4]{0} negate(f32[4]{0} param.0)
+  ROOT exp = f32[4]{0} exponential(f32[4]{0} negate)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> hlo_module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  {
+    auto extracted_module =
+        ExtractModule(FindInstruction(hlo_module.get(), "exp"));
+    EXPECT_THAT(extracted_module->entry_computation()->root_instruction(),
+                op::Exp(op::Negate(op::Parameter(0))));
+  }
+
+  {
+    auto extracted_module =
+        ExtractModule(FindInstruction(hlo_module.get(), "exp"), /*height=*/0);
+    EXPECT_THAT(extracted_module->entry_computation()->root_instruction(),
+                op::Exp(op::Parameter(0)));
+  }
+
+  {
+    auto extracted_module = ExtractModule(
+        FindInstruction(hlo_module.get(), "negate"), /*height=*/0);
+    EXPECT_THAT(extracted_module->entry_computation()->root_instruction(),
+                op::Negate(op::Parameter(0)));
+  }
+}
+
+TEST_F(HloExtractorTest, ExtractDag) {
+  const string& hlo_string = R"(
+HloModule test
+
+ENTRY %entry {
+  param.0 = f32[4]{0} parameter(0)
+  tanh = f32[4]{0} tanh(f32[4]{0} param.0)
+  negate = f32[4]{0} negate(f32[4]{0} tanh)
+  exp = f32[4]{0} exponential(f32[4]{0} negate)
+  ROOT add = f32[4]{0} add(f32[4]{0} negate, f32[4]{0} exp)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> hlo_module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  {
+    auto extracted_module =
+        ExtractModule(FindInstruction(hlo_module.get(), "exp"));
+    EXPECT_THAT(extracted_module->entry_computation()->root_instruction(),
+                op::Exp(op::Negate(op::Tanh(op::Parameter(0)))));
+  }
+
+  {
+    auto extracted_module =
+        ExtractModule(FindInstruction(hlo_module.get(), "add"), /*height=*/0);
+    EXPECT_THAT(extracted_module->entry_computation()->root_instruction(),
+                op::Add(op::Parameter(0), op::Parameter(1)));
+  }
+  {
+    auto extracted_module =
+        ExtractModule(FindInstruction(hlo_module.get(), "add"), /*height=*/1);
+    EXPECT_THAT(extracted_module->entry_computation()->root_instruction(),
+                op::Add(op::Negate(op::Parameter(0)),
+                        op::Exp(op::Negate(op::Parameter(0)))));
+  }
+  {
+    auto extracted_module =
+        ExtractModule(FindInstruction(hlo_module.get(), "add"), /*height=*/2);
+    EXPECT_THAT(extracted_module->entry_computation()->root_instruction(),
+                op::Add(op::Negate(op::Tanh(op::Parameter(0))),
+                        op::Exp(op::Negate(op::Tanh(op::Parameter(0))))));
+  }
+}
+
+TEST_F(HloExtractorTest, ExtractWithConstant) {
+  const string& hlo_string = R"(
+HloModule test
+
+ENTRY %entry {
+  p = f32[4]{0} parameter(0)
+  tanh = f32[4]{0} tanh(p)
+  c = f32[4]{0} constant({1, 2, 3, 4})
+  ROOT add = f32[4]{0} add(tanh, c)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> hlo_module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  {
+    auto extracted_module =
+        ExtractModule(FindInstruction(hlo_module.get(), "add"), /*height=*/0);
+    EXPECT_THAT(extracted_module->entry_computation()->root_instruction(),
+                op::Add(op::Parameter(0), op::Parameter(1)));
+  }
+  {
+    auto extracted_module =
+        ExtractModule(FindInstruction(hlo_module.get(), "add"), /*height=*/1);
+    EXPECT_THAT(extracted_module->entry_computation()->root_instruction(),
+                op::Add(op::Tanh(op::Parameter(0)), op::Constant()));
+  }
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tools/interactive_graphviz.cc b/tensorflow/compiler/xla/tools/interactive_graphviz.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0c7c078b9b9d30427cb01b8930bd012046d852d3
--- /dev/null
+++ b/tensorflow/compiler/xla/tools/interactive_graphviz.cc
@@ -0,0 +1,676 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// A tool for interactively exploring graphviz dumps of HLO graphs.
+//
+// Input can be a binary HloSnapshot proto, a binary HloProto proto, or a
+// textual HLO string.
+//
+// Generated visualization is opened in a new default browser window using
+// /usr/bin/sensible-browser.
+
+#include <stdio.h>
+#include <unistd.h>
+
+#include "absl/algorithm/container.h"
+#include "absl/strings/match.h"
+#include "absl/strings/numbers.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
+#include "absl/strings/str_split.h"
+#include "tensorflow/compiler/xla/client/client_library.h"
+#include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/service/compiler.h"
+#include "tensorflow/compiler/xla/service/hlo.pb.h"
+#include "tensorflow/compiler/xla/service/hlo_runner.h"
+#include "tensorflow/compiler/xla/service/local_service.h"
+#include "tensorflow/compiler/xla/service/platform_util.h"
+#include "tensorflow/compiler/xla/tools/hlo_extractor.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/subprocess.h"
+#include "tensorflow/core/util/command_line_flags.h"
+#if defined(PLATFORM_GOOGLE)
+#include "util/readline/readline.h"
+#endif
+
+namespace xla {
+namespace tools {
+namespace {
+
+bool ReadLine(const char *prompt, string *line) {
+#if defined(PLATFORM_GOOGLE)
+  return util::ReadLine(prompt, line);
+#else
+  std::cout << prompt;
+  std::getline(std::cin, *line);
+  return std::cin.good();
+#endif
+}
+
+// Command-line opts to this tool.  See main() for descriptions of these
+// fields.
+struct Options {
+  string hlo_snapshot;
+  string hlo_proto;
+  string hlo_text;
+  string platform;
+  string browser;
+};
+
+const char* const kUsage = R"(
+This tool lets you load an XLA dump and then interactively explore its graphical
+representation.
+
+Most models are too large to visualize in their entirety using graphviz, but
+it's still useful to be able to look at the nodes "near" a particular node of
+interest.
+
+If you pass --platform, this tool will compile the HloModule for the given
+platform.  This means that if you acquired your proto from a binary running at a
+particular CL, the HLO graph it ran isn't necessarily the same as the one shown
+here, unless this program was built at the same CL (and our compiler is
+deterministic :).
+
+Be patient when starting this program if you give it a large input; it has to
+compile the whole thing.
+
+Usage:
+
+  interactive_graphviz -- \
+    --{hlo_snapshot,hlo_proto,hlo_text}=path/to/binary_proto
+    --platform={CUDA,CPU,...}
+)";
+
+// Unless an explicit width is specified, we will render a neighborhood of
+// kDefaultWidth nodes around the requested instruction.
+constexpr int64 kDefaultWidth = 2;
+
+// When printing all paths between two nodes, we print out only this many nodes
+// by default, truncating the graph if there are more nodes than this in the
+// all-paths set.
+constexpr int64 kDefaultMaxNumNodesInAllPaths = 100;
+
+using absl::EqualsIgnoreCase;
+
+// A global control for whether backend configuration display is enabled.
+bool show_backend_config = true;
+
+HloInstruction* FindInstruction(const HloModule& module, string node_name) {
+  if (absl::StartsWith(node_name, "%")) {
+    node_name.erase(node_name.begin());
+  }
+  for (const auto& computation : module.computations()) {
+    auto instrs = computation->instructions();
+    auto it = absl::c_find_if(instrs, [&](const HloInstruction* instr) {
+      // Try with and without "%" at the beginning of the node name.
+      return EqualsIgnoreCase(instr->name(), node_name) ||
+             EqualsIgnoreCase(instr->name(), absl::StrCat("%", node_name));
+    });
+    if (it != instrs.end()) {
+      return *it;
+    }
+  }
+  return nullptr;
+}
+
+HloComputation* FindComputation(const HloModule& module,
+                                const string& comp_name) {
+  for (auto* computation : module.computations()) {
+    if (EqualsIgnoreCase(computation->name(), comp_name)) {
+      return computation;
+    }
+  }
+  return nullptr;
+}
+
+// Print a help message describing the various available commands.
+void DoHelpCommand() {
+  std::cout << R"(Commands:
+  <instruction> [<width>] [/ <boundary_instruction>+]
+    Renders a neighborhood of <width> nodes around <instruction>, without going
+    beyond the optional boundary instructions.  If <width> is not provided, 
+    the default value is )"
+            << kDefaultWidth << R"(.
+  allpaths <instruction> <instruction> [<n>]
+    Renders a subset of all paths from one instruction to the other.  Either
+    order of nodes is accepted.  Shows the <n> nodes in the all-paths set on the
+    shortest paths; default is )"
+            << kDefaultMaxNumNodesInAllPaths << R"(.
+  <computation>
+    Renders all nodes in <computation>.
+  backend_config [on|off]
+    Controls whether backend operation configuration information is printed.
+  list [name|op_name|op_type] <pattern>
+    Lists all instructions whose name, metadata op_name, or metadata op_type
+    contains <pattern> as a substring.
+  list computations
+    Lists all computations in the module.
+  info <instruction>
+  info <computation>
+    Prints information about <instruction> or <computation>.
+  extract <instruction> <height>
+    Creates a new HLO module with <instruction> as entry computation root. If
+    <height> is specified, the new computation contains nodes up to <height>
+    nodes above the root.
+  help
+    Prints this usage information.)"
+            << std::endl;
+}
+
+// Turn metadata-printing on or off.
+void DoBackendConfigCommand(const std::vector<string>& tokens) {
+  if (tokens.size() == 2 && tokens[1] == "on") {
+    show_backend_config = true;
+  } else if (tokens.size() == 2 && tokens[1] == "off") {
+    show_backend_config = false;
+  } else if (tokens.size() != 1) {
+    std::cerr << "(Illegal backend_config value.  Use either 'on' or 'off'.)"
+              << std::endl;
+  }
+  std::cout << "Backend configuration display "
+            << (show_backend_config ? "ON" : "OFF") << std::endl;
+}
+
+// List all computations in the module.
+void DoListComputationsCommand(const HloModule& module,
+                               const std::vector<string>& tokens) {
+  if (tokens.size() > 2) {
+    std::cout << R"(Illegal syntax; "list computations" takes no arguments.)";
+    return;
+  }
+  if (module.entry_computation() != nullptr) {
+    std::cout << "Entry computation:" << std::endl;
+    std::cout << "  " << module.entry_computation()->name() << std::endl
+              << std::endl;
+  }
+  std::cout << "Subcomputations:" << std::endl;
+  std::vector<string> names;
+  for (const auto& computation : module.computations()) {
+    if (computation == module.entry_computation()) {
+      continue;
+    }
+    std::cout << "  " << computation->name() << std::endl;
+  }
+}
+
+// List all instructions matching a pattern.
+void DoListCommand(const HloModule& module, const std::vector<string>& tokens) {
+  string pattern = "";
+  string type = "name";
+  if (tokens.size() == 2) {
+    pattern = tokens[1];
+  } else if (tokens.size() == 3) {
+    type = tokens[1];
+    pattern = tokens[2];
+  } else {
+    std::cout << "Illegal list query syntax. Use "
+              << R"("list [name|op_name|op_type] pattern".)" << std::endl;
+    return;
+  }
+
+  std::cout << "Query results:" << std::endl;
+  for (const auto& computation : module.computations()) {
+    for (const auto& instr : computation->instructions()) {
+      if ((type == "name" && instr->name().find(pattern) != string::npos) ||
+          (type == "op_name" &&
+           instr->metadata().op_name().find(pattern) != string::npos) ||
+          (type == "op_type" &&
+           instr->metadata().op_type().find(pattern) != string::npos)) {
+        std::cout << "  " << instr->name();
+        std::cout << ", op_name '" << instr->metadata().op_name() << "'";
+        std::cout << ", op_type '" << instr->metadata().op_type() << "'";
+        std::cout << std::endl;
+      }
+    }
+  }
+}
+
+// Print info about an instruction or computation.
+void DoInfoCommand(const HloModule& module, const std::vector<string>& tokens) {
+  if (tokens.size() != 2) {
+    std::cerr << "Illegal info query syntax. Use "
+              << R"("info name".)";
+    return;
+  }
+  string node_name = tokens[1];
+
+  const HloInstruction* instr = FindInstruction(module, node_name);
+  const HloComputation* comp = FindComputation(module, node_name);
+  if (!instr && !comp) {
+    std::cerr << "Couldn't find HloInstruction or HloComputation named "
+              << node_name << std::endl;
+    return;
+  }
+
+  if (comp != nullptr) {
+    std::cout << "HloComputation " << comp->name() << std::endl;
+    if (comp->IsFusionComputation()) {
+      std::cout << "  Fusion instruction: " << comp->FusionInstruction()->name()
+                << std::endl;
+    }
+    std::cout << "  Parameters:" << std::endl;
+    for (const auto& param : comp->parameter_instructions()) {
+      std::cout << "    " << param->name() << " ("
+                << ShapeUtil::HumanStringWithLayout(param->shape()) << ")"
+                << std::endl;
+    }
+    HloInstruction* root = comp->root_instruction();
+    std::cout << "  Root instruction: " << root->name() << " ("
+              << ShapeUtil::HumanStringWithLayout(root->shape()) << ")"
+              << std::endl;
+
+    auto embedded_computations = comp->MakeEmbeddedComputationsList();
+    std::cout << "  " << embedded_computations.size() << " embedded computation"
+              << (embedded_computations.size() != 1 ? "s" : "")
+              << (!embedded_computations.empty() ? ":" : ".") << std::endl;
+    for (const HloComputation* c : embedded_computations) {
+      std::cout << "    " << c->name() << std::endl;
+    }
+
+    // Find which computations reference comp as an embedded computation.
+    std::vector<const HloComputation*> users;
+    for (const HloComputation* c : module.computations()) {
+      if (absl::c_linear_search(c->MakeEmbeddedComputationsList(), comp)) {
+        users.push_back(c);
+      }
+    }
+    std::cout << "  Used by " << users.size() << " computation"
+              << (users.size() != 1 ? "s" : "") << (!users.empty() ? ":" : ".");
+    for (const HloComputation* c : users) {
+      std::cout << "    " << c->name() << std::endl;
+    }
+  } else {
+    std::cout << "HloInstruction " << instr->name() << std::endl;
+    std::cout << "  Parent computation: " << instr->parent()->name()
+              << std::endl;
+    std::cout << "  Opcode: " << HloOpcodeString(instr->opcode()) << std::endl;
+    std::cout << "  Shape: " << ShapeUtil::HumanStringWithLayout(instr->shape())
+              << std::endl;
+    std::cout << "  Metadata:" << std::endl;
+    if (!instr->metadata().op_name().empty()) {
+      std::cout << "    Name: " << instr->metadata().op_name() << std::endl;
+    }
+    if (!instr->metadata().op_type().empty()) {
+      std::cout << "    Type: " << instr->metadata().op_type() << std::endl;
+    }
+    if (!instr->raw_backend_config_string().empty()) {
+      std::cout << "  Backend configuration: "
+                << instr->raw_backend_config_string() << std::endl;
+    }
+    if (instr->opcode() == HloOpcode::kFusion) {
+      std::cout << "  Fusion kind: " << xla::ToString(instr->fusion_kind())
+                << std::endl;
+      std::cout << "  Fusion computation: "
+                << instr->fused_instructions_computation()->name() << std::endl;
+      std::cout << "  Fused computation root: "
+                << instr->fused_expression_root()->name() << std::endl;
+    }
+    std::cout << "  Operands:" << std::endl;
+    for (HloInstruction* operand : instr->operands()) {
+      std::cout << "    " << operand->name() << " ("
+                << ShapeUtil::HumanStringWithLayout(operand->shape()) << ")"
+                << std::endl;
+    }
+    std::cout << "  Users:" << std::endl;
+    for (HloInstruction* user : instr->users()) {
+      std::cout << "    " << user->name() << std::endl;
+    }
+    if (instr->parent()->root_instruction() == instr) {
+      std::cout << "  Root instruction of " << instr->parent()->name()
+                << std::endl;
+    }
+  }
+}
+
+void DoExtractCommand(const HloModule& module,
+                      absl::Span<const string> tokens) {
+  if (tokens.size() > 3) {
+    std::cerr << R"(Illegal input.  Enter e.g. "extract %fusion.1 2")"
+              << std::endl;
+    return;
+  }
+
+  // Find the node with the given name.
+  string node_name = tokens[1];
+  HloInstruction* instr = FindInstruction(module, node_name);
+  if (!instr) {
+    std::cerr << "Couldn't find HloInstruction named " << node_name << "."
+              << std::endl;
+    return;
+  }
+
+  int64 height = -1;
+  if (tokens.size() == 3) {
+    if (!absl::SimpleAtoi(tokens[2], &height)) {
+      std::cerr << "Can't parse '" << tokens[2] << "' as an integer."
+                << std::endl;
+      return;
+    }
+  }
+
+  auto extracted_module = ExtractModule(instr, height);
+  std::cout << extracted_module->ToString(
+                   HloPrintOptions::ShortParsable().set_print_backend_config(
+                       show_backend_config))
+            << std::endl;
+}
+
+// Checks if there is a use-def path from `from` to `to`.
+bool ExistsPathFromTo(const HloInstruction* from, const HloInstruction* to) {
+  std::unordered_set<const HloInstruction*> visited;
+  std::vector<const HloInstruction*> to_visit = {from};
+  while (!to_visit.empty()) {
+    auto* n = to_visit.back();
+    if (n == to) {
+      return true;
+    }
+    to_visit.pop_back();
+    visited.insert(n);
+    for (auto* user : n->users()) {
+      if (!visited.count(user)) {
+        to_visit.push_back(user);
+      }
+    }
+  }
+  return false;
+}
+
+void DisplayGraphHandle(const Options &opts, const string& handle) {
+  std::cout << handle << std::endl;
+
+  // If it is a url, try to open it up in the user's browser too.
+  if (absl::StartsWithIgnoreCase(handle, "http://") ||
+      absl::StartsWithIgnoreCase(handle, "https://") ||
+      absl::StartsWithIgnoreCase(handle, "file://")) {
+    const char* browser_bin = opts.browser.empty() ? "/usr/bin/sensible-browser"
+                                                   : opts.browser.c_str();
+    tensorflow::SubProcess p;
+    p.SetProgram(browser_bin, {browser_bin, handle});
+    p.Start();
+  } else if (handle.empty()) {
+    std::cerr << "Unable to render graph, perhaps due to graphviz server "
+                 "timeout.  Run with --logtostderr to see."
+              << std::endl;
+  } else {
+    std::cerr << "\nExpected a URL, but got strange graph result (dumped "
+                 "above).  If this isn't what you expected, maybe file a bug?"
+              << std::endl;
+  }
+}
+
+void DoAllPathsCommand(const Options& opts, const HloModule& module,
+                       const std::vector<string>& tokens) {
+  if (tokens.size() > 4) {
+    std::cerr << R"(Illegal input.  Enter e.g. "allpaths %add.4 %subtract.2" or
+"allpaths add.4 subtract.2 42.)"
+              << std::endl;
+    return;
+  }
+
+  int64 max_nodes = kDefaultMaxNumNodesInAllPaths;
+  if (tokens.size() == 4 && !absl::SimpleAtoi(tokens[3], &max_nodes)) {
+    std::cerr << "Can't parse '" << tokens[3] << "' as an integer."
+              << std::endl;
+    return;
+  }
+
+  const HloInstruction* n1 = FindInstruction(module, tokens[1]);
+  if (!n1) {
+    std::cerr << "Couldn't find HloInstruction named " << tokens[1];
+    return;
+  }
+  const HloInstruction* n2 = FindInstruction(module, tokens[2]);
+  if (!n2) {
+    std::cerr << "Couldn't find HloInstruction named " << tokens[2];
+    return;
+  }
+
+  // Is there a path from n1 to n2, or vice versa?
+  const HloInstruction* from;
+  const HloInstruction* to;
+  if (ExistsPathFromTo(n1, n2)) {
+    from = n1;
+    to = n2;
+  } else if (ExistsPathFromTo(n2, n1)) {
+    from = n2;
+    to = n1;
+  } else {
+    std::cerr << "No path from/to " << tokens[1] << " to/from " << tokens[2];
+    return;
+  }
+  DisplayGraphHandle(opts, hlo_graph_dumper::DumpAllPathsFromTo(
+      *from, *to, max_nodes, /*show_backend_config=*/show_backend_config));
+}
+
+// Plot a given instruction neighborhood or computation with graphviz.
+void DoPlotCommand(const Options& opts, const HloModule& module,
+                   const std::vector<string>& tokens) {
+  string node_name = tokens[0];
+
+  // Find the node with the given name.
+  const HloInstruction* instr = FindInstruction(module, node_name);
+  const HloComputation* comp = FindComputation(module, node_name);
+  if (!instr && !comp) {
+    std::cerr << "Couldn't find HloInstruction or HloComputation named "
+              << node_name << "." << std::endl;
+    return;
+  }
+
+  uint64 graph_width = kDefaultWidth;
+  absl::flat_hash_set<const HloInstruction*> boundary;
+  if (tokens.size() >= 2) {
+    if (comp) {
+      std::cerr << "Can only use graph-size parameter with instructions, but "
+                << node_name << " is a computation." << std::endl;
+      return;
+    }
+
+    int bound_index = 1;
+    // Get the <width> if present.
+    if (absl::SimpleAtoi(tokens[bound_index], &graph_width)) {
+      bound_index++;
+    } else {
+      // <width> not found, need to reset graph_width.
+      graph_width = kDefaultWidth;
+    }
+    // Get the '/'.
+    if (bound_index < tokens.size()) {
+      // This token must be a '/'.
+      if (tokens[bound_index] != "/") {
+        std::cerr << "Expect a /, but get a '" << tokens[bound_index] << "'."
+                  << std::endl;
+        return;
+      }
+      bound_index++;
+    }
+    // Get the boundary nodes.
+    while (bound_index < tokens.size()) {
+      string bnode_name = tokens[bound_index];
+      const HloInstruction* binstr = FindInstruction(module, bnode_name);
+      if (!binstr) {
+        std::cerr << "Couldn't find HloInstruction named " << bnode_name << "."
+                  << std::endl;
+        return;
+      }
+      boundary.insert(binstr);
+      bound_index++;
+    }
+  }
+
+  // Generate the graph and print the resulting string, which should be a
+  // graphviz url.
+  if (comp) {
+    DisplayGraphHandle(opts, hlo_graph_dumper::DumpGraph(
+        *comp, "", comp->parent()->config().debug_options(), nullptr,
+        /*show_backend_config=*/show_backend_config));
+  } else {
+    DisplayGraphHandle(opts, hlo_graph_dumper::DumpNeighborhoodAround(
+                                 *instr, graph_width,
+                                 /*show_backend_config=*/show_backend_config,
+                                 /*boundary=*/boundary));
+  }
+}
+
+// Run the main event loop, reading user commands and processing them.
+void InteractiveDumpGraphs(const Options& opts, const HloModule& module) {
+  // This is an interactive tool, but some may use `extract` in non-tty
+  // environment anyway. Give them a clean hlo dump.
+  if (isatty(fileno(stdin))) {
+    std::cout << "\n\nLoaded module " << module.name() << "." << std::endl;
+    DoHelpCommand();
+  }
+  for (string line; ReadLine("\ncommand: ", &line);) {
+    if (line.empty()) {
+      std::cout << R"(Enter e.g. "fusion.1 3" or "add.8".)" << std::endl
+                << R"(Enter "help" for help; ^D, "quit", or "exit" to exit.)"
+                << std::endl;
+      continue;
+    }
+    std::vector<string> tokens = absl::StrSplit(line, ' ', absl::SkipEmpty());
+    if (tokens[0] == "quit" || tokens[0] == "exit") {
+      break;
+    } else if (tokens[0] == "help") {
+      DoHelpCommand();
+    } else if (tokens[0] == "backend_config") {
+      DoBackendConfigCommand(tokens);
+    } else if (tokens[0] == "list") {
+      if (tokens.size() > 1 && tokens[1] == "computations") {
+        DoListComputationsCommand(module, tokens);
+      } else {
+        DoListCommand(module, tokens);
+      }
+    } else if (tokens[0] == "info") {
+      DoInfoCommand(module, tokens);
+    } else if (tokens[0] == "extract") {
+      DoExtractCommand(module, tokens);
+    } else if (tokens[0] == "allpaths") {
+      DoAllPathsCommand(opts, module, tokens);
+    } else {
+      DoPlotCommand(opts, module, tokens);
+    }
+  }
+}
+
+void CheckFlags(const Options &opts) {
+  std::vector<string> nonempty_proto_flags;
+  if (!opts.hlo_proto.empty()) {
+    nonempty_proto_flags.push_back("--hlo_proto");
+  }
+  if (!opts.hlo_snapshot.empty()) {
+    nonempty_proto_flags.push_back("--hlo_snapshot");
+  }
+  if (!opts.hlo_text.empty()) {
+    nonempty_proto_flags.push_back("--hlo_text");
+  }
+  switch (nonempty_proto_flags.size()) {
+    case 1:
+      // We're good to go.
+      break;
+    case 0:
+      LOG(FATAL) << "Need one of the following options: "
+                 << absl::StrJoin(nonempty_proto_flags, ", ");
+    default:
+      LOG(FATAL) << "Can only specify one of "
+                 << absl::StrJoin(nonempty_proto_flags, ", ");
+  }
+}
+
+void RealMain(const Options& opts) {
+  if (!isatty(fileno(stdin))) {
+    LOG(ERROR) << "\n\n*****************************************\n"
+               << "This is an interactive tool, but stdin is not a tty.\n"
+               << "*****************************************\n\n";
+  }
+
+  CheckFlags(opts);
+
+  std::unique_ptr<HloModule> module;
+  if (!opts.hlo_snapshot.empty()) {
+    HloSnapshot snapshot;
+    TF_CHECK_OK(tensorflow::ReadBinaryProto(tensorflow::Env::Default(),
+                                            opts.hlo_snapshot, &snapshot))
+        << "Can't open, read, or parse HloSnapshot proto at "
+        << opts.hlo_snapshot;
+    auto config =
+        HloModule::CreateModuleConfigFromProto(snapshot.hlo().hlo_module(),
+                                               xla::GetDebugOptionsFromFlags())
+            .ValueOrDie();
+    module = HloModule::CreateFromProto(snapshot.hlo().hlo_module(), config)
+                 .ValueOrDie();
+  } else if (!opts.hlo_proto.empty()) {
+    module = HloRunner::ReadModuleFromBinaryProtoFile(
+                 opts.hlo_proto, xla::GetDebugOptionsFromFlags())
+                 .ValueOrDie();
+  } else if (!opts.hlo_text.empty()) {
+    module = HloRunner::ReadModuleFromHloTextFile(
+                 opts.hlo_text, xla::GetDebugOptionsFromFlags())
+                 .ValueOrDie();
+  }
+
+  // If a platform was specified, compile the module for that platform.
+  if (!opts.platform.empty()) {
+    se::Platform* platform =
+        PlatformUtil::GetPlatform(opts.platform).ValueOrDie();
+    LOG(INFO) << "Compiling module for " << platform->Name();
+
+    se::StreamExecutor* executor =
+        platform->ExecutorForDevice(/*ordinal=*/0).ValueOrDie();
+    auto compiler = Compiler::GetForPlatform(platform).ValueOrDie();
+    module = compiler
+                 ->RunHloPasses(std::move(module), executor,
+                                /*device_allocator=*/nullptr)
+                 .ValueOrDie();
+    auto executable = compiler
+                          ->RunBackend(std::move(module), executor,
+                                       /*device_allocator=*/nullptr)
+                          .ValueOrDie();
+    InteractiveDumpGraphs(opts, executable->module());
+  } else {
+    InteractiveDumpGraphs(opts, *module);
+  }
+}
+
+}  // namespace
+}  // namespace tools
+}  // namespace xla
+
+int main(int argc, char** argv) {
+  xla::tools::Options opts;
+  opts.browser = "/usr/bin/sensible-browser";
+  bool need_help = false;
+  const std::vector<tensorflow::Flag> flag_list = {
+      tensorflow::Flag("hlo_snapshot", &opts.hlo_snapshot,
+                       "HloSnapshot proto to interactively dump to graphviz"),
+      tensorflow::Flag("hlo_proto", &opts.hlo_proto,
+                       "XLA hlo proto to interactively dump to graphviz"),
+      tensorflow::Flag("hlo_text", &opts.hlo_text,
+                       "XLA hlo proto to interactively dump to graphviz"),
+      tensorflow::Flag("platform", &opts.platform,
+                       "Platform to compile for: CPU, CUDA, etc"),
+      tensorflow::Flag("browser", &opts.browser,
+                       "Path to web browser used to display produced graphs."),
+      tensorflow::Flag("help", &need_help,
+                       "Prints this help message"),
+  };
+  xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
+  bool parse_ok = tensorflow::Flags::Parse(&argc, argv, flag_list);
+  tensorflow::port::InitMain(argv[0], &argc, &argv);
+  if (argc != 1 || !parse_ok || need_help) {
+    LOG(QFATAL) << usage;
+  }
+  xla::tools::RealMain(opts);
+  return 0;
+}
diff --git a/tensorflow/compiler/xla/tools/interactive_graphviz_test.sh b/tensorflow/compiler/xla/tools/interactive_graphviz_test.sh
new file mode 100755
index 0000000000000000000000000000000000000000..b3e43aa7da062547fb5f187b885e997fc44bbb65
--- /dev/null
+++ b/tensorflow/compiler/xla/tools/interactive_graphviz_test.sh
@@ -0,0 +1,19 @@
+#! /bin/bash
+# /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================*/
+
+# This is a placeholder for a compile-only test for intractive_graphviz tool.
+
+exit 0
diff --git a/tensorflow/compiler/xla/tools/replay_computation.cc b/tensorflow/compiler/xla/tools/replay_computation.cc
index ff2c3399928c0e6339304323c4f93e212933a340..d66561315b4ad7a5e3f1f7b1bc1e557b71da6705 100644
--- a/tensorflow/compiler/xla/tools/replay_computation.cc
+++ b/tensorflow/compiler/xla/tools/replay_computation.cc
@@ -51,6 +51,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/execution_options_util.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/gpu/infeed_manager.h"
+#include "tensorflow/compiler/xla/service/gpu/outfeed_manager.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -73,14 +74,24 @@ namespace {
 // fields.
 struct Options {
   string fake_infeed_shape;
-  bool generate_fake_infeed = false;
+  string fake_outfeed_shape;
+
+  // generate_fake_infeed == true is a safe default: If the model has 0 or 1
+  // infeeds, then it will work like normal.  If the model has more than one
+  // infeed, it will be an error, but that wouldn't have worked anyway if you
+  // hadn't passed generate_fake_infeed.
+  //
+  // Same for generate_fake_outfeed.
+  bool generate_fake_infeed = true;
+  bool generate_fake_outfeed = true;
+
   bool use_fake_data = false;
   bool print_result = true;
   int num_runs = 1;
 };
 
-std::unique_ptr<LocalExecutable> CompileExecutable(const HloSnapshot& module,
-                                                   LocalClient* client) {
+StatusOr<std::unique_ptr<LocalExecutable>> CompileExecutable(
+    const HloSnapshot& module, LocalClient* client) {
   XlaComputation computation(module.hlo().hlo_module());
   std::vector<Shape> argument_layouts;
   argument_layouts.reserve(
@@ -91,9 +102,86 @@ std::unique_ptr<LocalExecutable> CompileExecutable(const HloSnapshot& module,
     argument_layouts.push_back(Shape(param));
     argument_layout_ptrs.push_back(&argument_layouts.back());
   }
-  return client
-      ->Compile(computation, argument_layout_ptrs, ExecutableBuildOptions())
-      .ValueOrDie();
+  ExecutableBuildOptions exec_build_options;
+  *exec_build_options.mutable_debug_options() = GetDebugOptionsFromFlags();
+  return client->Compile(computation, argument_layout_ptrs, exec_build_options);
+}
+
+absl::optional<Shape> GetXfeedShape(bool is_infeed,
+                                    const HloModuleProto& module,
+                                    const Options& opts) {
+  std::vector<HloInstructionProto> xfeed_instrs;
+  for (const auto& comp : module.computations()) {
+    for (const auto& instruction : comp.instructions()) {
+      if (instruction.opcode() == HloOpcodeString(is_infeed
+                                                      ? HloOpcode::kInfeed
+                                                      : HloOpcode::kOutfeed)) {
+        xfeed_instrs.push_back(instruction);
+      }
+    }
+  }
+
+  auto log_xfeed_instrs = [&] {
+    for (const auto& infeed : xfeed_instrs) {
+      LOG(ERROR) << "  " << ShapeUtil::HumanString(Shape(infeed.shape())) << " "
+                 << infeed.name();
+    }
+  };
+
+  auto find_instruction_from_id_or_die = [&](int64 id) {
+    for (const auto& comp : module.computations()) {
+      for (const auto& instruction : comp.instructions()) {
+        if (instruction.id() == id) {
+          return instruction;
+        }
+      }
+    }
+    LOG(FATAL) << "No instruction with id " << id;
+  };
+
+  absl::optional<Shape> xfeed_shape;
+  string xfeed_name = is_infeed ? "infeed" : "outfeed";
+  string fake_xfeed_shape =
+      is_infeed ? opts.fake_infeed_shape : opts.fake_outfeed_shape;
+  bool generate_fake_xfeed =
+      is_infeed ? opts.generate_fake_infeed : opts.generate_fake_outfeed;
+  if (!fake_xfeed_shape.empty()) {
+    xfeed_shape = std::move(ParseShape(fake_xfeed_shape)).ValueOrDie();
+  } else if (generate_fake_xfeed) {
+    CHECK_LT(xfeed_instrs.size(), 2)
+        << "--generate_fake_" << xfeed_name
+        << " only works if the model has 0 or 1 " << xfeed_name << " ops.";
+    if (xfeed_instrs.empty()) {
+      LOG(INFO) << "Not generating fake " << xfeed_name
+                << " shape; model has no " << xfeed_name << "s.";
+    } else if (xfeed_instrs.size() == 1) {
+      // kInfeed instructions should have a shape (buffer, token).  kOutfeed
+      // instructions should have operand 0 of shape `buffer`. We want to xfeed
+      // just `buffer`.
+      xfeed_shape = is_infeed
+                        ? Shape(xfeed_instrs.front().shape()).tuple_shapes(0)
+                        : Shape(find_instruction_from_id_or_die(
+                                    xfeed_instrs.front().operand_ids(0))
+                                    .shape());
+      LOG(INFO) << "Generating fake " << xfeed_name << " with inferred shape: "
+                << ShapeUtil::HumanString(*xfeed_shape);
+    } else {
+      LOG(ERROR) << "--generate_fake_" << xfeed_name
+                 << " only works if the model has 0 or 1 " << xfeed_name
+                 << " ops, but this model has " << xfeed_instrs.size()
+                 << " of them:";
+      log_xfeed_instrs();
+      LOG(FATAL) << "Can't run model with --generate_fake_infeed.";
+    }
+  } else if (!xfeed_instrs.empty()) {
+    LOG(ERROR) << "Model contains " << xfeed_instrs.size() << " " << xfeed_name
+               << " instruction(s), but neither --generate_fake_" << xfeed_name
+               << " nor --fake_" << xfeed_name
+               << "_shape was specified.  Execution will likely hang.";
+    log_xfeed_instrs();
+  }
+
+  return xfeed_shape;
 }
 
 // Invokes the given computation passing arbitrary data for every (unbound)
@@ -118,7 +206,12 @@ StatusOr<Literal> ReplayComputation(const HloSnapshot& module,
   std::vector<std::unique_ptr<GlobalData>> global_data_arguments;
   std::vector<const ShapedBuffer*> argument_ptrs;
   if (opts.use_fake_data) {
-    global_data_arguments = MakeFakeArgumentsOrDie(computation, client);
+    // Run fake computations with debug options ignoring XLA_FLAGS.  Users very
+    // likely want XLA_FLAGS only to apply to the "real" computation being run,
+    // not to the fake computations we use for generating arguments.
+    auto debug_opts = DefaultDebugOptionsIgnoringFlags();
+    global_data_arguments =
+        MakeFakeArgumentsOrDie(computation, client, &debug_opts);
     for (const auto& data : global_data_arguments) {
       argument_ptrs.push_back(
           client->GlobalDataToShapedBuffer(data->handle(), /*device_ordinal=*/0)
@@ -137,55 +230,37 @@ StatusOr<Literal> ReplayComputation(const HloSnapshot& module,
     }
   }
 
-  bool provide_infeed = false;
-  Shape infeed_shape;
-  if (!opts.fake_infeed_shape.empty()) {
-    StatusOr<Shape> shape_status =
-        ShapeUtil::ParseShapeString(opts.fake_infeed_shape);
-    TF_CHECK_OK(shape_status.status());
-    infeed_shape = std::move(shape_status).ValueOrDie();
-    provide_infeed = true;
-  } else if (opts.generate_fake_infeed) {
-    for (const auto& comp : computation.proto().computations()) {
-      for (const auto& instruction : comp.instructions()) {
-        if (instruction.opcode() == HloOpcodeString(HloOpcode::kInfeed)) {
-          CHECK(!provide_infeed)
-              << "--generate_fake_infeed only works if the model has 0 or 1 "
-                 "infeed ops, but this one has >= 2.";
-          provide_infeed = true;
-          infeed_shape = Shape(instruction.shape());
-          LOG(INFO) << "Generating fake infeed shape for inferred shape: "
-                    << ShapeUtil::HumanString(infeed_shape);
-        }
-      }
-    }
-  }
-  // We only instantiate the thread pool if the user has requested that a
-  // concurrent infeed occur via the fake_infeed_shape, or when
-  // --generate_fake_infeed is passed and there exists an infeed operation in
-  // the HloSnapshot.
-  absl::optional<tensorflow::thread::ThreadPool> pool;
-  Literal data;
-  if (provide_infeed) {
-    data = std::move(MakeFakeLiteral(infeed_shape)).ValueOrDie();
+  if (absl::optional<Shape> infeed_shape = GetXfeedShape(
+          /*is_infeed=*/true, computation.proto(), opts)) {
+    auto infeed_data = std::make_shared<Literal>(
+        std::move(MakeFakeLiteral(*infeed_shape)).ValueOrDie());
+    xla::gpu::GetOrCreateInfeedManager()
+        ->RegisterBeforeGetNextDestinationCallback([infeed_data, client] {
+          TF_CHECK_OK(client->TransferToInfeed(*infeed_data));
+        });
   }
-  auto transfer_infeed = [&data, client]() {
-    TF_CHECK_OK(client->TransferToInfeed(data));
-  };
-  if (provide_infeed) {
-    pool.emplace(tensorflow::Env::Default(), "infeed",
-                 /*num_threads=*/1);
-    pool->Schedule([transfer_infeed]() {
-      // There may be several infeed buffers needed, however we don't know how
-      // many. If we proactively transfer too many infeed buffers, we may run
-      // out of memory. If we transfer too few infeed buffers, the program will
-      // hang. Therefore, we register a callback that is called when the infeed
-      // becomes empty, and in this callback we will transfer another fake
-      // infeed.
-      auto infeed_manager = xla::gpu::GetOrCreateInfeedManager();
-      infeed_manager->RegisterOnEmptyCallback(transfer_infeed);
-      transfer_infeed();
-    });
+
+  absl::optional<tensorflow::thread::ThreadPool> outfeed_thread_pool;
+  if (absl::optional<Shape> outfeed_shape = GetXfeedShape(
+          /*is_infeed=*/false, computation.proto(), opts)) {
+    // For each an outfeed that runs, enqueue a task that will consume it.  We
+    // need a thread pool because the act of running an outfeed blocks on there
+    // being a destination available, and the act of making a destination
+    // available blocks on there being outfeed data available.
+    outfeed_thread_pool.emplace(tensorflow::Env::Default(), "infeed",
+                                /*num_threads=*/1);
+    auto consume_outfeed = [client, outfeed_shape] {
+      TF_CHECK_OK(
+          client->TransferFromOutfeedLocal(*outfeed_shape, /*device_ordinal=*/0)
+              .status());
+      VLOG(1) << "Received outfeed data of shape "
+              << ShapeUtil::HumanStringWithLayout(*outfeed_shape);
+    };
+    xla::gpu::GetOrCreateOutfeedManager()
+        ->RegisterBeforeGetNextDestinationCallback(
+            [consume_outfeed, &outfeed_thread_pool] {
+              outfeed_thread_pool->Schedule(consume_outfeed);
+            });
   }
 
   // Do not attempt to run the executable if num_runs is less than 1.
@@ -254,7 +329,10 @@ StatusOr<HloSnapshot> ParseInputFile(const string& filename,
   fprintf(stderr, "%s: is not HloProto. Trying HLO text.\n", filename.c_str());
   string contents;
   TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(env, filename, &contents));
-  StatusOr<std::unique_ptr<HloModule>> module = ParseHloString(contents);
+  HloModuleConfig config;
+  config.set_debug_options(GetDebugOptionsFromFlags());
+  StatusOr<std::unique_ptr<HloModule>> module =
+      ParseHloString(contents, config);
   if (module.ok()) {
     *snapshot.mutable_hlo()->mutable_hlo_module() =
         module.ValueOrDie()->ToProto();
@@ -282,7 +360,7 @@ int RealMain(absl::Span<char* const> args, const Options& opts) {
 
   // Compile all the modules in parallel.
   LOG(INFO) << "Compiling " << snapshots.size() << " modules in parallel.";
-  std::vector<std::unique_ptr<LocalExecutable>> executables;
+  std::vector<StatusOr<std::unique_ptr<LocalExecutable>>> executables;
   {
     // ThreadPool CHECK-fails if we give it 0 threads.
     tensorflow::thread::ThreadPool thread_pool(
@@ -299,9 +377,16 @@ int RealMain(absl::Span<char* const> args, const Options& opts) {
   LOG(INFO) << "Done compiling; now running the modules.";
 
   for (int64 i = 0; i < executables.size(); ++i) {
-    LocalExecutable* executable = executables[i].get();
+    if (!executables[i].ok()) {
+      LOG(ERROR) << "Compilation failed: " << executables[i].status();
+      exit_status = EXIT_FAILURE;
+      continue;
+    }
+    LocalExecutable* executable = executables[i].ValueOrDie().get();
+    LOG(ERROR) << "Running iteration " << i;
     StatusOr<Literal> result_status =
         ReplayComputation(snapshots[i], executable, client, opts);
+    LOG(ERROR) << "iteration complete.";
     if (!result_status.ok()) {
       fprintf(stderr, "%s: error: %s\n", args[i],
               result_status.status().ToString().c_str());
@@ -346,9 +431,14 @@ int main(int argc, char** argv) {
                        "Number of times to run each computation"),
       tensorflow::Flag("fake_infeed_shape", &opts.fake_infeed_shape,
                        "Shape of fake data to construct for (infinite) infeed"),
+      tensorflow::Flag("fake_outfeed_shape", &opts.fake_outfeed_shape,
+                       "Shape of fake data to outfeed from computation"),
       tensorflow::Flag("generate_fake_infeed", &opts.generate_fake_infeed,
-                       "Whether a fake infeed shape should be generated "
-                       "derived from the computation"),
+                       "Whether a fake infeed shape should be derived "
+                       "from the computation"),
+      tensorflow::Flag("generate_fake_outfeed", &opts.generate_fake_outfeed,
+                       "Whether a fake outfeed shape should be derived "
+                       "from the computation"),
   };
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   bool parse_ok = tensorflow::Flags::Parse(&argc, argv, flag_list);
diff --git a/tensorflow/compiler/xla/tools/show_signature.cc b/tensorflow/compiler/xla/tools/show_signature.cc
index cdf306dfd1027cf6022c5d8ae844b4308f580e8d..b80d0db8d812380d8144713109d1c05168713c77 100644
--- a/tensorflow/compiler/xla/tools/show_signature.cc
+++ b/tensorflow/compiler/xla/tools/show_signature.cc
@@ -37,7 +37,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/compiler/xla/types.h b/tensorflow/compiler/xla/types.h
index b645acb700b0f168112a40c9c72b4669435f717d..daf678f69017b9eb86cbc464a1f33b434021901d 100644
--- a/tensorflow/compiler/xla/types.h
+++ b/tensorflow/compiler/xla/types.h
@@ -41,6 +41,7 @@ using ::tensorflow::uint32;
 using ::tensorflow::uint64;
 
 using complex64 = std::complex<float>;
+using complex128 = std::complex<double>;
 
 using ::Eigen::half;
 
diff --git a/tensorflow/compiler/xla/util.cc b/tensorflow/compiler/xla/util.cc
index 68cab7387cf1576072f96878b50f07def6862d8b..bb8bbf57c4252b16836553334901a3c896a17f39 100644
--- a/tensorflow/compiler/xla/util.cc
+++ b/tensorflow/compiler/xla/util.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <stdarg.h>
 #include <numeric>
 
+#include "absl/container/inlined_vector.h"
 #include "absl/strings/match.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
@@ -80,13 +81,9 @@ bool IsPermutation(absl::Span<const int64> permutation, int64 rank) {
   if (rank != permutation.size()) {
     return false;
   }
-  std::vector<int64> output(permutation.size(), -1);
-  for (auto index : permutation) {
-    CHECK_GE(index, 0);
-    CHECK_LT(index, rank);
-    output[index] = 0;
-  }
-  return std::find(output.begin(), output.end(), -1) == output.end();
+  absl::InlinedVector<int64, 8> trivial_permutation(rank);
+  absl::c_iota(trivial_permutation, 0);
+  return absl::c_is_permutation(permutation, trivial_permutation);
 }
 
 std::vector<int64> InversePermutation(
diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h
index 6722641e9d2c177440361e6f0d1f6c0804eb7cda..f2fd17dc99455a921bf875aad2a3661b4d456823 100644
--- a/tensorflow/compiler/xla/util.h
+++ b/tensorflow/compiler/xla/util.h
@@ -324,8 +324,7 @@ bool IsIdentityPermutation(absl::Span<const int64> permutation);
 
 template <typename Container>
 int64 PositionInContainer(const Container& container, int64 value) {
-  return std::distance(container.begin(),
-                       std::find(container.begin(), container.end(), value));
+  return std::distance(container.begin(), absl::c_find(container, value));
 }
 
 // Formats the container as a comma-separated string. StrAppend must support
diff --git a/tensorflow/compiler/xla/window_util.cc b/tensorflow/compiler/xla/window_util.cc
index 51c73b3d17e4c32d9a8a14d3055ab56f02922af3..e001cc35f9fcea2783b3952e825838af6bbece72 100644
--- a/tensorflow/compiler/xla/window_util.cc
+++ b/tensorflow/compiler/xla/window_util.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <vector>
 
+#include "absl/algorithm/container.h"
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -137,25 +138,23 @@ bool HasPadding(const Window& window) {
 }
 
 bool HasSymmetricPadding(const Window& window) {
-  return std::all_of(window.dimensions().begin(), window.dimensions().end(),
-                     [](const WindowDimension& dim) {
-                       return dim.padding_low() == dim.padding_high();
-                     });
+  return absl::c_all_of(window.dimensions(), [](const WindowDimension& dim) {
+    return dim.padding_low() == dim.padding_high();
+  });
 }
 
 bool HasSymmetricPadding(const PaddingConfig& padding_config) {
-  return std::all_of(padding_config.dimensions().begin(),
-                     padding_config.dimensions().end(),
-                     [](const PaddingConfig::PaddingConfigDimension& dim) {
-                       return dim.edge_padding_low() == dim.edge_padding_high();
-                     });
+  return absl::c_all_of(padding_config.dimensions(),
+                        [](const PaddingConfig::PaddingConfigDimension& dim) {
+                          return dim.edge_padding_low() ==
+                                 dim.edge_padding_high();
+                        });
 }
 
 bool HasNegativePadding(const Window& window) {
-  return std::any_of(window.dimensions().begin(), window.dimensions().end(),
-                     [](const WindowDimension& dim) {
-                       return dim.padding_low() < 0 || dim.padding_high() < 0;
-                     });
+  return absl::c_any_of(window.dimensions(), [](const WindowDimension& dim) {
+    return dim.padding_low() < 0 || dim.padding_high() < 0;
+  });
 }
 
 bool HasBaseDilation(const Window& window) {
@@ -190,10 +189,9 @@ bool AllOrNoneReversed(const Window& window) {
     return true;
   }
   bool reversed = window.dimensions()[0].window_reversal();
-  return std::all_of(window.dimensions().begin(), window.dimensions().end(),
-                     [&](const WindowDimension& dim) {
-                       return dim.window_reversal() == reversed;
-                     });
+  return absl::c_all_of(window.dimensions(), [&](const WindowDimension& dim) {
+    return dim.window_reversal() == reversed;
+  });
 }
 
 bool HasDilation(const Window& window) {
diff --git a/tensorflow/compiler/xla/xla.bzl b/tensorflow/compiler/xla/xla.bzl
index 1439f1bcc5cec39203a7cb4b1f8604e7349382c6..cda2d7c7c6b2403868f6d01a485753fa29a8d95f 100644
--- a/tensorflow/compiler/xla/xla.bzl
+++ b/tensorflow/compiler/xla/xla.bzl
@@ -1,30 +1,47 @@
 """Wrapper around cc_proto_library used inside the XLA codebase."""
 
-load("//tensorflow/core:platform/default/build_config.bzl",
-     "cc_proto_library")
-load("//tensorflow/core:platform/default/build_config_root.bzl",
-     "if_static")
+load(
+    "//tensorflow/core:platform/default/build_config.bzl",
+    "cc_proto_library",
+)
+load(
+    "//tensorflow/core:platform/default/build_config_root.bzl",
+    "if_static",
+)
+load("//tensorflow:tensorflow.bzl", "if_cuda_is_configured")
 
 # xla_proto_library() is a convenience wrapper around cc_proto_library.
-def xla_proto_library(name, srcs=[], deps=[], visibility=None, testonly=0, **kwargs):
-  if kwargs.get('use_grpc_plugin'):
-    kwargs['use_grpc_namespace'] = True
-  cc_proto_library(name=name,
-                   srcs=srcs,
-                   deps=deps,
-                   cc_libs = if_static(
-                       ["@protobuf_archive//:protobuf"],
-                       otherwise=["@protobuf_archive//:protobuf_headers"],
-                   ),
-                   protoc="@protobuf_archive//:protoc",
-                   testonly=testonly,
-                   visibility=visibility,
-                   **kwargs)
+def xla_proto_library(name, srcs = [], deps = [], visibility = None, testonly = 0, **kwargs):
+    if kwargs.get("use_grpc_plugin"):
+        kwargs["use_grpc_namespace"] = True
+    cc_proto_library(
+        name = name,
+        srcs = srcs,
+        # Append well-known proto dep. As far as I know this is the only way
+        # for xla_proto_library to access google.protobuf.{Any,Duration,...}.
+        deps = deps + ["@protobuf_archive//:cc_wkt_protos"],
+        cc_libs = if_static(
+            ["@protobuf_archive//:protobuf"],
+            otherwise = ["@protobuf_archive//:protobuf_headers"],
+        ),
+        protoc = "@protobuf_archive//:protoc",
+        testonly = testonly,
+        visibility = visibility,
+        **kwargs
+    )
 
-def xla_py_grpc_library(**kwargs):
-  # Note: we don't currently define any special targets for Python GRPC in OSS.
-  _ignore = kwargs
-  pass
+def xla_py_proto_library(**kwargs):
+    # Note: we don't currently define a proto library target for Python in OSS.
+    _ignore = kwargs
+    pass
 
+def xla_py_grpc_library(**kwargs):
+    # Note: we don't currently define any special targets for Python GRPC in OSS.
+    _ignore = kwargs
+    pass
 
 ORC_JIT_MEMORY_MAPPER_TARGETS = []
+
+# We link the GPU plugin into the XLA Python extension if CUDA is enabled.
+def xla_python_default_plugins():
+    return if_cuda_is_configured(["//tensorflow/compiler/xla/service:gpu_plugin"])
diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto
index a37eac7fe441d91aa71e1b6fd7b84099fee2215b..925fcbf88c1e8dd81ab1339d292e05eae52e0d13 100644
--- a/tensorflow/compiler/xla/xla.proto
+++ b/tensorflow/compiler/xla/xla.proto
@@ -15,11 +15,11 @@ limitations under the License.
 
 syntax = "proto3";
 
-import "tensorflow/compiler/xla/xla_data.proto";
-import "tensorflow/compiler/xla/service/hlo.proto";
-
 package xla;
 
+import "tensorflow/compiler/xla/service/hlo.proto";
+import "tensorflow/compiler/xla/xla_data.proto";
+
 // Options for the HLO insert-reduce-precision-operations pass.
 message HloReducePrecisionOptions {
   // Where and when the reduce-precision operations will be added.
@@ -72,8 +72,7 @@ message DebugOptions {
   // Path to dump HLO graphs to.
   string xla_hlo_graph_path = 4;
 
-  // Dump HLO graphs as TensorFlow GraphDefs.
-  bool xla_hlo_dump_as_graphdef = 5;
+  reserved 5;  // Was xla_hlo_dump_as_graphdef
 
   // HLO modules matching this regex will be dumped to LOG(INFO). Set to ".*" to
   // dump *all* HLO modules.
@@ -100,6 +99,14 @@ message DebugOptions {
   // names as specified by the HloPassInterface::name() method.
   repeated string xla_disable_hlo_passes = 30;
 
+  // Disables all HLO passes.  Notes that some passes are necessary for
+  // correctness and the invariants that must be satisfied by "fully optimized"
+  // HLO are different for different devices and may change over time.  The only
+  // "guarantee", such as it is, is that if you compile XLA and dump the
+  // optimized HLO for some graph, you should be able to run it again on the
+  // same device with the same build of XLA.
+  bool xla_disable_all_hlo_passes = 104;
+
   // Numerical optimization level for the XLA compiler backend; the specific
   // interpretation of this value is left to the backends.
   int32 xla_backend_optimization_level = 31;
@@ -163,9 +170,7 @@ message DebugOptions {
   // HLO graph.
   bool xla_hlo_graph_sharding_color = 92;
 
-  // Prefix the name scopes of the TF graph exports with "devX" device
-  // assignments, if available.
-  bool xla_hlo_tfgraph_device_scopes = 93;
+  reserved 93;  // Was xla_hlo_tfgraph_device_scopes
 
   // If true, the GPU backend is free to use cudnn for HLO batch normalization
   // ops.
@@ -216,6 +221,34 @@ message DebugOptions {
   // If set to true XLA:GPU invokes `ptxas` with -O0 (default is -O3).
   bool xla_gpu_disable_ptxas_optimizations = 103;
 
+  // Dump HLO graphs as an HTML (DOT -> SVG inlined in HTML)
+  bool xla_hlo_dump_as_html = 105;
+
+  // Enable fast math with eigen in the HLO evaluator.
+  bool xla_hlo_evaluator_use_fast_path = 106;
+
+  // Temporary option to allow support for both the R1 and the scalar index
+  // versions of DynamicSlice and DynamicUpdateSlice. Only used for testing.
+  bool xla_allow_scalar_index_dynamic_ops = 107;
+
+  enum StepMarkerLocation {
+    // Generate step mark at each iteration of top level while loop, which
+    // is assumed to be a training loop. This is the default.
+    STEP_MARK_AT_ENTRY = 0;
+    // Generate step mark at program entry. This handles the case where each
+    // step are done by one or multiple programs execution. Only the first
+    // program will be tagged for generating step mark at program entry.
+    STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP = 1;
+    // No step mark.
+    STEP_MARK_NONE = 2;
+  }
+  // Option to emit a target-specific marker to indicate the start of a training
+  // step. The location of the marker (if any) is determined by the option
+  // value.
+  StepMarkerLocation xla_step_marker_location = 108;
+
+  // Next id: 109
+
   // Extra options to pass to the compilation backend (e.g. LLVM); specific
   // interpretation of these values is left to the backend.
   map<string, string> xla_backend_extra_options = 500;
@@ -245,6 +278,10 @@ message ExecutionOptions {
   // computation on. The computation will be partitioned across these devices.
   // If not provided, the default device will be chosen.
   repeated DeviceHandle device_handles = 5;
+
+  // Number of replicas of the computation to run. If zero, uses the default
+  // number of replicas for the XLA service.
+  int32 num_replicas = 6;
 }
 
 message GetDeviceHandlesRequest {
@@ -282,8 +319,7 @@ message TransferToInfeedRequest {
   DeviceHandle device_handle = 3;
 }
 
-message TransferToInfeedResponse {
-}
+message TransferToInfeedResponse {}
 
 message TransferFromOutfeedRequest {
   // This optional field directs the service to return the literal in this
@@ -302,8 +338,7 @@ message ResetDeviceRequest {
   DeviceHandle device_handle = 1;
 }
 
-message ResetDeviceResponse {
-}
+message ResetDeviceResponse {}
 
 message ComputationGraphStatsRequest {
   HloModuleProto computation = 1;
@@ -326,8 +361,7 @@ message UnregisterRequest {
   repeated GlobalDataHandle data = 1;
 }
 
-message UnregisterResponse {
-}
+message UnregisterResponse {}
 
 message CompileRequest {
   // The graph to be compiled.
@@ -389,7 +423,7 @@ message WaitForExecutionResponse {
 
 message ComputeConstantGraphRequest {
   HloModuleProto computation = 1;
-  Layout output_layout = 2;
+  LayoutProto output_layout = 2;
 }
 
 message ComputeConstantResponse {
diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto
index 85ec83437a10d973687a7fb84285c2e2541a53c7..226299a7186ef0acb41f6d01fdeffeee06f13d4d 100644
--- a/tensorflow/compiler/xla/xla_data.proto
+++ b/tensorflow/compiler/xla/xla_data.proto
@@ -56,6 +56,7 @@ enum PrimitiveType {
 
   // Complex values of fixed width.
   C64 = 15;  // Paired F32 (real, imag), as in std::complex<float>.
+  C128 = 18;  // Paired F64 (real, imag), as in std::complex<double>.
 
   // A tuple is a polymorphic sequence; e.g. a shape that holds different
   // sub-shapes. They are used for things like returning multiple values from a
@@ -75,7 +76,7 @@ enum PrimitiveType {
   // primitive type will have empty dimensions and tuple_shapes fields.
   TOKEN = 17;
 
-  // Next = 18
+  // Next = 19
 }
 
 // Describes the padding configuration for Pad operation. The padding amount on
@@ -100,6 +101,8 @@ message PaddingConfig {
 
 // A format specifies the method used by a layout to store an array in memory.
 enum Format {
+  // TODO(b/120869032): Rename this to FORMAT_NONE or something else which
+  // better corresponds to its meaning.
   INVALID_FORMAT = 0;
   // The default layout, with exactly one storage location per element.
   DENSE = 1;
@@ -109,8 +112,9 @@ enum Format {
 }
 
 // Describes a tile used in tiling-based layout. Refer to
-// g3doc/layout_with_tiling.md for details about tiling-based layout.
-message Tile {
+// g3doc/third_party/tensorflow/compiler/xla/g3doc/layout_with_tiling.md for
+// details about tiling-based layout.
+message TileProto {
   // Number of elements in each dimension of the tile. It's ordered from the
   // most major dimension of the tile to the most minor dimension of the tile.
   // The dimensions correspond to a suffix of the dimensions of the shape being
@@ -128,7 +132,7 @@ message Tile {
 // See the XLA documentation for more information on shapes and layouts.
 //
 // LINT.IfChange
-message Layout {
+message LayoutProto {
   // The method used to store the data in memory. The format determines which of
   // the other fields are used by the layout.
   Format format = 4;
@@ -153,7 +157,7 @@ message Layout {
   //
   // TODO(b/119839262): implement tiling in each backend or add Unimplemented
   // error.
-  repeated Tile tiles = 6;
+  repeated TileProto tiles = 6;
 
   // Bit size of each element. If the size is bigger than what the element
   // type requires, the value is stored in the least significant
@@ -185,18 +189,27 @@ message ShapeProto {
   // The element type for this shape.
   PrimitiveType element_type = 2;
 
-  // The size (number of elements) for each dimension.
-  // In XLA, dimensions are numbered from 0 to N-1 for an
-  // N-dimensional array. The first element of 'dimensions' is the size of
-  // dimension 0, the second element is the size of dimension 1, and so forth.
-  // Empty list indicates a scalar.
+  // The size (number of elements) for each dimension, or an upper bound on the
+  // size if the dimension is dynamic.  In XLA, dimensions are numbered from 0
+  // to N-1 for an N-dimensional array. The first element of 'dimensions' is the
+  // size of dimension 0, the second element is the size of dimension 1, and so
+  // forth.  Empty list indicates a scalar.
+  //
+  // If the respective element in 'is_dimension_dynamic' is true then the value
+  // in this field represents an upper bound on the size of the dimension.
   repeated int64 dimensions = 3;
 
   // For tuples only, the shapes of constitutent shapes in the tuple sequence.
   repeated ShapeProto tuple_shapes = 4;
 
   // The layout used to back this shape.
-  Layout layout = 5;
+  LayoutProto layout = 5;
+
+  // For arrays, this indicates whether or not each dimension is
+  // dynamically-sized. The number of elements in this repeated field should be
+  // zero (indicating that no dimensions are dynamic) or equal to the number of
+  // elements in the 'dimensions' field.
+  repeated bool is_dynamic_dimension = 6;
 
   // Important: if any field is added, be sure to modify ShapeUtil::Equal(),
   // ShapeUtil::Compatible() and ShapeUtil::Hash() appropriately to account for
@@ -355,6 +368,7 @@ message LiteralProto {
   repeated float f32s = 8;
   repeated double f64s = 9;
   repeated float c64s = 12;  // Stored as interleaved real, imag floats.
+  repeated double c128s = 18;  // Stored as interleaved real, imag doubles.
   repeated LiteralProto tuple_literals = 10;
   // The F16s, BF16s, U16s and S16s are encoded in little endian byte order
   bytes f16s = 11;
@@ -362,7 +376,7 @@ message LiteralProto {
   bytes u16s = 16;
   bytes s16s = 17;
   repeated int64 sparse_indices = 14;
-  // Next = 18
+  // Next = 19
 }
 
 message WindowDimension {
@@ -531,6 +545,26 @@ enum RandomDistribution {
   // Next: 4
 }
 
+message TriangularSolveOptions {
+  // If true, solves ax = b. If false, solves xa = b.
+  bool left_side = 1;
+
+  // If true, 'a' is lower triangular. If false, 'a' is upper triangular.
+  bool lower = 2;
+
+  // If true, the diagonal elements of 'a' are assumed to be 1 and not accessed.
+  bool unit_diagonal = 3;
+
+  // Should we transpose or use the adjoint of 'a'?
+  enum Transpose {
+    TRANSPOSE_INVALID = 0;
+    NO_TRANSPOSE = 1;  // Don't transpose 'a'.
+    TRANSPOSE = 2;     // Transpose 'a'.
+    ADJOINT = 3;       // Complex conjugate and transpose 'a'.
+  };
+  Transpose transpose_a = 4;
+}
+
 message OpSharding {
   enum Type {
     // This sharding is replicated across all devices (implies maximal,
@@ -590,3 +624,15 @@ message PrecisionConfig {
 
   // Next: 2
 }
+
+// Describes whether all data-parallelism replicas will receive the same
+// parameter data at each buffer.
+message ParameterReplication {
+  // A list of boolean values for the flattened leaf buffers. Each value
+  // indicates whether the corresponding leaf buffer is replicated.
+  //
+  // If this field is empty, it means no buffer is replicated. Otherwise, the
+  // number of elements in this field must match the number of leaf buffers in
+  // the HLO instruction's shape.
+  repeated bool replicated_at_leaf_buffers = 1;
+}
diff --git a/tensorflow/compiler/xrt/BUILD b/tensorflow/compiler/xrt/BUILD
index 2dae746d034a1bf52e84de74dfb0c6e23aaed4d1..b2718c5c283358d98da175a8d3b21bb1f2b01c75 100644
--- a/tensorflow/compiler/xrt/BUILD
+++ b/tensorflow/compiler/xrt/BUILD
@@ -11,9 +11,15 @@ package(
 
 load(
     "//tensorflow:tensorflow.bzl",
+    "tf_custom_op_py_library",
     "tf_gen_op_libs",
+    "tf_gen_op_wrapper_py",
 )
 load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library")
+load(
+    "//tensorflow/core:platform/default/build_config.bzl",
+    "tf_proto_library_py",
+)
 
 xla_proto_library(
     name = "xrt_proto",
@@ -27,6 +33,12 @@ xla_proto_library(
     ],
 )
 
+tf_proto_library_py(
+    name = "xrt_proto",  # bzl adds a _py suffix
+    srcs = ["xrt.proto"],
+    visibility = ["//visibility:public"],
+)
+
 cc_library(
     name = "xrt_utils",
     srcs = [
@@ -78,6 +90,25 @@ tf_gen_op_libs(
     ],
 )
 
+tf_gen_op_wrapper_py(
+    name = "xrt_ops_wrapper_py",
+    out = "xrt_ops.py",
+    deps = [
+        ":xrt_compile_ops_op_lib",
+        ":xrt_execute_op_op_lib",
+        ":xrt_state_ops_op_lib",
+    ],
+)
+
+tf_custom_op_py_library(
+    name = "xrt_ops",
+    kernels = ["//tensorflow/compiler/xrt/kernels:xrt_ops"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":xrt_ops_wrapper_py",
+    ],
+)
+
 cc_library(
     name = "xrt_server",
     visibility = ["//visibility:public"],
diff --git a/tensorflow/compiler/xrt/kernels/BUILD b/tensorflow/compiler/xrt/kernels/BUILD
index 67f475846e5f16060c1080759b0acb4216c4e72b..1e325191bba828e3d5e4599f87dcf4f4d0674945 100644
--- a/tensorflow/compiler/xrt/kernels/BUILD
+++ b/tensorflow/compiler/xrt/kernels/BUILD
@@ -11,20 +11,15 @@ cc_library(
     name = "xrt_state_ops",
     hdrs = ["xrt_state_ops.h"],
     deps = [
+        "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:xla_compiler",
-        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:client_library",
-        "//tensorflow/compiler/xla/client:compile_only_client",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client:xla_computation",
-        "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/compiler/xla/service:computation_placer",
-        "//tensorflow/compiler/xla/service:hlo_proto",
         "//tensorflow/compiler/xrt:xrt_proto",
         "//tensorflow/compiler/xrt:xrt_utils",
         "//tensorflow/core:core_cpu_internal",
@@ -55,14 +50,18 @@ cc_library(
         "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/compiler/xla/service:computation_placer",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xrt:xrt_compile_ops_op_lib",
+        "//tensorflow/compiler/xrt:xrt_execute_op_op_lib",
         "//tensorflow/compiler/xrt:xrt_proto",
+        "//tensorflow/compiler/xrt:xrt_state_ops_op_lib",
         "//tensorflow/compiler/xrt:xrt_utils",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/stream_executor:stream_executor_headers_lib",
+        "//tensorflow/stream_executor:stream_executor_headers",
         "@com_google_absl//absl/strings",
     ],
     alwayslink = 1,
diff --git a/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc b/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc
index 2ccdf0f02d840600d5e0649c4805e3672d4a1286..b791519c09758a4f4124c95add5351a9433ecb8f 100644
--- a/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc
+++ b/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc
@@ -68,9 +68,11 @@ class XRTCompileOp : public OpKernel {
 
 Status CompilationCacheKey(const xrt::XLAComputation& computation,
                            string* key) {
-  string serialized;
-  TF_RET_CHECK(SerializeToStringDeterministic(computation, &serialized));
-  uint64 fingerprint = Fingerprint64(serialized);
+  const size_t size = computation.ByteSizeLong();
+  auto serialized = absl::make_unique<char[]>(size);
+  TF_RET_CHECK(
+      SerializeToBufferDeterministic(computation, serialized.get(), size));
+  uint64 fingerprint = Fingerprint64(absl::string_view(serialized.get(), size));
   *key = absl::StrCat(fingerprint);
   return Status::OK();
 }
@@ -215,11 +217,6 @@ XRTReleaseCompilationRefOp::~XRTReleaseCompilationRefOp() = default;
 void XRTReleaseCompilationRefOp::Compute(OpKernelContext* ctx) {
   VLOG(1) << "XRTReleaseCompilationRefOp::Compute";
 
-  const Tensor& key_tensor = ctx->input(0);
-  OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(key_tensor.shape()),
-              errors::Internal("computation key should be a string scalar"));
-  int64 uid = key_tensor.scalar<int64>()();
-
   ResourceMgr* rm;
   OP_REQUIRES_OK(ctx, XRTGenericDeviceAccessor::GetResourceManager(ctx, &rm));
 
@@ -230,9 +227,13 @@ void XRTReleaseCompilationRefOp::Compute(OpKernelContext* ctx) {
                           kXRTCompilationCacheResourceName, &cache));
   core::ScopedUnref cache_unref(cache);
 
-  OP_REQUIRES_OK(ctx, cache->Release(uid));
-
-  VLOG(2) << "Released computation handle " << uid;
+  const Tensor& keys_tensor = ctx->input(0);
+  auto flat_keys = keys_tensor.flat<int64>();
+  for (int64 i = 0; i < flat_keys.size(); ++i) {
+    int64 key = flat_keys(i);
+    OP_REQUIRES_OK(ctx, cache->Release(key));
+    VLOG(2) << "Released computation handle " << key;
+  }
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc
index 751329eefc33f3372335c805233dafabbf42bf36..42ef88168af4b6f391ffc2e69ab4c4000d1cbee1 100644
--- a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc
+++ b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc
@@ -19,10 +19,10 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/service/computation_placer.h"
+#include "tensorflow/compiler/xla/service/hlo_input_output_alias_config.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/compiler/xrt/xrt.pb.h"
 #include "tensorflow/compiler/xrt/xrt_compilation_cache.h"
 #include "tensorflow/compiler/xrt/xrt_device.h"
@@ -228,8 +228,27 @@ Status XRTExecuteOp::DoWork(OpKernelContext* context) {
   TF_RETURN_IF_ERROR(XRTTupleAllocation::CreateFromBuffer(
       shaped_buffer, device_ref.backend(), device_ref.device_ordinal(),
       &output_tuple));
+
+  // The ScopedShapedBuffer returned by the executable Run() API, in case of
+  // input/output buffer aliasing, might have holes in it, which need to be
+  // filled using the proper input tuples buffers which are the source of
+  // aliasing.
+  const xla::HloInputOutputAliasConfig& input_output_alias =
+      executable->executable()->module().input_output_alias_config();
+  auto alias_function =
+      [&](const xla::ShapeIndex& output_index,
+          const xla::HloInputOutputAliasConfig::Alias& alias) -> Status {
+    TF_RET_CHECK(alias.parameter_number < input_tuples.size());
+    return alias.kind == xla::HloInputOutputAliasConfig::AliasKind::kUserAlias
+               ? output_tuple->AliasBufferFrom(
+                     *input_tuples[alias.parameter_number],
+                     alias.parameter_index, output_index)
+               : Status::OK();
+  };
+  TF_RETURN_IF_ERROR(input_output_alias.ForEachAliasWithStatus(alias_function));
+
   if (config_proto.return_exploded_tuple() &&
-      xla::ShapeUtil::IsTuple(output_tuple->on_device_shape())) {
+      output_tuple->on_device_shape().IsTuple()) {
     int64 tuple_element_count =
         xla::ShapeUtil::TupleElementCount(output_tuple->on_device_shape());
     Tensor* output_tensor;
diff --git a/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc b/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc
index 3258286c10665225aab917107ffa614459c53f3d..343f43b7159b55bad184eed2cada55c76085ffa0 100644
--- a/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc
+++ b/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc
@@ -37,6 +37,17 @@ REGISTER_KERNEL_BUILDER(Name("XRTAllocate")
                             .HostMemory("handle"),
                         XRTAllocateOp<XRTGenericDeviceAccessor>);
 
+REGISTER_KERNEL_BUILDER(Name("XRTAllocateFromTensor")
+                            .Device(DEVICE_XLA_GPU)
+                            .HostMemory("inputs")
+                            .HostMemory("handle"),
+                        XRTAllocateFromTensorOp<XRTGenericDeviceAccessor>);
+REGISTER_KERNEL_BUILDER(Name("XRTAllocateFromTensor")
+                            .Device(DEVICE_XLA_CPU)
+                            .HostMemory("inputs")
+                            .HostMemory("handle"),
+                        XRTAllocateFromTensorOp<XRTGenericDeviceAccessor>);
+
 REGISTER_KERNEL_BUILDER(Name("XRTSubTuple")
                             .Device(DEVICE_XLA_GPU)
                             .HostMemory("base_handle")
@@ -111,6 +122,17 @@ REGISTER_KERNEL_BUILDER(Name("XRTReadLiteralAndRelease")
                             .HostMemory("literal"),
                         XRTReadLiteralOp<true, XRTGenericDeviceAccessor>);
 
+REGISTER_KERNEL_BUILDER(Name("XRTReadToTensor")
+                            .Device(DEVICE_XLA_GPU)
+                            .HostMemory("handles")
+                            .HostMemory("tensors"),
+                        XRTReadToTensorOp<XRTGenericDeviceAccessor>);
+REGISTER_KERNEL_BUILDER(Name("XRTReadToTensor")
+                            .Device(DEVICE_XLA_CPU)
+                            .HostMemory("handles")
+                            .HostMemory("tensors"),
+                        XRTReadToTensorOp<XRTGenericDeviceAccessor>);
+
 REGISTER_KERNEL_BUILDER(Name("XRTReleaseAllocationHandle")
                             .Device(DEVICE_XLA_GPU)
                             .HostMemory("handle"),
@@ -120,4 +142,9 @@ REGISTER_KERNEL_BUILDER(Name("XRTReleaseAllocationHandle")
                             .HostMemory("handle"),
                         XRTReleaseAllocationOp<XRTGenericDeviceAccessor>);
 
+REGISTER_KERNEL_BUILDER(Name("XRTReleaseAllAllocations").Device(DEVICE_XLA_GPU),
+                        XRTReleaseAllAllocationsOp<XRTGenericDeviceAccessor>);
+REGISTER_KERNEL_BUILDER(Name("XRTReleaseAllAllocations").Device(DEVICE_XLA_CPU),
+                        XRTReleaseAllAllocationsOp<XRTGenericDeviceAccessor>);
+
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/xrt/kernels/xrt_state_ops.h b/tensorflow/compiler/xrt/kernels/xrt_state_ops.h
index 26a58fa42d8b730b365b11d2e5608e9945497763..6af73ecc85351a9b38ba526db076e9176d1cb2f1 100644
--- a/tensorflow/compiler/xrt/kernels/xrt_state_ops.h
+++ b/tensorflow/compiler/xrt/kernels/xrt_state_ops.h
@@ -19,10 +19,15 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XRT_KERNELS_XRT_STATE_OPS_H_
 #define TENSORFLOW_COMPILER_XRT_KERNELS_XRT_STATE_OPS_H_
 
+#include <functional>
 #include <memory>
 #include <string>
 
+#include "tensorflow/compiler/tf2xla/literal_util.h"
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+#include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -30,11 +35,13 @@ limitations under the License.
 #include "tensorflow/compiler/xrt/xrt.pb.h"
 #include "tensorflow/compiler/xrt/xrt_device.h"
 #include "tensorflow/compiler/xrt/xrt_state.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/refcount.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
@@ -183,9 +190,7 @@ class XRTAllocateOp : public OpKernel {
     // We are guaranteed that the underlying device object won't be deleted out
     // from under us, while the ScopedRef is live.
     class DeviceAccessor::ScopedRef device_ref;
-    OP_REQUIRES_OK(ctx,
-                   DeviceAccessor::InitScopedRef(
-                       ctx, allocation_proto.device_ordinal(), &device_ref));
+    OP_REQUIRES_OK(ctx, DeviceAccessor::InitScopedRef(ctx, &device_ref));
 
     XRTTupleAllocation* allocation;
     OP_REQUIRES_OK(ctx, XRTTupleAllocation::CreateAndTransfer(
@@ -202,6 +207,110 @@ class XRTAllocateOp : public OpKernel {
   }
 };
 
+// Op that allocates memory for a tensor (with optional layout) and transfers it
+// to the device, returning an allocation handle.
+template <class DeviceAccessor>
+class XRTAllocateFromTensorOp : public OpKernel {
+ public:
+  explicit XRTAllocateFromTensorOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    bool make_tuple = false;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("shapes", &tf_shapes_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("dtypes", &dtypes_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("make_tuple", &make_tuple));
+    std::vector<int64> minor_to_major;
+    if (ctx->HasAttr("layouts")) {
+      OP_REQUIRES_OK(ctx, ctx->GetAttr("layouts", &minor_to_major));
+    }
+    OP_REQUIRES(
+        ctx, tf_shapes_.size() == dtypes_.size(),
+        errors::InvalidArgument("shapes and dtypes must be the same length"));
+    std::vector<xla::Shape> xla_shapes;
+    xla_shapes.reserve(tf_shapes_.size());
+    for (int i = 0; i < tf_shapes_.size(); i++) {
+      xla::Shape xla_shape;
+      OP_REQUIRES_OK(
+          ctx, TensorShapeToXLAShape(dtypes_[i], tf_shapes_[i], &xla_shape));
+      xla_shapes.push_back(std::move(xla_shape));
+    }
+    if (xla_shapes.size() > 1 || make_tuple) {
+      shape_ = xla::ShapeUtil::MakeTupleShape(xla_shapes);
+    } else {
+      shape_.Swap(&xla_shapes.front());
+    }
+    if (!minor_to_major.empty()) {
+      xla::Shape shape_with_layouts;
+      OP_REQUIRES_OK(ctx, GetShapeWithLayout(shape_, minor_to_major,
+                                             /*layout_func=*/nullptr,
+                                             &shape_with_layouts));
+      shape_.Swap(&shape_with_layouts);
+    }
+  }
+
+  ~XRTAllocateFromTensorOp() override = default;
+  XRTAllocateFromTensorOp(const XRTAllocateFromTensorOp&) = delete;
+  XRTAllocateFromTensorOp& operator=(const XRTAllocateFromTensorOp&) = delete;
+
+  void Compute(OpKernelContext* ctx) override {
+    VLOG(1) << "XRTAllocateFromTensorOp::Compute";
+
+    OpInputList values;
+    OP_REQUIRES_OK(ctx, ctx->input_list("inputs", &values));
+    OP_REQUIRES(ctx, values.size() == tf_shapes_.size(),
+                errors::InvalidArgument(
+                    "Wrong number of inputs to XRTAllocateFromTensor: ",
+                    values.size(), " vs. ", tf_shapes_.size()));
+
+    std::vector<const char*> tensors_data;
+    for (size_t i = 0; i < values.size(); ++i) {
+      const Tensor& input_tensor = values[i];
+      OP_REQUIRES(ctx, input_tensor.dtype() == dtypes_[i],
+                  errors::InvalidArgument(
+                      "Input tensor type and input dtype do not match"));
+      // We allow the requested on-device shape to differ from the shape of the
+      // input tensor, as long as they have the same number of elements.
+      OP_REQUIRES(
+          ctx,
+          input_tensor.shape().num_elements() == tf_shapes_[i].num_elements(),
+          errors::InvalidArgument(
+              "Input tensor must have the number of elements specified "
+              "in the matching input shape: ",
+              input_tensor.shape().num_elements(), " vs. ",
+              tf_shapes_[i].num_elements(), " at index ", i));
+      tensors_data.push_back(
+          static_cast<const char*>(DMAHelper::base(&input_tensor)));
+    }
+    // Use the buffer straight out of the input tensors to create the literal.
+    xla::BorrowingLiteral literal =
+        shape_.IsTuple() ? xla::BorrowingLiteral(tensors_data, shape_)
+                         : xla::BorrowingLiteral(tensors_data.front(), shape_);
+    ResourceMgr* rm;
+    OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm));
+
+    // We are guaranteed that the underlying device object won't be deleted out
+    // from under us, while the ScopedRef is live.
+    class DeviceAccessor::ScopedRef device_ref;
+    OP_REQUIRES_OK(ctx, DeviceAccessor::InitScopedRef(ctx, &device_ref));
+
+    XRTTupleAllocation* allocation;
+    OP_REQUIRES_OK(ctx, XRTTupleAllocation::CreateAndTransfer(
+                            literal, device_ref.backend(),
+                            device_ref.device_ordinal(), &allocation));
+
+    // Intern takes ownership of our reference to allocation.
+    int64 key;
+    OP_REQUIRES_OK(ctx, allocation->Intern(rm, &key));
+
+    Tensor output(DT_INT64, TensorShape({}));
+    output.scalar<int64>()() = key;
+    ctx->set_output(0, output);
+  }
+
+ private:
+  std::vector<TensorShape> tf_shapes_;
+  DataTypeVector dtypes_;
+  xla::Shape shape_;
+};
+
 // Op that takes a tuple handle input and returns a handle to a sub-tuple of the
 // input.
 template <bool discard_, class DeviceAccessor>
@@ -381,7 +490,7 @@ class XRTReadLiteralOp : public OpKernel {
     OP_REQUIRES_OK(ctx, DeviceAccessor::InitScopedRef(
                             ctx, allocation->device_ordinal(), &device_ref));
 
-    xla::Literal literal;
+    xla::Literal literal(allocation->on_host_shape());
     OP_REQUIRES_OK(
         ctx, allocation->ToLiteral(device_ref.backend(),
                                    device_ref.device_ordinal(), &literal));
@@ -393,6 +502,96 @@ class XRTReadLiteralOp : public OpKernel {
   }
 };
 
+// Op that reads a device-resident tuple to host memory and returns it as a
+// literal.
+template <class DeviceAccessor>
+class XRTReadToTensorOp : public OpKernel {
+ public:
+  explicit XRTReadToTensorOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("release_handles", &discard_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("dtypes", &dtypes_));
+  }
+  ~XRTReadToTensorOp() override = default;
+  XRTReadToTensorOp(const XRTReadToTensorOp&) = delete;
+  XRTReadToTensorOp& operator=(const XRTReadToTensorOp&) = delete;
+
+  void Compute(OpKernelContext* ctx) override {
+    VLOG(1) << "XRTReadToTensorOp::Compute";
+
+    const Tensor& handle_tensor = ctx->input(0);
+    // TODO(phawkins,dlibenzi): accept multiple handles (i.e., vectors, not
+    // just scalars.)
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsScalar(handle_tensor.shape()),
+        errors::Internal("computation input should be an int64 scalar"));
+    int64 allocation_handle = handle_tensor.scalar<int64>()();
+
+    ResourceMgr* rm;
+    OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm));
+
+    XRTTupleAllocation* allocation;
+    OP_REQUIRES_OK(
+        ctx, XRTTupleAllocation::Lookup(rm, allocation_handle, &allocation));
+    core::ScopedUnref allocation_unref(allocation);
+
+    if (discard_) {
+      VLOG(2) << "Releasing handle " << allocation_handle;
+      OP_REQUIRES_OK(ctx, XRTTupleAllocation::DeleteFromResourceManager(
+                              rm, allocation_handle));
+    }
+
+    // We are guaranteed that the underlying device object won't be deleted out
+    // from under us, while the ScopedRef is live.
+    class DeviceAccessor::ScopedRef device_ref;
+    OP_REQUIRES_OK(ctx, DeviceAccessor::InitScopedRef(
+                            ctx, allocation->device_ordinal(), &device_ref));
+
+    xla::Shape shape = allocation->on_host_shape();
+    int output = 0;
+    Status status = xla::ShapeUtil::ForEachMutableSubshapeWithStatus(
+        &shape,
+        [&](xla::Shape* subshape, const xla::ShapeIndex& index) -> Status {
+          if (subshape->IsTuple()) return Status::OK();
+
+          xla::PrimitiveType xla_type;
+          TF_RETURN_IF_ERROR(DataTypeToPrimitiveType(
+              ctx->expected_output_dtype(output), &xla_type));
+          if (xla_type != subshape->element_type()) {
+            return errors::InvalidArgument(
+                "Type mismatch between buffer type (", subshape->ToString(),
+                ") and tensor type (",
+                DataTypeString(ctx->expected_output_dtype(output)),
+                ") for output tensor ", output);
+          }
+
+          TensorShape output_shape;
+          TF_RETURN_IF_ERROR(XLAShapeToTensorShape(*subshape, &output_shape));
+
+          Tensor* output_tensor;
+          TF_RETURN_IF_ERROR(
+              ctx->allocate_output(output, output_shape, &output_tensor));
+
+          XRTTupleAllocation* sub;
+          TF_RETURN_IF_ERROR(XRTTupleAllocation::MakeSubBuffer(
+              allocation, index, &sub, /*alias_parent_allocation=*/true));
+          core::ScopedUnref sub_unref(sub);
+
+          xla::MutableBorrowingLiteral literal;
+          TF_RETURN_IF_ERROR(HostTensorToMutableBorrowingLiteral(
+              xla::LayoutUtil::GetWithDefaultLayout(*subshape), output_tensor,
+              &literal));
+          TF_RETURN_IF_ERROR(sub->ToLiteral(
+              device_ref.backend(), device_ref.device_ordinal(), &literal));
+
+          ++output;
+          return Status::OK();
+        });
+    OP_REQUIRES_OK(ctx, status);
+  }
+  bool discard_;
+  DataTypeVector dtypes_;
+};
+
 // Op that writes a new literal value into device-resident memory.
 template <class DeviceAccessor>
 class XRTWriteLiteralOp : public OpKernel {
@@ -455,17 +654,37 @@ class XRTReleaseAllocationOp : public OpKernel {
   void Compute(OpKernelContext* ctx) override {
     VLOG(1) << "XRTReleaseAllocationOp::Compute";
 
-    const Tensor& allocation_handle = ctx->input(0);
-    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(allocation_handle.shape()),
-                errors::Internal("handle input should be an int64 scalar"));
-    int64 key = allocation_handle.scalar<int64>()();
-
     ResourceMgr* rm;
     OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm));
 
-    OP_REQUIRES_OK(ctx, XRTTupleAllocation::DeleteFromResourceManager(rm, key));
+    const Tensor& allocation_handle = ctx->input(0);
+    auto flat_keys = allocation_handle.flat<int64>();
+    for (int64 i = 0; i < flat_keys.size(); ++i) {
+      int64 key = flat_keys(i);
+      OP_REQUIRES_OK(ctx,
+                     XRTTupleAllocation::DeleteFromResourceManager(rm, key));
+      VLOG(2) << "Released allocation handle " << key;
+    }
+  }
+};
+
+// Op that discards a handle to device memory.
+template <class DeviceAccessor>
+class XRTReleaseAllAllocationsOp : public OpKernel {
+ public:
+  explicit XRTReleaseAllAllocationsOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx) {}
+  ~XRTReleaseAllAllocationsOp() override = default;
+  XRTReleaseAllAllocationsOp(const XRTReleaseAllAllocationsOp&) = delete;
+  XRTReleaseAllAllocationsOp& operator=(const XRTReleaseAllAllocationsOp&) =
+      delete;
+
+  void Compute(OpKernelContext* ctx) override {
+    VLOG(1) << "XRTReleaseAllAllocationsOp::Compute";
 
-    VLOG(2) << "Released allocation handle " << key;
+    ResourceMgr* rm;
+    OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm));
+    OP_REQUIRES_OK(ctx, XRTTupleAllocation::ReleaseAllAllocations(rm));
   }
 };
 
diff --git a/tensorflow/compiler/xrt/ops/xrt_compile_ops.cc b/tensorflow/compiler/xrt/ops/xrt_compile_ops.cc
index 7b3b50c69559f6003a108fdf6a1325dbdbaa80a6..9dd964e5467cd855d67764a512e95a6a18f482e1 100644
--- a/tensorflow/compiler/xrt/ops/xrt_compile_ops.cc
+++ b/tensorflow/compiler/xrt/ops/xrt_compile_ops.cc
@@ -44,10 +44,10 @@ REGISTER_OP("XRTReleaseCompilationHandle")
     .SetShapeFn(tensorflow::shape_inference::NoOutputs)
     .Doc(
         R"(
-Discards a computation from the compilation cache. The handle cannot be
-subsequently used.
+Discards one or more computation handles from the compilation cache.
+The handle(s) cannot be subsequently used.
 
-'handle' is an id returned from a XRTCompile Op.
+'handle' is an ID (or vector of IDs) returned from a XRTCompile Op.
 )");
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/xrt/ops/xrt_state_ops.cc b/tensorflow/compiler/xrt/ops/xrt_state_ops.cc
index a3d63106fa14674a9f5887ccfd908ce17dbc6384..8832270fb2730d1ba64fa069b38f4a04b61773ef 100644
--- a/tensorflow/compiler/xrt/ops/xrt_state_ops.cc
+++ b/tensorflow/compiler/xrt/ops/xrt_state_ops.cc
@@ -26,12 +26,41 @@ REGISTER_OP("XRTAllocate")
     .SetShapeFn(tensorflow::shape_inference::ScalarShape)
     .Doc(
         R"(
-Reads a literal proto and transfers it to TPU device memory.
+Reads a literal proto and transfers it to device memory.
 
-'allocation' is a serialized xrt::TPUAllocation proto.
+'allocation' is a serialized xrt::XLAAllocation proto.
 'handle' is an id that can be used in other ops to refer to the allocation.
 )");
 
+REGISTER_OP("XRTAllocateFromTensor")
+    .Input("inputs: dtypes")
+    .Output("handle: int64")
+    .Attr("dtypes: list(type)")
+    .Attr("shapes: list(shape)")
+    .Attr("layouts: list(int) = []")
+    .Attr("make_tuple: bool = false")
+    .SetShapeFn(tensorflow::shape_inference::ScalarShape)
+    .Doc(
+        R"(
+Reads a list of tensors with optional layouts, and transfers it to device
+memory.
+
+inputs: The tensors holding the input data.
+shapes: The shapes which the tensors should have on device. The i-th shape
+corresponds to the i-th input. The shapes, together with the (optional)
+layouts, helps creating the fully qualified shape of the data on the device.
+The shapes can differ from the corresponding input one, as long as the total
+number of elements matches. In other words, it is possible to feed an input
+tensor with shape {8} and have a corresponding shape {2,2,2}.
+layouts: A vector holding the requested layout in minor-to-major sequence.
+If empty, the default layout wil be used.
+For a tuple, the layouts vector holds a linearized minor-to-major numbers
+for all the tuple leaves, in the order they appear within the tuple.
+The elements within the layouts sequence corresponding to a given tuple
+subshape can be set to -1, to leave such subshape to the default shape.
+handle: An id that can be used in other ops to refer to the allocation.
+)");
+
 REGISTER_OP("XRTSubTuple")
     .Input("base_handle: int64")
     .Input("shape_index: int32")
@@ -122,15 +151,44 @@ releases the handle.
 'literal' is a serialized xla::LiteralProto proto.
 )");
 
+REGISTER_OP("XRTReadToTensor")
+    .Input("handles: int64")
+    .Attr("release_handles: bool = False")
+    .Attr("dtypes: list(type)")
+    .Output("tensors: dtypes")
+    .SetShapeFn(tensorflow::shape_inference::UnknownShape)
+    .Doc(
+        R"(
+Copies allocated values from device memory and returns them as zero or more
+Tensors. If a handle refers to a non-tuple buffer, a single tensor is returned.
+In general, the tensors returned for a handle correspond to an in-order traversal
+of a the tuple-tree value referenced by the handle.
+
+'handles' contains ids returned from Ops that produced on-device allocations.
+At present, only a single (scalar) handle is supported.
+'dtypes' are the expected types for each `Tensor` to be returned. If the
+expected and actual tensor types do not match, an error is returned.
+'release_handles': if True, `handles` are released.
+'tensors' are the output Tensors.
+)");
+
 REGISTER_OP("XRTReleaseAllocationHandle")
     .Input("handle: int64")
     .SetShapeFn(tensorflow::shape_inference::NoOutputs)
     .Doc(
         R"(
-Discards an allocation from device memory. The handle cannot be subsequently
+Discards one or more device memory handles. The handle(s) cannot be subsequently
 used.
 
-'handle' is the id returned from the Op that produced the on-device allocation.
+'handle' is the ID (or a vector of IDs) returned from the Op that produced the
+on-device allocation.
+)");
+
+REGISTER_OP("XRTReleaseAllAllocations")
+    .SetShapeFn(tensorflow::shape_inference::NoOutputs)
+    .Doc(
+        R"(
+Discards all the XRT allocations. All the client held handles will be invalid.
 )");
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/xrt/tests/BUILD b/tensorflow/compiler/xrt/tests/BUILD
index be44a3474acdeb9905c1d21b932fa0dd10b5a212..3a19327e5b5d8072fbecdbe10e9959c8491780eb 100644
--- a/tensorflow/compiler/xrt/tests/BUILD
+++ b/tensorflow/compiler/xrt/tests/BUILD
@@ -24,6 +24,7 @@ cc_library(
         "//tensorflow/cc:client_session",
         "//tensorflow/cc:ops",
         "//tensorflow/cc:scope",
+        "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:literal_util",
diff --git a/tensorflow/compiler/xrt/tests/raw_api_test.cc b/tensorflow/compiler/xrt/tests/raw_api_test.cc
index abaa17e50e3f5e47a45f5a8a45fa2090d3efee39..1111f8240512e81c10a42a28c09f5b0a94daf1ee 100644
--- a/tensorflow/compiler/xrt/tests/raw_api_test.cc
+++ b/tensorflow/compiler/xrt/tests/raw_api_test.cc
@@ -22,6 +22,8 @@ limitations under the License.
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/compiler/tf2xla/literal_util.h"
+#include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
@@ -53,6 +55,14 @@ string DeviceFromFlag() {
   return absl::StrCat("/device:", xla_test_device, ":0");
 }
 
+std::vector<int> GetAttrLayout(absl::Span<const int64> minor_to_mayor) {
+  std::vector<int> layout;
+  for (auto dim : minor_to_mayor) {
+    layout.push_back(static_cast<int>(dim));
+  }
+  return layout;
+}
+
 xla::LiteralProto TwoElementTuple() {
   auto array = xla::LiteralUtil::CreateR1<float>({1.0f, 3.0f});
   auto matrix = xla::LiteralUtil::CreateR2({{4, 5}, {6, 7}});
@@ -96,14 +106,21 @@ xla::LiteralProto FloatMatrix(
   return array.ToProto();
 }
 
+xla::Literal ReadOutputLiteral(const std::vector<Tensor>& outputs, size_t idx) {
+  xla::LiteralProto response;
+  CHECK(response.ParseFromString(outputs[idx].scalar<string>()()));
+  return xla::Literal::CreateFromProto(response).ValueOrDie();
+}
+
 bool CompareLiteralProtos(const xla::LiteralProto& a,
                           const xla::LiteralProto& b) {
   auto l_a = xla::Literal::CreateFromProto(a).ValueOrDie();
   auto l_b = xla::Literal::CreateFromProto(b).ValueOrDie();
   bool equal = l_a == l_b;
   if (!equal) {
-    LOG(INFO) << "LiteralProtos don't match: " << a.DebugString()
-              << " != " << b.DebugString();
+    LOG(INFO) << "LiteralProtos don't match:\n"
+              << a.DebugString() << "\n!=\n"
+              << b.DebugString();
   }
   return equal;
 }
@@ -113,8 +130,19 @@ bool CompareLiteralToLiteralProto(const xla::Literal& a,
   auto l_b = xla::Literal::CreateFromProto(b).ValueOrDie();
   bool equal = a == l_b;
   if (!equal) {
-    LOG(INFO) << "Literal and LiteralProto don't match "
-              << a.ToProto().DebugString() << " != " << b.DebugString();
+    LOG(INFO) << "Literal and LiteralProto don't match:\n"
+              << a.ToProto().DebugString() << "\n!=\n"
+              << b.DebugString();
+  }
+  return equal;
+}
+
+bool CompareLiterals(const xla::Literal& a, const xla::Literal& b) {
+  bool equal = a == b;
+  if (!equal) {
+    LOG(INFO) << "Literals don't match:\n"
+              << a.ToProto().DebugString() << "\n!=\n"
+              << b.ToProto().DebugString();
   }
   return equal;
 }
@@ -215,9 +243,122 @@ xla::ProgramShape XlaCompiledProgramShape(
       ->ComputeProgramShape();
 }
 
+TEST(RawApiTest, AllocFromTensor) {
+  xla::Literal literal =
+      xla::LiteralUtil::CreateR2<float>({{4.0f, 5.0f}, {6.0f, 7.0f}});
+  Tensor tensor;
+  TF_ASSERT_OK(LiteralToHostTensor(literal, DT_FLOAT, &tensor));
+
+  Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
+  std::vector<int> layout =
+      GetAttrLayout(literal.shape().layout().minor_to_major());
+  ops::XRTAllocateFromTensor::Attrs alloc_attrs =
+      ops::XRTAllocateFromTensor::Layouts(layout);
+  auto handle =
+      ops::XRTAllocateFromTensor(root, {tensor}, {tensor.shape()}, alloc_attrs);
+  auto read_back = ops::XRTReadLiteralAndRelease(root, handle);
+  TF_ASSERT_OK(root.status());
+
+  ClientSession session(root);
+  std::vector<Tensor> outputs;
+  TF_EXPECT_OK(session.Run({read_back}, &outputs));
+  EXPECT_EQ(outputs.size(), 1);
+
+  xla::LiteralProto response;
+  EXPECT_TRUE(response.ParseFromString(outputs[0].scalar<string>()()));
+  EXPECT_TRUE(CompareLiteralToLiteralProto(literal, response));
+}
+
+TEST(RawApiTest, AllocFromTensorTuple) {
+  xla::Literal literal0 =
+      xla::LiteralUtil::CreateR2<float>({{4.0f, 5.0f}, {6.0f, 7.0f}});
+  xla::Literal literal1 =
+      xla::LiteralUtil::CreateR2<float>({{14.0f, -5.0f}, {16.0f, 17.0f}});
+  xla::Literal literal = xla::LiteralUtil::MakeTuple({&literal0, &literal1});
+  Tensor tensor0;
+  TF_ASSERT_OK(LiteralToHostTensor(literal0, DT_FLOAT, &tensor0));
+  Tensor tensor1;
+  TF_ASSERT_OK(LiteralToHostTensor(literal1, DT_FLOAT, &tensor1));
+
+  Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
+  std::vector<int> layout = GetShapeLayoutVector(literal.shape()).ValueOrDie();
+  ops::XRTAllocateFromTensor::Attrs alloc_attrs =
+      ops::XRTAllocateFromTensor::Layouts(layout);
+  auto handle = ops::XRTAllocateFromTensor(root, {tensor0, tensor1},
+                                           {tensor0.shape(), tensor1.shape()},
+                                           alloc_attrs);
+  auto read_back = ops::XRTReadLiteralAndRelease(root, handle);
+  TF_ASSERT_OK(root.status());
+
+  ClientSession session(root);
+  std::vector<Tensor> outputs;
+  TF_EXPECT_OK(session.Run({read_back}, &outputs));
+  EXPECT_EQ(outputs.size(), 1);
+
+  xla::LiteralProto response;
+  EXPECT_TRUE(response.ParseFromString(outputs[0].scalar<string>()()));
+  EXPECT_TRUE(CompareLiteralToLiteralProto(literal, response));
+}
+
+TEST(RawApiTest, AllocFromTensorTupleSingle) {
+  xla::Literal literal0 =
+      xla::LiteralUtil::CreateR2<float>({{4.0f, 5.0f}, {6.0f, 7.0f}});
+  xla::Literal literal = xla::LiteralUtil::MakeTuple({&literal0});
+  Tensor tensor0;
+  TF_ASSERT_OK(LiteralToHostTensor(literal0, DT_FLOAT, &tensor0));
+
+  Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
+  std::vector<int> layout = GetShapeLayoutVector(literal.shape()).ValueOrDie();
+  ops::XRTAllocateFromTensor::Attrs alloc_attrs =
+      ops::XRTAllocateFromTensor::Layouts(layout).MakeTuple(true);
+  auto handle = ops::XRTAllocateFromTensor(root, {tensor0}, {tensor0.shape()},
+                                           alloc_attrs);
+  auto read_back = ops::XRTReadLiteralAndRelease(root, handle);
+  TF_ASSERT_OK(root.status());
+
+  ClientSession session(root);
+  std::vector<Tensor> outputs;
+  TF_EXPECT_OK(session.Run({read_back}, &outputs));
+  EXPECT_EQ(outputs.size(), 1);
+
+  xla::LiteralProto response;
+  EXPECT_TRUE(response.ParseFromString(outputs[0].scalar<string>()()));
+  EXPECT_TRUE(CompareLiteralToLiteralProto(literal, response));
+}
+
+TEST(RawApiTest, AllocFromTensorRelayout) {
+  xla::Literal literal =
+      xla::LiteralUtil::CreateR2<float>({{4.0f, 5.0f}, {6.0f, 7.0f}});
+  Tensor tensor;
+  TF_ASSERT_OK(LiteralToHostTensor(literal, DT_FLOAT, &tensor));
+
+  Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
+  // Use inverse array layout with the tensor data above.
+  std::vector<int> layout({0, 1});
+  ops::XRTAllocateFromTensor::Attrs alloc_attrs =
+      ops::XRTAllocateFromTensor::Layouts(layout);
+  auto handle =
+      ops::XRTAllocateFromTensor(root, {tensor}, {tensor.shape()}, alloc_attrs);
+  auto read_back = ops::XRTReadLiteralAndRelease(root, handle);
+  TF_ASSERT_OK(root.status());
+
+  ClientSession session(root);
+  std::vector<Tensor> outputs;
+  TF_EXPECT_OK(session.Run({read_back}, &outputs));
+  EXPECT_EQ(outputs.size(), 1);
+
+  xla::LiteralProto response;
+  EXPECT_TRUE(response.ParseFromString(outputs[0].scalar<string>()()));
+  // We have sent literal's data (in array layout) with a attribute layout
+  // {0,1}, so the expected literal read from device needs to be changed
+  // accordingly.
+  xla::Literal expected_literal =
+      xla::LiteralUtil::CreateR2<float>({{4.0f, 6.0f}, {5.0f, 7.0f}});
+  EXPECT_TRUE(CompareLiteralToLiteralProto(expected_literal, response));
+}
+
 TEST(RawApiTest, AllocAndRewrite) {
   xrt::XLAAllocation alloc;
-  alloc.set_device_ordinal(0);
   *alloc.mutable_value() =
       xla::LiteralUtil::CreateR2({{4, 5}, {6, 7}}).ToProto();
 
@@ -259,15 +400,138 @@ TEST(RawApiTest, AllocAndRewrite) {
   EXPECT_TRUE(new_response.ParseFromString(outputs[0].scalar<string>()()));
   EXPECT_TRUE(CompareLiteralProtos(new_literal, new_response));
 
-  auto release =
-      ops::XRTReleaseAllocationHandle(root, Input(allocation_handle));
+  Tensor release_tensor(DT_INT64, TensorShape({1}));
+  release_tensor.flat<int64>()(0) = allocation_handle;
+
+  auto release = ops::XRTReleaseAllocationHandle(root, release_tensor);
+  TF_EXPECT_OK(session.Run(tensorflow::ClientSession::FeedType(), {}, {release},
+                           &outputs));
+}
+
+TEST(RawApiTest, AllocReleaseMany) {
+  xrt::XLAAllocation alloc1;
+  *alloc1.mutable_value() =
+      xla::LiteralUtil::CreateR2({{4, 5}, {6, 7}}).ToProto();
+  xrt::XLAAllocation alloc2;
+  *alloc2.mutable_value() =
+      xla::LiteralUtil::CreateR2({{6, 7}, {4, 5}}).ToProto();
+
+  Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
+  auto value1 =
+      ops::Const(root.WithDevice("/device:CPU:0"), alloc1.SerializeAsString());
+  auto value2 =
+      ops::Const(root.WithDevice("/device:CPU:0"), alloc2.SerializeAsString());
+  auto handle1 = ops::XRTAllocate(root, value1);
+  auto handle2 = ops::XRTAllocate(root, value2);
+  TF_ASSERT_OK(root.status());
+
+  tensorflow::ClientSession session(root);
+  std::vector<tensorflow::Tensor> outputs;
+  TF_EXPECT_OK(session.Run({handle1, handle2}, &outputs));
+  EXPECT_EQ(outputs.size(), 2);
+
+  int64 allocation_handle1 = outputs[0].scalar<int64>()();
+  int64 allocation_handle2 = outputs[1].scalar<int64>()();
+
+  Tensor release_tensor(DT_INT64, TensorShape({2}));
+  release_tensor.flat<int64>()(0) = allocation_handle1;
+  release_tensor.flat<int64>()(1) = allocation_handle2;
+
+  auto release = ops::XRTReleaseAllocationHandle(root, release_tensor);
+  outputs.clear();
   TF_EXPECT_OK(session.Run(tensorflow::ClientSession::FeedType(), {}, {release},
                            &outputs));
 }
 
+TEST(RawApiTest, CompileAndReleaseMany) {
+  xrt::XLAComputation c1;
+  auto config1 = c1.mutable_config();
+  auto shapes1 = config1->mutable_program_shape();
+  *shapes1->add_parameters() =
+      xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto();
+  *shapes1->add_parameters() =
+      xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto();
+  *shapes1->mutable_result() =
+      xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto();
+  StoreComputationSnapshot(AddAndScale(), c1.mutable_hlo_snapshot());
+
+  xrt::XLAComputation c2;
+  auto config2 = c2.mutable_config();
+  auto shapes2 = config2->mutable_program_shape();
+  *shapes2->add_parameters() =
+      xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto();
+  *shapes2->add_parameters() =
+      xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto();
+  *shapes2->mutable_result() =
+      xla::ShapeUtil::MakeTupleShape({xla::ShapeUtil::MakeShape(xla::F32, {2})})
+          .ToProto();
+  StoreComputationSnapshot(AddAndTuple(), c2.mutable_hlo_snapshot());
+
+  xrt::XRTExecutionConfig e;
+  e.set_release_input_handles(true);
+  e.set_release_compilation_handle(false);
+
+  Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
+  auto e_config =
+      ops::Const(root.WithDevice("/device:CPU:0"), e.SerializeAsString());
+  auto computation1 =
+      ops::Const(root.WithDevice("/device:CPU:0"), c1.SerializeAsString());
+  auto c_handle1 = ops::XRTCompile(root, computation1);
+  auto computation2 =
+      ops::Const(root.WithDevice("/device:CPU:0"), c2.SerializeAsString());
+  auto c_handle2 = ops::XRTCompile(root, computation2);
+  TF_ASSERT_OK(root.status());
+
+  ClientSession session(root);
+  std::vector<Tensor> outputs;
+  TF_EXPECT_OK(session.Run({c_handle1.handle, c_handle2.handle}, &outputs));
+  EXPECT_EQ(outputs.size(), 2);
+
+  int64 compilation_handle1 = outputs[0].scalar<int64>()();
+  int64 compilation_handle2 = outputs[1].scalar<int64>()();
+
+  Tensor release_tensor(DT_INT64, TensorShape({2}));
+  release_tensor.flat<int64>()(0) = compilation_handle1;
+  release_tensor.flat<int64>()(1) = compilation_handle2;
+
+  auto release = ops::XRTReleaseCompilationHandle(root, release_tensor);
+  outputs.clear();
+  TF_EXPECT_OK(session.Run(tensorflow::ClientSession::FeedType(), {}, {release},
+                           &outputs));
+}
+
+TEST(RawApiTest, AllocAndClearAll) {
+  xrt::XLAAllocation alloc;
+  *alloc.mutable_value() =
+      xla::LiteralUtil::CreateR2({{4, 5}, {6, 7}}).ToProto();
+
+  Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
+  auto value =
+      ops::Const(root.WithDevice("/device:CPU:0"), alloc.SerializeAsString());
+  auto handle = ops::XRTAllocate(root, value);
+  TF_ASSERT_OK(root.status());
+
+  tensorflow::ClientSession session(root);
+  std::vector<tensorflow::Tensor> outputs;
+  TF_EXPECT_OK(session.Run({handle}, &outputs));
+  EXPECT_EQ(outputs.size(), 1);
+
+  int64 allocation_handle = outputs[0].scalar<int64>()();
+
+  auto clear_all = ops::XRTReleaseAllAllocations(root);
+
+  outputs.clear();
+  TF_EXPECT_OK(session.Run(tensorflow::ClientSession::FeedType(), {},
+                           {clear_all}, &outputs));
+  EXPECT_EQ(outputs.size(), 0);
+
+  auto read_after_clear = ops::XRTReadLiteral(root, Input(allocation_handle));
+  EXPECT_EQ(session.Run({read_after_clear}, &outputs).code(),
+            tensorflow::error::Code::NOT_FOUND);
+}
+
 TEST(RawApiTest, ReadAndWriteState) {
   xrt::XLAAllocation alloc;
-  alloc.set_device_ordinal(0);
   *alloc.mutable_value() = TwoElementTuple();
 
   Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
@@ -292,7 +556,6 @@ TEST(RawApiTest, ReadAndWriteState) {
 
 TEST(RawApiTest, ReadAndWriteStateAutoFree) {
   xrt::XLAAllocation alloc;
-  alloc.set_device_ordinal(0);
   *alloc.mutable_value() = TwoElementTuple();
 
   Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
@@ -313,7 +576,6 @@ TEST(RawApiTest, ReadAndWriteStateAutoFree) {
 
 TEST(RawApiTest, SubBuffer) {
   xrt::XLAAllocation alloc;
-  alloc.set_device_ordinal(0);
   *alloc.mutable_value() = NestedTuple();
 
   Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
@@ -354,10 +616,8 @@ TEST(RawApiTest, SubBuffer) {
 
 TEST(RawApiTest, MakeTuple) {
   xrt::XLAAllocation alloc_0;
-  alloc_0.set_device_ordinal(0);
   *alloc_0.mutable_value() = TwoElementTuple();
   xrt::XLAAllocation alloc_1;
-  alloc_1.set_device_ordinal(0);
   *alloc_1.mutable_value() = ScalarLiteral();
 
   // The trivial tuple that just forwards its input and releases it.
@@ -428,10 +688,8 @@ TEST(RawApiTest, MakeTuple) {
 
 TEST(RawApiTest, CompileAndExecute) {
   xrt::XLAAllocation p0;
-  p0.set_device_ordinal(0);
   *p0.mutable_value() = FloatVector({1.0f, 2.0f});
   xrt::XLAAllocation p1;
-  p1.set_device_ordinal(0);
   *p1.mutable_value() = FloatVector({8.0f, 5.0f});
 
   xrt::XLAComputation c;
@@ -483,10 +741,8 @@ TEST(RawApiTest, CompileAndExecute) {
 
 TEST(RawApiTest, CompileAndExecuteWithArgumentVector) {
   xrt::XLAAllocation p0;
-  p0.set_device_ordinal(0);
   *p0.mutable_value() = FloatVector({1.0f, 2.0f});
   xrt::XLAAllocation p1;
-  p1.set_device_ordinal(0);
   *p1.mutable_value() = FloatVector({8.0f, 5.0f});
 
   xrt::XLAComputation c;
@@ -606,10 +862,8 @@ TEST(RawApiTest, DotGeneralWithLayoutTest) {
   auto layout = xla::LayoutUtil::MakeLayout({0, 1});
 
   xrt::XLAAllocation p0;
-  p0.set_device_ordinal(0);
   *p0.mutable_value() = FloatMatrix({{1.0f, 2.0f}, {3.0f, 4.0f}}, layout);
   xrt::XLAAllocation p1;
-  p1.set_device_ordinal(0);
   *p1.mutable_value() = FloatMatrix({{8.0f}, {5.0f}}, layout);
 
   xrt::XLAComputation c;
@@ -692,10 +946,8 @@ TEST(RawApiTest, CompileAndExecuteZeroArg) {
 
 TEST(RawApiTest, CompileAndExecuteReturnTuple) {
   xrt::XLAAllocation p0;
-  p0.set_device_ordinal(0);
   *p0.mutable_value() = FloatVector({1.0f, 2.0f});
   xrt::XLAAllocation p1;
-  p1.set_device_ordinal(0);
   *p1.mutable_value() = FloatVector({8.0f, 5.0f});
 
   xrt::XLAComputation c;
@@ -745,11 +997,9 @@ TEST(RawApiTest, CompileAndExecuteReturnTuple) {
 
 TEST(RawApiTest, CompileAndExecuteReturnExplodedTuple) {
   xrt::XLAAllocation p0;
-  p0.set_device_ordinal(0);
   *p0.mutable_value() = xla::LiteralUtil::CreateR0<float>(12.0f).ToProto();
 
   xrt::XLAAllocation p1;
-  p1.set_device_ordinal(0);
   *p1.mutable_value() = xla::LiteralUtil::CreateR0<float>(3.0f).ToProto();
 
   xrt::XLAComputation c;
@@ -831,12 +1081,111 @@ TEST(RawApiTest, LeakCompilationReference) {
   TF_EXPECT_OK(session.Run({c_handle.handle}, &outputs));
 }
 
+TEST(RawApiTest, CompileAndExecuteWithReusedBuffers) {
+  xla::Shape element_shape = xla::ShapeUtil::MakeShape(xla::F32, {2});
+  xla::Shape shape =
+      xla::ShapeUtil::MakeTupleShape({element_shape, element_shape});
+  xla::Shape return_shape = xla::ShapeUtil::MakeTupleShape(
+      {element_shape, element_shape, element_shape, element_shape});
+  xla::XlaBuilder builder("ReuseBuffer");
+  auto param = xla::Parameter(&builder, 0, shape, "param");
+  auto p0 = xla::GetTupleElement(param, 0);
+  auto p1 = xla::GetTupleElement(param, 1);
+  auto add = xla::Add(p0, p1);
+  auto sub = xla::Sub(p0, p1);
+  xla::Tuple(&builder, {add, sub, p0, p1});
+
+  // Flip the tuple literals in the input handle.
+  builder.SetUpAlias({1}, 0, {0});
+  builder.SetUpAlias({0}, 0, {1});
+
+  auto computation = builder.Build().ValueOrDie();
+
+  auto literal0 = xla::LiteralUtil::CreateR1<float>({1.0f, 2.0f});
+  auto literal1 = xla::LiteralUtil::CreateR1<float>({5.0f, 9.0f});
+  auto literal = xla::LiteralUtil::MakeTuple({&literal0, &literal1});
+
+  xrt::XLAAllocation param_alloc;
+  *param_alloc.mutable_value() = literal.ToProto();
+
+  xrt::XLAComputation c;
+  auto config = c.mutable_config();
+  auto shapes = config->mutable_program_shape();
+  *shapes->add_parameters() = shape.ToProto();
+  *shapes->mutable_result() = return_shape.ToProto();
+  StoreComputationSnapshot(computation, c.mutable_hlo_snapshot());
+
+  xrt::XRTExecutionConfig e;
+  e.set_release_input_handles(false);
+  e.set_release_compilation_handle(true);
+
+  Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
+  ClientSession session(root);
+  auto e_config =
+      ops::Const(root.WithDevice("/device:CPU:0"), e.SerializeAsString());
+  auto c_data =
+      ops::Const(root.WithDevice("/device:CPU:0"), c.SerializeAsString());
+  auto c_handle = ops::XRTCompile(root, c_data);
+  auto param_value = ops::Const(root.WithDevice("/device:CPU:0"),
+                                param_alloc.SerializeAsString());
+  auto param_handle = ops::XRTAllocate(root, param_value);
+  TF_ASSERT_OK(root.status());
+
+  std::vector<Tensor> outputs;
+  TF_EXPECT_OK(session.Run({param_handle}, &outputs));
+
+  int64 alloc_handle = outputs[0].scalar<int64>()();
+
+  // Note that we release the result handle immediately, but since we aliased
+  // the output buffers onto the input allocation ones (held in alloc_handle),
+  // we can fetch the result from there.
+  auto result =
+      ops::XRTExecute(root, c_handle.handle, e_config, {Input(alloc_handle)});
+  auto read_back = ops::XRTReadLiteral(root, result);
+  auto release = ops::XRTReleaseAllocationHandle(
+      root.WithControlDependencies(read_back), result);
+  TF_ASSERT_OK(root.status());
+
+  outputs.clear();
+  TF_EXPECT_OK(session.Run(tensorflow::ClientSession::FeedType(), {read_back},
+                           {release}, &outputs));
+
+  xla::Literal exec_literal = ReadOutputLiteral(outputs, 0);
+  auto exec_literal_parts = exec_literal.DecomposeTuple();
+  ASSERT_EQ(exec_literal_parts.size(), 4);
+
+  EXPECT_TRUE(CompareLiterals(exec_literal_parts[2], literal0));
+  EXPECT_TRUE(CompareLiterals(exec_literal_parts[3], literal1));
+
+  // Now we read back the original input handle values, which at this point
+  // should contain the result of the XLA computation.
+  auto read_handle = ops::XRTReadLiteral(root, Input(alloc_handle));
+  TF_ASSERT_OK(root.status());
+  auto release_handle = ops::XRTReleaseAllocationHandle(
+      root.WithControlDependencies(read_handle), Input(alloc_handle));
+  TF_ASSERT_OK(root.status());
+
+  outputs.clear();
+  TF_EXPECT_OK(session.Run(tensorflow::ClientSession::FeedType(), {read_handle},
+                           {release_handle}, &outputs));
+
+  xla::Literal return_literal = ReadOutputLiteral(outputs, 0);
+
+  auto expected_literal0 = xla::LiteralUtil::CreateR1<float>({6.0f, 11.0f});
+  auto expected_literal1 = xla::LiteralUtil::CreateR1<float>({-4.0f, -7.0f});
+  // The first element of the computation returned tuple would be the add
+  // (expected_literal0), but since we flipped the buffers, the sub
+  // (expected_literal1) should come first.
+  auto expected_literal =
+      xla::LiteralUtil::MakeTuple({&expected_literal1, &expected_literal0});
+
+  EXPECT_TRUE(CompareLiterals(return_literal, expected_literal));
+}
+
 TEST(RawApiTest, CompileAndExecuteWithS64Argument) {
   xrt::XLAAllocation p0;
-  p0.set_device_ordinal(0);
   *p0.mutable_value() = xla::LiteralUtil::CreateR0<int64>(11031965).ToProto();
   xrt::XLAAllocation p1;
-  p1.set_device_ordinal(0);
   *p1.mutable_value() = xla::LiteralUtil::CreateR0<int64>(4091934).ToProto();
 
   xrt::XLAComputation c;
@@ -850,6 +1199,7 @@ TEST(RawApiTest, CompileAndExecuteWithS64Argument) {
   xrt::XRTExecutionConfig e;
   e.set_release_input_handles(true);
   e.set_release_compilation_handle(true);
+  e.set_return_exploded_tuple(true);
 
   Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
   auto e_config =
diff --git a/tensorflow/compiler/xrt/xrt.proto b/tensorflow/compiler/xrt/xrt.proto
index 378bb9246f27b8106310d565435404d7ac260a87..84adee7392825d408dd88dd74dc0c1bc7b06d7c4 100644
--- a/tensorflow/compiler/xrt/xrt.proto
+++ b/tensorflow/compiler/xrt/xrt.proto
@@ -59,7 +59,7 @@ message XLAComputation {
 
 // Literal to allocate space for, and transfer to, device memory.
 message XLAAllocation {
-  int32 device_ordinal = 1;
+  reserved 1;
   xla.LiteralProto value = 2;
 }
 
diff --git a/tensorflow/compiler/xrt/xrt_compilation_cache.cc b/tensorflow/compiler/xrt/xrt_compilation_cache.cc
index d1405eae468492748ae88d842334a922dce272c6..8bf0f28d2233d9e7593365bc42187e327a1c4ac4 100644
--- a/tensorflow/compiler/xrt/xrt_compilation_cache.cc
+++ b/tensorflow/compiler/xrt/xrt_compilation_cache.cc
@@ -273,6 +273,8 @@ Status XRTCompilationCache::Lookup(
   return Status::OK();
 }
 
-string XRTCompilationCache::DebugString() { return "XRTCompilationCache"; }
+string XRTCompilationCache::DebugString() const {
+  return "XRTCompilationCache";
+}
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/xrt/xrt_compilation_cache.h b/tensorflow/compiler/xrt/xrt_compilation_cache.h
index c43d0fc47873abdc82ee937c155bebc346a05f17..7398e847d8b744f947adb03e1bcfd5c0a5b2cc55 100644
--- a/tensorflow/compiler/xrt/xrt_compilation_cache.h
+++ b/tensorflow/compiler/xrt/xrt_compilation_cache.h
@@ -118,7 +118,7 @@ class XRTCompilationCache : public ResourceBase {
   // EntryRef holding the program is returned in entry.
   Status Lookup(int64 uid, std::unique_ptr<XRTCompilationCacheEntryRef>* entry);
 
-  string DebugString() override;
+  string DebugString() const override;
 
  private:
   // An entry in the compilation cache. The entry is deleted once it has been
diff --git a/tensorflow/compiler/xrt/xrt_device.cc b/tensorflow/compiler/xrt/xrt_device.cc
index ea40e6c895c4f6af13b74735685f2c342181ada9..34cb64742a20985b29d8e153bbaf5ee184fd385d 100644
--- a/tensorflow/compiler/xrt/xrt_device.cc
+++ b/tensorflow/compiler/xrt/xrt_device.cc
@@ -43,4 +43,12 @@ namespace tensorflow {
   return Status::OK();
 }
 
+/*static*/ Status XRTGenericDeviceAccessor::InitScopedRef(
+    OpKernelContext* ctx, ScopedRef* scoped_ref) {
+  const XlaDevice::Metadata* metadata;
+  TF_RETURN_IF_ERROR(XlaDevice::GetMetadata(ctx, &metadata));
+  scoped_ref->Acquire(metadata->client());
+  return Status::OK();
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/xrt/xrt_device.h b/tensorflow/compiler/xrt/xrt_device.h
index 1e3fddd2a72a3657d1e115375133c244772ea9f3..fb010651d9bf76c540517b9596e472c881241d8a 100644
--- a/tensorflow/compiler/xrt/xrt_device.h
+++ b/tensorflow/compiler/xrt/xrt_device.h
@@ -59,6 +59,8 @@ class XRTGenericDeviceAccessor {
 
   static Status InitScopedRef(OpKernelContext* ctx, int device_ordinal,
                               ScopedRef* scoped_ref);
+
+  static Status InitScopedRef(OpKernelContext* ctx, ScopedRef* scoped_ref);
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/xrt/xrt_state.cc b/tensorflow/compiler/xrt/xrt_state.cc
index 31603e044d17baa3ae0ae583f61837811bb12495..1b3bcbea4c1228944a6604fc923228024e74d700 100644
--- a/tensorflow/compiler/xrt/xrt_state.cc
+++ b/tensorflow/compiler/xrt/xrt_state.cc
@@ -31,7 +31,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/random/random.h"
@@ -133,7 +132,8 @@ Status AllocateScopedShapedBuffer(
 XRTBufferAllocation::XRTBufferAllocation(const se::DeviceMemoryBase& allocation,
                                          int device_ordinal,
                                          xla::DeviceMemoryAllocator* allocator)
-    : allocation_(allocation),
+    : size_(allocation.size()),
+      allocation_(allocation),
       device_ordinal_(device_ordinal),
       allocator_(allocator) {
   if (VLOG_IS_ON(2)) {
@@ -181,7 +181,7 @@ XRTTupleAllocation::~XRTTupleAllocation() {
 }
 
 /*static*/ Status XRTTupleAllocation::CreateAndTransfer(
-    const xla::Literal& literal, xla::Backend* backend, int device_ordinal,
+    const xla::LiteralBase& literal, xla::Backend* backend, int device_ordinal,
     XRTTupleAllocation** allocation) {
   auto transfer_manager = backend->transfer_manager();
   auto allocator = backend->memory_allocator();
@@ -220,12 +220,22 @@ XRTTupleAllocation::~XRTTupleAllocation() {
 }
 
 Status XRTTupleAllocation::ToLiteral(xla::Backend* backend, int device_ordinal,
-                                     xla::Literal* literal) {
+                                     xla::MutableLiteralBase* literal) {
   auto transfer_manager = backend->transfer_manager();
   TF_ASSIGN_OR_RETURN(auto stream, backend->BorrowStream(device_ordinal));
-  TF_ASSIGN_OR_RETURN(*literal, transfer_manager->TransferLiteralFromDevice(
-                                    stream.get(), ToShapedBuffer()));
-  return Status::OK();
+
+  // Validate the allocation buffers as if nulls gets to
+  // TransferLiteralFromDevice() a CHECK is issued.
+  xla::ShapedBuffer shaped_buffer = ToShapedBuffer();
+  for (auto& index_buffer : shaped_buffer.buffers()) {
+    if (index_buffer.second.is_null()) {
+      return errors::InvalidArgument("Literal buffer at index ",
+                                     index_buffer.first.ToString(),
+                                     " has been released");
+    }
+  }
+  return transfer_manager->TransferLiteralFromDevice(stream.get(),
+                                                     shaped_buffer, *literal);
 }
 
 Status XRTTupleAllocation::WriteLiteral(xla::Backend* backend,
@@ -272,6 +282,11 @@ const se::DeviceMemoryBase& XRTTupleAllocation::root_allocation() {
   return rm->Delete<XRTTupleAllocation>(kTupleContainer, key_string);
 }
 
+/* static */ Status XRTTupleAllocation::ReleaseAllAllocations(ResourceMgr* rm) {
+  VLOG(1) << "Releasing all XRT held device memory";
+  return rm->Cleanup(kTupleContainer);
+}
+
 // Helper typedef to make ShapeTree ForEach helper lambda signatures more
 // readable. They need a type of const T& where in this case T is the
 // following pointer.
@@ -500,11 +515,34 @@ xla::ShapedBuffer XRTTupleAllocation::ToShapedBuffer() {
   return shaped_buffer;
 }
 
+Status XRTTupleAllocation::AliasBufferFrom(const XRTTupleAllocation& source,
+                                           const xla::ShapeIndex& source_index,
+                                           const xla::ShapeIndex& dest_index) {
+  XRTBufferAllocation* source_buffer = source.buffers_.element(source_index);
+  XRTBufferAllocation* dest_buffer = buffers_.element(dest_index);
+  // We allow the destination size being zero, because there are cases where we
+  // are coming in later filling in null/uninitialized device buffers.
+  // In all other cases, the size of the new buffer must match.
+  if (source_buffer->size() != dest_buffer->size() &&
+      dest_buffer->size() != 0) {
+    return errors::InvalidArgument(
+        "Source buffer at index ", source_index.ToString(),
+        " does not match the size of destination buffer at index ",
+        dest_index.ToString(), ": ", source_buffer->size(), " vs ",
+        dest_buffer->size());
+  }
+  *buffers_.mutable_element(dest_index) = source_buffer;
+  source_buffer->Ref();
+  dest_buffer->Unref();
+  return Status::OK();
+}
+
 xla::ShapeTree<xla::MaybeOwningDeviceMemory>
-XRTTupleAllocation::ToDeviceMemoryTree(bool release) {
+XRTTupleAllocation::ToDeviceMemoryTree(
+    const std::function<bool(const xla::ShapeIndex&)>& release_checker) {
   xla::ShapeTree<xla::MaybeOwningDeviceMemory> shaped_tree(on_device_shape());
   for (const auto& buffer : buffers_) {
-    if (!release) {
+    if (!release_checker(buffer.first)) {
       *shaped_tree.mutable_element(buffer.first) = buffer.second->allocation();
     } else {
       *shaped_tree.mutable_element(buffer.first) = xla::OwningDeviceMemory(
diff --git a/tensorflow/compiler/xrt/xrt_state.h b/tensorflow/compiler/xrt/xrt_state.h
index 3664c0cd4e6ad26945ae1012208fdb006164a066..6519da30d02e41da5a862cadd2133bd8dd8b42d7 100644
--- a/tensorflow/compiler/xrt/xrt_state.h
+++ b/tensorflow/compiler/xrt/xrt_state.h
@@ -18,6 +18,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XRT_XRT_STATE_H_
 #define TENSORFLOW_COMPILER_XRT_XRT_STATE_H_
 
+#include <functional>
 #include <memory>
 #include <string>
 #include <vector>
@@ -58,7 +59,14 @@ class XRTBufferAllocation : public core::RefCounted {
   // freed when the reference count drops to zero.
   void DiscardAllocation();
 
+  // Returns the expected size of the allocation. Since DiscardAllocation() will
+  // set allocation_ to {null,0}, and since later we might want to replace the
+  // discarded buffer with a new one, we need to be able to verify the size
+  // compatibility.
+  uint64 size() const { return size_; }
+
  private:
+  uint64 size_ = 0;
   se::DeviceMemoryBase allocation_;
   int device_ordinal_;
   xla::DeviceMemoryAllocator* allocator_;
@@ -80,7 +88,7 @@ class XRTTupleAllocation : public ResourceBase {
   // Allocates new device memory buffers sufficient to store literal, transfers
   // literal to that memory, and returns a XRTTupleAllocation handle to the
   // allocated buffers.
-  static Status CreateAndTransfer(const xla::Literal& literal,
+  static Status CreateAndTransfer(const xla::LiteralBase& literal,
                                   xla::Backend* backend, int device_ordinal,
                                   XRTTupleAllocation** allocation);
 
@@ -129,13 +137,17 @@ class XRTTupleAllocation : public ResourceBase {
   // Deletes the reference in the rm to an allocation interned under key.
   static Status DeleteFromResourceManager(ResourceMgr* rm, int64 key);
 
+  // Releases all the device memory allocated by XRT within the resource
+  // manager.
+  static Status ReleaseAllAllocations(ResourceMgr* rm);
+
   // Adds the allocation to a ResourceMgr and returns the key that will be used
   // to retrieve it. Transfers a reference on *this to rm.
   Status Intern(ResourceMgr* rm, int64* key);
 
   // Copies the allocation from device to host and returns it in literal.
   Status ToLiteral(xla::Backend* backend, int device_ordinal,
-                   xla::Literal* literal);
+                   xla::MutableLiteralBase* literal);
 
   // Write a new literal value to the allocation.
   Status WriteLiteral(xla::Backend* backend, const xla::Literal& literal);
@@ -164,11 +176,20 @@ class XRTTupleAllocation : public ResourceBase {
   // the same shape as on_host_shape.
   xla::ShapedBuffer ToShapedBuffer();
 
-  // Returns the device memory tree of this allocation. If 'release' is set, the
-  // ownership of the device memory is transferred to the result.
-  xla::ShapeTree<xla::MaybeOwningDeviceMemory> ToDeviceMemoryTree(bool release);
+  // Aliases the source buffer at source_index into the current tuple allocation
+  // dest_index.
+  Status AliasBufferFrom(const XRTTupleAllocation& source,
+                         const xla::ShapeIndex& source_index,
+                         const xla::ShapeIndex& dest_index);
+
+  // Returns the device memory tree of this allocation. If the release_checker
+  // function returns true for a given index, the ownership of the device memory
+  // at that index is transferred to the result. Every attempt to read the value
+  // at that index will fail.
+  xla::ShapeTree<xla::MaybeOwningDeviceMemory> ToDeviceMemoryTree(
+      const std::function<bool(const xla::ShapeIndex&)>& release_checker);
 
-  string DebugString() override { return "XLA allocation handle"; }
+  string DebugString() const override { return "XLA allocation handle"; }
 
  private:
   // Creates a new handle with (tuple) shape.
diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index 832db0f4ab46911e067d17b4a125706c276cf798..0173b8bb064c7b2fb8a0df018204515b24cfa718 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -63,7 +63,6 @@ py_library(
         "//tensorflow/contrib/libsvm",
         "//tensorflow/contrib/linear_optimizer:sdca_estimator_py",
         "//tensorflow/contrib/linear_optimizer:sdca_ops_py",
-        "//tensorflow/contrib/lite/python:lite",
         "//tensorflow/contrib/lookup:lookup_py",
         "//tensorflow/contrib/losses:losses_py",
         "//tensorflow/contrib/losses:metric_learning_py",
@@ -197,7 +196,7 @@ cc_library(
             "//tensorflow/contrib/kinesis:dataset_kernels",
         ],
     }) + if_not_windows([
-        "//tensorflow/contrib/tensorrt:trt_engine_op_kernel",
+        "//tensorflow/contrib/tensorrt:trt_op_kernels",
     ]),
 )
 
@@ -219,7 +218,6 @@ cc_library(
         "//tensorflow/contrib/tensor_forest:stats_ops_op_lib",
         "//tensorflow/contrib/tensor_forest:tensor_forest_ops_op_lib",
         "//tensorflow/contrib/text:all_ops",
-        "//tensorflow/contrib/tpu:all_ops",
     ] + select({
         "//tensorflow:android": [],
         "//tensorflow:ios": [],
@@ -239,7 +237,7 @@ cc_library(
             "//tensorflow/contrib/kinesis:dataset_ops_op_lib",
         ],
     }) + if_not_windows([
-        "//tensorflow/contrib/tensorrt:trt_engine_op_op_lib",
+        "//tensorflow/compiler/tf2tensorrt:trt_op_libs",
     ]) + select({
         "//tensorflow:android": [],
         "//tensorflow:ios": [],
diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py
index 4f1a2a5693235183c8f486817b82c8c81fa389ec..48d5296c71cbdb470fa405b30547a32b7022f29b 100644
--- a/tensorflow/contrib/__init__.py
+++ b/tensorflow/contrib/__init__.py
@@ -20,13 +20,14 @@ from __future__ import division
 from __future__ import print_function
 
 import os
+import platform
 
 # Add projects here, they will show up under tf.contrib.
 from tensorflow.contrib import autograph
 from tensorflow.contrib import batching
 from tensorflow.contrib import bayesflow
 from tensorflow.contrib import checkpoint
-if os.name != "nt":
+if os.name != "nt" and platform.machine() != "s390x":
   from tensorflow.contrib import cloud
 from tensorflow.contrib import cluster_resolver
 from tensorflow.contrib import coder
@@ -91,7 +92,6 @@ from tensorflow.contrib import tpu
 from tensorflow.contrib import training
 from tensorflow.contrib import util
 from tensorflow.contrib.eager.python import tfe as eager
-from tensorflow.contrib.lite.python import lite
 from tensorflow.contrib.optimizer_v2 import optimizer_v2_symbols as optimizer_v2
 from tensorflow.contrib.receptive_field import receptive_field_api as receptive_field
 from tensorflow.contrib.recurrent.python import recurrent_api as recurrent
@@ -103,6 +103,8 @@ from tensorflow.python.util.lazy_loader import LazyLoader
 ffmpeg = LazyLoader("ffmpeg", globals(),
                     "tensorflow.contrib.ffmpeg")
 del os
+del platform
+
 del LazyLoader
 
 del absolute_import
diff --git a/tensorflow/contrib/android/BUILD b/tensorflow/contrib/android/BUILD
index f0b1c92cf7e4b760381da38febd9682ce2a4f27c..5608e7ddafa25757484d8c845c8c84a5691e143c 100644
--- a/tensorflow/contrib/android/BUILD
+++ b/tensorflow/contrib/android/BUILD
@@ -73,8 +73,7 @@ cc_binary(
         "-z defs",
         "-s",
         "-Wl,--gc-sections",
-        "-Wl,--version-script",  # This line must be directly followed by LINKER_SCRIPT.
-        "$(location {})".format(LINKER_SCRIPT),
+        "-Wl,--version-script,$(location {})".format(LINKER_SCRIPT),
     ]),
     linkshared = 1,
     linkstatic = 1,
diff --git a/tensorflow/contrib/autograph/examples/notebooks/rnn_keras_estimator.ipynb b/tensorflow/contrib/autograph/examples/notebooks/rnn_keras_estimator.ipynb
index 44532cb078f9bd1578172f8a7d8a4b55cd21a7cb..831c613f2c8c9a4fcc2cb9d313077fe79ee96fd7 100644
--- a/tensorflow/contrib/autograph/examples/notebooks/rnn_keras_estimator.ipynb
+++ b/tensorflow/contrib/autograph/examples/notebooks/rnn_keras_estimator.ipynb
@@ -186,8 +186,8 @@
         "\n",
         "  def __init__(self):\n",
         "    super(RnnColorbot, self).__init__()\n",
-        "    self.lower_cell = tf.contrib.rnn.LSTMBlockCell(256)\n",
-        "    self.upper_cell = tf.contrib.rnn.LSTMBlockCell(128)\n",
+        "    self.lower_cell = tf.contrib.rnn.LSTMBlockCell(256, dtype=tf.float32)\n",
+        "    self.upper_cell = tf.contrib.rnn.LSTMBlockCell(128, dtype=tf.float32)\n",
         "    self.relu_layer = tf.layers.Dense(3, activation=tf.nn.relu)\n",
         "\n",
         "  def _rnn_layer(self, chars, cell, batch_size, training):\n",
@@ -241,7 +241,7 @@
         "    seq = self._rnn_layer(seq, self.upper_cell, batch_size, training)\n",
         "\n",
         "    # Grab just the end-of-sequence from each output.\n",
-        "    indices = (length - 1, range(batch_size))\n",
+        "    indices = (length - 1, list(range(batch_size)))\n",
         "    indices = tf.stack(indices, 1)\n",
         "    sequence_ends = tf.gather_nd(seq, indices)\n",
         "    return self.relu_layer(sequence_ends)\n",
diff --git a/tensorflow/contrib/batching/BUILD b/tensorflow/contrib/batching/BUILD
index 648f3ebb05646a66144bcb118347cbc391909409..5174afe0a63d37e3ea3e19ac9bab644d1d83ecf1 100644
--- a/tensorflow/contrib/batching/BUILD
+++ b/tensorflow/contrib/batching/BUILD
@@ -37,6 +37,7 @@ py_library(
 cc_library(
     name = "batch_ops_kernels",
     deps = [
+        "//tensorflow/core:batch_ops_op_lib",
         "//tensorflow/core/kernels:batch_kernels",
     ],
     alwayslink = 1,
diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_kernels.cc b/tensorflow/contrib/bigtable/kernels/bigtable_kernels.cc
index 6138d7912601344ef7422fd50fb35c8401fd2e63..f0637595db08cbeb3b3ee0c94c5399df4c8c83e6 100644
--- a/tensorflow/contrib/bigtable/kernels/bigtable_kernels.cc
+++ b/tensorflow/contrib/bigtable/kernels/bigtable_kernels.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include "tensorflow/core/lib/core/threadpool.h"
 
 namespace tensorflow {
-
 namespace {
 
 class BigtableClientOp : public OpKernel {
@@ -341,8 +340,8 @@ class ToBigtableOp : public AsyncOpKernel {
   }
 
   template <typename T>
-  Status ParseScalarArgument(OpKernelContext* ctx,
-                             const StringPiece& argument_name, T* output) {
+  Status ParseScalarArgument(OpKernelContext* ctx, StringPiece argument_name,
+                             T* output) {
     const Tensor* argument_t;
     TF_RETURN_IF_ERROR(ctx->input(argument_name, &argument_t));
     if (!TensorShapeUtils::IsScalar(argument_t->shape())) {
@@ -360,5 +359,4 @@ REGISTER_KERNEL_BUILDER(Name("DatasetToBigtable").Device(DEVICE_CPU),
 
 }  // namespace
 }  // namespace data
-
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_lib.h b/tensorflow/contrib/bigtable/kernels/bigtable_lib.h
index 4652021fecabfa11fa6a8754dc884d89e151b590..e3b4535bac4a01a1277290e0d1ea6d3c7613731c 100644
--- a/tensorflow/contrib/bigtable/kernels/bigtable_lib.h
+++ b/tensorflow/contrib/bigtable/kernels/bigtable_lib.h
@@ -42,7 +42,7 @@ class BigtableClientResource : public ResourceBase {
     return client_;
   }
 
-  string DebugString() override {
+  string DebugString() const override {
     return strings::StrCat("BigtableClientResource(project_id: ", project_id_,
                            ", instance_id: ", instance_id_, ")");
   }
@@ -67,7 +67,7 @@ class BigtableTableResource : public ResourceBase {
 
   ::google::cloud::bigtable::noex::Table& table() { return table_; }
 
-  string DebugString() override {
+  string DebugString() const override {
     return strings::StrCat(
         "BigtableTableResource(client: ", client_->DebugString(),
         ", table: ", table_name_, ")");
diff --git a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc
index e95dc577184f7e81d942755b41065f52131ce9f6..d9fce6e09f47ab05074f0b4c03dd8e672ed3d2ce 100644
--- a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc
+++ b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h"
 
-#include "google/bigtable/v2/data.pb.h"
+#include "external/com_github_googleapis_googleapis/google/bigtable/v2/data.pb.h"
 #include "google/protobuf/wrappers.pb.h"
 #include "re2/re2.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
@@ -335,6 +335,17 @@ grpc::Status BigtableTestClient::ReadModifyWriteRow(
   return grpc::Status(grpc::StatusCode::UNIMPLEMENTED,
                       "ReadModifyWriteRow not implemented.");
 }
+std::unique_ptr<grpc::ClientAsyncResponseReaderInterface<
+    google::bigtable::v2::ReadModifyWriteRowResponse>>
+BigtableTestClient::AsyncReadModifyWriteRow(
+    grpc::ClientContext* context,
+    google::bigtable::v2::ReadModifyWriteRowRequest const& request,
+    grpc::CompletionQueue* cq) {
+  LOG(WARNING) << "Call to AsyncReadModifyWriteRow:" << __func__
+               << "(); this will likely cause a crash!";
+  return nullptr;
+}
+
 std::unique_ptr<
     grpc::ClientReaderInterface<google::bigtable::v2::ReadRowsResponse>>
 BigtableTestClient::ReadRows(
@@ -399,6 +410,28 @@ BigtableTestClient::AsyncMutateRows(
   return nullptr;
 }
 
+std::unique_ptr<grpc::ClientAsyncResponseReaderInterface<
+    google::bigtable::v2::CheckAndMutateRowResponse>>
+BigtableTestClient::AsyncCheckAndMutateRow(
+    grpc::ClientContext* context,
+    const google::bigtable::v2::CheckAndMutateRowRequest& request,
+    grpc::CompletionQueue* cq) {
+  LOG(WARNING) << "Call to InMemoryDataClient::" << __func__
+               << "(); this will likely cause a crash!";
+  return nullptr;
+}
+
+std::unique_ptr<
+    grpc::ClientAsyncReaderInterface<google::bigtable::v2::ReadRowsResponse>>
+BigtableTestClient::AsyncReadRows(
+    grpc::ClientContext* context,
+    const google::bigtable::v2::ReadRowsRequest& request,
+    grpc::CompletionQueue* cq, void* tag) {
+  LOG(WARNING) << "Call to InMemoryDataClient::" << __func__
+               << "(); this will likely cause a crash!";
+  return nullptr;
+}
+
 std::shared_ptr<grpc::Channel> BigtableTestClient::Channel() {
   LOG(WARNING) << "Call to InMemoryDataClient::Channel(); this will likely "
                   "cause a crash!";
diff --git a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h
index c4a1f06bc504c3565c7bb09b42e48e7fbddb9cc6..63d59b32dd17a2f58d3413932b69f4d704c84e48 100644
--- a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h
+++ b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h
@@ -46,6 +46,13 @@ class BigtableTestClient : public ::google::cloud::bigtable::DataClient {
       google::bigtable::v2::ReadModifyWriteRowRequest const& request,
       google::bigtable::v2::ReadModifyWriteRowResponse* response) override;
 
+  std::unique_ptr<grpc::ClientAsyncResponseReaderInterface<
+      google::bigtable::v2::ReadModifyWriteRowResponse>>
+  AsyncReadModifyWriteRow(
+      grpc::ClientContext* context,
+      google::bigtable::v2::ReadModifyWriteRowRequest const& request,
+      grpc::CompletionQueue* cq) override;
+
   std::unique_ptr<
       grpc::ClientReaderInterface<google::bigtable::v2::ReadRowsResponse>>
   ReadRows(grpc::ClientContext* context,
@@ -80,6 +87,19 @@ class BigtableTestClient : public ::google::cloud::bigtable::DataClient {
                   const ::google::bigtable::v2::MutateRowsRequest& request,
                   ::grpc::CompletionQueue* cq, void* tag) override;
 
+  std::unique_ptr<grpc::ClientAsyncResponseReaderInterface<
+      google::bigtable::v2::CheckAndMutateRowResponse>>
+  AsyncCheckAndMutateRow(
+      grpc::ClientContext* context,
+      const google::bigtable::v2::CheckAndMutateRowRequest& request,
+      grpc::CompletionQueue* cq) override;
+
+  std::unique_ptr<
+      grpc::ClientAsyncReaderInterface<google::bigtable::v2::ReadRowsResponse>>
+  AsyncReadRows(grpc::ClientContext* context,
+                const google::bigtable::v2::ReadRowsRequest& request,
+                grpc::CompletionQueue* cq, void* tag) override;
+
   std::shared_ptr<grpc::Channel> Channel() override;
 
  private:
diff --git a/tensorflow/contrib/bigtable/ops/bigtable_ops.cc b/tensorflow/contrib/bigtable/ops/bigtable_ops.cc
index 416b719e30aa5f2504449d151a48e95c9105c68b..39c2a2e775d5d5287b137bf33eef66251738e6d3 100644
--- a/tensorflow/contrib/bigtable/ops/bigtable_ops.cc
+++ b/tensorflow/contrib/bigtable/ops/bigtable_ops.cc
@@ -59,7 +59,7 @@ REGISTER_OP("BigtablePrefixKeyDataset")
     .Input("table: resource")
     .Input("prefix: string")
     .Output("handle: variant")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn(shape_inference::ScalarShape);
 
@@ -68,14 +68,14 @@ REGISTER_OP("BigtableRangeKeyDataset")
     .Input("start_key: string")
     .Input("end_key: string")
     .Output("handle: variant")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("BigtableSampleKeysDataset")
     .Input("table: resource")
     .Output("handle: variant")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn(shape_inference::ScalarShape);
 
@@ -85,7 +85,7 @@ REGISTER_OP("BigtableSampleKeyPairsDataset")
     .Input("start_key: string")
     .Input("end_key: string")
     .Output("handle: variant")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn(shape_inference::ScalarShape);
 
@@ -100,7 +100,7 @@ REGISTER_OP("BigtableScanDataset")
     .Input("columns: string")
     .Input("probability: float")
     .Output("handle: variant")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn(shape_inference::ScalarShape);
 
diff --git a/tensorflow/contrib/bigtable/python/ops/bigtable_api.py b/tensorflow/contrib/bigtable/python/ops/bigtable_api.py
index b6cdc7aab0320fe5f457288ada03a46e18a694cc..fa64055dfd65a134afdf46cebccb7f7d96106502 100644
--- a/tensorflow/contrib/bigtable/python/ops/bigtable_api.py
+++ b/tensorflow/contrib/bigtable/python/ops/bigtable_api.py
@@ -489,7 +489,7 @@ class BigtableTable(object):
                        "len(dataset.output_types))")
     return gen_bigtable_ops.dataset_to_bigtable(
         self._resource,
-        dataset._as_variant_tensor(),  # pylint: disable=protected-access
+        dataset._variant_tensor,  # pylint: disable=protected-access
         column_families,
         columns,
         timestamp)
@@ -582,13 +582,14 @@ class _BigtableKeyDataset(dataset_ops.DatasetSource):
   """_BigtableKeyDataset is an abstract class representing the keys of a table.
   """
 
-  def __init__(self, table):
+  def __init__(self, table, variant_tensor):
     """Constructs a _BigtableKeyDataset.
 
     Args:
       table: a Bigtable class.
+      variant_tensor: DT_VARIANT representation of the dataset.
     """
-    super(_BigtableKeyDataset, self).__init__()
+    super(_BigtableKeyDataset, self).__init__(variant_tensor)
     self._table = table
 
   @property
@@ -601,13 +602,11 @@ class _BigtablePrefixKeyDataset(_BigtableKeyDataset):
   """
 
   def __init__(self, table, prefix):
-    super(_BigtablePrefixKeyDataset, self).__init__(table)
     self._prefix = prefix
-
-  def _as_variant_tensor(self):
-    return gen_bigtable_ops.bigtable_prefix_key_dataset(
-        table=self._table._resource,  # pylint: disable=protected-access
+    variant_tensor = gen_bigtable_ops.bigtable_prefix_key_dataset(
+        table=table._resource,  # pylint: disable=protected-access
         prefix=self._prefix)
+    super(_BigtablePrefixKeyDataset, self).__init__(table, variant_tensor)
 
 
 class _BigtableRangeKeyDataset(_BigtableKeyDataset):
@@ -615,15 +614,13 @@ class _BigtableRangeKeyDataset(_BigtableKeyDataset):
   """
 
   def __init__(self, table, start, end):
-    super(_BigtableRangeKeyDataset, self).__init__(table)
     self._start = start
     self._end = end
-
-  def _as_variant_tensor(self):
-    return gen_bigtable_ops.bigtable_range_key_dataset(
-        table=self._table._resource,  # pylint: disable=protected-access
+    variant_tensor = gen_bigtable_ops.bigtable_range_key_dataset(
+        table=table._resource,  # pylint: disable=protected-access
         start_key=self._start,
         end_key=self._end)
+    super(_BigtableRangeKeyDataset, self).__init__(table, variant_tensor)
 
 
 class _BigtableSampleKeysDataset(_BigtableKeyDataset):
@@ -633,11 +630,9 @@ class _BigtableSampleKeysDataset(_BigtableKeyDataset):
   # TODO(saeta): Expose the data size offsets into the keys.
 
   def __init__(self, table):
-    super(_BigtableSampleKeysDataset, self).__init__(table)
-
-  def _as_variant_tensor(self):
-    return gen_bigtable_ops.bigtable_sample_keys_dataset(
-        table=self._table._resource)  # pylint: disable=protected-access
+    variant_tensor = gen_bigtable_ops.bigtable_sample_keys_dataset(
+        table=table._resource)  # pylint: disable=protected-access
+    super(_BigtableSampleKeysDataset, self).__init__(table, variant_tensor)
 
 
 class _BigtableLookupDataset(dataset_ops.DatasetSource):
@@ -651,20 +646,18 @@ class _BigtableLookupDataset(dataset_ops.DatasetSource):
     self._normalized = normalized
     self._column_families = [i[0] for i in normalized]
     self._columns = [i[1] for i in normalized]
+    variant_tensor = gen_bigtable_ops.bigtable_lookup_dataset(
+        keys_dataset=self._dataset._variant_tensor,  # pylint: disable=protected-access
+        table=self._table._resource,  # pylint: disable=protected-access
+        column_families=self._column_families,
+        columns=self._columns)
+    super(_BigtableLookupDataset, self).__init__(variant_tensor)
 
   @property
   def _element_structure(self):
     return structure.NestedStructure(tuple(
         [structure.TensorStructure(dtypes.string, [])] * self._num_outputs))
 
-  def _as_variant_tensor(self):
-    # pylint: disable=protected-access
-    return gen_bigtable_ops.bigtable_lookup_dataset(
-        keys_dataset=self._dataset._as_variant_tensor(),
-        table=self._table._resource,
-        column_families=self._column_families,
-        columns=self._columns)
-
 
 class _BigtableScanDataset(dataset_ops.DatasetSource):
   """_BigtableScanDataset represents a dataset that retrieves keys and values.
@@ -679,14 +672,7 @@ class _BigtableScanDataset(dataset_ops.DatasetSource):
     self._columns = [i[1] for i in normalized]
     self._probability = probability
     self._num_outputs = len(normalized) + 1  # 1 for row key
-
-  @property
-  def _element_structure(self):
-    return structure.NestedStructure(tuple(
-        [structure.TensorStructure(dtypes.string, [])] * self._num_outputs))
-
-  def _as_variant_tensor(self):
-    return gen_bigtable_ops.bigtable_scan_dataset(
+    variant_tensor = gen_bigtable_ops.bigtable_scan_dataset(
         table=self._table._resource,  # pylint: disable=protected-access
         prefix=self._prefix,
         start_key=self._start,
@@ -694,6 +680,13 @@ class _BigtableScanDataset(dataset_ops.DatasetSource):
         column_families=self._column_families,
         columns=self._columns,
         probability=self._probability)
+    super(_BigtableScanDataset, self).__init__(variant_tensor)
+
+  @property
+  def _element_structure(self):
+    return structure.NestedStructure(
+        tuple(
+            [structure.TensorStructure(dtypes.string, [])] * self._num_outputs))
 
 
 class _BigtableSampleKeyPairsDataset(dataset_ops.DatasetSource):
@@ -705,17 +698,15 @@ class _BigtableSampleKeyPairsDataset(dataset_ops.DatasetSource):
     self._prefix = prefix
     self._start = start
     self._end = end
+    variant_tensor = gen_bigtable_ops.bigtable_sample_key_pairs_dataset(
+        table=self._table._resource,  # pylint: disable=protected-access
+        prefix=self._prefix,
+        start_key=self._start,
+        end_key=self._end)
+    super(_BigtableSampleKeyPairsDataset, self).__init__(variant_tensor)
 
   @property
   def _element_structure(self):
     return structure.NestedStructure(
         (structure.TensorStructure(dtypes.string, []),
          structure.TensorStructure(dtypes.string, [])))
-
-  def _as_variant_tensor(self):
-    # pylint: disable=protected-access
-    return gen_bigtable_ops.bigtable_sample_key_pairs_dataset(
-        table=self._table._resource,
-        prefix=self._prefix,
-        start_key=self._start,
-        end_key=self._end)
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/BUILD b/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
index d3b23d949ee2c7674c3918d39e8b71d76eefcfec..64e4c4560ba3a1b177db12a09997ff7afe8775a3 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
@@ -193,8 +193,9 @@ py_test(
 
 py_test(
     name = "estimator_test",
-    size = "large",
+    size = "medium",
     srcs = ["estimator_test.py"],
+    shard_count = 4,
     srcs_version = "PY2AND3",
     tags = [
         "no_gpu",
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py
index a178820841c4c8bcb7f5742babdb6d0f4825de31..5ffbb9067081d7440ab5e11290697b822051bee5 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py
@@ -84,12 +84,10 @@ class GradientBoostedDecisionTreeClassifier(estimator.Estimator):
       output_leaf_index: whether to output leaf indices along with predictions
         during inference. The leaf node indexes are available in predictions
         dict by the key 'leaf_index'. It is a Tensor of rank 2 and its shape is
-        [batch_size, num_trees].
-        For example,
-        result_iter = classifier.predict(...)
-        for result_dict in result_iter:
-          # access leaf index list by result_dict["leaf_index"]
-          # which contains one leaf index per tree
+        [batch_size, num_trees]. For example, result_iter =
+        classifier.predict(...)
+        for result_dict in result_iter: # access leaf index list by
+          result_dict["leaf_index"] # which contains one leaf index per tree
       override_global_step_value: If after the training is done, global step
         value must be reset to this value. This should be used to reset global
         step to a number > number of steps used to train the current ensemble.
@@ -179,8 +177,8 @@ class GradientBoostedDecisionTreeRegressor(estimator.Estimator):
         `[batch_size, label_dimension]`).
       num_trees: An int, number of trees to build.
       feature_columns: A list of feature columns.
-      label_name: String, name of the key in label dict. Can be null if label
-          is a tensor (single headed models).
+      label_name: String, name of the key in label dict. Can be null if label is
+        a tensor (single headed models).
       weight_column_name: Name of the column for weights, or None if not
         weighted.
       model_dir: Directory for model exports, etc.
@@ -195,11 +193,11 @@ class GradientBoostedDecisionTreeRegressor(estimator.Estimator):
         opposed to contrib) version of tensorflow.
       output_leaf_index: whether to output leaf indices along with predictions
         during inference. The leaf node indexes are available in predictions
-        dict by the key 'leaf_index'. For example,
-        result_dict = classifier.predict(...)
-        for example_prediction_result in result_dict:
-          # access leaf index list by example_prediction_result["leaf_index"]
-          # which contains one leaf index per tree
+        dict by the key 'leaf_index'. For example, result_dict =
+        classifier.predict(...)
+        for example_prediction_result in result_dict: # access leaf index list
+          by example_prediction_result["leaf_index"] # which contains one leaf
+          index per tree
       override_global_step_value: If after the training is done, global step
         value must be reset to this value. This should be used to reset global
         step to a number > number of steps used to train the current ensemble.
@@ -286,11 +284,11 @@ class GradientBoostedDecisionTreeEstimator(estimator.Estimator):
         opposed to contrib) version of tensorflow.
       output_leaf_index: whether to output leaf indices along with predictions
         during inference. The leaf node indexes are available in predictions
-        dict by the key 'leaf_index'. For example,
-        result_dict = classifier.predict(...)
-        for example_prediction_result in result_dict:
-          # access leaf index list by example_prediction_result["leaf_index"]
-          # which contains one leaf index per tree
+        dict by the key 'leaf_index'. For example, result_dict =
+        classifier.predict(...)
+        for example_prediction_result in result_dict: # access leaf index list
+          by example_prediction_result["leaf_index"] # which contains one leaf
+          index per tree
       override_global_step_value: If after the training is done, global step
         value must be reset to this value. This should be used to reset global
         step to a number > number of steps used to train the current ensemble.
@@ -353,10 +351,9 @@ class GradientBoostedDecisionTreeRanker(estimator.Estimator):
         layer. It can also be a function that computes the number of examples
         based on the depth of the layer that's being built.
       head: `Head` instance.
-      ranking_model_pair_keys: Keys to distinguish between features
-        for left and right part of the training pairs for ranking. For example,
-        for an Example with features "a.f1" and "b.f1", the keys would be
-        ("a", "b").
+      ranking_model_pair_keys: Keys to distinguish between features for left and
+        right part of the training pairs for ranking. For example, for an
+        Example with features "a.f1" and "b.f1", the keys would be ("a", "b").
       num_trees: An int, number of trees to build.
       feature_columns: A list of feature columns.
       weight_column_name: Name of the column for weights, or None if not
@@ -376,12 +373,10 @@ class GradientBoostedDecisionTreeRanker(estimator.Estimator):
       output_leaf_index: whether to output leaf indices along with predictions
         during inference. The leaf node indexes are available in predictions
         dict by the key 'leaf_index'. It is a Tensor of rank 2 and its shape is
-        [batch_size, num_trees].
-        For example,
-        result_iter = classifier.predict(...)
-        for result_dict in result_iter:
-          # access leaf index list by result_dict["leaf_index"]
-          # which contains one leaf index per tree
+        [batch_size, num_trees]. For example, result_iter =
+        classifier.predict(...)
+        for result_dict in result_iter: # access leaf index list by
+          result_dict["leaf_index"] # which contains one leaf index per tree
       override_global_step_value: If after the training is done, global step
         value must be reset to this value. This should be used to reset global
         step to a number > number of steps used to train the current ensemble.
@@ -417,12 +412,12 @@ class GradientBoostedDecisionTreeRanker(estimator.Estimator):
         config=config,
         feature_engineering_fn=feature_engineering_fn)
 
+
 # When using this estimator, make sure to regularize the hessian (at least l2,
 # min_node_weight)!
 # TODO(nponomareva): extend to take multiple quantiles in one go.
 class GradientBoostedDecisionTreeQuantileRegressor(estimator.Estimator):
-  """An estimator that does quantile regression and returns quantile estimates.
-  """
+  """An estimator that does quantile regression and returns quantile estimates."""
 
   def __init__(self,
                learner_config,
@@ -449,8 +444,8 @@ class GradientBoostedDecisionTreeQuantileRegressor(estimator.Estimator):
         layer. It can also be a function that computes the number of examples
         based on the depth of the layer that's being built.
       quantiles: a list of quantiles for the loss, each between 0 and 1.
-      label_dimension: Dimension of regression label. This is the size
-        of the last dimension of the labels `Tensor` (typically, this has shape
+      label_dimension: Dimension of regression label. This is the size of the
+        last dimension of the labels `Tensor` (typically, this has shape
         `[batch_size, label_dimension]`). When label_dimension>1, it is
         recommended to use multiclass strategy diagonal hessian or full hessian.
       num_trees: An int, number of trees to build.
@@ -469,11 +464,11 @@ class GradientBoostedDecisionTreeQuantileRegressor(estimator.Estimator):
         opposed to contrib) version of tensorflow.
       output_leaf_index: whether to output leaf indices along with predictions
         during inference. The leaf node indexes are available in predictions
-        dict by the key 'leaf_index'. For example,
-        result_dict = classifier.predict(...)
-        for example_prediction_result in result_dict:
-          # access leaf index list by example_prediction_result["leaf_index"]
-          # which contains one leaf index per tree
+        dict by the key 'leaf_index'. For example, result_dict =
+        classifier.predict(...)
+        for example_prediction_result in result_dict: # access leaf index list
+          by example_prediction_result["leaf_index"] # which contains one leaf
+          index per tree
       override_global_step_value: If after the training is done, global step
         value must be reset to this value. This should be used to reset global
         step to a number > number of steps used to train the current ensemble.
@@ -519,6 +514,7 @@ class GradientBoostedDecisionTreeQuantileRegressor(estimator.Estimator):
         config=config,
         feature_engineering_fn=feature_engineering_fn)
 
+
 # ================== New Estimator interface===================================
 # The estimators below use new core Estimator interface and must be used with
 # new feature columns and heads.
@@ -534,10 +530,8 @@ def core_multiclass_head(
 
   def loss_fn(labels, logits):
     result = losses.per_example_maxent_loss(
-        labels=labels,
-        logits=logits,
-        weights=weight_column,
-        num_classes=n_classes)
+        # Don't pass the weights: head already multiplies by them.
+        labels=labels, logits=logits, weights=None, num_classes=n_classes)
     return result[0]
 
   # pylint:disable=protected-access
@@ -564,7 +558,8 @@ def core_quantile_regression_head(
     result = losses.per_example_quantile_regression_loss(
         labels=labels,
         predictions=logits,
-        weights=weight_column,
+        # Don't pass the weights: head already multiplies by them.
+        weights=None,
         quantile=quantiles)
     return result[0]
 
@@ -623,11 +618,11 @@ class CoreGradientBoostedDecisionTreeEstimator(core_estimator.Estimator):
         the bias.
       output_leaf_index: whether to output leaf indices along with predictions
         during inference. The leaf node indexes are available in predictions
-        dict by the key 'leaf_index'. For example,
-        result_dict = classifier.predict(...)
-        for example_prediction_result in result_dict:
-          # access leaf index list by example_prediction_result["leaf_index"]
-          # which contains one leaf index per tree
+        dict by the key 'leaf_index'. For example, result_dict =
+        classifier.predict(...)
+        for example_prediction_result in result_dict: # access leaf index list
+          by example_prediction_result["leaf_index"] # which contains one leaf
+          index per tree
       num_quantiles: Number of quantiles to build for numeric feature values.
     """
 
@@ -685,10 +680,9 @@ class CoreGradientBoostedDecisionTreeRanker(core_estimator.Estimator):
         layer. It can also be a function that computes the number of examples
         based on the depth of the layer that's being built.
       head: `Head` instance.
-      ranking_model_pair_keys: Keys to distinguish between features
-        for left and right part of the training pairs for ranking. For example,
-        for an Example with features "a.f1" and "b.f1", the keys would be
-        ("a", "b").
+      ranking_model_pair_keys: Keys to distinguish between features for left and
+        right part of the training pairs for ranking. For example, for an
+        Example with features "a.f1" and "b.f1", the keys would be ("a", "b").
       num_trees: An int, number of trees to build.
       feature_columns: A list of feature columns.
       weight_column_name: Name of the column for weights, or None if not
@@ -703,12 +697,10 @@ class CoreGradientBoostedDecisionTreeRanker(core_estimator.Estimator):
       output_leaf_index: whether to output leaf indices along with predictions
         during inference. The leaf node indexes are available in predictions
         dict by the key 'leaf_index'. It is a Tensor of rank 2 and its shape is
-        [batch_size, num_trees].
-        For example,
-        result_iter = classifier.predict(...)
-        for result_dict in result_iter:
-          # access leaf index list by result_dict["leaf_index"]
-          # which contains one leaf index per tree
+        [batch_size, num_trees]. For example, result_iter =
+        classifier.predict(...)
+        for result_dict in result_iter: # access leaf index list by
+          result_dict["leaf_index"] # which contains one leaf index per tree
       num_quantiles: Number of quantiles to build for numeric feature values.
 
     Raises:
@@ -748,8 +740,7 @@ class CoreGradientBoostedDecisionTreeRanker(core_estimator.Estimator):
 # TODO(nponomareva): extend to take multiple quantiles in one go.
 class CoreGradientBoostedDecisionTreeQuantileRegressor(
     core_estimator.Estimator):
-  """An estimator that does quantile regression and returns quantile estimates.
-  """
+  """An estimator that does quantile regression and returns quantile estimates."""
 
   def __init__(self,
                learner_config,
@@ -775,8 +766,8 @@ class CoreGradientBoostedDecisionTreeQuantileRegressor(
         layer. It can also be a function that computes the number of examples
         based on the depth of the layer that's being built.
       quantiles: a list of quantiles for the loss, each between 0 and 1.
-      label_dimension: Dimension of regression label. This is the size
-        of the last dimension of the labels `Tensor` (typically, this has shape
+      label_dimension: Dimension of regression label. This is the size of the
+        last dimension of the labels `Tensor` (typically, this has shape
         `[batch_size, label_dimension]`). When label_dimension>1, it is
         recommended to use multiclass strategy diagonal hessian or full hessian.
       num_trees: An int, number of trees to build.
@@ -795,11 +786,11 @@ class CoreGradientBoostedDecisionTreeQuantileRegressor(
         the bias.
       output_leaf_index: whether to output leaf indices along with predictions
         during inference. The leaf node indexes are available in predictions
-        dict by the key 'leaf_index'. For example,
-        result_dict = classifier.predict(...)
-        for example_prediction_result in result_dict:
-          # access leaf index list by example_prediction_result["leaf_index"]
-          # which contains one leaf index per tree
+        dict by the key 'leaf_index'. For example, result_dict =
+        classifier.predict(...)
+        for example_prediction_result in result_dict: # access leaf index list
+          by example_prediction_result["leaf_index"] # which contains one leaf
+          index per tree
       num_quantiles: Number of quantiles to build for numeric feature values.
     """
     if len(quantiles) > 1:
@@ -814,7 +805,9 @@ class CoreGradientBoostedDecisionTreeQuantileRegressor(
           params={
               'head':
                   core_quantile_regression_head(
-                      quantiles[0], label_dimension=label_dimension),
+                      quantiles[0],
+                      label_dimension=label_dimension,
+                      weight_column=weight_column_name),
               'feature_columns':
                   feature_columns,
               'learner_config':
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
index ee052ac60387d8f993e4942dd7dff39e191dd3a4..5a8b2ba9caf0a9813cb5b3409b8a0dc3de0a45d7 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
@@ -399,8 +399,8 @@ class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase):
   def testQuantileRegression(self):
     learner_config = learner_pb2.LearnerConfig()
     learner_config.num_classes = 2
-    learner_config.constraints.max_tree_depth = 3
-    learner_config.growing_mode = learner_pb2.LearnerConfig.WHOLE_TREE
+    learner_config.constraints.max_tree_depth = 6
+    learner_config.growing_mode = learner_pb2.LearnerConfig.LAYER_BY_LAYER
     learner_config.constraints.min_node_weight = 1 / _QUANTILE_REGRESSION_SIZE
     learner_config.regularization.l2 = 1.0 / _QUANTILE_REGRESSION_SIZE
     learner_config.regularization.l1 = 1.0 / _QUANTILE_REGRESSION_SIZE
@@ -413,7 +413,7 @@ class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase):
     model_upper = estimator.GradientBoostedDecisionTreeQuantileRegressor(
         quantiles=[0.95],
         learner_config=learner_config,
-        num_trees=100,
+        num_trees=12,
         examples_per_layer=_QUANTILE_REGRESSION_SIZE,
         center_bias=False)
 
@@ -428,31 +428,12 @@ class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase):
     self.assertTrue(frac_below_upper >= 0.92)
     self.assertTrue(frac_below_upper <= 0.98)
 
-    train_input_fn, test_input_fn, _ = _quantile_regression_input_fns()
-    model_lower = estimator.GradientBoostedDecisionTreeQuantileRegressor(
-        quantiles=[0.05],
-        learner_config=learner_config,
-        num_trees=100,
-        examples_per_layer=_QUANTILE_REGRESSION_SIZE,
-        center_bias=False)
-
-    model_lower.fit(input_fn=train_input_fn, steps=1000)
-    result_iter = model_lower.predict(input_fn=test_input_fn)
-    lower = []
-    for prediction_dict in result_iter:
-      lower.append(prediction_dict["scores"])
-
-    frac_above_lower = round(1. * np.count_nonzero(lower < y) / len(y), 3)
-    # +/- 3%
-    self.assertTrue(frac_above_lower >= 0.92)
-    self.assertTrue(frac_above_lower <= 0.98)
-
   # Multi-dimensional quantile regression.
   def testQuantileRegressionMultiDimLabel(self):
     learner_config = learner_pb2.LearnerConfig()
     learner_config.num_classes = 2
-    learner_config.constraints.max_tree_depth = 3
-    learner_config.growing_mode = learner_pb2.LearnerConfig.WHOLE_TREE
+    learner_config.constraints.max_tree_depth = 6
+    learner_config.growing_mode = learner_pb2.LearnerConfig.LAYER_BY_LAYER
     learner_config.constraints.min_node_weight = 1 / _QUANTILE_REGRESSION_SIZE
     learner_config.regularization.l2 = 1.0 / _QUANTILE_REGRESSION_SIZE
     learner_config.regularization.l1 = 1.0 / _QUANTILE_REGRESSION_SIZE
@@ -467,7 +448,7 @@ class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase):
         quantiles=[0.95],
         learner_config=learner_config,
         label_dimension=2,
-        num_trees=100,
+        num_trees=18,
         examples_per_layer=_QUANTILE_REGRESSION_SIZE,
         center_bias=False)
 
@@ -487,37 +468,8 @@ class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase):
     self.assertTrue(frac_below_upper_0 <= 0.98)
     self.assertTrue(frac_below_upper_1 >= 0.92)
     self.assertTrue(frac_below_upper_1 <= 0.98)
-    self.assertTrue(frac_both_below_upper >= 0.92)
-    self.assertTrue(frac_both_below_upper <= 0.98)
-
-    train_input_fn, test_input_fn, _ = _quantile_regression_input_fns(
-        two_dimension=True)
-    model_lower = estimator.GradientBoostedDecisionTreeQuantileRegressor(
-        quantiles=[0.05],
-        learner_config=learner_config,
-        label_dimension=2,
-        num_trees=100,
-        examples_per_layer=_QUANTILE_REGRESSION_SIZE,
-        center_bias=False)
-
-    model_lower.fit(input_fn=train_input_fn, steps=1000)
-    result_iter = model_lower.predict(input_fn=test_input_fn)
-    lower = []
-    for prediction_dict in result_iter:
-      lower.append(prediction_dict["scores"])
-
-    count_above_lower = np.count_nonzero(lower < y, axis=0)
-    count_both_aboce_lower = np.count_nonzero(np.prod(lower < y, axis=1))
-    frac_above_lower_0 = round(1. * count_above_lower[0] / len(y), 3)
-    frac_above_lower_1 = round(1. * count_above_lower[1] / len(y), 3)
-    frac_both_above_lower = round(1. * count_both_aboce_lower / len(y), 3)
-    # +/- 3%
-    self.assertTrue(frac_above_lower_0 >= 0.92)
-    self.assertTrue(frac_above_lower_0 <= 0.98)
-    self.assertTrue(frac_above_lower_1 >= 0.92)
-    self.assertTrue(frac_above_lower_1 <= 0.98)
-    self.assertTrue(frac_both_above_lower >= 0.92)
-    self.assertTrue(frac_both_above_lower <= 0.98)
+    self.assertTrue(frac_both_below_upper >= 0.91)
+    self.assertTrue(frac_both_below_upper <= 0.99)
 
 
 class CoreGradientBoostedDecisionTreeEstimators(test_util.TensorFlowTestCase):
@@ -712,11 +664,12 @@ class CoreGradientBoostedDecisionTreeEstimators(test_util.TensorFlowTestCase):
     est.evaluate(input_fn=input_fn, steps=1)
     est.predict(input_fn=input_fn)
 
-  # One dimensional quantile regression.
-  def testQuantileRegression(self):
+  # Quantile regression in core is the same as in non core estimator, so we
+  # just check that it does not fail.
+  def testQuantileRegressionDoesNotThroughException(self):
     learner_config = learner_pb2.LearnerConfig()
     learner_config.num_classes = 2
-    learner_config.constraints.max_tree_depth = 3
+    learner_config.constraints.max_tree_depth = 1
     learner_config.growing_mode = learner_pb2.LearnerConfig.WHOLE_TREE
     learner_config.constraints.min_node_weight = 1 / _QUANTILE_REGRESSION_SIZE
     learner_config.regularization.l2 = 1.0 / _QUANTILE_REGRESSION_SIZE
@@ -731,112 +684,12 @@ class CoreGradientBoostedDecisionTreeEstimators(test_util.TensorFlowTestCase):
     model_upper = estimator.CoreGradientBoostedDecisionTreeQuantileRegressor(
         quantiles=[0.95],
         learner_config=learner_config,
-        num_trees=100,
-        examples_per_layer=_QUANTILE_REGRESSION_SIZE,
-        center_bias=False)
-
-    model_upper.train(input_fn=train_input_fn, steps=1000)
-    result_iter = model_upper.predict(input_fn=test_input_fn)
-    upper = []
-    for prediction_dict in result_iter:
-      upper.append(prediction_dict["predictions"])
-
-    frac_below_upper = round(1. * np.count_nonzero(upper > y) / len(y), 3)
-    # +/- 3%
-    self.assertTrue(frac_below_upper >= 0.92)
-    self.assertTrue(frac_below_upper <= 0.98)
-
-    train_input_fn, test_input_fn, _ = _quantile_regression_input_fns()
-    model_lower = estimator.CoreGradientBoostedDecisionTreeQuantileRegressor(
-        quantiles=[0.05],
-        learner_config=learner_config,
-        num_trees=100,
-        examples_per_layer=_QUANTILE_REGRESSION_SIZE,
-        center_bias=False)
-
-    model_lower.train(input_fn=train_input_fn, steps=1000)
-    result_iter = model_lower.predict(input_fn=test_input_fn)
-    lower = []
-    for prediction_dict in result_iter:
-      lower.append(prediction_dict["predictions"])
-
-    frac_above_lower = round(1. * np.count_nonzero(lower < y) / len(y), 3)
-    # +/- 3%
-    self.assertTrue(frac_above_lower >= 0.92)
-    self.assertTrue(frac_above_lower <= 0.98)
-
-  # Multi-dimensional quantile regression.
-  def testQuantileRegressionMultiDimLabel(self):
-    learner_config = learner_pb2.LearnerConfig()
-    learner_config.num_classes = 2
-    learner_config.constraints.max_tree_depth = 3
-    learner_config.growing_mode = learner_pb2.LearnerConfig.WHOLE_TREE
-    learner_config.constraints.min_node_weight = 1 / _QUANTILE_REGRESSION_SIZE
-    learner_config.regularization.l2 = 1.0 / _QUANTILE_REGRESSION_SIZE
-    learner_config.regularization.l1 = 1.0 / _QUANTILE_REGRESSION_SIZE
-    learner_config.regularization.tree_complexity = (
-        1.0 / _QUANTILE_REGRESSION_SIZE)
-
-    train_input_fn, test_input_fn, y = _quantile_regression_input_fns(
-        two_dimension=True)
-    y = y.reshape(_QUANTILE_REGRESSION_SIZE, 2)
-
-    # 95% percentile.
-    model_upper = estimator.CoreGradientBoostedDecisionTreeQuantileRegressor(
-        quantiles=[0.95],
-        learner_config=learner_config,
-        num_trees=100,
-        label_dimension=2,
+        num_trees=1,
         examples_per_layer=_QUANTILE_REGRESSION_SIZE,
         center_bias=False)
 
     model_upper.train(input_fn=train_input_fn, steps=1000)
     result_iter = model_upper.predict(input_fn=test_input_fn)
-    upper = []
-    for prediction_dict in result_iter:
-      upper.append(prediction_dict["predictions"])
-
-    count_below_upper = np.count_nonzero(upper > y, axis=0)
-    count_both_below_upper = np.count_nonzero(np.prod(upper > y, axis=1))
-    frac_below_upper_0 = round(1. * count_below_upper[0] / len(y), 3)
-    frac_below_upper_1 = round(1. * count_below_upper[1] / len(y), 3)
-    frac_both_below_upper = round(1. * count_both_below_upper / len(y), 3)
-    # +/- 3%
-    self.assertTrue(frac_below_upper_0 >= 0.92)
-    self.assertTrue(frac_below_upper_0 <= 0.98)
-    self.assertTrue(frac_below_upper_1 >= 0.92)
-    self.assertTrue(frac_below_upper_1 <= 0.98)
-    self.assertTrue(frac_both_below_upper >= 0.92)
-    self.assertTrue(frac_both_below_upper <= 0.98)
-
-    train_input_fn, test_input_fn, _ = _quantile_regression_input_fns(
-        two_dimension=True)
-    model_lower = estimator.CoreGradientBoostedDecisionTreeQuantileRegressor(
-        quantiles=[0.05],
-        learner_config=learner_config,
-        num_trees=100,
-        label_dimension=2,
-        examples_per_layer=_QUANTILE_REGRESSION_SIZE,
-        center_bias=False)
-
-    model_lower.train(input_fn=train_input_fn, steps=1000)
-    result_iter = model_lower.predict(input_fn=test_input_fn)
-    lower = []
-    for prediction_dict in result_iter:
-      lower.append(prediction_dict["predictions"])
-
-    count_above_lower = np.count_nonzero(lower < y, axis=0)
-    count_both_aboce_lower = np.count_nonzero(np.prod(lower < y, axis=1))
-    frac_above_lower_0 = round(1. * count_above_lower[0] / len(y), 3)
-    frac_above_lower_1 = round(1. * count_above_lower[1] / len(y), 3)
-    frac_both_above_lower = round(1. * count_both_aboce_lower / len(y), 3)
-    # +/- 3%
-    self.assertTrue(frac_above_lower_0 >= 0.92)
-    self.assertTrue(frac_above_lower_0 <= 0.98)
-    self.assertTrue(frac_above_lower_1 >= 0.92)
-    self.assertTrue(frac_above_lower_1 <= 0.98)
-    self.assertTrue(frac_both_above_lower >= 0.92)
-    self.assertTrue(frac_both_above_lower <= 0.98)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/model.py b/tensorflow/contrib/boosted_trees/estimator_batch/model.py
index a6e422847d3914188bca9e6dff797ba1ffb06749..eecf3c5aeb6c6785cae3fd5808954a73db6190d6 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/model.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/model.py
@@ -25,6 +25,7 @@ from tensorflow.contrib.boosted_trees.estimator_batch import estimator_utils
 from tensorflow.contrib.boosted_trees.estimator_batch import trainer_hooks
 from tensorflow.contrib.boosted_trees.python.ops import model_ops
 from tensorflow.contrib.boosted_trees.python.training.functions import gbdt_batch
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.training import training_util
@@ -88,6 +89,12 @@ def model_builder(features,
 
   if config is None:
     raise ValueError("Missing estimator RunConfig.")
+  if config.session_config is not None:
+    session_config = config.session_config
+    session_config.allow_soft_placement = True
+  else:
+    session_config = config_pb2.ConfigProto(allow_soft_placement=True)
+  config = config.replace(session_config=session_config)
 
   center_bias = params["center_bias"]
 
diff --git a/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc b/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
index 6d78e27e8f69ea289b686af8402bd91967f997f4..65276242abaf96de8b1936365278b18f8bba93a9 100644
--- a/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
@@ -538,7 +538,6 @@ class BuildSparseInequalitySplitsOp : public OpKernel {
           partition_boundaries[non_empty_partitions[root_idx]];
 
       float best_gain = std::numeric_limits<float>::lowest();
-      int32 best_dimension_idx = 0;
       bool default_right = false;
       int32 best_element_idx = 0;
 
@@ -571,7 +570,6 @@ class BuildSparseInequalitySplitsOp : public OpKernel {
       // Iterate through dimensions.
       for (int j = 0; j < dimension_boundaries.size() - 1; ++j) {
         const DimensionBoundary& dimension_and_start = dimension_boundaries[j];
-        const int32 dimension_id = dimension_and_start.dimension_id;
 
         int start_index = dimension_and_start.start_index;
         // Even for the last dimension, we always have additional dummy
@@ -630,7 +628,6 @@ class BuildSparseInequalitySplitsOp : public OpKernel {
               best_right_node_stats = right_stats_default_left;
               best_element_idx = element_idx;
               default_right = false;
-              best_dimension_idx = dimension_id;
             }
           }
           // Consider calculating the default direction only when there were
@@ -648,7 +645,6 @@ class BuildSparseInequalitySplitsOp : public OpKernel {
               best_right_node_stats = right_stats_default_right;
               best_element_idx = element_idx;
               default_right = true;
-              best_dimension_idx = dimension_id;
             }
           }
         }
diff --git a/tensorflow/contrib/boosted_trees/kernels/stats_accumulator_ops.cc b/tensorflow/contrib/boosted_trees/kernels/stats_accumulator_ops.cc
index e446c411a8d5075563b8f8b912b29df310e16c8c..6faf6963011b698a3b233329d87471da7608e44a 100644
--- a/tensorflow/contrib/boosted_trees/kernels/stats_accumulator_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/stats_accumulator_ops.cc
@@ -96,7 +96,7 @@ class StatsAccumulatorResource : public boosted_trees::StampedResource {
              TensorShapeUtils::IsScalar(hessian_shape));
   }
 
-  string DebugString() override {
+  string DebugString() const override {
     return strings::StrCat("StatsAccumulatorResource[size=", values_.size(),
                            "]");
   }
diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/model_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/model_ops_test.py
index 42d69645acaae063fcd46bd1f6c819ccb68f48bd..aa3f24f08a0f762507df83def72e7d595265221f 100644
--- a/tensorflow/contrib/boosted_trees/python/kernel_tests/model_ops_test.py
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/model_ops_test.py
@@ -227,7 +227,7 @@ class ModelOpsTest(test_util.TensorFlowTestCase):
             tree_ensemble_config=tree_ensemble_config.SerializeToString(),
             name="restore_tree")
         resources.initialize_resources(resources.shared_resources()).run()
-        variables.initialize_all_variables().run()
+        variables.global_variables_initializer().run()
         my_saver = saver.Saver()
 
         # Add the second tree and replace the ensemble of the handle.
diff --git a/tensorflow/contrib/boosted_trees/python/ops/model_ops.py b/tensorflow/contrib/boosted_trees/python/ops/model_ops.py
index fca22c71a83459cb290eaebcf107cf1c14c222b7..ad6ff0a861af896ef0dd254bd47752d76378d63a 100644
--- a/tensorflow/contrib/boosted_trees/python/ops/model_ops.py
+++ b/tensorflow/contrib/boosted_trees/python/ops/model_ops.py
@@ -33,7 +33,7 @@ from tensorflow.contrib.boosted_trees.python.ops.gen_model_ops import tree_ensem
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import resources
 from tensorflow.python.training import saver
-from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.training.tracking import tracking
 
 ops.NotDifferentiable("TreeEnsembleVariable")
 ops.NotDifferentiable("TreeEnsembleSerialize")
@@ -62,8 +62,8 @@ class TreeEnsembleVariableSavable(saver.BaseSaverBuilder.SaveableObject):
         saver.BaseSaverBuilder.SaveSpec(ensemble_config, slice_spec,
                                         name + "_config"),
     ]
-    super(TreeEnsembleVariableSavable,
-          self).__init__(tree_ensemble_handle, specs, name)
+    super(TreeEnsembleVariableSavable, self).__init__(tree_ensemble_handle,
+                                                      specs, name)
     self._tree_ensemble_handle = tree_ensemble_handle
     self._create_op = create_op
 
@@ -115,7 +115,7 @@ class TreeEnsembleVariable(tracking.TrackableResource):
 
   def _gather_saveables_for_checkpoint(self):
     return {
-        "tree_ensemble_variable":
+        self.resource_handle.op.name + "/tree_ensemble_variable":
             functools.partial(
                 TreeEnsembleVariableSavable,
                 tree_ensemble_handle=self.resource_handle,
@@ -131,8 +131,8 @@ def tree_ensemble_variable(stamp_token,
 
   Args:
     stamp_token: The initial stamp token value for the ensemble resource.
-    tree_ensemble_config: A `Tensor` of type `string`.
-      Serialized proto of the tree ensemble.
+    tree_ensemble_config: A `Tensor` of type `string`. Serialized proto of the
+      tree ensemble.
     name: A name for the ensemble variable.
     container: An optional `string`. Defaults to `""`.
 
diff --git a/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py b/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py
index 0c319cc9bd1f720eb404a9da05227c5807ec874f..aff7105e94729942efc6e3e9d3ae23b733e8f5ed 100644
--- a/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py
+++ b/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py
@@ -33,7 +33,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import resources
 from tensorflow.python.training import saver
-from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.training.tracking import tracking
 
 # Pattern to remove all non alpha numeric from a string.
 _PATTERN = re.compile(r"[\W_]+")
diff --git a/tensorflow/contrib/boosted_trees/python/ops/stats_accumulator_ops.py b/tensorflow/contrib/boosted_trees/python/ops/stats_accumulator_ops.py
index ad1191d41236e71008bff8c8a7fbd42c16e3f9c5..2a0a206d97bbf01ac382531df31a66d429842bbb 100644
--- a/tensorflow/contrib/boosted_trees/python/ops/stats_accumulator_ops.py
+++ b/tensorflow/contrib/boosted_trees/python/ops/stats_accumulator_ops.py
@@ -26,7 +26,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import resources
 from tensorflow.python.training import saver
-from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.training.tracking import tracking
 
 # Pattern to remove all non alpha numeric from a string.
 _PATTERN = re.compile(r"[\W_]+")
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
index 9fdc2fc0c2c7b85502f7a3f9ae7c85cf05d5916c..e78ec476ab3b43e5eb56a2502008bb8020ae97e0 100644
--- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
@@ -566,9 +566,10 @@ class GradientBoostedDecisionTreeModel(object):
     # Determine if ensemble is colocated with the inputs.
     if self._ensemble_handle.device != input_deps[0].device:
       # Create a local ensemble and get its local stamp.
-      with ops.name_scope("local_ensemble", "TreeEnsembleVariable") as name:
+      with ops.name_scope("local_ensemble", "TreeEnsembleVariable"):
         local_ensemble_handle = (
-            gen_model_ops.decision_tree_ensemble_resource_handle_op(name=name))
+            gen_model_ops.decision_tree_ensemble_resource_handle_op(
+                self._ensemble_handle.op.name + "/local_ensemble"))
         create_op = gen_model_ops.create_tree_ensemble_variable(
             local_ensemble_handle, stamp_token=-1, tree_ensemble_config="")
         with ops.control_dependencies([create_op]):
@@ -614,13 +615,19 @@ class GradientBoostedDecisionTreeModel(object):
           predictions_dict[NUM_TREES_ATTEMPTED] % self._logits_dimension)
     return constant_op.constant(-1, dtype=dtypes.int32)
 
-  def update_stats(self, loss, predictions_dict):
+  def update_stats(self, loss, predictions_dict, gradients=None, hessians=None):
     """Update the accumulators with stats from this batch.
 
     Args:
       loss: A scalar tensor representing average loss of examples.
       predictions_dict: Dictionary of Rank 2 `Tensor` representing information
           about predictions per example.
+      gradients: A tensor with the gradients with the respect to logits from
+        predictions_dict. If not provided, tensorflow will do
+        autodifferentiation.
+      hessians: A tensor with the hessians with the respect to logits from
+        predictions_dict. If not provided, tensorflow will do
+        autodifferentiation.
 
     Returns:
       Three values:
@@ -642,13 +649,14 @@ class GradientBoostedDecisionTreeModel(object):
     predictions = predictions_dict[PREDICTIONS]
     partition_ids = predictions_dict[PARTITION_IDS]
     ensemble_stamp = predictions_dict[ENSEMBLE_STAMP]
-    gradients = gradients_impl.gradients(
-        loss,
-        predictions,
-        name="Gradients",
-        colocate_gradients_with_ops=False,
-        gate_gradients=0,
-        aggregation_method=None)[0]
+    if gradients is None:
+      gradients = gradients_impl.gradients(
+          loss,
+          predictions,
+          name="Gradients",
+          colocate_gradients_with_ops=False,
+          gate_gradients=0,
+          aggregation_method=None)[0]
     strategy = self._learner_config.multi_class_strategy
 
     class_id = self._get_class_id(predictions_dict)
@@ -657,17 +665,20 @@ class GradientBoostedDecisionTreeModel(object):
       # We build one vs rest trees.
       if self._logits_dimension == 1:
         # We have only 1 score, gradients is of shape [batch, 1].
-        hessians = gradients_impl.gradients(
-            gradients,
-            predictions,
-            name="Hessian",
-            colocate_gradients_with_ops=False,
-            gate_gradients=0,
-            aggregation_method=None)[0]
+        if hessians is None:
+          hessians = gradients_impl.gradients(
+              gradients,
+              predictions,
+              name="Hessian",
+              colocate_gradients_with_ops=False,
+              gate_gradients=0,
+              aggregation_method=None)[0]
 
         squeezed_gradients = array_ops.squeeze(gradients, axis=[1])
         squeezed_hessians = array_ops.squeeze(hessians, axis=[1])
       else:
+        if hessians is not None:
+          raise ValueError("Providing hessians is not yet supported here.")
         hessian_list = self._diagonal_hessian(gradients, predictions)
         # Assemble hessian list into a tensor.
         hessians = array_ops.stack(hessian_list, axis=1)
@@ -678,6 +689,8 @@ class GradientBoostedDecisionTreeModel(object):
         squeezed_hessians = array_ops.squeeze(
             _get_column_by_index(hessians, class_id))
     else:
+      if hessians is not None:
+        raise ValueError("Providing hessians is not yet supported here.")
       # Other multiclass strategies.
       if strategy == learner_pb2.LearnerConfig.FULL_HESSIAN:
         hessian_list = self._full_hessian(gradients, predictions)
@@ -835,9 +848,9 @@ class GradientBoostedDecisionTreeModel(object):
     stats_update_ops.append(
         control_flow_ops.cond(
             continue_centering,
-            self._make_update_bias_stats_fn(
-                ensemble_stamp, predictions, gradients,
-                bias_stats_accumulator), control_flow_ops.no_op))
+            self._make_update_bias_stats_fn(ensemble_stamp, predictions,
+                                            gradients, bias_stats_accumulator,
+                                            hessians), control_flow_ops.no_op))
 
     # Update handler stats.
     handler_reads = collections.OrderedDict()
@@ -1162,7 +1175,8 @@ class GradientBoostedDecisionTreeModel(object):
   def get_max_tree_depth(self):
     return self._max_tree_depth
 
-  def train(self, loss, predictions_dict, labels):
+  def train(self, loss, predictions_dict, labels, gradients=None,
+            hessians=None):
     """Updates the accumalator stats and grows the ensemble.
 
     Args:
@@ -1171,6 +1185,12 @@ class GradientBoostedDecisionTreeModel(object):
           about predictions per example.
       labels: Rank 2 `Tensor` representing labels per example. Has no effect
           on the training and is only kept for backward compatibility.
+      gradients: A tensor with the gradients with the respect to logits from
+        predictions_dict. If not provided, tensorflow will do
+        autodifferentiation.
+      hessians: A tensor with the hessians with the respect to logits from
+        predictions_dict. If not provided, tensorflow will do
+        autodifferentiation.
 
     Returns:
       An op that adds a new tree to the ensemble.
@@ -1179,7 +1199,8 @@ class GradientBoostedDecisionTreeModel(object):
       ValueError: if inputs are not valid.
     """
     del labels  # unused; kept for backward compatibility.
-    update_op, _, training_state = self.update_stats(loss, predictions_dict)
+    update_op, _, training_state = self.update_stats(loss, predictions_dict,
+                                                     gradients, hessians)
     with ops.control_dependencies(update_op):
       return self.increment_step_counter_and_maybe_update_ensemble(
           predictions_dict, training_state)
@@ -1271,21 +1292,28 @@ class GradientBoostedDecisionTreeModel(object):
         ps_ops=ps_ops,
         ps_strategy=ps_strategy)
 
-  def _make_update_bias_stats_fn(self, ensemble_stamp, predictions, gradients,
-                                 bias_stats_accumulator):
+  def _make_update_bias_stats_fn(self,
+                                 ensemble_stamp,
+                                 predictions,
+                                 gradients,
+                                 bias_stats_accumulator,
+                                 hessians=None):
     """A method to create the function which updates the bias stats."""
 
     def _update_bias_stats():
       """A method to update the bias stats."""
       # Get reduced gradients and hessians.
       grads_sum = math_ops.reduce_sum(gradients, 0)
-      hess = gradients_impl.gradients(
-          grads_sum,
-          predictions,
-          name="Hessians",
-          colocate_gradients_with_ops=False,
-          gate_gradients=0,
-          aggregation_method=None)[0]
+      if hessians is not None:
+        hess = hessians
+      else:
+        hess = gradients_impl.gradients(
+            grads_sum,
+            predictions,
+            name="Hessians",
+            colocate_gradients_with_ops=False,
+            gate_gradients=0,
+            aggregation_method=None)[0]
       hess_sum = math_ops.reduce_sum(hess, 0)
 
       # Accumulate gradients and hessians.
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
index 92068e88a76cb8bfdd394c1093347a8fb8a63449..7e45d0b2cecefa4bdec77d6cf7cfca7dba04db9c 100644
--- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
@@ -43,7 +43,7 @@ from tensorflow.python.platform import googletest
 def _squared_loss(label, unused_weights, predictions):
   """Unweighted loss implementation."""
   loss = math_ops.reduce_sum(
-      math_ops.square(predictions - label), 1, keepdims=True)
+      math_ops.squared_difference(predictions, label), 1, keepdims=True)
   return loss
 
 
diff --git a/tensorflow/contrib/boosted_trees/python/utils/losses.py b/tensorflow/contrib/boosted_trees/python/utils/losses.py
index 220e981618b7c0bfb1e4e98c087d83b451b9b3cf..1ad40aca2880940c78d746674c7378ff0427c057 100644
--- a/tensorflow/contrib/boosted_trees/python/utils/losses.py
+++ b/tensorflow/contrib/boosted_trees/python/utils/losses.py
@@ -166,7 +166,7 @@ def per_example_squared_loss(labels, weights, predictions):
     update_op: An update operation to update the loss's internal state.
   """
   unweighted_loss = math_ops.reduce_sum(
-      math_ops.square(predictions - labels), 1, keepdims=True)
+      math_ops.squared_difference(predictions, labels), 1, keepdims=True)
 
   return unweighted_loss * weights, control_flow_ops.no_op()
 
diff --git a/tensorflow/contrib/boosted_trees/resources/decision_tree_ensemble_resource.h b/tensorflow/contrib/boosted_trees/resources/decision_tree_ensemble_resource.h
index 94aeb2c7bb48c6eddb6c7894f8bf6f1567470113..0fe57c0a4e8375cc7ec7aca9553bded87e238b33 100644
--- a/tensorflow/contrib/boosted_trees/resources/decision_tree_ensemble_resource.h
+++ b/tensorflow/contrib/boosted_trees/resources/decision_tree_ensemble_resource.h
@@ -34,7 +34,7 @@ class DecisionTreeEnsembleResource : public StampedResource {
             protobuf::Arena::CreateMessage<
                 boosted_trees::trees::DecisionTreeEnsembleConfig>(&arena_)) {}
 
-  string DebugString() override {
+  string DebugString() const override {
     return strings::StrCat("GTFlowDecisionTreeEnsemble[size=",
                            decision_tree_ensemble_->trees_size(), "]");
   }
diff --git a/tensorflow/contrib/boosted_trees/resources/quantile_stream_resource.h b/tensorflow/contrib/boosted_trees/resources/quantile_stream_resource.h
index fdaaae7f472c8f564ab45a8366d3746cbf1158ee..574e3065e7f46049815897ef73e44d33f0d23f0f 100644
--- a/tensorflow/contrib/boosted_trees/resources/quantile_stream_resource.h
+++ b/tensorflow/contrib/boosted_trees/resources/quantile_stream_resource.h
@@ -43,7 +43,7 @@ class QuantileStreamResource : public StampedResource {
     set_stamp(stamp_token);
   }
 
-  string DebugString() override { return "QuantileStreamResource"; }
+  string DebugString() const override { return "QuantileStreamResource"; }
 
   tensorflow::mutex* mutex() { return &mu_; }
 
diff --git a/tensorflow/contrib/checkpoint/__init__.py b/tensorflow/contrib/checkpoint/__init__.py
index 94b7f4f867655bf7fdf94e8488eeae7088c41622..7b3df962542a656af8052e9f2eae6e83744411f2 100644
--- a/tensorflow/contrib/checkpoint/__init__.py
+++ b/tensorflow/contrib/checkpoint/__init__.py
@@ -27,7 +27,7 @@ Managing dependencies:
 @@NoDependency
 @@split_dependency
 
-Checkpointable data structures:
+Trackable data structures:
 @@List
 @@Mapping
 @@UniqueNameTracker
@@ -49,17 +49,16 @@ from tensorflow.contrib.checkpoint.python.python_state import NumpyState
 from tensorflow.contrib.checkpoint.python.python_state import PythonStateWrapper
 from tensorflow.contrib.checkpoint.python.split_dependency import split_dependency
 from tensorflow.contrib.checkpoint.python.visualize import dot_graph_from_checkpoint
-from tensorflow.core.protobuf.checkpointable_object_graph_pb2 import CheckpointableObjectGraph
+from tensorflow.core.protobuf.trackable_object_graph_pb2 import TrackableObjectGraph as CheckpointableObjectGraph
 from tensorflow.python.training.checkpoint_management import CheckpointManager
-from tensorflow.python.training.checkpointable.base import CheckpointableBase
-from tensorflow.python.training.checkpointable.data_structures import List
-from tensorflow.python.training.checkpointable.data_structures import Mapping
-from tensorflow.python.training.checkpointable.data_structures import NoDependency
-from tensorflow.python.training.checkpointable.tracking import Checkpointable
-from tensorflow.python.training.checkpointable.util import capture_dependencies
-from tensorflow.python.training.checkpointable.util import list_objects
-from tensorflow.python.training.checkpointable.util import object_metadata
-
+from tensorflow.python.training.tracking.base import Trackable as CheckpointableBase
+from tensorflow.python.training.tracking.data_structures import List
+from tensorflow.python.training.tracking.data_structures import Mapping
+from tensorflow.python.training.tracking.data_structures import NoDependency
+from tensorflow.python.training.tracking.tracking import AutoTrackable as Checkpointable
+from tensorflow.python.training.tracking.util import capture_dependencies
+from tensorflow.python.training.tracking.util import list_objects
+from tensorflow.python.training.tracking.util import object_metadata
 from tensorflow.python.util.all_util import remove_undocumented
 
 remove_undocumented(module_name=__name__)
diff --git a/tensorflow/contrib/checkpoint/python/BUILD b/tensorflow/contrib/checkpoint/python/BUILD
index ada41687261ab63286933d01da4e286173042e0c..cd9c94c9bd72d398d183d3f3d485ab48cb2fd617 100644
--- a/tensorflow/contrib/checkpoint/python/BUILD
+++ b/tensorflow/contrib/checkpoint/python/BUILD
@@ -2,7 +2,7 @@ licenses(["notice"])  # Apache 2.0
 
 package(default_visibility = ["//tensorflow:internal"])
 
-load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
 
 py_library(
     name = "checkpoint",
@@ -12,7 +12,7 @@ py_library(
         ":python_state",
         ":split_dependency",
         ":visualize",
-        "//tensorflow/python/training/checkpointable:data_structures",
+        "//tensorflow/python/training/tracking:data_structures",
     ],
 )
 
@@ -22,22 +22,22 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:internal"],
     deps = [
-        "//tensorflow/python/training/checkpointable:base",
-        "//tensorflow/python/training/checkpointable:data_structures",
+        "//tensorflow/python/training/tracking:base",
+        "//tensorflow/python/training/tracking:data_structures",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "containers_test",
     srcs = ["containers_test.py"],
-    deps = [
+    additional_deps = [
         ":containers",
+        "@six_archive//:six",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python/training/checkpointable:base",
-        "//tensorflow/python/training/checkpointable:util",
-        "@six_archive//:six",
+        "//tensorflow/python/training/tracking:base",
+        "//tensorflow/python/training/tracking:util",
     ],
 )
 
@@ -47,24 +47,24 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:internal"],
     deps = [
-        "//tensorflow/python/training/checkpointable:base",
+        "//tensorflow/python/training/tracking:base",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "python_state_test",
     srcs = ["python_state_test.py"],
-    deps = [
+    additional_deps = [
         ":python_state",
+        "//third_party/py/numpy",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:session",
         "//tensorflow/python:variables",
         "//tensorflow/python/eager:test",
-        "//tensorflow/python/training/checkpointable:util",
-        "//third_party/py/numpy",
+        "//tensorflow/python/training/tracking:util",
     ],
 )
 
@@ -76,21 +76,21 @@ py_library(
     deps = [
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:training",
-        "//tensorflow/python/training/checkpointable:base",
+        "//tensorflow/python/training/tracking:base",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "split_dependency_test",
     srcs = ["split_dependency_test.py"],
-    deps = [
+    additional_deps = [
         ":split_dependency",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python/eager:test",
-        "//tensorflow/python/training/checkpointable:base",
-        "//tensorflow/python/training/checkpointable:util",
+        "//tensorflow/python/training/tracking:base",
+        "//tensorflow/python/training/tracking:util",
     ],
 )
 
@@ -101,15 +101,15 @@ py_library(
     visibility = ["//tensorflow:internal"],
     deps = [
         "//tensorflow/python:pywrap_tensorflow",
-        "//tensorflow/python/training/checkpointable:base",
-        "//tensorflow/python/training/checkpointable:util",
+        "//tensorflow/python/training/tracking:base",
+        "//tensorflow/python/training/tracking:util",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "visualize_test",
     srcs = ["visualize_test.py"],
-    deps = [
+    additional_deps = [
         ":visualize",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:resource_variable_ops",
@@ -118,6 +118,7 @@ py_test(
         "//tensorflow/python/eager:test",
         "//tensorflow/python/keras:engine",
         "//tensorflow/python/keras:layers",
-        "//tensorflow/python/training/checkpointable:util",
+        "//tensorflow/python/training/tracking:util",
     ],
+    tags = ["no_oss"],  # b/124472244
 )
diff --git a/tensorflow/contrib/checkpoint/python/containers.py b/tensorflow/contrib/checkpoint/python/containers.py
index 5418e2605b724edb60878e250d2c50fcc6ff5633..a25d51980ea760dfb7f323497a397fbd94fd5f23 100644
--- a/tensorflow/contrib/checkpoint/python/containers.py
+++ b/tensorflow/contrib/checkpoint/python/containers.py
@@ -1,4 +1,4 @@
-"""Checkpointable data structures."""
+"""Trackable data structures."""
 # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -17,12 +17,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.training.checkpointable import base as checkpointable_lib
-from tensorflow.python.training.checkpointable import data_structures
+from tensorflow.python.training.tracking import base as trackable_lib
+from tensorflow.python.training.tracking import data_structures
 
 
-class UniqueNameTracker(data_structures.CheckpointableDataStructure):
-  """Adds dependencies on checkpointable objects with name hints.
+class UniqueNameTracker(data_structures.TrackableDataStructure):
+  """Adds dependencies on trackable objects with name hints.
 
   Useful for creating dependencies with locally unique names.
 
@@ -43,30 +43,30 @@ class UniqueNameTracker(data_structures.CheckpointableDataStructure):
 
   def __init__(self):
     super(UniqueNameTracker, self).__init__()
-    self._maybe_initialize_checkpointable()
+    self._maybe_initialize_trackable()
     self._name_counts = {}
 
   @property
   def _values(self):
     return [dep.ref for dep in self._checkpoint_dependencies]
 
-  def track(self, checkpointable, base_name):
-    """Add a dependency on `checkpointable`.
+  def track(self, trackable, base_name):
+    """Add a dependency on `trackable`.
 
     Args:
-      checkpointable: An object to add a checkpoint dependency on.
+      trackable: An object to add a checkpoint dependency on.
       base_name: A name hint, which is uniquified to determine the dependency
         name.
     Returns:
-      `checkpointable`, for chaining.
+      `trackable`, for chaining.
     Raises:
-      ValueError: If `checkpointable` is not a checkpointable object.
+      ValueError: If `trackable` is not a trackable object.
     """
 
-    if not isinstance(checkpointable, checkpointable_lib.CheckpointableBase):
+    if not isinstance(trackable, trackable_lib.Trackable):
       raise ValueError(
-          ("Expected a checkpointable value, got %s which does not inherit "
-           "from CheckpointableBase.") % (checkpointable,))
+          ("Expected a trackable value, got %s which does not inherit "
+           "from tf.track.Trackable.") % (trackable,))
 
     def _format_name(prefix, number):
       if number > 0:
@@ -80,5 +80,5 @@ class UniqueNameTracker(data_structures.CheckpointableDataStructure):
       count += 1
       candidate = _format_name(base_name, count)
     self._name_counts[base_name] = count + 1
-    self._track_value(checkpointable, name=candidate)
-    return checkpointable
+    self._track_value(trackable, name=candidate)
+    return trackable
diff --git a/tensorflow/contrib/checkpoint/python/containers_test.py b/tensorflow/contrib/checkpoint/python/containers_test.py
index ac85c7be803cd4c2f8ba19d3ef887a3c65a15933..bace21939602666aa48a05d2abfe05ae6aae41e2 100644
--- a/tensorflow/contrib/checkpoint/python/containers_test.py
+++ b/tensorflow/contrib/checkpoint/python/containers_test.py
@@ -26,9 +26,9 @@ from tensorflow.python.keras import layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import test
-from tensorflow.python.training.checkpointable import data_structures
-from tensorflow.python.training.checkpointable import tracking
-from tensorflow.python.training.checkpointable import util
+from tensorflow.python.training.tracking import data_structures
+from tensorflow.python.training.tracking import tracking
+from tensorflow.python.training.tracking import util
 
 
 class UniqueNameTrackerTests(test.TestCase):
@@ -52,7 +52,7 @@ class UniqueNameTrackerTests(test.TestCase):
     save_root = util.Checkpoint(slots=slots)
     save_path = save_root.save(checkpoint_prefix)
 
-    restore_slots = tracking.Checkpointable()
+    restore_slots = tracking.AutoTrackable()
     restore_root = util.Checkpoint(
         slots=restore_slots)
     status = restore_root.restore(save_path)
@@ -68,7 +68,7 @@ class UniqueNameTrackerTests(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
   def testExample(self):
-    class SlotManager(tracking.Checkpointable):
+    class SlotManager(tracking.AutoTrackable):
 
       def __init__(self):
         self.slotdeps = containers.UniqueNameTracker()
diff --git a/tensorflow/contrib/checkpoint/python/python_state.py b/tensorflow/contrib/checkpoint/python/python_state.py
index 302d5cfb79a08b6adf52ebd44533152c5454eadc..737a6c30c1dce65dd7638ee52e6c26a8a40f8321 100644
--- a/tensorflow/contrib/checkpoint/python/python_state.py
+++ b/tensorflow/contrib/checkpoint/python/python_state.py
@@ -23,7 +23,7 @@ import six
 
 import numpy
 
-from tensorflow.python.training.checkpointable import base
+from tensorflow.python.training.tracking import base
 
 # pylint: disable=g-import-not-at-top
 try:
@@ -34,8 +34,8 @@ except ImportError:
 # pylint: enable=g-import-not-at-top
 
 
-class NumpyState(base.CheckpointableBase):
-  """A checkpointable object whose NumPy array attributes are saved/restored.
+class NumpyState(base.Trackable):
+  """A trackable object whose NumPy array attributes are saved/restored.
 
   Example usage:
 
@@ -72,7 +72,7 @@ class NumpyState(base.CheckpointableBase):
     """Create placeholder NumPy arrays for to-be-restored attributes.
 
     Typically `_lookup_dependency` is used to check by name whether a dependency
-    exists. We cheat slightly by creating a checkpointable object for `name` if
+    exists. We cheat slightly by creating a trackable object for `name` if
     we don't already have one, giving us attribute re-creation behavior when
     loading a checkpoint.
 
@@ -85,7 +85,7 @@ class NumpyState(base.CheckpointableBase):
     value = super(NumpyState, self)._lookup_dependency(name)
     if value is None:
       value = _NumpyWrapper(numpy.array([]))
-      new_reference = base.CheckpointableReference(name=name, ref=value)
+      new_reference = base.TrackableReference(name=name, ref=value)
       self._unconditional_checkpoint_dependencies.append(new_reference)
       self._unconditional_dependency_names[name] = value
       super(NumpyState, self).__setattr__(name, value)
@@ -101,7 +101,7 @@ class NumpyState(base.CheckpointableBase):
   def __setattr__(self, name, value):
     """Automatically wrap NumPy arrays assigned to attributes."""
     # TODO(allenl): Consider supporting lists/tuples, either ad-hoc or by making
-    # ndarrays checkpointable natively and using standard checkpointable list
+    # ndarrays trackable natively and using standard trackable list
     # tracking.
     if isinstance(value, (numpy.ndarray, numpy.generic)):
       try:
@@ -110,19 +110,19 @@ class NumpyState(base.CheckpointableBase):
         return
       except AttributeError:
         value = _NumpyWrapper(value)
-        self._track_checkpointable(value, name=name, overwrite=True)
+        self._track_trackable(value, name=name, overwrite=True)
     elif (name not in ("_setattr_tracking", "_update_uid")
           and getattr(self, "_setattr_tracking", True)):
-      # Mixing restore()-created attributes with user-added checkpointable
+      # Mixing restore()-created attributes with user-added trackable
       # objects is tricky, since we can't use the `_lookup_dependency` trick to
       # re-create attributes (we might accidentally steal the restoration for
-      # another checkpointable object). For now `NumpyState` objects must be
+      # another trackable object). For now `NumpyState` objects must be
       # leaf nodes. Theoretically we could add some extra arguments to
       # `_lookup_dependency` to figure out whether we should create a NumPy
       # array for the attribute or not.
       raise NotImplementedError(
           ("Assigned %s to the %s property of %s, which is not a NumPy array. "
-           "Currently mixing NumPy arrays and other checkpointable objects is "
+           "Currently mixing NumPy arrays and other trackable objects is "
            "not supported. File a feature request if this limitation bothers "
            "you.")
           % (value, name, self))
@@ -130,7 +130,7 @@ class NumpyState(base.CheckpointableBase):
 
 
 @six.add_metaclass(abc.ABCMeta)
-class PythonStateWrapper(base.CheckpointableBase):
+class PythonStateWrapper(base.Trackable):
   """Wraps a Python object for storage in an object-based checkpoint."""
 
   @abc.abstractmethod
diff --git a/tensorflow/contrib/checkpoint/python/python_state_test.py b/tensorflow/contrib/checkpoint/python/python_state_test.py
index 45494351ff4e6c8c75634d8563c3fb63c6089036..40d8fe836402c8b6c8240ef9f665b753c54ede0d 100644
--- a/tensorflow/contrib/checkpoint/python/python_state_test.py
+++ b/tensorflow/contrib/checkpoint/python/python_state_test.py
@@ -26,7 +26,7 @@ from tensorflow.python.eager import test
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import variables
-from tensorflow.python.training.checkpointable import util
+from tensorflow.python.training.tracking import util
 
 
 class NumpyStateTests(test.TestCase):
diff --git a/tensorflow/contrib/checkpoint/python/split_dependency.py b/tensorflow/contrib/checkpoint/python/split_dependency.py
index 7e77453f3d848c2e321ed2ba66917a742d95459a..d7b02b538909305b14e638761bd8ba67a948d2b4 100644
--- a/tensorflow/contrib/checkpoint/python/split_dependency.py
+++ b/tensorflow/contrib/checkpoint/python/split_dependency.py
@@ -21,7 +21,7 @@ import functools
 
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.training import saver as saver_lib
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
 
 
 class _CallbackSaveable(saver_lib.BaseSaverBuilder.SaveableObject):
@@ -43,7 +43,7 @@ class _CallbackSaveable(saver_lib.BaseSaverBuilder.SaveableObject):
     return self._restore_callback(tensor)
 
 
-class _SplitDependency(checkpointable.CheckpointableBase):
+class _SplitDependency(trackable.Trackable):
   """Looks like a regular variable while synchronizing save/restores."""
 
   def __init__(self, save_buffer, restore_buffer, name, dtype, num_components,
@@ -81,9 +81,9 @@ class _SplitDependency(checkpointable.CheckpointableBase):
       return control_flow_ops.no_op()
 
   def _gather_saveables_for_checkpoint(self):
-    """Looks to Checkpointable like a regular variable."""
+    """Looks to Trackable like a regular variable."""
     return {
-        checkpointable.VARIABLE_VALUE_KEY:
+        trackable.VARIABLE_VALUE_KEY:
         functools.partial(_CallbackSaveable,
                           dtype=self._dtype,
                           save_callback=self._save,
@@ -117,7 +117,7 @@ def split_dependency(component_names, component_dtypes,
       may return `None`).
 
   Returns:
-    A dictionary mapping from names to Checkpointable objects. If one is
+    A dictionary mapping from names to Trackable objects. If one is
     reachable from an object as a dependency, the others should be too; adding
     dependencies on some but not all of the objects will result in errors.
   """
diff --git a/tensorflow/contrib/checkpoint/python/split_dependency_test.py b/tensorflow/contrib/checkpoint/python/split_dependency_test.py
index 00a805af25d5d0ea723db5d015fb12bf45c53857..9bc01059481ff69064e3f9c682a764146b79a250 100644
--- a/tensorflow/contrib/checkpoint/python/split_dependency_test.py
+++ b/tensorflow/contrib/checkpoint/python/split_dependency_test.py
@@ -23,9 +23,9 @@ from tensorflow.python.eager import test
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.training.checkpointable import base
-from tensorflow.python.training.checkpointable import tracking
-from tensorflow.python.training.checkpointable import util
+from tensorflow.python.training.tracking import base
+from tensorflow.python.training.tracking import tracking
+from tensorflow.python.training.tracking import util
 
 
 def _split_variable_closure(variable):
@@ -44,7 +44,7 @@ def _combine_variable_closure(variable):
   return _consume_restore_buffer_fn
 
 
-class SaveTensorSlicesAsDeps(base.CheckpointableBase):
+class SaveTensorSlicesAsDeps(base.Trackable):
 
   def __init__(self):
     self.combined = resource_variable_ops.ResourceVariable([0., 0., 0., 0.])
@@ -56,17 +56,17 @@ class SaveTensorSlicesAsDeps(base.CheckpointableBase):
         consume_restore_buffer_fn=_combine_variable_closure(
             self.combined))
     for name, dep in split_dependencies.items():
-      self._track_checkpointable(dep, name=name)
+      self._track_trackable(dep, name=name)
 
 
-class HasRegularDeps(tracking.Checkpointable):
+class HasRegularDeps(tracking.AutoTrackable):
 
   def __init__(self):
     self.first_half = resource_variable_ops.ResourceVariable([0., 0.])
     self.second_half = resource_variable_ops.ResourceVariable([0., 0.])
 
 
-class OnlyOneDep(tracking.Checkpointable):
+class OnlyOneDep(tracking.AutoTrackable):
 
   def __init__(self):
     self.first_half = resource_variable_ops.ResourceVariable([0., 0.])
diff --git a/tensorflow/contrib/checkpoint/python/visualize.py b/tensorflow/contrib/checkpoint/python/visualize.py
index bac071c4cff383f60b707b6e42c13faf5e0ac948..faf90f018476b3c70a7bfa1346a5b590edbbddcd 100644
--- a/tensorflow/contrib/checkpoint/python/visualize.py
+++ b/tensorflow/contrib/checkpoint/python/visualize.py
@@ -18,8 +18,8 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python import pywrap_tensorflow
-from tensorflow.python.training.checkpointable import base as checkpointable
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import base as trackable
+from tensorflow.python.training.tracking import util as trackable_utils
 
 
 def dot_graph_from_checkpoint(save_path):
@@ -51,7 +51,7 @@ def dot_graph_from_checkpoint(save_path):
     A graph in DOT format as a string.
   """
   reader = pywrap_tensorflow.NewCheckpointReader(save_path)
-  object_graph = checkpointable_utils.object_metadata(save_path)
+  object_graph = trackable_utils.object_metadata(save_path)
   shape_map = reader.get_variable_to_shape_map()
   dtype_map = reader.get_variable_to_dtype_map()
   graph = 'digraph {\n'
@@ -63,7 +63,7 @@ def dot_graph_from_checkpoint(save_path):
       slot_ids.add(slot_reference.slot_variable_node_id)
   for node_id, node in enumerate(object_graph.nodes):
     if (len(node.attributes) == 1
-        and node.attributes[0].name == checkpointable.VARIABLE_VALUE_KEY):
+        and node.attributes[0].name == trackable.VARIABLE_VALUE_KEY):
       if node_id in slot_ids:
         color = 'orange'
         tooltip_prefix = 'Slot variable'
diff --git a/tensorflow/contrib/checkpoint/python/visualize_test.py b/tensorflow/contrib/checkpoint/python/visualize_test.py
index 583e3bc442893d825c337d73fb999d1e586738a1..98a22d573fdb6172cde100df461d9ae520c2c483 100644
--- a/tensorflow/contrib/checkpoint/python/visualize_test.py
+++ b/tensorflow/contrib/checkpoint/python/visualize_test.py
@@ -28,7 +28,7 @@ from tensorflow.python.keras.engine import training
 from tensorflow.python.keras.layers import core
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.training import adam
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import util as trackable_utils
 
 try:
   import pydot  # pylint: disable=g-import-not-at-top
@@ -57,7 +57,7 @@ class DotGraphTests(test.TestCase):
       model = MyModel()
       optimizer = adam.AdamOptimizer(0.001)
       optimizer_step = resource_variable_ops.ResourceVariable(12)
-      save_checkpoint = checkpointable_utils.Checkpoint(
+      save_checkpoint = trackable_utils.Checkpoint(
           optimizer=optimizer, model=model, optimizer_step=optimizer_step)
       optimizer.minimize(functools.partial(model, input_value))
       checkpoint_directory = self.get_temp_dir()
diff --git a/tensorflow/contrib/cloud/kernels/BUILD b/tensorflow/contrib/cloud/kernels/BUILD
index 1311063ec023bdaa2588d6f1c826bf900f7dea09..20f8c2b2453a58fdbe5a3587fa6687debd9c06d3 100644
--- a/tensorflow/contrib/cloud/kernels/BUILD
+++ b/tensorflow/contrib/cloud/kernels/BUILD
@@ -27,7 +27,6 @@ tf_kernel_library(
     deps = [
         ":bigquery_table_accessor",
         ":bigquery_table_partition_proto_cc",
-        "//tensorflow/contrib/cloud:bigquery_reader_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:reader_base",
@@ -79,7 +78,6 @@ tf_kernel_library(
     srcs = ["gcs_config_ops.cc"],
     visibility = ["//tensorflow:internal"],
     deps = [
-        "//tensorflow/contrib/cloud:gcs_config_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core/platform/cloud:curl_http_request",
diff --git a/tensorflow/contrib/cluster_resolver/__init__.py b/tensorflow/contrib/cluster_resolver/__init__.py
index 390b3e7550b3d991269bb84707c3500f2fa33290..a4dea85efd98893c881abbd3f7ebda78755b8189 100644
--- a/tensorflow/contrib/cluster_resolver/__init__.py
+++ b/tensorflow/contrib/cluster_resolver/__init__.py
@@ -23,7 +23,7 @@ from __future__ import print_function
 from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver
 from tensorflow.python.distribute.cluster_resolver.cluster_resolver import SimpleClusterResolver
 from tensorflow.python.distribute.cluster_resolver.cluster_resolver import UnionClusterResolver
-from tensorflow.python.distribute.cluster_resolver.gce_cluster_resolver import GceClusterResolver
+from tensorflow.python.distribute.cluster_resolver.gce_cluster_resolver import GCEClusterResolver
 from tensorflow.python.distribute.cluster_resolver.kubernetes_cluster_resolver import KubernetesClusterResolver
 from tensorflow.python.distribute.cluster_resolver.slurm_cluster_resolver import SlurmClusterResolver
 from tensorflow.python.distribute.cluster_resolver.tfconfig_cluster_resolver import TFConfigClusterResolver
@@ -36,7 +36,7 @@ _allowed_symbols = [
     'ClusterResolver',
     'SimpleClusterResolver',
     'UnionClusterResolver',
-    'GceClusterResolver',
+    'GCEClusterResolver',
     'KubernetesClusterResolver',
     'TFConfigClusterResolver',
     'TPUClusterResolver',
diff --git a/tensorflow/contrib/cluster_resolver/python/training/__init__.py b/tensorflow/contrib/cluster_resolver/python/training/__init__.py
index 10d93549ebbd4f7e900796d0516b0af1744224af..ef1e9f11a07a5be6c0b181f5e0b80e0e2214f972 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/__init__.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/__init__.py
@@ -25,7 +25,7 @@ from __future__ import print_function
 from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver
 from tensorflow.python.distribute.cluster_resolver.cluster_resolver import SimpleClusterResolver
 from tensorflow.python.distribute.cluster_resolver.cluster_resolver import UnionClusterResolver
-from tensorflow.python.distribute.cluster_resolver.gce_cluster_resolver import GceClusterResolver
+from tensorflow.python.distribute.cluster_resolver.gce_cluster_resolver import GCEClusterResolver
 from tensorflow.python.distribute.cluster_resolver.kubernetes_cluster_resolver import KubernetesClusterResolver
 from tensorflow.python.distribute.cluster_resolver.slurm_cluster_resolver import SlurmClusterResolver
 from tensorflow.python.distribute.cluster_resolver.tfconfig_cluster_resolver import TFConfigClusterResolver
@@ -43,7 +43,7 @@ _allowed_symbols = [
     'ClusterResolver',
     'SimpleClusterResolver',
     'UnionClusterResolver',
-    'GceClusterResolver',
+    'GCEClusterResolver',
     'KubernetesClusterResolver',
     'TFConfigClusterResolver',
     'TPUClusterResolver',
diff --git a/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver.py
index 55e61155c683c928efab9bb018868faec3e3df8c..5b49116ff6a4e17a774ea79b33ae1b948ba9f187 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Stub file for GceClusterResolver to maintain backwards compatibility."""
+"""Stub file for GCEClusterResolver to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -23,13 +23,14 @@ from __future__ import print_function
 # existing OSS code will not be broken.
 
 # pylint: disable=unused-import
-from tensorflow.python.distribute.cluster_resolver.gce_cluster_resolver import GceClusterResolver
+from tensorflow.python.distribute.cluster_resolver.gce_cluster_resolver import GCEClusterResolver
 # pylint: enable=unused-import
 
 from tensorflow.python.util.all_util import remove_undocumented
 
+
 _allowed_symbols = [
-    'GceClusterResolver',
+    'GCEClusterResolver',
 ]
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/cmake/README.md b/tensorflow/contrib/cmake/README.md
index df8b48dfc46124d3b9454d92ffb70dbcf1bc4217..60ee1b4b3fd7d0b6afaefcc05effd3bbae00cf2c 100644
--- a/tensorflow/contrib/cmake/README.md
+++ b/tensorflow/contrib/cmake/README.md
@@ -147,19 +147,19 @@ suitable interface for project configuration and dependency setting.
     *   Go (required if you need ssl support, optional)
     *   NASM/YASM (required by grpc for ssl support, optional)
 2.  Start CMake GUI
-3.  Click on `Browse Source` and direct to the the folder
+3.  Click on `Browse Source` and direct to the folder
     `<tensorflow-source>/tensorflow/contrib/cmake`
 4.  Click on `Browse Build` and spectify a location that you want tensorflow to
     be build
 5.  Click on `Configure`, a new window will be prompted out, specify the
     generator mode for the project generation. For Windows, choose `Visual
     Studio <version> <year> Win64`, for Linux, choose `Unix Makefiles`, then
-    press `Finish`. Wait for a moment, the default project dependecy would
+    press `Finish`. Wait for a moment, the default project dependency would
     automatically generate.
 6.  There are a few options that you can customize your own build. **The setting
-    here is crucial for a sucessful build, please check all items carefully.**
+    here is crucial for a successful build, please check all items carefully.**
 
-    *   `tensorflow_BUILD_ALL_KERNELS` should alway be `on`
+    *   `tensorflow_BUILD_ALL_KERNELS` should always be `on`
     *   `tensorflow_BUILD_CC_EXAMPLE` is default to be `on`. This can help you
         to test build (optional)
     *   `tensorflow_BUILD_CONTRIB_KERNELS` is default to be `on`, but it won't
@@ -278,7 +278,7 @@ suitable interface for project configuration and dependency setting.
     `make -sj<number-of-threads> install`
 
     Where `<number-of-threads>` is the threads used for the compilation, change
-    to any integer less or equal to your computer's maxiumum thread number.
+    to any integer less or equal to your computer's maximum thread number.
 
     Headers are discretely located in the build folders. Tensorflow library can
     be found at `<path-to-build>`, namely `tensorflow.so` (Linux) or
diff --git a/tensorflow/contrib/cmake/external/abseil_cpp.cmake b/tensorflow/contrib/cmake/external/abseil_cpp.cmake
index 46a193971c5084523d432065f265fa7a9909f595..6c6a5df7f76723800740a81ccdcb137a0ec33846 100644
--- a/tensorflow/contrib/cmake/external/abseil_cpp.cmake
+++ b/tensorflow/contrib/cmake/external/abseil_cpp.cmake
@@ -31,17 +31,17 @@ if (systemlib_ABSEIL_CPP)
   message(STATUS "  abseil_cpp includes: ${ABSEIL_CPP_INCLUDE_DIR}")
   message(STATUS "  abseil_cpp libraries: ${ABSEIL_CPP_LIBRARIES}")
 
-  add_custom_target(abseil_cpp)
-  list(APPEND tensorflow_EXTERNAL_DEPENDENCIES abseil_cpp)
+  add_custom_target(abseil_cpp_build)
+  list(APPEND tensorflow_EXTERNAL_DEPENDENCIES abseil_cpp_build)
 
 else (systemlib_ABSEIL_CPP)
 
   include (ExternalProject)
 
-  set(abseil_cpp_INCLUDE_DIR ${CMAKE_BINARY_DIR}/abseil_cpp/src/abseil_cpp)
-  set(abseil_cpp_URL https://github.com/abseil/abseil-cpp/archive/e01d95528ea2137a4a27a88d1f57c6cb260aafed.tar.gz)
-  set(abseil_cpp_HASH SHA256=84043ed402d2a2a6ba4cdddb7e85118b1158fd81fe4ac3a14adc343d054c1e2e)
-  set(abseil_cpp_BUILD ${CMAKE_BINARY_DIR}/abseil_cpp/src/abseil_cpp-build)
+  set(abseil_cpp_INCLUDE_DIR ${CMAKE_BINARY_DIR}/abseil_cpp/src/abseil_cpp_build)
+  set(abseil_cpp_URL https://github.com/abseil/abseil-cpp.git)
+  set(abseil_cpp_TAG master)
+  set(abseil_cpp_BUILD ${CMAKE_BINARY_DIR}/abseil_cpp/src/abseil_cpp_build)
 
   if(WIN32)
     if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
@@ -49,8 +49,11 @@ else (systemlib_ABSEIL_CPP)
           ${abseil_cpp_BUILD}/absl/base/Release/absl_base.lib
           ${abseil_cpp_BUILD}/absl/base/Release/absl_dynamic_annotations.lib
           ${abseil_cpp_BUILD}/absl/base/Release/absl_internal_malloc_internal.lib
+          ${abseil_cpp_BUILD}/absl/base/Release/absl_internal_throw_delegate.lib
+          ${abseil_cpp_BUILD}/absl/numeric/Release/absl_int128.lib
           ${abseil_cpp_BUILD}/absl/strings/Release/absl_strings.lib
           ${abseil_cpp_BUILD}/absl/strings/Release/str_format_internal.lib
+          ${abseil_cpp_BUILD}/absl/time/Release/absl_time.lib
           ${abseil_cpp_BUILD}/absl/types/Release/absl_bad_optional_access.lib)
     else()
       set(abseil_cpp_STATIC_LIBRARIES
@@ -62,6 +65,7 @@ else (systemlib_ABSEIL_CPP)
           ${abseil_cpp_BUILD}/absl/numeric/absl_int128.lib
           ${abseil_cpp_BUILD}/absl/strings/absl_strings.lib
           ${abseil_cpp_BUILD}/absl/strings/str_format_internal.lib
+          ${abseil_cpp_BUILD}/absl/time/absl_time.lib
           ${abseil_cpp_BUILD}/absl/types/absl_bad_optional_access.lib)
     endif()
   else()
@@ -74,15 +78,18 @@ else (systemlib_ABSEIL_CPP)
         ${abseil_cpp_BUILD}/absl/numeric/libabsl_int128.a
         ${abseil_cpp_BUILD}/absl/strings/libabsl_strings.a
         ${abseil_cpp_BUILD}/absl/strings/libstr_format_internal.a
+        ${abseil_cpp_BUILD}/absl/time/libabsl_time.a
         ${abseil_cpp_BUILD}/absl/types/libabsl_bad_optional_access.a)
   endif()
 
-  ExternalProject_Add(abseil_cpp
+  ExternalProject_Add(abseil_cpp_build
       PREFIX abseil_cpp
-      URL ${abseil_cpp_URL}
-      URL_HASH ${abseil_cpp_HASH}
+      GIT_REPOSITORY ${abseil_cpp_URL}
       DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
+      BUILD_IN_SOURCE 1
       BUILD_BYPRODUCTS ${abseil_cpp_STATIC_LIBRARIES}
+      BUILD_COMMAND ${CMAKE_COMMAND} --build . --config Release
+      COMMAND ${CMAKE_COMMAND} --build . --config Release
       INSTALL_COMMAND ""
       CMAKE_CACHE_ARGS
           -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=${tensorflow_ENABLE_POSITION_INDEPENDENT_CODE}
@@ -91,8 +98,10 @@ else (systemlib_ABSEIL_CPP)
   )
 
   include_directories(${abseil_cpp_INCLUDE_DIR})
+  message(STATUS ${abseil_cpp_INCLUDE_DIR})
+
   list(APPEND tensorflow_EXTERNAL_LIBRARIES ${abseil_cpp_STATIC_LIBRARIES})
 
-  list(APPEND tensorflow_EXTERNAL_DEPENDENCIES abseil_cpp)
+  list(APPEND tensorflow_EXTERNAL_DEPENDENCIES abseil_cpp_build)
 
 endif (systemlib_ABSEIL_CPP)
\ No newline at end of file
diff --git a/tensorflow/contrib/cmake/external/grpc.cmake b/tensorflow/contrib/cmake/external/grpc.cmake
index e570c09ecb5e64130ed6f3375a51d74850cc3989..30b4e2dbdee1117df12ae7ab8ce902e667234fb0 100644
--- a/tensorflow/contrib/cmake/external/grpc.cmake
+++ b/tensorflow/contrib/cmake/external/grpc.cmake
@@ -17,7 +17,7 @@ include (ExternalProject)
 set(GRPC_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/include)
 set(GRPC_URL https://github.com/grpc/grpc.git)
 set(GRPC_BUILD ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc)
-set(GRPC_TAG 69b6c047bc767b4d80e7af4d00ccb7c45b683dae)
+set(GRPC_TAG 62688b6a05cc85b47fb77dd408611734253e47e2)
 
 if(WIN32)
   # We use unsecure gRPC because boringssl does not build on windows
diff --git a/tensorflow/contrib/cmake/external/nsync.cmake b/tensorflow/contrib/cmake/external/nsync.cmake
index 479609458c64f7c7bd7b3ce6b23aceaa3db17f21..b15143bfc1cd787b156c9d6dd724a17730f0f8fb 100644
--- a/tensorflow/contrib/cmake/external/nsync.cmake
+++ b/tensorflow/contrib/cmake/external/nsync.cmake
@@ -16,7 +16,7 @@ include (ExternalProject)
 
 set(nsync_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/external/nsync/public)
 set(nsync_URL https://github.com/google/nsync)
-set(nsync_TAG 1.20.1)
+set(nsync_TAG 1.20.2)
 set(nsync_BUILD ${CMAKE_CURRENT_BINARY_DIR}/nsync/src/nsync)
 set(nsync_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/nsync/install)
 
diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index 96160568fa79291a7b391761373e1eaf0f70974e..fd205a4b9b065a4756fbe3985694bb64b93b85e6 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -1,6 +1,9 @@
 # python_sanity_test.py will complain about invalid or missing entries
 # problematic entries can be commented for temporary whitelisting
 tensorflow
+tensorflow/compiler
+tensorflow/compiler/xla
+tensorflow/compiler/xla/service
 tensorflow/core
 tensorflow/core/example
 tensorflow/core/framework
@@ -10,6 +13,7 @@ tensorflow/core/lib
 tensorflow/core/lib/core
 tensorflow/core/profiler
 tensorflow/core/protobuf
+tensorflow/core/protobuf/tpu
 tensorflow/core/util
 tensorflow/examples
 tensorflow/examples/tutorials
@@ -67,8 +71,9 @@ tensorflow/python/summary/writer
 tensorflow/python/tools
 tensorflow/python/tools/api
 tensorflow/python/tools/api/generator
+tensorflow/python/tpu
 tensorflow/python/training
-tensorflow/python/training/checkpointable
+tensorflow/python/training/tracking
 tensorflow/python/user_ops
 tensorflow/python/util
 tensorflow/python/util/protobuf
@@ -434,7 +439,6 @@ tensorflow/contrib/timeseries/python/timeseries/state_space_models
 tensorflow/contrib/tpu
 tensorflow/contrib/tpu/ops
 tensorflow/contrib/tpu/profiler
-tensorflow/contrib/tpu/proto
 tensorflow/contrib/tpu/python
 tensorflow/contrib/tpu/python/ops
 tensorflow/contrib/tpu/python/profiler
diff --git a/tensorflow/contrib/cmake/python_protos.txt b/tensorflow/contrib/cmake/python_protos.txt
index 013180c89083748b240ad061b342300e886d3568..b4603206da419f44af0857b9b933eb7df1b255ff 100644
--- a/tensorflow/contrib/cmake/python_protos.txt
+++ b/tensorflow/contrib/cmake/python_protos.txt
@@ -1,6 +1,7 @@
 tensorflow/core
 tensorflow/core/kernels/boosted_trees
 tensorflow/core/profiler
+tensorflow/core/protobuf/tpu
 tensorflow/python
 tensorflow/contrib/boosted_trees/proto
 tensorflow/contrib/cloud/kernels
@@ -12,7 +13,6 @@ tensorflow/contrib/mpi_collectives
 tensorflow/contrib/session_bundle
 tensorflow/contrib/tensor_forest/proto
 tensorflow/contrib/tensorboard/plugins/projector
-tensorflow/contrib/tpu/proto
 tensorflow/contrib/tpu/profiler
 tensorflow/contrib/training/python/training
 tensorflow/contrib/verbs
diff --git a/tensorflow/contrib/cmake/tf_core_framework.cmake b/tensorflow/contrib/cmake/tf_core_framework.cmake
index d7b2a1339e047aba0a9424a53a63726805e89721..cc263d7995c01100f1c51436bcb584b600c8c161 100644
--- a/tensorflow/contrib/cmake/tf_core_framework.cmake
+++ b/tensorflow/contrib/cmake/tf_core_framework.cmake
@@ -125,9 +125,9 @@ endfunction()
 
 file(GLOB_RECURSE tf_protos_cc_srcs RELATIVE ${tensorflow_source_dir}
     "${tensorflow_source_dir}/tensorflow/core/*.proto"
+    "${tensorflow_source_dir}/tensorflow/core/protobuf/tpu/*.proto"
     "${tensorflow_source_dir}/tensorflow/compiler/xla/*.proto"
     "${tensorflow_source_dir}/tensorflow/contrib/boosted_trees/proto/*.proto"
-    "${tensorflow_source_dir}/tensorflow/contrib/tpu/proto/*.proto"
 )
 
 RELATIVE_PROTOBUF_GENERATE_CPP(PROTO_SRCS PROTO_HDRS
@@ -147,7 +147,6 @@ set(tf_proto_text_srcs
     "tensorflow/core/framework/function.proto"
     "tensorflow/core/framework/graph.proto"
     "tensorflow/core/framework/graph_transfer_info.proto"
-    "tensorflow/core/framework/iterator.proto"
     "tensorflow/core/framework/kernel_def.proto"
     "tensorflow/core/framework/log_memory.proto"
     "tensorflow/core/framework/node_def.proto"
@@ -302,8 +301,8 @@ file(GLOB_RECURSE tf_core_framework_srcs
     "${tensorflow_source_dir}/tensorflow/core/common_runtime/session.cc"
     "${tensorflow_source_dir}/tensorflow/core/common_runtime/session_factory.cc"
     "${tensorflow_source_dir}/tensorflow/core/common_runtime/session_options.cc"
-    "${tensorflow_source_dir}/tensorflow/contrib/tensorboard/db/*.cc"
-    "${tensorflow_source_dir}/tensorflow/contrib/tensorboard/db/*.h"
+    "${tensorflow_source_dir}/tensorflow/core/summary/*.cc"
+    "${tensorflow_source_dir}/tensorflow/core/summary/*.h"
     "${tensorflow_source_dir}/public/*.h"
 )
 
@@ -317,14 +316,14 @@ file(GLOB_RECURSE tf_core_framework_exclude_srcs
     "${tensorflow_source_dir}/tensorflow/core/util/*test*.h"
     "${tensorflow_source_dir}/tensorflow/core/util/*test*.cc"
     "${tensorflow_source_dir}/tensorflow/core/util/*main.cc"
-    "${tensorflow_source_dir}/tensorflow/contrib/tensorboard/db/*test*.cc"
-    "${tensorflow_source_dir}/tensorflow/contrib/tensorboard/db/loader.cc"
-    "${tensorflow_source_dir}/tensorflow/contrib/tensorboard/db/vacuum.cc"
+    "${tensorflow_source_dir}/tensorflow/core/summary/*test*.cc"
+    "${tensorflow_source_dir}/tensorflow/core/summary/loader.cc"
+    "${tensorflow_source_dir}/tensorflow/core/summary/vacuum.cc"
 )
 
 # TODO(jart): Why doesn't this work?
 # set_source_files_properties(
-#     ${tensorflow_source_dir}/tensorflow/contrib/tensorboard/db/snapfn.cc
+#     ${tensorflow_source_dir}/tensorflow/core/lib/db/snapfn.cc
 #     PROPERTIES COMPILE_FLAGS -DSQLITE_OMIT_LOAD_EXTENSION)
 
 list(REMOVE_ITEM tf_core_framework_srcs ${tf_core_framework_exclude_srcs})
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index 8faccf8d55902e6701ebb4ce534b84705304fd5f..1fe8795ddf00232eba5a60a130e0845a6f6a8e17 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -802,6 +802,7 @@ add_custom_command(
       # tensorflow/__init__.py depends on files generated in this step. So, remove it while
       # this step is running since the files aren't there yet.
       COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
+      COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
 
       # Run create_python_api.py to generate API init files.
       COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${CMAKE_CURRENT_BINARY_DIR}/tf_python "${PY_RUNTIME_ENV}" ${PYTHON_EXECUTABLE}
diff --git a/tensorflow/contrib/compiler/BUILD b/tensorflow/contrib/compiler/BUILD
index e4566437c60ebb2da039e61c171fbe954a7355c9..79c61589112b739837b401010690e7f4ca917d07 100644
--- a/tensorflow/contrib/compiler/BUILD
+++ b/tensorflow/contrib/compiler/BUILD
@@ -23,6 +23,7 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":xla",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
@@ -70,22 +71,30 @@ py_library(
     ],
 )
 
-tf_py_test(
+cuda_py_test(
     name = "xla_test",
     srcs = ["xla_test.py"],
     additional_deps = [
         ":xla",
-        "@six_archive//:six",
+        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/compiler/tests:xla_test",
+        "//tensorflow/contrib/tpu:tpu_estimator",
+        "//tensorflow/contrib/tpu:tpu_lib",
+        "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:control_flow_util",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform",
-        "//tensorflow/contrib/tpu:tpu_lib",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:summary",
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+    tags = [
+        "no_mac",
+        "no_windows",
     ],
-    tags = ["no_pip"],
+    xla_enabled = True,
 )
diff --git a/tensorflow/contrib/compiler/__init__.py b/tensorflow/contrib/compiler/__init__.py
index c4937dadfb8be3211377f0ae7017b95e7642dab0..797e5e8164e231e8b3806d40b32774711879b050 100644
--- a/tensorflow/contrib/compiler/__init__.py
+++ b/tensorflow/contrib/compiler/__init__.py
@@ -19,3 +19,4 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.compiler import jit
+from tensorflow.contrib.compiler import xla
diff --git a/tensorflow/contrib/compiler/xla.py b/tensorflow/contrib/compiler/xla.py
index f867cd15b67dbd43650d8012b4299845af7200a8..238c6ab1366a50710efabea2f33eb1bd06fe9423 100644
--- a/tensorflow/contrib/compiler/xla.py
+++ b/tensorflow/contrib/compiler/xla.py
@@ -34,6 +34,7 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
 from tensorflow.python.util import function_utils
+from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
 
@@ -76,10 +77,22 @@ def compile(computation, inputs=None):  # pylint: disable=redefined-builtin
 
       All `Operation`s returned from `computation` will be executed when
       evaluating any of the returned output tensors.
-    inputs: A list of input tensors or `None` (equivalent to an empty list).
+    inputs: A list of inputs or `None` (equivalent to an empty list). Each input
+      can be a nested structure containing values that are convertible to
+      tensors. Note that passing an N-dimension list of compatible values will
+      result in a N-dimention list of scalar tensors rather than a single Rank-N
+      tensors. If you need different behavior, convert part of inputs to tensors
+      with `tf.convert_to_tensor`.
 
   Returns:
-    A list of output tensors.
+    Same data structure as if computation(*inputs) is called directly with some
+    exceptions for correctness. Exceptions include:
+      1) None output: a NoOp would be returned which control-depends on
+         computation.
+      2) Single value output: A tuple containing the value would be returned.
+      3) Operation-only outputs: a NoOp would be returned which
+         control-depends on computation.
+      TODO(b/121383831): Investigate into removing these special cases.
   """
   # pylint: disable=protected-access
   return _compile_internal(computation, inputs)
@@ -131,6 +144,30 @@ class XLACompileContext(control_flow_ops.XLAControlFlowContext):
         logging.warning('... and %d more',
                         len(self._unsupported_ops) - _MAX_WARNING_LINES)
 
+  def _RemoveExternalControlEdges(self, op):
+    """Remove any external control dependency on this op."""
+    internal_control_inputs = []
+    external_control_inputs = []
+    for x in op.control_inputs:
+      # pylint: disable=protected-access
+      is_internal_op = False
+      ctxt = x._get_control_flow_context()
+      while ctxt is not None:
+        if ctxt == self:
+          is_internal_op = True
+          break
+        ctxt = ctxt._outer_context
+      if is_internal_op:
+        internal_control_inputs.append(x)
+      else:
+        external_control_inputs.append(x)
+      # pylint: enable=protected-access
+    # pylint: disable=protected-access
+    op._remove_all_control_inputs()
+    op._add_control_inputs(internal_control_inputs)
+    # pylint: enable=protected-access
+    return internal_control_inputs, external_control_inputs
+
   def AddOp(self, op):
     """Create op in XLACompileContext and notifies outer context recursively."""
     # pylint: disable=protected-access
@@ -180,11 +217,14 @@ class XLACompileContext(control_flow_ops.XLAControlFlowContext):
     if external_control_inputs:
       # Use an identity to pull control inputs as data inputs. Note that we
       # ignore ops which don't have outputs. TODO(phawkins): fix that.
-      external_control_inputs = [
-          array_ops.identity(x.outputs[0]).op
-          for x in external_control_inputs
-          if x.outputs
-      ]
+      with ops.control_dependencies(None):
+        self.Enter()
+        external_control_inputs = [
+            array_ops.identity(x.outputs[0]).op
+            for x in external_control_inputs
+            if x.outputs
+        ]
+        self.Exit()
       # pylint: disable=protected-access
       op._add_control_inputs(external_control_inputs)
       # pylint: enable=protected-access
@@ -245,13 +285,21 @@ def _compile_internal(computation, inputs=None):
   Args:
     computation: A Python function that builds the computation to compile and
       execute.
-    inputs: A list of input tensors or `None` (equivalent to `[]`). Its order
-      should match ordering of computation arguments.
+    inputs: A list of inputs or `None` (equivalent to an empty list). Each input
+      can be a nested structure containing values that are convertible to
+      tensors. Note that passing an N-dimension list of compatible values will
+      result in a N-dimension list of scalar tensors rather than a single Rank-N
+      tensors. If you need different behavior, convert part of inputs to tensors
+      with `tf.convert_to_tensor`.
+
   Returns:
-    A list of output tensors from computation.
+    Same data structure as if computation(*inputs) is called directly with some
+    exceptions for correctness. Exceptions include: 1) None output 2) Single
+    value output 3) Operation-only outputs
   Raises:
     ValueError: If any element in computation outputs is neither an operations
       or a value that can be converted to tensor.
+    ValueError: If computation outputs is non-flat and contains any Operations.
     TypeError: If `inputs` is not a list or tuple.
   """
   if inputs is None:
@@ -260,17 +308,10 @@ def _compile_internal(computation, inputs=None):
   if not isinstance(inputs, collections.Sequence):
     raise TypeError('inputs must be a list')
 
+  # Flatten inputs.
+  flat_inputs = nest.flatten(inputs)
   # Converts inputs to Tensors.
-  inputs = [ops.convert_to_tensor(x) for x in inputs]
-  input_arity = len(inputs)
-
-  arg_error = check_function_argument_count(
-      computation, input_arity, infeed_queue=None)
-  if arg_error is not None:
-    raise TypeError(
-        'Supplied computation cannot be called with the specified inputs. You '
-        'specified %d inputs: %s, but the computation needs %s' %
-        (input_arity, str([i.name for i in inputs]), arg_error))
+  flat_inputs = [ops.convert_to_tensor(x) for x in flat_inputs]
 
   cluster_name = ops.get_default_graph().unique_name('cluster')
   pivot = control_flow_ops.no_op(name=cluster_name + '/pivot')
@@ -280,11 +321,15 @@ def _compile_internal(computation, inputs=None):
 
     # Add identity ops so even unused inputs are 'consumed' by the
     # computation.
-    computation_inputs = [
+    flat_inputs = [
         array_ops.identity(x, name='input_{}'.format(i))
-        for i, x in enumerate(inputs)
+        for i, x in enumerate(flat_inputs)
     ]
 
+    # Re-pack flat_inputs in same structure as 'inputs'.
+    computation_inputs = nest.pack_sequence_as(
+        structure=inputs, flat_sequence=flat_inputs)
+
     # Only resource variables work inside an XLA computation, so turn on
     # resource variables for the computation.
     vscope = variable_scope.get_variable_scope()
@@ -297,66 +342,166 @@ def _compile_internal(computation, inputs=None):
     # Restore variable scope after computation.
     vscope.set_use_resource(saved_use_resource)
 
-    # If the computation returns `None`, make it an empty tuple.
-    if outputs is None:
-      outputs = tuple()
-    # If the computation only returned one value, make it a tuple.
-    if not isinstance(outputs, collections.Sequence):
-      outputs = (outputs,)
-
-    # Append `no_op` here so that return value of this function always contains
-    # at least one op that can trigger XlaLaunch node.
-    outputs += (control_flow_ops.no_op(),)
-    try:
-      outputs = [
-          o if isinstance(o, ops.Operation) else ops.convert_to_tensor(o)
-          for o in outputs
-      ]
-    except Exception as e:
-      raise ValueError(
-          'XLA computation function return values must all either be Operations'
-          ' or convertible to Tensors. Got error: "%s"' % str(e))
-
-    # Separates the returned Operations and Tensors.
-    output_operations = [o for o in outputs if isinstance(o, ops.Operation)]
-    output_tensors = [o for o in outputs if not isinstance(o, ops.Operation)]
-
-    if outputs != output_tensors + output_operations:
-      raise ValueError(
-          'XLA computation function must return zero or more Tensor values '
-          'followed by zero or more Operations.')
-    output_arity = len(output_tensors)
-
-    new_output_tensors = []
-    for t in output_tensors:
-      with ops.device(t.device if t.device else ''):
-        new_output_tensors.append(array_ops.identity(t))
+    outputs_is_flat = is_flat(outputs)
+    if outputs_is_flat:
+      output_tensors, control_deps = _postprocess_flat_outputs(outputs)
+    else:
+      output_tensors, control_deps = _postprocess_non_flat_outputs(outputs)
 
-    output_tensors = new_output_tensors
     context.ExitResult(output_tensors)
   finally:
     context.report_unsupported_operations()
     context.Exit()
 
-  outputs = [
-      xla_ops.xla_cluster_output(output_tensors[i], name='output{}'.format(i))
-      for i in xrange(output_arity)
+  # When XLA computation returns only operations and no tensors, a NoOp
+  # dependent on the operations in outputs is returned. Otherwise final
+  # outputs would be empty and there is no way to trigger returned
+  # operations.
+  if not output_tensors:
+    return control_flow_ops.group(control_deps, name='output_0')
+
+  output_tensors = [
+      xla_ops.xla_cluster_output(o, name='output{}'.format(i))
+      for i, o in enumerate(output_tensors)
   ]
 
-  with ops.control_dependencies(output_operations):
-    if output_arity == 0:
-      # When XLA computation returns only operations and no tensors, a NoOp
-      # dependent on the operations in outputs is returned. Otherwise final
-      # outputs would be empty and there is no way to trigger returned
-      # operations.
-      return control_flow_ops.no_op(name='output_0')
-    else:
-      # Wraps the outputs in identity operators that carries control
-      # dependencies.
-      return [
-          array_ops.identity(outputs[i], name='output_%d' % i)
-          for i in xrange(output_arity)
-      ]
+  with ops.control_dependencies(control_deps):
+    # Wraps the outputs in identity operators that carries control
+    # dependencies.
+    output_tensors = [
+        array_ops.identity(o, name='output_%d' % i)
+        for i, o in enumerate(output_tensors)
+    ]
+
+  # If `computation` returned non-flat output structure, pack output tensors
+  # back into same structure.
+  if not outputs_is_flat:
+    output_tensors = nest.pack_sequence_as(
+        structure=outputs, flat_sequence=output_tensors)
+
+  return output_tensors
+
+
+def is_flat(outputs):
+  """Checks if outputs is a flat structure.
+
+    Following structures and values are considered flat:
+    1) None
+    2) A single object
+    3) A list or tuple of Tensors/Operations
+
+    The only structures that this function understands are sequences and
+    dictionaries.  E.g. this means that if outputs contains a single
+    user-defined Object, it is considered to be flat. Errors are raised later on
+    if that Object cannot be converted to a Tensor.
+
+  Args:
+    outputs: Output from `computation` inside `xla.compile`.
+
+  Returns:
+    A boolean indicates whether outputs is flat.
+  """
+  # If outputs is a list or tuple, check if it has any nested structure. If
+  # there is, then outputs is non-flat.
+  if isinstance(outputs, collections.Sequence):
+    for o in outputs:
+      if isinstance(o, collections.Sequence) or isinstance(o, dict):
+        return False
+
+  # If outputs is a dict, it is non-flat.
+  if isinstance(outputs, dict):
+    return False
+
+  # Getting here means either outputs itself is a single non-structured value
+  # or it is a flat list of single non-structured values.
+  return True
+
+
+def _postprocess_flat_outputs(outputs):
+  """Validates flat outputs and adds back device assignments.
+
+  Args:
+    outputs: Output from `computation` inside `xla.compile`.
+
+  Returns:
+    Tensors and Operations extracted from outputs.
+  """
+  # Following code segment is to preserve legacy behavior. Previously we only
+  # supported flat outputs and thus for consistency it was nice to convert even
+  # single element into a tuple. But now that we support arbitrary output
+  # structure, this is no longer necessary.
+  # TODO(b/121383831): Migrate all legacy use cases and delete this special
+  # case.
+  # If the computation returns `None`, make it an empty tuple.
+  if outputs is None:
+    outputs = tuple()
+  # If the computation only returned one value, make it a tuple.
+  if not isinstance(outputs, collections.Sequence):
+    outputs = (outputs,)
+
+  # Append `no_op` here so that return value of this function always contains
+  # at least one op that can trigger XlaLaunch node.
+  outputs += (control_flow_ops.no_op(),)
+  try:
+    outputs = [
+        o if isinstance(o, ops.Operation) else ops.convert_to_tensor(o)
+        for o in outputs
+    ]
+  except Exception as e:
+    raise ValueError(
+        'XLA computation function return values must all either be Operations'
+        ' or convertible to Tensors. Got error: "%s"' % str(e))
+
+  # Separates the returned Operations and Tensors.
+  output_operations = [o for o in outputs if isinstance(o, ops.Operation)]
+  output_tensors = [o for o in outputs if not isinstance(o, ops.Operation)]
+
+  if outputs != output_tensors + output_operations:
+    raise ValueError(
+        'XLA computation function must return zero or more Tensor values '
+        'followed by zero or more Operations.')
+
+  new_output_tensors = []
+  for t in output_tensors:
+    with ops.device(t.device if t.device else ''):
+      new_output_tensors.append(array_ops.identity(t))
+
+  return new_output_tensors, output_operations
+
+
+def _postprocess_non_flat_outputs(outputs):
+  """Validates non-flat outputs and adds back device assignments.
+
+  Args:
+    outputs: Output from `computation` inside `xla.compile`.
+
+  Returns:
+    Tensors extracted from outputs and an empty list because Operations are not
+    allowed in non-flat outputs..
+  """
+  # Convert all non-Operation outputs to Tensors.
+  new_output_tensors = []
+  for o in nest.flatten(outputs):
+    if isinstance(o, ops.Operation):
+      raise ValueError(
+          'xla.compile does not support Operation as return value in non-flat '
+          'output structure. You can set returned Operations as control '
+          'dependencies of returned Tensors so Operations are triggered when '
+          'Tensors are evaluated. Operation found: "%s"' % o.name)
+
+    try:
+      o = ops.convert_to_tensor(o)
+    except Exception as e:
+      raise ValueError(
+          'XLA computation function return values must all either be '
+          'Operations or convertible to Tensors. Got error: "%s"' % str(e))
+
+    # Makes sure even pass-through inputs/outputs are touched in compile
+    # context by creating an Identity node inside compile context.
+    with ops.device(o.device if o.device else ''):
+      new_output_tensors.append(array_ops.identity(o))
+
+  return new_output_tensors, []
 
 
 @contextlib.contextmanager
diff --git a/tensorflow/contrib/compiler/xla_test.py b/tensorflow/contrib/compiler/xla_test.py
index 3b49755afcf0753d31c0ce506dce42709b1ee8bc..c4384dcde75035dc55e67bd503e348fe19b97025 100644
--- a/tensorflow/contrib/compiler/xla_test.py
+++ b/tensorflow/contrib/compiler/xla_test.py
@@ -18,11 +18,19 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import re
+from absl.testing import parameterized
+
 from tensorflow.contrib.compiler import xla
+from tensorflow.contrib.tpu.python.tpu import tpu_estimator
 from tensorflow.contrib.tpu.python.tpu import tpu_feed
+from tensorflow.contrib.training.python.training import hparam
 from tensorflow.python import summary
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import logging_ops
@@ -30,6 +38,14 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import test
+from tensorflow.python.training import training
+
+
+_TRAIN = model_fn_lib.ModeKeys.TRAIN
+_EVAL = model_fn_lib.ModeKeys.EVAL
+_EXPECTED_LOSS = 1
+_EXPECTED_FEATURE = 2
+_EXPECTED_LABEL = 3
 
 
 class XLACompileContextTest(test.TestCase):
@@ -252,5 +268,329 @@ class CheckFunctionArgumentCountTest(test.TestCase):
                      xla.check_function_argument_count(func, 0, queue))
 
 
+def _test_train_model_fn(features, labels, mode, params):
+  """A dummy model_fn for testing purpose."""
+  del features, labels, params
+  loss = constant_op.constant(_EXPECTED_LOSS)
+  return model_fn_lib.EstimatorSpec(
+      mode=mode, loss=loss, train_op=array_ops.identity(loss))
+
+
+@xla.estimator_model_fn
+def decorated_model_fn(features, labels, mode, params):
+  return _test_train_model_fn(features, labels, mode, params)
+
+
+def make_dummy_features_labels():
+  # XLA CPU/GPU backend doesn't support guaranteed constant, thus use dataset
+  # container to work around.
+  features_dataset = dataset_ops.Dataset.from_tensors(
+      constant_op.constant(_EXPECTED_FEATURE)).repeat(10)
+  features_op = features_dataset.make_one_shot_iterator().get_next()
+  labels_dataset = dataset_ops.Dataset.from_tensors(
+      constant_op.constant(_EXPECTED_LABEL)).repeat(10)
+  labels_op = labels_dataset.make_one_shot_iterator().get_next()
+  return features_op, labels_op
+
+
+class XlaDecoratorTest(test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      ('test_use_as_decorator', decorated_model_fn, None),
+      ('test_use_as_function', xla.estimator_model_fn(_test_train_model_fn),
+       None),
+      ('test_use_tpu_false_hparams', decorated_model_fn,
+       hparam.HParams(use_tpu=False)),
+      ('test_use_tpu_false_dict_params', decorated_model_fn, {
+          'use_tpu': False
+      }),
+  )
+  def test_compile(self, model_fn, params):
+    """Calls model_fn and verifies it is compiled."""
+    with test.mock.patch.object(xla, 'compile') as mock_xla_compile:
+      loss = constant_op.constant(_EXPECTED_LOSS)
+      mock_xla_compile.return_value = [loss]
+
+      features, labels = make_dummy_features_labels()
+      estimator_spec = model_fn(
+          features=features, labels=labels, mode=_TRAIN, params=params or {})
+
+      self.assertEqual(mock_xla_compile.call_count, 1)
+      self.assertEqual(estimator_spec.mode, _TRAIN)
+
+      with self.test_session() as sess:
+        self.assertEqual(sess.run(estimator_spec.loss), sess.run(loss))
+        self.assertEqual(sess.run(estimator_spec.train_op), sess.run(loss))
+
+  @parameterized.named_parameters(
+      ('test_use_tpu_true_hparams', decorated_model_fn,
+       hparam.HParams(use_tpu=True)),
+      ('test_use_tpu_true_dict_params', decorated_model_fn, {
+          'use_tpu': True
+      }),
+  )
+  def test_not_compile(self, model_fn, params):
+    """Calls model_fn and verifies it is NOT compiled."""
+    with test.mock.patch.object(xla, 'compile') as mock_xla_compile:
+      loss = constant_op.constant(_EXPECTED_LOSS)
+      mock_xla_compile.return_value = [loss]
+
+      features, labels = make_dummy_features_labels()
+      estimator_spec = model_fn(
+          features=features, labels=labels, mode=_TRAIN, params=params or {})
+
+      mock_xla_compile.assert_not_called()
+      self.assertEqual(estimator_spec.mode, _TRAIN)
+
+      with self.test_session() as sess:
+        self.assertEqual(sess.run(estimator_spec.loss), sess.run(loss))
+        self.assertEqual(sess.run(estimator_spec.train_op), sess.run(loss))
+
+  def test_model_with_summary(self):
+    """Tests that summary ops are disabled."""
+
+    @xla.estimator_model_fn
+    def model_fn_with_summary(features, labels, mode, params):
+      del features, labels, params
+      loss = constant_op.constant(_EXPECTED_LOSS)
+      summary.scalar('loss_scalar_summary', loss)
+      summary.histogram('loss_histogram_summary', loss)
+      summary.image('loss_image_summary', loss)
+      return model_fn_lib.EstimatorSpec(
+          mode=mode, loss=loss, train_op=array_ops.identity(loss))
+
+    features, labels = make_dummy_features_labels()
+    estimator_spec = model_fn_with_summary(
+        features=features, labels=labels, mode=_TRAIN, params={})
+
+    with self.test_session() as sess:
+      self.assertEqual(sess.run(estimator_spec.loss), _EXPECTED_LOSS)
+
+
+def _test_eval_metric_fn(eval_tensor_1, eval_tensor_2):
+  return {
+      'metric_1': (eval_tensor_1, eval_tensor_1),
+      'metric_2': (eval_tensor_2, eval_tensor_2),
+  }
+
+
+class XlaDecoratorEvaluationTest(test.TestCase):
+
+  def _verify_evaluation_result(self, eval_model_fn):
+    features, labels = make_dummy_features_labels()
+    estimator_spec = eval_model_fn(
+        features=features, labels=labels, mode=_EVAL, params={})
+
+    with self.test_session() as sess:
+      self.assertEqual(sess.run(estimator_spec.loss), _EXPECTED_LOSS)
+      self.assertEqual(
+          sess.run(estimator_spec.eval_metric_ops['metric_1'][0]),
+          _EXPECTED_FEATURE + _EXPECTED_LABEL)
+      self.assertEqual(
+          sess.run(estimator_spec.eval_metric_ops['metric_1'][1]),
+          _EXPECTED_FEATURE + _EXPECTED_LABEL)
+      self.assertEqual(
+          sess.run(estimator_spec.eval_metric_ops['metric_2'][0]),
+          _EXPECTED_FEATURE - _EXPECTED_LABEL)
+      self.assertEqual(
+          sess.run(estimator_spec.eval_metric_ops['metric_2'][1]),
+          _EXPECTED_FEATURE - _EXPECTED_LABEL)
+
+  def test_eval_base_estimator_spec_eval_metric_ops_disallowed(self):
+
+    @xla.estimator_model_fn
+    def eval_model_fn_return_estimator_spec(features, labels, mode, params):
+      del features, labels, params
+      loss = constant_op.constant(_EXPECTED_LOSS)
+      return model_fn_lib.EstimatorSpec(
+          mode=mode,
+          loss=loss,
+          eval_metric_ops={
+              'metric': (array_ops.identity(loss), control_flow_ops.no_op())
+          })
+
+    with self.assertRaisesRegexp(
+        ValueError, 'EstimatorSpec.eval_metric_ops is not supported with XLA '
+        'compilation. Please use TPUEstimatorSpec.eval_metrics instead.'):
+      self._verify_evaluation_result(eval_model_fn_return_estimator_spec)
+
+  def test_eval_base_estimator_spec_no_eval_metric_ops(self):
+
+    @xla.estimator_model_fn
+    def eval_model_fn_no_eval_metric_ops(features, labels, mode, params):
+      del features, labels, params
+      return model_fn_lib.EstimatorSpec(
+          mode=mode, loss=constant_op.constant(_EXPECTED_LOSS))
+
+    features, labels = make_dummy_features_labels()
+    estimator_spec = eval_model_fn_no_eval_metric_ops(
+        features=features, labels=labels, mode=_EVAL, params={})
+    with self.test_session() as sess:
+      self.assertEqual(sess.run(estimator_spec.loss), _EXPECTED_LOSS)
+
+  def test_eval_no_eval_metrics(self):
+
+    @xla.estimator_model_fn
+    def eval_model_fn_no_eval_metrics(features, labels, mode, params):
+      del features, labels, params
+      return tpu_estimator.TPUEstimatorSpec(
+          mode=mode, loss=constant_op.constant(_EXPECTED_LOSS))
+
+    features, labels = make_dummy_features_labels()
+    estimator_spec = eval_model_fn_no_eval_metrics(
+        features=features, labels=labels, mode=_EVAL, params={})
+
+    self.assertEqual(estimator_spec.eval_metric_ops, {})
+    with self.test_session() as sess:
+      self.assertEqual(sess.run(estimator_spec.loss), _EXPECTED_LOSS)
+
+  def test_eval_fn_missing_input_tensor(self):
+
+    @xla.estimator_model_fn
+    def eval_model_fn(features, labels, mode, params):
+      del params
+      dummy_eval_metric_fn_tensors_dict = {
+          'eval_tensor_1': features + labels,
+      }
+      return tpu_estimator.TPUEstimatorSpec(
+          mode=mode,
+          loss=constant_op.constant(_EXPECTED_LOSS),
+          eval_metrics=(_test_eval_metric_fn,
+                        dummy_eval_metric_fn_tensors_dict))
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        re.escape("Arguments ['eval_tensor_2'] are needed by metric_fn (first "
+                  'element of TPUEstimatorSpec.eval_metrics) but they are not '
+                  'provided by evaluation tensors (second element of '
+                  'TPUEstimatorSpec.eval_metrics).')):
+      self._verify_evaluation_result(eval_model_fn)
+
+  def test_eval_fn_extraneous_input_tensor(self):
+
+    @xla.estimator_model_fn
+    def eval_model_fn(features, labels, mode, params):
+      del params
+      dummy_eval_metric_fn_tensors_dict = {
+          'eval_tensor_1': features + labels,
+          'eval_tensor_2': features - labels,
+          'extra_tensor': features * 2 - labels,
+      }
+      return tpu_estimator.TPUEstimatorSpec(
+          mode=mode,
+          loss=constant_op.constant(_EXPECTED_LOSS),
+          eval_metrics=(_test_eval_metric_fn,
+                        dummy_eval_metric_fn_tensors_dict))
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        re.escape("Arguments ['extra_tensor'] are provided by evaluation "
+                  'tensors (second element of TPUEstimatorSpec.eval_metrics) '
+                  'but they are not needed by metric_fn (first element of '
+                  'TPUEstimatorSpec.eval_metrics).')):
+      self._verify_evaluation_result(eval_model_fn)
+
+  def test_eval_tensors_as_list(self):
+
+    @xla.estimator_model_fn
+    def eval_model_fn(features, labels, mode, params):
+      del params
+      dummy_eval_metric_fn_tensors = [features + labels, features - labels]
+      return tpu_estimator.TPUEstimatorSpec(
+          mode=mode,
+          loss=constant_op.constant(_EXPECTED_LOSS),
+          eval_metrics=(_test_eval_metric_fn, dummy_eval_metric_fn_tensors))
+
+    self._verify_evaluation_result(eval_model_fn)
+
+  def test_eval_tensors_as_dict(self):
+
+    @xla.estimator_model_fn
+    def eval_model_fn(features, labels, mode, params):
+      del params
+      dummy_eval_metric_fn_tensors_dict = {
+          'eval_tensor_1': features + labels,
+          'eval_tensor_2': features - labels,
+      }
+      return tpu_estimator.TPUEstimatorSpec(
+          mode=mode,
+          loss=constant_op.constant(_EXPECTED_LOSS),
+          eval_metrics=(_test_eval_metric_fn,
+                        dummy_eval_metric_fn_tensors_dict))
+
+    self._verify_evaluation_result(eval_model_fn)
+
+  def test_model_with_summary(self):
+    """Tests that summary ops are disabled."""
+
+    @xla.estimator_model_fn
+    def model_fn_with_summary(features, labels, mode, params):
+      del features, labels, params
+      loss = constant_op.constant(_EXPECTED_LOSS)
+      summary.scalar('loss_scalar_summary', loss)
+      summary.histogram('loss_histogram_summary', loss)
+      summary.image('loss_image_summary', loss)
+      return tpu_estimator.TPUEstimatorSpec(mode=mode, loss=loss)
+
+    features, labels = make_dummy_features_labels()
+    estimator_spec = model_fn_with_summary(
+        features=features, labels=labels, mode=_EVAL, params={})
+
+    with self.test_session() as sess:
+      self.assertEqual(sess.run(estimator_spec.loss), _EXPECTED_LOSS)
+
+
+class XlaDecoratorScaffoldTest(test.TestCase, parameterized.TestCase):
+
+  def _make_scaffold_fn(self, mode):
+
+    def _scaffold_fn_on_cpu():
+      scaffold = training.Scaffold()
+      self.assertNotIn(mode, self.is_scaffold_fn_called)
+      self.is_scaffold_fn_called[mode] = True
+      return scaffold
+
+    return _scaffold_fn_on_cpu
+
+  def test_scaffold_fn_return_none(self):
+
+    @xla.estimator_model_fn
+    def model_fn(features, labels, mode, params):
+      del features, labels, params
+      return tpu_estimator.TPUEstimatorSpec(
+          mode=mode,
+          loss=constant_op.constant(_EXPECTED_LOSS),
+          train_op=control_flow_ops.no_op(),
+          scaffold_fn=lambda: None)
+
+    features, labels = make_dummy_features_labels()
+    with self.assertRaisesRegexp(
+        ValueError,
+        'TPUEstimatorSpec.scaffold_fn returns None, which is not allowed'):
+      model_fn(features=features, labels=labels, mode=_TRAIN, params={})
+
+  @parameterized.named_parameters(
+      ('train_mode', _TRAIN),
+      ('eval_mode', _EVAL),
+      # TODO(ycao): Add predict_mode test after PREDICT mode is implemented.
+  )
+  def test_scaffold_fn_in_mode(self, mode):
+
+    @xla.estimator_model_fn
+    def model_fn(features, labels, mode, params):
+      del features, labels, params
+      return tpu_estimator.TPUEstimatorSpec(
+          mode=mode,
+          loss=constant_op.constant(_EXPECTED_LOSS),
+          train_op=control_flow_ops.no_op(),
+          scaffold_fn=self._make_scaffold_fn(mode))
+
+    features, labels = make_dummy_features_labels()
+
+    self.is_scaffold_fn_called = {}
+    model_fn(features=features, labels=labels, mode=mode, params={})
+    self.assertTrue(self.is_scaffold_fn_called[mode])
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/constrained_optimization/README.md b/tensorflow/contrib/constrained_optimization/README.md
index cb1dd7d836ae11700b2ffaaff4fda5b7f943f87d..7ffb6894d37444fd78015b6c124c46f2855c1cde 100644
--- a/tensorflow/contrib/constrained_optimization/README.md
+++ b/tensorflow/contrib/constrained_optimization/README.md
@@ -1,5 +1,10 @@
 <!-- TODO(acotter): Add usage example of non-convex optimization and stochastic classification. -->
 
+**NOTE: As tensorflow.contrib is being
+[deprecated](https://github.com/tensorflow/community/pull/18), TFCO is moving to
+its own repository on
+[github](https://github.com/google-research/tensorflow_constrained_optimization).**
+
 # ConstrainedOptimization (TFCO)
 
 TFCO is a library for optimizing inequality-constrained problems in TensorFlow.
diff --git a/tensorflow/contrib/constrained_optimization/python/candidates_test.py b/tensorflow/contrib/constrained_optimization/python/candidates_test.py
index a4c49d48bc5c763489215261a909573af0f19055..280e9acd88638a9385bfd9128ba6d3739879aab2 100644
--- a/tensorflow/contrib/constrained_optimization/python/candidates_test.py
+++ b/tensorflow/contrib/constrained_optimization/python/candidates_test.py
@@ -52,12 +52,12 @@ class CandidatesTest(test.TestCase):
     distribution = candidates.find_best_candidate_distribution(
         objective_vector, constraints_matrix)
     # Verify that the solution is a probability distribution.
-    self.assertTrue(np.all(distribution >= 0))
+    self.assertTrue(np.all(distribution >= -1e-6))
     self.assertAlmostEqual(np.sum(distribution), 1.0)
     # Verify that the solution satisfies the constraints.
     maximum_constraint_violation = np.amax(
         np.dot(constraints_matrix, distribution))
-    self.assertLessEqual(maximum_constraint_violation, 0)
+    self.assertLessEqual(maximum_constraint_violation, 1e-6)
     # Verify that the solution matches that which we expect.
     expected_distribution = np.array([0.37872711, 0.62127289, 0, 0])
     self.assertAllClose(expected_distribution, distribution, rtol=0, atol=1e-6)
diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py
index a268415f0e65206294431a537be18cadbe1a1e84..f5219eb134d07c09b16a544f71d4c18986c19681 100644
--- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py
+++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py
@@ -68,6 +68,7 @@ def RunLSTM(sess,
             batch_size,
             time,
             num_layers=1,
+            variable_seq_lengths=False,
             is_training=True,
             dropout=0.,
             num_dirs=True,
@@ -99,6 +100,13 @@ def RunLSTM(sess,
                                  num_units).astype(dtype.as_numpy_dtype),
       dtype=dtype)
 
+  if variable_seq_lengths:
+    lengths_v = np.random.randint(low=1, high=time + 1, size=batch_size)
+    lengths_v[0] = time  # make sure the max sequence has 'time' elems
+    lengths = ops.convert_to_tensor(lengths_v.astype(np.int32))
+  else:
+    lengths = None
+
   initializer = init_ops.random_uniform_initializer(
       -0.01, 0.01, dtype=dtype, seed=19980904)
 
@@ -115,6 +123,7 @@ def RunLSTM(sess,
     outputs_op, state_tuple_op = rnn.dynamic_rnn(
         cell,
         inputs,
+        sequence_length=lengths,
         initial_state=rnn_cell_impl.LSTMStateTuple(
             h=initial_h_op, c=initial_c_op),
         dtype=dtype,
@@ -133,6 +142,7 @@ def RunLSTM(sess,
       cu_initial_h_op,
       cu_initial_c_op,
       opaque_params,
+      sequence_lengths=lengths,
       dropout=dropout,
       is_training=is_training,
       rnn_mode=cudnn_rnn_ops.CUDNN_LSTM)
@@ -325,12 +335,19 @@ class CudnnLSTMTest(TensorFlowTestCase, parameterized.TestCase):
                             time,
                             num_layers,
                             dtype,
-                            rtol=2e-6,
-                            atol=2e-6):
+                            variable_seq_lengths,
+                            rtol=3e-6,
+                            atol=3e-6):
     with self.session(use_gpu=True) as sess:
       (outputs, cu_outputs, state_tuple, cu_state_tuple, inp_grad, cu_inp_grad,
        state_grad, cu_state_grad, wgrad, bgrad, cu_wgrad, cu_bgrad) = RunLSTM(
-           sess, num_units, input_size, batch_size, time, num_layers)
+           sess,
+           num_units,
+           input_size,
+           batch_size,
+           time,
+           num_layers,
+           variable_seq_lengths=variable_seq_lengths)
 
       self.assertAllClose(outputs, cu_outputs, rtol=rtol, atol=atol)
       for s, cu_s in zip(state_tuple, cu_state_tuple):
@@ -341,20 +358,33 @@ class CudnnLSTMTest(TensorFlowTestCase, parameterized.TestCase):
       self.assertAllClose(bgrad, cu_bgrad, rtol=rtol, atol=atol)
       self.assertAllClose(wgrad, cu_wgrad, rtol=rtol, atol=atol)
 
-  @parameterized.named_parameters(*NAMED_RNN_TESTCASES)
+  @parameterized.named_parameters(
+      ExpandNamedTestCases(NAMED_RNN_TESTCASES, **{
+          "variable_seq_lengths": [True, False],
+      }))
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
-  def test_training(self, num_units, input_size, batch_size, time, num_layers):
+  def test_training(self, num_units, input_size, batch_size, time, num_layers,
+                    variable_seq_lengths):
     if not context.context().num_gpus():
       self.skipTest("No GPUs found")
-    self._test_training_helper(num_units, input_size, batch_size, time,
-                               num_layers, dtypes.float32)
+    self._test_training_helper(
+        num_units,
+        input_size,
+        batch_size,
+        time,
+        num_layers,
+        dtypes.float32,
+        variable_seq_lengths=variable_seq_lengths)
 
-  @parameterized.named_parameters(*NAMED_RNN_TESTCASES)
+  @parameterized.named_parameters(
+      ExpandNamedTestCases(NAMED_RNN_TESTCASES, **{
+          "variable_seq_lengths": [True, False],
+      }))
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
   def test_training_fp16(self, num_units, input_size, batch_size, time,
-                         num_layers):
+                         num_layers, variable_seq_lengths):
     if not context.context().num_gpus():
       self.skipTest("No GPUs found")
     self._test_training_helper(
@@ -365,12 +395,17 @@ class CudnnLSTMTest(TensorFlowTestCase, parameterized.TestCase):
         num_layers,
         dtypes.float16,
         rtol=5e-3,
-        atol=5e-4)
+        atol=5e-4,
+        variable_seq_lengths=variable_seq_lengths)
 
-  @parameterized.named_parameters(*NAMED_RNN_TESTCASES)
+  @parameterized.named_parameters(
+      ExpandNamedTestCases(NAMED_RNN_TESTCASES, **{
+          "variable_seq_lengths": [True, False],
+      }))
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
-  def test_inference(self, num_units, input_size, batch_size, time, num_layers):
+  def test_inference(self, num_units, input_size, batch_size, time, num_layers,
+                     variable_seq_lengths):
     if not context.context().num_gpus():
       self.skipTest("No GPUs found")
     with self.session(use_gpu=True) as sess:
@@ -381,7 +416,8 @@ class CudnnLSTMTest(TensorFlowTestCase, parameterized.TestCase):
           batch_size,
           time,
           num_layers,
-          is_training=False)
+          is_training=False,
+          variable_seq_lengths=variable_seq_lengths)
 
       self.assertAllClose(outputs, cu_outputs)
       # h
@@ -389,11 +425,14 @@ class CudnnLSTMTest(TensorFlowTestCase, parameterized.TestCase):
       # c
       self.assertAllClose(state_tuple.c, cu_state_tuple.c)
 
-  @parameterized.named_parameters(*NAMED_RNN_TESTCASES)
+  @parameterized.named_parameters(
+      ExpandNamedTestCases(NAMED_RNN_TESTCASES, **{
+          "variable_seq_lengths": [True, False],
+      }))
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
   def test_inference_fp16(self, num_units, input_size, batch_size, time,
-                          num_layers):
+                          num_layers, variable_seq_lengths):
     if not context.context().num_gpus():
       self.skipTest("No GPUs found")
     with self.session(use_gpu=True) as sess:
@@ -405,7 +444,8 @@ class CudnnLSTMTest(TensorFlowTestCase, parameterized.TestCase):
           time,
           num_layers,
           is_training=False,
-          dtype=dtypes.float16)
+          dtype=dtypes.float16,
+          variable_seq_lengths=variable_seq_lengths)
 
       rtol, atol = 5e-3, 5e-4
       self.assertAllClose(outputs, cu_outputs, rtol=rtol, atol=atol)
@@ -416,11 +456,14 @@ class CudnnLSTMTest(TensorFlowTestCase, parameterized.TestCase):
       self.assertAllClose(
           state_tuple.c, cu_state_tuple.c, rtol=rtol, atol=atol)
 
-  @parameterized.named_parameters(*NAMED_RNN_TESTCASES)
+  @parameterized.named_parameters(
+      ExpandNamedTestCases(NAMED_RNN_TESTCASES, **{
+          "variable_seq_lengths": [True, False],
+      }))
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
   def test_inference_with_dropout(self, num_units, input_size, batch_size, time,
-                                  num_layers):
+                                  num_layers, variable_seq_lengths):
     """Validates that dropout does not affect Cudnn Rnn inference."""
     if not context.context().num_gpus():
       self.skipTest("No GPUs found")
@@ -436,7 +479,8 @@ class CudnnLSTMTest(TensorFlowTestCase, parameterized.TestCase):
             time,
             num_layers,
             is_training=False,
-            dropout=0.)
+            dropout=0.,
+            variable_seq_lengths=variable_seq_lengths)
 
     with ops.Graph().as_default() as g:
       with self.session(use_gpu=True, graph=g) as sess:
@@ -448,7 +492,8 @@ class CudnnLSTMTest(TensorFlowTestCase, parameterized.TestCase):
             time,
             num_layers,
             is_training=False,
-            dropout=1.)
+            dropout=1.,
+            variable_seq_lengths=variable_seq_lengths)
 
     self.assertAllClose(cu_outputs, cu_outputs2)
     # h
@@ -464,6 +509,7 @@ def RunGRU(sess,
            time,
            num_layers=1,
            is_training=True,
+           variable_seq_lengths=False,
            dropout=0.,
            num_dirs=True,
            dtype=dtypes.float32):
@@ -489,6 +535,13 @@ def RunGRU(sess,
                                  num_units).astype(dtype.as_numpy_dtype),
       dtype=dtype)
 
+  if variable_seq_lengths:
+    lengths_v = np.random.randint(low=1, high=time + 1, size=batch_size)
+    lengths_v[0] = time  # make sure the max sequence has 'time' elems
+    lengths = ops.convert_to_tensor(lengths_v.astype(np.int32))
+  else:
+    lengths = None
+
   initializer = init_ops.random_uniform_initializer(
       -0.01, 0.01, dtype=dtype, seed=19980904)
   with variable_scope.variable_scope("test", initializer=initializer):
@@ -521,6 +574,7 @@ def RunGRU(sess,
     outputs_op, h_op = rnn.dynamic_rnn(
         cell,
         inputs,
+        sequence_length=lengths,
         initial_state=initial_h_op,
         dtype=dtype,
         time_major=True,
@@ -533,12 +587,14 @@ def RunGRU(sess,
       num_layers, num_units, input_size)
   opaque_params = format_converter.tf_canonical_to_opaque(ws + bs)
 
+
   cu_initial_h_op = array_ops.expand_dims(initial_h_op, axis=0)
   cu_outputs_op, cu_h_op, _ = cudnn_rnn_ops._cudnn_rnn(
       inputs,
       cu_initial_h_op,
       array_ops.zeros_like(cu_initial_h_op),  # not used
       opaque_params,
+      sequence_lengths=lengths,
       dropout=dropout,
       is_training=is_training,
       rnn_mode=cudnn_rnn_ops.CUDNN_GRU)
@@ -615,12 +671,19 @@ class CudnnGRUTest(TensorFlowTestCase, parameterized.TestCase):
                             time,
                             num_layers,
                             dtype,
-                            rtol=2e-6,
-                            atol=2e-6):
+                            variable_seq_lengths,
+                            rtol=3e-6,
+                            atol=3e-6):
     with self.session(use_gpu=True) as sess:
-      (outputs, cu_outputs, h, cu_h, inp_grad, cu_inp_grad, hgrad,
-       cu_hgrad, wgrad, bgrad, cu_wgrad, cu_bgrad) = RunGRU(
-           sess, num_units, input_size, batch_size, time, num_layers)
+      (outputs, cu_outputs, h, cu_h, inp_grad, cu_inp_grad, hgrad, cu_hgrad,
+       wgrad, bgrad, cu_wgrad, cu_bgrad) = RunGRU(
+           sess,
+           num_units,
+           input_size,
+           batch_size,
+           time,
+           num_layers,
+           variable_seq_lengths=variable_seq_lengths)
 
       self.assertAllClose(outputs, cu_outputs, rtol=rtol, atol=atol)
       self.assertAllClose(h, cu_h, rtol=rtol, atol=atol)
@@ -631,20 +694,33 @@ class CudnnGRUTest(TensorFlowTestCase, parameterized.TestCase):
       for wg, cu_wg in zip(wgrad, cu_wgrad):
         self.assertAllClose(wg, cu_wg, rtol=rtol, atol=atol)
 
-  @parameterized.named_parameters(*NAMED_RNN_TESTCASES)
+  @parameterized.named_parameters(
+      ExpandNamedTestCases(NAMED_RNN_TESTCASES, **{
+          "variable_seq_lengths": [True, False],
+      }))
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
-  def test_training(self, num_units, input_size, batch_size, time, num_layers):
+  def test_training(self, num_units, input_size, batch_size, time, num_layers,
+                    variable_seq_lengths):
     if not context.context().num_gpus():
       self.skipTest("No GPUs found")
-    self._test_training_helper(num_units, input_size, batch_size, time,
-                               num_layers, dtypes.float32)
+    self._test_training_helper(
+        num_units,
+        input_size,
+        batch_size,
+        time,
+        num_layers,
+        dtypes.float32,
+        variable_seq_lengths=variable_seq_lengths)
 
-  @parameterized.named_parameters(*NAMED_RNN_TESTCASES)
+  @parameterized.named_parameters(
+      ExpandNamedTestCases(NAMED_RNN_TESTCASES, **{
+          "variable_seq_lengths": [True, False],
+      }))
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
   def test_training_fp16(self, num_units, input_size, batch_size, time,
-                         num_layers):
+                         num_layers, variable_seq_lengths):
     if not context.context().num_gpus():
       self.skipTest("No GPUs found")
     self._test_training_helper(
@@ -655,12 +731,17 @@ class CudnnGRUTest(TensorFlowTestCase, parameterized.TestCase):
         num_layers,
         dtypes.float16,
         rtol=5e-3,
-        atol=5e-4)
+        atol=5e-4,
+        variable_seq_lengths=variable_seq_lengths)
 
-  @parameterized.named_parameters(*NAMED_RNN_TESTCASES)
+  @parameterized.named_parameters(
+      ExpandNamedTestCases(NAMED_RNN_TESTCASES, **{
+          "variable_seq_lengths": [True, False],
+      }))
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
-  def test_inference(self, num_units, input_size, batch_size, time, num_layers):
+  def test_inference(self, num_units, input_size, batch_size, time, num_layers,
+                     variable_seq_lengths):
     if not context.context().num_gpus():
       self.skipTest("No GPUs found")
     with self.session(use_gpu=True) as sess:
@@ -671,15 +752,19 @@ class CudnnGRUTest(TensorFlowTestCase, parameterized.TestCase):
           batch_size,
           time,
           num_layers,
-          is_training=False)
+          is_training=False,
+          variable_seq_lengths=variable_seq_lengths)
       self.assertAllClose(outputs, cu_outputs)
       self.assertAllClose(h, cu_h)
 
-  @parameterized.named_parameters(*NAMED_RNN_TESTCASES)
+  @parameterized.named_parameters(
+      ExpandNamedTestCases(NAMED_RNN_TESTCASES, **{
+          "variable_seq_lengths": [True, False],
+      }))
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
   def test_inference_fp16(self, num_units, input_size, batch_size, time,
-                          num_layers):
+                          num_layers, variable_seq_lengths):
     if not context.context().num_gpus():
       self.skipTest("No GPUs found")
     with self.session(use_gpu=True) as sess:
@@ -691,17 +776,21 @@ class CudnnGRUTest(TensorFlowTestCase, parameterized.TestCase):
           time,
           num_layers,
           is_training=False,
-          dtype=dtypes.float16)
+          dtype=dtypes.float16,
+          variable_seq_lengths=variable_seq_lengths)
 
       rtol, atol = 5e-3, 5e-4
       self.assertAllClose(outputs, cu_outputs, rtol=rtol, atol=atol)
       self.assertAllClose(h, cu_h, rtol=rtol, atol=atol)
 
-  @parameterized.named_parameters(*NAMED_RNN_TESTCASES)
+  @parameterized.named_parameters(
+      ExpandNamedTestCases(NAMED_RNN_TESTCASES, **{
+          "variable_seq_lengths": [True, False],
+      }))
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
   def test_inference_with_dropout(self, num_units, input_size, batch_size, time,
-                                  num_layers):
+                                  num_layers, variable_seq_lengths):
     """Validates that dropout does not affect Cudnn Rnn inference."""
     # Hand-picked dropouts are used below (0. and 1.)
     if not context.context().num_gpus():
@@ -717,7 +806,8 @@ class CudnnGRUTest(TensorFlowTestCase, parameterized.TestCase):
             time,
             num_layers,
             is_training=False,
-            dropout=0.)
+            dropout=0.,
+            variable_seq_lengths=variable_seq_lengths)
 
     with ops.Graph().as_default() as g:
       with self.session(use_gpu=True, graph=g) as sess:
@@ -729,7 +819,8 @@ class CudnnGRUTest(TensorFlowTestCase, parameterized.TestCase):
             time,
             num_layers,
             is_training=False,
-            dropout=1.)
+            dropout=1.,
+            variable_seq_lengths=variable_seq_lengths)
 
     self.assertAllClose(cu_outputs, cu_outputs2)
     self.assertAllClose(cu_h[0], cu_h2[0])
diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
index 7e1b4062ce435f3ab4216e90b4f5fcbab984c1dc..403f30909520dc5cd5f5919af843291fe1400b91 100644
--- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
+++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
@@ -58,7 +58,7 @@ from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import momentum
 from tensorflow.python.training import rmsprop
 from tensorflow.python.training import saver as saver_lib
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import util as trackable_utils
 
 
 CUDNN_LSTM = cudnn_rnn_ops.CUDNN_LSTM
@@ -709,7 +709,7 @@ class CudnnRNNTestSaveRestore(test_util.TensorFlowTestCase):
     self._TestSaveRestoreHelper(CUDNN_RNN_RELU)
 
 
-class CudnnRNNTestSaveRestoreCheckpointable(test_util.TensorFlowTestCase):
+class CudnnRNNTestSaveRestoreTrackable(test_util.TensorFlowTestCase):
 
   def _VerifyCheckpoint(
       self, checkpoint_path, compatible_cell_fn, cudnn_cell_fn,
@@ -718,7 +718,7 @@ class CudnnRNNTestSaveRestoreCheckpointable(test_util.TensorFlowTestCase):
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
     with ops.device("gpu:0"):
       cudnn_layer = cudnn_cell_fn()
-      cudnn_checkpoint = checkpointable_utils.Checkpoint(cell=cudnn_layer)
+      cudnn_checkpoint = trackable_utils.Checkpoint(cell=cudnn_layer)
       status = cudnn_checkpoint.restore(checkpoint_path)
       inputs = 3. * array_ops.ones([num_applications, num_layers, input_size],
                                    dtype=dtypes.float32)
@@ -726,7 +726,7 @@ class CudnnRNNTestSaveRestoreCheckpointable(test_util.TensorFlowTestCase):
       status.run_restore_ops()
     second_save_path = cudnn_checkpoint.save(checkpoint_prefix)
     restore_layer = compatible_cell_fn()
-    restore_layer_checkpoint = checkpointable_utils.Checkpoint(
+    restore_layer_checkpoint = trackable_utils.Checkpoint(
         cell=restore_layer)
     status = restore_layer_checkpoint.restore(second_save_path)
     current_state = restore_layer.zero_state(1, dtypes.float32)
@@ -742,7 +742,7 @@ class CudnnRNNTestSaveRestoreCheckpointable(test_util.TensorFlowTestCase):
     self.assertAllClose(self.evaluate(restore_layer_output),
                         self.evaluate(cudnn_output)[-1, -1:, ...])
 
-  def _CheckpointableSingleCellUnidirectionalTestTemplate(
+  def _TrackableSingleCellUnidirectionalTestTemplate(
       self, single_cell_fn, cudnn_cell_fn):
     # Single-layer cuDNN cells with object-based checkpointing should be
     # checkpoint compatible with either single CudnnCompatible cells or
@@ -759,7 +759,7 @@ class CudnnRNNTestSaveRestoreCheckpointable(test_util.TensorFlowTestCase):
       value = np.random.normal(size=variable.shape)
       expected_values.append(value)
       self.evaluate(variable.assign(value))
-    save_checkpoint = checkpointable_utils.Checkpoint(cell=save_cell_layer)
+    save_checkpoint = trackable_utils.Checkpoint(cell=save_cell_layer)
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
     first_save_path = save_checkpoint.save(checkpoint_prefix)
@@ -775,10 +775,10 @@ class CudnnRNNTestSaveRestoreCheckpointable(test_util.TensorFlowTestCase):
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
   @test_util.run_in_graph_and_eager_modes
-  def testLSTMCheckpointableSingleLayer(self):
+  def testLSTMTrackableSingleLayer(self):
     num_units = 2
     direction = CUDNN_RNN_UNIDIRECTION
-    self._CheckpointableSingleCellUnidirectionalTestTemplate(
+    self._TrackableSingleCellUnidirectionalTestTemplate(
         single_cell_fn=functools.partial(
             cudnn_rnn_ops.CudnnCompatibleLSTMCell, num_units=num_units),
         cudnn_cell_fn=functools.partial(
@@ -788,19 +788,19 @@ class CudnnRNNTestSaveRestoreCheckpointable(test_util.TensorFlowTestCase):
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
   @test_util.run_in_graph_and_eager_modes
-  def testGRUCheckpointableSingleLayer(self):
+  def testGRUTrackableSingleLayer(self):
     num_units = 2
     direction = CUDNN_RNN_UNIDIRECTION
     with self.assertRaises(NotImplementedError):
       # TODO(allenl): Implement object-based saving for GRUs and other cells.
-      self._CheckpointableSingleCellUnidirectionalTestTemplate(
+      self._TrackableSingleCellUnidirectionalTestTemplate(
           single_cell_fn=functools.partial(
               cudnn_rnn_ops.CudnnCompatibleGRUCell, num_units=num_units),
           cudnn_cell_fn=functools.partial(
               cudnn_rnn.CudnnGRU, num_layers=1, num_units=num_units,
               direction=direction, name="awesome_gru"))
 
-  def _CheckpointableMultiLayerTestTemplate(
+  def _TrackableMultiLayerTestTemplate(
       self, single_cell_fn, cudnn_cell_fn, num_layers):
 
     def _MultiCellFn():
@@ -819,7 +819,7 @@ class CudnnRNNTestSaveRestoreCheckpointable(test_util.TensorFlowTestCase):
         value = np.random.normal(size=variable.shape)
         expected_values.append(value)
         self.evaluate(variable.assign(value))
-      save_checkpoint = checkpointable_utils.Checkpoint(cell=save_layer)
+      save_checkpoint = trackable_utils.Checkpoint(cell=save_layer)
       checkpoint_directory = self.get_temp_dir()
       checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
       first_save_path = save_checkpoint.save(checkpoint_prefix)
@@ -837,7 +837,7 @@ class CudnnRNNTestSaveRestoreCheckpointable(test_util.TensorFlowTestCase):
     num_units = 2
     num_layers = 3
     direction = CUDNN_RNN_UNIDIRECTION
-    self._CheckpointableMultiLayerTestTemplate(
+    self._TrackableMultiLayerTestTemplate(
         single_cell_fn=functools.partial(
             cudnn_rnn_ops.CudnnCompatibleLSTMCell, num_units=num_units),
         cudnn_cell_fn=functools.partial(
@@ -1023,7 +1023,7 @@ class CudnnRNNTestCompatibleRNNCells(test_util.TensorFlowTestCase):
           outputs_v, output_state_v = sess.run(
               [outputs, output_state],
               feed_dict={cell_inputs: inference_input})
-          self.assertAllClose(cudnn_outputs_v, outputs_v, atol=2e-5, rtol=2e-5)
+          self.assertAllClose(cudnn_outputs_v, outputs_v, atol=1e-4, rtol=2e-4)
           (cudnn_output_h_v,) = cudnn_output_states_v
           self.assertAllClose(cudnn_output_h_v, output_state_v, atol=2e-5,
                               rtol=2e-5)
diff --git a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
index 8e25637ed91a1559b321ea96efbfaa2910f67158..1cb477716dfc6a9cc793939059784f9d89bcdd8a 100644
--- a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
+++ b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
@@ -374,7 +374,11 @@ class _CudnnRNN(base_layer.Layer):
         "This cell does not yet support object-based saving. File a feature "
         "request if this limitation bothers you.")
 
-  def call(self, inputs, initial_state=None, training=True):
+  def call(self,
+           inputs,
+           initial_state=None,
+           sequence_lengths=None,
+           training=True):
     """Runs the forward step for the RNN model.
 
     Args:
@@ -382,6 +386,9 @@ class _CudnnRNN(base_layer.Layer):
       initial_state: a tuple of tensor(s) of shape
         `[num_layers * num_dirs, batch_size, num_units]`. If not provided, use
         zero initial states. The tuple size is 2 for LSTM and 1 for other RNNs.
+      sequence_lengths: an int32 array representing the variable sequence
+        lengths in a batch. The size of the array has to equal the
+        batch_size. If not provided, the same sequence length will be assumed.
       training: whether this operation will be used in training or inference.
     Returns:
       output: a tensor of shape `[time_len, batch_size, num_dirs * num_units]`.
@@ -411,7 +418,7 @@ class _CudnnRNN(base_layer.Layer):
       # For model that doesn't take input_c, replace with a dummy tensor.
       c = array_ops.constant([], dtype=dtype)
     outputs, (output_h, output_c) = self._forward(inputs, h, c, self.kernel,
-                                                  training)
+                                                  sequence_lengths, training)
     if self._rnn_mode == CUDNN_LSTM:
       return outputs, (output_h, output_c)
     else:
@@ -475,7 +482,7 @@ class _CudnnRNN(base_layer.Layer):
           dropout=self._dropout,
           direction=self._direction)
 
-  def _forward(self, inputs, h, c, opaque_params, training):
+  def _forward(self, inputs, h, c, opaque_params, sequence_lengths, training):
     output, output_h, output_c = cudnn_rnn_ops._cudnn_rnn(  # pylint:disable=protected-access
         inputs,
         h,
@@ -483,6 +490,7 @@ class _CudnnRNN(base_layer.Layer):
         opaque_params,
         training,
         self._rnn_mode,
+        sequence_lengths=sequence_lengths,
         input_mode=self._input_mode,
         direction=self._direction,
         dropout=self._dropout,
@@ -510,8 +518,8 @@ class _CudnnRNN(base_layer.Layer):
         direction=self.direction,
         scope=vs.get_variable_scope(),
         name="%s_saveable" % self.trainable_variables[0].name.split(":")[0])
-    self._saveable._add_checkpointable_dependencies(  # pylint: disable=protected-access
-        checkpointable=self, dtype=self._plain_dtype)
+    self._saveable._add_trackable_dependencies(  # pylint: disable=protected-access
+        trackable=self, dtype=self._plain_dtype)
     ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, self._saveable)
 
 
diff --git a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
index 1ce29b42d52ff67477161278ed11016c2e73041d..7d848e2ec2d99cd2a78ff3e813207c0cd5bb97cf 100644
--- a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
+++ b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
@@ -33,7 +33,7 @@ from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.training import saver
-from tensorflow.python.training.checkpointable import tracking as checkpointable_lib
+from tensorflow.python.training.tracking import tracking as trackable_lib
 
 CUDNN_RNN_UNIDIRECTION = "unidirectional"
 CUDNN_RNN_BIDIRECTION = "bidirectional"
@@ -737,13 +737,13 @@ class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
     return state_ops.assign(
         self._variables, opaque_params, validate_shape=False)
 
-  def _checkpointable_save(self, save_buffer):
+  def _trackable_save(self, save_buffer):
     weights, biases = self.format_converter.opaque_to_tf_canonical(
         self._variables)
     for name, tensor in zip(self._param_names, weights + biases):
       save_buffer[name] = array_ops.identity(tensor)
 
-  def _checkpointable_restore(self, restore_buffer):
+  def _trackable_restore(self, restore_buffer):
     tensors = [
         array_ops.identity(restore_buffer[name]) for name in self._param_names
     ]
@@ -752,26 +752,26 @@ class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
         restored_shapes=None  # Unused
     )
 
-  def _add_checkpointable_dependencies(self, checkpointable, dtype):
-    """Add canonical weight dependencies to `checkpointable`.
+  def _add_trackable_dependencies(self, trackable, dtype):
+    """Add canonical weight dependencies to `trackable`.
 
     When saving or restoring, converts to or from the opaque buffer
     format. Weights are saved and loaded in the configuration expected by
     cuDNN-compatible cells.
 
     Args:
-      checkpointable: An object inheriting from `CheckpointableBase` to add
+      trackable: An object inheriting from `Trackable` to add
         dependencies too (typically the cuDNN `Layer`).
       dtype: The dtype for the canonical parameter Tensors.
     """
     split_dependencies = split_dependency.split_dependency(
         component_names=self._param_names,
         component_dtypes=(dtype,) * len(self._param_names),
-        fill_save_buffer_fn=self._checkpointable_save,
-        consume_restore_buffer_fn=self._checkpointable_restore)
-    self._checkpointable_track_params(checkpointable, split_dependencies)
+        fill_save_buffer_fn=self._trackable_save,
+        consume_restore_buffer_fn=self._trackable_restore)
+    self._trackable_track_params(trackable, split_dependencies)
 
-  def _checkpointable_track_params(self, checkpointable, params):
+  def _trackable_track_params(self, trackable, params):
     """Tracks parameters in a canonical configuration."""
     return  # NotImplementedError raised by the Layer.
 
@@ -819,7 +819,7 @@ class CudnnLSTMSaveable(CudnnOpaqueParamsSaveable):
     tf_weights_names.append(prefix + "/kernel")
     tf_bias_names.append(prefix + "/bias")
 
-  def _checkpointable_track_params(self, checkpointable, params):
+  def _trackable_track_params(self, trackable, params):
     """Track parameters for compatibility with CudnnCompatibleLSTMCell."""
     biases = []
     weights = []
@@ -833,12 +833,12 @@ class CudnnLSTMSaveable(CudnnOpaqueParamsSaveable):
       # wrapping.
       kernel, = weights  # pylint: disable=unbalanced-tuple-unpacking
       bias, = biases  # pylint: disable=unbalanced-tuple-unpacking
-      checkpointable._track_checkpointable(kernel, name="kernel")  # pylint: disable=protected-access
-      checkpointable._track_checkpointable(bias, name="bias")  # pylint: disable=protected-access
+      trackable._track_trackable(kernel, name="kernel")  # pylint: disable=protected-access
+      trackable._track_trackable(bias, name="bias")  # pylint: disable=protected-access
     assert len(biases) == len(weights)
     for cell_index, (bias, kernel) in enumerate(zip(biases, weights)):
-      cell = checkpointable_lib.Checkpointable()
-      checkpointable._track_checkpointable(cell, name="cell-%d" % cell_index)  # pylint: disable=protected-access
+      cell = trackable_lib.AutoTrackable()
+      trackable._track_trackable(cell, name="cell-%d" % cell_index)  # pylint: disable=protected-access
       cell.bias = bias
       cell.kernel = kernel
 
@@ -955,6 +955,7 @@ def _cudnn_rnn(inputs,
                params,
                is_training,
                rnn_mode,
+               sequence_lengths=None,
                input_mode=CUDNN_INPUT_LINEAR_MODE,
                direction=CUDNN_RNN_UNIDIRECTION,
                dropout=0.,
@@ -972,6 +973,10 @@ def _cudnn_rnn(inputs,
     params: the parameter buffer created for this model.
     is_training: whether this operation will be used in training or inference
     rnn_mode: one of ('lstm', 'gru', 'rnn_relu', 'rnn_tanh').
+    sequence_lengths: an int32 array representing the variable sequence lengths
+      in a batch. The size of the array has to equal the batch_size. Default to
+      None, in which case sequences in the batch are assumed to have the same
+      length, which is inferred from inputs.
     input_mode: indicate whether there is a linear projection between the
       input and the actual computation before the first layer. It could be
       'linear_input', 'skip_input' or 'auto_select'.
@@ -1010,7 +1015,10 @@ def _cudnn_rnn(inputs,
       "seed2": seed2,
       "name": name
   }
-  if use_cudnn_v2 != "1":
+  if sequence_lengths is not None:
+    args["sequence_lengths"] = sequence_lengths
+    outputs, output_h, output_c, _, _ = gen_cudnn_rnn_ops.cudnn_rnnv3(**args)
+  elif use_cudnn_v2 != "1":
     outputs, output_h, output_c, _ = gen_cudnn_rnn_ops.cudnn_rnn(**args)
   else:
     outputs, output_h, output_c, _, _ = gen_cudnn_rnn_ops.cudnn_rnnv2(**args)
@@ -1022,6 +1030,7 @@ def cudnn_lstm(inputs,
                input_c,
                params,
                is_training,
+               sequence_lengths=None,
                input_mode=CUDNN_INPUT_LINEAR_MODE,
                direction=CUDNN_RNN_UNIDIRECTION,
                dropout=0.,
@@ -1051,12 +1060,17 @@ def cudnn_lstm(inputs,
     dropout: whether to enable dropout. With it is 0, dropout is disabled.
     seed: the op seed used for initializing dropout. See `tf.set_random_seed`
         for behavior.
+    sequence_lengths: an int32 array representing the variable sequence lengths
+      in a batch. The size of the array has to equal the batch_size. Default to
+      None, in which case sequences in the batch are assumed to have the same
+      length, which is inferred from inputs.
     name: name of the operation.
   Returns:
     outputs, output_h, output_c
   """
   return _cudnn_rnn(inputs, input_h, input_c, params, is_training, CUDNN_LSTM,
-                    input_mode, direction, dropout, seed, name)
+                    sequence_lengths, input_mode, direction, dropout, seed,
+                    name)
 
 
 def _cudnn_rnn_no_input_c(inputs,
@@ -1064,6 +1078,7 @@ def _cudnn_rnn_no_input_c(inputs,
                           params,
                           is_training,
                           rnn_mode,
+                          sequence_lengths=None,
                           input_mode=CUDNN_INPUT_LINEAR_MODE,
                           direction=CUDNN_RNN_UNIDIRECTION,
                           dropout=0.,
@@ -1079,6 +1094,10 @@ def _cudnn_rnn_no_input_c(inputs,
     params: the parameter buffer created for this model.
     is_training: whether this operation will be used in training or inference
     rnn_mode: one of ('lstm', 'gru', 'rnn_relu', 'rnn_tanh').
+    sequence_lengths: an int32 array representing the variable sequence lengths
+      in a batch. The size of the array has to equal the batch_size. Default to
+      None, in which case sequences in the batch are assumed to have the same
+      length, which is inferred from inputs.
     input_mode: indicate whether there is a linear projection between the
       input and the actual computation before the first layer. It could be
       'linear_input', 'skip_input' or 'auto_select'.
@@ -1098,8 +1117,8 @@ def _cudnn_rnn_no_input_c(inputs,
   """
   input_c = array_ops.constant([], dtype=input_h.dtype)
   outputs, output_h, _ = _cudnn_rnn(inputs, input_h, input_c, params,
-                                    is_training, rnn_mode, input_mode,
-                                    direction, dropout, seed, name)
+                                    is_training, rnn_mode, sequence_lengths,
+                                    input_mode, direction, dropout, seed, name)
   return outputs, output_h
 
 
@@ -1107,6 +1126,7 @@ def cudnn_gru(inputs,
               input_h,
               params,
               is_training,
+              sequence_lengths=None,
               input_mode=CUDNN_INPUT_LINEAR_MODE,
               direction=CUDNN_RNN_UNIDIRECTION,
               dropout=0.,
@@ -1129,6 +1149,10 @@ def cudnn_gru(inputs,
         'skip_input' is only allowed when input_size == num_units;
         'auto_select' implies 'skip_input' when input_size == num_units;
         otherwise, it implies 'linear_input'.
+    sequence_lengths: an int32 array representing the variable sequence lengths
+      in a batch. The size of the array has to equal the batch_size. Default to
+      None, in which case sequences in the batch are assumed to have the same
+      length, which is inferred from inputs.
     direction: the direction model that the model operates. Could be either
         'unidirectional' or 'bidirectional'
     dropout: whether to enable dropout. With it is 0, dropout is disabled.
@@ -1139,7 +1163,8 @@ def cudnn_gru(inputs,
     outputs, output_h
   """
   return _cudnn_rnn_no_input_c(inputs, input_h, params, is_training, CUDNN_GRU,
-                               input_mode, direction, dropout, seed, name)
+                               sequence_lengths, input_mode, direction, dropout,
+                               seed, name)
 
 
 def cudnn_rnn_relu(inputs,
@@ -1150,6 +1175,7 @@ def cudnn_rnn_relu(inputs,
                    direction=CUDNN_RNN_UNIDIRECTION,
                    dropout=0.,
                    seed=0,
+                   sequence_lengths=None,
                    name=None):
   """Cudnn RNN Relu.
 
@@ -1162,30 +1188,34 @@ def cudnn_rnn_relu(inputs,
     is_training: whether this operation will be used in training or inference
       input_mode: indicate whether there is a linear projection between the
         input and the actual computation before the first layer. It could be
-        'linear_input', 'skip_input' or 'auto_select'.
-        'linear_input' (default) always applies a linear projection of input
-        onto RNN hidden state. (standard RNN behavior).
-        'skip_input' is only allowed when input_size == num_units;
-        'auto_select' implies 'skip_input' when input_size == num_units;
-        otherwise, it implies 'linear_input'.
+        'linear_input', 'skip_input' or 'auto_select'. 'linear_input' (default)
+        always applies a linear projection of input onto RNN hidden state.
+        (standard RNN behavior). 'skip_input' is only allowed when input_size ==
+        num_units; 'auto_select' implies 'skip_input' when input_size ==
+        num_units; otherwise, it implies 'linear_input'.
     direction: the direction model that the model operates. Could be either
-        'unidirectional' or 'bidirectional'
+      'unidirectional' or 'bidirectional'
     dropout: whether to enable dropout. With it is 0, dropout is disabled.
     seed: the op seed used for initializing dropout. See `tf.set_random_seed`
-        for behavior.
+      for behavior.
+    sequence_lengths: an int32 array representing the variable sequence lengths
+      in a batch. The size of the array has to equal the batch_size. If not
+      provided, the same sequence length will be assumed.
     name: name of the operation.
+
   Returns:
     outputs, output_h
   """
   return _cudnn_rnn_no_input_c(inputs, input_h, params, is_training,
-                               CUDNN_RNN_RELU, input_mode, direction, dropout,
-                               seed, name)
+                               CUDNN_RNN_RELU, sequence_lengths, input_mode,
+                               direction, dropout, seed, name)
 
 
 def cudnn_rnn_tanh(inputs,
                    input_h,
                    params,
                    is_training,
+                   sequence_lengths=None,
                    input_mode=CUDNN_INPUT_LINEAR_MODE,
                    direction=CUDNN_RNN_UNIDIRECTION,
                    dropout=0.,
@@ -1208,6 +1238,10 @@ def cudnn_rnn_tanh(inputs,
         'skip_input' is only allowed when input_size == num_units;
         'auto_select' implies 'skip_input' when input_size == num_units;
         otherwise, it implies 'linear_input'.
+    sequence_lengths: an int32 array representing the variable sequence lengths
+      in a batch. The size of the array has to equal the batch_size. Default to
+      None, in which case sequences in the batch are assumed to have the same
+      length, which is inferred from inputs.
     direction: the direction model that the model operates. Could be either
         'unidirectional' or 'bidirectional'
     dropout: whether to enable dropout. With it is 0, dropout is disabled.
@@ -1218,8 +1252,8 @@ def cudnn_rnn_tanh(inputs,
     outputs, output_h
   """
   return _cudnn_rnn_no_input_c(inputs, input_h, params, is_training,
-                               CUDNN_RNN_TANH, input_mode, direction, dropout,
-                               seed, name)
+                               CUDNN_RNN_TANH, sequence_lengths, input_mode,
+                               direction, dropout, seed, name)
 
 
 def cudnn_rnn_opaque_params_to_canonical(rnn_mode,
@@ -1497,7 +1531,13 @@ class _CudnnRNN(object):
         input_mode=self._input_mode,
         direction=self._direction)
 
-  def __call__(self, input_data, input_h, input_c, params, is_training=True):
+  def __call__(self,
+               input_data,
+               input_h,
+               input_c,
+               params,
+               is_training=True,
+               sequence_lengths=None):
     """Runs the forward step for the RNN model.
 
     Args:
@@ -1509,6 +1549,10 @@ class _CudnnRNN(object):
         A Tensor of the same shape as input_h.
       params: the parameter buffer created for this model.
       is_training: whether this operation will be used in training or inference.
+      sequence_lengths: an int32 array representing the variable sequence
+        lengths in a batch. The size of the array has to equal the batch_size.
+        Default to None, in which case sequences in the batch are assumed to
+        have the same length, which is inferred from inputs.
     Returns:
       output: the output sequence.
       output_h: the final state for h.
@@ -1521,6 +1565,7 @@ class _CudnnRNN(object):
         params,
         is_training,
         self._rnn_mode,
+        sequence_lengths=sequence_lengths,
         input_mode=self._input_mode,
         direction=self._direction,
         dropout=self._dropout,
@@ -1615,7 +1660,13 @@ class CudnnLSTM(_CudnnRNN):
         dropout=dropout,
         seed=seed)
 
-  def __call__(self, input_data, input_h, input_c, params, is_training=True):
+  def __call__(self,
+               input_data,
+               input_h,
+               input_c,
+               params,
+               sequence_lengths=None,
+               is_training=True):
     """Runs the forward step for the Cudnn LSTM model.
 
     Args:
@@ -1626,6 +1677,10 @@ class CudnnLSTM(_CudnnRNN):
       input_c: the initial hidden state for c. A Tensor of the same shape as
         input_h.
       params: the parameter buffer created for this model.
+      sequence_lengths: an int32 array representing the variable sequence
+        lengths in a batch. The size of the array has to equal the batch_size.
+        Default to None, in which case sequences in the batch are assumed to
+        have the same length, which is inferred from inputs.
       is_training: whether this operation will be used in training or inference.
     Returns:
       output: the output sequence.
@@ -1633,7 +1688,12 @@ class CudnnLSTM(_CudnnRNN):
       output_c: the final state for c.
     """
     output, output_h, output_c = super(CudnnLSTM, self).__call__(
-        input_data, input_h, input_c, params, is_training=is_training)
+        input_data,
+        input_h,
+        input_c,
+        params,
+        sequence_lengths=sequence_lengths,
+        is_training=is_training)
     return (output, output_h, output_c)
 
 
@@ -1687,7 +1747,12 @@ class _CudnnRNNNoInputC(_CudnnRNN):
         dropout=dropout,
         seed=seed)
 
-  def __call__(self, input_data, input_h, params, is_training=True):
+  def __call__(self,
+               input_data,
+               input_h,
+               params,
+               sequence_lengths=None,
+               is_training=True):
     """Runs the forward step for the Cudnn LSTM model.
 
     Args:
@@ -1696,6 +1761,10 @@ class _CudnnRNNNoInputC(_CudnnRNN):
       input_h: the initial hidden state for h. A Tensor of shape [num_layers,
         batch_size, num_units].
       params: the parameter buffer created for this model.
+      sequence_lengths: an int32 array representing the variable sequence
+        lengths in a batch. The size of the array has to equal the batch_size.
+        Default to None, in which case sequences in the batch are assumed to
+        have the same length, which is inferred from inputs.
       is_training: whether this operation will be used in training or inference.
     Returns:
       output: the output sequence.
@@ -1707,6 +1776,7 @@ class _CudnnRNNNoInputC(_CudnnRNN):
         params,
         is_training,
         self._rnn_mode,
+        sequence_lengths=sequence_lengths,
         input_mode=self._input_mode,
         direction=self._direction,
         dropout=self._dropout,
diff --git a/tensorflow/contrib/data/python/kernel_tests/assert_element_shape_test.py b/tensorflow/contrib/data/python/kernel_tests/assert_element_shape_test.py
index 6c5f8c6b00975b3fba041271309a93cecd9f5057..4db711c1f3f2815e7b8cf275af315c062ce4c02e 100644
--- a/tensorflow/contrib/data/python/kernel_tests/assert_element_shape_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/assert_element_shape_test.py
@@ -25,11 +25,13 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import script_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_v1_only("deprecated API, no eager or V2 test coverage")
 class AssertElementShapeTest(test_base.DatasetTestBase):
 
   def test_assert_element_shape(self):
diff --git a/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py
index b9840b1ff1a3df5a05db0e64f436637220f49f80..220f9934b67d1d2a97f6c0fd4ba7779f011e1b09 100644
--- a/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py
@@ -27,12 +27,14 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 from tensorflow.python.util import compat
 
 prefix_path = "tensorflow/core/lib"
 
 
+@test_util.run_v1_only("deprecated API, no eager or V2 test coverage")
 class LMDBDatasetTest(test_base.DatasetTestBase):
 
   def setUp(self):
diff --git a/tensorflow/contrib/data/python/kernel_tests/reduce_dataset_test.py b/tensorflow/contrib/data/python/kernel_tests/reduce_dataset_test.py
index e7281d531870c75c638b5c48fa3fc6dc606a3623..78019fcc7d810da444f1407f3885d54e76a741c6 100644
--- a/tensorflow/contrib/data/python/kernel_tests/reduce_dataset_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/reduce_dataset_test.py
@@ -25,10 +25,12 @@ from tensorflow.contrib.data.python.ops import grouping
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_v1_only("deprecated API, no eager or V2 test coverage")
 class ReduceDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   @parameterized.named_parameters(
diff --git a/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py
index 2527706709fae8e459aca3489324d4db3c784be6..9275a36582a8c82b936659041129b71e100f883e 100644
--- a/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py
@@ -26,11 +26,13 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_v1_only("deprecated API, no eager or V2 test coverage")
 class SlideDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   @parameterized.named_parameters(
diff --git a/tensorflow/contrib/data/python/ops/readers.py b/tensorflow/contrib/data/python/ops/readers.py
index c0152156a1ba70297adb7054622b15ca04f859cd..c6bf5215c9406d03d2704e46903b3aa57e7e68d9 100644
--- a/tensorflow/contrib/data/python/ops/readers.py
+++ b/tensorflow/contrib/data/python/ops/readers.py
@@ -389,13 +389,11 @@ class LMDBDataset(dataset_ops.DatasetSource):
     Args:
       filenames: A `tf.string` tensor containing one or more filenames.
     """
-    super(LMDBDataset, self).__init__()
     self._filenames = ops.convert_to_tensor(
         filenames, dtype=dtypes.string, name="filenames")
-
-  def _as_variant_tensor(self):
-    return gen_experimental_dataset_ops.experimental_lmdb_dataset(
+    variant_tensor = gen_experimental_dataset_ops.experimental_lmdb_dataset(
         self._filenames, **dataset_ops.flat_structure(self))
+    super(LMDBDataset, self).__init__(variant_tensor)
 
   @property
   def _element_structure(self):
diff --git a/tensorflow/contrib/data/python/ops/sliding.py b/tensorflow/contrib/data/python/ops/sliding.py
index 5c6ee6bfdc7167d14b292f8f763adafca4e3a72c..6708e01d08135a132b797e317cd2a241c3428f40 100644
--- a/tensorflow/contrib/data/python/ops/sliding.py
+++ b/tensorflow/contrib/data/python/ops/sliding.py
@@ -30,7 +30,6 @@ class _SlideDataset(dataset_ops.UnaryDataset):
 
   def __init__(self, input_dataset, window_size, window_shift, window_stride):
     """See `sliding_window_batch` for details."""
-    super(_SlideDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
     self._window_size = ops.convert_to_tensor(
         window_size, dtype=dtypes.int64, name="window_stride")
@@ -43,14 +42,13 @@ class _SlideDataset(dataset_ops.UnaryDataset):
         input_dataset.output_types, input_dataset.output_shapes,
         input_dataset.output_classes)
     self._structure = input_structure._batch(None)  # pylint: disable=protected-access
-
-  def _as_variant_tensor(self):
-    return ged_ops.experimental_sliding_window_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+    variant_tensor = ged_ops.experimental_sliding_window_dataset(
+        self._input_dataset._variant_tensor,  # pylint: disable=protected-access
         window_size=self._window_size,
         window_shift=self._window_shift,
         window_stride=self._window_stride,
         **dataset_ops.flat_structure(self))
+    super(_SlideDataset, self).__init__(input_dataset, variant_tensor)
 
   @property
   def _element_structure(self):
diff --git a/tensorflow/contrib/distribute/README.md b/tensorflow/contrib/distribute/README.md
index 8a8dc159ade6f2a4a9b5ec29055ea4848492b29f..dbcaf8185fb7a9d2bcf22376439c0ebd49accb1a 100644
--- a/tensorflow/contrib/distribute/README.md
+++ b/tensorflow/contrib/distribute/README.md
@@ -43,28 +43,19 @@ the workers.
 
 Let's see how to scale to multiple GPUs on one machine using `MirroredStrategy` with [tf.keras] (https://www.tensorflow.org/guide/keras).
 
-Take a very simple model consisting of a single layer:
+Let's define a simple input dataset for training this model. Note that currently we require using
+[`tf.data.Dataset`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset)
+with `DistributionStrategy`.
 
 ```python
 import tensorflow as tf
 from tensorflow import keras
 
-inputs = tf.keras.layers.Input(shape=(1,))
-predictions = tf.keras.layers.Dense(1)(inputs)
-model = tf.keras.models.Model(inputs=inputs, outputs=predictions)
-```
-
-Let's also define a simple input dataset for training this model. Note that currently we require using
-[`tf.data.Dataset`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset)
-with `DistributionStrategy`.
-
-```python
 features = tf.data.Dataset.from_tensors([1.]).repeat(10000).batch(10)
 labels = tf.data.Dataset.from_tensors([1.]).repeat(10000).batch(10)
 train_dataset = tf.data.Dataset.zip((features, labels))
 ```
 
-
 To distribute this Keras model on multiple GPUs using `MirroredStrategy` we
 first instantiate a `MirroredStrategy` object.
 
@@ -72,14 +63,17 @@ first instantiate a `MirroredStrategy` object.
 distribution = tf.contrib.distribute.MirroredStrategy()
 ```
 
-We then compile the Keras model and pass the `MirroredStrategy` object in the
-`distribute` argument (apart from other usual arguments like `loss` and
-`optimizer`).
+Take a very simple model consisting of a single layer. We need to create and compile
+the model under the distribution strategy scope.
 
 ```python
-model.compile(loss='mean_squared_error',
-              optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.2),
-              distribute=distribution)
+with distribution.scope():
+  inputs = tf.keras.layers.Input(shape=(1,))
+  predictions = tf.keras.layers.Dense(1)(inputs)
+  model = tf.keras.models.Model(inputs=inputs, outputs=predictions)
+
+  model.compile(loss='mean_squared_error',
+                optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.2))
 ```
 
 To train the model we call Keras `fit` API using the input dataset that we
diff --git a/tensorflow/contrib/distribute/__init__.py b/tensorflow/contrib/distribute/__init__.py
index 8ec73654e30e4967f318c558ba94301e84a206e4..59d76f5d1c817d7f2cc8ad285b9fb517fe994a81 100644
--- a/tensorflow/contrib/distribute/__init__.py
+++ b/tensorflow/contrib/distribute/__init__.py
@@ -30,12 +30,13 @@ from tensorflow.contrib.distribute.python.monitor import Monitor
 from tensorflow.contrib.distribute.python.one_device_strategy import OneDeviceStrategy
 from tensorflow.contrib.distribute.python.parameter_server_strategy import ParameterServerStrategy
 from tensorflow.contrib.distribute.python.step_fn import *
+from tensorflow.contrib.distribute.python.tpu_strategy import initialize_tpu_system
 from tensorflow.contrib.distribute.python.tpu_strategy import TPUStrategy
 from tensorflow.python.distribute.cross_device_ops import *
 from tensorflow.python.distribute.distribute_config import DistributeConfig
 from tensorflow.python.distribute.distribute_coordinator import run_standard_tensorflow_server
-from tensorflow.python.training.distribute import *
-from tensorflow.python.training.distribution_strategy_context import *
+from tensorflow.python.distribute.distribute_lib import *
+from tensorflow.python.distribute.distribution_strategy_context import *
 
 from tensorflow.python.util.all_util import remove_undocumented
 
@@ -58,11 +59,14 @@ _allowed_symbols = [
     'StandardSingleLossStep',
     'ReplicaContext',
     'TPUStrategy',
+    'initialize_tpu_system',
     'get_cross_replica_context',
     'get_distribution_strategy',
     'get_loss_reduction',
     'get_replica_context',
+    'get_strategy',
     'has_distribution_strategy',
+    'has_strategy',
     'in_cross_replica_context',
     'require_replica_context',
     'run_standard_tensorflow_server',
diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD
index 4c9c35da5a36aa8149d15c8d1c25e4dfaa6a07c1..2ab94d00565376bfebd80ee61094831e09ed3e68 100644
--- a/tensorflow/contrib/distribute/python/BUILD
+++ b/tensorflow/contrib/distribute/python/BUILD
@@ -1,5 +1,10 @@
 # Implementation of a prototype TF distributed computation library.
 
+load("//tensorflow/compiler/tests:build_defs.bzl", "tf_xla_py_test")
+load("//tensorflow/core:platform/default/distribute.bzl", "distribute_py_test")
+load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+
 package(
     default_visibility = [
         "//tensorflow:internal",
@@ -10,11 +15,18 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load("//tensorflow:tensorflow.bzl", "py_test")
-load("//tensorflow:tensorflow.bzl", "cuda_py_test")
-
-# TODO(priyag): Figure out testonly issues that are preventing us from
-# including our tests in pip for now.
+py_library(
+    name = "distribute_test_lib_pip",
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":combinations",
+        ":keras_correctness_test_lib",
+        ":keras_test_lib",
+        ":multi_worker_test_base",
+        ":single_loss_example",
+        ":strategy_test_lib",
+    ],
+)
 
 cuda_py_test(
     name = "values_test",
@@ -22,25 +34,36 @@ cuda_py_test(
     additional_deps = [
         ":combinations",
         ":mirrored_strategy",
-        ":multi_worker_test_base",
         "@absl_py//absl/testing:parameterized",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
-        "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
-        "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/distribute:device_util",
         "//tensorflow/python/distribute:values",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:test",
         "//tensorflow/python/estimator:estimator_py",
     ],
-    tags = [
-        "no_pip",
+)
+
+cuda_py_test(
+    name = "input_lib_test",
+    srcs = ["input_lib_test.py"],
+    additional_deps = [
+        ":combinations",
+        ":mirrored_strategy",
+        ":multi_worker_test_base",
+        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:errors",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/distribute:input_lib",
+        "//tensorflow/python/distribute:values",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:test",
     ],
 )
 
@@ -50,8 +73,8 @@ py_library(
     visibility = ["//tensorflow:internal"],
     deps = [
         "//tensorflow/python/distribute:distribute_lib",
+        "//tensorflow/python/distribute:input_lib",
         "//tensorflow/python/distribute:mirrored_strategy",
-        "//tensorflow/python/distribute:values",
     ],
 )
 
@@ -60,18 +83,10 @@ py_library(
     srcs = ["parameter_server_strategy.py"],
     visibility = ["//tensorflow:internal"],
     deps = [
-        ":mirrored_strategy",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python:training",
-        "//tensorflow/python:util",
-        "//tensorflow/python/distribute:cross_device_ops",
-        "//tensorflow/python/distribute:multi_worker_util",
-        "//tensorflow/python/distribute:reduce_util",
+        "//tensorflow/python/distribute:distribute_lib",
+        "//tensorflow/python/distribute:parameter_server_strategy",
         "//tensorflow/python/distribute:values",
-        "//tensorflow/python/eager:context",
+        "//tensorflow/python/distribute/cluster_resolver:cluster_resolver_lib",
     ],
 )
 
@@ -104,7 +119,6 @@ cuda_py_test(
     ],
     tags = [
         "multi_and_single_gpu",
-        "no_pip",
     ],
 )
 
@@ -113,15 +127,17 @@ py_library(
     srcs = ["one_device_strategy.py"],
     visibility = ["//tensorflow:internal"],
     deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python/distribute:distribute_lib",
-        "//tensorflow/python/distribute:reduce_util",
-        "//tensorflow/python/distribute:values",
-        "//tensorflow/python/eager:context",
-        "@six_archive//:six",
+        "//tensorflow/python/distribute:one_device_strategy",
+    ],
+)
+
+cuda_py_test(
+    name = "one_device_strategy_test",
+    srcs = ["one_device_strategy_test.py"],
+    additional_deps = [
+        ":strategy_test_lib",
+        ":combinations",
+        "//tensorflow/python/eager:test",
     ],
 )
 
@@ -130,28 +146,16 @@ py_library(
     srcs = ["collective_all_reduce_strategy.py"],
     visibility = ["//tensorflow:internal"],
     deps = [
-        ":mirrored_strategy",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:collective_ops",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:training",
-        "//tensorflow/python/distribute:cross_device_ops",
-        "//tensorflow/python/distribute:cross_device_utils",
-        "//tensorflow/python/distribute:multi_worker_util",
-        "//tensorflow/python/distribute:values",
-        "//tensorflow/python/eager:context",
+        "//tensorflow/python/distribute:collective_all_reduce_strategy",
+        "//tensorflow/python/distribute:distribute_lib",
+        "//tensorflow/python/distribute/cluster_resolver:cluster_resolver_lib",
     ],
 )
 
 py_library(
     name = "strategy_test_lib",
-    testonly = 1,
     srcs = ["strategy_test_lib.py"],
     srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-    ],
     deps = [
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
@@ -164,20 +168,18 @@ py_library(
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:test",
+        "//third_party/py/numpy",
     ],
 )
 
 py_library(
     name = "combinations",
-    testonly = 1,
     srcs = ["combinations.py"],
     srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-    ],
     deps = [
         ":mirrored_strategy",
         ":one_device_strategy",
+        ":parameter_server_strategy",
         ":tpu_strategy",
         "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip",
         "//tensorflow/contrib/optimizer_v2:training",
@@ -186,6 +188,7 @@ py_library(
         "//tensorflow/python:util",
         "//tensorflow/python/distribute:distribute_lib",
         "//tensorflow/python/eager:context",
+        "//tensorflow/python/keras/optimizer_v2",
         "@absl_py//absl/testing:parameterized",
     ],
 )
@@ -193,30 +196,12 @@ py_library(
 py_test(
     name = "combinations_test",
     srcs = ["combinations_test.py"],
-    tags = [
-        "no_pip",
-    ],
     deps = [
         ":combinations",
         "//tensorflow/python/eager:test",
     ],
 )
 
-py_test(
-    name = "one_device_strategy_test",
-    srcs = ["one_device_strategy_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-    ],
-    deps = [
-        ":one_device_strategy",
-        ":strategy_test_lib",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python/eager:test",
-    ],
-)
-
 # TODO(priyag): Rename this test to mirrored_strategy_test
 cuda_py_test(
     name = "mirrored_strategy_multigpu_test",
@@ -242,18 +227,13 @@ cuda_py_test(
     tags = [
         "guitar",
         "multi_and_single_gpu",
-        "no_pip",
     ],
 )
 
 py_library(
     name = "multi_worker_test_base",
-    testonly = 1,
     srcs = ["multi_worker_test_base.py"],
     srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-    ],
     deps = [
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client_testlib",
@@ -288,6 +268,8 @@ py_library(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:tensor_util",
         "//tensorflow/python:util",
+        "//tensorflow/python/distribute:input_lib",
+        "//tensorflow/python/distribute:numpy_dataset",
         "//tensorflow/python/distribute:reduce_util",
         "//tensorflow/python/distribute:values",
     ],
@@ -320,14 +302,16 @@ cuda_py_test(
     ],
     tags = [
         "multi_and_single_gpu",
-        "no_pip",
     ],
 )
 
-py_library(
-    name = "minimize_loss_test_lib",
-    testonly = 1,
+distribute_py_test(
+    name = "minimize_loss_test",
     srcs = ["minimize_loss_test.py"],
+    main = "minimize_loss_test.py",
+    tags = [
+        "multi_and_single_gpu",
+    ],
     deps = [
         ":combinations",
         ":mirrored_strategy",
@@ -347,18 +331,6 @@ py_library(
     ],
 )
 
-cuda_py_test(
-    name = "minimize_loss_test",
-    srcs = ["minimize_loss_test.py"],
-    additional_deps = [
-        ":minimize_loss_test_lib",
-    ],
-    tags = [
-        "multi_and_single_gpu",
-        "no_pip",
-    ],
-)
-
 cuda_py_test(
     name = "moving_averages_test",
     srcs = ["moving_averages_test.py"],
@@ -372,9 +344,6 @@ cuda_py_test(
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
     ],
-    tags = [
-        "no_pip",
-    ],
 )
 
 cuda_py_test(
@@ -392,7 +361,6 @@ cuda_py_test(
     ],
     tags = [
         "multi_and_single_gpu",
-        "no_pip",
     ],
 )
 
@@ -415,7 +383,6 @@ cuda_py_test(
     tags = [
         "multi_and_single_gpu",
         "no_oss",  # http://b/119349471
-        "no_pip",
         "tf_integration_test",
     ],
 )
@@ -426,10 +393,10 @@ cuda_py_test(
     additional_deps = [
         ":keras_test_lib",
     ],
+    shard_count = 4,
     tags = [
         "multi_and_single_gpu",
         "no_oss",  # http://b/119349471
-        "no_pip",
         "tf_integration_test",
     ],
 )
@@ -459,7 +426,6 @@ cuda_py_test(
     shard_count = 48,
     tags = [
         "multi_and_single_gpu",
-        "no_pip",
         # TODO(b/118768923): Re-enable {a,m,t}san test.
         "noasan",
         "nomsan",
@@ -481,10 +447,13 @@ py_library(
     ],
 )
 
-py_library(
-    name = "step_fn_test_lib",
-    testonly = 1,
+distribute_py_test(
+    name = "step_fn_test",
     srcs = ["step_fn_test.py"],
+    main = "step_fn_test.py",
+    tags = [
+        "multi_and_single_gpu",
+    ],
     deps = [
         ":combinations",
         ":single_loss_example",
@@ -497,18 +466,6 @@ py_library(
     ],
 )
 
-cuda_py_test(
-    name = "step_fn_test",
-    srcs = ["step_fn_test.py"],
-    additional_deps = [
-        ":step_fn_test_lib",
-    ],
-    tags = [
-        "multi_and_single_gpu",
-        "no_pip",
-    ],
-)
-
 py_library(
     name = "monitor",
     srcs = ["monitor.py"],
@@ -525,10 +482,10 @@ cuda_py_test(
     additional_deps = [
         ":combinations",
         ":monitor",
-        ":one_device_strategy",
         ":single_loss_example",
         "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
+        "//tensorflow/python/distribute:one_device_strategy",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:test",
         "//tensorflow/python:framework_ops",
@@ -536,7 +493,6 @@ cuda_py_test(
     ],
     tags = [
         "multi_and_single_gpu",
-        "no_pip",
     ],
 )
 
@@ -553,15 +509,13 @@ cuda_py_test(
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:test",
     ],
-    tags = [
-        "no_pip",
-    ],
 )
 
 cuda_py_test(
     name = "cross_device_ops_test",
     srcs = ["cross_device_ops_test.py"],
     additional_deps = [
+        ":collective_all_reduce_strategy",
         ":combinations",
         ":multi_worker_test_base",
         ":mirrored_strategy",
@@ -577,14 +531,16 @@ cuda_py_test(
     ],
     tags = [
         "multi_and_single_gpu",
-        "no_pip",
     ],
 )
 
 py_library(
     name = "keras_test_lib",
-    testonly = 1,
-    srcs = ["keras_test.py"],
+    srcs = [
+        "keras_backward_compat_test.py",
+        "keras_test.py",
+        "keras_utils_test.py",
+    ],
     deps = [
         ":combinations",
         "//tensorflow/contrib/distribute/python:mirrored_strategy",
@@ -599,46 +555,199 @@ py_library(
     ],
 )
 
-cuda_py_test(
+distribute_py_test(
     name = "keras_test",
     srcs = ["keras_test.py"],
-    additional_deps = [
+    full_precision = True,
+    main = "keras_test.py",
+    shard_count = 32,
+    tags = [
+        "multi_and_single_gpu",
+        "no_oss",  # TODO(b/117919883): Fix python error.
+        "no_windows_gpu",
+        "notsan",
+    ],
+    deps = [
         ":keras_test_lib",
     ],
-    shard_count = 16,
+)
+
+distribute_py_test(
+    name = "keras_utils_test",
+    srcs = ["keras_utils_test.py"],
+    full_precision = True,
+    main = "keras_utils_test.py",
+    shard_count = 32,
     tags = [
         "multi_and_single_gpu",
         "no_oss",  # TODO(b/117919883): Fix python error.
-        "no_pip",
         "no_windows_gpu",
         "notsan",
     ],
+    deps = [
+        ":keras_test",
+        ":keras_test_lib",
+    ],
+)
+
+# TODO(b/121200287): Remove this in 2.0
+distribute_py_test(
+    name = "keras_backward_compat_test",
+    srcs = ["keras_backward_compat_test.py"],
+    full_precision = True,
+    main = "keras_backward_compat_test.py",
+    shard_count = 31,
+    tags = [
+        "multi_and_single_gpu",
+        "no_oss",  # TODO(b/117919883): Fix python error.
+        "no_windows_gpu",
+        "notsan",
+    ],
+    deps = [
+        ":keras_test_lib",
+    ],
 )
 
 py_library(
-    name = "metrics_v1_test_lib",
-    testonly = 1,
-    srcs = ["metrics_v1_test.py"],
+    name = "keras_correctness_test_lib",
+    srcs = [
+        "keras_correctness_test_base.py",
+        "keras_dnn_correctness_test.py",
+        "keras_embedding_model_correctness_test.py",
+        "keras_image_model_correctness_test.py",
+        "keras_lstm_model_correctness_test.py",
+        "keras_stateful_lstm_model_correctness_test.py",
+    ],
     deps = [
         ":combinations",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:metrics",
-        "//tensorflow/python:variables",
-        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/contrib/distribute/python:mirrored_strategy",
+        "//tensorflow/contrib/distribute/python:tpu_strategy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:training",
         "//tensorflow/python/eager:test",
+        "//tensorflow/python/estimator:estimator_py",
+        "//tensorflow/python/keras",
+        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
     ],
 )
 
-cuda_py_test(
-    name = "metrics_v1_test",
-    srcs = ["metrics_v1_test.py"],
-    additional_deps = [
-        ":metrics_v1_test_lib",
+distribute_py_test(
+    name = "keras_dnn_correctness_test",
+    size = "medium",
+    srcs = ["keras_dnn_correctness_test.py"],
+    full_precision = True,
+    main = "keras_dnn_correctness_test.py",
+    # Shard count is set to an odd number to distribute tasks across
+    # shards more evenly.
+    shard_count = 19,
+    tags = [
+        "multi_and_single_gpu",
+        "no_oss",  # TODO(b/117919883): Fix python error.
+        "no_windows_gpu",
+        "notsan",
+    ],
+    deps = [
+        ":keras_correctness_test_lib",
     ],
+)
+
+distribute_py_test(
+    name = "keras_image_model_correctness_test",
+    size = "medium",
+    srcs = ["keras_image_model_correctness_test.py"],
+    full_precision = True,
+    main = "keras_image_model_correctness_test.py",
+    # Shard count is set to an odd number to distribute tasks across
+    # shards more evenly.
+    shard_count = 31,
+    tags = [
+        "multi_and_single_gpu",
+        "no_oss",  # TODO(b/117919883): Fix python error.
+        "no_windows_gpu",
+        "notsan",
+    ],
+    deps = [
+        ":keras_correctness_test_lib",
+    ],
+)
+
+distribute_py_test(
+    name = "keras_embedding_model_correctness_test",
+    size = "medium",
+    srcs = ["keras_embedding_model_correctness_test.py"],
+    full_precision = True,
+    main = "keras_embedding_model_correctness_test.py",
+    # Shard count is set to an odd number to distribute tasks across
+    # shards more evenly.
+    shard_count = 31,
+    tags = [
+        "multi_and_single_gpu",
+        "no_oss",  # TODO(b/117919883): Fix python error.
+        "no_windows_gpu",
+        "notsan",
+    ],
+    deps = [
+        ":keras_correctness_test_lib",
+    ],
+)
+
+distribute_py_test(
+    name = "keras_lstm_model_correctness_test",
+    size = "medium",
+    srcs = ["keras_lstm_model_correctness_test.py"],
+    full_precision = True,
+    main = "keras_lstm_model_correctness_test.py",
+    # Shard count is set to an odd number to distribute tasks across
+    # shards more evenly.
+    shard_count = 31,
+    tags = [
+        "multi_and_single_gpu",
+        "no_oss",  # TODO(b/117919883): Fix python error.
+        "no_windows_gpu",
+        "notsan",
+    ],
+    deps = [
+        ":keras_correctness_test_lib",
+    ],
+)
+
+distribute_py_test(
+    name = "keras_stateful_lstm_model_correctness_test",
+    size = "medium",
+    srcs = ["keras_stateful_lstm_model_correctness_test.py"],
+    full_precision = True,
+    main = "keras_stateful_lstm_model_correctness_test.py",
+    # Shard count is set to an odd number to distribute tasks across
+    # shards more evenly.
+    shard_count = 31,
     tags = [
         "multi_and_single_gpu",
+        "no_oss",  # TODO(b/117919883): Fix python error.
         "no_pip",
+        "no_windows_gpu",
+        "notsan",
+    ],
+    deps = [
+        ":keras_correctness_test_lib",
+    ],
+)
+
+distribute_py_test(
+    name = "metrics_v1_test",
+    srcs = ["metrics_v1_test.py"],
+    main = "metrics_v1_test.py",
+    tags = [
+        "multi_and_single_gpu",
+    ],
+    deps = [
+        ":combinations",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:metrics",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/eager:test",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -656,7 +765,6 @@ cuda_py_test(
     ],
     tags = [
         "multi_and_single_gpu",
-        "no_pip",
     ],
 )
 
@@ -667,7 +775,6 @@ cuda_py_test(
     additional_deps = [
         ":combinations",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:checkpoint_utils_test",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
@@ -675,6 +782,25 @@ cuda_py_test(
     ],
     tags = [
         "multi_and_single_gpu",
-        "no_pip",
+    ],
+)
+
+tf_xla_py_test(
+    name = "checkpointing_test",
+    srcs = ["checkpointing_test.py"],
+    disabled_backends = [
+        # Only makes sense on TPUs
+        "cpu",
+        "gpu",
+        "cpu_ondemand",
+    ],
+    tags = [
+        "no_oss",
+    ],
+    deps = [
+        ":tpu_strategy",
+        "//tensorflow/compiler/tests:xla_test",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python/training/tracking:util",
     ],
 )
diff --git a/tensorflow/contrib/distribute/python/checkpoint_utils_test.py b/tensorflow/contrib/distribute/python/checkpoint_utils_test.py
index 31bd0e996a247a2fc01405fb3b8172a40853d698..7ee50f03155636a487020d0a9178107a06775588 100644
--- a/tensorflow/contrib/distribute/python/checkpoint_utils_test.py
+++ b/tensorflow/contrib/distribute/python/checkpoint_utils_test.py
@@ -25,6 +25,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
 from absl.testing import parameterized
 
 from tensorflow.contrib.distribute.python import combinations
@@ -33,7 +34,23 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import checkpoint_utils
-from tensorflow.python.training import checkpoint_utils_test
+from tensorflow.python.training import saver as saver_lib
+
+
+def _create_checkpoints(sess, checkpoint_dir):
+  checkpoint_prefix = os.path.join(checkpoint_dir, "model")
+  checkpoint_state_name = "checkpoint"
+  v1 = variable_scope.get_variable("var1", [1, 10])
+  v2 = variable_scope.get_variable("var2", [10, 10])
+  sess.run(variables.global_variables_initializer())
+  v1_value, v2_value = sess.run([v1, v2])
+  saver = saver_lib.Saver()
+  saver.save(
+      sess,
+      checkpoint_prefix,
+      global_step=0,
+      latest_filename=checkpoint_state_name)
+  return v1_value, v2_value
 
 
 class CheckpointUtilsWithDistributionStrategyTest(
@@ -51,8 +68,7 @@ class CheckpointUtilsWithDistributionStrategyTest(
   def testInitFromCheckpoint(self, distribution, in_replica_mode):
     checkpoint_dir = self.get_temp_dir()
     with self.cached_session() as session:
-      v1_value, v2_value, _, _ = checkpoint_utils_test._create_checkpoints(
-          session, checkpoint_dir)
+      v1_value, v2_value = _create_checkpoints(session, checkpoint_dir)
 
     def init_and_verify(g):
       v1 = variable_scope.get_variable("new_var1", [1, 10])
@@ -71,7 +87,7 @@ class CheckpointUtilsWithDistributionStrategyTest(
 
     with ops.Graph().as_default() as g, distribution.scope():
       if in_replica_mode:
-        distribution.call_for_each_replica(init_and_verify, args=[g])
+        distribution.extended.call_for_each_replica(init_and_verify, args=[g])
       else:
         init_and_verify(g)
 
diff --git a/tensorflow/contrib/distribute/python/checkpointing_test.py b/tensorflow/contrib/distribute/python/checkpointing_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..eadf7233f2ae5ee50b71836ebfcc895163124ac2
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/checkpointing_test.py
@@ -0,0 +1,95 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import os
+
+from tensorflow.compiler.tests import xla_test
+from tensorflow.contrib.distribute.python import tpu_strategy
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.keras.engine import training
+from tensorflow.python.keras.layers import core
+from tensorflow.python.platform import test
+from tensorflow.python.training import adam as adam_v1
+from tensorflow.python.training import checkpoint_management
+from tensorflow.python.training import training_util
+from tensorflow.python.training.tracking import tracking
+from tensorflow.python.training.tracking import util as trackable_utils
+
+
+class NonLayerTrackable(tracking.AutoTrackable):
+
+  def __init__(self):
+    super(NonLayerTrackable, self).__init__()
+    self.a_variable = trackable_utils.add_variable(
+        self, name="a_variable", shape=[])
+
+
+class Subclassed(training.Model):
+  """A concrete Model for testing."""
+
+  def __init__(self):
+    super(Subclassed, self).__init__()
+    self._named_dense = core.Dense(1, use_bias=True)
+    self._second = core.Dense(1, use_bias=False)
+    # We can still track Trackables which aren't Layers.
+    self._non_layer = NonLayerTrackable()
+
+  def call(self, values):
+    ret = self._second(self._named_dense(values))
+    return ret
+
+
+class TrainingCheckpointTests(xla_test.XLATestCase):
+
+  def testEagerTPUDistributionStrategy(self):
+    self.skipTest("b/121387144")
+    num_training_steps = 10
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+
+    def _train_fn(optimizer, model):
+      input_value = constant_op.constant([[3.]])
+      optimizer.minimize(
+          functools.partial(model, input_value),
+          global_step=root.optimizer_step)
+
+    for training_continuation in range(3):
+      strategy = tpu_strategy.TPUStrategy()
+      with strategy.scope():
+        model = Subclassed()
+        optimizer = adam_v1.AdamOptimizer(0.001)
+        root = trackable_utils.Checkpoint(
+            optimizer=optimizer, model=model,
+            optimizer_step=training_util.get_or_create_global_step())
+        root.restore(checkpoint_management.latest_checkpoint(
+            checkpoint_directory))
+
+        for _ in range(num_training_steps):
+          strategy.extended.call_for_each_replica(
+              functools.partial(_train_fn, optimizer, model))
+        root.save(file_prefix=checkpoint_prefix)
+        self.assertEqual((training_continuation + 1) * num_training_steps,
+                         root.optimizer_step.numpy())
+
+
+if __name__ == "__main__":
+  ops.enable_eager_execution()
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py
index 5c50a20490482856becedf7b1379d2a0583d9a11..19741627980c34d8c281f7aed6f1464d4a03393e 100644
--- a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py
+++ b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py
@@ -18,27 +18,18 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import copy
-
-from tensorflow.contrib.distribute.python import mirrored_strategy
-from tensorflow.core.protobuf import rewriter_config_pb2
-from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib
-from tensorflow.python.distribute import cross_device_utils
-from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import collective_all_reduce_strategy
 from tensorflow.python.distribute import distribute_lib
-from tensorflow.python.distribute import multi_worker_util
-from tensorflow.python.distribute import values
-from tensorflow.python.eager import context
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import collective_ops
-from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver
+from tensorflow.python.distribute.cluster_resolver import TFConfigClusterResolver
 
 
 # TODO(yuefengz): support in-graph replication.
 class CollectiveAllReduceStrategy(distribute_lib.DistributionStrategy):
   """Distribution strategy that uses collective ops for all-reduce.
 
+  *** contrib version ***
+
   It is similar to the MirroredStrategy but it uses collective ops for
   reduction.
 
@@ -61,276 +52,19 @@ class CollectiveAllReduceStrategy(distribute_lib.DistributionStrategy):
         CollectiveAllReduceExtended(self, num_gpus_per_worker))
 
 
-class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended):
+class CollectiveAllReduceExtended(
+    collective_all_reduce_strategy.CollectiveAllReduceExtended):
   """Implementation of CollectiveAllReduceStrategy."""
 
   def __init__(self, container_strategy, num_gpus_per_worker):
-    distribute_lib.DistributionStrategyExtended.__init__(
-        self, container_strategy)
-    self._cross_device_ops = None
-    self._num_gpus_per_worker = num_gpus_per_worker
-    self._initialize_local_worker(num_gpus_per_worker)
-
-  def _initialize_local_worker(self, num_gpus_per_worker):
-    """Initializes the object for local training."""
-    self._is_chief = True
-    self._num_workers = 1
-
-    if num_gpus_per_worker:
-      local_devices = tuple(
-          "/device:GPU:%d" % i for i in range(num_gpus_per_worker)
-      )
-    else:
-      local_devices = ("/device:CPU:0",)
-    self._worker_device = device_util.canonicalize("/device:CPU:0")
-
-    self._collective_keys = cross_device_utils.CollectiveKeys()
-    self._initialize_local(local_devices)
-    self._cross_tower_ops = cross_device_ops_lib.CollectiveAllReduce(
-        num_workers=self._num_workers,
-        num_gpus_per_worker=num_gpus_per_worker,
-        collective_keys=self._collective_keys)
-
-    self._cluster_spec = None
-    self._task_type = None
-    self._task_id = None
-
-    logging.info("CollectiveAllReduceStrategy with local_devices = %r",
-                 local_devices)
-
-  def _initialize_multi_worker(self, num_gpus_per_worker, cluster_spec,
-                               task_type, task_id):
-    """Initializes the object for multi-worker training."""
-    if task_type is None or task_id is None:
-      raise ValueError("When `cluster_spec` is given, you must also specify "
-                       "`task_type` and `task_id`")
-    if task_type not in ("chief", "worker"):
-      raise ValueError(
-          "Unrecognized task_type: %r, valid task types are: \"chief\", "
-          "\"worker\"." % task_type)
-    cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
-    self._num_workers = multi_worker_util.worker_count(cluster_spec, task_type)
-    if not self._num_workers:
-      raise ValueError("No `worker` or `chief` tasks can be found in "
-                       "`cluster_spec`.")
-
-    self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
-                                                task_id)
-
-    self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
-    if num_gpus_per_worker:
-      local_devices = tuple(
-          "%s/device:GPU:%d" % (self._worker_device, i)
-          for i in range(num_gpus_per_worker)
-      )
-    else:
-      local_devices = (self._worker_device,)
-
-    self._collective_keys = cross_device_utils.CollectiveKeys()
-    self._initialize_local(local_devices)
-    self._cross_tower_ops = cross_device_ops_lib.CollectiveAllReduce(
-        num_workers=self._num_workers,
-        num_gpus_per_worker=num_gpus_per_worker,
-        collective_keys=self._collective_keys)
-
-    # Add a default device so that ops without specified devices will not end up
-    # on other workers.
-    self._default_device = "/job:%s/task:%d" % (task_type, task_id)
-
-    self._cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
-    self._task_type = task_type
-    self._task_id = task_id
-
-    logging.info(
-        "Multi-worker CollectiveAllReduceStrategy with "
-        "cluster_spec = %r, task_type = %r, task_id = %r, "
-        "num_workers = %r, local_devices = %r", cluster_spec.as_dict(),
-        task_type, task_id, self._num_workers, local_devices)
-
-  def _create_variable(self, next_creator, *args, **kwargs):
-    colocate_with = kwargs.pop("colocate_with", None)
-    devices = self._get_devices_from(colocate_with)
-    group_size = len(devices) * self._num_workers
-    group_key = self._collective_keys.get_group_key(self._devices)
-
-    def _real_mirrored_creator(devices, *args, **kwargs):
-      """Creates one MirroredVariable on the current worker."""
-      index = {}
-      unique_var_name = ops.get_default_graph().unique_name(
-          kwargs["name"], mark_as_used=False).rstrip("/")
-      collective_instance_key = self._collective_keys.get_instance_key(
-          key_id=unique_var_name)
-      if "initial_value" not in kwargs:
-        raise ValueError("Initial value must be specified.")
-      initial_value = kwargs["initial_value"]
-      if callable(initial_value):
-        initial_value_fn = initial_value
-      else:
-        initial_value_fn = lambda: initial_value
-
-      for i, d in enumerate(devices):
-        with ops.device(d):
-          if i > 0:
-            # Give replicas meaningful distinct names:
-            var0name = index[devices[0]].name.split(":")[0]
-            # We append a / to variable names created on replicas with id > 0 to
-            # ensure that we ignore the name scope and instead use the given
-            # name as the absolute name of the variable.
-            kwargs["name"] = "%s/replica_%d/" % (var0name, i)
-
-          # The initial value fn makes sure variables all initialized to
-          # same values. The first device of the chief worker will send their
-          # variable values to other devices and other workers.
-          def _overridden_initial_value_fn(device=d, index=i):  # pylint: disable=g-missing-docstring
-            with ops.device(device):
-              initial_value = initial_value_fn()
-              assert not callable(initial_value)
-              initial_value = ops.convert_to_tensor(initial_value)
-
-              if self._is_chief and index == 0:
-                bcast_send = collective_ops.broadcast_send(
-                    initial_value, initial_value.shape, initial_value.dtype,
-                    group_size, group_key, collective_instance_key)
-                with ops.control_dependencies([bcast_send]):
-                  return array_ops.identity(initial_value)
-              else:
-                return collective_ops.broadcast_recv(
-                    initial_value.shape, initial_value.dtype, group_size,
-                    group_key, collective_instance_key)
-
-          kwargs["initial_value"] = _overridden_initial_value_fn
-
-          with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
-            v = next_creator(*args, **kwargs)
-
-          if i == 0:
-            actual_var_name = v.name.split(":")[0]
-            assert unique_var_name == actual_var_name, "%r vs %r" % (
-                unique_var_name, actual_var_name)
-          assert not isinstance(v, values.DistributedVariable)
-          index[d] = v
-      return index
-
-    # pylint: disable=protected-access
-    return mirrored_strategy._create_mirrored_variable(
-        devices, _real_mirrored_creator, *args, **kwargs)
-
-  def _distribute_dataset(self, dataset_fn):
-    """Distributes the dataset to each local GPU."""
-    # TODO(yuefengz): shard the dataset.
-    return values.PerReplicaDataset(
-        self._call_dataset_fn(dataset_fn), self._devices, True)
-
-  def _make_dataset_iterator(self, dataset):
-    worker_device_pairs = [(self._worker_device, self._devices)]
-    return values.DatasetIterator(dataset, worker_device_pairs,
-                                  self._num_replicas_in_sync)
-
-  def _make_input_fn_iterator(
-      self,
-      input_fn,
-      replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
-    """Distributes the dataset to each local GPU."""
-    if self._cluster_spec is None:
-      input_pipeline_id = 0
-    else:
-      input_pipeline_id = multi_worker_util.id_in_cluster(
-          self._cluster_spec, self._task_type, self._task_id)
-    input_context = distribute_lib.InputContext(
-        num_input_pipelines=self._num_workers,
-        input_pipeline_id=input_pipeline_id,
-        num_replicas_in_sync=self._num_replicas_in_sync)
-
-    return values.InputFunctionIterator(
-        input_fn, [(self._worker_device, self._devices)], [input_context])
-
-  def _configure(self,
-                 session_config=None,
-                 cluster_spec=None,
-                 task_type=None,
-                 task_id=None):
-    """Configures the object.
-
-    Args:
-      session_config: a `tf.ConfigProto`
-      cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
-        cluster configurations.
-      task_type: the current task type, such as "worker".
-      task_id: the current task id.
-
-    Raises:
-      ValueError: if `task_type` is not in the `cluster_spec`.
-    """
-    if not self._cluster_spec and cluster_spec:
-      # If a `cluster_spec` is already passed in, do nothing here.
-      # TODO(yuefengz): check `cluster_spec` is the same if this object has
-      # already been initialized with a `cluster_spec`.
-      self._initialize_multi_worker(self._num_gpus_per_worker, cluster_spec,
-                                    task_type, task_id)
-
-    if session_config:
-      session_config.CopyFrom(self._update_config_proto(session_config))
-
-  def _update_config_proto(self, config_proto):
-    updated_config = copy.deepcopy(config_proto)
-    # Enable the scoped allocator optimization for CollectiveOps.  This
-    # optimization converts many small all-reduces into fewer larger
-    # all-reduces.
-    rewrite_options = updated_config.graph_options.rewrite_options
-    rewrite_options.scoped_allocator_optimization = (
-        rewriter_config_pb2.RewriterConfig.ON)
-    # We turn on ScopedAllocator only for CollectiveReduce op, i.e. enable_op =
-    # ["CollectiveReduce"].  Since we can't assign to a repeated proto field, we
-    # clear and then append.
-    del rewrite_options.scoped_allocator_opts.enable_op[:]
-    rewrite_options.scoped_allocator_opts.enable_op.append("CollectiveReduce")
-
-    if not self._cluster_spec:
-      return updated_config
-
-    assert self._task_type
-    assert self._task_id is not None
-
-    # Collective group leader is needed for collective ops to coordinate
-    # workers.
-    if "chief" in self._cluster_spec.jobs:
-      updated_config.experimental.collective_group_leader = (
-          "/job:chief/replica:0/task:0")
-    else:
-      if "worker" not in self._cluster_spec.jobs:
-        raise ValueError(
-            "You must have `chief` or `worker` jobs in the `cluster_spec`.")
-      updated_config.experimental.collective_group_leader = (
-          "/job:worker/replica:0/task:0")
-
-    # The device filters prevent communication between workers.
-    del updated_config.device_filters[:]
-    updated_config.device_filters.append(
-        "/job:%s/task:%d" % (self._task_type, self._task_id))
-
-    return updated_config
-
-  @property
-  def experimental_between_graph(self):
-    return True
-
-  @property
-  def experimental_should_init(self):
-    return True
-
-  @property
-  def should_checkpoint(self):
-    return self._is_chief
-
-  @property
-  def should_save_summary(self):
-    return self._is_chief
-
-  @property
-  def _num_replicas_in_sync(self):
-    return len(self._devices) * self._num_workers
-
-  # TODO(priyag): Delete this once all strategies use global batch size.
-  @property
-  def _global_batch_size(self):
-    return False
+    # Use TFConfigClusterResolver to parse TF_CONFIG. We don't want to change
+    # the constructor's interface to allow customized cluster resolver. Use
+    # SimpleClusterResolver to override num_accelerators.
+    tfconfig = TFConfigClusterResolver()
+    cluster_resolver = SimpleClusterResolver(
+        cluster_spec=tfconfig.cluster_spec(),
+        task_type=tfconfig.task_type,
+        task_id=tfconfig.task_id,
+        num_accelerators=num_gpus_per_worker)
+    super(CollectiveAllReduceExtended, self).__init__(
+        container_strategy, cluster_resolver=cluster_resolver)
diff --git a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py
index 8a9e583f0afaac37a2057bae9b1ed79de43d68bc..ee7640dd1cea15e62ae9912ebedbd853778364a6 100644
--- a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py
+++ b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py
@@ -29,9 +29,13 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python import keras
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import collective_all_reduce_strategy as core_collective_all_reduce_strategy
 from tensorflow.python.distribute import cross_device_utils
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import multi_worker_util
 from tensorflow.python.distribute import reduce_util
 from tensorflow.python.distribute import values
+from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -49,6 +53,55 @@ from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import test
 from tensorflow.python.training import adam
 from tensorflow.python.training import training_util
+from tensorflow.python.training.server_lib import ClusterSpec
+
+
+class MockCollectiveAllReduceStrategy(distribute_lib.DistributionStrategy):
+  """Mock the strategy to allow cluster resolver as an argument."""
+
+  def __init__(self, cluster_resolver):
+    super(MockCollectiveAllReduceStrategy, self).__init__(
+        core_collective_all_reduce_strategy.CollectiveAllReduceExtended(
+            self, cluster_resolver=cluster_resolver))
+
+
+def create_test_objects(cluster_spec=None,
+                        task_type=None,
+                        task_id=None,
+                        num_gpus=None,
+                        use_core_strategy=False):
+  sess_config = config_pb2.ConfigProto()
+  if num_gpus is None:
+    num_gpus = context.num_gpus()
+  if use_core_strategy:
+    if cluster_spec and task_type and task_id is not None:
+      cluster_resolver = SimpleClusterResolver(
+          cluster_spec=multi_worker_util.normalize_cluster_spec(cluster_spec),
+          task_type=task_type,
+          task_id=task_id,
+          num_accelerators=num_gpus)
+      target = 'grpc://' + cluster_spec[task_type][task_id]
+    else:
+      cluster_resolver = SimpleClusterResolver(
+          ClusterSpec({}), num_accelerators=num_gpus)
+      target = ''
+
+    strategy = MockCollectiveAllReduceStrategy(cluster_resolver)
+    sess_config = strategy.update_config_proto(sess_config)
+  else:
+    strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
+        num_gpus_per_worker=num_gpus)
+    if task_type and task_id is not None:
+      strategy.configure(
+          session_config=sess_config,
+          cluster_spec=cluster_spec,
+          task_type=task_type,
+          task_id=task_id)
+      target = 'grpc://' + cluster_spec[task_type][task_id]
+    else:
+      target = ''
+
+  return strategy, target, sess_config
 
 
 class CollectiveAllReduceStrategyTestBase(
@@ -64,16 +117,18 @@ class CollectiveAllReduceStrategyTestBase(
     CollectiveAllReduceStrategyTestBase.collective_key_base += 100000
     super(CollectiveAllReduceStrategyTestBase, self).setUp()
 
-  def _get_test_object(self, task_type, task_id, num_gpus=0):
-    distribution = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
-        num_gpus_per_worker=num_gpus)
-    session_config = config_pb2.ConfigProto()
-    if task_type and task_id is not None:
-      distribution.configure(
-          session_config=session_config,
-          cluster_spec=self._cluster_spec,
-          task_type=task_type,
-          task_id=task_id)
+  def _get_test_object(self,
+                       task_type,
+                       task_id,
+                       num_gpus=0,
+                       use_core_strategy=False):
+    strategy, target, session_config = create_test_objects(
+        cluster_spec=self._cluster_spec,
+        task_type=task_type,
+        task_id=task_id,
+        num_gpus=num_gpus,
+        use_core_strategy=use_core_strategy)
+
     collective_keys = cross_device_utils.CollectiveKeys(
         group_key_start=10 * num_gpus +
         CollectiveAllReduceStrategyTestBase.collective_key_base,
@@ -81,16 +136,16 @@ class CollectiveAllReduceStrategyTestBase(
         CollectiveAllReduceStrategyTestBase.collective_key_base,
         instance_key_with_id_start=num_gpus * 10000 +
         CollectiveAllReduceStrategyTestBase.collective_key_base)
-    distribution.extended._collective_keys = collective_keys
-    distribution.extended._inferred_cross_device_ops._collective_keys = (
-        collective_keys)
-    if task_type and task_id is not None:
-      return distribution, 'grpc://' + self._cluster_spec[task_type][
-          task_id], session_config
-    else:
-      return distribution, '', session_config
+    strategy.extended._collective_keys = collective_keys
+    strategy.extended._cross_device_ops._collective_keys = (collective_keys)
 
-  def _test_minimize_loss_graph(self, task_type, task_id, num_gpus):
+    return strategy, target, session_config
+
+  def _test_minimize_loss_graph(self,
+                                task_type,
+                                task_id,
+                                num_gpus,
+                                use_core_strategy=False):
     d, master_target, config = self._get_test_object(task_type, task_id,
                                                      num_gpus)
     with ops.Graph().as_default(), \
@@ -123,20 +178,20 @@ class CollectiveAllReduceStrategyTestBase(
       def step():
         """Perform one optimization step."""
         # Run forward & backward to get gradients, variables list.
-        g_v = d.call_for_each_replica(grad_fn, args=[one])
+        g_v = d.extended.call_for_each_replica(grad_fn, args=[one])
         # Update the variables using the gradients and the update() function.
         before_list = []
         after_list = []
         for g, v in g_v:
-          fetched = d.read_var(v)
+          fetched = d.extended.read_var(v)
           before_list.append(fetched)
           with ops.control_dependencies([fetched]):
             # TODO(yuefengz): support non-Mirrored variable as destinations.
             g = d.extended.reduce_to(
                 reduce_util.ReduceOp.SUM, g, destinations=v)
             with ops.control_dependencies(
-                d.update(v, update, g, grouped=False)):
-              after_list.append(d.read_var(v))
+                d.extended.update(v, update, args=(g,), group=False)):
+              after_list.append(d.extended.read_var(v))
         return before_list, after_list
 
       before_out, after_out = step()
@@ -158,7 +213,11 @@ class CollectiveAllReduceStrategyTestBase(
       self.assertLess(error_after, error_before)
       return error_after < error_before
 
-  def _test_complex_model(self, task_type, task_id, num_gpus):
+  def _test_complex_model(self,
+                          task_type,
+                          task_id,
+                          num_gpus,
+                          use_core_strategy=False):
     d, master_target, config = self._get_test_object(task_type, task_id,
                                                      num_gpus)
 
@@ -192,6 +251,7 @@ class CollectiveAllReduceStrategyTestBase(
       image = random_ops.random_uniform([2, 28, 28])
       label = random_ops.random_uniform([2, 1], maxval=10, dtype=dtypes.int32)
       logits = model(image, training=True)
+      # TODO(yuefengz): make loss a callable for eager mode.
       loss = losses.sparse_softmax_cross_entropy(labels=label, logits=logits)
       optimizer = adam.AdamOptimizer(learning_rate=1e-4)
       train_op = optimizer.minimize(loss,
@@ -202,14 +262,18 @@ class CollectiveAllReduceStrategyTestBase(
          self.cached_session(config=config,
                              target=master_target) as sess:
       with d.scope():
-        train_op = d.call_for_each_replica(model_fn)
+        train_op = d.extended.call_for_each_replica(model_fn)
         train_op = d.group(d.unwrap(train_op))
 
       sess.run(variables.global_variables_initializer())
       sess.run(train_op)
       return True
 
-  def _test_variable_initialization(self, task_type, task_id, num_gpus):
+  def _test_variable_initialization(self,
+                                    task_type,
+                                    task_id,
+                                    num_gpus,
+                                    use_core_strategy=False):
     distribution, master_target, config = self._get_test_object(
         task_type, task_id, num_gpus)
     with ops.Graph().as_default(), \
@@ -225,7 +289,7 @@ class CollectiveAllReduceStrategyTestBase(
                 1.0, 10.0, dtype=dtypes.float32))
         return array_ops.identity(x)
 
-      x = distribution.call_for_each_replica(model_fn)
+      x = distribution.extended.call_for_each_replica(model_fn)
       reduced_x = distribution.reduce(reduce_util.ReduceOp.MEAN, x)
       x = distribution.unwrap(x)[0]
 
@@ -238,8 +302,14 @@ class CollectiveAllReduceStrategyTestBase(
                                                        reduced_x_value)))
     return np.allclose(x_value, reduced_x_value, atol=1e-5)
 
-  def _test_input_fn_iterator(self, task_type, task_id, num_gpus, input_fn,
-                              expected_values):
+  def _test_input_fn_iterator(self,
+                              task_type,
+                              task_id,
+                              num_gpus,
+                              input_fn,
+                              expected_values,
+                              test_reinitialize=True,
+                              use_core_strategy=False):
     distribution, master_target, config = self._get_test_object(
         task_type, task_id, num_gpus)
     devices = distribution.extended.worker_devices
@@ -252,22 +322,24 @@ class CollectiveAllReduceStrategyTestBase(
 
       for expected_value in expected_values:
         next_element = iterator.get_next()
-        computed_value = sess.run(
-            [values.select_device(d, next_element) for d in devices])
+        computed_value = sess.run([values.select_replica(r, next_element)
+                                   for r in range(len(devices))])
         self.assertEqual(expected_value, computed_value)
 
       with self.assertRaises(errors.OutOfRangeError):
         next_element = iterator.get_next()
-        sess.run([values.select_device(d, next_element) for d in devices])
+        sess.run([values.select_replica(r, next_element)
+                  for r in range(len(devices))])
 
       # After re-initializing the iterator, should be able to iterate again.
-      sess.run(iterator.initialize())
+      if test_reinitialize:
+        sess.run(iterator.initialize())
 
-      for expected_value in expected_values:
-        next_element = iterator.get_next()
-        computed_value = sess.run(
-            [values.select_device(d, next_element) for d in devices])
-        self.assertEqual(expected_value, computed_value)
+        for expected_value in expected_values:
+          next_element = iterator.get_next()
+          computed_value = sess.run([values.select_replica(r, next_element)
+                                     for r in range(len(devices))])
+          self.assertEqual(expected_value, computed_value)
 
 
 class DistributedCollectiveAllReduceStrategyTest(
@@ -281,71 +353,116 @@ class DistributedCollectiveAllReduceStrategyTest(
     cls._cluster_spec = multi_worker_test_base.create_in_process_cluster(
         num_workers=3, num_ps=0)
 
-  def test_num_replicas_in_sync(self):
-    distribution = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
-        num_gpus_per_worker=2)
-    distribution.configure(cluster_spec=self._cluster_spec, task_type='worker',
-                           task_id=0)
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def test_num_replicas_in_sync(self, use_core_strategy):
+    distribution, _, _ = create_test_objects(
+        cluster_spec=self._cluster_spec,
+        task_type='worker',
+        task_id=0,
+        num_gpus=2,
+        use_core_strategy=use_core_strategy)
     num_workers = len(self._cluster_spec.get('chief', []) +
                       self._cluster_spec.get('worker', []))
     self.assertEqual(2 * num_workers,
                      distribution.num_replicas_in_sync)
 
   @combinations.generate(
-      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1))
-  def testMinimizeLossGraph(self, num_gpus):
-    self._run_between_graph_clients(self._test_minimize_loss_graph,
-                                    self._cluster_spec, num_gpus)
+      combinations.combine(
+          mode=['graph'],
+          num_gpus=[0, 1, 2],
+          required_gpus=1,
+          use_core_strategy=[True, False]))
+  def testMinimizeLossGraph(self, num_gpus, use_core_strategy):
+    self._run_between_graph_clients(
+        self._test_minimize_loss_graph,
+        self._cluster_spec,
+        num_gpus,
+        use_core_strategy=use_core_strategy)
 
   @combinations.generate(
-      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1))
-  def testVariableInitialization(self, num_gpus):
+      combinations.combine(
+          mode=['graph'],
+          num_gpus=[0, 1, 2],
+          required_gpus=1,
+          use_core_strategy=[True, False]))
+  def testVariableInitialization(self, num_gpus, use_core_strategy):
     if context.num_gpus() < num_gpus:
       self.skipTest('Not enough GPUs')
     self._run_between_graph_clients(
         self._test_variable_initialization,
         self._cluster_spec,
-        num_gpus=num_gpus)
+        num_gpus=num_gpus,
+        use_core_strategy=use_core_strategy)
 
   @combinations.generate(
-      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1))
-  def testComplexModel(self, num_gpus):
+      combinations.combine(
+          mode=['graph'],
+          num_gpus=[0, 1, 2],
+          required_gpus=1,
+          use_core_strategy=[True, False]))
+  def testComplexModel(self, num_gpus, use_core_strategy):
     if context.num_gpus() < num_gpus:
       self.skipTest('Not enough GPUs')
     self._run_between_graph_clients(
-        self._test_complex_model, self._cluster_spec, num_gpus=num_gpus)
+        self._test_complex_model,
+        self._cluster_spec,
+        num_gpus=num_gpus,
+        use_core_strategy=use_core_strategy)
 
+  # TODO(b/124344198): Re-enable after fixing this flaky test.
   # TODO(yuefengz): Update how we use num_gpus and required_gpus
   @combinations.generate(
-      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1))
-  def testMakeInputFnIterator(self, num_gpus):
+      combinations.combine(
+          mode=['graph'],
+          num_gpus=[0, 1, 2],
+          required_gpus=1,
+          use_dataset=[True, False],
+          use_core_strategy=[True, False]))
+  def DISABLED_testMakeInputFnIterator(self, num_gpus, use_dataset,
+                                       use_core_strategy):
     if context.num_gpus() < num_gpus:
       self.skipTest('Not enough GPUs')
-    dataset_fn = lambda: dataset_ops.Dataset.range(100)
+    if use_dataset:
+      fn = lambda: dataset_ops.Dataset.range(100)
+    else:
+      def fn():
+        dataset = dataset_ops.Dataset.range(100)
+        it = dataset.make_one_shot_iterator()
+        return it.get_next
     # We use CPU as the device when num_gpus = 0
     devices_per_worker = max(1, num_gpus)
     expected_values = [[i+j for j in range(devices_per_worker)]
                        for i in range(0, 100, devices_per_worker)]
 
     input_fn = self._input_fn_to_test_input_context(
-        dataset_fn,
+        fn,
         expected_num_replicas_in_sync=3*devices_per_worker,
         expected_num_input_pipelines=3,
         expected_input_pipeline_id=1)  # because task_id = 1
-    self._test_input_fn_iterator('worker', 1, num_gpus,
-                                 input_fn, expected_values)
+    self._test_input_fn_iterator(
+        'worker',
+        1,
+        num_gpus,
+        input_fn,
+        expected_values,
+        test_reinitialize=use_dataset,
+        use_core_strategy=use_core_strategy)
 
-  def testUpdateConfigProto(self):
-    distribution = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
-        num_gpus_per_worker=2)
-    distribution.configure(
-        cluster_spec=self._cluster_spec, task_type='worker', task_id=1)
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testUpdateConfigProto(self, use_core_strategy):
+    strategy, _, _ = self._get_test_object(
+        task_type='worker',
+        task_id=1,
+        num_gpus=2,
+        use_core_strategy=use_core_strategy)
 
     config_proto = config_pb2.ConfigProto(device_filters=['to_be_overridden'])
     rewrite_options = config_proto.graph_options.rewrite_options
     rewrite_options.scoped_allocator_opts.enable_op.append('to_be_removed')
 
-    new_config = distribution.update_config_proto(config_proto)
+    new_config = strategy.update_config_proto(config_proto)
 
     # Verify group leader
     self.assertEqual('/job:worker/replica:0/task:0',
@@ -396,36 +513,136 @@ class DistributedCollectiveAllReduceStrategyTestWithChief(
         self._test_complex_model, self._cluster_spec, num_gpus=num_gpus)
 
 
-class LocalCollectiveAllReduceStrategy(CollectiveAllReduceStrategyTestBase,
-                                       strategy_test_lib.DistributionTestBase,
-                                       parameterized.TestCase):
+class LocalCollectiveAllReduceStrategy(
+    CollectiveAllReduceStrategyTestBase,
+    strategy_test_lib.DistributionTestBase,
+    strategy_test_lib.TwoDeviceDistributionTestBase,
+    parameterized.TestCase):
 
-  def testMinimizeLossGraph(self, num_gpus=2):
+  @combinations.generate(
+      combinations.combine(
+          mode=['graph', 'eager'],
+          num_gpus=[2, 4],
+          required_gpus=2,
+          use_core_strategy=[True, False]))
+  def testMinimizeLoss(self, num_gpus, use_core_strategy):
     # Collective ops doesn't support strategy with one device.
     if context.num_gpus() < num_gpus:
       self.skipTest('Not enough GPUs')
-    self._test_minimize_loss_graph(None, None, num_gpus)
+    if context.executing_eagerly():
+      strategy, _, _ = self._get_test_object(
+          None, None, num_gpus, use_core_strategy=use_core_strategy)
+      self._test_minimize_loss_eager(strategy)
+    else:
+      self._test_minimize_loss_graph(
+          None, None, num_gpus, use_core_strategy=use_core_strategy)
 
-  def testComplexModel(self, num_gpus=2):
-    # Collective ops doesn't support strategy with one device.
+  @combinations.generate(
+      combinations.combine(
+          mode=['graph'],
+          num_gpus=[2, 4],
+          required_gpus=2,
+          use_core_strategy=[True, False]))
+  def testComplexModel(self, num_gpus, use_core_strategy):
     if context.num_gpus() < num_gpus:
       self.skipTest('Not enough GPUs')
-    self._test_complex_model(None, None, num_gpus)
+    self._test_complex_model(
+        None, None, num_gpus, use_core_strategy=use_core_strategy)
 
-  def testMakeInputFnIterator(self, num_gpus=2):
-    # Collective ops doesn't support strategy with one device.
-    if context.num_gpus() < num_gpus:
-      self.skipTest('Not enough GPUs')
-    dataset_fn = lambda: dataset_ops.Dataset.range(10)
-    expected_values = [[i, i+1] for i in range(0, 10, 2)]
+  @combinations.generate(
+      combinations.combine(
+          mode=['graph', 'eager'],
+          required_gpus=2,
+          use_dataset=[True, False],
+          use_core_strategy=[True, False]))
+  def DISABLED_testMakeInputFnIterator(self, use_dataset, use_core_strategy):
+    num_gpus = 2
+    if use_dataset:
+      fn = lambda: dataset_ops.Dataset.range(5 * num_gpus)
+    else:
+      def fn():
+        dataset = dataset_ops.Dataset.range(5 * num_gpus)
+        it = dataset.make_one_shot_iterator()
+        return it.get_next
+    expected_values = [range(i, i + num_gpus) for i in range(0, 10, num_gpus)]
 
     input_fn = self._input_fn_to_test_input_context(
-        dataset_fn,
+        fn,
         expected_num_replicas_in_sync=num_gpus,
         expected_num_input_pipelines=1,
         expected_input_pipeline_id=0)
-    self._test_input_fn_iterator(None, None, num_gpus,
-                                 input_fn, expected_values)
+    self._test_input_fn_iterator(
+        None,
+        None,
+        num_gpus,
+        input_fn,
+        expected_values,
+        test_reinitialize=use_dataset,
+        use_core_strategy=use_core_strategy)
+
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testAllReduceSum(self, use_core_strategy):
+    if context.num_gpus() < 2: self.skipTest('Not enough GPUs')
+    distribution, target, config = self._get_test_object(
+        None, None, num_gpus=2, use_core_strategy=use_core_strategy)
+    with self.cached_session(config=config, target=target):
+      self._test_all_reduce_sum(distribution)
+
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testAllReduceSumGradients(self, use_core_strategy):
+    if context.num_gpus() < 2: self.skipTest('Not enough GPUs')
+    distribution, target, config = self._get_test_object(
+        None, None, num_gpus=2, use_core_strategy=use_core_strategy)
+    with self.cached_session(config=config, target=target):
+      self._test_all_reduce_sum_gradients(distribution)
+
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testAllReduceSumGradientTape(self, use_core_strategy):
+    if context.num_gpus() < 2: self.skipTest('Not enough GPUs')
+    distribution, target, config = self._get_test_object(
+        None, None, num_gpus=2, use_core_strategy=use_core_strategy)
+    with self.cached_session(config=config, target=target):
+      self._test_all_reduce_sum_gradient_tape(distribution)
+
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testAllReduceMean(self, use_core_strategy):
+    if context.num_gpus() < 2: self.skipTest('Not enough GPUs')
+    distribution, target, config = self._get_test_object(
+        None, None, num_gpus=2, use_core_strategy=use_core_strategy)
+    with self.cached_session(config=config, target=target):
+      self._test_all_reduce_mean(distribution)
+
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testAllReduceMeanGradients(self, use_core_strategy):
+    if context.num_gpus() < 2: self.skipTest('Not enough GPUs')
+    distribution, target, config = self._get_test_object(
+        None, None, num_gpus=2, use_core_strategy=use_core_strategy)
+    with self.cached_session(config=config, target=target):
+      self._test_all_reduce_mean_gradients(distribution)
+
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testAllReduceMeanGradientTape(self, use_core_strategy):
+    if context.num_gpus() < 2: self.skipTest('Not enough GPUs')
+    distribution, target, config = self._get_test_object(
+        None, None, num_gpus=2, use_core_strategy=use_core_strategy)
+    with self.cached_session(config=config, target=target):
+      self._test_all_reduce_mean_gradient_tape(distribution)
+
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testNumpyIterator(self, use_core_strategy):
+    num_gpus = 2
+    if context.num_gpus() < num_gpus:
+      self.skipTest('Not enough GPUs')
+    strategy, _, _ = self._get_test_object(
+        None, None, num_gpus=num_gpus, use_core_strategy=use_core_strategy)
+    self._test_numpy_iterator(strategy)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/distribute/python/combinations.py b/tensorflow/contrib/distribute/python/combinations.py
index 365ce5cdec79f1914f0c9ccdf59a7dc59e6f819e..7c0f8033fbc046580bc46f90ee9945ffa2a718f9 100644
--- a/tensorflow/contrib/distribute/python/combinations.py
+++ b/tensorflow/contrib/distribute/python/combinations.py
@@ -46,16 +46,22 @@ import unittest
 from absl.testing import parameterized
 import six
 
-from tensorflow.contrib.cluster_resolver import TPUClusterResolver
+from tensorflow.contrib import cluster_resolver
 from tensorflow.contrib.distribute.python import mirrored_strategy as mirrored_lib
 from tensorflow.contrib.distribute.python import one_device_strategy as one_device_lib
+from tensorflow.contrib.distribute.python import parameter_server_strategy
 from tensorflow.contrib.distribute.python import tpu_strategy as tpu_lib
 from tensorflow.contrib.optimizer_v2 import adagrad as adagrad_v2
 from tensorflow.contrib.optimizer_v2 import adam as adam_v2
 from tensorflow.contrib.optimizer_v2 import gradient_descent as gradient_descent_v2
+from tensorflow.contrib.tpu.python.tpu import device_assignment as device_assignment_lib
 from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
+from tensorflow.python.keras.optimizer_v2 import adagrad as adagrad_keras_v2
+from tensorflow.python.keras.optimizer_v2 import adam as adam_keras_v2
+from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_descent_keras_v2
+from tensorflow.python.keras.optimizer_v2 import rmsprop as rmsprop_keras_v2
 from tensorflow.python.training import adagrad
 from tensorflow.python.training import adam
 from tensorflow.python.training import gradient_descent
@@ -226,7 +232,7 @@ def combine(**kwargs):
   if not kwargs:
     return [OrderedDict()]
 
-  sort_by_key = lambda k: k[0][0]
+  sort_by_key = lambda k: k[0]
   kwargs = OrderedDict(sorted(kwargs.items(), key=sort_by_key))
   first = list(kwargs.items())[0]
 
@@ -321,22 +327,49 @@ class NamedDistribution(object):
     return self._required_tpu
 
 
+def _get_tpu_strategy_creator(steps_per_run, use_single_core=False, **kwargs):
+  def _create_tpu_strategy():
+    resolver = cluster_resolver.TPUClusterResolver("")
+    topology = tpu_lib.initialize_tpu_system(resolver)
+    device_assignment = None
+    if use_single_core:
+      device_assignment = device_assignment_lib.DeviceAssignment(
+          topology, core_assignment=device_assignment_lib.
+          SINGLE_CORE_ASSIGNMENT)
+
+    strategy = tpu_lib.TPUStrategy(resolver, steps_per_run=steps_per_run,
+                                   device_assignment=device_assignment,
+                                   **kwargs)
+    return strategy
+  return _create_tpu_strategy
+
+
 # pylint: disable=g-long-lambda
 default_strategy = NamedDistribution(
     "Default",
-    distribution_strategy_context._get_default_distribution_strategy,  # pylint: disable=protected-access
+    distribution_strategy_context._get_default_strategy,  # pylint: disable=protected-access
     required_gpus=None)
 one_device_strategy = NamedDistribution(
     "OneDeviceCPU", lambda: one_device_lib.OneDeviceStrategy("/cpu:0"),
     required_gpus=None)
+one_device_strategy_gpu = NamedDistribution(
+    "OneDeviceGPU", lambda: one_device_lib.OneDeviceStrategy("/gpu:0"),
+    required_gpus=1)
 tpu_strategy = NamedDistribution(
-    "TPU", lambda: tpu_lib.TPUStrategy(
-        TPUClusterResolver(""), steps_per_run=2),
+    "TPU", _get_tpu_strategy_creator(steps_per_run=2),
     required_tpu=True)
 tpu_strategy_one_step = NamedDistribution(
-    "TPUOneStep", lambda: tpu_lib.TPUStrategy(
-        TPUClusterResolver(""), steps_per_run=1),
+    "TPUOneStep", _get_tpu_strategy_creator(steps_per_run=1),
+    required_tpu=True)
+tpu_strategy_one_core = NamedDistribution(
+    "TPUOneCore", _get_tpu_strategy_creator(
+        steps_per_run=2, use_single_core=True),
     required_tpu=True)
+tpu_strategy_one_step_one_core = NamedDistribution(
+    "TPUOneStepOneCore", _get_tpu_strategy_creator(
+        steps_per_run=1, use_single_core=True),
+    required_tpu=True)
+
 mirrored_strategy_with_one_cpu = NamedDistribution(
     "Mirrored1CPU",
     lambda: mirrored_lib.MirroredStrategy(["/cpu:0"]))
@@ -367,6 +400,11 @@ core_mirrored_strategy_with_two_gpus = NamedDistribution(
     "CoreMirrored2GPUs",
     lambda: mirrored_lib.CoreMirroredStrategy(["/gpu:0", "/gpu:1"]),
     required_gpus=2)
+parameter_server_strategy_with_two_gpus = NamedDistribution(
+    "ParameterServer2GPUs",
+    lambda: parameter_server_strategy.ParameterServerStrategy(
+        num_gpus_per_worker=2),
+    required_gpus=2)
 
 
 gradient_descent_optimizer_v1_fn = NamedObject(
@@ -386,10 +424,20 @@ gradient_descent_optimizer_v2_fn = NamedObject(
 adagrad_optimizer_v2_fn = NamedObject(
     "AdagradV2", lambda: adagrad_v2.AdagradOptimizer(0.001))
 adam_optimizer_v2_fn = NamedObject(
-    "AdamV2", lambda: adam_v2.AdamOptimizer(0.001, epsilon=1))
+    "AdamV2", lambda: adam_v2.AdamOptimizer(0.001, epsilon=1.0))
 
 optimizers_v2 = [gradient_descent_optimizer_v2_fn, adagrad_optimizer_v2_fn]
 
+gradient_descent_optimizer_keras_v2_fn = NamedObject(
+    "GradientDescentKerasV2",
+    lambda: gradient_descent_keras_v2.SGD(0.2))
+adagrad_optimizer_keras_v2_fn = NamedObject(
+    "AdagradKerasV2", lambda: adagrad_keras_v2.Adagrad(0.001))
+adam_optimizer_keras_v2_fn = NamedObject(
+    "AdamKerasV2", lambda: adam_keras_v2.Adam(0.001, epsilon=1.0))
+rmsprop_optimizer_keras_v2_fn = NamedObject(
+    "RmsPropKerasV2", lambda: rmsprop_keras_v2.RMSprop(0.001))
+
 graph_and_eager_modes = ["graph", "eager"]
 
 
diff --git a/tensorflow/contrib/distribute/python/combinations_test.py b/tensorflow/contrib/distribute/python/combinations_test.py
index 86aa48cea889c6c2ce169b18bcabb6d08890fbed..9f3deadbec98c4f66061ca29b4d29a74b8de40b1 100644
--- a/tensorflow/contrib/distribute/python/combinations_test.py
+++ b/tensorflow/contrib/distribute/python/combinations_test.py
@@ -42,6 +42,14 @@ class TestingCombinationsTest(test.TestCase):
         "b": 3
     }], combinations.combine(a=[1, 2], b=[2, 3]))
 
+  def test_arguments_sorted(self):
+    self.assertEqual([
+        OrderedDict([("aa", 1), ("ab", 2)]),
+        OrderedDict([("aa", 1), ("ab", 3)]),
+        OrderedDict([("aa", 2), ("ab", 2)]),
+        OrderedDict([("aa", 2), ("ab", 3)])
+    ], combinations.combine(ab=[2, 3], aa=[1, 2]))
+
   def test_combine_single_parameter(self):
     self.assertEqual([{
         "a": 1,
diff --git a/tensorflow/contrib/distribute/python/cross_device_ops_test.py b/tensorflow/contrib/distribute/python/cross_device_ops_test.py
index d6e9521c1c1115ffdbdcf375ad4017bacb962832..2b8e0197961ae37b67dc8958054a03e164242dcd 100644
--- a/tensorflow/contrib/distribute/python/cross_device_ops_test.py
+++ b/tensorflow/contrib/distribute/python/cross_device_ops_test.py
@@ -23,6 +23,7 @@ import itertools
 from absl.testing import parameterized
 import numpy as np
 
+from tensorflow.contrib.distribute.python import collective_all_reduce_strategy
 from tensorflow.contrib.distribute.python import combinations
 from tensorflow.contrib.distribute.python import mirrored_strategy
 from tensorflow.contrib.distribute.python import multi_worker_test_base
@@ -40,8 +41,16 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 
 
+def _get_devices(devices):
+  if isinstance(devices, (tuple, list)):
+    return tuple(device_util.resolve(d) for d in devices)
+  elif isinstance(devices, value_lib.DistributedValues):
+    return devices.devices
+  return (device_util.resolve(devices),)
+
+
 def _make_per_replica(values, devices, regroup=False):
-  devices = cross_device_ops_lib.get_devices_from(devices)
+  devices = _get_devices(devices)
   assert len(values) == len(devices)
 
   # We simulate the result of regroup called on PerReplica which strips the
@@ -51,12 +60,12 @@ def _make_per_replica(values, devices, regroup=False):
       placed_v = array_ops.identity(values[0])
     return placed_v
 
-  index = {}
+  index = []
   for d, v in zip(devices, values):
     with ops.device(d):
       placed_v = array_ops.identity(v)
-    index[d] = placed_v
-  return value_lib.PerReplica(index)
+    index.append(placed_v)
+  return value_lib.PerReplica(value_lib.ReplicaDeviceMap(devices), index)
 
 
 # pylint: disable=g-doc-args,g-doc-return-or-yield
@@ -66,9 +75,9 @@ def _fake_mirrored(value, devices):
   All components of the returned Mirrored have the same objects, which is not
   true in reality.
   """
-  devices = cross_device_ops_lib.get_devices_from(devices)
-  return value_lib.Mirrored(
-      {d: v for d, v in zip(devices, [value] * len(devices))})
+  devices = _get_devices(devices)
+  return value_lib.Mirrored(value_lib.ReplicaDeviceMap(devices),
+                            [value] * len(devices))
 
 
 def _make_indexed_slices(values, indices, dense_shape, device):
@@ -81,9 +90,9 @@ def _make_indexed_slices(values, indices, dense_shape, device):
 
 
 def _make_mirrored_indexed_slices(devices, values, indices, dense_shape):
-  return value_lib.Mirrored({
-      d: _make_indexed_slices(values, indices, dense_shape, d) for d in devices
-  })
+  values = [_make_indexed_slices(values, indices, dense_shape, d)
+            for d in devices]
+  return value_lib.Mirrored(value_lib.ReplicaDeviceMap(devices), values)
 
 
 _cpu_device = "/device:CPU:0"
@@ -107,16 +116,16 @@ class CrossDeviceOpsTestBase(test.TestCase, parameterized.TestCase):
     else:
       self.assertEqual(type(left), type(right))
       self.assertEqual(set(left.devices), set(right.devices))
-      if isinstance(list(left._index.values())[0], ops.IndexedSlices):
-        for (d, v) in left._index.items():
-          self._assert_indexed_slices_equal(v, right._index[d])
+      if isinstance(left.values[0], ops.IndexedSlices):
+        for d in left.devices:
+          self._assert_indexed_slices_equal(left.get(d), right.get(d))
       elif context.executing_eagerly():
-        self.assertEqual([v.numpy() for v in left._index.values()],
-                         list(right._index.values()))
+        self.assertEqual([v.numpy() for v in left.values],
+                         list(right.values))
       else:
         with self.cached_session() as sess:
           self.assertEqual(
-              sess.run(list(left._index.values())), list(right._index.values()))
+              sess.run(list(left.values)), list(right.values))
 
   def _testReductionAndBroadcast(self, cross_device_ops, distribution):
     devices = distribution.extended.worker_devices
@@ -196,15 +205,15 @@ class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):
   reduction_to_one_combinations = combinations.combine(
       cross_device_ops=[
           combinations.NamedObject(
-              "DefaultReductionToOneDeviceCrossDeviceOps",
-              cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps()),
+              "DefaultReductionToOneDevice",
+              cross_device_ops_lib.ReductionToOneDevice()),
           combinations.NamedObject(
               "ReductionToCPUDeviceCrossDeviceOps",
-              cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps(
+              cross_device_ops_lib.ReductionToOneDevice(
                   reduce_to_device=_cpu_device)),
           combinations.NamedObject(
               "AccumulateNCrossDeviceOp",
-              cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps(
+              cross_device_ops_lib.ReductionToOneDevice(
                   accumulation_fn=math_ops.accumulate_n)),
       ],
       distribution=[
@@ -220,20 +229,23 @@ class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):
           combinations.NamedObject(
               "AllReduce",
               cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 1, 0, 0)),
-          combinations.NamedObject(
-              "HierarchicalCopy",
-              cross_device_ops_lib.AllReduceCrossDeviceOps(
-                  "hierarchical_copy", 8, 0, 0)),
           combinations.NamedObject(
               "AllReduceNoGradientRepacking",
               cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 0, 0, 0)),
+          combinations.NamedObject("NcclAllReduce",
+                                   cross_device_ops_lib.NcclAllReduce()),
+          combinations.NamedObject(
+              "HierarchicalCopy",
+              cross_device_ops_lib.HierarchicalCopyAllReduce(8)),
           combinations.NamedObject(
               "HierarchicalCopyAggregateSmallTensors",
               cross_device_ops_lib.AllReduceCrossDeviceOps(
                   "hierarchical_copy", 0, 100, 10))
       ],
-      distribution=[combinations.mirrored_strategy_with_two_gpus,
-                    combinations.core_mirrored_strategy_with_two_gpus],
+      distribution=[
+          combinations.mirrored_strategy_with_two_gpus,
+          combinations.core_mirrored_strategy_with_two_gpus
+      ],
       mode=["graph", "eager"])
 
   @combinations.generate(reduction_to_one_combinations + allreduce_combinations)
@@ -280,7 +292,8 @@ class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):
     devices = ["/cpu:0", "/gpu:0"]
     t0 = _make_indexed_slices([[1., 2.]], [1], [5, 2], devices[0])
     t1 = _make_indexed_slices([[3., 4.], [5., 6.]], [1, 3], [5, 2], devices[1])
-    per_replica = value_lib.PerReplica({devices[0]: t0, devices[1]: t1})
+    per_replica = value_lib.PerReplica(
+        value_lib.ReplicaDeviceMap(devices), (t0, t1))
     result = cross_device_ops_lib._simple_reduce(
         per_replica, devices[0], math_ops.add_n, reduce_util.ReduceOp.SUM)
 
@@ -297,8 +310,8 @@ class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):
       combinations.combine(
           cross_device_ops_instance=[
               combinations.NamedObject(
-                  "ReductionToOneDeviceCrossDeviceOps",
-                  cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps()),
+                  "ReductionToOneDevice",
+                  cross_device_ops_lib.ReductionToOneDevice()),
               combinations.NamedObject(
                   "AllReduceCrossDeviceOps",
                   cross_device_ops_lib.AllReduceCrossDeviceOps())
@@ -314,7 +327,8 @@ class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):
     t0 = _make_indexed_slices([[1., 2.]], [1], dense_shape, devices[0])
     t1 = _make_indexed_slices(
         [[3., 4.], [5., 6.]], [1, 3], dense_shape, devices[1])
-    per_replica = value_lib.PerReplica({devices[0]: t0, devices[1]: t1})
+    per_replica = value_lib.PerReplica(
+        value_lib.ReplicaDeviceMap(devices), (t0, t1))
 
     if batch_reduce:
       result = cross_device_ops_instance.batch_reduce(
@@ -416,6 +430,9 @@ class MultiWorkerCrossDeviceOpsTest(multi_worker_test_base.MultiWorkerTestBase,
       self._testReductionAndBroadcast(cross_device_ops, distribution)
 
 
+NUM_WORKERS = 3
+
+
 class MultiWorkerCollectiveAllReduceTest(
     multi_worker_test_base.MultiWorkerTestBase, parameterized.TestCase):
 
@@ -423,9 +440,9 @@ class MultiWorkerCollectiveAllReduceTest(
 
   @classmethod
   def setUpClass(cls):
-    """Create a local cluster with 2 workers."""
+    """Create a local cluster with 3 workers."""
     cls._cluster_spec = multi_worker_test_base.create_in_process_cluster(
-        num_workers=3, num_ps=0)
+        num_workers=NUM_WORKERS, num_ps=0)
 
   def setUp(self):
     super(MultiWorkerCollectiveAllReduceTest, self).setUp()
@@ -433,7 +450,12 @@ class MultiWorkerCollectiveAllReduceTest(
     # collective key base for different tests.
     MultiWorkerCollectiveAllReduceTest.collective_key_base += 100000
 
-  def _get_test_objects(self, task_type, task_id, num_gpus=0, local_mode=False):
+  def _get_test_objects(self,
+                        task_type,
+                        task_id,
+                        num_gpus=0,
+                        use_strategy_object=False,
+                        local_mode=False):
     collective_keys = cross_device_utils.CollectiveKeys(
         group_key_start=10 * num_gpus +
         MultiWorkerCollectiveAllReduceTest.collective_key_base,
@@ -442,16 +464,24 @@ class MultiWorkerCollectiveAllReduceTest(
         instance_key_with_id_start=num_gpus * 10000 +
         MultiWorkerCollectiveAllReduceTest.collective_key_base)
     if local_mode:
-      collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce(
-          1, num_gpus, collective_keys=collective_keys)
       if num_gpus:
         devices = ["/device:GPU:%d" % i for i in range(num_gpus)]
       else:
         devices = ["/device:CPU:0"]
-      return collective_all_reduce_ops, devices, ""
+
+      if use_strategy_object:
+        # Still using contrib CollectiveAllReduceStrategy because we can specify
+        # num_gpus in its constructor.
+        strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
+            num_gpus_per_worker=num_gpus)
+        strategy.extended._collective_keys = collective_keys
+        strategy.extended._cross_device_ops._collective_keys = collective_keys
+        return strategy, devices, ""
+      else:
+        collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce(
+            1, num_gpus, collective_keys=collective_keys)
+        return collective_all_reduce_ops, devices, ""
     else:
-      collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce(
-          3, num_gpus, collective_keys=collective_keys)
       if num_gpus:
         devices = [
             "/job:%s/task:%d/device:GPU:%d" % (task_type, task_id, i)
@@ -459,8 +489,23 @@ class MultiWorkerCollectiveAllReduceTest(
         ]
       else:
         devices = ["/job:%s/task:%d" % (task_type, task_id)]
-      return (collective_all_reduce_ops, devices,
-              "grpc://" + self._cluster_spec[task_type][task_id])
+
+      if use_strategy_object:
+        strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
+            num_gpus_per_worker=num_gpus)
+        strategy.configure(
+            cluster_spec=self._cluster_spec,
+            task_type=task_type,
+            task_id=task_id)
+        strategy.extended._collective_keys = collective_keys
+        strategy.extended._cross_device_ops._collective_keys = collective_keys
+        return (strategy, devices,
+                "grpc://" + self._cluster_spec[task_type][task_id])
+      else:
+        collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce(
+            NUM_WORKERS, num_gpus, collective_keys=collective_keys)
+        return (collective_all_reduce_ops, devices,
+                "grpc://" + self._cluster_spec[task_type][task_id])
 
   def _assert_values_equal(self, left, right, sess):
     if isinstance(left, list):
@@ -474,15 +519,24 @@ class MultiWorkerCollectiveAllReduceTest(
       run_options.experimental.collective_graph_key = 6
 
       left_values = np.array(
-          sess.run(list(left._index.values()), options=run_options)).flatten()
-      right_values = np.array(list(right._index.values())).flatten()
+          sess.run(list(left.values), options=run_options)).flatten()
+      right_values = np.array(list(right.values)).flatten()
       self.assertEqual(len(left_values), len(right_values))
       for l, r in zip(left_values, right_values):
         self.assertEqual(l, r)
 
-  def _test_reduction(self, task_type, task_id, num_gpus, local_mode=False):
+  def _test_reduction(self,
+                      task_type,
+                      task_id,
+                      num_gpus,
+                      use_strategy_object=False,
+                      local_mode=False):
     collective_all_reduce, devices, master_target = self._get_test_objects(
-        task_type, task_id, num_gpus, local_mode=local_mode)
+        task_type,
+        task_id,
+        num_gpus,
+        use_strategy_object=use_strategy_object,
+        local_mode=local_mode)
     if local_mode:
       num_workers = 1
       worker_device = None
@@ -490,13 +544,34 @@ class MultiWorkerCollectiveAllReduceTest(
       num_workers = len(self._cluster_spec.get("chief", [])) + len(
           self._cluster_spec.get("worker", []))
       worker_device = "/job:%s/task:%d" % (task_type, task_id)
+
+    def _reduce(test_object, reduce_op, per_replica, destinations):
+      if use_strategy_object:
+        with test_object.scope():
+          # Mimic the behavior that distribution strategy usually strips the
+          # wrapper if there is only one value.
+          if len(per_replica.values) == 1:
+            per_replica = per_replica.values[0]
+          return test_object.extended.reduce_to(reduce_op, per_replica,
+                                                destinations)
+      else:
+        return test_object.reduce(reduce_op, per_replica, destinations)
+
+    def _batch_reduce(test_object, reduce_op, value_destination_pairs):
+      if use_strategy_object:
+        with test_object.scope():
+          return test_object.extended.batch_reduce_to(reduce_op,
+                                                      value_destination_pairs)
+      else:
+        return test_object.batch_reduce(reduce_op, value_destination_pairs)
+
     with ops.Graph().as_default(), \
          ops.device(worker_device), \
          self.cached_session(target=master_target) as sess:
       # Collective ops doesn't support scalar tensors, so we have to construct
       # 1-d tensors.
       values = [constant_op.constant([float(d)]) for d in range(len(devices))]
-      per_replica = _make_per_replica(values, devices, regroup=True)
+      per_replica = _make_per_replica(values, devices)
       mean = np.array([(len(devices) - 1.) / 2.])
 
       values_2 = [constant_op.constant([d + 1.0]) for d in range(len(devices))]
@@ -514,26 +589,30 @@ class MultiWorkerCollectiveAllReduceTest(
       # test reduce()
       for destinations in all_destinations:
         self._assert_values_equal(
-            collective_all_reduce.reduce(
+            _reduce(
+                collective_all_reduce,
                 reduce_util.ReduceOp.MEAN,
                 per_replica,
-                destinations=destinations),
-            _fake_mirrored(mean, destinations), sess)
+                destinations=destinations), _fake_mirrored(mean, destinations),
+            sess)
         self._assert_values_equal(
-            collective_all_reduce.reduce(
+            _reduce(
+                collective_all_reduce,
                 reduce_util.ReduceOp.MEAN,
                 per_replica_2,
-                destinations=destinations),
-            _fake_mirrored(mean_2, destinations), sess)
+                destinations=destinations), _fake_mirrored(
+                    mean_2, destinations), sess)
         self._assert_values_equal(
-            collective_all_reduce.reduce(
+            _reduce(
+                collective_all_reduce,
                 reduce_util.ReduceOp.SUM,
                 per_replica,
                 destinations=destinations),
             _fake_mirrored(mean * len(devices) * num_workers, destinations),
             sess)
         self._assert_values_equal(
-            collective_all_reduce.reduce(
+            _reduce(
+                collective_all_reduce,
                 reduce_util.ReduceOp.SUM,
                 per_replica_2,
                 destinations=destinations),
@@ -543,17 +622,13 @@ class MultiWorkerCollectiveAllReduceTest(
       # test batch_reduce()
       for d1, d2 in itertools.product(all_destinations, all_destinations):
         self._assert_values_equal(
-            collective_all_reduce.batch_reduce(reduce_util.ReduceOp.MEAN,
-                                               [(per_replica, d1),
-                                                (per_replica_2, d2)]),
-            [
-                _fake_mirrored(mean, d1),
-                _fake_mirrored(mean_2, d2)
-            ], sess)
+            _batch_reduce(collective_all_reduce, reduce_util.ReduceOp.MEAN,
+                          [(per_replica, d1), (per_replica_2, d2)]),
+            [_fake_mirrored(mean, d1),
+             _fake_mirrored(mean_2, d2)], sess)
         self._assert_values_equal(
-            collective_all_reduce.batch_reduce(reduce_util.ReduceOp.SUM,
-                                               [(per_replica, d1),
-                                                (per_replica_2, d2)]),
+            _batch_reduce(collective_all_reduce, reduce_util.ReduceOp.SUM,
+                          [(per_replica, d1), (per_replica_2, d2)]),
             [
                 _fake_mirrored(mean * len(devices) * num_workers, d1),
                 _fake_mirrored(mean_2 * len(devices) * num_workers, d2)
@@ -562,18 +637,36 @@ class MultiWorkerCollectiveAllReduceTest(
     return True
 
   @combinations.generate(
-      combinations.combine(mode=["graph"], num_gpus=[0, 1, 2], required_gpus=1))
-  def testReductionDistributed(self, num_gpus):
+      combinations.combine(
+          mode=["graph"],
+          num_gpus=[0, 1, 2],
+          required_gpus=1,
+          use_strategy_object=[True, False]))
+  def testReductionDistributed(self, num_gpus, use_strategy_object):
     if context.num_gpus() < num_gpus:
       return
-    self._run_between_graph_clients(self._test_reduction, self._cluster_spec,
-                                    num_gpus)
+    self._run_between_graph_clients(
+        self._test_reduction,
+        self._cluster_spec,
+        num_gpus,
+        use_strategy_object=use_strategy_object)
 
   # Collective ops doesn't support strategy with one device.
-  def testReductionLocal(self, num_gpus=2):
+  @combinations.generate(
+      combinations.combine(
+          mode=["graph"],
+          num_gpus=[2],
+          required_gpus=2,
+          use_strategy_object=[True, False]))
+  def testReductionLocal(self, num_gpus, use_strategy_object):
     if context.num_gpus() < num_gpus:
       return
-    self._test_reduction(None, None, num_gpus, local_mode=True)
+    self._test_reduction(
+        None,
+        None,
+        num_gpus,
+        use_strategy_object=use_strategy_object,
+        local_mode=True)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distribute/python/cross_device_utils_test.py b/tensorflow/contrib/distribute/python/cross_device_utils_test.py
index 2303a31677afbd12a0b8e7eea3ecf7c7736c46ad..275aac2eeca575e927878d1ece63ce37ed38e8a0 100644
--- a/tensorflow/contrib/distribute/python/cross_device_utils_test.py
+++ b/tensorflow/contrib/distribute/python/cross_device_utils_test.py
@@ -103,7 +103,8 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase):
         constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
     t1 = math_ops._as_indexed_slices(
         constant_op.constant([[0., 0.], [5, 6], [7., 8.]]))
-    per_replica = value_lib.PerReplica({"/gpu:0": t0, "/cpu:0": t1})
+    device_map = value_lib.ReplicaDeviceMap(("/gpu:0", "/cpu:0"))
+    per_replica = value_lib.PerReplica(device_map, (t0, t1))
     self.assertTrue(cross_device_utils.contains_indexed_slices(per_replica))
 
   @combinations.generate(combinations.combine(
diff --git a/tensorflow/contrib/distribute/python/estimator_integration_test.py b/tensorflow/contrib/distribute/python/estimator_integration_test.py
index e17085628ba6d1dfc79839fd824801723f07a518..1ff1e7c1d255492e0535175dae7594d2ceb4010b 100644
--- a/tensorflow/contrib/distribute/python/estimator_integration_test.py
+++ b/tensorflow/contrib/distribute/python/estimator_integration_test.py
@@ -22,7 +22,6 @@ import shutil
 import tempfile
 from absl.testing import parameterized
 import numpy as np
-import six
 
 from tensorflow.contrib.distribute.python import combinations
 from tensorflow.contrib.optimizer_v2 import adagrad
@@ -117,7 +116,7 @@ class DNNLinearCombinedClassifierIntegrationTest(test.TestCase,
       scores = estimator.evaluate(eval_input_fn)
 
     self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
-    self.assertIn('loss', six.iterkeys(scores))
+    self.assertIn('loss', scores)
 
     predictions = np.array([
         x[prediction_keys.PredictionKeys.PREDICTIONS]
diff --git a/tensorflow/contrib/distribute/python/estimator_training_test.py b/tensorflow/contrib/distribute/python/estimator_training_test.py
index b369a7fefe6f35cf5a9b64451419cf4f72a99471..3f55a8a1c8b88d1b8e4031547fa3fbe519983630 100644
--- a/tensorflow/contrib/distribute/python/estimator_training_test.py
+++ b/tensorflow/contrib/distribute/python/estimator_training_test.py
@@ -375,11 +375,13 @@ class DistributeCoordinatorIntegrationTest(
     threads = self.run_multiple_tasks_in_threads(self._independent_worker_fn,
                                                  cluster_spec, train_distribute,
                                                  eval_distribute)
+    threads_to_join = []
     for task_type, ts in threads.items():
       if task_type == PS:
         continue
       for t in ts:
-        t.join()
+        threads_to_join.append(t)
+    self.join_independent_workers(threads_to_join)
 
     estimator = self._get_estimator(train_distribute, eval_distribute)
     self._inspect_train_and_eval_events(estimator)
@@ -413,8 +415,7 @@ class DistributeCoordinatorIntegrationTest(
     threads = self.run_multiple_tasks_in_threads(self._independent_worker_fn,
                                                  cluster_spec, train_distribute,
                                                  eval_distribute)
-    threads[WORKER][0].join()
-    threads[EVALUATOR][0].join()
+    self.join_independent_workers([threads[WORKER][0], threads[EVALUATOR][0]])
 
     estimator = self._get_estimator(train_distribute, eval_distribute)
     self._inspect_train_and_eval_events(estimator)
diff --git a/tensorflow/contrib/distribute/python/examples/BUILD b/tensorflow/contrib/distribute/python/examples/BUILD
index 84b106545e1326fddd3ed299462534af982dc102..5f89df5824a8d03198987a6fa3d21e2330deedf0 100644
--- a/tensorflow/contrib/distribute/python/examples/BUILD
+++ b/tensorflow/contrib/distribute/python/examples/BUILD
@@ -31,6 +31,12 @@ py_binary(
 
 py_binary(
     name = "keras_mnist",
+    srcs = ["keras_mnist.py"],
+    deps = [":keras_mnist_lib"],
+)
+
+py_library(
+    name = "keras_mnist_lib",
     srcs = [
         "keras_mnist.py",
     ],
@@ -39,3 +45,14 @@ py_binary(
         "//third_party/py/numpy",
     ],
 )
+
+py_binary(
+    name = "mnist_eager_multigpu",
+    srcs = [
+        "mnist_eager_multigpu.py",
+    ],
+    deps = [
+        "//tensorflow:tensorflow_py",
+        "//third_party/py/numpy",
+    ],
+)
diff --git a/tensorflow/contrib/distribute/python/examples/keras_mnist.py b/tensorflow/contrib/distribute/python/examples/keras_mnist.py
index 60fda996642464135fe1fb8c314bcf7f04d19362..1ce91ecaf22a80a53124c8f00fac05c6b4711ed9 100644
--- a/tensorflow/contrib/distribute/python/examples/keras_mnist.py
+++ b/tensorflow/contrib/distribute/python/examples/keras_mnist.py
@@ -109,22 +109,21 @@ def main(_):
   tf.enable_eager_execution()
 
   train_ds, eval_ds, input_shape = get_input_datasets()
-  model = get_model(input_shape)
 
   # Instantiate the MirroredStrategy object. If we don't specify `num_gpus` or
   # the `devices` argument then all the GPUs available on the machine are used.
   # TODO(priyag): Use `tf.distribute.MirroredStrategy` once available.
   strategy = mirrored_strategy.MirroredStrategy(['/gpu:0', '/cpu:0'])
 
-  optimizer = rmsprop.RMSProp(learning_rate=0.001)
-
-  # Compile the model by passing the distribution strategy object to the
-  # `distribute` argument. `fit`, `evaluate` and `predict` will be distributed
-  # based on the strategy instantiated.
-  model.compile(loss=tf.keras.losses.categorical_crossentropy,
-                optimizer=optimizer,
-                metrics=['accuracy'],
-                distribute=strategy)
+  # Create and compile the model under Distribution strategy scope.
+  # `fit`, `evaluate` and `predict` will be distributed based on the strategy
+  # model was compiled with.
+  with strategy.scope():
+    model = get_model(input_shape)
+    optimizer = rmsprop.RMSProp(learning_rate=0.001)
+    model.compile(loss=tf.keras.losses.categorical_crossentropy,
+                  optimizer=optimizer,
+                  metrics=['accuracy'])
 
   # Train the model with the train dataset.
   model.fit(x=train_ds, epochs=20, steps_per_epoch=468)
diff --git a/tensorflow/contrib/distribute/python/examples/mnist_eager_multigpu.py b/tensorflow/contrib/distribute/python/examples/mnist_eager_multigpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..c045a5586b9dad371d8c505f9cac4b792dd157fd
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/examples/mnist_eager_multigpu.py
@@ -0,0 +1,169 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Run MNIST on multiple GPUs on using MirroredStrategy with eager execution.
+
+By default, runs on all available GPUs, or CPU if no GPUs are available.
+
+NOTE: Currently, this takes more time than when running MNIST in eager without
+MirroredStrategy because of a number overheads. Therefore, this is just a
+proof of concept right now and cannot be used to actually scale up training.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl import app
+from absl import flags
+import numpy as np
+import tensorflow.compat.v2 as tf
+
+flags.DEFINE_integer("num_gpus", None, "How many GPUs should we run on?"
+                     "Defaults to all available GPUs, otherwise CPU.")
+flags.DEFINE_integer("batch_size", 64,
+                     "What should be the size of each batch?")
+flags.DEFINE_integer("num_epochs", 10, "How many epochs to run?")
+flags.DEFINE_float("learning_rate", 0.01, "Learning Rate")
+flags.DEFINE_float("momentum", 0.5, "SGD momentum")
+flags.DEFINE_boolean("use_function", False,
+                     "Should we wrap the step in a tf.function.")
+
+FLAGS = flags.FLAGS
+NUM_TRAIN_IMAGES = 60000
+
+
+def create_model():
+  max_pool = tf.keras.layers.MaxPooling2D((2, 2), (2, 2), padding="same")
+  # The model consists of a sequential chain of layers, so tf.keras.Sequential
+  # (a subclass of tf.keras.Model) makes for a compact description.
+  return tf.keras.Sequential([
+      tf.keras.layers.Reshape(
+          target_shape=[28, 28, 1],
+          input_shape=(28, 28,)),
+      tf.keras.layers.Conv2D(2, 5, padding="same", activation=tf.nn.relu),
+      max_pool,
+      tf.keras.layers.Conv2D(4, 5, padding="same", activation=tf.nn.relu),
+      max_pool,
+      tf.keras.layers.Flatten(),
+      tf.keras.layers.Dense(32, activation=tf.nn.relu),
+      tf.keras.layers.Dropout(0.4),
+      tf.keras.layers.Dense(10)])
+
+
+def compute_loss(logits, labels):
+  loss = tf.reduce_sum(
+      tf.nn.sparse_softmax_cross_entropy_with_logits(
+          logits=logits, labels=labels))
+  # Scale loss by global batch size.
+  return loss * (1. / FLAGS.batch_size)
+
+
+def mnist_datasets():
+  (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
+  # Numpy defaults to dtype=float64; TF defaults to float32. Stick with float32.
+  x_train, x_test = x_train / np.float32(255), x_test / np.float32(255)
+  y_train, y_test = y_train.astype(np.int64), y_test.astype(np.int64)
+  # TODO(priyag): `strategy.make_numpy_iterator` can be used directly instead of
+  # converting to datasets.
+  train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
+  test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test))
+  return train_dataset, test_dataset
+
+
+def main(unused_argv):
+  """Run a CNN model on MNIST data to demonstrate DistributedStrategies."""
+
+  tf.enable_v2_behavior()
+
+  num_gpus = FLAGS.num_gpus
+  if num_gpus is None:
+    devices = None
+  elif num_gpus == 0:
+    devices = ["/device:CPU:0"]
+  else:
+    devices = ["/device:GPU:{}".format(i) for i in range(num_gpus)]
+  strategy = tf.distribute.MirroredStrategy(devices)
+
+  with strategy.scope():
+    train_ds, test_ds = mnist_datasets()
+    train_ds = train_ds.shuffle(NUM_TRAIN_IMAGES).batch(FLAGS.batch_size)
+    test_ds = test_ds.batch(FLAGS.batch_size)
+
+    model = create_model()
+    optimizer = tf.keras.optimizers.SGD(FLAGS.learning_rate, FLAGS.momentum)
+    training_loss = tf.keras.metrics.Mean("training_loss", dtype=tf.float32)
+    training_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
+        "training_accuracy", dtype=tf.float32)
+    test_loss = tf.keras.metrics.Mean("test_loss", dtype=tf.float32)
+    test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
+        "test_accuracy", dtype=tf.float32)
+
+    def train_step(inputs):
+      images, labels = inputs
+      with tf.GradientTape() as tape:
+        logits = model(images, training=True)
+        loss = compute_loss(logits, labels)
+      grads = tape.gradient(loss, model.variables)
+      optimizer.apply_gradients(zip(grads, model.variables))
+      training_loss.update_state(loss)
+      training_accuracy.update_state(labels, logits)
+
+    def test_step(inputs):
+      images, labels = inputs
+      logits = model(images, training=False)
+      loss = compute_loss(logits, labels)
+      test_loss.update_state(loss)
+      test_accuracy.update_state(labels, logits)
+
+    train_iterator = strategy.make_dataset_iterator(train_ds)
+    test_iterator = strategy.make_dataset_iterator(test_ds)
+
+    for epoch in range(0, FLAGS.num_epochs):
+      # TODO(b/123315763): Create the tf.function outside this loop once we are
+      # able to initialize iterator in eager mode.
+      dist_train = lambda it: strategy.experimental_run(train_step, it)
+      dist_test = lambda it: strategy.experimental_run(test_step, it)
+      if FLAGS.use_function:
+        dist_train = tf.function(dist_train)
+        dist_test = tf.function(dist_test)
+
+      # Train
+      print("Starting epoch {}".format(epoch))
+      train_iterator.initialize()
+      while True:
+        try:
+          dist_train(train_iterator)
+        except tf.errors.OutOfRangeError:
+          break
+      print("Training loss: {:0.4f}, accuracy: {:0.2f}%".format(
+          training_loss.result(), training_accuracy.result() * 100))
+      training_loss.reset_states()
+      training_accuracy.reset_states()
+
+      # Test
+      test_iterator.initialize()
+      while True:
+        try:
+          dist_test(test_iterator)
+        except tf.errors.OutOfRangeError:
+          break
+      print("Test loss: {:0.4f}, accuracy: {:0.2f}%".format(
+          test_loss.result(), test_accuracy.result() * 100))
+      test_loss.reset_states()
+      test_accuracy.reset_states()
+
+
+if __name__ == "__main__":
+  app.run(main)
diff --git a/tensorflow/contrib/distribute/python/input_lib_test.py b/tensorflow/contrib/distribute/python/input_lib_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..204f52b034f2366a42fbdab41c467feddb5969a0
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/input_lib_test.py
@@ -0,0 +1,217 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the input_lib library."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python import multi_worker_test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import input_lib
+from tensorflow.python.distribute import values
+from tensorflow.python.eager import context
+from tensorflow.python.eager import test
+from tensorflow.python.framework import errors
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.util import nest
+
+
+class InputIteratorTestBase(test.TestCase):
+
+  def _test_iterator(self, input_type, dataset_fn, worker_device_pairs,
+                     expected_values, sess=None, split_batch_by=None):
+    devices = nest.flatten([ds for _, ds in worker_device_pairs])
+    device_map = values.ReplicaDeviceMap(devices)
+    input_workers = input_lib.InputWorkers(device_map, worker_device_pairs)
+
+    if input_type == "input_fn":
+      input_contexts = [
+          distribute_lib.InputContext() for _ in worker_device_pairs]
+      input_fn = lambda _: dataset_fn()
+      iterator = input_lib.InputFunctionIterator(
+          input_fn, input_workers, input_contexts)
+    else:
+      iterator = input_lib.DatasetIterator(
+          dataset_fn(), input_workers, split_batch_by)
+
+    evaluate = lambda x: sess.run(x) if sess else self.evaluate(x)
+
+    evaluate(control_flow_ops.group(iterator.initialize()))
+
+    for expected_value in expected_values:
+      next_element = iterator.get_next()
+      computed_value = evaluate(
+          [values.select_replica(r, next_element) for r in range(len(devices))])
+      self.assertAllEqual(expected_value, computed_value)
+
+    with self.assertRaises(errors.OutOfRangeError):
+      next_element = iterator.get_next()
+      evaluate([values.select_replica(r, next_element)
+                for r in range(len(devices))])
+
+    # After re-initializing the iterator, should be able to iterate again.
+    evaluate(control_flow_ops.group(iterator.initialize()))
+
+    for expected_value in expected_values:
+      next_element = iterator.get_next()
+      computed_value = evaluate(
+          [values.select_replica(r, next_element) for r in range(len(devices))])
+      self.assertAllEqual(expected_value, computed_value)
+
+
+class InputIteratorSingleWorkerTest(InputIteratorTestBase,
+                                    parameterized.TestCase):
+
+  @combinations.generate(combinations.combine(
+      mode=["graph", "eager"],
+      input_type=["input_fn", "dataset"]))
+  def testOneDeviceCPU(self, input_type):
+    worker_device_pairs = [("", ["/device:CPU:0"])]
+    dataset_fn = lambda: dataset_ops.Dataset.range(10)
+
+    expected_values = [[i] for i in range(10)]
+
+    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
+                        expected_values)
+
+  @combinations.generate(combinations.combine(
+      mode=["graph", "eager"],
+      input_type=["input_fn", "dataset"],
+      required_gpus=1))
+  def testTwoDevicesOneGPUOneCPU(self, input_type):
+    worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])]
+    dataset_fn = lambda: dataset_ops.Dataset.range(10)
+
+    expected_values = [[i, i+1] for i in range(0, 10, 2)]
+
+    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
+                        expected_values)
+
+  @combinations.generate(combinations.combine(
+      mode=["graph", "eager"],
+      input_type=["input_fn", "dataset"],
+      required_gpus=1))
+  def testTupleDataset(self, input_type):
+    worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])]
+    def dataset_fn():
+      dataset1 = dataset_ops.Dataset.range(10)
+      dataset2 = dataset_ops.Dataset.range(10).map(lambda x: x**2)
+      return dataset_ops.Dataset.zip((dataset1, dataset2))
+
+    expected_values = [[(i, i**2), (i+1, (i+1)**2)] for i in range(0, 10, 2)]
+
+    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
+                        expected_values)
+
+  @combinations.generate(combinations.combine(
+      mode=["graph", "eager"],
+      input_type=["input_fn", "dataset"],
+      required_gpus=1))
+  def testUnevenDatasetBatches(self, input_type):
+    worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])]
+    dataset_fn = lambda: dataset_ops.Dataset.range(11)
+
+    expected_values = [[i, i+1] for i in range(0, 10, 2)]
+    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
+                        expected_values)
+
+  @combinations.generate(combinations.combine(
+      mode=["graph", "eager"],
+      input_type=["dataset"],
+      split_batch_by=[None, 2],
+      required_gpus=1))
+  def testBatchSplitting(self, input_type, split_batch_by):
+    worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])]
+    batch_size = 10
+    dataset_fn = lambda: dataset_ops.Dataset.range(100).batch(batch_size)
+
+    updated_batch_size = (
+        batch_size // split_batch_by if split_batch_by else batch_size)
+    expected_values = [[range(i, i+updated_batch_size),
+                        range(i+updated_batch_size, i+2*updated_batch_size)]
+                       for i in range(0, 100, updated_batch_size*2)]
+
+    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
+                        expected_values, sess=None,
+                        split_batch_by=split_batch_by)
+
+
+class InputIteratorMultiWorkerTest(
+    multi_worker_test_base.MultiWorkerTestBase, InputIteratorTestBase,
+    parameterized.TestCase):
+
+  def _cpu_devices(self):
+    return [
+        ("/job:worker/replica:0/task:0",
+         ["/job:worker/replica:0/task:0/device:CPU:0"]),
+        ("/job:worker/replica:0/task:1",
+         ["/job:worker/replica:0/task:1/device:CPU:0"])]
+
+  def _cpu_and_one_gpu_devices(self):
+    return [
+        ("/job:worker/replica:0/task:0", [
+            "/job:worker/replica:0/task:0/device:GPU:0",
+            "/job:worker/replica:0/task:0/device:CPU:0"
+        ]),
+        ("/job:worker/replica:0/task:1", [
+            "/job:worker/replica:0/task:1/device:GPU:0",
+            "/job:worker/replica:0/task:1/device:CPU:0"
+        ])
+    ]
+
+  @combinations.generate(combinations.combine(
+      mode=["graph"],
+      input_type=["input_fn", "dataset"]))
+  def testOneDevicePerWorker(self, input_type):
+    worker_devices = self._cpu_devices()
+    with context.graph_mode(), self.cached_session() as sess:
+      dataset_fn = lambda: dataset_ops.Dataset.range(4)
+      self._test_iterator(input_type, dataset_fn, worker_devices,
+                          [[0, 0], [1, 1], [2, 2], [3, 3]], sess)
+
+  @combinations.generate(combinations.combine(
+      mode=["graph"],
+      input_type=["input_fn", "dataset"],
+      required_gpus=1))
+  def testTwoDevicesPerWorker(self, input_type):
+    worker_devices = self._cpu_and_one_gpu_devices()
+    with context.graph_mode(), self.cached_session() as sess:
+      dataset_fn = lambda: dataset_ops.Dataset.range(4)
+      self._test_iterator(input_type, dataset_fn, worker_devices,
+                          [[0, 1, 0, 1], [2, 3, 2, 3]], sess)
+
+  @combinations.generate(combinations.combine(
+      mode=["graph"],
+      input_type=["input_fn", "dataset"]))
+  def testTupleDataset(self, input_type):
+    worker_devices = self._cpu_devices()
+    with context.graph_mode(), self.cached_session() as sess:
+      def dataset_fn():
+        dataset1 = dataset_ops.Dataset.range(4)
+        dataset2 = dataset_ops.Dataset.range(4).map(lambda x: x**2)
+        return dataset_ops.Dataset.zip((dataset1, dataset2))
+
+      expected_values = [[(i, i**2), (i, i**2)] for i in range(0, 4)]
+      self._test_iterator(input_type, dataset_fn, worker_devices,
+                          expected_values, sess)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/keras_backward_compat_test.py b/tensorflow/contrib/distribute/python/keras_backward_compat_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c49b5522f9135efd9ae3005e92099caf54a76a3a
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/keras_backward_compat_test.py
@@ -0,0 +1,1083 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tf.keras models using DistributionStrategy."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python import mirrored_strategy
+from tensorflow.contrib.distribute.python import tpu_strategy
+from tensorflow.python import keras
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.eager import test
+from tensorflow.python.framework import random_seed
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.keras.engine import distributed_training_utils
+from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_descent_keras
+from tensorflow.python.keras.utils.mode_keys import ModeKeys
+from tensorflow.python.ops.parsing_ops import gen_parsing_ops
+from tensorflow.python.training import gradient_descent
+from tensorflow.python.training import rmsprop
+
+_RANDOM_SEED = 1337
+_TRAIN_SIZE = 200
+_INPUT_SIZE = (10,)
+_NUM_CLASS = 2
+
+
+# TODO(anjalisridhar): Add a decorator that will allow us to run these tests as
+# part of the tf.keras unit tests suite.
+def simple_sequential_model():
+  model = keras.models.Sequential()
+  model.add(keras.layers.Dense(16, activation='relu', input_shape=_INPUT_SIZE))
+  model.add(keras.layers.Dropout(0.1))
+  model.add(keras.layers.Dense(_NUM_CLASS, activation='softmax'))
+  return model
+
+
+def simple_functional_model():
+  a = keras.layers.Input(shape=_INPUT_SIZE)
+  b = keras.layers.Dense(16, activation='relu')(a)
+  b = keras.layers.Dropout(0.1)(b)
+  b = keras.layers.Dense(_NUM_CLASS, activation='softmax')(b)
+  model = keras.models.Model(inputs=[a], outputs=[b])
+  return model
+
+
+def multi_inputs_multi_outputs_model():
+  input_a = keras.layers.Input(shape=(16,), name='input_a')
+  input_b = keras.layers.Input(shape=(16,), name='input_b')
+  input_m = keras.layers.Input(shape=(8,), dtype='string', name='input_m')
+  dense = keras.layers.Dense(8, name='dense_1')
+
+  interm_a = dense(input_a)
+  # Read m
+  interm_m = keras.layers.Lambda(gen_parsing_ops.string_to_number)(input_m)
+  interm_s = keras.layers.Lambda(lambda k: k[0] * k[1])([interm_m, interm_a])
+  interm_b = dense(input_b)
+  merged = keras.layers.concatenate([interm_s, interm_b], name='merge')
+  output_c = keras.layers.Dense(3, activation='softmax', name='dense_2')(merged)
+  output_d = keras.layers.Dense(2, activation='softmax', name='dense_3')(merged)
+  model = keras.models.Model(
+      inputs=[input_a, input_b, input_m], outputs=[output_c, output_d])
+  model.compile(
+      loss='categorical_crossentropy',
+      optimizer=gradient_descent.GradientDescentOptimizer(0.001),
+      metrics={
+          'dense_2': 'categorical_accuracy',
+          'dense_3': 'categorical_accuracy'
+      })
+  return model
+
+
+def get_ds_train_input_fn():
+  np.random.seed(_RANDOM_SEED)
+  (x_train, y_train), _ = testing_utils.get_test_data(
+      train_samples=_TRAIN_SIZE,
+      test_samples=50,
+      input_shape=_INPUT_SIZE,
+      num_classes=_NUM_CLASS)
+  y_train = keras.utils.to_categorical(y_train)
+
+  dataset = dataset_ops.Dataset.from_tensor_slices((x_train, y_train))
+  dataset = dataset.batch(32)
+  return dataset
+
+
+def get_ds_test_input_fn():
+  np.random.seed(_RANDOM_SEED)
+  _, (x_test, y_test) = testing_utils.get_test_data(
+      train_samples=_TRAIN_SIZE,
+      test_samples=50,
+      input_shape=_INPUT_SIZE,
+      num_classes=_NUM_CLASS)
+  y_test = keras.utils.to_categorical(y_test)
+
+  dataset = dataset_ops.Dataset.from_tensor_slices((x_test, y_test))
+  dataset = dataset.batch(32)
+  return dataset
+
+
+def get_multi_inputs_multi_outputs_data():
+  (a_train, c_train), (a_test, c_test) = testing_utils.get_test_data(
+      train_samples=_TRAIN_SIZE,
+      test_samples=50,
+      input_shape=(16,),
+      num_classes=3,
+      random_seed=_RANDOM_SEED)
+  (b_train, d_train), (b_test, d_test) = testing_utils.get_test_data(
+      train_samples=_TRAIN_SIZE,
+      test_samples=50,
+      input_shape=(16,),
+      num_classes=2,
+      random_seed=_RANDOM_SEED)
+  (m_train, _), (m_test, _) = testing_utils.get_test_data(
+      train_samples=_TRAIN_SIZE,
+      test_samples=50,
+      input_shape=(8,),
+      num_classes=2,
+      random_seed=_RANDOM_SEED)
+
+  c_train = keras.utils.to_categorical(c_train)
+  c_test = keras.utils.to_categorical(c_test)
+  d_train = keras.utils.to_categorical(d_train)
+  d_test = keras.utils.to_categorical(d_test)
+
+  train_data = {
+      'input_a': a_train,
+      'input_b': b_train,
+      'input_m': m_train,
+      'output_c': c_train,
+      'output_d': d_train
+  }
+  test_data = {
+      'input_a': a_test,
+      'input_b': b_test,
+      'input_m': m_test,
+      'output_c': c_test,
+      'output_d': d_test
+  }
+
+  return (train_data, test_data)
+
+
+def batch_wrapper(dataset, batch_size, distribution, repeat=None):
+  if repeat:
+    dataset = dataset.repeat(repeat)
+  # TPUs currently require fully defined input shapes, drop_remainder ensures
+  # the input will have fully defined shapes.
+  if isinstance(distribution, tpu_strategy.TPUStrategy):
+    return dataset.batch(batch_size, drop_remainder=True)
+  else:
+    return dataset.batch(batch_size)
+
+
+def get_model():
+  x = keras.layers.Input(shape=(3,), name='input')
+  y = keras.layers.Dense(4, name='dense')(x)
+  model = keras.Model(x, y)
+  return model
+
+
+def get_dataset(distribution):
+  inputs = np.zeros((10, 3), dtype=np.float32)
+  targets = np.zeros((10, 4), dtype=np.float32)
+  dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+  dataset = dataset.repeat(100)
+  dataset = batch_wrapper(dataset, 10, distribution)
+  return dataset
+
+
+def get_predict_dataset(distribution):
+  inputs = np.zeros((10, 3), dtype=np.float32)
+  dataset = dataset_ops.Dataset.from_tensor_slices(inputs)
+  dataset = dataset.repeat(100)
+  dataset = batch_wrapper(dataset, 10, distribution)
+  return dataset
+
+
+def multi_input_output_model():
+  a = keras.layers.Input(shape=(3,), name='input_a')
+  b = keras.layers.Input(shape=(5,), name='input_b')
+  # TODO(anjalisridhar): Change the output dimension of the second Dense layer
+  # once the iterator output validation issue has been fixed.
+  dense_1 = keras.layers.Dense(7, name='dense_1')
+  dense_2 = keras.layers.Dense(7, name='dense_2')
+  c = dense_1(a)
+  d = dense_2(b)
+  e = keras.layers.Dropout(0.5, name='dropout')(c)
+  model = keras.models.Model([a, b], [d, e])
+  return model
+
+
+def get_correctness_test_inputs(use_numpy, use_validation_data,
+                                with_distribution,
+                                x_train, y_train, x_predict):
+  """Generates the inputs for correctness check when enable Keras with DS."""
+  training_epochs = 2
+  global_batch_size = 64
+  batch_size = global_batch_size
+  # TODO(b/118776054): Use global batch size for Keras/DS support.
+  use_per_core_batch_size = (
+      with_distribution and
+      not distributed_training_utils.global_batch_size_supported(
+          with_distribution))
+  if use_per_core_batch_size:
+    batch_size //= with_distribution.num_replicas_in_sync
+
+  if use_numpy:
+    training_inputs = {
+        'batch_size': batch_size,
+        'x': x_train,
+        'y': y_train,
+        'epochs': training_epochs,
+        'shuffle': False,
+    }
+
+    if use_validation_data:
+      eval_inputs = None
+      training_inputs['validation_data'] = (x_train, y_train)
+    else:
+      eval_inputs = {
+          'batch_size': batch_size,
+          'x': x_train,
+          'y': y_train,
+      }
+    predict_inputs = {
+        'x': np.array(x_predict, dtype=np.float32),
+    }
+  else:
+    # For dataset inputs, we do not pass batch_size to
+    # keras.fit/evaluate/predict. The batch size is part of the dataset.
+    train_dataset = dataset_ops.Dataset.from_tensor_slices(
+        (x_train, y_train))
+    x = batch_wrapper(
+        train_dataset, batch_size, with_distribution, repeat=training_epochs)
+
+    training_inputs = {
+        'batch_size': None,
+        'x': x,
+        'y': None,
+        'epochs': training_epochs,
+        'shuffle': False,
+        'steps_per_epoch': len(x_train) // global_batch_size,
+    }
+    if use_validation_data:
+      eval_inputs = None  # Remove the eval_inputs
+      eval_dataset = dataset_ops.Dataset.from_tensor_slices(
+          (x_train, y_train))
+      x = batch_wrapper(eval_dataset, batch_size, with_distribution)
+      training_inputs['validation_data'] = x
+      training_inputs['validation_steps'] = 5
+    else:
+      eval_inputs = {
+          'batch_size': None,
+          'x': x,
+          'y': None,
+          'steps': 20,
+      }
+
+    predict_batch_size = len(x_predict)
+    if use_per_core_batch_size:
+      predict_batch_size //= with_distribution.num_replicas_in_sync
+    predict_dataset = dataset_ops.Dataset.from_tensor_slices(x_predict)
+    predict_dataset = batch_wrapper(predict_dataset,
+                                    predict_batch_size, with_distribution)
+    predict_inputs = {
+        'steps': 1,
+        'x': predict_dataset,
+    }
+
+  return training_inputs, eval_inputs, predict_inputs
+
+
+strategies_minus_tpu = [
+    combinations.default_strategy,
+    combinations.one_device_strategy,
+    combinations.mirrored_strategy_with_gpu_and_cpu,
+    combinations.mirrored_strategy_with_two_gpus,
+    combinations.core_mirrored_strategy_with_gpu_and_cpu,
+    combinations.core_mirrored_strategy_with_two_gpus]
+
+tpu_strategies = [
+    combinations.tpu_strategy,  # steps_per_run=2
+    combinations.tpu_strategy_one_step]
+
+
+def strategy_minus_tpu_combinations():
+  return combinations.combine(
+      distribution=strategies_minus_tpu,
+      mode=['graph', 'eager'])
+
+
+def tpu_strategy_combinations():
+  return combinations.combine(
+      distribution=tpu_strategies,
+      mode=['graph'])
+
+
+def all_strategy_combinations():
+  return strategy_minus_tpu_combinations() + tpu_strategy_combinations()
+
+
+def strategy_and_optimizer_combinations():
+  return combinations.times(
+      all_strategy_combinations(),
+      combinations.combine(optimizer=[
+          combinations.adagrad_optimizer_v1_fn,
+          combinations.adagrad_optimizer_keras_v2_fn,
+          combinations.adam_optimizer_v1_fn,
+          combinations.adam_optimizer_keras_v2_fn,
+          combinations.gradient_descent_optimizer_v1_fn,
+          combinations.gradient_descent_optimizer_keras_v2_fn,
+          combinations.rmsprop_optimizer_v1_fn,
+          combinations.rmsprop_optimizer_keras_v2_fn
+      ]))
+
+
+def strategy_and_input_combinations():
+  return (
+      combinations.times(
+          combinations.combine(distribution=strategies_minus_tpu),
+          combinations.combine(mode=['graph'],
+                               use_numpy=[True, False],
+                               use_validation_data=[True, False])
+          + combinations.combine(mode=['eager'],
+                                 use_numpy=[False],
+                                 use_validation_data=[False])) +
+      combinations.times(
+          combinations.combine(distribution=tpu_strategies),
+          combinations.combine(mode=['graph'],
+                               use_numpy=[True, False],
+                               use_validation_data=[True, False])))
+
+
+def strategy_for_numpy_input_combinations():
+  return combinations.combine(
+      distribution=strategies_minus_tpu + tpu_strategies,
+      mode=['graph'])
+
+
+class TestDistributionStrategyWithNumpyArrays(test.TestCase,
+                                              parameterized.TestCase):
+
+  @combinations.generate(strategy_for_numpy_input_combinations())
+  def test_calling_model_with_numpy_arrays(self, distribution):
+    with self.cached_session():
+      model = get_model()
+
+      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+      loss = 'mse'
+      metrics = ['mae']
+      model.compile(optimizer, loss, metrics=metrics, distribute=distribution)
+
+      inputs = np.zeros((64, 3), dtype=np.float32)
+      targets = np.zeros((64, 4), dtype=np.float32)
+
+      # Call fit with validation data
+      model.fit(inputs, targets, epochs=1, batch_size=2, verbose=0,
+                validation_data=(inputs, targets))
+
+      # TODO(anjalisridhar): We need tests for when the batch size and steps are
+      # smaller and results in a 0 batch_size and steps value.
+      model.evaluate(inputs, targets)
+      # with steps
+      model.evaluate(inputs, targets, steps=2)
+      # with batch_size
+      model.evaluate(inputs, targets, batch_size=8)
+
+      model.predict(inputs)
+      # with steps
+      model.predict(inputs, steps=2)
+      # with batch_size
+      model.predict(inputs, batch_size=8)
+
+  @combinations.generate(strategy_for_numpy_input_combinations())
+  def test_calling_model_with_nested_numpy_arrays(self, distribution):
+    with self.cached_session():
+      model = multi_input_output_model()
+
+      optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.001)
+      loss = 'mse'
+      model.compile(optimizer, loss, distribute=distribution)
+
+      input_a_np = np.asarray(np.random.random((64, 3)), dtype=np.float32)
+      input_b_np = np.asarray(np.random.random((64, 5)), dtype=np.float32)
+      inputs = [input_a_np, input_b_np]
+
+      output_d_np = np.asarray(np.random.random((64, 7)), dtype=np.float32)
+      output_e_np = np.asarray(np.random.random((64, 7)), dtype=np.float32)
+      targets = [output_d_np, output_e_np]
+
+      # Call fit with validation data
+      model.fit(inputs, targets, epochs=1, batch_size=8, verbose=0)
+
+      # TODO(anjalisridhar): We need tests for when the batch size and steps are
+      # smaller and results in a 0 batch_size and steps value.
+      model.evaluate(inputs, targets)
+      # with steps
+      model.evaluate(inputs, targets, steps=2)
+      # with batch_size
+      model.evaluate(inputs, targets, batch_size=8)
+
+      model.predict(inputs)
+      # with steps
+      model.predict(inputs, steps=2)
+      # with batch_size
+      model.predict(inputs, batch_size=8)
+
+  @combinations.generate(combinations.combine(
+      distribution=strategies_minus_tpu, mode=['graph']))
+  def test_numpy_with_sample_weights(self, distribution):
+    model = get_model()
+    optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    model.compile(optimizer, loss, distribute=distribution)
+
+    inputs = np.zeros((20, 3), np.float32)
+    targets = np.zeros((20, 4), np.float32)
+    sample_weights = np.ones((20), np.float32)
+
+    model.fit(inputs, targets, sample_weight=sample_weights, epochs=1,
+              steps_per_epoch=2, verbose=1)
+
+  @combinations.generate(strategy_for_numpy_input_combinations())
+  def test_flatten_predict_outputs(self, distribution):
+    with self.cached_session():
+      model = multi_input_output_model()
+
+      optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.001)
+      loss = 'mse'
+      model.compile(optimizer, loss, distribute=distribution)
+
+      # We take 6 input samples with each input having a dimension of 3 or 5.
+      input_a_np = np.asarray(np.random.random((6, 3)), dtype=np.float32)
+      input_b_np = np.asarray(np.random.random((6, 5)), dtype=np.float32)
+      inputs = [input_a_np, input_b_np]
+
+      outs = model.predict(inputs, steps=1)
+      # `predict` a list that is equal in length to the number of model outputs.
+      # In this test our model has two outputs and each element of `outs`
+      # corresponds to all the samples of one of the model outputs.
+      self.assertLen(outs, 2)
+      # Each of the output samples have a dimension of 7. We should process all
+      # the available input samples(6).
+      self.assertAllEqual([6, 7], outs[0].shape)
+      self.assertAllEqual([6, 7], outs[1].shape)
+
+
+class TestDistributionStrategyWithDatasets(test.TestCase,
+                                           parameterized.TestCase):
+
+  @combinations.generate(all_strategy_combinations())
+  def test_calling_model_on_same_dataset(self, distribution):
+    with self.cached_session():
+      model = get_model()
+
+      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+      loss = 'mse'
+      metrics = ['mae', keras.metrics.CategoricalAccuracy()]
+      model.compile(optimizer, loss, metrics=metrics, distribute=distribution)
+
+      dataset = get_dataset(distribution)
+
+      # Call fit with validation data
+      model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
+                validation_data=dataset, validation_steps=2)
+      model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
+                validation_data=dataset, validation_steps=2)
+      model.predict(get_predict_dataset(distribution), steps=2)
+
+  @combinations.generate(all_strategy_combinations())
+  def test_model_interleaved_eval_same_as_direct_eval(self, distribution):
+    with self.cached_session():
+      user_controlled_model = get_model()
+      user_controlled_model.compile(
+          gradient_descent.GradientDescentOptimizer(0.001),
+          loss='mse',
+          metrics=['mae', keras.metrics.CategoricalAccuracy()],
+          distribute=distribution)
+
+      interleaved_model = get_model()
+      interleaved_model.set_weights(user_controlled_model.get_weights())
+      interleaved_model.compile(
+          gradient_descent.GradientDescentOptimizer(0.001),
+          loss='mse',
+          metrics=['mae', keras.metrics.CategoricalAccuracy()],
+          distribute=distribution)
+
+      dataset = get_dataset(distribution)
+
+      # Call fit with validation interleaved
+      interleaved_output = interleaved_model.fit(
+          dataset, epochs=2, steps_per_epoch=2, verbose=1,
+          validation_data=dataset, validation_steps=2, shuffle=False)
+
+      # Manually control the validation running after each epoch.
+      user_controlled_output = []
+      for _ in range(2):
+        user_controlled_model.fit(
+            dataset, epochs=1, steps_per_epoch=2, verbose=1, shuffle=False)
+        user_controlled_output.append(
+            user_controlled_model.evaluate(dataset, steps=2))
+
+      self.assertEqual(interleaved_output.history['val_loss'],
+                       [x[0] for x in user_controlled_output])
+      self.assertEqual(interleaved_output.history['val_mean_absolute_error'],
+                       [x[1] for x in user_controlled_output])
+      self.assertEqual(interleaved_output.history['val_categorical_accuracy'],
+                       [x[2] for x in user_controlled_output])
+
+  # TODO(priyag): Enable this test for TPU. Currently tuples/dict don't work
+  # as clone_model's input_tensors argument only seems to accept list and not
+  # tuples or dict.
+
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu],
+      mode=['graph', 'eager']))
+  def test_fit_with_tuple_and_dict_dataset_inputs(self, distribution):
+    with self.cached_session():
+      model = multi_input_output_model()
+
+      optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.001)
+      loss = 'mse'
+      metrics = ['mae', keras.metrics.CategoricalAccuracy()]
+      model.compile(optimizer, loss, metrics=metrics, distribute=distribution)
+
+      input_a_np = np.random.random((10, 3))
+      input_b_np = np.random.random((10, 5))
+      output_d_np = np.random.random((10, 7))
+      output_e_np = np.random.random((10, 7))
+
+      # Test with tuples
+      dataset_tuple = dataset_ops.Dataset.from_tensor_slices((
+          (input_a_np, input_b_np), (output_d_np, output_e_np)))
+      dataset_tuple = dataset_tuple.repeat(100)
+      dataset_tuple = dataset_tuple.batch(10)
+
+      model.fit(dataset_tuple, epochs=1, steps_per_epoch=2, verbose=1)
+
+      # Test with dict
+      dataset_dict = dataset_ops.Dataset.from_tensor_slices((
+          {'input_a': input_a_np, 'input_b': input_b_np},
+          (output_d_np, output_e_np)))
+      dataset_dict = dataset_dict.repeat(100)
+      dataset_dict = dataset_dict.batch(10)
+
+      model.fit(dataset_dict, epochs=1, steps_per_epoch=2, verbose=1)
+
+  @combinations.generate(all_strategy_combinations())
+  def test_fit_eval_and_predict_methods_on_dataset(self, distribution):
+    with self.cached_session():
+      model = get_model()
+
+      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+      loss = 'mse'
+      metrics = ['mae', keras.metrics.CategoricalAccuracy()]
+      model.compile(optimizer, loss, metrics=metrics, distribute=distribution)
+
+      dataset = get_dataset(distribution)
+
+      model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
+      model.evaluate(dataset, steps=2, verbose=1)
+      model.predict(get_predict_dataset(distribution), steps=2)
+
+  @combinations.generate(strategy_and_optimizer_combinations())
+  def test_fit_eval_and_predict_with_optimizer(self, distribution, optimizer):
+    with self.cached_session():
+      model = get_model()
+
+      loss = 'mse'
+      model.compile(optimizer(), loss, distribute=distribution)
+
+      dataset = get_dataset(distribution)
+
+      model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
+      model.evaluate(dataset, steps=2, verbose=1)
+      model.predict(get_predict_dataset(distribution), steps=2)
+
+  @combinations.generate(strategy_minus_tpu_combinations())
+  def test_dataset_with_sample_weights(self, distribution):
+    model = get_model()
+    optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    model.compile(optimizer, loss, distribute=distribution)
+
+    inputs = np.zeros((10, 3), np.float32)
+    targets = np.zeros((10, 4), np.float32)
+    sample_weights = np.ones((10), np.float32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets,
+                                                      sample_weights))
+    dataset = dataset.repeat()
+    dataset = dataset.batch(10)
+
+    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
+    model.evaluate(dataset, steps=2, verbose=1)
+    model.predict(dataset, steps=2)
+
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu],
+      mode=['graph', 'eager']))
+  # TODO(b/120943676, b/120957836): Re-enable once the validation code is
+  # restored.
+  def DISABLED_test_dataset_wrong_input_shape(self, distribution):
+    with self.cached_session():
+      model = get_model()
+
+      optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
+      loss = 'mse'
+      model.compile(optimizer, loss, distribute=distribution)
+
+      # Wrong input shape
+      inputs = np.zeros((10, 5), dtype=np.float32)
+      targets = np.zeros((10, 4), dtype=np.float32)
+      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+      dataset = dataset.repeat(100)
+      dataset = dataset.batch(10)
+
+      with self.assertRaisesRegexp(ValueError,
+                                   'expected input to have shape'):
+        model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0)
+
+  @combinations.generate(combinations.combine(
+      distribution=[combinations.mirrored_strategy_with_gpu_and_cpu],
+      mode=['graph', 'eager']))
+  # TODO(b/120943676, b/120957836): Re-enable once the validation code is
+  # restored.
+  def DISABLED_test_dataset_no_batch_input_validation(self, distribution):
+    with self.cached_session():
+      model = get_model()
+
+      optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
+      loss = 'mse'
+      model.compile(optimizer, loss, distribute=distribution)
+
+      # User forgets to batch the dataset
+      inputs = np.zeros((10, 3), dtype=np.float32)
+      targets = np.zeros((10, 4), dtype=np.float32)
+      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+      dataset = dataset.repeat(100)
+
+      with self.assertRaisesRegexp(ValueError, 'expected input to have shape'):
+        model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0)
+
+  @combinations.generate(combinations.combine(
+      distribution=[combinations.tpu_strategy_one_step],
+      mode=['graph']))
+  def test_dataset_input_shape_fully_defined(self, distribution):
+    with self.cached_session():
+      model = get_model()
+
+      optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
+      loss = 'mse'
+      model.compile(optimizer, loss, distribute=distribution)
+
+      dataset = get_dataset(distribution)
+      # Input shapes are not fully known. Batch dimension is unknown as we are
+      # not using the drop_remainder argument.
+      dataset = dataset.repeat(100).batch(10)
+
+      with self.assertRaisesRegexp(ValueError, 'requires fully defined shapes'):
+        model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0)
+
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.mirrored_strategy_with_two_gpus,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu,
+          combinations.core_mirrored_strategy_with_two_gpus],
+      mode=['graph', 'eager']))
+  def test_learning_phase_value(self, distribution):
+    # TODO(anjalisridhar): Modify this test to use Lambdas since we can compare
+    # meaningful values. Currently we don't pass the learning phase if the
+    # Lambda layer uses the learning phase.
+    with self.cached_session():
+      x = keras.layers.Input(shape=(1,), name='input')
+      y = keras.layers.Dense(1, kernel_initializer='ones')(x)
+      z = keras.layers.Dropout(0.9999)(y)
+      model = keras.Model(x, z)
+      initial_weights = model.get_weights()
+
+      optimizer = gradient_descent.GradientDescentOptimizer(0.005)
+      loss = 'mse'
+      metrics = ['acc']
+      model.compile(optimizer, loss, metrics=metrics, distribute=distribution)
+
+      batch_size = 8
+      if isinstance(distribution, mirrored_strategy.CoreMirroredStrategy):
+        # CoreMirroredStrategy uses global batch size.
+        batch_size = 8 * distribution.num_replicas_in_sync
+
+      inputs = np.ones((10, 1), dtype=np.float32)
+      targets = np.ones((10, 1), dtype=np.float32)
+      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+      dataset = dataset.repeat().batch(batch_size)
+      hist = model.fit(dataset, epochs=1, steps_per_epoch=20, verbose=1)
+      self.assertAlmostEqual(hist.history['acc'][0], 0, 0)
+
+      model.set_weights(initial_weights)
+      # TODO(psv/anjalisridhar): Enable these lines after we fix b/117431185.
+      # evaluate_output = model.evaluate(dataset, steps=20)
+      # self.assertAlmostEqual(evaluate_output[1], 1, 0)
+
+      inputs = np.ones((10, 1), dtype=np.float32)
+      predict_dataset = dataset_ops.Dataset.from_tensor_slices(inputs)
+
+      predict_dataset = predict_dataset.repeat().batch(batch_size)
+      output = model.predict(predict_dataset, steps=10)
+      # `predict` runs for 10 steps
+      ref_output = np.ones((160, 1), dtype=np.float32)
+      self.assertArrayNear(output, ref_output, 1e-1)
+
+  @combinations.generate(strategy_minus_tpu_combinations())
+  def testOptimizerWithCallbacks(self, distribution):
+    with self.cached_session():
+      model = get_model()
+
+      optimizer = gradient_descent_keras.SGD(0.01)
+      loss = 'mse'
+      model.compile(optimizer, loss, distribute=distribution)
+
+      dataset = get_dataset(distribution)
+
+      def schedule(_):
+        return 0.001
+
+      model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
+                callbacks=[keras.callbacks.LearningRateScheduler(schedule)])
+      grouped_models = distribution.unwrap(
+          distributed_training_utils.get_distributed_model(
+              model, ModeKeys.TRAIN))
+      with distribution.scope():
+        for m in grouped_models:
+          self.assertAllClose(0.001, keras.backend.get_value(
+              m.optimizer.lr), atol=1e-05, rtol=1e-05)
+
+
+class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase):
+
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu],
+      mode=['graph', 'eager']))
+  def test_unsupported_features(self, distribution):
+    with self.cached_session():
+      model = get_model()
+
+      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+      loss = 'mse'
+      metrics = ['mae']
+      model.compile(optimizer, loss, metrics=metrics, distribute=distribution)
+
+      dataset = get_dataset(distribution)
+
+      # Test with validation split
+      with self.assertRaisesRegexp(
+          ValueError, '`validation_split` argument is not '
+                      'supported when input `x` is a dataset or a '
+                      'dataset iterator.+'):
+        model.fit(dataset,
+                  epochs=1, steps_per_epoch=2, verbose=0,
+                  validation_split=0.5, validation_steps=2)
+
+      # Test with sample weight.
+      sample_weight = np.random.random((10,))
+      with self.assertRaisesRegexp(
+          ValueError, '`sample_weight` argument is not supported when input '
+                      '`x` is a dataset or a dataset iterator.'):
+        model.fit(
+            dataset,
+            epochs=1,
+            steps_per_epoch=2,
+            verbose=0,
+            sample_weight=sample_weight)
+
+      # Test with not specifying the `steps` argument for dataset with
+      # infinite cardinality.
+      dataset = dataset.repeat()
+      with self.assertRaisesRegexp(ValueError, 'When passing an infinitely '
+                                   'repeating dataset, you must specify the '
+                                   '`steps_per_epoch` argument'):
+        model.fit(dataset, epochs=1, verbose=0)
+      with self.assertRaisesRegexp(ValueError, 'When passing an infinitely '
+                                   'repeating dataset, you must specify the '
+                                   '`steps` argument'):
+        model.evaluate(dataset, verbose=0)
+
+      with self.assertRaisesRegexp(ValueError, 'When passing an infinitely '
+                                   'repeating dataset, you must specify the '
+                                   '`steps` argument'):
+        model.predict(dataset, verbose=0)
+
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu],
+      mode=['graph', 'eager']))
+  def test_calling_with_unsupported_predefined_callbacks(self, distribution):
+    with self.cached_session():
+      model = get_model()
+
+      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+      loss = 'mse'
+      metrics = ['mae']
+      model.compile(optimizer, loss, metrics=metrics, distribute=distribution)
+
+      dataset = get_dataset(distribution)
+
+      def schedule(_):
+        return 0.001
+      with self.assertRaisesRegexp(ValueError,
+                                   'You must specify a Keras Optimizer V2 when '
+                                   'using'):
+        model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
+                  callbacks=[keras.callbacks.LearningRateScheduler(schedule)])
+
+      with self.assertRaisesRegexp(ValueError,
+                                   'You must specify a Keras Optimizer V2 when '
+                                   'using'):
+        model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
+                  callbacks=[keras.callbacks.ReduceLROnPlateau()])
+
+
+class TestDistributionStrategyWithLossMasking(test.TestCase,
+                                              parameterized.TestCase):
+
+  # TODO(priyag): Enable all strategies for this test. Currently it does not
+  # work for TPU due to some invalid datatype.
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu],
+      mode=['graph', 'eager']))
+  def test_masking(self, distribution):
+    with self.cached_session():
+      np.random.seed(1337)
+      x = np.array([[[1], [1]], [[0], [0]]])
+      model = keras.models.Sequential()
+      model.add(keras.layers.Masking(mask_value=0, input_shape=(2, 1)))
+      model.add(
+          keras.layers.TimeDistributed(
+              keras.layers.Dense(1, kernel_initializer='one')))
+      model.compile(loss='mse',
+                    optimizer=gradient_descent.GradientDescentOptimizer(0.01),
+                    distribute=distribution)
+      y = np.array([[[1], [1]], [[1], [1]]])
+      dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
+      dataset = dataset.repeat(100)
+      dataset = dataset.batch(10)
+      hist = model.fit(x=dataset, epochs=1, steps_per_epoch=2)
+      self.assertEqual(hist.history['loss'][0], 0)
+
+
+class TestDistributionStrategyWithNormalizationLayer(
+    test.TestCase, parameterized.TestCase):
+
+  @combinations.generate(all_strategy_combinations())
+  def test_batchnorm_correctness(self, distribution):
+    with self.cached_session():
+      model = keras.models.Sequential()
+      norm = keras.layers.BatchNormalization(input_shape=(10,), momentum=0.8)
+      model.add(norm)
+      model.compile(loss='mse',
+                    optimizer=gradient_descent.GradientDescentOptimizer(0.01),
+                    distribute=distribution)
+
+      # centered on 5.0, variance 10.0
+      x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 10))
+      x = x.astype('float32')
+      dataset = dataset_ops.Dataset.from_tensor_slices((x, x))
+      dataset = dataset.repeat(100)
+      dataset = batch_wrapper(dataset, 32, distribution)
+
+      predict_dataset = dataset_ops.Dataset.from_tensor_slices(x)
+      predict_dataset = predict_dataset.repeat(100)
+      predict_dataset = batch_wrapper(predict_dataset, 32, distribution)
+
+      model.fit(dataset, epochs=4, verbose=0, steps_per_epoch=10)
+      out = model.predict(predict_dataset, steps=2)
+      out -= keras.backend.eval(norm.beta)
+      out /= keras.backend.eval(norm.gamma)
+      np.testing.assert_allclose(out.mean(), 0.0, atol=1e-1)
+      np.testing.assert_allclose(out.std(), 1.0, atol=1e-1)
+
+
+class TestDistributionStrategyCorrectness(test.TestCase,
+                                          parameterized.TestCase):
+
+  @combinations.generate(all_strategy_combinations())
+  def test_metric_correctness(self, distribution):
+    with self.cached_session():
+      keras.backend.set_image_data_format('channels_last')
+      num_samples = 10000
+
+      x_train = np.random.randint(0, 2, num_samples)
+      x_train = np.reshape(x_train, (num_samples, 1))
+      y_train = x_train
+      x_train = x_train.astype('float32')
+      y_train = y_train.astype('float32')
+
+      # Create identity model.
+      model = keras.Sequential()
+      model.add(
+          keras.layers.Dense(1, input_shape=(1,), kernel_initializer='ones'))
+      model.compile(
+          loss=keras.losses.mean_squared_error,
+          optimizer=gradient_descent.GradientDescentOptimizer(0.5),
+          metrics=[keras.metrics.BinaryAccuracy()],
+          distribute=distribution)
+
+      batch_size = 64
+      if not distributed_training_utils.global_batch_size_supported(
+          distribution):
+        batch_size //= distribution.num_replicas_in_sync
+      train_dataset = dataset_ops.Dataset.from_tensor_slices((x_train, y_train))
+      train_dataset = batch_wrapper(train_dataset, batch_size, distribution)
+
+      history = model.fit(x=train_dataset, epochs=2, steps_per_epoch=10)
+      self.assertEqual(history.history['binary_accuracy'], [1.0, 1.0])
+
+  @combinations.generate(all_strategy_combinations())
+  def test_eval_metrics_correctness(self, distribution):
+    with self.cached_session():
+      model = keras.Sequential()
+      model.add(
+          keras.layers.Dense(
+              3, activation='relu', input_dim=4, kernel_initializer='ones'))
+      model.add(
+          keras.layers.Dense(
+              1, activation='sigmoid', kernel_initializer='ones'))
+      model.compile(
+          loss='mae',
+          metrics=['accuracy', keras.metrics.BinaryAccuracy()],
+          optimizer=gradient_descent.GradientDescentOptimizer(0.001),
+          distribute=distribution)
+
+      # verify correctness of stateful and stateless metrics.
+      x = np.ones((100, 4)).astype('float32')
+      y = np.ones((100, 1)).astype('float32')
+      dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).repeat()
+      dataset = batch_wrapper(dataset, 4, distribution)
+      outs = model.evaluate(dataset, steps=10)
+      self.assertEqual(outs[1], 1.)
+      self.assertEqual(outs[2], 1.)
+
+      y = np.zeros((100, 1)).astype('float32')
+      dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).repeat()
+      dataset = batch_wrapper(dataset, 4, distribution)
+      outs = model.evaluate(dataset, steps=10)
+      self.assertEqual(outs[1], 0.)
+      self.assertEqual(outs[2], 0.)
+
+  @combinations.generate(strategy_and_input_combinations())
+  def test_correctness(self, distribution, use_numpy, use_validation_data):
+    with self.cached_session():
+      default_tolerance = 1e-5
+      tol_table = {}
+
+      if isinstance(distribution, (
+          mirrored_strategy.MirroredStrategy,
+          mirrored_strategy.CoreMirroredStrategy,
+          distribute_lib._DefaultDistributionStrategy)):  # pylint: disable=protected-access
+        # TODO(b/119257215): Weights are not exactly the same, so use larger
+        # tolerance for now. Predict should be related to weights.
+        tol_table = {
+            'weights_1': 1e-4,
+            'weights_2': 1e-4,
+            'predict_result_1': 1e-4,
+        }
+
+      keras.backend.set_image_data_format('channels_last')
+      np.random.seed(_RANDOM_SEED)
+      random_seed.set_random_seed(_RANDOM_SEED)
+
+      # Train, eval, and predict datasets are created with the same input numpy
+      # arrays.
+      # TODO(xiejw): Change this back to 10000, once we support final partial
+      # batch.
+      num_samples = 9984
+      x_train = np.random.rand(num_samples, 1)
+      y_train = 3 * x_train
+      x_train = x_train.astype('float32')
+      y_train = y_train.astype('float32')
+      x_predict = [[1.], [2.], [3.], [4.]]
+
+      # The model is built once and the initial weights are saved.
+      # This is used to initialize the model for both the distribution and
+      # non-distribution run. In addition, we add few non-linear layers to make
+      # it non-trivial.
+      def _create_model():
+        model = keras.Sequential()
+        model.add(keras.layers.Dense(10, activation='relu', input_shape=(1,)))
+        model.add(keras.layers.Dense(10, activation='relu'))
+        model.add(keras.layers.Dense(10, activation='relu'))
+        model.add(keras.layers.Dense(1))
+        return model
+
+      model = _create_model()
+      initial_weights = model.get_weights()
+      del model  # avoid accident usage.
+
+      def fit_eval_and_predict(with_distribution=None):
+        model = _create_model()
+        # We have initialized the model to the same weight for the distribution
+        # and non-distribution run.
+        model.set_weights(initial_weights)
+        model.compile(
+            loss=keras.losses.mean_squared_error,
+            optimizer=gradient_descent_keras.SGD(0.5),
+            metrics=['mse'],
+            distribute=with_distribution)
+
+        training_inputs, eval_inputs, predict_inputs = (
+            get_correctness_test_inputs(use_numpy, use_validation_data,
+                                        with_distribution,
+                                        x_train, y_train, x_predict))
+
+        result = {}
+        result['training_history_1'] = model.fit(**training_inputs).history
+
+        if eval_inputs is not None:
+          result['eval_result_1'] = model.evaluate(**eval_inputs)
+
+        result['weights_1'] = model.get_weights()
+        result['predict_result_1'] = model.predict(**predict_inputs)
+
+        # Train and eval again to mimic user's flow.
+
+        result['training_history_2'] = model.fit(**training_inputs).history
+
+        if eval_inputs is not None:
+          result['eval_result_2'] = model.evaluate(**eval_inputs)
+
+        result['weights_2'] = model.get_weights()
+
+        return result
+
+      results_with_ds = fit_eval_and_predict(with_distribution=distribution)
+      results_without_ds = fit_eval_and_predict(with_distribution=None)
+
+      # Verify that the weights, training history, eval results, predict outputs
+      # are the same within some limits of tolerance.
+      for key in results_with_ds:
+        if (key.startswith('training_history') and
+            isinstance(distribution, tpu_strategy.TPUStrategy) and
+            distribution.extended.steps_per_run > 1):
+          # TODO(b/119894254): Enable this test for all cases once the
+          # underlying bug is fixed.
+          continue
+
+        tolerance = tol_table.get(key, default_tolerance)
+
+        self.assertAllClose(
+            results_with_ds[key],
+            results_without_ds[key],
+            atol=tolerance,
+            rtol=tolerance,
+            msg='Fail to assert {}.'.format(key))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/keras_correctness_test_base.py b/tensorflow/contrib/distribute/python/keras_correctness_test_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..0fb7a18c40484ce01a5acfd6b191de464cfd9840
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/keras_correctness_test_base.py
@@ -0,0 +1,487 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Correctness tests for tf.keras using DistributionStrategy."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+import six
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python import mirrored_strategy
+from tensorflow.contrib.distribute.python import tpu_strategy
+from tensorflow.python import keras
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.eager import context
+from tensorflow.python.eager import test
+from tensorflow.python.framework import random_seed
+from tensorflow.python.keras.engine import distributed_training_utils
+
+_RANDOM_SEED = 1337
+_EVAL_STEPS = 20
+_GLOBAL_BATCH_SIZE = 64
+
+# Note: Please make sure the tests in this file are also covered in
+# keras_backward_compat_test for features that are supported with both APIs.
+
+
+all_strategies = [
+    combinations.default_strategy,
+    combinations.one_device_strategy,
+    combinations.mirrored_strategy_with_gpu_and_cpu,
+    combinations.mirrored_strategy_with_two_gpus,
+    combinations.core_mirrored_strategy_with_gpu_and_cpu,
+    combinations.core_mirrored_strategy_with_two_gpus,
+    combinations.tpu_strategy,  # steps_per_run=2
+    combinations.tpu_strategy_one_step,
+]
+
+
+def eager_mode_test_configuration():
+  return combinations.combine(mode='eager',
+                              use_numpy=False,
+                              use_validation_data=False)
+
+
+def graph_mode_test_configuration():
+  return combinations.combine(mode='graph',
+                              use_numpy=[True, False],
+                              use_validation_data=[True, False])
+
+
+def all_strategy_and_input_config_combinations():
+  return (
+      combinations.times(
+          combinations.combine(distribution=all_strategies),
+          eager_mode_test_configuration() + graph_mode_test_configuration()))
+
+
+def strategies_for_embedding_models():
+  """Returns distribution strategies to test for embedding models.
+
+  Since embedding models take longer to train, we disregard OneDeviceStrategy
+  and DefaultStrategy in order to prevent testing timeouts.
+  """
+
+  return [s for s in all_strategies if s.required_tpu or s.required_gpus]
+
+
+def test_combinations_for_embedding_model():
+  return (
+      combinations.times(
+          combinations.combine(distribution=
+                               strategies_for_embedding_models()),
+          (graph_mode_test_configuration() +
+           eager_mode_test_configuration())))
+
+
+def test_combinations_with_tpu_strategies():
+  tpu_strategies = [combinations.tpu_strategy,
+                    combinations.tpu_strategy_one_step]
+
+  return (
+      combinations.times(
+          combinations.combine(distribution=tpu_strategies),
+          graph_mode_test_configuration()))
+
+
+class MaybeDistributionScope(object):
+  """Provides a context allowing no distribution strategy."""
+
+  def __init__(self, distribution):
+    self._distribution = distribution
+    self._scope = None
+
+  def __enter__(self):
+    if self._distribution:
+      self._scope = self._distribution.scope()
+      self._scope.__enter__()
+
+  def __exit__(self, exc_type, value, traceback):
+    if self._distribution:
+      self._scope.__exit__(exc_type, value, traceback)
+      self._scope = None
+
+
+def batch_wrapper(dataset, batch_size, distribution, repeat=None):
+  if repeat:
+    dataset = dataset.repeat(repeat)
+  # TPUs currently require fully defined input shapes, drop_remainder ensures
+  # the input will have fully defined shapes.
+  if isinstance(distribution, tpu_strategy.TPUStrategy):
+    return dataset.batch(batch_size, drop_remainder=True)
+  else:
+    return dataset.batch(batch_size)
+
+
+def get_batch_size(global_batch_size, distribution):
+  batch_size = global_batch_size
+  # TODO(b/118776054): Use global batch size for Keras/DS support.
+  use_per_core_batch_size = (
+      distribution and
+      not distributed_training_utils.global_batch_size_supported(distribution))
+  if use_per_core_batch_size:
+    batch_size //= distribution.num_replicas_in_sync
+  return batch_size
+
+
+def get_data_size(data):
+  """Gets the size of data in list, tuple, dict, or a numpy array."""
+  assert isinstance(data, (np.ndarray, list, dict, tuple))
+
+  if isinstance(data, np.ndarray):
+    return len(data)
+
+  if isinstance(data, (list, tuple)):
+    return len(data[0])
+
+  return len(six.next(six.itervalues(data)))
+
+
+def get_correctness_test_inputs(use_numpy, use_validation_data,
+                                with_distribution, x_train, y_train, x_predict):
+  """Generates the inputs for correctness check when enable Keras with DS."""
+  training_epochs = 2
+  global_batch_size = _GLOBAL_BATCH_SIZE
+  batch_size = get_batch_size(global_batch_size, with_distribution)
+
+  if use_numpy:
+    training_inputs = {
+        'batch_size': batch_size,
+        'x': x_train,
+        'y': y_train,
+        'epochs': training_epochs,
+        'shuffle': False,
+    }
+
+    if use_validation_data:
+      eval_inputs = None
+      training_inputs['validation_data'] = (x_train, y_train)
+    else:
+      eval_inputs = {
+          'batch_size': batch_size,
+          'x': x_train,
+          'y': y_train,
+      }
+    predict_inputs = {
+        'x': x_predict
+    }
+  else:
+    training_data_size = get_data_size(x_train)
+    if training_data_size < _GLOBAL_BATCH_SIZE * _EVAL_STEPS:
+      # Currently, we cannot detect the size of a dataset. So, the eval steps is
+      # hard coded.
+      raise ValueError('x_train must have at least '
+                       '_GLOBAL_BATCH_SIZE * _EVAL_STEPS samples')
+    # For dataset inputs, we do not pass batch_size to
+    # keras.fit/evaluate/predict. The batch size is part of the dataset.
+    train_dataset = dataset_ops.Dataset.from_tensor_slices((x_train, y_train))
+    x = batch_wrapper(train_dataset, batch_size, with_distribution,
+                      repeat=training_epochs)
+
+    training_inputs = {
+        'batch_size': None,
+        'x': x,
+        'y': None,
+        'epochs': training_epochs,
+        'shuffle': False,
+        'steps_per_epoch': training_data_size // global_batch_size,
+    }
+    if use_validation_data:
+      eval_inputs = None  # Remove the eval_inputs
+      eval_dataset = dataset_ops.Dataset.from_tensor_slices((x_train, y_train))
+      x = batch_wrapper(eval_dataset, batch_size, with_distribution)
+      training_inputs['validation_data'] = x
+      training_inputs['validation_steps'] = 5
+    else:
+      eval_inputs = {
+          'batch_size': None,
+          'x': x,
+          'y': None,
+          'steps': _EVAL_STEPS,
+      }
+
+    predict_batch_size = get_batch_size(get_data_size(x_predict),
+                                        with_distribution)
+    predict_dataset = dataset_ops.Dataset.from_tensor_slices(x_predict)
+    predict_dataset = batch_wrapper(predict_dataset, predict_batch_size,
+                                    with_distribution)
+    predict_inputs = {
+        'steps': 1,
+        'x': predict_dataset,
+    }
+
+  return training_inputs, eval_inputs, predict_inputs
+
+
+def fit_eval_and_predict(initial_weights, input_fn, model_fn,
+                         distribution=None, is_stateful_model=False):
+  """Generates results for fit/predict/evaluate for given model."""
+  model = model_fn(initial_weights=initial_weights, distribution=distribution)
+  training_inputs, eval_inputs, predict_inputs = input_fn(distribution)
+
+  result = {}
+  result['training_history_1'] = model.fit(**training_inputs).history
+
+  if eval_inputs is not None:
+    result['eval_result_1'] = model.evaluate(**eval_inputs)
+
+  result['weights_1'] = model.get_weights()
+
+  if predict_inputs is not None:
+    # Check correctness of the result of predict() invoked
+    # multiple times -- as for stateful models, result of
+    # predict may differ for each batch.
+    predict_length = 1
+    if is_stateful_model:
+      predict_length = 3
+    for i in range(predict_length):
+      result_key = 'predict_result_{}'.format(i)
+      result[result_key] = model.predict(**predict_inputs)
+
+  # Train and eval again to mimic user's flow.
+
+  result['training_history_2'] = model.fit(**training_inputs).history
+
+  if eval_inputs is not None:
+    result['eval_result_2'] = model.evaluate(**eval_inputs)
+
+  result['weights_2'] = model.get_weights()
+
+  return result
+
+
+def compare_results(results_with_ds, results_without_ds, distribution,
+                    testcase):
+  """Compares results of model compiled with/without distribution strategy."""
+
+  default_tolerance = 1e-5
+  relaxed_tolerance = 1e-4
+
+  def _get_compare_result_tolerance(key):
+    """Returns tolerance to compare results."""
+    # TODO(b/119257215): For MirroredStrategy, weights are not exactly the same,
+    # so use larger tolerance for now. Predict should be related to weights.
+    if (isinstance(distribution, (
+        mirrored_strategy.MirroredStrategy,
+        mirrored_strategy.CoreMirroredStrategy,
+        distribute_lib._DefaultDistributionStrategy)) and  # pylint: disable=protected-access
+        key.startswith(('weights_1', 'weights_2', 'predict_result'))):
+      return relaxed_tolerance
+
+    return default_tolerance
+
+  for key in results_with_ds:
+    if (key.startswith('training_history') and
+        isinstance(distribution, tpu_strategy.TPUStrategy) and
+        distribution.extended.steps_per_run > 1):
+      # TODO(b/119894254): Enable this test for all cases once the
+      # underlying bug is fixed.
+      continue
+
+    tolerance = _get_compare_result_tolerance(key)
+    testcase.assertAllClose(
+        results_with_ds[key],
+        results_without_ds[key],
+        atol=tolerance,
+        rtol=tolerance,
+        msg='Fail to assert {}.'.format(key))
+
+
+def should_skip_tpu_with_eager(distribution):
+  return (context.executing_eagerly() and
+          isinstance(distribution, tpu_strategy.TPUStrategy))
+
+
+class LearningRateBatchScheduler(keras.callbacks.Callback):
+  """Scheduler that dynamically sets the learning rate of model."""
+
+  def __init__(self, update_freq=None):
+    self._update_freq = update_freq
+
+  def on_batch_begin(self, batch, logs=None):
+    if self._update_freq and batch % self._update_freq != 0:
+      return
+
+    # To avoid divergence, limit the value range.
+    lr = 0.001 * (batch % 10)
+    keras.backend.set_value(self.model.optimizer.lr, lr)
+
+
+class TestDistributionStrategyCorrectnessBase(test.TestCase,
+                                              parameterized.TestCase):
+  """Model agnostic testing infra to test correctness of Keras models."""
+
+  def set_up_test_config(self, use_numpy=False,
+                         use_validation_data=False,
+                         with_batch_norm=False):
+    self.use_numpy = use_numpy
+    self.use_validation_data = use_validation_data
+    self.with_batch_norm = with_batch_norm
+
+    keras.backend.set_image_data_format('channels_last')
+    np.random.seed(_RANDOM_SEED)
+    random_seed.set_random_seed(_RANDOM_SEED)
+
+  def get_data(self):
+    num_samples = 10000
+    x_train = np.random.randint(0, 2, num_samples)
+    x_train = np.reshape(x_train, (num_samples, 1))
+    y_train = x_train
+    return (x_train.astype('float32'), y_train.astype('float32'), None)
+
+  def get_model(self, distribution=None):
+    raise NotImplementedError
+
+  def skip_unsupported_test_configuration(self, distribution):
+    if should_skip_tpu_with_eager(distribution):
+      self.skipTest('TPUStrategy does not support eager mode now.')
+
+    if context.executing_eagerly() and self.use_numpy:
+      self.skipTest('Numpy as inputs is not supported with strategy in eager.')
+
+    if context.executing_eagerly() and self.use_validation_data:
+      self.skipTest('TODO(hongjunchoi): Add test logic for using validation '
+                    'data for eager execution.')
+    return
+
+  def run_correctness_test(self,
+                           distribution,
+                           use_numpy,
+                           use_validation_data,
+                           with_batch_norm=False,
+                           is_stateful_model=False):
+    with self.cached_session():
+      self.set_up_test_config(use_numpy, use_validation_data, with_batch_norm)
+      self.skip_unsupported_test_configuration(distribution)
+
+      # Train, eval, and predict datasets are created with the same input numpy
+      # arrays.
+      x_train, y_train, x_predict = self.get_data()
+
+      # The model is built once and the initial weights are saved.
+      # This is used to initialize the model for both the distribution and
+      # non-distribution run.
+      model = self.get_model()
+      initial_weights = model.get_weights()
+
+      def input_fn(dist):
+        return get_correctness_test_inputs(
+            use_numpy, use_validation_data, dist, x_train, y_train, x_predict)
+
+      results_with_ds = fit_eval_and_predict(
+          initial_weights, input_fn=input_fn, model_fn=self.get_model,
+          distribution=distribution, is_stateful_model=is_stateful_model)
+      results_without_ds = fit_eval_and_predict(
+          initial_weights, input_fn=input_fn, model_fn=self.get_model,
+          distribution=None, is_stateful_model=is_stateful_model)
+
+      # First, special case, for multi-replica distributed training, batch norm
+      # is not aggregated globally. So it is expected to have different weights.
+      if (self.with_batch_norm and
+          distribution.num_replicas_in_sync > 1):
+        with self.assertRaises(AssertionError):
+          compare_results(results_with_ds, results_without_ds, distribution,
+                          testcase=self)
+      else:
+        compare_results(results_with_ds, results_without_ds, distribution,
+                        testcase=self)
+
+  def run_dynamic_lr_test(self, distribution):
+    with self.cached_session():
+      self.set_up_test_config()
+      self.skip_unsupported_test_configuration(distribution)
+
+      x_train, y_train, _ = self.get_data()
+      model = self.get_model()
+      initial_weights = model.get_weights()
+      update_freq = None
+
+      if (isinstance(distribution, tpu_strategy.TPUStrategy) and
+          distribution.extended.steps_per_run > 1):
+        # For TPUStrategy with steps_per_run > 1, the callback is not invoked
+        # every step. So, to compare the CPU/TPU, we let the CPU to behave the
+        # same as TPU.
+        update_freq = distribution.extended.steps_per_run
+
+      def input_fn(dist):
+        """Generates training test given test configuration."""
+        training_epochs = 2
+        global_batch_size = 64
+        batch_size = get_batch_size(global_batch_size, dist)
+
+        training_inputs = {
+            'batch_size': batch_size,
+            'x': x_train,
+            'y': y_train,
+            'epochs': training_epochs,
+            'shuffle': False,
+            'callbacks': [LearningRateBatchScheduler(update_freq)],
+            'validation_data': (x_train, y_train)
+        }
+        # In this test case, we do not care eval and predict.
+        eval_inputs, predict_inputs = None, None
+        return training_inputs, eval_inputs, predict_inputs
+
+      results_with_ds = fit_eval_and_predict(
+          initial_weights, input_fn=input_fn, model_fn=self.get_model,
+          distribution=distribution)
+      results_without_ds = fit_eval_and_predict(
+          initial_weights, input_fn=input_fn, model_fn=self.get_model,
+          distribution=None)
+      compare_results(results_with_ds, results_without_ds, distribution,
+                      testcase=self)
+
+
+class TestDistributionStrategyEmbeddingModelCorrectnessBase(
+    TestDistributionStrategyCorrectnessBase):
+  """Base class to test correctness of Keras models with embedding layers."""
+
+  def get_data(self,
+               count=(_GLOBAL_BATCH_SIZE * _EVAL_STEPS),
+               min_words=5,
+               max_words=10,
+               max_word_id=19,
+               num_classes=2):
+    distribution = []
+    for _ in range(num_classes):
+      dist = np.abs(np.random.randn(max_word_id))
+      dist /= np.sum(dist)
+      distribution.append(dist)
+
+    features = []
+    labels = []
+    for _ in range(count):
+      label = np.random.randint(0, num_classes, size=1)[0]
+      num_words = np.random.randint(min_words, max_words, size=1)[0]
+      word_ids = np.random.choice(
+          max_word_id, size=num_words, replace=True, p=distribution[label])
+      word_ids = word_ids
+      labels.append(label)
+      features.append(word_ids)
+
+    features = keras.preprocessing.sequence.pad_sequences(
+        features, maxlen=max_words)
+    x_train = np.asarray(features, dtype=np.float32)
+    y_train = np.asarray(labels, dtype=np.int32).reshape((count, 1))
+    x_predict = x_train[:_GLOBAL_BATCH_SIZE]
+    return x_train, y_train, x_predict
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/keras_dnn_correctness_test.py b/tensorflow/contrib/distribute/python/keras_dnn_correctness_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..61202e30c4f33892d2675080fae07cc4d7102337
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/keras_dnn_correctness_test.py
@@ -0,0 +1,173 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Correctness tests for tf.keras DNN model using DistributionStrategy."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python import keras_correctness_test_base
+from tensorflow.python import keras
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import test
+from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_descent_keras
+from tensorflow.python.training import gradient_descent
+
+
+def all_strategy_combinations_with_eager_and_graph_modes():
+  return combinations.combine(distribution=keras_correctness_test_base.
+                              all_strategies,
+                              mode=['graph', 'eager'])
+
+
+def all_strategy_combinations_with_graph_mode():
+  return combinations.combine(distribution=keras_correctness_test_base.
+                              all_strategies, mode=['graph'])
+
+
+class TestDistributionStrategyDnnCorrectness(
+    keras_correctness_test_base.TestDistributionStrategyCorrectnessBase):
+
+  def get_model(self, initial_weights=None, distribution=None):
+    with keras_correctness_test_base.MaybeDistributionScope(distribution):
+      # We add few non-linear layers to make it non-trivial.
+      model = keras.Sequential()
+      model.add(keras.layers.Dense(10, activation='relu', input_shape=(1,)))
+      model.add(keras.layers.Dense(
+          10, activation='relu',
+          kernel_regularizer=keras.regularizers.l2(1e-4)))
+      model.add(keras.layers.Dense(10, activation='relu'))
+      model.add(keras.layers.Dense(1))
+
+      if initial_weights:
+        model.set_weights(initial_weights)
+
+      model.compile(
+          loss=keras.losses.mean_squared_error,
+          optimizer=gradient_descent_keras.SGD(0.5),
+          metrics=['mse'])
+      return model
+
+  def get_data(self):
+    # TODO(xiejw): Change this back to 10000, once we support final partial
+    # batch.
+    num_samples = 9984
+    x_train = np.random.rand(num_samples, 1)
+    y_train = 3 * x_train
+    x_train = x_train.astype('float32')
+    y_train = y_train.astype('float32')
+    x_predict = np.array([[1.], [2.], [3.], [4.]], dtype=np.float32)
+    return x_train, y_train, x_predict
+
+  @combinations.generate(keras_correctness_test_base.
+                         all_strategy_and_input_config_combinations())
+  def test_dnn_correctness(self, distribution, use_numpy, use_validation_data):
+    self.run_correctness_test(distribution, use_numpy, use_validation_data)
+
+  @combinations.generate(all_strategy_combinations_with_graph_mode())
+  def test_dnn_with_dynamic_learning_rate(self, distribution):
+    self.run_dynamic_lr_test(distribution)
+
+
+class TestDistributionStrategyDnnMetricCorrectness(
+    keras_correctness_test_base.TestDistributionStrategyCorrectnessBase):
+
+  def get_model(self, distribution=None):
+    with distribution.scope():
+      model = keras.Sequential()
+      model.add(keras.layers.Dense(1,
+                                   input_shape=(1,),
+                                   kernel_initializer='ones'))
+      model.compile(
+          loss=keras.losses.mean_squared_error,
+          optimizer=gradient_descent.GradientDescentOptimizer(0.5),
+          metrics=[keras.metrics.BinaryAccuracy()])
+    return model
+
+  def run_metric_correctness_test(self, distribution):
+    with self.cached_session():
+      self.set_up_test_config()
+      self.skip_unsupported_test_configuration(distribution)
+
+      x_train, y_train, _ = self.get_data()
+      model = self.get_model(distribution=distribution)
+
+      batch_size = 64
+      batch_size = (keras_correctness_test_base.
+                    get_batch_size(batch_size, distribution))
+      train_dataset = dataset_ops.Dataset.from_tensor_slices((x_train, y_train))
+      train_dataset = (keras_correctness_test_base.
+                       batch_wrapper(train_dataset, batch_size, distribution))
+
+      history = model.fit(x=train_dataset, epochs=2, steps_per_epoch=10)
+      self.assertEqual(history.history['binary_accuracy'], [1.0, 1.0])
+
+  @combinations.generate(all_strategy_combinations_with_eager_and_graph_modes())
+  def test_simple_dnn_metric_correctness(self, distribution):
+    self.run_metric_correctness_test(distribution)
+
+
+class TestDistributionStrategyDnnMetricEvalCorrectness(
+    keras_correctness_test_base.TestDistributionStrategyCorrectnessBase):
+
+  def get_model(self, distribution=None):
+    with distribution.scope():
+      model = keras.Sequential()
+      model.add(
+          keras.layers.Dense(
+              3, activation='relu', input_dim=4, kernel_initializer='ones'))
+      model.add(
+          keras.layers.Dense(
+              1, activation='sigmoid', kernel_initializer='ones'))
+      model.compile(
+          loss='mae',
+          metrics=['accuracy', keras.metrics.BinaryAccuracy()],
+          optimizer=gradient_descent.GradientDescentOptimizer(0.001))
+    return model
+
+  def run_eval_metrics_correctness_test(self, distribution):
+    with self.cached_session():
+      self.set_up_test_config()
+      self.skip_unsupported_test_configuration(distribution)
+
+      model = self.get_model(distribution=distribution)
+
+      # verify correctness of stateful and stateless metrics.
+      x = np.ones((100, 4)).astype('float32')
+      y = np.ones((100, 1)).astype('float32')
+      dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).repeat()
+      dataset = (keras_correctness_test_base.
+                 batch_wrapper(dataset, 4, distribution))
+      outs = model.evaluate(dataset, steps=10)
+      self.assertEqual(outs[1], 1.)
+      self.assertEqual(outs[2], 1.)
+
+      y = np.zeros((100, 1)).astype('float32')
+      dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).repeat()
+      dataset = (keras_correctness_test_base.
+                 batch_wrapper(dataset, 4, distribution))
+      outs = model.evaluate(dataset, steps=10)
+      self.assertEqual(outs[1], 0.)
+      self.assertEqual(outs[2], 0.)
+
+  @combinations.generate(all_strategy_combinations_with_eager_and_graph_modes())
+  def test_identity_model_metric_eval_correctness(self, distribution):
+    self.run_eval_metrics_correctness_test(distribution)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/keras_embedding_model_correctness_test.py b/tensorflow/contrib/distribute/python/keras_embedding_model_correctness_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e881bb70ecc428e3f972cde5f19c1b61b1dc0f0b
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/keras_embedding_model_correctness_test.py
@@ -0,0 +1,150 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Correctness test for tf.keras Embedding models using DistributionStrategy."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python import keras_correctness_test_base
+from tensorflow.python import keras
+from tensorflow.python.eager import test
+from tensorflow.python.training import gradient_descent
+
+
+class DistributionStrategyEmbeddingModelCorrectnessTest(
+    keras_correctness_test_base.
+    TestDistributionStrategyEmbeddingModelCorrectnessBase):
+
+  def get_model(self, max_words=10, initial_weights=None, distribution=None):
+    with keras_correctness_test_base.MaybeDistributionScope(distribution):
+      word_ids = keras.layers.Input(
+          shape=(max_words,), dtype=np.int32, name='words')
+      word_embed = keras.layers.Embedding(input_dim=20,
+                                          output_dim=10)(word_ids)
+      if self.use_distributed_dense:
+        word_embed = keras.layers.TimeDistributed(keras.layers.Dense(4))(
+            word_embed)
+      avg = keras.layers.GlobalAveragePooling1D()(word_embed)
+      preds = keras.layers.Dense(2, activation='softmax')(avg)
+      model = keras.Model(inputs=[word_ids], outputs=[preds])
+
+      if initial_weights:
+        model.set_weights(initial_weights)
+
+      model.compile(
+          optimizer=gradient_descent.GradientDescentOptimizer(
+              learning_rate=0.1),
+          loss='sparse_categorical_crossentropy',
+          metrics=['sparse_categorical_accuracy'])
+    return model
+
+  @combinations.generate(keras_correctness_test_base.
+                         test_combinations_for_embedding_model())
+  def test_embedding_model_correctness(self, distribution, use_numpy,
+                                       use_validation_data):
+
+    self.use_distributed_dense = False
+    self.run_correctness_test(distribution, use_numpy, use_validation_data)
+
+  @combinations.generate(keras_correctness_test_base.
+                         test_combinations_for_embedding_model())
+  def test_embedding_time_distributed_model_correctness(self,
+                                                        distribution,
+                                                        use_numpy,
+                                                        use_validation_data):
+    self.use_distributed_dense = True
+    self.run_correctness_test(distribution, use_numpy, use_validation_data)
+
+
+class DistributionStrategySiameseEmbeddingModelCorrectnessTest(
+    keras_correctness_test_base.
+    TestDistributionStrategyEmbeddingModelCorrectnessBase):
+
+  def get_model(self, max_words=10, initial_weights=None, distribution=None):
+    with keras_correctness_test_base.MaybeDistributionScope(distribution):
+      word_ids_a = keras.layers.Input(
+          shape=(max_words,), dtype=np.int32, name='words_a')
+      word_ids_b = keras.layers.Input(
+          shape=(max_words,), dtype=np.int32, name='words_b')
+
+      def submodel(embedding, word_ids):
+        word_embed = embedding(word_ids)
+        rep = keras.layers.GlobalAveragePooling1D()(word_embed)
+        return keras.Model(inputs=[word_ids], outputs=[rep])
+
+      word_embed = keras.layers.Embedding(
+          input_dim=20,
+          output_dim=10,
+          input_length=max_words,
+          embeddings_initializer=keras.initializers.RandomUniform(0, 1))
+
+      a_rep = submodel(word_embed, word_ids_a).outputs[0]
+      b_rep = submodel(word_embed, word_ids_b).outputs[0]
+      sim = keras.layers.Dot(axes=1, normalize=True)([a_rep, b_rep])
+
+      model = keras.Model(inputs=[word_ids_a, word_ids_b], outputs=[sim])
+
+      if initial_weights:
+        model.set_weights(initial_weights)
+
+      model.compile(
+          optimizer=gradient_descent.GradientDescentOptimizer(
+              learning_rate=0.1),
+          loss='mse',
+          metrics=['mse'])
+    return model
+
+  def get_data(self,
+               count=(keras_correctness_test_base._GLOBAL_BATCH_SIZE *
+                      keras_correctness_test_base._EVAL_STEPS),
+               min_words=5,
+               max_words=10,
+               max_word_id=19,
+               num_classes=2):
+    features_a, labels_a, _ = (super(
+        DistributionStrategySiameseEmbeddingModelCorrectnessTest, self).
+                               get_data(count, min_words, max_words,
+                                        max_word_id, num_classes))
+
+    features_b, labels_b, _ = (super(
+        DistributionStrategySiameseEmbeddingModelCorrectnessTest, self).
+                               get_data(count, min_words, max_words,
+                                        max_word_id, num_classes))
+
+    y_train = np.zeros((count, 1), dtype=np.float32)
+    y_train[labels_a == labels_b] = 1.0
+    y_train[labels_a != labels_b] = -1.0
+    # TODO(b/123360757): Add tests for using list as inputs for multi-input
+    # models.
+    x_train = {
+        'words_a': features_a,
+        'words_b': features_b,
+    }
+    x_predict = x_train
+
+    return x_train, y_train, x_predict
+
+  @combinations.generate(keras_correctness_test_base.
+                         test_combinations_for_embedding_model())
+  def test_siamese_embedding_model_correctness(self, distribution, use_numpy,
+                                               use_validation_data):
+    self.run_correctness_test(distribution, use_numpy, use_validation_data)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/keras_image_model_correctness_test.py b/tensorflow/contrib/distribute/python/keras_image_model_correctness_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c2961456b2eede9570ce29f7a8900834f2ccfb7
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/keras_image_model_correctness_test.py
@@ -0,0 +1,93 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Correctness tests for tf.keras CNN models using DistributionStrategy."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python import keras_correctness_test_base
+from tensorflow.python import keras
+from tensorflow.python.eager import test
+from tensorflow.python.keras.optimizer_v2 import gradient_descent
+
+
+class DistributionStrategyCnnCorrectnessTest(
+    keras_correctness_test_base.TestDistributionStrategyCorrectnessBase):
+
+  def get_model(self, initial_weights=None, distribution=None):
+    with keras_correctness_test_base.MaybeDistributionScope(distribution):
+      image = keras.layers.Input(shape=(28, 28, 3), name='image')
+      c1 = keras.layers.Conv2D(
+          name='conv1', filters=16, kernel_size=(3, 3), strides=(4, 4),
+          kernel_regularizer=keras.regularizers.l2(1e-4))(
+              image)
+      if self.with_batch_norm:
+        c1 = keras.layers.BatchNormalization(name='bn1')(c1)
+      c1 = keras.layers.MaxPooling2D(pool_size=(2, 2))(c1)
+      logits = keras.layers.Dense(
+          10, activation='softmax', name='pred')(
+              keras.layers.Flatten()(c1))
+      model = keras.Model(inputs=[image], outputs=[logits])
+
+      if initial_weights:
+        model.set_weights(initial_weights)
+
+      model.compile(
+          optimizer=gradient_descent.SGD(
+              learning_rate=0.1),
+          loss='sparse_categorical_crossentropy',
+          metrics=['sparse_categorical_accuracy'])
+
+    return model
+
+  def get_data(self,
+               count=keras_correctness_test_base._GLOBAL_BATCH_SIZE
+               * keras_correctness_test_base._EVAL_STEPS,
+               shape=(28, 28, 3),
+               num_classes=10):
+    centers = np.random.randn(num_classes, *shape)
+
+    features = []
+    labels = []
+    for _ in range(count):
+      label = np.random.randint(0, num_classes, size=1)[0]
+      offset = np.random.normal(loc=0, scale=0.1, size=np.prod(shape))
+      offset = offset.reshape(shape)
+      labels.append(label)
+      features.append(centers[label] + offset)
+
+    x_train = np.asarray(features, dtype=np.float32)
+    y_train = np.asarray(labels, dtype=np.float32).reshape((count, 1))
+    x_predict = x_train
+    return x_train, y_train, x_predict
+
+  @combinations.generate(keras_correctness_test_base.
+                         all_strategy_and_input_config_combinations())
+  def test_cnn_correctness(self, distribution, use_numpy, use_validation_data):
+    self.run_correctness_test(distribution, use_numpy, use_validation_data)
+
+  @combinations.generate(keras_correctness_test_base.
+                         all_strategy_and_input_config_combinations())
+  def test_cnn_with_batch_norm_correctness(self, distribution, use_numpy,
+                                           use_validation_data):
+    self.run_correctness_test(distribution, use_numpy, use_validation_data,
+                              with_batch_norm=True)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/keras_lstm_model_correctness_test.py b/tensorflow/contrib/distribute/python/keras_lstm_model_correctness_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ed2dfa206cdf4be24a88b1d54090487c1873399
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/keras_lstm_model_correctness_test.py
@@ -0,0 +1,65 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Correctness tests for tf.keras LSTM model using DistributionStrategy."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python import keras_correctness_test_base
+from tensorflow.python import keras
+from tensorflow.python.eager import test
+from tensorflow.python.training import gradient_descent
+
+
+class DistributionStrategyLstmModelCorrectnessTest(
+    keras_correctness_test_base.
+    TestDistributionStrategyEmbeddingModelCorrectnessBase):
+
+  def get_model(self, max_words=10, initial_weights=None, distribution=None):
+    with keras_correctness_test_base.MaybeDistributionScope(distribution):
+      word_ids = keras.layers.Input(
+          shape=(max_words,), dtype=np.int32, name='words')
+      word_embed = keras.layers.Embedding(input_dim=20,
+                                          output_dim=10)(word_ids)
+      lstm_embed = keras.layers.LSTM(units=4,
+                                     return_sequences=False)(word_embed)
+
+      preds = keras.layers.Dense(2, activation='softmax')(lstm_embed)
+      model = keras.Model(inputs=[word_ids], outputs=[preds])
+
+      if initial_weights:
+        model.set_weights(initial_weights)
+
+      model.compile(
+          optimizer=gradient_descent.GradientDescentOptimizer(
+              learning_rate=0.1),
+          loss='sparse_categorical_crossentropy',
+          metrics=['sparse_categorical_accuracy'])
+    return model
+
+  @combinations.generate(keras_correctness_test_base.
+                         test_combinations_for_embedding_model())
+  def test_lstm_model_correctness(self,
+                                  distribution,
+                                  use_numpy,
+                                  use_validation_data):
+    self.run_correctness_test(distribution, use_numpy, use_validation_data)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py b/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py
index 6dfd85bcc4f3784e2744fd876a7190cc9581d96a..c93d7afa7ceef2c9c272e91997e2871655cea079 100644
--- a/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py
+++ b/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py
@@ -18,24 +18,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import shutil
-import tempfile
 from absl.testing import parameterized
 import numpy as np
-import six
 
 from tensorflow.contrib.distribute.python import combinations
-from tensorflow.core.protobuf import config_pb2
 from tensorflow.python import keras
-from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.distribute import distribution_strategy_context as ds_context
-from tensorflow.python.estimator import run_config
-from tensorflow.python.estimator import training
-from tensorflow.python.estimator.canned import dnn_linear_combined
-from tensorflow.python.estimator.canned import prediction_keys
-from tensorflow.python.estimator.export import export
-from tensorflow.python.estimator.inputs import numpy_io
-from tensorflow.python.feature_column import feature_column_lib as feature_column
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -44,103 +33,7 @@ from tensorflow.python.keras.optimizer_v2 import gradient_descent
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
-from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
-from tensorflow.python.summary.writer import writer_cache
-
-
-class KerasOptimizerV2IntegrationTest(test.TestCase, parameterized.TestCase):
-
-  def setUp(self):
-    self._model_dir = tempfile.mkdtemp()
-
-  def dataset_input_fn(self, x, y, batch_size):
-
-    def input_fn():
-      dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
-      dataset = dataset.repeat(1).batch(batch_size)
-      return dataset
-
-    return input_fn
-
-  @combinations.generate(
-      combinations.combine(
-          mode=['graph'],
-          distribution=[
-              combinations.one_device_strategy,
-              combinations.mirrored_strategy_with_gpu_and_cpu,
-              combinations.mirrored_strategy_with_two_gpus,
-              combinations.core_mirrored_strategy_with_gpu_and_cpu,
-              combinations.core_mirrored_strategy_with_two_gpus
-          ],
-          use_train_and_evaluate=[True, False]))
-  def test_complete_flow_with_mode(self, distribution, use_train_and_evaluate):
-    label_dimension = 2
-    input_dimension = label_dimension
-    batch_size = 10
-    data = np.linspace(0., 2., batch_size * label_dimension, dtype=np.float32)
-    data = data.reshape(batch_size, label_dimension)
-    train_input_fn = self.dataset_input_fn(
-        x={'x': data},
-        y=data,
-        batch_size=batch_size // distribution.num_replicas_in_sync)
-    eval_input_fn = self.dataset_input_fn(
-        x={'x': data},
-        y=data,
-        batch_size=batch_size // distribution.num_replicas_in_sync)
-    predict_input_fn = numpy_io.numpy_input_fn(
-        x={'x': data}, batch_size=batch_size, shuffle=False)
-
-    linear_feature_columns = [
-        feature_column.numeric_column('x', shape=(input_dimension,))
-    ]
-    dnn_feature_columns = [
-        feature_column.numeric_column('x', shape=(input_dimension,))
-    ]
-    feature_columns = linear_feature_columns + dnn_feature_columns
-    session_config = config_pb2.ConfigProto(
-        log_device_placement=True, allow_soft_placement=True)
-    estimator = dnn_linear_combined.DNNLinearCombinedRegressor(
-        linear_feature_columns=linear_feature_columns,
-        dnn_hidden_units=(2, 2),
-        dnn_feature_columns=dnn_feature_columns,
-        label_dimension=label_dimension,
-        model_dir=self._model_dir,
-        dnn_optimizer=adam.Adam(0.001),
-        linear_optimizer=adam.Adam(0.001),
-        config=run_config.RunConfig(
-            train_distribute=distribution,
-            eval_distribute=distribution,
-            session_config=session_config))
-
-    num_steps = 2
-    if use_train_and_evaluate:
-      scores, _ = training.train_and_evaluate(
-          estimator, training.TrainSpec(train_input_fn, max_steps=num_steps),
-          training.EvalSpec(eval_input_fn))
-    else:
-      estimator.train(train_input_fn, steps=num_steps)
-      scores = estimator.evaluate(eval_input_fn)
-
-    self.assertIn('loss', six.iterkeys(scores))
-
-    predictions = np.array([
-        x[prediction_keys.PredictionKeys.PREDICTIONS]
-        for x in estimator.predict(predict_input_fn)
-    ])
-    self.assertAllEqual((batch_size, label_dimension), predictions.shape)
-
-    feature_spec = feature_column.make_parse_example_spec(feature_columns)
-    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
-        feature_spec)
-    export_dir = estimator.export_savedmodel(tempfile.mkdtemp(),
-                                             serving_input_receiver_fn)
-    self.assertTrue(gfile.Exists(export_dir))
-
-  def tearDown(self):
-    if self._model_dir:
-      writer_cache.FileWriterCache.clear()
-      shutil.rmtree(self._model_dir)
 
 
 def get_model():
@@ -152,113 +45,80 @@ def get_model():
 
 class MirroredStrategyOptimizerV2Test(test.TestCase, parameterized.TestCase):
 
-  @combinations.generate(combinations.combine(
-      distribution=[
-          combinations.mirrored_strategy_with_gpu_and_cpu,
-          combinations.core_mirrored_strategy_with_gpu_and_cpu],
-      mode=['graph']))
+  @combinations.generate(
+      combinations.combine(
+          distribution=[
+              combinations.core_mirrored_strategy_with_gpu_and_cpu,
+              combinations.core_mirrored_strategy_with_two_gpus,
+              combinations.parameter_server_strategy_with_two_gpus,
+          ],
+          mode=['graph', 'eager']))
   def testKerasOptimizerWithUnequalInput(self, distribution):
-    def create_fn():
+    with distribution.scope():
       var = variables.Variable(
           2.0, name='var', aggregation=variable_scope.VariableAggregation.SUM)
-      # grad for cpu is 1, grad for gpu is 2, avg grad is 1.5.
-      loss = math_ops.cast(_replica_id() + 1, dtype=dtypes.float32) * var
       optimizer = adam.Adam(learning_rate=0.01, beta_1=0.2, beta_2=0.2)
-      train_op = optimizer.minimize(loss, var_list=[var])
-      m = optimizer.get_slot(var, 'm')
-      v = optimizer.get_slot(var, 'v')
-      return (var, m, v, train_op, optimizer.iterations)
+      all_vars = []
 
-    devices = ['/device:GPU:0', '/device:CPU:0']
-    with distribution.scope():
-      (var, m, v, op, counter) = distribution.call_for_each_replica(create_fn)
+      def model_fn():
+
+        def loss_fn():
+          replica_id = _replica_id()
+          return math_ops.cast(replica_id + 1, dtype=dtypes.float32) * 0.5 * var
+
+        train_op = optimizer.minimize(loss_fn, var_list=[var])
+
+        return train_op, optimizer
+
+      def train_fn():
+        train_op, optimizer = distribution.extended.call_for_each_replica(
+            model_fn)
+        if not all_vars:
+          all_vars.append(var)
+          all_vars.append(optimizer.get_slot(var, 'm'))
+          all_vars.append(optimizer.get_slot(var, 'v'))
+        return distribution.group(train_op)
+
+      if not context.executing_eagerly():
+        with self.cached_session() as sess:
+          train_fn = sess.make_callable(train_fn())
       self.evaluate(variables.global_variables_initializer())
-      var_val = [2.0, 2.0, 2.0]
-      self.assertAllClose(
-          var_val,
-          self.evaluate(
-              [distribution.read_var(var),
-               var.get(devices[0]),
-               var.get(devices[1])]))
-      self.assertAllClose([0, 0, 0],
-                          self.evaluate([
-                              distribution.read_var(counter),
-                              counter.get(devices[0]),
-                              counter.get(devices[1])
-                          ]))
 
-      train_op = distribution.unwrap(op)
-      self.evaluate(train_op)
-      # m(1) = beta1 * m(0) + (1-beta1) * grad = 0.2 * 0 + 0.8 * (1 + 2) / 2
-      m_val = [1.2, 1.2, 1.2]
-      # assert slot variables in both replicas are the same.
-      self.assertAllClose(
-          m_val,
-          self.evaluate(
-              [distribution.read_var(m),
-               m.get(devices[0]),
-               m.get(devices[1])]))
-      # v(1) = beta2 * v(0) + (1-beta2) * grad^2 = 0.2 * 0 + 0.8 * 2.25
-      v_val = [1.8, 1.8, 1.8]
-      self.assertAllClose(
-          v_val,
-          self.evaluate(
-              [distribution.read_var(v),
-               v.get(devices[0]),
-               v.get(devices[1])]))
+      # first step.
+      train_fn()
       # var(1) = var(0) - lr * m(1) * sqrt(1 - beta2) / sqrt(v(1)) / (1 - beta1)
       #        = 2.0 - 0.01 * 1.2 * sqrt(0.8) / sqrt(1.8) / 0.8
-      var_val = [1.99, 1.99, 1.99]
-      self.assertAllClose(
-          var_val,
-          self.evaluate(
-              [distribution.read_var(var),
-               var.get(devices[0]),
-               var.get(devices[1])]))
-      self.assertAllClose([1, 1, 1],
-                          self.evaluate([
-                              distribution.read_var(counter),
-                              counter.get(devices[0]),
-                              counter.get(devices[1])
-                          ]))
+      self.assertAllClose(1.99, self.evaluate(all_vars[0]))
+      # m(1) = beta1 * m(0) + (1-beta1) * grad = 0.2 * 0 + 0.8 * (1 + 2) / 2
+      self.assertAllClose(1.2, self.evaluate(all_vars[1]))
+      # v(1) = beta2 * v(0) + (1-beta2) * grad^2 = 0.2 * 0 + 0.8 * 2.25
+      self.assertAllClose(1.8, self.evaluate(all_vars[2]))
 
-      self.evaluate(train_op)
+      # second step.
+      train_fn()
+      # var(1) = var(0) - lr * 2 = 1.98
+      self.assertAllClose(1.98, self.evaluate(all_vars[0]))
       # m(2) = beta1 * m(1) + (1-beta1) * grad = 0.2 * 1.2 + 0.8 * 1.5
-      m_val = [1.44, 1.44, 1.44]
-      self.assertAllClose(
-          m_val,
-          self.evaluate(
-              [distribution.read_var(m),
-               m.get(devices[0]),
-               m.get(devices[1])]))
+      self.assertAllClose(1.44, self.evaluate(all_vars[1]))
       # v(2) = beta2 * v(1) + (1-beta2) * grad^2 = 0.2 * 1.8 + 0.8 * 2.25
-      v_val = [2.16, 2.16, 2.16]
-      self.assertAllClose(
-          v_val,
-          self.evaluate(
-              [distribution.read_var(v),
-               v.get(devices[0]),
-               v.get(devices[1])]))
-      self.assertAllClose([2, 2, 2],
-                          self.evaluate([
-                              distribution.read_var(counter),
-                              counter.get(devices[0]),
-                              counter.get(devices[1])
-                          ]))
+      self.assertAllClose(2.16, self.evaluate(all_vars[2]))
 
-  @combinations.generate(combinations.combine(
-      distribution=[
-          combinations.mirrored_strategy_with_gpu_and_cpu,
-          combinations.core_mirrored_strategy_with_gpu_and_cpu],
-      mode=['graph']))
+  @combinations.generate(
+      combinations.combine(
+          distribution=[
+              combinations.core_mirrored_strategy_with_gpu_and_cpu,
+              combinations.parameter_server_strategy_with_two_gpus,
+          ],
+          mode=['graph', 'eager']))
   def testOptimizerWithKerasModelAndNumpyArrays(self, distribution):
 
     with self.cached_session():
-      model = get_model()
-      optimizer = gradient_descent.SGD(0.001)
-      loss = 'mse'
-      metrics = ['mae']
-      model.compile(optimizer, loss, metrics=metrics, distribute=distribution)
+      with distribution.scope():
+        model = get_model()
+        optimizer = gradient_descent.SGD(0.001)
+        loss = 'mse'
+        metrics = ['mae']
+        model.compile(optimizer, loss, metrics=metrics)
 
       inputs = np.zeros((64, 3), dtype=np.float32)
       targets = np.zeros((64, 4), dtype=np.float32)
diff --git a/tensorflow/contrib/distribute/python/keras_stateful_lstm_model_correctness_test.py b/tensorflow/contrib/distribute/python/keras_stateful_lstm_model_correctness_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5faf6c36b880a72bafc8d082cff2816f3b11a76
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/keras_stateful_lstm_model_correctness_test.py
@@ -0,0 +1,99 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for stateful tf.keras LSTM models using DistributionStrategy."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python import keras_correctness_test_base
+from tensorflow.python import keras
+from tensorflow.python.eager import test
+from tensorflow.python.training import gradient_descent
+
+
+def strategies_for_stateful_embedding_model():
+  """Returns TPUStrategy with single core device assignment."""
+
+  return [combinations.tpu_strategy_one_core,
+          combinations.tpu_strategy_one_step_one_core]
+
+
+def test_combinations_for_stateful_embedding_model():
+  return (
+      combinations.combine(
+          distribution=strategies_for_stateful_embedding_model(),
+          mode='graph',
+          use_numpy=False,
+          use_validation_data=False
+      ))
+
+
+class DistributionStrategyStatefulLstmModelCorrectnessTest(
+    keras_correctness_test_base.
+    TestDistributionStrategyEmbeddingModelCorrectnessBase):
+
+  def get_model(self, max_words=10, initial_weights=None, distribution=None):
+    batch_size = keras_correctness_test_base._GLOBAL_BATCH_SIZE
+
+    with keras_correctness_test_base.MaybeDistributionScope(distribution):
+      word_ids = keras.layers.Input(
+          shape=(max_words,),
+          batch_size=batch_size,
+          dtype=np.int32, name='words')
+      word_embed = keras.layers.Embedding(input_dim=20,
+                                          output_dim=10)(word_ids)
+      lstm_embed = keras.layers.LSTM(units=4,
+                                     return_sequences=False,
+                                     stateful=True)(word_embed)
+
+      preds = keras.layers.Dense(2, activation='softmax')(lstm_embed)
+      model = keras.Model(inputs=[word_ids], outputs=[preds])
+
+      if initial_weights:
+        model.set_weights(initial_weights)
+
+      model.compile(
+          optimizer=gradient_descent.GradientDescentOptimizer(
+              learning_rate=0.1),
+          loss='sparse_categorical_crossentropy',
+          metrics=['sparse_categorical_accuracy'])
+    return model
+
+  @combinations.generate(test_combinations_for_stateful_embedding_model())
+  def test_stateful_lstm_model_correctness(self,
+                                           distribution,
+                                           use_numpy,
+                                           use_validation_data):
+    self.run_correctness_test(distribution, use_numpy, use_validation_data,
+                              is_stateful_model=True)
+
+  @combinations.generate(keras_correctness_test_base.
+                         test_combinations_with_tpu_strategies())
+  def test_incorrectly_use_multiple_cores_for_stateful_lstm_model(
+      self, distribution, use_numpy, use_validation_data):
+    with self.assertRaisesRegexp(ValueError,
+                                 'Single core must be used for computation '
+                                 'on stateful models. Consider adding '
+                                 '`device_assignment` parameter to '
+                                 'TPUStrategy'):
+      self.run_correctness_test(distribution, use_numpy, use_validation_data,
+                                is_stateful_model=True)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/keras_test.py b/tensorflow/contrib/distribute/python/keras_test.py
index 683cc89bfbae9c877ea6794d311ffc00c96c6937..77e241974f7c4c27382ab548a202891fdbbc6ba0 100644
--- a/tensorflow/contrib/distribute/python/keras_test.py
+++ b/tensorflow/contrib/distribute/python/keras_test.py
@@ -25,18 +25,17 @@ from tensorflow.contrib.distribute.python import combinations
 from tensorflow.contrib.distribute.python import mirrored_strategy
 from tensorflow.contrib.distribute.python import tpu_strategy
 from tensorflow.python import keras
+from tensorflow.python.data.experimental.ops import cardinality
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.distribute import values
 from tensorflow.python.eager import test
 from tensorflow.python.estimator import keras as keras_lib
 from tensorflow.python.estimator import run_config as run_config_lib
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.engine import distributed_training_utils
 from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_descent_keras
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.parsing_ops import gen_parsing_ops
 from tensorflow.python.platform import gfile
 from tensorflow.python.summary.writer import writer_cache
@@ -48,6 +47,9 @@ _TRAIN_SIZE = 200
 _INPUT_SIZE = (10,)
 _NUM_CLASS = 2
 
+# Note: Please make sure the tests in this file are also covered in
+# keras_backward_compat_test for features that are supported with both APIs.
+
 
 # TODO(anjalisridhar): Add a decorator that will allow us to run these tests as
 # part of the tf.keras unit tests suite.
@@ -68,6 +70,32 @@ def simple_functional_model():
   return model
 
 
+def simple_subclassed_model(num_labels=_NUM_CLASS):
+
+  class _SimpleMLP(keras.Model):
+
+    def __init__(self, num_labels):
+      super(_SimpleMLP, self).__init__()
+      self.dense = keras.layers.Dense(num_labels)
+
+    def call(self, inputs):
+      return self.dense(inputs)
+
+  return _SimpleMLP(num_labels)
+
+
+def simple_multi_inputs_multi_outputs_model():
+  input_a = keras.layers.Input(shape=(16,), name='input_a')
+  input_b = keras.layers.Input(shape=(16,), name='input_b')
+
+  merged = keras.layers.concatenate([input_a, input_b], name='merge')
+  output_c = keras.layers.Dense(3, activation='softmax', name='dense_2')(merged)
+  output_d = keras.layers.Dense(2, activation='softmax', name='dense_3')(merged)
+  model = keras.models.Model(
+      inputs=[input_a, input_b], outputs=[output_c, output_d])
+  return model
+
+
 def multi_inputs_multi_outputs_model():
   input_a = keras.layers.Input(shape=(16,), name='input_a')
   input_b = keras.layers.Input(shape=(16,), name='input_b')
@@ -200,6 +228,22 @@ def get_predict_dataset(distribution):
   return dataset
 
 
+def convert_numpy_to_dataset_with_unknown_cardinality(inputs,
+                                                      targets=None):
+  if targets is not None:
+    input_slices = (inputs, targets)
+    dummy_op = (lambda inp, target: True)
+  else:
+    input_slices = inputs
+    dummy_op = (lambda inp: True)
+
+  original_dataset = (dataset_ops.Dataset.from_tensor_slices(
+      input_slices))
+  ds_with_unknown_cardinality = (original_dataset.filter(dummy_op).
+                                 batch(10, drop_remainder=True))
+  return ds_with_unknown_cardinality
+
+
 def multi_input_output_model():
   a = keras.layers.Input(shape=(3,), name='input_a')
   b = keras.layers.Input(shape=(5,), name='input_b')
@@ -214,90 +258,12 @@ def multi_input_output_model():
   return model
 
 
-def get_correctness_test_inputs(use_numpy, use_validation_data,
-                                with_distribution,
-                                x_train, y_train, x_predict):
-  """Generates the inputs for correctness check when enable Keras with DS."""
-  training_epochs = 2
-  global_batch_size = 64
-  batch_size = global_batch_size
-  # TODO(b/118776054): Use global batch size for Keras/DS support.
-  use_per_core_batch_size = (
-      with_distribution and
-      not distributed_training_utils.global_batch_size_supported(
-          with_distribution))
-  if use_per_core_batch_size:
-    batch_size //= with_distribution.num_replicas_in_sync
-
-  if use_numpy:
-    training_inputs = {
-        'batch_size': batch_size,
-        'x': x_train,
-        'y': y_train,
-        'epochs': training_epochs,
-        'shuffle': False,
-    }
-
-    if use_validation_data:
-      eval_inputs = None
-      training_inputs['validation_data'] = (x_train, y_train)
-    else:
-      eval_inputs = {
-          'batch_size': batch_size,
-          'x': x_train,
-          'y': y_train,
-      }
-    predict_inputs = {
-        'x': np.array(x_predict, dtype=np.float32),
-    }
-  else:
-    # For dataset inputs, we do not pass batch_size to
-    # keras.fit/evaluate/predict. The batch size is part of the dataset.
-    train_dataset = dataset_ops.Dataset.from_tensor_slices(
-        (x_train, y_train))
-    x = batch_wrapper(
-        train_dataset, batch_size, with_distribution, repeat=training_epochs)
-
-    training_inputs = {
-        'batch_size': None,
-        'x': x,
-        'y': None,
-        'epochs': training_epochs,
-        'shuffle': False,
-        'steps_per_epoch': len(x_train) // global_batch_size,
-    }
-    if use_validation_data:
-      eval_inputs = None  # Remove the eval_inputs
-      eval_dataset = dataset_ops.Dataset.from_tensor_slices(
-          (x_train, y_train))
-      x = batch_wrapper(eval_dataset, batch_size, with_distribution)
-      training_inputs['validation_data'] = x
-      training_inputs['validation_steps'] = 5
-    else:
-      eval_inputs = {
-          'batch_size': None,
-          'x': x,
-          'y': None,
-          'steps': 20,
-      }
-
-    predict_batch_size = len(x_predict)
-    if use_per_core_batch_size:
-      predict_batch_size //= with_distribution.num_replicas_in_sync
-    predict_dataset = dataset_ops.Dataset.from_tensor_slices(x_predict)
-    predict_dataset = batch_wrapper(predict_dataset,
-                                    predict_batch_size, with_distribution)
-    predict_inputs = {
-        'steps': 1,
-        'x': predict_dataset,
-    }
-
-  return training_inputs, eval_inputs, predict_inputs
-
-
+# TODO(josh11b): Add combinations.one_device_strategy_gpu once it works with
+# TestDistributionStrategyWithCallbacks.test_callbacks_in_predict.
 strategies_minus_tpu = [
     combinations.default_strategy,
     combinations.one_device_strategy,
+    combinations.one_device_strategy_gpu,
     combinations.mirrored_strategy_with_gpu_and_cpu,
     combinations.mirrored_strategy_with_two_gpus,
     combinations.core_mirrored_strategy_with_gpu_and_cpu,
@@ -309,53 +275,45 @@ tpu_strategies = [
 
 
 def strategy_minus_tpu_combinations():
-  return combinations.combine(
-      distribution=strategies_minus_tpu,
-      mode=['graph', 'eager'])
+  return combinations.combine(distribution=strategies_minus_tpu,
+                              mode=['graph', 'eager'])
 
 
 def tpu_strategy_combinations():
-  return combinations.combine(
-      distribution=tpu_strategies,
-      mode=['graph'])
+  return combinations.combine(distribution=tpu_strategies,
+                              mode=['graph'])
 
 
 def all_strategy_combinations():
   return strategy_minus_tpu_combinations() + tpu_strategy_combinations()
 
 
-# TODO(priyag): Add v2 optimizers here.
+def all_strategy_combinations_minus_default():
+  strategy_minus_default_combinations = combinations.combine(
+      distribution=[
+          combinations.one_device_strategy,
+          combinations.one_device_strategy_gpu,
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.mirrored_strategy_with_two_gpus,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu,
+          combinations.core_mirrored_strategy_with_two_gpus],
+      mode=['graph', 'eager'])
+  return strategy_minus_default_combinations + tpu_strategy_combinations()
+
+
 def strategy_and_optimizer_combinations():
   return combinations.times(
       all_strategy_combinations(),
-      combinations.combine(
-          optimizer=[combinations.adagrad_optimizer_v1_fn,
-                     combinations.adam_optimizer_v1_fn,
-                     combinations.gradient_descent_optimizer_v1_fn,
-                     combinations.rmsprop_optimizer_v1_fn]))
-
-
-def strategy_and_input_combinations():
-  return (
-      combinations.times(
-          combinations.combine(distribution=strategies_minus_tpu),
-          combinations.combine(mode=['graph'],
-                               use_numpy=[True, False],
-                               use_validation_data=[True, False])
-          + combinations.combine(mode=['eager'],
-                                 use_numpy=[False],
-                                 use_validation_data=[False])) +
-      combinations.times(
-          combinations.combine(distribution=tpu_strategies),
-          combinations.combine(mode=['graph'],
-                               use_numpy=[True, False],
-                               use_validation_data=[True, False])))
-
-
-def strategy_for_numpy_input_combinations():
-  return combinations.combine(
-      distribution=strategies_minus_tpu + tpu_strategies,
-      mode=['graph'])
+      combinations.combine(optimizer=[
+          combinations.adagrad_optimizer_v1_fn,
+          combinations.adagrad_optimizer_keras_v2_fn,
+          combinations.adam_optimizer_v1_fn,
+          combinations.adam_optimizer_keras_v2_fn,
+          combinations.gradient_descent_optimizer_v1_fn,
+          combinations.gradient_descent_optimizer_keras_v2_fn,
+          combinations.rmsprop_optimizer_v1_fn,
+          combinations.rmsprop_optimizer_keras_v2_fn
+      ]))
 
 
 class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase,
@@ -375,7 +333,9 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase,
 
   @combinations.generate(combinations.combine(
       distribution=[
+          combinations.mirrored_strategy_with_gpu_and_cpu,
           combinations.mirrored_strategy_with_two_gpus,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu,
           combinations.core_mirrored_strategy_with_two_gpus],
       mode=['graph']))
   def test_train_functional_with_distribution_strategy(self, distribution):
@@ -403,7 +363,9 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase,
 
   @combinations.generate(combinations.combine(
       distribution=[
+          combinations.mirrored_strategy_with_gpu_and_cpu,
           combinations.mirrored_strategy_with_two_gpus,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu,
           combinations.core_mirrored_strategy_with_two_gpus],
       mode=['graph']))
   def test_train_sequential_with_distribution_strategy(self, distribution):
@@ -430,8 +392,8 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase,
 
   @combinations.generate(combinations.combine(
       distribution=[
-          combinations.mirrored_strategy_with_two_gpus,
-          combinations.core_mirrored_strategy_with_two_gpus],
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu],
       mode=['graph']))
   def test_multi_inputs_multi_outputs_with_input_fn_as_dict(self, distribution):
     train_data, test_data = get_multi_inputs_multi_outputs_data()
@@ -482,8 +444,8 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase,
 
   @combinations.generate(combinations.combine(
       distribution=[
-          combinations.mirrored_strategy_with_two_gpus,
-          combinations.core_mirrored_strategy_with_two_gpus],
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu],
       mode=['graph']))
   def test_keras_optimizer_with_distribution_strategy(self, distribution):
     keras_model = simple_sequential_model()
@@ -509,16 +471,7 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase,
 class TestDistributionStrategyWithNumpyArrays(test.TestCase,
                                               parameterized.TestCase):
 
-  @combinations.generate(strategy_for_numpy_input_combinations())
-  def test_creating_var_with_numpy_arrays(self, distribution):
-    with self.cached_session():
-      x = np.asarray(np.random.random((64, 3)), dtype=np.float32)
-      var_x = distributed_training_utils.get_var_for_numpy(distribution, x)
-      val = self.evaluate(var_x.value())
-      # Verify that the numpy value is copied to the variable.
-      self.assertAllEqual(x, val)
-
-  @combinations.generate(strategy_for_numpy_input_combinations())
+  @combinations.generate(all_strategy_combinations())
   def test_calculating_input_params_no_steps_no_batch_size(self, distribution):
     # Calculate the per_replica_batch_size scaling factor for strategies
     # that use per_core_batch_size
@@ -549,7 +502,7 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
         distributed_training_utils.get_input_params(
             distribution, input_63_samples, steps=None, batch_size=None)
 
-  @combinations.generate(strategy_for_numpy_input_combinations())
+  @combinations.generate(all_strategy_combinations())
   def test_calculating_input_params_with_steps_no_batch_size(self,
                                                              distribution):
     # Calculate the per_replica_batch_size scaling factor for strategies
@@ -595,7 +548,7 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
           distributed_training_utils.get_input_params(
               distribution, input_63_samples, steps=1, batch_size=None)
 
-  @combinations.generate(strategy_for_numpy_input_combinations())
+  @combinations.generate(all_strategy_combinations())
   def test_calculating_input_params_no_steps_with_batch_size(self,
                                                              distribution):
     # Calculate the per_replica_batch_size scaling factor for strategies
@@ -629,7 +582,7 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
         distributed_training_utils.get_input_params(
             distribution, input_64_samples, steps=None, batch_size=3)
 
-  @combinations.generate(strategy_for_numpy_input_combinations())
+  @combinations.generate(all_strategy_combinations())
   def test_calculating_input_params_with_steps_with_batch_size(self,
                                                                distribution):
     with self.cached_session():
@@ -646,45 +599,46 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
         distributed_training_utils.get_input_params(
             distribution, input_64_samples, steps=10, batch_size=13)
 
-  @combinations.generate(strategy_for_numpy_input_combinations())
+  @combinations.generate(all_strategy_combinations())
   def test_calling_model_with_numpy_arrays(self, distribution):
     with self.cached_session():
-      model = get_model()
-
-      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
-      loss = 'mse'
-      metrics = ['mae']
-      model.compile(optimizer, loss, metrics=metrics, distribute=distribution)
-
-      inputs = np.zeros((64, 3), dtype=np.float32)
-      targets = np.zeros((64, 4), dtype=np.float32)
-
-      # Call fit with validation data
-      model.fit(inputs, targets, epochs=1, batch_size=2, verbose=0,
-                validation_data=(inputs, targets))
-
-      # TODO(anjalisridhar): We need tests for when the batch size and steps are
-      # smaller and results in a 0 batch_size and steps value.
-      model.evaluate(inputs, targets)
-      # with steps
-      model.evaluate(inputs, targets, steps=2)
-      # with batch_size
-      model.evaluate(inputs, targets, batch_size=8)
-
-      model.predict(inputs)
-      # with steps
-      model.predict(inputs, steps=2)
-      # with batch_size
-      model.predict(inputs, batch_size=8)
+      with distribution.scope():
+        model = get_model()
+        optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+        loss = 'mse'
+        metrics = ['mae']
+        model.compile(optimizer, loss, metrics=metrics)
+
+        inputs = np.zeros((64, 3), dtype=np.float32)
+        targets = np.zeros((64, 4), dtype=np.float32)
+
+        # Call fit with validation data
+        model.fit(inputs, targets, epochs=1, batch_size=2, verbose=0,
+                  validation_data=(inputs, targets))
+
+        # TODO(anjalisridhar): We need tests for when the batch size and steps
+        # are smaller and results in a 0 batch_size and steps value.
+        model.evaluate(inputs, targets)
+        # with steps
+        model.evaluate(inputs, targets, steps=2)
+        # with batch_size
+        model.evaluate(inputs, targets, batch_size=8)
+
+        model.predict(inputs)
+        # with steps
+        model.predict(inputs, steps=2)
+        # with batch_size
+        model.predict(inputs, batch_size=8)
 
-  @combinations.generate(strategy_for_numpy_input_combinations())
+  @combinations.generate(all_strategy_combinations())
   def test_calling_model_with_nested_numpy_arrays(self, distribution):
     with self.cached_session():
-      model = multi_input_output_model()
-
-      optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.001)
-      loss = 'mse'
-      model.compile(optimizer, loss, distribute=distribution)
+      with distribution.scope():
+        model = multi_input_output_model()
+        optimizer = gradient_descent.GradientDescentOptimizer(
+            learning_rate=0.001)
+        loss = 'mse'
+        model.compile(optimizer, loss)
 
       input_a_np = np.asarray(np.random.random((64, 3)), dtype=np.float32)
       input_b_np = np.asarray(np.random.random((64, 5)), dtype=np.float32)
@@ -714,26 +668,29 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
   @combinations.generate(combinations.combine(
       distribution=strategies_minus_tpu, mode=['graph']))
   def test_numpy_with_sample_weights(self, distribution):
-    model = get_model()
-    optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
-    loss = 'mse'
-    model.compile(optimizer, loss, distribute=distribution)
+    with self.cached_session():
+      with distribution.scope():
+        model = get_model()
+        optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
+        loss = 'mse'
+        model.compile(optimizer, loss)
 
-    inputs = np.zeros((20, 3), np.float32)
-    targets = np.zeros((20, 4), np.float32)
-    sample_weights = np.ones((20), np.float32)
+      inputs = np.zeros((20, 3), np.float32)
+      targets = np.zeros((20, 4), np.float32)
+      sample_weights = np.ones((20), np.float32)
 
-    model.fit(inputs, targets, sample_weight=sample_weights, epochs=1,
-              steps_per_epoch=2, verbose=1)
+      model.fit(inputs, targets, sample_weight=sample_weights, epochs=1,
+                steps_per_epoch=2, verbose=1)
 
-  @combinations.generate(strategy_for_numpy_input_combinations())
+  @combinations.generate(all_strategy_combinations())
   def test_flatten_predict_outputs(self, distribution):
     with self.cached_session():
-      model = multi_input_output_model()
-
-      optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.001)
-      loss = 'mse'
-      model.compile(optimizer, loss, distribute=distribution)
+      with distribution.scope():
+        model = multi_input_output_model()
+        optimizer = gradient_descent.GradientDescentOptimizer(
+            learning_rate=0.001)
+        loss = 'mse'
+        model.compile(optimizer, loss)
 
       # We take 6 input samples with each input having a dimension of 3 or 5.
       input_a_np = np.asarray(np.random.random((6, 3)), dtype=np.float32)
@@ -750,6 +707,61 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
       self.assertAllEqual([6, 7], outs[0].shape)
       self.assertAllEqual([6, 7], outs[1].shape)
 
+  @combinations.generate(tpu_strategy_combinations())
+  def test_predict_with_partial_batch(self, distribution):
+    with self.cached_session():
+      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+      loss = 'mse'
+
+      with distribution.scope():
+        model_with_ds_strategy = get_model()
+        model_with_ds_strategy.compile(optimizer, loss)
+
+      cpu_model = get_model()
+      cpu_model.compile(optimizer, loss)
+
+      inputs = np.zeros((10, 3), dtype=np.float32)
+
+      # As sample size is 10, we batch by 4 so that the last batch is
+      # a partial batch. Also `fit()` using numpy array as inputs without
+      # distribution strategy uses entire sample as a single batch. As so,
+      # we remove parameters `batch_size` and `steps`.
+      cpu_model.set_weights(model_with_ds_strategy.get_weights())
+      self.assertAllClose(
+          model_with_ds_strategy.predict(inputs, batch_size=4, steps=3),
+          cpu_model.predict(inputs),
+          atol=1e-5, rtol=1e-5)
+
+  @combinations.generate(tpu_strategy_combinations())
+  def test_predict_multi_output_model_with_partial_batch(
+      self, distribution):
+    with self.cached_session():
+      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+      loss = 'mse'
+
+      with distribution.scope():
+        model_with_ds_strategy = simple_multi_inputs_multi_outputs_model()
+        model_with_ds_strategy.compile(optimizer, loss)
+
+      cpu_model = simple_multi_inputs_multi_outputs_model()
+      cpu_model.compile(optimizer, loss)
+
+      input_data, _ = get_multi_inputs_multi_outputs_data()
+      input_dict = {
+          'input_a': input_data['input_a'],
+          'input_b': input_data['input_b'],
+      }
+
+      # As sample size is 200, we batch by 18 so that the last batch is
+      # a partial batch. Also `fit()` using numpy array as inputs without
+      # distribution strategy uses entire sample as a single batch. As so,
+      # we remove parameters `batch_size` and `steps`.
+      cpu_model.set_weights(model_with_ds_strategy.get_weights())
+      self.assertAllClose(
+          model_with_ds_strategy.predict(input_dict, batch_size=18, steps=12),
+          cpu_model.predict(input_dict),
+          atol=1e-4, rtol=1e-4)
+
 
 class TestDistributionStrategyWithDatasets(test.TestCase,
                                            parameterized.TestCase):
@@ -757,12 +769,12 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
   @combinations.generate(all_strategy_combinations())
   def test_calling_model_on_same_dataset(self, distribution):
     with self.cached_session():
-      model = get_model()
-
-      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
-      loss = 'mse'
-      metrics = ['mae', keras.metrics.CategoricalAccuracy()]
-      model.compile(optimizer, loss, metrics=metrics, distribute=distribution)
+      with distribution.scope():
+        model = get_model()
+        optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+        loss = 'mse'
+        metrics = ['mae', keras.metrics.CategoricalAccuracy()]
+        model.compile(optimizer, loss, metrics=metrics)
 
       dataset = get_dataset(distribution)
 
@@ -776,20 +788,19 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
   @combinations.generate(all_strategy_combinations())
   def test_model_interleaved_eval_same_as_direct_eval(self, distribution):
     with self.cached_session():
-      user_controlled_model = get_model()
-      user_controlled_model.compile(
-          gradient_descent.GradientDescentOptimizer(0.001),
-          loss='mse',
-          metrics=['mae', keras.metrics.CategoricalAccuracy()],
-          distribute=distribution)
-
-      interleaved_model = get_model()
-      interleaved_model.set_weights(user_controlled_model.get_weights())
-      interleaved_model.compile(
-          gradient_descent.GradientDescentOptimizer(0.001),
-          loss='mse',
-          metrics=['mae', keras.metrics.CategoricalAccuracy()],
-          distribute=distribution)
+      with distribution.scope():
+        user_controlled_model = get_model()
+        user_controlled_model.compile(
+            gradient_descent.GradientDescentOptimizer(0.001),
+            loss='mse',
+            metrics=['mae', keras.metrics.CategoricalAccuracy()])
+
+        interleaved_model = get_model()
+        interleaved_model.set_weights(user_controlled_model.get_weights())
+        interleaved_model.compile(
+            gradient_descent.GradientDescentOptimizer(0.001),
+            loss='mse',
+            metrics=['mae', keras.metrics.CategoricalAccuracy()])
 
       dataset = get_dataset(distribution)
 
@@ -824,12 +835,13 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
       mode=['graph', 'eager']))
   def test_fit_with_tuple_and_dict_dataset_inputs(self, distribution):
     with self.cached_session():
-      model = multi_input_output_model()
-
-      optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.001)
-      loss = 'mse'
-      metrics = ['mae', keras.metrics.CategoricalAccuracy()]
-      model.compile(optimizer, loss, metrics=metrics, distribute=distribution)
+      with distribution.scope():
+        model = multi_input_output_model()
+        optimizer = gradient_descent.GradientDescentOptimizer(
+            learning_rate=0.001)
+        loss = 'mse'
+        metrics = ['mae', keras.metrics.CategoricalAccuracy()]
+        model.compile(optimizer, loss, metrics=metrics)
 
       input_a_np = np.random.random((10, 3))
       input_b_np = np.random.random((10, 5))
@@ -854,14 +866,103 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
       model.fit(dataset_dict, epochs=1, steps_per_epoch=2, verbose=1)
 
   @combinations.generate(all_strategy_combinations())
-  def test_fit_eval_and_predict_methods_on_dataset(self, distribution):
+  def test_fit_eval_and_predict_methods_on_dataset_without_steps(
+      self, distribution):
     with self.cached_session():
-      model = get_model()
+      with distribution.scope():
+        model = get_model()
+        optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+        loss = 'mse'
+        metrics = ['mae', keras.metrics.CategoricalAccuracy()]
+        model.compile(optimizer, loss, metrics=metrics)
+
+      inputs = np.zeros((1000, 3), dtype=np.float32)
+      targets = np.zeros((1000, 4), dtype=np.float32)
+      # steps/steps_per_epoch are calculated when using numpy arrays as
+      # input data.
+      fit_with_numpy = model.fit(inputs, targets, epochs=1,
+                                 batch_size=10).history
+      eval_with_numpy = model.evaluate(inputs, targets, batch_size=10)
+      predict_with_numpy = model.predict(inputs, batch_size=10)
 
-      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
-      loss = 'mse'
-      metrics = ['mae', keras.metrics.CategoricalAccuracy()]
-      model.compile(optimizer, loss, metrics=metrics, distribute=distribution)
+      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+      dataset = dataset.batch(10, drop_remainder=True)
+      fit_with_ds = model.fit(dataset, epochs=1).history
+      eval_with_ds = model.evaluate(dataset)
+      predict_dataset = dataset_ops.Dataset.from_tensor_slices(inputs)
+      predict_dataset = predict_dataset.batch(10, drop_remainder=True)
+      predict_with_ds = model.predict(predict_dataset)
+      self.assertAllClose(
+          fit_with_numpy, fit_with_ds, atol=1e-4, rtol=1e-4)
+      self.assertAllClose(
+          eval_with_numpy, eval_with_ds, atol=1e-4, rtol=1e-4)
+      self.assertAllClose(
+          predict_with_numpy, predict_with_ds, atol=1e-4, rtol=1e-4)
+
+  @combinations.generate(all_strategy_combinations())
+  def test_on_dataset_with_unknown_cardinality_without_steps(
+      self, distribution):
+    with self.cached_session():
+      with distribution.scope():
+        model = get_model()
+        optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+        loss = 'mse'
+        metrics = ['mae', keras.metrics.CategoricalAccuracy()]
+        model.compile(optimizer, loss, metrics=metrics)
+
+      inputs = np.zeros((1000, 3), dtype=np.float32)
+      targets = np.zeros((1000, 4), dtype=np.float32)
+      # steps/steps_per_epoch are calculated when using numpy arrays as
+      # input data.
+      fit_with_numpy = model.fit(inputs, targets, epochs=1,
+                                 batch_size=10).history
+      fit_with_numpy_multiple_epochs = model.fit(
+          inputs, targets, epochs=2, batch_size=10).history
+      eval_with_numpy = model.evaluate(inputs, targets, batch_size=10)
+      predict_with_numpy = model.predict(inputs, batch_size=10)
+
+      dataset = convert_numpy_to_dataset_with_unknown_cardinality(
+          inputs, targets)
+      predict_dataset = convert_numpy_to_dataset_with_unknown_cardinality(
+          inputs)
+
+      self.assertEqual(keras.backend.get_value(cardinality.cardinality(
+          dataset)), cardinality.UNKNOWN)
+      self.assertEqual(keras.backend.get_value(cardinality.cardinality(
+          predict_dataset)), cardinality.UNKNOWN)
+
+      eval_with_ds = model.evaluate(dataset)
+      predict_with_ds = model.predict(predict_dataset)
+      self.assertAllClose(
+          eval_with_numpy, eval_with_ds, atol=1e-4, rtol=1e-4)
+      self.assertAllClose(
+          predict_with_numpy, predict_with_ds, atol=1e-4, rtol=1e-4)
+
+      if (distributed_training_utils.is_tpu_strategy(distribution) and
+          distribution.extended.steps_per_run != 1):
+        with self.assertRaisesRegexp(ValueError, '`steps_per_epoch` '
+                                     'should be specified'):
+          fit_with_ds = model.fit(dataset, epochs=1)
+      else:
+        fit_with_ds = model.fit(dataset,
+                                epochs=1).history
+        fit_with_ds_multiple_epochs = model.fit(dataset,
+                                                epochs=2).history
+        self.assertAllClose(
+            fit_with_numpy, fit_with_ds, atol=1e-4, rtol=1e-4)
+        self.assertAllClose(
+            fit_with_numpy_multiple_epochs,
+            fit_with_ds_multiple_epochs, atol=1e-4, rtol=1e-4)
+
+  @combinations.generate(all_strategy_combinations())
+  def test_fit_eval_and_predict_methods_on_dataset(self, distribution):
+    with self.cached_session():
+      with distribution.scope():
+        model = get_model()
+        optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+        loss = 'mse'
+        metrics = ['mae', keras.metrics.CategoricalAccuracy()]
+        model.compile(optimizer, loss, metrics=metrics)
 
       dataset = get_dataset(distribution)
 
@@ -872,10 +973,10 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
   @combinations.generate(strategy_and_optimizer_combinations())
   def test_fit_eval_and_predict_with_optimizer(self, distribution, optimizer):
     with self.cached_session():
-      model = get_model()
-
-      loss = 'mse'
-      model.compile(optimizer(), loss, distribute=distribution)
+      with distribution.scope():
+        model = get_model()
+        loss = 'mse'
+        model.compile(optimizer(), loss)
 
       dataset = get_dataset(distribution)
 
@@ -885,35 +986,39 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
 
   @combinations.generate(strategy_minus_tpu_combinations())
   def test_dataset_with_sample_weights(self, distribution):
-    model = get_model()
-    optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
-    loss = 'mse'
-    model.compile(optimizer, loss, distribute=distribution)
-
-    inputs = np.zeros((10, 3), np.float32)
-    targets = np.zeros((10, 4), np.float32)
-    sample_weights = np.ones((10), np.float32)
-    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets,
-                                                      sample_weights))
-    dataset = dataset.repeat()
-    dataset = dataset.batch(10)
-
-    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
-    model.evaluate(dataset, steps=2, verbose=1)
-    model.predict(dataset, steps=2)
+    with self.cached_session():
+      with distribution.scope():
+        model = get_model()
+        optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
+        loss = 'mse'
+        model.compile(optimizer, loss)
+
+      inputs = np.zeros((10, 3), np.float32)
+      targets = np.zeros((10, 4), np.float32)
+      sample_weights = np.ones((10), np.float32)
+      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets,
+                                                        sample_weights))
+      dataset = dataset.repeat()
+      dataset = dataset.batch(10)
+
+      model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
+      model.evaluate(dataset, steps=2, verbose=1)
+      model.predict(dataset, steps=2)
 
   @combinations.generate(combinations.combine(
       distribution=[
-          combinations.mirrored_strategy_with_two_gpus,
-          combinations.core_mirrored_strategy_with_two_gpus],
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu],
       mode=['graph', 'eager']))
-  def test_dataset_wrong_input_shape(self, distribution):
+  # TODO(b/120943676, b/120957836): Re-enable once the validation code is
+  # restored.
+  def DISABLED_test_dataset_wrong_input_shape(self, distribution):
     with self.cached_session():
-      model = get_model()
-
-      optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
-      loss = 'mse'
-      model.compile(optimizer, loss, distribute=distribution)
+      with distribution.scope():
+        model = get_model()
+        optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
+        loss = 'mse'
+        model.compile(optimizer, loss)
 
       # Wrong input shape
       inputs = np.zeros((10, 5), dtype=np.float32)
@@ -927,15 +1032,17 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
         model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0)
 
   @combinations.generate(combinations.combine(
-      distribution=[combinations.mirrored_strategy_with_two_gpus],
+      distribution=[combinations.mirrored_strategy_with_gpu_and_cpu],
       mode=['graph', 'eager']))
-  def test_dataset_no_batch_input_validation(self, distribution):
+  # TODO(b/120943676, b/120957836): Re-enable once the validation code is
+  # restored.
+  def DISABLED_test_dataset_no_batch_input_validation(self, distribution):
     with self.cached_session():
-      model = get_model()
-
-      optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
-      loss = 'mse'
-      model.compile(optimizer, loss, distribute=distribution)
+      with distribution.scope():
+        model = get_model()
+        optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
+        loss = 'mse'
+        model.compile(optimizer, loss)
 
       # User forgets to batch the dataset
       inputs = np.zeros((10, 3), dtype=np.float32)
@@ -951,11 +1058,11 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
       mode=['graph']))
   def test_dataset_input_shape_fully_defined(self, distribution):
     with self.cached_session():
-      model = get_model()
-
-      optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
-      loss = 'mse'
-      model.compile(optimizer, loss, distribute=distribution)
+      with distribution.scope():
+        model = get_model()
+        optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
+        loss = 'mse'
+        model.compile(optimizer, loss)
 
       dataset = get_dataset(distribution)
       # Input shapes are not fully known. Batch dimension is unknown as we are
@@ -967,7 +1074,9 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
 
   @combinations.generate(combinations.combine(
       distribution=[
+          combinations.mirrored_strategy_with_gpu_and_cpu,
           combinations.mirrored_strategy_with_two_gpus,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu,
           combinations.core_mirrored_strategy_with_two_gpus],
       mode=['graph', 'eager']))
   def test_learning_phase_value(self, distribution):
@@ -975,16 +1084,17 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
     # meaningful values. Currently we don't pass the learning phase if the
     # Lambda layer uses the learning phase.
     with self.cached_session():
-      x = keras.layers.Input(shape=(1,), name='input')
-      y = keras.layers.Dense(1, kernel_initializer='ones')(x)
-      z = keras.layers.Dropout(0.9999)(y)
-      model = keras.Model(x, z)
-      initial_weights = model.get_weights()
+      with distribution.scope():
+        x = keras.layers.Input(shape=(1,), name='input')
+        y = keras.layers.Dense(1, kernel_initializer='ones')(x)
+        z = keras.layers.Dropout(0.9999)(y)
+        model = keras.Model(x, z)
+        initial_weights = model.get_weights()
 
-      optimizer = gradient_descent.GradientDescentOptimizer(0.005)
-      loss = 'mse'
-      metrics = ['acc']
-      model.compile(optimizer, loss, metrics=metrics, distribute=distribution)
+        optimizer = gradient_descent.GradientDescentOptimizer(0.005)
+        loss = 'mse'
+        metrics = ['acc']
+        model.compile(optimizer, loss, metrics=metrics)
 
       batch_size = 8
       if isinstance(distribution, mirrored_strategy.CoreMirroredStrategy):
@@ -998,7 +1108,8 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
       hist = model.fit(dataset, epochs=1, steps_per_epoch=20, verbose=1)
       self.assertAlmostEqual(hist.history['acc'][0], 0, 0)
 
-      model.set_weights(initial_weights)
+      with distribution.scope():
+        model.set_weights(initial_weights)
       # TODO(psv/anjalisridhar): Enable these lines after we fix b/117431185.
       # evaluate_output = model.evaluate(dataset, steps=20)
       # self.assertAlmostEqual(evaluate_output[1], 1, 0)
@@ -1012,14 +1123,14 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
       ref_output = np.ones((160, 1), dtype=np.float32)
       self.assertArrayNear(output, ref_output, 1e-1)
 
-  @combinations.generate(strategy_minus_tpu_combinations())
+  @combinations.generate(all_strategy_combinations())
   def testOptimizerWithCallbacks(self, distribution):
     with self.cached_session():
-      model = get_model()
-
-      optimizer = gradient_descent_keras.SGD(0.01)
-      loss = 'mse'
-      model.compile(optimizer, loss, distribute=distribution)
+      with distribution.scope():
+        model = get_model()
+        optimizer = gradient_descent_keras.SGD(0.01)
+        loss = 'mse'
+        model.compile(optimizer, loss)
 
       dataset = get_dataset(distribution)
 
@@ -1028,375 +1139,187 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
 
       model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
                 callbacks=[keras.callbacks.LearningRateScheduler(schedule)])
-      grouped_models = distribution.unwrap(model._grouped_model)
-      with distribution.scope():
-        for m in grouped_models:
-          self.assertAllClose(0.001, keras.backend.get_value(
-              m.optimizer.lr), atol=1e-05, rtol=1e-05)
-
-
-class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase):
+      self.assertAllClose(0.001, keras.backend.get_value(model.optimizer.lr))
 
-  @combinations.generate(combinations.combine(
-      distribution=[
-          combinations.mirrored_strategy_with_gpu_and_cpu,
-          combinations.core_mirrored_strategy_with_gpu_and_cpu],
-      mode=['graph', 'eager']))
-  def test_validating_dataset_input_tensors_with_shape_mismatch(self,
-                                                                distribution):
+  @combinations.generate(tpu_strategy_combinations())
+  def test_predict_with_dataset_with_partial_batch(self, distribution):
     with self.cached_session():
-      a = constant_op.constant([1, 2], shape=(1, 2))
-      b = constant_op.constant([[1, 2], [1, 2]], shape=(2, 2))
-      x = values.DistributedValues({'/device:CPU:0': a, '/device:GPU:0': b})
-      y = values.DistributedValues({'/device:CPU:0': a, '/device:GPU:0': a})
-      with distribution.scope():
-        # Removed device and input tensor shape details from the error message
-        # since the order of the device and the corresponding input tensor shape
-        # is not deterministic over different runs.
-        with self.assertRaisesRegexp(ValueError,
-                                     'Input tensor shapes do not match for '
-                                     'distributed tensor inputs '
-                                     'DistributedValues:.+'):
-          distributed_training_utils.validate_distributed_dataset_inputs(
-              distribution, x, y)
+      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+      loss = 'mse'
 
-  @combinations.generate(combinations.combine(
-      distribution=[
-          combinations.mirrored_strategy_with_gpu_and_cpu,
-          combinations.core_mirrored_strategy_with_gpu_and_cpu],
-      mode=['graph', 'eager']))
-  def test_validating_dataset_input_tensors_with_dtype_mismatch(self,
-                                                                distribution):
-    with self.cached_session():
-      a = constant_op.constant([1, 2], shape=(1, 2), dtype=dtypes.int32)
-      b = constant_op.constant([1, 2], shape=(1, 2), dtype=dtypes.float64)
-      x = values.DistributedValues({'/device:CPU:0': a, '/device:GPU:0': b})
-      y = values.DistributedValues({'/device:CPU:0': a, '/device:GPU:0': a})
       with distribution.scope():
-        # Removed device and input tensor dtype details from the error message
-        # since the order of the device and the corresponding input tensor dtype
-        # is not deterministic over different runs.
-        with self.assertRaisesRegexp(ValueError,
-                                     'Input tensor dtypes do not match for '
-                                     'distributed tensor inputs '
-                                     'DistributedValues:.+'):
-          distributed_training_utils.validate_distributed_dataset_inputs(
-              distribution, x, y)
-
-  @combinations.generate(combinations.combine(
-      distribution=[
-          combinations.mirrored_strategy_with_gpu_and_cpu,
-          combinations.core_mirrored_strategy_with_gpu_and_cpu],
-      mode=['graph', 'eager']))
-  def test_unsupported_features(self, distribution):
-    with self.cached_session():
-      model = get_model()
+        model_with_ds_strategy = get_model()
+        model_with_ds_strategy.compile(optimizer, loss)
 
-      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
-      loss = 'mse'
-      metrics = ['mae']
-      model.compile(optimizer, loss, metrics=metrics, distribute=distribution)
+      cpu_model = get_model()
+      cpu_model.compile(optimizer, loss)
 
-      dataset = get_dataset(distribution)
+      inputs = np.zeros((10, 3), dtype=np.float32)
+      dataset = dataset_ops.Dataset.from_tensor_slices((inputs))
 
-      # Test with validation split
-      with self.assertRaisesRegexp(
-          ValueError, '`validation_split` argument is not '
-                      'supported when input `x` is a dataset or a '
-                      'dataset iterator.+'):
-        model.fit(dataset,
-                  epochs=1, steps_per_epoch=2, verbose=0,
-                  validation_split=0.5, validation_steps=2)
-
-      # Test with sample weight.
-      sample_weight = np.random.random((10,))
-      with self.assertRaisesRegexp(
-          ValueError, '`sample_weight` argument is not supported when input '
-                      '`x` is a dataset or a dataset iterator.'):
-        model.fit(
-            dataset,
-            epochs=1,
-            steps_per_epoch=2,
-            verbose=0,
-            sample_weight=sample_weight)
-
-      # Test with not specifying the `steps` argument.
-      with self.assertRaisesRegexp(
-          ValueError, 'you should specify the `steps_per_epoch` argument'):
-        model.fit(dataset, epochs=1, verbose=0)
-      with self.assertRaisesRegexp(ValueError,
-                                   'you should specify the `steps` argument'):
-        model.evaluate(dataset, verbose=0)
+      # As sample size is 10, we batch by 4 so that the last batch is
+      # a partial batch.
+      dataset_with_partial_batch = dataset.batch(4)
+      cpu_model.set_weights(model_with_ds_strategy.get_weights())
 
-      with self.assertRaisesRegexp(ValueError,
-                                   'you should specify the `steps` argument'):
-        model.predict(dataset, verbose=0)
+      self.assertAllClose(
+          model_with_ds_strategy.predict(dataset_with_partial_batch, steps=3),
+          cpu_model.predict(dataset_with_partial_batch, steps=3),
+          atol=1e-5, rtol=1e-5)
 
-  @combinations.generate(combinations.combine(
-      distribution=[
-          combinations.mirrored_strategy_with_gpu_and_cpu,
-          combinations.core_mirrored_strategy_with_gpu_and_cpu],
-      mode=['graph', 'eager']))
-  def test_calling_with_unsupported_predefined_callbacks(self, distribution):
+  @combinations.generate(tpu_strategy_combinations())
+  def test_predict_multi_output_model_with_dataset_with_partial_batch(
+      self, distribution):
     with self.cached_session():
-      model = get_model()
-
       optimizer = gradient_descent.GradientDescentOptimizer(0.001)
       loss = 'mse'
-      metrics = ['mae']
-      model.compile(optimizer, loss, metrics=metrics, distribute=distribution)
-
-      dataset = get_dataset(distribution)
-
-      def schedule(_):
-        return 0.001
-      with self.assertRaisesRegexp(ValueError,
-                                   'You must specify a Keras Optimizer V2 when '
-                                   'using'):
-        model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
-                  callbacks=[keras.callbacks.LearningRateScheduler(schedule)])
 
-      with self.assertRaisesRegexp(ValueError,
-                                   'You must specify a Keras Optimizer V2 when '
-                                   'using'):
-        model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
-                  callbacks=[keras.callbacks.ReduceLROnPlateau()])
+      with distribution.scope():
+        model_with_ds_strategy = simple_multi_inputs_multi_outputs_model()
+        model_with_ds_strategy.compile(optimizer, loss)
 
+      cpu_model = simple_multi_inputs_multi_outputs_model()
+      cpu_model.compile(optimizer, loss)
 
-class TestDistributionStrategyWithLossMasking(test.TestCase,
-                                              parameterized.TestCase):
+      input_data, _ = get_multi_inputs_multi_outputs_data()
+      input_dict = {
+          'input_a': input_data['input_a'],
+          'input_b': input_data['input_b'],
+      }
 
-  # TODO(priyag): Enable all strategies for this test. Currently it does not
-  # work for TPU due to some invalid datatype.
-  @combinations.generate(combinations.combine(
-      distribution=[
-          combinations.mirrored_strategy_with_two_gpus,
-          combinations.core_mirrored_strategy_with_two_gpus],
-      mode=['graph', 'eager']))
-  def test_masking(self, distribution):
-    with self.cached_session():
-      np.random.seed(1337)
-      x = np.array([[[1], [1]], [[0], [0]]])
-      model = keras.models.Sequential()
-      model.add(keras.layers.Masking(mask_value=0, input_shape=(2, 1)))
-      model.add(
-          keras.layers.TimeDistributed(
-              keras.layers.Dense(1, kernel_initializer='one')))
-      model.compile(loss='mse',
-                    optimizer=gradient_descent.GradientDescentOptimizer(0.01),
-                    distribute=distribution)
-      y = np.array([[[1], [1]], [[1], [1]]])
-      dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
-      dataset = dataset.repeat(100)
-      dataset = dataset.batch(10)
-      hist = model.fit(x=dataset, epochs=1, steps_per_epoch=2)
-      self.assertEqual(hist.history['loss'][0], 0)
+      dataset = dataset_ops.Dataset.from_tensor_slices(input_dict)
 
+      # As sample size is 200, we batch by 18 using 12 steps per epoch so
+      # that the last batch is a partial batch.
+      dataset_with_partial_batch = dataset.batch(18)
+      cpu_model.set_weights(model_with_ds_strategy.get_weights())
 
-class TestDistributionStrategyWithNormalizationLayer(
-    test.TestCase, parameterized.TestCase):
+      self.assertAllClose(
+          model_with_ds_strategy.predict(dataset_with_partial_batch, steps=12),
+          cpu_model.predict(dataset_with_partial_batch, steps=12),
+          atol=1e-4, rtol=1e-4)
 
-  @combinations.generate(all_strategy_combinations())
-  def test_batchnorm_correctness(self, distribution):
-    with self.cached_session():
-      model = keras.models.Sequential()
-      norm = keras.layers.BatchNormalization(input_shape=(10,), momentum=0.8)
-      model.add(norm)
-      model.compile(loss='mse',
-                    optimizer=gradient_descent.GradientDescentOptimizer(0.01),
-                    distribute=distribution)
-
-      # centered on 5.0, variance 10.0
-      x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 10))
-      x = x.astype('float32')
-      dataset = dataset_ops.Dataset.from_tensor_slices((x, x))
-      dataset = dataset.repeat(100)
-      dataset = batch_wrapper(dataset, 32, distribution)
 
-      predict_dataset = dataset_ops.Dataset.from_tensor_slices(x)
-      predict_dataset = predict_dataset.repeat(100)
-      predict_dataset = batch_wrapper(predict_dataset, 32, distribution)
+class TestRegularizerLoss(test.TestCase, parameterized.TestCase):
+  class IdentityRegularizer(keras.regularizers.Regularizer):
 
-      model.fit(dataset, epochs=4, verbose=0, steps_per_epoch=10)
-      out = model.predict(predict_dataset, steps=2)
-      out -= keras.backend.eval(norm.beta)
-      out /= keras.backend.eval(norm.gamma)
-      np.testing.assert_allclose(out.mean(), 0.0, atol=1e-1)
-      np.testing.assert_allclose(out.std(), 1.0, atol=1e-1)
+    def __call__(self, x):
+      return array_ops.identity(x)
 
+  class AddLayer(keras.layers.Layer):
 
-class TestDistributionStrategyCorrectness(test.TestCase,
-                                          parameterized.TestCase):
+    def build(self, _):
+      self.v = self.add_weight(
+          'v', (), initializer='ones',
+          regularizer=TestRegularizerLoss.IdentityRegularizer())
 
-  @combinations.generate(all_strategy_combinations())
-  def test_metric_correctness(self, distribution):
-    with self.cached_session():
-      keras.backend.set_image_data_format('channels_last')
-      num_samples = 10000
-
-      x_train = np.random.randint(0, 2, num_samples)
-      x_train = np.reshape(x_train, (num_samples, 1))
-      y_train = x_train
-      x_train = x_train.astype('float32')
-      y_train = y_train.astype('float32')
-
-      # Create identity model.
-      model = keras.Sequential()
-      model.add(
-          keras.layers.Dense(1, input_shape=(1,), kernel_initializer='ones'))
-      model.compile(
-          loss=keras.losses.mean_squared_error,
-          optimizer=gradient_descent.GradientDescentOptimizer(0.5),
-          metrics=[keras.metrics.BinaryAccuracy()],
-          distribute=distribution)
+    def call(self, inputs):
+      return inputs + self.v
 
-      batch_size = 64
-      if not distributed_training_utils.global_batch_size_supported(
-          distribution):
-        batch_size //= distribution.num_replicas_in_sync
-      train_dataset = dataset_ops.Dataset.from_tensor_slices((x_train, y_train))
-      train_dataset = batch_wrapper(train_dataset, batch_size, distribution)
+  @staticmethod
+  def loss_fn(_, y_pred):
+    return math_ops.reduce_mean(y_pred)
 
-      history = model.fit(x=train_dataset, epochs=2, steps_per_epoch=10)
-      self.assertEqual(history.history['binary_accuracy'], [1.0, 1.0])
+  @combinations.generate(all_strategy_combinations_minus_default())
+  def test_regularizer_loss(self, distribution):
+    batch_size = 2
+    if not distributed_training_utils.global_batch_size_supported(distribution):
+      batch_size //= distribution.num_replicas_in_sync
+
+      # Given an input x, which is always 1, and variable v, this model computes
+      # Loss=x+v+regularizer_loss, where regularizer_loss=v and the variable is
+      # initialized to 1. Therefore, this model computes Loss=1+2v, and so the
+      # gradient dLoss/dv = 2. This gradient of 2 is averaged over all examples
+      # in a batch and then multiplied by the learning rate of 1. As a result,
+      # the model update for one batch should subtract 2 from v, resulting in v
+      # being -1. If the regularizer loss is not scaled correctly by number of
+      # replicas, the variable value will be incorrect when number of replicas
+      # >1. For e.g. it will be -2 if num replicas = 2.
+    with distribution.scope():
+      x = keras.layers.Input(shape=(), batch_size=batch_size)
+      y = TestRegularizerLoss.AddLayer()(x)
+      model = keras.models.Model(inputs=x, outputs=y)
+      opt = gradient_descent_keras.SGD(1.)
+      model.compile(opt, loss=TestRegularizerLoss.loss_fn)
+      model.fit(
+          x=np.array([[1.], [1.]], dtype=np.float32),
+          y=np.array([[1.], [1.]], dtype=np.float32),
+          batch_size=batch_size)
+      v = model.get_weights()[0]
+      self.assertEqual(-1.0, v)
+
+
+class TestDistributionStrategyWithKerasModels(test.TestCase,
+                                              parameterized.TestCase):
 
   @combinations.generate(all_strategy_combinations())
-  def test_eval_metrics_correctness(self, distribution):
-    with self.cached_session():
-      model = keras.Sequential()
-      model.add(
-          keras.layers.Dense(
-              3, activation='relu', input_dim=4, kernel_initializer='ones'))
-      model.add(
-          keras.layers.Dense(
-              1, activation='sigmoid', kernel_initializer='ones'))
-      model.compile(
-          loss='mae',
-          metrics=['accuracy', keras.metrics.BinaryAccuracy()],
-          optimizer=gradient_descent.GradientDescentOptimizer(0.001),
-          distribute=distribution)
-
-      # verify correctness of stateful and stateless metrics.
-      x = np.ones((100, 4)).astype('float32')
-      y = np.ones((100, 1)).astype('float32')
-      dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).repeat()
-      dataset = batch_wrapper(dataset, 4, distribution)
-      outs = model.evaluate(dataset, steps=10)
-      self.assertEqual(outs[1], 1.)
-      self.assertEqual(outs[2], 1.)
-
-      y = np.zeros((100, 1)).astype('float32')
-      dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).repeat()
-      dataset = batch_wrapper(dataset, 4, distribution)
-      outs = model.evaluate(dataset, steps=10)
-      self.assertEqual(outs[1], 0.)
-      self.assertEqual(outs[2], 0.)
-
-  @combinations.generate(strategy_and_input_combinations())
-  def test_correctness(self, distribution, use_numpy, use_validation_data):
-
-    with self.cached_session():
-      default_tolerance = 1e-5
-      tol_table = {}
-
-      if isinstance(distribution, (mirrored_strategy.MirroredStrategy,
-                                   mirrored_strategy.CoreMirroredStrategy)):
-        # TODO(b/119257215): Weights are not exactly the same, so use larger
-        # tolerance for now. Predict should be related to weights.
-        tol_table = {
-            'weights_1': 1e-4,
-            'weights_2': 1e-4,
-            'predict_result_1': 1e-4,
-        }
-
-      keras.backend.set_image_data_format('channels_last')
-      np.random.seed(_RANDOM_SEED)
-      random_seed.set_random_seed(_RANDOM_SEED)
-
-      # Train, eval, and predict datasets are created with the same input numpy
-      # arrays.
-      # TODO(xiejw): Change this back to 10000, once we support final partial
-      # batch.
-      num_samples = 9984
-      x_train = np.random.rand(num_samples, 1)
-      y_train = 3 * x_train
-      x_train = x_train.astype('float32')
-      y_train = y_train.astype('float32')
-      x_predict = [[1.], [2.], [3.], [4.]]
-
-      # The model is built once and the initial weights are saved.
-      # This is used to initialize the model for both the distribution and
-      # non-distribution run. In addition, we add few non-linear layers to make
-      # it non-trivial.
-      def _create_model():
-        model = keras.Sequential()
-        model.add(keras.layers.Dense(10, activation='relu', input_shape=(1,)))
-        model.add(keras.layers.Dense(10, activation='relu'))
-        model.add(keras.layers.Dense(10, activation='relu'))
-        model.add(keras.layers.Dense(1))
-        return model
-
-      model = _create_model()
-      initial_weights = model.get_weights()
-      del model  # avoid accident usage.
-
-      def fit_eval_and_predict(with_distribution=None):
-        model = _create_model()
-        # We have initialized the model to the same weight for the distribution
-        # and non-distribution run.
-        model.set_weights(initial_weights)
-        model.compile(
-            loss=keras.losses.mean_squared_error,
-            optimizer=gradient_descent_keras.SGD(0.5),
-            metrics=['mse'],
-            distribute=with_distribution)
+  def test_distribution_strategy_on_sequential_model(self, distribution):
+    with distribution.scope():
+      model = simple_sequential_model()
+      optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
+      loss = 'mse'
+      model.compile(optimizer, loss)
 
-        training_inputs, eval_inputs, predict_inputs = (
-            get_correctness_test_inputs(use_numpy, use_validation_data,
-                                        with_distribution,
-                                        x_train, y_train, x_predict))
+      inputs = np.zeros((20, 10), np.float32)
+      targets = np.zeros((20, 2), np.float32)
 
-        result = {}
-        result['training_history_1'] = model.fit(**training_inputs).history
+    model.fit(inputs, targets, epochs=1, steps_per_epoch=2)
+    model.predict(inputs, steps=1)
+    model.evaluate(inputs, targets, steps=1)
 
-        if eval_inputs is not None:
-          result['eval_result_1'] = model.evaluate(**eval_inputs)
+  @combinations.generate(all_strategy_combinations())
+  def test_distribution_strategy_on_functional_model(self, distribution):
+    with distribution.scope():
+      model = get_model()
+      optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
+      loss = 'mse'
+      model.compile(optimizer, loss)
 
-        result['weights_1'] = model.get_weights()
-        result['predict_result_1'] = model.predict(**predict_inputs)
+      inputs = np.zeros((64, 3), dtype=np.float32)
+      targets = np.zeros((64, 4), dtype=np.float32)
 
-        # Train and eval again to mimic user's flow.
+    model.fit(inputs, targets, epochs=1, steps_per_epoch=2)
+    model.predict(inputs, steps=1)
+    model.evaluate(inputs, targets, steps=1)
 
-        result['training_history_2'] = model.fit(**training_inputs).history
+  # TODO(b/124377929): Remove error assertions once subclassed models
+  # are supported in DistributedStrategy.
+  @combinations.generate(all_strategy_combinations_minus_default())
+  def test_distribution_strategy_on_subclassed_model(self, distribution):
+    with distribution.scope():
+      model = simple_subclassed_model()
+      optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
+      loss = 'mse'
+      model.compile(optimizer, loss)
 
-        if eval_inputs is not None:
-          result['eval_result_2'] = model.evaluate(**eval_inputs)
+      inputs = np.zeros((64, 3), dtype=np.float32)
+      targets = np.zeros((64, 2), dtype=np.float32)
 
-        result['weights_2'] = model.get_weights()
+    with self.assertRaisesRegexp(AttributeError, 'has no attribute'):
+      model.fit(inputs, targets, epochs=1, steps_per_epoch=2)
 
-        return result
+    with self.assertRaisesRegexp(AttributeError, 'has no attribute'):
+      model.predict(inputs, steps=1)
 
-      results_with_ds = fit_eval_and_predict(with_distribution=distribution)
-      results_without_ds = fit_eval_and_predict(with_distribution=None)
+    with self.assertRaisesRegexp(AttributeError, 'has no attribute'):
+      model.evaluate(inputs, targets, steps=1)
 
-      # Verify that the weights, training history, eval results, predict outputs
-      # are the same within some limits of tolerance.
-      for key in results_with_ds:
-        if (key.startswith('training_history') and
-            isinstance(distribution, tpu_strategy.TPUStrategy) and
-            distribution.extended.steps_per_run > 1):
-          # TODO(b/119894254): Enable this test for all cases once the
-          # underlying bug is fixed.
-          continue
+  @combinations.generate(all_strategy_combinations_minus_default())
+  def test_distribution_strategy_one_dimensional(self, distribution):
+    with distribution.scope():
+      inp = keras.layers.Input(shape=(10,))
+      out = keras.layers.Dense(3, activation='softmax')(inp)
+      model = keras.Model(inputs=[inp], outputs=[out])
+      model.compile(
+          optimizer='rmsprop',
+          loss='sparse_categorical_crossentropy',
+          metrics=['sparse_categorical_accuracy'],
+      )
 
-        tolerance = tol_table.get(key, default_tolerance)
+      x = np.random.random((64, 10)).astype('float32')
+      y = np.random.randint(3, size=64)
 
-        self.assertAllClose(
-            results_with_ds[key],
-            results_without_ds[key],
-            atol=tolerance,
-            rtol=tolerance,
-            msg='Fail to assert {}.'.format(key))
+      model.fit(x, y, epochs=1, steps_per_epoch=2)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/distribute/python/keras_utils_test.py b/tensorflow/contrib/distribute/python/keras_utils_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..36eaee77f21a9f6d62a7c3f616d0126b7a4a8902
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/keras_utils_test.py
@@ -0,0 +1,471 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tf.keras models with callbacks, checkpointing with dist strategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import tempfile
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python import keras_test as keras_test_lib
+from tensorflow.contrib.distribute.python import tpu_strategy
+from tensorflow.python import keras
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import values
+from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.keras.engine import distributed_training_utils
+from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_descent_keras
+from tensorflow.python.training import gradient_descent
+
+
+class Counter(keras.callbacks.Callback):
+  """Counts the number of times each callback method was run.
+
+  Attributes:
+    method_counts: dict. Contains the counts of time  each callback method was
+      run.
+  """
+
+  def __init__(self):
+    self.method_counts = collections.defaultdict(int)
+    methods_to_count = [
+        'on_batch_begin', 'on_batch_end', 'on_epoch_begin', 'on_epoch_end',
+        'on_predict_batch_begin', 'on_predict_batch_end', 'on_predict_begin',
+        'on_predict_end', 'on_test_batch_begin', 'on_test_batch_end',
+        'on_test_begin', 'on_test_end', 'on_train_batch_begin',
+        'on_train_batch_end', 'on_train_begin', 'on_train_end'
+    ]
+    for method_name in methods_to_count:
+      setattr(self, method_name,
+              self.wrap_with_counts(method_name, getattr(self, method_name)))
+
+  def wrap_with_counts(self, method_name, method):
+
+    def _call_and_count(*args, **kwargs):
+      self.method_counts[method_name] += 1
+      return method(*args, **kwargs)
+
+    return _call_and_count
+
+
+class TestDistributionStrategyWithCallbacks(test.TestCase,
+                                            parameterized.TestCase):
+
+  @combinations.generate(keras_test_lib.all_strategy_combinations())
+  def test_callbacks_in_fit(self, distribution):
+    with distribution.scope():
+      model = keras_test_lib.get_model()
+      model.compile(optimizer='sgd', loss='mse', metrics=['mae'])
+
+    dataset = keras_test_lib.get_dataset(distribution)
+    counter = Counter()
+
+    epochs = 2
+    steps_per_epoch = 5
+    validation_steps = 3
+
+    model.fit(
+        dataset,
+        epochs=epochs,
+        steps_per_epoch=steps_per_epoch,
+        verbose=0,
+        validation_data=dataset,
+        validation_steps=validation_steps,
+        callbacks=[counter])
+
+    if isinstance(distribution, tpu_strategy.TPUStrategy):
+      # TPU Strategy can have multi step training, from extended.steps_per_run
+      # if steps_per_run = 1, then num_batch_call_per_epoch = steps_per_epoch
+      steps_per_run = distribution.extended.steps_per_run
+      num_batch_call_per_epoch = steps_per_epoch // steps_per_run
+      if steps_per_epoch % steps_per_run:
+        num_batch_call_per_epoch += 1
+    else:
+      num_batch_call_per_epoch = steps_per_epoch
+
+    self.assertDictEqual(
+        counter.method_counts, {
+            'on_batch_begin': epochs * num_batch_call_per_epoch,
+            'on_batch_end': epochs * num_batch_call_per_epoch,
+            'on_epoch_begin': epochs,
+            'on_epoch_end': epochs,
+            'on_test_batch_begin': epochs * validation_steps,
+            'on_test_batch_end': epochs * validation_steps,
+            'on_test_begin': epochs,
+            'on_test_end': epochs,
+            'on_train_batch_begin': epochs * num_batch_call_per_epoch,
+            'on_train_batch_end': epochs * num_batch_call_per_epoch,
+            'on_train_begin': 1,
+            'on_train_end': 1
+        })
+
+  @combinations.generate(keras_test_lib.all_strategy_combinations())
+  def test_callbacks_in_eval(self, distribution):
+    with distribution.scope():
+      model = keras_test_lib.get_model()
+      model.compile(optimizer='sgd', loss='mse', metrics=['mae'])
+
+    dataset = keras_test_lib.get_dataset(distribution)
+    counter = Counter()
+
+    model.evaluate(dataset, steps=5, callbacks=[counter])
+
+    self.assertDictEqual(
+        counter.method_counts, {
+            'on_test_batch_begin': 5,
+            'on_test_batch_end': 5,
+            'on_test_begin': 1,
+            'on_test_end': 1
+        })
+
+  @combinations.generate(keras_test_lib.all_strategy_combinations())
+  def test_callbacks_in_predict(self, distribution):
+    with distribution.scope():
+      model = keras_test_lib.get_model()
+      model.compile(optimizer='sgd', loss='mse', metrics=['mae'])
+
+    dataset = keras_test_lib.get_dataset(distribution)
+    counter = Counter()
+
+    model.predict(
+        keras_test_lib.get_predict_dataset(dataset),
+        steps=5,
+        callbacks=[counter])
+
+    self.assertDictEqual(
+        counter.method_counts, {
+            'on_predict_batch_begin': 5,
+            'on_predict_batch_end': 5,
+            'on_predict_begin': 1,
+            'on_predict_end': 1
+        })
+
+
+class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase):
+
+  @combinations.generate(
+      combinations.combine(
+          distribution=[
+              combinations.mirrored_strategy_with_gpu_and_cpu,
+              combinations.core_mirrored_strategy_with_gpu_and_cpu
+          ],
+          mode=['graph', 'eager']))
+  def test_validating_dataset_input_tensors_with_shape_mismatch(
+      self, distribution):
+    with self.cached_session():
+      a = constant_op.constant([1, 2], shape=(1, 2))
+      b = constant_op.constant([[1, 2], [1, 2]], shape=(2, 2))
+      device_map = values.ReplicaDeviceMap(('/device:CPU:0', '/device:GPU:0'))
+      x = values.DistributedValues(device_map, (a, b))
+      y = values.DistributedValues(device_map, (a, a))
+      # Removed device and input tensor shape details from the error message
+      # since the order of the device and the corresponding input tensor shape
+      # is not deterministic over different runs.
+      with self.assertRaisesRegexp(
+          ValueError, 'Input tensor shapes do not match for '
+          'distributed tensor inputs '
+          'DistributedValues:.+'):
+        with distribution.scope():
+          distributed_training_utils.validate_distributed_dataset_inputs(
+              distribution, x, y)
+
+  @combinations.generate(
+      combinations.combine(
+          distribution=[
+              combinations.mirrored_strategy_with_gpu_and_cpu,
+              combinations.core_mirrored_strategy_with_gpu_and_cpu
+          ],
+          mode=['graph', 'eager']))
+  def test_validating_dataset_input_tensors_with_dtype_mismatch(
+      self, distribution):
+    with self.cached_session():
+      a = constant_op.constant([1, 2], shape=(1, 2), dtype=dtypes.int32)
+      b = constant_op.constant([1, 2], shape=(1, 2), dtype=dtypes.float64)
+      device_map = values.ReplicaDeviceMap(('/device:CPU:0', '/device:GPU:0'))
+      x = values.DistributedValues(device_map, (a, b))
+      y = values.DistributedValues(device_map, (a, a))
+      # Removed device and input tensor dtype details from the error message
+      # since the order of the device and the corresponding input tensor dtype
+      # is not deterministic over different runs.
+      with self.assertRaisesRegexp(
+          ValueError, 'Input tensor dtypes do not match for '
+          'distributed tensor inputs '
+          'DistributedValues:.+'):
+        with distribution.scope():
+          distributed_training_utils.validate_distributed_dataset_inputs(
+              distribution, x, y)
+
+  @combinations.generate(
+      combinations.combine(
+          distribution=[
+              combinations.mirrored_strategy_with_gpu_and_cpu,
+              combinations.core_mirrored_strategy_with_gpu_and_cpu
+          ],
+          mode=['graph', 'eager']))
+  def test_unsupported_features(self, distribution):
+    with self.cached_session():
+      with distribution.scope():
+        model = keras_test_lib.get_model()
+        optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+        loss = 'mse'
+        metrics = ['mae']
+        model.compile(optimizer, loss, metrics=metrics)
+
+      dataset = keras_test_lib.get_dataset(distribution)
+
+      # Test with validation split
+      with self.assertRaisesRegexp(
+          ValueError, '`validation_split` argument is not '
+          'supported when input `x` is a dataset or a '
+          'dataset iterator.+'):
+        model.fit(
+            dataset,
+            epochs=1,
+            steps_per_epoch=2,
+            verbose=0,
+            validation_split=0.5,
+            validation_steps=2)
+
+      # Test with sample weight.
+      sample_weight = np.random.random((10,))
+      with self.assertRaisesRegexp(
+          ValueError, '`sample_weight` argument is not supported when input '
+          '`x` is a dataset or a dataset iterator.'):
+        model.fit(
+            dataset,
+            epochs=1,
+            steps_per_epoch=2,
+            verbose=0,
+            sample_weight=sample_weight)
+
+      # Test with not specifying the `steps` argument for dataset with infinite
+      # cardinality.
+      dataset = dataset.repeat()
+      with self.assertRaisesRegexp(
+          ValueError, 'When passing an infinitely '
+          'repeating dataset, you must specify the '
+          '`steps_per_epoch` argument'):
+        model.fit(dataset, epochs=1, verbose=0)
+      with self.assertRaisesRegexp(
+          ValueError, 'When passing an infinitely '
+          'repeating dataset, you must specify the '
+          '`steps` argument'):
+        model.evaluate(dataset, verbose=0)
+
+      with self.assertRaisesRegexp(
+          ValueError, 'When passing an infinitely '
+          'repeating dataset, you must specify the '
+          '`steps` argument'):
+        model.predict(dataset, verbose=0)
+
+  @combinations.generate(
+      combinations.combine(
+          distribution=[
+              combinations.mirrored_strategy_with_gpu_and_cpu,
+              combinations.core_mirrored_strategy_with_gpu_and_cpu
+          ],
+          mode=['graph', 'eager']))
+  def test_calling_with_unsupported_predefined_callbacks(self, distribution):
+    with self.cached_session():
+      with distribution.scope():
+        model = keras_test_lib.get_model()
+        optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+        loss = 'mse'
+        metrics = ['mae']
+        model.compile(optimizer, loss, metrics=metrics)
+
+      dataset = keras_test_lib.get_dataset(distribution)
+
+      def schedule(_):
+        return 0.001
+
+      with self.assertRaisesRegexp(
+          ValueError, 'You must specify a Keras Optimizer V2 when '
+          'using'):
+        model.fit(
+            dataset,
+            epochs=1,
+            steps_per_epoch=2,
+            verbose=0,
+            callbacks=[keras.callbacks.LearningRateScheduler(schedule)])
+
+      with self.assertRaisesRegexp(
+          ValueError, 'You must specify a Keras Optimizer V2 when '
+          'using'):
+        model.fit(
+            dataset,
+            epochs=1,
+            steps_per_epoch=2,
+            verbose=0,
+            callbacks=[keras.callbacks.ReduceLROnPlateau()])
+
+
+class TestDistributionStrategyWithLossMasking(test.TestCase,
+                                              parameterized.TestCase):
+
+  # TODO(priyag): Enable all strategies for this test. Currently it does not
+  # work for TPU due to some invalid datatype.
+  @combinations.generate(
+      combinations.combine(
+          distribution=[
+              combinations.mirrored_strategy_with_gpu_and_cpu,
+              combinations.core_mirrored_strategy_with_gpu_and_cpu
+          ],
+          mode=['graph', 'eager']))
+  def test_masking(self, distribution):
+    with self.cached_session():
+      np.random.seed(1337)
+      x = np.array([[[1], [1]], [[0], [0]]])
+      with distribution.scope():
+        model = keras.models.Sequential()
+        model.add(keras.layers.Masking(mask_value=0, input_shape=(2, 1)))
+        model.add(
+            keras.layers.TimeDistributed(
+                keras.layers.Dense(1, kernel_initializer='one')))
+        model.compile(
+            loss='mse',
+            optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+      y = np.array([[[1], [1]], [[1], [1]]])
+      dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
+      dataset = dataset.repeat(100)
+      dataset = dataset.batch(10)
+      hist = model.fit(x=dataset, epochs=1, steps_per_epoch=2)
+      self.assertEqual(hist.history['loss'][0], 0)
+
+
+class TestDistributionStrategyWithNormalizationLayer(test.TestCase,
+                                                     parameterized.TestCase):
+
+  @combinations.generate(
+      combinations.times(keras_test_lib.all_strategy_combinations(),
+                         combinations.combine(fused=[True, False])))
+  def test_batchnorm_correctness(self, distribution, fused):
+    with self.cached_session():
+      with distribution.scope():
+        model = keras.models.Sequential()
+        norm = keras.layers.BatchNormalization(
+            input_shape=(10,), momentum=0.8, fused=fused)
+        model.add(norm)
+        model.compile(
+            loss='mse',
+            optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+
+      # centered on 5.0, variance 10.0
+      x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 10))
+      x = x.astype('float32')
+      dataset = dataset_ops.Dataset.from_tensor_slices((x, x))
+      dataset = dataset.repeat(100)
+      dataset = keras_test_lib.batch_wrapper(dataset, 32, distribution)
+
+      predict_dataset = dataset_ops.Dataset.from_tensor_slices(x)
+      predict_dataset = predict_dataset.repeat(100)
+      predict_dataset = keras_test_lib.batch_wrapper(predict_dataset, 32,
+                                                     distribution)
+
+      model.fit(dataset, epochs=4, verbose=0, steps_per_epoch=10)
+      out = model.predict(predict_dataset, steps=2)
+      out -= keras.backend.eval(norm.beta)
+      out /= keras.backend.eval(norm.gamma)
+      np.testing.assert_allclose(out.mean(), 0.0, atol=1e-1)
+      np.testing.assert_allclose(out.std(), 1.0, atol=1e-1)
+
+
+class TestDistributionStrategySaveLoadWeights(test.TestCase,
+                                              parameterized.TestCase):
+
+  @combinations.generate(
+      keras_test_lib.all_strategy_combinations_minus_default())
+  def test_save_load_h5(self, distribution):
+    with self.cached_session():
+      dataset = keras_test_lib.get_dataset(distribution)
+      with distribution.scope():
+        model = keras_test_lib.get_model()
+        model.compile(gradient_descent_keras.SGD(0.01), 'mse')
+        model.fit(dataset, epochs=1, steps_per_epoch=1)
+
+        weights_file = tempfile.mktemp('.h5')
+        model.save_weights(weights_file)
+
+        model_2 = keras_test_lib.get_model()
+        model_2.compile(gradient_descent_keras.SGD(0.01), 'mse')
+        model_2.load_weights(weights_file)
+        model_2.predict(
+            keras_test_lib.get_predict_dataset(distribution), steps=2)
+        model_2.fit(dataset, epochs=1, steps_per_epoch=1)
+
+  @combinations.generate(
+      keras_test_lib.all_strategy_combinations_minus_default())
+  def test_save_load_trackable(self, distribution):
+    # TODO(sourabhbajaj): Test fails with optimizer v2 without h5
+    with self.cached_session():
+      dataset = keras_test_lib.get_dataset(distribution)
+      with distribution.scope():
+        model = keras_test_lib.get_model()
+        model.compile(gradient_descent.GradientDescentOptimizer(0.01), 'mse')
+        model.fit(dataset, epochs=1, steps_per_epoch=1)
+
+        weights_file = tempfile.mktemp()
+        model.save_weights(weights_file)
+
+        model_2 = keras_test_lib.get_model()
+        model_2.compile(gradient_descent.GradientDescentOptimizer(0.01), 'mse')
+        model_2.load_weights(weights_file)
+        model_2.predict(
+            keras_test_lib.get_predict_dataset(distribution), steps=2)
+        model_2.fit(dataset, epochs=1, steps_per_epoch=1)
+
+
+class TestDistributionStrategyValidation(test.TestCase, parameterized.TestCase):
+
+  @combinations.generate(
+      keras_test_lib.all_strategy_combinations_minus_default())
+  def test_layer_outside_scope(self, distribution):
+    with self.cached_session():
+      with self.assertRaisesRegexp(
+          ValueError, 'was not created in the distribution strategy'):
+        x = keras.layers.Input(shape=(3,), name='input')
+        y = keras.layers.Dense(4, name='dense')(x)
+        with distribution.scope():
+          model = keras.Model(x, y)
+          optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+          loss = 'mse'
+          metrics = ['mae', keras.metrics.CategoricalAccuracy()]
+          model.compile(optimizer, loss, metrics=metrics)
+
+  @combinations.generate(
+      keras_test_lib.all_strategy_combinations_minus_default())
+  def test_model_outside_scope(self, distribution):
+    with self.cached_session():
+      with self.assertRaisesRegexp(
+          ValueError, 'was not created in the distribution strategy'):
+        x = keras.layers.Input(shape=(3,), name='input')
+        y = keras.layers.Dense(4, name='dense')(x)
+        model = keras.Model(x, y)
+        with distribution.scope():
+          optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+          loss = 'mse'
+          metrics = ['mae', keras.metrics.CategoricalAccuracy()]
+          model.compile(optimizer, loss, metrics=metrics)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/metrics_v1_test.py b/tensorflow/contrib/distribute/python/metrics_v1_test.py
index 8ac659abe96370b751ed1556cc699fe20788a0fd..a663e809dd45ea099e1d8a08e681d07b05bee3c9 100644
--- a/tensorflow/contrib/distribute/python/metrics_v1_test.py
+++ b/tensorflow/contrib/distribute/python/metrics_v1_test.py
@@ -95,16 +95,15 @@ class MetricsV1Test(test.TestCase, parameterized.TestCase):
 
   def _test_metric(self, distribution, dataset_fn, metric_fn, expected_fn):
     with ops.Graph().as_default(), distribution.scope():
-      iterator = distribution.distribute_dataset(
-          dataset_fn).make_initializable_iterator()
+      iterator = distribution.make_input_fn_iterator(lambda _: dataset_fn())
       if isinstance(distribution, tpu_strategy.TPUStrategy):
         def step_fn(ctx, inputs):
-          value, update = distribution.call_for_each_replica(
-              metric_fn, args=inputs)
+          value, update = distribution.extended.call_for_each_replica(
+              metric_fn, args=(inputs,))
           ctx.set_non_tensor_output(name="value", output=value)
           return distribution.group(update)
 
-        ctx = distribution.run_steps_on_dataset(
+        ctx = distribution.extended.experimental_run_steps_on_iterator(
             step_fn, iterator, iterations=distribution.extended.steps_per_run)
         update = ctx.run_op
         value = ctx.non_tensor_outputs["value"]
@@ -114,15 +113,14 @@ class MetricsV1Test(test.TestCase, parameterized.TestCase):
             distribution.num_replicas_in_sync *
             distribution.extended.steps_per_run)
       else:
-        value, update = distribution.call_for_each_replica(
-            metric_fn, iterator.get_next())
+        value, update = distribution.extended.call_for_each_replica(
+            metric_fn, args=(iterator.get_next(),))
         update = distribution.group(update)
         # TODO(josh11b): Once we switch to using a global batch size for input,
         # replace "distribution.num_replicas_in_sync" with "1".
         batches_per_update = distribution.num_replicas_in_sync
 
-      self.evaluate(iterator.initializer)
-      self.evaluate(distribution.initialize())
+      self.evaluate(iterator.initialize())
       self.evaluate(variables.local_variables_initializer())
 
       batches_consumed = 0
@@ -136,8 +134,6 @@ class MetricsV1Test(test.TestCase, parameterized.TestCase):
         if batches_consumed >= 4:  # Consume 4 input batches in total.
           break
 
-      self.evaluate(distribution.finalize())
-
   @combinations.generate(all_combinations() + tpu_combinations())
   def testMean(self, distribution):
     def _dataset_fn():
diff --git a/tensorflow/contrib/distribute/python/minimize_loss_test.py b/tensorflow/contrib/distribute/python/minimize_loss_test.py
index f09483cb56b66fd4720ee71085203c14f1ccadc3..f06c9b75644b2890b7657f75e74e4e20a6f15705 100644
--- a/tensorflow/contrib/distribute/python/minimize_loss_test.py
+++ b/tensorflow/contrib/distribute/python/minimize_loss_test.py
@@ -41,12 +41,9 @@ from tensorflow.python.ops.losses import losses_impl
 
 class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
 
-  def _get_iterator(self, ds):
-    if context.executing_eagerly():
-      iterator = ds.make_one_shot_iterator()
-    else:
-      iterator = ds.make_initializable_iterator()
-      self.evaluate(iterator.initializer)
+  def _get_iterator(self, strategy, input_fn):
+    iterator = strategy.make_input_fn_iterator(lambda _: input_fn())
+    self.evaluate(iterator.initialize())
     return iterator
 
   @combinations.generate(
@@ -67,15 +64,15 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       def step_fn(ctx, inputs):
         del ctx  # Unused
         return distribution.group(
-            distribution.call_for_each_replica(model_fn, args=inputs))
+            distribution.extended.call_for_each_replica(
+                model_fn, args=(inputs,)))
 
-      iterator = self._get_iterator(distribution.distribute_dataset(dataset_fn))
+      iterator = self._get_iterator(distribution, dataset_fn)
 
       def run_step():
-        return distribution.run_steps_on_dataset(
+        return distribution.extended.experimental_run_steps_on_iterator(
             step_fn, iterator, iterations=2).run_op
 
-      self.evaluate(distribution.initialize())
       if not context.executing_eagerly():
         with self.cached_session() as sess:
           run_step = sess.make_callable(run_step())
@@ -84,12 +81,9 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       weights, biases = [], []
       for _ in range(5):
         run_step()
-
         weights.append(self.evaluate(layer.kernel))
         biases.append(self.evaluate(layer.bias))
 
-      self.evaluate(distribution.finalize())
-
       error = abs(numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1)
       is_not_increasing = all(y <= x for x, y in zip(error, error[1:]))
       self.assertTrue(is_not_increasing)
@@ -105,11 +99,11 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       model_fn, dataset_fn, layer = minimize_loss_example(
           optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss)
 
-      iterator = self._get_iterator(distribution.distribute_dataset(dataset_fn))
+      iterator = self._get_iterator(distribution, dataset_fn)
 
       def run_step():
         return distribution.group(
-            distribution.call_for_each_replica(
+            distribution.extended.call_for_each_replica(
                 model_fn, args=(iterator.get_next(),)))
 
       if not context.executing_eagerly():
@@ -152,7 +146,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
     # `distribution.scope`.
     with variable_scope.variable_creator_scope(
         appending_creator), distribution.scope():
-      model_fn, dataset_fn, layer = minimize_loss_example(
+      model_fn, dataset_fn, _ = minimize_loss_example(
           optimizer_fn,
           use_bias=True,
           use_callable_loss=True,
@@ -161,24 +155,21 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       def step_fn(ctx, inputs):
         del ctx  # Unused
         return distribution.group(
-            distribution.call_for_each_replica(model_fn, args=inputs))
+            distribution.extended.call_for_each_replica(
+                model_fn, args=(inputs,)))
 
-      iterator = self._get_iterator(distribution.distribute_dataset(dataset_fn))
+      iterator = self._get_iterator(distribution, dataset_fn)
 
       def run_step():
-        return distribution.run_steps_on_dataset(
+        return distribution.extended.experimental_run_steps_on_iterator(
             step_fn, iterator, iterations=1).run_op
 
-      self.evaluate(distribution.initialize())
       if not context.executing_eagerly():
         with self.cached_session() as sess:
           run_step = sess.make_callable(run_step())
       self.evaluate(variables_lib.global_variables_initializer())
-
       run_step()
 
-      self.evaluate(distribution.finalize())
-
       def get_expected_variables(optimizer_fn, num_parameter_devices):
         variables_map = {
             "GradientDescent": ["dense/kernel", "dense/bias"],
@@ -197,7 +188,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
 
       self.assertEqual(
           get_expected_variables(optimizer_fn,
-                                 len(distribution.parameter_devices)),
+                                 len(distribution.extended.parameter_devices)),
           set(created_variables))
 
   @combinations.generate(
@@ -230,18 +221,18 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       def step_fn(ctx, inputs):
         del ctx  # Unused
         fetches = distribution.unwrap(
-            distribution.call_for_each_replica(model_fn, args=inputs))
+            distribution.extended.call_for_each_replica(
+                model_fn, args=(inputs,)))
         if update_ops_in_cross_replica_mode:
           fetches += tuple(ops.get_collection(ops.GraphKeys.UPDATE_OPS))
         return control_flow_ops.group(fetches)
 
-      iterator = self._get_iterator(distribution.distribute_dataset(dataset_fn))
+      iterator = self._get_iterator(distribution, dataset_fn)
 
       def run_step():
-        return distribution.run_steps_on_dataset(
+        return distribution.extended.experimental_run_steps_on_iterator(
             step_fn, iterator, iterations=1).run_op
 
-      self.evaluate(distribution.initialize())
       if not context.executing_eagerly():
         with self.cached_session() as sess:
           run_step = sess.make_callable(run_step())
@@ -267,8 +258,6 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
               expected_moving_mean - averaged_batch_mean(i)) * (1.0 - momentum))
           self.assertNear(expected_moving_means[i], moving_means[i], 0.0001)
 
-      self.evaluate(distribution.finalize())
-
   @combinations.generate(
       combinations.times(
           combinations.combine(
@@ -302,8 +291,8 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
     with distribution.scope():
       all_vars = []
 
-      def model_fn(x, y):
-
+      def model_fn(inputs):
+        x, y = inputs
         def loss_fn():
           # Use fixed initialization to make the steps deterministic.
           w = variable_scope.get_variable("w", initializer=[[2.]])
@@ -327,15 +316,15 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       def step_fn(ctx, inputs):
         del ctx  # Unused
         return distribution.group(
-            distribution.call_for_each_replica(model_fn, args=inputs))
+            distribution.extended.call_for_each_replica(
+                model_fn, args=(inputs,)))
 
-      iterator = self._get_iterator(distribution.distribute_dataset(dataset_fn))
+      iterator = self._get_iterator(distribution, dataset_fn)
 
       def run_step():
-        return distribution.run_steps_on_dataset(
+        return distribution.extended.experimental_run_steps_on_iterator(
             step_fn, iterator, iterations=1).run_op
 
-      self.evaluate(distribution.initialize())
       if not context.executing_eagerly():
         with self.cached_session() as sess:
           run_step = sess.make_callable(run_step())
@@ -370,8 +359,6 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
         # One of the mean loss reductions.
         self.assertNear(weight, 2 + 10.6, 0.0001)
 
-      self.evaluate(distribution.finalize())
-
   @combinations.generate(
       combinations.times(
           combinations.distributions_and_v1_optimizers(),
@@ -412,8 +399,8 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
         return (train_op, loss)
 
       def step_fn(output_context, inputs):
-        (train_op, loss) = distribution.call_for_each_replica(
-            model_fn, args=(output_context,) + inputs)
+        (train_op, loss) = distribution.extended.call_for_each_replica(
+            model_fn, args=(output_context, inputs))
         output_context.set_last_step_output(
             name="cross_replica_loss_reduced",
             output=loss,
@@ -423,7 +410,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
             output=loss)
         return distribution.group(train_op)
 
-      iterator = self._get_iterator(distribution.distribute_dataset(dataset_fn))
+      iterator = self._get_iterator(distribution, dataset_fn)
 
       def run_step():
         initial_loss = lambda: constant_op.constant(1e7)
@@ -439,7 +426,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
             "cross_replica_loss_not_reduced":
             distribution.unwrap(distribution.broadcast(initial_loss()))
         }
-        ctx = distribution.run_steps_on_dataset(
+        ctx = distribution.extended.experimental_run_steps_on_iterator(
             step_fn, iterator, iterations=2,
             initial_loop_values=initial_loop_values)
 
@@ -458,7 +445,6 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
             reduced=False, distribution=distribution)
         return (ctx.run_op, ctx.last_step_outputs["replica_loss_reduced"])
 
-      self.evaluate(distribution.initialize())
       if not context.executing_eagerly():
         with self.cached_session() as sess:
           run_step = sess.make_callable(run_step())
@@ -471,8 +457,6 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
         weights.append(self.evaluate(layer.kernel))
         biases.append(self.evaluate(layer.bias))
 
-      self.evaluate(distribution.finalize())
-
       loss_is_not_increasing = all(y <= x for x, y in zip(losses, losses[1:]))
       self.assertTrue(loss_is_not_increasing)
 
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy.py b/tensorflow/contrib/distribute/python/mirrored_strategy.py
index 20f1a08d4261b931a9353738147fba7d7dff9225..5391e083fc9b3ed99cc64bbed11bdeb8dea07f93 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy.py
@@ -18,17 +18,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import functools
-
-from tensorflow.python.distribute import device_util
 from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import input_lib
 from tensorflow.python.distribute import mirrored_strategy
-from tensorflow.python.distribute import values
 
 
 # pylint: disable=protected-access,invalid-name
 _call_for_each_replica = mirrored_strategy._call_for_each_replica
-_reduce_non_distributed_value = mirrored_strategy._reduce_non_distributed_value
 _create_mirrored_variable = mirrored_strategy._create_mirrored_variable
 all_local_devices = mirrored_strategy.all_local_devices
 CoreMirroredStrategy = mirrored_strategy.MirroredStrategy
@@ -50,8 +46,8 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
   distributed environment.
 
   There are several important concepts for distributed TensorFlow, e.g.
-  `client`, `job`, 'task', `cluster`, `in-graph replication` and
-  'synchronous training' and they have already been defined in the
+  `client`, `job`, `task`, `cluster`, `in-graph replication` and
+  `synchronous training` and they have already been defined in the
   [TensorFlow's documentation](https://www.tensorflow.org/deploy/distributed).
   The distribution strategy inherits these concepts as well and in addition to
   that we also clarify several more concepts:
@@ -106,6 +102,61 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
                                 auto_shard_dataset)
     super(MirroredStrategy, self).__init__(extended)
 
+  # Override to change the documentation to reflect the different handling of
+  # global vs. local batch size between core and contrib.
+  def make_dataset_iterator(self, dataset):  # pylint: disable=useless-super-delegation
+    """Makes an iterator for input provided via `dataset`.
+
+    NOTE: The batch size of the `dataset` argument is treated differently for
+    this contrib version of `MirroredStrategy`.
+
+    Data from the given dataset will be distributed evenly across all the
+    compute replicas. We will assume that the input dataset is batched by the
+    per-replica batch size.
+
+    The user could also use `make_input_fn_iterator` if they want to
+    customize which input is fed to which replica/worker etc.
+
+    Args:
+      dataset: `tf.data.Dataset` that will be distributed evenly across all
+        replicas.
+
+    Returns:
+      An `tf.distribute.InputIterator` which returns inputs for each step of the
+      computation.  User should call `initialize` on the returned iterator.
+    """
+    return super(MirroredStrategy, self).make_dataset_iterator(dataset)
+
+  # Override to change the documentation to reflect the different handling of
+  # global vs. local batch size between core and contrib.
+  def experimental_make_numpy_iterator(  # pylint: disable=useless-super-delegation
+      self, numpy_input, batch_size, num_epochs=1, shuffle=1024, session=None):
+    """Makes an iterator for input provided via a nest of numpy arrays.
+
+    NOTE: The `batch_size` argument here has different behavior for this
+    contrib version of `MirroredStrategy`.
+
+    Args:
+      numpy_input: A nest of NumPy input arrays that will be distributed evenly
+        across all replicas.
+      batch_size: The number of entries from the array we should consume in one
+        step of the computation, across all replicas. This is the per-replica
+        batch size. The global batch size will be this times
+        `num_replicas_in_sync`.
+      num_epochs: The number of times to iterate through the examples. A value
+        of `None` means repeat forever.
+      shuffle: Size of buffer to use for shuffling the input examples.
+        Use `None` to disable shuffling.
+      session: (TensorFlow v1.x graph execution only) A session used for
+        initialization.
+
+    Returns:
+      An `tf.distribute.InputIterator` which returns inputs for each step of the
+      computation.  User should call `initialize` on the returned iterator.
+    """
+    return super(MirroredStrategy, self).experimental_make_numpy_iterator(
+        numpy_input, batch_size, num_epochs, shuffle, session)
+
 
 class MirroredExtended(CoreMirroredExtended):
   """Implementation of (contrib) MirroredStrategy."""
@@ -137,24 +188,10 @@ class MirroredExtended(CoreMirroredExtended):
     Returns:
       An `InputIterator` which returns inputs for each step of the computation.
     """
-    if self._local_mode:
-      worker = device_util.canonicalize("/device:CPU:0")
-      worker_device_pairs = [(worker, self._devices)]
-    else:
-      worker_device_pairs = self._worker_devices
-    return values.DatasetIterator(dataset, worker_device_pairs)
-
-  def _distribute_dataset(self, dataset_fn):
-    if self._local_mode:
-      return values.PerReplicaDataset(
-          self._call_dataset_fn(dataset_fn), self._devices)
-    else:
-      return values.MultiWorkerDataset(
-          functools.partial(self._call_dataset_fn, dataset_fn),
-          self._worker_devices,
-          auto_shard=self._auto_shard_dataset)
+    return input_lib.DatasetIterator(dataset, self._input_workers)
 
   # TODO(priyag): Delete this once all strategies use global batch size.
   @property
   def _global_batch_size(self):
+    """The contrib version of Mirrored strategy uses per-replica batch size."""
     return False
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
index 36be5c83f8bafb6c934d1d7682b5227b1f71c089..5ce731816ccefe36c1f876c79589e448f00b86f5 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
@@ -38,8 +38,8 @@ from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import func_graph
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import func_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.keras.engine import training as keras_training
 from tensorflow.python.keras.layers import core as keras_core
@@ -66,8 +66,10 @@ GPU_TEST = "test_gpu" in sys.argv[0]
         combinations.core_mirrored_strategy_with_gpu_and_cpu,
         combinations.core_mirrored_strategy_with_two_gpus],
     mode=["graph", "eager"]))
-class MirroredTwoDeviceDistributionTest(strategy_test_lib.DistributionTestBase,
-                                        parameterized.TestCase):
+class MirroredTwoDeviceDistributionTest(
+    strategy_test_lib.DistributionTestBase,
+    strategy_test_lib.TwoDeviceDistributionTestBase,
+    parameterized.TestCase):
 
   def testMinimizeLoss(self, distribution):
     if context.executing_eagerly():
@@ -101,7 +103,7 @@ class MirroredTwoDeviceDistributionTest(strategy_test_lib.DistributionTestBase,
       expected = sum(range(distribution.num_replicas_in_sync))
       self.assertEqual(expected, self.evaluate(reduced))
 
-  def testMakeInputFnIterator(self, distribution):
+  def testMakeInputFnIteratorWithDataset(self, distribution):
     dataset_fn = lambda: dataset_ops.Dataset.range(10)
     expected_values = [[i, i+1] for i in range(0, 10, 2)]
 
@@ -114,9 +116,48 @@ class MirroredTwoDeviceDistributionTest(strategy_test_lib.DistributionTestBase,
     self._test_input_fn_iterator(iterator, distribution.extended.worker_devices,
                                  expected_values)
 
+  # TODO(b/124344198): Re-enable after fixing this flaky test.
+  def DISABLED_testMakeInputFnIteratorWithCallable(self, distribution):
+    def fn():
+      dataset = dataset_ops.Dataset.range(2).interleave(
+          (lambda _: dataset_ops.Dataset.range(10)), cycle_length=2)
+      it = dataset.make_one_shot_iterator()
+      return it.get_next
+    expected_values = [[i, i] for i in range(0, 10)]
+
+    input_fn = self._input_fn_to_test_input_context(
+        fn,
+        expected_num_replicas_in_sync=2,
+        expected_num_input_pipelines=1,
+        expected_input_pipeline_id=0)
+    iterator = distribution.make_input_fn_iterator(input_fn)
+    self._test_input_fn_iterator(iterator, distribution.extended.worker_devices,
+                                 expected_values, test_reinitialize=False)
+
+  def testNumpyIterator(self, distribution):
+    self._test_numpy_iterator(distribution)
+
   def testGlobalStepUpdate(self, distribution):
     self._test_global_step_update(distribution)
 
+  def testAllReduceSum(self, distribution):
+    self._test_all_reduce_sum(distribution)
+
+  def testAllReduceSumGradients(self, distribution):
+    self._test_all_reduce_sum_gradients(distribution)
+
+  def testAllReduceSumGradientTape(self, distribution):
+    self._test_all_reduce_sum_gradient_tape(distribution)
+
+  def testAllReduceMean(self, distribution):
+    self._test_all_reduce_mean(distribution)
+
+  def testAllReduceMeanGradients(self, distribution):
+    self._test_all_reduce_mean_gradients(distribution)
+
+  def testAllReduceMeanGradientTape(self, distribution):
+    self._test_all_reduce_mean_gradient_tape(distribution)
+
 
 def one_device_combinations():
   return combinations.combine(
@@ -128,25 +169,42 @@ def one_device_combinations():
       mode=["graph", "eager"])
 
 
+@combinations.generate(one_device_combinations())
 class MirroredOneDeviceDistributionTest(
     strategy_test_lib.DistributionTestBase,
+    strategy_test_lib.OneDeviceDistributionTestBase,
     parameterized.TestCase):
 
-  @combinations.generate(one_device_combinations())
   def testMinimizeLoss(self, distribution):
     if context.executing_eagerly():
       self._test_minimize_loss_eager(distribution)
     else:
       self._test_minimize_loss_graph(distribution)
 
-  @combinations.generate(one_device_combinations())
   def testReplicaId(self, distribution):
     self._test_replica_id(distribution)
 
-  @combinations.generate(one_device_combinations())
   def testCallAndMergeExceptions(self, distribution):
     self._test_call_and_merge_exceptions(distribution)
 
+  def testAllReduceSum(self, distribution):
+    self._test_all_reduce_sum(distribution)
+
+  def testAllReduceSumGradients(self, distribution):
+    self._test_all_reduce_sum_gradients(distribution)
+
+  def testAllReduceSumGradientTape(self, distribution):
+    self._test_all_reduce_sum_gradient_tape(distribution)
+
+  def testAllReduceMean(self, distribution):
+    self._test_all_reduce_mean(distribution)
+
+  def testAllReduceMeanGradients(self, distribution):
+    self._test_all_reduce_mean_gradients(distribution)
+
+  def testAllReduceMeanGradientTape(self, distribution):
+    self._test_all_reduce_mean_gradient_tape(distribution)
+
 
 class MirroredStrategyVariableCreatorStackTest(
     test.TestCase, parameterized.TestCase):
@@ -183,6 +241,34 @@ class MirroredStrategyVariableCreatorStackTest(
       expected = ("main_thread:thread_0", "main_thread:thread_1")
       self.assertEqual(expected, result)
 
+@combinations.generate(combinations.combine(
+    distribution=[
+        combinations.mirrored_strategy_with_gpu_and_cpu,
+        combinations.core_mirrored_strategy_with_gpu_and_cpu],
+    mode=["graph", "eager"]))
+class MirroredStrategyCallForEachReplicaTest(test.TestCase):
+
+  def testExecutingEagerlyOutsideFunction(self, distribution):
+    """Verify we preserve the value of executing_eagerly_outside_functions()."""
+    def model_fn():
+      return ops.executing_eagerly_outside_functions()
+
+    originally = ops.executing_eagerly_outside_functions()
+    with distribution.scope():
+      in_scope = ops.executing_eagerly_outside_functions()
+      in_model_fn = distribution.extended.call_for_each_replica(model_fn)
+      unwrapped = distribution.unwrap(in_model_fn)
+      self.assertEqual(in_scope, unwrapped[0])
+      self.assertEqual(in_scope, originally)
+
+    # Verify this all again, but this time in a FuncGraph.
+    with func_graph.FuncGraph("fg").as_default(), distribution.scope():
+      in_scope = ops.executing_eagerly_outside_functions()
+      in_model_fn = distribution.extended.call_for_each_replica(model_fn)
+      unwrapped = distribution.unwrap(in_model_fn)
+      self.assertEqual(in_scope, unwrapped[0])
+      self.assertEqual(in_scope, originally)
+
 
 @combinations.generate(combinations.combine(
     distribution=[
@@ -193,11 +279,13 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
 
   # TODO(priyag): Modify more tests to use this helper and check more
   # properties.
-  def _test_mv_properties(self, var, name):
+  def _test_mv_properties(self, var, name, strategy):
     self.assertIsInstance(var, values.MirroredVariable)
     self.assertEqual(name, var.name)
+    self.assertIs(strategy, var.distribute_strategy)
     for d in var.devices:
       self.assertEqual(d, var.get(d).device)
+      self.assertIs(strategy, var.get(d)._distribute_strategy)  # pylint: disable=protected-access
 
   def testVariableInFuncGraph(self, distribution):
     def model_fn():
@@ -209,8 +297,8 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
       v1 = variable_scope.variable(1.0, name="foo")
       v2 = distribution.extended.call_for_each_replica(model_fn)
 
-    self._test_mv_properties(v1, "foo:0")
-    self._test_mv_properties(v2, "bar:0")
+    self._test_mv_properties(v1, "foo:0", distribution)
+    self._test_mv_properties(v2, "bar:0", distribution)
 
   def testSingleVariable(self, distribution):
     def model_fn():
@@ -223,7 +311,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
 
     with distribution.scope():
       result = distribution.extended.call_for_each_replica(model_fn)
-      self._test_mv_properties(result, "foo:0")
+      self._test_mv_properties(result, "foo:0", distribution)
 
   def testUnnamedVariable(self, distribution):
     def model_fn():
@@ -233,7 +321,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
 
     with distribution.scope():
       result = distribution.extended.call_for_each_replica(model_fn)
-      self._test_mv_properties(result, "Variable:0")
+      self._test_mv_properties(result, "Variable:0", distribution)
 
   def testMultipleVariables(self, distribution):
     def model_fn():
@@ -246,7 +334,7 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
     with distribution.scope():
       result = distribution.extended.call_for_each_replica(model_fn)
       for i, v in enumerate(result):
-        self._test_mv_properties(v, "foo" + str(i) + ":0")
+        self._test_mv_properties(v, "foo" + str(i) + ":0", distribution)
 
   def testMultipleVariablesWithSameCanonicalName(self, distribution):
     def model_fn():
@@ -296,14 +384,9 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
                 (layer2.kernel, layer2.bias),
                 (layer3.kernel, layer3.bias)]
 
-    ds = distribution.distribute_dataset(
-        lambda: dataset_ops.Dataset.from_tensors([[1.]]).repeat(10))
-    if context.executing_eagerly():
-      iterator = ds.make_one_shot_iterator()
-    else:
-      iterator = ds.make_initializable_iterator()
-      self.evaluate([iterator.initializer])
-
+    iterator = distribution.make_input_fn_iterator(
+        lambda _: dataset_ops.Dataset.from_tensors([[1.]]).repeat(10))
+    self.evaluate(iterator.initialize())
     features = iterator.get_next()
 
     with distribution.scope():
@@ -524,16 +607,15 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
             aggregation="invalid")
 
   def testNonMatchingVariableCreation(self, distribution):
+    self.skipTest("b/123075960")
     def model_fn(name):
       v = variable_scope.variable(1.0, name=name)
       ds_context.get_replica_context().merge_call(lambda _: _)
       return v
 
     with distribution.scope():
-      names = values.DistributedValues({
-          "/device:CPU:0": "foo",
-          "/device:GPU:0": "bar"
-      })
+      device_map = values.ReplicaDeviceMap(("/device:CPU:0", "/device:GPU:0"))
+      names = values.DistributedValues(device_map, ("foo", "bar"))
       with self.assertRaises(RuntimeError):
         _ = distribution.extended.call_for_each_replica(model_fn, args=(names,))
 
@@ -667,6 +749,15 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
           distribution.extended.worker_devices[0]).read_value()))
       self.assertEqual(10.0, self.evaluate(ret_v_sum))
 
+  def testVarDistributeStrategy(self, distribution):
+    with distribution.scope():
+      mirrored = variable_scope.variable(1.0)
+      replica_local = variable_scope.variable(
+          1.0,
+          synchronization=variable_scope.VariableSynchronization.ON_READ)
+      self.assertIs(distribution, mirrored.distribute_strategy)
+      self.assertIs(distribution, replica_local.distribute_strategy)
+
 
 @combinations.generate(combinations.combine(
     distribution=[
@@ -1095,7 +1186,7 @@ class ReplicaLocalVariableAssignTest(test.TestCase):
       # When we read the value using `read_var` we should see the SUM of each of
       # values on each of the replicas.
       self.assertEqual(2.0, self.evaluate(
-          distribution.read_var(replica_local_var)))
+          distribution.extended.read_var(replica_local_var)))
       # Assigning 6.0 in cross replica context will assign a value of
       # 6.0/num_replicas to each replica.
       tlv_ops = replica_local_var.assign(6.0)
@@ -1104,7 +1195,7 @@ class ReplicaLocalVariableAssignTest(test.TestCase):
       # The value on all the replicas are added before being returned by
       # `read_var`.
       self.assertEqual(6.0, self.evaluate(
-          distribution.read_var(replica_local_var)))
+          distribution.extended.read_var(replica_local_var)))
 
   def testAssignReplicaLocalVarMeanAggregation(self, distribution):
     def model_fn():
@@ -1123,13 +1214,13 @@ class ReplicaLocalVariableAssignTest(test.TestCase):
       # When we read the value using `read_var` we should see the MEAN of values
       # on all replicas which is the value assigned in replica context.
       self.assertEqual(1.0, self.evaluate(
-          distribution.read_var(replica_local_var)))
+          distribution.extended.read_var(replica_local_var)))
       tlv_ops = replica_local_var.assign(6.0)
       self.evaluate(tlv_ops)
       # On reading the replica local var we should get the MEAN of all values
       # which is equal to the value assigned.
       self.assertEqual(6.0, self.evaluate(
-          distribution.read_var(replica_local_var)))
+          distribution.extended.read_var(replica_local_var)))
 
 
 class MockModel(object):
@@ -1182,14 +1273,14 @@ class MirroredStrategyDefunTest(test.TestCase):
 
       result = distribution.extended.call_for_each_replica(
           model_fn, args=[mock_model] + inputs)
-      for device in devices:
-        device_result = values.select_device(device, result)
-        device_expected_result = values.select_device(device, expected_result)
+      for r in range(len(devices)):
+        device_result = values.select_replica(r, result)
+        device_expected_result = values.select_replica(r, expected_result)
         self.assertAllClose(device_expected_result,
                             self.evaluate(device_result))
 
       for defun in defuns:
-        # PolymorphicFunctions are specialized to the current device stack, so
+        # `Function`s are specialized to the current device stack, so
         # call_for_each has one trace per device. To check that the expected set
         # of variables was accessed on each trace, we first retrieve each
         # device-specific graph function.
@@ -1265,9 +1356,9 @@ class MirroredStrategyDefunTest(test.TestCase):
     def fn1(mock_model, factor):
       return mock_model(factor)
 
-    factors = values.PerReplica({"CPU:0": 5.0, "GPU:0": 3.0})
-    expected_result = values.PerReplica({"CPU:0": 5.0 * 1.25,
-                                         "GPU:0": 3.0 * 1.25})
+    device_map = values.ReplicaDeviceMap(("/device:CPU:0", "/device:GPU:0"))
+    factors = values.PerReplica(device_map, (5.0, 3.0))
+    expected_result = values.PerReplica(device_map, (5.0 * 1.25, 3.0 * 1.25))
     self._call_and_check(distribution, fn1, [factors], expected_result, [fn1])
 
   def testTrain(self, distribution):
@@ -1344,7 +1435,7 @@ class MultiWorkerMirroredStrategyTest(
       self.assertEqual(a.device, "/job:worker/task:0")
       self.assertEqual(b.device, "/job:worker/task:0/device:CPU:0")
 
-  def testMakeInputFnIterator(self, distribution):
+  def testMakeInputFnIteratorWithDataset(self, distribution):
     self._configure_distribution_strategy(distribution)
     dataset_fn = lambda: dataset_ops.Dataset.range(100)
     num_gpus = context.num_gpus()
@@ -1365,6 +1456,32 @@ class MultiWorkerMirroredStrategyTest(
       self._test_input_fn_iterator(
           iterator, distribution.extended.worker_devices, expected_values, sess)
 
+  def DISABLED_testMakeInputFnIteratorWithCallable(self, distribution):
+    self._configure_distribution_strategy(distribution)
+    def fn():
+      dataset = dataset_ops.Dataset.range(100)
+      it = dataset.make_one_shot_iterator()
+      return it.get_next
+    num_gpus = context.num_gpus()
+    num_workers = 2
+
+    expected_values = []
+    for i in range(0, 100, num_gpus):
+      expected_values.append([i+j for j in range(num_gpus)] * num_workers)
+
+    with context.graph_mode(), self.cached_session() as sess:
+      # `expected_input_pipeline_id` is None because the input_fn will be called
+      # multiple times, each with a different input_pipeline_id.
+      input_fn = self._input_fn_to_test_input_context(
+          fn,
+          expected_num_replicas_in_sync=num_workers*num_gpus,
+          expected_num_input_pipelines=num_workers,
+          expected_input_pipeline_id=None)
+      iterator = distribution.make_input_fn_iterator(input_fn)
+      self._test_input_fn_iterator(
+          iterator, distribution.extended.worker_devices, expected_values, sess,
+          test_reinitialize=False)
+
   def testUpdateConfigProto(self, distribution):
     distribution.configure(cluster_spec={"worker": ["fake1", "fake2"]})
 
diff --git a/tensorflow/contrib/distribute/python/monitor.py b/tensorflow/contrib/distribute/python/monitor.py
index 17b7ab74f63f42e1ee14a82d3bffdd1df9b25857..53e35ea6b75088a3de9866973f872da4a4ce25d6 100644
--- a/tensorflow/contrib/distribute/python/monitor.py
+++ b/tensorflow/contrib/distribute/python/monitor.py
@@ -51,7 +51,7 @@ class Monitor(object):
     else:
       if session is None:
         raise ValueError("Should provide a `session` in Graph mode.")
-      session.run(step_callable._iterator.initializer)  # pylint: disable=protected-access
+      session.run(step_callable.initialize())
       self._run_step = session.make_callable(step_callable())
       session.run(variables.global_variables_initializer())
 
diff --git a/tensorflow/contrib/distribute/python/monitor_test.py b/tensorflow/contrib/distribute/python/monitor_test.py
index 16be839e1d155003b9490fbe3da6ab85b7d2d78a..c0651610cafc06a6d5f4206f4e64d27020fae30b 100644
--- a/tensorflow/contrib/distribute/python/monitor_test.py
+++ b/tensorflow/contrib/distribute/python/monitor_test.py
@@ -23,9 +23,9 @@ import numpy
 
 from tensorflow.contrib.distribute.python import combinations
 from tensorflow.contrib.distribute.python import monitor as monitor_lib
-from tensorflow.contrib.distribute.python import one_device_strategy
 from tensorflow.contrib.distribute.python.single_loss_example import single_loss_example
 from tensorflow.python.client import session
+from tensorflow.python.distribute import one_device_strategy
 from tensorflow.python.eager import context
 from tensorflow.python.eager import test
 from tensorflow.python.framework import ops
diff --git a/tensorflow/contrib/distribute/python/moving_averages_test.py b/tensorflow/contrib/distribute/python/moving_averages_test.py
index 8f13e9153ea7a951dd722c4549882c97e79b57fe..c4622cdd2af2f6a9c936fe554bcc2eb76f805fdc 100644
--- a/tensorflow/contrib/distribute/python/moving_averages_test.py
+++ b/tensorflow/contrib/distribute/python/moving_averages_test.py
@@ -53,7 +53,7 @@ class AssignMovingAveragesTest(test.TestCase, parameterized.TestCase):
       return var, assign
 
     with distribution.scope(), self.cached_session() as sess:
-      var, assign = distribution.call_for_each_replica(replica_fn)
+      var, assign = distribution.extended.call_for_each_replica(replica_fn)
       variables.global_variables_initializer().run()
       self.assertAllClose([10.0, 11.0], var.eval())
       sess.run(distribution.unwrap(assign))
@@ -79,7 +79,7 @@ class AssignMovingAveragesTest(test.TestCase, parameterized.TestCase):
       return var, assign.op
 
     with distribution.scope(), self.cached_session() as sess:
-      var, assign_op = distribution.call_for_each_replica(replica_fn)
+      var, assign_op = distribution.extended.call_for_each_replica(replica_fn)
       variables.global_variables_initializer().run()
       self.assertAllClose([0.0, 0.0], var.eval())
       sess.run(distribution.unwrap(assign_op))
@@ -152,7 +152,7 @@ class AssignMovingAveragesTest(test.TestCase, parameterized.TestCase):
       return var, assign
 
     with distribution.scope(), self.cached_session() as sess:
-      var, assign = distribution.call_for_each_replica(replica_fn)
+      var, assign = distribution.extended.call_for_each_replica(replica_fn)
       variables.global_variables_initializer().run()
       self.assertAllClose([10.0, 11.0], var.eval())
       sess.run(distribution.unwrap(assign))
diff --git a/tensorflow/contrib/distribute/python/multi_worker_test_base.py b/tensorflow/contrib/distribute/python/multi_worker_test_base.py
index 147c9b83f866fd364ea23cf7988692a7b5f61b9c..7dca13a5b41d1a2db474c44c82f1da88be84df05 100644
--- a/tensorflow/contrib/distribute/python/multi_worker_test_base.py
+++ b/tensorflow/contrib/distribute/python/multi_worker_test_base.py
@@ -40,6 +40,7 @@ from tensorflow.python.client import session
 from tensorflow.python.estimator import run_config
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import coordinator
 from tensorflow.python.training import server_lib
 
 ASSIGNED_PORTS = set()
@@ -360,6 +361,7 @@ class IndependentWorkerTestBase(test.TestCase):
     self._mock_os_env = MockOsEnv()
     self._mock_context = test.mock.patch.object(os, 'environ',
                                                 self._mock_os_env)
+    self._coord = coordinator.Coordinator()
     super(IndependentWorkerTestBase, self).setUp()
     self._mock_context.__enter__()
 
@@ -368,8 +370,9 @@ class IndependentWorkerTestBase(test.TestCase):
     super(IndependentWorkerTestBase, self).tearDown()
 
   def _task_thread(self, task_fn, tf_config, *args, **kwargs):
-    os.environ['TF_CONFIG'] = json.dumps(tf_config)
-    task_fn(*args, **kwargs)
+    with self._coord.stop_on_exception():
+      os.environ['TF_CONFIG'] = json.dumps(tf_config)
+      task_fn(*args, **kwargs)
 
   def _run_task_in_thread(self, task_fn, cluster_spec, task_type, task_id,
                           *args, **kwargs):
@@ -403,3 +406,28 @@ class IndependentWorkerTestBase(test.TestCase):
                                      *args, **kwargs)
         threads[task_type].append(t)
     return threads
+
+  def join_independent_workers(self, worker_threads):
+    self._coord.join(worker_threads)
+
+
+def get_tf_config_task():
+  return json.loads(os.environ['TF_CONFIG'])['task']
+
+
+def get_tf_config_cluster_spec():
+  return json.loads(os.environ['TF_CONFIG'])['cluster']
+
+
+def get_task_type():
+  return get_tf_config_task()['type']
+
+
+def get_task_index():
+  return get_tf_config_task()['index']
+
+
+def is_chief():
+  return ('chief' not in get_tf_config_cluster_spec()
+          and get_task_type() == 'worker'
+          and get_task_index() == 0)
diff --git a/tensorflow/contrib/distribute/python/one_device_strategy.py b/tensorflow/contrib/distribute/python/one_device_strategy.py
index fdbfba4e04358451a46b23ef250dc7c534c855a0..13a501394ee1fec2dfc1427f6d16d3a4624d7747 100644
--- a/tensorflow/contrib/distribute/python/one_device_strategy.py
+++ b/tensorflow/contrib/distribute/python/one_device_strategy.py
@@ -18,202 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import six
+from tensorflow.python.distribute import one_device_strategy
 
-from tensorflow.python.distribute import device_util
-from tensorflow.python.distribute import distribute_lib
-from tensorflow.python.distribute import values
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.util import nest
-
-
-# TODO(josh11b): Replace asserts in this file with if ...: raise ...
-
-
-class OneDeviceStrategy(distribute_lib.DistributionStrategy):
-  """A distribution strategy for running on a single device."""
-  # TODO(josh11b): Do we wrap values in types to generate errors if you are
-  # doing something that won't work with other DistributionStrategy
-  # implementations?
-
-  def __init__(self, device):
-    super(OneDeviceStrategy, self).__init__(OneDeviceExtended(self, device))
-
-
-class OneDeviceExtended(distribute_lib.DistributionStrategyExtended):
-  """Implementation of OneDeviceStrategy."""
-
-  def __init__(self, container_strategy, device):
-    super(OneDeviceExtended, self).__init__(container_strategy)
-    self._device = device
-    self._default_device = device
-
-  def _create_variable(self, next_creator, *args, **kwargs):
-    colocate_with = kwargs.pop("colocate_with", None)
-    if colocate_with is None:
-      with ops.device(self._device):
-        return next_creator(*args, **kwargs)
-    if isinstance(colocate_with, six.string_types):
-      with ops.device(colocate_with):
-        return next_creator(*args, **kwargs)
-    if (isinstance(colocate_with, (list, tuple)) and len(colocate_with) == 1 and
-        isinstance(colocate_with[0], six.string_types)):
-      with ops.device(colocate_with[0]):
-        return next_creator(*args, **kwargs)
-    with ops.colocate_with(colocate_with):
-      return next_creator(*args, **kwargs)
-
-  def _make_dataset_iterator(self, dataset):
-    """Make iterator from dataset without splitting the batch."""
-    worker = device_util.canonicalize("/device:CPU:0")
-    worker_device_pairs = [(worker, [self._device])]
-    return values.DatasetIterator(dataset, worker_device_pairs)
-
-  def _distribute_dataset(self, dataset_fn):
-    return values.PerReplicaDataset(
-        self._call_dataset_fn(dataset_fn), [self._device])
-
-  def _make_input_fn_iterator(
-      self,
-      input_fn,
-      replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
-    worker = device_util.canonicalize("/device:CPU:0")
-    worker_device_pairs = [(worker, [self._device])]
-    return values.InputFunctionIterator(
-        input_fn, worker_device_pairs,
-        [distribute_lib.InputContext()])
-
-  def _broadcast_to(self, tensor, destinations):
-    del destinations
-    return tensor
-
-  # TODO(priyag): Deal with OutOfRange errors  once b/111349762 is fixed.
-  def _experimental_run_steps_on_iterator(self, fn, iterator, iterations,
-                                          initial_loop_values=None):
-    if initial_loop_values is None:
-      initial_loop_values = {}
-    initial_loop_values = nest.flatten(initial_loop_values)
-
-    ctx = values.MultiStepContext()
-    def body(i, *args):
-      """A wrapper around `fn` to create the while loop body."""
-      del args
-      fn_inputs = iterator.get_next()
-      if not isinstance(fn_inputs, tuple):
-        fn_inputs = (fn_inputs,)
-      fn_result = fn(ctx, fn_inputs)
-      flat_last_step_outputs = nest.flatten(ctx.last_step_outputs)
-      with ops.control_dependencies([fn_result]):
-        return [i + 1] + flat_last_step_outputs
-
-    # We capture the control_flow_context at this point, before we run `fn`
-    # inside a while_loop. This is useful in cases where we might need to exit
-    # these contexts and get back to the outer context to do some things, for
-    # e.g. create an op which should be evaluated only once at the end of the
-    # loop on the host. One such usage is in creating metrics' value op.
-    self._outer_control_flow_context = (
-        ops.get_default_graph()._get_control_flow_context())  # pylint: disable=protected-access
-
-    # TODO(priyag): Use max_iterations instead of an explicit counter.
-    cond = lambda i, *args: i < iterations
-    i = constant_op.constant(0)
-    loop_result = control_flow_ops.while_loop(
-        cond, body, [i] + initial_loop_values, name="",
-        parallel_iterations=1, back_prop=False, swap_memory=False,
-        return_same_structure=True)
-    del self._outer_control_flow_context
-
-    ctx.run_op = control_flow_ops.group(loop_result)
-
-    # Convert the last_step_outputs from a list to the original dict structure
-    # of last_step_outputs.
-    last_step_tensor_outputs = loop_result[1:]
-    last_step_tensor_outputs_dict = nest.pack_sequence_as(
-        ctx.last_step_outputs, last_step_tensor_outputs)
-
-    ctx._set_last_step_outputs(last_step_tensor_outputs_dict)  # pylint: disable=protected-access
-    return ctx
-
-  def _call_for_each_replica(self, fn, args, kwargs):
-    strategy = self._container_strategy()
-    with ops.device(self._device), _OneDeviceReplicaContext(strategy):
-      return fn(*args, **kwargs)
-
-  def _reduce_to(self, reduce_op, value, destinations):
-    del reduce_op, destinations
-    return value
-
-  def _update(self, var, fn, args, kwargs, group):
-    # The implementations of _update() and _update_non_slot() are identical
-    # except _update() passes `var` as the first argument to `fn()`.
-    return self._update_non_slot(var, fn, (var,) + tuple(args), kwargs, group)
-
-  def _update_non_slot(self, colocate_with, fn, args, kwargs, group):
-    del colocate_with
-    with ops.device(self._device), distribute_lib.UpdateContext(self._device):
-      result = fn(*args, **kwargs)
-      if group:
-        return result
-      else:
-        return nest.map_structure(self._unwrap, result)
-
-  def read_var(self, replica_local_var):
-    """Read the aggregate value of a replica-local variable."""
-    return array_ops.identity(replica_local_var)
-
-  def _unwrap(self, value):
-    return (value,)
-
-  def value_container(self, value):
-    return value
-
-  @property
-  def _num_replicas_in_sync(self):
-    return 1
-
-  @property
-  def worker_devices(self):
-    return (self._device,)
-
-  @property
-  def parameter_devices(self):
-    return (self._device,)
-
-  def non_slot_devices(self, var_list):
-    del var_list
-    return (self._device,)
-
-  @property
-  def experimental_should_init(self):
-    return True
-
-  @property
-  def should_checkpoint(self):
-    return True
-
-  @property
-  def should_save_summary(self):
-    return True
-
-  # TODO(priyag): Delete this once all strategies use global batch size.
-  @property
-  def _global_batch_size(self):
-    return True
-
-
-class _OneDeviceReplicaContext(distribute_lib.ReplicaContext):
-  """ReplicaContext for OneDeviceStrategy."""
-
-  def __init__(self, distribution_strategy):
-    distribute_lib.ReplicaContext.__init__(
-        self,
-        distribution_strategy,
-        replica_id_in_sync_group=constant_op.constant(0, dtypes.int32))
-
-  @property
-  def devices(self):
-    return self._distribution_strategy.extended.worker_devices
+OneDeviceStrategy = one_device_strategy.OneDeviceStrategy
diff --git a/tensorflow/contrib/distribute/python/one_device_strategy_test.py b/tensorflow/contrib/distribute/python/one_device_strategy_test.py
index d46cd6f529e363f76bfa2b22339add63530cfde8..0e56f663d6a1ed7945befd933f2f4a83c5f64342 100644
--- a/tensorflow/contrib/distribute/python/one_device_strategy_test.py
+++ b/tensorflow/contrib/distribute/python/one_device_strategy_test.py
@@ -18,34 +18,35 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distribute.python import one_device_strategy
+from tensorflow.contrib.distribute.python import combinations
 from tensorflow.contrib.distribute.python import strategy_test_lib
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
 from tensorflow.python.eager import test
-from tensorflow.python.framework import test_util
 
 
-class OneDeviceStrategyTest(strategy_test_lib.DistributionTestBase):
+@combinations.generate(combinations.combine(
+    distribution=[
+        combinations.one_device_strategy,
+        combinations.one_device_strategy_gpu],
+    mode=["eager", "graph"]))
+class OneDeviceStrategyTest(
+    strategy_test_lib.DistributionTestBase,
+    strategy_test_lib.OneDeviceDistributionTestBase):
 
-  def _get_distribution_strategy(self):
-    return one_device_strategy.OneDeviceStrategy("/device:CPU:0")
+  def testMinimizeLoss(self, distribution):
+    if context.executing_eagerly():
+      self._test_minimize_loss_eager(distribution)
+    else:
+      self._test_minimize_loss_graph(distribution)
 
-  def testMinimizeLossEager(self):
-    self._test_minimize_loss_eager(self._get_distribution_strategy())
+  def testReplicaId(self, distribution):
+    self._test_replica_id(distribution)
 
-  def testMinimizeLossGraph(self):
-    self._test_minimize_loss_graph(self._get_distribution_strategy())
+  def testCallAndMergeExceptions(self, distribution):
+    self._test_call_and_merge_exceptions(distribution)
 
-  def testReplicaId(self):
-    self._test_replica_id(self._get_distribution_strategy())
-
-  @test_util.run_in_graph_and_eager_modes
-  def testCallAndMergeExceptions(self):
-    self._test_call_and_merge_exceptions(self._get_distribution_strategy())
-
-  @test_util.run_in_graph_and_eager_modes
-  def testMakeInputFnIterator(self):
-    d = one_device_strategy.OneDeviceStrategy("/device:CPU:0")
+  def testMakeInputFnIteratorWithDataset(self, distribution):
     dataset_fn = lambda: dataset_ops.Dataset.range(10)
     expected_values = [[i] for i in range(10)]
     input_fn = self._input_fn_to_test_input_context(
@@ -53,9 +54,46 @@ class OneDeviceStrategyTest(strategy_test_lib.DistributionTestBase):
         expected_num_replicas_in_sync=1,
         expected_num_input_pipelines=1,
         expected_input_pipeline_id=0)
-    iterator = d.make_input_fn_iterator(input_fn)
+    iterator = distribution.make_input_fn_iterator(input_fn)
+    self._test_input_fn_iterator(
+        iterator, distribution.extended.worker_devices, expected_values)
+
+  def testMakeInputFnIteratorWithCallable(self, distribution):
+    def fn():
+      dataset = dataset_ops.Dataset.range(10)
+      it = dataset.make_one_shot_iterator()
+      return it.get_next
+    expected_values = [[i] for i in range(10)]
+    input_fn = self._input_fn_to_test_input_context(
+        fn,
+        expected_num_replicas_in_sync=1,
+        expected_num_input_pipelines=1,
+        expected_input_pipeline_id=0)
+    iterator = distribution.make_input_fn_iterator(input_fn)
     self._test_input_fn_iterator(
-        iterator, d.extended.worker_devices, expected_values)
+        iterator, distribution.extended.worker_devices, expected_values,
+        test_reinitialize=False)
+
+  def testNumpyIterator(self, distribution):
+    self._test_numpy_iterator(distribution)
+
+  def testAllReduceSum(self, distribution):
+    self._test_all_reduce_sum(distribution)
+
+  def testAllReduceSumGradients(self, distribution):
+    self._test_all_reduce_sum_gradients(distribution)
+
+  def testAllReduceSumGradientTape(self, distribution):
+    self._test_all_reduce_sum_gradient_tape(distribution)
+
+  def testAllReduceMean(self, distribution):
+    self._test_all_reduce_mean(distribution)
+
+  def testAllReduceMeanGradients(self, distribution):
+    self._test_all_reduce_mean_gradients(distribution)
+
+  def testAllReduceMeanGradientTape(self, distribution):
+    self._test_all_reduce_mean_gradient_tape(distribution)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distribute/python/optimizer_v2_test.py b/tensorflow/contrib/distribute/python/optimizer_v2_test.py
index fa4705af7cb592119f56686d1f693a156f7b4b13..e388061b17a9b92dedbbf9839049b13c8575a22c 100644
--- a/tensorflow/contrib/distribute/python/optimizer_v2_test.py
+++ b/tensorflow/contrib/distribute/python/optimizer_v2_test.py
@@ -41,21 +41,17 @@ class MinimizeLossOptimizerV2Test(test.TestCase, parameterized.TestCase):
     with distribution.scope():
       model_fn, dataset_fn, layer = minimize_loss_example(
           optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss)
-
-      ds = distribution.distribute_dataset(dataset_fn)
-      if context.executing_eagerly():
-        iterator = ds.make_one_shot_iterator()
-      else:
-        iterator = ds.make_initializable_iterator()
+      iterator = distribution.make_input_fn_iterator(lambda _: dataset_fn())
 
       def run_step():
-        return control_flow_ops.group(distribution.unwrap(
-            distribution.call_for_each_replica(
-                model_fn, args=(iterator.get_next(),))))
+        return control_flow_ops.group(
+            distribution.unwrap(
+                distribution.extended.call_for_each_replica(
+                    model_fn, args=(iterator.get_next(),))))
 
       if not context.executing_eagerly():
         with self.cached_session() as sess:
-          sess.run(iterator.initializer)
+          sess.run(iterator.initialize())
           run_step = sess.make_callable(run_step())
         self.evaluate(variables.global_variables_initializer())
 
diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy.py b/tensorflow/contrib/distribute/python/parameter_server_strategy.py
index 2c7766f95fbcb7b68a53ad0052f21485c763a1db..e42bc50fdc4e5e93c998708b0790fdea7768faf2 100644
--- a/tensorflow/contrib/distribute/python/parameter_server_strategy.py
+++ b/tensorflow/contrib/distribute/python/parameter_server_strategy.py
@@ -18,34 +18,24 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import copy
-
-from tensorflow.contrib.distribute.python import mirrored_strategy
-from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib
-from tensorflow.python.distribute import device_util
 from tensorflow.python.distribute import distribute_lib
-from tensorflow.python.distribute import multi_worker_util
-from tensorflow.python.distribute import values
-from tensorflow.python.eager import context
-from tensorflow.python.framework import device as tf_device
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import variable_scope as vs
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import device_setter
-from tensorflow.python.util import nest
+from tensorflow.python.distribute import input_lib
+from tensorflow.python.distribute import parameter_server_strategy
+from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver
+from tensorflow.python.distribute.cluster_resolver import TFConfigClusterResolver
+
+# pylint: disable=protected-access,invalid-name,line-too-long
+CoreParameterServerStrategy = parameter_server_strategy.ParameterServerStrategy
+CoreParameterServerExtended = parameter_server_strategy.ParameterServerStrategyExtended
 
-_LOCAL_CPU = "/device:CPU:0"
-_LOCAL_GPU_0 = "/device:GPU:0"
+# pylint: enable=protected-access,invalid-name,line-too-long
 
 
-# TODO(yuefengz): maybe cache variables on local CPU.
-# TODO(yuefengz): we may want to set session options to disallow communication
-# between workers.
 class ParameterServerStrategy(distribute_lib.DistributionStrategy):
   """A parameter server DistributionStrategy.
 
+  *** contrib version ***
+
   This strategy class works for both local training and between-graph replicated
   training for multiple workers. If `cluster_spec` is specified, either passed
   in to __init__() method or parsed from the
@@ -80,9 +70,9 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
   variables.
 
   3) It is also not recommended to open a colocation scope (i.e. calling
-  `tf.colocate_with`) under the strategy's scope. For colocating variables,
-  use `distribution.colocate_vars_with` instead. Colocation of ops will possibly
-  create conflicts of device assignment.
+  `tf.colocate_with`) under the strategy's scope. For colocating variables, use
+  `strategy.extended.colocate_vars_with` instead. Colocation of ops will
+  possibly create conflicts of device assignment.
   """
 
   def __init__(self, num_gpus_per_worker=0):
@@ -99,431 +89,84 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
     super(ParameterServerStrategy, self).__init__(
         ParameterServerExtended(self, num_gpus_per_worker))
 
+  # Override to change the documentation to reflect the different handling of
+  # global vs. local batch size between core and contrib.
+  def make_dataset_iterator(self, dataset):  # pylint: disable=useless-super-delegation
+    """Makes an iterator for input provided via `dataset`.
 
-class ParameterServerExtended(distribute_lib.DistributionStrategyExtended):
-  """Implementation of ParameterServerStrategy."""
+    NOTE: The batch size of the `dataset` argument is treated differently for
+    this contrib version of `ParameterServerStrategy`.
 
-  def __init__(self, container_strategy, num_gpus_per_worker):
-    super(ParameterServerExtended, self).__init__(container_strategy)
-    self._num_gpus_per_worker = num_gpus_per_worker
-    self._initialize_local(num_gpus_per_worker)
+    Data from the given dataset will be distributed evenly across all the
+    compute replicas. We will assume that the input dataset is batched by the
+    per-replica batch size.
 
-    # We typically don't need to do all-reduce in this strategy.
-    self._cross_device_ops = (
-        cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps(
-            reduce_to_device=_LOCAL_CPU))
-
-  def _initialize_multi_worker(self, num_gpus_per_worker, cluster_spec,
-                               task_type, task_id):
-    """Initialize devices for multiple workers.
-
-    It creates variable devices and compute devices. Variables and operations
-    will be assigned to them respectively. We have one compute device per
-    replica. The variable device is a device function or device string. The
-    default variable device assigns variables to parameter servers in a
-    round-robin fashion.
+    The user could also use `make_input_fn_iterator` if they want to
+    customize which input is fed to which replica/worker etc.
 
     Args:
-      num_gpus_per_worker: number of local GPUs or GPUs per worker.
-      cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
-        cluster configurations.
-      task_type: the current task type.
-      task_id: the current task id.
+      dataset: `tf.data.Dataset` that will be distributed evenly across all
+        replicas.
 
-    Raises:
-      ValueError: if the cluster_spec doesn't have ps jobs.
+    Returns:
+      An `tf.distribute.InputIterator` which returns inputs for each step of the
+      computation.  User should call `initialize` on the returned iterator.
     """
-    assert cluster_spec
-    if not task_type or task_id is None:
-      raise ValueError("When `cluster_spec` is given, you must also specify "
-                       "`task_type` and `task_id`")
-    cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
-
-    self._worker_device = "/job:%s/task:%d" % (self._task_type, self._task_id)
-
-    # Define compute devices which is a list of device strings and one for each
-    # replica. When there are GPUs, replicate operations on these GPUs.
-    # Otherwise, place operations on CPU.
-    if num_gpus_per_worker > 0:
-      self._compute_devices = tuple(
-          "%s/device:GPU:%d" % (self._worker_device, i)
-          for i in range(num_gpus_per_worker)
-      )
-    else:
-      self._compute_devices = (self._worker_device,)
-
-    self._compute_devices = tuple(
-        map(device_util.resolve, self._compute_devices))
-    self._canonical_compute_device_set = set(self._compute_devices)
-
-    # In distributed mode, place variables on ps jobs in a round-robin fashion.
-    # Note that devices returned from `replica_device_setter` are not
-    # canonical and therefore we don't canonicalize all variable devices to
-    # make them consistent.
-    # TODO(yuefengz): support passing a strategy object to control variable
-    # assignment.
-    # TODO(yuefengz): merge the logic of replica_device_setter into this
-    # class.
-    num_ps_replicas = len(cluster_spec.as_dict().get("ps", []))
-    if num_ps_replicas == 0:
-      raise ValueError("The cluster spec needs to have `ps` jobs.")
-    self._variable_device = device_setter.replica_device_setter(
-        ps_tasks=num_ps_replicas,
-        worker_device=self._worker_device,
-        merge_devices=True,
-        cluster=cluster_spec)
-
-    # The `_parameter_devices` is needed for the `parameter_devices` property
-    # and is a list of all variable devices. Here parameter devices are all
-    # tasks of the "ps" job.
-    self._parameter_devices = tuple(map("/job:ps/task:{}".format,
-                                        range(num_ps_replicas)))
-
-    # Add a default device so that ops without specified devices will not end up
-    # on other workers.
-    self._default_device = self._worker_device
-
-    self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
-                                                task_id)
-    self._cluster_spec = cluster_spec
-    self._task_type = task_type
-    self._task_id = task_id
-
-    logging.info(
-        "Multi-worker ParameterServerStrategy with "
-        "cluster_spec = %r, task_type = %r, task_id = %r, "
-        "num_ps_replicas = %r, is_chief = %r, compute_devices = %r, "
-        "variable_device = %r", cluster_spec.as_dict(), task_type, task_id,
-        num_ps_replicas, self._is_chief, self._compute_devices,
-        self._variable_device)
-
-  def _initialize_local(self, num_gpus_per_worker):
-    """Initialize internal devices for local training."""
-    self._worker_device = device_util.canonicalize("/device:CPU:0")
-    # Define compute devices which is a list of device strings and one for each
-    # replica. When there are GPUs, replicate operations on these GPUs.
-    # Otherwise, place operations on CPU.
-    if num_gpus_per_worker > 0:
-      self._compute_devices = tuple(
-          map("/device:GPU:{}".format, range(num_gpus_per_worker)))
-    else:
-      self._compute_devices = (_LOCAL_CPU,)
-
-    self._compute_devices = tuple(
-        map(device_util.resolve, self._compute_devices))
-    self._canonical_compute_device_set = set(self._compute_devices)
-
-    # If there is only one GPU, put everything on that GPU. Otherwise, place
-    # variables on CPU.
-    if num_gpus_per_worker == 1:
-      assert len(self._compute_devices) == 1
-      self._variable_device = _LOCAL_GPU_0
-      self._parameter_devices = (_LOCAL_GPU_0,)
-    else:
-      self._variable_device = _LOCAL_CPU
-      self._parameter_devices = (_LOCAL_CPU,)
-
-    self._is_chief = True
-    self._cluster_spec = None
-    self._task_type = None
-    self._task_id = None
-
-    logging.info(
-        "ParameterServerStrategy with compute_devices = %r, "
-        "variable_device = %r", self._compute_devices, self._variable_device)
-
-  def _distribute_dataset(self, dataset_fn):
-    """Distributes the dataset to each local GPU."""
-    return values.PerReplicaDataset(
-        self._call_dataset_fn(dataset_fn), self._compute_devices, True)
-
-  def _make_dataset_iterator(self, dataset):
-    worker_device_pairs = [(self._worker_device, self._compute_devices)]
-    return values.DatasetIterator(dataset, worker_device_pairs,
-                                  self._num_replicas_in_sync)
-
-  def _make_input_fn_iterator(
-      self,
-      input_fn,
-      replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
-    """Distributes the dataset to each local GPU."""
-    if self._cluster_spec:
-      input_pipeline_id = multi_worker_util.id_in_cluster(
-          self._cluster_spec, self._task_type, self._task_id)
-      num_input_pipelines = multi_worker_util.worker_count(
-          self._cluster_spec, self._task_type)
-    else:
-      input_pipeline_id = 0
-      num_input_pipelines = 1
-    input_context = distribute_lib.InputContext(
-        num_input_pipelines=num_input_pipelines,
-        input_pipeline_id=input_pipeline_id,
-        num_replicas_in_sync=self._num_replicas_in_sync)
-    worker_device_pairs = [(self._worker_device, self._compute_devices)]
-    return values.InputFunctionIterator(
-        input_fn, worker_device_pairs, [input_context])
-
-  def _broadcast_to(self, tensor, destinations):
-    # This is both a fast path for Python constants, and a way to delay
-    # converting Python values to a tensor until we know what type it
-    # should be converted to. Otherwise we have trouble with:
-    #   global_step.assign_add(1)
-    # since the `1` gets broadcast as an int32 but global_step is int64.
-    if isinstance(tensor, (float, int)):
-      return tensor
-    if not cross_device_ops_lib.check_destinations(destinations):
-      destinations = self._compute_devices
-    return self._cross_device_ops.broadcast(tensor, destinations)
-
-  def _allow_variable_partition(self):
-    return not context.executing_eagerly()
-
-  # TODO(yuefengz): not all ops in device_setter.STANDARD_PS_OPS will go through
-  # this creator, such as "MutableHashTable".
-  def _create_variable(self, next_creator, *args, **kwargs):
-    if self._num_replicas_in_sync > 1:
-      aggregation = kwargs.pop("aggregation", vs.VariableAggregation.NONE)
-      if aggregation not in (
-          vs.VariableAggregation.NONE,
-          vs.VariableAggregation.SUM,
-          vs.VariableAggregation.MEAN,
-          vs.VariableAggregation.ONLY_FIRST_REPLICA
-      ):
-        raise ValueError("Invalid variable aggregation mode: " + aggregation +
-                         " for variable: " + kwargs["name"])
-
-      def var_creator(*args, **kwargs):
-        """Create an AggregatingVariable and fix up collections."""
-        # Record what collections this variable should be added to.
-        collections = kwargs.pop("collections", None)
-        if collections is None:
-          collections = [ops.GraphKeys.GLOBAL_VARIABLES]
-        kwargs["collections"] = []
-
-        # Create and wrap the variable.
-        v = next_creator(*args, **kwargs)
-        wrapped = values.AggregatingVariable(v, aggregation)
-
-        # Add the wrapped variable to the requested collections.
-        # The handling of eager mode and the global step matches
-        # ResourceVariable._init_from_args().
-        if not context.executing_eagerly():
-          g = ops.get_default_graph()
-          # If "trainable" is True, next_creator() will add the contained
-          # variable to the TRAINABLE_VARIABLES collection, so we manually
-          # remove it and replace with the wrapper. We can't set "trainable"
-          # to False for next_creator() since that causes functions like
-          # implicit_gradients to skip those variables.
-          if kwargs.get("trainable", True):
-            collections.append(ops.GraphKeys.TRAINABLE_VARIABLES)
-            l = g.get_collection_ref(ops.GraphKeys.TRAINABLE_VARIABLES)
-            l.remove(v)
-          g.add_to_collections(collections, wrapped)
-        elif ops.GraphKeys.GLOBAL_STEP in collections:
-          ops.add_to_collections(ops.GraphKeys.GLOBAL_STEP, wrapped)
-
-        return wrapped
-    else:
-      var_creator = next_creator
-
-    if "colocate_with" in kwargs:
-      with ops.device(None):
-        with ops.colocate_with(kwargs["colocate_with"]):
-          return var_creator(*args, **kwargs)
-
-    with ops.colocate_with(None, ignore_existing=True):
-      with ops.device(self._variable_device):
-        return var_creator(*args, **kwargs)
-
-  def _call_for_each_replica(self, fn, args, kwargs):
-    # pylint: disable=protected-access
-    return mirrored_strategy._call_for_each_replica(
-        self._container_strategy(), fn, args, kwargs)
+    return super(ParameterServerStrategy, self).make_dataset_iterator(dataset)
 
-  def _verify_destinations_not_different_worker(self, destinations):
-    if not self._cluster_spec:
-      return
-    if destinations is None:
-      return
-    for d in cross_device_ops_lib.get_devices_from(destinations):
-      d_spec = tf_device.DeviceSpec.from_string(d)
-      if d_spec.job == self._task_type and d_spec.task != self._task_id:
-        raise ValueError(
-            "Cannot reduce to another worker: %r, current worker is %r" %
-            (d, self._worker_device))
+  # Override to change the documentation to reflect the different handling of
+  # global vs. local batch size between core and contrib.
+  def experimental_make_numpy_iterator(  # pylint: disable=useless-super-delegation
+      self, numpy_input, batch_size, num_epochs=1, shuffle=1024, session=None):
+    """Makes an iterator for input provided via a nest of numpy arrays.
 
-  def _reduce_to(self, reduce_op, value, destinations):
-    self._verify_destinations_not_different_worker(destinations)
-    if not isinstance(value, values.DistributedValues):
-      # pylint: disable=protected-access
-      return mirrored_strategy._reduce_non_distributed_value(
-          self, reduce_op, value, destinations)
-    return self._cross_device_ops.reduce(
-        reduce_op, value, destinations=destinations)
-
-  def _batch_reduce_to(self, reduce_op, value_destination_pairs):
-    for _, destinations in value_destination_pairs:
-      self._verify_destinations_not_different_worker(destinations)
-    return self._cross_device_ops.batch_reduce(reduce_op,
-                                               value_destination_pairs)
-
-  def _select_single_value(self, structured):
-    """Select any single values in `structured`."""
-
-    def _select_fn(x):  # pylint: disable=g-missing-docstring
-      if isinstance(x, values.Mirrored):
-        if len(x.devices) == 1:
-          return list(x._index.values())[0]  # pylint: disable=protected-access
-        else:
-          raise ValueError(
-              "You cannot update variable with a Mirrored object with multiple "
-              "components %r when using ParameterServerStrategy. You must "
-              "specify a single value or a Mirrored with a single value." % x)
-      elif isinstance(x, values.PerReplica):
-        raise ValueError(
-            "You cannot update variable with a PerReplica object %r when using "
-            "ParameterServerStrategy. You must specify a single value or a "
-            "Mirrored with a single value" % x)
-      else:
-        return x
-
-    return nest.map_structure(_select_fn, structured)
-
-  def _update(self, var, fn, args, kwargs, group):
-    if isinstance(var, values.AggregatingVariable):
-      var = var.get()
-    if not isinstance(var, resource_variable_ops.ResourceVariable):
-      raise ValueError(
-          "You can not update `var` %r. It must be a Variable." % var)
-    with ops.colocate_with(var), distribute_lib.UpdateContext(var.device):
-      result = fn(var, *self._select_single_value(args),
-                  **self._select_single_value(kwargs))
-      if group:
-        return result
-      else:
-        return nest.map_structure(self._unwrap, result)
-
-  # TODO(yuefengz): does it need to call _select_single_value?
-  def _update_non_slot(self, colocate_with, fn, args, kwargs, group):
-    with ops.device(
-        colocate_with.device), distribute_lib.UpdateContext(colocate_with):
-      result = fn(*args, **kwargs)
-      if group:
-        return result
-      else:
-        return nest.map_structure(self._unwrap, result)
-
-  def _unwrap(self, val):
-    if isinstance(val, values.DistributedValues):
-      # Return in a deterministic order.
-      if set(val.devices) == self._canonical_compute_device_set:
-        return tuple(val.get(device=d) for d in self._compute_devices)
-      return tuple(val.get(device=d) for d in sorted(val.devices))
-    return (val,)
-
-  def value_container(self, val):
-    if (hasattr(val, "_aggregating_container") and
-        not isinstance(val, values.AggregatingVariable)):
-      wrapper = val._aggregating_container()  # pylint: disable=protected-access
-      if wrapper is not None:
-        return wrapper
-    return val
-
-  def read_var(self, var):
-    # No need to distinguish between normal variables and replica-local
-    # variables.
-    return array_ops.identity(var)
-
-  def _configure(self,
-                 session_config=None,
-                 cluster_spec=None,
-                 task_type=None,
-                 task_id=None):
-    """Configures the strategy class.
-
-    The strategy object will be re-initialized if `cluster_spec` is given but
-    was not passed in the constructor.
+    NOTE: The `batch_size` argument here has different behavior for this
+    contrib version of `ParameterServerStrategy`.
 
     Args:
-      session_config: not used currently.
-      cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
-        cluster configurations.
-      task_type: the current task type.
-      task_id: the current task id.
-
-    Raises:
-      ValueError: if `cluster_spec` is given but `task_type` or `task_id` is
-        not.
+      numpy_input: A nest of NumPy input arrays that will be distributed evenly
+        across all replicas.
+      batch_size: The number of entries from the array we should consume in one
+        step of the computation, across all replicas. This is the per-replica
+        batch size. The global batch size will be this times
+        `num_replicas_in_sync`.
+      num_epochs: The number of times to iterate through the examples. A value
+        of `None` means repeat forever.
+      shuffle: Size of buffer to use for shuffling the input examples.
+        Use `None` to disable shuffling.
+      session: (TensorFlow v1.x graph execution only) A session used for
+        initialization.
+
+    Returns:
+      An `tf.distribute.InputIterator` which returns inputs for each step of the
+      computation.  User should call `initialize` on the returned iterator.
     """
-    if not self._cluster_spec and cluster_spec:
-      # If a `cluster_spec` is already passed in, do nothing here.
-      # TODO(yuefengz): check `cluster_spec` is the same if this object has
-      # already been initialized with a `cluster_spec`.
-      if task_type is None or task_id is None:
-        raise ValueError("When `cluster_spec` is given, must also specify "
-                         "`task_type` and `task_id`.")
-      self._cluster_spec = multi_worker_util.normalize_cluster_spec(
-          cluster_spec)
-      self._task_type = task_type
-      self._task_id = task_id
-      self._initialize_multi_worker(self._num_gpus_per_worker,
-                                    self._cluster_spec, task_type, task_id)
-
-    if session_config:
-      session_config.CopyFrom(self._update_config_proto(session_config))
-
-  def _update_config_proto(self, config_proto):
-    updated_config = copy.deepcopy(config_proto)
-    if not self._cluster_spec:
-      updated_config.isolate_session_state = True
-      return updated_config
+    return super(ParameterServerStrategy,
+                 self).experimental_make_numpy_iterator(
+                     numpy_input, batch_size, num_epochs, shuffle, session)
 
-    updated_config.isolate_session_state = False
 
-    assert self._task_type
-    assert self._task_id is not None
-
-    # The device filters prevent communication between workers.
-    if self._task_type not in ["chief", "worker"]:
-      return updated_config
-    del updated_config.device_filters[:]
-    updated_config.device_filters.extend(
-        ["/job:%s/task:%d" % (self._task_type, self._task_id), "/job:ps"])
-    return updated_config
-
-  @property
-  def _num_replicas_in_sync(self):
-    return len(self._compute_devices)
-
-  @property
-  def worker_devices(self):
-    return self._compute_devices
-
-  @property
-  def parameter_devices(self):
-    return self._parameter_devices
-
-  def non_slot_devices(self, var_list):
-    return min(var_list, key=lambda x: x.name)
-
-  @property
-  def experimental_between_graph(self):
-    # TODO(yuefengz): Should this return False in the local case?
-    return True
-
-  @property
-  def experimental_should_init(self):
-    return self._is_chief
+class ParameterServerExtended(CoreParameterServerExtended):
+  """Implementation of ParameterServerStrategy."""
 
-  @property
-  def should_checkpoint(self):
-    return self._is_chief
+  def __init__(self, container_strategy, num_gpus_per_worker):
+    # Use TFConfigClusterResolver to parse TF_CONFIG. We don't want to change
+    # the constructor's interface to allow customized cluster resolver. Use
+    # SimpleClusterResolver to override num_accelerators.
+    tfconfig = TFConfigClusterResolver()
+    cluster_resolver = SimpleClusterResolver(
+        cluster_spec=tfconfig.cluster_spec(),
+        task_type=tfconfig.task_type,
+        task_id=tfconfig.task_id,
+        num_accelerators=num_gpus_per_worker)
+    super(ParameterServerExtended, self).__init__(
+        container_strategy, cluster_resolver=cluster_resolver)
 
-  @property
-  def should_save_summary(self):
-    return self._is_chief
+  def _make_dataset_iterator(self, dataset):
+    return input_lib.DatasetIterator(dataset, self._input_workers)
 
   # TODO(priyag): Delete this once all strategies use global batch size.
   @property
   def _global_batch_size(self):
+    """The contrib version of PS strategy uses per-replica batch size."""
     return False
diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
index 83d7473666a65e438a1c0119d2a12bf54e53c8fc..3de2041ae35775de6df5bca02c0f1d04a9c2f24e 100644
--- a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
+++ b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
@@ -29,10 +29,13 @@ from tensorflow.contrib.distribute.python import strategy_test_lib
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import distribute_lib
 from tensorflow.python.distribute import distribution_strategy_context as ds_context
 from tensorflow.python.distribute import multi_worker_util
+from tensorflow.python.distribute import parameter_server_strategy as core_parameter_server_strategy
 from tensorflow.python.distribute import reduce_util
 from tensorflow.python.distribute import values
+from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.estimator import run_config
@@ -45,10 +48,12 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gradients
 from tensorflow.python.ops import partitioned_variables
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import training_util
+from tensorflow.python.training.server_lib import ClusterSpec
 
 CHIEF = run_config.TaskType.CHIEF
 WORKER = run_config.TaskType.WORKER
@@ -62,6 +67,57 @@ def _get_replica_id_integer():
   return replica_id
 
 
+class MockCoreParameterServerStrategy(distribute_lib.DistributionStrategy):
+  """Mock the strategy to allow cluster resolver as an argument."""
+
+  def __init__(self, cluster_resolver):
+    super(MockCoreParameterServerStrategy, self).__init__(
+        core_parameter_server_strategy.ParameterServerStrategyExtended(
+            self, cluster_resolver=cluster_resolver))
+
+
+def create_test_objects(cluster_spec=None,
+                        task_type=None,
+                        task_id=None,
+                        num_gpus=None,
+                        sess_config=None,
+                        use_core_strategy=False):
+  sess_config = sess_config or config_pb2.ConfigProto()
+  if num_gpus is None:
+    num_gpus = context.num_gpus()
+  if use_core_strategy:
+    if cluster_spec and task_type and task_id is not None:
+      cluster_resolver = SimpleClusterResolver(
+          cluster_spec=multi_worker_util.normalize_cluster_spec(cluster_spec),
+          task_type=task_type,
+          task_id=task_id,
+          num_accelerators=num_gpus)
+      target = 'grpc://' + cluster_spec[WORKER][task_id]
+    else:
+      cluster_resolver = SimpleClusterResolver(
+          ClusterSpec({}), num_accelerators=num_gpus)
+      target = ''
+
+    distribution = MockCoreParameterServerStrategy(cluster_resolver)
+    sess_config = copy.deepcopy(sess_config)
+    sess_config = distribution.update_config_proto(sess_config)
+  else:
+    distribution = parameter_server_strategy.ParameterServerStrategy(
+        num_gpus_per_worker=num_gpus)
+    if task_type:
+      sess_config = copy.deepcopy(sess_config)
+      distribution.configure(
+          session_config=sess_config,
+          cluster_spec=cluster_spec,
+          task_type=task_type,
+          task_id=task_id)
+      target = 'grpc://' + cluster_spec[WORKER][task_id]
+    else:
+      target = ''
+
+  return distribution, target, sess_config
+
+
 class ParameterServerStrategyTestBase(
     multi_worker_test_base.MultiWorkerTestBase):
 
@@ -75,24 +131,27 @@ class ParameterServerStrategyTestBase(
     self._sess_config = config_pb2.ConfigProto(allow_soft_placement=True)
     super(ParameterServerStrategyTestBase, self).setUp()
 
-  def _get_test_objects(self, task_type, task_id, num_gpus):
-    distribution = parameter_server_strategy.ParameterServerStrategy(
-        num_gpus_per_worker=num_gpus)
-    if not task_type:
-      return distribution, '', self._sess_config
-
-    sess_config = copy.deepcopy(self._sess_config)
-    distribution.configure(
-        session_config=sess_config,
+  def _get_test_objects(self,
+                        task_type,
+                        task_id,
+                        num_gpus,
+                        use_core_strategy=False):
+    return create_test_objects(
         cluster_spec=self._cluster_spec,
         task_type=task_type,
-        task_id=task_id)
-    return (distribution, 'grpc://' + self._cluster_spec[WORKER][task_id],
-            sess_config)
-
-  def _test_device_assignment_distributed(self, task_type, task_id, num_gpus):
+        task_id=task_id,
+        num_gpus=num_gpus,
+        sess_config=self._sess_config,
+        use_core_strategy=use_core_strategy)
+
+  def _test_device_assignment_distributed(self,
+                                          task_type,
+                                          task_id,
+                                          num_gpus,
+                                          use_core_strategy=False):
     worker_device = '/job:%s/replica:0/task:%d' % (task_type, task_id)
-    d, _, sess_config = self._get_test_objects(task_type, task_id, num_gpus)
+    d, _, sess_config = self._get_test_objects(
+        task_type, task_id, num_gpus, use_core_strategy=use_core_strategy)
     with ops.Graph().as_default(), \
          self.cached_session(target=self._default_target,
                              config=sess_config) as sess, \
@@ -131,7 +190,7 @@ class ParameterServerStrategyTestBase(
                          '/job:worker/replica:0/task:0/%s' % last_part_device)
 
         # The colocate_vars_with can override the distribution's device.
-        with d.colocate_vars_with(x):
+        with d.extended.colocate_vars_with(x):
           y = variable_scope.get_variable(
               'y', initializer=20.0,
               aggregation=variable_scope.VariableAggregation.SUM)
@@ -177,7 +236,7 @@ class ParameterServerStrategyTestBase(
         self.assertIn('/job:ps/', h.device)
         return y_add, z_add, f
 
-      y, z, f = d.call_for_each_replica(model_fn)
+      y, z, f = d.extended.call_for_each_replica(model_fn)
       self.assertNotEqual(y, None)
       self.assertNotEqual(z, None)
       self.assertNotEqual(f, None)
@@ -190,9 +249,10 @@ class ParameterServerStrategyTestBase(
         self.assertEqual(f_val, 46.0)
 
   def _test_device_assignment_distributed_enable_partitioner(
-      self, task_type, task_id, num_gpus):
-    d, _, sess_config = self._get_test_objects(task_type, task_id, num_gpus)
-    num_shards = len(d.parameter_devices)
+      self, task_type, task_id, num_gpus, use_core_strategy=False):
+    d, _, sess_config = self._get_test_objects(
+        task_type, task_id, num_gpus, use_core_strategy=use_core_strategy)
+    num_shards = len(d.extended.parameter_devices)
     partitioner = partitioned_variables.fixed_size_partitioner(num_shards)
     with ops.Graph().as_default(), \
          self.cached_session(target=self._default_target,
@@ -224,39 +284,18 @@ class ParameterServerStrategyTestBase(
           self.assertEqual(var.device, '/job:ps/task:%d' % part_id)
           self.assertEqual(var.device, x_add[part_id].device)
 
-        # The colocate_vars_with can override the distribution's device.
-        with d.colocate_vars_with(x_add[0]):
-          y = variable_scope.get_variable(
-              'y',
-              initializer=constant_op.constant([20.0, 10.0]),
-              aggregation=variable_scope.VariableAggregation.SUM,
-              partitioner=partitioner)
-        y_add = y.assign_add(
-            [array_ops.identity(x_add[0]),
-             array_ops.identity(x_add[1])])
-
-        for part_id, var in enumerate(y):
-          self.assertEqual(var.device, '/job:ps/task:0')
-          self.assertEqual(y_add[part_id].device, var.device)
-          self.assertEqual(var.device, x_add[0].device)
-
-        return x_add, y_add
+        return x_add
 
-      x, y = d.call_for_each_replica(model_fn)
+      x = d.extended.call_for_each_replica(model_fn)
 
       if context.num_gpus() >= 1:
         variables.global_variables_initializer().run()
-        x_val, y_val = sess.run([x, y])
+        x_val = sess.run(x)
         if num_gpus < 1:
           self.assertEqual(x_val, [13.0, 25.0])
-          self.assertEqual(y_val, [33.0, 35.0])
         else:
           x_expect = [10.0 + 3 * num_gpus, 20.0 + 5 * num_gpus]
-          y_expect = [
-              20.0 + x_expect[0] * num_gpus, 10.0 + x_expect[1] * num_gpus
-          ]
           self.assertEqual(x_val, x_expect)
-          self.assertEqual(y_val, y_expect)
 
   def _test_device_assignment_local(self,
                                     d,
@@ -305,7 +344,7 @@ class ParameterServerStrategyTestBase(
         self.assertEqual(e.device, device_util.canonicalize('/device:GPU:2'))
 
         # The colocate_vars_with can override the distribution's device.
-        with d.colocate_vars_with(x):
+        with d.extended.colocate_vars_with(x):
           y = variable_scope.get_variable(
               'y', initializer=20.0,
               aggregation=variable_scope.VariableAggregation.SUM)
@@ -348,7 +387,7 @@ class ParameterServerStrategyTestBase(
             device_util.canonicalize(h.device))
         return y_add, z_add, f
 
-      y, z, f = d.call_for_each_replica(model_fn)
+      y, z, f = d.extended.call_for_each_replica(model_fn)
       self.assertNotEqual(y, None)
       self.assertNotEqual(z, None)
       self.assertNotEqual(f, None)
@@ -360,9 +399,13 @@ class ParameterServerStrategyTestBase(
         self.assertEqual(z_val, 43.0)
         self.assertEqual(f_val, 46.0)
 
-  def _test_simple_increment(self, task_type, task_id, num_gpus):
+  def _test_simple_increment(self,
+                             task_type,
+                             task_id,
+                             num_gpus,
+                             use_core_strategy=False):
     d, master_target, sess_config = self._get_test_objects(
-        task_type, task_id, num_gpus)
+        task_type, task_id, num_gpus, use_core_strategy=use_core_strategy)
     if d.extended._cluster_spec:
       num_workers = len(d.extended._cluster_spec.as_dict().get(WORKER))
       if 'chief' in d.extended._cluster_spec.as_dict():
@@ -395,7 +438,7 @@ class ParameterServerStrategyTestBase(
         train_op = control_flow_ops.group(x_add, y_add, z_add)
         return x, y, z, train_op
 
-      x, y, z, train_op = d.call_for_each_replica(model_fn)
+      x, y, z, train_op = d.extended.call_for_each_replica(model_fn)
       train_op = d.group(train_op)
 
       if context.num_gpus() < d.extended._num_gpus_per_worker:
@@ -430,9 +473,13 @@ class ParameterServerStrategyTestBase(
               y_val == 20.0 + 1.0 * num_workers * d.num_replicas_in_sync and
               z_val == 30.0 + 1.0 * num_workers)
 
-  def _test_minimize_loss_graph(self, task_type, task_id, num_gpus):
+  def _test_minimize_loss_graph(self,
+                                task_type,
+                                task_id,
+                                num_gpus,
+                                use_core_strategy=False):
     d, master_target, sess_config = self._get_test_objects(
-        task_type, task_id, num_gpus)
+        task_type, task_id, num_gpus, use_core_strategy=use_core_strategy)
     if task_type:
       # Multi-worker
       assert hasattr(d.extended, '_cluster_spec') and d.extended._cluster_spec
@@ -472,20 +519,20 @@ class ParameterServerStrategyTestBase(
       def step():
         """Perform one optimization step."""
         # Run forward & backward to get gradients, variables list.
-        g_v = d.call_for_each_replica(grad_fn, args=(one,))
+        g_v = d.extended.call_for_each_replica(grad_fn, args=(one,))
         # Update the variables using the gradients and the update() function.
         before_list = []
         after_list = []
         for g, v in g_v:
-          fetched = d.read_var(v)
+          fetched = d.extended.read_var(v)
           before_list.append(fetched)
           with ops.control_dependencies([fetched]):
             # TODO(yuefengz): support non-Mirrored variable as destinations.
             g = d.extended.reduce_to(
                 reduce_util.ReduceOp.SUM, g, destinations=v)
             with ops.control_dependencies(
-                d.update(v, update, g, grouped=False)):
-              after_list.append(d.read_var(v))
+                d.extended.update(v, update, args=(g,), group=False)):
+              after_list.append(d.extended.read_var(v))
         return before_list, after_list
 
       before_out, after_out = step()
@@ -518,10 +565,16 @@ class ParameterServerStrategyTestBase(
       self.assertLess(error_after, error_before)
       return error_after < error_before
 
-  def _test_input_fn_iterator(self, task_type, task_id, num_gpus, input_fn,
-                              expected_values):
+  def _test_input_fn_iterator(self,
+                              task_type,
+                              task_id,
+                              num_gpus,
+                              input_fn,
+                              expected_values,
+                              test_reinitialize=True,
+                              use_core_strategy=False):
     distribution, master_target, config = self._get_test_objects(
-        task_type, task_id, num_gpus)
+        task_type, task_id, num_gpus, use_core_strategy=use_core_strategy)
     devices = distribution.extended.worker_devices
 
     with ops.Graph().as_default(), \
@@ -532,27 +585,31 @@ class ParameterServerStrategyTestBase(
 
       for expected_value in expected_values:
         next_element = iterator.get_next()
-        computed_value = sess.run(
-            [values.select_device(d, next_element) for d in devices])
+        computed_value = sess.run([values.select_replica(r, next_element)
+                                   for r in range(len(devices))])
         self.assertEqual(expected_value, computed_value)
 
       with self.assertRaises(errors.OutOfRangeError):
         next_element = iterator.get_next()
-        sess.run([values.select_device(d, next_element) for d in devices])
+        sess.run([values.select_replica(r, next_element)
+                  for r in range(len(devices))])
 
       # After re-initializing the iterator, should be able to iterate again.
-      sess.run(iterator.initialize())
+      if test_reinitialize:
+        sess.run(iterator.initialize())
 
-      for expected_value in expected_values:
-        next_element = iterator.get_next()
-        computed_value = sess.run(
-            [values.select_device(d, next_element) for d in devices])
-        self.assertEqual(expected_value, computed_value)
+        for expected_value in expected_values:
+          next_element = iterator.get_next()
+          computed_value = sess.run([values.select_replica(r, next_element)
+                                     for r in range(len(devices))])
+          self.assertEqual(expected_value, computed_value)
 
 
-class ParameterServerStrategyTest(ParameterServerStrategyTestBase,
-                                  strategy_test_lib.DistributionTestBase,
-                                  parameterized.TestCase):
+class ParameterServerStrategyTest(
+    ParameterServerStrategyTestBase,
+    strategy_test_lib.DistributionTestBase,
+    strategy_test_lib.TwoDeviceDistributionTestBase,
+    parameterized.TestCase):
 
   @classmethod
   def setUpClass(cls):
@@ -560,111 +617,175 @@ class ParameterServerStrategyTest(ParameterServerStrategyTestBase,
         num_workers=3, num_ps=2)
     cls._default_target = 'grpc://' + cls._cluster_spec[WORKER][0]
 
-  def test_num_replicas_in_sync(self):
-    distribution = parameter_server_strategy.ParameterServerStrategy(
-        num_gpus_per_worker=2)
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def test_num_replicas_in_sync(self, use_core_strategy):
+    strategy, _, _ = create_test_objects(
+        num_gpus=2, use_core_strategy=use_core_strategy)
     # All the devices on a given worker are in sync which in this case is the
     # number of gpus on each worker.
-    self.assertEqual(2, distribution.num_replicas_in_sync)
+    self.assertEqual(2, strategy.num_replicas_in_sync)
 
-  def testDeviceAssignmentLocalCPU(self):
-    distribution = parameter_server_strategy.ParameterServerStrategy(
-        num_gpus_per_worker=0)
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testDeviceAssignmentLocalCPU(self, use_core_strategy):
+    strategy, _, _ = create_test_objects(
+        num_gpus=0, use_core_strategy=use_core_strategy)
     self._test_device_assignment_local(
-        distribution, compute_device='CPU', variable_device='CPU', num_gpus=0)
+        strategy, compute_device='CPU', variable_device='CPU', num_gpus=0)
 
-  def testDeviceAssignmentLocalOneGPU(self):
-    distribution = parameter_server_strategy.ParameterServerStrategy(
-        num_gpus_per_worker=1)
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testDeviceAssignmentLocalOneGPU(self, use_core_strategy):
+    strategy, _, _ = create_test_objects(
+        num_gpus=1, use_core_strategy=use_core_strategy)
     self._test_device_assignment_local(
-        distribution, compute_device='GPU', variable_device='GPU', num_gpus=1)
+        strategy, compute_device='GPU', variable_device='GPU', num_gpus=1)
 
-  def testDeviceAssignmentLocalTwoGPUs(self):
-    distribution = parameter_server_strategy.ParameterServerStrategy(
-        num_gpus_per_worker=2)
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testDeviceAssignmentLocalTwoGPUs(self, use_core_strategy):
+    strategy, _, _ = create_test_objects(
+        num_gpus=2, use_core_strategy=use_core_strategy)
     self._test_device_assignment_local(
-        distribution, compute_device='GPU', variable_device='CPU', num_gpus=2)
+        strategy, compute_device='GPU', variable_device='CPU', num_gpus=2)
 
   @combinations.generate(
-      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2]))
-  def testDeviceAssignmentDistributed(self, num_gpus):
-    self._test_device_assignment_distributed('worker', 1, num_gpus)
+      combinations.combine(
+          mode=['graph'], num_gpus=[0, 1, 2], use_core_strategy=[True, False]))
+  def testDeviceAssignmentDistributed(self, num_gpus, use_core_strategy):
+    self._test_device_assignment_distributed(
+        'worker', 1, num_gpus, use_core_strategy=use_core_strategy)
 
   @combinations.generate(
-      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2]))
-  def testDeviceAssignmentDistributedEnablePartitioner(self, num_gpus):
+      combinations.combine(
+          mode=['graph'], num_gpus=[0, 1, 2], use_core_strategy=[True, False]))
+  def testDeviceAssignmentDistributedEnablePartitioner(self, num_gpus,
+                                                       use_core_strategy):
     self._test_device_assignment_distributed_enable_partitioner(
-        'worker', 1, num_gpus)
+        'worker', 1, num_gpus, use_core_strategy=use_core_strategy)
 
-  def testSimpleBetweenGraph(self):
-    self._run_between_graph_clients(self._test_simple_increment,
-                                    self._cluster_spec, context.num_gpus())
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testSimpleBetweenGraph(self, use_core_strategy):
+    self._run_between_graph_clients(
+        self._test_simple_increment,
+        self._cluster_spec,
+        context.num_gpus(),
+        use_core_strategy=use_core_strategy)
 
   @combinations.generate(
-      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2]))
-  def testLocalSimpleIncrement(self, num_gpus):
-    self._test_simple_increment(None, 0, num_gpus)
+      combinations.combine(
+          mode=['graph'], num_gpus=[0, 1, 2], use_core_strategy=[True, False]))
+  def testLocalSimpleIncrement(self, num_gpus, use_core_strategy):
+    self._test_simple_increment(None, 0, num_gpus, use_core_strategy)
 
   @combinations.generate(
-      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2]))
-  def testMinimizeLossGraphDistributed(self, num_gpus):
-    self._run_between_graph_clients(self._test_minimize_loss_graph,
-                                    self._cluster_spec, num_gpus)
+      combinations.combine(
+          mode=['graph'], num_gpus=[0, 1, 2], use_core_strategy=[True, False]))
+  def testMinimizeLossGraphDistributed(self, num_gpus, use_core_strategy):
+    self._run_between_graph_clients(
+        self._test_minimize_loss_graph,
+        self._cluster_spec,
+        num_gpus,
+        use_core_strategy=use_core_strategy)
 
   @combinations.generate(
-      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2]))
-  def testMinimizeLossGraphLocal(self, num_gpus):
-    self._test_minimize_loss_graph(None, None, num_gpus)
+      combinations.combine(
+          mode=['graph'], num_gpus=[0, 1, 2], use_core_strategy=[True, False]))
+  def testMinimizeLossGraphLocal(self, num_gpus, use_core_strategy):
+    self._test_minimize_loss_graph(None, None, num_gpus, use_core_strategy)
 
+  # TODO(b/124344198): Re-enable after fixing this flaky test.
   # TODO(priyag): Refactor this and other multi worker tests.
   @combinations.generate(
-      combinations.combine(mode=['graph'], num_gpus=[1, 2], required_gpus=1))
-  def testMakeInputFnIteratorDistributed(self, num_gpus):
+      combinations.combine(
+          mode=['graph'],
+          num_gpus=[1, 2],
+          required_gpus=1,
+          use_core_strategy=[True, False],
+          use_dataset=[True, False]))
+  def DISABLED_testMakeInputFnIteratorDistributed(
+      self, num_gpus, use_core_strategy, use_dataset):
     if context.num_gpus() < num_gpus:
       self.skipTest('Not enough GPUs')
-    dataset_fn = lambda: dataset_ops.Dataset.range(100)
+    if use_dataset:
+      fn = lambda: dataset_ops.Dataset.range(100)
+    else:
+      def fn():
+        dataset = dataset_ops.Dataset.range(100)
+        it = dataset.make_one_shot_iterator()
+        return it.get_next
     expected_values = [[i+j for j in range(num_gpus)]
                        for i in range(0, 100, num_gpus)]
 
     input_fn = self._input_fn_to_test_input_context(
-        dataset_fn,
+        fn,
         expected_num_replicas_in_sync=num_gpus,
         expected_num_input_pipelines=3,
         expected_input_pipeline_id=1)  # because task_id = 1
-    self._test_input_fn_iterator('worker', 1, num_gpus,
-                                 input_fn, expected_values)
-
+    self._test_input_fn_iterator(
+        'worker',
+        1,
+        num_gpus,
+        input_fn,
+        expected_values,
+        test_reinitialize=use_dataset,
+        use_core_strategy=use_core_strategy)
+
+  # TODO(b/124344198): Re-enable after fixing this flaky test.
   @combinations.generate(
-      combinations.combine(mode=['graph'], num_gpus=[1, 2], required_gpus=1))
-  def testMakeInputFnIteratorLocal(self, num_gpus):
+      combinations.combine(
+          mode=['graph'],
+          num_gpus=[1, 2],
+          required_gpus=1,
+          use_core_strategy=[True, False],
+          use_dataset=[True, False]))
+  def DISABLED_testMakeInputFnIteratorLocal(self, num_gpus, use_core_strategy,
+                                            use_dataset):
     if context.num_gpus() < num_gpus:
       self.skipTest('Not enough GPUs')
-    dataset_fn = lambda: dataset_ops.Dataset.range(100)
+    if use_dataset:
+      fn = lambda: dataset_ops.Dataset.range(100)
+    else:
+      def fn():
+        dataset = dataset_ops.Dataset.range(100)
+        it = dataset.make_one_shot_iterator()
+        return it.get_next
     expected_values = [[i+j for j in range(num_gpus)]
                        for i in range(0, 100, num_gpus)]
 
     input_fn = self._input_fn_to_test_input_context(
-        dataset_fn,
+        fn,
         expected_num_replicas_in_sync=num_gpus,
         expected_num_input_pipelines=1,
         expected_input_pipeline_id=0)  # only one worker and pipeline for local.
-    self._test_input_fn_iterator(None, None, num_gpus,
-                                 input_fn, expected_values)
+    self._test_input_fn_iterator(
+        None,
+        None,
+        num_gpus,
+        input_fn,
+        expected_values,
+        test_reinitialize=use_dataset,
+        use_core_strategy=use_core_strategy)
 
-  def testGlobalStepUpdate(self):
-    strategy = parameter_server_strategy.ParameterServerStrategy(
-        num_gpus_per_worker=context.num_gpus())
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testGlobalStepUpdate(self, use_core_strategy):
+    strategy, _, _ = create_test_objects(use_core_strategy=use_core_strategy)
     self._test_global_step_update(strategy)
 
-  def testUpdateConfigProtoMultiWorker(self):
-    distribution = parameter_server_strategy.ParameterServerStrategy(
-        num_gpus_per_worker=2)
-    distribution.configure(
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testUpdateConfigProtoMultiWorker(self, use_core_strategy):
+    strategy, _, _ = create_test_objects(
+        num_gpus=2, use_core_strategy=use_core_strategy)
+    strategy.configure(
         cluster_spec=self._cluster_spec, task_type='worker', task_id=1)
 
     config_proto = config_pb2.ConfigProto(device_filters=['to_be_overridden'])
 
-    new_config = distribution.update_config_proto(config_proto)
+    new_config = strategy.update_config_proto(config_proto)
 
     # Verify device filters.
     self.assertEqual(['/job:worker/task:1', '/job:ps'],
@@ -673,16 +794,48 @@ class ParameterServerStrategyTest(ParameterServerStrategyTestBase,
     # Verify isolate_session_state
     self.assertFalse(new_config.isolate_session_state)
 
-  def testUpdateConfigProtoLocal(self):
-    distribution = parameter_server_strategy.ParameterServerStrategy(
-        num_gpus_per_worker=2)
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testUpdateConfigProtoLocal(self, use_core_strategy):
+    strategy, _, _ = create_test_objects(
+        num_gpus=2, use_core_strategy=use_core_strategy)
 
     config_proto = config_pb2.ConfigProto()
-    new_config = distribution.update_config_proto(config_proto)
+    new_config = strategy.update_config_proto(config_proto)
 
     # Verify isolate_session_state
     self.assertTrue(new_config.isolate_session_state)
 
+  def testAllReduceSum(self):
+    distribution = parameter_server_strategy.ParameterServerStrategy(
+        num_gpus_per_worker=2)
+    self._test_all_reduce_sum(distribution)
+
+  def testAllReduceSumGradients(self):
+    distribution = parameter_server_strategy.ParameterServerStrategy(
+        num_gpus_per_worker=2)
+    self._test_all_reduce_sum_gradients(distribution)
+
+  def testAllReduceSumGradientTape(self):
+    distribution = parameter_server_strategy.ParameterServerStrategy(
+        num_gpus_per_worker=2)
+    self._test_all_reduce_sum_gradient_tape(distribution)
+
+  def testAllReduceMean(self):
+    distribution = parameter_server_strategy.ParameterServerStrategy(
+        num_gpus_per_worker=2)
+    self._test_all_reduce_mean(distribution)
+
+  def testAllReduceMeanGradients(self):
+    distribution = parameter_server_strategy.ParameterServerStrategy(
+        num_gpus_per_worker=2)
+    self._test_all_reduce_mean_gradients(distribution)
+
+  def testAllReduceMeanGradientTape(self):
+    distribution = parameter_server_strategy.ParameterServerStrategy(
+        num_gpus_per_worker=2)
+    self._test_all_reduce_mean_gradient_tape(distribution)
+
 
 class ParameterServerStrategyWithChiefTest(ParameterServerStrategyTestBase,
                                            parameterized.TestCase):
@@ -693,20 +846,31 @@ class ParameterServerStrategyWithChiefTest(ParameterServerStrategyTestBase,
         num_workers=3, num_ps=2, has_chief=True)
     cls._default_target = 'grpc://' + cls._cluster_spec[CHIEF][0]
 
-  def testSimpleBetweenGraph(self):
-    self._run_between_graph_clients(self._test_simple_increment,
-                                    self._cluster_spec, context.num_gpus())
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testSimpleBetweenGraph(self, use_core_strategy):
+    self._run_between_graph_clients(
+        self._test_simple_increment,
+        self._cluster_spec,
+        context.num_gpus(),
+        use_core_strategy=use_core_strategy)
 
   @combinations.generate(
-      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2]))
-  def testMinimizeLossGraph(self, num_gpus):
-    self._run_between_graph_clients(self._test_minimize_loss_graph,
-                                    self._cluster_spec, num_gpus)
+      combinations.combine(
+          mode=['graph'], num_gpus=[0, 1, 2], use_core_strategy=[True, False]))
+  def testMinimizeLossGraph(self, num_gpus, use_core_strategy):
+    self._run_between_graph_clients(
+        self._test_minimize_loss_graph,
+        self._cluster_spec,
+        num_gpus,
+        use_core_strategy=use_core_strategy)
 
-  def testGlobalStepIsWrapped(self):
-    distribution = parameter_server_strategy.ParameterServerStrategy(
-        num_gpus_per_worker=2)
-    with ops.Graph().as_default(), distribution.scope():
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testGlobalStepIsWrappedOnTwoGPUs(self, use_core_strategy):
+    strategy, _, _ = create_test_objects(
+        num_gpus=2, use_core_strategy=use_core_strategy)
+    with ops.Graph().as_default(), strategy.scope():
       created_step = training_util.create_global_step()
       get_step = training_util.get_global_step()
       self.assertEqual(created_step, get_step,
@@ -715,19 +879,55 @@ class ParameterServerStrategyWithChiefTest(ParameterServerStrategyTestBase,
                              id(get_step), get_step.__class__.__name__)))
       self.assertIs(values.AggregatingVariable, type(created_step))
       self.assertIs(values.AggregatingVariable, type(get_step))
+      self.assertIs(strategy, created_step.distribute_strategy)
+
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testGlobalStepIsNotWrappedOnOneGPU(self, use_core_strategy):
+    strategy, _, _ = create_test_objects(
+        num_gpus=1, use_core_strategy=use_core_strategy)
+    with ops.Graph().as_default(), strategy.scope():
+      created_step = training_util.create_global_step()
+      get_step = training_util.get_global_step()
+      self.assertEqual(created_step, get_step,
+                       msg=('created_step %s type %s vs. get_step %s type %s' %
+                            (id(created_step), created_step.__class__.__name__,
+                             id(get_step), get_step.__class__.__name__)))
+      self.assertIs(resource_variable_ops.ResourceVariable, type(created_step))
+      self.assertIs(resource_variable_ops.ResourceVariable, type(get_step))
+      # All variables have an _distribute_strategy parameter. Only variable
+      # subclasses in distribution strategy expose it publicly.
+      self.assertFalse(hasattr(strategy, 'distribute_strategy'))
+      self.assertIs(strategy, created_step._distribute_strategy)
+
+  @combinations.generate(
+      combinations.combine(mode=['graph'], use_core_strategy=[True, False]))
+  def testValueContainer(self, use_core_strategy):
+    strategy, _, _ = create_test_objects(
+        num_gpus=2, use_core_strategy=use_core_strategy)
+    with ops.Graph().as_default(), strategy.scope():
 
-  def testValueContainer(self):
-    distribution = parameter_server_strategy.ParameterServerStrategy(
-        num_gpus_per_worker=2)
-    with ops.Graph().as_default(), distribution.scope():
       def f():
         with backprop.GradientTape() as tape:
           v = variable_scope.get_variable('v', initializer=10.0)
           _ = v * v
         v, = tape.watched_variables()
-        w = distribution.extended.value_container(v)
+        w = strategy.extended.value_container(v)
         self.assertIs(values.AggregatingVariable, type(w))
-      distribution.extended.call_for_each_replica(f)
+
+      strategy.extended.call_for_each_replica(f)
+
+
+class LocalParameterServerStrategyTest(strategy_test_lib.DistributionTestBase,
+                                       parameterized.TestCase):
+
+  @combinations.generate(combinations.combine(mode=['graph', 'eager'],
+                                              use_core_strategy=[True, False],
+                                              required_gpus=2))
+  def testNumpyIterator(self, use_core_strategy):
+    strategy, _, _ = create_test_objects(
+        num_gpus=2, use_core_strategy=use_core_strategy)
+    self._test_numpy_iterator(strategy)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/distribute/python/step_fn.py b/tensorflow/contrib/distribute/python/step_fn.py
index c928b6d9f1f21508edd753f94c38ab2723cc0a9f..27aad46b97195aa498d0382f08c04c312cebbe65 100644
--- a/tensorflow/contrib/distribute/python/step_fn.py
+++ b/tensorflow/contrib/distribute/python/step_fn.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.eager import backprop
-from tensorflow.python.eager import context
 from tensorflow.python.training import optimizer as optimizer_lib
 
 
@@ -33,6 +32,9 @@ class Step(object):
   def distribution(self):
     return self._distribution
 
+  def initialize(self):
+    return []
+
   def __call__(self):
     """Perform one step of this training algorithm."""
     raise NotImplementedError("must be implemented in descendants")
@@ -50,12 +52,10 @@ class StandardInputStep(Step):
 
   def __init__(self, dataset_fn, distribution):
     super(StandardInputStep, self).__init__(distribution)
-    self._distributed_input = distribution.distribute_dataset(dataset_fn)
-    if context.executing_eagerly():
-      self._iterator = self._distributed_input.make_one_shot_iterator()
-    else:
-      # TODO(priyag): Expose initializer via some initializer property.
-      self._iterator = self._distributed_input.make_initializable_iterator()
+    self._iterator = distribution.make_input_fn_iterator(lambda _: dataset_fn())
+
+  def initialize(self):
+    return self._iterator.initialize()
 
 
 class StandardSingleLossStep(StandardInputStep):
@@ -99,8 +99,8 @@ class StandardSingleLossStep(StandardInputStep):
         gradients_fn = backprop.implicit_grad(self._loss_fn)
         gradients_fn = optimizer_lib.get_filtered_grad_fn(gradients_fn)
 
-        grads_and_vars = self.distribution.call_for_each_replica(
-            gradients_fn, args=(ctx,) + inputs)
+        grads_and_vars = self.distribution.extended.call_for_each_replica(
+            gradients_fn, args=(ctx, inputs))
         # If threads use layers, then we need to run the first step
         # sequentially, so that layers.build() is not executed in parallel.
         # Otherwise, multiple sets of mirrored variables are going to be
@@ -109,6 +109,6 @@ class StandardSingleLossStep(StandardInputStep):
             self.distribution, grads_and_vars)
 
       # TODO(priyag): Return the outputs, context, etc as well.
-      ctx = self.distribution.run_steps_on_dataset(
+      ctx = self.distribution.extended.experimental_run_steps_on_iterator(
           step_fn, self._iterator, self._iterations_per_step)
       return ctx.run_op
diff --git a/tensorflow/contrib/distribute/python/step_fn_test.py b/tensorflow/contrib/distribute/python/step_fn_test.py
index 1ff9b9ceec13351b098d47ed3ff62f689a625a31..9f48560b2666036e149a63c98b6529fb24cc5067 100644
--- a/tensorflow/contrib/distribute/python/step_fn_test.py
+++ b/tensorflow/contrib/distribute/python/step_fn_test.py
@@ -45,24 +45,21 @@ class SingleLossStepTest(test.TestCase, parameterized.TestCase):
       single_loss_step, layer = single_loss_example(
           optimizer_fn, distribution, use_bias=True, iterations_per_step=2)
 
-      self.evaluate(distribution.initialize())
       if context.executing_eagerly():
+        single_loss_step.initialize()
         run_step = single_loss_step
       else:
         with self.cached_session() as sess:
-          sess.run(single_loss_step._iterator.initializer)
+          sess.run(single_loss_step.initialize())
           run_step = sess.make_callable(single_loss_step())
       self.evaluate(variables.global_variables_initializer())
 
       weights, biases = [], []
       for _ in range(5):
         run_step()
-
         weights.append(self.evaluate(layer.kernel))
         biases.append(self.evaluate(layer.bias))
 
-      self.evaluate(distribution.finalize())
-
       error = abs(numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1)
       is_not_increasing = all(y <= x for x, y in zip(error, error[1:]))
       self.assertTrue(is_not_increasing)
diff --git a/tensorflow/contrib/distribute/python/strategy_test_lib.py b/tensorflow/contrib/distribute/python/strategy_test_lib.py
index d441b5af5f6aa41efde2c75d09d9589516c54992..90f552eda4c41742f21ca276d8a059b2b102554f 100644
--- a/tensorflow/contrib/distribute/python/strategy_test_lib.py
+++ b/tensorflow/contrib/distribute/python/strategy_test_lib.py
@@ -18,7 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.distribute import distribution_strategy_context as ds_context
 from tensorflow.python.distribute import reduce_util
 from tensorflow.python.distribute import values
@@ -31,6 +34,7 @@ from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.layers import core
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
@@ -41,25 +45,26 @@ class _TestException(Exception):
   pass
 
 
-# May be the argument to either distribution.call_for_each_replica() or
+# May be the argument to either distribution.extended.call_for_each_replica() or
 # get_replica_context().merge_call()
 def _raise_exception_fn(_=None):
   raise _TestException()
 
 
-# Must be the argument to a distribution.call_for_each_replica() call, calls a
-# get_replica_context().merge_call() that raises an exception.
+# Must be the argument to a distribution.extended.call_for_each_replica() call,
+# calls a get_replica_context().merge_call() that raises an exception.
 def _merge_raises_fn():
   ds_context.get_replica_context().merge_call(_raise_exception_fn)
 
 
 # Must be the argument to a get_replica_context().merge_call() call, calls
-# dist.call_for_each_replica() with a function that raises an exception.
+# dist.extended.call_for_each_replica() with a function that raises an
+# exception.
 def _call_raises_fn(dist):
-  dist.call_for_each_replica(_raise_exception_fn)
+  dist.extended.call_for_each_replica(_raise_exception_fn)
 
 
-# Must be the argument to a distribution.call_for_each_replica() call,
+# Must be the argument to a distribution.extended.call_for_each_replica() call,
 # calls a get_replica_context().merge_call() that calls a
 # call_for_each_replica() that raises an exception.
 def _merge_call_raises_fn():
@@ -67,15 +72,16 @@ def _merge_call_raises_fn():
 
 
 # Must be the argument to a get_replica_context().merge_call() call, calls
-# dist.call_for_each_replica() with a function that calls a
+# dist.extended.call_for_each_replica() with a function that calls a
 # get_replica_context().merge_call() that raises an exception.
 def _call_merge_raises_fn(dist):
-  dist.call_for_each_replica(_merge_raises_fn)
+  dist.extended.call_for_each_replica(_merge_raises_fn)
 
 
-# Must be the argument to a distribution.call_for_each_replica() call, calls a
-# get_replica_context().merge_call() that calls a call_for_each_replica() that
-# calls a get_replica_context().merge_call() that raises an exception.
+# Must be the argument to a distribution.extended.call_for_each_replica() call,
+# calls a get_replica_context().merge_call() that calls a
+# call_for_each_replica() that calls a get_replica_context().merge_call() that
+# raises an exception.
 def _merge_call_merge_raises_fn():
   ds_context.get_replica_context().merge_call(_call_merge_raises_fn)
 
@@ -106,21 +112,21 @@ class DistributionTestBase(test.TestCase):
       def step():
         """Perform one optimization step."""
         # Run forward & backward to get gradients, variables list.
-        g_v = d.call_for_each_replica(grad_fn, args=(one,))
+        g_v = d.extended.call_for_each_replica(grad_fn, args=(one,))
 
         # Update the variables using the gradients and the update() function.
         before_list = []
         after_list = []
         for g, v in g_v:
-          fetched = d.read_var(v)
+          fetched = d.extended.read_var(v)
           before_list.append(fetched)
           # control_dependencies irrelevant but harmless in eager execution
           with ops.control_dependencies([fetched]):
             g = d.extended.reduce_to(
                 reduce_util.ReduceOp.SUM, g, destinations=v)
-            with ops.control_dependencies(d.update(
-                v, update, g, grouped=False)):
-              after_list.append(d.read_var(v))
+            with ops.control_dependencies(d.extended.update(
+                v, update, args=(g,), group=False)):
+              after_list.append(d.extended.read_var(v))
         return before_list, after_list
 
       for i in range(10):
@@ -162,20 +168,20 @@ class DistributionTestBase(test.TestCase):
       def step():
         """Perform one optimization step."""
         # Run forward & backward to get gradients, variables list.
-        g_v = d.call_for_each_replica(grad_fn, args=(one,))
+        g_v = d.extended.call_for_each_replica(grad_fn, args=(one,))
 
         # Update the variables using the gradients and the update() function.
         before_list = []
         after_list = []
         for g, v in g_v:
-          fetched = d.read_var(v)
+          fetched = d.extended.read_var(v)
           before_list.append(fetched)
           with ops.control_dependencies([fetched]):
             g = d.extended.reduce_to(
                 reduce_util.ReduceOp.SUM, g, destinations=v)
-            with ops.control_dependencies(d.update(
-                v, update, g, grouped=False)):
-              after_list.append(d.read_var(v))
+            with ops.control_dependencies(d.extended.update(
+                v, update, args=(g,), group=False)):
+              after_list.append(d.extended.read_var(v))
         return before_list, after_list
 
       before_out, after_out = step()
@@ -202,23 +208,23 @@ class DistributionTestBase(test.TestCase):
         self.assertFalse(expected_devices[replica_id])
         expected_devices[replica_id] = True
 
-      d.call_for_each_replica(mark_devices_fn)
+      d.extended.call_for_each_replica(mark_devices_fn)
       self.assertAllEqual(expected_devices,
                           [True] * len(d.extended.worker_devices))
 
   def _test_call_and_merge_exceptions(self, dist):
     with dist.scope():
       with self.assertRaises(_TestException):
-        dist.call_for_each_replica(_raise_exception_fn)
+        dist.extended.call_for_each_replica(_raise_exception_fn)
       with self.assertRaises(_TestException):
-        dist.call_for_each_replica(_merge_raises_fn)
+        dist.extended.call_for_each_replica(_merge_raises_fn)
       with self.assertRaises(_TestException):
-        dist.call_for_each_replica(_merge_call_raises_fn)
+        dist.extended.call_for_each_replica(_merge_call_raises_fn)
       with self.assertRaises(_TestException):
-        dist.call_for_each_replica(_merge_call_merge_raises_fn)
+        dist.extended.call_for_each_replica(_merge_call_merge_raises_fn)
 
   def _input_fn_to_test_input_context(self,
-                                      dataset_fn,
+                                      dataset_or_callable_fn,
                                       expected_num_replicas_in_sync,
                                       expected_num_input_pipelines,
                                       expected_input_pipeline_id):
@@ -242,33 +248,35 @@ class DistributionTestBase(test.TestCase):
         self.assertEqual(worker_id_counter[0], input_context.input_pipeline_id)
         worker_id_counter[0] += 1
 
-      return dataset_fn()
+      return dataset_or_callable_fn()
 
     return _input_fn
 
   def _test_input_fn_iterator(self, iterator, devices, expected_values,
-                              sess=None):
+                              sess=None, test_reinitialize=True):
     evaluate = lambda x: sess.run(x) if sess else self.evaluate(x)
     evaluate(iterator.initialize())
 
     for expected_value in expected_values:
       next_element = iterator.get_next()
       computed_value = evaluate(
-          [values.select_device(d, next_element) for d in devices])
+          [values.select_replica(r, next_element) for r in range(len(devices))])
       self.assertEqual(expected_value, computed_value)
 
     with self.assertRaises(errors.OutOfRangeError):
       next_element = iterator.get_next()
-      evaluate([values.select_device(d, next_element) for d in devices])
+      evaluate(
+          [values.select_replica(r, next_element) for r in range(len(devices))])
 
     # After re-initializing the iterator, should be able to iterate again.
-    evaluate(iterator.initialize())
+    if test_reinitialize:
+      evaluate(iterator.initialize())
 
-    for expected_value in expected_values:
-      next_element = iterator.get_next()
-      computed_value = evaluate(
-          [values.select_device(d, next_element) for d in devices])
-      self.assertEqual(expected_value, computed_value)
+      for expected_value in expected_values:
+        next_element = iterator.get_next()
+        computed_value = evaluate([values.select_replica(r, next_element)
+                                   for r in range(len(devices))])
+        self.assertEqual(expected_value, computed_value)
 
   def _test_global_step_update(self, strategy):
     with strategy.scope():
@@ -286,8 +294,195 @@ class DistributionTestBase(test.TestCase):
         value = global_step.read_value()
         return train_op, value
 
-      train_ops, value = strategy.call_for_each_replica(model_fn)
+      train_ops, value = strategy.extended.call_for_each_replica(model_fn)
       self.evaluate(strategy.group(train_ops))
       global_step_tensors = strategy.unwrap(value)
       global_step_values = self.evaluate(global_step_tensors)
       self.assertEqual((1,) * len(global_step_tensors), global_step_values)
+
+  def _test_numpy_iterator(self, strategy):
+    with strategy.scope(), self.cached_session() as sess:
+      x = np.asarray([[1, 2], [6, 12], [2, 4],
+                      [5, 10], [3, 6], [4, 8]])
+      y = np.asarray([5, 4, 3, 2, 1, 0])
+      batch_size = 6
+      if not strategy.extended._global_batch_size:  # pylint: disable=protected-access
+        batch_size = batch_size // strategy.num_replicas_in_sync
+      i = strategy.experimental_make_numpy_iterator(
+          (x, y), batch_size=batch_size, num_epochs=2, shuffle=None,
+          session=sess)
+      self.evaluate(i.initialize())
+
+      def run_and_concatenate(strategy, i):
+        x, y = strategy.experimental_run(lambda z: z, i)
+        x, y = self.evaluate((strategy.unwrap(x), strategy.unwrap(y)))
+        return np.concatenate(x), np.concatenate(y)
+
+      x_1, y_1 = run_and_concatenate(strategy, i)
+      self.assertAllEqual(x, x_1)
+      self.assertAllEqual(y, y_1)
+      x_2, y_2 = run_and_concatenate(strategy, i)
+      self.assertAllEqual(x, x_2)
+      self.assertAllEqual(y, y_2)
+      with self.assertRaises(errors.OutOfRangeError):
+        run_and_concatenate(strategy, i)
+
+
+class OneDeviceDistributionTestBase(test.TestCase):
+  """Some tests that should work with any one-device DistributionStrategy."""
+
+  def _test_all_reduce_sum(self, strategy):
+    self._test_collective_comms(
+        strategy, _all_sum, inputs=(4., [42., 43.]), expected=(4., [42., 43.]))
+
+  def _test_all_reduce_sum_gradients(self, strategy):
+    self._test_collective_comms_gradients(
+        strategy, _all_sum, inputs=[4.], expected_grads=[4.])
+
+  def _test_all_reduce_sum_gradient_tape(self, strategy):
+    self._test_collective_comms_gradient_tape(
+        strategy, _all_sum, inputs=[4.], expected_grads=[4.])
+
+  def _test_all_reduce_mean(self, strategy):
+    self._test_collective_comms(
+        strategy, _all_mean, inputs=(2., [21., 22.]), expected=(2., [21., 22.]))
+
+  def _test_all_reduce_mean_gradients(self, strategy):
+    self._test_collective_comms_gradients(
+        strategy, _all_mean, inputs=[5.], expected_grads=[5.])
+
+  def _test_all_reduce_mean_gradient_tape(self, strategy):
+    self._test_collective_comms_gradient_tape(
+        strategy, _all_mean, inputs=[5.], expected_grads=[5.])
+
+  def _test_collective_comms(self, strategy, comm_fn, inputs, expected):
+    inputs = strategy.make_input_fn_iterator(
+        lambda _: dataset_ops.Dataset.from_tensors(inputs))
+
+    self.evaluate(inputs.initialize())
+    outputs = self.evaluate(
+        list(map(strategy.unwrap, strategy.experimental_run(comm_fn, inputs))))
+    self.assertAllEqual([expected[0]], outputs[0])
+    self.assertAllEqual([expected[1]], outputs[1])
+
+  def _test_collective_comms_gradients(
+      self, strategy, comm_fn, inputs, expected_grads):
+    if context.executing_eagerly():
+      self.skipTest("`tf.gradients` is not supported with eager execution.")
+
+    def step(c):
+      x = constant_op.constant(42.)
+      y = comm_fn(x) * c
+      return gradients_impl.gradients(y, [x])[0]
+
+    inputs = strategy.make_input_fn_iterator(
+        lambda _: dataset_ops.Dataset.from_tensors(inputs))
+
+    self.evaluate(inputs.initialize())
+    self.assertAllEqual(
+        expected_grads,
+        self.evaluate(strategy.unwrap(strategy.experimental_run(step, inputs))))
+
+  def _test_collective_comms_gradient_tape(
+      self, strategy, comm_fn, inputs, expected_grads):
+    def step(c):
+      x = constant_op.constant(42.)
+      with backprop.GradientTape() as tape:
+        tape.watch(x)
+        y = comm_fn(x) * c
+      return tape.gradient(y, x)
+
+    inputs = strategy.make_input_fn_iterator(
+        lambda _: dataset_ops.Dataset.from_tensors(inputs))
+
+    self.evaluate(inputs.initialize())
+    self.assertAllEqual(
+        expected_grads,
+        self.evaluate(strategy.unwrap(strategy.experimental_run(step, inputs))))
+
+
+class TwoDeviceDistributionTestBase(test.TestCase):
+  """Some tests that should work with any two-device DistributionStrategy."""
+
+  def _test_all_reduce_sum(self, strategy):
+    self._test_collective_comms(
+        strategy, _all_sum,
+        inputs=([1., 3.], [[39., 2.], [3., 41.]]),
+        expected=(4., [42., 43.]))
+
+  def _test_all_reduce_sum_gradients(self, strategy):
+    self._test_collective_comms_gradients(
+        strategy, _all_sum, inputs=[1., 3.], expected_grads=[4., 4.])
+
+  def _test_all_reduce_sum_gradient_tape(self, strategy):
+    self._test_collective_comms_gradient_tape(
+        strategy, _all_sum, inputs=[1., 3.], expected_grads=[4., 4.])
+
+  def _test_all_reduce_mean(self, strategy):
+    self._test_collective_comms(
+        strategy, _all_mean,
+        inputs=([1., 3.], [[39., 2.], [3., 41.]]),
+        expected=(2., [21., 21.5]))
+
+  def _test_all_reduce_mean_gradients(self, strategy):
+    self._test_collective_comms_gradients(
+        strategy, _all_mean, inputs=[1., 3.], expected_grads=[2., 2.])
+
+  def _test_all_reduce_mean_gradient_tape(self, strategy):
+    self._test_collective_comms_gradient_tape(
+        strategy, _all_mean, inputs=[1., 3.], expected_grads=[2., 2.])
+
+  def _test_collective_comms(self, strategy, comm_fn, inputs, expected):
+    inputs = strategy.make_input_fn_iterator(
+        lambda _: dataset_ops.Dataset.from_tensor_slices(inputs))
+
+    self.evaluate(inputs.initialize())
+    outputs = self.evaluate(
+        list(map(strategy.unwrap, strategy.experimental_run(comm_fn, inputs))))
+    self.assertAllEqual([expected[0], expected[0]], outputs[0])
+    self.assertAllEqual([expected[1], expected[1]], outputs[1])
+
+  def _test_collective_comms_gradients(
+      self, strategy, comm_fn, inputs, expected_grads):
+    if context.executing_eagerly():
+      self.skipTest("`tf.gradients` is not supported with eager execution.")
+
+    def step(c):
+      x = constant_op.constant(42.)
+      y = comm_fn(x) * c
+      return gradients_impl.gradients(y, [x])[0]
+
+    inputs = strategy.make_input_fn_iterator(
+        lambda _: dataset_ops.Dataset.from_tensor_slices(inputs))
+
+    self.evaluate(inputs.initialize())
+    self.assertAllEqual(
+        expected_grads,
+        self.evaluate(strategy.unwrap(strategy.experimental_run(step, inputs))))
+
+  def _test_collective_comms_gradient_tape(
+      self, strategy, comm_fn, inputs, expected_grads):
+    def step(c):
+      x = constant_op.constant(42.)
+      with backprop.GradientTape() as tape:
+        tape.watch(x)
+        y = comm_fn(x) * c
+      return tape.gradient(y, x)
+
+    inputs = strategy.make_input_fn_iterator(
+        lambda _: dataset_ops.Dataset.from_tensor_slices(inputs))
+
+    self.evaluate(inputs.initialize())
+    self.assertAllEqual(
+        expected_grads,
+        self.evaluate(strategy.unwrap(strategy.experimental_run(step, inputs))))
+
+
+def _all_sum(value):
+  ctx = ds_context.get_replica_context()
+  return ctx.all_reduce(reduce_util.ReduceOp.SUM, value)
+
+
+def _all_mean(value):
+  ctx = ds_context.get_replica_context()
+  return ctx.all_reduce(reduce_util.ReduceOp.MEAN, value)
diff --git a/tensorflow/contrib/distribute/python/tpu_strategy.py b/tensorflow/contrib/distribute/python/tpu_strategy.py
index b6f5b492017fc7dfd329e69ad9ca418ae682bc4b..2d9d221f427422f8bbeba55c5644658af9a9a620 100644
--- a/tensorflow/contrib/distribute/python/tpu_strategy.py
+++ b/tensorflow/contrib/distribute/python/tpu_strategy.py
@@ -21,10 +21,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import copy
-import functools
 
 from tensorflow.contrib.tpu.python.ops import tpu_ops
+from tensorflow.contrib.tpu.python.tpu import device_assignment as device_assignment_lib
+from tensorflow.contrib.tpu.python.tpu import functional as tpu_functional_ops
+from tensorflow.contrib.tpu.python.tpu import topology
 from tensorflow.contrib.tpu.python.tpu import tpu
 from tensorflow.contrib.tpu.python.tpu import tpu_system_metadata as tpu_system_metadata_lib
 from tensorflow.contrib.tpu.python.tpu import training_loop
@@ -33,11 +36,16 @@ from tensorflow.python.client import session as session_lib
 from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib
 from tensorflow.python.distribute import device_util
 from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import input_lib
+from tensorflow.python.distribute import numpy_dataset
 from tensorflow.python.distribute import reduce_util
 from tensorflow.python.distribute import values
+from tensorflow.python.distribute.cluster_resolver import TPUClusterResolver
 from tensorflow.python.eager import context
+from tensorflow.python.eager import function
 from tensorflow.python.eager import tape
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import device as tf_device
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
@@ -46,9 +54,58 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import compat
 from tensorflow.python.util import nest
 
 
+def initialize_tpu_system(cluster_resolver=None):
+  """Initialize the TPU devices in a separate session and graph.
+
+  Args:
+    cluster_resolver: A tf.contrib.cluster_resolver.TPUClusterResolver,
+        which provides information about the TPU cluster.
+  Returns:
+    The tf.contrib.tpu.Topology object for the topology of the TPU cluster.
+  """
+  if cluster_resolver is None:
+    cluster_resolver = TPUClusterResolver("")
+  master = cluster_resolver.master()
+
+  logging.info("Initializing the TPU system.")
+
+  if context.executing_eagerly():
+    # This function looks as it is for the following non-intuitive reasons.
+    # tpu.initialize_system creates a dummy op whose sole purpose is to trigger
+    # DistributedTPURewritePass. This pass actually adds real ops that
+    # initialize the TPU system. Thus, we can't simply run tpu.initialize_system
+    # eagerly. We need to wrap it in defun and trigger the rewrite passes on it.
+    # The easiest way to trigger a rewrite is to run the function with
+    # TPUPartitionedCallOp.
+    @function.defun
+    def _tpu_init_fn():
+      return tpu.initialize_system()
+
+    # We can't call _tpu_init_fn normally (because it contains just a dummy op,
+    # see above) but need to define it to get it added to eager context
+    # and get its assigned name.
+    # pylint: disable=protected-access
+    graph_func = _tpu_init_fn._get_concrete_function_internal()
+    func_name = compat.as_str(graph_func._inference_function.name)
+    # pylint: enable=protected-access
+
+    output = tpu_functional_ops.TPUPartitionedCall(
+        args=[], device_ordinal=0, Tout=[dtypes.string], f=func_name)
+    serialized_topology = output[0].numpy()
+  else:
+    session_config = config_pb2.ConfigProto(allow_soft_placement=True)
+    with ops.Graph().as_default():
+      with session_lib.Session(config=session_config, target=master) as sess:
+        serialized_topology = sess.run(tpu.initialize_system())
+
+  logging.info("Finished initializing TPU system.")
+  return topology.Topology(serialized=serialized_topology)
+
+
 def get_tpu_system_metadata(tpu_cluster_resolver):
   """Retrieves TPU system metadata given a TPUClusterResolver."""
   master = tpu_cluster_resolver.master()
@@ -66,13 +123,14 @@ def get_tpu_system_metadata(tpu_cluster_resolver):
 
 
 # TODO(jhseu): Deduplicate with MirroredStrategy?
-def _create_tpu_mirrored_variable(devices, real_mirrored_creator, *args,
-                                  **kwargs):  # pylint: disable=g-missing-docstring
+def _create_tpu_mirrored_variable(  # pylint: disable=missing-docstring
+    strategy, device_map, logical_device, real_mirrored_creator,
+    *args, **kwargs):
   # Figure out what collections this variable should be added to.
   # We'll add the TPUMirroredVariable to those collections instead.
-  collections = kwargs.pop("collections", None)
-  if collections is None:
-    collections = [ops.GraphKeys.GLOBAL_VARIABLES]
+  var_collections = kwargs.pop("collections", None)
+  if var_collections is None:
+    var_collections = [ops.GraphKeys.GLOBAL_VARIABLES]
   kwargs["collections"] = []
 
   # TODO(jhseu): Should we have different behavior for different
@@ -97,10 +155,13 @@ def _create_tpu_mirrored_variable(devices, real_mirrored_creator, *args,
   # was never recorded on the tape instead of having to do this manually
   # here.
   with tape.stop_recording():
-    index = real_mirrored_creator(devices, *args, **kwargs)
-    result = values.TPUMirroredVariable(index, index[devices[0]], aggregation)
+    devices = device_map.logical_to_actual_devices(logical_device)
+    value_list = real_mirrored_creator(devices, *args, **kwargs)
+    result = values.TPUMirroredVariable(
+        strategy, device_map, value_list, aggregation,
+        logical_device=logical_device)
 
-  if not context.executing_eagerly():
+  if not (context.executing_eagerly() or ops.inside_function()):
     g = ops.get_default_graph()
     # If "trainable" is True, next_creator() will add the member variables
     # to the TRAINABLE_VARIABLES collection, so we manually remove
@@ -108,18 +169,21 @@ def _create_tpu_mirrored_variable(devices, real_mirrored_creator, *args,
     # "trainable" to False for next_creator() since that causes functions
     # like implicit_gradients to skip those variables.
     if kwargs.get("trainable", True):
-      collections.append(ops.GraphKeys.TRAINABLE_VARIABLES)
+      var_collections.append(ops.GraphKeys.TRAINABLE_VARIABLES)
       l = g.get_collection_ref(ops.GraphKeys.TRAINABLE_VARIABLES)
-      for v in index.values():
+      for v in value_list:
         l.remove(v)
-    g.add_to_collections(collections, result)
+    g.add_to_collections(var_collections, result)
   return result
 
 
 class TPUStrategy(distribute_lib.DistributionStrategy):
   """TPU distribution strategy implementation."""
 
-  def __init__(self, tpu_cluster_resolver, steps_per_run, num_cores=None):
+  def __init__(self,
+               tpu_cluster_resolver=None,
+               steps_per_run=None,
+               device_assignment=None):
     """Initializes the TPUStrategy object.
 
     Args:
@@ -130,72 +194,124 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
           metrics, summaries etc.
           This parameter is only used when Distribution Strategy is used with
           estimator or keras.
-      num_cores: Number of cores to use on the TPU. If None specified, then
-          auto-detect the cores and topology of the TPU system.
+      device_assignment: Optional `tf.contrib.tpu.DeviceAssignment` to specify
+          the placement of replicas on the TPU cluster. Currently only supports
+          the usecase of using a single core within a TPU cluster.
     """
     super(TPUStrategy, self).__init__(TPUExtended(
-        self, tpu_cluster_resolver, steps_per_run, num_cores))
+        self, tpu_cluster_resolver, steps_per_run, device_assignment))
 
   @property
   def steps_per_run(self):
     """DEPRECATED: use .extended.steps_per_run instead."""
     return self._extended.steps_per_run
 
+  # TODO(cjfj): Modify `_call_for_each_replica` in `TPUExtended` such that this
+  # can use the default implementation.
+  # This implementation runs a single step. It does not use infeed or outfeed.
+  def experimental_run(self, fn, input_iterator=None):
+    """See base class."""
+    if context.executing_eagerly() and not ops.inside_function():
+      raise NotImplementedError(
+          "Eager mode not supported in TPUStrategy outside TF functions.")
+
+    if input_iterator is None:
+      inputs = []
+    else:
+      inputs = input_iterator.get_next()
+
+    result = [None]
+    def replicated_fn(replica_id, replica_input):
+      """Wraps user function to provide replica ID and `Tensor` inputs."""
+      with _TPUReplicaContext(self, replica_id_in_sync_group=replica_id):
+        if input_iterator is None:
+          result[0] = fn()
+        else:
+          result[0] = fn(replica_input)
+      return result[0]
+
+    replicate_inputs = []  # By replica.
+    for i in range(self.num_replicas_in_sync):
+      replicate_inputs.append(
+          [constant_op.constant(i, dtype=dtypes.int32),
+           values.select_replica(i, inputs)])
+
+    with self.scope():
+      replicate_outputs = tpu.replicate(replicated_fn, replicate_inputs)
+
+    # Workaround for `tpu.replicate` behaviour when single `Tensor` returned.
+    replicate_outputs = [
+        nest.pack_sequence_as(result[0], nest.flatten(replica_outputs))
+        for replica_outputs in replicate_outputs]
+
+    device_map = self.extended._device_map  # pylint: disable=protected-access
+    return values.regroup(device_map, replicate_outputs)
+
 
 class TPUExtended(distribute_lib.DistributionStrategyExtended):
   """Implementation of TPUStrategy."""
 
-  # Track what TPU devices have been initialized.
-  _initialized_devices = []
-
-  def __init__(self, container_strategy, tpu_cluster_resolver, steps_per_run,
-               num_cores=None):
+  def __init__(self,
+               container_strategy,
+               tpu_cluster_resolver=None,
+               steps_per_run=None,
+               device_assignment=None):
     super(TPUExtended, self).__init__(container_strategy)
+
+    if tpu_cluster_resolver is None:
+      tpu_cluster_resolver = TPUClusterResolver("")
+
+    if steps_per_run is None:
+      # TODO(frankchn): Warn when we are being used by DS/Keras and this is
+      # not specified.
+      steps_per_run = 1
+
     self._tpu_cluster_resolver = tpu_cluster_resolver
     self._tpu_metadata = get_tpu_system_metadata(self._tpu_cluster_resolver)
-    # TODO(sourabhbajaj): Change this from num_cores to metadata_override
-    self._num_cores_override = num_cores
+    self._device_assignment = device_assignment
+
+    # Device assignment is currently only supported for 1 core case.
+    if self._device_assignment:
+      assert isinstance(self._device_assignment,
+                        device_assignment_lib.DeviceAssignment)
+      if self._device_assignment.num_replicas != 1:
+        raise ValueError("Device assignment is only supported for a single "
+                         "core single replica case currently.")
+      if self._device_assignment.num_cores_per_replica != 1:
+        raise ValueError("Device assignment is only supported for a single "
+                         "core single replica case currently.")
+      if not all(self._device_assignment.core_assignment[0][0] == [0, 0, 0]):
+        raise ValueError("Device assignment is only supported for a single "
+                         "core single replica case currently.")
 
     # TODO(jhseu): Switch to DeviceAssignment to support pods and model
     # parallelism.
-    device_map = {d.name: i for i, d in enumerate(self._tpu_metadata.devices)
-                  if "device:TPU:" in d.name}
-    self._device_index = values.PerReplica(device_map)
+    self._device_index = {
+        d.name: i for i, d in enumerate(self._tpu_metadata.devices)
+        if "device:TPU:" in d.name
+    }
     self._host_device = self.get_host_cpu_device(0)
-    self._tpu_devices = tuple(sorted(device_map.keys()))
+    self._tpu_devices = tuple(sorted(self._device_index.keys()))
     # Only create variables for the number of replicas we're running.
     self._tpu_devices = self._tpu_devices[:self._num_replicas_in_sync]
+    self._device_map = values.ReplicaDeviceMap(self._tpu_devices)
+
+    # Preload the data onto the TPUs.
+    input_worker_devices = collections.OrderedDict()
+    for tpu_device in self._tpu_devices:
+      host_device = _get_host_for_device(tpu_device)
+      input_worker_devices.setdefault(host_device, [])
+      input_worker_devices[host_device].append(tpu_device)
+    self._input_workers = input_lib.InputWorkers(
+        self._device_map, tuple(input_worker_devices.items()))
 
     # TODO(sourabhbajaj): Remove this once performance of running one step
     # at a time is comparable to multiple steps.
     self.steps_per_run = steps_per_run
     self._require_static_shapes = True
 
-    # Initialize the TPU devices.
-    self._initialize_tpu()
-
-  def _initialize_tpu(self):
-    """Initialize the TPU devices in a separate session and graph.
-
-    We keep track of all the TPU devices that we're initialized as we should
-    only be running TPU initialize once for the entire process.
-    """
-    master = self._tpu_cluster_resolver.master()
-    # Verify TPU has not already been initialized in this process.
-    if master in TPUExtended._initialized_devices:
-      logging.info("TPU master %s has already been initialized." % master)
-      return
-
-    logging.info("Initializing the TPU system.")
-    session_config = config_pb2.ConfigProto(allow_soft_placement=True)
-    self._configure(session_config)
-    with ops.Graph().as_default():
-      with session_lib.Session(config=session_config, target=master) as sess:
-        sess.run([tpu.initialize_system()])
-    logging.info("Finized initializing TPU system.")
-
-    # Update Strategy state to make sure we can track device initialization.
-    TPUExtended._initialized_devices.append(master)
+  def _validate_colocate_with_variable(self, colocate_with_variable):
+    values.validate_colocate_tpu_variable(colocate_with_variable, self)
 
   def _get_enqueue_op_per_host(self, host_id, multi_worker_iterator,
                                input_shapes, iterations):
@@ -260,21 +376,27 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
 
   def _make_dataset_iterator(self, dataset):
     """Make iterators for each of the TPU hosts."""
-
-    worker_devices = [
-        (self.get_host(hid), [self.get_host_cpu_device(hid)])
-        for hid in range(self.num_hosts)
-    ]
-    return values.DatasetIterator(dataset, worker_devices,
-                                  self._num_replicas_in_sync)
-
-  def _distribute_dataset(self, dataset_fn):
-    worker_devices = [
-        (self.get_host(hid), [self.get_host_cpu_device(hid)])
-        for hid in range(self.num_hosts)
-    ]
-    return values.MultiWorkerDataset(
-        functools.partial(self._call_dataset_fn, dataset_fn), worker_devices)
+    return input_lib.DatasetIterator(dataset, self._input_workers,
+                                     self._num_replicas_in_sync)
+
+  def _make_input_fn_iterator(
+      self,
+      input_fn,
+      replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
+    input_contexts = []
+    num_workers = self._input_workers.num_workers
+    for i in range(num_workers):
+      input_contexts.append(distribute_lib.InputContext(
+          num_input_pipelines=num_workers,
+          input_pipeline_id=i,
+          num_replicas_in_sync=self._num_replicas_in_sync))
+    return input_lib.InputFunctionIterator(
+        input_fn, self._input_workers, input_contexts)
+
+  def _experimental_make_numpy_dataset(self, numpy_input, session):
+    return numpy_dataset.one_host_numpy_dataset(
+        numpy_input, numpy_dataset.SingleDevice(self.get_host_cpu_device(0)),
+        session)
 
   # TODO(priyag): Deal with OutOfRange errors once b/111349762 is fixed.
   # TODO(sourabhbajaj): Remove the initial_loop_values parameter when we have
@@ -288,29 +410,16 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
           "TPU currently requires fully defined shapes. Either use "
           "set_shape() on the input tensors or use "
           "dataset.batch(..., drop_remainder=True).")
-    types = nest.flatten(multi_worker_iterator.output_types)
-
-    enqueue_ops = [
-        self._get_enqueue_op_per_host(host_id, multi_worker_iterator, shapes,
-                                      iterations)
-        for host_id in range(self.num_hosts)]
-
-    def dequeue_fn():
-      dequeued = tpu_ops.infeed_dequeue_tuple(dtypes=types, shapes=shapes)
-      return nest.pack_sequence_as(output_shapes, dequeued)
 
     # Wrap `fn` for repeat.
     if initial_loop_values is None:
       initial_loop_values = {}
     initial_loop_values = nest.flatten(initial_loop_values)
-    ctx = values.MultiStepContext()
+    ctx = input_lib.MultiStepContext()
 
-    def run_fn():
+    def run_fn(inputs):
       """Single step on the TPU device."""
-      fn_inputs = dequeue_fn()
-      if not isinstance(fn_inputs, tuple):
-        fn_inputs = (fn_inputs,)
-      fn_result = fn(ctx, fn_inputs)
+      fn_result = fn(ctx, inputs)
       flat_last_step_outputs = nest.flatten(ctx.last_step_outputs)
       if flat_last_step_outputs:
         with ops.control_dependencies([fn_result]):
@@ -330,7 +439,14 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
     def rewrite_fn(*args):
       """The rewritten step fn running on TPU."""
       del args
-      replicate_inputs = [[]] * self._num_replicas_in_sync
+
+      per_replica_inputs = multi_worker_iterator.get_next()
+      replicate_inputs = []
+      for replica_id in range(self._num_replicas_in_sync):
+        select_replica = lambda x: values.select_replica(replica_id, x)  # pylint: disable=cell-var-from-loop
+        replicate_inputs.append((nest.map_structure(
+            select_replica, per_replica_inputs),))
+
       replicate_outputs = tpu.replicate(run_fn, replicate_inputs)
 
       # If run_fn has tensor outputs, tpu.replicate returns a list of list. We
@@ -342,8 +458,8 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
 
       return replicate_outputs
 
-    # TODO(sourabhbajaj): The input to while loop should be based on the output
-    # type of the step_fn
+    # TODO(sourabhbajaj): The input to while loop should be based on the
+    # output type of the step_fn
     assert isinstance(initial_loop_values, list)
     initial_loop_values = initial_loop_values * self._num_replicas_in_sync
 
@@ -353,7 +469,7 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
                                                initial_loop_values)
 
     del self._outer_control_flow_context
-    ctx.run_op = control_flow_ops.group(replicate_outputs, enqueue_ops)
+    ctx.run_op = control_flow_ops.group(replicate_outputs)
 
     if isinstance(replicate_outputs, list):
       # Filter out any ops from the outputs, typically this would be the case
@@ -378,23 +494,7 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
       # no tensors returned.
       last_step_tensor_outputs = []
 
-    # Convert replicate_outputs to the original dict structure of
-    # last_step_outputs.
-    last_step_tensor_outputs_dict = nest.pack_sequence_as(
-        ctx.last_step_outputs, last_step_tensor_outputs)
-
-    for name, reduce_op in ctx._last_step_outputs_reduce_ops.items():  # pylint: disable=protected-access
-      output = last_step_tensor_outputs_dict[name]
-      # For outputs that have already been reduced, take the first value
-      # from the list as each value should be the same. Else return the full
-      # list of values.
-      # TODO(josh11b): If reduce_op is NONE, we should return a PerReplica
-      # value.
-      if reduce_op is not None:
-        # TODO(priyag): Should this return the element or a list with 1 element
-        last_step_tensor_outputs_dict[name] = output[0]
-    ctx._set_last_step_outputs(last_step_tensor_outputs_dict)  # pylint: disable=protected-access
-
+    _set_last_step_outputs(ctx, last_step_tensor_outputs)
     return ctx
 
   def _call_for_each_replica(self, fn, args, kwargs):
@@ -403,57 +503,57 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
     with _TPUReplicaContext(self._container_strategy()):
       return fn(*args, **kwargs)
 
-  def _initialize(self):
-    if context.executing_eagerly():
-      # TODO(priyag): Add appopriate call here when eager is supported for TPUs.
-      raise NotImplementedError("Eager mode not supported in TPUStrategy.")
-    else:
-      return []
+  def _experimental_initialize_system(self):
+    """Experimental method added to be used by Estimator.
 
-  def _finalize(self):
-    if context.executing_eagerly():
-      # TODO(priyag): Add appopriate call here when eager is supported for TPUs.
-      raise NotImplementedError("Eager mode not supported in TPUStrategy.")
-    else:
-      return []
-
-  def _get_devices_from(self, colocate_with=None):
-    # TODO(jhseu): Change this when we support model parallelism.
-    return self._tpu_devices
+    This is a private method only to be used by Estimator. Other frameworks
+    should directly be calling `tf.contrib.distribute.initialize_tpu_system`
+    """
+    initialize_tpu_system(self._tpu_cluster_resolver)
 
   def _create_variable(self, next_creator, *args, **kwargs):
     """Create a TPUMirroredVariable. See `DistributionStrategy.scope`."""
     colocate_with = kwargs.pop("colocate_with", None)
-    devices = self._get_devices_from(colocate_with)
+    if colocate_with is None:
+      device_map = self._device_map
+      logical_device = 0  # TODO(josh11b): Get logical device from scope here.
+    elif isinstance(colocate_with, numpy_dataset.SingleDevice):
+      with ops.device(colocate_with.device):
+        return next_creator(*args, **kwargs)
+    else:
+      device_map = colocate_with.device_map
+      logical_device = colocate_with.logical_device
 
     def _real_mirrored_creator(devices, *args, **kwargs):  # pylint: disable=g-missing-docstring
-      index = {}
+      value_list = []
       for i, d in enumerate(devices):
         with ops.device(d):
           if i > 0:
             # Give replicas meaningful distinct names:
-            var0name = index[devices[0]].name.split(":")[0]
+            var0name = value_list[0].name.split(":")[0]
             # We append a / to variable names created on replicas with id > 0 to
             # ensure that we ignore the name scope and instead use the given
             # name as the absolute name of the variable.
             kwargs["name"] = "%s/replica_%d/" % (var0name, i)
             # Initialize replicas with the same value:
-            if context.executing_eagerly():
-              kwargs["initial_value"] = array_ops.identity(
-                  index[devices[0]].value())
+            if context.executing_eagerly() or ops.inside_function():
+              with ops.init_scope():
+                kwargs["initial_value"] = array_ops.identity(
+                    value_list[0].value())
             else:
               def initial_value_fn(device=d):
                 with ops.device(device):
-                  return array_ops.identity(index[devices[0]].initial_value)
+                  return array_ops.identity(value_list[0].initial_value)
               kwargs["initial_value"] = initial_value_fn
           with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
             v = next_creator(*args, **kwargs)
           assert not isinstance(v, values.TPUMirroredVariable)
-          index[d] = v
-      return index
+          value_list.append(v)
+      return value_list
 
-    return _create_tpu_mirrored_variable(devices, _real_mirrored_creator, *args,
-                                         **kwargs)
+    return _create_tpu_mirrored_variable(
+        self._container_strategy(), device_map, logical_device,
+        _real_mirrored_creator, *args, **kwargs)
 
   def _reduce_to(self, reduce_op, value, destinations):
     if values._enclosing_tpu_context() is not None:  # pylint: disable=protected-access
@@ -465,19 +565,32 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
             "Currently only support sum & mean in TPUStrategy.")
       return tpu_ops.cross_replica_sum(value)
 
-    # Validate that the destination is same as the host device
-    # Note we don't do this when in replicate context as the reduction is
-    # performed on the TPU device itself.
+    if not isinstance(value, values.DistributedValues):
+      # This function handles reducing values that are not PerReplica or
+      # Mirrored values. For example, the same value could be present on all
+      # replicas in which case `value` would be a single value or value could
+      # be 0.
+      return cross_device_ops_lib.reduce_non_distributed_value(
+          reduce_op, self._device_map, value, destinations)
+
     devices = cross_device_ops_lib.get_devices_from(destinations)
-    if len(devices) == 1:
-      assert device_util.canonicalize(devices[0]) == device_util.canonicalize(
-          self._host_device)
-    else:
+    if len(devices) != 1:
       raise ValueError("Multiple devices are not supported for TPUStrategy")
 
-    output = math_ops.add_n(value)
-    if reduce_op == reduce_util.ReduceOp.MEAN:
-      return output * (1. / len(value))
+    # Always performs the reduction on the TPU host.
+    with ops.device(self._host_device):
+      output = math_ops.add_n(value.values)
+      if reduce_op == reduce_util.ReduceOp.MEAN:
+        output *= (1. / len(value.values))
+
+    # If necessary, copy to requested destination.
+    dest_canonical = device_util.canonicalize(devices[0])
+    host_canonical = device_util.canonicalize(self._host_device)
+
+    if dest_canonical != host_canonical:
+      with ops.device(devices[0]):
+        output = array_ops.identity(output)
+
     return output
 
   def _update(self, var, fn, args, kwargs, group):
@@ -486,19 +599,19 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
       if group:
         return fn(var, *args, **kwargs)
       else:
-        return [fn(var, *args, **kwargs)]
+        return (fn(var, *args, **kwargs),)
 
     # Otherwise, we revert to MirroredStrategy behavior and update each variable
     # directly.
-    updates = {}
-    for d, v in var._index.items():  # pylint: disable=protected-access
-      name = "update_%d" % self._device_index.get(d)
+    updates = []
+    for i, (d, v) in enumerate(zip(var.devices, var.values)):
+      name = "update_%d" % i
       with ops.device(d), distribute_lib.UpdateContext(d), ops.name_scope(name):
         # If args and kwargs are not mirrored, the value is returned as is.
-        updates[d] = fn(v,
-                        *values.select_device_mirrored(d, args),
-                        **values.select_device_mirrored(d, kwargs))
-    return values.update_regroup(self, updates, group)
+        updates.append(fn(v,
+                          *values.select_device_mirrored(d, args),
+                          **values.select_device_mirrored(d, kwargs)))
+    return values.update_regroup(self, self._device_map, updates, group)
 
   def read_var(self, var):
     assert isinstance(var, values.TPUMirroredVariable)
@@ -513,6 +626,11 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
       # be represented using a PerReplica wrapper instead of a list with
       # one entry per device.
       return tuple(val)
+    elif isinstance(val, values.TPUMirroredVariable):
+      # pylint: disable=protected-access
+      if values._enclosing_tpu_context() is not None:
+        return (val,)
+      return val.values
     return (val,)
 
   def value_container(self, value):
@@ -524,15 +642,34 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
 
   @property
   def num_hosts(self):
-    return self._tpu_metadata.num_hosts
+    if self._device_assignment is None:
+      return self._tpu_metadata.num_hosts
+
+    return len(set([self._device_assignment.host_device(r)
+                    for r in range(self._device_assignment.num_replicas)]))
 
   @property
   def num_replicas_per_host(self):
-    return self._tpu_metadata.num_of_cores_per_host
+    if self._device_assignment is None:
+      return self._tpu_metadata.num_of_cores_per_host
+
+    # TODO(sourabhbajaj): Remove this method we use inputs and remove infeed
+    # as the computation of num_replicas_per_host is not a constant
+    # when using device_assignment. This is a temporary workaround to support
+    # StatefulRNN as everything is 1 in that case.
+    # This method needs to take host_id as input for correct computation.
+    max_models_per_host = (self._tpu_metadata.num_of_cores_per_host //
+                           self._device_assignment.num_cores_per_replica)
+    models_per_host = min(self._device_assignment.num_replicas,
+                          max_models_per_host)
+    return models_per_host * self._device_assignment.num_cores_per_replica
 
   @property
   def _num_replicas_in_sync(self):
-    return self._num_cores_override or self._tpu_metadata.num_cores
+    if self._device_assignment is None:
+      return self._tpu_metadata.num_cores
+    return (self._device_assignment.num_replicas *
+            self._device_assignment.num_cores_per_replica)
 
   @property
   def experimental_between_graph(self):
@@ -600,23 +737,62 @@ class TPUExtended(distribute_lib.DistributionStrategyExtended):
   # TODO(priyag): Delete this once all strategies use global batch size.
   @property
   def _global_batch_size(self):
+    """`make_dataset_iterator` and `make_numpy_iterator` use global batch size.
+
+    `make_input_fn_iterator` assumes per-replica batching.
+
+    Returns:
+      Boolean.
+    """
     return True
 
 
 class _TPUReplicaContext(distribute_lib.ReplicaContext):
   """Replication Context class for TPU Strategy."""
 
-  # TODO(sourabhbajaj): Call for each tower should be updating this.
-  def __init__(self, distribution_strategy):
+  # TODO(sourabhbajaj): Call for each replica should be updating this.
+  # TODO(b/118385803): Always properly initialize replica_id.
+  def __init__(self, strategy, replica_id_in_sync_group=None):
+    if replica_id_in_sync_group is None:
+      replica_id_in_sync_group = constant_op.constant(0, dtypes.int32)
     distribute_lib.ReplicaContext.__init__(
-        self,
-        distribution_strategy,
-        # TODO(b/118385803): properly initialize replica_id, instead of always 0
-        replica_id_in_sync_group=constant_op.constant(0, dtypes.int32))
+        self, strategy, replica_id_in_sync_group=replica_id_in_sync_group)
 
   @property
   def devices(self):
     distribute_lib.require_replica_context(self)
-    ds = self._distribution_strategy
+    ds = self._strategy
     replica_id = tensor_util.constant_value(self._replica_id_in_sync_group)
-    return (ds.extended.worker_devices[replica_id],)
+
+    if replica_id is None:  # Non-constant `Tensor` inside `tpu.replicate`.
+      # TODO(cjfj): Return other devices when model parallelism is supported.
+      return (tpu.core(0),)
+    else:
+      return (ds.extended.worker_devices[replica_id],)
+
+
+def _get_host_for_device(device):
+  spec = tf_device.DeviceSpec.from_string(device)
+  return tf_device.DeviceSpec(
+      job=spec.job, replica=spec.replica, task=spec.task,
+      device_type="CPU", device_index=0).to_string()
+
+
+def _set_last_step_outputs(ctx, last_step_tensor_outputs):
+  """Sets the last step outputs on the given context."""
+  # Convert replicate_outputs to the original dict structure of
+  # last_step_outputs.
+  last_step_tensor_outputs_dict = nest.pack_sequence_as(
+      ctx.last_step_outputs, last_step_tensor_outputs)
+
+  for name, reduce_op in ctx._last_step_outputs_reduce_ops.items():  # pylint: disable=protected-access
+    output = last_step_tensor_outputs_dict[name]
+    # For outputs that have already been reduced, take the first value
+    # from the list as each value should be the same. Else return the full
+    # list of values.
+    # TODO(josh11b): If reduce_op is NONE, we should return a PerReplica
+    # value.
+    if reduce_op is not None:
+      # TODO(priyag): Should this return the element or a list with 1 element
+      last_step_tensor_outputs_dict[name] = output[0]
+  ctx._set_last_step_outputs(last_step_tensor_outputs_dict)  # pylint: disable=protected-access
diff --git a/tensorflow/contrib/distribute/python/values_test.py b/tensorflow/contrib/distribute/python/values_test.py
index 538b859f3d1ece55b460f6dbf8f01540a6013381..9fd251175b8b8e3453e33434b4d86386a078295e 100644
--- a/tensorflow/contrib/distribute/python/values_test.py
+++ b/tensorflow/contrib/distribute/python/values_test.py
@@ -22,27 +22,20 @@ import os
 from absl.testing import parameterized
 
 from tensorflow.contrib.distribute.python import combinations
-from tensorflow.contrib.distribute.python import multi_worker_test_base
 from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.distribute import device_util
-from tensorflow.python.distribute import distribute_lib
 from tensorflow.python.distribute import values
 from tensorflow.python.eager import context
 from tensorflow.python.eager import test
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.training import saver as saver_lib
-from tensorflow.python.util import nest
 
 
 class DistributedValuesTest(test.TestCase):
@@ -51,7 +44,8 @@ class DistributedValuesTest(test.TestCase):
     with ops.device("/device:CPU:0"):
       one = constant_op.constant(1)
       two = constant_op.constant(2)
-      v = values.DistributedValues({"/device:CPU:0": one, "/device:GPU:0": two})
+      device_map = values.ReplicaDeviceMap(("/device:CPU:0", "/device:GPU:0"))
+      v = values.DistributedValues(device_map, (one, two))
       self.assertEqual(two, v.get("/device:GPU:0"))
       self.assertEqual(one, v.get())
       with self.assertRaises(ValueError):
@@ -63,24 +57,26 @@ class DistributedValuesTest(test.TestCase):
         ops.device("/device:CPU:0"):
       one = constant_op.constant(1)
       two = constant_op.constant(2)
-      v = values.DistributedValues({"/device:CPU:0": one, "/device:GPU:0": two})
+      device_map = values.ReplicaDeviceMap(("/device:CPU:0", "/device:GPU:0"))
+      v = values.DistributedValues(device_map, (one, two))
       self.assertEqual(two, v.get("/device:GPU:0"))
       self.assertEqual(one, v.get())
       with self.assertRaises(ValueError):
         self.assertIsNone(v.get("/device:GPU:2"))
 
   def testCanonicalization(self):
-    canonical_cpu = ["/job:localhost/replica:0/task:0/device:CPU:0"]
-    v = values.DistributedValues({"": 42})
-    self.assertEqual(canonical_cpu, list(v._index.keys()))
-    v = values.DistributedValues({"/device:CPU:0": 42})
-    self.assertEqual(canonical_cpu, list(v._index.keys()))
-    v = values.DistributedValues({"/cpu:0": 42})
-    self.assertEqual(canonical_cpu, list(v._index.keys()))
-    v = values.DistributedValues({"/CPU:0": 42})
-    self.assertEqual(canonical_cpu, list(v._index.keys()))
+    canonical_cpu = ("/job:localhost/replica:0/task:0/device:CPU:0",)
+    v = values.DistributedValues(values.SingleDeviceMap(""), (42,))
+    self.assertEqual(canonical_cpu, v.devices)
+    v = values.DistributedValues(values.SingleDeviceMap("/device:CPU:0"), (42,))
+    self.assertEqual(canonical_cpu, v.devices)
+    v = values.DistributedValues(values.SingleDeviceMap("/cpu:0"), (42,))
+    self.assertEqual(canonical_cpu, v.devices)
+    v = values.DistributedValues(values.SingleDeviceMap("/CPU:0"), (42,))
+    self.assertEqual(canonical_cpu, v.devices)
     with self.assertRaises(AssertionError):
-      v = values.DistributedValues({"/device:cpu:0": 42})
+      v = values.DistributedValues(
+          values.SingleDeviceMap("/device:cpu:0"), (42,))
 
   def testIsTensorLike(self):
     with context.graph_mode(), \
@@ -88,7 +84,8 @@ class DistributedValuesTest(test.TestCase):
          ops.device("/device:CPU:0"):
       one = constant_op.constant(1)
       two = constant_op.constant(2)
-      v = values.DistributedValues({"/device:CPU:0": one, "/device:GPU:0": two})
+      device_map = values.ReplicaDeviceMap(("/device:CPU:0", "/device:GPU:0"))
+      v = values.DistributedValues(device_map, (one, two))
       self.assertEqual(two, v.get("/device:GPU:0"))
       self.assertEqual(one, v.get())
       self.assertTrue(v.is_tensor_like)
@@ -100,7 +97,8 @@ class DistributedValuesTest(test.TestCase):
          ops.device("/device:CPU:0"):
       one = constant_op.constant(1)
       two = 2.0
-      v = values.DistributedValues({"/device:CPU:0": one, "/device:GPU:0": two})
+      device_map = values.ReplicaDeviceMap(("/device:CPU:0", "/device:GPU:0"))
+      v = values.DistributedValues(device_map, (one, two))
       self.assertEqual(two, v.get("/device:GPU:0"))
       self.assertEqual(one, v.get())
       self.assertFalse(v.is_tensor_like)
@@ -118,8 +116,8 @@ class DistributedDelegateTest(test.TestCase):
         def __init__(self, x):
           self.x = x
 
-      v = values.DistributedDelegate(
-          {"/device:CPU:0": Foo(7), "/device:GPU:0": Foo(8)})
+      device_map = values.ReplicaDeviceMap(("/device:CPU:0", "/device:GPU:0"))
+      v = values.DistributedDelegate(device_map, (Foo(7), Foo(8)))
       self.assertEqual(7, v.x)
       with self.assertRaises(AttributeError):
         _ = v.y
@@ -127,7 +125,8 @@ class DistributedDelegateTest(test.TestCase):
   @test_util.run_in_graph_and_eager_modes
   def testOperatorOverride(self):
     with ops.device("/device:CPU:0"):
-      v = values.DistributedDelegate({"/device:CPU:0": 7, "/device:GPU:0": 8})
+      device_map = values.ReplicaDeviceMap(("/device:CPU:0", "/device:GPU:0"))
+      v = values.DistributedDelegate(device_map, (7, 8))
       # v should act like int(7).
       self.assertEqual(8, v + 1)
       self.assertEqual(10, 3 + v)
@@ -178,16 +177,15 @@ def _nested_value(d):
 
 def _make_mirrored():
   v = []
-  index = {}
   devices = ["/device:GPU:0", "/device:CPU:0"]
   for d, n, init in zip(devices, ["v", "v/replica"], [1., 2.]):
     with ops.device(d):
       v.append(variable_scope.get_variable(
           name=n, initializer=init, use_resource=True))
-      index[d] = v[-1]
-  mirrored = values.MirroredVariable(index, v[0],
+  device_map = values.ReplicaDeviceMap(devices)
+  mirrored = values.MirroredVariable(None, device_map, v,
                                      variable_scope.VariableAggregation.SUM)
-  return v, devices, mirrored
+  return v, device_map, mirrored
 
 
 class RegroupAndSelectDeviceTest(test.TestCase):
@@ -204,8 +202,9 @@ class RegroupAndSelectDeviceTest(test.TestCase):
       self.assertEqual(expected[i], result.get(_device_str(i)))
 
   def testNested(self):
-    result = values.regroup({_device_str(0): _nested_value("1"),
-                             _device_str(1): _nested_value("2")})
+    device_map = values.ReplicaDeviceMap((_device_str(0), _device_str(1)))
+    result = values.regroup(device_map,
+                            (_nested_value("1"), _nested_value("2")))
     self.assertIsInstance(result, tuple)
     self.assertEqual(3, len(result))
     self._is_per_replica(result[0], ["a1", "a2"])
@@ -221,11 +220,11 @@ class RegroupAndSelectDeviceTest(test.TestCase):
     self._is_per_replica(result[1][1]["c"], ["d1", "d2"])
     self._is_per_replica(result[1][1]["e"], ["f1", "f2"])
 
-    # Also test that we can undo the merge using select_device()
+    # Also test that we can undo the merge using select_replica()
     self.assertEqual(_nested_value("1"),
-                     values.select_device(_device_str(0), result))
+                     values.select_replica(0, result))
     self.assertEqual(_nested_value("2"),
-                     values.select_device(_device_str(1), result))
+                     values.select_replica(1, result))
     # select_device_mirrored() should fail due to non-mirrored values
     with self.assertRaises(TypeError):
       values.select_device_mirrored(_device_str(0), result)
@@ -235,8 +234,9 @@ class RegroupAndSelectDeviceTest(test.TestCase):
   def testWrapClass(self):
     # Normally a mirrored value would be the same across devices, but
     # for a test it is convenient to be able to tell the values apart.
-    result = values.regroup({_device_str(0): _nested_value("1"),
-                             _device_str(1): _nested_value("2")},
+    device_map = values.ReplicaDeviceMap((_device_str(0), _device_str(1)))
+    result = values.regroup(device_map,
+                            (_nested_value("1"), _nested_value("2")),
                             values.Mirrored)
     self.assertIsInstance(result, tuple)
     self.assertEqual(3, len(result))
@@ -253,11 +253,11 @@ class RegroupAndSelectDeviceTest(test.TestCase):
     self._is_per_replica(result[1][1]["c"], ["d1", "d2"], values.Mirrored)
     self._is_per_replica(result[1][1]["e"], ["f1", "f2"], values.Mirrored)
 
-    # Also test that we can undo the merge using select_device()
+    # Also test that we can undo the merge using select_replica()
     self.assertEqual(_nested_value("1"),
-                     values.select_device(_device_str(0), result))
+                     values.select_replica(0, result))
     self.assertEqual(_nested_value("2"),
-                     values.select_device(_device_str(1), result))
+                     values.select_replica(1, result))
     # Values are marked as mirrored, so select_device_mirrored() is allowed.
     self.assertEqual(_nested_value("1"),
                      values.select_device_mirrored(_device_str(0), result))
@@ -267,63 +267,66 @@ class RegroupAndSelectDeviceTest(test.TestCase):
   def testMirroredContainer(self):
     if context.num_gpus() < 1 and context.executing_eagerly():
       self.skipTest("A GPU is not available for this test in eager mode.")
-    v, devices, mirrored = _make_mirrored()
-    result = values.regroup(dict(zip(devices, v)))
+    v, device_map, mirrored = _make_mirrored()
+    result = values.regroup(device_map, v)
     self.assertIs(mirrored, result)
 
   def testSameId(self):
     foo = object()
-    result = values.regroup({_device_str(0): ("a", foo),
-                             _device_str(1): ("b", foo)})
+    device_map = values.ReplicaDeviceMap((_device_str(0), _device_str(1)))
+    result = values.regroup(device_map, (("a", foo), ("b", foo)))
     self.assertIsInstance(result, tuple)
     self.assertEqual(2, len(result))
     self._is_per_replica(result[0], ["a", "b"])
     self.assertIs(foo, result[1])
 
-    # Test select_device(), should undo the merge done by regroup().
-    result_0 = values.select_device(_device_str(0), result)
+    # Test select_replica(), should undo the merge done by regroup().
+    result_0 = values.select_replica(0, result)
     self.assertIsInstance(result_0, tuple)
     self.assertEqual(2, len(result_0))
     self.assertEqual("a", result_0[0])
     self.assertIs(foo, result_0[1])
-    result_1 = values.select_device(_device_str(1), result)
+    result_1 = values.select_replica(1, result)
     self.assertIsInstance(result_1, tuple)
     self.assertEqual(2, len(result_1))
     self.assertEqual("b", result_1[0])
     self.assertIs(foo, result_1[1])
 
   def testOneDevice(self):
-    result = values.regroup({_device_str(0): _nested_value("1")})
-    # On one device regroup() and select_device() are basically identity.
+    device_map = values.ReplicaDeviceMap((_device_str(0),))
+    result = values.regroup(device_map, (_nested_value("1"),))
+    # On one device regroup() and select_replica() are basically identity.
     self.assertEqual(_nested_value("1"), result)
     self.assertEqual(_nested_value("1"),
-                     values.select_device(_device_str(0), result))
+                     values.select_replica(0, result))
 
     # The one exception has to do with MirroredVariables.
     d = "/device:CPU:0"
     with ops.device(d):
       v = variable_scope.get_variable(
           name="v", initializer=1., use_resource=True)
-      index = {d: v}
-    mirrored = values.MirroredVariable(index, v,
+      device_map = values.ReplicaDeviceMap((d,))
+    mirrored = values.MirroredVariable(None, device_map, (v,),
                                        variable_scope.VariableAggregation.SUM)
-    result = values.regroup(index)
+    result = values.regroup(device_map, (v,))
     self.assertIs(mirrored, result)
 
   def testNamedTupleEstimatorSpec(self):
     with context.graph_mode(), ops.Graph().as_default():
-      created_estimator_specs = {}
-      to_regroup = {}
+      devices = []
+      created_estimator_specs = []
 
       for device_id in range(3):
         spec = model_fn_lib.EstimatorSpec(
             mode=model_fn_lib.ModeKeys.TRAIN,
             loss=constant_op.constant(device_id / 2),
             train_op=array_ops.identity(constant_op.constant(device_id)))
-        created_estimator_specs[device_id] = spec
-        to_regroup[_device_str(device_id)] = spec
+        devices.append(_device_str(device_id))
+        created_estimator_specs.append(spec)
 
-      merged_estimator_spec = values.regroup(to_regroup)
+      device_map = values.ReplicaDeviceMap(devices)
+      merged_estimator_spec = values.regroup(
+          device_map, created_estimator_specs)
 
       self.assertTrue(
           isinstance(merged_estimator_spec, model_fn_lib.EstimatorSpec))
@@ -337,415 +340,10 @@ class RegroupAndSelectDeviceTest(test.TestCase):
         # Scaffold is populated by `EstimatorSpec.__new__`.
         self.assertEqual(created_estimator_specs[device_id].scaffold,
                          merged_estimator_spec.scaffold.get(d))
-        # Also test that we can undo the merge using select_device()
+        # Also test that we can undo the merge using select_replica()
         self.assertEqual(created_estimator_specs[device_id],
-                         values.select_device(_device_str(device_id),
-                                              merged_estimator_spec))
-
-
-class PerReplicaDatasetTest(test.TestCase):
-
-  config = config_pb2.ConfigProto()
-  config.allow_soft_placement = True
-
-  def _test_iterator(self, devices, dataset, expected_values):
-    per_replica_dataset = values.PerReplicaDataset(dataset, devices)
-    if context.executing_eagerly():
-      iterator = per_replica_dataset.make_one_shot_iterator()
-    else:
-      iterator = per_replica_dataset.make_initializable_iterator()
-      self.evaluate([iterator.initializer])
-
-    for expected_value in expected_values:
-      next_element = iterator.get_next()
-      computed_value = self.evaluate(
-          [values.select_device(d, next_element) for d in devices])
-      self.assertEqual(expected_value, computed_value)
-
-    with self.assertRaises(errors.OutOfRangeError):
-      next_element = iterator.get_next()
-      self.evaluate([
-          values.select_device(d, next_element) for d in devices])
-
-  @test_util.run_in_graph_and_eager_modes
-  def testOneDevice(self):
-    devices = ["/device:CPU:0"]
-    dataset = dataset_ops.Dataset.range(10)
-
-    expected_values = [[i] for i in range(10)]
-
-    self._test_iterator(devices, dataset, expected_values)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testMultipleDevices(self):
-    if context.num_gpus() < 1 and context.executing_eagerly():
-      self.skipTest("A GPU is not available for this test in eager mode.")
-
-    devices = ["/device:CPU:0", "/device:GPU:0"]
-    dataset = dataset_ops.Dataset.range(10)
-
-    expected_values = [[i, i+1] for i in range(0, 10, 2)]
-
-    self._test_iterator(devices, dataset, expected_values)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testTupleDataset(self):
-    if context.num_gpus() < 1 and context.executing_eagerly():
-      self.skipTest("A GPU is not available for this test in eager mode.")
-
-    devices = ["/device:CPU:0", "/device:GPU:0"]
-    dataset1 = dataset_ops.Dataset.range(10)
-    dataset2 = dataset_ops.Dataset.range(10).map(lambda x: x**2)
-    dataset = dataset_ops.Dataset.zip((dataset1, dataset2))
-
-    expected_values = [[(i, i**2), (i+1, (i+1)**2)] for i in range(0, 10, 2)]
-
-    self._test_iterator(devices, dataset, expected_values)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testUnevenDatasetBatches(self):
-    if context.num_gpus() < 1 and context.executing_eagerly():
-      self.skipTest("A GPU is not available for this test in eager mode.")
-
-    devices = ["/device:CPU:0", "/device:GPU:0"]
-    dataset = dataset_ops.Dataset.range(11)
-
-    expected_values = [[i, i+1] for i in range(0, 10, 2)]
-    self._test_iterator(devices, dataset, expected_values)
-
-  def testInitializableIterator(self):
-    with context.graph_mode():
-      devices = ["/device:CPU:0"]
-      # Using random input since that is only allowed with initializable
-      # iterator.
-      dataset = dataset_ops.Dataset.from_tensor_slices(
-          random_ops.random_uniform((10,)))
-
-      per_replica_dataset = values.PerReplicaDataset(dataset, devices)
-      iterator = per_replica_dataset.make_initializable_iterator()
-
-      self.evaluate(iterator.initializer)
-      next_element = iterator.get_next()
-      for _ in range(10):
-        self.evaluate(next_element)
-
-      # Should fail after the input is finished.
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
-
-      # After re-initializing the iterator, should be able to iterate again.
-      self.evaluate(iterator.initializer)
-      for _ in range(10):
-        self.evaluate(next_element)
-
-
-class MultiWorkerDatasetTest(multi_worker_test_base.MultiWorkerTestBase):
-
-  def _test_iterator(self, sess, iterator, devices, expected_values):
-    next_element = iterator.get_next()
-    for device in devices:
-      v = values.select_device(device, next_element)
-      # The `v` here can be a tuple.
-      for element in nest.flatten(v):
-        self.assertTrue(element.device in device)
-
-    for expected_value in expected_values:
-      actual = sess.run(
-          [values.select_device(d, next_element) for d in devices])
-      self.assertEqual(expected_value, actual)
-
-    with self.assertRaises(errors.OutOfRangeError):
-      sess.run([values.select_device(d, next_element) for d in devices])
-
-  def _test_dataset(self, dataset_fn, worker_devices, devices,
-                    expected_values, auto_shard=True):
-    multi_worker_dataset = values.MultiWorkerDataset(
-        dataset_fn, worker_devices, auto_shard=auto_shard)
-    multi_worker_iterator = multi_worker_dataset.make_initializable_iterator()
-    with self.cached_session() as sess:
-      sess.run(multi_worker_iterator.initializer)
-      self._test_iterator(sess, multi_worker_iterator, devices, expected_values)
-
-  def _cpu_devices(self):
-    worker_devices = [
-        ("/job:worker/replica:0/task:0",
-         ["/job:worker/replica:0/task:0/device:CPU:0"]),
-        ("/job:worker/replica:0/task:1",
-         ["/job:worker/replica:0/task:1/device:CPU:0"])]
-    devices = [
-        "/job:worker/replica:0/task:0/device:CPU:0",
-        "/job:worker/replica:0/task:1/device:CPU:0"
-    ]
-    return worker_devices, devices
-
-  def _cpu_and_one_gpu_devices(self):
-    worker_devices = [
-        ("/job:worker/replica:0/task:0", [
-            "/job:worker/replica:0/task:0/device:GPU:0",
-            "/job:worker/replica:0/task:0/device:CPU:0"
-        ]),
-        ("/job:worker/replica:0/task:1", [
-            "/job:worker/replica:0/task:1/device:GPU:0",
-            "/job:worker/replica:0/task:1/device:CPU:0"
-        ])
-    ]
-    devices = [
-        "/job:worker/replica:0/task:0/device:GPU:0",
-        "/job:worker/replica:0/task:0/device:CPU:0",
-        "/job:worker/replica:0/task:1/device:GPU:0",
-        "/job:worker/replica:0/task:1/device:CPU:0"
-    ]
-    return worker_devices, devices
-
-  def testDataDistributionOneDevicePerWorker(self):
-    worker_devices, devices = self._cpu_devices()
-    with context.graph_mode():
-      dataset_fn = lambda: dataset_ops.Dataset.range(8)
-      self._test_dataset(dataset_fn, worker_devices, devices,
-                         [[0, 1], [2, 3], [4, 5], [6, 7]])
-
-  def testDataDistributionNoAutoShard(self):
-    worker_devices, devices = self._cpu_devices()
-    with context.graph_mode():
-      dataset_fn = lambda: dataset_ops.Dataset.range(4)
-      self._test_dataset(dataset_fn, worker_devices, devices,
-                         [[0, 0], [1, 1], [2, 2], [3, 3]],
-                         auto_shard=False)
-
-  def testDataDistributionTwoDevicePerWorker(self):
-    if context.num_gpus() < 1:
-      self.skipTest("A GPU is not available for this test.")
-    worker_devices, devices = self._cpu_and_one_gpu_devices()
-    with context.graph_mode():
-      dataset_fn = lambda: dataset_ops.Dataset.range(8)
-      self._test_dataset(dataset_fn, worker_devices, devices,
-                         [[0, 2, 1, 3], [4, 6, 5, 7]])
-
-  def testTupleDataset(self):
-    worker_devices, devices = self._cpu_devices()
-
-    with context.graph_mode():
-
-      def dataset_fn():
-        dataset1 = dataset_ops.Dataset.range(8)
-        dataset2 = dataset_ops.Dataset.range(8).map(lambda x: x**2)
-        return dataset_ops.Dataset.zip((dataset1, dataset2))
-
-      expected_values = [
-          [(i, i**2), (i + 1, (i + 1)**2)] for i in range(0, 8, 2)
-      ]
-      self._test_dataset(dataset_fn, worker_devices, devices,
-                         expected_values)
-
-  def testInitializableIterator(self):
-    worker_devices, devices = self._cpu_devices()
-    with context.graph_mode(), self.cached_session() as sess:
-      dataset_fn = lambda: dataset_ops.Dataset.range(8)
-      multi_worker_dataset = values.MultiWorkerDataset(
-          dataset_fn, worker_devices, auto_shard=True)
-      multi_worker_iterator = multi_worker_dataset.make_initializable_iterator()
-
-      sess.run(multi_worker_iterator.initializer)
-      self._test_iterator(sess, multi_worker_iterator, devices,
-                          [[0, 1], [2, 3], [4, 5], [6, 7]])
-
-      # After re-initializing the iterator, should be able to iterate again.
-      sess.run(multi_worker_iterator.initializer)
-      self._test_iterator(sess, multi_worker_iterator, devices,
-                          [[0, 1], [2, 3], [4, 5], [6, 7]])
-
-  def testValueErrorForIterator(self):
-    # Incompatiable arguments.
-    with self.assertRaises(ValueError):
-      values.MultiWorkerDataIterator({"w1": None}, {"w1": "d1", "w2": "d2"})
-
-    # Test duplicated devices under same worker.
-    worker_devices, _ = self._cpu_devices()
-    worker_devices[0][1].append("/job:worker/replica:0/task:0/device:CPU:0")
-    with context.graph_mode():
-      dataset_fn = lambda: dataset_ops.Dataset.range(8)
-      multi_worker_dataset = values.MultiWorkerDataset(
-          dataset_fn, worker_devices, auto_shard=True)
-      multi_worker_iterator = multi_worker_dataset.make_initializable_iterator()
-      with self.assertRaises(ValueError):
-        multi_worker_iterator.get_next()
-
-
-class InputIteratorTestBase(test.TestCase):
-
-  def _test_iterator(self, input_type, dataset_fn, worker_device_pairs,
-                     expected_values, sess=None, split_batch_by=None):
-    devices = nest.flatten([ds for _, ds in worker_device_pairs])
-
-    if input_type == "input_fn":
-      input_contexts = [
-          distribute_lib.InputContext() for _ in worker_device_pairs]
-      input_fn = lambda _: dataset_fn()
-      iterator = values.InputFunctionIterator(input_fn, worker_device_pairs,
-                                              input_contexts)
-    else:
-      iterator = values.DatasetIterator(dataset_fn(), worker_device_pairs,
-                                        split_batch_by)
-
-    evaluate = lambda x: sess.run(x) if sess else self.evaluate(x)
-
-    evaluate(control_flow_ops.group(iterator.initialize()))
-
-    for expected_value in expected_values:
-      next_element = iterator.get_next()
-      computed_value = evaluate(
-          [values.select_device(d, next_element) for d in devices])
-      self.assertAllEqual(expected_value, computed_value)
-
-    with self.assertRaises(errors.OutOfRangeError):
-      next_element = iterator.get_next()
-      evaluate([values.select_device(d, next_element) for d in devices])
-
-    # After re-initializing the iterator, should be able to iterate again.
-    evaluate(control_flow_ops.group(iterator.initialize()))
-
-    for expected_value in expected_values:
-      next_element = iterator.get_next()
-      computed_value = evaluate(
-          [values.select_device(d, next_element) for d in devices])
-      self.assertAllEqual(expected_value, computed_value)
-
-
-class InputIteratorSingleWorkerTest(InputIteratorTestBase,
-                                    parameterized.TestCase):
-
-  @combinations.generate(combinations.combine(
-      mode=["graph", "eager"],
-      input_type=["input_fn", "dataset"]))
-  def testOneDeviceCPU(self, input_type):
-    worker_device_pairs = [("", ["/device:CPU:0"])]
-    dataset_fn = lambda: dataset_ops.Dataset.range(10)
-
-    expected_values = [[i] for i in range(10)]
-
-    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
-                        expected_values)
-
-  @combinations.generate(combinations.combine(
-      mode=["graph", "eager"],
-      input_type=["input_fn", "dataset"],
-      required_gpus=1))
-  def testTwoDevicesOneGPUOneCPU(self, input_type):
-    worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])]
-    dataset_fn = lambda: dataset_ops.Dataset.range(10)
-
-    expected_values = [[i, i+1] for i in range(0, 10, 2)]
-
-    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
-                        expected_values)
-
-  @combinations.generate(combinations.combine(
-      mode=["graph", "eager"],
-      input_type=["input_fn", "dataset"],
-      required_gpus=1))
-  def testTupleDataset(self, input_type):
-    worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])]
-    def dataset_fn():
-      dataset1 = dataset_ops.Dataset.range(10)
-      dataset2 = dataset_ops.Dataset.range(10).map(lambda x: x**2)
-      return dataset_ops.Dataset.zip((dataset1, dataset2))
-
-    expected_values = [[(i, i**2), (i+1, (i+1)**2)] for i in range(0, 10, 2)]
-
-    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
-                        expected_values)
-
-  @combinations.generate(combinations.combine(
-      mode=["graph", "eager"],
-      input_type=["input_fn", "dataset"],
-      required_gpus=1))
-  def testUnevenDatasetBatches(self, input_type):
-    worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])]
-    dataset_fn = lambda: dataset_ops.Dataset.range(11)
-
-    expected_values = [[i, i+1] for i in range(0, 10, 2)]
-    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
-                        expected_values)
-
-  @combinations.generate(combinations.combine(
-      mode=["graph", "eager"],
-      input_type=["dataset"],
-      split_batch_by=[None, 2],
-      required_gpus=1))
-  def testBatchSplitting(self, input_type, split_batch_by):
-    worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])]
-    batch_size = 10
-    dataset_fn = lambda: dataset_ops.Dataset.range(100).batch(batch_size)
-
-    updated_batch_size = (
-        batch_size // split_batch_by if split_batch_by else batch_size)
-    expected_values = [[range(i, i+updated_batch_size),
-                        range(i+updated_batch_size, i+2*updated_batch_size)]
-                       for i in range(0, 100, updated_batch_size*2)]
-
-    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
-                        expected_values, sess=None,
-                        split_batch_by=split_batch_by)
-
-
-class InputIteratorMultiWorkerTest(
-    multi_worker_test_base.MultiWorkerTestBase, InputIteratorTestBase,
-    parameterized.TestCase):
-
-  def _cpu_devices(self):
-    return [
-        ("/job:worker/replica:0/task:0",
-         ["/job:worker/replica:0/task:0/device:CPU:0"]),
-        ("/job:worker/replica:0/task:1",
-         ["/job:worker/replica:0/task:1/device:CPU:0"])]
-
-  def _cpu_and_one_gpu_devices(self):
-    return [
-        ("/job:worker/replica:0/task:0", [
-            "/job:worker/replica:0/task:0/device:GPU:0",
-            "/job:worker/replica:0/task:0/device:CPU:0"
-        ]),
-        ("/job:worker/replica:0/task:1", [
-            "/job:worker/replica:0/task:1/device:GPU:0",
-            "/job:worker/replica:0/task:1/device:CPU:0"
-        ])
-    ]
-
-  @combinations.generate(combinations.combine(
-      mode=["graph"],
-      input_type=["input_fn", "dataset"]))
-  def testOneDevicePerWorker(self, input_type):
-    worker_devices = self._cpu_devices()
-    with context.graph_mode(), self.cached_session() as sess:
-      dataset_fn = lambda: dataset_ops.Dataset.range(4)
-      self._test_iterator(input_type, dataset_fn, worker_devices,
-                          [[0, 0], [1, 1], [2, 2], [3, 3]], sess)
-
-  @combinations.generate(combinations.combine(
-      mode=["graph"],
-      input_type=["input_fn", "dataset"],
-      required_gpus=1))
-  def testTwoDevicesPerWorker(self, input_type):
-    worker_devices = self._cpu_and_one_gpu_devices()
-    with context.graph_mode(), self.cached_session() as sess:
-      dataset_fn = lambda: dataset_ops.Dataset.range(4)
-      self._test_iterator(input_type, dataset_fn, worker_devices,
-                          [[0, 1, 0, 1], [2, 3, 2, 3]], sess)
-
-  @combinations.generate(combinations.combine(
-      mode=["graph"],
-      input_type=["input_fn", "dataset"]))
-  def testTupleDataset(self, input_type):
-    worker_devices = self._cpu_devices()
-    with context.graph_mode(), self.cached_session() as sess:
-      def dataset_fn():
-        dataset1 = dataset_ops.Dataset.range(4)
-        dataset2 = dataset_ops.Dataset.range(4).map(lambda x: x**2)
-        return dataset_ops.Dataset.zip((dataset1, dataset2))
-
-      expected_values = [[(i, i**2), (i, i**2)] for i in range(0, 4)]
-      self._test_iterator(input_type, dataset_fn, worker_devices,
-                          expected_values, sess)
+                         values.select_replica(device_id,
+                                               merged_estimator_spec))
 
 
 class MirroredVariableTest(test.TestCase, parameterized.TestCase):
@@ -768,8 +366,8 @@ class MirroredVariableTest(test.TestCase, parameterized.TestCase):
   def testVariableOnAnotherDevice(self):
     v = variable_scope.get_variable(
         name="v", initializer=[1.], use_resource=True)
-    index = {"/job:foo/device:CPU:0": v}
-    mirrored = values.MirroredVariable(index, v,
+    device_map = values.ReplicaDeviceMap(("/job:foo/device:CPU:0",))
+    mirrored = values.MirroredVariable(None, device_map, (v,),
                                        variable_scope.VariableAggregation.MEAN)
 
     self.assertEqual(v.name, mirrored.name)
@@ -797,7 +395,8 @@ class MirroredVariableTest(test.TestCase, parameterized.TestCase):
       self.skipTest("A GPU is not available for this test in eager mode.")
 
     with self.cached_session(config=self.config) as sess:
-      v, devices, mirrored = _make_mirrored()
+      v, device_map, mirrored = _make_mirrored()
+      devices = device_map.all_devices
 
       # Overwrite the initial values.
       self._assign_mirrored(devices, v, [3., 4.])
@@ -815,7 +414,8 @@ class MirroredVariableTest(test.TestCase, parameterized.TestCase):
   def _save_mirrored(self):
     """Save variables with mirroring, returns save_path."""
     with self.session(graph=ops.Graph()) as sess:
-      v, devices, mirrored = _make_mirrored()
+      v, device_map, mirrored = _make_mirrored()
+      devices = device_map.all_devices
 
       # Overwrite the initial values.
       self._assign_mirrored(devices, v, [3., 4.])
@@ -860,7 +460,8 @@ class MirroredVariableTest(test.TestCase, parameterized.TestCase):
   def _restore_mirrored(self, save_path):
     """Restore to variables with mirroring in a fresh graph."""
     with self.session(graph=ops.Graph()) as sess:
-      v, devices, mirrored = _make_mirrored()
+      v, device_map, mirrored = _make_mirrored()
+      devices = device_map.all_devices
 
       # Overwrite the initial values.
       self._assign_mirrored(devices, v, [7., 8.])
@@ -904,25 +505,24 @@ class MirroredVariableTest(test.TestCase, parameterized.TestCase):
       with ops.device("/device:GPU:0"):
         v = variable_scope.get_variable(
             name="v", initializer=1., use_resource=True)
-      mirrored = values.MirroredVariable({
-          "/device:GPU:0": v
-      }, v, variable_scope.VariableAggregation.MEAN)
+      mirrored = values.MirroredVariable(
+          distribution, values.ReplicaDeviceMap(("/device:GPU:0",)), (v,),
+          variable_scope.VariableAggregation.MEAN)
       sess.run(variables_lib.global_variables_initializer())
       sess.run({"complicated": mirrored})
 
 
-_devices = ["/device:GPU:0", "/device:CPU:0"]
+_devices = ("/device:GPU:0", "/device:CPU:0")
 
 
-def _make_replica_local(method):
+def _make_replica_local(method, strategy=None):
+  device_map = values.ReplicaDeviceMap(_devices)
   v = []
-  index = {}
   for d, n, init in zip(_devices, ["v", "v/replica"], [1., 2.]):
     with ops.device(d):
       v.append(variable_scope.get_variable(
           name=n, initializer=init, use_resource=True))
-      index[d] = v[-1]
-  replica_local = values.ReplicaLocalVariable(index, v[0], method)
+  replica_local = values.ReplicaLocalVariable(strategy, device_map, v, method)
   return v, replica_local
 
 
@@ -948,9 +548,9 @@ class ReplicaLocalVariablePropertiesTest(test.TestCase):
   def testVariableOnAnotherDevice(self):
     v = variable_scope.get_variable(
         name="v", initializer=[1.], use_resource=True)
-    index = {"/job:foo/device:CPU:0": v}
+    device_map = values.ReplicaDeviceMap(("/job:foo/device:CPU:0",))
     replica_local = values.ReplicaLocalVariable(
-        index, v, variable_scope.VariableAggregation.MEAN)
+        None, device_map, (v,), variable_scope.VariableAggregation.MEAN)
 
     self.assertEqual(v.name, replica_local.name)
     self.assertEqual(v.dtype, replica_local.dtype)
@@ -997,7 +597,7 @@ class ReplicaLocalVariableTest(test.TestCase, parameterized.TestCase):
   def testSaveAndRestoreReplicaLocalSumOneGraph(self, distribution):
     with self.cached_session() as sess:
       v, replica_local = _make_replica_local(
-          variable_scope.VariableAggregation.SUM)
+          variable_scope.VariableAggregation.SUM, distribution)
 
       # Overwrite the initial values.
       self._assign_replica_local(_devices, v, [3., 4.])
@@ -1020,7 +620,7 @@ class ReplicaLocalVariableTest(test.TestCase, parameterized.TestCase):
 
     with self.cached_session() as sess:
       v, replica_local = _make_replica_local(
-          variable_scope.VariableAggregation.MEAN)
+          variable_scope.VariableAggregation.MEAN, distribution)
 
       # Overwrite the initial values.
       self._assign_replica_local(_devices, v, [3., 4.])
@@ -1040,7 +640,7 @@ class ReplicaLocalVariableTest(test.TestCase, parameterized.TestCase):
     """Save variables with mirroring, returns save_path."""
     with self.session(graph=ops.Graph()) as sess:
       v, replica_local = _make_replica_local(
-          variable_scope.VariableAggregation.MEAN)
+          variable_scope.VariableAggregation.MEAN, distribution)
 
       # Overwrite the initial values.
       self._assign_replica_local(_devices, v, [3., 4.])
@@ -1056,7 +656,8 @@ class ReplicaLocalVariableTest(test.TestCase, parameterized.TestCase):
   def _save_replica_local_sum(self, distribution):
     """Save variables with mirroring, returns save_path."""
     with self.session(graph=ops.Graph()) as sess:
-      v, replica_local = _make_replica_local("sum")
+      v, replica_local = _make_replica_local(
+          variable_scope.VariableAggregation.SUM, distribution)
 
       # Overwrite the initial values.
       self._assign_replica_local(_devices, v, [1.5, 2.])
@@ -1103,7 +704,7 @@ class ReplicaLocalVariableTest(test.TestCase, parameterized.TestCase):
     """Restore to variables with mirroring in a fresh graph."""
     with self.session(graph=ops.Graph()) as sess:
       v, replica_local = _make_replica_local(
-          variable_scope.VariableAggregation.MEAN)
+          variable_scope.VariableAggregation.MEAN, distribution)
 
       # Overwrite the initial values.
       self._assign_replica_local(_devices, v, [7., 8.])
@@ -1118,7 +719,7 @@ class ReplicaLocalVariableTest(test.TestCase, parameterized.TestCase):
     """Restore to variables with mirroring in a fresh graph."""
     with self.session(graph=ops.Graph()) as sess:
       v, replica_local = _make_replica_local(
-          variable_scope.VariableAggregation.SUM)
+          variable_scope.VariableAggregation.SUM, distribution)
 
       # Overwrite the initial values.
       self._assign_replica_local(_devices, v, [7., 8.])
diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD
index 3079175015a9aee1625404902070df8f13b2089c..c2300286d3be4bb757dac588623c47044a1a9db5 100644
--- a/tensorflow/contrib/distributions/BUILD
+++ b/tensorflow/contrib/distributions/BUILD
@@ -822,7 +822,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "affine_test",
-    size = "large",
+    size = "medium",
     srcs = ["python/kernel_tests/bijectors/affine_test.py"],
     additional_deps = [
         ":bijectors_py",
@@ -837,7 +837,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
-    shard_count = 5,
+    shard_count = 10,
     tags = ["noasan"],  # times out b/63678675
 )
 
diff --git a/tensorflow/contrib/distributions/python/ops/inverse_gamma.py b/tensorflow/contrib/distributions/python/ops/inverse_gamma.py
index 452628257ea96713453bf2aa32b5baa9d6d0cb86..1006dfac49f36baa7cf5136f6f2982e3fd965298 100644
--- a/tensorflow/contrib/distributions/python/ops/inverse_gamma.py
+++ b/tensorflow/contrib/distributions/python/ops/inverse_gamma.py
@@ -249,9 +249,9 @@ class InverseGamma(distribution.Distribution):
       `self.allow_nan_stats` is `False`, an exception will be raised rather
       than returning `NaN`.""")
   def _variance(self):
-    var = (math_ops.square(self.rate)
-           / math_ops.square(self.concentration - 1.)
-           / (self.concentration - 2.))
+    var = (
+        math_ops.square(self.rate) / math_ops.squared_difference(
+            self.concentration, 1.) / (self.concentration - 2.))
     if self.allow_nan_stats:
       nan = array_ops.fill(
           self.batch_shape_tensor(),
diff --git a/tensorflow/contrib/distributions/python/ops/sample_stats.py b/tensorflow/contrib/distributions/python/ops/sample_stats.py
index 978e627d6638ddeea9df288d389354f0ac53d115..19e99e03803e7f4cdfdb023feb04daaba68eceed 100644
--- a/tensorflow/contrib/distributions/python/ops/sample_stats.py
+++ b/tensorflow/contrib/distributions/python/ops/sample_stats.py
@@ -300,7 +300,7 @@ def percentile(x,
       raise ValueError("Argument 'interpolation' must be in %s.  Found %s" %
                        (allowed_interpolations, interpolation))
 
-  with ops.name_scope(name, [x, q]):
+  with ops.name_scope(name, values=[x, q]):
     x = ops.convert_to_tensor(x, name="x")
     # Double is needed here and below, else we get the wrong index if the array
     # is huge along axis.
diff --git a/tensorflow/contrib/eager/python/BUILD b/tensorflow/contrib/eager/python/BUILD
index 77052a75a70bec1162feb2b126d247924b3a2e36..d441e4735b64fe1176e77a978d281d46a7b287ab 100644
--- a/tensorflow/contrib/eager/python/BUILD
+++ b/tensorflow/contrib/eager/python/BUILD
@@ -15,7 +15,6 @@ py_library(
         ":metrics",
         ":network",
         ":parameter_server",
-        ":remote",
         ":saver",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
@@ -31,6 +30,7 @@ py_library(
         "//tensorflow/python/eager:def_function",
         "//tensorflow/python/eager:execution_callbacks",
         "//tensorflow/python/eager:function",
+        "//tensorflow/python/eager:remote",
     ],
 )
 
@@ -144,7 +144,7 @@ py_library(
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:function",
-        "//tensorflow/python/training/checkpointable:base",
+        "//tensorflow/python/training/tracking:base",
     ],
 )
 
@@ -238,24 +238,12 @@ py_test(
     ],
 )
 
-py_library(
-    name = "remote",
-    srcs = ["remote.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//tensorflow:internal"],
-    deps = [
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:platform",
-        "//tensorflow/python/eager:context",
-    ],
-)
-
 cuda_py_test(
     name = "remote_test",
     srcs = ["remote_test.py"],
     additional_deps = [
         ":parameter_server",
-        ":remote",
+        "//tensorflow/python/eager:remote",
         "//tensorflow/contrib/eager/python:tfe",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client",
diff --git a/tensorflow/contrib/eager/python/datasets_test.py b/tensorflow/contrib/eager/python/datasets_test.py
index 257d02057ae0d280074559aa9e97725bf5cc3fd0..48925b1bfacc6b59c210b2fb4b53a9a1a851673f 100644
--- a/tensorflow/contrib/eager/python/datasets_test.py
+++ b/tensorflow/contrib/eager/python/datasets_test.py
@@ -37,7 +37,7 @@ from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import script_ops
 from tensorflow.python.training import checkpoint_management
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import util as trackable_utils
 
 
 class IteratorTest(test.TestCase):
@@ -200,13 +200,6 @@ class IteratorTest(test.TestCase):
         y = math_ops.add(x, x)
     self.assertAllEqual([0., 2.], y.numpy())
 
-  def testGpuDefinedDataset(self):
-    with ops.device(test.gpu_device_name()):
-      ds = Dataset.from_tensors([0., 1.])
-      for x in ds:
-        y = math_ops.add(x, x)
-    self.assertAllEqual([0., 2.], y.numpy())
-
   def testOverrideThreadPool(self):
 
     def get_thread_id(_):
@@ -245,7 +238,7 @@ class IteratorTest(test.TestCase):
     dataset = Dataset.from_tensor_slices([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
     dataset = dataset.map(math_ops.square).batch(2)
     iterator = datasets.Iterator(dataset)
-    checkpoint = checkpointable_utils.Checkpoint(iterator=iterator)
+    checkpoint = trackable_utils.Checkpoint(iterator=iterator)
     self.assertAllEqual([1, 4], iterator.get_next().numpy())
     save_path = checkpoint.save(checkpoint_prefix)
     self.assertAllEqual([9, 16], iterator.get_next().numpy())
@@ -264,7 +257,7 @@ class IteratorTest(test.TestCase):
     dataset_2 = Dataset.range(10)
     iterator_3 = datasets.Iterator(dataset_2)
 
-    checkpoint = checkpointable_utils.Checkpoint(
+    checkpoint = trackable_utils.Checkpoint(
         iterator_1=iterator_1, iterator_2=iterator_2, iterator_3=iterator_3)
     self.assertAllEqual([1, 4], iterator_1.get_next().numpy())
     self.assertEqual(0, iterator_3.get_next().numpy())
@@ -286,7 +279,7 @@ class IteratorTest(test.TestCase):
     dataset = Dataset.range(3)
     iterator = datasets.Iterator(dataset)
 
-    checkpoint = checkpointable_utils.Checkpoint(iterator=iterator)
+    checkpoint = trackable_utils.Checkpoint(iterator=iterator)
     self.assertEqual(0, iterator.get_next().numpy())
     self.assertEqual(1, iterator.get_next().numpy())
     save_path = checkpoint.save(checkpoint_prefix)
@@ -300,7 +293,7 @@ class IteratorTest(test.TestCase):
     dataset = Dataset.range(10)
     for i in range(5):
       iterator = datasets.Iterator(dataset)
-      checkpoint = checkpointable_utils.Checkpoint(iterator=iterator)
+      checkpoint = trackable_utils.Checkpoint(iterator=iterator)
       checkpoint.restore(checkpoint_management.latest_checkpoint(
           checkpoint_directory))
       for j in range(2):
diff --git a/tensorflow/contrib/eager/python/examples/BUILD b/tensorflow/contrib/eager/python/examples/BUILD
index 97c299a911c9180bf69faa0fa46527e80eada790..3e0881754c750f4d36e2e4dd8b80835b031c658c 100644
--- a/tensorflow/contrib/eager/python/examples/BUILD
+++ b/tensorflow/contrib/eager/python/examples/BUILD
@@ -6,16 +6,16 @@ package(default_visibility = ["//tensorflow:internal"])
 py_library(
     name = "examples_pip",
     deps = [
-        "//tensorflow/contrib/eager/python/examples/densenet",
-        "//tensorflow/contrib/eager/python/examples/gan:mnist",
+        "//tensorflow/contrib/eager/python/examples/densenet:densenet_lib",
+        "//tensorflow/contrib/eager/python/examples/gan:mnist_lib",
         "//tensorflow/contrib/eager/python/examples/l2hmc",
         "//tensorflow/contrib/eager/python/examples/l2hmc:neural_nets",
-        "//tensorflow/contrib/eager/python/examples/linear_regression",
+        "//tensorflow/contrib/eager/python/examples/linear_regression:linear_regression_lib",
         "//tensorflow/contrib/eager/python/examples/resnet50",
         "//tensorflow/contrib/eager/python/examples/revnet",
         "//tensorflow/contrib/eager/python/examples/revnet:config",
-        "//tensorflow/contrib/eager/python/examples/rnn_colorbot",
-        "//tensorflow/contrib/eager/python/examples/rnn_ptb",
+        "//tensorflow/contrib/eager/python/examples/rnn_colorbot:rnn_colorbot_lib",
+        "//tensorflow/contrib/eager/python/examples/rnn_ptb:rnn_ptb_lib",
         "//tensorflow/contrib/eager/python/examples/spinn:data",
     ],
 )
diff --git a/tensorflow/contrib/eager/python/examples/densenet/BUILD b/tensorflow/contrib/eager/python/examples/densenet/BUILD
index e2154fcc5fcf774dcd52285d9442dfd5073a4992..fbb5daf230bb79f08a3d071062ddc0e8507ab324 100644
--- a/tensorflow/contrib/eager/python/examples/densenet/BUILD
+++ b/tensorflow/contrib/eager/python/examples/densenet/BUILD
@@ -9,6 +9,13 @@ py_binary(
     name = "densenet",
     srcs = ["densenet.py"],
     srcs_version = "PY2AND3",
+    deps = [":densenet_lib"],
+)
+
+py_library(
+    name = "densenet_lib",
+    srcs = ["densenet.py"],
+    srcs_version = "PY2AND3",
     deps = [
         "//tensorflow:tensorflow_py",
         "//tensorflow/contrib/eager/python:tfe",
@@ -17,33 +24,37 @@ py_binary(
 
 cuda_py_test(
     name = "densenet_test",
-    size = "large",
+    size = "medium",
     srcs = ["densenet_test.py"],
     additional_deps = [
         ":densenet",
         "//tensorflow/contrib/eager/python:tfe",
         "//tensorflow:tensorflow_py",
     ],
+    shard_count = 4,
     tags = [
         "no_pip",
         "optonly",
+        "oss_serial",
     ],
 )
 
 cuda_py_test(
     name = "densenet_graph_test",
-    size = "large",
+    size = "medium",
     srcs = ["densenet_graph_test.py"],
     additional_deps = [
         ":densenet",
         "//third_party/py/numpy",
         "//tensorflow:tensorflow_py",
     ],
+    shard_count = 4,
     tags = [
         "no_pip",
         "noasan",
         "nomsan",
         "notsan",
         "optonly",
+        "oss_serial",
     ],
 )
diff --git a/tensorflow/contrib/eager/python/examples/gan/BUILD b/tensorflow/contrib/eager/python/examples/gan/BUILD
index d64c8eb9ce122fa277567b2fbc632abfbc72df64..d99a519112787bad664232983208279cfb4d0036 100644
--- a/tensorflow/contrib/eager/python/examples/gan/BUILD
+++ b/tensorflow/contrib/eager/python/examples/gan/BUILD
@@ -9,6 +9,13 @@ py_binary(
     name = "mnist",
     srcs = ["mnist.py"],
     srcs_version = "PY2AND3",
+    deps = [":mnist_lib"],
+)
+
+py_library(
+    name = "mnist_lib",
+    srcs = ["mnist.py"],
+    srcs_version = "PY2AND3",
     deps = [
         "//tensorflow:tensorflow_py",
         "//tensorflow/contrib/eager/python:tfe",
@@ -20,7 +27,7 @@ cuda_py_test(
     name = "mnist_test",
     srcs = ["mnist_test.py"],
     additional_deps = [
-        ":mnist",
+        ":mnist_lib",
         "//tensorflow/contrib/eager/python:tfe",
         "//tensorflow:tensorflow_py",
     ],
@@ -30,7 +37,7 @@ cuda_py_test(
     name = "mnist_graph_test",
     srcs = ["mnist_graph_test.py"],
     additional_deps = [
-        ":mnist",
+        ":mnist_lib",
         "//third_party/py/numpy",
         "//tensorflow:tensorflow_py",
     ],
diff --git a/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb b/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb
index 1a08cc0fd06516be4af5c2b0b46a3ffcf9101e95..e1a02db76f705414a34d232022f50124a5a6a3ed 100644
--- a/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb
+++ b/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb
@@ -13,11 +13,13 @@
         "\n",
         "# Convolutional VAE: An example with tf.keras and eager\n",
         "\n",
+        "This example has moved:\n",
+        "\n",
         "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\u003ctd\u003e\n",
-        "\u003ca target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb\"\u003e\n",
+        "\u003ca target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/r2/tutorials/generative/cvae.ipynb\"\u003e\n",
         "    \u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e  \n",
         "\u003c/td\u003e\u003ctd\u003e\n",
-        "\u003ca target=\"_blank\"  href=\"https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb\"\u003e\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\u003c/table\u003e"
+        "\u003ca target=\"_blank\"  href=\"https://github.com/tensorflow/docs/blob/master/site/en/r2/tutorials/generative/cvae.ipynb\"\u003e\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\u003c/table\u003e"
       ]
     },
     {
@@ -28,604 +30,14 @@
       },
       "source": [
         "![evolution of output during training](https://tensorflow.org/images/autoencoders/cvae.gif)\n",
-        "\n",
-        "This notebook demonstrates how to generate images of handwritten digits using [tf.keras](https://www.tensorflow.org/programmers_guide/keras) and [eager execution](https://www.tensorflow.org/programmers_guide/eager) by training a Variational Autoencoder. (VAE, [[1]](https://arxiv.org/abs/1312.6114), [[2]](https://arxiv.org/abs/1401.4082)).\n",
         "\n"
       ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "P-JuIu2N_SQf"
-      },
-      "outputs": [],
-      "source": [
-        "# to generate gifs\n",
-        "!pip install imageio"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "e1_Y75QXJS6h"
-      },
-      "source": [
-        "## Import TensorFlow and enable Eager execution"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "YfIk2es3hJEd"
-      },
-      "outputs": [],
-      "source": [
-        "from __future__ import absolute_import, division, print_function\n",
-        "\n",
-        "# Import TensorFlow \u003e= 1.9 and enable eager execution\n",
-        "import tensorflow as tf\n",
-        "tfe = tf.contrib.eager\n",
-        "tf.enable_eager_execution()\n",
-        "\n",
-        "import os\n",
-        "import time\n",
-        "import numpy as np\n",
-        "import glob\n",
-        "import matplotlib.pyplot as plt\n",
-        "import PIL\n",
-        "import imageio\n",
-        "from IPython import display"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "iYn4MdZnKCey"
-      },
-      "source": [
-        "## Load the MNIST dataset\n",
-        "Each MNIST image is originally a vector of 784 integers, each of which is between 0-255 and represents the intensity of a pixel. We model each pixel with a Bernoulli distribution in our model, and we statically binarize the dataset."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "a4fYMGxGhrna"
-      },
-      "outputs": [],
-      "source": [
-        "(train_images, _), (test_images, _) = tf.keras.datasets.mnist.load_data()"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "NFC2ghIdiZYE"
-      },
-      "outputs": [],
-      "source": [
-        "train_images = train_images.reshape(train_images.shape[0], 28, 28, 1).astype('float32')\n",
-        "test_images = test_images.reshape(test_images.shape[0], 28, 28, 1).astype('float32')\n",
-        "\n",
-        "# Normalizing the images to the range of [0., 1.]\n",
-        "train_images /= 255.\n",
-        "test_images /= 255.\n",
-        "\n",
-        "# Binarization\n",
-        "train_images[train_images \u003e= .5] = 1.\n",
-        "train_images[train_images \u003c .5] = 0.\n",
-        "test_images[test_images \u003e= .5] = 1.\n",
-        "test_images[test_images \u003c .5] = 0."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "S4PIDhoDLbsZ"
-      },
-      "outputs": [],
-      "source": [
-        "TRAIN_BUF = 60000\n",
-        "BATCH_SIZE = 100\n",
-        "\n",
-        "TEST_BUF = 10000"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "PIGN6ouoQxt3"
-      },
-      "source": [
-        "## Use *tf.data* to create batches and shuffle the dataset"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "-yKCCQOoJ7cn"
-      },
-      "outputs": [],
-      "source": [
-        "train_dataset = tf.data.Dataset.from_tensor_slices(train_images).shuffle(TRAIN_BUF).batch(BATCH_SIZE)\n",
-        "test_dataset = tf.data.Dataset.from_tensor_slices(test_images).shuffle(TEST_BUF).batch(BATCH_SIZE)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "THY-sZMiQ4UV"
-      },
-      "source": [
-        "## Wire up the generative and inference network with *tf.keras.Sequential*\n",
-        "\n",
-        "In our VAE example, we use two small ConvNets for the generative and inference network. Since these neural nets are small, we use `tf.keras.Sequential` to simplify our code. Let $x$ and $z$ denote the observation and latent variable respectively in the following descriptions. \n",
-        "\n",
-        "### Generative Network\n",
-        "This defines the generative model which takes a latent encoding as input, and outputs the parameters for a conditional distribution of the observation, i.e. $p(x|z)$. Additionally, we use a unit Gaussian prior $p(z)$ for the latent variable.\n",
-        "\n",
-        "### Inference Network\n",
-        "This defines an approximate posterior distribution $q(z|x)$, which takes as input an observation and outputs a set of parameters for the conditional distribution of the latent representation. In this example, we simply model this distribution as a diagonal Gaussian. In this case, the inference network outputs the mean and log-variance parameters of a factorized Gaussian (log-variance instead of the variance directly is for numerical stability).\n",
-        "\n",
-        "### Reparameterization Trick\n",
-        "During optimization, we can sample from $q(z|x)$ by first sampling from a unit Gaussian, and then multiplying by the standard deviation and adding the mean. This ensures the gradients could pass through the sample to the inference network parameters.\n",
-        "\n",
-        "### Network architecture\n",
-        "For the inference network, we use two convolutional layers followed by a fully-connected layer. In the generative network, we mirror this architecture by using a fully-connected layer followed by three convolution transpose layers (a.k.a. deconvolutional layers in some contexts). Note, it's common practice to avoid using batch normalization when training VAEs, since the additional stochasticity due to using mini-batches may aggravate instability on top of the stochasticity from sampling."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "VGLbvBEmjK0a"
-      },
-      "outputs": [],
-      "source": [
-        "class CVAE(tf.keras.Model):\n",
-        "  def __init__(self, latent_dim):\n",
-        "    super(CVAE, self).__init__()\n",
-        "    self.latent_dim = latent_dim\n",
-        "    self.inference_net = tf.keras.Sequential(\n",
-        "      [\n",
-        "          tf.keras.layers.InputLayer(input_shape=(28, 28, 1)),\n",
-        "          tf.keras.layers.Conv2D(\n",
-        "              filters=32, kernel_size=3, strides=(2, 2), activation=tf.nn.relu),\n",
-        "          tf.keras.layers.Conv2D(\n",
-        "              filters=64, kernel_size=3, strides=(2, 2), activation=tf.nn.relu),\n",
-        "          tf.keras.layers.Flatten(),\n",
-        "          # No activation\n",
-        "          tf.keras.layers.Dense(latent_dim + latent_dim),\n",
-        "      ]\n",
-        "    )\n",
-        "\n",
-        "    self.generative_net = tf.keras.Sequential(\n",
-        "        [\n",
-        "          tf.keras.layers.InputLayer(input_shape=(latent_dim,)),\n",
-        "          tf.keras.layers.Dense(units=7*7*32, activation=tf.nn.relu),\n",
-        "          tf.keras.layers.Reshape(target_shape=(7, 7, 32)),\n",
-        "          tf.keras.layers.Conv2DTranspose(\n",
-        "              filters=64,\n",
-        "              kernel_size=3,\n",
-        "              strides=(2, 2),\n",
-        "              padding=\"SAME\",\n",
-        "              activation=tf.nn.relu),\n",
-        "          tf.keras.layers.Conv2DTranspose(\n",
-        "              filters=32,\n",
-        "              kernel_size=3,\n",
-        "              strides=(2, 2),\n",
-        "              padding=\"SAME\",\n",
-        "              activation=tf.nn.relu),\n",
-        "          # No activation\n",
-        "          tf.keras.layers.Conv2DTranspose(\n",
-        "              filters=1, kernel_size=3, strides=(1, 1), padding=\"SAME\"),\n",
-        "        ]\n",
-        "    )\n",
-        "\n",
-        "  def sample(self, eps=None):\n",
-        "    if eps is None:\n",
-        "      eps = tf.random_normal(shape=(100, self.latent_dim))\n",
-        "    return self.decode(eps, apply_sigmoid=True)\n",
-        "\n",
-        "  def encode(self, x):\n",
-        "    mean, logvar = tf.split(self.inference_net(x), num_or_size_splits=2, axis=1)\n",
-        "    return mean, logvar\n",
-        "\n",
-        "  def reparameterize(self, mean, logvar):\n",
-        "    eps = tf.random_normal(shape=mean.shape)\n",
-        "    return eps * tf.exp(logvar * .5) + mean\n",
-        "\n",
-        "  def decode(self, z, apply_sigmoid=False):\n",
-        "    logits = self.generative_net(z)\n",
-        "    if apply_sigmoid:\n",
-        "      probs = tf.sigmoid(logits)\n",
-        "      return probs\n",
-        "\n",
-        "    return logits"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "0FMYgY_mPfTi"
-      },
-      "source": [
-        "## Define the loss function and the optimizer\n",
-        "\n",
-        "VAEs train by maximizing the evidence lower bound (ELBO) on the marginal log-likelihood:\n",
-        "\n",
-        "$$\\log p(x) \\ge \\text{ELBO} = \\mathbb{E}_{q(z|x)}\\left[\\log \\frac{p(x, z)}{q(z|x)}\\right].$$\n",
-        "\n",
-        "In practice, we optimize the single sample Monte Carlo estimate of this expectation:\n",
-        "\n",
-        "$$\\log p(x| z) + \\log p(z) - \\log q(z|x),$$\n",
-        "where $z$ is sampled from $q(z|x)$.\n",
-        "\n",
-        "**Note**: we could also analytically compute the KL term, but here we incorporate all three terms in the Monte Carlo estimator for simplicity."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "iWCn_PVdEJZ7"
-      },
-      "outputs": [],
-      "source": [
-        "def log_normal_pdf(sample, mean, logvar, raxis=1):\n",
-        "  log2pi = tf.log(2. * np.pi)\n",
-        "  return tf.reduce_sum(\n",
-        "      -.5 * ((sample - mean) ** 2. * tf.exp(-logvar) + logvar + log2pi),\n",
-        "      axis=raxis)\n",
-        "\n",
-        "def compute_loss(model, x):\n",
-        "  mean, logvar = model.encode(x)\n",
-        "  z = model.reparameterize(mean, logvar)\n",
-        "  x_logit = model.decode(z)\n",
-        "\n",
-        "  cross_ent = tf.nn.sigmoid_cross_entropy_with_logits(logits=x_logit, labels=x)\n",
-        "  logpx_z = -tf.reduce_sum(cross_ent, axis=[1, 2, 3])\n",
-        "  logpz = log_normal_pdf(z, 0., 0.)\n",
-        "  logqz_x = log_normal_pdf(z, mean, logvar)\n",
-        "  return -tf.reduce_mean(logpx_z + logpz - logqz_x)\n",
-        "\n",
-        "def compute_gradients(model, x):\n",
-        "  with tf.GradientTape() as tape:\n",
-        "    loss = compute_loss(model, x)\n",
-        "  return tape.gradient(loss, model.trainable_variables), loss\n",
-        "\n",
-        "optimizer = tf.train.AdamOptimizer(1e-4)\n",
-        "def apply_gradients(optimizer, gradients, variables, global_step=None):\n",
-        "  optimizer.apply_gradients(zip(gradients, variables), global_step=global_step)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "Rw1fkAczTQYh"
-      },
-      "source": [
-        "## Training\n",
-        "\n",
-        "* We start by iterating over the dataset\n",
-        "* During each iteration, we pass the image to the encoder to obtain a set of mean and log-variance parameters of the approximate posterior $q(z|x)$\n",
-        "* We then apply the *reparameterization trick* to sample from $q(z|x)$\n",
-        "* Finally, we pass the reparameterized samples to the decoder to obtain the logits of the generative distribution $p(x|z)$\n",
-        "* **Note:** Since we use the dataset loaded by keras with 60k datapoints in the training set and 10k datapoints in the test set, our resulting ELBO on the test set is slightly higher than reported results in the literature which uses dynamic binarization of Larochelle's MNIST.\n",
-        "\n",
-        "## Generate Images\n",
-        "\n",
-        "* After training, it is time to generate some images\n",
-        "* We start by sampling a set of latent vectors from the unit Gaussian prior distribution $p(z)$\n",
-        "* The generator will then convert the latent sample $z$ to logits of the observation, giving a distribution $p(x|z)$\n",
-        "* Here we plot the probabilities of Bernoulli distributions\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "NS2GWywBbAWo"
-      },
-      "outputs": [],
-      "source": [
-        "epochs = 100\n",
-        "latent_dim = 50\n",
-        "num_examples_to_generate = 16\n",
-        "\n",
-        "# keeping the random vector constant for generation (prediction) so\n",
-        "# it will be easier to see the improvement.\n",
-        "random_vector_for_generation = tf.random_normal(\n",
-        "    shape=[num_examples_to_generate, latent_dim])\n",
-        "model = CVAE(latent_dim)"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "RmdVsmvhPxyy"
-      },
-      "outputs": [],
-      "source": [
-        "def generate_and_save_images(model, epoch, test_input):\n",
-        "  predictions = model.sample(test_input)\n",
-        "  fig = plt.figure(figsize=(4,4))\n",
-        "\n",
-        "  for i in range(predictions.shape[0]):\n",
-        "      plt.subplot(4, 4, i+1)\n",
-        "      plt.imshow(predictions[i, :, :, 0], cmap='gray')\n",
-        "      plt.axis('off')\n",
-        "\n",
-        "  # tight_layout minimizes the overlap between 2 sub-plots\n",
-        "  plt.savefig('image_at_epoch_{:04d}.png'.format(epoch))\n",
-        "  plt.show()"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "2M7LmLtGEMQJ"
-      },
-      "outputs": [],
-      "source": [
-        "generate_and_save_images(model, 0, random_vector_for_generation)\n",
-        "\n",
-        "for epoch in range(1, epochs + 1):\n",
-        "  start_time = time.time()\n",
-        "  for train_x in train_dataset:\n",
-        "    gradients, loss = compute_gradients(model, train_x)\n",
-        "    apply_gradients(optimizer, gradients, model.trainable_variables)\n",
-        "  end_time = time.time()\n",
-        "\n",
-        "  if epoch % 1 == 0:\n",
-        "    loss = tfe.metrics.Mean()\n",
-        "    for test_x in test_dataset:\n",
-        "      loss(compute_loss(model, test_x))\n",
-        "    elbo = -loss.result()\n",
-        "    display.clear_output(wait=False)\n",
-        "    print('Epoch: {}, Test set ELBO: {}, '\n",
-        "          'time elapse for current epoch {}'.format(epoch,\n",
-        "                                                    elbo,\n",
-        "                                                    end_time - start_time))\n",
-        "    generate_and_save_images(\n",
-        "        model, epoch, random_vector_for_generation)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "P4M_vIbUi7c0"
-      },
-      "source": [
-        "### Display an image using the epoch number"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "WfO5wCdclHGL"
-      },
-      "outputs": [],
-      "source": [
-        "def display_image(epoch_no):\n",
-        "  return PIL.Image.open('image_at_epoch_{:04d}.png'.format(epoch_no))"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "5x3q9_Oe5q0A"
-      },
-      "outputs": [],
-      "source": [
-        "display_image(epochs)  # Display images"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "NywiH3nL8guF"
-      },
-      "source": [
-        "### Generate a GIF of all the saved images."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "IGKQgENQ8lEI"
-      },
-      "outputs": [],
-      "source": [
-        "with imageio.get_writer('cvae.gif', mode='I') as writer:\n",
-        "  filenames = glob.glob('image*.png')\n",
-        "  filenames = sorted(filenames)\n",
-        "  last = -1\n",
-        "  for i,filename in enumerate(filenames):\n",
-        "    frame = 2*(i**0.5)\n",
-        "    if round(frame) \u003e round(last):\n",
-        "      last = frame\n",
-        "    else:\n",
-        "      continue\n",
-        "    image = imageio.imread(filename)\n",
-        "    writer.append_data(image)\n",
-        "  image = imageio.imread(filename)\n",
-        "  writer.append_data(image)\n",
-        "    \n",
-        "# this is a hack to display the gif inside the notebook\n",
-        "os.system('cp cvae.gif cvae.gif.png')"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "uV0yiKpzNP1b"
-      },
-      "outputs": [],
-      "source": [
-        "display.Image(filename=\"cvae.gif.png\")"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "yQXO_dlXkKsT"
-      },
-      "source": [
-        "To downlod the animation from Colab uncomment the code below:"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        },
-        "colab_type": "code",
-        "id": "4fSJS3m5HLFM"
-      },
-      "outputs": [],
-      "source": [
-        "#from google.colab import files\n",
-        "#files.download('cvae.gif')"
-      ]
     }
   ],
   "metadata": {
     "accelerator": "GPU",
     "colab": {
       "collapsed_sections": [],
-      "default_view": {},
       "name": "cvae.ipynb",
       "private_outputs": true,
       "provenance": [
@@ -635,8 +47,7 @@
         }
       ],
       "toc_visible": true,
-      "version": "0.3.2",
-      "views": {}
+      "version": "0.3.2"
     },
     "kernelspec": {
       "display_name": "Python 3",
diff --git a/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb b/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb
index 78fcd397087fd1fd64aebed7ac3b5c6b2f45c450..53767058838459e56215d286e9f8f8eb66287147 100644
--- a/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb
+++ b/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb
@@ -1,26 +1,11 @@
 {
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
-    "colab": {
-      "name": "dcgan.ipynb",
-      "version": "0.3.2",
-      "provenance": [],
-      "collapsed_sections": []
-    },
-    "kernelspec": {
-      "name": "python2",
-      "display_name": "Python 2"
-    },
-    "accelerator": "GPU"
-  },
   "cells": [
     {
+      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "0TD5ZrvEMbhZ"
       },
-      "cell_type": "markdown",
       "source": [
         "**Copyright 2018 The TensorFlow Authors**.\n",
         "\n",
@@ -28,851 +13,39 @@
         "\n",
         "# Generating Handwritten Digits with DCGAN\n",
         "\n",
-        "<table class=\"tfo-notebook-buttons\" align=\"left\"><td>\n",
-        "<a target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb\">\n",
-        "    <img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>  \n",
-        "</td><td>\n",
-        "<a target=\"_blank\"  href=\"https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb\"><img width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a></td></table>"
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "ITZuApL56Mny"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "This tutorial demonstrates how to generate images of handwritten digits using a Deep Convolutional Generative Adversarial Network ([DCGAN](https://arxiv.org/pdf/1511.06434.pdf)). The code is written in [tf.keras](https://www.tensorflow.org/programmers_guide/keras) with [eager execution](https://www.tensorflow.org/programmers_guide/eager) enabled. "
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "toc",
-        "id": "x2McrO9bMyLN"
-      },
-      "cell_type": "markdown",
-      "source": [
-        ">[Generating Handwritten Digits with DCGAN](#scrollTo=0TD5ZrvEMbhZ)\n",
-        "\n",
-        ">>[What are GANs?](#scrollTo=2MbKJY38Puy9)\n",
-        "\n",
-        ">>>[Import TensorFlow and enable eager execution](#scrollTo=e1_Y75QXJS6h)\n",
-        "\n",
-        ">>>[Load the dataset](#scrollTo=iYn4MdZnKCey)\n",
-        "\n",
-        ">>>[Use tf.data to create batches and shuffle the dataset](#scrollTo=PIGN6ouoQxt3)\n",
-        "\n",
-        ">>[Create the models](#scrollTo=THY-sZMiQ4UV)\n",
-        "\n",
-        ">>>[The Generator Model](#scrollTo=-tEyxE-GMC48)\n",
-        "\n",
-        ">>>[The Discriminator model](#scrollTo=D0IKnaCtg6WE)\n",
-        "\n",
-        ">>[Define the loss functions and the optimizer](#scrollTo=0FMYgY_mPfTi)\n",
-        "\n",
-        ">>>[Generator loss](#scrollTo=Jd-3GCUEiKtv)\n",
-        "\n",
-        ">>>[Discriminator loss](#scrollTo=PKY_iPSPNWoj)\n",
-        "\n",
-        ">>[Set up GANs for Training](#scrollTo=Rw1fkAczTQYh)\n",
-        "\n",
-        ">>[Train the GANs](#scrollTo=dZrd4CdjR-Fp)\n",
-        "\n",
-        ">>[Generated images](#scrollTo=P4M_vIbUi7c0)\n",
+        "This example has moved.\n",
         "\n",
-        ">>[Learn more about GANs](#scrollTo=k6qC-SbjK0yW)\n",
-        "\n"
+        "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\u003ctd\u003e\n",
+        "\u003ca target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/r2/tutorials/generative/dcgan.ipynb\"\u003e\n",
+        "    \u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e  \n",
+        "\u003c/td\u003e\u003ctd\u003e\n",
+        "\u003ca target=\"_blank\"  href=\"https://github.com/tensorflow/blob/master/site/en/r2/tutorials/generative/dcgan.ipynb\"\u003e\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\u003c/table\u003e"
       ]
     },
     {
+      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "2MbKJY38Puy9"
       },
-      "cell_type": "markdown",
       "source": [
-        "## What are GANs?\n",
-        "GANs, or [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661), are a framework for estimating generative models. Two models are trained simultaneously by an adversarial process: a Generator, which is responsible for generating data (say, images), and a Discriminator, which is responsible for estimating the probability that an image was drawn from the training data (the image is real), or was produced by the Generator (the image is fake). During training, the Generator becomes progressively better at generating images, until the Discriminator is no longer able to distinguish real images from fake. \n",
-        "\n",
-        "![alt text](https://github.com/margaretmz/tensorflow/blob/margaret-dcgan/tensorflow/contrib/eager/python/examples/generative_examples/gans_diagram.png?raw=1)\n",
-        "\n",
-        "We will demonstrate this process end-to-end on MNIST. Below is an animation that shows a series of images produced by the Generator as it was trained for 50 epochs. Overtime, the generated images become increasingly difficult to distinguish from the training set.\n",
-        "\n",
-        "To learn more about GANs, we recommend MIT's [Intro to Deep Learning](http://introtodeeplearning.com/) course, which includes a lecture on Deep Generative Models ([video](https://youtu.be/JVb54xhEw6Y) | [slides](http://introtodeeplearning.com/materials/2018_6S191_Lecture4.pdf)). Now, let's head to the code!\n",
-        "\n",
         "![sample output](https://tensorflow.org/images/gan/dcgan.gif)"
       ]
+    }
+  ],
+  "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "collapsed_sections": [],
+      "name": "dcgan.ipynb",
+      "provenance": [],
+      "version": "0.3.2"
     },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "u_2z-B3piVsw",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "# Install imgeio in order to generate an animated gif showing the image generating process\n",
-        "!pip install imageio"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "e1_Y75QXJS6h"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "### Import TensorFlow and enable eager execution"
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "YfIk2es3hJEd",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "import tensorflow as tf\n",
-        "tf.enable_eager_execution()\n",
-        "\n",
-        "import glob\n",
-        "import imageio\n",
-        "import matplotlib.pyplot as plt\n",
-        "import numpy as np\n",
-        "import os\n",
-        "import PIL\n",
-        "import time\n",
-        "\n",
-        "from IPython import display"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "iYn4MdZnKCey"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "### Load the dataset\n",
-        "\n",
-        "We are going to use the MNIST dataset to train the generator and the discriminator. The generator will generate handwritten digits resembling the MNIST data."
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "a4fYMGxGhrna",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "(train_images, train_labels), (_, _) = tf.keras.datasets.mnist.load_data()"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "NFC2ghIdiZYE",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "train_images = train_images.reshape(train_images.shape[0], 28, 28, 1).astype('float32')\n",
-        "train_images = (train_images - 127.5) / 127.5 # Normalize the images to [-1, 1]"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "S4PIDhoDLbsZ",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "BUFFER_SIZE = 60000\n",
-        "BATCH_SIZE = 256"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "PIGN6ouoQxt3"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "### Use tf.data to create batches and shuffle the dataset"
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "-yKCCQOoJ7cn",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "train_dataset = tf.data.Dataset.from_tensor_slices(train_images).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "THY-sZMiQ4UV"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Create the models\n",
-        "\n",
-        "We will use tf.keras [Sequential API](https://www.tensorflow.org/guide/keras#sequential_model) to define the generator and discriminator models."
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "-tEyxE-GMC48"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "### The Generator Model\n",
-        "\n",
-        "The generator is responsible for creating convincing images that are good enough to fool the discriminator. The network architecture for the generator consists of [Conv2DTranspose](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Conv2DTranspose) (Upsampling) layers. We start with a fully connected layer and upsample the image two times in order to reach the desired image size of 28x28x1. We increase the width and height, and reduce the depth as we move through the layers in the network. We use [Leaky ReLU](https://www.tensorflow.org/api_docs/python/tf/keras/layers/LeakyReLU) activation for each layer except for the last one where we use a tanh activation."
-      ]
-    },
-    {
-      "metadata": {
-        "id": "6bpTcDqoLWjY",
-        "colab_type": "code",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "def make_generator_model():\n",
-        "    model = tf.keras.Sequential()\n",
-        "    model.add(tf.keras.layers.Dense(7*7*256, use_bias=False, input_shape=(100,)))\n",
-        "    model.add(tf.keras.layers.BatchNormalization())\n",
-        "    model.add(tf.keras.layers.LeakyReLU())\n",
-        "      \n",
-        "    model.add(tf.keras.layers.Reshape((7, 7, 256)))\n",
-        "    assert model.output_shape == (None, 7, 7, 256) # Note: None is the batch size\n",
-        "    \n",
-        "    model.add(tf.keras.layers.Conv2DTranspose(128, (5, 5), strides=(1, 1), padding='same', use_bias=False))\n",
-        "    assert model.output_shape == (None, 7, 7, 128)  \n",
-        "    model.add(tf.keras.layers.BatchNormalization())\n",
-        "    model.add(tf.keras.layers.LeakyReLU())\n",
-        "\n",
-        "    model.add(tf.keras.layers.Conv2DTranspose(64, (5, 5), strides=(2, 2), padding='same', use_bias=False))\n",
-        "    assert model.output_shape == (None, 14, 14, 64)    \n",
-        "    model.add(tf.keras.layers.BatchNormalization())\n",
-        "    model.add(tf.keras.layers.LeakyReLU())\n",
-        "\n",
-        "    model.add(tf.keras.layers.Conv2DTranspose(1, (5, 5), strides=(2, 2), padding='same', use_bias=False, activation='tanh'))\n",
-        "    assert model.output_shape == (None, 28, 28, 1)\n",
-        "  \n",
-        "    return model"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "D0IKnaCtg6WE"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "### The Discriminator model\n",
-        "\n",
-        "The discriminator is responsible for distinguishing fake images from real images. It's similar to a regular CNN-based image classifier."
-      ]
-    },
-    {
-      "metadata": {
-        "id": "dw2tPLmk2pEP",
-        "colab_type": "code",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "def make_discriminator_model():\n",
-        "    model = tf.keras.Sequential()\n",
-        "    model.add(tf.keras.layers.Conv2D(64, (5, 5), strides=(2, 2), padding='same'))\n",
-        "    model.add(tf.keras.layers.LeakyReLU())\n",
-        "    model.add(tf.keras.layers.Dropout(0.3))\n",
-        "      \n",
-        "    model.add(tf.keras.layers.Conv2D(128, (5, 5), strides=(2, 2), padding='same'))\n",
-        "    model.add(tf.keras.layers.LeakyReLU())\n",
-        "    model.add(tf.keras.layers.Dropout(0.3))\n",
-        "       \n",
-        "    model.add(tf.keras.layers.Flatten())\n",
-        "    model.add(tf.keras.layers.Dense(1))\n",
-        "     \n",
-        "    return model"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "gDkA05NE6QMs",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "generator = make_generator_model()\n",
-        "discriminator = make_discriminator_model()"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "0FMYgY_mPfTi"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Define the loss functions and the optimizer\n",
-        "\n",
-        "Let's define the loss functions and the optimizers for the generator and the discriminator.\n"
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "Jd-3GCUEiKtv"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "### Generator loss\n",
-        "The generator loss is a sigmoid cross entropy loss of the generated images and an array of ones, since the generator is trying to generate fake images that resemble the real images."
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "90BIcCKcDMxz",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "def generator_loss(generated_output):\n",
-        "    return tf.losses.sigmoid_cross_entropy(tf.ones_like(generated_output), generated_output)"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "PKY_iPSPNWoj"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "### Discriminator loss\n",
-        "\n",
-        "The discriminator loss function takes two inputs: real images, and generated images. Here is how to calculate the discriminator loss:\n",
-        "1. Calculate real_loss which is a sigmoid cross entropy loss of the real images and an array of ones (since these are the real images).\n",
-        "2. Calculate generated_loss which is a sigmoid cross entropy loss of the generated images and an array of zeros (since these are the fake images).\n",
-        "3. Calculate the total_loss as the sum of real_loss and generated_loss."
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "wkMNfBWlT-PV",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "def discriminator_loss(real_output, generated_output):\n",
-        "    # [1,1,...,1] with real output since it is true and we want our generated examples to look like it\n",
-        "    real_loss = tf.losses.sigmoid_cross_entropy(multi_class_labels=tf.ones_like(real_output), logits=real_output)\n",
-        "\n",
-        "    # [0,0,...,0] with generated images since they are fake\n",
-        "    generated_loss = tf.losses.sigmoid_cross_entropy(multi_class_labels=tf.zeros_like(generated_output), logits=generated_output)\n",
-        "\n",
-        "    total_loss = real_loss + generated_loss\n",
-        "\n",
-        "    return total_loss"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "MgIc7i0th_Iu"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "The discriminator and the generator optimizers are different since we will train two networks separately."
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "iWCn_PVdEJZ7",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "generator_optimizer = tf.train.AdamOptimizer(1e-4)\n",
-        "discriminator_optimizer = tf.train.AdamOptimizer(1e-4)"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "mWtinsGDPJlV"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "**Checkpoints (Object-based saving)**"
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "CA1w-7s2POEy",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "checkpoint_dir = './training_checkpoints'\n",
-        "checkpoint_prefix = os.path.join(checkpoint_dir, \"ckpt\")\n",
-        "checkpoint = tf.train.Checkpoint(generator_optimizer=generator_optimizer,\n",
-        "                                 discriminator_optimizer=discriminator_optimizer,\n",
-        "                                 generator=generator,\n",
-        "                                 discriminator=discriminator)"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "Rw1fkAczTQYh"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Set up GANs for Training\n",
-        "\n"
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "5QC5BABamh_c"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "Now it's time to put together the generator and discriminator to set up the Generative Adversarial Networks, as you see in the diagam at the beginning of the tutorial."
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "Ff6oN6PZX27n"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "**Define training parameters**"
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "NS2GWywBbAWo",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "EPOCHS = 50\n",
-        "noise_dim = 100\n",
-        "num_examples_to_generate = 16\n",
-        "\n",
-        "# We'll re-use this random vector used to seed the generator so\n",
-        "# it will be easier to see the improvement over time.\n",
-        "random_vector_for_generation = tf.random_normal([num_examples_to_generate,\n",
-        "                                                 noise_dim])"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "jylSonrqSWfi"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "**Define training method**\n",
-        "\n",
-        "We start by iterating over the dataset. The generator is given a random vector as an input which is processed to  output an image looking like a handwritten digit. The discriminator is then shown the real MNIST images as well as the generated images.\n",
-        "\n",
-        "Next, we calculate the generator and the discriminator loss. Then, we calculate the gradients of loss with respect to both the generator and the discriminator variables."
-      ]
-    },
-    {
-      "metadata": {
-        "id": "3t5ibNo05jCB",
-        "colab_type": "code",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "def train_step(images):\n",
-        "   # generating noise from a normal distribution\n",
-        "      noise = tf.random_normal([BATCH_SIZE, noise_dim])\n",
-        "      \n",
-        "      with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:\n",
-        "        generated_images = generator(noise, training=True)\n",
-        "      \n",
-        "        real_output = discriminator(images, training=True)\n",
-        "        generated_output = discriminator(generated_images, training=True)\n",
-        "         \n",
-        "        gen_loss = generator_loss(generated_output)\n",
-        "        disc_loss = discriminator_loss(real_output, generated_output)\n",
-        "        \n",
-        "      gradients_of_generator = gen_tape.gradient(gen_loss, generator.variables)\n",
-        "      gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator.variables)\n",
-        "      \n",
-        "      generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.variables))\n",
-        "      discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.variables))"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "6TSZgwc2BUQ-"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "\n",
-        "This model takes about ~30 seconds per epoch to train on a single Tesla K80 on Colab, as of October 2018. \n",
-        "\n",
-        "Eager execution can be slower than executing the equivalent graph as it can't benefit from whole-program optimizations on the graph, and also incurs overheads of interpreting Python code. By using [tf.contrib.eager.defun](https://www.tensorflow.org/api_docs/python/tf/contrib/eager/defun) to create graph functions, we get a ~20 secs/epoch performance boost (from ~50 secs/epoch down to ~30 secs/epoch). This way we get the best of both eager execution (easier for debugging) and graph mode (better performance)."
-      ]
-    },
-    {
-      "metadata": {
-        "id": "Iwya07_j5p2A",
-        "colab_type": "code",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "train_step = tf.contrib.eager.defun(train_step)"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "2M7LmLtGEMQJ",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "def train(dataset, epochs):  \n",
-        "  for epoch in range(epochs):\n",
-        "    start = time.time()\n",
-        "    \n",
-        "    for images in dataset:\n",
-        "      train_step(images)\n",
-        "\n",
-        "    display.clear_output(wait=True)\n",
-        "    generate_and_save_images(generator,\n",
-        "                               epoch + 1,\n",
-        "                               random_vector_for_generation)\n",
-        "    \n",
-        "    # saving (checkpoint) the model every 15 epochs\n",
-        "    if (epoch + 1) % 15 == 0:\n",
-        "      checkpoint.save(file_prefix = checkpoint_prefix)\n",
-        "    \n",
-        "    print ('Time taken for epoch {} is {} sec'.format(epoch + 1,\n",
-        "                                                      time.time()-start))\n",
-        "  # generating after the final epoch\n",
-        "  display.clear_output(wait=True)\n",
-        "  generate_and_save_images(generator,\n",
-        "                           epochs,\n",
-        "                           random_vector_for_generation)"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "2aFF7Hk3XdeW"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "**Generate and save images**\n",
-        "\n"
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "RmdVsmvhPxyy",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "def generate_and_save_images(model, epoch, test_input):\n",
-        "  # make sure the training parameter is set to False because we\n",
-        "  # don't want to train the batchnorm layer when doing inference.\n",
-        "  predictions = model(test_input, training=False)\n",
-        "\n",
-        "  fig = plt.figure(figsize=(4,4))\n",
-        "  \n",
-        "  for i in range(predictions.shape[0]):\n",
-        "      plt.subplot(4, 4, i+1)\n",
-        "      plt.imshow(predictions[i, :, :, 0] * 127.5 + 127.5, cmap='gray')\n",
-        "      plt.axis('off')\n",
-        "        \n",
-        "  plt.savefig('image_at_epoch_{:04d}.png'.format(epoch))\n",
-        "  plt.show()"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "dZrd4CdjR-Fp"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Train the GANs\n",
-        "We will call the train() method defined above to train the generator and discriminator simultaneously. Note, training GANs can be tricky. It's important that the generator and discriminator do not overpower each other (e.g., that they train at a similar rate).\n",
-        "\n",
-        "At the beginning of the training, the generated images look like random noise. As training progresses, you can see the generated digits look increasingly real. After 50 epochs, they look very much like the MNIST digits."
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "Ly3UN0SLLY2l",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "%%time\n",
-        "train(train_dataset, EPOCHS)"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "rfM4YcPVPkNO"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "**Restore the latest checkpoint**"
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "XhXsd0srPo8c",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "# restoring the latest checkpoint in checkpoint_dir\n",
-        "checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "P4M_vIbUi7c0"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Generated images \n"
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "mLskt7EfXAjr"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "\n",
-        "After training, its time to generate some images! \n",
-        "The last step is to plot the generated images and voila!\n"
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "WfO5wCdclHGL",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "# Display a single image using the epoch number\n",
-        "def display_image(epoch_no):\n",
-        "  return PIL.Image.open('image_at_epoch_{:04d}.png'.format(epoch_no))"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "5x3q9_Oe5q0A",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "display_image(EPOCHS)"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "NywiH3nL8guF"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "**Generate a GIF of all the saved images**\n",
-        "\n",
-        "We will use imageio to create an animated gif using all the images saved during training."
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "IGKQgENQ8lEI",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "with imageio.get_writer('dcgan.gif', mode='I') as writer:\n",
-        "  filenames = glob.glob('image*.png')\n",
-        "  filenames = sorted(filenames)\n",
-        "  last = -1\n",
-        "  for i,filename in enumerate(filenames):\n",
-        "    frame = 2*(i**0.5)\n",
-        "    if round(frame) > round(last):\n",
-        "      last = frame\n",
-        "    else:\n",
-        "      continue\n",
-        "    image = imageio.imread(filename)\n",
-        "    writer.append_data(image)\n",
-        "  image = imageio.imread(filename)\n",
-        "  writer.append_data(image)\n",
-        "    \n",
-        "# this is a hack to display the gif inside the notebook\n",
-        "os.system('cp dcgan.gif dcgan.gif.png')"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "cGhC3-fMWSwl"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "Display the animated gif with all the mages generated during the training of GANs."
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "uV0yiKpzNP1b",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "display.Image(filename=\"dcgan.gif.png\")"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "6EEG-wePkmJQ"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "**Download the animated gif**\n",
-        "\n",
-        "Uncomment the code below to download an animated gif from Colab."
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "code",
-        "id": "4UJjSnIMOzOJ",
-        "colab": {}
-      },
-      "cell_type": "code",
-      "source": [
-        "#from google.colab import files\n",
-        "#files.download('dcgan.gif')"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "k6qC-SbjK0yW"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Learn more about GANs\n"
-      ]
-    },
-    {
-      "metadata": {
-        "colab_type": "text",
-        "id": "xjjkT9KAK6H7"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "We hope this tutorial was helpful! As a next step, you might like to experiment with a different dataset, for example the Large-scale Celeb Faces Attributes (CelebA) dataset [available on Kaggle](https://www.kaggle.com/jessicali9530/celeba-dataset/home).\n",
-        "\n",
-        "To learn more about GANs:\n",
-        "\n",
-        "* Check out MIT's lecture (linked above), or [this](http://cs231n.stanford.edu/slides/2018/cs231n_2018_lecture12.pdf) lecture form Stanford's CS231n. \n",
-        "\n",
-        "* We also recommend the [CVPR 2018 Tutorial on GANs](https://sites.google.com/view/cvpr2018tutorialongans/), and the [NIPS 2016 Tutorial: Generative Adversarial Networks](https://arxiv.org/abs/1701.00160).\n"
-      ]
+    "kernelspec": {
+      "display_name": "Python 2",
+      "name": "python2"
     }
-  ]
-}
\ No newline at end of file
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
diff --git a/tensorflow/contrib/eager/python/examples/generative_examples/gans_diagram.png b/tensorflow/contrib/eager/python/examples/generative_examples/gans_diagram.png
deleted file mode 100644
index b715bd83ef117641c6429e0ac173dbe9b8d5fd88..0000000000000000000000000000000000000000
Binary files a/tensorflow/contrib/eager/python/examples/generative_examples/gans_diagram.png and /dev/null differ
diff --git a/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb b/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb
index 12c5eff2b4aa901bdab52bf545e95b1e4dce7468..979772acd3f823a8cc53ab5e026946ad3bb19353 100644
--- a/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb
+++ b/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb
@@ -1,1174 +1,71 @@
 {
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "K2s1A9eLRPEj"
-   },
-   "source": [
-    "##### Copyright 2018 The TensorFlow Authors.\n",
-    "\n",
-    "Licensed under the Apache License, Version 2.0 (the \"License\").\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "Cffg2i257iMS"
-   },
-   "source": [
-    "# Image Captioning with Attention\n",
-    "\n",
-    "<table class=\"tfo-notebook-buttons\" align=\"left\"><td>\n",
-    "<a target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb\">\n",
-    "    <img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>  \n",
-    "</td><td>\n",
-    "<a target=\"_blank\"  href=\"https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb\"><img width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a></td></table>"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "QASbY_HGo4Lq"
-   },
-   "source": [
-    "Image captioning is the task of generating a caption for an image. Given an image like this:\n",
-    "\n",
-    "![Man Surfing](https://tensorflow.org/images/surf.jpg) \n",
-    "\n",
-    "[Image Source](https://commons.wikimedia.org/wiki/Surfing#/media/File:Surfing_in_Hawaii.jpg), License: Public Domain\n",
-    "\n",
-    "Our goal is to generate a caption, such as \"a surfer riding on a wave\". Here, we'll use an attention-based model. This enables us to see which parts of the image the model focuses on as it generates a caption.\n",
-    "\n",
-    "![Prediction](https://tensorflow.org/images/imcap_prediction.png)\n",
-    "\n",
-    "This model architecture below is similar to [Show, Attend and Tell: Neural Image Caption Generation with Visual Attention](https://arxiv.org/abs/1502.03044). \n",
-    "\n",
-    "The code uses [tf.keras](https://www.tensorflow.org/programmers_guide/keras) and [eager execution](https://www.tensorflow.org/programmers_guide/eager), which you can learn more about in the linked guides.\n",
-    "\n",
-    "This notebook is an end-to-end example. If you run it, it will download the  [MS-COCO](http://cocodataset.org/#home) dataset, preprocess and cache a subset of the images using Inception V3, train an encoder-decoder model, and use it to generate captions on new images.\n",
-    "\n",
-    "The code requires TensorFlow version >=1.9. If you're running this in [Colab]()\n",
-    "\n",
-    "In this example, we're training on a relatively small amount of data as an example. On a single P100 GPU, this example will take about ~2 hours to train. We train on the first 30,000 captions (corresponding to about ~20,000 images depending on shuffling, as there are multiple captions per image in the dataset)\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "U8l4RJ0XRPEm"
-   },
-   "outputs": [],
-   "source": [
-    "# Import TensorFlow and enable eager execution\n",
-    "# This code requires TensorFlow version >=1.9\n",
-    "import tensorflow as tf\n",
-    "tf.enable_eager_execution()\n",
-    "\n",
-    "# We'll generate plots of attention in order to see which parts of an image\n",
-    "# our model focuses on during captioning\n",
-    "import matplotlib.pyplot as plt\n",
-    "\n",
-    "# Scikit-learn includes many helpful utilities\n",
-    "from sklearn.model_selection import train_test_split\n",
-    "from sklearn.utils import shuffle\n",
-    "\n",
-    "import re\n",
-    "import numpy as np\n",
-    "import os\n",
-    "import time\n",
-    "import json\n",
-    "from glob import glob\n",
-    "from PIL import Image\n",
-    "import pickle"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "b6qbGw8MRPE5"
-   },
-   "source": [
-    "## Download and prepare the MS-COCO dataset\n",
-    "\n",
-    "We will use the [MS-COCO dataset](http://cocodataset.org/#home) to train our model. This dataset contains >82,000 images, each of which has been annotated with at least 5 different captions. The code below will download and extract the dataset automatically.  \n",
-    "\n",
-    "**Caution: large download ahead**. We'll use the training set, it's a 13GB file."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "krQuPYTtRPE7"
-   },
-   "outputs": [],
-   "source": [
-    "annotation_zip = tf.keras.utils.get_file('captions.zip', \n",
-    "                                          cache_subdir=os.path.abspath('.'),\n",
-    "                                          origin = 'http://images.cocodataset.org/annotations/annotations_trainval2014.zip',\n",
-    "                                          extract = True)\n",
-    "annotation_file = os.path.dirname(annotation_zip)+'/annotations/captions_train2014.json'\n",
-    "\n",
-    "name_of_zip = 'train2014.zip'\n",
-    "if not os.path.exists(os.path.abspath('.') + '/' + name_of_zip):\n",
-    "  image_zip = tf.keras.utils.get_file(name_of_zip, \n",
-    "                                      cache_subdir=os.path.abspath('.'),\n",
-    "                                      origin = 'http://images.cocodataset.org/zips/train2014.zip',\n",
-    "                                      extract = True)\n",
-    "  PATH = os.path.dirname(image_zip)+'/train2014/'\n",
-    "else:\n",
-    "  PATH = os.path.abspath('.')+'/train2014/'"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "aANEzb5WwSzg"
-   },
-   "source": [
-    "## Optionally, limit the size of the training set for faster training\n",
-    "For this example, we'll select a subset of 30,000 captions and use these and the corresponding images to train our model. As always, captioning quality will improve if you choose to use more data."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "4G3b8x8_RPFD"
-   },
-   "outputs": [],
-   "source": [
-    "# read the json file\n",
-    "with open(annotation_file, 'r') as f:\n",
-    "    annotations = json.load(f)\n",
-    "\n",
-    "# storing the captions and the image name in vectors\n",
-    "all_captions = []\n",
-    "all_img_name_vector = []\n",
-    "\n",
-    "for annot in annotations['annotations']:\n",
-    "    caption = '<start> ' + annot['caption'] + ' <end>'\n",
-    "    image_id = annot['image_id']\n",
-    "    full_coco_image_path = PATH + 'COCO_train2014_' + '%012d.jpg' % (image_id)\n",
-    "    \n",
-    "    all_img_name_vector.append(full_coco_image_path)\n",
-    "    all_captions.append(caption)\n",
-    "\n",
-    "# shuffling the captions and image_names together\n",
-    "# setting a random state\n",
-    "train_captions, img_name_vector = shuffle(all_captions,\n",
-    "                                          all_img_name_vector,\n",
-    "                                          random_state=1)\n",
-    "\n",
-    "# selecting the first 30000 captions from the shuffled set\n",
-    "num_examples = 30000\n",
-    "train_captions = train_captions[:num_examples]\n",
-    "img_name_vector = img_name_vector[:num_examples]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "mPBMgK34RPFL"
-   },
-   "outputs": [],
-   "source": [
-    "len(train_captions), len(all_captions)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "8cSW4u-ORPFQ"
-   },
-   "source": [
-    "## Preprocess the images using InceptionV3\n",
-    "Next, we will use InceptionV3 (pretrained on Imagenet) to classify each image. We will extract features from the last convolutional layer. \n",
-    "\n",
-    "First, we will need to convert the images into the format inceptionV3 expects by:\n",
-    "* Resizing the image to (299, 299)\n",
-    "* Using the [preprocess_input](https://www.tensorflow.org/api_docs/python/tf/keras/applications/inception_v3/preprocess_input) method to place the pixels in the range of -1 to 1 (to match the format of the images used to train InceptionV3)."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "zXR0217aRPFR"
-   },
-   "outputs": [],
-   "source": [
-    "def load_image(image_path):\n",
-    "    img = tf.read_file(image_path)\n",
-    "    img = tf.image.decode_jpeg(img, channels=3)\n",
-    "    img = tf.image.resize_images(img, (299, 299))\n",
-    "    img = tf.keras.applications.inception_v3.preprocess_input(img)\n",
-    "    return img, image_path"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "MDvIu4sXRPFV"
-   },
-   "source": [
-    "## Initialize InceptionV3 and load the pretrained Imagenet weights\n",
-    "\n",
-    "To do so, we'll create a tf.keras model where the output layer is the last convolutional layer in the InceptionV3 architecture. \n",
-    "* Each image is forwarded through the network and the vector that we get at the end is stored in a dictionary (image_name --> feature_vector). \n",
-    "* We use the last convolutional layer because we are using attention in this example. The shape of the output of this layer is ```8x8x2048```. \n",
-    "* We avoid doing this during training so it does not become a bottleneck. \n",
-    "* After all the images are passed through the network, we pickle the dictionary and save it to disk."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "RD3vW4SsRPFW"
-   },
-   "outputs": [],
-   "source": [
-    "image_model = tf.keras.applications.InceptionV3(include_top=False, \n",
-    "                                                weights='imagenet')\n",
-    "new_input = image_model.input\n",
-    "hidden_layer = image_model.layers[-1].output\n",
-    "\n",
-    "image_features_extract_model = tf.keras.Model(new_input, hidden_layer)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "rERqlR3WRPGO"
-   },
-   "source": [
-    "## Caching the features extracted from InceptionV3\n",
-    "\n",
-    "We will pre-process each image with InceptionV3 and cache the output to disk. Caching the output in RAM would be faster but memory intensive, requiring 8 \\* 8 \\* 2048 floats per image. At the time of writing, this would exceed the memory limitations of Colab (although these may change, an instance appears to have about 12GB of memory currently). \n",
-    "\n",
-    "Performance could be improved with a more sophisticated caching strategy (e.g., by sharding the images to reduce random access disk I/O) at the cost of more code.\n",
-    "\n",
-    "This will take about 10 minutes to run in Colab with a GPU. If you'd like to see a progress bar, you could: install [tqdm](https://github.com/tqdm/tqdm) (```!pip install tqdm```), then change this line: \n",
-    "\n",
-    "```for img, path in image_dataset:``` \n",
-    "\n",
-    "to:\n",
-    "\n",
-    "```for img, path in tqdm(image_dataset):```."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "Dx_fvbVgRPGQ"
-   },
-   "outputs": [],
-   "source": [
-    "# getting the unique images\n",
-    "encode_train = sorted(set(img_name_vector))\n",
-    "\n",
-    "# feel free to change the batch_size according to your system configuration\n",
-    "image_dataset = tf.data.Dataset.from_tensor_slices(\n",
-    "                                encode_train).map(load_image).batch(16)\n",
-    "\n",
-    "for img, path in image_dataset:\n",
-    "  batch_features = image_features_extract_model(img)\n",
-    "  batch_features = tf.reshape(batch_features, \n",
-    "                              (batch_features.shape[0], -1, batch_features.shape[3]))\n",
-    "\n",
-    "  for bf, p in zip(batch_features, path):\n",
-    "    path_of_feature = p.numpy().decode(\"utf-8\")\n",
-    "    np.save(path_of_feature, bf.numpy())"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "nyqH3zFwRPFi"
-   },
-   "source": [
-    "## Preprocess and tokenize the captions\n",
-    "\n",
-    "* First, we'll tokenize the captions (e.g., by splitting on spaces). This will give us a  vocabulary of all the unique words in the data (e.g., \"surfing\", \"football\", etc).\n",
-    "* Next, we'll limit the vocabulary size to the top 5,000 words to save memory. We'll replace all other words with the token \"UNK\" (for unknown).\n",
-    "* Finally, we create a word --> index mapping and vice-versa.\n",
-    "* We will then pad all sequences to the be same length as the longest one. "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "HZfK8RhQRPFj"
-   },
-   "outputs": [],
-   "source": [
-    "# This will find the maximum length of any caption in our dataset\n",
-    "def calc_max_length(tensor):\n",
-    "    return max(len(t) for t in tensor)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "oJGE34aiRPFo"
-   },
-   "outputs": [],
-   "source": [
-    "# The steps above is a general process of dealing with text processing\n",
-    "\n",
-    "# choosing the top 5000 words from the vocabulary\n",
-    "top_k = 5000\n",
-    "tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=top_k, \n",
-    "                                                  oov_token=\"<unk>\", \n",
-    "                                                  filters='!\"#$%&()*+.,-/:;=?@[\\]^_`{|}~ ')\n",
-    "tokenizer.fit_on_texts(train_captions)\n",
-    "train_seqs = tokenizer.texts_to_sequences(train_captions)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "8Q44tNQVRPFt"
-   },
-   "outputs": [],
-   "source": [
-    "tokenizer.word_index['<pad>'] = 0"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "0fpJb5ojRPFv"
-   },
-   "outputs": [],
-   "source": [
-    "# creating the tokenized vectors\n",
-    "train_seqs = tokenizer.texts_to_sequences(train_captions)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "AidglIZVRPF4"
-   },
-   "outputs": [],
-   "source": [
-    "# padding each vector to the max_length of the captions\n",
-    "# if the max_length parameter is not provided, pad_sequences calculates that automatically\n",
-    "cap_vector = tf.keras.preprocessing.sequence.pad_sequences(train_seqs, padding='post')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "gL0wkttkRPGA"
-   },
-   "outputs": [],
-   "source": [
-    "# calculating the max_length \n",
-    "# used to store the attention weights\n",
-    "max_length = calc_max_length(train_seqs)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "M3CD75nDpvTI"
-   },
-   "source": [
-    "## Split the data into training and testing"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "iS7DDMszRPGF"
-   },
-   "outputs": [],
-   "source": [
-    "# Create training and validation sets using 80-20 split\n",
-    "img_name_train, img_name_val, cap_train, cap_val = train_test_split(img_name_vector, \n",
-    "                                                                    cap_vector, \n",
-    "                                                                    test_size=0.2, \n",
-    "                                                                    random_state=0)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "XmViPkRFRPGH"
-   },
-   "outputs": [],
-   "source": [
-    "len(img_name_train), len(cap_train), len(img_name_val), len(cap_val)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "uEWM9xrYcg45"
-   },
-   "source": [
-    "## Our images and captions are ready! Next, let's create a tf.data dataset to use for training our model.\n",
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "Q3TnZ1ToRPGV"
-   },
-   "outputs": [],
-   "source": [
-    "# feel free to change these parameters according to your system's configuration\n",
-    "\n",
-    "BATCH_SIZE = 64\n",
-    "BUFFER_SIZE = 1000\n",
-    "embedding_dim = 256\n",
-    "units = 512\n",
-    "vocab_size = len(tokenizer.word_index)\n",
-    "# shape of the vector extracted from InceptionV3 is (64, 2048)\n",
-    "# these two variables represent that\n",
-    "features_shape = 2048\n",
-    "attention_features_shape = 64"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "SmZS2N0bXG3T"
-   },
-   "outputs": [],
-   "source": [
-    "# loading the numpy files \n",
-    "def map_func(img_name, cap):\n",
-    "    img_tensor = np.load(img_name.decode('utf-8')+'.npy')\n",
-    "    return img_tensor, cap"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "FDF_Nm3tRPGZ"
-   },
-   "outputs": [],
-   "source": [
-    "dataset = tf.data.Dataset.from_tensor_slices((img_name_train, cap_train))\n",
-    "\n",
-    "# using map to load the numpy files in parallel\n",
-    "# NOTE: Be sure to set num_parallel_calls to the number of CPU cores you have\n",
-    "# https://www.tensorflow.org/api_docs/python/tf/py_func\n",
-    "dataset = dataset.map(lambda item1, item2: tf.py_func(\n",
-    "          map_func, [item1, item2], [tf.float32, tf.int32]), num_parallel_calls=8)\n",
-    "\n",
-    "# shuffling and batching\n",
-    "dataset = dataset.shuffle(BUFFER_SIZE)\n",
-    "# https://www.tensorflow.org/api_docs/python/tf/contrib/data/batch_and_drop_remainder\n",
-    "dataset = dataset.batch(BATCH_SIZE)\n",
-    "dataset = dataset.prefetch(1)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "nrvoDphgRPGd"
-   },
-   "source": [
-    "## Model\n",
-    "\n",
-    "Fun fact, the decoder below is identical to the one in the example for [Neural Machine Translation with Attention]( https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb).\n",
-    "\n",
-    "The model architecture is inspired by the [Show, Attend and Tell](https://arxiv.org/pdf/1502.03044.pdf) paper.\n",
-    "\n",
-    "* In this example, we extract the features from the lower convolutional layer of InceptionV3 giving us a vector of shape (8, 8, 2048). \n",
-    "* We squash that to a shape of (64, 2048).\n",
-    "* This vector is then passed through the CNN Encoder(which consists of a single Fully connected layer).\n",
-    "* The RNN(here GRU) attends over the image to predict the next word."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "AAppCGLKRPGd"
-   },
-   "outputs": [],
-   "source": [
-    "def gru(units):\n",
-    "  # If you have a GPU, we recommend using the CuDNNGRU layer (it provides a \n",
-    "  # significant speedup).\n",
-    "  if tf.test.is_gpu_available():\n",
-    "    return tf.keras.layers.CuDNNGRU(units, \n",
-    "                                    return_sequences=True, \n",
-    "                                    return_state=True, \n",
-    "                                    recurrent_initializer='glorot_uniform')\n",
-    "  else:\n",
-    "    return tf.keras.layers.GRU(units, \n",
-    "                               return_sequences=True, \n",
-    "                               return_state=True, \n",
-    "                               recurrent_activation='sigmoid', \n",
-    "                               recurrent_initializer='glorot_uniform')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "ja2LFTMSdeV3"
-   },
-   "outputs": [],
-   "source": [
-    "class BahdanauAttention(tf.keras.Model):\n",
-    "  def __init__(self, units):\n",
-    "    super(BahdanauAttention, self).__init__()\n",
-    "    self.W1 = tf.keras.layers.Dense(units)\n",
-    "    self.W2 = tf.keras.layers.Dense(units)\n",
-    "    self.V = tf.keras.layers.Dense(1)\n",
-    "  \n",
-    "  def call(self, features, hidden):\n",
-    "    # features(CNN_encoder output) shape == (batch_size, 64, embedding_dim)\n",
-    "    \n",
-    "    # hidden shape == (batch_size, hidden_size)\n",
-    "    # hidden_with_time_axis shape == (batch_size, 1, hidden_size)\n",
-    "    hidden_with_time_axis = tf.expand_dims(hidden, 1)\n",
-    "    \n",
-    "    # score shape == (batch_size, 64, hidden_size)\n",
-    "    score = tf.nn.tanh(self.W1(features) + self.W2(hidden_with_time_axis))\n",
-    "    \n",
-    "    # attention_weights shape == (batch_size, 64, 1)\n",
-    "    # we get 1 at the last axis because we are applying score to self.V\n",
-    "    attention_weights = tf.nn.softmax(self.V(score), axis=1)\n",
-    "    \n",
-    "    # context_vector shape after sum == (batch_size, hidden_size)\n",
-    "    context_vector = attention_weights * features\n",
-    "    context_vector = tf.reduce_sum(context_vector, axis=1)\n",
-    "    \n",
-    "    return context_vector, attention_weights"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "AZ7R1RxHRPGf"
-   },
-   "outputs": [],
-   "source": [
-    "class CNN_Encoder(tf.keras.Model):\n",
-    "    # Since we have already extracted the features and dumped it using pickle\n",
-    "    # This encoder passes those features through a Fully connected layer\n",
-    "    def __init__(self, embedding_dim):\n",
-    "        super(CNN_Encoder, self).__init__()\n",
-    "        # shape after fc == (batch_size, 64, embedding_dim)\n",
-    "        self.fc = tf.keras.layers.Dense(embedding_dim)\n",
-    "        \n",
-    "    def call(self, x):\n",
-    "        x = self.fc(x)\n",
-    "        x = tf.nn.relu(x)\n",
-    "        return x"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "V9UbGQmERPGi"
-   },
-   "outputs": [],
-   "source": [
-    "class RNN_Decoder(tf.keras.Model):\n",
-    "  def __init__(self, embedding_dim, units, vocab_size):\n",
-    "    super(RNN_Decoder, self).__init__()\n",
-    "    self.units = units\n",
-    "\n",
-    "    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)\n",
-    "    self.gru = gru(self.units)\n",
-    "    self.fc1 = tf.keras.layers.Dense(self.units)\n",
-    "    self.fc2 = tf.keras.layers.Dense(vocab_size)\n",
-    "    \n",
-    "    self.attention = BahdanauAttention(self.units)\n",
-    "        \n",
-    "  def call(self, x, features, hidden):\n",
-    "    # defining attention as a separate model\n",
-    "    context_vector, attention_weights = self.attention(features, hidden)\n",
-    "    \n",
-    "    # x shape after passing through embedding == (batch_size, 1, embedding_dim)\n",
-    "    x = self.embedding(x)\n",
-    "    \n",
-    "    # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)\n",
-    "    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)\n",
-    "    \n",
-    "    # passing the concatenated vector to the GRU\n",
-    "    output, state = self.gru(x)\n",
-    "    \n",
-    "    # shape == (batch_size, max_length, hidden_size)\n",
-    "    x = self.fc1(output)\n",
-    "    \n",
-    "    # x shape == (batch_size * max_length, hidden_size)\n",
-    "    x = tf.reshape(x, (-1, x.shape[2]))\n",
-    "    \n",
-    "    # output shape == (batch_size * max_length, vocab)\n",
-    "    x = self.fc2(x)\n",
-    "\n",
-    "    return x, state, attention_weights\n",
-    "\n",
-    "  def reset_state(self, batch_size):\n",
-    "    return tf.zeros((batch_size, self.units))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "Qs_Sr03wRPGk"
-   },
-   "outputs": [],
-   "source": [
-    "encoder = CNN_Encoder(embedding_dim)\n",
-    "decoder = RNN_Decoder(embedding_dim, units, vocab_size)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "-bYN7xA0RPGl"
-   },
-   "outputs": [],
-   "source": [
-    "optimizer = tf.train.AdamOptimizer()\n",
-    "\n",
-    "# We are masking the loss calculated for padding\n",
-    "def loss_function(real, pred):\n",
-    "    mask = 1 - np.equal(real, 0)\n",
-    "    loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred) * mask\n",
-    "    return tf.reduce_mean(loss_)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "PHod7t72RPGn"
-   },
-   "source": [
-    "## Training\n",
-    "\n",
-    "* We extract the features stored in the respective `.npy` files and then pass those features through the encoder.\n",
-    "* The encoder output, hidden state(initialized to 0) and the decoder input (which is the start token) is passed to the decoder.\n",
-    "* The decoder returns the predictions and the decoder hidden state.\n",
-    "* The decoder hidden state is then passed back into the model and the predictions are used to calculate the loss.\n",
-    "* Use teacher forcing to decide the next input to the decoder.\n",
-    "* Teacher forcing is the technique where the target word is passed as the next input to the decoder.\n",
-    "* The final step is to calculate the gradients and apply it to the optimizer and backpropagate.\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "Vt4WZ5mhJE-E"
-   },
-   "outputs": [],
-   "source": [
-    "# adding this in a separate cell because if you run the training cell \n",
-    "# many times, the loss_plot array will be reset\n",
-    "loss_plot = []"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "UlA4VIQpRPGo"
-   },
-   "outputs": [],
-   "source": [
-    "EPOCHS = 20\n",
-    "\n",
-    "for epoch in range(EPOCHS):\n",
-    "    start = time.time()\n",
-    "    total_loss = 0\n",
-    "    \n",
-    "    for (batch, (img_tensor, target)) in enumerate(dataset):\n",
-    "        loss = 0\n",
-    "        \n",
-    "        # initializing the hidden state for each batch\n",
-    "        # because the captions are not related from image to image\n",
-    "        hidden = decoder.reset_state(batch_size=target.shape[0])\n",
-    "\n",
-    "        dec_input = tf.expand_dims([tokenizer.word_index['<start>']] * BATCH_SIZE, 1)\n",
-    "        \n",
-    "        with tf.GradientTape() as tape:\n",
-    "            features = encoder(img_tensor)\n",
-    "            \n",
-    "            for i in range(1, target.shape[1]):\n",
-    "                # passing the features through the decoder\n",
-    "                predictions, hidden, _ = decoder(dec_input, features, hidden)\n",
-    "\n",
-    "                loss += loss_function(target[:, i], predictions)\n",
-    "                \n",
-    "                # using teacher forcing\n",
-    "                dec_input = tf.expand_dims(target[:, i], 1)\n",
-    "        \n",
-    "        total_loss += (loss / int(target.shape[1]))\n",
-    "        \n",
-    "        variables = encoder.variables + decoder.variables\n",
-    "        \n",
-    "        gradients = tape.gradient(loss, variables) \n",
-    "        \n",
-    "        optimizer.apply_gradients(zip(gradients, variables), tf.train.get_or_create_global_step())\n",
-    "        \n",
-    "        if batch % 100 == 0:\n",
-    "            print ('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, \n",
-    "                                                          batch, \n",
-    "                                                          loss.numpy() / int(target.shape[1])))\n",
-    "    # storing the epoch end loss value to plot later\n",
-    "    loss_plot.append(total_loss / len(cap_vector))\n",
-    "    \n",
-    "    print ('Epoch {} Loss {:.6f}'.format(epoch + 1, \n",
-    "                                         total_loss/len(cap_vector)))\n",
-    "    print ('Time taken for 1 epoch {} sec\\n'.format(time.time() - start))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "1Wm83G-ZBPcC"
-   },
-   "outputs": [],
-   "source": [
-    "plt.plot(loss_plot)\n",
-    "plt.xlabel('Epochs')\n",
-    "plt.ylabel('Loss')\n",
-    "plt.title('Loss Plot')\n",
-    "plt.show()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "xGvOcLQKghXN"
-   },
-   "source": [
-    "## Caption!\n",
-    "\n",
-    "* The evaluate function is similar to the training loop, except we don't use teacher forcing here. The input to the decoder at each time step is its previous predictions along with the hidden state and the encoder output.\n",
-    "* Stop predicting when the model predicts the end token.\n",
-    "* And store the attention weights for every time step."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "RCWpDtyNRPGs"
-   },
-   "outputs": [],
-   "source": [
-    "def evaluate(image):\n",
-    "    attention_plot = np.zeros((max_length, attention_features_shape))\n",
-    "\n",
-    "    hidden = decoder.reset_state(batch_size=1)\n",
-    "\n",
-    "    temp_input = tf.expand_dims(load_image(image)[0], 0)\n",
-    "    img_tensor_val = image_features_extract_model(temp_input)\n",
-    "    img_tensor_val = tf.reshape(img_tensor_val, (img_tensor_val.shape[0], -1, img_tensor_val.shape[3]))\n",
-    "\n",
-    "    features = encoder(img_tensor_val)\n",
-    "\n",
-    "    dec_input = tf.expand_dims([tokenizer.word_index['<start>']], 0)\n",
-    "    result = []\n",
-    "\n",
-    "    for i in range(max_length):\n",
-    "        predictions, hidden, attention_weights = decoder(dec_input, features, hidden)\n",
-    "\n",
-    "        attention_plot[i] = tf.reshape(attention_weights, (-1, )).numpy()\n",
-    "\n",
-    "        predicted_id = tf.argmax(predictions[0]).numpy()\n",
-    "        result.append(tokenizer.index_word[predicted_id])\n",
-    "\n",
-    "        if tokenizer.index_word[predicted_id] == '<end>':\n",
-    "            return result, attention_plot\n",
-    "\n",
-    "        dec_input = tf.expand_dims([predicted_id], 0)\n",
-    "\n",
-    "    attention_plot = attention_plot[:len(result), :]\n",
-    "    return result, attention_plot"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
-    },
-    "colab_type": "code",
-    "id": "fD_y7PD6RPGt"
-   },
-   "outputs": [],
-   "source": [
-    "def plot_attention(image, result, attention_plot):\n",
-    "    temp_image = np.array(Image.open(image))\n",
-    "\n",
-    "    fig = plt.figure(figsize=(10, 10))\n",
-    "    \n",
-    "    len_result = len(result)\n",
-    "    for l in range(len_result):\n",
-    "        temp_att = np.resize(attention_plot[l], (8, 8))\n",
-    "        ax = fig.add_subplot(len_result//2, len_result//2, l+1)\n",
-    "        ax.set_title(result[l])\n",
-    "        img = ax.imshow(temp_image)\n",
-    "        ax.imshow(temp_att, cmap='gray', alpha=0.6, extent=img.get_extent())\n",
-    "\n",
-    "    plt.tight_layout()\n",
-    "    plt.show()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "K2s1A9eLRPEj"
+      },
+      "source": [
+        "##### Copyright 2018 The TensorFlow Authors.\n",
+        "\n",
+        "Licensed under the Apache License, Version 2.0 (the \"License\").\n"
+      ]
     },
-    "colab_type": "code",
-    "id": "io7ws3ReRPGv"
-   },
-   "outputs": [],
-   "source": [
-    "# captions on the validation set\n",
-    "rid = np.random.randint(0, len(img_name_val))\n",
-    "image = img_name_val[rid]\n",
-    "real_caption = ' '.join([tokenizer.index_word[i] for i in cap_val[rid] if i not in [0]])\n",
-    "result, attention_plot = evaluate(image)\n",
-    "\n",
-    "print ('Real Caption:', real_caption)\n",
-    "print ('Prediction Caption:', ' '.join(result))\n",
-    "plot_attention(image, result, attention_plot)\n",
-    "# opening the image\n",
-    "Image.open(img_name_val[rid])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "Rprk3HEvZuxb"
-   },
-   "source": [
-    "## Try it on your own images\n",
-    "For fun, below we've provided a method you can use to caption your own images with the model we've just trained. Keep in mind, it was trained on a relatively small amount of data, and your images may be different from the training data (so be prepared for weird results!)\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "autoexec": {
-      "startup": false,
-      "wait_interval": 0
-     }
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "Cffg2i257iMS"
+      },
+      "source": [
+        "# Image Captioning with Attention\n",
+        "\n",
+        "This example has moved:\n",
+        "\n",
+        "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\u003ctd\u003e\n",
+        "\u003ca target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/r2/tutorials/generative/image_captioning.ipynb\"\u003e\n",
+        "    \u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e  \n",
+        "\u003c/td\u003e\u003ctd\u003e\n",
+        "\u003ca target=\"_blank\"  href=\"https://github.com/tensorflow/docs/blob/master/site/en/r2/tutorials/generative/image_captioning.ipynb\"\u003e\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\u003c/table\u003e"
+      ]
     },
-    "colab_type": "code",
-    "id": "9Psd1quzaAWg"
-   },
-   "outputs": [],
-   "source": [
-    "image_url = 'https://tensorflow.org/images/surf.jpg'\n",
-    "image_extension = image_url[-4:]\n",
-    "image_path = tf.keras.utils.get_file('image'+image_extension, \n",
-    "                                     origin=image_url)\n",
-    "\n",
-    "result, attention_plot = evaluate(image_path)\n",
-    "print ('Prediction Caption:', ' '.join(result))\n",
-    "plot_attention(image_path, result, attention_plot)\n",
-    "# opening the image\n",
-    "Image.open(image_path)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "colab_type": "text",
-    "id": "VJZXyJco6uLO"
-   },
-   "source": [
-    "# Next steps\n",
-    "\n",
-    "Congrats! You've just trained an image captioning model with attention. Next, we recommend taking a look at this example [Neural Machine Translation with Attention]( https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb). It uses a similar architecture to translate between Spanish and English sentences. You can also experiment with training the code in this notebook on a different dataset."
-   ]
-  }
- ],
- "metadata": {
-  "accelerator": "GPU",
-  "colab": {
-   "collapsed_sections": [],
-   "default_view": {},
-   "name": "image_captioning_with_attention.ipynb",
-   "private_outputs": true,
-   "provenance": [
     {
-     "file_id": "1HI8OK2sMjcx9CTWVn0122QAHOuXaOaMg",
-     "timestamp": 1530222436922
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "QASbY_HGo4Lq"
+      },
+      "source": [
+        "![Man Surfing](https://tensorflow.org/images/surf.jpg) \n",
+        "\n",
+        "[Image Source](https://commons.wikimedia.org/wiki/Surfing#/media/File:Surfing_in_Hawaii.jpg), License: Public Domain\n",
+        "\n",
+        "![Prediction](https://tensorflow.org/images/imcap_prediction.png)\n"
+      ]
+    }
+  ],
+  "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "collapsed_sections": [],
+      "name": "image_captioning_with_attention.ipynb",
+      "private_outputs": true,
+      "provenance": [
+        {
+          "file_id": "1HI8OK2sMjcx9CTWVn0122QAHOuXaOaMg",
+          "timestamp": 1530222436922
+        }
+      ],
+      "toc_visible": true,
+      "version": "0.3.2"
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
     }
-   ],
-   "toc_visible": true,
-   "version": "0.3.2",
-   "views": {}
-  },
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
   },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.6.3"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
+  "nbformat": 4,
+  "nbformat_minor": 0
 }
diff --git a/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb b/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb
index bda9e77085e45ae31a228142135425e22a1c6780..c945c753b3ba36d16aa6985d23a5849f8f552304 100644
--- a/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb
+++ b/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb
@@ -13,633 +13,13 @@
         "\n",
         "# Text Generation using a RNN\n",
         "\n",
+        "This example has moved.\n",
+        "\n",
         "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\u003ctd\u003e\n",
-        "\u003ca target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb\"\u003e\n",
+        "\u003ca target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/tutorials/sequences/text_generation.ipynb\"\u003e\n",
         "    \u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e  \n",
         "\u003c/td\u003e\u003ctd\u003e\n",
-        "\u003ca target=\"_blank\"  href=\"https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb\"\u003e\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on Github\u003c/a\u003e\u003c/td\u003e\u003c/table\u003e"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "BwpJ5IffzRG6"
-      },
-      "source": [
-        "This notebook demonstrates how to generate text using an RNN using [tf.keras](https://www.tensorflow.org/programmers_guide/keras) and [eager execution](https://www.tensorflow.org/programmers_guide/eager). If you like, you can write a similar [model](https://github.com/fchollet/deep-learning-with-python-notebooks/blob/master/8.1-text-generation-with-lstm.ipynb) using less code. Here, we show a lower-level impementation that's useful to understand as prework before diving in to deeper examples in a similar, like [Neural Machine Translation with Attention](https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb).\n",
-        "\n",
-        "This notebook is an end-to-end example. When you run it, it will download a dataset of Shakespeare's writing. We'll use a collection of plays, borrowed from Andrej Karpathy's excellent [The Unreasonable Effectiveness of Recurrent Neural Networks](http://karpathy.github.io/2015/05/21/rnn-effectiveness/).  The notebook will train a model, and use it to generate sample output.\n",
-        "  \n",
-        "Here is the output(with start string='w') after training a single layer GRU for 30 epochs with the default settings below:\n",
-        "\n",
-        "```\n",
-        "were to the death of him\n",
-        "And nothing of the field in the view of hell,\n",
-        "When I said, banish him, I will not burn thee that would live.\n",
-        "\n",
-        "HENRY BOLINGBROKE:\n",
-        "My gracious uncle--\n",
-        "\n",
-        "DUKE OF YORK:\n",
-        "As much disgraced to the court, the gods them speak,\n",
-        "And now in peace himself excuse thee in the world.\n",
-        "\n",
-        "HORTENSIO:\n",
-        "Madam, 'tis not the cause of the counterfeit of the earth,\n",
-        "And leave me to the sun that set them on the earth\n",
-        "And leave the world and are revenged for thee.\n",
-        "\n",
-        "GLOUCESTER:\n",
-        "I would they were talking with the very name of means\n",
-        "To make a puppet of a guest, and therefore, good Grumio,\n",
-        "Nor arm'd to prison, o' the clouds, of the whole field,\n",
-        "With the admire\n",
-        "With the feeding of thy chair, and we have heard it so,\n",
-        "I thank you, sir, he is a visor friendship with your silly your bed.\n",
-        "\n",
-        "SAMPSON:\n",
-        "I do desire to live, I pray: some stand of the minds, make thee remedies\n",
-        "With the enemies of my soul.\n",
-        "\n",
-        "MENENIUS:\n",
-        "I'll keep the cause of my mistress.\n",
-        "\n",
-        "POLIXENES:\n",
-        "My brother Marcius!\n",
-        "\n",
-        "Second Servant:\n",
-        "Will't ple\n",
-        "```\n",
-        "\n",
-        "Of course, while some of the sentences are grammatical, most do not make sense. But, consider:\n",
-        "\n",
-        "* Our model is character based (when we began training, it did not yet know how to spell a valid English word, or that words were even a unit of text).\n",
-        "\n",
-        "* The structure of the output resembles a play (blocks begin with a speaker name, in all caps similar to the original text). Sentences generally end with a period. If you look at the text from a distance (or don't read the invididual words too closely, it appears as if it's an excerpt from a play).\n",
-        "\n",
-        "As a next step, you can experiment training the model on a different dataset - any large text file(ASCII) will do, and you can modify a single line of code below to make that change. Have fun!\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "R3p22DBDsaCA"
-      },
-      "source": [
-        "## Install unidecode library\n",
-        "A helpful library to convert unicode to ASCII."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "wZ6LOM12wKGH"
-      },
-      "outputs": [],
-      "source": [
-        "!pip install unidecode"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "WGyKZj3bzf9p"
-      },
-      "source": [
-        "## Import tensorflow and enable eager execution."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "yG_n40gFzf9s"
-      },
-      "outputs": [],
-      "source": [
-        "# Import TensorFlow \u003e= 1.10 and enable eager execution\n",
-        "import tensorflow as tf\n",
-        "\n",
-        "# Note: Once you enable eager execution, it cannot be disabled. \n",
-        "tf.enable_eager_execution()\n",
-        "\n",
-        "import numpy as np\n",
-        "import os\n",
-        "import re\n",
-        "import random\n",
-        "import unidecode\n",
-        "import time"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "EHDoRoc5PKWz"
-      },
-      "source": [
-        "## Download the dataset\n",
-        "\n",
-        "In this example, we will use the [shakespeare dataset](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt). You can use any other dataset that you like.\n",
-        "\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "pD_55cOxLkAb"
-      },
-      "outputs": [],
-      "source": [
-        "path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "UHjdCjDuSvX_"
-      },
-      "source": [
-        "## Read the dataset\n",
-        "\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "-E5JvY3wzf94"
-      },
-      "outputs": [],
-      "source": [
-        "text = unidecode.unidecode(open(path_to_file).read())\n",
-        "# length of text is the number of characters in it\n",
-        "print (len(text))"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "Il9ww98izf-D"
-      },
-      "source": [
-        "Creating dictionaries to map from characters to their indices and vice-versa, which will be used to vectorize the inputs"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "IalZLbvOzf-F"
-      },
-      "outputs": [],
-      "source": [
-        "# unique contains all the unique characters in the file\n",
-        "unique = sorted(set(text))\n",
-        "\n",
-        "# creating a mapping from unique characters to indices\n",
-        "char2idx = {u:i for i, u in enumerate(unique)}\n",
-        "idx2char = {i:u for i, u in enumerate(unique)}"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "1v_qUYfAzf-I"
-      },
-      "outputs": [],
-      "source": [
-        "# setting the maximum length sentence we want for a single input in characters\n",
-        "max_length = 100\n",
-        "\n",
-        "# length of the vocabulary in chars\n",
-        "vocab_size = len(unique)\n",
-        "\n",
-        "# the embedding dimension \n",
-        "embedding_dim = 256\n",
-        "\n",
-        "# number of RNN (here GRU) units\n",
-        "units = 1024\n",
-        "\n",
-        "# batch size \n",
-        "BATCH_SIZE = 64\n",
-        "\n",
-        "# buffer size to shuffle our dataset\n",
-        "BUFFER_SIZE = 10000"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "LFjSVAlWzf-N"
-      },
-      "source": [
-        "## Creating the input and output tensors\n",
-        "\n",
-        "Vectorizing the input and the target text because our model cannot understand strings only numbers.\n",
-        "\n",
-        "But first, we need to create the input and output vectors.\n",
-        "Remember the max_length we set above, we will use it here. We are creating **max_length** chunks of input, where each input vector is all the characters in that chunk except the last and the target vector is all the characters in that chunk except the first.\n",
-        "\n",
-        "For example, consider that the string = 'tensorflow' and the max_length is 9\n",
-        "\n",
-        "So, the `input = 'tensorflo'` and `output = 'ensorflow'`\n",
-        "\n",
-        "After creating the vectors, we convert each character into numbers using the **char2idx** dictionary we created above."
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "0UHJDA39zf-O"
-      },
-      "outputs": [],
-      "source": [
-        "input_text = []\n",
-        "target_text = []\n",
-        "\n",
-        "for f in range(0, len(text)-max_length, max_length):\n",
-        "    inps = text[f:f+max_length]\n",
-        "    targ = text[f+1:f+1+max_length]\n",
-        "\n",
-        "    input_text.append([char2idx[i] for i in inps])\n",
-        "    target_text.append([char2idx[t] for t in targ])\n",
-        "    \n",
-        "print (np.array(input_text).shape)\n",
-        "print (np.array(target_text).shape)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "MJdfPmdqzf-R"
-      },
-      "source": [
-        "## Creating batches and shuffling them using tf.data"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "p2pGotuNzf-S"
-      },
-      "outputs": [],
-      "source": [
-        "dataset = tf.data.Dataset.from_tensor_slices((input_text, target_text)).shuffle(BUFFER_SIZE)\n",
-        "dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "m8gPwEjRzf-Z"
-      },
-      "source": [
-        "## Creating the model\n",
-        "\n",
-        "We use the Model Subclassing API which gives us full flexibility to create the model and change it however we like. We use 3 layers to define our model.\n",
-        "\n",
-        "* Embedding layer\n",
-        "* GRU layer (you can use an LSTM layer here)\n",
-        "* Fully connected layer"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "P3KTiiInzf-a"
-      },
-      "outputs": [],
-      "source": [
-        "class Model(tf.keras.Model):\n",
-        "  def __init__(self, vocab_size, embedding_dim, units, batch_size):\n",
-        "    super(Model, self).__init__()\n",
-        "    self.units = units\n",
-        "    self.batch_sz = batch_size\n",
-        "\n",
-        "    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)\n",
-        "\n",
-        "    if tf.test.is_gpu_available():\n",
-        "      self.gru = tf.keras.layers.CuDNNGRU(self.units, \n",
-        "                                          return_sequences=True, \n",
-        "                                          return_state=True, \n",
-        "                                          recurrent_initializer='glorot_uniform')\n",
-        "    else:\n",
-        "      self.gru = tf.keras.layers.GRU(self.units, \n",
-        "                                     return_sequences=True, \n",
-        "                                     return_state=True, \n",
-        "                                     recurrent_activation='sigmoid', \n",
-        "                                     recurrent_initializer='glorot_uniform')\n",
-        "\n",
-        "    self.fc = tf.keras.layers.Dense(vocab_size)\n",
-        "        \n",
-        "  def call(self, x, hidden):\n",
-        "    x = self.embedding(x)\n",
-        "\n",
-        "    # output shape == (batch_size, max_length, hidden_size) \n",
-        "    # states shape == (batch_size, hidden_size)\n",
-        "\n",
-        "    # states variable to preserve the state of the model\n",
-        "    # this will be used to pass at every step to the model while training\n",
-        "    output, states = self.gru(x, initial_state=hidden)\n",
-        "\n",
-        "\n",
-        "    # reshaping the output so that we can pass it to the Dense layer\n",
-        "    # after reshaping the shape is (batch_size * max_length, hidden_size)\n",
-        "    output = tf.reshape(output, (-1, output.shape[2]))\n",
-        "\n",
-        "    # The dense layer will output predictions for every time_steps(max_length)\n",
-        "    # output shape after the dense layer == (max_length * batch_size, vocab_size)\n",
-        "    x = self.fc(output)\n",
-        "\n",
-        "    return x, states"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "trpqTWyvk0nr"
-      },
-      "source": [
-        "## Call the model and set the optimizer and the loss function"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "7t2XrzEOzf-e"
-      },
-      "outputs": [],
-      "source": [
-        "model = Model(vocab_size, embedding_dim, units, BATCH_SIZE)"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "dkjWIATszf-h"
-      },
-      "outputs": [],
-      "source": [
-        "optimizer = tf.train.AdamOptimizer()\n",
-        "\n",
-        "# using sparse_softmax_cross_entropy so that we don't have to create one-hot vectors\n",
-        "def loss_function(real, preds):\n",
-        "    return tf.losses.sparse_softmax_cross_entropy(labels=real, logits=preds)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "3K6s6F79P7za"
-      },
-      "source": [
-        "## Checkpoints (Object-based saving)"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "oAGisDdfP9rL"
-      },
-      "outputs": [],
-      "source": [
-        "checkpoint_dir = './training_checkpoints'\n",
-        "checkpoint_prefix = os.path.join(checkpoint_dir, \"ckpt\")\n",
-        "checkpoint = tf.train.Checkpoint(optimizer=optimizer,\n",
-        "                                 model=model)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "lPrP0XMUzf-p"
-      },
-      "source": [
-        "## Train the model\n",
-        "\n",
-        "Here we will use a custom training loop with the help of GradientTape()\n",
-        "\n",
-        "* We initialize the hidden state of the model with zeros and shape == (batch_size, number of rnn units). We do this by calling the function defined while creating the model.\n",
-        "\n",
-        "* Next, we iterate over the dataset(batch by batch) and calculate the **predictions and the hidden states** associated with that input.\n",
-        "\n",
-        "* There are a lot of interesting things happening here.\n",
-        "  * The model gets hidden state(initialized with 0), lets call that **H0** and the first batch of input, lets call that **I0**.\n",
-        "  * The model then returns the predictions **P1** and **H1**.\n",
-        "  * For the next batch of input, the model receives **I1** and **H1**.\n",
-        "  * The interesting thing here is that we pass **H1** to the model with **I1** which is how the model learns. The context learned from batch to batch is contained in the **hidden state**.\n",
-        "  * We continue doing this until the dataset is exhausted and then we start a new epoch and repeat this.\n",
-        "\n",
-        "* After calculating the predictions, we calculate the **loss** using the loss function defined above. Then we calculate the gradients of the loss with respect to the model variables(input)\n",
-        "\n",
-        "* Finally, we take a step in that direction with the help of the optimizer using the apply_gradients function.\n",
-        "\n",
-        "Note:- If you are running this notebook in Colab which has a **Tesla K80 GPU** it takes about 23 seconds per epoch.\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "d4tSNwymzf-q"
-      },
-      "outputs": [],
-      "source": [
-        "# Training step\n",
-        "\n",
-        "EPOCHS = 20\n",
-        "\n",
-        "for epoch in range(EPOCHS):\n",
-        "    start = time.time()\n",
-        "    \n",
-        "    # initializing the hidden state at the start of every epoch\n",
-        "    hidden = model.reset_states()\n",
-        "    \n",
-        "    for (batch, (inp, target)) in enumerate(dataset):\n",
-        "          with tf.GradientTape() as tape:\n",
-        "              # feeding the hidden state back into the model\n",
-        "              # This is the interesting step\n",
-        "              predictions, hidden = model(inp, hidden)\n",
-        "              \n",
-        "              # reshaping the target because that's how the \n",
-        "              # loss function expects it\n",
-        "              target = tf.reshape(target, (-1,))\n",
-        "              loss = loss_function(target, predictions)\n",
-        "              \n",
-        "          grads = tape.gradient(loss, model.variables)\n",
-        "          optimizer.apply_gradients(zip(grads, model.variables))\n",
-        "\n",
-        "          if batch % 100 == 0:\n",
-        "              print ('Epoch {} Batch {} Loss {:.4f}'.format(epoch+1,\n",
-        "                                                            batch,\n",
-        "                                                            loss))\n",
-        "    # saving (checkpoint) the model every 5 epochs\n",
-        "    if (epoch + 1) % 5 == 0:\n",
-        "      checkpoint.save(file_prefix = checkpoint_prefix)\n",
-        "\n",
-        "    print ('Epoch {} Loss {:.4f}'.format(epoch+1, loss))\n",
-        "    print('Time taken for 1 epoch {} sec\\n'.format(time.time() - start))"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "01AR9vpNQMFF"
-      },
-      "source": [
-        "## Restore the latest checkpoint"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "tyvpYomYQQkF"
-      },
-      "outputs": [],
-      "source": [
-        "# restoring the latest checkpoint in checkpoint_dir\n",
-        "checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "DjGz1tDkzf-u"
-      },
-      "source": [
-        "## Predicting using our trained model\n",
-        "\n",
-        "The below code block is used to generated the text\n",
-        "\n",
-        "* We start by choosing a start string and initializing the hidden state and setting the number of characters we want to generate.\n",
-        "\n",
-        "* We get predictions using the start_string and the hidden state\n",
-        "\n",
-        "* Then we use argmax to calculate the index of the predicted word. **We use this predicted word as our next input to the model**\n",
-        "\n",
-        "* **The hidden state returned by the model is fed back into the model so that it now has more context rather than just one word.** After we predict the next word, the modified hidden states are again fed back into the model, which is how it learns as it gets more context from the previously predicted words.\n",
-        "\n",
-        "* If you see the predictions, the model knows when to capitalize, make paragraphs and the text follows a shakespeare style of writing which is pretty awesome!"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "WvuwZBX5Ogfd"
-      },
-      "outputs": [],
-      "source": [
-        "# Evaluation step(generating text using the model learned)\n",
-        "\n",
-        "# number of characters to generate\n",
-        "num_generate = 1000\n",
-        "\n",
-        "# You can change the start string to experiment\n",
-        "start_string = 'Q'\n",
-        "# converting our start string to numbers(vectorizing!) \n",
-        "input_eval = [char2idx[s] for s in start_string]\n",
-        "input_eval = tf.expand_dims(input_eval, 0)\n",
-        "\n",
-        "# empty string to store our results\n",
-        "text_generated = ''\n",
-        "\n",
-        "# hidden state shape == (batch_size, number of rnn units); here batch size == 1\n",
-        "hidden = [tf.zeros((1, units))]\n",
-        "for i in range(num_generate):\n",
-        "    predictions, hidden = model(input_eval, hidden)\n",
-        "\n",
-        "    # using argmax to predict the word returned by the model\n",
-        "    predicted_id = tf.argmax(predictions[-1]).numpy()\n",
-        "    \n",
-        "    # We pass the predicted word as the next input to the model\n",
-        "    # along with the previous hidden state\n",
-        "    input_eval = tf.expand_dims([predicted_id], 0)\n",
-        "    \n",
-        "    text_generated += idx2char[predicted_id]\n",
-        "\n",
-        "print (start_string + text_generated)"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "AM2Uma_-yVIq"
-      },
-      "source": [
-        "## Next steps\n",
-        "\n",
-        "* Change the start string to a different character, or the start of a sentence.\n",
-        "* Experiment with training on a different, or with different parameters. [Project  Gutenberg](http://www.gutenberg.org/ebooks/100), for example, contains a large collection of books.\n",
-        "* Add another RNN layer.\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "gtEd86sX5cB2"
-      },
-      "outputs": [],
-      "source": [
-        ""
+        "\u003ca target=\"_blank\"  href=\"https://github.com/tensorflow/docs/blob/master/site/en/tutorials/sequences/text_generation.ipynb\"\u003e\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\u003c/table\u003e"
       ]
     }
   ],
diff --git a/tensorflow/contrib/eager/python/examples/l2hmc/BUILD b/tensorflow/contrib/eager/python/examples/l2hmc/BUILD
index 7bdf9053de749af9d09b12ba7b848e21c1fdb8f0..35d509904211d98f124d2555fc48166e75cb0dd9 100644
--- a/tensorflow/contrib/eager/python/examples/l2hmc/BUILD
+++ b/tensorflow/contrib/eager/python/examples/l2hmc/BUILD
@@ -28,7 +28,7 @@ py_library(
 
 cuda_py_test(
     name = "l2hmc_test",
-    size = "large",
+    size = "medium",
     srcs = ["l2hmc_test.py"],
     additional_deps = [
         ":l2hmc",
@@ -36,4 +36,8 @@ cuda_py_test(
         "//tensorflow/contrib/eager/python:tfe",
         "//third_party/py/numpy",
     ],
+    shard_count = 4,
+    tags = [
+        "oss_serial",
+    ],
 )
diff --git a/tensorflow/contrib/eager/python/examples/linear_regression/BUILD b/tensorflow/contrib/eager/python/examples/linear_regression/BUILD
index 74ce9e84f013d79b3a33ffa79993980b561e366d..30afef83bc5c6c164c8456ed472f4d6064068a25 100644
--- a/tensorflow/contrib/eager/python/examples/linear_regression/BUILD
+++ b/tensorflow/contrib/eager/python/examples/linear_regression/BUILD
@@ -9,6 +9,13 @@ py_binary(
     name = "linear_regression",
     srcs = ["linear_regression.py"],
     srcs_version = "PY2AND3",
+    deps = [":linear_regression_lib"],
+)
+
+py_library(
+    name = "linear_regression_lib",
+    srcs = ["linear_regression.py"],
+    srcs_version = "PY2AND3",
     deps = [
         "//tensorflow:tensorflow_py",
         "//tensorflow/contrib/eager/python:tfe",
@@ -20,10 +27,13 @@ cuda_py_test(
     size = "small",
     srcs = ["linear_regression_test.py"],
     additional_deps = [
-        ":linear_regression",
+        ":linear_regression_lib",
         "//tensorflow:tensorflow_py",
     ],
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
+    tags = [
+        "no_windows",  # TODO: needs investigation on Windows
+        "oss_serial",
+    ],
 )
 
 cuda_py_test(
@@ -31,7 +41,7 @@ cuda_py_test(
     size = "small",
     srcs = ["linear_regression_graph_test.py"],
     additional_deps = [
-        ":linear_regression",
+        ":linear_regression_lib",
         "//tensorflow:tensorflow_py",
     ],
 )
diff --git a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py
index 099b712fc06d1d3eb9ab4095f8db7283690bda76..206ef9409df7b1dc21de42ba919d2ba97f334a8c 100644
--- a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py
+++ b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py
@@ -56,7 +56,7 @@ class LinearModel(tf.keras.Model):
 
 
 def mean_square_loss(model, xs, ys):
-  return tf.reduce_mean(tf.square(tf.subtract(model(xs), ys)))
+  return tf.reduce_mean(tf.squared_difference(model(xs), ys))
 
 
 def fit(model, dataset, optimizer, verbose=False, logdir=None):
diff --git a/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb b/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb
index 66d52a74943d0d81fde05ce51b019558b327978d..436e887736158ec1ba8e46eac8de4ac7b8e6be01 100644
--- a/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb
+++ b/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb
@@ -1,11 +1,28 @@
 {
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "name": "nmt_with_attention.ipynb",
+      "version": "0.3.2",
+      "provenance": [],
+      "private_outputs": true,
+      "collapsed_sections": [],
+      "toc_visible": true
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "accelerator": "GPU"
+  },
   "cells": [
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "AOpGoE2T-YXS"
       },
+      "cell_type": "markdown",
       "source": [
         "##### Copyright 2018 The TensorFlow Authors.\n",
         "\n",
@@ -13,19 +30,19 @@
         "\n",
         "# Neural Machine Translation with Attention\n",
         "\n",
-        "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\u003ctd\u003e\n",
-        "\u003ca target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb\"\u003e\n",
-        "    \u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e  \n",
-        "\u003c/td\u003e\u003ctd\u003e\n",
-        "\u003ca target=\"_blank\"  href=\"https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb\"\u003e\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\u003c/td\u003e\u003c/table\u003e"
+        "<table class=\"tfo-notebook-buttons\" align=\"left\"><td>\n",
+        "<a target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb\">\n",
+        "    <img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>  \n",
+        "</td><td>\n",
+        "<a target=\"_blank\"  href=\"https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb\"><img width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a></td></table>"
       ]
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "CiwtNgENbx2g"
       },
+      "cell_type": "markdown",
       "source": [
         "This notebook trains a sequence to sequence (seq2seq) model for Spanish to English translation using [tf.keras](https://www.tensorflow.org/programmers_guide/keras) and [eager execution](https://www.tensorflow.org/programmers_guide/eager). This is an advanced example that assumes some knowledge of sequence to sequence models.\n",
         "\n",
@@ -33,24 +50,22 @@
         "\n",
         "The translation quality is reasonable for a toy example, but the generated attention plot is perhaps more interesting. This shows which parts of the input sentence has the model's attention while translating:\n",
         "\n",
-        "\u003cimg src=\"https://tensorflow.org/images/spanish-english.png\" alt=\"spanish-english attention plot\"\u003e\n",
+        "<img src=\"https://tensorflow.org/images/spanish-english.png\" alt=\"spanish-english attention plot\">\n",
         "\n",
         "Note: This example takes approximately 10 mintues to run on a single P100 GPU."
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "tnxXKDjq3jEL"
+        "id": "tnxXKDjq3jEL",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "from __future__ import absolute_import, division, print_function\n",
         "\n",
-        "# Import TensorFlow \u003e= 1.10 and enable eager execution\n",
+        "# Import TensorFlow >= 1.10 and enable eager execution\n",
         "import tensorflow as tf\n",
         "\n",
         "tf.enable_eager_execution()\n",
@@ -65,14 +80,16 @@
         "import time\n",
         "\n",
         "print(tf.__version__)"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "wfodePkj3jEa"
       },
+      "cell_type": "markdown",
       "source": [
         "## Download and prepare the dataset\n",
         "\n",
@@ -91,14 +108,12 @@
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "kRVATYOgJs1b"
+        "id": "kRVATYOgJs1b",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "# Download the file\n",
         "path_to_zip = tf.keras.utils.get_file(\n",
@@ -106,17 +121,17 @@
         "    extract=True)\n",
         "\n",
         "path_to_file = os.path.dirname(path_to_zip)+\"/spa-eng/spa.txt\""
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "rd0jw-eC3jEh"
+        "id": "rd0jw-eC3jEh",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "# Converts the unicode file to ascii\n",
         "def unicode_to_ascii(s):\n",
@@ -128,7 +143,7 @@
         "    w = unicode_to_ascii(w.lower().strip())\n",
         "    \n",
         "    # creating a space between a word and the punctuation following it\n",
-        "    # eg: \"he is a boy.\" =\u003e \"he is a boy .\" \n",
+        "    # eg: \"he is a boy.\" => \"he is a boy .\" \n",
         "    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation\n",
         "    w = re.sub(r\"([?.!,¿])\", r\" \\1 \", w)\n",
         "    w = re.sub(r'[\" \"]+', \" \", w)\n",
@@ -140,19 +155,19 @@
         "    \n",
         "    # adding a start and an end token to the sentence\n",
         "    # so that the model know when to start and stop predicting.\n",
-        "    w = '\u003cstart\u003e ' + w + ' \u003cend\u003e'\n",
+        "    w = '<start> ' + w + ' <end>'\n",
         "    return w"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "OHn4Dct23jEm"
+        "id": "OHn4Dct23jEm",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "# 1. Remove the accents\n",
         "# 2. Clean the sentences\n",
@@ -163,20 +178,20 @@
         "    word_pairs = [[preprocess_sentence(w) for w in l.split('\\t')]  for l in lines[:num_examples]]\n",
         "    \n",
         "    return word_pairs"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "9xbqO7Iie9bb"
+        "id": "9xbqO7Iie9bb",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
-        "# This class creates a word -\u003e index mapping (e.g,. \"dad\" -\u003e 5) and vice-versa \n",
-        "# (e.g., 5 -\u003e \"dad\") for each language,\n",
+        "# This class creates a word -> index mapping (e.g,. \"dad\" -> 5) and vice-versa \n",
+        "# (e.g., 5 -> \"dad\") for each language,\n",
         "class LanguageIndex():\n",
         "  def __init__(self, lang):\n",
         "    self.lang = lang\n",
@@ -192,23 +207,23 @@
         "    \n",
         "    self.vocab = sorted(self.vocab)\n",
         "    \n",
-        "    self.word2idx['\u003cpad\u003e'] = 0\n",
+        "    self.word2idx['<pad>'] = 0\n",
         "    for index, word in enumerate(self.vocab):\n",
         "      self.word2idx[word] = index + 1\n",
         "    \n",
         "    for word, index in self.word2idx.items():\n",
         "      self.idx2word[index] = word"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "eAY9k49G3jE_"
+        "id": "eAY9k49G3jE_",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "def max_length(tensor):\n",
         "    return max(len(t) for t in tensor)\n",
@@ -244,71 +259,71 @@
         "                                                                  padding='post')\n",
         "    \n",
         "    return input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_tar"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "GOi42V79Ydlr"
       },
+      "cell_type": "markdown",
       "source": [
         "### Limit the size of the dataset to experiment faster (optional)\n",
         "\n",
-        "Training on the complete dataset of \u003e100,000 sentences will take a long time. To train faster, we can limit the size of the dataset to 30,000 sentences (of course, translation quality degrades with less data):"
+        "Training on the complete dataset of >100,000 sentences will take a long time. To train faster, we can limit the size of the dataset to 30,000 sentences (of course, translation quality degrades with less data):"
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "cnxC7q-j3jFD"
+        "id": "cnxC7q-j3jFD",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "# Try experimenting with the size of that dataset\n",
         "num_examples = 30000\n",
         "input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_targ = load_dataset(path_to_file, num_examples)"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "4QILQkOs3jFG"
+        "id": "4QILQkOs3jFG",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "# Creating training and validation sets using an 80-20 split\n",
         "input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)\n",
         "\n",
         "# Show length\n",
         "len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val)"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "rgCLkfv5uO3d"
       },
+      "cell_type": "markdown",
       "source": [
         "### Create a tf.data dataset"
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "TqHsArVZ3jFS"
+        "id": "TqHsArVZ3jFS",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "BUFFER_SIZE = len(input_tensor_train)\n",
         "BATCH_SIZE = 64\n",
@@ -320,27 +335,29 @@
         "\n",
         "dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)\n",
         "dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "TNfHIF71ulLu"
       },
+      "cell_type": "markdown",
       "source": [
         "## Write the encoder and decoder model\n",
         "\n",
-        "Here, we'll implement an encoder-decoder model with attention which you can read about in the TensorFlow [Neural Machine Translation (seq2seq) tutorial](https://www.tensorflow.org/tutorials/seq2seq). This example uses a more recent set of APIs. This notebook implements the [attention equations](https://www.tensorflow.org/tutorials/seq2seq#background_on_the_attention_mechanism) from the seq2seq tutorial. The following diagram shows that each input words is assigned a weight by the attention mechanism which is then used by the decoder to predict the next word in the sentence.\n",
+        "Here, we'll implement an encoder-decoder model with attention which you can read about in the TensorFlow [Neural Machine Translation (seq2seq) tutorial](https://github.com/tensorflow/nmt). This example uses a more recent set of APIs. This notebook implements the [attention equations](https://github.com/tensorflow/nmt#background-on-the-attention-mechanism) from the seq2seq tutorial. The following diagram shows that each input word is assigned a weight by the attention mechanism which is then used by the decoder to predict the next word in the sentence.\n",
         "\n",
-        "\u003cimg src=\"https://www.tensorflow.org/images/seq2seq/attention_mechanism.jpg\" width=\"500\" alt=\"attention mechanism\"\u003e\n",
+        "<img src=\"https://www.tensorflow.org/images/seq2seq/attention_mechanism.jpg\" width=\"500\" alt=\"attention mechanism\">\n",
         "\n",
         "The input is put through an encoder model which gives us the encoder output of shape *(batch_size, max_length, hidden_size)* and the encoder hidden state of shape *(batch_size, hidden_size)*. \n",
         "\n",
         "Here are the equations that are implemented:\n",
         "\n",
-        "\u003cimg src=\"https://www.tensorflow.org/images/seq2seq/attention_equation_0.jpg\" alt=\"attention equation 0\" width=\"800\"\u003e\n",
-        "\u003cimg src=\"https://www.tensorflow.org/images/seq2seq/attention_equation_1.jpg\" alt=\"attention equation 1\" width=\"800\"\u003e\n",
+        "<img src=\"https://www.tensorflow.org/images/seq2seq/attention_equation_0.jpg\" alt=\"attention equation 0\" width=\"800\">\n",
+        "<img src=\"https://www.tensorflow.org/images/seq2seq/attention_equation_1.jpg\" alt=\"attention equation 1\" width=\"800\">\n",
         "\n",
         "We're using *Bahdanau attention*. Lets decide on notation before writing the simplified form:\n",
         "\n",
@@ -362,14 +379,12 @@
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "avyJ_4VIUoHb"
+        "id": "avyJ_4VIUoHb",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "def gru(units):\n",
         "  # If you have a GPU, we recommend using CuDNNGRU(provides a 3x speedup than GRU)\n",
@@ -385,17 +400,17 @@
         "                               return_state=True, \n",
         "                               recurrent_activation='sigmoid', \n",
         "                               recurrent_initializer='glorot_uniform')"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "nZ2rI24i3jFg"
+        "id": "nZ2rI24i3jFg",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "class Encoder(tf.keras.Model):\n",
         "    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):\n",
@@ -412,17 +427,17 @@
         "    \n",
         "    def initialize_hidden_state(self):\n",
         "        return tf.zeros((self.batch_sz, self.enc_units))"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "yJ_B3mhW3jFk"
+        "id": "yJ_B3mhW3jFk",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "class Decoder(tf.keras.Model):\n",
         "    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):\n",
@@ -476,41 +491,41 @@
         "        \n",
         "    def initialize_hidden_state(self):\n",
         "        return tf.zeros((self.batch_sz, self.dec_units))"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "P5UY8wko3jFp"
+        "id": "P5UY8wko3jFp",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)\n",
         "decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "_ch_71VbIRfK"
       },
+      "cell_type": "markdown",
       "source": [
         "## Define the optimizer and the loss function"
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "WmTHr5iV3jFr"
+        "id": "WmTHr5iV3jFr",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "optimizer = tf.train.AdamOptimizer()\n",
         "\n",
@@ -519,41 +534,43 @@
         "  mask = 1 - np.equal(real, 0)\n",
         "  loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred) * mask\n",
         "  return tf.reduce_mean(loss_)"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "DMVWzzsfNl4e"
       },
+      "cell_type": "markdown",
       "source": [
         "## Checkpoints (Object-based saving)"
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "Zj8bXQTgNwrF"
+        "id": "Zj8bXQTgNwrF",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "checkpoint_dir = './training_checkpoints'\n",
         "checkpoint_prefix = os.path.join(checkpoint_dir, \"ckpt\")\n",
         "checkpoint = tf.train.Checkpoint(optimizer=optimizer,\n",
         "                                 encoder=encoder,\n",
         "                                 decoder=decoder)"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "hpObfY22IddU"
       },
+      "cell_type": "markdown",
       "source": [
         "## Training\n",
         "\n",
@@ -567,14 +584,12 @@
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "ddefjBMa3jF0"
+        "id": "ddefjBMa3jF0",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "EPOCHS = 10\n",
         "\n",
@@ -592,7 +607,7 @@
         "            \n",
         "            dec_hidden = enc_hidden\n",
         "            \n",
-        "            dec_input = tf.expand_dims([targ_lang.word2idx['\u003cstart\u003e']] * BATCH_SIZE, 1)       \n",
+        "            dec_input = tf.expand_dims([targ_lang.word2idx['<start>']] * BATCH_SIZE, 1)       \n",
         "            \n",
         "            # Teacher forcing - feeding the target as the next input\n",
         "            for t in range(1, targ.shape[1]):\n",
@@ -625,14 +640,16 @@
         "    print('Epoch {} Loss {:.4f}'.format(epoch + 1,\n",
         "                                        total_loss / N_BATCH))\n",
         "    print('Time taken for 1 epoch {} sec\\n'.format(time.time() - start))"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "mU3Ce8M6I3rz"
       },
+      "cell_type": "markdown",
       "source": [
         "## Translate\n",
         "\n",
@@ -644,14 +661,12 @@
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "EbQpyYs13jF_"
+        "id": "EbQpyYs13jF_",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "def evaluate(sentence, encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ):\n",
         "    attention_plot = np.zeros((max_length_targ, max_length_inp))\n",
@@ -668,7 +683,7 @@
         "    enc_out, enc_hidden = encoder(inputs, hidden)\n",
         "\n",
         "    dec_hidden = enc_hidden\n",
-        "    dec_input = tf.expand_dims([targ_lang.word2idx['\u003cstart\u003e']], 0)\n",
+        "    dec_input = tf.expand_dims([targ_lang.word2idx['<start>']], 0)\n",
         "\n",
         "    for t in range(max_length_targ):\n",
         "        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)\n",
@@ -681,24 +696,24 @@
         "\n",
         "        result += targ_lang.idx2word[predicted_id] + ' '\n",
         "\n",
-        "        if targ_lang.idx2word[predicted_id] == '\u003cend\u003e':\n",
+        "        if targ_lang.idx2word[predicted_id] == '<end>':\n",
         "            return result, sentence, attention_plot\n",
         "        \n",
         "        # the predicted ID is fed back into the model\n",
         "        dec_input = tf.expand_dims([predicted_id], 0)\n",
         "\n",
         "    return result, sentence, attention_plot"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "s5hQWlbN3jGF"
+        "id": "s5hQWlbN3jGF",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "# function for plotting the attention weights\n",
         "def plot_attention(attention, sentence, predicted_sentence):\n",
@@ -712,17 +727,17 @@
         "    ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict)\n",
         "\n",
         "    plt.show()"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "sl9zUHzg3jGI"
+        "id": "sl9zUHzg3jGI",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "def translate(sentence, encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ):\n",
         "    result, sentence, attention_plot = evaluate(sentence, encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)\n",
@@ -732,91 +747,93 @@
         "    \n",
         "    attention_plot = attention_plot[:len(result.split(' ')), :len(sentence.split(' '))]\n",
         "    plot_attention(attention_plot, sentence.split(' '), result.split(' '))"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "n250XbnjOaqP"
       },
+      "cell_type": "markdown",
       "source": [
         "## Restore the latest checkpoint and test"
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "UJpT9D5_OgP6"
+        "id": "UJpT9D5_OgP6",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "# restoring the latest checkpoint in checkpoint_dir\n",
         "checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "WrAM0FDomq3E"
+        "id": "WrAM0FDomq3E",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "translate(u'hace mucho frio aqui.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "zSx2iM36EZQZ"
+        "id": "zSx2iM36EZQZ",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "translate(u'esta es mi vida.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "A3LLCx3ZE0Ls"
+        "id": "A3LLCx3ZE0Ls",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "translate(u'todavia estan en casa?', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "DUQVLVqUE1YW"
+        "id": "DUQVLVqUE1YW",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "# wrong translation\n",
         "translate(u'trata de averiguarlo.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "RTe5P5ioMJwN"
       },
+      "cell_type": "markdown",
       "source": [
         "## Next steps\n",
         "\n",
@@ -824,31 +841,5 @@
         "* Experiment with training on a larger dataset, or using more epochs\n"
       ]
     }
-  ],
-  "metadata": {
-    "accelerator": "GPU",
-    "colab": {
-      "collapsed_sections": [],
-      "name": "nmt_with_attention.ipynb",
-      "private_outputs": true,
-      "provenance": [
-        {
-          "file_id": "1C4fpM7_7IL8ZzF7Gc5abywqQjeQNS2-U",
-          "timestamp": 1527858391290
-        },
-        {
-          "file_id": "1pExo6aUuw0S6MISFWoinfJv0Ftm9V4qv",
-          "timestamp": 1527776041613
-        }
-      ],
-      "toc_visible": true,
-      "version": "0.3.2"
-    },
-    "kernelspec": {
-      "display_name": "Python 3",
-      "name": "python3"
-    }
-  },
-  "nbformat": 4,
-  "nbformat_minor": 0
-}
+  ]
+}
\ No newline at end of file
diff --git a/tensorflow/contrib/eager/python/examples/resnet50/BUILD b/tensorflow/contrib/eager/python/examples/resnet50/BUILD
index f3135a9668fc0dc7faa93a5f119b53f3efd34c6e..f2851d97223e483da11120f1fe3f0a2f641dfb81 100644
--- a/tensorflow/contrib/eager/python/examples/resnet50/BUILD
+++ b/tensorflow/contrib/eager/python/examples/resnet50/BUILD
@@ -27,7 +27,7 @@ py_library(
 
 cuda_py_test(
     name = "resnet50_test",
-    size = "large",
+    size = "medium",
     srcs = ["resnet50_test.py"],
     additional_deps = [
         ":resnet50",
@@ -35,17 +35,19 @@ cuda_py_test(
         "//tensorflow/contrib/eager/python:tfe",
         "//tensorflow:tensorflow_py",
     ],
+    shard_count = 4,
     tags = [
         "noasan",  # Fix b/118130911
         "nomsan",  # Fix b/118130911
         "notsan",  # Fix b/118130911
         "optonly",
+        "oss_serial",
     ],
 )
 
 cuda_py_test(
     name = "resnet50_graph_test",
-    size = "large",
+    size = "medium",
     srcs = ["resnet50_graph_test.py"],
     additional_deps = [
         ":resnet50",
@@ -53,10 +55,12 @@ cuda_py_test(
         "//third_party/py/numpy",
         "//tensorflow:tensorflow_py",
     ],
+    shard_count = 4,
     tags = [
         "noasan",
         "nomsan",
         "notsan",
         "optonly",
+        "oss_serial",
     ],
 )
diff --git a/tensorflow/contrib/eager/python/examples/revnet/BUILD b/tensorflow/contrib/eager/python/examples/revnet/BUILD
index 4f0d46b1bae3760a63b2abe871034bdedf258f07..cb207b8ddf3641a68a114386f6a95a26ce2b74d6 100644
--- a/tensorflow/contrib/eager/python/examples/revnet/BUILD
+++ b/tensorflow/contrib/eager/python/examples/revnet/BUILD
@@ -67,30 +67,36 @@ py_library(
 # Tests
 cuda_py_test(
     name = "ops_test",
-    size = "large",
+    size = "medium",
     srcs = ["ops_test.py"],
     additional_deps = [
         ":ops",
         "//tensorflow:tensorflow_py",
     ],
+    shard_count = 4,
+    tags = [
+        "oss_serial",
+    ],
 )
 
 cuda_py_test(
     name = "blocks_test",
-    size = "large",
+    size = "medium",
     srcs = ["blocks_test.py"],
     additional_deps = [
         ":blocks",
         "//tensorflow:tensorflow_py",
     ],
+    shard_count = 4,
     tags = [
+        "no_oss",  # b/123045964
         "optonly",
     ],
 )
 
 cuda_py_test(
     name = "revnet_test",
-    size = "large",
+    size = "medium",
     srcs = ["revnet_test.py"],
     additional_deps = [
         ":blocks_test",
@@ -98,9 +104,11 @@ cuda_py_test(
         ":revnet",
         "//tensorflow:tensorflow_py",
     ],
+    shard_count = 4,
     tags = [
         "no_pip",  # depends on blocks_test, which is not available in pip package
         "optonly",
+        "oss_serial",
     ],
 )
 
@@ -127,6 +135,13 @@ py_binary(
     name = "main",
     srcs = ["main.py"],
     srcs_version = "PY2AND3",
+    deps = [":main_lib"],
+)
+
+py_library(
+    name = "main_lib",
+    srcs = ["main.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":cifar_input",
         ":config",
@@ -141,7 +156,7 @@ py_binary(
     srcs_version = "PY2AND3",
     deps = [
         ":cifar_input",
-        ":main",
+        ":main_lib",
         ":revnet",
         "//tensorflow:tensorflow_py",
     ],
@@ -153,7 +168,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":cifar_input",
-        ":main",
+        ":main_lib",
         ":revnet",
         "//tensorflow:tensorflow_py",
     ],
@@ -165,7 +180,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":cifar_input",
-        ":main",
+        ":main_lib",
         ":revnet",
         "//tensorflow:tensorflow_py",
     ],
diff --git a/tensorflow/contrib/eager/python/examples/revnet/revnet.py b/tensorflow/contrib/eager/python/examples/revnet/revnet.py
index 1f2cb14972f0b92d29489adff8f94e790e1ec4ed..7406787ba438345dc485c50e347e40597b2037f5 100644
--- a/tensorflow/contrib/eager/python/examples/revnet/revnet.py
+++ b/tensorflow/contrib/eager/python/examples/revnet/revnet.py
@@ -96,6 +96,7 @@ class RevNet(tf.keras.Model):
   def call(self, inputs, training=True):
     """Forward pass."""
 
+    saved_hidden = None
     if training:
       saved_hidden = [inputs]
 
diff --git a/tensorflow/contrib/eager/python/examples/rnn_colorbot/BUILD b/tensorflow/contrib/eager/python/examples/rnn_colorbot/BUILD
index d500b632ebb97fd12ded3a215b0f1a686194874f..f4dbe7ac16f734f7bee045bc71e9559b630adf81 100644
--- a/tensorflow/contrib/eager/python/examples/rnn_colorbot/BUILD
+++ b/tensorflow/contrib/eager/python/examples/rnn_colorbot/BUILD
@@ -9,6 +9,13 @@ py_binary(
     name = "rnn_colorbot",
     srcs = ["rnn_colorbot.py"],
     srcs_version = "PY2AND3",
+    deps = [":rnn_colorbot_lib"],
+)
+
+py_library(
+    name = "rnn_colorbot_lib",
+    srcs = ["rnn_colorbot.py"],
+    srcs_version = "PY2AND3",
     deps = [
         "//tensorflow:tensorflow_py",
         "//tensorflow/contrib/eager/python:tfe",
@@ -21,8 +28,11 @@ cuda_py_test(
     name = "rnn_colorbot_test",
     srcs = ["rnn_colorbot_test.py"],
     additional_deps = [
-        ":rnn_colorbot",
+        ":rnn_colorbot_lib",
         "//tensorflow/contrib/eager/python:tfe",
         "//tensorflow:tensorflow_py",
     ],
+    tags = [
+        "oss_serial",
+    ],
 )
diff --git a/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py b/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py
index 74ebb1ec77131a560b1ebfd062c690920c35e261..1c718a5ce3d8e1541656d92fd5e8dad6c6683c4c 100644
--- a/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py
+++ b/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py
@@ -207,7 +207,7 @@ class RNNColorbot(tf.keras.Model):
 
 def loss(labels, predictions):
   """Computes mean squared loss."""
-  return tf.reduce_mean(tf.square(predictions - labels))
+  return tf.reduce_mean(tf.squared_difference(predictions, labels))
 
 
 def test(model, eval_data):
diff --git a/tensorflow/contrib/eager/python/examples/rnn_ptb/BUILD b/tensorflow/contrib/eager/python/examples/rnn_ptb/BUILD
index 2cc2fcbfeb21ee6218d7912d9a93ea2f7b2ea226..43a6ca526d3a0aecda2c8df865a0487ac28758ab 100644
--- a/tensorflow/contrib/eager/python/examples/rnn_ptb/BUILD
+++ b/tensorflow/contrib/eager/python/examples/rnn_ptb/BUILD
@@ -9,6 +9,13 @@ py_binary(
     name = "rnn_ptb",
     srcs = ["rnn_ptb.py"],
     srcs_version = "PY2AND3",
+    deps = [":rnn_ptb_lib"],
+)
+
+py_library(
+    name = "rnn_ptb_lib",
+    srcs = ["rnn_ptb.py"],
+    srcs_version = "PY2AND3",
     deps = [
         "//tensorflow:tensorflow_py",
         "//tensorflow/contrib/cudnn_rnn:cudnn_rnn_py",
@@ -21,18 +28,22 @@ cuda_py_test(
     name = "rnn_ptb_test",
     srcs = ["rnn_ptb_test.py"],
     additional_deps = [
-        ":rnn_ptb",
+        ":rnn_ptb_lib",
         "//tensorflow/contrib/eager/python:tfe",
         "//tensorflow:tensorflow_py",
     ],
+    tags = ["no_oss"],  # b/123045964
 )
 
 cuda_py_test(
     name = "rnn_ptb_graph_test",
     srcs = ["rnn_ptb_graph_test.py"],
     additional_deps = [
-        ":rnn_ptb",
+        ":rnn_ptb_lib",
         "//third_party/py/numpy",
         "//tensorflow:tensorflow_py",
     ],
+    tags = [
+        "oss_serial",
+    ],
 )
diff --git a/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py b/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py
index 15776c694e92825895437a4c1547699f6d9269fb..9b5a2c947b153308c83f1a922d06c034ec5f9ddf 100644
--- a/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py
+++ b/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py
@@ -128,7 +128,7 @@ class PTBModel(tf.keras.Model):
 
     self.linear = layers.Dense(
         vocab_size, kernel_initializer=tf.random_uniform_initializer(-0.1, 0.1))
-    self._output_shape = [-1, embedding_dim]
+    self._output_shape = [-1, hidden_dim]
 
   def call(self, input_seq, training):
     """Run the forward pass of PTBModel.
diff --git a/tensorflow/contrib/eager/python/examples/spinn/BUILD b/tensorflow/contrib/eager/python/examples/spinn/BUILD
index 5966f1d4873e8e77b3ad5914da7bfc7e69d4e341..9b0fbaa6793e28d327745767e6ccd3085211ff7d 100644
--- a/tensorflow/contrib/eager/python/examples/spinn/BUILD
+++ b/tensorflow/contrib/eager/python/examples/spinn/BUILD
@@ -42,5 +42,6 @@ cuda_py_test(
         "no-internal-py3",  # flaky
         "no_cuda_on_cpu_tap",
         "no_pip",  # because spinn.py is under third_party/.
+        "oss_serial",
     ],
 )
diff --git a/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py b/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
index d18a097063c7d25947af3e2e2959ce574edd553f..3143270ccfe4f670428c80bdc1e09fa452584207 100644
--- a/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
+++ b/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
@@ -37,7 +37,7 @@ from tensorflow.contrib.summary import summary_test_util
 from tensorflow.python.eager import test
 from tensorflow.python.framework import test_util
 from tensorflow.python.training import checkpoint_management
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import util as trackable_utils
 # pylint: enable=g-bad-import-order
 
 
@@ -421,7 +421,7 @@ class SpinnTest(test_util.TensorFlowTestCase):
 
     # 5. Verify that checkpoints exist and contains all the expected variables.
     self.assertTrue(glob.glob(os.path.join(config.logdir, "ckpt*")))
-    object_graph = checkpointable_utils.object_metadata(
+    object_graph = trackable_utils.object_metadata(
         checkpoint_management.latest_checkpoint(config.logdir))
     ckpt_variable_names = set()
     for node in object_graph.nodes:
diff --git a/tensorflow/contrib/eager/python/metrics_impl.py b/tensorflow/contrib/eager/python/metrics_impl.py
index 566246de4957c1dc5919c10e22146706f9e50be8..b32501c2e804838af9d4c77663be131b77bd30b4 100644
--- a/tensorflow/contrib/eager/python/metrics_impl.py
+++ b/tensorflow/contrib/eager/python/metrics_impl.py
@@ -32,12 +32,12 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import summary_ops_v2 as summary_ops
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
 
 _to_replace = re.compile("[^A-Za-z0-9.]")
 
 
-class Metric(checkpointable.CheckpointableBase):
+class Metric(trackable.Trackable):
   """A metric holds state for aggregating statistics over an evaluation run.
 
   Example use with eager execution:
@@ -269,7 +269,7 @@ class Metric(checkpointable.CheckpointableBase):
       else:
         collections = [ops.GraphKeys.LOCAL_VARIABLES]
       collections += [ops.GraphKeys.METRIC_VARIABLES]
-    # Variables are Checkpointable dependencies of Metrics regardless of the
+    # Variables are Trackable dependencies of Metrics regardless of the
     # global/local distinction. Users can avoid saving variables by not adding a
     # dependency on the Metric.
     v = self._add_variable_with_custom_getter(
@@ -282,7 +282,7 @@ class Metric(checkpointable.CheckpointableBase):
         use_resource=True,
         getter=variable_scope.get_variable,
         # Raise duplicate variable exceptions from get_variable rather than
-        # Checkpointable.
+        # Trackable.
         overwrite=True)
     self._vars.append(v)
     if context.executing_eagerly():
diff --git a/tensorflow/contrib/eager/python/metrics_test.py b/tensorflow/contrib/eager/python/metrics_test.py
index 39e5957f5d1760613f2c33607c0bdb163040efb4..c56d1956fde35b562e60496015e666efe9ebc8f6 100644
--- a/tensorflow/contrib/eager/python/metrics_test.py
+++ b/tensorflow/contrib/eager/python/metrics_test.py
@@ -35,7 +35,7 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import summary_ops_v2 as summary_ops
 from tensorflow.python.training import training_util
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import util as trackable_utils
 
 
 class MetricsTest(test.TestCase):
@@ -314,7 +314,7 @@ class MetricsTest(test.TestCase):
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
     mean = metrics.Mean()
-    checkpoint = checkpointable_utils.Checkpoint(mean=mean)
+    checkpoint = trackable_utils.Checkpoint(mean=mean)
     mean.build()
     mean._built = True
     self.evaluate(mean.init_variables())
@@ -327,7 +327,7 @@ class MetricsTest(test.TestCase):
     self.assertAllEqual(200., self.evaluate(mean.value()))
 
     restore_mean = metrics.Mean()
-    restore_checkpoint = checkpointable_utils.Checkpoint(mean=restore_mean)
+    restore_checkpoint = trackable_utils.Checkpoint(mean=restore_mean)
     status = restore_checkpoint.restore(save_path)
     restore_update = restore_mean(300.)
     status.assert_consumed().run_restore_ops()
diff --git a/tensorflow/contrib/eager/python/network_test.py b/tensorflow/contrib/eager/python/network_test.py
index 240f213c602395b8589d39c3ecd90b602ffa9848..b3e8daddaf2369e9e33179fde2aab1469e97ea47 100644
--- a/tensorflow/contrib/eager/python/network_test.py
+++ b/tensorflow/contrib/eager/python/network_test.py
@@ -31,7 +31,7 @@ from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.training import training_util
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import util as trackable_utils
 
 
 # pylint: disable=not-callable
@@ -65,7 +65,7 @@ class NetworkTest(test.TestCase):
 
   def test_checkpointing_not_implemented(self):
     checkpoint_directory = self.get_temp_dir()
-    checkpoint = checkpointable_utils.Checkpoint(net=MyNetwork())
+    checkpoint = trackable_utils.Checkpoint(net=MyNetwork())
     with self.assertRaises(NotImplementedError):
       checkpoint.save(checkpoint_directory)
 
diff --git a/tensorflow/contrib/eager/python/parameter_server.py b/tensorflow/contrib/eager/python/parameter_server.py
index 7803a6799bb64441fab881bf6ca986d5cf3851a8..258f0a19309235dcd99b31b4de3d35ef8d89b15b 100644
--- a/tensorflow/contrib/eager/python/parameter_server.py
+++ b/tensorflow/contrib/eager/python/parameter_server.py
@@ -30,7 +30,7 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
 
 
 def _eager_safe_variable_handle(shape, dtype, shared_name, name, graph_mode):
@@ -129,8 +129,8 @@ class SharedVariable(resource_variable_ops.ResourceVariable):
     if constraint is not None and not callable(constraint):
       raise ValueError("The `constraint` argument must be a callable.")
 
-    if isinstance(initial_value, checkpointable.CheckpointInitialValue):
-      self._maybe_initialize_checkpointable()
+    if isinstance(initial_value, trackable.CheckpointInitialValue):
+      self._maybe_initialize_trackable()
       self._update_uid = initial_value.checkpoint_position.restore_uid
       initial_value = initial_value.wrapped_value
 
diff --git a/tensorflow/contrib/eager/python/remote_test.py b/tensorflow/contrib/eager/python/remote_test.py
index 3926de15e71c9917f88fc3f58740b8c75354ab26..f540d9b37b69c7be3b0662b07bd6e9cb8220fadc 100644
--- a/tensorflow/contrib/eager/python/remote_test.py
+++ b/tensorflow/contrib/eager/python/remote_test.py
@@ -24,12 +24,12 @@ import os
 import numpy as np
 
 from tensorflow.contrib.eager.python import parameter_server
-from tensorflow.contrib.eager.python import remote
 from tensorflow.core.protobuf import cluster_pb2
 from tensorflow.core.protobuf import tensorflow_server_pb2
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function
+from tensorflow.python.eager import remote
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
diff --git a/tensorflow/contrib/eager/python/tfe.py b/tensorflow/contrib/eager/python/tfe.py
index 33c988fd9065e7fbe7b9aeb85cad82eb3c119f76..df5b059448f735f7dc1f2963ffbc9c8a8287250a 100644
--- a/tensorflow/contrib/eager/python/tfe.py
+++ b/tensorflow/contrib/eager/python/tfe.py
@@ -41,6 +41,8 @@ To use, at program startup, call `tf.enable_eager_execution()`.
 
 @@add_execution_callback
 @@clear_execution_callbacks
+@@errstate
+@@ExecutionCallback
 @@inf_callback
 @@inf_nan_callback
 @@nan_callback
@@ -60,7 +62,6 @@ To use, at program startup, call `tf.enable_eager_execution()`.
 
 @@Checkpoint
 @@Checkpointable
-@@CheckpointableSaver
 
 @@executing_eagerly
 @@in_eager_mode
@@ -97,7 +98,6 @@ from tensorflow.contrib.eager.python.network import Network
 from tensorflow.contrib.eager.python.network import Sequential
 from tensorflow.contrib.eager.python.network import save_network_checkpoint
 from tensorflow.contrib.eager.python.network import restore_network_checkpoint
-from tensorflow.contrib.eager.python.remote import connect_to_remote_host
 from tensorflow.contrib.eager.python.saver import get_optimizer_variables
 from tensorflow.contrib.eager.python.saver import restore_variables_on_create
 from tensorflow.contrib.eager.python.saver import Saver
@@ -119,10 +119,13 @@ from tensorflow.python.eager.context import set_server_def
 from tensorflow.python.eager.def_function import function
 from tensorflow.python.eager.execution_callbacks import add_execution_callback
 from tensorflow.python.eager.execution_callbacks import clear_execution_callbacks
+from tensorflow.python.eager.execution_callbacks import errstate
+from tensorflow.python.eager.execution_callbacks import ExecutionCallback
 from tensorflow.python.eager.execution_callbacks import inf_callback
 from tensorflow.python.eager.execution_callbacks import inf_nan_callback
 from tensorflow.python.eager.execution_callbacks import nan_callback
 from tensorflow.python.eager.execution_callbacks import seterr
+from tensorflow.python.eager.remote import connect_to_remote_host
 from tensorflow.python.framework.tensor_spec import TensorSpec
 from tensorflow.python.framework.ops import enable_eager_execution
 from tensorflow.python.framework.ops import enable_eager_execution_internal as enable_remote_eager_execution
@@ -134,9 +137,8 @@ from tensorflow.python.ops.resource_variable_ops import ResourceVariable as Vari
 from tensorflow.python.ops.variable_scope import EagerVariableStore
 from tensorflow.python.ops import script_ops
 from tensorflow.python.ops import template
-from tensorflow.python.training.checkpointable.tracking import Checkpointable
-from tensorflow.python.training.checkpointable.util import CheckpointableSaver
-from tensorflow.python.training.checkpointable.util import Checkpoint
+from tensorflow.python.training.tracking.tracking import AutoTrackable as Checkpointable
+from tensorflow.python.training.tracking.util import Checkpoint
 from tensorflow.python.util.all_util import remove_undocumented
 
 py_func = script_ops.eager_py_func
diff --git a/tensorflow/contrib/eager/python/tfe_test.py b/tensorflow/contrib/eager/python/tfe_test.py
index 8c35dddb5a515aa09cc70c173a9f0605e8567e82..6881fabdc09e3275c29f3013283999c96e283770 100644
--- a/tensorflow/contrib/eager/python/tfe_test.py
+++ b/tensorflow/contrib/eager/python/tfe_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 import tempfile
 
 from tensorflow.contrib.eager.python import tfe
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
@@ -40,6 +41,9 @@ class TFETest(test_util.TensorFlowTestCase):
     self.assertAllEqual([[4.]], y.numpy())
 
   def testInstantError(self):
+    if context.num_gpus():
+      # TODO(nareshmodi): make this test better
+      self.skipTest("Gather doesn't do index checking on GPUs")
     with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                  r'indices = 7 is not in \[0, 3\)'):
       array_ops.gather([0, 1, 2], 7)
diff --git a/tensorflow/contrib/factorization/BUILD b/tensorflow/contrib/factorization/BUILD
index e344d7a23b55134612aab430b50cf065bd1095e4..da2479a0b7b029561136903c82cabed9aae622b8 100644
--- a/tensorflow/contrib/factorization/BUILD
+++ b/tensorflow/contrib/factorization/BUILD
@@ -28,7 +28,6 @@ tf_custom_op_py_library(
         "python/ops/wals.py",
     ],
     dso = [
-        ":python/ops/_clustering_ops.so",
         ":python/ops/_factorization_ops.so",
     ],
     kernels = [
@@ -38,12 +37,12 @@ tf_custom_op_py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":factorization_ops_test_utils_py",
-        ":gen_clustering_ops",
         ":gen_factorization_ops",
         "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/contrib/util:util_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
+        "//tensorflow/python:clustering_ops_gen",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:data_flow_ops",
         "//tensorflow/python:embedding_ops",
@@ -77,17 +76,6 @@ py_library(
     ],
 )
 
-# Ops
-tf_custom_op_library(
-    name = "python/ops/_clustering_ops.so",
-    srcs = [
-        "ops/clustering_ops.cc",
-    ],
-    deps = [
-        "//tensorflow/contrib/factorization/kernels:clustering_ops",
-    ],
-)
-
 tf_custom_op_library(
     name = "python/ops/_factorization_ops.so",
     srcs = [
@@ -100,26 +88,16 @@ tf_custom_op_library(
 )
 
 tf_gen_op_libs([
-    "clustering_ops",
     "factorization_ops",
 ])
 
 cc_library(
     name = "all_ops",
     deps = [
-        ":clustering_ops_op_lib",
         ":factorization_ops_op_lib",
     ],
 )
 
-tf_gen_op_wrapper_py(
-    name = "gen_clustering_ops",
-    out = "python/ops/gen_clustering_ops.py",
-    deps = [
-        ":clustering_ops_op_lib",
-    ],
-)
-
 tf_gen_op_wrapper_py(
     name = "gen_factorization_ops",
     out = "python/ops/gen_factorization_ops.py",
@@ -131,7 +109,7 @@ tf_gen_op_wrapper_py(
 # Ops tests
 tf_py_test(
     name = "gmm_test",
-    size = "large",
+    size = "medium",
     srcs = [
         "python/ops/gmm_test.py",
     ],
@@ -152,6 +130,7 @@ tf_py_test(
         "//tensorflow/python:random_seed",
         "//tensorflow/python:training",
     ],
+    shard_count = 4,
     tags = [
         "no_pip",  # b/38283730
         "notsan",  # Flaky: b/30756419
@@ -224,10 +203,7 @@ py_test(
     srcs = ["python/ops/kmeans_test.py"],
     shard_count = 4,
     srcs_version = "PY2AND3",
-    tags = [
-        "nomac",  # b/73741358
-        "notsan",  # b/67512932
-    ],
+    tags = ["notsan"],
     deps = [
         ":factorization_py",
         ":factorization_py_CYCLIC_DEPENDENCIES_THAT_NEED_TO_GO",
@@ -249,7 +225,7 @@ py_test(
 
 tf_py_test(
     name = "wals_test",
-    size = "large",
+    size = "medium",
     srcs = ["python/ops/wals_test.py"],
     additional_deps = [
         ":factorization_py",
@@ -272,8 +248,8 @@ tf_py_test(
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
     ],
+    shard_count = 4,
     tags = [
-        "manual",
         "noasan",  # times out b/63678675
         "nomsan",
     ],
diff --git a/tensorflow/contrib/factorization/kernels/BUILD b/tensorflow/contrib/factorization/kernels/BUILD
index ea8b9a17a27093cb57564861815edd6ecb18a014..23d7e088d067effa446e4bcdc9609db612066568 100644
--- a/tensorflow/contrib/factorization/kernels/BUILD
+++ b/tensorflow/contrib/factorization/kernels/BUILD
@@ -11,7 +11,6 @@ load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 cc_library(
     name = "all_kernels",
     deps = [
-        ":clustering_ops",
         ":masked_matmul_ops",
         ":wals_solver_ops",
         "@protobuf_archive//:protobuf_headers",
@@ -29,17 +28,6 @@ cc_library(
     alwayslink = 1,
 )
 
-cc_library(
-    name = "clustering_ops",
-    srcs = ["clustering_ops.cc"],
-    deps = [
-        "//tensorflow/core:framework_headers_lib",
-        "//third_party/eigen3",
-        "@protobuf_archive//:protobuf_headers",
-    ],
-    alwayslink = 1,
-)
-
 cc_library(
     name = "masked_matmul_ops",
     srcs = ["masked_matmul_ops.cc"],
@@ -51,19 +39,3 @@ cc_library(
     ],
     alwayslink = 1,
 )
-
-tf_cc_test(
-    name = "clustering_ops_test",
-    srcs = ["clustering_ops_test.cc"],
-    deps = [
-        ":clustering_ops",
-        "//tensorflow/contrib/factorization:clustering_ops_op_lib",
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-        "//tensorflow/core:testlib",
-    ],
-)
diff --git a/tensorflow/contrib/factorization/kernels/masked_matmul_ops.cc b/tensorflow/contrib/factorization/kernels/masked_matmul_ops.cc
index a8c5d0763c28ba2b54f217405f0da65533f26b91..68078ba8bbb07b4344c19d554012d214229f9c4f 100644
--- a/tensorflow/contrib/factorization/kernels/masked_matmul_ops.cc
+++ b/tensorflow/contrib/factorization/kernels/masked_matmul_ops.cc
@@ -19,12 +19,12 @@
 #include <numeric>
 #include <vector>
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 
diff --git a/tensorflow/contrib/factorization/ops/clustering_ops.cc b/tensorflow/contrib/factorization/ops/clustering_ops.cc
deleted file mode 100644
index 2686702c1d5768f661dac610c96089eb02e360d7..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/factorization/ops/clustering_ops.cc
+++ /dev/null
@@ -1,91 +0,0 @@
-// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License"); you may not
-// use this file except in compliance with the License.  You may obtain a copy
-// of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
-// License for the specific language governing permissions and limitations under
-// the License.
-// ==============================================================================
-
-#include "tensorflow/core/framework/common_shape_fns.h"
-#include "tensorflow/core/framework/op.h"
-
-namespace tensorflow {
-
-REGISTER_OP("KmeansPlusPlusInitialization")
-    .Input("points: float32")
-    .Input("num_to_sample: int64")
-    .Input("seed: int64")
-    .Input("num_retries_per_sample: int64")
-    .Output("samples: float32")
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"(
-Selects num_to_sample rows of input using the KMeans++ criterion.
-
-Rows of points are assumed to be input points. One row is selected at random.
-Subsequent rows are sampled with probability proportional to the squared L2
-distance from the nearest row selected thus far till num_to_sample rows have
-been sampled.
-
-points: Matrix of shape (n, d). Rows are assumed to be input points.
-num_to_sample: Scalar. The number of rows to sample. This value must not be
-  larger than n.
-seed: Scalar. Seed for initializing the random number generator.
-num_retries_per_sample: Scalar. For each row that is sampled, this parameter
-  specifies the number of additional points to draw from the current
-  distribution before selecting the best. If a negative value is specified, a
-  heuristic is used to sample O(log(num_to_sample)) additional points.
-samples: Matrix of shape (num_to_sample, d). The sampled rows.
-)");
-
-REGISTER_OP("KMC2ChainInitialization")
-    .Input("distances: float32")
-    .Input("seed: int64")
-    .Output("index: int64")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"(
-Returns the index of a data point that should be added to the seed set.
-
-Entries in distances are assumed to be squared distances of candidate points to
-the already sampled centers in the seed set. The op constructs one Markov chain
-of the k-MC^2 algorithm and returns the index of one candidate point to be added
-as an additional cluster center.
-
-distances: Vector with squared distances to the closest previously sampled
-  cluster center for each candidate point.
-seed: Scalar. Seed for initializing the random number generator.
-index: Scalar with the index of the sampled point.
-)");
-
-REGISTER_OP("NearestNeighbors")
-    .Input("points: float32")
-    .Input("centers: float32")
-    .Input("k: int64")
-    .Output("nearest_center_indices: int64")
-    .Output("nearest_center_distances: float32")
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"(
-Selects the k nearest centers for each point.
-
-Rows of points are assumed to be input points. Rows of centers are assumed to be
-the list of candidate centers. For each point, the k centers that have least L2
-distance to it are computed.
-
-points: Matrix of shape (n, d). Rows are assumed to be input points.
-centers: Matrix of shape (m, d). Rows are assumed to be centers.
-k: Scalar. Number of nearest centers to return for each point. If k is larger
-  than m, then only m centers are returned.
-nearest_center_indices: Matrix of shape (n, min(m, k)). Each row contains the
-  indices of the centers closest to the corresponding point, ordered by
-  increasing distance.
-nearest_center_distances: Matrix of shape (n, min(m, k)). Each row contains the
-  squared L2 distance to the corresponding center in nearest_center_indices.
-)");
-
-}  // namespace tensorflow
diff --git a/tensorflow/contrib/factorization/python/ops/clustering_ops.py b/tensorflow/contrib/factorization/python/ops/clustering_ops.py
index 84e80791f4991ad2b67d0a00ee1e00cf0d0daadc..d48b89cbacce34781819010addbcbd0ba66f9873 100644
--- a/tensorflow/contrib/factorization/python/ops/clustering_ops.py
+++ b/tensorflow/contrib/factorization/python/ops/clustering_ops.py
@@ -18,28 +18,23 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.factorization.python.ops import gen_clustering_ops
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.factorization.python.ops.gen_clustering_ops import *
-# pylint: enable=wildcard-import
-from tensorflow.contrib.util import loader
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_clustering_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_impl
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops.embedding_ops import embedding_lookup
-from tensorflow.python.platform import resource_loader
-
-_clustering_ops = loader.load_op_library(
-    resource_loader.get_path_to_datafile('_clustering_ops.so'))
+# go/tf-wildcard-import
+# pylint: disable=wildcard-import
+from tensorflow.python.ops.gen_clustering_ops import *
+# pylint: enable=wildcard-import
 
 # Euclidean distance between vectors U and V is defined as \\(||U - V||_F\\)
 # which is the square root of the sum of the absolute squares of the elements
diff --git a/tensorflow/contrib/factorization/python/ops/gmm_ops.py b/tensorflow/contrib/factorization/python/ops/gmm_ops.py
index d365ad111760247fc18b730657390f07ba6b865e..9f0664dfe5ba7a098b6976388d1cf737dafb4842 100644
--- a/tensorflow/contrib/factorization/python/ops/gmm_ops.py
+++ b/tensorflow/contrib/factorization/python/ops/gmm_ops.py
@@ -314,8 +314,7 @@ class GmmAlgorithm(object):
     # reparametrization of variance parameters.
     det_expanded = math_ops.reduce_sum(
         math_ops.log(self._covs + 1e-3), 1, keepdims=True)
-    diff = shard - self._means
-    x2 = math_ops.square(diff)
+    x2 = math_ops.squared_difference(shard, self._means)
     cov_expanded = array_ops.expand_dims(1.0 / (self._covs + 1e-3), 2)
     # num_classes X num_examples
     x2_cov = math_ops.matmul(x2, cov_expanded)
diff --git a/tensorflow/contrib/feature_column/BUILD b/tensorflow/contrib/feature_column/BUILD
index 1cd83bdb5de7c2f6dc91c980750b49aca1a7790b..0a9199d61f36f10c98b95d79ece7e86765d2db0e 100644
--- a/tensorflow/contrib/feature_column/BUILD
+++ b/tensorflow/contrib/feature_column/BUILD
@@ -6,7 +6,7 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
-load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
 
 py_library(
     name = "feature_column_py",
@@ -14,7 +14,6 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":sequence_feature_column",
-        ":sequence_feature_column_v2",
         "//tensorflow/python:util",
     ],
 )
@@ -37,13 +36,13 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "sequence_feature_column_test",
     srcs = ["python/feature_column/sequence_feature_column_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_pip"],
-    deps = [
+    additional_deps = [
         ":sequence_feature_column",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
@@ -53,17 +52,14 @@ py_test(
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:training",
         "//tensorflow/python/feature_column:feature_column_py",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
     ],
+    tags = ["no_pip"],
 )
 
-py_test(
+tf_py_test(
     name = "sequence_feature_column_integration_test",
     srcs = ["python/feature_column/sequence_feature_column_integration_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_pip"],
-    deps = [
+    additional_deps = [
         ":sequence_feature_column",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_ops",
@@ -73,46 +69,5 @@ py_test(
         "//tensorflow/python/feature_column:feature_column_py",
         "//tensorflow/python/keras:layers",
     ],
-)
-
-py_library(
-    name = "sequence_feature_column_v2",
-    srcs = ["python/feature_column/sequence_feature_column_v2.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:check_ops",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:sparse_ops",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python/feature_column",
-        "//tensorflow/python/feature_column:feature_column_py",
-    ],
-)
-
-py_test(
-    name = "sequence_feature_column_v2_test",
-    srcs = ["python/feature_column/sequence_feature_column_v2_test.py"],
-    srcs_version = "PY2AND3",
     tags = ["no_pip"],
-    deps = [
-        ":sequence_feature_column",
-        ":sequence_feature_column_v2",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python:training",
-        "//tensorflow/python/feature_column",
-        "//tensorflow/python/feature_column:feature_column_py",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
-    ],
 )
diff --git a/tensorflow/contrib/framework/BUILD b/tensorflow/contrib/framework/BUILD
index dad50a3a73085526f65bd87c3d8549ceb75b3af4..8fd2b5f39bc88b76fe5583f8d18389e232ea9f40 100644
--- a/tensorflow/contrib/framework/BUILD
+++ b/tensorflow/contrib/framework/BUILD
@@ -32,7 +32,6 @@ tf_custom_op_py_library(
         "python/ops/arg_scope.py",
         "python/ops/audio_ops.py",
         "python/ops/checkpoint_ops.py",
-        "python/ops/critical_section_ops.py",
         "python/ops/ops.py",
         "python/ops/prettyprint_ops.py",
         "python/ops/script_ops.py",
@@ -50,6 +49,8 @@ tf_custom_op_py_library(
     visibility = [
         "//learning/brain:__subpackages__",
         "//tensorflow:__subpackages__",
+        "//tensorflow_estimator:__subpackages__",
+        "//tensorflow_model_optimization:__subpackages__",
         "//video/youtube/personalization:__subpackages__",
     ],
     deps = [
@@ -170,26 +171,6 @@ py_test(
     ],
 )
 
-cuda_py_test(
-    name = "critical_section_test",
-    size = "medium",
-    srcs = ["python/ops/critical_section_test.py"],
-    additional_deps = [
-        "//tensorflow/python:client_testlib",
-        ":framework_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python:tensor_array_ops",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/eager:context",
-    ],
-)
-
 py_test(
     name = "ops_test",
     size = "small",
diff --git a/tensorflow/contrib/framework/__init__.py b/tensorflow/contrib/framework/__init__.py
index e72e50585a3861d4527b66f89e1659d76c85960a..063717f08aa88f4de9470d8392db2b7c95b3e4bf 100644
--- a/tensorflow/contrib/framework/__init__.py
+++ b/tensorflow/contrib/framework/__init__.py
@@ -94,8 +94,6 @@
 @@smart_constant_value
 @@smart_case
 
-@@CriticalSection
-
 @@BoundedTensorSpec
 @@TensorSpec
 
@@ -129,18 +127,24 @@ from tensorflow.python.util.all_util import remove_undocumented
 _allowed_symbols = ['nest']
 _nest_allowed_symbols = [
     'assert_same_structure',
+    'is_nested',
     'is_sequence',
+    'is_sequence_or_composite',
     'flatten',
     'flatten_dict_items',
     'pack_sequence_as',
     'map_structure',
     'map_structure_with_paths',
+    'map_structure_with_tuple_paths',
     'assert_shallow_structure',
     'flatten_up_to',
+    'flatten_with_tuple_paths_up_to',
     'map_structure_up_to',
+    'map_structure_with_tuple_paths_up_to',
     'get_traverse_shallow_structure',
     'yield_flat_paths',
     'flatten_with_joined_string_paths',
+    'flatten_with_tuple_paths',
 ]
 
 remove_undocumented(nest.__name__, allowed_exception_list=_nest_allowed_symbols)
diff --git a/tensorflow/contrib/framework/python/ops/__init__.py b/tensorflow/contrib/framework/python/ops/__init__.py
index c4976497f5fa95d82e492153b117681f693eaa13..8113bf7c095bd0817e40cfd08bdf1ef7275ba55b 100644
--- a/tensorflow/contrib/framework/python/ops/__init__.py
+++ b/tensorflow/contrib/framework/python/ops/__init__.py
@@ -22,7 +22,6 @@ from __future__ import print_function
 # pylint: disable=wildcard-import
 from tensorflow.contrib.framework.python.ops.arg_scope import *
 from tensorflow.contrib.framework.python.ops.checkpoint_ops import *
-from tensorflow.contrib.framework.python.ops.critical_section_ops import *
 from tensorflow.contrib.framework.python.ops.ops import *
 from tensorflow.contrib.framework.python.ops.prettyprint_ops import *
 from tensorflow.contrib.framework.python.ops.script_ops import *
diff --git a/tensorflow/contrib/fused_conv/BUILD b/tensorflow/contrib/fused_conv/BUILD
index 57a5bfbf43c915775c6b0ef05baac19581213a09..f65f450eba49163c319af54ec2bd7f6b61e34c1e 100644
--- a/tensorflow/contrib/fused_conv/BUILD
+++ b/tensorflow/contrib/fused_conv/BUILD
@@ -171,6 +171,7 @@ cuda_py_test(
     main = "python/ops/fused_conv2d_bias_activation_benchmark.py",
     tags = [
         "manual",  # TODO(b/117128481): re-enable after fixing OSS build
+        "nogpu",
         "requires-gpu-sm70",
     ],
 )
diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
index 93b1aaa85e88e00c1b12a388321a4d6fb10f1611..f13a66717f67a1a627f66af9468c6f2897aaf7a4 100644
--- a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
+++ b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
@@ -19,13 +19,13 @@ limitations under the License.
 
 #include "tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.h"
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/conv_2d.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -522,7 +522,7 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T, BiasType, ScaleType>::
   auto bias_ptr = AsDeviceMemory(bias.template flat<BiasType>().data(),
                                  bias.template flat<BiasType>().size());
 
-  static int64 ConvolveScratchSize = GetCudnnWorkspaceLimit(
+  static int64 ConvolveScratchSize = GetDnnWorkspaceLimit(
       // default value is in bytes despite the name of the environment variable
       "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32  // 4GB
   );
@@ -565,12 +565,26 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T, BiasType, ScaleType>::
         fused_conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(
             stream->parent()),
         &algorithms));
+    if (activation_mode == ActivationMode::NONE) {
+      // Only CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM is supported for
+      // identity activation, other algs seem to quietly do Relu.
+      // See
+      // https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnConvolutionBiasActivationForward
+      algorithms.erase(
+          std::remove_if(
+              algorithms.begin(), algorithms.end(),
+              [](dnn::AlgorithmDesc alg) {
+                return alg.algo_id() !=
+                       CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
+              }),
+          algorithms.end());
+    }
     dnn::ProfileResult best_result;
     dnn::ProfileResult best_result_no_scratch;
     for (auto profile_algorithm : algorithms) {
       // TODO(zhengxq): profile each algorithm multiple times to better
       // accuracy.
-      CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
+      DnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
       dnn::ProfileResult profile_result;
       bool cudnn_launch_status =
           stream
@@ -609,7 +623,7 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T, BiasType, ScaleType>::
                                                       algorithm_config);
   }
 
-  CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
+  DnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
   bool cudnn_launch_status =
       stream
           ->ThenFusedConvolveWithAlgorithm(
diff --git a/tensorflow/contrib/gan/BUILD b/tensorflow/contrib/gan/BUILD
index f89d7ed0f45f919b17398de5d9449d12c08dd2f2..386e4cf69b7aa118a85fb25bcb809a879c5c1bd8 100644
--- a/tensorflow/contrib/gan/BUILD
+++ b/tensorflow/contrib/gan/BUILD
@@ -1,12 +1,14 @@
-# Files for using TFGAN framework.
-package(default_visibility = ["//tensorflow:__subpackages__"])
+# Files for using TF-GAN framework.
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+package(default_visibility = [
+    "//tensorflow:__subpackages__",
+])
 
 licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load("//tensorflow:tensorflow.bzl", "py_test")
-
 py_library(
     name = "gan",
     srcs = [
@@ -104,7 +106,9 @@ py_library(
     deps = [
         ":gan_estimator",
         ":head",
+        ":latent_gan_estimator",
         ":stargan_estimator",
+        ":tpu_gan_estimator",
         "//tensorflow/python:util",
     ],
 )
@@ -128,6 +132,7 @@ py_library(
         ":clip_weights",
         ":conditioning_utils",
         ":random_tensor_pool",
+        ":spectral_normalization",
         ":virtual_batchnorm",
         "//tensorflow/python:util",
     ],
@@ -141,16 +146,15 @@ py_library(
         "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:clip_ops",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
-        "//tensorflow/python:gradients",
+        "//tensorflow/python:gradients_impl",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:random_ops",
         "//tensorflow/python:summary",
         "//tensorflow/python:tensor_util",
         "//tensorflow/python:variable_scope",
-        "//tensorflow/python/ops/distributions",
         "//tensorflow/python/ops/losses",
-        "//third_party/py/numpy",
     ],
 )
 
@@ -373,7 +377,10 @@ py_test(
     name = "classifier_metrics_test",
     srcs = ["python/eval/python/classifier_metrics_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
+    tags = [
+        "no_pip",
+        "no_windows",
+    ],
     deps = [
         ":classifier_metrics",
         "//tensorflow/core:protos_all_py",
@@ -518,15 +525,19 @@ py_test(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:metrics",
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python:summary",
+        "//tensorflow/python:tensor_shape",
         "//tensorflow/python:training",
         "//tensorflow/python:training_util",
         "//tensorflow/python:variable_scope",
-        "//tensorflow/python/estimator:estimator_py",
+        "//tensorflow/python/estimator",
+        "//tensorflow/python/estimator:model_fn",
+        "//tensorflow/python/estimator:numpy_io",
         "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
         "@six_archive//:six",
@@ -562,28 +573,114 @@ py_test(
     deps = [
         ":namedtuples",
         ":stargan_estimator",
-        ":tuple_losses",
         "//tensorflow/contrib/layers:layers_py",
-        "//tensorflow/contrib/learn",
-        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:metrics",
-        "//tensorflow/python:parsing_ops",
         "//tensorflow/python:summary",
         "//tensorflow/python:training",
         "//tensorflow/python:training_util",
         "//tensorflow/python:variable_scope",
-        "//tensorflow/python/estimator:estimator_py",
+        "//tensorflow/python/estimator:model_fn",
+        "//tensorflow/python/estimator:numpy_io",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+        "@six_archive//:six",
+    ],
+)
+
+py_library(
+    name = "tpu_gan_estimator",
+    srcs = [
+        "python/estimator/python/tpu_gan_estimator.py",
+        "python/estimator/python/tpu_gan_estimator_impl.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":gan_estimator",
+        ":namedtuples",
+        ":train",
+        "//tensorflow/contrib/tpu:tpu_estimator",
+        "//tensorflow/contrib/tpu:tpu_lib",
+        "//tensorflow/contrib/training:training_py",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:metrics",
+        "//tensorflow/python:util",
+        "//tensorflow/python/estimator:model_fn",
+        "//tensorflow/python/ops/losses",
+    ],
+)
+
+py_test(
+    name = "tpu_gan_estimator_test",
+    srcs = ["python/estimator/python/tpu_gan_estimator_test.py"],
+    shard_count = 11,
+    srcs_version = "PY2AND3",
+    tags = ["notsan"],
+    deps = [
+        ":namedtuples",
+        ":tpu_gan_estimator",
+        ":tuple_losses",
+        "//tensorflow/contrib/layers:layers_py",
+        "//tensorflow/contrib/tpu:tpu_estimator",
+        "//tensorflow/contrib/tpu:tpu_lib",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:metrics",
+        "//tensorflow/python:summary",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:training",
+        "//tensorflow/python:training_util",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/estimator",
+        "//tensorflow/python/estimator:model_fn",
         "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
         "@six_archive//:six",
     ],
 )
 
+py_library(
+    name = "latent_gan_estimator",
+    srcs = [
+        "python/estimator/python/latent_gan_estimator.py",
+        "python/estimator/python/latent_gan_estimator_impl.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":train",
+        "//tensorflow/python:clip_ops",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:summary",
+        "//tensorflow/python:training_util",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/estimator:estimator_py",
+    ],
+)
+
+py_test(
+    name = "latent_gan_estimator_test",
+    srcs = [
+        "python/estimator/python/latent_gan_estimator_test.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":latent_gan_estimator",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/estimator:run_config",
+        "//tensorflow/python/ops/losses",
+    ],
+)
+
 py_library(
     name = "sliced_wasserstein",
     srcs = [
@@ -618,3 +715,45 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
+
+py_library(
+    name = "spectral_normalization",
+    srcs = [
+        "python/features/python/spectral_normalization.py",
+        "python/features/python/spectral_normalization_impl.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:standard_ops",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/keras:engine",
+    ],
+)
+
+py_test(
+    name = "spectral_normalization_test",
+    srcs = ["python/features/python/spectral_normalization_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":spectral_normalization",
+        "//tensorflow/contrib/layers:layers_py",
+        "//tensorflow/contrib/slim",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:layers",
+        "//tensorflow/python:linalg_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/keras:layers",
+        "//third_party/py/numpy",
+    ],
+)
diff --git a/tensorflow/contrib/gan/README.md b/tensorflow/contrib/gan/README.md
index 9ab86329eaf0e6fd426aef1f552f4e27c2ad65de..4eac4e80cdacd779fdbedef19e4a654196f0caf1 100644
--- a/tensorflow/contrib/gan/README.md
+++ b/tensorflow/contrib/gan/README.md
@@ -1,14 +1,15 @@
 <!-- TODO(joelshor): Add images to the examples. -->
-# TensorFlow-GAN (TFGAN)
+<!-- TODO(joelshor): Add link to new location when b/122114187 is done. -->
+# TensorFlow-GAN (TF-GAN)
 
-TFGAN is a lightweight library for training and evaluating Generative
+TF-GAN is a lightweight library for training and evaluating Generative
 Adversarial Networks (GANs). This technique allows you to train a network
 (called the 'generator') to sample from a distribution, without having to
 explicitly model the distribution and without writing an explicit loss. For
 example, the generator could learn to draw samples from the distribution of
 natural images. For more details on this technique, see
 ['Generative Adversarial Networks'](https://arxiv.org/abs/1406.2661) by
-Goodfellow et al. See [tensorflow/models](https://github.com/tensorflow/models/tree/master/research/gan/) for examples, and [this tutorial](https://github.com/tensorflow/models/tree/master/research/gan/tutorial.ipynb) for an
+Goodfellow et al. See [tensorflow/models](https://github.com/tensorflow/models/tree/master/research/gan/) for examples, and [this tutorial](http://https://github.com/tensorflow/models/tree/master/research/gan/tutorial.ipynb) for an
 introduction.
 
 #### Usage
@@ -17,27 +18,27 @@ import tensorflow as tf
 tfgan = tf.contrib.gan
 ```
 
-## Why TFGAN?
+## Why TF-GAN?
 
 * Easily train generator and discriminator networks with well-tested, flexible [library calls](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/train.py). You can
-mix TFGAN, native TF, and other custom frameworks
+mix TF-GAN, native TF, and other custom frameworks
 * Use already implemented [GAN losses and penalties](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/losses/python/losses_impl.py) (ex Wasserstein loss, gradient penalty, mutual information penalty, etc)
 * [Monitor and visualize](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/eval/python/summaries_impl.py) GAN progress during training, and [evaluate](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py) them
 * Use already-implemented [tricks](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/features/python/) to stabilize and improve training
 * Develop based on examples of [common GAN setups](https://github.com/tensorflow/models/tree/master/research/gan/)
-* Use the TFGAN-backed [GANEstimator](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py) to easily train a GAN model
-* Improvements in TFGAN infrastructure will automatically benefit your TFGAN project
+* Use the TF-GAN-backed [GANEstimator](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py) to easily train a GAN model
+* Improvements in TF-GAN infrastructure will automatically benefit your TF-GAN project
 * Stay up-to-date with research as we add more algorithms
 
-## What are the TFGAN components?
+## What are the TF-GAN components?
 
-TFGAN is composed of several parts which were design to exist independently.
+TF-GAN is composed of several parts which were design to exist independently.
 These include the following main pieces (explained in detail below).
 
 *   [core](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/train.py):
     provides the main infrastructure needed to train a GAN. Training occurs in
     four phases, and each phase can be completed by custom-code or by using a
-    TFGAN library call.
+    TF-GAN library call.
 
 *   [features](https://www.tensorflow.org/code/tensorflow/contrib/gan/python/features/python/):
     Many common GAN operations and normalization techniques are implemented for
@@ -56,14 +57,14 @@ These include the following main pieces (explained in detail below).
     generative models.
 
 *   [examples](https://github.com/tensorflow/models/tree/master/research/gan/)
-    and [tutorial](https://github.com/tensorflow/models/tree/master/research/gan/tutorial.ipynb): See examples of how to use TFGAN to make
-    GAN training easier, or use the more complicated examples to jumpstart your
+    and [tutorial](https://github.com/tensorflow/models/tree/master/research/gan/tutorial.ipynb): See examples of how to use TF-GAN to make
+    GAN training easier, or use the more complicated examples to jump-start your
     own project. These include unconditional and conditional GANs, InfoGANs,
     adversarial losses on existing networks, and image-to-image translation.
 
 ## Training a GAN model
 
-Training in TFGAN typically consists of the following steps:
+Training in TF-GAN typically consists of the following steps:
 
 1. Specify the input to your networks.
 1. Set up your generator and discriminator using a `GANModel`.
@@ -71,12 +72,12 @@ Training in TFGAN typically consists of the following steps:
 1. Create your train ops using a `GANTrainOps`.
 1. Run your train ops.
 
-At each stage, you can either use TFGAN's convenience functions, or you can
+At each stage, you can either use TF-GAN's convenience functions, or you can
 perform the step manually for fine-grained control. We provide examples below.
 
 There are various types of GAN setups. For instance, you can train a generator
 to sample unconditionally from a learned distribution, or you can condition on
-extra information such as a class label. TFGAN is compatible with many setups,
+extra information such as a class label. TF-GAN is compatible with many setups,
 and we demonstrate a few below:
 
 ### Examples
@@ -254,9 +255,9 @@ with variable_scope.variable_scope(dis_scope, reuse=True):
   discriminator_real_outputs = discriminator_fn(images)
 generator_variables = variables_lib.get_trainable_variables(gen_scope)
 discriminator_variables = variables_lib.get_trainable_variables(dis_scope)
-# Depending on what TFGAN features you use, you don't always need to supply
+# Depending on what TF-GAN features you use, you don't always need to supply
 # every `GANModel` field. At a minimum, you need to include the discriminator
-# outputs and variables if you want to use TFGAN to construct losses.
+# outputs and variables if you want to use TF-GAN to construct losses.
 gan_model = tfgan.GANModel(
     generator_inputs,
     generated_data,
diff --git a/tensorflow/contrib/gan/__init__.py b/tensorflow/contrib/gan/__init__.py
index f1946c7f925660eae3aaa650c437e03da1f33d6c..1e6000898f7b8a53ad3f6fa12deebd54bf3a57ff 100644
--- a/tensorflow/contrib/gan/__init__.py
+++ b/tensorflow/contrib/gan/__init__.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TFGAN is a lightweight library for training and evaluating GANs.
+"""TF-GAN is a lightweight library for training and evaluating GANs.
 
 In addition to providing the infrastructure for easily training and evaluating
 GANS, this library contains modules for a TFGAN-backed Estimator,
@@ -24,7 +24,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# Collapse TFGAN into a tiered namespace.
+# Collapse TF-GAN into a tiered namespace.
 from tensorflow.contrib.gan.python import estimator
 from tensorflow.contrib.gan.python import eval  # pylint:disable=redefined-builtin
 from tensorflow.contrib.gan.python import features
diff --git a/tensorflow/contrib/gan/python/estimator/__init__.py b/tensorflow/contrib/gan/python/estimator/__init__.py
index 99d38011ba677f03e198a431634fbb2ce349f912..430266555b723e6ca39dccffc1442dbef5d4a385 100644
--- a/tensorflow/contrib/gan/python/estimator/__init__.py
+++ b/tensorflow/contrib/gan/python/estimator/__init__.py
@@ -12,10 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TFGAN estimator module.
+"""TF-GAN estimator module.
 
 GANEstimator provides all the infrastructure support of a TensorFlow Estimator
-with the feature support of TFGAN.
+with the feature support of TF-GAN.
 """
 
 from __future__ import absolute_import
@@ -26,18 +26,25 @@ from __future__ import print_function
 # pylint: disable=unused-import,wildcard-import
 from tensorflow.contrib.gan.python.estimator.python import gan_estimator
 from tensorflow.contrib.gan.python.estimator.python import head
+from tensorflow.contrib.gan.python.estimator.python import latent_gan_estimator
 from tensorflow.contrib.gan.python.estimator.python import stargan_estimator
+from tensorflow.contrib.gan.python.estimator.python import tpu_gan_estimator
 
 from tensorflow.contrib.gan.python.estimator.python.gan_estimator import *
 from tensorflow.contrib.gan.python.estimator.python.head import *
+from tensorflow.contrib.gan.python.estimator.python.latent_gan_estimator import *
 from tensorflow.contrib.gan.python.estimator.python.stargan_estimator import *
+from tensorflow.contrib.gan.python.estimator.python.tpu_gan_estimator import *
 # pylint: enable=unused-import,wildcard-import
 
 from tensorflow.python.util.all_util import remove_undocumented
 
-_allowed_symbols = [
+_allowed_symbols = ([
     'gan_estimator',
     'stargan_estimator',
+    'tpu_gan_estimator',
+    'latent_gan_estimator',
     'head',
-] + gan_estimator.__all__ + stargan_estimator.__all__ + head.__all__
+] + gan_estimator.__all__ + stargan_estimator.__all__ + head.__all__ +
+                    tpu_gan_estimator.__all__ + latent_gan_estimator.__all__)
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
index 3593b501bb738b8f58dce4e40cffbdf410f136b3..dd904611d1a3bb78de8316d5ed29ab0f800f29a9 100644
--- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
+++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""A TFGAN-backed GAN Estimator."""
+"""A TF-GAN-backed GAN Estimator."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -56,10 +56,10 @@ _summary_type_map = {
 class GANEstimator(estimator.Estimator):
   """An estimator for Generative Adversarial Networks (GANs).
 
-  This Estimator is backed by TFGAN. The network functions follow the TFGAN API
-  except for one exception: if either `generator_fn` or `discriminator_fn` have
-  an argument called `mode`, then the tf.Estimator mode is passed in for that
-  argument. This helps with operations like batch normalization, which have
+  This Estimator is backed by TF-GAN. The network functions follow the TF-GAN
+  API except for one exception: if either `generator_fn` or `discriminator_fn`
+  have an argument called `mode`, then the tf.Estimator mode is passed in for
+  that argument. This helps with operations like batch normalization, which have
   different train and evaluation behavior.
 
   Example:
@@ -68,7 +68,7 @@ class GANEstimator(estimator.Estimator):
       import tensorflow as tf
       tfgan = tf.contrib.gan
 
-      # See TFGAN's `train.py` for a description of the generator and
+      # See TF-GAN's `train.py` for a description of the generator and
       # discriminator API.
       def generator_fn(generator_inputs):
         ...
@@ -123,13 +123,13 @@ class GANEstimator(estimator.Estimator):
         to continue training a previously saved model.
       generator_fn: A python function that takes a Tensor, Tensor list, or
         Tensor dictionary as inputs and returns the outputs of the GAN
-        generator. See `TFGAN` for more details and examples. Additionally, if
+        generator. See `TF-GAN` for more details and examples. Additionally, if
         it has an argument called `mode`, the Estimator's `mode` will be passed
         in (ex TRAIN, EVAL, PREDICT). This is useful for things like batch
         normalization.
       discriminator_fn: A python function that takes the output of
         `generator_fn` or real data in the GAN setup, and `generator_inputs`.
-        Outputs a Tensor in the range [-inf, inf]. See `TFGAN` for more details
+        Outputs a Tensor in the range [-inf, inf]. See `TF-GAN` for more details
         and examples.
       generator_loss_fn: The loss function on the generator. Takes a `GANModel`
         tuple.
@@ -233,13 +233,14 @@ def _get_estimator_spec(
       estimator_spec = _get_eval_estimator_spec(
           gan_model, gan_loss, get_eval_metric_ops_fn)
     else:  # model_fn_lib.ModeKeys.TRAIN:
-      gopt = (generator_optimizer() if callable(generator_optimizer) else
-              generator_optimizer)
-      dopt = (discriminator_optimizer() if callable(discriminator_optimizer)
-              else discriminator_optimizer)
+      if callable(generator_optimizer):
+        generator_optimizer = generator_optimizer()
+      if callable(discriminator_optimizer):
+        discriminator_optimizer = discriminator_optimizer()
       get_hooks_fn = get_hooks_fn or tfgan_train.get_sequential_train_hooks()
       estimator_spec = _get_train_estimator_spec(
-          gan_model, gan_loss, gopt, dopt, get_hooks_fn, is_chief=is_chief)
+          gan_model, gan_loss, generator_optimizer, discriminator_optimizer,
+          get_hooks_fn, is_chief=is_chief)
 
   return estimator_spec
 
diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
index bc9021050bc010ce75c3091fef868549686c0e90..66af79d1e81bbc450141673dd54d865e5c7932d5 100644
--- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
+++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for TFGAN's estimator.py."""
+"""Tests for TF-GAN's estimator.py."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -23,7 +23,6 @@ import tempfile
 
 from absl.testing import parameterized
 import numpy as np
-import six
 
 from tensorflow.contrib import layers
 from tensorflow.contrib.gan.python import namedtuples as tfgan_tuples
@@ -75,8 +74,8 @@ class GetGANModelTest(test.TestCase, parameterized.TestCase):
   def test_get_gan_model(self, mode):
     with ops.Graph().as_default():
       generator_inputs = {'x': array_ops.ones([3, 4])}
-      real_data = (array_ops.zeros([3, 4]) if
-                   mode != model_fn_lib.ModeKeys.PREDICT else None)
+      is_predict = mode == model_fn_lib.ModeKeys.PREDICT
+      real_data = array_ops.zeros([3, 4]) if not is_predict else None
       gan_model = estimator._get_gan_model(
           mode, generator_fn, discriminator_fn, real_data, generator_inputs,
           add_summaries=False)
@@ -139,6 +138,7 @@ class GetEstimatorSpecTest(test.TestCase, parameterized.TestCase):
 
   @classmethod
   def setUpClass(cls):
+    super(GetEstimatorSpecTest, cls).setUpClass()
     cls._generator_optimizer = training.GradientDescentOptimizer(1.0)
     cls._discriminator_optimizer = training.GradientDescentOptimizer(1.0)
 
@@ -200,7 +200,6 @@ class GetEstimatorSpecTest(test.TestCase, parameterized.TestCase):
       self.assertSetEqual(frozenset(sync_opts), frozenset((g_opt, d_opt)))
 
 
-# TODO(joelshor): Add pandas test.
 class GANEstimatorIntegrationTest(test.TestCase):
 
   def setUp(self):
@@ -231,19 +230,19 @@ class GANEstimatorIntegrationTest(test.TestCase):
         get_eval_metric_ops_fn=get_metrics,
         model_dir=self._model_dir)
 
-    # TRAIN
+    # Train.
     num_steps = 10
     est.train(train_input_fn, steps=num_steps)
 
-    # EVALUTE
+    # Evaluate.
     scores = est.evaluate(eval_input_fn)
     self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
-    self.assertIn('loss', six.iterkeys(scores))
+    self.assertIn('loss', scores)
     self.assertEqual(scores['discriminator_loss'] + scores['generator_loss'],
                      scores['loss'])
-    self.assertIn('mse_custom_metric', six.iterkeys(scores))
+    self.assertIn('mse_custom_metric', scores)
 
-    # PREDICT
+    # Predict.
     predictions = np.array([x for x in est.predict(predict_input_fn)])
 
     self.assertAllEqual(prediction_size, predictions.shape)
diff --git a/tensorflow/contrib/gan/python/estimator/python/head_impl.py b/tensorflow/contrib/gan/python/estimator/python/head_impl.py
index 1a0ee6dfc498eb6dc8c97411589d9e35bc352062..cbe990b476c3b17ce61e0826b17d10976fea43c7 100644
--- a/tensorflow/contrib/gan/python/estimator/python/head_impl.py
+++ b/tensorflow/contrib/gan/python/estimator/python/head_impl.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""A TFGAN-backed GAN Estimator."""
+"""A TF-GAN-backed GAN Estimator."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/gan/python/estimator/python/head_test.py b/tensorflow/contrib/gan/python/estimator/python/head_test.py
index 8205bc889dc01c8680e2139393d65723280cfbd0..5b50234a0e33cd297b176f142b358338966b6758 100644
--- a/tensorflow/contrib/gan/python/estimator/python/head_test.py
+++ b/tensorflow/contrib/gan/python/estimator/python/head_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for TFGAN's head.py."""
+"""Tests for TF-GAN's head.py."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/gan/python/estimator/python/latent_gan_estimator.py b/tensorflow/contrib/gan/python/estimator/python/latent_gan_estimator.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e164e24168bb0cc5e9a7cc772081781ea088bb1
--- /dev/null
+++ b/tensorflow/contrib/gan/python/estimator/python/latent_gan_estimator.py
@@ -0,0 +1,28 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""`tf.Learn` components for `Train Input Estimator`."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.gan.python.estimator.python import latent_gan_estimator_impl
+# pylint: disable=wildcard-import
+from tensorflow.contrib.gan.python.estimator.python.latent_gan_estimator_impl import *
+# pylint: enable=wildcard-import
+from tensorflow.python.util.all_util import remove_undocumented
+
+__all__ = latent_gan_estimator_impl.__all__
+remove_undocumented(__name__, __all__)
diff --git a/tensorflow/contrib/gan/python/estimator/python/latent_gan_estimator_impl.py b/tensorflow/contrib/gan/python/estimator/python/latent_gan_estimator_impl.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5afc7731937ed1a82c8ebb5969b2687ffdd583b
--- /dev/null
+++ b/tensorflow/contrib/gan/python/estimator/python/latent_gan_estimator_impl.py
@@ -0,0 +1,205 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implements an estimator wrapper that allows training the input latent space.
+
+This file implements a latent gan estimator that wraps around a previously
+trained GAN. The latent gan estimator trains a single variable z, representing
+the hidden latent distribution that is the 'noise' input to the GAN. By training
+z, the inpainting estimator can move around the latent z space towards
+minimizing a specific loss function.
+
+The latent gan estimator has a few key differences from a normal estimator.
+
+First: the variables in the estimator should not be saved, as we are not
+updating the original GAN and are only adding a new z variable that is meant
+to be different for each run. In order to do distributed training using
+train_and_evaluate, the Tensorflow RunConfig is expected to save checkpoints
+by having either save_checkpoints_steps or save_checkpoints_secs saved.
+To avoid this conflict, we purposely set the save_checkpoints_steps value in
+the RunConfig to be one step more than the total number of steps that the
+inpainter estimator will run.
+
+Second: we need to specify warm start settings, as we are reloading the
+GAN model into a different graph (specifically, one with a new z variable).
+The warm start settings defined below reload all GAN variables and ignore the
+new z variable (and the optimizer).
+
+Usage:
+
+  def _generator(net, mode):
+    ...
+
+  def _discriminator(net, condition, mode):
+    ...
+
+  def _loss(gan_model, features, labels, add_summaries):
+    ...
+
+  def optimizer():
+    ...
+
+  params = {<required params>}
+  config = tf.estimator.RunConfig()
+  tmp_dir = path/to/output/storage
+
+  estimator = latent_gan_estimator.get_latent_gan_estimator(
+      _generator, _discriminator, _loss, optimizer, params, config, tmp_dir)
+
+  def input_fn():
+    ...
+
+  estimator.train(input_fn=input_fn)
+
+See latent_gan_estimator_test.py or tensorflow_models/gan/face_inpainting for
+further examples.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+from tensorflow.contrib.gan.python import train as tfgan_train
+from tensorflow.python.estimator import estimator
+from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.summary import summary
+from tensorflow.python.training import training_util
+
+
+INPUT_NAME = 'new_var_z_input'  # The name for the new z space input variable.
+OPTIMIZER_NAME = 'latent_gan_optimizer'  # The name for the new optimizer vars.
+
+__all__ = [
+    'get_latent_gan_estimator',
+]
+
+
+def _get_latent_gan_model_fn(generator_fn, discriminator_fn, loss_fn,
+                             optimizer):
+  """Sets up a model function that wraps around a given GAN."""
+  def model_fn(features, labels, mode, params):
+    """Model function defining an inpainting estimator."""
+    batch_size = params['batch_size']
+    z_shape = [batch_size] + params['z_shape']
+    add_summaries = params['add_summaries']
+    input_clip = params['input_clip']
+
+    z = variable_scope.get_variable(
+        name=INPUT_NAME, initializer=random_ops.truncated_normal(z_shape),
+        constraint=lambda x: clip_ops.clip_by_value(x, -input_clip, input_clip))
+
+    generator = functools.partial(generator_fn, mode=mode)
+    discriminator = functools.partial(discriminator_fn, mode=mode)
+    gan_model = tfgan_train.gan_model(generator_fn=generator,
+                                      discriminator_fn=discriminator,
+                                      real_data=labels,
+                                      generator_inputs=z,
+                                      check_shapes=False)
+
+    loss = loss_fn(gan_model, features, labels, add_summaries)
+
+    # Use a variable scope to make sure that estimator variables dont cause
+    # save/load problems when restoring from ckpts.
+    with variable_scope.variable_scope(OPTIMIZER_NAME):
+      opt = optimizer(learning_rate=params['learning_rate'],
+                      **params['opt_kwargs'])
+      train_op = opt.minimize(
+          loss=loss, global_step=training_util.get_or_create_global_step(),
+          var_list=[z])
+
+    if add_summaries:
+      z_grads = gradients_impl.gradients(loss, z)
+      summary.scalar('z_loss/z_grads', clip_ops.global_norm(z_grads))
+      summary.scalar('z_loss/loss', loss)
+
+    return model_fn_lib.EstimatorSpec(mode=mode,
+                                      predictions=gan_model.generated_data,
+                                      loss=loss,
+                                      train_op=train_op)
+  return model_fn
+
+
+def get_latent_gan_estimator(generator_fn, discriminator_fn, loss_fn,
+                             optimizer, params, config, ckpt_dir,
+                             warmstart_options=True):
+  """Gets an estimator that passes gradients to the input.
+
+  This function takes in a generator and adds a trainable z variable that is
+  used as input to this generator_fn. The generator itself is treated as a black
+  box through which gradients can pass through without updating any weights. The
+  result is a trainable way to traverse the GAN latent space. The loss_fn is
+  used to actually train the z variable. The generator_fn and discriminator_fn
+  should be previously trained by the tfgan library (on reload, the variables
+  are expected to follow the tfgan format. It may be possible to use the
+  latent gan estimator with entirely custom GANs that do not use the tfgan
+  library as long as the appropriate variables are wired properly).
+
+  Args:
+    generator_fn: a function defining a Tensorflow graph for a GAN generator.
+      The weights defined in this graph should already be defined in the given
+      checkpoint location. Should have 'mode' as an argument.
+    discriminator_fn: a function defining a Tensorflow graph for a GAN
+      discriminator. Should have 'mode' as an argument.
+    loss_fn: a function defining a Tensorflow graph for a GAN loss. Takes in a
+      GANModel tuple, features, labels, and add_summaries as inputs.
+    optimizer: a tf.Optimizer or a function that returns a tf.Optimizer with no
+      inputs.
+   params: An object containing the following parameters:
+      - batch_size: an int indicating the size of the training batch.
+      - z_shape: the desired shape of the input z values (not counting batch).
+      - learning_rate: a scalar or function defining a learning rate applied to
+        optimizer.
+      - input_clip: the amount to clip the x training variable by.
+      - add_summaries: whether or not to add summaries.
+      - opt_kwargs: optimizer kwargs.
+    config: tf.RunConfig. Should point model to output dir and should indicate
+     whether to save checkpoints (to avoid saving checkpoints, set
+     save_checkpoints_steps to a number larger than the number of train steps).
+     The model_dir field in the RunConfig should point to a directory WITHOUT
+     any saved checkpoints.
+    ckpt_dir: the directory where the model checkpoints live. The checkpoint is
+     used to warm start the underlying GAN. This should NOT be the same as
+     config.model_dir.
+    warmstart_options: boolean, None, or a WarmStartSettings object. If set to
+      True, uses a default WarmStartSettings object. If set to False or None,
+      does not use warm start. If using a custom WarmStartSettings object, make
+      sure that new variables are properly accounted for when reloading the
+      underlying GAN. Defaults to True.
+  Returns:
+    An estimator spec defining a GAN input training estimator.
+  """
+  model_fn = _get_latent_gan_model_fn(generator_fn, discriminator_fn,
+                                      loss_fn, optimizer)
+
+  if isinstance(warmstart_options, estimator.WarmStartSettings):
+    ws = warmstart_options
+  elif warmstart_options:
+    # Default WarmStart loads all variable names except INPUT_NAME and
+    # OPTIMIZER_NAME.
+    var_regex = '^(?!.*(%s|%s).*)' % (INPUT_NAME, OPTIMIZER_NAME)
+    ws = estimator.WarmStartSettings(ckpt_to_initialize_from=ckpt_dir,
+                                     vars_to_warm_start=var_regex)
+  else:
+    ws = None
+
+  if 'opt_kwargs' not in params:
+    params['opt_kwargs'] = {}
+
+  return estimator.Estimator(model_fn=model_fn, config=config, params=params,
+                             warm_start_from=ws)
diff --git a/tensorflow/contrib/gan/python/estimator/python/latent_gan_estimator_test.py b/tensorflow/contrib/gan/python/estimator/python/latent_gan_estimator_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac139e532e35f7aae6da0655103a7249fe3382d4
--- /dev/null
+++ b/tensorflow/contrib/gan/python/estimator/python/latent_gan_estimator_test.py
@@ -0,0 +1,119 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for latent_gan_estimator.
+
+See g3.tp.tensorflow.contrib.gan.python.estimator.python.latent_gan_estimator.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tempfile
+import numpy as np
+from tensorflow.contrib.gan.python.estimator.python import latent_gan_estimator
+from tensorflow.python.estimator import run_config as run_config
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops.losses import losses
+from tensorflow.python.platform import test
+from tensorflow.python.training import training
+
+
+class TrainInputEstimatorTest(test.TestCase):
+
+  def test_get_input_training_estimator(self):
+    """Integration test to make sure the input_training_estimator works."""
+
+    # Create dummy test input tensors.
+    true_features = np.reshape(np.random.uniform(size=100), (10, 10))
+    true_labels = np.reshape(np.random.uniform(size=100), (5, 20))
+    expected_z_output = [[1, -1], [-1, 1]]
+
+    # Fill out required parameters randomly, includes optimizer kwargs.
+    params = {
+        'batch_size': 2,
+        'z_shape': [2],
+        'learning_rate': 1.0,
+        'input_clip': 1.0,
+        'add_summaries': False,
+        'opt_kwargs': {
+            'beta1': 0.1
+        }
+    }
+
+    input_z_shape = [params['batch_size']] + params['z_shape']
+
+    # Create dummy model functions that represent an underlying GANEstimator and
+    # the input training wrapper. Make sure that everything is wired up
+    # correctly in the internals of each dummy function.
+    def _generator(net, mode):
+      """The generator function will get the newly created z variable."""
+      del mode
+      self.assertSequenceEqual(net.shape, input_z_shape)
+      gen_dummy_var = variable_scope.get_variable(
+          name='generator_dummy_variable',
+          initializer=array_ops.ones(input_z_shape))
+      return net * gen_dummy_var
+
+    def _discriminator(net, condition, mode):
+      """The discriminator function will get either the z variable or labels."""
+      del condition, mode
+      try:
+        self.assertSequenceEqual(net.shape, true_labels.shape)
+      except AssertionError:
+        self.assertSequenceEqual(net.shape, input_z_shape)
+      return net
+
+    def _loss(gan_model, features, labels, _):
+      """Make sure that features and labels are passed in from input."""
+      self.assertTrue(np.array_equal(features, true_features))
+      self.assertTrue(np.array_equal(labels, true_labels))
+      return losses.absolute_difference(expected_z_output,
+                                        gan_model.generated_data)
+
+    optimizer = training.AdamOptimizer
+
+    # We are not loading checkpoints, so set the corresponding directory to a
+    # dummy directories.
+    tmp_dir = tempfile.mkdtemp()
+    config = run_config.RunConfig(model_dir=tmp_dir,
+                                  save_summary_steps=None,
+                                  save_checkpoints_steps=1,
+                                  save_checkpoints_secs=None)
+
+    # Get the estimator. Disable warm start so that there is no attempted
+    # checkpoint reloading.
+    estimator = latent_gan_estimator.get_latent_gan_estimator(
+        _generator, _discriminator, _loss, optimizer, params, config, tmp_dir,
+        warmstart_options=None)
+
+    # Train for a few steps.
+    def dummy_input():
+      return true_features, true_labels
+    estimator.train(input_fn=dummy_input, steps=10)
+
+    # Make sure the generator variables did not change, but the z variables did
+    # change.
+    self.assertTrue(np.array_equal(
+        estimator.get_variable_value('Generator/generator_dummy_variable'),
+        np.ones(input_z_shape)))
+    self.assertTrue(np.array_equal(
+        estimator.get_variable_value('new_var_z_input'),
+        expected_z_output))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_impl.py b/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_impl.py
index f60e16bc04662b33bc0bb22b5acc8c7fcc7a03ba..2a485e7d47ff10cf34c1b44f8dcc6b1f33c9a05f 100644
--- a/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_impl.py
+++ b/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_impl.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""A TFGAN-backed StarGAN Estimator."""
+"""A TF-GAN-backed StarGAN Estimator."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_test.py b/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_test.py
index 2ec7938c7c4051842c7e982b54c1213b6e841b79..0fcd1b7924eb02f5d617b45af16852baf2e2bb48 100644
--- a/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_test.py
+++ b/tensorflow/contrib/gan/python/estimator/python/stargan_estimator_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for TFGAN's stargan_estimator.py."""
+"""Tests for TF-GAN's stargan_estimator.py."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -23,7 +23,6 @@ import tempfile
 
 from absl.testing import parameterized
 import numpy as np
-import six
 
 from tensorflow.contrib import layers
 from tensorflow.contrib.gan.python import namedtuples as tfgan_tuples
@@ -80,7 +79,7 @@ class StarGetGANModelTest(test.TestCase, parameterized.TestCase):
     self.assertEqual(input_data, gan_model.input_data)
     self.assertIsNotNone(gan_model.generated_data)
     self.assertIsNotNone(gan_model.generated_data_domain_target)
-    self.assertEqual(1, len(gan_model.generator_variables))
+    self.assertLen(gan_model.generator_variables, 1)
     self.assertIsNotNone(gan_model.generator_scope)
     self.assertIsNotNone(gan_model.generator_fn)
     if mode == model_fn_lib.ModeKeys.PREDICT:
@@ -109,7 +108,7 @@ class StarGetGANModelTest(test.TestCase, parameterized.TestCase):
           gan_model.discriminator_input_data_domain_predication)
       self.assertIsNotNone(
           gan_model.discriminator_generated_data_domain_predication)
-      self.assertEqual(2, len(gan_model.discriminator_variables))  # 1 FC layer
+      self.assertLen(gan_model.discriminator_variables, 2)  # 1 FC layer
       self.assertIsNotNone(gan_model.discriminator_scope)
       self.assertIsNotNone(gan_model.discriminator_fn)
 
@@ -163,6 +162,7 @@ class GetEstimatorSpecTest(test.TestCase, parameterized.TestCase):
 
   @classmethod
   def setUpClass(cls):
+    super(GetEstimatorSpecTest, cls).setUpClass()
     cls._generator_optimizer = training.GradientDescentOptimizer(1.0)
     cls._discriminator_optimizer = training.GradientDescentOptimizer(1.0)
 
@@ -234,10 +234,10 @@ class StarGANEstimatorIntegrationTest(test.TestCase):
     # EVALUTE
     scores = est.evaluate(eval_input_fn)
     self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
-    self.assertIn('loss', six.iterkeys(scores))
+    self.assertIn('loss', scores)
     self.assertEqual(scores['discriminator_loss'] + scores['generator_loss'],
                      scores['loss'])
-    self.assertIn('mse_custom_metric', six.iterkeys(scores))
+    self.assertIn('mse_custom_metric', scores)
 
     # PREDICT
     predictions = np.array([x for x in est.predict(predict_input_fn)])
diff --git a/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator.py b/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator.py
new file mode 100644
index 0000000000000000000000000000000000000000..deb381f7be3f9545ed918813ee55aede946f22d4
--- /dev/null
+++ b/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator.py
@@ -0,0 +1,28 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""`tf.Learn` components for `TPUGANEstimator`."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.gan.python.estimator.python import tpu_gan_estimator_impl
+# pylint: disable=wildcard-import
+from tensorflow.contrib.gan.python.estimator.python.tpu_gan_estimator_impl import *
+# pylint: enable=wildcard-import
+from tensorflow.python.util.all_util import remove_undocumented
+
+__all__ = tpu_gan_estimator_impl.__all__
+remove_undocumented(__name__, __all__)
diff --git a/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator_impl.py b/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator_impl.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f2a22c78a304c7cc66ef069a235483e9279b3b2
--- /dev/null
+++ b/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator_impl.py
@@ -0,0 +1,423 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A TF-GAN-backed GAN Estimator that works on TPU."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.gan.python import namedtuples as tfgan_tuples
+from tensorflow.contrib.gan.python import train as tfgan_train
+from tensorflow.contrib.gan.python.estimator.python import gan_estimator_impl as gan_estimator_lib
+from tensorflow.contrib.tpu.python.tpu import tpu_estimator
+from tensorflow.contrib.tpu.python.tpu import tpu_optimizer
+from tensorflow.contrib.training.python.training import training
+from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import metrics as metrics_lib
+from tensorflow.python.ops.losses import losses
+
+__all__ = [
+    'TPUGANEstimator',
+]
+
+
+class TPUGANEstimator(tpu_estimator.TPUEstimator):
+  """An estimator for Generative Adversarial Networks (GANs) on TPU.
+
+  This Estimator is backed by TFGAN. It is similar to `tfgan.GANEstimator`,
+  but works on TPU.
+
+  Example:
+
+  ```python
+      import tensorflow as tf
+      tfgan = tf.contrib.gan
+
+      # See TFGAN's `train.py` for a description of the generator and
+      # discriminator API.
+      def generator_fn(generator_inputs):
+        ...
+        return generated_data
+
+      def discriminator_fn(data, conditioning):
+        ...
+        return logits
+
+      # Create GAN estimator.
+      config = tpu_config.RunConfig(model_dir='/my/dir')
+      gan_estimator = tfgan.estimator.TPUGANEstimator(
+          generator_fn=generator_fn,
+          discriminator_fn=discriminator_fn,
+          generator_loss_fn=tfgan.losses.wasserstein_generator_loss,
+          discriminator_loss_fn=tfgan.losses.wasserstein_discriminator_loss,
+          generator_optimizer=tf.train.AdamOptimizer(0.1, 0.5),
+          discriminator_optimizer=tf.train.AdamOptimizer(0.1, 0.5),
+          train_batch_size=4,
+          config=config)
+
+      # Train estimator.
+      gan_estimator.train(train_input_fn, train_steps)
+
+      # Evaluate resulting estimator.
+      gan_estimator.evaluate(eval_input_fn, eval_steps)
+
+      # Generate samples from generator.
+      predictions = np.array([
+          x['generated_data'] for x in gan_estimator.predict(predict_input_fn)])
+  ```
+  """
+
+  def __init__(self,
+               # Arguments to construct the `model_fn`.
+               generator_fn=None,
+               discriminator_fn=None,
+               generator_loss_fn=None,
+               discriminator_loss_fn=None,
+               generator_optimizer=None,
+               discriminator_optimizer=None,
+               get_eval_metric_ops_fn=None,
+               add_summaries=None,
+               joint_train=False,
+               gan_train_steps=tfgan_tuples.GANTrainSteps(1, 1),
+               # TPUEstimator options.
+               model_dir=None,
+               config=None,
+               params=None,
+               use_tpu=True,
+               train_batch_size=None,
+               eval_batch_size=None,
+               predict_batch_size=None,
+               batch_axis=None,
+               eval_on_tpu=True,
+               export_to_tpu=True,
+               warm_start_from=None):
+    """Initializes a TPUGANEstimator instance.
+
+    Args:
+      generator_fn: A python function that takes a Tensor, Tensor list, or
+        Tensor dictionary as inputs and returns the outputs of the GAN
+        generator. See `TFGAN` for more details and examples. Additionally, if
+        it has an argument called `mode`, the Estimator's `mode` will be passed
+        in (ex TRAIN, EVAL, PREDICT). This is useful for things like batch
+        normalization.
+      discriminator_fn: A python function that takes the output of
+        `generator_fn` or real data in the GAN setup, and `generator_inputs`.
+        Outputs a Tensor in the range [-inf, inf]. See `TFGAN` for more details
+        and examples.
+      generator_loss_fn: The loss function on the generator. Takes a `GANModel`
+        tuple.
+      discriminator_loss_fn: The loss function on the discriminator. Takes a
+        `GANModel` tuple.
+      generator_optimizer: The optimizer for generator updates, or a function
+        that takes no arguments and returns an optimizer. This function will
+        be called when the default graph is the `GANEstimator`'s graph, so
+        utilities like `tf.contrib.framework.get_or_create_global_step` will
+        work.
+      discriminator_optimizer: Same as `generator_optimizer`, but for the
+        discriminator updates.
+      get_eval_metric_ops_fn: A function that takes a list of arguments and
+        returns a dict of metric results keyed by name. The output of this
+        function is passed into `tf.estimator.EstimatorSpec` during evaluation.
+        The arguments must be:
+            * generator_inputs
+            * generated_data
+            * real_data
+            * discriminator_real_outputs
+            * discriminator_gen_outputs
+      add_summaries: `None`, a single `SummaryType`, or a list of `SummaryType`.
+        This is ignored for jobs that run on TPU, such as the train job if
+        `use_tpu` is `True` or the eval job if `eval_on_tpu` is `True`.
+      joint_train: A Python boolean. If `True`, jointly train the generator and
+        the discriminator. If `False`, sequentially train them. See `train.py`
+        in TFGAN for more details on the differences between the two GAN
+        training methods.
+      gan_train_steps: A `tfgan.GANTrainSteps` named tuple describing the ratio
+        of generator to discriminator steps. For now, only supports 1:1
+        training.
+      model_dir: Same as `TPUEstimator`: Directory to save model parameters,
+        graph and etc. This can also be used to load checkpoints from the
+        directory into a estimator to continue training a previously saved
+        model. If `None`, the model_dir in `config` will be used if set. If both
+        are set, they must be same. If both are `None`, a temporary directory
+        will be used.
+      config: Same as `TPUEstimator`: An `tpu_config.RunConfig` configuration
+        object. Cannot be `None`.
+      params: Same as `TPUEstimator`: An optional `dict` of hyper parameters
+        that will be passed into `input_fn` and `model_fn`.  Keys are names of
+        parameters, values are basic python types. There are reserved keys for
+        `TPUEstimator`, including 'batch_size'.
+      use_tpu: Same as `TPUEstimator`: A bool indicating whether TPU support is
+        enabled. Currently, TPU training and evaluation respect this bit, but
+        eval_on_tpu can override execution of eval. See below. Predict still
+        happens on CPU.
+      train_batch_size: Same as `TPUEstimator`: An int representing the global
+        training batch size. TPUEstimator transforms this global batch size to a
+        per-shard batch size, as params['batch_size'], when calling `input_fn`
+        and `model_fn`. Cannot be `None` if `use_tpu` is `True`. Must be
+        divisible by total number of replicas.
+      eval_batch_size: Same as `TPUEstimator`: An int representing evaluation
+        batch size. Must be divisible by total number of replicas.
+      predict_batch_size: Same as `TPUEstimator`: An int representing the
+        prediction batch size. Must be divisible by total number of replicas.
+      batch_axis: Same as `TPUEstimator`: A python tuple of int values
+        describing how each tensor produced by the Estimator `input_fn` should
+        be split across the TPU compute shards. For example, if your input_fn
+        produced (images, labels) where the images tensor is in `HWCN` format,
+        your shard dimensions would be [3, 0], where 3 corresponds to the `N`
+        dimension of your images Tensor, and 0 corresponds to the dimension
+        along which to split the labels to match up with the corresponding
+        images. If None is supplied, and per_host_input_for_training is True,
+        batches will be sharded based on the major dimension. If
+        tpu_config.per_host_input_for_training is False or `PER_HOST_V2`,
+        batch_axis is ignored.
+      eval_on_tpu: Same as `TPUEstimator`: If False, evaluation runs on CPU or
+        GPU. In this case, the model_fn must return `EstimatorSpec` when called
+        with `mode` as `EVAL`.
+      export_to_tpu: Same as `TPUEstimator`: If True, `export_savedmodel()`
+        exports a metagraph for serving on TPU besides the one on CPU.
+      warm_start_from: Same as `TPUEstimator`: Optional string filepath to a
+        checkpoint or SavedModel to warm-start from, or a
+        `tf.estimator.WarmStartSettings` object to fully configure
+        warm-starting.  If the string filepath is provided instead of a
+        `WarmStartSettings`, then all variables are warm-started, and it is
+        assumed that vocabularies and Tensor names are unchanged.
+
+    Raises:
+      ValueError: If loss functions aren't callable.
+      ValueError: If `gan_train_steps` isn't a `tfgan_tuples.GANTrainSteps`
+        tuple.
+      ValueError: If `gan_train_steps` isn't 1:1 training.
+    """
+    if not callable(generator_loss_fn):
+      raise ValueError('generator_loss_fn must be callable.')
+    if not callable(discriminator_loss_fn):
+      raise ValueError('discriminator_loss_fn must be callable.')
+    if not isinstance(gan_train_steps, tfgan_tuples.GANTrainSteps):
+      raise ValueError(
+          '`gan_train_steps` must be `tfgan_tuples.GANTrainSteps`. Instead, '
+          'was type: %s' % type(gan_train_steps))
+    if (gan_train_steps.generator_train_steps != 1 or
+        gan_train_steps.discriminator_train_steps != 1):
+      raise ValueError('Estimator currently only supports 1:1 training.')
+
+    if use_tpu:
+      generator_optimizer = _maybe_make_cross_shard_optimizer(
+          generator_optimizer)
+      discriminator_optimizer = _maybe_make_cross_shard_optimizer(
+          discriminator_optimizer)
+
+    def _model_fn(features, labels, mode, params):
+      """GANEstimator model function."""
+      del params  # unused
+      if mode not in [model_fn_lib.ModeKeys.TRAIN, model_fn_lib.ModeKeys.EVAL,
+                      model_fn_lib.ModeKeys.PREDICT]:
+        raise ValueError('Mode not recognized: %s' % mode)
+      real_data = labels  # rename inputs for clarity
+      generator_inputs = features  # rename inputs for clarity
+
+      # Make GANModel, which encapsulates the GAN model architectures.
+      # TODO(joelshor): Switch TF-GAN over to TPU-compatible summaries, then
+      # remove `add_summaries` logic below.
+      is_on_tpu = _is_on_tpu(mode, use_tpu, eval_on_tpu)
+      gan_model = gan_estimator_lib._get_gan_model(  # pylint:disable=protected-access
+          mode, generator_fn, discriminator_fn, real_data, generator_inputs,
+          add_summaries=None if is_on_tpu else add_summaries)
+
+      # Make the TPUEstimatorSpec, which incorporates the GANModel, losses, eval
+      # metrics, and optimizers (if required).
+      estimator_spec = _get_estimator_spec(
+          mode, gan_model, generator_loss_fn, discriminator_loss_fn,
+          get_eval_metric_ops_fn, generator_optimizer, discriminator_optimizer,
+          joint_train, is_on_tpu, gan_train_steps)
+      assert isinstance(estimator_spec, tpu_estimator.TPUEstimatorSpec)
+      return estimator_spec
+
+    super(TPUGANEstimator, self).__init__(
+        model_fn=_model_fn,
+        model_dir=model_dir,
+        config=config,
+        params=params,
+        use_tpu=use_tpu,
+        train_batch_size=train_batch_size,
+        eval_batch_size=eval_batch_size,
+        predict_batch_size=predict_batch_size,
+        batch_axis=batch_axis,
+        eval_on_tpu=eval_on_tpu,
+        export_to_tpu=export_to_tpu,
+        warm_start_from=warm_start_from)
+
+
+def _is_on_tpu(mode, use_tpu, eval_on_tpu):
+  if mode == model_fn_lib.ModeKeys.TRAIN:
+    return use_tpu
+  elif mode == model_fn_lib.ModeKeys.EVAL:
+    return eval_on_tpu
+  else:
+    return False
+
+
+def _get_estimator_spec(
+    mode, gan_model, generator_loss_fn, discriminator_loss_fn,
+    get_eval_metric_ops_fn, generator_optimizer, discriminator_optimizer,
+    joint_train, is_on_tpu, gan_train_steps):
+  """Get the TPUEstimatorSpec for the current mode."""
+  if mode == model_fn_lib.ModeKeys.PREDICT:
+    estimator_spec = tpu_estimator.TPUEstimatorSpec(
+        mode=mode, predictions={'generated_data': gan_model.generated_data})
+  elif mode == model_fn_lib.ModeKeys.EVAL:
+    gan_loss = tfgan_tuples.GANLoss(
+        generator_loss=generator_loss_fn(
+            gan_model, add_summaries=not is_on_tpu),
+        discriminator_loss=discriminator_loss_fn(
+            gan_model, add_summaries=not is_on_tpu))
+    # Eval losses for metrics must preserve batch dimension.
+    gan_loss_no_reduction = tfgan_tuples.GANLoss(
+        generator_loss=generator_loss_fn(
+            gan_model, add_summaries=False, reduction=losses.Reduction.NONE),
+        discriminator_loss=discriminator_loss_fn(
+            gan_model, add_summaries=False, reduction=losses.Reduction.NONE))
+    estimator_spec = _get_eval_estimator_spec(
+        gan_model, gan_loss, gan_loss_no_reduction, get_eval_metric_ops_fn)
+  else:  # model_fn_lib.ModeKeys.TRAIN:
+    gan_loss = tfgan_tuples.GANLoss(
+        generator_loss=generator_loss_fn(
+            gan_model, add_summaries=not is_on_tpu),
+        discriminator_loss=discriminator_loss_fn(
+            gan_model, add_summaries=not is_on_tpu))
+
+    # Construct optimizers if arguments were callable. For TPUs, they must be
+    # `CrossShardOptimizer`.
+    g_callable = callable(generator_optimizer)
+    gopt = generator_optimizer() if g_callable  else generator_optimizer
+    d_callable = callable(discriminator_optimizer)
+    dopt = discriminator_optimizer() if d_callable else discriminator_optimizer
+
+    estimator_spec = _get_train_estimator_spec(
+        gan_model, gan_loss, gopt, dopt, joint_train, gan_train_steps)
+
+  return estimator_spec
+
+
+def _get_eval_estimator_spec(gan_model, gan_loss, gan_loss_no_reduction,
+                             get_eval_metric_ops_fn):
+  """Return an TPUEstimatorSpec for the eval case."""
+  # Make the metric function and tensor names.
+  if get_eval_metric_ops_fn is not None:
+    def metric_fn(
+        generator_inputs, generated_data, real_data, discriminator_real_outputs,
+        discriminator_gen_outputs, generator_loss, discriminator_loss):
+      """`metric_fn` used in TPUEstimator to calculate metrics."""
+      eval_metric_ops = {
+          'generator_loss': metrics_lib.mean(generator_loss),
+          'discriminator_loss': metrics_lib.mean(discriminator_loss),
+      }
+      custom_eval_metric_ops = get_eval_metric_ops_fn(
+          generator_inputs, generated_data, real_data,
+          discriminator_real_outputs, discriminator_gen_outputs)
+      if not isinstance(custom_eval_metric_ops, dict):
+        raise TypeError('`get_eval_metric_ops_fn` must return a dict, '
+                        'received: {}'.format(custom_eval_metric_ops))
+      eval_metric_ops.update(custom_eval_metric_ops)
+      return eval_metric_ops
+    tensors = {
+        'generator_loss': gan_loss_no_reduction.generator_loss,
+        'discriminator_loss': gan_loss_no_reduction.discriminator_loss,
+        'generator_inputs': gan_model.generator_inputs,
+        'generated_data': gan_model.generated_data,
+        'real_data': gan_model.real_data,
+        'discriminator_real_outputs': gan_model.discriminator_real_outputs,
+        'discriminator_gen_outputs': gan_model.discriminator_gen_outputs,
+    }
+  else:
+    def metric_fn(generator_loss, discriminator_loss):
+      return {
+          'generator_loss': metrics_lib.mean(generator_loss),
+          'discriminator_loss': metrics_lib.mean(discriminator_loss),
+      }
+    tensors = {
+        'generator_loss': gan_loss_no_reduction.generator_loss,
+        'discriminator_loss': gan_loss_no_reduction.discriminator_loss,
+    }
+
+  scalar_loss = gan_loss.generator_loss + gan_loss.discriminator_loss
+  return tpu_estimator.TPUEstimatorSpec(
+      mode=model_fn_lib.ModeKeys.EVAL,
+      predictions=gan_model.generated_data,
+      loss=scalar_loss,
+      eval_metrics=(metric_fn, tensors))
+
+
+def _get_train_estimator_spec(
+    gan_model, gan_loss, generator_optimizer, discriminator_optimizer,
+    joint_train, gan_train_steps):
+  """Return a TPUEstimatorSpec for the train case."""
+  scalar_loss = gan_loss.generator_loss + gan_loss.discriminator_loss
+
+  # Get generator and discriminator update ops. We split them so that update
+  # ops aren't accidentally run multiple times. For now, throw an error if
+  # there are update ops that aren't associated with either the generator or
+  # the discriminator. Might modify the `kwargs` dictionary.
+  gen_update_ops, dis_update_ops = tfgan_train._get_update_ops(  # pylint:disable=protected-access
+      {}, gan_model.generator_scope.name, gan_model.discriminator_scope.name)
+
+  def gen_train_op():
+    with ops.name_scope('generator_train'):
+      return training.create_train_op(
+          total_loss=gan_loss.generator_loss,
+          optimizer=generator_optimizer,
+          variables_to_train=gan_model.generator_variables,
+          update_ops=gen_update_ops)
+  def dis_train_op():
+    with ops.name_scope('discriminator_train'):
+      return training.create_train_op(
+          total_loss=gan_loss.discriminator_loss,
+          optimizer=discriminator_optimizer,
+          variables_to_train=gan_model.discriminator_variables,
+          update_ops=dis_update_ops)
+
+  # Either optimize the generator and discriminator sequentially or jointly.
+  tpu_train_op = _combine_train_ops(gen_train_op, dis_train_op, joint_train,
+                                    gan_train_steps)
+
+  return tpu_estimator.TPUEstimatorSpec(
+      loss=scalar_loss,
+      mode=model_fn_lib.ModeKeys.TRAIN,
+      train_op=tpu_train_op)
+
+
+# TODO(joelshor): Add support for multiple D / G steps.
+def _combine_train_ops(gen_train_op, dis_train_op, joint_train,
+                       gan_train_steps):
+  """Combine generator and discriminator train ops into a single op."""
+  del gan_train_steps
+  if joint_train:
+    tpu_train_op = control_flow_ops.group(gen_train_op(), dis_train_op(),
+                                          name='joint_train')
+  else:
+    with ops.control_dependencies([dis_train_op()]):
+      tpu_train_op = gen_train_op()
+
+  return tpu_train_op
+
+
+def _maybe_make_cross_shard_optimizer(opt):
+  if callable(opt):
+    if not isinstance(opt(), tpu_optimizer.CrossShardOptimizer):
+      return lambda: tpu_optimizer.CrossShardOptimizer(opt())
+  elif not isinstance(opt, tpu_optimizer.CrossShardOptimizer):
+    return tpu_optimizer.CrossShardOptimizer(opt)
+  return opt
diff --git a/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator_test.py b/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..baf2c28df4b63cff525dcf3ff880730768ad000a
--- /dev/null
+++ b/tensorflow/contrib/gan/python/estimator/python/tpu_gan_estimator_test.py
@@ -0,0 +1,318 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for TF-GAN's TPU Estimator."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import shutil
+import tempfile
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.contrib import layers
+from tensorflow.contrib.gan.python import namedtuples as tfgan_tuples
+from tensorflow.contrib.gan.python.estimator.python import tpu_gan_estimator_impl as estimator
+from tensorflow.contrib.gan.python.losses.python import tuple_losses as losses
+from tensorflow.contrib.tpu.python.tpu import tpu_config
+from tensorflow.contrib.tpu.python.tpu import tpu_estimator
+from tensorflow.contrib.tpu.python.tpu import tpu_optimizer
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.estimator.estimator import WarmStartSettings
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework.errors_impl import NotFoundError
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import metrics as metrics_lib
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.platform import flags
+from tensorflow.python.platform import test
+from tensorflow.python.summary.writer import writer_cache
+from tensorflow.python.training import learning_rate_decay
+from tensorflow.python.training import training
+from tensorflow.python.training import training_util
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_bool('use_tpu', False, 'Whether to run test on TPU or not.')
+
+
+def generator_fn(noise, mode):
+  del mode
+  return layers.fully_connected(noise, tensor_shape.dimension_value(
+      noise.shape[1]))
+
+
+def discriminator_fn(data, unused_conditioning, mode):
+  del unused_conditioning, mode
+  return layers.fully_connected(data, 1)
+
+
+def get_dummy_gan_model():
+  # TODO(joelshor): Find a better way of creating a variable scope.
+  with variable_scope.variable_scope('generator') as gen_scope:
+    gen_var = variable_scope.get_variable('dummy_var', initializer=0.0)
+  with variable_scope.variable_scope('discriminator') as dis_scope:
+    dis_var = variable_scope.get_variable('dummy_var', initializer=0.0)
+  return tfgan_tuples.GANModel(
+      generator_inputs=None,
+      generated_data=array_ops.ones([3, 4]),
+      generator_variables=[gen_var],
+      generator_scope=gen_scope,
+      generator_fn=None,
+      real_data=array_ops.zeros([3, 4]),
+      discriminator_real_outputs=array_ops.ones([1, 2, 3]) * dis_var,
+      discriminator_gen_outputs=array_ops.ones([1, 2, 3]) * gen_var * dis_var,
+      discriminator_variables=[dis_var],
+      discriminator_scope=dis_scope,
+      discriminator_fn=None)
+
+
+def get_metrics(generator_inputs, generated_data, real_data,
+                discriminator_real_outputs, discriminator_gen_outputs):
+  del generator_inputs, discriminator_real_outputs, discriminator_gen_outputs
+  return {
+      'mse_custom_metric': metrics_lib.mean_squared_error(
+          real_data, generated_data)
+  }
+
+
+class GetTPUEstimatorSpecTest(test.TestCase, parameterized.TestCase):
+  """Tests that the EstimatorSpec is constructed appropriately."""
+
+  @classmethod
+  def setUpClass(cls):
+    super(GetTPUEstimatorSpecTest, cls).setUpClass()
+    cls._generator_optimizer = tpu_optimizer.CrossShardOptimizer(
+        training.GradientDescentOptimizer(1.0))
+    cls._discriminator_optimizer = tpu_optimizer.CrossShardOptimizer(
+        training.GradientDescentOptimizer(1.0))
+
+  @parameterized.named_parameters(
+      ('joint_train', model_fn_lib.ModeKeys.TRAIN, True),
+      ('train_sequential', model_fn_lib.ModeKeys.TRAIN, False),
+      ('eval', model_fn_lib.ModeKeys.EVAL, None),
+      ('predict', model_fn_lib.ModeKeys.PREDICT, None))
+  def test_get_estimator_spec(self, mode, joint_train):
+    with ops.Graph().as_default():
+      self._gan_model = get_dummy_gan_model()
+      spec = estimator._get_estimator_spec(
+          mode,
+          self._gan_model,
+          generator_loss_fn=losses.wasserstein_generator_loss,
+          discriminator_loss_fn=losses.wasserstein_discriminator_loss,
+          get_eval_metric_ops_fn=get_metrics,
+          generator_optimizer=self._generator_optimizer,
+          discriminator_optimizer=self._discriminator_optimizer,
+          joint_train=joint_train,
+          is_on_tpu=FLAGS.use_tpu,
+          gan_train_steps=tfgan_tuples.GANTrainSteps(1, 1))
+
+    self.assertIsInstance(spec, tpu_estimator.TPUEstimatorSpec)
+    self.assertEqual(mode, spec.mode)
+    if mode == model_fn_lib.ModeKeys.PREDICT:
+      self.assertEqual({'generated_data': self._gan_model.generated_data},
+                       spec.predictions)
+    elif mode == model_fn_lib.ModeKeys.TRAIN:
+      self.assertShapeEqual(np.array(0), spec.loss)  # must be a scalar
+      self.assertIsNotNone(spec.train_op)
+      self.assertIsNotNone(spec.training_hooks)
+    elif mode == model_fn_lib.ModeKeys.EVAL:
+      self.assertEqual(self._gan_model.generated_data, spec.predictions)
+      self.assertShapeEqual(np.array(0), spec.loss)  # must be a scalar
+      self.assertIsNotNone(spec.eval_metrics)
+
+
+class TPUGANEstimatorIntegrationTest(test.TestCase, parameterized.TestCase):
+
+  def setUp(self):
+    super(TPUGANEstimatorIntegrationTest, self).setUp()
+    self._model_dir = tempfile.mkdtemp()
+    self._config = tpu_config.RunConfig(model_dir=self._model_dir)
+
+  def tearDown(self):
+    super(TPUGANEstimatorIntegrationTest, self).tearDown()
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+  def _test_complete_flow(
+      self, train_input_fn, eval_input_fn, predict_input_fn, prediction_size,
+      lr_decay=False, joint_train=True):
+    def make_opt():
+      gstep = training_util.get_or_create_global_step()
+      lr = learning_rate_decay.exponential_decay(1.0, gstep, 10, 0.9)
+      return training.GradientDescentOptimizer(lr)
+
+    gopt = make_opt if lr_decay else training.GradientDescentOptimizer(1.0)
+    dopt = make_opt if lr_decay else training.GradientDescentOptimizer(1.0)
+    est = estimator.TPUGANEstimator(
+        generator_fn=generator_fn,
+        discriminator_fn=discriminator_fn,
+        generator_loss_fn=losses.wasserstein_generator_loss,
+        discriminator_loss_fn=losses.wasserstein_discriminator_loss,
+        generator_optimizer=gopt,
+        discriminator_optimizer=dopt,
+        joint_train=joint_train,
+        get_eval_metric_ops_fn=get_metrics,
+        train_batch_size=4,
+        eval_batch_size=10,
+        predict_batch_size=8,
+        use_tpu=FLAGS.use_tpu,
+        config=self._config)
+
+    # Train.
+    num_steps_train = 10
+    est.train(train_input_fn, steps=num_steps_train)
+
+    # Evaluate.
+    num_steps_eval = 2
+    scores = est.evaluate(eval_input_fn, steps=num_steps_eval)
+    self.assertIn(ops.GraphKeys.GLOBAL_STEP, scores)
+    self.assertIn('loss', scores)
+    self.assertEqual(scores['discriminator_loss'] + scores['generator_loss'],
+                     scores['loss'])
+    self.assertIn('mse_custom_metric', scores)
+
+    # Predict.
+    predictions = np.array([x['generated_data'] for x in
+                            est.predict(predict_input_fn)])
+    self.assertAllEqual(prediction_size, predictions.shape)
+
+  @parameterized.named_parameters(
+      ('joint_train', True, False, False),
+      ('train_sequential', False, False, False),
+      ('lr_decay', False, True, False),
+      ('train_sequential_ds', False, False, True))
+  def test_numpy_input_fn(self, joint_train, lr_decay, return_ds):
+    """Tests complete flow with numpy_input_fn."""
+    input_dim = 4
+    def train_input_fn(params):
+      data = np.zeros([input_dim], dtype=np.float32)
+      ds = (dataset_ops.Dataset
+            .from_tensors((data, data))
+            .repeat()
+            .batch(params['batch_size'], drop_remainder=True))
+      if return_ds:
+        return ds
+      else:
+        x, y = ds.make_one_shot_iterator().get_next()
+        return x, y
+    def eval_input_fn(params):
+      data = np.zeros([input_dim], dtype=np.float32)
+      ds = (dataset_ops.Dataset
+            .from_tensors((data, data))
+            .repeat()
+            .batch(params['batch_size'], drop_remainder=True))
+      if return_ds:
+        return ds
+      else:
+        x, y = ds.make_one_shot_iterator().get_next()
+        return x, y
+    predict_size = 10
+    def predict_input_fn(params):
+      del params  # unused
+      data = np.zeros([input_dim], dtype=np.float32)
+      ds = (dataset_ops.Dataset
+            .from_tensors(data)
+            .repeat(predict_size)
+            .batch(1, drop_remainder=True))
+      return ds
+
+    self._test_complete_flow(
+        train_input_fn=train_input_fn,
+        eval_input_fn=eval_input_fn,
+        predict_input_fn=predict_input_fn,
+        prediction_size=[predict_size, input_dim],
+        lr_decay=lr_decay,
+        joint_train=joint_train)
+
+
+class TPUGANEstimatorWarmStartTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = self.get_temp_dir()
+    self._config = tpu_config.RunConfig(model_dir=self._model_dir)
+    self.new_variable_name = 'new_var'
+    self.new_variable_value = [1.0, 2.0, 3.0]
+
+  def tearDown(self):
+    writer_cache.FileWriterCache.clear()
+
+  def _test_warm_start(self, warm_start_from=None):
+    """Tests whether WarmStartSettings work as intended."""
+    def generator_with_new_variable(noise_dict, mode):
+      variable_scope.get_variable(name=self.new_variable_name,
+                                  initializer=self.new_variable_value,
+                                  trainable=True)
+      return generator_fn(noise_dict, mode)
+
+    est = estimator.TPUGANEstimator(
+        generator_fn=generator_fn,
+        discriminator_fn=discriminator_fn,
+        generator_loss_fn=losses.wasserstein_generator_loss,
+        discriminator_loss_fn=losses.wasserstein_discriminator_loss,
+        generator_optimizer=training.GradientDescentOptimizer(1.0),
+        discriminator_optimizer=training.GradientDescentOptimizer(1.0),
+        train_batch_size=4,
+        use_tpu=FLAGS.use_tpu,
+        config=self._config)
+
+    def train_input_fn(params):
+      data = np.zeros([params['batch_size'], 4], dtype=np.float32)
+      return data, data
+
+    est.train(train_input_fn, steps=1)
+
+    est_warm = estimator.TPUGANEstimator(
+        generator_fn=generator_with_new_variable,
+        discriminator_fn=discriminator_fn,
+        generator_loss_fn=losses.wasserstein_generator_loss,
+        discriminator_loss_fn=losses.wasserstein_discriminator_loss,
+        generator_optimizer=training.GradientDescentOptimizer(1.0),
+        discriminator_optimizer=training.GradientDescentOptimizer(1.0),
+        config=tpu_config.RunConfig(
+            model_dir=None if warm_start_from else self._model_dir),
+        train_batch_size=4,
+        use_tpu=FLAGS.use_tpu,
+        warm_start_from=warm_start_from)
+
+    est_warm.train(train_input_fn, steps=1)
+
+    return est_warm
+
+  def test_warm_start_error(self):
+    """Test if exception when reloading different estimators."""
+    with self.assertRaises(NotFoundError):
+      self._test_warm_start()
+
+  def test_warm_start_success(self):
+    """Test if GANEstimator allows explicit warm start variable assignment."""
+    # Regex matches all variable names in ckpt except for new_var.
+    var_regex = '^(?!.*%s.*)' % self.new_variable_name
+    warmstart = WarmStartSettings(ckpt_to_initialize_from=self._model_dir,
+                                  vars_to_warm_start=var_regex)
+    est_warm = self._test_warm_start(warm_start_from=warmstart)
+    full_variable_name = 'Generator/%s' % self.new_variable_name
+    self.assertIn(full_variable_name, est_warm.get_variable_names())
+    equal_vals = np.array_equal(est_warm.get_variable_value(full_variable_name),
+                                self.new_variable_value)
+    self.assertTrue(equal_vals)
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/gan/python/eval/__init__.py b/tensorflow/contrib/gan/python/eval/__init__.py
index f86b8513053a45f9830411f7df2c32d1f36a97b2..92e9abf8a35de1999eb800e169f32220fe47f8cd 100644
--- a/tensorflow/contrib/gan/python/eval/__init__.py
+++ b/tensorflow/contrib/gan/python/eval/__init__.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TFGAN evaluation module.
+"""TF-GAN evaluation module.
 
 This module supports techniques such as Inception Score, Frechet Inception
 distance, and Sliced Wasserstein distance.
diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics.py
index 1c872626a957279132772ae27df7a66a2564e9a5..a52e899114b62cb29752f72aa59f142f4a428aa1 100644
--- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics.py
+++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Model evaluation tools for TFGAN."""
+"""Model evaluation tools for TF-GAN."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
index a71ee53311c1c057a5b41be0331bf56ce1a82f74..ff19ce2f78e9c86400089e454c88450f01c41764 100644
--- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
+++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Model evaluation tools for TFGAN.
+"""Model evaluation tools for TF-GAN.
 
 These methods come from https://arxiv.org/abs/1606.03498,
 https://arxiv.org/abs/1706.08500, and https://arxiv.org/abs/1801.01401.
@@ -41,9 +41,9 @@ from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import image_ops
 from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import map_fn
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_impl
 from tensorflow.python.ops import nn_ops
@@ -346,7 +346,7 @@ def classifier_score(images, classifier_fn, num_batches=1):
       images, num_or_size_splits=num_batches)
 
   # Compute the classifier splits using the memory-efficient `map_fn`.
-  logits = functional_ops.map_fn(
+  logits = map_fn.map_fn(
       fn=classifier_fn,
       elems=array_ops.stack(generated_images_list),
       parallel_iterations=1,
@@ -387,7 +387,7 @@ def classifier_score_from_logits(logits):
   # Use maximum precision for best results.
   logits_dtype = logits.dtype
   if logits_dtype != dtypes.float64:
-    logits = math_ops.to_double(logits)
+    logits = math_ops.cast(logits, dtypes.float64)
 
   p = nn_ops.softmax(logits)
   q = math_ops.reduce_mean(p, axis=0)
@@ -505,12 +505,12 @@ def frechet_classifier_distance(real_images,
 
   # Compute the activations using the memory-efficient `map_fn`.
   def compute_activations(elems):
-    return functional_ops.map_fn(fn=classifier_fn,
-                                 elems=elems,
-                                 parallel_iterations=1,
-                                 back_prop=False,
-                                 swap_memory=True,
-                                 name='RunClassifier')
+    return map_fn.map_fn(fn=classifier_fn,
+                         elems=elems,
+                         parallel_iterations=1,
+                         back_prop=False,
+                         swap_memory=True,
+                         name='RunClassifier')
 
   real_a = compute_activations(real_imgs)
   gen_a = compute_activations(generated_imgs)
@@ -562,8 +562,8 @@ def mean_only_frechet_classifier_distance_from_activations(
 
   activations_dtype = real_activations.dtype
   if activations_dtype != dtypes.float64:
-    real_activations = math_ops.to_double(real_activations)
-    generated_activations = math_ops.to_double(generated_activations)
+    real_activations = math_ops.cast(real_activations, dtypes.float64)
+    generated_activations = math_ops.cast(generated_activations, dtypes.float64)
 
   # Compute means of activations.
   m = math_ops.reduce_mean(real_activations, 0)
@@ -623,8 +623,8 @@ def diagonal_only_frechet_classifier_distance_from_activations(
 
   activations_dtype = real_activations.dtype
   if activations_dtype != dtypes.float64:
-    real_activations = math_ops.to_double(real_activations)
-    generated_activations = math_ops.to_double(generated_activations)
+    real_activations = math_ops.cast(real_activations, dtypes.float64)
+    generated_activations = math_ops.cast(generated_activations, dtypes.float64)
 
   # Compute mean and covariance matrices of activations.
   m, var = nn_impl.moments(real_activations, axes=[0])
@@ -698,15 +698,16 @@ def frechet_classifier_distance_from_activations(real_activations,
 
   activations_dtype = real_activations.dtype
   if activations_dtype != dtypes.float64:
-    real_activations = math_ops.to_double(real_activations)
-    generated_activations = math_ops.to_double(generated_activations)
+    real_activations = math_ops.cast(real_activations, dtypes.float64)
+    generated_activations = math_ops.cast(generated_activations, dtypes.float64)
 
   # Compute mean and covariance matrices of activations.
   m = math_ops.reduce_mean(real_activations, 0)
   m_w = math_ops.reduce_mean(generated_activations, 0)
-  num_examples_real = math_ops.to_double(array_ops.shape(real_activations)[0])
-  num_examples_generated = math_ops.to_double(
-      array_ops.shape(generated_activations)[0])
+  num_examples_real = math_ops.cast(
+      array_ops.shape(real_activations)[0], dtypes.float64)
+  num_examples_generated = math_ops.cast(
+      array_ops.shape(generated_activations)[0], dtypes.float64)
 
   # sigma = (1 / (n - 1)) * (X - mu) (X - mu)^T
   real_centered = real_activations - m
@@ -794,9 +795,9 @@ def kernel_classifier_distance(real_images,
       on a classifier.
     num_classifier_batches: Number of batches to split images in to in order to
       efficiently run them through the classifier network.
-    max_estimator_block_size: integer, default 1024. The distance estimator
-      splits samples into blocks for computational efficiency. Larger values are
-      more computationally expensive but decrease the variance of the distance
+    max_block_size: integer, default 1024. The distance estimator splits samples
+      into blocks for computational efficiency. Larger values are more
+      computationally expensive but decrease the variance of the distance
       estimate.
     dtype: if not None, coerce activations to this dtype before computations.
 
@@ -871,9 +872,9 @@ def kernel_classifier_distance_and_std(real_images,
       on a classifier.
     num_classifier_batches: Number of batches to split images in to in order to
       efficiently run them through the classifier network.
-    max_estimator_block_size: integer, default 1024. The distance estimator
-      splits samples into blocks for computational efficiency. Larger values are
-      more computationally expensive but decrease the variance of the distance
+    max_block_size: integer, default 1024. The distance estimator splits samples
+      into blocks for computational efficiency. Larger values are more
+      computationally expensive but decrease the variance of the distance
       estimate. Having a smaller block size also gives a better estimate of the
       standard error.
     dtype: if not None, coerce activations to this dtype before computations.
@@ -894,7 +895,7 @@ def kernel_classifier_distance_and_std(real_images,
 
   # Compute the activations using the memory-efficient `map_fn`.
   def compute_activations(elems):
-    return functional_ops.map_fn(
+    return map_fn.map_fn(
         fn=classifier_fn,
         elems=elems,
         parallel_iterations=1,
@@ -910,7 +911,7 @@ def kernel_classifier_distance_and_std(real_images,
   gen_a = array_ops.concat(array_ops.unstack(gen_a), 0)
 
   return kernel_classifier_distance_and_std_from_activations(
-      real_a, gen_a, max_block_size=max_block_size)
+      real_a, gen_a, max_block_size, dtype)
 
 
 kernel_inception_distance_and_std = functools.partial(
@@ -967,14 +968,14 @@ def kernel_classifier_distance_from_activations(real_activations,
       into blocks for computational efficiency. Larger values are more
       computationally expensive but decrease the variance of the distance
       estimate.
-    dtype: if not None, coerce activations to this dtype before computations.
+    dtype: If not None, coerce activations to this dtype before computations.
 
   Returns:
    The Kernel Inception Distance. A floating-point scalar of the same type
    as the output of the activations.
   """
   return kernel_classifier_distance_and_std_from_activations(
-      real_activations, generated_activations, max_block_size=max_block_size)[0]
+      real_activations, generated_activations, max_block_size, dtype)[0]
 
 
 def kernel_classifier_distance_and_std_from_activations(real_activations,
@@ -1029,7 +1030,7 @@ def kernel_classifier_distance_and_std_from_activations(real_activations,
       computationally expensive but decrease the variance of the distance
       estimate. Having a smaller block size also gives a better estimate of the
       standard error.
-    dtype: if not None, coerce activations to this dtype before computations.
+    dtype: If not None, coerce activations to this dtype before computations.
 
   Returns:
    The Kernel Inception Distance. A floating-point scalar of the same type
@@ -1080,7 +1081,7 @@ def kernel_classifier_distance_and_std_from_activations(real_activations,
   dim = math_ops.cast(real_activations.shape[1], dtype)
 
   def compute_kid_block(i):
-    'Compute the ith block of the KID estimate.'
+    """Computes the ith block of the KID estimate."""
     r_s = inds_r[i]
     r_e = inds_r[i + 1]
     r = real_activations[r_s:r_e]
@@ -1098,7 +1099,7 @@ def kernel_classifier_distance_and_std_from_activations(real_activations,
             (math_ops.reduce_sum(k_rr) - math_ops.trace(k_rr)) / (m * (m - 1)) +
             (math_ops.reduce_sum(k_gg) - math_ops.trace(k_gg)) / (n * (n - 1)))
 
-  ests = functional_ops.map_fn(
+  ests = map_fn.map_fn(
       compute_kid_block, math_ops.range(n_blocks), dtype=dtype, back_prop=False)
 
   mn = math_ops.reduce_mean(ests)
diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
index dbff1d2a367e10adc607dafb4c571bb3607a3963..bc7c1057b478fe2656898e68c1a14013b5a71d12 100644
--- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
+++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for TFGAN classifier_metrics."""
+"""Tests for TF-GAN classifier_metrics."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -234,7 +234,7 @@ class ClassifierMetricsTest(test.TestCase, parameterized.TestCase):
     else:
       logits = classifier_metrics.run_inception(img, _get_dummy_graphdef())
 
-    self.assertTrue(isinstance(logits, ops.Tensor))
+    self.assertIsInstance(logits, ops.Tensor)
     logits.shape.assert_is_compatible_with([batch_size, 1001])
 
     # Check that none of the model variables are trainable.
@@ -258,7 +258,7 @@ class ClassifierMetricsTest(test.TestCase, parameterized.TestCase):
           img, _get_dummy_graphdef(),
           output_tensor=classifier_metrics.INCEPTION_FINAL_POOL)
 
-    self.assertTrue(isinstance(pool, ops.Tensor))
+    self.assertIsInstance(pool, ops.Tensor)
     pool.shape.assert_is_compatible_with([batch_size, 2048])
 
     # Check that none of the model variables are trainable.
@@ -276,8 +276,8 @@ class ClassifierMetricsTest(test.TestCase, parameterized.TestCase):
             classifier_metrics.INCEPTION_FINAL_POOL
         ])
 
-    self.assertTrue(isinstance(logits, ops.Tensor))
-    self.assertTrue(isinstance(pool, ops.Tensor))
+    self.assertIsInstance(logits, ops.Tensor)
+    self.assertIsInstance(pool, ops.Tensor)
     logits.shape.assert_is_compatible_with([batch_size, 1001])
     pool.shape.assert_is_compatible_with([batch_size, 2048])
 
@@ -290,7 +290,7 @@ class ClassifierMetricsTest(test.TestCase, parameterized.TestCase):
         classifier_metrics.inception_score,
         array_ops.zeros([6, 299, 299, 3]),
         num_batches=3)
-    self.assertTrue(isinstance(score, ops.Tensor))
+    self.assertIsInstance(score, ops.Tensor)
     score.shape.assert_has_rank(0)
 
     # Check that none of the model variables are trainable.
@@ -302,7 +302,7 @@ class ClassifierMetricsTest(test.TestCase, parameterized.TestCase):
     distance = _run_with_mock(
         classifier_metrics.frechet_inception_distance, img, img)
 
-    self.assertTrue(isinstance(distance, ops.Tensor))
+    self.assertIsInstance(distance, ops.Tensor)
     distance.shape.assert_has_rank(0)
 
     # Check that none of the model variables are trainable.
@@ -314,7 +314,7 @@ class ClassifierMetricsTest(test.TestCase, parameterized.TestCase):
     distance = _run_with_mock(classifier_metrics.kernel_inception_distance, img,
                               img)
 
-    self.assertTrue(isinstance(distance, ops.Tensor))
+    self.assertIsInstance(distance, ops.Tensor)
     distance.shape.assert_has_rank(0)
 
     # Check that none of the model variables are trainable.
@@ -365,7 +365,7 @@ class ClassifierMetricsTest(test.TestCase, parameterized.TestCase):
     unused_image = array_ops.zeros([2, 299, 299, 3])
     incscore = _run_with_mock(classifier_metrics.inception_score, unused_image)
 
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       incscore_np = sess.run(incscore, {'concat:0': logits})
 
     self.assertAllClose(_expected_inception_score(logits), incscore_np)
@@ -473,7 +473,7 @@ class ClassifierMetricsTest(test.TestCase, parameterized.TestCase):
         classifier_fn=lambda x: x,
         max_block_size=600)
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       actual_kid, actual_std = sess.run(kid_op)
 
     expected_kid, expected_std = _expected_kid_and_std(test_pool_real_a,
@@ -500,7 +500,7 @@ class ClassifierMetricsTest(test.TestCase, parameterized.TestCase):
         max_block_size=max_block_size)
 
     for block_size in [50, 512, 1000]:
-      with self.test_session() as sess:
+      with self.cached_session() as sess:
         actual_kid, actual_std = sess.run(kid_op, {max_block_size: block_size})
 
       expected_kid, expected_std = _expected_kid_and_std(
diff --git a/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein.py b/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein.py
index 523968bed91f1021ae629bf52c405cf5c2d7b917..326fcb3cdbf2eda66207f134cd2926f09a216a99 100644
--- a/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein.py
+++ b/tensorflow/contrib/gan/python/eval/python/sliced_wasserstein.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Model evaluation tools for TFGAN."""
+"""Model evaluation tools for TF-GAN."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/gan/python/eval/python/summaries.py b/tensorflow/contrib/gan/python/eval/python/summaries.py
index ecfdb39499b1e824e02415c0db1de3157e4f3216..1b202dfc97304ddc7ced42d65366aaf419439392 100644
--- a/tensorflow/contrib/gan/python/eval/python/summaries.py
+++ b/tensorflow/contrib/gan/python/eval/python/summaries.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Common TFGAN summaries."""
+"""Common TF-GAN summaries."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/gan/python/eval/python/summaries_impl.py b/tensorflow/contrib/gan/python/eval/python/summaries_impl.py
index f9995bb19d0d09eaf6fd96d039b0bba1d3a7055c..c7bbd65bbff41c25327733ae1f17a090fb69cb52 100644
--- a/tensorflow/contrib/gan/python/eval/python/summaries_impl.py
+++ b/tensorflow/contrib/gan/python/eval/python/summaries_impl.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Common TFGAN summaries."""
+"""Common TF-GAN summaries."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -22,7 +22,7 @@ from tensorflow.contrib.gan.python import namedtuples
 from tensorflow.contrib.gan.python.eval.python import eval_utils
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import map_fn
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops.losses import util as loss_util
@@ -261,7 +261,7 @@ def add_stargan_image_summaries(stargan_model,
 
   summary.image(
       'stargan_image_generation',
-      functional_ops.map_fn(
+      map_fn.map_fn(
           _build_image,
           stargan_model.input_data[:num_images],
           parallel_iterations=num_images,
diff --git a/tensorflow/contrib/gan/python/eval/python/summaries_test.py b/tensorflow/contrib/gan/python/eval/python/summaries_test.py
index 54a6f8d4d9086ad7fc8db31032677628561e48e8..53fc7cb8ede698c2d8590c7fd3016a884cef9be9 100644
--- a/tensorflow/contrib/gan/python/eval/python/summaries_test.py
+++ b/tensorflow/contrib/gan/python/eval/python/summaries_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for TFGAN summaries."""
+"""Tests for TF-GAN summaries."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/gan/python/features/__init__.py b/tensorflow/contrib/gan/python/features/__init__.py
index 4816daf760143af9f1502873b123ffad8e5ec8ce..410c3a02052cd3a07a36a0ba332a80b3c2705d89 100644
--- a/tensorflow/contrib/gan/python/features/__init__.py
+++ b/tensorflow/contrib/gan/python/features/__init__.py
@@ -27,11 +27,13 @@ from __future__ import print_function
 from tensorflow.contrib.gan.python.features.python import clip_weights
 from tensorflow.contrib.gan.python.features.python import conditioning_utils
 from tensorflow.contrib.gan.python.features.python import random_tensor_pool
+from tensorflow.contrib.gan.python.features.python import spectral_normalization
 from tensorflow.contrib.gan.python.features.python import virtual_batchnorm
 
 from tensorflow.contrib.gan.python.features.python.clip_weights import *
 from tensorflow.contrib.gan.python.features.python.conditioning_utils import *
 from tensorflow.contrib.gan.python.features.python.random_tensor_pool import *
+from tensorflow.contrib.gan.python.features.python.spectral_normalization import *
 from tensorflow.contrib.gan.python.features.python.virtual_batchnorm import *
 # pylint: enable=unused-import,wildcard-import
 
@@ -40,5 +42,6 @@ from tensorflow.python.util.all_util import remove_undocumented
 _allowed_symbols = clip_weights.__all__
 _allowed_symbols += conditioning_utils.__all__
 _allowed_symbols += random_tensor_pool.__all__
+_allowed_symbols += spectral_normalization.__all__
 _allowed_symbols += virtual_batchnorm.__all__
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/gan/python/features/python/spectral_normalization.py b/tensorflow/contrib/gan/python/features/python/spectral_normalization.py
new file mode 100644
index 0000000000000000000000000000000000000000..54d3d0a218dec3588844333cd47e1f92489d8df9
--- /dev/null
+++ b/tensorflow/contrib/gan/python/features/python/spectral_normalization.py
@@ -0,0 +1,32 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras-like layers and utilities that implement Spectral Normalization.
+
+Based on "Spectral Normalization for Generative Adversarial Networks" by Miyato,
+et al in ICLR 2018. https://openreview.net/pdf?id=B1QRgziT-
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.gan.python.features.python import spectral_normalization_impl
+# pylint: disable=wildcard-import
+from tensorflow.contrib.gan.python.features.python.spectral_normalization_impl import *
+# pylint: enable=wildcard-import
+from tensorflow.python.util.all_util import remove_undocumented
+
+__all__ = spectral_normalization_impl.__all__
+remove_undocumented(__name__, __all__)
diff --git a/tensorflow/contrib/gan/python/features/python/spectral_normalization_impl.py b/tensorflow/contrib/gan/python/features/python/spectral_normalization_impl.py
new file mode 100644
index 0000000000000000000000000000000000000000..0cc653f0a7907f407e66add5537d1e0a5adb6d8b
--- /dev/null
+++ b/tensorflow/contrib/gan/python/features/python/spectral_normalization_impl.py
@@ -0,0 +1,315 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras-like layers and utilities that implement Spectral Normalization.
+
+Based on "Spectral Normalization for Generative Adversarial Networks" by Miyato,
+et al in ICLR 2018. https://openreview.net/pdf?id=B1QRgziT-
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import contextlib
+import numbers
+import re
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.keras.engine import base_layer_utils as keras_base_layer_utils
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.platform import tf_logging as logging
+
+__all__ = [
+    'compute_spectral_norm', 'spectral_normalize', 'spectral_norm_regularizer',
+    'spectral_normalization_custom_getter', 'keras_spectral_normalization'
+]
+
+# tf.bfloat16 should work, but tf.matmul converts those to tf.float32 which then
+# can't directly be assigned back to the tf.bfloat16 variable.
+_OK_DTYPES_FOR_SPECTRAL_NORM = (dtypes.float16, dtypes.float32, dtypes.float64)
+_PERSISTED_U_VARIABLE_SUFFIX = 'spectral_norm_u'
+
+
+def compute_spectral_norm(w_tensor, power_iteration_rounds=1, name=None):
+  """Estimates the largest singular value in the weight tensor.
+
+  Args:
+    w_tensor: The weight matrix whose spectral norm should be computed.
+    power_iteration_rounds: The number of iterations of the power method to
+      perform. A higher number yeilds a better approximation.
+    name: An optional scope name.
+
+  Returns:
+    The largest singular value (the spectral norm) of w.
+  """
+  with variable_scope.variable_scope(name, 'spectral_norm'):
+    # The paper says to flatten convnet kernel weights from
+    # (C_out, C_in, KH, KW) to (C_out, C_in * KH * KW). But TensorFlow's Conv2D
+    # kernel weight shape is (KH, KW, C_in, C_out), so it should be reshaped to
+    # (KH * KW * C_in, C_out), and similarly for other layers that put output
+    # channels as last dimension.
+    # n.b. this means that w here is equivalent to w.T in the paper.
+    w = array_ops.reshape(w_tensor, (-1, w_tensor.get_shape()[-1]))
+
+    # Persisted approximation of first left singular vector of matrix `w`.
+    u_var = variable_scope.get_variable(
+        _PERSISTED_U_VARIABLE_SUFFIX,
+        shape=(w.shape[0], 1),
+        dtype=w.dtype,
+        initializer=init_ops.random_normal_initializer(),
+        trainable=False)
+    u = u_var
+
+    # Use power iteration method to approximate spectral norm.
+    for _ in range(power_iteration_rounds):
+      # `v` approximates the first right singular vector of matrix `w`.
+      v = nn.l2_normalize(math_ops.matmul(array_ops.transpose(w), u))
+      u = nn.l2_normalize(math_ops.matmul(w, v))
+
+    # Update persisted approximation.
+    with ops.control_dependencies([u_var.assign(u, name='update_u')]):
+      u = array_ops.identity(u)
+
+    u = array_ops.stop_gradient(u)
+    v = array_ops.stop_gradient(v)
+
+    # Largest singular value of `w`.
+    spectral_norm = math_ops.matmul(
+        math_ops.matmul(array_ops.transpose(u), w), v)
+    spectral_norm.shape.assert_is_fully_defined()
+    spectral_norm.shape.assert_is_compatible_with([1, 1])
+
+    return spectral_norm[0][0]
+
+
+def spectral_normalize(w, power_iteration_rounds=1, name=None):
+  """Normalizes a weight matrix by its spectral norm.
+
+  Args:
+    w: The weight matrix to be normalized.
+    power_iteration_rounds: The number of iterations of the power method to
+      perform. A higher number yeilds a better approximation.
+    name: An optional scope name.
+
+  Returns:
+    A normalized weight matrix tensor.
+  """
+  with variable_scope.variable_scope(name, 'spectral_normalize'):
+    w_normalized = w / compute_spectral_norm(
+        w, power_iteration_rounds=power_iteration_rounds)
+    return array_ops.reshape(w_normalized, w.get_shape())
+
+
+def spectral_norm_regularizer(scale, power_iteration_rounds=1, scope=None):
+  """Returns a functions that can be used to apply spectral norm regularization.
+
+  Small spectral norms enforce a small Lipschitz constant, which is necessary
+  for Wasserstein GANs.
+
+  Args:
+    scale: A scalar multiplier. 0.0 disables the regularizer.
+    power_iteration_rounds: The number of iterations of the power method to
+      perform. A higher number yeilds a better approximation.
+    scope: An optional scope name.
+
+  Returns:
+    A function with the signature `sn(weights)` that applies spectral norm
+    regularization.
+
+  Raises:
+    ValueError: If scale is negative or if scale is not a float.
+  """
+  if isinstance(scale, numbers.Integral):
+    raise ValueError('scale cannot be an integer: %s' % scale)
+  if isinstance(scale, numbers.Real):
+    if scale < 0.0:
+      raise ValueError(
+          'Setting a scale less than 0 on a regularizer: %g' % scale)
+    if scale == 0.0:
+      logging.info('Scale of 0 disables regularizer.')
+      return lambda _: None
+
+  def sn(weights, name=None):
+    """Applies spectral norm regularization to weights."""
+    with ops.name_scope(scope, 'SpectralNormRegularizer', [weights]) as name:
+      scale_t = ops.convert_to_tensor(
+          scale, dtype=weights.dtype.base_dtype, name='scale')
+      return math_ops.multiply(
+          scale_t,
+          compute_spectral_norm(
+              weights, power_iteration_rounds=power_iteration_rounds),
+          name=name)
+
+  return sn
+
+
+def _default_name_filter(name):
+  """A filter function to identify common names of weight variables.
+
+  Args:
+    name: The variable name.
+
+  Returns:
+    Whether `name` is a standard name for a weight/kernel variables used in the
+    Keras, tf.layers, tf.contrib.layers or tf.contrib.slim libraries.
+  """
+  match = re.match(r'(.*\/)?(depthwise_|pointwise_)?(weights|kernel)$', name)
+  return match is not None
+
+
+def spectral_normalization_custom_getter(name_filter=_default_name_filter,
+                                         power_iteration_rounds=1):
+  """Custom getter that performs Spectral Normalization on a weight tensor.
+
+  Specifically it divides the weight tensor by its largest singular value. This
+  is intended to stabilize GAN training, by making the discriminator satisfy a
+  local 1-Lipschitz constraint.
+
+  Based on [Spectral Normalization for Generative Adversarial Networks][sn-gan].
+
+  [sn-gan]: https://openreview.net/forum?id=B1QRgziT-
+
+  To reproduce an SN-GAN, apply this custom_getter to every weight tensor of
+  your discriminator. The last dimension of the weight tensor must be the number
+  of output channels.
+
+  Apply this to layers by supplying this as the `custom_getter` of a
+  `tf.variable_scope`. For example:
+
+    with tf.variable_scope('discriminator',
+                           custom_getter=spectral_norm_getter()):
+      net = discriminator_fn(net)
+
+  IMPORTANT: Keras does not respect the custom_getter supplied by the
+  VariableScope, so Keras users should use `keras_spectral_normalization`
+  instead of (or in addition to) this approach.
+
+  It is important to carefully select to which weights you want to apply
+  Spectral Normalization. In general you want to normalize the kernels of
+  convolution and dense layers, but you do not want to normalize biases. You
+  also want to avoid normalizing batch normalization (and similar) variables,
+  but in general such layers play poorly with Spectral Normalization, since the
+  gamma can cancel out the normalization in other layers. By default we supply a
+  filter that matches the kernel variable names of the dense and convolution
+  layers of the tf.layers, tf.contrib.layers, tf.keras and tf.contrib.slim
+  libraries. If you are using anything else you'll need a custom `name_filter`.
+
+  This custom getter internally creates a variable used to compute the spectral
+  norm by power iteration. It will update every time the variable is accessed,
+  which means the normalized discriminator weights may change slightly whilst
+  training the generator. Whilst unusual, this matches how the paper's authors
+  implement it, and in general additional rounds of power iteration can't hurt.
+
+  Args:
+    name_filter: Optionally, a method that takes a Variable name as input and
+      returns whether this Variable should be normalized.
+    power_iteration_rounds: The number of iterations of the power method to
+      perform per step. A higher number yeilds a better approximation of the
+      true spectral norm.
+
+  Returns:
+    A custom getter function that applies Spectral Normalization to all
+    Variables whose names match `name_filter`.
+
+  Raises:
+    ValueError: If name_filter is not callable.
+  """
+  if not callable(name_filter):
+    raise ValueError('name_filter must be callable')
+
+  def _internal_getter(getter, name, *args, **kwargs):
+    """A custom getter function that applies Spectral Normalization.
+
+    Args:
+      getter: The true getter to call.
+      name: Name of new/existing variable, in the same format as
+        tf.get_variable.
+      *args: Other positional arguments, in the same format as tf.get_variable.
+      **kwargs: Keyword arguments, in the same format as tf.get_variable.
+
+    Returns:
+      The return value of `getter(name, *args, **kwargs)`, spectrally
+      normalized.
+
+    Raises:
+      ValueError: If used incorrectly, or if `dtype` is not supported.
+    """
+    if not name_filter(name):
+      return getter(name, *args, **kwargs)
+
+    if name.endswith(_PERSISTED_U_VARIABLE_SUFFIX):
+      raise ValueError(
+          'Cannot apply Spectral Normalization to internal variables created '
+          'for Spectral Normalization. Tried to normalized variable [%s]' %
+          name)
+
+    if kwargs['dtype'] not in _OK_DTYPES_FOR_SPECTRAL_NORM:
+      raise ValueError('Disallowed data type {}'.format(kwargs['dtype']))
+
+    # This layer's weight Variable/PartitionedVariable.
+    w_tensor = getter(name, *args, **kwargs)
+
+    if len(w_tensor.get_shape()) < 2:
+      raise ValueError(
+          'Spectral norm can only be applied to multi-dimensional tensors')
+
+    return spectral_normalize(
+        w_tensor,
+        power_iteration_rounds=power_iteration_rounds,
+        name=(name + '/spectral_normalize'))
+
+  return _internal_getter
+
+
+@contextlib.contextmanager
+def keras_spectral_normalization(name_filter=_default_name_filter,
+                                 power_iteration_rounds=1):
+  """A context manager that enables Spectral Normalization for Keras.
+
+  Keras doesn't respect the `custom_getter` in the VariableScope, so this is a
+  bit of a hack to make things work.
+
+  Usage:
+    with keras_spectral_normalization():
+      net = discriminator_fn(net)
+
+  Args:
+    name_filter: Optionally, a method that takes a Variable name as input and
+      returns whether this Variable should be normalized.
+    power_iteration_rounds: The number of iterations of the power method to
+      perform per step. A higher number yeilds a better approximation of the
+      true spectral norm.
+
+  Yields:
+    A context manager that wraps the standard Keras variable creation method
+    with the `spectral_normalization_custom_getter`.
+  """
+  original_make_variable = keras_base_layer_utils.make_variable
+  sn_getter = spectral_normalization_custom_getter(
+      name_filter=name_filter, power_iteration_rounds=power_iteration_rounds)
+
+  def make_variable_wrapper(name, *args, **kwargs):
+    return sn_getter(original_make_variable, name, *args, **kwargs)
+
+  keras_base_layer_utils.make_variable = make_variable_wrapper
+
+  yield
+
+  keras_base_layer_utils.make_variable = original_make_variable
diff --git a/tensorflow/contrib/gan/python/features/python/spectral_normalization_test.py b/tensorflow/contrib/gan/python/features/python/spectral_normalization_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ea21f70ec01950cfef5e4fa851c78b219d6062f
--- /dev/null
+++ b/tensorflow/contrib/gan/python/features/python/spectral_normalization_test.py
@@ -0,0 +1,354 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for features.spectral_normalization."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib import slim
+from tensorflow.contrib.gan.python.features.python import spectral_normalization_impl as spectral_normalization
+from tensorflow.contrib.layers.python.layers import layers as contrib_layers
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.keras.layers import convolutional as keras_convolutional
+from tensorflow.python.keras.layers import core as keras_core
+from tensorflow.python.layers import convolutional as layers_convolutional
+from tensorflow.python.layers import core as layers_core
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class SpectralNormalizationTest(test.TestCase):
+
+  def testComputeSpectralNorm(self):
+    weights = variable_scope.get_variable(
+        'w', dtype=dtypes.float32, shape=[2, 3, 50, 100])
+    weights = math_ops.multiply(weights, 10.0)
+    s = linalg_ops.svd(
+        array_ops.reshape(weights, [-1, weights.shape[-1]]), compute_uv=False)
+    true_sn = s[..., 0]
+    estimated_sn = spectral_normalization.compute_spectral_norm(weights)
+
+    with self.cached_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      np_true_sn = sess.run(true_sn)
+      for i in range(50):
+        est = sess.run(estimated_sn)
+        if i < 1:
+          np_est_1 = est
+        if i < 4:
+          np_est_5 = est
+        if i < 9:
+          np_est_10 = est
+        np_est_50 = est
+
+      # Check that the estimate improves with more iterations.
+      self.assertAlmostEqual(np_true_sn, np_est_50, 0)
+      self.assertGreater(
+          abs(np_true_sn - np_est_10), abs(np_true_sn - np_est_50))
+      self.assertGreater(
+          abs(np_true_sn - np_est_5), abs(np_true_sn - np_est_10))
+      self.assertGreater(abs(np_true_sn - np_est_1), abs(np_true_sn - np_est_5))
+
+  def testSpectralNormalize(self):
+    weights = variable_scope.get_variable(
+        'w', dtype=dtypes.float32, shape=[2, 3, 50, 100])
+    weights = math_ops.multiply(weights, 10.0)
+    normalized_weights = spectral_normalization.spectral_normalize(
+        weights, power_iteration_rounds=1)
+
+    unnormalized_sigma = linalg_ops.svd(
+        array_ops.reshape(weights, [-1, weights.shape[-1]]),
+        compute_uv=False)[..., 0]
+    normalized_sigma = linalg_ops.svd(
+        array_ops.reshape(normalized_weights, [-1, weights.shape[-1]]),
+        compute_uv=False)[..., 0]
+
+    with self.cached_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      s0 = sess.run(unnormalized_sigma)
+
+      for i in range(50):
+        sigma = sess.run(normalized_sigma)
+        if i < 1:
+          s1 = sigma
+        if i < 5:
+          s5 = sigma
+        if i < 10:
+          s10 = sigma
+        s50 = sigma
+
+      self.assertAlmostEqual(1., s50, 0)
+      self.assertGreater(abs(s10 - 1.), abs(s50 - 1.))
+      self.assertGreater(abs(s5 - 1.), abs(s10 - 1.))
+      self.assertGreater(abs(s1 - 1.), abs(s5 - 1.))
+      self.assertGreater(abs(s0 - 1.), abs(s1 - 1.))
+
+  def _testLayerHelper(self, build_layer_fn, w_shape, b_shape, is_keras=False):
+    x = array_ops.placeholder(dtypes.float32, shape=[2, 10, 10, 3])
+
+    w_initial = np.random.randn(*w_shape) * 10
+    w_initializer = init_ops.constant_initializer(w_initial)
+    b_initial = np.random.randn(*b_shape)
+    b_initializer = init_ops.constant_initializer(b_initial)
+
+    if is_keras:
+      context_manager = spectral_normalization.keras_spectral_normalization()
+    else:
+      getter = spectral_normalization.spectral_normalization_custom_getter()
+      context_manager = variable_scope.variable_scope('', custom_getter=getter)
+
+    with context_manager:
+      (net,
+       expected_normalized_vars, expected_not_normalized_vars) = build_layer_fn(
+           x, w_initializer, b_initializer)
+
+    x_data = np.random.rand(*x.shape)
+
+    with self.cached_session() as sess:
+      sess.run(variables.global_variables_initializer())
+
+      # Before running a forward pass we still expect the variables values to
+      # differ from the initial value because of the normalizer.
+      w_befores = []
+      for name, var in expected_normalized_vars.items():
+        w_before = sess.run(var)
+        w_befores.append(w_before)
+        self.assertFalse(
+            np.allclose(w_initial, w_before),
+            msg=('%s appears not to be normalized. Before: %s After: %s' %
+                 (name, w_initial, w_before)))
+
+      # Not true for the unnormalized variables.
+      for name, var in expected_not_normalized_vars.items():
+        b_before = sess.run(var)
+        self.assertTrue(
+            np.allclose(b_initial, b_before),
+            msg=('%s appears to be unexpectedly normalized. '
+                 'Before: %s After: %s' % (name, b_initial, b_before)))
+
+      # Run a bunch of forward passes.
+      for _ in range(1000):
+        _ = sess.run(net, feed_dict={x: x_data})
+
+      # We expect this to have improved the estimate of the spectral norm,
+      # which should have changed the variable values and brought them close
+      # to the true Spectral Normalized values.
+      _, s, _ = np.linalg.svd(w_initial.reshape([-1, 3]))
+      exactly_normalized = w_initial / s[0]
+      for w_before, (name, var) in zip(w_befores,
+                                       expected_normalized_vars.items()):
+        w_after = sess.run(var)
+        self.assertFalse(
+            np.allclose(w_before, w_after, rtol=1e-8, atol=1e-8),
+            msg=('%s did not improve over many iterations. '
+                 'Before: %s After: %s' % (name, w_before, w_after)))
+        self.assertAllClose(
+            exactly_normalized,
+            w_after,
+            rtol=1e-4,
+            atol=1e-4,
+            msg=('Estimate of spectral norm for %s was innacurate. '
+                 'Normalized matrices do not match.'
+                 'Estimate: %s Actual: %s' % (name, w_after,
+                                              exactly_normalized)))
+
+  def testConv2D_Layers(self):
+
+    def build_layer_fn(x, w_initializer, b_initializer):
+      layer = layers_convolutional.Conv2D(
+          filters=3,
+          kernel_size=3,
+          padding='same',
+          kernel_initializer=w_initializer,
+          bias_initializer=b_initializer)
+      net = layer.apply(x)
+      expected_normalized_vars = {'tf.layers.Conv2d.kernel': layer.kernel}
+      expected_not_normalized_vars = {'tf.layers.Conv2d.bias': layer.bias}
+
+      return net, expected_normalized_vars, expected_not_normalized_vars
+
+    self._testLayerHelper(build_layer_fn, (3, 3, 3, 3), (3,))
+
+  def testConv2D_ContribLayers(self):
+
+    def build_layer_fn(x, w_initializer, b_initializer):
+      var_collection = {
+          'weights': ['CONTRIB_LAYERS_CONV2D_WEIGHTS'],
+          'biases': ['CONTRIB_LAYERS_CONV2D_BIASES']
+      }
+      net = contrib_layers.conv2d(
+          x,
+          3,
+          3,
+          weights_initializer=w_initializer,
+          biases_initializer=b_initializer,
+          variables_collections=var_collection)
+      weight_vars = ops.get_collection('CONTRIB_LAYERS_CONV2D_WEIGHTS')
+      self.assertEquals(1, len(weight_vars))
+      bias_vars = ops.get_collection('CONTRIB_LAYERS_CONV2D_BIASES')
+      self.assertEquals(1, len(bias_vars))
+      expected_normalized_vars = {
+          'contrib.layers.conv2d.weights': weight_vars[0]
+      }
+      expected_not_normalized_vars = {
+          'contrib.layers.conv2d.bias': bias_vars[0]
+      }
+
+      return net, expected_normalized_vars, expected_not_normalized_vars
+
+    self._testLayerHelper(build_layer_fn, (3, 3, 3, 3), (3,))
+
+  def testConv2D_Slim(self):
+
+    def build_layer_fn(x, w_initializer, b_initializer):
+      var_collection = {
+          'weights': ['SLIM_CONV2D_WEIGHTS'],
+          'biases': ['SLIM_CONV2D_BIASES']
+      }
+      net = slim.conv2d(
+          x,
+          3,
+          3,
+          weights_initializer=w_initializer,
+          biases_initializer=b_initializer,
+          variables_collections=var_collection)
+      weight_vars = ops.get_collection('SLIM_CONV2D_WEIGHTS')
+      self.assertEquals(1, len(weight_vars))
+      bias_vars = ops.get_collection('SLIM_CONV2D_BIASES')
+      self.assertEquals(1, len(bias_vars))
+      expected_normalized_vars = {'slim.conv2d.weights': weight_vars[0]}
+      expected_not_normalized_vars = {'slim.conv2d.bias': bias_vars[0]}
+
+      return net, expected_normalized_vars, expected_not_normalized_vars
+
+    self._testLayerHelper(build_layer_fn, (3, 3, 3, 3), (3,))
+
+  def testConv2D_Keras(self):
+
+    def build_layer_fn(x, w_initializer, b_initializer):
+      layer = keras_convolutional.Conv2D(
+          filters=3,
+          kernel_size=3,
+          padding='same',
+          kernel_initializer=w_initializer,
+          bias_initializer=b_initializer)
+      net = layer.apply(x)
+      expected_normalized_vars = {'keras.layers.Conv2d.kernel': layer.kernel}
+      expected_not_normalized_vars = {'keras.layers.Conv2d.bias': layer.bias}
+
+      return net, expected_normalized_vars, expected_not_normalized_vars
+
+    self._testLayerHelper(build_layer_fn, (3, 3, 3, 3), (3,), is_keras=True)
+
+  def testFC_Layers(self):
+
+    def build_layer_fn(x, w_initializer, b_initializer):
+      x = layers_core.Flatten()(x)
+      layer = layers_core.Dense(
+          units=3,
+          kernel_initializer=w_initializer,
+          bias_initializer=b_initializer)
+      net = layer.apply(x)
+      expected_normalized_vars = {'tf.layers.Dense.kernel': layer.kernel}
+      expected_not_normalized_vars = {'tf.layers.Dense.bias': layer.bias}
+
+      return net, expected_normalized_vars, expected_not_normalized_vars
+
+    self._testLayerHelper(build_layer_fn, (300, 3), (3,))
+
+  def testFC_ContribLayers(self):
+
+    def build_layer_fn(x, w_initializer, b_initializer):
+      var_collection = {
+          'weights': ['CONTRIB_LAYERS_FC_WEIGHTS'],
+          'biases': ['CONTRIB_LAYERS_FC_BIASES']
+      }
+      x = contrib_layers.flatten(x)
+      net = contrib_layers.fully_connected(
+          x,
+          3,
+          weights_initializer=w_initializer,
+          biases_initializer=b_initializer,
+          variables_collections=var_collection)
+      weight_vars = ops.get_collection('CONTRIB_LAYERS_FC_WEIGHTS')
+      self.assertEquals(1, len(weight_vars))
+      bias_vars = ops.get_collection('CONTRIB_LAYERS_FC_BIASES')
+      self.assertEquals(1, len(bias_vars))
+      expected_normalized_vars = {
+          'contrib.layers.fully_connected.weights': weight_vars[0]
+      }
+      expected_not_normalized_vars = {
+          'contrib.layers.fully_connected.bias': bias_vars[0]
+      }
+
+      return net, expected_normalized_vars, expected_not_normalized_vars
+
+    self._testLayerHelper(build_layer_fn, (300, 3), (3,))
+
+  def testFC_Slim(self):
+
+    def build_layer_fn(x, w_initializer, b_initializer):
+      var_collection = {
+          'weights': ['SLIM_FC_WEIGHTS'],
+          'biases': ['SLIM_FC_BIASES']
+      }
+      x = slim.flatten(x)
+      net = slim.fully_connected(
+          x,
+          3,
+          weights_initializer=w_initializer,
+          biases_initializer=b_initializer,
+          variables_collections=var_collection)
+      weight_vars = ops.get_collection('SLIM_FC_WEIGHTS')
+      self.assertEquals(1, len(weight_vars))
+      bias_vars = ops.get_collection('SLIM_FC_BIASES')
+      self.assertEquals(1, len(bias_vars))
+      expected_normalized_vars = {
+          'slim.fully_connected.weights': weight_vars[0]
+      }
+      expected_not_normalized_vars = {'slim.fully_connected.bias': bias_vars[0]}
+
+      return net, expected_normalized_vars, expected_not_normalized_vars
+
+    self._testLayerHelper(build_layer_fn, (300, 3), (3,))
+
+  def testFC_Keras(self):
+
+    def build_layer_fn(x, w_initializer, b_initializer):
+      x = keras_core.Flatten()(x)
+      layer = keras_core.Dense(
+          units=3,
+          kernel_initializer=w_initializer,
+          bias_initializer=b_initializer)
+      net = layer.apply(x)
+      expected_normalized_vars = {'keras.layers.Dense.kernel': layer.kernel}
+      expected_not_normalized_vars = {'keras.layers.Dense.bias': layer.bias}
+
+      return net, expected_normalized_vars, expected_not_normalized_vars
+
+    self._testLayerHelper(build_layer_fn, (300, 3), (3,), is_keras=True)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/gan/python/losses/python/losses_impl.py b/tensorflow/contrib/gan/python/losses/python/losses_impl.py
index a0a86c6337eefa756a209635faa70db686a36247..1f1ae2df4d6def618e86aced3296ac89c836eab7 100644
--- a/tensorflow/contrib/gan/python/losses/python/losses_impl.py
+++ b/tensorflow/contrib/gan/python/losses/python/losses_impl.py
@@ -28,7 +28,7 @@ wasserstein_gradient_penalty
 All losses must be able to accept 1D or 2D Tensors, so as to be compatible with
 patchGAN style losses (https://arxiv.org/abs/1611.07004).
 
-To make these losses usable in the TFGAN framework, please create a tuple
+To make these losses usable in the TF-GAN framework, please create a tuple
 version of the losses with `losses_utils.py`.
 """
 
@@ -38,6 +38,7 @@ from __future__ import print_function
 
 
 from tensorflow.contrib.framework.python.ops import variables as contrib_variables_lib
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
@@ -69,6 +70,10 @@ __all__ = [
 ]
 
 
+def _to_float(tensor):
+  return math_ops.cast(tensor, dtypes.float32)
+
+
 # Wasserstein losses from `Wasserstein GAN` (https://arxiv.org/abs/1701.07875).
 def wasserstein_generator_loss(
     discriminator_gen_outputs,
@@ -98,7 +103,7 @@ def wasserstein_generator_loss(
   """
   with ops.name_scope(scope, 'generator_wasserstein_loss', (
       discriminator_gen_outputs, weights)) as scope:
-    discriminator_gen_outputs = math_ops.to_float(discriminator_gen_outputs)
+    discriminator_gen_outputs = _to_float(discriminator_gen_outputs)
 
     loss = - discriminator_gen_outputs
     loss = losses.compute_weighted_loss(
@@ -144,8 +149,8 @@ def wasserstein_discriminator_loss(
   with ops.name_scope(scope, 'discriminator_wasserstein_loss', (
       discriminator_real_outputs, discriminator_gen_outputs, real_weights,
       generated_weights)) as scope:
-    discriminator_real_outputs = math_ops.to_float(discriminator_real_outputs)
-    discriminator_gen_outputs = math_ops.to_float(discriminator_gen_outputs)
+    discriminator_real_outputs = _to_float(discriminator_real_outputs)
+    discriminator_gen_outputs = _to_float(discriminator_gen_outputs)
     discriminator_real_outputs.shape.assert_is_compatible_with(
         discriminator_gen_outputs.shape)
 
@@ -320,7 +325,7 @@ def wasserstein_gradient_penalty(
     generated_data: Output of the generator.
     generator_inputs: Exact argument to pass to the generator, which is used
       as optional conditioning to the discriminator.
-    discriminator_fn: A discriminator function that conforms to TFGAN API.
+    discriminator_fn: A discriminator function that conforms to TF-GAN API.
     discriminator_scope: If not `None`, reuse discriminators from this scope.
     epsilon: A small positive number added for numerical stability when
       computing the gradient norm.
@@ -647,7 +652,7 @@ def least_squares_generator_loss(
   """
   with ops.name_scope(scope, 'lsq_generator_loss',
                       (discriminator_gen_outputs, real_label)) as scope:
-    discriminator_gen_outputs = math_ops.to_float(discriminator_gen_outputs)
+    discriminator_gen_outputs = _to_float(discriminator_gen_outputs)
     loss = math_ops.squared_difference(
         discriminator_gen_outputs, real_label) / 2.0
     loss = losses.compute_weighted_loss(
@@ -702,8 +707,8 @@ def least_squares_discriminator_loss(
   """
   with ops.name_scope(scope, 'lsq_discriminator_loss',
                       (discriminator_gen_outputs, real_label)) as scope:
-    discriminator_real_outputs = math_ops.to_float(discriminator_real_outputs)
-    discriminator_gen_outputs = math_ops.to_float(discriminator_gen_outputs)
+    discriminator_real_outputs = _to_float(discriminator_real_outputs)
+    discriminator_gen_outputs = _to_float(discriminator_gen_outputs)
     discriminator_real_outputs.shape.assert_is_compatible_with(
         discriminator_gen_outputs.shape)
 
diff --git a/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py b/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py
index e3c780ac1a0f0ef15ff993bd3a9bf9730dcb45b8..44ee0f52696dc1cdcd91286a80b2d4b42be93a4d 100644
--- a/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py
+++ b/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py
@@ -403,7 +403,9 @@ class _PenaltyTest(object):
   def test_all_correct(self):
     loss = self._penalty_fn(**self._kwargs)
     self.assertEqual(self._expected_dtype, loss.dtype)
-    self.assertEqual(self._expected_op_name, loss.op.name)
+    # NOTE: Op names will change, it is inappropriate to include them in tests.
+    # See go/tf-breaking-change.
+    # self.assertEqual(self._expected_op_name, loss.op.name)
     with self.cached_session():
       variables.global_variables_initializer().run()
       self.assertAlmostEqual(self._expected_loss, loss.eval(), 6)
diff --git a/tensorflow/contrib/gan/python/losses/python/tuple_losses_impl.py b/tensorflow/contrib/gan/python/losses/python/tuple_losses_impl.py
index 221c70c38bd432a6be7f6cda9c6700aa2255821f..76e57df7f646547037b3461ac44f7ee5b971406c 100644
--- a/tensorflow/contrib/gan/python/losses/python/tuple_losses_impl.py
+++ b/tensorflow/contrib/gan/python/losses/python/tuple_losses_impl.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TFGAN utilities for loss functions that accept GANModel namedtuples.
+"""TF-GAN utilities for loss functions that accept GANModel namedtuples.
 
 The losses and penalties in this file all correspond to losses in
 `losses_impl.py`. Losses in that file take individual arguments, whereas in this
diff --git a/tensorflow/contrib/gan/python/namedtuples.py b/tensorflow/contrib/gan/python/namedtuples.py
index 969b68449d9c82f9f9144a8657cd8932b38fd0f7..73dfee4fdeec87cf0bac5eb675fd02a64a9ad7f5 100644
--- a/tensorflow/contrib/gan/python/namedtuples.py
+++ b/tensorflow/contrib/gan/python/namedtuples.py
@@ -12,10 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Named tuples for TFGAN.
+"""Named tuples for TF-GAN.
 
-TFGAN training occurs in four steps, and each step communicates with the next
-step via one of these named tuples. At each step, you can either use a TFGAN
+TF-GAN training occurs in four steps, and each step communicates with the next
+step via one of these named tuples. At each step, you can either use a TF-GAN
 helper function in `train.py`, or you can manually construct a tuple.
 """
 
diff --git a/tensorflow/contrib/gan/python/train.py b/tensorflow/contrib/gan/python/train.py
index 4c7bee41b33ce1fee46d374ca5fd1c0b603762f9..9bff8090d93d3ad7def69726073accfb234ef301 100644
--- a/tensorflow/contrib/gan/python/train.py
+++ b/tensorflow/contrib/gan/python/train.py
@@ -12,12 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""The TFGAN project provides a lightweight GAN training/testing framework.
+"""The TF-GAN project provides a lightweight GAN training/testing framework.
 
 This file contains the core helper functions to create and train a GAN model.
 See the README or examples in `tensorflow_models` for details on how to use.
 
-TFGAN training occurs in four steps:
+TF-GAN training occurs in four steps:
 1) Create a model
 2) Add a loss
 3) Create train ops
@@ -645,9 +645,10 @@ def gan_loss(
         type(model))
 
   # Optionally create pooled model.
-  pooled_model = (
-      _tensor_pool_adjusted_model(model, tensor_pool_fn)
-      if tensor_pool_fn else model)
+  if tensor_pool_fn:
+    pooled_model = _tensor_pool_adjusted_model(model, tensor_pool_fn)
+  else:
+    pooled_model = model
 
   # Create standard losses.
   gen_loss = generator_loss_fn(model, add_summaries=add_summaries)
@@ -665,10 +666,11 @@ def gan_loss(
   if _use_aux_loss(mutual_information_penalty_weight):
     gen_info_loss = tfgan_losses.mutual_information_penalty(
         model, add_summaries=add_summaries)
-    dis_info_loss = (
-        gen_info_loss
-        if tensor_pool_fn is None else tfgan_losses.mutual_information_penalty(
-            pooled_model, add_summaries=add_summaries))
+    if tensor_pool_fn is None:
+      dis_info_loss = gen_info_loss
+    else:
+      dis_info_loss = tfgan_losses.mutual_information_penalty(
+          pooled_model, add_summaries=add_summaries)
     gen_loss += mutual_information_penalty_weight * gen_info_loss
     dis_loss += mutual_information_penalty_weight * dis_info_loss
   if _use_aux_loss(aux_cond_generator_weight):
@@ -755,7 +757,9 @@ def cyclegan_loss(
 
   return namedtuples.CycleGANLoss(loss_x2y, loss_y2x)
 
-
+# Begin google-internal
+# The four major parts can be found here: http://screen/tMRMBAohDYG.
+# End google-internal
 def stargan_loss(
     model,
     generator_loss_fn=tfgan_losses.stargan_generator_loss_wrapper(
@@ -774,8 +778,6 @@ def stargan_loss(
     add_summaries=True):
   """StarGAN Loss.
 
-  The four major part can be found here: http://screen/tMRMBAohDYG.
-
   Args:
     model: (StarGAN) Model output of the stargan_model() function call.
     generator_loss_fn: The loss function on the generator. Takes a
@@ -929,7 +931,7 @@ def gan_train_ops(
     **kwargs):
   """Returns GAN train ops.
 
-  The highest-level call in TFGAN. It is composed of functions that can also
+  The highest-level call in TF-GAN. It is composed of functions that can also
   be called, should a user require more control over some part of the GAN
   training process.
 
diff --git a/tensorflow/contrib/gdr/BUILD b/tensorflow/contrib/gdr/BUILD
index e534fdc17749974ebe713c2730682bea6d7a85e4..bf8b66dcfa5e44a03107cdf1ef8b04e1dbff4a9c 100644
--- a/tensorflow/contrib/gdr/BUILD
+++ b/tensorflow/contrib/gdr/BUILD
@@ -17,11 +17,6 @@ filegroup(
     ]),
 )
 
-load(
-    "//tensorflow:tensorflow.bzl",
-    "tf_cuda_library",
-)
-
 # For platform specific build config
 load(
     "//tensorflow/core:platform/default/build_config.bzl",
@@ -37,7 +32,7 @@ tf_proto_library_cc(
     ],
 )
 
-tf_cuda_library(
+cc_library(
     name = "gdr_memory_manager",
     srcs = ["gdr_memory_manager.cc"],
     hdrs = ["gdr_memory_manager.h"],
@@ -58,7 +53,7 @@ tf_cuda_library(
     ],
 )
 
-tf_cuda_library(
+cc_library(
     name = "gdr_worker",
     srcs = ["gdr_worker.cc"],
     hdrs = ["gdr_worker.h"],
@@ -66,7 +61,6 @@ tf_cuda_library(
         ":gdr_memory_manager",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
-        "//tensorflow/core:gpu_runtime",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core/distributed_runtime:graph_mgr",
@@ -100,15 +94,37 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "gdr_collective_executor_mgr",
+    srcs = ["gdr_collective_executor_mgr.cc"],
+    hdrs = ["gdr_collective_executor_mgr.h"],
+    deps = [
+        ":gdr_memory_manager",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/distributed_runtime:cancellable_call",
+        "//tensorflow/core/distributed_runtime:collective_param_resolver_distributed",
+        "//tensorflow/core/distributed_runtime:device_resolver_distributed",
+        "//tensorflow/core/distributed_runtime:request_id",
+        "//tensorflow/core/distributed_runtime:rpc_collective_executor_mgr",
+        "//tensorflow/core/distributed_runtime:worker_cache",
+    ],
+)
+
 cc_library(
     name = "gdr_server_lib",
     srcs = ["gdr_server_lib.cc"],
     hdrs = ["gdr_server_lib.h"],
     linkstatic = 1,  # Seems to be needed since alwayslink is broken in bazel
     deps = [
+        ":gdr_collective_executor_mgr",
         ":gdr_memory_manager",
         ":gdr_rendezvous_mgr",
         ":gdr_worker",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core/distributed_runtime:collective_param_resolver_distributed",
+        "//tensorflow/core/distributed_runtime:device_resolver_distributed",
         "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib",
     ],
     alwayslink = 1,
diff --git a/tensorflow/contrib/gdr/README.md b/tensorflow/contrib/gdr/README.md
index 8242d93f129904828a11b61d48f2df8fb0f88bc3..711adc865f37fc84550e4b45d9f0c7fff421a0dc 100644
--- a/tensorflow/contrib/gdr/README.md
+++ b/tensorflow/contrib/gdr/README.md
@@ -114,7 +114,16 @@ Caveats
 
 In current implementation, only tensors that reside in host memory or in GPU memory such that the GPU is adjacent to an RDMA capable NIC will use direct RDMA as its transport. When RDMA is available but not GDR, a temporary tensor copy on host memory will be used as RDMA source/destination (and copied from/to the target device). When there is no RDMA device present, it can even fallback to the original gRPC runtime. While it is theoretically possible to mix GDR enabled TF with non-GDR deployments in the same job, make sure the environment is properly setup so the GDR mode is enabled whenever possible (i.e. do not fall back to gRPC when it is not absolutely necessary).
 
-In the original design (as in the reference), tensor buffers are only registered to NIC when we could determine that the tensor will be either a source of Send or a sink of Recv across physical machine boundary. However, to implement the precise allocations, we need to change all the devices to possibly return a NIC compatible allocator. As GDR is currently in contrib, we would like to avoid the unnecessary code disruption to the TF core, so we allocate all tensors from NIC-registered buffers using a BFC allocator. This behaviour is similar to the effect of enabling the extra GPU option `force_gpu_compatible`, which allocate all host tensors in GPU-registered buffers no matter they will be transferred from/to GPUs or not.
+In the original design (as in the reference), tensor buffers are only registered
+to NIC when we could determine that the tensor will be either a source of Send
+or a sink of Recv across physical machine boundary. However, to implement the
+precise allocations, we need to change all the devices to possibly return a NIC
+compatible allocator. As GDR is currently in contrib, we would like to avoid the
+unnecessary code disruption to the TF core, so we allocate all tensors from
+NIC-registered buffers using a BFC allocator. This behavior is similar to the
+effect of enabling the extra GPU option `force_gpu_compatible`, which allocate
+all host tensors in GPU-registered buffers no matter they will be transferred
+from/to GPUs or not.
 
 Reference
 ===
diff --git a/tensorflow/contrib/gdr/gdr.proto b/tensorflow/contrib/gdr/gdr.proto
index c0b89245b150bfa49cb527d25b6e1f324f353b25..bd438787c3374be6ead4f6233101fd1f548643ea 100644
--- a/tensorflow/contrib/gdr/gdr.proto
+++ b/tensorflow/contrib/gdr/gdr.proto
@@ -9,5 +9,4 @@ message RemoteMemoryRegion {
   uint64 addr = 3;
   uint32 rkey = 4;
   uint32 tensor_key = 5;
-  uint64 checksum = 6;
 }
diff --git a/tensorflow/contrib/gdr/gdr_collective_executor_mgr.cc b/tensorflow/contrib/gdr/gdr_collective_executor_mgr.cc
new file mode 100644
index 0000000000000000000000000000000000000000..755cbdff31cd7ca31579e0d64399d681dc24ad81
--- /dev/null
+++ b/tensorflow/contrib/gdr/gdr_collective_executor_mgr.cc
@@ -0,0 +1,159 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/gdr/gdr_collective_executor_mgr.h"
+
+#include "tensorflow/core/common_runtime/base_collective_executor.h"
+#include "tensorflow/core/common_runtime/collective_executor_mgr.h"
+#include "tensorflow/core/common_runtime/collective_rma_local.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/distributed_runtime/cancellable_call.h"
+#include "tensorflow/core/distributed_runtime/collective_param_resolver_distributed.h"
+#include "tensorflow/core/distributed_runtime/device_resolver_distributed.h"
+#include "tensorflow/core/distributed_runtime/request_id.h"
+#include "tensorflow/core/distributed_runtime/worker_cache.h"
+#include "tensorflow/core/lib/random/random.h"
+
+namespace tensorflow {
+
+class WorkerCacheInterface;
+
+namespace {
+
+class RecvBufCall : public CancellableCall {
+ public:
+  RecvBufCall(int64 step_id, const string& peer_device, const string& peer_task,
+              const string& key, Device* to_device,
+              DeviceContext* to_device_ctx,
+              const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor,
+              const DeviceLocality& client_locality,
+              const DeviceLocality& server_locality,
+              CancellationManager* cancel_mgr, WorkerCacheInterface* wc)
+      : CancellableCall(cancel_mgr, peer_task, wc) {
+    req_.set_step_id(step_id);
+    req_.set_buf_rendezvous_key(key);
+    *req_.mutable_client_locality() = client_locality;
+    *req_.mutable_server_locality() = server_locality;
+    req_.set_num_bytes(to_tensor->TotalBytes());
+    req_.set_buf_ptr(reinterpret_cast<int64>(DMAHelper::base(to_tensor)));
+    req_.set_src_device(peer_device);
+    req_.set_dst_device(to_device->name());
+    req_.set_request_id(GetUniqueRequestId());
+  }
+
+  ~RecvBufCall() override {}
+
+  void IssueCall(const StatusCallback& done) override {
+    wi_->RecvBufAsync(&opts_, &req_, &resp_, done);
+  }
+
+  RecvBufRequest req_;
+  RecvBufResponse resp_;
+};
+
+class CollectiveRemoteAccessDistributed : public CollectiveRemoteAccessLocal {
+ public:
+  CollectiveRemoteAccessDistributed(const DeviceMgr* dev_mgr,
+                                    DeviceResolverInterface* dev_resolver,
+                                    WorkerCacheInterface* worker_cache,
+                                    int64 step_id,
+                                    RemoteMemoryManager* remote_memory_manager)
+      : CollectiveRemoteAccessLocal(dev_mgr, dev_resolver, step_id),
+        worker_cache_(worker_cache),
+        remote_memory_manager_(remote_memory_manager) {}
+
+  ~CollectiveRemoteAccessDistributed() override {}
+
+  void RecvFromPeer(const string& peer_device, const string& peer_task,
+                    bool peer_is_local, const string& key, Device* to_device,
+                    DeviceContext* to_device_ctx,
+                    const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor,
+                    const DeviceLocality& client_locality,
+                    int dev_to_dev_stream_index,
+                    const StatusCallback& done) override {
+    if (peer_is_local) {
+      CollectiveRemoteAccessLocal::RecvFromPeer(
+          peer_device, peer_task, peer_is_local, key, to_device, to_device_ctx,
+          to_alloc_attr, to_tensor, client_locality, dev_to_dev_stream_index,
+          done);
+      return;
+    }
+
+    // State that needs to be threaded through a couple of async calls
+    // in order to make this function completely non-blocking.
+    struct State {
+      DeviceLocality server_locality;
+      std::unique_ptr<RecvBufCall> call;
+    };
+    State* state = new State;
+
+    // Logic to be executed on the RecvBufAsync callback.
+    auto recv_buf_callback = [this, state, peer_task, to_device, to_alloc_attr,
+                              to_device_ctx, to_tensor, done](const Status& s) {
+      if (s.ok()) {
+        remote_memory_manager_->TensorFromTransportOptions(
+            to_tensor, state->call->resp_.transport_options(), to_device,
+            to_device_ctx, to_alloc_attr.on_host(), done);
+      }
+      if (!s.ok() && errors::IsFailedPrecondition(s)) {
+        dev_resolver_->ClearTask(peer_task);
+      }
+
+      delete state;
+    };
+
+    // Logic to execute once we have the device locality for the server-side
+    // device.
+    auto dev_locality_callback = [this, state, peer_device, peer_task, key,
+                                  to_device, to_device_ctx, to_alloc_attr,
+                                  to_tensor, client_locality,
+                                  recv_buf_callback](const Status& s) {
+      if (!s.ok()) {
+        recv_buf_callback(s);
+      } else {
+        state->call.reset(new RecvBufCall(
+            step_id_, peer_device, peer_task, key, to_device, to_device_ctx,
+            to_alloc_attr, to_tensor, client_locality, state->server_locality,
+            &cancel_mgr_, worker_cache_));
+        state->call->Start(recv_buf_callback);
+      }
+    };
+
+    dev_resolver_->GetLocalityAsync(
+        peer_device, peer_task, &state->server_locality, dev_locality_callback);
+  }
+
+  void StartAbort(const Status& s) override {
+    CollectiveRemoteAccessLocal::StartAbort(s);
+    cancel_mgr_.StartCancel();
+  }
+
+ protected:
+  WorkerCacheInterface* worker_cache_;  // Not owned
+  CancellationManager cancel_mgr_;
+  RemoteMemoryManager* remote_memory_manager_;
+};
+
+}  // namespace
+
+CollectiveExecutor* GdrCollectiveExecutorMgr::Create(int64 step_id) {
+  CollectiveRemoteAccessDistributed* rma =
+      new CollectiveRemoteAccessDistributed(dev_mgr_, dev_resolver_.get(),
+                                            worker_cache_, step_id,
+                                            remote_memory_manager_);
+  return new BaseCollectiveExecutor(this, rma, step_id, dev_mgr_,
+                                    &gpu_ring_order_);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/gdr/gdr_collective_executor_mgr.h b/tensorflow/contrib/gdr/gdr_collective_executor_mgr.h
new file mode 100644
index 0000000000000000000000000000000000000000..1417e51e82c31035f058e8e9b546e04fb0ad97b8
--- /dev/null
+++ b/tensorflow/contrib/gdr/gdr_collective_executor_mgr.h
@@ -0,0 +1,56 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_GDR_GDR_COLLECTIVE_EXECUTOR_MGR_H_
+#define TENSORFLOW_CONTRIB_GDR_GDR_COLLECTIVE_EXECUTOR_MGR_H_
+
+#include "tensorflow/contrib/gdr/gdr_memory_manager.h"
+#include "tensorflow/core/distributed_runtime/collective_param_resolver_distributed.h"
+#include "tensorflow/core/distributed_runtime/device_resolver_distributed.h"
+#include "tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.h"
+#include "tensorflow/core/framework/collective.h"
+
+namespace tensorflow {
+class ConfigProto;
+class DeviceMgr;
+class WorkerCacheInterface;
+class StepSequenceRequest;
+class StepSequenceResponse;
+
+// An implementation of CollectiveExecutorMgr for a distributed environment
+// that uses WorkerInterface::RecvBufAsync to route data transfers over RDMA.
+class GdrCollectiveExecutorMgr : public RpcCollectiveExecutorMgr {
+ public:
+  GdrCollectiveExecutorMgr(
+      const ConfigProto& config, const DeviceMgr* dev_mgr,
+      std::unique_ptr<DeviceResolverDistributed> dev_resolver,
+      std::unique_ptr<CollectiveParamResolverDistributed> param_resolver,
+      WorkerCacheInterface* worker_cache, const string& task_name,
+      RemoteMemoryManager* remote_memory_manager)
+      : RpcCollectiveExecutorMgr(config, dev_mgr, std::move(dev_resolver),
+                                 std::move(param_resolver), worker_cache,
+                                 task_name),
+        remote_memory_manager_(remote_memory_manager) {}
+
+  ~GdrCollectiveExecutorMgr() override {}
+
+ protected:
+  virtual CollectiveExecutor* Create(int64 step_id) override;
+
+ private:
+  RemoteMemoryManager* remote_memory_manager_;  // Not owned.
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CONTRIB_GDR_GDR_COLLECTIVE_EXECUTOR_MGR_H_
diff --git a/tensorflow/contrib/gdr/gdr_memory_manager.cc b/tensorflow/contrib/gdr/gdr_memory_manager.cc
index 53587fcf3050f313c85485f77ce411cba7faccff..7321e973191c4cc45f88735c6be7f2f67fe71c39 100644
--- a/tensorflow/contrib/gdr/gdr_memory_manager.cc
+++ b/tensorflow/contrib/gdr/gdr_memory_manager.cc
@@ -26,17 +26,14 @@ limitations under the License.
 #include <fcntl.h>
 #include <rdma/rdma_cma.h>
 #include <rdma/rdma_verbs.h>
-#include <sys/epoll.h>
 
 #include "tensorflow/contrib/gdr/gdr.pb.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
-#include "tensorflow/core/common_runtime/process_state.h"
-#if GOOGLE_CUDA
 #include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"
-#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
-#endif  // GOOGLE_CUDA
+#include "tensorflow/core/common_runtime/process_state.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/numa.h"
@@ -76,15 +73,14 @@ int TryToReadNumaNode(ibv_device* device) {
 
   std::ifstream ifs(filename.c_str());
   string content;
-  CHECK(std::getline(ifs, content));
+  const auto& ret = std::getline(ifs, content);
+  if (!ret) {
+    return port::kNUMANoAffinity;
+  }
 
   int32 value;
   if (strings::safe_strto32(content, &value)) {
     if (value < 0) {
-      LOG(INFO) << "Successful NUMA node read from SysFS had negative value ("
-                << value
-                << "), but there must be at least one NUMA node"
-                   ", so returning NUMA node zero";
       return port::kNUMANoAffinity;
     }
     LOG(INFO) << "NUMA node for device: " << device->name << " is " << value;
@@ -114,7 +110,7 @@ class GdrMemoryManager : public RemoteMemoryManager {
  public:
   GdrMemoryManager(const string& host, const string& port);
 
-  virtual ~GdrMemoryManager();
+  virtual ~GdrMemoryManager() {}
 
   virtual Status Init() override;
 
@@ -140,7 +136,7 @@ class GdrMemoryManager : public RemoteMemoryManager {
     return ptr < reinterpret_cast<char*>(other->addr) + other->length;
   }
 
-  ibv_mr* FindMemoryRegion(void* addr, size_t length);
+  ibv_mr* FindMemoryRegion(const Tensor* tensor);
 
   void InsertMemoryRegion(void* addr, size_t length,
                           const std::string& allocator_name);
@@ -152,7 +148,6 @@ class GdrMemoryManager : public RemoteMemoryManager {
   const string port_;
   RdmaEndpointPtr listening_;
   std::atomic<bool> stopped_;
-  int epfd_;
   int numa_node_;
 
   // Server side endpoints
@@ -163,15 +158,19 @@ class GdrMemoryManager : public RemoteMemoryManager {
   std::atomic<TensorKey> next_key_;
 
   // Server side on-the-fly tensor buffers
-  mutex server_mu_;
-  std::map<TensorKey, const TensorBuffer*> tensor_buffers_
-      GUARDED_BY(server_mu_);
+  mutex buf_mu_;
+  std::map<TensorKey, const TensorBuffer*> tensor_buffers_ GUARDED_BY(buf_mu_);
 
   // Client side endpoints
   mutex client_mu_;
   std::map<std::pair<string, string>, RdmaEndpointPtr> clients_
       GUARDED_BY(client_mu_);
 
+  // Client side callbacks
+  mutex callback_mu_;
+  std::map<TensorKey, StatusCallback> tensor_callbacks_
+      GUARDED_BY(callback_mu_);
+
   // Managed memory regions
   mutex alloc_mu_;
   std::vector<MemoryRegionPtr> mrs_ GUARDED_BY(alloc_mu_);
@@ -184,16 +183,9 @@ GdrMemoryManager::GdrMemoryManager(const string& host, const string& port)
       port_(port),
       listening_(nullptr, EndpointDeleter),
       stopped_(true),
-      next_key_(0) {}
-
-GdrMemoryManager::~GdrMemoryManager() { close(epfd_); }
+      next_key_(static_cast<uint32_t>(random::New64())) {}
 
 Status GdrMemoryManager::Init() {
-  epfd_ = epoll_create1(0);
-  if (epfd_ == -1) {
-    return errors::Unavailable(strerror(errno), ": ", "epoll_create");
-  }
-
   rdma_addrinfo* addrinfo;
   rdma_addrinfo hints = {};
   hints.ai_port_space = RDMA_PS_TCP;
@@ -206,7 +198,7 @@ Status GdrMemoryManager::Init() {
 
   ibv_qp_init_attr init_attr = {};
   init_attr.qp_type = IBV_QPT_RC;
-  init_attr.cap.max_recv_wr = 32;
+  init_attr.cap.max_recv_wr = 1024;
   init_attr.cap.max_send_wr = 1;
   init_attr.cap.max_recv_sge = 1;
   init_attr.cap.max_send_sge = 1;
@@ -239,14 +231,6 @@ Status GdrMemoryManager::Init() {
                                "cannot set server to non-blocking mode");
   }
 
-  epoll_event event = {};
-  event.events = EPOLLIN | EPOLLPRI;
-  event.data.ptr = listening_.get();
-  if (epoll_ctl(epfd_, EPOLL_CTL_ADD, listening_->channel->fd, &event)) {
-    return errors::Unavailable(strerror(errno), ": ",
-                               "cannot add server to epoll");
-  }
-
   numa_node_ = TryToReadNumaNode(listening_->verbs->device);
 
   SubAllocator::Visitor alloc_visitor = [this](void* ptr, int numa_node,
@@ -265,121 +249,114 @@ Status GdrMemoryManager::Init() {
   ProcessState::singleton()->AddCPUFreeVisitor(free_visitor);
   LOG(INFO) << "Instrumenting CPU allocator(s)";
 
-#if GOOGLE_CUDA
   for (int numa_idx = 0; numa_idx < port::NUMANumNodes(); ++numa_idx) {
     GPUProcessState::singleton()->AddCUDAHostAllocVisitor(numa_idx,
                                                           alloc_visitor);
     GPUProcessState::singleton()->AddCUDAHostFreeVisitor(numa_idx,
                                                          free_visitor);
   }
+
   if (IsGDRAvailable()) {
     SubAllocator::Visitor cuda_alloc_visitor = [this](void* ptr, int gpu_id,
                                                       size_t num_bytes) {
       VLOG(2) << "Registering RDMA capable memory region on GPU " << gpu_id;
       InsertMemoryRegion(ptr, num_bytes, strings::StrCat("GPU:", gpu_id));
     };
-    for (int numa_idx = 0; numa_idx < port::NUMANumNodes(); ++numa_idx) {
-      GPUProcessState::singleton()->AddGPUAllocVisitor(numa_idx,
-                                                       cuda_alloc_visitor);
-    }
-    VLOG(1) << "Instrumenting GPU allocator(s) for all Numas";
+    GPUProcessState::singleton()->AddGPUAllocVisitor(numa_node_,
+                                                     cuda_alloc_visitor);
+    LOG(INFO) << "Instrumenting GPU allocator for NUMA " << numa_node_;
   }
-#endif  // GOOGLE_CUDA
+
   return Status::OK();
 }
 
 void GdrMemoryManager::Run() {
   stopped_ = false;
   while (!stopped_) {
-    epoll_event events[32];
-    int ret = epoll_wait(epfd_, events, 32, 1);
-    if (ret == -1) {
-      LOG(ERROR) << "epoll_wait: " << strerror(errno);
-      return;
-    }
-    for (int i = 0; i < ret; i++) {
-      rdma_cm_id* id = static_cast<rdma_cm_id*>(events[i].data.ptr);
-      if (id == listening_.get()) {
-        // Accept incoming connections
-        if (!rdma_get_request(listening_.get(), &id)) {
-          if (!rdma_accept(id, nullptr)) {
-            LOG(INFO) << "Accepted new RDMA connection";
-            if (ibv_req_notify_cq(id->recv_cq, 0)) {
-              LOG(ERROR) << strerror(errno) << ": ibv_req_notify_cq failed";
-              EndpointDeleter(id);
-              continue;
-            }
-            for (int i = 0; i < 32; i++) {
-              if (rdma_post_recvv(id, nullptr, nullptr, 0)) {
-                LOG(ERROR) << strerror(errno) << ": rdma_post_recvv failed";
-                EndpointDeleter(id);
-                continue;
-              }
-            }
-            int flags = fcntl(id->recv_cq_channel->fd, F_GETFL, 0);
-            if (fcntl(id->recv_cq_channel->fd, F_SETFL, flags | O_NONBLOCK)) {
-              LOG(ERROR) << strerror(errno)
-                         << ": cannot set server_client to non-blocking mode";
-              EndpointDeleter(id);
-              continue;
-            }
-            epoll_event event = {};
-            event.events = EPOLLIN | EPOLLPRI;
-            event.data.ptr = id;
-            if (epoll_ctl(epfd_, EPOLL_CTL_ADD, id->recv_cq_channel->fd,
-                          &event)) {
-              LOG(ERROR) << strerror(errno)
-                         << ": cannot add server client to epoll";
-              EndpointDeleter(id);
-              continue;
-            }
-            server_clients_.push_back({id, EndpointDeleter});
+    rdma_cm_id* id = nullptr;
+    // Accept incoming connections
+    if (!rdma_get_request(listening_.get(), &id)) {
+      if (!rdma_accept(id, nullptr)) {
+        LOG(INFO) << "Accepted new RDMA connection";
+        for (int i = 0; i < 1024; i++) {
+          if (rdma_post_recvv(id, nullptr, nullptr, 0)) {
+            LOG(ERROR) << strerror(errno) << ": rdma_post_recvv failed";
+            EndpointDeleter(id);
+            continue;
           }
         }
-      } else {
-        // Polling work completions
-        ibv_cq* cq;
-        void* context;
-        if (!ibv_get_cq_event(id->recv_cq_channel, &cq, &context)) {
-          ibv_ack_cq_events(id->recv_cq, 1);
-          if (ibv_req_notify_cq(id->recv_cq, 0)) {
-            LOG(ERROR) << strerror(errno) << ": ibv_req_notify_cq failed";
-            continue;
+        server_clients_.push_back({id, EndpointDeleter});
+      }
+    }
+    // Polling server side work completions
+    for (const auto& client : server_clients_) {
+      ibv_wc wc[32];
+      int ret = ibv_poll_cq(client->recv_cq, 32, wc);
+      if (ret < 0) {
+        LOG(ERROR) << "ibv_poll_cq failed";
+        continue;
+      }
+      for (int i = 0; i < ret; i++) {
+        if (wc[i].opcode != IBV_WC_RECV_RDMA_WITH_IMM) {
+          LOG(ERROR) << "Received unknown operation " << wc[i].opcode;
+        }
+        if (wc[i].status != 0) {
+          LOG(ERROR) << ibv_wc_status_str(wc[i].status);
+        }
+        TensorKey tensor_key = ntohl(wc[i].imm_data);
+
+        if (rdma_post_recvv(client.get(), nullptr, nullptr, 0)) {
+          perror("rdma_post_recvv");
+          LOG(ERROR) << "rdma_post_recvv failed";
+        }
+
+        mutex_lock l(buf_mu_);
+        auto iter = tensor_buffers_.find(tensor_key);
+        if (iter == std::end(tensor_buffers_)) {
+          LOG(ERROR) << "Cannot find tensor buffer for tensor key "
+                     << tensor_key;
+        } else {
+          const TensorBuffer* buffer = iter->second;
+          buffer->Unref();
+          tensor_buffers_.erase(iter);
+        }
+      }
+    }
+    // Polling client side work completions
+    if (client_mu_.try_lock()) {
+      for (const auto& client : clients_) {
+        ibv_wc wc[32];
+        int ret = ibv_poll_cq(client.second->send_cq, 32, wc);
+        for (int i = 0; i < ret; i++) {
+          Status s;
+          if (wc[i].status) {
+            s = errors::Unavailable(ibv_wc_status_str(wc[i].status));
+          } else {
+            s = Status::OK();
           }
-          ibv_wc wc[32];
-          int ret = ibv_poll_cq(id->recv_cq, 32, wc);
-          if (ret < 0) {
-            LOG(ERROR) << "ibv_poll_cq failed";
-            continue;
+          TensorKey key = wc[i].wr_id;
+
+          ibv_send_wr wr = {};
+          wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
+          wr.imm_data = htonl(key);
+          ibv_send_wr* bad_wr;
+          if (ibv_post_send(client.second->qp, &wr, &bad_wr)) {
+            LOG(ERROR) << strerror(errno)
+                       << ": ibv_post_send failed for tensor_key " << key;
           }
-          for (int i = 0; i < ret; i++) {
-            if (wc[i].opcode != IBV_WC_RECV_RDMA_WITH_IMM) {
-              LOG(ERROR) << "Received unknown operation " << wc[i].opcode;
-            }
-            if (wc[i].status != 0) {
-              LOG(ERROR) << ibv_wc_status_str(wc[i].status);
-            }
-            TensorKey tensor_key = ntohl(wc[i].imm_data);
-            {
-              mutex_lock l(server_mu_);
-              auto iter = tensor_buffers_.find(tensor_key);
-              if (iter == std::end(tensor_buffers_)) {
-                LOG(ERROR) << "Cannot find tensor buffer for tensor key "
-                           << tensor_key;
-              } else {
-                const TensorBuffer* buffer = iter->second;
-                buffer->Unref();
-                tensor_buffers_.erase(iter);
-              }
-            }
-            if (rdma_post_recvv(id, nullptr, nullptr, 0)) {
-              perror("rdma_post_recvv");
-              LOG(ERROR) << "rdma_post_recvv failed";
-              continue;
-            }
+
+          mutex_lock l(callback_mu_);
+          auto iter = tensor_callbacks_.find(key);
+          if (iter != std::end(tensor_callbacks_)) {
+            iter->second(s);
+            tensor_callbacks_.erase(iter);
+          } else {
+            LOG(WARNING) << "Cannot find client callback with tensor key "
+                         << key;
           }
         }
       }
+      client_mu_.unlock();
     }
   }
 }
@@ -390,116 +367,58 @@ void GdrMemoryManager::TransportOptionsFromTensor(
     ::google::protobuf::Any* mutable_transport_options, const Tensor& tensor,
     Device* device, DeviceContext* device_context, bool on_host,
     StatusCallback done) {
-  auto buffer = DMAHelper::buffer(&tensor);
-  void* addr = buffer->data();
-  size_t length = buffer->size();
-  if (length == 0) {
-    done(errors::Unavailable("Cannot register tensor buffer of size 0"));
-    return;
-  }
+  ibv_mr* mr = FindMemoryRegion(&tensor);
+  const TensorBuffer* buffer = DMAHelper::buffer(&tensor);
 
-  ibv_mr* mr = FindMemoryRegion(addr, length);
-
-#if GOOGLE_CUDA
-  if (device->tensorflow_gpu_device_info() && !on_host) {
-    Allocator* alloc = GPUProcessState::singleton()->GetCUDAHostAllocator(0);
-    Tensor* host_copy = new Tensor(alloc, tensor.dtype(), tensor.shape());
-    GPUUtil::CopyGPUTensorToCPU(
-        device, device_context, &tensor, host_copy,
-        [done, host_copy, mutable_transport_options, this](const Status& s) {
-          if (!s.ok()) {
-            done(s);
-            delete host_copy;
-            return;
-          }
-          auto buffer = DMAHelper::buffer(host_copy);
-          void* addr = buffer->data();
-          size_t length = buffer->size();
-          ibv_mr* mr = FindMemoryRegion(addr, length);
-
-          if (mr == nullptr) {
-            done(errors::Unavailable("Cannot find pinned memory region"));
-            delete host_copy;
-            return;
-          }
-
-          buffer->Ref();
-          TensorKey tensor_key = next_key_++;
-          {
-            mutex_lock l(server_mu_);
-            tensor_buffers_.insert(std::make_pair(tensor_key, buffer));
-          }
-
-          uint64_t checksum = 0;
-          if (VLOG_IS_ON(2)) {
-            checksum = GPUUtil::Checksum(*host_copy);
-          }
-
-          RemoteMemoryRegion remote_mr;
-          remote_mr.set_host(host_);
-          remote_mr.set_port(port_);
-          remote_mr.set_addr(reinterpret_cast<uint64_t>(addr));
-          remote_mr.set_rkey(mr->rkey);
-          remote_mr.set_tensor_key(tensor_key);
-          remote_mr.set_checksum(checksum);
-          mutable_transport_options->PackFrom(remote_mr);
-
-          done(Status::OK());
-          delete host_copy;
-        });
-    return;
-  }
-#endif
+  Tensor* copy = nullptr;
 
   if (mr == nullptr) {
-    Allocator* alloc = ProcessState::singleton()->GetCPUAllocator(numa_node_);
-    Tensor host_copy(alloc, tensor.dtype(), tensor.shape());
-
-    std::memcpy(DMAHelper::buffer(&host_copy)->data(), buffer->data(), length);
-    VLOG(2) << "Copying " << length << " bytes unpinned tensor buffer";
-
-    buffer = DMAHelper::buffer(&host_copy);
-    addr = buffer->data();
-    length = buffer->size();
-
-    mr = FindMemoryRegion(addr, length);
+    AllocatorAttributes alloc_attrs;
+    alloc_attrs.set_gpu_compatible(true);
+    alloc_attrs.set_nic_compatible(true);
+    alloc_attrs.set_on_host(true);
+    Allocator* alloc = device->GetAllocator(alloc_attrs);
+    copy = new Tensor(alloc, tensor.dtype(), tensor.shape());
+
+    mr = FindMemoryRegion(copy);
+    buffer = DMAHelper::buffer(copy);
     if (mr == nullptr) {
       done(errors::Unavailable("Cannot find pinned memory region"));
+      delete copy;
       return;
     }
-
-    buffer->Ref();
-  } else {
-    buffer->Ref();
   }
 
   TensorKey tensor_key = next_key_++;
+  buffer->Ref();
   {
-    mutex_lock l(server_mu_);
+    mutex_lock l(buf_mu_);
     tensor_buffers_.insert(std::make_pair(tensor_key, buffer));
   }
 
-  uint64_t checksum = 0;
-  if (VLOG_IS_ON(2)) {
-#ifdef GOOGLE_CUDA
-    if (device->tensorflow_gpu_device_info() && !on_host) {
-      checksum = GPUUtil::Checksum(device, device_context, tensor);
-    } else {
-      checksum = GPUUtil::Checksum(tensor);
-    }
-#endif
-  }
-
   RemoteMemoryRegion remote_mr;
   remote_mr.set_host(host_);
   remote_mr.set_port(port_);
-  remote_mr.set_addr(reinterpret_cast<uint64_t>(addr));
+  remote_mr.set_addr(reinterpret_cast<uint64_t>(buffer->data()));
   remote_mr.set_rkey(mr->rkey);
   remote_mr.set_tensor_key(tensor_key);
-  remote_mr.set_checksum(checksum);
   mutable_transport_options->PackFrom(remote_mr);
 
-  done(Status::OK());
+  if (copy && device->tensorflow_gpu_device_info() && !on_host) {
+    device_context->CopyDeviceTensorToCPU(&tensor, "" /* tensor_name */, device,
+                                          copy, [done, copy](const Status& s) {
+                                            done(s);
+                                            delete copy;
+                                          });
+    return;
+  } else if (copy) {
+    std::memcpy(buffer->data(), DMAHelper::buffer(&tensor)->data(),
+                buffer->size());
+    done(Status::OK());
+    delete copy;  // OK to delete; we have reffed the underlying TensorBuffer
+  } else {
+    done(Status::OK());
+  }
 }
 
 void GdrMemoryManager::TensorFromTransportOptions(
@@ -512,42 +431,10 @@ void GdrMemoryManager::TensorFromTransportOptions(
     return;
   }
 
-  auto buffer = DMAHelper::buffer(tensor);
-  void* addr = buffer->data();
-  size_t length = buffer->size();
-  ibv_mr* mr = FindMemoryRegion(addr, length);
-
-  Tensor host_copy;
-#if GOOGLE_CUDA
-  if (mr == nullptr && !on_host) {
-    Allocator* alloc =
-        GPUProcessState::singleton()->GetCUDAHostAllocator(numa_node_);
-    host_copy = Tensor(alloc, tensor->dtype(), tensor->shape());
-    buffer = DMAHelper::buffer(&host_copy);
-    addr = buffer->data();
-    length = buffer->size();
-    mr = FindMemoryRegion(addr, length);
-  }
-#endif  // GOOGLE_CUDA
-
-  if (mr == nullptr) {
-    Allocator* alloc = ProcessState::singleton()->GetCPUAllocator(numa_node_);
-    host_copy = Tensor(alloc, tensor->dtype(), tensor->shape());
-
-    buffer = DMAHelper::buffer(&host_copy);
-    addr = buffer->data();
-    length = buffer->size();
-
-    mr = FindMemoryRegion(addr, length);
-    if (mr == nullptr) {
-      done(errors::Unavailable("Cannot find pinned memory region"));
-      return;
-    }
-  }
-
-  decltype(clients_)::iterator iter;
-  bool success;
+  rdma_cm_id* id = nullptr;
   {
+    decltype(clients_)::iterator iter;
+    bool success;
     mutex_lock l(client_mu_);
     std::tie(iter, success) = clients_.insert(
         std::make_pair(std::make_pair(remote_mr.host(), remote_mr.port()),
@@ -560,93 +447,94 @@ void GdrMemoryManager::TensorFromTransportOptions(
         return;
       }
     }
-  }
-  rdma_cm_id* id = iter->second.get();
-
-  uint64_t start = Env::Default()->NowMicros();
-
-  if (rdma_post_read(id, nullptr, buffer->data(), buffer->size(), mr, 0,
-                     remote_mr.addr(), remote_mr.rkey())) {
-    done(errors::Unavailable(strerror(errno), ": ", "rdma_post_read failed"));
-    return;
+    id = iter->second.get();
   }
 
-  ibv_send_wr wr = {};
-  wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
-  wr.imm_data = htonl(remote_mr.tensor_key());
-  wr.send_flags = IBV_SEND_SIGNALED;
-  ibv_send_wr* bad_wr;
-  if (ibv_post_send(id->qp, &wr, &bad_wr)) {
-    done(errors::Unavailable(strerror(errno), ": ", "ibv_post_send failed"));
-    return;
-  }
+  ibv_mr* mr = FindMemoryRegion(tensor);
+  const TensorBuffer* buffer = DMAHelper::buffer(tensor);
 
-  ibv_wc wc = {};
-  int ret;
-  while ((ret = ibv_poll_cq(id->send_cq, 1, &wc)) == 0)
-    ;
-  if (ret < 0 || wc.status) {
-    done(errors::Unavailable(ibv_wc_status_str(wc.status)));
-    return;
-  }
+  const Tensor* copy = nullptr;
 
-#if GOOGLE_CUDA
-  if (device->tensorflow_gpu_device_info() && !on_host &&
-      host_copy.NumElements() > 0) {
-    uint64_t checksum = 0;
-    if (VLOG_IS_ON(2)) {
-      checksum = GPUUtil::Checksum(host_copy);
-      CHECK(checksum == remote_mr.checksum())
-          << "Checksum mismatch: " << checksum << "!=" << remote_mr.checksum();
+  if (mr == nullptr) {
+    AllocatorAttributes alloc_attrs;
+    alloc_attrs.set_gpu_compatible(true);
+    alloc_attrs.set_nic_compatible(true);
+    alloc_attrs.set_on_host(true);
+    Allocator* alloc = device->GetAllocator(alloc_attrs);
+    copy = new Tensor(alloc, tensor->dtype(), tensor->shape());
+
+    mr = FindMemoryRegion(copy);
+    buffer = DMAHelper::buffer(copy);
+    if (mr == nullptr) {
+      done(errors::Unavailable("Cannot find pinned memory region"));
+      delete copy;
+      return;
     }
-    Tensor* ref = new Tensor;
-    std::swap(host_copy, *ref);
-    GPUUtil::CopyCPUTensorToGPU(
-        ref, device_context, device, tensor,
-        [ref, done, buffer, remote_mr, start](const Status& s) {
-          if (!s.ok()) {
-            done(s);
-            delete ref;
-            return;
-          }
-          uint64_t end = Env::Default()->NowMicros();
-
-          VLOG(2) << "RDMA from remote memory region " << remote_mr.rkey()
-                  << " of size " << buffer->size() << " with tensor key "
-                  << remote_mr.tensor_key() << " took " << (end - start)
-                  << " micros";
-          done(Status::OK());
-          delete ref;
-        });
-    return;
   }
-#endif  // GOOGLE_CUDA
 
-  if ((on_host || !device->tensorflow_gpu_device_info()) &&
-      host_copy.NumElements() > 0) {
-    std::memcpy(DMAHelper::buffer(tensor)->data(), addr, length);
-    VLOG(2) << "Copying " << length << " bytes unpinned tensor buffer";
-  }
+  uint64_t start = Env::Default()->NowMicros();
 
-  uint64_t end = Env::Default()->NowMicros();
+  TensorKey tensor_key = remote_mr.tensor_key();
 
-  VLOG(2) << "RDMA from remote memory region " << remote_mr.rkey()
-          << " of size " << buffer->size() << " with tensor key "
-          << remote_mr.tensor_key() << " took " << (end - start) << " micros";
+  StatusCallback callback = [done, copy, device, device_context, on_host,
+                             tensor, start, tensor_key](const Status& s) {
+    if (!s.ok()) {
+      done(s);
+      if (copy) {
+        delete copy;
+      }
+      return;
+    }
 
-  uint64_t checksum = 0;
-  if (VLOG_IS_ON(2)) {
-#ifdef GOOGLE_CUDA
-    if (device->tensorflow_gpu_device_info() && !on_host) {
-      checksum = GPUUtil::Checksum(device, device_context, *tensor);
+    VLOG(2) << "RDMA of tensor " << tensor_key << " of size "
+            << DMAHelper::buffer(tensor)->size() << " took "
+            << (Env::Default()->NowMicros() - start) << " micros";
+
+    if (copy && device->tensorflow_gpu_device_info() && !on_host) {
+      device_context->CopyCPUTensorToDevice(copy, device, tensor,
+                                            [done, copy](const Status& s) {
+                                              done(s);
+                                              delete copy;
+                                            });
+    } else if (copy) {
+      std::memcpy(DMAHelper::buffer(tensor)->data(),
+                  DMAHelper::buffer(copy)->data(),
+                  DMAHelper::buffer(copy)->size());
+      done(s);
+      delete copy;
     } else {
-      checksum = GPUUtil::Checksum(*tensor);
+      done(s);
+    }
+  };
+
+  {
+    mutex_lock l(callback_mu_);
+    if (tensor_callbacks_.find(tensor_key) == std::end(tensor_callbacks_)) {
+      tensor_callbacks_.insert(std::make_pair(tensor_key, std::move(callback)));
+    } else {
+      done(errors::Unavailable("Received duplicated tensor key"));
+      if (copy) {
+        delete copy;
+      }
+      return;
+    }
+  }
+
+  if (rdma_post_read(id, reinterpret_cast<void*>(tensor_key), buffer->data(),
+                     buffer->size(), mr, IBV_SEND_SIGNALED, remote_mr.addr(),
+                     remote_mr.rkey())) {
+    done(errors::Unavailable(strerror(errno), ": ", "rdma_post_read failed"));
+    {
+      mutex_lock l(callback_mu_);
+      auto iter = tensor_callbacks_.find(tensor_key);
+      if (iter != std::end(tensor_callbacks_)) {
+        tensor_callbacks_.erase(iter);
+      }
+    }
+    if (copy) {
+      delete copy;
     }
-    CHECK(checksum == remote_mr.checksum())
-        << "Checksum mismatch: " << checksum << "!=" << remote_mr.checksum();
-#endif
   }
-  done(Status::OK());
 }
 
 Status GdrMemoryManager::CreateEndpoint(const string& host, const string& port,
@@ -663,7 +551,7 @@ Status GdrMemoryManager::CreateEndpoint(const string& host, const string& port,
   ibv_qp_init_attr init_attr = {};
   init_attr.qp_type = IBV_QPT_RC;
   init_attr.cap.max_recv_wr = 1;
-  init_attr.cap.max_send_wr = 32;
+  init_attr.cap.max_send_wr = 1024;
   init_attr.cap.max_recv_sge = 1;
   init_attr.cap.max_send_sge = 1;
 
@@ -687,8 +575,8 @@ Status GdrMemoryManager::CreateEndpoint(const string& host, const string& port,
   return Status::OK();
 }
 
-ibv_mr* GdrMemoryManager::FindMemoryRegion(void* addr, size_t length) {
-  if (length == 0) return nullptr;
+ibv_mr* GdrMemoryManager::FindMemoryRegion(const Tensor* tensor) {
+  const void* addr = DMAHelper::buffer(tensor)->data();
   mutex_lock l(alloc_mu_);
   auto iter = std::upper_bound(mrs_.begin(), mrs_.end(), addr, &Comparator);
   if (iter == std::end(mrs_) || iter->get()->addr > addr) {
diff --git a/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc
index fbccbead03fc0d641db40ede661bf3677d44c45d..1124dff741309d8fd04954e70c5ebaaf164b940a 100644
--- a/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc
+++ b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc
@@ -58,11 +58,9 @@ class GdrRecvTensorCall : public BaseRecvTensorCall {
     resp_.InitAlloc(dst_device_, recv_args_.alloc_attrs);
     StatusCallback cb = [this, recv_done](const Status& s) {
       bool dma_ok = resp_.metadata().has_transport_options();
-      if (s.ok() && tensor().TotalBytes() > 0 && (!is_dead()) && dma_ok) {
+      if (s.ok() && tensor().TotalBytes() > 1024 && (!is_dead()) && dma_ok) {
         auto transport_options = resp_.metadata().transport_options();
-        const bool on_host =
-            (dst_device_->tensorflow_gpu_device_info() == nullptr) ||
-            recv_args_.alloc_attrs.on_host();
+        const bool on_host = recv_args_.alloc_attrs.on_host();
         remote_memory_manager_->TensorFromTransportOptions(
             const_cast<Tensor*>(&tensor()), transport_options, dst_device_,
             recv_args_.device_context, on_host,
@@ -70,9 +68,6 @@ class GdrRecvTensorCall : public BaseRecvTensorCall {
               if (!s.ok()) {
                 mutex_lock l(mu_);
                 status_.Update(s);
-                LOG(ERROR) << "Cannot find pinned memory region from allocator "
-                           << dst_device_->GetAllocator(recv_args_.alloc_attrs)
-                                  ->Name();
               }
               recv_done();
             });
@@ -172,8 +167,11 @@ class GdrRemoteRendezvous : public BaseRemoteRendezvous {
 
     // RendezvousMgr already aborted, shouldn't send RPC call any more
     if (!call->status().ok()) {
-      done(call->status(), Args(), Args(), Tensor(), false);
+      // NOTE: `*session()` can potentially be deleted before we return from
+      // `call->done()(...)`, so we must release the worker before calling the
+      // callback.
       session()->worker_cache->ReleaseWorker(src_worker, rwi);
+      done(call->status(), Args(), Args(), Tensor(), false);
       delete call;
       return;
     }
@@ -186,8 +184,11 @@ class GdrRemoteRendezvous : public BaseRemoteRendezvous {
       // If StartAbort was called prior to DeregisterCall, then the
       // current status should be bad.
       Status s = call->status();
-      done(s, Args(), call->recv_args(), call->tensor(), call->is_dead());
+      // NOTE: `*session()` can potentially be deleted before we return from
+      // `call->done()(...)`, so we must release the worker before calling the
+      // callback.
       session()->worker_cache->ReleaseWorker(src_worker, rwi);
+      done(s, Args(), call->recv_args(), call->tensor(), call->is_dead());
       delete call;
       Unref();
     });
diff --git a/tensorflow/contrib/gdr/gdr_server_lib.cc b/tensorflow/contrib/gdr/gdr_server_lib.cc
index b3f48ec1dd9c75055f4e1ea76eb203b6ccf94718..c39cc0f9bcecc26aedfaf9707113210acf670244 100644
--- a/tensorflow/contrib/gdr/gdr_server_lib.cc
+++ b/tensorflow/contrib/gdr/gdr_server_lib.cc
@@ -16,11 +16,13 @@ limitations under the License.
 #include "tensorflow/contrib/gdr/gdr_server_lib.h"
 
 #include "grpc/support/alloc.h"
+#include "tensorflow/contrib/gdr/gdr_collective_executor_mgr.h"
 #include "tensorflow/contrib/gdr/gdr_memory_manager.h"
 #include "tensorflow/contrib/gdr/gdr_rendezvous_mgr.h"
 #include "tensorflow/contrib/gdr/gdr_worker.h"
-
-#include "grpc/support/alloc.h"
+#include "tensorflow/core/common_runtime/collective_rma_local.h"
+#include "tensorflow/core/distributed_runtime/collective_param_resolver_distributed.h"
+#include "tensorflow/core/distributed_runtime/device_resolver_distributed.h"
 
 namespace tensorflow {
 
@@ -57,10 +59,34 @@ Status GdrServer::Init() {
     return std::unique_ptr<GdrWorker>(
         new GdrWorker(env, config, remote_memory_manager_.get()));
   };
-
+  CollectiveMgrCreationFunction collective_mgr_func =
+      [this](const ConfigProto& config, const WorkerEnv* env,
+             WorkerCacheInterface* worker_cache) {
+        string unused;
+        string default_worker_name;
+        DeviceNameUtils::SplitDeviceName(
+            env->device_mgr->ListDevices()[0]->name(), &default_worker_name,
+            &unused);
+
+        std::unique_ptr<DeviceResolverDistributed> dev_resolver(
+            new DeviceResolverDistributed(env->device_mgr, worker_cache,
+                                          default_worker_name));
+        std::unique_ptr<CollectiveParamResolverDistributed> param_resolver(
+            new CollectiveParamResolverDistributed(
+                config, env->device_mgr, dev_resolver.get(), worker_cache,
+                default_worker_name));
+        return new GdrCollectiveExecutorMgr(
+            config, env->device_mgr, std::move(dev_resolver),
+            std::move(param_resolver), worker_cache, default_worker_name,
+            remote_memory_manager_.get());
+      };
   TF_RETURN_IF_ERROR(remote_memory_manager_->Init());
 
-  return GrpcServer::Init(nullptr, rendezvous_mgr_func, nullptr, worker_func);
+  GrpcServerOptions opts;
+  opts.rendezvous_mgr_func = rendezvous_mgr_func;
+  opts.collective_mgr_func = collective_mgr_func;
+  opts.worker_func = worker_func;
+  return GrpcServer::Init(opts);
 }
 
 Status GdrServer::Start() {
@@ -74,9 +100,8 @@ Status GdrServer::Start() {
 }
 
 Status GdrServer::Stop() {
-  TF_RETURN_IF_ERROR(GrpcServer::Stop());
   remote_memory_manager_->Stop();
-  return Status::OK();
+  return GrpcServer::Stop();
 }
 
 Status GdrServer::Join() {
diff --git a/tensorflow/contrib/gdr/gdr_worker.cc b/tensorflow/contrib/gdr/gdr_worker.cc
index 867cb83f42034c8e9061e333ea671457745f92c3..1204b8ca501a8f99ea6abd6c047ab2d91350bae1 100644
--- a/tensorflow/contrib/gdr/gdr_worker.cc
+++ b/tensorflow/contrib/gdr/gdr_worker.cc
@@ -15,12 +15,10 @@ limitations under the License.
 
 #include "tensorflow/contrib/gdr/gdr_worker.h"
 
+#include "tensorflow/core/common_runtime/buf_rendezvous.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
-#if GOOGLE_CUDA
-#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
-#endif  // GOOGLE_CUDA
 #include "tensorflow/core/common_runtime/process_util.h"
 #include "tensorflow/core/common_runtime/step_stats_collector.h"
 #include "tensorflow/core/distributed_runtime/graph_mgr.h"
@@ -32,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/worker_cache.h"
 #include "tensorflow/core/distributed_runtime/worker_session.h"
 #include "tensorflow/core/framework/cancellation.h"
+#include "tensorflow/core/framework/collective.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/logging.h"
@@ -43,13 +42,13 @@ GdrWorker::GdrWorker(WorkerEnv* worker_env, const ConfigProto& config,
                      RemoteMemoryManager* remote_memory_manager)
     : GrpcWorker(worker_env, config),
       remote_memory_manager_(remote_memory_manager),
-      recv_tensor_recent_request_ids_(100000) {}
+      recent_request_ids_(100000) {}
 
 void GdrWorker::GrpcRecvTensorAsync(CallOptions* opts,
                                     const RecvTensorRequest* request,
                                     ::grpc::ByteBuffer* response,
                                     StatusCallback done) {
-  Status s = recv_tensor_recent_request_ids_.TrackUnique(
+  Status s = recent_request_ids_.TrackUnique(
       request->request_id(), "RecvTensor (GdrWorker)", *request);
   if (!s.ok()) {
     done(s);
@@ -78,7 +77,7 @@ void GdrWorker::GrpcRecvTensorAsync(CallOptions* opts,
   const bool dma_ok = request->dma_ok();
   env_->rendezvous_mgr->RecvLocalAsync(
       step_id, parsed,
-      [this, opts, response, done, src_dev, dma_ok](
+      [this, opts, response, done, src_dev, request, dma_ok](
           const Status& status, const Rendezvous::Args& send_args,
           const Rendezvous::Args&, const Tensor& val, const bool is_dead) {
         opts->ClearCancelCallback();
@@ -89,10 +88,8 @@ void GdrWorker::GrpcRecvTensorAsync(CallOptions* opts,
           // 3) the tensor has the on_host allocation attribute,
           // i.e. it's in CPU RAM *independent of its assigned
           // device type*.
-          const bool on_host =
-              (src_dev->tensorflow_gpu_device_info() == nullptr) ||
-              send_args.alloc_attrs.on_host();
-          if (val.TotalBytes() > 0 && (!is_dead) &&
+          const bool on_host = send_args.alloc_attrs.on_host();
+          if (val.TotalBytes() > 1024 && (!is_dead) &&
               DMAHelper::CanUseDMA(&val) && dma_ok) {
             // DMA cases.
             RecvTensorResponse* proto = new RecvTensorResponse;
@@ -117,8 +114,7 @@ void GdrWorker::GrpcRecvTensorAsync(CallOptions* opts,
           } else {
             // Non-DMA cases.
             if (src_dev->tensorflow_gpu_device_info() && (!on_host)) {
-#if GOOGLE_CUDA
-              const DeviceContext* send_dev_context = send_args.device_context;
+              DeviceContext* send_dev_context = send_args.device_context;
               AllocatorAttributes alloc_attrs;
               alloc_attrs.set_gpu_compatible(true);
               alloc_attrs.set_on_host(true);
@@ -127,7 +123,8 @@ void GdrWorker::GrpcRecvTensorAsync(CallOptions* opts,
               CHECK(send_dev_context)
                   << "send dev name: " << src_dev->name()
                   << " gpu_info: " << src_dev->tensorflow_gpu_device_info();
-              // "val" is on a GPU. Uses GPUUtil to fill the response proto.
+              // "val" is on an accelerator device. Uses the device_context to
+              // fill the copy on host.
               StatusCallback copy_ready = [response, done, copy,
                                            is_dead](const Status& s) {
                 // The value is now ready to be returned on the wire.
@@ -136,11 +133,8 @@ void GdrWorker::GrpcRecvTensorAsync(CallOptions* opts,
                 delete copy;
               };
 
-              GPUUtil::CopyGPUTensorToCPU(src_dev, send_dev_context, &val, copy,
-                                          copy_ready);
-#else
-              done(errors::Internal("No GPU device in process"));
-#endif  // GOOGLE_CUDA
+              send_dev_context->CopyDeviceTensorToCPU(
+                  &val, request->rendezvous_key(), src_dev, copy, copy_ready);
             } else {
               grpc::EncodeTensorToByteBuffer(is_dead, val, response);
               done(Status::OK());
@@ -153,4 +147,41 @@ void GdrWorker::GrpcRecvTensorAsync(CallOptions* opts,
       });
 }
 
+void GdrWorker::RecvBufAsync(CallOptions* opts, const RecvBufRequest* request,
+                             RecvBufResponse* response, StatusCallback done) {
+  // This is an RDMA enabled implementation augmenting grpc.
+  Status s = recent_request_ids_.TrackUnique(request->request_id(),
+                                             "RecvBuf (GdrWorker)", *request);
+  if (!s.ok()) {
+    done(s);
+    return;
+  }
+  CollectiveExecutor::Handle ce_handle(
+      env_->collective_executor_mgr->FindOrCreate(request->step_id()), true);
+  CollectiveRemoteAccess* rma = ce_handle.get()->remote_access();
+  rma->buf_rendezvous()->ConsumeBuf(
+      request->buf_rendezvous_key(),
+      [this, request, response, done](const Status& status,
+                                      BufRendezvous::Hook* hook) {
+        Status s = status;
+        if (s.ok()) {
+          if (!DMAHelper::CanUseDMA(hook->prod_value)) {
+            s = errors::Internal("Tensor value for key ",
+                                 request->buf_rendezvous_key(),
+                                 " is not of a type supported by RecvBuf");
+          }
+        }
+        if (s.ok()) {
+          remote_memory_manager_->TransportOptionsFromTensor(
+              response->mutable_transport_options(), *hook->prod_value,
+              hook->prod_dev, hook->prod_ctx, hook->prod_attr.on_host(),
+              [this, response, done, hook](const Status& s) {
+                response->set_send_start_micros(env_->env->NowMicros());
+                done(s);
+                BufRendezvous::DoneWithHook(hook);
+              });
+        }
+      });
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/gdr/gdr_worker.h b/tensorflow/contrib/gdr/gdr_worker.h
index 39f11e6bde5a1ca7ae91ead02279d22d70af027b..9a85cfd4263ad86f6579eedce95969c2829ff62c 100644
--- a/tensorflow/contrib/gdr/gdr_worker.h
+++ b/tensorflow/contrib/gdr/gdr_worker.h
@@ -38,9 +38,13 @@ class GdrWorker : public GrpcWorker {
                                    ::grpc::ByteBuffer* response,
                                    StatusCallback done) override;
 
+  virtual void RecvBufAsync(CallOptions* opts, const RecvBufRequest* request,
+                            RecvBufResponse* response,
+                            StatusCallback done) override;
+
  private:
   RemoteMemoryManager* remote_memory_manager_;  // Not owned
-  RecentRequestIds recv_tensor_recent_request_ids_;
+  RecentRequestIds recent_request_ids_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/graph_editor/transform.py b/tensorflow/contrib/graph_editor/transform.py
index e79ccd8da1f8952758ae322d3a92dec34910a9db..5b37239665d46db38fc249e9004d2200abb3d610 100644
--- a/tensorflow/contrib/graph_editor/transform.py
+++ b/tensorflow/contrib/graph_editor/transform.py
@@ -22,7 +22,6 @@ from __future__ import print_function
 from copy import deepcopy
 from functools import partial
 from six import iteritems
-from six import iterkeys
 from six import string_types
 from six import StringIO
 from tensorflow.contrib.graph_editor import reroute
@@ -735,9 +734,8 @@ def graph_replace(target_ts, replacement_ts, dst_scope="",
   # control dependencies.
   graph = util.get_unique_graph(flatten_target_ts, check_types=(tf_ops.Tensor))
   control_ios = util.ControlOutputs(graph)
-  ops = select.get_walks_intersection_ops(list(iterkeys(replacement_ts)),
-                                          flatten_target_ts,
-                                          control_ios=control_ios)
+  ops = select.get_walks_intersection_ops(
+      list(replacement_ts), flatten_target_ts, control_ios=control_ios)
   if not ops:
     raise ValueError("Targets and replacements are not connected!")
 
diff --git a/tensorflow/contrib/hadoop/python/ops/hadoop_dataset_ops.py b/tensorflow/contrib/hadoop/python/ops/hadoop_dataset_ops.py
index 5c5599858ee6879a5703d65658bf4bbd881c7e72..71eac729a8a81c2f59f9ed5d7f42fb7b1c3e1b5c 100644
--- a/tensorflow/contrib/hadoop/python/ops/hadoop_dataset_ops.py
+++ b/tensorflow/contrib/hadoop/python/ops/hadoop_dataset_ops.py
@@ -23,11 +23,16 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.util import deprecation
 
 
 class SequenceFileDataset(dataset_ops.DatasetSource):
   """A Sequence File Dataset that reads the sequence file."""
 
+  @deprecation.deprecated(
+      None,
+      "tf.contrib.hadoop will be removed in 2.0, the support for Apache Hadoop "
+      "will continue to be provided through the tensorflow/io GitHub project.")
   def __init__(self, filenames):
     """Create a `SequenceFileDataset`.
 
@@ -50,13 +55,11 @@ class SequenceFileDataset(dataset_ops.DatasetSource):
     Args:
       filenames: A `tf.string` tensor containing one or more filenames.
     """
-    super(SequenceFileDataset, self).__init__()
     self._filenames = ops.convert_to_tensor(
         filenames, dtype=dtypes.string, name="filenames")
-
-  def _as_variant_tensor(self):
-    return gen_dataset_ops.sequence_file_dataset(
+    variant_tensor = gen_dataset_ops.sequence_file_dataset(
         self._filenames, self._element_structure._flat_types)  # pylint: disable=protected-access
+    super(SequenceFileDataset, self).__init__(variant_tensor)
 
   @property
   def _element_structure(self):
diff --git a/tensorflow/contrib/hvx/hvx_ops_support_checker/BUILD b/tensorflow/contrib/hvx/hvx_ops_support_checker/BUILD
index 0081fb61770075a2c36e92f65e01126f657edeb4..92016e6a83975a9b15a39a15125e0eabc111912e 100644
--- a/tensorflow/contrib/hvx/hvx_ops_support_checker/BUILD
+++ b/tensorflow/contrib/hvx/hvx_ops_support_checker/BUILD
@@ -16,9 +16,31 @@ tf_cc_binary(
     srcs = ["hvx_ops_support_checker_main.cc"],
     visibility = ["//visibility:public"],
     deps = [
+        "//tensorflow/core:array_ops_op_lib",
+        "//tensorflow/core:candidate_sampling_ops_op_lib",
+        "//tensorflow/core:control_flow_ops_op_lib",
+        "//tensorflow/core:data_flow_ops_op_lib",
         "//tensorflow/core:framework_internal",
+        "//tensorflow/core:functional_ops_op_lib",
+        "//tensorflow/core:io_ops_op_lib",
         "//tensorflow/core:lib",
+        "//tensorflow/core:list_ops_op_lib",
+        "//tensorflow/core:logging_ops_op_lib",
+        "//tensorflow/core:lookup_ops_op_lib",
+        "//tensorflow/core:manip_ops_op_lib",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:nn_ops_op_lib",
+        "//tensorflow/core:no_op_op_lib",
+        "//tensorflow/core:parsing_ops_op_lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:random_ops_op_lib",
+        "//tensorflow/core:remote_fused_graph_ops_op_lib",
+        "//tensorflow/core:sendrecv_ops_op_lib",
+        "//tensorflow/core:sparse_ops_op_lib",
+        "//tensorflow/core:state_ops_op_lib",
+        "//tensorflow/core:string_ops_op_lib",
+        "//tensorflow/core:training_ops_op_lib",
+        "//tensorflow/core:user_ops_op_lib",
         "//tensorflow/core/kernels:remote_fused_graph_execute_utils",
         "//tensorflow/core/kernels/hexagon:graph_transferer",
         "//tensorflow/tools/graph_transforms:file_utils",
diff --git a/tensorflow/contrib/ignite/README.md b/tensorflow/contrib/ignite/README.md
index 5a8c650fb927be0c835aaceffc516c048195c7bf..c1f6cac4942436d32f9867d4b5557c6b9e376c69 100644
--- a/tensorflow/contrib/ignite/README.md
+++ b/tensorflow/contrib/ignite/README.md
@@ -30,7 +30,8 @@ system based on Apache Ignite.
 
 ## Features
 
-Ignite Dataset provides features that that you can use in a wide range of cases. The most important and interesting features are described below.
+Ignite Dataset provides features that you can use in a wide range of cases. The
+most important and interesting features are described below.
 
 ### Distributed In-Memory Datasource
 [Apache Ignite](https://ignite.apache.org/) is a distributed in-memory database, caching, and processing platform that provides fast data access. It allows you to avoid limitations of hard drive and store and operate with as much data as you need in distributed cluster. You can utilize
@@ -97,6 +98,7 @@ jdbc:ignite:thin://localhost/> INSERT INTO KITTEN_CACHE VALUES (3, 'LITTLE BALL
 ```python
 >>> import tensorflow as tf
 >>> from tensorflow.contrib.ignite import IgniteDataset
+>>> tf.enable_eager_execution()
 >>>
 >>> dataset = IgniteDataset(cache_name="IMAGES").map(lambda obj: obj['val']['pixels'])
 >>>
@@ -116,7 +118,15 @@ Using this ability we can calculate gradients on the nodes the data is stored on
 
 Apache Ignite uses horizontal partitioning to store data in distributed cluster. When we create Apache Ignite cache (or table in terms of SQL), we can specify the number of partitions the data will be partitioned on. For example, if an Apache Ignite cluster consists of 10 machines and we create cache with 10 partitions, then every machine will maintain approximately one data partition.
 
-Ignite Dataset allows using these two aspects of distributed neural network training (using TensorFlow) and Apache Ignite partitioning. Ignite Dataset is a computation graph operation that can be performed on a remote worker. The remote worker can override Ignite Dataset parameters (such as `host`, `port` or `part`) by setting correstondent environment variables for worker process (such as `IGNITE_DATASET_HOST`, `IGNITE_DATASET_PORT` or `IGNITE_DATASET_PART`). Using this overriding approach, we can assign a specific partition to every worker so that one worker handles one partition and, at the same time, transparently work with single dataset.
+Ignite Dataset allows using these two aspects of distributed neural network
+training (using TensorFlow) and Apache Ignite partitioning. Ignite Dataset is a
+computation graph operation that can be performed on a remote worker. The remote
+worker can override Ignite Dataset parameters (such as `host`, `port` or `part`)
+by setting correspondent environment variables for worker process (such as
+`IGNITE_DATASET_HOST`, `IGNITE_DATASET_PORT` or `IGNITE_DATASET_PART`). Using
+this overriding approach, we can assign a specific partition to every worker so
+that one worker handles one partition and, at the same time, transparently work
+with single dataset.
 
 ```python
 >>> import tensorflow as tf
@@ -149,23 +159,31 @@ system called [IGFS](https://ignite.apache.org/features/igfs.html). IGFS
 delivers a similar functionality to Hadoop HDFS, but only in-memory. In fact, in
 addition to its own APIs, IGFS implements Hadoop FileSystem API and can be
 transparently plugged into Hadoop or Spark deployments. This contrib package
-contains an integration between IGFS and TensorFlow. The integration is based
-on [custom filesystem plugin](https://www.tensorflow.org/extend/add_filesys)
-from TensorFlow side and
+contains an integration between IGFS and TensorFlow. The integration is based on
+[custom filesystem plugin](https://www.tensorflow.org/extend/add_filesys) from
+TensorFlow side and
 [IGFS Native API](https://ignite.apache.org/features/igfs.html) from Apache
-Ignite side. It has numerous uses, for example: * Checkpoints of state can be
-saved to IGFS for reliability and fault-tolerance. * Training processes
-communicate with TensorBoard by writing event files to a directory, which
-TensorBoard watches. IGFS allows this communication to work even when
-TensorBoard runs in a different process or machine.
+Ignite side. It has numerous uses, for example:
+
+*   Checkpoints of state can be saved to IGFS for reliability and
+    fault-tolerance.
+*   Training processes communicate with TensorBoard by writing event files to a
+    directory, which TensorBoard watches. IGFS allows this communication to work
+    even when TensorBoard runs in a different process or machine.
 
 ### SSL Connection
 
-Apache Ignite allows to protect data transfer channels by [SSL](https://en.wikipedia.org/wiki/Transport_Layer_Security) and authentification. Ignite Dataset supports both SSL connection with and without authntication. For more information, please refer to the [Apache Ignite SSL/TLS](https://apacheignite.readme.io/docs/ssltls) documentation.
+Apache Ignite allows to protect data transfer channels by
+[SSL](https://en.wikipedia.org/wiki/Transport_Layer_Security) and
+authentication. Ignite Dataset supports both SSL connection with and without
+authentication. For more information, please refer to the
+[Apache Ignite SSL/TLS](https://apacheignite.readme.io/docs/ssltls)
+documentation.
 
 ```python
 >>> import tensorflow as tf
 >>> from tensorflow.contrib.ignite import IgniteDataset
+>>> tf.enable_eager_execution()
 >>>
 >>> dataset = IgniteDataset(cache_name="IMAGES",
                             certfile="client.pem",
@@ -186,7 +204,7 @@ Following examples will help you to easily start working with this module.
 
 The simplest way to try Ignite Dataset is to run a
 [Docker](https://www.docker.com/) container with Apache Ignite and loaded
-[MNIST](http://yann.lecun.com/exdb/mnist/) data and after start interruct with
+[MNIST](http://yann.lecun.com/exdb/mnist/) data and after start interrupt with
 it using Ignite Dataset. Such container is available on Docker Hub:
 [dmitrievanthony/ignite-with-mnist](https://hub.docker.com/r/dmitrievanthony/ignite-with-mnist/).
 You need to start this container on your machine:
@@ -197,13 +215,13 @@ docker run -it -p 10800:10800 dmitrievanthony/ignite-with-mnist
 
 After that you will be able to work with it following way:
 
-![ignite-dataset-mnist](https://s3.amazonaws.com/helloworld23423423ew23/ignite-dataset-mnist.png "Ignite Dataset Mnist")
+![ignite-dataset-mnist](https://s3.amazonaws.com/helloworld23423423ew23/ignite-dataset-mnist-2.png "Ignite Dataset Mnist")
 
 ### IGFS
 
 The simplest way to try IGFS with TensorFlow is to run
 [Docker](https://www.docker.com/) container with Apache Ignite and enabled IGFS
-and then interruct with it using TensorFlow
+and then interrupt with it using TensorFlow
 [tf.gfile](https://www.tensorflow.org/api_docs/python/tf/gfile). Such container
 is available on Docker Hub:
 [dmitrievanthony/ignite-with-igfs](https://hub.docker.com/r/dmitrievanthony/ignite-with-igfs/).
diff --git a/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py b/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py
index e4762c91b193f9c5e32fa2642e702e61e8e5e57f..3ffceef8070e0fc3b3cebae2522f89fe98ce4413 100644
--- a/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py
+++ b/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py
@@ -31,6 +31,7 @@ from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.util import deprecation
 
 
 @six.add_metaclass(abc.ABCMeta)
@@ -699,6 +700,10 @@ class IgniteDataset(dataset_ops.DatasetSource):
      Ignite Binary Client Protocol.
   """
 
+  @deprecation.deprecated(
+      None,
+      "tf.contrib.ignite will be removed in 2.0, the support for Apache Ignite "
+      "will continue to be provided through the tensorflow/io GitHub project.")
   def __init__(self,
                cache_name,
                host="localhost",
@@ -730,8 +735,6 @@ class IgniteDataset(dataset_ops.DatasetSource):
       cert_password: Password to be used if the private key is encrypted and a
         password is necessary.
     """
-    super(IgniteDataset, self).__init__()
-
     with IgniteClient(host, port, username, password, certfile, keyfile,
                       cert_password) as client:
       client.handshake()
@@ -755,6 +758,8 @@ class IgniteDataset(dataset_ops.DatasetSource):
         self.cache_type.to_output_types(), self.cache_type.to_output_shapes(),
         self.cache_type.to_output_classes())
 
+    super(IgniteDataset, self).__init__(self._as_variant_tensor())
+
   def _as_variant_tensor(self):
     return gen_dataset_ops.ignite_dataset(self.cache_name, self.host, self.port,
                                           self.local, self.part, self.page_size,
diff --git a/tensorflow/contrib/ignite/python/tests/ignite_dataset_test.py b/tensorflow/contrib/ignite/python/tests/ignite_dataset_test.py
index ff5d4c458c859fd8e5e3ae65ee41a454d55d6538..89b74fbfdc38c9f42795d5c778889210baf6387f 100644
--- a/tensorflow/contrib/ignite/python/tests/ignite_dataset_test.py
+++ b/tensorflow/contrib/ignite/python/tests/ignite_dataset_test.py
@@ -19,9 +19,9 @@ from __future__ import print_function
 
 import os
 
+from tensorflow import compat
 from tensorflow.contrib.ignite import IgniteDataset
 from tensorflow.python.client import session
-from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.platform import test
@@ -66,7 +66,7 @@ class IgniteDatasetTest(test.TestCase):
     self.assertEqual(dtypes.string, dataset.output_types["val"]["NAME"])
     self.assertEqual(dtypes.int64, dataset.output_types["val"]["VAL"])
 
-    it = dataset_ops.make_one_shot_iterator(dataset)
+    it = compat.v1.data.make_one_shot_iterator(dataset)
     ne = it.get_next()
 
     with session.Session() as sess:
diff --git a/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh b/tensorflow/contrib/kafka/python/kernel_tests/kafka_test.sh
old mode 100644
new mode 100755
diff --git a/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py b/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py
index 2b86331099ccae03664462987ee0c141d766c10f..5591c3b0cc8c8bf196bb4821c018cbf155cba4ce 100644
--- a/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py
+++ b/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py
@@ -23,12 +23,17 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.util import deprecation
 
 
 class KafkaDataset(dataset_ops.DatasetSource):
   """A Kafka Dataset that consumes the message.
   """
 
+  @deprecation.deprecated(
+      None,
+      "tf.contrib.kafka will be removed in 2.0, the support for Apache Kafka "
+      "will continue to be provided through the tensorflow/io GitHub project.")
   def __init__(self,
                topics,
                servers="localhost",
@@ -47,7 +52,6 @@ class KafkaDataset(dataset_ops.DatasetSource):
       timeout: The timeout value for the Kafka Consumer to wait
                (in millisecond).
     """
-    super(KafkaDataset, self).__init__()
     self._topics = ops.convert_to_tensor(
         topics, dtype=dtypes.string, name="topics")
     self._servers = ops.convert_to_tensor(
@@ -58,6 +62,8 @@ class KafkaDataset(dataset_ops.DatasetSource):
     self._timeout = ops.convert_to_tensor(
         timeout, dtype=dtypes.int64, name="timeout")
 
+    super(KafkaDataset, self).__init__(self._as_variant_tensor())
+
   def _as_variant_tensor(self):
     return gen_dataset_ops.kafka_dataset(self._topics, self._servers,
                                          self._group, self._eof, self._timeout)
diff --git a/tensorflow/contrib/keras/api/keras/losses/__init__.py b/tensorflow/contrib/keras/api/keras/losses/__init__.py
index c4476a7bbd5056fa898468a46031bf3d8b1e44cf..b12832d2e2a3cccb4948d9e3bf3d226030121ac2 100644
--- a/tensorflow/contrib/keras/api/keras/losses/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/losses/__init__.py
@@ -22,7 +22,7 @@ from __future__ import print_function
 from tensorflow.python.keras.losses import binary_crossentropy
 from tensorflow.python.keras.losses import categorical_crossentropy
 from tensorflow.python.keras.losses import categorical_hinge
-from tensorflow.python.keras.losses import cosine_proximity
+from tensorflow.python.keras.losses import cosine_similarity
 from tensorflow.python.keras.losses import hinge
 from tensorflow.python.keras.losses import kullback_leibler_divergence
 from tensorflow.python.keras.losses import logcosh
diff --git a/tensorflow/contrib/keras/api/keras/metrics/__init__.py b/tensorflow/contrib/keras/api/keras/metrics/__init__.py
index 7317fdb52c5b79e787a49d71be49f5261d6b1fff..095b5d798df9ac9038fa1088cdd402dff304e87e 100644
--- a/tensorflow/contrib/keras/api/keras/metrics/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/metrics/__init__.py
@@ -23,7 +23,7 @@ from tensorflow.python.keras.metrics import binary_accuracy
 from tensorflow.python.keras.metrics import binary_crossentropy
 from tensorflow.python.keras.metrics import categorical_accuracy
 from tensorflow.python.keras.metrics import categorical_crossentropy
-from tensorflow.python.keras.metrics import cosine_proximity
+from tensorflow.python.keras.metrics import cosine_similarity
 from tensorflow.python.keras.metrics import hinge
 from tensorflow.python.keras.metrics import kullback_leibler_divergence
 from tensorflow.python.keras.metrics import mean_absolute_error
diff --git a/tensorflow/contrib/kernel_methods/python/losses.py b/tensorflow/contrib/kernel_methods/python/losses.py
index 4ef0a66a52429233c6e6f70667a451466493629c..294a7d69a704b3c06ab9e30489af116929ab6c2a 100644
--- a/tensorflow/contrib/kernel_methods/python/losses.py
+++ b/tensorflow/contrib/kernel_methods/python/losses.py
@@ -34,7 +34,7 @@ def sparse_multiclass_hinge_loss(
     scope=None,
     loss_collection=ops.GraphKeys.LOSSES,
     reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS):
-  """Adds Ops for computing the multiclass hinge loss.
+  r"""Adds Ops for computing the multiclass hinge loss.
 
   The implementation is based on the following paper:
   On the Algorithmic Implementation of Multiclass Kernel-based Vector Machines
diff --git a/tensorflow/contrib/kfac/README.md b/tensorflow/contrib/kfac/README.md
index 42b91d031375b8edb7e4f364ac91ffb74ef1f54b..19daffea6c7e4486499388314d0aaaa611e94218 100644
--- a/tensorflow/contrib/kfac/README.md
+++ b/tensorflow/contrib/kfac/README.md
@@ -1,3 +1,3 @@
 # K-FAC: Kronecker-Factored Approximate Curvature
 
-## KFAC moved to third_party/tensorflow_kfac.
+## KFAC moved to https://github.com/tensorflow/kfac.
diff --git a/tensorflow/contrib/kinesis/python/ops/kinesis_dataset_ops.py b/tensorflow/contrib/kinesis/python/ops/kinesis_dataset_ops.py
index 20395395281768ac429984a1e3552cfd187527a2..9479afb180df7bb4a08d6aafa4fc3bf63489d9f3 100644
--- a/tensorflow/contrib/kinesis/python/ops/kinesis_dataset_ops.py
+++ b/tensorflow/contrib/kinesis/python/ops/kinesis_dataset_ops.py
@@ -23,6 +23,7 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.util import deprecation
 
 
 class KinesisDataset(dataset_ops.DatasetSource):
@@ -50,6 +51,10 @@ class KinesisDataset(dataset_ops.DatasetSource):
   is returned immediately instead.
   """
 
+  @deprecation.deprecated(
+      None,
+      "tf.contrib.kinesis will be removed in 2.0, the support for Kinesis "
+      "will continue to be provided through the tensorflow/io GitHub project.")
   def __init__(self,
                stream,
                shard="",
@@ -66,7 +71,6 @@ class KinesisDataset(dataset_ops.DatasetSource):
       interval: The interval for the Kinesis Client to wait before
         it tries to get records again (in millisecond).
     """
-    super(KinesisDataset, self).__init__()
     self._stream = ops.convert_to_tensor(
         stream, dtype=dtypes.string, name="stream")
     self._shard = ops.convert_to_tensor(
@@ -75,6 +79,7 @@ class KinesisDataset(dataset_ops.DatasetSource):
         read_indefinitely, dtype=dtypes.bool, name="read_indefinitely")
     self._interval = ops.convert_to_tensor(
         interval, dtype=dtypes.int64, name="interval")
+    super(KinesisDataset, self).__init__(self._as_variant_tensor())
 
   def _as_variant_tensor(self):
     return gen_dataset_ops.kinesis_dataset(
diff --git a/tensorflow/contrib/labeled_tensor/BUILD b/tensorflow/contrib/labeled_tensor/BUILD
index 588f15b867c1fedbadd5a5d945d870a356549468..7e19ae7c13df421ec5bb9cb0e07dff0d00fb9548 100644
--- a/tensorflow/contrib/labeled_tensor/BUILD
+++ b/tensorflow/contrib/labeled_tensor/BUILD
@@ -155,7 +155,7 @@ py_library(
         ":core",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:functional_ops",
+        "//tensorflow/python:map_fn",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:numerics",
         "//tensorflow/python:random_ops",
diff --git a/tensorflow/contrib/labeled_tensor/python/ops/ops.py b/tensorflow/contrib/labeled_tensor/python/ops/ops.py
index 2ede5daee74223e812cc29e9708b1989b698fb4e..a65f045cc886f4d4f351423858d92412baa3a622 100644
--- a/tensorflow/contrib/labeled_tensor/python/ops/ops.py
+++ b/tensorflow/contrib/labeled_tensor/python/ops/ops.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import map_fn as map_fn_lib
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import numerics
 from tensorflow.python.ops import random_ops
@@ -629,7 +630,7 @@ def map_fn(fn, labeled_tensor, name=None):
 
     # TODO(ericmc): Fix this upstream.
     if labeled_tensor.dtype == dtypes.string:
-      # We must construct the full graph here, because functional_ops.map_fn
+      # We must construct the full graph here, because map_fn_lib.map_fn
       # doesn't work for string-valued tensors.
       # Constructing the full graph may be slow.
       map_lts = [fn(t) for t in unpack_lts]
@@ -652,7 +653,7 @@ def map_fn(fn, labeled_tensor, name=None):
         tensor_lt = core.LabeledTensor(tensor, original_axes)
         return fn(tensor_lt).tensor
 
-      map_op = functional_ops.map_fn(
+      map_op = map_fn_lib.map_fn(
           tf_fn, labeled_tensor.tensor, dtype=first_map_lt.dtype)
       map_lt = core.LabeledTensor(map_op, final_axes)
 
diff --git a/tensorflow/contrib/layers/BUILD b/tensorflow/contrib/layers/BUILD
index 9ca6f8df5dbe3c236c4cd85095176ce69ad9deaa..69d5496f8aebb9b89c5d79f80a1a439f556093d7 100644
--- a/tensorflow/contrib/layers/BUILD
+++ b/tensorflow/contrib/layers/BUILD
@@ -81,6 +81,7 @@ tf_custom_op_py_library(
     visibility = [
         "//learning/brain:__subpackages__",
         "//tensorflow:__subpackages__",
+        "//tensorflow_model_optimization:__subpackages__",
         "//video/youtube/personalization:__subpackages__",
     ],
     deps = [
diff --git a/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py b/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py
index 7e6eafaa0d6f60cfc28a4c422abac0b6d5a991fb..00e41026d0038409ace178e6affd2c1cdc812122 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py
@@ -1757,7 +1757,7 @@ class WeightedSumTest(test.TestCase):
       logits_core = fc_core.linear_model(features, [movies])
 
       with self.cached_session() as sess:
-        variables_lib.initialize_all_variables().run()
+        variables_lib.global_variables_initializer().run()
         lookup_ops.tables_initializer().run()
 
         weights = column_to_variable[movies][0]
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index 403b522ce45ac6ad98a321378626b87aaa7738aa..9d9524e4e4b995d795b7c71b5bd083d11c60d5ce 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -2308,7 +2308,7 @@ def layer_norm(inputs,
           initializer=init_ops.ones_initializer(),
           collections=gamma_collections,
           trainable=trainable)
-    # Calculate the moments on the last axis (layer activations).
+    # By default, compute the moments across all the dimensions except the one with index 0.
     norm_axes = list(range(begin_norm_axis, inputs_rank))
     mean, variance = nn.moments(inputs, norm_axes, keep_dims=True)
     # Compute layer normalization using the batch_normalization function.
diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py
index d791418c9d0f887058ceb535092fa8122da1aa75..1c0088186c030437454c0f764decab9e5a276adc 100644
--- a/tensorflow/contrib/layers/python/layers/layers_test.py
+++ b/tensorflow/contrib/layers/python/layers/layers_test.py
@@ -1356,7 +1356,7 @@ class DropoutTest(test.TestCase):
     with self.cached_session():
       images = np.random.uniform(size=(5, height, width, 3))
       output = _layers.dropout(images)
-      self.assertEqual(output.op.name, 'Dropout/dropout_1/mul')
+      self.assertEqual(output.op.name, 'Dropout/dropout_1/mul_1')
       output.get_shape().assert_is_compatible_with(
           ops.convert_to_tensor(images).get_shape())
 
diff --git a/tensorflow/contrib/layers/python/layers/normalization.py b/tensorflow/contrib/layers/python/layers/normalization.py
index 11033a2e9cb646c2e7cd2f45de1f751d88c6921a..76b03ff514821d3459f84c5f46a64d1134e0d4de 100644
--- a/tensorflow/contrib/layers/python/layers/normalization.py
+++ b/tensorflow/contrib/layers/python/layers/normalization.py
@@ -186,7 +186,7 @@ def group_norm(inputs,
 
   Args:
     inputs: A Tensor with at least 2 dimensions one which is channels. All
-     shape dimensions must be fully defined.
+     shape dimensions except for batch must be fully defined.
     groups: Integer. Divide the channels into this number of groups over which
       normalization statistics are computed. This number must be commensurate
       with the number of channels in `inputs`.
@@ -249,13 +249,21 @@ def group_norm(inputs,
   """
   # TODO(shlens): Support partially defined shapes for the inputs.
   inputs = ops.convert_to_tensor(inputs)
-  original_shape = inputs.shape
 
   if inputs.shape.ndims is None:
     raise ValueError('Inputs %s has undefined rank.' % inputs.name)
   if channels_axis > (inputs.shape.ndims - 1):
     raise ValueError('Axis is out of bounds.')
 
+  # Use dynamic shape for not fully defined dimensions in the inputs.
+  dyanmic_shape = array_ops.shape(inputs)
+  input_shape_list = []
+  for i, dim in enumerate(inputs.shape):
+    if dim.value is None:
+      input_shape_list.append(dyanmic_shape[i])
+    else:
+      input_shape_list.append(dim)
+
   # Standardize the channels_axis to be positive and identify # of channels.
   if channels_axis < 0:
     channels_axis = inputs.shape.ndims + channels_axis
@@ -289,8 +297,8 @@ def group_norm(inputs,
   # Determine axes before channels. Some examples of common image formats:
   #  'NCHW': before = [N], after = [HW]
   #  'NHWC': before = [NHW], after = []
-  axes_before_channels = inputs.shape.as_list()[:channels_axis]
-  axes_after_channels = inputs.shape.as_list()[channels_axis+1:]
+  axes_before_channels = input_shape_list[:channels_axis]
+  axes_after_channels = input_shape_list[channels_axis+1:]
 
   # Manually broadcast the parameters to conform to the number of groups.
   params_shape_broadcast = ([1] * len(axes_before_channels) +
@@ -369,7 +377,7 @@ def group_norm(inputs,
     outputs = inputs * gain + offset
 
     # Collapse the groups into the channel dimension.
-    outputs = array_ops.reshape(outputs, original_shape)
+    outputs = array_ops.reshape(outputs, input_shape_list)
 
     if activation_fn is not None:
       outputs = activation_fn(outputs)
diff --git a/tensorflow/contrib/layers/python/layers/normalization_test.py b/tensorflow/contrib/layers/python/layers/normalization_test.py
index c8d3c91b10dbe3b959e91182f9924b78352d370d..9a85084b239837ade87d8c778393ef8e885f5bdd 100644
--- a/tensorflow/contrib/layers/python/layers/normalization_test.py
+++ b/tensorflow/contrib/layers/python/layers/normalization_test.py
@@ -221,6 +221,15 @@ class GroupNormTest(test.TestCase):
       normalization.group_norm(inputs, channels_axis=-1,
                                reduction_axes=[-3, -2])
 
+  def testParamsShapeNotFullyDefinedBatchAxis(self):
+    height, width, groups = 3, 3, 4
+    inputs = array_ops.placeholder(dtypes.float32,
+                                   shape=(None, height, width, 2*groups))
+    output = normalization.group_norm(inputs, channels_axis=-1,
+                                      reduction_axes=[-3, -2], groups=groups)
+    self.assertListEqual([None, height, width, 2 * groups],
+                         output.shape.as_list())
+
   def testCreateOp(self):
     height, width, groups = 3, 3, 4
     images = random_ops.random_uniform((5, height, width, 2*groups), seed=1)
diff --git a/tensorflow/contrib/layers/python/layers/target_column.py b/tensorflow/contrib/layers/python/layers/target_column.py
index 8a6b4f68a8b33d497ddb16614a7e3cdf32f2c422..5234869718b427d7e275b76ae12021a096241a56 100644
--- a/tensorflow/contrib/layers/python/layers/target_column.py
+++ b/tensorflow/contrib/layers/python/layers/target_column.py
@@ -399,7 +399,7 @@ def _mean_squared_loss(logits, target):
     target = array_ops.expand_dims(target, axis=1)
 
   logits.get_shape().assert_is_compatible_with(target.get_shape())
-  return math_ops.square(logits - math_ops.to_float(target))
+  return math_ops.squared_difference(logits, math_ops.to_float(target))
 
 
 def _log_loss_with_two_classes(logits, target):
diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD
index 14065fcee51c014a1af227504eaaca1fa39941e1..4749371248ee89a033912132986d7f76c85dbaa6 100644
--- a/tensorflow/contrib/learn/BUILD
+++ b/tensorflow/contrib/learn/BUILD
@@ -357,9 +357,9 @@ py_test(
 
 py_test(
     name = "dnn_linear_combined_test",
-    size = "large",
+    size = "medium",
     srcs = ["python/learn/estimators/dnn_linear_combined_test.py"],
-    shard_count = 4,
+    shard_count = 8,
     srcs_version = "PY2AND3",
     tags = ["no_oss"],  # flaky b/70524820
     deps = [
diff --git a/tensorflow/contrib/learn/README.md b/tensorflow/contrib/learn/README.md
index b0bff915a993c9a01e2e6d9ef9f71c14d2f29a73..b2d3a6273abba7e3a893f30bbdd4f8b2662bd54a 100644
--- a/tensorflow/contrib/learn/README.md
+++ b/tensorflow/contrib/learn/README.md
@@ -111,18 +111,17 @@ Some arguments are renamed, please refer to documentation. In addition:
 
 Switch to `tf.estimator.train_and_evaluate`. Some differences:
 
-* Most of the constructor arguments, like `train_input_fn`, `eval_input_fn`,
-  should be wrapped into `tf.estimator.TrainSpec` and `tf.estimator.EvalSpec`.
-* Remove the `experiment_fn`. Instead, create the `Estimator`,
-  `train_spec` and `eval_spec`, then call `tf.estimator.train_and_evaluate`
-  directly.
-* Inside `tf.estimator.EvalSpec`, the `exporter` field is the replacement
-  for `export_strategy`. To be precise, `tf.estimator.LatestExporter` is the
-  replacement for `tf.contrib.learn.make_export_strategy`. If you want to export
-  only at the end of training  use `tf.estimator.FinalExporter`.
-* If the `TF_CONFIG` environment variable is constructed manually, please read
-  the `train_and_evaluate` documentation for the new requirementds (in
-  particular, the chief node and evaluator node).
+*   Most of the constructor arguments, like `train_input_fn`, `eval_input_fn`,
+    should be wrapped into `tf.estimator.TrainSpec` and `tf.estimator.EvalSpec`.
+*   Remove the `experiment_fn`. Instead, create the `Estimator`, `train_spec`
+    and `eval_spec`, then call `tf.estimator.train_and_evaluate` directly.
+*   Inside `tf.estimator.EvalSpec`, the `exporter` field is the replacement for
+    `export_strategy`. To be precise, `tf.estimator.LatestExporter` is the
+    replacement for `tf.contrib.learn.make_export_strategy`. If you want to
+    export only at the end of training use `tf.estimator.FinalExporter`.
+*   If the `TF_CONFIG` environment variable is constructed manually, please read
+    the `train_and_evaluate` documentation for the new requirements (in
+    particular, the chief node and evaluator node).
 
 ## Others Classes and Functions
 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py b/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py
index 28c4964527bb034c8c6b1642366c6c82c1a72201..c3e9e3af9427037a4e7be6b86417cd081c42ef67 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py
@@ -37,8 +37,8 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import lookup_ops
+from tensorflow.python.ops import map_fn
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import rnn_cell
@@ -524,7 +524,7 @@ class DynamicRNNEstimatorLearningTest(test.TestCase):
       def input_fn():
         starts = random_ops.random_uniform(
             [batch_size], maxval=(2 * np.pi), seed=seed)
-        sin_curves = functional_ops.map_fn(
+        sin_curves = map_fn.map_fn(
             _sin_fn, (starts,), dtype=dtypes.float32)
         inputs = array_ops.expand_dims(
             array_ops.slice(sin_curves, [0, 0], [batch_size, sequence_length]),
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
index 8a461a0bd7ba457fcf830769f23c6ca2860a2732..cbcae338a0a195da2aca1eea2e1b4c7eb8b0e35e 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator_test.py
@@ -1181,14 +1181,14 @@ class EstimatorTest(test.TestCase):
         ]
         self.assertItemsEqual([expected_vocab_file], assets)
         graph_ops = [x.name for x in graph.get_operations()]
-        self.assertTrue('input_example_tensor' in graph_ops)
-        self.assertTrue('ParseExample/ParseExample' in graph_ops)
-        self.assertTrue('linear/linear/feature/matmul' in graph_ops)
+        self.assertIn('input_example_tensor', graph_ops)
+        self.assertIn('ParseExample/ParseExample', graph_ops)
+        self.assertIn('linear/linear/feature/matmul', graph_ops)
         # Since there were no transforms, both save ops are still present.
-        self.assertTrue('save/SaveV2/tensor_names' in graph_ops)
-        self.assertTrue('save_1/SaveV2/tensor_names' in graph_ops)
+        self.assertIn('save/SaveV2/tensor_names', graph_ops)
+        self.assertIn('save_1/SaveV2/tensor_names', graph_ops)
         # Since there were no transforms, the hash table lookup is still there.
-        self.assertTrue('hash_table_Lookup' in graph_ops)
+        self.assertIn('hash_table_Lookup/LookupTableFindV2', graph_ops)
 
     # Restore, to validate that the export was well-formed.
     # tag_2, tag_3 was subjected to strip_unused_nodes.
diff --git a/tensorflow/contrib/learn/python/learn/estimators/head.py b/tensorflow/contrib/learn/python/learn/estimators/head.py
index c1b97d8b49613ea49d9813954da3b7a63d3ba04c..4bb14a6e63b159fa4d09c9ef20947d4b125de657 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/head.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/head.py
@@ -567,7 +567,8 @@ def _mean_squared_loss(labels, logits, weights=None):
     if len(logits.get_shape()) == 1:
       logits = array_ops.expand_dims(logits, axis=1)
     logits.get_shape().assert_is_compatible_with(labels.get_shape())
-    loss = math_ops.square(logits - math_ops.to_float(labels), name=name)
+    loss = math_ops.squared_difference(
+        logits, math_ops.to_float(labels), name=name)
     return _compute_weighted_loss(loss, weights)
 
 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/head_test.py b/tensorflow/contrib/learn/python/learn/estimators/head_test.py
index 7c2d9bb0767cb979dae9c84b5342d129225677ed..a52d25acf402bdda46771e9146a40cfb71e99d53 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/head_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/head_test.py
@@ -62,8 +62,8 @@ def _assert_no_variables(test_case):
 def _assert_metrics(test_case, expected_loss, expected_eval_metrics,
                     model_fn_ops):
   test_case.assertAlmostEqual(expected_loss, model_fn_ops.loss.eval(), places=4)
-  for k in six.iterkeys(expected_eval_metrics):
-    test_case.assertIn(k, six.iterkeys(model_fn_ops.eval_metric_ops))
+  for k in expected_eval_metrics:
+    test_case.assertIn(k, model_fn_ops.eval_metric_ops)
   variables.initialize_local_variables().run()
   for key, expected_value in six.iteritems(expected_eval_metrics):
     value_tensor, update_tensor = model_fn_ops.eval_metric_ops[key]
@@ -545,19 +545,19 @@ class MultiLabelHeadTest(test.TestCase):
       with session.Session():
         self.assertListEqual(
             [1, 0, 0], model_fn_ops.predictions["classes"].eval().tolist()[0])
-        self.assertItemsEqual(
-            ["head_name"], six.iterkeys(model_fn_ops.output_alternatives))
+        self.assertItemsEqual(["head_name"],
+                              list(model_fn_ops.output_alternatives))
         self.assertEqual(
             constants.ProblemType.CLASSIFICATION,
             model_fn_ops.output_alternatives["head_name"][0])
 
         predictions_for_serving = (
             model_fn_ops.output_alternatives["head_name"][1])
-        self.assertIn("classes", six.iterkeys(predictions_for_serving))
+        self.assertIn("classes", predictions_for_serving)
         self.assertAllEqual(
             [[b"0", b"1", b"2"], [b"0", b"1", b"2"]],
             predictions_for_serving["classes"].eval())
-        self.assertIn("probabilities", six.iterkeys(predictions_for_serving))
+        self.assertIn("probabilities", predictions_for_serving)
         self.assertAllClose(
             [[0.731059, 0.5, 0.5],
              [0.5, 0.5, 0.731059,]],
@@ -850,18 +850,18 @@ class BinaryClassificationHeadTest(test.TestCase):
       with session.Session():
         self.assertListEqual(
             [1, 1], list(model_fn_ops.predictions["classes"].eval()))
-        self.assertItemsEqual(
-            ["head_name"], six.iterkeys(model_fn_ops.output_alternatives))
+        self.assertItemsEqual(["head_name"],
+                              list(model_fn_ops.output_alternatives))
         self.assertEqual(
             constants.ProblemType.LOGISTIC_REGRESSION,
             model_fn_ops.output_alternatives["head_name"][0])
         predictions_for_serving = (
             model_fn_ops.output_alternatives["head_name"][1])
-        self.assertIn("classes", six.iterkeys(predictions_for_serving))
+        self.assertIn("classes", predictions_for_serving)
         predicted_classes = predictions_for_serving["classes"].eval().tolist()
         self.assertListEqual(
             [b"0", b"1"], predicted_classes[0])
-        self.assertIn("probabilities", six.iterkeys(predictions_for_serving))
+        self.assertIn("probabilities", predictions_for_serving)
 
   def testBinaryClassificationInferMode_withWeightColumn(self):
     n_classes = 2
@@ -1349,18 +1349,18 @@ class MultiClassHeadTest(test.TestCase):
         self.assertAllEqual(
             [0, 2],
             model_fn_ops.predictions["classes"].eval())
-        self.assertItemsEqual(
-            ["head_name"], six.iterkeys(model_fn_ops.output_alternatives))
+        self.assertItemsEqual(["head_name"],
+                              list(model_fn_ops.output_alternatives))
         self.assertEqual(
             constants.ProblemType.CLASSIFICATION,
             model_fn_ops.output_alternatives["head_name"][0])
         predictions_for_serving = (
             model_fn_ops.output_alternatives["head_name"][1])
-        self.assertIn("classes", six.iterkeys(predictions_for_serving))
+        self.assertIn("classes", predictions_for_serving)
         self.assertAllEqual(
             [[b"0", b"1", b"2"], [b"0", b"1", b"2"]],
             predictions_for_serving["classes"].eval())
-        self.assertIn("probabilities", six.iterkeys(predictions_for_serving))
+        self.assertIn("probabilities", predictions_for_serving)
         self.assertAllClose(
             [[0.576117, 0.2119416, 0.2119416],
              [0.2119416, 0.2119416, 0.576117]],
@@ -1401,18 +1401,18 @@ class MultiClassHeadTest(test.TestCase):
         self.assertAllEqual(
             [b"key0", b"key2"],
             model_fn_ops.predictions["classes"].eval())
-        self.assertItemsEqual(
-            ["head_name"], six.iterkeys(model_fn_ops.output_alternatives))
+        self.assertItemsEqual(["head_name"],
+                              list(model_fn_ops.output_alternatives))
         self.assertEqual(
             constants.ProblemType.CLASSIFICATION,
             model_fn_ops.output_alternatives["head_name"][0])
         predictions_for_serving = (
             model_fn_ops.output_alternatives["head_name"][1])
-        self.assertIn("classes", six.iterkeys(predictions_for_serving))
+        self.assertIn("classes", predictions_for_serving)
         self.assertAllEqual(
             [[b"key0", b"key1", b"key2"], [b"key0", b"key1", b"key2"]],
             predictions_for_serving["classes"].eval())
-        self.assertIn("probabilities", six.iterkeys(predictions_for_serving))
+        self.assertIn("probabilities", predictions_for_serving)
         self.assertAllClose(
             [[0.576117, 0.2119416, 0.2119416],
              [0.2119416, 0.2119416, 0.576117]],
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/generator_io_test.py b/tensorflow/contrib/learn/python/learn/learn_io/generator_io_test.py
index 5e90d1fa20535de3b5e25bc7ff8c3862cea5514c..318046733bf75a6d661d26f478118c8e944afe15 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/generator_io_test.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/generator_io_test.py
@@ -174,7 +174,7 @@ class GeneratorIoTest(test.TestCase):
       return np.arange(32, 36)
 
     with self.cached_session():
-      with self.assertRaisesRegexp(TypeError, 'x\(\) must be generator'):
+      with self.assertRaisesRegexp(TypeError, r'x\(\) must be generator'):
         failing_input_fn = generator_io.generator_input_fn(
             generator, batch_size=2, shuffle=False, num_epochs=1)
         failing_input_fn()
@@ -185,7 +185,7 @@ class GeneratorIoTest(test.TestCase):
       yield np.arange(32, 36)
 
     with self.cached_session():
-      with self.assertRaisesRegexp(TypeError, 'x\(\) must yield dict'):
+      with self.assertRaisesRegexp(TypeError, r'x\(\) must yield dict'):
         failing_input_fn = generator_io.generator_input_fn(
             generator, batch_size=2, shuffle=False, num_epochs=1)
         failing_input_fn()
diff --git a/tensorflow/contrib/learn/python/learn/utils/gc_test.py b/tensorflow/contrib/learn/python/learn/utils/gc_test.py
index e7d091e18a8f186f89f5217442c24fb106c5cdab..af93e517f51ed33a8968982945ac1f65ec915ab1 100644
--- a/tensorflow/contrib/learn/python/learn/utils/gc_test.py
+++ b/tensorflow/contrib/learn/python/learn/utils/gc_test.py
@@ -36,10 +36,10 @@ def _create_parser(base_dir):
     # Modify the path object for RegEx match for Windows Paths
     if os.name == "nt":
       match = re.match(
-          "^" + compat.as_str_any(base_dir).replace("\\", "/") + "/(\\d+)$",
+          r"^" + compat.as_str_any(base_dir).replace("\\", "/") + r"/(\d+)$",
           compat.as_str_any(path.path).replace("\\", "/"))
     else:
-      match = re.match("^" + compat.as_str_any(base_dir) + "/(\\d+)$",
+      match = re.match(r"^" + compat.as_str_any(base_dir) + r"/(\d+)$",
                        compat.as_str_any(path.path))
     if not match:
       return None
diff --git a/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py b/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py
index a28394964a12013c43d85701b5a0ab5c559afd62..8fda828e994bc2436eaba4475077020436703631 100644
--- a/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py
+++ b/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py
@@ -36,7 +36,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.util import deprecation
 
 
-# TODO(rohanj): This should subclass Checkpointable and implement
+# TODO(rohanj): This should subclass Trackable and implement
 # _gather_saveables_for_checkpoint.
 class ShardedMutableDenseHashTable(object):
   """A sharded version of MutableDenseHashTable.
diff --git a/tensorflow/contrib/lookup/lookup_ops.py b/tensorflow/contrib/lookup/lookup_ops.py
index e52fb5ab1431e086f99b4033a6216636a83bad79..3d21fb68a1452c97f7eb85491fc850d9e846266a 100644
--- a/tensorflow/contrib/lookup/lookup_ops.py
+++ b/tensorflow/contrib/lookup/lookup_ops.py
@@ -18,8 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import functools
-
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -28,7 +26,6 @@ from tensorflow.python.ops import lookup_ops
 # pylint: disable=unused-import
 from tensorflow.python.ops.lookup_ops import FastHashSpec
 from tensorflow.python.ops.lookup_ops import HasherSpec
-from tensorflow.python.ops.lookup_ops import HashTable
 from tensorflow.python.ops.lookup_ops import IdTableWithHashBuckets
 from tensorflow.python.ops.lookup_ops import index_table_from_file
 from tensorflow.python.ops.lookup_ops import index_to_string_table_from_file
@@ -42,7 +39,6 @@ from tensorflow.python.ops.lookup_ops import TextFileIndex
 from tensorflow.python.ops.lookup_ops import TextFileInitializer
 from tensorflow.python.ops.lookup_ops import TextFileStringTableInitializer
 # pylint: enable=unused-import
-from tensorflow.python.training.saver import BaseSaverBuilder
 from tensorflow.python.util.deprecation import deprecated
 
 
@@ -91,7 +87,7 @@ def index_table_from_tensor(mapping,
   The bucket ID range is `[mapping size, mapping size + num_oov_buckets - 1]`.
 
   The underlying table must be initialized by calling
-  `tf.tables_initializer.run()` or `table.initializer.run()` once.
+  `session.run(tf.tables_initializer)` or `session.run(table.init)` once.
 
   Elements in `mapping` cannot have duplicates, otherwise when executing the
   table initializer op, it will throw a `FailedPreconditionError`.
@@ -158,7 +154,7 @@ def string_to_index(tensor, mapping, default_value=-1, name=None):
   will throw a FailedPreconditionError.
 
   The underlying table must be initialized by calling
-  `tf.tables_initializer.run()` once.
+  `session.run(tf.tables_initializer)` once.
 
   For example:
 
@@ -202,7 +198,7 @@ def index_to_string_table_from_tensor(mapping, default_value="UNK", name=None):
   (an out-of-vocabulary entry) is assigned the `default_value`
 
   The underlying table must be initialized by calling
-  `tf.tables_initializer.run()` or `table.initializer.run()` once.
+  `session.run(tf.tables_initializer)` or `session.run(table.init)` once.
 
   Elements in `mapping` cannot have duplicates, otherwise when executing the
   table initializer op, it will throw a `FailedPreconditionError`.
@@ -257,7 +253,7 @@ def index_to_string(tensor, mapping, default_value="UNK", name=None):
   (an out-of-vocabulary entry) is assigned the `default_value`
 
   The underlying table must be initialized by calling
-  `tf.tables_initializer.run()` once.
+  `session.run(tf.tables_initializer)` once.
 
   For example:
 
@@ -288,353 +284,52 @@ def index_to_string(tensor, mapping, default_value="UNK", name=None):
   return table.lookup(tensor)
 
 
-class MutableHashTable(LookupInterface):
-  """A generic mutable hash table implementation.
-
-  Data can be inserted by calling the insert method and removed by calling the
-  remove method. It does not support initialization via the init method.
+class HashTable(InitializableLookupTableBase):
+  """A generic hash table implementation.
 
   Example usage:
 
   ```python
-  table = tf.contrib.lookup.MutableHashTable(key_dtype=tf.string,
-                                             value_dtype=tf.int64,
-                                             default_value=-1)
-  sess.run(table.insert(keys, values))
-  out = table.lookup(query_keys)
+  table = tf.HashTable(
+      tf.KeyValueTensorInitializer(keys, values), -1)
+  out = table.lookup(input_tensor)
+  table.init.run()
   print(out.eval())
   ```
   """
 
-  def __init__(self,
-               key_dtype,
-               value_dtype,
-               default_value,
-               shared_name=None,
-               name="MutableHashTable",
-               checkpoint=True):
-    """Creates an empty `MutableHashTable` object.
+  def __init__(self, initializer, default_value, shared_name=None, name=None):
+    """Creates a non-initialized `HashTable` object.
 
-    Creates a table, the type of its keys and values are specified by key_dtype
-    and value_dtype, respectively.
+    Creates a table, the type of its keys and values are specified by the
+    initializer.
+    Before using the table you will have to initialize it. After initialization
+    the table will be immutable.
 
     Args:
-      key_dtype: the type of the key tensors.
-      value_dtype: the type of the value tensors.
+      initializer: The table initializer to use. See `HashTable` kernel for
+        supported key and value types.
       default_value: The value to use if a key is missing in the table.
-      shared_name: If non-empty, this table will be shared under
-        the given name across multiple sessions.
+      shared_name: If non-empty, this table will be shared under the given name
+        across multiple sessions.
       name: A name for the operation (optional).
-      checkpoint: if True, the contents of the table are saved to and restored
-        from checkpoints. If `shared_name` is empty for a checkpointed table, it
-        is shared using the table node name.
 
     Returns:
-      A `MutableHashTable` object.
-
-    Raises:
-      ValueError: If checkpoint is True and no name was specified.
+      A `HashTable` object.
     """
-    self._default_value = ops.convert_to_tensor(default_value,
-                                                dtype=value_dtype)
-    self._value_shape = self._default_value.get_shape()
-    self._checkpoint = checkpoint
-    self._key_dtype = key_dtype
-    self._value_dtype = value_dtype
-    self._name = name
-
-    if context.executing_eagerly() and shared_name is None:
-      # TODO(allenl): This will leak memory due to kernel caching by the
-      # shared_name attribute value (but is better than the alternative of
-      # sharing everything by default when executing eagerly; hopefully creating
-      # tables in a loop is uncommon).
-      shared_name = "table_%d" % (ops.uid(),)
+    self._initializer = initializer
+    self._default_value = default_value
     self._shared_name = shared_name
-    super(MutableHashTable, self).__init__(key_dtype, value_dtype)
-
-    self._resource_handle = self.create_resource()
-    if checkpoint:
-      saveable = MutableHashTable._Saveable(self, name)
-      if not context.executing_eagerly():
-        ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
-
-  def create_resource(self):
-    # The table must be shared if checkpointing is requested for multi-worker
-    # training to work correctly. Use the node name if no shared_name has been
-    # explicitly specified.
-    use_node_name_sharing = self._checkpoint and self._shared_name is None
-    if self._default_value.get_shape().ndims == 0:
-      table_ref = gen_lookup_ops.mutable_hash_table_v2(
-          shared_name=self._shared_name,
-          use_node_name_sharing=use_node_name_sharing,
-          key_dtype=self._key_dtype,
-          value_dtype=self._value_dtype,
-          name=self._name)
-    else:
-      table_ref = gen_lookup_ops.mutable_hash_table_of_tensors_v2(
-          shared_name=self._shared_name,
-          use_node_name_sharing=use_node_name_sharing,
-          key_dtype=self._key_dtype,
-          value_dtype=self._value_dtype,
-          value_shape=self._default_value.get_shape(),
-          name=self._name)
-
-    if context.executing_eagerly():
-      self._table_name = None
-    else:
-      self._table_name = table_ref.op.name.split("/")[-1]
-    return table_ref
-
-  @property
-  def name(self):
-    return self._table_name
-
-  def size(self, name=None):
-    """Compute the number of elements in this table.
-
-    Args:
-      name: A name for the operation (optional).
-
-    Returns:
-      A scalar tensor containing the number of elements in this table.
-    """
-    with ops.name_scope(name, "%s_Size" % self.name,
-                        [self.resource_handle]) as name:
-      with ops.colocate_with(self.resource_handle):
-        return gen_lookup_ops.lookup_table_size_v2(
-            self.resource_handle, name=name)
-
-  def remove(self, keys, name=None):
-    """Removes `keys` and its associated values from the table.
-
-    If a key is not present in the table, it is silently ignored.
-
-    Args:
-      keys: Keys to remove. Can be a tensor of any shape. Must match the table's
-        key type.
-      name: A name for the operation (optional).
-
-    Returns:
-      The created Operation.
-
-    Raises:
-      TypeError: when `keys` do not match the table data types.
-    """
-    if keys.dtype != self._key_dtype:
-      raise TypeError("Signature mismatch. Keys must be dtype %s, got %s." %
-                      (self._key_dtype, keys.dtype))
-
-    with ops.name_scope(
-        name, "%s_lookup_table_remove" % self.name,
-        (self.resource_handle, keys, self._default_value)) as name:
-      # pylint: disable=protected-access
-      op = gen_lookup_ops.lookup_table_remove_v2(
-          self.resource_handle, keys, name=name)
-
-    return op
-
-  def lookup(self, keys, name=None):
-    """Looks up `keys` in a table, outputs the corresponding values.
-
-    The `default_value` is used for keys not present in the table.
-
-    Args:
-      keys: Keys to look up. Can be a tensor of any shape. Must match the
-        table's key_dtype.
-      name: A name for the operation (optional).
-
-    Returns:
-      A tensor containing the values in the same shape as `keys` using the
-        table's value type.
-
-    Raises:
-      TypeError: when `keys` do not match the table data types.
-    """
-    with ops.name_scope(
-        name, "%s_lookup_table_find" % self.name,
-        (self.resource_handle, keys, self._default_value)) as name:
-      keys = ops.convert_to_tensor(keys, dtype=self._key_dtype, name="keys")
-      with ops.colocate_with(self.resource_handle):
-        values = gen_lookup_ops.lookup_table_find_v2(
-            self.resource_handle, keys, self._default_value, name=name)
-    return values
-
-  def insert(self, keys, values, name=None):
-    """Associates `keys` with `values`.
-
-    Args:
-      keys: Keys to insert. Can be a tensor of any shape. Must match the
-        table's key type.
-      values: Values to be associated with keys. Must be a tensor of the same
-        shape as `keys` and match the table's value type.
-      name: A name for the operation (optional).
-
-    Returns:
-      The created Operation.
-
-    Raises:
-      TypeError: when `keys` or `values` doesn't match the table data
-        types.
-    """
-    with ops.name_scope(name, "%s_lookup_table_insert" % self.name,
-                        [self.resource_handle, keys, values]) as name:
-      keys = ops.convert_to_tensor(keys, self._key_dtype, name="keys")
-      values = ops.convert_to_tensor(values, self._value_dtype, name="values")
-      with ops.colocate_with(self.resource_handle):
-        # pylint: disable=protected-access
-        op = gen_lookup_ops.lookup_table_insert_v2(
-            self.resource_handle, keys, values, name=name)
-    return op
-
-  def export(self, name=None):
-    """Returns tensors of all keys and values in the table.
-
-    Args:
-      name: A name for the operation (optional).
-
-    Returns:
-      A pair of tensors with the first tensor containing all keys and the
-        second tensors containing all values in the table.
-    """
-    with ops.name_scope(name, "%s_lookup_table_export_values" % self.name,
-                        [self.resource_handle]) as name:
-      with ops.colocate_with(self.resource_handle):
-        exported_keys, exported_values = gen_lookup_ops.lookup_table_export_v2(
-            self.resource_handle, self._key_dtype, self._value_dtype, name=name)
-    return exported_keys, exported_values
-
-  def _gather_saveables_for_checkpoint(self):
-    """For object-based checkpointing."""
-    return {"table": functools.partial(MutableHashTable._Saveable, table=self)}
-
-  class _Saveable(BaseSaverBuilder.SaveableObject):
-    """SaveableObject implementation for MutableHashTable."""
-
-    def __init__(self, table, name):
-      tensors = table.export()
-      specs = [
-          BaseSaverBuilder.SaveSpec(tensors[0], "", name + "-keys"),
-          BaseSaverBuilder.SaveSpec(tensors[1], "", name + "-values")
-      ]
-      # pylint: disable=protected-access
-      super(MutableHashTable._Saveable, self).__init__(table, specs, name)
-
-    def restore(self, restored_tensors, restored_shapes):
-      del restored_shapes  # unused
-      # pylint: disable=protected-access
-      with ops.colocate_with(self.op.resource_handle):
-        return gen_lookup_ops.lookup_table_import_v2(
-            self.op.resource_handle, restored_tensors[0], restored_tensors[1])
-
-
-class MutableDenseHashTable(LookupInterface):
-  """A generic mutable hash table implementation using tensors as backing store.
-
-  Data can be inserted by calling the insert method and removed by calling the
-  remove method. It does not support initialization via the init method.
-
-  It uses "open addressing" with quadratic reprobing to resolve collisions.
-  Compared to `MutableHashTable` the insert, remove and lookup operations in a
-  `MutableDenseHashTable` are typically faster, but memory usage can be higher.
-  However, `MutableDenseHashTable` does not require additional memory for
-  temporary tensors created during checkpointing and restore operations.
-
-  Example usage:
-
-  ```python
-  table = tf.contrib.lookup.MutableDenseHashTable(key_dtype=tf.int64,
-                                                  value_dtype=tf.int64,
-                                                  default_value=-1,
-                                                  empty_key=0,
-                                                  deleted_key=-1)
-
-  sess.run(table.insert(keys, values))
-  out = table.lookup(query_keys)
-  print(out.eval())
-  ```
-  """
-
-  # TODO(andreasst): consider extracting common code with MutableHashTable into
-  # a common superclass.
-  def __init__(self,
-               key_dtype,
-               value_dtype,
-               default_value,
-               empty_key,
-               deleted_key,
-               initial_num_buckets=None,
-               shared_name=None,
-               name="MutableDenseHashTable",
-               checkpoint=True):
-    """Creates an empty `MutableDenseHashTable` object.
-
-    Creates a table, the type of its keys and values are specified by key_dtype
-    and value_dtype, respectively.
-
-    Args:
-      key_dtype: the type of the key tensors.
-      value_dtype: the type of the value tensors.
-      default_value: The value to use if a key is missing in the table.
-      empty_key: the key to use to represent empty buckets internally. Must not
-        be used in insert, remove or lookup operations.
-      initial_num_buckets: the initial number of buckets.
-      shared_name: If non-empty, this table will be shared under
-        the given name across multiple sessions.
-      name: A name for the operation (optional).
-      checkpoint: if True, the contents of the table are saved to and restored
-        from checkpoints. If `shared_name` is empty for a checkpointed table, it
-        is shared using the table node name.
-      deleted_key: the key to use to represent deleted buckets internally. Must
-        not be used in insert, remove or lookup operations and be different from
-        the empty_key.
-
-    Returns:
-      A `MutableDenseHashTable` object.
-
-    Raises:
-      ValueError: If checkpoint is True and no name was specified.
-    """
-    self._default_value = ops.convert_to_tensor(
-        default_value, dtype=value_dtype, name="default_value")
-    self._key_dtype = key_dtype
-    self._value_dtype = value_dtype
-    self._initial_num_buckets = initial_num_buckets
+    self._name = name or "hash_table"
+    self._table_name = None
+    super(HashTable, self).__init__(default_value, initializer)
     self._value_shape = self._default_value.get_shape()
-    self._checkpoint = checkpoint
-    self._name = name
-
-    self._empty_key = ops.convert_to_tensor(
-        empty_key, dtype=key_dtype, name="empty_key")
-    self._deleted_key = ops.convert_to_tensor(
-        deleted_key, dtype=key_dtype, name="deleted_key")
-    if context.executing_eagerly() and shared_name is None:
-      # TODO(allenl): This will leak memory due to kernel caching by the
-      # shared_name attribute value (but is better than the alternative of
-      # sharing everything by default when executing eagerly; hopefully creating
-      # tables in a loop is uncommon).
-      shared_name = "table_%d" % (ops.uid(),)
-    self._shared_name = shared_name
-    super(MutableDenseHashTable, self).__init__(key_dtype, value_dtype)
-
-    self._resource_handle = self.create_resource()
-    if checkpoint:
-      saveable = MutableDenseHashTable._Saveable(self, name)
-      if not context.executing_eagerly():
-        ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
 
   def create_resource(self):
-    # The table must be shared if checkpointing is requested for multi-worker
-    # training to work correctly. Use the node name if no shared_name has been
-    # explicitly specified.
-    use_node_name_sharing = self._checkpoint and self._shared_name is None
-    table_ref = gen_lookup_ops.mutable_dense_hash_table_v2(
-        empty_key=self._empty_key,
-        deleted_key=self._deleted_key,
+    table_ref = gen_lookup_ops.hash_table_v2(
         shared_name=self._shared_name,
-        use_node_name_sharing=use_node_name_sharing,
-        value_dtype=self._value_dtype,
-        value_shape=self._value_shape,
-        initial_num_buckets=self._initial_num_buckets,
+        key_dtype=self._initializer.key_dtype,
+        value_dtype=self._initializer.value_dtype,
         name=self._name)
     if context.executing_eagerly():
       self._table_name = None
@@ -646,103 +341,6 @@ class MutableDenseHashTable(LookupInterface):
   def name(self):
     return self._table_name
 
-  def size(self, name=None):
-    """Compute the number of elements in this table.
-
-    Args:
-      name: A name for the operation (optional).
-
-    Returns:
-      A scalar tensor containing the number of elements in this table.
-    """
-    with ops.name_scope(name, "%s_Size" % self.name,
-                        [self.resource_handle]) as name:
-      with ops.colocate_with(self.resource_handle):
-        return gen_lookup_ops.lookup_table_size_v2(
-            self.resource_handle, name=name)
-
-  def lookup(self, keys, name=None):
-    """Looks up `keys` in a table, outputs the corresponding values.
-
-    The `default_value` is used for keys not present in the table.
-
-    Args:
-      keys: Keys to look up. Can be a tensor of any shape. Must match the
-        table's key_dtype.
-      name: A name for the operation (optional).
-
-    Returns:
-      A tensor containing the values in the same shape as `keys` using the
-        table's value type.
-
-    Raises:
-      TypeError: when `keys` do not match the table data types.
-    """
-    with ops.name_scope(name, "%s_lookup_table_find" % self.name,
-                        [self.resource_handle, keys]) as name:
-      keys = ops.convert_to_tensor(keys, dtype=self._key_dtype, name="keys")
-      with ops.colocate_with(self.resource_handle):
-        values = gen_lookup_ops.lookup_table_find_v2(
-            self.resource_handle, keys, self._default_value, name=name)
-
-    return values
-
-  def insert(self, keys, values, name=None):
-    """Associates `keys` with `values`.
-
-    Args:
-      keys: Keys to insert. Can be a tensor of any shape. Must match the
-        table's key type.
-      values: Values to be associated with keys. Must be a tensor of the same
-        shape as `keys` and match the table's value type.
-      name: A name for the operation (optional).
-
-    Returns:
-      The created Operation.
-
-    Raises:
-      TypeError: when `keys` or `values` doesn't match the table data
-        types.
-    """
-    with ops.name_scope(name, "%s_lookup_table_insert" % self.name,
-                        [self.resource_handle, keys, values]) as name:
-      keys = ops.convert_to_tensor(keys, dtype=self._key_dtype, name="keys")
-      values = ops.convert_to_tensor(
-          values, dtype=self._value_dtype, name="values")
-      with ops.colocate_with(self.resource_handle):
-        op = gen_lookup_ops.lookup_table_insert_v2(
-            self.resource_handle, keys, values, name=name)
-      return op
-
-  def remove(self, keys, name=None):
-    """Removes `keys` and its associated values from the table.
-
-    If a key is not present in the table, it is silently ignored.
-
-    Args:
-      keys: Keys to remove. Can be a tensor of any shape. Must match the table's
-        key type.
-      name: A name for the operation (optional).
-
-    Returns:
-      The created Operation.
-
-    Raises:
-      TypeError: when `keys` do not match the table data types.
-    """
-    if keys.dtype != self._key_dtype:
-      raise TypeError("Signature mismatch. Keys must be dtype %s, got %s." %
-                      (self._key_dtype, keys.dtype))
-
-    with ops.name_scope(
-        name, "%s_lookup_table_remove" % self.name,
-        (self.resource_handle, keys, self._default_value)) as name:
-      # pylint: disable=protected-access
-      op = gen_lookup_ops.lookup_table_remove_v2(
-          self.resource_handle, keys, name=name)
-
-    return op
-
   def export(self, name=None):
     """Returns tensors of all keys and values in the table.
 
@@ -753,34 +351,15 @@ class MutableDenseHashTable(LookupInterface):
       A pair of tensors with the first tensor containing all keys and the
         second tensors containing all values in the table.
     """
-    with ops.name_scope(name, "%s_lookup_table_export_values" % self.name,
+    with ops.name_scope(name, "%s_Export" % self.name,
                         [self.resource_handle]) as name:
-      with ops.colocate_with(self.resource_handle):
-        exported_keys, exported_values = gen_lookup_ops.lookup_table_export_v2(
-            self.resource_handle, self._key_dtype, self._value_dtype, name=name)
+      exported_keys, exported_values = gen_lookup_ops.lookup_table_export_v2(
+          self.resource_handle, self._key_dtype, self._value_dtype, name=name)
 
+    exported_values.set_shape(exported_keys.get_shape().concatenate(
+        self._value_shape))
     return exported_keys, exported_values
 
-  def _gather_saveables_for_checkpoint(self):
-    """For object-based checkpointing."""
-    return {"table": functools.partial(
-        MutableDenseHashTable._Saveable, table=self)}
-
-  class _Saveable(BaseSaverBuilder.SaveableObject):
-    """SaveableObject implementation for MutableDenseHashTable."""
-
-    def __init__(self, table, name):
-      tensors = table.export()
-      specs = [
-          BaseSaverBuilder.SaveSpec(tensors[0], "", name + "-keys"),
-          BaseSaverBuilder.SaveSpec(tensors[1], "", name + "-values")
-      ]
-      # pylint: disable=protected-access
-      super(MutableDenseHashTable._Saveable, self).__init__(table, specs, name)
-
-    def restore(self, restored_tensors, restored_shapes):
-      del restored_shapes  # unused
-      # pylint: disable=protected-access
-      with ops.colocate_with(self.op.resource_handle):
-        return gen_lookup_ops.lookup_table_import_v2(
-            self.op.resource_handle, restored_tensors[0], restored_tensors[1])
+
+MutableHashTable = lookup_ops.MutableHashTable
+MutableDenseHashTable = lookup_ops.MutableDenseHashTable
diff --git a/tensorflow/contrib/lookup/lookup_ops_test.py b/tensorflow/contrib/lookup/lookup_ops_test.py
index 9b2c2dd87cc8a92fbb6b45504939be3788b60839..9fe8dafcc8edd6b80625c61a4a0e783e65b44720 100644
--- a/tensorflow/contrib/lookup/lookup_ops_test.py
+++ b/tensorflow/contrib/lookup/lookup_ops_test.py
@@ -18,14 +18,10 @@ from __future__ import division
 from __future__ import print_function
 
 import os
-import tempfile
 import numpy as np
-import six
 
 from tensorflow.contrib import lookup
 from tensorflow.python.client import session
-from tensorflow.python.data.experimental.ops import counter
-from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -37,9 +33,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
-from tensorflow.python.training import saver
 from tensorflow.python.training import server_lib
-from tensorflow.python.training.checkpointable import util as checkpointable
 
 
 class HashTableOpTest(test.TestCase):
@@ -299,1240 +293,6 @@ class HashTableOpTest(test.TestCase):
       self.assertAllEqual([b"brain", b"salad", b"n/a"], result)
 
 
-class MutableHashTableOpTest(test.TestCase):
-
-  def testMutableHashTable(self):
-    with self.cached_session():
-      default_val = -1
-      keys = constant_op.constant(["brain", "salad", "surgery", "tarkus"])
-      values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
-      table = lookup.MutableHashTable(dtypes.string, dtypes.int64,
-                                      default_val)
-      self.assertAllEqual(0, table.size().eval())
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(4, table.size().eval())
-
-      remove_string = constant_op.constant(["tarkus", "tank"])
-      table.remove(remove_string).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      input_string = constant_op.constant(["brain", "salad", "tank"])
-      output = table.lookup(input_string)
-      self.assertAllEqual([3], output.get_shape())
-
-      result = output.eval()
-      self.assertAllEqual([0, 1, -1], result)
-
-      exported_keys, exported_values = table.export()
-      self.assertAllEqual([None], exported_keys.get_shape().as_list())
-      self.assertAllEqual([None], exported_values.get_shape().as_list())
-
-      # exported data is in the order of the internal map, i.e. undefined
-      sorted_keys = np.sort(exported_keys.eval())
-      sorted_values = np.sort(exported_values.eval())
-      self.assertAllEqual([b"brain", b"salad", b"surgery"], sorted_keys)
-      self.assertAllEqual([0, 1, 2], sorted_values)
-
-  def testSaveRestore(self):
-    save_dir = os.path.join(self.get_temp_dir(), "save_restore")
-    save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
-
-    with self.session(graph=ops.Graph()) as sess:
-      v0 = variables.Variable(10.0, name="v0")
-      v1 = variables.Variable(20.0, name="v1")
-
-      default_val = -1
-      keys = constant_op.constant(["b", "c", "d"], dtypes.string)
-      values = constant_op.constant([0, 1, 2], dtypes.int64)
-      table = lookup.MutableHashTable(
-          dtypes.string, dtypes.int64, default_val, name="t1", checkpoint=True)
-
-      save = saver.Saver()
-      variables.global_variables_initializer().run()
-
-      # Check that the parameter nodes have been initialized.
-      self.assertEqual(10.0, v0.eval())
-      self.assertEqual(20.0, v1.eval())
-
-      self.assertAllEqual(0, table.size().eval())
-      table.insert(keys, values).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      val = save.save(sess, save_path)
-      self.assertTrue(isinstance(val, six.string_types))
-      self.assertEqual(save_path, val)
-
-    with self.session(graph=ops.Graph()) as sess:
-      v0 = variables.Variable(-1.0, name="v0")
-      v1 = variables.Variable(-1.0, name="v1")
-      default_val = -1
-      table = lookup.MutableHashTable(
-          dtypes.string, dtypes.int64, default_val, name="t1", checkpoint=True)
-      table.insert(
-          constant_op.constant(["a", "c"], dtypes.string),
-          constant_op.constant([12, 24], dtypes.int64)).run()
-      self.assertAllEqual(2, table.size().eval())
-
-      save = saver.Saver()
-
-      # Restore the saved values in the parameter nodes.
-      save.restore(sess, save_path)
-      # Check that the parameter nodes have been restored.
-      self.assertEqual(10.0, v0.eval())
-      self.assertEqual(20.0, v1.eval())
-
-      self.assertAllEqual(3, table.size().eval())
-
-      input_string = constant_op.constant(["a", "b", "c", "d", "e"],
-                                          dtypes.string)
-      output = table.lookup(input_string)
-      self.assertAllEqual([-1, 0, 1, 2, -1], output.eval())
-
-  @test_util.run_in_graph_and_eager_modes
-  def testObjectSaveRestore(self):
-    save_dir = os.path.join(self.get_temp_dir(), "save_restore")
-    save_prefix = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
-
-    v0 = variables.Variable(10.0, name="v0")
-    v1 = variables.Variable(20.0, name="v1")
-
-    default_val = -1
-    keys = constant_op.constant(["b", "c", "d"], dtypes.string)
-    values = constant_op.constant([0, 1, 2], dtypes.int64)
-    table = lookup.MutableHashTable(
-        dtypes.string, dtypes.int64, default_val, name="t1", checkpoint=True)
-
-    checkpoint = checkpointable.Checkpoint(table=table, v0=v0, v1=v1)
-    self.evaluate([v0.initializer, v1.initializer])
-
-    # Check that the parameter nodes have been initialized.
-    self.assertEqual(10.0, self.evaluate(v0))
-    self.assertEqual(20.0, self.evaluate(v1))
-
-    self.assertAllEqual(0, self.evaluate(table.size()))
-    self.evaluate(table.insert(keys, values))
-    self.assertAllEqual(3, self.evaluate(table.size()))
-
-    save_path = checkpoint.save(save_prefix)
-    del table, checkpoint, v0, v1
-
-    v0 = variables.Variable(-1.0, name="v0")
-    v1 = variables.Variable(-1.0, name="v1")
-    default_val = -1
-    table = lookup.MutableHashTable(
-        dtypes.string, dtypes.int64, default_val, name="t1", checkpoint=True)
-    self.evaluate(table.insert(
-        constant_op.constant(["a", "c"], dtypes.string),
-        constant_op.constant([12, 24], dtypes.int64)))
-    self.assertAllEqual(2, self.evaluate(table.size()))
-
-    checkpoint = checkpointable.Checkpoint(table=table, v0=v0, v1=v1)
-
-    # Restore the saved values in the parameter nodes.
-    checkpoint.restore(save_path).run_restore_ops()
-    # Check that the parameter nodes have been restored.
-    self.assertEqual(10.0, self.evaluate(v0))
-    self.assertEqual(20.0, self.evaluate(v1))
-
-    self.assertAllEqual(3, self.evaluate(table.size()))
-
-    input_string = constant_op.constant(["a", "b", "c", "d", "e"],
-                                        dtypes.string)
-    output = table.lookup(input_string)
-    self.assertAllEqual([-1, 0, 1, 2, -1], self.evaluate(output))
-
-  def testSharing(self):
-    # Start a server to store the table state
-    server = server_lib.Server(
-        {
-            "local0": ["localhost:0"]
-        }, protocol="grpc", start=True)
-    # Create two sessions sharing the same state
-    session1 = session.Session(server.target)
-    session2 = session.Session(server.target)
-
-    table = lookup.MutableHashTable(
-        dtypes.int64, dtypes.string, "-", name="t1")
-
-    # Populate the table in the first session
-    with session1:
-      self.assertAllEqual(0, table.size().eval())
-
-      keys = constant_op.constant([11, 12], dtypes.int64)
-      values = constant_op.constant(["a", "b"])
-      table.insert(keys, values).run()
-      self.assertAllEqual(2, table.size().eval())
-
-      output = table.lookup(constant_op.constant([11, 12, 13], dtypes.int64))
-      self.assertAllEqual([b"a", b"b", b"-"], output.eval())
-
-    # Verify that we can access the shared data from the second session
-    with session2:
-      self.assertAllEqual(2, table.size().eval())
-
-      output = table.lookup(constant_op.constant([10, 11, 12], dtypes.int64))
-      self.assertAllEqual([b"-", b"a", b"b"], output.eval())
-
-  def testMutableHashTableOfTensors(self):
-    with self.cached_session():
-      default_val = constant_op.constant([-1, -1], dtypes.int64)
-      keys = constant_op.constant(["brain", "salad", "surgery", "tarkus"])
-      values = constant_op.constant([[0, 1], [2, 3], [4, 5], [6, 7]],
-                                    dtypes.int64)
-      table = lookup.MutableHashTable(dtypes.string, dtypes.int64,
-                                      default_val)
-      self.assertAllEqual(0, table.size().eval())
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(4, table.size().eval())
-
-      remove_string = constant_op.constant(["tarkus", "tank"])
-      table.remove(remove_string).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      input_string = constant_op.constant(["brain", "salad", "tank"])
-      output = table.lookup(input_string)
-      self.assertAllEqual([3, 2], output.get_shape())
-
-      result = output.eval()
-      self.assertAllEqual([[0, 1], [2, 3], [-1, -1]], result)
-
-      exported_keys, exported_values = table.export()
-      self.assertAllEqual([None], exported_keys.get_shape().as_list(),
-                          msg="Saw shape %s" % exported_keys.shape)
-      self.assertAllEqual([None, 2], exported_values.get_shape().as_list(),
-                          msg="Saw shape %s" % exported_values.shape)
-      # exported data is in the order of the internal map, i.e. undefined
-      sorted_keys = np.sort(exported_keys.eval())
-      sorted_values = np.sort(exported_values.eval())
-      self.assertAllEqual([b"brain", b"salad", b"surgery"], sorted_keys)
-      self.assertAllEqual([[4, 5], [2, 3], [0, 1]], sorted_values)
-
-  def testMutableHashTableExportInsert(self):
-    with self.cached_session():
-      default_val = constant_op.constant([-1, -1], dtypes.int64)
-      keys = constant_op.constant(["brain", "salad", "surgery"])
-      values = constant_op.constant([[0, 1], [2, 3], [4, 5]], dtypes.int64)
-      table1 = lookup.MutableHashTable(dtypes.string, dtypes.int64,
-                                       default_val)
-      self.assertAllEqual(0, table1.size().eval())
-      table1.insert(keys, values).run()
-      self.assertAllEqual(3, table1.size().eval())
-
-      input_string = constant_op.constant(["brain", "salad", "tank"])
-      expected_output = [[0, 1], [2, 3], [-1, -1]]
-      output1 = table1.lookup(input_string)
-      self.assertAllEqual(expected_output, output1.eval())
-
-      exported_keys, exported_values = table1.export()
-      self.assertAllEqual(3, exported_keys.eval().size)
-      self.assertAllEqual(6, exported_values.eval().size)
-
-      # Populate a second table from the exported data
-      table2 = lookup.MutableHashTable(dtypes.string, dtypes.int64,
-                                       default_val)
-      self.assertAllEqual(0, table2.size().eval())
-      table2.insert(exported_keys, exported_values).run()
-      self.assertAllEqual(3, table2.size().eval())
-
-      # Verify lookup result is still the same
-      output2 = table2.lookup(input_string)
-      self.assertAllEqual(expected_output, output2.eval())
-
-  def testMutableHashTableOfTensorsInvalidShape(self):
-    with self.cached_session():
-      default_val = constant_op.constant([-1, -1], dtypes.int64)
-      keys = constant_op.constant(["brain", "salad", "surgery"])
-      table = lookup.MutableHashTable(dtypes.string, dtypes.int64,
-                                      default_val)
-
-      # Shape [6] instead of [3, 2]
-      values = constant_op.constant([0, 1, 2, 3, 4, 5], dtypes.int64)
-      with self.assertRaisesOpError("Expected shape"):
-        table.insert(keys, values).run()
-
-      # Shape [2,3] instead of [3, 2]
-      values = constant_op.constant([[0, 1, 2], [3, 4, 5]], dtypes.int64)
-      with self.assertRaisesOpError("Expected shape"):
-        table.insert(keys, values).run()
-
-      # Shape [2, 2] instead of [3, 2]
-      values = constant_op.constant([[0, 1], [2, 3]], dtypes.int64)
-      with self.assertRaisesOpError("Expected shape"):
-        table.insert(keys, values).run()
-
-      # Shape [3, 1] instead of [3, 2]
-      values = constant_op.constant([[0], [2], [4]], dtypes.int64)
-      with self.assertRaisesOpError("Expected shape"):
-        table.insert(keys, values).run()
-
-      # Valid Insert
-      values = constant_op.constant([[0, 1], [2, 3], [4, 5]], dtypes.int64)
-      table.insert(keys, values).run()
-      self.assertAllEqual(3, table.size().eval())
-
-  def testMutableHashTableInvalidDefaultValue(self):
-    with self.cached_session():
-      default_val = constant_op.constant([[-1, -1]], dtypes.int64)
-      table = lookup.MutableHashTable(dtypes.string, dtypes.int64,
-                                      default_val)
-      with self.assertRaisesOpError("Default value must be a vector"):
-        self.assertAllEqual(0, table.size().eval())
-
-  def testMutableHashTableDuplicateInsert(self):
-    with self.cached_session():
-      default_val = -1
-      keys = constant_op.constant(["brain", "salad", "surgery", "brain"])
-      values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
-      table = lookup.MutableHashTable(dtypes.string, dtypes.int64,
-                                      default_val)
-      self.assertAllEqual(0, table.size().eval())
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      input_string = constant_op.constant(["brain", "salad", "tank"])
-      output = table.lookup(input_string)
-
-      result = output.eval()
-      self.assertAllEqual([3, 1, -1], result)
-
-  def testMutableHashTableFindHighRank(self):
-    with self.cached_session():
-      default_val = -1
-      keys = constant_op.constant(["brain", "salad", "surgery"])
-      values = constant_op.constant([0, 1, 2], dtypes.int64)
-      table = lookup.MutableHashTable(dtypes.string, dtypes.int64,
-                                      default_val)
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      input_string = constant_op.constant(
-          [["brain", "salad"], ["tank", "tarkus"]])
-      output = table.lookup(input_string)
-      self.assertAllEqual([2, 2], output.get_shape())
-
-      result = output.eval()
-      self.assertAllEqual([[0, 1], [-1, -1]], result)
-
-  def testMutableHashTableInsertHighRank(self):
-    with self.cached_session():
-      default_val = -1
-      keys = constant_op.constant([["brain", "salad"], ["surgery", "tank"]])
-      values = constant_op.constant([[0, 1], [2, 3]], dtypes.int64)
-      table = lookup.MutableHashTable(dtypes.string, dtypes.int64,
-                                      default_val)
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(4, table.size().eval())
-
-      input_string = constant_op.constant(["brain", "salad", "tank", "tarkus"])
-      output = table.lookup(input_string)
-
-      result = output.eval()
-      self.assertAllEqual([0, 1, 3, -1], result)
-
-  def testMutableHashTableRemoveHighRank(self):
-    with self.test_session():
-      default_val = -1
-      keys = constant_op.constant([["brain", "salad"], ["surgery", "tank"]])
-      values = constant_op.constant([[0, 1], [2, 3]], dtypes.int64)
-      table = lookup.MutableHashTable(dtypes.string, dtypes.int64, default_val)
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(4, table.size().eval())
-
-      remove_string = constant_op.constant(["salad", "tarkus"])
-      table.remove(remove_string).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      input_string = constant_op.constant(["brain", "salad", "tank", "tarkus"])
-      output = table.lookup(input_string)
-
-      result = output.eval()
-      self.assertAllEqual([0, -1, 3, -1], result)
-
-  def testMutableHashTableOfTensorsFindHighRank(self):
-    with self.cached_session():
-      default_val = constant_op.constant([-1, -1, -1], dtypes.int64)
-      keys = constant_op.constant(["brain", "salad", "surgery"])
-      values = constant_op.constant([[0, 1, 2], [2, 3, 4], [4, 5, 6]],
-                                    dtypes.int64)
-      table = lookup.MutableHashTable(dtypes.string, dtypes.int64,
-                                      default_val)
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      input_string = constant_op.constant(
-          [["brain", "salad"], ["tank", "tarkus"]])
-      output = table.lookup(input_string)
-      self.assertAllEqual([2, 2, 3], output.get_shape())
-
-      result = output.eval()
-      self.assertAllEqual(
-          [[[0, 1, 2], [2, 3, 4]], [[-1, -1, -1], [-1, -1, -1]]], result)
-
-  def testMutableHashTableOfTensorsRemoveHighRank(self):
-    with self.test_session():
-      default_val = constant_op.constant([-1, -1, -1], dtypes.int64)
-      keys = constant_op.constant(["brain", "salad", "surgery"])
-      values = constant_op.constant([[0, 1, 2], [2, 3, 4], [4, 5, 6]],
-                                    dtypes.int64)
-      table = lookup.MutableHashTable(dtypes.string, dtypes.int64, default_val)
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      remove_string = constant_op.constant([["brain", "tank"]])
-      table.remove(remove_string).run()
-      self.assertAllEqual(2, table.size().eval())
-
-      input_string = constant_op.constant([["brain", "salad"],
-                                           ["surgery", "tank"]])
-      output = table.lookup(input_string)
-      self.assertAllEqual([2, 2, 3], output.get_shape())
-
-      result = output.eval()
-      self.assertAllEqual(
-          [[[-1, -1, -1], [2, 3, 4]], [[4, 5, 6], [-1, -1, -1]]], result)
-
-  def testMultipleMutableHashTables(self):
-    with self.cached_session() as sess:
-      default_val = -1
-      keys = constant_op.constant(["brain", "salad", "surgery"])
-      values = constant_op.constant([0, 1, 2], dtypes.int64)
-
-      table1 = lookup.MutableHashTable(dtypes.string, dtypes.int64,
-                                       default_val)
-      table2 = lookup.MutableHashTable(dtypes.string, dtypes.int64,
-                                       default_val)
-      table3 = lookup.MutableHashTable(dtypes.string, dtypes.int64,
-                                       default_val)
-      table1.insert(keys, values).run()
-      table2.insert(keys, values).run()
-      table3.insert(keys, values).run()
-
-      self.assertAllEqual(3, table1.size().eval())
-      self.assertAllEqual(3, table2.size().eval())
-      self.assertAllEqual(3, table3.size().eval())
-
-      input_string = constant_op.constant(["brain", "salad", "tank"])
-      output1 = table1.lookup(input_string)
-      output2 = table2.lookup(input_string)
-      output3 = table3.lookup(input_string)
-
-      out1, out2, out3 = sess.run([output1, output2, output3])
-      self.assertAllEqual([0, 1, -1], out1)
-      self.assertAllEqual([0, 1, -1], out2)
-      self.assertAllEqual([0, 1, -1], out3)
-
-  def testMutableHashTableWithTensorDefault(self):
-    with self.cached_session():
-      default_val = constant_op.constant(-1, dtypes.int64)
-      keys = constant_op.constant(["brain", "salad", "surgery"])
-      values = constant_op.constant([0, 1, 2], dtypes.int64)
-      table = lookup.MutableHashTable(dtypes.string, dtypes.int64,
-                                      default_val)
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      input_string = constant_op.constant(["brain", "salad", "tank"])
-      output = table.lookup(input_string)
-
-      result = output.eval()
-      self.assertAllEqual([0, 1, -1], result)
-
-  def testSignatureMismatch(self):
-    with self.cached_session():
-      default_val = -1
-      keys = constant_op.constant(["brain", "salad", "surgery"])
-      values = constant_op.constant([0, 1, 2], dtypes.int64)
-      table = lookup.MutableHashTable(dtypes.string, dtypes.int64,
-                                      default_val)
-
-      # insert with keys of the wrong type
-      with self.assertRaises(ValueError):
-        table.insert(constant_op.constant([4, 5, 6]), values).run()
-
-      # insert with values of the wrong type
-      with self.assertRaises(ValueError):
-        table.insert(keys, constant_op.constant(["a", "b", "c"])).run()
-
-      self.assertAllEqual(0, table.size().eval())
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      input_string_ref = variables.Variable("brain")
-      input_int64_ref = variables.Variable(-1, dtype=dtypes.int64)
-      variables.global_variables_initializer().run()
-
-      # Ref types do not produce an insert signature mismatch.
-      table.insert(input_string_ref, input_int64_ref).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      # Ref types do not produce a lookup signature mismatch.
-      self.assertEqual(-1, table.lookup(input_string_ref).eval())
-
-      # lookup with keys of the wrong type
-      input_string = constant_op.constant([1, 2, 3], dtypes.int64)
-      with self.assertRaises(ValueError):
-        table.lookup(input_string).eval()
-
-      # default value of the wrong type
-      with self.assertRaises(TypeError):
-        lookup.MutableHashTable(dtypes.string, dtypes.int64, "UNK")
-
-  def testMutableHashTableStringFloat(self):
-    with self.cached_session():
-      default_val = -1.5
-      keys = constant_op.constant(["brain", "salad", "surgery"])
-      values = constant_op.constant([0, 1.1, 2.2], dtypes.float32)
-      table = lookup.MutableHashTable(dtypes.string, dtypes.float32,
-                                      default_val)
-      self.assertAllEqual(0, table.size().eval())
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      input_string = constant_op.constant(["brain", "salad", "tank"])
-      output = table.lookup(input_string)
-
-      result = output.eval()
-      self.assertAllClose([0, 1.1, default_val], result)
-
-  def testMutableHashTableIntFloat(self):
-    with self.cached_session():
-      default_val = -1.0
-      keys = constant_op.constant([3, 7, 0], dtypes.int64)
-      values = constant_op.constant([7.5, -1.2, 9.9], dtypes.float32)
-      table = lookup.MutableHashTable(dtypes.int64, dtypes.float32,
-                                      default_val)
-      self.assertAllEqual(0, table.size().eval())
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      input_string = constant_op.constant([7, 0, 11], dtypes.int64)
-      output = table.lookup(input_string)
-
-      result = output.eval()
-      self.assertAllClose([-1.2, 9.9, default_val], result)
-
-  def testMutableHashTableInt64String(self):
-    with self.cached_session():
-      default_val = "n/a"
-      keys = constant_op.constant([0, 1, 2], dtypes.int64)
-      values = constant_op.constant(["brain", "salad", "surgery"])
-      table = lookup.MutableHashTable(dtypes.int64, dtypes.string,
-                                      default_val)
-      self.assertAllEqual(0, table.size().eval())
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      input_string = constant_op.constant([0, 1, 3], dtypes.int64)
-      output = table.lookup(input_string)
-
-      result = output.eval()
-      self.assertAllEqual((b"brain", b"salad", b"n/a"), result)
-
-
-class MutableDenseHashTableOpTest(test.TestCase):
-
-  def testBasic(self):
-    with self.cached_session():
-
-      keys = constant_op.constant([11, 12, 13, 14], dtypes.int64)
-      values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
-      table = lookup.MutableDenseHashTable(
-          dtypes.int64,
-          dtypes.int64,
-          default_value=-1,
-          empty_key=0,
-          deleted_key=-1)
-      self.assertAllEqual(0, table.size().eval())
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(4, table.size().eval())
-
-      remove_string = constant_op.constant([12, 15], dtypes.int64)
-      table.remove(remove_string).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      input_string = constant_op.constant([11, 12, 15], dtypes.int64)
-      output = table.lookup(input_string)
-      self.assertAllEqual([3], output.get_shape())
-
-      result = output.eval()
-      self.assertAllEqual([0, -1, -1], result)
-
-  def testBasicBool(self):
-    with self.cached_session():
-
-      keys = constant_op.constant([11, 12, 13, 14], dtypes.int64)
-      values = constant_op.constant([True, True, True, True], dtypes.bool)
-      table = lookup.MutableDenseHashTable(
-          dtypes.int64,
-          dtypes.bool,
-          default_value=False,
-          empty_key=0,
-          deleted_key=-1)
-      self.assertAllEqual(0, table.size().eval())
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(4, table.size().eval())
-
-      remove_string = constant_op.constant([11, 15], dtypes.int64)
-      table.remove(remove_string).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      input_string = constant_op.constant([11, 12, 15], dtypes.int64)
-      output = table.lookup(input_string)
-      self.assertAllEqual([3], output.get_shape())
-
-      result = output.eval()
-      self.assertAllEqual([False, True, False], result)
-
-  def testSameEmptyAndDeletedKey(self):
-    with self.cached_session():
-      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                                   "deleted_key"):
-        table = lookup.MutableDenseHashTable(
-            dtypes.int64,
-            dtypes.int64,
-            default_value=-1,
-            empty_key=42,
-            deleted_key=42)
-        self.assertAllEqual(0, table.size().eval())
-
-  def testLookupUnknownShape(self):
-    with self.cached_session():
-      keys = constant_op.constant([11, 12, 13], dtypes.int64)
-      values = constant_op.constant([0, 1, 2], dtypes.int64)
-      table = lookup.MutableDenseHashTable(
-          dtypes.int64,
-          dtypes.int64,
-          default_value=-1,
-          empty_key=0,
-          deleted_key=-1)
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      placeholder_keys = array_ops.placeholder(dtypes.int64)
-      output = table.lookup(placeholder_keys)
-      self.assertAllEqual(None, output.get_shape())
-      result = output.eval({placeholder_keys: [11, 12, 15]})
-      self.assertAllEqual([0, 1, -1], result)
-
-  def testMapStringToFloat(self):
-    with self.cached_session():
-
-      keys = constant_op.constant(["a", "b", "c", "d"], dtypes.string)
-      values = constant_op.constant([0.0, 1.1, 2.2, 3.3], dtypes.float32)
-      default_value = constant_op.constant(-1.5, dtypes.float32)
-      table = lookup.MutableDenseHashTable(
-          dtypes.string,
-          dtypes.float32,
-          default_value=default_value,
-          empty_key="",
-          deleted_key="$")
-      self.assertAllEqual(0, table.size().eval())
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(4, table.size().eval())
-
-      remove_string = constant_op.constant(["b", "e"])
-      table.remove(remove_string).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      input_string = constant_op.constant(["a", "b", "d", "e"], dtypes.string)
-      output = table.lookup(input_string)
-      self.assertAllEqual([4], output.get_shape())
-
-      result = output.eval()
-      self.assertAllClose([0, -1.5, 3.3, -1.5], result)
-
-  def testMapInt64ToFloat(self):
-    for float_dtype in [dtypes.float32, dtypes.float64]:
-      with self.cached_session():
-
-        keys = constant_op.constant([11, 12, 13, 14], dtypes.int64)
-        values = constant_op.constant([0.0, 1.1, 2.2, 3.3], float_dtype)
-        default_value = constant_op.constant(-1.5, float_dtype)
-        table = lookup.MutableDenseHashTable(
-            dtypes.int64,
-            float_dtype,
-            default_value=default_value,
-            empty_key=0,
-            deleted_key=-1)
-        self.assertAllEqual(0, table.size().eval())
-
-        table.insert(keys, values).run()
-        self.assertAllEqual(4, table.size().eval())
-
-        remove_string = constant_op.constant([12, 15], dtypes.int64)
-        table.remove(remove_string).run()
-        self.assertAllEqual(3, table.size().eval())
-
-        input_string = constant_op.constant([11, 12, 14, 15], dtypes.int64)
-        output = table.lookup(input_string)
-        self.assertAllEqual([4], output.get_shape())
-
-        result = output.eval()
-        self.assertAllClose([0, -1.5, 3.3, -1.5], result)
-
-  def testVectorValues(self):
-    with self.cached_session():
-      keys = constant_op.constant([11, 12, 13], dtypes.int64)
-      values = constant_op.constant([[0, 1, 2, 3], [3, 4, 5, 6], [6, 7, 8, 9]],
-                                    dtypes.int64)
-      default_value = constant_op.constant([-1, -2, -3, -4], dtypes.int64)
-      table = lookup.MutableDenseHashTable(
-          dtypes.int64,
-          dtypes.int64,
-          default_value=default_value,
-          empty_key=0,
-          deleted_key=-1,
-          initial_num_buckets=4)
-      self.assertAllEqual(0, table.size().eval())
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(3, table.size().eval())
-      self.assertAllEqual(4, len(table.export()[0].eval()))
-
-      table.insert(
-          constant_op.constant([14], dtypes.int64),
-          constant_op.constant([[2, 3, 4, 5]], dtypes.int64)).run()
-      self.assertAllEqual(4, table.size().eval())
-      self.assertAllEqual(8, len(table.export()[0].eval()))
-
-      remove_string = constant_op.constant([12, 16], dtypes.int64)
-      table.remove(remove_string).run()
-      self.assertAllEqual(3, table.size().eval())
-      self.assertAllEqual(8, len(table.export()[0].eval()))
-
-      input_string = constant_op.constant([11, 12, 14, 15], dtypes.int64)
-      output = table.lookup(input_string)
-      self.assertAllEqual([4, 4],
-                          output.shape,
-                          msg="Saw shape: %s" % output.shape)
-
-      result = output.eval()
-      self.assertAllEqual(
-          [[0, 1, 2, 3], [-1, -2, -3, -4], [2, 3, 4, 5], [-1, -2, -3, -4]],
-          result)
-
-  def testVectorKeys(self):
-    with self.cached_session():
-      keys = constant_op.constant([[0, 1], [1, 2], [1, 3]], dtypes.int64)
-      values = constant_op.constant([10, 11, 12], dtypes.int64)
-      empty_key = constant_op.constant([0, 3], dtypes.int64)
-      deleted_key = constant_op.constant([-1, -1], dtypes.int64)
-      default_value = constant_op.constant(-1, dtypes.int64)
-      table = lookup.MutableDenseHashTable(
-          dtypes.int64,
-          dtypes.int64,
-          default_value=default_value,
-          empty_key=empty_key,
-          deleted_key=deleted_key,
-          initial_num_buckets=8)
-      self.assertAllEqual(0, table.size().eval())
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      table.insert(
-          constant_op.constant([[0, 0]], dtypes.int64),
-          constant_op.constant([13], dtypes.int64)).run()
-      self.assertAllEqual(4, table.size().eval())
-      self.assertAllEqual(8, len(table.export()[0].eval()))
-
-      remove_string = constant_op.constant([[1, 2], [7, 8]], dtypes.int64)
-      table.remove(remove_string).run()
-      self.assertAllEqual(3, table.size().eval())
-      self.assertAllEqual(8, len(table.export()[0].eval()))
-
-      input_string = constant_op.constant([[0, 1], [1, 2], [1, 3], [0, 2]],
-                                          dtypes.int64)
-      output = table.lookup(input_string)
-      self.assertAllEqual([4], output.get_shape())
-
-      result = output.eval()
-      self.assertAllEqual([10, -1, 12, -1], result)
-
-  def testResize(self):
-    with self.cached_session():
-      keys = constant_op.constant([11, 12, 13], dtypes.int64)
-      values = constant_op.constant([0, 1, 2], dtypes.int64)
-      table = lookup.MutableDenseHashTable(
-          dtypes.int64,
-          dtypes.int64,
-          default_value=-1,
-          empty_key=0,
-          deleted_key=-1,
-          initial_num_buckets=4)
-      self.assertAllEqual(0, table.size().eval())
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(3, table.size().eval())
-      self.assertAllEqual(4, len(table.export()[0].eval()))
-
-      keys2 = constant_op.constant([12, 99], dtypes.int64)
-      table.remove(keys2).run()
-      self.assertAllEqual(2, table.size().eval())
-      self.assertAllEqual(4, len(table.export()[0].eval()))
-
-      keys3 = constant_op.constant([13, 14, 15, 16, 17], dtypes.int64)
-      values3 = constant_op.constant([3, 4, 5, 6, 7], dtypes.int64)
-
-      table.insert(keys3, values3).run()
-      self.assertAllEqual(6, table.size().eval())
-      self.assertAllEqual(16, len(table.export()[0].eval()))
-
-      keys4 = constant_op.constant([10, 11, 12, 13, 14, 15, 16, 17, 18],
-                                   dtypes.int64)
-      output = table.lookup(keys4)
-      self.assertAllEqual([-1, 0, -1, 3, 4, 5, 6, 7, -1], output.eval())
-
-  def testExport(self):
-    with self.cached_session():
-
-      keys = constant_op.constant([11, 12, 13, 14], dtypes.int64)
-      values = constant_op.constant([1, 2, 3, 4], dtypes.int64)
-      table = lookup.MutableDenseHashTable(
-          dtypes.int64,
-          dtypes.int64,
-          default_value=-1,
-          empty_key=100,
-          deleted_key=200,
-          initial_num_buckets=8)
-      self.assertAllEqual(0, table.size().eval())
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(4, table.size().eval())
-
-      keys2 = constant_op.constant([12, 15], dtypes.int64)
-      table.remove(keys2).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      exported_keys, exported_values = table.export()
-      self.assertAllEqual([None], exported_keys.get_shape().as_list())
-      self.assertAllEqual([None], exported_values.get_shape().as_list())
-
-      np_keys = exported_keys.eval()
-      np_values = exported_values.eval()
-
-      self.assertAllEqual(8, len(np_keys))
-      self.assertAllEqual(8, len(np_values))
-
-      # pair up keys and values, drop extra added dimension
-      pairs = np.dstack((np_keys.flatten(), np_values.flatten()))[0]
-      # sort by key
-      pairs = pairs[pairs[:, 0].argsort()]
-      self.assertAllEqual([[11, 1], [13, 3], [14, 4], [100, 0], [100, 0],
-                           [100, 0], [100, 0], [200, 2]], pairs)
-
-  def testSaveRestore(self):
-    save_dir = os.path.join(self.get_temp_dir(), "save_restore")
-    save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
-
-    with self.session(graph=ops.Graph()) as sess:
-      default_value = -1
-      empty_key = 0
-      deleted_key = -1
-      keys = constant_op.constant([11, 12, 13, 14], dtypes.int64)
-      values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
-      table = lookup.MutableDenseHashTable(
-          dtypes.int64,
-          dtypes.int64,
-          default_value=default_value,
-          empty_key=empty_key,
-          deleted_key=deleted_key,
-          name="t1",
-          checkpoint=True,
-          initial_num_buckets=32)
-
-      save = saver.Saver()
-
-      self.assertAllEqual(0, table.size().eval())
-      table.insert(keys, values).run()
-      self.assertAllEqual(4, table.size().eval())
-      self.assertAllEqual(32, len(table.export()[0].eval()))
-
-      keys2 = constant_op.constant([12, 15], dtypes.int64)
-      table.remove(keys2).run()
-      self.assertAllEqual(3, table.size().eval())
-      self.assertAllEqual(32, len(table.export()[0].eval()))
-
-      val = save.save(sess, save_path)
-      self.assertTrue(isinstance(val, six.string_types))
-      self.assertEqual(save_path, val)
-
-    with self.session(graph=ops.Graph()) as sess:
-      table = lookup.MutableDenseHashTable(
-          dtypes.int64,
-          dtypes.int64,
-          default_value=default_value,
-          empty_key=empty_key,
-          deleted_key=deleted_key,
-          name="t1",
-          checkpoint=True,
-          initial_num_buckets=64)
-      table.insert(
-          constant_op.constant([11, 14], dtypes.int64),
-          constant_op.constant([12, 24], dtypes.int64)).run()
-      self.assertAllEqual(2, table.size().eval())
-      self.assertAllEqual(64, len(table.export()[0].eval()))
-
-      save = saver.Saver()
-
-      # Restore the saved values in the parameter nodes.
-      save.restore(sess, save_path)
-
-      self.assertAllEqual(3, table.size().eval())
-      self.assertAllEqual(32, len(table.export()[0].eval()))
-
-      input_string = constant_op.constant([10, 11, 12, 13, 14], dtypes.int64)
-      output = table.lookup(input_string)
-      self.assertAllEqual([-1, 0, -1, 2, 3], output.eval())
-
-  @test_util.run_in_graph_and_eager_modes
-  def testObjectSaveRestore(self):
-    save_dir = os.path.join(self.get_temp_dir(), "save_restore")
-    save_prefix = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
-
-    default_value = -1
-    empty_key = 0
-    deleted_key = -1
-    keys = constant_op.constant([11, 12, 13], dtypes.int64)
-    values = constant_op.constant([0, 1, 2], dtypes.int64)
-    save_table = lookup.MutableDenseHashTable(
-        dtypes.int64,
-        dtypes.int64,
-        default_value=default_value,
-        empty_key=empty_key,
-        deleted_key=deleted_key,
-        name="t1",
-        checkpoint=True,
-        initial_num_buckets=32)
-
-    save_checkpoint = checkpointable.Checkpoint(table=save_table)
-
-    self.assertAllEqual(0, self.evaluate(save_table.size()))
-    self.evaluate(save_table.insert(keys, values))
-    self.assertAllEqual(3, self.evaluate(save_table.size()))
-    self.assertAllEqual(32, len(self.evaluate(save_table.export()[0])))
-
-    save_path = save_checkpoint.save(save_prefix)
-    del save_table, save_checkpoint
-
-    load_table = lookup.MutableDenseHashTable(
-        dtypes.int64,
-        dtypes.int64,
-        default_value=default_value,
-        empty_key=empty_key,
-        deleted_key=deleted_key,
-        name="t1",
-        checkpoint=True,
-        initial_num_buckets=64)
-    self.evaluate(load_table.insert(
-        constant_op.constant([11, 14], dtypes.int64),
-        constant_op.constant([12, 24], dtypes.int64)))
-    self.assertAllEqual(2, self.evaluate(load_table.size()))
-    self.assertAllEqual(64, len(self.evaluate(load_table.export()[0])))
-
-    restore_checkpoint = checkpointable.Checkpoint(table=load_table)
-
-    # Restore the saved values in the parameter nodes.
-    restore_checkpoint.restore(save_path).run_restore_ops()
-
-    self.assertAllEqual(3, self.evaluate(load_table.size()))
-    self.assertAllEqual(32, len(self.evaluate(load_table.export()[0])))
-
-    input_string = constant_op.constant([10, 11, 12, 13, 14], dtypes.int64)
-    output = load_table.lookup(input_string)
-    self.assertAllEqual([-1, 0, 1, 2, -1], self.evaluate(output))
-
-  def testVectorSaveRestore(self):
-    save_dir = os.path.join(self.get_temp_dir(), "vector_save_restore")
-    save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
-
-    with self.session(graph=ops.Graph()) as sess:
-      empty_key = constant_op.constant([11, 13], dtypes.int64)
-      deleted_key = constant_op.constant([-2, -3], dtypes.int64)
-      default_value = constant_op.constant([-1, -2], dtypes.int64)
-      keys = constant_op.constant([[11, 12], [11, 14], [12, 13], [13, 14]],
-                                  dtypes.int64)
-      values = constant_op.constant([[0, 1], [2, 3], [2, 4], [4, 5]],
-                                    dtypes.int64)
-      table = lookup.MutableDenseHashTable(
-          dtypes.int64,
-          dtypes.int64,
-          default_value=default_value,
-          empty_key=empty_key,
-          deleted_key=deleted_key,
-          name="t1",
-          checkpoint=True,
-          initial_num_buckets=32)
-
-      save = saver.Saver()
-
-      self.assertAllEqual(0, table.size().eval())
-      table.insert(keys, values).run()
-      self.assertAllEqual(4, table.size().eval())
-      self.assertAllEqual(32, len(table.export()[0].eval()))
-
-      keys2 = constant_op.constant([[12, 13], [16, 17]], dtypes.int64)
-      table.remove(keys2).run()
-      self.assertAllEqual(3, table.size().eval())
-      self.assertAllEqual(32, len(table.export()[0].eval()))
-
-      val = save.save(sess, save_path)
-      self.assertTrue(isinstance(val, six.string_types))
-      self.assertEqual(save_path, val)
-
-    with self.session(graph=ops.Graph()) as sess:
-      empty_key = constant_op.constant([11, 13], dtypes.int64)
-      deleted_key = constant_op.constant([-2, -3], dtypes.int64)
-      default_value = constant_op.constant([-1, -2], dtypes.int64)
-      table = lookup.MutableDenseHashTable(
-          dtypes.int64,
-          dtypes.int64,
-          default_value=default_value,
-          empty_key=empty_key,
-          deleted_key=deleted_key,
-          name="t1",
-          checkpoint=True,
-          initial_num_buckets=64)
-      table.insert(
-          constant_op.constant([[11, 12], [13, 15]], dtypes.int64),
-          constant_op.constant([[21, 22], [23, 24]], dtypes.int64)).run()
-      self.assertAllEqual(2, table.size().eval())
-      self.assertAllEqual(64, len(table.export()[0].eval()))
-
-      save = saver.Saver()
-
-      # Restore the saved values in the parameter nodes.
-      save.restore(sess, save_path)
-
-      self.assertAllEqual(3, table.size().eval())
-      self.assertAllEqual(32, len(table.export()[0].eval()))
-
-      input_string = constant_op.constant(
-          [[11, 12], [11, 14], [11, 15], [13, 14], [13, 15]], dtypes.int64)
-      output = table.lookup(input_string)
-      self.assertAllEqual([[0, 1], [2, 3], [-1, -2], [4, 5], [-1, -2]],
-                          output.eval())
-
-  def testVectorScalarSaveRestore(self):
-    save_dir = os.path.join(self.get_temp_dir(), "vector_scalar_save_restore")
-    save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
-
-    with self.session(graph=ops.Graph()) as sess:
-      empty_key = constant_op.constant([11, 13], dtypes.int64)
-      deleted_key = constant_op.constant([-1, -1], dtypes.int64)
-      default_value = constant_op.constant(-1, dtypes.int64)
-      keys = constant_op.constant([[11, 12], [11, 14], [12, 13], [13, 14]],
-                                  dtypes.int64)
-      values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
-      table = lookup.MutableDenseHashTable(
-          dtypes.int64,
-          dtypes.int64,
-          default_value=default_value,
-          empty_key=empty_key,
-          deleted_key=deleted_key,
-          name="t2",
-          checkpoint=True,
-          initial_num_buckets=32)
-
-      save = saver.Saver()
-
-      self.assertAllEqual(0, table.size().eval())
-      table.insert(keys, values).run()
-      self.assertAllEqual(4, table.size().eval())
-      self.assertAllEqual(32, len(table.export()[0].eval()))
-
-      keys2 = constant_op.constant([[12, 13], [15, 16]], dtypes.int64)
-      table.remove(keys2).run()
-      self.assertAllEqual(3, table.size().eval())
-      self.assertAllEqual(32, len(table.export()[0].eval()))
-
-      val = save.save(sess, save_path)
-      self.assertTrue(isinstance(val, six.string_types))
-      self.assertEqual(save_path, val)
-
-    with self.session(graph=ops.Graph()) as sess:
-      empty_key = constant_op.constant([11, 13], dtypes.int64)
-      deleted_key = constant_op.constant([-1, -1], dtypes.int64)
-      default_value = constant_op.constant(-1, dtypes.int64)
-      table = lookup.MutableDenseHashTable(
-          dtypes.int64,
-          dtypes.int64,
-          default_value=default_value,
-          empty_key=empty_key,
-          deleted_key=deleted_key,
-          name="t2",
-          checkpoint=True,
-          initial_num_buckets=64)
-      table.insert(
-          constant_op.constant([[11, 12], [13, 15]], dtypes.int64),
-          constant_op.constant([3, 4], dtypes.int64)).run()
-      self.assertAllEqual(2, table.size().eval())
-      self.assertAllEqual(64, len(table.export()[0].eval()))
-
-      save = saver.Saver()
-
-      # Restore the saved values in the parameter nodes.
-      save.restore(sess, save_path)
-
-      self.assertAllEqual(3, table.size().eval())
-      self.assertAllEqual(32, len(table.export()[0].eval()))
-
-      input_string = constant_op.constant(
-          [[11, 12], [11, 14], [11, 15], [13, 14], [13, 15]], dtypes.int64)
-      output = table.lookup(input_string)
-      self.assertAllEqual([0, 1, -1, 3, -1], output.eval())
-
-  def testReprobe(self):
-    with self.cached_session():
-      # Insert 6 keys into a table with 8 buckets.
-      # The values are chosen to make sure collisions occur when using GCC STL
-      keys = constant_op.constant([11, 12, 13, 19, 20, 21], dtypes.int64)
-      values = constant_op.constant([51, 52, 53, 54, 55, 56], dtypes.int64)
-      table = lookup.MutableDenseHashTable(
-          dtypes.int64,
-          dtypes.int64,
-          default_value=-1,
-          empty_key=0,
-          deleted_key=-1,
-          initial_num_buckets=8)
-      self.assertAllEqual(0, table.size().eval())
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(6, table.size().eval())
-
-      input_string = constant_op.constant([10, 11, 12, 13, 14, 19, 20, 21, 22],
-                                          dtypes.int64)
-      output = table.lookup(input_string)
-      self.assertAllEqual([9], output.get_shape())
-
-      result = output.eval()
-      self.assertAllEqual([-1, 51, 52, 53, -1, 54, 55, 56, -1], result)
-
-  def testCustomEmptyKey(self):
-    with self.cached_session():
-      keys = constant_op.constant([11, 0, 13], dtypes.int64)
-      values = constant_op.constant([0, 1, 2], dtypes.int64)
-      table = lookup.MutableDenseHashTable(
-          dtypes.int64,
-          dtypes.int64,
-          default_value=-1,
-          empty_key=12,
-          deleted_key=-1)
-      self.assertAllEqual(0, table.size().eval())
-
-      table.insert(keys, values).run()
-      self.assertAllEqual(3, table.size().eval())
-
-      input_string = constant_op.constant([11, 0, 15], dtypes.int64)
-      output = table.lookup(input_string)
-      self.assertAllEqual([3], output.get_shape())
-
-      result = output.eval()
-      self.assertAllEqual([0, 1, -1], result)
-
-  def testErrors(self):
-    with self.cached_session():
-      table = lookup.MutableDenseHashTable(
-          dtypes.int64,
-          dtypes.int64,
-          default_value=-1,
-          empty_key=0,
-          deleted_key=-1)
-
-      # Inserting the empty key returns an error
-      keys1 = constant_op.constant([11, 0], dtypes.int64)
-      values1 = constant_op.constant([0, 1], dtypes.int64)
-      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                                   "empty_key"):
-        table.insert(keys1, values1).run()
-
-      # Looking up the empty key returns an error
-      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                                   "empty_key"):
-        table.lookup(keys1).eval()
-
-      # Inserting the deleted key returns an error
-      keys2 = constant_op.constant([11, -1], dtypes.int64)
-      values2 = constant_op.constant([0, 1], dtypes.int64)
-      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                                   "deleted_key"):
-        table.insert(keys2, values2).run()
-
-      # Looking up the empty key returns an error
-      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                                   "deleted_key"):
-        table.lookup(keys2).eval()
-
-      # Arbitrary tensors of keys are not supported
-      keys = constant_op.constant([[11, 0], [12, 1]], dtypes.int64)
-      values = constant_op.constant([[11, 0], [12, 1]], dtypes.int64)
-      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                                   "Expected key shape"):
-        table.lookup(keys).eval()
-      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                                   "Expected key shape"):
-        table.insert(keys, values).run()
-
-      table2 = lookup.MutableDenseHashTable(
-          dtypes.int64,
-          dtypes.int64,
-          default_value=-1,
-          empty_key=17,
-          deleted_key=-1,
-          initial_num_buckets=12)
-      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                                   "Number of buckets must be"):
-        self.assertAllEqual(0, table2.size().eval())
-
-      with self.assertRaisesRegexp(
-          errors_impl.InvalidArgumentError,
-          "Empty and deleted keys must have same shape"):
-        table3 = lookup.MutableDenseHashTable(
-            dtypes.int64,
-            dtypes.int64,
-            default_value=-1,
-            empty_key=42,
-            deleted_key=[1, 2])
-        self.assertAllEqual(0, table3.size().eval())
-
-      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                                   "Empty and deleted keys cannot be equal"):
-        table4 = lookup.MutableDenseHashTable(
-            dtypes.int64,
-            dtypes.int64,
-            default_value=-1,
-            empty_key=42,
-            deleted_key=42)
-        self.assertAllEqual(0, table4.size().eval())
-
-      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                                   "Empty and deleted keys cannot be equal"):
-        table5 = lookup.MutableDenseHashTable(
-            dtypes.int64,
-            dtypes.int64,
-            default_value=-1,
-            empty_key=[1, 2, 3],
-            deleted_key=[1, 2, 3])
-        self.assertAllEqual(0, table5.size().eval())
-
-
 class IndexTableFromFile(test.TestCase):
 
   def _createVocabFile(self, basename, values=("brain", "salad", "surgery")):
@@ -2721,64 +1481,6 @@ class IdTableWithHashBucketsTest(test.TestCase):
             hasher_spec=lookup.StrongHashSpec([None, 2]))
 
 
-class MutableHashTableBenchmark(test.Benchmark):
-
-  def _create_table(self):
-    return lookup.MutableHashTable(dtypes.int64, dtypes.float32, 0.0)
-
-  def benchmark_single_repeated_scalar_insert_scalar(self):
-    table = self._create_table()
-    value = variables.Variable(1.0)
-    insert = table.insert(0, value)
-    size = table.size()
-    with session.Session() as sess:
-      sess.run(value.initializer)
-      self.run_op_benchmark(sess, insert, burn_iters=10, min_iters=10000)
-      assert sess.run(size) == 1
-
-  def benchmark_many_repeated_scalar_insert_scalar(self):
-    table = self._create_table()
-    c = dataset_ops.make_one_shot_iterator(counter.Counter()).get_next()
-    value = variables.Variable(1.0)
-    insert = table.insert(c, value)
-    size = table.size()
-    with session.Session() as sess:
-      sess.run(value.initializer)
-      self.run_op_benchmark(sess, insert, burn_iters=10, min_iters=10000)
-      assert sess.run(size) >= 10000
-
-  def benchmark_single_repeated_batch_32_insert_scalar(self):
-    table = self._create_table()
-    value = variables.Variable([1.0] * 32)
-    insert = table.insert(list(range(32)), value)
-    size = table.size()
-    with session.Session() as sess:
-      sess.run(value.initializer)
-      self.run_op_benchmark(sess, insert, burn_iters=10, min_iters=1000)
-      assert sess.run(size) == 32
-
-  def benchmark_many_repeated_batch_32_insert_scalar(self):
-    table = self._create_table()
-    c = dataset_ops.make_one_shot_iterator(counter.Counter()).get_next()
-    value = variables.Variable([1.0] * 32)
-    insert = table.insert(32 * c + list(range(32)), value)
-    size = table.size()
-    with session.Session() as sess:
-      sess.run(value.initializer)
-      self.run_op_benchmark(sess, insert, burn_iters=10, min_iters=1000)
-      assert sess.run(size) >= 1000*32
-
-
-class MutableDenseHashTableBenchmark(MutableHashTableBenchmark):
-
-  def _create_table(self):
-    return lookup.MutableDenseHashTable(
-        dtypes.int64,
-        dtypes.float32,
-        default_value=0.0,
-        empty_key=-1,
-        deleted_key=-2)
-
-
 if __name__ == "__main__":
   test.main()
+
diff --git a/tensorflow/contrib/losses/BUILD b/tensorflow/contrib/losses/BUILD
index 728f75f8ef1eb3b107dbd0ab4ffbecd63787bf3e..f4ebbdeee883ddeef0d47cb561901c16e2195bb2 100644
--- a/tensorflow/contrib/losses/BUILD
+++ b/tensorflow/contrib/losses/BUILD
@@ -82,10 +82,11 @@ py_library(
 
 py_test(
     name = "metric_loss_ops_test",
-    size = "large",
+    size = "medium",
     srcs = [
         "python/metric_learning/metric_loss_ops_test.py",
     ],
+    shard_count = 4,
     srcs_version = "PY2AND3",
     deps = [
         ":metric_learning_py",
diff --git a/tensorflow/contrib/losses/python/losses/loss_ops.py b/tensorflow/contrib/losses/python/losses/loss_ops.py
index 709a042bbcefb89125f7e4cd14a0d7ecd2b53281..5ebdd0b8b50063c99e6b747c594eb99c306b4efb 100644
--- a/tensorflow/contrib/losses/python/losses/loss_ops.py
+++ b/tensorflow/contrib/losses/python/losses/loss_ops.py
@@ -511,7 +511,7 @@ def mean_squared_error(predictions, labels=None, weights=1.0, scope=None):
     predictions.get_shape().assert_is_compatible_with(labels.get_shape())
     predictions = math_ops.to_float(predictions)
     labels = math_ops.to_float(labels)
-    losses = math_ops.square(math_ops.subtract(predictions, labels))
+    losses = math_ops.squared_difference(predictions, labels)
     return compute_weighted_loss(losses, weights, scope=scope)
 
 
diff --git a/tensorflow/contrib/losses/python/metric_learning/metric_loss_ops.py b/tensorflow/contrib/losses/python/metric_learning/metric_loss_ops.py
index de76acb51ffe985162a66c617b266f47c5216b19..f3b0e77740ff1d940fcd6d00b3482e90f6ebf952 100644
--- a/tensorflow/contrib/losses/python/metric_learning/metric_loss_ops.py
+++ b/tensorflow/contrib/losses/python/metric_learning/metric_loss_ops.py
@@ -105,7 +105,8 @@ def contrastive_loss(labels, embeddings_anchor, embeddings_positive,
   # Get per pair distances
   distances = math_ops.sqrt(
       math_ops.reduce_sum(
-          math_ops.square(embeddings_anchor - embeddings_positive), 1))
+          math_ops.squared_difference(embeddings_anchor, embeddings_positive),
+          1))
 
   # Add contrastive loss for the siamese network.
   #   label here is {0,1} for neg, pos.
diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh
index 2a5232b476712a96f84be0f4725beb78bc138297..af3c541dc214c30e9e59fdcca995ffc53b028df4 100755
--- a/tensorflow/contrib/makefile/download_dependencies.sh
+++ b/tensorflow/contrib/makefile/download_dependencies.sh
@@ -142,5 +142,6 @@ replace_by_sed 's#static uint64x2_t p2ul_CONJ_XOR = vld1q_u64( p2ul_conj_XOR_DAT
 # TODO(satok): Remove this once protobuf/autogen.sh is fixed.
 replace_by_sed 's#https://googlemock.googlecode.com/files/gmock-1.7.0.zip#http://download.tensorflow.org/deps/gmock-1.7.0.zip#' \
   "${DOWNLOADS_DIR}/protobuf/autogen.sh"
+cat "third_party/eigen3/gebp_neon.patch" | patch "${DOWNLOADS_DIR}/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h"
 
 echo "download_dependencies.sh completed successfully." >&2
diff --git a/tensorflow/contrib/makefile/proto_text_cc_files.txt b/tensorflow/contrib/makefile/proto_text_cc_files.txt
index 9ea94c74330e3e49414a6a84cd5bc0db3778114a..0a0ba36232075460b561bc54a95fc24973017571 100644
--- a/tensorflow/contrib/makefile/proto_text_cc_files.txt
+++ b/tensorflow/contrib/makefile/proto_text_cc_files.txt
@@ -40,7 +40,6 @@ tensorflow/core/lib/wav/wav_io.cc
 tensorflow/core/platform/cpu_info.cc
 tensorflow/core/platform/default/logging.cc
 tensorflow/core/platform/default/mutex.cc
-tensorflow/core/platform/default/protobuf.cc
 tensorflow/core/platform/default/tracing.cc
 tensorflow/core/platform/denormal.cc
 tensorflow/core/platform/env.cc
@@ -53,6 +52,7 @@ tensorflow/core/platform/posix/error.cc
 tensorflow/core/platform/posix/load_library.cc
 tensorflow/core/platform/posix/port.cc
 tensorflow/core/platform/posix/posix_file_system.cc
+tensorflow/core/platform/protobuf.cc
 tensorflow/core/platform/protobuf_util.cc
 tensorflow/core/platform/setround.cc
 tensorflow/core/platform/tensor_coding.cc
diff --git a/tensorflow/contrib/makefile/proto_text_pb_cc_files.txt b/tensorflow/contrib/makefile/proto_text_pb_cc_files.txt
index 87c73ec1ca610cac6d63468887bc350bada5910b..1c1460ce77c99d29785c7e8b8a8e9f770a45b59f 100644
--- a/tensorflow/contrib/makefile/proto_text_pb_cc_files.txt
+++ b/tensorflow/contrib/makefile/proto_text_pb_cc_files.txt
@@ -25,6 +25,7 @@ tensorflow/core/framework/variable.pb.cc
 tensorflow/core/framework/versions.pb.cc
 tensorflow/core/grappler/costs/op_performance_data.pb.cc
 tensorflow/core/lib/core/error_codes.pb.cc
+tensorflow/core/protobuf/trackable_object_graph.pb.cc
 tensorflow/core/protobuf/cluster.pb.cc
 tensorflow/core/protobuf/config.pb.cc
 tensorflow/core/protobuf/eager_service.pb.cc
@@ -34,8 +35,11 @@ tensorflow/core/protobuf/meta_graph.pb.cc
 tensorflow/core/protobuf/named_tensor.pb.cc
 tensorflow/core/protobuf/queue_runner.pb.cc
 tensorflow/core/protobuf/rewriter_config.pb.cc
+tensorflow/core/protobuf/saved_object_graph.pb.cc
 tensorflow/core/protobuf/saver.pb.cc
+tensorflow/core/protobuf/struct.pb.cc
 tensorflow/core/protobuf/tensorflow_server.pb.cc
+tensorflow/core/protobuf/verifier_config.pb.cc
 tensorflow/core/util/event.pb.cc
 tensorflow/core/util/memmapped_file_system.pb.cc
 tensorflow/core/util/saved_tensor_slice.pb.cc
diff --git a/tensorflow/contrib/makefile/proto_text_pb_h_files.txt b/tensorflow/contrib/makefile/proto_text_pb_h_files.txt
index 4120ea52ec5255b1efce7a6ce6890fc79c1e4831..5def632e8a7b65272a1339bdacd92c1fa23012d2 100644
--- a/tensorflow/contrib/makefile/proto_text_pb_h_files.txt
+++ b/tensorflow/contrib/makefile/proto_text_pb_h_files.txt
@@ -25,6 +25,7 @@ tensorflow/core/framework/variable.pb.h
 tensorflow/core/framework/versions.pb.h
 tensorflow/core/grappler/costs/op_performance_data.pb.h
 tensorflow/core/lib/core/error_codes.pb.h
+tensorflow/core/protobuf/trackable_object_graph.pb.h
 tensorflow/core/protobuf/cluster.pb.h
 tensorflow/core/protobuf/config.pb.h
 tensorflow/core/protobuf/debug.pb.h
@@ -34,9 +35,12 @@ tensorflow/core/protobuf/meta_graph.pb.h
 tensorflow/core/protobuf/named_tensor.pb.h
 tensorflow/core/protobuf/queue_runner.pb.h
 tensorflow/core/protobuf/rewriter_config.pb.h
+tensorflow/core/protobuf/saved_object_graph.pb.h
 tensorflow/core/protobuf/saver.pb.h
+tensorflow/core/protobuf/struct.pb.h
 tensorflow/core/protobuf/tensor_bundle.pb.h
 tensorflow/core/protobuf/tensorflow_server.pb.h
+tensorflow/core/protobuf/verifier_config.pb.h
 tensorflow/core/util/event.pb.h
 tensorflow/core/util/memmapped_file_system.pb.h
 tensorflow/core/util/saved_tensor_slice.pb.h
diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt
index 655c7eefcb978d40c8bc16a23685e03ed71bfb63..2cd7d6d519a55423a96526b541845392d9ec6bc2 100644
--- a/tensorflow/contrib/makefile/tf_op_files.txt
+++ b/tensorflow/contrib/makefile/tf_op_files.txt
@@ -119,6 +119,7 @@ tensorflow/core/kernels/fake_quant_ops.cc
 tensorflow/core/kernels/fifo_queue.cc
 tensorflow/core/kernels/fifo_queue_op.cc
 tensorflow/core/kernels/fill_functor.cc
+tensorflow/core/kernels/fft_ops.cc
 tensorflow/core/kernels/function_ops.cc
 tensorflow/core/kernels/fused_batch_norm_op.cc
 tensorflow/core/kernels/gather_functor.cc
diff --git a/tensorflow/contrib/makefile/tf_pb_text_files.txt b/tensorflow/contrib/makefile/tf_pb_text_files.txt
index f94d70db9046cec43073ab1406762aea1f28c8e3..13e3b6422d1989b0d499d8d20901d919554c630e 100644
--- a/tensorflow/contrib/makefile/tf_pb_text_files.txt
+++ b/tensorflow/contrib/makefile/tf_pb_text_files.txt
@@ -29,5 +29,6 @@ tensorflow/core/protobuf/debug.pb_text.cc
 tensorflow/core/protobuf/rewriter_config.pb_text.cc
 tensorflow/core/protobuf/saver.pb_text.cc
 tensorflow/core/protobuf/tensor_bundle.pb_text.cc
+tensorflow/core/protobuf/verifier_config.pb_text.cc
 tensorflow/core/util/memmapped_file_system.pb_text.cc
 tensorflow/core/util/saved_tensor_slice.pb_text.cc
diff --git a/tensorflow/contrib/makefile/tf_proto_files.txt b/tensorflow/contrib/makefile/tf_proto_files.txt
index 2712e906d719e72dacb60e213205ad68895f905f..deb6a5b94020a02b878bdd68a33b3737a97fcf2b 100644
--- a/tensorflow/contrib/makefile/tf_proto_files.txt
+++ b/tensorflow/contrib/makefile/tf_proto_files.txt
@@ -31,6 +31,7 @@ tensorflow/core/framework/versions.proto
 tensorflow/core/grappler/costs/op_performance_data.proto
 tensorflow/core/kernels/boosted_trees/boosted_trees.proto
 tensorflow/core/lib/core/error_codes.proto
+tensorflow/core/protobuf/trackable_object_graph.proto
 tensorflow/core/protobuf/cluster.proto
 tensorflow/core/protobuf/config.proto
 tensorflow/core/protobuf/debug.proto
@@ -40,9 +41,12 @@ tensorflow/core/protobuf/meta_graph.proto
 tensorflow/core/protobuf/named_tensor.proto
 tensorflow/core/protobuf/queue_runner.proto
 tensorflow/core/protobuf/rewriter_config.proto
+tensorflow/core/protobuf/saved_object_graph.proto
 tensorflow/core/protobuf/saver.proto
+tensorflow/core/protobuf/struct.proto
 tensorflow/core/protobuf/tensor_bundle.proto
 tensorflow/core/protobuf/tensorflow_server.proto
+tensorflow/core/protobuf/verifier_config.proto
 tensorflow/core/util/event.proto
 tensorflow/core/util/memmapped_file_system.proto
 tensorflow/core/util/saved_tensor_slice.proto
diff --git a/tensorflow/contrib/memory_stats/BUILD b/tensorflow/contrib/memory_stats/BUILD
index 63843b993c16363a80b64622af665aaa64e05830..93701249cc8bf722c8c8558e91e0b700ca1c4a04 100644
--- a/tensorflow/contrib/memory_stats/BUILD
+++ b/tensorflow/contrib/memory_stats/BUILD
@@ -10,6 +10,7 @@ package(default_visibility = ["//tensorflow:__subpackages__"])
 load("//tensorflow:tensorflow.bzl", "tf_custom_op_library")
 load("//tensorflow:tensorflow.bzl", "tf_gen_op_libs")
 load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
+load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_cc")
 load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
@@ -45,6 +46,28 @@ tf_gen_op_wrapper_py(
     deps = [":memory_stats_ops_op_lib"],
 )
 
+tf_gen_op_wrapper_cc(
+    name = "memory_stats_ops",
+    out_ops_file = "memory_stats_ops",
+)
+
+cc_library(
+    name = "memory_stats_cc",
+    srcs = ["memory_stats_ops.cc"],
+    hdrs = ["memory_stats_ops.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":memory_stats_kernels",
+        ":memory_stats_ops_op_lib",
+        "//tensorflow/cc:const_op",
+        "//tensorflow/cc:ops",
+        "//tensorflow/cc:scope",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+    alwayslink = 1,
+)
+
 tf_custom_op_py_library(
     name = "memory_stats_py",
     srcs = [
diff --git a/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc b/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc
index 974fb537499c5ea4591a0a128f53d2dea67b9e57..7ae1dbeaa2d04d7846e7fada117f3941319cc1c1 100644
--- a/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc
+++ b/tensorflow/contrib/memory_stats/kernels/memory_stats_ops.cc
@@ -24,13 +24,15 @@ class MemoryStatsOp : public OpKernel {
   void Compute(OpKernelContext* context) override {
     Allocator* allocator =
         context->device()->GetAllocator(AllocatorAttributes());
-    AllocatorStats allocator_stats;
-    allocator->GetStats(&allocator_stats);
+    absl::optional<AllocatorStats> allocator_stats = allocator->GetStats();
+    if (!allocator_stats) {
+      *allocator_stats = AllocatorStats();
+    }
 
     Tensor* output_tensor = nullptr;
     OP_REQUIRES_OK(
         context, context->allocate_output(0, TensorShape({}), &output_tensor));
-    output_tensor->scalar<int64>()() = ExtractAllocatorStats(allocator_stats);
+    output_tensor->scalar<int64>()() = ExtractAllocatorStats(*allocator_stats);
   }
 
  protected:
@@ -71,7 +73,7 @@ class BytesLimitOp : public MemoryStatsOp {
  private:
   int64 ExtractAllocatorStats(
       const AllocatorStats& allocator_stats) const override {
-    return allocator_stats.bytes_limit;
+    return allocator_stats.bytes_limit ? *allocator_stats.bytes_limit : -1;
   }
 };
 
@@ -93,7 +95,7 @@ class MaxBytesInUseOp : public MemoryStatsOp {
  private:
   int64 ExtractAllocatorStats(
       const AllocatorStats& allocator_stats) const override {
-    return allocator_stats.max_bytes_in_use;
+    return allocator_stats.peak_bytes_in_use;
   }
 };
 
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py
index 7b432f8bd20989c6d95310bcaca88d44ce3e0d1f..ece246b7c28569a551f7733daf16ee1507f9c95d 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@@ -1356,9 +1356,8 @@ def _compute_placement_auc(labels, predictions, weights, alpha,
           weights_0 * math_ops.square(1. - placement_values_0 - auc_0)) /
       (total_0 - 1. + _EPSILON))
   var_1 = (
-      math_ops.reduce_sum(
-          weights_1 * math_ops.square(placement_values_1 - auc_1)) /
-      (total_1 - 1. + _EPSILON))
+      math_ops.reduce_sum(weights_1 * math_ops.squared_difference(
+          placement_values_1, auc_1)) / (total_1 - 1. + _EPSILON))
   auc_std_err = math_ops.sqrt(
       (var_0 / (total_0 + _EPSILON)) + (var_1 / (total_1 + _EPSILON)))
 
diff --git a/tensorflow/contrib/model_pruning/README.md b/tensorflow/contrib/model_pruning/README.md
index 45a60d79482787df4564ae3360f8252af93c7a26..710a262f33872ada8d090d796f80dc06c2a27f84 100644
--- a/tensorflow/contrib/model_pruning/README.md
+++ b/tensorflow/contrib/model_pruning/README.md
@@ -53,7 +53,6 @@ The pruning library allows for specification of the following hyper parameters:
 | weight_sparsity_map | list of strings | [""] | list of weight variable name (or layer name):target sparsity pairs. Eg. [conv1:0.9,conv2/kernel:0.8]. For layers/weights not in this list, sparsity as specified by the target_sparsity hyperparameter is used. |
 | threshold_decay | float | 0.0 | The decay factor to use for exponential decay of the thresholds |
 | pruning_frequency | integer | 10 | How often should the masks be updated? (in # of global_steps) |
-| nbins | integer | 256 | Number of bins to use for histogram computation. Note: When running on TPUs, a large (>1024) value for `nbins` may adversely affect the training time. |
 | block_height|integer | 1 | Number of rows in a block for block sparse matrices|
 | block_width |integer | 1 | Number of cols in a block for block sparse matrices|
 | block_pooling_function| string | AVG | The function to use to pool weight values in a block: average (AVG) or max (MAX)|
diff --git a/tensorflow/contrib/model_pruning/python/pruning.py b/tensorflow/contrib/model_pruning/python/pruning.py
index f6b4373edd0544555dd16a373802d2feb5d674b1..9966f7cf798d206fffbaeb4d16b6500a90d113e4 100644
--- a/tensorflow/contrib/model_pruning/python/pruning.py
+++ b/tensorflow/contrib/model_pruning/python/pruning.py
@@ -214,7 +214,7 @@ def get_pruning_hparams():
       target_sparsity=0.5,
       sparsity_function_begin_step=0,
       sparsity_function_end_step=100,
-      sparsity_function_exponent=3,
+      sparsity_function_exponent=3.0,
       use_tpu=False)
 
 
@@ -397,28 +397,26 @@ class Pruning(object):
       raise ValueError('Sparsity variable undefined')
 
     sparsity = self._get_sparsity(weights.op.name)
-
     with ops.name_scope(weights.op.name + '_pruning_ops'):
       abs_weights = math_ops.abs(weights)
-      max_value = math_ops.reduce_max(abs_weights)
-      cdf_fn = pruning_utils.compute_cdf_from_histogram
-      if self._spec.use_tpu:
-        cdf_fn = pruning_utils.compute_cdf
-
-      norm_cdf = cdf_fn(abs_weights, [0.0, max_value], nbins=self._spec.nbins)
-      current_threshold = math_ops.multiply(
-          math_ops.div(
-              math_ops.reduce_sum(
-                  math_ops.cast(
-                      math_ops.less(norm_cdf, sparsity), dtypes.float32)),
-              float(self._spec.nbins)), max_value)
-
+      k = math_ops.cast(
+          math_ops.round(
+              math_ops.cast(array_ops.size(abs_weights), dtypes.float32) *
+              (1 - sparsity)), dtypes.int32)
+      # Sort the entire array
+      values, _ = nn_ops.top_k(
+          array_ops.reshape(abs_weights, [-1]), k=array_ops.size(abs_weights))
+      # Grab the (k-1) th value
+      current_threshold = array_ops.gather(values, k - 1)
       smoothed_threshold = math_ops.add_n([
           math_ops.multiply(current_threshold, 1 - self._spec.threshold_decay),
           math_ops.multiply(threshold, self._spec.threshold_decay)
       ])
+
       new_mask = math_ops.cast(
-          math_ops.greater(abs_weights, smoothed_threshold), dtypes.float32)
+          math_ops.greater_equal(abs_weights, smoothed_threshold),
+          dtypes.float32)
+
     return smoothed_threshold, new_mask
 
   def _maybe_update_block_mask(self, weights, threshold):
diff --git a/tensorflow/contrib/model_pruning/python/pruning_test.py b/tensorflow/contrib/model_pruning/python/pruning_test.py
index 1b6da5ce2b4ebb3ea3b204c4ed12bed8db951447..835614d8822147dadb029107ae0e917cc955eef0 100644
--- a/tensorflow/contrib/model_pruning/python/pruning_test.py
+++ b/tensorflow/contrib/model_pruning/python/pruning_test.py
@@ -102,7 +102,7 @@ class PruningTest(test.TestCase):
       weights = variables.VariableV1(
           math_ops.linspace(1.0, 100.0, 100), name="weights")
       masked_weights = pruning.apply_mask(weights)
-      sparsity = variables.VariableV1(0.5, name="sparsity")
+      sparsity = variables.VariableV1(0.95, name="sparsity")
       p = pruning.Pruning(sparsity=sparsity)
       p._spec.threshold_decay = 0.0
       mask_update_op = p.mask_update_op()
@@ -111,7 +111,7 @@ class PruningTest(test.TestCase):
       self.assertAllEqual(np.count_nonzero(masked_weights_val), 100)
       session.run(mask_update_op)
       masked_weights_val = masked_weights.eval()
-      self.assertAllEqual(np.count_nonzero(masked_weights_val), 50)
+      self.assertAllEqual(np.count_nonzero(masked_weights_val), 5)
 
   def _blockMasking(self, hparams, weights, expected_mask):
 
diff --git a/tensorflow/contrib/model_pruning/python/pruning_utils.py b/tensorflow/contrib/model_pruning/python/pruning_utils.py
index 14fc51229ab53a77e8089040e8a8576babd0fafd..8f2ba036469bd02328a831a3d1de2ffbd10f5004 100644
--- a/tensorflow/contrib/model_pruning/python/pruning_utils.py
+++ b/tensorflow/contrib/model_pruning/python/pruning_utils.py
@@ -25,16 +25,12 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 
-_NBINS = 256
-
 
 def weight_mask_variable(var, scope):
   """Create a mask for the weights.
@@ -165,128 +161,6 @@ def expand_tensor(tensor, block_dims):
   return expanded_tensor
 
 
-def _histogram(values, value_range, nbins=100, dtype=dtypes.int32, name=None):
-  """Return histogram of values.
-
-  Given the tensor `values`, this operation returns a rank 1 histogram counting
-  the number of entries in `values` that fell into every bin.  The bins are
-  equal width and determined by the arguments `value_range` and `nbins`.
-
-  Args:
-    values:  Numeric `Tensor`.
-    value_range:  Shape [2] `Tensor` of same `dtype` as `values`.
-      values <= value_range[0] will be mapped to hist[0],
-      values >= value_range[1] will be mapped to hist[-1].
-    nbins:  Scalar `int32 Tensor`.  Number of histogram bins.
-    dtype:  dtype for returned histogram.
-    name:  A name for this operation (defaults to 'histogram').
-
-  Returns:
-    A 1-D `Tensor` holding histogram of values.
-
-  """
-  with ops.name_scope(name, 'histogram', [values, value_range, nbins]) as scope:
-    values = ops.convert_to_tensor(values, name='values')
-    values = array_ops.reshape(values, [-1])
-    nbins_float = np.float32(nbins)
-
-    # Map tensor values that fall within value_range to [0, 1].
-    scaled_values = math_ops.truediv(
-        values - value_range[0],
-        value_range[1] - value_range[0],
-        name='scaled_values')
-
-    # map tensor values within the open interval value_range to {0,.., nbins-1},
-    # values outside the open interval will be zero or less, or nbins or more.
-    indices = math_ops.floor(nbins_float * scaled_values, name='indices')
-
-    # Clip edge cases (e.g. value = value_range[1]) or "outliers."
-    indices = math_ops.cast(
-        clip_ops.clip_by_value(indices, 0, nbins_float - 1), dtypes.int32)
-
-    return math_ops.unsorted_segment_sum(
-        array_ops.ones_like(indices, dtype=dtype), indices, nbins, name=scope)
-
-
-def compute_cdf_from_histogram(values, value_range, **kwargs):
-  """Returns the normalized cumulative distribution of the given values tensor.
-
-  Computes the histogram and uses tf.cumsum to arrive at cdf
-
-  Args:
-    values:  Numeric `Tensor`.
-    value_range:  Shape [2] `Tensor` of same `dtype` as `values`.
-    **kwargs: keyword arguments: nbins, name
-
-  Returns:
-    A 1-D `Tensor` holding normalized cdf of values.
-
-  """
-  nbins = kwargs.get('nbins', _NBINS)
-  name = kwargs.get('name', None)
-  with ops.name_scope(name, 'cdf', [values, value_range, nbins]):
-    histogram = _histogram(
-        values, value_range, dtype=dtypes.float32, nbins=nbins)
-    cdf = math_ops.cumsum(histogram)
-    return math_ops.div(cdf, math_ops.reduce_max(cdf))
-
-
-def compute_cdf(values, value_range, **kwargs):
-  """Returns the normalized cumulative distribution of the given values tensor.
-
-  Uses tf.while_loop to directly compute the cdf of the values.
-
-  Args:
-    values:  Numeric `Tensor`.
-    value_range:  Shape [2] `Tensor` of same `dtype` as `values`
-    **kwargs: keyword arguments: nbins, name
-
-  Returns:
-    A 1-D `Tensor` holding normalized cdf of values.
-
-  """
-  nbins = kwargs.get('nbins', _NBINS)
-  name = kwargs.get('name', None)
-  with ops.name_scope(name, 'cdf', [values, value_range, nbins]):
-    values = ops.convert_to_tensor(values, name='values')
-    nbins_float = np.float32(nbins)
-
-    # Map tensor values that fall within value_range to [0, 1].
-    scaled_values = math_ops.truediv(
-        values - value_range[0],
-        value_range[1] - value_range[0],
-        name='scaled_values')
-
-    # map tensor values within the open interval value_range to {0,.., nbins-1},
-    # values outside the open interval will be zero or less, or nbins or more.
-    indices = math_ops.floor(nbins_float * scaled_values, name='indices')
-
-    # Clip edge cases (e.g. value = value_range[1]) or "outliers."
-    indices = math_ops.cast(
-        clip_ops.clip_by_value(indices, 0, nbins_float - 1), dtypes.int32)
-
-    cdf = array_ops.zeros(nbins)
-    i = constant_op.constant(0)
-
-    def loop_cond(loop_count, _):
-      return math_ops.less(loop_count, nbins)
-
-    def loop_body(loop_count, cdf):
-      temp = math_ops.reduce_sum(
-          math_ops.cast(
-              math_ops.less_equal(indices, loop_count), dtypes.float32))
-      cdf = math_ops.add(
-          cdf,
-          array_ops.one_hot(
-              loop_count, depth=nbins, on_value=temp, off_value=0.0))
-      return [loop_count + 1, cdf]
-
-    _, cdf = control_flow_ops.while_loop(
-        loop_cond, loop_body, [i, cdf], maximum_iterations=nbins)
-
-    return math_ops.div(cdf, math_ops.reduce_max(cdf))
-
-
 def factorized_pool(input_tensor,
                     window_shape,
                     pooling_type,
diff --git a/tensorflow/contrib/model_pruning/python/pruning_utils_test.py b/tensorflow/contrib/model_pruning/python/pruning_utils_test.py
index d6f2bfcb6c2e2beda912eb538d8a4a0a17b486b3..b85bc413155d53cd6d53e98dae0ad626531f61eb 100644
--- a/tensorflow/contrib/model_pruning/python/pruning_utils_test.py
+++ b/tensorflow/contrib/model_pruning/python/pruning_utils_test.py
@@ -19,13 +19,9 @@ from __future__ import division
 from __future__ import print_function
 
 from absl.testing import parameterized
-import numpy as np
 
 from tensorflow.contrib.model_pruning.python import pruning_utils
-from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variable_scope
@@ -33,57 +29,6 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
-class PruningUtilsTest(test.TestCase):
-
-  def _compare_cdf(self, values):
-    abs_values = math_ops.abs(values)
-    max_value = math_ops.reduce_max(abs_values)
-    with self.cached_session():
-      variables.global_variables_initializer().run()
-      cdf_from_histogram = pruning_utils.compute_cdf_from_histogram(
-          abs_values, [0.0, max_value], nbins=pruning_utils._NBINS)
-      cdf = pruning_utils.compute_cdf(abs_values, [0.0, max_value])
-      self.assertAllEqual(cdf.eval(), cdf_from_histogram.eval())
-
-  def testHistogram(self):
-    width = 10
-    height = 10
-    nbins = 100
-    expected_histogram = np.full(nbins, 1.0)
-    init = init_ops.constant_initializer(np.linspace(0.0, 1.0, width * height))
-    weights = variable_scope.get_variable(
-        "weights", [width, height], initializer=init)
-    histogram = pruning_utils._histogram(
-        weights, [0, 1.0], nbins, dtype=np.float32)
-    with self.cached_session():
-      variables.global_variables_initializer().run()
-      computed_histogram = histogram.eval()
-    self.assertAllEqual(expected_histogram, computed_histogram)
-
-  def testCDF(self):
-    nbins = 5
-    weights = constant_op.constant([-1, 0, 1, 1.5, 2, 3, 4, 5, 10, 100])
-    abs_weights = math_ops.abs(weights)
-    norm_cdf = pruning_utils.compute_cdf_from_histogram(
-        abs_weights, [0.0, 5.0], nbins=nbins)
-    expected_cdf = np.array([0.1, 0.4, 0.5, 0.6, 1.0], dtype=np.float32)
-    with self.cached_session() as sess:
-      variables.global_variables_initializer().run()
-      norm_cdf_val = sess.run(norm_cdf)
-      self.assertAllEqual(len(norm_cdf_val), nbins)
-      self.assertAllEqual(expected_cdf, norm_cdf_val)
-
-  def testCDFEquivalence2D(self):
-    width = 100
-    height = 100
-    weights = variable_scope.get_variable("weights", shape=[width, height])
-    self._compare_cdf(weights)
-
-  def testCDFEquivalence4D(self):
-    weights = variable_scope.get_variable("weights", shape=[5, 5, 128, 128])
-    self._compare_cdf(weights)
-
-
 @parameterized.named_parameters(
     ("Input_32x32_block_1x1", [32, 32], [1, 1]),
     # block size 6x6
diff --git a/tensorflow/contrib/mpi/mpi_server_lib.cc b/tensorflow/contrib/mpi/mpi_server_lib.cc
index a31fa9ce0b3110d875689d74a41ca9f9cc85f532..e44e10af0814ba8d6d964dfc34a0470ce45c0b40 100644
--- a/tensorflow/contrib/mpi/mpi_server_lib.cc
+++ b/tensorflow/contrib/mpi/mpi_server_lib.cc
@@ -54,7 +54,10 @@ MPIServer::~MPIServer() {
 
 Status MPIServer::Init(ServiceInitFunction service_func,
                        RendezvousMgrCreationFunction rendezvous_mgr_func) {
-  Status s = GrpcServer::Init(service_func, rendezvous_mgr_func);
+  GrpcServerOptions opts;
+  opts.service_func = service_func;
+  opts.rendezvous_mgr_func = rendezvous_mgr_func;
+  Status s = GrpcServer::Init(opts);
   return s;
 }
 
diff --git a/tensorflow/contrib/mpi_collectives/BUILD b/tensorflow/contrib/mpi_collectives/BUILD
index ecac06354d2ce796f2a6021cdf2370d7c30ccab7..a7be92a35e0d62a61f7923ac61bb2c1267d039c6 100644
--- a/tensorflow/contrib/mpi_collectives/BUILD
+++ b/tensorflow/contrib/mpi_collectives/BUILD
@@ -52,7 +52,6 @@ tf_custom_op_library(
     deps = [
         ":mpi_defines",
         ":mpi_message_proto_cc",
-        "//tensorflow/stream_executor:stream_executor_headers_lib",
         "//third_party/mpi",
     ],
 )
diff --git a/tensorflow/contrib/mpi_collectives/ring.h b/tensorflow/contrib/mpi_collectives/ring.h
index cae57ce60eb09509af69f8ccab9eacedea361548..9b5d52e1b648e62af93d5420885e4f22796e3ea1 100644
--- a/tensorflow/contrib/mpi_collectives/ring.h
+++ b/tensorflow/contrib/mpi_collectives/ring.h
@@ -129,7 +129,7 @@ cudaStream_t CudaStreamForMPI();
  *  has the fully accumulated Segment 1; and so on. The scatter-reduce is
  * complete.
  *
- *  Next, the allgather distributes these fully accumululated chunks across all
+ *  Next, the allgather distributes these fully accumulated chunks across all
  * nodes. Communication proceeds in the same ring, once again in N-1 steps. At
  * the ith step, node j will send chunk (j - i + 1) and receive chunk (j - i).
  * For example, at the first iteration, the following transfers will occur:
diff --git a/tensorflow/contrib/opt/BUILD b/tensorflow/contrib/opt/BUILD
index f4ac70eb1a720c2acc3ef942f269228156749cba..f30643cf3059754daaeee4093938ac47b26f76ea 100644
--- a/tensorflow/contrib/opt/BUILD
+++ b/tensorflow/contrib/opt/BUILD
@@ -14,6 +14,7 @@ py_library(
     name = "opt_py",
     srcs = [
         "__init__.py",
+        "python/training/adam_gs_optimizer.py",
         "python/training/adamax.py",
         "python/training/addsign.py",
         "python/training/agn_optimizer.py",
@@ -22,6 +23,7 @@ py_library(
         "python/training/external_optimizer.py",
         "python/training/ggt.py",
         "python/training/lars_optimizer.py",
+        "python/training/lazy_adam_gs_optimizer.py",
         "python/training/lazy_adam_optimizer.py",
         "python/training/matrix_functions.py",
         "python/training/model_average_optimizer.py",
@@ -60,6 +62,21 @@ py_library(
     ],
 )
 
+py_test(
+    name = "adam_gs_optimizer_test",
+    srcs = ["python/training/adam_gs_optimizer_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":opt_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:training",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "adamax_test",
     srcs = ["python/training/adamax_test.py"],
@@ -148,6 +165,25 @@ py_test(
     ],
 )
 
+py_test(
+    name = "lazy_adam_gs_optimizer_test",
+    srcs = ["python/training/lazy_adam_gs_optimizer_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":opt_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:variables",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
 py_test(
     name = "lazy_adam_optimizer_test",
     srcs = ["python/training/lazy_adam_optimizer_test.py"],
@@ -283,6 +319,9 @@ tf_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//third_party/py/numpy",
     ],
+    tags = [
+        "oss_serial",
+    ],
 )
 
 tf_py_test(
@@ -374,8 +413,9 @@ py_test(
 
 py_test(
     name = "shampoo_test",
-    size = "large",
+    size = "medium",
     srcs = ["python/training/shampoo_test.py"],
+    shard_count = 4,
     srcs_version = "PY2AND3",
     deps = [
         ":opt_py",
diff --git a/tensorflow/contrib/opt/__init__.py b/tensorflow/contrib/opt/__init__.py
index c7ea68efa9a13a471bba3f41d0600855793b20a2..e8fc52342ceabb47da97ca0f3c8a01e419a221a1 100644
--- a/tensorflow/contrib/opt/__init__.py
+++ b/tensorflow/contrib/opt/__init__.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=wildcard-import
+from tensorflow.contrib.opt.python.training.adam_gs_optimizer import *
 from tensorflow.contrib.opt.python.training.adamax import *
 from tensorflow.contrib.opt.python.training.addsign import *
 from tensorflow.contrib.opt.python.training.agn_optimizer import *
@@ -28,6 +29,7 @@ from tensorflow.contrib.opt.python.training.external_optimizer import *
 from tensorflow.contrib.opt.python.training.lars_optimizer import *
 from tensorflow.contrib.opt.python.training.ggt import *
 from tensorflow.contrib.opt.python.training.lazy_adam_optimizer import *
+from tensorflow.contrib.opt.python.training.lazy_adam_gs_optimizer import *
 from tensorflow.contrib.opt.python.training.model_average_optimizer import *
 from tensorflow.contrib.opt.python.training.moving_average_optimizer import *
 from tensorflow.contrib.opt.python.training.multitask_optimizer_wrapper import *
@@ -44,12 +46,14 @@ from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = [
     'AdaMaxOptimizer',
+    'AdamGSOptimizer',
     'PowerSignOptimizer',
     'AddSignOptimizer',
     'DelayCompensatedGradientDescentOptimizer',
     'DropStaleGradientOptimizer',
     'ExternalOptimizerInterface',
     'LARSOptimizer',
+    'LazyAdamGSOptimizer',
     'LazyAdamOptimizer',
     'NadamOptimizer',
     'MovingAverageOptimizer',
diff --git a/tensorflow/contrib/opt/python/training/adam_gs_optimizer.py b/tensorflow/contrib/opt/python/training/adam_gs_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b149ed17533adff3bd7cd8fd8ff94d171f72911
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/adam_gs_optimizer.py
@@ -0,0 +1,223 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Adam rewrite to use global step for computing beta1 & beta2 accumulation."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.training import optimizer
+from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export("train.AdamOptimizer")
+class AdamGSOptimizer(optimizer.Optimizer):
+  """Optimizer that implements the Adam algorithm.
+
+  See [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
+  ([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
+  """
+
+  def __init__(self,
+               global_step=0,
+               learning_rate=0.001,
+               beta1=0.9,
+               beta2=0.999,
+               epsilon=1e-8,
+               use_locking=False,
+               name="Adam"):
+    r"""Construct a new Adam optimizer.
+
+    Branched from tf.train.AdamOptimizer. The only difference is to pass
+    global step for computing beta1 and beta2 accumulators, instead of having
+    optimizer keep its own independent beta1 and beta2 accumulators as non-slot
+    variables.
+
+    Initialization:
+
+    $$m_0 := 0 \text{(Initialize initial 1st moment vector)}$$
+    $$v_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
+    $$t := 0 \text{(Initialize timestep)}$$
+
+    The update rule for `variable` with gradient `g` uses an optimization
+    described at the end of section2 of the paper:
+
+    $$t := t + 1$$
+    $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
+
+    $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+    $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+    $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
+
+    The default value of 1e-8 for epsilon might not be a good default in
+    general. For example, when training an Inception network on ImageNet a
+    current good choice is 1.0 or 0.1. Note that since AdamOptimizer uses the
+    formulation just before Section 2.1 of the Kingma and Ba paper rather than
+    the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
+    hat" in the paper.
+
+    The sparse implementation of this algorithm (used when the gradient is an
+    IndexedSlices object, typically because of `tf.gather` or an embedding
+    lookup in the forward pass) does apply momentum to variable slices even if
+    they were not used in the forward pass (meaning they have a gradient equal
+    to zero). Momentum decay (beta1) is also applied to the entire momentum
+    accumulator. This means that the sparse behavior is equivalent to the dense
+    behavior (in contrast to some momentum implementations which ignore momentum
+    unless a variable slice was actually used).
+
+    Args:
+      global_step: tensorflow variable indicating the step.
+      learning_rate: A Tensor or a floating point value.  The learning rate.
+      beta1: A float value or a constant float tensor. The exponential decay
+        rate for the 1st moment estimates.
+      beta2: A float value or a constant float tensor. The exponential decay
+        rate for the 2nd moment estimates.
+      epsilon: A small constant for numerical stability. This epsilon is
+        "epsilon hat" in the Kingma and Ba paper (in the formula just before
+        Section 2.1), not the epsilon in Algorithm 1 of the paper.
+      use_locking: If True use locks for update operations.
+      name: Optional name for the operations created when applying gradients.
+        Defaults to "Adam".  @compatibility(eager) When eager execution is
+        enabled, `learning_rate`, `beta1`, `beta2`, and `epsilon` can each be a
+        callable that takes no arguments and returns the actual value to use.
+        This can be useful for changing these values across different
+        invocations of optimizer functions. @end_compatibility
+    """
+    super(AdamGSOptimizer, self).__init__(use_locking, name)
+    self._lr = learning_rate
+    self._beta1 = beta1
+    self._beta2 = beta2
+    self._epsilon = epsilon
+    self._global_step = global_step
+    self._global_step_on_worker = None
+
+    # Tensor versions of the constructor arguments, created in _prepare().
+    self._lr_t = None
+    self._beta1_t = None
+    self._beta2_t = None
+    self._epsilon_t = None
+
+  def _get_beta_accumulators(self):
+    return (math_ops.pow(self._beta1_t, self._global_step_on_worker),
+            math_ops.pow(self._beta2_t, self._global_step_on_worker))
+
+  def _create_slots(self, var_list):
+    # Create slots for the first and second moments.
+    for v in var_list:
+      self._zeros_slot(v, "m", self._name)
+      self._zeros_slot(v, "v", self._name)
+
+  def _prepare(self):
+    lr = self._call_if_callable(self._lr)
+    beta1 = self._call_if_callable(self._beta1)
+    beta2 = self._call_if_callable(self._beta2)
+    epsilon = self._call_if_callable(self._epsilon)
+
+    self._lr_t = ops.convert_to_tensor(lr, name="learning_rate")
+    self._beta1_t = ops.convert_to_tensor(beta1, name="beta1")
+    self._beta2_t = ops.convert_to_tensor(beta2, name="beta2")
+    self._epsilon_t = ops.convert_to_tensor(epsilon, name="epsilon")
+
+    # Performance optimization so that worker creates a copy of the global step
+    # to avoid overloading the parameter server holding the global step.
+    self._global_step_on_worker = math_ops.cast(
+        array_ops.identity(self._global_step) + 1, dtypes.float32)
+
+  def _apply_dense(self, grad, var):
+    m = self.get_slot(var, "m")
+    v = self.get_slot(var, "v")
+    beta1_power, beta2_power = self._get_beta_accumulators()
+    return training_ops.apply_adam(
+        var,
+        m,
+        v,
+        math_ops.cast(beta1_power, var.dtype.base_dtype),
+        math_ops.cast(beta2_power, var.dtype.base_dtype),
+        math_ops.cast(self._lr_t, var.dtype.base_dtype),
+        math_ops.cast(self._beta1_t, var.dtype.base_dtype),
+        math_ops.cast(self._beta2_t, var.dtype.base_dtype),
+        math_ops.cast(self._epsilon_t, var.dtype.base_dtype),
+        grad,
+        use_locking=self._use_locking).op
+
+  def _resource_apply_dense(self, grad, var):
+    m = self.get_slot(var, "m")
+    v = self.get_slot(var, "v")
+    beta1_power, beta2_power = self._get_beta_accumulators()
+    return training_ops.resource_apply_adam(
+        var.handle,
+        m.handle,
+        v.handle,
+        math_ops.cast(beta1_power, grad.dtype.base_dtype),
+        math_ops.cast(beta2_power, grad.dtype.base_dtype),
+        math_ops.cast(self._lr_t, grad.dtype.base_dtype),
+        math_ops.cast(self._beta1_t, grad.dtype.base_dtype),
+        math_ops.cast(self._beta2_t, grad.dtype.base_dtype),
+        math_ops.cast(self._epsilon_t, grad.dtype.base_dtype),
+        grad,
+        use_locking=self._use_locking)
+
+  def _apply_sparse_shared(self, grad, var, indices, scatter_add):
+    beta1_power, beta2_power = self._get_beta_accumulators()
+    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
+    beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
+    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
+    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
+    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
+    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
+    lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))
+    # m_t = beta1 * m + (1 - beta1) * g_t
+    m = self.get_slot(var, "m")
+    m_scaled_g_values = grad * (1 - beta1_t)
+    m_t = state_ops.assign(m, m * beta1_t, use_locking=self._use_locking)
+    with ops.control_dependencies([m_t]):
+      m_t = scatter_add(m, indices, m_scaled_g_values)
+    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
+    v = self.get_slot(var, "v")
+    v_scaled_g_values = (grad * grad) * (1 - beta2_t)
+    v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
+    with ops.control_dependencies([v_t]):
+      v_t = scatter_add(v, indices, v_scaled_g_values)
+    v_sqrt = math_ops.sqrt(v_t)
+    var_update = state_ops.assign_sub(
+        var, lr * m_t / (v_sqrt + epsilon_t), use_locking=self._use_locking)
+    return control_flow_ops.group(*[var_update, m_t, v_t])
+
+  def _apply_sparse(self, grad, var):
+    return self._apply_sparse_shared(
+        grad.values,
+        var,
+        grad.indices,
+        lambda x, i, v: state_ops.scatter_add(  # pylint: disable=g-long-lambda
+            x,
+            i,
+            v,
+            use_locking=self._use_locking))
+
+  def _resource_scatter_add(self, x, i, v):
+    with ops.control_dependencies(
+        [resource_variable_ops.resource_scatter_add(x.handle, i, v)]):
+      return x.value()
+
+  def _resource_apply_sparse(self, grad, var, indices):
+    return self._apply_sparse_shared(grad, var, indices,
+                                     self._resource_scatter_add)
diff --git a/tensorflow/contrib/opt/python/training/adam_gs_optimizer_test.py b/tensorflow/contrib/opt/python/training/adam_gs_optimizer_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c68c965aef3729bebe7d0e0dd707c344321d9e3f
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/adam_gs_optimizer_test.py
@@ -0,0 +1,382 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Tests for AdamGS."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.opt.python.training import adam_gs_optimizer
+from tensorflow.python.client import session
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+def adam_update_numpy(param,
+                      g_t,
+                      t,
+                      m,
+                      v,
+                      alpha=0.001,
+                      beta1=0.9,
+                      beta2=0.999,
+                      epsilon=1e-8):
+  alpha_t = alpha * np.sqrt(1 - beta2**t) / (1 - beta1**t)
+
+  m_t = beta1 * m + (1 - beta1) * g_t
+  v_t = beta2 * v + (1 - beta2) * g_t * g_t
+
+  param_t = param - alpha_t * m_t / (np.sqrt(v_t) + epsilon)
+  return param_t, m_t, v_t
+
+
+class AdamGSOptimizerTest(test.TestCase):
+
+  def doTestSparse(self, use_resource=False):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          global_step = resource_variable_ops.ResourceVariable(
+              array_ops.zeros([], dtypes.int64))
+          var0 = resource_variable_ops.ResourceVariable(var0_np)
+          var1 = resource_variable_ops.ResourceVariable(var1_np)
+        else:
+          global_step = variables.Variable(array_ops.zeros([], dtypes.int64))
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+        grads0_np_indices = np.array([0, 1], dtype=np.int32)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(grads0_np),
+            constant_op.constant(grads0_np_indices), constant_op.constant([2]))
+        grads1_np_indices = np.array([0, 1], dtype=np.int32)
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(grads1_np),
+            constant_op.constant(grads1_np_indices), constant_op.constant([2]))
+        opt = adam_gs_optimizer.AdamGSOptimizer(global_step=global_step)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]),
+                                     global_step=global_step)
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Run 3 steps of Adam
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, self.evaluate(beta1_power))
+          self.assertAllCloseAccordingToType(0.999**t,
+                                             self.evaluate(beta2_power))
+          update.run()
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  def testSparse(self):
+    self.doTestSparse(use_resource=False)
+
+  def testResourceSparse(self):
+    self.doTestSparse(use_resource=True)
+
+  def testSparseDevicePlacement(self):
+    for index_dtype in [dtypes.int32, dtypes.int64]:
+      with self.cached_session(force_gpu=test.is_gpu_available()):
+        # If a GPU is available, tests that all optimizer ops can be placed on
+        # it (i.e. they have GPU kernels).
+        var = variables.Variable([[1.0], [2.0]])
+        indices = constant_op.constant([0, 1], dtype=index_dtype)
+        gathered_sum = math_ops.reduce_sum(array_ops.gather(var, indices))
+        optimizer = adam_gs_optimizer.AdamGSOptimizer(3.0)
+        minimize_op = optimizer.minimize(gathered_sum)
+        variables.global_variables_initializer().run()
+        minimize_op.run()
+
+  def testSparseRepeatedIndices(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        repeated_index_global_step = variables.Variable(
+            array_ops.zeros([], dtypes.int64))
+        aggregated_global_step = variables.Variable(
+            array_ops.zeros([], dtypes.int64))
+        repeated_index_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        aggregated_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        grad_repeated_index = ops.IndexedSlices(
+            constant_op.constant(
+                [0.1, 0.1], shape=[2, 1], dtype=dtype),
+            constant_op.constant([1, 1]),
+            constant_op.constant([2, 1]))
+        grad_aggregated = ops.IndexedSlices(
+            constant_op.constant(
+                [0.2], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]),
+            constant_op.constant([2, 1]))
+        repeated_update = adam_gs_optimizer.AdamGSOptimizer(
+            global_step=repeated_index_global_step).apply_gradients(
+                [(grad_repeated_index, repeated_index_update_var)],
+                global_step=repeated_index_global_step)
+        aggregated_update = adam_gs_optimizer.AdamGSOptimizer(
+            global_step=aggregated_global_step).apply_gradients(
+                [(grad_aggregated, aggregated_update_var)],
+                global_step=aggregated_global_step)
+        variables.global_variables_initializer().run()
+        self.assertAllClose(aggregated_update_var.eval(),
+                            self.evaluate(repeated_index_update_var))
+        for _ in range(3):
+          repeated_update.run()
+          aggregated_update.run()
+          self.assertAllClose(aggregated_update_var.eval(),
+                              self.evaluate(repeated_index_update_var))
+
+  def doTestBasic(self, use_resource=False, use_callable_params=False):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      with self.session(graph=ops.Graph()):
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          global_step = resource_variable_ops.ResourceVariable(
+              array_ops.zeros([], dtypes.int64), name="global_step_%d" % i)
+          var0 = resource_variable_ops.ResourceVariable(
+              var0_np, name="var0_%d" % i)
+          var1 = resource_variable_ops.ResourceVariable(
+              var1_np, name="var1_%d" % i)
+        else:
+          global_step = variables.Variable(array_ops.zeros([], dtypes.int64))
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        learning_rate = lambda: 0.001
+        beta1 = lambda: 0.9
+        beta2 = lambda: 0.999
+        epsilon = lambda: 1e-8
+        if not use_callable_params:
+          learning_rate = learning_rate()
+          beta1 = beta1()
+          beta2 = beta2()
+          epsilon = epsilon()
+
+        opt = adam_gs_optimizer.AdamGSOptimizer(global_step=global_step,
+                                                learning_rate=learning_rate)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]),
+                                     global_step=global_step)
+        opt_variables = opt.variables()
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+        self.assertTrue(beta1_power is not None)
+        self.assertTrue(beta2_power is not None)
+        self.assertNotIn(beta1_power, opt_variables)
+        self.assertNotIn(beta2_power, opt_variables)
+
+        if not context.executing_eagerly():
+          with ops.Graph().as_default():
+            # Shouldn't return non-slot variables from other graphs.
+            self.assertEqual(0, len(opt.variables()))
+          self.evaluate(variables.global_variables_initializer())
+          # Fetch params to validate initial values
+          self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+          self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+        # Run 3 steps of Adam
+        for t in range(1, 4):
+          if not context.executing_eagerly():
+            self.evaluate(update)
+            self.assertAllCloseAccordingToType(
+                0.9**(t + 1), self.evaluate(beta1_power))
+            self.assertAllCloseAccordingToType(
+                0.999**(t + 1), self.evaluate(beta2_power))
+          else:
+            if t > 1:
+              opt.apply_gradients(zip([grads0, grads1], [var0, var1]),
+                                  global_step=global_step)
+              beta1_power, beta2_power = opt._get_beta_accumulators()
+              self.assertAllCloseAccordingToType(
+                  0.9**t, self.evaluate(beta1_power))
+              self.assertAllCloseAccordingToType(
+                  0.999**t, self.evaluate(beta2_power))
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+          if use_resource:
+            self.assertEqual("var0_%d/Adam:0" % (i,),
+                             opt.get_slot(var=var0, name="m").name)
+
+  def testBasic(self):
+    with self.cached_session():
+      self.doTestBasic(use_resource=False)
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testResourceBasic(self):
+    self.doTestBasic(use_resource=True)
+
+  def testBasicCallableParams(self):
+    with context.eager_mode():
+      self.doTestBasic(use_resource=True, use_callable_params=True)
+
+  def testTensorLearningRate(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        global_step = variables.Variable(array_ops.zeros([], dtypes.int64))
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = adam_gs_optimizer.AdamGSOptimizer(
+            global_step=global_step, learning_rate=constant_op.constant(0.001))
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]),
+                                     global_step=global_step)
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Run 3 steps of Adam
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, self.evaluate(beta1_power))
+          self.assertAllCloseAccordingToType(0.999**t,
+                                             self.evaluate(beta2_power))
+          update.run()
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  def testSharing(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        global_step = variables.Variable(array_ops.zeros([], dtypes.int64))
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = adam_gs_optimizer.AdamGSOptimizer(global_step=global_step)
+        update1 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]),
+                                      global_step=global_step)
+        update2 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]),
+                                      global_step=global_step)
+        variables.global_variables_initializer().run()
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+        # Run 3 steps of intertwined Adam1 and Adam2.
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, self.evaluate(beta1_power))
+          self.assertAllCloseAccordingToType(0.999**t,
+                                             self.evaluate(beta2_power))
+          if t % 2 == 0:
+            update1.run()
+          else:
+            update2.run()
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  def testTwoSessions(self):
+    optimizer = adam_gs_optimizer.AdamGSOptimizer()
+
+    with context.eager_mode():
+      var0 = variables.Variable(np.array([1.0, 2.0]), name="v0")
+      grads0 = constant_op.constant(np.array([0.1, 0.1]))
+      optimizer.apply_gradients([(grads0, var0)])
+
+    g = ops.Graph()
+    with g.as_default():
+      with session.Session():
+        var0 = variables.Variable(np.array([1.0, 2.0]), name="v0")
+        grads0 = constant_op.constant(np.array([0.1, 0.1]))
+        optimizer.apply_gradients([(grads0, var0)])
+
+    gg = ops.Graph()
+    with gg.as_default():
+      with session.Session():
+        var0 = variables.Variable(np.array([1.0, 2.0]), name="v0")
+        grads0 = constant_op.constant(np.array([0.1, 0.1]))
+
+        # If the optimizer saves any state not keyed by graph the following line
+        # fails.
+        optimizer.apply_gradients([(grads0, var0)])
+
+  def testSlotsUniqueEager(self):
+    with context.eager_mode():
+      v1 = resource_variable_ops.ResourceVariable(1.)
+      v2 = resource_variable_ops.ResourceVariable(1.)
+      opt = adam_gs_optimizer.AdamGSOptimizer(1.)
+      opt.minimize(lambda: v1 + v2)
+      # There should be two unique slot variables for v1 and v2 respectively.
+      self.assertEqual(4, len(set(opt.variables())))
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/opt/python/training/lazy_adam_gs_optimizer.py b/tensorflow/contrib/opt/python/training/lazy_adam_gs_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..8827007e4d7f6722398a8e36bd626377842d92ef
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/lazy_adam_gs_optimizer.py
@@ -0,0 +1,114 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""LazyAdam rewrite to use global step for computing beta1 & beta2 accumulation.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.opt.python.training import adam_gs_optimizer
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+
+
+class LazyAdamGSOptimizer(adam_gs_optimizer.AdamGSOptimizer):
+  """Variant of the Adam optimizer that handles sparse updates more efficiently.
+
+  Branched from tf.contrib.opt.LazyAdamGSOptimizer. The only difference is to
+  pass global step for computing beta1 and beta2 accumulators, instead of having
+  optimizer keep its own independent beta1 and beta2 accumulators as non-slot
+  variables.
+
+  The original Adam algorithm maintains two moving-average accumulators for
+  each trainable variable; the accumulators are updated at every step.
+  This class provides lazier handling of gradient updates for sparse variables.
+  It only updates moving-average accumulators for sparse variable indices that
+  appear in the current batch, rather than updating the accumulators for all
+  indices. Compared with the original Adam optimizer, it can provide large
+  improvements in model training throughput for some applications. However, it
+  provides slightly different semantics than the original Adam algorithm, and
+  may lead to different empirical results.
+  """
+
+  def _apply_sparse(self, grad, var):
+    beta1_power, beta2_power = self._get_beta_accumulators()
+    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
+    beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
+    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
+    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
+    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
+    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
+    lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))
+
+    # \\(m := beta1 * m + (1 - beta1) * g_t\\)
+    m = self.get_slot(var, "m")
+    m_t = state_ops.scatter_update(m, grad.indices,
+                                   beta1_t * array_ops.gather(m, grad.indices) +
+                                   (1 - beta1_t) * grad.values,
+                                   use_locking=self._use_locking)
+
+    # \\(v := beta2 * v + (1 - beta2) * (g_t * g_t)\\)
+    v = self.get_slot(var, "v")
+    v_t = state_ops.scatter_update(v, grad.indices,
+                                   beta2_t * array_ops.gather(v, grad.indices) +
+                                   (1 - beta2_t) * math_ops.square(grad.values),
+                                   use_locking=self._use_locking)
+
+    # \\(variable -= learning_rate * m_t / (epsilon_t + sqrt(v_t))\\)
+    m_t_slice = array_ops.gather(m_t, grad.indices)
+    v_t_slice = array_ops.gather(v_t, grad.indices)
+    denominator_slice = math_ops.sqrt(v_t_slice) + epsilon_t
+    var_update = state_ops.scatter_sub(var, grad.indices,
+                                       lr * m_t_slice / denominator_slice,
+                                       use_locking=self._use_locking)
+    return control_flow_ops.group(var_update, m_t, v_t)
+
+  def _resource_apply_sparse(self, grad, var, indices):
+    beta1_power, beta2_power = self._get_beta_accumulators()
+    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
+    beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
+    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
+    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
+    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
+    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
+    lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))
+
+    # \\(m := beta1 * m + (1 - beta1) * g_t\\)
+    m = self.get_slot(var, "m")
+    m_t_slice = beta1_t * array_ops.gather(m, indices) + (1 - beta1_t) * grad
+    m_update_op = resource_variable_ops.resource_scatter_update(m.handle,
+                                                                indices,
+                                                                m_t_slice)
+
+    # \\(v := beta2 * v + (1 - beta2) * (g_t * g_t)\\)
+    v = self.get_slot(var, "v")
+    v_t_slice = (beta2_t * array_ops.gather(v, indices) +
+                 (1 - beta2_t) * math_ops.square(grad))
+    v_update_op = resource_variable_ops.resource_scatter_update(v.handle,
+                                                                indices,
+                                                                v_t_slice)
+
+    # \\(variable -= learning_rate * m_t / (epsilon_t + sqrt(v_t))\\)
+    var_slice = lr * m_t_slice / (math_ops.sqrt(v_t_slice) + epsilon_t)
+    var_update_op = resource_variable_ops.resource_scatter_sub(var.handle,
+                                                               indices,
+                                                               var_slice)
+
+    return control_flow_ops.group(var_update_op, m_update_op, v_update_op)
diff --git a/tensorflow/contrib/opt/python/training/lazy_adam_gs_optimizer_test.py b/tensorflow/contrib/opt/python/training/lazy_adam_gs_optimizer_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..bdc9a02a546c8399172d0c5b58941b4d80179955
--- /dev/null
+++ b/tensorflow/contrib/opt/python/training/lazy_adam_gs_optimizer_test.py
@@ -0,0 +1,402 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Tests for LazyAdamGSOptimizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.contrib.opt.python.training import lazy_adam_gs_optimizer
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+def adam_update_numpy(param,
+                      g_t,
+                      t,
+                      m,
+                      v,
+                      alpha=0.001,
+                      beta1=0.9,
+                      beta2=0.999,
+                      epsilon=1e-8):
+  alpha_t = alpha * np.sqrt(1 - beta2**t) / (1 - beta1**t)
+
+  m_t = beta1 * m + (1 - beta1) * g_t
+  v_t = beta2 * v + (1 - beta2) * g_t * g_t
+
+  param_t = param - alpha_t * m_t / (np.sqrt(v_t) + epsilon)
+  return param_t, m_t, v_t
+
+
+class LazyAdamGSOptimizerTest(test.TestCase, parameterized.TestCase):
+
+  @parameterized.parameters([False, True])
+  def testSparse(self, use_resource):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          global_step = resource_variable_ops.ResourceVariable(
+              array_ops.zeros([], dtypes.int64))
+          var0 = resource_variable_ops.ResourceVariable(var0_np)
+          var1 = resource_variable_ops.ResourceVariable(var1_np)
+        else:
+          global_step = variables.Variable(array_ops.zeros([], dtypes.int64))
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+
+        grads0_np_indices = np.array([0, 1], dtype=np.int32)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(grads0_np),
+            constant_op.constant(grads0_np_indices), constant_op.constant([2]))
+        grads1_np_indices = np.array([0, 1], dtype=np.int32)
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(grads1_np),
+            constant_op.constant(grads1_np_indices), constant_op.constant([2]))
+        opt = lazy_adam_gs_optimizer.LazyAdamGSOptimizer(
+            global_step=global_step)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]),
+                                     global_step=global_step)
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Run 3 steps of Adam
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          update.run()
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  @parameterized.parameters([False, True])
+  def testSparseDevicePlacement(self, use_resource):
+    for index_dtype in [dtypes.int32, dtypes.int64]:
+      with self.cached_session(force_gpu=test.is_gpu_available()):
+        # If a GPU is available, tests that all optimizer ops can be placed on
+        # it (i.e. they have GPU kernels).
+        if use_resource:
+          global_step = resource_variable_ops.ResourceVariable(
+              array_ops.zeros([], dtypes.int64))
+          var = resource_variable_ops.ResourceVariable([[1.0], [2.0]])
+        else:
+          global_step = variables.Variable(array_ops.zeros([], dtypes.int64))
+          var = variables.Variable([[1.0], [2.0]])
+
+        indices = constant_op.constant([0, 1], dtype=index_dtype)
+        gathered_sum = math_ops.reduce_sum(array_ops.gather(var, indices))
+        optimizer = lazy_adam_gs_optimizer.LazyAdamGSOptimizer(
+            global_step=global_step, learning_rate=3.0)
+        minimize_op = optimizer.minimize(gathered_sum, global_step=global_step)
+        variables.global_variables_initializer().run()
+        minimize_op.run()
+
+  @parameterized.parameters([False, True])
+  def testSparseRepeatedIndices(self, use_resource):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        if use_resource:
+          repeated_index_global_step = resource_variable_ops.ResourceVariable(
+              array_ops.zeros([], dtypes.int64))
+          aggregated_global_step = resource_variable_ops.ResourceVariable(
+              array_ops.zeros([], dtypes.int64))
+          repeated_index_update_var = resource_variable_ops.ResourceVariable(
+              [[1.0], [2.0]], dtype=dtype)
+          aggregated_update_var = resource_variable_ops.ResourceVariable(
+              [[1.0], [2.0]], dtype=dtype)
+        else:
+          repeated_index_global_step = variables.Variable(
+              array_ops.zeros([], dtypes.int64))
+          aggregated_global_step = variables.Variable(
+              array_ops.zeros([], dtypes.int64))
+          repeated_index_update_var = variables.Variable(
+              [[1.0], [2.0]], dtype=dtype)
+          aggregated_update_var = variables.Variable(
+              [[1.0], [2.0]], dtype=dtype)
+
+        grad_repeated_index = ops.IndexedSlices(
+            constant_op.constant(
+                [0.1, 0.1], shape=[2, 1], dtype=dtype),
+            constant_op.constant([1, 1]),
+            constant_op.constant([2, 1]))
+        grad_aggregated = ops.IndexedSlices(
+            constant_op.constant(
+                [0.2], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]),
+            constant_op.constant([2, 1]))
+        repeated_update_opt = lazy_adam_gs_optimizer.LazyAdamGSOptimizer(
+            global_step=repeated_index_global_step)
+        repeated_update = repeated_update_opt.apply_gradients(
+            [(grad_repeated_index, repeated_index_update_var)],
+            global_step=repeated_index_global_step)
+        aggregated_update_opt = lazy_adam_gs_optimizer.LazyAdamGSOptimizer(
+            global_step=aggregated_global_step)
+        aggregated_update = aggregated_update_opt.apply_gradients(
+            [(grad_aggregated, aggregated_update_var)],
+            global_step=aggregated_global_step)
+        variables.global_variables_initializer().run()
+        self.assertAllClose(aggregated_update_var.eval(),
+                            repeated_index_update_var.eval())
+        for _ in range(3):
+          repeated_update.run()
+          aggregated_update.run()
+          self.assertAllClose(aggregated_update_var.eval(),
+                              repeated_index_update_var.eval())
+
+  def doTestBasic(self, use_resource=False, use_callable_params=False):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      with self.session(graph=ops.Graph()):
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          global_step = resource_variable_ops.ResourceVariable(
+              array_ops.zeros([], dtypes.int64), name="global_step_%d" % i)
+          var0 = resource_variable_ops.ResourceVariable(
+              var0_np, name="var0_%d" % i)
+          var1 = resource_variable_ops.ResourceVariable(
+              var1_np, name="var1_%d" % i)
+        else:
+          global_step = variables.Variable(array_ops.zeros([], dtypes.int64))
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        learning_rate = lambda: 0.001
+        beta1 = lambda: 0.9
+        beta2 = lambda: 0.999
+        epsilon = lambda: 1e-8
+        if not use_callable_params:
+          learning_rate = learning_rate()
+          beta1 = beta1()
+          beta2 = beta2()
+          epsilon = epsilon()
+
+        opt = lazy_adam_gs_optimizer.LazyAdamGSOptimizer(
+            global_step=global_step, learning_rate=learning_rate)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]),
+                                     global_step=global_step)
+        opt_variables = opt.variables()
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+        self.assertIsNotNone(beta1_power)
+        self.assertIsNotNone(beta2_power is not None)
+        self.assertNotIn(beta1_power, opt_variables)
+        self.assertNotIn(beta2_power, opt_variables)
+
+        if not context.executing_eagerly():
+          with ops.Graph().as_default():
+            # Shouldn't return non-slot variables from other graphs.
+            self.assertEqual(0, len(opt.variables()))
+          self.evaluate(variables.global_variables_initializer())
+          # Fetch params to validate initial values
+          self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+          self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+        # Run 3 steps of Adam
+        for t in range(1, 4):
+          if not context.executing_eagerly():
+            self.evaluate(update)
+            self.assertAllCloseAccordingToType(
+                0.9**(t + 1), self.evaluate(beta1_power))
+            self.assertAllCloseAccordingToType(
+                0.999**(t + 1), self.evaluate(beta2_power))
+          else:
+            if t > 1:
+              opt.apply_gradients(zip([grads0, grads1], [var0, var1]),
+                                  global_step=global_step)
+              beta1_power, beta2_power = opt._get_beta_accumulators()
+              self.assertAllCloseAccordingToType(
+                  0.9**t, self.evaluate(beta1_power))
+              self.assertAllCloseAccordingToType(
+                  0.999**t, self.evaluate(beta2_power))
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+          if use_resource:
+            self.assertEqual("var0_%d/Adam:0" % (i,),
+                             opt.get_slot(var=var0, name="m").name)
+
+  def testBasic(self):
+    with self.cached_session():
+      self.doTestBasic(use_resource=False)
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testResourceBasic(self):
+    self.doTestBasic(use_resource=True)
+
+  def testBasicCallableParams(self):
+    with context.eager_mode():
+      self.doTestBasic(use_resource=True, use_callable_params=True)
+
+  def testTensorLearningRate(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        global_step = variables.Variable(array_ops.zeros([], dtypes.int64))
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = lazy_adam_gs_optimizer.LazyAdamGSOptimizer(
+            global_step=global_step, learning_rate=constant_op.constant(0.001))
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]),
+                                     global_step=global_step)
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Run 3 steps of Adam
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          update.run()
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testSharing(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        global_step = variables.Variable(array_ops.zeros([], dtypes.int64))
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = lazy_adam_gs_optimizer.LazyAdamGSOptimizer(
+            global_step=global_step)
+        update1 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]),
+                                      global_step=global_step)
+        update2 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]),
+                                      global_step=global_step)
+        variables.global_variables_initializer().run()
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        # Run 3 steps of intertwined Adam1 and Adam2.
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          if t % 2 == 0:
+            update1.run()
+          else:
+            update2.run()
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testTwoSessions(self):
+    optimizer = lazy_adam_gs_optimizer.LazyAdamGSOptimizer()
+
+    with context.eager_mode():
+      var0 = variables.Variable(np.array([1.0, 2.0]), name="v0")
+      grads0 = constant_op.constant(np.array([0.1, 0.1]))
+      optimizer.apply_gradients([(grads0, var0)])
+
+    g = ops.Graph()
+    with g.as_default():
+      with self.session(graph=g):
+        var0 = variables.Variable(np.array([1.0, 2.0]), name="v0")
+        grads0 = constant_op.constant(np.array([0.1, 0.1]))
+        optimizer.apply_gradients([(grads0, var0)])
+
+    gg = ops.Graph()
+    with gg.as_default():
+      with self.session(graph=gg):
+        var0 = variables.Variable(np.array([1.0, 2.0]), name="v0")
+        grads0 = constant_op.constant(np.array([0.1, 0.1]))
+
+        # If the optimizer saves any state not keyed by graph the following line
+        # fails.
+        optimizer.apply_gradients([(grads0, var0)])
+
+  def testSlotsUniqueEager(self):
+    with context.eager_mode():
+      v1 = resource_variable_ops.ResourceVariable(1.)
+      v2 = resource_variable_ops.ResourceVariable(1.)
+      opt = lazy_adam_gs_optimizer.LazyAdamGSOptimizer(1.)
+      opt.minimize(lambda: v1 + v2)
+      # There should be two non-slot variables, and two unique slot variables
+      # for v1 and v2 respectively.
+      self.assertLen(set(opt.variables()), 4)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/optimizer_v2/adam.py b/tensorflow/contrib/optimizer_v2/adam.py
index 248ffb1f7eb5dc27112ddf9b8670344904065ed0..1b7800f324b908e3c88fe90d31a2a08cbbd5ccf2 100644
--- a/tensorflow/contrib/optimizer_v2/adam.py
+++ b/tensorflow/contrib/optimizer_v2/adam.py
@@ -36,7 +36,7 @@ class AdamOptimizer(optimizer_v2.OptimizerV2):
 
   def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
                use_locking=False, name="Adam"):
-    """Construct a new Adam optimizer.
+    r"""Construct a new Adam optimizer.
 
     Initialization:
 
diff --git a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
index 72019b31540a943582ebb4699013d9dcfc10769f..b2ea3daf82ed8daa6e0b9acd8e3cf258b8181615 100644
--- a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
+++ b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
@@ -44,14 +44,15 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.training import checkpoint_management
 from tensorflow.python.training import saver as core_saver
 from tensorflow.python.training import training_util
-from tensorflow.python.training.checkpointable import tracking
-from tensorflow.python.training.checkpointable import util
+from tensorflow.python.training.tracking import graph_view
+from tensorflow.python.training.tracking import tracking
+from tensorflow.python.training.tracking import util
 
 
-class NonLayerCheckpointable(tracking.Checkpointable):
+class NonLayerTrackable(tracking.AutoTrackable):
 
   def __init__(self):
-    super(NonLayerCheckpointable, self).__init__()
+    super(NonLayerTrackable, self).__init__()
     self.a_variable = util.add_variable(
         self, name="a_variable", shape=[])
 
@@ -64,8 +65,8 @@ class MyModel(training.Model):
     super(MyModel, self).__init__()
     self._named_dense = core.Dense(1, use_bias=True)
     self._second = core.Dense(1, use_bias=False)
-    # We can still track Checkpointables which aren't Layers.
-    self._non_layer = NonLayerCheckpointable()
+    # We can still track Trackables which aren't Layers.
+    self._non_layer = NonLayerTrackable()
 
   def call(self, values):
     ret = self._second(self._named_dense(values))
@@ -100,7 +101,7 @@ class CheckpointingTests(test.TestCase):
     other_model = MyModel()
     optimizer = adam.AdamOptimizer(0.001)
     optimizer_step = training_util.get_or_create_global_step()
-    root_checkpointable = util.Checkpoint(
+    root_trackable = util.Checkpoint(
         optimizer=optimizer, model=model, optimizer_step=optimizer_step)
     if context.executing_eagerly():
       optimizer.minimize(
@@ -116,11 +117,10 @@ class CheckpointingTests(test.TestCase):
           other_model(input_value),
           global_step=optimizer_step)
       self.evaluate(util.gather_initializers(
-          root_checkpointable))
+          root_trackable))
       self.evaluate(train_op)
-    named_variables, serialized_graph, _ = (
-        util._serialize_object_graph(
-            root_checkpointable, saveables_cache=None))
+    named_variables, serialized_graph, _ = graph_view.ObjectGraphView(
+        root_trackable).serialize_object_graph()
     expected_checkpoint_names = (
         # Created in the root node, so no prefix.
         "optimizer_step",
@@ -208,7 +208,7 @@ class CheckpointingTests(test.TestCase):
   def testSaveRestore(self):
     model = MyModel()
     optimizer = adam.AdamOptimizer(0.001)
-    root_checkpointable = util.Checkpoint(
+    root_trackable = util.Checkpoint(
         optimizer=optimizer, model=model)
     input_value = constant_op.constant([[3.]])
     if context.executing_eagerly():
@@ -217,24 +217,24 @@ class CheckpointingTests(test.TestCase):
     else:
       train_op = optimizer.minimize(model(input_value))
       # TODO(allenl): Make initialization more pleasant when graph building.
-      root_checkpointable.save_counter  # pylint: disable=pointless-statement
+      root_trackable.save_counter  # pylint: disable=pointless-statement
       self.evaluate(util.gather_initializers(
-          root_checkpointable))
+          root_trackable))
       self.evaluate(train_op)
     prefix = os.path.join(self.get_temp_dir(), "ckpt")
     self.evaluate(state_ops.assign(model._named_dense.variables[1], [42.]))
     m_bias_slot = optimizer.get_slot(model._named_dense.variables[1], "m")
     self.evaluate(state_ops.assign(m_bias_slot, [1.5]))
-    save_path = root_checkpointable.save(file_prefix=prefix)
+    save_path = root_trackable.save(file_prefix=prefix)
     self.evaluate(state_ops.assign(model._named_dense.variables[1], [43.]))
-    self.evaluate(state_ops.assign(root_checkpointable.save_counter, 3))
+    self.evaluate(state_ops.assign(root_trackable.save_counter, 3))
     optimizer_variables = self.evaluate(optimizer.variables())
     self.evaluate(state_ops.assign(m_bias_slot, [-2.]))
     # Immediate restoration
-    status = root_checkpointable.restore(save_path=save_path).assert_consumed()
+    status = root_trackable.restore(save_path=save_path).assert_consumed()
     status.run_restore_ops()
     self.assertAllEqual([42.], self.evaluate(model._named_dense.variables[1]))
-    self.assertAllEqual(1, self.evaluate(root_checkpointable.save_counter))
+    self.assertAllEqual(1, self.evaluate(root_trackable.save_counter))
     self.assertAllEqual([1.5], self.evaluate(m_bias_slot))
     if not context.executing_eagerly():
       return  # Restore-on-create is only supported when executing eagerly
@@ -440,7 +440,7 @@ class CheckpointingTests(test.TestCase):
   def testDeferredSlotRestoration(self):
     checkpoint_directory = self.get_temp_dir()
 
-    root = tracking.Checkpointable()
+    root = util.Checkpoint()
     root.var = util.add_variable(
         root, name="var", initializer=0.)
     optimizer = adam.AdamOptimizer(0.1)
@@ -455,21 +455,17 @@ class CheckpointingTests(test.TestCase):
           util.Checkpoint(root=root, optimizer=optimizer)))
       self.evaluate(train_op)
     self.evaluate(state_ops.assign(root.var, 12.))
-    no_slots_path = util.CheckpointableSaver(root).save(
-        os.path.join(checkpoint_directory, "no_slots"))
+    no_slots_path = root.save(os.path.join(checkpoint_directory, "no_slots"))
     root.optimizer = optimizer
     self.evaluate(state_ops.assign(root.var, 13.))
     self.evaluate(state_ops.assign(optimizer.get_slot(name="m", var=root.var),
                                    14.))
-    slots_path = util.CheckpointableSaver(root).save(
-        os.path.join(checkpoint_directory, "with_slots"))
-    new_root = tracking.Checkpointable()
+    slots_path = root.save(os.path.join(checkpoint_directory, "with_slots"))
+    new_root = util.Checkpoint()
     # Load the slot-containing checkpoint (deferred), then immediately overwrite
     # the non-slot variable (also deferred).
-    slot_status = util.CheckpointableSaver(
-        new_root).restore(slots_path)
-    no_slot_status = util.CheckpointableSaver(
-        new_root).restore(no_slots_path)
+    slot_status = new_root.restore(slots_path)
+    no_slot_status = new_root.restore(no_slots_path)
     with self.assertRaises(AssertionError):
       no_slot_status.assert_consumed()
     new_root.var = util.add_variable(
@@ -508,15 +504,14 @@ class CheckpointingTests(test.TestCase):
       with graph.as_default(), self.session(graph):
         checkpoint_directory = self.get_temp_dir()
         checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-        obj = tracking.Checkpointable()
+        obj = util.Checkpoint()
         obj.var = variable_scope.get_variable(name="v", initializer=0.)
         obj.opt = adam.AdamOptimizer(0.1)
         obj.opt.minimize(obj.var.read_value())
         self.evaluate(util.gather_initializers(obj))
-        saver = util.CheckpointableSaver(obj)
-        saver.save(checkpoint_prefix)
+        obj.save(checkpoint_prefix)
         before_ops = graph.get_operations()
-        saver.save(checkpoint_prefix)
+        obj.save(checkpoint_prefix)
         self.assertEqual(before_ops, graph.get_operations())
 
   def testManyRestoresGraph(self):
@@ -526,16 +521,15 @@ class CheckpointingTests(test.TestCase):
       with graph.as_default(), self.session(graph):
         checkpoint_directory = self.get_temp_dir()
         checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-        obj = tracking.Checkpointable()
+        obj = util.Checkpoint()
         obj.var = variable_scope.get_variable(name="v", initializer=0.)
         obj.opt = adam.AdamOptimizer(0.1)
         obj.opt.minimize(obj.var.read_value())
         self.evaluate(util.gather_initializers(obj))
-        saver = util.CheckpointableSaver(obj)
-        save_path = saver.save(checkpoint_prefix)
-        saver.restore(save_path)
+        save_path = obj.save(checkpoint_prefix)
+        obj.restore(save_path)
         before_ops = graph.get_operations()
-        saver.restore(save_path)
+        obj.restore(save_path)
         self.assertEqual(before_ops, graph.get_operations())
 
   def testMultipleGraphsNonSlotVariables(self):
@@ -548,11 +542,11 @@ class CheckpointingTests(test.TestCase):
       first_session = session_lib.Session(graph=first_graph)
       with first_graph.as_default(), first_session.as_default():
         first_variable = resource_variable_ops.ResourceVariable([1.])
-        first_root_checkpointable = util.Checkpoint(
+        first_root_trackable = util.Checkpoint(
             optimizer=optimizer, variable=first_variable)
         train_op = optimizer.minimize(first_variable.read_value)
         self.evaluate(util.gather_initializers(
-            first_root_checkpointable))
+            first_root_trackable))
         self.evaluate(train_op)
         self.evaluate(first_variable.assign([1.]))
         self.evaluate(optimizer.get_slot(
@@ -564,23 +558,23 @@ class CheckpointingTests(test.TestCase):
       second_graph = ops.Graph()
       with second_graph.as_default(), session_lib.Session(graph=second_graph):
         second_variable = resource_variable_ops.ResourceVariable([1.])
-        second_root_checkpointable = util.Checkpoint(
+        second_root_trackable = util.Checkpoint(
             optimizer=optimizer, variable=second_variable)
         train_op = optimizer.minimize(second_variable.read_value)
-        second_root_checkpointable.restore(None).initialize_or_restore()
+        second_root_trackable.restore(None).initialize_or_restore()
         self.evaluate(train_op)
         self.evaluate(second_variable.assign([4.]))
         self.evaluate(optimizer.get_slot(
             var=second_variable, name="m").assign([5.]))
         beta_1_power, _ = optimizer._get_beta_accumulators()
         self.evaluate(beta_1_power.assign(6.))
-        save_path = second_root_checkpointable.save(checkpoint_prefix)
+        save_path = second_root_trackable.save(checkpoint_prefix)
         self.evaluate(second_variable.assign([7.]))
         self.evaluate(optimizer.get_slot(
             var=second_variable, name="m").assign([8.]))
         beta_1_power, _ = optimizer._get_beta_accumulators()
         self.assertAllEqual(6., self.evaluate(beta_1_power))
-        status = second_root_checkpointable.restore(save_path)
+        status = second_root_trackable.restore(save_path)
         status.assert_consumed().run_restore_ops()
         self.assertAllEqual([4.], self.evaluate(second_variable))
         self.assertAllEqual([5.], self.evaluate(optimizer.get_slot(
@@ -600,7 +594,7 @@ class CheckpointingTests(test.TestCase):
 class TemplateTests(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
-  def test_checkpointable_save_restore(self):
+  def test_trackable_save_restore(self):
 
     def _templated():
       v = variable_scope.get_variable(
@@ -647,13 +641,13 @@ class CheckpointCompatibilityTests(test.TestCase):
     model = MyModel()
     optimizer = adam.AdamOptimizer(0.001)
     optimizer_step = training_util.get_or_create_global_step()
-    root_checkpointable = util.Checkpoint(
+    root_trackable = util.Checkpoint(
         optimizer=optimizer, model=model, optimizer_step=optimizer_step)
     train_op = optimizer.minimize(
         functools.partial(model, input_value),
         global_step=optimizer_step)
     self.evaluate(util.gather_initializers(
-        root_checkpointable))
+        root_trackable))
     self.evaluate(train_op)
     # A regular variable, a slot variable, and a non-slot Optimizer variable
     # with known values to check when loading.
@@ -662,24 +656,24 @@ class CheckpointCompatibilityTests(test.TestCase):
         var=model._named_dense.bias, name="m").assign([2.]))
     beta_1_power, _ = optimizer._get_beta_accumulators()
     self.evaluate(beta_1_power.assign(3.))
-    return root_checkpointable
+    return root_trackable
 
-  def _set_sentinels(self, root_checkpointable):
-    self.evaluate(root_checkpointable.model._named_dense.bias.assign([101.]))
+  def _set_sentinels(self, root_trackable):
+    self.evaluate(root_trackable.model._named_dense.bias.assign([101.]))
     self.evaluate(
-        root_checkpointable.optimizer.get_slot(
-            var=root_checkpointable.model._named_dense.bias, name="m")
+        root_trackable.optimizer.get_slot(
+            var=root_trackable.model._named_dense.bias, name="m")
         .assign([102.]))
-    beta_1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+    beta_1_power, _ = root_trackable.optimizer._get_beta_accumulators()
     self.evaluate(beta_1_power.assign(103.))
 
-  def _check_sentinels(self, root_checkpointable):
+  def _check_sentinels(self, root_trackable):
     self.assertAllEqual(
-        [1.], self.evaluate(root_checkpointable.model._named_dense.bias))
+        [1.], self.evaluate(root_trackable.model._named_dense.bias))
     self.assertAllEqual([2.], self.evaluate(
-        root_checkpointable.optimizer.get_slot(
-            var=root_checkpointable.model._named_dense.bias, name="m")))
-    beta_1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+        root_trackable.optimizer.get_slot(
+            var=root_trackable.model._named_dense.bias, name="m")))
+    beta_1_power, _ = root_trackable.optimizer._get_beta_accumulators()
     self.assertAllEqual(3., self.evaluate(beta_1_power))
 
   def _write_name_based_checkpoint(self):
@@ -704,7 +698,7 @@ class CheckpointCompatibilityTests(test.TestCase):
       self._set_sentinels(root)
       with self.assertRaises(AssertionError):
         self._check_sentinels(root)
-      object_saver = util.CheckpointableSaver(root)
+      object_saver = util.TrackableSaver(graph_view.ObjectGraphView(root))
       self._set_sentinels(root)
       status = object_saver.restore(save_path)
       if context.executing_eagerly():
diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2.py b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
index 7fb23abc38d9dc101204ed83808aebe5a8ef1e78..a7f978634ed45012144b2cc49ed069f6fca44f66 100644
--- a/tensorflow/contrib/optimizer_v2/optimizer_v2.py
+++ b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
@@ -24,7 +24,6 @@ import abc
 
 import six
 
-from tensorflow.python.distribute import distribute_lib
 from tensorflow.python.distribute import distribution_strategy_context as distribute_ctx
 from tensorflow.python.distribute import reduce_util as ds_reduce_util
 from tensorflow.python.eager import backprop
@@ -39,7 +38,7 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.training import optimizer as optimizer_v1
 from tensorflow.python.training import slot_creator
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import nest
 
 
@@ -224,7 +223,7 @@ class _OptimizerV2State(object):
       }
     self._slots = {}
     self._non_slot_dict = {}
-    # Extra state to help Optimizers implement Checkpointable. Holds information
+    # Extra state to help Optimizers implement Trackable. Holds information
     # about variables which will be restored as soon as they're created.
     self._deferred_dependencies = {}  # Non-slot variables
     self._deferred_slot_restorations = {}  # Slot variables
@@ -367,8 +366,8 @@ class _OptimizerV2State(object):
     slot variable needs to be restored).
 
     Args:
-      slot_variable_position: A `checkpointable._CheckpointPosition` object
-        indicating the slot variable `Checkpointable` object to be restored.
+      slot_variable_position: A `trackable._CheckpointPosition` object
+        indicating the slot variable `Trackable` object to be restored.
       slot_name: The name of this `Optimizer`'s slot to restore into.
       variable: The variable object this slot is being created for.
       optional_op_name: Name to use when scoping the Variable that needs to be
@@ -386,7 +385,7 @@ class _OptimizerV2State(object):
         # (aside from double initialization), and makes variable creator scopes
         # behave the same way they do when graph building.
         and not ops.get_default_graph()._variable_creator_stack):  # pylint: disable=protected-access
-      initializer = checkpointable.CheckpointInitialValue(
+      initializer = trackable.CheckpointInitialValue(
           checkpoint_position=slot_variable_position)
       slot_variable = self.create_slot(
           var=variable,
@@ -661,7 +660,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
                name=None,
                grad_loss=None,
                stop_gradients=None,
-               scale_loss_by_num_replicas=None):
+               scale_loss_by_num_replicas=False):
     """Add operations to minimize `loss` by updating `var_list`.
 
     This method simply combines calls `compute_gradients()` and
@@ -685,8 +684,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
       stop_gradients: Optional. A Tensor or list of tensors not to differentiate
         through.
       scale_loss_by_num_replicas: Optional boolean. If true, scale the loss down
-        by the number of replicas. By default, auto-detects whether this is
-        needed.
+        by the number of replicas. DEPRECATED and generally no longer needed.
 
     Returns:
       An Operation that updates the variables in `var_list`.  If `global_step`
@@ -732,7 +730,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
                         aggregation_method=None,
                         grad_loss=None,
                         stop_gradients=None,
-                        scale_loss_by_num_replicas=None):
+                        scale_loss_by_num_replicas=False):
     """Compute gradients of `loss` for the variables in `var_list`.
 
     This is the first part of `minimize()`.  It returns a list
@@ -756,8 +754,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
       stop_gradients: Optional. A Tensor or list of tensors not to differentiate
         through.
       scale_loss_by_num_replicas: Optional boolean. If true, scale the loss down
-        by the number of replicas. By default, auto-detects whether this is
-        needed.
+        by the number of replicas. DEPRECATED and generally no longer needed.
 
     Returns:
       A list of (gradient, variable) pairs. Variable is always present, but
@@ -781,9 +778,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
           tape.watch(var_list)
         loss_value = loss()
 
-        # Scale loss for number of replicas (callable-loss case). In this case,
-        # we have to be careful to call distribute_lib.get_loss_reduction()
-        # *after* loss() is evaluated, so we know what loss reduction it uses.
+        # Scale loss for number of replicas (callable-loss case).
         loss_value = self._scale_loss(loss_value, scale_loss_by_num_replicas)
 
       if var_list is None:
@@ -839,12 +834,8 @@ class OptimizerV2(optimizer_v1.Optimizer):
   @staticmethod
   def _scale_loss(loss_value, scale_loss_by_num_replicas):
     """Scale loss for the number of replicas."""
-    if scale_loss_by_num_replicas is None:
-      scale_loss_by_num_replicas = (
-          distribute_lib.get_loss_reduction() == ds_reduce_util.ReduceOp.MEAN)
     if scale_loss_by_num_replicas:
-      num_replicas = \
-        distribute_ctx.get_distribution_strategy().num_replicas_in_sync
+      num_replicas = distribute_ctx.get_strategy().num_replicas_in_sync
       if num_replicas > 1:
         loss_value *= 1. / num_replicas
     return loss_value
@@ -1268,10 +1259,10 @@ class OptimizerV2(optimizer_v1.Optimizer):
     return self._per_graph_state.get(var._graph_key, None)
 
   # --------------
-  # Overridden methods from Checkpointable.
+  # Overridden methods from Trackable.
   # --------------
 
-  def _track_checkpointable(self, *args, **kwargs):
+  def _track_trackable(self, *args, **kwargs):
     """Optimizers may not track dependencies. Raises an error."""
     raise NotImplementedError(
         "Optimizers may not have dependencies. File a feature request if this "
@@ -1279,7 +1270,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
 
   @property
   def _checkpoint_dependencies(self):
-    """From Checkpointable. Gather graph-specific non-slot variables to save."""
+    """From Trackable. Gather graph-specific non-slot variables to save."""
     current_graph_non_slot_variables = []
     state = self._get_per_graph_state()
     if state is not None:
@@ -1288,14 +1279,14 @@ class OptimizerV2(optimizer_v1.Optimizer):
           # Avoid comparing variables
           key=lambda item: item[0]):
         current_graph_non_slot_variables.append(
-            checkpointable.CheckpointableReference(
+            trackable.TrackableReference(
                 name=name, ref=variable_object))
     # Note: ignores super(); Optimizers may not have any dependencies outside of
     # state objects.
     return current_graph_non_slot_variables
 
   def _lookup_dependency(self, name):
-    """From Checkpointable. Find a non-slot variable in the current graph."""
+    """From Trackable. Find a non-slot variable in the current graph."""
     state = self._get_per_graph_state()
     if state is None:
       return None
@@ -1304,10 +1295,10 @@ class OptimizerV2(optimizer_v1.Optimizer):
 
   @property
   def _deferred_dependencies(self):
-    """Lets Checkpointable know where non-slot variables are created.
+    """Lets Trackable know where non-slot variables are created.
 
     If necessary, creates a new state object for the current default graph.
-    Checkpointable will then add entries to that state's deferred dependency
+    Trackable will then add entries to that state's deferred dependency
     dictionary. The state object will check that dictionary when creating
     non-slot variables, restoring their value if an entry is found.
 
@@ -1320,14 +1311,14 @@ class OptimizerV2(optimizer_v1.Optimizer):
 
   def _create_or_restore_slot_variable(self, slot_variable_position, slot_name,
                                        variable):
-    """Checkpointable: Restore a slot variable's value, possibly creating it.
+    """Trackable: Restore a slot variable's value, possibly creating it.
 
     Called when a variable which has an associated slot variable is created or
     restored.
 
     Args:
-      slot_variable_position: A `checkpointable._CheckpointPosition` object
-        indicating the slot variable `Checkpointable` object to be restored.
+      slot_variable_position: A `trackable._CheckpointPosition` object
+        indicating the slot variable `Trackable` object to be restored.
       slot_name: The name of this `Optimizer`'s slot to restore into.
       variable: The variable object this slot is being created for.
     """
diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2_test.py b/tensorflow/contrib/optimizer_v2/optimizer_v2_test.py
index dd7f2f44055a2e48e8a48d01c1da3a8e7513255d..2fc0b5ea4de2332ff3bf32f9a12a15eee566d5c4 100644
--- a/tensorflow/contrib/optimizer_v2/optimizer_v2_test.py
+++ b/tensorflow/contrib/optimizer_v2/optimizer_v2_test.py
@@ -26,7 +26,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
-from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import gradients_util
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
@@ -71,7 +71,7 @@ class OptimizerTest(test.TestCase):
         opt_op = sgd_op.minimize(
             cost,
             global_step, [var0, var1],
-            aggregation_method=gradients_impl.AggregationMethod.
+            aggregation_method=gradients_util.AggregationMethod.
             EXPERIMENTAL_ACCUMULATE_N)
 
         variables.global_variables_initializer().run()
diff --git a/tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test_base.py b/tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test_base.py
index 17b69c7b35dce130c45ab0aadb28be330b4bfb88..c8524e9871864e0b4fffbd549d1fe347714f072a 100644
--- a/tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test_base.py
+++ b/tensorflow/contrib/proto/python/kernel_tests/decode_proto_op_test_base.py
@@ -84,7 +84,10 @@ class DecodeProtoOpTestBase(test_base.ProtoOpTestBase, parameterized.TestCase):
       values = field_dict[field.name]
       self.assertEqual(dtypes.as_dtype(values.dtype), field.dtype)
 
-      fd = field.value.DESCRIPTOR.fields_by_name[field.name]
+      if 'ext_value' in field.name:
+        fd = test_example_pb2.PrimitiveValue()
+      else:
+        fd = field.value.DESCRIPTOR.fields_by_name[field.name]
 
       # Values has the same shape as the input plus an extra
       # dimension for repeats.
@@ -92,13 +95,16 @@ class DecodeProtoOpTestBase(test_base.ProtoOpTestBase, parameterized.TestCase):
 
       # Nested messages are represented as TF strings, requiring
       # some special handling.
-      if field.name == 'message_value':
+      if field.name == 'message_value' or 'ext_value' in field.name:
         vs = []
         for buf in values.flat:
           msg = test_example_pb2.PrimitiveValue()
           msg.ParseFromString(buf)
           vs.append(msg)
-        evs = getattr(field.value, field.name)
+        if 'ext_value' in field.name:
+          evs = field.value.Extensions[test_example_pb2.ext_value]
+        else:
+          evs = getattr(field.value, field.name)
         if len(vs) != len(evs):
           self.fail('Field %s decoded %d outputs, expected %d' %
                     (fd.name, len(vs), len(evs)))
@@ -223,7 +229,8 @@ class DecodeProtoOpTestBase(test_base.ProtoOpTestBase, parameterized.TestCase):
         sanitize=False,
         force_disordered=True)
 
-  @parameterized.named_parameters(*test_base.ProtoOpTestBase.named_parameters())
+  @parameterized.named_parameters(
+      *test_base.ProtoOpTestBase.named_parameters(extension=False))
   def testPacked(self, case):
     # Now try with the packed serialization.
     #
@@ -235,8 +242,7 @@ class DecodeProtoOpTestBase(test_base.ProtoOpTestBase, parameterized.TestCase):
         # Note: float_format='.17g' is necessary to ensure preservation of
         # doubles and floats in text format.
         text_format.Parse(
-            text_format.MessageToString(
-                value, float_format='.17g'),
+            text_format.MessageToString(value, float_format='.17g'),
             test_example_pb2.PackedTestValue()).SerializeToString()
         for value in case.values
     ]
diff --git a/tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test_base.py b/tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test_base.py
index 01b3ccc7fd3918c4ff910281289e31177e5a8097..5ec681ff55dbd18580761bb23e7017cfc9767b89 100644
--- a/tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test_base.py
+++ b/tensorflow/contrib/proto/python/kernel_tests/encode_proto_op_test_base.py
@@ -15,9 +15,6 @@
 # =============================================================================
 """Table-driven test for encode_proto op.
 
-This test is run once with each of the *.TestCase.pbtxt files
-in the test directory.
-
 It tests that encode_proto is a lossless inverse of decode_proto
 (for the specified fields).
 """
@@ -145,7 +142,8 @@ class EncodeProtoOpTestBase(test_base.ProtoOpTestBase, parameterized.TestCase):
         # loss of packing in the encoding).
         self.assertEqual(in_buf, out_buf)
 
-  @parameterized.named_parameters(*test_base.ProtoOpTestBase.named_parameters())
+  @parameterized.named_parameters(
+      *test_base.ProtoOpTestBase.named_parameters(extension=False))
   def testRoundtrip(self, case):
     in_bufs = [value.SerializeToString() for value in case.values]
 
@@ -154,7 +152,8 @@ class EncodeProtoOpTestBase(test_base.ProtoOpTestBase, parameterized.TestCase):
     return self._testRoundtrip(
         in_bufs, 'tensorflow.contrib.proto.TestValue', case.fields)
 
-  @parameterized.named_parameters(*test_base.ProtoOpTestBase.named_parameters())
+  @parameterized.named_parameters(
+      *test_base.ProtoOpTestBase.named_parameters(extension=False))
   def testRoundtripPacked(self, case):
     # Now try with the packed serialization.
     # We test the packed representations by loading the same test cases using
diff --git a/tensorflow/contrib/proto/python/kernel_tests/proto_op_test_base.py b/tensorflow/contrib/proto/python/kernel_tests/proto_op_test_base.py
index 2950c7dfdc59a11ba7d2c07d8406bd4af26b5bd9..1a636486a1765ad9544b5cb5e52961cc47f92950 100644
--- a/tensorflow/contrib/proto/python/kernel_tests/proto_op_test_base.py
+++ b/tensorflow/contrib/proto/python/kernel_tests/proto_op_test_base.py
@@ -38,17 +38,18 @@ class ProtoOpTestBase(test.TestCase):
       ct.cdll.LoadLibrary(lib)
 
   @staticmethod
-  def named_parameters():
-    return (
-        ("defaults", ProtoOpTestBase.defaults_test_case()),
-        ("minmax", ProtoOpTestBase.minmax_test_case()),
-        ("nested", ProtoOpTestBase.nested_test_case()),
-        ("optional", ProtoOpTestBase.optional_test_case()),
-        ("promote", ProtoOpTestBase.promote_test_case()),
-        ("ragged", ProtoOpTestBase.ragged_test_case()),
-        ("shaped_batch", ProtoOpTestBase.shaped_batch_test_case()),
-        ("simple", ProtoOpTestBase.simple_test_case()),
-    )
+  def named_parameters(extension=True):
+    parameters = [("defaults", ProtoOpTestBase.defaults_test_case()),
+                  ("minmax", ProtoOpTestBase.minmax_test_case()),
+                  ("nested", ProtoOpTestBase.nested_test_case()),
+                  ("optional", ProtoOpTestBase.optional_test_case()),
+                  ("promote", ProtoOpTestBase.promote_test_case()),
+                  ("ragged", ProtoOpTestBase.ragged_test_case()),
+                  ("shaped_batch", ProtoOpTestBase.shaped_batch_test_case()),
+                  ("simple", ProtoOpTestBase.simple_test_case())]
+    if extension:
+      parameters.append(("extension", ProtoOpTestBase.extension_test_case()))
+    return parameters
 
   @staticmethod
   def defaults_test_case():
@@ -399,6 +400,21 @@ class ProtoOpTestBase(test.TestCase):
     field.value.bool_value.append(True)
     return test_case
 
+  @staticmethod
+  def extension_test_case():
+    test_case = test_example_pb2.TestCase()
+    value = test_case.values.add()
+    message_value = value.Extensions[test_example_pb2.ext_value].add()
+    message_value.double_value = 23.5
+    test_case.shapes.append(1)
+    test_case.sizes.append(1)
+    field = test_case.fields.add()
+    field.name = test_example_pb2.ext_value.full_name
+    field.dtype = types_pb2.DT_STRING
+    message_value = field.value.Extensions[test_example_pb2.ext_value].add()
+    message_value.double_value = 23.5
+    return test_case
+
   @staticmethod
   def simple_test_case():
     test_case = test_example_pb2.TestCase()
diff --git a/tensorflow/contrib/proto/python/kernel_tests/test_example.proto b/tensorflow/contrib/proto/python/kernel_tests/test_example.proto
index 674d881220a1113631def47c5111e3ef401b99f3..b1ce66de4feb9c6666ca9ccf39403b4e12840fcf 100644
--- a/tensorflow/contrib/proto/python/kernel_tests/test_example.proto
+++ b/tensorflow/contrib/proto/python/kernel_tests/test_example.proto
@@ -61,6 +61,8 @@ message TestValue {
   optional sfixed64 sfixed64_value_with_default = 32 [default = 11];
   optional sint32 sint32_value_with_default = 33 [default = 12];
   optional sint64 sint64_value_with_default = 34 [default = 13];
+
+  extensions 100 to 199;
 }
 
 // A PackedTestValue looks exactly the same as a TestValue in the text format,
@@ -68,7 +70,7 @@ message TestValue {
 // by loading the same test cases using this definition instead of TestValue.
 //
 // NOTE: This definition must be kept in sync with TestValue in every way except
-// the packed=true declaration.
+// the packed=true declaration and the lack of extensions.
 message PackedTestValue {
   repeated double double_value = 1 [packed = true];
   repeated float float_value = 2 [packed = true];
@@ -132,6 +134,10 @@ message ExtraFields {
   optional bool bool_value = 1777;
 }
 
+extend TestValue {
+  repeated PrimitiveValue ext_value = 100;
+}
+
 // The messages below are for yet-to-be created tests.
 
 message EnumValue {
diff --git a/tensorflow/contrib/quantize/BUILD b/tensorflow/contrib/quantize/BUILD
index b35c4fde1a2c704880e023a0c3ac1e0766493514..b67e68ea96a15f94e62050c92405eec4fe4be70f 100644
--- a/tensorflow/contrib/quantize/BUILD
+++ b/tensorflow/contrib/quantize/BUILD
@@ -202,8 +202,9 @@ py_test(
 
 py_test(
     name = "quantize_parameterized_test",
-    size = "large",
+    size = "medium",
     srcs = ["python/quantize_parameterized_test.py"],
+    shard_count = 4,
     srcs_version = "PY2AND3",
     # TODO(b/118839526): Re-enable msan test.
     tags = [
diff --git a/tensorflow/contrib/quantize/README.md b/tensorflow/contrib/quantize/README.md
index 9085d9fa719520ac84ef6f8e07d7fa335bef5605..b335e1af69b7b2e6020f8e745c43bb1bdc95a62d 100644
--- a/tensorflow/contrib/quantize/README.md
+++ b/tensorflow/contrib/quantize/README.md
@@ -8,9 +8,9 @@ for both training and inference. There are two aspects to this:
 
 For efficient inference, TensorFlow combines batch normalization with the preceding
 convolutional and fully-connected layers prior to quantization by
-[folding batch norm layers](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/quantize/python/fold_batch_norms.py){:.external}. 
+[folding batch norm layers](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/quantize/python/fold_batch_norms.py){:.external}.
 
-The quantization error is modeled using [fake quantization](../api_guides/python/array_ops.md#Fake_quantization)
+The quantization error is modeled using [fake quantization](../../api_guides/python/array_ops.md#Fake_quantization)
 nodes to simulate the effect of quantization in the forward and backward passes. The
 forward-pass models quantization, while the backward-pass models quantization as a
 straight-through estimator. Both the forward- and backward-pass simulate the quantization
@@ -105,12 +105,12 @@ toco \
   --std_value=127.5 --mean_value=127.5
 ```
 
-See the documentation for `tf.contrib.quantize` and [TensorFlow Lite](../lite/).
+See the documentation for `tf.contrib.quantize` and [TensorFlow Lite](../../lite/).
 
 
 ## Quantized accuracy results
 
-The following are results of trainiing some popular CNN models (Mobilenet-v1,
+The following are results of training some popular CNN models (Mobilenet-v1,
 Mobilenet-v2, and Inception-v3) using this tool:
 
 <figure>
diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms.py b/tensorflow/contrib/quantize/python/fold_batch_norms.py
index e0c6da00d86fe4c5f881bcab7b444182da092b8f..a70f748fad60c6467946225ad5035caaf89c2aaf 100644
--- a/tensorflow/contrib/quantize/python/fold_batch_norms.py
+++ b/tensorflow/contrib/quantize/python/fold_batch_norms.py
@@ -454,7 +454,7 @@ def _CloneWithNewOperands(layer_op, input_tensor, weight_tensor,
         strides=layer_op.get_attr('strides'),
         padding=layer_op.get_attr('padding'),
         use_cudnn_on_gpu=layer_op.get_attr('use_cudnn_on_gpu'),
-        data_format=layer_op.get_attr('data_format'),
+        data_format=layer_op.get_attr('data_format').decode(),
         name=new_layer_name)
   elif layer_op.type == 'MatMul':
     return math_ops.matmul(
@@ -867,7 +867,7 @@ class _OpCloner(object):
         strides=op.get_attr('strides'),
         padding=op.get_attr('padding'),
         use_cudnn_on_gpu=op.get_attr('use_cudnn_on_gpu'),
-        data_format=op.get_attr('data_format'),
+        data_format=op.get_attr('data_format').decode(),
         name=new_name).op
 
   def _CloneDepthwiseConv2d(self, op, inputs, new_name):
diff --git a/tensorflow/contrib/quantize/python/quant_ops.py b/tensorflow/contrib/quantize/python/quant_ops.py
index 8619708cdaecd78bcc7de0e8e0cbf2baa11bf6a2..39082cacf9770619cf5fb529ac9a0aad6e955c6d 100644
--- a/tensorflow/contrib/quantize/python/quant_ops.py
+++ b/tensorflow/contrib/quantize/python/quant_ops.py
@@ -224,8 +224,8 @@ def MovingAvgQuantize(inputs,
       None, default_name=name_prefix, values=[inputs], reuse=reuse) as scope:
     scope.set_partitioner(None)
     input_shape = inputs.get_shape()
-    input_dim = len(input_shape)
     if per_channel:
+      input_dim = len(input_shape)
       # Only support quantizing 1-, 2- and 4-dimensional tensors.
       assert input_dim in [1, 2, 4], ('Expected 1D, 2D or 4D input, was: %s in '
                                       ' scope: %s' % (input_shape, name_prefix))
diff --git a/tensorflow/contrib/quantize/python/quant_ops_test.py b/tensorflow/contrib/quantize/python/quant_ops_test.py
index 36d2af94e059cdc75b758bbf607d26c4e1ee73e9..c636c90d23a0f5a6de9d14085c824283cb41f6ca 100644
--- a/tensorflow/contrib/quantize/python/quant_ops_test.py
+++ b/tensorflow/contrib/quantize/python/quant_ops_test.py
@@ -63,6 +63,12 @@ class QuantOpsTest(googletest.TestCase):
     self.assertAlmostEqual(min_value, -0.5, delta=1e-3)
     self.assertAlmostEqual(max_value, 0.5, delta=1e-3)
 
+  def testMovingAvgQuantizeTrainingAssignNoShape(self):
+    min_value, max_value = self._GetMinMaxValues(
+        quant_ops.MovingAvgQuantize, [[-1, 1], [0, 0]], shape=None)
+    self.assertAlmostEqual(min_value, -0.5, delta=1e-3)
+    self.assertAlmostEqual(max_value, 0.5, delta=1e-3)
+
   def testMovingAvgSymmetricQuantizeTrainingAssign(self):
     min_value, max_value = self._GetMinMaxValues(
         quant_ops.MovingAvgQuantize, [[-1, 0.5], [0, 0]], symmetric=True)
@@ -109,10 +115,10 @@ class QuantOpsTest(googletest.TestCase):
             is_training=True,
             vars_collection=_MIN_MAX_VARS)
 
-  def _GetMinMaxValues(self, quantize_fn, input_values, **kwds):
+  def _GetMinMaxValues(self, quantize_fn, input_values, shape=(2), **kwds):
     g = ops.Graph()
     with session.Session(graph=g) as sess:
-      x = array_ops.placeholder(dtypes.float32, shape=[2])
+      x = array_ops.placeholder(dtypes.float32, shape=shape)
       y = quantize_fn(
           x,
           init_min=0.0,
diff --git a/tensorflow/contrib/quantize/python/quantize.py b/tensorflow/contrib/quantize/python/quantize.py
index 21d1b1213090273b5abd8e012f8711db98c94347..7c973fe597181b822e617db1f85a08f1b678e26f 100644
--- a/tensorflow/contrib/quantize/python/quantize.py
+++ b/tensorflow/contrib/quantize/python/quantize.py
@@ -685,7 +685,7 @@ def _InsertQuantOp(context,
       [1; 2^bits - 1] or wide range [0; 2^bits - 1].
     producer_scope: The restriction of producer scope. If not None, the new op
       will be inserted only when the producer is in this scope.
-    consumer_scope: The restriction of producer scope. If not None, the new op
+    consumer_scope: The restriction of consumer scope. If not None, the new op
       will be inserted only when all the consumers are in this scope.
   Raises:
     ValueError: When producer operation is not directly connected to the
diff --git a/tensorflow/contrib/receptive_field/README.md b/tensorflow/contrib/receptive_field/README.md
index 79b015a9163f5727caa40b54579c71e57621c92f..d1c41e4c0a11028765c9fc0dc345cb29453baa31 100644
--- a/tensorflow/contrib/receptive_field/README.md
+++ b/tensorflow/contrib/receptive_field/README.md
@@ -185,5 +185,4 @@ Effective padding (vertical) = 1482
 
 ## Authors
 
-Andr&eacute; Araujo (github id: andrefaraujo) and Mark Sandler (github id:
-marksandler)
+Andr&eacute; Araujo (@andrefaraujo) and Mark Sandler (@marksandler)
diff --git a/tensorflow/contrib/receptive_field/python/util/examples/compute_rf.py b/tensorflow/contrib/receptive_field/python/util/examples/compute_rf.py
index d6fdd12bbe37fb0e0cb12f1d0adc3fce29b19e8a..72f98ccc32e945b48b5f1b570bcca323a5b5f48a 100644
--- a/tensorflow/contrib/receptive_field/python/util/examples/compute_rf.py
+++ b/tensorflow/contrib/receptive_field/python/util/examples/compute_rf.py
@@ -12,10 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Computes Receptive Field (RF) information given a graph protobuf.
-
-For an example of usage, see accompanying file compute_rf.sh
-"""
+"""Computes Receptive Field (RF) information given a graph protobuf."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/receptive_field/python/util/examples/rf_benchmark.py b/tensorflow/contrib/receptive_field/python/util/examples/rf_benchmark.py
index a298b4d49038468299b58140758c69675368e855..325929a5937ac60a6134fae064e7633a4c57473d 100644
--- a/tensorflow/contrib/receptive_field/python/util/examples/rf_benchmark.py
+++ b/tensorflow/contrib/receptive_field/python/util/examples/rf_benchmark.py
@@ -16,8 +16,6 @@
 
 The receptive field (and related parameters) for the different models are
 printed to stdout, and may also optionally be written to a CSV file.
-
-For an example of usage, see rf_benchmark.sh
 """
 
 from __future__ import absolute_import
@@ -262,11 +260,11 @@ def _model_rf(graphdef,
       information will be computed.
     model_type: Type of model to be used, used only for printing purposes.
     csv_writer: A CSV writer for RF parameters, which is used if it is not None.
-    input_resolution: Input resolution to use when computing RF
-      parameters. This is important for the case where padding can only be
-      defined if the input resolution is known, which may happen if using SAME
-      padding. This is assumed the resolution for both height and width. If
-      None, we consider the resolution is unknown.
+    input_resolution: Input resolution to use when computing RF parameters. This
+      is important for the case where padding can only be defined if the input
+      resolution is known, which may happen if using SAME padding. This is
+      assumed the resolution for both height and width. If None, we consider the
+      resolution is unknown.
   """
   for desired_end_point_key in desired_end_point_keys:
     print('- %s:' % desired_end_point_key)
@@ -283,10 +281,10 @@ def _model_rf(graphdef,
       if (receptive_field_x == receptive_field_y) and (
           effective_stride_x == effective_stride_y) and (
               effective_padding_x == effective_padding_y):
-        print('Receptive field size = %5s, effective stride = %5s, effective '
-              'padding = %5s' % (str(receptive_field_x),
-                                 str(effective_stride_x),
-                                 str(effective_padding_x)))
+        print(
+            'Receptive field size = %5s, effective stride = %5s, effective '
+            'padding = %5s' % (str(receptive_field_x), str(effective_stride_x),
+                               str(effective_padding_x)))
       else:
         print('Receptive field size: horizontal = %5s, vertical = %5s. '
               'Effective stride: horizontal = %5s, vertical = %5s. Effective '
@@ -362,9 +360,8 @@ def _process_model_rf(model_type='resnet_v1_50',
       defined if the input resolution is known, which may happen if using SAME
       padding. The entries in the list are assumed the resolution for both
       height and width. If one of the elements in the list is None, we consider
-      it to mean that the resolution is unknown. If the list itself is None,
-      we use the default list [None, 224, 321].
-
+      it to mean that the resolution is unknown. If the list itself is None, we
+      use the default list [None, 224, 321].
   """
   # Process default value for this list.
   if input_resolutions is None:
@@ -477,8 +474,8 @@ def _mobilenet_v1_rf(csv_writer=None):
     csv_writer: A CSV writer for RF parameters, which is used if it is not None.
   """
   for model_type in _SUPPORTED_MOBILENETV1_VARIANTS:
-    with slim.arg_scope(
-        [slim.batch_norm, slim.dropout], is_training=False) as arg_sc:
+    with slim.arg_scope([slim.batch_norm, slim.dropout],
+                        is_training=False) as arg_sc:
       _process_model_rf(model_type, csv_writer, arg_sc)
 
 
diff --git a/tensorflow/contrib/receptive_field/python/util/parse_layer_parameters.py b/tensorflow/contrib/receptive_field/python/util/parse_layer_parameters.py
index 0e3c46f17d2e2a277418d39e31927db73a509670..92ae1021bc8f8fbf19ca7f7cbe208ecea18128e8 100644
--- a/tensorflow/contrib/receptive_field/python/util/parse_layer_parameters.py
+++ b/tensorflow/contrib/receptive_field/python/util/parse_layer_parameters.py
@@ -27,7 +27,8 @@ from tensorflow.python.platform import tf_logging as logging
 _UNCHANGED_RF_LAYER_OPS = [
     "Add", "BiasAdd", "Cast", "Ceil", "ConcatV2", "Const", "Floor",
     "FusedBatchNorm", "Identity", "Log", "Mul", "Pow", "RealDiv", "Relu",
-    "Relu6", "Round", "Rsqrt", "Softplus", "Sub", "VariableV2", "LRN"
+    "Relu6", "Round", "Rsqrt", "Softplus", "Sub", "VariableV2", "LRN",
+    "GreaterEqual"
 ]
 
 # Different ways in which padding modes may be spelled.
@@ -276,11 +277,11 @@ def get_layer_params(node, name_to_node, input_resolution=None, force=False):
     kernel_size_x, kernel_size_y = _conv_kernel_size(node, name_to_node)
     # Compute the padding for this node separately for each direction.
     total_padding_x, padding_x = _padding_size_conv_pool(
-        node, kernel_size_x, stride_x, input_resolution[1]
-        if input_resolution is not None else None)
+        node, kernel_size_x, stride_x,
+        input_resolution[1] if input_resolution is not None else None)
     total_padding_y, padding_y = _padding_size_conv_pool(
-        node, kernel_size_y, stride_y, input_resolution[0]
-        if input_resolution is not None else None)
+        node, kernel_size_y, stride_y,
+        input_resolution[0] if input_resolution is not None else None)
   elif node.op == "Pad":
     # Kernel and stride are simply 1 in this case.
     kernel_size_x = 1
@@ -294,11 +295,11 @@ def get_layer_params(node, name_to_node, input_resolution=None, force=False):
     kernel_size_x, kernel_size_y = _pool_kernel_size(node, name_to_node)
     # Compute the padding for this node separately for each direction.
     total_padding_x, padding_x = _padding_size_conv_pool(
-        node, kernel_size_x, stride_x, input_resolution[1]
-        if input_resolution is not None else None)
+        node, kernel_size_x, stride_x,
+        input_resolution[1] if input_resolution is not None else None)
     total_padding_y, padding_y = _padding_size_conv_pool(
-        node, kernel_size_y, stride_y, input_resolution[0]
-        if input_resolution is not None else None)
+        node, kernel_size_y, stride_y,
+        input_resolution[0] if input_resolution is not None else None)
   elif node.op in _UNCHANGED_RF_LAYER_OPS:
     # These nodes do not modify the RF parameters.
     kernel_size_x = 1
@@ -320,7 +321,7 @@ def get_layer_params(node, name_to_node, input_resolution=None, force=False):
       total_padding_y = None
       padding_y = None
     else:
-      raise ValueError("Unknown layer for operation '%s': %s" % (node.name,
-                                                                 node.op))
+      raise ValueError(
+          "Unknown layer for operation '%s': %s" % (node.name, node.op))
   return (kernel_size_x, kernel_size_y, stride_x, stride_y, padding_x,
           padding_y, total_padding_x, total_padding_y)
diff --git a/tensorflow/contrib/receptive_field/python/util/receptive_field.py b/tensorflow/contrib/receptive_field/python/util/receptive_field.py
index b9bd2f09761ab10a62d37e8e2580b93b9b8a4453..9127c772c75279d9c8eacc5a17680beba9247d01 100644
--- a/tensorflow/contrib/receptive_field/python/util/receptive_field.py
+++ b/tensorflow/contrib/receptive_field/python/util/receptive_field.py
@@ -12,12 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Functions to compute receptive field of a fully-convolutional network.
-
-Please refer to the following g3doc for detailed explanation on how this
-computation is performed, and why it is important:
-g3doc/photos/vision/features/delf/g3doc/rf_computation.md
-"""
+"""Functions to compute receptive field of a fully-convolutional network."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -96,8 +91,8 @@ class ReceptiveField(object):
     Args:
       y: An array of feature coordinates with shape `(..., d)`, where `d` is the
         number of dimensions of the coordinates.
-      axis: The dimensions for which to compute the input center coordinates.
-        If `None` (the default), compute the input center coordinates for all
+      axis: The dimensions for which to compute the input center coordinates. If
+        `None` (the default), compute the input center coordinates for all
         dimensions.
 
     Returns:
@@ -127,8 +122,8 @@ class ReceptiveField(object):
     Args:
       x: An array of input center coordinates with shape `(..., d)`, where `d`
         is the number of dimensions of the coordinates.
-      axis: The dimensions for which to compute the feature coordinates.
-        If `None` (the default), compute the feature coordinates for all
+      axis: The dimensions for which to compute the feature coordinates. If
+        `None` (the default), compute the feature coordinates for all
         dimensions.
 
     Returns:
@@ -274,14 +269,15 @@ def compute_receptive_field_from_graph_def(graph_def,
         continue
 
       # Get params for this layer.
-      (kernel_size_x, kernel_size_y, stride_x, stride_y, padding_x,
-       padding_y, _, _) = parse_layer_parameters.get_layer_params(
+      (kernel_size_x, kernel_size_y, stride_x, stride_y, padding_x, padding_y,
+       _, _) = parse_layer_parameters.get_layer_params(
            node, name_to_node, node_info[node.name].input_size)
-      logging.vlog(3, "kernel_size_x = %s, kernel_size_y = %s, "
-                   "stride_x = %s, stride_y = %s, "
-                   "padding_x = %s, padding_y = %s, input size = %s" %
-                   (kernel_size_x, kernel_size_y, stride_x, stride_y, padding_x,
-                    padding_y, node_info[node.name].input_size))
+      logging.vlog(
+          3, "kernel_size_x = %s, kernel_size_y = %s, "
+          "stride_x = %s, stride_y = %s, "
+          "padding_x = %s, padding_y = %s, input size = %s" %
+          (kernel_size_x, kernel_size_y, stride_x, stride_y, padding_x,
+           padding_y, node_info[node.name].input_size))
       if padding_x is None or padding_y is None:
         undefined_padding = True
 
@@ -352,15 +348,15 @@ def compute_receptive_field_from_graph_def(graph_def,
               raise ValueError(
                   "Graph is not aligned since effective stride from different "
                   "paths is different in vertical direction")
-            if (rf_sizes_x[inp_name] - 1
-               ) / 2 - effective_paddings_x[inp_name] != (
-                   rf_size_input_x - 1) / 2 - effective_padding_input_x:
+            if (rf_sizes_x[inp_name] -
+                1) / 2 - effective_paddings_x[inp_name] != (
+                    rf_size_input_x - 1) / 2 - effective_padding_input_x:
               raise ValueError(
                   "Graph is not aligned since center shift from different "
                   "paths is different in horizontal direction")
-            if (rf_sizes_y[inp_name] - 1
-               ) / 2 - effective_paddings_y[inp_name] != (
-                   rf_size_input_y - 1) / 2 - effective_padding_input_y:
+            if (rf_sizes_y[inp_name] -
+                1) / 2 - effective_paddings_y[inp_name] != (
+                    rf_size_input_y - 1) / 2 - effective_padding_input_y:
               raise ValueError(
                   "Graph is not aligned since center shift from different "
                   "paths is different in vertical direction")
diff --git a/tensorflow/contrib/remote_fused_graph/pylib/python/ops/remote_fused_graph_ops.py b/tensorflow/contrib/remote_fused_graph/pylib/python/ops/remote_fused_graph_ops.py
index 2054367f0d1461c8868e3332d82322a8a3dd38af..7e79785d2867de586f0730373d4864602ef770ae 100644
--- a/tensorflow/contrib/remote_fused_graph/pylib/python/ops/remote_fused_graph_ops.py
+++ b/tensorflow/contrib/remote_fused_graph/pylib/python/ops/remote_fused_graph_ops.py
@@ -50,13 +50,13 @@ def remote_fused_graph_execute(inputs,
   if default_graph_input_tensor_type_shapes:
     for type_shape in default_graph_input_tensor_type_shapes:
       type_shape_proto = info_proto.default_graph_input_tensor_shape.add()
-      type_shape_proto.dtype = int(dtypes.as_dtype(type_shape[0]))
+      type_shape_proto.dtype = dtypes.as_dtype(type_shape[0]).as_datatype_enum
       for dim in type_shape[1]:
         type_shape_proto.shape.dim.add().size = dim
   if default_graph_output_tensor_type_shapes:
     for type_shape in default_graph_output_tensor_type_shapes:
       type_shape_proto = info_proto.default_graph_output_tensor_shape.add()
-      type_shape_proto.dtype = int(dtypes.as_dtype(type_shape[0]))
+      type_shape_proto.dtype = dtypes.as_dtype(type_shape[0]).as_datatype_enum
       for dim in type_shape[1]:
         type_shape_proto.shape.dim.add().size = dim
 
diff --git a/tensorflow/contrib/resampler/xla/resampler_ops_xla_test.py b/tensorflow/contrib/resampler/xla/resampler_ops_xla_test.py
index d8ca0eab276b39f025d018edebb78eed7a8433bb..cec4c3c23305034d167a248a637425507750064e 100644
--- a/tensorflow/contrib/resampler/xla/resampler_ops_xla_test.py
+++ b/tensorflow/contrib/resampler/xla/resampler_ops_xla_test.py
@@ -164,6 +164,15 @@ class ResamplerOpsTest(xla_test.XLATestCase):
       expected = [[[0.0], [27.62]]]
       self._assertForwardOpMatchesExpected(input_np, warp_np, expected)
 
+      expected_grad_data = [[[[0.12], [0.27999997]], [[0.18000001],
+                                                      [0.42000002]]]]
+      expected_grad_warp = [[[0., 0.], [22.60000038, 35.20000076]]]
+
+      grad_output = np.ones([1, 2, 1], dtype=dtype)
+      self._assertBackwardOpMatchesExpected(input_np, warp_np, grad_output,
+                                            expected_grad_data,
+                                            expected_grad_warp)
+
     # One of (x, y) is less than 0.
     for dtype in self.float_types:
       input_shape = [1, 2, 2, 1]
@@ -171,11 +180,21 @@ class ResamplerOpsTest(xla_test.XLATestCase):
       input_np = np.array(input_data, dtype=dtype).reshape(input_shape)
 
       warp_shape = [1, 2, 2]
+      # -1 is out of bound for grad_warp.
       warp_data = [-1, 0.1, 0.7, 0.6]
       warp_np = np.array(warp_data, dtype=dtype).reshape(warp_shape)
       expected = [[[0.0], [27.62]]]
       self._assertForwardOpMatchesExpected(input_np, warp_np, expected)
 
+      expected_grad_data = [[[[0.12], [0.27999997]], [[0.18000001],
+                                                      [0.42000002]]]]
+      expected_grad_warp = [[[0., 0.], [22.60000038, 35.20000076]]]
+
+      grad_output = np.ones([1, 2, 1], dtype=dtype)
+      self._assertBackwardOpMatchesExpected(input_np, warp_np, grad_output,
+                                            expected_grad_data,
+                                            expected_grad_warp)
+
     # Both of (x, y) are greater than image size.
     for dtype in self.float_types:
       input_shape = [1, 2, 2, 1]
@@ -183,11 +202,20 @@ class ResamplerOpsTest(xla_test.XLATestCase):
       input_np = np.array(input_data, dtype=dtype).reshape(input_shape)
 
       warp_shape = [1, 2, 2]
+      # -0.1 is *inbound* for grad_warp and grad_data, 2.1 is out of bound.
       warp_data = [-0.1, 0.1, 1.2, 2.1]
       warp_np = np.array(warp_data, dtype=dtype).reshape(warp_shape)
       expected = [[[0.0], [0.0]]]
       self._assertForwardOpMatchesExpected(input_np, warp_np, expected)
 
+      expected_grad_data = [[[[0.81], [0.0]], [[0.09], [0.0]]]]
+      expected_grad_warp = [[[10.30, 2.7], [0.0, 0.0]]]
+
+      grad_output = np.ones([1, 2, 1], dtype=dtype)
+      self._assertBackwardOpMatchesExpected(input_np, warp_np, grad_output,
+                                            expected_grad_data,
+                                            expected_grad_warp)
+
     # One of (x, y) is greater than image size.
     for dtype in self.float_types:
       input_shape = [1, 2, 2, 1]
@@ -200,6 +228,14 @@ class ResamplerOpsTest(xla_test.XLATestCase):
       expected = [[[0.0], [0.0]]]
       self._assertForwardOpMatchesExpected(input_np, warp_np, expected)
 
+      expected_grad_data = [[[[0.81], [0.81]], [[0.0], [0.08]]]]
+      expected_grad_warp = [[[-4.5, 9.5], [-9.9, 39.20]]]
+
+      grad_output = np.ones([1, 2, 1], dtype=dtype)
+      self._assertBackwardOpMatchesExpected(input_np, warp_np, grad_output,
+                                            expected_grad_data,
+                                            expected_grad_warp)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/rnn/BUILD b/tensorflow/contrib/rnn/BUILD
index e124867415f94fb5052f34f50363ea718d71053b..24fa740d24502a28cb42c994715d09180ee99899 100644
--- a/tensorflow/contrib/rnn/BUILD
+++ b/tensorflow/contrib/rnn/BUILD
@@ -102,25 +102,6 @@ cuda_py_tests(
     xla_enabled = True,
 )
 
-cuda_py_tests(
-    name = "core_rnn_cell_test",
-    size = "medium",
-    srcs = ["python/kernel_tests/core_rnn_cell_test.py"],
-    additional_deps = [
-        ":rnn_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:init_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:rnn",
-        "//tensorflow/python:rnn_cell",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
-    ],
-)
-
 cuda_py_tests(
     name = "rnn_test",
     size = "medium",
@@ -143,32 +124,6 @@ cuda_py_tests(
     ],
 )
 
-cuda_py_tests(
-    name = "core_rnn_test",
-    size = "medium",
-    srcs = ["python/kernel_tests/core_rnn_test.py"],
-    additional_deps = [
-        ":rnn_py",
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:init_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:rnn",
-        "//tensorflow/python:tensor_array_ops",
-        "//tensorflow/python:util",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
-        "//tensorflow/python/eager:context",
-    ],
-    shard_count = 10,
-)
-
 tf_py_test(
     name = "fused_rnn_cell_test",
     size = "medium",
@@ -226,7 +181,10 @@ tf_custom_op_library(
         "kernels/lstm_ops_gpu.cu.cc",
         "kernels/lstm_ops.h",
     ],
-    deps = ["//tensorflow/core/kernels:eigen_helpers"],
+    deps = [
+        "//tensorflow/core/kernels:eigen_contraction_kernel",
+        "//tensorflow/core/kernels:eigen_helpers",
+    ],
 )
 
 tf_gen_op_wrapper_py(
@@ -248,7 +206,10 @@ tf_custom_op_library(
         "kernels/gru_ops_gpu.cu.cc",
         "kernels/gru_ops.h",
     ],
-    deps = ["//tensorflow/core/kernels:eigen_helpers"],
+    deps = [
+        "//tensorflow/core/kernels:eigen_contraction_kernel",
+        "//tensorflow/core/kernels:eigen_helpers",
+    ],
 )
 
 tf_gen_op_wrapper_py(
@@ -345,6 +306,7 @@ tf_kernel_library(
     deps = [
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core/kernels:eigen_contraction_kernel",
         "//tensorflow/core/kernels:eigen_helpers",
         "//third_party/eigen3",
     ],
@@ -380,6 +342,13 @@ py_binary(
     name = "checkpoint_convert",
     srcs = ["python/tools/checkpoint_convert.py"],
     srcs_version = "PY2AND3",
+    deps = [":checkpoint_convert_lib"],
+)
+
+py_library(
+    name = "checkpoint_convert_lib",
+    srcs = ["python/tools/checkpoint_convert.py"],
+    srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:framework_ops",
@@ -398,7 +367,7 @@ py_test(
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
     deps = [
-        ":checkpoint_convert",
+        ":checkpoint_convert_lib",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:session",
diff --git a/tensorflow/contrib/rnn/kernels/blas_gemm.h b/tensorflow/contrib/rnn/kernels/blas_gemm.h
index d37210d4b81203287fb633adc309688a35d093bb..12f3182a6a8878aa27ee143fa6405903e3fc4ef3 100644
--- a/tensorflow/contrib/rnn/kernels/blas_gemm.h
+++ b/tensorflow/contrib/rnn/kernels/blas_gemm.h
@@ -21,6 +21,10 @@ limitations under the License.
 #include "tensorflow/core/kernels/eigen_activations.h"
 #include "tensorflow/core/platform/types.h"
 
+#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
+#include "tensorflow/core/kernels/eigen_contraction_kernel.h"
+#endif
+
 namespace tensorflow {
 class OpKernelContext;
 namespace functor {
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
deleted file mode 100644
index 7d57b0413a3bb51c35e670ce3fdb2cc818f44a58..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
+++ /dev/null
@@ -1,1078 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for RNN cells."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-
-import numpy as np
-
-from tensorflow.contrib import rnn as contrib_rnn
-from tensorflow.contrib.rnn.python.ops import core_rnn_cell
-from tensorflow.contrib.rnn.python.ops import rnn_cell as contrib_rnn_cell
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import random_seed
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import rnn
-from tensorflow.python.ops import rnn_cell_impl
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables as variables_lib
-from tensorflow.python.platform import test
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
-
-# pylint: enable=protected-access
-Linear = core_rnn_cell._Linear  # pylint: disable=invalid-name
-
-
-class RNNCellTest(test.TestCase):
-
-  def testLinear(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(1.0)):
-        x = array_ops.zeros([1, 2])
-        l = Linear([x], 2, False)([x])
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([l], {x.name: np.array([[1., 2.]])})
-        self.assertAllClose(res[0], [[3.0, 3.0]])
-
-        # Checks prevent you from accidentally creating a shared function.
-        with self.assertRaises(ValueError):
-          l1 = Linear([x], 2, False)([x])
-
-        # But you can create a new one in a new scope and share the variables.
-        with variable_scope.variable_scope("l1") as new_scope:
-          l1 = Linear([x], 2, False)([x])
-        with variable_scope.variable_scope(new_scope, reuse=True):
-          Linear([l1], 2, False)([l1])
-        self.assertEqual(len(variables_lib.trainable_variables()), 2)
-
-  def testBasicRNNCell(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 2])
-        m = array_ops.zeros([1, 2])
-        cell = rnn_cell_impl.BasicRNNCell(2)
-        g, _ = cell(x, m)
-        self.assertEqual([
-            "root/basic_rnn_cell/%s:0" % rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
-            "root/basic_rnn_cell/%s:0" % rnn_cell_impl._BIAS_VARIABLE_NAME
-        ], [v.name for v in cell.trainable_variables])
-        self.assertFalse(cell.non_trainable_variables)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g], {
-            x.name: np.array([[1., 1.]]),
-            m.name: np.array([[0.1, 0.1]])
-        })
-        self.assertEqual(res[0].shape, (1, 2))
-
-  def testBasicRNNCellNotTrainable(self):
-    with self.cached_session() as sess:
-
-      def not_trainable_getter(getter, *args, **kwargs):
-        kwargs["trainable"] = False
-        return getter(*args, **kwargs)
-
-      with variable_scope.variable_scope(
-          "root",
-          initializer=init_ops.constant_initializer(0.5),
-          custom_getter=not_trainable_getter):
-        x = array_ops.zeros([1, 2])
-        m = array_ops.zeros([1, 2])
-        cell = rnn_cell_impl.BasicRNNCell(2)
-        g, _ = cell(x, m)
-        self.assertFalse(cell.trainable_variables)
-        self.assertEqual([
-            "root/basic_rnn_cell/%s:0" % rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
-            "root/basic_rnn_cell/%s:0" % rnn_cell_impl._BIAS_VARIABLE_NAME
-        ], [v.name for v in cell.non_trainable_variables])
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g], {
-            x.name: np.array([[1., 1.]]),
-            m.name: np.array([[0.1, 0.1]])
-        })
-        self.assertEqual(res[0].shape, (1, 2))
-
-  def testIndRNNCell(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 2])
-        m = array_ops.zeros([1, 2])
-        cell = contrib_rnn_cell.IndRNNCell(2)
-        g, _ = cell(x, m)
-        self.assertEqual([
-            "root/ind_rnn_cell/%s_w:0" % rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
-            "root/ind_rnn_cell/%s_u:0" % rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
-            "root/ind_rnn_cell/%s:0" % rnn_cell_impl._BIAS_VARIABLE_NAME
-        ], [v.name for v in cell.trainable_variables])
-        self.assertFalse(cell.non_trainable_variables)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g], {
-            x.name: np.array([[1., 1.]]),
-            m.name: np.array([[0.1, 0.1]])
-        })
-        self.assertEqual(res[0].shape, (1, 2))
-
-  def testGRUCell(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 2])
-        m = array_ops.zeros([1, 2])
-        g, _ = rnn_cell_impl.GRUCell(2)(x, m)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g], {
-            x.name: np.array([[1., 1.]]),
-            m.name: np.array([[0.1, 0.1]])
-        })
-        # Smoke test
-        self.assertAllClose(res[0], [[0.175991, 0.175991]])
-      with variable_scope.variable_scope(
-          "other", initializer=init_ops.constant_initializer(0.5)):
-        # Test GRUCell with input_size != num_units.
-        x = array_ops.zeros([1, 3])
-        m = array_ops.zeros([1, 2])
-        g, _ = rnn_cell_impl.GRUCell(2)(x, m)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g], {
-            x.name: np.array([[1., 1., 1.]]),
-            m.name: np.array([[0.1, 0.1]])
-        })
-        # Smoke test
-        self.assertAllClose(res[0], [[0.156736, 0.156736]])
-
-  def testIndyGRUCell(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 2])
-        m = array_ops.zeros([1, 2])
-        g, _ = contrib_rnn_cell.IndyGRUCell(2)(x, m)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g], {
-            x.name: np.array([[1., 1.]]),
-            m.name: np.array([[0.1, 0.1]])
-        })
-        # Smoke test
-        self.assertAllClose(res[0], [[0.185265, 0.17704]])
-      with variable_scope.variable_scope(
-          "other", initializer=init_ops.constant_initializer(0.5)):
-        # Test IndyGRUCell with input_size != num_units.
-        x = array_ops.zeros([1, 3])
-        m = array_ops.zeros([1, 2])
-        g, _ = contrib_rnn_cell.IndyGRUCell(2)(x, m)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g], {
-            x.name: np.array([[1., 1., 1.]]),
-            m.name: np.array([[0.1, 0.1]])
-        })
-        # Smoke test
-        self.assertAllClose(res[0], [[0.155127, 0.157328]])
-
-  def testSRUCell(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 2])
-        m = array_ops.zeros([1, 2])
-        g, _ = contrib_rnn_cell.SRUCell(2)(x, m)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g], {
-            x.name: np.array([[1., 1.]]),
-            m.name: np.array([[0.1, 0.1]])
-        })
-        # Smoke test
-        self.assertAllClose(res[0], [[0.509682, 0.509682]])
-
-  def testSRUCellWithDiffSize(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 3])
-        m = array_ops.zeros([1, 2])
-        g, _ = contrib_rnn_cell.SRUCell(2)(x, m)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g], {
-            x.name: np.array([[1., 1., 1.]]),
-            m.name: np.array([[0.1, 0.1]])
-        })
-        # Smoke test
-        self.assertAllClose(res[0], [[0.55255556, 0.55255556]])
-
-  def testBasicLSTMCell(self):
-    for dtype in [dtypes.float16, dtypes.float32]:
-      np_dtype = dtype.as_numpy_dtype
-      with self.session(graph=ops.Graph()) as sess:
-        with variable_scope.variable_scope(
-            "root", initializer=init_ops.constant_initializer(0.5)):
-          x = array_ops.zeros([1, 2], dtype=dtype)
-          m = array_ops.zeros([1, 8], dtype=dtype)
-          cell = rnn_cell_impl.MultiRNNCell(
-              [
-                  rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False)
-                  for _ in range(2)
-              ],
-              state_is_tuple=False)
-          self.assertEqual(cell.dtype, None)
-          self.assertEqual("cell-0", cell._checkpoint_dependencies[0].name)
-          self.assertEqual("cell-1", cell._checkpoint_dependencies[1].name)
-          cell.get_config()  # Should not throw an error
-          g, out_m = cell(x, m)
-          # Layer infers the input type.
-          self.assertEqual(cell.dtype, dtype.name)
-          expected_variable_names = [
-              "root/multi_rnn_cell/cell_0/basic_lstm_cell/%s:0" %
-              rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
-              "root/multi_rnn_cell/cell_0/basic_lstm_cell/%s:0" %
-              rnn_cell_impl._BIAS_VARIABLE_NAME,
-              "root/multi_rnn_cell/cell_1/basic_lstm_cell/%s:0" %
-              rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
-              "root/multi_rnn_cell/cell_1/basic_lstm_cell/%s:0" %
-              rnn_cell_impl._BIAS_VARIABLE_NAME
-          ]
-          self.assertEqual(expected_variable_names,
-                           [v.name for v in cell.trainable_variables])
-          self.assertFalse(cell.non_trainable_variables)
-          sess.run([variables_lib.global_variables_initializer()])
-          res = sess.run([g, out_m], {
-              x.name: np.array([[1., 1.]]),
-              m.name: 0.1 * np.ones([1, 8])
-          })
-          self.assertEqual(len(res), 2)
-          variables = variables_lib.global_variables()
-          self.assertEqual(expected_variable_names, [v.name for v in variables])
-          # The numbers in results were not calculated, this is just a
-          # smoke test.
-          self.assertAllClose(res[0], np.array(
-              [[0.240, 0.240]], dtype=np_dtype), 1e-2)
-          expected_mem = np.array(
-              [[0.689, 0.689, 0.448, 0.448, 0.398, 0.398, 0.240, 0.240]],
-              dtype=np_dtype)
-          self.assertAllClose(res[1], expected_mem, 1e-2)
-        with variable_scope.variable_scope(
-            "other", initializer=init_ops.constant_initializer(0.5)):
-          # Test BasicLSTMCell with input_size != num_units.
-          x = array_ops.zeros([1, 3], dtype=dtype)
-          m = array_ops.zeros([1, 4], dtype=dtype)
-          g, out_m = rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False)(x, m)
-          sess.run([variables_lib.global_variables_initializer()])
-          res = sess.run(
-              [g, out_m], {
-                  x.name: np.array([[1., 1., 1.]], dtype=np_dtype),
-                  m.name: 0.1 * np.ones([1, 4], dtype=np_dtype)
-              })
-          self.assertEqual(len(res), 2)
-
-  def testBasicLSTMCellDimension0Error(self):
-    """Tests that dimension 0 in both(x and m) shape must be equal."""
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        num_units = 2
-        state_size = num_units * 2
-        batch_size = 3
-        input_size = 4
-        x = array_ops.zeros([batch_size, input_size])
-        m = array_ops.zeros([batch_size - 1, state_size])
-        with self.assertRaises(ValueError):
-          g, out_m = rnn_cell_impl.BasicLSTMCell(
-              num_units, state_is_tuple=False)(x, m)
-          sess.run([variables_lib.global_variables_initializer()])
-          sess.run(
-              [g, out_m], {
-                  x.name: 1 * np.ones([batch_size, input_size]),
-                  m.name: 0.1 * np.ones([batch_size - 1, state_size])
-              })
-
-  def testBasicLSTMCellStateSizeError(self):
-    """Tests that state_size must be num_units * 2."""
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        num_units = 2
-        state_size = num_units * 3  # state_size must be num_units * 2
-        batch_size = 3
-        input_size = 4
-        x = array_ops.zeros([batch_size, input_size])
-        m = array_ops.zeros([batch_size, state_size])
-        with self.assertRaises(ValueError):
-          g, out_m = rnn_cell_impl.BasicLSTMCell(
-              num_units, state_is_tuple=False)(x, m)
-          sess.run([variables_lib.global_variables_initializer()])
-          sess.run(
-              [g, out_m], {
-                  x.name: 1 * np.ones([batch_size, input_size]),
-                  m.name: 0.1 * np.ones([batch_size, state_size])
-              })
-
-  def testBasicLSTMCellStateTupleType(self):
-    with self.cached_session():
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 2])
-        m0 = (array_ops.zeros([1, 2]),) * 2
-        m1 = (array_ops.zeros([1, 2]),) * 2
-        cell = rnn_cell_impl.MultiRNNCell(
-            [rnn_cell_impl.BasicLSTMCell(2) for _ in range(2)],
-            state_is_tuple=True)
-        self.assertTrue(isinstance(cell.state_size, tuple))
-        self.assertTrue(
-            isinstance(cell.state_size[0], rnn_cell_impl.LSTMStateTuple))
-        self.assertTrue(
-            isinstance(cell.state_size[1], rnn_cell_impl.LSTMStateTuple))
-
-        # Pass in regular tuples
-        _, (out_m0, out_m1) = cell(x, (m0, m1))
-        self.assertTrue(isinstance(out_m0, rnn_cell_impl.LSTMStateTuple))
-        self.assertTrue(isinstance(out_m1, rnn_cell_impl.LSTMStateTuple))
-
-        # Pass in LSTMStateTuples
-        variable_scope.get_variable_scope().reuse_variables()
-        zero_state = cell.zero_state(1, dtypes.float32)
-        self.assertTrue(isinstance(zero_state, tuple))
-        self.assertTrue(isinstance(zero_state[0], rnn_cell_impl.LSTMStateTuple))
-        self.assertTrue(isinstance(zero_state[1], rnn_cell_impl.LSTMStateTuple))
-        _, (out_m0, out_m1) = cell(x, zero_state)
-        self.assertTrue(isinstance(out_m0, rnn_cell_impl.LSTMStateTuple))
-        self.assertTrue(isinstance(out_m1, rnn_cell_impl.LSTMStateTuple))
-
-  def testBasicLSTMCellWithStateTuple(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 2])
-        m0 = array_ops.zeros([1, 4])
-        m1 = array_ops.zeros([1, 4])
-        cell = rnn_cell_impl.MultiRNNCell(
-            [
-                rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False)
-                for _ in range(2)
-            ],
-            state_is_tuple=True)
-        g, (out_m0, out_m1) = cell(x, (m0, m1))
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run(
-            [g, out_m0, out_m1], {
-                x.name: np.array([[1., 1.]]),
-                m0.name: 0.1 * np.ones([1, 4]),
-                m1.name: 0.1 * np.ones([1, 4])
-            })
-        self.assertEqual(len(res), 3)
-        # The numbers in results were not calculated, this is just a smoke test.
-        # Note, however, these values should match the original
-        # version having state_is_tuple=False.
-        self.assertAllClose(res[0], [[0.24024698, 0.24024698]])
-        expected_mem0 = np.array(
-            [[0.68967271, 0.68967271, 0.44848421, 0.44848421]])
-        expected_mem1 = np.array(
-            [[0.39897051, 0.39897051, 0.24024698, 0.24024698]])
-        self.assertAllClose(res[1], expected_mem0)
-        self.assertAllClose(res[2], expected_mem1)
-
-  def testIndyLSTMCell(self):
-    for dtype in [dtypes.float16, dtypes.float32]:
-      np_dtype = dtype.as_numpy_dtype
-      with self.session(graph=ops.Graph()) as sess:
-        with variable_scope.variable_scope(
-            "root", initializer=init_ops.constant_initializer(0.5)):
-          x = array_ops.zeros([1, 2], dtype=dtype)
-          state_0 = (array_ops.zeros([1, 2], dtype=dtype),) * 2
-          state_1 = (array_ops.zeros([1, 2], dtype=dtype),) * 2
-          cell = rnn_cell_impl.MultiRNNCell(
-              [contrib_rnn_cell.IndyLSTMCell(2) for _ in range(2)])
-          self.assertEqual(cell.dtype, None)
-          self.assertEqual("cell-0", cell._checkpoint_dependencies[0].name)
-          self.assertEqual("cell-1", cell._checkpoint_dependencies[1].name)
-          cell.get_config()  # Should not throw an error
-          g, (out_state_0, out_state_1) = cell(x, (state_0, state_1))
-          # Layer infers the input type.
-          self.assertEqual(cell.dtype, dtype.name)
-          expected_variable_names = [
-              "root/multi_rnn_cell/cell_0/indy_lstm_cell/%s_w:0" %
-              rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
-              "root/multi_rnn_cell/cell_0/indy_lstm_cell/%s_u:0" %
-              rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
-              "root/multi_rnn_cell/cell_0/indy_lstm_cell/%s:0" %
-              rnn_cell_impl._BIAS_VARIABLE_NAME,
-              "root/multi_rnn_cell/cell_1/indy_lstm_cell/%s_w:0" %
-              rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
-              "root/multi_rnn_cell/cell_1/indy_lstm_cell/%s_u:0" %
-              rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
-              "root/multi_rnn_cell/cell_1/indy_lstm_cell/%s:0" %
-              rnn_cell_impl._BIAS_VARIABLE_NAME
-          ]
-          self.assertEqual(expected_variable_names,
-                           [v.name for v in cell.trainable_variables])
-          self.assertFalse(cell.non_trainable_variables)
-          sess.run([variables_lib.global_variables_initializer()])
-          res = sess.run(
-              [g, out_state_0, out_state_1], {
-                  x.name: np.array([[1., 1.]]),
-                  state_0[0].name: 0.1 * np.ones([1, 2]),
-                  state_0[1].name: 0.1 * np.ones([1, 2]),
-                  state_1[0].name: 0.1 * np.ones([1, 2]),
-                  state_1[1].name: 0.1 * np.ones([1, 2]),
-              })
-          self.assertEqual(len(res), 3)
-          variables = variables_lib.global_variables()
-          self.assertEqual(expected_variable_names, [v.name for v in variables])
-          # Only check the range of outputs as this is just a smoke test.
-          self.assertAllInRange(res[0], -1.0, 1.0)
-          self.assertAllInRange(res[1], -1.0, 1.0)
-          self.assertAllInRange(res[2], -1.0, 1.0)
-        with variable_scope.variable_scope(
-            "other", initializer=init_ops.constant_initializer(0.5)):
-          # Test IndyLSTMCell with input_size != num_units.
-          x = array_ops.zeros([1, 3], dtype=dtype)
-          state = (array_ops.zeros([1, 2], dtype=dtype),) * 2
-          g, out_state = contrib_rnn_cell.IndyLSTMCell(2)(x, state)
-          sess.run([variables_lib.global_variables_initializer()])
-          res = sess.run(
-              [g, out_state], {
-                  x.name: np.array([[1., 1., 1.]], dtype=np_dtype),
-                  state[0].name: 0.1 * np.ones([1, 2], dtype=np_dtype),
-                  state[1].name: 0.1 * np.ones([1, 2], dtype=np_dtype),
-              })
-          self.assertEqual(len(res), 2)
-
-  def testLSTMCell(self):
-    with self.cached_session() as sess:
-      num_units = 8
-      num_proj = 6
-      state_size = num_units + num_proj
-      batch_size = 3
-      input_size = 2
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([batch_size, input_size])
-        m = array_ops.zeros([batch_size, state_size])
-        cell = rnn_cell_impl.LSTMCell(
-            num_units=num_units,
-            num_proj=num_proj,
-            forget_bias=1.0,
-            state_is_tuple=False)
-        output, state = cell(x, m)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run(
-            [output, state], {
-                x.name: np.array([[1., 1.], [2., 2.], [3., 3.]]),
-                m.name: 0.1 * np.ones((batch_size, state_size))
-            })
-        self.assertEqual(len(res), 2)
-        # The numbers in results were not calculated, this is mostly just a
-        # smoke test.
-        self.assertEqual(res[0].shape, (batch_size, num_proj))
-        self.assertEqual(res[1].shape, (batch_size, state_size))
-        # Different inputs so different outputs and states
-        for i in range(1, batch_size):
-          self.assertTrue(
-              float(np.linalg.norm((res[0][0, :] - res[0][i, :]))) > 1e-6)
-          self.assertTrue(
-              float(np.linalg.norm((res[1][0, :] - res[1][i, :]))) > 1e-6)
-
-  def testLSTMCellVariables(self):
-    with self.cached_session():
-      num_units = 8
-      num_proj = 6
-      state_size = num_units + num_proj
-      batch_size = 3
-      input_size = 2
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([batch_size, input_size])
-        m = array_ops.zeros([batch_size, state_size])
-        cell = rnn_cell_impl.LSTMCell(
-            num_units=num_units,
-            num_proj=num_proj,
-            forget_bias=1.0,
-            state_is_tuple=False)
-        cell(x, m)  # Execute to create variables
-      variables = variables_lib.global_variables()
-      self.assertEquals(variables[0].op.name, "root/lstm_cell/kernel")
-      self.assertEquals(variables[1].op.name, "root/lstm_cell/bias")
-      self.assertEquals(variables[2].op.name,
-                        "root/lstm_cell/projection/kernel")
-
-  def testLSTMCellLayerNorm(self):
-    with self.cached_session() as sess:
-      num_units = 2
-      num_proj = 3
-      batch_size = 1
-      input_size = 4
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([batch_size, input_size])
-        c = array_ops.zeros([batch_size, num_units])
-        h = array_ops.zeros([batch_size, num_proj])
-        state = rnn_cell_impl.LSTMStateTuple(c, h)
-        cell = contrib_rnn_cell.LayerNormLSTMCell(
-            num_units=num_units,
-            num_proj=num_proj,
-            forget_bias=1.0,
-            layer_norm=True,
-            norm_gain=1.0,
-            norm_shift=0.0)
-        g, out_m = cell(x, state)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run(
-            [g, out_m], {
-                x.name: np.ones((batch_size, input_size)),
-                c.name: 0.1 * np.ones((batch_size, num_units)),
-                h.name: 0.1 * np.ones((batch_size, num_proj))
-            })
-        self.assertEqual(len(res), 2)
-        # The numbers in results were not calculated, this is mostly just a
-        # smoke test.
-        self.assertEqual(res[0].shape, (batch_size, num_proj))
-        self.assertEqual(res[1][0].shape, (batch_size, num_units))
-        self.assertEqual(res[1][1].shape, (batch_size, num_proj))
-        # Different inputs so different outputs and states
-        for i in range(1, batch_size):
-          self.assertTrue(
-              float(np.linalg.norm((res[0][0, :] - res[0][i, :]))) < 1e-6)
-          self.assertTrue(
-              float(np.linalg.norm((res[1][0, :] - res[1][i, :]))) < 1e-6)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testWrapperCheckpointing(self):
-    for wrapper_type in [
-        rnn_cell_impl.DropoutWrapper,
-        rnn_cell_impl.ResidualWrapper,
-        lambda cell: rnn_cell_impl.MultiRNNCell([cell])]:
-      cell = rnn_cell_impl.BasicRNNCell(1)
-      wrapper = wrapper_type(cell)
-      wrapper(array_ops.ones([1, 1]),
-              state=wrapper.zero_state(batch_size=1, dtype=dtypes.float32))
-      self.evaluate([v.initializer for v in cell.variables])
-      checkpoint = checkpointable_utils.Checkpoint(wrapper=wrapper)
-      prefix = os.path.join(self.get_temp_dir(), "ckpt")
-      self.evaluate(cell._bias.assign([40.]))
-      save_path = checkpoint.save(prefix)
-      self.evaluate(cell._bias.assign([0.]))
-      checkpoint.restore(save_path).assert_consumed().run_restore_ops()
-      self.assertAllEqual([40.], self.evaluate(cell._bias))
-
-  def testOutputProjectionWrapper(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 3])
-        m = array_ops.zeros([1, 3])
-        cell = contrib_rnn.OutputProjectionWrapper(rnn_cell_impl.GRUCell(3), 2)
-        g, new_m = cell(x, m)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g, new_m], {
-            x.name: np.array([[1., 1., 1.]]),
-            m.name: np.array([[0.1, 0.1, 0.1]])
-        })
-        self.assertEqual(res[1].shape, (1, 3))
-        # The numbers in results were not calculated, this is just a smoke test.
-        self.assertAllClose(res[0], [[0.231907, 0.231907]])
-
-  def testInputProjectionWrapper(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 2])
-        m = array_ops.zeros([1, 3])
-        cell = contrib_rnn.InputProjectionWrapper(
-            rnn_cell_impl.GRUCell(3), num_proj=3)
-        g, new_m = cell(x, m)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g, new_m], {
-            x.name: np.array([[1., 1.]]),
-            m.name: np.array([[0.1, 0.1, 0.1]])
-        })
-        self.assertEqual(res[1].shape, (1, 3))
-        # The numbers in results were not calculated, this is just a smoke test.
-        self.assertAllClose(res[0], [[0.154605, 0.154605, 0.154605]])
-
-  def testResidualWrapper(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 3])
-        m = array_ops.zeros([1, 3])
-        base_cell = rnn_cell_impl.GRUCell(3)
-        g, m_new = base_cell(x, m)
-        variable_scope.get_variable_scope().reuse_variables()
-        wrapper_object = rnn_cell_impl.ResidualWrapper(base_cell)
-        (name, dep), = wrapper_object._checkpoint_dependencies
-        wrapper_object.get_config()  # Should not throw an error
-        self.assertIs(dep, base_cell)
-        self.assertEqual("cell", name)
-
-        g_res, m_new_res = wrapper_object(x, m)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g, g_res, m_new, m_new_res], {
-            x: np.array([[1., 1., 1.]]),
-            m: np.array([[0.1, 0.1, 0.1]])
-        })
-        # Residual connections
-        self.assertAllClose(res[1], res[0] + [1., 1., 1.])
-        # States are left untouched
-        self.assertAllClose(res[2], res[3])
-
-  def testResidualWrapperWithSlice(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 5])
-        m = array_ops.zeros([1, 3])
-        base_cell = rnn_cell_impl.GRUCell(3)
-        g, m_new = base_cell(x, m)
-        variable_scope.get_variable_scope().reuse_variables()
-
-        def residual_with_slice_fn(inp, out):
-          inp_sliced = array_ops.slice(inp, [0, 0], [-1, 3])
-          return inp_sliced + out
-
-        g_res, m_new_res = rnn_cell_impl.ResidualWrapper(
-            base_cell, residual_with_slice_fn)(x, m)
-        sess.run([variables_lib.global_variables_initializer()])
-        res_g, res_g_res, res_m_new, res_m_new_res = sess.run(
-            [g, g_res, m_new, m_new_res], {
-                x: np.array([[1., 1., 1., 1., 1.]]),
-                m: np.array([[0.1, 0.1, 0.1]])
-            })
-        # Residual connections
-        self.assertAllClose(res_g_res, res_g + [1., 1., 1.])
-        # States are left untouched
-        self.assertAllClose(res_m_new, res_m_new_res)
-
-  def testDeviceWrapper(self):
-    with variable_scope.variable_scope(
-        "root", initializer=init_ops.constant_initializer(0.5)):
-      x = array_ops.zeros([1, 3])
-      m = array_ops.zeros([1, 3])
-      wrapped = rnn_cell_impl.GRUCell(3)
-      cell = rnn_cell_impl.DeviceWrapper(wrapped, "/cpu:14159")
-      (name, dep), = cell._checkpoint_dependencies
-      cell.get_config()  # Should not throw an error
-      self.assertIs(dep, wrapped)
-      self.assertEqual("cell", name)
-
-      outputs, _ = cell(x, m)
-      self.assertTrue("cpu:14159" in outputs.device.lower())
-
-  def _retrieve_cpu_gpu_stats(self, run_metadata):
-    cpu_stats = None
-    gpu_stats = None
-    step_stats = run_metadata.step_stats
-    for ds in step_stats.dev_stats:
-      if "cpu:0" in ds.device[-5:].lower():
-        cpu_stats = ds.node_stats
-      if "gpu:0" == ds.device[-5:].lower():
-        gpu_stats = ds.node_stats
-    return cpu_stats, gpu_stats
-
-  def testDeviceWrapperDynamicExecutionNodesAreAllProperlyLocated(self):
-    if not test.is_gpu_available():
-      # Can't perform this test w/o a GPU
-      return
-
-    gpu_dev = test.gpu_device_name()
-    with self.session(use_gpu=True) as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 1, 3])
-        cell = rnn_cell_impl.DeviceWrapper(rnn_cell_impl.GRUCell(3), gpu_dev)
-        with ops.device("/cpu:0"):
-          outputs, _ = rnn.dynamic_rnn(
-              cell=cell, inputs=x, dtype=dtypes.float32)
-        run_metadata = config_pb2.RunMetadata()
-        opts = config_pb2.RunOptions(
-            trace_level=config_pb2.RunOptions.FULL_TRACE)
-
-        sess.run([variables_lib.global_variables_initializer()])
-        _ = sess.run(outputs, options=opts, run_metadata=run_metadata)
-
-      cpu_stats, gpu_stats = self._retrieve_cpu_gpu_stats(run_metadata)
-      self.assertFalse([s for s in cpu_stats if "gru_cell" in s.node_name])
-      self.assertTrue([s for s in gpu_stats if "gru_cell" in s.node_name])
-
-  def testEmbeddingWrapper(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 1], dtype=dtypes.int32)
-        m = array_ops.zeros([1, 2])
-        embedding_cell = contrib_rnn.EmbeddingWrapper(
-            rnn_cell_impl.GRUCell(2), embedding_classes=3, embedding_size=2)
-        self.assertEqual(embedding_cell.output_size, 2)
-        g, new_m = embedding_cell(x, m)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([g, new_m], {
-            x.name: np.array([[1]]),
-            m.name: np.array([[0.1, 0.1]])
-        })
-        self.assertEqual(res[1].shape, (1, 2))
-        # The numbers in results were not calculated, this is just a smoke test.
-        self.assertAllClose(res[0], [[0.17139, 0.17139]])
-
-  def testEmbeddingWrapperWithDynamicRnn(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope("root"):
-        inputs = ops.convert_to_tensor([[[0], [0]]], dtype=dtypes.int64)
-        input_lengths = ops.convert_to_tensor([2], dtype=dtypes.int64)
-        embedding_cell = contrib_rnn.EmbeddingWrapper(
-            rnn_cell_impl.BasicLSTMCell(1, state_is_tuple=True),
-            embedding_classes=1,
-            embedding_size=2)
-        outputs, _ = rnn.dynamic_rnn(
-            cell=embedding_cell,
-            inputs=inputs,
-            sequence_length=input_lengths,
-            dtype=dtypes.float32)
-        sess.run([variables_lib.global_variables_initializer()])
-        # This will fail if output's dtype is inferred from input's.
-        sess.run(outputs)
-
-  def testMultiRNNCell(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 2])
-        m = array_ops.zeros([1, 4])
-        multi_rnn_cell = rnn_cell_impl.MultiRNNCell(
-            [rnn_cell_impl.GRUCell(2) for _ in range(2)],
-            state_is_tuple=False)
-        _, ml = multi_rnn_cell(x, m)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run(ml, {
-            x.name: np.array([[1., 1.]]),
-            m.name: np.array([[0.1, 0.1, 0.1, 0.1]])
-        })
-        # The numbers in results were not calculated, this is just a smoke test.
-        self.assertAllClose(res, [[0.175991, 0.175991, 0.13248, 0.13248]])
-        self.assertEqual(len(multi_rnn_cell.weights), 2 * 4)
-        self.assertTrue(
-            [x.dtype == dtypes.float32 for x in multi_rnn_cell.weights])
-
-  def testMultiRNNCellWithStateTuple(self):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        x = array_ops.zeros([1, 2])
-        m_bad = array_ops.zeros([1, 4])
-        m_good = (array_ops.zeros([1, 2]), array_ops.zeros([1, 2]))
-
-        # Test incorrectness of state
-        with self.assertRaisesRegexp(ValueError, "Expected state .* a tuple"):
-          rnn_cell_impl.MultiRNNCell(
-              [rnn_cell_impl.GRUCell(2) for _ in range(2)],
-              state_is_tuple=True)(x, m_bad)
-
-        _, ml = rnn_cell_impl.MultiRNNCell(
-            [rnn_cell_impl.GRUCell(2) for _ in range(2)],
-            state_is_tuple=True)(x, m_good)
-
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run(
-            ml, {
-                x.name: np.array([[1., 1.]]),
-                m_good[0].name: np.array([[0.1, 0.1]]),
-                m_good[1].name: np.array([[0.1, 0.1]])
-            })
-
-        # The numbers in results were not calculated, this is just a
-        # smoke test.  However, these numbers should match those of
-        # the test testMultiRNNCell.
-        self.assertAllClose(res[0], [[0.175991, 0.175991]])
-        self.assertAllClose(res[1], [[0.13248, 0.13248]])
-
-
-class DropoutWrapperTest(test.TestCase):
-
-  def _testDropoutWrapper(self,
-                          batch_size=None,
-                          time_steps=None,
-                          parallel_iterations=None,
-                          **kwargs):
-    with self.cached_session() as sess:
-      with variable_scope.variable_scope(
-          "root", initializer=init_ops.constant_initializer(0.5)):
-        if batch_size is None and time_steps is None:
-          # 2 time steps, batch size 1, depth 3
-          batch_size = 1
-          time_steps = 2
-          x = constant_op.constant(
-              [[[2., 2., 2.]], [[1., 1., 1.]]], dtype=dtypes.float32)
-          m = rnn_cell_impl.LSTMStateTuple(
-              *[constant_op.constant([[0.1, 0.1, 0.1]], dtype=dtypes.float32
-                                    )] * 2)
-        else:
-          x = constant_op.constant(
-              np.random.randn(time_steps, batch_size, 3).astype(np.float32))
-          m = rnn_cell_impl.LSTMStateTuple(*[
-              constant_op.
-              constant([[0.1, 0.1, 0.1]] * batch_size, dtype=dtypes.float32)
-          ] * 2)
-        outputs, final_state = rnn.dynamic_rnn(
-            cell=rnn_cell_impl.DropoutWrapper(
-                rnn_cell_impl.LSTMCell(3), dtype=x.dtype, **kwargs),
-            time_major=True,
-            parallel_iterations=parallel_iterations,
-            inputs=x,
-            initial_state=m)
-        sess.run([variables_lib.global_variables_initializer()])
-        res = sess.run([outputs, final_state])
-        self.assertEqual(res[0].shape, (time_steps, batch_size, 3))
-        self.assertEqual(res[1].c.shape, (batch_size, 3))
-        self.assertEqual(res[1].h.shape, (batch_size, 3))
-        return res
-
-  def testWrappedCellProperty(self):
-    cell = rnn_cell_impl.BasicRNNCell(10)
-    wrapper = rnn_cell_impl.DropoutWrapper(cell)
-    # Github issue 15810
-    self.assertEqual(wrapper.wrapped_cell, cell)
-
-  def testDropoutWrapperKeepAllConstantInput(self):
-    keep = array_ops.ones([])
-    res = self._testDropoutWrapper(
-        input_keep_prob=keep, output_keep_prob=keep, state_keep_prob=keep)
-    true_full_output = np.array(
-        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
-        dtype=np.float32)
-    true_full_final_c = np.array(
-        [[1.949385, 1.949385, 1.949385]], dtype=np.float32)
-    self.assertAllClose(true_full_output, res[0])
-    self.assertAllClose(true_full_output[1], res[1].h)
-    self.assertAllClose(true_full_final_c, res[1].c)
-
-  def testDropoutWrapperKeepAll(self):
-    keep = variable_scope.get_variable("all", initializer=1.0)
-    res = self._testDropoutWrapper(
-        input_keep_prob=keep, output_keep_prob=keep, state_keep_prob=keep)
-    true_full_output = np.array(
-        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
-        dtype=np.float32)
-    true_full_final_c = np.array(
-        [[1.949385, 1.949385, 1.949385]], dtype=np.float32)
-    self.assertAllClose(true_full_output, res[0])
-    self.assertAllClose(true_full_output[1], res[1].h)
-    self.assertAllClose(true_full_final_c, res[1].c)
-
-  def testDropoutWrapperWithSeed(self):
-    keep_some = 0.5
-    random_seed.set_random_seed(2)
-    ## Use parallel_iterations = 1 in both calls to
-    ## _testDropoutWrapper to ensure the (per-time step) dropout is
-    ## consistent across both calls.  Otherwise the seed may not end
-    ## up being munged consistently across both graphs.
-    res_standard_1 = self._testDropoutWrapper(
-        input_keep_prob=keep_some,
-        output_keep_prob=keep_some,
-        state_keep_prob=keep_some,
-        seed=10,
-        parallel_iterations=1)
-    # Clear away the graph and the test session (which keeps variables around)
-    ops.reset_default_graph()
-    self._ClearCachedSession()
-    random_seed.set_random_seed(2)
-    res_standard_2 = self._testDropoutWrapper(
-        input_keep_prob=keep_some,
-        output_keep_prob=keep_some,
-        state_keep_prob=keep_some,
-        seed=10,
-        parallel_iterations=1)
-    self.assertAllClose(res_standard_1[0], res_standard_2[0])
-    self.assertAllClose(res_standard_1[1].c, res_standard_2[1].c)
-    self.assertAllClose(res_standard_1[1].h, res_standard_2[1].h)
-
-  def testDropoutWrapperKeepNoOutput(self):
-    keep_all = variable_scope.get_variable("all", initializer=1.0)
-    keep_none = variable_scope.get_variable("none", initializer=1e-6)
-    res = self._testDropoutWrapper(
-        input_keep_prob=keep_all,
-        output_keep_prob=keep_none,
-        state_keep_prob=keep_all)
-    true_full_output = np.array(
-        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
-        dtype=np.float32)
-    true_full_final_c = np.array(
-        [[1.949385, 1.949385, 1.949385]], dtype=np.float32)
-    self.assertAllClose(np.zeros(res[0].shape), res[0])
-    self.assertAllClose(true_full_output[1], res[1].h)
-    self.assertAllClose(true_full_final_c, res[1].c)
-
-  def testDropoutWrapperKeepNoStateExceptLSTMCellMemory(self):
-    keep_all = variable_scope.get_variable("all", initializer=1.0)
-    keep_none = variable_scope.get_variable("none", initializer=1e-6)
-    # Even though we dropout state, by default DropoutWrapper never
-    # drops out the memory ("c") term of an LSTMStateTuple.
-    res = self._testDropoutWrapper(
-        input_keep_prob=keep_all,
-        output_keep_prob=keep_all,
-        state_keep_prob=keep_none)
-    true_c_state = np.array([[1.713925, 1.713925, 1.713925]], dtype=np.float32)
-    true_full_output = np.array(
-        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
-        dtype=np.float32)
-    self.assertAllClose(true_full_output[0], res[0][0])
-    # Second output is modified by zero input state
-    self.assertGreater(np.linalg.norm(true_full_output[1] - res[0][1]), 1e-4)
-    # h state has been set to zero
-    self.assertAllClose(np.zeros(res[1].h.shape), res[1].h)
-    # c state of an LSTMStateTuple is NEVER modified.
-    self.assertAllClose(true_c_state, res[1].c)
-
-  def testDropoutWrapperKeepNoInput(self):
-    keep_all = variable_scope.get_variable("all", initializer=1.0)
-    keep_none = variable_scope.get_variable("none", initializer=1e-6)
-    true_full_output = np.array(
-        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
-        dtype=np.float32)
-    true_full_final_c = np.array(
-        [[1.949385, 1.949385, 1.949385]], dtype=np.float32)
-    # All outputs are different because inputs are zeroed out
-    res = self._testDropoutWrapper(
-        input_keep_prob=keep_none,
-        output_keep_prob=keep_all,
-        state_keep_prob=keep_all)
-    self.assertGreater(np.linalg.norm(res[0] - true_full_output), 1e-4)
-    self.assertGreater(np.linalg.norm(res[1].h - true_full_output[1]), 1e-4)
-    self.assertGreater(np.linalg.norm(res[1].c - true_full_final_c), 1e-4)
-
-  def testDropoutWrapperRecurrentOutput(self):
-    keep_some = 0.8
-    keep_all = variable_scope.get_variable("all", initializer=1.0)
-    res = self._testDropoutWrapper(
-        input_keep_prob=keep_all,
-        output_keep_prob=keep_some,
-        state_keep_prob=keep_all,
-        variational_recurrent=True,
-        input_size=3,
-        batch_size=5,
-        time_steps=7)
-    # Ensure the same dropout pattern for all time steps
-    output_mask = np.abs(res[0]) > 1e-6
-    for m in output_mask[1:]:
-      self.assertAllClose(output_mask[0], m)
-
-  def testDropoutWrapperRecurrentStateInputAndOutput(self):
-    keep_some = 0.9
-    res = self._testDropoutWrapper(
-        input_keep_prob=keep_some,
-        output_keep_prob=keep_some,
-        state_keep_prob=keep_some,
-        variational_recurrent=True,
-        input_size=3,
-        batch_size=5,
-        time_steps=7)
-
-    # Smoke test for the state/input masks.
-    output_mask = np.abs(res[0]) > 1e-6
-    for time_step in output_mask:
-      # Ensure the same dropout output pattern for all time steps
-      self.assertAllClose(output_mask[0], time_step)
-      for batch_entry in time_step:
-        # Assert all batch entries get the same mask
-        self.assertAllClose(batch_entry, time_step[0])
-
-    # For state, ensure all batch entries have the same mask
-    state_c_mask = np.abs(res[1].c) > 1e-6
-    state_h_mask = np.abs(res[1].h) > 1e-6
-    for batch_entry in state_c_mask:
-      self.assertAllClose(batch_entry, state_c_mask[0])
-    for batch_entry in state_h_mask:
-      self.assertAllClose(batch_entry, state_h_mask[0])
-
-  def testDropoutWrapperRecurrentStateInputAndOutputWithSeed(self):
-    keep_some = 0.9
-    random_seed.set_random_seed(2347)
-    np.random.seed(23487)
-    res0 = self._testDropoutWrapper(
-        input_keep_prob=keep_some,
-        output_keep_prob=keep_some,
-        state_keep_prob=keep_some,
-        variational_recurrent=True,
-        input_size=3,
-        batch_size=5,
-        time_steps=7,
-        seed=-234987)
-    ops.reset_default_graph()
-    self._ClearCachedSession()
-    random_seed.set_random_seed(2347)
-    np.random.seed(23487)
-    res1 = self._testDropoutWrapper(
-        input_keep_prob=keep_some,
-        output_keep_prob=keep_some,
-        state_keep_prob=keep_some,
-        variational_recurrent=True,
-        input_size=3,
-        batch_size=5,
-        time_steps=7,
-        seed=-234987)
-
-    output_mask = np.abs(res0[0]) > 1e-6
-    for time_step in output_mask:
-      # Ensure the same dropout output pattern for all time steps
-      self.assertAllClose(output_mask[0], time_step)
-      for batch_entry in time_step:
-        # Assert all batch entries get the same mask
-        self.assertAllClose(batch_entry, time_step[0])
-
-    # For state, ensure all batch entries have the same mask
-    state_c_mask = np.abs(res0[1].c) > 1e-6
-    state_h_mask = np.abs(res0[1].h) > 1e-6
-    for batch_entry in state_c_mask:
-      self.assertAllClose(batch_entry, state_c_mask[0])
-    for batch_entry in state_h_mask:
-      self.assertAllClose(batch_entry, state_h_mask[0])
-
-    # Ensure seeded calculation is identical.
-    self.assertAllClose(res0[0], res1[0])
-    self.assertAllClose(res0[1].c, res1[1].c)
-    self.assertAllClose(res0[1].h, res1[1].h)
-
-
-def basic_rnn_cell(inputs, state, num_units, scope=None):
-  if state is None:
-    if inputs is not None:
-      batch_size = inputs.get_shape()[0]
-      dtype = inputs.dtype
-    else:
-      batch_size = 0
-      dtype = dtypes.float32
-    init_output = array_ops.zeros(
-        array_ops.stack([batch_size, num_units]), dtype=dtype)
-    init_state = array_ops.zeros(
-        array_ops.stack([batch_size, num_units]), dtype=dtype)
-    init_output.set_shape([batch_size, num_units])
-    init_state.set_shape([batch_size, num_units])
-    return init_output, init_state
-  else:
-    with variable_scope.variable_scope(scope, "basic_rnn_cell",
-                                       [inputs, state]):
-      output = math_ops.tanh(
-          Linear([inputs, state], num_units, True)([inputs, state]))
-    return output, output
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
index aa1d7d2b01b4595bbb03ba8e867e93db759cbd52..dfac2df6a0d4143106ad0f090805597c26659280 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
@@ -22,6 +22,7 @@ import itertools
 
 import numpy as np
 
+from tensorflow.contrib.rnn.python.ops import core_rnn_cell as legacy_rnn_cell
 from tensorflow.contrib.rnn.python.ops import rnn_cell as contrib_rnn_cell
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
@@ -29,7 +30,9 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
 from tensorflow.python.keras import initializers
+from tensorflow.python.keras import layers as keras_layers
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras import utils
 from tensorflow.python.ops import array_ops
@@ -51,6 +54,294 @@ from tensorflow.python.util import nest
 
 class RNNCellTest(test.TestCase):
 
+  def testIndRNNCell(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 2])
+        m = array_ops.zeros([1, 2])
+        cell = contrib_rnn_cell.IndRNNCell(2)
+        g, _ = cell(x, m)
+        self.assertEqual([
+            "root/ind_rnn_cell/%s_w:0" % rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+            "root/ind_rnn_cell/%s_u:0" % rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+            "root/ind_rnn_cell/%s:0" % rnn_cell_impl._BIAS_VARIABLE_NAME
+        ], [v.name for v in cell.trainable_variables])
+        self.assertFalse(cell.non_trainable_variables)
+        sess.run([variables.global_variables_initializer()])
+        res = sess.run([g], {
+            x.name: np.array([[1., 1.]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
+        self.assertEqual(res[0].shape, (1, 2))
+
+  def testIndyGRUCell(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 2])
+        m = array_ops.zeros([1, 2])
+        g, _ = contrib_rnn_cell.IndyGRUCell(2)(x, m)
+        sess.run([variables.global_variables_initializer()])
+        res = sess.run([g], {
+            x.name: np.array([[1., 1.]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
+        # Smoke test
+        self.assertAllClose(res[0], [[0.185265, 0.17704]])
+      with variable_scope.variable_scope(
+          "other", initializer=init_ops.constant_initializer(0.5)):
+        # Test IndyGRUCell with input_size != num_units.
+        x = array_ops.zeros([1, 3])
+        m = array_ops.zeros([1, 2])
+        g, _ = contrib_rnn_cell.IndyGRUCell(2)(x, m)
+        sess.run([variables.global_variables_initializer()])
+        res = sess.run([g], {
+            x.name: np.array([[1., 1., 1.]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
+        # Smoke test
+        self.assertAllClose(res[0], [[0.155127, 0.157328]])
+
+  def testIndyLSTMCell(self):
+    for dtype in [dtypes.float16, dtypes.float32]:
+      np_dtype = dtype.as_numpy_dtype
+      with self.session(graph=ops.Graph()) as sess:
+        with variable_scope.variable_scope(
+            "root", initializer=init_ops.constant_initializer(0.5)):
+          x = array_ops.zeros([1, 2], dtype=dtype)
+          state_0 = (array_ops.zeros([1, 2], dtype=dtype),) * 2
+          state_1 = (array_ops.zeros([1, 2], dtype=dtype),) * 2
+          cell = rnn_cell_impl.MultiRNNCell(
+              [contrib_rnn_cell.IndyLSTMCell(2) for _ in range(2)])
+          self.assertEqual(cell.dtype, None)
+          self.assertEqual("cell-0", cell._checkpoint_dependencies[0].name)
+          self.assertEqual("cell-1", cell._checkpoint_dependencies[1].name)
+          cell.get_config()  # Should not throw an error
+          g, (out_state_0, out_state_1) = cell(x, (state_0, state_1))
+          # Layer infers the input type.
+          self.assertEqual(cell.dtype, dtype.name)
+          expected_variable_names = [
+              "root/multi_rnn_cell/cell_0/indy_lstm_cell/%s_w:0" %
+              rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+              "root/multi_rnn_cell/cell_0/indy_lstm_cell/%s_u:0" %
+              rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+              "root/multi_rnn_cell/cell_0/indy_lstm_cell/%s:0" %
+              rnn_cell_impl._BIAS_VARIABLE_NAME,
+              "root/multi_rnn_cell/cell_1/indy_lstm_cell/%s_w:0" %
+              rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+              "root/multi_rnn_cell/cell_1/indy_lstm_cell/%s_u:0" %
+              rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+              "root/multi_rnn_cell/cell_1/indy_lstm_cell/%s:0" %
+              rnn_cell_impl._BIAS_VARIABLE_NAME
+          ]
+          self.assertEqual(expected_variable_names,
+                           [v.name for v in cell.trainable_variables])
+          self.assertFalse(cell.non_trainable_variables)
+          sess.run([variables.global_variables_initializer()])
+          res = sess.run(
+              [g, out_state_0, out_state_1], {
+                  x.name: np.array([[1., 1.]]),
+                  state_0[0].name: 0.1 * np.ones([1, 2]),
+                  state_0[1].name: 0.1 * np.ones([1, 2]),
+                  state_1[0].name: 0.1 * np.ones([1, 2]),
+                  state_1[1].name: 0.1 * np.ones([1, 2]),
+              })
+          self.assertEqual(len(res), 3)
+          global_variables = variables.global_variables()
+          self.assertEqual(expected_variable_names,
+                           [v.name for v in global_variables])
+          # Only check the range of outputs as this is just a smoke test.
+          self.assertAllInRange(res[0], -1.0, 1.0)
+          self.assertAllInRange(res[1], -1.0, 1.0)
+          self.assertAllInRange(res[2], -1.0, 1.0)
+        with variable_scope.variable_scope(
+            "other", initializer=init_ops.constant_initializer(0.5)):
+          # Test IndyLSTMCell with input_size != num_units.
+          x = array_ops.zeros([1, 3], dtype=dtype)
+          state = (array_ops.zeros([1, 2], dtype=dtype),) * 2
+          g, out_state = contrib_rnn_cell.IndyLSTMCell(2)(x, state)
+          sess.run([variables.global_variables_initializer()])
+          res = sess.run(
+              [g, out_state], {
+                  x.name: np.array([[1., 1., 1.]], dtype=np_dtype),
+                  state[0].name: 0.1 * np.ones([1, 2], dtype=np_dtype),
+                  state[1].name: 0.1 * np.ones([1, 2], dtype=np_dtype),
+              })
+          self.assertEqual(len(res), 2)
+
+  def testLSTMCellLayerNorm(self):
+    with self.cached_session() as sess:
+      num_units = 2
+      num_proj = 3
+      batch_size = 1
+      input_size = 4
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([batch_size, input_size])
+        c = array_ops.zeros([batch_size, num_units])
+        h = array_ops.zeros([batch_size, num_proj])
+        state = rnn_cell_impl.LSTMStateTuple(c, h)
+        cell = contrib_rnn_cell.LayerNormLSTMCell(
+            num_units=num_units,
+            num_proj=num_proj,
+            forget_bias=1.0,
+            layer_norm=True,
+            norm_gain=1.0,
+            norm_shift=0.0)
+        g, out_m = cell(x, state)
+        sess.run([variables.global_variables_initializer()])
+        res = sess.run(
+            [g, out_m], {
+                x.name: np.ones((batch_size, input_size)),
+                c.name: 0.1 * np.ones((batch_size, num_units)),
+                h.name: 0.1 * np.ones((batch_size, num_proj))
+            })
+        self.assertEqual(len(res), 2)
+        # The numbers in results were not calculated, this is mostly just a
+        # smoke test.
+        self.assertEqual(res[0].shape, (batch_size, num_proj))
+        self.assertEqual(res[1][0].shape, (batch_size, num_units))
+        self.assertEqual(res[1][1].shape, (batch_size, num_proj))
+        # Different inputs so different outputs and states
+        for i in range(1, batch_size):
+          self.assertTrue(
+              float(np.linalg.norm((res[0][0, :] - res[0][i, :]))) < 1e-6)
+          self.assertTrue(
+              float(np.linalg.norm((res[1][0, :] - res[1][i, :]))) < 1e-6)
+
+  def testOutputProjectionWrapper(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 3])
+        m = array_ops.zeros([1, 3])
+        cell = legacy_rnn_cell.OutputProjectionWrapper(
+            rnn_cell_impl.GRUCell(3), 2)
+        g, new_m = cell(x, m)
+        sess.run([variables.global_variables_initializer()])
+        res = sess.run([g, new_m], {
+            x.name: np.array([[1., 1., 1.]]),
+            m.name: np.array([[0.1, 0.1, 0.1]])
+        })
+        self.assertEqual(res[1].shape, (1, 3))
+        # The numbers in results were not calculated, this is just a smoke test.
+        self.assertAllClose(res[0], [[0.231907, 0.231907]])
+
+  def testInputProjectionWrapper(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 2])
+        m = array_ops.zeros([1, 3])
+        cell = legacy_rnn_cell.InputProjectionWrapper(
+            rnn_cell_impl.GRUCell(3), num_proj=3)
+        g, new_m = cell(x, m)
+        sess.run([variables.global_variables_initializer()])
+        res = sess.run([g, new_m], {
+            x.name: np.array([[1., 1.]]),
+            m.name: np.array([[0.1, 0.1, 0.1]])
+        })
+        self.assertEqual(res[1].shape, (1, 3))
+        # The numbers in results were not calculated, this is just a smoke test.
+        self.assertAllClose(res[0], [[0.154605, 0.154605, 0.154605]])
+
+  def testEmbeddingWrapper(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 1], dtype=dtypes.int32)
+        m = array_ops.zeros([1, 2])
+        embedding_cell = legacy_rnn_cell.EmbeddingWrapper(
+            rnn_cell_impl.GRUCell(2), embedding_classes=3, embedding_size=2)
+        self.assertEqual(embedding_cell.output_size, 2)
+        g, new_m = embedding_cell(x, m)
+        sess.run([variables.global_variables_initializer()])
+        res = sess.run([g, new_m], {
+            x.name: np.array([[1]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
+        self.assertEqual(res[1].shape, (1, 2))
+        # The numbers in results were not calculated, this is just a smoke test.
+        self.assertAllClose(res[0], [[0.17139, 0.17139]])
+
+  def testEmbeddingWrapperWithDynamicRnn(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope("root"):
+        inputs = ops.convert_to_tensor([[[0], [0]]], dtype=dtypes.int64)
+        input_lengths = ops.convert_to_tensor([2], dtype=dtypes.int64)
+        embedding_cell = legacy_rnn_cell.EmbeddingWrapper(
+            rnn_cell_impl.BasicLSTMCell(1, state_is_tuple=True),
+            embedding_classes=1,
+            embedding_size=2)
+        outputs, _ = rnn.dynamic_rnn(
+            cell=embedding_cell,
+            inputs=inputs,
+            sequence_length=input_lengths,
+            dtype=dtypes.float32)
+        sess.run([variables.global_variables_initializer()])
+        # This will fail if output's dtype is inferred from input's.
+        sess.run(outputs)
+
+  def testSRUCell(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 2])
+        m = array_ops.zeros([1, 2])
+        g, _ = contrib_rnn_cell.SRUCell(2)(x, m)
+        sess.run([variables.global_variables_initializer()])
+        res = sess.run([g], {
+            x.name: np.array([[1., 1.]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
+        # Smoke test
+        self.assertAllClose(res[0], [[0.509682, 0.509682]])
+
+  def testSRUCellKerasRNN(self):
+    """Tests that SRUCell works with keras RNN layer."""
+    cell = contrib_rnn_cell.SRUCell(10)
+    seq_input = ops.convert_to_tensor(
+        np.random.rand(2, 3, 5), name="seq_input", dtype=dtypes.float32)
+    rnn_layer = keras_layers.RNN(cell=cell)
+    rnn_outputs_keras = rnn_layer(seq_input)
+    with self.cached_session() as sess:
+      sess.run([variables.global_variables_initializer()])
+      self.assertEqual(sess.run(rnn_outputs_keras).shape, (2, 10))
+
+  def testSRUCellBiasType(self):
+    """Tests that the bias' dtype is properly set."""
+    cell = contrib_rnn_cell.SRUCell(10)
+    cell.build((2, 3, 5))
+    self.assertEqual(cell._bias.dtype, dtypes.float32_ref)
+
+    cell = contrib_rnn_cell.SRUCell(10, dtype=dtypes.int32)
+    cell.build((2, 3, 5))
+    self.assertEqual(cell._bias.dtype, dtypes.int32_ref)
+
+    cell_input = ops.convert_to_tensor(
+        np.random.rand(2, 5), name="cell_input", dtype=dtypes.float16)
+    cell_state = ops.convert_to_tensor(
+        np.random.rand(2, 10), name="cell_state", dtype=dtypes.float16)
+    cell = contrib_rnn_cell.SRUCell(10)
+    cell(cell_input, [cell_state])
+    self.assertEqual(cell._bias.dtype, dtypes.float16_ref)
+
+  def testSRUCellWithDiffSize(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 3])
+        m = array_ops.zeros([1, 2])
+        g, _ = contrib_rnn_cell.SRUCell(2)(x, m)
+        sess.run([variables.global_variables_initializer()])
+        res = sess.run([g], {
+            x.name: np.array([[1., 1., 1.]]),
+            m.name: np.array([[0.1, 0.1]])
+        })
+        # Smoke test
+        self.assertAllClose(res[0], [[0.55255556, 0.55255556]])
+
   def testCoupledInputForgetGateLSTMCell(self):
     with self.cached_session() as sess:
       num_units = 2
@@ -763,6 +1054,17 @@ class RNNCellTest(test.TestCase):
         self.assertEqual(new_h.shape[1], num_proj)
         self.assertAllClose(np.concatenate(res[1], axis=1), expected_state)
 
+  @test_util.run_in_graph_and_eager_modes
+  def testNASCellKerasRNN(self):
+    """Tests that NASCell works with keras RNN layer."""
+    cell = contrib_rnn_cell.NASCell(10)
+    seq_input = ops.convert_to_tensor(
+        np.random.rand(2, 3, 5), name="seq_input", dtype=dtypes.float32)
+    rnn_layer = keras_layers.RNN(cell=cell)
+    rnn_outputs = rnn_layer(seq_input)
+    self.evaluate([variables.global_variables_initializer()])
+    self.assertEqual(self.evaluate(rnn_outputs).shape, (2, 10))
+
   def testUGRNNCell(self):
     num_units = 2
     batch_size = 3
diff --git a/tensorflow/contrib/rnn/python/ops/rnn.py b/tensorflow/contrib/rnn/python/ops/rnn.py
index 0266b72dcb15e4aba01a9a31b4be75c5b84d44da..41b1698321e20f4360d75fa2db79f9bd8a806cea 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn.py
@@ -131,7 +131,8 @@ def stack_bidirectional_dynamic_rnn(cells_fw,
                                     sequence_length=None,
                                     parallel_iterations=None,
                                     time_major=False,
-                                    scope=None):
+                                    scope=None,
+                                    swap_memory=False):
   """Creates a dynamic bidirectional recurrent neural network.
 
   Stacks several bidirectional rnn layers. The combined forward and backward
@@ -171,6 +172,10 @@ def stack_bidirectional_dynamic_rnn(cells_fw,
       data is batch-major, so by default this function accepts input and emits
       output in batch-major form.
     scope: VariableScope for the created subgraph; defaults to None.
+    swap_memory: Transparently swap the tensors produced in forward inference
+      but needed for back prop from GPU to CPU.  This allows training RNNs
+      which would typically not fit on a single GPU, with very minimal (or no)
+      performance penalty.
 
   Returns:
     A tuple (outputs, output_state_fw, output_state_bw) where:
@@ -230,6 +235,7 @@ def stack_bidirectional_dynamic_rnn(cells_fw,
             sequence_length=sequence_length,
             parallel_iterations=parallel_iterations,
             dtype=dtype,
+            swap_memory=swap_memory,
             time_major=time_major)
         # Concat the outputs to create the new input.
         prev_layer = array_ops.concat(outputs, 2)
diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
index 8a1c09f171e6108174671e3122d5ff4c0b236003..d25afc8b9c4381fb3b0092ef21f46646353e1b8e 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
@@ -1462,7 +1462,7 @@ class LayerNormBasicLSTMCell(rnn_cell_impl.RNNCell):
     return new_h, new_state
 
 
-class NASCell(rnn_cell_impl.RNNCell):
+class NASCell(rnn_cell_impl.LayerRNNCell):
   """Neural Architecture Search (NAS) recurrent network cell.
 
   This implements the recurrent cell from the paper:
@@ -1475,23 +1475,28 @@ class NASCell(rnn_cell_impl.RNNCell):
   The class uses an optional projection layer.
   """
 
-  def __init__(self, num_units, num_proj=None, use_biases=False, reuse=None):
+  # NAS cell's architecture base.
+  _NAS_BASE = 8
+
+  def __init__(self, num_units, num_proj=None, use_bias=False, reuse=None,
+               **kwargs):
     """Initialize the parameters for a NAS cell.
 
     Args:
-      num_units: int, The number of units in the NAS cell
+      num_units: int, The number of units in the NAS cell.
       num_proj: (optional) int, The output dimensionality for the projection
         matrices.  If None, no projection is performed.
-      use_biases: (optional) bool, If True then use biases within the cell. This
+      use_bias: (optional) bool, If True then use biases within the cell. This
         is False by default.
       reuse: (optional) Python boolean describing whether to reuse variables
         in an existing scope.  If not `True`, and the existing scope already has
         the given variables, an error is raised.
+      **kwargs: Additional keyword arguments.
     """
-    super(NASCell, self).__init__(_reuse=reuse)
+    super(NASCell, self).__init__(_reuse=reuse, **kwargs)
     self._num_units = num_units
     self._num_proj = num_proj
-    self._use_biases = use_biases
+    self._use_bias = use_bias
     self._reuse = reuse
 
     if num_proj is not None:
@@ -1509,6 +1514,33 @@ class NASCell(rnn_cell_impl.RNNCell):
   def output_size(self):
     return self._output_size
 
+  def build(self, inputs_shape):
+    input_size = tensor_shape.dimension_value(
+        tensor_shape.TensorShape(inputs_shape).with_rank(2)[1])
+    if input_size is None:
+      raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
+
+    num_proj = self._num_units if self._num_proj is None else self._num_proj
+
+    # Variables for the NAS cell. `recurrent_kernel` is all matrices multiplying
+    # the hiddenstate and `kernel` is all matrices multiplying the inputs.
+    self.recurrent_kernel = self.add_variable(
+        "recurrent_kernel", [num_proj, self._NAS_BASE * self._num_units])
+    self.kernel = self.add_variable(
+        "kernel", [input_size, self._NAS_BASE * self._num_units])
+
+    if self._use_bias:
+      self.bias = self.add_variable("bias",
+                                    shape=[self._NAS_BASE * self._num_units],
+                                    initializer=init_ops.zeros_initializer)
+
+    # Projection layer if specified
+    if self._num_proj is not None:
+      self.projection_weights = self.add_variable(
+          "projection_weights", [self._num_units, self._num_proj])
+
+    self.built = True
+
   def call(self, inputs, state):
     """Run one step of NAS Cell.
 
@@ -1535,38 +1567,20 @@ class NASCell(rnn_cell_impl.RNNCell):
     tanh = math_ops.tanh
     relu = nn_ops.relu
 
-    num_proj = self._num_units if self._num_proj is None else self._num_proj
-
     (c_prev, m_prev) = state
 
-    dtype = inputs.dtype
-    input_size = inputs.get_shape().with_rank(2).dims[1]
-    if input_size.value is None:
-      raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
-    # Variables for the NAS cell. W_m is all matrices multiplying the
-    # hiddenstate and W_inputs is all matrices multiplying the inputs.
-    concat_w_m = vs.get_variable("recurrent_kernel",
-                                 [num_proj, 8 * self._num_units], dtype)
-    concat_w_inputs = vs.get_variable(
-        "kernel", [input_size.value, 8 * self._num_units], dtype)
-
-    m_matrix = math_ops.matmul(m_prev, concat_w_m)
-    inputs_matrix = math_ops.matmul(inputs, concat_w_inputs)
-
-    if self._use_biases:
-      b = vs.get_variable(
-          "bias",
-          shape=[8 * self._num_units],
-          initializer=init_ops.zeros_initializer(),
-          dtype=dtype)
-      m_matrix = nn_ops.bias_add(m_matrix, b)
+    m_matrix = math_ops.matmul(m_prev, self.recurrent_kernel)
+    inputs_matrix = math_ops.matmul(inputs, self.kernel)
+
+    if self._use_bias:
+      m_matrix = nn_ops.bias_add(m_matrix, self.bias)
 
     # The NAS cell branches into 8 different splits for both the hiddenstate
     # and the input
     m_matrix_splits = array_ops.split(
-        axis=1, num_or_size_splits=8, value=m_matrix)
+        axis=1, num_or_size_splits=self._NAS_BASE, value=m_matrix)
     inputs_matrix_splits = array_ops.split(
-        axis=1, num_or_size_splits=8, value=inputs_matrix)
+        axis=1, num_or_size_splits=self._NAS_BASE, value=inputs_matrix)
 
     # First layer
     layer1_0 = sigmoid(inputs_matrix_splits[0] + m_matrix_splits[0])
@@ -1598,9 +1612,7 @@ class NASCell(rnn_cell_impl.RNNCell):
 
     # Projection layer if specified
     if self._num_proj is not None:
-      concat_w_proj = vs.get_variable("projection_weights",
-                                      [self._num_units, self._num_proj], dtype)
-      new_m = math_ops.matmul(new_m, concat_w_proj)
+      new_m = math_ops.matmul(new_m, self.projection_weights)
 
     new_state = rnn_cell_impl.LSTMStateTuple(new_c, new_m)
     return new_m, new_state
@@ -2071,7 +2083,7 @@ class ConvLSTMCell(rnn_cell_impl.RNNCell):
       conv_ndims: Convolution dimensionality (1, 2 or 3).
       input_shape: Shape of the input as int tuple, excluding the batch size.
       output_channels: int, number of output channels of the conv LSTM.
-      kernel_shape: Shape of kernel as in tuple (of size 1,2 or 3).
+      kernel_shape: Shape of kernel as an int tuple (of size 1, 2 or 3).
       use_bias: (bool) Use bias in convolutions.
       skip_connection: If set to `True`, concatenate the input to the
         output of the conv LSTM. Default: `False`.
@@ -2092,7 +2104,7 @@ class ConvLSTMCell(rnn_cell_impl.RNNCell):
     self._conv_ndims = conv_ndims
     self._input_shape = input_shape
     self._output_channels = output_channels
-    self._kernel_shape = kernel_shape
+    self._kernel_shape = list(kernel_shape)
     self._use_bias = use_bias
     self._forget_bias = forget_bias
     self._skip_connection = skip_connection
@@ -2172,7 +2184,7 @@ def _conv(args, filter_size, num_features, bias, bias_start=0.0):
   Args:
     args: a Tensor or a list of Tensors of dimension 3D, 4D or 5D,
     batch x n, Tensors.
-    filter_size: int tuple of filter height and width.
+    filter_size: int tuple of filter shape (of size 1, 2 or 3).
     num_features: int, number of features.
     bias: Whether to use biases in the convolution layer.
     bias_start: starting value to initialize the bias; 0 by default.
@@ -2744,10 +2756,12 @@ class SRUCell(rnn_cell_impl.LayerRNNCell):
     name: (optional) String, the name of the layer. Layers with the same name
       will share weights, but to avoid mistakes we require reuse=True in such
       cases.
+    **kwargs: Additional keyword arguments.
   """
 
-  def __init__(self, num_units, activation=None, reuse=None, name=None):
-    super(SRUCell, self).__init__(_reuse=reuse, name=name)
+  def __init__(self, num_units, activation=None, reuse=None, name=None,
+               **kwargs):
+    super(SRUCell, self).__init__(_reuse=reuse, name=name, **kwargs)
     self._num_units = num_units
     self._activation = activation or math_ops.tanh
 
@@ -2777,7 +2791,7 @@ class SRUCell(rnn_cell_impl.LayerRNNCell):
     self._bias = self.add_variable(
         rnn_cell_impl._BIAS_VARIABLE_NAME,  # pylint: disable=protected-access
         shape=[2 * self._num_units],
-        initializer=init_ops.constant_initializer(0.0, dtype=self.dtype))
+        initializer=init_ops.zeros_initializer)
 
     self._built = True
 
@@ -3139,7 +3153,7 @@ class IndyGRUCell(rnn_cell_impl.LayerRNNCell):
   r"""Independently Gated Recurrent Unit cell.
 
   Based on IndRNNs (https://arxiv.org/abs/1803.04831) and similar to GRUCell,
-  yet with the \(U_r\), \(U_z\), and \(U\) matrices in equations 5, 6, and
+  yet with the \\(U_r\\), \\(U_z\\), and \\(U\\) matrices in equations 5, 6, and
   8 of http://arxiv.org/abs/1406.1078 respectively replaced by diagonal
   matrices, i.e. a Hadamard product with a single vector:
 
@@ -3150,12 +3164,10 @@ class IndyGRUCell(rnn_cell_impl.LayerRNNCell):
     $$\tilde{h}^{(t)}_j = \phi\left([\mathbf W \mathbf x]_j +
       [\mathbf u \circ \mathbf r \circ \mathbf h_{(t-1)}]_j\right)$$
 
-  where \(\circ\) denotes the Hadamard operator. This means that each IndyGRU
+  where \\(\circ\\) denotes the Hadamard operator. This means that each IndyGRU
   node sees only its own state, as opposed to seeing all states in the same
   layer.
 
-  TODO(gonnet): Write a paper describing this and add a reference here.
-
   Args:
     num_units: int, The number of units in the GRU cell.
     activation: Nonlinearity to use.  Default: `tanh`.
@@ -3240,7 +3252,7 @@ class IndyGRUCell(rnn_cell_impl.LayerRNNCell):
     self.built = True
 
   def call(self, inputs, state):
-    """Gated recurrent unit (GRU) with nunits cells."""
+    """Recurrently independent Gated Recurrent Unit (GRU) with nunits cells."""
 
     gate_inputs = math_ops.matmul(inputs, self._gate_kernel_w) + (
         gen_array_ops.tile(state, [1, 2]) * self._gate_kernel_u)
@@ -3264,10 +3276,9 @@ class IndyLSTMCell(rnn_cell_impl.LayerRNNCell):
   r"""Basic IndyLSTM recurrent network cell.
 
   Based on IndRNNs (https://arxiv.org/abs/1803.04831) and similar to
-  BasicLSTMCell, yet with the \(U_f\), \(U_i\), \(U_o\) and \(U_c\)
-  matrices in
-  https://en.wikipedia.org/wiki/Long_short-term_memory#LSTM_with_a_forget_gate
-  replaced by diagonal matrices, i.e. a Hadamard product with a single vector:
+  BasicLSTMCell, yet with the \\(U_f\\), \\(U_i\\), \\(U_o\\) and \\(U_c\\)
+  matrices in the regular LSTM equations replaced by diagonal matrices, i.e. a
+  Hadamard product with a single vector:
 
     $$f_t = \sigma_g\left(W_f x_t + u_f \circ h_{t-1} + b_f\right)$$
     $$i_t = \sigma_g\left(W_i x_t + u_i \circ h_{t-1} + b_i\right)$$
@@ -3275,8 +3286,8 @@ class IndyLSTMCell(rnn_cell_impl.LayerRNNCell):
     $$c_t = f_t \circ c_{t-1} +
             i_t \circ \sigma_c\left(W_c x_t + u_c \circ h_{t-1} + b_c\right)$$
 
-  where \(\circ\) denotes the Hadamard operator. This means that each IndyLSTM
-  node sees only its own state \(h\) and \(c\), as opposed to seeing all
+  where \\(\circ\\) denotes the Hadamard operator. This means that each IndyLSTM
+  node sees only its own state \\(h\\) and \\(c\\), as opposed to seeing all
   states in the same layer.
 
   We add forget_bias (default: 1) to the biases of the forget gate in order to
@@ -3284,11 +3295,6 @@ class IndyLSTMCell(rnn_cell_impl.LayerRNNCell):
 
   It does not allow cell clipping, a projection layer, and does not
   use peep-hole connections: it is the basic baseline.
-
-  For advanced models, please use the full `tf.nn.rnn_cell.LSTMCell`
-  that follows.
-
-  TODO(gonnet): Write a paper describing this and add a reference here.
   """
 
   def __init__(self,
diff --git a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py
index 3fc6bfbb4d03a39906d4441e48b2788423caa234..d8ab9eba7049e468b373a1641f92dc781aa22558 100644
--- a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py
+++ b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test.py
@@ -61,10 +61,7 @@ class RpcOpTest(test.TestCase, rpc_op_test_base.RpcOpTestBase):
     self._server = server
 
   def tearDown(self):
-    # TODO(ebrevdo): Figure out why this sometimes times out.
-    #    self._service.ExitLoop()
-    #    self._service_thread.join()
-    # self._server.stop()
+    self._server.stop(grace=None)
     super(RpcOpTest, self).tearDown()
 
 
diff --git a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py
index 0d615923e04915a8429252317025ac8e79f9bb4e..d6148715be91c78e6e5a99fc0f3caa905b5c1a7d 100644
--- a/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py
+++ b/tensorflow/contrib/rpc/python/kernel_tests/rpc_op_test_base.py
@@ -176,7 +176,9 @@ class RpcOpTestBase(object):
       expected_message_values = np.where(
           status_code_values == errors.INVALID_ARGUMENT,
           I_WARNED_YOU.encode('ascii'), b'')
-      self.assertAllEqual(expected_message_values, status_message_values)
+      for msg, expected in zip(status_message_values, expected_message_values):
+        self.assertTrue(expected in msg,
+                        '"%s" did not contain "%s"' % (msg, expected))
 
   def testVecHostPortRpc(self):
     with self.cached_session() as sess:
diff --git a/tensorflow/contrib/saved_model/BUILD b/tensorflow/contrib/saved_model/BUILD
index 269443b2c6508bb618d30f64487b1a6a84e8646f..f0242a3b40fd566ec0f477d462426d5f550d1620 100644
--- a/tensorflow/contrib/saved_model/BUILD
+++ b/tensorflow/contrib/saved_model/BUILD
@@ -84,35 +84,6 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:lib",
-        "//tensorflow/python:metrics",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:saver",
-        "//tensorflow/python:util",
-        "//tensorflow/python/estimator:estimator_py",
-        "//tensorflow/python/keras:engine",
-        "//tensorflow/python/saved_model",
-    ],
-)
-
-py_test(
-    name = "keras_saved_model_test",
-    size = "medium",
-    srcs = ["python/saved_model/keras_saved_model_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_oss",  # TODO(b/119349471): Re-enable
-        "no_windows",
-    ],
-    deps = [
-        ":keras_saved_model",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:training",
-        "//tensorflow/python/estimator:estimator_py",
         "//tensorflow/python/keras",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
     ],
 )
diff --git a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py
index ffba514bb96f5ce8d963cb0a0482738eafe88355..a61e9579b84a60d74b73e45a6100a2c772d9cff8 100644
--- a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py
+++ b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py
@@ -18,348 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
-import six
+from tensorflow.python.keras import saving
 
-from tensorflow.python.client import session
-from tensorflow.python.estimator import keras as estimator_keras_util
-from tensorflow.python.estimator import model_fn as model_fn_lib
-from tensorflow.python.estimator.export import export as export_helpers
-from tensorflow.python.framework import ops
-from tensorflow.python.keras import backend as K
-from tensorflow.python.keras import models as models_lib
-from tensorflow.python.keras import optimizers
-from tensorflow.python.keras.engine import sequential
-from tensorflow.python.keras.metrics import Metric
-from tensorflow.python.keras.models import model_from_json
-from tensorflow.python.lib.io import file_io
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.saved_model import builder as saved_model_builder
-from tensorflow.python.saved_model import constants
-from tensorflow.python.saved_model import utils_impl as saved_model_utils
-from tensorflow.python.training import saver as saver_lib
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
-from tensorflow.python.util import compat
 
-
-def save_keras_model(
-    model, saved_model_path, custom_objects=None, as_text=None):
-  """Save a `tf.keras.Model` into Tensorflow SavedModel format.
-
-  `save_model` generates new files/folders under the `saved_model_path` folder:
-  1) an asset folder containing the json string of the model's
-     configuration (topology).
-  2) a checkpoint containing the model weights.
-  3) a saved_model.pb file containing the model's MetaGraphs. The prediction
-     graph is always exported. The evaluaton and training graphs are exported
-     if the following conditions are met:
-     - Evaluation: model loss is defined.
-     - Training: model is compiled with an optimizer defined under `tf.train`.
-       This is because `tf.keras.optimizers.Optimizer` instances cannot be
-       saved to checkpoints.
-
-  Model Requirements:
-  - Model must be a sequential model or functional model. Subclassed models can
-    not be saved via this function, unless you provide an implementation for
-    get_config() and from_config().
-  - All variables must be saveable by the model. In general, this condition is
-    met through the use of layers defined in the keras library. However,
-    there is currently a bug with variables created in Lambda layer functions
-    not being saved correctly (see
-    https://github.com/keras-team/keras/issues/9740).
-
-  Note that each mode is exported in separate graphs, so different modes do not
-  share variables. To use the train graph with evaluation or prediction graphs,
-  create a new checkpoint if variable values have been updated.
-
-  Example:
-
-  ```python
-  import tensorflow as tf
-
-  # Create a tf.keras model.
-  model = tf.keras.Sequential()
-  model.add(tf.keras.layers.Dense(1, input_shape=[10]))
-  model.summary()
-
-  # Save the tf.keras model in the SavedModel format.
-  saved_to_path = tf.contrib.saved_model.save_keras_model(
-        model, '/tmp/my_simple_tf_keras_saved_model')
-
-  # Load the saved keras model back.
-  model_prime = tf.contrib.saved_model.load_keras_model(saved_to_path)
-  model_prime.summary()
-  ```
-
-  Args:
-    model: A `tf.keras.Model` to be saved.
-    saved_model_path: a string specifying the path to the SavedModel directory.
-      The SavedModel will be saved to a timestamped folder created within this
-      directory.
-    custom_objects: Optional dictionary mapping string names to custom classes
-      or functions (e.g. custom loss functions).
-    as_text: whether to write the `SavedModel` proto in text format.
-
-  Returns:
-    String path to the SavedModel folder, a subdirectory of `saved_model_path`.
-
-  Raises:
-    NotImplementedError: If the model is a subclassed model.
-    ValueError: If a Sequential model does not have input shapes defined by the
-      user, and is not built.
-  """
-  if not model._is_graph_network:
-    if isinstance(model, sequential.Sequential):
-      # If input shape is not directly set in the model, the exported model
-      # will assume that the inputs have the same shape as the shape the model
-      # was built model with.
-      if not model.built:
-        raise ValueError(
-            'Sequential model must be built before it can be exported.')
-    else:
-      raise NotImplementedError(
-          'Exporting subclassed models is not yet supported.')
-
-  export_dir = export_helpers.get_timestamped_export_dir(saved_model_path)
-  temp_export_dir = export_helpers.get_temp_export_dir(export_dir)
-
-  builder = saved_model_builder._SavedModelBuilder(temp_export_dir)
-
-  # Manually save variables to export them in an object-based checkpoint. This
-  # skips the `builder.add_meta_graph_and_variables()` step, which saves a
-  # named-based checkpoint.
-  # TODO(b/113134168): Add fn to Builder to save with object-based saver.
-  # TODO(b/113178242): This should only export the model json structure. Only
-  # one save is needed once the weights can be copied from the model to clone.
-  checkpoint_path = _export_model_json_and_variables(model, temp_export_dir)
-
-  # Export each mode. Use ModeKeys enums defined for `Estimator` to ensure that
-  # Keras models and `Estimator`s are exported with the same format.
-  # Every time a mode is exported, the code checks to see if new variables have
-  # been created (e.g. optimizer slot variables). If that is the case, the
-  # checkpoint is re-saved to include the new variables.
-  export_args = {'builder': builder,
-                 'model': model,
-                 'custom_objects': custom_objects,
-                 'checkpoint_path': checkpoint_path}
-
-  has_saved_vars = False
-  if model.optimizer:
-    if isinstance(model.optimizer, optimizers.TFOptimizer):
-      _export_mode(model_fn_lib.ModeKeys.TRAIN, has_saved_vars, **export_args)
-      has_saved_vars = True
-      _export_mode(model_fn_lib.ModeKeys.EVAL, has_saved_vars, **export_args)
-    else:
-      logging.warning(
-          'Model was compiled with an optimizer, but the optimizer is not from '
-          '`tf.train` (e.g. `tf.train.AdagradOptimizer`). Only the serving '
-          'graph was exported. The train and evaluate graphs were not added to '
-          'the SavedModel.')
-  _export_mode(model_fn_lib.ModeKeys.PREDICT, has_saved_vars, **export_args)
-
-  builder.save(as_text)
-
-  gfile.Rename(temp_export_dir, export_dir)
-  return export_dir
-
-
-def _export_model_json_and_variables(model, saved_model_path):
-  """Save model variables and json structure into SavedModel subdirectories."""
-  # Save model configuration as a json string under assets folder.
-  model_json = model.to_json()
-  model_json_filepath = os.path.join(
-      saved_model_utils.get_or_create_assets_dir(saved_model_path),
-      compat.as_text(constants.SAVED_MODEL_FILENAME_JSON))
-  file_io.write_string_to_file(model_json_filepath, model_json)
-
-  # Save model weights in checkpoint format under variables folder.
-  saved_model_utils.get_or_create_variables_dir(saved_model_path)
-  checkpoint_prefix = saved_model_utils.get_variables_path(saved_model_path)
-  model.save_weights(checkpoint_prefix, save_format='tf', overwrite=True)
-  return checkpoint_prefix
-
-
-def _get_var_list(model):
-  """Return list of all checkpointed saveable objects in the model."""
-  return checkpointable_utils.named_saveables(model)
-
-
-def _export_mode(
-    mode, has_saved_vars, builder, model, custom_objects, checkpoint_path):
-  """Export a model, and optionally save new vars from the clone model.
-
-  Args:
-    mode: A `tf.estimator.ModeKeys` string.
-    has_saved_vars: A `boolean` indicating whether the SavedModel has already
-      exported variables.
-    builder: A `SavedModelBuilder` object.
-    model: A `tf.keras.Model` object.
-    custom_objects: A dictionary mapping string names to custom classes
-      or functions.
-    checkpoint_path: String path to checkpoint.
-
-  Raises:
-    ValueError: If the train/eval mode is being exported, but the model does
-      not have an optimizer.
-  """
-  compile_clone = (mode != model_fn_lib.ModeKeys.PREDICT)
-  if compile_clone and not model.optimizer:
-    raise ValueError(
-        'Model does not have an optimizer. Cannot export mode %s' % mode)
-
-  model_graph = ops.get_default_graph()
-  with ops.Graph().as_default() as g:
-
-    K.set_learning_phase(mode == model_fn_lib.ModeKeys.TRAIN)
-
-    # Clone the model into blank graph. This will create placeholders for inputs
-    # and targets.
-    clone = models_lib.clone_and_build_model(
-        model, custom_objects=custom_objects, compile_clone=compile_clone)
-
-    # Make sure that iterations variable is added to the global step collection,
-    # to ensure that, when the SavedModel graph is loaded, the iterations
-    # variable is returned by `tf.train.get_global_step()`. This is required for
-    # compatibility with the SavedModelEstimator.
-    if compile_clone:
-      g.add_to_collection(ops.GraphKeys.GLOBAL_STEP, clone.optimizer.iterations)
-
-    # Extract update and train ops from train/test/predict functions.
-    train_op = None
-    if mode == model_fn_lib.ModeKeys.TRAIN:
-      clone._make_train_function()
-      train_op = clone.train_function.updates_op
-    elif mode == model_fn_lib.ModeKeys.EVAL:
-      clone._make_test_function()
-    else:
-      clone._make_predict_function()
-    g.get_collection_ref(ops.GraphKeys.UPDATE_OPS).extend(clone.state_updates)
-
-    clone_var_list = checkpointable_utils.named_saveables(clone)
-
-    with session.Session().as_default():
-      if has_saved_vars:
-        # Confirm all variables in the clone have an entry in the checkpoint.
-        status = clone.load_weights(checkpoint_path)
-        status.assert_existing_objects_matched()
-      else:
-        # Confirm that variables between the clone and model match up exactly,
-        # not counting optimizer objects. Optimizer objects are ignored because
-        # if the model has not trained, the slot variables will not have been
-        # created yet.
-        # TODO(b/113179535): Replace with checkpointable equivalence.
-        _assert_same_non_optimizer_objects(model, model_graph, clone, g)
-
-        # TODO(b/113178242): Use value transfer for checkpointable objects.
-        clone.load_weights(checkpoint_path)
-
-        # Add graph and variables to SavedModel.
-        # TODO(b/113134168): Switch to add_meta_graph_and_variables.
-        clone.save_weights(checkpoint_path, save_format='tf', overwrite=True)
-        builder._has_saved_variables = True
-
-    # Add graph to the SavedModel builder.
-    builder.add_meta_graph(
-        model_fn_lib.EXPORT_TAG_MAP[mode],
-        signature_def_map=_create_signature_def_map(clone, mode),
-        saver=saver_lib.Saver(clone_var_list),
-        init_op=variables.local_variables_initializer(),
-        train_op=train_op)
-    return None
-
-
-def _create_signature_def_map(model, mode):
-  """Create a SignatureDef map from a Keras model."""
-  inputs_dict = {name: x for name, x in zip(model.input_names, model.inputs)}
-  if model.optimizer:
-    targets_dict = {x.name.split(':')[0]: x
-                    for x in model.targets if x is not None}
-    inputs_dict.update(targets_dict)
-  outputs_dict = {name: x
-                  for name, x in zip(model.output_names, model.outputs)}
-  metrics = estimator_keras_util._convert_keras_metrics_to_estimator(model)
-
-  # Add metric variables to the `LOCAL_VARIABLES` collection. Metric variables
-  # are by default not added to any collections. We are doing this here, so
-  # that metric variables get initialized.
-  local_vars = set(ops.get_collection(ops.GraphKeys.LOCAL_VARIABLES))
-  vars_to_add = set()
-  if metrics is not None:
-    for key, value in six.iteritems(metrics):
-      if isinstance(value, Metric):
-        vars_to_add.update(value.variables)
-        # Convert Metric instances to (value_tensor, update_op) tuple.
-        metrics[key] = (value.result(), value.updates[0])
-  # Remove variables that are in the local variables collection already.
-  vars_to_add = vars_to_add.difference(local_vars)
-  for v in vars_to_add:
-    ops.add_to_collection(ops.GraphKeys.LOCAL_VARIABLES, v)
-
-  export_outputs = model_fn_lib.export_outputs_for_mode(
-      mode,
-      predictions=outputs_dict,
-      loss=model.total_loss if model.optimizer else None,
-      metrics=metrics)
-  return export_helpers.build_all_signature_defs(
-      inputs_dict,
-      export_outputs=export_outputs,
-      serving_only=(mode == model_fn_lib.ModeKeys.PREDICT))
-
-
-def _assert_same_non_optimizer_objects(model, model_graph, clone, clone_graph):  # pylint: disable=unused-argument
-  """Assert model and clone contain the same checkpointable objects."""
-
-  # TODO(fchollet, kathywu): make sure this works in eager mode.
-  return True
-
-
-def load_keras_model(saved_model_path):
-  """Load a keras.Model from SavedModel.
-
-  load_model reinstantiates model state by:
-  1) loading model topology from json (this will eventually come
-     from metagraph).
-  2) loading model weights from checkpoint.
-
-  Example:
-
-  ```python
-  import tensorflow as tf
-
-  # Create a tf.keras model.
-  model = tf.keras.Sequential()
-  model.add(tf.keras.layers.Dense(1, input_shape=[10]))
-  model.summary()
-
-  # Save the tf.keras model in the SavedModel format.
-  saved_to_path = tf.contrib.saved_model.save_keras_model(
-        model, '/tmp/my_simple_tf_keras_saved_model')
-
-  # Load the saved keras model back.
-  model_prime = tf.contrib.saved_model.load_keras_model(saved_to_path)
-  model_prime.summary()
-  ```
-
-  Args:
-    saved_model_path: a string specifying the path to an existing SavedModel.
-
-  Returns:
-    a keras.Model instance.
-  """
-  # restore model topology from json string
-  model_json_filepath = os.path.join(
-      compat.as_bytes(saved_model_path),
-      compat.as_bytes(constants.ASSETS_DIRECTORY),
-      compat.as_bytes(constants.SAVED_MODEL_FILENAME_JSON))
-  model_json = file_io.read_file_to_string(model_json_filepath)
-  model = model_from_json(model_json)
-
-  # restore model weights
-  checkpoint_prefix = os.path.join(
-      compat.as_text(saved_model_path),
-      compat.as_text(constants.VARIABLES_DIRECTORY),
-      compat.as_text(constants.VARIABLES_FILENAME))
-  model.load_weights(checkpoint_prefix)
-  return model
+# TODO(kathywu): Remove all contrib callers, switch to tf.keras.
+save_keras_model = saving.export_saved_model
+load_keras_model = saving.load_from_saved_model
diff --git a/tensorflow/contrib/seq2seq/BUILD b/tensorflow/contrib/seq2seq/BUILD
index 18b56cd21942e28cb0dc3210df0bb04d55c1e16f..8e2ce82294287dda07d2067c5b9f012f510dbd08 100644
--- a/tensorflow/contrib/seq2seq/BUILD
+++ b/tensorflow/contrib/seq2seq/BUILD
@@ -33,7 +33,6 @@ tf_custom_op_py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":beam_search_ops",
-        "//tensorflow/contrib/distributions:distributions_py",
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/contrib/rnn:rnn_py",
         "//tensorflow/contrib/util:util_py",
@@ -59,7 +58,6 @@ tf_custom_op_py_library(
         "//tensorflow/python:tensor_util",
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
-        "//tensorflow/python/ops/distributions",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -141,6 +139,27 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "basic_decoder_v2_test",
+    size = "medium",
+    srcs = ["python/kernel_tests/basic_decoder_v2_test.py"],
+    additional_deps = [
+        ":seq2seq_py",
+        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/contrib/layers:layers_py",
+        "//tensorflow/contrib/rnn:rnn_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:rnn",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+    ],
+)
+
 cuda_py_test(
     name = "beam_search_ops_test",
     size = "medium",
@@ -175,6 +194,27 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "decoder_v2_test",
+    size = "medium",
+    srcs = ["python/kernel_tests/decoder_v2_test.py"],
+    additional_deps = [
+        ":seq2seq_py",
+        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/contrib/layers:layers_py",
+        "//tensorflow/contrib/rnn:rnn_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:rnn",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+    ],
+)
+
 cuda_py_test(
     name = "beam_search_decoder_test",
     size = "medium",
@@ -215,3 +255,19 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
 )
+
+cuda_py_test(
+    name = "attention_wrapper_v2_test",
+    size = "medium",
+    srcs = ["python/kernel_tests/attention_wrapper_v2_test.py"],
+    additional_deps = [
+        ":seq2seq_py",
+        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:variables",
+    ],
+    shard_count = 4,
+)
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
index 922f21b98b35dfff19c8c605a25e89c5d2da8d98..1a5692f7b5be5e87b78dac9d1ae51f280ca089f8 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
@@ -35,6 +35,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import rnn
 from tensorflow.python.ops import rnn_cell
 from tensorflow.python.ops import variables
 from tensorflow.python.ops import variable_scope as vs
@@ -357,7 +358,7 @@ class AttentionWrapperTest(test.TestCase):
         rnn_output=ResultSummary(
             shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.00597103),
         sample_id=ResultSummary(
-            shape=(5, 3), dtype=dtype('int32'), mean=1.6))
+            shape=(5, 3), dtype=dtype('int32'), mean=1.4))
     expected_final_state = AttentionWrapperState(
         cell_state=LSTMStateTuple(
             c=ResultSummary(
@@ -386,7 +387,7 @@ class AttentionWrapperTest(test.TestCase):
         rnn_output=ResultSummary(
             shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.0052615386),
         sample_id=ResultSummary(
-            shape=(5, 3), dtype=dtype('int32'), mean=1.3333333333))
+            shape=(5, 3), dtype=dtype('int32'), mean=1.4))
     expected_final_state = AttentionWrapperState(
         cell_state=LSTMStateTuple(
             c=ResultSummary(
@@ -453,7 +454,7 @@ class AttentionWrapperTest(test.TestCase):
         rnn_output=ResultSummary(
             shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.0052615386),
         sample_id=ResultSummary(
-            shape=(5, 3), dtype=dtype('int32'), mean=1.3333333333333333))
+            shape=(5, 3), dtype=dtype('int32'), mean=1.4))
     expected_final_state = AttentionWrapperState(
         cell_state=LSTMStateTuple(
             c=ResultSummary(
@@ -695,7 +696,7 @@ class AttentionWrapperTest(test.TestCase):
         rnn_output=ResultSummary(
             shape=(5, 3, 6), dtype=dtype('float32'), mean=-0.0025896581),
         sample_id=ResultSummary(
-            shape=(5, 3), dtype=dtype('int32'), mean=1.6))
+            shape=(5, 3), dtype=dtype('int32'), mean=1.73333333))
     expected_final_state = AttentionWrapperState(
         cell_state=LSTMStateTuple(
             c=ResultSummary(
@@ -706,12 +707,12 @@ class AttentionWrapperTest(test.TestCase):
             shape=(5, 6), dtype=dtype('float32'), mean=-0.00069823361),
         time=3,
         alignments=ResultSummary(
-            shape=(5, 8), dtype=dtype('float32'), mean=0.028698336),
+            shape=(5, 8), dtype=dtype('float32'), mean=0.029914695),
         attention_state=ResultSummary(
-            shape=(5, 8), dtype=dtype('float32'), mean=0.028698336),
+            shape=(5, 8), dtype=dtype('float32'), mean=0.029914695),
         alignment_history=())
     expected_final_alignment_history = ResultSummary(
-        shape=(3, 5, 8), dtype=dtype('float32'), mean=0.04865776002407074)
+        shape=(3, 5, 8), dtype=dtype('float32'), mean=0.0465225502849)
 
     self._testWithAttention(
         create_attention_mechanism,
@@ -920,9 +921,9 @@ class AttentionWrapperTest(test.TestCase):
 
     expected_final_output = BasicDecoderOutput(
         rnn_output=ResultSummary(
-            shape=(5, 3, 20), dtype=dtype('float32'), mean=0.11723966),
+            shape=(5, 3, 20), dtype=dtype('float32'), mean=0.115853324533),
         sample_id=ResultSummary(
-            shape=(5, 3), dtype=dtype('int32'), mean=7.266666666666667))
+            shape=(5, 3), dtype=dtype('int32'), mean=8.6))
     expected_final_state = AttentionWrapperState(
         cell_state=LSTMStateTuple(
             c=ResultSummary(
@@ -930,7 +931,7 @@ class AttentionWrapperTest(test.TestCase):
             h=ResultSummary(
                 shape=(5, 9), dtype=dtype('float32'), mean=-0.0018327223)),
         attention=ResultSummary(
-            shape=(5, 20), dtype=dtype('float32'), mean=0.11601614207),
+            shape=(5, 20), dtype=dtype('float32'), mean=0.11462739855),
         time=3,
         alignments=(ResultSummary(
             shape=(5, 8), dtype=dtype('float32'), mean=0.125),
@@ -992,5 +993,67 @@ class AttentionWrapperTest(test.TestCase):
         expected_final_alignment_history=expected_final_alignment_history,
         name='testMultiAttention')
 
+  def testCustomizedAttention(self):
+    batch_size = 2
+    max_time = 3
+    num_units = 2
+    memory = constant_op.constant([[[1., 1.], [2., 2.], [3., 3.]],
+                                   [[4., 4.], [5., 5.], [6., 6.]]])
+    memory_sequence_length = constant_op.constant([3, 2])
+    attention_mechanism = wrapper.BahdanauAttention(num_units, memory,
+                                                    memory_sequence_length)
+
+    # Sets all returned values to be all ones.
+    def _customized_attention(unused_attention_mechanism, unused_cell_output,
+                              unused_attention_state, unused_attention_layer):
+      """Customized attention.
+
+      Returns:
+        attention: `Tensor` of shape [batch_size, num_units], attention output.
+        alignments: `Tensor` of shape [batch_size, max_time], sigma value for
+          each input memory (prob. function of input keys).
+        next_attention_state: A `Tensor` representing the next state for the
+          attention.
+      """
+      attention = array_ops.ones([batch_size, num_units])
+      alignments = array_ops.ones([batch_size, max_time])
+      next_attention_state = alignments
+      return attention, alignments, next_attention_state
+
+    attention_cell = wrapper.AttentionWrapper(
+        rnn_cell.LSTMCell(2),
+        attention_mechanism,
+        attention_layer_size=None,  # don't use attention layer.
+        output_attention=False,
+        alignment_history=(),
+        attention_fn=_customized_attention,
+        name='attention')
+    self.assertEqual(num_units, attention_cell.output_size)
+
+    initial_state = attention_cell.zero_state(
+        batch_size=2, dtype=dtypes.float32)
+    source_input_emb = array_ops.ones([2, 3, 2])
+    source_input_length = constant_op.constant([3, 2])
+
+    # 'state' is a tuple of
+    # (cell_state, h, attention, alignments, alignment_history, attention_state)
+    output, state = rnn.dynamic_rnn(
+        attention_cell,
+        inputs=source_input_emb,
+        sequence_length=source_input_length,
+        initial_state=initial_state,
+        dtype=dtypes.float32)
+
+    with self.session() as sess:
+      sess.run(variables.global_variables_initializer())
+      output_value, state_value = sess.run([output, state], feed_dict={})
+      self.assertAllEqual(np.array([2, 3, 2]), output_value.shape)
+      self.assertAllClose(np.array([[1., 1.], [1., 1.]]), state_value.attention)
+      self.assertAllClose(
+          np.array([[1., 1., 1.], [1., 1., 1.]]), state_value.alignments)
+      self.assertAllClose(
+          np.array([[1., 1., 1.], [1., 1., 1.]]), state_value.attention_state)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_v2_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_v2_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ee01f66f165bd2ac22cae10807f24f6b97f0c64
--- /dev/null
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_v2_test.py
@@ -0,0 +1,745 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for contrib.seq2seq.python.ops.attention_wrapper."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.contrib.seq2seq.python.ops import attention_wrapper as wrapper
+from tensorflow.contrib.seq2seq.python.ops import basic_decoder
+from tensorflow.contrib.seq2seq.python.ops import sampler as sampler_py
+from tensorflow.python import keras
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras import initializers
+from tensorflow.python.ops import rnn_cell
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.util import nest
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class AttentionMechanismTest(test.TestCase, parameterized.TestCase):
+
+  def setUp(self):
+    super(AttentionMechanismTest, self).setUp()
+    self.batch = 10
+    self.timestep = 5
+    self.memory_size = 6
+    self.units = 8
+
+    self.memory = np.random.randn(self.batch, self.timestep,
+                                  self.memory_size).astype(np.float32)
+    self.query = np.random.randn(self.batch, self.units).astype(np.float32)
+    self.state = np.random.randn(self.batch, self.timestep).astype(np.float32)
+
+  @parameterized.named_parameters(
+      ("luong", wrapper.LuongAttentionV2),
+      ("luong_monotonic", wrapper.LuongMonotonicAttentionV2),
+      ("bahdanau", wrapper.BahdanauAttentionV2),
+      ("bahdanau_monotonic", wrapper.BahdanauMonotonicAttentionV2),
+  )
+  def test_attention_shape_inference(self, attention_cls):
+    attention = attention_cls(self.units, self.memory)
+    attention_score = attention([self.query, self.state])
+    self.assertLen(attention_score, 2)
+    self.assertEqual(attention_score[0].shape, (self.batch, self.timestep))
+    self.assertEqual(attention_score[1].shape, (self.batch, self.timestep))
+
+  @parameterized.named_parameters(
+      ("luong", wrapper.LuongAttentionV2),
+      ("luong_monotonic", wrapper.LuongMonotonicAttentionV2),
+      ("bahdanau", wrapper.BahdanauAttentionV2),
+      ("bahdanau_monotonic", wrapper.BahdanauMonotonicAttentionV2),
+  )
+  def test_get_config(self, attention_cls):
+    attention = attention_cls(self.units, self.memory)
+    config = attention.get_config()
+
+    attention_from_config = attention_cls.from_config(config)
+    config_from_clone = attention_from_config.get_config()
+
+    self.assertDictEqual(config, config_from_clone)
+
+  @parameterized.named_parameters(
+      ("luong", wrapper.LuongAttentionV2),
+      ("luong_monotonic", wrapper.LuongMonotonicAttentionV2),
+      ("bahdanau", wrapper.BahdanauAttentionV2),
+      ("bahdanau_monotonic", wrapper.BahdanauMonotonicAttentionV2),
+  )
+  def test_layer_output(self, attention_cls):
+    attention = attention_cls(self.units, self.memory)
+    score = attention([self.query, self.state])
+    self.evaluate(variables.variables_initializer(attention.variables))
+
+    score_val = self.evaluate(score)
+    self.assertLen(score_val, 2)
+    self.assertEqual(score_val[0].shape, (self.batch, self.timestep))
+    self.assertEqual(score_val[1].shape, (self.batch, self.timestep))
+
+  @parameterized.named_parameters(
+      ("luong", wrapper.LuongAttentionV2),
+      ("luong_monotonic", wrapper.LuongMonotonicAttentionV2),
+      ("bahdanau", wrapper.BahdanauAttentionV2),
+      ("bahdanau_monotonic", wrapper.BahdanauMonotonicAttentionV2),
+  )
+  def test_passing_memory_from_call(self, attention_cls):
+    attention = attention_cls(self.units, self.memory)
+    weights_before_query = attention.get_weights()
+    ref_score = attention([self.query, self.state])
+
+    self.evaluate(variables.global_variables_initializer())
+    ref_score_val = self.evaluate(ref_score)
+
+    all_weights = attention.get_weights()
+    config = attention.get_config()
+    # Simulate the twice invocation of calls here.
+    attention_from_config = attention_cls.from_config(config)
+    attention_from_config.build(self.memory.shape)
+    attention_from_config.set_weights(weights_before_query)
+    attention_from_config(self.memory, setup_memory=True)
+    attention_from_config.build([self.query.shape, self.state.shape])
+    attention_from_config.set_weights(all_weights)
+    score = attention_from_config([self.query, self.state])
+
+    score_val = self.evaluate(score)
+    self.assertAllClose(ref_score_val, score_val)
+
+  @parameterized.named_parameters(
+      ("luong", wrapper.LuongAttentionV2),
+      ("luong_monotonic", wrapper.LuongMonotonicAttentionV2),
+      ("bahdanau", wrapper.BahdanauAttentionV2),
+      ("bahdanau_monotonic", wrapper.BahdanauMonotonicAttentionV2),
+  )
+  def test_save_load_layer(self, attention_cls):
+    vocab = 20
+    embedding_dim = 6
+    inputs = keras.layers.Input(shape=[self.timestep])
+    encoder_input = keras.layers.Embedding(
+        vocab, embedding_dim, mask_zero=True)(
+            inputs)
+    encoder_output = keras.layers.UnifiedLSTM(
+        self.memory_size, return_sequences=True)(
+            encoder_input)
+
+    attention = attention_cls(self.units, encoder_output)
+    query = keras.layers.Input(shape=[self.units])
+    state = keras.layers.Input(shape=[self.timestep])
+
+    score = attention([query, state])
+
+    x = np.random.randint(vocab, size=(self.batch, self.timestep))
+    x_test = np.random.randint(vocab, size=(self.batch, self.timestep))
+    y = np.random.randn(self.batch, self.timestep)
+    model = keras.models.Model([inputs, query, state], score)
+    model.compile("rmsprop", "mse")
+    model.fit([x, self.query, self.state], (y, y))
+    y_ref = model.predict_on_batch([x_test, self.query, self.state])
+
+    config = model.get_config()
+    weights = model.get_weights()
+    loaded_model = keras.models.Model.from_config(
+        config, custom_objects={attention_cls.__name__: attention_cls})
+    loaded_model.set_weights(weights)
+
+    y = loaded_model.predict_on_batch([x_test, self.query, self.state])
+
+    self.assertAllClose(y_ref, y)
+
+  # TODO(scottzhu): Add tests for model.compile(run_eagerly=True)
+
+
+class ResultSummary(
+    collections.namedtuple("ResultSummary", ("shape", "dtype", "mean"))):
+  pass
+
+
+def get_result_summary(x):
+  if isinstance(x, np.ndarray):
+    return ResultSummary(x.shape, x.dtype, x.mean())
+  return x
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class AttentionWrapperV2Test(test.TestCase, parameterized.TestCase):
+
+  def assertAllCloseOrEqual(self, x, y, **kwargs):
+    if isinstance(x, np.ndarray) or isinstance(x, float):
+      return super(AttentionWrapperV2Test, self).assertAllClose(
+          x, y, atol=1e-3, **kwargs)
+    else:
+      self.assertAllEqual(x, y, **kwargs)
+
+  def setUp(self):
+    super(AttentionWrapperV2Test, self).setUp()
+    self.batch = 64
+    self.units = 128
+    self.encoder_timestep = 10
+    self.encoder_dim = 256
+    self.decoder_timestep = 12
+    self.encoder_outputs = np.random.randn(self.batch, self.encoder_timestep,
+                                           self.encoder_dim)
+    self.encoder_sequence_length = np.random.randint(
+        self.encoder_timestep, size=(self.batch,)).astype(np.int32)
+    self.decoder_inputs = np.random.randn(self.batch, self.decoder_timestep,
+                                          self.units)
+    self.decoder_sequence_length = np.random.randint(
+        self.decoder_timestep, size=(self.batch,)).astype(np.int32)
+
+  def _testWithAttention(self,
+                         create_attention_mechanism,
+                         expected_final_output,
+                         expected_final_state,
+                         attention_mechanism_depth=3,
+                         alignment_history=False,
+                         expected_final_alignment_history=None,
+                         attention_layer_size=6,
+                         attention_layer=None,
+                         create_query_layer=False,
+                         create_memory_layer=True,
+                         create_attention_kwargs=None):
+    attention_layer_sizes = ([attention_layer_size]
+                             if attention_layer_size is not None else None)
+    attention_layers = ([attention_layer]
+                        if attention_layer is not None else None)
+    self._testWithMaybeMultiAttention(
+        is_multi=False,
+        create_attention_mechanisms=[create_attention_mechanism],
+        expected_final_output=expected_final_output,
+        expected_final_state=expected_final_state,
+        attention_mechanism_depths=[attention_mechanism_depth],
+        alignment_history=alignment_history,
+        expected_final_alignment_history=expected_final_alignment_history,
+        attention_layer_sizes=attention_layer_sizes,
+        attention_layers=attention_layers,
+        create_query_layer=create_query_layer,
+        create_memory_layer=create_memory_layer,
+        create_attention_kwargs=create_attention_kwargs)
+
+  def _testWithMaybeMultiAttention(self,
+                                   is_multi,
+                                   create_attention_mechanisms,
+                                   expected_final_output,
+                                   expected_final_state,
+                                   attention_mechanism_depths,
+                                   alignment_history=False,
+                                   expected_final_alignment_history=None,
+                                   attention_layer_sizes=None,
+                                   attention_layers=None,
+                                   create_query_layer=False,
+                                   create_memory_layer=True,
+                                   create_attention_kwargs=None):
+    # Allow is_multi to be True with a single mechanism to enable test for
+    # passing in a single mechanism in a list.
+    assert len(create_attention_mechanisms) == 1 or is_multi
+    encoder_sequence_length = [3, 2, 3, 1, 1]
+    decoder_sequence_length = [2, 0, 1, 2, 3]
+    batch_size = 5
+    encoder_max_time = 8
+    decoder_max_time = 4
+    input_depth = 7
+    encoder_output_depth = 10
+    cell_depth = 9
+    create_attention_kwargs = create_attention_kwargs or {}
+
+    if attention_layer_sizes is not None:
+      # Compute sum of attention_layer_sizes. Use encoder_output_depth if None.
+      attention_depth = sum(attention_layer_size or encoder_output_depth
+                            for attention_layer_size in attention_layer_sizes)
+    elif attention_layers is not None:
+      # Compute sum of attention_layers output depth.
+      attention_depth = sum(
+          attention_layer.compute_output_shape(
+              [batch_size, cell_depth + encoder_output_depth]).dims[-1].value
+          for attention_layer in attention_layers)
+    else:
+      attention_depth = encoder_output_depth * len(create_attention_mechanisms)
+
+    decoder_inputs = np.random.randn(batch_size, decoder_max_time,
+                                     input_depth).astype(np.float32)
+    encoder_outputs = np.random.randn(batch_size, encoder_max_time,
+                                      encoder_output_depth).astype(np.float32)
+
+    attention_mechanisms = []
+    for creator, depth in zip(create_attention_mechanisms,
+                              attention_mechanism_depths):
+      # Create a memory layer with deterministic initializer to avoid randomness
+      # in the test between graph and eager.
+      if create_query_layer:
+        create_attention_kwargs["query_layer"] = keras.layers.Dense(
+            depth, kernel_initializer="ones", use_bias=False)
+      if create_memory_layer:
+        create_attention_kwargs["memory_layer"] = keras.layers.Dense(
+            depth, kernel_initializer="ones", use_bias=False)
+
+      attention_mechanisms.append(
+          creator(
+              units=depth,
+              memory=encoder_outputs,
+              memory_sequence_length=encoder_sequence_length,
+              **create_attention_kwargs))
+
+    with self.cached_session(use_gpu=True):
+      attention_layer_size = attention_layer_sizes
+      attention_layer = attention_layers
+      if not is_multi:
+        if attention_layer_size is not None:
+          attention_layer_size = attention_layer_size[0]
+        if attention_layer is not None:
+          attention_layer = attention_layer[0]
+      cell = rnn_cell.LSTMCell(cell_depth, initializer="ones")
+      cell = wrapper.AttentionWrapper(
+          cell,
+          attention_mechanisms if is_multi else attention_mechanisms[0],
+          attention_layer_size=attention_layer_size,
+          alignment_history=alignment_history,
+          attention_layer=attention_layer)
+      # Set the attention_layer within AttentionWrapper to have deterministic
+      # kernel initializer, for testing purpose.
+      if cell._attention_layers is not None:
+        for layer in cell._attention_layers:
+          if getattr(layer, "kernel_initializer") is None:
+            layer.kernel_initializer = initializers.ones()
+
+      sampler = sampler_py.TrainingSampler()
+      my_decoder = basic_decoder.BasicDecoderV2(cell=cell, sampler=sampler)
+      initial_state = cell.zero_state(
+          dtype=dtypes.float32, batch_size=batch_size)
+      final_outputs, final_state, _ = my_decoder(
+          decoder_inputs,
+          initial_state=initial_state,
+          sequence_length=decoder_sequence_length)
+
+      self.assertIsInstance(final_outputs, basic_decoder.BasicDecoderOutput)
+      self.assertIsInstance(final_state, wrapper.AttentionWrapperState)
+      self.assertIsInstance(final_state.cell_state, rnn_cell.LSTMStateTuple)
+
+      expected_time = (
+          expected_final_state.time if context.executing_eagerly() else None)
+      self.assertEqual((batch_size, expected_time, attention_depth),
+                       tuple(final_outputs.rnn_output.get_shape().as_list()))
+      self.assertEqual((batch_size, expected_time),
+                       tuple(final_outputs.sample_id.get_shape().as_list()))
+
+      self.assertEqual((batch_size, attention_depth),
+                       tuple(final_state.attention.get_shape().as_list()))
+      self.assertEqual((batch_size, cell_depth),
+                       tuple(final_state.cell_state.c.get_shape().as_list()))
+      self.assertEqual((batch_size, cell_depth),
+                       tuple(final_state.cell_state.h.get_shape().as_list()))
+
+      if alignment_history:
+        if is_multi:
+          state_alignment_history = []
+          for history_array in final_state.alignment_history:
+            history = history_array.stack()
+            self.assertEqual((expected_time, batch_size, encoder_max_time),
+                             tuple(history.get_shape().as_list()))
+            state_alignment_history.append(history)
+          state_alignment_history = tuple(state_alignment_history)
+        else:
+          state_alignment_history = final_state.alignment_history.stack()
+          self.assertEqual((expected_time, batch_size, encoder_max_time),
+                           tuple(state_alignment_history.get_shape().as_list()))
+        nest.assert_same_structure(cell.state_size,
+                                   cell.zero_state(batch_size, dtypes.float32))
+        # Remove the history from final_state for purposes of the
+        # remainder of the tests.
+        final_state = final_state._replace(alignment_history=())  # pylint: disable=protected-access
+      else:
+        state_alignment_history = ()
+
+      self.evaluate(variables.global_variables_initializer())
+      eval_result = self.evaluate({
+          "final_outputs": final_outputs,
+          "final_state": final_state,
+          "state_alignment_history": state_alignment_history,
+      })
+
+      final_output_info = nest.map_structure(get_result_summary,
+                                             eval_result["final_outputs"])
+      final_state_info = nest.map_structure(get_result_summary,
+                                            eval_result["final_state"])
+      print("final_output_info: ", final_output_info)
+      print("final_state_info: ", final_state_info)
+
+      nest.map_structure(self.assertAllCloseOrEqual, expected_final_output,
+                         final_output_info)
+      nest.map_structure(self.assertAllCloseOrEqual, expected_final_state,
+                         final_state_info)
+      if alignment_history:  # by default, the wrapper emits attention as output
+        final_alignment_history_info = nest.map_structure(
+            get_result_summary, eval_result["state_alignment_history"])
+        print("final_alignment_history_info: ", final_alignment_history_info)
+        nest.map_structure(
+            self.assertAllCloseOrEqual,
+            # outputs are batch major but the stacked TensorArray is time major
+            expected_final_alignment_history,
+            final_alignment_history_info)
+
+  @parameterized.parameters([np.float16, np.float32, np.float64])
+  def _testBahdanauNormalizedDType(self, dtype):
+    encoder_outputs = self.encoder_outputs.astype(dtype)
+    decoder_inputs = self.decoder_inputs.astype(dtype)
+    attention_mechanism = wrapper.BahdanauAttentionV2(
+        units=self.units,
+        memory=encoder_outputs,
+        memory_sequence_length=self.encoder_sequence_length,
+        normalize=True,
+        dtype=dtype)
+    cell = rnn_cell.LSTMCell(self.units)
+    cell = wrapper.AttentionWrapper(cell, attention_mechanism)
+
+    sampler = sampler_py.TrainingSampler()
+    my_decoder = basic_decoder.BasicDecoderV2(cell=cell, sampler=sampler)
+
+    final_outputs, final_state, _ = my_decoder(
+        decoder_inputs,
+        initial_state=cell.zero_state(dtype=dtype, batch_size=self.batch),
+        sequence_length=self.decoder_sequence_length)
+    self.assertIsInstance(final_outputs, basic_decoder.BasicDecoderOutput)
+    self.assertEqual(final_outputs.rnn_output.dtype, dtype)
+    self.assertIsInstance(final_state, wrapper.AttentionWrapperState)
+    self.assertIsInstance(final_state.cell_state, rnn_cell.LSTMStateTuple)
+
+  @parameterized.parameters([np.float16, np.float32, np.float64])
+  def testLuongScaledDType(self, dtype):
+    # Test case for GitHub issue 18099
+    encoder_outputs = self.encoder_outputs.astype(dtype)
+    decoder_inputs = self.decoder_inputs.astype(dtype)
+    attention_mechanism = wrapper.LuongAttentionV2(
+        units=self.units,
+        memory=encoder_outputs,
+        memory_sequence_length=self.encoder_sequence_length,
+        scale=True,
+        dtype=dtype,
+    )
+    cell = rnn_cell.LSTMCell(self.units)
+    cell = wrapper.AttentionWrapper(cell, attention_mechanism)
+
+    sampler = sampler_py.TrainingSampler()
+    my_decoder = basic_decoder.BasicDecoderV2(cell=cell, sampler=sampler)
+
+    final_outputs, final_state, _ = my_decoder(
+        decoder_inputs,
+        initial_state=cell.zero_state(dtype=dtype, batch_size=self.batch),
+        sequence_length=self.decoder_sequence_length)
+    self.assertIsInstance(final_outputs, basic_decoder.BasicDecoderOutput)
+    self.assertEqual(final_outputs.rnn_output.dtype, dtype)
+    self.assertIsInstance(final_state, wrapper.AttentionWrapperState)
+    self.assertIsInstance(final_state.cell_state, rnn_cell.LSTMStateTuple)
+
+  def testBahdanauNotNormalized(self):
+    create_attention_mechanism = wrapper.BahdanauAttentionV2
+    create_attention_kwargs = {"kernel_initializer": "ones"}
+    expected_final_output = basic_decoder.BasicDecoderOutput(
+        rnn_output=ResultSummary(
+            shape=(5, 3, 6), dtype=np.dtype(np.float32), mean=4.8290324),
+        sample_id=ResultSummary(shape=(5, 3), dtype=np.dtype(np.int32), mean=0))
+    expected_final_state = wrapper.AttentionWrapperState(
+        cell_state=rnn_cell.LSTMStateTuple(
+            c=ResultSummary(
+                shape=(5, 9), dtype=np.dtype(np.float32), mean=1.6432636),
+            h=ResultSummary(
+                shape=(5, 9), dtype=np.dtype(np.float32), mean=0.75866824)),
+        attention=ResultSummary(
+            shape=(5, 6), dtype=np.dtype(np.float32), mean=6.7445569),
+        time=3,
+        alignments=ResultSummary(
+            shape=(5, 8), dtype=np.dtype(np.float32), mean=0.125),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=np.dtype(np.float32), mean=0.125),
+        alignment_history=())
+    expected_final_alignment_history = ResultSummary(
+        shape=(3, 5, 8), dtype=np.dtype(np.float32), mean=0.125)
+
+    self._testWithAttention(
+        create_attention_mechanism,
+        expected_final_output,
+        expected_final_state,
+        alignment_history=True,
+        create_query_layer=True,
+        expected_final_alignment_history=expected_final_alignment_history,
+        create_attention_kwargs=create_attention_kwargs)
+
+  def testBahdanauNormalized(self):
+    create_attention_mechanism = wrapper.BahdanauAttentionV2
+    create_attention_kwargs = {"kernel_initializer": "ones", "normalize": True}
+
+    expected_final_output = basic_decoder.BasicDecoderOutput(
+        rnn_output=ResultSummary(
+            shape=(5, 3, 6), dtype=np.dtype("float32"), mean=3.9548259),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=np.dtype("int32"), mean=0.0))
+    expected_final_state = wrapper.AttentionWrapperState(
+        cell_state=rnn_cell.LSTMStateTuple(
+            c=ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=1.4652209),
+            h=ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=0.70997983)),
+        attention=ResultSummary(
+            shape=(5, 6), dtype=np.dtype("float32"), mean=6.3075728),
+        time=3,
+        alignments=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.125),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.125),
+        alignment_history=())
+
+    self._testWithAttention(
+        create_attention_mechanism,
+        expected_final_output,
+        expected_final_state,
+        create_query_layer=True,
+        create_attention_kwargs=create_attention_kwargs)
+
+  def testLuongNotNormalized(self):
+    create_attention_mechanism = wrapper.LuongAttentionV2
+
+    expected_final_output = basic_decoder.BasicDecoderOutput(
+        rnn_output=ResultSummary(
+            shape=(5, 3, 6), dtype=np.dtype("float32"), mean=2.6605489),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=np.dtype("int32"), mean=0.0))
+    expected_final_state = wrapper.AttentionWrapperState(
+        cell_state=rnn_cell.LSTMStateTuple(
+            c=ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=0.88403547),
+            h=ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=0.37819088)),
+        attention=ResultSummary(
+            shape=(5, 6), dtype=np.dtype("float32"), mean=4.084631),
+        time=3,
+        alignments=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.125),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.125),
+        alignment_history=())
+
+    self._testWithAttention(
+        create_attention_mechanism,
+        expected_final_output,
+        expected_final_state,
+        attention_mechanism_depth=9)
+
+  def testLuongScaled(self):
+    create_attention_mechanism = wrapper.LuongAttentionV2
+    create_attention_kwargs = {"scale": True}
+
+    expected_final_output = basic_decoder.BasicDecoderOutput(
+        rnn_output=ResultSummary(
+            shape=(5, 3, 6), dtype=np.dtype("float32"), mean=2.6605489),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=np.dtype("int32"), mean=0.0))
+    expected_final_state = wrapper.AttentionWrapperState(
+        cell_state=rnn_cell.LSTMStateTuple(
+            c=ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=0.88403547),
+            h=ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=0.37819088)),
+        attention=ResultSummary(
+            shape=(5, 6), dtype=np.dtype("float32"), mean=4.0846314),
+        time=3,
+        alignments=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.125),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.125),
+        alignment_history=())
+
+    self._testWithAttention(
+        create_attention_mechanism,
+        expected_final_output,
+        expected_final_state,
+        attention_mechanism_depth=9,
+        create_attention_kwargs=create_attention_kwargs)
+
+  def testNotUseAttentionLayer(self):
+    create_attention_mechanism = wrapper.BahdanauAttentionV2
+    create_attention_kwargs = {"kernel_initializer": "ones"}
+
+    expected_final_output = basic_decoder.BasicDecoderOutput(
+        rnn_output=ResultSummary(
+            shape=(5, 3, 10), dtype=np.dtype("float32"), mean=0.072406612),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=np.dtype("int32"), mean=3.86666666))
+    expected_final_state = wrapper.AttentionWrapperState(
+        cell_state=rnn_cell.LSTMStateTuple(
+            c=ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=1.032002),
+            h=ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=0.61177742)),
+        attention=ResultSummary(
+            shape=(5, 10), dtype=np.dtype("float32"), mean=0.011346335),
+        time=3,
+        alignments=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.125),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.125),
+        alignment_history=())
+
+    self._testWithAttention(
+        create_attention_mechanism,
+        expected_final_output,
+        expected_final_state,
+        attention_layer_size=None,
+        create_query_layer=True,
+        create_attention_kwargs=create_attention_kwargs)
+
+  def testBahdanauMonotonicNotNormalized(self):
+    create_attention_mechanism = wrapper.BahdanauMonotonicAttentionV2
+    create_attention_kwargs = {"kernel_initializer": "ones"}
+
+    expected_final_output = basic_decoder.BasicDecoderOutput(
+        rnn_output=ResultSummary(
+            shape=(5, 3, 6), dtype=np.dtype("float32"), mean=5.9850435),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=np.dtype("int32"), mean=0.0))
+    expected_final_state = wrapper.AttentionWrapperState(
+        cell_state=rnn_cell.LSTMStateTuple(
+            c=ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=1.6752492),
+            h=ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=0.76052248)),
+        attention=ResultSummary(
+            shape=(5, 6), dtype=np.dtype("float32"), mean=8.361186),
+        time=3,
+        alignments=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.10989678),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.10989678),
+        alignment_history=())
+    expected_final_alignment_history = ResultSummary(
+        shape=(3, 5, 8), dtype=np.dtype("float32"), mean=0.117412611)
+
+    self._testWithAttention(
+        create_attention_mechanism,
+        expected_final_output,
+        expected_final_state,
+        alignment_history=True,
+        expected_final_alignment_history=expected_final_alignment_history,
+        create_query_layer=True,
+        create_attention_kwargs=create_attention_kwargs)
+
+  def testBahdanauMonotonicNormalized(self):
+    create_attention_mechanism = wrapper.BahdanauMonotonicAttentionV2
+    create_attention_kwargs = {"kernel_initializer": "ones",
+                               "normalize": True}
+    expected_final_output = basic_decoder.BasicDecoderOutput(
+        rnn_output=ResultSummary(
+            shape=(5, 3, 6), dtype=np.dtype("float32"), mean=4.5706983),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=np.dtype("int32"), mean=0.0))
+    expected_final_state = wrapper.AttentionWrapperState(
+        cell_state=rnn_cell.LSTMStateTuple(
+            c=ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=1.6005473),
+            h=ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=0.77863038)),
+        attention=ResultSummary(
+            shape=(5, 6), dtype=np.dtype("float32"), mean=7.3326721),
+        time=3,
+        alignments=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.12258384),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.12258384),
+        alignment_history=())
+    expected_final_alignment_history = ResultSummary(
+        shape=(3, 5, 8), dtype=np.dtype("float32"), mean=0.12258384)
+
+    self._testWithAttention(
+        create_attention_mechanism,
+        expected_final_output,
+        expected_final_state,
+        alignment_history=True,
+        expected_final_alignment_history=expected_final_alignment_history,
+        create_query_layer=True,
+        create_attention_kwargs=create_attention_kwargs)
+
+  def testLuongMonotonicNotNormalized(self):
+    create_attention_mechanism = wrapper.LuongMonotonicAttentionV2
+
+    expected_final_output = basic_decoder.BasicDecoderOutput(
+        rnn_output=ResultSummary(
+            shape=(5, 3, 6), dtype=np.dtype("float32"), mean=3.159497),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=np.dtype("int32"), mean=0.0))
+    expected_final_state = wrapper.AttentionWrapperState(
+        cell_state=rnn_cell.LSTMStateTuple(
+            c=ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=1.072384),
+            h=ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=0.50331038)),
+        attention=ResultSummary(
+            shape=(5, 6), dtype=np.dtype("float32"), mean=5.3079605),
+        time=3,
+        alignments=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.11467695),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.11467695),
+        alignment_history=())
+    expected_final_alignment_history = ResultSummary(
+        shape=(3, 5, 8), dtype=np.dtype("float32"), mean=0.11899644)
+
+    self._testWithAttention(
+        create_attention_mechanism,
+        expected_final_output,
+        expected_final_state,
+        attention_mechanism_depth=9,
+        alignment_history=True,
+        expected_final_alignment_history=expected_final_alignment_history)
+
+  def testLuongMonotonicScaled(self):
+    create_attention_mechanism = wrapper.LuongMonotonicAttentionV2
+    create_attention_kwargs = {"scale": True}
+
+    expected_final_output = basic_decoder.BasicDecoderOutput(
+        rnn_output=ResultSummary(
+            shape=(5, 3, 6), dtype=np.dtype("float32"), mean=3.159497),
+        sample_id=ResultSummary(
+            shape=(5, 3), dtype=np.dtype("int32"), mean=0.0))
+    expected_final_state = wrapper.AttentionWrapperState(
+        cell_state=rnn_cell.LSTMStateTuple(
+            c=ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=1.072384),
+            h=ResultSummary(
+                shape=(5, 9), dtype=np.dtype("float32"), mean=0.50331038)),
+        attention=ResultSummary(
+            shape=(5, 6), dtype=np.dtype("float32"), mean=5.3079605),
+        time=3,
+        alignments=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.11467695),
+        attention_state=ResultSummary(
+            shape=(5, 8), dtype=np.dtype("float32"), mean=0.11467695),
+        alignment_history=())
+    expected_final_alignment_history = ResultSummary(
+        shape=(3, 5, 8), dtype=np.dtype("float32"), mean=0.11899644)
+
+    self._testWithAttention(
+        create_attention_mechanism,
+        expected_final_output,
+        expected_final_state,
+        attention_mechanism_depth=9,
+        alignment_history=True,
+        expected_final_alignment_history=expected_final_alignment_history,
+        create_attention_kwargs=create_attention_kwargs)
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py
index b7f9f3fb090356a1c8d2bfb5044712ff93e267ce..abcf71c61b6e6df9462bf06323b8b11d5cc0d9a8 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_test.py
@@ -34,8 +34,6 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import rnn_cell
 from tensorflow.python.ops import variables
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops.distributions import bernoulli
-from tensorflow.python.ops.distributions import categorical
 from tensorflow.python.platform import test
 # pylint: enable=g-import-not-at-top
 
@@ -517,7 +515,7 @@ class BasicDecoderTest(test.TestCase):
         vocabulary_size)
 
     # The sample function samples categorically from the logits.
-    sample_fn = lambda x: categorical.Categorical(logits=x).sample()
+    sample_fn = lambda x: helper_py.categorical_sample(logits=x)
     # The next inputs are a one-hot encoding of the sampled labels.
     next_inputs_fn = (
         lambda x: array_ops.one_hot(x, vocabulary_size, dtype=dtypes.float32))
@@ -599,7 +597,7 @@ class BasicDecoderTest(test.TestCase):
 
     # The sample function samples independent bernoullis from the logits.
     sample_fn = (
-        lambda x: bernoulli.Bernoulli(logits=x, dtype=dtypes.bool).sample())
+        lambda x: helper_py.bernoulli_sample(logits=x, dtype=dtypes.bool))
     # The next inputs are a one-hot encoding of the sampled labels.
     next_inputs_fn = math_ops.to_float
     end_fn = lambda sample_ids: sample_ids[:, end_token]
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_v2_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_v2_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2341ebb77ab6ecad1e979bc8bed0080128a804da
--- /dev/null
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/basic_decoder_v2_test.py
@@ -0,0 +1,670 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for contrib.seq2seq.python.seq2seq.basic_decoder_v2."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.contrib.seq2seq.python.ops import basic_decoder
+from tensorflow.contrib.seq2seq.python.ops import sampler as sampler_py
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.layers import core as layers_core
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import rnn_cell
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+@keras_parameterized.run_all_keras_modes
+class BasicDecoderTest(keras_parameterized.TestCase):
+  """Unit test for basic_decoder.BasicDecoderV2."""
+
+  @parameterized.named_parameters(
+      ("use_output_layer", True),
+      ("without_output_layer", False))
+  def testStepWithTrainingHelperOutputLayer(self, use_output_layer):
+    sequence_length = [3, 4, 3, 1, 0]
+    batch_size = 5
+    max_time = 8
+    input_depth = 7
+    cell_depth = 10
+    output_layer_depth = 3
+
+    with self.cached_session(use_gpu=True):
+      inputs = np.random.randn(batch_size, max_time,
+                               input_depth).astype(np.float32)
+      input_t = constant_op.constant(inputs)
+      cell = rnn_cell.LSTMCell(cell_depth)
+      sampler = sampler_py.TrainingSampler(time_major=False)
+      if use_output_layer:
+        output_layer = layers_core.Dense(output_layer_depth, use_bias=False)
+        expected_output_depth = output_layer_depth
+      else:
+        output_layer = None
+        expected_output_depth = cell_depth
+      initial_state = cell.zero_state(dtype=dtypes.float32,
+                                      batch_size=batch_size)
+      my_decoder = basic_decoder.BasicDecoderV2(
+          cell=cell,
+          sampler=sampler,
+          output_layer=output_layer)
+
+      (first_finished,
+       first_inputs,
+       first_state) = my_decoder.initialize(input_t,
+                                            initial_state=initial_state,
+                                            sequence_length=sequence_length)
+      output_size = my_decoder.output_size
+      output_dtype = my_decoder.output_dtype
+      self.assertEqual(
+          basic_decoder.BasicDecoderOutput(expected_output_depth,
+                                           tensor_shape.TensorShape([])),
+          output_size)
+      self.assertEqual(
+          basic_decoder.BasicDecoderOutput(dtypes.float32, dtypes.int32),
+          output_dtype)
+
+      (step_outputs, step_state, step_next_inputs,
+       step_finished) = my_decoder.step(
+           constant_op.constant(0), first_inputs, first_state)
+      batch_size_t = my_decoder.batch_size
+
+      self.assertTrue(isinstance(first_state, rnn_cell.LSTMStateTuple))
+      self.assertTrue(isinstance(step_state, rnn_cell.LSTMStateTuple))
+      self.assertTrue(
+          isinstance(step_outputs, basic_decoder.BasicDecoderOutput))
+      self.assertEqual((batch_size, expected_output_depth),
+                       step_outputs[0].get_shape())
+      self.assertEqual((batch_size,), step_outputs[1].get_shape())
+      self.assertEqual((batch_size, cell_depth), first_state[0].get_shape())
+      self.assertEqual((batch_size, cell_depth), first_state[1].get_shape())
+      self.assertEqual((batch_size, cell_depth), step_state[0].get_shape())
+      self.assertEqual((batch_size, cell_depth), step_state[1].get_shape())
+
+      if use_output_layer:
+        # The output layer was accessed
+        self.assertEqual(len(output_layer.variables), 1)
+
+      self.evaluate(variables.global_variables_initializer())
+      eval_result = self.evaluate({
+          "batch_size": batch_size_t,
+          "first_finished": first_finished,
+          "first_inputs": first_inputs,
+          "first_state": first_state,
+          "step_outputs": step_outputs,
+          "step_state": step_state,
+          "step_next_inputs": step_next_inputs,
+          "step_finished": step_finished
+      })
+
+      self.assertAllEqual([False, False, False, False, True],
+                          eval_result["first_finished"])
+      self.assertAllEqual([False, False, False, True, True],
+                          eval_result["step_finished"])
+      self.assertEqual(output_dtype.sample_id,
+                       eval_result["step_outputs"].sample_id.dtype)
+      self.assertAllEqual(
+          np.argmax(eval_result["step_outputs"].rnn_output, -1),
+          eval_result["step_outputs"].sample_id)
+
+  def DISABLED_testStepWithGreedyEmbeddingHelper(self):
+    batch_size = 5
+    vocabulary_size = 7
+    cell_depth = vocabulary_size  # cell's logits must match vocabulary size
+    input_depth = 10
+    start_tokens = np.random.randint(0, vocabulary_size, size=batch_size)
+    end_token = 1
+
+    with self.cached_session(use_gpu=True):
+      embeddings = np.random.randn(vocabulary_size,
+                                   input_depth).astype(np.float32)
+      embeddings_t = constant_op.constant(embeddings)
+      cell = rnn_cell.LSTMCell(vocabulary_size)
+      sampler = sampler_py.GreedyEmbeddingSampler()
+      initial_state = cell.zero_state(
+          dtype=dtypes.float32, batch_size=batch_size)
+      my_decoder = basic_decoder.BasicDecoderV2(
+          cell=cell,
+          sampler=sampler)
+      (first_finished, first_inputs, first_state) = my_decoder.initialize(
+          embeddings_t,
+          start_tokens=start_tokens,
+          end_token=end_token,
+          initial_state=initial_state)
+      output_size = my_decoder.output_size
+      output_dtype = my_decoder.output_dtype
+      self.assertEqual(
+          basic_decoder.BasicDecoderOutput(cell_depth,
+                                           tensor_shape.TensorShape([])),
+          output_size)
+      self.assertEqual(
+          basic_decoder.BasicDecoderOutput(dtypes.float32, dtypes.int32),
+          output_dtype)
+
+      (step_outputs, step_state, step_next_inputs,
+       step_finished) = my_decoder.step(
+           constant_op.constant(0), first_inputs, first_state)
+      batch_size_t = my_decoder.batch_size
+
+      self.assertTrue(isinstance(first_state, rnn_cell.LSTMStateTuple))
+      self.assertTrue(isinstance(step_state, rnn_cell.LSTMStateTuple))
+      self.assertTrue(
+          isinstance(step_outputs, basic_decoder.BasicDecoderOutput))
+      self.assertEqual((batch_size, cell_depth), step_outputs[0].get_shape())
+      self.assertEqual((batch_size,), step_outputs[1].get_shape())
+      self.assertEqual((batch_size, cell_depth), first_state[0].get_shape())
+      self.assertEqual((batch_size, cell_depth), first_state[1].get_shape())
+      self.assertEqual((batch_size, cell_depth), step_state[0].get_shape())
+      self.assertEqual((batch_size, cell_depth), step_state[1].get_shape())
+
+      self.evaluate(variables.global_variables_initializer())
+      eval_result = self.evaluate({
+          "batch_size": batch_size_t,
+          "first_finished": first_finished,
+          "first_inputs": first_inputs,
+          "first_state": first_state,
+          "step_outputs": step_outputs,
+          "step_state": step_state,
+          "step_next_inputs": step_next_inputs,
+          "step_finished": step_finished
+      })
+
+      expected_sample_ids = np.argmax(
+          eval_result["step_outputs"].rnn_output, -1)
+      expected_step_finished = (expected_sample_ids == end_token)
+      expected_step_next_inputs = embeddings[expected_sample_ids]
+      self.assertAllEqual([False, False, False, False, False],
+                          eval_result["first_finished"])
+      self.assertAllEqual(expected_step_finished, eval_result["step_finished"])
+      self.assertEqual(output_dtype.sample_id,
+                       eval_result["step_outputs"].sample_id.dtype)
+      self.assertAllEqual(expected_sample_ids,
+                          eval_result["step_outputs"].sample_id)
+      self.assertAllEqual(expected_step_next_inputs,
+                          eval_result["step_next_inputs"])
+
+  def testStepWithSampleEmbeddingHelper(self):
+    batch_size = 5
+    vocabulary_size = 7
+    cell_depth = vocabulary_size  # cell's logits must match vocabulary size
+    input_depth = 10
+    np.random.seed(0)
+    start_tokens = np.random.randint(0, vocabulary_size, size=batch_size)
+    end_token = 1
+
+    with self.cached_session(use_gpu=True):
+      embeddings = np.random.randn(vocabulary_size,
+                                   input_depth).astype(np.float32)
+      embeddings_t = constant_op.constant(embeddings)
+      cell = rnn_cell.LSTMCell(vocabulary_size)
+      sampler = sampler_py.SampleEmbeddingSampler(seed=0)
+      initial_state = cell.zero_state(
+          dtype=dtypes.float32, batch_size=batch_size)
+      my_decoder = basic_decoder.BasicDecoderV2(cell=cell, sampler=sampler)
+      (first_finished,
+       first_inputs,
+       first_state) = my_decoder.initialize(embeddings_t,
+                                            start_tokens=start_tokens,
+                                            end_token=end_token,
+                                            initial_state=initial_state)
+      output_size = my_decoder.output_size
+      output_dtype = my_decoder.output_dtype
+      self.assertEqual(
+          basic_decoder.BasicDecoderOutput(cell_depth,
+                                           tensor_shape.TensorShape([])),
+          output_size)
+      self.assertEqual(
+          basic_decoder.BasicDecoderOutput(dtypes.float32, dtypes.int32),
+          output_dtype)
+
+      (step_outputs, step_state, step_next_inputs,
+       step_finished) = my_decoder.step(
+           constant_op.constant(0), first_inputs, first_state)
+      batch_size_t = my_decoder.batch_size
+
+      self.assertTrue(isinstance(first_state, rnn_cell.LSTMStateTuple))
+      self.assertTrue(isinstance(step_state, rnn_cell.LSTMStateTuple))
+      self.assertTrue(
+          isinstance(step_outputs, basic_decoder.BasicDecoderOutput))
+      self.assertEqual((batch_size, cell_depth), step_outputs[0].get_shape())
+      self.assertEqual((batch_size,), step_outputs[1].get_shape())
+      self.assertEqual((batch_size, cell_depth), first_state[0].get_shape())
+      self.assertEqual((batch_size, cell_depth), first_state[1].get_shape())
+      self.assertEqual((batch_size, cell_depth), step_state[0].get_shape())
+      self.assertEqual((batch_size, cell_depth), step_state[1].get_shape())
+
+      self.evaluate(variables.global_variables_initializer())
+      eval_result = self.evaluate({
+          "batch_size": batch_size_t,
+          "first_finished": first_finished,
+          "first_inputs": first_inputs,
+          "first_state": first_state,
+          "step_outputs": step_outputs,
+          "step_state": step_state,
+          "step_next_inputs": step_next_inputs,
+          "step_finished": step_finished
+      })
+
+      sample_ids = eval_result["step_outputs"].sample_id
+      self.assertEqual(output_dtype.sample_id, sample_ids.dtype)
+      expected_step_finished = (sample_ids == end_token)
+      expected_step_next_inputs = embeddings[sample_ids]
+      self.assertAllEqual(expected_step_finished,
+                          eval_result["step_finished"])
+      self.assertAllEqual(expected_step_next_inputs,
+                          eval_result["step_next_inputs"])
+
+  def testStepWithScheduledEmbeddingTrainingHelper(self):
+    sequence_length = [3, 4, 3, 1, 0]
+    batch_size = 5
+    max_time = 8
+    input_depth = 7
+    vocabulary_size = 10
+
+    with self.cached_session(use_gpu=True):
+      inputs = np.random.randn(
+          batch_size, max_time, input_depth).astype(np.float32)
+      input_t = constant_op.constant(inputs)
+      embeddings = np.random.randn(
+          vocabulary_size, input_depth).astype(np.float32)
+      half = constant_op.constant(0.5)
+      cell = rnn_cell.LSTMCell(vocabulary_size)
+      sampler = sampler_py.ScheduledEmbeddingTrainingSampler(
+          sampling_probability=half,
+          time_major=False)
+      initial_state = cell.zero_state(
+          dtype=dtypes.float32, batch_size=batch_size)
+      my_decoder = basic_decoder.BasicDecoderV2(
+          cell=cell,
+          sampler=sampler)
+      (first_finished, first_inputs, first_state) = my_decoder.initialize(
+          input_t, sequence_length=sequence_length, embedding=embeddings,
+          initial_state=initial_state)
+      output_size = my_decoder.output_size
+      output_dtype = my_decoder.output_dtype
+      self.assertEqual(
+          basic_decoder.BasicDecoderOutput(vocabulary_size,
+                                           tensor_shape.TensorShape([])),
+          output_size)
+      self.assertEqual(
+          basic_decoder.BasicDecoderOutput(dtypes.float32, dtypes.int32),
+          output_dtype)
+
+      (step_outputs, step_state, step_next_inputs,
+       step_finished) = my_decoder.step(
+           constant_op.constant(0), first_inputs, first_state)
+      batch_size_t = my_decoder.batch_size
+
+      self.assertTrue(isinstance(first_state, rnn_cell.LSTMStateTuple))
+      self.assertTrue(isinstance(step_state, rnn_cell.LSTMStateTuple))
+      self.assertTrue(
+          isinstance(step_outputs, basic_decoder.BasicDecoderOutput))
+      self.assertEqual((batch_size, vocabulary_size),
+                       step_outputs[0].get_shape())
+      self.assertEqual((batch_size,), step_outputs[1].get_shape())
+      self.assertEqual((batch_size, vocabulary_size),
+                       first_state[0].get_shape())
+      self.assertEqual((batch_size, vocabulary_size),
+                       first_state[1].get_shape())
+      self.assertEqual((batch_size, vocabulary_size),
+                       step_state[0].get_shape())
+      self.assertEqual((batch_size, vocabulary_size),
+                       step_state[1].get_shape())
+      self.assertEqual((batch_size, input_depth),
+                       step_next_inputs.get_shape())
+
+      self.evaluate(variables.global_variables_initializer())
+      eval_result = self.evaluate({
+          "batch_size": batch_size_t,
+          "first_finished": first_finished,
+          "first_inputs": first_inputs,
+          "first_state": first_state,
+          "step_outputs": step_outputs,
+          "step_state": step_state,
+          "step_next_inputs": step_next_inputs,
+          "step_finished": step_finished
+      })
+
+      self.assertAllEqual([False, False, False, False, True],
+                          eval_result["first_finished"])
+      self.assertAllEqual([False, False, False, True, True],
+                          eval_result["step_finished"])
+      sample_ids = eval_result["step_outputs"].sample_id
+      self.assertEqual(output_dtype.sample_id, sample_ids.dtype)
+      batch_where_not_sampling = np.where(sample_ids == -1)
+      batch_where_sampling = np.where(sample_ids > -1)
+      self.assertAllClose(
+          eval_result["step_next_inputs"][batch_where_sampling],
+          embeddings[sample_ids[batch_where_sampling]])
+      self.assertAllClose(
+          eval_result["step_next_inputs"][batch_where_not_sampling],
+          np.squeeze(inputs[batch_where_not_sampling, 1], axis=0))
+
+  def _testStepWithScheduledOutputTrainingHelper(
+      self, sampling_probability, use_next_inputs_fn, use_auxiliary_inputs):
+    sequence_length = [3, 4, 3, 1, 0]
+    batch_size = 5
+    max_time = 8
+    input_depth = 7
+    cell_depth = input_depth
+    if use_auxiliary_inputs:
+      auxiliary_input_depth = 4
+      auxiliary_inputs = np.random.randn(
+          batch_size, max_time, auxiliary_input_depth).astype(np.float32)
+    else:
+      auxiliary_inputs = None
+
+    with self.cached_session(use_gpu=True):
+      inputs = np.random.randn(batch_size, max_time,
+                               input_depth).astype(np.float32)
+      input_t = constant_op.constant(inputs)
+      cell = rnn_cell.LSTMCell(cell_depth)
+      sampling_probability = constant_op.constant(sampling_probability)
+
+      if use_next_inputs_fn:
+        def next_inputs_fn(outputs):
+          # Use deterministic function for test.
+          samples = math_ops.argmax(outputs, axis=1)
+          return array_ops.one_hot(samples, cell_depth, dtype=dtypes.float32)
+      else:
+        next_inputs_fn = None
+
+      sampler = sampler_py.ScheduledOutputTrainingSampler(
+          sampling_probability=sampling_probability,
+          time_major=False,
+          next_inputs_fn=next_inputs_fn)
+      initial_state = cell.zero_state(
+          dtype=dtypes.float32, batch_size=batch_size)
+
+      my_decoder = basic_decoder.BasicDecoderV2(
+          cell=cell,
+          sampler=sampler)
+
+      (first_finished,
+       first_inputs,
+       first_state) = my_decoder.initialize(input_t,
+                                            sequence_length=sequence_length,
+                                            initial_state=initial_state,
+                                            auxiliary_inputs=auxiliary_inputs)
+      output_size = my_decoder.output_size
+      output_dtype = my_decoder.output_dtype
+      self.assertEqual(
+          basic_decoder.BasicDecoderOutput(cell_depth,
+                                           tensor_shape.TensorShape([])),
+          output_size)
+      self.assertEqual(
+          basic_decoder.BasicDecoderOutput(dtypes.float32, dtypes.int32),
+          output_dtype)
+
+      (step_outputs, step_state, step_next_inputs,
+       step_finished) = my_decoder.step(
+           constant_op.constant(0), first_inputs, first_state)
+
+      if use_next_inputs_fn:
+        output_after_next_inputs_fn = next_inputs_fn(step_outputs.rnn_output)
+
+      batch_size_t = my_decoder.batch_size
+
+      self.assertTrue(isinstance(first_state, rnn_cell.LSTMStateTuple))
+      self.assertTrue(isinstance(step_state, rnn_cell.LSTMStateTuple))
+      self.assertTrue(
+          isinstance(step_outputs, basic_decoder.BasicDecoderOutput))
+      self.assertEqual((batch_size, cell_depth), step_outputs[0].get_shape())
+      self.assertEqual((batch_size,), step_outputs[1].get_shape())
+      self.assertEqual((batch_size, cell_depth), first_state[0].get_shape())
+      self.assertEqual((batch_size, cell_depth), first_state[1].get_shape())
+      self.assertEqual((batch_size, cell_depth), step_state[0].get_shape())
+      self.assertEqual((batch_size, cell_depth), step_state[1].get_shape())
+
+      self.evaluate(variables.global_variables_initializer())
+
+      fetches = {
+          "batch_size": batch_size_t,
+          "first_finished": first_finished,
+          "first_inputs": first_inputs,
+          "first_state": first_state,
+          "step_outputs": step_outputs,
+          "step_state": step_state,
+          "step_next_inputs": step_next_inputs,
+          "step_finished": step_finished
+      }
+      if use_next_inputs_fn:
+        fetches["output_after_next_inputs_fn"] = output_after_next_inputs_fn
+
+      eval_result = self.evaluate(fetches)
+
+      self.assertAllEqual([False, False, False, False, True],
+                          eval_result["first_finished"])
+      self.assertAllEqual([False, False, False, True, True],
+                          eval_result["step_finished"])
+
+      sample_ids = eval_result["step_outputs"].sample_id
+      self.assertEqual(output_dtype.sample_id, sample_ids.dtype)
+      batch_where_not_sampling = np.where(np.logical_not(sample_ids))
+      batch_where_sampling = np.where(sample_ids)
+
+      auxiliary_inputs_to_concat = (
+          auxiliary_inputs[:, 1] if use_auxiliary_inputs else
+          np.array([]).reshape(batch_size, 0).astype(np.float32))
+
+      expected_next_sampling_inputs = np.concatenate(
+          (eval_result["output_after_next_inputs_fn"][batch_where_sampling]
+           if use_next_inputs_fn else
+           eval_result["step_outputs"].rnn_output[batch_where_sampling],
+           auxiliary_inputs_to_concat[batch_where_sampling]),
+          axis=-1)
+      self.assertAllClose(
+          eval_result["step_next_inputs"][batch_where_sampling],
+          expected_next_sampling_inputs)
+
+      self.assertAllClose(
+          eval_result["step_next_inputs"][batch_where_not_sampling],
+          np.concatenate(
+              (np.squeeze(inputs[batch_where_not_sampling, 1], axis=0),
+               auxiliary_inputs_to_concat[batch_where_not_sampling]),
+              axis=-1))
+
+  def testStepWithScheduledOutputTrainingHelperWithoutNextInputsFnOrAuxInputs(
+      self):
+    self._testStepWithScheduledOutputTrainingHelper(
+        sampling_probability=0.5, use_next_inputs_fn=False,
+        use_auxiliary_inputs=False)
+
+  def testStepWithScheduledOutputTrainingHelperWithNextInputsFn(self):
+    self._testStepWithScheduledOutputTrainingHelper(
+        sampling_probability=0.5, use_next_inputs_fn=True,
+        use_auxiliary_inputs=False)
+
+  def testStepWithScheduledOutputTrainingHelperWithAuxiliaryInputs(self):
+    self._testStepWithScheduledOutputTrainingHelper(
+        sampling_probability=0.5, use_next_inputs_fn=False,
+        use_auxiliary_inputs=True)
+
+  def testStepWithScheduledOutputTrainingHelperWithNextInputsFnAndAuxInputs(
+      self):
+    self._testStepWithScheduledOutputTrainingHelper(
+        sampling_probability=0.5, use_next_inputs_fn=True,
+        use_auxiliary_inputs=True)
+
+  def testStepWithScheduledOutputTrainingHelperWithNoSampling(self):
+    self._testStepWithScheduledOutputTrainingHelper(
+        sampling_probability=0.0, use_next_inputs_fn=True,
+        use_auxiliary_inputs=True)
+
+  def testStepWithInferenceHelperCategorical(self):
+    batch_size = 5
+    vocabulary_size = 7
+    cell_depth = vocabulary_size
+    start_token = 0
+    end_token = 6
+
+    start_inputs = array_ops.one_hot(
+        np.ones(batch_size, dtype=np.int32) * start_token,
+        vocabulary_size)
+
+    # The sample function samples categorically from the logits.
+    sample_fn = lambda x: sampler_py.categorical_sample(logits=x)
+    # The next inputs are a one-hot encoding of the sampled labels.
+    next_inputs_fn = (
+        lambda x: array_ops.one_hot(x, vocabulary_size, dtype=dtypes.float32))
+    end_fn = lambda sample_ids: math_ops.equal(sample_ids, end_token)
+
+    with self.cached_session(use_gpu=True):
+      cell = rnn_cell.LSTMCell(vocabulary_size)
+      sampler = sampler_py.InferenceSampler(
+          sample_fn, sample_shape=(), sample_dtype=dtypes.int32, end_fn=end_fn,
+          next_inputs_fn=next_inputs_fn)
+      initial_state = cell.zero_state(
+          dtype=dtypes.float32, batch_size=batch_size)
+      my_decoder = basic_decoder.BasicDecoderV2(
+          cell=cell,
+          sampler=sampler)
+      (first_finished, first_inputs, first_state) = my_decoder.initialize(
+          start_inputs, initial_state=initial_state)
+
+      output_size = my_decoder.output_size
+      output_dtype = my_decoder.output_dtype
+      self.assertEqual(
+          basic_decoder.BasicDecoderOutput(cell_depth,
+                                           tensor_shape.TensorShape([])),
+          output_size)
+      self.assertEqual(
+          basic_decoder.BasicDecoderOutput(dtypes.float32, dtypes.int32),
+          output_dtype)
+
+      (step_outputs, step_state, step_next_inputs,
+       step_finished) = my_decoder.step(
+           constant_op.constant(0), first_inputs, first_state)
+      batch_size_t = my_decoder.batch_size
+
+      self.assertTrue(isinstance(first_state, rnn_cell.LSTMStateTuple))
+      self.assertTrue(isinstance(step_state, rnn_cell.LSTMStateTuple))
+      self.assertTrue(
+          isinstance(step_outputs, basic_decoder.BasicDecoderOutput))
+      self.assertEqual((batch_size, cell_depth), step_outputs[0].get_shape())
+      self.assertEqual((batch_size,), step_outputs[1].get_shape())
+      self.assertEqual((batch_size, cell_depth), first_state[0].get_shape())
+      self.assertEqual((batch_size, cell_depth), first_state[1].get_shape())
+      self.assertEqual((batch_size, cell_depth), step_state[0].get_shape())
+      self.assertEqual((batch_size, cell_depth), step_state[1].get_shape())
+
+      self.evaluate(variables.global_variables_initializer())
+      eval_result = self.evaluate({
+          "batch_size": batch_size_t,
+          "first_finished": first_finished,
+          "first_inputs": first_inputs,
+          "first_state": first_state,
+          "step_outputs": step_outputs,
+          "step_state": step_state,
+          "step_next_inputs": step_next_inputs,
+          "step_finished": step_finished
+      })
+
+      sample_ids = eval_result["step_outputs"].sample_id
+      self.assertEqual(output_dtype.sample_id, sample_ids.dtype)
+      expected_step_finished = (sample_ids == end_token)
+      expected_step_next_inputs = np.zeros((batch_size, vocabulary_size))
+      expected_step_next_inputs[np.arange(batch_size), sample_ids] = 1.0
+      self.assertAllEqual(expected_step_finished,
+                          eval_result["step_finished"])
+      self.assertAllEqual(expected_step_next_inputs,
+                          eval_result["step_next_inputs"])
+
+  def testStepWithInferenceHelperMultilabel(self):
+    batch_size = 5
+    vocabulary_size = 7
+    cell_depth = vocabulary_size
+    start_token = 0
+    end_token = 6
+
+    start_inputs = array_ops.one_hot(
+        np.ones(batch_size, dtype=np.int32) * start_token,
+        vocabulary_size)
+
+    # The sample function samples independent bernoullis from the logits.
+    sample_fn = (
+        lambda x: sampler_py.bernoulli_sample(logits=x, dtype=dtypes.bool))
+    # The next inputs are a one-hot encoding of the sampled labels.
+    next_inputs_fn = math_ops.to_float
+    end_fn = lambda sample_ids: sample_ids[:, end_token]
+
+    with self.cached_session(use_gpu=True):
+      cell = rnn_cell.LSTMCell(vocabulary_size)
+      sampler = sampler_py.InferenceSampler(
+          sample_fn, sample_shape=[cell_depth], sample_dtype=dtypes.bool,
+          end_fn=end_fn, next_inputs_fn=next_inputs_fn)
+      initial_state = cell.zero_state(
+          dtype=dtypes.float32, batch_size=batch_size)
+      my_decoder = basic_decoder.BasicDecoderV2(
+          cell=cell,
+          sampler=sampler)
+      (first_finished, first_inputs, first_state) = my_decoder.initialize(
+          start_inputs, initial_state=initial_state)
+      output_size = my_decoder.output_size
+      output_dtype = my_decoder.output_dtype
+      self.assertEqual(
+          basic_decoder.BasicDecoderOutput(cell_depth, cell_depth),
+          output_size)
+      self.assertEqual(
+          basic_decoder.BasicDecoderOutput(dtypes.float32, dtypes.bool),
+          output_dtype)
+
+      (step_outputs, step_state, step_next_inputs,
+       step_finished) = my_decoder.step(
+           constant_op.constant(0), first_inputs, first_state)
+      batch_size_t = my_decoder.batch_size
+
+      self.assertTrue(isinstance(first_state, rnn_cell.LSTMStateTuple))
+      self.assertTrue(isinstance(step_state, rnn_cell.LSTMStateTuple))
+      self.assertTrue(
+          isinstance(step_outputs, basic_decoder.BasicDecoderOutput))
+      self.assertEqual((batch_size, cell_depth), step_outputs[0].get_shape())
+      self.assertEqual((batch_size, cell_depth), step_outputs[1].get_shape())
+      self.assertEqual((batch_size, cell_depth), first_state[0].get_shape())
+      self.assertEqual((batch_size, cell_depth), first_state[1].get_shape())
+      self.assertEqual((batch_size, cell_depth), step_state[0].get_shape())
+      self.assertEqual((batch_size, cell_depth), step_state[1].get_shape())
+
+      self.evaluate(variables.global_variables_initializer())
+      eval_result = self.evaluate({
+          "batch_size": batch_size_t,
+          "first_finished": first_finished,
+          "first_inputs": first_inputs,
+          "first_state": first_state,
+          "step_outputs": step_outputs,
+          "step_state": step_state,
+          "step_next_inputs": step_next_inputs,
+          "step_finished": step_finished
+      })
+
+      sample_ids = eval_result["step_outputs"].sample_id
+      self.assertEqual(output_dtype.sample_id, sample_ids.dtype)
+      expected_step_finished = sample_ids[:, end_token]
+      expected_step_next_inputs = sample_ids.astype(np.float32)
+      self.assertAllEqual(expected_step_finished,
+                          eval_result["step_finished"])
+      self.assertAllEqual(expected_step_next_inputs,
+                          eval_result["step_next_inputs"])
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py
index 5e28e651c666b1c448f778fc9c02d637ce817bae..56f2a0acc9f2e6f951c5df26a53a31645697da4f 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py
@@ -25,10 +25,13 @@ from tensorflow.contrib.seq2seq.python.ops import attention_wrapper
 from tensorflow.contrib.seq2seq.python.ops import beam_search_decoder
 from tensorflow.contrib.seq2seq.python.ops import beam_search_ops
 from tensorflow.contrib.seq2seq.python.ops import decoder
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras import layers
 from tensorflow.python.layers import core as layers_core
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import nn_ops
@@ -530,11 +533,10 @@ class BeamSearchDecoderTest(test.TestCase):
           return (shape[1], shape[0]) + shape[2:]
         return shape
 
-      self.assertTrue(
-          isinstance(final_outputs,
-                     beam_search_decoder.FinalBeamSearchDecoderOutput))
-      self.assertTrue(
-          isinstance(final_state, beam_search_decoder.BeamSearchDecoderState))
+      self.assertIsInstance(
+          final_outputs, beam_search_decoder.FinalBeamSearchDecoderOutput)
+      self.assertIsInstance(
+          final_state, beam_search_decoder.BeamSearchDecoderState)
 
       beam_search_decoder_output = final_outputs.beam_search_decoder_output
       self.assertEqual(
@@ -574,5 +576,119 @@ class BeamSearchDecoderTest(test.TestCase):
         with_alignment_history=True)
 
 
+@test_util.run_all_in_graph_and_eager_modes
+class BeamSearchDecoderV2Test(test.TestCase):
+
+  def _testDynamicDecodeRNN(self, time_major, has_attention,
+                            with_alignment_history=False):
+    encoder_sequence_length = np.array([3, 2, 3, 1, 1])
+    decoder_sequence_length = np.array([2, 0, 1, 2, 3])
+    batch_size = 5
+    decoder_max_time = 4
+    input_depth = 7
+    cell_depth = 9
+    attention_depth = 6
+    vocab_size = 20
+    end_token = vocab_size - 1
+    start_token = 0
+    embedding_dim = 50
+    max_out = max(decoder_sequence_length)
+    output_layer = layers.Dense(vocab_size, use_bias=True, activation=None)
+    beam_width = 3
+
+    with self.cached_session():
+      batch_size_tensor = constant_op.constant(batch_size)
+      embedding = np.random.randn(vocab_size, embedding_dim).astype(np.float32)
+      cell = rnn_cell.LSTMCell(cell_depth)
+      initial_state = cell.zero_state(batch_size, dtypes.float32)
+      coverage_penalty_weight = 0.0
+      if has_attention:
+        coverage_penalty_weight = 0.2
+        inputs = array_ops.placeholder_with_default(
+            np.random.randn(batch_size, decoder_max_time, input_depth).astype(
+                np.float32),
+            shape=(None, None, input_depth))
+        tiled_inputs = beam_search_decoder.tile_batch(
+            inputs, multiplier=beam_width)
+        tiled_sequence_length = beam_search_decoder.tile_batch(
+            encoder_sequence_length, multiplier=beam_width)
+        attention_mechanism = attention_wrapper.BahdanauAttention(
+            num_units=attention_depth,
+            memory=tiled_inputs,
+            memory_sequence_length=tiled_sequence_length)
+        initial_state = beam_search_decoder.tile_batch(
+            initial_state, multiplier=beam_width)
+        cell = attention_wrapper.AttentionWrapper(
+            cell=cell,
+            attention_mechanism=attention_mechanism,
+            attention_layer_size=attention_depth,
+            alignment_history=with_alignment_history)
+      cell_state = cell.zero_state(
+          dtype=dtypes.float32, batch_size=batch_size_tensor * beam_width)
+      if has_attention:
+        cell_state = cell_state.clone(cell_state=initial_state)
+      bsd = beam_search_decoder.BeamSearchDecoderV2(
+          cell=cell,
+          beam_width=beam_width,
+          output_layer=output_layer,
+          length_penalty_weight=0.0,
+          coverage_penalty_weight=coverage_penalty_weight,
+          output_time_major=time_major,
+          maximum_iterations=max_out)
+
+      final_outputs, final_state, final_sequence_lengths = bsd(
+          embedding,
+          start_tokens=array_ops.fill([batch_size_tensor], start_token),
+          end_token=end_token,
+          initial_state=cell_state)
+
+      def _t(shape):
+        if time_major:
+          return (shape[1], shape[0]) + shape[2:]
+        return shape
+
+      self.assertIsInstance(
+          final_outputs, beam_search_decoder.FinalBeamSearchDecoderOutput)
+      self.assertIsInstance(
+          final_state, beam_search_decoder.BeamSearchDecoderState)
+
+      beam_search_decoder_output = final_outputs.beam_search_decoder_output
+      expected_seq_length = 3 if context.executing_eagerly() else None
+      self.assertEqual(
+          _t((batch_size, expected_seq_length, beam_width)),
+          tuple(beam_search_decoder_output.scores.get_shape().as_list()))
+      self.assertEqual(
+          _t((batch_size, expected_seq_length, beam_width)),
+          tuple(final_outputs.predicted_ids.get_shape().as_list()))
+
+      self.evaluate(variables.global_variables_initializer())
+      eval_results = self.evaluate({
+          'final_outputs': final_outputs,
+          'final_sequence_lengths': final_sequence_lengths
+      })
+
+      max_sequence_length = np.max(eval_results['final_sequence_lengths'])
+
+      # A smoke test
+      self.assertEqual(
+          _t((batch_size, max_sequence_length, beam_width)),
+          eval_results['final_outputs'].beam_search_decoder_output.scores.shape)
+      self.assertEqual(
+          _t((batch_size, max_sequence_length, beam_width)), eval_results[
+              'final_outputs'].beam_search_decoder_output.predicted_ids.shape)
+
+  def testDynamicDecodeRNNBatchMajorNoAttention(self):
+    self._testDynamicDecodeRNN(time_major=False, has_attention=False)
+
+  def testDynamicDecodeRNNBatchMajorYesAttention(self):
+    self._testDynamicDecodeRNN(time_major=False, has_attention=True)
+
+  def testDynamicDecodeRNNBatchMajorYesAttentionWithAlignmentHistory(self):
+    self._testDynamicDecodeRNN(
+        time_major=False,
+        has_attention=True,
+        with_alignment_history=True)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/decoder_v2_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/decoder_v2_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5bba2b32e940aa4d5984821ebd3845d7f272549
--- /dev/null
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/decoder_v2_test.py
@@ -0,0 +1,169 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for contrib.seq2seq.python.seq2seq.decoder."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.seq2seq.python.ops import basic_decoder
+from tensorflow.contrib.seq2seq.python.ops import sampler as sampler_py
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.ops import rnn
+from tensorflow.python.ops import rnn_cell
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+@keras_parameterized.run_all_keras_modes
+class DecodeV2RNNTest(keras_parameterized.TestCase, test.TestCase):
+  """Tests for DecoderV2."""
+
+  def _testDecodeRNN(self, time_major, maximum_iterations=None):
+
+    sequence_length = [3, 4, 3, 1, 0]
+    batch_size = 5
+    max_time = 8
+    input_depth = 7
+    cell_depth = 10
+    max_out = max(sequence_length)
+
+    with self.cached_session(use_gpu=True):
+      if time_major:
+        inputs = np.random.randn(max_time, batch_size,
+                                 input_depth).astype(np.float32)
+      else:
+        inputs = np.random.randn(batch_size, max_time,
+                                 input_depth).astype(np.float32)
+      input_t = constant_op.constant(inputs)
+      cell = rnn_cell.LSTMCell(cell_depth)
+      sampler = sampler_py.TrainingSampler(time_major=time_major)
+      my_decoder = basic_decoder.BasicDecoderV2(
+          cell=cell,
+          sampler=sampler,
+          output_time_major=time_major,
+          maximum_iterations=maximum_iterations)
+
+      initial_state = cell.zero_state(
+          dtype=dtypes.float32, batch_size=batch_size)
+      (final_outputs, unused_final_state, final_sequence_length) = my_decoder(
+          input_t, initial_state=initial_state, sequence_length=sequence_length)
+
+      def _t(shape):
+        if time_major:
+          return (shape[1], shape[0]) + shape[2:]
+        return shape
+
+      if not context.executing_eagerly():
+        self.assertEqual((batch_size,),
+                         tuple(final_sequence_length.get_shape().as_list()))
+        self.assertEqual(
+            _t((batch_size, None, cell_depth)),
+            tuple(final_outputs.rnn_output.get_shape().as_list()))
+        self.assertEqual(
+            _t((batch_size, None)),
+            tuple(final_outputs.sample_id.get_shape().as_list()))
+
+      self.evaluate(variables.global_variables_initializer())
+      final_outputs = self.evaluate(final_outputs)
+      final_sequence_length = self.evaluate(final_sequence_length)
+
+      # Mostly a smoke test
+      time_steps = max_out
+      expected_length = sequence_length
+      if maximum_iterations is not None:
+        time_steps = min(max_out, maximum_iterations)
+        expected_length = [min(x, maximum_iterations) for x in expected_length]
+      if context.executing_eagerly() and maximum_iterations != 0:
+        # Only check the shape of output when maximum_iterations > 0, see
+        # b/123431432 for more details.
+        self.assertEqual(
+            _t((batch_size, time_steps, cell_depth)),
+            final_outputs.rnn_output.shape)
+        self.assertEqual(
+            _t((batch_size, time_steps)), final_outputs.sample_id.shape)
+      self.assertItemsEqual(expected_length, final_sequence_length)
+
+  def testDynamicDecodeRNNBatchMajor(self):
+    self._testDecodeRNN(time_major=False)
+
+  def testDynamicDecodeRNNTimeMajor(self):
+    self._testDecodeRNN(time_major=True)
+
+  def testDynamicDecodeRNNZeroMaxIters(self):
+    self._testDecodeRNN(time_major=True, maximum_iterations=0)
+
+  def testDynamicDecodeRNNOneMaxIter(self):
+    self._testDecodeRNN(time_major=True, maximum_iterations=1)
+
+  def _testDynamicDecodeRNNWithTrainingHelperMatchesDynamicRNN(
+      self, use_sequence_length):
+    sequence_length = [3, 4, 3, 1, 0]
+    batch_size = 5
+    max_time = 8
+    input_depth = 7
+    cell_depth = 10
+    max_out = max(sequence_length)
+
+    with self.cached_session(use_gpu=True):
+      inputs = np.random.randn(batch_size, max_time,
+                               input_depth).astype(np.float32)
+      inputs = constant_op.constant(inputs)
+
+      cell = rnn_cell.LSTMCell(cell_depth)
+      zero_state = cell.zero_state(dtype=dtypes.float32, batch_size=batch_size)
+      sampler = sampler_py.TrainingSampler()
+      my_decoder = basic_decoder.BasicDecoderV2(
+          cell=cell, sampler=sampler, impute_finished=use_sequence_length)
+
+      final_decoder_outputs, final_decoder_state, _ = my_decoder(
+          inputs, initial_state=zero_state, sequence_length=sequence_length)
+
+      final_rnn_outputs, final_rnn_state = rnn.dynamic_rnn(
+          cell,
+          inputs,
+          sequence_length=sequence_length if use_sequence_length else None,
+          initial_state=zero_state)
+
+      self.evaluate(variables.global_variables_initializer())
+      eval_result = self.evaluate({
+          "final_decoder_outputs": final_decoder_outputs,
+          "final_decoder_state": final_decoder_state,
+          "final_rnn_outputs": final_rnn_outputs,
+          "final_rnn_state": final_rnn_state
+      })
+
+      # Decoder only runs out to max_out; ensure values are identical
+      # to dynamic_rnn, which also zeros out outputs and passes along state.
+      self.assertAllClose(eval_result["final_decoder_outputs"].rnn_output,
+                          eval_result["final_rnn_outputs"][:, 0:max_out, :])
+      if use_sequence_length:
+        self.assertAllClose(eval_result["final_decoder_state"],
+                            eval_result["final_rnn_state"])
+
+  def testDynamicDecodeRNNWithTrainingHelperMatchesDynamicRNNWithSeqLen(self):
+    self._testDynamicDecodeRNNWithTrainingHelperMatchesDynamicRNN(
+        use_sequence_length=True)
+
+  def testDynamicDecodeRNNWithTrainingHelperMatchesDynamicRNNNoSeqLen(self):
+    self._testDynamicDecodeRNNWithTrainingHelperMatchesDynamicRNN(
+        use_sequence_length=False)
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/loss_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/loss_test.py
index 5aa32b532ffcf5772f6ace26662f5e5471cf6923..41b2a53ca5b178be9b04446c81d832575e5ed75b 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/loss_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/loss_test.py
@@ -14,80 +14,254 @@
 # ==============================================================================
 
 """Tests for contrib.seq2seq.python.seq2seq.loss_ops."""
-# pylint: disable=unused-import,g-bad-import-order
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-# pylint: enable=unused-import
 
 import numpy as np
 
 from tensorflow.contrib.seq2seq.python.ops import loss
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class LossTest(test.TestCase):
 
+  def setUp(self):
+    self.batch_size = 2
+    self.sequence_length = 3
+    self.number_of_classes = 5
+    logits = [
+        constant_op.constant(i + 0.5, shape=[self.batch_size,
+                                             self.number_of_classes])
+        for i in range(self.sequence_length)
+    ]
+    self.logits = array_ops.stack(logits, axis=1)
+    targets = [
+        constant_op.constant(i, dtypes.int32, shape=[self.batch_size])
+        for i in range(self.sequence_length)
+    ]
+    self.targets = array_ops.stack(targets, axis=1)
+    weights = [
+        constant_op.constant(1.0, shape=[self.batch_size])
+        for _ in range(self.sequence_length)
+    ]
+    self.weights = array_ops.stack(weights, axis=1)
+    # expected_loss = sparse_softmax_cross_entropy_with_logits(targets, logits)
+    # where targets = [0, 1, 2], and logits = [[0.5] * 5, [1.5] * 5, [2.5] * 5]
+    self.expected_loss = 1.60944
+
   def testSequenceLoss(self):
-    with self.session(use_gpu=True) as sess:
-      with variable_scope.variable_scope(
-          'root', initializer=init_ops.constant_initializer(0.5)):
-        batch_size = 2
-        sequence_length = 3
-        number_of_classes = 5
-        logits = [
-            constant_op.constant(
-                i + 0.5, shape=[batch_size, number_of_classes])
-            for i in range(sequence_length)
-        ]
-        logits = array_ops.stack(logits, axis=1)
-        targets = [
-            constant_op.constant(
-                i, dtypes.int32, shape=[batch_size])
-            for i in range(sequence_length)
-        ]
-        targets = array_ops.stack(targets, axis=1)
-        weights = [
-            constant_op.constant(
-                1.0, shape=[batch_size]) for i in range(sequence_length)
-        ]
-        weights = array_ops.stack(weights, axis=1)
-
-        average_loss_per_example = loss.sequence_loss(
-            logits, targets, weights,
-            average_across_timesteps=True,
-            average_across_batch=True)
-        res = sess.run(average_loss_per_example)
-        self.assertAllClose(1.60944, res)
-
-        average_loss_per_sequence = loss.sequence_loss(
-            logits, targets, weights,
-            average_across_timesteps=False,
-            average_across_batch=True)
-        res = sess.run(average_loss_per_sequence)
-        compare_per_sequence = np.ones((sequence_length)) * 1.60944
-        self.assertAllClose(compare_per_sequence, res)
-
-        average_loss_per_batch = loss.sequence_loss(
-            logits, targets, weights,
-            average_across_timesteps=True,
-            average_across_batch=False)
-        res = sess.run(average_loss_per_batch)
-        compare_per_batch = np.ones((batch_size)) * 1.60944
-        self.assertAllClose(compare_per_batch, res)
-
-        total_loss = loss.sequence_loss(
-            logits, targets, weights,
-            average_across_timesteps=False,
-            average_across_batch=False)
-        res = sess.run(total_loss)
-        compare_total = np.ones((batch_size, sequence_length)) * 1.60944
-        self.assertAllClose(compare_total, res)
+    with self.test_session(use_gpu=True):
+      average_loss_per_example = loss.sequence_loss(
+          self.logits, self.targets, self.weights,
+          average_across_timesteps=True,
+          average_across_batch=True)
+      res = self.evaluate(average_loss_per_example)
+      self.assertAllClose(self.expected_loss, res)
+
+      average_loss_per_sequence = loss.sequence_loss(
+          self.logits, self.targets, self.weights,
+          average_across_timesteps=False,
+          average_across_batch=True)
+      res = self.evaluate(average_loss_per_sequence)
+      compare_per_sequence = np.full((self.sequence_length), self.expected_loss)
+      self.assertAllClose(compare_per_sequence, res)
+
+      average_loss_per_batch = loss.sequence_loss(
+          self.logits, self.targets, self.weights,
+          average_across_timesteps=True,
+          average_across_batch=False)
+      res = self.evaluate(average_loss_per_batch)
+      compare_per_batch = np.full((self.batch_size), self.expected_loss)
+      self.assertAllClose(compare_per_batch, res)
+
+      total_loss = loss.sequence_loss(
+          self.logits, self.targets, self.weights,
+          average_across_timesteps=False,
+          average_across_batch=False)
+      res = self.evaluate(total_loss)
+      compare_total = np.full((self.batch_size, self.sequence_length),
+                              self.expected_loss)
+      self.assertAllClose(compare_total, res)
+
+  def testSequenceLossClass(self):
+    with self.test_session(use_gpu=True):
+      seq_loss = loss.SequenceLoss(average_across_timesteps=True,
+                                   average_across_batch=True,
+                                   sum_over_timesteps=False,
+                                   sum_over_batch=False)
+      average_loss_per_example = seq_loss(
+          self.targets, self.logits, self.weights)
+      res = self.evaluate(average_loss_per_example)
+      self.assertAllClose(self.expected_loss, res)
+
+      seq_loss = loss.SequenceLoss(average_across_timesteps=False,
+                                   average_across_batch=True,
+                                   sum_over_timesteps=False,
+                                   sum_over_batch=False)
+      average_loss_per_sequence = seq_loss(
+          self.targets, self.logits, self.weights)
+      res = self.evaluate(average_loss_per_sequence)
+      compare_per_sequence = np.full((self.sequence_length), self.expected_loss)
+      self.assertAllClose(compare_per_sequence, res)
+
+      seq_loss = loss.SequenceLoss(average_across_timesteps=True,
+                                   average_across_batch=False,
+                                   sum_over_timesteps=False,
+                                   sum_over_batch=False)
+      average_loss_per_batch = seq_loss(
+          self.targets, self.logits, self.weights)
+      res = self.evaluate(average_loss_per_batch)
+      compare_per_batch = np.full((self.batch_size), self.expected_loss)
+      self.assertAllClose(compare_per_batch, res)
+
+      seq_loss = loss.SequenceLoss(average_across_timesteps=False,
+                                   average_across_batch=False,
+                                   sum_over_timesteps=False,
+                                   sum_over_batch=False)
+      total_loss = seq_loss(
+          self.targets, self.logits, self.weights)
+      res = self.evaluate(total_loss)
+      compare_total = np.full((self.batch_size, self.sequence_length),
+                              self.expected_loss)
+      self.assertAllClose(compare_total, res)
+
+  def testSumReduction(self):
+    with self.test_session(use_gpu=True):
+      seq_loss = loss.SequenceLoss(average_across_timesteps=False,
+                                   average_across_batch=False,
+                                   sum_over_timesteps=True,
+                                   sum_over_batch=True)
+      average_loss_per_example = seq_loss(
+          self.targets, self.logits, self.weights)
+      res = self.evaluate(average_loss_per_example)
+      self.assertAllClose(self.expected_loss, res)
+
+      seq_loss = loss.SequenceLoss(average_across_timesteps=False,
+                                   average_across_batch=False,
+                                   sum_over_timesteps=False,
+                                   sum_over_batch=True)
+      average_loss_per_sequence = seq_loss(
+          self.targets, self.logits, self.weights)
+      res = self.evaluate(average_loss_per_sequence)
+      compare_per_sequence = np.full((self.sequence_length), self.expected_loss)
+      self.assertAllClose(compare_per_sequence, res)
+
+      seq_loss = loss.SequenceLoss(average_across_timesteps=False,
+                                   average_across_batch=False,
+                                   sum_over_timesteps=True,
+                                   sum_over_batch=False)
+      average_loss_per_batch = seq_loss(
+          self.targets, self.logits, self.weights)
+      res = self.evaluate(average_loss_per_batch)
+      compare_per_batch = np.full((self.batch_size), self.expected_loss)
+      self.assertAllClose(compare_per_batch, res)
+
+      seq_loss = loss.SequenceLoss(average_across_timesteps=False,
+                                   average_across_batch=False,
+                                   sum_over_timesteps=False,
+                                   sum_over_batch=False)
+      total_loss = seq_loss(
+          self.targets, self.logits, self.weights)
+      res = self.evaluate(total_loss)
+      compare_total = np.full((self.batch_size, self.sequence_length),
+                              self.expected_loss)
+      self.assertAllClose(compare_total, res)
+
+  def testWeightedSumReduction(self):
+    weights = [
+        constant_op.constant(1.0, shape=[self.batch_size])
+        for _ in range(self.sequence_length)
+    ]
+    # Make the last element in the sequence to have zero weights.
+    weights[-1] = constant_op.constant(0.0, shape=[self.batch_size])
+    self.weights = array_ops.stack(weights, axis=1)
+    with self.test_session(use_gpu=True):
+      seq_loss = loss.SequenceLoss(average_across_timesteps=False,
+                                   average_across_batch=False,
+                                   sum_over_timesteps=True,
+                                   sum_over_batch=True)
+      average_loss_per_example = seq_loss(
+          self.targets, self.logits, self.weights)
+      res = self.evaluate(average_loss_per_example)
+      self.assertAllClose(self.expected_loss, res)
+
+      seq_loss = loss.SequenceLoss(average_across_timesteps=False,
+                                   average_across_batch=False,
+                                   sum_over_timesteps=False,
+                                   sum_over_batch=True)
+      average_loss_per_sequence = seq_loss(
+          self.targets, self.logits, self.weights)
+      res = self.evaluate(average_loss_per_sequence)
+      compare_per_sequence = np.full((self.sequence_length), self.expected_loss)
+      # The last element in every sequence are zeros, which will be filtered.
+      compare_per_sequence[-1] = 0.
+      self.assertAllClose(compare_per_sequence, res)
+
+      seq_loss = loss.SequenceLoss(average_across_timesteps=False,
+                                   average_across_batch=False,
+                                   sum_over_timesteps=True,
+                                   sum_over_batch=False)
+      average_loss_per_batch = seq_loss(self.targets, self.logits, self.weights)
+      res = self.evaluate(average_loss_per_batch)
+      compare_per_batch = np.full((self.batch_size), self.expected_loss)
+      self.assertAllClose(compare_per_batch, res)
+
+      seq_loss = loss.SequenceLoss(average_across_timesteps=False,
+                                   average_across_batch=False,
+                                   sum_over_timesteps=False,
+                                   sum_over_batch=False)
+      total_loss = seq_loss(self.targets, self.logits, self.weights)
+      res = self.evaluate(total_loss)
+      compare_total = np.full((self.batch_size, self.sequence_length),
+                              self.expected_loss)
+      # The last element in every sequence are zeros, which will be filtered.
+      compare_total[:, -1] = 0
+      self.assertAllClose(compare_total, res)
+
+  def testZeroWeights(self):
+    weights = [
+        constant_op.constant(0.0, shape=[self.batch_size])
+        for _ in range(self.sequence_length)
+    ]
+    weights = array_ops.stack(weights, axis=1)
+    with self.test_session(use_gpu=True):
+      average_loss_per_example = loss.sequence_loss(
+          self.logits, self.targets, weights,
+          average_across_timesteps=True,
+          average_across_batch=True)
+      res = self.evaluate(average_loss_per_example)
+      self.assertAllClose(0.0, res)
+
+      average_loss_per_sequence = loss.sequence_loss(
+          self.logits, self.targets, weights,
+          average_across_timesteps=False,
+          average_across_batch=True)
+      res = self.evaluate(average_loss_per_sequence)
+      compare_per_sequence = np.zeros((self.sequence_length))
+      self.assertAllClose(compare_per_sequence, res)
+
+      average_loss_per_batch = loss.sequence_loss(
+          self.logits, self.targets, weights,
+          average_across_timesteps=True,
+          average_across_batch=False)
+      res = self.evaluate(average_loss_per_batch)
+      compare_per_batch = np.zeros((self.batch_size))
+      self.assertAllClose(compare_per_batch, res)
+
+      total_loss = loss.sequence_loss(
+          self.logits, self.targets, weights,
+          average_across_timesteps=False,
+          average_across_batch=False)
+      res = self.evaluate(total_loss)
+      compare_total = np.zeros((self.batch_size, self.sequence_length))
+      self.assertAllClose(compare_total, res)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
index 77e9f848b137911b53e1b4df5dd740fe38af55bb..79c2ac2f500307ba23b6d97a7a30c6d04cea5176 100644
--- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
@@ -25,9 +25,13 @@ import math
 import numpy as np
 
 from tensorflow.contrib.framework.python.framework import tensor_util
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras import initializers
+from tensorflow.python.keras import layers
+from tensorflow.python.keras.engine import base_layer_utils
 from tensorflow.python.layers import base as layers_base
 from tensorflow.python.layers import core as layers_core
 from tensorflow.python.ops import array_ops
@@ -72,77 +76,6 @@ class AttentionMechanism(object):
     raise NotImplementedError
 
 
-def _prepare_memory(memory, memory_sequence_length, check_inner_dims_defined):
-  """Convert to tensor and possibly mask `memory`.
-
-  Args:
-    memory: `Tensor`, shaped `[batch_size, max_time, ...]`.
-    memory_sequence_length: `int32` `Tensor`, shaped `[batch_size]`.
-    check_inner_dims_defined: Python boolean.  If `True`, the `memory`
-      argument's shape is checked to ensure all but the two outermost
-      dimensions are fully defined.
-
-  Returns:
-    A (possibly masked), checked, new `memory`.
-
-  Raises:
-    ValueError: If `check_inner_dims_defined` is `True` and not
-      `memory.shape[2:].is_fully_defined()`.
-  """
-  memory = nest.map_structure(
-      lambda m: ops.convert_to_tensor(m, name="memory"), memory)
-  if memory_sequence_length is not None:
-    memory_sequence_length = ops.convert_to_tensor(
-        memory_sequence_length, name="memory_sequence_length")
-  if check_inner_dims_defined:
-    def _check_dims(m):
-      if not m.get_shape()[2:].is_fully_defined():
-        raise ValueError("Expected memory %s to have fully defined inner dims, "
-                         "but saw shape: %s" % (m.name, m.get_shape()))
-    nest.map_structure(_check_dims, memory)
-  if memory_sequence_length is None:
-    seq_len_mask = None
-  else:
-    seq_len_mask = array_ops.sequence_mask(
-        memory_sequence_length,
-        maxlen=array_ops.shape(nest.flatten(memory)[0])[1],
-        dtype=nest.flatten(memory)[0].dtype)
-    seq_len_batch_size = (
-        tensor_shape.dimension_value(memory_sequence_length.shape[0])
-        or array_ops.shape(memory_sequence_length)[0])
-  def _maybe_mask(m, seq_len_mask):
-    rank = m.get_shape().ndims
-    rank = rank if rank is not None else array_ops.rank(m)
-    extra_ones = array_ops.ones(rank - 2, dtype=dtypes.int32)
-    m_batch_size = tensor_shape.dimension_value(
-        m.shape[0]) or array_ops.shape(m)[0]
-    if memory_sequence_length is not None:
-      message = ("memory_sequence_length and memory tensor batch sizes do not "
-                 "match.")
-      with ops.control_dependencies([
-          check_ops.assert_equal(
-              seq_len_batch_size, m_batch_size, message=message)]):
-        seq_len_mask = array_ops.reshape(
-            seq_len_mask,
-            array_ops.concat((array_ops.shape(seq_len_mask), extra_ones), 0))
-        return m * seq_len_mask
-    else:
-      return m
-  return nest.map_structure(lambda m: _maybe_mask(m, seq_len_mask), memory)
-
-
-def _maybe_mask_score(score, memory_sequence_length, score_mask_value):
-  if memory_sequence_length is None:
-    return score
-  message = ("All values in memory_sequence_length must greater than zero.")
-  with ops.control_dependencies(
-      [check_ops.assert_positive(memory_sequence_length, message=message)]):
-    score_mask = array_ops.sequence_mask(
-        memory_sequence_length, maxlen=array_ops.shape(score)[1])
-    score_mask_values = score_mask_value * array_ops.ones_like(score)
-    return array_ops.where(score_mask, score, score_mask_values)
-
-
 class _BaseAttentionMechanism(AttentionMechanism):
   """A base AttentionMechanism class providing common functionality.
 
@@ -205,12 +138,14 @@ class _BaseAttentionMechanism(AttentionMechanism):
           self._memory_layer.dtype).as_numpy_dtype(-np.inf)
     self._probability_fn = lambda score, prev: (  # pylint:disable=g-long-lambda
         probability_fn(
-            _maybe_mask_score(score, memory_sequence_length, score_mask_value),
+            _maybe_mask_score(score,
+                              memory_sequence_length=memory_sequence_length,
+                              score_mask_value=score_mask_value),
             prev))
     with ops.name_scope(
         name, "BaseAttentionMechanismInit", nest.flatten(memory)):
       self._values = _prepare_memory(
-          memory, memory_sequence_length,
+          memory, memory_sequence_length=memory_sequence_length,
           check_inner_dims_defined=check_inner_dims_defined)
       self._keys = (
           self.memory_layer(self._values) if self.memory_layer  # pylint: disable=not-callable
@@ -286,6 +221,376 @@ class _BaseAttentionMechanism(AttentionMechanism):
     return self.initial_alignments(batch_size, dtype)
 
 
+class _BaseAttentionMechanismV2(AttentionMechanism, layers.Layer):
+  """A base AttentionMechanism class providing common functionality.
+
+  Common functionality includes:
+    1. Storing the query and memory layers.
+    2. Preprocessing and storing the memory.
+
+  Note that this layer takes memory as its init parameter, which is an
+  anti-pattern of Keras API, we have to keep the memory as init parameter for
+  performance and dependency reason. Under the hood, during `__init__()`, it
+  will invoke `base_layer.__call__(memory, setup_memory=True)`. This will let
+  keras to keep track of the memory tensor as the input of this layer. Once
+  the `__init__()` is done, then user can query the attention by
+  `score = att_obj([query, state])`, and use it as a normal keras layer.
+
+  Special attention is needed when adding using this class as the base layer for
+  new attention:
+    1. Build() could be invoked at least twice. So please make sure weights are
+       not duplicated.
+    2. Layer.get_weights() might return different set of weights if the instance
+       has `query_layer`. The query_layer weights is not initialized until the
+       memory is configured.
+
+  Also note that this layer does not work with Keras model when
+  `model.compile(run_eagerly=True)` due to the fact that this layer is stateful.
+  The support for that will be added in a future version.
+  """
+
+  def __init__(self,
+               memory,
+               probability_fn,
+               query_layer=None,
+               memory_layer=None,
+               memory_sequence_length=None,
+               **kwargs):
+    """Construct base AttentionMechanism class.
+
+    Args:
+      memory: The memory to query; usually the output of an RNN encoder.  This
+        tensor should be shaped `[batch_size, max_time, ...]`.
+      probability_fn: A `callable`. Converts the score and previous alignments
+        to probabilities. Its signature should be:
+        `probabilities = probability_fn(score, state)`.
+      query_layer:  (optional): Instance of `tf.keras.Layer`.  The layer's depth
+        must match the depth of `memory_layer`.  If `query_layer` is not
+        provided, the shape of `query` must match that of `memory_layer`.
+      memory_layer: (optional): Instance of `tf.keras.Layer`. The layer's
+        depth must match the depth of `query_layer`.
+        If `memory_layer` is not provided, the shape of `memory` must match
+        that of `query_layer`.
+      memory_sequence_length (optional): Sequence lengths for the batch entries
+        in memory. If provided, the memory tensor rows are masked with zeros
+        for values past the respective sequence lengths.
+      **kwargs: Dictionary that contains other common arguments for layer
+        creation.
+    """
+    if (query_layer is not None
+        and not isinstance(query_layer, layers.Layer)):
+      raise TypeError(
+          "query_layer is not a Layer: %s" % type(query_layer).__name__)
+    if (memory_layer is not None
+        and not isinstance(memory_layer, layers.Layer)):
+      raise TypeError(
+          "memory_layer is not a Layer: %s" % type(memory_layer).__name__)
+    self.query_layer = query_layer
+    self.memory_layer = memory_layer
+    if self.memory_layer is not None and "dtype" not in kwargs:
+      kwargs["dtype"] = self.memory_layer.dtype
+    super(_BaseAttentionMechanismV2, self).__init__(**kwargs)
+    if not callable(probability_fn):
+      raise TypeError("probability_fn must be callable, saw type: %s" %
+                      type(probability_fn).__name__)
+    self.probability_fn = probability_fn
+
+    self.keys = None
+    self.values = None
+    self.batch_size = None
+    self._memory_initialized = False
+    self._check_inner_dims_defined = True
+    self.supports_masking = True
+    self.score_mask_value = dtypes.as_dtype(self.dtype).as_numpy_dtype(-np.inf)
+
+    if memory is not None:
+      # Setup the memory by self.__call__() with memory and memory_seq_length.
+      # This will make the attention follow the keras convention which takes
+      # all the tensor inputs via __call__().
+      if memory_sequence_length is None:
+        inputs = memory
+      else:
+        inputs = [memory, memory_sequence_length]
+
+      self.values = super(_BaseAttentionMechanismV2, self).__call__(
+          inputs, setup_memory=True)
+
+  def build(self, input_shape):
+    if not self._memory_initialized:
+      # This is for setting up the memory, which contains memory and optional
+      # memory_sequence_length. Build the memory_layer with memory shape.
+      if self.memory_layer is not None and not self.memory_layer.built:
+        if isinstance(input_shape, list):
+          self.memory_layer.build(input_shape[0])
+        else:
+          self.memory_layer.build(input_shape)
+    else:
+      # The input_shape should be query.shape and state.shape. Use the query
+      # to init the query layer.
+      if self.query_layer is not None and not self.query_layer.built:
+        self.query_layer.build(input_shape[0])
+
+  def __call__(self, inputs, **kwargs):
+    """Preprocess the inputs before calling `base_layer.__call__()`.
+
+    Note that there are situation here, one for setup memory, and one with
+    actual query and state.
+    1. When the memory has not been configured, we just pass all the param to
+    base_layer.__call__(), which will then invoke self.call() with proper
+    inputs, which allows this class to setup memory.
+    2. When the memory has already been setup, the input should contain query
+    and state, and optionally processed memory. If the processed memory is
+    not included in the input, we will have to append it to the inputs and
+    give it to the base_layer.__call__(). The processed memory is the output
+    of first invocation of self.__call__(). If we don't add it here, then from
+    keras perspective, the graph is disconnected since the output from
+    previous call is never used.
+
+    Args:
+      inputs: the inputs tensors.
+      **kwargs: dict, other keyeword arguments for the `__call__()`
+    """
+    if self._memory_initialized:
+      if len(inputs) not in (2, 3):
+        raise ValueError("Expect the inputs to have 2 or 3 tensors, got %d" %
+                         len(inputs))
+      if len(inputs) == 2:
+        # We append the calculated memory here so that the graph will be
+        # connected.
+        inputs.append(self.values)
+    return super(_BaseAttentionMechanismV2, self).__call__(inputs, **kwargs)
+
+  def call(self, inputs, mask=None, setup_memory=False, **kwargs):
+    """Setup the memory or query the attention.
+
+    There are two case here, one for setup memory, and the second is query the
+    attention score. `setup_memory` is the flag to indicate which mode it is.
+    The input list will be treated differently based on that flag.
+
+    Args:
+      inputs: a list of tensor that could either be `query` and `state`, or
+        `memory` and `memory_sequence_length`.
+        `query` is the tensor of dtype matching `memory` and shape
+        `[batch_size, query_depth]`.
+        `state` is the tensor of dtype matching `memory` and shape
+        `[batch_size, alignments_size]`. (`alignments_size` is memory's
+        `max_time`).
+        `memory` is the memory to query; usually the output of an RNN encoder.
+        The tensor should be shaped `[batch_size, max_time, ...]`.
+        `memory_sequence_length` (optional) is the sequence lengths for the
+         batch entries in memory. If provided, the memory tensor rows are masked
+        with zeros for values past the respective sequence lengths.
+      mask: optional bool tensor with shape `[batch, max_time]` for the mask of
+        memory. If it is not None, the corresponding item of the memory should
+        be filtered out during calculation.
+      setup_memory: boolean, whether the input is for setting up memory, or
+        query attention.
+      **kwargs: Dict, other keyword arguments for the call method.
+    Returns:
+      Either processed memory or attention score, based on `setup_memory`.
+    """
+    if setup_memory:
+      if isinstance(inputs, list):
+        if len(inputs) not in (1, 2):
+          raise ValueError("Expect inputs to have 1 or 2 tensors, got %d" %
+                           len(inputs))
+        memory = inputs[0]
+        memory_sequence_length = inputs[1] if len(inputs) == 2 else None
+        memory_mask = mask
+      else:
+        memory, memory_sequence_length = inputs, None
+        memory_mask = mask
+      self._setup_memory(memory, memory_sequence_length, memory_mask)
+      # We force the self.built to false here since only memory is initialized,
+      # but the real query/state has not been call() yet. The layer should be
+      # build and call again.
+      self.built = False
+      # Return the processed memory in order to create the Keras connectivity
+      # data for it.
+      return self.values
+    else:
+      if not self._memory_initialized:
+        raise ValueError("Cannot query the attention before the setup of "
+                         "memory")
+      if len(inputs) not in (2, 3):
+        raise ValueError("Expect the inputs to have query, state, and optional "
+                         "processed memory, got %d items" % len(inputs))
+      # Ignore the rest of the inputs and only care about the query and state
+      query, state = inputs[0], inputs[1]
+      return self._calculate_attention(query, state)
+
+  def _setup_memory(self, memory, memory_sequence_length=None,
+                    memory_mask=None):
+    """Pre-process the memory before actually query the memory.
+
+    This should only be called once at the first invocation of call().
+
+    Args:
+      memory: The memory to query; usually the output of an RNN encoder. This
+        tensor should be shaped `[batch_size, max_time, ...]`.
+      memory_sequence_length (optional): Sequence lengths for the batch entries
+        in memory. If provided, the memory tensor rows are masked with zeros for
+        values past the respective sequence lengths.
+      memory_mask: (Optional) The boolean tensor with shape `[batch_size,
+        max_time]`. For any value equal to False, the corresponding value in
+        memory should be ignored.
+    """
+    if self._memory_initialized:
+      raise ValueError("The memory for the attention has already been setup.")
+    if memory_sequence_length is not None and memory_mask is not None:
+      raise ValueError("memory_sequence_length and memory_mask cannot be "
+                       "used at same time for attention.")
+    with ops.name_scope(
+        self.name, "BaseAttentionMechanismInit", nest.flatten(memory)):
+      self.values = _prepare_memory(
+          memory,
+          memory_sequence_length=memory_sequence_length,
+          memory_mask=memory_mask,
+          check_inner_dims_defined=self._check_inner_dims_defined)
+      # Mark the value as check since the memory and memory mask might not
+      # passed from __call__(), which does not have proper keras metadata.
+      # TODO(omalleyt): Remove this hack once the mask the has proper keras
+      # history.
+      base_layer_utils.mark_checked(self.values)
+      if self.memory_layer is not None:
+        self.keys = self.memory_layer(self.values)
+      else:
+        self.keys = self.values
+      self.batch_size = (
+          tensor_shape.dimension_value(self.keys.shape[0]) or
+          array_ops.shape(self.keys)[0])
+      self._alignments_size = (tensor_shape.dimension_value(self.keys.shape[1])
+                               or array_ops.shape(self.keys)[1])
+      if memory_mask is not None:
+        unwrapped_probability_fn = self.probability_fn
+        def _mask_probability_fn(score, prev):
+          return unwrapped_probability_fn(
+              _maybe_mask_score(
+                  score,
+                  memory_mask=memory_mask,
+                  memory_sequence_length=memory_sequence_length,
+                  score_mask_value=self.score_mask_value), prev)
+        self.probability_fn = _mask_probability_fn
+    self._memory_initialized = True
+
+  def _calculate_attention(self, query, state):
+    raise NotImplementedError(
+        "_calculate_attention need to be implemented by subclasses.")
+
+  def compute_mask(self, inputs, mask=None):
+    # There real input of the attention is query and state, and the memory layer
+    # mask shouldn't be pass down. Returning None for all output mask here.
+    return None, None
+
+  def get_config(self):
+    config = {}
+    # Since the probability_fn is likely to be a wrapped function, the child
+    # class should preserve the original function and how its wrapped.
+
+    if self.query_layer is not None:
+      config["query_layer"] = {
+          "class_name": self.query_layer.__class__.__name__,
+          "config": self.query_layer.get_config(),
+      }
+    if self.memory_layer is not None:
+      config["memory_layer"] = {
+          "class_name": self.memory_layer.__class__.__name__,
+          "config": self.memory_layer.get_config(),
+      }
+    # memory is a required init parameter and its a tensor. It cannot be
+    # serialized to config, so we put a placeholder for it.
+    config["memory"] = None
+    base_config = super(_BaseAttentionMechanismV2, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  def _process_probability_fn(self, func_name):
+    """Helper method to retrieve the probably function by string input."""
+    valid_probability_fns = {
+        "softmax": nn_ops.softmax,
+        "hardmax": hardmax,
+    }
+    if func_name not in valid_probability_fns.keys():
+      raise ValueError("Invalid probability function: %s, options are %s" %
+                       (func_name, valid_probability_fns.keys()))
+    return valid_probability_fns[func_name]
+
+  @classmethod
+  def deserialize_inner_layer_from_config(cls, config, custom_objects):
+    """Helper method that reconstruct the query and memory from the config.
+
+    In the get_config() method, the query and memory layer configs are
+    serialized into dict for persistence, this method perform the reverse action
+    to reconstruct the layer from the config.
+
+    Args:
+      config: dict, the configs that will be used to reconstruct the object.
+      custom_objects: dict mapping class names (or function names) of custom
+        (non-Keras) objects to class/functions.
+    Returns:
+      config: dict, the config with layer instance created, which is ready to be
+        used as init parameters.
+    """
+    # Reconstruct the query and memory layer for parent class.
+    from tensorflow.python.keras.layers import deserialize as deserialize_layer  # pylint: disable=g-import-not-at-top
+    # Instead of updating the input, create a copy and use that.
+    config = config.copy()
+    query_layer_config = config.pop("query_layer", None)
+    if query_layer_config:
+      query_layer = deserialize_layer(query_layer_config,
+                                      custom_objects=custom_objects)
+      config["query_layer"] = query_layer
+    memory_layer_config = config.pop("memory_layer", None)
+    if memory_layer_config:
+      memory_layer = deserialize_layer(memory_layer_config,
+                                       custom_objects=custom_objects)
+      config["memory_layer"] = memory_layer
+    return config
+
+  @property
+  def alignments_size(self):
+    return self._alignments_size
+
+  @property
+  def state_size(self):
+    return self._alignments_size
+
+  def initial_alignments(self, batch_size, dtype):
+    """Creates the initial alignment values for the `AttentionWrapper` class.
+
+    This is important for AttentionMechanisms that use the previous alignment
+    to calculate the alignment at the next time step (e.g. monotonic attention).
+
+    The default behavior is to return a tensor of all zeros.
+
+    Args:
+      batch_size: `int32` scalar, the batch_size.
+      dtype: The `dtype`.
+
+    Returns:
+      A `dtype` tensor shaped `[batch_size, alignments_size]`
+      (`alignments_size` is the values' `max_time`).
+    """
+    max_time = self._alignments_size
+    return _zero_state_tensors(max_time, batch_size, dtype)
+
+  def initial_state(self, batch_size, dtype):
+    """Creates the initial state values for the `AttentionWrapper` class.
+
+    This is important for AttentionMechanisms that use the previous alignment
+    to calculate the alignment at the next time step (e.g. monotonic attention).
+
+    The default behavior is to return the same output as initial_alignments.
+
+    Args:
+      batch_size: `int32` scalar, the batch_size.
+      dtype: The `dtype`.
+
+    Returns:
+      A structure of all-zero tensors with shapes as described by `state_size`.
+    """
+    return self.initial_alignments(batch_size, dtype)
+
+
 def _luong_score(query, keys, scale):
   """Implements Luong-style (multiplicative) scoring function.
 
@@ -304,7 +609,7 @@ def _luong_score(query, keys, scale):
   Args:
     query: Tensor, shape `[batch_size, num_units]` to compare to keys.
     keys: Processed memory, shape `[batch_size, max_time, num_units]`.
-    scale: Whether to apply a scale to the score function.
+    scale: the optional tensor to scale the attention score.
 
   Returns:
     A `[batch_size, max_time]` tensor of unnormalized score values.
@@ -320,7 +625,6 @@ def _luong_score(query, keys, scale):
         "Query (%s) has units: %s.  Keys (%s) have units: %s.  "
         "Perhaps you need to set num_units to the keys' dimension (%s)?"
         % (query, depth, keys, key_units, key_units))
-  dtype = query.dtype
 
   # Reshape from [batch_size, depth] to [batch_size, 1, depth]
   # for matmul.
@@ -338,12 +642,8 @@ def _luong_score(query, keys, scale):
   score = math_ops.matmul(query, keys, transpose_b=True)
   score = array_ops.squeeze(score, [1])
 
-  if scale:
-    # Scalar used in weight scaling
-    g = variable_scope.get_variable(
-        "attention_g", dtype=dtype,
-        initializer=init_ops.ones_initializer, shape=())
-    score = g * score
+  if scale is not None:
+    score = scale * score
   return score
 
 
@@ -354,8 +654,8 @@ class LuongAttention(_BaseAttentionMechanism):
   as described in:
 
   Minh-Thang Luong, Hieu Pham, Christopher D. Manning.
-  "Effective Approaches to Attention-based Neural Machine Translation."
-  EMNLP 2015.  https://arxiv.org/abs/1508.04025
+  [Effective Approaches to Attention-based Neural Machine Translation.
+  EMNLP 2015.](https://arxiv.org/abs/1508.04025)
 
   The second is the scaled form inspired partly by the normalized form of
   Bahdanau attention.
@@ -429,13 +729,133 @@ class LuongAttention(_BaseAttentionMechanism):
         `max_time`).
     """
     with variable_scope.variable_scope(None, "luong_attention", [query]):
-      score = _luong_score(query, self._keys, self._scale)
+      attention_g = None
+      if self._scale:
+        attention_g = variable_scope.get_variable(
+            "attention_g", dtype=query.dtype,
+            initializer=init_ops.ones_initializer, shape=())
+      score = _luong_score(query, self._keys, attention_g)
     alignments = self._probability_fn(score, state)
     next_state = alignments
     return alignments, next_state
 
 
-def _bahdanau_score(processed_query, keys, normalize):
+class LuongAttentionV2(_BaseAttentionMechanismV2):
+  """Implements Luong-style (multiplicative) attention scoring.
+
+  This attention has two forms.  The first is standard Luong attention,
+  as described in:
+
+  Minh-Thang Luong, Hieu Pham, Christopher D. Manning.
+  [Effective Approaches to Attention-based Neural Machine Translation.
+  EMNLP 2015.](https://arxiv.org/abs/1508.04025)
+
+  The second is the scaled form inspired partly by the normalized form of
+  Bahdanau attention.
+
+  To enable the second form, construct the object with parameter
+  `scale=True`.
+  """
+
+  def __init__(self,
+               units,
+               memory,
+               memory_sequence_length=None,
+               scale=False,
+               probability_fn="softmax",
+               dtype=None,
+               name="LuongAttention",
+               **kwargs):
+    """Construct the AttentionMechanism mechanism.
+
+    Args:
+      units: The depth of the attention mechanism.
+      memory: The memory to query; usually the output of an RNN encoder.  This
+        tensor should be shaped `[batch_size, max_time, ...]`.
+      memory_sequence_length: (optional): Sequence lengths for the batch entries
+        in memory.  If provided, the memory tensor rows are masked with zeros
+        for values past the respective sequence lengths.
+      scale: Python boolean. Whether to scale the energy term.
+      probability_fn: (optional) string, the name of function to convert the
+        attention score to probabilities. The default is `softmax` which is
+        `tf.nn.softmax`. Other options is `hardmax`, which is hardmax() within
+        this module. Any other value will result intovalidation error. Default
+        to use `softmax`.
+      dtype: The data type for the memory layer of the attention mechanism.
+      name: Name to use when creating ops.
+      **kwargs: Dictionary that contains other common arguments for layer
+        creation.
+    """
+    # For LuongAttention, we only transform the memory layer; thus
+    # num_units **must** match expected the query depth.
+    self.probability_fn_name = probability_fn
+    probability_fn = self._process_probability_fn(self.probability_fn_name)
+    wrapped_probability_fn = lambda score, _: probability_fn(score)
+    if dtype is None:
+      dtype = dtypes.float32
+    memory_layer = kwargs.pop("memory_layer", None)
+    if not memory_layer:
+      memory_layer = layers.Dense(
+          units, name="memory_layer", use_bias=False, dtype=dtype)
+    self.units = units
+    self.scale = scale
+    self.scale_weight = None
+    super(LuongAttentionV2, self).__init__(
+        memory=memory,
+        memory_sequence_length=memory_sequence_length,
+        query_layer=None,
+        memory_layer=memory_layer,
+        probability_fn=wrapped_probability_fn,
+        name=name,
+        dtype=dtype,
+        **kwargs)
+
+  def build(self, input_shape):
+    super(LuongAttentionV2, self).build(input_shape)
+    if self.scale and self.scale_weight is None:
+      self.scale_weight = self.add_weight(
+          "attention_g", initializer=init_ops.ones_initializer, shape=())
+    self.built = True
+
+  def _calculate_attention(self, query, state):
+    """Score the query based on the keys and values.
+
+    Args:
+      query: Tensor of dtype matching `self.values` and shape
+        `[batch_size, query_depth]`.
+      state: Tensor of dtype matching `self.values` and shape
+        `[batch_size, alignments_size]`
+        (`alignments_size` is memory's `max_time`).
+
+    Returns:
+      alignments: Tensor of dtype matching `self.values` and shape
+        `[batch_size, alignments_size]` (`alignments_size` is memory's
+        `max_time`).
+      next_state: Same as the alignments.
+    """
+    score = _luong_score(query, self.keys, self.scale_weight)
+    alignments = self.probability_fn(score, state)
+    next_state = alignments
+    return alignments, next_state
+
+  def get_config(self):
+    config = {
+        "units": self.units,
+        "scale": self.scale,
+        "probability_fn": self.probability_fn_name,
+    }
+    base_config = super(LuongAttentionV2, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  @classmethod
+  def from_config(cls, config, custom_objects=None):
+    config = _BaseAttentionMechanismV2.deserialize_inner_layer_from_config(
+        config, custom_objects=custom_objects)
+    return cls(**config)
+
+
+def _bahdanau_score(processed_query, keys, attention_v,
+                    attention_g=None, attention_b=None):
   """Implements Bahdanau-style (additive) scoring function.
 
   This attention has two forms.  The first is Bhandanau attention,
@@ -453,41 +873,28 @@ def _bahdanau_score(processed_query, keys, normalize):
    Training of Deep Neural Networks."
   https://arxiv.org/abs/1602.07868
 
-  To enable the second form, set `normalize=True`.
+  To enable the second form, set please pass in attention_g and attention_b.
 
   Args:
     processed_query: Tensor, shape `[batch_size, num_units]` to compare to keys.
     keys: Processed memory, shape `[batch_size, max_time, num_units]`.
-    normalize: Whether to normalize the score function.
+    attention_v: Tensor, shape `[num_units]`.
+    attention_g: Optional scalar tensor for normalization.
+    attention_b: Optional tensor with shape `[num_units]` for normalization.
 
   Returns:
     A `[batch_size, max_time]` tensor of unnormalized score values.
   """
-  dtype = processed_query.dtype
-  # Get the number of hidden units from the trailing dimension of keys
-  num_units = tensor_shape.dimension_value(
-      keys.shape[2]) or array_ops.shape(keys)[2]
   # Reshape from [batch_size, ...] to [batch_size, 1, ...] for broadcasting.
   processed_query = array_ops.expand_dims(processed_query, 1)
-  v = variable_scope.get_variable(
-      "attention_v", [num_units], dtype=dtype)
-  if normalize:
-    # Scalar used in weight normalization
-    g = variable_scope.get_variable(
-        "attention_g", dtype=dtype,
-        initializer=init_ops.constant_initializer(math.sqrt((1. / num_units))),
-        shape=())
-    # Bias added prior to the nonlinearity
-    b = variable_scope.get_variable(
-        "attention_b", [num_units], dtype=dtype,
-        initializer=init_ops.zeros_initializer())
-    # normed_v = g * v / ||v||
-    normed_v = g * v * math_ops.rsqrt(
-        math_ops.reduce_sum(math_ops.square(v)))
+  if attention_g is not None and attention_b is not None:
+    normed_v = attention_g * attention_v * math_ops.rsqrt(
+        math_ops.reduce_sum(math_ops.square(attention_v)))
     return math_ops.reduce_sum(
-        normed_v * math_ops.tanh(keys + processed_query + b), [2])
+        normed_v * math_ops.tanh(keys + processed_query + attention_b), [2])
   else:
-    return math_ops.reduce_sum(v * math_ops.tanh(keys + processed_query), [2])
+    return math_ops.reduce_sum(
+        attention_v * math_ops.tanh(keys + processed_query), [2])
 
 
 class BahdanauAttention(_BaseAttentionMechanism):
@@ -578,12 +985,169 @@ class BahdanauAttention(_BaseAttentionMechanism):
     """
     with variable_scope.variable_scope(None, "bahdanau_attention", [query]):
       processed_query = self.query_layer(query) if self.query_layer else query
-      score = _bahdanau_score(processed_query, self._keys, self._normalize)
+      attention_v = variable_scope.get_variable(
+          "attention_v", [self._num_units], dtype=query.dtype)
+      if not self._normalize:
+        attention_g = None
+        attention_b = None
+      else:
+        attention_g = variable_scope.get_variable(
+            "attention_g", dtype=query.dtype,
+            initializer=init_ops.constant_initializer(
+                math.sqrt((1. / self._num_units))),
+            shape=())
+        attention_b = variable_scope.get_variable(
+            "attention_b", [self._num_units], dtype=query.dtype,
+            initializer=init_ops.zeros_initializer())
+
+      score = _bahdanau_score(processed_query, self._keys, attention_v,
+                              attention_g=attention_g, attention_b=attention_b)
     alignments = self._probability_fn(score, state)
     next_state = alignments
     return alignments, next_state
 
 
+class BahdanauAttentionV2(_BaseAttentionMechanismV2):
+  """Implements Bahdanau-style (additive) attention.
+
+  This attention has two forms.  The first is Bahdanau attention,
+  as described in:
+
+  Dzmitry Bahdanau, Kyunghyun Cho, Yoshua Bengio.
+  "Neural Machine Translation by Jointly Learning to Align and Translate."
+  ICLR 2015. https://arxiv.org/abs/1409.0473
+
+  The second is the normalized form.  This form is inspired by the
+  weight normalization article:
+
+  Tim Salimans, Diederik P. Kingma.
+  "Weight Normalization: A Simple Reparameterization to Accelerate
+   Training of Deep Neural Networks."
+  https://arxiv.org/abs/1602.07868
+
+  To enable the second form, construct the object with parameter
+  `normalize=True`.
+  """
+
+  def __init__(self,
+               units,
+               memory,
+               memory_sequence_length=None,
+               normalize=False,
+               probability_fn="softmax",
+               kernel_initializer="glorot_uniform",
+               dtype=None,
+               name="BahdanauAttention",
+               **kwargs):
+    """Construct the Attention mechanism.
+
+    Args:
+      units: The depth of the query mechanism.
+      memory: The memory to query; usually the output of an RNN encoder.  This
+        tensor should be shaped `[batch_size, max_time, ...]`.
+      memory_sequence_length: (optional): Sequence lengths for the batch entries
+        in memory.  If provided, the memory tensor rows are masked with zeros
+        for values past the respective sequence lengths.
+      normalize: Python boolean.  Whether to normalize the energy term.
+      probability_fn: (optional) string, the name of function to convert the
+        attention score to probabilities. The default is `softmax` which is
+        `tf.nn.softmax`. Other options is `hardmax`, which is hardmax() within
+        this module. Any other value will result into validation error. Default
+        to use `softmax`.
+      kernel_initializer: (optional), the name of the initializer for the
+        attention kernel.
+      dtype: The data type for the query and memory layers of the attention
+        mechanism.
+      name: Name to use when creating ops.
+      **kwargs: Dictionary that contains other common arguments for layer
+        creation.
+    """
+    self.probability_fn_name = probability_fn
+    probability_fn = self._process_probability_fn(self.probability_fn_name)
+    wrapped_probability_fn = lambda score, _: probability_fn(score)
+    if dtype is None:
+      dtype = dtypes.float32
+    query_layer = kwargs.pop("query_layer", None)
+    if not query_layer:
+      query_layer = layers.Dense(
+          units, name="query_layer", use_bias=False, dtype=dtype)
+    memory_layer = kwargs.pop("memory_layer", None)
+    if not memory_layer:
+      memory_layer = layers.Dense(
+          units, name="memory_layer", use_bias=False, dtype=dtype)
+    self.units = units
+    self.normalize = normalize
+    self.kernel_initializer = initializers.get(kernel_initializer)
+    self.attention_v = None
+    self.attention_g = None
+    self.attention_b = None
+    super(BahdanauAttentionV2, self).__init__(
+        memory=memory,
+        memory_sequence_length=memory_sequence_length,
+        query_layer=query_layer,
+        memory_layer=memory_layer,
+        probability_fn=wrapped_probability_fn,
+        name=name,
+        dtype=dtype,
+        **kwargs)
+
+  def build(self, input_shape):
+    super(BahdanauAttentionV2, self).build(input_shape)
+    if self.attention_v is None:
+      self.attention_v = self.add_weight(
+          "attention_v", [self.units],
+          dtype=self.dtype,
+          initializer=self.kernel_initializer)
+    if self.normalize and self.attention_g is None and self.attention_b is None:
+      self.attention_g = self.add_weight(
+          "attention_g", initializer=init_ops.constant_initializer(
+              math.sqrt((1. / self.units))), shape=())
+      self.attention_b = self.add_weight(
+          "attention_b", shape=[self.units],
+          initializer=init_ops.zeros_initializer())
+    self.built = True
+
+  def _calculate_attention(self, query, state):
+    """Score the query based on the keys and values.
+
+    Args:
+      query: Tensor of dtype matching `self.values` and shape
+        `[batch_size, query_depth]`.
+      state: Tensor of dtype matching `self.values` and shape
+        `[batch_size, alignments_size]`
+        (`alignments_size` is memory's `max_time`).
+
+    Returns:
+      alignments: Tensor of dtype matching `self.values` and shape
+        `[batch_size, alignments_size]` (`alignments_size` is memory's
+        `max_time`).
+      next_state: same as alignments.
+    """
+    processed_query = self.query_layer(query) if self.query_layer else query
+    score = _bahdanau_score(processed_query, self.keys, self.attention_v,
+                            attention_g=self.attention_g,
+                            attention_b=self.attention_b)
+    alignments = self.probability_fn(score, state)
+    next_state = alignments
+    return alignments, next_state
+
+  def get_config(self):
+    config = {
+        "units": self.units,
+        "normalize": self.normalize,
+        "probability_fn": self.probability_fn_name,
+        "kernel_initializer": initializers.serialize(self.kernel_initializer)
+    }
+    base_config = super(BahdanauAttentionV2, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  @classmethod
+  def from_config(cls, config, custom_objects=None):
+    config = _BaseAttentionMechanismV2.deserialize_inner_layer_from_config(
+        config, custom_objects=custom_objects)
+    return cls(**config)
+
+
 def safe_cumprod(x, *args, **kwargs):
   """Computes cumprod of x in logspace using cumsum to avoid underflow.
 
@@ -766,6 +1330,34 @@ class _BaseMonotonicAttentionMechanism(_BaseAttentionMechanism):
         dtype=dtype)
 
 
+class _BaseMonotonicAttentionMechanismV2(_BaseAttentionMechanismV2):
+  """Base attention mechanism for monotonic attention.
+
+  Simply overrides the initial_alignments function to provide a dirac
+  distribution, which is needed in order for the monotonic attention
+  distributions to have the correct behavior.
+  """
+
+  def initial_alignments(self, batch_size, dtype):
+    """Creates the initial alignment values for the monotonic attentions.
+
+    Initializes to dirac distributions, i.e. [1, 0, 0, ...memory length..., 0]
+    for all entries in the batch.
+
+    Args:
+      batch_size: `int32` scalar, the batch_size.
+      dtype: The `dtype`.
+
+    Returns:
+      A `dtype` tensor shaped `[batch_size, alignments_size]`
+      (`alignments_size` is the values' `max_time`).
+    """
+    max_time = self._alignments_size
+    return array_ops.one_hot(
+        array_ops.zeros((batch_size,), dtype=dtypes.int32), max_time,
+        dtype=dtype)
+
+
 class BahdanauMonotonicAttention(_BaseMonotonicAttentionMechanism):
   """Monotonic attention mechanism with Bahadanau-style energy function.
 
@@ -860,7 +1452,22 @@ class BahdanauMonotonicAttention(_BaseMonotonicAttentionMechanism):
     with variable_scope.variable_scope(
         None, "bahdanau_monotonic_attention", [query]):
       processed_query = self.query_layer(query) if self.query_layer else query
-      score = _bahdanau_score(processed_query, self._keys, self._normalize)
+      attention_v = variable_scope.get_variable(
+          "attention_v", [self._num_units], dtype=query.dtype)
+      if not self._normalize:
+        attention_g = None
+        attention_b = None
+      else:
+        attention_g = variable_scope.get_variable(
+            "attention_g", dtype=query.dtype,
+            initializer=init_ops.constant_initializer(
+                math.sqrt((1. / self._num_units))),
+            shape=())
+        attention_b = variable_scope.get_variable(
+            "attention_b", [self._num_units], dtype=query.dtype,
+            initializer=init_ops.zeros_initializer())
+      score = _bahdanau_score(processed_query, self._keys, attention_v,
+                              attention_g=attention_g, attention_b=attention_b)
       score_bias = variable_scope.get_variable(
           "attention_score_bias", dtype=processed_query.dtype,
           initializer=self._score_bias_init)
@@ -870,6 +1477,164 @@ class BahdanauMonotonicAttention(_BaseMonotonicAttentionMechanism):
     return alignments, next_state
 
 
+class BahdanauMonotonicAttentionV2(_BaseMonotonicAttentionMechanismV2):
+  """Monotonic attention mechanism with Bahadanau-style energy function.
+
+  This type of attention enforces a monotonic constraint on the attention
+  distributions; that is once the model attends to a given point in the memory
+  it can't attend to any prior points at subsequence output timesteps.  It
+  achieves this by using the _monotonic_probability_fn instead of softmax to
+  construct its attention distributions.  Since the attention scores are passed
+  through a sigmoid, a learnable scalar bias parameter is applied after the
+  score function and before the sigmoid.  Otherwise, it is equivalent to
+  BahdanauAttention.  This approach is proposed in
+
+  Colin Raffel, Minh-Thang Luong, Peter J. Liu, Ron J. Weiss, Douglas Eck,
+  "Online and Linear-Time Attention by Enforcing Monotonic Alignments."
+  ICML 2017.  https://arxiv.org/abs/1704.00784
+  """
+
+  def __init__(self,
+               units,
+               memory,
+               memory_sequence_length=None,
+               normalize=False,
+               sigmoid_noise=0.,
+               sigmoid_noise_seed=None,
+               score_bias_init=0.,
+               mode="parallel",
+               kernel_initializer="glorot_uniform",
+               dtype=None,
+               name="BahdanauMonotonicAttention",
+               **kwargs):
+    """Construct the Attention mechanism.
+
+    Args:
+      units: The depth of the query mechanism.
+      memory: The memory to query; usually the output of an RNN encoder.  This
+        tensor should be shaped `[batch_size, max_time, ...]`.
+      memory_sequence_length: (optional): Sequence lengths for the batch entries
+        in memory.  If provided, the memory tensor rows are masked with zeros
+        for values past the respective sequence lengths.
+      normalize: Python boolean. Whether to normalize the energy term.
+      sigmoid_noise: Standard deviation of pre-sigmoid noise. See the docstring
+        for `_monotonic_probability_fn` for more information.
+      sigmoid_noise_seed: (optional) Random seed for pre-sigmoid noise.
+      score_bias_init: Initial value for score bias scalar. It's recommended to
+        initialize this to a negative value when the length of the memory is
+        large.
+      mode: How to compute the attention distribution. Must be one of
+        'recursive', 'parallel', or 'hard'. See the docstring for
+        `tf.contrib.seq2seq.monotonic_attention` for more information.
+      kernel_initializer: (optional), the name of the initializer for the
+        attention kernel.
+      dtype: The data type for the query and memory layers of the attention
+        mechanism.
+      name: Name to use when creating ops.
+      **kwargs: Dictionary that contains other common arguments for layer
+        creation.
+    """
+    # Set up the monotonic probability fn with supplied parameters
+    if dtype is None:
+      dtype = dtypes.float32
+    wrapped_probability_fn = functools.partial(
+        _monotonic_probability_fn, sigmoid_noise=sigmoid_noise, mode=mode,
+        seed=sigmoid_noise_seed)
+    query_layer = kwargs.pop("query_layer", None)
+    if not query_layer:
+      query_layer = layers.Dense(
+          units, name="query_layer", use_bias=False, dtype=dtype)
+    memory_layer = kwargs.pop("memory_layer", None)
+    if not memory_layer:
+      memory_layer = layers.Dense(
+          units, name="memory_layer", use_bias=False, dtype=dtype)
+    self.units = units
+    self.normalize = normalize
+    self.sigmoid_noise = sigmoid_noise
+    self.sigmoid_noise_seed = sigmoid_noise_seed
+    self.score_bias_init = score_bias_init
+    self.mode = mode
+    self.kernel_initializer = initializers.get(kernel_initializer)
+    self.attention_v = None
+    self.attention_score_bias = None
+    self.attention_g = None
+    self.attention_b = None
+    super(BahdanauMonotonicAttentionV2, self).__init__(
+        memory=memory,
+        memory_sequence_length=memory_sequence_length,
+        query_layer=query_layer,
+        memory_layer=memory_layer,
+        probability_fn=wrapped_probability_fn,
+        name=name,
+        dtype=dtype,
+        **kwargs)
+
+  def build(self, input_shape):
+    super(BahdanauMonotonicAttentionV2, self).build(input_shape)
+    if self.attention_v is None:
+      self.attention_v = self.add_weight(
+          "attention_v", [self.units], dtype=self.dtype,
+          initializer=self.kernel_initializer)
+    if self.attention_score_bias is None:
+      self.attention_score_bias = self.add_weight(
+          "attention_score_bias", shape=(), dtype=self.dtype,
+          initializer=init_ops.constant_initializer(
+              self.score_bias_init, dtype=self.dtype))
+    if self.normalize and self.attention_g is None and self.attention_b is None:
+      self.attention_g = self.add_weight(
+          "attention_g", dtype=self.dtype,
+          initializer=init_ops.constant_initializer(
+              math.sqrt((1. / self.units))),
+          shape=())
+      self.attention_b = self.add_weight(
+          "attention_b", [self.units], dtype=self.dtype,
+          initializer=init_ops.zeros_initializer())
+    self.built = True
+
+  def _calculate_attention(self, query, state):
+    """Score the query based on the keys and values.
+
+    Args:
+      query: Tensor of dtype matching `self.values` and shape
+        `[batch_size, query_depth]`.
+      state: Tensor of dtype matching `self.values` and shape
+        `[batch_size, alignments_size]`
+        (`alignments_size` is memory's `max_time`).
+
+    Returns:
+      alignments: Tensor of dtype matching `self.values` and shape
+        `[batch_size, alignments_size]` (`alignments_size` is memory's
+        `max_time`).
+    """
+    processed_query = self.query_layer(query) if self.query_layer else query
+    score = _bahdanau_score(processed_query, self.keys, self.attention_v,
+                            attention_g=self.attention_g,
+                            attention_b=self.attention_b)
+    score += self.attention_score_bias
+    alignments = self.probability_fn(score, state)
+    next_state = alignments
+    return alignments, next_state
+
+  def get_config(self):
+    config = {
+        "units": self.units,
+        "normalize": self.normalize,
+        "sigmoid_noise": self.sigmoid_noise,
+        "sigmoid_noise_seed": self.sigmoid_noise_seed,
+        "score_bias_init": self.score_bias_init,
+        "mode": self.mode,
+        "kernel_initializer": initializers.serialize(self.kernel_initializer),
+    }
+    base_config = super(BahdanauMonotonicAttentionV2, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  @classmethod
+  def from_config(cls, config, custom_objects=None):
+    config = _BaseAttentionMechanismV2.deserialize_inner_layer_from_config(
+        config, custom_objects=custom_objects)
+    return cls(**config)
+
+
 class LuongMonotonicAttention(_BaseMonotonicAttentionMechanism):
   """Monotonic attention mechanism with Luong-style energy function.
 
@@ -960,7 +1725,12 @@ class LuongMonotonicAttention(_BaseMonotonicAttentionMechanism):
     """
     with variable_scope.variable_scope(None, "luong_monotonic_attention",
                                        [query]):
-      score = _luong_score(query, self._keys, self._scale)
+      attention_g = None
+      if self._scale:
+        attention_g = variable_scope.get_variable(
+            "attention_g", dtype=query.dtype,
+            initializer=init_ops.ones_initializer, shape=())
+      score = _luong_score(query, self._keys, attention_g)
       score_bias = variable_scope.get_variable(
           "attention_score_bias", dtype=query.dtype,
           initializer=self._score_bias_init)
@@ -970,6 +1740,139 @@ class LuongMonotonicAttention(_BaseMonotonicAttentionMechanism):
     return alignments, next_state
 
 
+class LuongMonotonicAttentionV2(_BaseMonotonicAttentionMechanismV2):
+  """Monotonic attention mechanism with Luong-style energy function.
+
+  This type of attention enforces a monotonic constraint on the attention
+  distributions; that is once the model attends to a given point in the memory
+  it can't attend to any prior points at subsequence output timesteps.  It
+  achieves this by using the _monotonic_probability_fn instead of softmax to
+  construct its attention distributions.  Otherwise, it is equivalent to
+  LuongAttention.  This approach is proposed in
+
+  [Colin Raffel, Minh-Thang Luong, Peter J. Liu, Ron J. Weiss, Douglas Eck,
+  "Online and Linear-Time Attention by Enforcing Monotonic Alignments."
+  ICML 2017.](https://arxiv.org/abs/1704.00784)
+  """
+
+  def __init__(self,
+               units,
+               memory,
+               memory_sequence_length=None,
+               scale=False,
+               sigmoid_noise=0.,
+               sigmoid_noise_seed=None,
+               score_bias_init=0.,
+               mode="parallel",
+               dtype=None,
+               name="LuongMonotonicAttention",
+               **kwargs):
+    """Construct the Attention mechanism.
+
+    Args:
+      units: The depth of the query mechanism.
+      memory: The memory to query; usually the output of an RNN encoder.  This
+        tensor should be shaped `[batch_size, max_time, ...]`.
+      memory_sequence_length: (optional): Sequence lengths for the batch entries
+        in memory.  If provided, the memory tensor rows are masked with zeros
+        for values past the respective sequence lengths.
+      scale: Python boolean.  Whether to scale the energy term.
+      sigmoid_noise: Standard deviation of pre-sigmoid noise.  See the docstring
+        for `_monotonic_probability_fn` for more information.
+      sigmoid_noise_seed: (optional) Random seed for pre-sigmoid noise.
+      score_bias_init: Initial value for score bias scalar.  It's recommended to
+        initialize this to a negative value when the length of the memory is
+        large.
+      mode: How to compute the attention distribution.  Must be one of
+        'recursive', 'parallel', or 'hard'.  See the docstring for
+        `tf.contrib.seq2seq.monotonic_attention` for more information.
+      dtype: The data type for the query and memory layers of the attention
+        mechanism.
+      name: Name to use when creating ops.
+      **kwargs: Dictionary that contains other common arguments for layer
+        creation.
+    """
+    # Set up the monotonic probability fn with supplied parameters
+    if dtype is None:
+      dtype = dtypes.float32
+    wrapped_probability_fn = functools.partial(
+        _monotonic_probability_fn, sigmoid_noise=sigmoid_noise, mode=mode,
+        seed=sigmoid_noise_seed)
+    memory_layer = kwargs.pop("memory_layer", None)
+    if not memory_layer:
+      memory_layer = layers.Dense(
+          units, name="memory_layer", use_bias=False, dtype=dtype)
+    self.units = units
+    self.scale = scale
+    self.sigmoid_noise = sigmoid_noise
+    self.sigmoid_noise_seed = sigmoid_noise_seed
+    self.score_bias_init = score_bias_init
+    self.mode = mode
+    self.attention_g = None
+    self.attention_score_bias = None
+    super(LuongMonotonicAttentionV2, self).__init__(
+        memory=memory,
+        memory_sequence_length=memory_sequence_length,
+        query_layer=None,
+        memory_layer=memory_layer,
+        probability_fn=wrapped_probability_fn,
+        name=name,
+        dtype=dtype,
+        **kwargs)
+
+  def build(self, input_shape):
+    super(LuongMonotonicAttentionV2, self).build(input_shape)
+    if self.scale and self.attention_g is None:
+      self.attention_g = self.add_weight(
+          "attention_g", initializer=init_ops.ones_initializer, shape=())
+    if self.attention_score_bias is None:
+      self.attention_score_bias = self.add_weight(
+          "attention_score_bias", shape=(),
+          initializer=init_ops.constant_initializer(
+              self.score_bias_init, dtype=self.dtype))
+    self.built = True
+
+  def _calculate_attention(self, query, state):
+    """Score the query based on the keys and values.
+
+    Args:
+      query: Tensor of dtype matching `self.values` and shape
+        `[batch_size, query_depth]`.
+      state: Tensor of dtype matching `self.values` and shape
+        `[batch_size, alignments_size]`
+        (`alignments_size` is memory's `max_time`).
+
+    Returns:
+      alignments: Tensor of dtype matching `self.values` and shape
+        `[batch_size, alignments_size]` (`alignments_size` is memory's
+        `max_time`).
+      next_state: Same as alignments
+    """
+    score = _luong_score(query, self.keys, self.attention_g)
+    score += self.attention_score_bias
+    alignments = self.probability_fn(score, state)
+    next_state = alignments
+    return alignments, next_state
+
+  def get_config(self):
+    config = {
+        "units": self.units,
+        "scale": self.scale,
+        "sigmoid_noise": self.sigmoid_noise,
+        "sigmoid_noise_seed": self.sigmoid_noise_seed,
+        "score_bias_init": self.score_bias_init,
+        "mode": self.mode,
+    }
+    base_config = super(LuongMonotonicAttentionV2, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  @classmethod
+  def from_config(cls, config, custom_objects=None):
+    config = _BaseAttentionMechanismV2.deserialize_inner_layer_from_config(
+        config, custom_objects=custom_objects)
+    return cls(**config)
+
+
 class AttentionWrapperState(
     collections.namedtuple("AttentionWrapperState",
                            ("cell_state", "attention", "time", "alignments",
@@ -1017,7 +1920,15 @@ class AttentionWrapperState(
     def with_same_shape(old, new):
       """Check and set new tensor's shape."""
       if isinstance(old, ops.Tensor) and isinstance(new, ops.Tensor):
-        return tensor_util.with_same_shape(old, new)
+        if not context.executing_eagerly():
+          return tensor_util.with_same_shape(old, new)
+        else:
+          if old.shape.as_list() != new.shape.as_list():
+            raise ValueError("The shape of the AttentionWrapperState is "
+                             "expected to be same as the one to clone. "
+                             "self.shape: %s, input.shape: %s" %
+                             (old.shape, new.shape))
+          return new
       return new
 
     return nest.map_structure(
@@ -1026,6 +1937,82 @@ class AttentionWrapperState(
         super(AttentionWrapperState, self)._replace(**kwargs))
 
 
+def _prepare_memory(memory, memory_sequence_length=None, memory_mask=None,
+                    check_inner_dims_defined=True):
+  """Convert to tensor and possibly mask `memory`.
+
+  Args:
+    memory: `Tensor`, shaped `[batch_size, max_time, ...]`.
+    memory_sequence_length: `int32` `Tensor`, shaped `[batch_size]`.
+    memory_mask: `boolean` tensor with shape [batch_size, max_time]. The memory
+      should be skipped when the corresponding mask is False.
+    check_inner_dims_defined: Python boolean.  If `True`, the `memory`
+      argument's shape is checked to ensure all but the two outermost
+      dimensions are fully defined.
+
+  Returns:
+    A (possibly masked), checked, new `memory`.
+
+  Raises:
+    ValueError: If `check_inner_dims_defined` is `True` and not
+      `memory.shape[2:].is_fully_defined()`.
+  """
+  memory = nest.map_structure(
+      lambda m: ops.convert_to_tensor(m, name="memory"), memory)
+  if memory_sequence_length is not None and memory_mask is not None:
+    raise ValueError("memory_sequence_length and memory_mask can't be provided "
+                     "at same time.")
+  if memory_sequence_length is not None:
+    memory_sequence_length = ops.convert_to_tensor(
+        memory_sequence_length, name="memory_sequence_length")
+  if check_inner_dims_defined:
+    def _check_dims(m):
+      if not m.get_shape()[2:].is_fully_defined():
+        raise ValueError("Expected memory %s to have fully defined inner dims, "
+                         "but saw shape: %s" % (m.name, m.get_shape()))
+    nest.map_structure(_check_dims, memory)
+  if memory_sequence_length is None and memory_mask is None:
+    return memory
+  elif memory_sequence_length is not None:
+    seq_len_mask = array_ops.sequence_mask(
+        memory_sequence_length,
+        maxlen=array_ops.shape(nest.flatten(memory)[0])[1],
+        dtype=nest.flatten(memory)[0].dtype)
+  else:
+    # For memory_mask is not None
+    seq_len_mask = math_ops.cast(
+        memory_mask, dtype=nest.flatten(memory)[0].dtype)
+  def _maybe_mask(m, seq_len_mask):
+    """Mask the memory based on the memory mask."""
+    rank = m.get_shape().ndims
+    rank = rank if rank is not None else array_ops.rank(m)
+    extra_ones = array_ops.ones(rank - 2, dtype=dtypes.int32)
+    seq_len_mask = array_ops.reshape(
+        seq_len_mask,
+        array_ops.concat((array_ops.shape(seq_len_mask), extra_ones), 0))
+    return m * seq_len_mask
+
+  return nest.map_structure(lambda m: _maybe_mask(m, seq_len_mask), memory)
+
+
+def _maybe_mask_score(score, memory_sequence_length=None, memory_mask=None,
+                      score_mask_value=None):
+  """Mask the attention score based on the masks."""
+  if memory_sequence_length is None and memory_mask is None:
+    return score
+  if memory_sequence_length is not None and memory_mask is not None:
+    raise ValueError("memory_sequence_length and memory_mask can't be provided "
+                     "at same time.")
+  if memory_sequence_length is not None:
+    message = "All values in memory_sequence_length must greater than zero."
+    with ops.control_dependencies(
+        [check_ops.assert_positive(memory_sequence_length, message=message)]):
+      memory_mask = array_ops.sequence_mask(
+          memory_sequence_length, maxlen=array_ops.shape(score)[1])
+  score_mask_values = score_mask_value * array_ops.ones_like(score)
+  return array_ops.where(memory_mask, score, score_mask_values)
+
+
 def hardmax(logits, name=None):
   """Returns batched one-hot vectors.
 
@@ -1050,8 +2037,14 @@ def hardmax(logits, name=None):
 def _compute_attention(attention_mechanism, cell_output, attention_state,
                        attention_layer):
   """Computes the attention and alignments for a given attention_mechanism."""
-  alignments, next_attention_state = attention_mechanism(
-      cell_output, state=attention_state)
+  if isinstance(attention_mechanism, _BaseAttentionMechanismV2):
+    alignments, next_attention_state = attention_mechanism(
+        [cell_output, attention_state])
+  else:
+    # For other class, assume they are following _BaseAttentionMechanism, which
+    # takes query and state as separate parameter.
+    alignments, next_attention_state = attention_mechanism(
+        cell_output, state=attention_state)
 
   # Reshape from [batch_size, memory_time] to [batch_size, 1, memory_time]
   expanded_alignments = array_ops.expand_dims(alignments, 1)
@@ -1064,13 +2057,13 @@ def _compute_attention(attention_mechanism, cell_output, attention_state,
   # the batched matmul is over memory_time, so the output shape is
   #   [batch_size, 1, memory_size].
   # we then squeeze out the singleton dim.
-  context = math_ops.matmul(expanded_alignments, attention_mechanism.values)
-  context = array_ops.squeeze(context, [1])
+  context_ = math_ops.matmul(expanded_alignments, attention_mechanism.values)
+  context_ = array_ops.squeeze(context_, [1])
 
   if attention_layer is not None:
-    attention = attention_layer(array_ops.concat([cell_output, context], 1))
+    attention = attention_layer(array_ops.concat([cell_output, context_], 1))
   else:
-    attention = context
+    attention = context_
 
   return attention, alignments, next_attention_state
 
@@ -1088,7 +2081,8 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
                output_attention=True,
                initial_cell_state=None,
                name=None,
-               attention_layer=None):
+               attention_layer=None,
+               attention_fn=None):
     """Construct the `AttentionWrapper`.
 
     **NOTE** If you are using the `BeamSearchDecoder` with a cell wrapped in
@@ -1132,7 +2126,9 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
         feed the context and cell output into the attention layer to generate
         attention at each time step. If attention_mechanism is a list,
         attention_layer_size must be a list of the same length. If
-        attention_layer is set, this must be None.
+        attention_layer is set, this must be None. If attention_fn is set,
+        it must guaranteed that the outputs of attention_fn also meet the
+        above requirements.
       alignment_history: Python boolean, whether to store alignment history
         from all time steps in the final output state (currently stored as a
         time major `TensorArray` on which you must call `stack()`).
@@ -1158,6 +2154,12 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
         the context as attention at each time step. If attention_mechanism is a
         list, attention_layer must be a list of the same length. If
         attention_layers_size is set, this must be None.
+      attention_fn: An optional callable function that allows users to provide
+        their own customized attention function, which takes input
+        (attention_mechanism, cell_output, attention_state, attention_layer) and
+        outputs (attention, alignments, next_attention_state). If provided,
+        the attention_layer_size should be the size of the outputs of
+        attention_fn.
 
     Raises:
       TypeError: `attention_layer_size` is not None and (`attention_mechanism`
@@ -1240,6 +2242,10 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
           tensor_shape.dimension_value(attention_mechanism.values.shape[-1])
           for attention_mechanism in attention_mechanisms)
 
+    if attention_fn is None:
+      attention_fn = _compute_attention
+    self._attention_fn = attention_fn
+
     self._cell = cell
     self._attention_mechanisms = attention_mechanisms
     self._cell_input_fn = cell_input_fn
@@ -1443,7 +2449,7 @@ class AttentionWrapper(rnn_cell_impl.RNNCell):
     all_attention_states = []
     maybe_all_histories = []
     for i, attention_mechanism in enumerate(self._attention_mechanisms):
-      attention, alignments, next_attention_state = _compute_attention(
+      attention, alignments, next_attention_state = self._attention_fn(
           attention_mechanism, cell_output, previous_attention_state[i],
           self._attention_layers[i] if self._attention_layers else None)
       alignment_history = previous_alignment_history[i].write(
diff --git a/tensorflow/contrib/seq2seq/python/ops/basic_decoder.py b/tensorflow/contrib/seq2seq/python/ops/basic_decoder.py
index 7eb95e5a70de985dca0d4b565ba03bdf454b6161..16dfa7ed8268d761dee49ec0146efabcaaef1393 100644
--- a/tensorflow/contrib/seq2seq/python/ops/basic_decoder.py
+++ b/tensorflow/contrib/seq2seq/python/ops/basic_decoder.py
@@ -23,8 +23,10 @@ import collections
 
 from tensorflow.contrib.seq2seq.python.ops import decoder
 from tensorflow.contrib.seq2seq.python.ops import helper as helper_py
+from tensorflow.contrib.seq2seq.python.ops import sampler as sampler_py
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras import layers
 from tensorflow.python.layers import base as layers_base
 from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.util import nest
@@ -146,3 +148,102 @@ class BasicDecoder(decoder.Decoder):
           sample_ids=sample_ids)
     outputs = BasicDecoderOutput(cell_outputs, sample_ids)
     return (outputs, next_state, next_inputs, finished)
+
+
+class BasicDecoderV2(decoder.BaseDecoder):
+  """Basic sampling decoder."""
+
+  def __init__(self, cell, sampler, output_layer=None, **kwargs):
+    """Initialize BasicDecoder.
+
+    Args:
+      cell: An `RNNCell` instance.
+      sampler: A `Sampler` instance.
+      output_layer: (Optional) An instance of `tf.layers.Layer`, i.e.,
+        `tf.layers.Dense`. Optional layer to apply to the RNN output prior to
+        storing the result or sampling.
+      **kwargs: Other keyward arguments for layer creation.
+
+    Raises:
+      TypeError: if `cell`, `helper` or `output_layer` have an incorrect type.
+    """
+    rnn_cell_impl.assert_like_rnncell("cell", cell)
+    if not isinstance(sampler, sampler_py.Sampler):
+      raise TypeError("sampler must be a Sampler, received: %s" % (sampler,))
+    if (output_layer is not None and
+        not isinstance(output_layer, layers.Layer)):
+      raise TypeError(
+          "output_layer must be a Layer, received: %s" % (output_layer,))
+    self.cell = cell
+    self.sampler = sampler
+    self.output_layer = output_layer
+    super(BasicDecoderV2, self).__init__(**kwargs)
+
+  def initialize(self, inputs, initial_state=None, **kwargs):
+    """Initialize the decoder."""
+    # Assume the dtype of the cell is the output_size structure
+    # containing the input_state's first component's dtype.
+    self._cell_dtype = nest.flatten(initial_state)[0].dtype
+    return self.sampler.initialize(inputs, **kwargs) + (initial_state,)
+
+  @property
+  def batch_size(self):
+    return self.sampler.batch_size
+
+  def _rnn_output_size(self):
+    size = tensor_shape.TensorShape(self.cell.output_size)
+    if self.output_layer is None:
+      return size
+    else:
+      # To use layer's compute_output_shape, we need to convert the
+      # RNNCell's output_size entries into shapes with an unknown
+      # batch size.  We then pass this through the layer's
+      # compute_output_shape and read off all but the first (batch)
+      # dimensions to get the output size of the rnn with the layer
+      # applied to the top.
+      output_shape_with_unknown_batch = nest.map_structure(
+          lambda s: tensor_shape.TensorShape([None]).concatenate(s), size)
+      layer_output_shape = self.output_layer.compute_output_shape(
+          output_shape_with_unknown_batch)
+      return nest.map_structure(lambda s: s[1:], layer_output_shape)
+
+  @property
+  def output_size(self):
+    # Return the cell output and the id
+    return BasicDecoderOutput(
+        rnn_output=self._rnn_output_size(),
+        sample_id=self.sampler.sample_ids_shape)
+
+  @property
+  def output_dtype(self):
+    # Assume the dtype of the cell is the output_size structure
+    # containing the input_state's first component's dtype.
+    # Return that structure and the sample_ids_dtype from the helper.
+    dtype = self._cell_dtype
+    return BasicDecoderOutput(
+        nest.map_structure(lambda _: dtype, self._rnn_output_size()),
+        self.sampler.sample_ids_dtype)
+
+  def step(self, time, inputs, state):
+    """Perform a decoding step.
+
+    Args:
+      time: scalar `int32` tensor.
+      inputs: A (structure of) input tensors.
+      state: A (structure of) state tensors and TensorArrays.
+
+    Returns:
+      `(outputs, next_state, next_inputs, finished)`.
+    """
+    cell_outputs, cell_state = self.cell(inputs, state)
+    if self.output_layer is not None:
+      cell_outputs = self.output_layer(cell_outputs)
+    sample_ids = self.sampler.sample(
+        time=time, outputs=cell_outputs, state=cell_state)
+    (finished, next_inputs, next_state) = self.sampler.next_inputs(
+        time=time,
+        outputs=cell_outputs,
+        state=cell_state,
+        sample_ids=sample_ids)
+    outputs = BasicDecoderOutput(cell_outputs, sample_ids)
+    return (outputs, next_state, next_inputs, finished)
diff --git a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
index ab36848f13ab3078cd232c18f140188e12db703b..1d773a449890cd7335b2225db39d79ca958a3276 100644
--- a/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
+++ b/tensorflow/contrib/seq2seq/python/ops/beam_search_decoder.py
@@ -24,11 +24,12 @@ import numpy as np
 from tensorflow.contrib.seq2seq.python.ops import attention_wrapper
 from tensorflow.contrib.seq2seq.python.ops import beam_search_ops
 from tensorflow.contrib.seq2seq.python.ops import decoder
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
-from tensorflow.python.layers import base as layers_base
+from tensorflow.python.keras import layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import embedding_ops
@@ -182,11 +183,12 @@ def gather_tree_from_array(t, parent_ids, sequence_length):
   return ordered
 
 
-def _check_maybe(t):
+def _check_ndims(t):
   if t.shape.ndims is None:
     raise ValueError(
         "Expected tensor (%s) to have known rank, but ndims == None." % t)
 
+
 def _check_static_batch_beam_maybe(shape, batch_size, beam_width):
   """Raises an exception if dimensions are known statically and can not be
   reshaped to [batch_size, beam_size, -1].
@@ -205,6 +207,7 @@ def _check_static_batch_beam_maybe(shape, batch_size, beam_width):
     return False
   return True
 
+
 def _check_batch_beam(t, batch_size, beam_width):
   """Returns an Assert operation checking that the elements of the stacked
   TensorArray can be reshaped to [batch_size, beam_size, -1]. At this point,
@@ -229,70 +232,30 @@ def _check_batch_beam(t, batch_size, beam_width):
   return control_flow_ops.Assert(condition, [error_message])
 
 
+class BeamSearchDecoderMixin(object):
+  """BeamSearchDecoderMixin contains the common methods for BeamSearchDecoder.
 
-class BeamSearchDecoder(decoder.Decoder):
-  """BeamSearch sampling decoder.
-
-    **NOTE** If you are using the `BeamSearchDecoder` with a cell wrapped in
-    `AttentionWrapper`, then you must ensure that:
-
-    - The encoder output has been tiled to `beam_width` via
-      `tf.contrib.seq2seq.tile_batch` (NOT `tf.tile`).
-    - The `batch_size` argument passed to the `zero_state` method of this
-      wrapper is equal to `true_batch_size * beam_width`.
-    - The initial state created with `zero_state` above contains a
-      `cell_state` value containing properly tiled final state from the
-      encoder.
-
-    An example:
-
-    ```
-    tiled_encoder_outputs = tf.contrib.seq2seq.tile_batch(
-        encoder_outputs, multiplier=beam_width)
-    tiled_encoder_final_state = tf.contrib.seq2seq.tile_batch(
-        encoder_final_state, multiplier=beam_width)
-    tiled_sequence_length = tf.contrib.seq2seq.tile_batch(
-        sequence_length, multiplier=beam_width)
-    attention_mechanism = MyFavoriteAttentionMechanism(
-        num_units=attention_depth,
-        memory=tiled_inputs,
-        memory_sequence_length=tiled_sequence_length)
-    attention_cell = AttentionWrapper(cell, attention_mechanism, ...)
-    decoder_initial_state = attention_cell.zero_state(
-        dtype, batch_size=true_batch_size * beam_width)
-    decoder_initial_state = decoder_initial_state.clone(
-        cell_state=tiled_encoder_final_state)
-    ```
-
-    Meanwhile, with `AttentionWrapper`, coverage penalty is suggested to use
-    when computing scores(https://arxiv.org/pdf/1609.08144.pdf). It encourages
-    the translation to cover all inputs.
+  It is expected to be used a base class for concrete BeamSearchDecoder. Since
+  this is a mixin class, it is expected to be used together with other class as
+  base.
   """
 
   def __init__(self,
                cell,
-               embedding,
-               start_tokens,
-               end_token,
-               initial_state,
                beam_width,
                output_layer=None,
                length_penalty_weight=0.0,
                coverage_penalty_weight=0.0,
-               reorder_tensor_arrays=True):
-    """Initialize the BeamSearchDecoder.
+               reorder_tensor_arrays=True,
+               **kwargs):
+    """Initialize the BeamSearchDecoderMixin.
 
     Args:
       cell: An `RNNCell` instance.
-      embedding: A callable that takes a vector tensor of `ids` (argmax ids),
-        or the `params` argument for `embedding_lookup`.
-      start_tokens: `int32` vector shaped `[batch_size]`, the start tokens.
-      end_token: `int32` scalar, the token that marks end of decoding.
-      initial_state: A (possibly nested tuple of...) tensors and TensorArrays.
       beam_width:  Python integer, the number of beams.
-      output_layer: (Optional) An instance of `tf.layers.Layer`, i.e.,
-        `tf.layers.Dense`.  Optional layer to apply to the RNN output prior
-        to storing the result or sampling.
+      output_layer: (Optional) An instance of `tf.keras.layers.Layer`, i.e.,
+        `tf.keras.layers.Dense`.  Optional layer to apply to the RNN output
+        prior to storing the result or sampling.
       length_penalty_weight: Float weight to penalize length. Disabled with 0.0.
       coverage_penalty_weight: Float weight to penalize the coverage of source
         sentence. Disabled with 0.0.
@@ -302,59 +265,35 @@ class BeamSearchDecoder(decoder.Decoder):
         Otherwise, the `TensorArray` will be returned as is. Set this flag to
         `False` if the cell state contains `TensorArray`s that are not amenable
         to reordering.
+      **kwargs: Dict, other keyword arguments for parent class.
 
     Raises:
       TypeError: if `cell` is not an instance of `RNNCell`,
-        or `output_layer` is not an instance of `tf.layers.Layer`.
-      ValueError: If `start_tokens` is not a vector or
-        `end_token` is not a scalar.
+        or `output_layer` is not an instance of `tf.keras.layers.Layer`.
     """
     rnn_cell_impl.assert_like_rnncell("cell", cell)  # pylint: disable=protected-access
     if (output_layer is not None and
-        not isinstance(output_layer, layers_base.Layer)):
+        not isinstance(output_layer, layers.Layer)):
       raise TypeError(
           "output_layer must be a Layer, received: %s" % type(output_layer))
     self._cell = cell
     self._output_layer = output_layer
     self._reorder_tensor_arrays = reorder_tensor_arrays
 
-    if callable(embedding):
-      self._embedding_fn = embedding
-    else:
-      self._embedding_fn = (
-          lambda ids: embedding_ops.embedding_lookup(embedding, ids))
-
-    self._start_tokens = ops.convert_to_tensor(
-        start_tokens, dtype=dtypes.int32, name="start_tokens")
-    if self._start_tokens.get_shape().ndims != 1:
-      raise ValueError("start_tokens must be a vector")
-    self._end_token = ops.convert_to_tensor(
-        end_token, dtype=dtypes.int32, name="end_token")
-    if self._end_token.get_shape().ndims != 0:
-      raise ValueError("end_token must be a scalar")
-
-    self._batch_size = array_ops.size(start_tokens)
+    self._start_tokens = None
+    self._end_token = None
+    self._batch_size = None
     self._beam_width = beam_width
     self._length_penalty_weight = length_penalty_weight
     self._coverage_penalty_weight = coverage_penalty_weight
-    self._initial_cell_state = nest.map_structure(
-        self._maybe_split_batch_beams, initial_state, self._cell.state_size)
-    self._start_tokens = array_ops.tile(
-        array_ops.expand_dims(self._start_tokens, 1), [1, self._beam_width])
-    self._start_inputs = self._embedding_fn(self._start_tokens)
-
-    self._finished = array_ops.one_hot(
-        array_ops.zeros([self._batch_size], dtype=dtypes.int32),
-        depth=self._beam_width,
-        on_value=False,
-        off_value=True,
-        dtype=dtypes.bool)
+    super(BeamSearchDecoderMixin, self).__init__(**kwargs)
 
   @property
   def batch_size(self):
     return self._batch_size
 
   def _rnn_output_size(self):
+    """Get the output shape from the RNN layer."""
     size = self._cell.output_size
     if self._output_layer is None:
       return size
@@ -393,50 +332,6 @@ class BeamSearchDecoder(decoder.Decoder):
         predicted_ids=tensor_shape.TensorShape([self._beam_width]),
         parent_ids=tensor_shape.TensorShape([self._beam_width]))
 
-  @property
-  def output_dtype(self):
-    # Assume the dtype of the cell is the output_size structure
-    # containing the input_state's first component's dtype.
-    # Return that structure and int32 (the id)
-    dtype = nest.flatten(self._initial_cell_state)[0].dtype
-    return BeamSearchDecoderOutput(
-        scores=nest.map_structure(lambda _: dtype, self._rnn_output_size()),
-        predicted_ids=dtypes.int32,
-        parent_ids=dtypes.int32)
-
-  def initialize(self, name=None):
-    """Initialize the decoder.
-
-    Args:
-      name: Name scope for any created operations.
-
-    Returns:
-      `(finished, start_inputs, initial_state)`.
-    """
-    finished, start_inputs = self._finished, self._start_inputs
-
-    dtype = nest.flatten(self._initial_cell_state)[0].dtype
-    log_probs = array_ops.one_hot(  # shape(batch_sz, beam_sz)
-        array_ops.zeros([self._batch_size], dtype=dtypes.int32),
-        depth=self._beam_width,
-        on_value=ops.convert_to_tensor(0.0, dtype=dtype),
-        off_value=ops.convert_to_tensor(-np.Inf, dtype=dtype),
-        dtype=dtype)
-    init_attention_probs = get_attention_probs(
-        self._initial_cell_state, self._coverage_penalty_weight)
-    if init_attention_probs is None:
-      init_attention_probs = ()
-
-    initial_state = BeamSearchDecoderState(
-        cell_state=self._initial_cell_state,
-        log_probs=log_probs,
-        finished=finished,
-        lengths=array_ops.zeros(
-            [self._batch_size, self._beam_width], dtype=dtypes.int64),
-        accumulated_attention_probs=init_attention_probs)
-
-    return (finished, start_inputs, initial_state)
-
   def finalize(self, outputs, final_state, sequence_lengths):
     """Finalize and return the predicted_ids.
 
@@ -562,7 +457,7 @@ class BeamSearchDecoder(decoder.Decoder):
     """
     if isinstance(t, tensor_array_ops.TensorArray):
       return t
-    _check_maybe(t)
+    _check_ndims(t)
     if t.shape.ndims >= 1:
       return self._split_batch_beams(t, s)
     else:
@@ -586,7 +481,7 @@ class BeamSearchDecoder(decoder.Decoder):
     """
     if isinstance(t, tensor_array_ops.TensorArray):
       return t
-    _check_maybe(t)
+    _check_ndims(t)
     if t.shape.ndims >= 2:
       return self._merge_batch_beams(t, s)
     else:
@@ -609,11 +504,18 @@ class BeamSearchDecoder(decoder.Decoder):
     if not isinstance(t, tensor_array_ops.TensorArray):
       return t
     # pylint: disable=protected-access
-    if (not t._infer_shape or not t._element_shape
-        or t._element_shape[0].ndims is None
-        or t._element_shape[0].ndims < 1):
+    # This is a bad hack due to the implementation detail of eager/graph TA.
+    # TODO(b/124374427): Update this to use public property of TensorArray.
+    if context.executing_eagerly():
+      element_shape = t._element_shape
+    else:
+      element_shape = t._element_shape[0]
+    if (not t._infer_shape
+        or not t._element_shape
+        or element_shape.ndims is None
+        or element_shape.ndims < 1):
       shape = (
-          t._element_shape[0] if t._infer_shape and t._element_shape
+          element_shape if t._infer_shape and t._element_shape
           else tensor_shape.TensorShape(None))
       tf_logging.warn("The TensorArray %s in the cell state is not amenable to "
                       "sorting based on the beam search result. For a "
@@ -621,10 +523,10 @@ class BeamSearchDecoder(decoder.Decoder):
                       "defined and have at least a rank of 1, but saw shape: %s"
                       % (t.handle.name, shape))
       return t
-    shape = t._element_shape[0]
     # pylint: enable=protected-access
     if not _check_static_batch_beam_maybe(
-        shape, tensor_util.constant_value(self._batch_size), self._beam_width):
+        element_shape, tensor_util.constant_value(self._batch_size),
+        self._beam_width):
       return t
     t = t.stack()
     with ops.control_dependencies(
@@ -684,6 +586,359 @@ class BeamSearchDecoder(decoder.Decoder):
     return (beam_search_output, beam_search_state, next_inputs, finished)
 
 
+class BeamSearchDecoder(BeamSearchDecoderMixin, decoder.Decoder):
+  # Note that the inheritance hierarchy is important here. The Mixin has to be
+  # the first parent class since we will use super().__init__(), and Mixin which
+  # is a object will properly invoke the __init__ method of other parent class.
+  """BeamSearch sampling decoder.
+
+    **NOTE** If you are using the `BeamSearchDecoder` with a cell wrapped in
+    `AttentionWrapper`, then you must ensure that:
+
+    - The encoder output has been tiled to `beam_width` via
+      `tf.contrib.seq2seq.tile_batch` (NOT `tf.tile`).
+    - The `batch_size` argument passed to the `zero_state` method of this
+      wrapper is equal to `true_batch_size * beam_width`.
+    - The initial state created with `zero_state` above contains a
+      `cell_state` value containing properly tiled final state from the
+      encoder.
+
+    An example:
+
+    ```
+    tiled_encoder_outputs = tf.contrib.seq2seq.tile_batch(
+        encoder_outputs, multiplier=beam_width)
+    tiled_encoder_final_state = tf.contrib.seq2seq.tile_batch(
+        encoder_final_state, multiplier=beam_width)
+    tiled_sequence_length = tf.contrib.seq2seq.tile_batch(
+        sequence_length, multiplier=beam_width)
+    attention_mechanism = MyFavoriteAttentionMechanism(
+        num_units=attention_depth,
+        memory=tiled_inputs,
+        memory_sequence_length=tiled_sequence_length)
+    attention_cell = AttentionWrapper(cell, attention_mechanism, ...)
+    decoder_initial_state = attention_cell.zero_state(
+        dtype, batch_size=true_batch_size * beam_width)
+    decoder_initial_state = decoder_initial_state.clone(
+        cell_state=tiled_encoder_final_state)
+    ```
+
+    Meanwhile, with `AttentionWrapper`, coverage penalty is suggested to use
+    when computing scores (https://arxiv.org/pdf/1609.08144.pdf). It encourages
+    the decoder to cover all inputs.
+  """
+
+  def __init__(self,
+               cell,
+               embedding,
+               start_tokens,
+               end_token,
+               initial_state,
+               beam_width,
+               output_layer=None,
+               length_penalty_weight=0.0,
+               coverage_penalty_weight=0.0,
+               reorder_tensor_arrays=True):
+    """Initialize the BeamSearchDecoder.
+
+    Args:
+      cell: An `RNNCell` instance.
+      embedding: A callable that takes a vector tensor of `ids` (argmax ids),
+        or the `params` argument for `embedding_lookup`.
+      start_tokens: `int32` vector shaped `[batch_size]`, the start tokens.
+      end_token: `int32` scalar, the token that marks end of decoding.
+      initial_state: A (possibly nested tuple of...) tensors and TensorArrays.
+      beam_width:  Python integer, the number of beams.
+      output_layer: (Optional) An instance of `tf.keras.layers.Layer`, i.e.,
+        `tf.keras.layers.Dense`.  Optional layer to apply to the RNN output
+        prior to storing the result or sampling.
+      length_penalty_weight: Float weight to penalize length. Disabled with 0.0.
+      coverage_penalty_weight: Float weight to penalize the coverage of source
+        sentence. Disabled with 0.0.
+      reorder_tensor_arrays: If `True`, `TensorArray`s' elements within the cell
+        state will be reordered according to the beam search path. If the
+        `TensorArray` can be reordered, the stacked form will be returned.
+        Otherwise, the `TensorArray` will be returned as is. Set this flag to
+        `False` if the cell state contains `TensorArray`s that are not amenable
+        to reordering.
+
+    Raises:
+      TypeError: if `cell` is not an instance of `RNNCell`,
+        or `output_layer` is not an instance of `tf.keras.layers.Layer`.
+      ValueError: If `start_tokens` is not a vector or
+        `end_token` is not a scalar.
+    """
+    super(BeamSearchDecoder, self).__init__(
+        cell,
+        beam_width,
+        output_layer=output_layer,
+        length_penalty_weight=length_penalty_weight,
+        coverage_penalty_weight=coverage_penalty_weight,
+        reorder_tensor_arrays=reorder_tensor_arrays)
+
+    if callable(embedding):
+      self._embedding_fn = embedding
+    else:
+      self._embedding_fn = (
+          lambda ids: embedding_ops.embedding_lookup(embedding, ids))
+
+    self._start_tokens = ops.convert_to_tensor(
+        start_tokens, dtype=dtypes.int32, name="start_tokens")
+    if self._start_tokens.get_shape().ndims != 1:
+      raise ValueError("start_tokens must be a vector")
+    self._end_token = ops.convert_to_tensor(
+        end_token, dtype=dtypes.int32, name="end_token")
+    if self._end_token.get_shape().ndims != 0:
+      raise ValueError("end_token must be a scalar")
+
+    self._batch_size = array_ops.size(start_tokens)
+    self._initial_cell_state = nest.map_structure(
+        self._maybe_split_batch_beams, initial_state, self._cell.state_size)
+    self._start_tokens = array_ops.tile(
+        array_ops.expand_dims(self._start_tokens, 1), [1, self._beam_width])
+    self._start_inputs = self._embedding_fn(self._start_tokens)
+
+    self._finished = array_ops.one_hot(
+        array_ops.zeros([self._batch_size], dtype=dtypes.int32),
+        depth=self._beam_width,
+        on_value=False,
+        off_value=True,
+        dtype=dtypes.bool)
+
+  def initialize(self, name=None):
+    """Initialize the decoder.
+
+    Args:
+      name: Name scope for any created operations.
+
+    Returns:
+      `(finished, start_inputs, initial_state)`.
+    """
+    finished, start_inputs = self._finished, self._start_inputs
+
+    dtype = nest.flatten(self._initial_cell_state)[0].dtype
+    log_probs = array_ops.one_hot(  # shape(batch_sz, beam_sz)
+        array_ops.zeros([self._batch_size], dtype=dtypes.int32),
+        depth=self._beam_width,
+        on_value=ops.convert_to_tensor(0.0, dtype=dtype),
+        off_value=ops.convert_to_tensor(-np.Inf, dtype=dtype),
+        dtype=dtype)
+    init_attention_probs = get_attention_probs(
+        self._initial_cell_state, self._coverage_penalty_weight)
+    if init_attention_probs is None:
+      init_attention_probs = ()
+
+    initial_state = BeamSearchDecoderState(
+        cell_state=self._initial_cell_state,
+        log_probs=log_probs,
+        finished=finished,
+        lengths=array_ops.zeros(
+            [self._batch_size, self._beam_width], dtype=dtypes.int64),
+        accumulated_attention_probs=init_attention_probs)
+
+    return (finished, start_inputs, initial_state)
+
+  @property
+  def output_dtype(self):
+    # Assume the dtype of the cell is the output_size structure
+    # containing the input_state's first component's dtype.
+    # Return that structure and int32 (the id)
+    dtype = nest.flatten(self._initial_cell_state)[0].dtype
+    return BeamSearchDecoderOutput(
+        scores=nest.map_structure(lambda _: dtype, self._rnn_output_size()),
+        predicted_ids=dtypes.int32,
+        parent_ids=dtypes.int32)
+
+
+class BeamSearchDecoderV2(BeamSearchDecoderMixin, decoder.BaseDecoder):
+  # Note that the inheritance hierarchy is important here. The Mixin has to be
+  # the first parent class since we will use super().__init__(), and Mixin which
+  # is a object will properly invoke the __init__ method of other parent class.
+  """BeamSearch sampling decoder.
+
+    **NOTE** If you are using the `BeamSearchDecoder` with a cell wrapped in
+    `AttentionWrapper`, then you must ensure that:
+
+    - The encoder output has been tiled to `beam_width` via
+      `tf.contrib.seq2seq.tile_batch` (NOT `tf.tile`).
+    - The `batch_size` argument passed to the `zero_state` method of this
+      wrapper is equal to `true_batch_size * beam_width`.
+    - The initial state created with `zero_state` above contains a
+      `cell_state` value containing properly tiled final state from the
+      encoder.
+
+    An example:
+
+    ```
+    tiled_encoder_outputs = tf.contrib.seq2seq.tile_batch(
+        encoder_outputs, multiplier=beam_width)
+    tiled_encoder_final_state = tf.contrib.seq2seq.tile_batch(
+        encoder_final_state, multiplier=beam_width)
+    tiled_sequence_length = tf.contrib.seq2seq.tile_batch(
+        sequence_length, multiplier=beam_width)
+    attention_mechanism = MyFavoriteAttentionMechanism(
+        num_units=attention_depth,
+        memory=tiled_inputs,
+        memory_sequence_length=tiled_sequence_length)
+    attention_cell = AttentionWrapper(cell, attention_mechanism, ...)
+    decoder_initial_state = attention_cell.zero_state(
+        dtype, batch_size=true_batch_size * beam_width)
+    decoder_initial_state = decoder_initial_state.clone(
+        cell_state=tiled_encoder_final_state)
+    ```
+
+    Meanwhile, with `AttentionWrapper`, coverage penalty is suggested to use
+    when computing scores (https://arxiv.org/pdf/1609.08144.pdf). It encourages
+    the decoding to cover all inputs.
+  """
+
+  def __init__(self,
+               cell,
+               beam_width,
+               embedding_fn=None,
+               output_layer=None,
+               length_penalty_weight=0.0,
+               coverage_penalty_weight=0.0,
+               reorder_tensor_arrays=True,
+               **kwargs):
+    """Initialize the BeamSearchDecoderV2.
+
+    Args:
+      cell: An `RNNCell` instance.
+      beam_width:  Python integer, the number of beams.
+      embedding_fn: A callable that takes a vector tensor of `ids` (argmax ids).
+      output_layer: (Optional) An instance of `tf.keras.layers.Layer`, i.e.,
+        `tf.keras.layers.Dense`.  Optional layer to apply to the RNN output
+        prior to storing the result or sampling.
+      length_penalty_weight: Float weight to penalize length. Disabled with 0.0.
+      coverage_penalty_weight: Float weight to penalize the coverage of source
+        sentence. Disabled with 0.0.
+      reorder_tensor_arrays: If `True`, `TensorArray`s' elements within the cell
+        state will be reordered according to the beam search path. If the
+        `TensorArray` can be reordered, the stacked form will be returned.
+        Otherwise, the `TensorArray` will be returned as is. Set this flag to
+        `False` if the cell state contains `TensorArray`s that are not amenable
+        to reordering.
+      **kwargs: Dict, other keyword arguments for initialization.
+
+    Raises:
+      TypeError: if `cell` is not an instance of `RNNCell`,
+        or `output_layer` is not an instance of `tf.keras.layers.Layer`.
+    """
+    super(BeamSearchDecoderV2, self).__init__(
+        cell,
+        beam_width,
+        output_layer=output_layer,
+        length_penalty_weight=length_penalty_weight,
+        coverage_penalty_weight=coverage_penalty_weight,
+        reorder_tensor_arrays=reorder_tensor_arrays,
+        **kwargs)
+
+    if embedding_fn is None or callable(embedding_fn):
+      self._embedding_fn = embedding_fn
+    else:
+      raise ValueError("embedding_fn is expected to be a callable, got %s" %
+                       type(embedding_fn))
+
+  def initialize(self,
+                 embedding,
+                 start_tokens,
+                 end_token,
+                 initial_state):
+    """Initialize the decoder.
+
+    Args:
+      embedding: A tensor from the embedding layer output, which is the
+        `params` argument for `embedding_lookup`.
+      start_tokens: `int32` vector shaped `[batch_size]`, the start tokens.
+      end_token: `int32` scalar, the token that marks end of decoding.
+      initial_state: A (possibly nested tuple of...) tensors and TensorArrays.
+    Returns:
+      `(finished, start_inputs, initial_state)`.
+    Raises:
+      ValueError: If `start_tokens` is not a vector or `end_token` is not a
+        scalar.
+    """
+    if embedding is not None and self._embedding_fn is not None:
+      raise ValueError(
+          "embedding and embedding_fn cannot be provided at same time")
+    elif embedding is not None:
+      self._embedding_fn = (
+          lambda ids: embedding_ops.embedding_lookup(embedding, ids))
+
+    self._start_tokens = ops.convert_to_tensor(
+        start_tokens, dtype=dtypes.int32, name="start_tokens")
+    if self._start_tokens.get_shape().ndims != 1:
+      raise ValueError("start_tokens must be a vector")
+    self._end_token = ops.convert_to_tensor(
+        end_token, dtype=dtypes.int32, name="end_token")
+    if self._end_token.get_shape().ndims != 0:
+      raise ValueError("end_token must be a scalar")
+
+    self._batch_size = array_ops.size(start_tokens)
+    self._initial_cell_state = nest.map_structure(
+        self._maybe_split_batch_beams, initial_state, self._cell.state_size)
+    self._start_tokens = array_ops.tile(
+        array_ops.expand_dims(self._start_tokens, 1), [1, self._beam_width])
+    self._start_inputs = self._embedding_fn(self._start_tokens)
+
+    self._finished = array_ops.one_hot(
+        array_ops.zeros([self._batch_size], dtype=dtypes.int32),
+        depth=self._beam_width,
+        on_value=False,
+        off_value=True,
+        dtype=dtypes.bool)
+
+    finished, start_inputs = self._finished, self._start_inputs
+
+    dtype = nest.flatten(self._initial_cell_state)[0].dtype
+    log_probs = array_ops.one_hot(  # shape(batch_sz, beam_sz)
+        array_ops.zeros([self._batch_size], dtype=dtypes.int32),
+        depth=self._beam_width,
+        on_value=ops.convert_to_tensor(0.0, dtype=dtype),
+        off_value=ops.convert_to_tensor(-np.Inf, dtype=dtype),
+        dtype=dtype)
+    init_attention_probs = get_attention_probs(
+        self._initial_cell_state, self._coverage_penalty_weight)
+    if init_attention_probs is None:
+      init_attention_probs = ()
+
+    initial_state = BeamSearchDecoderState(
+        cell_state=self._initial_cell_state,
+        log_probs=log_probs,
+        finished=finished,
+        lengths=array_ops.zeros(
+            [self._batch_size, self._beam_width], dtype=dtypes.int64),
+        accumulated_attention_probs=init_attention_probs)
+
+    return (finished, start_inputs, initial_state)
+
+  @property
+  def output_dtype(self):
+    # Assume the dtype of the cell is the output_size structure
+    # containing the input_state's first component's dtype.
+    # Return that structure and int32 (the id)
+    dtype = nest.flatten(self._initial_cell_state)[0].dtype
+    return BeamSearchDecoderOutput(
+        scores=nest.map_structure(lambda _: dtype, self._rnn_output_size()),
+        predicted_ids=dtypes.int32,
+        parent_ids=dtypes.int32)
+
+  def call(self, embeddning, start_tokens, end_token, initial_state, **kwargs):
+    init_kwargs = kwargs
+    init_kwargs["start_tokens"] = start_tokens
+    init_kwargs["end_token"] = end_token
+    init_kwargs["initial_state"] = initial_state
+    return decoder.dynamic_decode(self,
+                                  output_time_major=self.output_time_major,
+                                  impute_finished=self.impute_finished,
+                                  maximum_iterations=self.maximum_iterations,
+                                  parallel_iterations=self.parallel_iterations,
+                                  swap_memory=self.swap_memory,
+                                  decoder_init_input=embeddning,
+                                  decoder_init_kwargs=init_kwargs)
+
+
 def _beam_search_step(time, logits, next_cell_state, beam_state, batch_size,
                       beam_width, end_token, length_penalty_weight,
                       coverage_penalty_weight):
@@ -921,6 +1176,7 @@ def _get_scores(log_probs, sequence_lengths, length_penalty_weight,
   """
   length_penalty_ = _length_penalty(
       sequence_lengths=sequence_lengths, penalty_factor=length_penalty_weight)
+  length_penalty_ = math_ops.cast(length_penalty_, dtype=log_probs.dtype)
   scores = log_probs / length_penalty_
 
   coverage_penalty_weight = ops.convert_to_tensor(
@@ -1067,7 +1323,7 @@ def _maybe_tensor_gather_helper(gather_indices, gather_from, batch_size,
   """
   if isinstance(gather_from, tensor_array_ops.TensorArray):
     return gather_from
-  _check_maybe(gather_from)
+  _check_ndims(gather_from)
   if gather_from.shape.ndims >= len(gather_shape):
     return _tensor_gather_helper(
         gather_indices=gather_indices,
diff --git a/tensorflow/contrib/seq2seq/python/ops/decoder.py b/tensorflow/contrib/seq2seq/python/ops/decoder.py
index f58268eff525a4b592c79acb32207e1a3f62bdc7..33f7bac8159401175ce57c0463fff1398c1dd9bb 100644
--- a/tensorflow/contrib/seq2seq/python/ops/decoder.py
+++ b/tensorflow/contrib/seq2seq/python/ops/decoder.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
+from tensorflow.python.keras import layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import control_flow_util
@@ -135,6 +136,127 @@ class Decoder(object):
     return False
 
 
+class BaseDecoder(layers.Layer):
+  """An RNN Decoder that is based on a Keras layer.
+
+  Concepts used by this interface:
+  - `inputs`: (structure of) tensors and TensorArrays that is passed as input to
+    the RNNCell composing the decoder, at each time step.
+  - `state`: (structure of) tensors and TensorArrays that is passed to the
+    RNNCell instance as the state.
+  - `memory`: (sturecute of) tensors that is usually the full output of the
+    encoder, which will be used for the attention wrapper for the RNNCell.
+  - `finished`: boolean tensor telling whether each sequence in the batch is
+    finished.
+  - `outputs`: Instance of BasicDecoderOutput. Result of the decoding, at each
+    time step.
+  """
+
+  def __init__(self,
+               output_time_major=False,
+               impute_finished=False,
+               maximum_iterations=None,
+               parallel_iterations=32,
+               swap_memory=False,
+               **kwargs):
+    self.output_time_major = output_time_major
+    self.impute_finished = impute_finished
+    self.maximum_iterations = maximum_iterations
+    self.parallel_iterations = parallel_iterations
+    self.swap_memory = swap_memory
+    super(BaseDecoder, self).__init__(**kwargs)
+
+  def call(self, inputs, initial_state=None, **kwargs):
+    init_kwargs = kwargs
+    init_kwargs["initial_state"] = initial_state
+    return dynamic_decode(self,
+                          output_time_major=self.output_time_major,
+                          impute_finished=self.impute_finished,
+                          maximum_iterations=self.maximum_iterations,
+                          parallel_iterations=self.parallel_iterations,
+                          swap_memory=self.swap_memory,
+                          decoder_init_input=inputs,
+                          decoder_init_kwargs=init_kwargs)
+
+  @property
+  def batch_size(self):
+    """The batch size of input values."""
+    raise NotImplementedError
+
+  @property
+  def output_size(self):
+    """A (possibly nested tuple of...) integer[s] or `TensorShape` object[s]."""
+    raise NotImplementedError
+
+  @property
+  def output_dtype(self):
+    """A (possibly nested tuple of...) dtype[s]."""
+    raise NotImplementedError
+
+  def initialize(self, inputs, initial_state=None, **kwargs):
+    """Called before any decoding iterations.
+
+    This methods must compute initial input values and initial state.
+
+    Args:
+      inputs: (structure of) tensors that contains the input for the decoder. In
+        the normal case, its a tensor with shape [batch, timestep, embedding].
+      initial_state: (structure of) tensors that contains the initial state for
+        the RNNCell.
+      **kwargs: Other arguments that are passed in from layer.call() method. It
+        could contains item like input sequence_length, or masking for input.
+
+    Returns:
+      `(finished, initial_inputs, initial_state)`: initial values of
+      'finished' flags, inputs and state.
+    """
+    raise NotImplementedError
+
+  def step(self, time, inputs, state):
+    """Called per step of decoding (but only once for dynamic decoding).
+
+    Args:
+      time: Scalar `int32` tensor. Current step number.
+      inputs: RNNCell input (possibly nested tuple of) tensor[s] for this time
+        step.
+      state: RNNCell state (possibly nested tuple of) tensor[s] from previous
+        time step.
+
+    Returns:
+      `(outputs, next_state, next_inputs, finished)`: `outputs` is an object
+      containing the decoder output, `next_state` is a (structure of) state
+      tensors and TensorArrays, `next_inputs` is the tensor that should be used
+      as input for the next step, `finished` is a boolean tensor telling whether
+      the sequence is complete, for each sequence in the batch.
+    """
+    raise NotImplementedError
+
+  def finalize(self, outputs, final_state, sequence_lengths):
+    raise NotImplementedError
+
+  @property
+  def tracks_own_finished(self):
+    """Describes whether the Decoder keeps track of finished states.
+
+    Most decoders will emit a true/false `finished` value independently
+    at each time step.  In this case, the `dynamic_decode` function keeps track
+    of which batch entries are already finished, and performs a logical OR to
+    insert new batches to the finished set.
+
+    Some decoders, however, shuffle batches / beams between time steps and
+    `dynamic_decode` will mix up the finished state across these entries because
+    it does not track the reshuffle across time steps.  In this case, it is
+    up to the decoder to declare that it will keep track of its own finished
+    state by setting this property to `True`.
+
+    Returns:
+      Python bool.
+    """
+    return False
+
+  # TODO(scottzhu): Add build/get_config/from_config and other layer methods.
+
+
 def _create_zero_outputs(size, dtype, batch_size):
   """Create a zero outputs Tensor structure."""
   def _create(s, d):
@@ -149,7 +271,8 @@ def dynamic_decode(decoder,
                    maximum_iterations=None,
                    parallel_iterations=32,
                    swap_memory=False,
-                   scope=None):
+                   scope=None,
+                   **kwargs):
   """Perform dynamic decoding with `decoder`.
 
   Calls initialize() once and step() repeatedly on the Decoder object.
@@ -171,6 +294,9 @@ def dynamic_decode(decoder,
     parallel_iterations: Argument passed to `tf.while_loop`.
     swap_memory: Argument passed to `tf.while_loop`.
     scope: Optional variable scope to use.
+    **kwargs: dict, other keyword arguments for dynamic_decode. It might contain
+      arguments for `BaseDecoder` to initialize, which takes all tensor inputs
+      during call().
 
   Returns:
     `(final_outputs, final_state, final_sequence_lengths)`.
@@ -179,7 +305,7 @@ def dynamic_decode(decoder,
     TypeError: if `decoder` is not an instance of `Decoder`.
     ValueError: if `maximum_iterations` is provided but is not a scalar.
   """
-  if not isinstance(decoder, Decoder):
+  if not isinstance(decoder, (Decoder, BaseDecoder)):
     raise TypeError("Expected decoder to be type Decoder, but saw: %s" %
                     type(decoder))
 
@@ -204,7 +330,14 @@ def dynamic_decode(decoder,
       if maximum_iterations.get_shape().ndims != 0:
         raise ValueError("maximum_iterations must be a scalar")
 
-    initial_finished, initial_inputs, initial_state = decoder.initialize()
+    if isinstance(decoder, Decoder):
+      initial_finished, initial_inputs, initial_state = decoder.initialize()
+    else:
+      # For BaseDecoder that takes tensor inputs during call.
+      decoder_init_input = kwargs.pop("decoder_init_input", None)
+      decoder_init_kwargs = kwargs.pop("decoder_init_kwargs", {})
+      initial_finished, initial_inputs, initial_state = decoder.initialize(
+          decoder_init_input, **decoder_init_kwargs)
 
     zero_outputs = _create_zero_outputs(decoder.output_size,
                                         decoder.output_dtype,
@@ -222,7 +355,7 @@ def dynamic_decode(decoder,
     def _shape(batch_size, from_shape):
       if (not isinstance(from_shape, tensor_shape.TensorShape) or
           from_shape.ndims == 0):
-        return tensor_shape.TensorShape(None)
+        return None
       else:
         batch_size = tensor_util.constant_value(
             ops.convert_to_tensor(
diff --git a/tensorflow/contrib/seq2seq/python/ops/helper.py b/tensorflow/contrib/seq2seq/python/ops/helper.py
index 3245cc5e72154289ea3ba000b9a30586a7ad03a9..033c2eb0801d5a51ee937f5e960faa91a6f1ae54 100644
--- a/tensorflow/contrib/seq2seq/python/ops/helper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/helper.py
@@ -32,9 +32,8 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import tensor_array_ops
-from tensorflow.python.ops.distributions import bernoulli
-from tensorflow.python.ops.distributions import categorical
 from tensorflow.python.util import nest
 
 __all__ = [
@@ -51,6 +50,68 @@ __all__ = [
 _transpose_batch_time = decoder._transpose_batch_time  # pylint: disable=protected-access
 
 
+# The following sample functions (_call_sampler, bernoulli_sample,
+# categorical_sample) mimic TensorFlow Probability distribution semantics.
+
+
+def _call_sampler(sample_n_fn, sample_shape, name=None):
+  """Reshapes vector of samples."""
+  with ops.name_scope(name, "call_sampler", values=[sample_shape]):
+    sample_shape = ops.convert_to_tensor(
+        sample_shape, dtype=dtypes.int32, name="sample_shape")
+    # Ensure sample_shape is a vector (vs just a scalar).
+    pad = math_ops.cast(math_ops.equal(array_ops.rank(sample_shape), 0),
+                        dtypes.int32)
+    sample_shape = array_ops.reshape(
+        sample_shape,
+        array_ops.pad(array_ops.shape(sample_shape),
+                      paddings=[[pad, 0]],
+                      constant_values=1))
+    samples = sample_n_fn(math_ops.reduce_prod(sample_shape))
+    batch_event_shape = array_ops.shape(samples)[1:]
+    final_shape = array_ops.concat([sample_shape, batch_event_shape], 0)
+    return array_ops.reshape(samples, final_shape)
+
+
+def bernoulli_sample(probs=None, logits=None, dtype=dtypes.int32,
+                     sample_shape=(), seed=None):
+  """Samples from Bernoulli distribution."""
+  if probs is None:
+    probs = math_ops.sigmoid(logits, name="probs")
+  else:
+    probs = ops.convert_to_tensor(probs, name="probs")
+  batch_shape_tensor = array_ops.shape(probs)
+  def _sample_n(n):
+    """Sample vector of Bernoullis."""
+    new_shape = array_ops.concat([[n], batch_shape_tensor], 0)
+    uniform = random_ops.random_uniform(
+        new_shape, seed=seed, dtype=probs.dtype)
+    return math_ops.cast(math_ops.less(uniform, probs), dtype)
+  return _call_sampler(_sample_n, sample_shape)
+
+
+def categorical_sample(logits, dtype=dtypes.int32,
+                       sample_shape=(), seed=None):
+  """Samples from categorical distribution."""
+  logits = ops.convert_to_tensor(logits, name="logits")
+  event_size = array_ops.shape(logits)[-1]
+  batch_shape_tensor = array_ops.shape(logits)[:-1]
+  def _sample_n(n):
+    """Sample vector of categoricals."""
+    if logits.shape.ndims == 2:
+      logits_2d = logits
+    else:
+      logits_2d = array_ops.reshape(logits, [-1, event_size])
+    sample_dtype = dtypes.int64 if logits.dtype.size > 4 else dtypes.int32
+    draws = random_ops.multinomial(
+        logits_2d, n, seed=seed, output_dtype=sample_dtype)
+    draws = array_ops.reshape(
+        array_ops.transpose(draws),
+        array_ops.concat([[n], batch_shape_tensor], 0))
+    return math_ops.cast(draws, dtype)
+  return _call_sampler(_sample_n, sample_shape)
+
+
 def _unstack_ta(inp):
   return tensor_array_ops.TensorArray(
       dtype=inp.dtype, size=array_ops.shape(inp)[0],
@@ -307,14 +368,14 @@ class ScheduledEmbeddingTrainingHelper(TrainingHelper):
     with ops.name_scope(name, "ScheduledEmbeddingTrainingHelperSample",
                         [time, outputs, state]):
       # Return -1s where we did not sample, and sample_ids elsewhere
-      select_sampler = bernoulli.Bernoulli(
-          probs=self._sampling_probability, dtype=dtypes.bool)
-      select_sample = select_sampler.sample(
-          sample_shape=self.batch_size, seed=self._scheduling_seed)
-      sample_id_sampler = categorical.Categorical(logits=outputs)
+      select_sample = bernoulli_sample(
+          probs=self._sampling_probability,
+          dtype=dtypes.bool,
+          sample_shape=self.batch_size,
+          seed=self._scheduling_seed)
       return array_ops.where(
           select_sample,
-          sample_id_sampler.sample(seed=self._seed),
+          categorical_sample(logits=outputs, seed=self._seed),
           gen_array_ops.fill([self.batch_size], -1))
 
   def next_inputs(self, time, outputs, state, sample_ids, name=None):
@@ -425,8 +486,10 @@ class ScheduledOutputTrainingHelper(TrainingHelper):
   def sample(self, time, outputs, state, name=None):
     with ops.name_scope(name, "ScheduledOutputTrainingHelperSample",
                         [time, outputs, state]):
-      sampler = bernoulli.Bernoulli(probs=self._sampling_probability)
-      return sampler.sample(sample_shape=self.batch_size, seed=self._seed)
+      return bernoulli_sample(
+          probs=self._sampling_probability,
+          sample_shape=self.batch_size,
+          seed=self._seed)
 
   def next_inputs(self, time, outputs, state, sample_ids, name=None):
     with ops.name_scope(name, "ScheduledOutputTrainingHelperNextInputs",
@@ -610,8 +673,7 @@ class SampleEmbeddingHelper(GreedyEmbeddingHelper):
     else:
       logits = outputs / self._softmax_temperature
 
-    sample_id_sampler = categorical.Categorical(logits=logits)
-    sample_ids = sample_id_sampler.sample(seed=self._seed)
+    sample_ids = categorical_sample(logits=logits, seed=self._seed)
 
     return sample_ids
 
diff --git a/tensorflow/contrib/seq2seq/python/ops/loss.py b/tensorflow/contrib/seq2seq/python/ops/loss.py
index 39a6d2f58b140706a94d83273d3327edd1891368..0fbfd6187030f14ac105a18b3e09b7a42d4de32a 100644
--- a/tensorflow/contrib/seq2seq/python/ops/loss.py
+++ b/tensorflow/contrib/seq2seq/python/ops/loss.py
@@ -20,11 +20,12 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import ops
+from tensorflow.python.keras.losses import Loss
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 
-__all__ = ["sequence_loss"]
+__all__ = ["sequence_loss", "SequenceLoss"]
 
 
 def sequence_loss(logits,
@@ -32,16 +33,26 @@ def sequence_loss(logits,
                   weights,
                   average_across_timesteps=True,
                   average_across_batch=True,
+                  sum_over_timesteps=False,
+                  sum_over_batch=False,
                   softmax_loss_function=None,
                   name=None):
   """Weighted cross-entropy loss for a sequence of logits.
 
-  Depending on the values of `average_across_timesteps` and
-  `average_across_batch`, the return Tensor will have rank 0, 1, or 2 as these
-  arguments reduce the cross-entropy at each target, which has shape
-  `[batch_size, sequence_length]`, over their respective dimensions. For
-  example, if `average_across_timesteps` is `True` and `average_across_batch`
-  is `False`, then the return Tensor will have shape `[batch_size]`.
+  Depending on the values of `average_across_timesteps` / `sum_over_timesteps`
+  and `average_across_batch` / `sum_over_batch`, the return Tensor will have
+  rank 0, 1, or 2 as these arguments reduce the cross-entropy at each target,
+  which has shape `[batch_size, sequence_length]`, over their respective
+  dimensions. For example, if `average_across_timesteps` is `True` and
+  `average_across_batch` is `False`, then the return Tensor will have shape
+  `[batch_size]`.
+
+  Note that `average_across_timesteps` and `sum_over_timesteps` cannot be True
+  at same time. Same for `average_across_batch` and `sum_over_batch`.
+
+  The recommended loss reduction in tf 2.0 has been changed to sum_over, instead
+  of weighted average. User are recommend to use `sum_over_timesteps` and
+  `sum_over_batch` for reduction.
 
   Args:
     logits: A Tensor of shape
@@ -58,6 +69,12 @@ def sequence_loss(logits,
       dimension and divide the cost by the total label weight across timesteps.
     average_across_batch: If set, sum the cost across the batch dimension and
       divide the returned cost by the batch size.
+    sum_over_timesteps: If set, sum the cost across the sequence dimension and
+      divide the size of the sequence. Note that any element with 0 weights will
+      be excluded from size calculation.
+    sum_over_batch: if set, sum the cost across the batch dimension and divide
+      the total cost by the batch size. Not that any element with 0 weights will
+      be excluded from size calculation.
     softmax_loss_function: Function (labels, logits) -> loss-batch
       to be used instead of the standard softmax (the default if this is None).
       **Note that to avoid confusion, it is required for the function to accept
@@ -78,11 +95,15 @@ def sequence_loss(logits,
     raise ValueError("Logits must be a "
                      "[batch_size x sequence_length x logits] tensor")
   if len(targets.get_shape()) != 2:
-    raise ValueError("Targets must be a [batch_size x sequence_length] "
-                     "tensor")
+    raise ValueError("Targets must be a [batch_size x sequence_length] tensor")
   if len(weights.get_shape()) != 2:
-    raise ValueError("Weights must be a [batch_size x sequence_length] "
-                     "tensor")
+    raise ValueError("Weights must be a [batch_size x sequence_length] tensor")
+  if average_across_timesteps and sum_over_timesteps:
+    raise ValueError("average_across_timesteps and sum_over_timesteps cannot "
+                     "be set to True at same time.")
+  if average_across_batch and sum_over_batch:
+    raise ValueError("average_across_batch and sum_over_batch cannot be set "
+                     "to True at same time.")
   with ops.name_scope(name, "sequence_loss", [logits, targets, weights]):
     num_classes = array_ops.shape(logits)[2]
     logits_flat = array_ops.reshape(logits, [-1, num_classes])
@@ -96,20 +117,56 @@ def sequence_loss(logits,
     if average_across_timesteps and average_across_batch:
       crossent = math_ops.reduce_sum(crossent)
       total_size = math_ops.reduce_sum(weights)
-      total_size += 1e-12  # to avoid division by 0 for all-0 weights
-      crossent /= total_size
+      crossent = math_ops.div_no_nan(crossent, total_size)
+    elif sum_over_timesteps and sum_over_batch:
+      crossent = math_ops.reduce_sum(crossent)
+      total_count = math_ops.cast(math_ops.count_nonzero(weights),
+                                  crossent.dtype)
+      crossent = math_ops.div_no_nan(crossent, total_count)
     else:
-      batch_size = array_ops.shape(logits)[0]
-      sequence_length = array_ops.shape(logits)[1]
-      crossent = array_ops.reshape(crossent, [batch_size, sequence_length])
-    if average_across_timesteps and not average_across_batch:
-      crossent = math_ops.reduce_sum(crossent, axis=[1])
-      total_size = math_ops.reduce_sum(weights, axis=[1])
-      total_size += 1e-12  # to avoid division by 0 for all-0 weights
-      crossent /= total_size
-    if not average_across_timesteps and average_across_batch:
-      crossent = math_ops.reduce_sum(crossent, axis=[0])
-      total_size = math_ops.reduce_sum(weights, axis=[0])
-      total_size += 1e-12  # to avoid division by 0 for all-0 weights
-      crossent /= total_size
+      crossent = array_ops.reshape(crossent, array_ops.shape(logits)[0:2])
+      if average_across_timesteps or average_across_batch:
+        reduce_axis = [0] if average_across_batch else [1]
+        crossent = math_ops.reduce_sum(crossent, axis=reduce_axis)
+        total_size = math_ops.reduce_sum(weights, axis=reduce_axis)
+        crossent = math_ops.div_no_nan(crossent, total_size)
+      elif sum_over_timesteps or sum_over_batch:
+        reduce_axis = [0] if sum_over_batch else [1]
+        crossent = math_ops.reduce_sum(crossent, axis=reduce_axis)
+        total_count = math_ops.cast(
+            math_ops.count_nonzero(weights, axis=reduce_axis),
+            dtype=crossent.dtype)
+        crossent = math_ops.div_no_nan(crossent, total_count)
     return crossent
+
+
+class SequenceLoss(Loss):
+  """Weighted cross-entropy loss for a sequence of logits."""
+
+  def __init__(self,
+               average_across_timesteps=False,
+               average_across_batch=False,
+               sum_over_timesteps=True,
+               sum_over_batch=True,
+               softmax_loss_function=None,
+               name=None):
+    super(SequenceLoss, self).__init__(name=name)
+    self.average_across_timesteps = average_across_timesteps
+    self.average_across_batch = average_across_batch
+    self.sum_over_timesteps = sum_over_timesteps
+    self.sum_over_batch = sum_over_batch
+    self.softmax_loss_function = softmax_loss_function
+
+  def __call__(self, y_true, y_pred, sample_weight=None):
+    """Override the parent __call__ to have a customized reduce behavior."""
+    return sequence_loss(y_pred, y_true, sample_weight,
+                         average_across_timesteps=self.average_across_timesteps,
+                         average_across_batch=self.average_across_batch,
+                         sum_over_timesteps=self.sum_over_timesteps,
+                         sum_over_batch=self.sum_over_batch,
+                         softmax_loss_function=self.softmax_loss_function,
+                         name=self.name)
+
+  def call(self, y_true, y_pred):
+    # Skip this method since the __call__ contains real implementation.
+    pass
diff --git a/tensorflow/contrib/seq2seq/python/ops/sampler.py b/tensorflow/contrib/seq2seq/python/ops/sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e3e48b3bc61c0ff94ae0a1794767c7ff6914969
--- /dev/null
+++ b/tensorflow/contrib/seq2seq/python/ops/sampler.py
@@ -0,0 +1,765 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A library of sampler for use with SamplingDecoders."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+
+import six
+
+from tensorflow.contrib.seq2seq.python.ops import decoder
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import tensor_array_ops
+from tensorflow.python.util import nest
+
+__all__ = [
+    "Sampler",
+    "TrainingSampler",
+    "GreedyEmbeddingSampler",
+    "SampleEmbeddingSampler",
+    "CustomSampler",
+    "ScheduledEmbeddingTrainingSampler",
+    "ScheduledOutputTrainingSampler",
+    "InferenceSampler",
+]
+
+_transpose_batch_time = decoder._transpose_batch_time  # pylint: disable=protected-access
+
+
+@six.add_metaclass(abc.ABCMeta)
+class Sampler(object):
+  """Interface for implementing sampling in seq2seq decoders.
+
+  Sampler instances are used by `BasicDecoder`. The normal usage of a sampler is
+  like below:
+  sampler = Sampler(init_args)
+  (initial_finished, initial_inputs) = sampler.initialize(input_tensors)
+  for time_step in range(time):
+    cell_output, cell_state = cell.call(cell_input, previous_state)
+    sample_ids = sampler.sample(time_step, cell_output, cell_state)
+    (finished, next_inputs, next_state) = sampler.next_inputs(
+        time_step,cell_output, cell_state)
+
+  Note that all the tensor input should not be feed to Sampler as __init__()
+  parameters, instead, they should be feed by decoders via initialize().
+  """
+
+  @abc.abstractmethod
+  def initialize(self, inputs, **kwargs):
+    """initialize the sampler with the input tensors.
+
+    This method suppose to be only invoke once before the calling other methods
+    of the Sampler.
+
+    Args:
+      inputs: A (structure of) input tensors, it could be a nested tuple or a
+        single tensor.
+      **kwargs: Other kwargs for initialization. It could contain tensors like
+        mask for inputs, or non tensor parameter.
+
+    Returns:
+      `(initial_finished, initial_inputs)`.
+    """
+    pass
+
+  @abc.abstractmethod
+  def sample(self, time, outputs, state):
+    """Returns `sample_ids`."""
+    pass
+
+  @abc.abstractmethod
+  def next_inputs(self, time, outputs, state, sample_ids):
+    """Returns `(finished, next_inputs, next_state)`."""
+    pass
+
+  @abc.abstractproperty
+  def batch_size(self):
+    """Batch size of tensor returned by `sample`.
+
+    Returns a scalar int32 tensor. The return value might not available before
+    the invocation of initialize(), in this case, ValueError is raised.
+    """
+    raise NotImplementedError("batch_size has not been implemented")
+
+  @abc.abstractproperty
+  def sample_ids_shape(self):
+    """Shape of tensor returned by `sample`, excluding the batch dimension.
+
+    Returns a `TensorShape`. The return value might not available before the
+    invocation of initialize().
+    """
+    raise NotImplementedError("sample_ids_shape has not been implemented")
+
+  @abc.abstractproperty
+  def sample_ids_dtype(self):
+    """DType of tensor returned by `sample`.
+
+    Returns a DType. The return value might not available before the
+    invocation of initialize().
+    """
+    raise NotImplementedError("sample_ids_dtype has not been implemented")
+
+
+class CustomSampler(Sampler):
+  """Base abstract class that allows the user to customize sampling."""
+
+  def __init__(self,
+               initialize_fn,
+               sample_fn,
+               next_inputs_fn,
+               sample_ids_shape=None,
+               sample_ids_dtype=None):
+    """Initializer.
+
+    Args:
+      initialize_fn: callable that returns `(finished, next_inputs)` for the
+        first iteration.
+      sample_fn: callable that takes `(time, outputs, state)` and emits tensor
+        `sample_ids`.
+      next_inputs_fn: callable that takes `(time, outputs, state, sample_ids)`
+        and emits `(finished, next_inputs, next_state)`.
+      sample_ids_shape: Either a list of integers, or a 1-D Tensor of type
+        `int32`, the shape of each value in the `sample_ids` batch. Defaults to
+        a scalar.
+      sample_ids_dtype: The dtype of the `sample_ids` tensor. Defaults to int32.
+    """
+    self._initialize_fn = initialize_fn
+    self._sample_fn = sample_fn
+    self._next_inputs_fn = next_inputs_fn
+    self._batch_size = None
+    self._sample_ids_shape = tensor_shape.TensorShape(sample_ids_shape or [])
+    self._sample_ids_dtype = sample_ids_dtype or dtypes.int32
+
+  @property
+  def batch_size(self):
+    if self._batch_size is None:
+      raise ValueError("batch_size accessed before initialize was called")
+    return self._batch_size
+
+  @property
+  def sample_ids_shape(self):
+    return self._sample_ids_shape
+
+  @property
+  def sample_ids_dtype(self):
+    return self._sample_ids_dtype
+
+  def initialize(self, inputs, **kwargs):
+    (finished, next_inputs) = self._initialize_fn(inputs, **kwargs)
+    if self._batch_size is None:
+      self._batch_size = array_ops.size(finished)
+    return (finished, next_inputs)
+
+  def sample(self, time, outputs, state):
+    return self._sample_fn(time=time, outputs=outputs, state=state)
+
+  def next_inputs(self, time, outputs, state, sample_ids):
+    return self._next_inputs_fn(
+        time=time, outputs=outputs, state=state, sample_ids=sample_ids)
+
+
+class TrainingSampler(Sampler):
+  """A Sampler for use during training.
+
+  Only reads inputs.
+
+  Returned sample_ids are the argmax of the RNN output logits.
+  """
+
+  def __init__(self, time_major=False):
+    """Initializer.
+
+    Args:
+      time_major: Python bool.  Whether the tensors in `inputs` are time major.
+        If `False` (default), they are assumed to be batch major.
+
+    Raises:
+      ValueError: if `sequence_length` is not a 1D tensor.
+    """
+    self.time_major = time_major
+    self._batch_size = None
+
+  @property
+  def batch_size(self):
+    if self._batch_size is None:
+      raise ValueError("batch_size accessed before initialize was called")
+    return self._batch_size
+
+  @property
+  def sample_ids_shape(self):
+    return tensor_shape.TensorShape([])
+
+  @property
+  def sample_ids_dtype(self):
+    return dtypes.int32
+
+  def initialize(self, inputs, sequence_length=None):
+    """Initialize the TrainSampler.
+
+    Args:
+      inputs: A (structure of) input tensors.
+      sequence_length: An int32 vector tensor.
+
+    Returns:
+      (finished, next_inputs), a tuple of two items. The first item is a boolean
+        vector to indicate whether the item in the batch has finished. The
+        second item is the first slide of input data based on the timestep
+        dimension (usually the second dim of the input).
+    """
+    self.inputs = ops.convert_to_tensor(inputs, name="inputs")
+    if not self.time_major:
+      inputs = nest.map_structure(_transpose_batch_time, inputs)
+
+    self.input_tas = nest.map_structure(_unstack_ta, inputs)
+    if sequence_length is None:
+      raise ValueError("sequence_length is required for TrainingSampler")
+    self.sequence_length = ops.convert_to_tensor(
+        sequence_length, name="sequence_length")
+    if self.sequence_length.get_shape().ndims != 1:
+      raise ValueError(
+          "Expected sequence_length to be a vector, but received shape: %s" %
+          self._sequence_length.get_shape())
+
+    self.zero_inputs = nest.map_structure(
+        lambda inp: array_ops.zeros_like(inp[0, :]), inputs)
+
+    self._batch_size = array_ops.size(self.sequence_length)
+
+    finished = math_ops.equal(0, self.sequence_length)
+    all_finished = math_ops.reduce_all(finished)
+    next_inputs = control_flow_ops.cond(
+        all_finished,
+        lambda: self.zero_inputs,
+        lambda: nest.map_structure(lambda inp: inp.read(0), self.input_tas))
+    return (finished, next_inputs)
+
+  def sample(self, time, outputs, state):
+    del state
+    sample_ids = math_ops.cast(math_ops.argmax(outputs, axis=-1), dtypes.int32)
+    return sample_ids
+
+  def next_inputs(self, time, outputs, state, sample_ids):
+    del sample_ids
+    next_time = time + 1
+    finished = (next_time >= self.sequence_length)
+    all_finished = math_ops.reduce_all(finished)
+
+    def read_from_ta(inp):
+      return inp.read(next_time)
+
+    next_inputs = control_flow_ops.cond(
+        all_finished,
+        lambda: self.zero_inputs,
+        lambda: nest.map_structure(read_from_ta, self.input_tas))
+    return (finished, next_inputs, state)
+
+
+class ScheduledEmbeddingTrainingSampler(TrainingSampler):
+  """A training sampler that adds scheduled sampling.
+
+  Returns -1s for sample_ids where no sampling took place; valid sample id
+  values elsewhere.
+  """
+
+  def __init__(self,
+               sampling_probability,
+               embedding_fn=None,
+               time_major=False,
+               seed=None,
+               scheduling_seed=None):
+    """Initializer.
+
+    Args:
+      sampling_probability: A `float32` 0-D or 1-D tensor: the probability of
+        sampling categorically from the output ids instead of reading directly
+        from the inputs.
+      embedding_fn: A callable that takes a vector tensor of `ids` (argmax ids),
+         or the `params` argument for `embedding_lookup`.
+      time_major: Python bool. Whether the tensors in `inputs` are time major.
+        If `False` (default), they are assumed to be batch major.
+      seed: The sampling seed.
+      scheduling_seed: The schedule decision rule sampling seed.
+
+    Raises:
+      ValueError: if `sampling_probability` is not a scalar or vector.
+    """
+    if callable(embedding_fn) or embedding_fn is None:
+      self.embedding_fn = embedding_fn
+    else:
+      raise ValueError("embedding_fn is expected to be callable, got %s"
+                       % type(embedding_fn))
+    self.sampling_probability = ops.convert_to_tensor(
+        sampling_probability, name="sampling_probability")
+    if self.sampling_probability.get_shape().ndims not in (0, 1):
+      raise ValueError(
+          "sampling_probability must be either a scalar or a vector. "
+          "saw shape: %s" % (self.sampling_probability.get_shape()))
+    self.seed = seed
+    self.scheduling_seed = scheduling_seed
+    super(ScheduledEmbeddingTrainingSampler,
+          self).__init__(time_major=time_major)
+
+  def initialize(self, inputs, sequence_length=None, embedding=None):
+    if self.embedding_fn is None:
+      if embedding is None:
+        raise ValueError("embedding is required as a keyword argument for "
+                         "ScheduledEmbeddingTrainingSampler")
+      self.embedding_fn = (
+          lambda ids: embedding_ops.embedding_lookup(embedding, ids))
+    return super(ScheduledEmbeddingTrainingSampler, self).initialize(
+        inputs, sequence_length=sequence_length)
+
+  def sample(self, time, outputs, state):
+    del state
+    # Return -1s where we did not sample, and sample_ids elsewhere
+    select_sample = bernoulli_sample(
+        probs=self.sampling_probability,
+        dtype=dtypes.bool,
+        sample_shape=self.batch_size,
+        seed=self.scheduling_seed)
+    return array_ops.where(select_sample,
+                           categorical_sample(logits=outputs, seed=self.seed),
+                           gen_array_ops.fill([self.batch_size], -1))
+
+  def next_inputs(self, time, outputs, state, sample_ids):
+    (finished, base_next_inputs, state) = (
+        super(ScheduledEmbeddingTrainingSampler, self).next_inputs(
+            time=time, outputs=outputs, state=state, sample_ids=sample_ids))
+
+    def maybe_sample():
+      """Perform scheduled sampling."""
+      where_sampling = math_ops.cast(
+          array_ops.where(sample_ids > -1), dtypes.int32)
+      where_not_sampling = math_ops.cast(
+          array_ops.where(sample_ids <= -1), dtypes.int32)
+      sample_ids_sampling = array_ops.gather_nd(sample_ids, where_sampling)
+      inputs_not_sampling = array_ops.gather_nd(base_next_inputs,
+                                                where_not_sampling)
+      sampled_next_inputs = self.embedding_fn(sample_ids_sampling)
+      base_shape = array_ops.shape(base_next_inputs)
+      return (array_ops.scatter_nd(
+          indices=where_sampling, updates=sampled_next_inputs, shape=base_shape)
+              + array_ops.scatter_nd(
+                  indices=where_not_sampling,
+                  updates=inputs_not_sampling,
+                  shape=base_shape))
+
+    all_finished = math_ops.reduce_all(finished)
+    next_inputs = control_flow_ops.cond(all_finished, lambda: base_next_inputs,
+                                        maybe_sample)
+    return (finished, next_inputs, state)
+
+
+class ScheduledOutputTrainingSampler(TrainingSampler):
+  """A training sampler that adds scheduled sampling directly to outputs.
+
+  Returns False for sample_ids where no sampling took place; True elsewhere.
+  """
+
+  def __init__(self,
+               sampling_probability,
+               time_major=False,
+               seed=None,
+               next_inputs_fn=None):
+    """Initializer.
+
+    Args:
+      sampling_probability: A `float32` scalar tensor: the probability of
+        sampling from the outputs instead of reading directly from the inputs.
+      time_major: Python bool. Whether the tensors in `inputs` are time major.
+        If `False` (default), they are assumed to be batch major.
+      seed: The sampling seed.
+      next_inputs_fn: (Optional) callable to apply to the RNN outputs to create
+        the next input when sampling. If `None` (default), the RNN outputs will
+        be used as the next inputs.
+
+    Raises:
+      ValueError: if `sampling_probability` is not a scalar or vector.
+    """
+    self.sampling_probability = ops.convert_to_tensor(
+        sampling_probability, name="sampling_probability")
+    if self.sampling_probability.get_shape().ndims not in (0, 1):
+      raise ValueError(
+          "sampling_probability must be either a scalar or a vector. "
+          "saw shape: %s" % (self._sampling_probability.get_shape()))
+
+    self.seed = seed
+    self.next_inputs_fn = next_inputs_fn
+
+    super(ScheduledOutputTrainingSampler, self).__init__(time_major=time_major)
+
+  def initialize(self, inputs, sequence_length=None, auxiliary_inputs=None):
+    if auxiliary_inputs is None:
+      maybe_concatenated_inputs = inputs
+    else:
+      inputs = ops.convert_to_tensor(inputs)
+      auxiliary_inputs = ops.convert_to_tensor(auxiliary_inputs)
+      maybe_concatenated_inputs = nest.map_structure(
+          lambda x, y: array_ops.concat((x, y), -1), inputs, auxiliary_inputs)
+      if not self.time_major:
+        auxiliary_inputs = nest.map_structure(_transpose_batch_time,
+                                              auxiliary_inputs)
+    if auxiliary_inputs is not None:
+      self._auxiliary_input_tas = nest.map_structure(_unstack_ta,
+                                                     auxiliary_inputs)
+    else:
+      self._auxiliary_input_tas = None
+
+    return super(ScheduledOutputTrainingSampler, self).initialize(
+        maybe_concatenated_inputs, sequence_length=sequence_length)
+
+  def sample(self, time, outputs, state):
+    del state
+    return bernoulli_sample(
+        probs=self.sampling_probability,
+        sample_shape=self.batch_size,
+        seed=self.seed)
+
+  def next_inputs(self, time, outputs, state, sample_ids):
+    (finished, base_next_inputs, state) = (
+        super(ScheduledOutputTrainingSampler, self).next_inputs(
+            time=time, outputs=outputs, state=state, sample_ids=sample_ids))
+    sample_ids = math_ops.cast(sample_ids, dtypes.bool)
+
+    def maybe_sample():
+      """Perform scheduled sampling."""
+
+      def maybe_concatenate_auxiliary_inputs(outputs_, indices=None):
+        """Concatenate outputs with auxiliary inputs, if they exist."""
+        if self._auxiliary_input_tas is None:
+          return outputs_
+
+        next_time = time + 1
+        auxiliary_inputs = nest.map_structure(lambda ta: ta.read(next_time),
+                                              self._auxiliary_input_tas)
+        if indices is not None:
+          auxiliary_inputs = array_ops.gather_nd(auxiliary_inputs, indices)
+        return nest.map_structure(lambda x, y: array_ops.concat((x, y), -1),
+                                  outputs_, auxiliary_inputs)
+
+      if self.next_inputs_fn is None:
+        return array_ops.where(sample_ids,
+                               maybe_concatenate_auxiliary_inputs(outputs),
+                               base_next_inputs)
+
+      where_sampling = math_ops.cast(array_ops.where(sample_ids), dtypes.int32)
+      where_not_sampling = math_ops.cast(
+          array_ops.where(math_ops.logical_not(sample_ids)), dtypes.int32)
+      outputs_sampling = array_ops.gather_nd(outputs, where_sampling)
+      inputs_not_sampling = array_ops.gather_nd(base_next_inputs,
+                                                where_not_sampling)
+      sampled_next_inputs = maybe_concatenate_auxiliary_inputs(
+          self.next_inputs_fn(outputs_sampling), where_sampling)
+
+      base_shape = array_ops.shape(base_next_inputs)
+      return (array_ops.scatter_nd(
+          indices=where_sampling, updates=sampled_next_inputs, shape=base_shape)
+              + array_ops.scatter_nd(
+                  indices=where_not_sampling,
+                  updates=inputs_not_sampling,
+                  shape=base_shape))
+
+    all_finished = math_ops.reduce_all(finished)
+    no_samples = math_ops.logical_not(math_ops.reduce_any(sample_ids))
+    next_inputs = control_flow_ops.cond(
+        math_ops.logical_or(all_finished, no_samples), lambda: base_next_inputs,
+        maybe_sample)
+    return (finished, next_inputs, state)
+
+
+class GreedyEmbeddingSampler(Sampler):
+  """A sampler for use during inference.
+
+  Uses the argmax of the output (treated as logits) and passes the
+  result through an embedding layer to get the next input.
+  """
+
+  def __init__(self, embedding_fn=None):
+    """Initializer.
+
+    Args:
+      embedding_fn: A optional callable that takes a vector tensor of `ids`
+        (argmax ids), or the `params` argument for `embedding_lookup`. The
+        returned tensor will be passed to the decoder input. Default to use
+        `embedding_ops.embedding_lookup`.
+    """
+    if embedding_fn is None or callable(embedding_fn):
+      self.embedding_fn = embedding_fn
+    else:
+      raise ValueError("embedding_fn is expected to be a callable, got %s" %
+                       type(embedding_fn))
+    self._batch_size = None
+
+  @property
+  def batch_size(self):
+    if self._batch_size is None:
+      raise ValueError("batch_size accessed before initialize was called")
+    return self._batch_size
+
+  @property
+  def sample_ids_shape(self):
+    return tensor_shape.TensorShape([])
+
+  @property
+  def sample_ids_dtype(self):
+    return dtypes.int32
+
+  def initialize(self, embedding, start_tokens=None, end_token=None):
+    """Initialize the GreedyEmbeddingSampler.
+
+    Args:
+      embedding: tensor that contains embedding states matrix. It will be used
+        to generate generate outputs with start_tokens and end_tokens. The
+        embedding will be ignored if the embedding_fn has been provided at
+        __init__().
+      start_tokens: `int32` vector shaped `[batch_size]`, the start tokens.
+      end_token: `int32` scalar, the token that marks end of decoding.
+
+    Returns:
+      Tuple of two items: `(finished, self.start_inputs)`.
+    Raises:
+      ValueError: if `start_tokens` is not a 1D tensor or `end_token` is not a
+        scalar.
+    """
+    if self.embedding_fn is None:
+      self.embedding_fn = (
+          lambda ids: embedding_ops.embedding_lookup(embedding, ids))
+
+    self.start_tokens = ops.convert_to_tensor(
+        start_tokens, dtype=dtypes.int32, name="start_tokens")
+    self.end_token = ops.convert_to_tensor(
+        end_token, dtype=dtypes.int32, name="end_token")
+    if self.start_tokens.get_shape().ndims != 1:
+      raise ValueError("start_tokens must be a vector")
+    self._batch_size = array_ops.size(start_tokens)
+    if self.end_token.get_shape().ndims != 0:
+      raise ValueError("end_token must be a scalar")
+    self.start_inputs = self.embedding_fn(self.start_tokens)
+
+    finished = array_ops.tile([False], [self._batch_size])
+    return (finished, self.start_inputs)
+
+  def sample(self, time, outputs, state):
+    """sample for GreedyEmbeddingHelper."""
+    del time, state  # unused by sample_fn
+    # Outputs are logits, use argmax to get the most probable id
+    if not isinstance(outputs, ops.Tensor):
+      raise TypeError(
+          "Expected outputs to be a single Tensor, got: %s" % type(outputs))
+    sample_ids = math_ops.argmax(outputs, axis=-1, output_type=dtypes.int32)
+    return sample_ids
+
+  def next_inputs(self, time, outputs, state, sample_ids):
+    """next_inputs_fn for GreedyEmbeddingHelper."""
+    del time, outputs  # unused by next_inputs_fn
+    finished = math_ops.equal(sample_ids, self.end_token)
+    all_finished = math_ops.reduce_all(finished)
+    next_inputs = control_flow_ops.cond(
+        all_finished,
+        # If we're finished, the next_inputs value doesn't matter
+        lambda: self.start_inputs,
+        lambda: self.embedding_fn(sample_ids))
+    return (finished, next_inputs, state)
+
+
+class SampleEmbeddingSampler(GreedyEmbeddingSampler):
+  """A sampler for use during inference.
+
+  Uses sampling (from a distribution) instead of argmax and passes the
+  result through an embedding layer to get the next input.
+  """
+
+  def __init__(self, embedding_fn=None, softmax_temperature=None, seed=None):
+    """Initializer.
+
+    Args:
+      embedding_fn: (Optional) A callable that takes a vector tensor of `ids`
+        (argmax ids), or the `params` argument for `embedding_lookup`. The
+        returned tensor will be passed to the decoder input.
+      softmax_temperature: (Optional) `float32` scalar, value to divide the
+        logits by before computing the softmax. Larger values (above 1.0) result
+        in more random samples, while smaller values push the sampling
+        distribution towards the argmax. Must be strictly greater than 0.
+        Defaults to 1.0.
+      seed: (Optional) The sampling seed.
+
+    Raises:
+      ValueError: if `start_tokens` is not a 1D tensor or `end_token` is not a
+        scalar.
+    """
+    super(SampleEmbeddingSampler, self).__init__(embedding_fn)
+    self.softmax_temperature = softmax_temperature
+    self.seed = seed
+
+  def sample(self, time, outputs, state):
+    """sample for SampleEmbeddingHelper."""
+    del time, state  # unused by sample_fn
+    # Outputs are logits, we sample instead of argmax (greedy).
+    if not isinstance(outputs, ops.Tensor):
+      raise TypeError(
+          "Expected outputs to be a single Tensor, got: %s" % type(outputs))
+    if self.softmax_temperature is None:
+      logits = outputs
+    else:
+      logits = outputs / self.softmax_temperature
+
+    return categorical_sample(logits=logits, seed=self.seed)
+
+
+class InferenceSampler(Sampler):
+  """A helper to use during inference with a custom sampling function."""
+
+  def __init__(self,
+               sample_fn,
+               sample_shape,
+               sample_dtype,
+               end_fn,
+               next_inputs_fn=None):
+    """Initializer.
+
+    Args:
+      sample_fn: A callable that takes `outputs` and emits tensor `sample_ids`.
+      sample_shape: Either a list of integers, or a 1-D Tensor of type `int32`,
+        the shape of the each sample in the batch returned by `sample_fn`.
+      sample_dtype: the dtype of the sample returned by `sample_fn`.
+      end_fn: A callable that takes `sample_ids` and emits a `bool` vector
+        shaped `[batch_size]` indicating whether each sample is an end token.
+      next_inputs_fn: (Optional) A callable that takes `sample_ids` and returns
+        the next batch of inputs. If not provided, `sample_ids` is used as the
+        next batch of inputs.
+    """
+    self.sample_fn = sample_fn
+    self.sample_shape = tensor_shape.TensorShape(sample_shape)
+    self.sample_dtype = sample_dtype
+    self.end_fn = end_fn
+    self.next_inputs_fn = next_inputs_fn
+    self._batch_size = None
+
+  @property
+  def batch_size(self):
+    if self._batch_size is None:
+      raise ValueError("batch_size accessed before initialize was called")
+    return self._batch_size
+
+  @property
+  def sample_ids_shape(self):
+    return self.sample_shape
+
+  @property
+  def sample_ids_dtype(self):
+    return self.sample_dtype
+
+  def initialize(self, start_inputs):
+    self.start_inputs = ops.convert_to_tensor(start_inputs, name="start_inputs")
+    self._batch_size = array_ops.shape(start_inputs)[0]
+    finished = array_ops.tile([False], [self._batch_size])
+    return (finished, self.start_inputs)
+
+  def sample(self, time, outputs, state):
+    del time, state  # unused by sample
+    return self.sample_fn(outputs)
+
+  def next_inputs(self, time, outputs, state, sample_ids):
+    del time, outputs  # unused by next_inputs
+    if self.next_inputs_fn is None:
+      next_inputs = sample_ids
+    else:
+      next_inputs = self.next_inputs_fn(sample_ids)
+    finished = self.end_fn(sample_ids)
+    return (finished, next_inputs, state)
+
+
+# The following sample functions (_call_sampler, bernoulli_sample,
+# categorical_sample) mimic TensorFlow Probability distribution semantics.
+def _call_sampler(sample_n_fn, sample_shape, name=None):
+  """Reshapes vector of samples."""
+  with ops.name_scope(name, "call_sampler", values=[sample_shape]):
+    sample_shape = ops.convert_to_tensor(
+        sample_shape, dtype=dtypes.int32, name="sample_shape")
+    # Ensure sample_shape is a vector (vs just a scalar).
+    pad = math_ops.cast(
+        math_ops.equal(array_ops.rank(sample_shape), 0), dtypes.int32)
+    sample_shape = array_ops.reshape(
+        sample_shape,
+        array_ops.pad(
+            array_ops.shape(sample_shape),
+            paddings=[[pad, 0]],
+            constant_values=1))
+    samples = sample_n_fn(math_ops.reduce_prod(sample_shape))
+    batch_event_shape = array_ops.shape(samples)[1:]
+    final_shape = array_ops.concat([sample_shape, batch_event_shape], 0)
+    return array_ops.reshape(samples, final_shape)
+
+
+def bernoulli_sample(probs=None,
+                     logits=None,
+                     dtype=dtypes.int32,
+                     sample_shape=(),
+                     seed=None):
+  """Samples from Bernoulli distribution."""
+  if probs is None:
+    probs = math_ops.sigmoid(logits, name="probs")
+  else:
+    probs = ops.convert_to_tensor(probs, name="probs")
+  batch_shape_tensor = array_ops.shape(probs)
+
+  def _sample_n(n):
+    """Sample vector of Bernoullis."""
+    new_shape = array_ops.concat([[n], batch_shape_tensor], 0)
+    uniform = random_ops.random_uniform(new_shape, seed=seed, dtype=probs.dtype)
+    return math_ops.cast(math_ops.less(uniform, probs), dtype)
+
+  return _call_sampler(_sample_n, sample_shape)
+
+
+def categorical_sample(logits, dtype=dtypes.int32, sample_shape=(), seed=None):
+  """Samples from categorical distribution."""
+  logits = ops.convert_to_tensor(logits, name="logits")
+  event_size = array_ops.shape(logits)[-1]
+  batch_shape_tensor = array_ops.shape(logits)[:-1]
+
+  def _sample_n(n):
+    """Sample vector of categoricals."""
+    if logits.shape.ndims == 2:
+      logits_2d = logits
+    else:
+      logits_2d = array_ops.reshape(logits, [-1, event_size])
+    sample_dtype = dtypes.int64 if logits.dtype.size > 4 else dtypes.int32
+    draws = random_ops.multinomial(
+        logits_2d, n, seed=seed, output_dtype=sample_dtype)
+    draws = array_ops.reshape(
+        array_ops.transpose(draws),
+        array_ops.concat([[n], batch_shape_tensor], 0))
+    return math_ops.cast(draws, dtype)
+
+  return _call_sampler(_sample_n, sample_shape)
+
+
+def _unstack_ta(inp):
+  return tensor_array_ops.TensorArray(
+      dtype=inp.dtype,
+      size=array_ops.shape(inp)[0],
+      element_shape=inp.get_shape()[1:]).unstack(inp)
diff --git a/tensorflow/contrib/session_bundle/exporter.py b/tensorflow/contrib/session_bundle/exporter.py
index 08983337fccc138d40eb959cecc5bf9e47cf6cac..f3efd292cf5acba4319c8a5545a7f70fae4b5ce1 100644
--- a/tensorflow/contrib/session_bundle/exporter.py
+++ b/tensorflow/contrib/session_bundle/exporter.py
@@ -304,10 +304,10 @@ class Exporter(object):
       def parser(path):
         if os.name == "nt":
           match = re.match(
-              "^" + export_dir_base.replace("\\", "/") + "/(\\d{8})$",
+              r"^" + export_dir_base.replace("\\", "/") + r"/(\d{8})$",
               path.path.replace("\\", "/"))
         else:
-          match = re.match("^" + export_dir_base + "/(\\d{8})$", path.path)
+          match = re.match(r"^" + export_dir_base + r"/(\d{8})$", path.path)
         if not match:
           return None
         return path._replace(export_version=int(match.group(1)))
diff --git a/tensorflow/contrib/session_bundle/gc_test.py b/tensorflow/contrib/session_bundle/gc_test.py
index 8faf3ef3d4cd7ee0096265283070e25d06782254..02725bb1cbb4ef9ace29dcc58f6d23fb241d96b2 100644
--- a/tensorflow/contrib/session_bundle/gc_test.py
+++ b/tensorflow/contrib/session_bundle/gc_test.py
@@ -104,7 +104,7 @@ class GcTest(test_util.TensorFlowTestCase):
 
     # create a simple parser that pulls the export_version from the directory.
     def parser(path):
-      match = re.match("^" + base_dir + "/(\\d+)$", path.path)
+      match = re.match(r"^" + base_dir + r"/(\d+)$", path.path)
       if not match:
         return None
       return path._replace(export_version=int(match.group(1)))
diff --git a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py
index 1b2b6acacca838f95cb758ae88f79263993ca69e..c63a3ca19b6a70cf7776c7fce4e0291ee94b775c 100644
--- a/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py
+++ b/tensorflow/contrib/slim/python/slim/data/tfexample_decoder.py
@@ -32,7 +32,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import map_fn
 from tensorflow.python.ops import image_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import parsing_ops
@@ -396,8 +396,8 @@ class Image(ItemHandler):
     image_format = keys_to_tensors[self._format_key]
 
     if self._repeated:
-      return functional_ops.map_fn(lambda x: self._decode(x, image_format),
-                                   image_buffer, dtype=self._dtype)
+      return map_fn.map_fn(lambda x: self._decode(x, image_format),
+                           image_buffer, dtype=self._dtype)
     else:
       return self._decode(image_buffer, image_format)
 
diff --git a/tensorflow/contrib/slim/python/slim/nets/BUILD b/tensorflow/contrib/slim/python/slim/nets/BUILD
index 8bbdf96384683c68648367c6433eeb89c64c22bf..e9595d1b324dbd3d570d2407a6620c5295b15548 100644
--- a/tensorflow/contrib/slim/python/slim/nets/BUILD
+++ b/tensorflow/contrib/slim/python/slim/nets/BUILD
@@ -115,9 +115,9 @@ py_library(
 
 py_test(
     name = "inception_v1_test",
-    size = "large",
+    size = "medium",
     srcs = ["inception_v1_test.py"],
-    shard_count = 3,
+    shard_count = 8,
     srcs_version = "PY2AND3",
     deps = [
         ":inception_v1",
@@ -135,9 +135,9 @@ py_test(
 
 py_test(
     name = "inception_v2_test",
-    size = "large",
+    size = "medium",
     srcs = ["inception_v2_test.py"],
-    shard_count = 3,
+    shard_count = 8,
     srcs_version = "PY2AND3",
     deps = [
         ":inception_v2",
@@ -155,9 +155,9 @@ py_test(
 
 py_test(
     name = "inception_v3_test",
-    size = "large",
+    size = "medium",
     srcs = ["inception_v3_test.py"],
-    shard_count = 3,
+    shard_count = 8,
     srcs_version = "PY2AND3",
     deps = [
         ":inception_v3",
@@ -233,8 +233,9 @@ py_library(
 
 py_test(
     name = "resnet_v1_test",
-    size = "large",
+    size = "medium",
     srcs = ["resnet_v1_test.py"],
+    shard_count = 4,
     srcs_version = "PY2AND3",
     deps = [
         ":resnet_utils",
@@ -268,8 +269,9 @@ py_library(
 
 py_test(
     name = "resnet_v2_test",
-    size = "large",
+    size = "medium",
     srcs = ["resnet_v2_test.py"],
+    shard_count = 4,
     srcs_version = "PY2AND3",
     deps = [
         ":resnet_utils",
diff --git a/tensorflow/contrib/sparsemax/BUILD b/tensorflow/contrib/sparsemax/BUILD
index d7ba754f701d4b433e35ad8396eae7ee6132b97f..ed4eca1a60a6f0ccf629d8aa7906c02092e25ba0 100644
--- a/tensorflow/contrib/sparsemax/BUILD
+++ b/tensorflow/contrib/sparsemax/BUILD
@@ -49,6 +49,9 @@ cuda_py_tests(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
+    tags = [
+        "oss_serial",
+    ],
 )
 
 cuda_py_tests(
@@ -64,4 +67,7 @@ cuda_py_tests(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
+    tags = [
+        "oss_serial",
+    ],
 )
diff --git a/tensorflow/contrib/summary/BUILD b/tensorflow/contrib/summary/BUILD
index f88b03ec4c2b1f250091594ea12d7d1862029fa2..7dd52df6b68caea6111813837ba1e872acbeccdb 100644
--- a/tensorflow/contrib/summary/BUILD
+++ b/tensorflow/contrib/summary/BUILD
@@ -4,17 +4,14 @@ exports_files([
     "LICENSE",
 ])
 
-load(
-    "//tensorflow:tensorflow.bzl",
-    "py_test",
-    "tf_gen_op_wrapper_py",
-)
+load("//tensorflow:tensorflow.bzl", "py_test")
 
 py_test(
     name = "summary_ops_test",
     srcs = ["summary_ops_test.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":summary",
         ":summary_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:errors",
@@ -22,7 +19,6 @@ py_test(
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform",
         "//tensorflow/python:state_ops",
-        "//tensorflow/python:summary_ops_v2",
         "//tensorflow/python:training",
         "//tensorflow/python/eager:function",
         "//tensorflow/python/eager:test",
@@ -35,6 +31,7 @@ py_test(
     srcs = ["summary_ops_graph_test.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":summary",
         ":summary_test_util",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
@@ -43,7 +40,6 @@ py_test(
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
-        "//tensorflow/python:summary_ops_v2",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
         "@six_archive//:six",
diff --git a/tensorflow/contrib/summary/summary_ops_graph_test.py b/tensorflow/contrib/summary/summary_ops_graph_test.py
index 807741e05f92f6b666c175269742dc1af50c0054..8e13f7f56b23e47f046120b285b1519c6371ddab 100644
--- a/tensorflow/contrib/summary/summary_ops_graph_test.py
+++ b/tensorflow/contrib/summary/summary_ops_graph_test.py
@@ -22,6 +22,7 @@ import time
 
 import six
 
+from tensorflow.contrib.summary import summary as summary_ops
 from tensorflow.contrib.summary import summary_test_util
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import node_def_pb2
@@ -32,7 +33,6 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import summary_ops_v2 as summary_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
diff --git a/tensorflow/contrib/summary/summary_ops_test.py b/tensorflow/contrib/summary/summary_ops_test.py
index 10e4556dacbc17ec02c2bd698389b04d517d7076..27bfdeb3601f4fdb9897feee509b06d5e8f9b873 100644
--- a/tensorflow/contrib/summary/summary_ops_test.py
+++ b/tensorflow/contrib/summary/summary_ops_test.py
@@ -25,6 +25,7 @@ import sqlite3
 import numpy as np
 import six
 
+from tensorflow.contrib.summary import summary as summary_ops
 from tensorflow.contrib.summary import summary_test_util
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import node_def_pb2
@@ -36,7 +37,6 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import summary_ops_v2 as summary_ops
 from tensorflow.python.platform import gfile
 from tensorflow.python.training import training_util
 
diff --git a/tensorflow/contrib/tensor_forest/BUILD b/tensorflow/contrib/tensor_forest/BUILD
index 398ac314f4b520610ec100273b37c33bc4b5b43a..583bbf97c57cf263f65bc3b0a56b32cc2dce5482 100644
--- a/tensorflow/contrib/tensor_forest/BUILD
+++ b/tensorflow/contrib/tensor_forest/BUILD
@@ -537,8 +537,9 @@ py_library(
 
 py_test(
     name = "random_forest_test",
-    size = "large",
+    size = "medium",
     srcs = ["client/random_forest_test.py"],
+    shard_count = 6,
     srcs_version = "PY2AND3",
     tags = [
         "noasan",
diff --git a/tensorflow/contrib/tensor_forest/client/eval_metrics.py b/tensorflow/contrib/tensor_forest/client/eval_metrics.py
index d8236a0a6fa6d0d0e383e454eb0146bb10b6f49d..0d87cea9fbaa8fe28b55ec996414a568d39efee3 100644
--- a/tensorflow/contrib/tensor_forest/client/eval_metrics.py
+++ b/tensorflow/contrib/tensor_forest/client/eval_metrics.py
@@ -50,9 +50,10 @@ def _accuracy(predictions, targets, weights=None):
 def _r2(probabilities, targets, weights=None):
   targets = math_ops.to_float(targets)
   y_mean = math_ops.reduce_mean(targets, 0)
-  squares_total = math_ops.reduce_sum(math_ops.square(targets - y_mean), 0)
+  squares_total = math_ops.reduce_sum(
+      math_ops.squared_difference(targets, y_mean), 0)
   squares_residuals = math_ops.reduce_sum(
-      math_ops.square(targets - probabilities), 0)
+      math_ops.squared_difference(targets, probabilities), 0)
   score = 1 - math_ops.reduce_sum(squares_residuals / squares_total)
   return metrics.mean(score, weights=weights)
 
diff --git a/tensorflow/contrib/tensor_forest/kernels/model_ops.cc b/tensorflow/contrib/tensor_forest/kernels/model_ops.cc
index b9aad36f3d25b9fb7b8b525be54fb7a39394b373..76b1d2b4da269cda71f5b49878f2933d7d9b5776 100644
--- a/tensorflow/contrib/tensor_forest/kernels/model_ops.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/model_ops.cc
@@ -304,7 +304,7 @@ class TraverseTreeV4Op : public OpKernel {
     auto worker_threads = context->device()->tensorflow_cpu_worker_threads();
     int num_threads = worker_threads->num_threads;
     const int64 costPerTraverse = 500;
-    auto traverse = [this, &set_leaf_ids, &data_set, decision_tree_resource,
+    auto traverse = [&set_leaf_ids, &data_set, decision_tree_resource,
                      num_data](int64 start, int64 end) {
       CHECK(start <= end);
       CHECK(end <= num_data);
diff --git a/tensorflow/contrib/tensor_forest/kernels/stats_ops.cc b/tensorflow/contrib/tensor_forest/kernels/stats_ops.cc
index fe2c91c1047fe56710b1a86b2fa3206caf6ff3bc..0243f106814511c1b53a5aacb830b845214a00a3 100644
--- a/tensorflow/contrib/tensor_forest/kernels/stats_ops.cc
+++ b/tensorflow/contrib/tensor_forest/kernels/stats_ops.cc
@@ -307,7 +307,7 @@ class ProcessInputOp : public OpKernel {
     // from a digits run on local desktop.  Heuristics might be necessary
     // if it really matters that much.
     const int64 costPerUpdate = 1000;
-    auto update = [this, &target, &leaf_ids_tensor, &num_targets, &data_set,
+    auto update = [&target, &leaf_ids_tensor, &num_targets, &data_set,
                    fertile_stats_resource, &locks, &set_lock, &ready_to_split,
                    num_data](int64 start, int64 end) {
       CHECK(start <= end);
@@ -317,7 +317,7 @@ class ProcessInputOp : public OpKernel {
                   static_cast<int32>(end), &ready_to_split);
     };
 
-    auto update_collated = [this, &target, &num_targets, fertile_stats_resource,
+    auto update_collated = [&target, &num_targets, fertile_stats_resource,
                             tree_resource, &leaf_examples, &set_lock,
                             &ready_to_split, &data_set,
                             num_leaves](int64 start, int64 end) {
diff --git a/tensorflow/contrib/tensor_forest/kernels/tree_utils.h b/tensorflow/contrib/tensor_forest/kernels/tree_utils.h
index e04eb60f9b27cfd8b6b4e1502594d4d310ae55cc..774da472f1543f938d1b607ebdef008f7b540211 100644
--- a/tensorflow/contrib/tensor_forest/kernels/tree_utils.h
+++ b/tensorflow/contrib/tensor_forest/kernels/tree_utils.h
@@ -18,10 +18,10 @@
 #include <limits>
 
 #include "tensorflow/contrib/tensor_forest/kernels/data_spec.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/random/distribution_sampler.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
 #include "tensorflow/core/lib/strings/strcat.h"
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h b/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h
index d3edb43733761a906c6e5bf8b65f76e3e1ae56fc..3100a5a0e5da1103b61bd089cd433721686b9e72 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/decision-tree-resource.h
@@ -32,7 +32,7 @@ class DecisionTreeResource : public ResourceBase {
   // Constructor.
   explicit DecisionTreeResource(const TensorForestParams& params);
 
-  string DebugString() override {
+  string DebugString() const override {
     return strings::StrCat("DecisionTree[size=",
                            decision_tree_->decision_tree().nodes_size(), "]");
   }
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.h b/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.h
index eea0be27caf0a022ba7acaacd359c75a2df4eedb..44f2b3f473b9eced06bd800b9cf0a5a0825ec3eb 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.h
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/fertile-stats-resource.h
@@ -40,7 +40,7 @@ class FertileStatsResource : public ResourceBase {
     model_op_ = LeafModelOperatorFactory::CreateLeafModelOperator(params_);
   }
 
-  string DebugString() override { return "FertileStats"; }
+  string DebugString() const override { return "FertileStats"; }
 
   void ExtractFromProto(const FertileStats& stats);
 
diff --git a/tensorflow/contrib/tensor_forest/python/ops/model_ops.py b/tensorflow/contrib/tensor_forest/python/ops/model_ops.py
index 290c16fe3966791ea78986539750caf938a37322..40bf7081a3f22dfd68fd46f0f61695ee9ca7863b 100644
--- a/tensorflow/contrib/tensor_forest/python/ops/model_ops.py
+++ b/tensorflow/contrib/tensor_forest/python/ops/model_ops.py
@@ -35,7 +35,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import resources
 from tensorflow.python.platform import resource_loader
 from tensorflow.python.training import saver
-from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.training.tracking import tracking
 
 
 _model_ops = loader.load_op_library(
diff --git a/tensorflow/contrib/tensor_forest/python/ops/stats_ops.py b/tensorflow/contrib/tensor_forest/python/ops/stats_ops.py
index 9184198cd4c8fd2a7609714d094d5ef2b6868658..80afcfb251f4d6455a9eb8ba5df4a6e43d2feb1c 100644
--- a/tensorflow/contrib/tensor_forest/python/ops/stats_ops.py
+++ b/tensorflow/contrib/tensor_forest/python/ops/stats_ops.py
@@ -32,7 +32,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import resources
 from tensorflow.python.platform import resource_loader
 from tensorflow.python.training import saver
-from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.training.tracking import tracking
 
 
 _stats_ops = loader.load_op_library(
diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index 784acce444a8d0c066f1b7ae6c1b5d7d65405549..91b6d2614a8963c21e35c385411dc4c9956e3146 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -11,567 +11,54 @@ exports_files(["LICENSE"])
 
 load(
     "//tensorflow:tensorflow.bzl",
-    "tf_cc_test",
-    "tf_copts",
     "tf_cuda_library",
-    "tf_custom_op_library",
     "tf_custom_op_library_additional_deps",
-    "tf_gen_op_libs",
-    "tf_gen_op_wrapper_py",
 )
-load("//tensorflow:tensorflow.bzl", "cuda_py_test")
-load("//tensorflow:tensorflow.bzl", "cuda_py_tests")
-load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
-load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
-load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc")
 load(
     "@local_config_tensorrt//:build_defs.bzl",
     "if_tensorrt",
 )
 
-exports_files(glob([
-    "test/testdata/*",
-]))
-
-tf_cuda_cc_test(
-    name = "tensorrt_test_cc",
-    size = "small",
-    srcs = ["tensorrt_test.cc"],
-    tags = [
-        "no_windows",
-        "nomac",
-    ],
-    deps = [
-        "//tensorflow/core:gpu_init",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:stream_executor",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-    ] + if_tensorrt([
-        "@local_config_cuda//cuda:cuda_headers",
-        "@local_config_tensorrt//:nv_infer",
-    ]),
-)
-
-tf_custom_op_library(
-    name = "python/ops/_trt_engine_op.so",
-    srcs = [
-        "ops/trt_engine_op.cc",
-    ],
-    deps = [
-        ":trt_shape_function",
-        "//tensorflow/core:lib_proto_parsing",
-    ] + if_tensorrt([
-        "@local_config_tensorrt//:nv_infer",
-    ]),
-)
-
 tf_cuda_library(
     name = "trt_shape_function",
     srcs = ["shape_fn/trt_shfn.cc"],
     hdrs = ["shape_fn/trt_shfn.h"],
     visibility = ["//visibility:public"],
     deps = [
-        ":trt_logging",
-        ":trt_plugins",
+        "//tensorflow/compiler/tf2tensorrt:trt_logging",
+        "//tensorflow/compiler/tf2tensorrt:trt_plugins",
     ] + if_tensorrt([
-        "@local_config_tensorrt//:nv_infer",
+        "@local_config_tensorrt//:tensorrt",
     ]) + tf_custom_op_library_additional_deps(),
 )
 
-cc_library(
-    name = "trt_engine_op_kernel",
-    srcs = [
-        "kernels/trt_engine_op.cc",
-    ],
-    hdrs = [
-        "kernels/trt_engine_op.h",
-    ],
-    copts = tf_copts(),
-    visibility = ["//visibility:public"],
-    deps = [
-        ":test_utils",
-        ":trt_allocator",
-        ":trt_conversion",
-        ":trt_logging",
-        ":trt_plugins",
-        ":trt_resources",
-        ":utils",
-        "//tensorflow/core:gpu_headers_lib",
-        "//tensorflow/core:lib_proto_parsing",
-        "//tensorflow/core:stream_executor_headers_lib",
-        "//tensorflow/core/grappler/costs:graph_properties",
-    ] + if_tensorrt([
-        "@local_config_tensorrt//:nv_infer",
-    ]) + tf_custom_op_library_additional_deps(),
-    # TODO(laigd): fix this by merging header file in cc file.
-    alwayslink = 1,  # buildozer: disable=alwayslink-with-hdrs
-)
-
-tf_gen_op_libs(
-    op_lib_names = [
-        "trt_engine_op",
-    ],
-)
-
-tf_cuda_library(
-    name = "trt_logging",
-    srcs = ["log/trt_logger.cc"],
-    hdrs = ["log/trt_logger.h"],
-    visibility = ["//visibility:public"],
-    deps = [
-        "//tensorflow/core:lib_proto_parsing",
-    ] + if_tensorrt([
-        "@local_config_tensorrt//:nv_infer",
-    ]),
-)
-
-tf_gen_op_wrapper_py(
-    name = "trt_engine_op",
-    deps = [
-        ":trt_engine_op_op_lib",
-        ":trt_logging",
-        ":trt_shape_function",
-    ],
-)
-
-tf_custom_op_py_library(
-    name = "trt_engine_op_loader",
-    srcs = ["python/ops/trt_engine_op.py"],
-    dso = [
-        ":python/ops/_trt_engine_op.so",
-    ] + if_tensorrt([
-        "@local_config_tensorrt//:nv_infer",
-    ]),
-    kernels = [
-        ":trt_engine_op_kernel",
-        ":trt_engine_op_op_lib",
-        ":trt_shape_function",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/contrib/util:util_py",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:resources",
-    ],
-)
-
 py_library(
     name = "init_py",
     srcs = [
         "__init__.py",
         "python/__init__.py",
+        "python/trt_convert.py",
     ],
     srcs_version = "PY2AND3",
     deps = [
-        ":tf_trt_integration_test_base",
-        ":trt_convert_py",
-        ":trt_ops_py",
-        "//tensorflow/python:errors",
-    ],
-)
-
-py_library(
-    name = "trt_ops_py",
-    srcs_version = "PY2AND3",
-    deps = [
-        ":trt_engine_op",
-        ":trt_engine_op_loader",
-    ],
-)
-
-py_library(
-    name = "trt_convert_py",
-    srcs = ["python/trt_convert.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":wrap_conversion",
-        "//tensorflow/python:graph_util",
-        "//tensorflow/python:session",
-        "//tensorflow/python:tf_optimizer",
-        "//tensorflow/python/saved_model:builder",
-        "//tensorflow/python/saved_model:loader",
-        "//tensorflow/python/saved_model:tag_constants",
-    ],
-)
-
-# TODO(aaroey): this wrapper has been causing troubles of double linking, so
-# either get rid of it, or split to make it contain minimum dependencies.
-tf_py_wrap_cc(
-    name = "wrap_conversion",
-    srcs = ["trt_conversion.i"],
-    copts = tf_copts(),
-    swig_includes = [
-        "//tensorflow/python:platform/base.i",
-    ],
-    deps = [
-        ":test_utils",
-        ":trt_conversion",
-        ":trt_engine_op_kernel",
-        "//third_party/python_runtime:headers",
-    ],
-)
-
-tf_cuda_library(
-    name = "trt_resources",
-    srcs = [
-        "resources/trt_int8_calibrator.cc",
-        "resources/trt_resource_manager.cc",
-    ],
-    hdrs = [
-        "resources/trt_int8_calibrator.h",
-        "resources/trt_resource_manager.h",
-        "resources/trt_resources.h",
+        "//tensorflow/python/compiler/tensorrt:init_py",
     ],
-    deps = [
-        ":trt_allocator",
-        ":trt_logging",
-        ":utils",
-        "//tensorflow/core:framework_headers_lib",
-        "//tensorflow/core:framework_lite",
-        "//tensorflow/core:lib_proto_parsing",
-    ] + if_tensorrt([
-        "@local_config_tensorrt//:nv_infer",
-    ]),
 )
 
-tf_cuda_library(
-    name = "trt_allocator",
-    srcs = ["resources/trt_allocator.cc"],
-    hdrs = ["resources/trt_allocator.h"],
-    deps = [
-        "//tensorflow/core:framework_headers_lib",
-        "//tensorflow/core:framework_lite",
-        "//tensorflow/core:lib_proto_parsing",
-    ] + if_tensorrt([
-        "@local_config_tensorrt//:nv_infer",
-    ]),
-)
+# The following rules forward the libraries that were moved in order to not
+# break other internal targets.
 
-tf_cc_test(
-    name = "trt_allocator_test",
-    size = "small",
-    srcs = ["resources/trt_allocator_test.cc"],
-    tags = [
-        "no_windows",
-        "nomac",
-    ],
-    deps = [
-        ":trt_allocator",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-    ],
-)
-
-# Library for the node-level conversion portion of TensorRT operation creation
-tf_cuda_library(
+alias(
     name = "trt_conversion",
-    srcs = [
-        "convert/convert_graph.cc",
-        "convert/convert_nodes.cc",
-        "convert/trt_optimization_pass.cc",
-    ],
-    hdrs = [
-        "convert/convert_graph.h",
-        "convert/convert_nodes.h",
-        "convert/trt_optimization_pass.h",
-    ],
-    deps = [
-        ":segment",
-        ":test_utils",
-        ":trt_allocator",
-        ":trt_plugins",
-        ":trt_logging",
-        ":trt_resources",
-        ":utils",
-        "//tensorflow/core/grappler/clusters:cluster",
-        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
-        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
-        "//tensorflow/core/grappler:grappler_item",
-        "//tensorflow/core/grappler:utils",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:framework_lite",
-        "//tensorflow/core:gpu_runtime",
-        "//tensorflow/core:graph",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core/grappler:devices",
-        "//tensorflow/core/grappler/clusters:virtual_cluster",
-        "//tensorflow/core/grappler/costs:graph_properties",
-        "//tensorflow/core/grappler/optimizers:meta_optimizer",
-    ] + if_tensorrt([
-        "@local_config_tensorrt//:nv_infer",
-    ]) + tf_custom_op_library_additional_deps(),
-)
-
-tf_cuda_cc_test(
-    name = "convert_graph_test",
-    size = "medium",
-    srcs = ["convert/convert_graph_test.cc"],
-    tags = [
-        "no_cuda_on_cpu_tap",
-        "no_windows",
-        "nomac",
-    ],
-    deps = [
-        ":trt_conversion",
-        "@com_google_googletest//:gtest",
-        "//tensorflow/cc:cc_ops",
-        "//tensorflow/cc:scope",
-        "//tensorflow/core/grappler:grappler_item",
-        "//tensorflow/core/grappler/clusters:cluster",
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:core_cpu_base",
-        "//tensorflow/core:direct_session",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-        "//tensorflow/core:testlib",
-    ] + if_tensorrt([
-        "@local_config_tensorrt//:nv_infer",
-    ]),
-)
-
-tf_cuda_cc_test(
-    name = "convert_nodes_test",
-    size = "medium",
-    srcs = ["convert/convert_nodes_test.cc"],
-    tags = [
-        "no_cuda_on_cpu_tap",
-        "no_windows",
-        "nomac",
-    ],
-    deps = [
-        ":trt_logging",
-        ":trt_conversion",
-        ":trt_plugins",
-        "@com_google_googletest//:gtest",
-        "//tensorflow/cc:cc_ops",
-        "//tensorflow/cc:ops",
-        "//tensorflow/cc:scope",
-        "//tensorflow/core/grappler/costs:graph_properties",
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:core_cpu_base",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-        "//tensorflow/core:testlib",
-    ] + if_tensorrt([
-        "@local_config_cuda//cuda:cuda_headers",
-        "@local_config_tensorrt//:nv_infer",
-    ]),
-)
-
-# Library for the segmenting portion of TensorRT operation creation
-cc_library(
-    name = "segment",
-    srcs = ["segment/segment.cc"],
-    hdrs = [
-        "segment/segment.h",
-        "segment/union_find.h",
-    ],
-    deps = [
-        "//tensorflow/core:graph",
-        "//tensorflow/core:lib_proto_parsing",
-        "//tensorflow/core:protos_all_cc",
-        "@protobuf_archive//:protobuf_headers",
-    ],
-)
-
-tf_cc_test(
-    name = "segment_test",
-    size = "small",
-    srcs = ["segment/segment_test.cc"],
-    tags = [
-        "no_windows",
-        "nomac",
-    ],
-    deps = [
-        ":segment",
-        "//tensorflow/cc:cc_ops",
-        "//tensorflow/cc:scope",
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:ops",
-        "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-        "//tensorflow/core:testlib",
-    ],
-)
-
-# Library for the plugin factory
-tf_cuda_library(
-    name = "trt_plugins",
-    srcs = [
-        "plugin/trt_plugin.cc",
-        "plugin/trt_plugin_factory.cc",
-        "plugin/trt_plugin_utils.cc",
-    ],
-    hdrs = [
-        "plugin/trt_plugin.h",
-        "plugin/trt_plugin_factory.h",
-        "plugin/trt_plugin_utils.h",
-    ],
-    deps = [
-        "//tensorflow/core:framework_lite",
-        "//tensorflow/core:lib_proto_parsing",
-    ] + if_tensorrt([
-        "@local_config_tensorrt//:nv_infer",
-    ]),
-)
-
-tf_cuda_cc_test(
-    name = "trt_plugin_factory_test",
-    size = "small",
-    srcs = ["plugin/trt_plugin_factory_test.cc"],
-    tags = [
-        "no_cuda_on_cpu_tap",
-        "no_windows",
-        "nomac",
-    ],
-    deps = [
-        ":trt_plugins",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-    ] + if_tensorrt([
-        "@local_config_cuda//cuda:cuda_headers",
-        "@local_config_tensorrt//:nv_infer",
-    ]),
-)
-
-py_library(
-    name = "tf_trt_integration_test_base",
-    srcs = ["test/tf_trt_integration_test_base.py"],
-    deps = [
-        ":trt_convert_py",
-        ":trt_ops_py",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_test_lib",
-    ],
-)
-
-cuda_py_test(
-    name = "trt_convert_test",
-    srcs = ["python/trt_convert_test.py"],
-    additional_deps = [
-        ":trt_convert_py",
-        ":trt_ops_py",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:graph_util",
-        "//tensorflow/python/saved_model:builder",
-        "//tensorflow/python/saved_model:loader",
-        "//tensorflow/python/saved_model:signature_constants",
-        "//tensorflow/python/saved_model:signature_def_utils",
-        "//tensorflow/python/saved_model:tag_constants",
-        "//tensorflow/python/saved_model:utils",
-        "//tensorflow/python/tools:freeze_graph_lib",
-        "//tensorflow/python/tools:saved_model_utils",
-    ],
-    tags = [
-        "no_cuda_on_cpu_tap",
-        "no_windows",
-        "nomac",
-    ],
-)
-
-cuda_py_tests(
-    name = "tf_trt_integration_test",
-    srcs = [
-        "test/base_test.py",
-        "test/batch_matmul_test.py",
-        "test/biasadd_matmul_test.py",
-        "test/binary_tensor_weight_broadcast_test.py",
-        "test/concatenation_test.py",
-        "test/const_broadcast_test.py",
-        "test/manual_test.py",
-        "test/memory_alignment_test.py",
-        "test/multi_connection_neighbor_engine_test.py",
-        "test/neighboring_engine_test.py",
-        "test/quantization_test.py",
-        "test/rank_two_test.py",
-        "test/reshape_transpose_test.py",
-        "test/vgg_block_nchw_test.py",
-        "test/vgg_block_test.py",
-    ],
-    additional_deps = [
-        ":tf_trt_integration_test_base",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_test_lib",
-    ],
-    tags = [
-        "no_cuda_on_cpu_tap",
-        "no_windows",
-        "nomac",
-    ],
-)
-
-cuda_py_tests(
-    name = "tf_trt_integration_test_no_oss",
-    srcs = [
-        "test/unary_test.py",
-    ],
-    additional_deps = [
-        ":tf_trt_integration_test_base",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_test_lib",
-    ],
-    tags = [
-        "no_cuda_on_cpu_tap",
-        "no_oss",  # TODO(b/117274186): re-enable in OSS after crash fixed
-        "no_pip",  # TODO(b/117274186): re-enable in OSS after crash fixed
-        "no_windows",
-        "nomac",
-    ],
+    actual = "//tensorflow/compiler/tf2tensorrt:trt_conversion",
 )
 
-cuda_py_test(
-    name = "quantization_mnist_test",
-    srcs = ["test/quantization_mnist_test.py"],
-    additional_deps = [
-        ":tf_trt_integration_test_base",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python/keras:keras",
-        "//tensorflow/python/estimator:estimator",
-    ],
-    data = [
-        "test/testdata/checkpoint",
-        "test/testdata/model.ckpt-46900.data-00000-of-00001",
-        "test/testdata/model.ckpt-46900.index",
-    ],
-    tags = [
-        "no_cuda_on_cpu_tap",
-        "no_pip",
-        "no_tap",  # It is not able to download the mnist data.
-        "no_windows",
-        "nomac",
-    ],
+alias(
+    name = "trt_op_kernels",
+    actual = "//tensorflow/compiler/tf2tensorrt:trt_op_kernels",
 )
 
-cc_library(
-    name = "utils",
-    srcs = ["convert/utils.cc"],
-    hdrs = ["convert/utils.h"],
-    copts = tf_copts(),
-    deps = [
-        "//tensorflow/core:lib",
-    ],
-)
-
-cc_library(
-    name = "test_utils",
-    srcs = ["test/utils.cc"],
-    hdrs = ["test/utils.h"],
-    deps = [
-        "//tensorflow/core:lib",
-        "@com_googlesource_code_re2//:re2",
-    ],
+alias(
+    name = "trt_engine_op_op_lib",
+    actual = "//tensorflow/compiler/tf2tensorrt:trt_engine_op_op_lib",
 )
diff --git a/tensorflow/contrib/tensorrt/README.md b/tensorflow/contrib/tensorrt/README.md
deleted file mode 100644
index caf8b6db0dc0a220d593f9c0afc9464ca51a1e05..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tensorrt/README.md
+++ /dev/null
@@ -1,29 +0,0 @@
-# Using TensorRT in TensorFlow
-
-This module provides necessary bindings and introduces TRT_engine_op operator
-that wraps a subgraph in TensorRT. This is still a work in progress but should
-be useable with most common graphs.
-
-## Compilation
-
-In order to compile the module, you need to have a local TensorRT installation
-(libnvinfer.so and respective include files). During the configuration step,
-TensorRT should be enabled and installation path should be set. If installed
-through package managers (deb,rpm), configure script should find the necessary
-components from the system automatically. If installed from tar packages, user
-has to set path to location where the library is installed during configuration.
-
-```shell
-bazel build --config=cuda --config=opt //tensorflow/tools/pip_package:build_pip_package
-bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/
-```
-
-After the installation of tensorflow package, TensorRT transformation will be
-available. An example use can be found in test/test_tftrt.py script
-
-## Installing TensorRT 3.0.4
-
-In order to make use of TensorRT integration, you will need a local installation
-of TensorRT 3.0.4 from the [NVIDIA Developer website](https://developer.nvidia.com/tensorrt).
-Installation instructions for compatibility with TensorFlow are provided on the
-[TensorFlow GPU support](https://www.tensorflow.org/install/gpu) guide.
diff --git a/tensorflow/contrib/tensorrt/__init__.py b/tensorflow/contrib/tensorrt/__init__.py
index 140ad4828208ae4844a49bf664955b50cd9e51cd..fd551d70b4385b14b84b7b98a6d16b0c03733d38 100644
--- a/tensorflow/contrib/tensorrt/__init__.py
+++ b/tensorflow/contrib/tensorrt/__init__.py
@@ -18,18 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import errors
-
-# pylint: disable=unused-import,wildcard-import,g-import-not-at-top
-try:
-  from tensorflow.contrib.tensorrt.python import *
-except errors.NotFoundError as e:
-  no_trt_message = (
-      '**** Failed to initialize TensorRT. This is either because the TensorRT'
-      ' installation path is not in LD_LIBRARY_PATH, or because you do not have'
-      ' it installed. If not installed, please go to'
-      ' https://developer.nvidia.com/tensorrt to download and install'
-      ' TensorRT ****')
-  print(no_trt_message)
-  raise e
-# pylint: enable=unused-import,wildcard-import,g-import-not-at-top
+# pylint: disable=unused-import,wildcard-import
+from tensorflow.contrib.tensorrt.python import *
+# pylint: enable=unused-import,wildcard-import
diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/BUILD b/tensorflow/contrib/tensorrt/custom_plugin_examples/BUILD
index 69058c5826822c519a69d50860c06b8ab3ec6578..0a2cf105baf5efb62d0c535c1f2d081973ec0ea3 100644
--- a/tensorflow/contrib/tensorrt/custom_plugin_examples/BUILD
+++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/BUILD
@@ -45,10 +45,10 @@ tf_custom_op_library(
         "inc_op_kernel.cu.cc",
     ],
     deps = [
-        "//tensorflow/contrib/tensorrt:trt_plugins",
+        "//tensorflow/compiler/tf2tensorrt:trt_plugins",
         "//tensorflow/core:framework_lite",
     ] + if_tensorrt([
-        "@local_config_tensorrt//:nv_infer",
+        "@local_config_tensorrt//:tensorrt",
     ]),
 )
 
@@ -64,10 +64,10 @@ tf_kernel_library(
         "inc_op_kernel.cu.cc",
     ],
     deps = [
-        "//tensorflow/contrib/tensorrt:trt_plugins",
+        "//tensorflow/compiler/tf2tensorrt:trt_plugins",
         "//tensorflow/core:stream_executor_headers_lib",
     ] + if_tensorrt([
-        "@local_config_tensorrt//:nv_infer",
+        "@local_config_tensorrt//:tensorrt",
     ]) + tf_custom_op_library_additional_deps(),
 )
 
diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.cc b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.cc
index 8d4c893af56689185da72398919e2241d451594b..7c9075142a02546ddd580e861ac87cb86badd739 100644
--- a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.cc
+++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.cc
@@ -15,8 +15,8 @@ limitations under the License.
 
 #include "tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.h"
 
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.h"
 #include "tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_kernel.h"
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.h b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.h
index 189e9c939b9ffd4450f7ba95fe1abdbbc049b430..fb048d7b19da0f010ed918b147013b20d37ed0dd 100644
--- a/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.h
+++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/inc_op_plugin.h
@@ -19,7 +19,7 @@ limitations under the License.
 #include <cassert>
 #include <cstring>
 
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin.h"
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
deleted file mode 100644
index b545f497f32d5a1a6960b748467ca189b7debf6c..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
+++ /dev/null
@@ -1,145 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_KERNELS_TRT_ENGINE_OP_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_KERNELS_TRT_ENGINE_OP_H_
-
-#include <memory>
-#include <vector>
-
-#include "tensorflow/contrib/tensorrt/convert/utils.h"
-#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_allocator.h"
-#include "tensorflow/core/framework/function.h"
-#include "tensorflow/core/framework/graph.pb.h"
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/platform/mutex.h"
-
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
-#include "cuda/include/cuda_runtime_api.h"
-#include "tensorrt/include/NvInfer.h"
-
-namespace tensorflow {
-namespace tensorrt {
-struct TRTInt8Calibrator;
-class TRTCalibrationResource;
-class AsyncHelper;
-//  TODO(Sami): Remove this file?
-
-//  This OP can construct TRTEngine on the fly and if construction of engine
-//  fails, executes equivalent subgraph as a TensorFlow function.
-class TRTEngineOp : public AsyncOpKernel {
- public:
-  explicit TRTEngineOp(OpKernelConstruction* context);
-
-  void ComputeAsync(OpKernelContext* context,
-                    AsyncOpKernel::DoneCallback done) override;
-  ~TRTEngineOp();
-
- private:
-  // Execute calibration
-  void ExecuteCalibration(OpKernelContext* ctx, AsyncHelper* helper);
-
-  // Construct a function handle for executing native funcdef graph
-  Status ConstructFunctionHandle(OpKernelContext* ctx);
-
-  // Execute replaced native segment as function Op.
-  void ExecuteNativeSegment(OpKernelContext* ctx, AsyncHelper* helper);
-
-  // Execute the tensorrt engine. Returns whether we need to retry by running
-  // the native segment.
-  bool ExecuteTrtEngine(OpKernelContext* ctx, const int num_batch,
-                        nvinfer1::ICudaEngine* trt_engine_ptr,
-                        nvinfer1::IExecutionContext* trt_execution_context_ptr);
-
-  // Allocate necessary resources for calibration
-  Status AllocateCalibrationResources(OpKernelContext* ctx,
-                                      TRTCalibrationResource** cr);
-
-  // TODO(samikama): context should go to a resource manager!
-  typedef std::pair<TrtUniquePtrType<nvinfer1::ICudaEngine>,
-                    TrtUniquePtrType<nvinfer1::IExecutionContext>>
-      EngineCtxPair;
-  EngineCtxPair& GetEngine(int batch_size, OpKernelContext* ctx);
-
-  // Return engine batch closest to input batch.
-  int GetEngineBatch(OpKernelContext* ctx);
-
-  nvinfer1::IGpuAllocator* GetAllocator(OpKernelContext* ctx);
-
-  // map to keep engines and their execution context for given batch size.
-  std::unordered_map<int, EngineCtxPair> engine_map_;
-  std::vector<string> input_nodes_;
-  std::vector<string> output_nodes_;
-
-  // keep device allocator for TRT.
-  std::unique_ptr<TRTBaseAllocator> allocator_;
-
-  // serialized protobuf segment or trt engine depending on static_engine_ flag.
-  string serialized_segment_;
-
-  // Name of the function for TF native execution of the segment.
-  string funcdef_name_;
-
-  // GraphDef representation of the segment.
-  GraphDef segment_graph_;
-
-  // Lookup table for temporary staging areas of input tensors for calibration.
-  std::unordered_map<string, std::pair<void*, size_t>> device_buffers_;
-
-  // Temporary staging areas for calibration inputs.
-  std::vector<PersistentTensor> dev_tensors_;
-
-  // Engine Precision mode.
-  int precision_mode_;
-
-  // Whether engine is constructed during the conversion or needs to be
-  // constructed from protobuf segment.
-  bool static_engine_;
-
-  // Whether to calibrate INT8 engine.
-  bool calibration_mode_;
-
-  // Whether non-batch ranks of the inputs are assumed to be fixed or not for
-  // engine construction.
-  bool fixed_input_size_;
-
-  // Batches of the cached engines
-  std::vector<int> cached_engine_batches_;
-
-  // Maximum number of cached engines
-  int max_cached_engines_;
-
-  int64 workspace_size_;
-  mutex engine_mutex_;
-  FunctionLibraryRuntime::Handle native_func_;
-
-  // The finalized calibrator for inference.
-  std::unique_ptr<TRTInt8Calibrator> calibrator_;
-
-  // If true, create calibration graph for INT8 mode. Otherwise, we are using
-  // user-provided quantization ranges.
-  bool use_calibration_;
-};
-
-}  // namespace tensorrt
-}  // namespace tensorflow
-
-#endif  // GOOGLE_TENSORRT
-#endif  // GOOGLE_CUDA
-
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_KERNELS_TRT_ENGINE_OP_H_
diff --git a/tensorflow/contrib/tensorrt/python/__init__.py b/tensorflow/contrib/tensorrt/python/__init__.py
index 7cdfe2b1a612be2eec473d806d0eb44b611ca68a..0cae401023e7d3e3780b9dd2e2a92c9fd0e92db8 100644
--- a/tensorflow/contrib/tensorrt/python/__init__.py
+++ b/tensorflow/contrib/tensorrt/python/__init__.py
@@ -19,12 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=unused-import,line-too-long
-from tensorflow.contrib.tensorrt.python.ops import trt_engine_op
-from tensorflow.contrib.tensorrt.python.trt_convert import add_test_value
 from tensorflow.contrib.tensorrt.python.trt_convert import calib_graph_to_infer_graph
-from tensorflow.contrib.tensorrt.python.trt_convert import clear_test_values
 from tensorflow.contrib.tensorrt.python.trt_convert import create_inference_graph
-from tensorflow.contrib.tensorrt.python.trt_convert import enable_test_value
-from tensorflow.contrib.tensorrt.python.trt_convert import get_test_value
-from tensorflow.contrib.tensorrt.python.trt_convert import is_tensorrt_enabled
 # pylint: enable=unused-import,line-too-long
diff --git a/tensorflow/contrib/tensorrt/python/trt_convert.py b/tensorflow/contrib/tensorrt/python/trt_convert.py
index 203b2697babe32b45523109708cbf062dceee33b..4a959378138dec6f1c1a3f490704d7aebeae9b47 100644
--- a/tensorflow/contrib/tensorrt/python/trt_convert.py
+++ b/tensorflow/contrib/tensorrt/python/trt_convert.py
@@ -18,404 +18,41 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import six as _six
-# pylint: disable=unused-import,line-too-long
-from tensorflow.contrib.tensorrt.wrap_conversion import add_test_value
-from tensorflow.contrib.tensorrt.wrap_conversion import calib_convert
-from tensorflow.contrib.tensorrt.wrap_conversion import clear_test_values
-from tensorflow.contrib.tensorrt.wrap_conversion import enable_test_value
-from tensorflow.contrib.tensorrt.wrap_conversion import get_linked_tensorrt_version
-from tensorflow.contrib.tensorrt.wrap_conversion import get_loaded_tensorrt_version
-from tensorflow.contrib.tensorrt.wrap_conversion import get_test_value
-from tensorflow.contrib.tensorrt.wrap_conversion import is_tensorrt_enabled
-# pylint: enable=unused-import,line-too-long
-from tensorflow.core.framework import graph_pb2
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.core.protobuf import meta_graph_pb2
-from tensorflow.core.protobuf import rewriter_config_pb2
-from tensorflow.python.client import session
-from tensorflow.python.framework import errors_impl as _impl
-from tensorflow.python.framework import graph_util
-from tensorflow.python.framework import importer
-from tensorflow.python.framework import ops
-from tensorflow.python.grappler import tf_optimizer
-from tensorflow.python.platform import tf_logging
-from tensorflow.python.saved_model import builder
-from tensorflow.python.saved_model import loader_impl
-from tensorflow.python.saved_model import tag_constants
-from tensorflow.python.training import saver
-
-if _six.PY2:
-  _to_bytes = lambda s: s
-  _to_string = lambda s: s
-else:
-  _to_bytes = lambda s: s.encode("utf-8", errors="surrogateescape")
-  _to_string = lambda s: s.decode("utf-8")
-
-
-class TrtPrecisionMode(object):
-  FP32 = "FP32"
-  FP16 = "FP16"
-  INT8 = "INT8"
-
-  @staticmethod
-  def supported_precision_modes():
-    return [TrtPrecisionMode.FP32, TrtPrecisionMode.FP16, TrtPrecisionMode.INT8]
-
-
-def get_tensorrt_rewriter_config(rewriter_config=None,
-                                 max_batch_size=1,
-                                 max_workspace_size_bytes=2 << 20,
-                                 precision_mode=TrtPrecisionMode.FP32,
-                                 minimum_segment_size=3,
-                                 is_dynamic_op=False,
-                                 maximum_cached_engines=1,
-                                 cached_engine_batch_sizes=None,
-                                 use_calibration=True):
-  """Returns a RewriterConfig proto for TRT transformation.
-
-  Args:
-    rewriter_config: a template RewriterConfig proto used to create a
-      TRT-enabled RewriterConfig. If None, it will use a default one.
-    max_batch_size: max size for the input batch
-    max_workspace_size_bytes: the maximum GPU temporary memory which the TRT
-      engine can use at execution time. This corresponds to the 'workspaceSize'
-      parameter of nvinfer1::IBuilder::setMaxWorkspaceSize().
-    precision_mode: one of TrtPrecisionMode.supported_precision_modes().
-    minimum_segment_size: the minimum number of nodes required for a subgraph to
-      be replaced by TRTEngineOp.
-    is_dynamic_op: whether to generate dynamic TRT ops which will build the TRT
-      network and engine at run time.
-    maximum_cached_engines: max number of cached TRT engines in dynamic TRT ops.
-      If the number of cached engines is already at max but none of them can
-      serve the input, the TRTEngineOp will fall back to run the TF function
-      based on which the TRTEngineOp is created.
-    cached_engine_batch_sizes: a list of batch sizes used to create cached
-      engines, only used when is_dynamic_op is True. The length of the list
-      should be smaller than maximum_cached_engines, and the dynamic TRT op will
-      use this list to determine the batch sizes of the cached engines, instead
-      of making the decision on the fly. This is useful when we know the most
-      common batch size(s) the application is going to generate.
-    use_calibration: this argument is ignored if precision_mode is not INT8. If
-      set to True, a calibration graph will be created to calibrate the missing
-      ranges. The calibration graph must be converted to an inference graph
-      using calib_graph_to_infer_graph() after running calibration. if set to
-      False, quantization nodes will be expected for every tensor in the graph
-      (exlcuding those which will be fused). If a range is missing, an error
-      will occur. Please note that accuracy may be negatively affected if there
-      is a mismatch between which tensors TRT quantizes and which tensors were
-      trained with fake quantization.
-
-  Returns:
-    A RewriterConfig proto which sets a TensorRTOptimizer to run Grappler.
-
-  Raises:
-    TypeError: if any of the parameters are of unexpected type.
-    ValueError: if any of the parameters are of unexpected value.
-  """
-  if rewriter_config is not None and not isinstance(
-      rewriter_config, rewriter_config_pb2.RewriterConfig):
-    raise TypeError("rewriter_config should be a RewriterConfig proto.")
-
-  rewriter_config_with_trt = rewriter_config_pb2.RewriterConfig()
-  if rewriter_config is None:
-    # Layout optimizer may add Const nodes followed by Reshape nodes, thus we
-    # need to run constant folding again.
-    rewriter_config_with_trt.optimizers.extend(
-        ["constfold", "layout", "constfold"])
-    rewriter_config_with_trt.meta_optimizer_iterations = (
-        rewriter_config_pb2.RewriterConfig.ONE)
-  else:
-    rewriter_config_with_trt.CopyFrom(rewriter_config)
-
-  if precision_mode.upper() not in TrtPrecisionMode.supported_precision_modes():
-    raise ValueError(("precision mode '{}' is not supported."
-                      "It should be one of {}").format(
-                          precision_mode,
-                          TrtPrecisionMode.supported_precision_modes))
-
-  optimizer = rewriter_config_with_trt.custom_optimizers.add()
-  optimizer.name = "TensorRTOptimizer"
-  optimizer.parameter_map["minimum_segment_size"].i = minimum_segment_size
-  optimizer.parameter_map["max_batch_size"].i = max_batch_size
-  optimizer.parameter_map["is_dynamic_op"].b = is_dynamic_op
-  optimizer.parameter_map[
-      "max_workspace_size_bytes"].i = max_workspace_size_bytes
-  optimizer.parameter_map["precision_mode"].s = _to_bytes(precision_mode)
-  optimizer.parameter_map["maximum_cached_engines"].i = maximum_cached_engines
-  if cached_engine_batch_sizes:
-    if not isinstance(cached_engine_batch_sizes, list):
-      raise TypeError("cached_engine_batch_sizes should be a list.")
-    if len(cached_engine_batch_sizes) > maximum_cached_engines:
-      raise ValueError("cached_engine_batch_sizes should not contain more than "
-                       "maximum_cached_engines items.")
-    optimizer.parameter_map["cached_engine_batches"].list.i.extend(
-        cached_engine_batch_sizes)
-  optimizer.parameter_map["use_calibration"].b = use_calibration
-  return rewriter_config_with_trt
-
-
-def create_inference_graph(input_graph_def,
-                           outputs,
-                           max_batch_size=1,
-                           max_workspace_size_bytes=2 << 20,
-                           precision_mode=TrtPrecisionMode.FP32,
-                           minimum_segment_size=3,
-                           is_dynamic_op=False,
-                           maximum_cached_engines=1,
-                           cached_engine_batch_sizes=None,
-                           use_calibration=True,
-                           input_saved_model_dir=None,
-                           input_saved_model_tags=None,
-                           output_saved_model_dir=None,
-                           session_config=None):
-  """Python wrapper for the TRT transformation.
-
-  Args:
-    input_graph_def: a GraphDef object containing a model to be transformed. If
-      set to None, the graph will be read from the SavedModel loaded from
-      input_saved_model_dir.
-    outputs: list of tensors or node names for the model outputs. Only used when
-      input_graph_def is not None.
-    max_batch_size: max size for the input batch.
-    max_workspace_size_bytes: the maximum GPU temporary memory which the TRT
-      engine can use at execution time. This corresponds to the 'workspaceSize'
-      parameter of nvinfer1::IBuilder::setMaxWorkspaceSize().
-    precision_mode: one of TrtPrecisionMode.supported_precision_modes().
-    minimum_segment_size: the minimum number of nodes required for a subgraph to
-      be replaced by TRTEngineOp.
-    is_dynamic_op: whether to generate dynamic TRT ops which will build the TRT
-      network and engine at run time.
-    maximum_cached_engines: max number of cached TRT engines in dynamic TRT ops.
-      If the number of cached engines is already at max but none of them can
-      serve the input, the TRTEngineOp will fall back to run the TF function
-      based on which the TRTEngineOp is created.
-    cached_engine_batch_sizes: a list of batch sizes used to create cached
-      engines, only used when is_dynamic_op is True. The length of the list
-      should be smaller than maximum_cached_engines, and the dynamic TRT op will
-      use this list to determine the batch sizes of the cached engines, instead
-      of making the decision on the fly. This is useful when we know the most
-      common batch size(s) the application is going to generate.
-    use_calibration: this argument is ignored if precision_mode is not INT8. If
-      set to True, a calibration graph will be created to calibrate the missing
-      ranges. The calibration graph must be converted to an inference graph
-      using calib_graph_to_infer_graph() after running calibration. if set to
-      False, quantization nodes will be expected for every tensor in the graph
-      (exlcuding those which will be fused). If a range is missing, an error
-      will occur. Please note that accuracy may be negatively affected if there
-      is a mismatch between which tensors TRT quantizes and which tensors were
-      trained with fake quantization.
-    input_saved_model_dir: the directory to load the SavedModel which contains
-      the input graph to transforms. Used only when input_graph_def is None.
-    input_saved_model_tags: list of tags to load the SavedModel.
-    output_saved_model_dir: if not None, construct a SavedModel using the
-      returned GraphDef and save it to the specified directory. This option only
-      works when the input graph is loaded from a SavedModel, i.e. when
-      input_saved_model_dir is specified and input_graph_def is None.
-    session_config: the ConfigProto used to create a Session. It's also used as
-      a template to create a TRT-enabled ConfigProto for conversion. If not
-      specified, a default ConfigProto will be used.
-
-  Returns:
-    A GraphDef transformed from input_graph_def (or the SavedModel graph def
-    loaded from input_saved_model_dir, if input_graph_def is not present), where
-    all TRT compatible subgraphs are replaced with TRTEngineOps, and a TF
-    function is added for each of the subgraphs.
-
-    If is_dynamic_op is True, each TRTEngineOp will contain a serialized
-    subgraph GraphDef, which will be converted to a TRT engine at execution time
-    and the TRT engine will be cached for future usage. A new TRT engine will be
-    created each time when none of the cached engines match the input shapes. If
-    it fails to execute the TRT engine or the number of cached engines reaches
-    maximum_cached_engines, the op will fall back to call the corresponding TF
-    function.
-
-    If is_dynamic_op is False, each TRTEngineOp will contain a serialized TRT
-    engine created from the corresponding subgraph. No more engines will be
-    created on the fly, and the op will fall back to call the corresponding TF
-    function when it fails to execute the engine.
-
-  Raises:
-    ValueError: if the combination of the parameters is invalid.
-    RuntimeError: if the TensorRT library version is incompatible.
-  """
-  compiled_version = get_linked_tensorrt_version()
-  loaded_version = get_loaded_tensorrt_version()
-  version_mismatch = False
-  if loaded_version[0] < compiled_version[0]:
-    tf_logging.error(
-        "TensorRT version mismatch. Tensorflow was compiled against " +
-        "TensorRT %s but library loaded from environment is TensorRT %s" %
-        (".".join([str(x) for x in compiled_version]),
-         ".".join([str(x) for x in loaded_version])) +
-        ". Please make sure that correct version of TensorRT " +
-        "is available in the system and added to ldconfig or LD_LIBRARY_PATH")
-    raise RuntimeError("Incompatible TensorRT library version")
-  for i in zip(loaded_version, compiled_version):
-    if i[0] != i[1]:
-      tf_logging.warn("TensorRT mismatch. Compiled against version " +
-                      "%s, but loaded %s. Things may not work" %
-                      (".".join([str(x) for x in compiled_version]),
-                       ".".join([str(x) for x in loaded_version])))
-      version_mismatch = True
-      break
-  if not version_mismatch:
-    tf_logging.info("Running against TensorRT version %s" % ".".join(
-        [str(x) for x in loaded_version]))
-
-  if session_config is None:
-    session_config = config_pb2.ConfigProto()
-
-  if input_saved_model_tags is None:
-    input_saved_model_tags = [tag_constants.SERVING]
-  saved_model_loader = None
-  grappler_meta_graph_def = None
-
-  if input_graph_def is None:
-    # Read from SavedModel and freeze the graph if necessary.
-    if input_saved_model_dir is None:
-      raise ValueError("input_graph_def and input_saved_model_dir cannot be "
-                       "both None")
-    with ops.Graph().as_default():
-      with session.Session(config=session_config) as sess:
-        saved_model_loader = loader_impl.SavedModelLoader(input_saved_model_dir)
-        input_meta_graph_def = saved_model_loader.load(sess,
-                                                       input_saved_model_tags)
-        output_node_names = set()
-
-        def _gather_names(tensor_info):
-          """Get the node names from a TensorInfo."""
-          return set(
-              [tensor_info[key].name.split(":")[0] for key in tensor_info])
-
-        # Get input and outputs from all SignatureDef.
-        for key in input_meta_graph_def.signature_def:
-          signature_def = input_meta_graph_def.signature_def[key]
-          output_node_names.update(_gather_names(signature_def.inputs))
-          output_node_names.update(_gather_names(signature_def.outputs))
-
-        # Freeze the variables in the SavedModel graph and copy the frozen
-        # graph over.
-        frozen_graph_def = graph_util.convert_variables_to_constants(
-            sess, sess.graph.as_graph_def(add_shapes=True),
-            list(output_node_names))
-        grappler_meta_graph_def = meta_graph_pb2.MetaGraphDef()
-        grappler_meta_graph_def.graph_def.CopyFrom(frozen_graph_def)
-
-        # Copy the collections that are not variables.
-        for key in input_meta_graph_def.collection_def:
-          # TODO(laigd): currently we use the collection key to filter out
-          # collections that depend on variable ops, but this may miss some
-          # other user-defined collections. A better way would be to use
-          # CollectionDef::NodeList for the filtering.
-          if key not in [
-              "variables", "local_variables", "model_variables",
-              "trainable_variables", "train_op", "table_initializer"
-          ]:
-            grappler_meta_graph_def.collection_def[key].CopyFrom(
-                input_meta_graph_def.collection_def[key])
-
-        # Copy other information.
-        grappler_meta_graph_def.meta_info_def.CopyFrom(
-            input_meta_graph_def.meta_info_def)
-        for key in input_meta_graph_def.signature_def:
-          grappler_meta_graph_def.signature_def[key].CopyFrom(
-              input_meta_graph_def.signature_def[key])
-        # TODO(laigd): maybe add back AssetFileDef.
-  else:
-    if output_saved_model_dir is not None:
-      raise ValueError("output_saved_model_dir cannot be set when "
-                       "input_graph_def is set")
-    # Create MetaGraphDef from input graph.
-    graph = ops.Graph()
-    with graph.as_default():
-      importer.import_graph_def(input_graph_def, name="")
-    grappler_meta_graph_def = saver.export_meta_graph(
-        graph_def=graph.as_graph_def(add_shapes=True), graph=graph)
-    if outputs:
-      output_collection = meta_graph_pb2.CollectionDef()
-      output_list = output_collection.node_list.value
-      for i in outputs:
-        if isinstance(i, ops.Tensor):
-          output_list.append(_to_bytes(i.name))
-        else:
-          output_list.append(_to_bytes(i))
-      # TODO(laigd): use another key as the outputs are really not train_op.
-      grappler_meta_graph_def.collection_def["train_op"].CopyFrom(
-          output_collection)
-
-  # Create TRT-enabled ConfigProto.
-  session_config_with_trt = config_pb2.ConfigProto()
-  session_config_with_trt.CopyFrom(session_config)
-  rewriter_config = None
-  if (session_config_with_trt.HasField("graph_options") and
-      session_config_with_trt.graph_options.HasField("rewrite_options")):
-    rewriter_config = session_config_with_trt.graph_options.rewrite_options
-  rewriter_config_with_trt = get_tensorrt_rewriter_config(
-      rewriter_config, max_batch_size, max_workspace_size_bytes, precision_mode,
-      minimum_segment_size, is_dynamic_op, maximum_cached_engines,
-      cached_engine_batch_sizes, use_calibration)
-  session_config_with_trt.graph_options.rewrite_options.CopyFrom(
-      rewriter_config_with_trt)
-
-  # Run Grappler.
-  transformed_graph_def = tf_optimizer.OptimizeGraph(
-      session_config_with_trt, grappler_meta_graph_def, graph_id=b"tf_graph")
-
-  # Optionally write the transformed graphdef as SavedModel.
-  if output_saved_model_dir is not None:
-    saved_model_builder = builder.SavedModelBuilder(output_saved_model_dir)
-    with ops.Graph().as_default():
-      importer.import_graph_def(transformed_graph_def, name="")
-      # We don't use TRT here.
-      with session.Session(config=session_config) as sess:
-        saved_model_builder.add_meta_graph_and_variables(
-            sess,
-            input_saved_model_tags,
-            signature_def_map=grappler_meta_graph_def.signature_def)
-    # Ignore other meta graphs from the input SavedModel.
-    saved_model_builder.save()
-
-  return transformed_graph_def
+from tensorflow.python.compiler.tensorrt import trt_convert
+
+
+def create_inference_graph(
+    input_graph_def,
+    outputs,
+    max_batch_size=1,
+    max_workspace_size_bytes=trt_convert.DEFAULT_TRT_MAX_WORKSPACE_SIZE_BYTES,
+    precision_mode=trt_convert.TrtPrecisionMode.FP32,
+    minimum_segment_size=3,
+    is_dynamic_op=False,
+    maximum_cached_engines=1,
+    cached_engine_batches=None,
+    use_calibration=True,
+    input_saved_model_dir=None,
+    input_saved_model_tags=None,
+    output_saved_model_dir=None,
+    session_config=None):
+  return trt_convert.create_inference_graph(
+      input_graph_def=input_graph_def,
+      outputs=outputs,
+      max_batch_size=max_batch_size,
+      max_workspace_size_bytes=max_workspace_size_bytes,
+      precision_mode=precision_mode,
+      minimum_segment_size=minimum_segment_size,
+      is_dynamic_op=is_dynamic_op,
+      maximum_cached_engines=maximum_cached_engines,
+      cached_engine_batches=cached_engine_batches,
+      use_calibration=use_calibration,
+      input_saved_model_dir=input_saved_model_dir,
+      input_saved_model_tags=input_saved_model_tags,
+      output_saved_model_dir=output_saved_model_dir,
+      session_config=session_config)
 
 
 def calib_graph_to_infer_graph(calibration_graph_def, is_dynamic_op=False):
-  """Convert an existing calibration graph to inference graph.
-
-  Args:
-    calibration_graph_def: the calibration GraphDef object with calibration data
-    is_dynamic_op: whether to create dynamic static engines from calibration
-
-  Returns:
-    New GraphDef with TRTEngineOps placed in graph replacing calibration nodes.
-  Raises:
-    RuntimeError: if the returned status message is malformed.
-  """
-
-  is_calib_graph = False
-  for n in calibration_graph_def.node:
-    if n.op == "TRTEngineOp":
-      is_calib_graph = is_calib_graph or not n.attr["calibration_data"].s
-  if not is_calib_graph:
-    tf_logging.error(
-        "Not a calib graph. Doesn't seem to contain any calibration nodes.")
-    return None
-  graph_str = calibration_graph_def.SerializeToString()
-  out = calib_convert(graph_str, is_dynamic_op)
-  status = _to_string(out[0])
-  output_graph_def_string = out[1]
-  del graph_str  # Save some memory
-  if len(status) < 2:
-    raise _impl.UnknownError(None, None, status)
-  if status[:2] != "OK":
-    msg = status.split(";")
-    if len(msg) == 1:
-      raise RuntimeError("Status message is malformed {}".format(status))
-    # pylint: disable=protected-access
-    raise _impl._make_specific_exception(None, None, ";".join(msg[1:]),
-                                         int(msg[0]))
-    # pylint: enable=protected-access
-  output_graph_def = graph_pb2.GraphDef()
-  output_graph_def.ParseFromString(output_graph_def_string)
-  del output_graph_def_string  # Save some memory
-  return output_graph_def
+  return trt_convert.calib_graph_to_infer_graph(
+      calibration_graph_def=calibration_graph_def, is_dynamic_op=is_dynamic_op)
diff --git a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc
deleted file mode 100644
index 9c3698e5d1cc5d6d8d31a8fcaf03d103f1e1915d..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.cc
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/contrib/tensorrt/resources/trt_resource_manager.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace tensorflow {
-namespace tensorrt {
-
-std::shared_ptr<TRTResourceManager>
-tensorflow::tensorrt::TRTResourceManager::instance() {
-  static std::shared_ptr<TRTResourceManager> instance_(new TRTResourceManager);
-  return instance_;
-}
-
-std::shared_ptr<tensorflow::ResourceMgr>
-tensorflow::tensorrt::TRTResourceManager::getManager(const string& op_name) {
-  // mutex is held for lookup only. Most instantiations where mutex will be held
-  // longer will be during op creation and should be ok.
-  tensorflow::mutex_lock lock(map_mutex_);
-  auto s = managers_.find(op_name);
-  if (s == managers_.end()) {
-    auto it = managers_.emplace(
-        op_name, std::make_shared<tensorflow::ResourceMgr>(op_name));
-    VLOG(1) << "Returning a new manager " << op_name;
-    return it.first->second;
-  }
-  VLOG(1) << "Returning old manager " << op_name;
-  return s->second;
-}
-
-}  // namespace tensorrt
-}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h b/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h
deleted file mode 100644
index 19f39e6d3db1571573fb290dd2c30fd43ea604ef..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tensorrt/resources/trt_resource_manager.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_RESOURCE_MANAGER_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_RESOURCE_MANAGER_H_
-#include <memory>
-
-#include <string>
-#include <unordered_map>
-#include "tensorflow/core/framework/resource_mgr.h"
-#include "tensorflow/core/platform/mutex.h"
-
-namespace tensorflow {
-namespace tensorrt {
-
-class TRTResourceManager {
-  TRTResourceManager() = default;
-
- public:
-  static std::shared_ptr<TRTResourceManager> instance();
-  // returns a manager for given op, if it doesn't exists it creates one
-  std::shared_ptr<tensorflow::ResourceMgr> getManager(const string& op_name);
-
- private:
-  std::unordered_map<string, std::shared_ptr<tensorflow::ResourceMgr>>
-      managers_;
-  tensorflow::mutex map_mutex_;
-};
-
-}  // namespace tensorrt
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_RESOURCE_MANAGER_H_
diff --git a/tensorflow/contrib/tensorrt/resources/trt_resources.h b/tensorflow/contrib/tensorrt/resources/trt_resources.h
deleted file mode 100644
index aac9e5c7bd725fc10bcaa04536ebc7be071b4d4c..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tensorrt/resources/trt_resources.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_RESOURCES_H_
-#define TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_RESOURCES_H_
-
-#include <list>
-#include <sstream>
-#include <string>
-#include <thread>
-#include <vector>
-
-#include "tensorflow/contrib/tensorrt/convert/utils.h"
-#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_allocator.h"
-#include "tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h"
-#include "tensorflow/core/framework/resource_mgr.h"
-
-#if GOOGLE_CUDA
-#if GOOGLE_TENSORRT
-
-#include "tensorrt/include/NvInfer.h"
-
-namespace tensorflow {
-namespace tensorrt {
-
-class TRTCalibrationResource : public tensorflow::ResourceBase {
- public:
-  ~TRTCalibrationResource() {
-    LOG(INFO) << "Destroying Calibration Resource " << std::endl
-              << DebugString();
-    builder_.reset();
-    engine_.reset();
-    // We need to manually destroy the builder and engine before the allocator
-    // is destroyed.
-    allocator_.reset();
-  }
-
-  string DebugString() override {
-    std::stringstream oss;
-    using std::dec;
-    using std::endl;
-    using std::hex;
-    oss << " Calibrator = " << hex << calibrator_.get() << dec << endl
-        << " Builder    = " << hex << builder_.get() << dec << endl
-        << " Engine     = " << hex << engine_.get() << dec << endl
-        << " Logger     = " << hex << &logger_ << dec << endl
-        << " Allocator  = " << hex << allocator_.get() << dec << endl
-        << " Thread     = " << hex << thr_.get() << dec << endl;
-    return oss.str();
-  }
-
-  std::unique_ptr<TRTInt8Calibrator> calibrator_;
-  TrtUniquePtrType<nvinfer1::IBuilder> builder_;
-  TrtUniquePtrType<nvinfer1::ICudaEngine> engine_;
-  std::unique_ptr<TRTBaseAllocator> allocator_;
-  tensorflow::tensorrt::Logger logger_;
-  // TODO(sami): Use threadpool threads!
-  std::unique_ptr<std::thread> thr_;
-};
-
-}  // namespace tensorrt
-}  // namespace tensorflow
-
-#endif
-#endif
-#endif  // TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_RESOURCES_H_
diff --git a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
index f30dba59ad55317d7ad7730e4dc66c9aba4e6a6b..5c60d6b589ed6a16276226726d989e949bcbf9d7 100644
--- a/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
+++ b/tensorflow/contrib/tensorrt/shape_fn/trt_shfn.cc
@@ -14,14 +14,14 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/contrib/tensorrt/shape_fn/trt_shfn.h"
-#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
 
 #include <string>
 #include <vector>
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
-#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
+#include "tensorflow/compiler/tf2tensorrt/plugin/trt_plugin_factory.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorrt/include/NvInfer.h"
 
diff --git a/tensorflow/contrib/tensorrt/test/manual_test.py b/tensorflow/contrib/tensorrt/test/manual_test.py
deleted file mode 100644
index 1187c759b4b5483cbf5afe136401abe86d6ef989..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tensorrt/test/manual_test.py
+++ /dev/null
@@ -1,114 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Basic tests for TF-TensorRT integration."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import ast
-import os
-
-from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
-from tensorflow.core.framework import graph_pb2
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import test
-
-
-class ManualTest(trt_test.TfTrtIntegrationTestBase):
-
-  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
-    super(ManualTest, self).__init__(methodName)
-    self._params_map = None
-
-  def _GetEnv(self):
-    """Get an environment variable specifying the manual test parameters.
-
-    The value of the environment variable is the string representation of a dict
-    which should contain the following keys:
-    - 'graph_path': the file path to the serialized frozen graphdef
-    - 'input_names': TfTrtIntegrationTestParams.input_names
-    - 'input_dims': TfTrtIntegrationTestParams.input_dims
-    - 'expected_output_dims': TfTrtIntegrationTestParams.expected_output_dims
-    - 'output_name': the name of op to fetch
-    - 'expected_engines_to_run': ExpectedEnginesToRun() will return this
-    - 'expected_engines_to_build': ExpectedEnginesToBuild() will return this
-    - 'max_batch_size': ConversionParams.max_batch_size
-
-    Returns:
-      The value of the environment variable.
-    """
-    return os.getenv('TRT_MANUAL_TEST_PARAMS', '')
-
-  def _GetParamsMap(self):
-    """Parse the environment variable as a dict and return it."""
-    if self._params_map is None:
-      self._params_map = ast.literal_eval(self._GetEnv())
-    return self._params_map
-
-  def GetParams(self):
-    """Testing conversion of manually provided frozen graph."""
-    params_map = self._GetParamsMap()
-    gdef = graph_pb2.GraphDef()
-    with gfile.Open(params_map['graph_path'], 'rb') as f:
-      gdef.ParseFromString(f.read())
-    return trt_test.TfTrtIntegrationTestParams(
-        gdef=gdef,
-        input_names=params_map['input_names'],
-        input_dims=params_map['input_dims'],
-        output_names=params_map['output_names'],
-        expected_output_dims=params_map['expected_output_dims'])
-
-  def GetConversionParams(self, run_params):
-    """Return a ConversionParams for test."""
-    conversion_params = super(ManualTest, self).GetConversionParams(run_params)
-    params_map = self._GetParamsMap()
-    if 'max_batch_size' in params_map:
-      conversion_params = conversion_params._replace(
-          max_batch_size=params_map['max_batch_size'])
-    return conversion_params
-
-  def ExpectedEnginesToBuild(self, run_params):
-    """Return the expected engines to build."""
-    return self._GetParamsMap()['expected_engines_to_build']
-
-  def ExpectedEnginesToRun(self, run_params):
-    """Return the expected engines to run."""
-    params_map = self._GetParamsMap()
-    if 'expected_engines_to_run' in params_map:
-      return params_map['expected_engines_to_run']
-    return self.ExpectedEnginesToBuild(run_params)
-
-  def ExpectedAbsoluteTolerance(self, run_params):
-    """The absolute tolerance to compare floating point results."""
-    params_map = self._GetParamsMap()
-    if 'atol' in params_map:
-      return params_map['atol']
-    return 1.e-3
-
-  def ExpectedRelativeTolerance(self, run_params):
-    """The relative tolerance to compare floating point results."""
-    params_map = self._GetParamsMap()
-    if 'rtol' in params_map:
-      return params_map['rtol']
-    return 1.e-3
-
-  def ShouldRunTest(self, run_params):
-    """Whether to run the test."""
-    return len(self._GetEnv())
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/tensorrt/test/test_tftrt.py b/tensorflow/contrib/tensorrt/test/test_tftrt.py
deleted file mode 100644
index d26f26008635733c6c364a98b72b88c1e552f5fe..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tensorrt/test/test_tftrt.py
+++ /dev/null
@@ -1,287 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Script to test TF-TensorRT integration."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import numpy as np
-import six as _six
-
-# normally we should do import tensorflow as tf and then
-# tf.placeholder, tf.constant, tf.nn.conv2d etc but
-# it looks like internal builds don't like it so
-# importing every module individually
-
-from tensorflow.contrib import tensorrt as trt
-from tensorflow.core.protobuf import config_pb2 as cpb2
-from tensorflow.core.protobuf import rewriter_config_pb2 as rwpb2
-from tensorflow.python.client import session as csess
-from tensorflow.python.framework import constant_op as cop
-from tensorflow.python.framework import dtypes as dtypes
-from tensorflow.python.framework import importer as importer
-from tensorflow.python.framework import ops as ops
-from tensorflow.python.ops import array_ops as aops
-from tensorflow.python.ops import math_ops as mops
-from tensorflow.python.ops import nn as nn
-from tensorflow.python.ops import nn_ops as nn_ops
-
-
-def py2bytes(inp):
-  return inp
-
-
-def py3bytes(inp):
-  return inp.encode("utf-8", errors="surrogateescape")
-
-
-def py2string(inp):
-  return inp
-
-
-def py3string(inp):
-  return inp.decode("utf-8")
-
-
-if _six.PY2:
-  to_bytes = py2bytes
-  to_string = py2string
-else:
-  to_bytes = py3bytes
-  to_string = py3string
-
-
-def get_multi_engine_graph_def(mode="FP32"):
-  """Create a simple graph and return its graph_def."""
-  dtype = dtypes.float32
-  if mode.upper() == "FP16":
-    dtype = dtypes.float16
-  else:
-    pass
-
-  g = ops.Graph()
-  with g.as_default():
-    x = aops.placeholder(shape=[None, 3, 7, 5], name="input", dtype=dtype)
-    with g.name_scope("Global_scope"):
-      with g.name_scope("first_scope"):
-        e = cop.constant(
-            np.random.randn(3, 2, 3, 4), name="weights", dtype=dtype)
-        conv = nn.conv2d(
-            input=x,
-            filter=e,
-            data_format="NCHW",
-            strides=[1, 1, 1, 1],
-            padding="VALID",
-            name="conv")
-        b = cop.constant(np.random.randn(1, 4, 1, 1), name="bias1", dtype=dtype)
-        t = conv * b
-
-        b = cop.constant(np.random.randn(1, 4, 1, 1), name="bias2", dtype=dtype)
-        q = conv / b
-      edge = mops.sin(q)
-      edge1 = mops.cos(conv)
-      with g.name_scope("test_scope"):
-        de = edge + edge1
-        t -= edge1
-        q *= edge
-        t += q
-        t -= de
-    k = aops.squeeze(t, name="output")
-  print(k.dtype)
-  return g.as_graph_def()
-
-
-def get_simple_graph_def():
-  """Create a simple graph and return its graph_def."""
-  g = ops.Graph()
-  with g.as_default():
-    a = aops.placeholder(
-        dtype=dtypes.float32, shape=(None, 24, 24, 2), name="input")
-    e = cop.constant(
-        [[[[1., 0.5, 4., 6., 0.5, 1.], [1., 0.5, 1., 1., 0.5, 1.]]]],
-        name="weights",
-        dtype=dtypes.float32)
-    conv = nn.conv2d(
-        input=a, filter=e, strides=[1, 2, 2, 1], padding="SAME", name="conv")
-    b = cop.constant(
-        [4., 1.5, 2., 3., 5., 7.], name="bias", dtype=dtypes.float32)
-    t = nn.bias_add(conv, b, name="biasAdd")
-    relu = nn.relu(t, "relu")
-    idty = aops.identity(relu, "ID")
-    v = nn_ops.max_pool(
-        idty, [1, 2, 2, 1], [1, 2, 2, 1], "VALID", name="max_pool")
-    aops.squeeze(v, name="output")
-  return g.as_graph_def()
-
-
-def execute_graph(gdef, dumm_inp):
-  """Run given graphdef once."""
-  print("executing")
-  gpu_options = None
-  if trt.trt_convert.get_linked_tensorrt_version()[0] == 3:
-    gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50)
-  sessconfig = cpb2.ConfigProto(gpu_options=gpu_options)
-  ops.reset_default_graph()
-  g = ops.Graph()
-  with g.as_default():
-    inp, out = importer.import_graph_def(
-        graph_def=gdef, return_elements=["input", "output"])
-    inp = inp.outputs[0]
-    out = out.outputs[0]
-  with csess.Session(config=sessconfig, graph=g) as sess:
-    val = sess.run(out, {inp: dumm_inp})
-  return val
-
-
-# Use real data that is representative of the inference dataset
-# for calibration. For this test script it is random data.
-def execute_calibration(gdef, dumm_inp):
-  """Run given calibration graph multiple times."""
-  gpu_options = None
-  if trt.trt_convert.get_linked_tensorrt_version()[0] == 3:
-    gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50)
-  ops.reset_default_graph()
-  g = ops.Graph()
-  with g.as_default():
-    inp, out = importer.import_graph_def(
-        graph_def=gdef, return_elements=["input", "output"])
-    inp = inp.outputs[0]
-    out = out.outputs[0]
-  with csess.Session(
-      config=cpb2.ConfigProto(gpu_options=gpu_options), graph=g) as sess:
-    # run over real calibration data here, we are mimicking a calibration set of
-    # 30 different batches. Use as much calibration data as you want
-    for _ in range(30):
-      val = sess.run(out, {inp: dumm_inp})
-  return val
-
-
-def user(multi_engine,
-         run_graph=execute_graph,
-         run_calibration=execute_calibration):
-  """Example function that converts a graph to TFTRT graph."""
-  if multi_engine:
-    inp_dims = (2, 3, 7, 5)
-    orig_graph = get_multi_engine_graph_def()
-  else:
-    inp_dims = (100, 24, 24, 2)
-    orig_graph = get_simple_graph_def()  # use a frozen graph for inference
-  dummy_input = np.random.random_sample(inp_dims)
-  # Get optimized graph
-  trt_graph = trt.create_inference_graph(
-      input_graph_def=orig_graph,
-      outputs=["output"],
-      max_batch_size=inp_dims[0],
-      max_workspace_size_bytes=1 << 25,
-      precision_mode="FP32",  # TRT Engine precision "FP32","FP16" or "INT8"
-      minimum_segment_size=2,  # minimum number of nodes in an engine
-      is_dynamic_op=False,
-      maximum_cached_engines=1,
-      cached_engine_batch_sizes=[])
-  o1 = run_graph(orig_graph, dummy_input)
-  o2 = run_graph(trt_graph, dummy_input)
-  o3 = run_graph(trt_graph, dummy_input)
-  assert np.array_equal(o1, o2)
-  assert np.array_equal(o3, o2)  # sanity check
-  fp16_graph = trt.create_inference_graph(
-      input_graph_def=orig_graph,
-      outputs=["output"],
-      max_batch_size=inp_dims[0],
-      max_workspace_size_bytes=1 << 25,
-      precision_mode="FP16",  # TRT Engine precision "FP32","FP16" or "INT8"
-      minimum_segment_size=2,  # minimum number of nodes in an engine
-      is_dynamic_op=False,
-      maximum_cached_engines=1,
-      cached_engine_batch_sizes=[])
-  int8_calib_gdef = trt.create_inference_graph(
-      input_graph_def=orig_graph,
-      outputs=["output"],
-      max_batch_size=inp_dims[0],
-      max_workspace_size_bytes=1 << 25,
-      precision_mode="INT8",  # TRT Engine precision "FP32","FP16" or "INT8"
-      minimum_segment_size=2,  # minimum number of nodes in an engine
-      is_dynamic_op=False,
-      maximum_cached_engines=1,
-      cached_engine_batch_sizes=[])
-  o4 = run_graph(fp16_graph, dummy_input)
-  _ = run_calibration(int8_calib_gdef, dummy_input)
-  int8_graph = trt.calib_graph_to_infer_graph(int8_calib_gdef)
-  o5 = run_graph(int8_graph, dummy_input)
-  print("Is FP32 == FP16? %s (False is possible)" % np.allclose(o1, o4))
-  print("Is FP32 == INT8? %s (False is possible)" % np.allclose(o1, o5))
-  print("Pass")
-
-
-def auto(multi_engine):
-  """Run the conversion as an optimization pass."""
-  if multi_engine:
-    inp_dims = (2, 3, 7, 5)
-    orig_graph = get_multi_engine_graph_def()
-  else:
-    inp_dims = (100, 24, 24, 2)
-    orig_graph = get_simple_graph_def()  # use a frozen graph for inference
-  dummy_input = np.random.random_sample(inp_dims)
-  opt_config = rwpb2.RewriterConfig()
-  opt_config.meta_optimizer_iterations = opt_config.ONE
-  opt_config.optimizers.extend(["constfold", "layout"])
-  custom_op = opt_config.custom_optimizers.add()
-  custom_op.name = "TensorRTOptimizer"
-  custom_op.parameter_map["minimum_segment_size"].i = 3
-  custom_op.parameter_map["precision_mode"].s = to_bytes("FP32")
-  custom_op.parameter_map["max_batch_size"].i = inp_dims[0]
-  custom_op.parameter_map["max_workspace_size_bytes"].i = 1 << 25
-  print(custom_op)
-  gpu_options = None
-  if trt.trt_convert.get_linked_tensorrt_version()[0] == 3:
-    gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50)
-  graph_options = cpb2.GraphOptions(rewrite_options=opt_config)
-  sessconfig = cpb2.ConfigProto(
-      gpu_options=gpu_options, graph_options=graph_options)
-  print(sessconfig)
-  g = ops.Graph()
-  ops.reset_default_graph()
-  with g.as_default():
-    inp, out = importer.import_graph_def(
-        graph_def=orig_graph, return_elements=["input", "output"], name="")
-    inp = inp.outputs[0]
-    out = out.outputs[0]
-    with csess.Session(config=sessconfig, graph=g) as sess:
-      val = sess.run(out, {inp: dummy_input})
-  print(val.shape)
-
-
-if "__main__" in __name__:
-  P = argparse.ArgumentParser(
-      prog="tftrt_test",
-      description="Example utilization of TensorFlow-TensorRT integration")
-  P.add_argument(
-      "--automatic",
-      "-a",
-      action="store_true",
-      help="Do TRT conversion automatically",
-      default=False)
-  P.add_argument(
-      "--multi-engine",
-      "-m",
-      action="store_true",
-      help="Use a graph that will result in 2 engines",
-      default=False)
-  flags, unparsed = P.parse_known_args()
-  if flags.automatic:
-    auto(flags.multi_engine)
-  else:
-    user(flags.multi_engine)
diff --git a/tensorflow/contrib/timeseries/examples/BUILD b/tensorflow/contrib/timeseries/examples/BUILD
index 57797214d1684550aa7ad2664b71d22b504f70ed..e10be88ece8ebba9635af955b3c3410f29e5503c 100644
--- a/tensorflow/contrib/timeseries/examples/BUILD
+++ b/tensorflow/contrib/timeseries/examples/BUILD
@@ -105,6 +105,7 @@ py_binary(
     data = ["data/multivariate_periods.csv"],
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
+    visibility = ["//visibility:public"],
     deps = select({
         ":empty_condition": [],
         "//conditions:default": [],
@@ -113,6 +114,7 @@ py_binary(
         "//tensorflow:tensorflow_py",
         "//tensorflow/contrib/timeseries/python/timeseries:estimators",
         "//tensorflow/contrib/timeseries/python/timeseries:model",
+        "//tensorflow/contrib/timeseries/python/timeseries:state_management",
     ],
 )
 
diff --git a/tensorflow/contrib/timeseries/examples/predict_test.py b/tensorflow/contrib/timeseries/examples/predict_test.py
index 678fd71cd8b94ee0be46e10a9a673de55bd44215..b353f85cb5df0cf961d1900b241e4fa1a84a24b4 100644
--- a/tensorflow/contrib/timeseries/examples/predict_test.py
+++ b/tensorflow/contrib/timeseries/examples/predict_test.py
@@ -43,10 +43,6 @@ class PeriodTrendExampleTest(test.TestCase):
     self.assertAllEqual([700], mean.shape)
     self.assertAllEqual([700], upper_limit.shape)
     self.assertAllEqual([700], lower_limit.shape)
-    # Check that variance hasn't blown up too much. This is a relatively good
-    # indication that training was successful.
-    self.assertLess(upper_limit[-1] - lower_limit[-1],
-                    1.5 * (upper_limit[0] - lower_limit[0]))
 
   def test_ar(self):
     (times, observed, all_times, mean,
@@ -55,7 +51,6 @@ class PeriodTrendExampleTest(test.TestCase):
     self.assertAllEqual(all_times.shape, mean.shape)
     self.assertAllEqual(all_times.shape, upper_limit.shape)
     self.assertAllEqual(all_times.shape, lower_limit.shape)
-    self.assertLess((upper_limit - lower_limit).mean(), 4.)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/timeseries/python/timeseries/BUILD b/tensorflow/contrib/timeseries/python/timeseries/BUILD
index 4b90b596b28efec83aa349782c4874d79b6817c7..4ba814b9e3d3621f9ab924961e2740885fa93b33 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/BUILD
+++ b/tensorflow/contrib/timeseries/python/timeseries/BUILD
@@ -155,13 +155,16 @@ py_library(
 
 py_test(
     name = "head_test",
-    size = "large",
+    size = "medium",
     srcs = [
         "head_test.py",
     ],
-    shard_count = 4,
+    shard_count = 10,
     srcs_version = "PY2AND3",
-    tags = ["no_pip_gpu"],  # b/63391119
+    tags = [
+        "no_pip_gpu",  # b/63391119
+        "notap",  # b/124520733
+    ],
     deps = [
         ":estimators",
         ":feature_keys",
@@ -281,6 +284,7 @@ py_library(
         "input_pipeline.py",
     ],
     srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
     deps = [
         ":feature_keys",
         ":model_utils",
@@ -361,9 +365,10 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":feature_keys",
+        ":math_utils",
         ":model",
         ":model_utils",
-        "//tensorflow/contrib/distributions:distributions_py",
+        "//tensorflow/contrib/rnn:rnn_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
         "//tensorflow/python:constant_op",
diff --git a/tensorflow/contrib/timeseries/python/timeseries/ar_model.py b/tensorflow/contrib/timeseries/python/timeseries/ar_model.py
index bcadf4094e1e79fff1685515f2bde0b88f717cac..3626701d24163ef52564b42d8a630bd9c5a788eb 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/ar_model.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/ar_model.py
@@ -18,9 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib import distributions
-
 from tensorflow.contrib.rnn.python.ops import lstm_ops
+from tensorflow.contrib.timeseries.python.timeseries import math_utils
 from tensorflow.contrib.timeseries.python.timeseries import model
 from tensorflow.contrib.timeseries.python.timeseries import model_utils
 from tensorflow.contrib.timeseries.python.timeseries.feature_keys import PredictionFeatures
@@ -462,11 +461,12 @@ class ARModel(model.TimeSeriesModel):
     if self.loss == ARModel.NORMAL_LIKELIHOOD_LOSS:
       covariance = prediction_ops["covariance"]
       sigma = math_ops.sqrt(gen_math_ops.maximum(covariance, 1e-5))
-      normal = distributions.Normal(loc=targets, scale=sigma)
-      loss_op = -math_ops.reduce_sum(normal.log_prob(prediction))
+      loss_op = -math_ops.reduce_sum(
+          math_utils.normal_log_prob(targets, sigma, prediction))
     else:
       assert self.loss == ARModel.SQUARED_LOSS, self.loss
-      loss_op = math_ops.reduce_sum(math_ops.square(prediction - targets))
+      loss_op = math_ops.reduce_sum(
+          math_ops.squared_difference(prediction, targets))
     loss_op /= math_ops.cast(
         math_ops.reduce_prod(array_ops.shape(targets)), loss_op.dtype)
     return loss_op
@@ -965,16 +965,11 @@ class AnomalyMixtureARModel(ARModel):
       anomaly_variance = prediction_ops["anomaly_params"]
       anomaly_sigma = math_ops.sqrt(
           gen_math_ops.maximum(anomaly_variance, 1e-5))
-      normal = distributions.Normal(loc=targets, scale=anomaly_sigma)
-      log_prob = normal.log_prob(prediction)
+      log_prob = math_utils.normal_log_prob(targets, anomaly_sigma, prediction)
     else:
       assert self._anomaly_distribution == AnomalyMixtureARModel.CAUCHY_ANOMALY
       anomaly_scale = prediction_ops["anomaly_params"]
-      cauchy = distributions.StudentT(
-          df=array_ops.ones([], dtype=anomaly_scale.dtype),
-          loc=targets,
-          scale=anomaly_scale)
-      log_prob = cauchy.log_prob(prediction)
+      log_prob = math_utils.cauchy_log_prob(targets, anomaly_scale, prediction)
     return log_prob
 
   def loss_op(self, targets, prediction_ops):
@@ -983,8 +978,7 @@ class AnomalyMixtureARModel(ARModel):
     covariance = prediction_ops["covariance"]
     # Normal data log probability.
     sigma = math_ops.sqrt(gen_math_ops.maximum(covariance, 1e-5))
-    normal1 = distributions.Normal(loc=targets, scale=sigma)
-    log_prob1 = normal1.log_prob(prediction)
+    log_prob1 = math_utils.normal_log_prob(targets, sigma, prediction)
     log_prob1 += math_ops.log(1 - self._anomaly_prior_probability)
     # Anomaly log probability.
     log_prob2 = self._anomaly_log_prob(targets, prediction_ops)
diff --git a/tensorflow/contrib/timeseries/python/timeseries/math_utils.py b/tensorflow/contrib/timeseries/python/timeseries/math_utils.py
index aab330643862c1ccf073d2a0e34e1c475b1ec15f..b7375e5055e29efea3f23c3b9b9f3af59f45495b 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/math_utils.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/math_utils.py
@@ -21,6 +21,8 @@ from __future__ import print_function
 import collections
 import math
 
+import numpy as np
+
 from tensorflow.contrib import lookup
 from tensorflow.contrib.layers.python.layers import layers
 
@@ -43,6 +45,32 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.util import nest
 
 
+def normal_log_prob(loc, scale, x):
+  """Computes the Normal log pdf."""
+  z = (x - loc) / scale
+  return -0.5 * (math_ops.square(z)
+                 + np.log(2. * np.pi) + math_ops.log(scale))
+
+
+def cauchy_log_prob(loc, scale, x):
+  """Computes the Cauchy log pdf."""
+  z = (x - loc) / scale
+  return (-np.log(np.pi) - math_ops.log(scale) -
+          math_ops.log1p(math_ops.square(z)))
+
+
+def mvn_tril_log_prob(loc, scale_tril, x):
+  """Computes the MVN log pdf under tril scale. Doesn't handle batches."""
+  x0 = x - loc
+  z = linalg_ops.matrix_triangular_solve(
+      scale_tril, x0[..., array_ops.newaxis])[..., 0]
+  log_det_cov = 2. * math_ops.reduce_sum(math_ops.log(
+      array_ops.matrix_diag_part(scale_tril)), axis=-1)
+  d = math_ops.cast(array_ops.shape(scale_tril)[-1], log_det_cov.dtype)
+  return -0.5 * (math_ops.reduce_sum(math_ops.square(z), axis=-1)
+                 + d * np.log(2. * np.pi) + log_det_cov)
+
+
 def clip_covariance(
     covariance_matrix, maximum_variance_ratio, minimum_variance):
   """Enforce constraints on a covariance matrix to improve numerical stability.
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD
index 125750e7639ad40c481472a93353e6fb7055be96..cf5e749042afd83f927a3d22edfd3a9538ab2ffd 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD
@@ -78,7 +78,6 @@ py_library(
     srcs = ["kalman_filter.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/distributions:distributions_py",
         "//tensorflow/contrib/timeseries/python/timeseries:math_utils",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
@@ -235,7 +234,6 @@ py_library(
     srcs = ["filtering_postprocessor.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/distributions:distributions_py",
         "//tensorflow/contrib/timeseries/python/timeseries:math_utils",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/filtering_postprocessor.py b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/filtering_postprocessor.py
index e9e2ac0aaf4c4d6c41f5007662f261af3de9bbd1..3fa2fbd9f77cb887c30fde264815728ca345f45a 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/filtering_postprocessor.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/filtering_postprocessor.py
@@ -22,8 +22,6 @@ import abc
 
 import six
 
-from tensorflow.contrib import distributions
-
 from tensorflow.contrib.timeseries.python.timeseries import math_utils
 
 from tensorflow.python.framework import dtypes
@@ -91,10 +89,10 @@ def cauchy_alternative_to_gaussian(current_times, current_values, outputs):
   """
   del current_times  # unused
   cauchy_scale = math_utils.entropy_matched_cauchy_scale(outputs["covariance"])
-  individual_log_pdfs = distributions.StudentT(
-      df=array_ops.ones([], dtype=current_values.dtype),
+  individual_log_pdfs = math_utils.cauchy_log_prob(
       loc=outputs["mean"],
-      scale=cauchy_scale).log_prob(current_values)
+      scale=cauchy_scale,
+      x=current_values)
   return math_ops.reduce_sum(individual_log_pdfs, axis=1)
 
 
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter.py b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter.py
index a614386121e000961bf8b32625a28e1251654320..c0ec797bc5b7c41ca996c807840ce38311201f87 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/kalman_filter.py
@@ -18,8 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib import distributions
-
 from tensorflow.contrib.timeseries.python.timeseries import math_utils
 
 from tensorflow.python.framework import dtypes
@@ -137,9 +135,10 @@ class KalmanFilter(object):
     with ops.control_dependencies([non_negative_assert]):
       observation_covariance_cholesky = linalg_ops.cholesky(
           symmetrized_observation_covariance)
-    log_prediction_prob = distributions.MultivariateNormalTriL(
-        predicted_observation, observation_covariance_cholesky).log_prob(
-            observation)
+    log_prediction_prob = math_utils.mvn_tril_log_prob(
+        loc=predicted_observation,
+        scale_tril=observation_covariance_cholesky,
+        x=observation)
     (posterior_state,
      posterior_state_var) = self.posterior_from_prior_state(
          prior_state=estimated_state,
diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD
index 4bf3a0463d9046eea2f60e9154fca1357e728215..7c1661d20f15f94a929a46dafc79d59ca73e53cb 100644
--- a/tensorflow/contrib/tpu/BUILD
+++ b/tensorflow/contrib/tpu/BUILD
@@ -1,15 +1,15 @@
 # Description: Operations defined for Cloud TPUs
 
-licenses(["notice"])  # Apache 2.0
-
 load(
     "//tensorflow:tensorflow.bzl",
     "tf_custom_op_library",
     "tf_gen_op_libs",
     "tf_gen_op_wrapper_py",
+    "tf_py_test",
 )
 load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
-load("//tensorflow:tensorflow.bzl", "tf_py_test")
+
+licenses(["notice"])  # Apache 2.0
 
 package(
     default_visibility = [
@@ -23,17 +23,12 @@ package(
     ],
 )
 
-cc_library(
-    name = "all_ops",
+py_library(
+    name = "tpu_py",
+    srcs = ["python/ops/tpu_ops.py"],
+    srcs_version = "PY2AND3",
     deps = [
-        ":cross_replica_ops_op_lib",
-        ":heartbeat_ops_op_lib",
-        ":host_compute_ops_op_lib",
-        ":infeed_ops_op_lib",
-        ":outfeed_ops_op_lib",
-        ":replication_ops_op_lib",
-        ":tpu_configuration_ops_op_lib",
-        ":tpu_embedding_ops_op_lib",
+        "//tensorflow/python/tpu:tpu_py",
     ],
 )
 
@@ -42,25 +37,14 @@ py_library(
     srcs = ["python/tpu/async_checkpoint.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:init_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:state_ops",
-        "//tensorflow/python:summary",
-        "//tensorflow/python:summary_ops_v2",
-        "//tensorflow/python:training",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
-        "//tensorflow/python/estimator:estimator_py",
+        "//tensorflow/python/tpu:async_checkpoint",
     ],
 )
 
 py_library(
     name = "tpu_estimator",
     srcs = [
+        "python/tpu/_tpu_estimator_embedding.py",
         "python/tpu/error_handling.py",
         "python/tpu/tpu_config.py",
         "python/tpu/tpu_context.py",
@@ -70,86 +54,24 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":async_checkpoint",
+        ":feature_column",
+        ":functional",
+        ":tpu_embedding",
         ":tpu_lib",
         "//tensorflow/contrib/training:training_py",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:init_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:session",
-        "//tensorflow/python:state_ops",
-        "//tensorflow/python:summary",
-        "//tensorflow/python:summary_ops_v2",
-        "//tensorflow/python:training",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
-        "//tensorflow/python/estimator:estimator_py",
-        "//tensorflow/python/estimator:util",
-        "@six_archive//:six",
-    ],
-)
-
-tf_gen_op_libs(
-    op_lib_names = [
-        "cross_replica_ops",
-        "heartbeat_ops",
-        "host_compute_ops",
-        "infeed_ops",
-        "outfeed_ops",
-        "replication_ops",
-        "tpu_configuration_ops",
-        "tpu_embedding_ops",
-    ],
-    deps = [
-        "//tensorflow/contrib/tpu/proto:tpu_embedding_configuration_proto_cc",
-        "//tensorflow/contrib/tpu/utils:tpu_embedding_optimization_parameters_utils",
-        "//tensorflow/contrib/tpu/utils:tpu_embedding_output_layout_utils",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_proto_parsing",
-        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/python/tpu:tpu_estimator",
     ],
 )
 
-tf_custom_op_library(
-    name = "python/ops/_tpu_ops.so",
-    srcs = [
-        "ops/cross_replica_ops.cc",
-        "ops/heartbeat_ops.cc",
-        "ops/host_compute_ops.cc",
-        "ops/infeed_ops.cc",
-        "ops/outfeed_ops.cc",
-        "ops/replication_ops.cc",
-        "ops/tpu_configuration_ops.cc",
-        "ops/tpu_embedding_ops.cc",
-    ],
-    deps = [
-        "//tensorflow/contrib/tpu/proto:tpu_embedding_configuration_proto_cc",
-        "//tensorflow/contrib/tpu/utils:tpu_embedding_optimization_parameters_utils",
-        "//tensorflow/contrib/tpu/utils:tpu_embedding_output_layout_utils",
-        "//tensorflow/core:lib_proto_parsing",
-    ],
-)
-
-tf_gen_op_wrapper_py(
-    name = "tpu_ops",
-    hidden = [
-        "SendTPUEmbeddingGradients",
-        "EnqueueTPUEmbeddingIntegerBatch",
-        "EnqueueTPUEmbeddingSparseBatch",
-        "EnqueueTPUEmbeddingSparseTensorBatch",
+py_library(
+    name = "functional",
+    srcs = ["python/tpu/functional.py"],
+    srcs_version = "PY2AND3",
+    visibility = [
+        "//visibility:public",
     ],
     deps = [
-        ":cross_replica_ops_op_lib",
-        ":heartbeat_ops_op_lib",
-        ":host_compute_ops_op_lib",
-        ":infeed_ops_op_lib",
-        ":outfeed_ops_op_lib",
-        ":replication_ops_op_lib",
-        ":tpu_configuration_ops_op_lib",
-        ":tpu_embedding_ops_op_lib",
+        "//tensorflow/python/tpu:functional",
     ],
 )
 
@@ -158,30 +80,7 @@ py_library(
     srcs = ["python/profiler/__init__.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/tpu/profiler:tpu_profiler_analysis_pb2_grpc",
-        "//tensorflow/contrib/tpu/profiler:tpu_profiler_analysis_proto_py",
-        "//tensorflow/contrib/tpu/profiler:trace_events_proto_py",
-        "//tensorflow/python:util",
-    ],
-)
-
-tf_custom_op_py_library(
-    name = "tpu_py",
-    srcs = glob(["python/ops/*.py"]),
-    dso = [":python/ops/_tpu_ops.so"],
-    kernels = [
-        ":all_ops",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":profiler",
-        ":tpu_ops",
-        "//tensorflow/contrib/compiler:xla",
-        "//tensorflow/contrib/util:util_py",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:util",
+        "//tensorflow/python/tpu/profiler",
     ],
 )
 
@@ -193,10 +92,12 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":feature_column",
         ":keras_support",  # split out to avoid cycle with tpu_strategy
         ":tpu_embedding",
         ":tpu_estimator",
         ":tpu_lib",
+        "//tensorflow/python/tpu",
     ],
 )
 
@@ -212,7 +113,6 @@ py_library(
         "//learning/brain:__subpackages__",
         "//tensorflow:__subpackages__",
         "//third_party/cloud_tpu/models/keras_colab:__subpackages__",
-        "//third_party/cloud_tpu/models/mnist_keras:__subpackages__",
         "//third_party/cloud_tpu/models/resnet50_keras:__subpackages__",
     ],
     deps = [
@@ -220,8 +120,8 @@ py_library(
         "//tensorflow/contrib/cluster_resolver:cluster_resolver_py",
         "//tensorflow/contrib/distribute",
         "//tensorflow/contrib/framework:framework_py",
-        "//tensorflow/contrib/tpu/proto:compilation_result_proto_py",
         "//tensorflow/core:protos_all_py",
+        "//tensorflow/core/protobuf/tpu:compilation_result_proto_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
@@ -261,29 +161,12 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":datasets",
+        ":functional",
         ":profiler",
         ":tpu_py",
-        "//tensorflow/compiler/xla/experimental/xla_sharding",
-        "//tensorflow/compiler/xla/python_api:xla_shape",
         "//tensorflow/contrib/cluster_resolver:cluster_resolver_py",
         "//tensorflow/contrib/compiler:xla",
-        "//tensorflow/contrib/tpu/proto:compilation_result_proto_py",
-        "//tensorflow/contrib/tpu/proto:optimization_parameters_proto_py",
-        "//tensorflow/contrib/tpu/proto:topology_proto_py",
-        "//tensorflow/contrib/tpu/proto:tpu_embedding_configuration_proto_py",
-        "//tensorflow/contrib/tpu/proto:tpu_embedding_output_layout_proto_py",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:control_flow_util",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python:training",
-        "//tensorflow/python:util",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python/ops/losses",
+        "//tensorflow/python/tpu:tpu_lib",
     ],
 )
 
@@ -294,121 +177,28 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/contrib/data/python/ops:batching",
-        "//tensorflow/contrib/data/python/ops:interleave_ops",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:function",
-        "//tensorflow/python:functional_ops",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/ops:iterator_ops",
-        "//tensorflow/python/data/ops:readers",
-    ],
-)
-
-tf_py_test(
-    name = "datasets_test",
-    srcs = ["python/tpu/datasets_test.py"],
-    additional_deps = [
-        "//tensorflow/python:client_testlib",
-        ":datasets",
+        "//tensorflow/python/tpu:datasets",
     ],
-    flaky = 1,  # TODO(b/117363808): fails 1/1000 OSS runs
-    grpc_enabled = True,
 )
 
-tf_py_test(
-    name = "tpu_test",
-    size = "small",
-    srcs = ["python/tpu/tpu_test.py"],
-    additional_deps = [
-        ":tpu",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:layers",
-    ],
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
-)
-
-tf_py_test(
-    name = "tpu_sharding_test",
-    size = "small",
-    srcs = ["python/tpu/tpu_sharding_test.py"],
-    additional_deps = [
-        ":tpu",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
-    ],
-)
-
-tf_py_test(
-    name = "bfloat16_test",
-    size = "small",
-    srcs = ["python/tpu/bfloat16_test.py"],
-    additional_deps = [
-        ":tpu",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework",
-    ],
-)
-
-tf_py_test(
-    name = "tpu_infeed_test",
-    size = "small",
-    srcs = ["python/tpu/tpu_infeed_test.py"],
-    additional_deps = [
-        ":tpu",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_test_lib",
-    ],
-)
-
-tf_py_test(
-    name = "tpu_config_test",
-    size = "small",
-    srcs = ["python/tpu/tpu_config_test.py"],
-    additional_deps = [
-        ":tpu_estimator",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_test_lib",
-    ],
-)
-
-tf_py_test(
-    name = "tpu_estimator_signals_test",
-    size = "small",
-    srcs = ["python/tpu/tpu_estimator_signals_test.py"],
-    additional_deps = [
-        ":tpu_estimator",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:framework_test_lib",
+py_library(
+    name = "tpu_embedding",
+    srcs = [
+        "python/tpu/tpu_embedding.py",
+        "python/tpu/tpu_embedding_gradient.py",
     ],
-)
-
-tf_py_test(
-    name = "topology_test",
-    size = "medium",
-    srcs = ["python/tpu/topology_test.py"],
-    additional_deps = [
-        ":tpu",
-        "//tensorflow/python:framework_test_lib",
+    srcs_version = "PY2AND3",
+    deps = [
+        ":tpu_lib",
+        "//tensorflow/python/tpu:tpu_embedding",
     ],
 )
 
 py_library(
-    name = "tpu_embedding",
-    srcs = ["python/tpu/tpu_embedding.py"],
-    srcs_version = "PY2AND3",
+    name = "feature_column",
+    srcs = ["python/tpu/feature_column.py"],
     deps = [
-        "//tensorflow/contrib/tpu:tpu_ops",
-        "//tensorflow/contrib/tpu/proto:tpu_embedding_configuration_proto_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:init_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:partitioned_variables",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
-        "@six_archive//:six",
+        ":tpu_lib",
+        "//tensorflow/python/tpu:feature_column",
     ],
 )
diff --git a/tensorflow/contrib/tpu/profiler/BUILD b/tensorflow/contrib/tpu/profiler/BUILD
index 541fbf33a302a4d850422885fdbbc438bd6b9b7b..e2ce77e118182bb07193cbac82e176d3b2057e17 100644
--- a/tensorflow/contrib/tpu/profiler/BUILD
+++ b/tensorflow/contrib/tpu/profiler/BUILD
@@ -2,35 +2,6 @@ licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
-load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
-load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library_cc")
-load("//tensorflow/core:platform/default/build_config.bzl", "tf_additional_all_protos")
-
-tf_proto_library(
-    name = "tpu_profiler_proto",
-    srcs = ["tpu_profiler.proto"],
-    has_services = 1,
-    cc_api_version = 2,
-    cc_grpc_version = 1,
-    protodeps = [":op_profile_proto"] + tf_additional_all_protos(),
-    visibility = ["//visibility:public"],
-)
-
-cc_library(
-    name = "dump_tpu_profile",
-    srcs = ["dump_tpu_profile.cc"],
-    hdrs = ["dump_tpu_profile.h"],
-    visibility = ["//visibility:public"],
-    deps = [
-        ":op_profile_proto_cc",
-        ":tpu_profiler_proto_cc",
-        ":trace_events_proto_cc",
-        ":trace_events_to_json",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:protos_all_cc",
-    ],
-)
 
 cc_library(
     name = "version",
@@ -43,71 +14,13 @@ tf_cc_binary(
     srcs = [
         "capture_tpu_profile.cc",
     ],
+    tags = ["no_windows"],
     visibility = ["//visibility:public"],
     deps = [
-        ":dump_tpu_profile",
-        ":tpu_profiler_analysis_proto_cc",
-        ":tpu_profiler_proto_cc",
         ":version",
-        "//tensorflow:grpc++",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
-        "//tensorflow/core/distributed_runtime/rpc:grpc_util",
         "//tensorflow/core/platform/cloud:gcs_file_system",
+        "//tensorflow/core/profiler/rpc/client:capture_profile",
     ],
 )
-
-tf_proto_library(
-    name = "trace_events_proto",
-    srcs = ["trace_events.proto"],
-    cc_api_version = 2,
-    visibility = ["//visibility:public"],
-)
-
-cc_library(
-    name = "trace_events_to_json",
-    srcs = ["trace_events_to_json.cc"],
-    hdrs = ["trace_events_to_json.h"],
-    deps = [
-        ":trace_events_proto_cc",
-        "//tensorflow/core:lib",
-        "@jsoncpp_git//:jsoncpp",
-    ],
-)
-
-tf_cc_test(
-    name = "trace_events_to_json_test",
-    srcs = ["trace_events_to_json_test.cc"],
-    deps = [
-        ":trace_events_to_json",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-        "@jsoncpp_git//:jsoncpp",
-    ],
-)
-
-tf_proto_library(
-    name = "op_profile_proto",
-    srcs = ["op_profile.proto"],
-    cc_api_version = 2,
-    visibility = ["//visibility:public"],
-)
-
-tf_proto_library(
-    name = "tpu_profiler_analysis_proto",
-    srcs = ["tpu_profiler_analysis.proto"],
-    has_services = 1,
-    cc_api_version = 2,
-    cc_grpc_version = 1,
-    protodeps = [":tpu_profiler_proto"] + tf_additional_all_protos(),
-    visibility = ["//visibility:public"],
-)
-
-py_library(
-    name = "tpu_profiler_analysis_pb2_grpc",
-    srcs = ["tpu_profiler_analysis_pb2_grpc.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//visibility:public"],
-    deps = [":tpu_profiler_analysis_proto_py"],
-)
diff --git a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
index 1c5ea2d997a58ca57ddc212ffd56aad525e961da..f11d1a9f37eeb19b95a876bd68575022e6b91521 100644
--- a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
@@ -18,235 +18,11 @@ limitations under the License.
 // Initiates a TPU profiling on the TPUProfiler service at service_addr,
 // receives and dumps the profile data to a tensorboard log directory.
 
-#include "grpcpp/grpcpp.h"
-
-#include <cstdio>
-#include <ctime>
-#include <vector>
-
-#include "tensorflow/contrib/tpu/profiler/dump_tpu_profile.h"
-#include "tensorflow/contrib/tpu/profiler/tpu_profiler.grpc.pb.h"
-#include "tensorflow/contrib/tpu/profiler/tpu_profiler_analysis.grpc.pb.h"
 #include "tensorflow/contrib/tpu/profiler/version.h"
-#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/io/path.h"
-#include "tensorflow/core/lib/strings/numbers.h"
-#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/profiler/rpc/client/capture_profile.h"
 #include "tensorflow/core/util/command_line_flags.h"
 
-namespace tensorflow {
-namespace tpu {
-namespace {
-
-using ::tensorflow::TPUProfileAnalysis;
-using ::tensorflow::TPUProfiler;
-
-constexpr uint64 kMaxEvents = 1000000;
-
-string GetCurrentTimeStampAsString() {
-  char s[128];
-  std::time_t t = std::time(nullptr);
-  CHECK_NE(std::strftime(s, sizeof(s), "%F_%T", std::localtime(&t)), 0);
-  return s;
-}
-
-Status ValidateHostPortPair(const string& host_port) {
-  uint32 port;
-  std::vector<string> parts = str_util::Split(host_port, ':');
-  // Must be host:port, port must be a number, host must not contain a '/',
-  // host also must not be empty.
-  if (parts.size() != 2 || !strings::safe_strtou32(parts[1], &port) ||
-      parts[0].find("/") != string::npos || parts[0].empty()) {
-    return errors::InvalidArgument("Could not interpret \"", host_port,
-                                   "\" as a host-port pair.");
-  }
-  return Status::OK();
-}
-
-ProfileRequest PopulateProfileRequest(int duration_ms,
-                                      const string& repository_root,
-                                      const string& session_id,
-                                      const ProfileOptions& opts) {
-  ProfileRequest request;
-  request.set_duration_ms(duration_ms);
-  request.set_max_events(kMaxEvents);
-  if (tensorflow::str_util::StartsWith(repository_root, "gs://")) {
-    // For backward compatibilities, only generate tracetable etc when the
-    // user provide a GCS path for model directory.
-    request.set_repository_root(repository_root);
-    request.set_session_id(session_id);
-  }
-  request.add_tools("op_profile");
-  request.add_tools("input_pipeline");
-  request.add_tools("memory_viewer");
-  request.add_tools("overview_page");
-  *request.mutable_opts() = opts;
-  return request;
-}
-
-// Returns whether the returned trace is empty.
-// Failure are handled by CHECK, i.e. abort()
-bool Profile(const string& service_addr, const string& logdir, int duration_ms,
-             const string& repository_root, const string& session_id,
-             const ProfileOptions& opts) {
-  ProfileRequest request =
-      PopulateProfileRequest(duration_ms, repository_root, session_id, opts);
-
-  ::grpc::ClientContext context;
-  ::grpc::ChannelArguments channel_args;
-  // TODO(qiuminxu): use `NewHostPortGrpcChannel` instead once their
-  // `ValidateHostPortPair` checks for empty host string case.
-  channel_args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH,
-                      std::numeric_limits<int32>::max());
-  std::unique_ptr<TPUProfiler::Stub> stub =
-      TPUProfiler::NewStub(::grpc::CreateCustomChannel(
-          "dns:///" + service_addr, ::grpc::InsecureChannelCredentials(),
-          channel_args));
-  ProfileResponse response;
-  TF_QCHECK_OK(FromGrpcStatus(stub->Profile(&context, request, &response)));
-
-  if (!response.encoded_trace().empty()) {
-    TF_CHECK_OK(tensorflow::tpu::WriteTensorboardTPUProfile(
-        logdir, session_id, "", response, &std::cout));
-    // Print this at the end so that it's not buried in irrelevant LOG messages.
-    std::cout
-        << "NOTE: using the trace duration " << duration_ms << "ms."
-        << std::endl
-        << "Set an appropriate duration (with --duration_ms) if you "
-           "don't see a full step in your trace or the captured trace is too "
-           "large."
-        << std::endl;
-  }
-
-  return response.encoded_trace().empty();
-}
-
-// Start a new profiling session that include all the hosts included in
-// hostnames, for the time interval of duration_ms. Possibly save the profiling
-// result in the directory specified by repository_root and session_id.
-bool NewSession(const string& service_addr,
-                const std::vector<tensorflow::string>& hostnames,
-                int duration_ms, const string& repository_root,
-                const string& session_id, const ProfileOptions& opts) {
-  NewProfileSessionRequest new_session_request;
-  *new_session_request.mutable_request() =
-      PopulateProfileRequest(duration_ms, repository_root, session_id, opts);
-  new_session_request.set_repository_root(repository_root);
-  new_session_request.set_session_id(session_id);
-  for (const auto& hostname : hostnames) {
-    new_session_request.add_hosts(hostname);
-  }
-
-  ::grpc::ClientContext context;
-  ::grpc::ChannelArguments channel_args;
-  // TODO(qiuminxu): use `NewHostPortGrpcChannel` instead once their
-  // `ValidateHostPortPair` checks for empty host string case.
-  channel_args.SetMaxReceiveMessageSize(std::numeric_limits<int32>::max());
-  // TODO(jiesun): GRPC support following relevant naming scheme:
-  // 1. dns:///host:port
-  // 2. ipv4:host:port or ipv6:[host]:port
-  // We might need to change the prefix which depends on what TPU name resolver
-  // will give us.
-  std::unique_ptr<TPUProfileAnalysis::Stub> stub =
-      TPUProfileAnalysis::NewStub(::grpc::CreateCustomChannel(
-          "dns:///" + service_addr, ::grpc::InsecureChannelCredentials(),
-          channel_args));
-  NewProfileSessionResponse new_session_response;
-  TF_QCHECK_OK(FromGrpcStatus(
-      stub->NewSession(&context, new_session_request, &new_session_response)));
-
-  std::cout << "Profile session succeed for host(s):"
-            << str_util::Join(hostnames, ",") << std::endl;
-  return new_session_response.empty_trace();
-}
-
-// Starts tracing on a single or multiple TPU hosts and saves the result in the
-// given logdir. If no trace was collected, retries tracing for
-// num_tracing_attempts.
-void StartTracing(const tensorflow::string& service_addr,
-                  const tensorflow::string& logdir,
-                  const tensorflow::string& workers_list,
-                  bool include_dataset_ops, int duration_ms,
-                  int num_tracing_attempts) {
-  // Use the current timestamp as the run name.
-  tensorflow::string session_id = GetCurrentTimeStampAsString();
-  constexpr char kProfilePluginDirectory[] = "plugins/profile/";
-  tensorflow::string repository_root =
-      io::JoinPath(logdir, kProfilePluginDirectory);
-  std::vector<tensorflow::string> hostnames =
-      tensorflow::str_util::Split(workers_list, ",");
-
-  bool empty_trace = false;
-  int remaining_attempts = num_tracing_attempts;
-  tensorflow::ProfileOptions opts;
-  opts.set_include_dataset_ops(include_dataset_ops);
-  while (true) {
-    std::cout << "Starting to profile TPU traces for " << duration_ms << " ms. "
-              << "Remaining attempt(s): " << remaining_attempts-- << std::endl;
-    if (hostnames.empty()) {
-      empty_trace = tensorflow::tpu::Profile(service_addr, logdir, duration_ms,
-                                             repository_root, session_id, opts);
-    } else {
-      tensorflow::string tpu_master = service_addr;
-      empty_trace =
-          tensorflow::tpu::NewSession(tpu_master, hostnames, duration_ms,
-                                      repository_root, session_id, opts);
-    }
-    if (remaining_attempts <= 0 || !empty_trace) break;
-    std::cout << "No trace event is collected. Automatically retrying."
-              << std::endl
-              << std::endl;
-  }
-
-  if (empty_trace) {
-    std::cout << "No trace event is collected after " << num_tracing_attempts
-              << " attempt(s). "
-              << "Perhaps, you want to try again (with more attempts?)."
-              << std::endl
-              << "Tip: increase number of attempts with --num_tracing_attempts."
-              << std::endl;
-  }
-}
-
-MonitorRequest PopulateMonitorRequest(int duration_ms, int monitoring_level) {
-  MonitorRequest request;
-  request.set_duration_ms(duration_ms);
-  request.set_monitoring_level(monitoring_level);
-  return request;
-}
-
-// Repeatedly collects profiles and shows user-friendly metrics for
-// 'num_queries' time(s).
-void StartMonitoring(const tensorflow::string& service_addr, int duration_ms,
-                     int monitoring_level, int num_queries) {
-  for (int query = 0; query < num_queries; ++query) {
-    MonitorRequest request =
-        PopulateMonitorRequest(duration_ms, monitoring_level);
-
-    ::grpc::ClientContext context;
-    ::grpc::ChannelArguments channel_args;
-    channel_args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH,
-                        std::numeric_limits<int32>::max());
-    std::unique_ptr<TPUProfiler::Stub> stub =
-        TPUProfiler::NewStub(::grpc::CreateCustomChannel(
-            "dns:///" + service_addr, ::grpc::InsecureChannelCredentials(),
-            channel_args));
-    MonitorResponse response;
-    TF_QCHECK_OK(FromGrpcStatus(stub->Monitor(&context, request, &response)));
-
-    std::cout << "Cloud TPU Monitoring Results (Sample " << query + 1
-              << "):\n\n"
-              << response.data() << std::flush;
-  }
-}
-
-}  // namespace
-}  // namespace tpu
-}  // namespace tensorflow
-
 int main(int argc, char** argv) {
   tensorflow::string FLAGS_service_addr;
   tensorflow::string FLAGS_logdir;
@@ -300,8 +76,9 @@ int main(int argc, char** argv) {
     std::cout << usage.c_str() << std::endl;
     return 2;
   }
-  tensorflow::Status status =
-      tensorflow::tpu::ValidateHostPortPair(FLAGS_service_addr);
+  tensorflow::Status status;
+  status =
+      tensorflow::profiler::client::ValidateHostPortPair(FLAGS_service_addr);
   if (!status.ok()) {
     std::cout << status.error_message() << std::endl;
     std::cout << usage.c_str() << std::endl;
@@ -324,12 +101,17 @@ int main(int argc, char** argv) {
               << FLAGS_service_addr << " for " << duration_ms
               << "ms and show metrics for " << num_queries << " time(s)."
               << std::endl;
-    tensorflow::tpu::StartMonitoring(FLAGS_service_addr, duration_ms,
-                                     FLAGS_monitoring_level, num_queries);
+    tensorflow::profiler::client::StartMonitoring(
+        FLAGS_service_addr, duration_ms, FLAGS_monitoring_level, num_queries);
   } else {
-    tensorflow::tpu::StartTracing(FLAGS_service_addr, FLAGS_logdir,
-                                  FLAGS_workers_list, FLAGS_include_dataset_ops,
-                                  duration_ms, num_tracing_attempts);
+    status = tensorflow::profiler::client::StartTracing(
+        FLAGS_service_addr, FLAGS_logdir, FLAGS_workers_list,
+        FLAGS_include_dataset_ops, duration_ms, num_tracing_attempts);
+    if (!status.ok()) {
+      std::cout << status.error_message() << std::endl;
+      std::cout << usage.c_str() << std::endl;
+      return 2;
+    }
   }
   return 0;
 }
diff --git a/tensorflow/contrib/tpu/profiler/pip_package/setup.py b/tensorflow/contrib/tpu/profiler/pip_package/setup.py
index f27ae38e0434991da7475e631be1c6cb4a463118..807cf26fe983b4ebe17695d6f4f90ecfc0e0cbf5 100644
--- a/tensorflow/contrib/tpu/profiler/pip_package/setup.py
+++ b/tensorflow/contrib/tpu/profiler/pip_package/setup.py
@@ -33,7 +33,7 @@ setup(
     long_description='Tools for capture TPU profile',
     url='https://www.tensorflow.org/tfrc/',
     author='Google Inc.',
-    author_email='opensource@google.com',
+    author_email='packages@tensorflow.org',
     packages=['cloud_tpu_profiler'],
     package_data={
         'cloud_tpu_profiler': ['data/*'],
diff --git a/tensorflow/contrib/tpu/python/ops/tpu_ops.py b/tensorflow/contrib/tpu/python/ops/tpu_ops.py
index 6a6eba282a12d68cc3cd4e46a46a1b4190fb737b..8605bae5c128513186d8c03835dcf49d3e4b6fd9 100644
--- a/tensorflow/contrib/tpu/python/ops/tpu_ops.py
+++ b/tensorflow/contrib/tpu/python/ops/tpu_ops.py
@@ -1,389 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# =============================================================================
-
-"""Operations for TPUs."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import platform
-
-from tensorflow.contrib.tpu.python.tpu import tpu_function
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.platform import tf_logging as logging
-
-if platform.system() != "Windows":
-  # pylint: disable=wildcard-import,unused-import,g-import-not-at-top
-  from tensorflow.contrib.tpu.ops import gen_tpu_ops
-  from tensorflow.contrib.tpu.ops.gen_tpu_ops import *
-
-  from tensorflow.contrib.util import loader
-  from tensorflow.python.platform import resource_loader
-  # pylint: enable=wildcard-import,unused-import,g-import-not-at-top
-
-  _tpu_ops = loader.load_op_library(
-      resource_loader.get_path_to_datafile("_tpu_ops.so"))
-
-  def _create_default_group_assignment():
-    num_shards = tpu_function.get_tpu_context().number_of_shards
-    if num_shards is None:
-      logging.warning(
-          "cross_replica_sum should be used within a tpu_shard_context, but "
-          "got unset number_of_shards. Assuming 1.")
-      num_shards = 1
-    group_assignment = [list(range(num_shards))]
-    return group_assignment
-
-  def all_to_all(x,
-                 concat_dimension,
-                 split_dimension,
-                 split_count,
-                 group_assignment=None,
-                 name=None):
-    """Exchange data across TPU replicas.
-
-    Args:
-      x: The local tensor.
-      concat_dimension: The dimension number to concatenate.
-      split_dimension: The dimension number to split.
-      split_count: The number of splits, this number must equal to the sub-group
-        size(group_assignment.get_shape()[1])
-      group_assignment: Optional 2d int32 lists with shape [num_groups,
-        num_replicas_per_group]. `group_assignment[i]` represents the replica
-        ids in the ith subgroup.
-      name: Optional op name.
-
-    Returns:
-      A `Tensor` which is concatenated by data from different replicas.
-    """
-    if group_assignment is None:
-      group_assignment = _create_default_group_assignment()
-    return gen_tpu_ops.all_to_all(
-        x,
-        group_assignment,
-        concat_dimension=concat_dimension,
-        split_dimension=split_dimension,
-        split_count=split_count,
-        name=name)
-
-  @ops.RegisterGradient("AllToAll")
-  def _all_to_all_grad(op, grad):
-    # The gradient of a all-to-all is also a all-to-all but the
-    # split_dimension and concat_dimension is swapped.
-    # The graident with respect to group_assignment is None.
-    return [
-        gen_tpu_ops.all_to_all(
-            grad,
-            op.inputs[1],
-            concat_dimension=op.get_attr("split_dimension"),
-            split_dimension=op.get_attr("concat_dimension"),
-            split_count=op.get_attr("split_count")), None
-    ]
-
-  def cross_replica_sum(x, group_assignment=None, name=None):
-    """Sum the input tensor across replicas according to group_assignment.
-
-    Args:
-      x: The local tensor to the sum.
-      group_assignment: Optional 2d int32 lists with shape [num_groups,
-        num_replicas_per_group]. `group_assignment[i]` represents the replica
-        ids in the ith subgroup.
-      name: Optional op name.
-
-    Returns:
-      A `Tensor` which is summed across replicas.
-    """
-    if group_assignment is None:
-      group_assignment = _create_default_group_assignment()
-
-    return gen_tpu_ops.cross_replica_sum(x, group_assignment, name=name)
-
-  def collective_permute(x, source_target_pairs, name=None):
-    """Permute the input tensor across replicas given source_target_pairs.
-
-    For each source_target_pair <a, b>, we send replica a's input to replica b.
-    Each replica id must only appear once in the source column. Also it must
-    only appear once in the target column.
-    For the replica id not in the target column, this op returns a zero tensor
-    with the same shape and dtype of the input x.
-
-    For example, suppose there are 4 TPU instances: `[A, B, C, D]`. Passing
-    source_target_pairs=`[[0,1],[1,2],[2,3]]` gets the outputs:
-    `[0, A, B, C]`.
-
-    Args:
-      x: The local tensor to be permuted.
-      source_target_pairs: 2d int lists with shape [num_pairs, 2].
-        source_target_pairs[i][0] represents the source replica id and
-        source_target_pairs[i][1] represents the target replica id.
-      name: Optional op name.
-
-    Returns:
-      A `Tensor` which is permuted.
-    """
-    return gen_tpu_ops.collective_permute(x, source_target_pairs, name=name)
-
-  @ops.RegisterGradient("CollectivePermute")
-  def _collective_permute_grad(op, grad):
-    # The gradient of a collective permute operation is also a collective
-    # permute, but with source/target pairs reversed. The gradient with respect
-    # to input argument `source_target_pairs` is `None`.
-    source_target_pairs = op.inputs[1][:, ::-1]
-    return [gen_tpu_ops.collective_permute(grad, source_target_pairs), None]
-
-  @ops.RegisterGradient("CrossReplicaSum")
-  def _cross_replica_sum_grad(op, grad):
-    # The gradient of a cross replica sum is also a cross-replica sum.
-    # The gradient with respect to group_assignment is None.
-    return [gen_tpu_ops.cross_replica_sum(grad, op.inputs[1]), None]
-
-  # This extra type checking exists to give a more helpful error message in
-  # the common case that uint8 and int64 values are infed. Remove when both
-  # types are supported.
-
-  _SUPPORTED_INFEED_DTYPES = set([
-      dtypes.bool, dtypes.int32, dtypes.int64, dtypes.bfloat16, dtypes.float32,
-      dtypes.complex64
-  ])
-
-  def infeed_dequeue(dtype, shape, name=None):
-    """A placeholder op for a value that will be fed into the computation.
-
-    Args:
-      dtype: A `tf.DType`. The type of elements in the tensor.
-      shape: A `tf.TensorShape` or list of `ints`. The shape of the tensor.
-      name: A name for the operation (optional).
-
-    Returns:
-      A `Tensor` of type `dtype`.
-      A tensor that will be provided using the infeed mechanism.
-
-    Raises:
-      TypeError: If 'dtype` is not a supported infeed type.
-    """
-    if dtype not in _SUPPORTED_INFEED_DTYPES:
-      raise TypeError(
-          "{} is not a supported TPU infeed type. Supported types are: "
-          "{}".format(dtype, list(_SUPPORTED_INFEED_DTYPES)))
-
-    return gen_tpu_ops.infeed_dequeue(dtype, shape, name=name)
-
-  # pylint: disable=redefined-outer-name
-  def infeed_dequeue_tuple(dtypes, shapes, name=None):
-    """A placeholder op for values fed into the TPU simultaneously as a tuple.
-
-    Args:
-      dtypes: A list of `tf.DType`s that has length `>= 1`.
-        The element types of each element in `outputs`.
-      shapes: A list of shapes (each a `tf.TensorShape` or list of `ints`).
-        The shapes of each tensor in `outputs`.
-      name: A name for the operation (optional).
-
-    Returns:
-      A list of `Tensor` objects of type `dtypes`.
-      A list of tensors that will be provided using the infeed mechanism.
-
-    Raises:
-      TypeError: If a type in 'dtypes` is not a supported infeed type.
-    """
-    for dtype in dtypes:
-      if dtype not in _SUPPORTED_INFEED_DTYPES:
-        raise TypeError(
-            "{} is not a supported TPU infeed type. Supported types are: "
-            "{}".format(dtype, list(_SUPPORTED_INFEED_DTYPES)))
-    return gen_tpu_ops.infeed_dequeue_tuple(dtypes, shapes, name=name)
-  # pylint: enable=redefined-outer-name
-
-  # pylint: disable=protected-access
-  def send_tpu_embedding_gradients(inputs,
-                                   config,
-                                   learning_rates=None,
-                                   name=None):
-    """A placeholder op for feeding per-sample gradients to the embedding layer.
-
-    Args:
-      inputs: A TensorList of gradients with which to update embedding tables.
-        Contains one tensor per embedding table in the model.
-      config: Serialized TPUEmbeddingConfiguration proto.
-      learning_rates: A TensorList of float32 scalars, one for each embedding
-        table, containing the learning rates for each table when dynamic
-        learning rate is enabled through the OptimizationParameters in
-        TPUEmbeddingConfiguration. When the learning rate is constant, the list
-        should be empty (optional).
-      name: A name for the operation (optional).
-
-    Returns:
-      A SendTPUEmbeddingGradients operation.
-    """
-    if learning_rates is None:
-      learning_rates = []
-    return gen_tpu_ops._send_tpu_embedding_gradients(
-        inputs=inputs, learning_rates=learning_rates, config=config, name=name)
-
-
-  send_tpu_embedding_gradients.__doc__ = (
-      gen_tpu_ops._send_tpu_embedding_gradients.__doc__)
-
-  # pylint: disable=protected-access
-  def enqueue_tpu_embedding_integer_batch(batch,
-                                          device_ordinal,
-                                          mode_override=None,
-                                          name=None):
-    """A placeholder op for enqueueing embedding IDs to the TPU.
-
-    Args:
-      batch: A list of 1D tensors, one for each embedding table, containing the
-        indices into the tables.
-      device_ordinal: The TPU device to use. Should be >= 0 and less than the
-        number of TPU cores in the task on which the node is placed.
-      mode_override: A string input that overrides the mode specified in the
-        TPUEmbeddingConfiguration. Supported values are {'unspecified',
-        'inference', 'training', 'backward_pass_only'}. When set to
-        'unspecified', the mode set in TPUEmbeddingConfiguration is used,
-        otherwise mode_override is used (optional).
-      name: A name for the operation (optional).
-
-    Returns:
-      An EnqueueTPUEmbeddingIntegerBatch operation.
-    """
-    if mode_override is None:
-      mode_override = "unspecified"
-    return gen_tpu_ops._enqueue_tpu_embedding_integer_batch(
-        batch=batch,
-        device_ordinal=device_ordinal,
-        mode_override=mode_override,
-        name=name)
-
-  enqueue_tpu_embedding_integer_batch.__doc__ = (
-      gen_tpu_ops._enqueue_tpu_embedding_integer_batch.__doc__)
-
-  # pylint: disable=protected-access
-  def enqueue_tpu_embedding_sparse_batch(sample_indices,
-                                         embedding_indices,
-                                         aggregation_weights,
-                                         device_ordinal,
-                                         combiners=None,
-                                         mode_override=None,
-                                         name=None):
-    """A placeholder op for enqueueing embedding IDs to the TPU.
-
-    Args:
-      sample_indices: A list of rank 1 Tensors specifying the training example
-        and feature to which the corresponding embedding_indices and
-        aggregation_weights values belong. sample_indices[i] must equal b * nf +
-        f, where nf is the number of features from the corresponding table, f is
-        in [0, nf), and b is in [0, batch size).
-      embedding_indices: A list of rank 1 Tensors, indices into the embedding
-        tables.
-      aggregation_weights: A list of rank 1 Tensors containing per sample --
-        i.e. per (training example, feature) -- aggregation weights.
-      device_ordinal: The TPU device to use. Should be >= 0 and less than the
-        number of TPU cores in the task on which the node is placed.
-      combiners: A list of string scalars, one for each embedding table that
-        specify how to normalize the embedding activations after weighted
-        summation. Supported combiners are 'mean', 'sum', or 'sqrtn'. It is
-        invalid to have the sum of the weights be 0 for 'mean' or the sum of the
-        squared weights be 0 for 'sqrtn'. If combiners isn't passed, the default
-        is to use 'sum' for all tables (optional).
-      mode_override: A string input that overrides the mode specified in the
-        TPUEmbeddingConfiguration. Supported values are {'unspecified',
-        'inference', 'training', 'backward_pass_only'}. When set to
-        'unspecified', the mode set in TPUEmbeddingConfiguration is used,
-        otherwise mode_override is used (optional).
-      name: A name for the operation (optional).
-
-    Returns:
-      An EnqueueTPUEmbeddingSparseBatch operation.
-    """
-    if mode_override is None:
-      mode_override = "unspecified"
-    return gen_tpu_ops._enqueue_tpu_embedding_sparse_batch(
-        sample_indices=sample_indices,
-        embedding_indices=embedding_indices,
-        aggregation_weights=aggregation_weights,
-        device_ordinal=device_ordinal,
-        combiners=combiners,
-        mode_override=mode_override,
-        name=name)
-
-  enqueue_tpu_embedding_sparse_batch.__doc__ = (
-      gen_tpu_ops._enqueue_tpu_embedding_sparse_batch.__doc__)
-
-  # pylint: disable=protected-access
-  def enqueue_tpu_embedding_sparse_tensor_batch(sample_indices,
-                                                embedding_indices,
-                                                aggregation_weights,
-                                                table_ids,
-                                                device_ordinal,
-                                                combiners=None,
-                                                mode_override=None,
-                                                name=None):
-    """A placeholder op for enqueueing embedding IDs to the TPU.
-
-    Args:
-      sample_indices: A list of rank 1 Tensors specifying the training example
-        to which the corresponding embedding_indices and aggregation_weights
-        values
-        belong. It corresponds to sp_ids.indices[:,0] in
-          embedding_lookup_sparse().
-      embedding_indices: A list of rank 1 Tensors, indices into the embedding
-        tables. It corresponds to sp_ids.values in embedding_lookup_sparse().
-      aggregation_weights: A list of rank 1 Tensors containing per training
-        example aggregation weights. It corresponds to sp_weights.values in
-        embedding_lookup_sparse().
-      table_ids: A list of integers specifying the identifier of the embedding
-        table (offset of TableDescriptor in the TPUEmbeddingConfiguration) to
-        lookup the corresponding input. The ith input is looked up using
-        table_ids[i]. The size of the table_ids list must be equal to that of
-        sample_indices, embedding_indices and aggregation_weights.
-      device_ordinal: The TPU device to use. Should be >= 0 and less than the
-        number of TPU cores in the task on which the node is placed.
-      combiners: A list of string scalars, one for each embedding table that
-        specify how to normalize the embedding activations after weighted
-        summation. Supported combiners are 'mean', 'sum', or 'sqrtn'. It is
-        invalid to have the sum of the weights be 0 for 'mean' or the sum of the
-        squared weights be 0 for 'sqrtn'. If combiners isn't passed, the default
-        is to use 'sum' for all tables (optional).
-      mode_override: A string input that overrides the mode specified in the
-        TPUEmbeddingConfiguration. Supported values are {'unspecified',
-        'inference', 'training', 'backward_pass_only'}. When set to
-        'unspecified', the mode set in TPUEmbeddingConfiguration is used,
-        otherwise mode_override is used (optional).
-      name: A name for the operation (optional).
-
-    Returns:
-      An EnqueueTPUEmbeddingSparseTensorBatch operation.
-    """
-    if mode_override is None:
-      mode_override = "unspecified"
-    return gen_tpu_ops._enqueue_tpu_embedding_sparse_tensor_batch(
-        sample_indices=sample_indices,
-        embedding_indices=embedding_indices,
-        aggregation_weights=aggregation_weights,
-        table_ids=table_ids,
-        device_ordinal=device_ordinal,
-        combiners=combiners,
-        mode_override=mode_override,
-        name=name)
-
-  enqueue_tpu_embedding_sparse_tensor_batch.__doc__ = (
-      gen_tpu_ops._enqueue_tpu_embedding_sparse_tensor_batch.__doc__)
-
-else:
-  # We have already built the appropriate libraries into the binary via CMake
-  # if we have built contrib, so we don't need this
-  pass
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.ops.tpu_ops import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/ops/tpu_ordinal_selector_op.py b/tensorflow/contrib/tpu/python/ops/tpu_ordinal_selector_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..788e1fe0568cf2f406c379e4d928100ea51a37a3
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/ops/tpu_ordinal_selector_op.py
@@ -0,0 +1,23 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.ops.tpu_ordinal_selector_op import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/profiler/__init__.py b/tensorflow/contrib/tpu/python/profiler/__init__.py
index 15ce6aceec299adacd7025f0021cf8b6f6ef765b..aeb061dbe114bc287946b50d08a86778c78c7b38 100644
--- a/tensorflow/contrib/tpu/python/profiler/__init__.py
+++ b/tensorflow/contrib/tpu/python/profiler/__init__.py
@@ -1,31 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# =============================================================================
-
-"""Classes for TPU trace events."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=wildcard-import,unused-import
-from tensorflow.contrib.tpu.profiler.tpu_profiler_analysis_pb2 import *
-from tensorflow.contrib.tpu.profiler.trace_events_pb2 import *
+from tensorflow.python.tpu.profiler import *
 # pylint: enable=wildcard-import,unused-import
-
-from tensorflow.python.util.all_util import remove_undocumented
-
-_allowed_symbols = ['Trace', 'Resource', 'Device', 'TraceEvent']
-
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/tpu/python/tpu/__init__.py b/tensorflow/contrib/tpu/python/tpu/__init__.py
index 0dffd7064b19f353aed6afa3ad383564643a4a90..82d4f68c0221013706f70bcf54ae4c97cc7db1d3 100644
--- a/tensorflow/contrib/tpu/python/tpu/__init__.py
+++ b/tensorflow/contrib/tpu/python/tpu/__init__.py
@@ -1,20 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# =============================================================================
-
-"""Ops related to Tensor Processing Units."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/_tpu_estimator_embedding.py b/tensorflow/contrib/tpu/python/tpu/_tpu_estimator_embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..41aa4d267812cabe775459723df7e01efaa83c93
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/_tpu_estimator_embedding.py
@@ -0,0 +1,23 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu._tpu_estimator_embedding import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/async_checkpoint.py b/tensorflow/contrib/tpu/python/tpu/async_checkpoint.py
index 1b09ce173a64ba3f93ec019c8fd65dc4710f0fcf..5eb8034e47474873ccef0b6123f2becd0668738c 100644
--- a/tensorflow/contrib/tpu/python/tpu/async_checkpoint.py
+++ b/tensorflow/contrib/tpu/python/tpu/async_checkpoint.py
@@ -1,212 +1,23 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
-# Licensed under the Apache License, Version 2.0 (the 'License');
+# Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an 'AS IS' BASIS,
+# distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ======================================
-"""Hook for asynchronous checkpointing.
-
-This hook dispatches checkpoint writing operations in a separate thread to
-allow execution to continue on the main thread.
-"""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
-import threading
-import time
-
-from tensorflow.core.util.event_pb2 import SessionLog
-from tensorflow.python.framework import meta_graph
-from tensorflow.python.framework import ops
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import basic_session_run_hooks
-from tensorflow.python.training import training_util
-from tensorflow.python.training.session_run_hook import SessionRunArgs
-from tensorflow.python.training.summary_io import SummaryWriterCache
-
-
-class AsyncCheckpointSaverHook(basic_session_run_hooks.CheckpointSaverHook):
-  """Saves checkpoints every N steps or seconds."""
-
-  def __init__(self,
-               checkpoint_dir,
-               save_secs=None,
-               save_steps=None,
-               saver=None,
-               checkpoint_basename="model.ckpt",
-               scaffold=None,
-               listeners=None):
-    """Initializes a `CheckpointSaverHook`.
-
-    Args:
-      checkpoint_dir: `str`, base directory for the checkpoint files.
-      save_secs: `int`, save every N secs.
-      save_steps: `int`, save every N steps.
-      saver: `Saver` object, used for saving.
-      checkpoint_basename: `str`, base name for the checkpoint files.
-      scaffold: `Scaffold`, use to get saver object.
-      listeners: List of `CheckpointSaverListener` subclass instances. Used for
-        callbacks that run immediately before or after this hook saves the
-        checkpoint.
-
-    Raises:
-      ValueError: One of `save_steps` or `save_secs` should be set.
-      ValueError: At most one of `saver` or `scaffold` should be set.
-    """
-    logging.info("Create AsyncCheckpointSaverHook.")
-    if saver is not None and scaffold is not None:
-      raise ValueError("You cannot provide both saver and scaffold.")
-    self._saver = saver
-    self._save_thread = None
-    self._write_graph_thread = None
-    self._checkpoint_dir = checkpoint_dir
-    self._save_path = os.path.join(checkpoint_dir, checkpoint_basename)
-    self._scaffold = scaffold
-    self._timer = basic_session_run_hooks.SecondOrStepTimer(
-        every_secs=save_secs, every_steps=save_steps)
-    self._listeners = listeners or []
-    self._steps_per_run = 1
-    self._summary_writer = None
-    self._global_step_tensor = None
-
-    self._last_checkpoint_step = None
-
-  def _set_steps_per_run(self, steps_per_run):
-    self._steps_per_run = steps_per_run
-
-  def begin(self):
-    self._summary_writer = SummaryWriterCache.get(self._checkpoint_dir)
-    self._global_step_tensor = training_util._get_or_create_global_step_read()  # pylint: disable=protected-access
-    if self._global_step_tensor is None:
-      raise RuntimeError(
-          "Global step should be created to use CheckpointSaverHook.")
-    for l in self._listeners:
-      l.begin()
-
-  def after_create_session(self, session, coord):
-    global_step = session.run(self._global_step_tensor)
-
-    # We do write graph and saver_def at the first call of before_run.
-    # We cannot do this in begin, since we let other hooks to change graph and
-    # add variables in begin. Graph is finalized after all begin calls.
-    def _write_graph_fn(self):
-      training_util.write_graph(
-          ops.get_default_graph().as_graph_def(add_shapes=True),
-          self._checkpoint_dir, "graph.pbtxt")
-    self._write_graph_thread = threading.Thread(target=_write_graph_fn,
-                                                args=[self])
-    self._write_graph_thread.start()
-
-    saver_def = self._get_saver().saver_def if self._get_saver() else None
-    graph = ops.get_default_graph()
-    meta_graph_def = meta_graph.create_meta_graph_def(
-        graph_def=graph.as_graph_def(add_shapes=True), saver_def=saver_def)
-    self._summary_writer.add_graph(graph)
-    self._summary_writer.add_meta_graph(meta_graph_def)
-    # The checkpoint saved here is the state at step "global_step".
-    self._save(session, global_step)
-    self._timer.update_last_triggered_step(global_step)
-
-  def before_run(self, run_context):  # pylint: disable=unused-argument
-    return SessionRunArgs(self._global_step_tensor)
-
-  def after_run(self, run_context, run_values):
-    global_step = run_context.session.run(self._global_step_tensor)
-    if self._timer.should_trigger_for_step(global_step):
-      self._timer.update_last_triggered_step(global_step)
-      logging.info("Triggering checkpoint. %s", global_step)
-      if self._save(run_context.session, global_step):
-        run_context.request_stop()
-
-  def end(self, session):
-    if self._save_thread:
-      logging.info("Waiting for any pending checkpoints to finish.")
-      self._save_thread.join()
-    if self._write_graph_thread:
-      logging.info("Waiting for any pending write_graph to finish.")
-      self._write_graph_thread.join()
-
-    last_step = session.run(self._global_step_tensor)
-
-    if self._last_checkpoint_step != last_step:
-      self._save(session, last_step, asynchronous=False)
-
-    for l in self._listeners:
-      l.end(session, last_step)
-
-  def _save(self, session, step, asynchronous=True):
-    """Saves the latest checkpoint, returns should_stop."""
-
-    # Skip saving on step 0
-    if step == 0:
-      return
-
-    def _save_fn():
-      """Run the saver process."""
-      logging.info("Saving checkpoints for %d into %s.", step, self._save_path)
-
-      start_time = time.time()
-      for l in self._listeners:
-        l.before_save(session, step)
-
-      self._get_saver().save(session, self._save_path, global_step=step)
-      self._summary_writer.add_session_log(
-          SessionLog(
-              status=SessionLog.CHECKPOINT, checkpoint_path=self._save_path),
-          step)
-
-      for l in self._listeners:
-        l.after_save(session, step)
-
-      end_time = time.time()
-      logging.info("Checkpoint actual writing time: (%.3f sec)",
-                   end_time - start_time)
-      logging.info("Checkpoint finished for %d into %s.", step, self._save_path)
-
-    if not asynchronous:
-      self._last_checkpoint_step = step
-      _save_fn()
-      return
-
-    if self._save_thread is not None:
-      self._save_thread.join(timeout=0.1)
-      if self._save_thread.is_alive():
-        logging.info("Saver thread still in progress, skipping checkpoint.")
-        return
-
-    self._last_checkpoint_step = step
-    self._save_thread = threading.Thread(target=_save_fn)
-    self._save_thread.start()
-
-  def _get_saver(self):
-    if self._saver is not None:
-      return self._saver
-    elif self._scaffold is not None:
-      return self._scaffold.saver
-
-    # Get saver from the SAVERS collection if present.
-    collection_key = ops.GraphKeys.SAVERS
-    savers = ops.get_collection(collection_key)
-    if not savers:
-      raise RuntimeError(
-          "No items in collection {}. Please add a saver to the collection "
-          "or provide a saver or scaffold.".format(collection_key))
-    elif len(savers) > 1:
-      raise RuntimeError(
-          "More than one item in collection {}. "
-          "Please indicate which one to use by passing it to the constructor."
-          .format(collection_key))
-
-    self._saver = savers[0]
-    return savers[0]
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.async_checkpoint import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/bfloat16.py b/tensorflow/contrib/tpu/python/tpu/bfloat16.py
index fa74f651aa63c72d14eb78c8af479263810e9b7d..f3d392a8daec2a80f974d90051324a02be002afd 100644
--- a/tensorflow/contrib/tpu/python/tpu/bfloat16.py
+++ b/tensorflow/contrib/tpu/python/tpu/bfloat16.py
@@ -1,77 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# =============================================================================
-
-"""Helper context for running models with bfloat16."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import dtypes
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.util import tf_contextlib
-
-
-def _get_custom_getter():
-  """Returns a custom getter that this class's methods must be called under.
-
-  All methods of this class must be called under a variable scope that was
-  passed this custom getter. Example:
-
-  ```python
-  network = ConvNetBuilder(...)
-  with tf.variable_scope('cg', custom_getter=network.get_custom_getter()):
-    network.conv(...)
-    # Call more methods of network here
-  ```
-
-  Currently, this custom getter only does anything if self.use_tf_layers is
-  True. In that case, it causes variables to be stored as dtype
-  self.variable_type, then casted to the requested dtype, instead of directly
-  storing the variable as the requested dtype.
-  """
-
-  def inner_custom_getter(getter, *args, **kwargs):
-    """Custom getter that forces variables to have type self.variable_type."""
-    cast_to_bfloat16 = False
-    requested_dtype = kwargs['dtype']
-    if requested_dtype == dtypes.bfloat16:
-      # Only change the variable dtype if doing so does not decrease variable
-      # precision.
-      kwargs['dtype'] = dtypes.float32
-      cast_to_bfloat16 = True
-    var = getter(*args, **kwargs)
-    # This if statement is needed to guard the cast, because batch norm
-    # assigns directly to the return value of this custom getter. The cast
-    # makes the return value not a variable so it cannot be assigned. Batch
-    # norm variables are always in fp32 so this if statement is never
-    # triggered for them.
-    if cast_to_bfloat16:
-      var = math_ops.cast(var, dtypes.bfloat16)
-    return var
-
-  return inner_custom_getter
-
-
-@tf_contextlib.contextmanager
-def bfloat16_scope():
-  """Scope class for bfloat16 variables so that the model uses custom getter.
-
-  This enables variables to be read as bfloat16 type when using get_variable.
-  """
-  with variable_scope.variable_scope(
-      '', custom_getter=_get_custom_getter()) as varscope:
-    yield varscope
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.bfloat16 import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/datasets.py b/tensorflow/contrib/tpu/python/tpu/datasets.py
index 8d6245390fc3fa005c92d01bc9b64ddb47583582..c20aac7e36aa31c5a9d88ca6fe02a8703f9ed5a3 100644
--- a/tensorflow/contrib/tpu/python/tpu/datasets.py
+++ b/tensorflow/contrib/tpu/python/tpu/datasets.py
@@ -1,194 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ======================================
-"""Library of Cloud TPU helper functions for data loading."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.data.experimental.ops import batching
-from tensorflow.python.data.experimental.ops import interleave_ops
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.ops import iterator_ops
-from tensorflow.python.data.ops import readers
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import function
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import functional_ops
-
-
-def _TextLineDataset(filename):
-  buffer_size = 8 * 1024 * 1024  # 8 MiB per file
-  dataset = readers.TextLineDataset(filename, buffer_size=buffer_size)
-  return dataset
-
-
-def _TFRecordDataset(filename):
-  buffer_size = 8 * 1024 * 1024  # 8 MiB per file
-  dataset = readers.TFRecordDataset(filename, buffer_size=buffer_size)
-  return dataset
-
-
-_FILETYPE_MAP = {
-    'tfrecord': _TFRecordDataset,
-    'textline': _TextLineDataset,
-    'text': _TextLineDataset,
-}
-
-
-def StreamingFilesDataset(files,
-                          filetype=None,
-                          file_reader_job=None,
-                          worker_job=None,
-                          num_epochs=None,
-                          filename_shuffle_buffer_size=None,
-                          num_parallel_reads=None,
-                          batch_transfer_size=None,
-                          sloppy=None):
-  """StreamingFilesDataset constructs a dataset to stream from workers (GCE VM).
-
-  Because Cloud TPUs are allocated over the network, a Cloud TPU cannot read
-  files local to your GCE VM. In order to train using files stored on your local
-  VM (e.g. on local SSD for extreme performance), use the StreamingFilesDataset
-  helper to generate a dataset to feed your Cloud TPU with files from your GCE
-  VM.
-
-  The resulting dataset may return an OutOfRangeError if there are no files
-  found as a result of the fileglob expansion.
-
-  Note: StreamingFilesDataset assumes that the session is using a
-  TPUClusterResolver and has therefore a worker and a coordinator job. File
-  loading will be done on the coordinator job.
-
-  Args:
-    files: A string glob to match files, or a `tf.data.Dataset` generating file
-      names.
-    filetype: A string (one of 'tfrecord', or 'textline') or a single-argument
-      TensorFlow function that when given a filename returns a dataset.
-    file_reader_job: An optional string that corresponds to the job that should
-      perform the file reads.
-    worker_job: An optional string that corresponds to the job that should
-      process the tensors (i.e. your GPU or TPU worker).
-    num_epochs: The number of epochs through the training set that should be
-      generated. By default, it will repeat infinitely.
-    filename_shuffle_buffer_size: An optional integer whose value controls the
-      shuffling of the file names. If you would like to read from the files in
-      the same order, set to 0 or False.
-    num_parallel_reads: An optional integer controlling the number of files to
-      read from concurrently. (Set to 1 for no parallelism.)
-    batch_transfer_size: An optional integer controlling the batching used to
-      amortize the remote function invocation overhead. Set to a very large
-      number to increase throughput. Set to a very small number to reduce memory
-      consumption. Set to False to skip batching.
-    sloppy: (Optional.) If `False`, read input data while maintaining a
-      deterministic order. (This may have significant performance impacts.)
-      sloppy defaults to: True.
-  Returns:
-    A `tf.data.Dataset` with an infinite stream of elements generated by a
-    parallel interleaving of the set of files matched (or generated) by `files`
-    with a type is the output of the dataset specified by `filetype`.
-
-  Raises:
-    ValueError: if any argument is not of the expected type.
-  """
-  if filetype is None:
-    filetype = 'tfrecord'
-
-  if isinstance(filetype, str):
-    if filetype not in _FILETYPE_MAP:
-      raise ValueError('Unexpected filetype: %s' % filetype)
-    reader_fn = _FILETYPE_MAP[filetype]
-  elif callable(filetype):
-    reader_fn = filetype
-  else:
-    raise ValueError('filetype should be a string or a callable')
-
-  file_reader_job = file_reader_job or 'coordinator'
-
-  worker_job = worker_job or 'worker'
-
-  if filename_shuffle_buffer_size is None:
-    filename_shuffle_buffer_size = 4096
-
-  num_parallel_reads = num_parallel_reads or 8
-
-  if batch_transfer_size is None:
-    batch_transfer_size = 256
-
-  if sloppy is None:
-    sloppy = True
-
-  with ops.device('/job:%s' % file_reader_job):
-    if isinstance(files, str):
-      source_dataset = dataset_ops.Dataset.list_files(files)
-    elif isinstance(files, dataset_ops.DatasetV2):
-      source_dataset = files
-    else:
-      raise ValueError('files was not a string or a dataset: %s' % files)
-
-    if filename_shuffle_buffer_size:
-      source_dataset = source_dataset.shuffle(
-          buffer_size=filename_shuffle_buffer_size)
-
-    # NOTE: We perform the `repeat` on the source dataset, because the output
-    # dataset does not currently have enough information to recreate an iterator
-    # over the source dataset when it reaches the end.
-    source_dataset = source_dataset.repeat(num_epochs)
-
-    source_dataset = source_dataset.apply(
-        interleave_ops.parallel_interleave(
-            reader_fn, cycle_length=num_parallel_reads, sloppy=sloppy))
-
-    if batch_transfer_size:
-      source_dataset = source_dataset.batch(batch_transfer_size)
-
-    source_dataset = source_dataset.prefetch(1)
-
-    source_iterator = dataset_ops.make_one_shot_iterator(source_dataset)
-    source_handle = source_iterator.string_handle()
-
-  @function.Defun(dtypes.string)
-  def LoadingFunc(h):
-    remote_iterator = iterator_ops.Iterator.from_string_handle(
-        h, source_dataset.output_types, source_dataset.output_shapes)
-    return remote_iterator.get_next()
-
-  def MapFn(unused_input):
-    if isinstance(source_dataset.output_types, dtypes.DType):
-      output_types = [source_dataset.output_types]
-    elif isinstance(source_dataset.output_types, (list, tuple)):
-      output_types = source_dataset.output_types
-    else:
-      raise ValueError('source dataset has invalid output types')
-    remote_calls = functional_ops.remote_call(
-        args=[source_handle],
-        Tout=output_types,
-        f=LoadingFunc,
-        target='/job:%s/replica:0/task:0/cpu:0' % file_reader_job)
-    if len(remote_calls) == 1:
-      return remote_calls[0]
-    else:
-      return remote_calls
-
-  with ops.device('/job:%s' % worker_job):
-    output_dataset = dataset_ops.Dataset.range(2).repeat().map(
-        MapFn, num_parallel_calls=4 if sloppy else None)
-    output_dataset = output_dataset.prefetch(1)
-
-    if batch_transfer_size:
-      # Undo the batching used during the transfer.
-      output_dataset = output_dataset.apply(batching.unbatch()).prefetch(1)
-
-  return output_dataset
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.datasets import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/device_assignment.py b/tensorflow/contrib/tpu/python/tpu/device_assignment.py
index 6906501ecf90c8e577aa0becf2dba818deb19df4..05dffef3a1efdae2ad7306ca5ad3bc7a9eac04cf 100644
--- a/tensorflow/contrib/tpu/python/tpu/device_assignment.py
+++ b/tensorflow/contrib/tpu/python/tpu/device_assignment.py
@@ -1,310 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ======================================
-"""Library of TPU helper functions."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import math
-import numpy as np
-from six.moves import xrange  # pylint: disable=redefined-builtin
-
-from tensorflow.contrib.tpu.python.tpu.topology import Topology
-
-
-def _compute_task_and_cores_to_replicas(core_assignment, topology):
-  """Computes a nested dict which maps task and logical core to replicas."""
-  task_and_cores_to_replicas = {}
-  for replica in xrange(core_assignment.shape[0]):
-    for logical_core in xrange(core_assignment.shape[1]):
-      coordinates = core_assignment[replica, logical_core, :]
-      task_id = topology.task_ordinal_at_coordinates(coordinates)
-      if task_id not in task_and_cores_to_replicas:
-        task_and_cores_to_replicas[task_id] = {}
-      if logical_core not in task_and_cores_to_replicas[task_id]:
-        task_and_cores_to_replicas[task_id][logical_core] = set()
-
-      task_and_cores_to_replicas[task_id][logical_core].add(replica)
-
-  task_to_sorted_replica_id = {}
-
-  for task, core_to_replicas in task_and_cores_to_replicas.items():
-    core_to_sorted_replicas = {}
-    for core, replicas in core_to_replicas.items():
-      core_to_sorted_replicas[core] = sorted(replicas)
-
-    task_to_sorted_replica_id[task] = core_to_sorted_replicas
-  return task_to_sorted_replica_id
-
-
-class DeviceAssignment(object):
-  """Mapping from logical cores in a computation to the physical TPU topology.
-
-  Prefer to use the `device_assignment()` helper to construct a
-  `DeviceAssignment`; it is easier if less flexible than constructing a
-  `DeviceAssignment` directly.
-  """
-
-  def __init__(self, topology, core_assignment):
-    """Constructs a `DeviceAssignment` object.
-
-    Args:
-      topology: A `Topology` object that describes the physical TPU topology.
-      core_assignment: A logical to physical core mapping, represented as a
-        rank 3 numpy array. See the description of the `core_assignment`
-        property for more details.
-
-    Raises:
-      ValueError: If `topology` is not `Topology` object.
-      ValueError: If `core_assignment` is not a rank 3 numpy array.
-    """
-    if not isinstance(topology, Topology):
-      raise ValueError("topology must be a Topology object, got {}".format(
-          type(topology)))
-    core_assignment = np.asarray(core_assignment, dtype=np.int32)
-
-    self._topology = topology
-
-    if core_assignment.ndim != 3:
-      raise ValueError("core_assignment must be a rank 3 numpy array, "
-                       "got shape {}".format(core_assignment.shape))
-
-    self._num_replicas = core_assignment.shape[0]
-    self._num_cores_per_replica = core_assignment.shape[1]
-
-    if core_assignment.shape[-1] != topology.mesh_rank:
-      raise ValueError(
-          "minor dimension of core_assignment must have size equal to topology "
-          "rank ({}), got shape {}".format(topology.mesh_rank,
-                                           core_assignment.shape))
-
-    self._core_assignment = core_assignment
-    self._task_and_cores_to_replicas = _compute_task_and_cores_to_replicas(
-        self._core_assignment, topology)
-
-  @property
-  def topology(self):
-    """A `Topology` that describes the TPU topology."""
-    return self._topology
-
-  @property
-  def num_cores_per_replica(self):
-    """The number of cores per replica."""
-    return self._num_cores_per_replica
-
-  @property
-  def num_replicas(self):
-    """The number of replicas of the computation."""
-    return self._num_replicas
-
-  @property
-  def core_assignment(self):
-    """The logical to physical core mapping.
-
-    Returns:
-      An integer numpy array of rank 3, with shape
-      `[num_replicas, num_cores_per_replica, topology_rank]`. Maps
-      (replica, logical core) pairs to physical topology coordinates.
-    """
-    return self._core_assignment
-
-  def _coordinates(self, replica, logical_core):
-    """Returns the physical topology coordinates of a logical core."""
-    return tuple(self.core_assignment[replica, logical_core, :])
-
-  def lookup_replicas(self, task_id, logical_core):
-    """Lookup replica ids by task number and logical core.
-
-    Args:
-      task_id: TensorFlow task number.
-      logical_core: An integer, identifying a logical core.
-    Returns:
-      A sorted list of the replicas that are attached to that task and
-      logical_core.
-    Raises:
-      ValueError: If no replica exists in the task which contains the logical
-      core.
-    """
-    try:
-      return self._task_and_cores_to_replicas[task_id][logical_core]
-    except KeyError:
-      raise ValueError(
-          "Can not find any replica in task: {} contains logical_core: {} ".
-          format(task_id, logical_core))
-
-  def tpu_ordinal(self, replica=0, logical_core=0):
-    """Returns the ordinal of the TPU device assigned to a logical core."""
-    coordinates = self._coordinates(replica, logical_core)
-    return self._topology.tpu_device_ordinal_at_coordinates(coordinates)
-
-  def host_device(self, replica=0, logical_core=0, job=None):
-    """Returns the CPU device attached to a logical core."""
-    coordinates = self._coordinates(replica, logical_core)
-    return self._topology.cpu_device_name_at_coordinates(coordinates, job=job)
-
-  def tpu_device(self, replica=0, logical_core=0, job=None):
-    """Returns the name of the TPU device assigned to a logical core."""
-    coordinates = self._coordinates(replica, logical_core)
-    return self._topology.tpu_device_name_at_coordinates(coordinates, job=job)
-
-
-def device_assignment(topology,
-                      computation_shape=None,
-                      computation_stride=None,
-                      num_replicas=1):
-  """Computes a device_assignment of a computation across a TPU topology.
-
-  Attempts to choose a compact grid of cores for locality.
-
-  Returns a `DeviceAssignment` that describes the cores in the topology assigned
-  to each core of each replica.
-
-  `computation_shape` and `computation_stride` values should be powers of 2 for
-  optimal packing.
-
-  Args:
-    topology: A `Topology` object that describes the TPU cluster topology.
-      To obtain a TPU topology, evaluate the `Tensor` returned by
-      `initialize_system` using `Session.run`. Either a serialized
-      `TopologyProto` or a `Topology` object may be passed. Note: you must
-      evaluate the `Tensor` first; you cannot pass an unevaluated `Tensor` here.
-    computation_shape: A rank 1 int32 numpy array with size equal to the
-      topology rank, describing the shape of the computation's block of cores.
-      If None, the `computation_shape` is `[1] * topology_rank`.
-    computation_stride: A rank 1 int32 numpy array of size `topology_rank`,
-      describing the inter-core spacing of the `computation_shape` cores in the
-      TPU topology. If None, the `computation_stride` is `[1] * topology_rank`.
-    num_replicas: The number of computation replicas to run. The replicas will
-      be packed into the free spaces of the topology.
-
-  Returns:
-    A DeviceAssignment object, which describes the mapping between the logical
-    cores in each computation replica and the physical cores in the TPU
-    topology.
-
-  Raises:
-    ValueError: If `topology` is not a valid `Topology` object.
-    ValueError: If `computation_shape` or `computation_stride` are not 1D int32
-      numpy arrays with shape [3] where all values are positive.
-    ValueError: If computation's replicas cannot fit into the TPU topology.
-  """
-  # Deserialize the Topology proto, if it is a string.
-  if isinstance(topology, bytes):
-    topology = Topology(serialized=topology)
-
-  if not isinstance(topology, Topology):
-    raise ValueError("`topology` is not a Topology object; got {}".format(
-        type(topology)))
-
-  topology_rank = len(topology.mesh_shape)
-  mesh_shape = topology.mesh_shape
-  if computation_shape is None:
-    computation_shape = np.array([1] * topology_rank, dtype=np.int32)
-  else:
-    computation_shape = np.asarray(computation_shape, dtype=np.int32)
-
-  if computation_stride is None:
-    computation_stride = np.array([1] * topology_rank, dtype=np.int32)
-  else:
-    computation_stride = np.asarray(computation_stride, dtype=np.int32)
-
-  if computation_shape.shape != (topology_rank,):
-    raise ValueError("computation_shape must have shape [{}]; got {}".format(
-        topology_rank, computation_shape.shape))
-  if computation_stride.shape != (topology_rank,):
-    raise ValueError("computation_stride must have shape [{}]; got {}".format(
-        topology_rank, computation_stride.shape))
-
-  if any(computation_shape < 1):
-    raise ValueError(
-        "computation_shape must be positive; got computation_shape={}".format(
-            computation_shape))
-  if any(computation_stride < 1):
-    raise ValueError(
-        "computation_stride must be positive; got computation_stride={}".format(
-            computation_stride))
-
-  # Computes the physical size of one computation instance.
-  computation_footprint = computation_shape * computation_stride
-  if any(computation_footprint > mesh_shape):
-    raise ValueError(
-        "computation footprint {} does not fit in TPU topology shape {}".format(
-            computation_footprint, mesh_shape))
-
-  # Computes how many copies of the computation footprint fit in the mesh.
-  block_counts = mesh_shape // computation_footprint
-
-  replica_counts = block_counts * computation_stride
-  max_replicas = np.prod(replica_counts)
-  if num_replicas > max_replicas:
-    raise ValueError(
-        "requested {} replicas but only {} replicas with shape {} and "
-        "computation_stride {} fit in a TPU mesh of shape {}".format(
-            num_replicas, max_replicas, computation_shape, computation_stride,
-            mesh_shape))
-
-  def ceil_of_ratio(n, m):
-    return (n + m - 1) // m
-
-  replica_shape = [0] * topology_rank
-  if num_replicas > 0:
-    remaining_replicas = num_replicas
-    remaining_dims = topology_rank
-
-    # Choose dimensions as close to an equal cube as possible, in order of
-    # increasing dimension size. By visiting dimensions in increasing size, we
-    # assign the most constrained dimension first, so we won't make infeasible
-    # choices.
-    #
-    # As a secondary sort order, visit the dimensions in reverse order. This
-    # means we try to use both cores on the same chip in preference to two cores
-    # on different chips.
-    for x, ni in sorted(((x, -i) for (i, x) in enumerate(replica_counts))):
-      i = -ni
-      target_size = int(math.ceil(remaining_replicas**(1.0 / remaining_dims)))
-      replica_shape[i] = min(target_size, x)
-      remaining_replicas = ceil_of_ratio(remaining_replicas, replica_shape[i])
-      remaining_dims -= 1
-
-    assert remaining_replicas == 1 and remaining_dims == 0
-
-  # Assigns an offset to each replica such that no two replicas overlap.
-  replica_offsets = np.full([num_replicas, topology_rank], -1, dtype=np.int32)
-  for replica in xrange(num_replicas):
-    # Chooses a replica number in each axis.
-    t = replica
-    pos = []
-    for dim in replica_shape[::-1]:
-      pos.append(t % dim)
-      t //= dim
-    replica_pos = np.array(pos[::-1], dtype=np.int32)
-
-    # Determines where that replica starts in each axis.
-    outer = replica_pos // computation_stride
-    inner = replica_pos % computation_stride
-    replica_offsets[replica, :] = outer * computation_footprint + inner
-
-  # Computes a complete logical core -> physical core mapping for each replica.
-  indices = [
-      np.arange(0, computation_shape[i] * computation_stride[i],
-                computation_stride[i]) for i in xrange(topology_rank)
-  ]
-  indices = np.concatenate(
-      [i[..., np.newaxis] for i in np.meshgrid(*indices, indexing="ij")],
-      axis=-1)
-  indices = indices.reshape((-1, topology_rank))
-  assignment = indices + replica_offsets[:, np.newaxis, :]
-  return DeviceAssignment(topology, core_assignment=assignment)
+# pylint: disable=wildcard-import,unused-import,redefined-builtin
+from tensorflow.python.tpu.device_assignment import *
+# pylint: enable=wildcard-import,unused-import,redefined-builtin
diff --git a/tensorflow/contrib/tpu/python/tpu/error_handling.py b/tensorflow/contrib/tpu/python/tpu/error_handling.py
index 52e1ea42370d653d1de7c12eee4b456ec7ce921c..1b1328b4075d9a737e40693c13e33e0b7c1fbedf 100644
--- a/tensorflow/contrib/tpu/python/tpu/error_handling.py
+++ b/tensorflow/contrib/tpu/python/tpu/error_handling.py
@@ -1,132 +1,23 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ===================================================================
-"""ErrorRendezvous handler for collecting errors from multiple threads."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import contextlib
-import sys
-import threading
-import time
-
-import six
-
-from tensorflow.python.framework import errors
-from tensorflow.python.platform import tf_logging as logging
-
-_UNINTERESTING_ERRORS = (errors.CancelledError,)
-
-
-class ErrorRendezvous(object):
-  """Resolve errors from multiple threads during TPU execution.
-
-  TPU errors can occur on the infeed or outfeed threads as well as the main
-  training thread.
-
-  Depending on which thread "wins" and receives the session error first, we may
-  end up showing users a confusing and non-actionable error message (session
-  cancelled) instead of a root cause (e.g. a bad filename).
-
-  The rendezvous object provides a location to capture these errors until all
-  threads terminate.  At that point we can choose the most informative error
-  to report.
-  """
-
-  def __init__(self, num_sources):
-    # string -> (message, traceback)
-    self._errors = {}
-    self._num_sources = num_sources
-    self._session_cancel_timer = None
-
-  def record_error(self, source, exc_info, session=None):
-    """Report an exception from the given source.
-
-    If a session is passed, a timer will be registered to close it after a few
-    seconds.  This is necessary to ensure the main training loop does not hang
-    if an infeed/oufeed error occurs.  We sleep a few seconds to allow a more
-    interesting error from another thread to propagate.
-
-    Args:
-      source: string, source of the error
-      exc_info: Output from `sys.exc_info` (type, value, traceback)
-      session: Session to close after delay.
-    """
-    _, value, _ = exc_info
-    self._errors[source] = exc_info
-    logging.info('Error recorded from %s: %s', source, value)
-
-    if session is not None and self._session_cancel_timer is None:
-
-      def _cancel_session():
-        time.sleep(5)
-        try:
-          session.close()
-        except:  # pylint: disable=bare-except
-          pass
-
-      self._session_cancel_timer = threading.Thread(target=_cancel_session,)
-      self._session_cancel_timer.daemon = True
-      self._session_cancel_timer.start()
-
-  def record_done(self, source):
-    """Mark execution source `source` as done.
-
-    If an error was originally reported from `source` it is left intact.
-
-    Args:
-      source: `str`, source being recorded
-    """
-    logging.info('%s marked as finished', source)
-    if source not in self._errors:
-      self._errors[source] = None
-
-  @contextlib.contextmanager
-  def catch_errors(self, source, session=None):
-    """Context manager to report any errors within a block."""
-    try:
-      yield
-    except Exception:  # pylint: disable=broad-except
-      self.record_error(source, sys.exc_info(), session)
-
-  def raise_errors(self, timeout_sec=0):
-    """Wait for up to `timeout` seconds for all error sources to finish.
-
-    Preferentially raise "interesting" errors (errors not in the
-    _UNINTERESTING_ERRORS) set.
-
-    Args:
-      timeout_sec: Seconds to wait for other error sources.
-    """
-    for _ in range(timeout_sec):
-      if len(self._errors) == self._num_sources:
-        break
-      time.sleep(1)
-
-    kept_errors = [(k, v) for (k, v) in self._errors.items() if v is not None]
-
-    # First check for any interesting errors, then fall back on the session
-    # cancelled errors etc.
-    for k, (typ, value, traceback) in kept_errors:
-      if isinstance(value, _UNINTERESTING_ERRORS):
-        continue
-      else:
-        logging.warn('Reraising captured error')
-        six.reraise(typ, value, traceback)
-
-    for k, (typ, value, traceback) in kept_errors:
-      logging.warn('Reraising captured error')
-      six.reraise(typ, value, traceback)
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.error_handling import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/feature_column.py b/tensorflow/contrib/tpu/python/tpu/feature_column.py
new file mode 100644
index 0000000000000000000000000000000000000000..ded75e975b10c4265370af260bf804687c9caebc
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/feature_column.py
@@ -0,0 +1,30 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.feature_column import *
+# used by tests
+from tensorflow.python.tpu.feature_column import _is_running_on_cpu
+from tensorflow.python.tpu.feature_column import _record_variable_scope_and_name
+from tensorflow.python.tpu.feature_column import _TPU_FC_TO_SCOPE
+from tensorflow.python.tpu.feature_column import _TPUBaseEmbeddingColumn
+from tensorflow.python.tpu.feature_column import _TPUEmbeddingColumn
+from tensorflow.python.tpu.feature_column import _TPUSharedEmbeddingColumn
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/functional.py b/tensorflow/contrib/tpu/python/tpu/functional.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a5759221ed9660200cc213df69961db56f8d490
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/functional.py
@@ -0,0 +1,23 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.functional import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/keras_support.py b/tensorflow/contrib/tpu/python/tpu/keras_support.py
index 4ce194590342555a7c4e9e119bf51e516a37a715..6ad4e45e9625f191bb4c01f70b434dc2c4fba638 100644
--- a/tensorflow/contrib/tpu/python/tpu/keras_support.py
+++ b/tensorflow/contrib/tpu/python/tpu/keras_support.py
@@ -55,8 +55,6 @@ import numpy as np
 import six
 
 from tensorflow.contrib.cluster_resolver.python.training import tpu_cluster_resolver as tpu_cluster_resolver_lib
-from tensorflow.contrib.framework.python.framework import experimental
-from tensorflow.contrib.tpu.proto import compilation_result_pb2 as tpu_compilation_result
 from tensorflow.contrib.tpu.python.ops import tpu_ops
 from tensorflow.contrib.tpu.python.tpu import keras_tpu_variables
 from tensorflow.contrib.tpu.python.tpu import tpu
@@ -64,6 +62,7 @@ from tensorflow.contrib.tpu.python.tpu import tpu_function
 from tensorflow.contrib.tpu.python.tpu import tpu_optimizer
 from tensorflow.contrib.tpu.python.tpu import tpu_system_metadata as tpu_system_metadata_lib
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf.tpu import compilation_result_pb2 as tpu_compilation_result
 from tensorflow.python.client import session as tf_session
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
@@ -94,6 +93,7 @@ from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.deprecation import deprecated
 
 
 # TODO(b/114775106): temporary shim to optionally initialize the TPU
@@ -1373,6 +1373,10 @@ class KerasTPUModel(models.Model):
     # not hashable.
     self._numpy_to_infeed_manager_list = []
 
+    # Add distribution specific arguments since we don't call the Model init.
+    self._distribution_strategy = None
+    self._compile_distribution = None
+
     self.predict_function = None
     self.test_function = None
     self.train_function = None
@@ -2069,6 +2073,8 @@ class KerasTPUModel(models.Model):
       # tpu_model may not be compiled, e.g., loading weights and then predict.
       return
     for k, v in six.iteritems(cpu_optimizer_config):
+      if k == 'name':
+        continue
       opt_var = getattr(self._tpu_model.optimizer, k)
       if isinstance(opt_var, variables.Variable):
         logging.info('CPU -> TPU %s: %s {%s}', k, v, K.get_value(opt_var))
@@ -2097,6 +2103,8 @@ class KerasTPUModel(models.Model):
     self._cpu_model.set_weights(tpu_weights)
     for k, v in six.iteritems(tpu_optimizer_config):
       logging.info('TPU -> CPU %s: %s', k, v)
+      if k == 'name':
+        continue
       opt_var = getattr(self.cpu_optimizer, k)
       if isinstance(opt_var, variables.Variable):
         K.get_session().run(opt_var.assign(v))
@@ -2164,7 +2172,10 @@ Output shape: %(output_shape)s
 # pylint: enable=bad-continuation
 
 
-@experimental
+@deprecated(
+    '2019-02-20', 'Switch to tf.contrib.distribute.TPUStrategy. '
+    'https://www.tensorflow.org/api_docs/python/tf/contrib/distribute/DistributionStrategy'
+)
 def tpu_model(model, strategy=None):
   """Copy `model` along with weights to the TPU.
 
diff --git a/tensorflow/contrib/tpu/python/tpu/keras_tpu_variables.py b/tensorflow/contrib/tpu/python/tpu/keras_tpu_variables.py
index 8b0b240dc7302c203a22349d583323327fc4480b..de425626c813784ef657d17eac0c7bb77599a155 100644
--- a/tensorflow/contrib/tpu/python/tpu/keras_tpu_variables.py
+++ b/tensorflow/contrib/tpu/python/tpu/keras_tpu_variables.py
@@ -69,6 +69,7 @@ class ReplicatedVariable(object):
   def __init__(self, name, variables):
     self._name = name
     self._primary_var = variables[0]
+    self._common_name = self._primary_var.name.split(":")[0]
     self._vars = variables
     self._cached_value = None
     self._dtype = variables[0].dtype
diff --git a/tensorflow/contrib/tpu/python/tpu/session_support.py b/tensorflow/contrib/tpu/python/tpu/session_support.py
index 3e463823c820a3ef8628324f77e1a9caf8d385d5..ed8f9525c9b91208d39805654b01837abdbf3a77 100644
--- a/tensorflow/contrib/tpu/python/tpu/session_support.py
+++ b/tensorflow/contrib/tpu/python/tpu/session_support.py
@@ -1,433 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
-# Licensed under the Apache License, Version 2.0 (the 'License');
+# Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an 'AS IS' BASIS,
+# distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ======================================
-"""Operations for handling session logging and shutdown notifications."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import threading
-
-import time
-from google.protobuf import text_format
-
-from tensorflow.contrib.tpu.python.ops import tpu_ops
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.core.util import event_pb2
-from tensorflow.python.client import session as session_lib
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import session_run_hook
-from tensorflow.python.training import training_util
-
-_WATCHDOG = None
-
-
-class CoordinatorShutdownException(Exception):
-  """Raised when the coordinator needs to shutdown."""
-  pass
-
-
-def _clone_session(session, graph=None):
-  return session_lib.Session(
-      target=session.sess_str,
-      config=session._config,  # pylint: disable=protected-access
-      graph=graph if graph else session.graph)
-
-
-def _make_heartbeat_op(session, device, request_ph):
-  """Return a heartbeat op or None if heartbeats are not supported by device."""
-  try:
-    # Test if we can connect in a isolated graph + session
-    with ops.Graph().as_default():
-      with _clone_session(session) as temp_session:
-        with ops.device(device):
-          heartbeat_op = tpu_ops.worker_heartbeat('')
-          options = config_pb2.RunOptions(timeout_in_ms=5000)
-          temp_session.run(heartbeat_op, options=options)
-  except errors.InvalidArgumentError as _:
-    logging.warning('Error running heartbeat on %s', device)
-    return None
-  except errors.DeadlineExceededError as _:
-    logging.warning('Timeout connecting to %s when testing heartbeat', device)
-    return None
-
-  # If we successfully connected and pinged the worker, go ahead and construct
-  # the operation.
-  with ops.device(device):
-    return tpu_ops.worker_heartbeat(request_ph)
-
-
-class WorkerHeartbeatManager(object):
-  """Manages the status/heartbeat monitor for a set of workers."""
-
-  def __init__(self, session, devices, heartbeat_ops, request_placeholder):
-    """Construct a new WorkerHeartbeatManager.
-
-    (Prefer using `WorkerHeartbeatManager.from_devices` when possible.)
-
-    Args:
-      session: `tf.Session`, session to use for heartbeat operations.
-      devices: `list[string]` Set of devices to connect to.
-      heartbeat_ops: `list[tf.Operation]` Heartbeat operations.
-      request_placeholder: `tf.Placeholder[String]` Placeholder used to specify
-        the WorkerHeartbeatRequest protocol buffer.
-    """
-    self._session = session
-    self._devices = devices
-    self._ops = heartbeat_ops
-    self._request_placeholder = request_placeholder
-
-  @staticmethod
-  def from_devices(session, devices):
-    """Construct a heartbeat manager for the given devices."""
-    if not devices:
-      logging.error('Trying to create heartbeat manager with no devices?')
-
-    logging.info('Creating heartbeat manager for %s', devices)
-    request_placeholder = array_ops.placeholder(
-        name='worker_heartbeat_request', dtype=dtypes.string)
-
-    heartbeat_ops = []
-    kept_devices = []
-    for device in devices:
-      heartbeat_op = _make_heartbeat_op(session, device, request_placeholder)
-      if heartbeat_op is not None:
-        kept_devices.append(device)
-        heartbeat_ops.append(heartbeat_op)
-      else:
-        logging.warning('Heartbeat support not available for %s', device)
-
-    return WorkerHeartbeatManager(session, kept_devices, heartbeat_ops,
-                                  request_placeholder)
-
-  def num_workers(self):
-    return len(self._devices)
-
-  def configure(self, message):
-    """Configure heartbeat manager for all devices.
-
-    Args:
-      message: `event_pb2.WorkerHeartbeatRequest`
-    Returns: `None`
-    """
-    logging.info('Configuring worker heartbeat: %s',
-                 text_format.MessageToString(message))
-    self._session.run(self._ops,
-                      {self._request_placeholder: message.SerializeToString()})
-
-  def ping(self, request=None, timeout_in_ms=5000):
-    """Ping all workers, returning the parsed status results."""
-    if request is None:
-      request = event_pb2.WorkerHeartbeatRequest()
-
-    options = config_pb2.RunOptions(timeout_in_ms=timeout_in_ms)
-    results = self._session.run(
-        self._ops,
-        feed_dict={self._request_placeholder: request.SerializeToString()},
-        options=options)
-    parsed_results = [
-        event_pb2.WorkerHeartbeatResponse.FromString(res_pb)
-        for res_pb in results
-    ]
-    logging.debug('Ping results: %s', parsed_results)
-    return parsed_results
-
-  def lame_workers(self):
-    """Ping all workers, returning manager containing lame workers (or None)."""
-    ping_results = self.ping()
-    lame_workers = []
-
-    for ping_response, device, op in zip(ping_results, self._devices,
-                                         self._ops):
-      if ping_response.health_status != event_pb2.OK:
-        lame_workers.append((device, op))
-
-    if not lame_workers:
-      return None
-
-    bad_devices, bad_ops = zip(*lame_workers)
-    return WorkerHeartbeatManager(self._session, bad_devices, bad_ops,
-                                  self._request_placeholder)
-
-  def __repr__(self):
-    return 'HeartbeatManager(%s)' % ','.join(self._devices)
-
-  def shutdown(self, timeout_ms=10000):
-    """Shutdown all workers after `shutdown_timeout_secs`."""
-    logging.info('Shutting down %s.', self)
-    req = event_pb2.WorkerHeartbeatRequest(
-        watchdog_config=event_pb2.WatchdogConfig(timeout_ms=timeout_ms))
-    self.configure(req)
-
-    # Wait for workers to shutdown.  This isn't strictly required
-    # but it avoids triggering multiple checkpoints with the same lame worker.
-    logging.info('Waiting %dms for worker shutdown.', timeout_ms)
-    time.sleep(timeout_ms / 1000)
-
-
-def all_worker_devices(session):
-  """Return a list of devices for each worker in the system."""
-  devices = session.list_devices()
-  return [
-      device.name for device in devices
-      if ':CPU:' in device.name and 'coordinator' not in device.name
-  ]
-
-
-class WatchdogManager(threading.Thread):
-  """Configures worker watchdog timer and handles periodic pings.
-
-  Usage:
-    # Ping workers every minute, shutting down workers if they haven't received
-    # a ping after 1 hour.
-    watchdog_manager = WatchdogManager(
-      ping_interval=60, shutdown_timeout=3600
-    )
-
-    # Use as a context manager, resetting watchdog on context exit:
-    with watchdog_manager:
-      session.run(...)
-
-    # Or setup globally; watchdog will remain active until program exit.
-    watchdog_manager.configure_and_run()
-  """
-
-  def __init__(self,
-               session,
-               devices=None,
-               ping_interval=60,
-               shutdown_timeout=3600):
-    """Initialize a watchdog manager.
-
-    Args:
-      session: Session connected to worker devices.  A cloned session and graph
-        will be created for managing worker pings.
-      devices: Set of devices to monitor.  If none, all workers will be
-        monitored.
-      ping_interval: Time, in seconds, between watchdog pings.
-      shutdown_timeout: Time, in seconds, before watchdog timeout.
-    """
-    threading.Thread.__init__(self)
-    self.ping_interval = ping_interval
-    self.shutdown_timeout = shutdown_timeout
-    self.daemon = True
-    self._config = session._config  # pylint: disable=protected-access
-    self._target = session.sess_str
-    self._running = False
-    self._devices = devices
-
-    self._graph = None
-    self._session = None
-    self._worker_manager = None
-
-  def _reset_manager(self):
-    """Reset the graph, session and worker manager."""
-    self._graph = ops.Graph()
-    self._session = session_lib.Session(
-        target=self._target,
-        graph=self._graph,
-        config=self._config,
-    )
-
-    if self._devices is None:
-      self._devices = all_worker_devices(self._session)
-
-    with self._graph.as_default():
-      self._worker_manager = WorkerHeartbeatManager.from_devices(
-          self._session, self._devices)
-
-    self._worker_manager.configure(
-        event_pb2.WorkerHeartbeatRequest(
-            watchdog_config=event_pb2.WatchdogConfig(
-                timeout_ms=self.shutdown_timeout * 1000,)))
-
-  def configure_and_run(self):
-    logging.info('Enabling watchdog timer with %d second timeout '
-                 'and %d second ping interval.',
-                 self.shutdown_timeout, self.ping_interval)
-    self._reset_manager()
-    self._running = True
-    self.start()
-
-  def stop(self):
-    logging.info('Stopping worker watchdog.')
-    self._worker_manager.configure(
-        event_pb2.WorkerHeartbeatRequest(
-            watchdog_config=event_pb2.WatchdogConfig(timeout_ms=-1,)))
-    self._running = False
-    self.join()
-
-  def __enter__(self):
-    self.configure_and_run()
-
-  def __exit__(self, exc_type, exc_val, exc_tb):
-    self.stop()
-
-  def run(self):
-    # Don't fetch logs or adjust timing: just ping the watchdog.
-    #
-    # If we hit an exception, reset our session as it is likely broken.
-    while self._running:
-      try:
-        self._worker_manager.ping(request=None)
-        time.sleep(self.ping_interval)
-      except errors.OpError as e:
-        # Catch any TF errors that occur so we don't stop sending heartbeats
-        logging.debug('Caught error while sending heartbeat: %s', e)
-        self._reset_manager()
-
-
-def start_worker_watchdog(session,
-                          devices=None,
-                          ping_interval=60,
-                          shutdown_timeout=3600):
-  """Start global worker watchdog to shutdown workers on coordinator exit."""
-  global _WATCHDOG
-  if _WATCHDOG is None:
-    # Ensure we can send a few pings before we timeout!
-    ping_interval = min(shutdown_timeout / 10., ping_interval)
-    _WATCHDOG = WatchdogManager(session, devices, ping_interval,
-                                shutdown_timeout)
-    _WATCHDOG.configure_and_run()
-
-
-class GracefulShutdownHook(session_run_hook.SessionRunHook):
-  """Session hook that watches for shutdown events.
-
-  If a shutdown is indicated, `saver.save(checkpoint_prefix)` is executed, and a
-  SystemShutdown exception is raised to terminate the main session.  If `saver`
-  is None the `SAVERS` collection will be read to find a saver.
-
-  `on_shutdown_hooks` is an optional list of functions that should be called
-  after checkpointing.  The function is called with (`run_context`,
-  `all_workers`, `lame_workers`).
-
-  If `heartbeat_group` is not specified, it will default to all CPU workers
-  in the system.
-  """
-
-  def __init__(self, checkpoint_prefix, saver=None, on_shutdown_hooks=None):
-    self._saver = saver
-    self._checkpoint_prefix = checkpoint_prefix
-    self._on_shutdown_hooks = on_shutdown_hooks if on_shutdown_hooks else []
-
-    # Worker heartbeats are managed independently of the main training graph.
-    self._graph = ops.Graph()
-    self._workers = None
-    self._session = None
-    self._heartbeat_supported = False
-
-  def after_create_session(self, training_session, coord):  # pylint: disable=unused-argument
-    # N.B. We have to pull the global step here to avoid it being unavailable
-    # at checkpoint time; the graph has been frozen at that point.
-    if training_util.get_global_step() is None and self.saver() is not None:
-      raise ValueError(
-          'Saver defined but no global step.  Run `get_or_create_global_step()`'
-          ' in your model definition to allow checkpointing.')
-
-    with self._graph.as_default():
-      logging.info('Installing graceful shutdown hook.')
-      self._session = _clone_session(training_session, self._graph)
-      self._workers = WorkerHeartbeatManager.from_devices(
-          self._session, all_worker_devices(self._session))
-      self._heartbeat_supported = self._workers.num_workers() > 0
-      if self._heartbeat_supported:
-        self._workers.configure(
-            event_pb2.WorkerHeartbeatRequest(
-                shutdown_mode=event_pb2.WAIT_FOR_COORDINATOR))
-      else:
-        logging.warn(
-            'No workers support hearbeats. Failure handling will be disabled.')
-
-  def saver(self):
-    if self._saver:
-      return self._saver
-
-    savers = ops.get_collection(ops.GraphKeys.SAVERS)
-    if not savers:
-      return None
-
-    if not isinstance(savers, list):
-      return savers
-
-    if len(savers) > 1:
-      logging.error(
-          'Multiple savers in the SAVERS collection.  On-demand checkpointing '
-          'will be disabled. Pass an explicit `saver` to the constructor to '
-          'override this behavior.')
-      return None
-
-    return savers[0]
-
-  def after_run(self, run_context, run_values):
-    del run_values
-
-    if not self._heartbeat_supported:
-      return
-
-    lame_workers = self._workers.lame_workers()
-    if lame_workers:
-      logging.info('ShutdownHook: lame workers found: %s', lame_workers)
-
-      if self.saver():
-        logging.info('ShutdownHook: saving checkpoint to %s',
-                     self._checkpoint_prefix)
-        self.saver().save(
-            run_context.session,
-            self._checkpoint_prefix,
-            global_step=training_util.get_global_step(),
-            write_state=True,
-        )
-      else:
-        logging.info('ShutdownHook: no Saver defined.')
-
-      for fn in self._on_shutdown_hooks:
-        fn(run_context, self._workers, lame_workers)
-
-
-class RestartComputation(object):
-  """Restart the entire computation.
-
-  This hook shuts down all workers and returns control to the top-level by
-  throwing a CoordinatorShutdownException.
-  """
-
-  def __init__(self, timeout_ms=10000):
-    self.timeout_ms = timeout_ms
-
-  def __call__(self, run_context, all_workers, lame_workers):
-    del run_context, lame_workers
-    all_workers.shutdown(timeout_ms=self.timeout_ms)
-
-    logging.info('Terminating coordinator.')
-    raise CoordinatorShutdownException()
-
-
-class ShutdownLameWorkers(object):
-  """Shutdown lamed workers.
-
-  Processing will continue normally (typically by waiting for the down
-  workers to be restarted).
-  """
-
-  def __init__(self, timeout_ms=10000):
-    self.timeout_in_ms = timeout_ms
-
-  def __call__(self, run_context, all_workers, lame_workers):
-    lame_workers.shutdown(timeout_ms=self.timeout_in_ms)
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.session_support import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/tensor_tracer.py b/tensorflow/contrib/tpu/python/tpu/tensor_tracer.py
index 70baea203cc6174bebc7d90646045efae5f2391d..73db253fd790f26679fb05bd6e7a5da6a99da1a7 100644
--- a/tensorflow/contrib/tpu/python/tpu/tensor_tracer.py
+++ b/tensorflow/contrib/tpu/python/tpu/tensor_tracer.py
@@ -1,553 +1,23 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ========================================================================
-"""A utility to trace tensor values on TPU."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
-import os.path
-import re
-
-from tensorflow.contrib.tpu.python.ops import tpu_ops
-from tensorflow.contrib.tpu.python.tpu import tpu
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_util
-from tensorflow.python.ops import gen_math_ops
-from tensorflow.python.ops import logging_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.platform import tf_logging as logging
-
-_TRACER_LOG_PREFIX = ' [>>>TT>>>]'
-_DEVICE_TYPE_TPU = 'tpu'
-_DEVICE_TYPE_CPU = 'cpu'
-_GLOBAL_STEP_OP_NAME = 'GLOBAL-STEP'
-_TRACE_MODE_NAN_INF = 'nan-inf'
-_TRACE_MODE_PART_TENSOR = 'part-tensor'
-_TRACE_MODE_PART_TENSOR_SIZE = 3
-_TRACE_MODE_FULL_TENSOR = 'full-tensor'
-_RECORD_OUTSIDE_OP_RANGE = 'not-traced-outside-op-range'
-_RECORD_SHOULD_NOT_TRACE = 'not-traced-should-not-trace'
-_RECORD_FILTERED_OUT = 'not-traced-filtered-out'
-_RECORD_SCALAR = 'not-traced-scalar'
-_RECORD_DYNAMIC_SHAPE = 'not-traced-dynamic-shape'
-_RECORD_GET_TRACED = 'get-traced'
-_MARKER_SECTION_BEGIN = '!!!!!!! section-begin:'
-_MARKER_SECTION_END = '!!!!!!! section-end:'
-_SECTION_NAME_CONFIG = 'configuration'
-_SECTION_NAME_REASON = 'reason'
-_SECTION_NAME_OP_LIST = 'op-list'
-_SECTION_NAME_GRAPH = 'graph'
-_FIELD_NAME_VERSION = 'version:'
-_FIELD_NAME_DEVICE = 'device:'
-_FIELD_NAME_TRACE_MODE = 'trace-mode:'
-_FIELD_NAME_NUM_REPLICAS = 'num-replicas:'
-_FIELD_NAME_NUM_OPS = 'number-of-ops:'
-_FIELD_NAME_TOPOLOGICAL_SORT_SUCCEED = 'topological-sort-succeed:'
-_FLAGS_ENV_VAR = 'TENSOR_TRACER_FLAGS'
-_FLAG_SINGLE_QUOTE_PAT = re.compile(r"\s*--([^=]+)='([^']*)'")
-_FLAG_DOUBLE_QUOTE_PAT = re.compile(r'\s*--([^=]+)="([^"]*)"')
-_FLAG_NO_QUOTE_PAT = re.compile(r'\s*--([^=]+)=(\S*)')
-_FLAG_NAME_ENABLE = 'enable'
-_FLAG_NAME_TRACE_MODE = 'trace_mode'
-_FLAG_NAME_INTERESTING_OPS = 'interesting_ops'
-_FLAG_NAME_TRACE_FILE = 'trace_file_path'
-_FLAG_NAME_USE_TEST_UNDECLARED_OUTPUTS_DIR = 'use_test_undeclared_outputs_dir'
-_FLAG_NAME_OP_RANGE = 'op_range'
-_OP_RANGE_PAT = re.compile(r'(\d+):(\d+)')
-_OUTPUT_STREAM_ESCAPE = 'file://'
-_TEST_UNDECLARED_OUTPUTS_DIR_ENV_VAR = 'TEST_UNDECLARED_OUTPUTS_DIR'
-
-
-class TensorTracer(object):
-  """A software construct for tracing tensor values in a TF graph on TPU.
-
-  This utility is disabled by default. It can be enabled by setting
-  the TENSOR_TRACER_FLAGS env variable as:
-    export TENSOR_TRACER_FLAGS="--enable=1"
-  If it is enabled, it will trace the output tensor values of
-  selected Ops in the graph. It has two outputs: (1) the traces and (2)
-  a report. The traces are dumped to a specified local file on the TPU
-  host. The report is printed to the log.info of the TPU job.
-  By passing options via the env variable, users can change:
-     (1) the trace mode (e.g., detecting NaN/Inf, printing partial or
-         full tensor values)
-     (2) which Ops to be traced (via op.name or op.type)
-     (3) output trace file path.
-  """
-
-  @staticmethod
-  def _match_next_flag(flags, pos):
-    """Returns the match for the next TensorTracer flag."""
-
-    match = _FLAG_DOUBLE_QUOTE_PAT.match(flags, pos)
-    if match:
-      return match
-    match = _FLAG_SINGLE_QUOTE_PAT.match(flags, pos)
-    if match:
-      return match
-    match = _FLAG_NO_QUOTE_PAT.match(flags, pos)
-    return match
-
-  @staticmethod
-  def print_flag_values():
-    """Prints all TensorTracer flags passed via environment variables."""
-
-    tensor_tracer_flags = os.environ.get(_FLAGS_ENV_VAR)
-    if not tensor_tracer_flags:
-      return 'Env variable "%s" is not set'%_FLAGS_ENV_VAR
-    result = 'Env variable "%s" is set to "%s"\n'%(_FLAGS_ENV_VAR,
-                                                   tensor_tracer_flags)
-    result += 'Individual flag value:\n'
-    pos = 0
-    while True:
-      match = TensorTracer._match_next_flag(tensor_tracer_flags, pos)
-      if not match:
-        break
-      flag_name = match.group(1)
-      flag_value = match.group(2)
-      result += '  %s: %s\n'%(flag_name, flag_value)
-      pos = match.end()
-    result += '\n'
-    return result
-
-  @staticmethod
-  def get_flag_value(wanted_flag_name):
-    """Returns the value of a TensorTracer flags."""
-
-    tensor_tracer_flags = os.getenv(_FLAGS_ENV_VAR)
-    if not tensor_tracer_flags:
-      return ''
-    pos = 0
-    while True:
-      match = TensorTracer._match_next_flag(tensor_tracer_flags, pos)
-      if not match:
-        return ''
-      flag_name = match.group(1)
-      flag_value = match.group(2)
-      if flag_name == wanted_flag_name:
-        return flag_value
-      pos = match.end()
-    return ''
-
-  @staticmethod
-  def is_enabled():
-    """Returns True if TensorTracer is enabled."""
-
-    flag_value = TensorTracer.get_flag_value(_FLAG_NAME_ENABLE)
-    flag_value = flag_value.lower()
-    enabled = flag_value in ['1', 't', 'true', 'y', 'yes']
-    return enabled
-
-  @staticmethod
-  def use_test_undeclared_outputs_dir():
-    """Decides the output directory of the trace file.
-
-    Args:
-       None.
-
-    Returns:
-       True if the output trace file should be written to the
-       test-undeclared-outputs-directory defined via an
-       env variable.
-    """
-
-    flag_value = TensorTracer.get_flag_value(
-        _FLAG_NAME_USE_TEST_UNDECLARED_OUTPUTS_DIR)
-    flag_value = flag_value.lower()
-    enabled = flag_value in ['1', 't', 'true', 'y', 'yes']
-    return enabled
-
-  @staticmethod
-  def check_device_type(device_type):
-    """Checks if the given device type is valid."""
-
-    if device_type not in [_DEVICE_TYPE_TPU, _DEVICE_TYPE_CPU]:
-      raise ValueError('Invalid device_type "%s"'%device_type)
-
-  @staticmethod
-  def check_trace_mode(trace_mode):
-    """Checks if the given trace mode is valid."""
-
-    valid_trace_modes = [_TRACE_MODE_NAN_INF, _TRACE_MODE_PART_TENSOR,
-                         _TRACE_MODE_FULL_TENSOR]
-    if trace_mode not in valid_trace_modes:
-      raise ValueError('Invalid trace mode "%s" given to the Tensor_Tracer.'
-                       'Valid trace modes are: %s'%(trace_mode,
-                                                    valid_trace_modes))
-
-  @staticmethod
-  def should_trace(device_type, op):
-    """Returns True if the given Op should be traced."""
-
-    if device_type != _DEVICE_TYPE_TPU:
-      raise ValueError('Non TPU device type is not supported')
-    if control_flow_util.IsInCond(op):
-      return False
-    if op.type in ['Reshape', 'ArgMin', 'ArgMax']:
-      return False
-    # pylint: disable=protected-access
-    return tpu._TPU_REPLICATE_ATTR in op.node_def.attr
-    # pylint: enable=protected-access
-
-  @staticmethod
-  def reason(op_idx, details):
-    """Returns why the Op at op_idx is traced or not."""
-    return '%d %s'%(op_idx, details)
-
-  @staticmethod
-  def topological_sort(g):
-    """Performs topological sort on the given graph.
-
-    Args:
-       g: the graph.
-
-    Returns:
-       A pair where the first element indicates if the topological
-       sort succeeded (True if there is no cycle found; False if a
-       cycle is found) and the second element is either the sorted
-       list of nodes or the cycle of nodes found.
-    """
-
-    def visit(op, cycle, permanently_marked_ops,
-              temporarily_marked_ops, sorted_ops):
-      """Recursively visits all Ops in a graph.
-
-      Args:
-         op: the current Op being visited.
-         cycle: a cycle of Ops found.
-         permanently_marked_ops: the set of Ops that were already visited.
-         temporarily_marked_ops: the set of Ops that we have visited during
-                                 the current descent.
-         sorted_ops: the list of Ops sorted in topological order.
-      """
-
-      if cycle:
-        return
-      if op in permanently_marked_ops:
-        return
-      if op in temporarily_marked_ops:
-        cycle = temporarily_marked_ops
-        return
-      temporarily_marked_ops.add(op)
-      for i in range(len(op.outputs)):
-        out_tensor = op.outputs[i]
-        for consumer_op in out_tensor.consumers():
-          visit(consumer_op, cycle, permanently_marked_ops,
-                temporarily_marked_ops, sorted_ops)
-      # pylint: disable=protected-access
-      for ctrl_output_op in op._control_outputs:
-      # pylint: enable=protected-access
-        visit(ctrl_output_op, cycle, permanently_marked_ops,
-              temporarily_marked_ops, sorted_ops)
-      temporarily_marked_ops.remove(op)
-      permanently_marked_ops.add(op)
-      sorted_ops.insert(0, op)
-
-    graph_cycle = set([])
-    sorted_ops = []
-    permanently_marked_ops = set([])
-    temporarily_marked_ops = set([])
-    unsorted_ops = g.get_operations()
-    for op in unsorted_ops:
-      visit(op, graph_cycle, permanently_marked_ops,
-            temporarily_marked_ops, sorted_ops)
-    if graph_cycle:
-      return (False, graph_cycle)
-    else:
-      assert len(unsorted_ops) == len(sorted_ops)
-      return (True, sorted_ops)
-
-  def __init__(self):
-    """Initializes a TensorTracer.
-
-    Sets the various member fields from the flags (if given) or the defaults.
-    """
-    self._version = 'use-outside-compilation'
-    self._device_type = None
-    self._trace_mode = TensorTracer.get_flag_value(_FLAG_NAME_TRACE_MODE)
-    if not self._trace_mode:
-      self._trace_mode = _TRACE_MODE_NAN_INF
-    TensorTracer.check_trace_mode(self._trace_mode)
-    self._part_tensor_size = _TRACE_MODE_PART_TENSOR_SIZE
-    self._instrument_records = {}
-    interesting_ops = TensorTracer.get_flag_value(_FLAG_NAME_INTERESTING_OPS)
-    self._selected_ops = interesting_ops.split()
-    self._set_trace_file_path()
-    self._set_op_range()
-    self._num_replicas = None
-    self._replica_id = None
-
-  def _add_replica_id_to_graph(self, num_replicas, result_tensor):
-    """Adds nodes for computing the replica ID to the graph."""
-
-    if not num_replicas:
-      self._replica_id = 'unknown'
-      return result_tensor
-
-    self._num_replicas = num_replicas
-
-    with ops.control_dependencies(None):
-      # Uses None as dependency to run outside of TPU graph rewrites.
-      self._replica_id = tpu_ops.tpu_replicated_input(
-          list(range(self._num_replicas)),
-          name='tt_replica_id')
-    use_replica_id = array_ops.identity(self._replica_id).op
-    with ops.control_dependencies([use_replica_id]):
-      # Adds a control dependency from the result_tensor to
-      # the replica_id to ensure that replica_id will be added to the graph.
-      return array_ops.identity(result_tensor)
-
-  def _set_trace_file_path(self):
-    """Sets the path of the output trace file."""
-
-    self._trace_file_path = TensorTracer.get_flag_value(_FLAG_NAME_TRACE_FILE)
-    if not self._trace_file_path:
-      raise ValueError('--%s is not set in the environment variable %s'
-                       %(_FLAG_NAME_TRACE_FILE, _FLAGS_ENV_VAR))
-    elif TensorTracer.use_test_undeclared_outputs_dir():
-      if os.path.isabs(self._trace_file_path):
-        raise ValueError('If use_test_undeclared_outputs_dir is set,'
-                         'trace_file_path cannot be an absolute path (%s)'
-                         %self._trace_file_path)
-      outputs_dir = os.environ.get(_TEST_UNDECLARED_OUTPUTS_DIR_ENV_VAR)
-      self._trace_file_path = os.path.join(outputs_dir,
-                                           self._trace_file_path)
-
-  def _set_op_range(self):
-    """Sets the index range of the Ops that we will consider tracing."""
-
-    op_range = TensorTracer.get_flag_value(_FLAG_NAME_OP_RANGE)
-    if not op_range:
-      self._op_range = (-1, -1)  # this means including all ops.
-      return
-    match = _OP_RANGE_PAT.match(op_range)
-    if not match:
-      self._op_range = (-1, -1)  # this means including all ops.
-      return
-    self._op_range = (int(match.group(1)), int(match.group(2)))
-
-  def _inside_op_range(self, idx):
-    """Return True if the given index is inside the selected range."""
-
-    if idx < self._op_range[0]:
-      return False
-    return self._op_range[1] < 0 or idx <= self._op_range[1]
-
-  def _write_report(self, content):
-    """Writes the given content to the report."""
-
-    logging.info('%s %s'%(_TRACER_LOG_PREFIX, content))
-
-  def _is_selected_op(self, op_name):
-    """Returns True if the Op with op_name is selected to be traced."""
-
-    if not self._selected_ops:
-      return True
-    if op_name in self._selected_ops:
-      return True
-    return False
-
-  def _write_config_section(self):
-    """Writes the config section of the report."""
-
-    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_CONFIG))
-    self._write_report('%s %s\n'%(_FIELD_NAME_VERSION, self._version))
-    self._write_report('%s %s\n'%(_FIELD_NAME_DEVICE, self._device_type))
-    self._write_report('%s %s\n'%(_FIELD_NAME_TRACE_MODE, self._trace_mode))
-    self._write_report('%s %s\n'%(_FIELD_NAME_NUM_REPLICAS, self._num_replicas))
-    self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_CONFIG))
-
-  def _write_reason_section(self):
-    """Writes the reason section of the report."""
-
-    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_REASON))
-    for key in sorted(self._instrument_records):
-      self._write_report('"%s" %s\n'%(key, self._instrument_records[key]))
-    self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_REASON))
-
-  def _write_op_list_section(self, op_list):
-    """Writes the Op-list section of the report."""
-
-    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_OP_LIST))
-    self._write_report('%s %d\n'%(_FIELD_NAME_NUM_OPS, len(op_list)))
-    for i in range(0, len(op_list)):
-      self._write_report('%d "%s" %s\n'%(i, op_list[i].name, op_list[i].type))
-    self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_OP_LIST))
-
-  def _write_graph_section(self, succeed, sorted_or_cycle):
-    """Writes the graph section of the report."""
-
-    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_GRAPH))
-    self._write_report('%s %s\n'%(_FIELD_NAME_TOPOLOGICAL_SORT_SUCCEED,
-                                  succeed))
-    l = list(sorted_or_cycle)
-    for i in range(0, len(l)):
-      self._write_report('%d "%s"\n'%(i, l[i].name))
-    self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_GRAPH))
-
-  def _make_tensor_trace_fun(self, op_name, output_idx):
-    """Makes the tensor tracing function called by outside compilation.
-
-    Args:
-      op_name: the name of the Op that outputs the tensor to be traced.
-      output_idx: which output of the Op it is (0 means the first output).
-
-    Returns:
-      A function to be passed as the first argument to outside compilation.
-
-    Raises:
-      RuntimeError: If the trace mode is invalid.
-    """
-
-    def _print_tensor(op_name, output_idx, num_elements, tensor, output_tensor):
-      """Prints a tensor value to a file.
-
-      Args:
-        op_name: the name of the Op that outputs the tensor to be printed.
-        output_idx: which output of the Op it is (0 means the first output).
-        num_elements: number of elements to print.
-        tensor: the tensor needs to be returned.
-        output_tensor: the tensor needs to be printed.
-
-      Returns:
-        The same tensor passed via the "tensor" argument.
-      """
-      msg = '"%s:%d" '%(op_name, output_idx)
-      output_stream = _OUTPUT_STREAM_ESCAPE + self._trace_file_path
-      print_op = logging_ops.print_v2(msg, array_ops.shape(output_tensor),
-                                      ' @', self._replica_id,
-                                      '\n', output_tensor,
-                                      summarize=num_elements,
-                                      output_stream=output_stream)
-      with ops.control_dependencies([print_op]):
-        return array_ops.identity(tensor).op
-
-    def _detect_nan_inf(tensor):
-      """Trace function for detecting any NaN/Inf in the tensor."""
-
-      if tensor.dtype.is_floating:
-        # Since host can't handle bf16, always convert tensor to f32.
-        tensor = math_ops.cast(tensor, dtypes.float32)
-        output_tensor = math_ops.reduce_any(
-            gen_math_ops.logical_or(gen_math_ops.is_nan(tensor),
-                                    gen_math_ops.is_inf(tensor)))
-      else:
-        output_tensor = constant_op.constant(0)
-      return _print_tensor(op_name, output_idx, 1, tensor, output_tensor)
-
-    def _show_global_step(tensor):
-      """Trace function for printing the global step count."""
-
-      return _print_tensor(op_name, output_idx, 1, tensor, tensor)
-
-    def _show_part_tensor(tensor):
-      """Trace function for printing part of the tensor."""
-
-      return _print_tensor(op_name, output_idx, self._part_tensor_size,
-                           tensor, tensor)
-
-    def _show_full_tensor(tensor):
-      """Trace function for printing the entire tensor."""
-
-      return _print_tensor(op_name, output_idx, -1, tensor, tensor)
-
-    if op_name == _GLOBAL_STEP_OP_NAME:
-      return _show_global_step
-    if self._trace_mode == _TRACE_MODE_NAN_INF:
-      return _detect_nan_inf
-    if self._trace_mode == _TRACE_MODE_PART_TENSOR:
-      return _show_part_tensor
-    if self._trace_mode == _TRACE_MODE_FULL_TENSOR:
-      return _show_full_tensor
-
-    raise RuntimeError('Tensor trace fun for %s is not yet implemented'
-                       %self._trace_mode)
-
-  def trace_tpu(self, graph, result_tensor, num_replicas=None):
-    """Traces the tensors generated by TPU Ops in a TF graph.
-
-    Args:
-      graph: the graph of Ops.
-      result_tensor: a result tensor of evaluating the graph.
-      num_replicas: number of replicas used on the TPU.
-
-    Returns:
-      A tuple (result_tensor_copy, tracing_ops), where:
-        result_tensor_copy: an exact copy of result_tensor
-        tracing_ops: a list of tracing ops. If this list
-                     is non empty, the caller of this function
-                     should pose control dependencies upon these
-                     Ops so that they will be executed when the
-                     graph is evaluated.
-    """
-
-    self._device_type = _DEVICE_TYPE_TPU
-    TensorTracer.check_device_type(self._device_type)
-    result_tensor_copy = self._add_replica_id_to_graph(num_replicas,
-                                                       result_tensor)
-    self._write_config_section()
-    tracing_ops = []
-    operations = graph.get_operations()
-    self._write_op_list_section(operations)
-    # Does the topological sort before adding any nodes to the graph.
-    (succeed, sorted_or_cycle) = TensorTracer.topological_sort(graph)
-    for op_id, op in enumerate(operations):
-      if not self._inside_op_range(op_id):
-        self._instrument_records[op.name] = TensorTracer.reason(
-            op_id, _RECORD_OUTSIDE_OP_RANGE)
-        continue
-      if not TensorTracer.should_trace(self._device_type, op):
-        self._instrument_records[op.name] = TensorTracer.reason(
-            op_id, _RECORD_SHOULD_NOT_TRACE)
-        continue
-      if not self._is_selected_op(op.name):
-        self._instrument_records[op.name] = TensorTracer.reason(
-            op_id, _RECORD_FILTERED_OUT)
-        continue
-      for i in range(len(op.outputs)):
-        out_tensor = op.outputs[i]
-        if not out_tensor.get_shape().is_fully_defined():
-          self._instrument_records[out_tensor.name] = TensorTracer.reason(
-              op_id, _RECORD_DYNAMIC_SHAPE)
-          continue  # cannot trace tensors with dynamic shape.
-        rank = len(out_tensor.shape)
-        if rank < 1:
-          self._instrument_records[out_tensor.name] = TensorTracer.reason(
-              op_id, _RECORD_SCALAR)
-          continue  # cannot trace scalar.
-        self._instrument_records[out_tensor.name] = TensorTracer.reason(
-            op_id, _RECORD_GET_TRACED)
-        consumers = out_tensor.consumers()
-        trace_op = tpu.outside_compilation(
-            self._make_tensor_trace_fun(op.name, i), out_tensor)
-        if consumers:
-          for consumer_op in consumers:
-            # pylint: disable=protected-access
-            consumer_op._add_control_input(trace_op)
-            # pylint: enable=protected-access
-        else:
-          # if there is no consumer, we will add the control dependence later
-          # when we add the control dependency to the output operations.
-          tracing_ops.append(trace_op)
-
-    self._write_reason_section()
-    self._write_graph_section(succeed, sorted_or_cycle)
-
-    return (result_tensor_copy, tracing_ops)
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.tensor_tracer import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/topology.py b/tensorflow/contrib/tpu/python/tpu/topology.py
index 6ae718cc2c9716587849aeee8abcd0a1de82a9ae..5bf805752cf51b0a0f4b7400b18b63aae93cf831 100644
--- a/tensorflow/contrib/tpu/python/tpu/topology.py
+++ b/tensorflow/contrib/tpu/python/tpu/topology.py
@@ -1,220 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ======================================
-"""Defines the `Topology` class, that describes a TPU fabric topology."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
-from six.moves import xrange  # pylint: disable=redefined-builtin
-
-from tensorflow.contrib.tpu.proto import topology_pb2
-
-
-def _tpu_device_name(job, task, device):
-  """Returns the device name for the TPU `device` on `task` of `job`."""
-  if job is None:
-    return "/task:%d/device:TPU:%d" % (task, device)
-  else:
-    return "/job:%s/task:%d/device:TPU:%d" % (job, task, device)
-
-
-def _tpu_host_device_name(job, task):
-  """Returns the device name for the CPU device on `task` of `job`."""
-  if job is None:
-    return "/task:%d/device:CPU:0" % task
-  else:
-    return "/job:%s/task:%d/device:CPU:0" % (job, task)
-
-
-class Topology(object):
-  """Describes a set of TPU devices.
-
-  Represents both the shape of the physical mesh, and the mapping between
-  TensorFlow TPU devices to physical mesh coordinates.
-  """
-
-  def __init__(self, serialized=None, mesh_shape=None, device_coordinates=None):
-    """Builds a Topology object.
-
-    If `serialized` is not `None`, the topology is parsed from `serialized` and
-    the other arguments are ignored. Otherwise, the topology is computed from
-    `mesh_shape` and `device_coordinates`.
-
-    Args:
-      serialized: A serialized `TopologyProto`, or `None`. If not `None`, the
-        serialized proto is parsed to discover the topology.
-      mesh_shape: A sequence of 3 positive integers, or `None`. If not `None`,
-        the shape of the TPU topology, in number of cores. Ignored if
-        `serialized` is not `None`.
-      device_coordinates: A rank 3 numpy array that describes the mapping from
-        TensorFlow TPU devices to TPU fabric coordinates, or `None`. Ignored
-        if `serialized is not `None`.
-
-    Raises:
-      ValueError: If `serialized` does not describe a well-formed topology.
-      ValueError: If `serialized` is `None` and `mesh_shape` is not a sequence
-        of 3 positive integers.
-      ValueError: If `serialized` is `None` and `device_coordinates` is not a
-        rank 3 numpy int32 array that describes a valid coordinate mapping.
-    """
-
-    self._serialized = serialized
-
-    if serialized:
-      self._parse_topology(serialized)
-    else:
-      self._mesh_shape = np.asarray(mesh_shape, dtype=np.int32)
-      self._device_coordinates = np.asarray(device_coordinates, np.int32)
-      if len(self._mesh_shape) != 3 or any(self._mesh_shape < 1):
-        raise ValueError("`mesh_shape` must be a sequence of 3 positive "
-                         "entries; got {}".format(self._mesh_shape))
-
-      if (len(self._device_coordinates.shape) != 3 or
-          self._device_coordinates.shape[2] != len(self._mesh_shape)):
-        raise ValueError("`device_coordinates` must be a rank 3 int32 array "
-                         "with minor dimension equal to the mesh shape rank")
-
-    self._topology_tasks, self._topology_devices = self._invert_topology()
-
-  def _parse_topology(self, serialized):
-    """Parses a serialized `TopologyProto` into `self`."""
-    proto = topology_pb2.TopologyProto()
-    proto.ParseFromString(serialized)
-
-    self._mesh_shape = np.array(proto.mesh_shape, dtype=np.int32)
-    if len(self._mesh_shape) != 3 or any(self._mesh_shape < 1):
-      raise ValueError("`mesh_shape` must be a vector of size 3 with positive "
-                       "entries; got {}".format(self._mesh_shape))
-
-    if proto.num_tasks < 0:
-      raise ValueError("`num_tasks` must be >= 0; got {}".format(
-          proto.num_tasks))
-    if proto.num_tpu_devices_per_task < 0:
-      raise ValueError("`num_tpu_devices_per_task` must be >= 0; got {}".format(
-          proto.num_tpu_devices_per_task))
-
-    expected_coordinates_size = (
-        proto.num_tasks * proto.num_tpu_devices_per_task * len(
-            proto.mesh_shape))
-    if len(proto.device_coordinates) != expected_coordinates_size:
-      raise ValueError("`device_coordinates` must have shape num_tasks ({}) * "
-                       "num_tpu_devices_per_task ({}) * len(mesh_shape) ({}); "
-                       "got shape {}".format(proto.num_tasks,
-                                             proto.num_tpu_devices_per_task,
-                                             proto.mesh_shape,
-                                             len(proto.device_coordinates)))
-
-    coords = np.array(proto.device_coordinates, dtype=np.int32)
-    if any(coords < 0):
-      raise ValueError("`device_coordinates` must be >= 0")
-    coords = coords.reshape((proto.num_tasks, proto.num_tpu_devices_per_task,
-                             len(proto.mesh_shape)))
-    self._device_coordinates = coords
-
-  def _invert_topology(self):
-    """Inverts a [task,device,axis] topology to [x,y,z] -> task/device maps."""
-    tasks = np.full(list(self.mesh_shape), -1, dtype=np.int32)
-    devices = np.full(list(self.mesh_shape), -1, dtype=np.int32)
-    for task in xrange(self.device_coordinates.shape[0]):
-      for device in xrange(self.device_coordinates.shape[1]):
-        x, y, z = self.device_coordinates[task, device, :]
-        tasks[x, y, z] = task
-        devices[x, y, z] = device
-    return tasks, devices
-
-  @property
-  def mesh_shape(self):
-    """A rank 1 int32 array describing the shape of the TPU topology."""
-    return self._mesh_shape
-
-  @property
-  def mesh_rank(self):
-    """Returns the number of dimensions in the mesh."""
-    return len(self._mesh_shape)
-
-  @property
-  def device_coordinates(self):
-    """Describes the mapping from TPU devices to topology coordinates.
-
-    Returns:
-      A rank 3 int32 array with shape `[tasks, devices, axis]`.
-      `tasks` is the number of tasks in the TPU cluster, `devices` is the number
-      of TPU devices per task, and `axis` is the number of axes in the TPU
-      cluster topology. Each entry gives the `axis`-th coordinate in the
-      topology of a task/device pair. TPU topologies are 3-dimensional, with
-      dimensions `(x, y, core number)`.
-    """
-    return self._device_coordinates
-
-  def task_ordinal_at_coordinates(self, device_coordinates):
-    """Returns the TensorFlow task number attached to `device_coordinates`.
-
-    Args:
-      device_coordinates: An integer sequence describing a device's physical
-        coordinates in the TPU fabric.
-
-    Returns:
-      Returns the TensorFlow task number that contains the TPU device with those
-      physical coordinates.
-    """
-    return self._topology_tasks[tuple(device_coordinates)]
-
-  def tpu_device_ordinal_at_coordinates(self, device_coordinates):
-    """Returns the TensorFlow device number at `device_coordinates`.
-
-    Args:
-      device_coordinates: An integer sequence describing a device's physical
-        coordinates in the TPU fabric.
-
-    Returns:
-      Returns the TensorFlow device number within the task corresponding to
-      attached to the device with those physical coordinates.
-    """
-    return self._topology_devices[tuple(device_coordinates)]
-
-  def cpu_device_name_at_coordinates(self, device_coordinates, job=None):
-    """Returns the CPU device attached to a logical core."""
-    return _tpu_host_device_name(
-        job, self._topology_tasks[tuple(device_coordinates)])
-
-  def tpu_device_name_at_coordinates(self, device_coordinates, job=None):
-    """Returns the name of the TPU device assigned to a logical core."""
-    return _tpu_device_name(job,
-                            self._topology_tasks[tuple(device_coordinates)],
-                            self._topology_devices[tuple(device_coordinates)])
-
-  @property
-  def num_tasks(self):
-    """Returns the number of TensorFlow tasks in the TPU slice."""
-    return self._device_coordinates.shape[0]
-
-  @property
-  def num_tpus_per_task(self):
-    """Returns the number of TPU devices per task in the TPU slice."""
-    return self._device_coordinates.shape[1]
-
-  def serialized(self):
-    """Returns the serialized form of the topology."""
-    if self._serialized is None:
-      proto = topology_pb2.TopologyProto()
-      proto.mesh_shape[:] = list(self._mesh_shape)
-      proto.num_tasks = self._device_coordinates.shape[0]
-      proto.num_tpu_devices_per_task = self._device_coordinates.shape[1]
-      proto.device_coordinates.extend(list(self._device_coordinates.flatten()))
-      self._serialized = proto.SerializeToString()
-
-    return self._serialized
+# pylint: disable=wildcard-import,unused-import,redefined-builtin
+from tensorflow.python.tpu.topology import *
+# pylint: enable=wildcard-import,unused-import,redefined-builtin
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py
index def57da20d6018dcf27ccb7a9d04592f38ce2f7c..5364b20f231ac7af8adf943c3d5e21921b7a06a9 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu.py
@@ -1,1189 +1,25 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ======================================
-
-"""Library of TPU helper functions."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from six.moves import xrange  # pylint: disable=redefined-builtin
-
-from tensorflow.contrib.compiler import xla
-from tensorflow.contrib.framework.python.framework import experimental
-from tensorflow.contrib.tpu.python.ops import tpu_ops
-from tensorflow.contrib.tpu.python.tpu import tpu_function
-
-from tensorflow.core.framework import attr_value_pb2
-from tensorflow.python.compat import compat as api_compat
-from tensorflow.python.framework import device as pydev
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.util import compat
-
-
-# Operations that indicate some error in the users graph, e.g. a placeholder
-# that's introduced outside of the infeed.
-_BLACKLISTED_OPS = set([
-    "Placeholder",
-])
-
-# XLA doesn't currently support reading of intermediate tensors, thus some ops
-# are not supported.
-_UNSUPPORTED_OPS = set([
-    "AudioSummary",
-    "AudioSummaryV2",
-    "HistogramSummary",
-    "ImageSummary",
-    "MergeSummary",
-    "Print",
-    "ScalarSummary",
-    "TensorSummary",
-    "TensorSummaryV2",
-    ])
-
-_MAX_WARNING_LINES = 5
-
-_TPU_REPLICATE_ATTR = "_tpu_replicate"
-_TPU_COMPILATION_STATUS_ATTR = "_tpu_compilation_status"
-_OUTSIDE_COMPILATION_ATTR = "_xla_outside_compilation"
-
-
-def _tpu_system_device_name(job):
-  """Returns the device name for the TPU_SYSTEM device of `job`."""
-  if job is None:
-    return "/device:TPU_SYSTEM:0"
-  else:
-    return "/job:%s/device:TPU_SYSTEM:0" % job
-
-
-def initialize_system(embedding_config=None, job=None):
-  """Initializes a distributed TPU system for use with TensorFlow.
-
-  Args:
-    embedding_config: If not None, a `TPUEmbeddingConfiguration` proto
-      describing the desired configuration of the hardware embedding lookup
-      tables. If embedding_config is None, no hardware embeddings can be used.
-    job: The job (the XXX in TensorFlow device specification /job:XXX) that
-      contains the TPU devices that will be initialized. If job=None it is
-      assumed there is only one job in the TensorFlow flock, and an error will
-      be returned if this assumption does not hold.
-  Returns:
-    A serialized `TopologyProto` that describes the TPU system. Note:
-      the topology must be evaluated using `Session.run` before it can be used.
-  """
-  config_string = ("" if embedding_config is None else
-                   embedding_config.SerializeToString())
-  with ops.device(_tpu_system_device_name(job)):
-    return tpu_ops.configure_distributed_tpu(embedding_config=config_string)
-
-
-def shutdown_system(job=None):
-  """Shuts down a running a distributed TPU system."""
-  with ops.device(_tpu_system_device_name(job)):
-    shutdown_distributed_tpu = tpu_ops.shutdown_distributed_tpu()
-  return shutdown_distributed_tpu
-
-
-def core(num):
-  """Returns the device name for a core in a replicated TPU computation.
-
-  Args:
-    num: the virtual core number within each replica to which operators should
-    be assigned.
-  Returns:
-    A device name, suitable for passing to `tf.device()`.
-  """
-  return "device:TPU_REPLICATED_CORE:{}".format(num)
-
-
-class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
-  """A `ControlFlowContext` for nodes inside a TPU computation.
-
-  The primary role of `TPUReplicateContext` is to mark operators inside a
-  tpu.replicate() computation with the attribute "_tpu_replicate=XYZ", where XYZ
-  is a unique name.
-
-  We use a `ControlFlowContext` to perform the annotation since it integrates
-  with Tensorflow constructs like ResourceVariables. For example, if a
-  `ResourceVariable` is constructed inside a tpu.replicate() block, the
-  `ResourceVariable` implementation can use
-  `with ops.control_dependencies(None)` to build the variable's definition
-  outside the replicated computation.
-  """
-
-  def __init__(self, name, num_replicas, pivot):
-    """Builds a new TPUReplicateContext.
-
-    Args:
-      name: a unique name for the context, used to populate the `_tpu_replicate`
-        attribute.
-      num_replicas: an integer that gives the number of replicas for the
-        computation.
-      pivot: a pivot node. Nodes in the TPUReplicateContext that do not have any
-        inputs will have a control dependency on the pivot node. This ensures
-        that nodes are correctly included in any enclosing control flow
-        contexts.
-    """
-    super(TPUReplicateContext, self).__init__()
-    self._num_replicas = num_replicas
-    self._outer_device_function_stack = None
-    self._oc_dev_fn_stack = None
-    self._outside_compilation_cluster = None
-    self._outside_compilation_counter = 0
-    self._in_gradient_colocation = None
-    self._gradient_colocation_stack = []
-    self._host_compute_core = []
-    self._name = name
-    self._name_as_bytes = compat.as_bytes(name)
-    self._unsupported_ops = []
-    self._pivot = pivot
-    self._replicated_vars = {}
-
-  def get_replicated_var_handle(self, name, vars_):
-    """Returns a variable handle for replicated TPU variable 'var'.
-
-    This is a method used by an experimental replicated variable implementation
-    and is not intended as a public API.
-
-    Args:
-      name: The common name of the variable.
-      vars_: The replicated TPU variables.
-
-    Returns:
-      The handle of the TPU replicated input node.
-    """
-    handle = self._replicated_vars.get(name)
-    if handle is not None:
-      return handle
-
-    # Builds a TPUReplicatedInput node for the variable, if one does not already
-    # exist. The TPUReplicatedInput node must belong to the enclosing
-    # control-flow scope of the TPUReplicateContext.
-    # TODO(phawkins): consider changing the contract of the TPU encapsulation
-    # so the TPUReplicatedInput nodes go inside the TPUReplicateContext scope
-    # instead.
-
-    # pylint: disable=protected-access
-    graph = ops.get_default_graph()
-    saved_context = graph._get_control_flow_context()
-    graph._set_control_flow_context(self.outer_context)
-    handle = tpu_ops.tpu_replicated_input(
-        [v.handle for v in vars_], name=name + "/handle")
-    graph._set_control_flow_context(saved_context)
-    # pylint: enable=protected-access
-    self._replicated_vars[name] = handle
-    return handle
-
-  def report_unsupported_operations(self):
-    if self._unsupported_ops:
-      op_str = "\n".join(["  %s (%s)" % (op.type, op.name)
-                          for op in self._unsupported_ops[:_MAX_WARNING_LINES]])
-      logging.warning("%d unsupported operations found: \n%s",
-                      len(self._unsupported_ops), op_str)
-      if len(self._unsupported_ops) > _MAX_WARNING_LINES:
-        logging.warning("... and %d more" %
-                        (len(self._unsupported_ops) - _MAX_WARNING_LINES))
-
-  def EnterGradientColocation(self, op, gradient_uid):
-    if op is not None:
-      self._gradient_colocation_stack.append(op)
-      if not self._outside_compilation_cluster:
-        try:
-          outside_attr = op.get_attr(_OUTSIDE_COMPILATION_ATTR)
-          if self._in_gradient_colocation:
-            raise NotImplementedError(
-                "Cannot nest gradient colocation operations outside compilation"
-            )
-          if gradient_uid == "__unsupported__":
-            raise NotImplementedError(
-                "No gradient_uid calling gradient within outside_compilation")
-          # When we take the gradient of an op X in an outside_compilation
-          # cluster C in a forward computation we would like to put the ops
-          # corresponding to the gradient of X into a new outside_compilation
-          # cluster C'. However, if we take the gradient of X twice, the second
-          # one should get yet another new outside_compilation cluster C''.
-          #
-          # The mechanism we adopt is to use a 'root_cluster' which is the
-          # cluster that X was in before we took gradients, and a 'gradient_uid'
-          # which is different for every invocation of gradients, and put the
-          # gradient of X in cluster 'root_cluster.gradient_uid'.
-          #
-          # When taking a gradient of a gradient, some ops will be colocated
-          # with Op in the forward pass (e.g., cluster root_cluster) and some in
-          # the backward pass (e.g., cluster root_cluster.initial_gradient_uid).
-          # We need all of the grad-of-grad ops to be in the same cluster to
-          # avoid cyclic dependencies between clusters. We adopt a heuristic
-          # that puts any op clustered with root_cluster.<xxx> in
-          # root_cluster.gradient_uid, even if xxx was initial_gradient_uid.
-          self._in_gradient_colocation = op
-          parts = outside_attr.split(".")
-          cluster = parts[0] + "." + gradient_uid
-          self._EnterOutsideCompilationScope(cluster=cluster)
-        except ValueError:
-          # The attr was not present: do nothing.
-          pass
-
-  def ExitGradientColocation(self, op, gradient_uid):
-    if op is not None:
-      if not self._gradient_colocation_stack:
-        raise errors.InternalError(
-            op.node_def, op,
-            "Badly nested gradient colocation: empty stack when popping Op " +
-            op.name)
-      last_op = self._gradient_colocation_stack.pop()
-      if op is last_op:
-        if op is self._in_gradient_colocation:
-          self._in_gradient_colocation = None
-          self._ExitOutsideCompilationScope()
-      else:
-        raise errors.InternalError(
-            op.node_def, op, "Badly nested gradient colocation, expected " +
-            last_op + ", got " + op.name)
-
-  def _EnterOutsideCompilationScope(self, cluster=None):
-
-    class FakeOp(object):
-      """A helper class to determine the current device.
-
-      Supports only the type and device set/get methods needed to run the
-      graph's _apply_device_function method.
-      """
-
-      def __init__(self):
-        self._device = ""
-
-      @property
-      def type(self):
-        return "FakeOp"
-
-      @property
-      def device(self):
-        return self._device
-
-      def _set_device(self, device):
-        if isinstance(device, pydev.DeviceSpec):
-          self._device = device.to_string()
-        else:
-          self._device = device
-
-    if self._outside_compilation_cluster:
-      raise NotImplementedError("Cannot nest outside_compilation clusters")
-    if cluster:
-      self._outside_compilation_cluster = cluster
-    else:
-      self._outside_compilation_cluster = str(self._outside_compilation_counter)
-      self._outside_compilation_counter += 1
-    graph = ops.get_default_graph()
-    fake_op = FakeOp()
-    graph._apply_device_functions(fake_op)  # pylint: disable=protected-access
-    device = pydev.DeviceSpec.from_string(fake_op.device)
-    if (device.device_type == "TPU_REPLICATED_CORE" and
-        device.device_index is not None):
-      self._host_compute_core.append(self._outside_compilation_cluster + ":" +
-                                     str(device.device_index))
-    self._oc_dev_fn_stack = graph._device_function_stack  # pylint: disable=protected-access
-    graph._device_function_stack = self._outer_device_function_stack  # pylint: disable=protected-access
-
-  def _ExitOutsideCompilationScope(self):
-    if not self._outside_compilation_cluster:
-      raise NotImplementedError(
-          "Attempted to exit outside_compilation scope when not in scope")
-    self._outside_compilation_cluster = None
-    graph = ops.get_default_graph()
-    graph._device_function_stack = self._oc_dev_fn_stack  # pylint: disable=protected-access
-
-  def Enter(self):
-    if not self._outer_device_function_stack:
-      # Capture the device function stack at the time of first entry
-      # since that is the stack that will be used outside_compilation.
-      graph = ops.get_default_graph()
-      # pylint: disable=protected-access
-      self._outer_device_function_stack = graph._device_function_stack.copy()
-      # pylint: enable=protected-access
-    super(TPUReplicateContext, self).Enter()
-
-  def HostComputeCore(self):
-    return self._host_compute_core
-
-  def AddOp(self, op):
-    # pylint: disable=protected-access
-    if op.type in _BLACKLISTED_OPS:
-      logging.error("Operation of type %s (%s) is not supported on the TPU. "
-                    "Execution will fail if this op is used in the graph. " %
-                    (op.type, op.name))
-
-    if op.type in _UNSUPPORTED_OPS:
-      self._unsupported_ops.append(op)
-
-    if any(x.dtype._is_ref_dtype for x in op.inputs):
-      raise NotImplementedError(
-          "Non-resource Variables are not supported inside TPU computations "
-          "(operator name: %s)" % op.name)
-    if _TPU_REPLICATE_ATTR in op.node_def.attr:
-      raise ValueError("TPU computations cannot be nested")
-    op._set_attr(_TPU_REPLICATE_ATTR,
-                 attr_value_pb2.AttrValue(s=self._name_as_bytes))
-    if self._outside_compilation_cluster:
-      op._set_attr(
-          _OUTSIDE_COMPILATION_ATTR,
-          attr_value_pb2.AttrValue(
-              s=compat.as_bytes(self._outside_compilation_cluster)))
-    if self._num_replicas > 1 or not self._outside_compilation_cluster:
-      # Prevent feeding or fetching anything that is being compiled,
-      # and any replicated outside_compilation Op.
-      op.graph.prevent_feeding(op)
-      op.graph.prevent_fetching(op)
-
-    # Remove any control edges from outer control flow contexts. These may cause
-    # mismatched frame errors.
-    (internal_control_inputs,
-     external_control_inputs) = self._RemoveExternalControlEdges(op)
-
-    if not op.inputs:
-      # Add a control edge from the control pivot to this op.
-      if not internal_control_inputs:
-        # pylint: disable=protected-access
-        op._add_control_input(self.GetControlPivot())
-        # pylint: enable=protected-access
-    else:
-      for index in xrange(len(op.inputs)):
-        x = op.inputs[index]
-        real_x = self.AddValue(x)
-        if real_x != x:
-          op._update_input(index, real_x)  # pylint: disable=protected-access
-
-    if external_control_inputs:
-      # Use an identity to pull control inputs as data inputs. Note that we
-      # ignore ops which don't have outputs. TODO(phawkins): fix that.
-      external_control_inputs = [
-          array_ops.identity(x.outputs[0]).op
-          for x in external_control_inputs
-          if x.outputs
-      ]
-      # pylint: disable=protected-access
-      op._add_control_inputs(external_control_inputs)
-      # pylint: enable=protected-access
-
-    # Mark op's outputs as seen by this context and any outer contexts.
-    output_names = [x.name for x in op.outputs]
-    context = self
-    while context is not None:
-      # pylint: disable=protected-access
-      context._values.update(output_names)
-      context = context._outer_context
-      # pylint: enable=protected-access
-
-    if self._outer_context:
-      self._outer_context.AddInnerOp(op)
-
-  def AddValue(self, val):
-    """Add `val` to the current context and its outer context recursively."""
-    if val.name in self._values:
-      # Use the real value if it comes from outer context.
-      result = self._external_values.get(val.name)
-      return val if result is None else result
-
-    result = val
-    self._values.add(val.name)
-    if self._outer_context:
-      result = self._outer_context.AddValue(val)
-      self._values.add(result.name)
-
-    self._external_values[val.name] = result
-
-    return result
-
-  def AddInnerOp(self, op):
-    self.AddOp(op)
-    if self._outer_context:
-      self._outer_context.AddInnerOp(op)
-
-  @property
-  def grad_state(self):
-    # Define the gradient loop state associated with the TPUReplicateContext to
-    # be None as the TPUReplicateContext does not get nested nor does the
-    # grad_state outside the TPUReplicateContext affect the graph inside so the
-    # grad_state should be as if this is the top-level gradient state.
-    return None
-
-  @property
-  def back_prop(self):
-    """Forwards to the enclosing while context, if any."""
-    if self.GetWhileContext():
-      return self.GetWhileContext().back_prop
-    return False
-
-  def GetControlPivot(self):
-    return self._pivot
-
-
-def outside_compilation(computation, *args, **kwargs):
-  """Builds part of a computation outside any current TPU replicate scope.
-
-  Args:
-    computation: A Python function that builds the computation to
-      place on the host.
-    *args: the positional arguments for the computation.
-    **kwargs: the keyword arguments for the computation.
-
-  Returns:
-    The Tensors returned by computation.
-  """
-  args = [] if args is None else args
-  graph = ops.get_default_graph()
-
-  # If we are in a TPUReplicateContext, signal that we are now
-  # outside_compilation
-  initial_context = graph._get_control_flow_context()  # pylint: disable=protected-access
-  context = initial_context
-  while context:
-    if isinstance(context, TPUReplicateContext):
-      context._EnterOutsideCompilationScope()  # pylint: disable=protected-access
-    context = context.outer_context
-
-  retval = computation(*args, **kwargs)
-
-  # If we are in a TPUReplicateContext, signal that we are no longer
-  # outside_compilation
-  final_context = graph._get_control_flow_context()  # pylint: disable=protected-access
-  if initial_context is not final_context:
-    raise NotImplementedError(
-        "Control-flow context cannot be different at start and end of an "
-        "outside_compilation scope")
-  context = initial_context
-  while context:
-    if isinstance(context, TPUReplicateContext):
-      context._ExitOutsideCompilationScope()  # pylint: disable=protected-access
-    context = context.outer_context
-
-  return retval
-
-
-def replicate(computation,
-              inputs=None,
-              infeed_queue=None,
-              device_assignment=None,
-              name=None):
-  """Builds a graph operator that runs a replicated TPU computation.
-
-  Args:
-    computation: A Python function that builds the computation to replicate.
-    inputs: A list of lists of input tensors or `None` (equivalent to
-      `[[]]`), indexed by `[replica_num][input_num]`. All replicas must
-      have the same number of inputs.
-    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
-      of arguments as inputs to computation.
-    device_assignment: If not `None`, a `DeviceAssignment` describing the
-      mapping between logical cores in the computation with physical cores in
-      the TPU topology. Uses a default device assignment if `None`. The
-      `DeviceAssignment` may be omitted if each replica of the computation uses
-      only one core, and there is either only one replica, or the number of
-      replicas is equal to the number of cores in the TPU system.
-    name: (Deprecated) Does nothing.
-  Returns:
-    A list of lists of output tensors, indexed by `[replica_num][output_num]`.
-  Raises:
-    ValueError: If all replicas do not have equal numbers of input tensors.
-    ValueError: If the number of inputs per replica does not match
-      the number of formal parameters to `computation`.
-  """
-  return split_compile_and_replicate(computation, inputs, infeed_queue,
-                                     device_assignment, name)[1]
-
-
-def split_compile_and_replicate(computation,
-                                inputs=None,
-                                infeed_queue=None,
-                                device_assignment=None,
-                                name=None,
-                                use_tpu=True):
-  """Builds graph operators that runs compilation and replicated computation.
-
-  This is a lower level interface than replicate that returns a separate compile
-  and execute output tensor. In the generated graph the compile op feeds into
-  the execute op and no additional compilation is incurred when running the
-  compile op before the execute op. The compile op returns additional
-  information about the compilation but does not return the compiled program.
-
-  Args:
-    computation: A Python function that builds the computation to replicate.
-    inputs: A list of lists of input tensors or `None` (equivalent to
-      `[[]]`), indexed by `[replica_num][input_num]`. All replicas must
-      have the same number of inputs.
-    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
-      of arguments as inputs to computation.
-    device_assignment: If not `None`, a `DeviceAssignment` describing the
-      mapping between logical cores in the computation with physical cores in
-      the TPU topology. Uses a default device assignment if `None`. The
-      `DeviceAssignment` may be omitted if each replica of the computation uses
-      only one core, and there is either only one replica, or the number of
-      replicas is equal to the number of cores in the TPU system.
-    name: (Deprecated) Does nothing.
-    use_tpu: When false, the input `computation` is executed on the XLA CPU/GPU
-      backends. Currently, only supports a default placement (computation is
-      placed on GPU if one is available, and on CPU if not).
-  Returns:
-    A list of lists with the first list corresponding to the compile op and the
-    second a list of output tensors, indexed by `[replica_num][output_num]`.
-  Raises:
-    ValueError: If all replicas do not have equal numbers of input tensors.
-    ValueError: If the number of inputs per replica does not match
-      the number of formal parameters to `computation`.
-  """
-  del name
-  inputs = [[]] if inputs is None else inputs
-
-  metadata_kwargs = {}
-  if device_assignment is not None:
-    # Turn the Numpy array into a flattened list so we can pass it as an
-    # operator attribute.
-    metadata_kwargs = {
-        "topology":
-            device_assignment.topology.serialized(),
-        "device_assignment":
-            device_assignment.core_assignment.flatten().tolist()
-    }
-    # TODO(phawkins): remove this case after the forward compatibility window
-    # expires on 2018-10-5.
-    if api_compat.forward_compatible(2018, 10, 5):
-      metadata_kwargs["num_cores_per_replica"] = (
-          device_assignment.num_cores_per_replica)
-    else:
-      metadata_kwargs["computation_shape"] = [
-          device_assignment.num_cores_per_replica
-      ]
-
-  if ((not isinstance(inputs, list)) or
-      any(not isinstance(inp, (list, tuple)) for inp in inputs)):
-    raise TypeError("tpu.replicate() inputs must be a list of lists/tuples")
-
-  num_replicas = len(inputs)
-
-  # No replicas? Nothing to do.
-  if num_replicas == 0:
-    return []
-
-  # Converts inputs to Tensors.
-  inputs = [[ops.convert_to_tensor(x) for x in inp] for inp in inputs]
-
-  # Verifies that all replicas have matching numbers and types of inputs
-  input_types = [x.dtype for x in inputs[0]]
-  input_arity = len(input_types)
-  for i in range(num_replicas):
-    if len(inputs[i]) != input_arity:
-      raise ValueError("Replicas must have the same number of inputs. "
-                       "Replica 0 had {} inputs, replica {} had {} "
-                       "inputs.".format(input_arity, i, len(inputs[i])))
-
-    types = [x.dtype for x in inputs[i]]
-    if types != input_types:
-      raise ValueError(
-          "Replicas must have matching input types. Replica 0 had "
-          "input types {}, replica {} had input types {}".format(
-              input_types, i, types))
-
-  arg_error = xla.check_function_argument_count(
-      computation, input_arity, infeed_queue)
-  if arg_error is not None:
-    if infeed_queue is None:
-      raise TypeError(
-          "Supplied computation cannot be called with the specified inputs. "
-          "You specified %d inputs: %s, but the computation needs %s" % (
-              input_arity, str([i.name for i in inputs[0]]), arg_error))
-    else:
-      raise TypeError(
-          "Supplied computation cannot be called with the specified inputs. "
-          "You specified %d inputs: %s and %d additional inputs from infeed,"
-          " but the computation needs %s" % (input_arity, str(
-              [i.name
-               for i in inputs[0]]), infeed_queue.number_of_tuple_elements,
-                                             arg_error))
-
-  graph = ops.get_default_graph()
-
-  # Fan-in: Builds a TPUReplicatedInput node for each input.
-  computation_inputs = []
-  for i in range(0, input_arity):
-    replicas = [inputs[replica][i] for replica in xrange(num_replicas)]
-    computation_inputs.append(
-        tpu_ops.tpu_replicated_input(replicas, name="input{}".format(i)))
-
-  cluster_name = graph.unique_name("cluster")
-  pivot = control_flow_ops.no_op(name=cluster_name + "/pivot")
-  context = TPUReplicateContext(
-      name=cluster_name, num_replicas=num_replicas, pivot=pivot)
-  try:
-    context.Enter()
-
-    metadata = tpu_ops.tpu_replicate_metadata(
-        num_replicas=num_replicas, use_tpu=use_tpu, **metadata_kwargs)
-
-    with tpu_function.tpu_shard_context(
-        num_replicas), ops.control_dependencies([metadata]):
-
-      # Add identity ops so even unused inputs are "consumed" by the
-      # computation. This is to avoid orphaned TPUReplicatedInput nodes.
-      # TODO(phawkins): consider instead pruning unused TPUReplicatedInput
-      # and eliding trivial TPUReplicatedInput/TPUReplicatedOutput pairs.
-      computation_inputs = [
-          array_ops.identity(x, name="replicated_input_{}".format(i))
-          for i, x in enumerate(computation_inputs)
-      ]
-
-      # If there is an infeed queue, adds the dequeued values to the
-      # computation's inputs.
-      if infeed_queue is not None:
-        infeed_queue.set_number_of_shards(num_replicas)
-        for t in infeed_queue.generate_dequeue_op():
-          computation_inputs.append(t)
-
-      # Only resource variables work inside a TPU computation, so turn on
-      # resource variables for the computation.
-      # TODO(phawkins): consider removing this code. It will
-      # be less confusing to clients if they knowingly choose to use resource
-      # variables.
-      # Partitioned variables is not supported (b/112311320).
-      vscope = variable_scope.get_variable_scope()
-      saved_use_resource = vscope.use_resource
-      saved_custom_getter = vscope.custom_getter
-
-      def custom_getter(getter, name, *args, **kwargs):
-        """Variables on TPU have a few restrictions."""
-        partitioner = kwargs["partitioner"]
-        if partitioner is not None:
-          kwargs["partitioner"] = None
-          logging.warning(
-              "Partitioned variables are not supported on TPU. Got "
-              "`partitioner` that is {} for variable {}. "
-              "Setting `partitioner` to `None`."
-              .format(partitioner, name))
-        if saved_custom_getter is None:
-          return getter(name, *args, **kwargs)
-        else:
-          return saved_custom_getter(getter, name, *args, **kwargs)
-
-      vscope.set_use_resource(True)
-      vscope.set_custom_getter(custom_getter)
-
-      outputs = computation(*computation_inputs)
-
-      vscope.set_use_resource(saved_use_resource)
-      vscope.set_custom_getter(saved_custom_getter)
-
-    # If the computation returns `None`, make it an empty tuple.
-    if outputs is None:
-      outputs = tuple()
-    # If the computation only returned one value, makes it a tuple.
-    if not isinstance(outputs, (list, tuple)):
-      outputs = (outputs,)
-
-    # Append `no_op` here so that fetching any return value of this function
-    # will trigger TPUExecute node.
-    outputs += (control_flow_ops.no_op(),)
-    try:
-      with ops.device(core(0)):
-        outputs = [
-            o if isinstance(o, ops.Operation) else ops.convert_to_tensor(o)
-            for o in outputs
-        ]
-    except Exception as e:
-      raise ValueError(
-          "TPU function return values must all either be Operations or "
-          "convertible to Tensors. Got '%s'" % str(e))
-
-    # Separates the returned Operations and Tensors.
-    output_operations = [o for o in outputs if isinstance(o, ops.Operation)]
-    output_tensors = [o for o in outputs if not isinstance(o, ops.Operation)]
-
-    if outputs != output_tensors + output_operations:
-      raise ValueError(
-          "TPU functions must return zero-or more Tensor values followed by "
-          "zero or more Operations.")
-    output_arity = len(output_tensors)
-
-    # Wraps outputs in Identity ops. Otherwise a replicated input copied
-    # straight to an output would bypass the replicate(). This would be bad
-    # because the TPUReplicatedInput/TPUReplicatedOutput operator would not
-    # be rewritten away, leading to a runtime error.
-    # TODO(phawkins): extend the rewrite to elide these nodes instead.
-    new_output_tensors = []
-    for t in output_tensors:
-      with ops.device(t.device if t.device else core(0)):
-        new_output_tensors.append(array_ops.identity(t))
-    output_tensors = new_output_tensors
-    context.ExitResult(output_tensors)
-  finally:
-    context.report_unsupported_operations()
-    context.Exit()
-    host_compute_core = context.HostComputeCore()
-
-  if host_compute_core:
-    attr_value = attr_value_pb2.AttrValue()
-    attr_value.list.s.extend([compat.as_bytes(x) for x in host_compute_core])
-    metadata._set_attr("host_compute_core", attr_value)  # pylint: disable=protected-access
-
-  # Fan-out: Builds a TPUReplicatedOutput node for each output.
-  outputs = [tpu_ops.tpu_replicated_output(output_tensors[i], num_replicas,
-                                           name="output{}".format(i))
-             for i in xrange(output_arity)]
-
-  with ops.control_dependencies([metadata]):
-    if use_tpu:
-      compile_status = tpu_ops.tpu_compilation_result()
-      op = compile_status.op
-      attr_value = attr_value_pb2.AttrValue(s=compat.as_bytes(cluster_name))
-      op._set_attr(_TPU_COMPILATION_STATUS_ATTR, attr_value)  # pylint: disable=protected-access
-    else:
-      compile_status = control_flow_ops.no_op(name="compilation_status")
-
-  with ops.control_dependencies(output_operations):
-    if output_arity == 0:
-      # Returns a list of NoOps dependent on the replication Op, indexed by
-      # [replica_num].
-      return [
-          compile_status, [
-              control_flow_ops.no_op(name="shard_%d" % i)
-              for i in range(num_replicas)
-          ]
-      ]
-    else:
-      # Wraps the outputs in identity operators so the names of any possible
-      # `fetch` nodes are preserved by the replication rewrite.
-      return [
-          compile_status, [[
-              array_ops.identity(
-                  outputs[out][replica],
-                  name="output_%d_shard_%d" % (out, replica))
-              for out in xrange(output_arity)
-          ]
-                           for replica in xrange(num_replicas)]
-      ]
-
-
-def shard(computation,
-          inputs=None,
-          num_shards=1,
-          input_shard_axes=None,
-          outputs_from_all_shards=True,
-          output_shard_axes=None,
-          infeed_queue=None,
-          device_assignment=None,
-          name=None):
-  """Shards `computation` for parallel execution.
-
-  `inputs` must be a list of Tensors or None (equivalent to an empty list), each
-  of which has a corresponding split axis (from `input_shard_axes`). Each input
-  is split into `num_shards` pieces along the corresponding axis, and
-  computation is applied to each shard in parallel.
-
-  Tensors are broadcast to all shards if they are lexically captured by
-  `computation`. e.g.,
-
-  x = tf.constant(7)
-  def computation():
-    return x + 3
-  ... = shard(computation, ...)
-
-  TODO(phawkins): consider adding support for broadcasting Tensors passed
-  as inputs.
-
-  If `outputs_from_all_shards` is true, the outputs from all shards of
-  `computation` are concatenated back together along their `output_shards_axes`.
-  Otherwise, each output is taken from an arbitrary shard.
-
-  Inputs and outputs of the computation must be at least rank-1 Tensors.
-
-  Args:
-    computation: A Python function that builds a computation to apply to each
-      shard of the input.
-    inputs: A list of input tensors or None (equivalent to an empty list). Each
-      input tensor has a corresponding shard axes, given by `input_shard_axes`,
-      which must have size divisible by `num_shards`.
-    num_shards: The number of shards.
-    input_shard_axes: A list of dimensions along which to shard `inputs`, or
-      `None`. `None` means "shard all inputs along dimension 0". If not `None`,
-      there must be one dimension per input.
-    outputs_from_all_shards: Boolean or list of boolean. For each output, if
-      `True`, outputs from all shards are concatenated along the corresponding
-      `output_shard_axes` entry. Otherwise, each output is taken
-      from an arbitrary shard. If the argument is a boolean, the argument's
-      value is used for each output.
-    output_shard_axes: A list of dimensions along which to concatenate the
-      outputs of `computation`, or `None`. `None` means "concatenate all outputs
-      along dimension 0". If not `None`, there must be one dimension per output.
-      Ignored if `outputs_from_all_shards` is False.
-    infeed_queue: If not `None`, the `InfeedQueue` to use to augment the inputs
-      of `computation`.
-    device_assignment: If not `None`, a `DeviceAssignment` describing the
-      mapping between logical cores in the computation with physical cores in
-      the TPU topology. Uses a default device assignment if `None`. The
-      `DeviceAssignment` may be omitted if each shard of the computation uses
-      only one core, and there is either only one shard, or the number of shards
-      is equal to the number of cores in the TPU system.
-    name: (Deprecated) Does nothing.
-  Returns:
-    A list of output tensors.
-  Raises:
-    ValueError: If num_shards <= 0
-    ValueError: If len(input_shard_axes) != len(inputs)
-    ValueError: If len(output_shard_axes) != len(outputs from `computation`)
-  """
-
-  if num_shards <= 0:
-    raise ValueError("num_shards must be a positive integer.")
-
-  inputs = [] if inputs is None else inputs
-  if not isinstance(inputs, list):
-    raise TypeError("tpu.shard()'s inputs must be a list of Tensors or None.")
-
-  # Converts inputs to Tensors.
-  inputs = [ops.convert_to_tensor(x) for x in inputs]
-
-  if input_shard_axes is None:
-    input_shard_axes = [0] * len(inputs)
-  if len(inputs) != len(input_shard_axes):
-    raise ValueError("Length of input_shard_axes must be equal to the number "
-                     "of inputs.")
-
-  if inputs:
-    # Splits the `inputs` along the corresponding `input_shard_axes`, giving
-    # lists with layout [input][shard]
-    split_inputs = [
-        array_ops.split(x, num_shards, axis=axis)
-        for (axis, x) in zip(input_shard_axes, inputs)]
-
-    # Transposes the input lists to have layout [shard][input]
-    transposed_inputs = [list(i) for i in zip(*split_inputs)]
-  else:
-    transposed_inputs = [[]] * num_shards
-
-  outputs = replicate(
-      computation,
-      transposed_inputs,
-      infeed_queue=infeed_queue,
-      device_assignment=device_assignment,
-      name=name)
-
-  # There must be at least one shard since num_shards > 0.
-  # TODO(b/36647078) remove disable when pylint bug is fixed.
-  # pylint: disable=indexing-exception
-  if isinstance(outputs[0], ops.Operation):
-    # pylint: enable=indexing-exception
-    # There were no outputs from the computation and replicate returned a list
-    # of NoOps with control dependencies on the computation. Return the first
-    # one so it can be used as a control dependency or fetch node.
-    # TODO(b/36647078) remove disable when pylint bug is fixed.
-    # pylint: disable=indexing-exception
-    return [outputs[0]]
-    # pylint: enable=indexing-exception
-
-  # TODO(b/36647078) remove disable when pylint bug is fixed.
-  # pylint: disable=indexing-exception
-  num_outputs = len(outputs[0])
-  # pylint: enable=indexing-exception
-
-  if output_shard_axes is None:
-    output_shard_axes = [0] * num_outputs
-  if num_outputs != len(output_shard_axes):
-    raise ValueError("Length of output_shard_axes must be equal to the number "
-                     "of outputs.")
-
-  if isinstance(outputs_from_all_shards, bool):
-    outputs_from_all_shards = [outputs_from_all_shards] * num_outputs
-
-  if num_outputs != len(outputs_from_all_shards):
-    raise ValueError("Length of outputs_from_all_shards must be equal to the "
-                     "number of outputs.")
-
-  results = []
-  for (axis, all_shards, x) in zip(output_shard_axes, outputs_from_all_shards,
-                                   zip(*outputs)):
-    if all_shards:
-      # Concatenate all of the outputs together (use stack for scalars).
-      shape = x[0].shape
-      is_scalar = shape is not None and (shape.ndims == 0)
-      results.append((array_ops.stack(list(x)) if is_scalar
-                      else array_ops.concat(list(x), axis=axis)))
-    else:
-      # TODO(phawkins): use a smarter policy, e.g., round-robin across shards.
-      results.append(x[0])
-
-  return results
-
-
-def batch_parallel(computation,
-                   inputs=None,
-                   num_shards=1,
-                   infeed_queue=None,
-                   device_assignment=None,
-                   name=None):
-  """Shards `computation` along the batch dimension for parallel execution.
-
-  Convenience wrapper around shard().
-
-  `inputs` must be a list of Tensors or None (equivalent to an empty list).
-  Each input is split into `num_shards` pieces along the 0-th dimension, and
-  computation is applied to each shard in parallel.
-
-  Tensors are broadcast to all shards if they are lexically captured by
-  `computation`. e.g.,
-
-  x = tf.constant(7)
-  def computation():
-    return x + 3
-  ... = shard(computation, ...)
-
-  The outputs from all shards are concatenated back together along their 0-th
-  dimension.
-
-  Inputs and outputs of the computation must be at least rank-1 Tensors.
-
-  Args:
-    computation: A Python function that builds a computation to apply to each
-      shard of the input.
-    inputs: A list of input tensors or None (equivalent to an empty list). The
-      0-th dimension of each Tensor must have size divisible by `num_shards`.
-    num_shards: The number of shards.
-    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
-      of arguments as inputs to `computation`.
-    device_assignment: If not `None`, a `DeviceAssignment` describing the
-      mapping between logical cores in the computation with physical cores in
-      the TPU topology. Uses a default device assignment if `None`. The
-      `DeviceAssignment` may be omitted if each shard of the computation uses
-      only one core, and there is either only one shard, or the number of shards
-      is equal to the number of cores in the TPU system.
-    name: (Deprecated) Does nothing.
-  Returns:
-    A list of output tensors.
-  Raises:
-    ValueError: If `num_shards <= 0`
-  """
-  return shard(
-      computation,
-      inputs,
-      num_shards=num_shards,
-      infeed_queue=infeed_queue,
-      device_assignment=device_assignment,
-      name=name)
-
-
-def rewrite(computation,
-            inputs=None,
-            infeed_queue=None,
-            device_assignment=None,
-            name=None):
-  """Rewrites `computation` for execution on a TPU system.
-
-  Args:
-    computation: A Python function that builds a computation to apply to the
-      input. If the function takes n inputs, 'inputs' should be a list of n
-      tensors.
-
-      `computation` may return a list of operations and tensors. Tensors must
-      come before operations in the returned list.  The return value of
-      `rewrite` is a list of tensors corresponding to the tensors from the
-      output of `computation`.
-
-      All `Operation`s constructed during `computation` will be executed when
-      evaluating any of the returned output tensors, not just the ones returned.
-    inputs: A list of input tensors or `None` (equivalent to an empty list).
-    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
-      of arguments as inputs to `computation`.
-    device_assignment: if not `None`, a `DeviceAssignment` describing the
-      mapping between logical cores in the computation with physical cores in
-      the TPU topology. May be omitted for a single-core computation, in which
-      case the core attached to task 0, TPU device 0 is used.
-    name: (Deprecated) Does nothing.
-  Returns:
-    A list of output tensors.
-  """
-  if inputs is not None and not isinstance(inputs, (list, tuple)):
-    raise TypeError("tpu.rewrite() inputs must be a list or tuple")
-
-  # TODO(b/36647078) remove disable when pylint bug is fixed.
-  # pylint: disable=indexing-exception
-  return replicate(
-      computation,
-      None if inputs is None else [inputs],
-      infeed_queue=infeed_queue,
-      device_assignment=device_assignment,
-      name=name)[0]
-  # pylint: enable=indexing-exception
-
-  # Operations that indicate some error in the user's inference graph.
-_BLACKLISTED_INFERENCE_OPS = set([
-    "ReadVariableOp",
-    "AssignVariableOp",
-    "AssignAddVariableOp",
-    "AssignSubVariableOp",
-    "VarHandleOp",
-    "Variable",
-    "VariableV2",
-])
-
-
-def under_tpu_inference_context():
-  """Check if it is currently under `tpu.rewrite_for_inference()`."""
-  graph = ops.get_default_graph()
-
-  context = graph._get_control_flow_context()  # pylint: disable=protected-access
-  while context:
-    if isinstance(context, _TPUInferenceContext):
-      return True
-    context = context.outer_context
-
-  return False
-
-
-class _TPUInferenceContext(control_flow_ops.XLAControlFlowContext):
-  """A `ControlFlowContext` for nodes inside a TPU inference computation.
-
-  The primary role of `TPUReplicateContext` is to sanity check operators inside
-  a tpu.rewrite_for_inference() computation.
-  """
-
-  def __init__(self, name):
-    super(_TPUInferenceContext, self).__init__()
-    self._name = name
-
-  def AddOp(self, op):
-    self._AddOpInternal(op)
-
-  def _AddOpInternal(self, op):
-    # pylint: disable=protected-access
-    if op.type in _BLACKLISTED_INFERENCE_OPS:
-      raise NotImplementedError(
-          "Operation of type %s (%s) is not supported on the TPU for inference."
-          " Execution will fail if this op is used in the graph. Make sure your"
-          " variables are using variable_scope." % (op.type, op.name))
-    if self._outer_context:
-      self._outer_context.AddInnerOp(op)
-
-  def AddValue(self, val):
-    result = val
-    if self._outer_context:
-      result = self._outer_context.AddValue(val)
-    return result
-
-  def AddInnerOp(self, op):
-    self._AddOpInternal(op)
-
-  @property
-  def grad_state(self):
-    return None
-
-
-@experimental
-def validate_inference_rewrite_for_variables(graph):
-  """Validates whether rewrite_for_inference() 'worked' for variables.
-
-     The rewrite_for_inference() method is supposed to append GuaranteeConstOps
-     after ReadVariableOps, but this mechanism works only if you are using
-     tf.get_variable() to create and access variables in your tpu computation.
-     This validation method can be called immediately after calling
-     tpu.rewrite_for_inference() to check whether GuaranteeConstOps where added
-     to the graph.
-
-     Typical usages:
-       tpu.validate_inference_rewrite_for_variables(tf.get_default_graph())
-
-       tpu.validate_inference_rewrite_for_variables(sess.graph)
-
-  Args:
-    graph: The graph which needs to be validated.
-  Raises:
-    RuntimeError: if validation failed.
-  """
-  if not any(x.type == "GuaranteeConst" for x in graph.get_operations()):
-    raise RuntimeError(
-        "No GuaranteeConst ops found in the graph after running "
-        "tpu.rewrite_for_inference(...). Please check that you are using "
-        "tf.get_variable() to create and access variables in your tpu "
-        "computation.")
-
-
-@experimental
-def rewrite_for_inference(computation,
-                          inputs=None,
-                          infeed_queue=None,
-                          device_assignment=None,
-                          name=None):
-  """Rewrites `computation` for inference on a TPU system.
-
-     Other than 'rewriting' the computation to run on a TPU, if using variables
-     in your computation, it moves the ReadVariableOps outside the TPU
-     computation, and adds GuaranteeConst ops just after the ReadVariableOps.
-     This mechanism works only if you are using tf.get_variable() to create and
-     access variables in your tpu computation. You can validate whether this
-     worked, by calling validate_inference_rewrite_for_variables() method
-     immediately after this method to check whether GuaranteeConstOps where
-     added to the graph.
-
-  Args:
-    computation: A Python function that builds a computation to apply to the
-      input. If the function takes n inputs, 'inputs' should be a list of n
-      tensors. If the function returns m outputs, rewrite will return a list of
-      m tensors.
-    inputs: A list of input tensors or `None` (equivalent to an empty list).
-    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
-      of arguments as inputs to `computation`.
-    device_assignment: if not `None`, a `DeviceAssignment` describing the
-      mapping between logical cores in the computation with physical cores in
-      the TPU topology. May be omitted for a single-core computation, in which
-      case the core attached to task 0, TPU device 0 is used.
-    name: The name of the operator.
-  Returns:
-    A list of output tensors.
-  """
-
-  def guarantee_const_getter(getter, name, *args, **kwargs):
-    with ops.control_dependencies(None):
-      return array_ops.guarantee_const(
-          getter(name, *args, **kwargs), name=name + "/GuaranteeConst")
-
-  def wrapped_computation(*args, **kwargs):
-    """Execute computation under `_TPUInferenceContext`."""
-    context = _TPUInferenceContext(
-        name=ops.get_default_graph().unique_name("rewrite_for_inference"))
-    try:
-      context.Enter()
-
-      vscope = variable_scope.get_variable_scope()
-      prev_custom_getter = vscope.custom_getter
-      prev_caching_device = vscope.caching_device
-      vscope.set_custom_getter(guarantee_const_getter)
-      vscope.set_caching_device(lambda op: op.device)
-
-      result = computation(*args, **kwargs)
-
-      vscope.set_custom_getter(prev_custom_getter)
-      vscope.set_caching_device(prev_caching_device)
-    finally:
-      context.Exit()
-    return result
-
-  # pylint: disable=undefined-variable
-  return rewrite(
-      wrapped_computation,
-      inputs=inputs,
-      infeed_queue=infeed_queue,
-      device_assignment=device_assignment,
-      name=name)
-  # pylint: enable=undefined-variable
+# pylint: disable=wildcard-import,unused-import,redefined-builtin
+from tensorflow.python.tpu.tpu import *
+# used by tests
+from tensorflow.python.tpu.tpu import _TPU_REPLICATE_ATTR
+# pylint: enable=wildcard-import,unused-import,redefined-builtin
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_config.py b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
index 9f8d14706845baa1ed45c84b2c15d372915a0eb4..c36aaa38c0e4823bfc438773e4aa5b5109794da4 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_config.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
@@ -1,275 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ===================================================================
-
-"""A RunConfig subclass with TPU support."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-import json
-import os
-
-from tensorflow.contrib.tpu.python.tpu import util as util_lib
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.estimator import run_config as run_config_lib
-from tensorflow.python.platform import tf_logging as logging
-
-# pylint: disable=protected-access
-_TF_CONFIG_ENV = run_config_lib._TF_CONFIG_ENV
-_SERVICE_KEY = run_config_lib._SERVICE_KEY
-_TPU_WORKER_JOB_NAME = 'tpu_worker_job_name'
-# pylint: enable=protected-access
-
-
-class InputPipelineConfig(object):
-  r"""Please see the definition of these values in TPUConfig."""
-  PER_SHARD_V1 = 1
-  PER_HOST_V1 = 2
-  PER_HOST_V2 = 3
-  BROADCAST = 4
-
-
-class TPUConfig(
-    collections.namedtuple('TPUConfig', [
-        'iterations_per_loop',
-        'num_shards',
-        'num_cores_per_replica',
-        'per_host_input_for_training',
-        'tpu_job_name',
-        'initial_infeed_sleep_secs',
-        'input_partition_dims',
-    ])):
-  r"""TPU related configuration required by `TPUEstimator`.
-
-  Args:
-    iterations_per_loop: This is the number of train steps running in TPU
-      system before returning to CPU host for each `Session.run`. This means
-      global step is increased `iterations_per_loop` times in one `Session.run`.
-      It is recommended to be set as number of global steps for next checkpoint.
-    num_shards: (Deprecated, ignored by TPUEstimator).
-      The number of model replicas in the system. For non-model-parallelism
-      case, this number equals the total number of TPU cores. For
-      model-parallelism, the total number of TPU cores equals
-      num_cores_per_replica * num_shards.
-    num_cores_per_replica: Defaults to `None`, which disables model parallelism.
-      An integer which describes the number of TPU cores per model replica. This
-      is required by model-parallelism which enables partitioning
-      the model to multiple cores. Currently num_cores_per_replica must be
-      1, 2, 4, or 8.
-    per_host_input_for_training: If `True`, `PER_HOST_V1`, or `PER_HOST_V2`,
-      `input_fn` is invoked once on each host. With the per-core input pipeline
-      configuration, it is invoked once for each core.
-      With a global batch size `train_batch_size` in `TPUEstimator` constructor,
-      the batch size for each shard is `train_batch_size` // #hosts in the
-      `True` or `PER_HOST_V1` mode. In `PER_HOST_V2` mode, it is
-      `train_batch_size` // #cores. In `BROADCAST` mode, `input_fn` is only
-      invoked once on host 0 and the tensors are broadcasted to all other
-      replicas. The batch size equals to train_batch_size`. With the per-core
-      input pipeline configuration, the shard batch size is also
-      `train_batch_size` // #cores.
-      Note: per_host_input_for_training==PER_SHARD_V1 only supports mode.TRAIN.
-    tpu_job_name: The name of the TPU job. Typically, this name is auto-inferred
-      within TPUEstimator, however when using ClusterSpec propagation in more
-      esoteric cluster configurations, you may need to specify the job name as a
-      string.
-    initial_infeed_sleep_secs: The number of seconds the infeed thread should
-      wait before enqueueing the first batch. This helps avoid timeouts for
-      models that require a long compilation time.
-    input_partition_dims: A nested list to describe the partition dims
-      for all the tensors from input_fn(). The structure of
-      input_partition_dims must match the structure of `features` and
-      `labels` from input_fn(). The total number of partitions must match
-      `num_cores_per_replica`. For example, if input_fn() returns two tensors:
-      images with shape [N, H, W, C] and labels [N].
-      input_partition_dims = [[1, 2, 2, 1], None] will split the images to 4
-      pieces and feed into 4 TPU cores. labels tensor are directly broadcasted
-      to all the TPU cores since the partition dims is `None`.
-      Current limitations: This feature is only supported with the PER_HOST_V2
-      input mode.
-
-    Raises:
-      ValueError: If `num_cores_per_replica` is not 1, 2, 4, 8 or 16.
-  """
-
-  def __new__(cls,
-              iterations_per_loop=2,
-              num_shards=None,
-              num_cores_per_replica=None,
-              per_host_input_for_training=True,
-              tpu_job_name=None,
-              initial_infeed_sleep_secs=None,
-              input_partition_dims=None):
-
-    # Check iterations_per_loop.
-    util_lib.check_positive_integer(iterations_per_loop,
-                                    'TPUConfig iterations_per_loop')
-
-    # Check num_shards.
-    if num_shards is not None:
-      util_lib.check_positive_integer(num_shards, 'TPUConfig num_shards')
-
-    if input_partition_dims is not None:
-      if len(input_partition_dims) != 1 and len(input_partition_dims) != 2:
-        raise ValueError(
-            'input_partition_dims must be a list/tuple with one or two'
-            ' elements.')
-
-      if per_host_input_for_training is not InputPipelineConfig.PER_HOST_V2:
-        raise ValueError(
-            'input_partition_dims is only supported in PER_HOST_V2 mode.')
-
-      if num_cores_per_replica is None:
-        raise ValueError(
-            'input_partition_dims requires setting num_cores_per_replica.')
-
-    # Check num_cores_per_replica
-    if num_cores_per_replica is not None:
-      if num_cores_per_replica not in [1, 2, 4, 8, 16]:
-        raise ValueError(
-            'num_cores_per_replica must be 1, 2, 4, 8, or 16; got {}'.format(
-                str(num_cores_per_replica)))
-
-    # per_host_input_for_training may be True, False, or integer in [1..3].
-    # Map legacy values (True, False) to numeric values.
-    if per_host_input_for_training is False:
-      per_host_input_for_training = InputPipelineConfig.PER_SHARD_V1
-    elif per_host_input_for_training is True:
-      per_host_input_for_training = InputPipelineConfig.PER_HOST_V1
-
-    # Check initial_infeed_sleep_secs.
-    if initial_infeed_sleep_secs:
-      util_lib.check_positive_integer(initial_infeed_sleep_secs,
-                                      'TPUConfig initial_infeed_sleep_secs')
-
-    tpu_job_name = tpu_job_name or _get_tpu_job_name_from_tf_config()
-
-    return super(TPUConfig, cls).__new__(
-        cls,
-        iterations_per_loop=iterations_per_loop,
-        num_shards=num_shards,
-        num_cores_per_replica=num_cores_per_replica,
-        per_host_input_for_training=per_host_input_for_training,
-        tpu_job_name=tpu_job_name,
-        initial_infeed_sleep_secs=initial_infeed_sleep_secs,
-        input_partition_dims=input_partition_dims)
-
-
-class RunConfig(run_config_lib.RunConfig):
-  """RunConfig with TPU support."""
-
-  def __init__(self,
-               tpu_config=None,
-               evaluation_master=None,
-               master=None,
-               cluster=None,
-               **kwargs):
-    """Constructs a RunConfig.
-
-    Args:
-      tpu_config: the TPUConfig that specifies TPU-specific configuration.
-      evaluation_master: a string. The address of the master to use for eval.
-        Defaults to master if not set.
-      master: a string. The address of the master to use for training.
-      cluster: a ClusterResolver
-      **kwargs: keyword config parameters.
-
-    Raises:
-      ValueError: if cluster is not None and the provided session_config has a
-        cluster_def already.
-    """
-    super(RunConfig, self).__init__(**kwargs)
-    self._tpu_config = tpu_config or TPUConfig()
-    self._cluster = cluster
-
-    # If user sets master and/or evaluation_master explicitly, including empty
-    # string '', take it. Otherwise, take the values set by parent class.
-    if master is not None:
-      if cluster is not None:
-        raise ValueError('Both master and cluster are set.')
-      self._master = master
-    else:
-      if cluster:
-        self._master = cluster.master()
-
-    if evaluation_master is not None:
-      self._evaluation_master = evaluation_master
-    elif (not self._evaluation_master and
-          self.task_type != run_config_lib.TaskType.EVALUATOR):
-      # If the task type is EVALUATOR, it means some cluster manager sets the
-      # TF_CONFIG. In that case, we respect the configuration in TF_CONFIG.
-      #
-      # Otherwise, it means user executes the code without external cluster
-      # manager. For that, we optimize the user experience by setting
-      # evaluation_master to master, unless user overwrites it.
-      self._evaluation_master = self._master
-
-    # Set the ClusterSpec to use
-    if cluster:
-      self._cluster_spec = cluster.cluster_spec()
-
-      # Merge the cluster_def into the ConfigProto.
-      if self._session_config is None:  # pylint: disable=access-member-before-definition
-        self._session_config = config_pb2.ConfigProto(allow_soft_placement=True)
-      if self._session_config.HasField('cluster_def'):
-        raise ValueError(
-            'You cannot provide a ClusterResolver and '
-            'session_config.cluster_def.')
-      if self._cluster_spec:
-        self._session_config.cluster_def.CopyFrom(
-            self._cluster_spec.as_cluster_def())
-
-  def _maybe_overwrite_session_config_for_distributed_training(self):
-    # Overrides the parent class session_config overwrite for between-graph. TPU
-    # runs with in-graph, which should not have device filter. Doing nothing
-    # ("pass") basically disables it.
-    pass
-
-  @property
-  def evaluation_master(self):
-    return self._evaluation_master
-
-  @property
-  def master(self):
-    return self._master
-
-  @property
-  def tpu_config(self):
-    return self._tpu_config
-
-  @property
-  def cluster(self):
-    return self._cluster
-
-  def replace(self, **kwargs):
-    if 'tpu_config' not in kwargs:
-      return super(RunConfig, self).replace(**kwargs)
-
-    tpu_config = kwargs.pop('tpu_config')
-    new_instance = super(RunConfig, self).replace(**kwargs)
-    new_instance._tpu_config = tpu_config  # pylint: disable=protected-access
-    return new_instance
-
-
-def _get_tpu_job_name_from_tf_config():
-  """Extracts the TPU job name from TF_CONFIG env variable."""
-  # TODO(xiejw): Extends this to support both TF_CONFIG env variable and cluster
-  # spec propagation.
-  tf_config = json.loads(os.environ.get(_TF_CONFIG_ENV, '{}'))
-  tpu_job_name = tf_config.get(_SERVICE_KEY, {}).get(_TPU_WORKER_JOB_NAME)
-  if tpu_job_name:
-    logging.info('Load TPU job name from TF_CONFIG: %s', tpu_job_name)
-  return tpu_job_name
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.tpu_config import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_context.py b/tensorflow/contrib/tpu/python/tpu/tpu_context.py
index 672462447944b777375331d49727c4d5366cf295..b77b010cba6bf32c3b6d170bc522eebfb6a04f77 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_context.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_context.py
@@ -1,725 +1,23 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ===================================================================
-"""TPU system metadata and associated tooling."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from contextlib import contextmanager
-import copy
-
-from tensorflow.contrib.tpu.python.tpu import device_assignment  as tpu_device_assignment
-from tensorflow.contrib.tpu.python.tpu import tpu_config
-from tensorflow.contrib.tpu.python.tpu import tpu_system_metadata as tpu_system_metadata_lib
-from tensorflow.python.estimator import model_fn as model_fn_lib
-from tensorflow.python.platform import tf_logging as logging
-
-
-_DEFAULT_JOB_NAME = 'tpu_worker'
-_DEFAULT_COORDINATOR_JOB_NAME = 'coordinator'
-_LOCAL_MASTERS = ('', 'local')
-_NUM_CORES_TO_COMPUTATION_SHAPE = {
-    1: [1, 1, 1],
-    2: [1, 1, 2],
-    4: [1, 2, 2],
-    8: [2, 2, 2],
-    16: [4, 2, 2],
-}
-
-
-class TPUContext(object):
-  """A context that holds the current configuration of the TPU computation."""
-
-  def __init__(self,
-               internal_ctx,
-               input_device=None,
-               invocation_index=None,
-               call_from_input_fn=True):
-    self._internal_ctx = internal_ctx
-    self._input_device = input_device
-    self._invocation_index = invocation_index
-    self._call_from_input_fn = call_from_input_fn
-
-  def current_input_fn_deployment(self):
-    """The configuration of the current input_fn invocation.
-
-    The configuration depends on `TPUConfig.per_host_input_for_training`. See
-    `TPUConfig` for details.
-
-    Only set in params dict of input_fn
-
-    Returns:
-      A tuple of
-        1. Device spec string: String, is the current CPU host where the
-           input_fn is invoked.
-        2. Current invocation index: Int, 0-based index of the input_fn
-           invocation. See next item for details.
-        3. Total invocation count: Int, the total number of times to invoke the
-           input_fn on all CPU hosts. Each invocation will be passed with a new
-           `TPUContext` instance with current invocation index set properly.
-        4. Total number of replicas consumed by current_invocation: Int, the
-           number of replicas fed by the data returned by current input_fn. For
-           example, for per_core input pipeline deployment
-           and non-model-parallelism, total invocation count is equal to
-           the number of cores in the system and num replicas consumed by
-           current invocation is 1. For per-host v2 input pipeline deployment,
-           total invocation count is equal to the number of hosts in the system
-           and num replicas consumed by current invocation is equal to number of
-           cores per host.
-
-    Raises:
-      RuntimeError: If this method must not be called from input_fn.
-    """
-    if not self._call_from_input_fn:
-      raise RuntimeError('This TPUContext instance must not be called from'
-                         ' model_fn.')
-
-    if self._internal_ctx.is_input_sharded_per_core():
-      total_invocation_count = (self._internal_ctx.num_hosts
-                                * self._internal_ctx.num_of_replicas_per_host)
-      replicas_consumed = 1
-    elif self._internal_ctx.is_input_broadcast_with_iterators():
-      total_invocation_count = 1
-      replicas_consumed = self._internal_ctx.num_replicas
-    else:
-      total_invocation_count = self._internal_ctx.num_hosts
-      replicas_consumed = self._internal_ctx.num_of_replicas_per_host
-    return (self._input_device, self._invocation_index,
-            total_invocation_count, replicas_consumed)
-
-  @property
-  def num_replicas(self):
-    """The total number of replicas.
-
-    For non-model-parallelism, num_replicas should be the total num of TPU
-    cores in the system.
-
-    Returns:
-      The number of replicas.
-    """
-    return self._internal_ctx.num_replicas
-
-  @property
-  def num_hosts(self):
-    """The number of hosts for the TPU system."""
-    return self._internal_ctx.num_hosts
-
-  @property
-  def current_host(self):
-    """The current host index for the TPU system."""
-    return self._invocation_index
-
-  @property
-  def num_of_replicas_per_host(self):
-    """The number of replicas for each host."""
-    if self._internal_ctx.model_parallelism_enabled:
-      raise ValueError(
-          'num_of_replicas_per_host is not supported for model_parallelism')
-    return self._internal_ctx.num_of_replicas_per_host
-
-  @property
-  def device_assignment(self):
-    """Returns device_assignment object."""
-    if self._call_from_input_fn:
-      raise RuntimeError('This TPUContext instance must not be called from'
-                         ' input_fn.')
-    return self._internal_ctx.device_assignment
-
-  def device_for_replica(self, replica_id):
-    """Returns the tuple of (CPU device and device ordinal) for replica.
-
-    This should be used for full replicate for non-model-parallelism.
-
-    Args:
-       replica_id: Int, the replica index.
-
-    Returns:
-       A tuple of device spec for CPU device and int device ordinal.
-    """
-    # Note that: For the non-model parallelism, the mapping could be
-    # a random permutation. The order should not matter in most cases
-    # as far as model is replicated to all cores in the system.
-    return self._internal_ctx.device_for_replica(replica_id)
-
-  @property
-  def tpu_host_placement_function(self):
-    """Returns the TPU host place function.
-
-    The place function takes host_id as the input and returns the TF device
-    for the correspoding host.
-    """
-
-    def _placement_function(host_id):
-      """Return the host device given host_id."""
-      return self._internal_ctx.tpu_host_placement_function(host_id=host_id)
-
-    return _placement_function
-
-
-class _InternalTPUContext(object):
-  """A context holds immutable states of TPU computation.
-
-  This immutable object holds TPUEstimator config, train/eval batch size, and
-  `TPUEstimator.use_tpu`, which is expected to be passed around. It also
-  provides utility functions, based on the current state, to determine other
-  information commonly required by TPU computation, such as TPU device names,
-  TPU hosts, shard batch size, etc.
-
-  if eval_on_tpu is False, then execution of eval on TPU is disabled.
-  if eval_on_tpu is True, but use_tpu is False, a warning is issued,
-  and TPU execution is disabled for all modes.
-
-  N.B. As `mode` is not immutable state in Estimator, but essential to
-  distinguish between TPU training and evaluation, a common usage for
-  _InternalTPUContext with `mode` is as follows:
-  ```
-  with _ctx.with_mode(mode) as ctx:
-    if ctx.is_running_on_cpu():
-       ...
-  ```
-  """
-
-  def __init__(self, config, train_batch_size, eval_batch_size,
-               predict_batch_size, use_tpu, eval_on_tpu=True):
-    self._config = config
-    self._train_batch_size = train_batch_size
-    self._eval_batch_size = eval_batch_size
-    self._predict_batch_size = predict_batch_size
-    self._use_tpu = use_tpu
-    logging.info('_TPUContext: eval_on_tpu %s', eval_on_tpu)
-    if not use_tpu and eval_on_tpu:
-      logging.warning('eval_on_tpu ignored because use_tpu is False.')
-
-    self._eval_on_tpu = eval_on_tpu
-    self._model_parallelism_enabled = (
-        use_tpu and config.tpu_config.num_cores_per_replica)
-    self._mode = None
-    num_cores_per_replica = config.tpu_config.num_cores_per_replica
-    if num_cores_per_replica:
-      self._computation_shape = _NUM_CORES_TO_COMPUTATION_SHAPE[
-          num_cores_per_replica]
-    else:
-      self._computation_shape = None
-    self._lazy_tpu_system_metadata_dict = {}  # key by master address
-    self._lazy_device_assignment_dict = {}  # key by master address
-    self._lazy_validation_dict = {}  # key by ModeKeys
-
-  def _assert_mode(self):
-    if self._mode is None:
-      raise RuntimeError(
-          '`mode` needs to be set via contextmanager `with_mode`.')
-    return self._mode
-
-  @contextmanager
-  def with_mode(self, mode):
-    # NOTE(xiejw): Shallow copy is enough. It will share he lazy dictionaries,
-    # such as _lazy_tpu_system_metadata_dict between new copy and the original
-    # one. Note that all lazy states stored in properties _lazy_foo are sort of
-    # immutable as they should be same for the process lifetime.
-    new_ctx = copy.copy(self)
-    new_ctx._mode = mode  # pylint: disable=protected-access
-    yield new_ctx
-
-  @property
-  def mode(self):
-    return self._assert_mode()
-
-  def _get_master_address(self):
-    mode = self._assert_mode()
-    config = self._config
-    master = (
-        config.master
-        if mode != model_fn_lib.ModeKeys.EVAL else config.evaluation_master)
-    return master
-
-  def _get_tpu_system_metadata(self):
-    """Gets the (maybe cached) TPU system metadata."""
-    master = self._get_master_address()
-    tpu_system_metadata = self._lazy_tpu_system_metadata_dict.get(master)
-    if tpu_system_metadata is not None:
-      return tpu_system_metadata
-
-    cluster_def = None
-    if (self._config.session_config and
-        self._config.session_config.cluster_def.job):
-      cluster_def = self._config.session_config.cluster_def
-
-    # pylint: disable=protected-access
-    tpu_system_metadata = (
-        tpu_system_metadata_lib._query_tpu_system_metadata(
-            master,
-            cluster_def=cluster_def,
-            query_topology=self.model_parallelism_enabled))
-
-    self._lazy_tpu_system_metadata_dict[master] = tpu_system_metadata
-    return tpu_system_metadata
-
-  def _get_device_assignment(self):
-    """Gets the (maybe cached) TPU device assignment."""
-    master = self._get_master_address()
-    device_assignment = self._lazy_device_assignment_dict.get(master)
-    if device_assignment is not None:
-      return device_assignment
-
-    tpu_system_metadata = self._get_tpu_system_metadata()
-
-    device_assignment = tpu_device_assignment.device_assignment(
-        tpu_system_metadata.topology,
-        computation_shape=self._computation_shape,
-        num_replicas=self.num_replicas)
-
-    logging.info('num_cores_per_replica: %s',
-                 str(self._config.tpu_config.num_cores_per_replica))
-    logging.info('computation_shape: %s', str(self._computation_shape))
-    logging.info('num_replicas: %d', self.num_replicas)
-    logging.info('device_assignment.topology.device_coordinates: %s',
-                 str(device_assignment.topology.device_coordinates))
-    logging.info('device_assignment.core_assignment: %s',
-                 str(device_assignment.core_assignment))
-
-    self._lazy_device_assignment_dict[master] = device_assignment
-    return device_assignment
-
-  @property
-  def model_parallelism_enabled(self):
-    return self._model_parallelism_enabled
-
-  @property
-  def input_partition_dims(self):
-    return self._config.tpu_config.input_partition_dims
-
-  @property
-  def device_assignment(self):
-    return (self._get_device_assignment()
-            if self._model_parallelism_enabled else None)
-
-  @property
-  def num_of_cores_per_host(self):
-    metadata = self._get_tpu_system_metadata()
-    return metadata.num_of_cores_per_host
-
-  @property
-  def num_cores(self):
-    metadata = self._get_tpu_system_metadata()
-    return metadata.num_cores
-
-  @property
-  def num_of_replicas_per_host(self):
-    """Return the number of replicas per host."""
-    if self.model_parallelism_enabled:
-      return self.num_replicas // self.num_hosts
-    else:
-      return self.num_of_cores_per_host
-
-  @property
-  def num_replicas(self):
-    num_cores_in_system = self.num_cores
-
-    if self.model_parallelism_enabled:
-      num_cores_per_replica = self._config.tpu_config.num_cores_per_replica
-      if num_cores_per_replica > num_cores_in_system:
-        raise ValueError(
-            'The num of cores required by the model parallelism, specified by '
-            'TPUConfig.num_cores_per_replica, is larger than the total num of '
-            'TPU cores in the system. num_cores_per_replica: {}, num cores '
-            'in the system: {}'.format(num_cores_per_replica,
-                                       num_cores_in_system))
-
-      if num_cores_in_system % num_cores_per_replica != 0:
-        raise RuntimeError(
-            'The num of cores in the system ({}) is not divisible by the num '
-            'of cores ({}) required by the model parallelism, specified by '
-            'TPUConfig.num_cores_per_replica. This should never happen!'.format(
-                num_cores_in_system, num_cores_per_replica))
-
-      return num_cores_in_system // num_cores_per_replica
-    else:
-      return num_cores_in_system
-
-  @property
-  def num_hosts(self):
-    metadata = self._get_tpu_system_metadata()
-    return metadata.num_hosts
-
-  @property
-  def config(self):
-    return self._config
-
-  def is_input_sharded_per_core(self):
-    """Return true if input_fn is invoked per-core (other than per-host)."""
-    mode = self._assert_mode()
-    return (mode == model_fn_lib.ModeKeys.TRAIN and
-            (self._config.tpu_config.per_host_input_for_training is
-             tpu_config.InputPipelineConfig.PER_SHARD_V1))
-
-  def is_input_per_host_with_iterators(self):
-    """Return true if input_fn should be run in the per-host v2 config."""
-    return (self._config.tpu_config.per_host_input_for_training is
-            tpu_config.InputPipelineConfig.PER_HOST_V2)
-
-  def is_input_broadcast_with_iterators(self):
-    """Return true if input_fn should be run in the full_replicae config."""
-    return (self._config.tpu_config.per_host_input_for_training is
-            tpu_config.InputPipelineConfig.BROADCAST)
-
-  def is_running_on_cpu(self, is_export_mode=False):
-    """Determines whether the input_fn and model_fn should be invoked on CPU.
-
-    This API also validates user provided configuration, such as batch size,
-    according the lazy initialized TPU system metadata.
-
-    Args:
-      is_export_mode: Indicates whether the current mode is for exporting the
-        model, when mode == PREDICT. Only with this bool, we could
-        tell whether user is calling the Estimator.predict or
-        Estimator.export_savedmodel, which are running on TPU and CPU
-        respectively. Parent class Estimator does not distinguish these two.
-
-    Returns:
-      bool, whether current input_fn or model_fn should be running on CPU.
-
-    Raises:
-      ValueError: any configuration is invalid.
-    """
-
-    is_running_on_cpu = self._is_running_on_cpu(is_export_mode)
-    if not is_running_on_cpu:
-      self._validate_tpu_configuration()
-    return is_running_on_cpu
-
-  def _is_running_on_cpu(self, is_export_mode):
-    """Determines whether the input_fn and model_fn should be invoked on CPU."""
-    mode = self._assert_mode()
-
-    if not self._use_tpu:
-      return True
-
-    if mode == model_fn_lib.ModeKeys.EVAL and not self._eval_on_tpu:
-      logging.info('_is_running_on_cpu: eval_on_tpu disabled')
-      return True
-
-    if is_export_mode:
-      return True
-
-    return False
-
-  @property
-  def global_batch_size(self):
-    mode = self._assert_mode()
-    if mode == model_fn_lib.ModeKeys.TRAIN:
-      return self._train_batch_size
-    elif mode == model_fn_lib.ModeKeys.EVAL:
-      return self._eval_batch_size
-    elif mode == model_fn_lib.ModeKeys.PREDICT:
-      return self._predict_batch_size
-    else:
-      return None
-
-  @property
-  def batch_size_for_input_fn(self):
-    """Returns the shard batch size for `input_fn`."""
-    global_batch_size = self.global_batch_size
-
-    if (self.is_running_on_cpu() or self.is_input_broadcast_with_iterators()):
-      return global_batch_size
-
-    # On TPU
-    if self.is_input_sharded_per_core() or (
-        self.is_input_per_host_with_iterators()):
-      return global_batch_size // self.num_replicas
-    else:
-      return global_batch_size // self.num_hosts
-
-  @property
-  def batch_size_for_model_fn(self):
-    """Returns the shard batch size for `model_fn`."""
-    global_batch_size = self.global_batch_size
-
-    if (self.is_running_on_cpu() or self.is_input_broadcast_with_iterators()):
-      return global_batch_size
-
-    # On TPU. always sharded per shard.
-    return global_batch_size // self.num_replicas
-
-  @property
-  def master_job(self):
-    """Returns the job name to use to place TPU computations on.
-
-    Returns:
-      A string containing the job name, or None if no job should be specified.
-
-    Raises:
-      ValueError: If the user needs to specify a tpu_job_name, because we are
-        unable to infer the job name automatically, or if the user-specified job
-        names are inappropriate.
-    """
-    run_config = self._config
-    # If the user specifies the tpu_job_name, use that.
-    if run_config.tpu_config.tpu_job_name:
-      return run_config.tpu_config.tpu_job_name
-
-    # The tpu job is determined by the run_config. Right now, this method is
-    # required as tpu_config is not part of the RunConfig.
-    mode = self._assert_mode()
-    master = (
-        run_config.evaluation_master
-        if mode == model_fn_lib.ModeKeys.EVAL else run_config.master)
-    if master in _LOCAL_MASTERS:
-      return None
-
-    if (not run_config.session_config or
-        not run_config.session_config.cluster_def.job):
-      return _DEFAULT_JOB_NAME
-    cluster_def = run_config.session_config.cluster_def
-    job_names = set([job.name for job in cluster_def.job])
-    if _DEFAULT_JOB_NAME in job_names:
-      # b/37868888 tracks allowing ClusterSpec propagation to reuse job names.
-      raise ValueError('Currently, tpu_worker is not an allowed job name.')
-    if len(job_names) == 1:
-      return cluster_def.job[0].name
-    if len(job_names) == 2:
-      if _DEFAULT_COORDINATOR_JOB_NAME in job_names:
-        job_names.remove(_DEFAULT_COORDINATOR_JOB_NAME)
-        return job_names.pop()
-      # TODO(b/67716447): Include more sophisticated heuristics.
-    raise ValueError(
-        'Could not infer TPU job name. Please specify a tpu_job_name as part '
-        'of your TPUConfig.')
-
-  @property
-  def tpu_host_placement_function(self):
-    """Returns the TPU host place function."""
-
-    master = self.master_job
-
-    def _placement_function(_sentinal=None, replica_id=None, host_id=None):  # pylint: disable=invalid-name
-      """Return the host device given replica_id or host_id."""
-      assert _sentinal is None
-      if replica_id is not None and host_id is not None:
-        raise RuntimeError(
-            'replica_id and host_id can have only one non-None value.')
-
-      if master is None:
-        return '/replica:0/task:0/device:CPU:0'
-      else:
-        if replica_id is not None:
-          if self.model_parallelism_enabled:
-            return self.device_assignment.host_device(
-                replica=replica_id, job=master)
-          else:
-            host_id = replica_id / self.num_of_cores_per_host
-
-        return '/job:%s/task:%d/device:CPU:0' % (master, host_id)
-
-    return _placement_function
-
-  @property
-  def tpu_device_placement_function(self):
-    """Returns a TPU device placement Fn."""
-    master = self.master_job
-    job_device = '' if master is None else ('/job:%s' % master)
-
-    def _placement_function(i):
-      if self.model_parallelism_enabled:
-        return self.device_assignment.tpu_device(replica=i, job=master)
-      else:
-        num_of_cores_per_host = self.num_of_cores_per_host
-        host_id = i / num_of_cores_per_host
-        ordinal_id = i % num_of_cores_per_host
-        return '%s/task:%d/device:TPU:%d' % (job_device, host_id, ordinal_id)
-
-    return _placement_function
-
-  def tpu_ordinal_function(self, host_id):
-    """Returns the TPU ordinal fn."""
-
-    def _tpu_ordinal_function(shard_index_in_host):
-      """Return the TPU ordinal associated with a shard.
-
-      Required because the enqueue ops are placed on CPU.
-
-      Args:
-        shard_index_in_host: the shard index
-
-      Returns:
-        The ordinal of the TPU device the shard's infeed should be placed on.
-      """
-      if self.model_parallelism_enabled:
-        # We put both enqueue/dequeue ops at tpu.core(0) in each replica.
-        replica = self.device_assignment.lookup_replicas(host_id,
-                                                         0)[shard_index_in_host]
-        return self.device_assignment.tpu_ordinal(replica=replica)
-      else:
-        return shard_index_in_host % self.num_of_cores_per_host
-
-    return _tpu_ordinal_function
-
-  def _validate_tpu_configuration(self):
-    """Validates the configuration based on the TPU system metadata."""
-    mode = self._assert_mode()
-    if self._lazy_validation_dict.get(mode):
-      return
-
-    # All following information is obtained from TPU system metadata.
-    num_cores = self.num_cores
-    num_replicas = self.num_replicas
-    num_hosts = self.num_hosts
-
-    if not num_cores:
-      tpu_system_metadata = self._get_tpu_system_metadata()
-      raise RuntimeError(
-          'Cannot find any TPU cores in the system. Please double check '
-          'Tensorflow master address and TPU worker(s). Available devices '
-          'are {}.'.format(tpu_system_metadata.devices))
-
-    if self._config.tpu_config.num_shards:
-      user_provided_num_replicas = self._config.tpu_config.num_shards
-      if user_provided_num_replicas != num_replicas:
-        message = (
-            'TPUConfig.num_shards is not set correctly. According to TPU '
-            'system metadata for Tensorflow master ({}): num_replicas should '
-            'be ({}), got ({}). For non-model-parallelism, num_replicas should '
-            'be the total num of TPU cores in the system. For '
-            'model-parallelism, the total number of TPU cores should be '
-            'num_cores_per_replica * num_replicas. Please set it '
-            'accordingly or leave it as `None`'.format(
-                self._get_master_address(), num_replicas,
-                user_provided_num_replicas))
-
-        raise ValueError(message)
-
-    if self._config.tpu_config.num_cores_per_replica:
-      num_cores_per_replica = self._config.tpu_config.num_cores_per_replica
-      num_cores_per_host = self._get_tpu_system_metadata().num_of_cores_per_host
-      if num_cores_per_replica > num_cores_per_host:
-        raise ValueError(
-            'The num of cores required by the model parallelism, specified by '
-            'TPUConfig.num_cores_per_replica, is larger than the '
-            'num_cores_per_host. num_cores_per_replica: {}, '
-            'num_cores_per_host: {}'.format(num_cores_per_replica,
-                                            num_cores_per_host))
-
-    if mode == model_fn_lib.ModeKeys.TRAIN:
-      if (self._train_batch_size % num_replicas != 0 and
-          not self.is_input_broadcast_with_iterators()):
-        raise ValueError(
-            'train batch size {} must be divisible by number of replicas {}'
-            .format(self._train_batch_size, num_replicas))
-
-    elif mode == model_fn_lib.ModeKeys.EVAL:
-      if self._eval_batch_size is None:
-        raise ValueError(
-            'eval_batch_size in TPUEstimator constructor cannot be `None`'
-            'if .evaluate is running on TPU.')
-      if (self._eval_batch_size % num_replicas != 0 and
-          not self.is_input_broadcast_with_iterators()):
-        raise ValueError(
-            'eval batch size {} must be divisible by number of replicas {}'
-            .format(self._eval_batch_size, num_replicas))
-      if num_hosts > 1 and not self.is_input_broadcast_with_iterators():
-        raise ValueError(
-            'TPUEstimator.evaluate should be running on single TPU'
-            ' instead of a Pod.')
-    else:
-      assert mode == model_fn_lib.ModeKeys.PREDICT
-      if self._predict_batch_size is None:
-        raise ValueError(
-            'predict_batch_size in TPUEstimator constructor should not be '
-            '`None` if .predict is running on TPU.')
-      if (self._predict_batch_size % num_replicas != 0 and
-          not self.is_input_broadcast_with_iterators()):
-        raise ValueError(
-            'predict batch size {} must be divisible by number of replicas {}'
-            .format(self._predict_batch_size, num_replicas))
-      if num_hosts > 1 and not self.is_input_broadcast_with_iterators():
-        raise ValueError(
-            'TPUEstimator.predict should be running on single TPU worker. '
-            'got {}.'.format(num_hosts))
-
-    # Record the state "validated" into lazy dictionary.
-    self._lazy_validation_dict[mode] = True
-
-  def device_for_replica(self, replica_id):
-    """Returns the tuple of (CPU device and device ordinal) for replica.
-
-    This should be used for full replicate for non-model-parallelism.
-
-    Args:
-       replica_id: Int, the replica index.
-
-    Returns:
-       A tuple of device spec for CPU device and int device ordinal.
-    """
-    master = self.master_job
-
-    if self.model_parallelism_enabled:
-      return (self.device_assignment.host_device(
-          replica=replica_id, job=master),
-              self.device_assignment.tpu_ordinal(replica=replica_id))
-
-    job_device = '' if master is None else ('/job:%s' % master)
-
-    num_of_replicas_per_host = self.num_of_replicas_per_host
-    host_id = replica_id / num_of_replicas_per_host
-    ordinal_id = replica_id % num_of_replicas_per_host
-
-    host_device = '%s/task:%d/device:CPU:0' % (job_device, host_id)
-    return (host_device, ordinal_id)
-
-
-class _OneCoreTPUContext(_InternalTPUContext):
-  """Special _InternalTPUContext for one core usage."""
-
-  def __init__(self, config, train_batch_size, eval_batch_size,
-               predict_batch_size, use_tpu):
-
-    super(_OneCoreTPUContext, self).__init__(
-        config, train_batch_size, eval_batch_size,
-        predict_batch_size, use_tpu)
-
-  def _get_tpu_system_metadata(self):
-    """Gets the (maybe cached) TPU system metadata."""
-    master = self._get_master_address()
-    tpu_system_metadata = self._lazy_tpu_system_metadata_dict.get(master)
-    if tpu_system_metadata is not None:
-      return tpu_system_metadata
-
-    tpu_system_metadata = (
-        tpu_system_metadata_lib._TPUSystemMetadata(  # pylint: disable=protected-access
-            num_cores=1,
-            num_hosts=1,
-            num_of_cores_per_host=1,
-            topology=None,
-            devices=[]))
-
-    self._lazy_tpu_system_metadata_dict[master] = tpu_system_metadata
-    return tpu_system_metadata
-
-
-def _get_tpu_context(config, train_batch_size, eval_batch_size,
-                     predict_batch_size, use_tpu, eval_on_tpu):
-  """Returns an instance of `_InternalTPUContext`."""
-
-  if (config.tpu_config.num_shards == 1 and
-      config.tpu_config.num_cores_per_replica is None):
-    logging.warning(
-        'Setting TPUConfig.num_shards==1 is an unsupported behavior. '
-        'Please fix as soon as possible (leaving num_shards as None.)')
-    return _OneCoreTPUContext(config, train_batch_size, eval_batch_size,
-                              predict_batch_size, use_tpu)
-
-  return _InternalTPUContext(config, train_batch_size, eval_batch_size,
-                             predict_batch_size, use_tpu, eval_on_tpu)
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.tpu_context import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_embedding.py b/tensorflow/contrib/tpu/python/tpu/tpu_embedding.py
index ccba8a46c7cad0337119672e02314684f4451479..cb38a8f1a6bee3c2adfbefc203c1d143303c3368 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_embedding.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_embedding.py
@@ -1,10 +1,10 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
@@ -12,1099 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TPU embedding APIs."""
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-import copy
-import math
-import re
-import six
-
-from tensorflow.contrib.framework.python.framework import experimental
-from tensorflow.contrib.tpu.ops import gen_tpu_ops
-from tensorflow.contrib.tpu.proto import tpu_embedding_configuration_pb2 as elc
-from tensorflow.contrib.tpu.python.ops import tpu_ops
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import partitioned_variables
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables
-
-TRAINING = elc.TPUEmbeddingConfiguration.TRAINING
-INFERENCE = elc.TPUEmbeddingConfiguration.INFERENCE
-
-# TODO(shizhiw): A better interface is to make `num_hosts` and
-# `num_cores_per_host` optional parameters for `TPUEmbedding`
-# constructor. Usually they can be automatically detected, but
-# user can also specify them for debugging (b/112112496).
-# Auto-detection can be done with `tpu_system_metadata.py`.
-_MASTER_JOB = 'tpu_worker'
-_HOST_PATTERN = '/job:tpu_worker/task:{}/device:CPU:0'
-_NUM_CORES_PER_HOST = 8
-
-_TEST_MASTER_JOB = None
-_TEST_HOST = '/replica:0/task:0/device:CPU:0'
-_TEST_NUM_CORES_PER_HOST = 2
-
-
-class TableConfig(
-    collections.namedtuple(
-        'TableConfig',
-        ['vocabulary_size', 'dimension', 'initializer', 'combiner'])):
-  """Embedding table configuration."""
-
-  @experimental
-  def __new__(cls,
-              vocabulary_size,
-              dimension,
-              initializer=None,
-              combiner='mean'):
-    """Embedding table configuration.
-
-    Args:
-      vocabulary_size: Number of vocabulary (/rows) in the table.
-      dimension: The embedding dimension.
-      initializer: A variable initializer function to be used in embedding
-        variable initialization. If not specified, defaults to
-        `tf.truncated_normal_initializer` with mean `0.0` and standard deviation
-        `1/sqrt(dimension)`.
-      combiner: A string specifying how to reduce if there are multiple entries
-        in a single row. Currently 'mean', 'sqrtn' and 'sum' are supported, with
-        'mean' the default. 'sqrtn' often achieves good accuracy, in particular
-        with bag-of-words columns. For more information, see
-        `tf.nn.embedding_lookup_sparse`.
-
-    Returns:
-      `TableConfig`.
-
-    Raises:
-      ValueError: if `vocabulary_size` is not positive integer.
-      ValueError: if `dimension` is not positive integer.
-      ValueError: if `initializer` is specified and is not callable.
-      ValueError: if `combiner` is not supported.
-    """
-    if not isinstance(vocabulary_size, int) or vocabulary_size < 1:
-      raise ValueError('Invalid vocabulary_size {}.'.format(vocabulary_size))
-
-    if not isinstance(dimension, int) or dimension < 1:
-      raise ValueError('Invalid dimension {}.'.format(dimension))
-
-    if (initializer is not None) and (not callable(initializer)):
-      raise ValueError('initializer must be callable if specified.')
-    if initializer is None:
-      initializer = init_ops.truncated_normal_initializer(
-          mean=0.0, stddev=1 / math.sqrt(dimension))
-
-    if combiner not in ('mean', 'sum', 'sqrtn'):
-      raise ValueError('Invalid combiner {}'.format(combiner))
-
-    return super(TableConfig, cls).__new__(cls, vocabulary_size, dimension,
-                                           initializer, combiner)
-
-
-# TODO(shizhiw): Factor `use_gradient_accumulation` and
-# `pipeline_execution_with_tensor_core` out of `_OptimizationParameters`.
-class _OptimizationParameters(object):
-  """Parameters common to all optimizations."""
-
-  def __init__(self, learning_rate, use_gradient_accumulation,
-               pipeline_execution_with_tensor_core):
-    self.learning_rate = learning_rate
-    self.use_gradient_accumulation = use_gradient_accumulation
-    self.pipeline_execution_with_tensor_core = (
-        pipeline_execution_with_tensor_core)
-
-
-class AdagradParameters(_OptimizationParameters):
-  """Optimization parameters for Adagrad."""
-
-  def __init__(self, learning_rate, initial_accumulator,
-               use_gradient_accumulation=False,
-               pipeline_execution_with_tensor_core=True):
-    """Optimization parameters for Adagrad.
-
-    Args:
-      learning_rate: used for updating embedding table.
-      initial_accumulator: initial accumulator for Adagrad.
-      use_gradient_accumulation: setting this to `True` makes embedding
-         gradients calculation more accurate but slower. Please see
-         `optimization_parameters.proto` for details.
-         for details.
-      pipeline_execution_with_tensor_core: setting this to `True` makes training
-        faster, but trained model will be different if step N and step N+1
-        involve the same set of embedding ID. Please see
-        `tpu_embedding_configuration.proto` for details.
-    """
-    super(AdagradParameters, self).__init__(learning_rate,
-                                            use_gradient_accumulation,
-                                            pipeline_execution_with_tensor_core)
-    self.initial_accumulator = initial_accumulator
-
-
-class AdamParameters(_OptimizationParameters):
-  """Optimization parameters for Adam."""
-
-  def __init__(self, learning_rate,
-               beta1=0.9,
-               beta2=0.999,
-               epsilon=1e-08,
-               lazy_adam=True,
-               sum_inside_sqrt=True,
-               use_gradient_accumulation=False,
-               pipeline_execution_with_tensor_core=True):
-    """Optimization parameters for Adam.
-
-    Args:
-      learning_rate: a floating point value. The learning rate.
-      beta1: A float value.
-        The exponential decay rate for the 1st moment estimates.
-      beta2: A float value.
-        The exponential decay rate for the 2nd moment estimates.
-      epsilon: A small constant for numerical stability.
-      lazy_adam: Use lazy Adam instead of Adam. Lazy Adam trains faster.
-        Please see `optimization_parameters.proto` for details.
-      sum_inside_sqrt: This improves training speed. Please see
-        `optimization_parameters.proto` for details.
-      use_gradient_accumulation: setting this to `True` makes embedding
-        gradients calculation more accurate but slower. Please see
-        `optimization_parameters.proto` for details.
-        for details.
-      pipeline_execution_with_tensor_core: setting this to `True` makes training
-        faster, but trained model will be different if step N and step N+1
-        involve the same set of embedding ID. Please see
-        `tpu_embedding_configuration.proto` for details.
-    """
-    super(AdamParameters, self).__init__(learning_rate,
-                                         use_gradient_accumulation,
-                                         pipeline_execution_with_tensor_core)
-    self.beta1 = beta1
-    self.beta2 = beta2
-    self.epsilon = epsilon
-    self.lazy_adam = lazy_adam
-    self.sum_inside_sqrt = sum_inside_sqrt
-
-
-class StochasticGradientDescentParameters(_OptimizationParameters):
-  """Optimization parameters for stochastic gradient descent.
-
-  Args:
-    learning_rate: a floating point value. The learning rate.
-    use_gradient_accumulation: setting this to `True` makes embedding
-      gradients calculation more accurate but slower. Please see
-         `optimization_parameters.proto` for details.
-    pipeline_execution_with_tensor_core: setting this to `True` makes training
-      faster, but trained model will be different if step N and step N+1
-      involve the same set of embedding ID. Please see
-      `tpu_embedding_configuration.proto` for details.
-    """
-
-  def __init__(self, learning_rate, use_gradient_accumulation=False,
-               pipeline_execution_with_tensor_core=True):
-    super(StochasticGradientDescentParameters, self).__init__(
-        learning_rate, use_gradient_accumulation,
-        pipeline_execution_with_tensor_core)
-
-
-class TPUEmbedding(object):
-  """API for using TPU for embedding.
-
-    Example:
-    ```
-    table_config_user = tpu_embedding.TableConfig(
-        vocabulary_size=4, dimension=2,
-        initializer=initializer, combiner='mean')
-    table_to_config_dict = {'video': table_config_video,
-                          'user': table_config_user}
-    feature_to_table_dict = {'watched': 'video',
-                             'favorited': 'video',
-                             'friends': 'user'}
-    batch_size = 4
-    num_hosts = 1
-    optimization_parameters = tpu_embedding.AdagradParameters(1., 1.)
-    mode = tpu_embedding.TRAINING
-    embedding = tpu_embedding.TPUEmbedding(
-        table_to_config_dict, feature_to_table_dict,
-        batch_size, num_hosts, mode, optimization_parameters)
-
-    batch_size_per_core = embedding.batch_size_per_core
-    sparse_features_list = []
-    for host in hosts:
-      with ops.device(host):
-        for _ in range(embedding.num_cores_per_host):
-          sparse_features = {}
-          sparse_features['watched'] = sparse_tensor.SparseTensor(...)
-          sparse_features['favorited'] = sparse_tensor.SparseTensor(...)
-          sparse_features['friends'] = sparse_tensor.SparseTensor(...)
-          sparse_features_list.append(sparse_features)
-
-    enqueue_ops = embedding.generate_enqueue_ops(sparse_features_list)
-
-    def computation():
-      activations = embedding.get_activations()
-      loss = compute_loss(activations)
-
-      base_optimizer = gradient_descent.GradientDescentOptimizer(
-          learning_rate=1)
-      cross_shard_optimizer = tpu_optimizer.CrossShardOptimizer(
-          base_optimizer)
-
-      train_op = cross_shard_optimizer.minimize(loss)
-      # `train_op` and `send_gradients_op` must happen in order.
-      with ops.control_dependencies([train_op]):
-        send_gradients_op = embedding.generate_send_gradients_op()
-      with ops.control_dependencies([send_gradients_op]):
-        loss = array_ops.identity(loss)
-
-    loss = tpu.shard(computation,
-                     num_shards=embedding.num_cores)
-
-    with self.test_session() as sess:
-      sess.run(tpu.initialize_system(embedding_config=
-                                     embedding.config_proto))
-      sess.run(variables.global_variables_initializer())
-      sess.run(embedding.init_ops)
-      sess.run(enqueue_ops)
-      loss_val = sess.run(loss)
-    ```
-  """
-
-  # TODO(shizhiw): Instead of `feature_to_table_dict` which maps to table
-  # name, consider `feature_to_config_dict` which maps to `FeatureConfig`.
-  # `FeatureConfig` could have fields other than table name. For example, it
-  # could have a field to indicate that the feature should not be used to
-  # update embedding table (cr/204852758, cr/204940540). Also, this can support
-  # different combiners for different features within the same table.
-  # TODO(shizhiw, b/118512626): Remove `batch_size` from `__init__` and move it
-  # to `FeatureConfig`?
-
-  # TODO(shizhiw): will it be cleaner to make `table_to_config_dict` and
-  # `feature_to_table_dict` lists of `TableSpec` and `FeatureSpec` respectively?
-
-  # TODO(shizhiw): Consider adding `input_fn` as an option to remove boilerplate
-  # for-loops around construction of inputs.
-
-  # `optimization_parameter` applies to all tables. If the need arises,
-  # we can add `optimization_parameters` to `TableConfig` to override this
-  # global setting.
-  @experimental
-  def __init__(self,
-               table_to_config_dict,
-               feature_to_table_dict,
-               batch_size,
-               num_hosts,
-               mode,
-               optimization_parameters=None,
-               tpu_embedding_test=False):
-    """API for using TPU for embedding lookups.
-
-    Args:
-      table_to_config_dict: A dictionary mapping from string of table name to
-        `TableConfig`. Table refers to an embedding table, e.g. `params`
-        argument to `tf.nn.embedding_lookup_sparse()`.
-      feature_to_table_dict: A dictionary mapping from string of feature name
-        to string of table name. Feature refers to ids to lookup in embedding
-        table, e.g. `sp_ids` argument to `tf.nn.embedding_lookup_sparse()`.
-      batch_size: An `int` representing the global batch size.
-      num_hosts: An `int` representing the number of TPU hosts.
-      mode: `TRAINING` or `INFERENCE`.
-      optimization_parameters: `AdagradParameters`, `AdamParameters`,
-        `Stochasticgradientdescentparameters`. Must be set in training and must
-        be `None` in inference.
-      tpu_embedding_test: A `bool`. Only used for testing.
-
-    Raises:
-      ValueError: if any input is invalid.
-    """
-    _validate_table_to_config_dict(table_to_config_dict)
-    # Avoid nondeterminism from `Dict` iteration order by using `OrderedDict`.
-    self._table_to_config_dict = _create_ordered_dict(table_to_config_dict)
-    self._combiners = _create_combiners(self._table_to_config_dict)
-
-    _validate_feature_to_table_dict(table_to_config_dict, feature_to_table_dict)
-    self._feature_to_table_dict = _create_ordered_dict(feature_to_table_dict)
-    self._table_to_features_dict = _create_table_to_features_dict(
-        self._feature_to_table_dict)
-
-    self._batch_size = batch_size
-
-    if tpu_embedding_test:
-      self._num_hosts = 1
-      self._hosts = [_TEST_HOST]
-      self._num_cores_per_host = _TEST_NUM_CORES_PER_HOST
-    else:
-      self._num_hosts = num_hosts
-      self._hosts = [_HOST_PATTERN.format(i) for i in range(self._num_hosts)]
-      self._num_cores_per_host = _NUM_CORES_PER_HOST
-    self._num_cores = self._num_cores_per_host * self._num_hosts
-
-    _validate_batch_size(self._batch_size, self._num_cores)
-    self._batch_size_per_core = self._batch_size // self._num_cores
-
-    self._init_ops = []
-
-    # TODO(shizhiw): remove `mode`?
-    if mode == TRAINING:
-      _validate_optimization_parameters(optimization_parameters)
-      self._optimization_parameters = optimization_parameters
-    elif mode == INFERENCE:
-      if optimization_parameters is not None:
-        raise ValueError('`optimization_parameters` should be `None` '
-                         'for inference mode.')
-      self._optimization_parameters = (
-          StochasticGradientDescentParameters(1.))
-    else:
-      raise ValueError('`mode` only supports {} and {}; got {}.'
-                       .format(TRAINING, INFERENCE, mode))
-    self._mode = mode
-
-    # TODO(shizhiw): move `optimization_parameters` into `_optimizer_handler`
-    # and create special handler for inference that inherits from
-    # StochasticGradientDescentHandler with more user-friendly error message
-    # on get_slot().
-    self._optimizer_handler = _get_optimization_handler(
-        self._optimization_parameters)
-
-    dummy_table_variables_init_op = self._create_dummy_table_variables()
-    self._init_ops.append(dummy_table_variables_init_op)
-
-    self._config_proto = self._create_config_proto()
-
-    self._create_variables_and_ops()
-    self._init_ops.extend(self._load_parameters_ops)
-
-  @property
-  def hosts(self):
-    """A list of device names for CPU hosts.
-
-    Returns:
-      A list of device names for CPU hosts.
-    """
-    return self._hosts
-
-  # TODO(shizhiw): change to num_tensor_cores_per_host to be more explicit and
-  # to be consistent with `tpu_embedding_configuration.proto`.
-  @property
-  def num_cores_per_host(self):
-    """Number of TPU cores on a CPU host.
-
-    Returns:
-      Number of TPU cores on a CPU host.
-    """
-    return self._num_cores_per_host
-
-  @property
-  def num_cores(self):
-    """Total number of TPU cores on all hosts.
-
-    Returns:
-      Total number of TPU cores on all hosts.
-    """
-    return self._num_cores
-
-  @property
-  def batch_size_per_core(self):
-    """Batch size for each TPU core.
-
-    The sparse tensors in `sparse_features_list` to `generate_enqueue_ops`
-       must have batch dimension equal to this.
-
-    Returns:
-      Batch size for each TPU core.
-    """
-    return self._batch_size_per_core
-
-  @property
-  def config_proto(self):
-    """Create embedding config proto for `tpu.initialize_system()`.
-
-    Returns:
-      an `TPUEmbeddingConfiguration` proto describing the desired
-         configuration of the hardware embedding lookup tables, which
-         is passed to `tpu.initialize_system()`.
-    """
-    return self._config_proto
-
-  @property
-  def init_ops(self):
-    """Initialization ops for TPU embedding.
-
-    It must be called after all global variables have been initialized,
-    i.e. after `global_variables_initializer()`, as it loads embedding
-    tables into TPU.
-
-    Returns:
-      A list of ops.
-    """
-    return self._init_ops
-
-  # TODO(shizhiw): get table variables the same way as getting slot variables.
-  @property
-  def table_to_table_variables_dict(self):
-    return copy.copy(self._table_to_table_variables_dict)
-
-  def get_slot_names(self):
-    """Return a list of the names of slots created by `TPUEmbedding`."""
-    return self._optimizer_handler.get_slot_names()
-
-  def get_slot(self, table, name):
-    """Return a slot named `name` create for `table` by `TPUEmbedding`."""
-    return self._optimizer_handler.get_slot(table, name)
-
-  # TODO(shizhiw): expose load to user too?
-  @property
-  def retrieve_parameters_ops(self):
-    return self._retrieve_parameters_ops
-
-  def _create_config_proto(self):
-    """Create `TPUEmbeddingConfiguration`."""
-    config_proto = elc.TPUEmbeddingConfiguration()
-    for table in self._table_to_config_dict:
-      table_descriptor = config_proto.table_descriptor.add()
-      table_descriptor.name = table
-
-      table_config = self._table_to_config_dict[table]
-      table_descriptor.vocabulary_size = table_config.vocabulary_size
-      table_descriptor.dimension = table_config.dimension
-
-      features_for_table = self._table_to_features_dict[table]
-      table_descriptor.num_features = len(features_for_table)
-
-      table_descriptor.optimization_parameters.learning_rate.constant = (
-          self._optimization_parameters.learning_rate)
-      table_descriptor.optimization_parameters.use_gradient_accumulation = (
-          self._optimization_parameters.use_gradient_accumulation)
-      self._optimizer_handler.set_optimization_parameters(table_descriptor)
-
-    config_proto.mode = self._mode
-    config_proto.batch_size_per_tensor_core = self._batch_size_per_core
-    config_proto.num_hosts = self._num_hosts
-    config_proto.num_tensor_cores = self._num_cores
-    config_proto.sharding_strategy = elc.TPUEmbeddingConfiguration.DIV_DEFAULT
-    config_proto.pipeline_execution_with_tensor_core = (
-        self._optimization_parameters.pipeline_execution_with_tensor_core)
-
-    return config_proto
-
-  def _create_variables_and_ops(self):
-    """Create embedding variables and return ops to load them into TPU."""
-    self._load_parameters_ops = []
-    self._retrieve_parameters_ops = []
-    self._table_to_table_variables_dict = {}
-    for table in self._table_to_config_dict:
-      device_fn = _create_device_fn(self._hosts)
-      with ops.device(device_fn):
-        # TODO(shizhiw): allow user to specify variable name so that
-        # they could make the name consistent with CPU etc.
-        variable_name = table
-        table_variables = _create_partitioned_variables(
-            name=variable_name,
-            num_hosts=self._num_hosts,
-            vocabulary_size=self._table_to_config_dict[table].vocabulary_size,
-            embedding_dimension=self._table_to_config_dict[table].dimension,
-            initializer=self._table_to_config_dict[table].initializer,
-            collections=[ops.GraphKeys.GLOBAL_VARIABLES])
-        self._table_to_table_variables_dict[table] = table_variables
-
-        self._optimizer_handler.create_variables_and_ops(
-            table, variable_name, self._num_hosts,
-            self._table_to_config_dict[table], table_variables,
-            self._load_parameters_ops, self._retrieve_parameters_ops)
-
-  def _create_dummy_table_variables(self):
-    """Create dummy embedding table variables.
-
-    The sole purpose of these dummy variables are to trigger gradient
-    calcuation wrt them so that the gradients wrt activation can be captured
-    and later sent to TPU embedding.
-
-    Returns:
-      Initializer for these variables.
-
-    Raises:
-      RuntimeError: if collection to store gradients already exists and is not
-      empty.
-    """
-    self._dummy_table_variables = []
-    # TODO(shizhiw): remove table id.
-    for table_id, table in enumerate(self._table_to_features_dict):
-      self._dummy_table_variables.append(
-          variable_scope.get_variable(
-              'tpu_embedding_dummy_table_variable_%s' % table,
-              dtype=dtypes.float32,
-              shape=[1],
-              use_resource=True,
-              trainable=True,
-              # TODO(shizhiw): Remove these dummy variables as
-              # tensorflow optimizer creates slot variable for them which
-              # is undesirable.
-              # e.g. tpu_embedding_dummy_table_variable_mlp_user/Adam{_1}.
-              # Explicitly specifying collections prevents this variable from
-              # being added to the GLOBAL_VARIABLES collection, so that Saver()
-              # ignores it.
-              collections=['tpu_embedding_dummy_table_variables']))
-
-      g = ops.get_default_graph()
-      table_gradients = g.get_collection_ref(
-          'tpu_embedding_gradients_table_%d' % table_id)
-      if table_gradients:
-        raise RuntimeError(
-            'tpu_embedding_gradients_table_%d is not empty.' % table_id)
-      table_gradients.extend([None] * len(self._table_to_features_dict[table]))
-
-    return variables.variables_initializer(
-        self._dummy_table_variables,
-        name='tpu_embedding_dummy_table_variables_init')
-
-  def generate_enqueue_ops(self, sparse_features_list):
-    """Generate enqueue ops.
-
-    Args:
-      sparse_features_list: a list of dictionary mapping from string
-        of feature names to sparse tensor. Each dictionary is for one
-        TPU core. Dictionaries for the same core should be contiguous
-        on the list.
-
-    Returns:
-      Ops to enqueue to TPU for embedding.
-    """
-    self._validate_generate_enqueue_ops_sparse_features_list(
-        sparse_features_list)
-    return [
-        self._generate_enqueue_op(
-            sparse_features, device_ordinal=i % self._num_cores_per_host)
-        for i, sparse_features in enumerate(sparse_features_list)
-    ]
-
-  def _validate_generate_enqueue_ops_sparse_features_list(
-      self, sparse_features_list):
-    """Validate `sparse_features_list`."""
-    if len(sparse_features_list) != self._num_cores:
-      raise ValueError('Length of `sparse_features_list` should match the '
-                       'number of cores; '
-                       '`len(sparse_features_list)` is {}, '
-                       'number of cores is {}.'.format(
-                           len(sparse_features_list), self._num_cores))
-
-    feature_set = set(self._feature_to_table_dict.keys())
-    contiguous_device = None
-    for i, sparse_features in enumerate(sparse_features_list):
-      used_feature_set = set(sparse_features.keys())
-
-      # Check features are valid.
-      missing_feature_set = feature_set - used_feature_set
-      if missing_feature_set:
-        raise ValueError('`sparse_features_list[{}]` misses a feature that is '
-                         'in `feature_to_config_dict`: {}.'.format(
-                             i, missing_feature_set))
-
-      extra_feature_set = used_feature_set - feature_set
-      if extra_feature_set:
-        raise ValueError('`sparse_features_list[{}]` has a feature that is not '
-                         'in `feature_to_config_dict`: {}.'.format(
-                             i, extra_feature_set))
-
-      device = None
-      device_feature = None
-      for feature, tensor in six.iteritems(sparse_features):
-        if not isinstance(tensor, sparse_tensor.SparseTensor):
-          raise ValueError('`sparse_features_list[{}]` has a feature that is '
-                           'not mapped to `SparseTensor`. '
-                           '`feature`: {}, type: {}'.format(
-                               i, feature, type(tensor)))
-
-        # Check all features are on the same device.
-        if device is None:
-          device = tensor.op.device
-          device_feature = feature
-        else:
-          if device != tensor.op.device:
-            raise ValueError('Devices are different between features in '
-                             '`sparse_features_list[{}]`; '
-                             'devices: {}, {}; features: {}, {}.'.format(
-                                 i, device, tensor.op.device, feature,
-                                 device_feature))
-
-      if i % self._num_cores_per_host:
-        if device != contiguous_device:
-          raise ValueError('We expect the `sparse_features` which are on the '
-                           'same host to be contiguous in '
-                           '`sparse_features_list`, '
-                           '`sparse_features_list[{}]` is on device {}, '
-                           'but is expected to be on device {}.'.format(
-                               i, device, contiguous_device))
-      else:
-        contiguous_device = device
-
-  def _generate_enqueue_op(self, sparse_features, device_ordinal):
-    with ops.colocate_with(list(sparse_features.values())[0]):
-      sample_idcs, embedding_idcs, aggregation_weights = (
-          self._format_for_tpu_embedding_sparse_batch(sparse_features))
-      return tpu_ops.enqueue_tpu_embedding_sparse_batch(
-          sample_idcs,
-          embedding_idcs,
-          aggregation_weights,
-          combiners=self._combiners,
-          device_ordinal=device_ordinal)
-
-  def _format_for_tpu_embedding_sparse_batch(self, sparse_features):
-    """Format sparse features for `enqueue_tpu_embedding_sparse_batch()`.
-
-    Args:
-      sparse_features: a `Dict` of `SparseTensor`s for embedding.
-
-    Returns:
-      Arguments for `enqueue_tpu_embedding_sparse_batch()`.
-    """
-
-    sample_idcs, embedding_idcs, aggregation_weights = list(), list(), list()
-    for table in self._table_to_features_dict:
-      sample_t, indices_t, weights_t = list(), list(), list()
-
-      features = self._table_to_features_dict[table]
-      for i, feature in enumerate(features):
-        tensor = sparse_features[feature]
-        sample_indices = tensor.indices[:, 0]
-        embedding_indices = tensor.values
-        weights = array_ops.ones_like(embedding_indices)
-        sample_t.append(i * self._batch_size_per_core + sample_indices)
-        indices_t.append(embedding_indices)
-        weights_t.append(weights)
-
-      sample_idcs.append(
-          math_ops.cast(array_ops.concat(sample_t, axis=0), dtype=dtypes.int32))
-      embedding_idcs.append(
-          math_ops.cast(
-              array_ops.concat(indices_t, axis=0), dtype=dtypes.int32))
-      aggregation_weights.append(
-          math_ops.cast(
-              array_ops.concat(weights_t, axis=0), dtype=dtypes.float32))
-
-    return sample_idcs, embedding_idcs, aggregation_weights
-
-  def get_activations(self):
-    """Get activations for features.
-
-    This should be called within `computation` that is passed to
-      `tpu.replicate` and friends.
-
-    Returns:
-      A dictionary mapping from `String` of feature name to `Tensor`
-        of activation.
-    """
-    recv_activations = tpu_ops.recv_tpu_embedding_activations(
-        num_outputs=len(self._table_to_config_dict),
-        config=self._config_proto.SerializeToString())
-
-    activations = collections.OrderedDict()
-    for table_id, table in enumerate(self._table_to_features_dict):
-      features = self._table_to_features_dict[table]
-      for lookup_id, feature in enumerate(features):
-        start_row = lookup_id * self._batch_size_per_core
-        end_row = start_row + self._batch_size_per_core
-        activations[feature] = gen_tpu_ops.tpu_embedding_activations(
-            self._dummy_table_variables[table_id],
-            recv_activations[table_id][start_row:end_row, :],
-            table_id=table_id,
-            lookup_id=lookup_id)
-    return activations
-
-  # TODO(shizhiw): Make `gradient_multiplier` per feature. Setting it to 0 would
-  # have the effect of `tf.stop_gradients()`.
-  # TODO(shizhiw): Consider alternative ways to capture gradients wrt embedding
-  # layer outputs to remove `_dummy_table_variables`,
-  # `_embedding_activation_grad` and `tpu_embedding_gradients_table_%d'.
-  def generate_send_gradients_op(self, gradient_multipliers=None):
-    """Retrieve gradients from collections and send them to TPU embedding.
-
-    Args:
-      gradient_multipliers: None, or dict mapping table names to gradient
-        multiplier Tensors.
-
-    Returns:
-      SendTPUEmbeddingGradients Op.
-
-    Raises:
-      ValueError: If required gradients have not been defined.
-      RuntimeError: If `mode` is not `TRAINING`.
-    """
-    if self._mode != TRAINING:
-      raise RuntimeError('Only in training mode gradients need to '
-                         'be sent to TPU embedding; got mode {}.'
-                         .format(self._mode))
-
-    g = ops.get_default_graph()
-    gradients = list()
-    for table_id, table in enumerate(self._table_to_config_dict):
-      table_gradients = g.get_collection(
-          'tpu_embedding_gradients_table_%d' % table_id)
-      if any(gradient is None for gradient in table_gradients):
-        raise ValueError(
-            'Table {}/{} has undefined gradients: this is probably because the '
-            'model asked TPUEmbedding to compute activations that were not '
-            'used.'.format(table_id, table))
-      concat_table_grads = array_ops.concat(table_gradients, axis=0)
-      if gradient_multipliers is not None:
-        concat_table_grads *= gradient_multipliers[table.name]
-      gradients.append(concat_table_grads)
-
-    return tpu_ops.send_tpu_embedding_gradients(
-        inputs=gradients, config=self.config_proto.SerializeToString())
-
-
-def _validate_table_to_config_dict(table_to_config_dict):
-  """Validate `table_to_config_dict`."""
-  for k, v in six.iteritems(table_to_config_dict):
-    if not isinstance(v, TableConfig):
-      raise ValueError('Value of `table_to_config_dict` must be of type '
-                       '`TableConfig`, got {} for {}.'.format(type(v), k))
-
-
-def _validate_feature_to_table_dict(table_to_config_dict,
-                                    feature_to_table_dict):
-  """Validate `feature_to_table_dict`."""
-  used_table_set = set(feature_to_table_dict.values())
-  table_set = set(table_to_config_dict.keys())
-
-  unused_table_set = table_set - used_table_set
-  if unused_table_set:
-    raise ValueError('`table_to_config_dict` specifies table that is not '
-                     'used in `feature_to_table_dict`: {}.'
-                     .format(unused_table_set))
-
-  extra_table_set = used_table_set - table_set
-  if extra_table_set:
-    raise ValueError('`feature_to_table_dict` refers to a table that is not '
-                     'specified in `table_to_config_dict`: {}.'
-                     .format(extra_table_set))
-
-
-def _validate_batch_size(batch_size, num_cores):
-  if batch_size % num_cores:
-    raise ValueError('`batch_size` is not a multiple of number of '
-                     'cores. `batch_size`={}, `_num_cores`={}.'.format(
-                         batch_size, num_cores))
-
-
-def _validate_optimization_parameters(optimization_parameters):
-  if not isinstance(optimization_parameters, _OptimizationParameters):
-    raise ValueError('`optimization_parameters` must inherit from '
-                     '`_OptimizationPramaters`. '
-                     '`type(optimization_parameters)`={}'.format(
-                         type(optimization_parameters)))
-
-
-class _OptimizerHandler(object):
-  """Interface class for handling optimizer specific logic."""
-
-  def __init__(self, optimization_parameters):
-    self._optimization_parameters = optimization_parameters
-
-  def set_optimization_parameters(self, table_descriptor):
-    raise NotImplementedError()
-
-  def create_variables_and_ops(self, table, variable_name):
-    raise NotImplementedError()
-
-  def get_slot_names(self):
-    raise NotImplementedError()
-
-  def get_slot(self, table, name):
-    raise NotImplementedError()
-
-
-class _AdagradHandler(_OptimizerHandler):
-  """Handles Adagrad specific logic."""
-
-  def __init__(self, optimization_parameters):
-    super(_AdagradHandler, self).__init__(optimization_parameters)
-    self._table_to_accumulator_variables_dict = {}
-
-  def set_optimization_parameters(self, table_descriptor):
-    table_descriptor.optimization_parameters.adagrad.SetInParent()
-
-  def create_variables_and_ops(self, table, variable_name, num_hosts,
-                               table_config, table_variables,
-                               load_parameters_ops, retrieve_parameters_ops):
-    optimizer_name = 'Adagrad'
-    accumulator_initializer = init_ops.constant_initializer(
-        self._optimization_parameters.initial_accumulator)
-    accumulator_variables = _create_partitioned_variables(
-        name='%s/%s' % (variable_name, optimizer_name),
-        num_hosts=num_hosts,
-        vocabulary_size=table_config.vocabulary_size,
-        embedding_dimension=table_config.dimension,
-        collections=[ops.GraphKeys.GLOBAL_VARIABLES],
-        initializer=accumulator_initializer)
-
-    self._table_to_accumulator_variables_dict[table] = accumulator_variables
-    for host_id, table_variable, accumulator_variable in (zip(
-        range(num_hosts), table_variables, accumulator_variables)):
-      with ops.colocate_with(table_variable):
-        load_parameters_op = (
-            tpu_ops.load_tpu_embedding_adagrad_parameters(
-                parameters=table_variable,
-                accumulators=accumulator_variable,
-                table_name=table,
-                num_shards=num_hosts,
-                shard_id=host_id))
-        retrieved_table, retrieved_accumulator = (
-            tpu_ops.retrieve_tpu_embedding_adagrad_parameters(
-                table_name=table,
-                num_shards=num_hosts,
-                shard_id=host_id))
-        retrieve_parameters_op = control_flow_ops.group(
-            state_ops.assign(table_variable, retrieved_table),
-            state_ops.assign(accumulator_variable, retrieved_accumulator))
-
-      load_parameters_ops.append(load_parameters_op)
-      retrieve_parameters_ops.append(retrieve_parameters_op)
-
-  def get_slot_names(self):
-    return ['accumulator']
-
-  def get_slot(self, table, name):
-    if name not in self.get_slot_names():
-      raise ValueError('Adagrad has {} as slot names; got {}.'
-                       .format(self.get_slot_names(), name))
-    return self._table_to_accumulator_variables_dict[table]
-
-
-class _AdamHandler(_OptimizerHandler):
-  """Handles Adam specific logic."""
-
-  def __init__(self, optimization_parameters):
-    super(_AdamHandler, self).__init__(optimization_parameters)
-    self._table_to_m_variables_dict = {}
-    self._table_to_v_variables_dict = {}
-
-  def set_optimization_parameters(self, table_descriptor):
-    table_descriptor.optimization_parameters.adam.beta1 = (
-        self._optimization_parameters.beta1)
-    table_descriptor.optimization_parameters.adam.beta2 = (
-        self._optimization_parameters.beta2)
-    table_descriptor.optimization_parameters.adam.epsilon = (
-        self._optimization_parameters.epsilon)
-    table_descriptor.optimization_parameters.adam.use_non_lazy_adam = (
-        not self._optimization_parameters.lazy_adam)
-    table_descriptor.optimization_parameters.adam.use_sum_inside_sqrt = (
-        self._optimization_parameters.sum_inside_sqrt)
-
-  def create_variables_and_ops(self, table, variable_name, num_hosts,
-                               table_config, table_variables,
-                               load_parameters_ops, retrieve_parameters_ops):
-    optimizer_name = 'Adam'
-    m_initializer = init_ops.zeros_initializer()
-    m_variables = _create_partitioned_variables(
-        name='%s/%s/m' % (variable_name, optimizer_name),
-        num_hosts=num_hosts,
-        vocabulary_size=table_config.vocabulary_size,
-        embedding_dimension=table_config.dimension,
-        collections=[ops.GraphKeys.GLOBAL_VARIABLES],
-        initializer=m_initializer)
-    v_initializer = init_ops.zeros_initializer()
-    v_variables = _create_partitioned_variables(
-        name='%s/%s/v' % (variable_name, optimizer_name),
-        num_hosts=num_hosts,
-        vocabulary_size=table_config.vocabulary_size,
-        embedding_dimension=table_config.dimension,
-        collections=[ops.GraphKeys.GLOBAL_VARIABLES],
-        initializer=v_initializer)
-
-    self._table_to_m_variables_dict[table] = m_variables
-    self._table_to_v_variables_dict[table] = v_variables
-
-    for host_id, table_variable, m_variable, v_variable in (zip(
-        range(num_hosts), table_variables,
-        m_variables, v_variables)):
-      with ops.colocate_with(table_variable):
-        load_parameters_op = (
-            tpu_ops.load_tpu_embedding_adam_parameters(
-                parameters=table_variable,
-                momenta=m_variable,
-                velocities=v_variable,
-                table_name=table,
-                num_shards=num_hosts,
-                shard_id=host_id))
-        retrieved_table, retrieved_m, retrieved_v = (
-            tpu_ops.retrieve_tpu_embedding_adam_parameters(
-                table_name=table,
-                num_shards=num_hosts,
-                shard_id=host_id))
-        retrieve_parameters_op = control_flow_ops.group(
-            state_ops.assign(table_variable, retrieved_table),
-            state_ops.assign(m_variable, retrieved_m),
-            state_ops.assign(v_variable, retrieved_v))
-
-      load_parameters_ops.append(load_parameters_op)
-      retrieve_parameters_ops.append(retrieve_parameters_op)
-
-  def get_slot_names(self):
-    return ['m', 'v']
-
-  def get_slot(self, table, name):
-    if name == 'm':
-      return self._table_to_m_variables_dict[table]
-    elif name == 'v':
-      return self._table_to_v_variables_dict[table]
-    else:
-      raise ValueError('Adam has {} as slot names; got {}.'
-                       .format(self.get_slot_names(), name))
-
-
-class _StochasticGradientDescentHandler(_OptimizerHandler):
-  """Handles stochastic gradient descent specific logic."""
-
-  def set_optimization_parameters(self, table_descriptor):
-    (table_descriptor.optimization_parameters.stochastic_gradient_descent
-     .SetInParent())
-
-  def create_variables_and_ops(self, table, variable_name, num_hosts,
-                               table_config, table_variables,
-                               load_parameters_ops, retrieve_parameters_ops):
-    del table_config
-
-    for host_id, table_variable in (zip(
-        range(num_hosts), table_variables)):
-      with ops.colocate_with(table_variable):
-        load_parameters_op = (
-            tpu_ops
-            .load_tpu_embedding_stochastic_gradient_descent_parameters(
-                parameters=table_variable,
-                table_name=table,
-                num_shards=num_hosts,
-                shard_id=host_id))
-        retrieved_table = (
-            tpu_ops
-            .retrieve_tpu_embedding_stochastic_gradient_descent_parameters(
-                table_name=table,
-                num_shards=num_hosts,
-                shard_id=host_id))
-        retrieve_parameters_op = control_flow_ops.group(
-            state_ops.assign(table_variable, retrieved_table))
-
-      load_parameters_ops.append(load_parameters_op)
-      retrieve_parameters_ops.append(retrieve_parameters_op)
-
-  def get_slot_names(self):
-    return []
-
-  def get_slot(self, table, name):
-    raise ValueError('Stochastic gradient descent does not have slot variable.')
-
-
-def _get_optimization_handler(optimization_parameters):
-  if isinstance(optimization_parameters, AdagradParameters):
-    return _AdagradHandler(optimization_parameters)
-  elif isinstance(optimization_parameters, AdamParameters):
-    return _AdamHandler(optimization_parameters)
-  elif isinstance(optimization_parameters, StochasticGradientDescentParameters):
-    return _StochasticGradientDescentHandler(optimization_parameters)
-  else:
-    return NotImplementedError()
-
-
-def _create_ordered_dict(d):
-  """Create an OrderedDict from Dict."""
-  return collections.OrderedDict((k, d[k]) for k in sorted(d))
-
-
-def _create_combiners(table_to_config_dict):
-  return [table_to_config_dict[t].combiner for t in table_to_config_dict]
-
-
-def _create_table_to_features_dict(feature_to_table_dict):
-  """Create mapping from table to a list of its features."""
-  table_to_features_dict_tmp = {}
-  for feature, table in six.iteritems(feature_to_table_dict):
-    if table in table_to_features_dict_tmp:
-      table_to_features_dict_tmp[table].append(feature)
-    else:
-      table_to_features_dict_tmp[table] = [feature]
-
-  table_to_features_dict = collections.OrderedDict()
-  for table in sorted(table_to_features_dict_tmp):
-    table_to_features_dict[table] = sorted(table_to_features_dict_tmp[table])
-  return table_to_features_dict
-
-
-def _create_device_fn(hosts):
-  """Create device_fn() to use with _create_partitioned_variables()."""
-
-  def device_fn(op):
-    """Returns the `device` for `op`."""
-    part_match = re.match(r'.*/part_(\d+)(/|$)', op.name)
-
-    if part_match:
-      idx = int(part_match.group(1))
-    else:
-      raise RuntimeError('Internal Error: '
-                         'Expected %s to contain /part_*.' % op.name)
-
-    device = hosts[idx]
-    return device
-
-  return device_fn
-
-
-def _create_partitioned_variables(name,
-                                  num_hosts,
-                                  vocabulary_size,
-                                  embedding_dimension,
-                                  initializer,
-                                  collections=None):  # pylint: disable=redefined-outer-name
-  """Creates ParitionedVariables based on `num_hosts` for `table`."""
-  # TODO(shizhiw): automatically place embedding lookup elsewhere?
-  if vocabulary_size < num_hosts:
-    raise ValueError('`vocabulary_size`({}) is smaller than `num_hosts`({}). '
-                     'As TPU embedding is not optimized for small tables, '
-                     'please consider other ways for this embedding lookup.')
-
-  return list(variable_scope.get_variable(
-      name,
-      shape=(vocabulary_size, embedding_dimension),
-      partitioner=partitioned_variables.fixed_size_partitioner(num_hosts),
-      dtype=dtypes.float32,
-      initializer=initializer,
-      collections=collections,
-      trainable=False))
-
-
-@ops.RegisterGradient('TPUEmbeddingActivations')
-def _embedding_activations_grad(activations_op, grad_wrt_activations):
-  """Saves the gradient of embedding activations ops in a graph collection."""
-  g = ops.get_default_graph()
-  table_id = activations_op.get_attr('table_id')
-  lookup_id = activations_op.get_attr('lookup_id')
-  table_gradients = g.get_collection_ref(
-      'tpu_embedding_gradients_table_%d' % table_id)
-
-  if not table_gradients:
-    raise RuntimeError(
-        'Gradients for TPUEmbedding have been generated in non-training mode. '
-        'This is not expected. Consider putting your Optimizer.minimize code '
-        'behind the training mode condition check. For Estimator, you can '
-        'do \n\n'
-        '    if mode == tf.estimator.ModeKeys.TRAIN:\n'
-        '        train_op = opt.minimize(loss)\n'
-        '\n')
-
-  table_gradients[lookup_id] = array_ops.identity(grad_wrt_activations)
-  return [
-      # RegisterGradient requires that value be returned for all inputs. Since
-      # the first argument (tpu_gradient_variable_{table_name}) has shape [1],
-      # we will return zeros(shape=[1]). The actual gradient w.r.t. the
-      # embedding activations (grad_wrt_activations) has the same shape as the
-      # activations returned by  embedding_activations.
-      array_ops.zeros(arg.shape, dtype=dtypes.float32)
-      for arg in activations_op.inputs
-  ]
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.tpu_embedding import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_embedding_gradient.py b/tensorflow/contrib/tpu/python/tpu/tpu_embedding_gradient.py
new file mode 100644
index 0000000000000000000000000000000000000000..308adc77e9ad2d912d0461512655b55faa53da60
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_embedding_gradient.py
@@ -0,0 +1,23 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.tpu_embedding_gradient import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index 96b9556e137effcaaa5916b9723142f737a6dc33..893118412e1363ce50416e6ef36692bc23d04179 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -1,3468 +1,33 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ===================================================================
-"""TPUEstimator class."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-import copy
-import os
-import signal
-import sys
-import threading
-import time
-
-import numpy as np
-import six
-from six.moves import queue as Queue  # pylint: disable=redefined-builtin
-from six.moves import xrange  # pylint: disable=redefined-builtin
-
-from tensorflow.contrib.tpu.python.tpu import tensor_tracer
-from tensorflow.contrib.tpu.python.ops import tpu_ops
-from tensorflow.contrib.tpu.python.tpu import error_handling
-from tensorflow.contrib.tpu.python.tpu import session_support
-from tensorflow.contrib.tpu.python.tpu import tpu
-from tensorflow.contrib.tpu.python.tpu import tpu_config
-from tensorflow.contrib.tpu.python.tpu import tpu_context
-from tensorflow.contrib.tpu.python.tpu import tpu_feed
-from tensorflow.contrib.tpu.python.tpu import training_loop
-from tensorflow.contrib.tpu.python.tpu import util as util_lib
-from tensorflow.contrib.training.python.training import hparam
-from tensorflow.core.framework import variable_pb2
-from tensorflow.core.framework.summary_pb2 import Summary
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.client import session as tf_session
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.util import nest as data_nest
-from tensorflow.python.estimator import estimator as estimator_lib
-from tensorflow.python.estimator import model_fn as model_fn_lib
-from tensorflow.python.estimator.export import export_output as export_output_lib
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import summary_ops_v2 as contrib_summary
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.saved_model import tag_constants
-from tensorflow.python.summary import summary
-from tensorflow.python.training import basic_session_run_hooks
-from tensorflow.python.training import evaluation
-from tensorflow.python.training import session_run_hook
-from tensorflow.python.training import training
-from tensorflow.python.training import training_util
-from tensorflow.python.util import function_utils
-from tensorflow.python.util import nest
-from tensorflow.python.util import tf_inspect
-
-_INITIAL_LOSS = 1e7
-_ZERO_LOSS = 0.
-_TPU_ESTIMATOR = 'tpu_estimator'
-_ITERATIONS_PER_LOOP_VAR = 'iterations_per_loop'
-_BATCH_SIZE_KEY = 'batch_size'
-_CTX_KEY = 'context'
-_USE_TPU_KEY = 'use_tpu'
-_CROSS_REPLICA_SUM_OP = 'CrossReplicaSum'
-_ONE_GIGABYTE = 1024 * 1024 * 1024
-_TPU_ENQUEUE_OPS = '_tpu_enqueue_ops'
-_TPU_TRAIN_OP = '_tpu_train_op'
-_REWRITE_FOR_INFERENCE_MODE = '_rewrite_for_inference'
-
-# Ideally _USE_TPU_KEY should be reserved as well. However there are already
-# models that make use of this key, thus it can not be reserved now to prevent
-# breakage. In the long run, we would like to mitigate this by migrating models
-# off of using _USE_TPU_KEY.
-_RESERVED_PARAMS_KEYS = [_BATCH_SIZE_KEY, _CTX_KEY]
-
-# TODO(b/65703635): Flip the value and remove all dead code. Currently, this is
-# only used for per-core based deployments. For per-host based pipelines, if a
-# user returns a Dataset instance it will be automatically wrapped in a
-# tf.while_loop (This can be disabled by returning features and labels
-# explicitly).
-_WRAP_INPUT_FN_INTO_WHILE_LOOP = False
-
-ops.register_proto_function(
-    '{}_{}'.format(_TPU_ESTIMATOR, _ITERATIONS_PER_LOOP_VAR),
-    proto_type=variable_pb2.VariableDef,
-    to_proto=resource_variable_ops._to_proto_fn,  # pylint: disable=protected-access
-    from_proto=resource_variable_ops._from_proto_fn)  # pylint: disable=protected-access
-
-
-def _is_iterable(obj):
-  """A Python 2 and 3 compatible util to check whether `obj` is iterable."""
-  try:
-    iter(obj)
-    return True
-  except TypeError:
-    return False
-
-
-def _create_global_step(graph):
-  graph = graph or ops.get_default_graph()
-  if training.get_global_step(graph) is not None:
-    raise ValueError('"global_step" already exists.')
-  # Create in proper graph and base name_scope.
-  with graph.as_default() as g, g.name_scope(None):
-    return variable_scope.get_variable(
-        ops.GraphKeys.GLOBAL_STEP,
-        shape=[],
-        dtype=dtypes.int64,
-        initializer=init_ops.zeros_initializer(),
-        trainable=False,
-        use_resource=True,
-        collections=[ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.GLOBAL_STEP])
-
-
-def _create_or_get_iterations_per_loop():
-  """Creates or gets the iterations_per_loop variable.
-
-  In TPUEstimator, the user provided computation, the model_fn, is wrapped
-  inside a tf.while_loop for peak performance. The iterations of the loop are
-  specified by this variable, which adjusts its value on the CPU after each TPU
-  program execution and before the next TPU execution.
-
-  The purpose of using a variable, rather then a constant, is to allow
-  TPUEstimator adapt the TPU training iterations according to the final steps
-  specified by users. For example, if the user sets the iterations_per_loop as 4
-  in TPUConfig and steps as 10 in TPUEstimator.train(), the iterations_per_loop
-  variable will have the following value before each TPU training.
-
-      - 1-th TPU execution: iterations_per_loop = 4
-      - 2-th TPU execution: iterations_per_loop = 4
-      - 3-th TPU execution: iterations_per_loop = 2
-
-  As model_fn increases the global step once per train_op invocation, the global
-  step is 10 after all TPU executions, matching the steps=10 inputs passed in by
-  users.
-
-  Returns:
-    A TF non-trainable resource variable.
-
-  Raises:
-    RuntimeError: If multi iterations_per_loop variables were found.
-  """
-  graph = ops.get_default_graph()
-  collection_name = '{}_{}'.format(_TPU_ESTIMATOR, _ITERATIONS_PER_LOOP_VAR)
-  iter_vars = graph.get_collection(collection_name)
-  if len(iter_vars) == 1:
-    return iter_vars[0]
-  elif len(iter_vars) > 1:
-    raise RuntimeError('Multiple iterations_per_loop_var in collection.')
-
-  with ops.colocate_with(training_util.get_global_step()):
-    with variable_scope.variable_scope(
-        _TPU_ESTIMATOR, reuse=variable_scope.AUTO_REUSE):
-      return variable_scope.get_variable(
-          _ITERATIONS_PER_LOOP_VAR,
-          initializer=init_ops.zeros_initializer(),
-          shape=[],
-          dtype=dtypes.int32,
-          trainable=False,
-          collections=[collection_name, ops.GraphKeys.LOCAL_VARIABLES],
-          use_resource=True)
-
-
-def _sync_variables_ops(ctx):
-  """Create varriables synchronization ops.
-
-  Gets the variables back from TPU nodes. This means the variables updated
-  by TPU will now be *synced* to host memory.
-  In BROADCAST mode, we skip this sync since the variables are ususally too
-  big to transmit via RPC.
-
-  Args:
-    ctx: A `_InternalTPUContext` instance with mode.
-
-  Returns:
-    A list of sync ops.
-  """
-
-  if not ctx.is_input_broadcast_with_iterators():
-    return [
-        array_ops.check_numerics(v.read_value(),
-                                 'Gradient for %s is NaN' % v.name).op
-        for v in variables.trainable_variables()
-    ]
-  else:
-    return [control_flow_ops.no_op()]
-
-
-def _increase_eval_step_op(iterations_per_loop):
-  """Returns an op to increase the eval step for TPU evaluation.
-
-  Args:
-    iterations_per_loop: Tensor. The number of eval steps running in TPU system
-      before returning to CPU host for each `Session.run`.
-
-  Returns:
-    An operation
-  """
-  eval_step = evaluation._get_or_create_eval_step()  # pylint: disable=protected-access
-  # Estimator evaluate increases 1 by default. So, we increase the difference.
-  return state_ops.assign_add(
-      eval_step,
-      math_ops.cast(iterations_per_loop - 1, dtype=eval_step.dtype),
-      use_locking=True)
-
-
-def _extract_key_names(tensor_or_dict):
-  if isinstance(tensor_or_dict, dict):
-    return sorted(tensor_or_dict.keys())
-  return []
-
-
-class _SIGNAL(object):
-  """Signal used to control the thread of infeed/outfeed.
-
-  All preserved signals must be negative numbers. Positive numbers are used to
-  indicate the number of iterations for next training/evaluation loop.
-  """
-  NEXT_BATCH = -1
-  STOP = -2
-
-
-class TPUEstimatorSpec(model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
-  """Ops and objects returned from a `model_fn` and passed to `TPUEstimator`.
-
-  See `EstimatorSpec` for `mode`, `predictions`, `loss`, `train_op`, and
-  `export_outputs`.
-
-  For evaluation, `eval_metrics `is a tuple of `metric_fn` and `tensors`, where
-  `metric_fn` runs on CPU to generate metrics and `tensors` represents the
-  `Tensor`s transferred from TPU system to CPU host and passed to `metric_fn`.
-  To be precise, TPU evaluation expects a slightly different signature from the
-  `tf.estimator.Estimator`. While `EstimatorSpec.eval_metric_ops` expects a
-  dict, `TPUEstimatorSpec.eval_metrics` is a tuple of `metric_fn` and `tensors`.
-  The `tensors` could be a list of `Tensor`s or dict of names to `Tensor`s. The
-  `tensors` usually specify the model logits, which are transferred back from
-  TPU system to CPU host. All tensors must have be batch-major, i.e., the batch
-  size is the first dimension. Once all tensors are available at CPU host from
-  all shards, they are concatenated (on CPU) and passed as positional arguments
-  to the `metric_fn` if `tensors` is list or keyword arguments if `tensors` is
-  a dict. `metric_fn` takes the `tensors` and returns a dict from metric string
-  name to the result of calling a metric function, namely a `(metric_tensor,
-  update_op)` tuple. See `TPUEstimator` for MNIST example how to specify the
-  `eval_metrics`.
-
-  `scaffold_fn` is a function running on CPU to generate the `Scaffold`. This
-  function should not capture any Tensors in `model_fn`.
-
-  `host_call` is a tuple of a `function` and a list or dictionary of `tensors`
-  to pass to that function and returns a list of Tensors. `host_call` currently
-  works for train() and evaluate(). The Tensors returned by the function is
-  executed on the CPU on every step, so there is communication overhead when
-  sending tensors from TPU to CPU. To reduce the overhead, try reducing the
-  size of the tensors. The `tensors` are concatenated along their major (batch)
-  dimension, and so must be >= rank 1. The `host_call` is useful for writing
-  summaries with `tf.contrib.summary.create_file_writer`.
-  """
-
-  def __new__(cls,
-              mode,
-              predictions=None,
-              loss=None,
-              train_op=None,
-              eval_metrics=None,
-              export_outputs=None,
-              scaffold_fn=None,
-              host_call=None,
-              training_hooks=None,
-              evaluation_hooks=None,
-              prediction_hooks=None):
-    """Creates a validated `TPUEstimatorSpec` instance."""
-    host_calls = {}
-    if eval_metrics is not None:
-      host_calls['eval_metrics'] = eval_metrics
-    if host_call is not None:
-      host_calls['host_call'] = host_call
-    _OutfeedHostCall.validate(host_calls)
-
-    training_hooks = tuple(training_hooks or [])
-    evaluation_hooks = tuple(evaluation_hooks or [])
-    prediction_hooks = tuple(prediction_hooks or [])
-
-    for hook in training_hooks + evaluation_hooks + prediction_hooks:
-      if not isinstance(hook, session_run_hook.SessionRunHook):
-        raise TypeError('All hooks must be SessionRunHook instances, given: {}'
-                        .format(hook))
-
-    return super(TPUEstimatorSpec, cls).__new__(
-        cls,
-        mode=mode,
-        predictions=predictions,
-        loss=loss,
-        train_op=train_op,
-        eval_metrics=eval_metrics,
-        export_outputs=export_outputs,
-        scaffold_fn=scaffold_fn,
-        host_call=host_call,
-        training_hooks=training_hooks,
-        evaluation_hooks=evaluation_hooks,
-        prediction_hooks=prediction_hooks)
-
-  def as_estimator_spec(self):
-    """Creates an equivalent `EstimatorSpec` used by CPU train/eval."""
-    host_calls = {}
-    if self.eval_metrics is not None:
-      host_calls['eval_metrics'] = self.eval_metrics
-    if self.host_call is not None:
-      host_calls['host_call'] = self.host_call
-    host_call_ret = _OutfeedHostCall.create_cpu_hostcall(host_calls)
-    eval_metric_ops = None
-    if self.eval_metrics is not None:
-      eval_metric_ops = host_call_ret['eval_metrics']
-    hooks = None
-    if self.host_call is not None:
-      hooks = [_OutfeedHostCallHook(host_call_ret['host_call'])]
-    hooks = tuple(hooks or [])
-    scaffold = self.scaffold_fn() if self.scaffold_fn else None
-    return model_fn_lib.EstimatorSpec(
-        mode=self.mode,
-        predictions=self.predictions,
-        loss=self.loss,
-        train_op=self.train_op,
-        eval_metric_ops=eval_metric_ops,
-        export_outputs=self.export_outputs,
-        scaffold=scaffold,
-        training_hooks=self.training_hooks + hooks,
-        evaluation_hooks=self.evaluation_hooks + hooks,
-        prediction_hooks=self.prediction_hooks + hooks)
-
-
-class _OpQueueContext(object):
-  """Manages work queue and thread for a infeed/outfeed thread."""
-
-  def __init__(self, name, target, args):
-    self._name = name
-    self._queue = Queue.Queue()
-    args = (self,) + args
-    self._thread = threading.Thread(name=name, target=target, args=args)
-    self._thread.daemon = True
-    self._thread.start()
-
-  def stop(self):
-    self._queue.put(_SIGNAL.STOP)
-
-  def send_next_batch_signal(self, iterations):
-    self._queue.put(iterations)
-
-  def read_iteration_counts(self):
-    while True:
-      iterations = self._queue.get(block=True)
-      logging.debug('%s read iterations %s', self._name, iterations)
-      if iterations == _SIGNAL.STOP:
-        logging.info('%s received shutdown signal, stopping.', self._name)
-        return
-      yield iterations
-
-  def join(self):
-    logging.info('Shutting down %s thread.', self._name)
-    self.stop()
-    self._thread.join()
-
-
-class _OpSignalOnceQueueContext(_OpQueueContext):
-  """Manages work queue and thread for a infeed/outfeed thread.
-
-  This subclass only signals once.
-  """
-
-  def __init__(self, name, target, args):
-    super(_OpSignalOnceQueueContext, self).__init__(name, target, args)
-    self._has_signaled = False
-
-  def send_next_batch_signal(self, iterations):
-    if not self._has_signaled:
-      self._queue.put(iterations)
-      self._has_signaled = True
-
-
-class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
-  """A Session hook setting up the TPU initialization, infeed, and outfeed.
-
-  This hook does two major things:
-  1. initialize and shutdown TPU system.
-  2. launch and join the threads for infeed enqueue and (optional) outfeed
-     dequeue.
-  """
-
-  def __init__(self,
-               ctx,
-               enqueue_ops,
-               dequeue_ops,
-               run_infeed_loop_on_coordinator=True,
-               rendezvous=None,
-               master=None,
-               session_config=None):
-    self._master_job = ctx.master_job
-    self._enqueue_ops = enqueue_ops
-    self._dequeue_ops = dequeue_ops
-    self._rendezvous = rendezvous
-    self._master = master
-    self._session_config = session_config
-    self._run_infeed_loop_on_coordinator = run_infeed_loop_on_coordinator
-    self._initial_infeed_sleep_secs = (
-        ctx.config.tpu_config.initial_infeed_sleep_secs)
-
-    self._feed_error = None
-    self._finished = False
-    self._should_initialize_tpu = True
-
-  def begin(self):
-    logging.info('TPU job name %s', self._master_job)
-    self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
-    self._init_ops = []
-    if self._should_initialize_tpu:
-      self._finalize_ops = [tpu.shutdown_system(job=self._master_job)]
-    else:
-      self._finalize_ops = []
-
-    summary_writer_init_ops = contrib_summary.summary_writer_initializer_op()
-    self._init_ops.extend(summary_writer_init_ops)
-    # Get all the writer resources from the initializer, so we know what to
-    # flush.
-    for op in summary_writer_init_ops:
-      self._finalize_ops.append(contrib_summary.flush(writer=op.inputs[0]))
-
-  def _run_infeed(self, queue_ctx, session):
-    logging.info('Starting infeed thread controller.')
-    if self._initial_infeed_sleep_secs:
-      logging.info('Infeed thread sleeping for %d seconds.',
-                   self._initial_infeed_sleep_secs)
-      time.sleep(self._initial_infeed_sleep_secs)
-      logging.info('Infeed thread starting after sleep')
-
-    with self._rendezvous.catch_errors(source='infeed', session=session):
-      if self._run_infeed_loop_on_coordinator:
-        for count, steps in enumerate(queue_ctx.read_iteration_counts()):
-          for i in xrange(steps):
-            logging.debug('Infeed enqueue for iteration (%d, %d)', count, i)
-            session.run(self._enqueue_ops)
-      else:
-        for _ in queue_ctx.read_iteration_counts():
-          session.run(self._enqueue_ops)
-      logging.info('Infeed thread finished, shutting down.')
-
-  def _run_outfeed(self, queue_ctx, session):
-    logging.info('Starting outfeed thread controller.')
-    with self._rendezvous.catch_errors(source='outfeed', session=session):
-      for count, steps in enumerate(queue_ctx.read_iteration_counts()):
-        for i in xrange(steps):
-          logging.debug('Outfeed dequeue for iteration (%d, %d)', count, i)
-          session.run(self._dequeue_ops)
-      logging.info('Outfeed thread finished, shutting down.')
-
-  def _create_infeed_controller(self, name, target, args):
-    return _OpQueueContext(name=name, target=target, args=args)
-
-  def after_create_session(self, session, coord):
-    if self._should_initialize_tpu:
-      logging.info('Init TPU system')
-      start = time.time()
-      with ops.Graph().as_default():
-        with tf_session.Session(
-            self._master, config=self._session_config) as sess:
-          sess.run(tpu.initialize_system(job=self._master_job))
-      logging.info('Initialized TPU in %d seconds', time.time() - start)
-
-    session.run(self._init_ops,
-                options=config_pb2.RunOptions(timeout_in_ms=5 * 60 * 1000))
-
-    self._infeed_controller = self._create_infeed_controller(
-        name='InfeedController', target=self._run_infeed, args=(session,))
-
-    self._outfeed_controller = _OpQueueContext(
-        name='OutfeedController', target=self._run_outfeed, args=(session,))
-
-    # Enable the worker watchdog to terminate workers on coordinator exit.
-    watchdog_timeout = int(os.environ.get('TF_TPU_WATCHDOG_TIMEOUT', '0'))
-    if watchdog_timeout > 0:
-      session_support.start_worker_watchdog(session,
-                                            shutdown_timeout=watchdog_timeout)
-
-  def before_run(self, run_context):
-    self._feed_error = None
-
-    iterations = run_context.session.run(self._iterations_per_loop_var)
-
-    logging.info('Enqueue next (%d) batch(es) of data to infeed.', iterations)
-    self._infeed_controller.send_next_batch_signal(iterations)
-
-    logging.info('Dequeue next (%d) batch(es) of data from outfeed.',
-                 iterations)
-    self._outfeed_controller.send_next_batch_signal(iterations)
-
-  def end(self, session):
-    self._finished = True
-    logging.info('Stop infeed thread controller')
-    self._infeed_controller.join()
-    self._rendezvous.record_done('infeed')
-
-    logging.info('Stop output thread controller')
-    self._outfeed_controller.join()
-    self._rendezvous.record_done('outfeed')
-
-    logging.info('Shutdown TPU system.')
-    session.run(self._finalize_ops)
-
-
-class TPUInfeedOutfeedSessionHookForPrediction(TPUInfeedOutfeedSessionHook):
-
-  def __init__(self, ctx, enqueue_ops, dequeue_ops, rendezvous=None,
-               master=None, session_config=None):
-    super(TPUInfeedOutfeedSessionHookForPrediction, self).__init__(
-        ctx,
-        enqueue_ops,
-        dequeue_ops,
-        run_infeed_loop_on_coordinator=False,
-        rendezvous=rendezvous,
-        master=master,
-        session_config=session_config)
-
-  def _create_infeed_controller(self, name, target, args):
-    return _OpSignalOnceQueueContext(name=name, target=target, args=args)
-
-
-class _TPUStopAtStepHook(session_run_hook.SessionRunHook):
-  """Hook that requests stop at a specified step.
-
-  This hook is similar to the `session_run_hook._StopAfterNEvalsHook` with
-  following differences for TPU training:
-
-  1. This hook sets the variable for iterations_per_loop, which is used by
-     `TPUInfeedOutfeedSessionHook` to control the iterations for infeed/outfeed.
-     As the hook execution order is not guaranteed, the variable update is
-     handled in `after_create_session` and `after_run` as
-     `TPUInfeedOutfeedSessionHook` reads the variable value in `before_run`.
-
-  2. For each training loop (session.run), the global step could be increased
-     multiple times on TPU. The global step tensor value will be explicitly read
-     again in `after_run` to ensure the latest value is retrieved to avoid race
-     condition.
-  """
-
-  def __init__(self, iterations, num_steps=None, last_step=None):
-    """Initializes a `StopAtStepHook`.
-
-    Args:
-      iterations: The number of iterations to run optimizer per training loop.
-      num_steps: Number of steps to execute.
-      last_step: Step after which to stop.
-
-    Raises:
-      ValueError: If one of the arguments is invalid.
-    """
-    if num_steps is None and last_step is None:
-      raise ValueError('One of num_steps or last_step must be specified.')
-    if num_steps is not None and last_step is not None:
-      raise ValueError('Only one of num_steps or last_step can be specified.')
-    self._num_steps = num_steps
-    self._last_step = last_step
-    self._iterations = iterations
-
-  def _next_iterations(self, global_step, last_step):
-    gap = last_step - global_step
-    return min(gap, self._iterations)
-
-  def begin(self):
-    self._global_step_tensor = training_util.get_global_step()
-    if self._global_step_tensor is None:
-      raise RuntimeError('Global step should be created.')
-
-    self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
-
-  def after_create_session(self, session, coord):
-    global_step = session.run(self._global_step_tensor)
-    if self._last_step is None:
-      self._last_step = global_step + self._num_steps
-
-    iterations = self._next_iterations(global_step, self._last_step)
-
-    self._iterations_per_loop_var.load(iterations, session=session)
-
-  def after_run(self, run_context, run_values):
-    # Global step cannot be retrieved via SessionRunArgs and before_run due to
-    # race condition.
-    global_step = run_context.session.run(self._global_step_tensor)
-    if global_step >= self._last_step:
-      run_context.request_stop()
-    else:
-      iterations = self._next_iterations(global_step, self._last_step)
-      self._iterations_per_loop_var.load(
-          iterations, session=run_context.session)
-
-
-class _SetEvalIterationsHook(session_run_hook.SessionRunHook):
-  """Hook that requests stop at a specified step."""
-
-  def __init__(self, num_steps):
-    """Initializes a `_SetEvalIterationsHook`.
-
-    Args:
-      num_steps: Number of steps to execute.
-    """
-    self._num_steps = num_steps
-
-  def begin(self):
-    self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
-
-  def after_create_session(self, session, coord):
-    self._iterations_per_loop_var.load(self._num_steps, session=session)
-
-
-class _StoppingPredictHook(session_run_hook.SessionRunHook):
-  """Hook that requests stop according to the stopping signal in prediction."""
-
-  def __init__(self, scalar_stopping_signal):
-    self._scalar_stopping_signal = scalar_stopping_signal
-
-  def begin(self):
-    self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
-
-  def after_create_session(self, session, coord):
-    # This is not necessary as we do not run infeed enqueue and outfeed dequeue
-    # in side threads for prediction model. But it makes the
-    # TPUInfeedOutfeedSessionHook prints nice message.
-    self._iterations_per_loop_var.load(1, session=session)
-
-  def before_run(self, run_context):
-    return session_run_hook.SessionRunArgs(self._scalar_stopping_signal)
-
-  def after_run(self, run_context, run_values):
-    _ = run_context
-    scalar_stopping_signal = run_values.results
-    if _StopSignals.should_stop(scalar_stopping_signal):
-      # NOTE(xiejw): In prediction, stopping signals are inserted for each
-      # batch. And we append one more batch to signal the system it should stop.
-      # The data flow might look like
-      #
-      #  batch   0: images, labels, stop = 0  (user provided)
-      #  batch   1: images, labels, stop = 0  (user provided)
-      #  ...
-      #  batch  99: images, labels, stop = 0  (user provided)
-      #  batch 100: images, labels, stop = 1  (TPUEstimator appended)
-      #
-      # where the final batch (id = 100) is appended by TPUEstimator, so we
-      # should drop it before returning the predictions to user.
-      # To achieve that, we throw the OutOfRangeError in after_run. Once
-      # Monitored Session sees this error in SessionRunHook.after_run, the
-      # "current" prediction, i.e., batch with id=100, will be discarded
-      # immediately
-      raise errors.OutOfRangeError(None, None, 'Stopped by stopping signal.')
-
-
-def generate_per_core_enqueue_ops_fn_for_host(
-    ctx, input_fn, inputs_structure_recorder, host_device, host_id):
-  """Generates infeed enqueue ops for per-core input_fn on a single host."""
-  captured_infeed_queue = _CapturedObject()
-  tpu_ordinal_function_impl = ctx.tpu_ordinal_function(host_id)
-
-  def enqueue_ops_fn():
-    """A fn returns enqueue_ops."""
-    num_cores_per_host = ctx.num_of_cores_per_host
-    per_host_sharded_inputs = []
-    for core_ordinal in range(num_cores_per_host):
-      with ops.name_scope('ordinal_%d' % (core_ordinal)):
-        user_context = tpu_context.TPUContext(
-            internal_ctx=ctx,
-            input_device=host_device,
-            invocation_index=host_id * ctx.num_of_cores_per_host + core_ordinal)
-        inputs = _Inputs.from_input_fn(input_fn(user_context))
-        if inputs.is_dataset:
-          raise TypeError(
-              '`input_fn` returning `Dataset`  is not yet supported in '
-              'per-Core input pipeline deployment yet. Please set '
-              'TPUConfig.per_host_input_for_training to True or return '
-              '`features` and `labels` from `input_fn`')
-        features, labels = inputs.features_and_labels()
-
-        inputs_structure_recorder.validate_and_record_structure(
-            features, labels)
-        flattened_inputs = (
-            inputs_structure_recorder.flatten_features_and_labels(
-                features, labels))
-        per_host_sharded_inputs.append(flattened_inputs)
-
-    infeed_queue = tpu_feed.InfeedQueue(
-        number_of_tuple_elements=len(per_host_sharded_inputs[0]))
-    captured_infeed_queue.capture(infeed_queue)
-
-    per_host_enqueue_ops = infeed_queue.generate_enqueue_ops(
-        per_host_sharded_inputs, tpu_ordinal_function=tpu_ordinal_function_impl)
-    return per_host_enqueue_ops
-
-  return enqueue_ops_fn, captured_infeed_queue
-
-
-def generate_per_host_enqueue_ops_fn_for_host(
-    ctx, input_fn, inputs_structure_recorder, batch_axis, device, host_id):
-  """Generates infeed enqueue ops for per-host input_fn on a single host."""
-  captured_infeed_queue = _CapturedObject()
-
-  dataset_initializer = None
-
-  with ops.device(device):
-    user_context = tpu_context.TPUContext(
-        internal_ctx=ctx, input_device=device, invocation_index=host_id)
-    inputs = _Inputs.from_input_fn(input_fn(user_context))
-
-    is_dataset = inputs.is_dataset
-    if ctx.mode == model_fn_lib.ModeKeys.PREDICT:
-      if not is_dataset:
-        raise TypeError(
-            'For mode PREDICT, `input_fn` must return `Dataset` instead of '
-            '`features` and `labels`.')
-      if batch_axis is not None:
-        raise TypeError('For mode PREDICT, batch_axis is not supported yet.')
-      inputs = _InputsWithStoppingSignals(
-          dataset=inputs.dataset,
-          batch_size=ctx.batch_size_for_input_fn,
-          add_padding=True)
-
-    if is_dataset:
-      dataset_initializer = inputs.dataset_initializer()
-
-    tpu_ordinal_function_impl = ctx.tpu_ordinal_function(host_id)
-
-  def enqueue_ops_fn():
-    """A Fn returning the TPU infeed enqueue ops.
-
-    By providing as a Fn, it can be invoked inside the tf.while_loop such that
-    the input pipeline for multiple iterations can be executed by one
-    Session.run call.
-
-    Returns:
-      list of dict of ops.
-    """
-    with ops.device(device):
-      num_of_replicas_per_host = ctx.num_of_replicas_per_host
-      # Convert user input to features and labels.  If the user returns a
-      # dataset, it is initialized and the features and labels extracted via
-      # `dataset.iterator.get_next()`
-      features, labels = inputs.features_and_labels()
-      signals = inputs.signals()
-
-      inputs_structure_recorder.validate_and_record_structure(features, labels)
-      unsharded_tensor_list = (
-          inputs_structure_recorder.flatten_features_and_labels(
-              features, labels, signals))
-
-      infeed_queue = tpu_feed.InfeedQueue(
-          tuple_types=[t.dtype for t in unsharded_tensor_list],
-          tuple_shapes=[t.shape for t in unsharded_tensor_list],
-          shard_dimensions=batch_axis)
-      captured_infeed_queue.capture(infeed_queue)
-      infeed_queue.set_number_of_shards(num_of_replicas_per_host)
-      per_host_enqueue_ops = (
-          infeed_queue.split_inputs_and_generate_enqueue_ops(
-              unsharded_tensor_list,
-              placement_function=lambda x: device,
-              tpu_ordinal_function=tpu_ordinal_function_impl))
-      if signals is None:
-        return per_host_enqueue_ops
-      else:
-        return {
-            'ops': per_host_enqueue_ops,
-            'signals': signals,
-        }
-
-  return enqueue_ops_fn, captured_infeed_queue, dataset_initializer
-
-
-def generate_per_host_v2_enqueue_ops_fn_for_host(
-    ctx, input_fn, inputs_structure_recorder, device, host_id):
-  """Generates infeed enqueue ops for per-host input_fn on a single host."""
-  captured_infeed_queue = _CapturedObject()
-  dataset_initializer = None
-
-  with ops.device(device):
-    user_context = tpu_context.TPUContext(
-        internal_ctx=ctx, input_device=device, invocation_index=host_id)
-    inputs = _Inputs.from_input_fn(input_fn(user_context))
-
-    is_dataset = inputs.is_dataset
-    if not is_dataset:
-      raise TypeError('`input_fn` must return a `Dataset` for the PER_HOST_V2 '
-                      'input pipeline configuration.')
-
-    if ctx.mode == model_fn_lib.ModeKeys.PREDICT:
-      inputs = _InputsWithStoppingSignals(
-          dataset=inputs.dataset,
-          batch_size=ctx.batch_size_for_input_fn,
-          add_padding=True,
-          num_invocations_per_step=ctx.num_of_replicas_per_host)
-
-    dataset_initializer = inputs.dataset_initializer()
-    tpu_ordinal_function_impl = ctx.tpu_ordinal_function(host_id)
-
-  def enqueue_ops_fn():
-    """Generates the per_host enqueue ops."""
-    control_deps = []
-    per_host_sharded_inputs = []
-    num_replicas_per_host = ctx.num_of_replicas_per_host
-    cached_signals = None
-    with ops.device(device):
-      if not inputs.is_dataset:
-        raise TypeError('`input_fn` must return a `Dataset` for this mode.')
-      for _ in range(num_replicas_per_host):
-        # Use control dependencies to ensure a deterministic ordering.
-        with ops.control_dependencies(control_deps):
-          features, labels = inputs.features_and_labels()  # Calls get_next()
-          signals = inputs.signals()
-
-          # All the replicas share the replica 0's stopping singal.
-          # This avoids inconsistent state among different model replcias.
-          if cached_signals:
-            signals['stopping'] = cached_signals['stopping']
-          else:
-            cached_signals = signals
-
-        inputs_structure_recorder.validate_and_record_structure(
-            features, labels)
-        flattened_inputs = (
-            inputs_structure_recorder.flatten_features_and_labels(
-                features, labels, signals))
-        control_deps.extend(flattened_inputs)
-        per_host_sharded_inputs.append(flattened_inputs)
-
-      if inputs_structure_recorder.flattened_input_dims:
-        input_partition_dims = inputs_structure_recorder.flattened_input_dims
-        if signals:
-          input_partition_dims += [None] * len(signals)
-        # pylint: disable=protected-access
-        infeed_queue = tpu_feed._PartitionedInfeedQueue(
-            number_of_tuple_elements=len(per_host_sharded_inputs[0]),
-            host_id=host_id,
-            input_partition_dims=input_partition_dims,
-            device_assignment=ctx.device_assignment)
-        per_host_enqueue_ops = infeed_queue.generate_enqueue_ops(
-            per_host_sharded_inputs)
-      else:
-        infeed_queue = tpu_feed.InfeedQueue(
-            number_of_tuple_elements=len(per_host_sharded_inputs[0]))
-        per_host_enqueue_ops = infeed_queue.generate_enqueue_ops(
-            per_host_sharded_inputs,
-            tpu_ordinal_function=tpu_ordinal_function_impl)
-      captured_infeed_queue.capture(infeed_queue)
-
-    if signals is None:
-      return per_host_enqueue_ops
-    else:
-      return {
-          'ops': per_host_enqueue_ops,
-          'signals': signals,
-      }
-
-  return enqueue_ops_fn, captured_infeed_queue, dataset_initializer
-
-
-def generate_broadcast_enqueue_ops_fn(ctx, input_fn, inputs_structure_recorder,
-                                      num_hosts):
-  """Generates infeed enqueue ops for one input_fn on all the hosts."""
-  captured_infeed_queue = _CapturedObject()
-  dataset_initializer = None
-  device_0 = ctx.tpu_host_placement_function(host_id=0)
-  with ops.device(device_0):
-    user_context = tpu_context.TPUContext(
-        internal_ctx=ctx, input_device=device_0, invocation_index=0)
-    inputs = _Inputs.from_input_fn(input_fn(user_context))
-
-    is_dataset = inputs.is_dataset
-    if ctx.mode == model_fn_lib.ModeKeys.PREDICT:
-      if not is_dataset:
-        raise TypeError(
-            'For mode PREDICT, `input_fn` must return `Dataset` instead of '
-            '`features` and `labels`.')
-
-      inputs = _InputsWithStoppingSignals(
-          dataset=inputs.dataset,
-          batch_size=ctx.batch_size_for_input_fn,
-          add_padding=True)
-
-    if is_dataset:
-      dataset_initializer = inputs.dataset_initializer()
-    num_replicas_per_host = ctx.num_of_replicas_per_host
-
-  def tpu_ordinal_function_impl(replica_id):
-    if ctx.device_assignment:
-      return ctx.device_assignment.tpu_ordinal(replica=replica_id)
-    else:
-      return replica_id % num_replicas_per_host
-
-  def device_function_impl(replica_id):
-    return ctx.tpu_host_placement_function(replica_id=replica_id)
-
-  def enqueue_ops_fn():
-    """Generates enqueue ops for all the hosts."""
-    broadcasted_inputs = []
-    flattened_inputs = None  # Cache result from input_fn.
-    signals = None
-    for host_id in xrange(num_hosts):
-      with ops.device(ctx.tpu_host_placement_function(host_id=host_id)):
-        for _ in xrange(ctx.num_of_replicas_per_host):
-          # Note: input_fn is only called once at host 0 for the first replica.
-          # The features and labels returned from that invocation are
-          # broadcasted to other replicas(including the replicas on other
-          # hosts).
-          if flattened_inputs is None:
-            features, labels = inputs.features_and_labels()  # Calls get_next()
-            signals = inputs.signals()
-
-            inputs_structure_recorder.validate_and_record_structure(
-                features, labels)
-            flattened_inputs = (
-                inputs_structure_recorder.flatten_features_and_labels(
-                    features, labels, signals))
-          broadcasted_inputs.append(flattened_inputs)
-
-    infeed_queue = tpu_feed.InfeedQueue(
-        number_of_tuple_elements=len(broadcasted_inputs[0]))
-    captured_infeed_queue.capture(infeed_queue)
-    enqueue_ops = infeed_queue.generate_enqueue_ops(
-        broadcasted_inputs,
-        tpu_ordinal_function=tpu_ordinal_function_impl,
-        placement_function=device_function_impl)
-
-    if signals is None:
-      return enqueue_ops
-    else:
-      return {
-          'ops': enqueue_ops,
-          'signals': signals,
-      }
-
-  return enqueue_ops_fn, captured_infeed_queue, dataset_initializer
-
-
-class _InputPipeline(object):
-  """`_InputPipeline` handles invoking `input_fn` and piping to infeed queue.
-
-  `_InputPipeline` abstracts the per-core/per-host `input_fn` invocation from
-  call site.  To be precise, based on the configuration in
-  `_InternalTPUContext`,  it invokes `input_fn` for all cores (usually
-  multi-host TPU training) or for one host (usually for single-host TPU
-  evaluation), and sends all `features` and `labels` returned by `input_fn` to
-  TPU infeed. For per-core invocation, `features` and `labels` are piped to
-  infeed directly, one tuple for each core. For per-host invocation,  `features`
-  and `labels` are split at host (with respect to `batch_axis`) and piped to all
-  cores accordingly.
-
-  In addition, flatten/unflatten are handled by `_InputPipeline` also.  Model
-  inputs returned by the `input_fn` can have one of the following forms:
-  1. features
-  2. (features, labels)
-  3. ((arbitrarily nested structure of features), labels)
-
-  Internally, form 1 is reformed to `(features, None)` as features and labels
-  are passed separately to underlying methods. For TPU training, TPUEstimator
-  may expect multiple `features` and `labels` tuples one for each core.
-
-  TPUEstimator allows various different structures for inputs (namely `features`
-  and `labels`).  Both `features` and `labels` can be any nested sturcture
-  supported by TF nest (namely, dict, tuples, namedtuples or any nested
-  structure of such of Tensors).  `labels` could be `None` as well.
-
-  These are flattened before they are passed to the infeed/outfeed library
-  as that expectes flattend lists.
-  """
-
-  class InputsStructureRecorder(object):
-    """The recorder to record inputs structure."""
-
-    def __init__(self, input_partition_dims=None):
-      # Holds the structure of inputs
-      self._feature_structure = {}
-      self._flattened_input_dims = None
-
-      if input_partition_dims:
-        # This should have been validated in TPUConfig.
-        assert len(input_partition_dims) <= 2, 'must have 1 or 2 elements.'
-        if len(input_partition_dims) == 2:
-          self._feature_dims, self._label_dims = input_partition_dims
-        else:
-          self._feature_dims = input_partition_dims[0]
-          self._label_dims = None
-
-        assert self._feature_dims is not None, ('input_partition_dims[0] must '
-                                                'not be None')
-      else:
-        self._feature_dims = None
-        self._label_dims = None
-
-      # Internal state.
-      self._initialized = False
-
-    @property
-    def flattened_input_dims(self):
-      assert self._initialized, 'InputsStructureRecorder is not initialized.'
-      return self._flattened_input_dims
-
-    def has_labels(self):
-      return 'labels' in self._feature_structure
-
-    def _flatten_input_dims(self, feature_dims, feature_dims_names, label_dims,
-                            label_dims_names, label_names, has_labels):
-      """Flatten input dims with the same order as flattened input tensors."""
-      flattened_input_dims = []
-      if feature_dims_names:
-        # We need a fixed ordering for matching the tensors in features.
-        flattened_input_dims.extend(
-            [feature_dims[name] for name in feature_dims_names])
-      else:
-        flattened_input_dims.append(feature_dims)
-
-      if label_dims_names:
-        # We need a fixed ordering for matching the tensors in labels.
-        flattened_input_dims.extend(
-            [label_dims[name] for name in label_dims_names])
-      else:
-        if label_names:
-          num_tensors_in_label = len(label_names)
-        else:
-          num_tensors_in_label = int(has_labels)
-        # Setting `None` in input_partition_dims[1] will apply `None` to
-        # all the tensors in labels, regardless of internal structure.
-        flattened_input_dims.extend([label_dims] * num_tensors_in_label)
-
-      return flattened_input_dims
-
-    def validate_and_record_structure(self, features, labels):
-      """Validates and records the structure of `features` and `labels`."""
-      # Extract structure.
-      has_labels = labels is not None
-      feature_names = _extract_key_names(features)
-      label_names = _extract_key_names(labels)
-
-      if not self._initialized:
-        # Record structure.
-        self._initialized = True
-        if self._feature_dims is not None:
-          feature_dims_names = _extract_key_names(self._feature_dims)
-          if feature_dims_names != feature_names:
-            raise ValueError(
-                'TPUConfig.input_partition_dims[0] mismatched feature'
-                ' keys. Expected {}, got {}'.format(feature_names,
-                                                    feature_dims_names))
-
-          label_dims_names = _extract_key_names(self._label_dims)
-          if self._label_dims is not None and label_dims_names != label_names:
-            raise ValueError(
-                'TPUConfig.input_partition_dims[1] mismatched label'
-                ' keys. Expected {}, got {}'.format(label_names,
-                                                    label_dims_names))
-
-          self._flattened_input_dims = self._flatten_input_dims(
-              self._feature_dims, feature_dims_names, self._label_dims,
-              label_dims_names, label_names, has_labels)
-
-    def flatten_features_and_labels(self, features, labels, signals=None):
-      """Flattens the `features` and `labels` to a single tensor list."""
-      self._feature_structure['features'] = features
-      if labels is not None:
-        self._feature_structure['labels'] = labels
-      if signals is not None:
-        self._feature_structure['signals'] = signals
-      return data_nest.flatten(self._feature_structure)
-
-    def unflatten_features_and_labels(self, flattened_inputs):
-      """Restores the flattened inputs to original features and labels form.
-
-      Args:
-        flattened_inputs: Flattened inputs for each shard.
-
-      Returns:
-        A tuple of (`features`, `labels`), where `labels` could be None.
-        Each one, if present, should have identical structure (single tensor vs
-        dict) as the one returned by input_fn.
-
-      Raises:
-        ValueError: If the number of expected tensors from `flattened_inputs`
-          mismatches the recorded structure.
-      """
-
-      unflattened_inputs = data_nest.pack_sequence_as(self._feature_structure,
-                                                      flattened_inputs)
-      return _Inputs(
-          unflattened_inputs['features'],
-          unflattened_inputs.get('labels'),
-          signals=unflattened_inputs.get('signals'))
-
-  def __init__(self, input_fn, batch_axis, ctx):
-    """Constructor.
-
-    Args:
-      input_fn: input fn for train or eval.
-      batch_axis: A python tuple of int values describing how each tensor
-        produced by the Estimator `input_fn` should be split across the TPU
-        compute shards.
-      ctx: A `_InternalTPUContext` instance with mode.
-
-    Raises:
-      ValueError: If both `sharded_features` and `num_cores` are `None`.
-    """
-    self._inputs_structure_recorder = _InputPipeline.InputsStructureRecorder(
-        ctx.input_partition_dims)
-
-    self._sharded_per_core = ctx.is_input_sharded_per_core()
-    self._input_fn = input_fn
-    self._infeed_queue = None
-    self._ctx = ctx
-    self._batch_axis = batch_axis
-
-  def generate_infeed_enqueue_ops_and_dequeue_fn(self):
-    """Generates infeed enqueue ops and dequeue_fn."""
-    # While tf.while_loop is called, the body function, which invokes
-    # `enqueue_fn` passed in, is called to construct the graph. So, input_fn
-    # structure is recorded.
-    enqueue_ops, all_hooks, run_infeed_loop_on_coordinator = (
-        self._invoke_input_fn_and_record_structure())
-
-    self._validate_input_pipeline()
-
-    def dequeue_fn():
-      """dequeue_fn is used by TPU to retrieve the tensors."""
-      # In the model-parallel case, both the host-side and device-side
-      # computations must agree on the core on which infeed takes place. We
-      # choose to perform infeed on logical core 0 of each replica.
-      values = self._infeed_queue.generate_dequeue_op(tpu_device=0)
-      # The unflatten process uses the structure information recorded above.
-      return self._inputs_structure_recorder.unflatten_features_and_labels(
-          values)
-
-    return (enqueue_ops, dequeue_fn, all_hooks, run_infeed_loop_on_coordinator)
-
-  def _invoke_input_fn_and_record_structure(self):
-    """Deploys the input pipeline and record input structure."""
-    enqueue_ops = []
-    infeed_queues = []
-    all_dataset_initializers = []
-    num_hosts = self._ctx.num_hosts
-    tpu_host_placement_fn = self._ctx.tpu_host_placement_function
-
-    run_infeed_loop_on_coordinator = True
-
-    if self._sharded_per_core:
-      # Per-Core input pipeline deployment.
-      # Invoke input pipeline for each core and placed on the corresponding
-      # host.
-      for host_id in range(num_hosts):
-        host_device = tpu_host_placement_fn(host_id=host_id)
-        with ops.device(host_device):
-          with ops.name_scope('input_pipeline_task%d' % (host_id)):
-            enqueue_ops_fn, captured_infeed_queue = (
-                generate_per_core_enqueue_ops_fn_for_host(
-                    self._ctx, self._input_fn, self._inputs_structure_recorder,
-                    host_device, host_id))
-
-            if _WRAP_INPUT_FN_INTO_WHILE_LOOP:
-              run_infeed_loop_on_coordinator = False
-              enqueue_ops.append(
-                  _wrap_computation_in_while_loop(
-                      device=host_device, op_fn=enqueue_ops_fn))
-            else:
-              enqueue_ops.append(enqueue_ops_fn())
-            # Infeed_queue_getter must be called after enqueue_ops_fn is called.
-            infeed_queues.append(captured_infeed_queue.get())
-
-    elif self._ctx.is_input_broadcast_with_iterators():
-      # Only calls input_fn in host 0.
-      host_device = tpu_host_placement_fn(host_id=0)
-      enqueue_ops_fn, captured_infeed_queue, dataset_initializer = (
-          generate_broadcast_enqueue_ops_fn(self._ctx, self._input_fn,
-                                            self._inputs_structure_recorder,
-                                            num_hosts))
-      if dataset_initializer:
-        all_dataset_initializers.append(dataset_initializer)
-        run_infeed_loop_on_coordinator = False
-        wrap_fn = (
-            _wrap_computation_in_while_loop
-            if self._ctx.mode != model_fn_lib.ModeKeys.PREDICT else
-            _wrap_computation_in_while_loop_with_stopping_signals)
-        enqueue_ops.append(wrap_fn(device=host_device, op_fn=enqueue_ops_fn))
-      else:
-        enqueue_ops.append(enqueue_ops_fn())
-      infeed_queues.append(captured_infeed_queue.get())
-    else:
-      for host_id in range(num_hosts):
-        host_device = tpu_host_placement_fn(host_id=host_id)
-        with ops.device(host_device):
-          with ops.name_scope('input_pipeline_task%d' % (host_id)):
-            if self._ctx.is_input_per_host_with_iterators():
-              enqueue_ops_fn, captured_infeed_queue, dataset_initializer = (
-                  generate_per_host_v2_enqueue_ops_fn_for_host(
-                      self._ctx, self._input_fn,
-                      self._inputs_structure_recorder, host_device, host_id))
-            else:
-              enqueue_ops_fn, captured_infeed_queue, dataset_initializer = (
-                  generate_per_host_enqueue_ops_fn_for_host(
-                      self._ctx, self._input_fn,
-                      self._inputs_structure_recorder, self._batch_axis,
-                      host_device, host_id))
-
-            # NOTE(xiejw): We dispatch here based on the return type of the
-            # users `input_fn`.
-            #
-            # 1. If input_fn returns a Dataset instance, we initialize the
-            # iterator outside of tf.while_loop, and call the iterator.get_next
-            # inside tf.while_loop.  This should be always safe.
-            #
-            # 2. If input_fn returns (features, labels), it is too late to wrap
-            # them inside tf.while_loop, as resource initialization cannot be
-            # handled in TF control flow properly. In this case, we will use
-            # python loop to enqueue the data into TPU system.  This may be
-            # slow compared to the previous case.
-            if dataset_initializer:
-              all_dataset_initializers.append(dataset_initializer)
-              run_infeed_loop_on_coordinator = False
-              wrap_fn = (
-                  _wrap_computation_in_while_loop
-                  if self._ctx.mode != model_fn_lib.ModeKeys.PREDICT else
-                  _wrap_computation_in_while_loop_with_stopping_signals)
-              enqueue_ops.append(
-                  wrap_fn(device=host_device, op_fn=enqueue_ops_fn))
-            else:
-              enqueue_ops.append(enqueue_ops_fn())
-            infeed_queues.append(captured_infeed_queue.get())
-    # infeed_queue is used to generate dequeue ops. The only thing it uses for
-    # dequeue is dtypes and types. So, any one can be used. Here, grab the
-    # first one.
-    self._infeed_queue = infeed_queues[0]
-    return enqueue_ops, [
-        util_lib.MultiHostDatasetInitializerHook(all_dataset_initializers)
-    ], run_infeed_loop_on_coordinator
-
-  def _validate_input_pipeline(self):
-    """Validates the input pipeline.
-
-    Perform some sanity checks to log user friendly information. We should
-    error out to give users better error message. But, if
-    _WRAP_INPUT_FN_INTO_WHILE_LOOP is False (legacy behavior), we cannot break
-    user code, so, log a warning.
-
-    Raises:
-      RuntimeError: If the validation failed.
-    """
-    if ops.get_default_graph().get_collection(ops.GraphKeys.QUEUE_RUNNERS):
-      err_msg = ('Input pipeline contains one or more QueueRunners. '
-                 'It could be slow and not scalable. Please consider '
-                 'converting your input pipeline to use `tf.data` instead (see '
-                 'https://www.tensorflow.org/guide/datasets for '
-                 'instructions.')
-      if _WRAP_INPUT_FN_INTO_WHILE_LOOP:
-        raise RuntimeError(err_msg)
-      else:
-        logging.warn(err_msg)
-
-
-class _ModelFnWrapper(object):
-  """A `model_fn` wrapper.
-
-  This makes calling model_fn on CPU and TPU easier and more consistent and
-  performs necessary check and mutation required by TPU training and evaluation.
-
-  In addition, this wrapper manages converting the `model_fn` to a single TPU
-  train and eval step.
-  """
-
-  def __init__(self, model_fn, config, params, ctx):
-    self._model_fn = model_fn
-    self._config = config
-    self._params = params
-    self._ctx = ctx
-
-  def call_without_tpu(self, features, labels, is_export_mode):
-    return self._call_model_fn(features, labels, is_export_mode=is_export_mode)
-
-  def convert_to_single_tpu_train_step(self, dequeue_fn):
-    """Converts user provided model_fn` as a single train step on TPU.
-
-    The user provided `model_fn` takes input tuple
-    (features, labels) and produces the EstimatorSpec with train_op and loss for
-    train `mode`. This usually represents a single train computation on CPU.
-
-    For TPU training, a train (computation) step is first wrapped in a
-    tf.while_loop control flow to repeat for many times and then replicated to
-    all TPU shards. Besides the input should be taken from TPU infeed rather
-    than input pipeline (input_fn) directly. To fit TPU loop and replicate
-    pattern, the original train computation should be reformed, which is the
-    returned `train_step`.
-
-    Args:
-      dequeue_fn: The function to retrieve inputs, features and labels, from TPU
-        infeed dequeue channel.
-
-    Returns:
-      A tuple of train_fn, host_calls, and captured scaffold_fn. The train_fn
-      representing the train step for TPU.
-    """
-
-    host_call = _OutfeedHostCall(self._ctx)
-    captured_scaffold_fn = _CapturedObject()
-    captured_training_hooks = _CapturedObject()
-
-    def train_step(loss):
-      """Training step function for use inside a while loop."""
-      del loss  # unused; required in function signature.
-      inputs = dequeue_fn()
-      features, labels = inputs.features_and_labels()
-
-      estimator_spec = self._verify_estimator_spec(
-          self._call_model_fn(features, labels))
-      loss, train_op = estimator_spec.loss, estimator_spec.train_op
-
-      if isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
-        captured_scaffold_fn.capture(estimator_spec.scaffold_fn)
-      else:
-        captured_scaffold_fn.capture(None)
-
-      captured_training_hooks.capture(estimator_spec.training_hooks)
-
-      tracing_ops = []
-      if tensor_tracer.TensorTracer.is_enabled():
-        tt = tensor_tracer.TensorTracer()
-        loss, tracing_ops = tt.trace_tpu(ops.get_default_graph(), loss,
-                                         self._ctx.num_replicas)
-
-      # We must run train_op to update the variables prior to running the
-      # outfeed.
-      with ops.control_dependencies([train_op]+tracing_ops):
-        host_call_outfeed_ops = []
-        if (isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec)  # pylint: disable=protected-access
-            and estimator_spec.host_call is not None):
-          host_call.record({'host_call': estimator_spec.host_call})
-          host_call_outfeed_ops = host_call.create_enqueue_op()
-        with ops.control_dependencies(host_call_outfeed_ops):
-          return array_ops.identity(loss)
-
-    return (train_step, host_call, captured_scaffold_fn,
-            captured_training_hooks)
-
-  def convert_to_single_tpu_eval_step(self, dequeue_fn):
-    """Converts user provided model_fn` as a single eval step on TPU.
-
-    Similar to training, the user provided `model_fn` takes input tuple
-    (features, labels) and produces the TPUEstimatorSpec with eval_metrics for
-    eval `mode`. This usually represents a single evaluation computation on CPU.
-
-    For TPU evaluation, a eval (computation) step is first wrapped in a
-    tf.while_loop control flow to repeat for many times and then replicated to
-    all TPU shards. Besides the input and output are slightly different. Input,
-    features and labels, should be taken from TPU infeed rather than input
-    pipeline (input_fn) directly. Output is managed in two stages.  First, the
-    model outputs as the result of evaluation computation, usually model logits,
-    should be transferred from TPU system to CPU. Then, all model outputs are
-    concatenated first on CPU and sent to the metric_fn for metrics computation.
-    To fit TPU evaluation pattern, the original eval computation should be
-    reformed, which is the returned `eval_step`.
-
-    Args:
-      dequeue_fn: The function to retrieve inputs, features and labels, from TPU
-        infeed dequeue channel.
-
-    Returns:
-      A tuple of eval_fn, host_calls, and captured scaffold_fn. The eval_fn
-      representing the eval step for TPU.
-    """
-    host_calls = _OutfeedHostCall(self._ctx)
-    captured_scaffold_fn = _CapturedObject()
-    captured_eval_hooks = _CapturedObject()
-
-    def eval_step(total_loss):
-      """Evaluation step function for use inside a while loop."""
-      inputs = dequeue_fn()
-      features, labels = inputs.features_and_labels()
-
-      tpu_estimator_spec = self._call_model_fn(features, labels)
-      if not isinstance(tpu_estimator_spec, model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
-        raise RuntimeError(
-            'estimator_spec used by TPU evaluation must have type'
-            '`TPUEstimatorSpec`. Got {}'.format(type(tpu_estimator_spec)))
-
-      loss = tpu_estimator_spec.loss
-      captured_scaffold_fn.capture(tpu_estimator_spec.scaffold_fn)
-      captured_eval_hooks.capture(tpu_estimator_spec.evaluation_hooks)
-
-      to_record = {}
-      if tpu_estimator_spec.eval_metrics:
-        to_record['eval_metrics'] = tpu_estimator_spec.eval_metrics
-      if tpu_estimator_spec.host_call is not None:
-        # We assume that evaluate won't update global step, so we don't wrap
-        # this host_call.
-        to_record['host_call'] = tpu_estimator_spec.host_call
-      host_calls.record(to_record)
-
-      with ops.control_dependencies(host_calls.create_enqueue_op()):
-        return math_ops.add(total_loss, loss)
-
-    return eval_step, host_calls, captured_scaffold_fn, captured_eval_hooks
-
-  def convert_to_single_tpu_predict_step(self, dequeue_fn):
-    """Converts user provided model_fn` as a single predict step on TPU.
-
-    Args:
-      dequeue_fn: The function to retrieve inputs, features and labels, from TPU
-        infeed dequeue channel.
-
-    Returns:
-      A tuple of predict_fn, host_calls, and captured scaffold_fn. The
-      predict_fn representing the predict step for TPU.
-    """
-    host_calls = _OutfeedHostCall(self._ctx)
-    captured_scaffold_fn = _CapturedObject()
-    captured_predict_hooks = _CapturedObject()
-
-    def predict_step(unused_scalar_stopping_signal):
-      """Evaluation step function for use inside a while loop."""
-      inputs = dequeue_fn()
-      features, labels = inputs.features_and_labels()
-      stopping_signals = inputs.signals()
-
-      assert stopping_signals is not None, (
-          'Internal Error: `signals` is missing.')
-
-      tpu_estimator_spec = self._call_model_fn(
-          features, labels, is_export_mode=False)
-      if not isinstance(tpu_estimator_spec, model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
-        raise RuntimeError(
-            'estimator_spec used by TPU prediction must have type'
-            '`TPUEstimatorSpec`. Got {}'.format(type(tpu_estimator_spec)))
-
-      self._verify_tpu_spec_predictions(tpu_estimator_spec.predictions)
-
-      captured_scaffold_fn.capture(tpu_estimator_spec.scaffold_fn)
-      captured_predict_hooks.capture(tpu_estimator_spec.prediction_hooks)
-      to_record = {}
-      identity_fn = lambda **kwargs: kwargs
-      to_record['predictions'] = [identity_fn, tpu_estimator_spec.predictions]
-      to_record['signals'] = [identity_fn, stopping_signals]
-      if tpu_estimator_spec.host_call is not None:
-        to_record['host_call'] = tpu_estimator_spec.host_call
-      host_calls.record(to_record)
-
-      with ops.control_dependencies(host_calls.create_enqueue_op()):
-        return _StopSignals.as_scalar_stopping_signal(stopping_signals)
-
-    return (predict_step, host_calls, captured_scaffold_fn,
-            captured_predict_hooks)
-
-  def _verify_tpu_spec_predictions(self, predictions):
-    """Validates TPUEstimatorSpec.predictions dict."""
-    # TODO(xiejw): Adds validation for prediction dictionrary.
-    # TODO(xiejw): Adds support for single tensor as predictions.
-    if not isinstance(predictions, dict):
-      raise TypeError('TPUEstimatorSpec.predictions must be dict of Tensors.')
-
-    for (key, tensor) in predictions.items():
-      if tensor.shape.dims[0].value is None:
-        raise ValueError(
-            'The tensor with key ({}) in TPUEstimatorSpec.predictions has '
-            'dynamic shape (should be static). Tensor: {}'.format(key, tensor))
-    return predictions
-
-  def _validate_model_features_and_labels(self, features, labels,
-                                          is_export_mode):
-    """Validates that the features and labels for the model function are valid.
-
-    A valid features/labels object is the one with:
-    - Type: A tensor or any nested structure of tensors supported by TF nest,
-        namely nested dictionary, tuple, namedtuple, or sequence of tensors.
-    - Static shape if is_export_mode is False.
-
-    Args:
-      features: the features that would be input to the model function.
-      labels: the labels that would be input to the model function.
-      is_export_mode: boolean value specifying if in export mode.
-
-    Raises:
-      TypeError: If features/labels are not of the correct type.
-      ValueError: If features/labels have dynamic shape.
-    """
-
-    def validate(obj, obj_name):
-      """Helper validate function."""
-      if is_export_mode or self._ctx.is_running_on_cpu(is_export_mode):
-        return
-      if isinstance(obj, ops.Tensor):
-        if not obj.get_shape().is_fully_defined():
-          raise ValueError(
-              'The {} to the model returned by input_fn must have static shape.'
-              ' Tensor: {}'.format(obj_name, obj))
-      else:
-        for tensor in data_nest.flatten(obj):
-          if not tensor.get_shape().is_fully_defined():
-            raise ValueError(
-                ('The {} to the model returned by input_fn must have static '
-                 'shape. Tensor: {}').format(obj_name, tensor))
-
-    validate(features, 'features')
-    if labels is not None:
-      validate(labels, 'labels')
-
-  def _call_model_fn(self, features, labels, is_export_mode=False):
-    """Calls the model_fn with required parameters."""
-    self._validate_model_features_and_labels(features, labels, is_export_mode)
-    model_fn_args = function_utils.fn_args(self._model_fn)
-    kwargs = {}
-
-    # Makes deep copy with `config` and params` in case user mutates them.
-    config = copy.deepcopy(self._config)
-    params = copy.deepcopy(self._params)
-
-    if 'labels' in model_fn_args:
-      kwargs['labels'] = labels
-    elif labels is not None:
-      raise ValueError(
-          'model_fn does not take labels, but input_fn returns labels.')
-    if 'mode' in model_fn_args:
-      kwargs['mode'] = self._ctx.mode
-    if 'config' in model_fn_args:
-      kwargs['config'] = config
-    if 'params' in model_fn_args:
-      kwargs['params'] = params
-
-    if 'params' not in model_fn_args:
-      raise ValueError('model_fn ({}) does not include params argument, '
-                       'required by TPUEstimator to pass batch size as '
-                       'params[\'batch_size\']'.format(self._model_fn))
-
-    if is_export_mode:
-      batch_size_for_model_fn = None
-    else:
-      batch_size_for_model_fn = self._ctx.batch_size_for_model_fn
-
-    if batch_size_for_model_fn is not None:
-      _add_item_to_params(params, _BATCH_SIZE_KEY, batch_size_for_model_fn)
-
-    running_on_cpu = self._ctx.is_running_on_cpu(is_export_mode)
-    _add_item_to_params(params, _USE_TPU_KEY, not running_on_cpu)
-
-    if not running_on_cpu:
-      user_context = tpu_context.TPUContext(
-          internal_ctx=self._ctx, call_from_input_fn=False)
-      _add_item_to_params(params, _CTX_KEY, user_context)
-
-    estimator_spec = self._model_fn(features=features, **kwargs)
-    if (running_on_cpu and
-        isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec)):  # pylint: disable=protected-access
-      # The estimator_spec will be passed to `Estimator` directly, which expects
-      # type `EstimatorSpec`.
-      return estimator_spec.as_estimator_spec()
-    else:
-      return estimator_spec
-
-  def _verify_estimator_spec(self, estimator_spec):
-    """Validates the estimator_spec."""
-    if isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
-      return estimator_spec
-
-    err_msg = '{} returned by EstimatorSpec is not supported in TPUEstimator.'
-    if estimator_spec.training_chief_hooks:
-      raise ValueError(
-          err_msg.format('training_chief_hooks') + 'If you want' +
-          ' to pass training hooks, please pass via training_hooks.')
-
-    if estimator_spec.scaffold:
-      logging.warning('EstimatorSpec.Scaffold is ignored by TPU train/eval. '
-                      'Please use TPUEstimatorSpec.')
-    return estimator_spec
-
-
-class _OutfeedHostCall(object):
-  """Support for `eval_metrics` and `host_call` in TPUEstimatorSpec."""
-
-  def __init__(self, ctx):
-    self._ctx = ctx
-    self._names = []
-    # All of these are dictionaries of lists keyed on the name.
-    self._host_fns = {}
-    self._tensor_keys = collections.defaultdict(list)
-    self._tensors = collections.defaultdict(list)
-    self._tensor_dtypes = collections.defaultdict(list)
-    self._tensor_shapes = collections.defaultdict(list)
-
-  @staticmethod
-  def validate(host_calls):
-    """Validates the `eval_metrics` and `host_call` in `TPUEstimatorSpec`."""
-
-    for name, host_call in host_calls.items():
-      if not isinstance(host_call, (tuple, list)):
-        raise ValueError('{} should be tuple or list'.format(name))
-      if len(host_call) != 2:
-        raise ValueError('{} should have two elements.'.format(name))
-      if not callable(host_call[0]):
-        raise TypeError('{}[0] should be callable.'.format(name))
-      if not isinstance(host_call[1], (tuple, list, dict)):
-        raise ValueError('{}[1] should be tuple or list, or dict.'.format(name))
-
-      if isinstance(host_call[1], (tuple, list)):
-        fullargspec = tf_inspect.getfullargspec(host_call[0])
-        fn_args = function_utils.fn_args(host_call[0])
-        # wrapped_hostcall_with_global_step uses varargs, so we allow that.
-        if fullargspec.varargs is None and len(host_call[1]) != len(fn_args):
-          raise RuntimeError(
-              'In TPUEstimatorSpec.{}, length of tensors {} does not match '
-              'method args of the function, which takes {}.'.format(
-                  name, len(host_call[1]), len(fn_args)))
-
-  @staticmethod
-  def create_cpu_hostcall(host_calls):
-    """Runs on the host_call on CPU instead of TPU when use_tpu=False."""
-
-    _OutfeedHostCall.validate(host_calls)
-    ret = {}
-    for name, host_call in host_calls.items():
-      host_fn, tensors = host_call
-      if isinstance(tensors, (tuple, list)):
-        ret[name] = host_fn(*tensors)
-      else:
-        # Must be dict.
-        try:
-          ret[name] = host_fn(**tensors)
-        except TypeError as e:
-          logging.warning(
-              'Exception while calling %s: %s. It is likely the tensors '
-              '(%s[1]) do not match the '
-              'function\'s arguments', name, e, name)
-          raise e
-    return ret
-
-  def record(self, host_calls):
-    """Records the host_call structure."""
-
-    for name, host_call in host_calls.items():
-      host_fn, tensor_list_or_dict = host_call
-      self._names.append(name)
-      self._host_fns[name] = host_fn
-
-      if isinstance(tensor_list_or_dict, dict):
-        for (key, tensor) in six.iteritems(tensor_list_or_dict):
-          self._tensor_keys[name].append(key)
-          self._tensors[name].append(tensor)
-          self._tensor_dtypes[name].append(tensor.dtype)
-          self._tensor_shapes[name].append(tensor.shape)
-      else:
-        # List or tuple.
-        self._tensor_keys[name] = None
-        for tensor in tensor_list_or_dict:
-          self._tensors[name].append(tensor)
-          self._tensor_dtypes[name].append(tensor.dtype)
-          self._tensor_shapes[name].append(tensor.shape)
-
-  def create_enqueue_op(self):
-    """Create the op to enqueue the recorded host_calls.
-
-    Returns:
-      A list of enqueue ops, which is empty if there are no host calls.
-    """
-    if not self._names:
-      return []
-
-    tensors = []
-    # TODO(jhseu): Consider deduping tensors.
-    for name in self._names:
-      tensors.extend(self._tensors[name])
-
-    with ops.device(tpu.core(0)):
-      return [tpu_ops.outfeed_enqueue_tuple(tensors)]
-
-  def create_tpu_hostcall(self):
-    """Sends the tensors through outfeed and runs the host_fn on CPU.
-
-    The tensors are concatenated along dimension 0 to form a global tensor
-    across all shards. The concatenated function is passed to the host_fn and
-    executed on the first host.
-
-    Returns:
-      A dictionary mapping name to the return type of the host_call by that
-      name.
-
-    Raises:
-      RuntimeError: If outfeed tensor is scalar.
-    """
-    if not self._names:
-      return {}
-
-    ret = {}
-    # For each i, dequeue_ops[i] is a list containing the tensors from all
-    # shards. This list is concatenated later.
-    dequeue_ops = []
-    tensor_dtypes = []
-    tensor_shapes = []
-    for name in self._names:
-      for _ in self._tensors[name]:
-        dequeue_ops.append([])
-      for dtype in self._tensor_dtypes[name]:
-        tensor_dtypes.append(dtype)
-      for shape in self._tensor_shapes[name]:
-        tensor_shapes.append(shape)
-
-    # Outfeed ops execute on each replica's first logical core. Note: we must
-    # constraint it such that we have at most one outfeed dequeue and enqueue
-    # per replica.
-    for i in xrange(self._ctx.num_replicas):
-      host_device, ordinal_id = self._ctx.device_for_replica(i)
-      with ops.device(host_device):
-        outfeed_tensors = tpu_ops.outfeed_dequeue_tuple(
-            dtypes=tensor_dtypes,
-            shapes=tensor_shapes,
-            device_ordinal=ordinal_id)
-        for j, item in enumerate(outfeed_tensors):
-          dequeue_ops[j].append(item)
-
-    # Deconstruct dequeue ops.
-    dequeue_ops_by_name = {}
-    pos = 0
-    for name in self._names:
-      dequeue_ops_by_name[name] = dequeue_ops[pos:pos +
-                                              len(self._tensors[name])]
-      pos += len(self._tensors[name])
-
-    # It is assumed evaluation always happens on single host TPU system. So,
-    # place all ops on tpu host if possible.
-    #
-    # TODO(jhseu): Evaluate whether this is right for summaries.
-    with ops.device(self._ctx.tpu_host_placement_function(replica_id=0)):
-      for name in self._names:
-        dequeue_ops = dequeue_ops_by_name[name]
-        for i, item in enumerate(dequeue_ops):
-          if dequeue_ops[i][0].shape.ndims == 0:
-            raise RuntimeError(
-                'All tensors outfed from TPU should preserve batch size '
-                'dimension, but got scalar {}'.format(dequeue_ops[i][0]))
-          # TODO(xiejw): Allow users to specify the axis for batch size
-          # dimension.
-          dequeue_ops[i] = array_ops.concat(dequeue_ops[i], axis=0)
-
-        if self._tensor_keys[name] is not None:
-          # The user-provided eval_metrics[1] is a dict.
-          dequeue_ops = dict(zip(self._tensor_keys[name], dequeue_ops))
-          try:
-            ret[name] = self._host_fns[name](**dequeue_ops)
-          except TypeError as e:
-            logging.warning(
-                'Exception while calling %s: %s. It is likely the tensors '
-                '(%s[1]) do not match the '
-                'function\'s arguments', name, e, name)
-            raise e
-        else:
-          ret[name] = self._host_fns[name](*dequeue_ops)
-
-    return ret
-
-
-class _OutfeedHostCallHook(session_run_hook.SessionRunHook):
-  """Hook to run host calls when use_tpu=False."""
-
-  def __init__(self, tensors):
-    self._tensors = tensors
-
-  def begin(self):
-    # We duplicate this code from the TPUInfeedOutfeedSessionHook rather than
-    # create a separate hook to guarantee execution order, because summaries
-    # need to be initialized before the outfeed thread starts.
-    # TODO(jhseu): Make a wrapper hook instead?
-    self._init_ops = contrib_summary.summary_writer_initializer_op()
-    # Get all the writer resources from the initializer, so we know what to
-    # flush.
-    self._finalize_ops = []
-    for op in self._init_ops:
-      self._finalize_ops.append(contrib_summary.flush(writer=op.inputs[0]))
-
-  def after_create_session(self, session, coord):
-    session.run(self._init_ops)
-
-  def before_run(self, run_context):
-    return basic_session_run_hooks.SessionRunArgs(self._tensors)
-
-  def end(self, session):
-    session.run(self._finalize_ops)
-
-
-class ExamplesPerSecondHook(basic_session_run_hooks.StepCounterHook):
-  """Calculate and report global_step/sec and examples/sec during runtime."""
-
-  def __init__(self,
-               batch_size,
-               every_n_steps=100,
-               every_n_secs=None,
-               output_dir=None,
-               summary_writer=None):
-    self._batch_size = batch_size
-    super(ExamplesPerSecondHook, self).__init__(
-        every_n_steps=every_n_steps,
-        every_n_secs=every_n_secs,
-        output_dir=output_dir,
-        summary_writer=summary_writer)
-
-  def _log_and_record(self, elapsed_steps, elapsed_time, global_step):
-    global_step_per_sec = elapsed_steps / elapsed_time
-    examples_per_sec = self._batch_size * global_step_per_sec
-    if self._summary_writer is not None:
-      global_step_summary = Summary(value=[
-          Summary.Value(tag='global_step/sec', simple_value=global_step_per_sec)
-      ])
-      example_summary = Summary(value=[
-          Summary.Value(tag='examples/sec', simple_value=examples_per_sec)
-      ])
-      self._summary_writer.add_summary(global_step_summary, global_step)
-      self._summary_writer.add_summary(example_summary, global_step)
-    logging.info('global_step/sec: %g', global_step_per_sec)
-    logging.info('examples/sec: %g', examples_per_sec)
-
-
-class InstallSignalHandlerHook(session_run_hook.SessionRunHook):
-  """Change SIGINT (CTRL^C) handler to force quit the process.
-
-  The default behavior often results in hanging processes.
-  The original handler is restored after training/evaluation.
-  """
-
-  def __init__(self):
-    self._signal_fn = signal.getsignal(signal.SIGINT)
-
-  def before_run(self, run_context):
-    signal.signal(signal.SIGINT, signal.SIG_DFL)
-
-  def end(self, session):
-    signal.signal(signal.SIGINT, self._signal_fn)
-
-
-class TPUEstimator(estimator_lib.Estimator):
-  """Estimator with TPU support.
-
-  TPUEstimator also supports training on CPU and GPU. You don't need to define
-  a separate `tf.estimator.Estimator`.
-
-  TPUEstimator handles many of the details of running on TPU devices, such as
-  replicating inputs and models for each core, and returning to host
-  periodically to run hooks.
-
-  TPUEstimator transforms a global batch size in params to a per-shard batch
-  size when calling the `input_fn` and `model_fn`. Users should specify
-  global batch size in constructor, and then get the batch size for each shard
-  in `input_fn` and `model_fn` by `params['batch_size']`.
-
-  - For training, `model_fn` gets per-core batch size; `input_fn` may get
-    per-core or per-host batch size depending on `per_host_input_for_training`
-    in `TPUConfig` (See docstring for TPUConfig for details).
-
-  - For evaluation and prediction, `model_fn` gets per-core batch size and
-    `input_fn` get per-host batch size.
-
-  Evaluation
-  ==========
-
-  `model_fn` should return `TPUEstimatorSpec`, which expects the `eval_metrics`
-  for TPU evaluation. However, if eval_on_tpu is False, `model_fn` must return
-  `EstimatorSpec` and the evaluation will execute on CPU or GPU; in this case
-  the following discussion on TPU evaluation does not apply.
-
-  `TPUEstimatorSpec.eval_metrics` is a tuple of `metric_fn` and `tensors`, where
-  `tensors` could be a list of any nested structure of `Tensor`s (See
-  `TPUEstimatorSpec` for details).  `metric_fn` takes the `tensors` and returns
-  a dict from metric string name to the result of calling a metric function,
-  namely a `(metric_tensor, update_op)` tuple.
-
-  One can set `use_tpu` to `False` for testing. All training, evaluation, and
-  predict will be executed on CPU. `input_fn` and `model_fn` will receive
-  `train_batch_size` or `eval_batch_size` unmodified as `params['batch_size']`.
-
-  Current limitations:
-  --------------------
-
-  1. TPU evaluation only works on a single host (one TPU worker) except
-     BROADCAST mode.
-
-  2. `input_fn` for evaluation should **NOT** raise an end-of-input exception
-     (`OutOfRangeError` or `StopIteration`). And all evaluation steps and all
-     batches should have the same size.
-
-  Example (MNIST):
-  ----------------
-
-  ```
-  # The metric Fn which runs on CPU.
-  def metric_fn(labels, logits):
-    predictions = tf.argmax(logits, 1)
-    return {
-      'accuracy': tf.metrics.precision(
-          labels=labels, predictions=predictions),
-    }
-
-  # Your model Fn which runs on TPU (eval_metrics is list in this example)
-  def model_fn(features, labels, mode, config, params):
-    ...
-    logits = ...
-
-    if mode = tf.estimator.ModeKeys.EVAL:
-      return tpu_estimator.TPUEstimatorSpec(
-          mode=mode,
-          loss=loss,
-          eval_metrics=(metric_fn, [labels, logits]))
-
-  # or specify the eval_metrics tensors as dict.
-  def model_fn(features, labels, mode, config, params):
-    ...
-    final_layer_output = ...
-
-    if mode = tf.estimator.ModeKeys.EVAL:
-      return tpu_estimator.TPUEstimatorSpec(
-          mode=mode,
-          loss=loss,
-          eval_metrics=(metric_fn, {
-              'labels': labels,
-              'logits': final_layer_output,
-          }))
-  ```
-
-  Prediction
-  ==========
-
-  Prediction on TPU is an experimental feature to support large batch inference.
-  It is not designed for latency-critical system. In addition, due to some
-  usability issues, for prediction with small dataset, CPU `.predict`, i.e.,
-  creating a new `TPUEstimator` instance with `use_tpu=False`, might be more
-  convenient.
-
-  Note: In contrast to TPU training/evaluation, the `input_fn` for prediction
-  *should* raise an end-of-input exception (`OutOfRangeError` or
-  `StopIteration`), which serves as the stopping signal to `TPUEstimator`. To be
-  precise, the ops created by `input_fn` produce one batch of the data.
-  The `predict()` API processes one batch at a time. When reaching the end of
-  the data source, an end-of-input exception should be raised by one of these
-  operations. The user usually does not need to do this manually. As long as the
-  dataset is not repeated forever, the `tf.data` API will raise an end-of-input
-  exception automatically after the last batch has been produced.
-
-  Note: Estimator.predict returns a Python generator. Please consume all the
-  data from the generator so that TPUEstimator can shutdown the TPU system
-  properly for user.
-
-  Current limitations:
-  --------------------
-  1. TPU prediction only works on a single host (one TPU worker).
-
-  2. `input_fn` must return a `Dataset` instance rather than `features`. In
-  fact, .train() and .evaluate() also support Dataset as return value.
-
-  Example (MNIST):
-  ----------------
-  ```
-  height = 32
-  width = 32
-  total_examples = 100
-
-  def predict_input_fn(params):
-    batch_size = params['batch_size']
-
-    images = tf.random_uniform(
-        [total_examples, height, width, 3], minval=-1, maxval=1)
-
-    dataset = tf.data.Dataset.from_tensor_slices(images)
-    dataset = dataset.map(lambda images: {'image': images})
-
-    dataset = dataset.batch(batch_size)
-    return dataset
-
-  def model_fn(features, labels, params, mode):
-     # Generate predictions, called 'output', from features['image']
-
-    if mode == tf.estimator.ModeKeys.PREDICT:
-      return tf.contrib.tpu.TPUEstimatorSpec(
-          mode=mode,
-          predictions={
-              'predictions': output,
-              'is_padding': features['is_padding']
-          })
-
-  tpu_est = TPUEstimator(
-      model_fn=model_fn,
-      ...,
-      predict_batch_size=16)
-
-  # Fully consume the generator so that TPUEstimator can shutdown the TPU
-  # system.
-  for item in tpu_est.predict(input_fn=input_fn):
-    # Filter out item if the `is_padding` is 1.
-    # Process the 'predictions'
-  ```
-
-  Exporting
-  =========
-
-  `export_savedmodel` exports 2 metagraphs, one with `tag_constants.SERVING`,
-  and another with `tag_constants.SERVING` and `tag_constants.TPU`.
-  At serving time, these tags are used to select metagraph to load.
-
-  Before running the graph on TPU, TPU system needs to be initialized. If
-  TensorFlow Serving model-server is used, this is done automatically. If
-  not, please call `session.run(tpu.initialize_system())`.
-
-  `tpu.outside_compilation` can be used to wrap TPU incompatible ops in
-  `model_fn`.
-
-  Example:
-  ----------------
-
-  ```
-  def model_fn(features, labels, mode, config, params):
-    ...
-    logits = ...
-    export_outputs = {
-      'logits': export_output_lib.PredictOutput(
-        {'logits': logits})
-    }
-
-    def host_call(logits):
-      class_ids = math_ops.argmax(logits)
-      classes = string_ops.as_string(class_ids)
-      export_outputs['classes'] =
-        export_output_lib.ClassificationOutput(classes=classes)
-
-    tpu.outside_compilation(host_call, logits)
-
-    ...
-  ```
-
-  """
-
-  def __init__(self,
-               model_fn=None,
-               model_dir=None,
-               config=None,
-               params=None,
-               use_tpu=True,
-               train_batch_size=None,
-               eval_batch_size=None,
-               predict_batch_size=None,
-               batch_axis=None,
-               eval_on_tpu=True,
-               export_to_tpu=True,
-               warm_start_from=None):
-    """Constructs an `TPUEstimator` instance.
-
-    Args:
-      model_fn: Model function as required by `Estimator` which returns
-        EstimatorSpec or TPUEstimatorSpec. `training_hooks`, 'evaluation_hooks',
-        and `prediction_hooks` must not capure any TPU Tensor inside the
-        model_fn.
-      model_dir: Directory to save model parameters, graph and etc. This can
-        also be used to load checkpoints from the directory into a estimator to
-        continue training a previously saved model. If `None`, the model_dir in
-        `config` will be used if set. If both are set, they must be same. If
-        both are `None`, a temporary directory will be used.
-      config: An `tpu_config.RunConfig` configuration object. Cannot be `None`.
-      params: An optional `dict` of hyper parameters that will be passed into
-        `input_fn` and `model_fn`.  Keys are names of parameters, values are
-        basic python types. There are reserved keys for `TPUEstimator`,
-        including 'batch_size'.
-      use_tpu: A bool indicating whether TPU support is enabled. Currently, -
-        TPU training and evaluation respect this bit, but eval_on_tpu can
-        override execution of eval. See below. - Predict still happens on CPU.
-      train_batch_size: An int representing the global training batch size.
-        TPUEstimator transforms this global batch size to a per-shard batch
-        size, as params['batch_size'], when calling `input_fn` and `model_fn`.
-        Cannot be `None` if `use_tpu` is `True`. Must be divisible by total
-        number of replicas.
-      eval_batch_size: An int representing evaluation batch size. Must be
-        divisible by total number of replicas.
-      predict_batch_size: An int representing the prediction batch size. Must be
-        divisible by total number of replicas.
-      batch_axis: A python tuple of int values describing how each tensor
-        produced by the Estimator `input_fn` should be split across the TPU
-        compute shards. For example, if your input_fn produced (images, labels)
-        where the images tensor is in `HWCN` format, your shard dimensions would
-        be [3, 0], where 3 corresponds to the `N` dimension of your images
-        Tensor, and 0 corresponds to the dimension along which to split the
-        labels to match up with the corresponding images. If None is supplied,
-        and per_host_input_for_training is True, batches will be sharded based
-        on the major dimension. If tpu_config.per_host_input_for_training is
-        False or `PER_HOST_V2`, batch_axis is ignored.
-      eval_on_tpu: If False, evaluation runs on CPU or GPU. In this case, the
-        model_fn must return `EstimatorSpec` when called with `mode` as `EVAL`.
-      export_to_tpu: If True, `export_savedmodel()` exports a metagraph for
-        serving on TPU besides the one on CPU.
-      warm_start_from: Optional string filepath to a checkpoint or SavedModel to
-        warm-start from, or a `tf.estimator.WarmStartSettings` object to fully
-        configure warm-starting.  If the string filepath is provided instead of
-        a `WarmStartSettings`, then all variables are warm-started, and it is
-        assumed that vocabularies and Tensor names are unchanged.
-
-    Raises:
-      ValueError: `params` has reserved keys already.
-    """
-    if config is None or not isinstance(config, tpu_config.RunConfig):
-      raise ValueError(
-          '`config` must be provided with type `tpu_config.RunConfig`')
-
-    if params is not None and any(k in params for k in _RESERVED_PARAMS_KEYS):
-      raise ValueError('{} are reserved keys but existed in params {}.'.format(
-          _RESERVED_PARAMS_KEYS, params))
-
-    if use_tpu:
-      # Perform some very basic validations. More validations will be found in
-      # _InternalTPUContext.
-      if train_batch_size is None:
-        raise ValueError('`train_batch_size` cannot be `None`')
-      util_lib.check_positive_integer(train_batch_size, 'train_batch_size')
-
-      if (config.tpu_config.per_host_input_for_training is
-          tpu_config.InputPipelineConfig.PER_SHARD_V1 and
-          config.tpu_config.num_cores_per_replica):
-        raise ValueError(
-            'Model parallelism only supports per host input for training. '
-            'Please adjust TPURunconfig.per_host_input_for_training.')
-
-      if eval_batch_size is not None:
-        util_lib.check_positive_integer(eval_batch_size, 'eval_batch_size')
-
-      if predict_batch_size is not None:
-        util_lib.check_positive_integer(predict_batch_size,
-                                        'predict_batch_size')
-
-    # Verifies the model_fn signature according to Estimator framework.
-    estimator_lib._verify_model_fn_args(model_fn, params)  # pylint: disable=protected-access
-    # We cannot store config and params in this constructor as parent
-    # constructor might change them, such as assigning a temp dir for
-    # config.model_dir.
-    model_function = self._augment_model_fn(model_fn, batch_axis)
-
-    # Overwrite log_step_count_steps to disable TensorLoggingHook and
-    # StepCounterHook from being created in Estimator. TPUEstimator already
-    # added equivalent hooks in _augment_model_fn above.
-    self._log_every_n_steps = config.log_step_count_steps
-    config = config.replace(log_step_count_steps=None)
-
-    # Passing non-None params as wrapped model_fn has it.
-    params = params or {}
-    super(TPUEstimator, self).__init__(
-        model_fn=model_function,
-        model_dir=model_dir,
-        config=config,
-        params=params,
-        warm_start_from=warm_start_from)
-    self._iterations_per_training_loop = (
-        self._config.tpu_config.iterations_per_loop)
-
-    # All properties passed to _InternalTPUContext are immutable.
-    # pylint: disable=protected-access
-    self._ctx = tpu_context._get_tpu_context(
-        self._config, train_batch_size, eval_batch_size, predict_batch_size,
-        use_tpu, eval_on_tpu)
-
-    self._export_to_tpu = export_to_tpu
-
-    self._is_input_fn_invoked = None
-    self._rendezvous = {}
-
-  def _add_meta_graph_for_mode(self,
-                               builder,
-                               input_receiver_fn_map,
-                               checkpoint_path,
-                               save_variables=True,
-                               mode=model_fn_lib.ModeKeys.PREDICT,
-                               export_tags=None,
-                               check_variables=True):
-    if self._export_to_tpu and mode != model_fn_lib.ModeKeys.PREDICT:
-      raise NotImplementedError(
-          'TPUEstimator only handles mode PREDICT for exporting '
-          'when `export_to_tpu` is `True`; '
-          'got {}.'.format(mode))
-
-    (super(TPUEstimator, self)._add_meta_graph_for_mode(
-        builder,
-        input_receiver_fn_map,
-        checkpoint_path,
-        save_variables,
-        mode=mode,
-        export_tags=export_tags,
-        check_variables=check_variables))
-
-    if self._export_to_tpu:
-      input_receiver_fn_map = {
-          _REWRITE_FOR_INFERENCE_MODE: input_receiver_fn_map[mode]
-      }
-      export_tags = [tag_constants.SERVING, tag_constants.TPU]
-      mode = _REWRITE_FOR_INFERENCE_MODE
-      # See b/110052256 for why `check_variables` is `False`.
-      (super(TPUEstimator, self)._add_meta_graph_for_mode(
-          builder,
-          input_receiver_fn_map,
-          checkpoint_path,
-          save_variables=False,
-          mode=mode,
-          export_tags=export_tags,
-          check_variables=False))
-
-  def _call_model_fn(self, features, labels, mode, config):
-    if mode == _REWRITE_FOR_INFERENCE_MODE:
-      return self._call_model_fn_for_inference(features, labels, mode, config)
-    else:
-      return super(TPUEstimator, self)._call_model_fn(features, labels, mode,
-                                                      config)
-
-  def _call_model_fn_for_inference(self, features, labels, mode, config):
-    """Wraps `_call_model_fn` for `export_savedmodel`."""
-    if mode != _REWRITE_FOR_INFERENCE_MODE:
-      raise ValueError('mode must be {}; '
-                       'got {}.'.format(_REWRITE_FOR_INFERENCE_MODE, mode))
-
-    capture = _CapturedObject()
-
-    def computation():
-      """Compute tpu tensors used in export_outputs.
-
-      Passed to rewrite so that model_fn will be called under
-      the rewriting contexts. Only tpu tensors are returned, but export_outputs
-      and scaffold are captured.
-
-      Returns:
-         A list of Tensors used in export_outputs and not marked for
-         outside_compilation.
-      """
-      # We should only call model fn once and it should be inside `computation`
-      # so that building the graph will happen under `rewrite`.
-      mode = model_fn_lib.ModeKeys.PREDICT
-      estimator_spec = self._call_model_fn(features, labels, mode, config)
-
-      # We pick the TPU tensors out from `export_output` and later return them
-      # from `computation` for rewriting.
-      tensors_dict = collections.OrderedDict(
-          (k, _export_output_to_tensors(v))
-          for k, v in six.iteritems(estimator_spec.export_outputs))
-      tensors = nest.flatten(tensors_dict)
-      tpu_tensors = [t for t in tensors if _is_tpu_tensor(t)]
-
-      # We cannot return anything other than `tpu_tensors` here so we capture
-      # the rest for later use.
-      capture.capture((estimator_spec, tensors_dict, tensors))
-      return tpu_tensors
-
-    tpu_tensors_on_cpu = tpu.rewrite(computation)
-    estimator_spec, tensors_dict, tensors = capture.get()
-
-    # Reconstruct `tensors`, but with `tpu_tensors` replaced with
-    # `tpu_tensors_on_cpu`.
-    new_tensors = []
-    for t in tensors:
-      if _is_tpu_tensor(t):
-        new_tensors.append(tpu_tensors_on_cpu.pop(0))
-      elif t is None:
-        new_tensors.append(None)
-      else:
-        # Only fetching `tpu_tensors_on_cpu` does not trigger
-        # TPU computation and blocks, so we add the control dependency here.
-        control_inputs = (
-            tpu_tensors_on_cpu if _is_iterable(tpu_tensors_on_cpu) else
-            (tpu_tensors_on_cpu,))
-        with ops.control_dependencies(control_inputs):
-          new_tensors.append(array_ops.identity(t))
-
-    # Reconstruct `tensors_dict`.
-    new_tensors_dict = nest.pack_sequence_as(tensors_dict, new_tensors)
-    # Reconstruct `export_outputs`.
-    export_outputs = estimator_spec.export_outputs
-    new_export_outputs = collections.OrderedDict(
-        (k, _clone_export_output_with_tensors(export_outputs[k], v))
-        for k, v in six.iteritems(new_tensors_dict))
-
-    return estimator_spec._replace(export_outputs=new_export_outputs)
-
-  def _create_global_step(self, graph):
-    """Creates a global step suitable for TPUs.
-
-    Args:
-      graph: The graph in which to create the global step.
-
-    Returns:
-      A global step `Tensor`.
-
-    Raises:
-      ValueError: if the global step tensor is already defined.
-    """
-    return _create_global_step(graph)
-
-  def _convert_train_steps_to_hooks(self, steps, max_steps):
-    with self._ctx.with_mode(model_fn_lib.ModeKeys.TRAIN) as ctx:
-      if ctx.is_running_on_cpu():
-        return super(TPUEstimator, self)._convert_train_steps_to_hooks(
-            steps, max_steps)
-
-    # On TPU.
-    if steps is None and max_steps is None:
-      raise ValueError(
-          'For TPU training, one of `steps` or `max_steps` must be set. '
-          'Cannot be both `None`.')
-
-    # Estimator.train has explicit positiveness check.
-    if steps is not None:
-      util_lib.check_positive_integer(steps, 'Train steps')
-    if max_steps is not None:
-      util_lib.check_positive_integer(max_steps, 'Train max_steps')
-
-    return [
-        _TPUStopAtStepHook(self._iterations_per_training_loop, steps, max_steps)
-    ]
-
-  def _convert_eval_steps_to_hooks(self, steps):
-    with self._ctx.with_mode(model_fn_lib.ModeKeys.EVAL) as ctx:
-      if ctx.is_running_on_cpu():
-        return super(TPUEstimator, self)._convert_eval_steps_to_hooks(steps)
-
-    if steps is None:
-      raise ValueError('Evaluate `steps` must be set on TPU. Cannot be `None`.')
-
-    util_lib.check_positive_integer(steps, 'Eval steps')
-
-    return [
-        evaluation._StopAfterNEvalsHook(  # pylint: disable=protected-access
-            num_evals=steps),
-        _SetEvalIterationsHook(steps)
-    ]
-
-  def _call_input_fn(self, input_fn, mode):
-    """Calls the input function.
-
-    Args:
-      input_fn: The input function.
-      mode: ModeKeys
-
-    Returns:
-      In TPU mode, returns an input_fn to be called later in model_fn.
-      Otherwise, calls the input_fn and returns either fatures or
-        (features, labels).
-
-    Raises:
-      ValueError: if input_fn takes invalid arguments or does not have `params`.
-    """
-    input_fn_args = function_utils.fn_args(input_fn)
-    config = self.config  # a deep copy.
-    kwargs = {}
-    if 'params' in input_fn_args:
-      kwargs['params'] = self.params  # a deep copy.
-    else:
-      raise ValueError('input_fn ({}) does not include params argument, '
-                       'required by TPUEstimator to pass batch size as '
-                       'params["batch_size"]'.format(input_fn))
-    if 'config' in input_fn_args:
-      kwargs['config'] = config
-
-    if 'mode' in input_fn_args:
-      kwargs['mode'] = mode
-
-    # Records the fact input_fn has been invoked.
-    self._is_input_fn_invoked = True
-
-    with self._ctx.with_mode(mode) as ctx:
-      # Setting the batch size in params first. This helps user to have same
-      # input_fn for use_tpu=True/False.
-      batch_size_for_input_fn = ctx.batch_size_for_input_fn
-      if batch_size_for_input_fn is not None:
-        _add_item_to_params(kwargs['params'], _BATCH_SIZE_KEY,
-                            batch_size_for_input_fn)
-
-      # For export_savedmodel, input_fn is never passed to Estimator. So,
-      # `is_export_mode` must be False.
-      if ctx.is_running_on_cpu(is_export_mode=False):
-        with ops.device('/device:CPU:0'):
-          return input_fn(**kwargs)
-
-      # For TPU computation, input_fn should be invoked in a tf.while_loop for
-      # performance. While constructing the tf.while_loop, the structure of
-      # inputs returned by the `input_fn` needs to be recorded. The structure
-      # includes whether features or labels is dict or single Tensor, dict keys,
-      # tensor shapes, and dtypes. The recorded structure is used to create the
-      # infeed dequeue ops, which must be wrapped and passed as a Fn, called
-      # inside the TPU computation, as the TPU computation is wrapped inside a
-      # tf.while_loop also. So, we either pass input_fn to model_fn or pass
-      # dequeue_fn to model_fn. Here, `input_fn` is passed directly as
-      # `features` in `model_fn` signature.
-      def _input_fn(ctx):
-        _add_item_to_params(kwargs['params'], _CTX_KEY, ctx)
-        return input_fn(**kwargs)
-
-      return _input_fn
-
-  def _validate_features_in_predict_input(self, result):
-    """Skip the validation.
-
-    For TPUEstimator, we do not need to check the result type. `_InputPipeline`
-    has stronger check. Parent class's check generates confusing warning msg.
-
-    Args:
-      result: `features` returned by input_fn.
-    """
-    pass
-
-  def train(self,
-            input_fn,
-            hooks=None,
-            steps=None,
-            max_steps=None,
-            saving_listeners=None):
-    rendezvous = error_handling.ErrorRendezvous(num_sources=3)
-    self._rendezvous[model_fn_lib.ModeKeys.TRAIN] = rendezvous
-    try:
-      return super(TPUEstimator, self).train(
-          input_fn=input_fn,
-          hooks=hooks,
-          steps=steps,
-          max_steps=max_steps,
-          saving_listeners=saving_listeners)
-    except Exception:  # pylint: disable=broad-except
-      rendezvous.record_error('training_loop', sys.exc_info())
-    finally:
-      rendezvous.record_done('training_loop')
-      rendezvous.raise_errors()
-
-  def evaluate(self,
-               input_fn,
-               steps=None,
-               hooks=None,
-               checkpoint_path=None,
-               name=None):
-    rendezvous = error_handling.ErrorRendezvous(num_sources=3)
-    self._rendezvous[model_fn_lib.ModeKeys.EVAL] = rendezvous
-    try:
-      return super(TPUEstimator, self).evaluate(
-          input_fn,
-          steps=steps,
-          hooks=hooks,
-          checkpoint_path=checkpoint_path,
-          name=name)
-    except Exception:  # pylint: disable=broad-except
-      rendezvous.record_error('evaluation_loop', sys.exc_info())
-    finally:
-      rendezvous.record_done('evaluation_loop')
-      rendezvous.raise_errors()
-
-  def predict(self,
-              input_fn,
-              predict_keys=None,
-              hooks=None,
-              checkpoint_path=None,
-              yield_single_examples=True):
-    rendezvous = error_handling.ErrorRendezvous(num_sources=3)
-    self._rendezvous[model_fn_lib.ModeKeys.PREDICT] = rendezvous
-    try:
-      for result in super(TPUEstimator, self).predict(
-          input_fn=input_fn,
-          predict_keys=predict_keys,
-          hooks=hooks,
-          checkpoint_path=checkpoint_path,
-          yield_single_examples=yield_single_examples):
-        yield result
-    except Exception:  # pylint: disable=broad-except
-      rendezvous.record_error('prediction_loop', sys.exc_info())
-    finally:
-      rendezvous.record_done('prediction_loop')
-      rendezvous.raise_errors()
-
-    rendezvous.record_done('prediction_loop')
-    rendezvous.raise_errors()
-
-  def _augment_model_fn(self, model_fn, batch_axis):
-    """Returns a new model_fn, which wraps the TPU support."""
-
-    def _model_fn(features, labels, mode, config, params):
-      """A Estimator `model_fn` for TPUEstimator."""
-      with self._ctx.with_mode(mode) as ctx:
-        model_fn_wrapper = _ModelFnWrapper(model_fn, config, params, ctx)
-
-        # `input_fn` is called in `train()`, `evaluate()`, and `predict()`,
-        # but not in `export_savedmodel()`.
-        if self._is_input_fn_invoked:
-          is_export_mode = False
-        else:
-          is_export_mode = True
-
-        # Clear the bit.
-        self._is_input_fn_invoked = None
-
-        # examples_hook is added to training_hooks for both CPU and TPU
-        # execution.
-        if self._log_every_n_steps is not None:
-          examples_hook = ExamplesPerSecondHook(
-              ctx.global_batch_size,
-              output_dir=self.model_dir,
-              every_n_steps=self._log_every_n_steps)
-
-        if ctx.is_running_on_cpu(is_export_mode=is_export_mode):
-          logging.info('Running %s on CPU', mode)
-          estimator_spec = model_fn_wrapper.call_without_tpu(
-              features, labels, is_export_mode=is_export_mode)
-          if self._log_every_n_steps is not None:
-            estimator_spec = estimator_spec._replace(
-                training_hooks=estimator_spec.training_hooks + (examples_hook,))
-          return estimator_spec
-
-        assert labels is None, '`labels` passed to `model_fn` must be `None`.'
-        # TPUEstimator._call_input_fn passes `input_fn` as features to here.
-        assert callable(features), '`input_fn` is not callable.'
-        input_fn = features
-
-        input_holders = _InputPipeline(input_fn, batch_axis, ctx)
-        enqueue_ops, dequeue_fn, input_hooks, run_infeed_loop_on_coordinator = (
-            input_holders.generate_infeed_enqueue_ops_and_dequeue_fn())
-
-        graph = ops.get_default_graph()
-        for enqueue_op in enqueue_ops:
-          if isinstance(enqueue_op, list):
-            graph.get_collection_ref(_TPU_ENQUEUE_OPS).extend(enqueue_op)
-          else:
-            graph.add_to_collection(_TPU_ENQUEUE_OPS, enqueue_op)
-
-        if mode == model_fn_lib.ModeKeys.TRAIN:
-          loss, host_call, scaffold, training_hooks = (
-              _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn))
-          host_ops = host_call.create_tpu_hostcall()
-          if host_ops is None:
-            host_ops = []
-
-          shutdown_hooks = []
-          shutdown_mode = os.environ.get('TF_TPU_GRACEFUL_SHUTDOWN_MODE',
-                                         'shutdown_worker')
-          if shutdown_mode:
-            if shutdown_mode == 'shutdown_worker':
-              finalizer_hooks = [
-                  session_support.ShutdownLameWorkers(timeout_ms=60 * 1000),
-              ]
-            elif shutdown_mode == 'shutdown_computation':
-              finalizer_hooks = [
-                  session_support.RestartComputation(timeout_ms=60 * 1000),
-              ]
-            else:
-              raise ValueError(
-                  'Unknown TF_TPU_GRACEFUL_SHUTDOWN_MODE "%s"' % shutdown_mode)
-
-            shutdown_hooks.append(
-                session_support.GracefulShutdownHook(
-                    checkpoint_prefix=self.model_dir + '/model.ckpt',
-                    on_shutdown_hooks=finalizer_hooks))
-
-          with ops.control_dependencies([loss]):
-            global_step = array_ops.identity(training.get_global_step())
-          hooks = input_hooks + shutdown_hooks
-          hooks.extend([
-              TPUInfeedOutfeedSessionHook(
-                  ctx,
-                  enqueue_ops,
-                  host_ops,
-                  run_infeed_loop_on_coordinator=(
-                      run_infeed_loop_on_coordinator),
-                  rendezvous=self._rendezvous[mode],
-                  master=self._config.master,
-                  session_config=self._session_config,
-              ),
-              InstallSignalHandlerHook()
-          ])
-          if self._log_every_n_steps is not None:
-            logging_hook_frequency = (  # Divide and round up
-                (self._log_every_n_steps +
-                 self._config.tpu_config.iterations_per_loop - 1) //
-                self._config.tpu_config.iterations_per_loop)
-            hooks.append(
-                training.LoggingTensorHook({
-                    'loss': array_ops.identity(loss),
-                    'step': global_step,
-                },
-                                           every_n_iter=logging_hook_frequency))
-            examples_hook._set_steps_per_run(  # pylint: disable=protected-access
-                self._config.tpu_config.iterations_per_loop)
-            hooks.append(examples_hook)
-
-          if training_hooks:
-            hooks.extend(training_hooks)
-
-          chief_hooks = []
-          if (self._config.save_checkpoints_secs or
-              self._config.save_checkpoints_steps):
-            checkpoint_hook = training.CheckpointSaverHook(
-                self.model_dir,
-                save_secs=self._config.save_checkpoints_secs,
-                save_steps=self._config.save_checkpoints_steps,
-                scaffold=scaffold)
-            checkpoint_hook._set_steps_per_run(  # pylint: disable=protected-access
-                self._config.tpu_config.iterations_per_loop)
-            chief_hooks.append(checkpoint_hook)
-
-          summary.scalar(model_fn_lib.LOSS_METRIC_KEY, loss)
-          with ops.control_dependencies([loss]):
-            update_ops = _sync_variables_ops(ctx)
-
-          # Validate the TPU training graph to catch basic errors
-          _validate_tpu_training_graph()
-
-          train_op = control_flow_ops.group(*update_ops)
-          graph.add_to_collection(_TPU_TRAIN_OP, train_op)
-
-          return model_fn_lib.EstimatorSpec(
-              mode,
-              loss=loss,
-              training_chief_hooks=chief_hooks,
-              training_hooks=hooks,
-              train_op=train_op,
-              scaffold=scaffold)
-
-        if mode == model_fn_lib.ModeKeys.EVAL:
-          total_loss, host_calls, scaffold, eval_hooks = _eval_on_tpu_system(
-              ctx, model_fn_wrapper, dequeue_fn)
-          iterations_per_loop_var = _create_or_get_iterations_per_loop()
-          mean_loss = math_ops.div(
-              total_loss,
-              math_ops.cast(iterations_per_loop_var, dtype=total_loss.dtype))
-
-          with ops.control_dependencies([mean_loss]):
-            # After TPU evaluation computation is done (the mean_loss tensor),
-            # reads all variables back from TPU and updates the eval step
-            # counter properly
-            internal_ops_to_run = _sync_variables_ops(ctx)
-            internal_ops_to_run.append(
-                _increase_eval_step_op(iterations_per_loop_var))
-
-          host_call_ret = host_calls.create_tpu_hostcall()
-          eval_metric_ops = {}
-          eval_update_ops = []
-
-          eval_metrics = host_call_ret.get('eval_metrics', {})
-          if eval_metrics:
-            # Creates a dummy metric update_op for all metrics. Estimator
-            # expects all metrics in `eval_metric_ops` have update_op and calls
-            # them one by one. The real metric update_ops are invoked in a
-            # separated thread. So, here give Estimator the dummy op for all
-            # metrics.
-            with ops.control_dependencies(internal_ops_to_run):
-              dummy_update_op = control_flow_ops.no_op()
-
-            for k, v in eval_metrics.items():
-              eval_metric_ops[k] = (v[0], dummy_update_op)
-              eval_update_ops.append(v[1])
-          else:
-            # If no eval metrics are passed, create an identity node for the
-            # loss and add `internal_ops_to_run` to its dependencies. So
-            # `internal_ops_to_run` can be executed.
-            with ops.control_dependencies(internal_ops_to_run):
-              mean_loss = array_ops.identity(mean_loss)
-
-          if 'host_call' not in host_call_ret:
-            host_ops = []
-          else:
-            host_ops = host_call_ret['host_call']
-          hooks = [
-              TPUInfeedOutfeedSessionHook(
-                  ctx,
-                  enqueue_ops,
-                  eval_update_ops + host_ops,
-                  run_infeed_loop_on_coordinator=(
-                      run_infeed_loop_on_coordinator),
-                  rendezvous=self._rendezvous[mode],
-                  master=self._config.evaluation_master,
-                  session_config=self._session_config,
-              )] + input_hooks
-
-          if eval_hooks:
-            hooks.extend(eval_hooks)
-
-          return model_fn_lib.EstimatorSpec(
-              mode,
-              loss=mean_loss,
-              evaluation_hooks=hooks,
-              eval_metric_ops=eval_metric_ops,
-              scaffold=scaffold)
-
-        # Predict
-        assert mode == model_fn_lib.ModeKeys.PREDICT
-
-        (dummy_predict_op, host_calls,
-         scaffold, prediction_hooks) = _predict_on_tpu_system(
-             ctx, model_fn_wrapper, dequeue_fn)
-        with ops.control_dependencies([dummy_predict_op]):
-          internal_ops_to_run = _sync_variables_ops(ctx)
-          with ops.control_dependencies(internal_ops_to_run):
-            dummy_predict_op = control_flow_ops.no_op()
-
-        # In train and evaluation, the main TPU program is passed to monitored
-        # training session to run. Infeed enqueue and outfeed dequeue are
-        # executed in side threads. This is not the configuration for
-        # prediction mode.
-        #
-        # For prediction, the Estimator executes the EstimatorSpec.predictions
-        # directly and yield the element (via generator) to call site. So, the
-        # outfeed based prediction must be passed to MonitoredSession directly.
-        # Other parts of the TPU execution are organized as follows.
-        #
-        # 1. All outfeed based Tensors must be grouped with predictions Tensors
-        #    to form a single invocation. This avoid the issue we might trigger
-        #    multiple outfeeds incorrectly. To achieve this, `host_call` is
-        #    placed in control_dependencies of `stopping_signals`, and
-        #    `stopping_signals` is passed into _StoppingPredictHook, which sets
-        #    the `stopping_signals` as SessionRunArgs. MonitoredSession merges
-        #    all SessionRunArgs with the fetch in session.run together.
-        #
-        # 2. The TPU program (dummy_predict_op) and enqueue_ops (infeed Enqueue)
-        #    are grouped together. They will be launched once and only once in
-        #    side threads and they quit naturally according to the SAME stopping
-        #    condition.
-        enqueue_ops.append(dummy_predict_op)
-
-        host_call_ret = host_calls.create_tpu_hostcall()
-        if 'host_call' not in host_call_ret:
-          host_ops = []
-        else:
-          host_ops = host_call_ret['host_call']
-
-        predictions = host_call_ret['predictions']
-        _verify_cross_hosts_transfer_size(
-            predictions,
-            message=(
-                'The estimated size for TPUEstimatorSpec.predictions is too '
-                'large.'))
-        signals = host_call_ret['signals']
-
-        with ops.control_dependencies(host_ops):
-          host_ops = []  # Empty, we do do not need it anymore.
-          scalar_stopping_signal = _StopSignals.as_scalar_stopping_signal(
-              signals)
-          predictions = _PaddingSignals.slice_tensor_or_dict(
-              predictions, signals)
-
-        hooks = [
-            _StoppingPredictHook(scalar_stopping_signal),
-            TPUInfeedOutfeedSessionHookForPrediction(
-                ctx, enqueue_ops, host_ops, rendezvous=self._rendezvous[mode],
-                master=self._config.master,
-                session_config=self._session_config),
-        ] + input_hooks
-
-        if prediction_hooks:
-          hooks.extend(prediction_hooks)
-
-        return model_fn_lib.EstimatorSpec(
-            mode,
-            prediction_hooks=hooks,
-            predictions=predictions,
-            scaffold=scaffold)
-
-    return _model_fn
-
-
-def _is_tpu_tensor(tensor):
-  if not isinstance(tensor, ops.Tensor):
-    return False
-  try:
-    tensor.op.get_attr(tpu._OUTSIDE_COMPILATION_ATTR)  # pylint: disable=protected-access
-  except ValueError:
-    return True
-  else:
-    return False
-
-
-def _export_output_to_tensors(export_output):
-  """Get a list of `Tensors` used in `export_output`.
-
-  Args:
-    export_output: an `ExportOutput` object such as `ClassificationOutput`,
-      `RegressionOutput`, or `PredictOutput`.
-
-  Returns:
-    a list of tensors used in export_output.
-
-  Raises:
-    ValueError: if `export_output` is not one of `ClassificationOutput`,
-        `RegressionOutput`, or `PredictOutput`.
-  """
-  if isinstance(export_output, export_output_lib.ClassificationOutput):
-    return [export_output.scores, export_output.classes]
-  elif isinstance(export_output, export_output_lib.RegressionOutput):
-    return [export_output.value]
-  elif isinstance(export_output, export_output_lib.PredictOutput):
-    return list(export_output.outputs.values())
-  else:
-    raise ValueError(
-        '`export_output` must be have type `ClassificationOutput`, '
-        '`RegressionOutput`, or `PredictOutput`; got {}.'.format(export_output))
-
-
-def _clone_export_output_with_tensors(export_output, tensors):
-  """Clones `export_output` but with new `tensors`.
-
-  Args:
-    export_output: an `ExportOutput` object such as `ClassificationOutput`,
-      `RegressionOutput`, or `PredictOutput`.
-    tensors: a list of `Tensors` used to construct a new `export_output`.
-
-  Returns:
-    A dict similar to `export_output` but with `tensors`.
-
-  Raises:
-    ValueError: if `export_output` is not one of `ClassificationOutput`,
-        `RegressionOutput`, or `PredictOutput`.
-  """
-  if isinstance(export_output, export_output_lib.ClassificationOutput):
-    if len(tensors) != 2:
-      raise ValueError('tensors must be of length 2; '
-                       'got {}.'.format(len(tensors)))
-    return export_output_lib.ClassificationOutput(*tensors)
-  elif isinstance(export_output, export_output_lib.RegressionOutput):
-    if len(tensors) != 1:
-      raise ValueError('tensors must be of length 1; '
-                       'got {}'.format(len(tensors)))
-    return export_output_lib.RegressionOutput(*tensors)
-  elif isinstance(export_output, export_output_lib.PredictOutput):
-    return export_output_lib.PredictOutput(
-        dict(zip(export_output.outputs.keys(), tensors)))
-  else:
-    raise ValueError(
-        '`export_output` must be have type `ClassificationOutput`, '
-        '`RegressionOutput`, or `PredictOutput`; got {}.'.format(export_output))
-
-
-def _eval_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
-  """Executes `model_fn_wrapper` multiple times on all TPU shards."""
-  iterations_per_loop_var = _create_or_get_iterations_per_loop()
-
-  (single_tpu_eval_step, host_calls, captured_scaffold_fn, captured_eval_hooks
-  ) = model_fn_wrapper.convert_to_single_tpu_eval_step(dequeue_fn)
-
-  def multi_tpu_eval_steps_on_single_shard():
-    return training_loop.repeat(iterations_per_loop_var, single_tpu_eval_step,
-                                [_ZERO_LOSS])
-
-  (loss,) = tpu.shard(
-      multi_tpu_eval_steps_on_single_shard,
-      inputs=[],
-      num_shards=ctx.num_replicas,
-      outputs_from_all_shards=False,
-      device_assignment=ctx.device_assignment)
-
-  scaffold = _get_scaffold(captured_scaffold_fn)
-  return loss, host_calls, scaffold, captured_eval_hooks.get()
-
-
-def _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
-  """Executes `model_fn_wrapper` multiple times on all TPU shards."""
-  iterations_per_loop_var = _create_or_get_iterations_per_loop()
-
-  (single_tpu_train_step, host_call, captured_scaffold_fn,
-   captured_training_hooks) = (
-       model_fn_wrapper.convert_to_single_tpu_train_step(dequeue_fn))
-
-  def multi_tpu_train_steps_on_single_shard():
-    return training_loop.repeat(iterations_per_loop_var, single_tpu_train_step,
-                                [_INITIAL_LOSS])
-
-  (loss,) = tpu.shard(
-      multi_tpu_train_steps_on_single_shard,
-      inputs=[],
-      num_shards=ctx.num_replicas,
-      outputs_from_all_shards=False,
-      device_assignment=ctx.device_assignment)
-
-  scaffold = _get_scaffold(captured_scaffold_fn)
-  return loss, host_call, scaffold, captured_training_hooks.get()
-
-
-def _predict_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
-  """Executes `model_fn_wrapper` multiple times on all TPU shards."""
-  (single_tpu_predict_step, host_calls, captured_scaffold_fn,
-   captured_predict_hooks
-  ) = model_fn_wrapper.convert_to_single_tpu_predict_step(dequeue_fn)
-
-  def multi_tpu_predict_steps_on_single_shard():
-
-    def cond(scalar_stopping_signal):
-      return math_ops.logical_not(
-          _StopSignals.should_stop(scalar_stopping_signal))
-
-    inputs = [_StopSignals.NON_STOPPING_SIGNAL]
-    outputs = training_loop.while_loop(
-        cond, single_tpu_predict_step, inputs=inputs, name=b'loop')
-    return outputs
-
-  (dummy_predict_op,) = tpu.shard(
-      multi_tpu_predict_steps_on_single_shard,
-      inputs=[],
-      num_shards=ctx.num_replicas,
-      outputs_from_all_shards=False,
-      device_assignment=ctx.device_assignment)
-
-  scaffold = _get_scaffold(captured_scaffold_fn)
-  return dummy_predict_op, host_calls, scaffold, captured_predict_hooks.get()
-
-
-def _wrap_computation_in_while_loop(device, op_fn):
-  """Wraps the ops generated by `op_fn` in tf.while_loop."""
-
-  def computation(i):
-    with ops.control_dependencies(op_fn()):
-      return i + 1
-
-  iterations_per_loop_var = _create_or_get_iterations_per_loop()
-  # By setting parallel_iterations=1, the parallel execution in while_loop is
-  # basically turned off.
-  with ops.device(device):
-    iterations = array_ops.identity(iterations_per_loop_var)
-    return control_flow_ops.while_loop(
-        lambda i: i < iterations,
-        computation, [constant_op.constant(0)],
-        parallel_iterations=1)
-
-
-def _wrap_computation_in_while_loop_with_stopping_signals(device, op_fn):
-  """Wraps the ops generated by `op_fn` in tf.while_loop."""
-
-  def cond(scalar_stopping_signal):
-    return math_ops.logical_not(
-        _StopSignals.should_stop(scalar_stopping_signal))
-
-  def computation(unused_scalar_stopping_signal):
-    return_value = op_fn()
-    execute_ops = return_value['ops']
-    signals = return_value['signals']
-    with ops.control_dependencies(execute_ops):
-      return _StopSignals.as_scalar_stopping_signal(signals)
-
-  # By setting parallel_iterations=1, the parallel execution in while_loop is
-  # basically turned off.
-  with ops.device(device):
-    return control_flow_ops.while_loop(
-        cond,
-        computation, [_StopSignals.NON_STOPPING_SIGNAL],
-        parallel_iterations=1)
-
-
-def _validate_tpu_training_graph():
-  """Validate graph before running distributed training.
-
-  Raises:
-    ValueError: If the graph seems invalid for running on device
-  """
-  operations = ops.get_default_graph().get_operations()
-
-  # Check if there is atleast one CrossReplicaSum operation in the graph
-  # This should be introduced by using the CrossShardOptimizer wrapper
-  cross_replica_sum_ops = [
-      o for o in operations if o.type == _CROSS_REPLICA_SUM_OP
-  ]
-  if not cross_replica_sum_ops:
-    raise ValueError(
-        'CrossShardOptimizer must be used for model training on TPUs.')
-
-
-class _CapturedObject(object):
-  """A placeholder to capture an object.
-
-  This is useful when we need to capture a Python object in the Tensorflow
-  control flow body function and use it outside the control flow.
-  """
-
-  def __init__(self):
-    self._object = None
-    self._captured = False
-
-  def capture(self, o):
-    if self._captured:
-      raise RuntimeError(
-          'InternalError: Object can capture only once. Please file bug.')
-
-    self._captured = True
-    self._object = o
-
-  def get(self):
-    if not self._captured:
-      raise RuntimeError(
-          'InternalError: Object is not captured properly before `get`. '
-          'Please file bug.')
-    return self._object
-
-
-def _get_scaffold(captured_scaffold_fn):
-  """Retrieves the Scaffold from `captured_scaffold_fn`."""
-  with _CapturingContext(message='Inside scaffold_fn'):
-    scaffold_fn = captured_scaffold_fn.get()
-    if scaffold_fn:
-      scaffold = scaffold_fn()
-      if scaffold is None:
-        raise ValueError(
-            'TPUEstimatorSpec.scaffold_fn returns None, which is not allowed')
-    else:
-      scaffold = None
-
-  if scaffold:
-    wrapped_finalize = scaffold.finalize
-
-    def _finalize():
-      with _CapturingContext('Inside Scaffold.finalize'):
-        wrapped_finalize()
-
-    scaffold.finalize = _finalize
-  return scaffold
-
-
-class _CapturingContext(control_flow_ops.ControlFlowContext):
-  """Tracks references to Tensors defined in TPU replication."""
-
-  def __init__(self, message):
-    control_flow_ops.ControlFlowContext.__init__(self)
-    self._message = message
-
-  def to_control_flow_context_def(self, context_def, export_scope=None):
-    # pylint: disable=useless-super-delegation
-    # NOTE(slebedev): the method is required by `ControlFlowContext`.
-    super(_CapturingContext, self).to_control_flow_context_def(
-        context_def, export_scope)
-
-  def AddOp(self, op):  # pylint: disable=invalid-name
-    for c in op.inputs:
-      if tpu._TPU_REPLICATE_ATTR in c.op.node_def.attr:  # pylint: disable=protected-access
-        raise ValueError('{}: Op {} depends on TPU computation {}, '
-                         'which is not allowed.'.format(self._message, op, c))
-
-  def __enter__(self):
-    # pylint: disable=protected-access
-    self._g = ops.get_default_graph()
-    self._old = self._g._get_control_flow_context()
-    self._g._set_control_flow_context(self)
-    # pylint: enable=protected-access
-
-  def __exit__(self, _, __, ___):  # pylint: disable=invalid-name
-    self._g._set_control_flow_context(self._old)  # pylint: disable=protected-access
-
-
-class _Inputs(object):
-  """A data structure representing the input_fn returned values.
-
-  This also supports the returned value from input_fn as `Dataset`.
-  """
-
-  def __init__(self, features=None, labels=None, dataset=None, signals=None):
-    if dataset is not None and (features is not None or labels is not None or
-                                signals is not None):
-      raise RuntimeError('Internal Error: Either (features and labels) or '
-                         'dataset should be provided, not both. Please file '
-                         'bug')
-
-    self._features = features
-    self._labels = labels
-    self._signals = signals
-
-    self._dataset = dataset
-    self._iterator = None
-
-  @staticmethod
-  def from_input_fn(return_values):
-    """Returns an `_Inputs` instance according to `input_fn` return value."""
-    if isinstance(return_values, dataset_ops.DatasetV2):
-      dataset = return_values
-      return _Inputs(dataset=dataset)
-
-    features, labels = _Inputs._parse_inputs(return_values)
-    return _Inputs(features, labels)
-
-  @staticmethod
-  def _parse_inputs(return_values):
-    if isinstance(return_values, tuple):
-      features, labels = return_values
-    else:
-      features, labels = return_values, None
-    return features, labels
-
-  @property
-  def is_dataset(self):
-    """Returns True if the return value from input_fn is Dataset."""
-    return self._dataset is not None
-
-  def dataset_initializer(self):
-    """Returns the dataset's initializer.
-
-    The initializer must be run before calling `features_and_labels`.
-    """
-    self._iterator = dataset_ops.make_initializable_iterator(self._dataset)
-    return self._iterator.initializer
-
-  def features_and_labels(self):
-    """Gets `features` and `labels`."""
-    if self.is_dataset:
-      if self._iterator is None:
-        raise RuntimeError('Internal error: Must run dataset_initializer '
-                           'before calling features_and_labels(). Please file '
-                           'a bug!')
-      return _Inputs._parse_inputs(self._iterator.get_next())
-
-    return (self._features, self._labels)
-
-  def signals(self):
-    return self._signals
-
-  @property
-  def dataset(self):
-    return self._dataset
-
-
-class _InputsWithStoppingSignals(_Inputs):
-  """Inputs with `_StopSignals` inserted into the dataset."""
-
-  def __init__(self,
-               dataset,
-               batch_size,
-               add_padding=False,
-               num_invocations_per_step=1):
-
-    assert dataset is not None
-    user_provided_dataset = dataset.map(
-        _InputsWithStoppingSignals.insert_stopping_signal(
-            stop=False, batch_size=batch_size, add_padding=add_padding))
-    if num_invocations_per_step == 1:
-      final_batch_dataset = dataset.take(1).map(
-          _InputsWithStoppingSignals.insert_stopping_signal(
-              stop=True, batch_size=batch_size, add_padding=add_padding))
-    else:
-      # We append (2 * num_invocations_per_step - 1) batches for exhausting the
-      # user_provided_dataset and stop properly.
-      # For example, if num_invocations_per_step is 2, we append 3 additional
-      # padding batches: b1, b2, b3.
-      # If user_provided_dataset contains two batches: a1, a2
-      # Step 1: [a1, a2]
-      # Step 2: [b1, b2] -> STOP
-      # If user_provided_dataset contains three batches: a1, a2, a3.
-      # The training loops:
-      # Step 1: [a1, a2]
-      # Step 2: [a3, b1]
-      # Step 3: [b2, b3] -> STOP.
-      final_batch_dataset = dataset.take(1).map(
-          _InputsWithStoppingSignals.insert_stopping_signal(
-              stop=True, batch_size=batch_size, add_padding=add_padding))
-      final_batch_dataset = final_batch_dataset.repeat(
-          2 * num_invocations_per_step - 1)
-
-      def _set_mask(data_dict):
-        signals = data_dict['signals']
-        signals['padding_mask'] = array_ops.ones_like(signals['padding_mask'])
-        data_dict['signals'] = signals
-        return data_dict
-
-      # Mask out the extra batch.
-      final_batch_dataset = final_batch_dataset.map(_set_mask)
-
-    dataset = user_provided_dataset.concatenate(final_batch_dataset).prefetch(2)
-
-    super(_InputsWithStoppingSignals, self).__init__(dataset=dataset)
-    self._current_inputs = None
-
-  def features_and_labels(self):
-    if self._current_inputs is not None:
-      raise RuntimeError(
-          'Internal Error: The previous inputs have not been properly '
-          'consumed. First call features_and_labels, then call signals.')
-
-    inputs_with_signals = self._iterator.get_next()
-    features = inputs_with_signals['features']
-    labels = inputs_with_signals.get('labels')
-
-    self._current_inputs = inputs_with_signals
-    return features, labels
-
-  def signals(self):
-    """Returns the `Signals` from `_Inputs`."""
-    if self._current_inputs is None:
-      raise RuntimeError(
-          'Internal Error: The current inputs have not been properly '
-          'generated. First call features_and_labels, then call signals.')
-    signals = self._current_inputs['signals']
-    self._current_inputs = None
-    return signals
-
-  @staticmethod
-  def insert_stopping_signal(stop, batch_size, add_padding=False):
-    """Inserts stopping_signal into dataset via _map_fn.
-
-    Here we change the data structure in the dataset, such that the return value
-    is a dictionary now and `features`, `labels`, and `signals` are three
-    distinguished keys in that dict. This provides a better structure, which
-    eases the process to decompose the inputs (see `features_and_labels`).
-
-    Args:
-      stop: bool, state of current stopping signals.
-      batch_size: int, batch size.
-      add_padding: bool, whether to pad the tensor to full batch size.
-
-    Returns:
-      A map_fn passed to dataset.map API.
-    """
-
-    def _map_fn(*args):
-      """The map fn to insert signals."""
-      if len(args) == 1:
-        # Unpack the single Tensor/dict argument as features. This is required
-        # for the input_fn returns no labels.
-        args = args[0]
-      features, labels = _Inputs._parse_inputs(args)
-      new_input_dict = {}
-
-      if add_padding:
-        padding_mask, features, labels = (
-            _PaddingSignals.pad_features_and_labels(features, labels,
-                                                    batch_size))
-
-        new_input_dict['features'] = features
-        if labels is not None:
-          new_input_dict['labels'] = labels
-
-      else:
-        new_input_dict['features'] = features
-        if labels is not None:
-          new_input_dict['labels'] = labels
-        padding_mask = None
-
-      new_input_dict['signals'] = _StopSignals(
-          stop=stop, batch_size=batch_size,
-          padding_mask=padding_mask).as_dict()
-
-      return new_input_dict
-
-    return _map_fn
-
-
-class _StopSignals(object):
-  """Signals class holding all logic to handle TPU stopping condition."""
-
-  NON_STOPPING_SIGNAL = False
-  STOPPING_SIGNAL = True
-
-  def __init__(self, stop, batch_size, padding_mask=None):
-    self._stop = stop
-    self._batch_size = batch_size
-    self._padding_mask = padding_mask
-
-  def as_dict(self):
-    """Returns the signals as Python dict."""
-    shape = [self._batch_size, 1]
-    dtype = dtypes.bool
-
-    if self._stop:
-      stopping = array_ops.ones(shape=shape, dtype=dtype)
-    else:
-      stopping = array_ops.zeros(shape=shape, dtype=dtype)
-
-    signals = {'stopping': stopping}
-    if self._padding_mask is not None:
-      signals['padding_mask'] = self._padding_mask
-    return signals
-
-  @staticmethod
-  def as_scalar_stopping_signal(signals):
-    return array_ops.identity(signals['stopping'][0][0])
-
-  @staticmethod
-  def should_stop(scalar_stopping_signal):
-    """Detects whether scalar_stopping_signal indicates stopping."""
-    if isinstance(scalar_stopping_signal, ops.Tensor):
-      # STOPPING_SIGNAL is a constant True. Here, the logical_and is just the TF
-      # way to express the bool check whether scalar_stopping_signal is True.
-      return math_ops.logical_and(scalar_stopping_signal,
-                                  _StopSignals.STOPPING_SIGNAL)
-    else:
-      # For non Tensor case, it is used in SessionRunHook. So, we cannot modify
-      # the graph anymore. Here, we use pure Python.
-      return bool(scalar_stopping_signal)
-
-
-class _PaddingSignals(object):
-  """Signals class holding all logic to handle padding."""
-
-  @staticmethod
-  def pad_features_and_labels(features, labels, batch_size):
-    """Pads out the batch dimension of features and labels."""
-    real_batch_size = array_ops.shape(
-        _PaddingSignals._find_any_tensor(features))[0]
-
-    batch_size_tensor = constant_op.constant(batch_size, dtypes.int32)
-
-    check_greater = check_ops.assert_greater_equal(
-        batch_size_tensor,
-        real_batch_size,
-        data=(batch_size_tensor, real_batch_size),
-        message='The real batch size should not be greater than batch_size.')
-
-    with ops.control_dependencies([check_greater]):
-      missing_count = batch_size_tensor - real_batch_size
-
-    def pad_single_tensor(tensor):
-      """Pads out the batch dimension of a tensor to the complete batch_size."""
-      rank = len(tensor.shape)
-      assert rank > 0
-      padding = array_ops.stack([[0, missing_count]] + [[0, 0]] * (rank - 1))
-      padded_shape = (batch_size,) + tuple(tensor.shape[1:])
-      padded_tensor = array_ops.pad(tensor, padding)
-      padded_tensor.set_shape(padded_shape)
-      return padded_tensor
-
-    def nest_pad(tensor_or_dict):
-      return nest.map_structure(pad_single_tensor, tensor_or_dict)
-
-    features = nest_pad(features)
-    if labels is not None:
-      labels = nest_pad(labels)
-
-    padding_mask = _PaddingSignals._padding_mask(real_batch_size, missing_count,
-                                                 batch_size)
-
-    return padding_mask, features, labels
-
-  @staticmethod
-  def slice_tensor_or_dict(tensor_or_dict, signals):
-    """Slice the real Tensors according to padding mask in signals."""
-
-    padding_mask = signals['padding_mask']
-    batch_size = array_ops.shape(padding_mask)[0]
-
-    def verify_batch_size(tensor):
-      check_batch_size = math_ops.equal(batch_size, tensor.shape[0])
-      with ops.control_dependencies([check_batch_size]):
-        return array_ops.identity(tensor)
-
-    def slice_single_tensor(tensor):
-      rank = len(tensor.shape)
-      assert rank > 0
-      real_batch_size = batch_size - math_ops.reduce_sum(padding_mask)
-      return verify_batch_size(tensor)[0:real_batch_size]
-
-    # As we split the Tensors to all TPU cores and concat them back, it is
-    # important to ensure the real data is placed before padded ones, i.e.,
-    # order is preserved. By that, the sliced padding mask should have all 0's.
-    # If this assertion failed, # the slice logic here would not hold.
-    sliced_padding_mask = slice_single_tensor(padding_mask)
-    assert_padding_mask = math_ops.equal(
-        math_ops.reduce_sum(sliced_padding_mask), 0)
-
-    with ops.control_dependencies([assert_padding_mask]):
-      should_stop = _StopSignals.should_stop(
-          _StopSignals.as_scalar_stopping_signal(signals))
-
-    is_full_batch = math_ops.equal(math_ops.reduce_sum(padding_mask), 0)
-
-    def slice_fn(tensor):
-      # If the current batch is full batch or part of stopping signals, we do
-      # not need to slice to save performance.
-      return control_flow_ops.cond(
-          math_ops.logical_or(should_stop, is_full_batch),
-          (lambda: verify_batch_size(tensor)),
-          (lambda: slice_single_tensor(tensor)))
-
-    return nest.map_structure(slice_fn, tensor_or_dict)
-
-  @staticmethod
-  def _find_any_tensor(batch_features):
-    tensors = [
-        x for x in nest.flatten(batch_features) if isinstance(x, ops.Tensor)
-    ]
-    if not tensors:
-      raise ValueError('Cannot find any Tensor in features dict.')
-    return tensors[0]
-
-  @staticmethod
-  def _padding_mask(real_batch_size, missing_count, batch_size):
-    padding_mask = array_ops.concat([
-        array_ops.zeros((real_batch_size,), dtype=dtypes.int32),
-        array_ops.ones((missing_count,), dtype=dtypes.int32)
-    ],
-                                    axis=0)
-    padding_mask.set_shape((batch_size,))
-    return padding_mask
-
-
-def _verify_cross_hosts_transfer_size(tensor_dict, message):
-  total_size = 0
-  tensor_structure = {}
-  for key, tensor in tensor_dict.items():
-    shape = tensor.shape
-    size = np.product(shape) * tensor.dtype.size
-    tensor_structure[key] = shape
-    total_size += size
-  if total_size >= _ONE_GIGABYTE:
-    raise ValueError(
-        '{} The transfer size is larger than the protobuf limit. Please '
-        'consider to use Tensors with smaller shapes or reduce batch '
-        'size. Given:\n'
-        '{}'.format(
-            message, '\n'.join([
-                ' -- Key: {}, Shape: {}'.format(k, v)
-                for k, v in tensor_structure.items()
-            ])))
-
-
-def _add_item_to_params(params, key, value):
-  """Adds a new item into `params`."""
-  if isinstance(params, hparam.HParams):
-    # For HParams, we need to use special API.
-    if key in params:
-      params.set_hparam(key, value)
-    else:
-      params.add_hparam(key, value)
-  else:
-    # Now params is Python dict.
-    params[key] = value
-
-
-def export_estimator_savedmodel(estimator,
-                                export_dir_base,
-                                serving_input_receiver_fn,
-                                assets_extra=None,
-                                as_text=False,
-                                checkpoint_path=None,
-                                strip_default_attrs=False):
-  """Export `Estimator` trained model for TPU inference.
-
-  Args:
-    estimator: `Estimator` with which model has been trained.
-    export_dir_base: A string containing a directory in which to create
-      timestamped subdirectories containing exported SavedModels.
-    serving_input_receiver_fn: A function that takes no argument and returns a
-      `ServingInputReceiver` or `TensorServingInputReceiver`.
-    assets_extra: A dict specifying how to populate the assets.extra directory
-      within the exported SavedModel, or `None` if no extra assets are needed.
-    as_text: whether to write the SavedModel proto in text format.
-    checkpoint_path: The checkpoint path to export.  If `None` (the default),
-      the most recent checkpoint found within the model directory is chosen.
-    strip_default_attrs: Boolean. If `True`, default-valued attributes will be
-      removed from the NodeDefs.
-
-  Returns:
-    The string path to the exported directory.
-  """
-  # `TPUEstimator` requires `tpu_config.RunConfig`, so we cannot use
-  # `estimator.config`.
-  config = tpu_config.RunConfig(model_dir=estimator.model_dir)
-  est = TPUEstimator(
-      estimator._model_fn,  # pylint: disable=protected-access
-      config=config,
-      params=estimator.params,
-      use_tpu=True,
-      train_batch_size=2048,  # Does not matter.
-      eval_batch_size=2048,  # Does not matter.
-  )
-  return est.export_savedmodel(export_dir_base, serving_input_receiver_fn,
-                               assets_extra, as_text, checkpoint_path,
-                               strip_default_attrs)
+# pylint: disable=wildcard-import,unused-import,redefined-builtin
+from tensorflow.python.tpu.tpu_estimator import *
+# used by tests
+from tensorflow.python.tpu.tpu_estimator import _clone_export_output_with_tensors
+from tensorflow.python.tpu.tpu_estimator import _create_global_step
+from tensorflow.python.tpu.tpu_estimator import _export_output_to_tensors
+from tensorflow.python.tpu.tpu_estimator import _get_scaffold
+from tensorflow.python.tpu.tpu_estimator import _Inputs
+from tensorflow.python.tpu.tpu_estimator import _ITERATIONS_PER_LOOP_VAR
+from tensorflow.python.tpu.tpu_estimator import _TPU_ENQUEUE_OPS
+from tensorflow.python.tpu.tpu_estimator import _TPU_ESTIMATOR
+from tensorflow.python.tpu.tpu_estimator import _TPU_TRAIN_OP
+# pylint: enable=wildcard-import,unused-import,redefined-builtin
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_feed.py b/tensorflow/contrib/tpu/python/tpu/tpu_feed.py
index d5957b7e8ec40b40c7af8822378cee6134ef0d0f..af2542ea85290170ce6a38223188c4f9b871f032 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_feed.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_feed.py
@@ -1,898 +1,25 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ===================================================================
-
-"""Helper library for handling infeed between hosts and TPUs.
-"""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import itertools
-
-import numpy as np
-from six.moves import xrange  # pylint: disable=redefined-builtin
-
-from tensorflow.compiler.xla.experimental.xla_sharding import xla_sharding
-from tensorflow.contrib.tpu.python.ops import tpu_ops
-from tensorflow.contrib.tpu.python.tpu import tpu
-from tensorflow.contrib.tpu.python.tpu import tpu_sharding
-
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import array_ops
-from tensorflow.python.util import nest
-
-
-class InfeedQueue(object):
-  """A helper object to build a device infeed queue.
-
-  The InfeedQueue builds the host-side and device-side Ops to enqueue and
-  dequeue elements, respectively, and ensures that their types and
-  shapes match.
-  """
-
-  def __init__(self,
-               number_of_tuple_elements=None,
-               tuple_types=None,
-               tuple_shapes=None,
-               shard_dimensions=None,
-               name=None):
-    """Creates a new InfeedQueue with the given configuration.
-
-    The configuration need not be fully specified at creation since it
-    can be modified subsequently by methods that set the values
-    explicitly or infer them from the shapes of inputs.
-
-    Args:
-      number_of_tuple_elements: the number of Tensors fed atomically through the
-        queue, must be present unless it can be inferred from other arguments.
-      tuple_types: if not None, a list of types of the elements of the queue.
-      tuple_shapes: if not None, a list of shapes of the elements of the queue.
-      shard_dimensions: if not None, a list of dimensions on which the
-        elements of the queue should be sharded during automatic
-        parallelization.
-      name: the name of the queue.
-
-    Raises:
-      ValueError: if number_of_tuple_elements <= 0; or
-        number_of_tuple_arguments, tuple_types, tuple_shapes, and
-        shard_dimensions are all None; or the length of tuple_types,
-        tuple_shapes, or shard_dimensions is not equal to
-        number_of_tuple_elements; or any element of shard_dimensions
-        can't be converted to a Dimension.
-      TypeError: if any element of tuple_types or tuple_shapes can't
-        be converted to a dtype or TensorShape, respectively.
-    """
-    self._frozen = False
-    self._generated_enqueue_ops = False
-    self._generated_dequeue_op = False
-    self._name = "InfeedQueue" if name is None else name
-    if number_of_tuple_elements is None:
-      if tuple_types is not None:
-        number_of_tuple_elements = len(tuple_types)
-      elif tuple_shapes is not None:
-        number_of_tuple_elements = len(tuple_shapes)
-      elif shard_dimensions is not None:
-        number_of_tuple_elements = len(shard_dimensions)
-      else:
-        raise ValueError(
-            "number of tuple elements cannot be inferred from InfeedQueue "
-            "constructor")
-    if number_of_tuple_elements <= 0:
-      raise ValueError("number_of_tuple_elements %d must be > 0" %
-                       number_of_tuple_elements)
-    # Make an empty sharding policy for each tuple element.
-    self._sharding_policies = [
-        tpu_sharding.ShardingPolicy()
-        for _ in xrange(number_of_tuple_elements)
-    ]
-    if tuple_types is not None:
-      self.set_tuple_types(tuple_types)
-    else:
-      self._tuple_types = None
-    if tuple_shapes is not None:
-      self.set_tuple_shapes(tuple_shapes)
-    else:
-      self._tuple_shapes = None
-    if shard_dimensions is not None:
-      self.set_shard_dimensions(shard_dimensions)
-    self._validate()
-
-  def _validate(self):
-    """Checks that the configuration is self-consistent.
-
-    Raises:
-      ValueError: if the shapes and sharding policies don't match.
-    """
-    if self.tuple_shapes is not None:
-      for (policy, shape) in zip(self._sharding_policies, self._tuple_shapes):
-        # Raise an error if the policy is incompatible with the shape.
-        _ = policy.get_sharded_shape(shape)
-
-  @property
-  def number_of_tuple_elements(self):
-    """Returns the number of InfeedQueue tuple elements."""
-    return len(self._sharding_policies)
-
-  @property
-  def tuple_types(self):
-    """Returns the types of the InfeedQueue tuple elements."""
-    return self._tuple_types
-
-  def set_tuple_types(self, tuple_types):
-    """Sets the type of each element of the queue.
-
-    tuple_types must be a list of length
-    self.number_of_tuple_elements, and each element must be
-    convertible to a dtype.
-
-    Args:
-      tuple_types: the types of each queue element.
-
-    Raises:
-      ValueError: if tuple_types is not of length
-        self.number_of_tuple_elements.
-      TypeError: if an element of tuple_types cannot be converted to a
-        dtype.
-    """
-    if len(tuple_types) != self.number_of_tuple_elements:
-      raise ValueError("tuple_types is %s, but must be a list of length %d" %
-                       (str(tuple_types), self.number_of_tuple_elements))
-    if self._frozen:
-      for (frozen, updated) in zip(self._tuple_types, tuple_types):
-        if frozen != updated:
-          raise ValueError(
-              "Trying to update InfeedQueue with frozen configuration with an "
-              "incompatible type. Frozen types are %s, updated types are %s" % (
-                  str(self._tuple_types), str(tuple_types)))
-    else:
-      try:
-        self._tuple_types = [dtypes.as_dtype(t) for t in tuple_types]
-      except (TypeError) as e:
-        raise TypeError(
-            "tuple_types is %s, but must be a list of elements each "
-            "convertible to dtype: got error %s" % (str(tuple_types), str(e)))
-
-  @property
-  def tuple_shapes(self):
-    """Returns the shapes of the InfeedQueue tuple elements."""
-    return self._tuple_shapes
-
-  def set_tuple_shapes(self, tuple_shapes):
-    """Sets the shape of each element of the queue.
-
-    tuple_shapes must be a list of length
-    self.number_of_tuple_elements, and each element must be
-    convertible to a TensorShape.
-
-    Args:
-      tuple_shapes: the shapes of each queue element.
-
-    Raises:
-      ValueError: if tuple_shapes is not of length
-        self.number_of_tuple_elements.
-      TypeError: if an element of tuple_shapes cannot be converted to
-        a TensorShape.
-    """
-    if len(tuple_shapes) != self.number_of_tuple_elements:
-      raise ValueError("tuple_shapes is %s, but must be a list of length %d" %
-                       (str(tuple_shapes), self.number_of_tuple_elements))
-    try:
-      tuple_shapes = [tensor_shape.as_shape(shape) for shape in tuple_shapes]
-    except (ValueError, TypeError) as e:
-      raise TypeError(
-          "tuple_shapes is %s, but must be a list of elements each "
-          "convertible to TensorShape: got error %s" % (str(tuple_shapes),
-                                                        str(e)))
-    if self._frozen:
-      for (frozen, updated) in zip(self._tuple_shapes, tuple_shapes):
-        if frozen != updated:
-          raise ValueError(
-              "Trying to update InfeedQueue with frozen configuration with an "
-              "incompatible shape. Frozen shapes are %s, updated shapes are %s"
-              % (str(self._tuple_shapes), str(tuple_shapes)))
-    else:
-      self._tuple_shapes = tuple_shapes
-    self._validate()
-
-  @property
-  def sharding_policies(self):
-    """Returns the sharding policies of the InfeedQueue tuple elements."""
-    return self._sharding_policies
-
-  @property
-  def shard_dimensions(self):
-    """Gets the shard dimension of each tuple element.
-
-    Returns:
-      A list of length number_of_tuple_elements, where each list entry
-      is the shard dimension of that tuple element or None if the
-      shard dimension has not been set.
-    """
-    # The number of shards is always the same for all the policies.
-    return [policy.shard_dimension for policy in self._sharding_policies]
-
-  def set_shard_dimensions(self, shard_dimensions):
-    """Sets the shard_dimension of each element of the queue.
-
-    shard_dimensions must be a list of length
-    self.number_of_tuple_elements, and each element must be
-    convertible to a Dimension compatible with self.tuple_shapes.
-
-    Args:
-      shard_dimensions: the dimensions of each queue element.
-
-    Raises:
-      ValueError: if shard_dimensions is not of length
-        self.number_of_tuple_elements; or an element of
-        shard_dimensions cannot be converted to a Dimension; or an
-        element of shard_dimensions is a Dimension that is out of
-        range for the corresponding tuple element shape.
-    """
-    if len(shard_dimensions) != self.number_of_tuple_elements:
-      raise ValueError("shard_dimensions is %s, but must be a list of length %d"
-                       % (str(shard_dimensions),
-                          self.number_of_tuple_elements))
-    for (policy, dimension) in zip(self._sharding_policies, shard_dimensions):
-      policy.set_shard_dimension(dimension)
-    self._validate()
-
-  @property
-  def number_of_shards(self):
-    """Gets the number of shards to use for the InfeedQueue.
-
-    Returns:
-      Number of shards or None if the number of shards has not been set.
-    """
-    # The number of shards is always the same for all the policies.
-    return self._sharding_policies[0].number_of_shards
-
-  def set_number_of_shards(self, number_of_shards):
-    """Sets the number of shards to use for the InfeedQueue.
-
-    Args:
-      number_of_shards: number of ways to shard the InfeedQueue.
-
-    Raises:
-      ValueError: if number_of_shards is not > 0; or the policies have
-        been frozen and number_of_shards was already set to something
-        else.
-    """
-    for policy in self._sharding_policies:
-      policy.set_number_of_shards(number_of_shards)
-    self._validate()
-
-  def set_configuration_from_input_tensors(self, input_tensors):
-    """Sets the shapes and types of the queue tuple elements.
-
-    input_tensors is a list of Tensors whose types and shapes are used
-    to set the queue configuration.
-
-    Args:
-      input_tensors: list of Tensors of the same types and shapes as
-        the desired queue Tuple.
-
-    Raises:
-      ValueError: if input_tensors is not a list of length
-        self.number_of_tuple_elements
-    """
-    if len(input_tensors) != self.number_of_tuple_elements:
-      raise ValueError("input_tensors is %s, but should be a list of %d Tensors"
-                       % (str(input_tensors), self.number_of_tuple_elements))
-    self.set_tuple_shapes([t.shape for t in input_tensors])
-    self.set_tuple_types([t.dtype for t in input_tensors])
-
-  def set_configuration_from_sharded_input_tensors(self, input_tensors):
-    """Sets the shapes and types of the queue tuple elements.
-
-    input_tensors is a list of lists of Tensors whose types and shapes are used
-    to set the queue configuration. The length of the outer list is the number
-    of shards required, and each inner list is the tuple of Tensors to use to
-    determine the types and shapes of the corresponding shard. This method
-    depends on the shard dimension, and calling it freezes the shard policy.
-
-    Args:
-      input_tensors: list of lists of Tensors. The outer list length corresponds
-        to the desired number of shards, and each inner list is the size
-        and shape of the desired configuration of the corresponding shard.
-
-    Raises:
-      ValueError: if any inner list is not a list of length
-        self.number_of_tuple_elements; or the inner lists do not combine to
-        form a consistent unsharded shape.
-      TypeError: if the types of the Tensors in the inner lists do not match.
-    """
-    if not self._frozen:
-      # Unset the tuple shapes in case the configuration becomes
-      # transiently inconsistent.
-      self._tuple_shapes = None
-    number_of_shards = len(input_tensors)
-    self.set_number_of_shards(number_of_shards)
-    for t in input_tensors:
-      if len(t) != self.number_of_tuple_elements:
-        raise ValueError(
-            "input_tensors is %s but must be a list of lists, where each inner"
-            " list has length number_of_tuple_elements=%d" % (
-                str(input_tensors), self.number_of_tuple_elements))
-    # Transpose the inputs to make a list of shard shapes for each tuple
-    # element.
-    sharded_shapes = [[t[i].shape for t in input_tensors]
-                      for i in xrange(self.number_of_tuple_elements)]
-    # For each tuple, get the unsharded shape using that tuple's policy.
-    unsharded_shapes = [
-        policy.get_unsharded_shape(s)
-        for (policy, s) in zip(self._sharding_policies, sharded_shapes)
-    ]
-    self.set_tuple_shapes(unsharded_shapes)
-    for i in xrange(1, self.number_of_shards):
-      for (t1, t2) in zip(input_tensors[0], input_tensors[i]):
-        if t1.dtype != t2.dtype:
-          raise TypeError(
-              "types of the tuple elements of input_tensors %s are not "
-              "consistent" % str(input_tensors))
-    self.set_tuple_types([t.dtype for t in input_tensors[0]])
-
-  def freeze(self):
-    """Freezes the InfeedQueue so it can no longer be modified.
-
-    The configuration is implicitly frozen before any host-side or
-    device-side Ops are generated. The configuration cannot be frozen
-    until the types and shapes of the tuple elements have been set.
-
-    Raises:
-      ValueError: if the types or shapes of the tuple elements have not been
-      set.
-    """
-    self._frozen = True
-    if self._tuple_types is None:
-      raise ValueError(
-          "Can't freeze an InfeedQueue without setting all tuple types.")
-    if self._tuple_shapes is None:
-      raise ValueError(
-          "Can't freeze an InfeedQueue without setting all tuple shapes.")
-    for shape in self._tuple_shapes:
-      if shape.dims is None:
-        raise ValueError(
-            "Can't freeze an InfeedQueue without setting all tuple shapes.")
-    for policy in self._sharding_policies:
-      policy.freeze()
-    self._validate()
-
-  def generate_dequeue_op(self, tpu_device=0):
-    """Generates the device-side Op to dequeue a tuple from the queue.
-
-    Implicitly freezes the queue configuration if it is not already
-    frozen, which will raise errors if the shapes and types have not
-    been fully specified.
-
-    Args:
-      tpu_device: The TPU device ordinal where the infeed instruction should be
-        placed. If None, no explicit placement will be performed, and it is up
-        to the user to call this API from within a proper TPU device scope.
-        The XLA code will fail if the TPU dequeue instruction is not bound to
-        any device.
-
-    Returns:
-      A list of Outputs corresponding to a shard of infeed dequeued
-      into XLA, suitable for use within a replicated block.
-
-    Raises:
-      ValueError: if the types or shapes of the tuple elements have not been
-      set; or if a dequeue op has already been generated.
-    """
-    self.freeze()
-    if self._generated_dequeue_op:
-      raise ValueError("Can't generate two dequeue Ops from the same queue")
-    self._generated_dequeue_op = True
-    full_name = "%s/dequeue" % self._name
-    sharded_shapes = [
-        policy.get_sharded_shape(shape)
-        for (shape, policy) in zip(self._tuple_shapes, self._sharding_policies)
-    ]
-    if tpu_device is not None:
-      with ops.device(tpu.core(tpu_device)):
-        return tpu_ops.infeed_dequeue_tuple(
-            dtypes=self._tuple_types, shapes=sharded_shapes, name=full_name)
-    else:
-      return tpu_ops.infeed_dequeue_tuple(
-          dtypes=self._tuple_types, shapes=sharded_shapes, name=full_name)
-
-  def _generate_enqueue_op(self,
-                           inputs,
-                           name_prefix,
-                           index,
-                           device=None,
-                           tpu_ordinal=-1):
-    """Generate a host-side Op to enqueue a tuple to the queue.
-
-    If device is None the inputs are all required to have the same
-    device specification, and the enqueue Op is colocated with
-    inputs[0]. Otherwise the enqueue Op is placed on 'device'.
-
-    Args:
-      inputs: a list of Tensors with the types and shapes of the tuple elements.
-      name_prefix: the base name for the Op.
-      index: the shard index, used to uniquify the Op name.
-      device: device to place the Op on, or None if it should be
-        colocated with the inputs.
-      tpu_ordinal: ordinal of the TPU device on the host to use for
-      infeed if device is a CPU device. Should be set to -1 if device
-      is a TPU device.
-
-    Returns:
-      An Op corresponding to a shard of infeed enqueued at the host,
-      suitable for use within a replicated block.
-
-    Raises:
-      ValueError: if device is None and inputs do not all have the
-        same device specification.
-    """
-    full_name = "%s/%d" % (name_prefix, index)
-    shapes = [t.shape for t in inputs]
-    if device is None:
-      devices = [t.device for t in inputs]
-      for i in xrange(1, self.number_of_tuple_elements):
-        if devices[0] != devices[i]:
-          raise ValueError(
-              "input devices for shard %d are %s, but should all be the same" %
-              (index, str(devices)))
-      with ops.colocate_with(inputs[0]):
-        return tpu_ops.infeed_enqueue_tuple(
-            inputs=inputs,
-            shapes=shapes,
-            name=full_name,
-            device_ordinal=tpu_ordinal)
-    else:
-      with ops.device(device):
-        return tpu_ops.infeed_enqueue_tuple(
-            inputs=inputs,
-            shapes=shapes,
-            name=full_name,
-            device_ordinal=tpu_ordinal)
-
-  def generate_enqueue_ops(self,
-                           sharded_inputs,
-                           tpu_ordinal_function=None,
-                           placement_function=None):
-    """Generates the host-side Ops to enqueue the shards of a tuple.
-
-    sharded_inputs is a list, one for each shard, of lists of
-    Tensors. sharded_inputs[0] is the tuple of Tensors to use to feed
-    shard 0 if the queue. Returns the host-side Ops that must be run to
-    enqueue the sharded tuple. The Op for shard i is colocated with the inputs
-    for shard i.
-
-    Implicitly freezes the queue configuration if it is not already
-    frozen. If the configuration has already been frozen, and is not
-    compatible with the types and shapes of sharded_inputs, an error
-    will be raised.
-
-    Args:
-      sharded_inputs: a list of lists of Tensors. The length of the outer list
-        determines the number of shards. Each inner list indicates the types
-        and shapes of the tuples in the corresponding shard.
-      tpu_ordinal_function: if not None, a function that takes the
-        shard index as input and returns the ordinal of the TPU device
-        the shard's infeed should be placed on. tpu_ordinal_function must be
-        set if the inputs are placed on CPU devices.
-      placement_function: if not None, a function that takes the shard index as
-        input and returns the host device where the enqueue op should be placed
-        on.
-
-    Returns:
-      A list of host-side Ops, one for each shard, that when executed together
-      will enqueue a full-size element of infeed.
-
-    Raises:
-      ValueError: if the queue configuration has previously been frozen and the
-        shapes of the elements of sharded_inputs are not compatible with the
-        frozen configuration; or if the shapes of the elements of sharded_inputs
-        don't form a consistent unsharded tuple; or if the elements of a tuple
-        have different device constraints.
-      TypeError: if the queue configuration has previously been frozen and the
-        types of the elements of sharded_inputs are not compatible with the
-        frozen configuration; or if the types of the elements of sharded_inputs
-        don't form a consistent unsharded tuple.
-    """
-    self.set_configuration_from_sharded_input_tensors(sharded_inputs)
-    self.freeze()
-    if self._generated_enqueue_ops:
-      raise ValueError("Can't generate two enqueue Ops from the same queue")
-    self._generated_enqueue_ops = True
-    if tpu_ordinal_function is None:
-      tpu_ordinal_function = lambda index: -1
-    name_prefix = "%s/enqueue" % self._name
-    return [
-        self._generate_enqueue_op(
-            shard,
-            name_prefix,
-            index,
-            tpu_ordinal=tpu_ordinal_function(index),
-            device=placement_function(index) if placement_function else None)
-        for (shard, index) in zip(sharded_inputs, xrange(self.number_of_shards))
-    ]
-
-  # TODO(misard) Generalize this to the case of systems that don't
-  # have 8 devices per host, and figure out what to do with
-  # model-parallelism.
-  def _default_placement_function(self, index):
-    return "/task:%d/device:CPU:0" % (index / 8)
-
-  def _default_ordinal_function(self, index):
-    return index % 8
-
-  # TODO(b/36470756) remove this from tutorials once we have a better story
-  # for automatic placement of input pipelines.
-  def split_inputs_and_generate_enqueue_ops(self,
-                                            inputs,
-                                            device_assignment=None,
-                                            placement_function=None,
-                                            tpu_ordinal_function=None):
-    """POORLY-PERFORMING ON MULTI-HOST SYSTEMS.
-
-    Generates the host-side Ops to enqueue a tuple.
-
-    This method performs poorly because it takes an entire input on a single
-    host, splits it, and distributes it to all of the cores. It is present only
-    to simplify tutorial examples.
-
-    inputs is a list of Tensors to use to feed the queue. Each input is split
-    into self.number_of_shards shards. Returns an Op for each shard to enqueue
-    the shard. The Op for shard i is placed on device placement_function(i).
-
-    Implicitly freezes the queue configuration if it is not already
-    frozen. If the configuration has already been frozen, and is not
-    compatible with the types and shapes of inputs, an error
-    will be raised.
-
-    Args:
-      inputs: a list of Tensors which indicates the types and shapes of the
-        queue tuple.
-     device_assignment: if not `None`, a TPU `DeviceAssignment`. If
-        device_assignment is not `None`, but `placement_function` and
-        `ordinal_function` are None, then `device_assignment` will be used to
-        place infeeds on the first k TPU shards, where k is the number of shards
-        in the queue. If all three are `None`, then default placement and
-        ordinal functions are used.
-      placement_function: if not None, a function that takes the shard
-        index as input and returns a device string indicating which
-        device the shard's infeed should be placed on. If placement_function
-        and tpu_ordinal_function are None, inputs are sharded round-robin
-        across the devices in the system.
-      tpu_ordinal_function: if not None, a function that takes the
-        shard index as input and returns the ordinal of the TPU device
-        the shard's infeed should be placed on. If placement_function
-        and tpu_ordinal_function are None, inputs are sharded round-robin
-        across the devices in the system.
-
-    Returns:
-      A list of host-side Ops, one for each shard, that when executed together
-      will enqueue a full-size element of infeed.
-
-    Raises:
-      ValueError: if the queue configuration has previously been frozen and the
-        shapes of the elements of inputs are not compatible with the frozen
-        configuration.
-      TypeError: if the queue configuration has previously been frozen and the
-        types of the elements of inputs are not compatible with the frozen
-        configuration.
-    """
-    if device_assignment is None:
-      if placement_function is None:
-        placement_function = self._default_placement_function
-      if tpu_ordinal_function is None:
-        tpu_ordinal_function = self._default_ordinal_function
-    else:
-
-      def _placement_function_from_map(index):
-        return device_assignment.host_device(replica=index)
-
-      def _ordinal_function_from_map(index):
-        return device_assignment.tpu_ordinal(replica=index)
-
-      if placement_function is None:
-        placement_function = _placement_function_from_map
-      if tpu_ordinal_function is None:
-        tpu_ordinal_function = _ordinal_function_from_map
-    self.set_configuration_from_input_tensors(inputs)
-    self.freeze()
-    if self._generated_enqueue_ops:
-      raise ValueError("Can't generate two enqueue Ops from the same queue")
-    self._generated_enqueue_ops = True
-    split_name_prefix = "%s/split" % self._name
-    if self.number_of_shards == 1:
-      transposed_sharded_inputs = [[inp] for inp in inputs]
-    else:
-
-      def split_fn(inp, num_shards, axis, name):
-        with ops.colocate_with(inp):
-          return array_ops.split(inp, num_shards, axis=axis, name=name)
-
-      transposed_sharded_inputs = [
-          split_fn(
-              inp,
-              self.number_of_shards,
-              axis=policy.shard_dimension,
-              name="%s/%d" % (split_name_prefix, index))
-          for (inp, policy, index) in zip(inputs, self._sharding_policies,
-                                          xrange(self.number_of_tuple_elements))
-      ]
-    sharded_inputs = [[shard[i] for shard in transposed_sharded_inputs]
-                      for i in xrange(self.number_of_shards)]
-    name_prefix = "%s/enqueue" % self._name
-    return [
-        self._generate_enqueue_op(
-            shard,
-            name_prefix,
-            index,
-            device=placement_function(index),
-            tpu_ordinal=tpu_ordinal_function(index))
-        for (shard, index) in zip(sharded_inputs, xrange(self.number_of_shards))
-    ]
-
-
-class _PartitionedInfeedQueue(InfeedQueue):
-  """A helper object to build a device infeed queue with input partition.
-
-  Args:
-    number_of_tuple_elements: the number of Tensors fed atomically through the
-      queue, must be present unless it can be inferred from other arguments.
-    device_assignment: A TPU `DeviceAssignment` which is used to place all the
-      partitions to different TPU infeed queues.
-    host_id: The id of the host machine.
-    input_partition_dims: A nested list/tuple of integers. Each inner
-      list/tuple describes how to partition the corresponding input tensor.
-    tuple_types: If not None, a list of types of the elements of the queue.
-    tuple_shapes: If not None, a list of shapes of the elements of the queue.
-    name: The name of the queue.
-  """
-
-  def __init__(self,
-               number_of_tuple_elements,
-               device_assignment,
-               host_id,
-               input_partition_dims=None,
-               tuple_types=None,
-               tuple_shapes=None,
-               name=None):
-    super(_PartitionedInfeedQueue, self).__init__(
-        number_of_tuple_elements=number_of_tuple_elements,
-        tuple_types=tuple_types,
-        tuple_shapes=None,
-        shard_dimensions=None,
-        name="PartitionedInfeedQueue" if name is None else name)
-    self._input_partition_dims = input_partition_dims
-    self._host_id = host_id
-    self._device_assignment = device_assignment
-
-  def generate_dequeue_op(self, tpu_device=0):
-    """Generate TPU dequeue ops.
-
-    Args:
-      tpu_device: The TPU device ordinal where the infeed instruction should be
-        placed.
-
-    Returns:
-      A list of Outputs corresponding to a partition of infeed dequeued
-      into XLA, suitable for use within a replicated block.
-
-    Raises:
-      ValueError: if the types or shapes of the tuple elements have not been
-      set; or if a dequeue op has already been generated.
-    """
-    self.freeze()
-    if self._generated_dequeue_op:
-      raise ValueError("Can't generate two dequeue Ops from the same queue")
-    self._generated_dequeue_op = True
-    full_name = "%s/dequeue" % self._name
-    sharded_shapes = [
-        policy.get_sharded_shape(shape)
-        for (shape, policy) in zip(self._tuple_shapes, self._sharding_policies)
-    ]
-    with ops.device(tpu.core(tpu_device)):
-      values = tpu_ops.infeed_dequeue_tuple(
-          dtypes=self._tuple_types, shapes=sharded_shapes, name=full_name)
-    return self._tag_sharding_attribute_for_dequeued_tensors(
-        values, self._input_partition_dims)
-
-  def generate_enqueue_ops(self, per_host_sharded_inputs):
-    """Generates the host-side Ops to enqueue the partitioned inputs.
-
-    per_host_sharded_inputs is a list, one for each replica, of lists of
-    Tensors. sharded_inputs[i] is the tuple of Tensors to use to feed
-    replica i.
-    sharded_inputs[i][j] is partitioned by self._input_partition_dims[j].
-
-    For example, if sharded_inputs[i][j] is a 2-D Tensor:
-    [[A, B, C, D],
-     [E ,F, G, H]]
-    self._input_partition_dims[j] is [2, 4].
-
-    sharded_inputs[i][j] will be partitioned and flattened into:
-    [A, B, C, D, E, F, G, H] and fed into the logical core ids:
-    [0, 1, 2, 3, 4, 5, 6, 7] respectively.
-
-    Args:
-      per_host_sharded_inputs: a list of lists of Tensors. The length of the
-        outer list determines the number of shards. Each inner list indicates
-        the types and shapes of the tuples in the corresponding shard.
-
-    Returns:
-      A list of host-side Ops, one for each shard, that when executed together
-      will enqueue a full-size element of infeed.
-
-    Raises:
-      ValueError: if the queue configuration has previously been frozen and the
-        shapes of the elements of sharded_inputs are not compatible with the
-        frozen configuration; or if the shapes of the elements of sharded_inputs
-        don't form a consistent unsharded tuple; or if the elements of a tuple
-        have different device constraints; or if the partition dims are invalid.
-      TypeError: if the queue configuration has previously been frozen and the
-        types of the elements of sharded_inputs are not compatible with the
-        frozen configuration; or if the types of the elements of sharded_inputs
-        don't form a consistent unsharded tuple.
-    """
-    self.set_configuration_from_sharded_input_tensors(per_host_sharded_inputs)
-    number_of_replicas_per_host = len(per_host_sharded_inputs)
-    number_of_tuple_elements = len(per_host_sharded_inputs[0])
-
-    assert len(self._input_partition_dims) == number_of_tuple_elements
-    per_host_enqueue_ops = []
-
-    for replica_index in range(number_of_replicas_per_host):
-      flattened_inputs = per_host_sharded_inputs[replica_index]
-      inputs_part_dims_flat = nest.flatten_up_to(flattened_inputs,
-                                                 self._input_partition_dims)
-      inputs_parted_iters = [
-          iter(self._partition_or_replicate_on_host(x, dims)) for x, dims in
-          zip(per_host_sharded_inputs[replica_index], inputs_part_dims_flat)
-      ]
-
-      for logical_core in xrange(self._device_assignment.num_cores_per_replica):
-        # Places different partitions to different logic cores.
-        replica_id = self._device_assignment.lookup_replicas(
-            self._host_id, logical_core)[replica_index]
-        ordinal = self._device_assignment.tpu_ordinal(
-            replica=replica_id, logical_core=logical_core)
-        infeed_inputs = []
-        for it in inputs_parted_iters:
-          input_for_device = next(it, None)
-          if input_for_device is not None:
-            infeed_inputs.append(input_for_device)
-
-        if infeed_inputs:
-          per_host_enqueue_ops.append(
-              tpu_ops.infeed_enqueue_tuple(
-                  inputs=infeed_inputs,
-                  shapes=[x.shape for x in infeed_inputs],
-                  name="enqueue/replica_{0}/input_{1}".format(
-                      replica_index, logical_core),
-                  device_ordinal=ordinal))
-    return per_host_enqueue_ops
-
-  def _check_input_partition_dims(self, tensor, dims):
-    """Checks that input partition dims are valid for the `Tensor`.
-
-    Args:
-      tensor: Input tensor for partitioning.
-      dims: 1-D np.array of the list of integer describes how to partition the
-        input tensor.
-
-    Raises:
-      ValueError: If the tensor can't be partitioned by dims or the
-        num_cores_per_replica doesn't match the number of
-        partitions(dims.prod()).
-    """
-    if (dims < 1).any():
-      raise ValueError("All input partition dims must be >= 1.")
-
-    # No partitioning, so don't perform further checks.
-    if dims.prod() == 1:
-      return
-
-    if dims.prod() != self._device_assignment.num_cores_per_replica:
-      raise ValueError(
-          "The product of each input parition dim should equal to "
-          "num_cores_per_replica. (dim = {}, num_cores_per_replica "
-          "= {})".format(dims, self._device_assignment.num_cores_per_replica))
-    if dims.shape[0] != tensor.shape.ndims:
-      raise ValueError(
-          "Input partition dims must have the same number of dimensions "
-          "as the `Tensor` to be partitioned. (tensor shape = {}, input "
-          "partition dims = {}).".format(tensor.shape.as_list(), dims))
-
-    tensor.shape.assert_is_fully_defined()
-
-  def _partition_or_replicate_on_host(self, tensor, dims):
-    """Partitions or replicates the input tensor.
-
-      The ops inside this function are placed on the host side.
-
-    Args:
-      tensor: The input tensor which will be partioned or replicated.
-      dims: A list of integer describes how to partition the input tensor.
-    Returns:
-      An iterator of `Tensor`s or a list of partioned tensors.
-    """
-    if dims is None:
-      return itertools.repeat(tensor)
-    dims = np.array(dims)
-    self._check_input_partition_dims(tensor, dims)
-    output = [tensor]
-    shape_list = np.array(tensor.shape.as_list())
-    quotients, remainders = np.divmod(shape_list, dims)
-    for axis, (quotient, remainder, dim, original_size) in enumerate(
-        zip(quotients, remainders, dims, shape_list)):
-      if dim <= 1:
-        continue
-      if remainder > 0:
-        # For each dimension, when it cannot be evenly partitioned, XLA assumes
-        # tensors are partitioned in a greedy manner by using
-        # ceil_ratio(size/dim) first. E.g. 2D tensor with shape (5, 14) and dims
-        # are (2, 4). Since 5 % 2 = 1 and 14 % 4 = 2, [5, 14] =>
-        # [[(3, 4), (3, 4), (2, 4), (2, 2)],
-        # [(2, 4), (2, 4), (2, 4), (2, 2)]]
-        ceil_ratio = quotient + 1
-        num_full_slots, left_over = np.divmod(original_size, ceil_ratio)
-        num_or_size_splits = [ceil_ratio] * num_full_slots + [left_over]
-        if len(num_or_size_splits) < dim:
-          num_or_size_splits += [0] * (dim - len(num_or_size_splits))
-        new_output = []
-        for x in output:
-          new_output.append(
-              array_ops.split(
-                  x, num_or_size_splits=num_or_size_splits, axis=axis))
-        output = new_output
-      else:
-        output = [array_ops.split(x, dim, axis=axis) for x in output]
-      output = nest.flatten(output)
-    return output
-
-  def _tag_sharding_attribute_for_dequeued_tensor(self, tensor, dims):
-    """Tags appropriate XLA sharding attribute to the dequeued tensor.
-
-    Args:
-      tensor: The dequeued tensor on TPU.
-      dims: A list of integer describes how the tensor is partitioned.
-
-    Returns:
-      The same tensor with the xla_sharding attribute.
-    """
-    if dims is None:
-      return xla_sharding.replicate(tensor)
-    elif np.prod(dims) == 1:
-      return xla_sharding.assign_device(tensor, 0)
-    else:
-      tile_assignment = np.arange(np.prod(dims)).reshape(dims)
-      return xla_sharding.tile(
-          tensor=tensor,
-          tile_assignment=tile_assignment)
-
-  def _tag_sharding_attribute_for_dequeued_tensors(self, dequeues, dims):
-    """Tags appropriate XLA sharding attribute to the dequeued tensors.
-
-    Args:
-      dequeues: A list of dequeued tensors on TPU.
-      dims: A list of integer describes how the tensor is partitioned.
-
-    Returns:
-      The same dequeues with appropriate xla_sharding attribute.
-    """
-    nest.assert_shallow_structure(dequeues, dims)
-    return nest.map_structure_up_to(
-        dequeues, self._tag_sharding_attribute_for_dequeued_tensor, dequeues,
-        dims)
+# pylint: disable=wildcard-import,unused-import,redefined-builtin
+from tensorflow.python.tpu.tpu_feed import *
+# used by tests
+from tensorflow.python.tpu.tpu_feed import _PartitionedInfeedQueue
+# pylint: enable=wildcard-import,unused-import,redefined-builtin
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_function.py b/tensorflow/contrib/tpu/python/tpu/tpu_function.py
index 84d5967ea547f0c036f7c9aa936ac0c99c141304..f2755c6979c2e49dbc19b6800462949601811496 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_function.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_function.py
@@ -1,57 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# =============================================================================
-
-"""Helper library for functions used during TPU compilation."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import contextlib
-
-
-class TpuContext(object):
-  """A context object holding state about the TPU computation being built."""
-
-  def __init__(self):
-    """Creates a new TpuContext."""
-    self._number_of_shards = None
-
-  @property
-  def number_of_shards(self):
-    return self._number_of_shards
-
-  def set_number_of_shards(self, number_of_shards):
-    self._number_of_shards = number_of_shards
-
-
-# The Tpu context holds the number of shards when a sharded computation is
-# being built, or None if no computation is being built.
-_current_tpu_context = TpuContext()
-
-
-@contextlib.contextmanager
-def tpu_shard_context(number_of_shards):
-  if _current_tpu_context.number_of_shards is not None:
-    raise NotImplementedError("tpu_shard_context cannot be nested.")
-  try:
-    _current_tpu_context.set_number_of_shards(number_of_shards)
-    yield
-  finally:
-    _current_tpu_context.set_number_of_shards(None)
-
-
-def get_tpu_context():
-  return _current_tpu_context
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.tpu_function import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_optimizer.py b/tensorflow/contrib/tpu/python/tpu/tpu_optimizer.py
index 1e11de6421e360faf0b9ad573a84f9aecdf9c98f..ca58e78d7b342c7ca70400652d99092ccbecbbde 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_optimizer.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_optimizer.py
@@ -1,203 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# =============================================================================
-
-"""Optimizer that implements cross-shard gradient reduction for TPU."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-
-from tensorflow.contrib.tpu.python.ops import tpu_ops
-from tensorflow.contrib.tpu.python.tpu import tpu_function
-from tensorflow.python.framework import ops
-from tensorflow.python.ops.losses import losses
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import optimizer
-
-
-class CrossShardOptimizer(optimizer.Optimizer):
-  """An optimizer that averages gradients across TPU shards."""
-
-  def __init__(self,
-               opt,
-               reduction=losses.Reduction.MEAN,
-               name="CrossShardOptimizer",
-               group_assignment=None):
-    """Construct a new cross-shard optimizer.
-
-    Args:
-      opt: An existing `Optimizer` to encapsulate.
-      reduction: The reduction to apply to the shard losses.
-      name: Optional name prefix for the operations created when applying
-        gradients. Defaults to "CrossShardOptimizer".
-      group_assignment: Optional 2d int32 lists with shape
-        [num_groups, num_replicas_per_group] which describles how to apply
-        optimizer to subgroups.
-
-    Raises:
-      ValueError: If reduction is not a valid cross-shard reduction.
-    """
-    if reduction not in (losses.Reduction.SUM, losses.Reduction.MEAN):
-      raise ValueError("Unsupported reduction: %s." % reduction)
-
-    super(CrossShardOptimizer, self).__init__(False, name)
-    self._opt = opt
-    self._reduction = reduction
-    self._group_assignment = group_assignment
-
-  def _verify_and_get_subgroup_size(self, group_assignment, num_shards):
-    """Verify group_assignment and get the subgroup size".
-
-    Args:
-      group_assignment: list of group ids for applying the optimizer
-        to subgroups.
-      num_shards: The number of TPU shards.
-
-    Returns:
-      The size of one subgroup in group_assignment.
-
-    Raises:
-      ValueError: If group_assignment is invalid.
-    """
-    if not group_assignment:
-      return None
-    if not (isinstance(group_assignment, list) and
-            all(isinstance(i, list) for i in group_assignment)):
-      raise ValueError("group_assignment must be a list of list. Got {}".format(
-          group_assignment))
-
-    replica_ids = set()
-    for g in group_assignment:
-      for i in g:
-        replica_ids.add(i)
-
-    if set(range(num_shards)) != replica_ids:
-      raise ValueError("group_assignment must be a permutation of range({0})."
-                       " Got group_assignment={1}".format(
-                           num_shards, group_assignment))
-
-    subgroup_size_list = [len(group) for group in group_assignment]
-    if all(subgroup_size_list[0] == size for size in subgroup_size_list):
-      return subgroup_size_list[0]
-    else:
-      raise ValueError("The size of each subgroup in group_assignment must "
-                       "be equal. Got group_assignment={}".format(
-                           self._group_assignment))
-
-  def compute_gradients(self, loss, var_list=None, **kwargs):
-    """Compute gradients of "loss" for the variables in "var_list".
-
-    This simply wraps the compute_gradients() from the real optimizer. The
-    gradients will be aggregated in the apply_gradients() so that user can
-    modify the gradients like clipping with per replica global norm if needed.
-    The global norm with aggregated gradients can be bad as one replica's huge
-    gradients can hurt the gradients from other replicas.
-
-    Args:
-      loss: A Tensor containing the value to minimize.
-      var_list: Optional list or tuple of `tf.Variable` to update to minimize
-        `loss`.  Defaults to the list of variables collected in the graph
-        under the key `GraphKey.TRAINABLE_VARIABLES`.
-      **kwargs: Keyword arguments for compute_gradients().
-
-    Returns:
-      A list of (gradient, variable) pairs.
-
-    Raises:
-      ValueError: If not within a tpu_shard_context or group_assignment is
-        invalid.
-    """
-    num_shards = tpu_function.get_tpu_context().number_of_shards
-    if num_shards is None:
-      logging.warning(
-          "CrossShardOptimizer should be used within a tpu_shard_context, but "
-          "got unset number_of_shards. Assuming 1.")
-      num_shards = 1
-
-    subgroup_size = self._verify_and_get_subgroup_size(self._group_assignment,
-                                                       num_shards)
-
-    if num_shards > 1 and self._reduction == losses.Reduction.MEAN:
-      if self._group_assignment:
-        scale = 1.0 / subgroup_size
-      else:
-        scale = 1.0 / num_shards
-      loss *= scale
-
-    return self._opt.compute_gradients(loss, var_list=var_list, **kwargs)
-
-  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
-    """Apply gradients to variables.
-
-    Calls tpu_ops.cross_replica_sum() to sum gradient contributions across
-    replicas, and then applies the real optimizer.
-
-    Args:
-      grads_and_vars: List of (gradient, variable) pairs as returned by
-        compute_gradients().
-      global_step: Optional Variable to increment by one after the
-        variables have been updated.
-      name: Optional name for the returned operation.  Default to the
-        name passed to the Optimizer constructor.
-
-    Returns:
-      An `Operation` that applies the gradients. If `global_step` was not None,
-      that operation also increments `global_step`.
-
-    Raises:
-      ValueError: If the grads_and_vars is malformed.
-    """
-    summed_grads_and_vars = []
-    for (grad, var) in grads_and_vars:
-      if grad is None:
-        summed_grads_and_vars.append((grad, var))
-      else:
-        with ops.colocate_with(grad):
-          summed_grads_and_vars.append((tpu_ops.cross_replica_sum(
-              grad, self._group_assignment), var))
-    return self._opt.apply_gradients(summed_grads_and_vars, global_step, name)
-
-  def get_slot(self, *args, **kwargs):
-    """Return a slot named "name" created for "var" by the Optimizer.
-
-    This simply wraps the get_slot() from the actual optimizer.
-
-    Args:
-      *args: Arguments for get_slot().
-      **kwargs: Keyword arguments for get_slot().
-
-    Returns:
-      The `Variable` for the slot if it was created, `None` otherwise.
-    """
-    return self._opt.get_slot(*args, **kwargs)
-
-  def get_slot_names(self, *args, **kwargs):
-    """Return a list of the names of slots created by the `Optimizer`.
-
-    This simply wraps the get_slot_names() from the actual optimizer.
-
-    Args:
-      *args: Arguments for get_slot().
-      **kwargs: Keyword arguments for get_slot().
-
-    Returns:
-      A list of strings.
-    """
-    return self._opt.get_slot_names(*args, **kwargs)
-
-  def variables(self):
-    """Forwarding the variables from the underlying optimizer."""
-    return self._opt.variables()
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.tpu_optimizer import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_sharding.py b/tensorflow/contrib/tpu/python/tpu/tpu_sharding.py
index f5af03f33ca8f13af517007672e9ce0e12be6205..93c52335a582e5fa83092f78212ca268079b7c12 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_sharding.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_sharding.py
@@ -1,253 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# =============================================================================
-
-"""Helper library for sharding during TPU compilation."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from six.moves import xrange  # pylint: disable=redefined-builtin
-
-from tensorflow.python.framework import tensor_shape
-
-_DEFAULT_NUMBER_OF_SHARDS = 1
-_DEFAULT_SHARD_DIMENSION = 0
-
-
-# TODO(b/36777903) change other parts of tpu.py to use this class.
-class ShardingPolicy(object):
-  """An object use to hold the sharding policy for a Tensor.
-  """
-
-  def __init__(self):
-    self._number_of_shards = None
-    self._shard_dimension = None
-    self._frozen = False
-
-  def __str__(self):
-    if self.number_of_shards is None or self.shard_dimension is None:
-      return "ShardingPolicy(unset)"
-    else:
-      return ("ShardingPolicy(%d shards dimension %d)" %
-              (self.number_of_shards, self.shard_dimension))
-
-  def _fill_default_values(self):
-    if self._number_of_shards is None:
-      self._number_of_shards = _DEFAULT_NUMBER_OF_SHARDS
-    if self._shard_dimension is None:
-      self._shard_dimension = tensor_shape.as_dimension(
-          _DEFAULT_SHARD_DIMENSION)
-
-  def freeze(self):
-    """Prevents further modification to the sharding policy.
-
-    Any values that have not been set when freeze is called are set to
-    defaults. If the ShardingPolicy is already frozen, this is a NoOp.
-    """
-    if not self._frozen:
-      self._fill_default_values()
-      self._frozen = True
-
-  @property
-  def number_of_shards(self):
-    """Returns the number of shards in the policy or None if unspecified."""
-    return self._number_of_shards
-
-  def set_number_of_shards(self, number_of_shards):
-    """Sets the number of shards for the current policy.
-
-    If the policy has been frozen then number_of_shards must match the
-    existing setting.
-
-    Args:
-      number_of_shards: The number of shards to use in the policy.
-
-    Raises:
-      ValueError: If the policy has been frozen and number_of_shards
-        differs from the frozen value; or number_of_shards <= 0.
-    """
-    if self._frozen:
-      if self._number_of_shards != number_of_shards:
-        raise ValueError(
-            "Can't set sharding policy to use %d shards since it has been "
-            "frozen to use %d." % (number_of_shards, self._number_of_shards))
-    else:
-      if number_of_shards > 0:
-        self._number_of_shards = number_of_shards
-      else:
-        raise ValueError(
-            "Can't set sharding policy to use %s shards; value must be >0",
-            str(number_of_shards))
-
-  @property
-  def shard_dimension(self):
-    """Returns the shard dimension of the policy or None if unspecified."""
-    return self._shard_dimension
-
-  def set_shard_dimension(self, shard_dimension):
-    """Sets the shard dimension for the current policy.
-
-    If the policy has been frozen then shard_dimension must match the
-    existing setting.
-
-    Args:
-      shard_dimension: The shard dimension to use in the policy.
-
-    Raises:
-      ValueError: If the policy has been frozen and shard_dimension
-        differs from the frozen value, or shard_dimension can't be
-        interpreted as a Dimension.
-    """
-    if self._frozen:
-      if self._shard_dimension != shard_dimension:
-        raise ValueError(
-            "Can't set shard dimension to %d since it has been frozen to "
-            "use %d." % (shard_dimension, self._shard_dimension))
-    else:
-      self._shard_dimension = tensor_shape.as_dimension(shard_dimension)
-
-  def merge(self, other):
-    """Merges the policy of another policy into the current policy.
-
-    Args:
-      other: The policy to merge into this one.
-
-    Raises:
-      ValueError: If this policy has been frozen and the merge conflicts with
-      the frozen policy.
-    """
-    if other.number_of_shards is not None:
-      self.set_number_of_shards(other.number_of_shards)
-    if other.shard_dimension is not None:
-      self.set_shard_dimension(other.shard_dimension)
-
-  def get_sharded_shape(self, shape, shard_index=None):
-    """Returns the shape of a shard of a full Tensor.
-
-    When given the shape of a 'full-size' Tensor, returns the shape of
-    the sub-Tensor after it has been sharded. Freezes the policy if it
-    has not yet been frozen.
-
-    Args:
-      shape: The shape of the full-size Tensor to be sharded.
-      shard_index: The index of the shard whose shape should be returned.
-        shard_index can be None for sharding policies that use the same
-        shape for every shard.
-      freeze_config:
-
-    Returns:
-      The shape of the sharded version of the Tensor.
-
-    Raises:
-      ValueError: If shard_index is None when shards are of different
-        shapes; or shard_index is not None and
-        !(0<=shard_index<number_of_shards); or shape does not have at
-        least self.shard_dimension+1 dimensions; or the value of
-        shape's shard dimension is not a multiple of
-        self.number_of_shards
-    """
-    if self._shard_dimension is None or self._number_of_shards is None:
-      # Don't raise an error if the config is unset.
-      return None
-    if shard_index is not None:
-      if shard_index < 0 or shard_index >= self.number_of_shards:
-        raise ValueError("shard_index %d, but must be in [0,%d)." %
-                         (shard_index, self._number_of_shards))
-    shape = tensor_shape.as_shape(shape)
-    if self._number_of_shards == 1:
-      # Don't do anything when there's only one shard.
-      return shape
-    ndims = shape.ndims
-    if ndims is None:
-      raise ValueError("shape must be a specified shape not Unknown")
-    if ndims <= self._shard_dimension:
-      raise ValueError("shape %s does not contain shard_dimension %d" %
-                       (shape.as_list(), self._shard_dimension))
-    dims = shape.as_list()
-    if dims[self._shard_dimension] is None:
-      raise ValueError("shape %s must have a fixed size for dimension %d "
-                       "that is known at graph construction time." %
-                       (shape.as_list(), self._shard_dimension))
-    if (dims[self._shard_dimension] % self._number_of_shards) != 0:
-      raise ValueError("shape %s cannot be sharded %d ways along dimension %d" %
-                       (shape.as_list(), self._number_of_shards,
-                        self._shard_dimension))
-    dims[self._shard_dimension] /= self._number_of_shards
-    return tensor_shape.as_shape(dims)
-
-  def _unshard_shape(self, shape):
-    """Return the unsharded shape that would generate a given sharded shape.
-
-    Args:
-      shape: the sharded shape to unshard
-
-    Returns:
-      The unsharded shape.
-
-    Raises:
-      ValueError: if shape is unknown or does not contain
-        self.shard_dimension
-      TypeError: if shape is not convertible to a TensorShape
-    """
-    shape = tensor_shape.as_shape(shape)
-    if self._number_of_shards == 1:
-      # Don't do anything when there's only one shard.
-      return shape
-    ndims = shape.ndims
-    if ndims is None:
-      raise ValueError("shape must be a specified shape not Unknown")
-    if ndims <= self._shard_dimension:
-      raise ValueError("shape %s does not contain shard_dimension %d" %
-                       (shape.as_list(), self._shard_dimension))
-    dims = shape.as_list()
-    dims[self._shard_dimension] *= self._number_of_shards
-    return tensor_shape.as_shape(dims)
-
-  def get_unsharded_shape(self, shapes):
-    """Returns the shape of an unsharded Tensor given a list of shards.
-
-    When given a list of shapes of shards, returns the shape of the
-    unsharded Tensor that would generate the shards. Sets defaults for the
-    policy if number_of_shards or shard_dimension is None.
-
-    Args:
-      shapes: The shapes of the Tensor shards to be combined.
-
-    Returns:
-      The shape of the unsharded version of the Tensor.
-
-    Raises:
-      ValueError: if shapes is not a list of length
-        self.number_of_shards; or any element of shapes is not a valid
-        shape consistent with the sharding policy; or the list of
-        shapes is not a valid sharding of a full shape.
-      TypeError: if an element of shapes is not convertible to a
-        TensorShape
-    """
-    self._fill_default_values()
-    if len(shapes) != self.number_of_shards:
-      raise ValueError(
-          "shapes is %s but must be a list of length number_of_shards=%d" % (
-              str(shapes), self.number_of_shards))
-    unsharded_shapes = [self._unshard_shape(s) for s in shapes]
-    for i in xrange(self.number_of_shards - 1):
-      if not unsharded_shapes[i].is_compatible_with(
-          unsharded_shapes[self.number_of_shards - 1]):
-        raise ValueError(
-            "sharded shapes %s are not consistent shards of a full shape "
-            "sharded %d ways along dimension %d" % (
-                str(shapes), self.number_of_shards, self.shard_dimension))
-    return unsharded_shapes[0]
+# pylint: disable=wildcard-import,unused-import,redefined-builtin
+from tensorflow.python.tpu.tpu_sharding import *
+# pylint: enable=wildcard-import,unused-import,redefined-builtin
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py b/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py
index d66ecfcf4a56b8da1c2d2f518bebe4baa76b315e..258d34ddaf5250e49c5a354caf018e4b64abae62 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py
@@ -1,156 +1,25 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ===================================================================
-"""TPU system metadata and associated tooling."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-import re
-
-from tensorflow.contrib.tpu.python.tpu import tpu
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.client import session as session_lib
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.platform import tf_logging as logging
-
-_PINGING_MASTER_TIMEOUT_IN_MS = 60 * 1000  # 1 min
-_RETRY_TIMES = 120
-_INITIAL_TPU_SYSTEM_TIMEOUT_IN_MS = 300 * 1000  # 5 mins
-
-_TPU_DEVICE_REG = re.compile(r'.*task:(\d+)/.*device:TPU:(\d+)$')
-
-# _TPUSystemMetadata is used by TPUEstimator to hold TPU configuration,
-# including num_cores and num_hosts.
-_TPUSystemMetadata = collections.namedtuple('_TPUSystemMetadata', [
-    'num_cores',
-    'num_hosts',
-    'num_of_cores_per_host',
-    'topology',
-    'devices',
-])
-
-
-def _query_tpu_system_metadata(master_address, cluster_def=None,
-                               query_topology=False):
-  """Automatically detects the TPU system metadata in the system."""
-  tpu_core_count = 0
-  devices = []
-  device_dict = collections.defaultdict(list)
-
-  # TODO(b/120564445): Replace with standard library for retries.
-  retry_count = 1
-  while True:
-    logging.info('Querying Tensorflow master (%s) for TPU system metadata.',
-                 master_address)
-    try:
-      with ops.Graph().as_default():
-        with session_lib.Session(
-            master_address,
-            config=get_session_config_with_timeout(
-                _PINGING_MASTER_TIMEOUT_IN_MS,
-                cluster_def)) as sess:
-          devices = sess.list_devices()
-          for device in devices:
-            match = _TPU_DEVICE_REG.match(device.name)
-            if match:
-              host_id = match.group(1)
-              core_id = match.group(2)
-              device_dict[host_id].append(core_id)
-              tpu_core_count += 1
-          break
-    except errors.DeadlineExceededError:
-      msg = ('Failed to connect to the Tensorflow master. The TPU worker may '
-             'not be ready (still scheduling) or the Tensorflow master address '
-             'is incorrect: got (%s).' %
-             (master_address))
-
-      # TODO(xiejw): For local or grpc master we might not need retry logic
-      # here.
-      if retry_count <= _RETRY_TIMES:
-        logging.warning('%s', msg)
-        logging.warning('Retrying (%d/%d).', retry_count, _RETRY_TIMES)
-        retry_count += 1
-      else:
-        raise ValueError(msg)
-
-  num_of_cores_per_host = 0
-  if tpu_core_count:
-    num_cores_per_host_set = set(
-        [len(core_ids) for core_ids in device_dict.values()])
-    if len(num_cores_per_host_set) != 1:
-      raise RuntimeError(
-          'TPU cores on each host is not same. This should not happen!. '
-          'devices: {}'.format(devices))
-    num_of_cores_per_host = num_cores_per_host_set.pop()
-
-  topology = None
-  if query_topology:
-    if not tpu_core_count:
-      raise RuntimeError(
-          'Cannot find any TPU cores in the system (master address {}). '
-          'This usually means the master address is incorrect or the '
-          'TPU worker has some problems. Available devices: {}'.format(
-              master_address, devices))
-
-    topology = _obtain_topology(master_address, cluster_def)
-
-  metadata = _TPUSystemMetadata(
-      num_cores=tpu_core_count,
-      num_hosts=len(device_dict),
-      num_of_cores_per_host=num_of_cores_per_host,
-      topology=topology,
-      devices=devices)
-
-  if tpu_core_count:
-    logging.info('Found TPU system:')
-    logging.info('*** Num TPU Cores: %d', metadata.num_cores)
-    logging.info('*** Num TPU Workers: %d', metadata.num_hosts)
-    logging.info('*** Num TPU Cores Per Worker: %d',
-                 metadata.num_of_cores_per_host)
-    for device in metadata.devices:
-      logging.info('*** Available Device: %s', device)
-  else:
-    logging.info('Failed to find TPU: %s', metadata)
-  return metadata
-
-
-def _obtain_topology(master_address, cluster_def):
-  """Obtains TPU fabric topology."""
-  try:
-    logging.info('Initializing TPU system (master: %s) to fetch topology '
-                 'for model parallelism. This might take a while.',
-                 master_address)
-    with ops.Graph().as_default():
-      session_config = get_session_config_with_timeout(
-          _INITIAL_TPU_SYSTEM_TIMEOUT_IN_MS, cluster_def)
-      with session_lib.Session(
-          master_address, config=session_config) as sess:
-        topology = sess.run(tpu.initialize_system())
-        return topology
-  except errors.DeadlineExceededError:
-    raise ValueError(
-        'Fail to initialize TPU system with master (%s). '
-        'Please double check the TPU system is functional.' % (
-            master_address))
-
-
-def get_session_config_with_timeout(timeout_in_secs, cluster_def):
-  """Returns a session given a timeout and a cluster configuration."""
-  config = config_pb2.ConfigProto(
-      operation_timeout_in_ms=timeout_in_secs, cluster_def=cluster_def)
-  return config
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.tpu_system_metadata import *
+# used by tests
+from tensorflow.python.tpu.tpu_system_metadata import _query_tpu_system_metadata
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/training_loop.py b/tensorflow/contrib/tpu/python/tpu/training_loop.py
index 0187b4bec6ecc55943bf48b9268a74e18ea5b488..673359b232d6857d468723873c449cb3e48168c7 100644
--- a/tensorflow/contrib/tpu/python/tpu/training_loop.py
+++ b/tensorflow/contrib/tpu/python/tpu/training_loop.py
@@ -1,214 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# =============================================================================
-
-"""Library for constructing a training loop, suitable for TPUs."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.compiler import xla
-from tensorflow.contrib.tpu.python.tpu import tpu_function
-
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-
-
-def while_loop(condition, body, inputs=None, infeed_queue=None, name=None):
-  """Builds a training loop for TPUs.
-
-  The set of loop-carried tensors corresponds to `inputs`.  Both
-  `condition` and `body` take the current value of the loop-carried
-  tensors. 'body' additionally takes a tuple of infeed from
-  infeed_queue if infeed_queue is not None. `condition` must return a
-  single boolean value that determines whether iteration
-  continues. `body` must return an updated list of values for the
-  loop-carried tensors.
-
-  Args:
-    condition: a Python function that builds the loop condition.
-    body: a Python function that builds the loop body.
-    inputs: a list of initial values passed into the training loop, or
-      None (equivalent to an empty list).
-    infeed_queue: if not None, the infeed queue from which to append a tuple
-      of arguments as inputs to condition.
-    name: (Deprecated) Does nothing.
-
-  Returns:
-    The final values of the loop-carried tensors.
-
-  Raises:
-    TypeError: if body or condition has the wrong signature.
-  """
-  del name
-  # Converts inputs to Tensors.
-  inputs = [] if inputs is None else [ops.convert_to_tensor(x) for
-                                      x in inputs]
-  input_types = [x.dtype for x in inputs]
-  input_arity = len(inputs)
-
-  body_arg_error = xla.check_function_argument_count(
-      body, input_arity, infeed_queue)
-  if body_arg_error is not None:
-    if infeed_queue is None:
-      raise TypeError(
-          "Supplied loop body function cannot be called with the specified "
-          "inputs. You specified %d inputs: %s, but the loop body needs %s" % (
-              input_arity, str([i.name for i in inputs]), body_arg_error))
-    else:
-      raise TypeError(
-          "Supplied loop body function cannot be called with the specified "
-          "inputs. You specified %d inputs: %s and %d additional inputs from "
-          "infeed, but the computation needs %s" % (input_arity, str(
-              [i.name for i in inputs]), infeed_queue.number_of_tuple_elements,
-                                                    body_arg_error))
-  condition_arg_error = xla.check_function_argument_count(
-      condition, input_arity, None)
-  if condition_arg_error is not None:
-    if infeed_queue is None:
-      raise TypeError(
-          "Supplied loop condition function cannot be called with the "
-          "specified inputs. You specified %d inputs: %s, but the loop "
-          "condition needs %s" % (input_arity, str([i.name for i in inputs]),
-                                  condition_arg_error))
-    else:
-      raise TypeError(
-          "Supplied loop condition function cannot be called with the "
-          "specified inputs. You specified %d inputs: %s, but the loop "
-          "condition needs %s. Note that infeed is not passed to the loop "
-          "condition." % (input_arity, str([i.name for i in inputs]),
-                          condition_arg_error))
-
-  def condition_wrapper(*inputs):
-    # Discards the dummy output added for arity-0 loops.
-    if input_arity == 0:
-      inputs = []
-    return condition(*inputs)
-
-  def body_wrapper(*inputs):
-    """Wrapper around `body` that handles infeed queues and control deps."""
-    inputs = list(inputs)
-
-    # Discards the dummy output added for arity-0 loops.
-    if input_arity == 0:
-      inputs = []
-
-    # Runs `body` with the dequeue_ops appended.
-    if infeed_queue:
-      number_of_shards = tpu_function.get_tpu_context().number_of_shards
-      if number_of_shards is None:
-        raise ValueError("Can't build training loop with infeed when there is "
-                         "no tpu_shard_context. Are you building a loop or "
-                         "graph directly rather than from inside tpu.rewrite, "
-                         "tpu.batch_parallel, tpu.shard, or tpu.replicate?")
-      infeed_queue.set_number_of_shards(number_of_shards)
-      dequeue_ops = [d for d in infeed_queue.generate_dequeue_op()]
-    else:
-      dequeue_ops = []
-    outputs = body(*(inputs + dequeue_ops))
-
-    # If the computation only returned one value, make it a tuple.
-    if not isinstance(outputs, (list, tuple)):
-      outputs = (outputs,)
-
-    outputs = [
-        o if isinstance(o, ops.Operation) else ops.convert_to_tensor(o)
-        for o in outputs
-    ]
-
-    # Separates the returned Operations and Tensors.
-    output_operations = [o for o in outputs if isinstance(o, ops.Operation)]
-    output_tensors = [o for o in outputs
-                      if not isinstance(o, ops.Operation)]
-
-    if outputs != output_tensors + output_operations:
-      raise ValueError(
-          "TPU training loop body must return zero or more Tensor values "
-          "followed by zero or more Operations.")
-
-    output_types = [op.dtype for op in output_tensors]
-    if input_types != output_types:
-      raise TypeError(
-          "Mismatch between input types and output types for training loop "
-          "body: {} vs {}".format(input_types, output_types))
-
-    # Add the dequeue operations to output_operations to ensure they are run
-    # by the loop, even if the programmer's loop body does not use them.
-    output_operations += dequeue_ops
-
-    # Add a dummy output, if needed.
-    if not output_tensors:
-      output_tensors = array_ops.constant(0)
-
-    if output_operations:
-      # TODO(phawkins): in principle this is too restrictive since it serializes
-      # the training loop steps. In practice it does not matter since this loop
-      # will be compiled by XLA.
-      return control_flow_ops.tuple(output_tensors,
-                                    control_inputs=output_operations)
-    else:
-      return output_tensors
-
-  # If the body has arity 0, add a dummy loop-carried value to which we can add
-  # control dependencies from any side-effecting operations.
-  if input_arity == 0:
-    inputs = [array_ops.constant(0)]
-  return control_flow_ops.while_loop(
-      condition_wrapper, body_wrapper, inputs, name="", parallel_iterations=1)
-
-
-def repeat(n, body, inputs=None, infeed_queue=None, name=None):
-  """Builds a training loop that executes a fixed number of iterations.
-
-  The set of loop-carried tensors correspond to `inputs`.
-  `body` must be a function that takes and returns the values of the
-  loop-carried tensors.
-
-  Args:
-    n: the number of loop iterations
-    body: a Python function that builds the loop body.
-    inputs: a list of initial values passed into the training loop or
-      None (equivalent to an empty list).
-    infeed_queue: if not None, the infeed queue from which to append a tuple
-      of arguments as inputs to condition.
-    name: (Deprecated) Does nothing.
-  Returns:
-    The final values of the loop-carried tensors.
-  Raises:
-    ValueError: if there is a type error.
-  """
-  def _convert_to_list(xs):
-    if not isinstance(xs, (list, tuple)):
-      return [xs]
-    else:
-      return list(xs)
-
-  def cond(i, *args):
-    del args
-    return i < n
-
-  def body_wrapper(i, *args):
-    return [i + 1] + _convert_to_list(body(*args))
-
-  inputs = [0] if inputs is None else [0] + _convert_to_list(inputs)
-  outputs = while_loop(
-      cond, body_wrapper, inputs=inputs, infeed_queue=infeed_queue, name=name)
-  outputs = _convert_to_list(outputs)
-  if len(outputs) == 1:
-    # Returns the Op rather than an empty list.
-    return outputs[0].op
-  else:
-    return outputs[1:]
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.training_loop import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/tpu/python/tpu/util.py b/tensorflow/contrib/tpu/python/tpu/util.py
index dfb8ce1d1821da05c853bb0d10b1db3a857ccb1b..8d9b70d46eb42c9a525eeafc51d07f0ad4241d52 100644
--- a/tensorflow/contrib/tpu/python/tpu/util.py
+++ b/tensorflow/contrib/tpu/python/tpu/util.py
@@ -1,51 +1,23 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+# http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ===================================================================
-
-"""Utilities for the functionalities."""
+# ==============================================================================
+"""Stub file to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import time
-import six
-
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import training
-
-def check_positive_integer(value, name):
-  """Checks whether `value` is a positive integer."""
-  if not isinstance(value, six.integer_types):
-    raise TypeError('{} must be int, got {}'.format(name, type(value)))
-
-  if value <= 0:
-    raise ValueError('{} must be positive, got {}'.format(name, value))
-
-
-# TODO(b/118302029) Remove this copy of MultiHostDatasetInitializerHook after we
-# release a tensorflow_estimator with MultiHostDatasetInitializerHook in
-# python/estimator/util.py.
-class MultiHostDatasetInitializerHook(training.SessionRunHook):
-  """Creates a SessionRunHook that initializes all passed iterators."""
-
-  def __init__(self, dataset_initializers):
-    self._initializers = dataset_initializers
-
-  def after_create_session(self, session, coord):
-    del coord
-    start = time.time()
-    session.run(self._initializers)
-    logging.info('Initialized dataset iterators in %d seconds',
-                 time.time() - start)
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.tpu.util import *
+# pylint: enable=wildcard-import,unused-import
diff --git a/tensorflow/contrib/training/BUILD b/tensorflow/contrib/training/BUILD
index f6427ae05a20f253edf030eff0f860361616042b..5bc4c3b88efd641b6f17a54753a29b0603c2b98c 100644
--- a/tensorflow/contrib/training/BUILD
+++ b/tensorflow/contrib/training/BUILD
@@ -264,9 +264,9 @@ py_test(
 
 py_test(
     name = "training_test",
-    size = "large",
+    size = "medium",
     srcs = ["python/training/training_test.py"],
-    shard_count = 3,
+    shard_count = 8,
     srcs_version = "PY2AND3",
     tags = ["notsan"],
     deps = [
diff --git a/tensorflow/contrib/training/python/training/hparam.py b/tensorflow/contrib/training/python/training/hparam.py
index 3beb7bfe3048a8f0294f7e9149b5a07b5fcc7d17..27f0d9b2e38c433d4fb4573285ecb8c9946112e8 100644
--- a/tensorflow/contrib/training/python/training/hparam.py
+++ b/tensorflow/contrib/training/python/training/hparam.py
@@ -187,7 +187,7 @@ def _cast_to_type_if_compatible(name, param_type, value):
   return param_type(value)
 
 
-def parse_values(values, type_map):
+def parse_values(values, type_map, ignore_unknown=False):
   """Parses hyperparameter values from a string into a python map.
 
   `values` is a string containing comma-separated `name=value` pairs.
@@ -233,6 +233,9 @@ def parse_values(values, type_map):
       type T if either V has type T, or V is a list of elements of type T.
       Hence, for a multidimensional parameter 'x' taking float values,
       'x=[0.1,0.2]' will parse successfully if type_map['x'] = float.
+    ignore_unknown: Bool. Whether values that are missing a type in type_map
+      should be ignored. If set to True, a ValueError will not be raised for
+      unknown hyperparameter type.
 
   Returns:
     A python map mapping each name to either:
@@ -260,6 +263,8 @@ def parse_values(values, type_map):
     m_dict = m.groupdict()
     name = m_dict['name']
     if name not in type_map:
+      if ignore_unknown:
+        continue
       raise ValueError('Unknown hyperparameter type for %s' % name)
     type_ = type_map[name]
 
@@ -494,6 +499,7 @@ class HParams(object):
       value: New value of the hyperparameter.
 
     Raises:
+      KeyError: If the hyperparameter doesn't exist.
       ValueError: If there is a type mismatch.
     """
     param_type, is_list = self._hparam_types[name]
@@ -512,6 +518,8 @@ class HParams(object):
   def del_hparam(self, name):
     """Removes the hyperparameter with key 'name'.
 
+    Does nothing if it isn't present.
+
     Args:
       name: Name of the hyperparameter.
     """
@@ -520,19 +528,20 @@ class HParams(object):
       del self._hparam_types[name]
 
   def parse(self, values):
-    """Override hyperparameter values, parsing new values from a string.
+    """Override existing hyperparameter values, parsing new values from a string.
 
     See parse_values for more detail on the allowed format for values.
 
     Args:
-      values: String.  Comma separated list of `name=value` pairs where
-        'value' must follow the syntax described above.
+      values: String.  Comma separated list of `name=value` pairs where 'value'
+        must follow the syntax described above.
 
     Returns:
       The `HParams` instance.
 
     Raises:
-      ValueError: If `values` cannot be parsed.
+      ValueError: If `values` cannot be parsed or a hyperparameter in `values`
+      doesn't exist.
     """
     type_map = dict()
     for name, t in self._hparam_types.items():
@@ -543,7 +552,7 @@ class HParams(object):
     return self.override_from_dict(values_map)
 
   def override_from_dict(self, values_dict):
-    """Override hyperparameter values, parsing new values from a dictionary.
+    """Override existing hyperparameter values, parsing new values from a dictionary.
 
     Args:
       values_dict: Dictionary of name:value pairs.
@@ -552,6 +561,7 @@ class HParams(object):
       The `HParams` instance.
 
     Raises:
+      KeyError: If a hyperparameter in `values_dict` doesn't exist.
       ValueError: If `values_dict` cannot be parsed.
     """
     for name, value in values_dict.items():
@@ -591,7 +601,7 @@ class HParams(object):
         sort_keys=sort_keys)
 
   def parse_json(self, values_json):
-    """Override hyperparameter values, parsing new values from a json object.
+    """Override existing hyperparameter values, parsing new values from a json object.
 
     Args:
       values_json: String containing a json object of name:value pairs.
@@ -600,6 +610,7 @@ class HParams(object):
       The `HParams` instance.
 
     Raises:
+      KeyError: If a hyperparameter in `values_json` doesn't exist.
       ValueError: If `values_json` cannot be parsed.
     """
     values_map = json.loads(values_json)
diff --git a/tensorflow/contrib/training/python/training/hparam_test.py b/tensorflow/contrib/training/python/training/hparam_test.py
index 660c97f25e8458c345c8914bcaf98f37d047e50e..a990e04711ce68bd928a508484f0d6f657dd2f8c 100644
--- a/tensorflow/contrib/training/python/training/hparam_test.py
+++ b/tensorflow/contrib/training/python/training/hparam_test.py
@@ -216,6 +216,14 @@ class HParamsTest(test.TestCase):
     self.assertTrue(isinstance(parse_dict['arr'], dict))
     self.assertDictEqual(parse_dict['arr'], {1: 10})
 
+  def testParseValuesWithIndexAssigment1_IgnoreUnknown(self):
+    """Assignment to an index position."""
+    parse_dict = hparam.parse_values(
+        'arr[1]=10,b=5', {'arr': int}, ignore_unknown=True)
+    self.assertEqual(len(parse_dict), 1)
+    self.assertTrue(isinstance(parse_dict['arr'], dict))
+    self.assertDictEqual(parse_dict['arr'], {1: 10})
+
   def testParseValuesWithIndexAssigment2(self):
     """Assignment to multiple index positions."""
     parse_dict = hparam.parse_values('arr[0]=10,arr[5]=20', {'arr': int})
@@ -223,6 +231,14 @@ class HParamsTest(test.TestCase):
     self.assertTrue(isinstance(parse_dict['arr'], dict))
     self.assertDictEqual(parse_dict['arr'], {0: 10, 5: 20})
 
+  def testParseValuesWithIndexAssigment2_IgnoreUnknown(self):
+    """Assignment to multiple index positions."""
+    parse_dict = hparam.parse_values(
+        'arr[0]=10,arr[5]=20,foo=bar', {'arr': int}, ignore_unknown=True)
+    self.assertEqual(len(parse_dict), 1)
+    self.assertTrue(isinstance(parse_dict['arr'], dict))
+    self.assertDictEqual(parse_dict['arr'], {0: 10, 5: 20})
+
   def testParseValuesWithIndexAssigment3(self):
     """Assignment to index positions in multiple names."""
     parse_dict = hparam.parse_values('arr[0]=10,arr[1]=20,L[5]=100,L[10]=200',
@@ -234,6 +250,17 @@ class HParamsTest(test.TestCase):
     self.assertTrue(isinstance(parse_dict['L'], dict))
     self.assertDictEqual(parse_dict['L'], {5: 100, 10: 200})
 
+  def testParseValuesWithIndexAssigment3_IgnoreUnknown(self):
+    """Assignment to index positions in multiple names."""
+    parse_dict = hparam.parse_values(
+        'arr[0]=10,C=5,arr[1]=20,B[0]=kkk,L[5]=100,L[10]=200',
+        {'arr': int, 'L': int}, ignore_unknown=True)
+    self.assertEqual(len(parse_dict), 2)
+    self.assertTrue(isinstance(parse_dict['arr'], dict))
+    self.assertDictEqual(parse_dict['arr'], {0: 10, 1: 20})
+    self.assertTrue(isinstance(parse_dict['L'], dict))
+    self.assertDictEqual(parse_dict['L'], {5: 100, 10: 200})
+
   def testParseValuesWithIndexAssigment4(self):
     """Assignment of index positions and scalars."""
     parse_dict = hparam.parse_values('x=10,arr[1]=20,y=30',
@@ -246,6 +273,17 @@ class HParamsTest(test.TestCase):
     self.assertEqual(parse_dict['x'], 10)
     self.assertEqual(parse_dict['y'], 30)
 
+  def testParseValuesWithIndexAssigment4_IgnoreUnknown(self):
+    """Assignment of index positions and scalars."""
+    parse_dict = hparam.parse_values(
+        'x=10,foo[0]=bar,arr[1]=20,zzz=78,y=30',
+        {'x': int, 'y': int, 'arr': int}, ignore_unknown=True)
+    self.assertEqual(len(parse_dict), 3)
+    self.assertTrue(isinstance(parse_dict['arr'], dict))
+    self.assertDictEqual(parse_dict['arr'], {1: 20})
+    self.assertEqual(parse_dict['x'], 10)
+    self.assertEqual(parse_dict['y'], 30)
+
   def testParseValuesWithIndexAssigment5(self):
     """Different variable types."""
     parse_dict = hparam.parse_values('a[0]=5,b[1]=true,c[2]=abc,d[3]=3.14', {
@@ -264,24 +302,55 @@ class HParamsTest(test.TestCase):
     self.assertTrue(isinstance(parse_dict['d'], dict))
     self.assertDictEqual(parse_dict['d'], {3: 3.14})
 
+  def testParseValuesWithIndexAssigment5_IgnoreUnknown(self):
+    """Different variable types."""
+    parse_dict = hparam.parse_values(
+        'a[0]=5,cc=4,b[1]=true,c[2]=abc,mm=2,d[3]=3.14',
+        {'a': int, 'b': bool, 'c': str, 'd': float},
+        ignore_unknown=True)
+    self.assertEqual(set(parse_dict.keys()), {'a', 'b', 'c', 'd'})
+    self.assertTrue(isinstance(parse_dict['a'], dict))
+    self.assertDictEqual(parse_dict['a'], {0: 5})
+    self.assertTrue(isinstance(parse_dict['b'], dict))
+    self.assertDictEqual(parse_dict['b'], {1: True})
+    self.assertTrue(isinstance(parse_dict['c'], dict))
+    self.assertDictEqual(parse_dict['c'], {2: 'abc'})
+    self.assertTrue(isinstance(parse_dict['d'], dict))
+    self.assertDictEqual(parse_dict['d'], {3: 3.14})
+
   def testParseValuesWithBadIndexAssigment1(self):
     """Reject assignment of list to variable type."""
     with self.assertRaisesRegexp(ValueError,
                                  r'Assignment of a list to a list index.'):
       hparam.parse_values('arr[1]=[1,2,3]', {'arr': int})
 
+  def testParseValuesWithBadIndexAssigment1_IgnoreUnknown(self):
+    """Reject assignment of list to variable type."""
+    with self.assertRaisesRegexp(ValueError,
+                                 r'Assignment of a list to a list index.'):
+      hparam.parse_values(
+          'arr[1]=[1,2,3],c=8', {'arr': int}, ignore_unknown=True)
+
   def testParseValuesWithBadIndexAssigment2(self):
     """Reject if type missing."""
     with self.assertRaisesRegexp(ValueError,
                                  r'Unknown hyperparameter type for arr'):
       hparam.parse_values('arr[1]=5', {})
 
+  def testParseValuesWithBadIndexAssigment2_IgnoreUnknown(self):
+    """Ignore missing type."""
+    hparam.parse_values('arr[1]=5', {}, ignore_unknown=True)
+
   def testParseValuesWithBadIndexAssigment3(self):
     """Reject type of the form name[index]."""
     with self.assertRaisesRegexp(ValueError,
                                  'Unknown hyperparameter type for arr'):
       hparam.parse_values('arr[1]=1', {'arr[1]': int})
 
+  def testParseValuesWithBadIndexAssigment3_IgnoreUnknown(self):
+    """Ignore type of the form name[index]."""
+    hparam.parse_values('arr[1]=1', {'arr[1]': int}, ignore_unknown=True)
+
   def testWithReusedVariables(self):
     with self.assertRaisesRegexp(ValueError,
                                  'Multiple assignments to variable \'x\''):
diff --git a/tensorflow/contrib/training/python/training/training.py b/tensorflow/contrib/training/python/training/training.py
index c272a2ac144068cfb7355c2647eebf5bd0ce9d50..4ceb6e9350f5167efc8f7266d4e748cc6fa4ffd6 100644
--- a/tensorflow/contrib/training/python/training/training.py
+++ b/tensorflow/contrib/training/python/training/training.py
@@ -244,7 +244,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
@@ -354,11 +353,11 @@ def multiply_gradients(grads_and_vars, gradient_multipliers):
         raise ValueError('Requested multiple of `None` gradient.')
 
       if isinstance(grad, ops.IndexedSlices):
-        tmp = grad.values * constant_op.constant(
+        tmp = grad.values * ops.convert_to_tensor(
             gradient_multipliers[key], dtype=grad.dtype)
         grad = ops.IndexedSlices(tmp, grad.indices, grad.dense_shape)
       else:
-        grad *= constant_op.constant(
+        grad *= ops.convert_to_tensor(
             gradient_multipliers[key], dtype=grad.dtype)
     multiplied_grads_and_vars.append((grad, var))
   return multiplied_grads_and_vars
@@ -419,7 +418,7 @@ def create_train_op(total_loss,
     update_ops = set(update_ops)
   if not global_update_ops.issubset(update_ops):
     logging.warning('update_ops in create_train_op does not contain all the '
-                    ' update_ops in GraphKeys.UPDATE_OPS')
+                    'update_ops in GraphKeys.UPDATE_OPS')
 
   # Make sure update_ops are computed before total_loss.
   if update_ops:
@@ -433,7 +432,7 @@ def create_train_op(total_loss,
   else:
     # Make sure that variables_to_train are in tf.trainable_variables()
     for v in variables_to_train:
-      assert v in tf_variables.trainable_variables()
+      assert v.trainable or v in tf_variables.trainable_variables()
 
   assert variables_to_train
 
diff --git a/tensorflow/contrib/util/BUILD b/tensorflow/contrib/util/BUILD
index d9ccda8e89a4c9a1b3f3d24915b9ad3fb4d9be5f..ada08f95ae46ea06b3896ca3b1603277d62bf6fc 100644
--- a/tensorflow/contrib/util/BUILD
+++ b/tensorflow/contrib/util/BUILD
@@ -16,10 +16,15 @@ cc_library(
     srcs = ["convert_graphdef_memmapped_format_lib.cc"],
     hdrs = ["convert_graphdef_memmapped_format_lib.h"],
     deps = [
+        "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
+        "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core:lib",
+        "//tensorflow/core:nn_ops_op_lib",
+        "//tensorflow/core:no_op_op_lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:sendrecv_ops_op_lib",
         "//tensorflow/core:tensorflow",
         "//tensorflow/core/kernels:immutable_constant_op",
     ],
diff --git a/tensorflow/contrib/verbs/rdma_mgr.cc b/tensorflow/contrib/verbs/rdma_mgr.cc
index 2784bf124ceaacd8e01f0653287fa7f006d0d608..2f2375427862ad1e99a0e6bfc506382d200e9b1d 100644
--- a/tensorflow/contrib/verbs/rdma_mgr.cc
+++ b/tensorflow/contrib/verbs/rdma_mgr.cc
@@ -277,9 +277,18 @@ void RdmaMgr::InitAllocators() {
   ProcessState::singleton()->AddCPUFreeVisitor(free_visitor);
 
 #if GOOGLE_CUDA
+  GPUProcessState::singleton()->AddCUDAHostAllocVisitor(0, alloc_visitor);
+  GPUProcessState::singleton()->AddCUDAHostFreeVisitor(0, free_visitor);
+
   if (IsGDRAvailable()) {
     // Note we don't free allocated GPU memory so there is no free visitor
-    int32_t bus_id = TryToReadNumaNode(rdma_adapter_->context_->device) + 1;
+
+    // TODO: This is to fix the 'invalid use of member in static member function
+    // bug'.
+    //       Waiting for better implementation.
+    //       int32_t bus_id = TryToReadNumaNode(rdma_adapter_->context_->device)
+    //       + 1;
+    int32_t bus_id = 0;
 
     SubAllocator::Visitor cuda_alloc_visitor = [](void* ptr, int gpu_id,
                                                   size_t num_bytes) {
@@ -288,9 +297,6 @@ void RdmaMgr::InitAllocators() {
     };
     GPUProcessState::singleton()->AddGPUAllocVisitor(bus_id,
                                                      cuda_alloc_visitor);
-    GPUProcessState::singleton()->AddCUDAHostAllocVisitor(bus_id,
-                                                          alloc_visitor);
-    GPUProcessState::singleton()->AddCUDAHostFreeVisitor(bus_id, free_visitor);
     LOG(INFO) << "Instrumenting GPU allocator with bus_id " << bus_id;
   }
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/verbs/verbs_server_lib.cc b/tensorflow/contrib/verbs/verbs_server_lib.cc
index 5b72b1604aca2e0c593978c6104322372788eb3c..d07fd5ae6e9cc0dbf67c6b6a4e8db086b4c74aa1 100644
--- a/tensorflow/contrib/verbs/verbs_server_lib.cc
+++ b/tensorflow/contrib/verbs/verbs_server_lib.cc
@@ -33,6 +33,8 @@ RendezvousMgrInterface* NewRdmaRendezvousMgr(const WorkerEnv* env) {
   return new RdmaRendezvousMgr(env);
 }
 
+std::once_flag reg_mem_visitors_call;
+
 }  // namespace
 
 VerbsServer::VerbsServer(const ServerDef& server_def, Env* env)
@@ -76,14 +78,13 @@ Status VerbsServer::ChannelCacheFactory(const ServerDef& server_def,
   return Status::OK();
 }
 
-namespace {
-std::once_flag reg_mem_visitors_call;
-}  // namespace
-
 Status VerbsServer::Init(ServiceInitFunction service_func,
                          RendezvousMgrCreationFunction rendezvous_mgr_func) {
   std::call_once(reg_mem_visitors_call, []() { RdmaMgr::RegMemVisitors(); });
-  Status s = GrpcServer::Init(service_func, rendezvous_mgr_func);
+  GrpcServerOptions opts;
+  opts.service_func = service_func;
+  opts.rendezvous_mgr_func = rendezvous_mgr_func;
+  Status s = GrpcServer::Init(opts);
   {
     mutex_lock l(mu_);
     CHECK_EQ(verbs_state_, DISCONNECTED);
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 66714235b535c14a8f13c40bb2a4df8d7494dc05..06c108b38fbf1d4b796c313ce700332803c73ef9 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -49,7 +49,7 @@
 # filegroup ":android_proto_srcs" - Protos
 # filegroup ":android_srcs" - Core sources
 # cc_library ":android_tensorflow_lib" - Native library
-# cc_library ":android_tensorflow_lib_selective_registration" - Native library
+# cc_library ":android_tensorflow_lib_lite" - Native library, without ops,
 #   supporting SELECTIVE_REGISTRATION feature.
 # portable_proto_library ":android_proto_lib" (Google-internal)
 #
@@ -70,10 +70,14 @@ package(default_visibility = [
 
 licenses(["notice"])  # Apache 2.0
 
+# Export the BUILD file so automated tooling can check licenses
+exports_files(["BUILD"])
+
 load(
     "//tensorflow:tensorflow.bzl",
     "cc_header_only_library",
     "if_android",
+    "if_emscripten",
     "if_ios",
     "if_linux_x86_64",
     "if_mobile",
@@ -84,10 +88,12 @@ load(
     "tf_copts",
     "tf_cuda_library",
     "tf_features_nomodules_if_android",
+    "tf_features_nomodules_if_emscripten",
     "tf_gen_op_libs",
     "tf_generate_proto_text_sources",
     "tf_genrule_cmd_append_to_srcs",
     "tf_opts_nortti_if_android",
+    "tf_opts_nortti_if_emscripten",
     "transitive_hdrs",
 )
 load("//tensorflow:tensorflow.bzl", "tf_cc_test_mkl")
@@ -113,7 +119,6 @@ load(
     "tf_additional_device_tracer_test_flags",
     "tf_additional_gdr_lib_defines",
     "tf_additional_human_readable_json_deps",
-    "tf_additional_logger_deps",
     "tf_additional_lib_defines",
     "tf_additional_lib_deps",
     "tf_additional_lib_hdrs",
@@ -123,7 +128,6 @@ load(
     "tf_additional_libdevice_srcs",
     "tf_additional_minimal_lib_srcs",
     "tf_additional_mpi_lib_defines",
-    "tf_additional_proto_compiler_hdrs",
     "tf_additional_proto_hdrs",
     "tf_additional_proto_srcs",
     "tf_additional_test_deps",
@@ -142,6 +146,7 @@ load(
     "tf_protos_grappler",
     "tf_protos_grappler_impl",
     "tf_pyclif_proto_library",
+    "tf_grpc_service_all",
 )
 load(
     "//tensorflow/core:platform/default/build_config_root.bzl",
@@ -179,7 +184,6 @@ COMMON_PROTO_SRCS = [
     "framework/function.proto",
     "framework/graph.proto",
     "framework/graph_transfer_info.proto",
-    "framework/iterator.proto",
     "framework/kernel_def.proto",
     "framework/log_memory.proto",
     "framework/node_def.proto",
@@ -200,10 +204,12 @@ COMMON_PROTO_SRCS = [
     "protobuf/cluster.proto",
     "protobuf/debug.proto",
     "protobuf/device_properties.proto",
+    "protobuf/graph_debug_info.proto",
     "protobuf/queue_runner.proto",
     "protobuf/rewriter_config.proto",
     "protobuf/tensor_bundle.proto",
     "protobuf/saver.proto",
+    "protobuf/verifier_config.proto",
     "util/event.proto",
     "util/memmapped_file_system.proto",
     "util/saved_tensor_slice.proto",
@@ -223,13 +229,15 @@ CORE_PROTO_SRCS = COMMON_PROTO_SRCS + ERROR_CODES_PROTO_SRCS
 # ones with individual proto_library targets.
 ADDITIONAL_CORE_PROTO_SRCS = [
     "example/example_parser_configuration.proto",
-    "protobuf/checkpointable_object_graph.proto",
+    "protobuf/trackable_object_graph.proto",
     "protobuf/control_flow.proto",
     # TODO(ebrevdo): Re-enable once CriticalSection is in core.
     # "protobuf/critical_section.proto",
     "protobuf/meta_graph.proto",
     "protobuf/named_tensor.proto",
     "protobuf/saved_model.proto",
+    "protobuf/saved_object_graph.proto",
+    "protobuf/struct.proto",
     "protobuf/tensorflow_server.proto",
     "protobuf/transport_options.proto",
     "util/test_log.proto",
@@ -412,9 +420,8 @@ cc_library(
     name = "platform_protobuf",
     srcs = tf_platform_hdrs([
         "protobuf.h",
-    ]) + tf_platform_srcs([
-        "protobuf.cc",
     ]) + [
+        "platform/protobuf.cc",
         "platform/protobuf_util.cc",
         "lib/core/status.h",
     ],
@@ -433,6 +440,17 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "grpc_services",
+    srcs = [],
+    hdrs = [
+        "platform/grpc_services.h",
+    ],
+    copts = tf_copts(),
+    visibility = ["//visibility:public"],
+    deps = tf_grpc_service_all(),
+)
+
 cc_library(
     name = "human_readable_json",
     srcs = tf_platform_srcs(["human_readable_json.cc"]),
@@ -447,14 +465,11 @@ cc_library(
 
 cc_library(
     name = "logger",
-    srcs = tf_platform_srcs(["logger.cc"]),
-    hdrs = ["platform/logger.h"] + tf_platform_hdrs(["logger.h"]),
+    srcs = ["platform/logger.cc"],
+    hdrs = ["platform/logger.h"],
     copts = tf_copts(),
     visibility = ["//visibility:public"],
-    deps = [
-        ":lib",
-        ":lib_internal",
-    ] + tf_additional_logger_deps(),
+    deps = [":lib_proto_parsing"],
 )
 
 filegroup(
@@ -505,6 +520,7 @@ cc_library(
         ":platform_port",
         ":platform_protobuf",
         "//tensorflow/core/platform/default/build_config:env",
+        "//tensorflow/core/platform/default/build_config:port",
     ],
 )
 
@@ -660,7 +676,7 @@ cc_library(
     name = "lib_proto_compiler",
     hdrs = [
         "platform/protobuf_compiler.h",
-    ] + tf_additional_proto_compiler_hdrs(),
+    ],
     copts = tf_copts(),
     deps = tf_lib_proto_compiler_deps() + [
         ":lib_proto_parsing",
@@ -1018,6 +1034,7 @@ cc_library(
         ":lib",
         ":lib_internal",
         ":protos_all_cc",
+        "//tensorflow/core/util/proto:proto_utils",
     ],
 )
 
@@ -1044,13 +1061,13 @@ cc_library(
         "platform/default/integral_types.h",
         "platform/default/logging.h",
         "platform/default/mutex.h",
-        "platform/default/protobuf.h",
         "platform/default/thread_annotations.h",
         "platform/dynamic_annotations.h",
         "platform/macros.h",
         "platform/mutex.h",
         "platform/platform.h",
         "platform/prefetch.h",
+        "platform/protobuf.h",
         "platform/thread_annotations.h",
         "platform/types.h",
         "platform/cpu_info.h",
@@ -1075,6 +1092,7 @@ tf_gen_op_libs(
         "tensor_forest_ops",
         "candidate_sampling_ops",
         "checkpoint_ops",
+        "clustering_ops",
         "collective_ops",
         "control_flow_ops",
         "ctc_ops",
@@ -1100,6 +1118,7 @@ tf_gen_op_libs(
         "parsing_ops",
         "random_grad",
         "random_ops",
+        "stateful_random_ops",
         "remote_fused_graph_ops",
         "rpc_ops",
         "scoped_allocator_ops",
@@ -1134,6 +1153,13 @@ tf_gen_op_libs(
     deps = [":protos_all_cc"],
 )
 
+tf_gen_op_libs(
+    op_lib_names = [
+        "mkl_array_ops",
+    ],
+    deps = [":protos_all_cc"],
+)
+
 tf_gen_op_libs(
     op_lib_names = [
         "audio_ops",
@@ -1154,6 +1180,29 @@ tf_gen_op_libs(
     deps = [":lib"],
 )
 
+tf_gen_op_libs(
+    op_lib_names = [
+        "tpu_configuration_ops",
+        "tpu_cross_replica_ops",
+        "tpu_embedding_ops",
+        "tpu_functional_ops",
+        "tpu_heartbeat_ops",
+        "tpu_host_compute_ops",
+        "tpu_infeed_ops",
+        "tpu_outfeed_ops",
+        "tpu_ordinal_selector_ops",
+        "tpu_replication_ops",
+    ],
+    deps = [
+        ":lib",
+        ":lib_proto_parsing",
+        ":protos_all_cc",
+        "//tensorflow/core/protobuf/tpu:tpu_embedding_configuration_proto_cc",
+        "//tensorflow/core/tpu:tpu_embedding_optimization_parameters_utils",
+        "//tensorflow/core/tpu:tpu_embedding_output_layout_utils",
+    ],
+)
+
 # And one for all user ops
 cc_library(
     name = "user_ops_op_lib",
@@ -1229,6 +1278,7 @@ cc_library(
         ":tensor_forest_ops_op_lib",
         ":candidate_sampling_ops_op_lib",
         ":checkpoint_ops_op_lib",
+        ":clustering_ops_op_lib",
         ":collective_ops_op_lib",
         ":control_flow_ops_op_lib",
         ":ctc_ops_op_lib",
@@ -1254,6 +1304,7 @@ cc_library(
         ":parsing_ops_op_lib",
         ":ragged_ops",
         ":random_ops_op_lib",
+        ":stateful_random_ops_op_lib",
         ":remote_fused_graph_ops_op_lib",
         ":resource_variable_ops_op_lib",
         ":rpc_ops_op_lib",
@@ -1268,10 +1319,23 @@ cc_library(
         ":state_ops_op_lib",
         ":stateless_random_ops_op_lib",
         ":string_ops_op_lib",
+        ":tpu_configuration_ops_op_lib",
+        ":tpu_cross_replica_ops_op_lib",
+        ":tpu_embedding_ops_op_lib",
+        ":tpu_functional_ops_op_lib",
+        ":tpu_heartbeat_ops_op_lib",
+        ":tpu_host_compute_ops_op_lib",
+        ":tpu_infeed_ops_op_lib",
+        ":tpu_outfeed_ops_op_lib",
+        ":tpu_ordinal_selector_ops_op_lib",
+        ":tpu_replication_ops_op_lib",
         ":training_ops_op_lib",
         ":user_ops_op_lib",
         ":word2vec_ops",
-    ] + if_mkl([":mkl_nn_ops_op_lib"]) + tf_additional_cloud_op_deps(),
+    ] + if_mkl([
+        ":mkl_array_ops_op_lib",
+        ":mkl_nn_ops_op_lib",
+    ]) + tf_additional_cloud_op_deps(),
     alwayslink = 1,
 )
 
@@ -1372,8 +1436,8 @@ cc_library(
 
 # This includes implementations of all kernels built into TensorFlow.
 cc_library(
-    name = "all_kernels_statically_linked",
-    visibility = ["//visibility:private"],
+    name = "all_kernels_impl",
+    visibility = ["//tensorflow/core:__subpackages__"],
     deps = [
         "//tensorflow/core/kernels:array",
         "//tensorflow/core/kernels:audio",
@@ -1383,12 +1447,12 @@ cc_library(
         "//tensorflow/core/kernels:tensor_forest_ops",
         "//tensorflow/core/kernels:candidate_sampler_ops",
         "//tensorflow/core/kernels:checkpoint_ops",
+        "//tensorflow/core/kernels:clustering_ops",
         "//tensorflow/core/kernels:collective_ops",
         "//tensorflow/core/kernels:control_flow_ops",
         "//tensorflow/core/kernels:ctc_ops",
         "//tensorflow/core/kernels:cudnn_rnn_kernels",
         "//tensorflow/core/kernels:data_flow",
-        "//tensorflow/core/kernels:dataset_ops",
         "//tensorflow/core/kernels:decode_proto_op",
         "//tensorflow/core/kernels:encode_proto_op",
         "//tensorflow/core/kernels:fake_quant_ops",
@@ -1399,18 +1463,20 @@ cc_library(
         "//tensorflow/core/kernels:image",
         "//tensorflow/core/kernels:io",
         "//tensorflow/core/kernels:linalg",
-        "//tensorflow/core/kernels:list_kernels",
         "//tensorflow/core/kernels:lookup",
         "//tensorflow/core/kernels:logging",
         "//tensorflow/core/kernels:manip",
         "//tensorflow/core/kernels:math",
         "//tensorflow/core/kernels:multinomial_op",
+        "//tensorflow/core/kernels:mutex_ops",
         "//tensorflow/core/kernels:nn",
         "//tensorflow/core/kernels:parameterized_truncated_normal_op",
         "//tensorflow/core/kernels:parsing",
         "//tensorflow/core/kernels:partitioned_function_ops",
+        "//tensorflow/core/kernels:pooling_ops",
         "//tensorflow/core/kernels:ragged_ops",
         "//tensorflow/core/kernels:random_ops",
+        "//tensorflow/core/kernels:stateful_random_ops",
         "//tensorflow/core/kernels:random_poisson_op",
         "//tensorflow/core/kernels:remote_fused_graph_ops",
         "//tensorflow/core/kernels:required",
@@ -1441,6 +1507,7 @@ cc_library(
         "//tensorflow/core/kernels:mkl_identity_op",
         "//tensorflow/core/kernels:mkl_input_conversion_op",
         "//tensorflow/core/kernels:mkl_lrn_op",
+        "//tensorflow/core/kernels:mkl_requantize_ops",
         "//tensorflow/core/kernels:mkl_pooling_ops",
         "//tensorflow/core/kernels:mkl_relu_op",
         "//tensorflow/core/kernels:mkl_reshape_op",
@@ -1462,8 +1529,13 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = if_dynamic_kernels(
         [],
-        otherwise = [":all_kernels_statically_linked"],
-    ),
+        otherwise = [":all_kernels_impl"],
+    ) + [
+        # TODO(gunan): Work on the API between these and rest of TF and make
+        # these also dynamically loading.
+        "//tensorflow/core/kernels:dataset_ops",  # Depends on grappler
+        "//tensorflow/core/kernels:list_kernels",  # Depends on variant_op_registry.h
+    ],
 )
 
 tf_cuda_library(
@@ -1524,6 +1596,7 @@ cc_library(
         ":framework_internal",
         ":lib",
         ":lib_internal",
+        ":ops",
         ":protos_all_cc",
         ":shape_inference_testutil",
         ":tensor_testutil",
@@ -1611,6 +1684,9 @@ filegroup(
             "**/*main.cc",
             "debug/**/*",
             "framework/op_gen_*",
+            "framework/node_def_util.*",
+            "framework/op_kernel.*",
+            "framework/dataset.*",
             "lib/jpeg/**/*",
             "lib/png/**/*",
             "lib/gif/**/*",
@@ -1619,7 +1695,6 @@ filegroup(
             "util/reporter.*",
             "platform/**/cuda_libdevice_path.*",
             "platform/**/logger.cc",
-            "platform/**/logger.h",
             "platform/default/test_benchmark.*",
             "platform/cuda.h",
             "platform/google/**/*",
@@ -1654,6 +1729,9 @@ filegroup(
             "common_runtime/**/*.cc",
             "graph/**/*.h",
             "graph/**/*.cc",
+            "framework/node_def_util.*",
+            "framework/op_kernel.*",
+            "framework/dataset.*",
         ],
         exclude = [
             "**/*test.*",
@@ -1743,11 +1821,35 @@ cc_library(
 cc_library(
     name = "mobile_additional_lib_deps",
     deps = tf_additional_lib_deps() + [
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
     ],
 )
 
+cc_library(
+    name = "emscripten_tensorflow_lib_lite_nortti_lite_protos_no_runtime",
+    srcs = if_emscripten(["//tensorflow/core:mobile_srcs_no_runtime"]),
+    copts = ["-DSUPPORT_SELECTIVE_REGISTRATION"] + tf_opts_nortti_if_emscripten(),
+    defines = ["TENSORFLOW_LITE_PROTOS"],
+    linkopts = ["-lz"],
+    tags = [
+        "manual",
+        "notap",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":emscripten_proto_lib_no_rtti_lite_runtime",
+        ":mobile_additional_lib_deps",
+        ":stats_calculator_portable",
+        "//third_party/eigen3",
+        "@double_conversion//:double-conversion",
+        "@nsync//:nsync_cpp",
+        "@zlib_archive//:zlib",
+    ],
+    alwayslink = 1,
+)
+
 # Native library support for iOS applications.
 #
 # bazel  build --config=ios_x86_64 \
@@ -1829,27 +1931,6 @@ cc_library(
     alwayslink = 1,
 )
 
-# Android library for use with the SELECTIVE_REGISTRATION feature.
-# Does not contain operators. In contrast to android_tensorflow_lib_lite,
-# this links in framework support for all types, relying on selective
-# registration of ops to prune code size.
-#
-# TODO(gonnet): Move all users of these aliases to the corresponding
-#     :android_tensorflow_lib_lite* targets and remove.
-alias(
-    name = "android_tensorflow_lib_selective_registration",
-    actual = ":android_tensorflow_lib_lite",
-    visibility = ["//visibility:public"],
-)
-
-# Android library for use with the SELECTIVE_REGISTRATION feature with
-# no proto_rtti.
-alias(
-    name = "android_tensorflow_lib_selective_registration_nortti",
-    actual = ":android_tensorflow_lib_lite_nortti",
-    visibility = ["//visibility:public"],
-)
-
 filegroup(
     name = "android_op_registrations_and_gradients",
     srcs = glob(
@@ -1862,6 +1943,7 @@ filegroup(
             "**/*testutil*",
             "**/*testlib*",
             "**/*main.cc",
+            "**/tpu_*",
         ],
     ),
     visibility = ["//visibility:public"],
@@ -1964,6 +2046,14 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "rocm",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core/platform/default/build_config:rocm",
+    ],
+)
+
 # -----------------------------------------------------------------------------
 # Clif-related proto libraries.
 
@@ -2023,6 +2113,13 @@ tf_pyclif_proto_library(
     visibility = ["//visibility:public"],
 )
 
+tf_pyclif_proto_library(
+    name = "framework/step_stats_pyclif",
+    proto_lib = ":protos_all_cc",
+    proto_srcfile = "framework/step_stats.proto",
+    visibility = ["//visibility:public"],
+)
+
 tf_pyclif_proto_library(
     name = "framework/types_pyclif",
     proto_lib = ":protos_all_cc",
@@ -2200,6 +2297,7 @@ cc_library(
         ],
     }),
     deps = tf_additional_lib_deps() + [
+        "@com_google_absl//absl/meta:type_traits",
         "@com_google_absl//absl/strings",
         "//third_party/eigen3",
         "@com_google_absl//absl/base:core_headers",
@@ -2214,7 +2312,6 @@ cc_library(
             "lib/**/*.cc",
             "platform/*.cc",
             "platform/profile_utils/**/*.cc",
-        ] + [
             "framework/resource_handle.cc",
             "util/env_var.cc",
         ],
@@ -2232,6 +2329,7 @@ cc_library(
             "platform/**/logging.cc",
             "platform/**/human_readable_json.cc",
             "platform/abi.cc",
+            "platform/protobuf.cc",
         ],
     ) + tf_additional_lib_srcs(
         exclude = [
@@ -2258,6 +2356,8 @@ cc_library(
         ":lib_proto_parsing",
         ":abi",
         ":core_stringpiece",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
         "//third_party/eigen3",
         "//tensorflow/core/platform/default/build_config:platformlib",
         "@snappy",
@@ -2354,7 +2454,12 @@ cc_library(
 
 cc_library(
     name = "tflite_portable_logging",
-    srcs = [],
+    srcs = [
+    ] + if_ios([
+        "platform/default/logging.cc",
+        "platform/env_time.cc",
+        "platform/posix/env_time.cc",
+    ]),
     hdrs = [
         "lib/bfloat16/bfloat16.h",
         "platform/default/integral_types.h",
@@ -2363,7 +2468,7 @@ cc_library(
         "platform/macros.h",
         "platform/platform.h",
         "platform/types.h",
-    ] + if_windows(["platform/windows/integral_types.h"]),
+    ] + if_windows(["platform/windows/integral_types.h"]) + if_ios(["platform/env_time.h"]),
     copts = tf_copts(),
     linkopts = ["-ldl"],
     deps = [
@@ -2632,7 +2737,6 @@ tf_cuda_library(
             "example/**/*.cc",
             "framework/**/*.cc",
             "util/**/*.cc",
-        ] + [
             "graph/edgeset.cc",
             "graph/graph.cc",
             "graph/graph_def_builder.cc",
@@ -2773,6 +2877,7 @@ cc_library(
 # in this library.
 GRAPH_HDRS = [
     "graph/algorithm.h",
+    "graph/collective_order.h",
     "graph/colors.h",
     "graph/control_flow.h",
     "graph/costmodel.h",
@@ -2799,6 +2904,7 @@ tf_cuda_library(
     name = "graph",
     srcs = [
         "graph/algorithm.cc",
+        "graph/collective_order.cc",
         "graph/colors.cc",
         "graph/control_flow.cc",
         "graph/costmodel.cc",
@@ -2816,6 +2922,9 @@ tf_cuda_library(
         ":proto_text",
         ":protos_all_cc",
         "//third_party/eigen3",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -2830,12 +2939,16 @@ CORE_CPU_BASE_HDRS = GRAPH_HDRS + [
     "framework/versions.h",
     "common_runtime/process_function_library_runtime.h",
     "common_runtime/function.h",
+    "common_runtime/scoped_allocator.h",
+    "common_runtime/scoped_allocator_mgr.h",
 ]
 
 tf_cuda_library(
     name = "core_cpu_base",
     srcs = [
         "common_runtime/eval_const_tensor.cc",
+        "common_runtime/scoped_allocator.cc",
+        "common_runtime/scoped_allocator_mgr.cc",
         "common_runtime/shape_refiner.cc",
         "common_runtime/shape_refiner.h",
         "framework/versions.h",
@@ -2868,6 +2981,7 @@ tf_cuda_library(
 
 CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
     "common_runtime/allocator_retry.h",
+    "common_runtime/shared_counter.h",
     "common_runtime/base_collective_executor.h",
     "common_runtime/bfc_allocator.h",
     "common_runtime/hierarchical_tree_broadcaster.h",
@@ -2892,9 +3006,11 @@ CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
     "common_runtime/lower_if_while.h",
     "common_runtime/lower_while_op.h",
     "common_runtime/memory_types.h",
+    "common_runtime/metrics.h",
     "common_runtime/mkl_cpu_allocator.h",
     "common_runtime/optimization_registry.h",
     "common_runtime/pending_counts.h",
+    "common_runtime/partitioning_utils.h",
     "common_runtime/placer.h",
     "common_runtime/process_util.h",
     "common_runtime/profile_handler.h",
@@ -2902,8 +3018,8 @@ CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
     "common_runtime/rendezvous_mgr.h",
     "common_runtime/rendezvous_util.h",
     "common_runtime/ring_reducer.h",
-    "common_runtime/scoped_allocator.h",
-    "common_runtime/scoped_allocator_mgr.h",
+    "common_runtime/ring_alg.h",
+    "common_runtime/ring_gatherer.h",
     "common_runtime/session_factory.h",
     "common_runtime/single_threaded_cpu_device.h",
     "common_runtime/stats_publisher_interface.h",
@@ -2928,6 +3044,8 @@ tf_cuda_library(
         "common_runtime/collective_param_resolver_local.cc",
         "common_runtime/collective_rma_local.cc",
         "common_runtime/collective_util.cc",
+        "common_runtime/colocation_graph.cc",
+        "common_runtime/colocation_graph.h",
         "common_runtime/constant_folding.cc",
         "common_runtime/copy_tensor.cc",
         "common_runtime/costmodel_manager.cc",
@@ -2948,9 +3066,11 @@ tf_cuda_library(
         "common_runtime/lower_if_while.cc",
         "common_runtime/lower_while_op.cc",
         "common_runtime/memory_types.cc",
+        "common_runtime/metrics.cc",
         "common_runtime/mkl_cpu_allocator.cc",
         "common_runtime/optimization_registry.cc",
         "common_runtime/parallel_concat_optimizer.cc",
+        "common_runtime/partitioning_utils.cc",
         "common_runtime/placer.cc",
         "common_runtime/pool_allocator.cc",
         "common_runtime/process_function_library_runtime.cc",
@@ -2959,9 +3079,9 @@ tf_cuda_library(
         "common_runtime/renamed_device.cc",
         "common_runtime/rendezvous_mgr.cc",
         "common_runtime/rendezvous_util.cc",
+        "common_runtime/ring_alg.cc",
+        "common_runtime/ring_gatherer.cc",
         "common_runtime/ring_reducer.cc",
-        "common_runtime/scoped_allocator.cc",
-        "common_runtime/scoped_allocator_mgr.cc",
         "common_runtime/session.cc",
         "common_runtime/session_factory.cc",
         "common_runtime/session_options.cc",
@@ -2989,8 +3109,9 @@ tf_cuda_library(
         ":proto_text",
         ":protos_all_cc",
         "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
         "//third_party/eigen3",
-        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler/utils:functions",
     ] + mkl_deps(),
     alwayslink = 1,
 )
@@ -3048,15 +3169,6 @@ cc_library(
     deps = [":lib_internal"],
 )
 
-tf_cuda_library(
-    name = "metrics",
-    srcs = ["common_runtime/metrics.cc"],
-    hdrs = ["common_runtime/metrics.h"],
-    deps = [
-        ":lib",
-    ],
-)
-
 tf_cuda_library(
     name = "direct_session_internal",
     srcs = ["common_runtime/direct_session.cc"],
@@ -3073,7 +3185,6 @@ tf_cuda_library(
         ":graph",
         ":lib",
         ":lib_internal",
-        ":metrics",
         ":proto_text",
         ":protos_all_cc",
         "//tensorflow/core/debug:debug_graph_utils",
@@ -3440,6 +3551,7 @@ tf_cc_tests(
         "platform/vmodule_benchmark_test.cc",
     ],
     deps = [
+        ":core_cpu_internal",
         ":lib",
         ":lib_internal",
         ":lib_test_internal",
@@ -3505,6 +3617,29 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "platform_fake_python_env_test",
+    size = "small",
+    srcs = ["platform/fake_python_env_test.cc"],
+    args = [
+        "/some/path/to/pythontest.runfiles/org_tensorflow/stuff/to/run.py",
+    ],
+    tags = [
+        "local",
+        "no_windows",
+        "nogpu",
+        "nomac",
+        "notap",
+    ],
+    deps = [
+        ":lib",
+        ":lib_internal",
+        ":lib_test_internal",
+        ":test",
+        ":test_main",
+    ],
+)
+
 tf_cc_test(
     name = "platform_abi_test",
     size = "small",
@@ -3626,6 +3761,20 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "lib_strings_proto_serialization_test",
+    srcs = ["lib/strings/proto_serialization_test.cc"],
+    deps = [
+        ":lib",
+        ":lib_internal",
+        ":lib_test_internal",
+        ":protos_all_cc",
+        ":test",
+        ":test_main",
+        "@com_google_absl//absl/memory",
+    ],
+)
+
 tf_cc_test(
     name = "lib_random_weighted_picker_test",
     size = "medium",
@@ -3678,7 +3827,6 @@ tf_cc_tests(
     srcs = [
         "common_runtime/buf_rendezvous_test.cc",
         "common_runtime/collective_executor_mgr_test.cc",
-        "common_runtime/collective_param_resolver_local_test.cc",
         "common_runtime/collective_rma_local_test.cc",
         "common_runtime/device_resolver_local_test.cc",
         "common_runtime/device_set_test.cc",
@@ -3794,6 +3942,7 @@ tf_cc_tests(
     name = "higher_level_tests_needing_kernels",
     size = "small",
     srcs = [
+        "common_runtime/collective_param_resolver_local_test.cc",
         "graph/graph_constructor_test.cc",
     ],
     linkopts = select({
@@ -3833,7 +3982,6 @@ tf_cc_test(
         "ops/cudnn_rnn_ops_test.cc",
     ],
     deps = [
-        ":cudnn_rnn_ops",
         "//tensorflow/core",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -3843,6 +3991,27 @@ tf_cc_test(
     ],
 )
 
+tf_cc_tests(
+    name = "collective_order_test",
+    size = "small",
+    srcs = [
+        "graph/collective_order_test.cc",
+    ],
+    deps = [
+        ":core",
+        ":core_cpu",
+        ":core_cpu_internal",
+        ":framework",
+        ":framework_internal",
+        ":lib",
+        ":lib_internal",
+        ":ops",
+        ":protos_all_cc",
+        ":test",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
 tf_cc_tests_gpu(
     name = "ring_reducer_test",
     size = "medium",
@@ -3872,6 +4041,35 @@ tf_cc_tests_gpu(
     ],
 )
 
+tf_cc_tests_gpu(
+    name = "ring_gatherer_test",
+    size = "medium",
+    srcs = [
+        "common_runtime/ring_gatherer_test.cc",
+    ],
+    linkstatic = tf_kernel_tests_linkstatic(),
+    tags = tf_cuda_tests_tags(),
+    deps = [
+        ":all_kernels",
+        ":core",
+        ":core_cpu",
+        ":core_cpu_internal",
+        ":direct_session_internal",
+        ":framework",
+        ":framework_internal",
+        ":gpu_runtime",
+        ":lib",
+        ":lib_internal",
+        ":ops",
+        ":protos_all_cc",
+        ":protos_test_cc",
+        ":test",
+        ":test_main",
+        ":testlib",
+        "@com_google_absl//absl/memory",
+    ],
+)
+
 tf_cc_tests_gpu(
     name = "hierarchical_tree_broadcaster_test",
     size = "medium",
@@ -4059,20 +4257,6 @@ tf_cuda_cc_test(
     ],
 )
 
-tf_cc_test_gpu(
-    name = "cuda_libdevice_path_test",
-    size = "small",
-    srcs = ["platform/cuda_libdevice_path_test.cc"],
-    linkstatic = tf_kernel_tests_linkstatic(),
-    tags = tf_cuda_tests_tags(),
-    deps = [
-        ":cuda_libdevice_path",
-        ":lib",
-        ":test",
-        ":test_main",
-    ],
-)
-
 tf_cuda_only_cc_test(
     name = "util_cuda_kernel_helper_test",
     srcs = [
@@ -4206,7 +4390,7 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+tf_cuda_cc_test(
     name = "common_runtime_process_function_library_runtime_test",
     size = "small",
     srcs = ["common_runtime/process_function_library_runtime_test.cc"],
@@ -4215,6 +4399,7 @@ tf_cc_test(
         ":core_cpu",
         ":core_cpu_internal",
         ":framework",
+        ":framework_internal",
         ":lib",
         ":test",
         ":test_main",
@@ -4223,6 +4408,7 @@ tf_cc_test(
         "//tensorflow/core/kernels:cast_op",
         "//tensorflow/core/kernels:cwise_op",
         "//tensorflow/core/kernels:function_ops",
+        "//tensorflow/core/kernels:resource_variable_ops",
     ],
 )
 
@@ -4264,6 +4450,27 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "common_runtime_partitioning_utils_test",
+    size = "small",
+    srcs = ["common_runtime/partitioning_utils_test.cc"],
+    deps = [
+        ":core_cpu",
+        ":core_cpu_internal",
+        ":framework",
+        ":lib",
+        ":ops",
+        ":test",
+        ":test_main",
+        ":testlib",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:cc_ops_internal",
+        "//tensorflow/cc:function_ops",
+        "//tensorflow/core/kernels:function_ops",
+        "//tensorflow/core/kernels:identity_op",
+    ],
+)
+
 tf_cuda_cc_test(
     name = "common_runtime_direct_session_test",
     size = "small",
@@ -4401,7 +4608,7 @@ tf_cc_test(
         "//tensorflow/cc:scope",
         "//tensorflow/core/kernels:cwise_op",
         "//third_party/eigen3",
-    ],
+    ] + if_mkl([":mkl_array_ops_op_lib"]),
 )
 
 tf_cc_test(
@@ -4928,7 +5135,7 @@ filegroup(
 
 cc_library(
     name = "cuda_libdevice_path",
-    srcs = ["platform/cuda_libdevice_path.cc"] + tf_additional_libdevice_srcs(),
+    srcs = tf_additional_libdevice_srcs(),
     hdrs = ["platform/cuda_libdevice_path.h"],
     copts = tf_copts(),
     data = tf_additional_libdevice_data(),
@@ -4954,6 +5161,39 @@ transitive_hdrs(
 # -----------------------------------------------------------------------------
 # Google-internal targets go here (must be at the end).
 
+load("//tensorflow:tensorflow.bzl", "tf_portable_proto_library")
+
+genrule(
+    name = "emscripten_proto_config_lite_runtime",
+    outs = ["emscripten_proto_config_lite_runtime.asciipb"],
+    cmd = tf_genrule_cmd_append_to_srcs("optimize_mode:LITE_RUNTIME"),
+    visibility = ["//visibility:private"],
+)
+
+# We are keeping the "android" version of tf_android_core_proto_headers. All it does is
+# normalize CORE_PROTO_SRCS to generate valid output file names.
+tf_portable_proto_library(
+    name = "emscripten_proto_lib_no_rtti_lite_runtime",
+    config = ":emscripten_proto_config_lite_runtime",
+    copts = tf_opts_nortti_if_emscripten(),
+    features = tf_features_nomodules_if_emscripten(),
+    header_outs = tf_android_core_proto_headers(CORE_PROTO_SRCS) + ["//google/protobuf/any.proto.h"],
+    link_full_protobuf = False,
+    prefix_dir = "emscripten_proto_no_rtti",
+    proto_deps = [
+        ":protos_all_cc",
+        "@protobuf_archive//:protobuf",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+# There is currently no need for a full proto version of emscripten tf lib lite.
+alias(
+    name = "emscripten_lib_lite_no_runtime",
+    actual = "//tensorflow/core:emscripten_tensorflow_lib_lite_nortti_lite_protos_no_runtime",
+    visibility = ["//visibility:public"],
+)
+
 alias(
     name = "android_srcs_no_runtime",
     actual = ":mobile_srcs_no_runtime",
diff --git a/tensorflow/core/api_def/base_api/api_def_AllToAll.pbtxt b/tensorflow/core/api_def/base_api/api_def_AllToAll.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d6f28bd022bcd843aa3a7aeb8b1b257a3b3ddfd3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_AllToAll.pbtxt
@@ -0,0 +1,67 @@
+op {
+  graph_op_name: "AllToAll"
+  in_arg {
+    name: "input"
+    description: <<END
+The local input to the sum.
+END
+  }
+  in_arg {
+    name: "group_assignment"
+    description: <<END
+An int32 tensor with shape
+[num_groups, num_replicas_per_group]. `group_assignment[i]` represents the
+replica ids in the ith subgroup.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The exchanged result.
+END
+  }
+  attr {
+    name: "T"
+    description: <<END
+The type of elements to be exchanged.
+END
+  }
+  attr {
+    name: "concat_dimension"
+    description: <<END
+The dimension number to concatenate.
+END
+  }
+  attr {
+    name: "split_dimension"
+    description: <<END
+The dimension number to split.
+END
+  }
+  attr {
+    name: "split_count"
+    description: <<END
+The number of splits, this number must equal to the sub-group
+size(group_assignment.get_shape()[1])
+END
+  }
+  summary: "An Op to exchange data across TPU replicas."
+  description: <<END
+On each replica, the input is split into `split_count` blocks along
+`split_dimension` and send to the other replicas given group_assignment. After
+receiving `split_count` - 1 blocks from other replicas, we concatenate the
+blocks along `concat_dimension` as the output.
+
+For example, suppose there are 2 TPU replicas:
+replica 0 receives input: `[[A, B]]`
+replica 1 receives input: `[[C, D]]`
+
+group_assignment=`[[0, 1]]`
+concat_dimension=0
+split_dimension=1
+split_count=2
+
+replica 0's output: `[[A], [C]]`
+replica 1's output: `[[B], [D]]`
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Case.pbtxt b/tensorflow/core/api_def/base_api/api_def_Case.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..56fef3ae6d0d452cb2caa57c36f35a04584864ee
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Case.pbtxt
@@ -0,0 +1,43 @@
+op {
+  graph_op_name: "Case"
+  in_arg {
+    name: "branch_index"
+    description: "The branch selector, an int32 Tensor."
+  }
+  in_arg {
+    name: "input"
+    description: "A list of input tensors passed to the branch function."
+  }
+  out_arg {
+    name: "output"
+    description: "A list of return values."
+  }
+  attr { name: "Tin"  description: "A list of input types." }
+  attr { name: "Tout"  description: "A list of output types." }
+  attr {
+    name: "branches"
+    description: <<END
+      A list of functions each of which takes 'inputs' and returns a list of
+      tensors, whose types are the same as what every other branch returns.
+END
+  }
+  summary: "An n-way switch statement which calls a single branch function."
+  description: <<END
+    An n-way switch statement, implementing the following:
+    ```
+    switch (branch_index) {
+      case 0:
+        output = branches[0](input);
+        break;
+      case 1:
+        output = branches[1](input);
+        break;
+      ...
+      case [[nbranches-1]]:
+      default:
+        output = branches[nbranches-1](input);
+        break;
+    }
+    ```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CollectiveGather.pbtxt b/tensorflow/core/api_def/base_api/api_def_CollectiveGather.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3cd833b9455458511787bec71d45531810574eb9
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CollectiveGather.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "CollectiveGather"
+  summary: "Mutually accumulates multiple tensors of identical type and shape."
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CollectivePermute.pbtxt b/tensorflow/core/api_def/base_api/api_def_CollectivePermute.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..aec724a115ed974b39a0ecd00985f97a8e7c2a97
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CollectivePermute.pbtxt
@@ -0,0 +1,36 @@
+op {
+  graph_op_name: "CollectivePermute"
+  in_arg {
+    name: "input"
+    description: <<END
+The local input to be permuted. Currently only supports float and
+bfloat16.
+END
+  }
+  in_arg {
+    name: "source_target_pairs"
+    description: <<END
+A tensor with shape [num_pairs, 2].
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The permuted input.
+END
+  }
+  attr {
+    name: "T"
+    description: <<END
+The type of elements to be exchanged.
+END
+  }
+  summary: "An Op to permute tensors across replicated TPU instances."
+  description: <<END
+Each instance supplies its own input.
+
+For example, suppose there are 4 TPU instances: `[A, B, C, D]`. Passing
+source_target_pairs=`[[0,1],[1,2],[2,3],[3,0]]` gets the outputs:
+`[D, A, B, C]`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CombinedNonMaxSuppression.pbtxt b/tensorflow/core/api_def/base_api/api_def_CombinedNonMaxSuppression.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..50e72a2446a9b4c304e23566fd1b3bbb974fb865
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CombinedNonMaxSuppression.pbtxt
@@ -0,0 +1,101 @@
+op {
+  graph_op_name: "CombinedNonMaxSuppression"
+  in_arg {
+    name: "boxes"
+    description: <<END
+A 4-D float tensor of shape `[batch_size, num_boxes, q, 4]`. If `q` is 1 then 
+same boxes are used for all classes otherwise, if `q` is equal to number of 
+classes, class-specific boxes are used.
+END
+  }
+  in_arg {
+    name: "scores"
+    description: <<END
+A 3-D float tensor of shape `[batch_size, num_boxes, num_classes]`
+representing a single score corresponding to each box (each row of boxes).
+END
+  }
+  in_arg {
+    name: "max_output_size_per_class"
+    description: <<END
+A scalar integer tensor representing the maximum number of 
+boxes to be selected by non max suppression per class
+END
+  }
+  in_arg {
+    name: "max_total_size"
+    description: <<END
+A scalar representing maximum number of boxes retained over all classes.
+END
+  }
+  in_arg {
+    name: "iou_threshold"
+    description: <<END
+A 0-D float tensor representing the threshold for deciding whether
+boxes overlap too much with respect to IOU.
+END
+  }
+  in_arg {
+    name: "score_threshold"
+    description: <<END
+A 0-D float tensor representing the threshold for deciding when to remove
+boxes based on score.
+END
+  }
+  attr {
+    name: "pad_per_class"
+    description: <<END
+If false, the output nmsed boxes, scores and classes
+are padded/clipped to `max_total_size`. If true, the
+output nmsed boxes, scores and classes are padded to be of length
+`max_size_per_class`*`num_classes`, unless it exceeds `max_total_size` in
+which case it is clipped to `max_total_size`. Defaults to false.
+END
+  }
+  out_arg {
+    name: "nmsed_boxes"
+    description: <<END
+A [batch_size, max_detections, 4] float32 tensor 
+containing the non-max suppressed boxes.
+END
+  }
+  out_arg {
+    name: "nmsed_scores"
+    description: <<END
+A [batch_size, max_detections] float32 tensor 
+containing the scores for the boxes.
+END
+  }
+  out_arg {
+    name: "nmsed_classes"
+    description: <<END
+A [batch_size, max_detections] float32 tensor 
+containing the classes for the boxes.
+END
+  }
+  out_arg {
+    name: "valid_detections"
+    description: <<END
+A [batch_size] int32 tensor indicating the number of
+valid detections per batch item. Only the top num_detections[i] entries in
+nms_boxes[i], nms_scores[i] and nms_class[i] are valid. The rest of the
+entries are zero paddings.
+END
+  }
+  summary: "Greedily selects a subset of bounding boxes in descending order of score,"
+  description: <<END
+This operation performs non_max_suppression on the inputs per batch, across
+all classes.
+Prunes away boxes that have high intersection-over-union (IOU) overlap
+with previously selected boxes.  Bounding boxes are supplied as
+[y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+diagonal pair of box corners and the coordinates can be provided as normalized
+(i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+is agnostic to where the origin is in the coordinate system. Also note that
+this algorithm is invariant to orthogonal transformations and translations
+of the coordinate system; thus translating or reflections of the coordinate
+system result in the same boxes being selected by the algorithm.
+The output of this operation is the final boxes, scores and classes tensor
+returned after performing non_max_suppression.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ConfigureDistributedTPU.pbtxt b/tensorflow/core/api_def/base_api/api_def_ConfigureDistributedTPU.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a710f605b74773e255f790954d09653b45983c03
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ConfigureDistributedTPU.pbtxt
@@ -0,0 +1,30 @@
+op {
+  graph_op_name: "ConfigureDistributedTPU"
+  out_arg {
+    name: "topology"
+    description: <<END
+A serialized tensorflow.tpu.TopologyProto that describes the TPU
+topology.
+END
+  }
+  attr {
+    name: "embedding_config"
+    description: <<END
+Reserved. Do not use.
+END
+  }
+  attr {
+    name: "tpu_embedding_config"
+    description: <<END
+Serialized tensorflow.tpu.TPUEmbeddingConfiguration that
+describes the embedding lookups of the program.
+END
+  }
+  attr {
+    name: "is_global_init"
+    description: <<END
+Reserved. Do not use.
+END
+  }
+  summary: "Sets up the centralized structures for a distributed TPU system."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Conv2D.pbtxt b/tensorflow/core/api_def/base_api/api_def_Conv2D.pbtxt
index 070d6adb978e4a62e7209f299dba08515aa21e83..d0794de4ba4a174838547865e4f1692cff503052 100644
--- a/tensorflow/core/api_def/base_api/api_def_Conv2D.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_Conv2D.pbtxt
@@ -33,6 +33,15 @@ END
     name: "padding"
     description: <<END
 The type of padding algorithm to use.
+END
+  }
+  attr {
+    name: "explicit_paddings"
+    description: <<END
+If `padding` is `"EXPLICIT"`, the list of explicit padding amounts. For the ith
+dimension, the amount of padding inserted before and after the dimension is
+`explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If
+`padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty.
 END
   }
   attr {
diff --git a/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropFilter.pbtxt b/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropFilter.pbtxt
index ff2d9d71db646a27a88763f79bb6beb6b5ede44b..c8af9ff976688a0db78d26a495543cc3c052944a 100644
--- a/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropFilter.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropFilter.pbtxt
@@ -41,6 +41,15 @@ END
     name: "padding"
     description: <<END
 The type of padding algorithm to use.
+END
+  }
+  attr {
+    name: "explicit_paddings"
+    description: <<END
+If `padding` is `"EXPLICIT"`, the list of explicit padding amounts. For the ith
+dimension, the amount of padding inserted before and after the dimension is
+`explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If
+`padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty.
 END
   }
   attr {
diff --git a/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropInput.pbtxt b/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropInput.pbtxt
index 2de38b4263a380b5d0aec45270b9b67347c7021d..8aaae4aab6fd006931ce9f3ef1633a2c1e7c613b 100644
--- a/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropInput.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_Conv2DBackpropInput.pbtxt
@@ -40,6 +40,15 @@ END
     name: "padding"
     description: <<END
 The type of padding algorithm to use.
+END
+  }
+  attr {
+    name: "explicit_paddings"
+    description: <<END
+If `padding` is `"EXPLICIT"`, the list of explicit padding amounts. For the ith
+dimension, the amount of padding inserted before and after the dimension is
+`explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If
+`padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty.
 END
   }
   attr {
diff --git a/tensorflow/core/api_def/base_api/api_def_CrossReplicaSum.pbtxt b/tensorflow/core/api_def/base_api/api_def_CrossReplicaSum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fd4c3437775beb0cd3f31842f3046b15ee32f9f2
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CrossReplicaSum.pbtxt
@@ -0,0 +1,38 @@
+op {
+  graph_op_name: "CrossReplicaSum"
+  in_arg {
+    name: "input"
+    description: <<END
+The local input to the sum.
+END
+  }
+  in_arg {
+    name: "group_assignment"
+    description: <<END
+An int32 tensor with shape
+[num_groups, num_replicas_per_group]. `group_assignment[i]` represents the
+replica ids in the ith subgroup.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The sum of all the distributed inputs.
+END
+  }
+  attr {
+    name: "T"
+    description: <<END
+The type of elements to be summed.
+END
+  }
+  summary: "An Op to sum inputs across replicated TPU instances."
+  description: <<END
+Each instance supplies its own input.
+
+For example, suppose there are 8 TPU instances: `[A, B, C, D, E, F, G, H]`.
+Passing group_assignment=`[[0,2,4,6],[1,3,5,7]]` sets `A, C, E, G` as group 0,
+and `B, D, F, H` as group 1. Thus we get the outputs:
+`[A+C+E+G, B+D+F+H, A+C+E+G, B+D+F+H, A+C+E+G, B+D+F+H, A+C+E+G, B+D+F+H]`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CudnnRNNBackpropV3.pbtxt b/tensorflow/core/api_def/base_api/api_def_CudnnRNNBackpropV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7967ca7c5d17abd6451f0cd05c8154c3eaf4766b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CudnnRNNBackpropV3.pbtxt
@@ -0,0 +1,49 @@
+op {
+  graph_op_name: "CudnnRNNBackpropV3"
+  visibility: HIDDEN
+  summary: "Backprop step of CudnnRNNV3."
+  description: <<END
+Compute the backprop of both data and weights in a RNN. Takes an extra
+    "sequence_lengths" input than CudnnRNNBackprop.
+
+rnn_mode: Indicates the type of the RNN model.
+input_mode: Indicates whether there is a linear projection between the input and
+    the actual computation before the first layer. 'skip_input' is only allowed
+    when input_size == num_units; 'auto_select' implies 'skip_input' when
+    input_size == num_units; otherwise, it implies 'linear_input'.
+direction: Indicates whether a bidirectional model will be used. Should be
+  "unidirectional" or "bidirectional".
+dropout: Dropout probability. When set to 0., dropout is disabled.
+seed: The 1st part of a seed to initialize dropout.
+seed2: The 2nd part of a seed to initialize dropout.
+input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
+input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
+    num_units].
+input_c: For LSTM, a 3-D tensor with the shape of
+    [num_layer * dir, batch, num_units]. For other models, it is ignored.
+params: A 1-D tensor that contains the weights and biases in an opaque layout.
+    The size must be created through CudnnRNNParamsSize, and initialized
+    separately. Note that they might not be compatible across different
+    generations. So it is a good idea to save and restore
+sequence_lengths: a vector of lengths of each input sequence.
+output: A 3-D tensor with the shape of [seq_length, batch_size,
+    dir * num_units].
+output_h: The same shape has input_h.
+output_c: The same shape as input_c for LSTM. An empty tensor for other models.
+output_backprop: A 3-D tensor with the same shape as output in the forward pass.
+output_h_backprop: A 3-D tensor with the same shape as output_h in the forward
+    pass.
+output_c_backprop: A 3-D tensor with the same shape as output_c in the forward
+    pass.
+reserve_space: The same reserve_space produced in the forward operation.
+input_backprop: The backprop to input in the forward pass. Has the same shape
+    as input.
+input_h_backprop: The backprop to input_h in the forward pass. Has the same
+    shape as input_h.
+input_c_backprop: The backprop to input_c in the forward pass. Has the same
+    shape as input_c.
+params_backprop: The backprop to the params buffer in the forward pass. Has the
+    same shape as params.
+END
+}
+
diff --git a/tensorflow/core/api_def/base_api/api_def_CudnnRNNV3.pbtxt b/tensorflow/core/api_def/base_api/api_def_CudnnRNNV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9cde53684d0350510b18c35e991a9f526c5bb212
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CudnnRNNV3.pbtxt
@@ -0,0 +1,39 @@
+op {
+  graph_op_name: "CudnnRNNV3"
+  visibility: HIDDEN
+  summary: "A RNN backed by cuDNN."
+  description: <<END
+Computes the RNN from the input and initial states, with respect to the params
+buffer. Accepts one extra input "sequence_lengths" than CudnnRNN.
+
+rnn_mode: Indicates the type of the RNN model.
+input_mode: Indicates whether there is a linear projection between the input and
+  the actual computation before the first layer. 'skip_input' is only allowed
+  when input_size == num_units; 'auto_select' implies 'skip_input' when
+  input_size == num_units; otherwise, it implies 'linear_input'.
+direction: Indicates whether a bidirectional model will be used. Should be
+  "unidirectional" or "bidirectional".
+dropout: Dropout probability. When set to 0., dropout is disabled.
+seed: The 1st part of a seed to initialize dropout.
+seed2: The 2nd part of a seed to initialize dropout.
+input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
+input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
+    num_units].
+input_c: For LSTM, a 3-D tensor with the shape of
+    [num_layer * dir, batch, num_units]. For other models, it is ignored.
+params: A 1-D tensor that contains the weights and biases in an opaque layout.
+    The size must be created through CudnnRNNParamsSize, and initialized
+    separately. Note that they might not be compatible across different
+    generations. So it is a good idea to save and restore
+sequence_lengths: a vector of lengths of each input sequence.
+output: A 3-D tensor with the shape of [seq_length, batch_size,
+    dir * num_units].
+output_h: The same shape has input_h.
+output_c: The same shape as input_c for LSTM. An empty tensor for other models.
+is_training: Indicates whether this operation is used for inferenece or
+  training.
+reserve_space: An opaque tensor that can be used in backprop calculation. It
+  is only produced if is_training is true.
+END
+}
+
diff --git a/tensorflow/core/api_def/base_api/api_def_DecodeProtoV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_DecodeProtoV2.pbtxt
index c8152f53c4ded035140abd24ba006bf391641cf1..22c3524360c196bfdeda1221842c3da7af7701ef 100644
--- a/tensorflow/core/api_def/base_api/api_def_DecodeProtoV2.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_DecodeProtoV2.pbtxt
@@ -31,7 +31,8 @@ END
   attr {
     name: "field_names"
     description: <<END
-List of strings containing proto field names.
+List of strings containing proto field names. An extension field can be decoded
+by using its full name, e.g. EXT_PACKAGE.EXT_FIELD_NAME.
 END
   }
   attr {
diff --git a/tensorflow/core/api_def/base_api/api_def_EnqueueTPUEmbeddingIntegerBatch.pbtxt b/tensorflow/core/api_def/base_api/api_def_EnqueueTPUEmbeddingIntegerBatch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0317c4ea1d9345b9522fbbf978c15bc09ba534d0
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_EnqueueTPUEmbeddingIntegerBatch.pbtxt
@@ -0,0 +1,27 @@
+op {
+  graph_op_name: "EnqueueTPUEmbeddingIntegerBatch"
+  in_arg {
+    name: "batch"
+    description: <<END
+A list of 1D tensors, one for each embedding table, containing the
+indices into the tables.
+END
+  }
+  in_arg {
+    name: "mode_override"
+    description: <<END
+A string input that overrides the mode specified in the
+TPUEmbeddingConfiguration. Supported values are {'unspecified', 'inference',
+'training', 'backward_pass_only'}. When set to 'unspecified', the mode set
+in TPUEmbeddingConfiguration is used, otherwise mode_override is used.
+END
+  }
+  attr {
+    name: "device_ordinal"
+    description: <<END
+The TPU device to use. Should be >= 0 and less than the number
+of TPU cores in the task on which the node is placed.
+END
+  }
+  summary: "An op that enqueues a list of input batch tensors to TPUEmbedding."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_EnqueueTPUEmbeddingSparseBatch.pbtxt b/tensorflow/core/api_def/base_api/api_def_EnqueueTPUEmbeddingSparseBatch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bb476ce3fcf2df6f1d8d7ce2f1085ff885343535
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_EnqueueTPUEmbeddingSparseBatch.pbtxt
@@ -0,0 +1,65 @@
+op {
+  graph_op_name: "EnqueueTPUEmbeddingSparseBatch"
+  in_arg {
+    name: "sample_indices"
+    description: <<END
+A list of rank 1 Tensors specifying the training example and
+feature to which the corresponding embedding_indices and aggregation_weights
+values belong. sample_indices[i] must equal b * nf + f, where nf is the
+number of features from the corresponding table, f is in [0, nf), and
+b is in [0, batch size).
+END
+  }
+  in_arg {
+    name: "embedding_indices"
+    description: <<END
+A list of rank 1 Tensors, indices into the embedding tables.
+END
+  }
+  in_arg {
+    name: "aggregation_weights"
+    description: <<END
+A list of rank 1 Tensors containing per sample -- i.e. per
+(training example, feature) -- aggregation weights.
+END
+  }
+  in_arg {
+    name: "mode_override"
+    description: <<END
+A string input that overrides the mode specified in the
+TPUEmbeddingConfiguration. Supported values are {'unspecified', 'inference',
+'training', 'backward_pass_only'}. When set to 'unspecified', the mode set
+in TPUEmbeddingConfiguration is used, otherwise mode_override is used.
+END
+  }
+  attr {
+    name: "device_ordinal"
+    description: <<END
+The TPU device to use. Should be >= 0 and less than the number
+of TPU cores in the task on which the node is placed.
+END
+  }
+  attr {
+    name: "combiners"
+    description: <<END
+A list of string scalars, one for each embedding table that specify
+how to normalize the embedding activations after weighted summation.
+Supported combiners are 'mean', 'sum', or 'sqrtn'. It is invalid to have
+the sum of the weights be 0 for 'mean' or the sum of the squared weights be
+0 for 'sqrtn'. If combiners isn't passed, the default is to use 'sum' for
+all tables.
+END
+  }
+  summary: "An op that enqueues TPUEmbedding input indices from a SparseTensor."
+  description: <<END
+This Op eases the porting of code that uses embedding_lookup_sparse(),
+although some Python preprocessing of the SparseTensor arguments to
+embedding_lookup_sparse() is required to produce the arguments to this Op,
+since only a single EnqueueTPUEmbeddingSparseBatch Op is allowed per training
+step.
+
+The tensors at corresponding positions in the three input lists
+must have the same shape, i.e. rank 1 with dim_size() equal to the total
+number of lookups into the table described by the corresponding table_id.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_EnqueueTPUEmbeddingSparseTensorBatch.pbtxt b/tensorflow/core/api_def/base_api/api_def_EnqueueTPUEmbeddingSparseTensorBatch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8cef870a6fe9e37492997baf86e8702412b82802
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_EnqueueTPUEmbeddingSparseTensorBatch.pbtxt
@@ -0,0 +1,74 @@
+op {
+  graph_op_name: "EnqueueTPUEmbeddingSparseTensorBatch"
+  in_arg {
+    name: "sample_indices"
+    description: <<END
+A list of rank 1 Tensors specifying the training example to
+which the corresponding embedding_indices and aggregation_weights values
+belong. It corresponds to sp_ids.indices[:,0] in  embedding_lookup_sparse().
+END
+  }
+  in_arg {
+    name: "embedding_indices"
+    description: <<END
+A list of rank 1 Tensors, indices into the embedding tables.
+It corresponds to sp_ids.values in embedding_lookup_sparse().
+END
+  }
+  in_arg {
+    name: "aggregation_weights"
+    description: <<END
+A list of rank 1 Tensors containing per training example
+aggregation weights. It corresponds to sp_weights.values in
+embedding_lookup_sparse().
+END
+  }
+  in_arg {
+    name: "mode_override"
+    description: <<END
+A string input that overrides the mode specified in the
+TPUEmbeddingConfiguration. Supported values are {'unspecified', 'inference',
+'training', 'backward_pass_only'}. When set to 'unspecified', the mode set
+in TPUEmbeddingConfiguration is used, otherwise mode_override is used.
+END
+  }
+  attr {
+    name: "device_ordinal"
+    description: <<END
+The TPU device to use. Should be >= 0 and less than the number
+of TPU cores in the task on which the node is placed.
+END
+  }
+  attr {
+    name: "combiners"
+    description: <<END
+A list of string scalars, one for each embedding table that specify
+how to normalize the embedding activations after weighted summation.
+Supported combiners are 'mean', 'sum', or 'sqrtn'. It is invalid to have
+the sum of the weights be 0 for 'mean' or the sum of the squared weights be
+0 for 'sqrtn'. If combiners isn't passed, the default is to use 'sum' for
+all tables.
+END
+  }
+  attr {
+    name: "table_ids"
+    description: <<END
+A list of integers specifying the identifier of the embedding table
+(offset of TableDescriptor in the TPUEmbeddingConfiguration) to lookup the
+corresponding input. The ith input is looked up using table_ids[i]. The size
+of the table_ids list must be equal to that of sample_indices,
+embedding_indices and aggregation_weights.
+END
+  }
+  summary: "Eases the porting of code that uses tf.nn.embedding_lookup_sparse()."
+  description: <<END
+sample_indices[i], embedding_indices[i] and aggregation_weights[i] correspond
+to the ith feature. table_ids[i] indicates which embedding table to look up ith
+feature.
+
+The tensors at corresponding positions in the three input lists (sample_indices,
+embedding_indices and aggregation_weights) must have the same shape, i.e. rank 1
+with dim_size() equal to the total number of lookups into the table described by
+the corresponding feature.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_EuclideanNorm.pbtxt b/tensorflow/core/api_def/base_api/api_def_EuclideanNorm.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7d815b856bfb3c97b9347aa49b22a3b2f00908b4
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_EuclideanNorm.pbtxt
@@ -0,0 +1,39 @@
+op {
+  graph_op_name: "EuclideanNorm"
+  endpoint {
+    name: "EuclideanNorm"
+  }
+  in_arg {
+    name: "input"
+    description: <<END
+The tensor to reduce.
+END
+  }
+  in_arg {
+    name: "reduction_indices"
+    rename_to: "axis"
+    description: <<END
+The dimensions to reduce. Must be in the range
+`[-rank(input), rank(input))`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The reduced tensor.
+END
+  }
+  attr {
+    name: "keep_dims"
+    description: <<END
+If true, retain reduced dimensions with length 1.
+END
+  }
+  summary: "Computes the euclidean norm of elements across dimensions of a tensor."
+  description: <<END
+Reduces `input` along the dimensions given in `reduction_indices`. Unless
+`keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+`reduction_indices`. If `keep_dims` is true, the reduced dimensions are
+retained with length 1.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalChooseFastestDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalChooseFastestDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7aa7a59bb67ade421ec12a9ec45326106d57ffc0
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalChooseFastestDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ExperimentalChooseFastestDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalRebatchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalRebatchDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b8455308e5c8ea178680ecdc6d443054f198ede6
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalRebatchDataset.pbtxt
@@ -0,0 +1,23 @@
+op {
+  graph_op_name: "ExperimentalRebatchDataset"
+  visibility: HIDDEN
+  in_arg {
+    name: "input_dataset"
+    description: <<END
+A variant tensor representing the input dataset.
+END
+  }
+  in_arg {
+  name: "num_workers"
+  description: <<END
+A scalar representing the number of workers to distribute this batch across. As
+a result of this transformation the current batch size would end up being
+divided  by this parameter.
+END
+  }
+  summary: "Creates a dataset that changes the batch size."
+  description: <<END
+Creates a dataset that changes the batch size of the dataset to current batch
+size // num_workers.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalTakeWhileDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalTakeWhileDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..699e0c2e39a78265a7cd5a149193d6454d7ef78a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalTakeWhileDataset.pbtxt
@@ -0,0 +1,25 @@
+op {
+  graph_op_name: "ExperimentalTakeWhileDataset"
+  visibility: HIDDEN
+  in_arg {
+    name: "other_arguments"
+    description: <<END
+A list of tensors, typically values that were captured when
+building a closure for `predicate`.
+END
+  }
+  attr {
+    name: "predicate"
+    description: <<END
+A function returning a scalar boolean.
+END
+  }
+  summary: "Creates a dataset that stops iteration when predicate` is false."
+  description: <<END
+The `predicate` function must return a scalar boolean and accept the
+following arguments:
+
+* One tensor for each component of an element of `input_dataset`.
+* One tensor for each value in `other_arguments`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExtractGlimpse.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExtractGlimpse.pbtxt
index c10a1bb778e1d8b45b59113d255d69c55a224643..4b951659a2b46a7bb50f038b156b78153d738c6d 100644
--- a/tensorflow/core/api_def/base_api/api_def_ExtractGlimpse.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ExtractGlimpse.pbtxt
@@ -48,6 +48,14 @@ END
     description: <<END
 indicates if the noise should be generated using a
 uniform distribution or a Gaussian distribution.
+END
+  }
+  attr {
+    name: "noise"
+    description: <<END
+indicates if the noise should `uniform`, `gaussian`, or
+`zero`. The default is `uniform` which means the the noise type
+will be decided by `uniform_noise`.
 END
   }
   summary: "Extracts a glimpse from the input tensor."
diff --git a/tensorflow/core/api_def/base_api/api_def_InfeedDequeue.pbtxt b/tensorflow/core/api_def/base_api/api_def_InfeedDequeue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..99ca55a25ff865ecd0c15f2087e15e6ee41ea6f7
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_InfeedDequeue.pbtxt
@@ -0,0 +1,22 @@
+op {
+  graph_op_name: "InfeedDequeue"
+  out_arg {
+    name: "output"
+    description: <<END
+A tensor that will be provided using the infeed mechanism.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The type of elements in the tensor.
+END
+  }
+  attr {
+    name: "shape"
+    description: <<END
+The shape of the tensor.
+END
+  }
+  summary: "A placeholder op for a value that will be fed into the computation."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_InfeedDequeueTuple.pbtxt b/tensorflow/core/api_def/base_api/api_def_InfeedDequeueTuple.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..61b6ded66ba2dc156526c040b44312ec1a33f603
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_InfeedDequeueTuple.pbtxt
@@ -0,0 +1,22 @@
+op {
+  graph_op_name: "InfeedDequeueTuple"
+  out_arg {
+    name: "outputs"
+    description: <<END
+A list of tensors that will be provided using the infeed mechanism.
+END
+  }
+  attr {
+    name: "dtypes"
+    description: <<END
+The element types of each element in `outputs`.
+END
+  }
+  attr {
+    name: "shapes"
+    description: <<END
+The shapes of each tensor in `outputs`.
+END
+  }
+  summary: "Fetches multiple values from infeed as an XLA tuple."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_InfeedEnqueue.pbtxt b/tensorflow/core/api_def/base_api/api_def_InfeedEnqueue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e08f4e6ff74f10dd7ffa6df18d0c5753e9498456
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_InfeedEnqueue.pbtxt
@@ -0,0 +1,38 @@
+op {
+  graph_op_name: "InfeedEnqueue"
+  in_arg {
+    name: "input"
+    description: <<END
+A tensor that will be provided using the infeed mechanism.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The type of elements in the tensor.
+END
+  }
+  attr {
+    name: "shape"
+    description: <<END
+The shape of the tensor.
+END
+  }
+  attr {
+    name: "layout"
+    description: <<END
+A vector holding the requested layout in minor-to-major sequence.
+If a layout attribute is passed, but its values are all -1, the layout will
+be computed by the infeed operation.
+END
+  }
+  attr {
+    name: "device_ordinal"
+    description: <<END
+The TPU device to use. This should be -1 when the Op
+is running on a TPU device, and >= 0 when the Op is running on the CPU
+device.
+END
+  }
+  summary: "An op which feeds a single Tensor value into the computation."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_InfeedEnqueueTuple.pbtxt b/tensorflow/core/api_def/base_api/api_def_InfeedEnqueueTuple.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f87d6d76d8bb92450d16d0935c0b533fdc02a94a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_InfeedEnqueueTuple.pbtxt
@@ -0,0 +1,39 @@
+op {
+  graph_op_name: "InfeedEnqueueTuple"
+  in_arg {
+    name: "inputs"
+    description: <<END
+A list of tensors that will be provided using the infeed mechanism.
+END
+  }
+  attr {
+    name: "dtypes"
+    description: <<END
+The element types of each element in `inputs`.
+END
+  }
+  attr {
+    name: "shapes"
+    description: <<END
+The shapes of each tensor in `inputs`.
+END
+  }
+  attr {
+    name: "layouts"
+    description: <<END
+A vector holding the requested layout in minor-to-major sequence for
+all the tuple shapes, in the order the shapes appear in the "shapes" input.
+The layout elements for a sub-shape can be set to -1, in which case the
+corresponding layout will be computed by the infeed operation.
+END
+  }
+  attr {
+    name: "device_ordinal"
+    description: <<END
+The TPU device to use. This should be -1 when the Op
+is running on a TPU device, and >= 0 when the Op is running on the CPU
+device.
+END
+  }
+  summary: "Feeds multiple Tensor values into the computation as an XLA tuple."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_KMC2ChainInitialization.pbtxt b/tensorflow/core/api_def/base_api/api_def_KMC2ChainInitialization.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c6ff4b9e2d70afcb6836921a498dba69a834baec
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_KMC2ChainInitialization.pbtxt
@@ -0,0 +1,30 @@
+op {
+  graph_op_name: "KMC2ChainInitialization"
+  visibility: HIDDEN
+  in_arg {
+    name: "distances"
+    description: <<END
+Vector with squared distances to the closest previously sampled cluster center
+for each candidate point.
+END
+  }
+  in_arg {
+    name: "seed"
+    description: <<END
+Scalar. Seed for initializing the random number generator.
+END
+  }
+  out_arg {
+    name: "index"
+    description: <<END
+Scalar with the index of the sampled point.
+END
+  }
+  summary: "Returns the index of a data point that should be added to the seed set."
+  description: <<END
+Entries in distances are assumed to be squared distances of candidate points to
+the already sampled centers in the seed set. The op constructs one Markov chain
+of the k-MC^2 algorithm and returns the index of one candidate point to be added
+as an additional cluster center.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_KmeansPlusPlusInitialization.pbtxt b/tensorflow/core/api_def/base_api/api_def_KmeansPlusPlusInitialization.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..461820486b94808346618bf0dbc756164032a044
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_KmeansPlusPlusInitialization.pbtxt
@@ -0,0 +1,44 @@
+op {
+  graph_op_name: "KmeansPlusPlusInitialization"
+  visibility: HIDDEN
+  in_arg {
+    name: "points"
+    description: <<END
+Matrix of shape (n, d). Rows are assumed to be input points.
+END
+  }
+  in_arg {
+    name: "num_to_sample"
+    description: <<END
+Scalar. The number of rows to sample. This value must not be larger than n.
+END
+  }
+  in_arg {
+    name: "seed"
+    description: <<END
+Scalar. Seed for initializing the random number generator.
+END
+  }
+  in_arg {
+    name: "num_retries_per_sample"
+    description: <<END
+Scalar. For each row that is sampled, this parameter
+specifies the number of additional points to draw from the current
+distribution before selecting the best. If a negative value is specified, a
+heuristic is used to sample O(log(num_to_sample)) additional points.
+END
+  }
+  out_arg {
+    name: "samples"
+    description: <<END
+Matrix of shape (num_to_sample, d). The sampled rows.
+END
+  }
+  summary: "Selects num_to_sample rows of input using the KMeans++ criterion."
+  description: <<END
+Rows of points are assumed to be input points. One row is selected at random.
+Subsequent rows are sampled with probability proportional to the squared L2
+distance from the nearest row selected thus far till num_to_sample rows have
+been sampled.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingADAMParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingADAMParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..43901e1e4403a4ab0f3759470d8c26bc95ac8077
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingADAMParameters.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingADAMParameters"
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the ADAM optimization algorithm.
+END
+  }
+  in_arg {
+    name: "momenta"
+    description: <<END
+Value of momenta used in the ADAM optimization algorithm.
+END
+  }
+  in_arg {
+    name: "velocities"
+    description: <<END
+Value of velocities used in the ADAM optimization algorithm.
+END
+  }
+  summary: "Load ADAM embedding parameters."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingADAMParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingADAMParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..aaa52f6aa900a9fc1bca708b5cf4c8ed563f38a8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingADAMParametersGradAccumDebug.pbtxt
@@ -0,0 +1,35 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingADAMParametersGradAccumDebug"
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the ADAM optimization algorithm.
+END
+  }
+  in_arg {
+    name: "momenta"
+    description: <<END
+Value of momenta used in the ADAM optimization algorithm.
+END
+  }
+  in_arg {
+    name: "velocities"
+    description: <<END
+Value of velocities used in the ADAM optimization algorithm.
+END
+  }
+  in_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Value of gradient_accumulators used in the ADAM optimization algorithm.
+END
+  }
+  summary: "Load ADAM embedding parameters with debug support."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingAdadeltaParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingAdadeltaParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e306329765f692bdae321c8e4e2e2915f6cd9ad5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingAdadeltaParameters.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingAdadeltaParameters"
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the Adadelta optimization algorithm.
+END
+  }
+  in_arg {
+    name: "accumulators"
+    description: <<END
+Value of accumulators used in the Adadelta optimization algorithm.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+Value of updates used in the Adadelta optimization algorithm.
+END
+  }
+  summary: "Load Adadelta embedding parameters."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingAdadeltaParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingAdadeltaParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dd2e3b924b0a519fe9f7071d48b90d6e127cdf54
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingAdadeltaParametersGradAccumDebug.pbtxt
@@ -0,0 +1,35 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingAdadeltaParametersGradAccumDebug"
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the Adadelta optimization algorithm.
+END
+  }
+  in_arg {
+    name: "accumulators"
+    description: <<END
+Value of accumulators used in the Adadelta optimization algorithm.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+Value of updates used in the Adadelta optimization algorithm.
+END
+  }
+  in_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Value of gradient_accumulators used in the Adadelta optimization algorithm.
+END
+  }
+  summary: "Load Adadelta parameters with debug support."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingAdagradParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingAdagradParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..94db7dfc263e0900e4c18e44a44dd43e23db85ca
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingAdagradParameters.pbtxt
@@ -0,0 +1,23 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingAdagradParameters"
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the Adagrad optimization algorithm.
+END
+  }
+  in_arg {
+    name: "accumulators"
+    description: <<END
+Value of accumulators used in the Adagrad optimization algorithm.
+END
+  }
+  summary: "Load Adagrad embedding parameters."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingAdagradParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingAdagradParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e2d10441dbeafd7885cc2f4c376b48c47ddfd5f4
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingAdagradParametersGradAccumDebug.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingAdagradParametersGradAccumDebug"
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the Adagrad optimization algorithm.
+END
+  }
+  in_arg {
+    name: "accumulators"
+    description: <<END
+Value of accumulators used in the Adagrad optimization algorithm.
+END
+  }
+  in_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Value of gradient_accumulators used in the Adagrad optimization algorithm.
+END
+  }
+  summary: "Load Adagrad embedding parameters with debug support."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingCenteredRMSPropParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingCenteredRMSPropParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..43cf8845861e54ac1e70061cfbcd23cd161a14ef
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingCenteredRMSPropParameters.pbtxt
@@ -0,0 +1,35 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingCenteredRMSPropParameters"
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the centered RMSProp optimization algorithm.
+END
+  }
+  in_arg {
+    name: "ms"
+    description: <<END
+Value of ms used in the centered RMSProp optimization algorithm.
+END
+  }
+  in_arg {
+    name: "mom"
+    description: <<END
+Value of mom used in the centered RMSProp optimization algorithm.
+END
+  }
+  in_arg {
+    name: "mg"
+    description: <<END
+Value of mg used in the centered RMSProp optimization algorithm.
+END
+  }
+  summary: "Load centered RMSProp embedding parameters."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingFTRLParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingFTRLParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d3516888c71b82213048afc449d9064a39fe9c1e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingFTRLParameters.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingFTRLParameters"
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the FTRL optimization algorithm.
+END
+  }
+  in_arg {
+    name: "accumulators"
+    description: <<END
+Value of accumulators used in the FTRL optimization algorithm.
+END
+  }
+  in_arg {
+    name: "linears"
+    description: <<END
+Value of linears used in the FTRL optimization algorithm.
+END
+  }
+  summary: "Load FTRL embedding parameters."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingFTRLParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingFTRLParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d6c80f66df6bf4238a0572e48eef3193bf946e48
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingFTRLParametersGradAccumDebug.pbtxt
@@ -0,0 +1,35 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingFTRLParametersGradAccumDebug"
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the FTRL optimization algorithm.
+END
+  }
+  in_arg {
+    name: "accumulators"
+    description: <<END
+Value of accumulators used in the FTRL optimization algorithm.
+END
+  }
+  in_arg {
+    name: "linears"
+    description: <<END
+Value of linears used in the FTRL optimization algorithm.
+END
+  }
+  in_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Value of gradient_accumulators used in the FTRL optimization algorithm.
+END
+  }
+  summary: "Load FTRL embedding parameters with debug support."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingMDLAdagradLightParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingMDLAdagradLightParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..24b334380b91b04d7f8f482760a975c88b1b1d8c
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingMDLAdagradLightParameters.pbtxt
@@ -0,0 +1,35 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingMDLAdagradLightParameters"
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the MDL Adagrad Light optimization algorithm.
+END
+  }
+  in_arg {
+    name: "accumulators"
+    description: <<END
+Value of accumulators used in the MDL Adagrad Light optimization algorithm.
+END
+  }
+  in_arg {
+    name: "weights"
+    description: <<END
+Value of weights used in the MDL Adagrad Light optimization algorithm.
+END
+  }
+  in_arg {
+    name: "benefits"
+    description: <<END
+Value of benefits used in the MDL Adagrad Light optimization algorithm.
+END
+  }
+  summary: "Load MDL Adagrad Light embedding parameters."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingMomentumParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingMomentumParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e98956e65dbb6e3a34eea38c9874db81f5b49470
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingMomentumParameters.pbtxt
@@ -0,0 +1,23 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingMomentumParameters"
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the Momentum optimization algorithm.
+END
+  }
+  in_arg {
+    name: "momenta"
+    description: <<END
+Value of momenta used in the Momentum optimization algorithm.
+END
+  }
+  summary: "Load Momentum embedding parameters."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingMomentumParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingMomentumParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fff35adc68e7cefff0f551f7668755efc0e8747d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingMomentumParametersGradAccumDebug.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingMomentumParametersGradAccumDebug"
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the Momentum optimization algorithm.
+END
+  }
+  in_arg {
+    name: "momenta"
+    description: <<END
+Value of momenta used in the Momentum optimization algorithm.
+END
+  }
+  in_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Value of gradient_accumulators used in the Momentum optimization algorithm.
+END
+  }
+  summary: "Load Momentum embedding parameters with debug support."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingProximalAdagradParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingProximalAdagradParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..da049a72c7652b8e0784cbe2cd63da72172360a1
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingProximalAdagradParameters.pbtxt
@@ -0,0 +1,23 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingProximalAdagradParameters"
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the proximal Adagrad optimization algorithm.
+END
+  }
+  in_arg {
+    name: "accumulators"
+    description: <<END
+Value of accumulators used in the proximal Adagrad optimization algorithm.
+END
+  }
+  summary: "Load proximal Adagrad embedding parameters."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5e0f275a44be90cb2169dec5aad76faeda6d0ded
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug"
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the proximal Adagrad optimization algorithm.
+END
+  }
+  in_arg {
+    name: "accumulators"
+    description: <<END
+Value of accumulators used in the proximal Adagrad optimization algorithm.
+END
+  }
+  in_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Value of gradient_accumulators used in the proximal Adagrad optimization algorithm.
+END
+  }
+  summary: "Load proximal Adagrad embedding parameters with debug support."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingRMSPropParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingRMSPropParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e79a3a4c58156a0ea18eba12ef3db3e272da2bcf
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingRMSPropParameters.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingRMSPropParameters"
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the RMSProp optimization algorithm.
+END
+  }
+  in_arg {
+    name: "ms"
+    description: <<END
+Value of ms used in the RMSProp optimization algorithm.
+END
+  }
+  in_arg {
+    name: "mom"
+    description: <<END
+Value of mom used in the RMSProp optimization algorithm.
+END
+  }
+  summary: "Load RMSProp embedding parameters."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingRMSPropParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingRMSPropParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..233e5afbff369f16da51af104708568d51c00460
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingRMSPropParametersGradAccumDebug.pbtxt
@@ -0,0 +1,35 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingRMSPropParametersGradAccumDebug"
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the RMSProp optimization algorithm.
+END
+  }
+  in_arg {
+    name: "ms"
+    description: <<END
+Value of ms used in the RMSProp optimization algorithm.
+END
+  }
+  in_arg {
+    name: "mom"
+    description: <<END
+Value of mom used in the RMSProp optimization algorithm.
+END
+  }
+  in_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Value of gradient_accumulators used in the RMSProp optimization algorithm.
+END
+  }
+  summary: "Load RMSProp embedding parameters with debug support."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingStochasticGradientDescentParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingStochasticGradientDescentParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..37d0dcc1f4e1c06a649e191df158446ceaed283d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_LoadTPUEmbeddingStochasticGradientDescentParameters.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "LoadTPUEmbeddingStochasticGradientDescentParameters"
+  in_arg {
+    name: "parameters"
+    description: <<END
+Value of parameters used in the stochastic gradient descent optimization algorithm.
+END
+  }
+  summary: "Load SGD embedding parameters."
+  description: <<END
+An op that loads optimization parameters into HBM for embedding. Must be
+preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+embedding table configuration. For example, this op is used to install
+parameters that are loaded from a checkpoint before a training loop is
+executed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MulNoNan.pbtxt b/tensorflow/core/api_def/base_api/api_def_MulNoNan.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..51a5fa95d3d90cb9ac8798492d7bc469336ccb82
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_MulNoNan.pbtxt
@@ -0,0 +1,11 @@
+op {
+  graph_op_name: "MulNoNan"
+  endpoint {
+    name: "MulNoNan"
+  }
+  summary: "Returns x * y element-wise. Returns zero if y is zero, even if x if infinite or NaN."
+  description: <<END
+*NOTE*: `Mul` supports broadcasting. More about broadcasting
+[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_NearestNeighbors.pbtxt b/tensorflow/core/api_def/base_api/api_def_NearestNeighbors.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2bdf68fb9f0e42a0ac31334f122cf66357cc2579
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_NearestNeighbors.pbtxt
@@ -0,0 +1,43 @@
+op {
+  graph_op_name: "NearestNeighbors"
+  visibility: HIDDEN
+  in_arg {
+    name: "points"
+    description: <<END
+Matrix of shape (n, d). Rows are assumed to be input points.
+END
+  }
+  in_arg {
+    name: "centers"
+    description: <<END
+Matrix of shape (m, d). Rows are assumed to be centers.
+END
+  }
+  in_arg {
+    name: "k"
+    description: <<END
+Number of nearest centers to return for each point. If k is larger than m, then
+only m centers are returned.
+END
+  }
+  out_arg {
+    name: "nearest_center_indices"
+    description: <<END
+Matrix of shape (n, min(m, k)). Each row contains the indices of the centers
+closest to the corresponding point, ordered by increasing distance.
+END
+  }
+  out_arg {
+    name: "nearest_center_distances"
+    description: <<END
+Matrix of shape (n, min(m, k)). Each row contains the squared L2 distance to the
+corresponding center in nearest_center_indices.
+END
+  }
+  summary: "Selects the k nearest centers for each point."
+  description: <<END
+Rows of points are assumed to be input points. Rows of centers are assumed to be
+the list of candidate centers. For each point, the k centers that have least L2
+distance to it are computed.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_NextAfter.pbtxt b/tensorflow/core/api_def/base_api/api_def_NextAfter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5dfeef735aad6631d14e7b8211fd3b60e13d8791
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_NextAfter.pbtxt
@@ -0,0 +1,13 @@
+op {
+  graph_op_name: "NextAfter"
+  summary: "Returns the next representable value of `x1` in the direction of `x2`, element-wise."
+  description: <<END
+This operation returns the same result as the C++ std::nextafter function.
+
+It can also return a subnormal number.
+
+@compatibility(cpp)
+Equivalent to C++ std::nextafter function.
+@end_compatibility
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_OneHot.pbtxt b/tensorflow/core/api_def/base_api/api_def_OneHot.pbtxt
index 807b8ae31015e4bcb73e54e98d879460f0d92f62..b325df1c8c2b231f03a1960babd2d915b1b0e72d 100644
--- a/tensorflow/core/api_def/base_api/api_def_OneHot.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_OneHot.pbtxt
@@ -66,7 +66,6 @@ Examples
 =========
 
 Suppose that
-
 ```
   indices = [0, 2, -1, 1]
   depth = 3
@@ -76,16 +75,15 @@ Suppose that
 ```
 
 Then output is `[4 x 3]`:
-
-    ```output =
-      [5.0 0.0 0.0]  // one_hot(0)
-      [0.0 0.0 5.0]  // one_hot(2)
-      [0.0 0.0 0.0]  // one_hot(-1)
-      [0.0 5.0 0.0]  // one_hot(1)
-    ```
+```
+output =
+  [5.0 0.0 0.0]  // one_hot(0)
+  [0.0 0.0 5.0]  // one_hot(2)
+  [0.0 0.0 0.0]  // one_hot(-1)
+  [0.0 5.0 0.0]  // one_hot(1)
+```
 
 Suppose that
-
 ```
   indices = [0, 2, -1, 1]
   depth = 3
@@ -95,19 +93,19 @@ Suppose that
 ```
 
 Then output is `[3 x 4]`:
+```
+output =
+  [0.0 3.0 3.0 3.0]
+  [3.0 3.0 3.0 0.0]
+  [3.0 3.0 3.0 3.0]
+  [3.0 0.0 3.0 3.0]
+//  ^                one_hot(0)
+//      ^            one_hot(2)
+//          ^        one_hot(-1)
+//              ^    one_hot(1)
+```
 
-    ```output =
-      [0.0 3.0 3.0 3.0]
-      [3.0 3.0 3.0 0.0]
-      [3.0 3.0 3.0 3.0]
-      [3.0 0.0 3.0 3.0]
-    //  ^                one_hot(0)
-    //      ^            one_hot(2)
-    //          ^        one_hot(-1)
-    //              ^    one_hot(1)
-    ```
 Suppose that
-
 ```
   indices = [[0, 2], [1, -1]]
   depth = 3
@@ -117,14 +115,15 @@ Suppose that
 ```
 
 Then output is `[2 x 2 x 3]`:
-
-    ```output =
-      [
-        [1.0, 0.0, 0.0]  // one_hot(0)
-        [0.0, 0.0, 1.0]  // one_hot(2)
-      ][
-        [0.0, 1.0, 0.0]  // one_hot(1)
-        [0.0, 0.0, 0.0]  // one_hot(-1)
-      ]```
+```
+output =
+  [
+    [1.0, 0.0, 0.0]  // one_hot(0)
+    [0.0, 0.0, 1.0]  // one_hot(2)
+  ][
+    [0.0, 1.0, 0.0]  // one_hot(1)
+    [0.0, 0.0, 0.0]  // one_hot(-1)
+  ]
+```
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_OutfeedDequeue.pbtxt b/tensorflow/core/api_def/base_api/api_def_OutfeedDequeue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f262bdce969f7fbadb6f5ee6d0223cfb8cfd7aa1
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_OutfeedDequeue.pbtxt
@@ -0,0 +1,33 @@
+op {
+  graph_op_name: "OutfeedDequeue"
+  out_arg {
+    name: "output"
+    description: <<END
+A tensor that will be read from the device outfeed.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The type of elements in the tensor.
+END
+  }
+  attr {
+    name: "shape"
+    description: <<END
+The shape of the tensor.
+END
+  }
+  attr {
+    name: "device_ordinal"
+    description: <<END
+The TPU device to use. This should be -1 when the Op
+is running on a TPU device, and >= 0 when the Op is running on the CPU
+device.
+END
+  }
+  summary: "Retrieves a single tensor from the computation outfeed."
+  description: <<END
+This operation will block indefinitely until data is available.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_OutfeedDequeueTuple.pbtxt b/tensorflow/core/api_def/base_api/api_def_OutfeedDequeueTuple.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..457e495e3c726aefa5e64ecb800f6acf780afb1b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_OutfeedDequeueTuple.pbtxt
@@ -0,0 +1,34 @@
+op {
+  graph_op_name: "OutfeedDequeueTuple"
+  out_arg {
+    name: "outputs"
+    description: <<END
+A list of tensors that will be read from the outfeed.
+END
+  }
+  attr {
+    name: "dtypes"
+    description: <<END
+The element types of each element in `outputs`.
+END
+  }
+  attr {
+    name: "shapes"
+    description: <<END
+The shapes of each tensor in `outputs`.
+END
+  }
+  attr {
+    name: "device_ordinal"
+    description: <<END
+The TPU device to use. This should be -1 when the Op
+is running on a TPU device, and >= 0 when the Op is running on the CPU
+device.
+END
+  }
+  summary: "Retrieve multiple values from the computation outfeed."
+  description: <<END
+This operation will block indefinitely until data is available. Output `i`
+corresponds to XLA tuple element `i`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_OutfeedEnqueue.pbtxt b/tensorflow/core/api_def/base_api/api_def_OutfeedEnqueue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fa6cb96de11c8d2a9bd3c04a3dfb19be5f363027
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_OutfeedEnqueue.pbtxt
@@ -0,0 +1,10 @@
+op {
+  graph_op_name: "OutfeedEnqueue"
+  in_arg {
+    name: "input"
+    description: <<END
+A tensor that will be inserted into the outfeed queue.
+END
+  }
+  summary: "Enqueue a Tensor on the computation outfeed."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_OutfeedEnqueueTuple.pbtxt b/tensorflow/core/api_def/base_api/api_def_OutfeedEnqueueTuple.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fb1ab3d63c0837f0bee7048d1b484f55b9dd619f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_OutfeedEnqueueTuple.pbtxt
@@ -0,0 +1,11 @@
+op {
+  graph_op_name: "OutfeedEnqueueTuple"
+  in_arg {
+    name: "inputs"
+    description: <<END
+A list of tensors that will be inserted into the outfeed queue as an
+XLA tuple.
+END
+  }
+  summary: "Enqueue multiple Tensor values on the computation outfeed."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DAndRelu.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DAndRelu.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..17ff15378c90f709ec6a2428a9c6408f23eeabe8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DAndRelu.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizedConv2DAndRelu"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DAndReluAndRequantize.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DAndReluAndRequantize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b3ab3eba2c0bf06bf8a41eabc0020582c3ada8ca
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DAndReluAndRequantize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizedConv2DAndReluAndRequantize"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DAndRequantize.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DAndRequantize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8b00c2b7f650260d7d2150935ddfab1d65fac335
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DAndRequantize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizedConv2DAndRequantize"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBias.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBias.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f309f648cafb307569bdabe496ca44c8c200c585
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBias.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizedConv2DWithBias"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasAndRelu.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasAndRelu.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b6b73eaae3613238d17900a4f15a7ad6839d92a2
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasAndRelu.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizedConv2DWithBiasAndRelu"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasAndReluAndRequantize.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasAndReluAndRequantize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..101f72708af5cc92155b0641a14fc89889fa7488
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasAndReluAndRequantize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizedConv2DWithBiasAndReluAndRequantize"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasAndRequantize.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasAndRequantize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..697e26841539603ce2f6d26a082378881ce214a5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasAndRequantize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizedConv2DWithBiasAndRequantize"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasSignedSumAndReluAndRequantize.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasSignedSumAndReluAndRequantize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0cf52d6c897f9dc4e1e4988259b1c74043203727
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasSignedSumAndReluAndRequantize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizedConv2DWithBiasSignedSumAndReluAndRequantize"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasSumAndRelu.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasSumAndRelu.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e91a2b8dc063c60cb2d8cd104bac864d063eee3b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasSumAndRelu.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizedConv2DWithBiasSumAndRelu"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasSumAndReluAndRequantize.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasSumAndReluAndRequantize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fe3ec528bf4a64bca8531d6daa90af2b13cebcec
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizedConv2DWithBiasSumAndReluAndRequantize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizedConv2DWithBiasSumAndReluAndRequantize"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RecvTPUEmbeddingActivations.pbtxt b/tensorflow/core/api_def/base_api/api_def_RecvTPUEmbeddingActivations.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d1921fd2624daf0200737dfc6dbaeee2311ad5b1
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RecvTPUEmbeddingActivations.pbtxt
@@ -0,0 +1,32 @@
+op {
+  graph_op_name: "RecvTPUEmbeddingActivations"
+  out_arg {
+    name: "outputs"
+    description: <<END
+A TensorList of embedding activations containing one Tensor per
+embedding table in the model.
+END
+  }
+  attr {
+    name: "num_outputs"
+    description: <<END
+The number of output activation tensors, equal to the number of
+embedding tables in the model.
+END
+  }
+  attr {
+    name: "config"
+    description: <<END
+Serialized TPUEmbeddingConfiguration proto.
+END
+  }
+  summary: "An op that receives embedding activations on the TPU."
+  description: <<END
+The TPU system performs the embedding lookups and aggregations specified by
+the arguments to TPUEmbeddingEnqueue(Integer/Sparse/SparseTensor)Batch. The
+results of these aggregations are visible to the Tensorflow Graph as the
+outputs of a RecvTPUEmbeddingActivations op. This op returns a list containing
+one Tensor of activations per table specified in the model. There can be at
+most one RecvTPUEmbeddingActivations op in the TPU graph.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RegexReplace.pbtxt b/tensorflow/core/api_def/base_api/api_def_RegexReplace.pbtxt
index 70ad5219267fcc84368f072a6f5a122b6cc11a89..2cc1a55676c354c9470287ccb89e39489ab18c02 100644
--- a/tensorflow/core/api_def/base_api/api_def_RegexReplace.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_RegexReplace.pbtxt
@@ -10,7 +10,7 @@ op {
   }
   in_arg {
     name: "rewrite"
-    description: "The rewrite to be applied to the matched expresion."
+    description: "The rewrite to be applied to the matched expression."
   }
   out_arg {
     name: "output"
diff --git a/tensorflow/core/api_def/base_api/api_def_RequantizationRange.pbtxt b/tensorflow/core/api_def/base_api/api_def_RequantizationRange.pbtxt
index 07bbd4ac6031765a070c5e5b4ee0726512dbb6ca..cd7d4e3ec2abf68b2c8461a5e301213e34252d1e 100644
--- a/tensorflow/core/api_def/base_api/api_def_RequantizationRange.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_RequantizationRange.pbtxt
@@ -30,10 +30,12 @@ END
 The type of the input.
 END
   }
-  summary: "Given a quantized tensor described by (input, input_min, input_max), outputs a"
+  summary:
+"Computes a range that covers the actual values present in a quantized tensor."
   description: <<END
-range that covers the actual values present in that tensor.  This op is
-typically used to produce the requested_output_min and requested_output_max for
-Requantize.
+Given a quantized tensor described by `(input, input_min, input_max)`, outputs a
+range that covers the actual values present in that tensor. This op is typically
+used to produce the `requested_output_min` and `requested_output_max` for
+`Requantize`.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_RequantizationRangePerChannel.pbtxt b/tensorflow/core/api_def/base_api/api_def_RequantizationRangePerChannel.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..58cf1222500e6bf58a22beb17ffccf2949dd4c81
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RequantizationRangePerChannel.pbtxt
@@ -0,0 +1,48 @@
+op {
+  graph_op_name: "RequantizationRangePerChannel"
+  visibility : HIDDEN
+  in_arg {
+    name: "input"
+    description: <<END
+The original input tensor.
+END
+  }
+  in_arg {
+    name: "input_min"
+    description: <<END
+The minimum value of the input tensor
+END
+  }
+  in_arg {
+    name: "input_max"
+    description: <<END
+The maximum value of the input tensor.
+END
+  }
+  out_arg {
+    name: "output_min"
+    description: <<END
+The minimum value of the final output tensor
+END
+  }
+  out_arg {
+    name: "output_max"
+    description: <<END
+The maximum value of the final output tensor.
+END
+  }
+  attr {
+    name: "T"
+    description: <<END
+The quantized type of input tensor that needs to be converted. 
+END
+  }
+  attr {
+    name: "clip_value_max"
+    description: <<END
+The maximum value of the output that needs to be clipped.
+Example: set this to 6 for Relu6. 
+END
+  }
+  summary: "Computes requantization range per channel."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Requantize.pbtxt b/tensorflow/core/api_def/base_api/api_def_Requantize.pbtxt
index 1b03f63b261e00c6b1dfdc0b1f11c69d71b536eb..23e1656288d6c97b8facd7eafb5ba5cd862dade1 100644
--- a/tensorflow/core/api_def/base_api/api_def_Requantize.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_Requantize.pbtxt
@@ -48,13 +48,15 @@ END
 The type of the output. Should be a lower bit depth than Tinput.
 END
   }
-  summary: "Convert the quantized \'input\' tensor into a lower-precision \'output\', using the"
+  summary: 
+"Converts the quantized `input` tensor into a lower-precision `output`."
   description: <<END
-output range specified with 'requested_output_min' and 'requested_output_max'.
+Converts the quantized `input` tensor into a lower-precision `output`, using the
+output range specified with `requested_output_min` and `requested_output_max`.
 
-[input_min, input_max] are scalar floats that specify the range for the float
-interpretation of the 'input' data. For example, if input_min is -1.0f and
-input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
+`[input_min, input_max]` are scalar floats that specify the range for the float
+interpretation of the `input` data. For example, if `input_min` is -1.0f and
+`input_max` is 1.0f, and we are dealing with `quint16` quantized data, then a 0
 value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_RequantizePerChannel.pbtxt b/tensorflow/core/api_def/base_api/api_def_RequantizePerChannel.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bce6c3dd5a0b5e6f4e7fe2aa990827a4020bed17
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RequantizePerChannel.pbtxt
@@ -0,0 +1,65 @@
+op {
+  graph_op_name: "RequantizePerChannel"
+  visibility : HIDDEN
+  in_arg {
+    name: "input"
+    description: <<END
+The original input tensor.
+END
+  }
+  in_arg {
+    name: "input_min"
+    description: <<END
+The minimum value of the input tensor
+END
+  }
+  in_arg {
+    name: "input_max"
+    description: <<END
+The maximum value of the input tensor.
+END
+  }
+  in_arg {
+    name: "requested_output_min"
+    description: <<END
+The minimum value of the output tensor requested.
+END
+  }
+  in_arg {
+    name: "requested_output_max"
+    description: <<END
+The maximum value of the output tensor requested.
+END
+  }  
+  out_arg {
+    name: "output"
+    description: <<END
+Output tensor.
+END
+  }
+  out_arg {
+    name: "output_min"
+    description: <<END
+The minimum value of the final output tensor
+END
+  }
+  out_arg {
+    name: "output_max"
+    description: <<END
+The maximum value of the final output tensor.
+END
+  }
+  attr {
+    name: "T"
+    description: <<END
+The quantized type of input tensor that needs to be converted. 
+END
+  }
+  attr {
+    name: "out_type"
+    description: <<END
+The quantized type of output tensor that needs to be converted.
+END
+  }
+  summary: "Requantizes input with min and max values known per channel."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdAdd.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdAdd.pbtxt
index d9c4d5a4a4008c439ece7fde52a2913f6a50956d..b0458207e6eb8b18a21e1f67b84e691fb5601e9a 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdAdd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdAdd.pbtxt
@@ -28,10 +28,8 @@ be protected by a lock; otherwise the behavior is undefined,
 but may exhibit less contention.
 END
   }
-  summary: "Adds sparse `updates` to individual values or slices within a given"
+  summary: "Applies sparse addition to individual values or slices in a Variable."
   description: <<END
-variable according to `indices`.
-
 `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
 
 `indices` must be integer tensor, containing indices into `ref`.
@@ -44,24 +42,24 @@ dimension of `ref`.
 `updates` is `Tensor` of rank `Q-1+P-K` with shape:
 
 ```
-[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]]
 ```
 
-For example, say we want to update 4 scattered elements to a rank-1 tensor to
-8 elements. In Python, that update would look like this:
+For example, say we want to add 4 scattered elements to a rank-1 tensor to
+8 elements. In Python, that addition would look like this:
 
 ```python
-    ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True)
-    indices = tf.constant([[4], [3], [1] ,[7]])
-    updates = tf.constant([9, 10, 11, 12])
-    update = tf.scatter_nd_add(ref, indices, updates)
-    with tf.Session() as sess:
-      print sess.run(update)
+ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True)
+indices = tf.constant([[4], [3], [1], [7]])
+updates = tf.constant([9, 10, 11, 12])
+add = tf.scatter_nd_add(ref, indices, updates)
+with tf.Session() as sess:
+  print sess.run(add)
 ```
 
 The resulting update to ref would look like this:
 
-    [1, 12, 3, 14, 14, 6, 7, 20]
+    [1, 13, 3, 14, 14, 6, 7, 20]
 
 See `tf.scatter_nd` for more details about how to make updates to
 slices.
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdSub.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdSub.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f12f4b5f34767e54bdd9c4ede9cb2c495eda723f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdSub.pbtxt
@@ -0,0 +1,67 @@
+op {
+  graph_op_name: "ResourceScatterNdSub"
+  in_arg {
+    name: "ref"
+    description: <<END
+A resource handle. Must be from a VarHandleOp.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A Tensor. Must be one of the following types: int32, int64.
+A tensor of indices into ref.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+A Tensor. Must have the same type as ref. A tensor of
+values to add to ref.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+An optional bool. Defaults to True. If True, the assignment will
+be protected by a lock; otherwise the behavior is undefined,
+but may exhibit less contention.
+END
+  }
+  summary: "Applies sparse subtraction to individual values or slices in a Variable."
+  description: <<END
+`ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+
+`indices` must be integer tensor, containing indices into `ref`.
+It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+
+The innermost dimension of `indices` (with length `K`) corresponds to
+indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+dimension of `ref`.
+
+`updates` is `Tensor` of rank `Q-1+P-K` with shape:
+
+```
+[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]]
+```
+
+For example, say we want to subtract 4 scattered elements from a rank-1 tensor
+with 8 elements. In Python, that subtraction would look like this:
+
+```python
+ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True)
+indices = tf.constant([[4], [3], [1], [7]])
+updates = tf.constant([9, 10, 11, 12])
+sub = tf.scatter_nd_sub(ref, indices, updates)
+with tf.Session() as sess:
+  print sess.run(sub)
+```
+
+The resulting update to ref would look like this:
+
+    [1, -9, 3, -6, -4, 6, 7, -4]
+
+See `tf.scatter_nd` for more details about how to make updates to
+slices.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingADAMParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingADAMParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3de7ad97b128aa61fbcc2b5101f18c5a23919150
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingADAMParameters.pbtxt
@@ -0,0 +1,28 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingADAMParameters"
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the ADAM optimization algorithm.
+END
+  }
+  out_arg {
+    name: "momenta"
+    description: <<END
+Parameter momenta updated by the ADAM optimization algorithm.
+END
+  }
+  out_arg {
+    name: "velocities"
+    description: <<END
+Parameter velocities updated by the ADAM optimization algorithm.
+END
+  }
+  summary: "Retrieve ADAM embedding parameters."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingADAMParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingADAMParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4ecfd080120e8c25031a312a5834dfeee247cda0
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingADAMParametersGradAccumDebug.pbtxt
@@ -0,0 +1,34 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingADAMParametersGradAccumDebug"
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the ADAM optimization algorithm.
+END
+  }
+  out_arg {
+    name: "momenta"
+    description: <<END
+Parameter momenta updated by the ADAM optimization algorithm.
+END
+  }
+  out_arg {
+    name: "velocities"
+    description: <<END
+Parameter velocities updated by the ADAM optimization algorithm.
+END
+  }
+  out_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Parameter gradient_accumulators updated by the ADAM optimization algorithm.
+END
+  }
+  summary: "Retrieve ADAM embedding parameters with debug support."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingAdadeltaParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingAdadeltaParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3c7e6881aa22fe648dddb0b58ad3c5285e0e1fb1
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingAdadeltaParameters.pbtxt
@@ -0,0 +1,28 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingAdadeltaParameters"
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the Adadelta optimization algorithm.
+END
+  }
+  out_arg {
+    name: "accumulators"
+    description: <<END
+Parameter accumulators updated by the Adadelta optimization algorithm.
+END
+  }
+  out_arg {
+    name: "updates"
+    description: <<END
+Parameter updates updated by the Adadelta optimization algorithm.
+END
+  }
+  summary: "Retrieve Adadelta embedding parameters."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5ee8cdaa882c63d7cb3cc9ddcca63b0c87219c7d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebug.pbtxt
@@ -0,0 +1,34 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebug"
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the Adadelta optimization algorithm.
+END
+  }
+  out_arg {
+    name: "accumulators"
+    description: <<END
+Parameter accumulators updated by the Adadelta optimization algorithm.
+END
+  }
+  out_arg {
+    name: "updates"
+    description: <<END
+Parameter updates updated by the Adadelta optimization algorithm.
+END
+  }
+  out_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Parameter gradient_accumulators updated by the Adadelta optimization algorithm.
+END
+  }
+  summary: "Retrieve Adadelta embedding parameters with debug support."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingAdagradParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingAdagradParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6f070f5483ec3ba80994fe056a83a040966cdcbe
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingAdagradParameters.pbtxt
@@ -0,0 +1,22 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingAdagradParameters"
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the Adagrad optimization algorithm.
+END
+  }
+  out_arg {
+    name: "accumulators"
+    description: <<END
+Parameter accumulators updated by the Adagrad optimization algorithm.
+END
+  }
+  summary: "Retrieve Adagrad embedding parameters."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingAdagradParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingAdagradParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..078cda00d6fd1ad4d5393b4e24d16149e98be0b1
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingAdagradParametersGradAccumDebug.pbtxt
@@ -0,0 +1,28 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingAdagradParametersGradAccumDebug"
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the Adagrad optimization algorithm.
+END
+  }
+  out_arg {
+    name: "accumulators"
+    description: <<END
+Parameter accumulators updated by the Adagrad optimization algorithm.
+END
+  }
+  out_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Parameter gradient_accumulators updated by the Adagrad optimization algorithm.
+END
+  }
+  summary: "Retrieve Adagrad embedding parameters with debug support."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingCenteredRMSPropParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingCenteredRMSPropParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2a4cc4e5087f70452bc3580ab4321277343a9e91
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingCenteredRMSPropParameters.pbtxt
@@ -0,0 +1,34 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingCenteredRMSPropParameters"
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the centered RMSProp optimization algorithm.
+END
+  }
+  out_arg {
+    name: "ms"
+    description: <<END
+Parameter ms updated by the centered RMSProp optimization algorithm.
+END
+  }
+  out_arg {
+    name: "mom"
+    description: <<END
+Parameter mom updated by the centered RMSProp optimization algorithm.
+END
+  }
+  out_arg {
+    name: "mg"
+    description: <<END
+Parameter mg updated by the centered RMSProp optimization algorithm.
+END
+  }
+  summary: "Retrieve centered RMSProp embedding parameters."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingFTRLParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingFTRLParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..daf87deff23db6d82179d74d8514115ca8c1939d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingFTRLParameters.pbtxt
@@ -0,0 +1,28 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingFTRLParameters"
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the FTRL optimization algorithm.
+END
+  }
+  out_arg {
+    name: "accumulators"
+    description: <<END
+Parameter accumulators updated by the FTRL optimization algorithm.
+END
+  }
+  out_arg {
+    name: "linears"
+    description: <<END
+Parameter linears updated by the FTRL optimization algorithm.
+END
+  }
+  summary: "Retrieve FTRL embedding parameters."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingFTRLParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingFTRLParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2f72d1d1ce1289d5735869981515a2b04b86e2a2
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingFTRLParametersGradAccumDebug.pbtxt
@@ -0,0 +1,34 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingFTRLParametersGradAccumDebug"
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the FTRL optimization algorithm.
+END
+  }
+  out_arg {
+    name: "accumulators"
+    description: <<END
+Parameter accumulators updated by the FTRL optimization algorithm.
+END
+  }
+  out_arg {
+    name: "linears"
+    description: <<END
+Parameter linears updated by the FTRL optimization algorithm.
+END
+  }
+  out_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Parameter gradient_accumulators updated by the FTRL optimization algorithm.
+END
+  }
+  summary: "Retrieve FTRL embedding parameters with debug support."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingMDLAdagradLightParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingMDLAdagradLightParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..478fbda1853e0035226b267c0330870cbc47b72a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingMDLAdagradLightParameters.pbtxt
@@ -0,0 +1,34 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingMDLAdagradLightParameters"
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the MDL Adagrad Light optimization algorithm.
+END
+  }
+  out_arg {
+    name: "accumulators"
+    description: <<END
+Parameter accumulators updated by the MDL Adagrad Light optimization algorithm.
+END
+  }
+  out_arg {
+    name: "weights"
+    description: <<END
+Parameter weights updated by the MDL Adagrad Light optimization algorithm.
+END
+  }
+  out_arg {
+    name: "benefits"
+    description: <<END
+Parameter benefits updated by the MDL Adagrad Light optimization algorithm.
+END
+  }
+  summary: "Retrieve MDL Adagrad Light embedding parameters."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingMomentumParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingMomentumParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f71e620c39c4f2830a53912f624b5478c24f46a5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingMomentumParameters.pbtxt
@@ -0,0 +1,22 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingMomentumParameters"
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the Momentum optimization algorithm.
+END
+  }
+  out_arg {
+    name: "momenta"
+    description: <<END
+Parameter momenta updated by the Momentum optimization algorithm.
+END
+  }
+  summary: "Retrieve Momentum embedding parameters."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingMomentumParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingMomentumParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0f00680bb35906a122c38eaead9702b842e62082
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingMomentumParametersGradAccumDebug.pbtxt
@@ -0,0 +1,28 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingMomentumParametersGradAccumDebug"
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the Momentum optimization algorithm.
+END
+  }
+  out_arg {
+    name: "momenta"
+    description: <<END
+Parameter momenta updated by the Momentum optimization algorithm.
+END
+  }
+  out_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Parameter gradient_accumulators updated by the Momentum optimization algorithm.
+END
+  }
+  summary: "Retrieve Momentum embedding parameters with debug support."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingProximalAdagradParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingProximalAdagradParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e279ca4fe9cc2c1768164053ce88617a9fa1bc9f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingProximalAdagradParameters.pbtxt
@@ -0,0 +1,22 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingProximalAdagradParameters"
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the proximal Adagrad optimization algorithm.
+END
+  }
+  out_arg {
+    name: "accumulators"
+    description: <<END
+Parameter accumulators updated by the proximal Adagrad optimization algorithm.
+END
+  }
+  summary: "Retrieve proximal Adagrad embedding parameters."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..37ae5db81c2d7fc24d0d175e6085e5868c5d0224
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug.pbtxt
@@ -0,0 +1,28 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug"
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the proximal Adagrad optimization algorithm.
+END
+  }
+  out_arg {
+    name: "accumulators"
+    description: <<END
+Parameter accumulators updated by the proximal Adagrad optimization algorithm.
+END
+  }
+  out_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Parameter gradient_accumulators updated by the proximal Adagrad optimization algorithm.
+END
+  }
+  summary: "Retrieve proximal Adagrad embedding parameters with debug support."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingRMSPropParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingRMSPropParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..aad8ca8dae67daaac71bb6b5eaaa661d6bc1c173
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingRMSPropParameters.pbtxt
@@ -0,0 +1,28 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingRMSPropParameters"
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the RMSProp optimization algorithm.
+END
+  }
+  out_arg {
+    name: "ms"
+    description: <<END
+Parameter ms updated by the RMSProp optimization algorithm.
+END
+  }
+  out_arg {
+    name: "mom"
+    description: <<END
+Parameter mom updated by the RMSProp optimization algorithm.
+END
+  }
+  summary: "Retrieve RMSProp embedding parameters."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1d9bd6659c5f01d8929ded1b9ff17f710eef4a84
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug.pbtxt
@@ -0,0 +1,34 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug"
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the RMSProp optimization algorithm.
+END
+  }
+  out_arg {
+    name: "ms"
+    description: <<END
+Parameter ms updated by the RMSProp optimization algorithm.
+END
+  }
+  out_arg {
+    name: "mom"
+    description: <<END
+Parameter mom updated by the RMSProp optimization algorithm.
+END
+  }
+  out_arg {
+    name: "gradient_accumulators"
+    description: <<END
+Parameter gradient_accumulators updated by the RMSProp optimization algorithm.
+END
+  }
+  summary: "Retrieve RMSProp embedding parameters with debug support."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingStochasticGradientDescentParameters.pbtxt b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingStochasticGradientDescentParameters.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..33ca8a7612c204d2cd95650f955ffdbc944fd0c2
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_RetrieveTPUEmbeddingStochasticGradientDescentParameters.pbtxt
@@ -0,0 +1,16 @@
+op {
+  graph_op_name: "RetrieveTPUEmbeddingStochasticGradientDescentParameters"
+  out_arg {
+    name: "parameters"
+    description: <<END
+Parameter parameters updated by the stochastic gradient descent optimization algorithm.
+END
+  }
+  summary: "Retrieve SGD embedding parameters."
+  description: <<END
+An op that retrieves optimization parameters from embedding to host
+memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+the correct embedding table configuration. For example, this op is
+used to retrieve updated parameters before saving a checkpoint.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ScaleAndTranslate.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScaleAndTranslate.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ff6794b07e2b4bf70430f779c7789af8c04d8875
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ScaleAndTranslate.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ScaleAndTranslate"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ScaleAndTranslateGrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScaleAndTranslateGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3cda76226cf346b1f528ec8ad53a16684457270e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ScaleAndTranslateGrad.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ScaleAndTranslateGrad"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterNdAdd.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterNdAdd.pbtxt
index 5929425bc80f218627a7977a7b4e869715f7963b..b8fbcbbed29de68088db9ee12ae86cde5c7d6aa8 100644
--- a/tensorflow/core/api_def/base_api/api_def_ScatterNdAdd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterNdAdd.pbtxt
@@ -35,14 +35,12 @@ be protected by a lock; otherwise the behavior is undefined,
 but may exhibit less contention.
 END
   }
-  summary: "Applies sparse addition between `updates` and individual values or slices"
+  summary: "Applies sparse addition to individual values or slices in a Variable."
   description: <<END
-within a given variable according to `indices`.
-
 `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
 
 `indices` must be integer tensor, containing indices into `ref`.
-It must be shape `\\([d_0, ..., d_{Q-2}, K]\\)` where `0 < K <= P`.
+It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
 
 The innermost dimension of `indices` (with length `K`) corresponds to
 indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
@@ -50,17 +48,21 @@ dimension of `ref`.
 
 `updates` is `Tensor` of rank `Q-1+P-K` with shape:
 
-$$[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].$$
+```
+[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]]
+```
 
-For example, say we want to add 4 scattered elements to a rank-1 tensor to 8
-elements. In Python, that addition would look like this:
+For example, say we want to add 4 scattered elements to a rank-1 tensor to
+8 elements. In Python, that addition would look like this:
 
-    ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
-    indices = tf.constant([[4], [3], [1], [7]])
-    updates = tf.constant([9, 10, 11, 12])
-    add = tf.scatter_nd_add(ref, indices, updates)
-    with tf.Session() as sess:
-      print sess.run(add)
+```python
+ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+indices = tf.constant([[4], [3], [1], [7]])
+updates = tf.constant([9, 10, 11, 12])
+add = tf.scatter_nd_add(ref, indices, updates)
+with tf.Session() as sess:
+  print sess.run(add)
+```
 
 The resulting update to ref would look like this:
 
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterNdSub.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterNdSub.pbtxt
index 67346f051e75b68bc98b0e9026849f1c0f512939..b557addb7ce872edb76199a071907c59c8454abb 100644
--- a/tensorflow/core/api_def/base_api/api_def_ScatterNdSub.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterNdSub.pbtxt
@@ -35,14 +35,14 @@ be protected by a lock; otherwise the behavior is undefined,
 but may exhibit less contention.
 END
   }
-  summary: "Applies sparse subtraction between `updates` and individual values or slices"
+  summary: "Applies sparse subtraction to individual values or slices in a Variable."
   description: <<END
 within a given variable according to `indices`.
 
 `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
 
 `indices` must be integer tensor, containing indices into `ref`.
-It must be shape \\([d_0, ..., d_{Q-2}, K]\\) where `0 < K <= P`.
+It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
 
 The innermost dimension of `indices` (with length `K`) corresponds to
 indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
@@ -50,17 +50,21 @@ dimension of `ref`.
 
 `updates` is `Tensor` of rank `Q-1+P-K` with shape:
 
-$$[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].$$
+```
+[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]]
+```
 
 For example, say we want to subtract 4 scattered elements from a rank-1 tensor
 with 8 elements. In Python, that subtraction would look like this:
 
-    ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
-    indices = tf.constant([[4], [3], [1], [7]])
-    updates = tf.constant([9, 10, 11, 12])
-    sub = tf.scatter_nd_sub(ref, indices, updates)
-    with tf.Session() as sess:
-      print sess.run(sub)
+```python
+ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+indices = tf.constant([[4], [3], [1], [7]])
+updates = tf.constant([9, 10, 11, 12])
+sub = tf.scatter_nd_sub(ref, indices, updates)
+with tf.Session() as sess:
+  print sess.run(sub)
+```
 
 The resulting update to ref would look like this:
 
diff --git a/tensorflow/core/api_def/base_api/api_def_SegmentMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_SegmentMax.pbtxt
index d33a36ce06c37092bd25e241b36f1c564070c6e2..d5643c8a79a92fad53b1737a80172fb56004327f 100644
--- a/tensorflow/core/api_def/base_api/api_def_SegmentMax.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SegmentMax.pbtxt
@@ -17,7 +17,7 @@ END
   summary: "Computes the maximum along segments of a tensor."
   description: <<END
 Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+[the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 for an explanation of segments.
 
 Computes a tensor such that
@@ -29,5 +29,15 @@ If the max is empty for a given segment ID `i`, `output[i] = 0`.
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
 </div>
+
+For example:
+
+```
+c = tf.constant([[1,2,3,4], [4, 3, 2, 1], [5,6,7,8]])
+tf.segment_max(c, tf.constant([0, 0, 1]))
+# ==> [[4, 3, 3, 4],
+#      [5, 6, 7, 8]]
+```
+
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_SegmentMean.pbtxt b/tensorflow/core/api_def/base_api/api_def_SegmentMean.pbtxt
index afdc39da96df01185af600409435faca49cabc0e..b03649ab077d893ceb17704c57060cde99be1db2 100644
--- a/tensorflow/core/api_def/base_api/api_def_SegmentMean.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SegmentMean.pbtxt
@@ -17,7 +17,7 @@ END
   summary: "Computes the mean along segments of a tensor."
   description: <<END
 Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+[the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 for an explanation of segments.
 
 Computes a tensor such that
@@ -30,5 +30,15 @@ If the mean is empty for a given segment ID `i`, `output[i] = 0`.
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMean.png" alt>
 </div>
+
+For example:
+
+```
+c = tf.constant([[1.0,2,3,4], [4, 3, 2, 1], [5,6,7,8]])
+tf.segment_mean(c, tf.constant([0, 0, 1]))
+# ==> [[2.5, 2.5, 2.5, 2.5],
+#      [5, 6, 7, 8]]
+```
+
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_SegmentMin.pbtxt b/tensorflow/core/api_def/base_api/api_def_SegmentMin.pbtxt
index 026b5b3991f9cbbfce9add2ff4cb7e370a1cc799..6796678555ef8f3bbf27c742db6f1c0c30c483bf 100644
--- a/tensorflow/core/api_def/base_api/api_def_SegmentMin.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SegmentMin.pbtxt
@@ -17,7 +17,7 @@ END
   summary: "Computes the minimum along segments of a tensor."
   description: <<END
 Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+[the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 for an explanation of segments.
 
 Computes a tensor such that
@@ -29,5 +29,14 @@ If the min is empty for a given segment ID `i`, `output[i] = 0`.
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMin.png" alt>
 </div>
+
+For example:
+
+```
+c = tf.constant([[1,2,3,4], [4, 3, 2, 1], [5,6,7,8]])
+tf.segment_min(c, tf.constant([0, 0, 1]))
+# ==> [[1, 2, 2, 1],
+#      [5, 6, 7, 8]]
+```
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_SegmentProd.pbtxt b/tensorflow/core/api_def/base_api/api_def_SegmentProd.pbtxt
index a168eed87f668c97141f2c8966c68866b82477de..10b368fcca4dab3bb197609e3e10189323bf9bc7 100644
--- a/tensorflow/core/api_def/base_api/api_def_SegmentProd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SegmentProd.pbtxt
@@ -17,7 +17,7 @@ END
   summary: "Computes the product along segments of a tensor."
   description: <<END
 Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+[the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 for an explanation of segments.
 
 Computes a tensor such that
@@ -29,5 +29,15 @@ If the product is empty for a given segment ID `i`, `output[i] = 1`.
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%" src="https://www.tensorflow.org/images/SegmentProd.png" alt>
 </div>
+
+For example:
+
+```
+c = tf.constant([[1,2,3,4], [4, 3, 2, 1], [5,6,7,8]])
+tf.segment_prod(c, tf.constant([0, 0, 1]))
+# ==> [[4, 6, 6, 4],
+#      [5, 6, 7, 8]]
+```
+
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_SegmentSum.pbtxt b/tensorflow/core/api_def/base_api/api_def_SegmentSum.pbtxt
index 876b8608240df108c66bceefa5f7eba82ddb7524..487a6d10746ce684c9f0e27f6336c22994fee3b4 100644
--- a/tensorflow/core/api_def/base_api/api_def_SegmentSum.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SegmentSum.pbtxt
@@ -17,7 +17,7 @@ END
   summary: "Computes the sum along segments of a tensor."
   description: <<END
 Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+[the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 for an explanation of segments.
 
 Computes a tensor such that
@@ -29,5 +29,15 @@ If the sum is empty for a given segment ID `i`, `output[i] = 0`.
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%" src="https://www.tensorflow.org/images/SegmentSum.png" alt>
 </div>
+
+For example:
+
+```
+c = tf.constant([[1,2,3,4], [4, 3, 2, 1], [5,6,7,8]])
+tf.segment_sum(c, tf.constant([0, 0, 1]))
+# ==> [[5, 5, 5, 5],
+#      [5, 6, 7, 8]]
+```
+
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_SendTPUEmbeddingGradients.pbtxt b/tensorflow/core/api_def/base_api/api_def_SendTPUEmbeddingGradients.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9a3be3d8eaff24b9d1e58b84ff6709f9448fa186
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_SendTPUEmbeddingGradients.pbtxt
@@ -0,0 +1,32 @@
+op {
+  graph_op_name: "SendTPUEmbeddingGradients"
+  in_arg {
+    name: "inputs"
+    description: <<END
+A TensorList of gradients with which to update embedding tables.
+This argument has the same length and shapes as the return value of
+RecvTPUEmbeddingActivations, but contains gradients of the model's loss
+with respect to the embedding activations. The embedding tables are updated
+from these gradients via the optimizer specified in the TPU embedding
+configuration given to tpu.initialize_system.
+END
+  }
+  in_arg {
+    name: "learning_rates"
+    description: <<END
+A TensorList of float32 scalars, one for each dynamic learning
+rate tag: see the comments in
+//third_party/tensorflow/core/protobuf/tpu/optimization_parameters.proto.
+Multiple tables can share the same dynamic learning rate tag as specified
+in the configuration. If the learning rates for all tables are constant,
+this list should be empty.
+END
+  }
+  attr {
+    name: "config"
+    description: <<END
+Serialized TPUEmbeddingConfiguration proto.
+END
+  }
+  summary: "Performs gradient updates of embedding tables."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ShardDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ShardDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cd537e05d7c97a2d43c14916b44eb32ae9b2efc1
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ShardDataset.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "ShardDataset"
+  visibility: HIDDEN
+  in_arg {
+    name: "num_shards"
+    description: <<END
+An integer representing the number of shards operating in parallel.
+END
+  }
+  in_arg {
+    name: "index"
+    description: <<END
+An integer representing the current worker index.
+END
+  }
+  summary: "Creates a `Dataset` that includes only 1/`num_shards` of this dataset."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ShutdownDistributedTPU.pbtxt b/tensorflow/core/api_def/base_api/api_def_ShutdownDistributedTPU.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..87d4e8d1a88dc13d49948bb847a9d876055f53f6
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ShutdownDistributedTPU.pbtxt
@@ -0,0 +1,7 @@
+op {
+  graph_op_name: "ShutdownDistributedTPU"
+  summary: "Shuts down a running distributed TPU system."
+  description: <<END
+The op returns an error if no system is running.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentMean.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentMean.pbtxt
index 138a6366c8aa1e5b9d876621b93c7d36f16f38e2..0bbc0780dfee4af8cfcb036264969dbd4ec7bbdf 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseSegmentMean.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentMean.pbtxt
@@ -21,9 +21,7 @@ END
   }
   summary: "Computes the mean along sparse segments of a tensor."
   description: <<END
-Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-for an explanation of segments.
+See `tf.sparse.segment_sum` for usage examples.
 
 Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
 dimension, selecting a subset of dimension 0, specified by `indices`.
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt
index b8073d88ac3d10cad6bc7771d3fe28bae905d8e5..65b2358830ed9eeccbc099b055b9f8b50e92afdc 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt
@@ -31,7 +31,7 @@ Like `SparseSegmentMean`, but allows missing ids in `segment_ids`. If an id is
 misisng, the `output` tensor at that position will be zeroed.
 
 Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+[the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 for an explanation of segments.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtN.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtN.pbtxt
index 945bbdcf627c48047ffa65c4c4e5124cbd96e54b..a28bd1a646445c660183d6e35b9e6df64637c4f3 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtN.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtN.pbtxt
@@ -23,8 +23,7 @@ END
   description: <<END
 N is the size of the segment being reduced.
 
-Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-for an explanation of segments.
+See `tf.sparse.segment_sum` for usage examples.
+
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt
index ff328c8a6195f9aca515de4d8a682b50df92117e..8a5d2bb02c4c42e3d67f6e01b8c609be84575270 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt
@@ -33,7 +33,7 @@ Like `SparseSegmentSqrtN`, but allows missing ids in `segment_ids`. If an id is
 misisng, the `output` tensor at that position will be zeroed.
 
 Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+[the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 for an explanation of segments.
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSum.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSum.pbtxt
index a68e14607f81e999f95e85b4481fb0474e691aa4..d7494dc8deb37927dce09ce3a854339c27758286 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSum.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSum.pbtxt
@@ -22,7 +22,7 @@ END
   summary: "Computes the sum along sparse segments of a tensor."
   description: <<END
 Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+[the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 for an explanation of segments.
 
 Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSumWithNumSegments.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSumWithNumSegments.pbtxt
index aa5c1fc8d0d698008787418ef24ecb3c0c635f6a..039ca9a23ba1abf2d67327ddb72bb49b8de1ab68 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseSegmentSumWithNumSegments.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseSegmentSumWithNumSegments.pbtxt
@@ -31,7 +31,7 @@ Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is
 misisng, the `output` tensor at that position will be zeroed.
 
 Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+[the section on segmentation](https://tensorflow.org/api_docs/python/tf/sparse#Segmentation)
 for an explanation of segments.
 
 For example:
diff --git a/tensorflow/core/api_def/base_api/api_def_StatefulStandardNormal.pbtxt b/tensorflow/core/api_def/base_api/api_def_StatefulStandardNormal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d963c55278d5b5638497d74677e6329a3aa615e0
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StatefulStandardNormal.pbtxt
@@ -0,0 +1,31 @@
+op {
+  graph_op_name: "StatefulStandardNormal"
+  in_arg {
+    name: "resource"
+    description: <<END
+The handle of the resource variable that stores the state of the RNG.
+END
+  }
+  in_arg {
+    name: "shape"
+    description: <<END
+The shape of the output tensor.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A tensor of the specified shape filled with random normal values.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The type of the output.
+END
+  }
+  summary: "Outputs random values from a normal distribution."
+  description: <<END
+The generated values will have mean 0 and standard deviation 1.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StatefulStandardNormalV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_StatefulStandardNormalV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8c145e05ac7d9d928dd22721a394df497f1aabab
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StatefulStandardNormalV2.pbtxt
@@ -0,0 +1,37 @@
+op {
+  graph_op_name: "StatefulStandardNormalV2"
+  in_arg {
+    name: "resource"
+    description: <<END
+The handle of the resource variable that stores the state of the RNG.
+END
+  }
+  in_arg {
+    name: "algorithm"
+    description: <<END
+The RNG algorithm.
+END
+  }
+  in_arg {
+    name: "shape"
+    description: <<END
+The shape of the output tensor.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A tensor of the specified shape filled with random normal values.
+END
+  }
+  attr {
+    name: "dtype"
+    description: <<END
+The type of the output.
+END
+  }
+  summary: "Outputs random values from a normal distribution."
+  description: <<END
+The generated values will have mean 0 and standard deviation 1.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StaticRegexReplace.pbtxt b/tensorflow/core/api_def/base_api/api_def_StaticRegexReplace.pbtxt
index e382bcec814ecd2944bdb5ba5bffbc6d980479e4..8bb88f491abb4f4142724509690b336578aec791 100644
--- a/tensorflow/core/api_def/base_api/api_def_StaticRegexReplace.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_StaticRegexReplace.pbtxt
@@ -14,7 +14,7 @@ op {
   }
   attr {
     name: "rewrite"
-    description: "The rewrite to be applied to the matched expresion."
+    description: "The rewrite to be applied to the matched expression."
   }
   attr {
     name: "replace_global"
diff --git a/tensorflow/core/api_def/base_api/api_def_StridedSliceAssign.pbtxt b/tensorflow/core/api_def/base_api/api_def_StridedSliceAssign.pbtxt
index 0fc89576ad29939837da7c55e393a0baeca90e5e..c5177612ef45573d2244eaaefceae8d0dbfbf2d5 100644
--- a/tensorflow/core/api_def/base_api/api_def_StridedSliceAssign.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_StridedSliceAssign.pbtxt
@@ -4,7 +4,7 @@ op {
   description: <<END
 The values of `value` are assigned to the positions in the variable
 `ref` that are selected by the slice parameters. The slice parameters
-`begin, `end`, `strides`, etc. work exactly as in `StridedSlice`.
+`begin`, `end`, `strides`, etc. work exactly as in `StridedSlice`.
 
 NOTE this op currently does not support broadcasting and so `value`'s
 shape must be exactly the shape produced by the slice of `ref`.
diff --git a/tensorflow/core/api_def/base_api/api_def_TPUCompilationResult.pbtxt b/tensorflow/core/api_def/base_api/api_def_TPUCompilationResult.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..98643e295c331f2ca17f27ead5e19c08c14a5523
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TPUCompilationResult.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TPUCompilationResult"
+  summary: "CompilationResultProto indicating the status of the TPU compilation."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TPUEmbeddingActivations.pbtxt b/tensorflow/core/api_def/base_api/api_def_TPUEmbeddingActivations.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0763a2644f8e8a59ea2e942386412e27b765f4ac
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TPUEmbeddingActivations.pbtxt
@@ -0,0 +1,37 @@
+op {
+  graph_op_name: "TPUEmbeddingActivations"
+  in_arg {
+    name: "embedding_variable"
+    description: <<END
+A trainable variable, enabling optimizers to find this op.
+END
+  }
+  in_arg {
+    name: "sliced_activations"
+    description: <<END
+The embedding activations Tensor to return.
+END
+  }
+  attr {
+    name: "table_id"
+    description: <<END
+The id of the table in the embedding layer configuration from which
+these activations were computed.
+END
+  }
+  attr {
+    name: "lookup_id"
+    description: <<END
+Identifier of the set of embedding indices which produced these
+activations.
+END
+  }
+  summary: "An op enabling differentiation of TPU Embeddings."
+  description: <<END
+This op simply returns its first input, which is assumed to have been sliced
+from the Tensors returned by TPUEmbeddingDequeueActivations. The presence of
+this op, and its first argument being a trainable Variable, enables automatic
+differentiation of graphs containing embeddings via the TPU Embedding Python
+libraries.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TPUOrdinalSelector.pbtxt b/tensorflow/core/api_def/base_api/api_def_TPUOrdinalSelector.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3c72abc68477184c582fdc26f8db7687b149b134
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TPUOrdinalSelector.pbtxt
@@ -0,0 +1,15 @@
+op {
+  graph_op_name: "TPUOrdinalSelector"
+  out_arg {
+    name: "device_ordinals"
+    description: <<END
+A vector 1 or more TPU cores.
+END
+  }
+  summary: "A TPU core selector Op."
+  description: <<END
+This Op produces a set of TPU cores (for warm-up) or a single TPU core
+(for regular inference) to execute the TPU program on. The output is
+consumed by TPUPartitionedCall.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TPUPartitionedCall.pbtxt b/tensorflow/core/api_def/base_api/api_def_TPUPartitionedCall.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4f384c21a9ce76c21a6b675048083172816d3b1e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TPUPartitionedCall.pbtxt
@@ -0,0 +1,40 @@
+op {
+  graph_op_name: "TPUPartitionedCall"
+  in_arg {
+    name: "args"
+    description: <<END
+The arguments to the function.
+END
+  }
+  in_arg {
+    name: "device_ordinal"
+    description: <<END
+The TPU device ordinal to run the function on.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The output of the function call.
+END
+  }
+  attr {
+    name: "Tin"
+    description: <<END
+The types of the arguments to the function.
+END
+  }
+  attr {
+    name: "Tout"
+    description: <<END
+The types of the outputs of the function.
+END
+  }
+  attr {
+    name: "f"
+    description: <<END
+The function to call.
+END
+  }
+  summary: "Calls a function placed on a specified TPU device."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TPUReplicate.pbtxt b/tensorflow/core/api_def/base_api/api_def_TPUReplicate.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5664dfb9e6e2b8c896e3f26ed8b7863c1c4beec1
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TPUReplicate.pbtxt
@@ -0,0 +1,99 @@
+op {
+  graph_op_name: "TPUReplicate"
+  in_arg {
+    name: "inputs"
+    description: <<END
+the inputs to 'computation', flattened, in replica-major order.
+END
+  }
+  in_arg {
+    name: "broadcast_inputs"
+    description: <<END
+additional arguments to broadcast to all replicas. The
+broadcast inputs are appended to the per-replica inputs when calling
+computation.
+END
+  }
+  in_arg {
+    name: "guaranteed_constants"
+    description: <<END
+arguments which have been guaranteed to not
+change their values during the session lifetime. These contain tensors marked as
+constant using the GuaranteeConstOp.
+END
+  }
+  out_arg {
+    name: "outputs"
+    description: <<END
+the outputs of 'computation'.
+END
+  }
+  attr {
+    name: "computation"
+    description: <<END
+a function containing the computation to run.
+END
+  }
+  attr {
+    name: "num_replicas"
+    description: <<END
+the number of replicas of the computation to run.
+END
+  }
+  attr {
+    name: "num_cores_per_replica"
+    description: <<END
+the number of logical cores in each replica.
+END
+  }
+  attr {
+    name: "topology"
+    description: <<END
+A serialized tensorflow.tpu.TopologyProto that describes the TPU
+topology.
+END
+  }
+  attr {
+    name: "use_tpu"
+    description: <<END
+a bool indicating if this computation will run on TPU or CPU/GPU.
+Currently, only supports a default placement (computation is placed on GPU
+if one is available, and on CPU if not).
+END
+  }
+  attr {
+    name: "device_assignment"
+    description: <<END
+a flattened array with shape
+[replica, num_cores_per_replica, mesh_dimension] that maps the coordinates
+of logical cores in each replica of a computation to physical coordinates in
+the TPU topology.
+END
+  }
+  attr {
+    name: "Tinputs"
+    description: <<END
+the types of the arguments to 'computation'.
+END
+  }
+  attr {
+    name: "Tbroadcast_inputs"
+    description: <<END
+the types of the additional arguments to broadcast to all
+replicas.
+END
+  }
+  attr {
+    name: "Tguaranteed_constants"
+    description: <<END
+the types of the arguments to 'guaranteed_constants'.
+END
+  }
+  attr {
+    name: "output_types"
+    description: <<END
+the types of the outputs of 'computation'.
+END
+  }
+  summary: "Runs replicated computations on a distributed TPU system."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TPUReplicateMetadata.pbtxt b/tensorflow/core/api_def/base_api/api_def_TPUReplicateMetadata.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b30d1b6218c3a6ea36d796ba77b2a7808b0be083
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TPUReplicateMetadata.pbtxt
@@ -0,0 +1,46 @@
+op {
+  graph_op_name: "TPUReplicateMetadata"
+  attr {
+    name: "num_replicas"
+    description: <<END
+Number of replicas of the computation
+END
+  }
+  attr {
+    name: "num_cores_per_replica"
+    description: <<END
+Number of cores per replica. Used for model parallelism.
+END
+  }
+  attr {
+    name: "topology"
+    description: <<END
+TopologyProto indicating the topology of the TPU pod slice.
+END
+  }
+  attr {
+    name: "use_tpu"
+    description: <<END
+Whether to place the computation on the TPU.
+END
+  }
+  attr {
+    name: "device_assignment"
+    description: <<END
+The assignment of devices for the computation.
+END
+  }
+  attr {
+    name: "computation_shape"
+    description: <<END
+DEPRECATED. Use num_cores_per_replica instead.
+END
+  }
+  attr {
+    name: "host_compute_core"
+  }
+  attr {
+    name: "padding_map"
+  }
+  summary: "Metadata indicaitng how the TPU computation should be replicated."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TPUReplicatedInput.pbtxt b/tensorflow/core/api_def/base_api/api_def_TPUReplicatedInput.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1bba92e7843c3be7cb9536a58cf93ee5ec6b916f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TPUReplicatedInput.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TPUReplicatedInput"
+  summary: "Connects N inputs to an N-way replicated TPU computation."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TPUReplicatedOutput.pbtxt b/tensorflow/core/api_def/base_api/api_def_TPUReplicatedOutput.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cab78c7496e85751ee7ba5cc0a92620751d6b2d0
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TPUReplicatedOutput.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TPUReplicatedOutput"
+  summary: "Connects outputs of an N-way replicated computation to N outputs."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorListConcatV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorListConcatV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9b2af2c23b715f5cdb804dd449bf1001a444e686
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorListConcatV2.pbtxt
@@ -0,0 +1,18 @@
+op {
+  graph_op_name: "TensorListConcatV2"
+  summary: "Concats all tensors in the list along the 0th dimension."
+  description: <<END
+Requires that all tensors have the same shape except the first dimension.
+
+input_handle: The input list.
+element_shape: The shape of the uninitialized elements in the list. If the first
+  dimension is not -1, it is assumed that all list elements have the same
+  leading dim.
+leading_dims: The list of leading dims of uninitialized list elements. Used if
+  the leading dim of input_handle.element_shape or the element_shape input arg
+  is not already set.
+tensor: The concated result.
+lengths: Output tensor containing sizes of the 0th dimension of tensors in the list, used for computing the gradient.
+
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorListResize.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorListResize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5b34f8cec7e1c62142d280ad43e11c14afef30e5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorListResize.pbtxt
@@ -0,0 +1,10 @@
+op {
+  graph_op_name: "TensorListResize"
+  summary: "Resizes the list."
+  description: <<END
+
+input_handle: the input list
+size: size of the output list
+
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorListScatterIntoExistingList.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorListScatterIntoExistingList.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..23da422971c485e8e8bba1a6b6cf1d9605d8ffbf
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorListScatterIntoExistingList.pbtxt
@@ -0,0 +1,13 @@
+op {
+  graph_op_name: "TensorListScatterIntoExistingList"
+  summary: "Scatters tensor at indices in an input list."
+  description: <<END
+Each member of the TensorList corresponds to one row of the input tensor,
+specified by the given index (see `tf.gather`).
+
+input_handle: The list to scatter into.
+tensor: The input tensor.
+indices: The indices used to index into the list.
+output_handle: The TensorList.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorListScatterV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorListScatterV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1f520900fc0ce06d3fd6bb9bff4e164260ba71f0
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorListScatterV2.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "TensorListScatterV2"
+  summary: "Creates a TensorList by indexing into a Tensor."
+  description: <<END
+Each member of the TensorList corresponds to one row of the input tensor,
+specified by the given index (see `tf.gather`).
+
+tensor: The input tensor.
+indices: The indices used to index into the list.
+element_shape: The shape of the elements in the list (can be less specified than
+  the shape of the tensor).
+num_elements: The size of the output list. Must be large enough to accommodate
+  the largest index in indices. If -1, the list is just large enough to include
+  the largest index in indices.
+output_handle: The TensorList.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TridiagonalSolve.pbtxt b/tensorflow/core/api_def/base_api/api_def_TridiagonalSolve.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..80f3675d551ce90435fb7b78969f70986a4e8c02
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TridiagonalSolve.pbtxt
@@ -0,0 +1,33 @@
+op {
+  graph_op_name: "TridiagonalSolve"
+  visibility: HIDDEN
+  in_arg {
+    name: "diagonals"
+    description: <<END
+Shape is `[..., 3, M]`.
+END
+  }
+  in_arg {
+    name: "rhs"
+    description: <<END
+Shape is `[..., M, K]`.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+Shape is `[..., M, K]`.
+END
+  }
+
+  summary: "Solves tridiagonal systems of equations."
+  description: <<END
+`diagonals` is a tensor of shape `[..., 3, M]` whose inner-most 2 dimensions
+represent matrices with three rows being the superdiagonal, diagonals, and
+subdiagonals, in order. The last element of the superdiagonal and the first
+element of the subdiagonal is ignored.
+`rhs` is a tensor of shape `[..., M, K]`, representing K right-hand sides per
+each left-hand side.
+The output is a tensor of shape `[..., M, K]` containing the solutions.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_UnicodeDecode.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnicodeDecode.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9b3f69023f1167fc3964a82a1e425d619ecc5521
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_UnicodeDecode.pbtxt
@@ -0,0 +1,76 @@
+op {
+  graph_op_name: "UnicodeDecode"
+  in_arg {
+    name: "input"
+    description: <<END
+The text to be decoded. Can have any shape. Note that the output is flattened
+to a vector of char values.
+END
+  }
+  out_arg {
+    name: "row_splits"
+    description: <<END
+A 1D int32 tensor containing the row splits.
+END
+  }
+  out_arg {
+    name: "char_values"
+    description: <<END
+A 1D int32 Tensor containing the decoded codepoints.
+END
+  }
+  attr {
+    name: "input_encoding"
+    description: <<END
+Text encoding of the input strings. This is any of the encodings supported
+by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
+END
+  }
+  attr {
+    name: "errors"
+    description: <<END
+Error handling policy when there is invalid formatting found in the input.
+The value of 'strict' will cause the operation to produce a InvalidArgument
+error on any invalid input formatting. A value of 'replace' (the default) will
+cause the operation to replace any invalid formatting in the input with the
+`replacement_char` codepoint. A value of 'ignore' will cause the operation to
+skip any invalid formatting in the input and produce no corresponding output
+character.
+END
+  }
+  attr {
+    name: "replacement_char"
+    description: <<END
+The replacement character codepoint to be used in place of any invalid
+formatting in the input when `errors='replace'`. Any valid unicode codepoint may
+be used. The default value is the default unicode replacement character is
+0xFFFD or U+65533.)
+END
+  }
+  attr {
+    name: "replace_control_characters"
+    description: <<END
+Whether to replace the C0 control characters (00-1F) with the
+`replacement_char`. Default is false.
+END
+  }
+  summary: <<END
+Decodes each string in `input` into a sequence of Unicode code points.
+END
+  description: <<END
+The character codepoints for all strings are returned using a single vector
+`char_values`, with strings expanded to characters in row-major order.
+
+The `row_splits` tensor indicates where the codepoints for
+each input string begin and end within the `char_values` tensor.
+In particular, the values for the `i`th
+string (in row-major order) are stored in the slice
+`[row_splits[i]:row_splits[i+1]]`. Thus:
+
+* `char_values[row_splits[i]+j]` is the Unicode codepoint for the `j`th
+  character in the `i`th string (in row-major order).
+* `row_splits[i+1] - row_splits[i]` is the number of characters in the `i`th
+  string (in row-major order).
+END
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt
index 7a60e4387ad0078d51eba026fcd2d9454a50e4ec..f282b9fab56a2735519ec56d3292867feb84750a 100644
--- a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMax.pbtxt
@@ -3,7 +3,8 @@ op {
   in_arg {
     name: "segment_ids"
     description: <<END
-A tensor whose shape is a prefix of `data.shape`.END
+A tensor whose shape is a prefix of `data.shape`.
+END
   }
   out_arg {
     name: "output"
@@ -16,7 +17,7 @@ END
   summary: "Computes the maximum along segments of a tensor."
   description: <<END
 Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+[the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 for an explanation of segments.
 
 This operator is similar to the unsorted segment sum operator found
@@ -36,5 +37,15 @@ dropped, and will not be included in the result.
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentMax.png" alt>
 </div>
+
+For example:
+
+``` python
+c = tf.constant([[1,2,3,4], [5,6,7,8], [4,3,2,1]])
+tf.unsorted_segment_max(c, tf.constant([0, 1, 0]), num_segments=2)
+# ==> [[ 4,  3, 3, 4],
+#       [5,  6, 7, 8]]
+```
+
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMin.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMin.pbtxt
index 7e139ddf4d9fac5cd47fdb56927cb325be45d54d..0360cc09d064295a45032905f652e56055b92986 100644
--- a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMin.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentMin.pbtxt
@@ -17,7 +17,7 @@ END
   summary: "Computes the minimum along segments of a tensor."
   description: <<END
 Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#segmentation)
+[the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 for an explanation of segments.
 
 This operator is similar to the unsorted segment sum operator found
@@ -31,6 +31,15 @@ If the minimum is empty for a given segment ID `i`, it outputs the largest
 possible value for the specific numeric type,
 `output[i] = numeric_limits<T>::max()`.
 
+For example:
+
+``` python
+c = tf.constant([[1,2,3,4], [5,6,7,8], [4,3,2,1]])
+tf.unsorted_segment_min(c, tf.constant([0, 1, 0]), num_segments=2)
+# ==> [[ 1,  2, 2, 1],
+#       [5,  6, 7, 8]]
+```
+
 If the given segment ID `i` is negative, then the corresponding value is
 dropped, and will not be included in the result.
 END
diff --git a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentProd.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentProd.pbtxt
index 9c8ea3b620832dba4f18a1cfbac953bad8bb6f56..67de4734bdedcf82acdcf993e1fb3e36d7b140d9 100644
--- a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentProd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentProd.pbtxt
@@ -17,7 +17,7 @@ END
   summary: "Computes the product along segments of a tensor."
   description: <<END
 Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#segmentation)
+[the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 for an explanation of segments.
 
 This operator is similar to the unsorted segment sum operator found
@@ -28,6 +28,15 @@ entries belonging to a segment such that:
 \\(output_i = \prod_{j...} data[j...]\\) where the product is over tuples
 `j...` such that `segment_ids[j...] == i`.
 
+For example:
+
+``` python
+c = tf.constant([[1,2,3,4], [5,6,7,8], [4,3,2,1]])
+tf.unsorted_segment_prod(c, tf.constant([0, 1, 0]), num_segments=2)
+# ==> [[ 4,  6, 6, 4],
+#       [5,  6, 7, 8]]
+```
+
 If there is no entry for a given segment ID `i`, it outputs 1.
 
 If the given segment ID `i` is negative, then the corresponding value is
diff --git a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentSum.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentSum.pbtxt
index 7e5d9265c2ead2028fa8bb80076ea40f858cff39..08139235f4a792c353dda4db667a135c823c6e5a 100644
--- a/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentSum.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_UnsortedSegmentSum.pbtxt
@@ -17,7 +17,7 @@ END
   summary: "Computes the sum along segments of a tensor."
   description: <<END
 Read
-[the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+[the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 for an explanation of segments.
 
 Computes a tensor such that
@@ -35,5 +35,13 @@ added to the sum of the segment.
 <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
 <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
 </div>
+
+``` python
+c = tf.constant([[1,2,3,4], [5,6,7,8], [4,3,2,1]])
+tf.unsorted_segment_sum(c, tf.constant([0, 1, 0]), num_segments=2)
+# ==> [[ 5,  5, 5, 5],
+#       [5,  6, 7, 8]]
+```
+
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_WorkerHeartbeat.pbtxt b/tensorflow/core/api_def/base_api/api_def_WorkerHeartbeat.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e886b04ca69e61bc76e2207b33f661f74506e9b3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_WorkerHeartbeat.pbtxt
@@ -0,0 +1,20 @@
+op {
+  graph_op_name: "WorkerHeartbeat"
+  in_arg {
+    name: "request"
+    description: <<END
+A string tensor containing a serialized WorkerHeartbeatRequest
+END
+  }
+  out_arg {
+    name: "response"
+    description: <<END
+A string tensor containing a serialized WorkerHeartbeatResponse
+END
+  }
+  summary: "Worker heartbeat op."
+  description: <<END
+Heartbeats may be sent periodically to indicate the coordinator is still active,
+to retrieve the current worker status and to expedite shutdown when necessary.
+END
+}
diff --git a/tensorflow/core/api_def/excluded_ops.cc b/tensorflow/core/api_def/excluded_ops.cc
index 02026e94abc5b3284578859e157279b27ba84446..65d2102ac80579b0ba6f9510cd7a95300cd10a3f 100644
--- a/tensorflow/core/api_def/excluded_ops.cc
+++ b/tensorflow/core/api_def/excluded_ops.cc
@@ -24,9 +24,9 @@ const std::unordered_set<std::string>* GetExcludedOps() {
            "GcsConfigureBlockCache", "GcsConfigureCredentials",
 #ifdef INTEL_MKL
            // QuantizedFusedOps for Intel CPU
-           "QuantizedConv2DAndRequantize", "QuantizedConv2DWithBias",
-           "QuantizedConv2DWithBiasAndRequantize", "QuantizedConv2DAndRelu",
-           "QuantizedConv2DAndReluAndRequantize",
+           "QuantizedConcatV2", "QuantizedConv2DAndRequantize",
+           "QuantizedConv2DWithBias", "QuantizedConv2DWithBiasAndRequantize",
+           "QuantizedConv2DAndRelu", "QuantizedConv2DAndReluAndRequantize",
            "QuantizedConv2DWithBiasAndRelu",
            "QuantizedConv2DWithBiasAndReluAndRequantize",
            "QuantizedConv2DWithBiasSumAndRelu",
diff --git a/tensorflow/core/api_def/java_api/api_def_Abort.pbtxt b/tensorflow/core/api_def/java_api/api_def_Abort.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..58448c2d17b6f4d323e5b4c041bcbdf559c98a5a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Abort.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Abort"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Abs.pbtxt b/tensorflow/core/api_def/java_api/api_def_Abs.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ece45cf73f336792c532040370cc44f8709e397a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Abs.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Abs"
+  endpoint {
+    name: "math.Abs"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AccumulateNV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_AccumulateNV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0c7a080c1159a1b4885c6d8e14cddf3ca4d07ae1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AccumulateNV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "AccumulateNV2"
+  endpoint {
+    name: "math.AccumulateN"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AccumulatorApplyGradient.pbtxt b/tensorflow/core/api_def/java_api/api_def_AccumulatorApplyGradient.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..49b7acad7d829838c3ba40cbdb97f1bafc96306d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AccumulatorApplyGradient.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "AccumulatorApplyGradient"
+  endpoint {
+    name: "train.AccumulatorApplyGradient"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AccumulatorNumAccumulated.pbtxt b/tensorflow/core/api_def/java_api/api_def_AccumulatorNumAccumulated.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1c42e819bf4d5e7bf80d42f96d13961c844f0eb8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AccumulatorNumAccumulated.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "AccumulatorNumAccumulated"
+  endpoint {
+    name: "train.AccumulatorNumAccumulated"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AccumulatorSetGlobalStep.pbtxt b/tensorflow/core/api_def/java_api/api_def_AccumulatorSetGlobalStep.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ca85302cdb4bbb1833eda5f63ce15a925ba5ee3e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AccumulatorSetGlobalStep.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "AccumulatorSetGlobalStep"
+  endpoint {
+    name: "train.AccumulatorSetGlobalStep"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AccumulatorTakeGradient.pbtxt b/tensorflow/core/api_def/java_api/api_def_AccumulatorTakeGradient.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4883802c637e0d9298b4807b9ef25c2e32f2476e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AccumulatorTakeGradient.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "AccumulatorTakeGradient"
+  endpoint {
+    name: "train.AccumulatorTakeGradient"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Acos.pbtxt b/tensorflow/core/api_def/java_api/api_def_Acos.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..847986b429de1f041e28819c33d6a1894f91f229
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Acos.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Acos"
+  endpoint {
+    name: "math.Acos"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Acosh.pbtxt b/tensorflow/core/api_def/java_api/api_def_Acosh.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..76d8f5fad05aae0372fae02d03c4f1da9af7343d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Acosh.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Acosh"
+  endpoint {
+    name: "math.Acosh"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Add.pbtxt b/tensorflow/core/api_def/java_api/api_def_Add.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4f78ccc9ea6ec7e2ca5960d384dab1ae0b85cb47
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Add.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Add"
+  endpoint {
+    name: "math.Add"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AddManySparseToTensorsMap.pbtxt b/tensorflow/core/api_def/java_api/api_def_AddManySparseToTensorsMap.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e009ba19d34f742823d00e4740260a35ef0e7b95
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AddManySparseToTensorsMap.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "AddManySparseToTensorsMap"
+  endpoint {
+    name: "sparse.AddManySparseToTensorsMap"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AddN.pbtxt b/tensorflow/core/api_def/java_api/api_def_AddN.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..20d469ae731ec7cde431988ff198474c67c9d694
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AddN.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "AddN"
+  endpoint {
+    name: "math.AddN"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AddSparseToTensorsMap.pbtxt b/tensorflow/core/api_def/java_api/api_def_AddSparseToTensorsMap.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0bb20186de38f681d659c744f686738adb5e76cf
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AddSparseToTensorsMap.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "AddSparseToTensorsMap"
+  endpoint {
+    name: "sparse.AddSparseToTensorsMap"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AddV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_AddV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a070c6a51939639b7820572d8d464c79a7cd1ccb
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AddV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "AddV2"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AdjustContrast.pbtxt b/tensorflow/core/api_def/java_api/api_def_AdjustContrast.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..daad141027a7d6c36a0624c7ce3b92a7cb409b6c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AdjustContrast.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "AdjustContrast"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AdjustContrastv2.pbtxt b/tensorflow/core/api_def/java_api/api_def_AdjustContrastv2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..81f565c1d594754a3889abb0debee81ab8bf746d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AdjustContrastv2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "AdjustContrastv2"
+  endpoint {
+    name: "image.AdjustContrast"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AdjustHue.pbtxt b/tensorflow/core/api_def/java_api/api_def_AdjustHue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0847cad4031f281f65ea19dfddccdbf1f25bc5e1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AdjustHue.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "AdjustHue"
+  endpoint {
+    name: "image.AdjustHue"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AdjustSaturation.pbtxt b/tensorflow/core/api_def/java_api/api_def_AdjustSaturation.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d685636eb12426b4755b67d55fd5f986b7a285e4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AdjustSaturation.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "AdjustSaturation"
+  endpoint {
+    name: "image.AdjustSaturation"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_All.pbtxt b/tensorflow/core/api_def/java_api/api_def_All.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a6459c56b71f359bad5a2fda9e605eb25471e5a1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_All.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "All"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AllCandidateSampler.pbtxt b/tensorflow/core/api_def/java_api/api_def_AllCandidateSampler.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..607c208a460b923df35da8f542402380c8cdebae
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AllCandidateSampler.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "AllCandidateSampler"
+  endpoint {
+    name: "random.AllCandidateSampler"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Angle.pbtxt b/tensorflow/core/api_def/java_api/api_def_Angle.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a92ccf357dbd1be80b946ea6683e48f30de5f918
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Angle.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Angle"
+  endpoint {
+    name: "math.Angle"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AnonymousIterator.pbtxt b/tensorflow/core/api_def/java_api/api_def_AnonymousIterator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..894f85ae88e7961db328d842a358879df71dd9dc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AnonymousIterator.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "AnonymousIterator"
+  endpoint {
+    name: "data.AnonymousIterator"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Any.pbtxt b/tensorflow/core/api_def/java_api/api_def_Any.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..20b36eda3f8c4cb231b39a2a88f45f756ab42326
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Any.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Any"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ApplyAdaMax.pbtxt b/tensorflow/core/api_def/java_api/api_def_ApplyAdaMax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..583f164e06c17f1f0192a2a30d22665f05d0f2df
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ApplyAdaMax.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ApplyAdaMax"
+  endpoint {
+    name: "train.ApplyAdaMax"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ApplyAdadelta.pbtxt b/tensorflow/core/api_def/java_api/api_def_ApplyAdadelta.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e672a8ef03bcec665878fd2c927cff7458b70af6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ApplyAdadelta.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ApplyAdadelta"
+  endpoint {
+    name: "train.ApplyAdadelta"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ApplyAdagrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_ApplyAdagrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..980c57c5fecc0d93655efd781efcadfa2163061c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ApplyAdagrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ApplyAdagrad"
+  endpoint {
+    name: "train.ApplyAdagrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ApplyAdagradDA.pbtxt b/tensorflow/core/api_def/java_api/api_def_ApplyAdagradDA.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..815df985ef98d18fd45bce603416aea4e1c90387
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ApplyAdagradDA.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ApplyAdagradDA"
+  endpoint {
+    name: "train.ApplyAdagradDa"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ApplyAdam.pbtxt b/tensorflow/core/api_def/java_api/api_def_ApplyAdam.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..56461b1d3d582d728976a3685ab3d42d4fa90caa
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ApplyAdam.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ApplyAdam"
+  endpoint {
+    name: "train.ApplyAdam"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ApplyAddSign.pbtxt b/tensorflow/core/api_def/java_api/api_def_ApplyAddSign.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b54ff6eca44a4103e08bef4f69f86e5283949863
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ApplyAddSign.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ApplyAddSign"
+  endpoint {
+    name: "train.ApplyAddSign"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ApplyCenteredRMSProp.pbtxt b/tensorflow/core/api_def/java_api/api_def_ApplyCenteredRMSProp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1b831bca43675334edc0e7a0cc2565d3e1019f9b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ApplyCenteredRMSProp.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ApplyCenteredRMSProp"
+  endpoint {
+    name: "train.ApplyCenteredRmsProp"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ApplyFtrl.pbtxt b/tensorflow/core/api_def/java_api/api_def_ApplyFtrl.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..176de19a9a7f8ac71bbb8038aa20dc26b19b9452
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ApplyFtrl.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ApplyFtrl"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ApplyFtrlV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_ApplyFtrlV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..da0fc8fcbf794ed17e4c04291719b67721669da6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ApplyFtrlV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ApplyFtrlV2"
+  endpoint {
+    name: "train.ApplyFtrl"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ApplyGradientDescent.pbtxt b/tensorflow/core/api_def/java_api/api_def_ApplyGradientDescent.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1fa569ed329d73b5179fd0d00c2d21035299820e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ApplyGradientDescent.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ApplyGradientDescent"
+  endpoint {
+    name: "train.ApplyGradientDescent"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ApplyMomentum.pbtxt b/tensorflow/core/api_def/java_api/api_def_ApplyMomentum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..96c21199f0902a97846e86362c64b49491fdea57
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ApplyMomentum.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ApplyMomentum"
+  endpoint {
+    name: "train.ApplyMomentum"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ApplyPowerSign.pbtxt b/tensorflow/core/api_def/java_api/api_def_ApplyPowerSign.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e5c22347556d0cccf335ac9d5f217b5c459e5afc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ApplyPowerSign.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ApplyPowerSign"
+  endpoint {
+    name: "train.ApplyPowerSign"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ApplyProximalAdagrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_ApplyProximalAdagrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a52d8c3591c13a8d9843856f4845cf4b762183fa
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ApplyProximalAdagrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ApplyProximalAdagrad"
+  endpoint {
+    name: "train.ApplyProximalAdagrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ApplyProximalGradientDescent.pbtxt b/tensorflow/core/api_def/java_api/api_def_ApplyProximalGradientDescent.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..74ea29cf8882436e9d27a3ddcc1b43ff7a87b460
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ApplyProximalGradientDescent.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ApplyProximalGradientDescent"
+  endpoint {
+    name: "train.ApplyProximalGradientDescent"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ApplyRMSProp.pbtxt b/tensorflow/core/api_def/java_api/api_def_ApplyRMSProp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..90171ccc759c1cef4cccc2c5ee44bfd7571d0145
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ApplyRMSProp.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ApplyRMSProp"
+  endpoint {
+    name: "train.ApplyRmsProp"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ApproximateEqual.pbtxt b/tensorflow/core/api_def/java_api/api_def_ApproximateEqual.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..029dc6d29305049af5c818d05f5a4b13e53443ea
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ApproximateEqual.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ApproximateEqual"
+  endpoint {
+    name: "math.ApproximateEqual"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ArgMax.pbtxt b/tensorflow/core/api_def/java_api/api_def_ArgMax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f9effd49c4a68f79de7473308490c576775ae2fc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ArgMax.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ArgMax"
+  endpoint {
+    name: "math.ArgMax"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ArgMin.pbtxt b/tensorflow/core/api_def/java_api/api_def_ArgMin.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5ff04c0d1ab01ab2757fd18dff22755681f0a96d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ArgMin.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ArgMin"
+  endpoint {
+    name: "math.ArgMin"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AsString.pbtxt b/tensorflow/core/api_def/java_api/api_def_AsString.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e8c875ea8141d52d29bd7ef467f97d01b201187c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AsString.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "AsString"
+  endpoint {
+    name: "dtypes.AsString"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Asin.pbtxt b/tensorflow/core/api_def/java_api/api_def_Asin.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8ffc8e3e570eeee82c69503e9f56f5ff2c9ebc19
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Asin.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Asin"
+  endpoint {
+    name: "math.Asin"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Asinh.pbtxt b/tensorflow/core/api_def/java_api/api_def_Asinh.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e3b30dd51254efd628e41615a9d08dc100f284f4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Asinh.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Asinh"
+  endpoint {
+    name: "math.Asinh"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Assert.pbtxt b/tensorflow/core/api_def/java_api/api_def_Assert.pbtxt
index b1f868897d5b88ac76eb8f85ace99c4ce3c3e037..a9e107b4780ab2405db65cf29a369495051b4c64 100644
--- a/tensorflow/core/api_def/java_api/api_def_Assert.pbtxt
+++ b/tensorflow/core/api_def/java_api/api_def_Assert.pbtxt
@@ -1,4 +1,6 @@
 op {
-  graph_op_name: "Assert" #TODO(karllessard) escape that reserved name
-  visibility: HIDDEN
+  graph_op_name: "Assert"
+  endpoint {
+    name: "AssertThat"
+  }
 }
diff --git a/tensorflow/core/api_def/java_api/api_def_Assign.pbtxt b/tensorflow/core/api_def/java_api/api_def_Assign.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..15d778f61e86ada53f5be1c7e2fc29c78f37333b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Assign.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Assign"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AssignAdd.pbtxt b/tensorflow/core/api_def/java_api/api_def_AssignAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a4118b64afd98192523d372aed99b8717d3ca9fb
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AssignAdd.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "AssignAdd"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AssignAddVariableOp.pbtxt b/tensorflow/core/api_def/java_api/api_def_AssignAddVariableOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..05fecb191bf75f92f23a384f864d3d8c33d43489
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AssignAddVariableOp.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "AssignAddVariableOp"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AssignSub.pbtxt b/tensorflow/core/api_def/java_api/api_def_AssignSub.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..aaf9246a6ac2d809afec616842c11fdaa48c37e5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AssignSub.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "AssignSub"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AssignSubVariableOp.pbtxt b/tensorflow/core/api_def/java_api/api_def_AssignSubVariableOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6e8791aed2d9f7a72ae18e343c307cb46dd52694
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AssignSubVariableOp.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "AssignSubVariableOp"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AssignVariableOp.pbtxt b/tensorflow/core/api_def/java_api/api_def_AssignVariableOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..980e6968269e1ac35193920575f0619a4fba4a16
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AssignVariableOp.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "AssignVariableOp"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Atan.pbtxt b/tensorflow/core/api_def/java_api/api_def_Atan.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e51aee9abc4d4b966dc59af4004b89618b9b09e5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Atan.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Atan"
+  endpoint {
+    name: "math.Atan"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Atan2.pbtxt b/tensorflow/core/api_def/java_api/api_def_Atan2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..302b05f9dce7c8383253f06d0f5f60191e110d54
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Atan2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Atan2"
+  endpoint {
+    name: "math.Atan2"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Atanh.pbtxt b/tensorflow/core/api_def/java_api/api_def_Atanh.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b9c4a4115443525152aaef949a30106b6a3cbeb8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Atanh.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Atanh"
+  endpoint {
+    name: "math.Atanh"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AudioSpectrogram.pbtxt b/tensorflow/core/api_def/java_api/api_def_AudioSpectrogram.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bd8f3a5e3353d1acabf0e264c4de09416af49ec0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AudioSpectrogram.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "AudioSpectrogram"
+  endpoint {
+    name: "audio.AudioSpectrogram"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AudioSummary.pbtxt b/tensorflow/core/api_def/java_api/api_def_AudioSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..13d30de29dc78642b421087040000dc97b8c7963
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AudioSummary.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "AudioSummary"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AudioSummaryV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_AudioSummaryV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e4eda8b09ab44f933ef8ae650cfc39aaeece8d7b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AudioSummaryV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "AudioSummaryV2"
+  endpoint {
+    name: "summary.AudioSummary"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AvgPool.pbtxt b/tensorflow/core/api_def/java_api/api_def_AvgPool.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..10d87802f0d85379c5789b897bd08dab1d5ec1a7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AvgPool.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "AvgPool"
+  endpoint {
+    name: "nn.AvgPool"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AvgPool3D.pbtxt b/tensorflow/core/api_def/java_api/api_def_AvgPool3D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1ae2794f48b9b1174cf8de0f3d18259a2ab0d3a3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AvgPool3D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "AvgPool3D"
+  endpoint {
+    name: "nn.AvgPool3d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AvgPool3DGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_AvgPool3DGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..09aba78ca209abe86700e6afa5181e7222e1e580
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AvgPool3DGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "AvgPool3DGrad"
+  endpoint {
+    name: "nn.AvgPool3dGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_AvgPoolGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_AvgPoolGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fc8fec89b992c961a5f78208801d5a7a1e754d53
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_AvgPoolGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "AvgPoolGrad"
+  endpoint {
+    name: "nn.AvgPoolGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Barrier.pbtxt b/tensorflow/core/api_def/java_api/api_def_Barrier.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6e282ca7b390c9c2334224dc8049e828582de370
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Barrier.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Barrier"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BarrierClose.pbtxt b/tensorflow/core/api_def/java_api/api_def_BarrierClose.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0307318763b8450b7a0f42b0df90bae64162e394
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BarrierClose.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BarrierClose"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BarrierIncompleteSize.pbtxt b/tensorflow/core/api_def/java_api/api_def_BarrierIncompleteSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fb11b18e951e75e476fddd2c7f876c69013bef5f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BarrierIncompleteSize.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BarrierIncompleteSize"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BarrierInsertMany.pbtxt b/tensorflow/core/api_def/java_api/api_def_BarrierInsertMany.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..32e29f00158ae147399dd9d71a5f0a5d1fa95d52
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BarrierInsertMany.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BarrierInsertMany"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BarrierReadySize.pbtxt b/tensorflow/core/api_def/java_api/api_def_BarrierReadySize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0ed50b257994ed0466eb5f26612d02f306ddd8ab
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BarrierReadySize.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BarrierReadySize"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BarrierTakeMany.pbtxt b/tensorflow/core/api_def/java_api/api_def_BarrierTakeMany.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..21f08878c6d76a4426da0448cc55e44283d25305
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BarrierTakeMany.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BarrierTakeMany"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Batch.pbtxt b/tensorflow/core/api_def/java_api/api_def_Batch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2c21faf72d5c0850d9761f8c98ee9ee892e9c293
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Batch.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Batch"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchCholesky.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchCholesky.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..15048109fd64c3f2ef66341f96f87fe7cbe3717a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchCholesky.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchCholesky"
+  endpoint {
+    name: "linalg.BatchCholesky"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchCholeskyGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchCholeskyGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..eb0e2c6bc83c3ff93e9a635fae0e3e23b7333a23
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchCholeskyGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchCholeskyGrad"
+  endpoint {
+    name: "linalg.BatchCholeskyGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0bb7298ba90625fcb6a9b5227277db9b86e21bf6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BatchDataset"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchDatasetV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchDatasetV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cd81b0b1cf44c20c0e8c3d51deb77e450e8a5b96
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchDatasetV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchDatasetV2"
+  endpoint {
+    name: "data.BatchDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchFFT.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchFFT.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4dda7c1fb61ac2c6336582b99c2b4ebc23cc808b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchFFT.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchFFT"
+  endpoint {
+    name: "signal.BatchFft"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchFFT2D.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchFFT2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e11860138a23888b5b51634bf0e6082570d15fc9
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchFFT2D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchFFT2D"
+  endpoint {
+    name: "signal.BatchFft2d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchFFT3D.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchFFT3D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3be0b516d0e73acada03f1be1dd0816def291c1a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchFFT3D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchFFT3D"
+  endpoint {
+    name: "signal.BatchFft3d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchFunction.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchFunction.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8789dc6acb6355a0079dd85d36a0da9e1c675a94
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchFunction.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BatchFunction"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchIFFT.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchIFFT.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..de37ada148acde00333b377288876df6d38994c1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchIFFT.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchIFFT"
+  endpoint {
+    name: "signal.BatchIfft"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchIFFT2D.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchIFFT2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4ae7fb4cb0ae2dbd476617a350be79f8107af4f8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchIFFT2D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchIFFT2D"
+  endpoint {
+    name: "signal.BatchIfft2d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchIFFT3D.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchIFFT3D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0ecb52714b53419447922b5aa97cb18f3c413b56
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchIFFT3D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchIFFT3D"
+  endpoint {
+    name: "signal.BatchIfft3d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchMatMul.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchMatMul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..95aa6446157deca2318b9e0ae417b18748b01f31
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchMatMul.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchMatMul"
+  endpoint {
+    name: "linalg.BatchMatMul"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchMatrixBandPart.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchMatrixBandPart.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..de989c6d527e45322fd7cd668a67afaffb32e9c7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchMatrixBandPart.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchMatrixBandPart"
+  endpoint {
+    name: "linalg.BatchMatrixBandPart"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchMatrixDeterminant.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchMatrixDeterminant.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a45fe25d10c0dbc205d4e5d1424c3a6c5ae9d166
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchMatrixDeterminant.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchMatrixDeterminant"
+  endpoint {
+    name: "linalg.BatchMatrixDeterminant"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchMatrixDiag.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchMatrixDiag.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d85d76f02f04114d5ef8a12bad6136d550b4eb95
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchMatrixDiag.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchMatrixDiag"
+  endpoint {
+    name: "linalg.BatchMatrixDiag"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchMatrixDiagPart.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchMatrixDiagPart.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4b5350b11eedcbd8b47ec7977bd275f633671561
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchMatrixDiagPart.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchMatrixDiagPart"
+  endpoint {
+    name: "linalg.BatchMatrixDiagPart"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchMatrixInverse.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchMatrixInverse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f40ea50d4bcb878ce04609460bada01c17ccad2c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchMatrixInverse.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchMatrixInverse"
+  endpoint {
+    name: "linalg.BatchMatrixInverse"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchMatrixSetDiag.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchMatrixSetDiag.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ac4cd6889b63a562643d5d1bbd4d9b0686d224ff
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchMatrixSetDiag.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchMatrixSetDiag"
+  endpoint {
+    name: "linalg.BatchMatrixSetDiag"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchMatrixSolve.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchMatrixSolve.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..97435acb4e49cc1c2cf10e969dd9ab052da5f61a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchMatrixSolve.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchMatrixSolve"
+  endpoint {
+    name: "linalg.BatchMatrixSolve"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchMatrixSolveLs.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchMatrixSolveLs.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..aee0b4add3577ee97e5a4eac802e6fda47153585
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchMatrixSolveLs.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchMatrixSolveLs"
+  endpoint {
+    name: "linalg.BatchMatrixSolveLs"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchMatrixTriangularSolve.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchMatrixTriangularSolve.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..554eff15747871acdb5248b1488004e5705d1fb9
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchMatrixTriangularSolve.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchMatrixTriangularSolve"
+  endpoint {
+    name: "linalg.BatchMatrixTriangularSolve"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchNormWithGlobalNormalization.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchNormWithGlobalNormalization.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8285ac284d8427cc7334747891e799e3ebc441b9
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchNormWithGlobalNormalization.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchNormWithGlobalNormalization"
+  endpoint {
+    name: "nn.BatchNormWithGlobalNormalization"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchNormWithGlobalNormalizationGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchNormWithGlobalNormalizationGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7b18bf52accb2ef990ba96719d8fa97643fff4ea
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchNormWithGlobalNormalizationGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchNormWithGlobalNormalizationGrad"
+  endpoint {
+    name: "nn.BatchNormWithGlobalNormalizationGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchSelfAdjointEig.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchSelfAdjointEig.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..517030fd692d4c8641615338eb4e376cbaaa86a4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchSelfAdjointEig.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BatchSelfAdjointEig"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchSelfAdjointEigV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchSelfAdjointEigV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9c973443902563cce4adda3fdc6d526d6fa740e1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchSelfAdjointEigV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchSelfAdjointEigV2"
+  endpoint {
+    name: "linalg.BatchSelfAdjointEig"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchSvd.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchSvd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8696359df8d5b130979681e190aaab89c230243e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchSvd.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchSvd"
+  endpoint {
+    name: "linalg.BatchSvd"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchToSpace.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchToSpace.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..affbc519e514e39a86736121c56947fcf9075353
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchToSpace.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BatchToSpace"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BatchToSpaceND.pbtxt b/tensorflow/core/api_def/java_api/api_def_BatchToSpaceND.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6c7d2fbdb9fed77d3c9484b2a8442e7a16179641
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BatchToSpaceND.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BatchToSpaceND"
+  endpoint {
+    name: "BatchToSpaceNd"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BesselI0e.pbtxt b/tensorflow/core/api_def/java_api/api_def_BesselI0e.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..84eb3b5e71d6e67ce36e9ed0103468442a974fe1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BesselI0e.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BesselI0e"
+  endpoint {
+    name: "math.BesselI0e"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BesselI1e.pbtxt b/tensorflow/core/api_def/java_api/api_def_BesselI1e.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..43f9113b0bbe53a076719226b659f5598bb1c919
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BesselI1e.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BesselI1e"
+  endpoint {
+    name: "math.BesselI1e"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Betainc.pbtxt b/tensorflow/core/api_def/java_api/api_def_Betainc.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6e9956d9ec72df62cc5db845c8f15753d2e1bc7d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Betainc.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Betainc"
+  endpoint {
+    name: "math.Betainc"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BiasAdd.pbtxt b/tensorflow/core/api_def/java_api/api_def_BiasAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..eb3be23bd9a14b376c2e127137a694afbf95bd32
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BiasAdd.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BiasAdd"
+  endpoint {
+    name: "nn.BiasAdd"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BiasAddGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_BiasAddGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4e040bf6df807f7395381572dee931ec188ea724
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BiasAddGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BiasAddGrad"
+  endpoint {
+    name: "nn.BiasAddGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BiasAddV1.pbtxt b/tensorflow/core/api_def/java_api/api_def_BiasAddV1.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..651c434e6459a9e1bbc2bb399572a3752bdb9569
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BiasAddV1.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BiasAddV1"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BigQueryReader.pbtxt b/tensorflow/core/api_def/java_api/api_def_BigQueryReader.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5b6e11687a2d73e706ebaa33c3c122bb43796f97
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BigQueryReader.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BigQueryReader"
+  endpoint {
+    name: "io.BigQueryReader"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Bincount.pbtxt b/tensorflow/core/api_def/java_api/api_def_Bincount.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b894fd6ec5e5266bfdafd4866e4099479f0aecea
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Bincount.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Bincount"
+  endpoint {
+    name: "math.Bincount"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Bitcast.pbtxt b/tensorflow/core/api_def/java_api/api_def_Bitcast.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9d2db26851d02076c17f802a89d04e257f407f68
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Bitcast.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Bitcast"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BitwiseAnd.pbtxt b/tensorflow/core/api_def/java_api/api_def_BitwiseAnd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..db5fada2461e313e40a755b0974cc061a960e1c7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BitwiseAnd.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BitwiseAnd"
+  endpoint {
+    name: "bitwise.BitwiseAnd"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BitwiseOr.pbtxt b/tensorflow/core/api_def/java_api/api_def_BitwiseOr.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8f9d1bc2fe4dcf6d9a7836e18f62edeb02795547
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BitwiseOr.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BitwiseOr"
+  endpoint {
+    name: "bitwise.BitwiseOr"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BitwiseXor.pbtxt b/tensorflow/core/api_def/java_api/api_def_BitwiseXor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..28f405b8adac55d336985aa74f1dc44dbe2e2d46
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BitwiseXor.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BitwiseXor"
+  endpoint {
+    name: "bitwise.BitwiseXor"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BoostedTreesBucketize.pbtxt b/tensorflow/core/api_def/java_api/api_def_BoostedTreesBucketize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..acec845ca4708c0a0e7d90d5ce380dab3f074eb0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BoostedTreesBucketize.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BoostedTreesBucketize"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt b/tensorflow/core/api_def/java_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fa22216ed5b92a30ff7d64f1924d128b45cf5111
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BoostedTreesCalculateBestGainsPerFeature"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BoostedTreesCenterBias.pbtxt b/tensorflow/core/api_def/java_api/api_def_BoostedTreesCenterBias.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1b65775a126ff9d29635cd066214a6ef48c4b604
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BoostedTreesCenterBias.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BoostedTreesCenterBias"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BoostedTreesCreateEnsemble.pbtxt b/tensorflow/core/api_def/java_api/api_def_BoostedTreesCreateEnsemble.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..194251d4337bc3df80c33d8ad3fa2281df74c110
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BoostedTreesCreateEnsemble.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BoostedTreesCreateEnsemble"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BoostedTreesCreateQuantileStreamResource.pbtxt b/tensorflow/core/api_def/java_api/api_def_BoostedTreesCreateQuantileStreamResource.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7d9c8c9229c61e75f6bc8d6fdc08fa2617077f48
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BoostedTreesCreateQuantileStreamResource.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BoostedTreesCreateQuantileStreamResource"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BoostedTreesDeserializeEnsemble.pbtxt b/tensorflow/core/api_def/java_api/api_def_BoostedTreesDeserializeEnsemble.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2ed17ca30f495d8de16c62861f7365b79ca01040
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BoostedTreesDeserializeEnsemble.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BoostedTreesDeserializeEnsemble"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BoostedTreesEnsembleResourceHandleOp.pbtxt b/tensorflow/core/api_def/java_api/api_def_BoostedTreesEnsembleResourceHandleOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a44c86614a30ff8f2686191cadb8d386f3c493a1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BoostedTreesEnsembleResourceHandleOp.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BoostedTreesEnsembleResourceHandleOp"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BoostedTreesExampleDebugOutputs.pbtxt b/tensorflow/core/api_def/java_api/api_def_BoostedTreesExampleDebugOutputs.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4d6f276911617edc905d37b699087912e96a2179
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BoostedTreesExampleDebugOutputs.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BoostedTreesExampleDebugOutputs"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BoostedTreesGetEnsembleStates.pbtxt b/tensorflow/core/api_def/java_api/api_def_BoostedTreesGetEnsembleStates.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a591013fb71b03a9c543443376833fd4ce1e278e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BoostedTreesGetEnsembleStates.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BoostedTreesGetEnsembleStates"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BoostedTreesMakeQuantileSummaries.pbtxt b/tensorflow/core/api_def/java_api/api_def_BoostedTreesMakeQuantileSummaries.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4e147765a7b84fd03819aa1d6623d0bbaf6c5bfc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BoostedTreesMakeQuantileSummaries.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BoostedTreesMakeQuantileSummaries"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BoostedTreesMakeStatsSummary.pbtxt b/tensorflow/core/api_def/java_api/api_def_BoostedTreesMakeStatsSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bbee8bb47ca54e1e4b4a11abfd061d5feb688533
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BoostedTreesMakeStatsSummary.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BoostedTreesMakeStatsSummary"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BoostedTreesPredict.pbtxt b/tensorflow/core/api_def/java_api/api_def_BoostedTreesPredict.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e25d43a18fb382d56a5485439bca40587e337bad
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BoostedTreesPredict.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BoostedTreesPredict"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BoostedTreesQuantileStreamResourceAddSummaries.pbtxt b/tensorflow/core/api_def/java_api/api_def_BoostedTreesQuantileStreamResourceAddSummaries.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d9fe96d3cd1b8dca091617fd9eb958ea9fcdfdab
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BoostedTreesQuantileStreamResourceAddSummaries.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BoostedTreesQuantileStreamResourceAddSummaries"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BoostedTreesQuantileStreamResourceDeserialize.pbtxt b/tensorflow/core/api_def/java_api/api_def_BoostedTreesQuantileStreamResourceDeserialize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..86e80902417f877ff8ad5622519f06a60a9ea820
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BoostedTreesQuantileStreamResourceDeserialize.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BoostedTreesQuantileStreamResourceDeserialize"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BoostedTreesQuantileStreamResourceFlush.pbtxt b/tensorflow/core/api_def/java_api/api_def_BoostedTreesQuantileStreamResourceFlush.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c98375bb24119c30d6a4c33e74d274c4a72e01ab
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BoostedTreesQuantileStreamResourceFlush.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BoostedTreesQuantileStreamResourceFlush"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BoostedTreesQuantileStreamResourceGetBucketBoundaries.pbtxt b/tensorflow/core/api_def/java_api/api_def_BoostedTreesQuantileStreamResourceGetBucketBoundaries.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e0421be40aabb4a17ec2df719a6917968c5dfd40
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BoostedTreesQuantileStreamResourceGetBucketBoundaries.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BoostedTreesQuantileStreamResourceGetBucketBoundaries"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BoostedTreesQuantileStreamResourceHandleOp.pbtxt b/tensorflow/core/api_def/java_api/api_def_BoostedTreesQuantileStreamResourceHandleOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b7f5e8aa65d8d913a3702d47948d25a33f29d5d7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BoostedTreesQuantileStreamResourceHandleOp.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BoostedTreesQuantileStreamResourceHandleOp"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BoostedTreesSerializeEnsemble.pbtxt b/tensorflow/core/api_def/java_api/api_def_BoostedTreesSerializeEnsemble.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..db5807344bd6ff0556a6d8a335cd432b223ef075
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BoostedTreesSerializeEnsemble.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BoostedTreesSerializeEnsemble"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BoostedTreesTrainingPredict.pbtxt b/tensorflow/core/api_def/java_api/api_def_BoostedTreesTrainingPredict.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b722233953b6e6b11daf38818bc44b030960b6da
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BoostedTreesTrainingPredict.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BoostedTreesTrainingPredict"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BoostedTreesUpdateEnsemble.pbtxt b/tensorflow/core/api_def/java_api/api_def_BoostedTreesUpdateEnsemble.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fb642dd430e3ffa97910a41335c459ea1378a441
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BoostedTreesUpdateEnsemble.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BoostedTreesUpdateEnsemble"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BroadcastArgs.pbtxt b/tensorflow/core/api_def/java_api/api_def_BroadcastArgs.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..484742a2d02739a4129961768fd7221d1976a05d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BroadcastArgs.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BroadcastArgs"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BroadcastGradientArgs.pbtxt b/tensorflow/core/api_def/java_api/api_def_BroadcastGradientArgs.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..50f631b2a694ee353551f1c345872da56c8d4ed3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BroadcastGradientArgs.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BroadcastGradientArgs"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BroadcastTo.pbtxt b/tensorflow/core/api_def/java_api/api_def_BroadcastTo.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..127458816ce278404877c255a581618c6e236fac
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BroadcastTo.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "BroadcastTo"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Bucketize.pbtxt b/tensorflow/core/api_def/java_api/api_def_Bucketize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5a99712fd6cebd5a4b3d53f65903524d01821aa0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Bucketize.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Bucketize"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_BytesProducedStatsDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_BytesProducedStatsDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cd7f24d961415c3329ba8f564edfcde49e02077d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_BytesProducedStatsDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "BytesProducedStatsDataset"
+  endpoint {
+    name: "data.BytesProducedStatsDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CTCBeamSearchDecoder.pbtxt b/tensorflow/core/api_def/java_api/api_def_CTCBeamSearchDecoder.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..39739f03a315996d455af77c9743e71c7707e48f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CTCBeamSearchDecoder.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CTCBeamSearchDecoder"
+  endpoint {
+    name: "nn.CtcBeamSearchDecoder"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CTCGreedyDecoder.pbtxt b/tensorflow/core/api_def/java_api/api_def_CTCGreedyDecoder.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..009742f097389146c8d9d432860bfcbbe5151a39
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CTCGreedyDecoder.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CTCGreedyDecoder"
+  endpoint {
+    name: "nn.CtcGreedyDecoder"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CTCLoss.pbtxt b/tensorflow/core/api_def/java_api/api_def_CTCLoss.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dbeefa4017181ed291b0de2777f7c8fcee3af1fe
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CTCLoss.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CTCLoss"
+  endpoint {
+    name: "nn.CtcLoss"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CacheDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_CacheDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..11c26c1dfc58eff917bfbc41c32a42c1ad39a9de
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CacheDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CacheDataset"
+  endpoint {
+    name: "data.CacheDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Cast.pbtxt b/tensorflow/core/api_def/java_api/api_def_Cast.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ea9f812e2a1b25c14022588dcf1dbeca0a05d5ee
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Cast.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Cast"
+  endpoint {
+    name: "dtypes.Cast"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Ceil.pbtxt b/tensorflow/core/api_def/java_api/api_def_Ceil.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d1a75f27d9ae6494d9fb38d7295d97a416b5a731
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Ceil.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Ceil"
+  endpoint {
+    name: "math.Ceil"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CheckNumerics.pbtxt b/tensorflow/core/api_def/java_api/api_def_CheckNumerics.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..731e9030a039aed7d4c899aca24ccec5635e0fcc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CheckNumerics.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CheckNumerics"
+  endpoint {
+    name: "math.CheckNumerics"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Cholesky.pbtxt b/tensorflow/core/api_def/java_api/api_def_Cholesky.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a60c4e3663e28128303435f845db9f319f1dd6b7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Cholesky.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Cholesky"
+  endpoint {
+    name: "linalg.Cholesky"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CholeskyGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_CholeskyGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2601d41554206fb268b00add8493d2184dee5ffa
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CholeskyGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CholeskyGrad"
+  endpoint {
+    name: "linalg.CholeskyGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ClipByValue.pbtxt b/tensorflow/core/api_def/java_api/api_def_ClipByValue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c4e055c117c140e9e027983917b31014a6892690
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ClipByValue.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ClipByValue"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CloseSummaryWriter.pbtxt b/tensorflow/core/api_def/java_api/api_def_CloseSummaryWriter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d5fbe557db0b3583db341692279ab262715900de
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CloseSummaryWriter.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CloseSummaryWriter"
+  endpoint {
+    name: "summary.CloseSummaryWriter"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CollectiveBcastRecv.pbtxt b/tensorflow/core/api_def/java_api/api_def_CollectiveBcastRecv.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8ada333e4466d26cb892e979e5b7eac141ac922f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CollectiveBcastRecv.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CollectiveBcastRecv"
+  endpoint {
+    name: "collective.BroadcastRecv"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CollectiveBcastSend.pbtxt b/tensorflow/core/api_def/java_api/api_def_CollectiveBcastSend.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..18b4bef345e4f8d1667860eae6b6612643076376
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CollectiveBcastSend.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CollectiveBcastSend"
+  endpoint {
+    name: "collective.BroadcastSend"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CollectiveReduce.pbtxt b/tensorflow/core/api_def/java_api/api_def_CollectiveReduce.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6226cc05ec3eef71864af69372273011d2d4c14c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CollectiveReduce.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CollectiveReduce"
+  endpoint {
+    name: "collective.AllReduce"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CompareAndBitpack.pbtxt b/tensorflow/core/api_def/java_api/api_def_CompareAndBitpack.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d744fbbc90fcc631ab626fd7ab9fedcb795cb88b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CompareAndBitpack.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CompareAndBitpack"
+  endpoint {
+    name: "math.CompareAndBitpack"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Complex.pbtxt b/tensorflow/core/api_def/java_api/api_def_Complex.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4889360a96af146a97ef22add49c1d8167e07697
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Complex.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Complex"
+  endpoint {
+    name: "dtypes.Complex"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ComplexAbs.pbtxt b/tensorflow/core/api_def/java_api/api_def_ComplexAbs.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..42a6a3c6a1c56f00f89d3bfdab13806f4acb5031
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ComplexAbs.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ComplexAbs"
+  endpoint {
+    name: "math.ComplexAbs"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ComputeAccidentalHits.pbtxt b/tensorflow/core/api_def/java_api/api_def_ComputeAccidentalHits.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ca9e590fbce09a0e7a64229077320e1507f8fa84
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ComputeAccidentalHits.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ComputeAccidentalHits"
+  endpoint {
+    name: "nn.ComputeAccidentalHits"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Concat.pbtxt b/tensorflow/core/api_def/java_api/api_def_Concat.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e2fc7eef887c053fa3e7c0a2a1d5065332022018
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Concat.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Concat"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ConcatOffset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ConcatOffset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e8e23cf5593b274732fd9461ceecdbdaaad8476f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ConcatOffset.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ConcatOffset"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ConcatV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_ConcatV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7035796981f4ce98c27488e3f5aef49dad4ed8cd
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ConcatV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ConcatV2"
+  endpoint {
+    name: "Concat"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ConcatenateDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ConcatenateDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ec8253e9b5bf1a69d1c9fbc15cc32a688b749ba6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ConcatenateDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ConcatenateDataset"
+  endpoint {
+    name: "data.ConcatenateDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ConditionalAccumulator.pbtxt b/tensorflow/core/api_def/java_api/api_def_ConditionalAccumulator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..08431982daa9e259676c26fcda8311912dfba423
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ConditionalAccumulator.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ConditionalAccumulator"
+  endpoint {
+    name: "train.ConditionalAccumulator"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Conj.pbtxt b/tensorflow/core/api_def/java_api/api_def_Conj.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7de199b55fa9dd93f1c7741a7c2fcba555b8b406
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Conj.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Conj"
+  endpoint {
+    name: "math.Conj"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ConjugateTranspose.pbtxt b/tensorflow/core/api_def/java_api/api_def_ConjugateTranspose.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..42173088ae0e45c959ffc2ae92f03dba1f1caae3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ConjugateTranspose.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ConjugateTranspose"
+  endpoint {
+    name: "linalg.ConjugateTranspose"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Const.pbtxt b/tensorflow/core/api_def/java_api/api_def_Const.pbtxt
index 2dbdca34e0072e4b92f9f9ae7f721c1485d75285..a73f1e6c3ad9193587bd3e48c536edd79dd9448b 100644
--- a/tensorflow/core/api_def/java_api/api_def_Const.pbtxt
+++ b/tensorflow/core/api_def/java_api/api_def_Const.pbtxt
@@ -1,4 +1,4 @@
 op {
-  graph_op_name: "Const" #TODO(karllessard) escape that reserved name
-  visibility: HIDDEN
+  graph_op_name: "Const"
+  visibility: SKIP
 }
diff --git a/tensorflow/core/api_def/java_api/api_def_ConsumeMutexLock.pbtxt b/tensorflow/core/api_def/java_api/api_def_ConsumeMutexLock.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1e0d136bc2f5b70fbf7557a8aa2bc37678e8240a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ConsumeMutexLock.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ConsumeMutexLock"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ControlTrigger.pbtxt b/tensorflow/core/api_def/java_api/api_def_ControlTrigger.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4517b4373f3f736eca06e3e1b6f015be141af29b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ControlTrigger.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ControlTrigger"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Conv2D.pbtxt b/tensorflow/core/api_def/java_api/api_def_Conv2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..21d1398e0980311593564c142ff094786f7a2b05
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Conv2D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Conv2D"
+  endpoint {
+    name: "nn.Conv2d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Conv2DBackpropFilter.pbtxt b/tensorflow/core/api_def/java_api/api_def_Conv2DBackpropFilter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..30eb55c6f2872a63963d202f8f7d13bbb892d7e4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Conv2DBackpropFilter.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Conv2DBackpropFilter"
+  endpoint {
+    name: "nn.Conv2dBackpropFilter"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Conv2DBackpropInput.pbtxt b/tensorflow/core/api_def/java_api/api_def_Conv2DBackpropInput.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7c98646c137bf97bafb0ff82c9416374effd2c21
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Conv2DBackpropInput.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Conv2DBackpropInput"
+  endpoint {
+    name: "nn.Conv2dBackpropInput"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Conv3D.pbtxt b/tensorflow/core/api_def/java_api/api_def_Conv3D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6ee1befcff19c373b34ce171db21fc8d60ae04dd
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Conv3D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Conv3D"
+  endpoint {
+    name: "nn.Conv3d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Conv3DBackpropFilter.pbtxt b/tensorflow/core/api_def/java_api/api_def_Conv3DBackpropFilter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2e5f6c99d50d275804eb4971c0fcc1b730afbf3a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Conv3DBackpropFilter.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Conv3DBackpropFilter"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Conv3DBackpropFilterV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_Conv3DBackpropFilterV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0643cc14a9a362472cdd3f634b0d5debef825e89
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Conv3DBackpropFilterV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Conv3DBackpropFilterV2"
+  endpoint {
+    name: "nn.Conv3dBackpropFilter"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Conv3DBackpropInput.pbtxt b/tensorflow/core/api_def/java_api/api_def_Conv3DBackpropInput.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cbb2c9f136b6577ad5f17773b81e0fb87b266bb3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Conv3DBackpropInput.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Conv3DBackpropInput"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Conv3DBackpropInputV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_Conv3DBackpropInputV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..33c8f5a3ce0577f7dd2f92188af1c38b1ac6e4c4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Conv3DBackpropInputV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Conv3DBackpropInputV2"
+  endpoint {
+    name: "nn.Conv3dBackpropInput"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Cos.pbtxt b/tensorflow/core/api_def/java_api/api_def_Cos.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..db1f62806e255fcb750eecd2a88844b2d530162c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Cos.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Cos"
+  endpoint {
+    name: "math.Cos"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Cosh.pbtxt b/tensorflow/core/api_def/java_api/api_def_Cosh.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a4b5e752bf5d5149d32b119ae7b5debbc805d162
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Cosh.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Cosh"
+  endpoint {
+    name: "math.Cosh"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CountUpTo.pbtxt b/tensorflow/core/api_def/java_api/api_def_CountUpTo.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..eb9f328ce0cf96eb582577b599d2b7197866c913
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CountUpTo.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "CountUpTo"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CreateSummaryDbWriter.pbtxt b/tensorflow/core/api_def/java_api/api_def_CreateSummaryDbWriter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..299f881dd44cd7fe92d3e24b99581e74d1001bbe
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CreateSummaryDbWriter.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CreateSummaryDbWriter"
+  endpoint {
+    name: "summary.CreateSummaryDbWriter"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CreateSummaryFileWriter.pbtxt b/tensorflow/core/api_def/java_api/api_def_CreateSummaryFileWriter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..26c7941ce57670ab0c6cc30ef2bc958edf95b391
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CreateSummaryFileWriter.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CreateSummaryFileWriter"
+  endpoint {
+    name: "summary.CreateSummaryFileWriter"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CropAndResize.pbtxt b/tensorflow/core/api_def/java_api/api_def_CropAndResize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cbf9aa8f99639083cecd895accd85ee90aa2297c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CropAndResize.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CropAndResize"
+  endpoint {
+    name: "image.CropAndResize"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CropAndResizeGradBoxes.pbtxt b/tensorflow/core/api_def/java_api/api_def_CropAndResizeGradBoxes.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..44354bdfa03fee68e594f2d1265a61c81c074510
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CropAndResizeGradBoxes.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CropAndResizeGradBoxes"
+  endpoint {
+    name: "image.CropAndResizeGradBoxes"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CropAndResizeGradImage.pbtxt b/tensorflow/core/api_def/java_api/api_def_CropAndResizeGradImage.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0618db9a8d715ddf854f5f6e13b11f2376a07bc1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CropAndResizeGradImage.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CropAndResizeGradImage"
+  endpoint {
+    name: "image.CropAndResizeGradImage"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Cross.pbtxt b/tensorflow/core/api_def/java_api/api_def_Cross.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c027884250e09948595d8bdef720f2534f91da54
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Cross.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Cross"
+  endpoint {
+    name: "linalg.Cross"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CudnnRNN.pbtxt b/tensorflow/core/api_def/java_api/api_def_CudnnRNN.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7e88d20713f0ae44678f5bddf6e05fefb8cda3f2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CudnnRNN.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "CudnnRNN"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CudnnRNNBackprop.pbtxt b/tensorflow/core/api_def/java_api/api_def_CudnnRNNBackprop.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9c9fc5f029a0e76f85ac57f8b143d2a2e9ddb731
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CudnnRNNBackprop.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "CudnnRNNBackprop"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CudnnRNNBackpropV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_CudnnRNNBackpropV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c2e7ebc27d69eb5ef5a9bf79a2730d242899f226
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CudnnRNNBackpropV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CudnnRNNBackpropV2"
+  endpoint {
+    name: "nn.CudnnRnnBackprop"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CudnnRNNCanonicalToParams.pbtxt b/tensorflow/core/api_def/java_api/api_def_CudnnRNNCanonicalToParams.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8d351fa860fc8c99099f241beb756ba4362d2124
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CudnnRNNCanonicalToParams.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CudnnRNNCanonicalToParams"
+  endpoint {
+    name: "nn.CudnnRnnCanonicalToParams"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CudnnRNNParamsSize.pbtxt b/tensorflow/core/api_def/java_api/api_def_CudnnRNNParamsSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3f1193fe6068e7443df5d88293dde0fdd6375ea6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CudnnRNNParamsSize.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CudnnRNNParamsSize"
+  endpoint {
+    name: "nn.CudnnRnnParamsSize"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CudnnRNNParamsToCanonical.pbtxt b/tensorflow/core/api_def/java_api/api_def_CudnnRNNParamsToCanonical.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d2e4c6201e1b511637d71a612ba5e807215b2321
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CudnnRNNParamsToCanonical.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CudnnRNNParamsToCanonical"
+  endpoint {
+    name: "nn.CudnnRnnParamsToCanonical"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_CudnnRNNV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_CudnnRNNV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e6dd5f42fc9a33e83b7746799f5944350e344653
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_CudnnRNNV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "CudnnRNNV2"
+  endpoint {
+    name: "nn.CudnnRnn"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Cumprod.pbtxt b/tensorflow/core/api_def/java_api/api_def_Cumprod.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0cb7862413daf26daac361d9ee6540f612bad19b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Cumprod.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Cumprod"
+  endpoint {
+    name: "math.Cumprod"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Cumsum.pbtxt b/tensorflow/core/api_def/java_api/api_def_Cumsum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e7d90765326c89a3661317056d06329fab35940d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Cumsum.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Cumsum"
+  endpoint {
+    name: "math.Cumsum"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DataFormatDimMap.pbtxt b/tensorflow/core/api_def/java_api/api_def_DataFormatDimMap.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..36ea17793fde8ab968cd871ff02c32b310f5f912
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DataFormatDimMap.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DataFormatDimMap"
+  endpoint {
+    name: "nn.DataFormatDimMap"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DataFormatVecPermute.pbtxt b/tensorflow/core/api_def/java_api/api_def_DataFormatVecPermute.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b6b7e2dc7672de636e61d8c2f5874be2337deba4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DataFormatVecPermute.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DataFormatVecPermute"
+  endpoint {
+    name: "nn.DataFormatVecPermute"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DatasetToGraph.pbtxt b/tensorflow/core/api_def/java_api/api_def_DatasetToGraph.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2e7d48961db295e6e2ef3d6ab403e61697e52ed8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DatasetToGraph.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DatasetToGraph"
+  endpoint {
+    name: "data.DatasetToGraph"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DatasetToSingleElement.pbtxt b/tensorflow/core/api_def/java_api/api_def_DatasetToSingleElement.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0ac42e0e9369b886b7889e08af861d4c6e967a43
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DatasetToSingleElement.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DatasetToSingleElement"
+  endpoint {
+    name: "data.DatasetToSingleElement"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DatasetToTFRecord.pbtxt b/tensorflow/core/api_def/java_api/api_def_DatasetToTFRecord.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3d388570630ae1f993df4577b263d8f16fcbc3f0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DatasetToTFRecord.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DatasetToTFRecord"
+  endpoint {
+    name: "data.DatasetToTfRecord"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DebugGradientIdentity.pbtxt b/tensorflow/core/api_def/java_api/api_def_DebugGradientIdentity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7d50c5c8687a2cb3f550a04654fc6f0d7ec86a89
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DebugGradientIdentity.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "DebugGradientIdentity"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DebugGradientRefIdentity.pbtxt b/tensorflow/core/api_def/java_api/api_def_DebugGradientRefIdentity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5e14e5fffd6e3683eec6eca65f587b5f0ab0016b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DebugGradientRefIdentity.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "DebugGradientRefIdentity"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DecodeAndCropJpeg.pbtxt b/tensorflow/core/api_def/java_api/api_def_DecodeAndCropJpeg.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c07bb7a1bdf4de0860b001ba246ec231fafb1edc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DecodeAndCropJpeg.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DecodeAndCropJpeg"
+  endpoint {
+    name: "image.DecodeAndCropJpeg"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DecodeBase64.pbtxt b/tensorflow/core/api_def/java_api/api_def_DecodeBase64.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..49c93453f7b9ea52e122ece339f2845e36570bb1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DecodeBase64.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DecodeBase64"
+  endpoint {
+    name: "io.DecodeBase64"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DecodeBmp.pbtxt b/tensorflow/core/api_def/java_api/api_def_DecodeBmp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..049cfa153d190f1c63e800f7da4f38a417f4bde8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DecodeBmp.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DecodeBmp"
+  endpoint {
+    name: "image.DecodeBmp"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DecodeCSV.pbtxt b/tensorflow/core/api_def/java_api/api_def_DecodeCSV.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1d60107adab5f3ef845556ccd752bc10dd8f48be
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DecodeCSV.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DecodeCSV"
+  endpoint {
+    name: "io.DecodeCsv"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DecodeCompressed.pbtxt b/tensorflow/core/api_def/java_api/api_def_DecodeCompressed.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..91327a92ecb0c8e69441344e2b19986441f4a29e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DecodeCompressed.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DecodeCompressed"
+  endpoint {
+    name: "io.DecodeCompressed"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DecodeGif.pbtxt b/tensorflow/core/api_def/java_api/api_def_DecodeGif.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..355643ff77cb4d9b75f6f17cd3ef13ab6ef45a66
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DecodeGif.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DecodeGif"
+  endpoint {
+    name: "image.DecodeGif"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DecodeJSONExample.pbtxt b/tensorflow/core/api_def/java_api/api_def_DecodeJSONExample.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6ecba5ab0534cc2e80fa51b4f9904b0df4ae0d7d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DecodeJSONExample.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DecodeJSONExample"
+  endpoint {
+    name: "io.DecodeJsonExample"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DecodeJpeg.pbtxt b/tensorflow/core/api_def/java_api/api_def_DecodeJpeg.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c0ebf2e315f160e10b5d66adac9ad472308040d5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DecodeJpeg.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DecodeJpeg"
+  endpoint {
+    name: "image.DecodeJpeg"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DecodePng.pbtxt b/tensorflow/core/api_def/java_api/api_def_DecodePng.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d94537dc92891066f56e8a2f50fd924f8d251927
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DecodePng.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DecodePng"
+  endpoint {
+    name: "image.DecodePng"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DecodeProtoV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_DecodeProtoV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4ba118cb0e67dd2ab8b763286110647b19d9ded8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DecodeProtoV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DecodeProtoV2"
+  endpoint {
+    name: "DecodeProto"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DecodeRaw.pbtxt b/tensorflow/core/api_def/java_api/api_def_DecodeRaw.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..73067173edd90183457312494f681883836a6d5a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DecodeRaw.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DecodeRaw"
+  endpoint {
+    name: "io.DecodeRaw"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DecodeWav.pbtxt b/tensorflow/core/api_def/java_api/api_def_DecodeWav.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9b249cc6e95b74ea835dceb8bd46910355fbee38
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DecodeWav.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DecodeWav"
+  endpoint {
+    name: "audio.DecodeWav"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DeepCopy.pbtxt b/tensorflow/core/api_def/java_api/api_def_DeepCopy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..88a87c9291887c5614f4f88cb941c253c9420689
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DeepCopy.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "DeepCopy"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DeleteSessionTensor.pbtxt b/tensorflow/core/api_def/java_api/api_def_DeleteSessionTensor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1865b461de785cef8d53d2fb143419c86bb3981a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DeleteSessionTensor.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "DeleteSessionTensor"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DenseToDenseSetOperation.pbtxt b/tensorflow/core/api_def/java_api/api_def_DenseToDenseSetOperation.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f85def92ee3b7d7bb2fa3fa1650e1ad7d4ed49e6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DenseToDenseSetOperation.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DenseToDenseSetOperation"
+  endpoint {
+    name: "sparse.DenseToDenseSetOperation"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DenseToSparseBatchDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_DenseToSparseBatchDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..76f6ba0b8ac2180d8d19c388df0a1969d8ec2168
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DenseToSparseBatchDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DenseToSparseBatchDataset"
+  endpoint {
+    name: "data.DenseToSparseBatchDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DenseToSparseSetOperation.pbtxt b/tensorflow/core/api_def/java_api/api_def_DenseToSparseSetOperation.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..11fbef8ff1fdcefa68b8cb9242efe8ec69507bed
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DenseToSparseSetOperation.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DenseToSparseSetOperation"
+  endpoint {
+    name: "sparse.DenseToSparseSetOperation"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DepthToSpace.pbtxt b/tensorflow/core/api_def/java_api/api_def_DepthToSpace.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0d2cbd2b904a98661ccd2b8c16f764f8107e822f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DepthToSpace.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DepthToSpace"
+  endpoint {
+    name: "nn.DepthToSpace"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DepthwiseConv2dNative.pbtxt b/tensorflow/core/api_def/java_api/api_def_DepthwiseConv2dNative.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1aaa480fefd8815630ba5707ddae43de72e7e776
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DepthwiseConv2dNative.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DepthwiseConv2dNative"
+  endpoint {
+    name: "nn.DepthwiseConv2dNative"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DepthwiseConv2dNativeBackpropFilter.pbtxt b/tensorflow/core/api_def/java_api/api_def_DepthwiseConv2dNativeBackpropFilter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1a62d8cf632d72b58277c15bb0e393a3901fbac4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DepthwiseConv2dNativeBackpropFilter.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DepthwiseConv2dNativeBackpropFilter"
+  endpoint {
+    name: "nn.DepthwiseConv2dNativeBackpropFilter"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DepthwiseConv2dNativeBackpropInput.pbtxt b/tensorflow/core/api_def/java_api/api_def_DepthwiseConv2dNativeBackpropInput.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9106dd2f8fe103043969947740b8539364032cba
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DepthwiseConv2dNativeBackpropInput.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DepthwiseConv2dNativeBackpropInput"
+  endpoint {
+    name: "nn.DepthwiseConv2dNativeBackpropInput"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Dequantize.pbtxt b/tensorflow/core/api_def/java_api/api_def_Dequantize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8ee4daa2f7e746e9a24f0d60208c33bf39b7073a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Dequantize.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Dequantize"
+  endpoint {
+    name: "quantization.Dequantize"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DeserializeIterator.pbtxt b/tensorflow/core/api_def/java_api/api_def_DeserializeIterator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bdd03f5dc672f40a238cf7dcc72840592f8838c8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DeserializeIterator.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DeserializeIterator"
+  endpoint {
+    name: "data.DeserializeIterator"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DeserializeManySparse.pbtxt b/tensorflow/core/api_def/java_api/api_def_DeserializeManySparse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..826d49f54655aa2472c7a34a6a40ae2ec54bd32e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DeserializeManySparse.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DeserializeManySparse"
+  endpoint {
+    name: "io.DeserializeManySparse"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DeserializeSparse.pbtxt b/tensorflow/core/api_def/java_api/api_def_DeserializeSparse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e6f24bb6257d6922398a325997e94143188443aa
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DeserializeSparse.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DeserializeSparse"
+  endpoint {
+    name: "sparse.DeserializeSparse"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DestroyResourceOp.pbtxt b/tensorflow/core/api_def/java_api/api_def_DestroyResourceOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..733e5e5029c85bcf8b6ed1f7b73849876f1c3db8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DestroyResourceOp.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "DestroyResourceOp"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DestroyTemporaryVariable.pbtxt b/tensorflow/core/api_def/java_api/api_def_DestroyTemporaryVariable.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bd416eb68fb46513aa79e32957c943b64a154924
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DestroyTemporaryVariable.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "DestroyTemporaryVariable"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Diag.pbtxt b/tensorflow/core/api_def/java_api/api_def_Diag.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..374b3c97e17002f6c77759c847be5a0cb3835ec8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Diag.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Diag"
+  endpoint {
+    name: "linalg.TensorDiag"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DiagPart.pbtxt b/tensorflow/core/api_def/java_api/api_def_DiagPart.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..70db2357d0612181119564e775f63ac03ce35df5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DiagPart.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DiagPart"
+  endpoint {
+    name: "linalg.TensorDiagPart"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Digamma.pbtxt b/tensorflow/core/api_def/java_api/api_def_Digamma.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..68dc74c64ead3b739ce19b5b5a6c9fbc7253c85c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Digamma.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Digamma"
+  endpoint {
+    name: "math.Digamma"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Dilation2D.pbtxt b/tensorflow/core/api_def/java_api/api_def_Dilation2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..914ea29812ce67c38d92a86d2d9f1ee8f6dc2255
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Dilation2D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Dilation2D"
+  endpoint {
+    name: "nn.Dilation2d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Dilation2DBackpropFilter.pbtxt b/tensorflow/core/api_def/java_api/api_def_Dilation2DBackpropFilter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..db3c68e088ef20312d3fc96b7cb3f064c343f1e5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Dilation2DBackpropFilter.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Dilation2DBackpropFilter"
+  endpoint {
+    name: "nn.Dilation2dBackpropFilter"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Dilation2DBackpropInput.pbtxt b/tensorflow/core/api_def/java_api/api_def_Dilation2DBackpropInput.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c935144f7af3e944608945faf085169492450f69
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Dilation2DBackpropInput.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Dilation2DBackpropInput"
+  endpoint {
+    name: "nn.Dilation2dBackpropInput"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Div.pbtxt b/tensorflow/core/api_def/java_api/api_def_Div.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2abba7f05f35f7ca834ed224df8f7462f7d62ca8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Div.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Div"
+  endpoint {
+    name: "math.Div"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DivNoNan.pbtxt b/tensorflow/core/api_def/java_api/api_def_DivNoNan.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c124044604b779de8bbab012c1953c8ff98edfad
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DivNoNan.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DivNoNan"
+  endpoint {
+    name: "math.DivNoNan"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DrawBoundingBoxes.pbtxt b/tensorflow/core/api_def/java_api/api_def_DrawBoundingBoxes.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2e7954e2b7ffe576e81e7a93aad7bb082d2a94fc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DrawBoundingBoxes.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DrawBoundingBoxes"
+  endpoint {
+    name: "image.DrawBoundingBoxes"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DynamicPartition.pbtxt b/tensorflow/core/api_def/java_api/api_def_DynamicPartition.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cc585676e486989591e774e0e8237cfc57166998
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DynamicPartition.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "DynamicPartition"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_DynamicStitch.pbtxt b/tensorflow/core/api_def/java_api/api_def_DynamicStitch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ac1fef4b6afd3905383f14e080e072f537eedd78
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_DynamicStitch.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "DynamicStitch"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_EagerPyFunc.pbtxt b/tensorflow/core/api_def/java_api/api_def_EagerPyFunc.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e097041d731610447e2f67115373d004bb982f0e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_EagerPyFunc.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "EagerPyFunc"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_EditDistance.pbtxt b/tensorflow/core/api_def/java_api/api_def_EditDistance.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ca65c2c6e5821d79e60b3b6c6305de6b5c3ff4bb
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_EditDistance.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "EditDistance"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Elu.pbtxt b/tensorflow/core/api_def/java_api/api_def_Elu.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bfe8d972cf69cab5d3ce847f9507c0ee9c8b5072
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Elu.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Elu"
+  endpoint {
+    name: "nn.Elu"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_EluGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_EluGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3757357c000b902ee793c2da072fbac8e4c28c4c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_EluGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "EluGrad"
+  endpoint {
+    name: "nn.EluGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Empty.pbtxt b/tensorflow/core/api_def/java_api/api_def_Empty.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6522f51d9dcc34a529f70efbae3da15df1132c96
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Empty.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Empty"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_EmptyTensorList.pbtxt b/tensorflow/core/api_def/java_api/api_def_EmptyTensorList.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ef3f533964cd10318ec8ff2e97c2e64a6aa146b5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_EmptyTensorList.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "EmptyTensorList"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_EncodeBase64.pbtxt b/tensorflow/core/api_def/java_api/api_def_EncodeBase64.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..66f19def9aec58a9ce6221564da6c209eb118ea2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_EncodeBase64.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "EncodeBase64"
+  endpoint {
+    name: "io.EncodeBase64"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_EncodeJpeg.pbtxt b/tensorflow/core/api_def/java_api/api_def_EncodeJpeg.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1e151665f87203f6d56cc2c03225827ed128fdc1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_EncodeJpeg.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "EncodeJpeg"
+  endpoint {
+    name: "image.EncodeJpeg"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_EncodePng.pbtxt b/tensorflow/core/api_def/java_api/api_def_EncodePng.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7a8d713c865b825e9c896e56964e300ce82deda6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_EncodePng.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "EncodePng"
+  endpoint {
+    name: "image.EncodePng"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_EncodeProto.pbtxt b/tensorflow/core/api_def/java_api/api_def_EncodeProto.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ac6a04b4bc2958c9bb7628949928b258d1e23059
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_EncodeProto.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "EncodeProto"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_EncodeWav.pbtxt b/tensorflow/core/api_def/java_api/api_def_EncodeWav.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f3b22fde666b83bbde15d5f54c131660c171a61d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_EncodeWav.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "EncodeWav"
+  endpoint {
+    name: "audio.EncodeWav"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_EnqueueInQueueDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_EnqueueInQueueDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..26051ab446f9a5f8405de5fae67992ee1c993167
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_EnqueueInQueueDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "EnqueueInQueueDataset"
+  endpoint {
+    name: "data.EnqueueInQueueDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_EnsureShape.pbtxt b/tensorflow/core/api_def/java_api/api_def_EnsureShape.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6238947598f2640e0f1b6a1a88d7700fd62b9cbe
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_EnsureShape.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "EnsureShape"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Enter.pbtxt b/tensorflow/core/api_def/java_api/api_def_Enter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ffc10c91beb9e9181c7543f94266dd15b9ee14cf
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Enter.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Enter"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Equal.pbtxt b/tensorflow/core/api_def/java_api/api_def_Equal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c2256c24337b6bc7d4e50ba1368a484fa87b4776
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Equal.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Equal"
+  endpoint {
+    name: "math.Equal"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Erf.pbtxt b/tensorflow/core/api_def/java_api/api_def_Erf.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9efcc3983c411a43910807d059582bb35e9f16e3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Erf.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Erf"
+  endpoint {
+    name: "math.Erf"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Erfc.pbtxt b/tensorflow/core/api_def/java_api/api_def_Erfc.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c0f4db61ff44ba4d88717d8daa7e1c4665323943
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Erfc.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Erfc"
+  endpoint {
+    name: "math.Erfc"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Exit.pbtxt b/tensorflow/core/api_def/java_api/api_def_Exit.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6215cd22299cc41a0bb6f9c1bb0e4239e9f67efe
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Exit.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Exit"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Exp.pbtxt b/tensorflow/core/api_def/java_api/api_def_Exp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b2790c8306f0277bb7613528557c8598afc5dbf6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Exp.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Exp"
+  endpoint {
+    name: "math.Exp"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExpandDims.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExpandDims.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..66902ccb5b0e152b2504469d94b305fb0dd8a64f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExpandDims.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ExpandDims"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExperimentalAssertNextDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExperimentalAssertNextDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cec4c229e4a2a17aecf54717e5541edc7edf3b91
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExperimentalAssertNextDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExperimentalAssertNextDataset"
+  endpoint {
+    name: "data.ExperimentalAssertNextDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExperimentalCSVDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExperimentalCSVDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..51fdd9f0b045360ed717b602361670ea9c908f5e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExperimentalCSVDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExperimentalCSVDataset"
+  endpoint {
+    name:  "data.ExperimentalCsvDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExperimentalDirectedInterleaveDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExperimentalDirectedInterleaveDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..77fe42fd94db16cc2d0fb414543a2872c0527aa3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExperimentalDirectedInterleaveDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExperimentalDirectedInterleaveDataset"
+  endpoint {
+    name: "data.ExperimentalDirectedInterleaveDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExperimentalFunctionBufferingResource.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExperimentalFunctionBufferingResource.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..320f4affb54d00d4994726e9f538e5eed919b632
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExperimentalFunctionBufferingResource.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExperimentalFunctionBufferingResource"
+  endpoint {
+    name: "data.ExperimentalFunctionBufferingResource"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExperimentalFunctionBufferingResourceGetNext.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExperimentalFunctionBufferingResourceGetNext.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d3dbe7600a5bf3322ba5895b8d4f94ee63d4b27a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExperimentalFunctionBufferingResourceGetNext.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExperimentalFunctionBufferingResourceGetNext"
+  endpoint {
+    name: "data.ExperimentalFunctionBufferingResourceGetNext"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExperimentalFunctionBufferingResourceReset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExperimentalFunctionBufferingResourceReset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6909e8678a3bb4d592a15dc022868963548e0c46
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExperimentalFunctionBufferingResourceReset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExperimentalFunctionBufferingResourceReset"
+  endpoint {
+    name: "data.ExperimentalFunctionBufferingResourceReset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExperimentalIdentityIndexedDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExperimentalIdentityIndexedDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c49c6de2177a96ef0e366cf788e9b10506dedb36
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExperimentalIdentityIndexedDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExperimentalIdentityIndexedDataset"
+  endpoint {
+    name: "data.ExperimentalIdentityIndexedDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExperimentalIgnoreErrorsDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExperimentalIgnoreErrorsDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7750a43de2806f86fd44bff9b2a4c43a373e0b3e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExperimentalIgnoreErrorsDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExperimentalIgnoreErrorsDataset"
+  endpoint {
+    name: "data.ExperimentalIgnoreErrorsDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExperimentalIndexedDatasetGet.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExperimentalIndexedDatasetGet.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..96a3befe8bb49bcef4d90cabf6402185d059b5f0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExperimentalIndexedDatasetGet.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExperimentalIndexedDatasetGet"
+  endpoint {
+    name: "data.ExperimentalIndexedDatasetGet"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExperimentalIndexedDatasetMaterialize.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExperimentalIndexedDatasetMaterialize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..731309d3291fb3c5107a0bb603bd01a108d333b5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExperimentalIndexedDatasetMaterialize.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExperimentalIndexedDatasetMaterialize"
+  endpoint {
+    name: "data.ExperimentalIndexedDatasetMaterialize"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExperimentalIteratorGetDevice.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExperimentalIteratorGetDevice.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..838d579ef742d4da801a1adb8509a33091820ad5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExperimentalIteratorGetDevice.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExperimentalIteratorGetDevice"
+  endpoint {
+    name: "data.ExperimentalIteratorGetDevice"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExperimentalLMDBDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExperimentalLMDBDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a092b1cf396772dcbd309a1365fbfe08ae8dfb1b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExperimentalLMDBDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExperimentalLMDBDataset"
+  endpoint {
+    name: "data.ExperimentalLmdbDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExperimentalMapDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExperimentalMapDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bea6dffd9c16305796d09602b1b7ca12f5374969
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExperimentalMapDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExperimentalMapDataset"
+  endpoint {
+    name: "data.ExperimentalMapDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExperimentalMaterializedIndexDatasetHandle.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExperimentalMaterializedIndexDatasetHandle.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..06632e9041d8827e95063f025b83fa47252534cd
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExperimentalMaterializedIndexDatasetHandle.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExperimentalMaterializedIndexDatasetHandle"
+  endpoint {
+    name: "data.ExperimentalMaterializedIndexDatasetHandle"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExperimentalNonSerializableDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExperimentalNonSerializableDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7b26cf129cf0ca37d69f27e4b7f51c3b76f254cf
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExperimentalNonSerializableDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExperimentalNonSerializableDataset"
+  endpoint {
+    name: "data.ExperimentalNonSerializableDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExperimentalNumaMapAndBatchDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExperimentalNumaMapAndBatchDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a2c0d92f896a8191e428f76ee88d49b0e5ef2a86
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExperimentalNumaMapAndBatchDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExperimentalNumaMapAndBatchDataset"
+  endpoint {
+    name: "data.ExperimentalNumaMapAndBatchDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExperimentalSleepDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExperimentalSleepDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9e95c55ac2628f0eaa858b81844e57f76226d9f4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExperimentalSleepDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExperimentalSleepDataset"
+  endpoint {
+    name: "data.ExperimentalSleepDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExperimentalThreadPoolDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExperimentalThreadPoolDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3e4aef1c68324b4c12c2c76c5a8947c567bff134
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExperimentalThreadPoolDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExperimentalThreadPoolDataset"
+  endpoint {
+    name: "data.ExperimentalThreadPoolDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExperimentalThreadPoolHandle.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExperimentalThreadPoolHandle.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..73cf6767b3691ccfdc5fdb1c95d3d3edfe82fb14
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExperimentalThreadPoolHandle.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExperimentalThreadPoolHandle"
+  endpoint {
+    name: "data.ExperimentalThreadPoolHandle"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExperimentalUniqueDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExperimentalUniqueDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0d834193ddbdc90b3a695489c1e5df06f1c3fc99
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExperimentalUniqueDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExperimentalUniqueDataset"
+  endpoint {
+    name: "data.ExperimentalUniqueDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Expm1.pbtxt b/tensorflow/core/api_def/java_api/api_def_Expm1.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..71a8fcf02250b4886d5f37b88eeb969ae8b96cf1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Expm1.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Expm1"
+  endpoint {
+    name: "math.Expm1"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExtractGlimpse.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExtractGlimpse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3591f93f71f27e465d65c8bb8d521ed350781786
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExtractGlimpse.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExtractGlimpse"
+  endpoint {
+    name: "image.ExtractGlimpse"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExtractImagePatches.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExtractImagePatches.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7cdcfd752855f13714a4ebb9b80eed9bec65165a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExtractImagePatches.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExtractImagePatches"
+  endpoint {
+    name: "image.ExtractImagePatches"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExtractJpegShape.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExtractJpegShape.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c95fcc9cef4f657a89fd8c531d970e4587cc6205
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExtractJpegShape.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ExtractJpegShape"
+  endpoint {
+    name: "image.ExtractJpegShape"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ExtractVolumePatches.pbtxt b/tensorflow/core/api_def/java_api/api_def_ExtractVolumePatches.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6f61c8321097957f62f8872dfd84880de3da4019
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ExtractVolumePatches.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ExtractVolumePatches"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FFT.pbtxt b/tensorflow/core/api_def/java_api/api_def_FFT.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d9655903086119a4cea7adb97cea89793b34109f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FFT.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FFT"
+  endpoint {
+    name: "signal.Fft"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FFT2D.pbtxt b/tensorflow/core/api_def/java_api/api_def_FFT2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..474103076b96682fba824bc633d77ec4588c0ea9
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FFT2D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FFT2D"
+  endpoint {
+    name: "signal.Fft2d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FFT3D.pbtxt b/tensorflow/core/api_def/java_api/api_def_FFT3D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8e1606b8f9df51cdd04707483fd2ec59fd049855
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FFT3D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FFT3D"
+  endpoint {
+    name: "signal.Fft3d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FIFOQueue.pbtxt b/tensorflow/core/api_def/java_api/api_def_FIFOQueue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e5b2f73c55d5a3ac3ec7193ba7dd1da147ffc96d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FIFOQueue.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "FIFOQueue"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FIFOQueueV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_FIFOQueueV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c0861a6e8dda0abe83925f8163babab778d71e28
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FIFOQueueV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FIFOQueueV2"
+  endpoint {
+    name: "io.FifoQueue"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Fact.pbtxt b/tensorflow/core/api_def/java_api/api_def_Fact.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..436664e554b2829dbe257b819842f9dc70d1eb0d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Fact.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Fact"
+  endpoint {
+    name: "math.Fact"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FakeParam.pbtxt b/tensorflow/core/api_def/java_api/api_def_FakeParam.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ac8f751442c2f5864b51812688c514cd36509368
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FakeParam.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "FakeParam"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FakeQuantWithMinMaxArgs.pbtxt b/tensorflow/core/api_def/java_api/api_def_FakeQuantWithMinMaxArgs.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..809d231a55ca4be0a563fed29ab0493608f271b9
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FakeQuantWithMinMaxArgs.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FakeQuantWithMinMaxArgs"
+  endpoint {
+    name: "quantization.FakeQuantWithMinMaxArgs"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FakeQuantWithMinMaxArgsGradient.pbtxt b/tensorflow/core/api_def/java_api/api_def_FakeQuantWithMinMaxArgsGradient.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..50d0f51a1409436c9f4ca7c7519c8df16b482792
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FakeQuantWithMinMaxArgsGradient.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FakeQuantWithMinMaxArgsGradient"
+  endpoint {
+    name: "quantization.FakeQuantWithMinMaxArgsGradient"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FakeQuantWithMinMaxVars.pbtxt b/tensorflow/core/api_def/java_api/api_def_FakeQuantWithMinMaxVars.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b86258aab2bfb40c4dbc8e1bb3d5960773a767f2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FakeQuantWithMinMaxVars.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FakeQuantWithMinMaxVars"
+  endpoint {
+    name: "quantization.FakeQuantWithMinMaxVars"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FakeQuantWithMinMaxVarsGradient.pbtxt b/tensorflow/core/api_def/java_api/api_def_FakeQuantWithMinMaxVarsGradient.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3c1343423c18fe3eebf2eafbfaea73217b262f66
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FakeQuantWithMinMaxVarsGradient.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FakeQuantWithMinMaxVarsGradient"
+  endpoint {
+    name: "quantization.FakeQuantWithMinMaxVarsGradient"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FakeQuantWithMinMaxVarsPerChannel.pbtxt b/tensorflow/core/api_def/java_api/api_def_FakeQuantWithMinMaxVarsPerChannel.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..afe45a290d30f204ff132d165aa46fca3f55e747
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FakeQuantWithMinMaxVarsPerChannel.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FakeQuantWithMinMaxVarsPerChannel"
+  endpoint {
+    name: "quantization.FakeQuantWithMinMaxVarsPerChannel"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FakeQuantWithMinMaxVarsPerChannelGradient.pbtxt b/tensorflow/core/api_def/java_api/api_def_FakeQuantWithMinMaxVarsPerChannelGradient.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9dd62fdffd0fbcb2fdfc6fc7348bb206cdcaef33
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FakeQuantWithMinMaxVarsPerChannelGradient.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FakeQuantWithMinMaxVarsPerChannelGradient"
+  endpoint {
+    name: "quantization.FakeQuantWithMinMaxVarsPerChannelGradient"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FakeQueue.pbtxt b/tensorflow/core/api_def/java_api/api_def_FakeQueue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8960966f0842cbc586abdf37975a162fd9a47915
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FakeQueue.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FakeQueue"
+  endpoint {
+    name: "io.FakeQueue"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Fill.pbtxt b/tensorflow/core/api_def/java_api/api_def_Fill.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3997328ed3100f5ffe6c22b9f481fb5421304353
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Fill.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Fill"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FilterByLastComponentDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_FilterByLastComponentDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b7111f48fa921a7ec0f91f668f1ba607d4666ff6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FilterByLastComponentDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FilterByLastComponentDataset"
+  endpoint {
+    name: "data.FilterByLastComponentDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FilterDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_FilterDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..930fff419151a687fc5520435da4502c98ef272a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FilterDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FilterDataset"
+  endpoint {
+    name: "data.FilterDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FixedLengthRecordDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_FixedLengthRecordDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d4f23d94c03c5f9d5c6578c1a7fd8b32cd9434e8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FixedLengthRecordDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "FixedLengthRecordDataset"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FixedLengthRecordDatasetV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_FixedLengthRecordDatasetV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b8012bbe1684a9f48b9c2829c080ad16b7697848
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FixedLengthRecordDatasetV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FixedLengthRecordDatasetV2"
+  endpoint {
+    name: "data.FixedLengthRecordDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FixedLengthRecordReader.pbtxt b/tensorflow/core/api_def/java_api/api_def_FixedLengthRecordReader.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f76cd494561027929a7011dffc2552bf3c53047f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FixedLengthRecordReader.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "FixedLengthRecordReader"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FixedLengthRecordReaderV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_FixedLengthRecordReaderV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f897c21365b024fc7e698691627dc8bb2968674e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FixedLengthRecordReaderV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FixedLengthRecordReaderV2"
+  endpoint {
+    name: "io.FixedLengthRecordReader"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FixedUnigramCandidateSampler.pbtxt b/tensorflow/core/api_def/java_api/api_def_FixedUnigramCandidateSampler.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..eb9c68d4dbaac22e1ac55d495712a854fae40db5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FixedUnigramCandidateSampler.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FixedUnigramCandidateSampler"
+  endpoint {
+    name: "nn.FixedUnigramCandidateSampler"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FlatMapDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_FlatMapDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d6e96cb4e00e534cea88ca52379d4ba361e84dc1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FlatMapDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FlatMapDataset"
+  endpoint {
+    name: "data.FlatMapDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Floor.pbtxt b/tensorflow/core/api_def/java_api/api_def_Floor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a2b80f97e0598345138dafa9f8fd7e6986c0a6d6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Floor.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Floor"
+  endpoint {
+    name: "math.Floor"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FloorDiv.pbtxt b/tensorflow/core/api_def/java_api/api_def_FloorDiv.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..054d85f55c9b4c5dc13bf63ce1e5f5efec82bd5f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FloorDiv.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FloorDiv"
+  endpoint {
+    name: "math.FloorDiv"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FloorMod.pbtxt b/tensorflow/core/api_def/java_api/api_def_FloorMod.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ff2216a9357fe72429fa95046cdf81e147229a62
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FloorMod.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FloorMod"
+  endpoint {
+    name: "math.FloorMod"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FlushSummaryWriter.pbtxt b/tensorflow/core/api_def/java_api/api_def_FlushSummaryWriter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..feaa3a6dc22dc4486189c5d030d81cdeb76d30a9
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FlushSummaryWriter.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FlushSummaryWriter"
+  endpoint {
+    name: "summary.FlushSummaryWriter"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_For.pbtxt b/tensorflow/core/api_def/java_api/api_def_For.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..30363d1e9637d4c15146cf91b190e95f34aa773f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_For.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "For"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FractionalAvgPool.pbtxt b/tensorflow/core/api_def/java_api/api_def_FractionalAvgPool.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fc2e6ca54b802a5ddec908853ccec47d6725b52b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FractionalAvgPool.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FractionalAvgPool"
+  endpoint {
+    name: "nn.FractionalAvgPool"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FractionalAvgPoolGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_FractionalAvgPoolGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4e11d5e3950feaaa58f54f626334a8a9cee98e19
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FractionalAvgPoolGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FractionalAvgPoolGrad"
+  endpoint {
+    name: "nn.FractionalAvgPoolGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FractionalMaxPool.pbtxt b/tensorflow/core/api_def/java_api/api_def_FractionalMaxPool.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..061b358ec27ab86c844e1669e73a935fe1d7170e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FractionalMaxPool.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FractionalMaxPool"
+  endpoint {
+    name: "nn.FractionalMaxPool"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FractionalMaxPoolGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_FractionalMaxPoolGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c70e6d721e5bed4a62ed170d9ced09a061210f5c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FractionalMaxPoolGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FractionalMaxPoolGrad"
+  endpoint {
+    name: "nn.FractionalMaxPoolGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FusedBatchNorm.pbtxt b/tensorflow/core/api_def/java_api/api_def_FusedBatchNorm.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9d6166fe816f02456a31a45bf1e24dd1dc120cbc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FusedBatchNorm.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "FusedBatchNorm"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FusedBatchNormGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_FusedBatchNormGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5e1d066d8dc70891256d25bc36a32aa18a2fd958
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FusedBatchNormGrad.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "FusedBatchNormGrad"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FusedBatchNormGradV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_FusedBatchNormGradV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8f333c91f4ffbd25d7928a842d14333ef10c35bd
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FusedBatchNormGradV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FusedBatchNormGradV2"
+  endpoint {
+    name: "nn.FusedBatchNormGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FusedBatchNormV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_FusedBatchNormV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8a4e76c94989ad22bd571a3b82b21bb97be49c8f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FusedBatchNormV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FusedBatchNormV2"
+  endpoint {
+    name: "nn.FusedBatchNorm"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FusedPadConv2D.pbtxt b/tensorflow/core/api_def/java_api/api_def_FusedPadConv2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e7ee10e0c58f7e454a17a2f0f047e6be0f49327e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FusedPadConv2D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FusedPadConv2D"
+  endpoint {
+    name: "nn.FusedPadConv2d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_FusedResizeAndPadConv2D.pbtxt b/tensorflow/core/api_def/java_api/api_def_FusedResizeAndPadConv2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6948fc1b87d2d6b250520e67053a329407268e09
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_FusedResizeAndPadConv2D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "FusedResizeAndPadConv2D"
+  endpoint {
+    name: "nn.FusedResizeAndPadConv2d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Gather.pbtxt b/tensorflow/core/api_def/java_api/api_def_Gather.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5c4ccda48bf15552b05c8d6895576d3cf74dfc6d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Gather.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Gather"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_GatherNd.pbtxt b/tensorflow/core/api_def/java_api/api_def_GatherNd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..257c0316ea0ae7e9ae007684e2074a33605f60e3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_GatherNd.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "GatherNd"
+  endpoint {
+    name: "GatherNd"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_GatherV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_GatherV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0927e77a9688e6ae338a6643bff19e20333ab13c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_GatherV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "GatherV2"
+  endpoint {
+    name: "Gather"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_GcsConfigureBlockCache.pbtxt b/tensorflow/core/api_def/java_api/api_def_GcsConfigureBlockCache.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1ba3044d4c349e5397d96033f37f817395b6d553
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_GcsConfigureBlockCache.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "GcsConfigureBlockCache"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_GcsConfigureCredentials.pbtxt b/tensorflow/core/api_def/java_api/api_def_GcsConfigureCredentials.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..98bd555fb87506a57afca2dc86e6157adb534683
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_GcsConfigureCredentials.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "GcsConfigureCredentials"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_GenerateBigQueryReaderPartitions.pbtxt b/tensorflow/core/api_def/java_api/api_def_GenerateBigQueryReaderPartitions.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..956f40762d7499f4dbfbb083ae1f28b3190ff968
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_GenerateBigQueryReaderPartitions.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "GenerateBigQueryReaderPartitions"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_GenerateVocabRemapping.pbtxt b/tensorflow/core/api_def/java_api/api_def_GenerateVocabRemapping.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9aac3b17f3cf91ac26b19ccd82147a0dd11e9141
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_GenerateVocabRemapping.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "GenerateVocabRemapping"
+  endpoint {
+    name: "train.GenerateVocabRemapping"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_GeneratorDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_GeneratorDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b1719005e99077e857295eb72e681875eeb50dd3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_GeneratorDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "GeneratorDataset"
+  endpoint {
+    name: "data.GeneratorDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_GetSessionHandle.pbtxt b/tensorflow/core/api_def/java_api/api_def_GetSessionHandle.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0ee6fe18a2a768bfb451d16630ce613c0cd31fbf
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_GetSessionHandle.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "GetSessionHandle"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_GetSessionHandleV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_GetSessionHandleV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ba89942d77f11166cab0406a8a309feb9a43e881
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_GetSessionHandleV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "GetSessionHandleV2"
+  endpoint {
+    name: "GetSessionHandle"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_GetSessionTensor.pbtxt b/tensorflow/core/api_def/java_api/api_def_GetSessionTensor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..34b6e627cdc9ffbc72d2ef390c6a3c7d61d45d9b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_GetSessionTensor.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "GetSessionTensor"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Greater.pbtxt b/tensorflow/core/api_def/java_api/api_def_Greater.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..594f9276be1292f2499f2338213ed8a222af486d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Greater.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Greater"
+  endpoint {
+    name: "math.Greater"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_GreaterEqual.pbtxt b/tensorflow/core/api_def/java_api/api_def_GreaterEqual.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..17ea8696b0dc9e84f1cef1ac9555385e7e2848dd
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_GreaterEqual.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "GreaterEqual"
+  endpoint {
+    name: "math.GreaterEqual"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_GroupByReducerDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_GroupByReducerDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1bd2c8f531b705524bf227d6ed141f03adf66423
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_GroupByReducerDataset.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "GroupByReducerDataset"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_GroupByWindowDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_GroupByWindowDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9e4c4cd4ff25cc7eaee31b017c4c95b725fee489
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_GroupByWindowDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "GroupByWindowDataset"
+  endpoint {
+    name: "data.GroupByWindowDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_GuaranteeConst.pbtxt b/tensorflow/core/api_def/java_api/api_def_GuaranteeConst.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8cac25787dad3e7be49496e74e0f6361523525d2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_GuaranteeConst.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "GuaranteeConst"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_HSVToRGB.pbtxt b/tensorflow/core/api_def/java_api/api_def_HSVToRGB.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..95b042d5d688e62d6a4fcb9f8250adb2b68d35ac
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_HSVToRGB.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "HSVToRGB"
+  endpoint {
+    name: "image.HsvToRgb"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_HashTable.pbtxt b/tensorflow/core/api_def/java_api/api_def_HashTable.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d48c2224f63207b9ab392659b0392ee2e850ab39
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_HashTable.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "HashTable"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_HashTableV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_HashTableV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..38cc5818d3bad14272a532db3568439667472286
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_HashTableV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "HashTableV2"
+  endpoint {
+    name: "HashTable"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_HistogramFixedWidth.pbtxt b/tensorflow/core/api_def/java_api/api_def_HistogramFixedWidth.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f64d9ae1d2387db0e88b0d28d762c294dd00d7a9
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_HistogramFixedWidth.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "HistogramFixedWidth"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_HistogramSummary.pbtxt b/tensorflow/core/api_def/java_api/api_def_HistogramSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..97f28335bb96b3c98fd3f556cfbc8b77ef6763cc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_HistogramSummary.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "HistogramSummary"
+  endpoint {
+    name: "summary.HistogramSummary"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_HostConst.pbtxt b/tensorflow/core/api_def/java_api/api_def_HostConst.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ba589e73e718d784d98afe1f04d0eacd15cf5fdc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_HostConst.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "HostConst"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IFFT.pbtxt b/tensorflow/core/api_def/java_api/api_def_IFFT.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4a15ebec7f67c4aea28391a2f3af25c7f26352cc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IFFT.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "IFFT"
+  endpoint {
+    name: "signal.Ifft"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IFFT2D.pbtxt b/tensorflow/core/api_def/java_api/api_def_IFFT2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..35d696ee739951927961aa903fd92c5af4306bff
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IFFT2D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "IFFT2D"
+  endpoint {
+    name: "signal.Ifft2d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IFFT3D.pbtxt b/tensorflow/core/api_def/java_api/api_def_IFFT3D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..76a3164e6aaff6e9d7a18792debd32e3eba0a223
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IFFT3D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "IFFT3D"
+  endpoint {
+    name: "signal.Ifft3d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IRFFT.pbtxt b/tensorflow/core/api_def/java_api/api_def_IRFFT.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7a68b01524ecd5fbc1a439c559edc67b5c843e96
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IRFFT.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "IRFFT"
+  endpoint {
+    name: "signal.Irfft"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IRFFT2D.pbtxt b/tensorflow/core/api_def/java_api/api_def_IRFFT2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..239ec445d020736ed0fa642b646331d5493a0a87
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IRFFT2D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "IRFFT2D"
+  endpoint {
+    name: "signal.Irfft2d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IRFFT3D.pbtxt b/tensorflow/core/api_def/java_api/api_def_IRFFT3D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..87969436b7f2b5d6eb156781006121f0b7653ada
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IRFFT3D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "IRFFT3D"
+  endpoint {
+    name: "signal.Irfft3d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Identity.pbtxt b/tensorflow/core/api_def/java_api/api_def_Identity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b6df3c6cfec45d6fffa9484722099e582529ba8c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Identity.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Identity"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IdentityN.pbtxt b/tensorflow/core/api_def/java_api/api_def_IdentityN.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..827df10c65a190ab37d8445d11d5cfc8b7873593
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IdentityN.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "IdentityN"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IdentityReader.pbtxt b/tensorflow/core/api_def/java_api/api_def_IdentityReader.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..42fe85a5675796eceb9213bc986676f659d36bc0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IdentityReader.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "IdentityReader"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IdentityReaderV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_IdentityReaderV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8081ac26b3d4bebf525c9afd5a734763c3007720
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IdentityReaderV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "IdentityReaderV2"
+  endpoint {
+    name: "io.IdentityReader"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_If.pbtxt b/tensorflow/core/api_def/java_api/api_def_If.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a3bc33ac2cee060877f5a10d97537d77ca60949e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_If.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "If"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Igamma.pbtxt b/tensorflow/core/api_def/java_api/api_def_Igamma.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cbdd8b984c46fa7df6f41c3e7e98a8382c194cb3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Igamma.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Igamma"
+  endpoint {
+    name: "math.Igamma"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IgammaGradA.pbtxt b/tensorflow/core/api_def/java_api/api_def_IgammaGradA.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0659c80c39fb085a0ca1629c958a7d66f19acd59
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IgammaGradA.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "IgammaGradA"
+  endpoint {
+    name: "math.IgammaGradA"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Igammac.pbtxt b/tensorflow/core/api_def/java_api/api_def_Igammac.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..94f6085e1a97c3ec7b4e4a17c06ebb1c3b1c4fd2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Igammac.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Igammac"
+  endpoint {
+    name: "math.Igammac"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Imag.pbtxt b/tensorflow/core/api_def/java_api/api_def_Imag.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4227c7078fcc3fac47998d51917bb09f45a6eb3c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Imag.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Imag"
+  endpoint {
+    name: "math.Imag"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ImageSummary.pbtxt b/tensorflow/core/api_def/java_api/api_def_ImageSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1871e6b65503461e123153977f66c1b9f574e125
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ImageSummary.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ImageSummary"
+  endpoint {
+    name: "summary.ImageSummary"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ImmutableConst.pbtxt b/tensorflow/core/api_def/java_api/api_def_ImmutableConst.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fd0384dc45203e458ca1179615b5f3c3e1ee5a86
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ImmutableConst.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ImmutableConst"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ImportEvent.pbtxt b/tensorflow/core/api_def/java_api/api_def_ImportEvent.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c2d8d2eba35a341a2ab12d4cc7af9dd3b4d956a2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ImportEvent.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ImportEvent"
+  endpoint {
+    name: "summary.ImportEvent"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_InTopK.pbtxt b/tensorflow/core/api_def/java_api/api_def_InTopK.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bf90fd0f814824528815e844918b0c46de11adca
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_InTopK.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "InTopK"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_InTopKV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_InTopKV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..400ee714624943669a216be70bdb6b09a4743cba
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_InTopKV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "InTopKV2"
+  endpoint {
+    name: "nn.InTopK"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_InitializeTable.pbtxt b/tensorflow/core/api_def/java_api/api_def_InitializeTable.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..30e3d66bfe9c5285165808ee74de2e21abac7dd0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_InitializeTable.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "InitializeTable"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_InitializeTableFromTextFile.pbtxt b/tensorflow/core/api_def/java_api/api_def_InitializeTableFromTextFile.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..786e22cd474647bf203cfdb58d4e2ef027f37ee9
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_InitializeTableFromTextFile.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "InitializeTableFromTextFile"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_InitializeTableFromTextFileV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_InitializeTableFromTextFileV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9a4f70220184eb5eb36d14c0066a4e7fdf837abc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_InitializeTableFromTextFileV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "InitializeTableFromTextFileV2"
+  endpoint {
+    name: "InitializeTableFromTextFile"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_InitializeTableV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_InitializeTableV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d7a9a813d070b6bda559a70bb31f4e4096000661
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_InitializeTableV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "InitializeTableV2"
+  endpoint {
+    name: "InitializeTable"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_InplaceAdd.pbtxt b/tensorflow/core/api_def/java_api/api_def_InplaceAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3d157ab7f83f6d7efaa1a996cfa5ed42f16d5fde
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_InplaceAdd.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "InplaceAdd"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_InplaceSub.pbtxt b/tensorflow/core/api_def/java_api/api_def_InplaceSub.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b2ed54964304de7f813ba7e14c250eec5a53bb77
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_InplaceSub.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "InplaceSub"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_InplaceUpdate.pbtxt b/tensorflow/core/api_def/java_api/api_def_InplaceUpdate.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..91041b43abd3e8d06e31fe2d5f3b9f1ecd96aaea
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_InplaceUpdate.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "InplaceUpdate"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_InterleaveDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_InterleaveDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..25e40ac2dbe604f7f6165da35c1d674e07fdbb2c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_InterleaveDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "InterleaveDataset"
+  endpoint {
+    name: "data.InterleaveDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Inv.pbtxt b/tensorflow/core/api_def/java_api/api_def_Inv.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..49f3e6c0429b85d7b03e34b9c408a95d0a112151
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Inv.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Inv"
+  endpoint {
+    name: "linalg.Inv"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_InvGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_InvGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d3bfa78e99ce6c734d6acb9e606666737322c477
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_InvGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "InvGrad"
+  endpoint {
+    name: "nn.InvGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Invert.pbtxt b/tensorflow/core/api_def/java_api/api_def_Invert.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9898bfa003d040cb23c00655a8fe41241261d702
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Invert.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Invert"
+  endpoint {
+    name: "bitwise.Invert"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_InvertPermutation.pbtxt b/tensorflow/core/api_def/java_api/api_def_InvertPermutation.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9ee103f554e47d090f133c6cb72edc67a4c430f5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_InvertPermutation.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "InvertPermutation"
+  endpoint {
+    name: "math.InvertPermutation"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IsBoostedTreesEnsembleInitialized.pbtxt b/tensorflow/core/api_def/java_api/api_def_IsBoostedTreesEnsembleInitialized.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..35f17b79243e0f651bd24f0da7675e84fe632935
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IsBoostedTreesEnsembleInitialized.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "IsBoostedTreesEnsembleInitialized"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IsBoostedTreesQuantileStreamResourceInitialized.pbtxt b/tensorflow/core/api_def/java_api/api_def_IsBoostedTreesQuantileStreamResourceInitialized.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d6cd2dcc45133637a8462f8176e02159d1968371
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IsBoostedTreesQuantileStreamResourceInitialized.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "IsBoostedTreesQuantileStreamResourceInitialized"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IsFinite.pbtxt b/tensorflow/core/api_def/java_api/api_def_IsFinite.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fce589039916324a4493cf4000ff8685087b214d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IsFinite.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "IsFinite"
+  endpoint {
+    name: "math.IsFinite"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IsInf.pbtxt b/tensorflow/core/api_def/java_api/api_def_IsInf.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..823c1d72812c30e14b6b080dc47366f3c341a85c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IsInf.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "IsInf"
+  endpoint {
+    name: "math.IsInf"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IsNan.pbtxt b/tensorflow/core/api_def/java_api/api_def_IsNan.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..58805bf99f60467680d1fac62a3cdc78bb1dd746
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IsNan.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "IsNan"
+  endpoint {
+    name: "math.IsNan"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IsVariableInitialized.pbtxt b/tensorflow/core/api_def/java_api/api_def_IsVariableInitialized.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7bf51da2da97ea46cd8465d4d8f86b4851ebc64a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IsVariableInitialized.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "IsVariableInitialized"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Iterator.pbtxt b/tensorflow/core/api_def/java_api/api_def_Iterator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0f4d9967c3cc1650b65a9e75624fa7fc21358910
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Iterator.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Iterator"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IteratorFromStringHandle.pbtxt b/tensorflow/core/api_def/java_api/api_def_IteratorFromStringHandle.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0a4e443cde0a3ed5239aaa73b31288f1c593d0fd
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IteratorFromStringHandle.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "IteratorFromStringHandle"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IteratorFromStringHandleV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_IteratorFromStringHandleV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..86745a3a56470c32f5a59b7323b549d6959b0008
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IteratorFromStringHandleV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "IteratorFromStringHandleV2"
+  endpoint {
+    name: "data.IteratorFromStringHandle"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IteratorGetNext.pbtxt b/tensorflow/core/api_def/java_api/api_def_IteratorGetNext.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2248ff9f5f5efffe2fee50d109bfe3404b8f7cc0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IteratorGetNext.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "IteratorGetNext"
+  endpoint {
+    name: "data.IteratorGetNext"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IteratorGetNextAsOptional.pbtxt b/tensorflow/core/api_def/java_api/api_def_IteratorGetNextAsOptional.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ae02a0e0171a9eee8aa925c6e9f22c427bfda2bd
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IteratorGetNextAsOptional.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "IteratorGetNextAsOptional"
+  endpoint {
+    name: "data.IteratorGetNextAsOptional"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IteratorGetNextSync.pbtxt b/tensorflow/core/api_def/java_api/api_def_IteratorGetNextSync.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4aa7c07a7741752fd90f5ff592f54cd5c8b21b3c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IteratorGetNextSync.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "IteratorGetNextSync"
+  endpoint {
+    name: "data.IteratorGetNextSync"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IteratorToStringHandle.pbtxt b/tensorflow/core/api_def/java_api/api_def_IteratorToStringHandle.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7413ec846e7e6a44c35c34ee8ed35f418946bfc0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IteratorToStringHandle.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "IteratorToStringHandle"
+  endpoint {
+    name: "data.IteratorToStringHandle"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_IteratorV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_IteratorV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..87720d441b368abf1d5532b59019c7860c5739a1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_IteratorV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "IteratorV2"
+  endpoint {
+    name: "data.Iterator"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_L2Loss.pbtxt b/tensorflow/core/api_def/java_api/api_def_L2Loss.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c348e0f0e0b5524f9f98310d9be7392579a7a5fd
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_L2Loss.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "L2Loss"
+  endpoint {
+    name: "nn.L2Loss"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LMDBReader.pbtxt b/tensorflow/core/api_def/java_api/api_def_LMDBReader.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b6cd7bdbb7ee578434d69bd943fc0f6d5c7b486a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LMDBReader.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LMDBReader"
+  endpoint {
+    name: "io.LmdbReader"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LRN.pbtxt b/tensorflow/core/api_def/java_api/api_def_LRN.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d16fea31843a5cb785988cf72fd86a60247479e2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LRN.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LRN"
+  endpoint {
+    name: "nn.LocalResponseNormalization"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LRNGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_LRNGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a50e738d785b2ca8c5b032221aa9259e4bb521a4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LRNGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LRNGrad"
+  endpoint {
+    name: "nn.LocalResponseNormalizationGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LatencyStatsDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_LatencyStatsDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bf0bf2a5ed712c8bf36af21fb8a59dab9eec82ab
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LatencyStatsDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LatencyStatsDataset"
+  endpoint {
+    name: "data.LatencyStatsDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LeakyRelu.pbtxt b/tensorflow/core/api_def/java_api/api_def_LeakyRelu.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..31a4f01167bda19909928b34e32b11746aadca61
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LeakyRelu.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LeakyRelu"
+  endpoint {
+    name: "nn.LeakyRelu"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LeakyReluGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_LeakyReluGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9899c64c13e12c5184b09f0935b0ee360d41edc9
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LeakyReluGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LeakyReluGrad"
+  endpoint {
+    name: "data.LeakyReluGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LearnedUnigramCandidateSampler.pbtxt b/tensorflow/core/api_def/java_api/api_def_LearnedUnigramCandidateSampler.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5f193da1be112a169f632611c64328cbf3d0dadc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LearnedUnigramCandidateSampler.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LearnedUnigramCandidateSampler"
+  endpoint {
+    name: "nn.LearnedUnigramCandidateSampler"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LeftShift.pbtxt b/tensorflow/core/api_def/java_api/api_def_LeftShift.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..44a8727e40f4415c3ee197c64f4ea8a93c46a621
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LeftShift.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LeftShift"
+  endpoint {
+    name: "bitwise.LeftShift"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Less.pbtxt b/tensorflow/core/api_def/java_api/api_def_Less.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..577d2556b81c37282cea21b342b9ea557f531590
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Less.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Less"
+  endpoint {
+    name: "math.Less"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LessEqual.pbtxt b/tensorflow/core/api_def/java_api/api_def_LessEqual.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6cad35c62263042f6683c5f8437c84b345462e53
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LessEqual.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LessEqual"
+  endpoint {
+    name: "math.LessEqual"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Lgamma.pbtxt b/tensorflow/core/api_def/java_api/api_def_Lgamma.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..eb7bc9660c01044fb8ff0282a50a04c79b257536
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Lgamma.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Lgamma"
+  endpoint {
+    name: "math.Lgamma"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LinSpace.pbtxt b/tensorflow/core/api_def/java_api/api_def_LinSpace.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..599c310021a0aa1a511b818949a5816574dce0d9
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LinSpace.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LinSpace"
+  endpoint {
+    name: "LinSpace"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ListDiff.pbtxt b/tensorflow/core/api_def/java_api/api_def_ListDiff.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..aa94c958f17463d2c616306acbe000acd465fafc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ListDiff.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ListDiff"
+  endpoint {
+    name: "SetDiff1d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LoadAndRemapMatrix.pbtxt b/tensorflow/core/api_def/java_api/api_def_LoadAndRemapMatrix.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..54ee68fde44b6c8954927532d5953ae49ef08e1e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LoadAndRemapMatrix.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LoadAndRemapMatrix"
+  endpoint {
+    name: "linalg.LoadAndRemapMatrix"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Log.pbtxt b/tensorflow/core/api_def/java_api/api_def_Log.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9d11c26c71ab1c783a09b76fc89dc0f47903880b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Log.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Log"
+  endpoint {
+    name: "math.Log"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Log1p.pbtxt b/tensorflow/core/api_def/java_api/api_def_Log1p.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6cc1d6e6c82254efac0d941fb97538f501d76daf
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Log1p.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Log1p"
+  endpoint {
+    name: "math.Log1p"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LogMatrixDeterminant.pbtxt b/tensorflow/core/api_def/java_api/api_def_LogMatrixDeterminant.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5e52d9ecedce8a98977bfaa2035f22e18c3171b8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LogMatrixDeterminant.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LogMatrixDeterminant"
+  endpoint {
+    name: "linalg.LogMatrixDeterminant"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LogSoftmax.pbtxt b/tensorflow/core/api_def/java_api/api_def_LogSoftmax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..19518a71ea474e4182cbda83b6ff54bcf1b92618
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LogSoftmax.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LogSoftmax"
+  endpoint {
+    name: "nn.LogSoftmax"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LogUniformCandidateSampler.pbtxt b/tensorflow/core/api_def/java_api/api_def_LogUniformCandidateSampler.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bdcf01c20fde6dab975a7299f494212b40a6cc6c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LogUniformCandidateSampler.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LogUniformCandidateSampler"
+  endpoint {
+    name: "random.LogUniformCandidateSampler"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LogicalAnd.pbtxt b/tensorflow/core/api_def/java_api/api_def_LogicalAnd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..12921dd932c3e89f70f43724bb503424ffd4d672
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LogicalAnd.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LogicalAnd"
+  endpoint {
+    name: "math.LogicalAnd"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LogicalNot.pbtxt b/tensorflow/core/api_def/java_api/api_def_LogicalNot.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9e0960958ed00291bec72e81a127608659df9ebd
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LogicalNot.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LogicalNot"
+  endpoint {
+    name: "math.LogicalNot"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LogicalOr.pbtxt b/tensorflow/core/api_def/java_api/api_def_LogicalOr.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6c834e464146a3dcaf23afca7da925a7347d6117
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LogicalOr.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LogicalOr"
+  endpoint {
+    name: "math.LogicalOr"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LookupTableExport.pbtxt b/tensorflow/core/api_def/java_api/api_def_LookupTableExport.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..29885222a491a4c8499ab6a9b18fd7a3e7d28415
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LookupTableExport.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "LookupTableExport"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LookupTableExportV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_LookupTableExportV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d780f2a21d7f60a9004b62def0e5be5b21354e5d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LookupTableExportV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LookupTableExportV2"
+  endpoint {
+    name: "LookupTableExport"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LookupTableFind.pbtxt b/tensorflow/core/api_def/java_api/api_def_LookupTableFind.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..23f7facaa24f796ca65864771701af7eb2a69d76
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LookupTableFind.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "LookupTableFind"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LookupTableFindV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_LookupTableFindV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2247547b62b358e4f94d324feba0a15706bfd0fa
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LookupTableFindV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LookupTableFindV2"
+  endpoint {
+    name: "LookupTableFind"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LookupTableImport.pbtxt b/tensorflow/core/api_def/java_api/api_def_LookupTableImport.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f87ea9c0736ce28c1d28ca9dbc9cd7eebce32e48
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LookupTableImport.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "LookupTableImport"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LookupTableImportV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_LookupTableImportV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a39cffa12d1a11129870b7110e64a1c9b22ab2dc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LookupTableImportV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LookupTableImportV2"
+  endpoint {
+    name: "LookupTableImport"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LookupTableInsert.pbtxt b/tensorflow/core/api_def/java_api/api_def_LookupTableInsert.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a45b3f52a5b87c47255caf9c1e94a64520734a0e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LookupTableInsert.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "LookupTableInsert"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LookupTableInsertV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_LookupTableInsertV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..037b743b6be20ac5313218549d2f7fe100f1f40d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LookupTableInsertV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LookupTableInsertV2"
+  endpoint {
+    name: "LookupTableInsert"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LookupTableRemoveV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_LookupTableRemoveV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..61f6d8db36a1c50659bb1a905832caff878cafa4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LookupTableRemoveV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LookupTableRemoveV2"
+  endpoint {
+    name: "LookupTableRemove"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LookupTableSize.pbtxt b/tensorflow/core/api_def/java_api/api_def_LookupTableSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..391dc5dfadf027c40e15c523bb91873daae187e8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LookupTableSize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "LookupTableSize"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LookupTableSizeV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_LookupTableSizeV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ad646e25a6b4ce9f529f2aab60b79b767a284fba
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LookupTableSizeV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "LookupTableSizeV2"
+  endpoint {
+    name: "LookupTableSize"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LoopCond.pbtxt b/tensorflow/core/api_def/java_api/api_def_LoopCond.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..492f78f62ee1ece2ce7dfecfbca63db711d9b847
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LoopCond.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "LoopCond"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_LowerBound.pbtxt b/tensorflow/core/api_def/java_api/api_def_LowerBound.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..31f1d3038ca1a41ed8e57af2233ee95af29ca67e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_LowerBound.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "LowerBound"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MakeIterator.pbtxt b/tensorflow/core/api_def/java_api/api_def_MakeIterator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9dfa761370d0e817785fa714788cf88d98721a4a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MakeIterator.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MakeIterator"
+  endpoint {
+    name: "data.MakeIterator"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MapAndBatchDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_MapAndBatchDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cb96bf63d8f0d15bb47f92a7f8e1ea055ed8208f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MapAndBatchDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MapAndBatchDataset"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MapAndBatchDatasetV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_MapAndBatchDatasetV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b29c21888fae479bc7ced724a711bd724e71241f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MapAndBatchDatasetV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MapAndBatchDatasetV2"
+  endpoint {
+    name: "data.MapAndBatchDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MapClear.pbtxt b/tensorflow/core/api_def/java_api/api_def_MapClear.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3ed9bf8a5d8641c8ca136feaf788fceeb185c5d8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MapClear.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "MapClear"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MapDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_MapDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fdec9eb857411cc007769bb4e8e28eaffda9f60f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MapDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MapDataset"
+  endpoint {
+    name: "data.MapDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MapDefun.pbtxt b/tensorflow/core/api_def/java_api/api_def_MapDefun.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..43b1dc722c031d7303b3e8f640c40c617fd88ab5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MapDefun.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "MapDefun"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MapIncompleteSize.pbtxt b/tensorflow/core/api_def/java_api/api_def_MapIncompleteSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..659993e42b0d707b7eccca92fe7bee2b5b6865ed
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MapIncompleteSize.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "MapIncompleteSize"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MapPeek.pbtxt b/tensorflow/core/api_def/java_api/api_def_MapPeek.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..eb1bd158f0dec1fd2955a28aea210a73c1d26ad2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MapPeek.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "MapPeek"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MapSize.pbtxt b/tensorflow/core/api_def/java_api/api_def_MapSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4da151152c90e0175ede0f74cd130812f88f6232
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MapSize.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "MapSize"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MapStage.pbtxt b/tensorflow/core/api_def/java_api/api_def_MapStage.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6d9f66cfc4884c7e86e97d82c3f017c59e7b189a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MapStage.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "MapStage"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MapUnstage.pbtxt b/tensorflow/core/api_def/java_api/api_def_MapUnstage.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bb118f0fcb9787ce3277643c056cae525e770462
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MapUnstage.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "MapUnstage"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MapUnstageNoKey.pbtxt b/tensorflow/core/api_def/java_api/api_def_MapUnstageNoKey.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1004e96482a6de52b4fec020c9fd620e7b43534c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MapUnstageNoKey.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "MapUnstageNoKey"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MatMul.pbtxt b/tensorflow/core/api_def/java_api/api_def_MatMul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fe4b8405b9c62c757dcee1fe1b4c7579d1a33458
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MatMul.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MatMul"
+  endpoint {
+    name: "linalg.MatMul"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MatchingFiles.pbtxt b/tensorflow/core/api_def/java_api/api_def_MatchingFiles.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bb7b096895760ee466675450aca627a7b42cdbd7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MatchingFiles.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MatchingFiles"
+  endpoint {
+    name: "io.MatchingFiles"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MatchingFilesDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_MatchingFilesDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..749257c37b5b9a88058464547091aba401d5490a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MatchingFilesDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MatchingFilesDataset"
+  endpoint {
+    name: "data.MatchingFilesDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MatrixBandPart.pbtxt b/tensorflow/core/api_def/java_api/api_def_MatrixBandPart.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..eaf426c00eff9ff469ff72240229bde9da946d5b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MatrixBandPart.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MatrixBandPart"
+  endpoint {
+    name: "linalg.BandPart"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MatrixDeterminant.pbtxt b/tensorflow/core/api_def/java_api/api_def_MatrixDeterminant.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b56d2dad3a8a1509fd1b859b754974e9aab1c4d4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MatrixDeterminant.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MatrixDeterminant"
+  endpoint {
+    name: "linalg.Det"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MatrixDiag.pbtxt b/tensorflow/core/api_def/java_api/api_def_MatrixDiag.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..839cd82b8910cf7d30a73e6a85b1e8b60bc20d0a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MatrixDiag.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MatrixDiag"
+  endpoint {
+    name: "linalg.Diag"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MatrixDiagPart.pbtxt b/tensorflow/core/api_def/java_api/api_def_MatrixDiagPart.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..008f75c1e99b199fbde137ea809ed9987c211d09
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MatrixDiagPart.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MatrixDiagPart"
+  endpoint {
+    name: "linalg.DiagPart"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MatrixExponential.pbtxt b/tensorflow/core/api_def/java_api/api_def_MatrixExponential.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fb232dab983c8ab1c9911ec1c2861a359106e5d8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MatrixExponential.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MatrixExponential"
+  endpoint {
+    name: "linalg.MatrixExponential"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MatrixInverse.pbtxt b/tensorflow/core/api_def/java_api/api_def_MatrixInverse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..68721fc78db0cb4a0979e8e1208e539d1ec53e16
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MatrixInverse.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MatrixInverse"
+  endpoint {
+    name: "linalg.Inv"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MatrixLogarithm.pbtxt b/tensorflow/core/api_def/java_api/api_def_MatrixLogarithm.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..04137ffae7980d597783494a011c881227a68be2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MatrixLogarithm.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MatrixLogarithm"
+  endpoint {
+    name: "linalg.MatrixLogarithm"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MatrixSetDiag.pbtxt b/tensorflow/core/api_def/java_api/api_def_MatrixSetDiag.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..61001fa38cf2fb324bb5ad6ad5b28fa4ed189513
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MatrixSetDiag.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MatrixSetDiag"
+  endpoint {
+    name: "linalg.SetDiag"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MatrixSolve.pbtxt b/tensorflow/core/api_def/java_api/api_def_MatrixSolve.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..02c21448bba33e86593e36a8cd9f1c190235ee89
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MatrixSolve.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MatrixSolve"
+  endpoint {
+    name: "linalg.Solve"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MatrixSolveLs.pbtxt b/tensorflow/core/api_def/java_api/api_def_MatrixSolveLs.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9cee578ec287b45025561d5957bdf2ce1ad12ad4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MatrixSolveLs.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MatrixSolveLs"
+  endpoint {
+    name: "linalg.MatrixSolveLs"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MatrixSquareRoot.pbtxt b/tensorflow/core/api_def/java_api/api_def_MatrixSquareRoot.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..14c7624fe37a9846d804c2006789e2beeb006c79
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MatrixSquareRoot.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MatrixSquareRoot"
+  endpoint {
+    name: "linalg.Sqrtm"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MatrixTriangularSolve.pbtxt b/tensorflow/core/api_def/java_api/api_def_MatrixTriangularSolve.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1f61e99efe411f7f21ed902ed6ce3edc54b8fc48
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MatrixTriangularSolve.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MatrixTriangularSolve"
+  endpoint {
+    name: "linalg.TriangularSolve"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Max.pbtxt b/tensorflow/core/api_def/java_api/api_def_Max.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..03868720edf1e010b7d4c5d70ae39822b55274dd
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Max.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Max"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MaxPool.pbtxt b/tensorflow/core/api_def/java_api/api_def_MaxPool.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5ebc9e6a6f970dac1870800afe56257e9a754ded
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MaxPool.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MaxPool"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MaxPool3D.pbtxt b/tensorflow/core/api_def/java_api/api_def_MaxPool3D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..17aeb6a8c9313a9b9a954952f58c5413446f3c19
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MaxPool3D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MaxPool3D"
+  endpoint {
+    name: "nn.MaxPool3d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MaxPool3DGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_MaxPool3DGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ca7a7be835b5761d7079c17c69a8fe2f7aef9f2b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MaxPool3DGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MaxPool3DGrad"
+  endpoint {
+    name: "nn.MaxPool3dGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MaxPool3DGradGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_MaxPool3DGradGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c70aa3fe30a26a250a72c09d050bb8b764fa19e2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MaxPool3DGradGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MaxPool3DGradGrad"
+  endpoint {
+    name: "nn.MaxPool3dGradGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MaxPoolGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_MaxPoolGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9ad85fa08e6f35bb69fb48b1de44e9314bec60fc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MaxPoolGrad.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MaxPoolGrad"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MaxPoolGradGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_MaxPoolGradGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3375ebc77d5b8ebb7c6c61eb89653600d6dd47b5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MaxPoolGradGrad.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MaxPoolGradGrad"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MaxPoolGradGradV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_MaxPoolGradGradV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2ca8a7b02217d3561bde7dbfae737067ae442d96
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MaxPoolGradGradV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MaxPoolGradGradV2"
+  endpoint {
+    name: "nn.MaxPoolGradGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MaxPoolGradGradWithArgmax.pbtxt b/tensorflow/core/api_def/java_api/api_def_MaxPoolGradGradWithArgmax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d43cf7447cc3f50495d3b0c3dde4c3c436f1a19d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MaxPoolGradGradWithArgmax.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MaxPoolGradGradWithArgmax"
+  endpoint {
+    name: "nn.MaxPoolGradGradWithArgmax"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MaxPoolGradV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_MaxPoolGradV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..556dd0be502c343a23ba522d77989aa6384d6979
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MaxPoolGradV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MaxPoolGradV2"
+  endpoint {
+    name: "nn.MaxPoolGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MaxPoolGradWithArgmax.pbtxt b/tensorflow/core/api_def/java_api/api_def_MaxPoolGradWithArgmax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c10701f555eee78bfbad8ae67937693d764047dd
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MaxPoolGradWithArgmax.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MaxPoolGradWithArgmax"
+  endpoint {
+    name: "nn.MaxPoolGradWithArgmax"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MaxPoolV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_MaxPoolV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..846349435533953c6d0a3be3a4bbc4c0b9631bf9
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MaxPoolV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MaxPoolV2"
+  endpoint {
+    name: "nn.MaxPool"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MaxPoolWithArgmax.pbtxt b/tensorflow/core/api_def/java_api/api_def_MaxPoolWithArgmax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..43630534cb70e52425cb4d188b889a7ed2984c4d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MaxPoolWithArgmax.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MaxPoolWithArgmax"
+  endpoint {
+    name: "nn.MaxPoolWithArgmax"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Maximum.pbtxt b/tensorflow/core/api_def/java_api/api_def_Maximum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1df9c6053050c3defd97f39c8ea02c134d2cc1a7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Maximum.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Maximum"
+  endpoint {
+    name: "math.Maximum"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Mean.pbtxt b/tensorflow/core/api_def/java_api/api_def_Mean.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7bdcdc3d74e58ccd5aa754440be5c7e241c448d5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Mean.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Mean"
+  endpoint {
+    name: "math.Mean"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Merge.pbtxt b/tensorflow/core/api_def/java_api/api_def_Merge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..954d5085adf1ef31e37889fdacedd10e41f28b36
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Merge.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Merge"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MergeSummary.pbtxt b/tensorflow/core/api_def/java_api/api_def_MergeSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f52c7c0996b8fa5b1d75e97cc4f7ae78df08b561
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MergeSummary.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MergeSummary"
+  endpoint {
+    name: "summary.MergeSummary"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MergeV2Checkpoints.pbtxt b/tensorflow/core/api_def/java_api/api_def_MergeV2Checkpoints.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8899c8c4ed768cd74d87ee89fd1c00f344163919
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MergeV2Checkpoints.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MergeV2Checkpoints"
+  endpoint {
+    name: "train.MergeV2Checkpoints"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Mfcc.pbtxt b/tensorflow/core/api_def/java_api/api_def_Mfcc.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6cb04e73ff400129bc0cd02568c7dbd365c026fa
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Mfcc.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Mfcc"
+  endpoint {
+    name: "audio.Mfcc"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Min.pbtxt b/tensorflow/core/api_def/java_api/api_def_Min.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..72894c1ffd44d179583a9b87d04a8d5f7ee807c3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Min.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Min"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Minimum.pbtxt b/tensorflow/core/api_def/java_api/api_def_Minimum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..69f76a982995cee304e9d877b3d9e126c0c553a4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Minimum.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Minimum"
+  endpoint {
+    name: "math.Minimum"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MirrorPad.pbtxt b/tensorflow/core/api_def/java_api/api_def_MirrorPad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e1cb766f8fdca3fdcd60376e020a6dcf22ff04b9
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MirrorPad.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "MirrorPad"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MirrorPadGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_MirrorPadGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ddd8ab3ba1d8f94b1e175781e4070f1893ac7db8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MirrorPadGrad.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "MirrorPadGrad"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Mod.pbtxt b/tensorflow/core/api_def/java_api/api_def_Mod.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..76fbbe97a89cf80b756198be85f8072eeda7835d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Mod.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Mod"
+  endpoint {
+    name: "math.Mod"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ModelDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ModelDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..143c7afd720c64e581e36bec25af9f2c3cb62378
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ModelDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ModelDataset"
+  endpoint {
+    name: "data.ModelDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Mul.pbtxt b/tensorflow/core/api_def/java_api/api_def_Mul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..605e110931e21d73e738190aa70207989e334bb5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Mul.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Mul"
+  endpoint {
+    name: "math.Mul"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MultiDeviceIterator.pbtxt b/tensorflow/core/api_def/java_api/api_def_MultiDeviceIterator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..81eabf9bdcfac310fc15eb652585a930a025246d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MultiDeviceIterator.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MultiDeviceIterator"
+  endpoint {
+    name: "data.MultiDeviceIterator"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MultiDeviceIteratorFromStringHandle.pbtxt b/tensorflow/core/api_def/java_api/api_def_MultiDeviceIteratorFromStringHandle.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4006f72d5204cad2b502f3e87579ad31bd623100
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MultiDeviceIteratorFromStringHandle.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MultiDeviceIteratorFromStringHandle"
+  endpoint {
+    name: "data.MultiDeviceIteratorFromStringHandle"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MultiDeviceIteratorGetNextFromShard.pbtxt b/tensorflow/core/api_def/java_api/api_def_MultiDeviceIteratorGetNextFromShard.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a7e6fc1508249b937a1b454e11aa75c2073999f1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MultiDeviceIteratorGetNextFromShard.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MultiDeviceIteratorGetNextFromShard"
+  endpoint {
+    name: "data.MultiDeviceIteratorGetNextFromShard"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MultiDeviceIteratorInit.pbtxt b/tensorflow/core/api_def/java_api/api_def_MultiDeviceIteratorInit.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1663bc5c22659d3087eb7048677de01d1a4fce8f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MultiDeviceIteratorInit.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MultiDeviceIteratorInit"
+  endpoint {
+    name: "data.MultiDeviceIteratorInit"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MultiDeviceIteratorToStringHandle.pbtxt b/tensorflow/core/api_def/java_api/api_def_MultiDeviceIteratorToStringHandle.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ff061da390b0bef3c53514aa8abcbfb7a954ad04
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MultiDeviceIteratorToStringHandle.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MultiDeviceIteratorToStringHandle"
+  endpoint {
+    name: "data.MultiDeviceIteratorToStringHandle"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Multinomial.pbtxt b/tensorflow/core/api_def/java_api/api_def_Multinomial.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bd98bb10b4b25591684f6a7423827ee171bf1855
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Multinomial.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Multinomial"
+  endpoint {
+    name: "random.Multinomial"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MutableDenseHashTable.pbtxt b/tensorflow/core/api_def/java_api/api_def_MutableDenseHashTable.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d7494815d8d2ca6fcbd2020d6b180851e9844e78
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MutableDenseHashTable.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MutableDenseHashTable"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MutableDenseHashTableV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_MutableDenseHashTableV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d1f7f26848b5304afebe5ff87e77af5ec91350dc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MutableDenseHashTableV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MutableDenseHashTableV2"
+  endpoint {
+    name: "MutableDenseHashTable"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MutableHashTable.pbtxt b/tensorflow/core/api_def/java_api/api_def_MutableHashTable.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c446ff8b27e76beb50a7cb4eb6249297f74ac342
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MutableHashTable.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MutableHashTable"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MutableHashTableOfTensors.pbtxt b/tensorflow/core/api_def/java_api/api_def_MutableHashTableOfTensors.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..76df883d7d4fea91ef58a07513be9b0ece097b65
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MutableHashTableOfTensors.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MutableHashTableOfTensors"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MutableHashTableOfTensorsV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_MutableHashTableOfTensorsV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f6d7451267bf09d2f07a2fbf365a4c9578a745ce
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MutableHashTableOfTensorsV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MutableHashTableOfTensorsV2"
+  endpoint {
+    name: "MutableHashTableOfTensors"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MutableHashTableV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_MutableHashTableV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..45d619d6747742866c6d70bdc3044d6d69f8717e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MutableHashTableV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MutableHashTableV2"
+  endpoint {
+    name: "MutableHashTable"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MutexLock.pbtxt b/tensorflow/core/api_def/java_api/api_def_MutexLock.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..75c7be5286ab638ad8ea1075879a03bbe24a5447
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MutexLock.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "MutexLock"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_MutexV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_MutexV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f89cd106432d0a7323cee0c0b4d32f62d5996b0a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_MutexV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "MutexV2"
+  endpoint {
+    name: "Mutex"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_NcclAllReduce.pbtxt b/tensorflow/core/api_def/java_api/api_def_NcclAllReduce.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c7133d4a4a8455f84dad8b2305f8ab560042bc15
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_NcclAllReduce.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "NcclAllReduce"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_NcclBroadcast.pbtxt b/tensorflow/core/api_def/java_api/api_def_NcclBroadcast.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b9b1a345c933deb321e4a8eb0e919b5f8166e7cf
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_NcclBroadcast.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "NcclBroadcast"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_NcclReduce.pbtxt b/tensorflow/core/api_def/java_api/api_def_NcclReduce.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..18dc89017782ea1264229f5efc650044adf6bdfe
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_NcclReduce.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "NcclReduce"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Neg.pbtxt b/tensorflow/core/api_def/java_api/api_def_Neg.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c7e9ede2a568f4a33c8a10611c5215270e01ea41
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Neg.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Neg"
+  endpoint {
+    name: "math.Neg"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_NegTrain.pbtxt b/tensorflow/core/api_def/java_api/api_def_NegTrain.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..eb62186362a511b39ab39b42b8e56ea6993a6f71
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_NegTrain.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "NegTrain"
+  endpoint {
+    name: "train.NegTrain"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_NextIteration.pbtxt b/tensorflow/core/api_def/java_api/api_def_NextIteration.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bc63f6ada14200917958aff04dabcb13486d1572
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_NextIteration.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "NextIteration"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_NoOp.pbtxt b/tensorflow/core/api_def/java_api/api_def_NoOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..337fb5da146b30bed15fc3f7082e5f91acfa7dd8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_NoOp.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "NoOp"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_NonMaxSuppression.pbtxt b/tensorflow/core/api_def/java_api/api_def_NonMaxSuppression.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..49ac0de4ce7635b9b750fada815f33c24cc5c86d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_NonMaxSuppression.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "NonMaxSuppression"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_NonMaxSuppressionV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_NonMaxSuppressionV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fabf5c62157c7ae9148d2f43382e4d13af2be2d3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_NonMaxSuppressionV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "NonMaxSuppressionV2"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_NonMaxSuppressionV3.pbtxt b/tensorflow/core/api_def/java_api/api_def_NonMaxSuppressionV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0aefcb55098ccc2226447fc080bfa3fdd56f9faa
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_NonMaxSuppressionV3.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "NonMaxSuppressionV3"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_NonMaxSuppressionV4.pbtxt b/tensorflow/core/api_def/java_api/api_def_NonMaxSuppressionV4.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e71de7f4a6a2a36f9b8a3bb7c29a28c70fcd790b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_NonMaxSuppressionV4.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "NonMaxSuppressionV4"
+  endpoint {
+    name: "image.NonMaxSuppression"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_NonMaxSuppressionWithOverlaps.pbtxt b/tensorflow/core/api_def/java_api/api_def_NonMaxSuppressionWithOverlaps.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..06fa52920d2b93e0df8b8032d9b89d7b59e78f17
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_NonMaxSuppressionWithOverlaps.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "NonMaxSuppressionWithOverlaps"
+  endpoint {
+    name: "image.NonMaxSuppressionWithOverlaps"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_NotEqual.pbtxt b/tensorflow/core/api_def/java_api/api_def_NotEqual.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0d99af40b5d326a92180194dc8a6d01cf578b5a8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_NotEqual.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "NotEqual"
+  endpoint {
+    name: "math.NotEqual"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_NthElement.pbtxt b/tensorflow/core/api_def/java_api/api_def_NthElement.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..57097e634aadf233de10770ea14faa0a2ec2e9dc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_NthElement.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "NthElement"
+  endpoint {
+    name: "nn.NthElement"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_OneHot.pbtxt b/tensorflow/core/api_def/java_api/api_def_OneHot.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..66872d5eb8ead8571136a57f9ba2e5cb7e04c9e2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_OneHot.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "OneHot"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_OneShotIterator.pbtxt b/tensorflow/core/api_def/java_api/api_def_OneShotIterator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..39af8cefde4f88d5a2973f93f3b861374af94de0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_OneShotIterator.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "OneShotIterator"
+  endpoint {
+    name: "data.OneShotIterator"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_OnesLike.pbtxt b/tensorflow/core/api_def/java_api/api_def_OnesLike.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..97abe0814a9b988999e965cc5c5d61c62a4a4763
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_OnesLike.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "OnesLike"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_OptimizeDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_OptimizeDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e7ddf97d1ab755efe9b3bdb05552f8ec39f86d1f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_OptimizeDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "OptimizeDataset"
+  endpoint {
+    name: "data.OptimizeDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_OptionalFromValue.pbtxt b/tensorflow/core/api_def/java_api/api_def_OptionalFromValue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d251fd5d9482e6d6e93cad59e9c2b5be3f01f32e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_OptionalFromValue.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "OptionalFromValue"
+  endpoint {
+    name: "data.OptionalFromValue"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_OptionalGetValue.pbtxt b/tensorflow/core/api_def/java_api/api_def_OptionalGetValue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7fcdb5ac6943fe0a44b838e2adbb8f95e01a2f5d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_OptionalGetValue.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "OptionalGetValue"
+  endpoint {
+    name: "data.OptionalGetValue"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_OptionalHasValue.pbtxt b/tensorflow/core/api_def/java_api/api_def_OptionalHasValue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4ffa15b564c547dbe391232d9eff5375b1b40242
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_OptionalHasValue.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "OptionalHasValue"
+  endpoint {
+    name: "data.OptionalHasValue"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_OptionalNone.pbtxt b/tensorflow/core/api_def/java_api/api_def_OptionalNone.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cec29a42ae2e8cb1765369bb1c586ec73e5735c9
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_OptionalNone.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "OptionalNone"
+  endpoint {
+    name: "data.OptionalNone"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_OrderedMapClear.pbtxt b/tensorflow/core/api_def/java_api/api_def_OrderedMapClear.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e36b2aa3e4f46c0e725b6b17817869900de5cd33
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_OrderedMapClear.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "OrderedMapClear"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_OrderedMapIncompleteSize.pbtxt b/tensorflow/core/api_def/java_api/api_def_OrderedMapIncompleteSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c609e9e50a2e4e88700788cd81b0a9a649df1e42
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_OrderedMapIncompleteSize.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "OrderedMapIncompleteSize"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_OrderedMapPeek.pbtxt b/tensorflow/core/api_def/java_api/api_def_OrderedMapPeek.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..06fc2182773658f2b8bcc2bf32550ef32faa3e64
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_OrderedMapPeek.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "OrderedMapPeek"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_OrderedMapSize.pbtxt b/tensorflow/core/api_def/java_api/api_def_OrderedMapSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7beef3f376ba6d7917bb753394984c5b65ae8108
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_OrderedMapSize.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "OrderedMapSize"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_OrderedMapStage.pbtxt b/tensorflow/core/api_def/java_api/api_def_OrderedMapStage.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8b579d21a0e38923edd9dcb306f085f3be12828a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_OrderedMapStage.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "OrderedMapStage"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_OrderedMapUnstage.pbtxt b/tensorflow/core/api_def/java_api/api_def_OrderedMapUnstage.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d3d6862fbe0ae91e9f3e7089a6f65a52f9570832
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_OrderedMapUnstage.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "OrderedMapUnstage"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_OrderedMapUnstageNoKey.pbtxt b/tensorflow/core/api_def/java_api/api_def_OrderedMapUnstageNoKey.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3d275c85d942a1dd6175c1b7fcddf2644200772e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_OrderedMapUnstageNoKey.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "OrderedMapUnstageNoKey"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Pack.pbtxt b/tensorflow/core/api_def/java_api/api_def_Pack.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d9e9897d77cde18efc7a9f6831a71e0819280198
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Pack.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Pack"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Pad.pbtxt b/tensorflow/core/api_def/java_api/api_def_Pad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f83f451552d924b37b214e58849a2da38772957f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Pad.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Pad"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_PadV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_PadV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ffc121645600ef2f159b3356407e540e2565a1c3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_PadV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "PadV2"
+  endpoint {
+    name: "Pad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_PaddedBatchDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_PaddedBatchDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7cec77427cfe6314eb223bf28898853a4667856e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_PaddedBatchDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "PaddedBatchDataset"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_PaddedBatchDatasetV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_PaddedBatchDatasetV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..22dfe84f0cadd59a83cae827deb4ba9c33fbe19d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_PaddedBatchDatasetV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "PaddedBatchDatasetV2"
+  endpoint {
+    name: "data.PaddedBatchDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_PaddingFIFOQueue.pbtxt b/tensorflow/core/api_def/java_api/api_def_PaddingFIFOQueue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..03db4bf185d78bbafd9040aa9f6d39dcdf1e81d3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_PaddingFIFOQueue.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "PaddingFIFOQueue"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_PaddingFIFOQueueV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_PaddingFIFOQueueV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..605025be7910993c691c22c2eb9cd681145b9e58
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_PaddingFIFOQueueV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "PaddingFIFOQueueV2"
+  endpoint {
+    name: "io.PaddingFifoQueue"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ParallelConcat.pbtxt b/tensorflow/core/api_def/java_api/api_def_ParallelConcat.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0b17c7d256926f975afb2b21bb83f4298248bfeb
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ParallelConcat.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ParallelConcat"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ParallelDynamicStitch.pbtxt b/tensorflow/core/api_def/java_api/api_def_ParallelDynamicStitch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..79a55b763f356478d6cb35adda051ea9ab9d8b5a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ParallelDynamicStitch.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ParallelDynamicStitch"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ParallelInterleaveDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ParallelInterleaveDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6a985d24fa74062c2262961abacd7e60654617b5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ParallelInterleaveDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ParallelInterleaveDataset"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ParallelInterleaveDatasetV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_ParallelInterleaveDatasetV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6c74149a6c3497ec75f62c0c227d09558b543493
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ParallelInterleaveDatasetV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ParallelInterleaveDatasetV2"
+  endpoint {
+    name: "ParallelInterleaveDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ParallelMapDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ParallelMapDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5091bb9cec21eff4ee4b168bf11e3001abe4fe9f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ParallelMapDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ParallelMapDataset"
+  endpoint {
+    name: "data.ParallelMapDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ParameterizedTruncatedNormal.pbtxt b/tensorflow/core/api_def/java_api/api_def_ParameterizedTruncatedNormal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..26ca2fc86fa2fbc2aaefac6b0be210e98dd3947e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ParameterizedTruncatedNormal.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ParameterizedTruncatedNormal"
+  endpoint {
+    name: "random.ParameterizedTruncatedNormal"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ParseExample.pbtxt b/tensorflow/core/api_def/java_api/api_def_ParseExample.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6e7384f2e337ff2d4d597cb1ee4229e3ef7a01ac
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ParseExample.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ParseExample"
+  endpoint {
+    name: "io.ParseExample"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ParseExampleDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ParseExampleDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4309645093d8dd55e92a39da2963e7c7f7c14041
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ParseExampleDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ParseExampleDataset"
+  endpoint {
+    name: "data.ParseExampleDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ParseSequenceExample.pbtxt b/tensorflow/core/api_def/java_api/api_def_ParseSequenceExample.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..09ee715ac715f5cad68141e337c575f040019db5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ParseSequenceExample.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ParseSequenceExample"
+  endpoint {
+    name: "io.ParseSequenceExample"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ParseSingleExample.pbtxt b/tensorflow/core/api_def/java_api/api_def_ParseSingleExample.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7559957b35e1169edc0a399448656c355a41e007
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ParseSingleExample.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ParseSingleExample"
+  endpoint {
+    name: "io.ParseSingleExample"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ParseSingleSequenceExample.pbtxt b/tensorflow/core/api_def/java_api/api_def_ParseSingleSequenceExample.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..00eb325b2a437e955e9fcb9c18f241726892e1c0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ParseSingleSequenceExample.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ParseSingleSequenceExample"
+  endpoint {
+    name: "io.ParseSingleSequenceExample"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ParseTensor.pbtxt b/tensorflow/core/api_def/java_api/api_def_ParseTensor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a78cdc7f5c44df056fb587cdd263062fd333f227
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ParseTensor.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ParseTensor"
+  endpoint {
+    name: "io.ParseTensor"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_PartitionedCall.pbtxt b/tensorflow/core/api_def/java_api/api_def_PartitionedCall.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1ac10b602803a38abb00dd3766ca409cdab51c27
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_PartitionedCall.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "PartitionedCall"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Placeholder.pbtxt b/tensorflow/core/api_def/java_api/api_def_Placeholder.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5e6daa2ae42f1bab3bfc00f0630b1727c77352d4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Placeholder.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Placeholder"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_PlaceholderV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_PlaceholderV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..419bdf10f794b051b16ef45e5762cac4a8bff087
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_PlaceholderV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "PlaceholderV2"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_PlaceholderWithDefault.pbtxt b/tensorflow/core/api_def/java_api/api_def_PlaceholderWithDefault.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..59067a9c688782a998fda06206b6ec460b882afd
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_PlaceholderWithDefault.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "PlaceholderWithDefault"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Polygamma.pbtxt b/tensorflow/core/api_def/java_api/api_def_Polygamma.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..746b3375a0f05c5c31b4298d534e8d5177586d90
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Polygamma.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Polygamma"
+  endpoint {
+    name: "math.Polygamma"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_PopulationCount.pbtxt b/tensorflow/core/api_def/java_api/api_def_PopulationCount.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6aacdf4d1218bc5d05c8332beff51a34a34f0377
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_PopulationCount.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "PopulationCount"
+  endpoint {
+    name: "math.PopulationCount"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Pow.pbtxt b/tensorflow/core/api_def/java_api/api_def_Pow.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e7eaaed6952d221c074466024e7f5e49ea47c7ab
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Pow.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Pow"
+  endpoint {
+    name: "math.Pow"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_PrefetchDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_PrefetchDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..beaad84d15344f0bb26421c4757f4e508e1f0c3a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_PrefetchDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "PrefetchDataset"
+  endpoint {
+    name: "data.PrefetchDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_PrependFromQueueAndPaddedBatchDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_PrependFromQueueAndPaddedBatchDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7c9d509b16346776a9e085edab06f0587d73d0ae
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_PrependFromQueueAndPaddedBatchDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "PrependFromQueueAndPaddedBatchDataset"
+  endpoint {
+    name: "data.PrependFromQueueAndPaddedBatchDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_PreventGradient.pbtxt b/tensorflow/core/api_def/java_api/api_def_PreventGradient.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4731f21af4c8a56dde6c8c9e573d75fad1effffe
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_PreventGradient.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "PreventGradient"
+  endpoint {
+    name: "train.PreventGradient"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Print.pbtxt b/tensorflow/core/api_def/java_api/api_def_Print.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..21a085a1c2c5d4568f5609e9efc080efaefe7c59
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Print.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Print"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_PrintV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_PrintV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d1e4d74b1e3b8a324abbf04d4f9ca37b814f20bd
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_PrintV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "PrintV2"
+  endpoint {
+    name: "Print"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_PriorityQueue.pbtxt b/tensorflow/core/api_def/java_api/api_def_PriorityQueue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0a9909d122f8b3b60d605d2ce458c57b907b4fdc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_PriorityQueue.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "PriorityQueue"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_PriorityQueueV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_PriorityQueueV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1f6a6f2906b00e538215af03249594faea9141c2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_PriorityQueueV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "PriorityQueueV2"
+  endpoint {
+    name: "io.PriorityQueue"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Prod.pbtxt b/tensorflow/core/api_def/java_api/api_def_Prod.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7a98972a871d4ed3be1554f184b33b236b6bca67
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Prod.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Prod"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_PyFunc.pbtxt b/tensorflow/core/api_def/java_api/api_def_PyFunc.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5f1f7c47ca040c2b2a9392523be7f955b5310725
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_PyFunc.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "PyFunc"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_PyFuncStateless.pbtxt b/tensorflow/core/api_def/java_api/api_def_PyFuncStateless.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..684ef58d1b258c4448df5095d3dfcc4050d5f583
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_PyFuncStateless.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "PyFuncStateless"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Qr.pbtxt b/tensorflow/core/api_def/java_api/api_def_Qr.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c0d31e79778dce19d70db6d1824b00451e3e280b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Qr.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Qr"
+  endpoint {
+    name: "linalg.Qr"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QuantizeAndDequantize.pbtxt b/tensorflow/core/api_def/java_api/api_def_QuantizeAndDequantize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e6ba0ce8b8a239c5c9d27a3a8fe853b616e940f6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QuantizeAndDequantize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizeAndDequantize"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QuantizeAndDequantizeV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_QuantizeAndDequantizeV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..678a77113cf483c90240a1c5802849bddbf4b219
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QuantizeAndDequantizeV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QuantizeAndDequantizeV2"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QuantizeAndDequantizeV3.pbtxt b/tensorflow/core/api_def/java_api/api_def_QuantizeAndDequantizeV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c9e52e770a1f48d965b4775174e1f4f471f02017
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QuantizeAndDequantizeV3.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QuantizeAndDequantizeV3"
+  endpoint {
+    name: "quantization.QuantizeAndDequantize"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QuantizeDownAndShrinkRange.pbtxt b/tensorflow/core/api_def/java_api/api_def_QuantizeDownAndShrinkRange.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7119f53cb29a9c23771b5e0c288dc1f0a515cac2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QuantizeDownAndShrinkRange.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QuantizeDownAndShrinkRange"
+  endpoint {
+    name: "quantization.QuantizeDownAndShrinkRange"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QuantizeV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_QuantizeV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..25c9c3bdce467e06b92929dfc2476f5704443442
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QuantizeV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QuantizeV2"
+  endpoint {
+    name: "quantization.Quantize"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QuantizedAdd.pbtxt b/tensorflow/core/api_def/java_api/api_def_QuantizedAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1a2bfa36ed7d7bb0bbf9a5d46dc637e6746b99b8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QuantizedAdd.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QuantizedAdd"
+  endpoint {
+    name: "math.QuantizedAdd"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QuantizedAvgPool.pbtxt b/tensorflow/core/api_def/java_api/api_def_QuantizedAvgPool.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7f16fb046dd975fe8480ab6d415a101e28256d28
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QuantizedAvgPool.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QuantizedAvgPool"
+  endpoint {
+    name: "nn.QuantizedAvgPool"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QuantizedBatchNormWithGlobalNormalization.pbtxt b/tensorflow/core/api_def/java_api/api_def_QuantizedBatchNormWithGlobalNormalization.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2101ce5d69071c1d6e8f0667b8492d86fcc2aa70
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QuantizedBatchNormWithGlobalNormalization.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QuantizedBatchNormWithGlobalNormalization"
+  endpoint {
+    name: "nn.QuantizedBatchNormWithGlobalNormalization"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QuantizedBiasAdd.pbtxt b/tensorflow/core/api_def/java_api/api_def_QuantizedBiasAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c18fb1c5741abf83da47e75aff393380e7917ec7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QuantizedBiasAdd.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QuantizedBiasAdd"
+  endpoint {
+    name: "nn.QuantizedBiasAdd"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QuantizedConcat.pbtxt b/tensorflow/core/api_def/java_api/api_def_QuantizedConcat.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cb5d0c0fda46090ae282265b501147c1ceb32fbc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QuantizedConcat.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "QuantizedConcat"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QuantizedConv2D.pbtxt b/tensorflow/core/api_def/java_api/api_def_QuantizedConv2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9909157603eb329af85d3b574eca44adb4f8d979
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QuantizedConv2D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QuantizedConv2D"
+  endpoint {
+    name: "nn.QuantizedConv2d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QuantizedInstanceNorm.pbtxt b/tensorflow/core/api_def/java_api/api_def_QuantizedInstanceNorm.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bbd2e7fc5ec85c5f343450df78575dbff8e8b988
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QuantizedInstanceNorm.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QuantizedInstanceNorm"
+  endpoint {
+    name: "nn.QuantizedInstanceNorm"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QuantizedMatMul.pbtxt b/tensorflow/core/api_def/java_api/api_def_QuantizedMatMul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7962cbade6f9b4a4b59795b1312c43587e4fb2c1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QuantizedMatMul.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QuantizedMatMul"
+  endpoint {
+    name: "linalg.QuantizedMatMul"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QuantizedMaxPool.pbtxt b/tensorflow/core/api_def/java_api/api_def_QuantizedMaxPool.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..57e900494e994bcead9de2c302741bfdf2f1e7e5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QuantizedMaxPool.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QuantizedMaxPool"
+  endpoint {
+    name: "nn.QuantizedMaxPool"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QuantizedMul.pbtxt b/tensorflow/core/api_def/java_api/api_def_QuantizedMul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..be23ef706e206b45f4d75173fb3aa952617de8fa
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QuantizedMul.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QuantizedMul"
+  endpoint {
+    name: "math.QuantizedMul"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QuantizedRelu.pbtxt b/tensorflow/core/api_def/java_api/api_def_QuantizedRelu.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7b9a11640ba6ac6b3483e256694ea37aeac8153e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QuantizedRelu.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QuantizedRelu"
+  endpoint {
+    name: "nn.QuantizedRelu"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QuantizedRelu6.pbtxt b/tensorflow/core/api_def/java_api/api_def_QuantizedRelu6.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6a60e2112ee4609596e6c2adb652314d9159bbf4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QuantizedRelu6.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QuantizedRelu6"
+  endpoint {
+    name: "nn.QuantizedRelu6"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QuantizedReluX.pbtxt b/tensorflow/core/api_def/java_api/api_def_QuantizedReluX.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cc47d322b270118b9e8497826525ad8f7e9f13d5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QuantizedReluX.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QuantizedReluX"
+  endpoint {
+    name: "nn.QuantizedReluX"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QuantizedReshape.pbtxt b/tensorflow/core/api_def/java_api/api_def_QuantizedReshape.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4557853d94f550776c5d4e257d3872ca7f5e9e0e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QuantizedReshape.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "QuantizedReshape"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QuantizedResizeBilinear.pbtxt b/tensorflow/core/api_def/java_api/api_def_QuantizedResizeBilinear.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..81dca490944cb07f90d00e76baf40559776eff7d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QuantizedResizeBilinear.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QuantizedResizeBilinear"
+  endpoint {
+    name: "image.QuantizedResizeBilinear"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QueueClose.pbtxt b/tensorflow/core/api_def/java_api/api_def_QueueClose.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4a6bada741da9c3f0634320be8b6a122c6d9d490
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QueueClose.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QueueClose"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QueueCloseV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_QueueCloseV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ce779650e5bf3f58bc4bd4320013ad2c1c6eaf88
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QueueCloseV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QueueCloseV2"
+  endpoint {
+    name: "io.QueueClose"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QueueDequeue.pbtxt b/tensorflow/core/api_def/java_api/api_def_QueueDequeue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..45c811a6b44f63a6aa5abe19e97d3df6623629d9
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QueueDequeue.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QueueDequeue"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QueueDequeueMany.pbtxt b/tensorflow/core/api_def/java_api/api_def_QueueDequeueMany.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9e088ef2587c071dc07706c1f8d38e5b63dc9b27
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QueueDequeueMany.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QueueDequeueMany"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QueueDequeueManyV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_QueueDequeueManyV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..10fe198ff261c6638710b43f92060a577009e7f6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QueueDequeueManyV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QueueDequeueManyV2"
+  endpoint {
+    name: "io.QueueDequeueMany"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QueueDequeueUpTo.pbtxt b/tensorflow/core/api_def/java_api/api_def_QueueDequeueUpTo.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b96e568c4114f2044c00b982227e7b2438cf09d2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QueueDequeueUpTo.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QueueDequeueUpTo"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QueueDequeueUpToV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_QueueDequeueUpToV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fadea0926b6b7f80733775246669a7d7e0f36014
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QueueDequeueUpToV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QueueDequeueUpToV2"
+  endpoint {
+    name: "io.QueueDequeueUpTo"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QueueDequeueV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_QueueDequeueV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7ba03afbfebddb0bde4399ae36319848683a978d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QueueDequeueV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QueueDequeueV2"
+  endpoint {
+    name: "io.QueueDequeue"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QueueEnqueue.pbtxt b/tensorflow/core/api_def/java_api/api_def_QueueEnqueue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2945c46d6eb5a9cbe0d468cac90e62c83aa22395
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QueueEnqueue.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QueueEnqueue"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QueueEnqueueMany.pbtxt b/tensorflow/core/api_def/java_api/api_def_QueueEnqueueMany.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..442ddcbc038ab14aa6f72bb49414c77bd63bdc00
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QueueEnqueueMany.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QueueEnqueueMany"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QueueEnqueueManyV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_QueueEnqueueManyV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..be3fed47896d4a56f77b23395d94145d4fdbf463
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QueueEnqueueManyV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QueueEnqueueManyV2"
+  endpoint {
+    name: "io.QueueEnqueueMany"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QueueEnqueueV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_QueueEnqueueV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e71a2211e1e72cd2df836fdf2fe91ad88e0aed7c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QueueEnqueueV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QueueEnqueueV2"
+  endpoint {
+    name: "io.QueueEnqueue"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QueueIsClosed.pbtxt b/tensorflow/core/api_def/java_api/api_def_QueueIsClosed.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0b51b208b76ddca86a31a5e30def3f972b244024
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QueueIsClosed.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QueueIsClosed"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QueueIsClosedV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_QueueIsClosedV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..148d313a6d317479a8bf70d0ee82a55b8cb7a57a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QueueIsClosedV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QueueIsClosedV2"
+  endpoint {
+    name: "io.QueueIsClosed"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QueueSize.pbtxt b/tensorflow/core/api_def/java_api/api_def_QueueSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9bd7244d6814624d15c7d631f825e806bbbc6a2d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QueueSize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "QueueSize"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_QueueSizeV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_QueueSizeV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e93e07a2b32e9cc96620c3d3c68f5446068a69e7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_QueueSizeV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "QueueSizeV2"
+  endpoint {
+    name: "io.QueueSize"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RFFT.pbtxt b/tensorflow/core/api_def/java_api/api_def_RFFT.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9576600e756b00f1a7f8d01ad89955bc91e7d726
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RFFT.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RFFT"
+  endpoint {
+    name: "signal.Rfft"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RFFT2D.pbtxt b/tensorflow/core/api_def/java_api/api_def_RFFT2D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..41d638b26a8a3f6824dcf00429cc7de533362e1f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RFFT2D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RFFT2D"
+  endpoint {
+    name: "signal.Rfft2d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RFFT3D.pbtxt b/tensorflow/core/api_def/java_api/api_def_RFFT3D.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7a762d22e5cdf8d0ec910bc23ca54f39ba07d06e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RFFT3D.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RFFT3D"
+  endpoint {
+    name: "signal.Rfft3d"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RGBToHSV.pbtxt b/tensorflow/core/api_def/java_api/api_def_RGBToHSV.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1b35891ae2cc53ca905c8fbf99f02c6f0a7bb49a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RGBToHSV.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RGBToHSV"
+  endpoint {
+    name: "image.RgbToHsv"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RaggedGather.pbtxt b/tensorflow/core/api_def/java_api/api_def_RaggedGather.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f060daeb6571631f2bb029676b3aa5b3a28be6d7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RaggedGather.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "RaggedGather"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RaggedRange.pbtxt b/tensorflow/core/api_def/java_api/api_def_RaggedRange.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b1a5bab0acee54ec89a67b5e63edce5a6b080d3a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RaggedRange.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "RaggedRange"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RaggedTensorToSparse.pbtxt b/tensorflow/core/api_def/java_api/api_def_RaggedTensorToSparse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f049f47b46c45d9a44e85f57f9821e69c1ea869e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RaggedTensorToSparse.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "RaggedTensorToSparse"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RandomCrop.pbtxt b/tensorflow/core/api_def/java_api/api_def_RandomCrop.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a3b8a3cecda8720f9fae41fd4fd7501c0c8c0414
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RandomCrop.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RandomCrop"
+  endpoint {
+    name: "image.RandomCrop"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RandomDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_RandomDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..43921e6eafe6d49d5dd9f28e7d6d7f3e4c206527
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RandomDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RandomDataset"
+  endpoint {
+    name: "data.RandomDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RandomGamma.pbtxt b/tensorflow/core/api_def/java_api/api_def_RandomGamma.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..927f2c56937f08a440cd94b09ca90b553df9182a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RandomGamma.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RandomGamma"
+  endpoint {
+    name: "random.RandomGamma"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RandomGammaGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_RandomGammaGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9257495c9bd15239036c6a3971f60dd7fa2c6466
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RandomGammaGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RandomGammaGrad"
+  endpoint {
+    name: "random.RandomGammaGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RandomPoisson.pbtxt b/tensorflow/core/api_def/java_api/api_def_RandomPoisson.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d1ea79502415c8096097ffc16e99979ac0b3383a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RandomPoisson.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "RandomPoisson"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RandomPoissonV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_RandomPoissonV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5efe01bf4011c628a1b54e95de3a3bd83f529ff2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RandomPoissonV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RandomPoissonV2"
+  endpoint {
+    name: "random.RandomPoisson"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RandomShuffle.pbtxt b/tensorflow/core/api_def/java_api/api_def_RandomShuffle.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6dcd12fd375d7db664b4070efaed4a8525e3a4bc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RandomShuffle.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RandomShuffle"
+  endpoint {
+    name: "random.RandomShuffle"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RandomShuffleQueue.pbtxt b/tensorflow/core/api_def/java_api/api_def_RandomShuffleQueue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9660121a073124e7dd600d4d70c60195fcd4b1cb
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RandomShuffleQueue.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "RandomShuffleQueue"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RandomShuffleQueueV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_RandomShuffleQueueV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..779363303cac64b4037d3f1fb0de883760424247
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RandomShuffleQueueV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RandomShuffleQueueV2"
+  endpoint {
+    name: "io.RandomShuffleQueue"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RandomStandardNormal.pbtxt b/tensorflow/core/api_def/java_api/api_def_RandomStandardNormal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..413fc87bdf06dcbe59dbbbb8c549512d16afe885
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RandomStandardNormal.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RandomStandardNormal"
+  endpoint {
+    name: "random.RandomStandardNormal"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RandomUniform.pbtxt b/tensorflow/core/api_def/java_api/api_def_RandomUniform.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2a93df83df65462a717b7aa347e69c2036d4efb6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RandomUniform.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RandomUniform"
+  endpoint {
+    name: "random.RandomUniform"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RandomUniformInt.pbtxt b/tensorflow/core/api_def/java_api/api_def_RandomUniformInt.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a1383f406a384905e9b0862e6f07345149da9eeb
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RandomUniformInt.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RandomUniformInt"
+  endpoint {
+    name: "random.RandomUniformInt"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Range.pbtxt b/tensorflow/core/api_def/java_api/api_def_Range.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..24f3787a8e38f2deb446724cd35ca7acfa57a424
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Range.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Range"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RangeDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_RangeDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b6180261b053e1949778e7fd327d1e5db53c19b0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RangeDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RangeDataset"
+  endpoint {
+    name: "data.RangeDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Rank.pbtxt b/tensorflow/core/api_def/java_api/api_def_Rank.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..baa84aab10d65e5cbc58d2d2ab0ece807c7ef8ff
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Rank.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Rank"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReadFile.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReadFile.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f74250d42f76b10d18d7fa39fb3f43099e7eb137
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReadFile.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ReadFile"
+  endpoint {
+    name: "io.ReadFile"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReadVariableOp.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReadVariableOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..018886d5b825586a687024f513065cf4b24456bc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReadVariableOp.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ReadVariableOp"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReaderNumRecordsProduced.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReaderNumRecordsProduced.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b087d11182e77847ff8adac8241aa1f67ae2a3c1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReaderNumRecordsProduced.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ReaderNumRecordsProduced"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReaderNumRecordsProducedV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReaderNumRecordsProducedV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..54a30abe18728a01409fa5ec1786a2b75ea37ef9
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReaderNumRecordsProducedV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ReaderNumRecordsProducedV2"
+  endpoint {
+    name: "io.ReaderNumRecordsProduced"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReaderNumWorkUnitsCompleted.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReaderNumWorkUnitsCompleted.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e30e97fd08cf750a9d332adaf276c4be508993fe
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReaderNumWorkUnitsCompleted.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ReaderNumWorkUnitsCompleted"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReaderNumWorkUnitsCompletedV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReaderNumWorkUnitsCompletedV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0904ba19e5398ba60054ca9c31646e718a8daabb
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReaderNumWorkUnitsCompletedV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ReaderNumWorkUnitsCompletedV2"
+  endpoint {
+    name: "io.ReaderNumWorkUnitsCompleted"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReaderRead.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReaderRead.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8f98d88bda84ebff75f84989b1983dbccec23a22
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReaderRead.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ReaderRead"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReaderReadUpTo.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReaderReadUpTo.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d418b00b2736061599852356aa99e9acb7da6c3c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReaderReadUpTo.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ReaderReadUpTo"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReaderReadUpToV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReaderReadUpToV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..777d09fa2c6c5cc9a6d2c8210dde6bfcb6777120
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReaderReadUpToV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ReaderReadUpToV2"
+  endpoint {
+    name: "io.ReaderReadUpTo"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReaderReadV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReaderReadV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a5d45bd1db9a05597e531aa87a3e68797af9e1ab
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReaderReadV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ReaderReadV2"
+  endpoint {
+    name: "io.ReaderRead"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReaderReset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReaderReset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e6041caabd1590d077abaff3bf169314a2d7c558
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReaderReset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ReaderReset"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReaderResetV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReaderResetV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..265a3442f5345e2cfdcae269e8d10ab0f336d6e4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReaderResetV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ReaderResetV2"
+  endpoint {
+    name: "io.ReaderReset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReaderRestoreState.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReaderRestoreState.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0aa0ec595d600d338e03d6520bf4824ee25199ab
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReaderRestoreState.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ReaderRestoreState"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReaderRestoreStateV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReaderRestoreStateV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4728ce7796bed9d8591bfb104235abb3fa87d105
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReaderRestoreStateV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ReaderRestoreStateV2"
+  endpoint {
+    name: "io.ReaderRestoreState"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReaderSerializeState.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReaderSerializeState.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5e23e285fb13e692191cea3b6d6b9272b765337d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReaderSerializeState.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ReaderSerializeState"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReaderSerializeStateV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReaderSerializeStateV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..aa396095b176239ffbae67be76d644c0cb421183
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReaderSerializeStateV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ReaderSerializeStateV2"
+  endpoint {
+    name: "io.ReaderSerializeState"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Real.pbtxt b/tensorflow/core/api_def/java_api/api_def_Real.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3aaea928dec5b68cb501cfa48882abf8fd720b6b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Real.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Real"
+  endpoint {
+    name: "math.Real"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RealDiv.pbtxt b/tensorflow/core/api_def/java_api/api_def_RealDiv.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..415bd29da04f1244c521530a17488c9623048ef3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RealDiv.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RealDiv"
+  endpoint {
+    name: "math.RealDiv"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Reciprocal.pbtxt b/tensorflow/core/api_def/java_api/api_def_Reciprocal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1c0d787c24ebf717fb1b7ba227fa28cbaf05f115
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Reciprocal.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Reciprocal"
+  endpoint {
+    name: "math.Reciprocal"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReciprocalGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReciprocalGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..68879669b5f76a3b8751aa8b7d690f64dc5ead5b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReciprocalGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ReciprocalGrad"
+  endpoint {
+    name: "math.ReciprocalGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RecordInput.pbtxt b/tensorflow/core/api_def/java_api/api_def_RecordInput.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c4807c68dee88b427a8ad77caec06c2755ccd790
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RecordInput.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RecordInput"
+  endpoint {
+    name: "random.RecordInput"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReduceDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReduceDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b16c5dbb96c7f5005025b47b4cf39be6a66b76d6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReduceDataset.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ReduceDataset"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReduceJoin.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReduceJoin.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..79193222018b7f09dc0db09b718762f79f13033d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReduceJoin.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ReduceJoin"
+  endpoint {
+    name: "strings.ReduceJoin"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RefEnter.pbtxt b/tensorflow/core/api_def/java_api/api_def_RefEnter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9cd2281bc6a1f738a990546044050970c08219be
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RefEnter.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "RefEnter"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RefExit.pbtxt b/tensorflow/core/api_def/java_api/api_def_RefExit.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..67e8d39c9af68802cbcd99b58e4b4a99034734dc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RefExit.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "RefExit"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RefIdentity.pbtxt b/tensorflow/core/api_def/java_api/api_def_RefIdentity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..53483bd1bb79064dda0a95717c41b06ab99b3852
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RefIdentity.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "RefIdentity"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RefMerge.pbtxt b/tensorflow/core/api_def/java_api/api_def_RefMerge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6ea3145841b3496d13d88327e49033f9c2ad49c8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RefMerge.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "RefMerge"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RefNextIteration.pbtxt b/tensorflow/core/api_def/java_api/api_def_RefNextIteration.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5d008204b7ad34421c90cbed831ea3349502e14b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RefNextIteration.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "RefNextIteration"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RefSelect.pbtxt b/tensorflow/core/api_def/java_api/api_def_RefSelect.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d7cda2d5b3020400dcffd2b78906af16df45b063
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RefSelect.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "RefSelect"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RefSwitch.pbtxt b/tensorflow/core/api_def/java_api/api_def_RefSwitch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..78261d8b7e60dbcd57213bcee9ad01fbf9cd8e2a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RefSwitch.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "RefSwitch"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RegexFullMatch.pbtxt b/tensorflow/core/api_def/java_api/api_def_RegexFullMatch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7f88e24eac6c94f58c2bd7a431ea022ac2c2e1d8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RegexFullMatch.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RegexFullMatch"
+  endpoint {
+    name: "strings.RegexFullMatch"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RegexReplace.pbtxt b/tensorflow/core/api_def/java_api/api_def_RegexReplace.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..01c9e93cab7b8253b518302853a2ab2cba6f748c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RegexReplace.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RegexReplace"
+  endpoint {
+    name: "strings.RegexReplace"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Relu.pbtxt b/tensorflow/core/api_def/java_api/api_def_Relu.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..39d7fec4526622788a9ef85bb0a23d5a1b97646a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Relu.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Relu"
+  endpoint {
+    name: "nn.Relu"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Relu6.pbtxt b/tensorflow/core/api_def/java_api/api_def_Relu6.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fcc012b5033a1311bc57bd82a8a33beb0f38c9a6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Relu6.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Relu6"
+  endpoint {
+    name: "nn.Relu6"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Relu6Grad.pbtxt b/tensorflow/core/api_def/java_api/api_def_Relu6Grad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..33e959cc7b36a8fc03f57a3c82ab0f1734696e43
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Relu6Grad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Relu6Grad"
+  endpoint {
+    name: "nn.Relu6Grad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReluGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReluGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ec4a8b5f972d1075dde62c74ad9ae987c4e8984c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReluGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ReluGrad"
+  endpoint {
+    name: "nn.ReluGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RemoteCall.pbtxt b/tensorflow/core/api_def/java_api/api_def_RemoteCall.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a0ddb01784792f3c215cbdf60fd51748f16b5916
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RemoteCall.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "RemoteCall"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RemoteFusedGraphExecute.pbtxt b/tensorflow/core/api_def/java_api/api_def_RemoteFusedGraphExecute.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b73e633ef2c45c8e9e11f2a46b6d5fd5f33fd3e1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RemoteFusedGraphExecute.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "RemoteFusedGraphExecute"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RepeatDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_RepeatDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..871824f8867b1e1b7d777711b4a27b1df27f81da
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RepeatDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RepeatDataset"
+  endpoint {
+    name: "data.RepeatDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RequantizationRange.pbtxt b/tensorflow/core/api_def/java_api/api_def_RequantizationRange.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d365e8992558ce804133114c2fbcf59db30d4379
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RequantizationRange.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RequantizationRange"
+  endpoint {
+    name: "quantization.RequantizationRange"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Requantize.pbtxt b/tensorflow/core/api_def/java_api/api_def_Requantize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d397cde4db932987e3a2a8880673c2f7e2561d1a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Requantize.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Requantize"
+  endpoint {
+    name: "quantization.Requantize"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Reshape.pbtxt b/tensorflow/core/api_def/java_api/api_def_Reshape.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4bf3a409d1afcd5c2b763037a5b1b05073499701
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Reshape.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Reshape"
+  endpoint {
+    name: "Reshape"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResizeArea.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResizeArea.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5358c18d4b423749e63bc77bcfe206005988a9d7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResizeArea.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResizeArea"
+  endpoint {
+    name: "image.ResizeArea"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResizeBicubic.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResizeBicubic.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0d0942e26624981a4e1966765fee11bb390c0813
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResizeBicubic.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResizeBicubic"
+  endpoint {
+    name: "image.ResizeBicubic"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResizeBicubicGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResizeBicubicGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..12e61dc8238e78f6f4e55f2cdd224488212b7a52
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResizeBicubicGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResizeBicubicGrad"
+  endpoint {
+    name: "image.ResizeBicubicGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResizeBilinear.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResizeBilinear.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ad123744a920406562327e368b2acacdd3ab719a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResizeBilinear.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResizeBilinear"
+  endpoint {
+    name: "image.ResizeBilinear"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResizeBilinearGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResizeBilinearGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..04f3e9f19efb782f33eac84cb4aa747588217991
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResizeBilinearGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResizeBilinearGrad"
+  endpoint {
+    name: "image.ResizeBilinearGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResizeNearestNeighbor.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResizeNearestNeighbor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..86ad39a51738067d9e4c5fc5fec98c070f1cc504
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResizeNearestNeighbor.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResizeNearestNeighbor"
+  endpoint {
+    name: "image.ResizeNearestNeighbor"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResizeNearestNeighborGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResizeNearestNeighborGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..70eeb906fab4e3bcb3884841d2dd777016958973
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResizeNearestNeighborGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResizeNearestNeighborGrad"
+  endpoint {
+    name: "image.ResizeNearestNeighborGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceApplyAdaMax.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceApplyAdaMax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ff57bd5849a9b291fee68fb658f4df651dbdff13
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceApplyAdaMax.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceApplyAdaMax"
+  endpoint {
+    name: "train.ResourceApplyAdaMax"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceApplyAdadelta.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceApplyAdadelta.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d4369f0eade908f5e7c45f9089167f5433357a51
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceApplyAdadelta.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceApplyAdadelta"
+  endpoint {
+    name: "train.ResourceApplyAdadelta"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceApplyAdagrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceApplyAdagrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9bf7c20ee7b17be285e5f876c90f57eb854ecef0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceApplyAdagrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceApplyAdagrad"
+  endpoint {
+    name: "train.ResourceApplyAdagrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceApplyAdagradDA.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceApplyAdagradDA.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..afddaaff5737e5e3cdd0dd660a9d6db3fbd0e64f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceApplyAdagradDA.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceApplyAdagradDA"
+  endpoint {
+    name: "train.ResourceApplyAdagradDa"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceApplyAdam.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceApplyAdam.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..390bd999c4555ec12401c8c49309a6400c281e5d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceApplyAdam.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceApplyAdam"
+  endpoint {
+    name: "train.ResourceApplyAdam"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceApplyAddSign.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceApplyAddSign.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bf944477be351677541625b38e9fafe62eb0030e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceApplyAddSign.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceApplyAddSign"
+  endpoint {
+    name: "train.ResourceApplyAddSign"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceApplyCenteredRMSProp.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceApplyCenteredRMSProp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..85c97b430a83aeda97859392d5064320e3828b4d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceApplyCenteredRMSProp.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceApplyCenteredRMSProp"
+  endpoint {
+    name: "train.ResourceApplyCenteredRmsProp"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceApplyFtrl.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceApplyFtrl.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..61bec5bb10983da12d8acb36d3d6e7afcaf2416e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceApplyFtrl.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceApplyFtrl"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceApplyFtrlV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceApplyFtrlV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8209fd607e172f72b5a39ad52bca683fd13eb56d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceApplyFtrlV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceApplyFtrlV2"
+  endpoint {
+    name: "train.ResourceApplyFtrl"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceApplyGradientDescent.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceApplyGradientDescent.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a54fed14d1733fdaab577c1cf04ab59eb50b35b2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceApplyGradientDescent.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceApplyGradientDescent"
+  endpoint {
+    name: "train.ResourceApplyGradientDescent"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceApplyMomentum.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceApplyMomentum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..debb0a8131eb9542ca95ce4fc01f96298b2ae3ab
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceApplyMomentum.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceApplyMomentum"
+  endpoint {
+    name: "train.ResourceApplyMomentum"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceApplyPowerSign.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceApplyPowerSign.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..96df22c81f0cd66054372545cbcccf2b8a8d7d7e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceApplyPowerSign.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceApplyPowerSign"
+  endpoint {
+    name: "train.ResourceApplyPowerSign"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceApplyProximalAdagrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceApplyProximalAdagrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..809b8b3af3baacfeb28ec282f33b9aa64920676a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceApplyProximalAdagrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceApplyProximalAdagrad"
+  endpoint {
+    name: "train.ResourceApplyProximalAdagrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceApplyProximalGradientDescent.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceApplyProximalGradientDescent.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c9ff5a499d5dba071f1a3cea8e2266602e1fc88c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceApplyProximalGradientDescent.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceApplyProximalGradientDescent"
+  endpoint {
+    name: "train.ResourceApplyProximalGradientDescent"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceApplyRMSProp.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceApplyRMSProp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fa3adf759e0d9e1fe9d0cb0d9f18a77722b197d0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceApplyRMSProp.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceApplyRMSProp"
+  endpoint {
+    name: "train.ResourceApplyRmsProp"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceCountUpTo.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceCountUpTo.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..439c1f17557de26121c00074122f224cfd1bdd1f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceCountUpTo.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ResourceCountUpTo"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceGather.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceGather.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..79c6e8abbcb14e0075854d6f1584f7d8ff4e0759
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceGather.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ResourceGather"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceScatterAdd.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceScatterAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e4184e33bf00d1a5c8825e406da311eb371436ba
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceScatterAdd.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ResourceScatterAdd"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceScatterDiv.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceScatterDiv.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3e21c24a588968daef1d519d039230e6b02b7617
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceScatterDiv.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ResourceScatterDiv"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceScatterMax.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceScatterMax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d25b14272d63e750d163906bbfff7b5244446f87
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceScatterMax.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ResourceScatterMax"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceScatterMin.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceScatterMin.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6243cc1ae3e54e4d77a7615c6260206a95242fbe
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceScatterMin.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ResourceScatterMin"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceScatterMul.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceScatterMul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..393e5556c0beb1e10e412813bf571100d3770efd
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceScatterMul.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ResourceScatterMul"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceScatterNdAdd.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceScatterNdAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2fd38f7be87241c77c213babd43ea91f0743d4e9
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceScatterNdAdd.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ResourceScatterNdAdd"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceScatterNdUpdate.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceScatterNdUpdate.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..732de5f1cc2de7389075f192fe94ba9f7a8e1406
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceScatterNdUpdate.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ResourceScatterNdUpdate"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceScatterSub.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceScatterSub.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..77081dda4d59a9bee17a69493dc85800f81bcdc3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceScatterSub.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ResourceScatterSub"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceScatterUpdate.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceScatterUpdate.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9c2cc0ec210c944220558ffb4c1a5bdcc255a446
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceScatterUpdate.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ResourceScatterUpdate"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyAdadelta.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyAdadelta.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c6e3ae2219c7a13f1ce07a0c0a68c9827dd50f5e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyAdadelta.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceSparseApplyAdadelta"
+  endpoint {
+    name: "train.ResourceSparseApplyAdadelta"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyAdagrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyAdagrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5be4d6199b55f8a75b8506d6491dcff2136f6143
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyAdagrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceSparseApplyAdagrad"
+  endpoint {
+    name: "train.ResourceSparseApplyAdagrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyAdagradDA.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyAdagradDA.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0547687d6408a1ab7caf1471415552ce269affc7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyAdagradDA.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceSparseApplyAdagradDA"
+  endpoint {
+    name: "train.ResourceSparseApplyAdagradDa"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyCenteredRMSProp.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyCenteredRMSProp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..632b0ab4c20c36f2f61a1d771de799d7d8ae5f92
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyCenteredRMSProp.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceSparseApplyCenteredRMSProp"
+  endpoint {
+    name: "train.ResourceSparseApplyCenteredRmsProp"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyFtrl.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyFtrl.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2e6fed94691a2490311c04e0d0d5a6bc0ab0d786
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyFtrl.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceSparseApplyFtrl"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyFtrlV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyFtrlV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cd126d78ab6ff277299b8a763a11e4d37c2e3904
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyFtrlV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceSparseApplyFtrlV2"
+  endpoint {
+    name: "train.ResourceSparseApplyFtrl"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyMomentum.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyMomentum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7e00039e01411cc473efb03bd2bbaebb8bf1ee14
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyMomentum.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceSparseApplyMomentum"
+  endpoint {
+    name: "train.ResourceSparseApplyMomentum"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyProximalAdagrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyProximalAdagrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..04fe8504e5dc019e467024a8e0a5146bb8b6cd46
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyProximalAdagrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceSparseApplyProximalAdagrad"
+  endpoint {
+    name: "train.ResourceSparseApplyProximalAdagrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyProximalGradientDescent.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyProximalGradientDescent.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..27df43c9c505c3ff25e6b1fcc08aee1efb71867d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyProximalGradientDescent.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceSparseApplyProximalGradientDescent"
+  endpoint {
+    name: "train.ResourceSparseApplyProximalGradientDescent"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyRMSProp.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyRMSProp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ec8910a88a725bb7a6b5327e9ebd44122d272ff4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceSparseApplyRMSProp.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ResourceSparseApplyRMSProp"
+  endpoint {
+    name: "train.ResourceSparseApplyRmsProp"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ResourceStridedSliceAssign.pbtxt b/tensorflow/core/api_def/java_api/api_def_ResourceStridedSliceAssign.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..83805389b98dec01432cb3aa184ec5c5db0ccec4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ResourceStridedSliceAssign.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ResourceStridedSliceAssign"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Restore.pbtxt b/tensorflow/core/api_def/java_api/api_def_Restore.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5e5b021b084b66f94e400544433455feb28d95e6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Restore.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Restore"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RestoreSlice.pbtxt b/tensorflow/core/api_def/java_api/api_def_RestoreSlice.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0728f5908b2cb9164df8e03a8218542278ac3261
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RestoreSlice.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RestoreSlice"
+  endpoint {
+    name: "train.RestoreSlice"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RestoreV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_RestoreV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..909968873f25553704dfac60cd6be8dd8ab8f5dc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RestoreV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RestoreV2"
+  endpoint {
+    name: "train.Restore"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Reverse.pbtxt b/tensorflow/core/api_def/java_api/api_def_Reverse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d2a199d2fcde0776e90f2760ecfe89e26f733d6c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Reverse.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Reverse"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReverseSequence.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReverseSequence.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..87638c0dcc99a3c1f4bc86caf00c61d551a64b50
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReverseSequence.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ReverseSequence"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ReverseV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_ReverseV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..71efbe1892e71f26cf8bf1f8f52a86d45851588f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ReverseV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ReverseV2"
+  endpoint {
+    name: "Reverse"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RightShift.pbtxt b/tensorflow/core/api_def/java_api/api_def_RightShift.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..68fab3e8cf87574a13436df6a96400e32168ff5c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RightShift.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RightShift"
+  endpoint {
+    name: "bitwise.RightShift"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Rint.pbtxt b/tensorflow/core/api_def/java_api/api_def_Rint.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..48fbcc7c346fe14302800cc3fe7fe78b325d9819
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Rint.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Rint"
+  endpoint {
+    name: "math.Rint"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Roll.pbtxt b/tensorflow/core/api_def/java_api/api_def_Roll.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..50f7915a65a91d5e02085c794d19d84976e54c4a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Roll.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Roll"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Round.pbtxt b/tensorflow/core/api_def/java_api/api_def_Round.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dd612a33d63407af961de2f564feaaebc61f6d3c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Round.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Round"
+  endpoint {
+    name: "math.Round"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Rpc.pbtxt b/tensorflow/core/api_def/java_api/api_def_Rpc.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0d1e2b90e6a7b08f546b67a2767bfa14e9b0e534
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Rpc.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Rpc"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Rsqrt.pbtxt b/tensorflow/core/api_def/java_api/api_def_Rsqrt.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..06b1b81ecd47413d456d1b3a38ef8f6ae5045821
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Rsqrt.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Rsqrt"
+  endpoint {
+    name: "math.Rsqrt"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_RsqrtGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_RsqrtGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..88073b6f25459375d01446352617f5528a1cce40
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_RsqrtGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "RsqrtGrad"
+  endpoint {
+    name: "math.RsqrtGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SampleDistortedBoundingBox.pbtxt b/tensorflow/core/api_def/java_api/api_def_SampleDistortedBoundingBox.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3dffd53b05918d96d574b1265d6eb45963bdb8f8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SampleDistortedBoundingBox.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SampleDistortedBoundingBox"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SampleDistortedBoundingBoxV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_SampleDistortedBoundingBoxV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..655731413788b10d93f35f49153c3db77df75ee3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SampleDistortedBoundingBoxV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SampleDistortedBoundingBoxV2"
+  endpoint {
+    name: "image.SampleDistortedBoundingBox"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Save.pbtxt b/tensorflow/core/api_def/java_api/api_def_Save.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..36d44001d5b081a2216df2f6912b238243551fed
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Save.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Save"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SaveSlices.pbtxt b/tensorflow/core/api_def/java_api/api_def_SaveSlices.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b33603568009a406e0628d5a6f2ba5592383d2d4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SaveSlices.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SaveSlices"
+  endpoint {
+    name: "train.SaveSlices"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SaveV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_SaveV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..644d1824aa15831139e2c245ae1ff517aca74f82
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SaveV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SaveV2"
+  endpoint {
+    name: "train.Save"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ScalarSummary.pbtxt b/tensorflow/core/api_def/java_api/api_def_ScalarSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c339ce0a7a55bd2d2b284260ffb3d6ef56e06046
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ScalarSummary.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ScalarSummary"
+  endpoint {
+    name: "summary.ScalarSummary"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ScanDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ScanDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..89b63c53f70d3092acd4da749f17d49ab793532b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ScanDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ScanDataset"
+  endpoint {
+    name: "data.ScanDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ScatterAdd.pbtxt b/tensorflow/core/api_def/java_api/api_def_ScatterAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..41c63dc0a4064cd9ad370e94ebd1c1a3b18ab43c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ScatterAdd.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ScatterAdd"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ScatterDiv.pbtxt b/tensorflow/core/api_def/java_api/api_def_ScatterDiv.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5754249eafcf294fb55b13e462db0310d2de3284
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ScatterDiv.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ScatterDiv"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ScatterMax.pbtxt b/tensorflow/core/api_def/java_api/api_def_ScatterMax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..aa6375cbd76f1c569d1ab5d9e0c1f6dcd8c7d200
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ScatterMax.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ScatterMax"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ScatterMin.pbtxt b/tensorflow/core/api_def/java_api/api_def_ScatterMin.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ea007120c36c942a69bc2689da67487d5a5f6367
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ScatterMin.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ScatterMin"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ScatterMul.pbtxt b/tensorflow/core/api_def/java_api/api_def_ScatterMul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f1d91258e4bdd247077b7278ff2ab7160c777406
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ScatterMul.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ScatterMul"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ScatterNd.pbtxt b/tensorflow/core/api_def/java_api/api_def_ScatterNd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8ef01b2fcacb9bed05eb38de1fa93013a1408cd3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ScatterNd.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ScatterNd"
+  endpoint {
+    name: "ScatterNd"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ScatterNdAdd.pbtxt b/tensorflow/core/api_def/java_api/api_def_ScatterNdAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bea152a9da5a1ce7959751c26c8c26b25f430ce4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ScatterNdAdd.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ScatterNdAdd"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ScatterNdNonAliasingAdd.pbtxt b/tensorflow/core/api_def/java_api/api_def_ScatterNdNonAliasingAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4dd756bfc0c3631a3bbb53d9ab041b61326dcd2f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ScatterNdNonAliasingAdd.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ScatterNdNonAliasingAdd"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ScatterNdSub.pbtxt b/tensorflow/core/api_def/java_api/api_def_ScatterNdSub.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..384e79d64ef908b1df0dd3117097d5b9181d64ce
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ScatterNdSub.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ScatterNdSub"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ScatterNdUpdate.pbtxt b/tensorflow/core/api_def/java_api/api_def_ScatterNdUpdate.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..92fce7f0ac3b91f77841ae9d53b99d5bfc2956e6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ScatterNdUpdate.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ScatterNdUpdate"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ScatterSub.pbtxt b/tensorflow/core/api_def/java_api/api_def_ScatterSub.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5baaa4f6045eeb9bd89a4ec66e7fcaff6ad8abc6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ScatterSub.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ScatterSub"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ScatterUpdate.pbtxt b/tensorflow/core/api_def/java_api/api_def_ScatterUpdate.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..83ac128ed60ed25b5f601950007ea581c19a1277
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ScatterUpdate.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ScatterUpdate"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SdcaFprint.pbtxt b/tensorflow/core/api_def/java_api/api_def_SdcaFprint.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ce179918cd07b727bfe7aab129edc82b8de9fc3a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SdcaFprint.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SdcaFprint"
+  endpoint {
+    name: "train.SdcaFprint"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SdcaOptimizer.pbtxt b/tensorflow/core/api_def/java_api/api_def_SdcaOptimizer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fab6393f602d6bf8831a8579ba101f917eb7daf5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SdcaOptimizer.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SdcaOptimizer"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SdcaOptimizerV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_SdcaOptimizerV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b72ee64e501aa0c550cf31082217b58d76aebc82
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SdcaOptimizerV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SdcaOptimizerV2"
+  endpoint {
+    name: "train.SdcaOptimizer"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SdcaShrinkL1.pbtxt b/tensorflow/core/api_def/java_api/api_def_SdcaShrinkL1.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..83993bcf1490cdd74aa278fe8548d5967e64c84c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SdcaShrinkL1.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SdcaShrinkL1"
+  endpoint {
+    name: "train.SdcaShrinkL1"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SegmentMax.pbtxt b/tensorflow/core/api_def/java_api/api_def_SegmentMax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6ac26c9e9e3964eeda25bd56de2d4e3bd2b634f3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SegmentMax.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SegmentMax"
+  endpoint {
+    name: "math.SegmentMax"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SegmentMean.pbtxt b/tensorflow/core/api_def/java_api/api_def_SegmentMean.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..982db87bf09f7fd5dc6066f8019cb3af21fde183
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SegmentMean.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SegmentMean"
+  endpoint {
+    name: "math.SegmentMean"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SegmentMin.pbtxt b/tensorflow/core/api_def/java_api/api_def_SegmentMin.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7a403b6c63d073e60250f0220aea982cb2596205
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SegmentMin.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SegmentMin"
+  endpoint {
+    name: "math.SegmentMin"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SegmentProd.pbtxt b/tensorflow/core/api_def/java_api/api_def_SegmentProd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1bf280edc4310fbf158be33e579b1a525e6152b4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SegmentProd.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SegmentProd"
+  endpoint {
+    name: "math.SegmentProd"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SegmentSum.pbtxt b/tensorflow/core/api_def/java_api/api_def_SegmentSum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3dcbc352253da3fa65ad488fa1c5db1db7dae98e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SegmentSum.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SegmentSum"
+  endpoint {
+    name: "math.SegmentSum"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Select.pbtxt b/tensorflow/core/api_def/java_api/api_def_Select.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..eeff79284c2ea2d4aeb2519b316d935922bf3e06
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Select.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Select"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SelfAdjointEig.pbtxt b/tensorflow/core/api_def/java_api/api_def_SelfAdjointEig.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dc25ae9de2513a136d7889953dbbd9239ff3393e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SelfAdjointEig.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SelfAdjointEig"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SelfAdjointEigV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_SelfAdjointEigV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c79f08ac32360f616e547285858fe7548b221774
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SelfAdjointEigV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SelfAdjointEigV2"
+  endpoint {
+    name: "linalg.SelfAdjointEig"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Selu.pbtxt b/tensorflow/core/api_def/java_api/api_def_Selu.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7002d5be1221d6d34ed7f1cace1f6d672a8974fc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Selu.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Selu"
+  endpoint {
+    name: "nn.Selu"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SeluGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_SeluGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b34e2f223934da532d6099a452506245304d5879
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SeluGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SeluGrad"
+  endpoint {
+    name: "nn.SeluGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SerializeIterator.pbtxt b/tensorflow/core/api_def/java_api/api_def_SerializeIterator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..37789c753b4762e699a7059211db51d0ea352668
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SerializeIterator.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SerializeIterator"
+  endpoint {
+    name: "data.SerializeIterator"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SerializeManySparse.pbtxt b/tensorflow/core/api_def/java_api/api_def_SerializeManySparse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..567a8e4b7f0f39a5cc3b52f170ec5504364b20e0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SerializeManySparse.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SerializeManySparse"
+  endpoint {
+    name: "io.SerializeManySparse"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SerializeSparse.pbtxt b/tensorflow/core/api_def/java_api/api_def_SerializeSparse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ad86c7095c366b68ac99098a90c8e3fe95cd13ae
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SerializeSparse.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SerializeSparse"
+  endpoint {
+    name: "io.SerializeSparse"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SerializeTensor.pbtxt b/tensorflow/core/api_def/java_api/api_def_SerializeTensor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..169120a0635b812d1f585d0d1b7fa33873817ce8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SerializeTensor.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SerializeTensor"
+  endpoint {
+    name: "io.SerializeTensor"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SetSize.pbtxt b/tensorflow/core/api_def/java_api/api_def_SetSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1c000e9c8aacdb7c9b76bc4cc0b7160759bf73fc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SetSize.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "SetSize"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SetStatsAggregatorDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_SetStatsAggregatorDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f57abe5a667bf4cffee537a3948ef69e2577fdde
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SetStatsAggregatorDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SetStatsAggregatorDataset"
+  endpoint {
+    name: "data.SetStatsAggregatorDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Shape.pbtxt b/tensorflow/core/api_def/java_api/api_def_Shape.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5074000b53745c40ab39a046e0ea05a47f148991
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Shape.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Shape"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ShapeN.pbtxt b/tensorflow/core/api_def/java_api/api_def_ShapeN.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b4bfb91118a30cefaf847c12452cdea89ff3be2b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ShapeN.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ShapeN"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ShardedFilename.pbtxt b/tensorflow/core/api_def/java_api/api_def_ShardedFilename.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8efd0afb8b3ce3b0b19929926be53e880d2282d7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ShardedFilename.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ShardedFilename"
+  endpoint {
+    name: "io.ShardedFilename"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ShardedFilespec.pbtxt b/tensorflow/core/api_def/java_api/api_def_ShardedFilespec.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e31cac1040c390ba30c9af34eef6ec28436a91d7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ShardedFilespec.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ShardedFilespec"
+  endpoint {
+    name: "io.ShardedFilespec"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ShuffleAndRepeatDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ShuffleAndRepeatDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..543d5a109c7fc602b5511aadcef81a107ccadc64
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ShuffleAndRepeatDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ShuffleAndRepeatDataset"
+  endpoint {
+    name: "data.ShuffleAndRepeatDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ShuffleDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ShuffleDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..36f4979c969a0863f4083c8c268383dd6de62602
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ShuffleDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ShuffleDataset"
+  endpoint {
+    name: "data.ShuffleDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Sigmoid.pbtxt b/tensorflow/core/api_def/java_api/api_def_Sigmoid.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b6e16a41104972d93738f09940ff8370032f6afe
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Sigmoid.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Sigmoid"
+  endpoint {
+    name: "math.Sigmoid"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SigmoidGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_SigmoidGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bb141abe3ba6e89b112dba88d0244cca3fbb46f0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SigmoidGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SigmoidGrad"
+  endpoint {
+    name: "math.SigmoidGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Sign.pbtxt b/tensorflow/core/api_def/java_api/api_def_Sign.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..435fb9e825d16f4e3eabbe4faef499b4b008e1bb
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Sign.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Sign"
+  endpoint {
+    name: "math.Sign"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Sin.pbtxt b/tensorflow/core/api_def/java_api/api_def_Sin.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2fc95755bac13de35ca248bd38f7a6e2c79e2e02
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Sin.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Sin"
+  endpoint {
+    name: "math.Sin"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Sinh.pbtxt b/tensorflow/core/api_def/java_api/api_def_Sinh.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f343685e80bf1cf683991f67a13a7c4f91dad831
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Sinh.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Sinh"
+  endpoint {
+    name: "math.Sinh"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Size.pbtxt b/tensorflow/core/api_def/java_api/api_def_Size.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a41cddd8ac7aadfc2b11599305fa0b921e5ca1a6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Size.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Size"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SkipDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_SkipDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..39bce67a3e445e6b656b8f0979d8533a8d9cf53b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SkipDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SkipDataset"
+  endpoint {
+    name: "data.SkipDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Skipgram.pbtxt b/tensorflow/core/api_def/java_api/api_def_Skipgram.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d095c7b61b9c772cd2ac09c8333b15077f4ef78e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Skipgram.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Skipgram"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Slice.pbtxt b/tensorflow/core/api_def/java_api/api_def_Slice.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..adfe6fa4fd90657eb1d06ff285b07d0d81651c82
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Slice.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Slice"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SlideDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_SlideDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bc284c2833a6cc502d12155e0ce9ca09fef120cb
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SlideDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SlideDataset"
+  endpoint {
+    name: "data.SlideDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Snapshot.pbtxt b/tensorflow/core/api_def/java_api/api_def_Snapshot.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6e49c1a5431b7ea49037bc97a9f2190ea425c013
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Snapshot.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Snapshot"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Softmax.pbtxt b/tensorflow/core/api_def/java_api/api_def_Softmax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cb27a04d217da850d99923ce4fa3a8d04f20c25a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Softmax.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Softmax"
+  endpoint {
+    name: "nn.Softmax"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SoftmaxCrossEntropyWithLogits.pbtxt b/tensorflow/core/api_def/java_api/api_def_SoftmaxCrossEntropyWithLogits.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e064562c0f25e4f63353a36d08206b25eb0d4d08
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SoftmaxCrossEntropyWithLogits.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SoftmaxCrossEntropyWithLogits"
+  endpoint {
+    name: "nn.SoftmaxCrossEntropyWithLogits"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Softplus.pbtxt b/tensorflow/core/api_def/java_api/api_def_Softplus.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..671656c28736a0376a89cf1ed5c1b29edd646fc0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Softplus.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Softplus"
+  endpoint {
+    name: "math.Softplus"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SoftplusGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_SoftplusGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d1b074a9b7a2e6bee8dcbae3f5eed3610a753e80
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SoftplusGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SoftplusGrad"
+  endpoint {
+    name: "math.SoftplusGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Softsign.pbtxt b/tensorflow/core/api_def/java_api/api_def_Softsign.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..23aa1e3c58bce96423ed388b059b1bd66c8135ec
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Softsign.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Softsign"
+  endpoint {
+    name: "nn.Softsign"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SoftsignGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_SoftsignGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..73faf74511fbe36a045c653c01e4ee0e5b811186
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SoftsignGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SoftsignGrad"
+  endpoint {
+    name: "nn.SoftsignGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SpaceToBatch.pbtxt b/tensorflow/core/api_def/java_api/api_def_SpaceToBatch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4fe7232e544111b861ec9b36afc7cd369ca35903
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SpaceToBatch.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SpaceToBatch"
+  endpoint {
+    name: "nn.SpaceToBatch"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SpaceToBatchND.pbtxt b/tensorflow/core/api_def/java_api/api_def_SpaceToBatchND.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6612b48286c065fd933687b2e9c09162aad4f231
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SpaceToBatchND.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SpaceToBatchND"
+  endpoint {
+    name: "SpaceToBatchNd"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SpaceToDepth.pbtxt b/tensorflow/core/api_def/java_api/api_def_SpaceToDepth.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cb421c75db0d3eb0d6d0a9168cc8b772f6c1588a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SpaceToDepth.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SpaceToDepth"
+  endpoint {
+    name: "nn.SpaceToDepth"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseAccumulatorApplyGradient.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseAccumulatorApplyGradient.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cdb0b14b6711badf9f0a304dc0411eadc1f59611
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseAccumulatorApplyGradient.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseAccumulatorApplyGradient"
+  endpoint {
+    name: "sparse.SparseAccumulatorApplyGradient"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseAccumulatorTakeGradient.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseAccumulatorTakeGradient.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8fc1e70959691bf7b521da47173cf75d9592521b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseAccumulatorTakeGradient.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseAccumulatorTakeGradient"
+  endpoint {
+    name: "sparse.SparseAccumulatorTakeGradient"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseAdd.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0761f2ed1623643bba0e5557f566dd96534f7962
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseAdd.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseAdd"
+  endpoint {
+    name: "sparse.SparseAdd"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseAddGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseAddGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6529c46a17edec341a694f74d41c6000d732cf40
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseAddGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseAddGrad"
+  endpoint {
+    name: "sparse.SparseAddGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseApplyAdadelta.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseApplyAdadelta.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7122f210a432b31ee54b603f12450de181472288
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseApplyAdadelta.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseApplyAdadelta"
+  endpoint {
+    name: "train.SparseApplyAdadelta"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseApplyAdagrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseApplyAdagrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..184a8cfb2fb98053189b611ed7688e30ba9f4f3d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseApplyAdagrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseApplyAdagrad"
+  endpoint {
+    name: "train.SparseApplyAdagrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseApplyAdagradDA.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseApplyAdagradDA.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..950dc00dd3578b8c9128f3f4f1e208bdee339713
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseApplyAdagradDA.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseApplyAdagradDA"
+  endpoint {
+    name: "train.SparseApplyAdagradDa"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseApplyCenteredRMSProp.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseApplyCenteredRMSProp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..090536f5eb9fc447e22ef4f8971446cb0ca99b7d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseApplyCenteredRMSProp.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseApplyCenteredRMSProp"
+  endpoint {
+    name: "train.SparseApplyCenteredRmsProp"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseApplyFtrl.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseApplyFtrl.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e961fb7f6b1922911954abd39033626feeace80f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseApplyFtrl.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SparseApplyFtrl"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseApplyFtrlV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseApplyFtrlV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..43b9833451a18e99b3c864453e1e777b0e7ee48f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseApplyFtrlV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseApplyFtrlV2"
+  endpoint {
+    name: "train.SparseApplyFtrl"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseApplyMomentum.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseApplyMomentum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f7e79c5e7e43cbf594bfe94405f0239ad39343db
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseApplyMomentum.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseApplyMomentum"
+  endpoint {
+    name: "train.SparseApplyMomentum"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseApplyProximalAdagrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseApplyProximalAdagrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8ac6cf771cb608ba6f96b67374f439dc7cfd9e7b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseApplyProximalAdagrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseApplyProximalAdagrad"
+  endpoint {
+    name: "train.SparseApplyProximalAdagrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseApplyProximalGradientDescent.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseApplyProximalGradientDescent.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..926ed2c1d4d4c7dcbc8d8c4f51b33ec5a410f389
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseApplyProximalGradientDescent.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseApplyProximalGradientDescent"
+  endpoint {
+    name: "train.SparseApplyProximalGradientDescent"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseApplyRMSProp.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseApplyRMSProp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3e39f4ffa58dc1fafc6eddc30479275a1a6f4e52
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseApplyRMSProp.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseApplyRMSProp"
+  endpoint {
+    name: "train.SparseApplyRmsProp"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseConcat.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseConcat.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8ceb600a42c45fed650d09d82570d92bec7a0e93
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseConcat.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseConcat"
+  endpoint {
+    name: "sparse.SparseConcat"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseConditionalAccumulator.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseConditionalAccumulator.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3dc2c1ea8a00778af4a440a33015b7cb7c4cce08
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseConditionalAccumulator.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseConditionalAccumulator"
+  endpoint {
+    name: "sparse.SparseConditionalAccumulator"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseCross.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseCross.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..130f333d35bfc0a4ba4d430217e1682d0e2794ec
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseCross.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseCross"
+  endpoint {
+    name: "sparse.SparseCross"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseDenseCwiseAdd.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseDenseCwiseAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..013b7eede948c672379e163a58b805614262df87
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseDenseCwiseAdd.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseDenseCwiseAdd"
+  endpoint {
+    name: "sparse.SparseDenseCwiseAdd"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseDenseCwiseDiv.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseDenseCwiseDiv.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8cf56d7b41897a85cd7a0a2f94c3b753b8335f48
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseDenseCwiseDiv.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseDenseCwiseDiv"
+  endpoint {
+    name: "sparse.SparseDenseCwiseDiv"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseDenseCwiseMul.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseDenseCwiseMul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..37c50f232767cb6a3c9dbe3bb7953da3a63a64e5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseDenseCwiseMul.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseDenseCwiseMul"
+  endpoint {
+    name: "sparse.SparseDenseCwiseMul"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseFillEmptyRows.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseFillEmptyRows.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3adddbd34bffeb005f9c69b75c0a48b8e25413d5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseFillEmptyRows.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseFillEmptyRows"
+  endpoint {
+    name: "sparse.SparseFillEmptyRows"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseFillEmptyRowsGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseFillEmptyRowsGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..708069d028dcfa31183bf2c45cda5d7dc1762aa5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseFillEmptyRowsGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseFillEmptyRowsGrad"
+  endpoint {
+    name: "sparse.SparseFillEmptyRowsGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseMatMul.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseMatMul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..445d53b02364de43e5191cd6e2753214aa0bbb5f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseMatMul.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseMatMul"
+  endpoint {
+    name: "sparse.SparseMatMul"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseReduceMax.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseReduceMax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a7467b9b474415da3ae4c6aaaa924984b0a066a7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseReduceMax.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseReduceMax"
+  endpoint {
+    name: "sparse.SparseReduceMax"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseReduceMaxSparse.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseReduceMaxSparse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..987e98467eda2adb3ff4729acfcf35f0136abbd4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseReduceMaxSparse.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseReduceMaxSparse"
+  endpoint {
+    name: "sparse.SparseReduceMaxSparse"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseReduceSum.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseReduceSum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..739fb5fb952c23970ab9c40ae20062682fffba34
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseReduceSum.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseReduceSum"
+  endpoint {
+    name: "sparse.SparseReduceSum"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseReduceSumSparse.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseReduceSumSparse.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..64d6d45f1a2bf009fce79e62641d87f9f60122f1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseReduceSumSparse.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseReduceSumSparse"
+  endpoint {
+    name: "sparse.SparseReduceSumSparse"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseReorder.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseReorder.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..202066e76f502c8b94d2cdfd55dcc25ab6f8a6d1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseReorder.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseReorder"
+  endpoint {
+    name: "sparse.SparseReorder"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseReshape.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseReshape.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0a393a6105f751df49a231182f9f86b515502e1e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseReshape.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseReshape"
+  endpoint {
+    name: "sparse.SparseReshape"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseSegmentMean.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseSegmentMean.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9187dbed670b534721163f4387f4ff1d671f2b74
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseSegmentMean.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseSegmentMean"
+  endpoint {
+    name: "sparse.SparseSegmentMean"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseSegmentMeanGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseSegmentMeanGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1b7d5bbcf0c7fa9c3771ab03a6e5b0d549a4362f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseSegmentMeanGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseSegmentMeanGrad"
+  endpoint {
+    name: "sparse.SparseSegmentMeanGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bb3ac2256d121319355b798390c129c29c1b6144
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseSegmentMeanWithNumSegments.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseSegmentMeanWithNumSegments"
+  endpoint {
+    name: "sparse.SparseSegmentMeanWithNumSegments"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseSegmentSqrtN.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseSegmentSqrtN.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2cecff503ff5534c1367f9c67d9598f7df7d92c2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseSegmentSqrtN.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseSegmentSqrtN"
+  endpoint {
+    name: "sparse.SparseSegmentSqrtN"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseSegmentSqrtNGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseSegmentSqrtNGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e6973eb773a1fdfe1f2aa33c659ed4b8997fe4fe
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseSegmentSqrtNGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseSegmentSqrtNGrad"
+  endpoint {
+    name: "sparse.SparseSegmentSqrtNGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..78aed85f0a96f9ae941bd32afec406b3e0f17b58
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseSegmentSqrtNWithNumSegments.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseSegmentSqrtNWithNumSegments"
+  endpoint {
+    name: "sparse.SparseSegmentSqrtNWithNumSegments"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseSegmentSum.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseSegmentSum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f18f35166700173e20a5d4af3326d395a589f680
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseSegmentSum.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseSegmentSum"
+  endpoint {
+    name: "sparse.SparseSegmentSum"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseSegmentSumWithNumSegments.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseSegmentSumWithNumSegments.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8e384857cc76b0f79cdbca3714a6de88def13355
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseSegmentSumWithNumSegments.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseSegmentSumWithNumSegments"
+  endpoint {
+    name: "sparse.SparseSegmentSumWithNumSegments"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseSlice.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseSlice.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1d794df9252472b618e92e3763b70e01364e5281
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseSlice.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseSlice"
+  endpoint {
+    name: "sparse.SparseSlice"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseSliceGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseSliceGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..979326c0fc78effddb67aa9a726b8dd174fb6fdb
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseSliceGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseSliceGrad"
+  endpoint {
+    name: "sparse.SparseSliceGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseSoftmax.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseSoftmax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..56c96640cb447f7b2956acb581c618275f1fb025
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseSoftmax.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseSoftmax"
+  endpoint {
+    name: "sparse.SparseSoftmax"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseSoftmaxCrossEntropyWithLogits.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseSoftmaxCrossEntropyWithLogits.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7627d5f6074cc919e5b325179412bc38d1bd2159
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseSoftmaxCrossEntropyWithLogits.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseSoftmaxCrossEntropyWithLogits"
+  endpoint {
+    name: "nn.SparseSoftmaxCrossEntropyWithLogits"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseSparseMaximum.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseSparseMaximum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..314309621389a7cd4004c9ec37144d331c6728c9
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseSparseMaximum.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseSparseMaximum"
+  endpoint {
+    name: "sparse.SparseSparseMaximum"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseSparseMinimum.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseSparseMinimum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fc04bb4fed527b8d6543ea489396fb514f3a28ff
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseSparseMinimum.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseSparseMinimum"
+  endpoint {
+    name: "sparse.SparseSparseMinimum"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseSplit.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseSplit.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0afc95199aa602dc1103fa02d1a0e586b78b08e1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseSplit.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseSplit"
+  endpoint {
+    name: "sparse.SparseSplit"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseTensorDenseAdd.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseTensorDenseAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fc5c882f8342d7f0fc1b12539c89c44631da2b6b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseTensorDenseAdd.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseTensorDenseAdd"
+  endpoint {
+    name: "sparse.SparseTensorDenseAdd"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseTensorDenseMatMul.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseTensorDenseMatMul.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cd1f1e09e948aa94aac5650e3ebd847455449e99
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseTensorDenseMatMul.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseTensorDenseMatMul"
+  endpoint {
+    name: "sparse.SparseTensorDenseMatMul"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseTensorSliceDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseTensorSliceDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bb0d1d7a949617eda6bf23be393d31f6bcc6e343
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseTensorSliceDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseTensorSliceDataset"
+  endpoint {
+    name: "data.SparseTensorSliceDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseToDense.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseToDense.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..68df155e4619dd45f1681130db20c4e5a8cc0874
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseToDense.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseToDense"
+  endpoint {
+    name: "sparse.SparseToDense"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SparseToSparseSetOperation.pbtxt b/tensorflow/core/api_def/java_api/api_def_SparseToSparseSetOperation.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fb04366feadde65034765f5b47458b17def23cd2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SparseToSparseSetOperation.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SparseToSparseSetOperation"
+  endpoint {
+    name: "sparse.SparseToSparseSetOperation"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Split.pbtxt b/tensorflow/core/api_def/java_api/api_def_Split.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ffb7b52e091a35a0f8109d99c497208aa9774f24
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Split.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Split"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SplitV.pbtxt b/tensorflow/core/api_def/java_api/api_def_SplitV.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..94f4a08d70df26359e2aa32806ecb1f8f933be3a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SplitV.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "SplitV"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SqlDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_SqlDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8764e81af254e977d660b36a51a28e434979d83a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SqlDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SqlDataset"
+  endpoint {
+    name: "data.SqlDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Sqrt.pbtxt b/tensorflow/core/api_def/java_api/api_def_Sqrt.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..daa2a1ee86a6ec1a6de09886bb54b55f55b6dec5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Sqrt.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Sqrt"
+  endpoint {
+    name: "math.Sqrt"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SqrtGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_SqrtGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8eca1b89b31b4fac524edbe17f777fdd85824fee
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SqrtGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SqrtGrad"
+  endpoint {
+    name: "math.SqrtGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Square.pbtxt b/tensorflow/core/api_def/java_api/api_def_Square.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..561e10e020b9264176156be36f05c7b48deb0d7a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Square.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Square"
+  endpoint {
+    name: "math.Square"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SquaredDifference.pbtxt b/tensorflow/core/api_def/java_api/api_def_SquaredDifference.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..752dbcce7b81485ef6a6b5ed79e86a91999cdbaf
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SquaredDifference.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SquaredDifference"
+  endpoint {
+    name: "math.SquaredDifference"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Squeeze.pbtxt b/tensorflow/core/api_def/java_api/api_def_Squeeze.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..264c38460364d80e035b9c0347af67196d8ca00d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Squeeze.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Squeeze"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Stack.pbtxt b/tensorflow/core/api_def/java_api/api_def_Stack.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8370beee630ace617651a4dafb56fc9e68998280
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Stack.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Stack"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StackClose.pbtxt b/tensorflow/core/api_def/java_api/api_def_StackClose.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ac3c410c1cc90739d4b657c6f6ffe66ac2e5a115
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StackClose.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "StackClose"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StackCloseV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_StackCloseV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..28aff9e191371f024b0c8e97336c3d14487dcd0f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StackCloseV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StackCloseV2"
+  endpoint {
+    name: "StackClose"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StackPop.pbtxt b/tensorflow/core/api_def/java_api/api_def_StackPop.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b8658ecbad4fb889968d19a02a52ed18d32fa6c0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StackPop.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "StackPop"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StackPopV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_StackPopV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d2ecf4e5a8782c0a39895020887232a6027bef41
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StackPopV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StackPopV2"
+  endpoint {
+    name: "StackPop"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StackPush.pbtxt b/tensorflow/core/api_def/java_api/api_def_StackPush.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d08fa27b2104f150227281ca3a76fbc2ce9ff001
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StackPush.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "StackPush"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StackPushV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_StackPushV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..519fd6c6b28b2b199a9cfc4bbfc9e937a24c7ba7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StackPushV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StackPushV2"
+  endpoint {
+    name: "StackPush"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StackV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_StackV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..725e469a031e6077b0c907b574fdca9607bc57b1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StackV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StackV2"
+  endpoint {
+    name: "Stack"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Stage.pbtxt b/tensorflow/core/api_def/java_api/api_def_Stage.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..87e0c7d9811f01aef0e35973512d53ea320c11d8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Stage.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Stage"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StageClear.pbtxt b/tensorflow/core/api_def/java_api/api_def_StageClear.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..26890a55b39827dad13ffdf701cf78c62c0a8f90
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StageClear.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "StageClear"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StagePeek.pbtxt b/tensorflow/core/api_def/java_api/api_def_StagePeek.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7c3ed3dc91c51d9a5c16d3ac3780310b4c9cdc8c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StagePeek.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "StagePeek"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StageSize.pbtxt b/tensorflow/core/api_def/java_api/api_def_StageSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d8188c3e0b30e20d734f8273b6b8cfb9c52e30df
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StageSize.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "StageSize"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StatefulPartitionedCall.pbtxt b/tensorflow/core/api_def/java_api/api_def_StatefulPartitionedCall.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2e6decf19adcc07a3b6d26bfef92af0909206432
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StatefulPartitionedCall.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "StatefulPartitionedCall"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StatelessIf.pbtxt b/tensorflow/core/api_def/java_api/api_def_StatelessIf.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..37c7b9a9629fb8353c1d6c8b58d35d44b73f717c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StatelessIf.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "StatelessIf"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StatelessMultinomial.pbtxt b/tensorflow/core/api_def/java_api/api_def_StatelessMultinomial.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8cfbbfb2c256278b2af37c2083fea371343097fa
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StatelessMultinomial.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StatelessMultinomial"
+  endpoint {
+    name: "random.StatelessMultinomial"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StatelessRandomNormal.pbtxt b/tensorflow/core/api_def/java_api/api_def_StatelessRandomNormal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4e648cfa691ae0aeda8ed7d5d3a6692b15c40f6e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StatelessRandomNormal.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StatelessRandomNormal"
+  endpoint {
+    name: "random.StatelessRandomNormal"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StatelessRandomUniform.pbtxt b/tensorflow/core/api_def/java_api/api_def_StatelessRandomUniform.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6f8f328b41f3fda2e6a1394a15a62a35b112db69
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StatelessRandomUniform.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StatelessRandomUniform"
+  endpoint {
+    name: "random.StatelessRandomUniform"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StatelessRandomUniformInt.pbtxt b/tensorflow/core/api_def/java_api/api_def_StatelessRandomUniformInt.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2fec4a7cc622f76209b6da7c6170889b5cf4615d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StatelessRandomUniformInt.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StatelessRandomUniformInt"
+  endpoint {
+    name: "random.StatelessRandomUniformInt"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StatelessTruncatedNormal.pbtxt b/tensorflow/core/api_def/java_api/api_def_StatelessTruncatedNormal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c05071dd737de6cdfc5e8f818250fdbad9169540
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StatelessTruncatedNormal.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StatelessTruncatedNormal"
+  endpoint {
+    name: "random.StatelessTruncatedNormal"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StatelessWhile.pbtxt b/tensorflow/core/api_def/java_api/api_def_StatelessWhile.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1a4252c90ae791719eac3903ad7313e3dd472ac6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StatelessWhile.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "StatelessWhile"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StaticRegexFullMatch.pbtxt b/tensorflow/core/api_def/java_api/api_def_StaticRegexFullMatch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..66b841ad74b26af5ec51653df35e68729d3aba31
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StaticRegexFullMatch.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StaticRegexFullMatch"
+  endpoint {
+    name: "strings.StaticRegexFullMatch"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StaticRegexReplace.pbtxt b/tensorflow/core/api_def/java_api/api_def_StaticRegexReplace.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..305bc8a3caf84c9f92636c04290ac61dd98c3799
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StaticRegexReplace.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StaticRegexReplace"
+  endpoint {
+    name: "strings.StaticRegexReplace"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StatsAggregatorHandle.pbtxt b/tensorflow/core/api_def/java_api/api_def_StatsAggregatorHandle.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..301dc982e3266d0fc2fb6f97ed90e3276aff4f7b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StatsAggregatorHandle.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StatsAggregatorHandle"
+  endpoint {
+    name: "data.StatsAggregatorHandle"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StatsAggregatorSummary.pbtxt b/tensorflow/core/api_def/java_api/api_def_StatsAggregatorSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f83c8f5a4f66c639045813897d19b2b5b52a2457
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StatsAggregatorSummary.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StatsAggregatorSummary"
+  endpoint {
+    name: "summary.StatsAggregatorSummary"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StopGradient.pbtxt b/tensorflow/core/api_def/java_api/api_def_StopGradient.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ad8473e8161e97652889e7bc6749b6837a5b2419
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StopGradient.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "StopGradient"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StridedSlice.pbtxt b/tensorflow/core/api_def/java_api/api_def_StridedSlice.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b0c301dab134e0cde6389fff5bbaee96649ea0bc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StridedSlice.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "StridedSlice"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StridedSliceAssign.pbtxt b/tensorflow/core/api_def/java_api/api_def_StridedSliceAssign.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6850dc2d1b2c8120b70fdbbbca2ca3cc9eb5a423
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StridedSliceAssign.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "StridedSliceAssign"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StridedSliceGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_StridedSliceGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b03204bcc4d2e7e84e98bd54105dcfa465a883a1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StridedSliceGrad.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "StridedSliceGrad"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StringFormat.pbtxt b/tensorflow/core/api_def/java_api/api_def_StringFormat.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cdd03139966e6fc662093cc55989dfb83a250aa2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StringFormat.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StringFormat"
+  endpoint {
+    name: "strings.StringFormat"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StringJoin.pbtxt b/tensorflow/core/api_def/java_api/api_def_StringJoin.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b43ff157cd529481c2c3d634e0445492412bc477
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StringJoin.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StringJoin"
+  endpoint {
+    name: "strings.Join"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StringLength.pbtxt b/tensorflow/core/api_def/java_api/api_def_StringLength.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c8eb48cc3c720ee057647443b58ed79c38996d09
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StringLength.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StringLength"
+  endpoint {
+    name: "strings.StringLength"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StringSplit.pbtxt b/tensorflow/core/api_def/java_api/api_def_StringSplit.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0e6d1851df16d31df70bedd52f8a2d0861637e85
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StringSplit.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "StringSplit"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StringSplitV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_StringSplitV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..18c71d6bd7c90de2c93f185afd00fe1685f74709
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StringSplitV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StringSplitV2"
+  endpoint {
+    name: "strings.StringSplit"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StringStrip.pbtxt b/tensorflow/core/api_def/java_api/api_def_StringStrip.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..01691211ff6e2b4df5117bf0eb388865d92abc36
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StringStrip.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StringStrip"
+  endpoint {
+    name: "strings.Strip"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StringToHashBucket.pbtxt b/tensorflow/core/api_def/java_api/api_def_StringToHashBucket.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..765517578d0ff7d3212055b91e08d90afee92671
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StringToHashBucket.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StringToHashBucket"
+  endpoint {
+    name: "strings.ToHashBucket"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StringToHashBucketFast.pbtxt b/tensorflow/core/api_def/java_api/api_def_StringToHashBucketFast.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..de08bc2d36ff4d7fe15ccbb8dc55389a5835261a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StringToHashBucketFast.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StringToHashBucketFast"
+  endpoint {
+    name: "strings.ToHashBucketFast"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StringToHashBucketStrong.pbtxt b/tensorflow/core/api_def/java_api/api_def_StringToHashBucketStrong.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..15b9138238a64a8594c027e21a38cc9731750a37
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StringToHashBucketStrong.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StringToHashBucketStrong"
+  endpoint {
+    name: "strings.ToHashBucketStrong"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_StringToNumber.pbtxt b/tensorflow/core/api_def/java_api/api_def_StringToNumber.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..196f694da5cf2993dd1b420f86aad5b9a26c3251
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_StringToNumber.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "StringToNumber"
+  endpoint {
+    name: "strings.ToNumber"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Sub.pbtxt b/tensorflow/core/api_def/java_api/api_def_Sub.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e79c6a0036bb17bb9e38f5edf0baa5cfb8c1f7d5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Sub.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Sub"
+  endpoint {
+    name: "math.Sub"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Substr.pbtxt b/tensorflow/core/api_def/java_api/api_def_Substr.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..78f34550a5b98608ef09e2d18c769d078c5feb96
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Substr.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Substr"
+  endpoint {
+    name: "strings.Substr"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Sum.pbtxt b/tensorflow/core/api_def/java_api/api_def_Sum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3999fa6ed12982a23fa37e9af1709f1c80a66e37
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Sum.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Sum"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_SummaryWriter.pbtxt b/tensorflow/core/api_def/java_api/api_def_SummaryWriter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8338c0fa1889e8cdcb82bbe4fa1c165485b38215
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SummaryWriter.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SummaryWriter"
+  endpoint {
+    name: "summary.SummaryWriter"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Svd.pbtxt b/tensorflow/core/api_def/java_api/api_def_Svd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..889d41628187892ed86bd394e46dda21e567cbdd
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Svd.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Svd"
+  endpoint {
+    name: "linalg.Svd"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Switch.pbtxt b/tensorflow/core/api_def/java_api/api_def_Switch.pbtxt
index 0d3362a91e151093292ba6a30fd1554b6f3fba11..edd9255452dd97c0c7107d98063e13a382430da7 100644
--- a/tensorflow/core/api_def/java_api/api_def_Switch.pbtxt
+++ b/tensorflow/core/api_def/java_api/api_def_Switch.pbtxt
@@ -1,4 +1,6 @@
 op {
-  graph_op_name: "Switch" #TODO(karllessard) escape that reserved name
-  visibility: HIDDEN
+  graph_op_name: "Switch"
+  endpoint {
+    name: "SwitchCond"
+  }
 }
diff --git a/tensorflow/core/api_def/java_api/api_def_SymbolicGradient.pbtxt b/tensorflow/core/api_def/java_api/api_def_SymbolicGradient.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6c6e68ae7408d6c043ed38cac4f1400c71b048ae
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_SymbolicGradient.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "SymbolicGradient"
+  endpoint {
+    name: "train.SymbolicGradient"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TFRecordDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_TFRecordDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f1d42edd63d757a3d84b7160223192314631ca63
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TFRecordDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TFRecordDataset"
+  endpoint {
+    name: "data.TfRecordDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TFRecordReader.pbtxt b/tensorflow/core/api_def/java_api/api_def_TFRecordReader.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9ffbeba0ec979524d588f0f92632b145fd01fa13
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TFRecordReader.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TFRecordReader"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TFRecordReaderV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_TFRecordReaderV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7d252e4942bab615ee3b80cfb4e03c947de52b0d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TFRecordReaderV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TFRecordReaderV2"
+  endpoint {
+    name: "io.TfRecordReader"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TakeDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_TakeDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2b0c0544fbc29ff63bd364206817ce1584aea6ac
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TakeDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TakeDataset"
+  endpoint {
+    name: "data.TakeDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TakeManySparseFromTensorsMap.pbtxt b/tensorflow/core/api_def/java_api/api_def_TakeManySparseFromTensorsMap.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..620e9fb0120eb5e06672022fa1a18661ddb28f9f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TakeManySparseFromTensorsMap.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TakeManySparseFromTensorsMap"
+  endpoint {
+    name: "sparse.TakeManySparseFromTensorsMap"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Tan.pbtxt b/tensorflow/core/api_def/java_api/api_def_Tan.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..560ca546b765d6aeef71da8a7aed031ff3dc59d8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Tan.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Tan"
+  endpoint {
+    name: "math.Tan"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Tanh.pbtxt b/tensorflow/core/api_def/java_api/api_def_Tanh.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..90e441808f9de9d18e98cd6fcdf2012204f3b61c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Tanh.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Tanh"
+  endpoint {
+    name: "math.Tanh"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TanhGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_TanhGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c0837d04874dc57246813993d4a9722a64c9a4e0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TanhGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TanhGrad"
+  endpoint {
+    name: "math.TanhGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TemporaryVariable.pbtxt b/tensorflow/core/api_def/java_api/api_def_TemporaryVariable.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8ce760f9c94f66f4f4bdfeac0ceaab9f3f04d3bb
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TemporaryVariable.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TemporaryVariable"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArray.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArray.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e315486af2ab109b011349ffa9d44b7b5cfde945
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArray.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArray"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayClose.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayClose.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..951ace8005b8b17cf1b02c15834049b3c2226566
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayClose.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayClose"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayCloseV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayCloseV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6fd2d02592ee647ea9b07c7d3ce8f59df14e72ce
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayCloseV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayCloseV2"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayCloseV3.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayCloseV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4a065779790d682b4e5e7db020e8f1110a3a6d28
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayCloseV3.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TensorArrayCloseV3"
+  endpoint {
+    name: "TensorArrayClose"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayConcat.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayConcat.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f5071110939f85c0887814462ed1c39e27d0b132
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayConcat.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayConcat"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayConcatV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayConcatV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e92cebf2a794a4b6a72be7bf8180b0a224c4579d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayConcatV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayConcatV2"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayConcatV3.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayConcatV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..34d09c901a07d3133723d0cdc8e7206adaddb2a7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayConcatV3.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TensorArrayConcatV3"
+  endpoint {
+    name: "TensorArrayConcat"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayGather.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayGather.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..95866b9778e731b47c1dd101130261fe31e091e2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayGather.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayGather"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayGatherV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayGatherV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f75b50c6670fcbc9db7b773e804b37b7be35f470
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayGatherV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayGatherV2"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayGatherV3.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayGatherV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b792ee988267c1f28c484cacba7600e3cea4a8a3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayGatherV3.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TensorArrayGatherV3"
+  endpoint {
+    name: "TensorArrayGather"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..beb9b5ca12d754b1d4d064d36b05c8d8f623f9cc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayGrad.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayGrad"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayGradV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayGradV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..41d25a491015bcef366bb0b90039909601724de0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayGradV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayGradV2"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayGradV3.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayGradV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2c9adebd044c921ce437df32ae42649c7aadc2db
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayGradV3.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TensorArrayGradV3"
+  endpoint {
+    name: "TensorArrayGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayGradWithShape.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayGradWithShape.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d6804fb697bcfea87fbeb8b78914db19603c32cb
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayGradWithShape.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorArrayGradWithShape"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayPack.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayPack.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..030950b06fcbf2fd5e0c2eed99e154640e0adbec
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayPack.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorArrayPack"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayRead.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayRead.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..72704746a5c94077640156107d48f9f1bf30b79d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayRead.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayRead"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayReadV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayReadV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..43cd0a2b7867455348183b5c375643e44cd5da11
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayReadV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayReadV2"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayReadV3.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayReadV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e6d38d64df94575e015c96501ad2c44c0dc6bce3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayReadV3.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TensorArrayReadV3"
+  endpoint {
+    name: "TensorArrayRead"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayScatter.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayScatter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..76092a45ed2d9b76b68e964d15372a1d45974cb4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayScatter.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayScatter"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayScatterV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayScatterV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7dba0fab4cd0b7337da7cd136dfbebd1b2d4ac2c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayScatterV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayScatterV2"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayScatterV3.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayScatterV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..179c9611f5854171ef5427c2e2db67ea18e6ae5e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayScatterV3.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TensorArrayScatterV3"
+  endpoint {
+    name: "TensorArrayScatter"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArraySize.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArraySize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fb2be098c67c53ce43acfa6bc11f05b0babfa037
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArraySize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArraySize"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArraySizeV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArraySizeV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8e8e44cfe2969e81a6f130578475db23d83f47f7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArraySizeV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArraySizeV2"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArraySizeV3.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArraySizeV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2df9a2d3f135038d4f61eeaae61681b19adb730f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArraySizeV3.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TensorArraySizeV3"
+  endpoint {
+    name: "TensorArraySize"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArraySplit.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArraySplit.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..105031eb983d2f681c6c7dd12b557e4a509f805e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArraySplit.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArraySplit"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArraySplitV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArraySplitV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ef5d88832aab642546c5df9523e33a4643a2dc03
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArraySplitV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArraySplitV2"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArraySplitV3.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArraySplitV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..721af074d0d538a95e5dbb95a3071b9e8b2f3ddc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArraySplitV3.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TensorArraySplitV3"
+  endpoint {
+    name: "TensorArraySplit"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayUnpack.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayUnpack.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a9011de23ea12eee3de3f3ba83ff86907c6c967b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayUnpack.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorArrayUnpack"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..43a441a07101408b31362e302dedadb8d5585ffd
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayV2"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayV3.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2b87617a1f1712ffe9bbabe6aed81e4e5a0abbee
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayV3.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TensorArrayV3"
+  endpoint {
+    name: "TensorArray"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayWrite.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayWrite.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2462dae80d96a1c90d85ca48a7dbbe528c3e996a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayWrite.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayWrite"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayWriteV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayWriteV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9f670ae18162956a19edf6708072ca02c716adb8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayWriteV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayWriteV2"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorArrayWriteV3.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorArrayWriteV3.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7321057b2f0da7a789ac86ee4567464477576df0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorArrayWriteV3.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TensorArrayWriteV3"
+  endpoint {
+    name: "TensorArrayWrite"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ed0ead6e7ab7187a67c152d94834cdb0cc0ccfac
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TensorDataset"
+  endpoint {
+    name: "data.TensorDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorForestCreateTreeVariable.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorForestCreateTreeVariable.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8e2410a0dd7dca78eaff1096365392beb44393d3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorForestCreateTreeVariable.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorForestCreateTreeVariable"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorForestTreeDeserialize.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorForestTreeDeserialize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..724bdb282d0252652865c6625dd6ddbce964c918
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorForestTreeDeserialize.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorForestTreeDeserialize"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorForestTreeIsInitializedOp.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorForestTreeIsInitializedOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7e93af8508f3b8a6ca3ffe505a567ec2f6bc548b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorForestTreeIsInitializedOp.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorForestTreeIsInitializedOp"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorForestTreePredict.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorForestTreePredict.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..73770fa2913ec24662b0d8a43f57c5d1d99d91df
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorForestTreePredict.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorForestTreePredict"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorForestTreeResourceHandleOp.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorForestTreeResourceHandleOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c2ef0ee5dd34187ce814beb23edceb47d6dca988
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorForestTreeResourceHandleOp.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorForestTreeResourceHandleOp"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorForestTreeSerialize.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorForestTreeSerialize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d10f9e632b649036fdcf34ae35014070df42ac63
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorForestTreeSerialize.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorForestTreeSerialize"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorForestTreeSize.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorForestTreeSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9d81f1ea8c01d3a9a359f7eb16e2a8d61e7255e9
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorForestTreeSize.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorForestTreeSize"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorListConcatLists.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorListConcatLists.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3fa6265e1082369a9c42c3286b44da800496de6b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorListConcatLists.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorListConcatLists"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorListElementShape.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorListElementShape.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..396a0cfa8fe7142defa30d046c834773bf5118d5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorListElementShape.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorListElementShape"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorListFromTensor.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorListFromTensor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3179feddd6042ae483e11f73500ef8088ac3555e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorListFromTensor.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorListFromTensor"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorListGather.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorListGather.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..580d34b68f5fbc5d2c75ba492589b73146d5f261
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorListGather.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorListGather"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorListGetItem.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorListGetItem.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2c47208fa0525ccc7f91711bade66b0c86b914a7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorListGetItem.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorListGetItem"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorListLength.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorListLength.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2ec689d71c821a0648c10f94b6699a07f709baca
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorListLength.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorListLength"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorListPopBack.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorListPopBack.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8d0d6ed55b34ef3a3016b6ec085f0987ff1cc562
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorListPopBack.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorListPopBack"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorListPushBack.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorListPushBack.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..285351cf4f308e9d330ae2cc6aff034ec9911d85
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorListPushBack.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorListPushBack"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorListPushBackBatch.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorListPushBackBatch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1f33d4926018f6ebad79c7e2e69fca9a1966eb5f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorListPushBackBatch.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorListPushBackBatch"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorListReserve.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorListReserve.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..533660068879237f7bde3d5f8cc51c6163c11c51
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorListReserve.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorListReserve"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorListScatter.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorListScatter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f3a56f12928141b2541cf009b603982ca864870e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorListScatter.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorListScatter"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorListSetItem.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorListSetItem.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..002e2a9bd37c2e6a2b41ba43237278bc42119bf7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorListSetItem.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorListSetItem"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorListStack.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorListStack.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b4b03c86e574c85a65b7b91bb73ae349d9783125
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorListStack.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TensorListStack"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorSliceDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorSliceDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3729a025e66e30c558b283d1ba596d812bbea044
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorSliceDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TensorSliceDataset"
+  endpoint {
+    name: "data.TensorSliceDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorSummary.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..165478d3a0f764dfdd46c451b653952d53be3c9d
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorSummary.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorSummary"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TensorSummaryV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_TensorSummaryV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c285ada0128fca3f97d3c14f60ca15906d9cb4eb
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TensorSummaryV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TensorSummaryV2"
+  endpoint {
+    name: "summary.TensorSummary"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TextLineDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_TextLineDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c3c75d5703b52ad0656f84cbef8ec11a0010198b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TextLineDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TextLineDataset"
+  endpoint {
+    name: "data.TextLineDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TextLineReader.pbtxt b/tensorflow/core/api_def/java_api/api_def_TextLineReader.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f86b15cf86b34b3aed2121aa040ead096ae48102
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TextLineReader.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TextLineReader"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TextLineReaderV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_TextLineReaderV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ee57dd84082dee03df452967437261eb9dbfaea6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TextLineReaderV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TextLineReaderV2"
+  endpoint {
+    name: "io.TextLineReader"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ThreadUnsafeUnigramCandidateSampler.pbtxt b/tensorflow/core/api_def/java_api/api_def_ThreadUnsafeUnigramCandidateSampler.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2ef4a834781fa308678561e06fab079b0c8e76bc
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ThreadUnsafeUnigramCandidateSampler.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ThreadUnsafeUnigramCandidateSampler"
+  endpoint {
+    name: "random.ThreadUnsafeUnigramCandidateSampler"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Tile.pbtxt b/tensorflow/core/api_def/java_api/api_def_Tile.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8e4e63a2228d126561142d678c0454fed22dad1b
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Tile.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Tile"
+  endpoint {
+    name: "Tile"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TileGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_TileGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7cd975c02f6b74f95b01f3fae4f94bfec0a72490
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TileGrad.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TileGrad"
+  endpoint {
+    name: "train.TileGrad"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Timestamp.pbtxt b/tensorflow/core/api_def/java_api/api_def_Timestamp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9ebc664ae15f45b937760beffbd2de1570c6ad44
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Timestamp.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Timestamp"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TopK.pbtxt b/tensorflow/core/api_def/java_api/api_def_TopK.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bb090aa6f141d84f8b85513ae55f95da9827813e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TopK.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TopK"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TopKV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_TopKV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2b0dcf7c2a2658c16fe6a1a1c22bd2ad4fab1190
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TopKV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TopKV2"
+  endpoint {
+    name: "nn.TopK"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Transpose.pbtxt b/tensorflow/core/api_def/java_api/api_def_Transpose.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ad7900c00b21cc2c1921899f39e562b7096d0832
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Transpose.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Transpose"
+  endpoint {
+    name: "linalg.Transpose"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TruncateDiv.pbtxt b/tensorflow/core/api_def/java_api/api_def_TruncateDiv.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4fbd6823401778512d1aec18e24b9870daf3bd90
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TruncateDiv.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TruncateDiv"
+  endpoint {
+    name: "math.TruncateDiv"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TruncateMod.pbtxt b/tensorflow/core/api_def/java_api/api_def_TruncateMod.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7d1ae9a14fafc8556828dc29484bdbc269e9ac56
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TruncateMod.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TruncateMod"
+  endpoint {
+    name: "math.TruncateMod"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TruncatedNormal.pbtxt b/tensorflow/core/api_def/java_api/api_def_TruncatedNormal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b2dd52c955841971bedae10fc0301affd783969a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TruncatedNormal.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "TruncatedNormal"
+  endpoint {
+    name: "random.TruncatedNormal"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_TryRpc.pbtxt b/tensorflow/core/api_def/java_api/api_def_TryRpc.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7ca476086a8a0a135d9c02388e3eead5e4f7f5d0
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_TryRpc.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "TryRpc"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Unbatch.pbtxt b/tensorflow/core/api_def/java_api/api_def_Unbatch.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..513b05593a1bbca7bef198a6d36efdd8e986eb30
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Unbatch.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Unbatch"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_UnbatchDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_UnbatchDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..24907c804b0c80d8d3038a8eddb1fd412b9e3ab5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_UnbatchDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "UnbatchDataset"
+  endpoint {
+    name: "data.UnbatchDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_UnbatchGrad.pbtxt b/tensorflow/core/api_def/java_api/api_def_UnbatchGrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ce612f84f6f5f66e0e3a8523d57c13cb0d9e7a90
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_UnbatchGrad.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "UnbatchGrad"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_UnicodeDecodeWithOffsets.pbtxt b/tensorflow/core/api_def/java_api/api_def_UnicodeDecodeWithOffsets.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..16cc033140c37d00fd4057d68fb07711903fa790
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_UnicodeDecodeWithOffsets.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "UnicodeDecodeWithOffsets"
+  endpoint {
+    name: "strings.UnicodeDecodeWithOffsets"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_UnicodeScript.pbtxt b/tensorflow/core/api_def/java_api/api_def_UnicodeScript.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a873151d5fc612e67ae2d0ae1d95c85ce7c774d2
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_UnicodeScript.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "UnicodeScript"
+  endpoint {
+    name: "strings.UnicodeScript"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_UnicodeTranscode.pbtxt b/tensorflow/core/api_def/java_api/api_def_UnicodeTranscode.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..131cc6169c7771653f31e830e5947d02d8874d1a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_UnicodeTranscode.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "UnicodeTranscode"
+  endpoint {
+    name: "strings.UnicodeTranscode"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_UniformCandidateSampler.pbtxt b/tensorflow/core/api_def/java_api/api_def_UniformCandidateSampler.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d7a56c2a6ebd96f7a2d321748bc402a9e007c6da
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_UniformCandidateSampler.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "UniformCandidateSampler"
+  endpoint {
+    name: "random.UniformCandidateSampler"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Unique.pbtxt b/tensorflow/core/api_def/java_api/api_def_Unique.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8cc8ec0feddaf2fc89e57121cf23e5c58f0861f5
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Unique.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Unique"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_UniqueV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_UniqueV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6fe22cb1020a9378a2d591f0dd5257a80014f7f9
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_UniqueV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "UniqueV2"
+  endpoint {
+    name: "Unique"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_UniqueWithCounts.pbtxt b/tensorflow/core/api_def/java_api/api_def_UniqueWithCounts.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0248fab17e6594c357b18a4b0d12273b94181d0f
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_UniqueWithCounts.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "UniqueWithCounts"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_UniqueWithCountsV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_UniqueWithCountsV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..eb157451143c0a795704755f02850afafa765175
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_UniqueWithCountsV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "UniqueWithCountsV2"
+  endpoint {
+    name: "UniqueWithCounts"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Unpack.pbtxt b/tensorflow/core/api_def/java_api/api_def_Unpack.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c3ad63e8f8027ec67d5827ee7bac88a19b316187
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Unpack.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Unpack"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_UnravelIndex.pbtxt b/tensorflow/core/api_def/java_api/api_def_UnravelIndex.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1a08dc8f1e70acfc7bf7760c648087ce022f8835
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_UnravelIndex.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "UnravelIndex"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_UnsortedSegmentMax.pbtxt b/tensorflow/core/api_def/java_api/api_def_UnsortedSegmentMax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a58d8740d56eabd28212dd3059eec59822869d03
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_UnsortedSegmentMax.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "UnsortedSegmentMax"
+  endpoint {
+    name: "math.UnsortedSegmentMax"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_UnsortedSegmentMin.pbtxt b/tensorflow/core/api_def/java_api/api_def_UnsortedSegmentMin.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2f17ab4624e736489b6804d0c1123b3436bd542c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_UnsortedSegmentMin.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "UnsortedSegmentMin"
+  endpoint {
+    name: "math.UnsortedSegmentMin"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_UnsortedSegmentProd.pbtxt b/tensorflow/core/api_def/java_api/api_def_UnsortedSegmentProd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c594941bcc0bdc12d0eafe35d676bb7c7c99dfe7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_UnsortedSegmentProd.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "UnsortedSegmentProd"
+  endpoint {
+    name: "math.UnsortedSegmentProd"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_UnsortedSegmentSum.pbtxt b/tensorflow/core/api_def/java_api/api_def_UnsortedSegmentSum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e10cf8a6c2c076314ee749ba7d307921d411b994
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_UnsortedSegmentSum.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "UnsortedSegmentSum"
+  endpoint {
+    name: "math.UnsortedSegmentSum"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Unstage.pbtxt b/tensorflow/core/api_def/java_api/api_def_Unstage.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..31a97cf84db28567856c72d53e4c7f54124504dd
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Unstage.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Unstage"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_UpperBound.pbtxt b/tensorflow/core/api_def/java_api/api_def_UpperBound.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..229a6ddfc365d0b89845478741c48c6cc67348b1
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_UpperBound.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "UpperBound"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_VarHandleOp.pbtxt b/tensorflow/core/api_def/java_api/api_def_VarHandleOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..30bdace0e6e9e796233cf8056147ca3884b2b4af
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_VarHandleOp.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "VarHandleOp"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_VarIsInitializedOp.pbtxt b/tensorflow/core/api_def/java_api/api_def_VarIsInitializedOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a3992019bc753352ad573a2eb0061fa1583c5133
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_VarIsInitializedOp.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "VarIsInitializedOp"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Variable.pbtxt b/tensorflow/core/api_def/java_api/api_def_Variable.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0978e61451b6dd1f2fdcd3f5f8625f3e6ccee777
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Variable.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Variable"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_VariableShape.pbtxt b/tensorflow/core/api_def/java_api/api_def_VariableShape.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..38c63b5b7015c09bf2046ae0cf670732c6dd84f4
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_VariableShape.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "VariableShape"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_VariableV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_VariableV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c566dd1e79ffb289c2127a077232a952f54f7038
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_VariableV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "VariableV2"
+  endpoint {
+    name: "Variable"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Where.pbtxt b/tensorflow/core/api_def/java_api/api_def_Where.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f130181a6e3afe451f68509d4f8c01155d93f77c
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Where.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "Where"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_While.pbtxt b/tensorflow/core/api_def/java_api/api_def_While.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9d0f3b07633aa8c97428f09dd27af93b2a89855e
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_While.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "While"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_WholeFileReader.pbtxt b/tensorflow/core/api_def/java_api/api_def_WholeFileReader.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..aa839ed38019de504c3c92dd1795cf109de9d0c6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_WholeFileReader.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "WholeFileReader"
+  visibility: SKIP
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_WholeFileReaderV2.pbtxt b/tensorflow/core/api_def/java_api/api_def_WholeFileReaderV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e031d705fb2dd266da7dd436b5bc68811cdce2b9
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_WholeFileReaderV2.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "WholeFileReaderV2"
+  endpoint {
+    name: "io.WholeFileReader"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_WindowDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_WindowDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..69f12c55e1d1bdcfaf6752778408432d9db20c90
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_WindowDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "WindowDataset"
+  endpoint {
+    name: "data.WindowDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_WriteAudioSummary.pbtxt b/tensorflow/core/api_def/java_api/api_def_WriteAudioSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fcd0df85c2c4bc7f5061bb6d2f4ca5b74ff0e4c7
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_WriteAudioSummary.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "WriteAudioSummary"
+  endpoint {
+    name: "summary.WriteAudioSummary"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_WriteFile.pbtxt b/tensorflow/core/api_def/java_api/api_def_WriteFile.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a2d6a5bace44397b51f0fa67dc55d1ded73febc8
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_WriteFile.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "WriteFile"
+  endpoint {
+    name: "io.WriteFile"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_WriteGraphSummary.pbtxt b/tensorflow/core/api_def/java_api/api_def_WriteGraphSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8e461bbba333a73cf99c9004dcc31e5fdb343422
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_WriteGraphSummary.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "WriteGraphSummary"
+  endpoint {
+    name: "summary.WriteGraphSummary"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_WriteHistogramSummary.pbtxt b/tensorflow/core/api_def/java_api/api_def_WriteHistogramSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c991079032593a7c8811283bda4ee8e318786831
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_WriteHistogramSummary.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "WriteHistogramSummary"
+  endpoint {
+    name: "summary.WriteHistogramSummary"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_WriteImageSummary.pbtxt b/tensorflow/core/api_def/java_api/api_def_WriteImageSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..08bf0adb2f26e8d2d308b5753bb2fcd0637328f3
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_WriteImageSummary.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "WriteImageSummary"
+  endpoint {
+    name: "summary.WriteImageSummary"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_WriteScalarSummary.pbtxt b/tensorflow/core/api_def/java_api/api_def_WriteScalarSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7bc7c16a0c7d19937c0acb4bfde7d89ad79628d6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_WriteScalarSummary.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "WriteScalarSummary"
+  endpoint {
+    name: "summary.WriteScalarSummary"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_WriteSummary.pbtxt b/tensorflow/core/api_def/java_api/api_def_WriteSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1141cb6dbb16b984057aab3053b9bca770cabbad
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_WriteSummary.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "WriteSummary"
+  endpoint {
+    name: "summary.WriteSummary"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Xdivy.pbtxt b/tensorflow/core/api_def/java_api/api_def_Xdivy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..13a94b8a56eed2b3c132a0baa169ab00732105b6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Xdivy.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Xdivy"
+  endpoint {
+    name: "math.Xdivy"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Xlogy.pbtxt b/tensorflow/core/api_def/java_api/api_def_Xlogy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..52f457d6458dec89f5a1aa8e2b5ec978f3bafcfa
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Xlogy.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Xlogy"
+  endpoint {
+    name: "math.Xlogy"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ZerosLike.pbtxt b/tensorflow/core/api_def/java_api/api_def_ZerosLike.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8ef17aba9b4431e04f6e78fc9d6099db4ac3eb7a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ZerosLike.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "ZerosLike"
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_Zeta.pbtxt b/tensorflow/core/api_def/java_api/api_def_Zeta.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b602bbc7e1f5e877d64b2636b6e49b8c226735f6
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_Zeta.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Zeta"
+  endpoint {
+    name: "math.Zeta"
+  }
+}
diff --git a/tensorflow/core/api_def/java_api/api_def_ZipDataset.pbtxt b/tensorflow/core/api_def/java_api/api_def_ZipDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e32362bef2b9c2d042dc097a7c321d0261ce787a
--- /dev/null
+++ b/tensorflow/core/api_def/java_api/api_def_ZipDataset.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "ZipDataset"
+  endpoint {
+    name: "data.ZipDataset"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_AsString.pbtxt b/tensorflow/core/api_def/python_api/api_def_AsString.pbtxt
index d51defc376ff9a0961ed5bd43b848ea3f6df288d..bc8cc309f552e93e1dd6ff1fb0d74f8fda0cd1f7 100644
--- a/tensorflow/core/api_def/python_api/api_def_AsString.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_AsString.pbtxt
@@ -2,6 +2,10 @@ op {
   graph_op_name: "AsString"
   endpoint {
     name: "dtypes.as_string"
+    deprecation_version: 2
+  }
+  endpoint {
+    name: "strings.as_string"
   }
   endpoint {
     name: "as_string"
diff --git a/tensorflow/core/api_def/python_api/api_def_AudioSpectrogram.pbtxt b/tensorflow/core/api_def/python_api/api_def_AudioSpectrogram.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..38e4b38f3f8a5c386a0c4b56d5469ab0c5dd0a7d
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_AudioSpectrogram.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "AudioSpectrogram"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_AvgPool3D.pbtxt b/tensorflow/core/api_def/python_api/api_def_AvgPool3D.pbtxt
index cc16523a1567e8d7f2d0146c1c44d9ef11b6c6d5..72c281de342e553280c029d98a275395a93896d0 100644
--- a/tensorflow/core/api_def/python_api/api_def_AvgPool3D.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_AvgPool3D.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "AvgPool3D"
-  endpoint {
-    name: "nn.avg_pool3d"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Case.pbtxt b/tensorflow/core/api_def/python_api/api_def_Case.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a4c8193a35061a93dc21f1ac02bde318095fbf7b
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Case.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Case"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Conv3DBackpropFilterV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_Conv3DBackpropFilterV2.pbtxt
index 590b37c95fb2a43e49d5c5ae4dcfe8cc499a4c6d..edbcba26ce3d31cc8c3d9aecb9efc5286ddd7002 100644
--- a/tensorflow/core/api_def/python_api/api_def_Conv3DBackpropFilterV2.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Conv3DBackpropFilterV2.pbtxt
@@ -2,6 +2,7 @@ op {
   graph_op_name: "Conv3DBackpropFilterV2"
   endpoint {
     name: "nn.conv3d_backprop_filter"
+    deprecation_version: 2
   }
   endpoint {
     name: "nn.conv3d_backprop_filter_v2"
diff --git a/tensorflow/core/api_def/python_api/api_def_DecodeWav.pbtxt b/tensorflow/core/api_def/python_api/api_def_DecodeWav.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..28f4514bd88e116b77ecf7f4d6a6660518b85a1f
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_DecodeWav.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "DecodeWav"
+  endpoint {
+    name: "audio.decode_wav"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Dilation2D.pbtxt b/tensorflow/core/api_def/python_api/api_def_Dilation2D.pbtxt
index 1bd83d906152d2e5792fecd5e80e339e0c67e7a5..97af07e0012ea99a69175e6ed5628566bf8b6873 100644
--- a/tensorflow/core/api_def/python_api/api_def_Dilation2D.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Dilation2D.pbtxt
@@ -1,7 +1,4 @@
 op {
   graph_op_name: "Dilation2D"
-  endpoint {
-    name: "nn.dilation2d"
-    deprecation_version: 2
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_EncodeWav.pbtxt b/tensorflow/core/api_def/python_api/api_def_EncodeWav.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..edcf2bded125cba51053d5b401d03f21b8649595
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_EncodeWav.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "EncodeWav"
+  endpoint {
+    name: "audio.encode_wav"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_EuclideanNorm.pbtxt b/tensorflow/core/api_def/python_api/api_def_EuclideanNorm.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a3ea8859b5426926230f81a9ec31a6083d3a11dd
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_EuclideanNorm.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "EuclideanNorm"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ExtractGlimpse.pbtxt b/tensorflow/core/api_def/python_api/api_def_ExtractGlimpse.pbtxt
index ed8abdfcd7f3171d431adf07d47eb3bfc60d1e8f..f1fc72c4ca18afbb4ce597dc17a513634d2423d0 100644
--- a/tensorflow/core/api_def/python_api/api_def_ExtractGlimpse.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_ExtractGlimpse.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "ExtractGlimpse"
-  endpoint {
-    name: "image.extract_glimpse"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_KMC2ChainInitialization.pbtxt b/tensorflow/core/api_def/python_api/api_def_KMC2ChainInitialization.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3e8ec113f81ee7b8049b19201da41ae9206c63cb
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_KMC2ChainInitialization.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "KMC2ChainInitialization"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_KmeansPlusPlusInitialization.pbtxt b/tensorflow/core/api_def/python_api/api_def_KmeansPlusPlusInitialization.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..83aa3a3a69f4120f14bc0dc72d368953132b7eee
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_KmeansPlusPlusInitialization.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "KmeansPlusPlusInitialization"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MaxPool3D.pbtxt b/tensorflow/core/api_def/python_api/api_def_MaxPool3D.pbtxt
index e8576c9ff2e0729235d9bca70c369536dacaa08e..47016b9d6949b3cb8558b9d6b794183e9f8e7517 100644
--- a/tensorflow/core/api_def/python_api/api_def_MaxPool3D.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_MaxPool3D.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "MaxPool3D"
-  endpoint {
-    name: "nn.max_pool3d"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_MaxPoolWithArgmax.pbtxt b/tensorflow/core/api_def/python_api/api_def_MaxPoolWithArgmax.pbtxt
index 13a1a0b5df4d73884d267777ccf5ad6a44fcdbd4..c57cfc7727a5ebb87f219ad4ec9576a05ac68a69 100644
--- a/tensorflow/core/api_def/python_api/api_def_MaxPoolWithArgmax.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_MaxPoolWithArgmax.pbtxt
@@ -1,7 +1,4 @@
 op {
   graph_op_name: "MaxPoolWithArgmax"
-  endpoint {
-    name: "nn.max_pool_with_argmax"
-    deprecation_version: 2
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Mfcc.pbtxt b/tensorflow/core/api_def/python_api/api_def_Mfcc.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..21a0c8b8b2448ec38b6819712ef0980f47afdd84
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Mfcc.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Mfcc"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MulNoNan.pbtxt b/tensorflow/core/api_def/python_api/api_def_MulNoNan.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..35b2f309fb3850a4393464209422822eebd9e2a4
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_MulNoNan.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "MulNoNan"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_NearestNeighbors.pbtxt b/tensorflow/core/api_def/python_api/api_def_NearestNeighbors.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ad05bb34862362530d90a1df67a1b46376b107ab
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_NearestNeighbors.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "NearestNeighbors"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_NextAfter.pbtxt b/tensorflow/core/api_def/python_api/api_def_NextAfter.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9188a0d6bd8c41ea766ff91bca9a2df97145bb13
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_NextAfter.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "NextAfter"
+  endpoint {
+    name: "math.nextafter"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceScatterNdSub.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceScatterNdSub.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4ff1d01db6bf5279c99c9305c1eec97ed8b6e84f
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceScatterNdSub.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceScatterNdSub"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ScaleAndTranslate.pbtxt b/tensorflow/core/api_def/python_api/api_def_ScaleAndTranslate.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c10e7ab752d86c3cea5f92ce05ceb514cb58c319
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ScaleAndTranslate.pbtxt
@@ -0,0 +1,7 @@
+op {
+  graph_op_name: "ScaleAndTranslate"
+  visibility: HIDDEN
+  endpoint {
+    name: "image.scale_and_translate"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_StatefulStandardNormal.pbtxt b/tensorflow/core/api_def/python_api/api_def_StatefulStandardNormal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fb3c3dbdb110c7a04a44b7c201bc3b432565139c
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_StatefulStandardNormal.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "StatefulStandardNormal"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_StatefulStandardNormalV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_StatefulStandardNormalV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a1816dd734fadd538ad81f243c203e3c1ccae259
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_StatefulStandardNormalV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "StatefulStandardNormalV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorListConcatV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorListConcatV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..237774a388d4e6d4079e401b35286cf1b91ce85c
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorListConcatV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorListConcatV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorListResize.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorListResize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0d689d4f2b16a9e18064fe9c8be09650a3e4a641
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorListResize.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorListResize"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorListScatterIntoExistingList.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorListScatterIntoExistingList.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..20d9a43be4168af3f79d96224eb2fbefaa5752a2
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorListScatterIntoExistingList.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorListScatterIntoExistingList"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorListScatterV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorListScatterV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d2a92a529d361cd3684d4306cb82bb3648e2b7e9
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorListScatterV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorListScatterV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/common_runtime/accumulate_n_optimizer.cc b/tensorflow/core/common_runtime/accumulate_n_optimizer.cc
index c4bc1a684cb3ffaa30cdaece041fc51c266a3782..1fc077af92c719bf2c5d87eded55275032891f5d 100644
--- a/tensorflow/core/common_runtime/accumulate_n_optimizer.cc
+++ b/tensorflow/core/common_runtime/accumulate_n_optimizer.cc
@@ -75,7 +75,8 @@ class AccumulateNV2RemovePass : public GraphOptimizationPass {
   Status rewriteNode(Node* n, Graph* g) {
     AttrSlice n_attrs = n->attrs();
     auto base_make_node = [n, &n_attrs](const string& op, const string& name) {
-      NodeBuilder node_builder(name, op);
+      NodeDebugInfo debug_info(*n);
+      NodeBuilder node_builder(name, op, OpRegistry::Global(), &debug_info);
 
       // The pieces of AccumulateNV2 should all be on the same node.
       node_builder.Device(n->requested_device());
diff --git a/tensorflow/core/common_runtime/base_collective_executor.cc b/tensorflow/core/common_runtime/base_collective_executor.cc
index 92e56df18105218fc8a5112a880b6c999f1a2649..c9e3cf40860a2b68024dc0fc61e8c65640cbc9be 100644
--- a/tensorflow/core/common_runtime/base_collective_executor.cc
+++ b/tensorflow/core/common_runtime/base_collective_executor.cc
@@ -63,7 +63,7 @@ int64 CollectiveAdapter::AlignedChunkElts(int64 elt_bytes, int64 total_elts,
       (chunk_bytes < EIGEN_MAX_ALIGN_BYTES)
           ? (EIGEN_MAX_ALIGN_BYTES - chunk_bytes)
           : (EIGEN_MAX_ALIGN_BYTES - (chunk_bytes % EIGEN_MAX_ALIGN_BYTES));
-  CHECK_EQ(0, diff % elt_bytes);
+  DCHECK_EQ(0, diff % elt_bytes);
   base_chunk_elts += (diff / elt_bytes);
   DCHECK_EQ(0, ((base_chunk_elts * elt_bytes) % EIGEN_MAX_ALIGN_BYTES))
       << "total_elts=" << total_elts << " num_chunks=" << num_chunks
@@ -78,17 +78,23 @@ class CollectiveAdapterImpl : public CollectiveAdapter {
  public:
   // Takes ownership of output and prepares to properly alias its chunks.
   // Ownership is taken because the shape may temporarily change.
-  CollectiveAdapterImpl(Tensor* output, int64 num_chunks, Allocator* allocator)
+  CollectiveAdapterImpl(Tensor* output, int64 num_chunks, Allocator* allocator,
+                        bool align_chunks)
       : output_(std::move(*output)),
         dt_(output_.dtype()),
         old_shape_(output_.shape()),
         num_chunks_(num_chunks),
         allocator_(allocator),
         total_elts_(output_.NumElements()),
-        chunk_elts_(AlignedChunkElts(sizeof(T), total_elts_, num_chunks_)),
+        chunk_elts_(align_chunks
+                        ? AlignedChunkElts(sizeof(T), total_elts_, num_chunks_)
+                        : total_elts_ / num_chunks_),
         data_start_(reinterpret_cast<T*>(DMAHelper::base(&output_))),
         data_end_(data_start_ + total_elts_) {
-    CHECK_GT(chunk_elts_, 0);
+    if (!align_chunks) {
+      DCHECK_EQ(total_elts_, num_chunks_ * chunk_elts_);
+    }
+    DCHECK_GT(chunk_elts_, 0);
     Flatten();
   }
 
@@ -176,19 +182,24 @@ class CollectiveAdapterImpl : public CollectiveAdapter {
 }  // namespace
 
 CollectiveAdapter* MakeCollectiveAdapter(Tensor* output, int num_chunks,
-                                         Allocator* allocator) {
+                                         Allocator* allocator,
+                                         bool align_chunks) {
   switch (output->dtype()) {
     case DT_FLOAT:
-      return new CollectiveAdapterImpl<float>(output, num_chunks, allocator);
+      return new CollectiveAdapterImpl<float>(output, num_chunks, allocator,
+                                              align_chunks);
       break;
     case DT_DOUBLE:
-      return new CollectiveAdapterImpl<double>(output, num_chunks, allocator);
+      return new CollectiveAdapterImpl<double>(output, num_chunks, allocator,
+                                               align_chunks);
       break;
     case DT_INT32:
-      return new CollectiveAdapterImpl<int32>(output, num_chunks, allocator);
+      return new CollectiveAdapterImpl<int32>(output, num_chunks, allocator,
+                                              align_chunks);
       break;
     case DT_INT64:
-      return new CollectiveAdapterImpl<int64>(output, num_chunks, allocator);
+      return new CollectiveAdapterImpl<int64>(output, num_chunks, allocator,
+                                              align_chunks);
       break;
     default:
       LOG(FATAL) << "Unsupported type " << output->dtype()
@@ -227,6 +238,7 @@ void BaseCollectiveExecutor::ExecuteAsync(OpKernelContext* ctx,
 
   Tensor* output = ctx->mutable_output(0);
   const Tensor* input = (col_params.instance.type == REDUCTION_COLLECTIVE ||
+                         col_params.instance.type == GATHER_COLLECTIVE ||
                          (col_params.instance.type == BROADCAST_COLLECTIVE &&
                           col_params.is_source))
                             ? &ctx->input(0)
@@ -296,4 +308,42 @@ Status BaseCollectiveExecutor::CreateCollective(
   return status;
 }
 
+bool BaseCollectiveExecutor::CheckDependencies(
+    const CollectiveParams& col_params) {
+  for (int32 instance : col_params.instance.impl_details.dependencies) {
+    auto find_iter = launched_.find(instance);
+    if (find_iter == launched_.end() || find_iter->second != 0) {
+      VLOG(1) << "Collective " << col_params.ToString()
+              << " blocked by instance " << instance;
+      return false;
+    }
+  }
+  return true;
+}
+
+void BaseCollectiveExecutor::WaitForDependencies(
+    const CollectiveParams& col_params) {
+  mutex_lock l(launch_mu_);
+  while (!CheckDependencies(col_params)) {
+    launch_cv_.wait(l);
+  }
+  VLOG(1) << "Unblocking collective " << col_params.ToString();
+}
+
+void BaseCollectiveExecutor::Launched(const CollectiveParams& col_params) {
+  mutex_lock l(launch_mu_);
+  if (launched_.find(col_params.instance.instance_key) == launched_.end()) {
+    const string& task_name =
+        col_params.instance.task_names[col_params.default_rank];
+    const int32 num_devices =
+        col_params.instance.num_devices_per_task.at(task_name);
+    launched_[col_params.instance.instance_key] = num_devices;
+  }
+  if (--launched_[col_params.instance.instance_key] == 0) {
+    VLOG(1) << "Unblocking dependencies for collective instance "
+            << col_params.instance.instance_key;
+    launch_cv_.notify_all();
+  }
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/base_collective_executor.h b/tensorflow/core/common_runtime/base_collective_executor.h
index 09826a8814511cb46c907b983f240fe17df70e3d..bc85b5af5f87bd6d5fc1cdc28b17248eeb33a25d 100644
--- a/tensorflow/core/common_runtime/base_collective_executor.h
+++ b/tensorflow/core/common_runtime/base_collective_executor.h
@@ -78,9 +78,15 @@ class CollectiveAdapter {
 };
 
 // Create a CollectiveAdaptor wrapping 'output', specialized to its
-// data-type and shape.
+// data-type and shape.  If align_chunks == true then chunk size may
+// be larger than output->NumElements() / num_chunks and one or more
+// of the suffix chunks may be empty.  Chunks will be arranged to start
+// and end on alignment boundaries.  If align_chunks == false then
+// output->NumElements() % num_chunks must be 0 and all chunks will
+// have exactly the same size, ignoring alignment issues.
 CollectiveAdapter* MakeCollectiveAdapter(Tensor* output, int num_chunks,
-                                         Allocator* allocator);
+                                         Allocator* allocator,
+                                         bool align_chunks = true);
 
 // Default implementation of CollectiveExecutor.  Delegates the actual
 // work of moving data to a class specialized for the operation type,
@@ -135,15 +141,33 @@ class BaseCollectiveExecutor : public CollectiveExecutor {
                                client_locality, done);
   }
 
+  // If we need to enforce an ordering on any portion of collective
+  // implementation, and the ordering is encoded via attribute on the collective
+  // op, this function will block until all dependencies for this collective
+  // have completed.
+  void WaitForDependencies(const CollectiveParams& col_params) override;
+  // Record that this collective has completed the portion of the implementation
+  // that needs to be ordered wrt other collectives, to unblock any of its
+  // dependent ops.
+  void Launched(const CollectiveParams& col_params) override;
+
  protected:
   const int64 step_id_;
   const DeviceMgr* dev_mgr_;  // Not owned.
   std::unique_ptr<PerStepCollectiveRemoteAccess> remote_access_;
   const string* gpu_ring_order_;  // Not owned.
+  mutex launch_mu_;
+  condition_variable launch_cv_;
+  // collective instance key -> number of local devices for which NCCL ops have
+  // been launched.
+  std::unordered_map<int32, int32> launched_ GUARDED_BY(launch_mu_);
 
  private:
   Status CreateCollective(const CollectiveParams& col_params,
                           CollectiveImplementationInterface** col_impl);
+  // Check if all ops on which this collective depends on have launched.
+  bool CheckDependencies(const CollectiveParams& col_params)
+      EXCLUSIVE_LOCKS_REQUIRED(launch_mu_);
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/bfc_allocator.cc b/tensorflow/core/common_runtime/bfc_allocator.cc
index 3843ea9e60cfbac4c428174f9b2201ccafaf505e..0e4ddb102002ec2802761e05013834cf491f7980 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.cc
+++ b/tensorflow/core/common_runtime/bfc_allocator.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/bfc_allocator.h"
 
 #include "tensorflow/core/common_runtime/allocator_retry.h"
+#include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/lib/core/bits.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/lib/strings/numbers.h"
@@ -152,6 +153,7 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) {
   c->allocation_id = -1;
   c->prev = kInvalidChunkHandle;
   c->next = kInvalidChunkHandle;
+  c->freed_count = 0;
 
   region_manager_.set_handle(c->ptr, h);
 
@@ -180,29 +182,46 @@ void BFCAllocator::DeallocateChunk(ChunkHandle h) {
   free_chunks_list_ = h;
 }
 
-void* BFCAllocator::AllocateRaw(size_t unused_alignment, size_t num_bytes) {
+void* BFCAllocator::AllocateRawInternalWithRetry(
+    size_t unused_alignment, size_t num_bytes,
+    const AllocationAttributes& allocation_attr) {
   // Fast path: Try once to allocate without getting the retry_helper_ involved
-  void* r = AllocateRawInternal(unused_alignment, num_bytes, false);
+  uint64 freed_by_count = 0;
+  if (allocation_attr.freed_by_func != nullptr) {
+    freed_by_count = allocation_attr.freed_by_func();
+  }
+  void* r =
+      AllocateRawInternal(unused_alignment, num_bytes, false, freed_by_count);
   if (r != nullptr) {
     return r;
   } else {
     static const int64 kMaxMillisToWait = 10000;  // 10 seconds
-    return retry_helper_.AllocateRaw(
-        [this](size_t a, size_t nb, bool v) {
-          return AllocateRawInternal(a, nb, v);
+    r = retry_helper_.AllocateRaw(
+        [this, &allocation_attr](size_t a, size_t nb, bool v) {
+          uint64 freed_by_count = 0;
+          if (allocation_attr.freed_by_func != nullptr) {
+            freed_by_count = allocation_attr.freed_by_func();
+          }
+          return AllocateRawInternal(a, nb, v, freed_by_count);
         },
         kMaxMillisToWait, unused_alignment, num_bytes);
+    return r;
   }
 }
 
 void* BFCAllocator::AllocateRaw(size_t unused_alignment, size_t num_bytes,
                                 const AllocationAttributes& allocation_attr) {
+  VLOG(1) << "AllocateRaw " << Name() << "  " << num_bytes;
   if (allocation_attr.no_retry_on_failure) {
     // Return immediately upon the first failure if this is for allocating an
     // optional scratch space.
     bool dump_log_on_failure = VLOG_IS_ON(2);
-    void* result =
-        AllocateRawInternal(unused_alignment, num_bytes, dump_log_on_failure);
+    uint64 freed_by_count = 0;
+    if (allocation_attr.freed_by_func != nullptr) {
+      freed_by_count = allocation_attr.freed_by_func();
+    }
+    void* result = AllocateRawInternal(unused_alignment, num_bytes,
+                                       dump_log_on_failure, freed_by_count);
     if (result == nullptr) {
       static std::atomic<int32> log_counter{0};
       int32 counter_value = log_counter.load(std::memory_order_relaxed);
@@ -218,7 +237,8 @@ void* BFCAllocator::AllocateRaw(size_t unused_alignment, size_t num_bytes,
     }
     return result;
   } else {
-    return AllocateRaw(unused_alignment, num_bytes);
+    return AllocateRawInternalWithRetry(unused_alignment, num_bytes,
+                                        allocation_attr);
   }
 }
 
@@ -233,7 +253,8 @@ size_t BFCAllocator::RoundedBytes(size_t bytes) {
 
 void* BFCAllocator::AllocateRawInternal(size_t unused_alignment,
                                         size_t num_bytes,
-                                        bool dump_log_on_failure) {
+                                        bool dump_log_on_failure,
+                                        uint64 freed_before) {
   if (num_bytes == 0) {
     LOG(ERROR) << "tried to allocate 0 bytes";
     return nullptr;
@@ -247,14 +268,14 @@ void* BFCAllocator::AllocateRawInternal(size_t unused_alignment,
   BinNum bin_num = BinNumForSize(rounded_bytes);
 
   mutex_lock l(lock_);
-  void* ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes);
+  void* ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
   if (ptr != nullptr) {
     return ptr;
   }
 
   // Try to extend
   if (Extend(unused_alignment, rounded_bytes)) {
-    ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes);
+    ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
     if (ptr != nullptr) {
       return ptr;
     }
@@ -274,7 +295,7 @@ void* BFCAllocator::AllocateRawInternal(size_t unused_alignment,
 }
 
 void* BFCAllocator::FindChunkPtr(BinNum bin_num, size_t rounded_bytes,
-                                 size_t num_bytes) {
+                                 size_t num_bytes, uint64 freed_before) {
   // First identify the first bin that could satisfy rounded_bytes.
   for (; bin_num < kNumBins; bin_num++) {
     // Start searching from the first bin for the smallest chunk that fits
@@ -285,6 +306,9 @@ void* BFCAllocator::FindChunkPtr(BinNum bin_num, size_t rounded_bytes,
       const BFCAllocator::ChunkHandle h = (*citer);
       BFCAllocator::Chunk* chunk = ChunkFromHandle(h);
       DCHECK(!chunk->in_use());
+      if (freed_before > 0 && freed_before < chunk->freed_count) {
+        continue;
+      }
       if (chunk->size >= rounded_bytes) {
         // We found an existing chunk that fits us that wasn't in use, so remove
         // it from the free bin structure prior to using.
@@ -311,10 +335,10 @@ void* BFCAllocator::FindChunkPtr(BinNum bin_num, size_t rounded_bytes,
         // Update stats.
         ++stats_.num_allocs;
         stats_.bytes_in_use += chunk->size;
-        stats_.max_bytes_in_use =
-            std::max(stats_.max_bytes_in_use, stats_.bytes_in_use);
-        stats_.max_alloc_size =
-            std::max<std::size_t>(stats_.max_alloc_size, chunk->size);
+        stats_.peak_bytes_in_use =
+            std::max(stats_.peak_bytes_in_use, stats_.bytes_in_use);
+        stats_.largest_alloc_size =
+            std::max<std::size_t>(stats_.largest_alloc_size, chunk->size);
 
         VLOG(4) << "Returning: " << chunk->ptr;
         if (VLOG_IS_ON(4)) {
@@ -347,6 +371,9 @@ void BFCAllocator::SplitChunk(BFCAllocator::ChunkHandle h, size_t num_bytes) {
   // The new chunk is not in use.
   new_chunk->allocation_id = -1;
 
+  // It inherits the freed time.
+  new_chunk->freed_count = c->freed_count;
+
   // Maintain the pointers.
   // c <-> c_neighbor becomes
   // c <-> new_chunk <-> c_neighbor
@@ -364,6 +391,7 @@ void BFCAllocator::SplitChunk(BFCAllocator::ChunkHandle h, size_t num_bytes) {
 }
 
 void BFCAllocator::DeallocateRaw(void* ptr) {
+  VLOG(1) << "DeallocateRaw " << Name() << " " << RequestedSize(ptr);
   DeallocateRawInternal(ptr);
   retry_helper_.NotifyDealloc();
 }
@@ -415,6 +443,9 @@ void BFCAllocator::Merge(BFCAllocator::ChunkHandle h1,
   // Set the new size
   c1->size += c2->size;
 
+  // Pick latest free time.
+  c1->freed_count = std::max(c1->freed_count, c2->freed_count);
+
   DeleteChunk(h2);
 }
 
@@ -460,6 +491,11 @@ void BFCAllocator::FreeAndMaybeCoalesce(BFCAllocator::ChunkHandle h) {
   // Mark the chunk as no longer in use.
   c->allocation_id = -1;
 
+  // Optionally record the free time.
+  if (timing_counter_) {
+    c->freed_count = timing_counter_->next();
+  }
+
   // Updates the stats.
   stats_.bytes_in_use -= c->size;
 
@@ -630,7 +666,10 @@ void BFCAllocator::DumpMemoryLog(size_t num_bytes) {
         in_use_by_size[c->size]++;
       }
       LOG(INFO) << (c->in_use() ? "Chunk" : "Free ") << " at " << c->ptr
-                << " of size " << c->size;
+                << " of size " << c->size
+                << (timing_counter_
+                        ? strings::StrCat(" freed_count ", c->freed_count)
+                        : "");
       h = c->next;
     }
   }
@@ -647,16 +686,16 @@ void BFCAllocator::DumpMemoryLog(size_t num_bytes) {
   LOG(INFO) << "Stats: \n" << stats_.DebugString();
 }
 
-void BFCAllocator::GetStats(AllocatorStats* stats) {
+absl::optional<AllocatorStats> BFCAllocator::GetStats() {
   mutex_lock l(lock_);
-  *stats = stats_;
+  return stats_;
 }
 
 void BFCAllocator::ClearStats() {
   mutex_lock l(lock_);
   stats_.num_allocs = 0;
-  stats_.max_bytes_in_use = stats_.bytes_in_use;
-  stats_.max_alloc_size = 0;
+  stats_.peak_bytes_in_use = stats_.bytes_in_use;
+  stats_.largest_alloc_size = 0;
 }
 
 std::array<BFCAllocator::BinDebugInfo, BFCAllocator::kNumBins>
diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h
index 2d74bf2b286a1fac4d3f9b3921fef7a5b838fce8..b0fd0d8667da96d3590965ae3e05675968389089 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.h
+++ b/tensorflow/core/common_runtime/bfc_allocator.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/common_runtime/allocator_retry.h"
+#include "tensorflow/core/common_runtime/shared_counter.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
@@ -50,9 +51,14 @@ class BFCAllocator : public Allocator {
   ~BFCAllocator() override;
 
   string Name() override { return name_; }
-  void* AllocateRaw(size_t alignment, size_t num_bytes) override;
+
+  void* AllocateRaw(size_t alignment, size_t num_bytes) override {
+    return AllocateRaw(alignment, num_bytes, AllocationAttributes());
+  }
+
   void* AllocateRaw(size_t alignment, size_t num_bytes,
                     const AllocationAttributes& allocation_attr) override;
+
   void DeallocateRaw(void* ptr) override;
 
   bool TracksAllocationSizes() override;
@@ -63,15 +69,23 @@ class BFCAllocator : public Allocator {
 
   int64 AllocationId(const void* ptr) override;
 
-  void GetStats(AllocatorStats* stats) override;
+  absl::optional<AllocatorStats> GetStats() override;
 
   void ClearStats() override;
 
+  void SetTimingCounter(SharedCounter* sc) { timing_counter_ = sc; }
+
  private:
   struct Bin;
 
   void* AllocateRawInternal(size_t alignment, size_t num_bytes,
-                            bool dump_log_on_failure);
+                            bool dump_log_on_failure,
+                            uint64 freed_before_count);
+
+  void* AllocateRawInternalWithRetry(
+      size_t alignment, size_t num_bytes,
+      const AllocationAttributes& allocation_attr);
+
   void DeallocateRawInternal(void* ptr);
 
   // A ChunkHandle is an index into the chunks_ vector in BFCAllocator
@@ -126,6 +140,9 @@ class BFCAllocator : public Allocator {
     // What bin are we in?
     BinNum bin_num = kInvalidBinNum;
 
+    // Optional count when this chunk was most recently made free.
+    uint64 freed_count = 0;
+
     bool in_use() const { return allocation_id != -1; }
 
     string DebugString(BFCAllocator* a,
@@ -314,8 +331,8 @@ class BFCAllocator : public Allocator {
 
   // Returns a pointer to an underlying allocated chunk of size
   // 'rounded_bytes'.
-  void* FindChunkPtr(BinNum bin_num, size_t rounded_bytes, size_t num_bytes)
-      EXCLUSIVE_LOCKS_REQUIRED(lock_);
+  void* FindChunkPtr(BinNum bin_num, size_t rounded_bytes, size_t num_bytes,
+                     uint64 freed_before) EXCLUSIVE_LOCKS_REQUIRED(lock_);
 
   // Splits the chunk specified by 'h' into two chunks, one at least
   // of size 'num_bytes'.
@@ -420,6 +437,7 @@ class BFCAllocator : public Allocator {
 
   std::unique_ptr<SubAllocator> sub_allocator_;
   string name_;
+  SharedCounter* timing_counter_ = nullptr;
 
   // Structures mutable after construction
   mutable mutex lock_;
diff --git a/tensorflow/core/common_runtime/buf_rendezvous_test.cc b/tensorflow/core/common_runtime/buf_rendezvous_test.cc
index 0e798235bf0649428409a2fa72ac3067736c347a..7621787dec76850e346f65b3883cb2b5073c0077 100644
--- a/tensorflow/core/common_runtime/buf_rendezvous_test.cc
+++ b/tensorflow/core/common_runtime/buf_rendezvous_test.cc
@@ -109,7 +109,7 @@ TEST_F(BufRendezvousTest, CorrectUseConsumerFirst) {
 TEST_F(BufRendezvousTest, ErrorDuplicatePut) {
   bool prod_callback_called = false;
   br_->ProvideBuf("key0", fake_dev_ptr_, fake_dev_ctx_, &a_, aa_,
-                  [this, &prod_callback_called](const Status& s) {
+                  [&prod_callback_called](const Status& s) {
                     prod_callback_called = true;
                   });
   Status bad_status;
@@ -129,11 +129,11 @@ TEST_F(BufRendezvousTest, ErrorDuplicatePut) {
 
 TEST_F(BufRendezvousTest, ErrorDeleteNonEmpty) {
   Status cons_status;
-  br_->ConsumeBuf(
-      "key0", [this, &cons_status](const Status& s, BufRendezvous::Hook* h) {
-        cons_status = s;
-        EXPECT_EQ(h, nullptr);
-      });
+  br_->ConsumeBuf("key0",
+                  [&cons_status](const Status& s, BufRendezvous::Hook* h) {
+                    cons_status = s;
+                    EXPECT_EQ(h, nullptr);
+                  });
   EXPECT_TRUE(cons_status.ok());
   br_.reset();
   EXPECT_FALSE(cons_status.ok());
@@ -146,13 +146,13 @@ TEST_F(BufRendezvousTest, AbortNonEmpty) {
   Status prod_status;
   Notification prod_note;
   Notification cons_note;
-  br_->ConsumeBuf("key0", [this, &cons_note, &cons_status](
-                              const Status& s, BufRendezvous::Hook* h) {
+  br_->ConsumeBuf("key0", [&cons_note, &cons_status](const Status& s,
+                                                     BufRendezvous::Hook* h) {
     cons_status = s;
     cons_note.Notify();
   });
   br_->ProvideBuf("key1", fake_dev_ptr_, fake_dev_ctx_, &a_, aa_,
-                  [this, &prod_note, &prod_status](const Status& s) {
+                  [&prod_note, &prod_status](const Status& s) {
                     prod_status = s;
                     prod_note.Notify();
                   });
@@ -175,13 +175,13 @@ TEST_F(BufRendezvousTest, UseAfterAbort) {
   Status prod_status;
   Notification prod_note;
   Notification cons_note;
-  br_->ConsumeBuf("key0", [this, &cons_note, &cons_status](
-                              const Status& s, BufRendezvous::Hook* h) {
+  br_->ConsumeBuf("key0", [&cons_note, &cons_status](const Status& s,
+                                                     BufRendezvous::Hook* h) {
     cons_status = s;
     cons_note.Notify();
   });
   br_->ProvideBuf("key1", fake_dev_ptr_, fake_dev_ctx_, &a_, aa_,
-                  [this, &prod_note, &prod_status](const Status& s) {
+                  [&prod_note, &prod_status](const Status& s) {
                     prod_status = s;
                     prod_note.Notify();
                   });
diff --git a/tensorflow/core/common_runtime/build_graph_options.cc b/tensorflow/core/common_runtime/build_graph_options.cc
index 00f7a8e6452f9cc234c5868437d40ccc99dbaf87..b095fcfa3bb1fea6fccafa3015734e71582a6829 100644
--- a/tensorflow/core/common_runtime/build_graph_options.cc
+++ b/tensorflow/core/common_runtime/build_graph_options.cc
@@ -35,6 +35,19 @@ string BuildGraphOptions::DebugString() const {
   if (collective_graph_key != kNoCollectiveGraphKey) {
     strings::StrAppend(&rv, "\ncollective_graph_key: ", collective_graph_key);
   }
+  string collective_order_str;
+  switch (collective_order) {
+    case GraphCollectiveOrder::kNone:
+      collective_order_str = "none";
+      break;
+    case GraphCollectiveOrder::kEdges:
+      collective_order_str = "edges";
+      break;
+    case GraphCollectiveOrder::kAttrs:
+      collective_order_str = "attrs";
+      break;
+  }
+  strings::StrAppend(&rv, "\ncollective_order: ", collective_order_str);
   return rv;
 }
 
diff --git a/tensorflow/core/common_runtime/build_graph_options.h b/tensorflow/core/common_runtime/build_graph_options.h
index 3d0f242ea5177fd5a99a925f998ec5252a313327..24b71cc741df325617b0c129b4b592c28fcc57cd 100644
--- a/tensorflow/core/common_runtime/build_graph_options.h
+++ b/tensorflow/core/common_runtime/build_graph_options.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <vector>
 
+#include "tensorflow/core/graph/collective_order.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 
@@ -34,6 +35,11 @@ struct BuildGraphOptions {
   static const int64 kNoCollectiveGraphKey = 0;
   int64 collective_graph_key = kNoCollectiveGraphKey;
 
+  // If not `kNone`, order all CollectiveReduce operations statically and
+  // deterministically.  If `kEdges`, encode dependencies as explicit control
+  // edges, if `kAttrs` encode as attribute on collective op.
+  GraphCollectiveOrder collective_order = GraphCollectiveOrder::kNone;
+
   string DebugString() const;
 };
 
diff --git a/tensorflow/core/common_runtime/collective_executor_mgr_test.cc b/tensorflow/core/common_runtime/collective_executor_mgr_test.cc
index f3d86aa633938042b862613162d1c2a94b0fe35a..3eef5ed0a0c5984474c4d75ae417c030d269290d 100644
--- a/tensorflow/core/common_runtime/collective_executor_mgr_test.cc
+++ b/tensorflow/core/common_runtime/collective_executor_mgr_test.cc
@@ -44,7 +44,7 @@ class CollectiveExecutorMgrTest : public ::testing::Test {
     std::unique_ptr<DeviceResolverInterface> drl(
         new DeviceResolverLocal(device_mgr_.get()));
     std::unique_ptr<ParamResolverInterface> prl(
-        new CollectiveParamResolverLocal(device_mgr_.get(), drl.get(),
+        new CollectiveParamResolverLocal(cp, device_mgr_.get(), drl.get(),
                                          task_name));
     cme_.reset(new CollectiveExecutorMgr(cp, device_mgr_.get(), std::move(drl),
                                          std::move(prl)));
@@ -73,11 +73,11 @@ TEST_F(CollectiveExecutorMgrTest, StepSequenceRelated) {
   EXPECT_EQ(CollectiveExecutor::kInvalidId, cme_->NextStepId(123));
   Notification ss_note;
   Status ss_status;
-  cme_->RefreshStepIdSequenceAsync(
-      123, [this, &ss_status, &ss_note](const Status& s) {
-        ss_status = s;
-        ss_note.Notify();
-      });
+  cme_->RefreshStepIdSequenceAsync(123,
+                                   [&ss_status, &ss_note](const Status& s) {
+                                     ss_status = s;
+                                     ss_note.Notify();
+                                   });
   ss_note.WaitForNotification();
   EXPECT_FALSE(ss_status.ok());
   EXPECT_EQ(ss_status.error_message(),
@@ -87,7 +87,7 @@ TEST_F(CollectiveExecutorMgrTest, StepSequenceRelated) {
   GetStepSequenceRequest* req = nullptr;
   GetStepSequenceResponse* resp = nullptr;
   cme_->GetStepSequenceAsync(req, resp,
-                             [this, &gs_status, &gs_note](const Status& s) {
+                             [&gs_status, &gs_note](const Status& s) {
                                gs_status = s;
                                gs_note.Notify();
                              });
diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local.cc b/tensorflow/core/common_runtime/collective_param_resolver_local.cc
index a8e3f4c881afc9c37ce4b5196c32ec591be5506d..a76708385be59a2c6cec556d6ab1124b9c2bf541 100644
--- a/tensorflow/core/common_runtime/collective_param_resolver_local.cc
+++ b/tensorflow/core/common_runtime/collective_param_resolver_local.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/device_name_utils.h"
@@ -37,9 +38,12 @@ void CollectiveParamResolverLocal::InstanceRec::WaitForOutMu(mutex_lock& lock) {
 }
 
 CollectiveParamResolverLocal::CollectiveParamResolverLocal(
-    const DeviceMgr* dev_mgr, DeviceResolverInterface* dev_resolver,
-    const string& task_name)
-    : dev_mgr_(dev_mgr), dev_resolver_(dev_resolver), task_name_(task_name) {}
+    const ConfigProto& config, const DeviceMgr* dev_mgr,
+    DeviceResolverInterface* dev_resolver, const string& task_name)
+    : nccl_(config.experimental().collective_nccl()),
+      dev_mgr_(dev_mgr),
+      dev_resolver_(dev_resolver),
+      task_name_(task_name) {}
 
 void CollectiveParamResolverLocal::CompleteGroupAsync(
     const CompleteGroupRequest* request, CompleteGroupResponse* response,
@@ -140,7 +144,6 @@ void CollectiveParamResolverLocal::CompleteGroupLocal(
 }
 
 namespace {
-
 struct DevRec {
   string task;
   string device;
@@ -316,29 +319,28 @@ GlobalDeviceMap EstablishGlobalRank(
 // cp->same_num_devices_per_task.  Requires cp->instance.task_names
 // be sorted.
 void SetDevPerTask(CollectiveParams* cp) {
-  cp->instance.same_num_devices_per_task = false;
-  if (cp->instance.task_names.empty()) return;
-  int dev_per_task = -1;
-  int count = 0;
+  cp->instance.num_devices_per_task.clear();
   const string* last_task_name = &cp->instance.task_names[0];
+  int count = 0;
   for (const string& task_name : cp->instance.task_names) {
-    if (task_name != *last_task_name) {
-      CHECK_GT(count, 0);
-      if (dev_per_task < 0) {
-        dev_per_task = count;
-      } else {
-        CHECK_GT(dev_per_task, 0);
-        if (count != dev_per_task) return;
-      }
+    if (task_name == *last_task_name) {
+      ++count;
+    } else {
+      cp->instance.num_devices_per_task[*last_task_name] = count;
       count = 1;
       last_task_name = &task_name;
-    } else {
-      ++count;
     }
   }
-  CHECK_GT(count, 0);
-  if ((dev_per_task > 0) && (count != dev_per_task)) {
-    return;
+  cp->instance.num_devices_per_task[*last_task_name] = count;
+
+  cp->instance.same_num_devices_per_task = false;
+  int dev_per_task = -1;
+  for (const auto& task_dev : cp->instance.num_devices_per_task) {
+    if (dev_per_task == -1) {
+      dev_per_task = task_dev.second;
+    } else if (dev_per_task != task_dev.second) {
+      return;
+    }
   }
   cp->instance.same_num_devices_per_task = true;
   CHECK_EQ((cp->group.group_size % cp->group.num_tasks), 0);
@@ -358,7 +360,7 @@ void SortDevicesAndTasks(CollectiveParams* cp) {
   for (int i = 0; i < perm.size(); ++i) {
     perm[i] = i;
   }
-  std::sort(perm.begin(), perm.end(), [cp](const int& a, const int& b) {
+  std::sort(perm.begin(), perm.end(), [cp](int a, int b) {
     return cp->instance.device_names[a] < cp->instance.device_names[b];
   });
   std::vector<string> new_devs;
@@ -398,7 +400,6 @@ void CollectiveParamResolverLocal::SetDefaultRank(const string& device,
 void CollectiveParamResolverLocal::InitInstanceSharedParams(
     const GroupRec* gr, const CollectiveParams* cp, InstanceRec* ir,
     const StatusCallback& done) {
-  VLOG(1) << "InitInstanceSharedParams " << ir;
   ir->shared.instance = cp->instance;
   {
     mutex_lock gl(gr->mu);
@@ -412,8 +413,8 @@ void CollectiveParamResolverLocal::InitInstanceSharedParams(
   }
   ir->shared.default_rank = -1;
 
-  // Sort devce_names lexicographcally, keeping task_names in
-  // corresponding order.
+  // Sort device_names lexicographically, keeping task_names in corresponding
+  // order.  Also set number of devices per task.
   SortDevicesAndTasks(&ir->shared);
 
   // Get Locality data for all devices.
@@ -583,7 +584,7 @@ void CollectiveParamResolverLocal::CallInitInstanceSharedParams(
 void CollectiveParamResolverLocal::CompleteParamsAsync(
     const string& device, CollectiveParams* cp, CancellationManager* cancel_mgr,
     const StatusCallback& done) {
-  VLOG(1) << "CompleteParams " << device << " for " << cp << ": "
+  VLOG(1) << "CompleteParams local " << device << " for " << cp << ": "
           << cp->ToString();
   CompleteGroupLocal(
       device, cp,
@@ -605,6 +606,27 @@ void CollectiveParamResolverLocal::CompleteInstanceAsync(
                        "intended only for non-distributed deployment."));
 }
 
+// TODO(b/111897089): we need a better way to pick the collective
+// implementation.  The ideal way would depend upon the topology and link
+// strength before picking a particular implementation.
+void CollectiveParamResolverLocal::AssignCollectiveType(CollectiveParams* cp) {
+  if (cp->instance.type == BROADCAST_COLLECTIVE) {
+    cp->instance.impl_details.collective_name = "HierarchicalTreeBroadcast";
+  } else if (cp->instance.type == REDUCTION_COLLECTIVE) {
+    if (nccl_) {
+      cp->instance.impl_details.collective_name = "NcclReduce";
+    } else {
+      cp->instance.impl_details.collective_name = "RingReduce";
+    }
+  } else if (cp->instance.type == GATHER_COLLECTIVE) {
+    cp->instance.impl_details.collective_name = "RingGather";
+  } else {
+    cp->instance.impl_details.collective_name = "undef";
+  }
+  VLOG(1) << "AssignCollectiveType "
+          << cp->instance.impl_details.collective_name;
+}
+
 void CollectiveParamResolverLocal::CompleteInstanceLocal(
     const string& device, const GroupRec* gr, CollectiveParams* cp,
     bool is_source, const StatusCallback& done) {
@@ -641,48 +663,57 @@ void CollectiveParamResolverLocal::CompleteInstanceFromInitializedIRec(
     // custom operator= does a deep copy.
     cp->instance = ir->shared.instance;
   }
-  // Populate the fields common across task, also default_rank.
+  // Populate the fields common across task.
+  AssignCollectiveType(cp);
   SetDefaultRank(device, cp);
   CompleteTaskIsLocal(task_name_, cp);
-  // TODO(b/113171733): we need a better way to pick the collective
-  // implementation.  The ideal way would depend upon the topology and link
-  // strength before picking a particular implementation.
-  cp->instance.impl_details.collective_name =
-      (cp->instance.type == BROADCAST_COLLECTIVE) ? "HierarchicalTreeBroadcast"
-                                                  : "RingReduce";
+
   CollectiveImplementationInterface* col_impl;
-  Status lookup_status = CollectiveRegistry::LookupParamResolverInstance(
+  Status status = CollectiveRegistry::LookupParamResolverInstance(
       cp->instance.impl_details.collective_name, &col_impl);
-  if (!lookup_status.ok()) {
-    done(lookup_status);
+  if (status.ok()) {
+    status = col_impl->InitializeInstanceBeforeGroupDiscovery(cp);
+  }
+  if (!status.ok()) {
+    done(status);
     return;
   }
-  // If broadcast, may need to wait for source discovery.
-  if (cp->instance.type == BROADCAST_COLLECTIVE) {
-    CompleteInstanceSource(ir, cp, is_source,
-                           [col_impl, ir, device, cp, done](InstanceRec* irec) {
-                             CHECK_EQ(ir, irec);
-                             Status s;
-                             {
-                               mutex_lock l(irec->out_mu);
-                               irec->WaitForOutMu(l);
-                               s = irec->status;
-                               cp->source_rank = irec->source_rank;
-                             }
-                             if (s.ok()) {
-                               s = col_impl->InitializeCollectiveParams(cp);
-                             }
-                             done(s);
-                           });
+
+  //  We may need to wait for the group if:
+  //  * this is a broadcast, for source discovery;
+  //  * we are using NCCL with more than 1 worker, for the communicator key from
+  //    rank 0.
+  bool broadcast = cp->instance.type == BROADCAST_COLLECTIVE;
+  bool nccl = cp->instance.type == REDUCTION_COLLECTIVE &&
+              cp->instance.impl_details.collective_name == "NcclReduce" &&
+              cp->group.num_tasks > 1;
+  if (broadcast || nccl) {
+    WaitForGroup(ir, cp, is_source, broadcast, nccl,
+                 [col_impl, ir, device, cp, done](InstanceRec* irec) {
+                   Status s;
+                   if (ir != irec) {
+                     s = errors::Internal("Expected ir ", ir, " and irec ",
+                                          irec, " to be equal");
+                   } else {
+                     mutex_lock l(irec->out_mu);
+                     irec->WaitForOutMu(l);
+                     s = irec->status;
+                     cp->source_rank = irec->source_rank;
+                     cp->instance.communicator_key = irec->communicator_key;
+                   }
+                   if (s.ok()) {
+                     s = col_impl->InitializeCollectiveParams(cp);
+                   }
+                   done(s);
+                 });
   } else {
     done(col_impl->InitializeCollectiveParams(cp));
   }
 }
 
-void CollectiveParamResolverLocal::CompleteInstanceSource(InstanceRec* ir,
-                                                          CollectiveParams* cp,
-                                                          bool is_source,
-                                                          const IRConsumer& f) {
+void CollectiveParamResolverLocal::WaitForGroup(
+    InstanceRec* ir, CollectiveParams* cp, bool is_source, bool init_source,
+    bool init_nccl, const IRConsumer& f) {
   std::vector<IRConsumer> ready_waiters;
   {
     mutex_lock l(ir->out_mu);
@@ -692,7 +723,8 @@ void CollectiveParamResolverLocal::CompleteInstanceSource(InstanceRec* ir,
     if (!ir->known[cp->default_rank]) {
       ir->known[cp->default_rank] = true;
       ++ir->known_count;
-      if (is_source) {
+      if (init_source && is_source) {
+        // Initialize source rank.
         if (ir->source_rank >= 0) {
           ir->status = errors::Internal("Instance ", cp->instance.instance_key,
                                         " already has source ", ir->source_rank,
@@ -702,13 +734,26 @@ void CollectiveParamResolverLocal::CompleteInstanceSource(InstanceRec* ir,
           ir->source_rank = cp->default_rank;
         }
       }
+      if (init_nccl && cp->default_rank == 0) {
+        // Initialize communicator key.
+        if (!ir->communicator_key.empty()) {
+          ir->status =
+              errors::Internal("Instance ", cp->instance.instance_key,
+                               " already has communicator_key ",
+                               str_util::CEscape(ir->communicator_key),
+                               ", received second claim from device ",
+                               cp->instance.device_names[cp->default_rank]);
+        } else {
+          ir->communicator_key = cp->instance.communicator_key;
+        }
+      }
     }
     if (ir->known_count < ir->shared.group.group_size) {
       ir->known_waiters.push_back(f);
       return;
     }
     CHECK_EQ(ir->known_count, ir->shared.group.group_size);
-    if (ir->source_rank < 0) {
+    if (init_source && ir->source_rank < 0) {
       // NOTE(ayushd): changing the error message below would also require
       // updating CompleteParamsBroadcastForgotSend test in
       // CollectiveParamResolverLocalTest.
@@ -718,6 +763,13 @@ void CollectiveParamResolverLocal::CompleteInstanceSource(InstanceRec* ir,
                            "could mean that there were group_size=",
                            ir->known_count, " BcastRecvs but no BcastSend.");
     }
+    if (init_nccl && ir->communicator_key.empty()) {
+      ir->status = errors::Internal(
+          "Instance ", cp->instance.instance_key, " device ",
+          cp->instance.device_names[cp->default_rank],
+          " did not find rank 0 for setting communicator key.  This is an "
+          "internal error in collective param resolution");
+    }
     if (!ir->known_waiters.empty()) {
       ready_waiters = std::move(ir->known_waiters);
     }
diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local.h b/tensorflow/core/common_runtime/collective_param_resolver_local.h
index 365bddc787a7ba3d97f2df29b4ebd2a3c7118ef7..08e2f338f3c642cdfa6cd2df824cb1177c4b4911 100644
--- a/tensorflow/core/common_runtime/collective_param_resolver_local.h
+++ b/tensorflow/core/common_runtime/collective_param_resolver_local.h
@@ -23,6 +23,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/collective.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/protobuf/config.pb.h"
 
 namespace tensorflow {
 class CompleteGroupRequest;
@@ -36,7 +37,8 @@ class DeviceMgr;
 // group leader for param resolution in a multi-task context.
 class CollectiveParamResolverLocal : public ParamResolverInterface {
  public:
-  CollectiveParamResolverLocal(const DeviceMgr* dev_mgr,
+  CollectiveParamResolverLocal(const ConfigProto& config,
+                               const DeviceMgr* dev_mgr,
                                DeviceResolverInterface* dev_resolver,
                                const string& task_name);
 
@@ -130,8 +132,10 @@ class CollectiveParamResolverLocal : public ParamResolverInterface {
     Status status GUARDED_BY(out_mu);
 
     // These fields are used to count the instances that have called
-    // in and become known while resolving broadcast source identity.
+    // in and become known while resolving broadcast source identity and
+    // communicator key.
     int source_rank GUARDED_BY(out_mu);
+    string communicator_key GUARDED_BY(out_mu);
     int known_count GUARDED_BY(out_mu);
     std::vector<bool> known GUARDED_BY(out_mu);
     std::vector<IRConsumer> known_waiters GUARDED_BY(out_mu);
@@ -197,10 +201,10 @@ class CollectiveParamResolverLocal : public ParamResolverInterface {
                                            const StatusCallback& done)
       LOCKS_EXCLUDED(ir->out_mu);
 
-  // Complete source data for a broadcast instance.
+  // Complete source data and/or nccl communicator key.
   // Precondition: *cp has complete group data and default_rank.
-  void CompleteInstanceSource(InstanceRec* ir, CollectiveParams* cp,
-                              bool is_source, const IRConsumer& f)
+  void WaitForGroup(InstanceRec* ir, CollectiveParams* cp, bool is_source,
+                    bool init_source, bool init_nccl, const IRConsumer& f)
       LOCKS_EXCLUDED(ir->out_mu);
 
   // If cp.device_names contains only devices local to this process
@@ -216,10 +220,15 @@ class CollectiveParamResolverLocal : public ParamResolverInterface {
   // current ordering of cp->instance.device_names.
   void SetDefaultRank(const string& device, CollectiveParams* cp);
 
+  // Sets cp->instance.type based on collective op type, and attempts to assign
+  // best implementation.
+  void AssignCollectiveType(CollectiveParams* cp);
+
   // Helper to grab status under lock, invoke callback out of lock.
   void CallbackWithStatus(const InstanceRecCallback& done, InstanceRec* irec)
       LOCKS_EXCLUDED(irec->out_mu);
 
+  const bool nccl_;
   const DeviceMgr* dev_mgr_;
   DeviceResolverInterface* dev_resolver_;  // Not owned.
   string task_name_;
diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc b/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc
index 94d889c40dff89204ccfc43478f8732815a4ead4..70eb9f8081aedfde33e3eb67b478c72ca2dee72f 100644
--- a/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc
+++ b/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc
@@ -41,8 +41,8 @@ class CollectiveParamResolverLocalTest : public ::testing::Test {
     TF_CHECK_OK(DeviceFactory::AddDevices(options, task_name, &devices));
     device_mgr_.reset(new DeviceMgr(std::move(devices)));
     drl_.reset(new DeviceResolverLocal(device_mgr_.get()));
-    prl_.reset(new CollectiveParamResolverLocal(device_mgr_.get(), drl_.get(),
-                                                task_name));
+    prl_.reset(new CollectiveParamResolverLocal(cp, device_mgr_.get(),
+                                                drl_.get(), task_name));
   }
 
   void RunCompleteDefaultRanking(
@@ -175,7 +175,7 @@ TEST_F(CollectiveParamResolverLocalTest, CompleteParamsReduction1Task) {
     Env::Default()->SchedClosure([this, i, cp, &note, &statuses]() {
       prl_->CompleteParamsAsync(cp->instance.device_names[0], cp,
                                 nullptr /*CancellationManager*/,
-                                [this, &statuses, &note, i](const Status& s) {
+                                [&statuses, &note, i](const Status& s) {
                                   statuses[i] = s;
                                   note[i].Notify();
                                 });
diff --git a/tensorflow/core/common_runtime/collective_rma_local_test.cc b/tensorflow/core/common_runtime/collective_rma_local_test.cc
index 4263f3a4add524bf59e7c08cfb5d927ac9e23e06..2e9d8cd394e36ed6dbbd5cb6e49687b633bf9186 100644
--- a/tensorflow/core/common_runtime/collective_rma_local_test.cc
+++ b/tensorflow/core/common_runtime/collective_rma_local_test.cc
@@ -46,8 +46,8 @@ class CollectiveRemoteAccessLocalTest : public ::testing::Test {
     TF_CHECK_OK(DeviceFactory::AddDevices(options, kTaskName, &devices));
     device_mgr_.reset(new DeviceMgr(std::move(devices)));
     drl_.reset(new DeviceResolverLocal(device_mgr_.get()));
-    prl_.reset(new CollectiveParamResolverLocal(device_mgr_.get(), drl_.get(),
-                                                kTaskName));
+    prl_.reset(new CollectiveParamResolverLocal(cp, device_mgr_.get(),
+                                                drl_.get(), kTaskName));
     rma_.reset(new CollectiveRemoteAccessLocal(device_mgr_.get(), drl_.get(),
                                                kStepId));
   }
@@ -70,7 +70,7 @@ TEST_F(CollectiveRemoteAccessLocalTest, PostRecvCPU0) {
                      "key_0", cpu0 /*to_device*/, nullptr /*to_device_ctx*/,
                      attr /*to_alloc_attr*/, &sink_tensor, dev_locality,
                      0 /*stream_index*/,
-                     [this, &recv_note, &recv_status](const Status& s) {
+                     [&recv_note, &recv_status](const Status& s) {
                        recv_status = s;
                        recv_note.Notify();
                      });
@@ -85,7 +85,7 @@ TEST_F(CollectiveRemoteAccessLocalTest, PostRecvCPU0) {
   rma_->PostToPeer(kTaskName + "/device:CPU:0", kTaskName, "key_0",
                    cpu0 /*from_device*/, nullptr /*from_device_ctx*/,
                    attr /*to_alloc_attr*/, &source_tensor, dev_locality,
-                   [this, &send_note, &send_status](const Status& s) {
+                   [&send_note, &send_status](const Status& s) {
                      send_status = s;
                      send_note.Notify();
                    });
@@ -113,7 +113,7 @@ TEST_F(CollectiveRemoteAccessLocalTest, PostRecvCPU1_2) {
                      "key_0", cpu2 /*to_device*/, nullptr /*to_device_ctx*/,
                      attr /*to_alloc_attr*/, &sink_tensor, dev_locality,
                      0 /*stream_index*/,
-                     [this, &recv_note, &recv_status](const Status& s) {
+                     [&recv_note, &recv_status](const Status& s) {
                        recv_status = s;
                        recv_note.Notify();
                      });
@@ -130,7 +130,7 @@ TEST_F(CollectiveRemoteAccessLocalTest, PostRecvCPU1_2) {
   rma_->PostToPeer(kTaskName + "/device:CPU:2", kTaskName, "key_0",
                    cpu1 /*from_device*/, nullptr /*from_device_ctx*/,
                    attr /*to_alloc_attr*/, &source_tensor, dev_locality,
-                   [this, &send_note, &send_status](const Status& s) {
+                   [&send_note, &send_status](const Status& s) {
                      send_status = s;
                      send_note.Notify();
                    });
diff --git a/tensorflow/core/common_runtime/collective_util.cc b/tensorflow/core/common_runtime/collective_util.cc
index 195521a0784fd43f7bcd1b98065c7fcb641d52b4..bee4a13d1826f894b6d81539d7439a37ed1a8cfa 100644
--- a/tensorflow/core/common_runtime/collective_util.cc
+++ b/tensorflow/core/common_runtime/collective_util.cc
@@ -79,5 +79,36 @@ string SubdivPermDebugString(const CollectiveParams& col_params) {
   return buf;
 }
 
+SubContext::SubContext(OpKernelContext* ctx, OpKernelContext::Params* params,
+                       OpKernel* op, Tensor* output, Tensor* input)
+    : sub_params_(*params),
+      sub_inputs_({output, input}),
+      sub_input_attr_({ctx->input_alloc_attr(0), ctx->input_alloc_attr(0)}),
+      sub_input_dc_(
+          {ctx->input_device_context(0), ctx->input_device_context(0)}) {
+  sub_params_.op_kernel = op;
+  sub_params_.inputs = &sub_inputs_;
+  sub_params_.input_alloc_attrs = &sub_input_attr_;
+  sub_params_.input_device_contexts = &sub_input_dc_;
+  sub_params_.eigen_gpu_device = nullptr;
+  sub_params_.ensure_eigen_gpu_device();
+  sub_params_.forward_from_array = &forward_from_;
+  sub_ctx_.reset(new OpKernelContext(&sub_params_, 1));
+}
+
+Status ComputeBinOp(OpKernelContext* op_ctx, OpKernelContext::Params* params,
+                    Device* device, OpKernel* op, Tensor* output,
+                    Tensor* input) {
+  // Prepare an OpKernelContext that is identical to that of the original Op
+  // (i.e. the collective), except for the input output sizes and identities and
+  // the Op itself.
+  // TODO(ayushd, tucker): Is it possible to cache and reuse these objects?
+  // They're mostly identical inside one device execution.
+  std::unique_ptr<SubContext> sub_ctx(
+      new SubContext(op_ctx, params, op, output, input));
+  device->Compute(op, sub_ctx->sub_ctx_.get());
+  return sub_ctx->sub_ctx_->status();
+}
+
 }  // namespace collective_util
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/collective_util.h b/tensorflow/core/common_runtime/collective_util.h
index ebb5731becadec3b88bea86641887c31b63ae3a5..01fb8b8c81cd2f4dc390c2b6467d7c54c7753bf0 100644
--- a/tensorflow/core/common_runtime/collective_util.h
+++ b/tensorflow/core/common_runtime/collective_util.h
@@ -32,6 +32,27 @@ Status InitializeDeviceAndLocality(const DeviceMgr* dev_mgr,
                                    DeviceLocality* device_locality);
 string SubdivPermDebugString(const CollectiveParams& col_params);
 
+// Used for executing a sub-operation, e.g. a merge_op instance, with
+// an OpKernelContext based on the one passed into this Op.
+class SubContext {
+ public:
+  OpKernelContext::Params sub_params_;
+  gtl::InlinedVector<TensorValue, 4> sub_inputs_;
+  gtl::InlinedVector<AllocatorAttributes, 4> sub_input_attr_;
+  gtl::InlinedVector<DeviceContext*, 4> sub_input_dc_;
+  // Used only for Binary and Unary Ops for which we require
+  // the calculation to be in-place on the first input.
+  int forward_from_ = 0;
+  std::unique_ptr<OpKernelContext> sub_ctx_;
+  SubContext(OpKernelContext* ctx, OpKernelContext::Params* params,
+             OpKernel* op, Tensor* output, Tensor* input);
+  ~SubContext() = default;
+};
+
+Status ComputeBinOp(OpKernelContext* op_ctx, OpKernelContext::Params* params,
+                    Device* device, OpKernel* op, Tensor* output,
+                    Tensor* input);
+
 }  // namespace collective_util
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/common_runtime/colocation_graph.cc b/tensorflow/core/common_runtime/colocation_graph.cc
new file mode 100644
index 0000000000000000000000000000000000000000..046f00ccc56020baac9a4a8cf2cb94d8125fbc8f
--- /dev/null
+++ b/tensorflow/core/common_runtime/colocation_graph.cc
@@ -0,0 +1,994 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/colocation_graph.h"
+
+#include <memory>
+#include <set>
+#include <utility>
+#include <vector>
+
+#include "absl/strings/str_join.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_set.h"
+#include "tensorflow/core/framework/attr_value_util.h"
+#include "tensorflow/core/framework/device_attributes.pb.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/util/device_name_utils.h"
+#include "tensorflow/core/util/dump_graph.h"
+#include "tensorflow/core/util/port.h"
+
+namespace tensorflow {
+
+namespace {
+
+// We hoist the conversion from C-style string literal to StringPiece here,
+// so that we can avoid the many repeated calls to strlen().
+const StringPiece kColocationAttrNameStringPiece(kColocationAttrName);
+const StringPiece kColocationGroupPrefixStringPiece(kColocationGroupPrefix);
+
+// Returns a list of devices having type in supported_device_types.  The
+// returned list is sorted by preferred type (higher numeric type is preferred).
+std::vector<Device*> FilterSupportedDevices(
+    const std::vector<Device*>& devices,
+    const PrioritizedDeviceTypeVector& supported_device_types,
+    const Device* default_device) {
+  Device* filtered_default_device = nullptr;
+  std::vector<std::pair<Device*, int32>> prioritized_filtered_devices;
+  for (const auto& supported_device_type : supported_device_types) {
+    for (Device* device : devices) {
+      if (DeviceType(device->attributes().device_type()) ==
+          supported_device_type.first) {
+        if (device == default_device) {
+          filtered_default_device = device;
+        } else {
+          prioritized_filtered_devices.emplace_back(
+              device, supported_device_type.second);
+        }
+      }
+    }
+  }
+
+  auto device_sort = [](const std::pair<Device*, int32>& a,
+                        const std::pair<Device*, int32>& b) {
+    if (a.second != b.second) {
+      return a.second > b.second;
+    }
+
+    auto a_priority =
+        DeviceSet::DeviceTypeOrder(DeviceType(a.first->device_type()));
+    auto b_priority =
+        DeviceSet::DeviceTypeOrder(DeviceType(b.first->device_type()));
+    // First sort by prioritized device type (higher is preferred) and
+    // then by device name (lexicographically).
+    if (a_priority != b_priority) {
+      return a_priority > b_priority;
+    }
+    return StringPiece(a.first->name()) < StringPiece(b.first->name());
+  };
+  std::sort(prioritized_filtered_devices.begin(),
+            prioritized_filtered_devices.end(), device_sort);
+
+  std::vector<Device*> filtered_devices;
+  if (filtered_default_device != nullptr) {
+    filtered_devices.emplace_back(filtered_default_device);
+  }
+  for (const auto& prioritized_filtered_device : prioritized_filtered_devices) {
+    filtered_devices.push_back(prioritized_filtered_device.first);
+  }
+  return filtered_devices;
+}
+
+// Using absl::StrJoin with lambda does not work in tf-lite builds.
+std::vector<string> DevicesToString(const std::vector<Device*> devices) {
+  std::vector<string> v;
+  v.reserve(devices.size());
+  for (Device* d : devices) {
+    v.push_back(d->name());
+  }
+  return v;
+}
+
+// Using absl::StrJoin with lambda does not work in tf-lite builds.
+std::vector<string> DeviceTypeAndPriorityToString(
+    const PrioritizedDeviceTypeVector& devices) {
+  std::vector<string> v;
+  v.reserve(devices.size());
+  for (const std::pair<DeviceType, int32>& device_and_type : devices) {
+    v.push_back(DeviceTypeString(device_and_type.first));
+  }
+  return v;
+}
+
+// While Placer can override requested device on ops processing
+// resources, i.e. node that take (and potentially return) a resource,
+// it must not override requested device on ops generating a resource,
+// e.g. VarHandleOp, _Arg. Such ops are currently no-input, single resource/ref
+// output nodes.
+bool IsResourceGeneratorNode(const Node& node) {
+  return node.num_inputs() == 0 && node.num_outputs() == 1 &&
+         (IsRefType(node.output_type(0)) || node.output_type(0) == DT_RESOURCE);
+}
+
+bool IsExemptFromResourceInputColocation(const Node* node) {
+  // Note: Partitioned function calls, which place and partition their
+  // function bodies, are exempt from this check: they forward resource and
+  // ref inputs to operations that are appropriately placed, instead of
+  // dereferencing them.
+  const string& op_type = node->op_def().name();
+  return op_type == "PartitionedCall" || op_type == "StatefulPartitionedCall";
+}
+
+bool HasPriorities(const PrioritizedDeviceTypeVector& device_types) {
+  for (const auto& prioritized_device_type : device_types) {
+    if (prioritized_device_type.second != 0) return true;
+  }
+  return false;
+}
+
+bool ArePrioritiesSame(const PrioritizedDeviceTypeVector& a_types,
+                       const PrioritizedDeviceTypeVector& b_types) {
+  if (a_types.size() != b_types.size()) {
+    return false;
+  }
+  for (int i = 0; i < a_types.size(); ++i) {
+    if (a_types[i].first != b_types[i].first) {
+      return false;
+    }
+  }
+  return true;
+}
+
+}  // namespace
+
+Status Member::SetParentAndSupportedDevices(
+    const Node& node, const std::vector<DeviceType>& types) {
+  int id = node.id();
+  if (id < 0) {
+    return errors::Internal("Placer should not be creating a Member for node: ",
+                            node.DebugString());
+  }
+  parent_ = id;
+  return SupportedDeviceTypesForNode(types, node.def(),
+                                     &supported_device_types_);
+}
+
+Status Member::SetAssignedDeviceName(const string& device_name) {
+  if (DeviceNameUtils::HasSomeDetails(requested_device_name_)) {
+    return errors::Internal(
+        "Setting assigned device name when there is a requested device set "
+        "is unsupported");
+  }
+  if (!DeviceNameUtils::ParseFullName(device_name, &assigned_device_name_)) {
+    return errors::Internal("Malformed assigned device '", device_name, "'");
+  }
+  // Set requested device to assigned_device to maintain the invariant that
+  // requested is a specialization of assigned.
+  requested_device_name_ = assigned_device_name_;
+  return Status::OK();
+}
+
+Status Member::SetRequestedDeviceName(const Node& node) {
+  if (!DeviceNameUtils::ParseFullName(node.requested_device(),
+                                      &requested_device_name_)) {
+    return errors::InvalidArgument("Malformed device specification '",
+                                   node.requested_device(),
+                                   "' in node: ", node.DebugString());
+  }
+  if (DeviceNameUtils::HasSomeDetails(assigned_device_name_)) {
+    return errors::Internal(
+        "Setting requested device name when there is an assigned device set "
+        "is unsupported");
+  }
+  return Status::OK();
+}
+
+Status Member::EnsureCompatibilityAcrossResourceEdge(
+    const Node& src, const Member& src_root,
+    const Node& dst, /*dst_root is this*/
+    bool log_device_placement) {
+  if (!DeviceNameUtils::AreCompatibleDevNames(src_root.assigned_device_name_,
+                                              assigned_device_name_)) {
+    return errors::InvalidArgument(
+        "Cannot place the graph because a reference or resource edge "
+        "connects colocation groups with incompatible assigned devices: ",
+        DeviceNameUtils::ParsedNameToString(src_root.assigned_device_name_),
+        " vs ", DeviceNameUtils::ParsedNameToString(assigned_device_name_));
+  }
+
+  if (DeviceNameUtils::AreCompatibleDevNames(src_root.requested_device_name_,
+                                             requested_device_name_)) {
+    return Status::OK();
+  }
+
+  // If we are here, assigned devices are compatible but requested ones are
+  // not. We will be overriding the requested device for destination node, but
+  // need to preserve the invariant that it will be a specialization of
+  // the assigned device.
+  if (log_device_placement) {
+    LOG(INFO) << "Ignoring device specification "
+              << DeviceNameUtils::ParsedNameToString(requested_device_name_)
+              << " for node '" << dst.name()
+              << "' because the input edge from '" << src.name()
+              << "' is a reference connection and already has a device "
+                 "field set to "
+              << DeviceNameUtils::ParsedNameToString(
+                     src_root.requested_device_name_);
+  }
+  requested_device_name_ = src_root.requested_device_name_;
+  DeviceNameUtils::EnsureSpecification(&requested_device_name_,
+                                       assigned_device_name_);
+  return Status::OK();
+}
+
+void Member::Merge(std::vector<Member>* tree, int x_root, int y_root,
+                   Member** new_root, Member** old_root, bool dry_run) {
+  Member& x_root_member = (*tree)[x_root];
+  Member& y_root_member = (*tree)[y_root];
+
+  // Merge the sets by setting the parent pointer of the smaller tree's root
+  // node to point to the root of the larger tree. Together with path
+  // compression in ColocationGraph::FindRoot, this ensures that we do not
+  // experience pathological performance on graphs such as chains.
+  int new_root_id, old_root_id;
+  if (x_root_member.rank_ < y_root_member.rank_) {
+    // The tree rooted at x_root is shallower, so connect it to
+    // y_root. The rank of y_root is unchanged because its new
+    // child has strictly less rank.
+    if (!dry_run) {
+      x_root_member.parent_ = y_root;
+    }
+    new_root_id = y_root;
+    old_root_id = x_root;
+  } else if (x_root_member.rank_ > y_root_member.rank_) {
+    // The tree rooted at y_root is shallower, so connect it to
+    // x_root. The rank of x_root is unchanged because its new
+    // child has strictly less rank.
+    if (!dry_run) {
+      y_root_member.parent_ = x_root;
+    }
+    new_root_id = x_root;
+    old_root_id = y_root;
+  } else {
+    if (!dry_run) {
+      // Both trees have the same rank, so break the tie by choosing
+      // x_root as the new root.
+      y_root_member.parent_ = x_root;
+      // Increment the rank of the tree rooted at x_root, because it
+      // is now strictly deeper than before.
+      ++x_root_member.rank_;
+    }
+    new_root_id = x_root;
+    old_root_id = y_root;
+  }
+
+  *new_root = &(*tree)[new_root_id];
+  *old_root = &(*tree)[old_root_id];
+}
+
+// tree is non-const because we can change some `parent` pointers in some
+// members for more efficient future lookups. The vector itself is not
+// changed.
+int Member::FindRoot(std::vector<Member>* tree, int node_id) {
+  Member& member = (*tree)[node_id];
+  if (member.parent_ == node_id) {
+    // member.parent is the root of this disjoint tree.  Do nothing.
+  } else {
+    member.parent_ = FindRoot(tree, member.parent_);
+  }
+  // Now it is guaranteed that member.parent is the root of this disjoint
+  // tree.
+  return member.parent_;
+}
+
+Status Member::MergeDeviceNames(const Member& other,
+                                bool allow_soft_placement) {
+  // Assuming the "requested is a specialization of assigned" invariant holds
+  // for this and `other`, it will hold after the two merges below.
+  DeviceNameUtils::ParsedName assigned_device_name_copy = assigned_device_name_;
+  TF_RETURN_IF_ERROR(DeviceNameUtils::MergeDevNames(
+      &assigned_device_name_copy, other.assigned_device_name_));
+
+  DeviceNameUtils::ParsedName requested_device_name_copy =
+      requested_device_name_;
+  TF_RETURN_IF_ERROR(DeviceNameUtils::MergeDevNames(
+      &requested_device_name_copy, other.requested_device_name_,
+      allow_soft_placement));
+
+  // We checked for all errors, now change the devices.
+  assigned_device_name_ = assigned_device_name_copy;
+  requested_device_name_ = requested_device_name_copy;
+  return Status::OK();
+}
+
+// Updates this to contain the intersection of the device types in
+// this and "other".
+bool Member::MergeSupportedDevices(const Member& other) {
+  // Generate intersection with priorities.
+  // Each vector contains the same device types but with different priorities.
+  // The priorities are taken from the corresponding source vector.
+  PrioritizedDeviceTypeVector target_intersection;
+  PrioritizedDeviceTypeVector other_intersection;
+  for (const auto& prioritized_device_type : supported_device_types_) {
+    bool found = false;
+    for (const auto& other_prioritized_device_type :
+         other.supported_device_types_) {
+      if (prioritized_device_type.first ==
+          other_prioritized_device_type.first) {
+        found = true;
+        other_intersection.push_back(other_prioritized_device_type);
+        break;
+      }
+    }
+    if (found) {
+      target_intersection.push_back(prioritized_device_type);
+    }
+  }
+
+  // Sort the devices by priority order.
+  auto device_sort = [](const std::pair<DeviceType, int32>& a,
+                        const std::pair<DeviceType, int32>& b) {
+    // First look at set priorities.
+    if (a.second != b.second) {
+      return a.second > b.second;
+    }
+    // Then fallback to default priorities.
+    auto a_priority = DeviceSet::DeviceTypeOrder(a.first);
+    auto b_priority = DeviceSet::DeviceTypeOrder(b.first);
+    if (a_priority != b_priority) {
+      return a_priority > b_priority;
+    }
+    // Finally just look at the Device type strings.
+    return a.first.type_string() < b.first.type_string();
+  };
+
+  std::sort(target_intersection.begin(), target_intersection.end(),
+            device_sort);
+  std::sort(other_intersection.begin(), other_intersection.end(), device_sort);
+
+  PrioritizedDeviceTypeVector result;
+
+  bool is_target_prioritized = HasPriorities(target_intersection);
+  bool is_other_prioritized = HasPriorities(other_intersection);
+  if (!is_target_prioritized && !is_other_prioritized) {
+    // If neither are prioritized then we just return the original i.e. target
+    // prioritization.
+    result = target_intersection;
+  } else if (is_target_prioritized && !is_other_prioritized) {
+    // If only one is prioritized, then we respect priorities of that in the
+    // intersection.
+    result = target_intersection;
+  } else if (!is_target_prioritized && is_other_prioritized) {
+    result = other_intersection;
+  } else {
+    // If both have priorities and agree then we go with that. If the
+    // prioritization order is different, then we just fallback to the default
+    // i.e. what the DeviceTypeOrder suggests. In that case, we also set the
+    // merged priorities to 0, so that downstream merges work correctly as well.
+    if (ArePrioritiesSame(target_intersection, other_intersection)) {
+      result = target_intersection;
+    } else {
+      for (const auto& prioritized_device : target_intersection) {
+        result.push_back(std::make_pair(prioritized_device.first, 0));
+      }
+      std::sort(result.begin(), result.end(), device_sort);
+    }
+  }
+
+  if (result.empty()) {
+    return false;
+  }
+  supported_device_types_ = result;
+  return true;
+}
+
+Status Member::AssignDevice(const Node& node, bool allow_soft_placement) {
+  if (node.assigned_device_name_index() == assigned_device_name_index_) {
+    return Status::OK();
+  }
+
+  DeviceNameUtils::ParsedName parsed;
+  DeviceNameUtils::ParseFullName(node.assigned_device_name(), &parsed);
+  Status s = DeviceNameUtils::MergeDevNames(&assigned_device_name_, parsed,
+                                            allow_soft_placement);
+  if (!s.ok()) {
+    return errors::Internal(
+        "Constraining by assigned device should not cause an error. Original "
+        "root's assigned device name: ",
+        DeviceNameUtils::ParsedNameToString(assigned_device_name_),
+        " node's assigned device name \"", node.assigned_device_name(),
+        ". Error: ", s.error_message());
+  }
+  s = DeviceNameUtils::MergeDevNames(&requested_device_name_, parsed,
+                                     allow_soft_placement);
+  if (!s.ok()) {
+    return errors::Internal(
+        "Constraining by assigned device should not cause an error. Original "
+        "root's requested device name: \"",
+        DeviceNameUtils::ParsedNameToString(requested_device_name_),
+        "\", node's assigned device name \"", node.assigned_device_name(),
+        "\". Error: ", s.error_message());
+  }
+
+  assigned_device_name_index_ = node.assigned_device_name_index();
+  // Clear cached possible_devices, if any.
+  possible_devices_.clear();
+  return Status::OK();
+}
+string Member::DebugString() {
+  return absl::StrCat(
+      "Member(assigned_device_name_index_=", assigned_device_name_index_,
+      " requested_device_name_=",
+      DeviceNameUtils::ParsedNameToString(requested_device_name_),
+      " assigned_device_name_=",
+      DeviceNameUtils::ParsedNameToString(assigned_device_name_),
+      " supported_device_types_=[",
+      absl::StrJoin(DeviceTypeAndPriorityToString(supported_device_types_),
+                    ", "),
+      "] possible_devices_=[",
+      absl::StrJoin(DevicesToString(possible_devices_), ", "), "]");
+}
+ColocationGraph::ColocationGraph(const Graph* graph,
+                                 const DeviceSet* device_set,
+                                 const Device* default_device,
+                                 bool allow_soft_placement,
+                                 bool log_device_placement)
+    : graph_(graph),
+      device_set_(device_set),
+      device_types_(device_set->PrioritizedDeviceTypeList()),
+      default_device_(default_device),
+      allow_soft_placement_(allow_soft_placement),
+      log_device_placement_(log_device_placement) {
+  members_.resize(graph->num_node_ids());
+}
+
+// Adds each node of the Graph to this ColocationGraph as a singleton.
+//
+// NOTE: The implementation assumes that the ids of nodes passed to
+// this method are dense and zero-based; the memory used will be linear in
+// the largest node ID.
+// NOTE: If this method returns an error, *this is left in an undefined
+// state.
+Status ColocationGraph::ColocateAllNodes() {
+  // This maps from a colocation group identifier to the 'root' of that
+  // colocation group.  Note that the keys in this map are StringPiece; the
+  // actual strings are stored under the NodeDef.  The lifetime of this map
+  // is limited to this ColocateAllNodes() method, and no part of the
+  // NodeDef trees are changed during the lifetime of this method, so using
+  // StringPiece as a key is safe.
+  //
+  // Also, as a further optimization, we remove the "loc:@" prefix from
+  // "class" attribute values, when they are used as keys in this table.
+  // This allows us to use StringPiece values that refer to substrings of
+  // 'string' values stored in NodeDef attribute lists, as well as StringPiece
+  // values that refer to 'string' values from NodeDef::name(), without
+  // performing any string allocations.
+  std::unordered_map<StringPiece, const Node*, StringPieceHasher>
+      colocation_group_root;
+
+  for (const Node* node : graph_->op_nodes()) {
+    // When adding the node, identify whether it is part of a colocation
+    // group.
+
+    // This code is effectively the equivalent of GetNodeAttr() for a string
+    // array, but it avoids all internal allocations (the allocation of the
+    // backing store of the std::vector<string> as well as the copies of the
+    // strings within it).  Instead, we combine the query of the colocation
+    // attribute with the calls to ColocateNodeToGroup.
+    bool found_spec = false;
+    const AttrValue* attr_value =
+        node->attrs().Find(kColocationAttrNameStringPiece);
+    if (attr_value != nullptr && attr_value->has_list()) {
+      for (const string& class_spec : attr_value->list().s()) {
+        StringPiece spec(class_spec);
+        if (str_util::ConsumePrefix(&spec, kColocationGroupPrefixStringPiece)) {
+          found_spec = true;
+          TF_RETURN_IF_ERROR(
+              ColocateNodeToGroup(&colocation_group_root, node, spec));
+        }
+      }
+    }
+
+    // TODO(iga): Even when the node has a spec, we need to colocate the
+    // node to its "name group" because other nodes can still use
+    // "loc:@<this_node_name>" in their colocation specs.
+    if (!found_spec) {
+      // If the node does not specify a colocation group, then use the
+      // name of this node as the colocation group.
+      TF_RETURN_IF_ERROR(
+          ColocateNodeToGroup(&colocation_group_root, node, node->name()));
+    }
+  }
+
+  return Status::OK();
+}
+
+Status ColocationGraph::ColocateResourceOrRefEdge(Node* src, Node* dst) {
+  // Colocate `src` and `dst` to maintain the invariant that nodes
+  // connected by reference edges are colocated.
+  int src_root_id = FindRoot(src->id());
+  int dst_root_id = FindRoot(dst->id());
+  auto& src_root = members_[src_root_id];
+  auto& dst_root = members_[dst_root_id];
+
+  TF_RETURN_IF_ERROR(dst_root.EnsureCompatibilityAcrossResourceEdge(
+      *src, src_root, *dst, log_device_placement_));
+  Status status = ColocateNodes(*src, src_root_id, *dst, dst_root_id);
+  if (!status.ok()) {
+    return AttachDef(
+        errors::InvalidArgument("Nodes were connected by a "
+                                "reference connection (requiring them to "
+                                "be on the same device), but the two nodes "
+                                "were assigned two different devices: ",
+                                status.error_message()),
+        *dst);
+  }
+  return Status::OK();
+}
+
+Status ColocationGraph::ColocateResourceAndRefEdges() {
+  // Enumerate the constraint edges, and use them to update the disjoint
+  // node set.
+  // If `node` has an input edge with reference type, add an edge from the
+  // source of that edge to `node`.
+  for (const Edge* edge : graph_->edges()) {
+    if (edge->IsControlEdge()) {
+      continue;
+    }
+    Node* src = edge->src();
+    Node* dst = edge->dst();
+    DataType input_type = dst->input_type(edge->dst_input());
+    if ((input_type == DT_RESOURCE || IsRefType(input_type)) &&
+        !IsExemptFromResourceInputColocation(dst)) {
+      TF_RETURN_IF_ERROR(ColocateResourceOrRefEdge(src, dst));
+    }
+  }
+  return Status::OK();
+}
+
+Status ColocationGraph::Initialize() {
+  TF_RETURN_IF_ERROR(InitializeMembers());
+  TF_RETURN_IF_ERROR(ColocateResourceAndRefEdges());
+  TF_RETURN_IF_ERROR(ColocateAllNodes());
+  return Status::OK();
+}
+
+Status ColocationGraph::ColocateNodeToGroup(
+    std::unordered_map<StringPiece, const Node*, StringPieceHasher>*
+        colocation_group_root,
+    const Node* node, StringPiece colocation_group) {
+  const Node*& root_node = (*colocation_group_root)[colocation_group];
+  if (root_node == nullptr) {
+    // This is the first node of the colocation group, so
+    // designate this node as the 'root' of that colocation group.
+    root_node = node;
+  } else {
+    // Try to colocate the node with the root.  If there is an
+    // error, return it.
+    Status s = ColocateNodes(*node, *root_node);
+    if (!s.ok()) {
+      if (!allow_soft_placement_) {
+        return AttachDef(s, *node);
+      }
+      if (log_device_placement_) {
+        LOG(INFO) << "Ignoring request to colocate node '" << node->name()
+                  << "' with nodes in colocation group '" << colocation_group
+                  << "' because soft placement is on and an attempt at doing "
+                     "so resulted in the following error: "
+                  << AttachDef(s, *node).ToString();
+      }
+    }
+  }
+  return Status::OK();
+}
+
+// Merge the (possibly disjoint) sets containing nodes "x" and
+// "y". Returns OK if the all nodes in the union of these sets can
+// be placed on the same device type.
+//
+// NOTE: If this method returns an error, *this is left in an undefined
+// state.
+Status ColocationGraph::ColocateNodes(const Node& x, const Node& y) {
+  int x_root = FindRoot(x.id());
+  int y_root = FindRoot(y.id());
+  return ColocateNodes(x, x_root, y, y_root);
+}
+
+// This overload of ColocateNodes() allows a caller to provide the root node
+// ids for the two nodes. For large graphs, this noticeably reduces the
+// graph load time.
+Status ColocationGraph::ColocateNodes(const Node& x, int x_root, const Node& y,
+                                      int y_root) {
+  if (x_root == y_root) {
+    return Status::OK();
+  }
+
+  Member* new_root_member;
+  Member* old_root_member;
+  Member::Merge(&members_, x_root, y_root, &new_root_member, &old_root_member,
+                /*dry_run=*/true);
+
+  // Merge the partial device specifications, and ensure that they are
+  // compatible. NULL options_ is treated as allowing soft placement.
+  // If there is an error, nothing is modified.
+  // TODO(mrry): Consider enriching the error message by pointing
+  // out which nodes have the explicit partial device
+  // specifications that caused this conflict.
+  Status s = new_root_member->MergeDeviceNames(*old_root_member,
+                                               allow_soft_placement_);
+  if (!s.ok()) {
+    return errors::InvalidArgument(
+        "Cannot colocate nodes ",
+        errors::FormatColocationNodeForError(x.name()), " and ",
+        errors::FormatColocationNodeForError(y.name()), ": ",
+        s.error_message());
+  }
+
+  // Ensure that the common root has at least one supported device
+  // type, by computing the intersection of
+  // new_root_member.supported_device_types and
+  // old_root_member.supported_device_types.
+  if (!new_root_member->MergeSupportedDevices(*old_root_member)) {
+    return errors::InvalidArgument(
+        "Cannot colocate nodes ",
+        errors::FormatColocationNodeForError(x.name()), " and ",
+        errors::FormatColocationNodeForError(y.name()),
+        " because no device type supports both of those nodes and the "
+        "other nodes colocated with them.",
+        DebugInfo(x_root), DebugInfo(y_root));
+  }
+
+  // All error checks are done, merge the colocation graphs.
+  Member::Merge(&members_, x_root, y_root, &new_root_member, &old_root_member,
+                /*dry_run=*/false);
+  return Status::OK();
+}
+
+// Limits the possible devices of `node`'s colocation group to the device
+// to which `node` is assigned. This makes sure that all nodes in this
+// colocation group will be assigned to the same device. Without this
+// explicit restriction, heuristics can choose a different possible device
+// for other nodes in the group.
+Status ColocationGraph::LimitToAssignedDevice(const Node& node) {
+  if (node.assigned_device_name_index() < 0) {
+    return errors::Internal(
+        "Expected an assigned node as argument to LimitToAssignedDevice but "
+        "got: ",
+        node.DebugString());
+  }
+  int root = FindRoot(node.id());
+  Member& root_member = members_[root];
+  return root_member.AssignDevice(node, allow_soft_placement_);
+}
+
+// For the given node, subject to the constraints previously given
+// to this ColocationGraph, set its assigned_device_name. Returns OK
+// if a satisfying device can be found, otherwise an error.
+//
+// Note: This method returns a pointer to a field within members_.
+// The caller must not use the returned pointer after there is any possibility
+// that the members_[i].possible_devices field has been modified.
+Status ColocationGraph::GetDevicesForNode(
+    Node* node, const std::vector<Device*>** possible_devices) {
+  *possible_devices = nullptr;
+  const int node_root = FindRoot(node->id());
+  if (!members_[node_root].possible_devices().empty()) {
+    *possible_devices = &members_[node_root].possible_devices();
+    return Status::OK();
+  }
+
+  // We have not yet computed the possible devices for the
+  // colocated node set containing 'node', so we do so now using the
+  // constraints on the root node.
+
+  // "devices" will contain the set of feasible placements for the
+  // colocated node set containing 'node'.
+  std::vector<Device*> devices;
+  if (DeviceNameUtils::HasSomeDetails(
+          members_[node_root].requested_device_name())) {
+    // The root node has a (possibly partial) device
+    // specification, so enumerate the physical devices that
+    // conform to it.
+    device_set_->FindMatchingDevices(
+        members_[node_root].requested_device_name(), &devices);
+
+    if (!devices.empty()) {
+      // Filter devices into those that are compatible with the root
+      // node (and its children).
+      devices = FilterSupportedDevices(
+          devices, members_[node_root].supported_device_types(),
+          default_device_);
+    }
+
+    // Perform soft placement if allow_soft_placement_ is set.
+    if (devices.empty() && allow_soft_placement_) {
+      // The soft_device_name is the same as the node's device name
+      // without specifying the device type or ID.
+      DeviceNameUtils::ParsedName soft_device_name =
+          members_[node_root].requested_device_name();
+      soft_device_name.type.clear();
+      soft_device_name.has_type = false;
+      soft_device_name.has_id = false;
+      device_set_->FindMatchingDevices(soft_device_name, &devices);
+      if (!devices.empty()) {
+        devices = FilterSupportedDevices(
+            devices, members_[node_root].supported_device_types(),
+            default_device_);
+      }
+    }
+
+    if (devices.empty()) {
+      // Return an error when a physical device that matches an explicit
+      // device specification is not found. This ensures that we don't
+      // assign a node to GPU when the user wanted to force it on CPU.
+      string debug_info = DebugInfo(node_root);
+
+      DeviceNameUtils::ParsedName specified_device_name;
+      if (DeviceNameUtils::ParseFullName(node->requested_device(),
+                                         &specified_device_name) &&
+          specified_device_name ==
+              members_[node_root].requested_device_name()) {
+        // The specified device and merged set device match, and
+        // will appear in the GraphDef (for debugging), so just
+        // print the specified device.
+        std::vector<Device*> devices_matching_nodedef;
+        device_set_->FindMatchingDevices(specified_device_name,
+                                         &devices_matching_nodedef);
+        if (devices_matching_nodedef.empty()) {
+          // Sometimes it is almost impossible to understand the problem
+          // without a list of available devices.
+          std::vector<string> device_names;
+          for (const Device* device : device_set_->devices()) {
+            device_names.push_back(device->name());
+          }
+          std::sort(device_names.begin(), device_names.end());
+
+          string gpu_msg = "";
+          if (!IsGoogleCudaEnabled() &&
+              str_util::Lowercase(specified_device_name.type) == "gpu") {
+            gpu_msg =
+                " The requested device appears to be a GPU, but CUDA is not "
+                "enabled.";
+          }
+
+          return errors::InvalidArgument(
+              errors::FormatNodeNameForError(node->name()),
+              "was explicitly assigned to ", node->requested_device(),
+              " but available devices are [ ",
+              str_util::Join(device_names, ", "), " ]. Make sure ",
+              "the device specification refers to a valid device.", gpu_msg);
+        } else if (specified_device_name.has_type) {
+          return errors::InvalidArgument(
+              "Could not satisfy explicit device specification '",
+              node->requested_device(), "' because no supported kernel for ",
+              specified_device_name.type, " devices is available.", debug_info,
+              "\nRegistered kernels:\n",
+              KernelsRegisteredForOp(node->type_string()));
+        } else {
+          return errors::InvalidArgument(
+              "Could not satisfy explicit device specification '",
+              node->requested_device(), debug_info);
+        }
+      } else {
+        // The specified device may be a valid device but the
+        // merged set device is different, so print both.
+        return errors::InvalidArgument(
+            "Could not satisfy explicit device specification '",
+            node->requested_device(), "' because the node ",
+            errors::FormatColocationNodeForError(node->name()),
+            " was colocated with a group of nodes that ",
+            "required incompatible device '",
+            DeviceNameUtils::ParsedNameToString(
+                members_[node_root].requested_device_name()),
+            "'", debug_info);
+      }
+    }
+  } else {
+    // The device is completely unspecified, so enumerate the devices that
+    // support all of the nodes in the set.
+    if (device_set_->devices().empty()) {
+      return errors::Internal("No devices are registered");
+    }
+    devices = FilterSupportedDevices(
+        device_set_->devices(), members_[node_root].supported_device_types(),
+        default_device_);
+
+    if (devices.empty()) {
+      return errors::InvalidArgument(
+          "Node had no OpKernel registered to support this operation: ",
+          "Operation was ", node->type_string(), " and inputs were ",
+          DataTypeVectorString(node->input_types()), DebugInfo(node_root));
+    }
+  }
+
+  // Cache the result of the possible devices for this node group.
+  members_[node_root].set_possible_devices(std::move(devices));
+  *possible_devices = &members_[node_root].possible_devices();
+  return Status::OK();
+}
+
+Status ColocationGraph::InitializeMembers() {
+  for (Node* node : graph_->op_nodes()) {
+    Status status = InitializeMember(*node, &members_[node->id()]);
+    if (!status.ok()) {
+      return AttachDef(status, *node);
+    }
+  }
+  return Status::OK();
+}
+
+string ColocationGraph::DebugString() {
+  std::unordered_set<int> roots;
+  std::vector<string> root_strings;
+  for (const Node* node : graph_->nodes()) {
+    if (!node->IsOp()) {
+      continue;
+    }
+    int node_root = FindRoot(node->id());
+    if (roots.count(node_root) == 0) {
+      root_strings.push_back(DebugInfo(node_root));
+      roots.insert(node_root);
+    }
+  }
+  return absl::StrJoin(root_strings, "\n");
+}
+
+// Returns debugging info for the node referred to by 'node_root'.
+string ColocationGraph::DebugInfo(const int node_root) {
+  string text(
+      "\nColocation Debug Info:\n"
+      "Colocation group had the following types and devices: ");
+
+  // If this node is part of a colocation group, then we want to
+  // collect the mapping of ops to supported devices, so that
+  // the user can see why an unsatisfiable placement occurred.
+
+  std::unordered_map<string, string> type_to_devices;
+  std::vector<const Node*> colocation_nodes;
+  int num_nodes_found = 0;
+
+  for (const Node* node : graph_->nodes()) {
+    if (!node->IsOp()) {
+      continue;
+    }
+    int id = node->id();
+    if (FindRoot(id) != node_root) {
+      continue;
+    }
+    ++num_nodes_found;
+    colocation_nodes.push_back(node);
+    const string& op_type = node->type_string();
+    string devices_registered;
+    for (const auto& device_type : members_[id].supported_device_types()) {
+      strings::StrAppend(&devices_registered,
+                         DeviceTypeString(device_type.first), " ");
+    }
+
+    type_to_devices[op_type] = std::move(devices_registered);
+  }
+
+  for (const auto& td : type_to_devices) {
+    strings::StrAppend(&text, "\n", td.first, ": ", td.second);
+  }
+  strings::StrAppend(&text,
+                     "\n\nColocation members and user-requested devices:");
+  for (const Node* node : colocation_nodes) {
+    strings::StrAppend(&text, "\n  ", node->name(), " (", node->type_string(),
+                       ") ", node->requested_device());
+  }
+  strings::StrAppend(&text, "\n");
+
+  if (num_nodes_found <= 0) {
+    text.clear();
+  }
+  return text;
+}
+
+Status ColocationGraph::InitializeMemberWithAssignedDevice(
+    const string& assigned_device_name, const string& node_type,
+    bool must_be_full_name, Member* member) {
+  // This node has already been assigned to a device, so we
+  // respect this placement, after sanity-checking it.
+  // NOTE: Since any assignment must have been performed by
+  // the TensorFlow runtime, we consider errors in this branch to
+  // be INTERNAL.
+  TF_RETURN_IF_ERROR(member->SetAssignedDeviceName(assigned_device_name));
+  if (!must_be_full_name) {
+    return Status::OK();
+  }
+  // Since assigned device must be a full specification, do extra checks.
+  const Device* assigned_device =
+      device_set_->FindDeviceByName(assigned_device_name);
+  if (assigned_device == nullptr) {
+    return errors::Internal("Assigned device '", assigned_device_name,
+                            "' does not match any device");
+  }
+
+  for (const auto& d : member->supported_device_types()) {
+    if (DeviceType(assigned_device->attributes().device_type()) == d.first) {
+      return Status::OK();
+    }
+  }
+
+  return errors::Internal("Assigned device '", assigned_device_name,
+                          "' does not have registered OpKernel support "
+                          "for ",
+                          node_type);
+}
+
+Status ColocationGraph::InitializeMember(const Node& node, Member* member) {
+  TF_RETURN_IF_ERROR(member->SetParentAndSupportedDevices(node, device_types_));
+
+  if (node.has_assigned_device_name()) {
+    TF_RETURN_IF_ERROR(InitializeMemberWithAssignedDevice(
+        node.assigned_device_name(), node.type_string(), true, member));
+  } else {
+    // This node has not yet been assigned to a device, so we
+    // calculate any constraints due to the set of registered
+    // kernels and any (partial) user-provided device specification
+    // in the NodeDef.
+
+    // If no kernels are registered for this op type, fail with an error.
+    if (member->supported_device_types().empty()) {
+      std::set<string> registered_device_types;
+      for (Device* d : device_set_->devices()) {
+        registered_device_types.insert(d->device_type());
+      }
+      std::vector<string> attr_key_vals;
+      for (const auto& it : node.attrs()) {
+        const string& name = it.first;
+        const AttrValue& attr_value = it.second;
+        attr_key_vals.push_back(
+            strings::StrCat(name, "=", SummarizeAttrValue(attr_value)));
+      }
+      return errors::InvalidArgument(
+          "No OpKernel was registered to support Op '", node.type_string(),
+          "' used by ", errors::FormatNodeNameForError(node.name()),
+          "with these attrs: [", str_util::Join(attr_key_vals, ", "),
+          "]\n"
+          "Registered devices: [",
+          str_util::Join(registered_device_types, ", "), "]\n",
+          "Registered kernels:\n", KernelsRegisteredForOp(node.type_string()));
+    }
+
+    // If the NodeDef contains a device, then we interpret it as a
+    // (partial) device specification.
+    if (!node.requested_device().empty()) {
+      if (IsResourceGeneratorNode(node)) {
+        // Treat requested device on resource generating nodes as assigned
+        // device so that we don't override it.
+        TF_RETURN_IF_ERROR(InitializeMemberWithAssignedDevice(
+            node.requested_device(), node.type_string(), false, member));
+      } else {
+        // The user has specified a device in the NodeDef, try to find a
+        // valid device matching their specification in the set of
+        // devices.
+        // NOTE: The full name may specify a device that is not in
+        // n.supported_device_types(), but we check that in AssignDevice().
+        TF_RETURN_IF_ERROR(member->SetRequestedDeviceName(node));
+      }
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/colocation_graph.h b/tensorflow/core/common_runtime/colocation_graph.h
new file mode 100644
index 0000000000000000000000000000000000000000..12611496a5f53764fa13eb839753fd4289cca2d6
--- /dev/null
+++ b/tensorflow/core/common_runtime/colocation_graph.h
@@ -0,0 +1,253 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_COLOCATION_GRAPH_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_COLOCATION_GRAPH_H_
+
+#include <unordered_map>
+#include <vector>
+
+#include "absl/strings/str_join.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/util/device_name_utils.h"
+#include "tensorflow/core/util/port.h"
+
+namespace tensorflow {
+
+// Represents a node in the disjoint node forest and the
+// accumulated constraints on the device used by that node.
+class Member {
+ public:
+  Member() = default;
+
+  Status SetParentAndSupportedDevices(const Node& node,
+                                      const std::vector<DeviceType>& types);
+
+  const DeviceNameUtils::ParsedName& requested_device_name() const {
+    return requested_device_name_;
+  }
+
+  Status SetAssignedDeviceName(const string& device_name);
+
+  Status SetRequestedDeviceName(const Node& node);
+
+  Status EnsureCompatibilityAcrossResourceEdge(
+      const Node& src, const Member& src_root,
+      const Node& dst, /*dst_root is this*/
+      bool log_device_placement);
+
+  const PrioritizedDeviceTypeVector& supported_device_types() const {
+    return supported_device_types_;
+  }
+
+  // If `dry_run` is true, just sets `new_root` and `old_root` and does not
+  // actually modify anything in the `tree`.
+  static void Merge(std::vector<Member>* tree, int x_root, int y_root,
+                    Member** new_root, Member** old_root, bool dry_run);
+
+  // tree is non-const because we can change some `parent` pointers in some
+  // members for more efficient future lookups. The vector itself is not
+  // changed.
+  static int FindRoot(std::vector<Member>* tree, int node_id);
+
+  Status MergeDeviceNames(const Member& other, bool allow_soft_placement);
+
+  // Updates this to contain the intersection of the device types in
+  // this and "other". If the intersection is empty, returns false and does
+  // not update this. Else returns true and updates this.
+  bool MergeSupportedDevices(const Member& other);
+
+  Status AssignDevice(const Node& node, bool allow_soft_placement);
+
+  void set_possible_devices(std::vector<Device*>&& devices) {
+    possible_devices_ = devices;
+  }
+  const std::vector<Device*>& possible_devices() { return possible_devices_; }
+
+  string DebugString();
+
+ private:
+  // The id of the node that is the parent of this one, or its own
+  // id if it is a root. parent <= 0 indicates that this member is invalid.
+  int parent_ = -1;
+
+  // A proxy for the depth of the tree that is used to prefer
+  // connecting smaller trees to larger trees when merging disjoint
+  // sets.
+  int rank_ = 0;
+
+  // Once colocation groups have been formed, the Placer starts actually
+  // choosing devices. All nodes in a group must be assigned to the same
+  // device. Once we assigned the first device to some node in this group,
+  // we set assigned_device_name_index to this device name's index in the
+  // graph.
+  // The `*_device_name_` fields will contain the parsed name of this device
+  // and `possible_devices`, if computed, will contain just this device.
+  // `assigned_device_name_index` is an optimization to avoid parsing and
+  // comparing device names. The value of -1 signals that a single device
+  // has not been chosen yet.
+  int assigned_device_name_index_ = -1;
+
+  // The merged form of the device requested for this node, with those of all of
+  // its children. requested_device_name_ is always kept a specialization (i.e.
+  // DeviceNameUtils::IsSpecialization) of assigned_device_name_. When no device
+  // is requested, this field is set to assigned_device_name_.  As a
+  // specialization of assigned_device_name_, requested_device_name_ represents
+  // the most specific form of all assigned and requested devices of this node
+  // and its children, if this node is a root. requested_device_name_ is used
+  // to finally select devices for nodes.  We can override requested devices due
+  // to resource colocation constraints but not assigned devices (unless soft
+  // placement is on).
+  DeviceNameUtils::ParsedName requested_device_name_;
+
+  // The merged form of the device assigned for this node, with
+  // those of all of its children.
+  // This field is used to raise errors due to unsatisfiable constraints.
+  // Can be a partial specification.
+  // INVARIANT: requested_device_name_ is always a
+  // DeviceNameUtils::IsSpecialization of assigned_device_name_.
+  DeviceNameUtils::ParsedName assigned_device_name_;
+
+  // The intersection of all device types supported by this node,
+  // and those of all of its children, in priority order
+  // of the preferred device.
+  PrioritizedDeviceTypeVector supported_device_types_;
+
+  // If this node is a root, stores a list of Devices to which this node
+  // and all of its children have been assigned, or nullptr if this
+  // has not yet been computed.
+  std::vector<Device*> possible_devices_;
+};  // namespace
+
+// This class maintains the connected components of a colocation
+// constraint graph, and uses this information to assign a satisfying
+// device placement to the nodes of the graph.
+//
+// The typical usage pattern is:
+//
+//   Graph graph = ...;
+//   DeviceSet device_set = ...;
+//   ColocationGraph colocation_graph(graph, device_set);
+//
+//   // Add all the nodes of the `graph` to the `colocation_graph`.
+//   for (Node* node : graph.nodes()) {
+//     TF_RETURN_IF_ERROR(colocation_graph.AddNode(*node));
+//   }
+//
+//   // Add one or more colocation constraints.
+//   Node node_1 = *graph.FindNodeId(...);
+//   Node node_2 = *graph.FindNodeId(...);
+//   TF_RETURN_IF_ERROR(colocation_graph.ColocateNodes(node_1, node_2));
+//
+//   // Assign devices based on the accumulated constraints.
+//   for (Node* node : graph.nodes()) {
+//     TF_RETURN_IF_ERROR(colocation_graph.AssignDevice(node));
+//   }
+//
+// This implementation uses the Union-Find algorithm to efficiently maintain the
+// connected components and incrementally adds edges via
+// ColocationGraph::ColocateNodes() invocations.
+//
+// ColocationGraph does not assign any devices to graph nodes. The
+// `log_device_placement` argument is used to log messages when requested
+// device is ignored.
+class ColocationGraph {
+ public:
+  ColocationGraph(const Graph* graph, const DeviceSet* device_set,
+                  const Device* default_device, bool allow_soft_placement,
+                  bool log_device_placement);
+
+  // Adds each node of the Graph to this ColocationGraph as a singleton.
+  //
+  // NOTE: The implementation assumes that the ids of nodes passed to
+  // this method are dense and zero-based; the memory used will be linear in
+  // the largest node ID.
+  // NOTE: If this method returns an error, *this is left in an undefined
+  // state.
+  Status ColocateAllNodes();
+
+  Status ColocateResourceOrRefEdge(Node* src, Node* dst);
+
+  Status ColocateResourceAndRefEdges();
+
+  Status Initialize();
+
+  Status ColocateNodeToGroup(
+      std::unordered_map<StringPiece, const Node*, StringPieceHasher>*
+          colocation_group_root,
+      const Node* node, StringPiece colocation_group);
+
+  // Merge the (possibly disjoint) sets containing nodes "x" and
+  // "y". Returns OK if the all nodes in the union of these sets can
+  // be placed on the same device type.
+  //
+  // If this method returns an error, *this is unchanged.
+  Status ColocateNodes(const Node& x, const Node& y);
+
+  // This overload of ColocateNodes() allows a caller to provide the root node
+  // ids for the two nodes. For large graphs, this noticeably reduces the
+  // graph load time.
+  // If this method returns an error, *this is unchanged.
+  Status ColocateNodes(const Node& x, int x_root, const Node& y, int y_root);
+
+  // Limits the possible devices of `node`'s colocation group to the device
+  // to which `node` is assigned. This makes sure that all nodes in this
+  // colocation group will be assigned to the same device. Without this
+  // explicit restriction, heuristics can choose a different possible device
+  // for other nodes in the group.
+  Status LimitToAssignedDevice(const Node& node);
+
+  // For the given node, subject to the constraints previously given
+  // to this ColocationGraph, set its assigned_device_name. Returns OK
+  // if a satisfying device can be found, otherwise an error.
+  //
+  // Note: This method returns a pointer to a field within members_.
+  // The caller must not use the returned pointer after there is any possibility
+  // that the members_[i].possible_devices field has been modified.
+  Status GetDevicesForNode(Node* node,
+                           const std::vector<Device*>** possible_devices);
+
+  Status InitializeMembers();
+
+  string DebugString();
+
+  // Returns debugging info for the node referred to by 'node_root'.
+  string DebugInfo(const int node_root);
+
+  Status InitializeMemberWithAssignedDevice(const string& assigned_device_name,
+                                            const string& node_type,
+                                            bool must_be_full_name,
+                                            Member* member);
+
+  Status InitializeMember(const Node& node, Member* member);
+
+  // Returns the root node of the disjoint tree to which the node with the
+  // given id is connected.
+  int FindRoot(int node_id) { return Member::FindRoot(&members_, node_id); }
+
+  const Graph* const graph_;  // Not owned.
+  std::vector<Member> members_;
+  const DeviceSet* device_set_;  // Not owned.
+  const std::vector<DeviceType> device_types_;
+  const Device* default_device_;
+  const bool allow_soft_placement_;
+  const bool log_device_placement_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_COLOCATION_GRAPH_H_
diff --git a/tensorflow/core/common_runtime/data/BUILD b/tensorflow/core/common_runtime/data/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..124862dbb73422e7645fe460576ac35c83f018aa
--- /dev/null
+++ b/tensorflow/core/common_runtime/data/BUILD
@@ -0,0 +1,35 @@
+package(
+    default_visibility = [
+        "//tensorflow:internal",
+        "//tensorflow_models:__subpackages__",
+    ],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_protos_all")
+
+cc_library(
+    name = "standalone",
+    srcs = ["standalone.cc"],
+    hdrs = ["standalone.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:session_options",
+    ],
+)
+
+tf_cc_test(
+    name = "standalone_test",
+    srcs = ["standalone_test.cc"],
+    deps = [
+        ":standalone",
+        "//tensorflow/core:all_kernels",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ] + tf_protos_all(),
+)
diff --git a/tensorflow/core/common_runtime/data/standalone.cc b/tensorflow/core/common_runtime/data/standalone.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b05bff566f538970fa857a8a38888cd074a06c2f
--- /dev/null
+++ b/tensorflow/core/common_runtime/data/standalone.cc
@@ -0,0 +1,128 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/data/standalone.h"
+
+#include <memory>
+
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/graph_runner.h"
+#include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/public/version.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+namespace data {
+namespace standalone {
+
+Status Iterator::GetNext(std::vector<Tensor>* outputs, bool* end_of_input) {
+  return iterator_->GetNext(ctx_.get(), outputs, end_of_input);
+}
+
+Iterator::Iterator(IteratorBase* iterator, IteratorContext* ctx)
+    : iterator_(iterator), ctx_(ctx) {}
+
+Status Dataset::FromGraph(Params params, const GraphDef& graph_def,
+                          const string& fetch_node,
+                          std::unique_ptr<Dataset>* result) {
+  Graph graph(OpRegistry::Global());
+  TF_RETURN_IF_ERROR(ImportGraphDef({}, graph_def, &graph, nullptr));
+
+  // Instantiate enough of the TensorFlow runtime to run `graph` on a single CPU
+  // device.
+  std::unique_ptr<DeviceMgr> device_mgr = MakeUnique<DeviceMgr>(
+      DeviceFactory::NewDevice("CPU", params.session_options, ""));
+  Device* device = device_mgr->ListDevices()[0];
+  // Clone the `FunctionLibraryDefinition` to extend its lifetime extends beyond
+  // the lifetime of `graph`.
+  std::unique_ptr<FunctionLibraryDefinition> flib_def =
+      MakeUnique<FunctionLibraryDefinition>(graph.flib_def());
+  std::unique_ptr<ProcessFunctionLibraryRuntime> pflr =
+      MakeUnique<ProcessFunctionLibraryRuntime>(
+          device_mgr.get(), Env::Default(), TF_GRAPH_DEF_VERSION,
+          flib_def.get(), OptimizerOptions{}, nullptr /* parent */);
+
+  // Run graph up to `output_node` and extract the `DatasetBase` stored in the
+  // DT_VARIANT output tensor.
+  data::DatasetBase* dataset;
+  {
+    std::vector<Tensor> outputs;
+    GraphRunner graph_runner(device);
+    TF_RETURN_IF_ERROR(graph_runner.Run(&graph, pflr->GetFLR("/device:CPU:0"),
+                                        {}, {fetch_node}, &outputs));
+    TF_RETURN_IF_ERROR(GetDatasetFromVariantTensor(outputs[0], &dataset));
+    // NOTE(mrry): The dataset is currently owned by `outputs[0]`, so acquire an
+    // additional reference.
+    dataset->Ref();
+  }
+
+  std::unique_ptr<thread::ThreadPool> pool(
+      NewThreadPoolFromSessionOptions(params.session_options));
+  *result =
+      WrapUnique(new Dataset(dataset, device_mgr.release(), pflr.release(),
+                             flib_def.release(), pool.release()));
+  return Status::OK();
+}  // static
+
+Status Dataset::MakeIterator(std::unique_ptr<Iterator>* result) {
+  // Create an `IteratorContext`, which bundles together the necessary runtime
+  // support to create and get elements from an iterator.
+  std::unique_ptr<IteratorContext> ctx;
+  {
+    // NOTE(mrry): In the current API, an `IteratorContext` is always initially
+    // created from an `OpKernelContext*`, so we need to create a fake
+    // `OpKernelContext` with the appropriate subset of parameters.
+    OpKernelContext::Params op_params;
+    op_params.function_library = pflr_->GetFLR("/device:CPU:0");
+    op_params.device = device_mgr_->ListDevices()[0];
+    op_params.runner = &runner_;
+    OpKernelContext op_ctx(&op_params, 0);
+    IteratorContext::Params params(&op_ctx);
+    params.function_handle_cache = function_handle_cache_.get();
+    ctx = MakeUnique<IteratorContext>(std::move(params));
+  }
+
+  // Create the iterator from the dataset.
+  std::unique_ptr<IteratorBase> iterator;
+  TF_RETURN_IF_ERROR(dataset_->MakeIterator(ctx.get(), "iterator", &iterator));
+
+  *result = WrapUnique(new Iterator(iterator.release(), ctx.release()));
+
+  return Status::OK();
+}
+
+Dataset::Dataset(DatasetBase* dataset, DeviceMgr* device_mgr,
+                 ProcessFunctionLibraryRuntime* pflr,
+                 FunctionLibraryDefinition* flib_def, thread::ThreadPool* pool)
+    : dataset_(dataset),
+      device_mgr_(device_mgr),
+      flib_def_(flib_def),
+      pflr_(pflr),
+      pool_(pool) {
+  runner_ = [this](std::function<void()> c) { pool_->Schedule(std::move(c)); };
+  function_handle_cache_ =
+      MakeUnique<FunctionHandleCache>(pflr_->GetFLR("/device:CPU:0"));
+}
+
+Dataset::~Dataset() { dataset_->Unref(); }
+
+}  // namespace standalone
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/data/standalone.h b/tensorflow/core/common_runtime/data/standalone.h
new file mode 100644
index 0000000000000000000000000000000000000000..ecea5ba21d0e807b72808c31336916b5f12cb854
--- /dev/null
+++ b/tensorflow/core/common_runtime/data/standalone.h
@@ -0,0 +1,122 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_DATA_STANDALONE_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_DATA_STANDALONE_H_
+
+#include <memory>
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/function_handle_cache.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/public/session_options.h"
+
+namespace tensorflow {
+namespace data {
+namespace standalone {
+
+// The purpose of the API in this file is to facilitate standalone execution of
+// a tf.data input pipeline graph.
+//
+// The API exposes two abstractions -- a `Dataset` and an `Iterator` -- which
+// encapsulate TensorFlow runtime.
+//
+// The `Dataset` abstraction represents an input pipeline as a collection
+// of data sources and a logical plan of transformations that operate over the
+// data.
+//
+// The `Iterator` abstraction represents an execution of an input pipeline that
+// can be used to enumerate its elements.
+//
+// Example usage:
+//
+//   // Create a `Dataset` by running the `graph_def` graph and fetching the
+//   // output of the `fetch_node` node.
+//   tensorflow::data:standalone::Dataset::Params params;
+//   std::unique_ptr<tensorflow::data::standalone::Dataset> dataset;
+//   Status s = tensorflow::data::standalone::Dataset::FromGraph(
+//      params, graph_def, fetch_node, &dataset);
+//   if (!s.ok()) { /* error handling */ }
+//
+//   std::unique_ptr<tensorflow::data::standalone::Iterator> iterator;
+//   s = dataset->MakeIterator(&iterator);
+//   if (!s.ok()) { /* error handling */ }
+//
+//   bool end_of_input = false;
+//   while (!end_of_input) {
+//     std::vector<tensorflow::Tensor> outputs;
+//     s = iterator->GetNext(&outputs, &end_of_input);
+//     if (!s.ok()) { /* error handling */ }
+//     if (!end_of_input) { /* output handling */ }
+//   }
+
+class Dataset;
+
+// Represents an execution of an input pipeline that can be used to enumerate
+// its elements.
+class Iterator {
+ public:
+  // Returns the next element of the input pipeline (if there is one) and an
+  // indication of whether the end of the input pipeline has been reached.
+  Status GetNext(std::vector<Tensor>* outputs, bool* end_of_input);
+
+ private:
+  friend class Dataset;
+
+  Iterator(IteratorBase* iterator, IteratorContext* ctx);
+
+  std::unique_ptr<IteratorBase> iterator_;
+  std::unique_ptr<IteratorContext> ctx_;
+};
+
+// Represents an input pipeline as a collection of data sources and a logical
+// plan of transformations that operate over the data.
+class Dataset {
+ public:
+  // Parameters for `Dataset` creation (e.g. TensorFlow runtime configuration).
+  struct Params {
+    SessionOptions session_options;
+  };
+
+  // Creates a new `Dataset` instance by running the TensorFlow graph `graph`
+  // and fetching the output of the `fetch_node` node.
+  static Status FromGraph(Params params, const GraphDef& graph_def,
+                          const string& fetch_node,
+                          std::unique_ptr<Dataset>* result);
+
+  ~Dataset();
+
+  // Creates an iterator for this dataset.
+  Status MakeIterator(std::unique_ptr<Iterator>* result);
+
+ private:
+  Dataset(DatasetBase* dataset, DeviceMgr* device_mgr,
+          ProcessFunctionLibraryRuntime* pflr,
+          FunctionLibraryDefinition* flib_def, thread::ThreadPool* pool);
+
+  DatasetBase* dataset_;  // owned
+  std::unique_ptr<DeviceMgr> device_mgr_;
+  std::unique_ptr<FunctionLibraryDefinition> flib_def_;
+  std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
+  std::unique_ptr<thread::ThreadPool> pool_;
+  std::unique_ptr<FunctionHandleCache> function_handle_cache_;
+  std::function<void(std::function<void()>)> runner_;
+};
+
+}  // namespace standalone
+}  // namespace data
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_DATA_STANDALONE_H_
diff --git a/tensorflow/core/common_runtime/data/standalone_test.cc b/tensorflow/core/common_runtime/data/standalone_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7e7a7a9b6195c247d94ed137f4bce18cee9851b4
--- /dev/null
+++ b/tensorflow/core/common_runtime/data/standalone_test.cc
@@ -0,0 +1,188 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/data/standalone.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace data {
+namespace standalone {
+namespace {
+
+constexpr const char* const kGraphProto = R"proto(
+  node {
+    name: "Const/_0"
+    op: "Const"
+    attr {
+      key: "dtype"
+      value { type: DT_INT64 }
+    }
+    attr {
+      key: "value"
+      value {
+        tensor {
+          dtype: DT_INT64
+          tensor_shape {}
+          int64_val: 0
+        }
+      }
+    }
+  }
+  node {
+    name: "Const/_1"
+    op: "Const"
+    attr {
+      key: "dtype"
+      value { type: DT_INT64 }
+    }
+    attr {
+      key: "value"
+      value {
+        tensor {
+          dtype: DT_INT64
+          tensor_shape {}
+          int64_val: 10
+        }
+      }
+    }
+  }
+  node {
+    name: "Const/_2"
+    op: "Const"
+    attr {
+      key: "dtype"
+      value { type: DT_INT64 }
+    }
+    attr {
+      key: "value"
+      value {
+        tensor {
+          dtype: DT_INT64
+          tensor_shape {}
+          int64_val: 1
+        }
+      }
+    }
+  }
+  node {
+    name: "RangeDataset/_3"
+    op: "RangeDataset"
+    input: "Const/_0"
+    input: "Const/_1"
+    input: "Const/_2"
+    attr {
+      key: "output_shapes"
+      value { list { shape { unknown_rank: true } } }
+    }
+    attr {
+      key: "output_types"
+      value { list { type: DT_INT64 } }
+    }
+  }
+  node {
+    name: "MapDataset/_4"
+    op: "MapDataset"
+    input: "RangeDataset/_3"
+    attr {
+      key: "Targuments"
+      value { list {} }
+    }
+    attr {
+      key: "f"
+      value { func { name: "Dataset_map_<lambda>_10" } }
+    }
+    attr {
+      key: "output_shapes"
+      value { list { shape {} } }
+    }
+    attr {
+      key: "output_types"
+      value { list { type: DT_INT64 } }
+    }
+    attr {
+      key: "preserve_cardinality"
+      value { b: false }
+    }
+    attr {
+      key: "use_inter_op_parallelism"
+      value { b: true }
+    }
+  }
+  library {
+    function {
+      signature {
+        name: "Dataset_map_<lambda>_10"
+        input_arg { name: "arg0" type: DT_INT64 }
+        output_arg { name: "mul" type: DT_INT64 }
+        description: "Wrapper for passing nested structures to and from tf.data functions."
+      }
+      node_def {
+        name: "mul_0"
+        op: "Mul"
+        input: "arg0"
+        input: "arg0"
+        attr {
+          key: "T"
+          value { type: DT_INT64 }
+        }
+      }
+      ret { key: "mul" value: "mul_0:z:0" }
+    }
+  }
+  versions { producer: 27 min_consumer: 12 }
+)proto";
+
+TEST(Scalar, Standalone) {
+  GraphDef graph_def;
+  protobuf::TextFormat::ParseFromString(kGraphProto, &graph_def);
+  struct TestCase {
+    string fetch_node;
+    std::vector<int64> expected_outputs;
+  };
+  auto test_cases = {
+      TestCase{"RangeDataset/_3", {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}},
+      TestCase{"MapDataset/_4", {0, 1, 4, 9, 16, 25, 36, 49, 64, 81}},
+  };
+  for (auto test_case : test_cases) {
+    std::unique_ptr<Dataset> dataset;
+    auto s = Dataset::FromGraph({}, graph_def, test_case.fetch_node, &dataset);
+    TF_EXPECT_OK(s);
+    std::unique_ptr<Iterator> iterator;
+    s = dataset->MakeIterator(&iterator);
+    TF_EXPECT_OK(s);
+    bool end_of_input = false;
+    for (int num_outputs = 0; !end_of_input; ++num_outputs) {
+      std::vector<tensorflow::Tensor> outputs;
+      s = iterator->GetNext(&outputs, &end_of_input);
+      TF_EXPECT_OK(s);
+      if (!end_of_input) {
+        EXPECT_EQ(outputs[0].scalar<int64>()(),
+                  test_case.expected_outputs[num_outputs]);
+      } else {
+        EXPECT_EQ(test_case.expected_outputs.size(), num_outputs);
+      }
+    }
+  }
+}
+
+}  // namespace
+}  // namespace standalone
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/device.h b/tensorflow/core/common_runtime/device.h
index 8dfbb21eda641ff9f70c58f1f4bf150ba4cceef3..64119e85c79953760422a13e95c2a63f0bae6b7d 100644
--- a/tensorflow/core/common_runtime/device.h
+++ b/tensorflow/core/common_runtime/device.h
@@ -44,6 +44,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/types.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
@@ -122,9 +123,27 @@ class Device : public DeviceBase {
   // version.
   virtual void Sync(const DoneCallback& done);
 
-  // Override this to return true for devices that require a Sync() call before
-  // session completion.
-  virtual bool RequiresSyncOnCompletion() const { return false; }
+  // On session completion, the executor may call Device::Sync() depending on
+  // flag settings. Override this to return false for devices that don't allow
+  // such calls. Instead, these devices must use other mechanisms (such as
+  // num_deferred_ops) to ensure the device has finished processing necessary
+  // work at session completion. In addition, for these devices, RefreshStatus
+  // must be called at session completion to retrieve execution result status.
+  //
+  // Devices that override this function must also implement RefreshStatus.
+  virtual bool AllowsSyncOnCompletion() const { return true; }
+
+  // This is used in conjunction with AllowsSyncOnCompletion to allow the
+  // executor to get execution result status at session completion.
+  //
+  // For supported devices, this call returns the underlying device stream's
+  // current status in a non-blocking way, without using blocking calls such as
+  // Stream::BlockHostUntilDone or Device::Sync. When applicable, the device
+  // status is also updated with the retrieved stream status.
+  virtual Status RefreshStatus() {
+    return errors::Unimplemented(
+        "RefreshStatus is not supported on this device.");
+  }
 
   // Optionally modify the device's GraphDef before execution.
   //
diff --git a/tensorflow/core/common_runtime/device_resolver_local_test.cc b/tensorflow/core/common_runtime/device_resolver_local_test.cc
index 54f1119e139886096cb7c2007e584003992d86c2..62e82bcc5a3bcaf9faff62b1df557d0c99ddaccc 100644
--- a/tensorflow/core/common_runtime/device_resolver_local_test.cc
+++ b/tensorflow/core/common_runtime/device_resolver_local_test.cc
@@ -56,7 +56,7 @@ TEST_F(DeviceResolverLocalTest, GetDeviceLocalitiesKnown) {
   Notification note;
   Status status;
   drl_->GetDeviceLocalitiesAsync(cp.instance, &localities,
-                                 [this, &note, &status](const Status& s) {
+                                 [&note, &status](const Status& s) {
                                    status = s;
                                    note.Notify();
                                  });
@@ -74,7 +74,7 @@ TEST_F(DeviceResolverLocalTest, GetDeviceLocalitiesUnknown) {
   Notification note;
   Status status;
   drl_->GetDeviceLocalitiesAsync(cp.instance, &localities,
-                                 [this, &note, &status](const Status& s) {
+                                 [&note, &status](const Status& s) {
                                    status = s;
                                    note.Notify();
                                  });
diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index 0434ca47b68f28ff65cb3d5e165bc5545ebe96f0..40a1ffc42da5020b62932812f3939cbdb7686aff 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -58,6 +58,7 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/lib/monitoring/counter.h"
+#include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
@@ -302,10 +303,8 @@ DirectSession::DirectSession(const SessionOptions& options,
   if (!status.ok()) {
     LOG(ERROR) << status.error_message();
   }
-  // NOTE(mrry): We do not need to use a unique string for the session
-  // handle, because DirectSession owns its devices. This may change
-  // in future versions.
-  session_handle_ = "direct";
+  session_handle_ =
+      strings::StrCat("direct", strings::FpToString(random::New64()));
   int devices_added = 0;
   if (options.config.log_device_placement()) {
     const string mapping_str = device_mgr_->DeviceMappingString();
@@ -370,6 +369,7 @@ Status DirectSession::MaybeInitializeExecutionState(
   GraphExecutionStateOptions options;
   options.device_set = &device_set_;
   options.session_options = &options_;
+  options.session_handle = session_handle_;
   // TODO(mrry,suharshs): We explicitly copy `graph` so that
   // `MakeForBaseGraph()` can take ownership of its
   // contents. Previously this happened implicitly in calls to the
@@ -501,7 +501,8 @@ Status DirectSession::RunInternal(int64 step_id, const RunOptions& run_options,
       std::unique_ptr<DeviceResolverInterface> drl(
           new DeviceResolverLocal(device_mgr_.get()));
       std::unique_ptr<ParamResolverInterface> cprl(
-          new CollectiveParamResolverLocal(device_mgr_.get(), drl.get(),
+          new CollectiveParamResolverLocal(options_.config, device_mgr_.get(),
+                                           drl.get(),
                                            "/job:localhost/replica:0/task:0"));
       collective_executor_mgr_.reset(new CollectiveExecutorMgr(
           options_.config, device_mgr_.get(), std::move(drl), std::move(cprl)));
@@ -532,6 +533,7 @@ Status DirectSession::RunInternal(int64 step_id, const RunOptions& run_options,
   CancellationManager step_cancellation_manager;
   args.cancellation_manager = &step_cancellation_manager;
   args.session_state = &session_state_;
+  args.session_handle = session_handle_;
   args.tensor_store = &run_state.tensor_store;
   args.step_container = &run_state.step_container;
   args.sync_on_finish = sync_on_finish_;
@@ -718,7 +720,7 @@ Status DirectSession::RunInternal(int64 step_id, const RunOptions& run_options,
       exec_and_lib.graph->ToGraphDef(partition_graph_def);
     }
   }
-  UpdateGraphExecTime(Env::Default()->NowMicros() - start_time_usecs);
+  metrics::UpdateGraphExecTime(Env::Default()->NowMicros() - start_time_usecs);
 
   return Status::OK();
 }
@@ -887,6 +889,7 @@ Status DirectSession::PRunSetup(const std::vector<string>& input_names,
     SchedClosure(pool, std::move(c));
   };
   args.session_state = &session_state_;
+  args.session_handle = session_handle_;
   args.tensor_store = &run_state->tensor_store;
   args.step_container = &run_state->step_container;
   if (LogMemory::IsEnabled()) {
@@ -1189,6 +1192,12 @@ Status DirectSession::CreateExecutors(
   options.use_function_convention = !run_state_args->is_partial_run;
   options.collective_graph_key =
       callable_options.run_options().experimental().collective_graph_key();
+  if (options_.config.experimental()
+          .collective_deterministic_sequential_execution()) {
+    options.collective_order = GraphCollectiveOrder::kEdges;
+  } else if (options_.config.experimental().collective_nccl()) {
+    options.collective_order = GraphCollectiveOrder::kAttrs;
+  }
 
   std::unique_ptr<FunctionInfo> func_info(new FunctionInfo);
   std::unique_ptr<ExecutorsAndKeys> ek(new ExecutorsAndKeys);
@@ -1464,6 +1473,7 @@ Status DirectSession::CreateGraphs(
     prune_options.device_set = &device_set_;
     prune_options.session_options = &options_;
     prune_options.stateful_placements = stateful_placements_;
+    prune_options.session_handle = session_handle_;
     TF_RETURN_IF_ERROR(GraphExecutionState::MakeForPrunedGraph(
         execution_state_->original_graph_def().library(), prune_options,
         execution_state_->original_graph_def(), subgraph_options,
diff --git a/tensorflow/core/common_runtime/direct_session.h b/tensorflow/core/common_runtime/direct_session.h
index 6754e9cfb71700090049107cf4dd122175527ffe..bcac34154407eb461a80fd3d638ee51a88f3d7fa 100644
--- a/tensorflow/core/common_runtime/direct_session.h
+++ b/tensorflow/core/common_runtime/direct_session.h
@@ -317,6 +317,7 @@ class DirectSession : public Session {
   std::vector<Device*> devices_;  // not owned
   DeviceSet device_set_;
 
+  // Unique session identifier.
   string session_handle_;
   mutex graph_state_lock_;
   bool graph_created_ GUARDED_BY(graph_state_lock_) = false;
diff --git a/tensorflow/core/common_runtime/eager/BUILD b/tensorflow/core/common_runtime/eager/BUILD
index 86890ba07d8b9a4320c47ffde1b3b8d78d15ac5a..aef64da79492c238713953b0958089f4abd501a2 100644
--- a/tensorflow/core/common_runtime/eager/BUILD
+++ b/tensorflow/core/common_runtime/eager/BUILD
@@ -61,6 +61,10 @@ tf_cuda_library(
             "//tensorflow/core:lib_internal",
             "//tensorflow/core:protos_all_cc",
             "//tensorflow/core:session_options",
+            "//tensorflow/core/distributed_runtime:collective_param_resolver_distributed",
+            "//tensorflow/core/distributed_runtime:device_resolver_distributed",
+            "//tensorflow/core/distributed_runtime:rpc_collective_executor_mgr",
+            "//tensorflow/core/distributed_runtime:worker_cache",
             "//tensorflow/core/distributed_runtime:server_lib",
             "//tensorflow/core/distributed_runtime:worker_session",
             "//tensorflow/core/distributed_runtime/eager:eager_client",
@@ -102,6 +106,7 @@ tf_cuda_library(
             "//tensorflow/core:android_tensorflow_lib_lite",
         ],
         "//conditions:default": [
+            "@com_google_absl//absl/strings",
             "//tensorflow/core:core_cpu_lib",
             "//tensorflow/core:framework",
             "//tensorflow/core:framework_internal",
@@ -150,6 +155,7 @@ tf_cuda_library(
     deps = [
         ":attr_builder",
         "@farmhash_archive//:farmhash",
+        "@com_google_absl//absl/strings",
     ] + select({
         "//tensorflow:android": [
             "//tensorflow/core:android_tensorflow_lib_lite",
@@ -161,6 +167,7 @@ tf_cuda_library(
             "//tensorflow/core:lib",
             "//tensorflow/core:lib_internal",
             "//tensorflow/core:protos_all_cc",
+            "//tensorflow/core/grappler/optimizers:meta_optimizer",
         ],
     }),
 )
@@ -175,12 +182,22 @@ tf_cc_test(
         "//tensorflow/cc:client_session",
         "//tensorflow/cc:ops",
         "//tensorflow/cc:scope",
+        "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:direct_session",
         "//tensorflow/core:framework",
+        "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core:lib",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:nn_ops_op_lib",
+        "//tensorflow/core:no_op_op_lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:sendrecv_ops_op_lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "//tensorflow/core/kernels:constant_op",
+        "//tensorflow/core/kernels:matmul_op",
         "@com_google_absl//absl/memory",
     ],
 )
@@ -199,6 +216,7 @@ cc_library(
         ":eager_operation",
         ":kernel_and_device",
         ":tensor_handle",
+        "@com_google_absl//absl/strings",
     ] + select({
         "//tensorflow:android": [
             "//tensorflow/core:android_tensorflow_lib_lite",
diff --git a/tensorflow/core/common_runtime/eager/attr_builder.cc b/tensorflow/core/common_runtime/eager/attr_builder.cc
index a750f8cbba4de4abd33d6ec395b6b0a5fb76cc67..77be4c951e6a85e71a3d19f5cf43099027c80696 100644
--- a/tensorflow/core/common_runtime/eager/attr_builder.cc
+++ b/tensorflow/core/common_runtime/eager/attr_builder.cc
@@ -42,7 +42,7 @@ const uint32 kIsList = 1U << 31;
 AttrTypeMap* DefaultFunctionAttrTypeMap() {
   AttrTypeMap* map = new AttrTypeMap();
   (*map)["executor_type"] = TF_ATTR_STRING;
-  (*map)["config"] = TF_ATTR_STRING;
+  (*map)["config_proto"] = TF_ATTR_STRING;
   return map;
 }
 
@@ -125,6 +125,7 @@ Status AttrTypeMapForOp(const char* op_name, const AttrTypeMap** out,
   template <>                                                                \
   AttrBuilder& AttrBuilder::Set(StringPiece attr_name, value_type&& value) { \
     value_field.push_back(std::make_pair(string(attr_name), value));         \
+    cached_cache_key_ = absl::nullopt;                                       \
     return *this;                                                            \
   }
 
@@ -231,7 +232,17 @@ inline tensorflow::Fprint128 CacheKeyHelper(StringPiece s, uint64 b) {
 
 }  // namespace
 
-tensorflow::Fprint128 AttrBuilder::CacheKey(const string& device) const {
+tensorflow::Fprint128 AttrBuilder::CacheKey(const string& device) {
+  if (!cached_cache_key_ || device != device_for_cached_cache_key_) {
+    cached_cache_key_ = BuildCacheKeyForDevice(device);
+    device_for_cached_cache_key_ = device;
+  }
+
+  return *cached_cache_key_;
+}
+
+tensorflow::Fprint128 AttrBuilder::BuildCacheKeyForDevice(
+    const string& device) const {
   tensorflow::Fprint128 f = tensorflow::Fingerprint128(op_name_);
   f = tensorflow::FingerprintCat128(f, tensorflow::Fingerprint128(device));
   if (node_def_ != nullptr) {
diff --git a/tensorflow/core/common_runtime/eager/attr_builder.h b/tensorflow/core/common_runtime/eager/attr_builder.h
index 5e0172dfd328dbd4f16abdce879be1d1338e692c..1b3fbcbd4a89a069c603d1f7d5c77d54ea5b06b0 100644
--- a/tensorflow/core/common_runtime/eager/attr_builder.h
+++ b/tensorflow/core/common_runtime/eager/attr_builder.h
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/fingerprint.h"
 #include "tensorflow/core/util/tensor_slice_reader_cache.h"
 
@@ -53,10 +54,6 @@ Status AttrTypeMapForOp(const char* op_name, const AttrTypeMap** out,
 Status AttrTypeByName(const AttrTypeMap& m, const string& attr_name,
                       TF_AttrType* out, unsigned char* is_list);
 
-// Looks for 'attr_name' in 'm' and sets 'out' and 'is_list'.
-Status AttrTypeByName(const AttrTypeMap& m, const string& attr_name,
-                      TF_AttrType* out, unsigned char* is_list);
-
 // KernelAndDevice::Init needs a NodeDef only to pass the attribute map through.
 // An AttrBuilder is a convenience class to help with that - providing a smaller
 // interface than NodeDefBuilder and avoiding expensive (unnecessary?) sanity
@@ -74,7 +71,7 @@ Status AttrTypeByName(const AttrTypeMap& m, const string& attr_name,
 // AttrBuilder a;
 // a.NumInputs(2);
 // a.Set("T", TF_FLOAT);
-// uint64 cache_key = a.CacheKey("cpu:0");
+// tensorflow::Fprint128 cache_key = a.CacheKey("cpu:0");
 // const NodeDef& n = a.BuildNodeDef();
 //
 // Note that all calls to Set and NumInputs should happen before calling
@@ -100,10 +97,11 @@ class AttrBuilder {
   AttrBuilder& Set(StringPiece attr_name, T&& value) {
     MayBeInitializeNodeDef();
     SetInAttrValueMap(node_def_->mutable_attr(), string(attr_name), value);
+    cached_cache_key_ = absl::nullopt;
     return *this;
   }
 
-  tensorflow::Fprint128 CacheKey(const string& device) const;
+  tensorflow::Fprint128 CacheKey(const string& device);
 
   void FillAttrValueMap(AttrValueMap* m) const { FillAttrValueMap(m, true); }
   const NodeDef& BuildNodeDef();
@@ -112,6 +110,8 @@ class AttrBuilder {
   template <class T>
   using AttrVec = tensorflow::gtl::InlinedVector<std::pair<string, T>, 2>;
 
+  tensorflow::Fprint128 BuildCacheKeyForDevice(const string& device) const;
+
   void MayBeInitializeNodeDef();
   // Fill `m` with the attr-value pairs set via AttrBuilder::Set() so far, as
   // well as any default attr-value pairs from the associated op_def, if there
@@ -148,6 +148,9 @@ class AttrBuilder {
   int num_inputs_;
   std::unique_ptr<NodeDef> node_def_;
   bool node_def_finalized_;
+
+  absl::optional<tensorflow::Fprint128> cached_cache_key_;
+  string device_for_cached_cache_key_;
 };  // namespace tensorflow
 
 template <>
diff --git a/tensorflow/core/common_runtime/eager/attr_builder_test.cc b/tensorflow/core/common_runtime/eager/attr_builder_test.cc
index 220cc6f5ce0bff32cfdc8d4e837c6900c773728e..31c998a670a0a6613bbaca437d8d3e4f9f976443 100644
--- a/tensorflow/core/common_runtime/eager/attr_builder_test.cc
+++ b/tensorflow/core/common_runtime/eager/attr_builder_test.cc
@@ -41,8 +41,10 @@ TEST(AttrTypeMap, Lookup) {
   Status s = AttrTypeMapForOp("SomeFunctionName", &m, &is_function);
   EXPECT_TRUE(s.ok());
   EXPECT_TRUE(is_function);
+  ASSERT_NE(m->end(), m->find("executor_type"));
   EXPECT_EQ(TF_ATTR_STRING, m->find("executor_type")->second);
-  EXPECT_EQ(TF_ATTR_STRING, m->find("config")->second);
+  ASSERT_NE(m->end(), m->find("config_proto"));
+  EXPECT_EQ(TF_ATTR_STRING, m->find("config_proto")->second);
 
   is_function = true;
   s = AttrTypeMapForOp("MatMul", &m, &is_function);
@@ -67,5 +69,18 @@ TEST(AttrTypeMap, Lookup) {
   EXPECT_NE(is_list, 0);
 }
 
+TEST(AttrTypeMap, CacheKey) {
+  AttrBuilder a("op_name");
+  a.NumInputs(2);
+  a.Set("T", TF_FLOAT);
+  tensorflow::Fprint128 cache_key = a.CacheKey("cpu:0");
+
+  ASSERT_FALSE(cache_key == a.CacheKey("cpu:1"));
+  ASSERT_TRUE(cache_key == a.CacheKey("cpu:0"));
+
+  a.Set("x", 1.0);
+  ASSERT_FALSE(cache_key == a.CacheKey("cpu:0"));
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/context.cc b/tensorflow/core/common_runtime/eager/context.cc
index 1727c045604bd19e038857fa34780f34cbb05d44..cdd5632f9de00dfc07a1df4906cfcc70d99d69cb 100644
--- a/tensorflow/core/common_runtime/eager/context.cc
+++ b/tensorflow/core/common_runtime/eager/context.cc
@@ -20,6 +20,11 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device_resolver_local.h"
 #include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/common_runtime/process_util.h"
+#ifndef __ANDROID__
+#include "tensorflow/core/distributed_runtime/collective_param_resolver_distributed.h"
+#include "tensorflow/core/distributed_runtime/device_resolver_distributed.h"
+#include "tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.h"
+#endif
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/lib/core/blocking_counter.h"
 #include "tensorflow/core/util/env_var.h"
@@ -54,8 +59,8 @@ EagerContext::EagerContext(const SessionOptions& opts,
       rendezvous_(rendezvous),
       thread_pool_(NewThreadPoolFromSessionOptions(opts)),
       pflr_(new ProcessFunctionLibraryRuntime(
-          device_mgr, opts.env, TF_GRAPH_DEF_VERSION, &func_lib_def_, {},
-          thread_pool_.get())),
+          device_mgr, opts.env, TF_GRAPH_DEF_VERSION, &func_lib_def_,
+          opts.config.graph_options().optimizer_options(), thread_pool_.get())),
       log_device_placement_(opts.config.log_device_placement()),
       num_active_steps_(0),
       async_default_(async),
@@ -63,7 +68,7 @@ EagerContext::EagerContext(const SessionOptions& opts,
       env_(opts.env),
       use_send_tensor_rpc_(false),
       pin_small_ops_to_cpu_(ReadBoolFromEnvVar(
-          "TF_EAGER_ENABLE_SMALL_TENSOR_CPU_PINNING", true)) {
+          "TF_EAGER_ENABLE_SMALL_TENSOR_CPU_PINNING", false)) {
   if (device_mgr_owned) {
     local_device_manager_.reset(device_mgr);
     local_unowned_device_manager_ = nullptr;
@@ -78,7 +83,8 @@ EagerContext::EagerContext(const SessionOptions& opts,
   std::unique_ptr<DeviceResolverInterface> drl(
       new DeviceResolverLocal(local_device_mgr()));
   std::unique_ptr<ParamResolverInterface> cprl(new CollectiveParamResolverLocal(
-      local_device_mgr(), drl.get(), "/job:localhost/replica:0/task:0"));
+      opts.config, local_device_mgr(), drl.get(),
+      "/job:localhost/replica:0/task:0"));
   collective_executor_mgr_.reset(new CollectiveExecutorMgr(
       opts.config, local_device_mgr(), std::move(drl), std::move(cprl)));
 }
@@ -204,6 +210,14 @@ EagerContext::~EagerContext() {
   executor_.WaitForAllPendingNodes().IgnoreError();
   ClearCaches();
   rendezvous_->Unref();
+
+  for (auto& thread : child_threads_) {
+    thread.reset();
+  }
+}
+
+void EagerContext::AddChildThread(std::unique_ptr<Thread> thread) {
+  child_threads_.push_back(std::move(thread));
 }
 
 bool EagerContext::FindFunctionByName(const string& name) {
@@ -231,6 +245,29 @@ Status EagerContext::FindDeviceByName(const string& name, Device** result) {
   return Status::OK();
 }
 
+void EagerContext::ClearRunMetadata() {
+  if (metadata_listener_ != nullptr) {
+    metadata_listener_->BeforeClearRunMetadata();
+  }
+  run_metadata_.Clear();
+}
+
+Status EagerContext::RegisterRunMetadataListener(
+    RunMetadataListener* listener) {
+  mutex_lock l(metadata_mu_);
+  if (metadata_listener_ != nullptr) {
+    return Status(error::Code::INVALID_ARGUMENT,
+                  "Cannot run two eager profiler at the same time");
+  }
+  metadata_listener_ = listener;
+  return Status::OK();
+}
+
+void EagerContext::ClearRunMetadataListener() {
+  mutex_lock l(metadata_mu_);
+  metadata_listener_ = nullptr;
+}
+
 void EagerContext::StartStep() {
   mutex_lock ml(metadata_mu_);
   num_active_steps_++;
@@ -314,10 +351,28 @@ void EagerContext::AddKernelToCache(Fprint128 cache_key,
   gtl::InsertOrUpdate(&kernel_cache_, cache_key, kernel);
 }
 
-void EagerContext::SetShouldStoreMetadata(bool value) {
-  should_store_metadata_.store(value);
-  if (!value) {
-    mutex_lock ml(metadata_mu_);
+bool EagerContext::ShouldStoreGraphs() {
+  mutex_lock ml(metadata_mu_);
+  return should_store_graphs_.load() || metadata_listener_ != nullptr;
+}
+
+bool EagerContext::ShouldStoreStepStats() {
+  mutex_lock ml(metadata_mu_);
+  return should_store_step_stats_.load() || metadata_listener_ != nullptr;
+}
+
+void EagerContext::SetShouldStoreGraphs(bool value) {
+  mutex_lock ml(metadata_mu_);
+  should_store_graphs_.store(value);
+  if (!value || metadata_listener_ != nullptr) {
+    run_metadata_.Clear();
+  }
+}
+
+void EagerContext::SetShouldStoreStepStats(bool value) {
+  mutex_lock ml(metadata_mu_);
+  should_store_step_stats_.store(value);
+  if (!value || metadata_listener_ != nullptr) {
     run_metadata_.Clear();
   }
 }
@@ -364,6 +419,36 @@ Status EagerContext::GetClientAndContextID(Device* device,
   return Status::OK();
 }
 
+Status EagerContext::StoreCollectiveOpsServer(
+    std::unique_ptr<ServerInterface> server, DeviceMgr* device_mgr,
+    CollectiveExecutorMgrInterface* rpc_collective_executor_mgr) {
+  collective_executor_mgr_.reset(nullptr);
+  unowned_collective_executor_mgr_ = rpc_collective_executor_mgr;
+
+  local_device_manager_.reset(nullptr);
+  local_unowned_device_manager_ = device_mgr;
+
+  devices_ = local_unowned_device_manager_->ListDevices();
+  devices_map_.clear();
+
+  InitDeviceMapAndAsync();
+  ClearCaches();
+
+  pflr_.reset(new ProcessFunctionLibraryRuntime(
+      local_unowned_device_manager_, env_, TF_GRAPH_DEF_VERSION, &func_lib_def_,
+      {}, thread_pool_.get()));
+
+  // Memory leak!
+  if (server_ != nullptr) {
+    LOG(WARNING) << "Unable to destroy server_ object, so releasing instead. "
+                    "Servers don't support clean shutdown.";
+    server_.release();
+  }
+  server_ = std::move(server);
+
+  return Status::OK();
+}
+
 void EagerContext::InitializeRemote(
     std::unique_ptr<ServerInterface> server,
     std::unique_ptr<eager::EagerClientCache> remote_eager_workers,
diff --git a/tensorflow/core/common_runtime/eager/context.h b/tensorflow/core/common_runtime/eager/context.h
index cdef94789337550fdaa760638f098ba47af5dfdb..330936e2330f86d6cba4ec3602b3c03a937ee42c 100644
--- a/tensorflow/core/common_runtime/eager/context.h
+++ b/tensorflow/core/common_runtime/eager/context.h
@@ -29,9 +29,12 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
+#include "tensorflow/core/example/example.pb.h"
+#include "tensorflow/core/platform/env.h"
 #ifndef __ANDROID__
 #include "tensorflow/core/distributed_runtime/eager/eager_client.h"
 #include "tensorflow/core/distributed_runtime/server_lib.h"
+#include "tensorflow/core/distributed_runtime/worker_cache.h"
 #endif
 #include "tensorflow/core/framework/collective.h"
 #include "tensorflow/core/framework/log_memory.h"
@@ -65,6 +68,12 @@ enum ContextDevicePlacementPolicy {
   DEVICE_PLACEMENT_SILENT_FOR_INT32 = 3,
 };
 
+class RunMetadataListener {
+ public:
+  virtual ~RunMetadataListener() {}
+  virtual void BeforeClearRunMetadata() = 0;
+};
+
 class EagerContext {
  public:
   // TODO: remove this constructor once we migrate all callers to the next one.
@@ -85,6 +94,8 @@ class EagerContext {
     return pflr_->GetFLR(d->name());
   }
 
+  ProcessFunctionLibraryRuntime* pflr() const { return pflr_.get(); }
+
   // True if running in asynchronous mode.
   bool Async() const;
 
@@ -130,7 +141,7 @@ class EagerContext {
 
   Status FindDeviceByName(const string& name, Device** result);
 
-  Device* HostCPU() { return devices_[0]; }
+  Device* HostCPU() const { return devices_[0]; }
 
   GraphCollector* GetGraphCollector() { return &graph_collector_; }
 
@@ -144,21 +155,26 @@ class EagerContext {
 
   void AddKernelToCache(Fprint128 cache_key, KernelAndDevice* kernel);
 
-  bool LogDevicePlacement() { return log_device_placement_; }
-  bool LogMemory() { return log_memory_; }
+  bool LogDevicePlacement() const { return log_device_placement_; }
+  bool LogMemory() const { return log_memory_; }
 
-  Rendezvous* GetRendezvous() { return rendezvous_; }
+  Rendezvous* GetRendezvous() const { return rendezvous_; }
+  CollectiveExecutorMgrInterface* collective_executor_mgr() {
+    return (collective_executor_mgr_ != nullptr)
+               ? collective_executor_mgr_.get()
+               : unowned_collective_executor_mgr_;
+  }
   std::unique_ptr<CollectiveExecutor::Handle> GetCollectiveExecutorHandle() {
     return std::unique_ptr<CollectiveExecutor::Handle>(
         new CollectiveExecutor::Handle(
-            collective_executor_mgr_->FindOrCreate(0), true /*inherit_ref*/));
+            collective_executor_mgr()->FindOrCreate(0), true /*inherit_ref*/));
   }
 
   const tensorflow::DeviceMgr* local_device_mgr() const {
     return (local_device_manager_ != nullptr) ? local_device_manager_.get()
                                               : local_unowned_device_manager_;
   }
-  const tensorflow::DeviceMgr* remote_device_mgr() {
+  const tensorflow::DeviceMgr* remote_device_mgr() const {
     return remote_device_manager_.get();
   }
 
@@ -166,10 +182,17 @@ class EagerContext {
   void ReleaseDeviceMgr() { local_device_manager_.release(); }
 
   // TODO(apassos) clean up RunMetadata storage.
-  mutex* MetadataMu() { return &metadata_mu_; }
-  bool ShouldStoreMetadata() { return should_store_metadata_.load(); }
-  void SetShouldStoreMetadata(bool value);
+  mutex* MetadataMu() LOCK_RETURNED(metadata_mu_) { return &metadata_mu_; }
+  bool ShouldStoreStepStats() LOCKS_EXCLUDED(metadata_mu_);
+  void SetShouldStoreStepStats(bool value);
+  bool ShouldStoreGraphs() LOCKS_EXCLUDED(metadata_mu_);
+  void SetShouldStoreGraphs(bool value);
   RunMetadata* RunMetadataProto() { return &run_metadata_; }
+  void ClearRunMetadata() EXCLUSIVE_LOCKS_REQUIRED(metadata_mu_);
+
+  Status RegisterRunMetadataListener(RunMetadataListener* listener)
+      LOCKS_EXCLUDED(metadata_mu_);
+  void ClearRunMetadataListener() LOCKS_EXCLUDED(metadata_mu_);
 
   void StartStep();
   void EndStep();
@@ -204,6 +227,10 @@ class EagerContext {
     return active_remote_contexts_.find(context_id) !=
            active_remote_contexts_.end();
   }
+
+  Status StoreCollectiveOpsServer(
+      std::unique_ptr<ServerInterface> server, DeviceMgr* device_mgr,
+      CollectiveExecutorMgrInterface* rpc_collective_executor_mgr);
 #endif
 
   // If true, then tensors should be shipped across processes via the
@@ -214,6 +241,9 @@ class EagerContext {
 
   tensorflow::Env* TFEnv() const { return env_; }
 
+  // All child threads will be reset() when destructing EagerContext.
+  void AddChildThread(std::unique_ptr<Thread> thread);
+
  private:
   void InitDeviceMapAndAsync();
   Status MaybeRegisterFunctionRemotely(const FunctionDef& fdef);
@@ -256,9 +286,11 @@ class EagerContext {
       GUARDED_BY(cache_mu_);
 
   // Whether we should compute RunMetadata.
-  std::atomic<bool> should_store_metadata_{false};
+  std::atomic<bool> should_store_step_stats_{false};
+  std::atomic<bool> should_store_graphs_{false};
   mutex metadata_mu_;
   RunMetadata run_metadata_ GUARDED_BY(metadata_mu_);
+  RunMetadataListener* metadata_listener_ GUARDED_BY(metadata_mu_) = nullptr;
   GraphCollector graph_collector_;
   const bool log_device_placement_;
   // EagerExecutor for async execution.
@@ -280,6 +312,7 @@ class EagerContext {
   Env* const env_;
 
   std::unique_ptr<CollectiveExecutorMgrInterface> collective_executor_mgr_;
+  CollectiveExecutorMgrInterface* unowned_collective_executor_mgr_ = nullptr;
 
 #ifndef __ANDROID__
   void CloseRemoteContexts();
@@ -308,6 +341,7 @@ class EagerContext {
 
   bool use_send_tensor_rpc_;
   const bool pin_small_ops_to_cpu_;
+  std::vector<std::unique_ptr<tensorflow::Thread>> child_threads_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/copy_to_device_node.h b/tensorflow/core/common_runtime/eager/copy_to_device_node.h
index 5bc3bb689e076467672af85d28bb340b56e7ee79..a807e7f68d3cffe0c71393acb537c6b3a732fde6 100644
--- a/tensorflow/core/common_runtime/eager/copy_to_device_node.h
+++ b/tensorflow/core/common_runtime/eager/copy_to_device_node.h
@@ -30,7 +30,7 @@ class CopyToDeviceNode : public EagerNode {
         src_(src),
         dstd_(dstd),
         ctx_(ctx),
-        dst_(new TensorHandle(id, dstd_, dstd_, src->dtype, ctx)) {
+        dst_(new TensorHandle(id, dstd_, dstd_, nullptr, src->dtype, ctx)) {
     src_->Ref();
     dst_->Ref();
   }
diff --git a/tensorflow/core/common_runtime/eager/eager_operation.cc b/tensorflow/core/common_runtime/eager/eager_operation.cc
index 381b05ada8594fde1aa917053acd0371167f66ed..b10320ca30bd4423bc755722dafb85908d922f8e 100644
--- a/tensorflow/core/common_runtime/eager/eager_operation.cc
+++ b/tensorflow/core/common_runtime/eager/eager_operation.cc
@@ -30,4 +30,28 @@ void EagerOperation::AddInput(tensorflow::TensorHandle* h) {
   inputs_.push_back(h);
   attrs_.NumInputs(static_cast<int>(inputs_.size()));
 }
+
+void EagerOperation::ConsumeInput(tensorflow::TensorHandle* h) {
+  inputs_.push_back(h);
+  attrs_.NumInputs(static_cast<int>(inputs_.size()));
+}
+
+string EagerOperation::DebugString() const {
+  string out;
+  VLOG(1) << "EagerOperation::DebugString() over " << this;
+
+  strings::StrAppend(&out, "Name: ", name_, "\n");
+  strings::StrAppend(
+      &out, "Device: ", Device() ? Device()->DebugString() : "[]", "\n");
+  for (const auto& input : inputs_) {
+    VLOG(1) << "Input ptr: " << input;
+    strings::StrAppend(&out, "Input: ", input->DebugString(), "\n");
+  }
+
+  NodeDef ndef;
+  Attrs().FillAttrValueMap(ndef.mutable_attr());
+  strings::StrAppend(&out, "Attrs: ", ndef.DebugString(), "\n");
+  return out;
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/eager_operation.h b/tensorflow/core/common_runtime/eager/eager_operation.h
index 935ca7f9aa766a69582b4c94fec6c508e3f5a369..23a2d1bf986d8cd2b1670432e48ff3c6b3a1ee1c 100644
--- a/tensorflow/core/common_runtime/eager/eager_operation.h
+++ b/tensorflow/core/common_runtime/eager/eager_operation.h
@@ -53,6 +53,7 @@ class EagerOperation {
     return &inputs_;
   }
   void AddInput(tensorflow::TensorHandle* h);
+  void ConsumeInput(tensorflow::TensorHandle* h);
 
   const tensorflow::string& Name() const { return name_; }
   const tensorflow::AttrTypeMap* AttrTypes() const { return attr_types_; }
@@ -63,6 +64,8 @@ class EagerOperation {
 
   void SetUseXla(bool use_xla) { use_xla_ = use_xla; }
 
+  string DebugString() const;
+
  private:
   tensorflow::EagerContext* ctx_;  // Must outlive the EagerOperation.
   const tensorflow::string name_;
diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc
index 783baa96c92f224e45404e5f6586011599f02292..f2af6267bf88f2a3cacceee6e8847ab1958f40d3 100644
--- a/tensorflow/core/common_runtime/eager/execute.cc
+++ b/tensorflow/core/common_runtime/eager/execute.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <vector>
 
+#include "absl/strings/match.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/common_runtime/eager/context.h"
@@ -24,6 +25,9 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/eager/execute_node.h"
 #include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
 #include "tensorflow/core/common_runtime/eager/tensor_handle.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #ifndef __ANDROID__
 #include "tensorflow/core/distributed_runtime/eager/eager_client.h"
@@ -81,18 +85,20 @@ int StepStatsDeviceIndex(StepStats* step_stats, EagerContext* ctx,
 // tensor handle.
 //
 // The passed in *handle will be Unreffed if it is replaced.
-Status MaybeCopyInputToExpectedDevice(EagerOperation* op, int i,
-                                      const Device* expected_device,
+//
+// `op_device` is passed in explicitly because `op->device()` might be unset
+// and we might have selected some specific device to run this op on.
+Status MaybeCopyInputToExpectedDevice(EagerOperation* op,
+                                      const Device* op_device, int i,
+                                      const Device* expected_input_device,
                                       RunMetadata* run_metadata,
                                       TensorHandle** handle) {
   EagerContext* ctx = op->EagerContext();
   Device* handle_device = (*handle)->device();
   const Device* actual_device =
       handle_device == nullptr ? ctx->HostCPU() : handle_device;
-  const Device* op_device =
-      op->Device() == nullptr ? ctx->HostCPU() : op->Device();
 
-  if (expected_device != actual_device) {
+  if (expected_input_device != actual_device) {
     switch (ctx->GetDevicePlacementPolicy()) {
       case DEVICE_PLACEMENT_SILENT_FOR_INT32:
         // TODO(xpan): See if we could bubble python related error up
@@ -108,7 +114,7 @@ Status MaybeCopyInputToExpectedDevice(EagerOperation* op, int i,
             "Tensors on conflicting devices:"
             " cannot compute ",
             op->Name(), " as input #", i, " was expected to be on ",
-            expected_device->name(), " but is actually on ",
+            expected_input_device->name(), " but is actually on ",
             actual_device->name(), " (operation running on ", op_device->name(),
             ")",
             " Tensors can be copied explicitly using .gpu() or .cpu() "
@@ -119,9 +125,10 @@ Status MaybeCopyInputToExpectedDevice(EagerOperation* op, int i,
             " may slow down your model");
       case DEVICE_PLACEMENT_WARN:
         LOG(WARNING) << "before computing " << op->Name() << " input #" << i
-                     << " was expected to be on " << expected_device->name()
-                     << " but is actually on " << actual_device->name()
-                     << " (operation running on " << op_device->name()
+                     << " was expected to be on "
+                     << expected_input_device->name() << " but is actually on "
+                     << actual_device->name() << " (operation running on "
+                     << op_device->name()
                      << "). This triggers a copy which can be a performance "
                         "bottleneck.";
         break;
@@ -133,7 +140,7 @@ Status MaybeCopyInputToExpectedDevice(EagerOperation* op, int i,
     auto pre_time_nanos = Env::Default()->NowNanos();
     TensorHandle* result_handle = nullptr;
     Status status = EagerCopyToDevice(
-        *handle, ctx, expected_device->name().c_str(), &result_handle);
+        *handle, ctx, expected_input_device->name().c_str(), &result_handle);
     if (run_metadata != nullptr) {
       auto* step_stats = run_metadata->mutable_step_stats();
       MaybeInitializeStepStats(step_stats, ctx);
@@ -155,10 +162,10 @@ Status MaybeCopyInputToExpectedDevice(EagerOperation* op, int i,
     }
     if (!status.ok()) {
       if (result_handle != nullptr) result_handle->Unref();
-      return errors::Internal("Failed copying input tensor from ",
-                              actual_device->name(), " to ",
-                              expected_device->name(), " in order to run ",
-                              op->Name(), ": ", status.error_message());
+      return errors::Internal(
+          "Failed copying input tensor from ", actual_device->name(), " to ",
+          expected_input_device->name(), " in order to run ", op->Name(), ": ",
+          status.error_message());
     }
 
     (*handle)->Unref();
@@ -168,19 +175,18 @@ Status MaybeCopyInputToExpectedDevice(EagerOperation* op, int i,
 }
 
 Status ValidateInputTypeAndPlacement(EagerContext* ctx, Device* op_device,
-                                     EagerOperation* op, const OpKernel* kernel,
+                                     EagerOperation* op,
+                                     const KernelAndDevice* kernel,
                                      RunMetadata* run_metadata) {
-  Device* host_device = ctx->HostCPU();
-  const MemoryTypeVector& memtypes = kernel->input_memory_types();
-  if (memtypes.size() != op->Inputs().size()) {
-    return errors::InvalidArgument("expected ", memtypes.size(),
+  if (kernel->num_inputs() != op->Inputs().size()) {
+    return errors::InvalidArgument("expected ", kernel->num_inputs(),
                                    " inputs, got ", op->Inputs().size());
   }
   for (int i = 0; i < op->Inputs().size(); ++i) {
-    const Device* expected_device =
-        memtypes[i] == HOST_MEMORY ? host_device : op_device;
+    const Device* expected_device = kernel->InputDevice(i);
     TF_RETURN_IF_ERROR(MaybeCopyInputToExpectedDevice(
-        op, i, expected_device, run_metadata, &((*op->MutableInputs())[i])));
+        op, op_device, i, expected_device, run_metadata,
+        &((*op->MutableInputs())[i])));
     tensorflow::TensorHandle* handle = op->Inputs()[i];
     if (handle->dtype != kernel->input_type(i)) {
       return errors::InvalidArgument(
@@ -246,6 +252,99 @@ bool OnSameTask(EagerContext* ctx, Device* first, Device* second) {
          first->parsed_name().task == second->parsed_name().task;
 }
 
+// Gets the CPU device on the task of device.
+Status CPUDeviceOnTask(EagerContext* ctx, tensorflow::Device* device,
+                       tensorflow::Device** cpu_device) {
+  string cpu_device_name;
+  TF_RETURN_IF_ERROR(DeviceNameUtils::DeviceNameToCpuDeviceName(
+      device->name(), &cpu_device_name));
+
+  return ctx->FindDeviceByName(cpu_device_name, cpu_device);
+}
+
+inline tensorflow::Fprint128 FingerprintCat128(const tensorflow::Fprint128& a,
+                                               const tensorflow::Fprint128& b) {
+  return {tensorflow::FingerprintCat64(a.low64, b.low64),
+          tensorflow::FingerprintCat64(a.high64, b.high64)};
+}
+
+Status FindDeviceFromName(const EagerContext* ctx, const char* device_name,
+                          Device** device) {
+  *device = ctx->HostCPU();
+  if (device_name == nullptr || strlen(device_name) == 0) {
+    return Status::OK();
+  }
+
+  auto status = ctx->local_device_mgr()->LookupDevice(device_name, device);
+  if (status.ok()) {
+    return status;
+  }
+
+  if (ctx->remote_device_mgr() != nullptr) {
+    return ctx->remote_device_mgr()->LookupDevice(device_name, device);
+  }
+
+  return status;
+}
+
+bool IsMultiDevice(const FunctionDef* fdef, const string& op_device) {
+  if (fdef == nullptr) {
+    // Primitive op.
+    return false;
+  }
+
+  // Run all functions as multi-device.
+  return true;
+
+  // We can eliminate some overhead by running simple functions using regular
+  // CallOp kernel. However, it is tricky to figure out which functions should
+  // be run using CallOp. Also, currently CallOp runs neither optimization
+  // passes (needed for TPU/XLA) nor grappler.
+  // Here are some cases where a function should be run in multi-device mode:
+  //  - Function takes at least two resources on different devices.
+  //  - Function takes a resource on deviceA and a body op explicitly placed
+  //  on deviceB.
+  //  - Function has a colocation constraint.
+  //  - Function has an explicit device annotation (which might not be using
+  //    full canonical device name) different from op_device. Note that false
+  //    positives are ok.
+  //  - Function has a node or a (node) attribute that can potentially make
+  //    the function multi-device after a rewrite pass (e.g. various XLA/TPU
+  //    special nodes and attributes)
+}
+
+Status AddInputDevicesToCacheKey(const EagerContext* ctx,
+                                 const EagerOperation* op,
+                                 std::vector<Device*>* input_dev_ptrs,
+                                 Fprint128* cache_key) {
+  input_dev_ptrs->reserve(op->Inputs().size());
+  Device* cpu_device = ctx->HostCPU();
+  for (TensorHandle* tensor_handle : op->Inputs()) {
+    string device_name;
+    if (tensor_handle->dtype == DT_RESOURCE) {
+      // Use the resource's actual device because it is the device that will
+      // influence partitioning the multi-device function.
+      const Tensor* tensor;
+      TF_RETURN_IF_ERROR(tensor_handle->Tensor(&tensor));
+      const ResourceHandle& handle = tensor->flat<ResourceHandle>()(0);
+      device_name = handle.device();
+
+      Device* input_device;
+      TF_RETURN_IF_ERROR(
+          FindDeviceFromName(ctx, device_name.c_str(), &input_device));
+      input_dev_ptrs->push_back(input_device);
+    } else if (MTypeFromDType(tensor_handle->dtype) == HOST_MEMORY) {
+      input_dev_ptrs->push_back(cpu_device);
+    } else {
+      Device* device = tensor_handle->device();
+      device_name = device != nullptr ? device->name() : cpu_device->name();
+      input_dev_ptrs->push_back(device == nullptr ? cpu_device : device);
+    }
+    *cache_key = FingerprintCat128(*cache_key, Fingerprint128(device_name));
+  }
+  return Status::OK();
+}
+
 Status EagerLocalExecute(EagerOperation* op,
                          gtl::InlinedVector<TensorHandle*, 2>* retvals,
                          int* num_retvals) {
@@ -254,18 +353,34 @@ Status EagerLocalExecute(EagerOperation* op,
   if (!status.ok()) return status;
   Device* device = op->Device();
 
-  Fprint128 cache_key = op->MutableAttrs()->CacheKey(
-      device == nullptr ? "unspecified" : device->name());
+  const string& maybe_unspecified_device_name =
+      device == nullptr ? "unspecified" : device->name();
+  Fprint128 cache_key =
+      op->MutableAttrs()->CacheKey(maybe_unspecified_device_name);
+
+  bool is_multi_device_function = IsMultiDevice(
+      ctx->FindFunctionDef(op->Name()), maybe_unspecified_device_name);
+
+  std::vector<Device*> input_dev_ptrs;
+  if (is_multi_device_function) {
+    TF_RETURN_IF_ERROR(
+        AddInputDevicesToCacheKey(ctx, op, &input_dev_ptrs, &cache_key));
+  }
+
   KernelAndDevice* kernel = ctx->GetCachedKernel(cache_key);
   if (kernel == nullptr) {
+    VLOG(2) << "Creating new kernel for " << op->Name() << " on device "
+            << maybe_unspecified_device_name;
     // If we are running a function on explicitly requested TPU,
     // compile it with XLA.
     // Note that it is not ideal, but currently ok, to set this
     // attribute after computing the kernel cache key above.
+    bool compile_with_xla = false;
     if (op->is_function() && device != nullptr &&
         (device->device_type() == "TPU" || device->device_type() == "XLA_GPU" ||
          device->device_type() == "XLA_CPU")) {
       op->MutableAttrs()->Set(kXlaCompileAttr, true);
+      compile_with_xla = true;
     }
 
     const NodeDef& ndef = op->MutableAttrs()->BuildNodeDef();
@@ -273,7 +388,6 @@ Status EagerLocalExecute(EagerOperation* op,
       status = SelectDevice(ndef, ctx, &device);
       if (!status.ok()) return status;
     }
-    CHECK(device != nullptr);
     if (ctx->LogDevicePlacement()) {
       LOG(INFO) << "Executing op " << ndef.op() << " in device "
                 << device->name();
@@ -285,9 +399,35 @@ Status EagerLocalExecute(EagerOperation* op,
           "Unable to find a FunctionLibraryRuntime corresponding to device ",
           device->name());
     }
-    kernel = new KernelAndDevice(ctx->GetRendezvous(), ctx->LogMemory(),
-                                 ctx->GetCollectiveExecutorHandle());
-    status = KernelAndDevice::Init(ndef, flr, ctx->runner(), kernel);
+    GraphCollector* graph_collector = nullptr;
+    if (ctx->ShouldStoreGraphs()) {
+      graph_collector = ctx->GetGraphCollector();
+    }
+    // Treat the function as multi_device only when we are not compiling
+    // it wholly with XLA. When compiling wholly with XLA, flr->CreateKernel
+    // will create an XlaLaunchOp kernel to compile and run the function.
+    if (is_multi_device_function && !compile_with_xla) {
+      // Multi-device functions don't use the rendezvous from eager context.
+      // If we use that rendezvous, multiple concurrent calls to the same
+      // function will likely result in collisions. However, this also means
+      // that we don't support legitimate sending/receiving across function
+      // boundary.
+      VLOG(2) << "Running " << ndef.op() << " using multi-device function. "
+              << "compile_with_xla=" << compile_with_xla
+              << ". Full node_def=" << ndef.DebugString();
+      kernel = new KernelAndDeviceFunc(
+          flr, ctx->pflr(), std::move(input_dev_ptrs), ctx->runner(),
+          ctx->GetCollectiveExecutorHandle(), ctx->HostCPU());
+    } else {
+      VLOG(2) << "Running " << ndef.op() << " using op kernel. "
+              << "compile_with_xla=" << compile_with_xla
+              << ". Full node_def=" << ndef.DebugString();
+      kernel = new KernelAndDeviceOp(
+          ctx->GetRendezvous(), ctx->LogMemory(), flr, ctx->runner(),
+          ctx->GetCollectiveExecutorHandle(), ctx->HostCPU());
+    }
+
+    status = kernel->Init(ndef, graph_collector);
     if (!status.ok()) {
       delete kernel;
       return status;
@@ -309,14 +449,16 @@ Status EagerLocalExecute(EagerOperation* op,
     device = kernel->device();
   }
   status = ValidateInputTypeAndPlacement(
-      ctx, device, op, kernel->kernel(),
-      ctx->ShouldStoreMetadata() ? ctx->RunMetadataProto() : nullptr);
+      ctx, device, op, kernel,
+      ctx->ShouldStoreStepStats() ? ctx->RunMetadataProto() : nullptr);
   if (!status.ok()) return status;
   std::unique_ptr<NodeExecStats> maybe_stats;
   StepStats* maybe_step_stats = nullptr;
   GraphCollector* graph_collector = nullptr;
-  if (ctx->ShouldStoreMetadata()) {
+  if (ctx->ShouldStoreGraphs()) {
     graph_collector = ctx->GetGraphCollector();
+  }
+  if (ctx->ShouldStoreStepStats()) {
     maybe_step_stats = ctx->RunMetadataProto()->mutable_step_stats();
     int64 now_nanos = Env::Default()->NowNanos();
     maybe_stats.reset(new NodeExecStats);
@@ -333,12 +475,15 @@ Status EagerLocalExecute(EagerOperation* op,
   if (ctx->Async()) {
     // Note that for async mode, execution order will make sure that all
     // input handles are ready before executing them.
-    // TODO(agarwal): Consider executing "cheap" kernels inline for performance.
+    // TODO(agarwal): Consider executing "cheap" kernels inline for
+    // performance.
     tensorflow::uint64 id = ctx->NextId();
     for (int i = 0; i < *num_retvals; ++i) {
-      (*retvals)[i] = new TensorHandle(id, /* d= */ kernel->OutputDevice(i),
-                                       /* op_device= */ kernel->device(),
-                                       output_dtypes[i], ctx);
+      (*retvals)[i] = new TensorHandle(
+          id, /* d= */ kernel->OutputDevice(i),
+          /* op_device= */ kernel->device(),
+          /* resource_device= */ kernel->OutputResourceDevice(i),
+          output_dtypes[i], ctx);
     }
     EagerNode* node = new ExecuteNode(
         id, ctx, op->Device(), op->Inputs(), kernel, maybe_stats.release(),
@@ -347,9 +492,9 @@ Status EagerLocalExecute(EagerOperation* op,
   } else {
     // Execute checks if retvals[i] is nullptr or not to figure if it needs to
     // allocate it.
-    status = EagerExecute(ctx, op->Device(), op->Inputs(), kernel,
-                          maybe_stats.get(), maybe_step_stats, graph_collector,
-                          retvals->data(), *num_retvals);
+    status = EagerKernelExecute(ctx, op->Device(), op->Inputs(), kernel,
+                                maybe_stats.get(), maybe_step_stats,
+                                graph_collector, retvals->data(), *num_retvals);
   }
 
   return status;
@@ -361,9 +506,9 @@ std::function<void()> GetRemoteTensorDestructor(
     uint64 op_id, int output_num) {
   return [ctx, eager_client, context_id, op_id, output_num]() {
     if (!ctx->HasActiveRemoteContext(context_id)) {
-      // This means that this tensor was pointing to a remote device, which has
-      // been changed out from under us. Simply return since there is nothing we
-      // can do.
+      // This means that this tensor was pointing to a remote device, which
+      // has been changed out from under us. Simply return since there is
+      // nothing we can do.
       return tensorflow::Status::OK();
     }
 
@@ -423,10 +568,10 @@ Status EagerRemoteSendTensor(EagerContext* ctx, TensorHandle* h,
 
   Device* tensor_handle_device = h->device();
 
-  // AsProtoTensorContent doesn't work when the tensor is on the GPU, hence copy
-  // it to the CPU before copying it out.
-  // TODO(nareshmodi): this is currently slow, but can be fixed by making tensor
-  // handles aware of more than one device.
+  // AsProtoTensorContent doesn't work when the tensor is on the GPU, hence
+  // copy it to the CPU before copying it out.
+  // TODO(nareshmodi): this is currently slow, but can be fixed by making
+  // tensor handles aware of more than one device.
   TensorHandle* actual_handle;
   if (tensor_handle_device != nullptr &&
       tensor_handle_device->device_type() != "CPU") {
@@ -458,7 +603,8 @@ Status EagerRemoteSendTensor(EagerContext* ctx, TensorHandle* h,
 
   *result = new TensorHandle(id, /*output_num=*/0, /*remote_shape_node_id=*/0,
                              tensor->dtype(), std::move(destructor),
-                             recv_device, recv_device, ctx);
+                             /*d=*/recv_device, /*op_device=*/recv_device,
+                             /*resource_device=*/nullptr, ctx);
   (*result)->SetRemoteShape(MakeUnique<TensorShape>(tensor->shape()));
 
   actual_handle->Unref();
@@ -494,10 +640,16 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals,
         // explicitly copy, and instead depend on the copy to happen locally
         // when the op is executed on the device.
         !OnSameTask(ctx, op->Device(), input_device)) {
+      tensorflow::Device* remote_cpu_device;
+      TF_RETURN_IF_ERROR(
+          CPUDeviceOnTask(ctx, op->Device(), &remote_cpu_device));
       // TODO(b/110044833): It's possible the same tensor gets copied to the
       // remote device repeatedly.
+      // Always copy to the remote CPU so that the actual device can be
+      // correctly determined after the kernel is selected/instantiated, since
+      // the op might have its inputs on host memory.
       TF_RETURN_IF_ERROR(MaybeCopyInputToExpectedDevice(
-          op, i, op->Device(), /* run_metadata= */ nullptr,
+          op, op->Device(), i, remote_cpu_device, /* run_metadata= */ nullptr,
           &(*op->MutableInputs())[i]));
     }
 
@@ -537,14 +689,25 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals,
 
   const tensorflow::uint64 id = remote_op->id();
   for (int i = 0; i < *num_retvals; i++) {
-    // TODO(nareshmodi): Change the callback to instead add the decref to a list
-    // of pending decrefs that we can send as a batch with the next execute.
+    // TODO(nareshmodi): Change the callback to instead add the decref to a
+    // list of pending decrefs that we can send as a batch with the next
+    // execute.
     std::function<void()> destructor =
         GetRemoteTensorDestructor(ctx, eager_client, context_id, id, i);
 
-    retvals[i] = new TensorHandle(remote_op->id(), i, remote_node_id,
-                                  output_dtypes[i], std::move(destructor),
-                                  op_device, op_device, op->EagerContext());
+    // The device_ and resource_device_ or this TensorHandle are not correct.
+    // It is pretty hard to make it correct because for multi-device functions,
+    // we don't know the output device until the function is instantiated.
+    // Luckily, we don't need to know the correct remote device here. We just
+    // need to know that it is remote. If we need to copy this tensor to this
+    // process, the remote end will know the correct device of this handle.
+    retvals[i] = new TensorHandle(
+        remote_op->id(), i, remote_node_id, output_dtypes[i],
+        std::move(destructor),
+        /*d=*/op_device, /*op_device=*/op_device,
+        /*resource_device=*/output_dtypes[i] == DT_RESOURCE ? op_device
+                                                            : nullptr,
+        op->EagerContext());
   }
 
   if (is_async) {
@@ -600,13 +763,16 @@ bool IsPinnableOp(const string& op_type) {
   static const gtl::FlatSet<string>* unpinnable_ops = new gtl::FlatSet<string>({
       "RandomUniform",
       "RandomUniformInt",
-      "RandomNormal",
+      "RandomStandardNormal",
       "StatelessRandomUniform",
       "StatelessRandomUniformInt",
       "StatelessRandomNormal",
   });
 
-  return unpinnable_ops->find(op_type) == unpinnable_ops->end();
+  // XRT ops refer to per-device handles that are not safe to move between
+  // devices.
+  return unpinnable_ops->find(op_type) == unpinnable_ops->end() &&
+         !absl::StartsWith(op_type, "XRT");
 }
 
 // The Op device may be updated if:
@@ -619,43 +785,56 @@ bool IsPinnableOp(const string& op_type) {
 // "TF_EAGER_ENABLE_SMALL_TENSOR_CPU_PINNING" to "0" or "false".
 Status MaybeUpdateOpDevice(EagerOperation* op) {
   EagerContext* ctx = op->EagerContext();
-  bool device_set_for_resource_variable = false;
   bool all_inputs_eligible_for_cpu_pinning =
-      ctx->PinSmallOpsToCPU() && IsPinnableOp(op->Name());
-
+      ctx->PinSmallOpsToCPU() && !op->is_function() && IsPinnableOp(op->Name());
+  Device* op_device = op->Device() == nullptr ? ctx->HostCPU() : op->Device();
   for (int i = 0; i < op->Inputs().size(); ++i) {
-    Device* input_op_device = op->Inputs()[i]->op_device();
-    VLOG(2) << "for op " << op->Name() << " input " << i << " "
-            << DataTypeString(op->Inputs()[i]->dtype) << " "
-            << (input_op_device == nullptr ? "cpu" : input_op_device->name())
-            << " " << (op->Device() == nullptr ? "cpu" : op->Device()->name());
-    if (op->Inputs()[i]->dtype == DT_RESOURCE &&
-        (input_op_device != op->Device() || input_op_device == nullptr)) {
-      Device* d = input_op_device == nullptr ? ctx->HostCPU() : input_op_device;
-      VLOG(1) << "Changing device of operation " << op->Name() << " to "
-              << d->name() << " because input #" << i
-              << " is a resource in this device.";
-      op->SetDevice(d);
-
-      device_set_for_resource_variable = true;
+    TensorHandle* tensor_handle = op->Inputs()[i];
+    if (tensor_handle->dtype == DT_RESOURCE) {
+      Device* resource_device = tensor_handle->resource_device();
+      VLOG(2) << "for op " << op->Name() << " input " << i << " "
+              << DataTypeString(tensor_handle->dtype)
+              << " input device = " << resource_device->name()
+              << ", op device = " << op_device->name();
+      // We check for `op->Device() == nullptr` because it can be later
+      // interpreted as unspecified device and a different device can
+      // be selected based on device priority. If any input to an op
+      // is a resource we must pin it to prevent different device selection.
+      // TODO(iga): null device can mean "unspecified" or "CPU". Clean this up.
+      if (resource_device != op_device || op->Device() == nullptr) {
+        VLOG(1) << (resource_device != op_device ? "Changing " : "Setting ")
+                << "device of operation " << op->Name() << " to "
+                << resource_device->name() << " because input #" << i
+                << " is a resource in this device.";
+        op->SetDevice(resource_device);
+      }
       all_inputs_eligible_for_cpu_pinning = false;
+      // No point in looking at other inputs. If there are other resources,
+      // they must have the same device and we already declared the op to be
+      // ineligible for CPU pinning.
+      break;
     } else if (all_inputs_eligible_for_cpu_pinning) {
-      TensorHandle* handle = op->Inputs()[i];
+      Device* input_device = tensor_handle->device();
+      input_device = input_device == nullptr ? ctx->HostCPU() : input_device;
+      VLOG(2) << "for op " << op->Name() << " input " << i << " "
+              << DataTypeString(tensor_handle->dtype)
+              << " input device = " << input_device->name()
+              << ", op device = " << op_device->name();
 
       // Input is on CPU.
-      if (input_op_device != nullptr && input_op_device != ctx->HostCPU()) {
+      if (input_device != ctx->HostCPU()) {
         all_inputs_eligible_for_cpu_pinning = false;
         continue;
       }
 
-      if (handle->dtype != DataType::DT_INT32 &&
-          handle->dtype != DataType::DT_INT64) {
+      if (tensor_handle->dtype != DataType::DT_INT32 &&
+          tensor_handle->dtype != DataType::DT_INT64) {
         all_inputs_eligible_for_cpu_pinning = false;
         continue;
       }
 
       int64 num_elements;
-      TF_RETURN_IF_ERROR(handle->NumElements(&num_elements));
+      TF_RETURN_IF_ERROR(tensor_handle->NumElements(&num_elements));
       if (num_elements > 64) {
         all_inputs_eligible_for_cpu_pinning = false;
       }
@@ -697,12 +876,12 @@ Status EagerExecute(EagerOperation* op,
   return EagerRemoteExecute(op, retvals->data(), num_retvals);
 }
 
-Status EagerExecute(EagerContext* ctx, Device* device,
-                    const gtl::InlinedVector<TensorHandle*, 4>& op_inputs,
-                    KernelAndDevice* kernel, NodeExecStats* maybe_stats,
-                    StepStats* maybe_step_stats,
-                    GraphCollector* graph_collector, TensorHandle** retvals,
-                    int num_retvals) {
+Status EagerKernelExecute(EagerContext* ctx, Device* device,
+                          const gtl::InlinedVector<TensorHandle*, 4>& op_inputs,
+                          KernelAndDevice* kernel, NodeExecStats* maybe_stats,
+                          StepStats* maybe_step_stats,
+                          GraphCollector* graph_collector,
+                          TensorHandle** retvals, int num_retvals) {
   if (device == nullptr) {
     // TODO(apassos) debug how the assignment below might return a different
     // device from the one requested above.
@@ -710,24 +889,62 @@ Status EagerExecute(EagerContext* ctx, Device* device,
   }
 
   std::vector<Tensor> outputs(1);
-  const MemoryTypeVector* output_memory_types = nullptr;
-  output_memory_types = &kernel->kernel()->output_memory_types();
-  std::vector<Tensor> inputs(op_inputs.size());
+
+  // If there are multiple references to a TensorHandle in 'op_inputs' we must
+  // increment the reference count of the corresponding Tensor or risk it being
+  // overwritten during kernel execution. The reference count is incremented
+  // below when we insert a copy of the Tensor into protected_tensors, and will
+  // be decremented once execution is complete.
+  std::vector<tensorflow::Tensor> protected_tensors;
   for (int i = 0; i < op_inputs.size(); ++i) {
-    const Tensor* input_tensor = nullptr;
-    TF_RETURN_IF_ERROR(op_inputs[i]->Tensor(&input_tensor));
-    inputs[i] = *input_tensor;
+    if (!op_inputs[i]->RefCountIsOne()) {
+      const Tensor* input_tensor = nullptr;
+      TF_RETURN_IF_ERROR(op_inputs[i]->Tensor(&input_tensor));
+      protected_tensors.push_back(*input_tensor);
+    }
   }
+
+  gtl::InlinedVector<TensorValue, 4> input_vector(op_inputs.size());
+  for (int i = 0; i < op_inputs.size(); ++i) {
+    TF_RETURN_IF_ERROR(op_inputs[i]->TensorValue(&input_vector[i]));
+  }
+
   //  TODO(apassos) figure out how to record stats for ops which are a part of
   //  functions.
   // TODO(agarwal): change Run to take vector of handles ?
   ScopedStepContainer* container = ctx->StepContainer();
   if (container == nullptr) {
-    TF_RETURN_IF_ERROR(kernel->Run(&inputs, &outputs, maybe_stats,
+    TF_RETURN_IF_ERROR(kernel->Run(input_vector, &outputs, maybe_stats,
                                    maybe_step_stats, graph_collector));
   } else {
-    TF_RETURN_IF_ERROR(kernel->Run(container, &inputs, &outputs, maybe_stats,
-                                   maybe_step_stats, graph_collector));
+    TF_RETURN_IF_ERROR(kernel->Run(container, input_vector, &outputs,
+                                   maybe_stats, maybe_step_stats,
+                                   graph_collector));
+  }
+  if (graph_collector != nullptr) {
+    mutex_lock ml(*ctx->MetadataMu());
+    {
+      GraphCollector* collector = ctx->GetGraphCollector();
+      mutex_lock mll(collector->mu);
+
+      // Adding to partition graphs for backward compatibility.
+      for (const auto& graph : collector->partitioned_graphs) {
+        *ctx->RunMetadataProto()->add_partition_graphs() = graph;
+      }
+
+      if (collector->dirty) {
+        auto* function_graphs = ctx->RunMetadataProto()->add_function_graphs();
+        *function_graphs->mutable_post_optimization_graph() =
+            collector->optimized_graph;
+        *function_graphs->mutable_pre_optimization_graph() =
+            collector->raw_graph;
+        for (const auto& graph : collector->partitioned_graphs) {
+          *function_graphs->add_partition_graphs() = graph;
+        }
+      }
+
+      collector->ClearGraphs();
+    }
   }
   if (maybe_stats != nullptr) {
     int64 nanos = Env::Default()->NowNanos();
@@ -737,34 +954,28 @@ Status EagerExecute(EagerContext* ctx, Device* device,
     maybe_stats->set_all_end_rel_micros(nanos / EnvTime::kMicrosToNanos -
                                         maybe_stats->all_start_micros());
     maybe_stats->set_all_end_rel_nanos(nanos - maybe_stats->all_start_nanos());
-    mutex_lock ml(*ctx->MetadataMu());
-    if (ctx->ShouldStoreMetadata()) {
+    if (ctx->ShouldStoreStepStats()) {
+      mutex_lock ml(*ctx->MetadataMu());
       {
-        GraphCollector* collector = ctx->GetGraphCollector();
-        mutex_lock mll(collector->mu);
-        for (const auto& graph : collector->graphs) {
-          *ctx->RunMetadataProto()->add_partition_graphs() = graph;
+        auto* step_stats = ctx->RunMetadataProto()->mutable_step_stats();
+        // Lazily initialize the RunMetadata with information about all devices
+        // if this is the first call.
+        while (step_stats->dev_stats_size() < ctx->devices()->size()) {
+          step_stats->add_dev_stats();
         }
-        collector->graphs.clear();
-      }
-      auto* step_stats = ctx->RunMetadataProto()->mutable_step_stats();
-      // Lazily initialize the RunMetadata with information about all devices if
-      // this is the first call.
-      while (step_stats->dev_stats_size() < ctx->devices()->size()) {
-        step_stats->add_dev_stats();
-      }
-      // Find the current device's index.
-      int device_idx = 0;
-      for (int i = 0; i < ctx->devices()->size(); ++i) {
-        if (ctx->devices()->at(i) == device) {
-          device_idx = i;
-          break;
+        // Find the current device's index.
+        int device_idx = 0;
+        for (int i = 0; i < ctx->devices()->size(); ++i) {
+          if (ctx->devices()->at(i) == device) {
+            device_idx = i;
+            break;
+          }
         }
+        // Populate the device stats for this device.
+        auto* dev_stats = step_stats->mutable_dev_stats(device_idx);
+        dev_stats->set_device(device->name());
+        *dev_stats->add_node_stats() = *maybe_stats;
       }
-      // Populate the device stats for this device.
-      auto* dev_stats = step_stats->mutable_dev_stats(device_idx);
-      dev_stats->set_device(device->name());
-      *dev_stats->add_node_stats() = *maybe_stats;
     }
   }
   DCHECK_EQ(num_retvals, outputs.size());
@@ -775,8 +986,8 @@ Status EagerExecute(EagerContext* ctx, Device* device,
                            /* op_device= */ device, ctx);
     } else {
       // In the async case, the retval is not a nullptr, and its device is
-      // already set since all TensorHandles always have their device set during
-      // construction.
+      // already set since all TensorHandles always have their device set
+      // during construction.
       DCHECK_EQ(device, retvals[i]->op_device());
       DCHECK_EQ(kernel->OutputDevice(i), retvals[i]->device());
 
@@ -807,25 +1018,6 @@ Status LocalEagerCopyToDevice(TensorHandle* h, EagerContext* ctx, Device* dstd,
   }
 }
 
-Status FindDeviceFromName(EagerContext* ctx, const char* device_name,
-                          Device** device) {
-  *device = ctx->HostCPU();
-  if (device_name == nullptr || strlen(device_name) == 0) {
-    return Status::OK();
-  }
-
-  auto status = ctx->local_device_mgr()->LookupDevice(device_name, device);
-  if (status.ok()) {
-    return status;
-  }
-
-  if (ctx->remote_device_mgr() != nullptr) {
-    return ctx->remote_device_mgr()->LookupDevice(device_name, device);
-  }
-
-  return status;
-}
-
 Status ExecuteSend(EagerContext* ctx, tensorflow::Device* device,
                    TensorHandle* h, StringPiece wire_id,
                    const string& recv_device) {
@@ -887,8 +1079,8 @@ Status ExecuteRecv(EagerContext* ctx, tensorflow::Device* device,
   return Status::OK();
 }
 
-// This gets a unique wire ID. We add a random identifier so that if the worker
-// has other clients that it is servicing, we don't have any collision.
+// This gets a unique wire ID. We add a random identifier so that if the
+// worker has other clients that it is servicing, we don't have any collision.
 string GetUniqueWireID() {
   static tensorflow::uint64 random_seed = random::New64();
   static tensorflow::mutex wireid_mutex(tensorflow::LINKER_INITIALIZED);
diff --git a/tensorflow/core/common_runtime/eager/execute.h b/tensorflow/core/common_runtime/eager/execute.h
index 6143a52d4b9c83444eb98567decf26dbfca58504..4945688900a71e9a1343ad7fdad285fb5775aa3b 100644
--- a/tensorflow/core/common_runtime/eager/execute.h
+++ b/tensorflow/core/common_runtime/eager/execute.h
@@ -41,14 +41,14 @@ Status EagerExecute(
     tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2>* retvals,
     int* num_retvals);
 
-// Low-level utility to execute the kernel specified by kernel on device device,
-// with the inputs op_inputs, in the context ctx.
-Status EagerExecute(EagerContext* ctx, Device* device,
-                    const gtl::InlinedVector<TensorHandle*, 4>& op_inputs,
-                    KernelAndDevice* kernel, NodeExecStats* maybe_stats,
-                    StepStats* maybe_step_stats,
-                    GraphCollector* graph_collector, TensorHandle** retvals,
-                    int num_retvals);
+// Low-level utility to execute the kernel specified by kernel on device
+// 'device', with the inputs op_inputs, in the context 'ctx'.
+Status EagerKernelExecute(EagerContext* ctx, Device* device,
+                          const gtl::InlinedVector<TensorHandle*, 4>& op_inputs,
+                          KernelAndDevice* kernel, NodeExecStats* maybe_stats,
+                          StepStats* maybe_step_stats,
+                          GraphCollector* graph_collector,
+                          TensorHandle** retvals, int num_retvals);
 
 // Low-level utility to copy a tensor handle from one device to another.
 Status EagerCopyToDevice(TensorHandle* h, EagerContext* ctx,
diff --git a/tensorflow/core/common_runtime/eager/execute_node.h b/tensorflow/core/common_runtime/eager/execute_node.h
index a99d509dd60c4ad50b67ef237423570d7b595234..4459e3221b9f2387867e1efed4324322619e4388 100644
--- a/tensorflow/core/common_runtime/eager/execute_node.h
+++ b/tensorflow/core/common_runtime/eager/execute_node.h
@@ -64,7 +64,7 @@ class ExecuteNode : public EagerNode {
   }
 
   tensorflow::Status Run() override {
-    const Status status = EagerExecute(
+    const Status status = EagerKernelExecute(
         ctx_, op_device_, inputs_, kernel_, maybe_stats_.get(),
         maybe_step_stats_, graph_collector_, retvals_.begin(), retvals_.size());
     if (status.ok()) {
diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.cc b/tensorflow/core/common_runtime/eager/kernel_and_device.cc
index 317e9a16074b37ef6ecaf1d7f8c1a2daa412f75e..60b807588228c91194590558ba1c96513187a458 100644
--- a/tensorflow/core/common_runtime/eager/kernel_and_device.cc
+++ b/tensorflow/core/common_runtime/eager/kernel_and_device.cc
@@ -15,49 +15,116 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
 
+#include "absl/strings/match.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/eager/attr_builder.h"
+#include "tensorflow/core/common_runtime/process_function_library_runtime.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
 #include "tensorflow/core/common_runtime/step_stats_collector.h"
 #include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/step_stats.pb.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/platform/fingerprint.h"
 #include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/tracing.h"
 #include "tensorflow/core/public/version.h"
 #include "tensorflow/core/util/tensor_slice_reader_cache.h"
+#ifndef __ANDROID__
+#include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
+#endif
 
 namespace tensorflow {
 
-// static
-Status KernelAndDevice::Init(const NodeDef& ndef, FunctionLibraryRuntime* flr,
-                             std::function<void(std::function<void()>)>* runner,
-                             KernelAndDevice* out) {
+KernelAndDeviceFunc::~KernelAndDeviceFunc() {
+  if (handle_ != kInvalidHandle) {
+    Status status = pflr_->ReleaseHandle(handle_);
+    if (!status.ok()) {
+      LOG(INFO) << "Ignoring error status when releasing multi-device function "
+                   "handle "
+                << status.ToString();
+    }
+  }
+}
+
+Status KernelAndDeviceOp::Init(const NodeDef& ndef,
+                               GraphCollector* graph_collector) {
   OpKernel* k = nullptr;
-  TF_RETURN_IF_ERROR(flr->CreateKernel(ndef, &k));
-  out->device_ = flr->device();
-  out->kernel_.reset(k);
-  out->flr_ = flr;
-  out->runner_ = runner;
-  out->default_runner_ = [](std::function<void()> f) { f(); };
-
-  // Update output_dtypes_.
+  TF_RETURN_IF_ERROR(flr_->CreateKernel(ndef, &k));
+  kernel_.reset(k);
+  return Status::OK();
+}
+
+Status KernelAndDeviceFunc::Init(const NodeDef& ndef,
+                                 GraphCollector* graph_collector) {
   const OpDef* op_def = nullptr;
   const FunctionDef* function_def =
-      flr->GetFunctionLibraryDefinition()->Find(ndef.op());
+      flr_->GetFunctionLibraryDefinition()->Find(ndef.op());
   if (function_def != nullptr) {
     op_def = &(function_def->signature());
   } else {
     TF_RETURN_IF_ERROR(OpDefForOp(ndef.op().c_str(), &op_def));
   }
-  return OutputTypesForNode(ndef, *op_def, &out->output_dtypes_);
+  TF_RETURN_IF_ERROR(
+      InOutTypesForNode(ndef, *op_def, &input_dtypes_, &output_dtypes_));
+
+  FunctionLibraryRuntime::InstantiateOptions options;
+  options.target = device_->name();
+  options.is_multi_device_function = true;
+  for (const Device* device : input_devices_) {
+    options.input_devices.push_back(device->name());
+  }
+
+  const auto& it = ndef.attr().find("executor_type");
+  if (it != ndef.attr().end()) {
+    options.executor_type = it->second.s();
+  }
+#ifndef __ANDROID__
+  // Android tf library does not include grappler.
+  const auto& config_it = ndef.attr().find("config_proto");
+  if (it != ndef.attr().end()) {
+    ConfigProto config_proto;
+    if (!config_proto.ParseFromString(config_it->second.s())) {
+      return errors::InvalidArgument(
+          "Failed to parse config_proto attribute as tensorflow::ConfigProto "
+          "proto.");
+    }
+    grappler::GrapplerItem::OptimizationOptions optimization_options;
+
+    // Tensorflow 2.0 in eager mode with automatic control dependencies will
+    // prune all nodes that are not in the transitive fanin of the fetch nodes.
+    // However because the function will be executed via FunctionLibraryRuntime,
+    // and current function implementation does not prune stateful and dataset
+    // ops, we rely on Grappler to do the correct graph pruning.
+    optimization_options.allow_pruning_stateful_and_dataset_ops = true;
+
+    // All the nested function calls will be executed and optimized via
+    // PartitionedCallOp, there is no need to optimize functions now.
+    optimization_options.optimize_function_library = false;
+
+    options.optimize_graph_fn = std::bind(
+        grappler::OptimizeGraph, std::placeholders::_1, std::placeholders::_2,
+        std::placeholders::_3, std::placeholders::_4, std::placeholders::_5,
+        config_proto, function_def->signature().name(), optimization_options,
+        std::placeholders::_6);
+  }
+#endif
+  options.graph_collector = graph_collector;
+
+  TF_RETURN_IF_ERROR(
+      pflr_->Instantiate(ndef.op(), AttrSlice(ndef), options, &handle_));
+  return pflr_->GetOutputDevices(handle_, &output_devices_);
+  return Status::OK();
 }
 
-Status KernelAndDevice::Run(std::vector<Tensor>* inputs,
+Status KernelAndDevice::Run(const gtl::InlinedVector<TensorValue, 4>& inputs,
                             std::vector<Tensor>* outputs, NodeExecStats* stats,
                             StepStats* step_stats,
                             GraphCollector* graph_collector) {
@@ -68,16 +135,41 @@ Status KernelAndDevice::Run(std::vector<Tensor>* inputs,
                    graph_collector);
 }
 
-Status KernelAndDevice::Run(ScopedStepContainer* step_container,
-                            std::vector<Tensor>* inputs,
-                            std::vector<Tensor>* outputs, NodeExecStats* stats,
-                            StepStats* step_stats,
-                            GraphCollector* graph_collector) {
-  gtl::InlinedVector<TensorValue, 4> input_vector;
-  for (Tensor& t : *inputs) {
-    input_vector.push_back(TensorValue(&t));
+namespace {
+void UpdateStats(OpKernelContext* context,
+                 StepStatsCollector* step_stats_collector,
+                 NodeExecStats* stats) {
+  for (const auto& allocator_pair : context->ConsumeWrappedAllocators()) {
+    AllocatorMemoryUsed* memory = stats->add_memory();
+    memory->set_allocator_name(allocator_pair.first->Name());
+    auto sizes = allocator_pair.second->GetSizes();
+    memory->set_total_bytes(std::get<0>(sizes));
+    memory->set_peak_bytes(std::get<1>(sizes));
+    memory->set_live_bytes(std::get<2>(sizes));
+
+    absl::optional<AllocatorStats> allocator_stats =
+        allocator_pair.first->GetStats();
+    if (stats) {
+      memory->set_allocator_bytes_in_use(allocator_stats->bytes_in_use);
+    }
+    allocator_pair.second->GetRecordsAndUnRef();
+  }
+  auto* ms = stats->mutable_memory_stats();
+  ms->set_temp_memory_size(context->temp_memory_allocated());
+  for (const auto& alloc_id : context->persistent_alloc_ids()) {
+    ms->mutable_persistent_tensor_alloc_ids()->Add(alloc_id);
   }
 
+  ms->set_persistent_memory_size(context->persistent_memory_allocated());
+  step_stats_collector->Finalize();
+}
+}  // anonymous namespace
+
+Status KernelAndDeviceOp::Run(ScopedStepContainer* step_container,
+                              const gtl::InlinedVector<TensorValue, 4>& inputs,
+                              std::vector<Tensor>* outputs,
+                              NodeExecStats* stats, StepStats* step_stats,
+                              GraphCollector* graph_collector) {
   std::vector<AllocatorAttributes> out_attrs(kernel_->num_outputs());
   for (size_t i = 0; i < out_attrs.size(); ++i) {
     out_attrs[i].set_on_host(kernel_->output_memory_types()[i] ==
@@ -85,7 +177,7 @@ Status KernelAndDevice::Run(ScopedStepContainer* step_container,
   }
 
   gtl::InlinedVector<DeviceContext*, 4> input_device_contexts;
-  for (int i = 0; i < inputs->size(); i++) {
+  for (int i = 0; i < inputs.size(); i++) {
     DeviceContext* device_context = nullptr;
     if (device_->tensorflow_gpu_device_info() != nullptr) {
       device_context = device_->tensorflow_gpu_device_info()->default_context;
@@ -96,7 +188,7 @@ Status KernelAndDevice::Run(ScopedStepContainer* step_container,
   OpKernelContext::Params params;
   params.device = device_;
   params.frame_iter = FrameAndIter(0, 0);
-  params.inputs = &input_vector;
+  params.inputs = &inputs;
   params.op_kernel = kernel_.get();
   params.resource_manager = device_->resource_manager();
   params.output_attr_array = gtl::vector_as_array(&out_attrs);
@@ -104,6 +196,7 @@ Status KernelAndDevice::Run(ScopedStepContainer* step_container,
   params.slice_reader_cache = &slice_reader_cache_;
   params.rendezvous = rendez_;
   params.cancellation_manager = &cm_;
+  cm_.Reset();
   params.log_memory = log_memory_;
   std::unique_ptr<StepStatsCollector> step_stats_collector;
   if (stats != nullptr) {
@@ -112,11 +205,7 @@ Status KernelAndDevice::Run(ScopedStepContainer* step_container,
     params.stats_collector = step_stats_collector.get();
     params.graph_collector = graph_collector;
   }
-  if (runner_ == nullptr) {
-    params.runner = &default_runner_;
-  } else {
-    params.runner = runner_;
-  }
+  params.runner = runner_ != nullptr ? runner_ : &default_runner_;
 
   params.step_container = step_container;
   params.collective_executor =
@@ -134,7 +223,16 @@ Status KernelAndDevice::Run(ScopedStepContainer* step_container,
     device_->ComputeAsync(async, &context, [&done]() { done.Notify(); });
     done.WaitForNotification();
   } else {
-    device_->Compute(kernel_.get(), &context);
+    const string& op_name = kernel_->name();
+    // If tracing if off, the overheads of ScopedAnnotation and ScopedActivity
+    // are negligible.
+    if (device_->TraceUsingAnnotations()) {
+      tracing::ScopedAnnotation activity(op_name, kernel_->type_string());
+      device_->Compute(kernel_.get(), &context);
+    } else {
+      tracing::ScopedActivity activity(op_name, kernel_->type_string());
+      device_->Compute(kernel_.get(), &context);
+    }
   }
   if (!context.status().ok()) return context.status();
 
@@ -143,37 +241,112 @@ Status KernelAndDevice::Run(ScopedStepContainer* step_container,
     outputs->push_back(Tensor(*context.mutable_output(i)));
   }
   if (stats != nullptr) {
-    for (const auto& allocator_pair : context.ConsumeWrappedAllocators()) {
-      AllocatorMemoryUsed* memory = stats->add_memory();
-      memory->set_allocator_name(allocator_pair.first->Name());
-      auto sizes = allocator_pair.second->GetSizes();
-      memory->set_total_bytes(std::get<0>(sizes));
-      memory->set_peak_bytes(std::get<1>(sizes));
-      memory->set_live_bytes(std::get<2>(sizes));
-
-      AllocatorStats allocator_stats;
-      allocator_pair.first->GetStats(&allocator_stats);
-      memory->set_allocator_bytes_in_use(allocator_stats.bytes_in_use);
-      allocator_pair.second->GetRecordsAndUnRef();
-    }
-    auto* ms = stats->mutable_memory_stats();
-    ms->set_temp_memory_size(context.temp_memory_allocated());
-    for (const auto& alloc_id : context.persistent_alloc_ids()) {
-      ms->mutable_persistent_tensor_alloc_ids()->Add(alloc_id);
-    }
+    UpdateStats(&context, step_stats_collector.get(), stats);
+  }
+  return Status::OK();
+}
 
-    ms->set_persistent_memory_size(context.persistent_memory_allocated());
+Status KernelAndDeviceFunc::Run(
+    ScopedStepContainer* step_container,
+    const gtl::InlinedVector<TensorValue, 4>& inputs,
+    std::vector<Tensor>* outputs, NodeExecStats* stats, StepStats* step_stats,
+    GraphCollector* graph_collector) {
+  FunctionLibraryRuntime::Options opts;
+  // We don't pass rendezvous from eager context because we can get tensor
+  // name collisions in send/recv ops when running multiple instances
+  // of the same multi-device function concurrently. Instead, we ask the
+  // function library runtime to create a new for this call. We could have
+  // created one here but it requires more state to be kept in
+  // KernelAndDeviceFunc.
+  opts.rendezvous = nullptr;
+  opts.create_rendezvous = true;
+  opts.cancellation_manager = &cm_;
+  cm_.Reset();
+  // eager runtime does not yet support collective ops.
+  opts.collective_executor = nullptr;
+  opts.allow_dead_tensors = true;
+  opts.step_container = step_container;
+  opts.collective_executor =
+      collective_executor_ ? collective_executor_->get() : nullptr;
+
+  std::unique_ptr<StepStatsCollector> step_stats_collector;
+  if (stats != nullptr) {
+    step_stats_collector.reset(new StepStatsCollector(step_stats));
+  }
+  opts.stats_collector = step_stats_collector.get();
+  opts.runner = (runner_ == nullptr) ? &default_runner_ : runner_;
+
+  Notification done;
+  Status status;
+  outputs->clear();
+  std::vector<Tensor> input_vector;
+  input_vector.reserve(inputs.size());
+  for (const TensorValue& tensor_value : inputs) {
+    input_vector.push_back(*tensor_value.tensor);
+  }
+
+  flr_->Run(opts, handle_, input_vector, outputs,
+            [&status, &done](const Status& s) {
+              status = s;
+              done.Notify();
+            });
+  done.WaitForNotification();
+
+  if (step_stats_collector != nullptr) {
     step_stats_collector->Finalize();
   }
-  return Status::OK();
+  return status;
+}
+
+tensorflow::Device* KernelAndDeviceOp::OutputDevice(int idx) const {
+  if (kernel_->output_memory_types()[idx] == HOST_MEMORY) {
+    return nullptr;
+  }
+  return device_;
 }
 
-tensorflow::Device* KernelAndDevice::OutputDevice(int idx) const {
-  if (device_ != nullptr &&
-      kernel_->output_memory_types()[idx] == HOST_MEMORY) {
+tensorflow::Device* KernelAndDeviceFunc::OutputDevice(int idx) const {
+  if (output_dtypes_[idx] == DT_RESOURCE) {
     return nullptr;
   }
+  return output_devices_[idx];
+}
+
+tensorflow::Device* KernelAndDeviceOp::OutputResourceDevice(int idx) const {
+  if (kernel_->output_type(idx) == DT_RESOURCE) {
+    return device_;
+  }
+  return nullptr;
+}
+
+tensorflow::Device* KernelAndDeviceFunc::OutputResourceDevice(int idx) const {
+  if (output_dtypes_[idx] == DT_RESOURCE) {
+    return output_devices_[idx];
+  }
+  return nullptr;
+}
+
+DataType KernelAndDeviceOp::input_type(int i) const {
+  return kernel_->input_type(i);
+}
+
+DataType KernelAndDeviceFunc::input_type(int i) const {
+  return input_dtypes_[i];
+}
+
+Device* KernelAndDeviceOp::InputDevice(int i) const {
+  if (kernel_->input_memory_types()[i] == HOST_MEMORY) {
+    return host_cpu_device_;
+  }
   return device_;
 }
 
+Device* KernelAndDeviceFunc::InputDevice(int i) const {
+  if (input_dtypes_[i] == DT_RESOURCE) {
+    return host_cpu_device_;
+  } else {
+    return input_devices_[i];
+  }
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.h b/tensorflow/core/common_runtime/eager/kernel_and_device.h
index ee430b7fc70e1f4e5256e9dd28f4240ce57de86a..027168dd6eac59a758e943eebfd2af36fd5d0914 100644
--- a/tensorflow/core/common_runtime/eager/kernel_and_device.h
+++ b/tensorflow/core/common_runtime/eager/kernel_and_device.h
@@ -38,8 +38,13 @@ namespace tensorflow {
 // include the proto header
 class NodeExecStats;
 class StepStats;
+class ProcessFunctionLibraryRuntime;
+class FunctionLibraryRuntime;
 
-// KernelAndDevice encapsulates an instantiated kernel and the device it is on.
+// KernelAndDevice encapsulates the logic needed to run a computation eagerly.
+// The computation can be a single instantiated kernel (implemented by
+// KernelAndDeviceOp below) or a multi-device function (implemented by
+// KernelAndDeviceFunc below).
 //
 // Also see:
 // https://www.tensorflow.org/code/tensorflow/core/common_runtime/kernel_benchmark_testlib.h
@@ -47,59 +52,171 @@ class StepStats;
 // https://www.tensorflow.org/code/tensorflow/core/kernels/ops_testutil.h
 class KernelAndDevice {
  public:
-  // Populates 'out' with a kernel appropriate for 'ndef'.
+  // Populates this with a kernel appropriate for 'ndef'.
   //
   // The provided FunctionLibraryRuntime MUST outlive all calls to
   // Run() on the returned KernelAndDevice.
-  static Status Init(const NodeDef& ndef, FunctionLibraryRuntime* flr,
-                     std::function<void(std::function<void()>)>* runner,
-                     KernelAndDevice* out);
-
-  KernelAndDevice(tensorflow::Rendezvous* rendez, bool log_memory)
-      : KernelAndDevice(rendez, log_memory, nullptr) {}
+  virtual Status Init(const NodeDef& ndef, GraphCollector* graph_collector) = 0;
 
+  // Non-multi-device functions are run using regular CallOp and look like
+  // primitive operations from KernelAndDevice perspective.
   KernelAndDevice(
-      tensorflow::Rendezvous* rendez, bool log_memory,
-      std::unique_ptr<CollectiveExecutor::Handle> collective_executor)
-      : device_(nullptr),
-        flr_(nullptr),
-        rendez_(rendez),
-        log_memory_(log_memory),
+      FunctionLibraryRuntime* flr,
+      std::function<void(std::function<void()>)>* runner,
+      std::unique_ptr<CollectiveExecutor::Handle> collective_executor,
+      Device* host_cpu_device)
+      : device_(flr->device()),
+        host_cpu_device_(host_cpu_device),
+        flr_(flr),
+        runner_(runner),
+        default_runner_([](std::function<void()> f) { f(); }),
         collective_executor_(std::move(collective_executor)) {}
 
-  // TODO(ashankar): Handle list-valued inputs.
-  Status Run(std::vector<Tensor>* inputs, std::vector<Tensor>* outputs,
-             NodeExecStats* stats, StepStats* step_stats,
-             GraphCollector* graph_collector);
+  virtual ~KernelAndDevice() {}
 
-  Status Run(ScopedStepContainer* step_container, std::vector<Tensor>* inputs,
+  // TODO(ashankar): Handle list-valued inputs.
+  Status Run(const gtl::InlinedVector<TensorValue, 4>& inputs,
              std::vector<Tensor>* outputs, NodeExecStats* stats,
              StepStats* step_stats, GraphCollector* graph_collector);
 
-  Device* OutputDevice(int idx) const;
-
-  const OpKernel* kernel() const { return kernel_.get(); }
-
+  virtual Status Run(ScopedStepContainer* step_container,
+                     const gtl::InlinedVector<TensorValue, 4>& inputs,
+                     std::vector<Tensor>* outputs, NodeExecStats* stats,
+                     StepStats* step_stats,
+                     GraphCollector* graph_collector) = 0;
+
+  virtual Device* InputDevice(int i) const = 0;
+  virtual Device* OutputDevice(int idx) const = 0;
+  // If idx'th output is a resource, returns the device backing the resource.
+  // Else, returns nullptr.
+  virtual Device* OutputResourceDevice(int idx) const = 0;
+
+  // Returns nullptr for functions.
+  virtual const OpKernel* kernel() const = 0;
+
+  // Returns the device on which this kernel will run. In the case of
+  // multi-device functions, this is the default device that is passed to the
+  // placer but actual computation can happen on a different set of devices.
+  // Also, outputs can be produced on devices different from what this method
+  // returns.
   Device* device() const { return device_; }
 
-  const DataTypeVector& output_dtypes() { return output_dtypes_; }
+  virtual const DataTypeVector& output_dtypes() const = 0;
 
- private:
+  virtual DataType input_type(int i) const = 0;
+  virtual int num_inputs() const = 0;
+  virtual int num_outputs() const = 0;
+
+ protected:
   // TODO(apassos) Consider a shared cancellation manager. Note that this
   // cancellation manager is not useful to actually cancel anything, and is
   // provided here only for the few kernels which can't handle one being
   // missing.
   CancellationManager cm_;
+  Device* const device_;           // non-null
+  Device* const host_cpu_device_;  // non-null
+  FunctionLibraryRuntime* const flr_;
+  std::function<void(std::function<void()>)>* const runner_;
+  std::function<void(std::function<void()>)> default_runner_;
+  const std::unique_ptr<CollectiveExecutor::Handle> collective_executor_;
+};
+
+// Represents an op kernel and the device it will be run on.
+class KernelAndDeviceOp final : public KernelAndDevice {
+ public:
+  KernelAndDeviceOp(
+      tensorflow::Rendezvous* rendez, bool log_memory,
+      FunctionLibraryRuntime* flr,
+      std::function<void(std::function<void()>)>* runner,
+      std::unique_ptr<CollectiveExecutor::Handle> collective_executor,
+      Device* host_cpu_device)
+      : KernelAndDevice(flr, runner, std::move(collective_executor),
+                        host_cpu_device),
+        rendez_(rendez),
+        log_memory_(log_memory) {}
+
+  Status Init(const NodeDef& ndef, GraphCollector* graph_collector) override;
+
+  using KernelAndDevice::Run;
+
+  Status Run(ScopedStepContainer* step_container,
+             const gtl::InlinedVector<TensorValue, 4>& inputs,
+             std::vector<Tensor>* outputs, NodeExecStats* stats,
+             StepStats* step_stats, GraphCollector* graph_collector) override;
+
+  const OpKernel* kernel() const override { return kernel_.get(); }
+
+  Device* InputDevice(int i) const override;
+  Device* OutputDevice(int idx) const override;
+  Device* OutputResourceDevice(int idx) const override;
+
+  DataType input_type(int i) const override;
+  const DataTypeVector& output_dtypes() const override {
+    return kernel_->output_types();
+  }
+  int num_inputs() const override { return kernel_->num_inputs(); }
+  int num_outputs() const override { return kernel_->num_outputs(); }
+
+ private:
   std::unique_ptr<OpKernel> kernel_;
-  Device* device_;
-  FunctionLibraryRuntime* flr_;
+  Rendezvous* const rendez_;
   checkpoint::TensorSliceReaderCacheWrapper slice_reader_cache_;
-  Rendezvous* rendez_;
-  DataTypeVector output_dtypes_;
-  std::function<void(std::function<void()>)>* runner_;
-  std::function<void(std::function<void()>)> default_runner_;
   const bool log_memory_;
-  const std::unique_ptr<CollectiveExecutor::Handle> collective_executor_;
+};
+
+// Represents a multi-device function. Functions can also be run using
+// various function-calling kernels including CallOp and PartitionedCallOp.
+// In such cases, KernelAndDeviceOp is used.
+class KernelAndDeviceFunc final : public KernelAndDevice {
+ public:
+  KernelAndDeviceFunc(
+      FunctionLibraryRuntime* flr, ProcessFunctionLibraryRuntime* pflr,
+      std::vector<Device*> input_devices,
+      std::function<void(std::function<void()>)>* runner,
+      std::unique_ptr<CollectiveExecutor::Handle> collective_executor,
+      Device* host_cpu_device)
+      : KernelAndDevice(flr, runner, std::move(collective_executor),
+                        host_cpu_device),
+        pflr_(pflr),
+        handle_(kInvalidHandle),
+        input_devices_(std::move(input_devices)) {}
+
+  virtual ~KernelAndDeviceFunc();
+
+  Status Init(const NodeDef& ndef, GraphCollector* graph_collector) override;
+
+  using KernelAndDevice::Run;
+
+  Status Run(ScopedStepContainer* step_container,
+             const gtl::InlinedVector<TensorValue, 4>& inputs,
+             std::vector<Tensor>* outputs, NodeExecStats* stats,
+             StepStats* step_stats, GraphCollector* graph_collector) override;
+
+  const OpKernel* kernel() const override { return nullptr; }
+
+  Device* InputDevice(int i) const override;
+  Device* OutputDevice(int idx) const override;
+  Device* OutputResourceDevice(int idx) const override;
+
+  DataType input_type(int i) const override;
+  const DataTypeVector& output_dtypes() const override {
+    return output_dtypes_;
+  }
+  int num_inputs() const override { return input_dtypes_.size(); }
+  int num_outputs() const override { return output_dtypes_.size(); }
+
+ private:
+  ProcessFunctionLibraryRuntime* const pflr_;
+  FunctionLibraryRuntime::Handle handle_;
+  // CPU devices are null. Resource handles' devices are actual backing
+  // devices.
+  std::vector<Device*> output_devices_;
+  // CPU devices are not null. Resource handles' devices are actual backing
+  // devices.
+  std::vector<Device*> input_devices_;
+
+  DataTypeVector input_dtypes_;
+  DataTypeVector output_dtypes_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device_test.cc b/tensorflow/core/common_runtime/eager/kernel_and_device_test.cc
index 3ffed3ce321e79d021c302acf444f93cc9ccce53..703f3eb9b750f031ff0f69b3395a32c1d9414168 100644
--- a/tensorflow/core/common_runtime/eager/kernel_and_device_test.cc
+++ b/tensorflow/core/common_runtime/eager/kernel_and_device_test.cc
@@ -27,10 +27,13 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/eager/attr_builder.h"
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/process_function_library_runtime.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/protobuf/config.pb.h"
 #include "tensorflow/core/public/version.h"
+#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace {
@@ -41,20 +44,27 @@ class TestEnv {
     std::vector<std::unique_ptr<Device>> devices;
     devices.push_back(
         DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));
+    cpu_device_ = devices.back().get();
     device_mgr_ = absl::make_unique<DeviceMgr>(std::move(devices));
-    flib_runtime_ = NewFunctionLibraryRuntime(
-        device_mgr_.get(), Env::Default(), device_mgr_->ListDevices()[0],
-        TF_GRAPH_DEF_VERSION, &flib_def_, nullptr, {}, nullptr);
-  }
+    OptimizerOptions opts;
+    pflr_ = tensorflow::MakeUnique<ProcessFunctionLibraryRuntime>(
+        device_mgr_.get(), Env::Default(), TF_GRAPH_DEF_VERSION, &flib_def_,
+        opts, /*default_thread_pool=*/nullptr, /*cluster_flr=*/nullptr);
 
-  FunctionLibraryRuntime* function_library_runtime() const {
-    return flib_runtime_.get();
+    flr_ = pflr_->GetFLR("/job:a/replica:0/task:0/device:CPU:0");
+    CHECK(flr_ != nullptr);
   }
 
+  FunctionLibraryRuntime* function_library_runtime() const { return flr_; }
+  ProcessFunctionLibraryRuntime* pflr() const { return pflr_.get(); }
+  Device* cpu_device() { return cpu_device_; }
+
  private:
   FunctionLibraryDefinition flib_def_;
   std::unique_ptr<DeviceMgr> device_mgr_;
-  std::unique_ptr<FunctionLibraryRuntime> flib_runtime_;
+  FunctionLibraryRuntime* flr_;
+  std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
+  Device* cpu_device_;
 };
 
 void BM_CreateGraph(int iters) {
@@ -106,11 +116,11 @@ void BM_KernelAndDeviceInit(int iters) {
                    .NumInputs(2)
                    .BuildNodeDef());
   TestEnv env;
-  KernelAndDevice k(nullptr, false);
+  KernelAndDeviceOp k(nullptr, false, env.function_library_runtime(), nullptr,
+                      nullptr, env.cpu_device());
   tensorflow::testing::StartTiming();
   for (int i = 0; i < iters; ++i) {
-    TF_CHECK_OK(KernelAndDevice::Init(ndef, env.function_library_runtime(),
-                                      nullptr, &k));
+    TF_CHECK_OK(k.Init(ndef, nullptr));
   }
 }
 BENCHMARK(BM_KernelAndDeviceInit);
@@ -118,9 +128,9 @@ BENCHMARK(BM_KernelAndDeviceInit);
 void BM_KernelAndDeviceRun(int iters) {
   tensorflow::testing::StopTiming();
   Tensor t(Input({{1.0f, 2.0f}, {3.0f, 4.0f}}).tensor());
-  std::vector<Tensor> inputs;
-  inputs.push_back(t);
-  inputs.push_back(t);
+  gtl::InlinedVector<TensorValue, 4> inputs;
+  inputs.push_back(TensorValue(&t));
+  inputs.push_back(TensorValue(&t));
   std::vector<Tensor> outputs;
   NodeDef ndef(AttrBuilder("MatMul")
                    .Set("T", DT_FLOAT)
@@ -129,12 +139,12 @@ void BM_KernelAndDeviceRun(int iters) {
                    .NumInputs(inputs.size())
                    .BuildNodeDef());
   TestEnv env;
-  KernelAndDevice kernel(nullptr, false);
-  TF_CHECK_OK(KernelAndDevice::Init(ndef, env.function_library_runtime(),
-                                    nullptr, &kernel));
+  KernelAndDeviceOp k(nullptr, false, env.function_library_runtime(), nullptr,
+                      nullptr, env.cpu_device());
+  TF_CHECK_OK(k.Init(ndef, nullptr));
   tensorflow::testing::StartTiming();
   for (int i = 0; i < iters; ++i) {
-    TF_CHECK_OK(kernel.Run(&inputs, &outputs, nullptr, nullptr, nullptr));
+    TF_CHECK_OK(k.Run(inputs, &outputs, nullptr, nullptr, nullptr));
   }
 }
 BENCHMARK(BM_KernelAndDeviceRun);
diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.cc b/tensorflow/core/common_runtime/eager/tensor_handle.cc
index 0acd1609361453a0901e346f3b9d76e6e3a7b872..e44a97b2655fee02b77c965dcc8d3aa04dbcd091 100644
--- a/tensorflow/core/common_runtime/eager/tensor_handle.cc
+++ b/tensorflow/core/common_runtime/eager/tensor_handle.cc
@@ -22,16 +22,17 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "absl/strings/substitute.h"
 #include "tensorflow/core/common_runtime/copy_tensor.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/eager/context.h"
 #include "tensorflow/core/common_runtime/eager/eager_executor.h"
-#include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
 #include "tensorflow/core/framework/rendezvous.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
@@ -44,6 +45,74 @@ limitations under the License.
 
 namespace tensorflow {
 
+TensorHandle::TensorHandle(const class Tensor& t, Device* d, Device* op_device,
+                           EagerContext* ctx)
+    : dtype(t.dtype()),
+      node_id_(0),
+      tensor_(t),
+      device_(d),
+      op_device_(op_device),
+      resource_device_(GetResourceDevice(t, ctx)),
+      remote_op_id_(-1),
+      remote_output_num_(-1),
+      remote_shape_node_id_(-1),
+      ctx_(ctx),
+      is_ready_(true) {}
+
+TensorHandle::TensorHandle(uint64 node_id, Device* d, Device* op_device,
+                           Device* resource_device, DataType dtype,
+                           EagerContext* ctx)
+    : dtype(dtype),
+      node_id_(node_id),
+      tensor_(dtype),
+      device_(d),
+      op_device_(op_device),
+      resource_device_(resource_device),
+      remote_op_id_(-1),
+      remote_output_num_(-1),
+      remote_shape_node_id_(-1),
+      ctx_(ctx),
+      is_ready_(ctx == nullptr) {
+  DCHECK_GT(node_id_, 0);
+  DCHECK(dtype == DT_RESOURCE ? resource_device_ != nullptr
+                              : resource_device_ == nullptr);
+}
+
+TensorHandle::TensorHandle(int64 op_id, int32 output_num,
+                           uint64 remote_shape_node_id, DataType dtype,
+                           std::function<void()> call_on_destroy, Device* d,
+                           Device* op_device, Device* resource_device,
+                           EagerContext* ctx)
+    : dtype(dtype),
+      node_id_(0),
+      device_(d),
+      op_device_(op_device),
+      resource_device_(resource_device),
+      remote_op_id_(op_id),
+      remote_output_num_(output_num),
+      remote_shape_node_id_(remote_shape_node_id),
+      call_on_destroy_(std::move(call_on_destroy)),
+      ctx_(ctx),
+      is_ready_(true) {
+  DCHECK(IsRemote()) << "Op ID and output num should be >= 0. Op ID: " << op_id
+                     << ", Output num: " << output_num;
+  DCHECK(dtype == DT_RESOURCE ? resource_device_ != nullptr
+                              : resource_device_ == nullptr);
+}
+
+TensorHandle::TensorHandle(OutputGraphNode symbolic_tensor, DataType dtype)
+    : dtype(dtype),
+      node_id_(0),
+      device_(nullptr),
+      op_device_(nullptr),
+      resource_device_(nullptr),
+      remote_op_id_(-1),
+      remote_output_num_(-1),
+      remote_shape_node_id_(-1),
+      ctx_(nullptr),
+      is_ready_(true),
+      symbolic_tensor(new OutputGraphNode(symbolic_tensor)) {}
+
 bool TensorHandle::IsReady() {
   if (node_id_ == 0) return true;
   mutex_lock l(ctx_mutex_);
@@ -79,6 +148,13 @@ Status TensorHandle::Tensor(const tensorflow::Tensor** t) {
   return Status::OK();
 }
 
+Status TensorHandle::TensorValue(tensorflow::TensorValue* t) {
+  TF_RETURN_IF_ERROR(WaitReady());
+  DCHECK(IsReady());
+  *t = tensorflow::TensorValue(&tensor_);
+  return Status::OK();
+}
+
 Status TensorHandle::TensorAndDevice(const tensorflow::Tensor** tensor,
                                      tensorflow::Device** device,
                                      tensorflow::Device** op_device) {
@@ -239,4 +315,31 @@ Status TensorHandle::CopyToDevice(EagerContext* ctx, tensorflow::Device* dstd,
   return status;
 }
 
+Device* GetResourceDevice(const Tensor& t, EagerContext* ctx) {
+  if (t.dtype() != DT_RESOURCE) {
+    return nullptr;
+  }
+  const ResourceHandle& resource_handle = t.flat<ResourceHandle>()(0);
+  const auto& map = *ctx->device_map();
+  auto it = map.find(resource_handle.device());
+  DCHECK(it != map.end());
+  return it->second;
+}
+
+string TensorHandle::DebugString() const {
+  VLOG(1) << "Calling TensorHandle::DebugString() on " << this;
+
+  if (symbolic_tensor) {
+    return absl::Substitute("TF_Output($0, $1)", symbolic_tensor->oper,
+                            symbolic_tensor->index);
+  }
+
+  string out;
+  strings::StrAppend(&out, "Device: ", device_ ? device_->DebugString() : "[]");
+  // Consider supporting non-CPU tensors (when device_ is non-NULL) if needed.
+  strings::StrAppend(&out, ", Tensor: ", device_ ? "?" : tensor_.DebugString(),
+                     "\n");
+  return out;
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.h b/tensorflow/core/common_runtime/eager/tensor_handle.h
index 0fdd31ab5fcfe99c92074fc69d831d17f46d607e..ac99fdbe294a7f50606653694a593ed16c363f7b 100644
--- a/tensorflow/core/common_runtime/eager/tensor_handle.h
+++ b/tensorflow/core/common_runtime/eager/tensor_handle.h
@@ -27,7 +27,6 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/eager/context.h"
 #include "tensorflow/core/common_runtime/eager/eager_executor.h"
-#include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
 #include "tensorflow/core/framework/rendezvous.h"
@@ -42,59 +41,37 @@ limitations under the License.
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/public/version.h"
 
+struct TF_Operation;
+
 namespace tensorflow {
 
+// This struct is isomorphic to TF_Output, but we cannot use the latter here due
+// to layering concerns (TF_Output is defined at the C API layer).
+struct OutputGraphNode {
+  TF_Operation* oper;
+  int index;  // The index of the output within oper.
+};
+
 // Associates a Tensor and a Device, used in the eager runtime. Internal version
 // of the TFE_TensorHandle struct and the python EagerTensor class
 // (unrelated to python TensorHandle).
 class TensorHandle : public core::RefCounted {
  public:
-  TensorHandle(const Tensor& t, Device* d, Device* op_device, EagerContext* ctx)
-      : dtype(t.dtype()),
-        node_id_(0),
-        tensor_(t),
-        device_(d),
-        op_device_(op_device),
-        remote_op_id_(-1),
-        remote_output_num_(-1),
-        remote_shape_node_id_(-1),
-        ctx_(ctx),
-        is_ready_(true) {}
-
-  TensorHandle(uint64 node_id, Device* d, Device* op_device, DataType dtype,
-               EagerContext* ctx)
-      : dtype(dtype),
-        node_id_(node_id),
-        tensor_(dtype),
-        device_(d),
-        op_device_(op_device),
-        remote_op_id_(-1),
-        remote_output_num_(-1),
-        remote_shape_node_id_(-1),
-        ctx_(ctx),
-        is_ready_(ctx == nullptr) {
-    DCHECK_GT(node_id_, 0);
-  }
+  TensorHandle(const Tensor& t, Device* d, Device* op_device,
+               EagerContext* ctx);
+  TensorHandle(uint64 node_id, Device* d, Device* op_device,
+               Device* resource_device, DataType dtype, EagerContext* ctx);
 
   // Remote tensor handle constructor.
   TensorHandle(int64 op_id, int32 output_num, uint64 remote_shape_node_id,
                DataType dtype, std::function<void()> call_on_destroy, Device* d,
-               Device* op_device, EagerContext* ctx)
-      : dtype(dtype),
-        node_id_(0),
-        device_(d),
-        op_device_(op_device),
-        remote_op_id_(op_id),
-        remote_output_num_(output_num),
-        remote_shape_node_id_(remote_shape_node_id),
-        call_on_destroy_(std::move(call_on_destroy)),
-        ctx_(ctx),
-        is_ready_(true) {
-    DCHECK(IsRemote()) << "Op ID and output num should be >= 0. Op ID: "
-                       << op_id << ", Output num: " << output_num;
-  }
+               Device* op_device, Device* resource_device, EagerContext* ctx);
+
+  // Symbolic tensor constructor.
+  TensorHandle(OutputGraphNode symbolic_tensor, DataType dtype);
 
   ~TensorHandle() override {
+    VLOG(1) << "Deleting internal TensorHandle " << this;
     if (call_on_destroy_) {
       call_on_destroy_();
     }
@@ -102,9 +79,11 @@ class TensorHandle : public core::RefCounted {
 
   Status Tensor(const tensorflow::Tensor** t);
 
-  tensorflow::Device* device() const { return device_; }
+  Status TensorValue(tensorflow::TensorValue* t);
 
+  tensorflow::Device* device() const { return device_; }
   tensorflow::Device* op_device() const { return op_device_; }
+  tensorflow::Device* resource_device() const { return resource_device_; }
 
   Status TensorAndDevice(const tensorflow::Tensor** tensor,
                          tensorflow::Device** device,
@@ -146,17 +125,21 @@ class TensorHandle : public core::RefCounted {
            (ctx_ == nullptr || ctx_->HostCPU() == device_);
   }
 
+  bool IsRemote();
+
+  OutputGraphNode* getSymbolicTensor() const { return symbolic_tensor.get(); }
+
+  string DebugString() const;
+
  private:
   // If the contents of the Tensor pointed to by this handle is yet to be
-  // computed by a EagerNode, this function will block till that compuatation is
+  // computed by a EagerNode, this function will block till that computation is
   // done and the handle is "ready".
   Status WaitReady();
   Status WaitForNode(uint64 node_id, bool return_if_is_ready);
 
   bool IsReady();
 
-  bool IsRemote();
-
   // Id for the EagerNode that will compute the value pointed to by this handle.
   // If the value is 0, the handle is already ready, but not vice-versa.
   const uint64 node_id_;
@@ -177,6 +160,10 @@ class TensorHandle : public core::RefCounted {
   // device_ for constant tensors.
   tensorflow::Device* const op_device_;
 
+  // If the tensor dtype is DT_RESOURCE, resource_device_ holds the device
+  // backing the resource. Else resource_device_ is nullptr.
+  tensorflow::Device* const resource_device_;
+
   // IDs required when this class is representing a remote tensor handle.
   const int64 remote_op_id_;
   const int32 remote_output_num_;
@@ -195,8 +182,17 @@ class TensorHandle : public core::RefCounted {
   // `ctx` object is not owned and should outlive this handle.
   EagerContext* ctx_ GUARDED_BY(ctx_mutex_);
   bool is_ready_ GUARDED_BY(ctx_mutex_);
+
+  // When non-NULL, this tensor handle instance represents a symbolic tensor
+  // (corresponding to a graph node), whose concrete value is to be produced by
+  // executing that graph node.
+  std::unique_ptr<OutputGraphNode> symbolic_tensor;
 };
 
+// If tensor's dtype is DT_RESOURCE, returns the device backing the resource.
+// Else, returns nullptr.
+Device* GetResourceDevice(const Tensor& t, EagerContext* ctx);
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_TENSOR_HANDLE_H_
diff --git a/tensorflow/core/common_runtime/eval_const_tensor.cc b/tensorflow/core/common_runtime/eval_const_tensor.cc
index 87749da7afed9f67c469cbcd63e685c2c534a4bb..fb51e2dec3ac63f64cd70bececa5734bb5afc8a4 100644
--- a/tensorflow/core/common_runtime/eval_const_tensor.cc
+++ b/tensorflow/core/common_runtime/eval_const_tensor.cc
@@ -19,11 +19,11 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/graph_runner.h"
 #include "tensorflow/core/common_runtime/shape_refiner.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/graph.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index 6b3284b84a0d2741f315c3f91db35eebc68f9e98..e4022ad3af764c002f6784f7283bc8d8ac603489 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -46,6 +46,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/notification.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
@@ -55,9 +56,11 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/context.h"
+#include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/profile_utils/cpu_utils.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/platform/tracing.h"
 #include "tensorflow/core/platform/types.h"
@@ -131,6 +134,16 @@ struct EdgeInfo {
   int input_slot;
 };
 
+// Time the execution of kernels (in CPU cycles).  Used to dynamically identify
+// inexpensive kernels which can be dispatched inline.
+struct KernelTimer {
+  uint64 start_cycles = profile_utils::CpuUtils::GetCurrentClockCycle();
+
+  uint64 ElapsedCycles() {
+    return profile_utils::CpuUtils::GetCurrentClockCycle() - start_cycles;
+  }
+};
+
 struct NodeItem {
   NodeItem() {}
 
@@ -140,7 +153,6 @@ struct NodeItem {
   // The kernel for this node.
   OpKernel* kernel = nullptr;
 
-  bool kernel_is_expensive : 1;  // True iff kernel->IsExpensive()
   bool kernel_is_async : 1;      // True iff kernel->AsAsync() != nullptr
   bool is_merge : 1;             // True iff IsMerge(node)
   bool is_enter : 1;             // True iff IsEnter(node)
@@ -625,7 +637,6 @@ Status ExecutorImpl::Initialize() {
       return s;
     }
     CHECK(item->kernel);
-    item->kernel_is_expensive = item->kernel->IsExpensive();
     item->kernel_is_async = (item->kernel->AsAsync() != nullptr);
     item->is_merge = IsMerge(n);
     item->is_enter = IsEnter(n);
@@ -1235,6 +1246,7 @@ class ExecutorState {
   Rendezvous* rendezvous_;
   CollectiveExecutor* collective_executor_ = nullptr;
   SessionState* session_state_;
+  string session_handle_;
   TensorStore* tensor_store_;
   // Step-local container.
   ScopedStepContainer* step_container_;
@@ -1266,6 +1278,11 @@ class ExecutorState {
 
   std::atomic_int_fast32_t num_outstanding_ops_;
 
+  // Available via OpKernelContext to every OpKernel invocation.
+  mutex num_deferred_ops_mu_;
+  condition_variable num_deferred_ops_cv_;
+  int64 num_deferred_ops_ GUARDED_BY(num_deferred_ops_mu_) = 0;
+
   mutex mu_;
   Status status_ GUARDED_BY(mu_);
 
@@ -1343,6 +1360,9 @@ class ExecutorState {
 
   // Clean up when this executor is done.
   void Finish();
+  // Schedule Finish() on a separate thread if it needs to wait for deferred
+  // async ops to complete; otherwise run it on the current thread.
+  void ScheduleFinish();
 
   // A standalone routine for this expression so that we can express
   // that we don't want thread safety analysis on this reference (it's
@@ -1362,6 +1382,7 @@ ExecutorState::ExecutorState(const Executor::Args& args, ExecutorImpl* impl)
       rendezvous_(args.rendezvous),
       collective_executor_(args.collective_executor),
       session_state_(args.session_state),
+      session_handle_(args.session_handle),
       tensor_store_(args.tensor_store),
       step_container_(args.step_container),
       stats_collector_(args.stats_collector),
@@ -1580,7 +1601,8 @@ bool MightTrace(const NodeItem& item,
     if (using_annotations) {
       return trace_collector->IsEnabledForAnnotations();
     } else {
-      return trace_collector->IsEnabledForActivities(item.kernel_is_expensive);
+      return trace_collector->IsEnabledForActivities(
+          item.kernel->IsExpensive());
     }
   }
   return false;
@@ -1606,6 +1628,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
   params.rendezvous = rendezvous_;
   params.collective_executor = collective_executor_;
   params.session_state = session_state_;
+  params.session_handle = session_handle_;
   params.tensor_store = tensor_store_;
   params.cancellation_manager = cancellation_manager_;
   params.call_frame = call_frame_;
@@ -1618,6 +1641,15 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
   params.input_alloc_attrs = &input_alloc_attrs;
   params.runner = &runner_;
   params.stats_collector = stats_collector_;
+  params.inc_num_deferred_ops_function = [this]() {
+    mutex_lock lock(num_deferred_ops_mu_);
+    num_deferred_ops_++;
+  };
+  params.dec_num_deferred_ops_function = [this]() {
+    mutex_lock lock(num_deferred_ops_mu_);
+    num_deferred_ops_--;
+    num_deferred_ops_cv_.notify_all();
+  };
 
   Status s;
   NodeExecStatsInterface* stats = nullptr;
@@ -1751,7 +1783,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
           const bool completed =
               NodeDone(s, state->item->node, ready, stats, nullptr);
           delete state;
-          if (completed) Finish();
+          if (completed) ScheduleFinish();
         };
         nodestats::SetOpStart(stats);
         device->ComputeAsync(async, &state->ctx, done);
@@ -1780,12 +1812,18 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
                 op_name,
                 strings::StrCat(op_kernel->type_string(), "#id=", step_id_,
                                 "#"),
-                item.kernel_is_expensive);
+                item.kernel->IsExpensive());
             device->Compute(op_kernel, &ctx);
           }
         } else {
           // In the common case, avoid creating any tracing objects.
-          device->Compute(op_kernel, &ctx);
+          if (op_kernel->IsExpensive()) {
+            KernelTimer timer;
+            device->Compute(op_kernel, &ctx);
+            op_kernel->UpdateCostEstimate(timer.ElapsedCycles());
+          } else {
+            device->Compute(op_kernel, &ctx);
+          }
         }
 
         nodestats::SetOpEnd(stats);
@@ -1832,7 +1870,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
   }  // while !inline_ready.empty()
 
   // This thread of computation is done if completed = true.
-  if (completed) Finish();
+  if (completed) ScheduleFinish();
 }
 
 Status ExecutorState::PrepareInputs(const NodeItem& item, Entry* first_input,
@@ -1883,7 +1921,7 @@ Status ExecutorState::PrepareInputs(const NodeItem& item, Entry* first_input,
       inp->tensor = entry->val.get();
     } else {
       {
-        mutex_lock ml(*entry->ref_mu);
+        tf_shared_lock ml(*entry->ref_mu);
         if (!entry->ref->IsInitialized() && !IsInitializationOp(item.node)) {
           return AttachDef(errors::FailedPrecondition(
                                "Attempting to use uninitialized value ",
@@ -1899,7 +1937,7 @@ Status ExecutorState::PrepareInputs(const NodeItem& item, Entry* first_input,
         // tensor but is given a ref to a tensor.  Need to deref it
         // under the mutex.
         {
-          mutex_lock l(*(entry->ref_mu));
+          tf_shared_lock l(*(entry->ref_mu));
           DCHECK(!entry->val_field_is_set);
           entry->val.Init(*entry->ref);
           entry->val_field_is_set = true;
@@ -1988,7 +2026,7 @@ Status ExecutorState::ProcessOutputs(const NodeItem& item, OpKernelContext* ctx,
       // Sanity check of output tensor types.
       DataType dtype;
       if (val.is_ref()) {
-        mutex_lock ml(*val.mutex_if_ref);
+        tf_shared_lock ml(*val.mutex_if_ref);
         dtype = MakeRefType(val->dtype());
       } else {
         dtype = val->dtype();
@@ -2005,7 +2043,7 @@ Status ExecutorState::ProcessOutputs(const NodeItem& item, OpKernelContext* ctx,
             Tensor to_log;
             {
               // Dereference the tensor under the lock.
-              mutex_lock l(*out->ref_mu);
+              tf_shared_lock l(*out->ref_mu);
               to_log = *out->ref;
             }
             LogMemory::RecordTensorOutput(ctx->op_kernel().name(),
@@ -2218,6 +2256,7 @@ void ExecutorState::ScheduleReady(const TaggedNodeSeq& ready,
   if (stats_collector_) {
     scheduled_nsec = nodestats::NowInNsec();
   }
+
   if (inline_ready == nullptr) {
     // Schedule to run all the ready ops in thread pool.
     for (auto& tagged_node : ready) {
@@ -2225,11 +2264,12 @@ void ExecutorState::ScheduleReady(const TaggedNodeSeq& ready,
     }
     return;
   }
+
   const GraphView& gview = impl_->gview_;
   const TaggedNode* curr_expensive_node = nullptr;
   for (auto& tagged_node : ready) {
     const NodeItem& item = *gview.node(tagged_node.node->id());
-    if (tagged_node.is_dead || !item.kernel_is_expensive) {
+    if (tagged_node.is_dead || !item.kernel->IsExpensive()) {
       // Inline this inexpensive node.
       inline_ready->push_back(tagged_node);
     } else {
@@ -2386,6 +2426,25 @@ void ExecutorState::DumpState() {
   }
 }
 
+void ExecutorState::ScheduleFinish() {
+  int num_deferred_ops;
+  {
+    mutex_lock lock(num_deferred_ops_mu_);
+    num_deferred_ops = num_deferred_ops_;
+  }
+  if (num_deferred_ops > 0) {
+    // Finish() may be blocked waiting for deferred async ops to complete. The
+    // execution of deferred async ops may be waiting for non-enqueued ops of
+    // other executors to complete. So running Finish() on the current thread
+    // (inter-op threadpool thread) may lead to a deadlock due to threadpool
+    // exhaustion. Instead, we run it on a separate thread to unblock the
+    // threadpool thread.
+    Env::Default()->SchedClosure([this]() { Finish(); });
+  } else {
+    Finish();
+  }
+}
+
 void ExecutorState::Finish() {
   mu_.lock();
   auto status = status_;
@@ -2395,7 +2454,59 @@ void ExecutorState::Finish() {
   CHECK(done_cb != nullptr);
   Device* device = impl_->params_.device;
 
-  if ((sync_on_finish_ && status.ok()) || device->RequiresSyncOnCompletion()) {
+  // There are several potential race conditions below. To name a few:
+  // 1. Even if the device's status is OK at the precise moment when
+  // num_deferred_ops_ reaches 0, it could go bad before device->RefreshStatus()
+  // is called below, caused by work enqueued onto the same device by other
+  // concurrent ExecutorState objects.
+  // 2. Some implementations of Device::RefreshStatus, such as
+  // XlaDevice::RefreshStatus, may be inherently racy because it releases the
+  // device mutex after a stream pointer is acquired and before the stream is
+  // queried for status.
+  // 3. It's the same for some implementations of Device::Sync, such as
+  // XlaDevice::Sync.
+  //
+  // However, these race conditions are acceptable because a stream (and
+  // therefore an XlaDevice) can only go from OK to not-OK, never the opposite,
+  // which means we will at worst report errors when there isn't any, never the
+  // opposite.
+
+  // If inc_num_deferred_ops_function has ever been called, ExecutorState must
+  // wait for all corresponding dec_num_deferred_ops_function calls to happen
+  // regardless of status. This ensures that dec_num_deferred_ops_function can
+  // safely use ExecutorState's resources.
+  {
+    mutex_lock lock(num_deferred_ops_mu_);
+    while (num_deferred_ops_ > 0) {
+      num_deferred_ops_cv_.wait(lock);
+    }
+  }
+
+  // An early exit for devices don't allow sync on completion. Ops that run on
+  // these devices should have used num_deferred_ops correctly to ensure the
+  // device has finished all relevant work at this point.
+  if (!device->AllowsSyncOnCompletion()) {
+    status.Update(device->RefreshStatus());
+    if (!status.ok()) {
+      // In device async execution mode, it's possible for device execution to
+      // lag behind ExecutorState scheduling so much that this is the first
+      // place a device execution error surfaces.
+      // If so, all ExecutorState::NodeDone calls have already happened with OK
+      // status. This is the last defense where StartCancel must be called to
+      // abort all computation still running on any device.
+      // TODO(b/124523000): Always call Finish in a separate thread, so even if
+      // StartCancel blocks the current thread's execution, we won't encounter
+      // deadlocks caused by inter-op thread exhaustion.
+      if (cancellation_manager_) {
+        cancellation_manager_->StartCancel();
+      }
+    }
+    delete this;
+    runner([=]() { done_cb(status); });
+    return;
+  }
+
+  if (sync_on_finish_ && status.ok()) {
     // Block until the device has finished all queued operations. For
     // devices like GPUs that continue to execute Ops after their Compute
     // methods have completed, this ensures that control is not returned to
diff --git a/tensorflow/core/common_runtime/executor.h b/tensorflow/core/common_runtime/executor.h
index 34bf73972f57306eb9cfda08d8277f0bedfcafa9..4be60c67713bc801a8249201d65a5dbc26646138 100644
--- a/tensorflow/core/common_runtime/executor.h
+++ b/tensorflow/core/common_runtime/executor.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/session_state.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/notification.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
@@ -87,6 +88,8 @@ class Executor {
     CallFrameInterface* call_frame = nullptr;
     CancellationManager* cancellation_manager = nullptr;
     SessionState* session_state = nullptr;
+    // Unique session identifier. Can be empty.
+    string session_handle;
     TensorStore* tensor_store = nullptr;
     ScopedStepContainer* step_container = nullptr;
     CollectiveExecutor* collective_executor = nullptr;
@@ -171,41 +174,40 @@ class ExecutorBarrier {
 
   mutable mutex mu_;
   int pending_ GUARDED_BY(mu_) = 0;
-  Status status_ GUARDED_BY(mu_);
+  StatusGroup status_group_ GUARDED_BY(mu_);
 
   void WhenDone(const Status& s) {
-    bool error = false;
     Rendezvous* error_rendez = nullptr;
     StatusCallback done = nullptr;
     Status status;
+
     {
       mutex_lock l(mu_);
-      // If we are the first error encountered, mark the status
-      // appropriately and later trigger an abort of the Rendezvous
-      // object by this thread only.
-      if (status_.ok() && !s.ok()) {
-        error = true;
+
+      // If we are the first error encountered, trigger an abort of the
+      // Rendezvous object by this thread only.
+      if (status_group_.ok() && !s.ok()) {
         error_rendez = rendez_;
         error_rendez->Ref();
-        status_ = s;
       }
 
+      status_group_.Update(s);
+
       // If this is the last call to WhenDone, call the final callback
       // below.
       if (--pending_ == 0) {
         CHECK(done_cb_ != nullptr);
         std::swap(done, done_cb_);
-      }
-
-      if (!status_.ok()) {
-        status = status_;
+        status = status_group_.as_status();
       }
     }
 
-    if (error) {
-      error_rendez->StartAbort(status);
+    if (error_rendez != nullptr) {
+      error_rendez->StartAbort(
+          errors::Aborted("Stopping remaining executors."));
       error_rendez->Unref();
     }
+
     if (done != nullptr) {
       delete this;
       done(status);
diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc
index 7eb622dc117f40a68079e6cea1a829227acfed7a..99841588dcc685f4da12c15ccabaedc06e351746 100644
--- a/tensorflow/core/common_runtime/function.cc
+++ b/tensorflow/core/common_runtime/function.cc
@@ -104,6 +104,10 @@ static Node* AddIdentity(Graph* g, Endpoint input) {
   NodeDef ndef;
   ndef.set_name(g->NewName(kNodeLabel));
   ndef.set_op("Identity");
+  // NOTE(skyewm): we explicitly set the device here to address a multi-GPU
+  // performance issue where this Identity would be placed alone on a GPU,
+  // causing unnecessary device traffic. See b/122483225 for details.
+  ndef.set_device(input.node->def().device());
   ndef.add_input(input.name());
   AddNodeAttr("T", BaseType(input.dtype()), &ndef);
   Status s;
@@ -453,7 +457,9 @@ class CallOp : public AsyncOpKernel {
   CallOp(FunctionLibraryRuntime::Handle handle, OpKernelConstruction* ctx)
       : AsyncOpKernel(ctx), handle_(handle) {}
 
-  ~CallOp() override {}
+  ~CallOp() override {
+    // TODO(iga): Release the cached handle_
+  }
 
   void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
     FunctionLibraryRuntime* lib = ctx->function_library();
@@ -628,11 +634,20 @@ bool FunctionLibraryRuntimeImpl::IsLocalTarget(
     const InstantiateOptions& options) {
   if (device_ == nullptr) return true;
   if (options.target.empty()) return true;
+  if (options.is_multi_device_function) return false;
   Device* target_device;
   if (!device_mgr_->LookupDevice(options.target, &target_device).ok()) {
+    VLOG(1) << "Not instantiating function in FLR because failed to "
+            << "find device " << options.target << " in device manager";
+    return false;
+  }
+  if (target_device != device_) {
+    VLOG(1) << "Not instantiating function in FLR because target device "
+            << options.target
+            << " is different from FLR's device: " << device_->DebugString();
     return false;
   }
-  return target_device == device_;
+  return true;
 }
 
 Status FunctionLibraryRuntimeImpl::Instantiate(
@@ -732,15 +747,32 @@ Status FunctionLibraryRuntimeImpl::ReleaseHandle(Handle handle) {
   if (h == kInvalidLocalHandle) {
     return parent_->ReleaseHandle(handle);
   }
-  mutex_lock l(mu_);
-  CHECK_EQ(1, items_.count(h));
-  std::unique_ptr<Item>& item = items_[h];
-  --item->instantiation_counter;
-  if (item->instantiation_counter == 0) {
-    items_.erase(h);
-    TF_RETURN_IF_ERROR(parent_->RemoveHandle(handle));
+
+  std::unique_ptr<Item> item_to_delete;
+  Status parent_status;
+  {
+    mutex_lock l(mu_);
+    auto it = items_.find(h);
+    if (it == items_.end()) {
+      return errors::Internal(
+          "Inconsistent FunctionLibraryRuntime. Expected to find an item for "
+          "handle ",
+          h, " but found none");
+    }
+    std::unique_ptr<Item>& item = it->second;
+    --item->instantiation_counter;
+    if (item->instantiation_counter == 0) {
+      // We don't simply erase h's item because that would trigger
+      // item destruction while holding mu_. Item destruction can
+      // trigger graph destruction. If the graph contains kernels like
+      // CallOp or PartitionCallOp, their destructors will release cached
+      // function handles, resulting in deadlock here.
+      item_to_delete = std::move(item);
+      items_.erase(h);
+      parent_status = parent_->RemoveHandle(handle);
+    }
   }
-  return Status::OK();
+  return parent_status;
 }
 
 void DumpGraph(StringPiece label, const Graph* g) {
@@ -754,23 +786,41 @@ void DumpGraph(StringPiece label, const Graph* g) {
   }
 }
 
-void OptimizeGraph(FunctionLibraryRuntime* lib, std::unique_ptr<Graph>* g) {
+void OptimizeGraph(FunctionLibraryRuntime* lib, std::unique_ptr<Graph>* g,
+                   const GraphOptimizer::Options& graph_optimizer_options) {
   OptimizerOptions opts;
   opts.set_do_common_subexpression_elimination(true);
   opts.set_do_function_inlining(true);
   opts.set_do_constant_folding(true);
   GraphOptimizer optimizer(opts);
-  optimizer.Optimize(lib, lib->env(), lib->device(), g, /*shape_map=*/nullptr);
+  optimizer.Optimize(lib, lib->env(), lib->device(), g,
+                     graph_optimizer_options);
+}
+
+void OptimizeGraph(FunctionLibraryRuntime* lib, std::unique_ptr<Graph>* g) {
+  OptimizeGraph(lib, g, GraphOptimizer::Options());
 }
 
 namespace {
 // Removes all stateless nodes that do not contribute to a return
-// value from the function body.  Unlike `RemoveDeadNodes()`, which is
+// value from the function body. Unlike `RemoveDeadNodes()`, which is
 // triggered by `OptimizerOptions.do_function_inlining`, this pass
 // ignores the SINK node, from which (by definition) all nodes are
-// reverse reachable.
-void PruneFunctionBody(Graph* g) {
-  VLOG(2) << "Pruning function body";
+// reverse reachable, and preserves all nodes that are reachable from
+// control output nodes.
+//
+// TODO(ezhulenev, skyewm): Function body should not have special treatment of
+// stateful ops, graph should encode nodes that must execute with `control_ret`
+// and `control_output`.
+void PruneFunctionBody(const FunctionDef& fdef, Graph* g) {
+  VLOG(2) << "Pruning function body: function_name=" << fdef.signature().name();
+
+  // `control_ret` nodes must be always executed.
+  std::unordered_set<StringPiece, StringPieceHasher> control_ret_nodes;
+  for (const auto& control_ret : fdef.control_ret()) {
+    control_ret_nodes.insert(control_ret.second);
+  }
+
   std::unordered_set<const Node*> nodes;
   for (auto n : g->nodes()) {
     // NOTE(mrry): "_Retval" nodes are stateful, and so will be added
@@ -781,7 +831,8 @@ void PruneFunctionBody(Graph* g) {
     // still needed. It would be preferable to prune entire loops and/or
     // conditionals if they are not used in the graph.
     if (n->IsControlFlow() ||
-        (n->op_def().is_stateful() && n->type_string() != kArgOp)) {
+        (n->op_def().is_stateful() && n->type_string() != kArgOp) ||
+        (control_ret_nodes.find(n->name()) != control_ret_nodes.end())) {
       nodes.insert(n);
     }
   }
@@ -808,7 +859,7 @@ Status FunctionLibraryRuntimeImpl::CreateItem(Item** item) {
   std::unique_ptr<Graph> g(new Graph(lib_def));
   CopyGraph(*fbody->graph, g.get());
 
-  PruneFunctionBody(g.get());
+  PruneFunctionBody(fbody->fdef, g.get());
   optimizer_.Optimize(this, env(), device(), &g, /*shape_map=*/nullptr);
   TF_RETURN_IF_ERROR(EnsureMemoryTypes(DeviceType(device()->device_type()),
                                        device()->name(), g.get()));
@@ -1354,6 +1405,10 @@ static bool ValidateInlining(const Node* node, const FunctionBody* fbody) {
   if (static_cast<size_t>(node->num_outputs()) != fbody->ret_nodes.size()) {
     return false;
   }
+  // TODO(ezhulenev): Currently common_runtime function inlining can't guarantee
+  // that all side-effectful ops will be executed after inlining. See Grappler
+  // function_optimizer for details. Unify all function inlining mechanism.
+  // Do not inline if `!fbody->control_ret_nodes.empty()`.
   for (int i = 0; i < node->num_inputs(); ++i) {
     if (node->input_type(i) != fbody->arg_types[i]) return false;
   }
@@ -1386,6 +1441,7 @@ void InlineFunctionBody(const FunctionLibraryDefinition& flib_def, Graph* g,
     if (e->IsControlEdge()) {
       if (input_control_node == nullptr) {
         input_control_node = AddNoOp(g);
+        input_control_node->set_requested_device(caller->def().device());
       }
       g->AddControlEdge(e->src(), input_control_node);
     } else {
@@ -1407,6 +1463,12 @@ void InlineFunctionBody(const FunctionLibraryDefinition& flib_def, Graph* g,
     if (override_device || ndef.device().empty()) {
       ndef.set_device(caller->def().device());
     }
+    for (auto& attr : *ndef.mutable_attr()) {
+      if (attr.first == "_class") {
+        attr.second.set_s(
+            strings::StrCat(caller->name(), "/", attr.second.s()));
+      }
+    }
     Node* clone = g->AddNode(ndef, &s);
     TF_CHECK_OK(s);
     node_map[n->id()] = clone;
@@ -1586,6 +1648,13 @@ void ToGraphDef(const Graph* g, GraphDef* gdef, bool pretty) {
     for (const auto& attr : n->attrs()) {
       (*ndef->mutable_attr())[attr.first] = attr.second;
     }
+
+    if (!n->assigned_device_name().empty()) {
+      ndef->set_device(n->assigned_device_name());
+    } else {
+      ndef->set_device(n->requested_device());
+    }
+
     inputs.clear();
     inputs.resize(n->num_inputs());
     for (const Edge* e : n->in_edges()) {
@@ -1633,6 +1702,7 @@ FunctionBody::FunctionBody(const FunctionDef& f, DataTypeSlice arg_t,
       graph(g),
       arg_types(arg_t.begin(), arg_t.end()),
       ret_types(ret_t.begin(), ret_t.end()) {
+  // 1. Find regular Arg/Ret nodes.
   this->arg_nodes.resize(arg_types.size());
   this->ret_nodes.resize(ret_types.size());
   for (Node* n : this->graph->op_nodes()) {
@@ -1650,6 +1720,17 @@ FunctionBody::FunctionBody(const FunctionDef& f, DataTypeSlice arg_t,
     CHECK_LT(index, node_vec->size());
     (*node_vec)[index] = n;
   }
+  // 2. Find ControlRet nodes that must be always executed.
+  std::unordered_set<StringPiece, StringPieceHasher> control_ret_node_names;
+  for (const auto& control_ret : fdef.control_ret()) {
+    control_ret_node_names.insert(control_ret.second);
+  }
+  this->control_ret_nodes.reserve(control_ret_node_names.size());
+  for (Node* n : this->graph->op_nodes()) {
+    if (control_ret_node_names.count(n->name()) > 0) {
+      this->control_ret_nodes.push_back(n);
+    }
+  }
 }
 
 FunctionBody::~FunctionBody() { delete this->graph; }
diff --git a/tensorflow/core/common_runtime/function.h b/tensorflow/core/common_runtime/function.h
index eeca66f5d0bdef6b036b77b170ccd07945be28b7..37df90f5d98e6a2bc3a7aea7357a352dc51b3084 100644
--- a/tensorflow/core/common_runtime/function.h
+++ b/tensorflow/core/common_runtime/function.h
@@ -21,6 +21,7 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/graph_optimizer.h"
 #include "tensorflow/core/common_runtime/process_function_library_runtime.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/graph/graph.h"
@@ -78,6 +79,7 @@ struct FunctionBody {
   DataTypeVector ret_types;
   gtl::InlinedVector<Node*, 4> arg_nodes;
   gtl::InlinedVector<Node*, 4> ret_nodes;
+  gtl::InlinedVector<Node*, 4> control_ret_nodes;
 
   FunctionBody() {}
   FunctionBody(const FunctionDef& f, DataTypeSlice arg_types,
@@ -133,6 +135,8 @@ void DumpGraph(StringPiece label, const Graph* g);
 // OptimizeGraph mutates **g extensively and replaces '*g' with a
 // complete copy. Therefore, the caller should not keep any references
 // to nodes *g.
+void OptimizeGraph(FunctionLibraryRuntime* lib, std::unique_ptr<Graph>* g,
+                   const GraphOptimizer::Options& graph_optimizer_options);
 void OptimizeGraph(FunctionLibraryRuntime* lib, std::unique_ptr<Graph>* g);
 
 // Convert the Graph of a function to a GraphDef.
@@ -157,6 +161,8 @@ FunctionBody* SymbolicGradient(const FunctionBody& f);
 // to "fbody". Replaces the "caller" with fbody->graph and connects
 // edges properly. "override_device" specifies whether inlining should replace
 // explicitly specified devices inside fbody with the callee's device.
+//
+// TODO(ezhulenev): Return Status::error if function inlining failed.
 void InlineFunctionBody(const FunctionLibraryDefinition& flib_def, Graph* g,
                         Node* caller, const FunctionBody* fbody,
                         bool override_device = true);
diff --git a/tensorflow/core/common_runtime/function_test.cc b/tensorflow/core/common_runtime/function_test.cc
index cab95cb596858f99285c3cfc5673f87b70368a32..83694d2c40709f7b76dad002cf028a105a8cd15c 100644
--- a/tensorflow/core/common_runtime/function_test.cc
+++ b/tensorflow/core/common_runtime/function_test.cc
@@ -246,9 +246,10 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
     if (!status.ok()) return status;
 
     Status status2 = Run(flr, handle, opts, args, std::move(rets));
-    EXPECT_TRUE(errors::IsInvalidArgument(status2));
-    EXPECT_TRUE(
-        str_util::StrContains(status2.error_message(), "remote execution."));
+    EXPECT_TRUE(errors::IsNotFound(status2))
+        << "Actual status: " << status2.ToString();
+    EXPECT_TRUE(str_util::StrContains(status2.error_message(), "Handle"));
+    EXPECT_TRUE(str_util::StrContains(status2.error_message(), "not found"));
 
     return status;
   }
@@ -316,9 +317,9 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
     if (!status.ok()) return status;
 
     Status status2 = Run(flr, handle, opts, args, std::move(rets));
-    EXPECT_TRUE(errors::IsInvalidArgument(status2));
-    EXPECT_TRUE(
-        str_util::StrContains(status2.error_message(), "remote execution."));
+    EXPECT_TRUE(errors::IsNotFound(status2));
+    EXPECT_TRUE(str_util::StrContains(status2.error_message(), "Handle"));
+    EXPECT_TRUE(str_util::StrContains(status2.error_message(), "not found"));
 
     return status;
   }
@@ -944,6 +945,48 @@ TEST_F(FunctionLibraryRuntimeTest, PruneBody) {
   EXPECT_EQ(expected_node_names, executed_node_names);
 }
 
+TEST_F(FunctionLibraryRuntimeTest, DoNotPruneControlOutputsFromBody) {
+  // `add` node is not required to compute regular output `o`, but it must
+  // execute because it is in `control_ret`.
+  const FunctionDef func =
+      FDH::Create("FunctionWithControlOutputs", {"i: float"}, {"o: float"}, {},
+                  {
+                      {{"add"}, "Add", {"i", "i"}, {{"T", DT_FLOAT}}},
+                      {{"ret"}, "Mul", {"i", "i"}, {{"T", DT_FLOAT}}},
+                  },
+                  /*ret_def=*/{{"o", "ret:z:0"}},
+                  /*control_ret_def=*/{{"must_execute", "add"}});
+
+  Init({func});
+
+  auto x = test::AsTensor<float>({1.25});
+  Tensor z;
+
+  FunctionLibraryRuntime::Handle handle;
+  TF_CHECK_OK(Instantiate(flr1_, "FunctionWithControlOutputs", {}, &handle));
+
+  StepStats stats;
+  StepStatsCollector stats_collector(&stats);
+  FunctionLibraryRuntime::Options opts;
+  opts.stats_collector = &stats_collector;
+  TF_CHECK_OK(Run(flr1_, handle, opts, {x}, {&z}));
+  TF_CHECK_OK(flr1_->ReleaseHandle(handle));
+
+  TF_CHECK_OK(
+      InstantiateAndRun(flr1_, "FunctionWithControlOutputs", {}, {x}, {&z}));
+  test::ExpectTensorEqual<float>(z, test::AsTensor<float>({1.25 * 1.25}));
+
+  stats_collector.FinalizeAndSwap(&stats);
+
+  std::set<string> expected_node_names(
+      {"_SOURCE", "i", "add", "ret", "o_RetVal"});
+  std::set<string> executed_node_names;
+  for (const auto& node_stats : stats.dev_stats()[0].node_stats()) {
+    executed_node_names.insert(node_stats.node_name());
+  }
+  EXPECT_EQ(expected_node_names, executed_node_names);
+}
+
 // Constant folding generates names using a global counter.
 // This function invokes constant folding and parses the counter
 // from the generated node name.
diff --git a/tensorflow/core/common_runtime/function_threadpool_test.cc b/tensorflow/core/common_runtime/function_threadpool_test.cc
index 1b803736fb881c8f133198ab39e5801a357c5659..1dca25e0064e12c9b21c76102278e1bebdc67a4a 100644
--- a/tensorflow/core/common_runtime/function_threadpool_test.cc
+++ b/tensorflow/core/common_runtime/function_threadpool_test.cc
@@ -149,9 +149,9 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
     if (!status.ok()) return status;
 
     Status status2 = Run(flr, handle, opts, args, std::move(rets));
-    EXPECT_TRUE(errors::IsInvalidArgument(status2));
-    EXPECT_TRUE(
-        str_util::StrContains(status2.error_message(), "remote execution."));
+    EXPECT_TRUE(errors::IsNotFound(status2));
+    EXPECT_TRUE(str_util::StrContains(status2.error_message(), "Handle"));
+    EXPECT_TRUE(str_util::StrContains(status2.error_message(), "not found"));
 
     return status;
   }
diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
index 60e82ed13bc1362f40dedfb93e5c001d946bf77f..9c0abd97a1825bbaf7bdc76473bb79a817497e99 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
@@ -36,14 +36,17 @@ namespace tensorflow {
 namespace {
 
 static void CheckStats(Allocator* a, int64 num_allocs, int64 bytes_in_use,
-                       int64 max_bytes_in_use, int64 max_alloc_size) {
-  AllocatorStats stats;
-  a->GetStats(&stats);
-  LOG(INFO) << "Alloc stats: " << std::endl << stats.DebugString();
-  EXPECT_EQ(stats.bytes_in_use, bytes_in_use);
-  EXPECT_EQ(stats.max_bytes_in_use, max_bytes_in_use);
-  EXPECT_EQ(stats.num_allocs, num_allocs);
-  EXPECT_EQ(stats.max_alloc_size, max_alloc_size);
+                       int64 peak_bytes_in_use, int64 largest_alloc_size) {
+  absl::optional<AllocatorStats> stats = a->GetStats();
+  EXPECT_TRUE(stats);
+  if (!stats) {
+    return;
+  }
+  LOG(INFO) << "Alloc stats: " << std::endl << stats->DebugString();
+  EXPECT_EQ(stats->bytes_in_use, bytes_in_use);
+  EXPECT_EQ(stats->peak_bytes_in_use, peak_bytes_in_use);
+  EXPECT_EQ(stats->num_allocs, num_allocs);
+  EXPECT_EQ(stats->largest_alloc_size, largest_alloc_size);
 }
 
 TEST(GPUBFCAllocatorTest, NoDups) {
@@ -291,9 +294,10 @@ TEST(GPUBFCAllocatorTest, AllocationsAndDeallocationsWithGrowth) {
     a.DeallocateRaw(existing_ptrs[i]);
   }
 
-  AllocatorStats stats;
-  a.GetStats(&stats);
-  LOG(INFO) << "Alloc stats: \n" << stats.DebugString();
+  absl::optional<AllocatorStats> stats = a.GetStats();
+  if (stats) {
+    LOG(INFO) << "Alloc stats: \n" << stats->DebugString();
+  }
 }
 
 TEST(GPUBFCAllocatorTest, DISABLED_AllocatorReceivesZeroMemory) {
diff --git a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
index d85ca8892f6d19c2c10a5f35368a476506ecc370..4be1bbb7df37c1aa954ea3350f82eee5b15ad1bf 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #ifdef GOOGLE_CUDA
 #include "cuda/include/cuda.h"
 #include "tensorflow/stream_executor/cuda/cuda_activation.h"
+#include "tensorflow/stream_executor/cuda/cuda_driver_wrapper.h"
 #endif  // GOOGLE_CUDA
 
 #include "tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h"
@@ -41,7 +42,7 @@ void* GPUcudaMallocAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
   // allocate with cudaMalloc
   se::cuda::ScopedActivateExecutorContext scoped_activation{stream_exec_};
   CUdeviceptr rv = 0;
-  CUresult res = cuMemAlloc(&rv, num_bytes);
+  CUresult res = tensorflow::wrap::cuMemAlloc(&rv, num_bytes);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "cuMemAlloc failed to allocate " << num_bytes;
     return nullptr;
@@ -54,7 +55,8 @@ void* GPUcudaMallocAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
 void GPUcudaMallocAllocator::DeallocateRaw(void* ptr) {
 #ifdef GOOGLE_CUDA
   // free with cudaFree
-  CUresult res = cuMemFree(reinterpret_cast<CUdeviceptr>(ptr));
+  CUresult res =
+      tensorflow::wrap::cuMemFree(reinterpret_cast<CUdeviceptr>(ptr));
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "cuMemFree failed to free " << ptr;
   }
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
index 989ddbe4af53ee200f994ea8e3f2ae42e5bcab7f..0727196e1ceed88063a666a6a45fb139386203aa 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
@@ -44,8 +44,9 @@ bool CheckMask(se::StreamExecutor* exec, void* ptr, int64* mask) {
   se::DeviceMemory<int64> gpu_ptr{se::DeviceMemoryBase{ptr, MASK_BYTES}};
   int64 tmp[MASK_WORDS];
 
-  if (!exec->SynchronousMemcpy(&tmp, gpu_ptr, MASK_BYTES)) {
-    LOG(FATAL) << "Could not copy debug mask";
+  Status result = exec->SynchronousMemcpyD2H(gpu_ptr, MASK_BYTES, tmp);
+  if (!result.ok()) {
+    LOG(FATAL) << "Could not copy debug mask, " << result;
   }
 
   bool ok = true;
@@ -63,8 +64,9 @@ bool CheckMask(se::StreamExecutor* exec, void* ptr, int64* mask) {
 
 void InitMask(se::StreamExecutor* exec, void* ptr, int64* mask) {
   se::DeviceMemory<int64> gpu_ptr{se::DeviceMemoryBase{ptr, MASK_BYTES}};
-  if (!exec->SynchronousMemcpy(&gpu_ptr, mask, MASK_BYTES)) {
-    LOG(FATAL) << "Could not copy debug mask";
+  Status result = exec->SynchronousMemcpyH2D(mask, MASK_BYTES, &gpu_ptr);
+  if (!result.ok()) {
+    LOG(FATAL) << "Could not copy debug mask, " << result;
   }
 }
 
@@ -130,8 +132,8 @@ int64 GPUDebugAllocator::AllocationId(const void* ptr) {
                                        MASK_BYTES);
 }
 
-void GPUDebugAllocator::GetStats(AllocatorStats* stats) {
-  base_allocator_->GetStats(stats);
+absl::optional<AllocatorStats> GPUDebugAllocator::GetStats() {
+  return base_allocator_->GetStats();
 }
 
 void GPUDebugAllocator::ClearStats() { base_allocator_->ClearStats(); }
@@ -171,8 +173,10 @@ void* GPUNanResetAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
   se::DeviceMemory<float> nan_ptr{
       se::DeviceMemoryBase{static_cast<float*>(allocated_ptr), req_size}};
 
-  if (!stream_exec_->SynchronousMemcpy(&nan_ptr, &nans[0], req_size)) {
-    LOG(ERROR) << "Could not initialize to NaNs";
+  Status result =
+      stream_exec_->SynchronousMemcpyH2D(&nans[0], req_size, &nan_ptr);
+  if (!result.ok()) {
+    LOG(ERROR) << "Could not initialize to NaNs, " << result;
   }
 
   return allocated_ptr;
@@ -185,8 +189,10 @@ void GPUNanResetAllocator::DeallocateRaw(void* ptr) {
                             std::nanf(""));
     se::DeviceMemory<float> nan_ptr{
         se::DeviceMemoryBase{static_cast<float*>(ptr), req_size}};
-    if (!stream_exec_->SynchronousMemcpy(&nan_ptr, &nans[0], req_size)) {
-      LOG(ERROR) << "Could not initialize to NaNs";
+    Status result =
+        stream_exec_->SynchronousMemcpyH2D(&nans[0], req_size, &nan_ptr);
+    if (!result.ok()) {
+      LOG(ERROR) << "Could not initialize to NaNs, " << result;
     }
   }
 
@@ -202,8 +208,8 @@ size_t GPUNanResetAllocator::AllocatedSize(const void* ptr) {
   return base_allocator_->AllocatedSize(ptr);
 }
 
-void GPUNanResetAllocator::GetStats(AllocatorStats* stats) {
-  base_allocator_->GetStats(stats);
+absl::optional<AllocatorStats> GPUNanResetAllocator::GetStats() {
+  return base_allocator_->GetStats();
 }
 
 void GPUNanResetAllocator::ClearStats() { base_allocator_->ClearStats(); }
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
index 17757a106c5c20939b2c2d3525efc1ad659c2902..fa0394c19d0f6c910aeb5847a2e765f292f9de88 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
@@ -43,7 +43,7 @@ class GPUDebugAllocator : public Allocator {
   size_t RequestedSize(const void* ptr) override;
   size_t AllocatedSize(const void* ptr) override;
   int64 AllocationId(const void* ptr) override;
-  void GetStats(AllocatorStats* stats) override;
+  absl::optional<AllocatorStats> GetStats() override;
   void ClearStats() override;
 
   // For testing.
@@ -71,7 +71,7 @@ class GPUNanResetAllocator : public Allocator {
   void DeallocateRaw(void* ptr) override;
   size_t RequestedSize(const void* ptr) override;
   size_t AllocatedSize(const void* ptr) override;
-  void GetStats(AllocatorStats* stats) override;
+  absl::optional<AllocatorStats> GetStats() override;
   void ClearStats() override;
 
  private:
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index 5152d97fdefed688ba05043072ff6df635471ed9..607193a0013878f33bb96c8b2540f067da8b3284 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -276,6 +276,24 @@ BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name,
       sync_every_op_(sync_every_op),
       max_streams_(max_streams) {
   GPUProcessState::singleton()->EnableGPUDevice();
+  pending_cap_ = options.config.gpu_options().experimental().pending_cap();
+  timestamped_allocator_ =
+      options.config.gpu_options().experimental().timestamped_allocator();
+  if (timestamped_allocator_ || pending_cap_ > 0) {
+    SharedCounter* timing_counter = nullptr;
+    if (timestamped_allocator_) {
+      // In this case the SharedCounter was already created and set in the
+      // associated Allocator, with ownership by GPUProcessState.
+      // The GPUKernelTracker will use this SharedCounter, instead of
+      // owning its own.
+      timing_counter =
+          GPUProcessState::singleton()->GPUAllocatorCounter(tf_gpu_id);
+      DCHECK(timing_counter);
+    } else {
+      DCHECK_GT(pending_cap_, 0);
+    }
+    kernel_tracker_.reset(new GPUKernelTracker(Env::Default(), timing_counter));
+  }
 }
 
 BaseGPUDevice::~BaseGPUDevice() {
@@ -508,6 +526,10 @@ void BaseGPUDevice::ComputeHelper(OpKernel* op_kernel,
       if (idc->stream() != stream) stream->ThenWaitFor(idc->stream());
     }
   }
+  if (pending_cap_ > 0) {
+    DCHECK(kernel_tracker_);
+    kernel_tracker_->PauseWhilePendingExceeds(pending_cap_);
+  }
   se::cuda::ScopedActivateExecutorContext scoped_activation{stream->parent()};
   op_kernel->Compute(context);
   if (context->status().ok()) {
@@ -525,6 +547,14 @@ void BaseGPUDevice::ComputeHelper(OpKernel* op_kernel,
       VLOG(1) << "GpuDevice::ComputeHelper scheduled "
               << ComputeOpKernelDebugString(*op_kernel, stream_id);
     }
+    if (kernel_tracker_) {
+      GPUKernelTracker* tracker = kernel_tracker_.get();
+      DCHECK(tracker);
+      uint64 queued_count = tracker->RecordQueued();
+      em_->ThenExecute(stream, [op_kernel, tracker, queued_count]() {
+        tracker->RecordTerminated(queued_count);
+      });
+    }
   } else {
     if (vlog_1) {
       VLOG(1) << "GpuDevice::ComputeHelper failed to schedule "
@@ -601,7 +631,9 @@ Status BaseGPUDevice::MaybeCopyTensorToGPU(
         [to, copy](StatusCallback done_,
                    // Begin unbound arguments.
                    const Status& s) {
-          *to = std::move(*copy);
+          if (s.ok()) {
+            *to = std::move(*copy);
+          }
           delete copy;
           done_(s);
         },
@@ -629,7 +661,8 @@ Status BaseGPUDevice::MakeTensorFromProto(const TensorProto& tensor_proto,
 
   if (parsed.dtype() == DT_VARIANT) {
     const Variant* from = parsed.flat<Variant>().data();
-    Tensor copy(cpu_allocator(), DT_VARIANT, parsed.shape());
+    int numa_node = attributes().locality().numa_node();
+    Tensor copy(cpu_allocator(numa_node), DT_VARIANT, parsed.shape());
     Variant* copy_variant = copy.flat<Variant>().data();
 
     std::list<Notification> notifications;
@@ -718,8 +751,8 @@ Status ParseVisibleDeviceList(const string& visible_device_list,
       if (!strings::safe_strto32(platform_gpu_id_str, &platform_gpu_id)) {
         return errors::InvalidArgument(
             "Could not parse entry in 'visible_device_list': '",
-            platform_gpu_id_str, "'. visible_device_list = ",
-            visible_device_list);
+            platform_gpu_id_str,
+            "'. visible_device_list = ", visible_device_list);
       }
       if (platform_gpu_id < 0 ||
           platform_gpu_id >= gpu_manager->VisibleDeviceCount()) {
@@ -954,15 +987,15 @@ Status BaseGPUDeviceFactory::CreateDevices(
     for (PlatformGpuId platform_gpu_id : valid_platform_gpu_ids) {
       err = cudaSetDevice(platform_gpu_id.value());
       if (err != cudaSuccess) {
-        return errors::Internal("cudaSetDevice() on GPU:",
-                                platform_gpu_id.value(), " failed. Status: ",
-                                cudaGetErrorString(err));
+        return errors::Internal(
+            "cudaSetDevice() on GPU:", platform_gpu_id.value(),
+            " failed. Status: ", cudaGetErrorString(err));
       }
       err = cudaFree(nullptr);
       if (err != cudaSuccess) {
         return errors::Internal("CUDA runtime implicit initialization on GPU:",
-                                platform_gpu_id.value(), " failed. Status: ",
-                                cudaGetErrorString(err));
+                                platform_gpu_id.value(),
+                                " failed. Status: ", cudaGetErrorString(err));
       }
     }
     // Reset to the original device.
@@ -1097,21 +1130,24 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(
                             tf_gpu_id.value(), " with ", memory_limit,
                             " bytes of memory.");
   }
-  AllocatorStats stats;
-  gpu_allocator->GetStats(&stats);
+  absl::optional<AllocatorStats> stats = gpu_allocator->GetStats();
+  if (!stats) {
+    return errors::Internal("No allocator statistics");
+  }
   // 'memory_limit' is the required memory size, but if the allocator with given
   // tf_gpu_id was created before, we'll use it instead of creating a new one
   // (as TF gpu device is a shared resource), in which case the actual memory
   // limit represented by 'stats.bytes_limit' used by that allocator may be
   // different (which should be an error).
   //
-  // TODO(laigd): report error if memory_limit doesn't match stats.bytes_limit.
+  // TODO(laigd): report error if memory_limit doesn't match stats->bytes_limit.
+  int64 bytes_limit = stats->bytes_limit ? *stats->bytes_limit : 0;
   std::unique_ptr<BaseGPUDevice> gpu_device = CreateGPUDevice(
-      options, device_name, static_cast<Bytes>(stats.bytes_limit), dev_locality,
+      options, device_name, static_cast<Bytes>(bytes_limit), dev_locality,
       tf_gpu_id, GetShortDeviceDescription(platform_gpu_id, desc),
       gpu_allocator, ProcessState::singleton()->GetCPUAllocator(numa_node));
   LOG(INFO) << "Created TensorFlow device (" << device_name << " with "
-            << (stats.bytes_limit >> 20) << " MB memory) -> physical GPU ("
+            << (bytes_limit >> 20) << " MB memory) -> physical GPU ("
             << GetShortDeviceDescription(platform_gpu_id, desc) << ")";
   TF_RETURN_IF_ERROR(gpu_device->Init(options));
   devices->push_back(std::move(gpu_device));
@@ -1514,6 +1550,115 @@ Status BaseGPUDeviceFactory::GetValidDeviceIds(
   return Status::OK();
 }
 
+uint64 BaseGPUDevice::SafeAllocFrontier() {
+  if (timestamped_allocator_) {
+    return kernel_tracker_->LastTerminatedCount();
+  } else {
+    return 0;
+  }
+}
+
+int BaseGPUDevice::PendingKernels() {
+  if (kernel_tracker_) {
+    return kernel_tracker_->NumPending();
+  }
+  return 0;
+}
+
+uint64 GPUKernelTracker::RecordQueued() {
+  mutex_lock l(mu_);
+  uint64 queued_count = timing_counter_->next();
+  VLOG(2) << "RecordQueued queued_count=" << queued_count
+          << " first_available_=" << first_available_
+          << " last_completed_=" << last_completed_
+          << " num_pending_=" << num_pending_;
+  pending_kernels_[first_available_].queued_count = queued_count;
+  pending_kernels_[first_available_].terminated = false;
+  ++first_available_;
+  ++num_pending_;
+  if (first_available_ >= pending_kernels_.size()) {
+    first_available_ = 0;
+  }
+  if (first_available_ == last_completed_) {
+    // Ring buffer is full: double it.  All of the same valid PendingKernel
+    // entries exist after the copy, they are just shifted to begin
+    // at index 0 in the new array.
+    std::vector<PendingKernel> new_buffer(pending_kernels_.size() * 2);
+    for (int i = 0; i < pending_kernels_.size(); ++i) {
+      int j = (i + last_completed_) % pending_kernels_.size();
+      new_buffer[i] = pending_kernels_[j];
+    }
+    last_completed_ = 0;
+    first_available_ = pending_kernels_.size();
+    pending_kernels_.swap(new_buffer);
+    VLOG(1) << "last_completed_=" << last_completed_
+            << " first_available_=" << first_available_
+            << " num_pending_=" << num_pending_;
+  }
+  DCHECK_NE(first_available_, last_completed_) << "exhausted pending_kernels";
+  return queued_count;
+}
+
+void GPUKernelTracker::RecordTerminated(uint64 queued_count) {
+  mutex_lock l(mu_);
+  VLOG(2) << "RecordTerminated queued_count=" << queued_count
+          << " first_available_=" << first_available_
+          << " last_completed_=" << last_completed_
+          << " num_pending_=" << num_pending_ << " LC="
+          << ((last_completed_ >= 0)
+                  ? pending_kernels_[last_completed_].queued_count
+                  : -1);
+  DCHECK_NE(first_available_, last_completed_);
+  DCHECK_GT(num_pending_, 0);
+  // Starting just past the last completed entry, find the entry with
+  // this queued_count and mark it done.
+  int index = (last_completed_ + 1) % pending_kernels_.size();
+  while (true) {
+    if (index == first_available_) {
+      // This should never happen.
+      LOG(FATAL) << "Failed to find " << queued_count  // Crash OK
+                 << " in queue";
+    }
+    if (pending_kernels_[index].queued_count == queued_count) {
+      pending_kernels_[index].terminated = true;
+      break;
+    }
+    index = (index + 1) % pending_kernels_.size();
+  }
+  // Next move last_completed_ forward past all completed kernels.  In theory
+  // kernels should always complete in queued order so we should be able to
+  // advance the completed frontier to the last queued PendingKernel.  In
+  // practice we occassionally see the termination callbacks arrive out of order
+  // probably because of thread scheduling.  Eventually we may support out-of-
+  // order completion involving multple compute streams so here we follow a
+  // conservative approach and wait for every single callback to arrive before
+  // advancing the frontier.
+  while (true) {
+    int next_index = (last_completed_ + 1) % pending_kernels_.size();
+    if (next_index == first_available_) break;
+    if (pending_kernels_[next_index].terminated) {
+      last_completed_ = next_index;
+    } else {
+      break;
+    }
+  }
+  // Last decrease num_pending before maybe waking a waiter.
+  --num_pending_;
+  pending_decreased_.notify_one();
+}
+
+uint64 GPUKernelTracker::LastTerminatedCount() {
+  mutex_lock l(mu_);
+  if (last_completed_ < 0) {
+    // This is an edge case that can be encountered only at the beginning of
+    // execution.  There's not yet a safe threshold count. We don't want to
+    // return 0 since that bypasses the count mechanism in BFCAllocator, so
+    // return the least non-zero value.
+    return 1;
+  }
+  return pending_kernels_[last_completed_].queued_count;
+}
+
 }  // namespace tensorflow
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h
index d002d02c51d073ef3019fa1659d555b5d092d883..f8f2a2e2f3221cc3b6acd571ebf2037fb33cf17f 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.h
@@ -34,6 +34,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/gpu_device_context.h"
 #include "tensorflow/core/common_runtime/local_device.h"
 #include "tensorflow/core/common_runtime/scoped_allocator_mgr.h"
+#include "tensorflow/core/common_runtime/shared_counter.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -46,6 +47,7 @@ limitations under the License.
 #include "tensorflow/core/public/session_options.h"
 
 namespace tensorflow {
+class GPUKernelTracker;
 
 class BaseGPUDevice : public LocalDevice {
  public:
@@ -114,6 +116,17 @@ class BaseGPUDevice : public LocalDevice {
     return scoped_allocator_mgr_.get();
   }
 
+  // The following two functions always return 0 unless one of the
+  // related experimental config options has been specified.
+
+  // If returned value is > 0 then GPU Memory chunks freed before this count
+  // are guaranteed not to be in use by any kernel pending on this device.
+  uint64 SafeAllocFrontier() override;
+
+  // Returns the number of kernels that have been queued for execution on
+  // the compute stream and are not yet known to have completed.
+  int PendingKernels();
+
  protected:
   Allocator* gpu_allocator_;  // not owned
   Allocator* cpu_allocator_;  // not owned
@@ -141,6 +154,9 @@ class BaseGPUDevice : public LocalDevice {
   const int32 max_streams_;
   std::unique_ptr<EventMgr> em_;
   std::unique_ptr<thread::ThreadPool> thread_pool_;
+  std::unique_ptr<GPUKernelTracker> kernel_tracker_;
+  int pending_cap_ = 0;
+  bool timestamped_allocator_ = false;
 
   // Initialize scractch buffers used by Eigen.
   Status InitScratchBuffers();
@@ -163,6 +179,83 @@ class BaseGPUDevice : public LocalDevice {
                               StatusCallback done);
 };
 
+// A per-compute-stream utility that keeps track of kernels that have been
+// queued for execution but may not yet have terminated, and also the queued
+// time of the most recently terminated kernel.
+class GPUKernelTracker {
+ public:
+  // If we're going to share a SharedCounter with an allocator, it's owned
+  // by the allocator because allocators are initialized once per process.
+  // Devices are per-session.
+  explicit GPUKernelTracker(Env* env, SharedCounter* timing_counter)
+      : env_(env), timing_counter_(timing_counter), pending_kernels_(64) {
+    if (!timing_counter_) {
+      // There's not a preexisting counter owned by GPUProcessState, i.e.
+      // pending_cap > 0 but timestamped_allocator == false.
+      owned_counter_.reset(new SharedCounter);
+      timing_counter_ = owned_counter_.get();
+    }
+  }
+
+  // Record that a GPU kernel has just been enqueued on the compute stream.
+  // Inserts a new timing counter value in a new PendingKernel record appended
+  // to the end of the ring buffer then returns that same count.
+  uint64 RecordQueued();
+
+  // Takes a count value returned by RecordQueued and finds the corresponding
+  // PendingKernel record in the ring buffer.  Marks the kernel as completed and
+  // advances the completion frontier accordingly.
+  void RecordTerminated(uint64 at_count);
+
+  // Returns the largest timing count such that all kernels queued no
+  // later than that count are known to have terminated.
+  uint64 LastTerminatedCount();
+
+  // Returns the number of kernels enqueued that are not yet known to
+  // have terminated.
+  int NumPending() {
+    mutex_lock l(mu_);
+    return num_pending_;
+  }
+
+  // Yield current thread until number of pending kernels no longer
+  // exceeds the cap.
+  void PauseWhilePendingExceeds(int cap) {
+    mutex_lock l(mu_);
+    while (num_pending_ > cap) {
+      pending_decreased_.wait(l);
+    }
+  }
+
+ private:
+  Env* env_;
+  SharedCounter* timing_counter_;
+  std::unique_ptr<SharedCounter> owned_counter_;
+
+  // Records when a kernel was queued for execution.  Kernel launches are
+  // identified by a unique count value from a per-GPU device timing counter.
+  struct PendingKernel {
+    uint64 queued_count;
+    bool terminated;
+    PendingKernel(const PendingKernel& pk)
+        : queued_count(pk.queued_count), terminated(pk.terminated) {}
+    PendingKernel() : queued_count(0), terminated(false) {}
+  };
+  mutex mu_;
+  // Ring buffer of PendingKernel records.
+  std::vector<PendingKernel> pending_kernels_ GUARDED_BY(mu_);
+  // Next unused slot in pending_kernels_.
+  int first_available_ GUARDED_BY(mu_) = 0;
+  // Last completed PendingKernel such that all prior PendingKernels are
+  // also completed.  With out-of-order completion there may be a mixture
+  // of completed and uncompleted entries between last_completed_ and
+  // first_available_, hence num_pending_ is not guaranteed equal to
+  // their differerence.
+  int last_completed_ GUARDED_BY(mu_) = -1;
+  int num_pending_ GUARDED_BY(mu_) = 0;
+  condition_variable pending_decreased_ GUARDED_BY(mu_);
+};
+
 class BaseGPUDeviceFactory : public DeviceFactory {
  public:
   Status CreateDevices(const SessionOptions& options, const string& name_prefix,
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc b/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc
index 8dc719732927880e6ebb628962160c4a90b1f25c..962891894ad63c40036a153ebe5d4666f0e43049 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"
 #include "tensorflow/core/common_runtime/threadpool_device.h"
+#include "tensorflow/core/platform/numa.h"
 
 namespace tensorflow {
 
@@ -81,7 +82,8 @@ class GPUCompatibleCPUDevice : public ThreadPoolDevice {
   GPUCompatibleCPUDevice(const SessionOptions& options, const string& name,
                          Bytes memory_limit, const DeviceLocality& locality,
                          Allocator* allocator)
-      : ThreadPoolDevice(options, name, memory_limit, locality, allocator) {
+      : ThreadPoolDevice(options, name, memory_limit, locality, allocator),
+        numa_node_(locality.numa_node()) {
     if (options.config.has_gpu_options()) {
       force_gpu_compatible_ =
           options.config.gpu_options().force_gpu_compatible();
@@ -92,7 +94,7 @@ class GPUCompatibleCPUDevice : public ThreadPoolDevice {
   Allocator* GetAllocator(AllocatorAttributes attr) override {
     GPUProcessState* ps = GPUProcessState::singleton();
     if (attr.gpu_compatible() || force_gpu_compatible_) {
-      return ps->GetCUDAHostAllocator(0);
+      return ps->GetCUDAHostAllocator(numa_node_);
     } else {
       // Call the parent's implementation.
       return ThreadPoolDevice::GetAllocator(attr);
@@ -101,6 +103,7 @@ class GPUCompatibleCPUDevice : public ThreadPoolDevice {
 
  private:
   bool force_gpu_compatible_ = false;
+  int numa_node_ = port::kNUMANoAffinity;
 };
 
 // The associated factory.
@@ -113,10 +116,17 @@ class GPUCompatibleCPUDeviceFactory : public DeviceFactory {
     if (iter != options.config.device_count().end()) {
       n = iter->second;
     }
+    int num_numa_nodes = options.config.experimental().use_numa_affinity()
+                             ? port::NUMANumNodes()
+                             : 1;
     for (int i = 0; i < n; i++) {
       string name = strings::StrCat(name_prefix, "/device:CPU:", i);
+      int numa_node = i % num_numa_nodes;
+      DeviceLocality locality;
+      locality.set_numa_node(numa_node);
       devices->push_back(absl::make_unique<GPUCompatibleCPUDevice>(
-          options, name, Bytes(256 << 20), DeviceLocality(), cpu_allocator()));
+          options, name, Bytes(256 << 20), DeviceLocality(),
+          ProcessState::singleton()->GetCPUAllocator(numa_node)));
     }
 
     return Status::OK();
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
index ae623b2adbe152de6cbad248db234ac5469f83e1..2628cd413faf63fdf9eee82e263dabc75ca01669 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
+#include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -276,6 +277,70 @@ TEST_F(GPUDeviceTest, UnifiedMemoryAllocation) {
   allocator->DeallocateRaw(ptr);
 }
 
+class GPUKernelTrackerTest : public ::testing::Test {
+ protected:
+  void SetUp() {
+    timing_counter_.reset(new SharedCounter);
+    kernel_tracker_.reset(
+        new GPUKernelTracker(Env::Default(), timing_counter_.get()));
+  }
+
+  std::unique_ptr<GPUKernelTracker> kernel_tracker_;
+  std::unique_ptr<SharedCounter> timing_counter_;
+};
+
+TEST_F(GPUKernelTrackerTest, basic) {
+  EXPECT_EQ(0, kernel_tracker_->NumPending());
+  // 1 is the expected value when no kernels have yet terminated.
+  EXPECT_EQ(1, kernel_tracker_->LastTerminatedCount());
+
+  std::deque<int64> queued_counts;
+  for (int i = 0; i < 32; ++i) {
+    queued_counts.push_back(kernel_tracker_->RecordQueued());
+  }
+  EXPECT_EQ(32, kernel_tracker_->NumPending());
+  EXPECT_EQ(1, kernel_tracker_->LastTerminatedCount());
+
+  // Mature the kernels in order until empty.
+  while (!queued_counts.empty()) {
+    int64 x = queued_counts.front();
+    queued_counts.pop_front();
+    kernel_tracker_->RecordTerminated(x);
+    EXPECT_EQ(queued_counts.size(), kernel_tracker_->NumPending());
+    EXPECT_EQ(x, kernel_tracker_->LastTerminatedCount());
+  }
+  EXPECT_EQ(timing_counter_->get(), kernel_tracker_->LastTerminatedCount());
+
+  // Next inject so many kernel events that the ring buffer needs
+  // to grow a couple of times, while maturing a few in random order
+  // to introduce gaps between last_completed_ and first_available_.
+  int64 lower_bound = timing_counter_->get();
+  for (int i = 0; i < 1111; ++i) {
+    queued_counts.push_back(kernel_tracker_->RecordQueued());
+    int64 upper_bound = timing_counter_->get();
+    if (0 == (i % 16)) {
+      size_t index = (random::New64() % queued_counts.size());
+      kernel_tracker_->RecordTerminated(queued_counts[index]);
+      queued_counts.erase(queued_counts.begin() + index);
+      EXPECT_LE(lower_bound, kernel_tracker_->LastTerminatedCount());
+      EXPECT_GE(upper_bound, kernel_tracker_->LastTerminatedCount());
+    }
+  }
+
+  // Next mature the remaining kernels in order until empty.
+  while (!queued_counts.empty()) {
+    int64 x = queued_counts.front();
+    queued_counts.pop_front();
+    kernel_tracker_->RecordTerminated(x);
+    EXPECT_EQ(queued_counts.size(), kernel_tracker_->NumPending());
+    // There may be a gap here where we find a kernel that got terminated
+    // out of order, earlier, so the LastTerminatedCount can actually
+    // jump past x.
+    EXPECT_LE(x, kernel_tracker_->LastTerminatedCount());
+  }
+  EXPECT_EQ(timing_counter_->get(), kernel_tracker_->LastTerminatedCount());
+}
+
 }  // namespace tensorflow
 
 #endif
diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc
index 3c1c31aa732d373e76599cdc8fe8ae8561765c9c..6531d6d367b1407d89da16f2023f72b75903daf9 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc
@@ -241,7 +241,9 @@ void EventMgr::QueueInUse(se::Stream* stream, InUse iu) {
 // events have recorded, and then retire them.  Initial observations
 // suggest that typical behavior in a TensorFlow program is to have
 // 0-3 events pending most of the time, but there are occasionally
-// spikes of up to several hundred outstanding.
+// spikes of up to several hundred outstanding.  (If GPUKernelTracker
+// is used to cap pending kernels there should never be more than
+// that many.)
 //
 // NOTE: If all events are on the same stream, no later event will
 // complete before an earlier event, except possibly if the earlier
@@ -249,13 +251,10 @@ void EventMgr::QueueInUse(se::Stream* stream, InUse iu) {
 // looking past the first kPending event.  However, if we're using
 // multiple streams there may be some gain in looking deeper.
 // As a compromise, PollEvent() calls that are triggered by the queueing
-// of a single event never look past the first kPending event.  Calls
-// coming from the dedicated polling thread always sweep the full queue.
-//
-// Note that allowing the queue to grow very long could cause overall
-// GPU memory use to spike needlessly.  An alternative strategy would
-// be to throttle new Op execution until the pending event queue
-// clears.
+// of a single event never look past the first kPending event.  Consequently
+// those calls do an expected constant amount of work, unaffected by the
+// length of the pending queue.  Calls coming from the dedicated
+// polling thread always sweep the full queue.
 void EventMgr::PollEvents(bool is_dedicated_poller,
                           gtl::InlinedVector<InUse, 4>* to_free) {
   VLOG(2) << "PollEvents  free_events_ " << free_events_.size()
diff --git a/tensorflow/core/common_runtime/gpu/gpu_managed_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_managed_allocator.cc
index 613633eb910381b530b350a22c0b557bb108e968..23758966068ccd326d474e36ee62e37cbcd63476 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_managed_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_managed_allocator.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #ifdef GOOGLE_CUDA
+#include "tensorflow/stream_executor/cuda/cuda_driver_wrapper.h"
 #define EIGEN_USE_GPU
 #endif
 
@@ -24,7 +25,11 @@ namespace tensorflow {
 void* GpuManagedAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
   void* ptr = nullptr;
 #ifdef GOOGLE_CUDA
-  CHECK_EQ(cudaMallocManaged(&ptr, num_bytes), cudaSuccess);
+  CUdeviceptr result = 0;
+  CHECK_EQ(tensorflow::wrap::cuMemAllocManaged(&result, num_bytes,
+                                               CU_MEM_ATTACH_GLOBAL),
+           CUDA_SUCCESS);
+  ptr = reinterpret_cast<void*>(result);
 #endif
   CHECK(!(reinterpret_cast<uintptr_t>(ptr) & (alignment - 1)));
   return ptr;
diff --git a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
index 8167cfb9d7dc6cd91a17323b3083d1823cbaa5e0..39883d34f9e5a731cac4c6172c6a4caed9db6602 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
 #include "tensorflow/core/common_runtime/pool_allocator.h"
+#include "tensorflow/core/common_runtime/shared_counter.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/log_memory.h"
 #include "tensorflow/core/framework/tracking_allocator.h"
@@ -90,7 +91,7 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
   }
 
   AllocatorParts& allocator_parts = gpu_allocators_[tf_gpu_id.value()];
-  if (allocator_parts.allocator.get() == nullptr) {
+  if (allocator_parts.allocator == nullptr) {
     // Validate allocator types.
     if (!allocator_type.empty() && allocator_type != "BFC") {
       LOG(ERROR) << "Invalid allocator type: " << allocator_type;
@@ -110,9 +111,15 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
         (options.per_process_gpu_memory_fraction() > 1.0 ||
          options.experimental().use_unified_memory()),
         gpu_visitors_[bus_id], {});
-    Allocator* gpu_allocator =
+    GPUBFCAllocator* gpu_bfc_allocator =
         new GPUBFCAllocator(sub_allocator, total_bytes, options,
                             strings::StrCat("GPU_", tf_gpu_id.value(), "_bfc"));
+    Allocator* gpu_allocator = gpu_bfc_allocator;
+    SharedCounter* timing_counter = nullptr;
+    if (options.experimental().timestamped_allocator()) {
+      timing_counter = new SharedCounter;
+      gpu_bfc_allocator->SetTimingCounter(timing_counter);
+    }
 
     // If true, checks for memory overwrites by writing
     // distinctive patterns on both ends of allocated memory.
@@ -137,7 +144,9 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
       recording_allocator = new internal::RecordingAllocator(
           &process_state_->mem_desc_map_, gpu_allocator, md, &mu_);
     }
-    allocator_parts = {std::unique_ptr<Allocator>(gpu_allocator), sub_allocator,
+    allocator_parts = {std::unique_ptr<Allocator>(gpu_allocator),
+                       std::unique_ptr<SharedCounter>(timing_counter),
+                       sub_allocator,
                        std::unique_ptr<Allocator>(recording_allocator)};
   }
   if (process_state_->ProcessState::FLAGS_brain_gpu_record_mem_types) {
@@ -151,6 +160,22 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
 #endif  // GOOGLE_CUDA
 }
 
+SharedCounter* GPUProcessState::GPUAllocatorCounter(TfGpuId tf_gpu_id) {
+  DCHECK(process_state_);
+#if GOOGLE_CUDA
+  GpuIdUtil::CheckValidTfGpuId(tf_gpu_id);
+  mutex_lock l(mu_);
+  if (tf_gpu_id.value() >= static_cast<int64>(gpu_allocators_.size())) {
+    return nullptr;
+  }
+
+  AllocatorParts& allocator_parts = gpu_allocators_[tf_gpu_id.value()];
+  return allocator_parts.counter.get();
+#else
+  return nullptr;
+#endif
+}
+
 Allocator* GPUProcessState::GetCUDAHostAllocator(int numa_node) {
   CHECK(process_state_);
   if (!HasGPUDevice() ||
@@ -224,6 +249,7 @@ Allocator* GPUProcessState::GetCUDAHostAllocator(int numa_node) {
       allocator = new TrackingAllocator(allocator, true);
     }
     cuda_host_allocators_.push_back({std::unique_ptr<Allocator>(allocator),
+                                     std::unique_ptr<SharedCounter>(nullptr),
                                      sub_allocator,
                                      std::unique_ptr<Allocator>(nullptr)});
     AllocatorParts& allocator_parts = cuda_host_allocators_.back();
diff --git a/tensorflow/core/common_runtime/gpu/gpu_process_state.h b/tensorflow/core/common_runtime/gpu/gpu_process_state.h
index df51c10c8065fa94d736c8f4dfa76faebdc8bc62..861157ca0c6150be6e430e184fe6adb1a326f6fd 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_process_state.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_process_state.h
@@ -23,6 +23,7 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/common_runtime/process_state.h"
+#include "tensorflow/core/common_runtime/shared_counter.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/thread_annotations.h"
@@ -33,6 +34,7 @@ namespace tensorflow {
 
 class Allocator;
 class PoolAllocator;
+class SharedCounter;
 
 // Singleton that manages per-process state when GPUs are present.
 class GPUProcessState {
@@ -108,6 +110,8 @@ class GPUProcessState {
   // Returns bus_id for the given GPU id.
   virtual int BusIdForGPU(TfGpuId tf_gpu_id);
 
+  SharedCounter* GPUAllocatorCounter(TfGpuId tf_gpu_id);
+
  protected:
   // GPUProcessState is a singleton that should not normally be deleted except
   // at process shutdown.
@@ -132,6 +136,7 @@ class GPUProcessState {
 
   struct AllocatorParts {
     std::unique_ptr<Allocator> allocator;
+    std::unique_ptr<SharedCounter> counter;
     SubAllocator* sub_allocator;  // owned by allocator
     std::unique_ptr<Allocator> recording_allocator;
   };
diff --git a/tensorflow/core/common_runtime/gpu/gpu_util_platform_specific.cc b/tensorflow/core/common_runtime/gpu/gpu_util_platform_specific.cc
index 4bc88ffc8c3950176ae05f32c774f2f2971a4e34..0ef39fb3d78044a8611b315afbdeb4975a3af15f 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_util_platform_specific.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_util_platform_specific.cc
@@ -37,6 +37,14 @@ void GPUDeviceContext::CopyDeviceTensorToCPU(const Tensor* device_tensor,
   GPUUtil::CopyGPUTensorToCPU(device, this, device_tensor, cpu_tensor, done);
 }
 
+void GPUDeviceContext::CopyTensorInSameDevice(const Tensor* input_tensor,
+                                              Device* device,
+                                              Tensor* output_tensor,
+                                              StatusCallback done) const {
+  GPUUtil::CopyGPUTensorToSameGPU(device, this, input_tensor, output_tensor,
+                                  done);
+}
+
 Status GPUDeviceContext::ThenExecute(Device* device, se::Stream* stream,
                                      std::function<void()> func) {
   const DeviceBase::GpuDeviceInfo* gpu_info =
diff --git a/tensorflow/core/common_runtime/gpu_device_context.h b/tensorflow/core/common_runtime/gpu_device_context.h
index 3603808152748009f29d1d01f0eeee0dd8b6ab0e..f5135267241db94a0afdd9845b09dbfdda242ecc 100644
--- a/tensorflow/core/common_runtime/gpu_device_context.h
+++ b/tensorflow/core/common_runtime/gpu_device_context.h
@@ -57,6 +57,10 @@ class GPUDeviceContext : public DeviceContext {
                              Device* device, Tensor* cpu_tensor,
                              StatusCallback done) override;
 
+  void CopyTensorInSameDevice(const Tensor* input_tensor, Device* device,
+                              Tensor* output_tensor,
+                              StatusCallback done) const override;
+
   void MaintainLifetimeOnStream(const Tensor* t,
                                 se::Stream* stream) const override {}
 
diff --git a/tensorflow/core/common_runtime/graph_execution_state.cc b/tensorflow/core/common_runtime/graph_execution_state.cc
index 880806f120d010a812bbced62409a1ff5ed8e9d7..b185ea1fa50fb866b36e928f441cd267a8da4301 100644
--- a/tensorflow/core/common_runtime/graph_execution_state.cc
+++ b/tensorflow/core/common_runtime/graph_execution_state.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/metrics.h"
 #include "tensorflow/core/common_runtime/optimization_registry.h"
 #include "tensorflow/core/common_runtime/placer.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
@@ -32,6 +33,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/graph/collective_order.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/subgraph.h"
@@ -59,6 +61,7 @@ GraphExecutionState::GraphExecutionState(
     : stateful_placements_(options.stateful_placements),
       device_set_(options.device_set),
       session_options_(options.session_options),
+      session_handle_(options.session_handle),
       flib_def_(new FunctionLibraryDefinition(OpRegistry::Global(),
                                               graph_def->library())),
       graph_(nullptr) {
@@ -198,6 +201,7 @@ Status GraphExecutionState::Extend(
   GraphExecutionStateOptions combined_options;
   combined_options.device_set = device_set_;
   combined_options.session_options = session_options_;
+  combined_options.session_handle = session_handle_;
   combined_options.stateful_placements = stateful_placements_;
 
   // NOTE(mrry): `gdef` is no longer valid after the constructor
@@ -546,10 +550,6 @@ Status GraphExecutionState::InitBaseGraph(const BuildGraphOptions& options) {
   std::unique_ptr<Graph> new_graph(new Graph(OpRegistry::Global()));
   GraphConstructorOptions opts;
   TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(opts, *graph_def, new_graph.get()));
-  for (const Node* n : new_graph->nodes()) {
-    VLOG(2) << "Mapping " << n->name() << " to " << n->cost_id();
-    node_name_to_cost_id_map_[n->name()] = n->cost_id();
-  }
   if (session_options_ &&
       session_options_->config.graph_options().place_pruned_graph()) {
     // Rewrite the graph before placement.
@@ -562,6 +562,7 @@ Status GraphExecutionState::InitBaseGraph(const BuildGraphOptions& options) {
   RestoreStatefulNodes(new_graph.get());
 
   GraphOptimizationPassOptions optimization_options;
+  optimization_options.session_handle = session_handle_;
   optimization_options.session_options = session_options_;
   optimization_options.graph = &new_graph;
   optimization_options.flib_def = flib_def_.get();
@@ -578,6 +579,11 @@ Status GraphExecutionState::InitBaseGraph(const BuildGraphOptions& options) {
   TF_RETURN_IF_ERROR(OptimizationPassRegistry::Global()->RunGrouping(
       OptimizationPassRegistry::POST_PLACEMENT, optimization_options));
 
+  for (const Node* n : new_graph->nodes()) {
+    VLOG(2) << "Mapping " << n->name() << " to " << n->cost_id();
+    node_name_to_cost_id_map_[n->name()] = n->cost_id();
+  }
+
   SaveStatefulNodes(new_graph.get());
   graph_ = new_graph.release();
   return Status::OK();
@@ -727,6 +733,7 @@ Status GraphExecutionState::OptimizeGraph(
 Status GraphExecutionState::BuildGraph(const BuildGraphOptions& options,
                                        std::unique_ptr<ClientGraph>* out) {
   VLOG(1) << "BuildGraph";
+  const uint64 start_time_usecs = Env::Default()->NowMicros();
   if (!graph_) {
     // It is only valid to call this method directly when the original graph
     // was created with the option `place_pruned_graph == false`.
@@ -815,6 +822,12 @@ Status GraphExecutionState::BuildGraph(const BuildGraphOptions& options,
     }
   }
 
+  // Make collective execution order deterministic if needed.
+  if (options.collective_order != GraphCollectiveOrder::kNone) {
+    TF_RETURN_IF_ERROR(
+        OrderCollectives(optimized_graph.get(), options.collective_order));
+  }
+
   // Copy the extracted graph in order to make its node ids dense,
   // since the local CostModel used to record its stats is sized by
   // the largest node id.
@@ -824,7 +837,7 @@ Status GraphExecutionState::BuildGraph(const BuildGraphOptions& options,
   CopyGraph(*optimized_graph, &dense_copy->graph);
 
   // TODO(vrv): We should check invariants of the graph here.
-
+  metrics::UpdateGraphBuildTime(Env::Default()->NowMicros() - start_time_usecs);
   *out = std::move(dense_copy);
   return Status::OK();
 }
diff --git a/tensorflow/core/common_runtime/graph_execution_state.h b/tensorflow/core/common_runtime/graph_execution_state.h
index 9cabe478a68a72252579755dca1e8957242344ba..56315bb1ef7947d788a7ada6ef0fa14f50e2a978 100644
--- a/tensorflow/core/common_runtime/graph_execution_state.h
+++ b/tensorflow/core/common_runtime/graph_execution_state.h
@@ -41,6 +41,8 @@ struct RewriteGraphMetadata;
 struct GraphExecutionStateOptions {
   const DeviceSet* device_set = nullptr;
   const SessionOptions* session_options = nullptr;
+  // Unique session identifier. Can be empty.
+  string session_handle;
   // A map from node name to device name, representing the unchangeable
   // placement of stateful nodes.
   std::unordered_map<string, string> stateful_placements;
@@ -192,6 +194,8 @@ class GraphExecutionState {
   GraphDef original_graph_def_;            // Immutable after ctor.
   const DeviceSet* device_set_;            // Not owned
   const SessionOptions* session_options_;  // Not owned
+  // Unique session identifier. Can be empty.
+  string session_handle_;
 
   // Map from name to Node for the full graph in placed_.
   NodeNameToCostIdMap node_name_to_cost_id_map_;
diff --git a/tensorflow/core/common_runtime/graph_optimizer.cc b/tensorflow/core/common_runtime/graph_optimizer.cc
index 37a979a8f1929ed6312dc79354a3c206f7c4c5f4..7905944fb18105e38059a892d32b9509273a7742 100644
--- a/tensorflow/core/common_runtime/graph_optimizer.cc
+++ b/tensorflow/core/common_runtime/graph_optimizer.cc
@@ -38,8 +38,7 @@ void GraphOptimizer::Optimize(
     std::unique_ptr<Graph>* graph,
     const std::unordered_map<string, std::vector<PartialTensorShape>>*
         shape_map,
-    const std::function<bool(const Node*)>& cse_consider_fn,
-    const std::function<bool(const Node*)>& cf_consider_fn) {
+    const NodePredicate& cse_consider_fn, const NodePredicate& cf_consider_fn) {
   Graph* g = graph->get();
   DumpGraph("Initial", g);
 
@@ -103,4 +102,11 @@ void GraphOptimizer::Optimize(
   DumpGraph("ReCopy", graph->get());
 }
 
+void GraphOptimizer::Optimize(FunctionLibraryRuntime* runtime, Env* env,
+                              Device* device, std::unique_ptr<Graph>* graph,
+                              const Options& options) {
+  Optimize(runtime, env, device, graph, options.shape_map,
+           options.cse_consider_fn, options.cf_consider_fn);
+}
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/common_runtime/graph_optimizer.h b/tensorflow/core/common_runtime/graph_optimizer.h
index 789cc5694219e1386bde0fb1821dfdc9928523f1..05150608f02ab52fe135b003dddbcee7783e11a7 100644
--- a/tensorflow/core/common_runtime/graph_optimizer.h
+++ b/tensorflow/core/common_runtime/graph_optimizer.h
@@ -26,6 +26,28 @@ namespace tensorflow {
 
 class GraphOptimizer {
  public:
+  using NodePredicate = std::function<bool(const Node*)>;
+
+  struct Options {
+    // If not null it maps from nodes in graph to partially-known
+    // shapes of their outputs, and may be used, e.g., in the constant folding
+    // pass. The use of shape_map implies that the mapping from node name to the
+    // vector of partial shapes of its outputs is stable, i.e., no optimization
+    // pass may replace a node with a different node of the same name that has a
+    // different number of outputs, or outputs with different known shapes.
+    // TODO(b/65453533) introduce a unique way to name nodes in a graph.
+    std::unordered_map<string, std::vector<PartialTensorShape>>* shape_map =
+        nullptr;
+
+    // If not null then only nodes for which cse_consider_fn returns true will
+    // be considered for CSE.
+    NodePredicate cse_consider_fn = nullptr;
+
+    // If not null then only nodes for which cf_consider_fn returns true will be
+    // considered for CF.
+    NodePredicate cf_consider_fn = nullptr;
+  };
+
   GraphOptimizer(const OptimizerOptions& opts);
   ~GraphOptimizer();
 
@@ -34,26 +56,17 @@ class GraphOptimizer {
   // on which the 'graph' will execute. It's passed to the optimizers
   // so that they can respect constraints if any, that should be
   // respected.
-  //
-  // If shape_map is not null it maps from nodes in graph to partially-known
-  // shapes of their outputs, and may be used, e.g., in the constant folding
-  // pass. The use of shape_map implies that the mapping from node name to the
-  // vector of partial shapes of its outputs is stable, i.e., no optimization
-  // pass may replace a node with a different node of the same name that has a
-  // different number of outputs, or outputs with different known shapes.
-  // TODO(b/65453533) introduce a unique way to name nodes in a graph.
-  //
-  // If cse_consider_fn is not null then only nodes for which cse_consider_fn
-  // returns true will be considered for CSE.
-  // If cf_consider_fn is not null then only nodes for which cf_consider_fn
-  // returns true will be considered for CF.
+  void Optimize(FunctionLibraryRuntime* runtime, Env* env, Device* device,
+                std::unique_ptr<Graph>* graph,
+                const Options& graph_optimizer_options);
+  // DEPRECATED: Consider passing a GraphOptimizer::Options object instead.
   void Optimize(
       FunctionLibraryRuntime* runtime, Env* env, Device* device,
       std::unique_ptr<Graph>* graph,
       const std::unordered_map<string, std::vector<PartialTensorShape>>*
           shape_map,
-      const std::function<bool(const Node*)>& cse_consider_fn = nullptr,
-      const std::function<bool(const Node*)>& cf_consider_fn = nullptr);
+      const NodePredicate& cse_consider_fn = nullptr,
+      const NodePredicate& cf_consider_fn = nullptr);
 
   const OptimizerOptions& options() { return opts_; }
 
diff --git a/tensorflow/core/common_runtime/hierarchical_tree_broadcaster.h b/tensorflow/core/common_runtime/hierarchical_tree_broadcaster.h
index ceb9baad30b214e5d3bec0cdbb470474d84e7227..76392b8e59e904d3bde7739f640ab92ff53aa96b 100644
--- a/tensorflow/core/common_runtime/hierarchical_tree_broadcaster.h
+++ b/tensorflow/core/common_runtime/hierarchical_tree_broadcaster.h
@@ -41,6 +41,11 @@ class HierarchicalTreeBroadcaster : public CollectiveImplementationInterface {
   // and device_locality.  Also saves the CollectiveContext in this object.
   Status InitializeCollectiveContext(CollectiveContext* col_ctx) override;
 
+  // No-op for hierarchical tree broadcaster.
+  Status InitializeInstanceBeforeGroupDiscovery(CollectiveParams*) override {
+    return Status::OK();
+  }
+
   // Begins async execution of the hierarchical tree broadcast.
   // Must be called in a blockable thread.
   // TODO(b/80529858): remove the previous warning when we have a dedicated
diff --git a/tensorflow/core/common_runtime/hierarchical_tree_broadcaster_test.cc b/tensorflow/core/common_runtime/hierarchical_tree_broadcaster_test.cc
index f0656ff53332d7dd4f21d9d874846c16fb669681..12af4a82019d5f30b417b98f66bf9bc95bd15442 100644
--- a/tensorflow/core/common_runtime/hierarchical_tree_broadcaster_test.cc
+++ b/tensorflow/core/common_runtime/hierarchical_tree_broadcaster_test.cc
@@ -616,7 +616,7 @@ class HierarchicalTreeBroadcasterTest : public ::testing::Test {
         auto* dev_info = device_->tensorflow_gpu_device_info();
         CHECK(dev_info);
         dev_info->default_context->CopyCPUTensorToDevice(
-            &cpu_tensor, device_, &tensor_, [this, &notification](Status s) {
+            &cpu_tensor, device_, &tensor_, [&notification](Status s) {
               TF_CHECK_OK(s);
               notification.Notify();
             });
diff --git a/tensorflow/core/common_runtime/local_device.cc b/tensorflow/core/common_runtime/local_device.cc
index f1fcca194e9ef56bf7b96e6c73717db7620b9812..2a6d6f5a7aefd0f191ebf0ee05164ab0985a9be6 100644
--- a/tensorflow/core/common_runtime/local_device.cc
+++ b/tensorflow/core/common_runtime/local_device.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/common_runtime/eigen_thread_pool.h"
 #include "tensorflow/core/common_runtime/process_state.h"
+#include "tensorflow/core/common_runtime/process_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_feature_guard.h"
@@ -53,15 +54,22 @@ struct LocalDevice::EigenThreadPoolInfo {
 
   explicit EigenThreadPoolInfo(const SessionOptions& options, int numa_node,
                                Allocator* allocator) {
+    // Use session setting if specified.
     int32 intra_op_parallelism_threads =
         options.config.intra_op_parallelism_threads();
+    // If no session setting, use environment setting.
     if (intra_op_parallelism_threads == 0) {
-      intra_op_parallelism_threads = port::NumSchedulableCPUs();
-      if (numa_node != port::kNUMANoAffinity) {
-        // Assume that CPUs are equally distributed over available NUMA nodes.
-        // This may not be true, but there isn't currently a better way of
-        // determining the number of CPUs specific to the requested node.
-        intra_op_parallelism_threads /= port::NUMANumNodes();
+      static int env_num_threads = NumIntraOpThreadsFromEnvironment();
+      intra_op_parallelism_threads = env_num_threads;
+      // If no session setting or environment, compute a reasonable default.
+      if (intra_op_parallelism_threads == 0) {
+        intra_op_parallelism_threads = port::NumSchedulableCPUs();
+        if (numa_node != port::kNUMANoAffinity) {
+          // Assume that CPUs are equally distributed over available NUMA nodes.
+          // This may not be true, but there isn't currently a better way of
+          // determining the number of CPUs specific to the requested node.
+          intra_op_parallelism_threads /= port::NUMANumNodes();
+        }
       }
     }
     ThreadOptions thread_opts;
diff --git a/tensorflow/core/common_runtime/lower_if_op.cc b/tensorflow/core/common_runtime/lower_if_op.cc
index 9738006f5ca9eb821439a9ad507aec3db434946c..241c403087c814717d873fc3d4d4c2c4f71e50ae 100644
--- a/tensorflow/core/common_runtime/lower_if_op.cc
+++ b/tensorflow/core/common_runtime/lower_if_op.cc
@@ -89,6 +89,7 @@ class CondBuilder {
   const FunctionLibraryDefinition& flib_;
   string name_;
 
+  NodeDebugInfo debug_info_;
   NodeBuilder then_call_builder_;
   NodeBuilder else_call_builder_;
 };
@@ -100,8 +101,11 @@ CondBuilder::CondBuilder(Node* if_op, const string& then_fn_name,
       graph_(graph),
       flib_(flib),
       name_(if_op->name()),
-      then_call_builder_(NewName("then"), then_fn_name, graph->op_registry()),
-      else_call_builder_(NewName("else"), else_fn_name, graph->op_registry()) {
+      debug_info_(*if_op_),
+      then_call_builder_(NewName("then"), then_fn_name, graph->op_registry(),
+                         &debug_info_),
+      else_call_builder_(NewName("else"), else_fn_name, graph->op_registry(),
+                         &debug_info_) {
   TF_CHECK_OK(if_op_->input_tensor(0, &pred_));
   then_call_builder_.Device(if_op_->requested_device());
   else_call_builder_.Device(if_op_->requested_device());
@@ -111,23 +115,23 @@ Status CondBuilder::CreatePivotNodes() {
   // Construct the basic cond body (consisting of feeding in the predicate to
   // create pivot nodes).
   Node* switch_pred;
-  TF_RETURN_IF_ERROR(
-      NodeBuilder(NewName("switch_pred"), "Switch", graph_->op_registry())
-          .Input(NodeOut(pred_))
-          .Input(NodeOut(pred_))
-          .Device(if_op_->requested_device())
-          .Finalize(graph_, &switch_pred));
+  TF_RETURN_IF_ERROR(NodeBuilder(NewName("switch_pred"), "Switch",
+                                 graph_->op_registry(), &debug_info_)
+                         .Input(NodeOut(pred_))
+                         .Input(NodeOut(pred_))
+                         .Device(if_op_->requested_device())
+                         .Finalize(graph_, &switch_pred));
   control_predecessor_ = switch_pred;
-  TF_RETURN_IF_ERROR(
-      NodeBuilder(NewName("pivot_f"), "Identity", graph_->op_registry())
-          .Input(switch_pred, kElseBranch)
-          .Device(if_op_->requested_device())
-          .Finalize(graph_, &pivot_f_));
-  TF_RETURN_IF_ERROR(
-      NodeBuilder(NewName("pivot_t"), "Identity", graph_->op_registry())
-          .Input(switch_pred, kThenBranch)
-          .Device(if_op_->requested_device())
-          .Finalize(graph_, &pivot_t_));
+  TF_RETURN_IF_ERROR(NodeBuilder(NewName("pivot_f"), "Identity",
+                                 graph_->op_registry(), &debug_info_)
+                         .Input(switch_pred, kElseBranch)
+                         .Device(if_op_->requested_device())
+                         .Finalize(graph_, &pivot_f_));
+  TF_RETURN_IF_ERROR(NodeBuilder(NewName("pivot_t"), "Identity",
+                                 graph_->op_registry(), &debug_info_)
+                         .Input(switch_pred, kThenBranch)
+                         .Device(if_op_->requested_device())
+                         .Finalize(graph_, &pivot_t_));
   return Status::OK();
 }
 
@@ -137,12 +141,13 @@ string CondBuilder::NewName(const string& infix) {
 
 Status CondBuilder::AddInput(Node* src, int src_output) {
   Node* input;
-  TF_RETURN_IF_ERROR(
-      NodeBuilder(NewName(src->name()), "Switch", graph_->op_registry())
-          .Input(src, src_output)
-          .Input(pred_)
-          .Device(if_op_->requested_device())
-          .Finalize(graph_, &input));
+  NodeDebugInfo debug_info(*src);
+  TF_RETURN_IF_ERROR(NodeBuilder(NewName(src->name()), "Switch",
+                                 graph_->op_registry(), &debug_info)
+                         .Input(src, src_output)
+                         .Input(pred_)
+                         .Device(if_op_->requested_device())
+                         .Finalize(graph_, &input));
   then_call_builder_.Input(input, kThenBranch);
   else_call_builder_.Input(input, kElseBranch);
   return Status::OK();
@@ -178,7 +183,8 @@ Status CondBuilder::AddOutputs() {
   outputs_.resize(merges.size());
   for (int i = 0; i < then_call_node_->num_outputs(); ++i) {
     TF_RETURN_IF_ERROR(
-        NodeBuilder(graph_->NewName("merge"), "Merge", graph_->op_registry())
+        NodeBuilder(graph_->NewName("merge"), "Merge", graph_->op_registry(),
+                    &debug_info_)
             .Input({NodeOut(then_call_node_, i), NodeOut(else_call_node_, i)})
             .Device(if_op_->requested_device())
             .Finalize(graph_, &merges[i]));
diff --git a/tensorflow/core/common_runtime/lower_while_op.cc b/tensorflow/core/common_runtime/lower_while_op.cc
index 6f9921a7968b9cad4bc96b21600fdb026636bc2a..f1c3bbd552812bf7c6b0e3866cdf2c4ebd452f6b 100644
--- a/tensorflow/core/common_runtime/lower_while_op.cc
+++ b/tensorflow/core/common_runtime/lower_while_op.cc
@@ -53,9 +53,10 @@ using NodeOut = NodeBuilder::NodeOut;
 class LowerWhileHelper {
  public:
   static Status Run(Node* while_op, const string& cond_fn_name,
-                    const string& body_fn_name, Graph* graph,
-                    const FunctionLibraryDefinition& flib) {
-    LowerWhileHelper helper(while_op, cond_fn_name, body_fn_name, graph, flib);
+                    const string& body_fn_name, int parallel_iterations,
+                    Graph* graph, const FunctionLibraryDefinition& flib) {
+    LowerWhileHelper helper(while_op, cond_fn_name, body_fn_name,
+                            parallel_iterations, graph, flib);
     return helper.RunInternal();
   }
 
@@ -64,8 +65,8 @@ class LowerWhileHelper {
   // and body functions named `cond_fn_name` and `body_fn_name` respectively in
   // the given graph.
   LowerWhileHelper(Node* while_op, const string& cond_fn_name,
-                   const string& body_fn_name, Graph* graph,
-                   const FunctionLibraryDefinition& flib);
+                   const string& body_fn_name, int parallel_iterations,
+                   Graph* graph, const FunctionLibraryDefinition& flib);
 
   Status RunInternal();
 
@@ -132,7 +133,10 @@ class LowerWhileHelper {
   const FunctionLibraryDefinition& flib_;
   // Name of the `while_op_`.
   string name_;
+  // Max number of parallel_iterations for the while loop.
+  const int parallel_iterations_;
 
+  NodeDebugInfo debug_info_;
   NodeBuilder cond_call_builder_;
   NodeBuilder body_call_builder_;
 
@@ -146,14 +150,19 @@ class LowerWhileHelper {
 };
 
 LowerWhileHelper::LowerWhileHelper(Node* while_op, const string& cond_fn_name,
-                                   const string& body_fn_name, Graph* graph,
+                                   const string& body_fn_name,
+                                   int parallel_iterations, Graph* graph,
                                    const FunctionLibraryDefinition& flib)
     : while_op_(while_op),
       graph_(graph),
       flib_(flib),
       name_(while_op->name()),
-      cond_call_builder_(NewName("cond"), cond_fn_name, graph->op_registry()),
-      body_call_builder_(NewName("body"), body_fn_name, graph->op_registry()),
+      parallel_iterations_(parallel_iterations),
+      debug_info_(*while_op_),
+      cond_call_builder_(NewName("cond"), cond_fn_name, graph->op_registry(),
+                         &debug_info_),
+      body_call_builder_(NewName("body"), body_fn_name, graph->op_registry(),
+                         &debug_info_),
       num_loop_inputs_(while_op_->num_inputs()) {
   // We intentionally `resize` instead of `reserve` space in `enter_nodes_`
   // because we need to set it's elements out of order in `CreateEnterNodes`.
@@ -186,11 +195,13 @@ Status LowerWhileHelper::CreateEnterNodes() {
   TF_RETURN_IF_ERROR(while_op_->input_edges(&edges));
   for (const Edge* edge : edges) {
     Node* enter_node;
-    TF_RETURN_IF_ERROR(
-        NodeBuilder(NewName("enter"), "Enter", graph_->op_registry())
-            .Input(NodeOut(edge->src(), edge->src_output()))
-            .Attr("frame_name", name_)
-            .Finalize(graph_, &enter_node));
+    TF_RETURN_IF_ERROR(NodeBuilder(NewName("enter"), "Enter",
+                                   graph_->op_registry(), &debug_info_)
+                           .Input(NodeOut(edge->src(), edge->src_output()))
+                           .Attr("frame_name", name_)
+                           .Attr("parallel_iterations", parallel_iterations_)
+                           .Device(while_op_->requested_device())
+                           .Finalize(graph_, &enter_node));
     enter_nodes_[edge->dst_input()] = enter_node;
   }
   // Create a NoOp node that takes incoming control inputs of the original While
@@ -203,10 +214,11 @@ Status LowerWhileHelper::CreateEnterNodes() {
   }
   if (!control_inputs.empty()) {
     Node* incoming_control_node;
-    TF_RETURN_IF_ERROR(
-        NodeBuilder(NewName("LoopControlInputs"), "NoOp", graph_->op_registry())
-            .ControlInputs(control_inputs)
-            .Finalize(graph_, &incoming_control_node));
+    TF_RETURN_IF_ERROR(NodeBuilder(NewName("LoopControlInputs"), "NoOp",
+                                   graph_->op_registry(), &debug_info_)
+                           .ControlInputs(control_inputs)
+                           .Device(while_op_->requested_device())
+                           .Finalize(graph_, &incoming_control_node));
     for (Node* n : enter_nodes_) {
       graph_->AddControlEdge(incoming_control_node, n);
     }
@@ -218,8 +230,10 @@ Status LowerWhileHelper::CreateMergeNodes() {
   for (Node* enter_node : enter_nodes_) {
     Node* merge_node;
     TF_RETURN_IF_ERROR(
-        NodeBuilder(NewName("merge"), "Merge", graph_->op_registry())
+        NodeBuilder(NewName("merge"), "Merge", graph_->op_registry(),
+                    &debug_info_)
             .Input({NodeOut(enter_node, 0), NodeOut(enter_node, 0)})
+            .Device(while_op_->requested_device())
             .Finalize(graph_, &merge_node));
     merge_nodes_.emplace_back(merge_node);
   }
@@ -230,15 +244,17 @@ Status LowerWhileHelper::CreateCondFuncCallNode() {
   for (Node* merge_node : merge_nodes_) {
     cond_call_builder_.Input(NodeOut(merge_node, 0));
   }
+  cond_call_builder_.Device(while_op_->requested_device());
   TF_RETURN_IF_ERROR(cond_call_builder_.Finalize(graph_, &cond_call_node_));
   // Add a control edge to make sure the Const nodes in the cond function
   // are in the same frame as the rest of the function, otherwise
   // `BuildControlFlowInfo` throws an error.
   graph_->AddControlEdge(merge_nodes_[0], cond_call_node_);
-  TF_RETURN_IF_ERROR(
-      NodeBuilder(NewName("LoopCond"), "LoopCond", graph_->op_registry())
-          .Input(NodeOut(cond_call_node_, 0))
-          .Finalize(graph_, &loop_cond_node_));
+  TF_RETURN_IF_ERROR(NodeBuilder(NewName("LoopCond"), "LoopCond",
+                                 graph_->op_registry(), &debug_info_)
+                         .Input(NodeOut(cond_call_node_, 0))
+                         .Device(while_op_->requested_device())
+                         .Finalize(graph_, &loop_cond_node_));
   return Status::OK();
 }
 
@@ -255,11 +271,12 @@ Status LowerWhileHelper::CreateSwitchNodes() {
     if (IsRefType(merge_nodes_[i]->output_type(0))) {
       op_type = "RefSwitch";
     }
-    TF_RETURN_IF_ERROR(
-        NodeBuilder(NewName(op_name), op_type, graph_->op_registry())
-            .Input(NodeOut(merge_nodes_[i], 0))
-            .Input(NodeOut(loop_cond_node_, 0))
-            .Finalize(graph_, &switch_node));
+    TF_RETURN_IF_ERROR(NodeBuilder(NewName(op_name), op_type,
+                                   graph_->op_registry(), &debug_info_)
+                           .Input(NodeOut(merge_nodes_[i], 0))
+                           .Input(NodeOut(loop_cond_node_, 0))
+                           .Device(while_op_->requested_device())
+                           .Finalize(graph_, &switch_node));
     switch_nodes_.emplace_back(switch_node);
   }
   return Status::OK();
@@ -269,6 +286,7 @@ Status LowerWhileHelper::CreateBodyFuncCallNode() {
   for (Node* switch_node : switch_nodes_) {
     body_call_builder_.Input(NodeOut(switch_node, 1));
   }
+  body_call_builder_.Device(while_op_->requested_device());
   TF_RETURN_IF_ERROR(body_call_builder_.Finalize(graph_, &body_call_node_));
   // Add a control edge to make sure the Const nodes in the body function
   // are in the same frame as the rest of the function, otherwise
@@ -282,10 +300,11 @@ Status LowerWhileHelper::CreateBodyFuncCallNode() {
   if (IsRefType(switch_nodes_[0]->output_type(1))) {
     op_type = "RefIdentity";
   }
-  TF_RETURN_IF_ERROR(
-      NodeBuilder(NewName("loop_body_control"), op_type, graph_->op_registry())
-          .Input(NodeOut(switch_nodes_[0], 1))
-          .Finalize(graph_, &body_control_node_));
+  TF_RETURN_IF_ERROR(NodeBuilder(NewName("loop_body_control"), op_type,
+                                 graph_->op_registry(), &debug_info_)
+                         .Input(NodeOut(switch_nodes_[0], 1))
+                         .Device(while_op_->requested_device())
+                         .Finalize(graph_, &body_control_node_));
   graph_->AddControlEdge(body_control_node_, body_call_node_);
   return Status::OK();
 }
@@ -295,10 +314,11 @@ Status LowerWhileHelper::CreateExitNodes() {
   outputs.reserve(num_loop_inputs_);
   for (Node* switch_node : switch_nodes_) {
     Node* exit_node;
-    TF_RETURN_IF_ERROR(
-        NodeBuilder(NewName("exit"), "Exit", graph_->op_registry())
-            .Input(NodeOut(switch_node, 0))
-            .Finalize(graph_, &exit_node));
+    TF_RETURN_IF_ERROR(NodeBuilder(NewName("exit"), "Exit",
+                                   graph_->op_registry(), &debug_info_)
+                           .Input(NodeOut(switch_node, 0))
+                           .Device(while_op_->requested_device())
+                           .Finalize(graph_, &exit_node));
     exit_nodes_.emplace_back(exit_node);
     outputs.emplace_back(NodeOut(exit_node, 0));
   }
@@ -307,8 +327,9 @@ Status LowerWhileHelper::CreateExitNodes() {
   // original functional While op. This is used for
   // 1. Rewiring the control edges with the original while op as src.
   // 2. Fetching the output of the While node by name in calls to sess.run.
-  NodeBuilder ib(name_, "IdentityN");
+  NodeBuilder ib(name_, "IdentityN", OpRegistry::Global(), &debug_info_);
   ib.Input(outputs);
+  ib.Device(while_op_->requested_device());
   TF_RETURN_IF_ERROR(ib.Finalize(graph_, &lowered_while_output_));
   return Status::OK();
 }
@@ -317,8 +338,9 @@ Status LowerWhileHelper::CreateNextIterationNodes() {
   for (int i = 0; i < num_loop_inputs_; i++) {
     Node* next_iteration;
     TF_RETURN_IF_ERROR(NodeBuilder(NewName("next_iteration"), "NextIteration",
-                                   graph_->op_registry())
+                                   graph_->op_registry(), &debug_info_)
                            .Input(NodeOut(body_call_node_, i))
+                           .Device(while_op_->requested_device())
                            .Finalize(graph_, &next_iteration));
     next_iterations_nodes_.emplace_back(next_iteration);
   }
@@ -387,9 +409,15 @@ Status RewriteWhileNode(Node* n, Graph* g,
   if (body_attr == nullptr) {
     return errors::InvalidArgument("While body function missing");
   }
+  const AttrValue* parallel_iterations_attr =
+      n->attrs().Find("parallel_iterations");
+  if (parallel_iterations_attr == nullptr) {
+    return errors::InvalidArgument("parallel_iterations attr missing");
+  }
 
-  TF_RETURN_IF_ERROR(LowerWhileHelper::Run(n, cond_attr->func().name(),
-                                           body_attr->func().name(), g, flib));
+  TF_RETURN_IF_ERROR(LowerWhileHelper::Run(
+      n, cond_attr->func().name(), body_attr->func().name(),
+      parallel_iterations_attr->i(), g, flib));
   g->RemoveNode(n);
 
   return Status::OK();
diff --git a/tensorflow/core/common_runtime/lower_while_op_test.cc b/tensorflow/core/common_runtime/lower_while_op_test.cc
index 24fd4ed5bb5939e066fa5b8d75b9b9c3aaf5895a..fcb10bc75dbe574efee9c4c28ab00dcb55c194d3 100644
--- a/tensorflow/core/common_runtime/lower_while_op_test.cc
+++ b/tensorflow/core/common_runtime/lower_while_op_test.cc
@@ -66,6 +66,7 @@ TEST(LowerWhileOpTest, Simple) {
                    .Attr("T", {DT_INT32})
                    .Attr("cond", cond_func)
                    .Attr("body", body_func)
+                   .Attr("parallel_iterations", 100)
                    .Attr(LowerIfWhilePass::kLowerUsingSwitchMergeAttr, true)
                    .Finalize(root.graph(), &while_node));
   TF_ASSERT_OK(root.DoShapeInference(while_node));
@@ -97,6 +98,7 @@ TEST(LowerWhileOpTest, Simple) {
   for (const auto* op : graph->op_nodes()) {
     if (op->IsEnter()) {
       ++enter_count;
+      ASSERT_EQ(op->attrs().Find("parallel_iterations")->i(), 100);
     }
     if (op->IsExit()) {
       ++exit_count;
diff --git a/tensorflow/core/common_runtime/metrics.cc b/tensorflow/core/common_runtime/metrics.cc
index f4c94ed7ec0cb1c5e8b341b75f1d075d30d6125a..fcdab26d3dbc4dee7a825d581543f7619860f225 100644
--- a/tensorflow/core/common_runtime/metrics.cc
+++ b/tensorflow/core/common_runtime/metrics.cc
@@ -17,7 +17,7 @@ limitations under the License.
 #include "tensorflow/core/lib/monitoring/counter.h"
 
 namespace tensorflow {
-
+namespace metrics {
 namespace {
 
 auto* graph_runs = monitoring::Counter<0>::New(
@@ -28,8 +28,55 @@ auto* graph_runs = monitoring::Counter<0>::New(
 auto* graph_run_time_usecs = monitoring::Counter<0>::New(
     "/tensorflow/core/graph_run_time_usecs",
     "The total time spent on executing graphs in microseconds.");
+
+auto* tf_data_autotune_counter = monitoring::Counter<1>::New(
+    "/tensorflow/data/autotune", "tf.data autotuning", "name");
+
+auto* tf_data_bytes_read_counter = monitoring::Counter<1>::New(
+    "/tensorflow/data/bytes_read",
+    "The number of bytes read by tf.data Dataset sources.", "name");
+
+auto* tf_data_elements_counter = monitoring::Counter<1>::New(
+    "/tensorflow/data/elements", "tf.data elements", "name");
+
+auto* tf_data_optimization_counter = monitoring::Counter<1>::New(
+    "/tensorflow/data/optimization", "tf.data optimization", "name");
+
+auto* build_graph_calls = monitoring::Counter<0>::New(
+    "/tensorflow/core/graph_build_calls",
+    "The number of times TensorFlow has created a new client graph. "
+    "A client graph is a sub-graph of the full graph, induced by a set of "
+    "options, including the requested feeds and fetches. It includes time "
+    "spent optimizing the graph with Grappler, and time spent pruning the "
+    "sub-graph.");
+
+auto* build_graph_time_usecs = monitoring::Counter<0>::New(
+    "/tensorflow/core/graph_build_time_usecs",
+    "The amount of time TensorFlow has spent creating new client graphs in "
+    "microseconds. "
+    "A client graph is a sub-graph of the full graph, induced by a set of "
+    "options, including the requested feeds and fetches. It includes time "
+    "spent optimizing the graph with Grappler, and time spent pruning the "
+    "sub-graph.");
+
 }  // namespace
 
+void RecordTFDataAutotune(const string& name) {
+  tf_data_autotune_counter->GetCell(name)->IncrementBy(1);
+}
+
+void RecordTFDataBytesRead(const string& name, int64 num_bytes) {
+  tf_data_bytes_read_counter->GetCell(name)->IncrementBy(num_bytes);
+}
+
+void RecordTFDataElements(const string& name, int64 num_elements) {
+  tf_data_elements_counter->GetCell(name)->IncrementBy(num_elements);
+}
+
+void RecordTFDataOptimization(const string& name, int64 num_changes) {
+  tf_data_optimization_counter->GetCell(name)->IncrementBy(num_changes);
+}
+
 void UpdateGraphExecTime(const uint64 running_time_usecs) {
   if (running_time_usecs > 0) {
     graph_runs->GetCell()->IncrementBy(1);
@@ -37,4 +84,12 @@ void UpdateGraphExecTime(const uint64 running_time_usecs) {
   }
 }
 
+void UpdateGraphBuildTime(const uint64 running_time_usecs) {
+  if (running_time_usecs > 0) {
+    build_graph_calls->GetCell()->IncrementBy(1);
+    build_graph_time_usecs->GetCell()->IncrementBy(running_time_usecs);
+  }
+}
+
+}  // namespace metrics
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/metrics.h b/tensorflow/core/common_runtime/metrics.h
index d3430c9f030998f118c1626e6bbed93dd316a525..bc73da4acc37008bbd50f3162f1f0d7bd3f4d865 100644
--- a/tensorflow/core/common_runtime/metrics.h
+++ b/tensorflow/core/common_runtime/metrics.h
@@ -19,9 +19,48 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
+namespace metrics {
+
+// Records that a tf.data.Dataset executed by the program used autotuning.
+//
+// The `name` argument identifies the Dataset type (e.g. "ParallelMap").
+void RecordTFDataAutotune(const string& name);
+
+// Records the number of bytes read from the filesystem by a tf.data.Dataset
+// source.
+//
+// The `name` argument identifies the Dataset type (e.g. "TFRecordDataset").
+void RecordTFDataBytesRead(const string& name, int64 num_bytes);
+
+// Records the number of elements produced by a tf.data.Dataset.
+//
+// The `name` argument identifies the Dataset type (e.g. "Batch" or "Map").
+void RecordTFDataElements(const string& name, int64 num_elements);
+
+// Records the number of independent graph changes resulting from the applicaton
+// of a tf.data optimization.
+//
+// The `name` argument identifies the optimization (e.g. "noop_eliminiation").
+void RecordTFDataOptimization(const string& name, int64 num_changes);
 
 void UpdateGraphExecTime(const uint64 running_time_usecs);
 
+// Updates the metrics stored about time spent building graphs.
+//
+// By "GraphBuild", we refer to building a client graph, which is a sub-graph of
+// the full graph, induced by a set of options. In particular, these options
+// include the feeds and fetches requested.
+//
+// This includes time spent:
+//   * optimizing the graphs with Grappler
+//   * pruning the sub-graph (unless the place_pruned_graph option is set)
+//
+// When executing eagerly, this will not record any activity.
+//
+// TODO(jtkeeling): Should we record building/optimizing tf.functions?
+void UpdateGraphBuildTime(const uint64 running_time_usecs);
+
+}  // namespace metrics
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_COMMON_RUNTIME_METRICS_H_
diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.h b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
index 429b19599b63740370ae49d7dbe9edcdf1e2c0ce..b467e7b311e3fe73d2eb094e5d92f124a8266a0b 100644
--- a/tensorflow/core/common_runtime/mkl_cpu_allocator.h
+++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
@@ -39,6 +39,8 @@ typedef unsigned int uint;
 
 namespace tensorflow {
 
+static bool mkl_small_allocator_collect_stats = false;
+
 class MklSubAllocator : public BasicCPUAllocator {
  public:
   MklSubAllocator() : BasicCPUAllocator(port::kNUMANoAffinity, {}, {}) {}
@@ -62,15 +64,8 @@ class MklSmallSizeAllocator : public Allocator {
   inline string Name() override { return name_; }
 
   void* AllocateRaw(size_t alignment, size_t num_bytes) override {
-    void* ptr = sub_allocator_->Alloc(alignment, num_bytes);
-    if (ptr != nullptr) {
-      std::pair<void*, size_t> map_val(ptr, num_bytes);
-      mutex_lock l(mutex_);
-      // Check that insertion in the hash map was successful.
-      CHECK(map_.insert(map_val).second);
-      // Increment statistics for small-size allocations.
-      IncrementStats(num_bytes);
-    }
+    void* ptr = port::AlignedMalloc(num_bytes, alignment);
+    if (mkl_small_allocator_collect_stats) IncrementStats(num_bytes);
     return ptr;
   }
 
@@ -80,50 +75,42 @@ class MklSmallSizeAllocator : public Allocator {
       return;
     }
 
-    mutex_lock l(mutex_);
-    auto map_iter = map_.find(ptr);
-    if (map_iter != map_.end()) {
-      // Call free visitors.
-      size_t dealloc_bytes = map_iter->second;
-      sub_allocator_->Free(ptr, dealloc_bytes);
-      DecrementStats(dealloc_bytes);
-      map_.erase(map_iter);
-    } else {
-      LOG(ERROR) << "tried to deallocate invalid pointer";
-      return;
+    if (mkl_small_allocator_collect_stats) {
+      const size_t alloc_size = port::MallocExtension_GetAllocatedSize(ptr);
+      DecrementStats(alloc_size);
     }
+    port::AlignedFree(ptr);
   }
 
-  inline bool IsSmallSizeAllocation(const void* ptr) const {
+  absl::optional<AllocatorStats> GetStats() override {
     mutex_lock l(mutex_);
-    return map_.find(ptr) != map_.end();
-  }
-
-  void GetStats(AllocatorStats* stats) override {
-    mutex_lock l(mutex_);
-    *stats = stats_;
+    return stats_;
   }
 
   void ClearStats() override {
     mutex_lock l(mutex_);
-    stats_.Clear();
+    stats_.num_allocs = 0;
+    stats_.peak_bytes_in_use = 0;
+    stats_.largest_alloc_size = 0;
+    stats_.bytes_in_use = 0;
+    stats_.bytes_limit = 0;
   }
 
  private:
   // Increment statistics for the allocator handling small allocations.
-  inline void IncrementStats(size_t alloc_size)
-      EXCLUSIVE_LOCKS_REQUIRED(mutex_) {
+  inline void IncrementStats(size_t alloc_size) LOCKS_EXCLUDED(mutex_) {
+    mutex_lock l(mutex_);
     ++stats_.num_allocs;
     stats_.bytes_in_use += alloc_size;
-    stats_.max_bytes_in_use =
-        std::max(stats_.max_bytes_in_use, stats_.bytes_in_use);
-    stats_.max_alloc_size =
-        std::max(alloc_size, static_cast<size_t>(stats_.max_alloc_size));
+    stats_.peak_bytes_in_use =
+        std::max(stats_.peak_bytes_in_use, stats_.bytes_in_use);
+    stats_.largest_alloc_size =
+        std::max(alloc_size, static_cast<size_t>(stats_.largest_alloc_size));
   }
 
   // Decrement statistics for the allocator handling small allocations.
-  inline void DecrementStats(size_t dealloc_size)
-      EXCLUSIVE_LOCKS_REQUIRED(mutex_) {
+  inline void DecrementStats(size_t dealloc_size) LOCKS_EXCLUDED(mutex_) {
+    mutex_lock l(mutex_);
     stats_.bytes_in_use -= dealloc_size;
   }
 
@@ -135,10 +122,6 @@ class MklSmallSizeAllocator : public Allocator {
   // Allocator name
   string name_;
 
-  // Hash map to keep track of "small" allocations
-  // We do not use BFC allocator for small allocations.
-  std::unordered_map<const void*, size_t> map_ GUARDED_BY(mutex_);
-
   // Allocator stats for small allocs
   AllocatorStats stats_ GUARDED_BY(mutex_);
 };
@@ -215,43 +198,72 @@ class MklCPUAllocator : public Allocator {
   }
 
   inline string Name() override { return kName; }
+  inline bool IsSmallSizeAllocation(const void* ptr) const
+      LOCKS_EXCLUDED(mutex_) {
+    mutex_lock l(mutex_);
+    return large_allocations_map_.find(ptr) == large_allocations_map_.end();
+  }
+  // AddLargeAllocMap and RemoveLargeAllocMap are always called with a lock held
+  inline void AddLargeAllocMap(void* ptr, size_t num_bytes)
+      EXCLUSIVE_LOCKS_REQUIRED(mutex_) {
+    if (ptr != nullptr) {
+      std::pair<void*, size_t> map_val(ptr, num_bytes);
+      large_allocations_map_.insert(map_val);
+    }
+  }
+  inline void RemoveLargeAllocMap(void* ptr) EXCLUSIVE_LOCKS_REQUIRED(mutex_) {
+    auto map_iter = large_allocations_map_.find(ptr);
+    if (map_iter != large_allocations_map_.end()) {
+      large_allocations_map_.erase(map_iter);
+    } else {
+      LOG(ERROR) << "tried to deallocate invalid pointer";
+    }
+    return;
+  }
 
   inline void* AllocateRaw(size_t alignment, size_t num_bytes) override {
     // If the allocation size is less than threshold, call small allocator,
     // otherwise call large-size allocator (BFC). We found that BFC allocator
     // does not deliver good performance for small allocations when
     // inter_op_parallelism_threads is high.
-    return (num_bytes < kSmallAllocationsThreshold)
-               ? small_size_allocator_->AllocateRaw(alignment, num_bytes)
-               : large_size_allocator_->AllocateRaw(alignment, num_bytes);
+    if (num_bytes < kSmallAllocationsThreshold) {
+      return small_size_allocator_->AllocateRaw(alignment, num_bytes);
+    } else {
+      mutex_lock l(mutex_);
+      void* ptr = large_size_allocator_->AllocateRaw(alignment, num_bytes);
+      AddLargeAllocMap(ptr, num_bytes);
+      return ptr;
+    }
   }
 
   inline void DeallocateRaw(void* ptr) override {
     // Check if ptr is for "small" allocation. If it is, then call Free
     // directly. Otherwise, call BFC to handle free.
-    if (small_size_allocator_->IsSmallSizeAllocation(ptr)) {
+    if (IsSmallSizeAllocation(ptr)) {
       small_size_allocator_->DeallocateRaw(ptr);
     } else {
+      mutex_lock l(mutex_);
+      RemoveLargeAllocMap(ptr);
       large_size_allocator_->DeallocateRaw(ptr);
     }
   }
 
-  void GetStats(AllocatorStats* stats) override {
-    AllocatorStats l_stats, s_stats;
-    small_size_allocator_->GetStats(&s_stats);
-    large_size_allocator_->GetStats(&l_stats);
+  absl::optional<AllocatorStats> GetStats() override {
+    auto s_stats = small_size_allocator_->GetStats();
+    auto l_stats = large_size_allocator_->GetStats();
 
     // Combine statistics from small-size and large-size allocator.
-    stats->num_allocs = l_stats.num_allocs + s_stats.num_allocs;
-    stats->bytes_in_use = l_stats.bytes_in_use + s_stats.bytes_in_use;
-    stats->max_bytes_in_use =
-        l_stats.max_bytes_in_use + s_stats.max_bytes_in_use;
+    stats_.num_allocs = l_stats->num_allocs + s_stats->num_allocs;
+    stats_.bytes_in_use = l_stats->bytes_in_use + s_stats->bytes_in_use;
+    stats_.peak_bytes_in_use =
+        l_stats->peak_bytes_in_use + s_stats->peak_bytes_in_use;
 
     // Since small-size allocations go to MklSmallSizeAllocator,
     // max_alloc_size from large_size_allocator would be the maximum
     // size allocated by MklCPUAllocator.
-    stats->max_alloc_size = l_stats.max_alloc_size;
-    stats->bytes_limit = std::max(s_stats.bytes_limit, l_stats.bytes_limit);
+    stats_.largest_alloc_size = l_stats->largest_alloc_size;
+    stats_.bytes_limit = std::max(s_stats->bytes_limit, l_stats->bytes_limit);
+    return stats_;
   }
 
   void ClearStats() override {
@@ -299,6 +311,13 @@ class MklCPUAllocator : public Allocator {
   MklSmallSizeAllocator* small_size_allocator_;  // owned by this class.
 
   SubAllocator* sub_allocator_;  // not owned by this class
+  mutable mutex mutex_;
+  AllocatorStats stats_ GUARDED_BY(mutex_);
+
+  // Hash map to keep track of "BFC" allocations
+  // We do not use BFC allocator for small allocations.
+  std::unordered_map<const void*, size_t> large_allocations_map_
+      GUARDED_BY(mutex_);
 
   // Size in bytes that defines the upper-bound for "small" allocations.
   // Any allocation below this threshold is "small" allocation.
diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator_test.cc b/tensorflow/core/common_runtime/mkl_cpu_allocator_test.cc
index e08ab5763856956b435b7eb0451d8316af2d9337..ee1d9cd281bb5514074dd71ba2bdc2379c1ebfc1 100644
--- a/tensorflow/core/common_runtime/mkl_cpu_allocator_test.cc
+++ b/tensorflow/core/common_runtime/mkl_cpu_allocator_test.cc
@@ -24,22 +24,21 @@ limitations under the License.
 namespace tensorflow {
 
 TEST(MKLBFCAllocatorTest, TestMaxLimit) {
-  AllocatorStats stats;
   setenv(MklCPUAllocator::kMaxLimitStr, "1000", 1);
   MklCPUAllocator a;
   TF_EXPECT_OK(a.Initialize());
-  a.GetStats(&stats);
-  EXPECT_EQ(stats.bytes_limit, 1000);
+  auto stats = a.GetStats();
+  EXPECT_EQ(stats->bytes_limit, 1000);
 
   unsetenv(MklCPUAllocator::kMaxLimitStr);
   TF_EXPECT_OK(a.Initialize());
-  a.GetStats(&stats);
+  stats = a.GetStats();
   uint64 max_mem_bytes = MklCPUAllocator::kDefaultMaxLimit;
 #if defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE)
   max_mem_bytes =
       (uint64)sysconf(_SC_PHYS_PAGES) * (uint64)sysconf(_SC_PAGESIZE);
 #endif
-  EXPECT_EQ(stats.bytes_limit, max_mem_bytes);
+  EXPECT_EQ(stats->bytes_limit, max_mem_bytes);
 
   setenv(MklCPUAllocator::kMaxLimitStr, "wrong-input", 1);
   EXPECT_TRUE(errors::IsInvalidArgument(a.Initialize()));
diff --git a/tensorflow/core/common_runtime/optimization_registry.cc b/tensorflow/core/common_runtime/optimization_registry.cc
index 9be540b0192416b6dfa636b054bd174bb8376eec..e7db3aed27e9dfeb6e2c051c482bb64f4f74f415 100644
--- a/tensorflow/core/common_runtime/optimization_registry.cc
+++ b/tensorflow/core/common_runtime/optimization_registry.cc
@@ -41,14 +41,20 @@ Status OptimizationPassRegistry::RunGrouping(
         Status s = pass->Run(options);
         if (!s.ok()) return s;
         if (VLOG_IS_ON(1)) {
-          DumpGraphToFile(
-              strings::StrCat("after_phase_", phase.first, "_", pass->name()),
-              **options.graph);
+          if (options.graph) {
+            DumpGraphToFile(
+                strings::StrCat(
+                    "after_phase_", phase.first, "_", pass->name(), "_",
+                    reinterpret_cast<uintptr_t>((*options.graph).get())),
+                **options.graph);
+          }
           if (options.partition_graphs) {
             for (auto& part : *options.partition_graphs) {
               DumpGraphToFile(
-                  strings::StrCat("after_phase_", phase.first, "_",
-                                  pass->name(), "_partition_", part.first),
+                  strings::StrCat(
+                      "after_phase_", phase.first, "_", pass->name(),
+                      "_partition_", part.first, "_",
+                      reinterpret_cast<uintptr_t>(part.second.get())),
                   *part.second);
             }
           }
diff --git a/tensorflow/core/common_runtime/optimization_registry.h b/tensorflow/core/common_runtime/optimization_registry.h
index 6fcd2afd2752007996d16358d5118211357fe6c6..0e31f389aa71a5734b1f11b95a056c0d07aabeb9 100644
--- a/tensorflow/core/common_runtime/optimization_registry.h
+++ b/tensorflow/core/common_runtime/optimization_registry.h
@@ -35,6 +35,7 @@ struct SessionOptions;
 // as a key into a state dictionary if it wants to keep state across
 // calls.
 struct GraphOptimizationPassOptions {
+  // Filled in by DirectSession for PRE_PLACEMENT optimizations. Can be empty.
   string session_handle;
   const SessionOptions* session_options = nullptr;
   const CostModel* cost_model = nullptr;
@@ -94,6 +95,10 @@ class OptimizationPassRegistry {
   void Register(Grouping grouping, int phase,
                 std::unique_ptr<GraphOptimizationPass> pass);
 
+  const std::map<Grouping, GraphOptimizationPasses>& groups() {
+    return groups_;
+  }
+
   // Run all passes in grouping, ordered by phase, with the same
   // options.
   Status RunGrouping(Grouping grouping,
diff --git a/tensorflow/core/common_runtime/parallel_concat_optimizer.cc b/tensorflow/core/common_runtime/parallel_concat_optimizer.cc
index 6af4ca4d961d96a46be67e3770434e380658f32a..ecb2670a74b9387f46ed21eb5bc40c87136b3254 100644
--- a/tensorflow/core/common_runtime/parallel_concat_optimizer.cc
+++ b/tensorflow/core/common_runtime/parallel_concat_optimizer.cc
@@ -52,7 +52,8 @@ class ParallelConcatRemovePass : public GraphOptimizationPass {
       AttrSlice n_attrs = n->attrs();
       auto base_make_node = [n, &n_attrs](const string& op,
                                           const string& name) {
-        NodeBuilder node_builder(name, op);
+        NodeDebugInfo debug_info(*n);
+        NodeBuilder node_builder(name, op, OpRegistry::Global(), &debug_info);
         node_builder.Device(n->requested_device());
         string colo;
         if (GetNodeAttr(n_attrs, "_class", &colo).ok()) {
diff --git a/tensorflow/core/common_runtime/partitioning_utils.cc b/tensorflow/core/common_runtime/partitioning_utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d51caaea8f1d12b472232718c973749e47146728
--- /dev/null
+++ b/tensorflow/core/common_runtime/partitioning_utils.cc
@@ -0,0 +1,143 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/partitioning_utils.h"
+
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/graph/graph_partition.h"
+
+namespace tensorflow {
+
+Status PartitionFunctionGraph(
+    const DeviceSet& device_set, std::unique_ptr<Graph> graph,
+    std::unordered_map<string, std::unique_ptr<Graph>>* subgraphs) {
+  PartitionOptions partition_options;
+  partition_options.node_to_loc = [](const Node* node) {
+    // TODO(iga): To support the distributed case, first split the graph by
+    // worker (e.g,. using the master session's `SplitByWorker` policy), and
+    // then recursively partition the per-worker shards at the remote worker(s).
+    // Currently, we simply split the graph at device boundaries.
+    return node->assigned_device_name();
+  };
+  int64 edge_name_counter = 0;
+  partition_options.new_name = [&edge_name_counter](const string& prefix) {
+    return strings::StrCat(prefix, "/_", ++edge_name_counter);
+  };
+  partition_options.get_incarnation =
+      [&device_set](const string& name) -> int64 {
+    const Device* d = device_set.FindDeviceByName(name);
+    if (d == nullptr) {
+      return PartitionOptions::kIllegalIncarnation;
+    } else {
+      return d->attributes().incarnation();
+    }
+  };
+  partition_options.control_flow_added = false;
+  std::unordered_map<string, GraphDef> partitions;
+  TF_RETURN_IF_ERROR(Partition(partition_options, graph.get(), &partitions));
+
+  for (const auto& partition : partitions) {
+    const string& device = partition.first;
+    const GraphDef& graph_def = partition.second;
+    // Each partition gets a copy of all the
+    // std::unique_ptr<Graph> subgraph(new Graph(graph->flib_def()));
+    std::unique_ptr<Graph> subgraph(
+        new Graph(graph->flib_def().ReachableDefinitions(graph_def)));
+    FunctionLibraryDefinition global_flib(OpRegistry::Global(), {});
+    TF_CHECK_OK(subgraph->AddFunctionLibrary(global_flib.ToProto()));
+    GraphConstructorOptions opts;
+    opts.allow_internal_ops = true;
+    opts.expect_device_spec = true;
+    TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(opts, graph_def, subgraph.get()));
+    subgraphs->emplace(device, std::move(subgraph));
+  }
+
+  return Status::OK();
+}
+
+Status UpdateArgAndRetvalMetadata(
+    Graph* subgraph, std::vector<int>* arg_indices,
+    std::vector<int>* ret_indices,
+    std::vector<AllocatorAttributes>* arg_alloc_attrs,
+    std::vector<AllocatorAttributes>* ret_alloc_attrs) {
+  std::vector<std::pair<Node*, int>> arg_nodes;
+  std::vector<std::pair<Node*, int>> ret_nodes;
+  const AttrValue* attr_value;
+
+  // Find the Arg and Retval nodes, along with their corresponding indices
+  // in the original function.
+  for (Node* node : subgraph->op_nodes()) {
+    string node_type = node->type_string();
+    if (node_type == FunctionLibraryDefinition::kArgOp) {
+      TF_RETURN_IF_ERROR(node->attrs().Find("index", &attr_value));
+      int index = static_cast<int>(attr_value->i());
+      arg_indices->push_back(index);
+      arg_nodes.push_back(std::make_pair(node, index));
+    } else if (node_type == FunctionLibraryDefinition::kRetOp) {
+      TF_RETURN_IF_ERROR(node->attrs().Find("index", &attr_value));
+      int index = static_cast<int>(attr_value->i());
+      ret_indices->push_back(index);
+      ret_nodes.push_back(std::make_pair(node, index));
+    }
+  }
+
+  for (int i = 0; i < arg_nodes.size(); ++i) {
+    Node* arg = arg_nodes[i].first;
+    arg->AddAttr("index", i);
+    TF_RETURN_IF_ERROR(arg->attrs().Find("T", &attr_value));
+    AllocatorAttributes alloc_attr;
+    DataType type = attr_value->type();
+    if (MTypeFromDType(type) == HOST_MEMORY) {
+      alloc_attr.set_on_host(true);
+    }
+    arg_alloc_attrs->push_back(alloc_attr);
+  }
+  for (int i = 0; i < ret_nodes.size(); ++i) {
+    Node* ret = ret_nodes[i].first;
+    ret->AddAttr("index", i);
+    TF_RETURN_IF_ERROR(ret->attrs().Find("T", &attr_value));
+    AllocatorAttributes alloc_attr;
+    DataType type = attr_value->type();
+    if (MTypeFromDType(type) == HOST_MEMORY) {
+      alloc_attr.set_on_host(true);
+    }
+    ret_alloc_attrs->push_back(alloc_attr);
+  }
+
+  return Status::OK();
+}
+
+std::vector<Tensor> GetArgsForIndices(const std::vector<int>& indices,
+                                      gtl::ArraySlice<Tensor> arguments) {
+  std::vector<Tensor> args;
+  args.reserve(indices.size());
+  for (int i : indices) {
+    args.push_back(arguments[i]);
+  }
+  return args;
+}
+
+string FunctionNameGenerator::GetName() {
+  for (;; ++counter_) {
+    const string candidate = strings::StrCat(name_, "_", counter_);
+    if (flib_def_->Find(candidate) == nullptr) {
+      return candidate;
+    }
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/partitioning_utils.h b/tensorflow/core/common_runtime/partitioning_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..c282647e7027414b4f925d1d6d93fcc1624dc81a
--- /dev/null
+++ b/tensorflow/core/common_runtime/partitioning_utils.h
@@ -0,0 +1,90 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_PARTITIONING_UTILS_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_PARTITIONING_UTILS_H_
+
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/device_set.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+// Given a `device_set` and a `graph`, partitions the `graph` into
+// `subgraphs`. `subgraphs` maps device names to the graph assigned to that
+// device. `graph` must have been placed (e.g. by running Placer),
+// i.e. all nodes must have an assigned_device set.
+// `graph` is non-const because the underlying Partition() function transforms
+// the graph to correctly partition distributed control flow.
+Status PartitionFunctionGraph(
+    const DeviceSet& device_set, std::unique_ptr<Graph> graph,
+    std::unordered_map<string, std::unique_ptr<Graph>>* subgraphs);
+
+// Each subgraph produced by partitioning the function body contains a subset
+// of the original `Arg` and `Retval` nodes. This function performs
+// bookkeeping to track which `Arg` and `Retval` nodes were placed on a
+// particular device / subgraph.
+//
+// More specifically, this function
+//  (1) rewrites the indices of the `Arg` and `Retval` nodes placed
+//      on a particular device.  When a function is parittioned each
+//      partition, `subgraph`, get a subset of the arguments and
+//      return values. The `index` attributes of these _Arg and _Retval
+//      nodes reflect the indices of these parameters in the original
+//      function. To convert `subgraph` to a function, we need to replace
+//      there original indices with 0, 1, 2, ... .
+//
+//      The argument and return value order in the partitioned function is
+//      determined by the node iteration order in `subgraph`. This order
+//      is also used in UpdateArgAndRetvalMetadata. This is fine because the
+//      node iteration order is deterministic - it follows the node ids.
+//  (2) records the subsets of `Arg` and `Retval` nodes assigned to the
+//      device in `*_indices`, and
+//  (3) records which `Arg` and `Retval` nodes live in host memory in
+//      `*_alloc_attrs`.
+Status UpdateArgAndRetvalMetadata(
+    Graph* subgraph, std::vector<int>* arg_indices,
+    std::vector<int>* ret_indices,
+    std::vector<AllocatorAttributes>* arg_alloc_attrs,
+    std::vector<AllocatorAttributes>* ret_alloc_attrs);
+
+// Extracts tensors at `indices` from `arguments`.
+std::vector<Tensor> GetArgsForIndices(const std::vector<int>& indices,
+                                      gtl::ArraySlice<Tensor> arguments);
+
+// Utility for generating function names not present in `flib_def`, using
+// given `name` as the base for the name.
+class FunctionNameGenerator {
+ public:
+  // `flib_def` must outlive this.
+  FunctionNameGenerator(const FunctionLibraryDefinition* flib_def,
+                        const string& name)
+      : flib_def_(flib_def), name_(name), counter_(0) {}
+
+  // Returns a function name not present in `flib_def` using `name` as
+  // the base and appending a numeric suffix.
+  string GetName();
+
+ private:
+  const FunctionLibraryDefinition* flib_def_;
+  const string name_;
+  uint32 counter_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_PARTITIONING_UTILS_H_
diff --git a/tensorflow/core/common_runtime/partitioning_utils_test.cc b/tensorflow/core/common_runtime/partitioning_utils_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0d4e36222ba7809dae73fb6eaaceda7fd497288a
--- /dev/null
+++ b/tensorflow/core/common_runtime/partitioning_utils_test.cc
@@ -0,0 +1,207 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/partitioning_utils.h"
+
+#include <map>
+#include <memory>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/cc/ops/array_ops.h"
+#include "tensorflow/cc/ops/function_ops.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/function_testlib.h"
+#include "tensorflow/core/common_runtime/placer.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+
+namespace tensorflow {
+namespace {
+
+class PartitioningUtilsTest : public ::testing::Test {
+ public:
+  void SetUp() override {
+    SessionOptions options;
+    auto* device_count = options.config.mutable_device_count();
+    device_count->insert({"CPU", 2});
+    std::vector<std::unique_ptr<Device>> devices;
+    TF_CHECK_OK(DeviceFactory::AddDevices(options, "/job:a/replica:0/task:0",
+                                          &devices));
+    device0_ = devices[0].get();
+    device1_ = devices[1].get();
+    device_mgr_.reset(new DeviceMgr(std::move(devices)));
+
+    for (auto d : device_mgr_->ListDevices()) {
+      device_set_.AddDevice(d);
+    }
+  }
+
+  void SwapGraph(Graph* graph, bool assign_device = false) {
+    Scope s = Scope::NewRootScope();
+    if (assign_device) {
+      s = s.WithDevice(device0_->name());
+    }
+    auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
+    auto y = ops::_Arg(s.WithOpName("y"), DT_FLOAT, 1);
+    auto id_x = ops::Identity(s.WithOpName("id_x"), x);
+    auto id_y = ops::Identity(s.WithOpName("id_y"), y);
+    auto dx_retval = ops::_Retval(s.WithOpName("retval1"), id_y, 0);
+    auto dy_retval = ops::_Retval(s.WithOpName("retval2"), id_x, 1);
+    TF_ASSERT_OK(s.ToGraph(graph));
+
+    if (assign_device) {
+      Placer placer(graph, &device_set_, nullptr, /* No session options */
+                    device0_);
+      TF_ASSERT_OK(placer.Run());
+    }
+  }
+
+  void TwoDeviceSwapGraph(Graph* graph) {
+    Scope s = Scope::NewRootScope();
+    Scope s1 = s.WithDevice("/job:a/replica:0/task:0/device:CPU:0");
+    Scope s2 = s.WithDevice("/job:a/replica:0/task:0/device:CPU:1");
+    auto x = ops::_Arg(s1.WithOpName("x"), DT_FLOAT, 0);
+    auto y = ops::_Arg(s2.WithOpName("y"), DT_FLOAT, 1);
+    auto id_x = ops::Identity(s1.WithOpName("id_x"), x);
+    auto id_y = ops::Identity(s2.WithOpName("id_y"), y);
+    auto dx_retval = ops::_Retval(s2.WithOpName("retval1"), id_y, 0);
+    auto dy_retval = ops::_Retval(s1.WithOpName("retval2"), id_x, 1);
+    TF_ASSERT_OK(s.ToGraph(graph));
+    Placer placer(graph, &device_set_, nullptr, /* No session options */
+                  device0_);
+    TF_ASSERT_OK(placer.Run());
+  }
+
+  // Fills subgraph with an identify function arg->identity->ret
+  // where each node has type `dtype` and arg/ret nodes have
+  // indices `arg_index` and `ret_index`.
+  void SubGraph(Graph* subgraph, DataType dtype, int arg_index, int ret_index) {
+    Scope s = Scope::NewRootScope();
+    Scope s1 = s.WithDevice("/job:a/replica:0/task:0/device:CPU:0");
+    auto x = ops::_Arg(s1.WithOpName("x"), dtype, arg_index);
+    auto id_x = ops::Identity(s1.WithOpName("id_x"), x);
+    auto dx_retval = ops::_Retval(s1.WithOpName("retval1"), id_x, ret_index);
+    TF_ASSERT_OK(s.ToGraph(subgraph));
+    Placer placer(subgraph, &device_set_, nullptr, /* No session options */
+                  device0_);
+    TF_ASSERT_OK(placer.Run());
+  }
+
+  std::unique_ptr<DeviceMgr> device_mgr_;
+  Device* device0_ = nullptr;  // Not owned. (Owned by device_mgr_.)
+  Device* device1_ = nullptr;  // Not owned. (Owned by device_mgr_.)
+  DeviceSet device_set_;
+};
+
+TEST_F(PartitioningUtilsTest, GraphWithoutAssignedDevicesFails) {
+  std::unique_ptr<Graph> graph = absl::make_unique<Graph>(OpRegistry::Global());
+  SwapGraph(graph.get());
+
+  std::unordered_map<string, std::unique_ptr<Graph>> subgraphs;
+  Status status =
+      PartitionFunctionGraph(device_set_, std::move(graph), &subgraphs);
+  ASSERT_TRUE(errors::IsInvalidArgument(status)) << status.ToString();
+}
+
+TEST_F(PartitioningUtilsTest, OneDevice) {
+  std::unique_ptr<Graph> graph = absl::make_unique<Graph>(OpRegistry::Global());
+  SwapGraph(graph.get(), true);
+  int num_nodes = graph->num_op_nodes();
+
+  std::unordered_map<string, std::unique_ptr<Graph>> subgraphs;
+  Status status =
+      PartitionFunctionGraph(device_set_, std::move(graph), &subgraphs);
+  ASSERT_TRUE(status.ok()) << status.ToString();
+
+  ASSERT_EQ(1, subgraphs.size());
+  const auto& pair = *subgraphs.begin();
+  ASSERT_EQ("/job:a/replica:0/task:0/device:CPU:0", pair.first);
+  ASSERT_EQ(num_nodes, pair.second->num_op_nodes());
+}
+
+TEST_F(PartitioningUtilsTest, TwoDevices) {
+  std::unique_ptr<Graph> graph = absl::make_unique<Graph>(OpRegistry::Global());
+  TwoDeviceSwapGraph(graph.get());
+
+  std::unordered_map<string, std::unique_ptr<Graph>> subgraphs;
+  Status status =
+      PartitionFunctionGraph(device_set_, std::move(graph), &subgraphs);
+  ASSERT_TRUE(status.ok()) << status.ToString();
+
+  ASSERT_EQ(2, subgraphs.size());
+
+  const auto& part1 = subgraphs["/job:a/replica:0/task:0/device:CPU:0"];
+  ASSERT_EQ(3, part1->num_op_nodes());
+  const auto& part2 = subgraphs["/job:a/replica:0/task:0/device:CPU:1"];
+  ASSERT_EQ(3, part2->num_op_nodes());
+}
+
+void CheckIndices(const std::vector<int>& expected,
+                  const std::vector<int>& actual) {
+  ASSERT_EQ(expected.size(), actual.size());
+  for (int i = 0; i < expected.size(); ++i) {
+    ASSERT_EQ(expected[i], actual[i]) << " at index " << i;
+  }
+}
+
+void CheckAlloc(const std::vector<bool>& expected,
+                const std::vector<AllocatorAttributes>& actual) {
+  ASSERT_EQ(expected.size(), actual.size());
+  for (int i = 0; i < expected.size(); ++i) {
+    ASSERT_EQ(expected[i], actual[i].on_host()) << " at index " << i;
+  }
+}
+
+void CheckIndex(const Node& node, int expected_index) {
+  const AttrValue* attr_value;
+  TF_ASSERT_OK(node.attrs().Find("index", &attr_value));
+  int index = static_cast<int>(attr_value->i());
+  ASSERT_EQ(expected_index, index);
+}
+
+TEST_F(PartitioningUtilsTest, UpdateArgsAndRets) {
+  std::unique_ptr<Graph> graph = absl::make_unique<Graph>(OpRegistry::Global());
+  SubGraph(graph.get(), DT_FLOAT, 3, 5);
+
+  std::vector<int> arg_indices;
+  std::vector<int> ret_indices;
+  std::vector<AllocatorAttributes> arg_alloc_attrs;
+  std::vector<AllocatorAttributes> ret_alloc_attrs;
+
+  Status status =
+      UpdateArgAndRetvalMetadata(graph.get(), &arg_indices, &ret_indices,
+                                 &arg_alloc_attrs, &ret_alloc_attrs);
+  ASSERT_TRUE(status.ok()) << status.ToString();
+
+  CheckIndices({3}, arg_indices);
+  CheckIndices({5}, ret_indices);
+  CheckAlloc({false}, arg_alloc_attrs);
+  CheckAlloc({false}, ret_alloc_attrs);
+
+  std::unordered_map<string, Node*> nodes = graph->BuildNodeNameIndex();
+  ASSERT_EQ(1, nodes.count("x"));
+  CheckIndex(*nodes["x"], 0);
+  ASSERT_EQ(1, nodes.count("retval1"));
+  CheckIndex(*nodes["retval1"], 0);
+}
+
+}  // anonymous namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/placer.cc b/tensorflow/core/common_runtime/placer.cc
index 515c1971d9d5cb179b7b9764ff3462579e742dfc..b2f4f1aa49b0fea64e8db0eb34101edde0adc738 100644
--- a/tensorflow/core/common_runtime/placer.cc
+++ b/tensorflow/core/common_runtime/placer.cc
@@ -20,6 +20,8 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/strings/str_join.h"
+#include "tensorflow/core/common_runtime/colocation_graph.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/framework/device_attributes.pb.h"
@@ -32,765 +34,14 @@ limitations under the License.
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/util/device_name_utils.h"
+#include "tensorflow/core/util/dump_graph.h"
 #include "tensorflow/core/util/port.h"
 
 namespace tensorflow {
 
 namespace {
 
-// We hoist the conversion from C-style string literal to StringPiece here,
-// so that we can avoid the many repeated calls to strlen().
-const StringPiece kColocationAttrNameStringPiece(kColocationAttrName);
-const StringPiece kColocationGroupPrefixStringPiece(kColocationGroupPrefix);
-
-// Returns a list of devices having type in supported_device_types.  The
-// returned list is sorted by preferred type (higher numeric type is preferred).
-std::vector<Device*> FilterSupportedDevices(
-    const std::vector<Device*>& devices,
-    const PrioritizedDeviceTypeVector& supported_device_types,
-    const Device* default_device) {
-  Device* filtered_default_device = nullptr;
-  std::vector<std::pair<Device*, int32>> prioritized_filtered_devices;
-  for (const auto& supported_device_type : supported_device_types) {
-    for (Device* device : devices) {
-      if (DeviceType(device->attributes().device_type()) ==
-          supported_device_type.first) {
-        if (device == default_device) {
-          filtered_default_device = device;
-        } else {
-          prioritized_filtered_devices.emplace_back(
-              device, supported_device_type.second);
-        }
-      }
-    }
-  }
-
-  auto device_sort = [](const std::pair<Device*, int32>& a,
-                        const std::pair<Device*, int32>& b) {
-    if (a.second != b.second) {
-      return a.second > b.second;
-    }
-
-    auto a_priority =
-        DeviceSet::DeviceTypeOrder(DeviceType(a.first->device_type()));
-    auto b_priority =
-        DeviceSet::DeviceTypeOrder(DeviceType(b.first->device_type()));
-    // First sort by prioritized device type (higher is preferred) and
-    // then by device name (lexicographically).
-    if (a_priority != b_priority) {
-      return a_priority > b_priority;
-    }
-    return StringPiece(a.first->name()) < StringPiece(b.first->name());
-  };
-  std::sort(prioritized_filtered_devices.begin(),
-            prioritized_filtered_devices.end(), device_sort);
-
-  std::vector<Device*> filtered_devices;
-  if (filtered_default_device != nullptr) {
-    filtered_devices.emplace_back(filtered_default_device);
-  }
-  for (const auto& prioritized_filtered_device : prioritized_filtered_devices) {
-    filtered_devices.push_back(prioritized_filtered_device.first);
-  }
-  return filtered_devices;
-}
-
-// This class maintains the connected components of a colocation
-// constraint graph, and uses this information to assign a satisfying
-// device placement to the nodes of the graph.
-//
-// The typical usage pattern is:
-//
-//   Graph graph = ...;
-//   DeviceSet device_set = ...;
-//   ColocationGraph colocation_graph(graph, device_set);
-//
-//   // Add all the nodes of the `graph` to the `colocation_graph`.
-//   for (Node* node : graph.nodes()) {
-//     TF_RETURN_IF_ERROR(colocation_graph.AddNode(*node));
-//   }
-//
-//   // Add one or more colocation constraints.
-//   Node node_1 = *graph.FindNodeId(...);
-//   Node node_2 = *graph.FindNodeId(...);
-//   TF_RETURN_IF_ERROR(colocation_graph.ColocateNodes(node_1, node_2));
-//
-//   // Assign devices based on the accumulated constraints.
-//   for (Node* node : graph.nodes()) {
-//     TF_RETURN_IF_ERROR(colocation_graph.AssignDevice(node));
-//   }
-//
-// This implementation uses the Union-Find algorithm to efficiently maintain the
-// connected components and incrementally adds edges via
-// ColocationGraph::ColocateNodes() invocations.
-class ColocationGraph {
- public:
-  ColocationGraph(Graph* graph, const DeviceSet* device_set,
-                  bool allow_soft_placement, const Device* default_device)
-      : graph_(graph),
-        device_set_(device_set),
-        device_types_(device_set->PrioritizedDeviceTypeList()),
-        allow_soft_placement_(allow_soft_placement),
-        default_device_(default_device) {
-    members_.resize(graph->num_node_ids());
-  }
-
-  // Adds each node of the Graph to this ColocationGraph as a singleton.
-  //
-  // NOTE: The implementation assumes that the ids of nodes passed to
-  // this method are dense and zero-based; the memory used will be linear in
-  // the largest node ID.
-  // NOTE: If this method returns an error, *this is left in an undefined
-  // state.
-  Status ColocateAllNodes() {
-    // This maps from a colocation group identifier to the 'root' of that
-    // colocation group.  Note that the keys in this map are StringPiece; the
-    // actual strings are stored under the NodeDef.  The lifetime of this map
-    // is limited to this ColocateAllNodes() method, and no part of the
-    // NodeDef trees are changed during the lifetime of this method, so using
-    // StringPiece as a key is safe.
-    //
-    // Also, as a further optimization, we remove the "loc:@" prefix from
-    // "class" attribute values, when they are used as keys in this table.
-    // This allows us to use StringPiece values that refer to substrings of
-    // 'string' values stored in NodeDef attribute lists, as well as StringPiece
-    // values that refer to 'string' values from NodeDef::name(), without
-    // performing any string allocations.
-    std::unordered_map<StringPiece, const Node*, StringPieceHasher>
-        colocation_group_root;
-
-    for (Node* node : graph_->op_nodes()) {
-      // When adding the node, identify whether it is part of a colocation
-      // group.
-
-      // This code is effectively the equivalent of GetNodeAttr() for a string
-      // array, but it avoids all internal allocations (the allocation of the
-      // backing store of the std::vector<string> as well as the copies of the
-      // strings within it).  Instead, we combine the query of the colocation
-      // attribute with the calls to ColocateNodeToGroup.
-      bool found_spec = false;
-      const AttrValue* attr_value =
-          node->attrs().Find(kColocationAttrNameStringPiece);
-      if (attr_value != nullptr && attr_value->has_list()) {
-        for (const string& class_spec : attr_value->list().s()) {
-          StringPiece spec(class_spec);
-          if (str_util::ConsumePrefix(&spec,
-                                      kColocationGroupPrefixStringPiece)) {
-            found_spec = true;
-            TF_RETURN_IF_ERROR(
-                ColocateNodeToGroup(&colocation_group_root, node, spec));
-          }
-        }
-      }
-
-      if (!found_spec) {
-        // If the node does not specify a colocation group, then use the
-        // name of this node as the colocation group.
-        TF_RETURN_IF_ERROR(
-            ColocateNodeToGroup(&colocation_group_root, node, node->name()));
-      }
-    }
-
-    return Status::OK();
-  }
-
-  Status ColocateNodeToGroup(
-      std::unordered_map<StringPiece, const Node*, StringPieceHasher>*
-          colocation_group_root,
-      Node* node, StringPiece colocation_group) {
-    const Node*& root_node = (*colocation_group_root)[colocation_group];
-    if (root_node == nullptr) {
-      // This is the first node of the colocation group, so
-      // designate this node as the 'root' of that colocation group.
-      root_node = node;
-    } else {
-      // Try to colocate the node with the root.  If there is an
-      // error, return it.
-      Status s = ColocateNodes(*node, *root_node);
-      if (!s.ok()) {
-        return AttachDef(s, *node);
-      }
-    }
-    return Status::OK();
-  }
-
-  // Merge the (possibly disjoint) sets containing nodes "x" and
-  // "y". Returns OK if the all nodes in the union of these sets can
-  // be placed on the same device type.
-  //
-  // NOTE: If this method returns an error, *this is left in an undefined
-  // state.
-  Status ColocateNodes(const Node& x, const Node& y) {
-    int x_root = FindRoot(x.id());
-    int y_root = FindRoot(y.id());
-    return ColocateNodes(x, x_root, y, y_root);
-  }
-
-  // This overload of ColocateNodes() allows a caller to provide the root node
-  // ids for the two nodes. For large graphs, this noticeably reduces the
-  // graph load time.
-  Status ColocateNodes(const Node& x, int x_root, const Node& y, int y_root) {
-    if (x_root == y_root) {
-      return Status::OK();
-    }
-
-    DCHECK_EQ(x_root, FindRoot(x.id()));
-    DCHECK_EQ(y_root, FindRoot(y.id()));
-
-    Member& x_root_member = members_[x_root];
-    Member& y_root_member = members_[y_root];
-
-    // Merge the sets by setting the parent pointer of the smaller tree's root
-    // node to point to the root of the larger tree. Together with path
-    // compression in ColocationGraph::FindRoot, this ensures that we do not
-    // experience pathological performance on graphs such as chains.
-    int new_root, old_root;
-    if (x_root_member.rank < y_root_member.rank) {
-      // The tree rooted at x_root is shallower, so connect it to
-      // y_root. The rank of y_root is unchanged because its new
-      // child has strictly less rank.
-      x_root_member.parent = y_root;
-      new_root = y_root;
-      old_root = x_root;
-    } else if (x_root_member.rank > y_root_member.rank) {
-      // The tree rooted at y_root is shallower, so connect it to
-      // x_root. The rank of x_root is unchanged because its new
-      // child has strictly less rank.
-      y_root_member.parent = x_root;
-      new_root = x_root;
-      old_root = y_root;
-    } else {
-      // Both trees have the same rank, so break the tie by choosing
-      // x_root as the new root.
-      y_root_member.parent = x_root;
-      // Increment the rank of the tree rooted at x_root, because it
-      // is now strictly deeper than before.
-      ++x_root_member.rank;
-      new_root = x_root;
-      old_root = y_root;
-    }
-
-    Member& new_root_member = members_[new_root];
-    Member& old_root_member = members_[old_root];
-
-    // Merge the partial device specifications, and ensure that they are
-    // compatible. NULL options_ is treated as allowing soft placement.
-    // TODO(mrry): Consider enriching the error message by pointing
-    // out which nodes have the explicit partial device
-    // specifications that caused this conflict.
-    Status s = DeviceNameUtils::MergeDevNames(&new_root_member.device_name,
-                                              old_root_member.device_name,
-                                              allow_soft_placement_);
-    if (!s.ok()) {
-      return errors::InvalidArgument(
-          "Cannot colocate nodes ",
-          errors::FormatColocationNodeForError(x.name()), " and ",
-          errors::FormatColocationNodeForError(y.name()), ": ",
-          s.error_message());
-    }
-
-    // Ensure that the common root has at least one supported device
-    // type, by computing the intersection of
-    // new_root_member.supported_device_types and
-    // old_root_member.supported_device_types.
-    MergeSupportedDevices(&new_root_member.supported_device_types,
-                          old_root_member.supported_device_types);
-    if (new_root_member.supported_device_types.empty()) {
-      return errors::InvalidArgument(
-          "Cannot colocate nodes ",
-          errors::FormatColocationNodeForError(x.name()), " and ",
-          errors::FormatColocationNodeForError(y.name()),
-          " because no device type supports both of those nodes and the "
-          "other nodes colocated with them.",
-          DebugInfo(x_root), DebugInfo(y_root));
-    }
-
-    return Status::OK();
-  }
-
-  // For the given node, subject to the constraints previously given
-  // to this ColocationGraph, set its assigned_device_name. Returns OK
-  // if a satisfying device can be found, otherwise an error.
-  //
-  // Note: This method returns a pointer to a field within members_.
-  // The caller must not use the returned pointer after there is any possibility
-  // that the members_[i].possible_devices field has been modified.
-  Status GetDevicesForNode(Node* node,
-                           std::vector<Device*>** possible_devices) {
-    *possible_devices = nullptr;
-    const int node_root = FindRoot(node->id());
-    if (!members_[node_root].possible_devices.empty()) {
-      *possible_devices = &members_[node_root].possible_devices;
-      return Status::OK();
-    }
-
-    // We have not yet computed the possible devices for the
-    // colocated node set containing 'node', so we do so now using the
-    // constraints on the root node.
-
-    // "devices" will contain the set of feasible placements for the
-    // colocated node set containing 'node'.
-    std::vector<Device*> devices;
-    if (DeviceNameUtils::HasSomeDetails(members_[node_root].device_name)) {
-      // The root node has a (possibly partial) device
-      // specification, so enumerate the physical devices that
-      // conform to it.
-      device_set_->FindMatchingDevices(members_[node_root].device_name,
-                                       &devices);
-
-      if (!devices.empty()) {
-        // Filter devices into those that are compatible with the root
-        // node (and its children).
-        devices = FilterSupportedDevices(
-            devices, members_[node_root].supported_device_types,
-            default_device_);
-      }
-
-      // Perform soft placement if allow_soft_placement_ is set.
-      if (devices.empty() && allow_soft_placement_) {
-        // The soft_device_name is the same as the node's device name
-        // without specifying the device type or ID.
-        DeviceNameUtils::ParsedName soft_device_name =
-            members_[node_root].device_name;
-        soft_device_name.type.clear();
-        soft_device_name.has_type = false;
-        soft_device_name.has_id = false;
-        device_set_->FindMatchingDevices(soft_device_name, &devices);
-        if (!devices.empty()) {
-          devices = FilterSupportedDevices(
-              devices, members_[node_root].supported_device_types,
-              default_device_);
-        }
-      }
-
-      if (devices.empty()) {
-        // Return an error when a physical device that matches an explicit
-        // device specification is not found. This ensures that we don't
-        // assign a node to GPU when the user wanted to force it on CPU.
-        string debug_info = DebugInfo(node_root);
-
-        DeviceNameUtils::ParsedName specified_device_name;
-        if (DeviceNameUtils::ParseFullName(node->requested_device(),
-                                           &specified_device_name) &&
-            specified_device_name == members_[node_root].device_name) {
-          // The specified device and merged set device match, and
-          // will appear in the GraphDef (for debugging), so just
-          // print the specified device.
-          std::vector<Device*> devices_matching_nodedef;
-          device_set_->FindMatchingDevices(specified_device_name,
-                                           &devices_matching_nodedef);
-          if (devices_matching_nodedef.empty()) {
-            // Sometimes it is almost impossible to understand the problem
-            // without a list of available devices.
-            std::vector<string> device_names;
-            for (const Device* device : device_set_->devices()) {
-              device_names.push_back(device->name());
-            }
-            std::sort(device_names.begin(), device_names.end());
-
-            string gpu_msg = "";
-            if (!IsGoogleCudaEnabled() &&
-                str_util::Lowercase(specified_device_name.type) == "gpu") {
-              gpu_msg =
-                  " The requested device appears to be a GPU, but CUDA is not "
-                  "enabled.";
-            }
-
-            return errors::InvalidArgument(
-                errors::FormatNodeNameForError(node->name()),
-                "was explicitly assigned to ", node->requested_device(),
-                " but available devices are [ ",
-                str_util::Join(device_names, ", "), " ]. Make sure ",
-                "the device specification refers to a valid device.", gpu_msg);
-          } else if (specified_device_name.has_type) {
-            return errors::InvalidArgument(
-                "Could not satisfy explicit device specification '",
-                node->requested_device(), "' because no supported kernel for ",
-                specified_device_name.type, " devices is available.",
-                debug_info, "\nRegistered kernels:\n",
-                KernelsRegisteredForOp(node->type_string()));
-          } else {
-            return errors::InvalidArgument(
-                "Could not satisfy explicit device specification '",
-                node->requested_device(), debug_info);
-          }
-        } else {
-          // The specified device may be a valid device but the
-          // merged set device is different, so print both.
-          return errors::InvalidArgument(
-              "Could not satisfy explicit device specification '",
-              node->requested_device(), "' because the node ",
-              errors::FormatColocationNodeForError(node->name()),
-              " was colocated with a group of nodes that ",
-              "required incompatible device '",
-              DeviceNameUtils::ParsedNameToString(
-                  members_[node_root].device_name),
-              "'", debug_info);
-        }
-      }
-    } else {
-      // The device is completely unspecified, so enumerate the devices that
-      // support all of the nodes in the set.
-      if (device_set_->devices().empty()) {
-        return errors::Internal("No devices are registered");
-      }
-      devices = FilterSupportedDevices(
-          device_set_->devices(), members_[node_root].supported_device_types,
-          default_device_);
-
-      if (devices.empty()) {
-        return errors::InvalidArgument(
-            "Node had no OpKernel registered to support this operation: ",
-            "Operation was ", node->type_string(), " and inputs were ",
-            DataTypeVectorString(node->input_types()), DebugInfo(node_root));
-      }
-    }
-
-    // Cache the result of the possible devices for this node group.
-    members_[node_root].possible_devices = std::move(devices);
-    *possible_devices = &members_[node_root].possible_devices;
-    return Status::OK();
-  }
-
-  Status InitializeMembers() {
-    for (Node* node : graph_->nodes()) {
-      if (!node->IsOp()) {
-        continue;
-      }
-      Status status = InitializeMember(*node, &members_[node->id()]);
-      if (!status.ok()) {
-        return AttachDef(status, *node);
-      }
-    }
-    return Status::OK();
-  }
-
-  // Represents a node in the disjoint node set forest, and the
-  // accumulated constraints on the device used by that node.
-  struct Member {
-    Member() = default;
-    // The id of the node that is the parent of this one, or its own
-    // id if it is a root. parent <= 0 indicates that this member is invalid.
-    int parent = -1;
-
-    // A proxy for the depth of the tree that is used to prefer
-    // connecting smaller trees to larger trees when merging disjoint
-    // sets.
-    int rank = 0;
-
-    // The intersection of all device types supported by this node,
-    // and those of all of its children, in priority order
-    // of the preferred device.
-    PrioritizedDeviceTypeVector supported_device_types;
-
-    // The merged form of the device requested for this node, with
-    // those of all of its children.
-    DeviceNameUtils::ParsedName device_name;
-
-    // If this node is a root, stores a list of Devices to which this node
-    // and all of its children have been assigned, or nullptr if this
-    // has not yet been computed.
-    std::vector<Device*> possible_devices;
-  };
-
-  // Returns debugging info for the node referred to by 'node_root'.
-  string DebugInfo(const int node_root) {
-    string text(
-        "\nColocation Debug Info:\n"
-        "Colocation group had the following types and devices: ");
-
-    // If this node is part of a colocation group, then we want to
-    // collect the mapping of ops to supported devices, so that
-    // the user can see why an unsatisfiable placement occurred.
-
-    std::unordered_map<string, string> type_to_devices;
-    std::vector<const Node*> colocation_nodes;
-    int num_nodes_found = 0;
-
-    for (const Node* node : graph_->nodes()) {
-      if (!node->IsOp()) {
-        continue;
-      }
-      int id = node->id();
-      if (FindRoot(id) != node_root) {
-        continue;
-      }
-      ++num_nodes_found;
-      colocation_nodes.push_back(node);
-      const string& op_type = node->type_string();
-      string devices_registered;
-      for (const auto& device_type : members_[id].supported_device_types) {
-        strings::StrAppend(&devices_registered,
-                           DeviceTypeString(device_type.first), " ");
-      }
-
-      type_to_devices[op_type] = std::move(devices_registered);
-    }
-
-    for (const auto& td : type_to_devices) {
-      strings::StrAppend(&text, "\n", td.first, ": ", td.second);
-    }
-    strings::StrAppend(&text,
-                       "\n\nColocation members and user-requested devices:");
-    for (const Node* node : colocation_nodes) {
-      strings::StrAppend(&text, "\n  ", node->name(), " (", node->type_string(),
-                         ") ", node->requested_device());
-    }
-    strings::StrAppend(&text, "\n");
-
-    if (num_nodes_found <= 1) {
-      text.clear();
-    }
-    return text;
-  }
-
-  Status InitializeMember(const Node& node, Member* member) {
-    const int id = node.id();
-    DCHECK_GE(id, 0);
-    member->parent = id;
-    TF_RETURN_IF_ERROR(SupportedDeviceTypesForNode(
-        device_types_, node.def(), &member->supported_device_types));
-
-    if (node.has_assigned_device_name()) {
-      // This node has already been assigned to a device, so we
-      // respect this placement, after sanity-checking it.  The
-      // device_name and supported_device_types for this node reflect
-      // the assigned device, so any nodes colocated with this node
-      // will be assigned to the same device (assuming this is
-      // possible).
-      // NOTE: Since any assignment must have been performed by
-      // the TensorFlow runtime, we consider errors in this branch to
-      // be INTERNAL.
-      const string& assigned_device_name = node.assigned_device_name();
-      if (!DeviceNameUtils::ParseFullName(assigned_device_name,
-                                          &member->device_name)) {
-        return errors::Internal("Malformed assigned device '",
-                                assigned_device_name, "'");
-      }
-      const Device* assigned_device =
-          device_set_->FindDeviceByName(assigned_device_name);
-      if (assigned_device == nullptr) {
-        return errors::Internal("Assigned device '", assigned_device_name,
-                                "' does not match any device");
-      }
-
-      for (const auto& d : member->supported_device_types) {
-        if (DeviceType(assigned_device->attributes().device_type()) ==
-            d.first) {
-          return Status::OK();
-        }
-      }
-
-      return errors::Internal("Assigned device '", assigned_device_name,
-                              "' does not have registered OpKernel support "
-                              "for ",
-                              node.type_string());
-    } else {
-      // This node has not yet been assigned to a device, so we
-      // calculate any constraints due to the set of registered
-      // kernels and any (partial) user-provided device specification
-      // in the NodeDef.
-
-      // If no kernels are registered for this op type, fail with an error.
-      if (member->supported_device_types.empty()) {
-        std::set<string> registered_device_types;
-        for (Device* d : device_set_->devices()) {
-          registered_device_types.insert(d->device_type());
-        }
-        std::vector<string> attr_key_vals;
-        for (const auto& it : node.attrs()) {
-          const string& name = it.first;
-          const AttrValue& attr_value = it.second;
-          attr_key_vals.push_back(
-              strings::StrCat(name, "=", SummarizeAttrValue(attr_value)));
-        }
-        return errors::InvalidArgument(
-            "No OpKernel was registered to support Op '", node.type_string(),
-            "' used by ", errors::FormatNodeNameForError(node.name()),
-            "with these attrs: [", str_util::Join(attr_key_vals, ", "),
-            "]\n"
-            "Registered devices: [",
-            str_util::Join(registered_device_types, ", "), "]\n",
-            "Registered kernels:\n",
-            KernelsRegisteredForOp(node.type_string()));
-      }
-
-      // If the NodeDef contains a device, then we interpret it as a
-      // (partial) device specification.
-      if (!node.requested_device().empty()) {
-        // The user has specified a device in the NodeDef, try to find a
-        // valid device matching their specification in the set of
-        // devices.
-        // NOTE: The full name may specify a device that is not in
-        // n.supported_device_types(), but we check that in AssignDevice().
-        if (!DeviceNameUtils::ParseFullName(node.requested_device(),
-                                            &member->device_name)) {
-          return errors::InvalidArgument("Malformed device specification '",
-                                         node.requested_device(), "'");
-        }
-      }
-    }
-    return Status::OK();
-  }
-
-  static bool HasPriorities(const PrioritizedDeviceTypeVector& device_types) {
-    for (const auto& prioritized_device_type : device_types) {
-      if (prioritized_device_type.second != 0) return true;
-    }
-    return false;
-  }
-
-  static bool ArePrioritiesSame(const PrioritizedDeviceTypeVector& a_types,
-                                const PrioritizedDeviceTypeVector& b_types) {
-    if (a_types.size() != b_types.size()) {
-      return false;
-    }
-    for (int i = 0; i < a_types.size(); ++i) {
-      if (a_types[i].first != b_types[i].first) {
-        return false;
-      }
-    }
-    return true;
-  }
-
-  // Updates target to contain the intersection of the device types in
-  // "target" and "other".
-  static void MergeSupportedDevices(PrioritizedDeviceTypeVector* target,
-                                    const PrioritizedDeviceTypeVector& other) {
-    PrioritizedDeviceTypeVector temp = *target;
-    target->clear();
-
-    // Generate intersection with priorities.
-    PrioritizedDeviceTypeVector target_intersection;
-    PrioritizedDeviceTypeVector other_intersection;
-    for (const auto& prioritized_device_type : temp) {
-      bool found = false;
-      for (const auto& other_prioritized_device_type : other) {
-        if (prioritized_device_type.first ==
-            other_prioritized_device_type.first) {
-          found = true;
-          other_intersection.push_back(other_prioritized_device_type);
-          break;
-        }
-      }
-      if (found) {
-        target_intersection.push_back(prioritized_device_type);
-      }
-    }
-
-    // Sort the devices by priority order.
-    auto device_sort = [](const std::pair<DeviceType, int32>& a,
-                          const std::pair<DeviceType, int32>& b) {
-      // First look at set priorities.
-      if (a.second != b.second) {
-        return a.second > b.second;
-      }
-      // Then fallback to default priorities.
-      auto a_priority = DeviceSet::DeviceTypeOrder(a.first);
-      auto b_priority = DeviceSet::DeviceTypeOrder(b.first);
-      if (a_priority != b_priority) {
-        return a_priority > b_priority;
-      }
-      // Finally just look at the Device type strings.
-      return a.first.type_string() < b.first.type_string();
-    };
-
-    std::sort(target_intersection.begin(), target_intersection.end(),
-              device_sort);
-    std::sort(other_intersection.begin(), other_intersection.end(),
-              device_sort);
-
-    bool is_target_prioritized = HasPriorities(target_intersection);
-    bool is_other_prioritized = HasPriorities(other_intersection);
-    // If neither are prioritized then we just return the original i.e. target
-    // prioritization.
-    if (!is_target_prioritized && !is_other_prioritized) {
-      *target = target_intersection;
-    }
-    // If only one is prioritized, then we respect priorities of that in the
-    // intersection.
-    if (is_target_prioritized && !is_other_prioritized) {
-      *target = target_intersection;
-    }
-    if (!is_target_prioritized && is_other_prioritized) {
-      *target = other_intersection;
-    }
-    // If both have priorities and agree then we go with that. If the
-    // prioritization order is different, then we just fallback to the default
-    // i.e. what the DeviceTypeOrder suggests. In that case, we also set the
-    // merged priorities to 0, so that downstream merges work correctly as well.
-    if (is_target_prioritized && is_other_prioritized) {
-      bool priorities_agree =
-          ArePrioritiesSame(target_intersection, other_intersection);
-      if (priorities_agree) {
-        *target = target_intersection;
-      } else {
-        for (const auto& prioritized_device : target_intersection) {
-          target->push_back(std::make_pair(prioritized_device.first, 0));
-        }
-        std::sort(target->begin(), target->end(), device_sort);
-      }
-    }
-  }
-
-  // Returns the root node of the disjoint tree to which the node with the
-  // given id is connected.
-  int FindRoot(int node_id) {
-    Member& member = members_[node_id];
-    DCHECK_GE(member.parent, 0);
-    if (member.parent == node_id) {
-      // member.parent is the root of this disjoint tree.  Do nothing.
-    } else {
-      member.parent = FindRoot(member.parent);
-    }
-    // Now it is guaranteed that member.parent is the root of this disjoint
-    // tree.
-    DCHECK_GE(member.parent, 0);
-    return member.parent;
-  }
-
-  // Ensures that the devices of 'dst's resource and reference match the device
-  // specified for 'src', which is an input of 'dst' with a partially or fully
-  // specified device.
-  Status VerifyResourceAndRefInputsCanBeColocated(
-      const Node* dst, const Node* src,
-      const DeviceNameUtils::ParsedName& src_parsed_name) {
-    std::vector<const Edge*> edges;
-    TF_RETURN_IF_ERROR(dst->input_edges(&edges));
-    for (const Edge* edge : edges) {
-      DataType input_type = dst->input_type(edge->dst_input());
-      if (input_type == DT_RESOURCE || IsRefType(input_type)) {
-        const Node* input_node = edge->src();
-        if (input_node == src) {
-          continue;
-        }
-        const auto& input_root = members_[FindRoot(input_node->id())];
-        const auto& input_parsed_name = input_root.device_name;
-        if (DeviceNameUtils::HasSomeDetails(input_parsed_name) &&
-            !DeviceNameUtils::AreCompatibleDevNames(input_parsed_name,
-                                                    src_parsed_name)) {
-          return AttachDef(
-              errors::InvalidArgument(
-                  "Could not colocate node with its "
-                  "resource and reference inputs; devices ",
-                  DeviceNameUtils::ParsedNameToString(input_parsed_name),
-                  " and ", DeviceNameUtils::ParsedNameToString(src_parsed_name),
-                  " are not compatible."),
-              *dst);
-        }
-      }
-    }
-    return Status::OK();
-  }
-
-  Graph* const graph_;  // Not owned.
-  std::vector<Member> members_;
-  const DeviceSet* device_set_;  // Not owned.
-  const std::vector<DeviceType> device_types_;
-  const bool allow_soft_placement_;
-  const Device* default_device_;
-};
-
 // Returns true if the node has no inputs and produces outputs
 // that are consumed by a single node.
 //
@@ -802,13 +53,27 @@ bool IsGeneratorNode(const Node* node) {
          !IsRefType(node->output_type(0));
 }
 
-bool IsExemptFromResourceInputColocation(const Node* node) {
-  // Note: Partitioned function calls, which place and partition their
-  // function bodies, are exempt from this check: they forward resource and
-  // ref inputs to operations that are appropriately placed, instead of
-  // dereferencing them.
-  const string& op_type = node->op_def().name();
-  return op_type == "PartitionedCall" || op_type == "StatefulPartitionedCall";
+void LogDeviceAssignment(const Node* node, bool log_device_placement) {
+  // Log placement if log_device_placement is set.
+  if (log_device_placement) {
+    printf("%s: (%s): %s\n", node->name().c_str(), node->type_string().c_str(),
+           node->assigned_device_name().c_str());
+    LOG(INFO) << node->name() << ": "
+              << "(" << node->type_string() << ")"
+              << node->assigned_device_name();
+  }
+}
+
+Status AssignAndLog(int assigned_device, Node* node,
+                    ColocationGraph* colocation_graph,
+                    bool log_device_placement) {
+  node->set_assigned_device_name_index(assigned_device);
+
+  // Constraint the group of node to the assigned device.
+  TF_RETURN_IF_ERROR(colocation_graph->LimitToAssignedDevice(*node));
+
+  LogDeviceAssignment(node, log_device_placement);
+  return Status::OK();
 }
 
 }  // namespace
@@ -832,103 +97,32 @@ Status Placer::Run() {
     return errors::FailedPrecondition("No devices are registered");
   }
 
+  if (VLOG_IS_ON(3)) {
+    DumpGraphToFile("placer_input", *graph_, nullptr, "/tmp");
+    for (const Node* node : graph_->op_nodes()) {
+      VLOG(3) << "    " << node->name() << ": requested: '"
+              << node->requested_device() << "' assigned: '"
+              << node->assigned_device_name() << "'";
+    }
+  }
+
   ColocationGraph colocation_graph(
-      graph_, devices_,
+      graph_, devices_, default_device_,
       options_ == nullptr || options_->config.allow_soft_placement(),
-      default_device_);
-
-  TF_RETURN_IF_ERROR(colocation_graph.InitializeMembers());
+      log_device_placement_);
 
-  // 1. First add all of the nodes. Note that steps (1) and (2)
-  // requires two passes over the nodes because the graph (and hence
-  // the constraints) may not be acyclic.
-  TF_RETURN_IF_ERROR(colocation_graph.ColocateAllNodes());
+  TF_RETURN_IF_ERROR(colocation_graph.Initialize());
 
-  // 2. Enumerate the constraint edges, and use them to update the disjoint
+  // For each node, assign a device based on the constraints in the disjoint
   // node set.
-
-  // If `node` has an input edge with reference type, add an edge from the
-  // source of that edge to `node`.
-  for (const Edge* edge : graph_->edges()) {
-    if (edge->IsControlEdge()) {
-      continue;
-    }
-    Node* src = edge->src();
-    Node* dst = edge->dst();
-    DataType input_type = dst->input_type(edge->dst_input());
-    if ((input_type == DT_RESOURCE || IsRefType(input_type)) &&
-        !IsExemptFromResourceInputColocation(dst)) {
-      // Colocate `src` and `dst` to maintain the invariant that nodes connected
-      // by reference edges are colocated.
-      int src_root_id = colocation_graph.FindRoot(src->id());
-      int dst_root_id = colocation_graph.FindRoot(dst->id());
-      auto& src_root = colocation_graph.members_[src_root_id];
-      auto& dst_root = colocation_graph.members_[dst_root_id];
-      // If both the source node and this node have partially
-      // specified a device, then 'node's device should be
-      // cleared: the reference edge forces 'node' to be on the
-      // same device as the source node.
-      const auto& source_parsed_name = src_root.device_name;
-      const auto& dest_parsed_name = dst_root.device_name;
-      if (DeviceNameUtils::HasSomeDetails(source_parsed_name) &&
-          DeviceNameUtils::HasSomeDetails(dest_parsed_name)) {
-        // Ignore a specified device for 'dst' if the two names were
-        // incompatible.
-        if (!DeviceNameUtils::AreCompatibleDevNames(source_parsed_name,
-                                                    dest_parsed_name)) {
-          TF_RETURN_IF_ERROR(
-              colocation_graph.VerifyResourceAndRefInputsCanBeColocated(
-                  dst, src, source_parsed_name));
-          if (log_device_placement_) {
-            LOG(INFO) << "Ignoring device specification "
-                      << DeviceNameUtils::ParsedNameToString(dest_parsed_name)
-                      << " for node '" << dst->name()
-                      << "' because the input edge from '" << src->name()
-                      << "' is a reference connection and already has a device "
-                         "field set to "
-                      << DeviceNameUtils::ParsedNameToString(
-                             source_parsed_name);
-          }
-
-          // Make 'dst' colocated with the source
-          dst_root.device_name = source_parsed_name;
-        } else {
-          bool source_subset_of_dest = DeviceNameUtils::IsSpecification(
-              source_parsed_name, dest_parsed_name);
-          bool dest_subset_of_source = DeviceNameUtils::IsSpecification(
-              dest_parsed_name, source_parsed_name);
-
-          if (source_subset_of_dest && !dest_subset_of_source) {
-            src_root.device_name = dest_parsed_name;
-          } else {
-            dst_root.device_name = source_parsed_name;
-          }
-        }
-      }
-
-      Status status =
-          colocation_graph.ColocateNodes(*src, src_root_id, *dst, dst_root_id);
-      if (!status.ok()) {
-        return AttachDef(
-            errors::InvalidArgument("Nodes were connected by a "
-                                    "reference connection (requiring them to "
-                                    "be on the same device), but the two nodes "
-                                    "were assigned two different devices: ",
-                                    status.error_message()),
-            *dst);
-      }
-    }
-  }
-
-  // 3. For each node, assign a device based on the constraints in the
-  // disjoint node set.
   std::vector<Node*> second_pass;
   for (Node* node : graph_->op_nodes()) {
     // The graph may have come pre-populated by the framework with assigned
     // devices (e.g., for stateful placements), so the placer should not try to
     // place nodes that are already placed.
     if (node->has_assigned_device_name()) {
-      LogDeviceAssignment(node);
+      TF_RETURN_IF_ERROR(colocation_graph.LimitToAssignedDevice(*node));
+      LogDeviceAssignment(node, log_device_placement_);
       continue;
     }
 
@@ -943,7 +137,7 @@ Status Placer::Run() {
       continue;
     }
 
-    std::vector<Device*>* devices;
+    const std::vector<Device*>* devices;
     Status status = colocation_graph.GetDevicesForNode(node, &devices);
     if (!status.ok()) {
       return AttachDef(
@@ -984,13 +178,14 @@ Status Placer::Run() {
       assigned_device = graph_->InternDeviceName((*devices)[0]->name());
     }
 
-    AssignAndLog(assigned_device, node);
+    TF_RETURN_IF_ERROR(AssignAndLog(assigned_device, node, &colocation_graph,
+                                    log_device_placement_));
   }
 
-  // 4. Perform a second pass assignment for those nodes explicitly
+  // Perform a second pass assignment for those nodes explicitly
   // skipped during the first pass.
   for (Node* node : second_pass) {
-    std::vector<Device*>* devices;
+    const std::vector<Device*>* devices;
     Status status = colocation_graph.GetDevicesForNode(node, &devices);
     if (!status.ok()) {
       return AttachDef(
@@ -1023,9 +218,13 @@ Status Placer::Run() {
       assigned_device = graph_->InternDeviceName((*devices)[0]->name());
     }
 
-    AssignAndLog(assigned_device, node);
+    TF_RETURN_IF_ERROR(AssignAndLog(assigned_device, node, &colocation_graph,
+                                    log_device_placement_));
   }
 
+  if (VLOG_IS_ON(3)) {
+    DumpGraphToFile("placer_output", *graph_, nullptr, "/tmp");
+  }
   return Status::OK();
 }
 
@@ -1046,20 +245,4 @@ bool Placer::CanAssignToDevice(const string& candidate_device_name,
   return false;
 }
 
-void Placer::AssignAndLog(int assigned_device, Node* node) const {
-  node->set_assigned_device_name_index(assigned_device);
-  LogDeviceAssignment(node);
-}
-
-void Placer::LogDeviceAssignment(const Node* node) const {
-  // Log placement if log_device_placement is set.
-  if (log_device_placement_) {
-    printf("%s: (%s): %s\n", node->name().c_str(), node->type_string().c_str(),
-           node->assigned_device_name().c_str());
-    LOG(INFO) << node->name() << ": "
-              << "(" << node->type_string() << ")"
-              << node->assigned_device_name();
-  }
-}
-
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/placer.h b/tensorflow/core/common_runtime/placer.h
index e3e8f3790c5fc1d6223a9e6ba1d3aa79eca0d3e3..e6c5a89c717f21824ed150808c2f02b55a19509c 100644
--- a/tensorflow/core/common_runtime/placer.h
+++ b/tensorflow/core/common_runtime/placer.h
@@ -88,11 +88,6 @@ class Placer {
   bool CanAssignToDevice(const string& candidate_device_name,
                          const std::vector<Device*>& devices) const;
 
-  // Assigns 'node's devices to 'assigned_device', and logs the
-  // placement if the SessionOptions entry in 'options_' requests it.
-  void AssignAndLog(int assigned_device, Node* node) const;
-  void LogDeviceAssignment(const Node* node) const;
-
   Graph* const graph_;              // Not owned.
   const DeviceSet* const devices_;  // Not owned.
   const SessionOptions* options_;   // Not owned.
diff --git a/tensorflow/core/common_runtime/placer_test.cc b/tensorflow/core/common_runtime/placer_test.cc
index 04e77e55f62e1bd9345c8e9113407bbf0a375774..ece4fe058b10ca886da8e8afecb748f69f2e22c7 100644
--- a/tensorflow/core/common_runtime/placer_test.cc
+++ b/tensorflow/core/common_runtime/placer_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <memory>
 #include <string>
+#include <unordered_set>
 #include <utility>
 #include <vector>
 
@@ -24,11 +25,15 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/framework/device_attributes.pb.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_def_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/graph/graph_def_builder_util.h"
 #include "tensorflow/core/lib/core/error_codes.pb.h"
@@ -40,6 +45,16 @@ limitations under the License.
 
 namespace tensorflow {
 
+using ::tensorflow::test::function::GDef;
+using ::tensorflow::test::function::NDef;
+using FDH = ::tensorflow::FunctionDefHelper;
+
+constexpr char kCPU[] = "/device:fakecpu:0";
+constexpr char kGPU[] = "/device:fakegpu:0";
+
+constexpr char kFullCPU[] = "/job:a/replica:0/task:0/device:fakecpu:0";
+constexpr char kFullGPU[] = "/job:a/replica:0/task:0/device:fakegpu:0";
+
 namespace {
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -210,6 +225,16 @@ class PlacerTest : public ::testing::Test {
     return Status::OK();
   }
 
+  Status BuildGraph(const GraphDef& graph_def, Graph* out_graph) {
+    GraphConstructorOptions opts;
+    TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(opts, graph_def, out_graph));
+    nodes_by_name_.clear();
+    for (Node* node : out_graph->nodes()) {
+      nodes_by_name_[node->name()] = node->id();
+    }
+    return Status::OK();
+  }
+
   // Invokes the Placer on "graph". If no DeviceSet is specified, the
   // placement will use the default DeviceSet (of 10 CPU and 10 GPU devices).
   //
@@ -248,6 +273,16 @@ class PlacerTest : public ::testing::Test {
                              const DeviceType& expected_device_type);
 };
 
+// Fixture that add a parameter for allow_soft_placement.
+// Test cases that want to test behavior with and without soft placement
+// can use this fixture instead of PlacerTest.
+class SoftPlacementPlacerTest : public PlacerTest,
+                                public ::testing::WithParamInterface<bool> {};
+
+INSTANTIATE_TEST_SUITE_P(, SoftPlacementPlacerTest,
+                         ::testing::Values(false, true),
+                         ::testing::PrintToStringParamName());
+
 #define EXPECT_COLOCATED(g, name_a, name_b)                         \
   do {                                                              \
     Graph& g_ = (g);                                                \
@@ -866,7 +901,7 @@ TEST_F(PlacerTest, TestResourceHandle) {
 }
 
 TEST_F(PlacerTest, TestResourceHandlesOnDifferentDevicesFails) {
-  auto handle_test = [this](bool allow_soft_placement) {
+  auto handle_test = [this](bool allow_soft_placement, bool set_assigned) {
     Graph g(OpRegistry::Global());
     {  // Scope for temporary variables used to construct g.
       GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
@@ -878,27 +913,41 @@ TEST_F(PlacerTest, TestResourceHandlesOnDifferentDevicesFails) {
                     b.opts().WithName("two_handles_in"));
       TF_EXPECT_OK(BuildGraph(b, &g));
 
-      GetNodeByName(g, "var_cpu")
-          ->set_assigned_device_name(
-              "/job:a/replica:0/task:0/device:fakecpu:0");
-      GetNodeByName(g, "var_gpu")
-          ->set_assigned_device_name(
-              "/job:a/replica:0/task:0/device:fakegpu:0");
+      if (set_assigned) {
+        GetNodeByName(g, "var_cpu")
+            ->set_assigned_device_name(
+                "/job:a/replica:0/task:0/device:fakecpu:0");
+        GetNodeByName(g, "var_gpu")
+            ->set_assigned_device_name(
+                "/job:a/replica:0/task:0/device:fakegpu:0");
+      } else {
+        GetNodeByName(g, "var_cpu")
+            ->set_requested_device("/job:a/replica:0/task:0/device:fakecpu:0");
+        GetNodeByName(g, "var_gpu")
+            ->set_requested_device("/job:a/replica:0/task:0/device:fakegpu:0");
+      }
     }
 
     SessionOptions options;
     options.config.set_allow_soft_placement(allow_soft_placement);
     options.config.set_log_device_placement(true);
     Status s = Place(&g, &options);
-    EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
+    EXPECT_EQ(error::INVALID_ARGUMENT, s.code()) << s.ToString();
     EXPECT_TRUE(str_util::StrContains(
         s.error_message(),
-        "Could not colocate node with its resource and reference inputs"));
+        "Cannot place the graph because a reference or resource edge "
+        "connects "
+        "colocation groups with incompatible assigned devices: "
+        "/job:a/replica:0/task:0/device:fakegpu:0 vs "
+        "/job:a/replica:0/task:0/device:fakecpu:0"));
+
     return Status::OK();
   };
 
-  TF_EXPECT_OK(handle_test(false));
-  TF_EXPECT_OK(handle_test(true));
+  TF_EXPECT_OK(handle_test(false, false));
+  TF_EXPECT_OK(handle_test(false, true));
+  TF_EXPECT_OK(handle_test(true, false));
+  TF_EXPECT_OK(handle_test(true, true));
 }
 
 // Test that an assignment of an operator to the wrong device
@@ -1034,7 +1083,7 @@ TEST_F(PlacerTest, TestMultipleColocationGroups) {
   EXPECT_COLOCATED(g, "in", "foo");
 }
 
-TEST_F(PlacerTest, TestInvalidMultipleColocationGroups) {
+TEST_P(SoftPlacementPlacerTest, TestInvalidMultipleColocationGroups) {
   Graph g(OpRegistry::Global());
   {  // Scope for temporary variables used to construct g.
     GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
@@ -1051,12 +1100,24 @@ TEST_F(PlacerTest, TestInvalidMultipleColocationGroups) {
     TF_EXPECT_OK(BuildGraph(b, &g));
   }
 
-  Status s = Place(&g);
-  EXPECT_TRUE(str_util::StrContains(
-      s.error_message(),
-      "Cannot colocate nodes {{colocation_node foo}} and "
-      "{{colocation_node in}} because no device type supports both of those "
-      "nodes and the other nodes colocated with them"));
+  bool allow_soft_placement = GetParam();
+  SessionOptions options;
+  options.config.set_allow_soft_placement(allow_soft_placement);
+  options.config.set_log_device_placement(true);
+  Status s = Place(&g, &options);
+  if (allow_soft_placement) {
+    EXPECT_EQ(error::OK, s.code()) << s.ToString();
+    EXPECT_DEVICE_TYPE(g, "in", "FakeCPU");
+    EXPECT_DEVICE_TYPE(g, "colocated_1", "FakeCPU");
+    EXPECT_DEVICE_TYPE(g, "foo", "FakeGPU");
+  } else {
+    EXPECT_TRUE(str_util::StrContains(
+        s.error_message(),
+        "Cannot colocate nodes {{colocation_node foo}} and "
+        "{{colocation_node in}} because no device type supports both of those "
+        "nodes and the other nodes colocated with them"))
+        << s.ToString();
+  }
 }
 
 TEST_F(PlacerTest, TestColocationGroupWithReferenceConnections) {
@@ -1086,7 +1147,8 @@ TEST_F(PlacerTest, TestColocationGroupWithReferenceConnections) {
   EXPECT_COLOCATED(g, "var2", "assign1");
 }
 
-TEST_F(PlacerTest, TestColocationGroupWithUnsatisfiableReferenceConnections) {
+TEST_P(SoftPlacementPlacerTest,
+       TestColocationGroupWithUnsatisfiableReferenceConnections) {
   Graph g(OpRegistry::Global());
   {  // Scope for temporary variables used to construct g.
     GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
@@ -1116,12 +1178,22 @@ TEST_F(PlacerTest, TestColocationGroupWithUnsatisfiableReferenceConnections) {
     TF_EXPECT_OK(BuildGraph(b, &g));
   }
 
-  Status s = Place(&g);
-  EXPECT_TRUE(str_util::StrContains(
-      s.error_message(),
-      "Cannot colocate nodes {{colocation_node var3}} and {{colocation_node "
-      "assign3}} because no device type supports both of those nodes and the "
-      "other nodes colocated with them."));
+  bool allow_soft_placement = GetParam();
+  SessionOptions options;
+  options.config.set_allow_soft_placement(allow_soft_placement);
+  options.config.set_log_device_placement(true);
+  Status s = Place(&g, &options);
+  if (allow_soft_placement) {
+    EXPECT_EQ(error::OK, s.code()) << s.ToString();
+  } else {
+    EXPECT_EQ(error::INVALID_ARGUMENT, s.code()) << s.ToString();
+    EXPECT_TRUE(str_util::StrContains(
+        s.error_message(),
+        "Cannot colocate nodes {{colocation_node assign3}} and "
+        "{{colocation_node var2}} because no device type supports both of "
+        "those nodes and the other nodes colocated with them."))
+        << s.ToString();
+  }
 }
 
 TEST_F(PlacerTest, TestColocationAndReferenceConnections) {
@@ -1617,5 +1689,160 @@ TEST_F(PlacerTest, TestGeneratorNodeDoesntFollowNonColocatedConsumers) {
   EXPECT_DEVICE_TYPE(g, "in", "FakeGPU");
 }
 
+REGISTER_KERNEL_BUILDER(Name("_Arg").Device("FakeCPU"), DummyOp);
+REGISTER_KERNEL_BUILDER(Name("_Arg").Device("FakeGPU"), DummyOp);
+REGISTER_KERNEL_BUILDER(Name("_Retval").Device("FakeCPU"), DummyOp);
+REGISTER_KERNEL_BUILDER(Name("_Retval").Device("FakeGPU"), DummyOp);
+REGISTER_KERNEL_BUILDER(Name("Identity").Device("FakeCPU"), DummyOp);
+REGISTER_KERNEL_BUILDER(Name("Identity").Device("FakeGPU"), DummyOp);
+REGISTER_KERNEL_BUILDER(Name("Const").Device("FakeCPU"), DummyOp);
+REGISTER_KERNEL_BUILDER(Name("Const").Device("FakeGPU"), DummyOp);
+REGISTER_KERNEL_BUILDER(Name("Mul").Device("FakeCPU"), DummyOp);
+REGISTER_KERNEL_BUILDER(Name("Mul").Device("FakeGPU"), DummyOp);
+REGISTER_KERNEL_BUILDER(Name("Add").Device("FakeCPU"), DummyOp);
+REGISTER_KERNEL_BUILDER(Name("Add").Device("FakeGPU"), DummyOp);
+
+TEST_P(SoftPlacementPlacerTest,
+       RequestedDeviceOnResourceGeneratorIsTreatedAsAssigned) {
+  /*
+   *    a:RES:GPU  b:RES:CPU
+   *       |         |
+   *       |         |
+   *       v         v
+   *      id1       id2
+   *     @loc:id2
+   */
+  FunctionDef func = test::function::ResourceOutput();
+  GraphDef graph = GDef(
+      {
+          NDef("a", "_Arg", {}, {{"T", DT_RESOURCE}}, kGPU),
+          NDef("b", "_Arg", {}, {{"T", DT_RESOURCE}}, kCPU),
+          NDef("id1", "Identity", {"a"},
+               {{"T", DT_RESOURCE},
+                {"_class", gtl::ArraySlice<string>({"loc:@id2"})}}),
+          NDef("id2", "Identity", {"b"}, {{"T", DT_RESOURCE}}),
+      },
+      // FunctionLib
+      {func});
+
+  Graph g(OpRegistry::Global());
+  TF_ASSERT_OK(BuildGraph(graph, &g));
+
+  bool allow_soft_placement = GetParam();
+  SessionOptions options;
+  options.config.set_allow_soft_placement(allow_soft_placement);
+  options.config.set_log_device_placement(true);
+  Status s = Place(&g, &options);
+  if (allow_soft_placement) {
+    EXPECT_EQ(error::OK, s.code()) << s.ToString();
+    EXPECT_DEVICE_TYPE(g, "a", "FakeGPU");
+    EXPECT_DEVICE_TYPE(g, "id1", "FakeGPU");
+    EXPECT_DEVICE_TYPE(g, "b", "FakeCPU");
+    EXPECT_DEVICE_TYPE(g, "id2", "FakeCPU");
+  } else {
+    EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
+    EXPECT_TRUE(str_util::StrContains(
+        s.error_message(),
+        "Cannot colocate nodes {{colocation_node id2}} and {{colocation_node "
+        "id1}}: Cannot merge devices with incompatible types: "
+        "'/device:fakecpu:0' and '/device:fakegpu:0'"))
+        << s.ToString();
+  }
+}
+
+TEST_F(PlacerTest, RequestedDeviceCanBeOverridden) {
+  /*
+   *     a:RES      b:RES
+   *       |         |
+   *     id_a:GPU   id_b:CPU
+   *       |         |
+   *       v         v
+   *      id1       id2
+   *     @loc:id2
+   */
+  FunctionDef func = test::function::ResourceOutput();
+  GraphDef graph = GDef(
+      {
+          NDef("a", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("b", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("id_a", "Identity", {"a"}, {{"T", DT_RESOURCE}}, kGPU),
+          NDef("id_b", "Identity", {"b"}, {{"T", DT_RESOURCE}}, kCPU),
+          NDef("id1", "Identity", {"id_a"},
+               {{"T", DT_RESOURCE},
+                {"_class", gtl::ArraySlice<string>({"loc:@id2"})}}),
+          NDef("id2", "Identity", {"id_b"}, {{"T", DT_RESOURCE}}),
+      },
+      // FunctionLib
+      {func});
+
+  Graph g(OpRegistry::Global());
+  TF_ASSERT_OK(BuildGraph(graph, &g));
+  TF_ASSERT_OK(Place(&g));
+
+  // All should be colocated
+  EXPECT_COLOCATED(g, "a", "b");
+  EXPECT_COLOCATED(g, "id_a", "id_b");
+  EXPECT_COLOCATED(g, "id1", "id2");
+  EXPECT_COLOCATED(g, "a", "id_a");
+  EXPECT_COLOCATED(g, "a", "id1");
+}
+
+TEST_P(SoftPlacementPlacerTest,
+       AssignedDevicesAreNotOverriddenDueToResourcesAndColocation) {
+  /*
+   *     a:RES      b:RES
+   *       |         |
+   *     id_a:GPU   id_b:CPU
+   *       |         |
+   *       v         v
+   *      id1       id2
+   *     @loc:id2
+   */
+  FunctionDef func = test::function::ResourceOutput();
+  GraphDef graph = GDef(
+      {
+          NDef("a", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("b", "_Arg", {}, {{"T", DT_RESOURCE}}),
+          NDef("id_a", "Identity", {"a"}, {{"T", DT_RESOURCE}}),
+          NDef("id_b", "Identity", {"b"}, {{"T", DT_RESOURCE}}),
+          NDef("id1", "Identity", {"id_a"},
+               {{"T", DT_RESOURCE},
+                {"_class", gtl::ArraySlice<string>({"loc:@id2"})}}),
+          NDef("id2", "Identity", {"id_b"}, {{"T", DT_RESOURCE}}),
+      },
+      // FunctionLib
+      {func});
+
+  Graph g(OpRegistry::Global());
+  TF_ASSERT_OK(BuildGraph(graph, &g));
+  std::unordered_map<string, Node*> nodes = g.BuildNodeNameIndex();
+  GetNodeByName(g, "id_a")->set_assigned_device_name(kFullGPU);
+  GetNodeByName(g, "id_b")->set_assigned_device_name(kFullCPU);
+
+  bool allow_soft_placement = GetParam();
+
+  SessionOptions options;
+  options.config.set_allow_soft_placement(allow_soft_placement);
+  Status s = Place(&g, &options);
+  if (allow_soft_placement) {
+    EXPECT_EQ(error::OK, s.code()) << s.ToString();
+    EXPECT_DEVICE_TYPE(g, "a", "FakeGPU");
+    EXPECT_DEVICE_TYPE(g, "id_a", "FakeGPU");
+    EXPECT_DEVICE_TYPE(g, "id1", "FakeGPU");
+    EXPECT_DEVICE_TYPE(g, "b", "FakeCPU");
+    EXPECT_DEVICE_TYPE(g, "id_b", "FakeCPU");
+    EXPECT_DEVICE_TYPE(g, "id2", "FakeCPU");
+  } else {
+    EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
+    EXPECT_TRUE(str_util::StrContains(
+        s.error_message(),
+        "Cannot colocate nodes {{colocation_node id2}} and {{colocation_node "
+        "id1}}: Cannot merge devices with incompatible types: "
+        "'/job:a/replica:0/task:0/device:fakecpu:0' and "
+        "'/job:a/replica:0/task:0/device:fakegpu:0'"))
+        << s.ToString();
+  }
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/pool_allocator.h b/tensorflow/core/common_runtime/pool_allocator.h
index 8be9c7b678e2bbe7659c9e22e31cb595ce704307..603e28b39e171e2de911f88a12ace9f93c421add 100644
--- a/tensorflow/core/common_runtime/pool_allocator.h
+++ b/tensorflow/core/common_runtime/pool_allocator.h
@@ -99,8 +99,6 @@ class PoolAllocator : public Allocator {
     return pool_size_limit_;
   }
 
-  void GetStats(AllocatorStats* stats) override { stats->Clear(); }
-
  private:
   struct PtrRecord {
     void* ptr;
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.cc b/tensorflow/core/common_runtime/process_function_library_runtime.cc
index c43a9d7dc211dd82a1b5771ad22888a2ba275a48..6db1fae77ce5550fb5980246ffeea34d6020f877 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.cc
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.cc
@@ -16,11 +16,27 @@ limitations under the License.
 
 #include <utility>
 
+#include "absl/strings/str_join.h"
+#include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/optimization_registry.h"
+#include "tensorflow/core/common_runtime/partitioning_utils.h"
+#include "tensorflow/core/common_runtime/placer.h"
+#include "tensorflow/core/common_runtime/rendezvous_mgr.h"
 #include "tensorflow/core/common_runtime/rendezvous_util.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/graph_to_functiondef.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/graph/graph_partition.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/util/device_name_utils.h"
 #include "tensorflow/core/util/ptr_util.h"
+#include "tensorflow/core/util/reffed_status_callback.h"
 
 namespace tensorflow {
 
@@ -52,13 +68,13 @@ ProcessFunctionLibraryRuntime::ProcessFunctionLibraryRuntime(
       parent_(parent) {
   if (device_mgr == nullptr) {
     flr_map_[nullptr] = NewFunctionLibraryRuntime(
-        nullptr, env, nullptr, graph_def_version, lib_def, default_thread_pool,
+        nullptr, env, nullptr, graph_def_version, lib_def_, default_thread_pool,
         optimizer_options, this);
     return;
   }
   for (Device* d : device_mgr->ListDevices()) {
     flr_map_[d] = NewFunctionLibraryRuntime(
-        device_mgr, env, d, graph_def_version, lib_def, default_thread_pool,
+        device_mgr, env, d, graph_def_version, lib_def_, default_thread_pool,
         optimizer_options, this);
   }
 }
@@ -77,13 +93,13 @@ ProcessFunctionLibraryRuntime::ProcessFunctionLibraryRuntime(
       parent_(parent) {
   if (device_mgr == nullptr) {
     flr_map_[nullptr] = NewFunctionLibraryRuntime(
-        nullptr, env, nullptr, graph_def_version, lib_def, default_thread_pool,
+        nullptr, env, nullptr, graph_def_version, lib_def_, default_thread_pool,
         optimizer_options, std::move(custom_kernel_creator), this);
     return;
   }
   for (Device* d : device_mgr->ListDevices()) {
     flr_map_[d] = NewFunctionLibraryRuntime(
-        device_mgr, env, d, graph_def_version, lib_def, default_thread_pool,
+        device_mgr, env, d, graph_def_version, lib_def_, default_thread_pool,
         optimizer_options, custom_kernel_creator, this);
   }
 }
@@ -126,7 +142,7 @@ void ProcessFunctionLibraryRuntime::ReceiveTensorsAsync(
 }
 
 Status ProcessFunctionLibraryRuntime::GetDeviceIncarnation(
-    const string& device_name, int64* incarnation) {
+    const string& device_name, int64* incarnation) const {
   FunctionLibraryRuntime* flr = GetFLR(device_name);
   if (flr == nullptr) {
     return errors::InvalidArgument("Device name: ", device_name, " not found");
@@ -136,7 +152,7 @@ Status ProcessFunctionLibraryRuntime::GetDeviceIncarnation(
 }
 
 Status ProcessFunctionLibraryRuntime::GetDeviceContext(
-    const string& device_name, DeviceContext** device_context) {
+    const string& device_name, DeviceContext** device_context) const {
   *device_context = nullptr;
   FunctionLibraryRuntime* flr = GetFLR(device_name);
   if (flr == nullptr) {
@@ -181,9 +197,26 @@ FunctionLibraryRuntime::Handle ProcessFunctionLibraryRuntime::AddHandle(
     const string& function_key, const string& device_name,
     FunctionLibraryRuntime::LocalHandle local_handle) {
   mutex_lock l(mu_);
+  return AddHandleLocked(function_key, device_name, local_handle);
+}
+
+FunctionLibraryRuntime::Handle ProcessFunctionLibraryRuntime::AddHandleLocked(
+    const string& function_key, const string& device_name,
+    FunctionLibraryRuntime::LocalHandle local_handle) {
+  auto h = next_handle_;
+  function_data_[h] =
+      MakeUnique<FunctionData>(device_name, local_handle, function_key);
+  table_[function_key] = h;
+  next_handle_++;
+  return h;
+}
+
+FunctionLibraryRuntime::Handle
+ProcessFunctionLibraryRuntime::AddMultiDeviceHandle(
+    std::unique_ptr<MultiDeviceFunctionData> data, const string& function_key) {
+  mutex_lock l(mu_);
   auto h = next_handle_;
-  function_data_[h] = MakeUnique<FunctionData>(
-      device_name, local_handle, function_key);
+  mdevice_data_[h] = std::move(data);
   table_[function_key] = h;
   next_handle_++;
   return h;
@@ -196,14 +229,20 @@ FunctionLibraryRuntime::Handle ProcessFunctionLibraryRuntime::GetHandle(
 }
 
 bool ProcessFunctionLibraryRuntime::IsInstantiatedOnDevice(
-    const string& device_name, FunctionLibraryRuntime::Handle handle) {
+    const string& device_name, FunctionLibraryRuntime::Handle handle) const {
   return GetHandleOnDevice(device_name, handle) != kInvalidHandle;
 }
 
 FunctionLibraryRuntime::LocalHandle
 ProcessFunctionLibraryRuntime::GetHandleOnDevice(
-    const string& device_name, FunctionLibraryRuntime::Handle handle) {
+    const string& device_name, FunctionLibraryRuntime::Handle handle) const {
   tf_shared_lock l(mu_);
+
+  auto miter = mdevice_data_.find(handle);
+  if (miter != mdevice_data_.end()) {
+    return kInvalidLocalHandle;
+  }
+
   auto iter = function_data_.find(handle);
   if (iter == function_data_.end()) {
     return kInvalidLocalHandle;
@@ -216,7 +255,7 @@ ProcessFunctionLibraryRuntime::GetHandleOnDevice(
 }
 
 string ProcessFunctionLibraryRuntime::GetDeviceName(
-    FunctionLibraryRuntime::Handle handle) {
+    FunctionLibraryRuntime::Handle handle) const {
   tf_shared_lock l(mu_);
   auto iter = function_data_.find(handle);
   CHECK(iter != function_data_.end());
@@ -224,10 +263,520 @@ string ProcessFunctionLibraryRuntime::GetDeviceName(
   return function_data->target_device();
 }
 
+ProcessFunctionLibraryRuntime::MultiDeviceFunctionData*
+ProcessFunctionLibraryRuntime::IsMultiDevice(
+    FunctionLibraryRuntime::Handle handle) const {
+  tf_shared_lock l(mu_);
+  const auto& it = mdevice_data_.find(handle);
+  if (it != mdevice_data_.end()) {
+    return it->second.get();
+  }
+  return nullptr;
+}
+
+namespace {
+// Sets `group` to the first colocation group specified in `node`. If no
+// group is specified, does not touch `group`.
+void GetColocationGroup(const Node* node, string* group) {
+  // We hoist the conversion from C-style string literal to string here,
+  // so that we can avoid the many repeated calls to strlen().
+  static const StringPiece kColocationAttrNameStringPiece(kColocationAttrName);
+  const AttrValue* attr_value =
+      node->attrs().Find(kColocationAttrNameStringPiece);
+  if (attr_value != nullptr && attr_value->has_list() &&
+      attr_value->list().s_size() > 0) {
+    *group = attr_value->list().s(0);
+  }
+}
+
+}  // anonymous namespace
+
+Status ProcessFunctionLibraryRuntime::PinArgsAndRets(
+    const std::vector<string>& input_devices,
+    const std::vector<string>& output_devices, const DeviceSet& device_set,
+    Graph* graph) const {
+  // If output_devices are not specified, we want to set the output device
+  // based on the device of the output producing node. The output producing
+  // node can be an arg node because functions can simply return their
+  // arguments. To make sure that the output producing nodes have assigned
+  // devices, we assign them to arguments first.
+  for (Node* node : graph->op_nodes()) {
+    if (node->type_string() == FunctionLibraryDefinition::kArgOp) {
+      const AttrValue* attr_value;
+      TF_RETURN_IF_ERROR(node->attrs().Find("index", &attr_value));
+      int64 index = attr_value->i();
+      node->set_assigned_device_name(input_devices[index]);
+    }
+  }
+
+  for (Node* node : graph->op_nodes()) {
+    if (node->type_string() == FunctionLibraryDefinition::kRetOp) {
+      if (output_devices.empty()) {
+        // If output_devices are empty, the node producing retval
+        // must have explicitly assigned device or a colocation constraint
+        // to a node with explicitly assigned device.
+        for (const auto& it : node->in_edges()) {
+          if (!it->IsControlEdge()) {
+            Node* src_node = it->src();
+            const string* src_device = &src_node->requested_device();
+            string colocation_group = "";
+            GetColocationGroup(src_node, &colocation_group);
+            while (src_device->empty() && colocation_group.empty() &&
+                   src_node->IsIdentity()) {
+              src_node = *src_node->in_nodes().begin();
+              src_device = &src_node->requested_device();
+              if (src_device->empty()) {
+                // Some node (e.g. _Args) can have no requested_device,
+                // but have assigned_device.
+                src_device = &src_node->assigned_device_name();
+              }
+
+              GetColocationGroup(src_node, &colocation_group);
+            }
+
+            if (!colocation_group.empty()) {
+              AttrValue::ListValue colo_attr;
+              colo_attr.add_s(colocation_group);
+              std::vector<string> colo_slice = {colocation_group};
+              node->AddAttr(kColocationAttrName, colo_slice);
+            } else if (!src_device->empty()) {
+              // src_device can be a partially specified device. Find the
+              // matching device in the device_set.
+              DeviceNameUtils::ParsedName parsed;
+              if (!DeviceNameUtils::ParseFullName(*src_device, &parsed)) {
+                return errors::InvalidArgument(
+                    "Failed to parse explicit device specification ",
+                    *src_device);
+              }
+              std::vector<Device*> matching_devices;
+              device_set.FindMatchingDevices(parsed, &matching_devices);
+              if (matching_devices.empty()) {
+                return errors::InvalidArgument(
+                    "Unable to find any devices for spec ", *src_device);
+              } else if (matching_devices.size() != 1) {
+                // Convert a vector of devices to a string.
+                // Using absl::StrJoin did not work in Android builds.
+                string devices = "[";
+                for (Device* device : matching_devices) {
+                  devices.append(device->name());
+                  devices.append(", ");
+                }
+                if (devices.size() > 2) {
+                  devices.resize(devices.size() - 2);
+                }
+                devices.append("]");
+
+                return errors::InvalidArgument(
+                    "When FunctionLibraryRuntime::Options.output_devices are "
+                    "not specified for a multi-device function, the device "
+                    "specification on the output node must match exactly one "
+                    "device. Matched devices are ",
+                    devices);
+              }
+              node->set_assigned_device_name(matching_devices[0]->name());
+            }
+          }
+        }
+      } else {
+        const AttrValue* attr_value;
+        TF_RETURN_IF_ERROR(node->attrs().Find("index", &attr_value));
+        int64 index = attr_value->i();
+        // output_devices size is checked in InstantiateMultiDevice
+        DCHECK_GT(output_devices.size(), index);
+        node->set_assigned_device_name(output_devices[index]);
+      }
+    }
+  }
+  return Status::OK();
+}
+
+namespace {
+
+Status ValidateNoListArguments(
+    const protobuf::RepeatedPtrField<OpDef::ArgDef>& args, const char* arg_type,
+    const string& function_name) {
+  for (const OpDef::ArgDef& arg : args) {
+    if (!arg.number_attr().empty() || !arg.type_list_attr().empty()) {
+      return errors::InvalidArgument(
+          "Function ", function_name, " has an ", arg_type, " named \"",
+          arg.name(),
+          "\" that is a list of tensors."
+          " Multi-device functions support only single-tensor inputs "
+          " and outputs");
+    }
+  }
+  return Status::OK();
+}
+
+Status ValidateMultiDeviceOptions(
+    const FunctionDef& fdef,
+    const FunctionLibraryRuntime::InstantiateOptions& options) {
+  const OpDef& signature = fdef.signature();
+  // Multi-device functions don't currently support list inputs or outputs
+  TF_RETURN_IF_ERROR(ValidateNoListArguments(signature.input_arg(), "input",
+                                             signature.name()));
+  TF_RETURN_IF_ERROR(ValidateNoListArguments(signature.output_arg(), "output",
+                                             signature.name()));
+
+  if (fdef.attr().count(FunctionLibraryDefinition::kIntsOnDeviceAttr) != 0 &&
+      fdef.attr().at(FunctionLibraryDefinition::kIntsOnDeviceAttr).b()) {
+    return errors::Unimplemented(
+        "Function '", signature.name(), "' has `",
+        FunctionLibraryDefinition::kIntsOnDeviceAttr,
+        "` attribute set. This attribute is not currently supported by "
+        "multi-device functions.");
+  }
+
+  if (options.input_devices.size() != signature.input_arg_size()) {
+    return errors::InvalidArgument(
+        "InstantiateOptions.input_devices must have the same length "
+        "as the number of arguments: input_devices length = ",
+        options.input_devices.size(),
+        " number of arguments = ", signature.input_arg_size());
+  }
+  if (!options.output_devices.empty() &&
+      options.output_devices.size() != signature.output_arg_size()) {
+    return errors::InvalidArgument(
+        "InstantiateOptions.output_devices must either be empty or have "
+        "the same length as the number of arguments: output_devices length "
+        "= ",
+        options.output_devices.size(),
+        " number of arguments = ", signature.output_arg_size());
+  }
+
+  if (!options.state_handle.empty()) {
+    return errors::Unimplemented(
+        "InstantiateOptions.state_handle is not supported for multi-device "
+        "functions. Function: ",
+        signature.name());
+  }
+  if (options.create_kernels_eagerly) {
+    return errors::Unimplemented(
+        "InstantiateOptions.create_kernels_eagerly is not supported for "
+        "multi-device functions. Function: ",
+        signature.name());
+  }
+
+  return Status::OK();
+}
+
+Status GetGraphAndRets(const string& function_name, AttrSlice attrs,
+                       const FunctionDef* fdef,
+                       const FunctionLibraryDefinition* lib_def,
+                       std::unique_ptr<Graph>* graph,
+                       std::vector<string>* ret_node_names,
+                       std::vector<string>* control_ret_node_names) {
+  auto get_func_sig = [lib_def](const string& op, const OpDef** sig) {
+    return lib_def->LookUpOpDef(op, sig);
+  };
+  FunctionBody* tmp_fbody;
+  // TODO(iga): FunctionDefToBodyHelper copies fdef. Avoid this copy.
+  TF_RETURN_IF_ERROR(
+      FunctionDefToBodyHelper(*fdef, attrs, lib_def, get_func_sig, &tmp_fbody));
+  if (tmp_fbody == nullptr) {
+    LOG(ERROR) << "Failed to get FunctionBody for \"" << function_name << "\"";
+    return errors::Internal("Failed to construct FunctionBody for ",
+                            function_name);
+  }
+  std::unique_ptr<FunctionBody> fbody(tmp_fbody);
+  *graph = std::unique_ptr<Graph>(fbody->graph);
+  fbody->graph = nullptr;
+  ret_node_names->reserve(fbody->ret_nodes.size());
+  for (const Node* node : fbody->ret_nodes) {
+    ret_node_names->push_back(node->name());
+  }
+  control_ret_node_names->reserve(fbody->control_ret_nodes.size());
+  for (const Node* node : fbody->control_ret_nodes) {
+    control_ret_node_names->push_back(node->name());
+  }
+  return Status::OK();
+}
+
+}  // anonymous namespace
+
+Status ProcessFunctionLibraryRuntime::InstantiateMultiDevice(
+    const string& function_name, AttrSlice attrs,
+    const FunctionLibraryRuntime::InstantiateOptions& options,
+    FunctionLibraryRuntime::Handle* handle) {
+  // Check if this function has already been instantiated.
+  const string& function_key = Canonicalize(function_name, attrs, options);
+
+  {
+    mutex_lock l(mu_);
+    const auto& it = table_.find(function_key);
+    if (it != table_.end()) {
+      *handle = it->second;
+      ++mdevice_data_[*handle]->instantiation_counter_;
+      return Status::OK();
+    }
+  }
+
+  VLOG(1) << "Instantiating MultiDevice function \"" << function_name
+          << "\" on default device " << options.target;
+
+  const FunctionLibraryDefinition* lib_def =
+      options.overlay_lib == nullptr ? lib_def_ : options.overlay_lib;
+
+  const FunctionDef* fdef = lib_def->Find(function_name);
+  if (fdef == nullptr) {
+    return errors::InvalidArgument("Failed to find function \"", function_name,
+                                   "\" in function library: ", lib_def);
+  }
+
+  TF_RETURN_IF_ERROR(ValidateMultiDeviceOptions(*fdef, options));
+
+  std::unique_ptr<Graph> graph;
+  std::vector<string> ret_node_names;
+  std::vector<string> control_ret_node_names;
+
+  TF_RETURN_IF_ERROR(GetGraphAndRets(function_name, attrs, fdef, lib_def,
+                                     &graph, &ret_node_names,
+                                     &control_ret_node_names));
+
+  if (options.graph_collector != nullptr) {
+    GraphDef def;
+    graph->ToGraphDef(&def);
+    *def.mutable_library() = lib_def->ReachableDefinitions(def).ToProto();
+    options.graph_collector->CollectRawGraph(def);
+  }
+
+  DeviceSet device_set;
+  for (auto d : device_mgr_->ListDevices()) {
+    device_set.AddDevice(d);
+  }
+
+  TF_RETURN_IF_ERROR(PinArgsAndRets(
+      options.input_devices, options.output_devices, device_set, graph.get()));
+
+  // Make the FunctionLibraryRuntime's device the default device if
+  // nothing else is hard coded. This allows the same function definition
+  // to be specialized to different devices depending on the
+  // PartitionedCallOp's device.
+  FunctionLibraryRuntime* flr = GetFLR(options.target);
+  if (flr == nullptr) {
+    return errors::InvalidArgument(
+        "Cannot instantiate multi-device function with target device ",
+        options.target);
+  }
+
+  std::unique_ptr<MultiDeviceFunctionData> data =
+      MakeUnique<MultiDeviceFunctionData>(function_name, function_key,
+                                          ret_node_names.size(),
+                                          lib_def->ReachableDefinitions(*fdef));
+
+  GraphOptimizationPassOptions optimization_options;
+  // TODO(iga): Thread other relevant options from SessionOptions.
+  SessionOptions session_options;
+  session_options.env = flr->env();
+  optimization_options.session_options = &session_options;
+  optimization_options.graph = &graph;
+  optimization_options.flib_def = &data->overlay_lib_;
+  optimization_options.device_set = &device_set;
+
+  DumpGraph("Before running PRE_PLACEMENT passes", graph.get());
+  TF_RETURN_IF_ERROR(OptimizationPassRegistry::Global()->RunGrouping(
+      OptimizationPassRegistry::PRE_PLACEMENT, optimization_options));
+
+  DumpGraph("Before calling Placer", graph.get());
+  Placer placer(graph.get(), &device_set, nullptr, /* No session options */
+                flr->device() /* Default device */);
+  TF_RETURN_IF_ERROR(placer.Run());
+
+  DumpGraph("Before running POST_PLACEMENT passes", graph.get());
+  TF_RETURN_IF_ERROR(OptimizationPassRegistry::Global()->RunGrouping(
+      OptimizationPassRegistry::POST_PLACEMENT, optimization_options));
+
+  Device* cpu_device;
+  TF_RETURN_IF_ERROR(device_mgr_->LookupDevice("CPU:0", &cpu_device));
+
+  if (options.optimize_graph_fn) {
+    DumpGraph("Before running graph optimization fn", graph.get());
+    Status status = options.optimize_graph_fn(
+        std::move(ret_node_names), std::move(control_ret_node_names),
+        &data->overlay_lib_, device_set, cpu_device, &graph);
+    if (!status.ok()) {
+      LOG(WARNING) << "Ignoring multi-device function optimization failure: "
+                   << status.ToString();
+    }
+    DumpGraph("After optimization", graph.get());
+  }
+
+  DumpGraph("Before running POST_REWRITE_FOR_EXEC passes", graph.get());
+  TF_RETURN_IF_ERROR(OptimizationPassRegistry::Global()->RunGrouping(
+      OptimizationPassRegistry::POST_REWRITE_FOR_EXEC, optimization_options));
+  DumpGraph("After all optimization passes", graph.get());
+
+  if (options.graph_collector != nullptr) {
+    GraphDef def;
+    graph->ToGraphDef(&def);
+    *def.mutable_library() = lib_def->ReachableDefinitions(def).ToProto();
+    options.graph_collector->CollectOptimizedGraph(def);
+  }
+
+  std::unordered_map<string, std::unique_ptr<Graph>> subgraphs;
+  TF_RETURN_IF_ERROR(
+      PartitionFunctionGraph(device_set, std::move(graph), &subgraphs));
+
+  if (options.graph_collector != nullptr) {
+    for (const auto& pair : subgraphs) {
+      GraphDef def;
+      pair.second->ToGraphDef(&def);
+      *def.mutable_library() = lib_def->ReachableDefinitions(def).ToProto();
+      options.graph_collector->CollectPartitionedGraph(def);
+    }
+  }
+
+  int i = 0;
+  FunctionNameGenerator name_generator(&data->overlay_lib_, function_name);
+  for (const auto& pair : subgraphs) {
+    i += 1;
+    // TODO(iga): Fail gracefully if the set of devices corresponds
+    // to more than one address space.
+    const string& target = pair.first;
+    Graph* subgraph = pair.second.get();
+
+    ComponentFunctionData* comp_data = &data->glue_[target];
+    TF_RETURN_IF_ERROR(UpdateArgAndRetvalMetadata(
+        subgraph, &comp_data->arg_indices_, &comp_data->ret_indices_,
+        &comp_data->arg_alloc_attrs_, &comp_data->ret_alloc_attrs_));
+    FunctionDef shard;
+    string unique_name = name_generator.GetName();
+    TF_RETURN_IF_ERROR(GraphToFunctionDef(*subgraph, unique_name, &shard));
+    FunctionLibraryRuntime* target_flr = GetFLR(target);
+    TF_RETURN_IF_ERROR(data->overlay_lib_.AddFunctionDef(shard));
+    FunctionLibraryRuntime::InstantiateOptions opts;
+    opts.executor_type = options.executor_type;
+    opts.target = target;
+    opts.overlay_lib = &data->overlay_lib_;
+    FunctionLibraryRuntime::Handle component_handle;
+
+    TF_RETURN_IF_ERROR(target_flr->Instantiate(
+        unique_name, AttrSlice(&shard.attr()), opts, &component_handle));
+    VLOG(1) << "Instantiated component function " << unique_name
+            << " on device " << target << " with component handle "
+            << component_handle;
+    VLOG(2) << DebugString(shard);
+    comp_data->handle_ = component_handle;
+  }
+
+  *handle = AddMultiDeviceHandle(std::move(data), function_key);
+  VLOG(2) << "Instantiated MultiDevice function \"" << function_name
+          << "\" with handle " << *handle;
+  return Status::OK();
+}
+
+Status ProcessFunctionLibraryRuntime::GetOutputDevices(
+    FunctionLibraryRuntime::Handle handle,
+    std::vector<Device*>* output_devices) const {
+  const MultiDeviceFunctionData* data = IsMultiDevice(handle);
+  if (data == nullptr) {
+    return errors::InvalidArgument(
+        "Failed for find multi-device function handle ", handle);
+  }
+
+  for (const auto& pair : data->glue_) {
+    const ComponentFunctionData& comp_data = pair.second;
+    DCHECK(comp_data.ret_alloc_attrs_.size() == comp_data.ret_indices_.size());
+
+    const string& target = pair.first;
+    FunctionLibraryRuntime* target_flr = GetFLR(target);
+    Device* target_device = target_flr->device();
+    const FunctionBody* fbody = target_flr->GetFunctionBody(comp_data.handle_);
+    DCHECK(fbody != nullptr);
+
+    output_devices->resize(data->num_outputs_);
+    for (int j = 0; j < comp_data.ret_indices_.size(); ++j) {
+      int ret_index = comp_data.ret_indices_[j];
+      if (fbody->ret_types[j] == DT_RESOURCE) {
+        (*output_devices)[ret_index] = target_device;
+      } else {
+        (*output_devices)[ret_index] =
+            comp_data.ret_alloc_attrs_[j].on_host() ? nullptr : target_device;
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
+void ProcessFunctionLibraryRuntime::RunMultiDevice(
+    const FunctionLibraryRuntime::Options& opts,
+    FunctionLibraryRuntime::Handle handle, gtl::ArraySlice<Tensor> args,
+    std::vector<Tensor>* rets,
+    FunctionLibraryRuntime::DoneCallback done) const {
+  if (opts.create_rendezvous) {
+    // FLR->Run() is the default entry point. It checks for cancellation,
+    // creates rendezvous, etc.
+    // Letting create_rendezvous through will do the wrong thing - each
+    // component function will get a separate rendezvous created by its FLR.
+    done(
+        errors::Internal("Cannot call ProcessFunctionLibraryRuntime::Run with "
+                         "create_rendezvous=true. Please run the function "
+                         "using FunctionLibraryRuntime::Run"));
+    return;
+  }
+
+  const MultiDeviceFunctionData* data = IsMultiDevice(handle);
+  if (data == nullptr) {
+    done(
+        errors::InvalidArgument("Failed for find multi-device function handle ",
+                                handle, ". Was the function instantiated?"));
+    return;
+  }
+
+  if (data->glue_.empty()) {
+    // Trivial case where the function body is empty.
+    done(Status::OK());
+    return;
+  }
+
+  auto* refcounted_done = new ReffedStatusCallback(std::move(done));
+  for (int i = 0; i < data->glue_.size(); ++i) {
+    refcounted_done->Ref();
+  }
+
+  FunctionLibraryRuntime::Options opts_copy = opts;
+  for (const auto& pair : data->glue_) {
+    const string& target = pair.first;
+    const ComponentFunctionData& comp_data = pair.second;
+    FunctionLibraryRuntime::Handle handle = pair.second.handle_;
+    VLOG(1) << "Running function shard on device " << target << " with handle "
+            << handle;
+
+    opts_copy.args_alloc_attrs = comp_data.arg_alloc_attrs_;
+    opts_copy.rets_alloc_attrs = comp_data.ret_alloc_attrs_;
+    opts_copy.remote_execution = false;
+    std::vector<Tensor> comp_args =
+        GetArgsForIndices(comp_data.arg_indices_, args);
+    std::vector<Tensor>* comp_rets = new std::vector<Tensor>;
+    rets->resize(data->num_outputs_);
+    GetFLR(target)->Run(
+        opts_copy, handle, comp_args, comp_rets,
+        [comp_rets, rets, comp_data, refcounted_done](const Status& status) {
+          if (!status.ok()) {
+            LOG(ERROR) << "Component function execution failed: " << status;
+            refcounted_done->UpdateStatus(status);
+          } else {
+            for (int i = 0; i < comp_rets->size(); ++i) {
+              (*rets)[comp_data.ret_indices_[i]] = (*comp_rets)[i];
+            }
+          }
+          delete comp_rets;
+          // refcounted_done is thread-safe
+          refcounted_done->Unref();
+        });
+  }
+  refcounted_done->Unref();
+}
+
 Status ProcessFunctionLibraryRuntime::Instantiate(
     const string& function_name, AttrSlice attrs,
     const FunctionLibraryRuntime::InstantiateOptions& options,
     FunctionLibraryRuntime::Handle* handle) {
+  if (options.is_multi_device_function) {
+    return InstantiateMultiDevice(function_name, attrs, options, handle);
+  }
+
   *handle = kInvalidHandle;
   FunctionLibraryRuntime* flr = GetFLR(options.target);
   if (flr != nullptr) {
@@ -247,11 +796,7 @@ Status ProcessFunctionLibraryRuntime::Instantiate(
     FunctionLibraryRuntime::Handle h =
         gtl::FindWithDefault(table_, function_key, kInvalidHandle);
     if (h == kInvalidHandle || function_data_.count(h) == 0) {
-      h = next_handle_;
-      function_data_[h] = MakeUnique<FunctionData>(
-          options.target, kInvalidHandle, function_key);
-      table_[function_key] = h;
-      next_handle_++;
+      h = AddHandleLocked(function_key, options.target, kInvalidHandle);
     }
     f = function_data_[h].get();
     *handle = h;
@@ -272,8 +817,48 @@ Status ProcessFunctionLibraryRuntime::RemoveHandle(
   return Status::OK();
 }
 
+Status ProcessFunctionLibraryRuntime::ReleaseMultiDeviceHandle(
+    FunctionLibraryRuntime::Handle handle) {
+  std::unique_ptr<MultiDeviceFunctionData> mdata;
+  {
+    mutex_lock l(mu_);
+    auto it = mdevice_data_.find(handle);
+    --it->second->instantiation_counter_;
+    if (it->second->instantiation_counter_ != 0) {
+      return Status::OK();
+    }
+    mdata = std::move(it->second);
+    table_.erase(mdata->function_key_);
+    mdevice_data_.erase(it);
+  }
+
+  // If we are here we are releasing the last instantiation of `handle`.
+  // Release all component function handles.
+  Status overall_status;
+  for (const auto& it : mdata->glue_) {
+    const string& device = it.first;
+    FunctionLibraryRuntime::Handle flr_handle = it.second.handle_;
+    FunctionLibraryRuntime* flr = GetFLR(device);
+    if (flr == nullptr) {
+      return errors::InvalidArgument(
+          "Failed to find FunctionLibraryRuntime for device ", device,
+          " when releasing multi-device function handle ", handle);
+    }
+    Status status = flr->ReleaseHandle(flr_handle);
+    if (!status.ok()) {
+      overall_status = status;
+    }
+  }
+
+  return overall_status;
+}
+
 Status ProcessFunctionLibraryRuntime::ReleaseHandle(
     FunctionLibraryRuntime::Handle handle) {
+  if (IsMultiDevice(handle)) {
+    return ReleaseMultiDeviceHandle(handle);
+  }
+
   FunctionLibraryRuntime* flr = nullptr;
   string target_device;
   {
@@ -291,12 +876,15 @@ Status ProcessFunctionLibraryRuntime::ReleaseHandle(
 void ProcessFunctionLibraryRuntime::Run(
     const FunctionLibraryRuntime::Options& opts,
     FunctionLibraryRuntime::Handle handle, gtl::ArraySlice<Tensor> args,
-    std::vector<Tensor>* rets, FunctionLibraryRuntime::DoneCallback done) {
-  if (!opts.remote_execution) {
-    done(errors::InvalidArgument(
-        "ProcessFunctionLibraryRuntime::Run should only be called when there ",
-        "is a remote execution."));
-    return;
+    std::vector<Tensor>* rets,
+    FunctionLibraryRuntime::DoneCallback done) const {
+  bool multi_device;
+  {
+    tf_shared_lock l(mu_);
+    multi_device = mdevice_data_.find(handle) != mdevice_data_.end();
+  }
+  if (multi_device) {
+    return RunMultiDevice(opts, handle, args, rets, done);
   }
 
   FunctionLibraryRuntime* flr = nullptr;
@@ -313,6 +901,15 @@ void ProcessFunctionLibraryRuntime::Run(
     target_device = function_data->target_device();
     local_handle = function_data->local_handle();
   }
+
+  if (!opts.remote_execution) {
+    done(
+        errors::InvalidArgument("ProcessFunctionLibraryRuntime::Run should "
+                                "only be called for multi-device functions or "
+                                "for remote execution."));
+    return;
+  }
+
   flr = GetFLR(target_device);
   if (flr != nullptr) {
     auto rendezvous = opts.rendezvous;
@@ -374,7 +971,7 @@ Status ProcessFunctionLibraryRuntime::Clone(
     Env* env, int graph_def_version, const OptimizerOptions& optimizer_options,
     CustomKernelCreator custom_kernel_creator,
     std::unique_ptr<FunctionLibraryDefinition>* out_lib_def,
-    std::unique_ptr<ProcessFunctionLibraryRuntime>* out_pflr) {
+    std::unique_ptr<ProcessFunctionLibraryRuntime>* out_pflr) const {
   out_lib_def->reset(new FunctionLibraryDefinition(*lib_def_));
   out_pflr->reset(new ProcessFunctionLibraryRuntime(
       device_mgr_, env, graph_def_version, out_lib_def->get(),
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.h b/tensorflow/core/common_runtime/process_function_library_runtime.h
index 53815715d8b9d033f5600320108cb443c36b3e93..a08e84510737190c628775f6a8002a1190056207 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.h
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.h
@@ -18,6 +18,7 @@ limitations under the License.
 #include <unordered_map>
 
 #include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/protobuf/config.pb.h"
@@ -79,7 +80,8 @@ class ProcessFunctionLibraryRuntime {
   FunctionLibraryRuntime* GetFLR(const string& device_name) const;
 
   // Returns the device incarnation for the given device_name.
-  Status GetDeviceIncarnation(const string& device_name, int64* incarnation);
+  Status GetDeviceIncarnation(const string& device_name,
+                              int64* incarnation) const;
 
   // For a given canonicalized key signature of the function instantiated
   // on device `device_name` and a `local_handle`, creates a handle and returns
@@ -94,14 +96,23 @@ class ProcessFunctionLibraryRuntime {
 
   // For the given handle instantiated on device `device_name` returns the local
   // index of instantiation of that function. If the function was not
-  // instantiated on `device_name` returns kInvalidLocalHandle.
+  // instantiated on `device_name` or the function is multi-device,
+  // returns kInvalidLocalHandle.
   FunctionLibraryRuntime::LocalHandle GetHandleOnDevice(
-      const string& device_name, FunctionLibraryRuntime::Handle handle);
+      const string& device_name, FunctionLibraryRuntime::Handle handle) const;
+
+  // Fills `output_devices` with the devices on which the results will
+  // be produced. If some output is produced on CPU, the corresponding Device*
+  // is set to nullptr. If some output is DT_RESOURCE, the corresponding Device*
+  // is set to the device backing the resource.
+  // REQUIRES: `handle` identifies a multi-device function.
+  Status GetOutputDevices(FunctionLibraryRuntime::Handle handle,
+                          std::vector<Device*>* output_devices) const;
 
   // Returns true if function with handle `handle` was instantiated on device
-  // `device_name`.
+  // `device_name`. Returns false for multi-device functions.
   bool IsInstantiatedOnDevice(const string& device_name,
-                              FunctionLibraryRuntime::Handle handle);
+                              FunctionLibraryRuntime::Handle handle) const;
 
   // Instantiates the function. See framework/function.h for more details.
   // Allows for function_name to be instantiated on different devices
@@ -114,6 +125,9 @@ class ProcessFunctionLibraryRuntime {
   // tells it to release it. If the `handle` isnt' needed at all, the local FLR
   // might call RemoveHandle on this to get rid of the state owned by the Proc
   // FLR.
+  // For multi-device functions, calls ReleaseHandle on local FLRs for each
+  // component function that is part of this multi-device function.
+  // Each local FLR might call RemoveHandle on this.
   Status ReleaseHandle(FunctionLibraryRuntime::Handle handle);
 
   // Runs the function with given `handle`. Function could have been
@@ -121,17 +135,78 @@ class ProcessFunctionLibraryRuntime {
   void Run(const FunctionLibraryRuntime::Options& opts,
            FunctionLibraryRuntime::Handle handle, gtl::ArraySlice<Tensor> args,
            std::vector<Tensor>* rets,
-           FunctionLibraryRuntime::DoneCallback done);
+           FunctionLibraryRuntime::DoneCallback done) const;
 
  private:
+  friend class FunctionLibraryRuntimeImpl;
+
+  using DeviceAndFHandle = std::pair<string, FunctionLibraryRuntime::Handle>;
+  using ArgAndRetIndices = std::pair<std::vector<int>, std::vector<int>>;
+  using ArgAndRetAllocAttrs = std::pair<std::vector<AllocatorAttributes>,
+                                        std::vector<AllocatorAttributes>>;
+
+  FunctionLibraryRuntime::Handle AddHandleLocked(
+      const string& function_key, const string& device_name,
+      FunctionLibraryRuntime::LocalHandle local_handle)
+      EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+  // Structure to keep track of how a component function (a single-device
+  // piece of a multi-device function) fits into the multi-device function.
+  struct ComponentFunctionData {
+    // The handle for the instantiated component function.
+    FunctionLibraryRuntime::Handle handle_;
+    // arg_indices_.size() is the number of arguments to the component function.
+    // The i'th argument of the component function comes from the
+    // `arg_indices_[i]`th argument of the multi-device function.
+    std::vector<int> arg_indices_;
+    // ret_indices_.size() is the number of return value of the component
+    // function.  The i'th return value of the component function goes to the
+    // `ret_indices_[i]`th return value of the multi-device function.
+    std::vector<int> ret_indices_;
+    // arg_alloc_attrs_[i] are the allocator attributes of the i'th argument to
+    // the component function.
+    std::vector<AllocatorAttributes> arg_alloc_attrs_;
+    // ret_alloc_attrs_[i] are the allocator attributes of the i'th return value
+    // of the component function.
+    std::vector<AllocatorAttributes> ret_alloc_attrs_;
+  };
+
+  // Data structure holding information for a single instantiated multi-device
+  // function.
+  // The fields are filled in during instantiation. Once the object is
+  // added to mdevice_data_, all fields are constant.
+  struct MultiDeviceFunctionData {
+    MultiDeviceFunctionData(const string& function_name,
+                            const string& function_key, int num_outputs,
+                            const FunctionLibraryDefinition& overlay_lib)
+        : num_outputs_(num_outputs),
+          instantiation_counter_(1),
+          function_name_(function_name),
+          function_key_(function_key),
+          overlay_lib_(overlay_lib) {}
+
+    // Stored here to resize the output tensor vector when function is run.
+    const int num_outputs_;
+    uint64 instantiation_counter_;
+    const string function_name_;
+    const string function_key_;
+    // The overlay library holding component function definitions as well as
+    // the definitions of functions they call.
+    FunctionLibraryDefinition overlay_lib_;
+
+    // Maps the device name to the information about the component function
+    // be run on this device.
+    std::unordered_map<string, ComponentFunctionData> glue_;
+  };
+
   // For a given device_name, returns a DeviceContext for copying
   // tensors to/from the device.
   Status GetDeviceContext(const string& device_name,
-                          DeviceContext** device_context);
+                          DeviceContext** device_context) const;
 
   // Looks up the information for the given `handle` and returns the name
   // of the device where the function is registered.
-  string GetDeviceName(FunctionLibraryRuntime::Handle handle);
+  string GetDeviceName(FunctionLibraryRuntime::Handle handle) const;
 
   // Removes handle from the state owned by this object.
   Status RemoveHandle(FunctionLibraryRuntime::Handle handle);
@@ -140,12 +215,39 @@ class ProcessFunctionLibraryRuntime {
                const OptimizerOptions& optimizer_options,
                CustomKernelCreator custom_kernel_creator,
                std::unique_ptr<FunctionLibraryDefinition>* out_lib_def,
-               std::unique_ptr<ProcessFunctionLibraryRuntime>* out_pflr);
-
-  friend class FunctionLibraryRuntimeImpl;
-
-  mutable mutex mu_;
-
+               std::unique_ptr<ProcessFunctionLibraryRuntime>* out_pflr) const;
+
+  Status ReleaseMultiDeviceHandle(FunctionLibraryRuntime::Handle handle);
+
+  // If handle represents a multi-device function, returns the multi-device
+  // data associated with handle. Else, nullptr.
+  MultiDeviceFunctionData* IsMultiDevice(
+      FunctionLibraryRuntime::Handle handle) const;
+
+  Status InstantiateMultiDevice(
+      const string& function_name, AttrSlice attrs,
+      const FunctionLibraryRuntime::InstantiateOptions& options,
+      FunctionLibraryRuntime::Handle* handle);
+
+  FunctionLibraryRuntime::Handle AddMultiDeviceHandle(
+      const std::unique_ptr<MultiDeviceFunctionData> data,
+      const string& function_key);
+
+  // TODO(iga): Reword
+  // Pins each arg that emits a `DT_RESOURCE` tensor to the device on which the
+  // corresponding resource lives. This ensures that the Placer assigns ops that
+  // access these resources to the appropriate devices.
+  Status PinArgsAndRets(const std::vector<string>& input_devices,
+                        const std::vector<string>& output_devices,
+                        const DeviceSet& device_set, Graph* graph) const;
+
+  void RunMultiDevice(const FunctionLibraryRuntime::Options& opts,
+                      FunctionLibraryRuntime::Handle handle,
+                      gtl::ArraySlice<Tensor> args, std::vector<Tensor>* rets,
+                      FunctionLibraryRuntime::DoneCallback done) const;
+
+  // Data structure holding information for a single instantiated remote
+  // (to be executed on `target_device`) function.
   class FunctionData {
    public:
     FunctionData(const string& target_device,
@@ -181,15 +283,26 @@ class ProcessFunctionLibraryRuntime {
     Notification init_done_;
   };
 
+  mutable mutex mu_;
+
   const DeviceMgr* const device_mgr_;
   const FunctionLibraryDefinition* lib_def_;
   thread::ThreadPool* default_thread_pool_;
-  // Holds all the function invocations here.
+
+  // Holds all the function instantiations. Maps function_keys to handles.
   std::unordered_map<string, FunctionLibraryRuntime::Handle> table_
       GUARDED_BY(mu_);
+
+  // Function data for instantitated remote functions.
   std::unordered_map<FunctionLibraryRuntime::Handle,
                      std::unique_ptr<FunctionData>>
       function_data_ GUARDED_BY(mu_);
+
+  // Function data for instantiated multi-device functions.
+  std::unordered_map<FunctionLibraryRuntime::Handle,
+                     std::unique_ptr<MultiDeviceFunctionData>>
+      mdevice_data_ GUARDED_BY(mu_);
+
   std::unordered_map<Device*, std::unique_ptr<FunctionLibraryRuntime>> flr_map_;
   int next_handle_ GUARDED_BY(mu_);
   DistributedFunctionLibraryRuntime* const parent_;
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
index 21cb62118aebafa8a03903296b65f0617510f080..b4d3ac0df304e7caf0b742d018d43c9def2d76e6 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
+++ b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
@@ -21,7 +21,9 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/resource_var.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/type_index.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -29,6 +31,11 @@ limitations under the License.
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/public/version.h"
 
+#ifdef GOOGLE_CUDA
+#include "cuda/include/cuda.h"
+#include "cuda/include/cuda_runtime_api.h"
+#endif  // GOOGLE_CUDA
+
 namespace tensorflow {
 namespace {
 
@@ -65,9 +72,18 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test {
     std::vector<std::unique_ptr<Device>> devices;
     TF_CHECK_OK(DeviceFactory::AddDevices(options, "/job:a/replica:0/task:0",
                                           &devices));
-    device0_ = devices[0].get();
-    device1_ = devices[1].get();
     device_mgr_.reset(new DeviceMgr(std::move(devices)));
+    TF_CHECK_OK(device_mgr_->LookupDevice(
+        "/job:a/replica:0/task:0/device:CPU:0", &device0_));
+    TF_CHECK_OK(device_mgr_->LookupDevice(
+        "/job:a/replica:0/task:0/device:CPU:1", &device1_));
+    // If no GPU is available, gpu_device_ will remain nullptr.
+    Status status = device_mgr_->LookupDevice(
+        "/job:a/replica:0/task:0/device:GPU:0", &gpu_device_);
+    if (!status.ok()) {
+      CHECK_EQ(nullptr, gpu_device_);
+    }
+
     FunctionDefLibrary proto;
     for (const auto& fdef : flib) *(proto.add_function()) = fdef;
     lib_def_.reset(new FunctionLibraryDefinition(OpRegistry::Global(), proto));
@@ -86,6 +102,55 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test {
     return proc_flr_->Instantiate(name, attrs, instantiate_opts, handle);
   }
 
+  Tensor GPUToCPU(const Tensor& device_tensor) {
+#ifdef GOOGLE_CUDA
+    CHECK(gpu_device_);
+    CHECK(gpu_device_->tensorflow_gpu_device_info() != nullptr);
+    DeviceContext* device_context =
+        gpu_device_->tensorflow_gpu_device_info()->default_context;
+
+    Notification n;
+    Status status;
+    Tensor cpu_tensor(device_tensor.dtype(), device_tensor.shape());
+    device_context->CopyDeviceTensorToCPU(&device_tensor, "", gpu_device_,
+                                          &cpu_tensor,
+                                          [&n, &status](const Status& s) {
+                                            status = s;
+                                            n.Notify();
+                                          });
+    n.WaitForNotification();
+    CHECK(status.ok());
+    return cpu_tensor;
+#else
+    CHECK(false);
+#endif  // GOOGLE_CUDA
+  }
+
+  Tensor CPUToGPU(const Tensor& cpu_tensor) {
+#ifdef GOOGLE_CUDA
+    CHECK(gpu_device_);
+    CHECK(gpu_device_->tensorflow_gpu_device_info() != nullptr);
+    DeviceContext* device_context =
+        gpu_device_->tensorflow_gpu_device_info()->default_context;
+
+    Notification n;
+    Status status;
+    Tensor device_tensor(gpu_device_->GetAllocator({}), cpu_tensor.dtype(),
+                         cpu_tensor.shape(), {});
+    device_context->CopyCPUTensorToDevice(&cpu_tensor, gpu_device_,
+                                          &device_tensor,
+                                          [&n, &status](const Status& s) {
+                                            status = s;
+                                            n.Notify();
+                                          });
+    n.WaitForNotification();
+    CHECK(status.ok());
+    return device_tensor;
+#else
+    CHECK(false);
+#endif  // GOOGLE_CUDA
+  }
+
   Status Run(const string& name, FunctionLibraryRuntime::Options opts,
              test::function::Attrs attrs,
              const FunctionLibraryRuntime::InstantiateOptions& instantiate_opts,
@@ -135,7 +200,7 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test {
                      done2.Notify();
                    });
     done2.WaitForNotification();
-    EXPECT_TRUE(errors::IsNotFound(status));
+    EXPECT_TRUE(errors::IsNotFound(status)) << "Actual status: " << status;
     EXPECT_TRUE(str_util::StrContains(status.error_message(), "not found."));
 
     return Status::OK();
@@ -144,6 +209,8 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test {
   std::unique_ptr<DeviceMgr> device_mgr_;
   Device* device0_ = nullptr;  // Not owned. (Owned by device_mgr_.)
   Device* device1_ = nullptr;  // Not owned. (Owned by device_mgr_.)
+  // Remains as nullptr if no GPU is available.
+  Device* gpu_device_ = nullptr;  // Not owned. (Owned by device_mgr_.)
   std::unique_ptr<FunctionLibraryDefinition> lib_def_;
   std::unique_ptr<TestClusterFLR> cluster_flr_;
   std::unique_ptr<ProcessFunctionLibraryRuntime> proc_flr_;
@@ -345,5 +412,300 @@ TEST_F(ProcessFunctionLibraryRuntimeTest, ClusterFLRParallelTest) {
   rendezvous_->Unref();
 }
 
+bool IsCUDATensor(const Tensor& t) {
+#ifdef GOOGLE_CUDA
+  cudaPointerAttributes attributes;
+  cudaError_t err =
+      cudaPointerGetAttributes(&attributes, t.tensor_data().data());
+  if (err == cudaErrorInvalidValue) return false;
+  CHECK_EQ(cudaSuccess, err) << cudaGetErrorString(err);
+  return (attributes.memoryType == cudaMemoryTypeDevice);
+#else
+  CHECK(false)
+      << "IsCUDATensor should not be called when CUDA is not available";
+#endif  // GOOGLE_CUDA
+}
+
+void TestTwoDeviceMult(
+    ProcessFunctionLibraryRuntimeTest* fixture,
+    const FunctionLibraryRuntime::InstantiateOptions& inst_opts,
+    const string& error = "") {
+  fixture->Init({test::function::TwoDeviceMult()});
+  FunctionLibraryRuntime::Options opts;
+  auto x = test::AsTensor<float>({1, 2, 3});
+  Tensor y_cpu;
+  Tensor y_gpu;
+  Status status = fixture->Run("TwoDeviceMult", opts, {{"T", DT_FLOAT}},
+                               inst_opts, {x}, {&y_cpu, &y_gpu});
+  if (!error.empty()) {
+    EXPECT_TRUE(errors::IsInvalidArgument(status))
+        << "Actual status: " << status;
+    EXPECT_TRUE(str_util::StrContains(status.error_message(), error))
+        << "Actual error message: " << status.error_message();
+    fixture->rendezvous_->Unref();
+    return;
+  }
+
+  EXPECT_TRUE(status.ok()) << "Actual status: " << status;
+  EXPECT_FALSE(IsCUDATensor(y_cpu));
+  test::ExpectTensorEqual<float>(y_cpu, test::AsTensor<float>({2, 4, 6}));
+
+  EXPECT_TRUE(IsCUDATensor(y_gpu));
+  Tensor y_gpu_on_cpu = fixture->GPUToCPU(y_gpu);
+  test::ExpectTensorEqual<float>(y_gpu_on_cpu,
+                                 test::AsTensor<float>({3, 6, 9}));
+  fixture->rendezvous_->Unref();
+}
+
+void TestTwoDeviceInputOutput(
+    ProcessFunctionLibraryRuntimeTest* fixture,
+    const FunctionLibraryRuntime::InstantiateOptions& inst_opts) {
+  if (fixture->gpu_device_ == nullptr) {
+    GTEST_SKIP() << "No GPUs available";
+  }
+  fixture->Init({test::function::TwoDeviceInputOutput()});
+  FunctionLibraryRuntime::Options opts;
+  Tensor x1 = test::AsTensor<float>({1, 2});
+  if (str_util::StrContains(inst_opts.input_devices[0], "GPU")) {
+    x1 = fixture->CPUToGPU(x1);
+  }
+  Tensor x2 = test::AsTensor<float>({10, 20});
+  if (str_util::StrContains(inst_opts.input_devices[1], "GPU")) {
+    x2 = fixture->CPUToGPU(x2);
+  }
+
+  Tensor y1;
+  Tensor y2;
+  TF_CHECK_OK(fixture->Run("TwoDeviceInputOutput", opts, {{"T", DT_FLOAT}},
+                           inst_opts, {x1, x2}, {&y1, &y2}));
+
+  if (str_util::StrContains(inst_opts.output_devices[0], "GPU")) {
+    EXPECT_TRUE(IsCUDATensor(y1));
+    y1 = fixture->GPUToCPU(y1);
+  } else {
+    EXPECT_FALSE(IsCUDATensor(y1));
+  }
+  test::ExpectTensorEqual<float>(y1, test::AsTensor<float>({2, 4}));
+
+  if (str_util::StrContains(inst_opts.output_devices[1], "GPU")) {
+    EXPECT_TRUE(IsCUDATensor(y2));
+    y2 = fixture->GPUToCPU(y2);
+  } else {
+    EXPECT_FALSE(IsCUDATensor(y2));
+  }
+  test::ExpectTensorEqual<float>(y2, test::AsTensor<float>({30, 60}));
+
+  fixture->rendezvous_->Unref();
+}
+
+std::vector<string> CompleteDevices(const std::vector<string>& v) {
+  std::vector<string> result;
+  result.reserve(v.size());
+  for (const string& s : v) {
+    result.push_back(strings::StrCat("/job:a/replica:0/task:0/device:", s));
+  }
+  return result;
+}
+
+FunctionLibraryRuntime::InstantiateOptions MakeOptions(
+    const string& target, const std::vector<string>& input_devices,
+    const std::vector<string>& output_devices) {
+  FunctionLibraryRuntime::InstantiateOptions inst_opts;
+  inst_opts.target = target;
+  inst_opts.input_devices = CompleteDevices(input_devices);
+  inst_opts.output_devices = CompleteDevices(output_devices);
+  inst_opts.is_multi_device_function = true;
+  return inst_opts;
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest, MultiDevice_ExplicitOutputDevice) {
+  if (gpu_device_ == nullptr) {
+    GTEST_SKIP() << "No GPUs available";
+  }
+  TestTwoDeviceMult(this, MakeOptions("CPU:0", {"CPU:0"}, {"CPU:0", "GPU:0"}));
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest, MultiDevice_InferredOutputDevice) {
+  if (gpu_device_ == nullptr) {
+    GTEST_SKIP() << "No GPUs available";
+  }
+  TestTwoDeviceMult(this, MakeOptions("CPU:0", {"CPU:0"}, {}));
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest, MultiDevice_ErrorWhenNoInputDevices) {
+  if (gpu_device_ == nullptr) {
+    GTEST_SKIP() << "No GPUs available";
+  }
+  TestTwoDeviceMult(this, MakeOptions("CPU:0", {}, {}),
+                    "input_devices must have the same length");
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest,
+       MultiDevice_ErrorWhenTooManyInputDevices) {
+  if (gpu_device_ == nullptr) {
+    GTEST_SKIP() << "No GPUs available";
+  }
+  TestTwoDeviceMult(this, MakeOptions("CPU:0", {"CPU:0", "CPU:1"}, {}),
+                    "input_devices must have the same length");
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest,
+       MultiDevice_ErrorWhenTooManyOutputDevices) {
+  TestTwoDeviceMult(
+      this, MakeOptions("CPU:0", {"CPU:0"}, {"CPU:0", "GPU:0", "CPU:1"}),
+      "output_devices must either be empty or have the same length");
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest,
+       MultiDevice_ErrorWhenBadTargetDevice) {
+  TestTwoDeviceMult(
+      this, MakeOptions("GPU:11", {"CPU:0"}, {"CPU:0", "GPU:0"}),
+      "Cannot instantiate multi-device function with target device GPU:11");
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest, MultiDevice_ErrorWhenListInput) {
+  const FunctionDef& def = test::function::FuncWithListInput();
+  Init({def});
+  FunctionLibraryRuntime::Handle handle;
+  Status status = proc_flr_->Instantiate(
+      "FuncWithListInput", test::function::Attrs({{"T", DT_FLOAT}, {"N", 1}}),
+      MakeOptions("CPU:0", {"CPU:0"}, {}), &handle);
+  ASSERT_TRUE(errors::IsInvalidArgument(status)) << "Actual status: " << status;
+  ASSERT_TRUE(str_util::StrContains(
+      status.error_message(),
+      "FuncWithListInput has an input named \"x1\" that is a list of tensors"))
+      << "Actual error message: " << status.error_message();
+  rendezvous_->Unref();
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest, MultiDevice_ErrorWhenListOutput) {
+  const FunctionDef& def = test::function::FuncWithListOutput();
+  Init({def});
+  FunctionLibraryRuntime::Handle handle;
+  Status status = proc_flr_->Instantiate(
+      "FuncWithListOutput", test::function::Attrs({{"T", DT_FLOAT}, {"N", 1}}),
+      MakeOptions("CPU:0", {}, {"CPU:0"}), &handle);
+  ASSERT_TRUE(errors::IsInvalidArgument(status)) << "Actual status: " << status;
+  ASSERT_TRUE(str_util::StrContains(
+      status.error_message(),
+      "FuncWithListOutput has an output named \"y\" that is a list of tensors"))
+      << "Actual error message: " << status.error_message();
+  rendezvous_->Unref();
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest,
+       MultiDevice_ExplicitMultiInputOutput) {
+  TestTwoDeviceInputOutput(
+      this, MakeOptions("CPU:0", {"CPU:0", "GPU:0"}, {"CPU:0", "GPU:0"}));
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest, MultiDevice_FlipInputs) {
+  TestTwoDeviceInputOutput(
+      this, MakeOptions("CPU:0", {"GPU:0", "CPU:0"}, {"CPU:0", "GPU:0"}));
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest, MultiDevice_FlipOutputs) {
+  TestTwoDeviceInputOutput(
+      this, MakeOptions("CPU:0", {"CPU:0", "GPU:0"}, {"GPU:0", "CPU:0"}));
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest, MultiDevice_FlipBoth) {
+  TestTwoDeviceInputOutput(
+      this, MakeOptions("CPU:0", {"GPU:0", "CPU:0"}, {"GPU:0", "CPU:0"}));
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest, MultiDevice_EmptyBodySwap) {
+  if (gpu_device_ == nullptr) {
+    GTEST_SKIP() << "No GPUs available";
+  }
+  FunctionLibraryRuntime::InstantiateOptions inst_opts =
+      MakeOptions("CPU:0", {"GPU:0", "CPU:0"}, {"CPU:0", "GPU:0"});
+  Init({test::function::EmptyBodySwap()});
+
+  Tensor x1 = CPUToGPU(test::AsTensor<float>({1, 2}));
+  Tensor x2 = test::AsTensor<float>({10, 20});
+  Tensor y1;
+  Tensor y2;
+  TF_CHECK_OK(Run("EmptyBodySwap", {}, {{"T", DT_FLOAT}}, inst_opts, {x1, x2},
+                  {&y1, &y2}));
+
+  EXPECT_FALSE(IsCUDATensor(y1));
+  test::ExpectTensorEqual<float>(y1, test::AsTensor<float>({10, 20}));
+
+  EXPECT_TRUE(IsCUDATensor(y2));
+  y2 = GPUToCPU(y2);
+  test::ExpectTensorEqual<float>(y2, test::AsTensor<float>({1, 2}));
+
+  rendezvous_->Unref();
+}
+
+Tensor GetResourceHandle(const string& var_name, const string& container,
+                         const string& device_name) {
+  ResourceHandle handle;
+  handle.set_device(device_name);
+  handle.set_container(container);
+  handle.set_name(var_name);
+  handle.set_hash_code(MakeTypeIndex<Var>().hash_code());
+  handle.set_maybe_type_name(MakeTypeIndex<Var>().name());
+  Tensor tensor(DT_RESOURCE, TensorShape({}));
+  tensor.scalar<ResourceHandle>()() = handle;
+  return tensor;
+}
+
+void TestResourceOutputAndUse(ProcessFunctionLibraryRuntimeTest* fixture,
+                              const string& resource_return_device) {
+  if (fixture->gpu_device_ == nullptr) {
+    GTEST_SKIP() << "No GPUs available";
+  }
+  FunctionLibraryRuntime::InstantiateOptions inst_opts = MakeOptions(
+      "CPU:0", {"GPU:0", "GPU:0"}, {resource_return_device, "GPU:0"});
+  fixture->Init({test::function::ResourceOutput(),
+                 test::function::ReadResourceVariable()});
+
+  // Make resource var
+  Tensor resource_value = fixture->CPUToGPU(test::AsTensor<float>({10, 20}));
+  Var* resource = new Var(DT_FLOAT);
+  *resource->tensor() = resource_value;
+  resource->is_initialized = true;
+  ResourceMgr* mgr = fixture->gpu_device_->resource_manager();
+  Status status = mgr->Create(mgr->default_container(), "my_gpu_var", resource);
+  ASSERT_TRUE(status.ok()) << status.error_message();
+
+  // Run the function taking a resource and outputing it
+  Tensor x1 = fixture->CPUToGPU(test::AsTensor<float>({1, 2}));
+  Tensor x2 = GetResourceHandle("my_gpu_var", mgr->default_container(),
+                                "/job:a/replica:0/task:0/device:GPU:0");
+  Tensor returned_handle;
+  Tensor y2;
+  TF_CHECK_OK(fixture->Run("ResourceOutput", {}, {{"T", DT_FLOAT}}, inst_opts,
+                           {x1, x2}, {&returned_handle, &y2}));
+
+  EXPECT_FALSE(IsCUDATensor(returned_handle));
+  EXPECT_TRUE(IsCUDATensor(y2));
+  y2 = fixture->GPUToCPU(y2);
+  test::ExpectTensorEqual<float>(y2, test::AsTensor<float>({2, 4}));
+
+  // Read the variable using the handle returned from previous function to
+  // make sure the handle and read value is on the right device.
+  inst_opts = MakeOptions("GPU:0", {"GPU:0"}, {"GPU:0"});
+  Tensor read_resource;
+  TF_CHECK_OK(fixture->Run("ReadResourceVariable", {}, {{"T", DT_FLOAT}},
+                           inst_opts, {returned_handle}, {&read_resource}));
+  EXPECT_TRUE(IsCUDATensor(read_resource));
+  read_resource = fixture->GPUToCPU(read_resource);
+  test::ExpectTensorEqual<float>(read_resource,
+                                 test::AsTensor<float>({10, 20}));
+
+  fixture->rendezvous_->Unref();
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest, MultiDevice_ResourceOutput_GPU) {
+  TestResourceOutputAndUse(this, "GPU:0");
+}
+
+TEST_F(ProcessFunctionLibraryRuntimeTest, MultiDevice_ResourceOutput_CPU) {
+  TestResourceOutputAndUse(this, "CPU:0");
+}
+
 }  // anonymous namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/process_state.cc b/tensorflow/core/common_runtime/process_state.cc
index 3d8ac9b1344d8f2ca210451194adf4607dd52b7d..fdb79767ec1d92f70b18d96023a0818f45cbc91c 100644
--- a/tensorflow/core/common_runtime/process_state.cc
+++ b/tensorflow/core/common_runtime/process_state.cc
@@ -34,11 +34,15 @@ namespace tensorflow {
 
 /*static*/ ProcessState* ProcessState::singleton() {
   static ProcessState* instance = new ProcessState;
+  static std::once_flag f;
+  std::call_once(f, []() {
+    AllocatorFactoryRegistry::singleton()->process_state_ = instance;
+  });
+
   return instance;
 }
 
-ProcessState::ProcessState() : numa_enabled_(false) {
-}
+ProcessState::ProcessState() : numa_enabled_(false) {}
 
 string ProcessState::MemDesc::DebugString() {
   return strings::StrCat((loc == CPU ? "CPU " : "GPU "), dev_index,
@@ -72,7 +76,7 @@ Allocator* ProcessState::GetCPUAllocator(int numa_node) {
     }
     Allocator* allocator = nullptr;
     SubAllocator* sub_allocator =
-        (alloc_visitors_defined || use_bfc_allocator)
+        (numa_enabled_ || alloc_visitors_defined || use_bfc_allocator)
             ? new BasicCPUAllocator(
                   numa_enabled_ ? numa_node : port::kNUMANoAffinity,
                   cpu_alloc_visitors_, cpu_free_visitors_)
@@ -93,7 +97,7 @@ Allocator* ProcessState::GetCPUAllocator(int numa_node) {
                            "bfc_cpu_allocator_for_gpu" /*name*/);
       VLOG(2) << "Using BFCAllocator with memory limit of "
               << cpu_mem_limit_in_mb << " MB for ProcessState CPU allocator";
-    } else if (alloc_visitors_defined) {
+    } else if (sub_allocator) {
       DCHECK(sub_allocator);
       allocator =
           new PoolAllocator(100 /*pool_size_limit*/, true /*auto_resize*/,
@@ -103,7 +107,7 @@ Allocator* ProcessState::GetCPUAllocator(int numa_node) {
               << " numa_node=" << numa_node;
     } else {
       DCHECK(!sub_allocator);
-      allocator = cpu_allocator();
+      allocator = cpu_allocator_base();
     }
     if (LogMemory::IsEnabled() && !allocator->TracksAllocationSizes()) {
       // Wrap the allocator to track allocation ids for better logging
@@ -138,7 +142,7 @@ void ProcessState::AddCPUFreeVisitor(SubAllocator::Visitor visitor) {
 void ProcessState::TestOnlyReset() {
   mutex_lock lock(mu_);
   // Don't delete this value because it's static.
-  Allocator* default_cpu_allocator = cpu_allocator();
+  Allocator* default_cpu_allocator = cpu_allocator_base();
   mem_desc_map_.clear();
   for (Allocator* a : cpu_allocators_) {
     if (a != default_cpu_allocator) delete a;
diff --git a/tensorflow/core/common_runtime/process_state.h b/tensorflow/core/common_runtime/process_state.h
index 6849d305b3c5577485e83ed7d2e9521dce20a452..bc877db99e451577fe5104fd24200da80d0a11a6 100644
--- a/tensorflow/core/common_runtime/process_state.h
+++ b/tensorflow/core/common_runtime/process_state.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/allocator_registry.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/platform/types.h"
@@ -34,7 +35,7 @@ class PoolAllocator;
 
 // Singleton that manages per-process state, e.g. allocation of
 // shared resources.
-class ProcessState {
+class ProcessState : public ProcessStateInterface {
  public:
   static ProcessState* singleton();
 
@@ -129,7 +130,7 @@ class RecordingAllocator : public Allocator {
   bool TracksAllocationSizes() override { return a_->TracksAllocationSizes(); }
   size_t RequestedSize(const void* p) override { return a_->RequestedSize(p); }
   size_t AllocatedSize(const void* p) override { return a_->AllocatedSize(p); }
-  void GetStats(AllocatorStats* stats) override { a_->GetStats(stats); }
+  absl::optional<AllocatorStats> GetStats() override { return a_->GetStats(); }
   void ClearStats() override { a_->ClearStats(); }
   ProcessState::MDMap* mm_;  // not owned
   Allocator* a_;             // not owned
diff --git a/tensorflow/core/common_runtime/process_util.cc b/tensorflow/core/common_runtime/process_util.cc
index e1dc08d64545ece29a8aa2ab2612dd3cd994559e..d42b8d55e4f50606578cf249e1f245b72cd7bd24 100644
--- a/tensorflow/core/common_runtime/process_util.cc
+++ b/tensorflow/core/common_runtime/process_util.cc
@@ -34,14 +34,23 @@ namespace tensorflow {
 
 namespace {
 
+int32 DefaultNumInterOpThreads() {
+  // Use environment setting if specified (init once)
+  static int env_num_threads = NumInterOpThreadsFromEnvironment();
+  if (env_num_threads > 0) {
+    return env_num_threads;
+  }
+
+  // Default to using the number of cores available in the process.
+  return port::NumSchedulableCPUs();
+}
+
 static thread::ThreadPool* InitComputePool(const SessionOptions& options) {
   int32 inter_op_parallelism_threads =
       options.config.inter_op_parallelism_threads();
   if (inter_op_parallelism_threads == 0) {
-    // Default to using the number of cores available in the process.
-    inter_op_parallelism_threads = port::NumSchedulableCPUs();
+    inter_op_parallelism_threads = DefaultNumInterOpThreads();
   }
-
   return new thread::ThreadPool(Env::Default(), "Compute",
                                 inter_op_parallelism_threads);
 }
@@ -53,6 +62,18 @@ thread::ThreadPool* ComputePool(const SessionOptions& options) {
   return compute_pool;
 }
 
+int32 NumInterOpThreadsFromEnvironment() {
+  int32 num;
+  const char* val = std::getenv("TF_NUM_INTEROP_THREADS");
+  return (val && strings::safe_strto32(val, &num)) ? num : 0;
+}
+
+int32 NumIntraOpThreadsFromEnvironment() {
+  int32 num;
+  const char* val = std::getenv("TF_NUM_INTRAOP_THREADS");
+  return (val && strings::safe_strto32(val, &num)) ? num : 0;
+}
+
 int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) {
   const int32 inter_op = options.config.inter_op_parallelism_threads();
   if (inter_op != 0) return inter_op;
@@ -67,7 +88,7 @@ int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) {
 #endif  // _OPENMP
     DCHECK_GE(mkl_intra_op, 1);
     const int32 mkl_inter_op = std::max(
-        (port::NumSchedulableCPUs() + mkl_intra_op - 1) / mkl_intra_op, 2);
+        (DefaultNumInterOpThreads() + mkl_intra_op - 1) / mkl_intra_op, 2);
     VLOG(0)
         << "Creating new thread pool with default inter op setting: "
         << mkl_inter_op
@@ -75,8 +96,7 @@ int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) {
     return mkl_inter_op;
   }
 #endif  // INTEL_MKL
-  // Default to using the number of cores available in the process.
-  return port::NumSchedulableCPUs();
+  return DefaultNumInterOpThreads();
 }
 
 thread::ThreadPool* NewThreadPoolFromSessionOptions(
diff --git a/tensorflow/core/common_runtime/process_util.h b/tensorflow/core/common_runtime/process_util.h
index 5d9266671617320eea4cea60de1ebd7210f3b674..7ad658be9f785032c85f20224a4d592ded7e283c 100644
--- a/tensorflow/core/common_runtime/process_util.h
+++ b/tensorflow/core/common_runtime/process_util.h
@@ -30,7 +30,18 @@ namespace tensorflow {
 // using 'options'.  Caller does not take ownership over threadpool.
 thread::ThreadPool* ComputePool(const SessionOptions& options);
 
-// Returns number of inter op threads.
+// Returns the TF_NUM_INTEROP_THREADS environment value, or 0 if not specified.
+int32 NumInterOpThreadsFromEnvironment();
+
+// Returns the TF_NUM_INTRAOP_THREADS environment value, or 0 if not specified.
+int32 NumIntraOpThreadsFromEnvironment();
+
+// Returns the number of inter op threads specified in `options` or a default.
+// If no value is specified in the provided options, then the function returns
+// the value defined in the TF_NUM_INTEROP_THREADS environment variable.
+// If neither a value is specified in the options or in the environment,
+// this function will return a reasonable default value based on the number
+// of schedulable CPUs, and any MKL and OpenMP configurations.
 int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options);
 
 // Creates a thread pool with number of inter op threads.
diff --git a/tensorflow/core/common_runtime/ring_alg.cc b/tensorflow/core/common_runtime/ring_alg.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c20cc74bf70e3340848666a179c1bb3617a4ede6
--- /dev/null
+++ b/tensorflow/core/common_runtime/ring_alg.cc
@@ -0,0 +1,430 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/ring_alg.h"
+
+#include <stdlib.h>
+#include <atomic>
+#include <functional>
+#include <utility>
+
+#include "tensorflow/core/common_runtime/collective_rma_local.h"
+#include "tensorflow/core/common_runtime/collective_util.h"
+#include "tensorflow/core/common_runtime/copy_tensor.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/types.h"
+
+// Set true for greater intelligibility of debug mode log messages.
+#define READABLE_KEYS false
+// A ring algorithm exchanges chunks of tensor between devices.  The chunk size
+// depends on the number of subdivisions specified in the algorithm.  If the
+// user does not specify the number of subdivisions we may infer the number
+// dynamically so that the resulting chunk size does not exceed
+// kMaxChunkSizeBytes, empirically set at 4 MiB.
+constexpr size_t kMaxChunkSizeBytes = (4 * 1024 * 1024);
+// kMaxSubdivsPerDev is used to give an upper bound on the number of
+// subdivisions dynamically generated.  A reasonable value would be a small
+// multiple of the number of NICs adjacent to each device.
+constexpr int kMaxSubdivsPerDevice = 2;
+
+namespace tensorflow {
+namespace {
+// Each CollectiveOp implementation is free to define its own
+// BufRendezvous key format.  This function produces the key used by
+// RingAlg instances.  Note that the exec_key will differentiate between
+// different instances consequently we don't need to further differentiate
+// between subclasses of RingAlg.
+string RingAlgBufKey(const string& name, const string& exec_key, int pass,
+                     int section, int source_rank) {
+  if (READABLE_KEYS) {
+    return strings::StrCat(name, "(", exec_key, "):pass(", pass, "):section(",
+                           section, "):srcrank(", source_rank, ")");
+  } else {
+    // TODO(b/78352018): Try out some kind of denser encoding, e.g. 128 bit
+    // hash.
+    return strings::StrCat(exec_key, ":", pass, ":", section, ":", source_rank);
+  }
+}
+
+}  // namespace
+
+void RingAlg::PCQueue::Enqueue(RingField* rf) {
+  mutex_lock l(pcq_mu_);
+  deque_.push_back(rf);
+  if (waiter_count_ > 0) {
+    cv_.notify_one();
+  }
+}
+
+RingAlg::RingField* RingAlg::PCQueue::Dequeue() {
+  mutex_lock l(pcq_mu_);
+  if (deque_.empty()) {
+    ++waiter_count_;
+    while (deque_.empty()) {
+      cv_.wait(l);
+    }
+    --waiter_count_;
+  }
+  RingField* rf = deque_.front();
+  deque_.pop_front();
+  return rf;
+}
+
+RingAlg::RingAlg(CollectiveType type, const string& name)
+    : type_(type),
+      name_(name),
+      col_ctx_(nullptr),
+      col_params_(nullptr),
+      done_(nullptr),
+      group_size_(-1),
+      num_subdivs_(-1) {}
+
+namespace {
+Status GenerateSubdivsInCollectiveParams(CollectiveParams* col_params) {
+  if (col_params->instance.shape.num_elements() == 0) {
+    return errors::Internal("shape in CollectiveParams should be non-empty");
+  }
+  const int kAvgDevPerTask =
+      col_params->group.group_size / col_params->group.num_tasks;
+  const int kMaxNumSubdivs = kMaxSubdivsPerDevice * kAvgDevPerTask;
+  if (kMaxNumSubdivs <= 0) {
+    return errors::Internal("Unexpected kMaxNumSubdivs ", kMaxNumSubdivs,
+                            " in ",
+                            col_params->instance.impl_details.collective_name);
+  }
+  // NOTE(ayushd): If no subdiv_offsets have been specified, dynamically add
+  // as many offsets as needed so that the size of tensor chunks <=
+  // kMaxChunkSizeBytes.  Empirically, chunks that are too small or too large
+  // lead to worse performance.
+  int num_subdivs = 0;
+  const size_t tensor_size = col_params->instance.shape.num_elements() *
+                             DataTypeSize(col_params->instance.data_type);
+  size_t chunk_size;
+  do {
+    ++num_subdivs;
+    int num_chunks = col_params->group.group_size * num_subdivs;
+    chunk_size = tensor_size / num_chunks;
+    VLOG(2) << "num_subdivs " << num_subdivs << " num_chunks " << num_chunks
+            << " chunk_size " << chunk_size;
+  } while (chunk_size > kMaxChunkSizeBytes && num_subdivs < kMaxNumSubdivs);
+  if (num_subdivs <= 0) {
+    return errors::Internal("Unexpected num_subdivs ", num_subdivs, " in ",
+                            col_params->instance.impl_details.collective_name);
+  }
+
+  int subdiv_stride = kAvgDevPerTask / num_subdivs;
+  if (subdiv_stride == 0) subdiv_stride = 1;
+  col_params->instance.impl_details.subdiv_offsets.reserve(num_subdivs);
+  for (int sdi = 0; sdi < num_subdivs; ++sdi) {
+    int subdiv_offset = subdiv_stride * sdi;
+    if (sdi % 2 == 1) subdiv_offset *= -1;
+    col_params->instance.impl_details.subdiv_offsets.push_back(subdiv_offset);
+  }
+
+  if (VLOG_IS_ON(2)) {
+    string subdiv_buf;
+    for (const int subdiv_offset :
+         col_params->instance.impl_details.subdiv_offsets) {
+      strings::StrAppend(&subdiv_buf, " ", subdiv_offset);
+    }
+    VLOG(2) << "Dynamically generated " << num_subdivs
+            << " subdiv_offsets:" << subdiv_buf << " tensor_size "
+            << tensor_size << " chunk_size " << chunk_size;
+  }
+
+  return Status::OK();
+}
+}  // namespace
+
+Status RingAlg::InitializeCollectiveParams(CollectiveParams* col_params) {
+  const string& device_name =
+      col_params->instance.device_names[col_params->default_rank];
+  // Each subdiv permutation is a ring formed by rotating each
+  // single-task subsequence of devices by an offset.  This makes most
+  // sense when each task has the same number of devices but we can't
+  // depend on that being the case so we'll compute something that
+  // works in any case.
+
+  // Start by counting the devices in each task.
+  // Precondition: device_names must be sorted so that all devices in
+  // the same task are adjacent.
+  VLOG(2) << "Sorted task names: "
+          << str_util::Join(col_params->instance.task_names, ", ");
+  std::vector<int> dev_per_task;
+  const string* prior_task_name = &col_params->instance.task_names[0];
+  int dev_count = 1;
+  for (int di = 1; di < col_params->group.group_size; ++di) {
+    if (col_params->instance.task_names[di] != *prior_task_name) {
+      dev_per_task.push_back(dev_count);
+      dev_count = 1;
+      prior_task_name = &col_params->instance.task_names[di];
+    } else {
+      ++dev_count;
+    }
+  }
+  dev_per_task.push_back(dev_count);
+  DCHECK_EQ(col_params->group.num_tasks, dev_per_task.size());
+
+  if (col_params->instance.impl_details.subdiv_offsets.empty()) {
+    TF_RETURN_IF_ERROR(GenerateSubdivsInCollectiveParams(col_params));
+  }
+
+  // Generate a ring permutation for requested offset.
+  VLOG(2) << "Setting up perms for col_params " << col_params
+          << " subdiv_permutations "
+          << &col_params->instance.impl_details.subdiv_permutations;
+  col_params->instance.impl_details.subdiv_permutations.resize(
+      col_params->instance.impl_details.subdiv_offsets.size());
+  col_params->subdiv_rank.resize(
+      col_params->instance.impl_details.subdiv_offsets.size(), -1);
+  for (int sdi = 0;
+       sdi < col_params->instance.impl_details.subdiv_offsets.size(); ++sdi) {
+    std::vector<int>& perm =
+        col_params->instance.impl_details.subdiv_permutations[sdi];
+    DCHECK_EQ(perm.size(), 0);
+    int offset = col_params->instance.impl_details.subdiv_offsets[sdi];
+    // A negative subdivision offset is interpreted as follows:
+    //  1. Reverse the local device ordering.
+    //  2. Begin the subdivision at abs(offset) in the reversed ordering.
+    bool reverse = false;
+    if (offset < 0) {
+      offset = abs(offset);
+      reverse = true;
+    }
+    int prior_dev_count = 0;  // sum over prior worker device counts
+    for (int ti = 0; ti < col_params->group.num_tasks; ++ti) {
+      for (int di = 0; di < dev_per_task[ti]; ++di) {
+        int di_offset = (di + offset) % dev_per_task[ti];
+        int offset_di =
+            reverse ? (dev_per_task[ti] - (di_offset + 1)) : di_offset;
+        // Device index in global subdivision permutation.
+        int permuted_di = prior_dev_count + offset_di;
+        int rank = static_cast<int>(perm.size());
+        perm.push_back(permuted_di);
+        if (col_params->instance.device_names[permuted_di] == device_name) {
+          DCHECK_EQ(permuted_di, col_params->default_rank);
+          col_params->subdiv_rank[sdi] = rank;
+        }
+      }
+      prior_dev_count += dev_per_task[ti];
+    }
+    DCHECK_EQ(col_params->group.group_size, perm.size());
+  }
+
+  VLOG(2) << collective_util::SubdivPermDebugString(*col_params);
+  return Status::OK();
+}
+
+Status RingAlg::InitializeCollectiveContext(CollectiveContext* col_ctx) {
+  DCHECK(col_ctx->dev_mgr);
+  col_ctx_ = col_ctx;
+  col_params_ = &col_ctx->col_params;
+  return collective_util::InitializeDeviceAndLocality(
+      col_ctx->dev_mgr, col_ctx->device_name, &col_ctx->device,
+      &col_ctx->device_locality);
+}
+
+string RingAlg::TensorDebugString(const Tensor& tensor) {
+  const DeviceBase::GpuDeviceInfo* gpu_device_info =
+      col_ctx_->op_ctx->device()->tensorflow_gpu_device_info();
+  if (gpu_device_info) {
+    Tensor cpu_tensor(tensor.dtype(), tensor.shape());
+    Notification note;
+    gpu_device_info->default_context->CopyDeviceTensorToCPU(
+        &tensor, "" /*tensor_name*/, col_ctx_->device, &cpu_tensor,
+        [&note](const Status& s) {
+          DCHECK(s.ok());
+          note.Notify();
+        });
+    note.WaitForNotification();
+    return cpu_tensor.SummarizeValue(64);
+  } else {
+    return tensor.SummarizeValue(64);
+  }
+}
+
+void RingAlg::StartAbort(const Status& s) {
+  // In abort mode we stop issuing additional ProvideBuf
+  // and ConsumeBuf calls, but we need to wait for all of the
+  // outstanding callbacks to be invoked before quitting.
+  bool abort_started = false;
+  {
+    mutex_lock l(status_mu_);
+    if (status_.ok()) {
+      LOG(ERROR) << "Aborting Ring" << name_ << " with " << s;
+      abort_started = true;
+      status_.Update(s);
+    }
+  }
+  // If this is the initial entry to abort mode then invoke StartAbort
+  // on the CollectiveExecutor that invoked us.  That should start
+  // cancellation on all of the outstanding CollectiveRemoteAccess
+  // actions.
+  if (abort_started) {
+    col_ctx_->col_exec->StartAbort(s);
+  }
+}
+
+void RingAlg::Finish(bool ok) {
+  if (ok) {
+    // Recover the output from the adaptor.
+    ca_->ConsumeFinalValue(col_ctx_->output);
+  }
+  Status s;
+  {
+    mutex_lock l(status_mu_);
+    s = status_;
+  }
+  rfv_.clear();  // Give up Refs on output tensor.
+  done_(s);
+}
+
+// At the beginning of the algorithm initialize a RingField struct for
+// every independent field of the tensor.
+void RingAlg::InitRingField(RingField* rf, int chunk_idx, int subdiv_idx,
+                            int field_idx) {
+  // Note on field indexing: There are group_size_ devices in the
+  // instance, implying the same number of chunks per tensor, where a
+  // chunk is the unit of data transferred in a time step.  However, if
+  // a device can simultaneously send data by 2 or more independent
+  // channels we can speed up the transfer by subdividing chunks and
+  // processing multiple subdivisions at once.  So the actual number
+  // of RingFields is group_size_ * num_subdivs_.
+  DCHECK_EQ(field_idx, (chunk_idx * num_subdivs_) + subdiv_idx);
+  rf->chunk_idx = chunk_idx;
+  rf->subdiv_idx = subdiv_idx;
+  rf->sc_idx = field_idx;
+  rf->rank = col_params_->subdiv_rank[subdiv_idx];
+  rf->second_pass = false;
+  rf->action = RF_INIT;
+  // Recv from the device with preceding rank within the subdivision.
+  int recv_from_rank = (rf->rank + (group_size_ - 1)) % group_size_;
+  int send_to_rank = (rf->rank + 1) % group_size_;
+  rf->recv_dev_idx = col_params_->instance.impl_details
+                         .subdiv_permutations[subdiv_idx][recv_from_rank];
+  int send_dev_idx = col_params_->instance.impl_details
+                         .subdiv_permutations[subdiv_idx][send_to_rank];
+  rf->recv_is_remote = !col_params_->task.is_local[rf->recv_dev_idx];
+  rf->send_is_remote = !col_params_->task.is_local[send_dev_idx];
+  if (ca_->ChunkBytes(rf->sc_idx) > 0) {
+    // In pass 0 we skip Recv when rank = chunk_idx
+    rf->do_recv = (rf->chunk_idx != rf->rank);
+    // In pass 0 we skip Send when rank = chunk_idx-1
+    rf->do_send =
+        (rf->rank != ((rf->chunk_idx + (group_size_ - 1)) % group_size_));
+  }
+  rf->is_final =
+      (rf->rank == ((rf->chunk_idx + (group_size_ - 1)) % group_size_));
+  if (rf->do_send || rf->do_recv) {
+    rf->chunk = ca_->ChunkAlias(rf->sc_idx);
+  }
+  VLOG(2) << this << " InitRingField " << rf->DebugString() << " chunk "
+          << ca_->TBounds(rf->chunk);
+}
+
+// When a RingField transitions from first to second recompute the
+// do_send and do_recv values.
+void RingAlg::AdvanceToSecondPass(RingField* rf) {
+  VLOG(3) << "IncrRingField old value " << rf->DebugString();
+  DCHECK(!rf->second_pass);
+  rf->second_pass = true;
+  rf->action = RF_INIT;
+  if (ca_->ChunkBytes(rf->sc_idx) > 0) {
+    // In pass 1 the send/no-send boundary moves down 1 place.
+    rf->do_recv =
+        (rf->rank != ((rf->chunk_idx + (group_size_ - 1)) % group_size_));
+    rf->do_send =
+        (rf->rank != ((rf->chunk_idx + (group_size_ - 2)) % group_size_));
+  }
+  rf->is_final =
+      (rf->rank == ((rf->chunk_idx + (group_size_ - 2)) % group_size_));
+  VLOG(3) << "IncrRingField new value " << rf->DebugString();
+}
+
+string RingAlg::RingField::DebugString() const {
+  string rv = strings::StrCat("RingField rank=", rank, " chunk_idx=", chunk_idx,
+                              " subdiv=", subdiv_idx, " sc_idx=", sc_idx,
+                              " action=", action);
+  strings::StrAppend(&rv, " pass=", second_pass);
+  strings::StrAppend(&rv, " do_send=", do_send, " do_recv=", do_recv,
+                     " is_final=", is_final, " recv_is_remote=", recv_is_remote,
+                     " recv_dev_idx=", recv_dev_idx, " sc_idx=", sc_idx);
+  return rv;
+}
+
+void RingAlg::DispatchSend(RingField* rf, const StatusCallback& done) {
+  DCHECK(rf->do_send);
+  string send_buf_key = RingAlgBufKey(name_, col_ctx_->exec_key,
+                                      rf->second_pass, rf->sc_idx, rf->rank);
+  VLOG(3) << "DispatchSend rank=" << col_params_->default_rank << " send key "
+          << send_buf_key << " chunk " << ca_->TBounds(rf->chunk) << " sc_idx "
+          << rf->sc_idx;
+  int send_to_rank = (rf->rank + 1) % group_size_;
+  int send_to_dev_idx = col_params_->instance.impl_details
+                            .subdiv_permutations[rf->subdiv_idx][send_to_rank];
+  col_ctx_->col_exec->PostToPeer(
+      col_params_->instance.device_names[send_to_dev_idx],
+      col_params_->instance.task_names[send_to_dev_idx], send_buf_key,
+      col_ctx_->device, col_ctx_->op_ctx->op_device_context(),
+      col_ctx_->op_ctx->output_alloc_attr(0), &rf->chunk,
+      col_ctx_->device_locality, done);
+}
+
+void RingAlg::DispatchRecv(RingField* rf, const StatusCallback& done) {
+  DCHECK(rf->do_recv);
+  string recv_buf_key =
+      RingAlgBufKey(name_, col_ctx_->exec_key, rf->second_pass, rf->sc_idx,
+                    (rf->rank + (group_size_ - 1)) % group_size_);
+  VLOG(3) << "DispatchRecv rank=" << col_params_->default_rank << " recv key "
+          << recv_buf_key << " chunk " << ca_->TBounds(rf->chunk) << " into "
+          << ((col_params_->merge_op != nullptr) ? "tmp_chunk" : "chunk");
+  Tensor* dst_tensor = (!rf->second_pass && (col_params_->merge_op != nullptr))
+                           ? &rf->tmp_chunk
+                           : &rf->chunk;
+  col_ctx_->col_exec->RecvFromPeer(
+      col_params_->instance.device_names[rf->recv_dev_idx],
+      col_params_->instance.task_names[rf->recv_dev_idx],
+      col_params_->task.is_local[rf->recv_dev_idx], recv_buf_key,
+      col_ctx_->device, col_ctx_->op_ctx->op_device_context(),
+      col_ctx_->op_ctx->output_alloc_attr(0), dst_tensor,
+      col_ctx_->device_locality, rf->subdiv_idx, done);
+}
+
+string RingAlg::FieldState() {
+  string s = strings::StrCat(
+      "Ring", name_, " ", strings::Hex(reinterpret_cast<uint64>(this)),
+      " exec ", col_ctx_->exec_key, " step_id=", col_ctx_->step_id,
+      " state of all ", rfv_.size(), " fields:");
+  for (int i = 0; i < rfv_.size(); ++i) {
+    s.append("\n");
+    s.append(rfv_[i].DebugString());
+  }
+  return s;
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/ring_alg.h b/tensorflow/core/common_runtime/ring_alg.h
new file mode 100644
index 0000000000000000000000000000000000000000..dc07618f8805e3a9abcaf575c3d2984aa27948b7
--- /dev/null
+++ b/tensorflow/core/common_runtime/ring_alg.h
@@ -0,0 +1,124 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_RING_ALG_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_RING_ALG_H_
+
+#include <deque>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/base_collective_executor.h"
+#include "tensorflow/core/framework/collective.h"
+
+namespace tensorflow {
+class Device;
+
+// Basic ring-algorithm implementation to be further specialized
+// for specific collective functions.
+class RingAlg : public CollectiveImplementationInterface {
+ public:
+  explicit RingAlg(CollectiveType type, const string& name);
+  ~RingAlg() override {}
+
+  // Establishes the requested number of subdivision permutations based on the
+  // ring order implicit in the device order.
+  Status InitializeCollectiveParams(CollectiveParams* col_params) override;
+
+  // Initializes members of CollectiveContext not yet initialized, i.e. device
+  // and device_locality.  Also saves the CollectiveContext in this object.
+  Status InitializeCollectiveContext(CollectiveContext* col_ctx) override;
+
+  // No-op for ring alg.
+  Status InitializeInstanceBeforeGroupDiscovery(CollectiveParams*) override {
+    return Status::OK();
+  }
+
+ protected:
+  // Called when a bad status is received that implies we should terminate
+  // execution and return a bad status.
+  void StartAbort(const Status& s);
+  void Finish(bool ok);
+
+  // Current status of a RingField
+  enum RingFieldAction {
+    RF_INIT = 0,    // Just initialized for a pass
+    RF_RECV,        // Recv pending
+    RF_REDUCE,      // Reduce pending
+    RF_FINALIZE,    // FinalOp pending
+    RF_SEND_READY,  // Ready to send
+    RF_SEND,        // Send pending
+    RF_DONE,        // No more work
+  };
+
+  // Tracks progress of actions on a single subfield of the entire tensor.
+  struct RingField {
+    int16 chunk_idx;     // major division index
+    int16 subdiv_idx;    // minor division index
+    int16 sc_idx;        // subchunk index
+    int16 rank;          // rank within subdiv permutation
+    int16 recv_dev_idx;  // dev from which value should be recv'd
+    RingFieldAction action;
+    bool second_pass;
+    bool recv_is_remote = false;
+    bool send_is_remote = false;
+    bool do_send = false;   // is the value sent in this pass?
+    bool do_recv = false;   // is the value recv'd in this pass?
+    bool is_final = false;  // is the last field in the pass for this rank
+    Tensor chunk;           // alias to field values
+    Tensor tmp_chunk;
+    Status status;
+    string DebugString() const;
+  };
+  virtual void InitRingField(RingField* rf, int chunk_idx, int subdiv_idx,
+                             int field_idx);
+  void AdvanceToSecondPass(RingField* rf);
+  void DispatchSend(RingField* rf, const StatusCallback& done);
+  void DispatchRecv(RingField* rf, const StatusCallback& done);
+
+  // For constructing log messages for debugging.
+  string FieldState();
+  string TensorDebugString(const Tensor& tensor);
+
+  // Producer/Consumer Queue of RingField structs.
+  class PCQueue {
+   public:
+    void Enqueue(RingField* rf);
+    RingField* Dequeue();
+
+   private:
+    mutex pcq_mu_;
+    condition_variable cv_;
+    int waiter_count_ GUARDED_BY(pcq_mu_) = 0;
+    std::deque<RingField*> deque_ GUARDED_BY(pcq_mu_);
+  };
+
+  const CollectiveType type_;
+  const string name_;
+  CollectiveContext* col_ctx_;          // Not owned
+  const CollectiveParams* col_params_;  // Not owned
+  StatusCallback done_;
+  int group_size_;
+  int num_subdivs_;
+  Tensor group_size_tensor_;
+  Notification group_size_tensor_ready_;
+  std::unique_ptr<CollectiveAdapter> ca_;
+  mutex status_mu_;
+  Status status_ GUARDED_BY(status_mu_);
+  std::vector<RingField> rfv_;
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_RING_ALG_H_
diff --git a/tensorflow/core/common_runtime/ring_gatherer.cc b/tensorflow/core/common_runtime/ring_gatherer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..58251fc171459ee35820c3157c48e4222e9f1ec2
--- /dev/null
+++ b/tensorflow/core/common_runtime/ring_gatherer.cc
@@ -0,0 +1,266 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/ring_gatherer.h"
+
+#include <stdlib.h>
+#include <atomic>
+#include <functional>
+#include <utility>
+
+#include "tensorflow/core/common_runtime/collective_rma_local.h"
+#include "tensorflow/core/common_runtime/collective_util.h"
+#include "tensorflow/core/common_runtime/copy_tensor.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+Status RingGatherer::InitializeCollectiveParams(CollectiveParams* col_params) {
+  DCHECK_EQ(col_params->instance.type, GATHER_COLLECTIVE);
+  DCHECK_EQ(col_params->instance.impl_details.collective_name, "RingGather");
+  // TODO(tucker): Maybe add subdiv support.  It's only useful with
+  // multiple NICS, and maybe gather performance isn't important enough.
+  // For now, there must always be only a single subdiv at offset 0.
+  if (!col_params->instance.impl_details.subdiv_offsets.empty() &&
+      (col_params->instance.impl_details.subdiv_offsets.size() > 1 ||
+       col_params->instance.impl_details.subdiv_offsets[0] != 0)) {
+    return errors::InvalidArgument(
+        "RingGather cannot take any subdiv offset other than 0.");
+  }
+  if (col_params->instance.impl_details.subdiv_offsets.empty()) {
+    col_params->instance.impl_details.subdiv_offsets.push_back(0);
+  }
+  return RingAlg::InitializeCollectiveParams(col_params);
+}
+
+void RingGatherer::Run(StatusCallback done) {
+  DCHECK(col_ctx_);
+  DCHECK(col_params_);
+  done_ = std::move(done);
+  group_size_ = col_params_->group.group_size;
+  num_subdivs_ = static_cast<int>(
+      col_params_->instance.impl_details.subdiv_permutations.size());
+  DCHECK_GT(num_subdivs_, 0);
+
+  if (VLOG_IS_ON(1)) {
+    string buf;
+    for (int r = 0; r < col_params_->instance.device_names.size(); ++r) {
+      strings::StrAppend(&buf, "dev ", r, " : ",
+                         col_params_->instance.device_names[r], "\n");
+    }
+    for (int sd = 0;
+         sd < col_params_->instance.impl_details.subdiv_permutations.size();
+         ++sd) {
+      strings::StrAppend(&buf, "\nsubdiv ", sd, " perm: ");
+      for (auto x :
+           col_params_->instance.impl_details.subdiv_permutations[sd]) {
+        strings::StrAppend(&buf, x, ", ");
+      }
+    }
+    VLOG(1) << "RingGatherer::Run for device " << col_ctx_->device_name
+            << " default_rank " << col_params_->default_rank << "\n"
+            << buf;
+  }
+
+  // Prepare to alias fields within the output.
+  AllocatorAttributes attr = col_ctx_->op_ctx->output_alloc_attr(0);
+  ca_.reset(MakeCollectiveAdapter(col_ctx_->output, group_size_ * num_subdivs_,
+                                  col_ctx_->device->GetAllocator(attr),
+                                  false /*align_chunks*/));
+
+  // Start by copying input to the rank-specific offset of output.
+  // We are running in a blockable thread and the callback can't block so
+  // just wait here on the copy.
+  Notification note;
+  Status status;
+  Tensor alias_chunk(ca_->ChunkAlias(col_params_->subdiv_rank[0]));
+  CollectiveRemoteAccessLocal::MemCpyAsync(
+      col_ctx_->op_ctx->input_device_context(0),
+      col_ctx_->op_ctx->op_device_context(), col_ctx_->device, col_ctx_->device,
+      col_ctx_->op_ctx->input_alloc_attr(0),
+      col_ctx_->op_ctx->output_alloc_attr(0), col_ctx_->input, &alias_chunk,
+      0 /*dev_to_dev_stream_index*/, [&note, &status](const Status& s) {
+        status.Update(s);
+        note.Notify();
+      });
+  note.WaitForNotification();
+  if (!status.ok()) {
+    done_(status);
+    return;
+  }
+  Finish(RunAsyncParts());
+}
+
+bool RingGatherer::RunAsyncParts() {
+  // This function orchestrates RingGatherer actions on behalf of a
+  // single device. It is entered by a blockable thread that
+  // loops within it until all actions assigned to that device
+  // complete. Hence function local variables are accessible only by that
+  // one thread and do not require an explicit mutex.
+  rfv_.clear();
+  rfv_.resize(group_size_ * num_subdivs_);
+  PCQueue ready_queue;
+  for (int chunk_idx = 0; chunk_idx < group_size_; ++chunk_idx) {
+    for (int subdiv_idx = 0; subdiv_idx < num_subdivs_; ++subdiv_idx) {
+      int rf_index = (chunk_idx * num_subdivs_) + subdiv_idx;
+      InitRingField(&rfv_[rf_index], chunk_idx, subdiv_idx, rf_index);
+      ready_queue.Enqueue(&rfv_[rf_index]);
+    }
+  }
+  const DeviceBase::GpuDeviceInfo* gpu_info =
+      col_ctx_->device->tensorflow_gpu_device_info();
+  if (gpu_info) {
+    // Wait for all currently queued events on the CPU compute stream to
+    // complete before proceeding.  The previous InitRingField calls allocated
+    // temp memory buffers that are not guaranteed to be valid (e.g. for RDMA
+    // write) unless we do.
+    Notification note;
+    Status s = gpu_info->default_context->ThenExecute(
+        col_ctx_->device, gpu_info->stream, [&note]() { note.Notify(); });
+    if (s.ok()) {
+      note.WaitForNotification();
+    } else {
+      mutex_lock l(status_mu_);
+      status_ =
+          errors::Internal("Failed to dispatch ThenExecute in RingGatherer");
+      return false;
+    }
+  }
+
+  int field_done_count = 0;
+  int send_pending_count = 0;
+  int recv_pending_count = 0;
+  std::atomic<bool> aborted(false);
+
+  // Loop until all RingFields have advanced to completion.
+  while (field_done_count < rfv_.size()) {
+    VLOG(4) << FieldState();
+    // Wait for a RingField to appear in the ready_queue.
+    RingField* rf = ready_queue.Dequeue();
+    // Advance the RingField to its next action and execute, repeating
+    // until either an async action has been started or the RingField
+    // is done.
+    bool dispatched = false;  // true if async action was initiated
+    do {
+      if (aborted) {
+        // Requeue this RingField to be counted off below.
+        ready_queue.Enqueue(rf);
+        break;
+      }
+      switch (rf->action) {
+        case RF_INIT:
+          if (rf->do_recv) {
+            rf->action = RF_RECV;
+            auto requeue = [this, rf, &ready_queue, &aborted](Status s) {
+              if (!s.ok()) {
+                aborted = true;
+                StartAbort(s);
+              }
+              ready_queue.Enqueue(rf);
+            };
+            DispatchRecv(rf, requeue);
+            dispatched = true;
+            ++recv_pending_count;
+          } else {
+            rf->action = RF_SEND_READY;
+          }
+          break;
+        case RF_RECV:
+          DCHECK_GT(recv_pending_count, 0);
+          --recv_pending_count;
+          rf->action = RF_SEND_READY;
+          break;
+        case RF_REDUCE:
+          // Never used for Gather, so just fall through.
+          TF_FALLTHROUGH_INTENDED;
+        case RF_FINALIZE:
+          // Never used for Gather, so just fall through.
+          TF_FALLTHROUGH_INTENDED;
+        case RF_SEND_READY:
+          if (rf->do_send) {
+            rf->action = RF_SEND;
+            auto send_complete = [this, rf, &ready_queue, &aborted](Status s) {
+              if (!s.ok()) {
+                aborted = true;
+                StartAbort(s);
+              }
+              ready_queue.Enqueue(rf);
+            };
+            DispatchSend(rf, send_complete);
+            dispatched = true;
+            ++send_pending_count;
+          } else {
+            rf->action = RF_DONE;
+          }
+          break;
+        case RF_SEND:
+          DCHECK_GT(send_pending_count, 0);
+          --send_pending_count;
+          rf->action = RF_DONE;
+          break;
+        case RF_DONE:
+          break;
+      }
+      if (rf->action == RF_DONE) {
+        // There's only one pass.
+        ++field_done_count;
+        break;  // from do while(!dispatched)
+      }
+    } while (!dispatched);
+    if (aborted) break;
+  }  // while (field_done_count < number of fields)
+
+  if (aborted) {
+    // All of the pending data actions should be aborted; field the
+    // callbacks and clear the queue before quitting.
+    while ((send_pending_count > 0) || (recv_pending_count > 0)) {
+      RingField* rf = ready_queue.Dequeue();
+      switch (rf->action) {
+        case RF_RECV:
+          --recv_pending_count;
+          break;
+        case RF_SEND:
+          --send_pending_count;
+          break;
+        default: {
+        }  // Ignore any other actions
+      }
+    }
+  }
+
+  DCHECK_EQ(send_pending_count, 0);
+  DCHECK_EQ(recv_pending_count, 0);
+
+  VLOG(2) << this << " device=" << col_ctx_->device_name << " finish;"
+          << " final value " << TensorDebugString(ca_->Value());
+  return !aborted;
+}
+
+REGISTER_COLLECTIVE(RingGather, RingGatherer);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/ring_gatherer.h b/tensorflow/core/common_runtime/ring_gatherer.h
new file mode 100644
index 0000000000000000000000000000000000000000..ee9634834d2b6c9d986cfb1841ae03c51e22564b
--- /dev/null
+++ b/tensorflow/core/common_runtime/ring_gatherer.h
@@ -0,0 +1,51 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_RING_GATHERER_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_RING_GATHERER_H_
+
+#include <deque>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/base_collective_executor.h"
+#include "tensorflow/core/common_runtime/ring_alg.h"
+#include "tensorflow/core/framework/collective.h"
+
+namespace tensorflow {
+class Device;
+
+// Ring-algorithm implementation of collective all-gather.
+class RingGatherer : public RingAlg {
+ public:
+  RingGatherer() : RingAlg(GATHER_COLLECTIVE, "Gather") {}
+  ~RingGatherer() override {}
+
+  Status InitializeCollectiveParams(CollectiveParams* col_params) override;
+
+  // Begins async execution of the ring gather algorithm.
+  // Must be called in a blockable thread.
+  // TODO(b/80529858): remove the previous warning when we have a dedicated
+  // collective threadpool.
+  void Run(StatusCallback done) override;
+
+ private:
+  bool RunAsyncParts();
+
+  friend class RingGathererTest;
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_RING_GATHERER_H_
diff --git a/tensorflow/core/common_runtime/ring_gatherer_test.cc b/tensorflow/core/common_runtime/ring_gatherer_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..97ff7b58fa700d72bde145c0cb789228cf163cc6
--- /dev/null
+++ b/tensorflow/core/common_runtime/ring_gatherer_test.cc
@@ -0,0 +1,651 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/ring_gatherer.h"
+
+#include <algorithm>
+#include "absl/memory/memory.h"
+#include "tensorflow/core/common_runtime/base_collective_executor.h"
+#include "tensorflow/core/common_runtime/collective_rma_local.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/device_resolver_local.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/common_runtime/test_collective_executor_mgr.h"
+#include "tensorflow/core/common_runtime/threadpool_device.h"
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/public/version.h"
+
+namespace tensorflow {
+
+// Wraps CollectiveRemoteAccessLocal with the ability to return an
+// error status to the N'th action.
+class FailTestRMA : public CollectiveRemoteAccessLocal {
+ public:
+  FailTestRMA(const DeviceMgr* dev_mgr, DeviceResolverInterface* dev_resolver,
+              int64 step_id, int fail_after)
+      : CollectiveRemoteAccessLocal(dev_mgr, dev_resolver, step_id),
+        fail_after_(fail_after) {}
+
+  bool MaybeFail(const StatusCallback& done) {
+    bool fail_now = false;
+    {
+      mutex_lock l(mu_);
+      if (fail_after_ > 0) {
+        fail_now = (--fail_after_ == 0);
+      }
+    }
+    if (fail_now) {
+      done(errors::Internal("Deliberate failure"));
+      return true;
+    }
+    return false;
+  }
+
+  void RecvFromPeer(const string& peer_device, const string& peer_task,
+                    bool peer_is_local, const string& key, Device* to_device,
+                    DeviceContext* to_device_ctx,
+                    const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor,
+                    const DeviceLocality& client_locality,
+                    int dev_to_dev_stream_index,
+                    const StatusCallback& done) override {
+    if (MaybeFail(done)) return;
+    CollectiveRemoteAccessLocal::RecvFromPeer(
+        peer_device, peer_task, peer_is_local, key, to_device, to_device_ctx,
+        to_alloc_attr, to_tensor, client_locality, dev_to_dev_stream_index,
+        done);
+  }
+
+  void PostToPeer(const string& peer_device, const string& peer_task,
+                  const string& key, Device* from_device,
+                  DeviceContext* from_device_ctx,
+                  const AllocatorAttributes& from_alloc_attr,
+                  const Tensor* from_tensor,
+                  const DeviceLocality& client_locality,
+                  const StatusCallback& done) override {
+    if (MaybeFail(done)) return;
+    CollectiveRemoteAccessLocal::PostToPeer(
+        peer_device, peer_task, key, from_device, from_device_ctx,
+        from_alloc_attr, from_tensor, client_locality, done);
+  }
+
+  mutex mu_;
+  int fail_after_ GUARDED_BY(mu_);
+};
+
+std::unique_ptr<OpKernel> GetKernel(const NodeDef& node,
+                                    const DeviceType& device_type,
+                                    DeviceBase* device) {
+  Status status;
+  std::unique_ptr<OpKernel> k = CreateOpKernel(
+      device_type, device, device->GetAllocator(AllocatorAttributes()), node,
+      TF_GRAPH_DEF_VERSION, &status);
+  if (!status.ok()) {
+    LOG(FATAL) << status;
+  }
+  return k;
+}
+
+static int64 kStepId = 123;
+
+class RingGathererTest : public ::testing::Test {
+ protected:
+  RingGathererTest() : device_type_(DEVICE_CPU) {}
+
+#ifdef GOOGLE_CUDA
+  void InitGPUDevices() {
+    auto device_factory = DeviceFactory::GetFactory("GPU");
+    CHECK(device_factory);
+    SessionOptions options;
+    Status s = device_factory->CreateDevices(
+        options, "/job:worker/replica:0/task:0", &gpu_devices_);
+    CHECK(s.ok());
+  }
+#endif
+
+  ~RingGathererTest() override {
+    stop_ = true;
+    for (auto i : instances_) delete i;
+    if (col_exec_) col_exec_->Unref();
+  }
+
+  void Init(int num_workers, int num_devices, DataType dtype,
+            const DeviceType& device_type, int num_subdivs, int fail_after) {
+#ifdef GOOGLE_CUDA
+    InitGPUDevices();
+#endif
+    device_type_ = device_type;
+    std::vector<std::unique_ptr<Device>> local_devices;
+    SessionOptions sess_opts;
+    sess_opts.env = Env::Default();
+    Bytes mem_limit(4 << 20);
+    DeviceLocality dev_locality;
+    for (int wi = 0; wi < num_workers; ++wi) {
+      for (int di = 0; di < num_devices; ++di) {
+        if (device_type == DEVICE_CPU) {
+          string dev_name =
+              strings::StrCat("/job:worker/replica:0/task:", wi, "/cpu:", di);
+          local_devices.push_back(absl::make_unique<ThreadPoolDevice>(
+              sess_opts, dev_name, mem_limit, dev_locality, cpu_allocator()));
+        } else if (device_type == DEVICE_GPU && !gpu_devices_.empty()) {
+          int dev_idx = (wi * num_devices) + di;
+          if (dev_idx >= static_cast<int>(gpu_devices_.size())) {
+            LOG(INFO) << "dev_mgr has access to limited GPUs, reusing for more "
+                         "than one ring node.";
+          } else {
+            local_devices.push_back(std::move(gpu_devices_[dev_idx]));
+          }
+        } else {
+          LOG(FATAL) << "Unsupported device_type " << device_type;
+        }
+      }
+    }
+    if (!dev_mgr_ || device_type == DEVICE_CPU) {
+      LOG(ERROR) << "resetting dev_mgr for " << local_devices.size()
+                 << " devices: ";
+      dev_mgr_.reset(new DeviceMgr(std::move(local_devices)));
+    }
+    if (!gpu_ring_order_) gpu_ring_order_.reset(new string());
+    dev_resolver_.reset(new DeviceResolverLocal(dev_mgr_.get()));
+    rma_ = new FailTestRMA(dev_mgr_.get(), dev_resolver_.get(), kStepId,
+                           fail_after);
+    col_exec_ = new BaseCollectiveExecutor(
+        &col_exec_mgr_, rma_, kStepId, dev_mgr_.get(), gpu_ring_order_.get());
+    col_params_.name = "test_collective";
+    static const int kGroupKey = 5;
+    col_params_.group.group_key = kGroupKey;
+    col_params_.group.device_type = device_type;
+    col_params_.group.group_size = num_workers * num_devices;
+    static const int kInstanceKey = 17;
+    col_params_.instance.instance_key = kInstanceKey;
+    col_params_.instance.impl_details.subdiv_offsets.clear();
+    col_params_.instance.type = GATHER_COLLECTIVE;
+    col_params_.instance.impl_details.collective_name = "RingGather";
+    col_params_.instance.data_type = dtype;
+    col_params_.instance.impl_details.subdiv_permutations.resize(num_subdivs);
+    col_params_.subdiv_rank.resize(num_subdivs);
+    int subdiv_stride = num_devices / num_subdivs;
+    for (int sdi = 0; sdi < num_subdivs; ++sdi) {
+      col_params_.instance.impl_details.subdiv_offsets.push_back(sdi *
+                                                                 subdiv_stride);
+      col_params_.subdiv_rank[sdi] = sdi * subdiv_stride;
+    }
+
+    // Set up a local device ring order that's not just 0,1,2...
+    std::vector<int> local_ring_order;
+    for (int di = 0; di < num_devices; ++di) {
+      local_ring_order.push_back(di);
+    }
+    for (int di = 0; di < num_devices; ++di) {
+      bool is_odd = ((di % 2) == 1);
+      int other = (di + (is_odd ? 7 : 3)) % num_devices;
+      if (di == other) continue;
+      iter_swap(local_ring_order.begin() + di,
+                local_ring_order.begin() + other);
+    }
+    string lro_buf;
+    for (auto d : local_ring_order) strings::StrAppend(&lro_buf, d, ", ");
+    VLOG(1) << "local_ring_order " << lro_buf;
+
+    // Set up all of the fake device contexts.
+    for (int wi = 0; wi < num_workers; ++wi) {
+      for (int di = 0; di < num_devices; ++di) {
+        string task_name = strings::StrCat("/job:worker/replica:0/task:", wi);
+        string dev_name = strings::StrCat(task_name, "/cpu:", di);
+        if (device_type == DEVICE_GPU) {
+          dev_name =
+              strings::StrCat(task_name, "/gpu:", di % gpu_devices_.size());
+        }
+        col_params_.instance.device_names.push_back(dev_name);
+        col_params_.instance.task_names.push_back(task_name);
+        // Normally each device would set is_local to its own perspective but
+        // this test runs in a single process so is_local is always true.
+        col_params_.task.is_local.push_back(true);
+        for (int sdi = 0; sdi < num_subdivs; ++sdi) {
+          int rotated_di =
+              (di + col_params_.instance.impl_details.subdiv_offsets[sdi]) %
+              num_devices;
+          col_params_.instance.impl_details.subdiv_permutations[sdi].push_back(
+              wi * num_devices + local_ring_order[rotated_di]);
+        }
+      }
+    }
+    for (int wi = 0; wi < num_workers; ++wi) {
+      for (int di = 0; di < num_devices; ++di) {
+        int rank = wi * num_devices + di;
+        instances_.push_back(new DeviceInstance(
+            rank, col_params_.instance.device_names[rank], device_type_, this));
+      }
+    }
+  }
+
+  void Gather(int fail_after) {
+    std::atomic<int> done(0);
+    for (auto di : instances_) {
+      SchedClosure([di, &done] {
+        di->DoGather();
+        ++done;
+      });
+      if (fail_after > 0) {
+        // Stagger the op execution starts.
+        Env::Default()->SleepForMicroseconds(100);
+      }
+    }
+    while (done < static_cast<int>(instances_.size())) {
+      if (stop_) break;
+      Env::Default()->SleepForMicroseconds(1000);
+    }
+  }
+
+  template <typename T>
+  void RunTest(DataType dtype, const DeviceType& device_type, int num_workers,
+               int num_devices, int num_subdivs, int tensor_len,
+               int fail_after) {
+    Init(num_workers, num_devices, dtype, device_type, num_subdivs, fail_after);
+    int32 output_len = tensor_len * num_workers * num_devices;
+    std::vector<T> expected(output_len, 0.0);
+    for (int di = 0; di < static_cast<int>(instances_.size()); ++di) {
+      DeviceInstance* instance = instances_[di];
+      int32 instance_offset = di * tensor_len;
+      instance->InitTensor(dtype, TensorShape({tensor_len}),
+                           [instance_offset, &expected, dtype, di](Tensor* t) {
+                             for (size_t i = 0; i < t->NumElements(); ++i) {
+                               // The cast is necessary to prevent clang-tidy
+                               // from insisting that a faster non-open source
+                               // function be substituted.
+                               float value =
+                                   pow(10, static_cast<double>(di)) * i;
+                               if (dtype == DT_INT32 || dtype == DT_INT64) {
+                                 value = di * 10 + i;
+                               }
+                               t->flat<T>()(i) = static_cast<T>(value);
+                               expected[instance_offset + i] = value;
+                             }
+                           });
+    }
+    Gather(fail_after);
+    if (fail_after > 0) {
+      // Confirm that every device terminated with the expected error status.
+      for (int di = 0; di < static_cast<int>(instances_.size()); ++di) {
+        EXPECT_EQ("Deliberate failure",
+                  instances_[di]->status_.error_message());
+      }
+    } else {
+      // Confirm that every device accumulated the same set of correct
+      // values.
+      for (int di = 0; di < static_cast<int>(instances_.size()); ++di) {
+        TF_EXPECT_OK(instances_[di]->status_);
+        Tensor* inst = &instances_[di]->output_tensor_;
+        CHECK(inst);
+        Tensor actual(dtype, TensorShape({output_len}));
+        if (device_type_ == DEVICE_CPU) {
+          CHECK(actual.CopyFrom(*inst, inst->shape()));
+          VLOG(1) << "actual " << actual.SummarizeValue(100);
+        } else if (device_type_ == DEVICE_GPU) {
+          Notification note;
+          Device* dev = instances_[di]->device_;
+          auto* dev_info = dev->tensorflow_gpu_device_info();
+          CHECK(dev_info);
+          dev_info->default_context->CopyDeviceTensorToCPU(
+              inst, "" /*tensor_name*/, dev, &actual, [&note](const Status& s) {
+                CHECK(s.ok());
+                note.Notify();
+              });
+          note.WaitForNotification();
+        }
+
+        auto alias = actual.template unaligned_flat<T>();
+        for (int i = 0; i < output_len; ++i) {
+          switch (dtype) {
+            case DT_FLOAT:
+              EXPECT_FLOAT_EQ(expected[i], alias(i))
+                  << "Mismatch at device " << di << " index " << i;
+              break;
+            case DT_DOUBLE:
+              EXPECT_DOUBLE_EQ(expected[i], alias(i))
+                  << "Mismatch at device " << di << " index " << i;
+              break;
+            case DT_INT32:
+            case DT_INT64:
+              EXPECT_EQ(expected[i], alias(i))
+                  << "Mismatch at device " << di << " index " << i;
+              break;
+            default:
+              LOG(FATAL) << "unimplemented";
+          }
+        }
+      }
+    }
+  }
+
+  std::unique_ptr<OpKernel> GetCollectiveGather(const CollectiveParams& params,
+                                                Tensor* input,
+                                                const DeviceType& device_type,
+                                                DeviceBase* device) {
+    mutex_lock l(mu_);
+    NodeDef node_def;
+    NodeDefBuilder builder(
+        strings::StrCat("collective_gather_", gather_counter_++),
+        "CollectiveGather");
+    TF_CHECK_OK(builder.Attr("T", params.instance.data_type)
+                    .Attr("group_size", params.group.group_size)
+                    .Attr("group_key", params.group.group_key)
+                    .Attr("instance_key", params.instance.instance_key)
+                    .Attr("shape", params.instance.shape)
+                    .Input(FakeInput(params.instance.data_type))
+                    .Finalize(&node_def));
+    return GetKernel(node_def, device_type, device);
+  }
+
+  void RunSubdivPermsTest(
+      CollectiveParams* cp,
+      const std::vector<std::vector<int>>& expected_subdiv_perms,
+      const std::vector<int>& expected_subdiv_rank) {
+    col_exec_ = nullptr;
+    cp->instance.impl_details.subdiv_permutations.clear();
+    cp->subdiv_rank.clear();
+    // Create a stub ring gatherer only for testing param initialization.
+    RingGatherer gatherer;
+    TF_CHECK_OK(gatherer.InitializeCollectiveParams(cp));
+    EXPECT_EQ(expected_subdiv_perms,
+              cp->instance.impl_details.subdiv_permutations);
+    EXPECT_EQ(expected_subdiv_rank, cp->subdiv_rank);
+  }
+
+  class DeviceInstance {
+   public:
+    DeviceInstance(int rank, const string& dev_name,
+                   const DeviceType& device_type, RingGathererTest* parent)
+        : parent_(parent),
+          dev_name_(dev_name),
+          device_type_(device_type),
+          rank_(rank) {
+      TF_CHECK_OK(parent_->dev_mgr_->LookupDevice(dev_name, &device_))
+          << "Couldn't find device " << dev_name
+          << " existing devices: " << parent_->dev_mgr_->DebugString();
+      col_params_.name = parent_->col_params_.name;
+      col_params_.group.group_key = parent_->col_params_.group.group_key;
+      col_params_.group.device_type = parent_->col_params_.group.device_type;
+      col_params_.group.group_size = parent_->col_params_.group.group_size;
+      col_params_.instance = parent->col_params_.instance;
+      col_params_.task.is_local = parent_->col_params_.task.is_local;
+      col_params_.subdiv_rank = parent_->col_params_.subdiv_rank;
+
+      int num_subdivs = static_cast<int>(col_params_.subdiv_rank.size());
+      int group_size = col_params_.group.group_size;
+      CHECK_EQ(group_size,
+               static_cast<int>(col_params_.instance.device_names.size()));
+      // Id of this device is at rank position in first subdiv perm.
+      int my_device_id =
+          col_params_.instance.impl_details.subdiv_permutations[0][rank];
+      col_params_.default_rank = my_device_id;
+      // Set rank for all other subdivs by finding that device_id.
+      for (int sdi = 0; sdi < num_subdivs; ++sdi) {
+        for (int r = 0; r < static_cast<int>(col_params_.instance.impl_details
+                                                 .subdiv_permutations[sdi]
+                                                 .size());
+             ++r) {
+          if (my_device_id ==
+              col_params_.instance.impl_details.subdiv_permutations[sdi][r]) {
+            col_params_.subdiv_rank[sdi] = r;
+            break;
+          }
+        }
+      }
+    }
+
+    void InitTensor(DataType dtype, const TensorShape& shape,
+                    const std::function<void(Tensor*)>& init_f) {
+      input_tensor_ =
+          Tensor(device_->GetAllocator(AllocatorAttributes()), dtype, shape);
+      if (device_type_ == DEVICE_CPU) {
+        init_f(&input_tensor_);
+      } else if (device_type_ == DEVICE_GPU) {
+        Tensor cpu_tensor(dtype, shape);
+        init_f(&cpu_tensor);
+        auto* dev_info = device_->tensorflow_gpu_device_info();
+        CHECK(dev_info);
+        Notification note;
+        dev_info->default_context->CopyCPUTensorToDevice(
+            &cpu_tensor, device_, &input_tensor_, [&note](const Status& s) {
+              CHECK(s.ok());
+              note.Notify();
+            });
+        note.WaitForNotification();
+      } else {
+        LOG(FATAL) << "Unsupported device_type " << device_type_;
+      }
+    }
+
+    void DoGather() {
+      // Prepare an OpKernelContext.
+      OpKernelContext::Params op_params;
+      op_params.step_id = kStepId;
+      op_params.device = device_;
+      gtl::InlinedVector<TensorValue, 4> inputs;
+      inputs.push_back(TensorValue(&input_tensor_));
+      op_params.inputs = &inputs;
+      gtl::InlinedVector<AllocatorAttributes, 4> input_aa(
+          {AllocatorAttributes()});
+      op_params.input_alloc_attrs = &input_aa;
+      gtl::InlinedVector<DeviceContext*, 4> input_dc;
+      DeviceContext* dev_ctx = nullptr;
+      auto* dev_info = device_->tensorflow_gpu_device_info();
+      if (dev_info) {
+        dev_ctx = dev_info->default_context;
+        dev_ctx->Ref();
+      } else {
+        dev_ctx = new DeviceContext;
+      }
+      input_dc.push_back(dev_ctx);
+      op_params.input_device_contexts = &input_dc;
+      op_params.op_device_context = dev_ctx;
+      AllocatorAttributes generic_alloc_attr;
+      op_params.output_attr_array = &generic_alloc_attr;
+      std::unique_ptr<OpKernel> op = parent_->GetCollectiveGather(
+          col_params_, &input_tensor_, DEVICE_CPU, device_);
+      op_params.op_kernel = op.get();
+      OpKernelContext ctx(&op_params, 1);
+
+      // We never actually execute the kernel, so we need to do the output
+      // allocation it would do, ourselves.
+      Tensor* output_tensor_ptr = nullptr;
+      TensorShape output_shape({static_cast<int64>(
+          parent_->instances_.size() * input_tensor_.shape().num_elements())});
+      TF_CHECK_OK(ctx.forward_input_or_allocate_output({0}, 0, output_shape,
+                                                       &output_tensor_ptr));
+      CHECK_EQ(output_tensor_ptr, ctx.mutable_output(0));
+      // Prepare a RingGatherer instance.
+      string exec_key =
+          strings::StrCat(col_params_.instance.instance_key, ":0:0");
+      RingGatherer gatherer;
+      CollectiveContext col_ctx(parent_->col_exec_, parent_->dev_mgr_.get(),
+                                &ctx, &op_params, col_params_, exec_key,
+                                kStepId, &input_tensor_, output_tensor_ptr);
+      TF_CHECK_OK(gatherer.InitializeCollectiveContext(&col_ctx));
+
+      // Run the all-gather.
+      gatherer.Run([this](Status s) { status_ = s; });
+      if (status_.ok()) {
+        CHECK(output_tensor_.CopyFrom(*ctx.mutable_output(0),
+                                      ctx.mutable_output(0)->shape()));
+      }
+
+      dev_ctx->Unref();
+    }
+
+    const Tensor& input_tensor() { return input_tensor_; }
+    const Tensor& output_tensor() { return output_tensor_; }
+
+    RingGathererTest* parent_;
+    string dev_name_;
+    DeviceType device_type_;
+    int rank_;
+    Tensor input_tensor_;
+    Tensor output_tensor_;
+    Device* device_;
+    CollectiveParams col_params_;
+    std::unique_ptr<CollectiveAdapter> ca_;
+    std::unique_ptr<OpKernelContext> ctx_;
+    Status status_;
+  };
+
+  bool stop_ = false;
+  DeviceType device_type_;
+  TestCollectiveExecutorMgr col_exec_mgr_;
+  CollectiveExecutor* col_exec_;
+  CollectiveRemoteAccessLocal* rma_;
+  std::unique_ptr<DeviceResolverLocal> dev_resolver_;
+  std::vector<DeviceInstance*> instances_;
+  CollectiveParams col_params_;
+  std::vector<std::unique_ptr<tensorflow::Device>> gpu_devices_;
+  std::unique_ptr<tensorflow::DeviceMgr> dev_mgr_;
+  std::unique_ptr<string> gpu_ring_order_;
+  mutex mu_;
+  int32 gather_counter_ GUARDED_BY(mu_) = 0;
+};
+
+CollectiveParams SetUpCollectiveParams(const int num_devs_per_task,
+                                       const int num_tasks) {
+  CollectiveParams cp;
+  const int kNumDevs = num_devs_per_task * num_tasks;
+  cp.group.group_key = 1;
+  cp.group.group_size = kNumDevs;
+  cp.group.device_type = DeviceType("GPU");
+  cp.group.num_tasks = num_tasks;
+  cp.instance.instance_key = 3;
+  cp.instance.type = GATHER_COLLECTIVE;
+  cp.instance.data_type = DataType(DT_FLOAT);
+  cp.instance.shape = TensorShape({kNumDevs * kNumDevs});
+  cp.instance.impl_details.collective_name = "RingGather";
+  cp.instance.impl_details.subdiv_offsets.push_back(0);
+  cp.is_source = false;
+  for (int i = 0; i < kNumDevs; ++i) {
+    int task_id = i / num_devs_per_task;
+    int dev_id = i % num_devs_per_task;
+    string task_name = strings::StrCat("/job:worker/replica:0/task:", task_id);
+    string device_name = strings::StrCat(task_name, "/device:GPU:", dev_id);
+    cp.instance.task_names.push_back(task_name);
+    cp.instance.device_names.push_back(device_name);
+  }
+  return cp;
+}
+
+TEST_F(RingGathererTest, InitializeParams) {
+  const int kNumDevsPerTask = 8;
+  const int kNumTasks = 3;
+  CollectiveParams cp = SetUpCollectiveParams(kNumDevsPerTask, kNumTasks);
+
+  cp.default_rank = 0;
+  cp.instance.impl_details.subdiv_offsets = {};
+  RunSubdivPermsTest(&cp, {{0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
+                            12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}},
+                     {0});
+
+  cp.instance.impl_details.subdiv_offsets = {0};
+  RunSubdivPermsTest(&cp, {{0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
+                            12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}},
+                     {0});
+
+  cp.default_rank = 3;
+  cp.instance.impl_details.subdiv_offsets = {};
+  RunSubdivPermsTest(&cp, {{0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
+                            12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}},
+                     {3});
+}
+
+// TODO(b/113171733): change to use TEST_P.
+#define DEF_TEST(B, T, W, D, S, L, A)                                         \
+  TEST_F(RingGathererTest,                                                    \
+         DaTy##B##_DevTy##T##_Wkr##W##_Dev##D##_Sdiv##S##_Len##L##_Abrt##A) { \
+    DataType dtype = DT_##B;                                                  \
+    switch (dtype) {                                                          \
+      case DT_FLOAT: {                                                        \
+        RunTest<float>(dtype, DEVICE_##T, W, D, S, L, A);                     \
+      } break;                                                                \
+      case DT_DOUBLE: {                                                       \
+        RunTest<double>(dtype, DEVICE_##T, W, D, S, L, A);                    \
+      } break;                                                                \
+      case DT_INT32: {                                                        \
+        RunTest<int32>(dtype, DEVICE_##T, W, D, S, L, A);                     \
+      } break;                                                                \
+      case DT_INT64: {                                                        \
+        RunTest<int64>(dtype, DEVICE_##T, W, D, S, L, A);                     \
+      } break;                                                                \
+      default:                                                                \
+        LOG(FATAL) << "Unimplemented";                                        \
+    }                                                                         \
+  }
+
+#ifndef GOOGLE_CUDA
+// Success tests
+DEF_TEST(FLOAT, CPU, 1, 2, 1, 1, 0)
+DEF_TEST(FLOAT, CPU, 1, 2, 1, 2, 0)
+DEF_TEST(FLOAT, CPU, 1, 2, 1, 8, 0)
+DEF_TEST(FLOAT, CPU, 1, 2, 1, 16, 0)
+DEF_TEST(FLOAT, CPU, 1, 2, 1, 1001, 0)
+DEF_TEST(FLOAT, CPU, 2, 4, 1, 128, 0)
+DEF_TEST(FLOAT, CPU, 2, 8, 1, 1001, 0)
+DEF_TEST(FLOAT, CPU, 2, 8, 1, 4096, 0)
+DEF_TEST(FLOAT, CPU, 2, 8, 1, 9408, 0)
+DEF_TEST(FLOAT, CPU, 4, 4, 1, 32768, 0)
+DEF_TEST(DOUBLE, CPU, 1, 2, 1, 1001, 0)
+DEF_TEST(DOUBLE, CPU, 2, 8, 1, 4095, 0)
+DEF_TEST(INT32, CPU, 1, 2, 1, 1001, 0)
+DEF_TEST(INT32, CPU, 2, 8, 1, 4095, 0)
+DEF_TEST(INT64, CPU, 1, 2, 1, 1001, 0)
+DEF_TEST(INT64, CPU, 2, 8, 1, 4095, 0)
+
+// Failure tests
+DEF_TEST(FLOAT, CPU, 2, 8, 1, 9408, 1)
+DEF_TEST(FLOAT, CPU, 2, 8, 1, 9408, 7)
+DEF_TEST(FLOAT, CPU, 2, 8, 1, 9408, 11)
+#endif
+
+#ifdef GOOGLE_CUDA
+// GPU tests.  So long as the device names are all in a single tasks we
+// bypass inter-worker routing code and can fake multiple GPUs with a single
+// GPU, from the perspective of the RingGatherer logic.  So these tests
+// are all single-worker.
+DEF_TEST(FLOAT, GPU, 1, 2, 1, 1, 0)
+DEF_TEST(FLOAT, GPU, 1, 2, 1, 2, 0)
+DEF_TEST(FLOAT, GPU, 1, 2, 1, 8, 0)
+DEF_TEST(FLOAT, GPU, 1, 2, 1, 16, 0)
+DEF_TEST(FLOAT, GPU, 1, 2, 1, 1001, 0)
+DEF_TEST(FLOAT, GPU, 1, 8, 1, 1001, 0)
+DEF_TEST(FLOAT, GPU, 1, 8, 1, 4096, 0)
+DEF_TEST(FLOAT, GPU, 1, 8, 1, 4095, 0)
+DEF_TEST(FLOAT, GPU, 1, 8, 1, 32768, 0)
+DEF_TEST(FLOAT, GPU, 1, 4, 1, 32768, 0)
+DEF_TEST(DOUBLE, GPU, 1, 2, 1, 1001, 0)
+// INT32 values are never on the GPU.
+// DEF_TEST(INT32, GPU, 1, 1, 1, 1001, 0)
+DEF_TEST(INT64, GPU, 1, 2, 1, 1001, 0)
+
+// Failure tests
+DEF_TEST(FLOAT, GPU, 1, 8, 1, 9408, 2)
+DEF_TEST(FLOAT, GPU, 1, 8, 1, 9408, 5)
+#endif
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/ring_reducer.cc b/tensorflow/core/common_runtime/ring_reducer.cc
index 092f15e49e330de21452e0f7b4d8cc51607a44ed..3328804cdfb00ecbbc473add3984b414add06b1e 100644
--- a/tensorflow/core/common_runtime/ring_reducer.cc
+++ b/tensorflow/core/common_runtime/ring_reducer.cc
@@ -39,212 +39,15 @@ limitations under the License.
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/types.h"
 
-// Set true for greater intelligibility of debug mode log messages.
-#define READABLE_KEYS false
-// RingReduce algorithm exchanges chunks of tensor between devices.  The chunk
-// size depends on the number of subdivisions specified in the algorithm.  If
-// the user does not specify the number of subdivisions, we infer the number
-// dynamically so that the resulting chunk size does not exceed
-// kMaxChunkSizeBytes, empirically set at 4 MiB.
-constexpr size_t kMaxChunkSizeBytes = (4 * 1024 * 1024);
-// kMaxSubdivsPerDev is used to give an upper bound on the number of
-// subdivisions dynamically generated.  A reasonable value would be a small
-// multiple of the number of NICs adjacent to each device.
-constexpr int kMaxSubdivsPerDevice = 2;
-
 namespace tensorflow {
-namespace {
-// Each CollectiveOp implementation is free to define its own
-// BufRendezvous key format.  This function produces the key used by
-// RingReducer.
-string RingReduceBufKey(const string& exec_key, int pass, int section,
-                        int source_rank) {
-  if (READABLE_KEYS) {
-    return strings::StrCat("rred(", exec_key, "):pass(", pass, "):section(",
-                           section, "):srcrank(", source_rank, ")");
-  } else {
-    // TODO(b/78352018): Try out some kind of denser encoding, e.g. 128 bit
-    // hash.
-    return strings::StrCat(exec_key, ":", pass, ":", section, ":", source_rank);
-  }
-}
-
-}  // namespace
-
-void RingReducer::PCQueue::Enqueue(RingField* rf) {
-  mutex_lock l(pcq_mu_);
-  deque_.push_back(rf);
-  if (waiter_count_ > 0) {
-    cv_.notify_one();
-  }
-}
-
-RingReducer::RingField* RingReducer::PCQueue::Dequeue() {
-  mutex_lock l(pcq_mu_);
-  if (deque_.empty()) {
-    ++waiter_count_;
-    while (deque_.empty()) {
-      cv_.wait(l);
-    }
-    --waiter_count_;
-  }
-  RingField* rf = deque_.front();
-  deque_.pop_front();
-  return rf;
-}
-
-RingReducer::RingReducer()
-    : col_ctx_(nullptr),
-      col_params_(nullptr),
-      done_(nullptr),
-      group_size_(-1),
-      num_subdivs_(-1) {}
 
 RingReducer::~RingReducer() { group_size_tensor_ready_.WaitForNotification(); }
 
-Status GenerateSubdivsInCollectiveParams(CollectiveParams* col_params) {
-  if (col_params->instance.shape.num_elements() == 0) {
-    return errors::Internal("shape in CollectiveParams should be non-empty");
-  }
-  const int kAvgDevPerTask =
-      col_params->group.group_size / col_params->group.num_tasks;
-  const int kMaxNumSubdivs = kMaxSubdivsPerDevice * kAvgDevPerTask;
-  if (kMaxNumSubdivs <= 0) {
-    return errors::Internal("Unexpected kMaxNumSubdivs ", kMaxNumSubdivs,
-                            " in RingReducer");
-  }
-  // NOTE(ayushd): If no subdiv_offsets have been specified, dynamically add
-  // as many offsets as needed so that the size of tensor chunks <=
-  // kMaxChunkSizeBytes.  Empirically, chunks that are too small or too large
-  // lead to worse performance.
-  int num_subdivs = 0;
-  const size_t tensor_size = col_params->instance.shape.num_elements() *
-                             DataTypeSize(col_params->instance.data_type);
-  size_t chunk_size;
-  do {
-    ++num_subdivs;
-    int num_chunks = col_params->group.group_size * num_subdivs;
-    chunk_size = tensor_size / num_chunks;
-    VLOG(2) << "num_subdivs " << num_subdivs << " num_chunks " << num_chunks
-            << " chunk_size " << chunk_size;
-  } while (chunk_size > kMaxChunkSizeBytes && num_subdivs < kMaxNumSubdivs);
-  if (num_subdivs <= 0) {
-    return errors::Internal("Unexpected num_subdivs ", num_subdivs,
-                            " in RingReducer");
-  }
-
-  int subdiv_stride = kAvgDevPerTask / num_subdivs;
-  if (subdiv_stride == 0) subdiv_stride = 1;
-  col_params->instance.impl_details.subdiv_offsets.reserve(num_subdivs);
-  for (int sdi = 0; sdi < num_subdivs; ++sdi) {
-    int subdiv_offset = subdiv_stride * sdi;
-    if (sdi % 2 == 1) subdiv_offset *= -1;
-    col_params->instance.impl_details.subdiv_offsets.push_back(subdiv_offset);
-  }
-
-  if (VLOG_IS_ON(2)) {
-    string subdiv_buf;
-    for (const int subdiv_offset :
-         col_params->instance.impl_details.subdiv_offsets) {
-      strings::StrAppend(&subdiv_buf, " ", subdiv_offset);
-    }
-    VLOG(2) << "Dynamically generated " << num_subdivs
-            << " subdiv_offsets:" << subdiv_buf << " tensor_size "
-            << tensor_size << " chunk_size " << chunk_size;
-  }
-
-  return Status::OK();
-}
-
 Status RingReducer::InitializeCollectiveParams(CollectiveParams* col_params) {
   // TODO(b/113171733): change CHECKs to return errors.
   CHECK_EQ(col_params->instance.type, REDUCTION_COLLECTIVE);
   CHECK_EQ(col_params->instance.impl_details.collective_name, "RingReduce");
-  const string& device_name =
-      col_params->instance.device_names[col_params->default_rank];
-  // Each subdiv permutation is a ring formed by rotating each
-  // single-task subsequence of devices by an offset.  This makes most
-  // sense when each task has the same number of devices but we can't
-  // depend on that being the case so we'll compute something that
-  // works in any case.
-
-  // Start by counting the devices in each task.
-  // Precondition: device_names must be sorted so that all devices in
-  // the same task are adjacent.
-  VLOG(2) << "Sorted task names: "
-          << str_util::Join(col_params->instance.task_names, ", ");
-  std::vector<int> dev_per_task;
-  const string* prior_task_name = &col_params->instance.task_names[0];
-  int dev_count = 1;
-  for (int di = 1; di < col_params->group.group_size; ++di) {
-    if (col_params->instance.task_names[di] != *prior_task_name) {
-      dev_per_task.push_back(dev_count);
-      dev_count = 1;
-      prior_task_name = &col_params->instance.task_names[di];
-    } else {
-      ++dev_count;
-    }
-  }
-  dev_per_task.push_back(dev_count);
-  CHECK_EQ(col_params->group.num_tasks, dev_per_task.size());
-
-  if (col_params->instance.impl_details.subdiv_offsets.empty()) {
-    TF_RETURN_IF_ERROR(GenerateSubdivsInCollectiveParams(col_params));
-  }
-
-  // Generate a ring permutation for requested offset.
-  VLOG(2) << "Setting up perms for col_params " << col_params
-          << " subdiv_permutations "
-          << &col_params->instance.impl_details.subdiv_permutations;
-  col_params->instance.impl_details.subdiv_permutations.resize(
-      col_params->instance.impl_details.subdiv_offsets.size());
-  col_params->subdiv_rank.resize(
-      col_params->instance.impl_details.subdiv_offsets.size(), -1);
-  for (int sdi = 0;
-       sdi < col_params->instance.impl_details.subdiv_offsets.size(); ++sdi) {
-    std::vector<int>& perm =
-        col_params->instance.impl_details.subdiv_permutations[sdi];
-    CHECK_EQ(perm.size(), 0);
-    int offset = col_params->instance.impl_details.subdiv_offsets[sdi];
-    // A negative subdivision offset is interpreted as follows:
-    //  1. Reverse the local device ordering.
-    //  2. Begin the subdivision at abs(offset) in the reversed ordering.
-    bool reverse = false;
-    if (offset < 0) {
-      offset = abs(offset);
-      reverse = true;
-    }
-    int prior_dev_count = 0;  // sum over prior worker device counts
-    for (int ti = 0; ti < col_params->group.num_tasks; ++ti) {
-      for (int di = 0; di < dev_per_task[ti]; ++di) {
-        int di_offset = (di + offset) % dev_per_task[ti];
-        int offset_di =
-            reverse ? (dev_per_task[ti] - (di_offset + 1)) : di_offset;
-        // Device index in global subdivision permutation.
-        int permuted_di = prior_dev_count + offset_di;
-        int rank = static_cast<int>(perm.size());
-        perm.push_back(permuted_di);
-        if (col_params->instance.device_names[permuted_di] == device_name) {
-          CHECK_EQ(permuted_di, col_params->default_rank);
-          col_params->subdiv_rank[sdi] = rank;
-        }
-      }
-      prior_dev_count += dev_per_task[ti];
-    }
-    CHECK_EQ(col_params->group.group_size, perm.size());
-  }
-
-  VLOG(2) << collective_util::SubdivPermDebugString(*col_params);
-  return Status::OK();
-}
-
-Status RingReducer::InitializeCollectiveContext(CollectiveContext* col_ctx) {
-  CHECK(col_ctx->dev_mgr);
-  col_ctx_ = col_ctx;
-  col_params_ = &col_ctx->col_params;
-  return collective_util::InitializeDeviceAndLocality(
-      col_ctx->dev_mgr, col_ctx->device_name, &col_ctx->device,
-      &col_ctx->device_locality);
+  return RingAlg::InitializeCollectiveParams(col_params);
 }
 
 void RingReducer::Run(StatusCallback done) {
@@ -303,25 +106,6 @@ void RingReducer::Run(StatusCallback done) {
   ContinueAfterInputCopy();
 }
 
-string RingReducer::TensorDebugString(const Tensor& tensor) {
-  const DeviceBase::GpuDeviceInfo* gpu_device_info =
-      col_ctx_->op_ctx->device()->tensorflow_gpu_device_info();
-  if (gpu_device_info) {
-    Tensor cpu_tensor(tensor.dtype(), tensor.shape());
-    Notification note;
-    gpu_device_info->default_context->CopyDeviceTensorToCPU(
-        &tensor, "" /*tensor_name*/, col_ctx_->device, &cpu_tensor,
-        [&note](const Status& s) {
-          CHECK(s.ok());
-          note.Notify();
-        });
-    note.WaitForNotification();
-    return cpu_tensor.SummarizeValue(64);
-  } else {
-    return tensor.SummarizeValue(64);
-  }
-}
-
 // Note that this function is blocking and must not run in any thread
 // which cannot be blocked.
 void RingReducer::ContinueAfterInputCopy() {
@@ -358,201 +142,16 @@ void RingReducer::ContinueAfterInputCopy() {
   Finish(RunAsyncParts());
 }
 
-void RingReducer::StartAbort(const Status& s) {
-  // In abort mode we stop issuing additional ProvideBuf
-  // and ConsumeBuf calls, but we need to wait for all of the
-  // outstanding callbacks to be invoked before quitting.
-  bool abort_started = false;
-  {
-    mutex_lock l(status_mu_);
-    if (status_.ok()) {
-      LOG(ERROR) << "Aborting RingReduce with " << s;
-      abort_started = true;
-      status_.Update(s);
-    }
-  }
-  // If this is the initial entry to abort mode then invoke StartAbort
-  // on the CollectiveExecutor that invoked us.  That should start
-  // cancellation on all of the outstanding CollectiveRemoteAccess
-  // actions.
-  if (abort_started) {
-    col_ctx_->col_exec->StartAbort(s);
-  }
-}
-
-void RingReducer::Finish(bool ok) {
-  if (ok) {
-    // Recover the output from the adaptor.
-    ca_->ConsumeFinalValue(col_ctx_->output);
-  }
-  Status s;
-  {
-    mutex_lock l(status_mu_);
-    s = status_;
-  }
-  rfv_.clear();  // Give up Refs on output tensor.
-  done_(s);
-}
-
-RingReducer::SubContext::SubContext(OpKernelContext* ctx,
-                                    OpKernelContext::Params* params,
-                                    OpKernel* op, Tensor* output, Tensor* input)
-    : sub_params_(*params),
-      sub_inputs_({output, input}),
-      sub_input_attr_({ctx->input_alloc_attr(0), ctx->input_alloc_attr(0)}),
-      sub_input_dc_(
-          {ctx->input_device_context(0), ctx->input_device_context(0)}) {
-  sub_params_.op_kernel = op;
-  sub_params_.inputs = &sub_inputs_;
-  sub_params_.input_alloc_attrs = &sub_input_attr_;
-  sub_params_.input_device_contexts = &sub_input_dc_;
-  sub_params_.eigen_gpu_device = nullptr;
-  sub_params_.ensure_eigen_gpu_device();
-  sub_params_.forward_from_array = &forward_from_;
-  sub_ctx_ = new OpKernelContext(&sub_params_, 1);
-}
-
-Status RingReducer::ComputeBinOp(Device* device, OpKernel* op, Tensor* output,
-                                 Tensor* input) {
-  // Prepare an OpKernelContext that is identical to that of the original Op
-  // (i.e. the collective), except for the input output sizes and identities and
-  // the Op itself.
-  // TODO(tucker): Is it possible to cache and reuse these objects?  They're
-  // mostly identical inside one device execution.
-  std::unique_ptr<SubContext> sub_ctx(
-      new SubContext(col_ctx_->op_ctx, col_ctx_->op_params, op, output, input));
-  device->Compute(op, sub_ctx->sub_ctx_);
-  return sub_ctx->sub_ctx_->status();
-}
-
-// At the beginning of the algorithm initialize a RingField struct for
-// every independent field of the tensor.
 void RingReducer::InitRingField(RingField* rf, int chunk_idx, int subdiv_idx,
                                 int field_idx) {
-  // Note on field indexing: There are group_size_ devices in the
-  // instance, implying the same number of chunks per tensor, where a
-  // chunk is the unit of data transferred in a time step.  However, if
-  // a device can simultaneously send data by 2 or more independent
-  // channels we can speed up the transfer by subdividing chunks and
-  // processing multiple subdivisions at once.  So the actual number
-  // of RingFields is group_size_ * num_subdivs_.
-  DCHECK_EQ(field_idx, (chunk_idx * num_subdivs_) + subdiv_idx);
-  rf->chunk_idx = chunk_idx;
-  rf->subdiv_idx = subdiv_idx;
-  rf->sc_idx = field_idx;
-  rf->rank = col_params_->subdiv_rank[subdiv_idx];
-  rf->second_pass = false;
-  rf->action = RF_INIT;
-  // Recv from the device with preceding rank within the subdivision.
-  int recv_from_rank = (rf->rank + (group_size_ - 1)) % group_size_;
-  int send_to_rank = (rf->rank + 1) % group_size_;
-  rf->recv_dev_idx = col_params_->instance.impl_details
-                         .subdiv_permutations[subdiv_idx][recv_from_rank];
-  int send_dev_idx = col_params_->instance.impl_details
-                         .subdiv_permutations[subdiv_idx][send_to_rank];
-  rf->recv_is_remote = !col_params_->task.is_local[rf->recv_dev_idx];
-  rf->send_is_remote = !col_params_->task.is_local[send_dev_idx];
-  if (ca_->ChunkBytes(rf->sc_idx) > 0) {
-    // In pass 0 we skip Recv when rank = chunk_idx
-    rf->do_recv = (rf->chunk_idx != rf->rank);
-    // In pass 0 we skip Send when rank = chunk_idx-1
-    rf->do_send =
-        (rf->rank != ((rf->chunk_idx + (group_size_ - 1)) % group_size_));
-  }
-  rf->is_final =
-      (rf->rank == ((rf->chunk_idx + (group_size_ - 1)) % group_size_));
-  if (rf->do_send || rf->do_recv) {
-    rf->chunk = ca_->ChunkAlias(rf->sc_idx);
-    CHECK(rf->chunk.IsAligned()) << rf->DebugString();
-  }
+  RingAlg::InitRingField(rf, chunk_idx, subdiv_idx, field_idx);
   if (rf->do_recv) {
     rf->tmp_chunk = ca_->TempChunk(rf->sc_idx);
-    CHECK(rf->tmp_chunk.IsAligned()) << rf->DebugString();
   }
-  VLOG(2) << this << " InitRingField " << rf->DebugString() << " chunk "
-          << ca_->TBounds(rf->chunk);
-}
-
-// When a RingField transitions from first to second recompute the
-// do_send and do_recv values.
-void RingReducer::AdvanceToSecondPass(RingField* rf) {
-  VLOG(3) << "IncrRingField old value " << rf->DebugString();
-  CHECK(!rf->second_pass);
-  rf->second_pass = true;
-  rf->action = RF_INIT;
-  if (ca_->ChunkBytes(rf->sc_idx) > 0) {
-    // In pass 1 the send/no-send boundary moves down 1 place.
-    rf->do_recv =
-        (rf->rank != ((rf->chunk_idx + (group_size_ - 1)) % group_size_));
-    rf->do_send =
-        (rf->rank != ((rf->chunk_idx + (group_size_ - 2)) % group_size_));
-  }
-  rf->is_final =
-      (rf->rank == ((rf->chunk_idx + (group_size_ - 2)) % group_size_));
-  VLOG(3) << "IncrRingField new value " << rf->DebugString();
-}
-
-string RingReducer::RingField::DebugString() const {
-  string rv = strings::StrCat("RingField rank=", rank, " chunk_idx=", chunk_idx,
-                              " subdiv=", subdiv_idx, " sc_idx=", sc_idx,
-                              " action=", action);
-  strings::StrAppend(&rv, " pass=", second_pass);
-  strings::StrAppend(&rv, " do_send=", do_send, " do_recv=", do_recv,
-                     " is_final=", is_final, " recv_is_remote=", recv_is_remote,
-                     " recv_dev_idx=", recv_dev_idx, " sc_idx=", sc_idx);
-  return rv;
-}
-
-void RingReducer::DispatchSend(RingField* rf, const StatusCallback& done) {
-  CHECK(rf->do_send);
-  string send_buf_key = RingReduceBufKey(col_ctx_->exec_key, rf->second_pass,
-                                         rf->sc_idx, rf->rank);
-  VLOG(3) << "DispatchSend rank=" << col_params_->default_rank << " send key "
-          << send_buf_key << " chunk " << ca_->TBounds(rf->chunk) << " sc_idx "
-          << rf->sc_idx;
-  int send_to_rank = (rf->rank + 1) % group_size_;
-  int send_to_dev_idx = col_params_->instance.impl_details
-                            .subdiv_permutations[rf->subdiv_idx][send_to_rank];
-  col_ctx_->col_exec->PostToPeer(
-      col_params_->instance.device_names[send_to_dev_idx],
-      col_params_->instance.task_names[send_to_dev_idx], send_buf_key,
-      col_ctx_->device, col_ctx_->op_ctx->op_device_context(),
-      col_ctx_->op_ctx->output_alloc_attr(0), &rf->chunk,
-      col_ctx_->device_locality, done);
-}
-
-void RingReducer::DispatchRecv(RingField* rf, const StatusCallback& done) {
-  CHECK(rf->do_recv);
-  string recv_buf_key =
-      RingReduceBufKey(col_ctx_->exec_key, rf->second_pass, rf->sc_idx,
-                       (rf->rank + (group_size_ - 1)) % group_size_);
-  VLOG(3) << "DispatchRecv rank=" << col_params_->default_rank << " recv key "
-          << recv_buf_key << " chunk " << ca_->TBounds(rf->chunk) << " into "
-          << ((col_params_->merge_op != nullptr) ? "tmp_chunk" : "chunk");
-  Tensor* dst_tensor = (!rf->second_pass && (col_params_->merge_op != nullptr))
-                           ? &rf->tmp_chunk
-                           : &rf->chunk;
-  col_ctx_->col_exec->RecvFromPeer(
-      col_params_->instance.device_names[rf->recv_dev_idx],
-      col_params_->instance.task_names[rf->recv_dev_idx],
-      col_params_->task.is_local[rf->recv_dev_idx], recv_buf_key,
-      col_ctx_->device, col_ctx_->op_ctx->op_device_context(),
-      col_ctx_->op_ctx->output_alloc_attr(0), dst_tensor,
-      col_ctx_->device_locality, rf->subdiv_idx, done);
-}
-
-string RingReducer::FieldState() {
-  string s = strings::StrCat(
-      "RingReducer ", strings::Hex(reinterpret_cast<uint64>(this)), " exec ",
-      col_ctx_->exec_key, " step_id=", col_ctx_->step_id, " state of all ",
-      rfv_.size(), " fields:");
-  for (int i = 0; i < rfv_.size(); ++i) {
-    s.append("\n");
-    s.append(rfv_[i].DebugString());
-  }
-  return s;
 }
 
+// At the beginning of the algorithm initialize a RingField struct for
+// every independent field of the tensor.
 bool RingReducer::RunAsyncParts() {
   // This function orchestrates RingReduce actions on behalf of a
   // single device. It is entered by a blockable thread that
@@ -632,9 +231,9 @@ bool RingReducer::RunAsyncParts() {
           --recv_pending_count;
           if (!rf->second_pass) {
             rf->action = RF_REDUCE;
-            Status s =
-                ComputeBinOp(col_ctx_->device, col_params_->merge_op.get(),
-                             &rf->chunk, &rf->tmp_chunk);
+            Status s = collective_util::ComputeBinOp(
+                col_ctx_->op_ctx, col_ctx_->op_params, col_ctx_->device,
+                col_params_->merge_op.get(), &rf->chunk, &rf->tmp_chunk);
             if (!s.ok()) {
               aborted = true;
               StartAbort(s);
@@ -647,9 +246,9 @@ bool RingReducer::RunAsyncParts() {
           if (!rf->second_pass && col_params_->final_op.get() && rf->is_final) {
             rf->action = RF_FINALIZE;
             group_size_tensor_ready_.WaitForNotification();
-            Status s =
-                ComputeBinOp(col_ctx_->device, col_params_->final_op.get(),
-                             &rf->chunk, &group_size_tensor_);
+            Status s = collective_util::ComputeBinOp(
+                col_ctx_->op_ctx, col_ctx_->op_params, col_ctx_->device,
+                col_params_->final_op.get(), &rf->chunk, &group_size_tensor_);
             if (!s.ok()) {
               aborted = true;
               StartAbort(s);
diff --git a/tensorflow/core/common_runtime/ring_reducer.h b/tensorflow/core/common_runtime/ring_reducer.h
index 0848e37b5225b16a82e19943a3bcc57148fd744c..a681fabd2bdf1c7e3765ede3098ebb5bf596a881 100644
--- a/tensorflow/core/common_runtime/ring_reducer.h
+++ b/tensorflow/core/common_runtime/ring_reducer.h
@@ -21,122 +21,36 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/common_runtime/base_collective_executor.h"
+#include "tensorflow/core/common_runtime/ring_alg.h"
 #include "tensorflow/core/framework/collective.h"
 
 namespace tensorflow {
 class Device;
 
 // Ring-algorithm implementation of collective all-reduce.
-class RingReducer : public CollectiveImplementationInterface {
+class RingReducer : public RingAlg {
  public:
-  RingReducer();
+  RingReducer() : RingAlg(REDUCTION_COLLECTIVE, "Reduce") {}
   ~RingReducer() override;
 
-  // Establishes the requested number of subdivision permutations based on the
-  // ring order implicit in the device order.
-  Status InitializeCollectiveParams(CollectiveParams* col_params) override;
-
-  // Initializes members of CollectiveContext not yet initialized, i.e. device
-  // and device_locality.  Also saves the CollectiveContext in this object.
-  Status InitializeCollectiveContext(CollectiveContext* col_ctx) override;
-
   // Begins async execution of the ring reduce algorithm.
   // Must be called in a blockable thread.
   // TODO(b/80529858): remove the previous warning when we have a dedicated
   // collective threadpool.
   void Run(StatusCallback done) override;
 
- private:
-  // Called when a bad status is received that implies we should terminate
-  // execution and return a bad status.
-  void StartAbort(const Status& s);
-  void ContinueAfterInputCopy();
-  void Finish(bool ok);
-  Status ComputeBinOp(Device* device, OpKernel* op, Tensor* output,
-                      Tensor* input);
-  bool RunAsyncParts();
-
-  // Used for executing a sub-operation, e.g. a merge_op instance, with
-  // an OpKernelContext based on the one passed into this Op.
-  class SubContext {
-   public:
-    OpKernelContext::Params sub_params_;
-    gtl::InlinedVector<TensorValue, 4> sub_inputs_;
-    gtl::InlinedVector<AllocatorAttributes, 4> sub_input_attr_;
-    gtl::InlinedVector<DeviceContext*, 4> sub_input_dc_;
-    // Used only for Binary and Unary Ops for which we require
-    // the calculation to be in-place on the first input.
-    int forward_from_ = 0;
-    OpKernelContext* sub_ctx_;
-    SubContext(OpKernelContext* ctx, OpKernelContext::Params* params,
-               OpKernel* op, Tensor* output, Tensor* input);
-    ~SubContext() { delete sub_ctx_; }
-  };
-
-  // Current status of a RingField
-  enum RingFieldAction {
-    RF_INIT = 0,    // Just initialized for a pass
-    RF_RECV,        // Recv pending
-    RF_REDUCE,      // Reduce pending
-    RF_FINALIZE,    // FinalOp pending
-    RF_SEND_READY,  // Ready to send
-    RF_SEND,        // Send pending
-    RF_DONE,        // No more work
-  };
+  Status InitializeCollectiveParams(CollectiveParams* col_params) override;
 
-  // Tracks progress of actions on a single subfield of the entire tensor.
-  struct RingField {
-    int16 chunk_idx;     // major division index
-    int16 subdiv_idx;    // minor division index
-    int16 sc_idx;        // subchunk index
-    int16 rank;          // rank within subdiv permutation
-    int16 recv_dev_idx;  // dev from which value should be recv'd
-    RingFieldAction action;
-    bool second_pass;
-    bool recv_is_remote = false;
-    bool send_is_remote = false;
-    bool do_send = false;   // is the value sent in this pass?
-    bool do_recv = false;   // is the value recv'd in this pass?
-    bool is_final = false;  // is the last field in the pass for this rank
-    Tensor chunk;           // alias to field values
-    Tensor tmp_chunk;
-    Status status;
-    string DebugString() const;
-  };
-  void AdvanceToSecondPass(RingField* rf);
+ protected:
   void InitRingField(RingField* rf, int chunk_idx, int subdiv_idx,
-                     int field_idx);
-  void DispatchSend(RingField* rf, const StatusCallback& done);
-  void DispatchRecv(RingField* rf, const StatusCallback& done);
-
-  // For constructing log messages for debugging.
-  string FieldState();
-  string TensorDebugString(const Tensor& tensor);
-
-  // Producer/Consumer Queue of RingField structs.
-  class PCQueue {
-   public:
-    void Enqueue(RingField* rf);
-    RingField* Dequeue();
+                     int field_idx) override;
 
-   private:
-    mutex pcq_mu_;
-    condition_variable cv_;
-    int waiter_count_ GUARDED_BY(pcq_mu_) = 0;
-    std::deque<RingField*> deque_ GUARDED_BY(pcq_mu_);
-  };
+ private:
+  void ContinueAfterInputCopy();
+  bool RunAsyncParts();
 
-  CollectiveContext* col_ctx_;          // Not owned
-  const CollectiveParams* col_params_;  // Not owned
-  StatusCallback done_;
-  int group_size_;
-  int num_subdivs_;
   Tensor group_size_tensor_;
   Notification group_size_tensor_ready_;
-  std::unique_ptr<CollectiveAdapter> ca_;
-  mutex status_mu_;
-  Status status_ GUARDED_BY(status_mu_);
-  std::vector<RingField> rfv_;
 
   friend class RingReducerTest;
 };
diff --git a/tensorflow/core/common_runtime/ring_reducer_test.cc b/tensorflow/core/common_runtime/ring_reducer_test.cc
index 7feb29a6dbbb17d73967344ad07db9d234411840..7f18cdb5e2caec7690c8f96c6deb32319acb2e10 100644
--- a/tensorflow/core/common_runtime/ring_reducer_test.cc
+++ b/tensorflow/core/common_runtime/ring_reducer_test.cc
@@ -335,19 +335,20 @@ class RingReducerTest : public ::testing::Test {
           note.WaitForNotification();
         }
 
+        auto alias = actual.template unaligned_flat<T>();
         for (int i = 0; i < tensor_len; ++i) {
           switch (dtype) {
             case DT_FLOAT:
-              EXPECT_FLOAT_EQ(expected[i], actual.template flat<T>()(i))
+              EXPECT_FLOAT_EQ(expected[i], alias(i))
                   << "Mismatch at device " << di << " index " << i;
               break;
             case DT_DOUBLE:
-              EXPECT_DOUBLE_EQ(expected[i], actual.template flat<T>()(i))
+              EXPECT_DOUBLE_EQ(expected[i], alias(i))
                   << "Mismatch at device " << di << " index " << i;
               break;
             case DT_INT32:
             case DT_INT64:
-              EXPECT_EQ(expected[i], actual.template flat<T>()(i))
+              EXPECT_EQ(expected[i], alias(i))
                   << "Mismatch at device " << di << " index " << i;
               break;
             default:
diff --git a/tensorflow/core/common_runtime/shape_refiner.cc b/tensorflow/core/common_runtime/shape_refiner.cc
index 9488a447789e67f3a9e73af43a0f3a849457e51f..8f28d2790358456df1414ba201d58e29e80221c9 100644
--- a/tensorflow/core/common_runtime/shape_refiner.cc
+++ b/tensorflow/core/common_runtime/shape_refiner.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/common_runtime/eval_const_tensor.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -27,7 +28,6 @@ limitations under the License.
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph_constructor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/public/session.h"
diff --git a/tensorflow/core/common_runtime/shared_counter.h b/tensorflow/core/common_runtime/shared_counter.h
new file mode 100644
index 0000000000000000000000000000000000000000..5e378524b203b1b1089fe6836a57d1effb961db5
--- /dev/null
+++ b/tensorflow/core/common_runtime/shared_counter.h
@@ -0,0 +1,31 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_SHARED_COUNTER_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_SHARED_COUNTER_H_
+
+namespace tensorflow {
+// A lightweight thread-safe monotone counter for establishing
+// temporal ordering.
+class SharedCounter {
+ public:
+  int64 get() { return value_; }
+  int64 next() { return ++value_; }
+
+ private:
+  std::atomic<int64> value_{0};
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_SHARED_COUNTER_H_
diff --git a/tensorflow/core/common_runtime/step_stats_collector.cc b/tensorflow/core/common_runtime/step_stats_collector.cc
index 49265445659ff1daa30b632f60c03845d4a6a7f7..318cfec21a8be19bdad362c45b11398c33438bbb 100644
--- a/tensorflow/core/common_runtime/step_stats_collector.cc
+++ b/tensorflow/core/common_runtime/step_stats_collector.cc
@@ -176,9 +176,10 @@ void NodeExecStatsWrapper::AddAllocation(
   memory->set_peak_bytes(std::get<1>(sizes));
   memory->set_live_bytes(std::get<2>(sizes));
 
-  AllocatorStats stats;
-  allocator->GetStats(&stats);
-  memory->set_allocator_bytes_in_use(stats.bytes_in_use);
+  absl::optional<AllocatorStats> stats = allocator->GetStats();
+  if (stats) {
+    memory->set_allocator_bytes_in_use(stats->bytes_in_use);
+  }
   allocations_.push_back(std::make_pair(memory, tracking_allocator));
 }
 
@@ -409,6 +410,21 @@ void StepStatsCollector::Save(const string& device,
   }
 }
 
+void StepStatsCollector::SaveThreadName(const string& device,
+                                        const uint32 thread_id,
+                                        const string& thread_name) {
+  VLOG(1) << "Save dev " << device << " thread id " << thread_id << " name "
+          << thread_name;
+  {
+    mutex_lock l(mu_);
+    if (finalized_) {
+      LOG(WARNING) << "thread_name saved after finalize will not be collected.";
+    }
+    auto& thread_names_map = thread_names_[device];
+    thread_names_map[thread_id] = thread_name;
+  }
+}
+
 NodeExecStatsInterface* StepStatsCollector::CreateNodeExecStats(
     const Node* node) {
   // Only collect statistics for non-transfer nodes.
@@ -531,5 +547,15 @@ void StepStatsCollector::FinalizeInternal() {
       stats->stats()->Swap(dss->add_node_stats());
     }
   }
+  for (const auto& device_thread : thread_names_) {
+    if (dev_stats_pb.find(device_thread.first) == dev_stats_pb.end()) {
+      // skip device without DeviceStepStats.
+      continue;
+    }
+    DeviceStepStats* dss = dev_stats_pb.at(device_thread.first);
+    for (const auto& thread_name : device_thread.second) {
+      (*dss->mutable_thread_names())[thread_name.first] = thread_name.second;
+    }
+  }
 }
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/step_stats_collector.h b/tensorflow/core/common_runtime/step_stats_collector.h
index 7d34383ce8209c9f4b889410a96bce02f6702a64..dfcc51ff4c79b386c327dcf9503c7ee35b20d2c4 100644
--- a/tensorflow/core/common_runtime/step_stats_collector.h
+++ b/tensorflow/core/common_runtime/step_stats_collector.h
@@ -175,6 +175,10 @@ class StepStatsCollector : public StepStatsCollectorInterface {
   void Save(const string& device, NodeExecStats* node_stats_pb);
   void Save(const string& device, NodeExecStatsWrapper* node_stats);
 
+  // Saves thread name.
+  void SaveThreadName(const string& device, const uint32 thread_id,
+                      const string& thread_name);
+
   NodeExecStatsInterface* CreateNodeExecStats(const Node* node) override;
   string ReportAllocsOnResourceExhausted(const string& err) override;
 
@@ -191,12 +195,14 @@ class StepStatsCollector : public StepStatsCollectorInterface {
   static const uint64 kMaxCollectedNodes = 1 << 20;
 
   typedef std::vector<std::unique_ptr<NodeExecStatsWrapper>> NodeStatsVector;
+  typedef std::unordered_map<uint32, string> ThreadNamesMap;
 
   void FinalizeInternal() EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
   mutex mu_;
   bool finalized_ GUARDED_BY(mu_);
   std::unordered_map<string, NodeStatsVector> dev_stats_ GUARDED_BY(mu_);
+  std::unordered_map<string, ThreadNamesMap> thread_names_ GUARDED_BY(mu_);
   StepStats* step_stats_ GUARDED_BY(mu_);
   uint64 collected_nodes_ GUARDED_BY(mu_) = 0;
 };
diff --git a/tensorflow/core/debug/BUILD b/tensorflow/core/debug/BUILD
index 591c22b8f625554acfe25d744cb53998f551ff29..f8c07dde46caab062b86a934186c39777485f4d0 100644
--- a/tensorflow/core/debug/BUILD
+++ b/tensorflow/core/debug/BUILD
@@ -221,11 +221,17 @@ tf_cc_test(
     deps = [
         ":debug_grpc_testlib",
         ":debug_io_utils",
+        "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
+        "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core:lib",
         "//tensorflow/core:master_proto_cc",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:nn_ops_op_lib",
+        "//tensorflow/core:no_op_op_lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:sendrecv_ops_op_lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
diff --git a/tensorflow/core/debug/debug_graph_utils.cc b/tensorflow/core/debug/debug_graph_utils.cc
index 5fc95a8f20d2b3f1b37a660e17d0efee17aacb94..b69eb1da39e68cc470d0d64c69c28ce1a3f6477c 100644
--- a/tensorflow/core/debug/debug_graph_utils.cc
+++ b/tensorflow/core/debug/debug_graph_utils.cc
@@ -299,7 +299,7 @@ Status DebugNodeInserter::CreateCopyNode(
 
   auto builder = NodeDefBuilder(copy_node_name, copy_op_name)
                      .Input(src_node_name, src_output, src_dt)
-                     .Attr("debug_ops_spec", std::move(debug_ops_spec));
+                     .Attr("debug_ops_spec", debug_ops_spec);
 
   if (!builder.Finalize(&node_def).ok()) {
     return Status(
diff --git a/tensorflow/core/debug/debug_grpc_testlib.cc b/tensorflow/core/debug/debug_grpc_testlib.cc
index f70931e926507c72287588da278a3b8d6bb19122..4927caf5a3285a3855d27b614bf597943059e2fb 100644
--- a/tensorflow/core/debug/debug_grpc_testlib.cc
+++ b/tensorflow/core/debug/debug_grpc_testlib.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/debug/debug_graph_utils.h"
 #include "tensorflow/core/debug/debugger_event_metadata.pb.h"
 #include "tensorflow/core/framework/summary.pb.h"
+#include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
diff --git a/tensorflow/core/debug/debug_io_utils.cc b/tensorflow/core/debug/debug_io_utils.cc
index 6994dec3b59d997650e07ba9a6fd14233022b201..ebcb046003437eb9fab452c5337204cb249c510c 100644
--- a/tensorflow/core/debug/debug_io_utils.cc
+++ b/tensorflow/core/debug/debug_io_utils.cc
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/debug/debugger_event_metadata.pb.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/summary.pb.h"
+#include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/lib/core/bits.h"
 #include "tensorflow/core/lib/hash/hash.h"
@@ -730,7 +731,7 @@ Status DebugGrpcChannel::Connect(const int64 timeout_micros) {
   ::grpc::ChannelArguments args;
   args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH, std::numeric_limits<int32>::max());
   // Avoid problems where default reconnect backoff is too long (e.g., 20 s).
-  args.SetInt("grpc.testing.fixed_reconnect_backoff_ms", 1000);
+  args.SetInt(GRPC_ARG_MAX_RECONNECT_BACKOFF_MS, 1000);
   channel_ = ::grpc::CreateCustomChannel(
       server_stream_addr_, ::grpc::InsecureChannelCredentials(), args);
   if (!channel_->WaitForConnected(
diff --git a/tensorflow/core/debug/debug_io_utils_test.cc b/tensorflow/core/debug/debug_io_utils_test.cc
index 82e0ae5edb1eccd35c7c76da0a8a2ee9ea12d9fd..0926a82fade31904376fac277273b20b13367167 100644
--- a/tensorflow/core/debug/debug_io_utils_test.cc
+++ b/tensorflow/core/debug/debug_io_utils_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/debug/debug_node_key.h"
 #include "tensorflow/core/debug/debugger_event_metadata.pb.h"
 #include "tensorflow/core/framework/summary.pb.h"
+#include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/core/notification.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
diff --git a/tensorflow/core/distributed_runtime/BUILD b/tensorflow/core/distributed_runtime/BUILD
index e388d3e6f0f5636c044c36ee03c826f1872cac9f..af744ce790bf5d39895a41fff8f77650f7adc19a 100644
--- a/tensorflow/core/distributed_runtime/BUILD
+++ b/tensorflow/core/distributed_runtime/BUILD
@@ -17,7 +17,6 @@ filegroup(
 
 load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_cuda_library")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
-load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_tests")
 load("//tensorflow:tensorflow.bzl", "tf_copts")
 
 # For platform specific build config
@@ -298,6 +297,7 @@ cc_library(
     deps = [
         ":call_options",
         ":message_wrappers",
+        ":request_id",
         "//tensorflow/core:lib",
         "//tensorflow/core:master_proto_cc",
     ],
@@ -311,6 +311,7 @@ cc_library(
         ":call_options",
         ":master_env",
         ":master_session",
+        ":recent_request_ids",
         ":remote_device",
         ":worker_cache",
         ":worker_interface",
@@ -425,7 +426,6 @@ cc_library(
         "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:metrics",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:worker_proto_cc",
         "//tensorflow/core/debug",
@@ -587,13 +587,17 @@ tf_cc_test(
         ":collective_param_resolver_distributed",
         ":device_resolver_distributed",
         ":test_utils",
+        "//tensorflow/core:collective_ops_op_lib",
         "//tensorflow/core:core_cpu_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:no_op_op_lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:sendrecv_ops_op_lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
+        "//tensorflow/core/kernels:collective_ops",
     ],
 )
 
@@ -640,13 +644,14 @@ tf_cuda_cc_test(
         "manual",  # TODO(b/27683709): Re-enable when not flaky.
         "notap",  # TODO(b/27683709): Re-enable when not flaky.
         "noguitar",  # TODO(b/27683709): Re-enable when not flaky.
-        "nooss",  # TODO(b/27683709): Re-enable when not flaky.
+        "no_oss",  # TODO(b/27683709): Re-enable when not flaky.
     ],
     deps = [
         ":master",
         ":remote_device",
         ":worker_interface",
         "//tensorflow:grpc++",
+        "//tensorflow/core:control_flow_ops_op_lib",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
@@ -655,6 +660,7 @@ tf_cuda_cc_test(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:master_proto_cc",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:state_ops_op_lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
@@ -712,9 +718,14 @@ tf_cuda_cc_test(
     tags = tf_cuda_tests_tags(),
     deps = [
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:array_ops_op_lib",
+        "//tensorflow/core:bitwise_ops_op_lib",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
+        "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core:lib",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:nn_ops_op_lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:tensorflow",
         "//tensorflow/core:test",
@@ -754,6 +765,7 @@ cc_library(
     srcs = ["recent_request_ids.cc"],
     hdrs = ["recent_request_ids.h"],
     deps = [
+        ":message_wrappers",
         "//tensorflow/core:lib",
         "//tensorflow/core:worker_proto_cc",
     ],
diff --git a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
index de6e4b4a7c51379f6492314de3dc8c69f424c769..a642313275d01f2575575fcb17de8a496cf09239 100644
--- a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
+++ b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
@@ -293,8 +293,11 @@ void BaseRemoteRendezvous::RecvAsync(const ParsedKey& parsed,
                                      const Rendezvous::Args& recv_args,
                                      DoneCallback done) {
   VLOG(1) << "RemoteRendezvous Recv " << this << " " << parsed.FullKey();
-  CHECK(is_initialized()) << "RecvAsync called when uninitialized.";
   Status s = ValidateDevices(parsed, false /*!is_src*/);
+  if (s.ok() && !is_initialized()) {
+    s.Update(errors::Internal(
+        "RecvAsync called when uninitialized (key:", parsed.FullKey(), ")."));
+  }
   if (!s.ok()) {
     done(s, Args(), recv_args, Tensor(), false);
     return;
diff --git a/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.cc b/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.cc
index 1dd10d309b5f5acad2acab660aa709a9c0e9751d..443759ab740b99860e9d50a6a112a2a054d39f1a 100644
--- a/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.cc
+++ b/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.cc
@@ -82,15 +82,21 @@ CollectiveParamResolverDistributed::CollectiveParamResolverDistributed(
     const ConfigProto& config, const DeviceMgr* dev_mgr,
     DeviceResolverDistributed* dev_resolver, WorkerCacheInterface* worker_cache,
     const string& task_name)
-    : CollectiveParamResolverLocal(dev_mgr, dev_resolver, task_name),
+    : CollectiveParamResolverLocal(config, dev_mgr, dev_resolver, task_name),
       worker_cache_(worker_cache),
       group_leader_(task_name == config.experimental().collective_group_leader()
                         ? ""
-                        : config.experimental().collective_group_leader()) {}
+                        : config.experimental().collective_group_leader()) {
+  VLOG(1) << "CompleteParamResolverDistributed ctor task={" << task_name
+          << "} config.collective_group_leader={"
+          << config.experimental().collective_group_leader() << "}";
+}
 
 void CollectiveParamResolverDistributed::CompleteParamsAsync(
     const string& device, CollectiveParams* cp, CancellationManager* cancel_mgr,
     const StatusCallback& done) {
+  VLOG(1) << "CompleteParams distributed " << device << " for " << cp << ": "
+          << cp->ToString();
   CompleteGroupDistributed(device, cp, cancel_mgr,
                            [this, device, cp, cancel_mgr, done](
                                const Status& s, const GroupRec* gr) {
@@ -181,6 +187,10 @@ void CollectiveParamResolverDistributed::CompleteInstanceAsync(
                           ir->WaitForOutMu(l);
                           response->set_instance_key(cp->instance.instance_key);
                           response->set_source_rank(ir->source_rank);
+                          if (!cp->instance.communicator_key.empty()) {
+                            response->set_communicator_key(
+                                cp->instance.communicator_key);
+                          }
                           done_and_cleanup(fi_status);
                         } else {
                           done_and_cleanup(fi_status);
@@ -283,8 +293,10 @@ void CollectiveParamResolverDistributed::UpdateInstanceCache(
   using InstanceRecPointer = InstanceRec*;
   InstanceRecPointer* irp = new InstanceRecPointer(nullptr);
   int32 source_rank = resp.source_rank();
+  string communicator_key = resp.communicator_key();
 
-  auto continue_with_ir = [this, cp, irp, source_rank, done](const Status& s) {
+  auto continue_with_ir = [cp, irp, source_rank, communicator_key,
+                           done](const Status& s) {
     if (!s.ok()) {
       done(s);
       delete irp;
@@ -306,6 +318,19 @@ void CollectiveParamResolverDistributed::UpdateInstanceCache(
         }
         ir->source_rank = source_rank;
       }
+      if (ir->communicator_key != communicator_key) {
+        if (!ir->communicator_key.empty()) {
+          ir->status = errors::Internal(
+              "UpdateInstanceCache: CompleteInstanceResponse for instance ",
+              cp->instance.instance_key,
+              " gives communicator_key with size =", communicator_key.size(),
+              " but cache already holds communicator_key with size=",
+              ir->communicator_key.size());
+          status = ir->status;
+          break;
+        }
+        ir->communicator_key = communicator_key;
+      }
       if (ir->known_count < cp->group.group_size) {
         ir->known_count = cp->group.group_size;
         if (ir->known.size() != cp->group.group_size) {
diff --git a/tensorflow/core/distributed_runtime/collective_param_resolver_distributed_test.cc b/tensorflow/core/distributed_runtime/collective_param_resolver_distributed_test.cc
index 40b18d321a1cb3fafeaa4b864e737f6d86695842..823d7d5eb980bcbc414e4683cc7fabd154e7c28d 100644
--- a/tensorflow/core/distributed_runtime/collective_param_resolver_distributed_test.cc
+++ b/tensorflow/core/distributed_runtime/collective_param_resolver_distributed_test.cc
@@ -268,6 +268,8 @@ class DeviceResDistTest : public ::testing::Test {
         EXPECT_EQ(cp_[idx].instance.device_names[idx], device_name);
         EXPECT_EQ(cp_[idx].instance.task_names[idx], task_name);
         if (idx > 0) {
+          EXPECT_EQ(cp_[0].instance.communicator_key,
+                    cp_[idx].instance.communicator_key);
           for (int i = 0; i < dev_count; ++i) {
             EXPECT_EQ(cp_[0].instance.device_names[i],
                       cp_[idx].instance.device_names[i]);
diff --git a/tensorflow/core/distributed_runtime/eager/BUILD b/tensorflow/core/distributed_runtime/eager/BUILD
index 55b2657e74ef5c2be8c1b0f11d4a00186e063e31..6f08943e2d2408d0623cca05dd4e69fb74783d5e 100644
--- a/tensorflow/core/distributed_runtime/eager/BUILD
+++ b/tensorflow/core/distributed_runtime/eager/BUILD
@@ -84,6 +84,7 @@ tf_cc_test(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:tensorflow",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core/common_runtime/eager:tensor_handle",
diff --git a/tensorflow/core/distributed_runtime/graph_mgr.cc b/tensorflow/core/distributed_runtime/graph_mgr.cc
index ee5823e314f777f758a6c0d8ef7129c4bbd2916c..144113a04309d8dde8b8eebdce4485f828732595 100644
--- a/tensorflow/core/distributed_runtime/graph_mgr.cc
+++ b/tensorflow/core/distributed_runtime/graph_mgr.cc
@@ -356,6 +356,12 @@ Status GraphMgr::RecvOutputs(const int64 step_id, NamedTensors* out) {
   Rendezvous* rendezvous = worker_env_->rendezvous_mgr->Find(step_id);
   Status s = RecvOutputsFromRendezvous(rendezvous, out, Rendezvous::Args());
   rendezvous->Unref();
+  if (!s.ok()) {
+    // Failing to fetch the outputs should not be possible, so rewrite the error
+    // status to an INTERNAL error.
+    s = errors::Internal("Failed to fetch outputs for step ", step_id,
+                         ". (Original error message: ", s.ToString(), ")");
+  }
   return s;
 }
 
@@ -451,7 +457,8 @@ void GraphMgr::ExecuteAsync(const string& handle, const int64 step_id,
       cancellation_manager,
       [item, rendezvous, ce_handle, done, start_time_usecs](const Status& s) {
         done(s);
-        UpdateGraphExecTime(Env::Default()->NowMicros() - start_time_usecs);
+        metrics::UpdateGraphExecTime(Env::Default()->NowMicros() -
+                                     start_time_usecs);
         rendezvous->Unref();
         item->Unref();
         delete ce_handle;
diff --git a/tensorflow/core/distributed_runtime/master.cc b/tensorflow/core/distributed_runtime/master.cc
index 269f620e42e61b67477f9d73336a6e8da63b2eff..fc8d2871ac770bcea9104a206acabcd44ebde77f 100644
--- a/tensorflow/core/distributed_runtime/master.cc
+++ b/tensorflow/core/distributed_runtime/master.cc
@@ -65,7 +65,8 @@ Master::Master(MasterEnv* env, double session_gc_seconds)
     : env_(env),
       last_1000_steps_(1000),
       step_count_(0),
-      session_gc_seconds_(session_gc_seconds) {
+      session_gc_seconds_(session_gc_seconds),
+      recent_request_ids_(10000) {
   // Right now, a master service must be co-located with a device.
   // Otherwise, fetches do not work.
   CHECK(!env->local_devices.empty());
@@ -510,6 +511,12 @@ void Master::ExtendSession(const ExtendSessionRequest* req,
 
 void Master::PartialRunSetup(const PartialRunSetupRequest* req,
                              PartialRunSetupResponse* resp, MyClosure done) {
+  Status s = recent_request_ids_.TrackUnique(req->request_id(),
+                                             "PartialRunSetup (Master)", *req);
+  if (!s.ok()) {
+    done(s);
+    return;
+  }
   auto session = FindMasterSession(req->session_handle());
   if (session == nullptr) {
     done(errors::Aborted("Session ", req->session_handle(), " is not found."));
@@ -525,6 +532,12 @@ void Master::PartialRunSetup(const PartialRunSetupRequest* req,
 
 void Master::RunStep(CallOptions* opts, const RunStepRequestWrapper* req,
                      MutableRunStepResponseWrapper* resp, MyClosure done) {
+  Status s = recent_request_ids_.TrackUnique(req->request_id(),
+                                             "RunStep (Master)", req);
+  if (!s.ok()) {
+    done(s);
+    return;
+  }
   auto start_time = env_->env->NowMicros();
   auto session = FindMasterSession(req->session_handle());
   if (session == nullptr) {
@@ -664,6 +677,12 @@ void Master::Reset(const ResetRequest* req, ResetResponse* resp,
 
 void Master::MakeCallable(const MakeCallableRequest* req,
                           MakeCallableResponse* resp, MyClosure done) {
+  Status s = recent_request_ids_.TrackUnique(req->request_id(),
+                                             "MakeCallable (Master)", *req);
+  if (!s.ok()) {
+    done(s);
+    return;
+  }
   auto session = FindMasterSession(req->session_handle());
   if (session == nullptr) {
     done(errors::Aborted("Session ", req->session_handle(), " is not found."));
@@ -681,6 +700,12 @@ void Master::MakeCallable(const MakeCallableRequest* req,
 
 void Master::RunCallable(CallOptions* opts, const RunCallableRequest* req,
                          RunCallableResponse* resp, MyClosure done) {
+  Status s = recent_request_ids_.TrackUnique(req->request_id(),
+                                             "RunCallable (Master)", *req);
+  if (!s.ok()) {
+    done(s);
+    return;
+  }
   auto session = FindMasterSession(req->session_handle());
   if (session == nullptr) {
     done(errors::Aborted("Session ", req->session_handle(), " is not found."));
diff --git a/tensorflow/core/distributed_runtime/master.h b/tensorflow/core/distributed_runtime/master.h
index dbb337fd484960fbd3bfe47d0bfe0497985de66f..0524582ac78846fe192e8de47419280c6dde6177 100644
--- a/tensorflow/core/distributed_runtime/master.h
+++ b/tensorflow/core/distributed_runtime/master.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/call_options.h"
 #include "tensorflow/core/distributed_runtime/master_env.h"
 #include "tensorflow/core/distributed_runtime/master_session.h"
+#include "tensorflow/core/distributed_runtime/recent_request_ids.h"
 #include "tensorflow/core/lib/core/notification.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/platform/macros.h"
@@ -95,6 +96,9 @@ class Master {
   // closed automatically.
   const double session_gc_seconds_;
 
+  // Used to track ids for incoming requests so we can detect duplicates.
+  RecentRequestIds recent_request_ids_;
+
   // Call CleanupAll on all workers.
   void CleanupWorkers(const ResetRequest& reset);
 
diff --git a/tensorflow/core/distributed_runtime/master_interface.h b/tensorflow/core/distributed_runtime/master_interface.h
index a8ae3cba3cdd3f02aae823d893e027b2bccae2c9..cde47fb9caf55f35db481fec8ae69ad6e6fcd8ed 100644
--- a/tensorflow/core/distributed_runtime/master_interface.h
+++ b/tensorflow/core/distributed_runtime/master_interface.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "tensorflow/core/distributed_runtime/call_options.h"
 #include "tensorflow/core/distributed_runtime/message_wrappers.h"
+#include "tensorflow/core/distributed_runtime/request_id.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/protobuf/master.pb.h"
@@ -66,7 +67,9 @@ class MasterInterface {
   // The message returned from this method must only be used in a
   // `RunStep()` call on the same `MasterInterface` instance.
   virtual MutableRunStepRequestWrapper* CreateRunStepRequest() {
-    return new MutableProtoRunStepRequest;
+    MutableProtoRunStepRequest* ret = new MutableProtoRunStepRequest;
+    ret->request_.set_request_id(GetUniqueRequestId());
+    return ret;
   }
 
   // Returns a response object for use in calls to
diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index bc8ba6e47d5c66eab72eacd1f4d9a65a4b9cae6c..2f14967656fc832550c310d4c2b7821061d11e75 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -292,8 +292,8 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
     if (tot >= 0.1 * 1048576.0) {
       bytes = strings::Printf("[%.1fMB] ", tot / 1048576.0);
     }
-    return strings::StrCat(bytes, stats.node_name(), " = ",
-                           details.type_string, details.detail_text);
+    return strings::StrCat(bytes, stats.node_name(), " = ", details.type_string,
+                           details.detail_text);
   }
 
   // Send/Recv nodes that are the result of client-added
@@ -512,25 +512,25 @@ class RunManyGraphs {
     if (resp->status_code() != error::Code::OK) {
       // resp->status_code will only be non-OK if s.ok().
       mutex_lock l(mu_);
-      UpdateStatusLocked(
+      ReportBadStatus(
           Status(resp->status_code(), resp->status_error_message()));
     } else if (!s.ok()) {
       mutex_lock l(mu_);
-      UpdateStatusLocked(s);
+      ReportBadStatus(s);
     }
     pending_.DecrementCount();
   }
 
   void StartCancel() {
     mutex_lock l(mu_);
-    UpdateStatusLocked(errors::Cancelled("RunManyGraphs"));
+    ReportBadStatus(errors::Cancelled("RunManyGraphs"));
   }
 
   void Wait() { pending_.Wait(); }
 
   Status status() const {
     mutex_lock l(mu_);
-    return status_;
+    return status_group_.as_status();
   }
 
  private:
@@ -538,15 +538,17 @@ class RunManyGraphs {
 
   BlockingCounter pending_;
   mutable mutex mu_;
-  Status status_ GUARDED_BY(mu_);
+  StatusGroup status_group_ GUARDED_BY(mu_);
 
-  void UpdateStatusLocked(const Status& s) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-    if (status_.ok()) {
-      status_ = s;
+  void ReportBadStatus(const Status& s) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    // Start cancellation if we aren't already in an error state.
+    if (status_group_.ok()) {
       for (Call& call : calls_) {
         call.opts.StartCancel();
       }
     }
+
+    status_group_.Update(s);
   }
 
   TF_DISALLOW_COPY_AND_ASSIGN(RunManyGraphs);
@@ -1079,17 +1081,18 @@ void CopyAndSortStrings(size_t size,
 }  // namespace
 
 void BuildBuildGraphOptions(const RunStepRequestWrapper& req,
+                            const ConfigProto& config,
                             BuildGraphOptions* opts) {
   CallableOptions* callable_opts = &opts->callable_options;
-  CopyAndSortStrings(req.num_feeds(),
-                     [&req](size_t i) { return req.feed_name(i); },
-                     callable_opts->mutable_feed());
-  CopyAndSortStrings(req.num_fetches(),
-                     [&req](size_t i) { return req.fetch_name(i); },
-                     callable_opts->mutable_fetch());
-  CopyAndSortStrings(req.num_targets(),
-                     [&req](size_t i) { return req.target_name(i); },
-                     callable_opts->mutable_target());
+  CopyAndSortStrings(
+      req.num_feeds(), [&req](size_t i) { return req.feed_name(i); },
+      callable_opts->mutable_feed());
+  CopyAndSortStrings(
+      req.num_fetches(), [&req](size_t i) { return req.fetch_name(i); },
+      callable_opts->mutable_fetch());
+  CopyAndSortStrings(
+      req.num_targets(), [&req](size_t i) { return req.target_name(i); },
+      callable_opts->mutable_target());
 
   if (!req.options().debug_options().debug_tensor_watch_opts().empty()) {
     *callable_opts->mutable_run_options()->mutable_debug_options() =
@@ -1098,19 +1101,25 @@ void BuildBuildGraphOptions(const RunStepRequestWrapper& req,
 
   opts->collective_graph_key =
       req.options().experimental().collective_graph_key();
+  if (config.experimental().collective_deterministic_sequential_execution()) {
+    opts->collective_order = GraphCollectiveOrder::kEdges;
+  } else if (config.experimental().collective_nccl()) {
+    opts->collective_order = GraphCollectiveOrder::kAttrs;
+  }
 }
 
 void BuildBuildGraphOptions(const PartialRunSetupRequest& req,
                             BuildGraphOptions* opts) {
   CallableOptions* callable_opts = &opts->callable_options;
-  CopyAndSortStrings(req.feed_size(), [&req](size_t i) { return req.feed(i); },
-                     callable_opts->mutable_feed());
-  CopyAndSortStrings(req.fetch_size(),
-                     [&req](size_t i) { return req.fetch(i); },
-                     callable_opts->mutable_fetch());
-  CopyAndSortStrings(req.target_size(),
-                     [&req](size_t i) { return req.target(i); },
-                     callable_opts->mutable_target());
+  CopyAndSortStrings(
+      req.feed_size(), [&req](size_t i) { return req.feed(i); },
+      callable_opts->mutable_feed());
+  CopyAndSortStrings(
+      req.fetch_size(), [&req](size_t i) { return req.fetch(i); },
+      callable_opts->mutable_fetch());
+  CopyAndSortStrings(
+      req.target_size(), [&req](size_t i) { return req.target(i); },
+      callable_opts->mutable_target());
 
   // TODO(cais): Add TFDBG support to partial runs.
 }
@@ -1850,7 +1859,7 @@ Status MasterSession::DoRunWithLocalExecution(
 
   // Prepare.
   BuildGraphOptions bgopts;
-  BuildBuildGraphOptions(req, &bgopts);
+  BuildBuildGraphOptions(req, session_opts_.config, &bgopts);
   ReffedClientGraph* rcg = nullptr;
   int64 count;
   TF_RETURN_IF_ERROR(StartStep(bgopts, false, &rcg, &count));
diff --git a/tensorflow/core/distributed_runtime/message_wrappers.cc b/tensorflow/core/distributed_runtime/message_wrappers.cc
index 40bf564cab6fe465ed66639f42fe0daeb149f132..c9bc558964c2e7a704c5e2e9f52a1f794065a7d3 100644
--- a/tensorflow/core/distributed_runtime/message_wrappers.cc
+++ b/tensorflow/core/distributed_runtime/message_wrappers.cc
@@ -97,6 +97,10 @@ bool InMemoryRunStepRequest::store_errors_in_response_body() const {
   return store_errors_in_response_body_;
 }
 
+int64 InMemoryRunStepRequest::request_id() const {
+  return 0;  // no need to track request id for local version.
+}
+
 void InMemoryRunStepRequest::set_store_errors_in_response_body(
     bool store_errors) {
   store_errors_in_response_body_ = store_errors;
@@ -210,6 +214,10 @@ void MutableProtoRunStepRequest::set_store_errors_in_response_body(
   request_.set_store_errors_in_response_body(store_errors);
 }
 
+int64 MutableProtoRunStepRequest::request_id() const {
+  return request_.request_id();
+}
+
 string MutableProtoRunStepRequest::DebugString() const {
   return request_.DebugString();
 }
@@ -272,6 +280,8 @@ bool ProtoRunStepRequest::store_errors_in_response_body() const {
   return request_->store_errors_in_response_body();
 }
 
+int64 ProtoRunStepRequest::request_id() const { return request_->request_id(); }
+
 string ProtoRunStepRequest::DebugString() const {
   return request_->DebugString();
 }
diff --git a/tensorflow/core/distributed_runtime/message_wrappers.h b/tensorflow/core/distributed_runtime/message_wrappers.h
index 474ac0e186a203464ff64e1cbea2b4faaf87b05b..2cdbd1bfaf1be1fb646926ae82488f88377d491d 100644
--- a/tensorflow/core/distributed_runtime/message_wrappers.h
+++ b/tensorflow/core/distributed_runtime/message_wrappers.h
@@ -87,6 +87,8 @@ class RunStepRequestWrapper {
   // truncate long metadata messages.
   virtual bool store_errors_in_response_body() const = 0;
 
+  virtual int64 request_id() const = 0;
+
   // Returns a human-readable representation of this message for debugging.
   virtual string DebugString() const = 0;
 
@@ -127,6 +129,7 @@ class InMemoryRunStepRequest : public MutableRunStepRequestWrapper {
   string DebugString() const override;
   const RunStepRequest& ToProto() const override;
   bool store_errors_in_response_body() const override;
+  int64 request_id() const override;
 
   // MutableRunStepRequestWrapper methods.
   void set_session_handle(const string& handle) override;
@@ -177,6 +180,7 @@ class MutableProtoRunStepRequest : public MutableRunStepRequestWrapper {
   string DebugString() const override;
   const RunStepRequest& ToProto() const override;
   bool store_errors_in_response_body() const override;
+  int64 request_id() const override;
 
   // MutableRunStepRequestWrapper methods.
   void set_session_handle(const string& handle) override;
@@ -189,6 +193,7 @@ class MutableProtoRunStepRequest : public MutableRunStepRequestWrapper {
 
  private:
   RunStepRequest request_;
+  friend class MasterInterface;
 };
 
 // Wrapper for immutable RunStep requests that use a non-owned
@@ -216,6 +221,7 @@ class ProtoRunStepRequest : public RunStepRequestWrapper {
   string DebugString() const override;
   const RunStepRequest& ToProto() const override;
   bool store_errors_in_response_body() const override;
+  int64 request_id() const override;
 
  private:
   const RunStepRequest* const request_;  // Not owned.
@@ -234,7 +240,7 @@ class ProtoRunStepRequest : public RunStepRequestWrapper {
 //
 ////////////////////////////////////////////////////////////////////////////////
 
-// Abstract interface for an immutable RunStepRequest message.
+// Abstract interface for an immutable RunGraphRequest message.
 //
 // This interface is typically used by server-side components in the
 // TensorFlow worker.
diff --git a/tensorflow/core/distributed_runtime/partial_run_mgr_test.cc b/tensorflow/core/distributed_runtime/partial_run_mgr_test.cc
index 5f7c0cb3cae7c97fac4b4c335a617687f31bd3b5..a2b799c3e42bf5609a37edf89fdbb99a96856a68 100644
--- a/tensorflow/core/distributed_runtime/partial_run_mgr_test.cc
+++ b/tensorflow/core/distributed_runtime/partial_run_mgr_test.cc
@@ -139,7 +139,7 @@ TEST_P(StatusPropagationTest, PartialRunDoneFirst) {
 // ExecutorDone and PartialRunDone.
 Status ExecutorError() { return errors::Internal("executor error"); }
 Status PartialRunError() { return errors::Internal("partial run error"); }
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     PartialRunMgr, StatusPropagationTest,
     ::testing::Values(
         StatusTestParam{Status::OK(), Status::OK(), Status::OK()},
diff --git a/tensorflow/core/distributed_runtime/recent_request_ids.cc b/tensorflow/core/distributed_runtime/recent_request_ids.cc
index 4f6866c5d154ba023b0923af67fe00a7a69b459d..2c953e12c06287e88b1a68bfab48a7234207046a 100644
--- a/tensorflow/core/distributed_runtime/recent_request_ids.cc
+++ b/tensorflow/core/distributed_runtime/recent_request_ids.cc
@@ -28,12 +28,10 @@ RecentRequestIds::RecentRequestIds(int num_tracked_request_ids)
   set_.reserve(num_tracked_request_ids);
 }
 
-Status RecentRequestIds::TrackUnique(int64 request_id,
-                                     const string& method_name,
-                                     const protobuf::Message& request) {
+bool RecentRequestIds::Insert(int64 request_id) {
   if (request_id == 0) {
     // For backwards compatibility, allow all requests with request_id 0.
-    return Status::OK();
+    return true;
   }
 
   mutex_lock l(mu_);
@@ -43,9 +41,7 @@ Status RecentRequestIds::TrackUnique(int64 request_id,
     // request_id's age in the circular_buffer_ if it's tracked again. Strict
     // LRU is not useful here because returning this error will close the
     // current Session.
-    return errors::Aborted("The same ", method_name,
-                           " request was received twice. ",
-                           request.ShortDebugString());
+    return false;
   }
 
   // Remove the oldest request_id from the set_. circular_buffer_ is
@@ -54,7 +50,30 @@ Status RecentRequestIds::TrackUnique(int64 request_id,
   set_.erase(circular_buffer_[next_index_]);
   circular_buffer_[next_index_] = request_id;
   next_index_ = (next_index_ + 1) % circular_buffer_.size();
-  return Status::OK();
+  return true;
+}
+
+Status RecentRequestIds::TrackUnique(int64 request_id,
+                                     const string& method_name,
+                                     const protobuf::Message& request) {
+  if (Insert(request_id)) {
+    return Status::OK();
+  } else {
+    return errors::Aborted("The same ", method_name,
+                           " request was received twice. ",
+                           request.ShortDebugString());
+  }
+}
+Status RecentRequestIds::TrackUnique(int64 request_id,
+                                     const string& method_name,
+                                     const RunStepRequestWrapper* wrapper) {
+  if (Insert(request_id)) {
+    return Status::OK();
+  } else {
+    return errors::Aborted("The same ", method_name,
+                           " request was received twice. ",
+                           wrapper->ToProto().ShortDebugString());
+  }
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/recent_request_ids.h b/tensorflow/core/distributed_runtime/recent_request_ids.h
index 11cf937c94659d85e3dc88350f20e107a27fab62..4094fcbde72ae97fcc5655a030fdf69426b093c8 100644
--- a/tensorflow/core/distributed_runtime/recent_request_ids.h
+++ b/tensorflow/core/distributed_runtime/recent_request_ids.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
+#include "tensorflow/core/distributed_runtime/message_wrappers.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/protobuf.h"
@@ -58,8 +59,13 @@ class RecentRequestIds {
   // ShortDebugString are added to returned errors.
   Status TrackUnique(int64 request_id, const string& method_name,
                      const protobuf::Message& request);
+  // Overloaded versions of the above function for wrapped protos.
+  Status TrackUnique(int64 request_id, const string& method_name,
+                     const RunStepRequestWrapper* wrapper);
 
  private:
+  bool Insert(int64 request_id);
+
   mutex mu_;
   // next_index_ indexes into circular_buffer_, and points to the next storage
   // space to use. When the buffer is full, next_index_ points at the oldest
diff --git a/tensorflow/core/distributed_runtime/rpc/BUILD b/tensorflow/core/distributed_runtime/rpc/BUILD
index 273709a01fd799f7f4aa8afc80d3bdfc48d36322..dd22e74ac5466ea675487ee8fbed92164bf38c78 100644
--- a/tensorflow/core/distributed_runtime/rpc/BUILD
+++ b/tensorflow/core/distributed_runtime/rpc/BUILD
@@ -46,6 +46,7 @@ cc_library(
         "//tensorflow/core:lib",
         # Required to be able to overload TensorResponse parsing.
         "//tensorflow/core/distributed_runtime:tensor_coding",
+        "//tensorflow/core:lib_internal",
     ],
 )
 
@@ -159,6 +160,16 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "grpc_response_cache",
+    srcs = ["grpc_response_cache.cc"],
+    hdrs = ["grpc_response_cache.h"],
+    deps = [
+        ":grpc_util",
+        "//tensorflow/core:lib",
+    ],
+)
+
 tf_cuda_library(
     name = "grpc_worker_service",
     srcs = ["grpc_worker_service.cc"],
@@ -166,6 +177,7 @@ tf_cuda_library(
     deps = [
         ":async_service_interface",
         ":grpc_call",
+        ":grpc_response_cache",
         ":grpc_tensor_coding",
         ":grpc_util",
         ":grpc_worker_service_impl",
@@ -183,6 +195,7 @@ tf_cuda_library(
         "//tensorflow/core/distributed_runtime:worker_cache",
         "//tensorflow/core/distributed_runtime:worker_env",
         "//tensorflow/core/distributed_runtime:worker_session",
+        "@com_google_absl//absl/container:flat_hash_map",
     ],
 )
 
@@ -313,9 +326,15 @@ tf_cc_binary(
         ":grpc_server_lib",
         "//tensorflow:grpc++",
         "//tensorflow/core:core_cpu",
+        "//tensorflow/core:data_flow_ops_op_lib",
         "//tensorflow/core:framework_internal",
+        "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lookup_ops_op_lib",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:no_op_op_lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:sendrecv_ops_op_lib",
         "//tensorflow/core/distributed_runtime:server_lib",
         "//tensorflow/core/kernels:data_flow",
     ],
@@ -330,10 +349,14 @@ tf_cc_binary(
     deps = [
         ":grpc_server_lib",
         "//tensorflow:grpc++",
+        "//tensorflow/core:array_ops_op_lib",
+        "//tensorflow/core:bitwise_ops_op_lib",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
+        "//tensorflow/core:nn_ops_op_lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:state_ops_op_lib",
         "//tensorflow/core:testlib",
         "//tensorflow/core/distributed_runtime:server_lib",
         "//tensorflow/core/kernels:constant_op",
@@ -384,6 +407,7 @@ cc_library(
         "//tensorflow/core/distributed_runtime:local_master",
         "//tensorflow/core/distributed_runtime:master_interface",
         "//tensorflow/core/distributed_runtime:message_wrappers",
+        "//tensorflow/core/distributed_runtime:request_id",
     ],
     alwayslink = 1,
 )
@@ -473,6 +497,7 @@ tf_cuda_cc_test(
         "//tensorflow/core:lib",
         "//tensorflow/core:master_proto_cc",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:state_ops_op_lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc b/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc
index 781b7d65cdd184363d7c7650305bd62f3129c271..64c221805b072313fc9fba20fa9cdefe8cea9bfc 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc
@@ -62,7 +62,7 @@ Status ValidateHostPortPair(const string& host_port) {
   args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH, std::numeric_limits<int32>::max());
   // NOTE(mrry): Some versions of gRPC use a 20-second minimum backoff
   // on connection failure, which makes our tests time out.
-  args.SetInt("grpc.testing.fixed_reconnect_backoff_ms", 1000);
+  args.SetInt(GRPC_ARG_MAX_RECONNECT_BACKOFF_MS, 1000);
   if (rpc_options != nullptr) {
     if (rpc_options->compression_algorithm() == "deflate") {
       args.SetCompressionAlgorithm(GRPC_COMPRESS_DEFLATE);
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
index b832a2115cb809b5561fc55ab8d9057f2274dcd8..a84559098a3ad59cd1cabe91bb8546194da105e5 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/tracing.h"
 #include "tensorflow/core/protobuf/master.pb.h"
 
@@ -43,104 +44,139 @@ class GrpcRemoteMaster : public MasterInterface {
   Status CreateSession(CallOptions* call_options,
                        const CreateSessionRequest* request,
                        CreateSessionResponse* response) override {
-    ::grpc::ClientContext ctx;
-    return Call(&ctx, call_options, request, response,
-                &MasterServiceStub::CreateSession);
+    return CallWithRetry(call_options, request, response,
+                         &MasterServiceStub::CreateSession);
   }
 
   Status ExtendSession(CallOptions* call_options,
                        const ExtendSessionRequest* request,
                        ExtendSessionResponse* response) override {
-    ::grpc::ClientContext ctx;
-    return Call(&ctx, call_options, request, response,
-                &MasterServiceStub::ExtendSession);
+    return CallWithRetry(call_options, request, response,
+                         &MasterServiceStub::ExtendSession);
   }
 
   Status PartialRunSetup(CallOptions* call_options,
                          const PartialRunSetupRequest* request,
                          PartialRunSetupResponse* response) override {
-    ::grpc::ClientContext ctx;
-    return Call(&ctx, call_options, request, response,
-                &MasterServiceStub::PartialRunSetup);
+    return CallWithRetry(call_options, request, response,
+                         &MasterServiceStub::PartialRunSetup);
   }
 
   Status RunStep(CallOptions* call_options, RunStepRequestWrapper* request,
                  MutableRunStepResponseWrapper* response) override {
-    ::grpc::ClientContext ctx;
-    auto trace = TraceRpc("RunStep/Client", &ctx);
-    return Call(&ctx, call_options, &request->ToProto(),
-                get_proto_from_wrapper(response), &MasterServiceStub::RunStep);
+    return CallWithRetry(call_options, &request->ToProto(),
+                         get_proto_from_wrapper(response),
+                         &MasterServiceStub::RunStep, "RunStep/Client");
   }
 
   Status CloseSession(CallOptions* call_options,
                       const CloseSessionRequest* request,
                       CloseSessionResponse* response) override {
-    ::grpc::ClientContext ctx;
-    return Call(&ctx, call_options, request, response,
-                &MasterServiceStub::CloseSession);
+    return CallWithRetry(call_options, request, response,
+                         &MasterServiceStub::CloseSession);
   }
 
   Status ListDevices(CallOptions* call_options,
                      const ListDevicesRequest* request,
                      ListDevicesResponse* response) override {
-    ::grpc::ClientContext ctx;
-    return Call(&ctx, call_options, request, response,
-                &MasterServiceStub::ListDevices);
+    return CallWithRetry(call_options, request, response,
+                         &MasterServiceStub::ListDevices);
   }
 
   Status Reset(CallOptions* call_options, const ResetRequest* request,
                ResetResponse* response) override {
-    ::grpc::ClientContext ctx;
-    return Call(&ctx, call_options, request, response,
-                &MasterServiceStub::Reset);
+    return CallWithRetry(call_options, request, response,
+                         &MasterServiceStub::Reset);
   }
 
   Status MakeCallable(CallOptions* call_options,
                       const MakeCallableRequest* request,
                       MakeCallableResponse* response) override {
-    ::grpc::ClientContext ctx;
-    return Call(&ctx, call_options, request, response,
-                &MasterServiceStub::MakeCallable);
+    return CallWithRetry(call_options, request, response,
+                         &MasterServiceStub::MakeCallable);
   }
   Status RunCallable(CallOptions* call_options,
                      const RunCallableRequest* request,
                      RunCallableResponse* response) override {
-    ::grpc::ClientContext ctx;
-    return Call(&ctx, call_options, request, response,
-                &MasterServiceStub::RunCallable);
+    return CallWithRetry(call_options, request, response,
+                         &MasterServiceStub::RunCallable);
   }
   Status ReleaseCallable(CallOptions* call_options,
                          const ReleaseCallableRequest* request,
                          ReleaseCallableResponse* response) override {
-    ::grpc::ClientContext ctx;
-    return Call(&ctx, call_options, request, response,
-                &MasterServiceStub::ReleaseCallable);
+    return CallWithRetry(call_options, request, response,
+                         &MasterServiceStub::ReleaseCallable);
   }
 
  private:
   // Start tracing, attaching a unique ID to both the trace and the RPC.
-  tracing::ScopedActivity TraceRpc(StringPiece name,
-                                   ::grpc::ClientContext* ctx) {
+  tracing::ScopedActivity* NewTraceRpc(StringPiece name,
+                                       ::grpc::ClientContext* ctx) {
     string trace_id = strings::StrCat(tracing::GetUniqueArg());
     ctx->AddMetadata(GrpcIdKey(), trace_id);
-    return tracing::ScopedActivity(name, trace_id);
-  }
-
-  void SetDeadline(::grpc::ClientContext* ctx, int64 time_in_ms) {
-    if (time_in_ms > 0) {
-      ctx->set_deadline(gpr_time_from_millis(time_in_ms, GPR_TIMESPAN));
-    }
+    return new tracing::ScopedActivity(name, trace_id);
   }
 
   template <typename Request, typename Response>
-  Status Call(::grpc::ClientContext* ctx, CallOptions* call_options,
-              const Request* request, Response* response,
-              ::grpc::Status (MasterServiceStub::*pfunc)(::grpc::ClientContext*,
-                                                         const Request&,
-                                                         Response*)) {
-    ctx->set_fail_fast(false);
-    SetDeadline(ctx, call_options->GetTimeout());
-    return FromGrpcStatus((stub_.get()->*pfunc)(ctx, *request, response));
+  Status CallWithRetry(CallOptions* call_options, const Request* request,
+                       Response* response,
+                       ::grpc::Status (MasterServiceStub::*pfunc)(
+                           ::grpc::ClientContext*, const Request&, Response*),
+                       string trace_string = {}) {
+    int64 timeout_in_ms = call_options->GetTimeout();
+    int64 expired_time_micros = Env::Default()->NowMicros();
+    if (timeout_in_ms > 0) {
+      expired_time_micros += (timeout_in_ms / 1000.);
+    }
+    Status s;
+    for (int num_retries = 0;; ++num_retries) {
+      ::grpc::ClientContext ctx;
+      std::unique_ptr<tracing::ScopedActivity> trace;
+      if (!trace_string.empty()) {
+        trace.reset(NewTraceRpc(trace_string, &ctx));
+      }
+      ctx.set_fail_fast(false);
+      if (timeout_in_ms > 0) {
+        // We do not modify the timeout here to match legacy behavior. However,
+        // this could violate the contract of tensorflow::Session. If we retry
+        // an RPC just before the deadline is exceeded, we will still set the
+        // timeout to the original value. This leads to the overall timeout
+        // being double what was expected.
+        // TODO(b/117162170): investigate fixing this behavior for legacy and
+        // gRPC RPC layers.
+        ctx.set_deadline(gpr_time_from_millis(timeout_in_ms, GPR_TIMESPAN));
+      }
+      s = FromGrpcStatus((stub_.get()->*pfunc)(&ctx, *request, response));
+      if (!errors::IsUnavailable(s)) {
+        return s;
+      }
+      // TODO(b/117162170): we may want to make this configurable.
+      constexpr int kMaxRetries = 10;
+      LOG(WARNING) << "RPC failed with status = \"" << s
+                   << "\" and grpc_error_string = \""
+                   << ctx.debug_error_string() << "\", maybe retrying the RPC";
+      if (num_retries >= kMaxRetries) {
+        LOG(WARNING) << "Too many retries, returning last status: " << s;
+        return s;
+      }
+      const int64 now_micros = Env::Default()->NowMicros();
+      const int64 deadline_with_backoff_micros =
+          now_micros + ComputeBackoffMicroseconds(num_retries);
+      // Wait for a short period of time before retrying the RPC.  If our
+      // backoff would put us past the RPC deadline, we truncate it to ensure
+      // our RPC starts before the deadline.
+      const auto backoff_until =
+          (timeout_in_ms <= 0 ||
+           expired_time_micros > deadline_with_backoff_micros)
+              ? deadline_with_backoff_micros
+              : expired_time_micros;
+      Env::Default()->SleepForMicroseconds(backoff_until - now_micros);
+      if (Env::Default()->NowMicros() > expired_time_micros &&
+          timeout_in_ms > 0) {
+        // If timeout_in_ms is set, exit the retry loop on timeout.
+        return errors::DeadlineExceeded(ctx.debug_error_string());
+      }
+    }
   }
 
   std::unique_ptr<MasterServiceStub> stub_;
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
index 2daefcb399c79324f80278340967b679be5c6574..2479e7368be8de810db36cb18c887977aeae2472 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
@@ -39,6 +39,8 @@ limitations under the License.
 
 namespace tensorflow {
 
+const int kMaxWorkerRpcRetries = 10;
+
 class GrpcRemoteWorker : public WorkerInterface {
  public:
   explicit GrpcRemoteWorker(SharedGrpcChannelPtr channel,
@@ -259,17 +261,19 @@ class GrpcRemoteWorker : public WorkerInterface {
   // given callback, `done`, will be called when the RPC completes.
   void IssueRequest(const protobuf::Message* request,
                     protobuf::Message* response, const ::grpc::string& method,
-                    StatusCallback done, CallOptions* call_opts = nullptr) {
+                    StatusCallback done, CallOptions* call_opts = nullptr,
+                    int max_retries = kMaxWorkerRpcRetries) {
     new RPCState<protobuf::Message>(&stub_, cq_, method, *request, response,
                                     std::move(done), call_opts,
-                                    callback_threadpool_);
+                                    callback_threadpool_, max_retries);
   }
   void IssueRequest(const protobuf::Message* request, TensorResponse* response,
                     const ::grpc::string& method, StatusCallback done,
-                    CallOptions* call_opts = nullptr) {
+                    CallOptions* call_opts = nullptr,
+                    int max_retries = kMaxWorkerRpcRetries) {
     new RPCState<TensorResponse>(&stub_, cq_, method, *request, response,
                                  std::move(done), call_opts,
-                                 callback_threadpool_);
+                                 callback_threadpool_, max_retries);
   }
 
   // Helper function for initializing the RpcMethod objects below.
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_response_cache.cc b/tensorflow/core/distributed_runtime/rpc/grpc_response_cache.cc
new file mode 100644
index 0000000000000000000000000000000000000000..613c290905d4e8914761b130d9353536023f5856
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_response_cache.cc
@@ -0,0 +1,183 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/distributed_runtime/rpc/grpc_response_cache.h"
+#include "tensorflow/core/platform/env.h"
+
+namespace tensorflow {
+
+struct WorkerCacheEntry {
+  enum class State {
+    PENDING = 0,
+    ACTIVE = 1,
+    FINISHED = 2,
+  };
+
+  State state = State::PENDING;
+  int64 expires_seconds;
+
+  ::grpc::ByteBuffer response_buf;
+  Status response_status;
+
+  // Additional retries may arrive while a request is still executing.  The
+  // callbacks for these calls are queued in `callbacks` and evaluated after
+  // the original request is completed.
+  std::vector<std::pair<RPCResponse, StatusCallback>> callbacks;
+};
+
+void RPCResponse::Encode(::grpc::ByteBuffer* tgt) const {
+  if (buf_ != nullptr) {
+    *tgt = *buf_;
+  } else {
+    CHECK(msg_ != nullptr);
+    ::grpc::Slice slice(msg_->ByteSizeLong());
+    msg_->SerializeWithCachedSizesToArray(
+        const_cast<uint8*>(reinterpret_cast<const uint8*>(slice.begin())));
+    ::grpc::ByteBuffer tmp(&slice, 1);
+    tgt->Swap(&tmp);
+  }
+}
+
+void RPCResponse::CopyFrom(const ::grpc::ByteBuffer& src) {
+  if (buf_ != nullptr) {
+    *buf_ = src;
+    return;
+  }
+
+  CHECK(msg_ != nullptr);
+  // We create a single slice when encoding protocol messages.
+  std::vector<::grpc::Slice> slices;
+  if (src.Dump(&slices).ok()) {
+    msg_->ParseFromArray(slices[0].begin(), slices[0].size());
+  } else {
+    LOG(ERROR) << "Failed to decode cached buffer.";
+  }
+}
+
+void GrpcResponseCache::LookupOrCompute(const string& key, RPCResponse response,
+                                        ComputeFunc compute_func,
+                                        StatusCallback done_cb) {
+  VLOG(1) << "Lookup " << key;
+  std::shared_ptr<WorkerCacheEntry> req;
+  MaybeCleanup();
+  {
+    mutex_lock m(mu_);
+
+    if (requests_.find(key) != requests_.end()) {
+      req = requests_[key];
+    } else {
+      req.reset(new WorkerCacheEntry);
+      requests_[key] = req;
+    }
+
+    if (req->state == WorkerCacheEntry::State::FINISHED) {
+      if (req->expires_seconds > Env::Default()->NowSeconds()) {
+        VLOG(1) << "Reuse cached response for " << key;
+        response.CopyFrom(req->response_buf);
+        done_cb(req->response_status);
+        return;
+      }
+      VLOG(1) << "Found expired cache entry for " << key;
+      req->state = WorkerCacheEntry::State::PENDING;
+      req->response_buf.Clear();
+    }
+
+    req->callbacks.push_back(std::make_pair(response, done_cb));
+
+    if (req->state == WorkerCacheEntry::State::ACTIVE) {
+      VLOG(1) << "Found active request for " << key
+              << ".  Adding entry to response queue.";
+      return;
+    }
+
+    VLOG(2) << "No cache entry for " << key << ", running user computation.";
+    req->state = WorkerCacheEntry::State::ACTIVE;
+    req->expires_seconds = Env::Default()->NowSeconds() + expire_time_seconds_;
+  }
+
+  compute_func([this, key, req, response](Status status) {
+    mutex_lock m(mu_);
+    response.Encode(&req->response_buf);
+    current_bytes_ += req->response_buf.Length();
+
+    req->response_status = status;
+    req->state = WorkerCacheEntry::State::FINISHED;
+
+    VLOG(1) << "Operation for " << key << " finished. "
+            << "Status: " << status << ", " << req->response_buf.Length()
+            << " response bytes, " << req->callbacks.size()
+            << " pending callbacks.";
+    for (auto& cb : req->callbacks) {
+      cb.first.CopyFrom(req->response_buf);
+      cb.second(req->response_status);
+    }
+    req->callbacks.clear();
+  });
+}
+
+// Remove all stale or expired cache entries if the cache is full.
+void GrpcResponseCache::MaybeCleanup() {
+  mutex_lock m(mu_);
+  if (current_bytes_ < max_bytes_) {
+    return;
+  }
+
+  VLOG(1) << "Cleanup: " << current_bytes_ << " -> " << max_bytes_;
+  std::vector<std::pair<string, std::shared_ptr<WorkerCacheEntry>>>
+      ordered_entries;
+  ordered_entries.reserve(requests_.size());
+  for (const auto& p : requests_) {
+    ordered_entries.push_back(std::make_pair(p.first, p.second));
+  }
+
+  std::sort(ordered_entries.begin(), ordered_entries.end(),
+            [](const std::pair<string, std::shared_ptr<WorkerCacheEntry>>& a,
+               const std::pair<string, std::shared_ptr<WorkerCacheEntry>>& b) {
+              return a.second->expires_seconds > b.second->expires_seconds;
+            });
+
+  std::unordered_map<string, std::shared_ptr<WorkerCacheEntry>> kept;
+  int64 now = Env::Default()->NowSeconds();
+  int64 bytes_used = 0;
+
+  // Always keep active requests.
+  for (auto& pair : ordered_entries) {
+    if (pair.second->state != WorkerCacheEntry::State::FINISHED) {
+      kept.insert(pair);
+    }
+  }
+
+  // Keep unexpired, finished requests up to half of max_bytes_.  This reduces
+  // chances of overfilling the cache when active requests complete and
+  // amortizes cache cleanup cost.
+  for (auto& pair : ordered_entries) {
+    if (pair.second->expires_seconds < now || bytes_used >= max_bytes_ / 2) {
+      break;
+    }
+
+    if (pair.second->state == WorkerCacheEntry::State::FINISHED) {
+      kept.insert(pair);
+      bytes_used += pair.second->response_buf.Length();
+    }
+  }
+
+  VLOG(1) << "Cleaned cache.  Bytes used: " << current_bytes_ << " -> "
+          << bytes_used << ". Cache size: " << requests_.size() << " -> "
+          << kept.size();
+  current_bytes_ = bytes_used;
+  std::swap(requests_, kept);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_response_cache.h b/tensorflow/core/distributed_runtime/rpc/grpc_response_cache.h
new file mode 100644
index 0000000000000000000000000000000000000000..0892d9f788d165f11803c676717e63585ca808a2
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_response_cache.h
@@ -0,0 +1,91 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_RESPONSE_CACHE_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_RESPONSE_CACHE_H_
+
+#include <memory>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+// gRPC response caching.  Most WorkerService methods cannot be retried directly
+// as they will fail or deadlock.  To enable retrying, we can instead cache
+// responses for a short period of time and reply to duplicate requests from the
+// cache.
+namespace tensorflow {
+
+// Union type to aid caching of either raw buffers (for RecvTensor RPCs) and
+// protocol buffer messages (for all other RPCs).
+class RPCResponse {
+ public:
+  explicit RPCResponse() : buf_(nullptr), msg_(nullptr) {}
+  explicit RPCResponse(::grpc::ByteBuffer* b) : buf_(b), msg_(nullptr) {}
+  explicit RPCResponse(protobuf::Message* m) : buf_(nullptr), msg_(m) {}
+
+  // Encode this response into the target buffer.
+  void Encode(::grpc::ByteBuffer* tgt) const;
+
+  // Copy from `src`: if this is a buffer, make a shallow copy.
+  // For protocol messages, parse the response from `src`.
+  void CopyFrom(const ::grpc::ByteBuffer& src);
+
+ private:
+  ::grpc::ByteBuffer* buf_;
+  protobuf::Message* msg_;
+};
+
+typedef std::function<void(StatusCallback)> ComputeFunc;
+struct WorkerCacheEntry;
+
+// Track and cache the state of worker service RPCs.  An RPC can be in 3 states:
+//
+// * PENDING: this is the first call of the RPC, and it will transition to
+// * ACTIVE: another thread is active processing this RPC
+// * FINISHED: the worker has finished processing the method
+//
+// The response from completed RPCs are LRU cached until either `max_bytes`
+// bytes are in use by the cache or they expire (according to `expire_time`).
+class GrpcResponseCache {
+ public:
+  GrpcResponseCache(int64 max_bytes, int64 expire_time_seconds)
+      : max_bytes_(max_bytes), expire_time_seconds_(expire_time_seconds) {}
+
+  // Lookup the result for key.
+  // If it is finished, invoke `done_cb` immediately after filling `response`.
+  // If active, done_db will be invoked when the current call completes.
+  // Otherwise, invoke `compute_func` to fill the cache and invoke done_cb.
+  void LookupOrCompute(const string& key, RPCResponse response,
+                       ComputeFunc compute_func, StatusCallback done_cb);
+
+  // Remove all stale or expired cache entries if the cache is full.
+  void MaybeCleanup();
+
+ private:
+  int64 current_bytes_ GUARDED_BY(mu_) = 0;
+  const int64 max_bytes_;
+  const int64 expire_time_seconds_;
+
+  std::unordered_map<string, std::shared_ptr<WorkerCacheEntry>> requests_
+      GUARDED_BY(mu_);
+  mutex mu_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_RESPONSE_CACHE_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc
index 4f5975bbc11a6217355c1fcf368996a0fca45969..3635caf3d104760d9200497f6f25d3f0fdfde48c 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc
@@ -159,7 +159,7 @@ GrpcRPCFactory::ChannelPtr GrpcRPCFactory::CreateChannelForAddress(
 
   // Set a standard backoff timeout of 1s instead of the
   // (sometimes default) 20s.
-  args.SetInt("grpc.testing.fixed_reconnect_backoff_ms", 1000);
+  args.SetInt(GRPC_ARG_MAX_RECONNECT_BACKOFF_MS, 1000);
   return ::grpc::CreateCustomChannel(
       /*target=*/address, ::grpc::InsecureChannelCredentials(), args);
 }
@@ -210,7 +210,7 @@ void GrpcRPCFactory::StartCall(const Tensor& address_t, const Tensor& method_t,
       get_stub(index), &completion_queue_, *get_method_ptr(index),
       call->request(), call->response(),
       /*done=*/[call](const Status& s) { call->Done(s); }, call->call_opts(),
-      nullptr /*threadpool*/, fail_fast_, timeout_in_ms_);
+      nullptr /*threadpool*/, fail_fast_, timeout_in_ms_, 0 /* max_retries */);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
index cbd5cd927e7d73fd0ed28a910c89eef1f73b0d91..f087a39f019974a273b1f94fd13c7c3fad00ee29 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
@@ -110,12 +110,9 @@ GrpcServer::~GrpcServer() {
   // - worker_env_.compute_pool
 }
 
-Status GrpcServer::Init(
-    ServiceInitFunction service_func,
-    const RendezvousMgrCreationFunction& rendezvous_mgr_func,
-    const CollectiveMgrCreationFunction& collective_mgr_func,
-    const WorkerCreationFunction& worker_func,
-    const StatsPublisherFactory& stats_factory) {
+void GrpcServer::MaybeMutateBuilder(::grpc::ServerBuilder* builder) {}
+
+Status GrpcServer::Init(const GrpcServerOptions& opts) {
   mutex_lock l(mu_);
   CHECK_EQ(state_, NEW);
   master_env_.env = env_;
@@ -163,9 +160,9 @@ Status GrpcServer::Init(
   worker_env_.device_mgr = new DeviceMgr(std::move(devices));
   master_env_.local_devices = worker_env_.device_mgr->ListDevices();
   worker_env_.local_devices = worker_env_.device_mgr->ListDevices();
-  worker_env_.rendezvous_mgr = rendezvous_mgr_func == nullptr
+  worker_env_.rendezvous_mgr = opts.rendezvous_mgr_func == nullptr
                                    ? new RpcRendezvousMgr(&worker_env_)
-                                   : rendezvous_mgr_func(&worker_env_);
+                                   : opts.rendezvous_mgr_func(&worker_env_);
   string unused;
   string default_worker_name;
   if (!DeviceNameUtils::SplitDeviceName(master_env_.local_devices[0]->name(),
@@ -191,21 +188,23 @@ Status GrpcServer::Init(
   builder.AddListeningPort(strings::StrCat("0.0.0.0:", requested_port),
                            GetServerCredentials(server_def_), &bound_port_);
   builder.SetMaxMessageSize(std::numeric_limits<int32>::max());
+
   builder.SetOption(
       std::unique_ptr<::grpc::ServerBuilderOption>(new NoReusePortOption));
   // Allow subclasses to specify more args to pass to the gRPC server.
   MaybeMutateBuilder(&builder);
   master_impl_ = CreateMaster(&master_env_);
   master_service_ = NewGrpcMasterService(master_impl_.get(), config, &builder);
-  worker_impl_ = worker_func ? worker_func(&worker_env_, config)
-                             : NewGrpcWorker(&worker_env_, config);
-  worker_service_ =
-      NewGrpcWorkerService(worker_impl_.get(), &builder).release();
+  worker_impl_ = opts.worker_func ? opts.worker_func(&worker_env_, config)
+                                  : NewGrpcWorker(&worker_env_, config);
+  worker_service_ = NewGrpcWorkerService(worker_impl_.get(), &builder,
+                                         opts.worker_service_options)
+                        .release();
   eager_service_ = new eager::GrpcEagerServiceImpl(&worker_env_, &builder);
 
   // extra service:
-  if (service_func != nullptr) {
-    service_func(&worker_env_, &builder);
+  if (opts.service_func != nullptr) {
+    opts.service_func(&worker_env_, &builder);
   }
   server_ = builder.BuildAndStart();
 
@@ -219,9 +218,9 @@ Status GrpcServer::Init(
       WorkerCacheFactory(worker_cache_factory_options, &worker_cache));
   CHECK_NE(nullptr, worker_cache);
 
-  if (collective_mgr_func) {
+  if (opts.collective_mgr_func) {
     worker_env_.collective_executor_mgr =
-        collective_mgr_func(config, &worker_env_, worker_cache);
+        opts.collective_mgr_func(config, &worker_env_, worker_cache);
     if (!worker_env_.collective_executor_mgr) {
       return errors::Internal(
           "collective_mgr_func did not return CollectiveExecutorMgr");
@@ -253,6 +252,7 @@ Status GrpcServer::Init(
   master_env_.ops = OpRegistry::Global();
   master_env_.worker_cache = worker_cache;
   master_env_.collective_executor_mgr = worker_env_.collective_executor_mgr;
+  StatsPublisherFactory stats_factory = opts.stats_factory;
   master_env_.master_session_factory =
       [config, stats_factory](
           SessionOptions options, const MasterEnv* env,
@@ -279,31 +279,6 @@ Status GrpcServer::Init(
   return Status::OK();
 }
 
-Status GrpcServer::Init(
-    ServiceInitFunction service_func,
-    const RendezvousMgrCreationFunction& rendezvous_mgr_func,
-    const CollectiveMgrCreationFunction& collective_mgr_func,
-    const WorkerCreationFunction& worker_func) {
-  return Init(std::move(service_func), rendezvous_mgr_func, collective_mgr_func,
-              worker_func, CreateNoOpStatsPublisher);
-}
-
-Status GrpcServer::Init(
-    ServiceInitFunction service_func,
-    const RendezvousMgrCreationFunction& rendezvous_mgr_func,
-    const CollectiveMgrCreationFunction& collective_mgr_func) {
-  return Init(std::move(service_func), rendezvous_mgr_func, collective_mgr_func,
-              nullptr);
-}
-
-Status GrpcServer::Init(
-    ServiceInitFunction service_func,
-    const RendezvousMgrCreationFunction& rendezvous_mgr_func) {
-  return Init(std::move(service_func), rendezvous_mgr_func, nullptr, nullptr);
-}
-
-Status GrpcServer::Init() { return Init(nullptr, nullptr, nullptr, nullptr); }
-
 Status GrpcServer::ParseChannelSpec(const WorkerCacheFactoryOptions& options,
                                     GrpcChannelSpec* channel_spec) {
   for (const auto& job : options.cluster_def->job()) {
@@ -454,7 +429,9 @@ Status GrpcServer::Create(const ServerDef& server_def, Env* env,
   std::unique_ptr<GrpcServer> ret(
       new GrpcServer(server_def, env == nullptr ? Env::Default() : env));
   ServiceInitFunction service_func = nullptr;
-  Status s = ret->Init(service_func, NewRpcRendezvousMgr, nullptr);
+  GrpcServerOptions options;
+  options.rendezvous_mgr_func = NewRpcRendezvousMgr;
+  Status s = ret->Init(options);
   if (!s.ok()) {
     LOG(ERROR) << s;
     return s;
@@ -468,8 +445,9 @@ Status GrpcServer::Create(const ServerDef& server_def, Env* env,
                           std::unique_ptr<GrpcServer>* out_server) {
   std::unique_ptr<GrpcServer> ret(
       new GrpcServer(server_def, env == nullptr ? Env::Default() : env));
-  ServiceInitFunction service_func = nullptr;
-  Status s = ret->Init(service_func, NewRpcRendezvousMgr, nullptr);
+  GrpcServerOptions options;
+  options.rendezvous_mgr_func = NewRpcRendezvousMgr;
+  Status s = ret->Init(options);
   if (!s.ok()) {
     LOG(ERROR) << s;
     return s;
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
index c1395abddebd1af780ade4884b3f5af239c5fb0e..f66d7eb82e8d9bcd43868a5b65c08248f7d860da 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
@@ -16,6 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERVER_LIB_H_
 #define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERVER_LIB_H_
 
+// GrpcServer manages the lifecycle of an Eager, Worker and Master service.
+
 #include <memory>
 
 #include "grpcpp/grpcpp.h"
@@ -26,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/master_env.h"
 #include "tensorflow/core/distributed_runtime/rpc/async_service_interface.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_channel.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h"
 #include "tensorflow/core/distributed_runtime/server_lib.h"
 #include "tensorflow/core/distributed_runtime/session_mgr.h"
 #include "tensorflow/core/distributed_runtime/worker_env.h"
@@ -57,12 +60,21 @@ typedef std::function<std::unique_ptr<GrpcWorker>(WorkerEnv*,
                                                   const ConfigProto& config)>
     WorkerCreationFunction;
 
+struct GrpcServerOptions {
+  ServiceInitFunction service_func = nullptr;
+  RendezvousMgrCreationFunction rendezvous_mgr_func = nullptr;
+  CollectiveMgrCreationFunction collective_mgr_func = nullptr;
+  WorkerCreationFunction worker_func = nullptr;
+  StatsPublisherFactory stats_factory = CreateNoOpStatsPublisher;
+  GrpcWorkerServiceOptions worker_service_options;
+};
+
 class GrpcServer : public ServerInterface {
  protected:
   GrpcServer(const ServerDef& server_def, Env* env);
   // Allow children classes to override this and provide custom args to the
   // server before it is constructed. Default behavior is to do nothing.
-  virtual void MaybeMutateBuilder(::grpc::ServerBuilder* builder) {}
+  virtual void MaybeMutateBuilder(::grpc::ServerBuilder* builder);
 
  public:
   static Status Create(const ServerDef& server_def, Env* env,
@@ -86,25 +98,7 @@ class GrpcServer : public ServerInterface {
   std::shared_ptr<GrpcChannelCache> channel_cache() { return channel_cache_; }
 
  protected:
-  Status Init(ServiceInitFunction service_func,
-              const RendezvousMgrCreationFunction& rendezvous_mgr_func,
-              const CollectiveMgrCreationFunction& collective_mgr_func,
-              const WorkerCreationFunction& worker_func,
-              const StatsPublisherFactory& stats_factory);
-
-  Status Init(ServiceInitFunction service_func,
-              const RendezvousMgrCreationFunction& rendezvous_mgr_func,
-              const CollectiveMgrCreationFunction& collective_mgr_func,
-              const WorkerCreationFunction& worker_func);
-
-  Status Init(ServiceInitFunction service_func,
-              const RendezvousMgrCreationFunction& rendezvous_mgr_func,
-              const CollectiveMgrCreationFunction& collective_mgr_func);
-
-  Status Init(ServiceInitFunction service_func,
-              const RendezvousMgrCreationFunction& rendezvous_mgr_func);
-
-  Status Init();
+  Status Init(const GrpcServerOptions& opts = GrpcServerOptions());
 
   // A subclass can override this method to support secure credentials.
   virtual std::shared_ptr<::grpc::ServerCredentials> GetServerCredentials(
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_session.cc b/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
index 32063fecbbef4347bcdbfbdfda32f008015b5975..c14bfd2155fb4b2276642e220176a3658448f350 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/call_options.h"
 #include "tensorflow/core/distributed_runtime/local_master.h"
 #include "tensorflow/core/distributed_runtime/master_interface.h"
+#include "tensorflow/core/distributed_runtime/request_id.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_channel.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_remote_master.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
@@ -312,6 +313,7 @@ Status GrpcSession::PRunSetup(const std::vector<string>& input_names,
   for (const string& target : target_nodes) {
     req.add_target(target);
   }
+  req.set_request_id(GetUniqueRequestId());
   call_options.SetTimeout(options_.config.operation_timeout_in_ms());
   TF_RETURN_IF_ERROR(master_->PartialRunSetup(&call_options, &req, &resp));
   *handle = resp.partial_run_handle();
@@ -408,6 +410,7 @@ Status GrpcSession::MakeCallable(const CallableOptions& callable_options,
   MakeCallableRequest req;
   TF_RETURN_IF_ERROR(Handle(req.mutable_session_handle()));
   *req.mutable_options() = callable_options;
+  req.set_request_id(GetUniqueRequestId());
   MakeCallableResponse resp;
   CallOptions call_options;
   call_options.SetTimeout(options_.config.operation_timeout_in_ms());
@@ -423,6 +426,7 @@ Status GrpcSession::RunCallable(CallableHandle handle,
   RunCallableRequest req;
   TF_RETURN_IF_ERROR(Handle(req.mutable_session_handle()));
   req.set_handle(handle);
+  req.set_request_id(GetUniqueRequestId());
   for (const Tensor& feed : feed_tensors) {
     feed.AsProtoTensorContent(req.mutable_feed()->Add());
   }
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_state.h b/tensorflow/core/distributed_runtime/rpc/grpc_state.h
index b67f3c4563107882a556e83c07ee20ca69b3f3b4..0ca64dc159b6680342a9937480a1d67135ad6197 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_state.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_state.h
@@ -26,11 +26,15 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/distributed_runtime/tensor_coding.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/notification.h"
 
 namespace tensorflow {
 
 // Object allocated per active RPC.
+// Manage the state of a single asynchronous RPC request.  If `max_retries`
+// is greater than 0, the request will be retried for any transient failures
+// as long as the overall deadline has not elapsed.
 template <class Response>
 class RPCState : public GrpcClientCQTag {
  public:
@@ -38,34 +42,55 @@ class RPCState : public GrpcClientCQTag {
   RPCState(::grpc::GenericStub* stub, ::grpc::CompletionQueue* cq,
            const ::grpc::string& method, const protobuf::Message& request,
            Response* response, StatusCallback done, CallOptions* call_opts,
-           thread::ThreadPool* threadpool)
+           thread::ThreadPool* threadpool, int32 max_retries = 0)
       : RPCState(stub, cq, method, request, response, std::move(done),
                  call_opts, threadpool, /*fail_fast=*/false,
-                 /*timeout_in_ms=*/0) {}
+                 /*timeout_in_ms=*/0, max_retries) {}
 
   template <typename Request>
   RPCState(::grpc::GenericStub* stub, ::grpc::CompletionQueue* cq,
            const ::grpc::string& method, const Request& request,
            Response* response, StatusCallback done, CallOptions* call_opts,
-           thread::ThreadPool* threadpool, bool fail_fast, int64 timeout_in_ms)
-      : call_opts_(call_opts), threadpool_(threadpool), done_(std::move(done)) {
-    context_.set_fail_fast(fail_fast);
-    if (timeout_in_ms > 0) {
-      context_.set_deadline(gpr_time_from_millis(timeout_in_ms, GPR_TIMESPAN));
-    }
-
-    if (call_opts) {
-      call_opts->SetCancelCallback([this]() { context_.TryCancel(); });
-    }
-
+           thread::ThreadPool* threadpool, bool fail_fast, int64 timeout_in_ms,
+           int32 max_retries)
+      : call_opts_(call_opts),
+        threadpool_(threadpool),
+        done_(std::move(done)),
+        cq_(cq),
+        stub_(stub),
+        method_(method),
+        max_retries_(max_retries),
+        timeout_in_ms_(timeout_in_ms),
+        fail_fast_(fail_fast) {
     response_ = response;
     ::grpc::Status s = GrpcMaybeUnparseProto(request, &request_buf_);
     if (!s.ok()) {
       LOG(ERROR) << "GrpcMaybeUnparseProto returned with non-ok status: "
                  << s.error_message();
+      // Skip retry logic if we fail to parse our request.
+      done_(FromGrpcStatus(s));
+      delete this;
+      return;
+    }
+    StartCall();
+  }
+
+  void StartCall() {
+    context_.reset(new ::grpc::ClientContext());
+    context_->set_fail_fast(fail_fast_);
+
+    if (timeout_in_ms_ > 0) {
+      context_->set_deadline(
+          gpr_time_from_millis(timeout_in_ms_, GPR_TIMESPAN));
     }
-    call_ =
-        std::move(stub->PrepareUnaryCall(&context_, method, request_buf_, cq));
+    if (call_opts_) {
+      call_opts_->SetCancelCallback([this]() { context_->TryCancel(); });
+    }
+
+    VLOG(2) << "Starting call: " << method_;
+
+    call_ = std::move(
+        stub_->PrepareUnaryCall(context_.get(), method_, request_buf_, cq_));
     call_->StartCall();
     call_->Finish(&response_buf_, &status_, this);
   }
@@ -88,10 +113,26 @@ class RPCState : public GrpcClientCQTag {
         threadpool_->Schedule([this]() { ParseAndCallDone(); });
       } else {
         ParseAndCallDone();
-        return;
       }
+      return;
+    }
+
+    VLOG(1) << method_ << " returned with non-ok status: " << s
+            << " Retries: " << num_retries_ << " Max: " << max_retries_ << "\n"
+            << context_->debug_error_string();
+    // Retry if we have any attempts left
+    if (++num_retries_ <= max_retries_ &&
+        (errors::IsUnavailable(s) || errors::IsUnknown(s))) {
+      response_buf_.Clear();
+      VLOG(1) << "Retrying call for " << method_ << "Retry: " << num_retries_
+              << " of " << max_retries_;
+      StartCall();
     } else {
-      VLOG(2) << "Call returned with non-ok status: " << s;
+      // Attach additional GRPC error information if any to the final status
+      s = Status(s.code(),
+                 strings::StrCat(s.error_message(),
+                                 "\nAdditional GRPC error information:\n",
+                                 context_->debug_error_string()));
       done_(s);
       delete this;
     }
@@ -108,7 +149,7 @@ class RPCState : public GrpcClientCQTag {
 
  private:
   CallOptions* call_opts_;
-  ::grpc::ClientContext context_;
+  std::unique_ptr<::grpc::ClientContext> context_;
   thread::ThreadPool* threadpool_;
   std::unique_ptr<::grpc::GenericClientAsyncResponseReader> call_;
   Response* response_;
@@ -116,6 +157,15 @@ class RPCState : public GrpcClientCQTag {
   ::grpc::ByteBuffer response_buf_;
   ::grpc::Status status_;
   StatusCallback done_;
+  int64 timeout_in_ms_;
+
+  size_t num_retries_ = 0;
+  size_t max_retries_;
+
+  ::grpc::CompletionQueue* cq_;
+  ::grpc::GenericStub* stub_;
+  ::grpc::string method_;
+  bool fail_fast_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_util.cc b/tensorflow/core/distributed_runtime/rpc/grpc_util.cc
index e211c33732b26777697f11178909edaf6c9b65ed..471e2c16b348e12eca094247c729008a936174f7 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_util.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_util.cc
@@ -15,9 +15,61 @@ limitations under the License.
 
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/distributed_runtime/tensor_coding.h"
+#include "tensorflow/core/lib/random/random.h"
 
 namespace tensorflow {
 
+namespace {
+
+double GenerateUniformRandomNumber() {
+  return random::New64() * (1.0 / std::numeric_limits<uint64>::max());
+}
+
+double GenerateUniformRandomNumberBetween(double a, double b) {
+  if (a == b) return a;
+  DCHECK_LT(a, b);
+  return a + GenerateUniformRandomNumber() * (b - a);
+}
+
+}  // namespace
+
+int64 ComputeBackoffMicroseconds(int current_retry_attempt, int64 min_delay,
+                                 int64 max_delay) {
+  DCHECK_GE(current_retry_attempt, 0);
+
+  // This function with the constants below is calculating:
+  //
+  // (0.4 * min_delay) + (random[0.6,1.0] * min_delay * 1.3^retries)
+  //
+  // Note that there is an extra truncation that occurs and is documented in
+  // comments below.
+  constexpr double kBackoffBase = 1.3;
+  constexpr double kBackoffRandMult = 0.4;
+
+  // This first term does not vary with current_retry_attempt or a random
+  // number. It exists to ensure the final term is >= min_delay
+  const double first_term = kBackoffRandMult * min_delay;
+
+  // This is calculating min_delay * 1.3^retries
+  double uncapped_second_term = min_delay;
+  while (current_retry_attempt > 0 &&
+         uncapped_second_term < max_delay - first_term) {
+    current_retry_attempt--;
+    uncapped_second_term *= kBackoffBase;
+  }
+  // Note that first_term + uncapped_second_term can exceed max_delay here
+  // because of the final multiply by kBackoffBase.  We fix that problem with
+  // the min() below.
+  double second_term = std::min(uncapped_second_term, max_delay - first_term);
+
+  // This supplies the random jitter to ensure that retried don't cause a
+  // thundering herd problem.
+  second_term *=
+      GenerateUniformRandomNumberBetween(1.0 - kBackoffRandMult, 1.0);
+
+  return std::max(static_cast<int64>(first_term + second_term), min_delay);
+}
+
 ::grpc::Status GrpcMaybeUnparseProto(const protobuf::Message& src,
                                      grpc::ByteBuffer* dst) {
   bool own_buffer;
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_util.h b/tensorflow/core/distributed_runtime/rpc/grpc_util.h
index 45259aa2ece9698d7ffb5a850b716de442f7497f..976f3e6452a7673455d8c2d0946257ee54d762fe 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_util.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_util.h
@@ -29,6 +29,15 @@ limitations under the License.
 
 namespace tensorflow {
 
+// Given the total number of RPC retries attempted, return a randomized
+// amount of time to delay before retrying the request.
+//
+// The average computed backoff increases with the number of RPCs attempted.
+// See implementation for details on the calculations.
+int64 ComputeBackoffMicroseconds(int current_retry_attempt,
+                                 int64 min_delay = 1000,
+                                 int64 max_delay = 10000000);
+
 // Thin wrapper around ::grpc::ProtoBufferReader to give TensorResponse an
 // efficient byte reader from which to decode a RecvTensorResponse.
 class GrpcByteSource : public TensorResponse::Source {
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
index de80992095d13fa38172b3a30c5fdd6c177994e1..904862100e460d811dc03648ff2b8aa4f26f672c 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
@@ -16,10 +16,14 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h"
 
 #include <deque>
+#include <memory>
+#include <unordered_map>
+#include <vector>
 
 #include "grpcpp/alarm.h"
 #include "grpcpp/server_builder.h"
 
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/core/common_runtime/buf_rendezvous.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
@@ -31,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h"
 #include "tensorflow/core/distributed_runtime/rpc/async_service_interface.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_call.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_response_cache.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h"
@@ -41,7 +46,12 @@ limitations under the License.
 #include "tensorflow/core/framework/collective.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/tracing.h"
 #include "tensorflow/core/protobuf/transport_options.pb.h"
 #include "tensorflow/core/protobuf/worker.pb.h"
@@ -50,37 +60,6 @@ namespace tensorflow {
 
 namespace {
 
-class GrpcWorkerService : public AsyncServiceInterface {
-  // TODO(ncteisen): consider adding a config var or flag for this
-  static constexpr const size_t kGrpcWorkerServiceThreadCount = 8;
-
- public:
-  GrpcWorkerService(GrpcWorker* worker, ::grpc::ServerBuilder* builder)
-      : is_shutdown_(false) {
-    builder->RegisterService(&worker_service_);
-    for (int i = 0; i < kGrpcWorkerServiceThreadCount; i++) {
-      threads_.emplace_back(
-          new GrpcWorkerServiceThread(worker, builder, &worker_service_));
-    }
-  }
-
-  void Shutdown() override {
-    bool did_shutdown = false;
-    {
-      mutex_lock l(service_shutdown_mu_);
-      if (!is_shutdown_) {
-        LOG(INFO) << "Shutting down GrpcWorkerService.";
-        is_shutdown_ = true;
-        did_shutdown = true;
-      }
-    }
-    if (did_shutdown) {
-      for (auto& worker_thread : threads_) {
-        worker_thread->Shutdown();
-      }
-    }
-  }
-
 // This macro creates a new request for the given RPC method name
 // (e.g., `ENQUEUE_REQUEST(GetStatus, false);`), and enqueues it on
 // `this->cq_`.
@@ -105,311 +84,344 @@ class GrpcWorkerService : public AsyncServiceInterface {
     }                                                                        \
   } while (0)
 
-  // This method blocks forever handling requests from the completion queue.
-  void HandleRPCsLoop() override {
-    for (auto& worker_thread : threads_) {
-      worker_thread->Start();
-    }
-    for (auto& worker_thread : threads_) {
-      worker_thread->Join();
-    }
+#define SETUP_FOR_REQUEST(method, default_depth, supports_cancel)              \
+  for (int i = 0;                                                              \
+       i < gtl::FindWithDefault(queue_depth_,                                  \
+                                static_cast<int>(GrpcWorkerMethod::k##method), \
+                                default_depth);                                \
+       ++i) {                                                                  \
+    ENQUEUE_REQUEST(method, supports_cancel);                                  \
   }
 
- private:
-  // Thread wrapping class that drives work over a single gRPC
-  // CompletionQueue.
-  class GrpcWorkerServiceThread {
-   public:
-    explicit GrpcWorkerServiceThread(
-        GrpcWorker* worker, ::grpc::ServerBuilder* builder,
-        grpc::WorkerService::AsyncService* worker_service)
-        : worker_(worker),
-          worker_service_(worker_service),
-          is_shutdown_(false) {
-      cq_ = builder->AddCompletionQueue();
-    }
-
-    void Start() {
-      thread_.reset(worker_->env()->env->StartThread(
-          ThreadOptions(), "grpc_worker_service",
-          [this]() { HandleRPCsLoop(); }));
-    }
-
-    void Join() { thread_.reset(); }  // Blocks until thread exits
-
-    void Shutdown() {
-      {
-        mutex_lock lock(shutdown_mu_);
-        is_shutdown_ = true;
-      }
-      cq_->Shutdown();
-    }
-
-   private:
-    void HandleRPCsLoop() {
-      // TODO(ncteisen): This may require performance engineering. We can
-      // change the number of threads, the number of handlers per thread,
-      // or even decide to specialize certain threads to certain methods.
-      ENQUEUE_REQUEST(GetStatus, false);
-      ENQUEUE_REQUEST(CreateWorkerSession, false);
-      ENQUEUE_REQUEST(DeleteWorkerSession, false);
-      ENQUEUE_REQUEST(CleanupAll, false);
-      ENQUEUE_REQUEST(RegisterGraph, false);
-      ENQUEUE_REQUEST(DeregisterGraph, false);
-
-      // TODO(ncteisen): Determine a better policy for enqueuing the
-      // appropriate number of each request type.
-      for (int i = 0; i < 1000; ++i) {
-        EnqueueRecvTensorRequestRaw();
-      }
-      for (int i = 0; i < 500; ++i) {
-        ENQUEUE_REQUEST(RecvBuf, true);
-      }
-      for (int i = 0; i < 100; ++i) {
-        ENQUEUE_REQUEST(RunGraph, true);
-      }
-      for (int i = 0; i < 100; ++i) {
-        ENQUEUE_REQUEST(CleanupGraph, false);
-      }
-
-      ENQUEUE_REQUEST(Logging, false);
-      ENQUEUE_REQUEST(Tracing, false);
+// GrpcWorkerService spawns one or more GrpcWorkerServiceThreads to service
+// requests.  Each thread operates on an independent completion queue.
+class GrpcWorkerServiceThread {
+ public:
+  explicit GrpcWorkerServiceThread(
+      GrpcWorker* worker, ::grpc::ServerBuilder* builder,
+      std::unordered_map<int, int> queue_depth, GrpcResponseCache* cache,
+      grpc::WorkerService::AsyncService* worker_service)
+      : worker_(worker),
+        queue_depth_(queue_depth),
+        cache_(cache),
+        worker_service_(worker_service),
+        is_shutdown_(false) {
+    cq_ = builder->AddCompletionQueue();
+  }
 
-      for (int i = 0; i < 10; ++i) {
-        ENQUEUE_REQUEST(CompleteGroup, true);
-        ENQUEUE_REQUEST(CompleteInstance, true);
-        ENQUEUE_REQUEST(GetStepSequence, true);
-      }
+  void Start() {
+    thread_.reset(
+        worker_->env()->env->StartThread(ThreadOptions(), "grpc_worker_service",
+                                         [this]() { HandleRPCsLoop(); }));
+  }
 
-      void* tag;
-      bool ok;
+  void Join() { thread_.reset(); }  // Blocks until thread exits
 
-      while (cq_->Next(&tag, &ok)) {
-        UntypedCall<GrpcWorkerServiceThread>::Tag* callback_tag =
-            static_cast<UntypedCall<GrpcWorkerServiceThread>::Tag*>(tag);
-        CHECK(callback_tag);
-        callback_tag->OnCompleted(this, ok);
-      }
+  void Shutdown() {
+    {
+      mutex_lock lock(shutdown_mu_);
+      is_shutdown_ = true;
     }
+    cq_->Shutdown();
+  }
 
-   private:
-    void Schedule(std::function<void()> f) {
-      worker_->env()->compute_pool->Schedule(std::move(f));
+ private:
+  // Add one or more completion queue entries for each worker method, then
+  // begin servicing requests from the completion queue.
+  void HandleRPCsLoop() {
+    // TODO(ncteisen): This may require performance engineering. We can
+    // change the number of threads, the number of handlers per thread,
+    // or even decide to specialize certain threads to certain methods.
+    SETUP_FOR_REQUEST(GetStatus, 1, false);
+    SETUP_FOR_REQUEST(CreateWorkerSession, 1, false);
+    SETUP_FOR_REQUEST(DeleteWorkerSession, 1, false);
+    SETUP_FOR_REQUEST(CleanupAll, 1, false);
+    SETUP_FOR_REQUEST(RegisterGraph, 1, false);
+    SETUP_FOR_REQUEST(DeregisterGraph, 1, false);
+    SETUP_FOR_REQUEST(Logging, 1, false);
+    SETUP_FOR_REQUEST(Tracing, 1, false);
+    SETUP_FOR_REQUEST(CompleteGroup, 10, true);
+    SETUP_FOR_REQUEST(CompleteInstance, 10, true);
+    SETUP_FOR_REQUEST(GetStepSequence, 10, true);
+    SETUP_FOR_REQUEST(RecvBuf, 500, true);
+    SETUP_FOR_REQUEST(RunGraph, 100, true);
+    SETUP_FOR_REQUEST(CleanupGraph, 100, false);
+
+    // TODO(ncteisen): Determine a better policy for enqueuing the
+    // appropriate number of each request type.
+    for (int i = 0;
+         i < gtl::FindWithDefault(
+                 queue_depth_, static_cast<int>(GrpcWorkerMethod::kRecvTensor),
+                 1000);
+         ++i) {
+      EnqueueRecvTensorRequestRaw();
     }
 
-    // The following section contains one request handler method per
-    // RPC. The `FooHandler` method is called (indirectly) by
-    // `HandleRPCsLoop()` when the next Foo RPC is received. Each
-    // `FooHandler` call schedules a closure on `worker_->env()->compute_pool`,
-    // and is responsible for requesting the next Foo call by calling
-    // `ENQUEUE_REQUEST(Foo)`.
-
-    template <class RequestMessage, class ResponseMessage>
-    using WorkerCall =
-        Call<GrpcWorkerServiceThread, grpc::WorkerService::AsyncService,
-             RequestMessage, ResponseMessage>;
-
-    void GetStatusHandler(
-        WorkerCall<GetStatusRequest, GetStatusResponse>* call) {
-      Schedule([this, call]() {
-        Status s = worker_->GetStatus(&call->request, &call->response);
-        call->SendResponse(ToGrpcStatus(s));
-      });
-      ENQUEUE_REQUEST(GetStatus, false);
-    }
+    void* tag;
+    bool ok;
 
-    void CreateWorkerSessionHandler(
-        WorkerCall<CreateWorkerSessionRequest, CreateWorkerSessionResponse>*
-            call) {
-      Schedule([this, call]() {
-        Status s =
-            worker_->CreateWorkerSession(&call->request, &call->response);
-        call->SendResponse(ToGrpcStatus(s));
-      });
-      ENQUEUE_REQUEST(CreateWorkerSession, false);
+    while (cq_->Next(&tag, &ok)) {
+      UntypedCall<GrpcWorkerServiceThread>::Tag* callback_tag =
+          static_cast<UntypedCall<GrpcWorkerServiceThread>::Tag*>(tag);
+      CHECK(callback_tag);
+      callback_tag->OnCompleted(this, ok);
     }
+  }
 
-    void DeleteWorkerSessionHandler(
-        WorkerCall<DeleteWorkerSessionRequest, DeleteWorkerSessionResponse>*
-            call) {
-      Schedule([this, call]() {
-        Status s =
-            worker_->DeleteWorkerSession(&call->request, &call->response);
-        call->SendResponse(ToGrpcStatus(s));
-      });
-      ENQUEUE_REQUEST(DeleteWorkerSession, false);
-    }
+ private:
+  void Schedule(std::function<void()> f) {
+    worker_->env()->compute_pool->Schedule(std::move(f));
+  }
 
-    void CleanupAllHandler(
-        WorkerCall<CleanupAllRequest, CleanupAllResponse>* call) {
-      Schedule([this, call]() {
-        Status s = worker_->CleanupAll(&call->request, &call->response);
-        call->SendResponse(ToGrpcStatus(s));
-      });
-      ENQUEUE_REQUEST(CleanupAll, false);
-    }
+  // The following section contains one request handler method per
+  // RPC. The `FooHandler` method is called (indirectly) by
+  // `HandleRPCsLoop()` when the next Foo RPC is received. Each
+  // `FooHandler` call schedules a closure on `worker_->env()->compute_pool`,
+  // and is responsible for requesting the next Foo call by calling
+  // `ENQUEUE_REQUEST(Foo)`.
+  template <class RequestMessage, class ResponseMessage>
+  using WorkerCall =
+      Call<GrpcWorkerServiceThread, grpc::WorkerService::AsyncService,
+           RequestMessage, ResponseMessage>;
+
+  // Handle all non-cancellable simple methods with a standard wrapper.
+#define HANDLE_CALL(method)                                                   \
+  void method##Handler(WorkerCall<method##Request, method##Response>* call) { \
+    Schedule([this, call]() {                                                 \
+      Status s = worker_->method(&call->request, &call->response);            \
+      if (!s.ok()) {                                                          \
+        VLOG(1) << "Bad response from " << #method << ": " << s;              \
+      }                                                                       \
+      call->SendResponse(ToGrpcStatus(s));                                    \
+    });                                                                       \
+    ENQUEUE_REQUEST(method, false);                                           \
+  }
 
-    void RegisterGraphHandler(
-        WorkerCall<RegisterGraphRequest, RegisterGraphResponse>* call) {
-      Schedule([this, call]() {
-        Status s = worker_->RegisterGraph(&call->request, &call->response);
-        call->SendResponse(ToGrpcStatus(s));
-      });
-      ENQUEUE_REQUEST(RegisterGraph, false);
-    }
+  HANDLE_CALL(GetStatus);
+  HANDLE_CALL(CreateWorkerSession);
+  HANDLE_CALL(DeleteWorkerSession);
+  HANDLE_CALL(CleanupAll);
+  HANDLE_CALL(RegisterGraph);
+  HANDLE_CALL(DeregisterGraph);
+  HANDLE_CALL(CleanupGraph);
+  HANDLE_CALL(Logging);
+  HANDLE_CALL(Tracing);
+
+#undef HANDLE_CALL
+
+  void GetStepSequenceHandler(
+      WorkerCall<GetStepSequenceRequest, GetStepSequenceResponse>* call) {
+    Schedule([this, call]() {
+      worker_->GetStepSequenceAsync(
+          &call->request, &call->response, [call](const Status& s) {
+            VLOG(1) << "Bad response from GetStepSequence:" << s;
+            call->SendResponse(ToGrpcStatus(s));
+          });
+    });
+    ENQUEUE_REQUEST(GetStepSequence, true);
+  }
 
-    void DeregisterGraphHandler(
-        WorkerCall<DeregisterGraphRequest, DeregisterGraphResponse>* call) {
-      Schedule([this, call]() {
-        Status s = worker_->DeregisterGraph(&call->request, &call->response);
+  void RunGraphHandler(WorkerCall<RunGraphRequest, RunGraphResponse>* call) {
+    Schedule([this, call]() {
+      CallOptions* call_opts = new CallOptions;
+      ProtoRunGraphRequest* wrapped_request =
+          new ProtoRunGraphRequest(&call->request);
+      NonOwnedProtoRunGraphResponse* wrapped_response =
+          new NonOwnedProtoRunGraphResponse(&call->response);
+      call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
+      auto done_cb = [call, call_opts, wrapped_request,
+                      wrapped_response](const Status& s) {
+        VLOG(1) << "RunGraph::Done";
+        if (!s.ok()) {
+          VLOG(1) << "Bad response from RunGraph:" << s;
+        }
+        call->ClearCancelCallback();
+        delete call_opts;
+        delete wrapped_request;
+        delete wrapped_response;
         call->SendResponse(ToGrpcStatus(s));
-      });
-      ENQUEUE_REQUEST(DeregisterGraph, false);
-    }
+      };
 
-    void RunGraphHandler(WorkerCall<RunGraphRequest, RunGraphResponse>* call) {
-      Schedule([this, call]() {
-        CallOptions* call_opts = new CallOptions;
-        ProtoRunGraphRequest* wrapped_request =
-            new ProtoRunGraphRequest(&call->request);
-        NonOwnedProtoRunGraphResponse* wrapped_response =
-            new NonOwnedProtoRunGraphResponse(&call->response);
-        call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
+      auto compute_fn = [this, call_opts, wrapped_request,
+                         wrapped_response](StatusCallback done) {
         worker_->RunGraphAsync(call_opts, wrapped_request, wrapped_response,
-                               [call, call_opts, wrapped_request,
-                                wrapped_response](const Status& s) {
-                                 call->ClearCancelCallback();
-                                 delete call_opts;
-                                 delete wrapped_request;
-                                 delete wrapped_response;
-                                 call->SendResponse(ToGrpcStatus(s));
-                               });
-      });
-      ENQUEUE_REQUEST(RunGraph, true);
-    }
+                               done);
+      };
+
+      if (cache_) {
+        string request_key = call->request.ShortDebugString();
+        cache_->LookupOrCompute(request_key, RPCResponse(&call->response),
+                                compute_fn, done_cb);
+      } else {
+        compute_fn(done_cb);
+      }
+    });
+    ENQUEUE_REQUEST(RunGraph, true);
+  }
 
-    void RecvTensorHandlerRaw(
-        WorkerCall<RecvTensorRequest, ::grpc::ByteBuffer>* call) {
-      Schedule([this, call]() {
-        CallOptions* call_opts = new CallOptions;
-        call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
+  void RecvTensorHandlerRaw(
+      WorkerCall<RecvTensorRequest, ::grpc::ByteBuffer>* call) {
+    Schedule([this, call]() {
+      CallOptions* call_opts = new CallOptions;
+      call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
+
+      auto done_cb = [call, call_opts](const Status& s) {
+        call->ClearCancelCallback();
+        delete call_opts;
+        if (!s.ok()) {
+          VLOG(1) << "Bad response from RecvTensor:" << s;
+        }
+        call->SendResponse(ToGrpcStatus(s));
+      };
+
+      auto compute_fn = [this, &call_opts, &call](StatusCallback done) {
         worker_->GrpcRecvTensorAsync(call_opts, &call->request, &call->response,
-                                     [call, call_opts](const Status& s) {
-                                       call->ClearCancelCallback();
-                                       delete call_opts;
-                                       call->SendResponse(ToGrpcStatus(s));
-                                     });
-      });
-      EnqueueRecvTensorRequestRaw();
-    }
+                                     done);
+      };
+
+      if (cache_) {
+        string request_key = call->request.ShortDebugString();
+        cache_->LookupOrCompute(request_key, RPCResponse(&call->response),
+                                compute_fn, done_cb);
+      } else {
+        compute_fn(done_cb);
+      }
+    });
+    EnqueueRecvTensorRequestRaw();
+  }
 
-    void CleanupGraphHandler(
-        WorkerCall<CleanupGraphRequest, CleanupGraphResponse>* call) {
-      Schedule([this, call]() {
-        Status s = worker_->CleanupGraph(&call->request, &call->response);
-        call->SendResponse(ToGrpcStatus(s));
-      });
-      ENQUEUE_REQUEST(CleanupGraph, false);
-    }
+  void RecvBufHandler(WorkerCall<RecvBufRequest, RecvBufResponse>* call) {
+    Schedule([this, call]() {
+      CallOptions* call_opts = new CallOptions;
+      call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
+      worker_->RecvBufAsync(call_opts, &call->request, &call->response,
+                            [call, call_opts](const Status& s) {
+                              call->ClearCancelCallback();
+                              delete call_opts;
+                              if (!s.ok()) {
+                                VLOG(1) << "Bad response from RecvBuf:" << s;
+                              }
+                              call->SendResponse(ToGrpcStatus(s));
+                            });
+    });
+    ENQUEUE_REQUEST(RecvBuf, true);
+  }
 
-    void LoggingHandler(WorkerCall<LoggingRequest, LoggingResponse>* call) {
-      Schedule([this, call]() {
-        Status s = worker_->Logging(&call->request, &call->response);
-        call->SendResponse(ToGrpcStatus(s));
-      });
-      ENQUEUE_REQUEST(Logging, false);
-    }
+  void CompleteGroupHandler(
+      WorkerCall<CompleteGroupRequest, CompleteGroupResponse>* call) {
+    Schedule([this, call]() {
+      CallOptions* call_opts = new CallOptions;
+      call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
+      worker_->CompleteGroupAsync(
+          call_opts, &call->request, &call->response,
+          [call, call_opts](const Status& s) {
+            call->ClearCancelCallback();
+            delete call_opts;
+            if (!s.ok()) {
+              VLOG(1) << "Bad response from CompleteGroup:" << s;
+            }
+            call->SendResponse(ToGrpcStatus(s));
+          });
+    });
+    ENQUEUE_REQUEST(CompleteGroup, true);
+  }
 
-    void TracingHandler(WorkerCall<TracingRequest, TracingResponse>* call) {
-      Schedule([this, call]() {
-        Status s = worker_->Tracing(&call->request, &call->response);
-        call->SendResponse(ToGrpcStatus(s));
-      });
-      ENQUEUE_REQUEST(Tracing, false);
-    }
+  void CompleteInstanceHandler(
+      WorkerCall<CompleteInstanceRequest, CompleteInstanceResponse>* call) {
+    Schedule([this, call]() {
+      CallOptions* call_opts = new CallOptions;
+      call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
+      worker_->CompleteInstanceAsync(
+          call_opts, &call->request, &call->response,
+          [call, call_opts](const Status& s) {
+            call->ClearCancelCallback();
+            delete call_opts;
+            if (!s.ok()) {
+              VLOG(1) << "Bad response from CompleteInstance:" << s;
+            }
+            call->SendResponse(ToGrpcStatus(s));
+          });
+    });
+    ENQUEUE_REQUEST(CompleteInstance, false);
+  }
+#undef ENQUEUE_REQUEST
 
-    void RecvBufHandler(WorkerCall<RecvBufRequest, RecvBufResponse>* call) {
-      Schedule([this, call]() {
-        CallOptions* call_opts = new CallOptions;
-        call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
-        worker_->RecvBufAsync(call_opts, &call->request, &call->response,
-                              [call, call_opts](const Status& s) {
-                                call->ClearCancelCallback();
-                                delete call_opts;
-                                call->SendResponse(ToGrpcStatus(s));
-                              });
-      });
-      ENQUEUE_REQUEST(RecvBuf, true);
+  void EnqueueRecvTensorRequestRaw() {
+    mutex_lock l(shutdown_mu_);
+    if (!is_shutdown_) {
+      Call<GrpcWorkerServiceThread, grpc::WorkerService::AsyncService,
+           RecvTensorRequest, ::grpc::ByteBuffer>::
+          EnqueueRequestForMethod(
+              worker_service_, cq_.get(),
+              static_cast<int>(GrpcWorkerMethod::kRecvTensor),
+              &GrpcWorkerServiceThread::RecvTensorHandlerRaw,
+              true /* supports cancel*/);
     }
+  }
 
-    void CompleteGroupHandler(
-        WorkerCall<CompleteGroupRequest, CompleteGroupResponse>* call) {
-      Schedule([this, call]() {
-        CallOptions* call_opts = new CallOptions;
-        call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
-        worker_->CompleteGroupAsync(call_opts, &call->request, &call->response,
-                                    [call, call_opts](const Status& s) {
-                                      call->ClearCancelCallback();
-                                      delete call_opts;
-                                      call->SendResponse(ToGrpcStatus(s));
-                                    });
-      });
-      ENQUEUE_REQUEST(CompleteGroup, true);
-    }
+  GrpcWorker* const worker_ = nullptr;  // Not owned.
+  std::unique_ptr<::grpc::ServerCompletionQueue> cq_;
+  std::unique_ptr<Thread> thread_;
+  std::unordered_map<int, int> queue_depth_;
+  GrpcResponseCache* cache_;
+  grpc::WorkerService::AsyncService* const worker_service_;
 
-    void CompleteInstanceHandler(
-        WorkerCall<CompleteInstanceRequest, CompleteInstanceResponse>* call) {
-      Schedule([this, call]() {
-        CallOptions* call_opts = new CallOptions;
-        call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
-        worker_->CompleteInstanceAsync(call_opts, &call->request,
-                                       &call->response,
-                                       [call, call_opts](const Status& s) {
-                                         call->ClearCancelCallback();
-                                         delete call_opts;
-                                         call->SendResponse(ToGrpcStatus(s));
-                                       });
-      });
-      ENQUEUE_REQUEST(CompleteInstance, false);
+  mutex shutdown_mu_;
+  bool is_shutdown_ GUARDED_BY(shutdown_mu_);
+  TF_DISALLOW_COPY_AND_ASSIGN(GrpcWorkerServiceThread);
+};
+
+class GrpcWorkerService : public AsyncServiceInterface {
+ public:
+  GrpcWorkerService(GrpcWorker* worker, ::grpc::ServerBuilder* builder,
+                    GrpcWorkerServiceOptions options)
+      : is_shutdown_(false) {
+    builder->RegisterService(&worker_service_);
+    if (options.response_cache_bytes > 0) {
+      cache_.reset(
+          new GrpcResponseCache(options.response_cache_bytes,
+                                options.response_cache_expires_seconds));
     }
 
-    void GetStepSequenceHandler(
-        WorkerCall<GetStepSequenceRequest, GetStepSequenceResponse>* call) {
-      Schedule([this, call]() {
-        worker_->GetStepSequenceAsync(
-            &call->request, &call->response,
-            [call](const Status& s) { call->SendResponse(ToGrpcStatus(s)); });
-      });
-      ENQUEUE_REQUEST(GetStepSequence, true);
+    for (int i = 0; i < options.num_serving_threads; i++) {
+      threads_.emplace_back(
+          new GrpcWorkerServiceThread(worker, builder, options.queue_depth,
+                                      cache_.get(), &worker_service_));
     }
-#undef ENQUEUE_REQUEST
+  }
 
-    void EnqueueRecvTensorRequestRaw() {
-      mutex_lock l(shutdown_mu_);
+  void Shutdown() override {
+    bool did_shutdown = false;
+    {
+      mutex_lock l(service_shutdown_mu_);
       if (!is_shutdown_) {
-        Call<GrpcWorkerServiceThread, grpc::WorkerService::AsyncService,
-             RecvTensorRequest, ::grpc::ByteBuffer>::
-            EnqueueRequestForMethod(
-                worker_service_, cq_.get(),
-                static_cast<int>(GrpcWorkerMethod::kRecvTensor),
-                &GrpcWorkerServiceThread::RecvTensorHandlerRaw,
-                true /* supports cancel*/);
+        LOG(INFO) << "Shutting down GrpcWorkerService.";
+        is_shutdown_ = true;
+        did_shutdown = true;
       }
     }
+    if (did_shutdown) {
+      for (auto& worker_thread : threads_) {
+        worker_thread->Shutdown();
+      }
+    }
+  }
 
-    GrpcWorker* const worker_ = nullptr;  // Not owned.
-    std::unique_ptr<::grpc::ServerCompletionQueue> cq_;
-    std::unique_ptr<Thread> thread_;
-    grpc::WorkerService::AsyncService* const worker_service_;
-
-    mutex shutdown_mu_;
-    bool is_shutdown_ GUARDED_BY(shutdown_mu_);
-    TF_DISALLOW_COPY_AND_ASSIGN(GrpcWorkerServiceThread);
-  };  // GrpcWorkerServiceThread
+  // This method blocks forever handling requests from the completion queue.
+  void HandleRPCsLoop() override {
+    for (auto& worker_thread : threads_) {
+      worker_thread->Start();
+    }
+    for (auto& worker_thread : threads_) {
+      worker_thread->Join();
+    }
+  }
 
+ private:
   grpc::WorkerService::AsyncService worker_service_;
   std::vector<std::unique_ptr<GrpcWorkerServiceThread>> threads_;
 
+  std::unique_ptr<GrpcResponseCache> cache_;
   mutex service_shutdown_mu_;
   bool is_shutdown_ GUARDED_BY(service_shutdown_mu_);
 
@@ -454,11 +466,14 @@ void GrpcWorker::GrpcRecvTensorAsync(CallOptions* opts,
     return;
   }
 
-  // Request the tensor associated with the rendezvous key. Any time
-  // while waiting for the tensor to be produced, up until the start
-  // of execution of the callback lambda body below, an RPC
-  // cancellation should abort the rendezvous.
-  opts->SetCancelCallback([this, step_id]() { AbortStep(step_id); });
+  // Request the tensor associated with the rendezvous key.
+  // Note that we log the cancellation here but do not abort the current step.
+  // gRPC can generate cancellations in response to transient network failures,
+  // and aborting the step eliminates the opportunity for client side retries.
+  // Repeated client failures will eventually cause the step to be aborted by
+  // the client.
+  opts->SetCancelCallback(
+      [step_id]() { LOG(WARNING) << "RecvTensor cancelled for " << step_id; });
   env_->rendezvous_mgr->RecvLocalAsync(
       step_id, parsed,
       [opts, response, done, src_dev, request](
@@ -640,9 +655,10 @@ std::unique_ptr<GrpcWorker> NewGrpcWorker(WorkerEnv* env,
 }
 
 std::unique_ptr<AsyncServiceInterface> NewGrpcWorkerService(
-    GrpcWorker* worker, ::grpc::ServerBuilder* builder) {
+    GrpcWorker* worker, ::grpc::ServerBuilder* builder,
+    GrpcWorkerServiceOptions options) {
   return std::unique_ptr<AsyncServiceInterface>(
-      new GrpcWorkerService(worker, builder));
+      new GrpcWorkerService(worker, builder, options));
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
index 996617d385d1c0e397c30eeceb4f737690fb9490..8f2830c899b9b9854e0b6f02e23651ebd1b06491 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h
@@ -16,7 +16,11 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_SERVICE_H_
 #define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_SERVICE_H_
 
+#include <memory>
+#include <unordered_map>
 #include "tensorflow/core/distributed_runtime/recent_request_ids.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_response_cache.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h"
 #include "tensorflow/core/distributed_runtime/worker.h"
 
 namespace grpc {
@@ -57,9 +61,19 @@ class GrpcWorker : public Worker {
 std::unique_ptr<GrpcWorker> NewGrpcWorker(WorkerEnv* worker_env,
                                           const ConfigProto& config);
 
+struct GrpcWorkerServiceOptions {
+  // Map from GrpcWorkerMethod id to queue depth.  If set this overrides the
+  // default queue depth for a method.
+  std::unordered_map<int, int> queue_depth;
+  int num_serving_threads = 8;
+  int64 response_cache_bytes = 0;
+  int64 response_cache_expires_seconds = 0;
+};
+
 // Returns an implementation of WorkerService rpc service.
 std::unique_ptr<AsyncServiceInterface> NewGrpcWorkerService(
-    GrpcWorker* worker, ::grpc::ServerBuilder* builder);
+    GrpcWorker* worker, ::grpc::ServerBuilder* builder,
+    GrpcWorkerServiceOptions opts = GrpcWorkerServiceOptions());
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
index 7915c3aafd8a97de2830962d2851b247e7d4db4a..d2ae4eeaeec3c50d5101ec46a468d753cb0f3980 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
@@ -88,6 +88,7 @@ enum class GrpcWorkerMethod {
   kCompleteInstance,
   kGetStepSequence,
 };
+
 static const int kGrpcNumWorkerMethods =
     static_cast<int>(GrpcWorkerMethod::kGetStepSequence) + 1;
 
diff --git a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
index 9fb920404f987d6b5b324cce4155da40c7e753b4..ee561e1a8a02a78256b97f5ce015f99ef148a591 100644
--- a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
+++ b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
@@ -71,9 +71,12 @@ class RpcRecvTensorCall : public BaseRecvTensorCall {
     req_.set_request_id(GetUniqueRequestId());
   }
 
-  void Reset(WorkerCacheInterface* wc) {
-    wc->ReleaseWorker(src_worker_, wi_);
-    wi_ = nullptr;
+  void Reset() {
+    // The RpcRemoteRendezvous using this object is responsible for calling
+    // ReleaseWorker() before Reset().
+    DCHECK_EQ(static_cast<WorkerInterface*>(nullptr), wi_)
+        << "Leaking WorkerInterface in RpcRecvTensorCall::Reset().";
+
     alloc_attrs_ = AllocatorAttributes();
     dst_device_ = nullptr;
     // We don't clear opts_ and assume that Init will set up the state for
@@ -89,9 +92,8 @@ class RpcRecvTensorCall : public BaseRecvTensorCall {
 
   ~RpcRecvTensorCall() override {
     // Since only the RpcRecvTensorFreeList will delete an
-    // RpcRecvTensorCall, and it always sets this->wi_ to null when
-    // a call object is released to it, we can assert that this->wi_ is
-    // always null at the point of deletion.
+    // RpcRecvTensorCall, we require that ReleaseWorker() has been called before
+    // the user releases a Call object to the free list.
     CHECK_EQ(static_cast<WorkerInterface*>(nullptr), wi_)
         << "Leaking WorkerInterface in RpcRecvTensorCall destructor.";
   }
@@ -113,6 +115,13 @@ class RpcRecvTensorCall : public BaseRecvTensorCall {
     return status_;
   }
 
+  void ReleaseWorker(WorkerCacheInterface* worker_cache) {
+    DCHECK_NE(static_cast<WorkerInterface*>(nullptr), wi_)
+        << "RpcRecvTensorCall::ReleaseWorker() called twice.";
+    worker_cache->ReleaseWorker(src_worker_, wi_);
+    wi_ = nullptr;
+  }
+
   const Tensor& tensor() const { return resp_.tensor(); }
 
   bool is_dead() const { return resp_.metadata().is_dead(); }
@@ -144,7 +153,7 @@ class RpcRecvTensorCall : public BaseRecvTensorCall {
 
   string src_worker_;
   string src_rel_device_;
-  WorkerInterface* wi_;
+  WorkerInterface* wi_;  // Not owned.
   AllocatorAttributes alloc_attrs_;
   Device* dst_device_;
   CallOptions opts_;
@@ -180,8 +189,8 @@ class RpcRecvTensorFreeList {
     return new RpcRecvTensorCall;
   }
 
-  void Release(RpcRecvTensorCall* obj, WorkerCacheInterface* wc) {
-    obj->Reset(wc);
+  void Release(RpcRecvTensorCall* obj) {
+    obj->Reset();
     {
       mutex_lock l(mu_);
       if (objects_.size() < kMaxObjects) {
@@ -220,6 +229,9 @@ void RpcRemoteRendezvous::RecvFromRemoteAsync(
                          " is invalid remote source device.");
   }
   WorkerSession* sess = session();
+  // The worker will be released in a subsequent call to
+  // `sess->worker_cache->ReleaseWorker()` (if the call has not yet been
+  // initialized) or `call->ReleaseWorker()` (if it has been initialized).
   WorkerInterface* rwi = sess->worker_cache->CreateWorker(call->src_worker_);
   if (s.ok() && rwi == nullptr) {
     s = errors::Internal("No worker known as ", call->src_worker_);
@@ -233,7 +245,7 @@ void RpcRemoteRendezvous::RecvFromRemoteAsync(
     if (rwi != nullptr) {
       sess->worker_cache->ReleaseWorker(call->src_worker_, rwi);
     }
-    get_call_freelist()->Release(call, sess->worker_cache.get());
+    get_call_freelist()->Release(call);
     done(s, Args(), recv_args, Tensor{}, false);
     return;
   }
@@ -246,10 +258,12 @@ void RpcRemoteRendezvous::RecvFromRemoteAsync(
 
   // RendezvousMgr already aborted, shouldn't send RPC call any more
   if (!call->status().ok()) {
+    // NOTE: `*sess` can potentially be deleted before we return from
+    // `call->done()(...)`, so we must release the worker before calling the
+    // callback.
+    call->ReleaseWorker(sess->worker_cache.get());
     call->done()(call->status(), Args(), Args(), Tensor(), false);
-    session()->worker_cache->ReleaseWorker(call->src_worker_, call->wi_);
-    call->wi_ = nullptr;
-    get_call_freelist()->Release(call, session()->worker_cache.get());
+    get_call_freelist()->Release(call);
     return;
   }
 
@@ -261,10 +275,12 @@ void RpcRemoteRendezvous::RecvFromRemoteAsync(
     // If StartAbort was called prior to DeregisterCall, then the
     // current status should be bad.
     Status s = call->status();
+    // NOTE: `*session()` can potentially be deleted before we return from
+    // `call->done()(...)`, so we must release the worker before calling the
+    // callback.
+    call->ReleaseWorker(session()->worker_cache.get());
     call->done()(s, Args(), call->recv_args(), call->tensor(), call->is_dead());
-    session()->worker_cache->ReleaseWorker(call->src_worker_, call->wi_);
-    call->wi_ = nullptr;
-    get_call_freelist()->Release(call, session()->worker_cache.get());
+    get_call_freelist()->Release(call);
     Unref();
   });
 }
diff --git a/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.h b/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.h
index c9581fa00f3e946b212717107809182a6a5d00f2..98eb1467700a5e3259a3635f71c5cebae094751f 100644
--- a/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.h
+++ b/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.h
@@ -56,7 +56,7 @@ class RpcCollectiveExecutorMgr : public CollectiveExecutorMgr {
   void RetireStepId(int64 graph_key, int64 step_id) override;
 
  protected:
-  CollectiveExecutor* Create(int64 step_id) override;
+  virtual CollectiveExecutor* Create(int64 step_id) override;
 
   WorkerCacheInterface* const worker_cache_;  // Not owned.
   const string task_name_;
diff --git a/tensorflow/core/distributed_runtime/tensor_coding.cc b/tensorflow/core/distributed_runtime/tensor_coding.cc
index fe2d1a12934dde814344b70f52fbc972f74347e0..6d20e7cfcada3e3396611143bfcb148ee2a8f0c2 100644
--- a/tensorflow/core/distributed_runtime/tensor_coding.cc
+++ b/tensorflow/core/distributed_runtime/tensor_coding.cc
@@ -68,13 +68,14 @@ Status TensorResponse::InitFrom(RecvTensorResponse* response) {
   return s;
 }
 
-void TensorResponse::InitPartial(const RecvTensorResponse& response) {
+void TensorResponse::InitPartial(const RecvTensorResponse& response,
+                                 const AllocationAttributes& allocation_attr) {
   // Everything except content is present in *response.  Content will
   // arrive later; allocate a Tensor with appropriate storage for that
   // content.
   meta_ = response;
   TensorShape shape(meta_.tensor().tensor_shape());
-  Tensor t(allocator_, meta_.tensor().dtype(), shape);
+  Tensor t(allocator_, meta_.tensor().dtype(), shape, allocation_attr);
   tensor_ = std::move(t);
 }
 
diff --git a/tensorflow/core/distributed_runtime/tensor_coding.h b/tensorflow/core/distributed_runtime/tensor_coding.h
index 4c34297990d399e4e42f5776cd23fb660c9090c5..86d95a30631493c713f24cbc2e04a09da80e00b8 100644
--- a/tensorflow/core/distributed_runtime/tensor_coding.h
+++ b/tensorflow/core/distributed_runtime/tensor_coding.h
@@ -76,7 +76,8 @@ class TensorResponse {
 
   // Initialize tensor metadata from response and allocate
   // uninitialized backing storage for actual contents.
-  void InitPartial(const RecvTensorResponse& response);
+  void InitPartial(const RecvTensorResponse& response,
+                   const AllocationAttributes& allocation_attr);
 
   // Return a reference to the parsed tensor.  The tensor will remain
   // live only until *this is destroyed or modified.
diff --git a/tensorflow/core/distributed_runtime/worker.cc b/tensorflow/core/distributed_runtime/worker.cc
index f42143e5824827e35a97ac25cb80b0e2c82e716e..c6e34c568e73d9dc4ccf007088c79a054f8f1a18 100644
--- a/tensorflow/core/distributed_runtime/worker.cc
+++ b/tensorflow/core/distributed_runtime/worker.cc
@@ -104,7 +104,8 @@ void Worker::AbortStep(int64 step_id) {
     // Delay a bit before aborting the step. This way, the root
     // cause may return first back to the client instead of this
     // cancellation generated abort error.
-    rendez->StartAbort(errors::Aborted("Step ", step_id));
+    rendez->StartAbort(errors::Aborted("Step ", step_id,
+                                       " cancelled.  Cancelling rendezvous."));
     rendez->Unref();
   });
 }
diff --git a/tensorflow/core/framework/allocator.cc b/tensorflow/core/framework/allocator.cc
index 89c49a2ad050bfe067e9557aabd2916fba812fb0..f21f76fec53d2deac4a0f6467c8744e086c637b7 100644
--- a/tensorflow/core/framework/allocator.cc
+++ b/tensorflow/core/framework/allocator.cc
@@ -26,14 +26,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-void AllocatorStats::Clear() {
-  this->num_allocs = 0;
-  this->bytes_in_use = 0;
-  this->max_bytes_in_use = 0;
-  this->max_alloc_size = 0;
-  this->bytes_limit = 0;
-}
-
 string AllocatorStats::DebugString() const {
   return strings::Printf(
       "Limit:        %20lld\n"
@@ -41,8 +33,8 @@ string AllocatorStats::DebugString() const {
       "MaxInUse:     %20lld\n"
       "NumAllocs:    %20lld\n"
       "MaxAllocSize: %20lld\n",
-      this->bytes_limit, this->bytes_in_use, this->max_bytes_in_use,
-      this->num_allocs, this->max_alloc_size);
+      this->bytes_limit ? *this->bytes_limit : 0, this->bytes_in_use,
+      this->peak_bytes_in_use, this->num_allocs, this->largest_alloc_size);
 }
 
 constexpr size_t Allocator::kAllocatorAlignment;
@@ -132,10 +124,10 @@ class CPUAllocator : public Allocator {
       mutex_lock l(mu_);
       ++stats_.num_allocs;
       stats_.bytes_in_use += alloc_size;
-      stats_.max_bytes_in_use =
-          std::max<int64>(stats_.max_bytes_in_use, stats_.bytes_in_use);
-      stats_.max_alloc_size =
-          std::max<int64>(stats_.max_alloc_size, alloc_size);
+      stats_.peak_bytes_in_use =
+          std::max<int64>(stats_.peak_bytes_in_use, stats_.bytes_in_use);
+      stats_.largest_alloc_size =
+          std::max<int64>(stats_.largest_alloc_size, alloc_size);
 
       if (stats_.bytes_in_use > TotalAllocationWarningBytes() &&
           total_allocation_warning_count_ < kMaxTotalAllocationWarnings) {
@@ -158,16 +150,16 @@ class CPUAllocator : public Allocator {
     port::AlignedFree(ptr);
   }
 
-  void GetStats(AllocatorStats* stats) override {
+  absl::optional<AllocatorStats> GetStats() override {
     mutex_lock l(mu_);
-    *stats = stats_;
+    return stats_;
   }
 
   void ClearStats() override {
     mutex_lock l(mu_);
     stats_.num_allocs = 0;
-    stats_.max_bytes_in_use = stats_.bytes_in_use;
-    stats_.max_alloc_size = 0;
+    stats_.peak_bytes_in_use = stats_.bytes_in_use;
+    stats_.largest_alloc_size = 0;
   }
 
   size_t AllocatedSizeSlow(const void* ptr) override {
@@ -216,15 +208,33 @@ class CPUAllocatorFactory : public AllocatorFactory {
 REGISTER_MEM_ALLOCATOR("DefaultCPUAllocator", 100, CPUAllocatorFactory);
 }  // namespace
 
-Allocator* cpu_allocator() {
+Allocator* cpu_allocator_base() {
   static Allocator* cpu_alloc =
       AllocatorFactoryRegistry::singleton()->GetAllocator();
+  // TODO(tucker): This really seems wrong.  It's only going to be effective on
+  // the first call in a process (but the desired effect is associated with a
+  // session), and we probably ought to be tracking the highest level Allocator,
+  // not the lowest.  Revisit the advertised semantics of the triggering option.
   if (cpu_allocator_collect_full_stats && !cpu_alloc->TracksAllocationSizes()) {
     cpu_alloc = new TrackingAllocator(cpu_alloc, true);
   }
   return cpu_alloc;
 }
 
+Allocator* cpu_allocator(int numa_node) {
+  // Correctness relies on devices being created prior to the first call
+  // to cpu_allocator, if devices are ever to be created in the process.
+  // Device creation in turn triggers ProcessState creation and the availability
+  // of the correct access pointer via this function call.
+  static ProcessStateInterface* ps =
+      AllocatorFactoryRegistry::singleton()->process_state();
+  if (ps) {
+    return ps->GetCPUAllocator(numa_node);
+  } else {
+    return cpu_allocator_base();
+  }
+}
+
 SubAllocator::SubAllocator(const std::vector<Visitor>& alloc_visitors,
                            const std::vector<Visitor>& free_visitors)
     : alloc_visitors_(alloc_visitors), free_visitors_(free_visitors) {}
diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h
index 531ea73e89277c83cfede50fce0de08b65c5e5a5..4dc5eaf16d7f3eb034e44898f61dab33ba4c8d82 100644
--- a/tensorflow/core/framework/allocator.h
+++ b/tensorflow/core/framework/allocator.h
@@ -20,11 +20,14 @@ limitations under the License.
 
 #include <limits>
 
+#include "absl/strings/string_view.h"
+#include "absl/types/optional.h"
 #include "tensorflow/core/framework/numeric_types.h"
 #include "tensorflow/core/framework/resource_handle.h"
 #include "tensorflow/core/framework/type_traits.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/numa.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
@@ -45,23 +48,31 @@ struct AllocationAttributes {
   // which Op is performing the allocation, and sets this flag to
   // true.
   bool allocation_will_be_logged = false;
+  // EXPERIMENTAL: If provided, then evaluates to a timing count such that only
+  // a memory chunk whose last-freed count is at this value or earlier may be
+  // returned.
+  std::function<uint64()> freed_by_func = nullptr;
 };
 
-// Runtime statistics collected by an allocator.
+// Runtime statistics collected by an allocator. Exactly the same as
+// stream_executor::AllocatorStats, but independently defined to preserve the
+// mutual independence of StreamExecutor and TensorFlow.
 struct AllocatorStats {
-  int64 num_allocs;        // Number of allocations.
-  int64 bytes_in_use;      // Number of bytes in use.
-  int64 max_bytes_in_use;  // The maximum bytes in use.
-  int64 max_alloc_size;    // The max single allocation seen.
+  int64 num_allocs;          // Number of allocations.
+  int64 bytes_in_use;        // Number of bytes in use.
+  int64 peak_bytes_in_use;   // The peak bytes in use.
+  int64 largest_alloc_size;  // The largest single allocation seen.
 
-  // The upper limit what the allocator can allocate, if such a limit
-  // is known. Certain allocator may return 0 to indicate the limit is
-  // unknown.
-  int64 bytes_limit;
+  // The upper limit of bytes of user allocatable device memory, if such a limit
+  // is known.
+  absl::optional<int64> bytes_limit;
 
-  AllocatorStats() { Clear(); }
+  AllocatorStats()
+      : num_allocs(0),
+        bytes_in_use(0),
+        peak_bytes_in_use(0),
+        largest_alloc_size(0) {}
 
-  void Clear();
   string DebugString() const;
 };
 
@@ -193,7 +204,7 @@ class Allocator {
   }
 
   // Fills in 'stats' with statistics collected by this allocator.
-  virtual void GetStats(AllocatorStats* stats) { stats->Clear(); }
+  virtual absl::optional<AllocatorStats> GetStats() { return absl::nullopt; }
 
   // Clears the internal stats except for the `in_use` field.
   virtual void ClearStats() {}
@@ -375,10 +386,16 @@ struct AllocatorAttributes {
 };
 
 // Returns a trivial implementation of Allocator, which is a process singleton.
-// Access through this function is only intended for use in tests and auxiliary
-// processing.  Performance sensitive uses should always obtain allocators from
-// ProcessState.
-Allocator* cpu_allocator();
+// Access through this function is only intended for use by restricted parts
+// of the infrastructure.
+Allocator* cpu_allocator_base();
+
+// If available, calls ProcessState::GetCPUAllocator(numa_node).
+// If not, falls back to cpu_allocator_base().
+// Intended for use in contexts where ProcessState is not visible at
+// compile time. Where ProcessState is visible, it's preferable to
+// call it directly.
+Allocator* cpu_allocator(int numa_node = port::kNUMANoAffinity);
 
 // If 'enable' is true, the default CPU allocator implementation will collect
 // AllocatorStats. By default, it's disabled.
diff --git a/tensorflow/core/framework/allocator_registry.h b/tensorflow/core/framework/allocator_registry.h
index e907c52ba99e4a5ed6f67629fd9c2d7acef563b0..d9f3280c62d7c1a4a2bb7a3de117768f836653af 100644
--- a/tensorflow/core/framework/allocator_registry.h
+++ b/tensorflow/core/framework/allocator_registry.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/numa.h"
 
@@ -43,6 +44,13 @@ class AllocatorFactory {
   virtual SubAllocator* CreateSubAllocator(int numa_node) = 0;
 };
 
+// ProcessState is defined in a package that cannot be a dependency of
+// framework.  This definition allows us to access the one method we need.
+class ProcessStateInterface {
+ public:
+  virtual Allocator* GetCPUAllocator(int numa_node) = 0;
+};
+
 // A singleton registry of AllocatorFactories.
 //
 // Allocators should be obtained through ProcessState or cpu_allocator()
@@ -72,6 +80,12 @@ class AllocatorFactoryRegistry {
   // Returns the singleton value.
   static AllocatorFactoryRegistry* singleton();
 
+  ProcessStateInterface* process_state() const { return process_state_; }
+
+ protected:
+  friend class ProcessState;
+  ProcessStateInterface* process_state_ = nullptr;
+
  private:
   mutex mu_;
   bool first_alloc_made_ = false;
diff --git a/tensorflow/core/framework/allocator_test.cc b/tensorflow/core/framework/allocator_test.cc
index a409cb2de7fbae20f435f464ca07155a36fede4a..85e8ba6a71b7760b004b9d2ebbc425ddff5fbf17 100644
--- a/tensorflow/core/framework/allocator_test.cc
+++ b/tensorflow/core/framework/allocator_test.cc
@@ -25,20 +25,23 @@ limitations under the License.
 namespace tensorflow {
 
 static void CheckStats(Allocator* a, int64 num_allocs, int64 bytes_in_use,
-                       int64 max_bytes_in_use, int64 max_alloc_size) {
-  AllocatorStats stats;
-  a->GetStats(&stats);
-  LOG(INFO) << "Alloc stats: \n" << stats.DebugString();
+                       int64 peak_bytes_in_use, int64 largest_alloc_size) {
+  absl::optional<AllocatorStats> stats = a->GetStats();
+  EXPECT_TRUE(stats);
+  if (!stats) {
+    return;
+  }
+  LOG(INFO) << "Alloc stats: \n" << stats->DebugString();
 #if defined(PLATFORM_GOOGLE) && defined(NDEBUG)
   // NOTE: allocator stats expectation depends on the system malloc,
   // and can vary as that changes.
   static const int64 kSlop = 5 * 1024;
-  EXPECT_GT(stats.bytes_in_use, bytes_in_use - kSlop);
-  EXPECT_LT(stats.bytes_in_use, bytes_in_use + kSlop);
-  EXPECT_GT(stats.max_bytes_in_use, max_bytes_in_use - kSlop);
-  EXPECT_LT(stats.max_bytes_in_use, max_bytes_in_use + kSlop);
-  EXPECT_EQ(stats.num_allocs, num_allocs);
-  EXPECT_EQ(stats.max_alloc_size, max_alloc_size);
+  EXPECT_GT(stats->bytes_in_use, bytes_in_use - kSlop);
+  EXPECT_LT(stats->bytes_in_use, bytes_in_use + kSlop);
+  EXPECT_GT(stats->peak_bytes_in_use, peak_bytes_in_use - kSlop);
+  EXPECT_LT(stats->peak_bytes_in_use, peak_bytes_in_use + kSlop);
+  EXPECT_EQ(stats->num_allocs, num_allocs);
+  EXPECT_EQ(stats->largest_alloc_size, largest_alloc_size);
 #endif
 }
 
diff --git a/tensorflow/core/framework/attr_value_util.cc b/tensorflow/core/framework/attr_value_util.cc
index 79966f06922a62c7d04648f4a2829d05861cd76b..43b435270c49087b43ce101991686e8c9c069de2 100644
--- a/tensorflow/core/framework/attr_value_util.cc
+++ b/tensorflow/core/framework/attr_value_util.cc
@@ -54,9 +54,7 @@ uint64 TensorProtoHash(const TensorProto& tp) {
   DCHECK(success);
   TensorProto p;
   tensor.AsProtoTensorContent(&p);
-  string s;
-  SerializeToStringDeterministic(p, &s);
-  return Hash64(s);
+  return DeterministicProtoHash64(p);
 }
 
 // Do not create large tensors in memory, compute hash based on TensorProto
@@ -64,12 +62,8 @@ uint64 TensorProtoHash(const TensorProto& tp) {
 // different hash code if they are defined with different TensorProto
 // representations.
 uint64 FastTensorProtoHash(const TensorProto& tp) {
-  string s;
   if (TensorByteSize(tp) > kMaxAttrValueTensorByteSize) {
-    string s;
-    bool success = SerializeToStringDeterministic(tp, &s);
-    DCHECK(success);
-    return Hash64(s);
+    return DeterministicProtoHash64(tp);
   } else {
     return TensorProtoHash(tp);
   }
@@ -95,11 +89,7 @@ bool AreTensorProtosEqual(const TensorProto& lhs, const TensorProto& rhs) {
   TensorProto rhs_tp;
   rhs_t.AsProtoTensorContent(&rhs_tp);
 
-  string lhs_str, rhs_str;
-  SerializeToStringDeterministic(lhs_tp, &lhs_str);
-  SerializeToStringDeterministic(rhs_tp, &rhs_str);
-
-  return lhs_str == rhs_str;
+  return AreSerializedProtosEqual(lhs_tp, rhs_tp);
 }
 
 // Do not construct large tensors in memory, compare equality using TensorProto
@@ -139,9 +129,7 @@ uint64 AttrValueHash(const AttrValue& a, const TensorProtoHasher& tensor_hash) {
   }
 
   // If `a` is not a tensor or func, get a hash of serialized string.
-  string s;
-  SerializeToStringDeterministic(a, &s);
-  return Hash64(s);
+  return DeterministicProtoHash64(a);
 }
 
 bool AreAttrValuesEqual(const AttrValue& a, const AttrValue& b,
@@ -175,10 +163,7 @@ bool AreAttrValuesEqual(const AttrValue& a, const AttrValue& b,
 
   // All other fields in AttrValue have deterministic representations.
   // It is safe to compare their serialized strings.
-  string a_str, b_str;
-  SerializeToStringDeterministic(a, &a_str);
-  SerializeToStringDeterministic(b, &b_str);
-  return a_str == b_str;
+  return AreSerializedProtosEqual(a, b);
 }
 
 string SummarizeString(const string& str) {
diff --git a/tensorflow/core/framework/bfloat16_test.cc b/tensorflow/core/framework/bfloat16_test.cc
index ce97085494175e57b41215779b32234c1c1d5f3c..7da1727e47cee69f9cfbbb5cb9473bc2a76bb220 100644
--- a/tensorflow/core/framework/bfloat16_test.cc
+++ b/tensorflow/core/framework/bfloat16_test.cc
@@ -75,7 +75,7 @@ TEST_P(Bfloat16Test, TruncateTest) {
   EXPECT_EQ(GetParam().expected_rounding, float(rounded));
 }
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     Bfloat16Test_Instantiation, Bfloat16Test,
     ::testing::Values(
         Bfloat16TestParam{
diff --git a/tensorflow/core/framework/cancellation.cc b/tensorflow/core/framework/cancellation.cc
index af59500aee32d83dadb7cf94f6d277819f6c65c4..7f639b5ca9a5fd6219b900f08965acaf2c6ee923 100644
--- a/tensorflow/core/framework/cancellation.cc
+++ b/tensorflow/core/framework/cancellation.cc
@@ -27,6 +27,12 @@ CancellationManager::CancellationManager()
       is_cancelled_(false),
       next_cancellation_token_(0) {}
 
+void CancellationManager::Reset() {
+  mutex_lock l(mu_);
+  is_cancelling_ = false;
+  is_cancelled_.store(false);
+}
+
 void CancellationManager::StartCancel() {
   gtl::FlatMap<CancellationToken, CancelCallback> callbacks_to_run;
   {
diff --git a/tensorflow/core/framework/cancellation.h b/tensorflow/core/framework/cancellation.h
index 7a5d9424867d35a4ca07e690230c73afff0b2940..51b200423ec11fba771d233e6985c62708f901ac 100644
--- a/tensorflow/core/framework/cancellation.h
+++ b/tensorflow/core/framework/cancellation.h
@@ -56,6 +56,9 @@ class CancellationManager {
   // Returns true iff StartCancel() has been called.
   bool IsCancelled() { return is_cancelled_.load(std::memory_order_acquire); }
 
+  // Resets the cancellation manager to its original pre-cancelled state.
+  void Reset();
+
   // Returns a token that must be used in calls to RegisterCallback
   // and DeregisterCallback.
   CancellationToken get_cancellation_token();
diff --git a/tensorflow/core/framework/collective.cc b/tensorflow/core/framework/collective.cc
index 7fa58347f258acf327e112f4c9cd58c37134ceee..b83d183f14b28672f8da47ae642a386c69253a9b 100644
--- a/tensorflow/core/framework/collective.cc
+++ b/tensorflow/core/framework/collective.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/hash/hash.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 
 namespace tensorflow {
@@ -64,7 +65,9 @@ CollInstanceParams& CollInstanceParams::operator=(
     device_names.assign(other.device_names.begin(), other.device_names.end());
     task_names.assign(other.task_names.begin(), other.task_names.end());
     same_num_devices_per_task = other.same_num_devices_per_task;
+    num_devices_per_task = other.num_devices_per_task;
     gpu_ring_order = other.gpu_ring_order;
+    communicator_key = other.communicator_key;
     impl_details.subdiv_offsets.assign(
         other.impl_details.subdiv_offsets.begin(),
         other.impl_details.subdiv_offsets.end());
@@ -76,6 +79,7 @@ CollInstanceParams& CollInstanceParams::operator=(
     impl_details.subdiv_source_rank.assign(
         other.impl_details.subdiv_source_rank.begin(),
         other.impl_details.subdiv_source_rank.end());
+    impl_details.dependencies = other.impl_details.dependencies;
   }
   return *this;
 }
@@ -91,6 +95,13 @@ string CollInstanceParams::ToString() const {
   for (const auto& n : task_names) {
     strings::StrAppend(&v, n, ", ");
   }
+  strings::StrAppend(&v, "} num_devices_per_task={");
+  for (const auto dpt : num_devices_per_task) {
+    strings::StrAppend(&v, dpt.first, ": ", dpt.second, ", ");
+  }
+  strings::StrAppend(&v, "}, collective_name=", impl_details.collective_name,
+                     ", communicator_key=", str_util::CEscape(communicator_key),
+                     ", subdiv_offsets={");
   strings::StrAppend(&v, "}, subdiv_offsets={");
   for (const auto& d : impl_details.subdiv_offsets) {
     strings::StrAppend(&v, d, ",");
diff --git a/tensorflow/core/framework/collective.h b/tensorflow/core/framework/collective.h
index 0321429702af74dfb18ca631b0314c705150ec06..e00cc17961cb89cfdad8d33cbca758d80a5ca274 100644
--- a/tensorflow/core/framework/collective.h
+++ b/tensorflow/core/framework/collective.h
@@ -42,6 +42,7 @@ class Tensor;
 enum CollectiveType {
   REDUCTION_COLLECTIVE = 0,
   BROADCAST_COLLECTIVE,
+  GATHER_COLLECTIVE,
   UNDEFINED_COLLECTIVE,
 };
 
@@ -70,6 +71,8 @@ struct CollImplDetails {
   std::vector<std::vector<int>> subdiv_permutations;
   std::vector<int> subdiv_offsets;
   std::vector<int> subdiv_source_rank;  // rank of source in each subdiv
+  std::vector<int32>
+      dependencies;  // collective instances on which this node depends
 };
 
 // Data common to all members of a collective instance.
@@ -85,9 +88,13 @@ struct CollInstanceParams {
   std::vector<string> task_names;
   // True if every task has the same number of devices.
   bool same_num_devices_per_task = false;
+  // Task -> number of devices on that task.
+  std::unordered_map<string, int32> num_devices_per_task;
   // If passed in to GPUOptions in ConfigProto, defines a good ring order for
   // GPUs.  Assumes same GPU configuration at each worker.
   string gpu_ring_order = "";
+  // Valid when using a communicator-based collective mechanism, e.g. NCCL.
+  string communicator_key;
   CollImplDetails impl_details;
   string ToString() const;
   CollInstanceParams& operator=(const struct CollInstanceParams& other);
@@ -269,6 +276,21 @@ class CollectiveExecutor : public PeerAccessInterface, public core::RefCounted {
 
   virtual PerStepCollectiveRemoteAccess* remote_access() { return nullptr; }
 
+  // `WaitForDependencies` and `Launched` are used for fine-grained control of
+  // execution order between collective instances.  These functions are intended
+  // to be called in `Run` function of collective implementations, and may be
+  // used to make part, or whole, of the collective execution ordered with
+  // respect to other collective instances.
+  //
+  // `WaitForDependencies` will block until it is safe to continue the callee's
+  // execution, where safety is defined as: ordered with respect to the
+  // collective instances defined in the callee's `wait_for` attribute.
+  virtual void WaitForDependencies(const CollectiveParams& col_params) {}
+  // `Launched` unblocks the dependent collective instances by recording that
+  // this callee device has completed the critical portion of the collective
+  // execution.
+  virtual void Launched(const CollectiveParams& col_params) {}
+
   // Used to designate an invalid group or instance key.
   static int64 kInvalidId;
 
@@ -347,7 +369,8 @@ class CollectiveImplementationInterface {
 
   // Initializes the portions of `col_params` specific to this
   // implementation.  Called exactly once for every Collective instance during
-  // the CollectiveParams resolution process when the graph is first executed.
+  // the CollectiveParams resolution process when the graph is first executed,
+  // at the end of `CompleteInstanceLocal()`.
   // NOTE(ayushd): This is effectively a static function because it modifies the
   // `col_params` passed in and should not manipulate any data members.  However
   // because it is virtual and needs to be implemented by every derived class we
@@ -360,6 +383,14 @@ class CollectiveImplementationInterface {
   // object.
   virtual Status InitializeCollectiveContext(CollectiveContext* col_ctx) = 0;
 
+  // Initializes instance params at the beginning of `CompleteInstanceLocal()`,
+  // unlike `InitializeCollectiveParams` which is called at the end.  This
+  // function is called before all devices in the instance are discovered, and
+  // may be used to broadcast data via the shared `InstanceRec` object in
+  // collective param resolution to all devices.
+  virtual Status InitializeInstanceBeforeGroupDiscovery(
+      CollectiveParams* col_params) = 0;
+
   // Processes and moves data according to the logic of this Collective
   // implementation.  Relies on appropriate initialization of op-specific
   // CollectiveParams in InitializeCollectiveParams(), as well as appropriate
diff --git a/tensorflow/core/framework/common_shape_fns.cc b/tensorflow/core/framework/common_shape_fns.cc
index bf2d902af41c690be25a170da6fc22a4902e2d50..5c974a76aca76f14ef166d285733c5e2f9ad723b 100644
--- a/tensorflow/core/framework/common_shape_fns.cc
+++ b/tensorflow/core/framework/common_shape_fns.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
 
 namespace tensorflow {
 
@@ -37,6 +38,11 @@ Status GetWindowedOutputSizeVerboseV2(int64 input_size, int64 filter_size,
       *output_size = (input_size - effective_filter_size + stride) / stride;
       *padding_before = *padding_after = 0;
       break;
+    case Padding::EXPLICIT:
+      *output_size = (input_size + *padding_before + *padding_after -
+                      effective_filter_size + stride) /
+                     stride;
+      break;
     case Padding::SAME:
       *output_size = (input_size + stride - 1) / stride;
       const int64 padding_needed =
@@ -71,6 +77,11 @@ Status GetWindowedOutputSizeVerbose(int64 input_size, int64 filter_size,
 Status GetWindowedOutputSize(int64 input_size, int64 filter_size, int64 stride,
                              Padding padding_type, int64* output_size,
                              int64* padding_size) {
+  if (padding_type == Padding::EXPLICIT) {
+    return errors::Internal(
+        "GetWindowedOutputSize does not handle EXPLICIT padding; call "
+        "GetWindowedOutputSizeVerbose instead");
+  }
   int64 padding_after_unused;
   return GetWindowedOutputSizeVerbose(input_size, filter_size, stride,
                                       padding_type, output_size, padding_size,
@@ -81,6 +92,11 @@ Status GetWindowedOutputSizeV2(int64 input_size, int64 filter_size,
                                int64 dilation_rate, int64 stride,
                                Padding padding_type, int64* output_size,
                                int64* padding_size) {
+  if (padding_type == Padding::EXPLICIT) {
+    return errors::Internal(
+        "GetWindowedOutputSizeV2 does not handle EXPLICIT padding; call "
+        "GetWindowedOutputSizeVerboseV2 instead");
+  }
   int64 padding_after_unused;
   return GetWindowedOutputSizeVerboseV2(input_size, filter_size, dilation_rate,
                                         stride, padding_type, output_size,
@@ -123,8 +139,8 @@ Status GetWindowedOutputSizeFromDimsV2(
     shape_inference::InferenceContext* c,
     shape_inference::DimensionHandle input_size,
     shape_inference::DimensionOrConstant filter_size, int64 dilation_rate,
-    int64 stride, Padding padding_type,
-    shape_inference::DimensionHandle* output_size) {
+    int64 stride, Padding padding_type, int64 padding_before,
+    int64 padding_after, shape_inference::DimensionHandle* output_size) {
   if (stride <= 0) {
     return errors::InvalidArgument("Stride must be > 0, but got ", stride);
   }
@@ -137,6 +153,11 @@ Status GetWindowedOutputSizeFromDimsV2(
   // See also the parallel implementation in GetWindowedOutputSizeVerbose.
   switch (padding_type) {
     case Padding::VALID:
+      padding_before = padding_after = 0;
+      TF_FALLTHROUGH_INTENDED;
+    case Padding::EXPLICIT:
+      TF_RETURN_IF_ERROR(
+          c->Add(input_size, padding_before + padding_after, &input_size));
       if (dilation_rate > 1) {
         DimensionHandle window_size;
         TF_RETURN_IF_ERROR(
@@ -166,13 +187,26 @@ Status GetWindowedOutputSizeFromDims(
     shape_inference::DimensionHandle input_size,
     shape_inference::DimensionOrConstant filter_size, int64 stride,
     Padding padding_type, shape_inference::DimensionHandle* output_size) {
+  if (padding_type == Padding::EXPLICIT) {
+    return errors::Internal(
+        "GetWindowedOutputSizeFromDims does not handle EXPLICIT padding; call "
+        "GetWindowedOutputSizeFromDimsV2 instead");
+  }
   return GetWindowedOutputSizeFromDimsV2(c, input_size, filter_size,
                                          /*dilation_rate=*/1, stride,
-                                         padding_type, output_size);
+                                         padding_type,
+                                         // Give dummy values of -1 to
+                                         // padding_before and padding_after,
+                                         // since explicit padding is not used.
+                                         -1, -1, output_size);
 }
 
 Status UnchangedShape(shape_inference::InferenceContext* c) {
   c->set_output(0, c->input(0));
+  auto* handle_data = c->input_handle_shapes_and_types(0);
+  if (handle_data != nullptr) {
+    c->set_output_handle_shapes_and_types(0, *handle_data);
+  }
   return Status::OK();
 }
 
@@ -371,7 +405,10 @@ Status ShapeFromDimensions(DimensionHandle batch_dim,
   return tensorflow::Status::OK();
 }
 
-Status Conv2DShape(shape_inference::InferenceContext* c) {
+namespace {
+
+Status Conv2DShapeImpl(shape_inference::InferenceContext* c,
+                       bool supports_explicit_padding) {
   string data_format_str, filter_format_str;
   if (!c->GetAttr("data_format", &data_format_str).ok()) {
     data_format_str = "NHWC";
@@ -464,13 +501,35 @@ Status Conv2DShape(shape_inference::InferenceContext* c) {
   Padding padding;
   TF_RETURN_IF_ERROR(c->GetAttr("padding", &padding));
 
+  std::vector<int64> explicit_paddings;
+  if (supports_explicit_padding) {
+    Status s = c->GetAttr("explicit_paddings", &explicit_paddings);
+    // Use the default value, which is an empty list, if the attribute is not
+    // found. Otherwise return the error to the caller.
+    if (!s.ok() && !errors::IsNotFound(s)) {
+      return s;
+    }
+    TF_RETURN_IF_ERROR(CheckValidPadding(padding, explicit_paddings,
+                                         /*num_dims=*/4, data_format));
+  } else {
+    DCHECK(padding != Padding::EXPLICIT);
+  }
+
   DimensionHandle output_rows, output_cols;
+  int64 pad_rows_before = -1, pad_rows_after = -1;
+  int64 pad_cols_before = -1, pad_cols_after = -1;
+  if (padding == Padding::EXPLICIT) {
+    GetExplicitPaddingForDim(explicit_paddings, data_format, 'H',
+                             &pad_rows_before, &pad_rows_after);
+    GetExplicitPaddingForDim(explicit_paddings, data_format, 'W',
+                             &pad_cols_before, &pad_cols_after);
+  }
   TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDimsV2(
       c, input_spatial_dims[0], filter_rows_dim, dilation_rows, stride_rows,
-      padding, &output_rows));
+      padding, pad_rows_before, pad_rows_after, &output_rows));
   TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDimsV2(
       c, input_spatial_dims[1], filter_cols_dim, dilation_cols, stride_cols,
-      padding, &output_cols));
+      padding, pad_cols_before, pad_cols_after, &output_cols));
 
   ShapeHandle output_shape;
   TF_RETURN_IF_ERROR(
@@ -480,6 +539,19 @@ Status Conv2DShape(shape_inference::InferenceContext* c) {
   return Status::OK();
 }
 
+}  // namespace
+
+// Shape function for Conv2D-like operations that support explicit padding.
+Status Conv2DShapeWithExplicitPadding(shape_inference::InferenceContext* c) {
+  return Conv2DShapeImpl(c, true);
+}
+
+// Shape function for Conv2D-like operations that do not support explicit
+// padding.
+Status Conv2DShape(shape_inference::InferenceContext* c) {
+  return Conv2DShapeImpl(c, false);
+}
+
 // TODO(mjanusz): Unify all conv/pooling shape functions.
 Status Conv3DShape(shape_inference::InferenceContext* c) {
   ShapeHandle input_shape;
@@ -551,13 +623,13 @@ Status Conv3DShape(shape_inference::InferenceContext* c) {
 
   TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDimsV2(
       c, in_planes_dim, filter_planes_dim, dilation_planes, stride_planes,
-      padding, &output_planes));
+      padding, -1, -1, &output_planes));
   TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDimsV2(
-      c, in_rows_dim, filter_rows_dim, dilation_rows, stride_rows, padding,
-      &output_rows));
+      c, in_rows_dim, filter_rows_dim, dilation_rows, stride_rows, padding, -1,
+      -1, &output_rows));
   TF_RETURN_IF_ERROR(GetWindowedOutputSizeFromDimsV2(
-      c, in_cols_dim, filter_cols_dim, dilation_cols, stride_cols, padding,
-      &output_cols));
+      c, in_cols_dim, filter_cols_dim, dilation_cols, stride_cols, padding, -1,
+      -1, &output_cols));
 
   ShapeHandle output_shape;
   if (data_format == "NCDHW") {
@@ -1231,6 +1303,12 @@ Status ConcatV2Shape(InferenceContext* c) {
                            c->num_inputs() - 1 /* dim_index */);
 }
 
+Status QuantizedConcatV2Shape(InferenceContext* c, int num_inputs_to_concat) {
+  return ConcatShapeHelper(c, 0 /* start_value_index */,
+                           num_inputs_to_concat /* end_value_index */,
+                           num_inputs_to_concat /* dim_index */);
+}
+
 Status BroadcastBinaryOpOutputShapeFnHelper(InferenceContext* c,
                                             ShapeHandle shape_x,
                                             ShapeHandle shape_y,
diff --git a/tensorflow/core/framework/common_shape_fns.h b/tensorflow/core/framework/common_shape_fns.h
index 362899b947b1fd479d227ac5421a5f458405f3c6..d421844ee607b18132f4657e7562dec04253c2fa 100644
--- a/tensorflow/core/framework/common_shape_fns.h
+++ b/tensorflow/core/framework/common_shape_fns.h
@@ -38,11 +38,12 @@ namespace tensorflow {
 //
 // Padding (P): the padding we apply to the input tensor along each
 // dimension. This is usually used to make sure that the spatial dimensions
-// do not shrink when we progress with convolutions. Two types of padding are
-// often used:
+// do not shrink when we progress with convolutions. This function supports two
+// types of padding.
 //   SAME: the pad value is computed so that the output will have size H/S.
 //   VALID: no padding is carried out.
-// The padded area is zero-filled.
+// If you want to use EXPLICIT padding, GetWindowedOutputSizeVerbose must be
+// called instead. Note the padded area is zero-filled.
 //
 // The output dimensions for convolution and many other operations, when given
 // all the parameters above, are as follows:
@@ -95,6 +96,9 @@ Status GetWindowedOutputSize(int64 input_size, int64 filter_size, int64 stride,
 //   When the stride is 1, the expression simplifies to
 //     H' = H-K'+1.
 //
+// If you want to use EXPLICIT padding, GetWindowedOutputSizeVerboseV2 must be
+// called instead
+//
 // TODO(b/67112639): Merge V2 versions and the original versions eventually.
 Status GetWindowedOutputSizeV2(int64 input_size, int64 filter_size,
                                int64 dilation_rate, int64 stride,
@@ -102,9 +106,12 @@ Status GetWindowedOutputSizeV2(int64 input_size, int64 filter_size,
                                int64* padding_size);
 
 // Returns the same output dimensions as in GetWindowedOutputSize, but returns
-// verbose padding dimensions (before/after). Any excess padding
-// (caused by an odd padding size value) is added to the 'padding_after'
-// dimension.
+// verbose padding dimensions (before/after), and EXPLICIT padding is supported.
+// When padding_type is EXPLICIT, *padding_before and *padding_after must
+// already point to initialized integers with the padding amounts. Otherwise,
+// *padding_before and *padding_after are set by this function, and any
+// excess padding (caused by an odd padding size value) is added to the
+// 'padding_after' dimension.
 Status GetWindowedOutputSizeVerbose(int64 input_size, int64 filter_size,
                                     int64 stride, Padding padding_type,
                                     int64* output_size, int64* padding_before,
@@ -122,7 +129,8 @@ Status GetWindowedOutputSizeVerboseV2(int64 input_size, int64 filter_size,
 // of the output tensor and padding to be applied to the input tensor at the
 // lower end of every dimension. Use for 3D convolutions, where the input data
 // is padded with zeros, as well as for 3D avg/max pooling, where the input data
-// is padded with invalid values that are not considered for pooling.
+// is padded with invalid values that are not considered for pooling. EXPLICIT
+// padding is not supported.
 Status Get3dOutputSize(const std::array<int64, 3>& input,
                        const std::array<int64, 3>& window,
                        const std::array<int64, 3>& strides,
@@ -140,21 +148,23 @@ Status Get3dOutputSizeV2(const std::array<int64, 3>& input,
 
 namespace shape_inference {
 
-// Like GetWindowedOutputSize, but deals with DimensionHandles.
+// Like GetWindowedOutputSize, but deals with DimensionHandles. Does not support
+// EXPLICIT padding.
 Status GetWindowedOutputSizeFromDims(InferenceContext* c,
                                      DimensionHandle input_size,
                                      DimensionOrConstant filter_size,
                                      int64 stride, Padding padding_type,
                                      DimensionHandle* output_size);
 
-// The V2 version computes the same outputs with arbitrary dilation_rate. For
-// detailed equations, refer to the comments for GetWindowedOutputSizeV2().
-Status GetWindowedOutputSizeFromDimsV2(InferenceContext* c,
-                                       DimensionHandle input_size,
-                                       DimensionOrConstant filter_size,
-                                       int64 dilation_rate, int64 stride,
-                                       Padding padding_type,
-                                       DimensionHandle* output_size);
+// The V2 version computes the same outputs with arbitrary dilation_rate, and
+// supports EXPLICIT padding. For detailed equations, refer to the comments
+// for GetWindowedOutputSizeV2(). The 'padding_before' and 'padding_after'
+// parameters are only used if padding_type == EXPLICIT.
+Status GetWindowedOutputSizeFromDimsV2(
+    InferenceContext* c, DimensionHandle input_size,
+    DimensionOrConstant filter_size, int64 dilation_rate, int64 stride,
+    Padding padding_type, int64 padding_before, int64 padding_after,
+    DimensionHandle* output_size);
 
 // Transfers shape of input(0) to output(0).
 Status UnchangedShape(shape_inference::InferenceContext* c);
@@ -222,7 +232,11 @@ Status BiasAddShape(shape_inference::InferenceContext* c);
 // Shape function for BiasAddGrad-like operations.
 Status BiasAddGradShape(shape_inference::InferenceContext* c);
 
-// Shape function for Conv2D-like operations.
+// Shape function for Conv2D-like operations that support explicit padding.
+Status Conv2DShapeWithExplicitPadding(shape_inference::InferenceContext* c);
+
+// Shape function for Conv2D-like operations that do not support explicit
+// padding.
 Status Conv2DShape(shape_inference::InferenceContext* c);
 
 // Shape function for Conv3D-like operations.
@@ -265,6 +279,8 @@ Status ConcatShape(shape_inference::InferenceContext* c,
 // Shape function for concat operations.
 Status ConcatV2Shape(shape_inference::InferenceContext* c);
 
+Status QuantizedConcatV2Shape(InferenceContext* c, int num_inputs_to_concat);
+
 // Shape function for binary operators that broadcast their inputs
 // and with output to output_index.
 // Note: out cannot be NULL.
diff --git a/tensorflow/core/framework/common_shape_fns_test.cc b/tensorflow/core/framework/common_shape_fns_test.cc
index 7c395679d304ffab1dfeff6804eede0d09b63734..b94925c04ee2794033b072a1bc62cf841081a769 100644
--- a/tensorflow/core/framework/common_shape_fns_test.cc
+++ b/tensorflow/core/framework/common_shape_fns_test.cc
@@ -408,12 +408,14 @@ TEST(CommonShapeFnsTest, BiasAddGradShapeTest) {
 TEST(CommonShapeFnsTest, Conv2DShapeTest) {
   ShapeInferenceTestOp op("Conv2D");
   auto set_op = [&op](const std::vector<int32>& strides, const string& padding,
-                      const string& data_format, const string& filter_format) {
+                      const string& data_format, const string& filter_format,
+                      const std::vector<int32>& explicit_paddings = {}) {
     TF_CHECK_OK(NodeDefBuilder("test", "Conv2D")
                     .Input("input", 0, DT_FLOAT)
                     .Input("filter", 0, DT_FLOAT)
                     .Attr("strides", strides)
                     .Attr("padding", padding)
+                    .Attr("explicit_paddings", explicit_paddings)
                     .Attr("data_format", data_format)
                     .Attr("filter_format", filter_format)
                     .Finalize(&op.node_def));
@@ -536,19 +538,73 @@ TEST(CommonShapeFnsTest, Conv2DShapeTest) {
   INFER_OK(op, "[1,?,4,1];[?,?,?,?]", "[d0_0,?,2,d1_3]");
   INFER_OK(op, "[1,4,?,1];[?,?,?,?]", "[d0_0,2,?,d1_3]");
   INFER_OK(op, "[1,4,4,?];[?,?,?,?]", "[d0_0,2,2,d1_3]");
+
+  // Some tests for "EXPLICIT" padding
+
+  // 4x4 input, 1x1 filter, 1x1 stride, [0, 2, 1, 4] padding
+  set_op({{1, 1, 1, 1}}, "EXPLICIT", "NHWC", "HWIO", {0, 0, 0, 2, 1, 4, 0, 0});
+  INFER_OK(op, "[1,4,4,1];[1,1,1,1]", "[d0_0,6,9,d1_3]");
+
+  // 3x3 input, 2x2 filter, 1x1 stride, [1, 0, 1, 2] padding
+  set_op({{1, 1, 1, 1}}, "EXPLICIT", "NHWC", "HWIO", {0, 0, 1, 0, 1, 2, 0, 0});
+  INFER_OK(op, "[1,3,3,1];[2,2,1,1]", "[d0_0,3,5,d1_3]");
+
+  // 4x4 input, 2x2 filter, 2x2 stride, [3, 2, 1, 0] padding
+  set_op({{1, 2, 2, 1}}, "EXPLICIT", "NHWC", "HWIO", {0, 0, 3, 2, 1, 0, 0, 0});
+  INFER_OK(op, "[1,4,4,2];[2,2,2,3]", "[d0_0,4,2,d1_3]");
+
+  // 2x2 input, 2x1 filter, 1x2 stride, [1, 1, 2, 2] padding
+  set_op({{1, 1, 2, 1}}, "EXPLICIT", "NHWC", "HWIO", {0, 0, 1, 1, 2, 2, 0, 0});
+  INFER_OK(op, "[1,2,2,1];[2,1,1,1]", "[d0_0,3,3,d1_3]");
+
+  // Unknown dims in the critical fields lead to partial inference.
+  INFER_OK(op, "[1,4,4,1];[2,1,1,1]", "[d0_0,5,4,d1_3]");
+  INFER_OK(op, "[1,?,4,1];[2,1,1,1]", "[d0_0,?,4,d1_3]");
+  INFER_OK(op, "[1,4,?,1];[2,1,1,1]", "[d0_0,5,?,d1_3]");
+  INFER_OK(op, "[1,4,4,?];[2,1,1,1]", "[d0_0,5,4,d1_3]");
+  INFER_OK(op, "[1,4,4,1];[?,1,1,1]", "[d0_0,?,4,d1_3]");
+  INFER_OK(op, "[1,4,4,1];[2,?,1,1]", "[d0_0,5,?,d1_3]");
+
+  // Explicit padding errors
+  // Negative padding
+  set_op({{1, 1, 1, 1}}, "EXPLICIT", "NHWC", "HWIO", {0, 0, 0, -1, 0, 0, 0, 0});
+  INFER_ERROR("must be nonnegative", op, "[1,2,2,1];[1,1,1,1]");
+
+  // Too little padding (7 explicit paddings instead of 8)
+  set_op({{1, 1, 1, 1}}, "EXPLICIT", "NHWC", "HWIO", {0, 0, 0, 0, 0, 0, 0});
+  INFER_ERROR("must contain 8 values", op, "[1,2,2,1];[1,1,1,1]");
+
+  // Too much padding (9 explicit paddings instead of 8)
+  set_op({{1, 1, 1, 1}}, "EXPLICIT", "NHWC", "HWIO",
+         {0, 0, 0, 0, 0, 0, 0, 0, 0});
+  INFER_ERROR("must contain 8 values", op, "[1,2,2,1];[1,1,1,1]");
+
+  // Padding in batch dimension
+  set_op({{1, 1, 1, 1}}, "EXPLICIT", "NHWC", "HWIO", {1, 0, 0, 0, 0, 0, 0, 0});
+  INFER_ERROR("batch or depth dimensions", op, "[1,2,2,1];[1,1,1,1]");
+
+  // Padding in depth dimension
+  set_op({{1, 1, 1, 1}}, "EXPLICIT", "NHWC", "HWIO", {0, 0, 0, 0, 0, 0, 1, 0});
+  INFER_ERROR("batch or depth dimensions", op, "[1,2,2,1];[1,1,1,1]");
+
+  // Padding explicit_paddings when padding is not EXPLICIT
+  set_op({{1, 1, 1, 1}}, "VALID", "NHWC", "HWIO", {0, 0, 0, 0, 0, 0, 0, 0});
+  INFER_ERROR("must be empty", op, "[1,2,2,1];[1,1,1,1]");
 }
 
 TEST(CommonShapeFnsTest, Conv2DDilatedShapeTest) {
   ShapeInferenceTestOp op("Conv2D");
   auto set_op = [&op](const std::vector<int32>& dilations,
                       const std::vector<int32>& strides, const string& padding,
-                      const string& data_format) {
+                      const string& data_format,
+                      const std::vector<int32>& explicit_paddings = {}) {
     TF_CHECK_OK(NodeDefBuilder("test", "Conv2D")
                     .Input("input", 0, DT_FLOAT)
                     .Input("filter", 0, DT_FLOAT)
                     .Attr("dilations", dilations)
                     .Attr("strides", strides)
                     .Attr("padding", padding)
+                    .Attr("explicit_paddings", explicit_paddings)
                     .Attr("data_format", data_format)
                     .Finalize(&op.node_def));
   };
@@ -628,6 +684,28 @@ TEST(CommonShapeFnsTest, Conv2DDilatedShapeTest) {
   // 4x4 input, 2x2 filter, 2x2 dilations, 1x1 stride
   set_op({{1, 2, 2, 1}}, {{1, 1, 1, 1}}, "SAME", "NHWC");
   INFER_OK(op, "[1,4,4,1];[2,2,1,1]", "[d0_0,d0_1,d0_2,d1_3]");
+
+  // Some tests for "EXPLICIT" padding
+
+  // 4x4 input, 1x1 filter, 2x1 dilations, 1x1 stride, [0, 2, 1, 4] padding
+  set_op({{1, 2, 1, 1}}, {{1, 1, 1, 1}}, "EXPLICIT", "NHWC",
+         {0, 0, 0, 2, 1, 4, 0, 0});
+  INFER_OK(op, "[1,4,4,1];[1,1,1,1]", "[d0_0,6,9,d1_3]");
+
+  // 3x3 input, 2x2 filter, 2x2 dilations, 1x1 stride, [1, 0, 1, 2] padding
+  set_op({{1, 2, 2, 1}}, {{1, 1, 1, 1}}, "EXPLICIT", "NHWC",
+         {0, 0, 1, 0, 1, 2, 0, 0});
+  INFER_OK(op, "[1,3,3,1];[2,2,1,1]", "[d0_0,2,4,d1_3]");
+
+  // 4x4 input, 2x2 filter, 1x2 dilations, 2x2 stride, [3, 2, 1, 0] padding
+  set_op({{1, 1, 2, 1}}, {{1, 2, 2, 1}}, "EXPLICIT", "NHWC",
+         {0, 0, 3, 2, 1, 0, 0, 0});
+  INFER_OK(op, "[1,4,4,1];[2,2,1,1]", "[d0_0,4,2,d1_3]");
+
+  // 4x4 input, 2x2 filter, 2x2 dilations, 1x1 stride, [1, 1, 2, 2] padding
+  set_op({{1, 2, 2, 1}}, {{1, 1, 1, 1}}, "EXPLICIT", "NHWC",
+         {0, 0, 1, 1, 2, 2, 0, 0});
+  INFER_OK(op, "[1,4,4,1];[2,2,1,1]", "[d0_0,4,6,d1_3]");
 }
 
 TEST(CommonShapeFnsTest, Conv3DShapeTest) {
diff --git a/tensorflow/core/framework/dataset.cc b/tensorflow/core/framework/dataset.cc
index 6e214332710c9f2e854db99ec588424c8df81145..5fed06ed6e7d8f6e4808272c69dd8eb4ec7e1ea5 100644
--- a/tensorflow/core/framework/dataset.cc
+++ b/tensorflow/core/framework/dataset.cc
@@ -349,7 +349,7 @@ Status GetDatasetFromVariantTensor(const Tensor& tensor,
 }
 
 Status StoreDatasetInVariantTensor(DatasetBase* dataset, Tensor* tensor) {
-  if (!(tensor->dtype() == DT_VARIANT ||
+  if (!(tensor->dtype() == DT_VARIANT &&
         TensorShapeUtils::IsScalar(tensor->shape()))) {
     return errors::InvalidArgument(
         "Dataset tensor must be a scalar of dtype DT_VARIANT.");
diff --git a/tensorflow/core/framework/dataset.h b/tensorflow/core/framework/dataset.h
index 7d3776a6ec92b5ab6befbab3162c3d4937c4fe70..cca10fa49e86c062a7d6fa8b25901c7c1fb87d95 100644
--- a/tensorflow/core/framework/dataset.h
+++ b/tensorflow/core/framework/dataset.h
@@ -50,8 +50,6 @@ class GraphDefBuilder;
 class Node;
 
 namespace data {
-// A constant that can be used to enable auto-tuning.
-constexpr int kAutoTune = -1;
 
 constexpr int kInfiniteCardinality = -1;
 constexpr int kUnknownCardinality = -2;
@@ -723,36 +721,36 @@ class DatasetBaseIterator : public IteratorBase {
     return model::MakeUnknownNode(std::move(args));
   }
 
-  // When performance modeling is enabled, this method records the fact that
-  // this iterator has dequeued a element from an internal buffer.
+  // When modeling is enabled, this method records the fact that this iterator
+  // has dequeued an element from an internal buffer.
   void RecordBufferDequeue(IteratorContext* ctx,
                            const std::vector<Tensor>& element) {
-    if (node_) {
+    if (collect_resource_usage(ctx)) {
       node_->add_buffered_bytes(-GetAllocatedBytes(element));
     }
   }
 
-  // When performance modeling is enabled, this method records the fact that
-  // this iterator has enqueued a element in an internal buffer.
+  // When modeling is enabled, this method records the fact that this iterator
+  // has enqueued an element in an internal buffer.
   void RecordBufferEnqueue(IteratorContext* ctx,
                            const std::vector<Tensor>& element) {
-    if (node_) {
+    if (collect_resource_usage(ctx)) {
       node_->add_buffered_bytes(GetAllocatedBytes(element));
     }
   }
 
-  // When performance modeling is enabled, this method records the fact that
-  // this iterator has produced an element.
+  // When modeling is enabled, this method records the fact that this iterator
+  // has produced an element.
   void RecordElement(IteratorContext* ctx) {
     if (node_) {
       node_->record_element();
     }
   }
 
-  // When performance modeling is enabled, this method records the fact that
-  // a thread of this iterator has started work.
+  // When modeling is enabled, this method records the fact that a thread of
+  // this iterator has started work.
   void RecordStart(IteratorContext* ctx, bool stop_output = false) {
-    if (node_) {
+    if (collect_resource_usage(ctx)) {
       int64 now_nanos = Env::Default()->NowNanos();
       if (stop_output && node_->output()) {
         node_->output()->record_stop(now_nanos);
@@ -761,10 +759,10 @@ class DatasetBaseIterator : public IteratorBase {
     }
   }
 
-  // When performance modeling is enabled, this method records the fact that
-  // a thread of this iterator has stopped work.
+  // When modeling is enabled, this method records the fact that a thread of
+  // this iterator has stopped work.
   void RecordStop(IteratorContext* ctx, bool start_output = false) {
-    if (node_) {
+    if (collect_resource_usage(ctx)) {
       int64 now_nanos = Env::Default()->NowNanos();
       node_->record_stop(now_nanos);
       if (start_output && node_->output()) {
@@ -774,6 +772,11 @@ class DatasetBaseIterator : public IteratorBase {
   }
 
  private:
+  inline bool collect_resource_usage(IteratorContext* ctx) {
+    auto model = ctx->model();
+    return model && model->collect_resource_usage() && node_;
+  }
+
   BaseParams params_;
 };
 
diff --git a/tensorflow/core/framework/device_base.h b/tensorflow/core/framework/device_base.h
index 446c31b17f2904da3143438304d6407bd65c450c..89ba662b69b060b1b76a0a22630acd4ecb80bed6 100644
--- a/tensorflow/core/framework/device_base.h
+++ b/tensorflow/core/framework/device_base.h
@@ -82,6 +82,13 @@ class DeviceContext : public core::RefCounted {
     done(errors::Internal("Unrecognized device type in CPU-to-device Copy"));
   }
 
+  // Copies a tensor in this device.
+  virtual void CopyTensorInSameDevice(const Tensor* input_tensor,
+                                      Device* device, Tensor* output_tensor,
+                                      StatusCallback done) const {
+    done(errors::Unimplemented("Copy in same device not implemented."));
+  }
+
   // "device_tensor" is a tensor on a non-CPU device.  Copies
   // device_tensor into "cpu_tensor".  "cpu_tensor" must be allocated
   // to be of the same size as "device_tensor".
@@ -239,6 +246,15 @@ class DeviceBase {
     return errors::Internal("Device does not implement MakeTensorFromProto()");
   }
 
+  // Some devices (i.e. GPUs) may free device memory prior to its actual use
+  // being completed on the assumption that subsequent allocations can only be
+  // used serially with respect to pending uses.  If this function returns a
+  // non-zero value it is the value of a device-specific counter such that any
+  // device memory tagged with an earlier freed-at count is really unencumbered
+  // by pending uses.  For this to be useful the device memory allocator must
+  // be tagging deallocated memory chunks using the same counter.
+  virtual uint64 SafeAllocFrontier() { return 0; }
+
  protected:
   // Does not take ownership.
   void set_tensorflow_device_thread_pool(thread::ThreadPool* thread_pool) {
diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc
index b69a40f3128905960cc054ddea7cc20b5d4583a3..35d04eb7278f35ff4118c082adf5f7a320cf923c 100644
--- a/tensorflow/core/framework/function.cc
+++ b/tensorflow/core/framework/function.cc
@@ -569,6 +569,9 @@ string Print(const FunctionDef& fdef) {
   for (const auto& n : fdef.node_def()) {
     strings::StrAppend(&out, "  ", Print(n), "\n");
   }
+  for (const auto& cr : fdef.control_ret()) {
+    strings::StrAppend(&out, "  @return ", cr.first, " = ", cr.second, "\n");
+  }
   for (const auto& r : fdef.ret()) {
     strings::StrAppend(&out, "  return ", r.first, " = ", r.second, "\n");
   }
@@ -673,6 +676,7 @@ Status AddDefaultAttrs(const string& op,
 
 }  // end namespace
 
+// TODO(shikharagarwal): Transmit original node names correctly in file.
 Status InstantiateFunction(const FunctionDef& fdef, AttrSlice attr_values,
                            GetFunctionSignature get_function,
                            InstantiationResult* result) {
@@ -681,8 +685,9 @@ Status InstantiateFunction(const FunctionDef& fdef, AttrSlice attr_values,
   const OpDef& sig = fdef.signature();
   TF_RETURN_IF_ERROR(ValidateSignatureWithAttrs(sig, attr_values));
 
-  bool ints_on_device = fdef.attr().count("experimental_ints_on_device") != 0 &&
-                        fdef.attr().at("experimental_ints_on_device").b();
+  bool ints_on_device =
+      fdef.attr().count(FunctionLibraryDefinition::kIntsOnDeviceAttr) != 0 &&
+      fdef.attr().at(FunctionLibraryDefinition::kIntsOnDeviceAttr).b();
 
   FunctionInstantiationHelper helper(get_function, result);
   Status s;
@@ -823,6 +828,12 @@ bool FunctionDefsEqual(const FunctionDef& f1, const FunctionDef& f2) {
   std::map<string, string> ret2(f2.ret().begin(), f2.ret().end());
   if (ret1 != ret2) return false;
 
+  std::map<string, string> control_ret1(f1.control_ret().begin(),
+                                        f1.control_ret().end());
+  std::map<string, string> control_ret2(f2.control_ret().begin(),
+                                        f2.control_ret().end());
+  if (control_ret1 != control_ret2) return false;
+
   return true;
 }
 
@@ -847,6 +858,14 @@ uint64 FunctionDefHash(const FunctionDef& fdef) {
     h = Hash64(p.second.data(), p.second.size(), h);
   }
 
+  // control output names
+  std::map<string, string> control_ret(fdef.control_ret().begin(),
+                                       fdef.control_ret().end());
+  for (const auto& p : control_ret) {
+    h = Hash64(p.first.data(), p.first.size(), h);
+    h = Hash64(p.second.data(), p.second.size(), h);
+  }
+
   return h;
 }
 
@@ -867,7 +886,8 @@ string FunctionLibraryRuntime::ExecutorType(const InstantiateOptions& options,
 string Canonicalize(const string& funcname, AttrSlice attrs,
                     const FunctionLibraryRuntime::InstantiateOptions& options) {
   std::vector<string> entries;
-  entries.reserve(options.target.empty() ? attrs.size() : (attrs.size() + 1));
+  entries.reserve(attrs.size() + static_cast<int>(options.target.empty()) +
+                  options.input_devices.size());
   for (auto p : attrs) {
     if (p.first != kExecutorAttr) {
       entries.push_back(strings::StrCat(p.first, "=", Print(p.second)));
@@ -877,6 +897,14 @@ string Canonicalize(const string& funcname, AttrSlice attrs,
     entries.push_back(
         strings::StrCat("_target", "=", str_util::CEscape(options.target)));
   }
+  for (int i = 0; i < options.input_devices.size(); ++i) {
+    entries.push_back(strings::StrCat(
+        "_input_dev", i, "=", str_util::CEscape(options.input_devices[i])));
+  }
+  for (int i = 0; i < options.output_devices.size(); ++i) {
+    entries.push_back(strings::StrCat(
+        "_output_dev", i, "=", str_util::CEscape(options.output_devices[i])));
+  }
   if (options.overlay_lib) {
     entries.push_back(strings::StrCat(
         "_overlay_lib", "=", reinterpret_cast<uintptr_t>(options.overlay_lib)));
@@ -1328,7 +1356,7 @@ GET_ATTR(bool)
 
 namespace {
 
-constexpr char kExperimentalApiImplements[] = "experimental_api_implements";
+constexpr char kApiImplements[] = "api_implements";
 
 absl::flat_hash_set<string> ReachableFunctions(
     const FunctionLibraryDefinition& flib,
@@ -1336,10 +1364,10 @@ absl::flat_hash_set<string> ReachableFunctions(
   // Functions that are reachable from the graph.
   absl::flat_hash_set<string> reachable_funcs;
 
-  // For any functions, if it has attribute "experimental_api_implements" =
+  // For any functions, if it has attribute "api_implements" =
   // "some_interface" and it is reachable, then it means any other
   // function with same attribute name and value could also be potentially
-  // reachable, eg via experimental_implementation_selector swapping the
+  // reachable, eg via implementation_selector swapping the
   // nodedef.
   absl::flat_hash_set<string> reachable_api_interface;
 
@@ -1389,7 +1417,7 @@ absl::flat_hash_set<string> ReachableFunctions(
     const string& func_name = func->signature().name();
     reachable_funcs.insert(func_name);
 
-    const auto attr_it = func->attr().find(kExperimentalApiImplements);
+    const auto attr_it = func->attr().find(kApiImplements);
     if (attr_it != func->attr().end()) {
       reachable_api_interface.insert(attr_it->second.s());
     }
@@ -1405,7 +1433,7 @@ absl::flat_hash_set<string> ReachableFunctions(
 
   for (const auto& func_name : flib.ListFunctionNames()) {
     const auto& func_def = flib.Find(func_name);
-    const auto attr_it = func_def->attr().find(kExperimentalApiImplements);
+    const auto attr_it = func_def->attr().find(kApiImplements);
     if (attr_it != func_def->attr().end()) {
       if (reachable_api_interface.contains(attr_it->second.s())) {
         reachable_funcs.insert(func_name);
@@ -1490,6 +1518,9 @@ NodeDef FunctionDefHelper::Node::ToNodeDef() const {
   for (const string& d : this->dep) {
     n.add_input(strings::StrCat("^", d));
   }
+  if (!this->device.empty()) {
+    n.set_device(this->device);
+  }
   return n;
 }
 
@@ -1498,7 +1529,8 @@ FunctionDef FunctionDefHelper::Create(
     const string& function_name, gtl::ArraySlice<string> in_def,
     gtl::ArraySlice<string> out_def, gtl::ArraySlice<string> attr_def,
     gtl::ArraySlice<Node> node_def,
-    gtl::ArraySlice<std::pair<string, string>> ret_def) {
+    gtl::ArraySlice<std::pair<string, string>> ret_def,
+    gtl::ArraySlice<std::pair<string, string>> control_ret_def) {
   FunctionDef fdef;
 
   // Signature
@@ -1506,6 +1538,7 @@ FunctionDef FunctionDefHelper::Create(
   for (const auto& i : in_def) b.Input(i);
   for (const auto& o : out_def) b.Output(o);
   for (const auto& a : attr_def) b.Attr(a);
+  for (const auto& c : control_ret_def) b.ControlOutput(c.first);
 
   OpRegistrationData op_reg_data;
   TF_CHECK_OK(b.Finalize(&op_reg_data));
@@ -1521,6 +1554,11 @@ FunctionDef FunctionDefHelper::Create(
     fdef.mutable_ret()->insert({r.first, r.second});
   }
 
+  // Control returns
+  for (const auto& cr : control_ret_def) {
+    fdef.mutable_control_ret()->insert({cr.first, cr.second});
+  }
+
   auto* op_def_registry = OpRegistry::Global();
   // Check if any op is stateful.
   for (const auto& n : node_def) {
@@ -1532,9 +1570,20 @@ FunctionDef FunctionDefHelper::Create(
       fdef.mutable_signature()->set_is_stateful(true);
     }
   }
+
   return fdef;
 }
 
+/* static */
+FunctionDef FunctionDefHelper::Create(
+    const string& function_name, gtl::ArraySlice<string> in_def,
+    gtl::ArraySlice<string> out_def, gtl::ArraySlice<string> attr_def,
+    gtl::ArraySlice<Node> node_def,
+    gtl::ArraySlice<std::pair<string, string>> ret_def) {
+  return Create(function_name, in_def, out_def, attr_def, node_def, ret_def,
+                /*control_ret_def=*/{});
+}
+
 /* static */
 FunctionDef FunctionDefHelper::Define(const string& name,
                                       gtl::ArraySlice<string> arg_def,
@@ -1639,4 +1688,4 @@ Status GetOpGradientCreator(const string& op, Creator* creator) {
 
 }  // end namespace gradient
 
-}  // end namespace tensorflow
+}  // namespace tensorflow
diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h
index 9cf4b0f4cdf1d4c3604eebcf33bb51274578d73c..4f0d595ed3bca0c9c4430512e8b42203f2b56de9 100644
--- a/tensorflow/core/framework/function.h
+++ b/tensorflow/core/framework/function.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/selective_registration.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
@@ -35,6 +36,8 @@ namespace tensorflow {
 
 class CancellationManager;
 class CollectiveExecutor;
+class DeviceSet;
+class Graph;
 class GraphDef;
 class OpKernel;
 class ProcessFunctionLibraryRuntime;
@@ -114,13 +117,28 @@ class FunctionDefHelper {
     std::vector<string> arg;
     std::vector<std::pair<string, AttrValueWrapper>> attr;
     std::vector<string> dep;
+    string device;
 
     NodeDef ToNodeDef() const;
   };
 
-  // The Create() function uses the new NodeDef field.  `ret_def`
-  // holds a mapping from the function output names from `out_def` to
-  // the node outputs from `node_def`.
+  // Creates a FunctionDef from the given parameters. Node inputs must use
+  // function encoding (node_name:output_name[:output_index]).
+  // - `ret_def` holds a mapping from the function output names from `out_def`
+  //   to the node outputs from `node_def`.
+  // - `control_ret_def` holds a mapping from the function control
+  //   output names to the nodes from `node_def`.
+  static FunctionDef Create(
+      const string& function_name, gtl::ArraySlice<string> in_def,
+      gtl::ArraySlice<string> out_def, gtl::ArraySlice<string> attr_def,
+      gtl::ArraySlice<Node> node_def,
+      gtl::ArraySlice<std::pair<string, string>> ret_def,
+      gtl::ArraySlice<std::pair<string, string>> control_ret_def);
+
+  // Creates a FunctionDef from the given parameters. Node inputs must use
+  // function encoding (node_name:output_name[:output_index]).
+  // - `ret_def` holds a mapping from the function output names from `out_def`
+  //   to the node outputs from `node_def`.
   static FunctionDef Create(const string& function_name,
                             gtl::ArraySlice<string> in_def,
                             gtl::ArraySlice<string> out_def,
@@ -128,7 +146,6 @@ class FunctionDefHelper {
                             gtl::ArraySlice<Node> node_def,
                             gtl::ArraySlice<std::pair<string, string>> ret_def);
 
-  // The two Define() functions use the old FunctionDef::Node field.
   // TODO(josh11b): Get rid of these and transition to the one above.
   static FunctionDef Define(const string& function_name,
                             gtl::ArraySlice<string> arg_def,
@@ -382,6 +399,8 @@ class FunctionLibraryDefinition : public OpRegistryInterface {
   static constexpr const char* const kDeviceArgOp = "_DeviceArg";
   static constexpr const char* const kRetOp = "_Retval";
   static constexpr const char* const kDeviceRetOp = "_DeviceRetval";
+  static constexpr const char* const kIntsOnDeviceAttr =
+      "experimental_ints_on_device";
 
   static constexpr const char* const kGradientOp = "SymbolicGradient";
   static constexpr const char* const kFuncAttr = "f";
@@ -489,6 +508,27 @@ class FunctionLibraryRuntime {
     // instantiated on the local device.
     string target;
 
+    // Should the function be instantiated as a multi-device function?
+    bool is_multi_device_function = false;
+
+    // For multi-device functions, a vector of canonical device names for
+    // function's inputs. The device of resource inputs must be the device
+    // backing the resource, not the CPU device backing the resource handle.
+    // Must have the same length as number of inputs to the function.
+    std::vector<string> input_devices;
+
+    // For multi-device functions, a vector of canonical device names for
+    // function's outputs. The device of resource outputs should be the CPU
+    // device, not the device backing the resource.
+    // If specified, must have the same length as the number of function
+    // outputs.
+    // If not specified, output devices are picked automatically. If operations
+    // producing the output tensors have explicit device specification, they
+    // will be respected. These device specifications must identify a unique
+    // device, i.e.  a general specification like "job:foo" matching multiple
+    // devices will result in an error.
+    std::vector<string> output_devices;
+
     // This interface is EXPERIMENTAL and subject to change.
     //
     // If non-null, the runtime will use `overlay_lib` to resolve
@@ -523,6 +563,18 @@ class FunctionLibraryRuntime {
     // instantiation time, rather than on the first run. This can be used to
     // surface errors earlier.
     bool create_kernels_eagerly = false;
+
+    // If provided, this optimization function will be invoked before
+    // the placer for multi-device functions.
+    std::function<Status(std::vector<string> /*ret_node_names*/,
+                         std::vector<string> /*keep_node_names*/,
+                         FunctionLibraryDefinition*, const DeviceSet&,
+                         Device* /*cpu_device*/, std::unique_ptr<Graph>*)>
+        optimize_graph_fn;
+
+    // If set, partitioned functions will be added to `graph_collector`.
+    // `graph_collector` must be alive during the call to Instantiate.
+    GraphCollector* graph_collector = nullptr;
   };
   typedef uint64 Handle;
   virtual Status Instantiate(const string& function_name, AttrSlice attrs,
diff --git a/tensorflow/core/framework/function.proto b/tensorflow/core/framework/function.proto
index e69d3938d93d109a7cb0c940e8f981d30f464599..64f406bfd73c847e64d58553143aa91b2dc5f424 100644
--- a/tensorflow/core/framework/function.proto
+++ b/tensorflow/core/framework/function.proto
@@ -76,6 +76,10 @@ message FunctionDef {
   // A mapping from the output arg names from `signature` to the
   // outputs from `node_def` that should be returned by the function.
   map<string, string> ret = 4;
+
+  // A mapping from control output names from `signature` to node names in
+  // `node_def` which should be control outputs of this function.
+  map<string, string> control_ret = 6;
 }
 
 // GradientDef defines the gradient function of a function defined in
diff --git a/tensorflow/core/framework/function_test.cc b/tensorflow/core/framework/function_test.cc
index 75d45fa2c84ebc340dfb79b76f7b406d7a099c1f..6fbbabfc95d13d7574f578ddd05f9887435aa0d1 100644
--- a/tensorflow/core/framework/function_test.cc
+++ b/tensorflow/core/framework/function_test.cc
@@ -156,6 +156,48 @@ ControlDep(x:int32) -> (y:int32) {
   EXPECT_EQ(DebugString(result.nodes), e2);
 }
 
+TEST(TFunc, ControlRet) {
+  auto fdef = FDH::Create(
+      // Name
+      "ControlRet",
+      // Inputs
+      {"x: int32"},
+      // Outputs
+      {"y: int32"},
+      // Attrs
+      {},
+      // Nodes
+      {
+          {{"a"}, "Identity", {"x"}, {{"T", DT_INT32}}},
+      },
+      // Returns
+      {{"y", "a:output:0"}},
+      // Control returns
+      {{"must_execute", "a"}});
+
+  const char* e = R"P(
+ControlRet(x:int32) -> (y:int32) {
+  a = Identity[T=int32](x)
+  @return must_execute = a
+  return y = a:output:0
+}
+)P";
+  EXPECT_EQ(DebugString(fdef), e);
+
+  // Instantiate one with T=float
+  InstantiationResult result;
+  TF_ASSERT_OK(
+      InstantiateFunction(fdef, Attrs({{"T", DT_FLOAT}}), GetOpSig, &result));
+  const char* e2 = R"P(
+(x:int32) -> (a:int32) {
+  a = Identity[T=int32](x)
+}
+)P";
+  EXPECT_EQ(result.arg_types, DataTypeVector({DT_INT32}));
+  EXPECT_EQ(result.ret_types, DataTypeVector({DT_INT32}));
+  EXPECT_EQ(DebugString(result.nodes), e2);
+}
+
 REGISTER_OP("HasDefaultType")
     .Output("out: T")
     .Attr("T: {float, double, int32, int64} = DT_FLOAT");
@@ -505,7 +547,8 @@ TEST(TFunc, IntsOnDeviceArgNotSet) {
 
 TEST(TFunc, IntsOnDeviceArgSet) {
   auto fdef = test::function::XTimesTwoInt32();
-  (*fdef.mutable_attr())["experimental_ints_on_device"].set_b(true);
+  (*fdef.mutable_attr())[FunctionLibraryDefinition::kIntsOnDeviceAttr].set_b(
+      true);
   InstantiationResult result;
   TF_ASSERT_OK(InstantiateFunction(fdef, AttrSlice(), GetOpSig, &result));
   EXPECT_EQ(5, result.nodes.size());
@@ -1319,7 +1362,7 @@ TEST(FunctionLibraryDefinitionTest, ReachableDefinitions) {
 
     if (!interface_name.empty()) {
       auto* attr = func_def.mutable_attr();
-      (*attr)["experimental_api_implements"].set_s(interface_name);
+      (*attr)["api_implements"].set_s(interface_name);
     }
     return func_def;
   };
diff --git a/tensorflow/core/framework/function_testlib.cc b/tensorflow/core/framework/function_testlib.cc
index 0445c242e95f490a10e9d54f986dd6b281fb6e0a..0bc07d7f91cf63e93b1188b163d00767fa73a3d8 100644
--- a/tensorflow/core/framework/function_testlib.cc
+++ b/tensorflow/core/framework/function_testlib.cc
@@ -135,6 +135,114 @@ FunctionDef XTimesTwo() {
       });
 }
 
+FunctionDef TwoDeviceMult() {
+  const Tensor kTwo = test::AsScalar<int64>(2);
+  const Tensor kThree = test::AsScalar<int64>(3);
+  return FDH::Create(
+      // Name
+      "TwoDeviceMult",
+      // Args
+      {"x: T"},
+      // Return values
+      {"y_cpu: T", "y_gpu: T"},
+      // Attr def
+      {"T: {float, double, int32, int64}"},
+      // Nodes
+      {
+          {{"num_2"}, "Const", {}, {{"value", kTwo}, {"dtype", DT_INT64}}},
+          {{"num_3"}, "Const", {}, {{"value", kThree}, {"dtype", DT_INT64}}},
+          {{"factor_2"},
+           "Cast",
+           {"num_2:output:0"},
+           {{"SrcT", DT_INT64}, {"DstT", "$T"}}},
+          {{"factor_3"},
+           "Cast",
+           {"num_3:output:0"},
+           {{"SrcT", DT_INT64}, {"DstT", "$T"}}},
+          {{"y_cpu"},
+           "Mul",
+           {"x", "factor_2:y:0"},
+           {{"T", "$T"}},
+           {},
+           "/device:CPU:0"},
+          {{"y_gpu"},
+           "Mul",
+           {"x", "factor_3:y:0"},
+           {{"T", "$T"}},
+           {},
+           "/device:GPU:0"},
+      },
+      {{"y_cpu", "y_cpu:z:0"}, {"y_gpu", "y_gpu:z:0"}});
+}
+
+FunctionDef TwoDeviceInputOutput() {
+  const Tensor kTwo = test::AsScalar<float>(2);
+  const Tensor kThree = test::AsScalar<float>(3);
+  return FDH::Create(
+      // Name
+      "TwoDeviceInputOutput",
+      // Args
+      {"x1: T", "x2: T"},
+      // Return values
+      {"y_cpu: T", "y_gpu: T"},
+      // Attr def
+      {"T: {float}"},
+      // Nodes
+      {
+          {{"num_2"}, "Const", {}, {{"value", kTwo}, {"dtype", DT_FLOAT}}},
+          {{"num_3"}, "Const", {}, {{"value", kThree}, {"dtype", DT_FLOAT}}},
+          {{"y_cpu"},
+           "Mul",
+           {"x1", "num_2:output:0"},
+           {{"T", "$T"}},
+           {},
+           "/device:CPU:0"},
+          {{"y_gpu"},
+           "Mul",
+           {"x2", "num_3:output:0"},
+           {{"T", "$T"}},
+           {},
+           "/device:GPU:0"},
+      },
+      {{"y_cpu", "y_cpu:z:0"}, {"y_gpu", "y_gpu:z:0"}});
+}
+
+FunctionDef FuncWithListInput() {
+  const Tensor kTwo = test::AsScalar<float>(2);
+  return FDH::Create(
+      // Name
+      "FuncWithListInput",
+      // Args
+      {"x1: N * T"},
+      // Return values
+      {},
+      // Attr def
+      {"T: {float}", "N: int >= 1"},
+      // Nodes
+      {
+          {{"num_2"}, "Const", {}, {{"value", kTwo}, {"dtype", DT_FLOAT}}},
+      },
+      {});
+}
+
+FunctionDef FuncWithListOutput() {
+  const Tensor kTwo = test::AsScalar<float>(2);
+  return FDH::Create(
+      // Name
+      "FuncWithListOutput",
+      // Args
+      {},
+      // Return values
+      {"y: N * T"},
+      // Attr def
+      {"T: {float}", "N: int >= 1"},
+      // Nodes
+      {
+          {{"num_2"}, "Const", {}, {{"value", kTwo}, {"dtype", DT_FLOAT}}},
+      },
+      {{"y", "num_2:output:0"}});
+}
+
 FunctionDef XAddX() {
   return FDH::Define(
       // Name
@@ -243,6 +351,58 @@ FunctionDef Swap() {
        {{"o1"}, "Identity", {"i0"}, {{"T", "$T"}}}});
 }
 
+FunctionDef EmptyBodySwap() {
+  return FDH::Create(
+      // Name
+      "EmptyBodySwap",
+      // Args
+      {"i0: T", "i1: T"},
+      // Return values
+      {"o0: T", "o1: T"},
+      // Attr def
+      {"T: {float, double}"},
+      // Nodes
+      {},
+      // Output mapping
+      {{"o0", "i1"}, {"o1", "i0"}});
+}
+
+FunctionDef ResourceOutput() {
+  const Tensor kTwo = test::AsScalar<float>(2);
+  return FDH::Create(
+      // Name
+      "ResourceOutput",
+      // Args
+      {"x: float", "y: resource"},
+      // Return values
+      {"y_out: resource", "two_x: float"},
+      // Attr def
+      {},
+      // Nodes
+      {
+          {{"two"}, "Const", {}, {{"value", kTwo}, {"dtype", DT_FLOAT}}},
+          {{"mul"}, "Mul", {"x", "two:output:0"}, {{"T", DT_FLOAT}}, {}},
+      },
+      {{"y_out", "y"}, {"two_x", "mul:z:0"}});
+}
+
+FunctionDef ReadResourceVariable() {
+  return FDH::Create(
+      // Name
+      "ReadResourceVariable",
+      // Args
+      {"x: resource"},
+      // Return values
+      {"y: float"},
+      // Attr def
+      {},
+      // Nodes
+      {
+          {{"read"}, "ReadVariableOp", {"x"}, {{"dtype", DT_FLOAT}}, {}},
+      },
+      {{"y", "read:value:0"}});
+}
+
 FunctionDef InvalidControlFlow() {
   return FDH::Create(
       // Name
diff --git a/tensorflow/core/framework/function_testlib.h b/tensorflow/core/framework/function_testlib.h
index a01743423bbfd5c684e82768ee347f1d0734fc04..28532b29d4509105c4b6b7c203e9e81c5780a58f 100644
--- a/tensorflow/core/framework/function_testlib.h
+++ b/tensorflow/core/framework/function_testlib.h
@@ -63,6 +63,21 @@ GraphDef GDef(gtl::ArraySlice<NodeDef> nodes,
 // x:T -> x * 2.
 FunctionDef XTimesTwo();
 
+// x:T -> cpu(x * 2) + cpu(x * 3).
+FunctionDef TwoDeviceTimesFive();
+
+// x:T -> cpu(x * 2), gpu(x * 3).
+FunctionDef TwoDeviceMult();
+
+// cpu(x):T, gpu(y):T -> cpu(x * 2), gpu(y * 3).
+FunctionDef TwoDeviceInputOutput();
+
+// Function taking a list of Tensors as input.
+FunctionDef FuncWithListInput();
+
+// Function returning a list of Tensors as output.
+FunctionDef FuncWithListOutput();
+
 // x:T -> x + x.
 FunctionDef XAddX();
 
@@ -90,6 +105,15 @@ FunctionDef RandomUniform();
 // x:T, y:T -> y:T, x:T
 FunctionDef Swap();
 
+// x:T, y:T -> y:T, x:T, the body has no nodes.
+FunctionDef EmptyBodySwap();
+
+// x:float, y:resource -> y:resource, 2*x:float.
+FunctionDef ResourceOutput();
+
+// x:resource -> y:float.
+FunctionDef ReadResourceVariable();
+
 // Contains malformed control flow which can't be run by the executor.
 FunctionDef InvalidControlFlow();
 
diff --git a/tensorflow/core/framework/graph_to_functiondef.cc b/tensorflow/core/framework/graph_to_functiondef.cc
index b2bc414c496338c382b5f3f194fcb778c08706fa..44b22f93c1d4908e3c7765c2b8bddc74a8a22a37 100644
--- a/tensorflow/core/framework/graph_to_functiondef.cc
+++ b/tensorflow/core/framework/graph_to_functiondef.cc
@@ -165,6 +165,7 @@ Status GraphToFunctionDef(const Graph& graph, const string& name,
       node_def->set_device(node->assigned_device_name());
     }
     node_def->set_name(node_names.Uniquify(node->name()));
+    MergeDebugInfo(NodeDebugInfo(node->def()), node_def);
 
     // Reset input names based on graph rather than the NodeDef.
     node_def->clear_input();
diff --git a/tensorflow/core/framework/graph_to_functiondef_test.cc b/tensorflow/core/framework/graph_to_functiondef_test.cc
index 587e2c07ac046e7476a2da53a9ef4d8b3651410a..c3cc1a743311b71b6604e08c6ebf3ff2d130444b 100644
--- a/tensorflow/core/framework/graph_to_functiondef_test.cc
+++ b/tensorflow/core/framework/graph_to_functiondef_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/ops/function_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
@@ -28,6 +29,14 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
+FunctionDef RemoveDebugInfo(const FunctionDef& def) {
+  FunctionDef copy = def;
+  for (auto& node_def : *copy.mutable_node_def()) {
+    node_def.clear_experimental_debug_info();
+  }
+  return copy;
+}
+
 bool EqualFunctionDef(const FunctionDef& a, const FunctionDef& b,
                       string* diff) {
   // TODO(phawkins) use a more sophisticated equality test.
@@ -78,7 +87,8 @@ TEST(GraphToFunctionDefTest, Basics) {
       {{"h_0", "G:sum:0"}});  // return values
 
   string diff;
-  bool fdefs_equal = EqualFunctionDef(fdef_expected, fdef, &diff);
+  bool fdefs_equal =
+      EqualFunctionDef(fdef_expected, RemoveDebugInfo(fdef), &diff);
   EXPECT_TRUE(fdefs_equal) << diff;
 }
 
@@ -111,7 +121,8 @@ TEST(GraphToFunctionDefTest, ControlDependencies) {
       {{"c", "b:y:0"}});  // return values
 
   string diff;
-  bool fdefs_equal = EqualFunctionDef(fdef_expected, fdef, &diff);
+  bool fdefs_equal =
+      EqualFunctionDef(fdef_expected, RemoveDebugInfo(fdef), &diff);
   EXPECT_TRUE(fdefs_equal) << diff;
 }
 
diff --git a/tensorflow/core/framework/iterator.proto b/tensorflow/core/framework/iterator.proto
deleted file mode 100644
index f015342e13313ea69838030ae4c1ccda6c1628f7..0000000000000000000000000000000000000000
--- a/tensorflow/core/framework/iterator.proto
+++ /dev/null
@@ -1,18 +0,0 @@
-syntax = "proto3";
-
-package tensorflow;
-option cc_enable_arenas = true;
-option java_outer_classname = "IteratorProtos";
-option java_multiple_files = true;
-option java_package = "org.tensorflow.util";
-option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framework";
-
-// Protocol buffer representing the metadata for an iterator's state stored
-// as a Variant tensor.
-message IteratorStateMetadata {
-  // A user-specified version string.
-  string version = 1;
-
-  // Keys for tensors in the VariantTensorDataProto.
-  repeated string keys = 2;
-}
diff --git a/tensorflow/core/framework/lookup_interface.h b/tensorflow/core/framework/lookup_interface.h
index d33945fd1b0c44264855ed518714eb35faf4b29f..7e5dbe5632becb40fd75763eb4be9dfdc09ec82b 100644
--- a/tensorflow/core/framework/lookup_interface.h
+++ b/tensorflow/core/framework/lookup_interface.h
@@ -131,7 +131,7 @@ class LookupInterface : public ResourceBase {
   // - the default_value tensor shape matches the table's value shape.
   Status CheckFindArguments(const Tensor& keys, const Tensor& default_value);
 
-  string DebugString() override {
+  string DebugString() const override {
     return strings::StrCat("A lookup table of size: ", size());
   }
 
diff --git a/tensorflow/core/framework/memory_types.cc b/tensorflow/core/framework/memory_types.cc
index 6dff6fe654a51d3c274f7e2c7ca34961eb4f3c2a..8caea351be4442d348f4405bf4385a1349fc197b 100644
--- a/tensorflow/core/framework/memory_types.cc
+++ b/tensorflow/core/framework/memory_types.cc
@@ -62,7 +62,7 @@ void MemoryTypesHelper(const NameRangeMap& name_map,
 
 bool IsFunctionCallOp(const string& op_type) {
   return op_type == "SymbolicGradient" || op_type == "PartitionedCall" ||
-         op_type == "StatefulPartitionedCall";
+         op_type == "StatefulPartitionedCall" || op_type == "While";
 }
 
 }  // namespace
diff --git a/tensorflow/core/framework/model.cc b/tensorflow/core/framework/model.cc
index 3bd5b725b860ff522dba5be86ef7ab64b387b03e..a1c87a3f4210b7fb95597bed03a4d922a81fbfdf 100644
--- a/tensorflow/core/framework/model.cc
+++ b/tensorflow/core/framework/model.cc
@@ -29,6 +29,32 @@ std::shared_ptr<Parameter> MakeParameter(const string& name,
 
 namespace {
 
+// Given the average time between output events (`output_time`), the average
+// time between input events (`input_time`) and the buffer size, the method
+// computes the expected time an input event will have to wait.
+//
+// The wait time is approximated as the product of the probability the buffer
+// will be empty and the time it takes to produce an element into the buffer.
+//
+// The formula used for computing the probability is derived by modeling the
+// problem as an M/M/1/K queue
+// (https://en.wikipedia.org/wiki/Birth%E2%80%93death_process#M/M/1/K_queue).
+int64 ComputeWaitTime(int64 output_time, int64 input_time, int64 buffer_size) {
+  if (output_time == 0 || input_time == 0) {
+    return output_time;
+  }
+  if (input_time == output_time) {
+    const double p_buffer_empty = 1.0L / static_cast<double>(buffer_size + 1);
+    return p_buffer_empty * output_time;
+  }
+  const double alpha = 1.0L / static_cast<double>(input_time);
+  const double beta = 1.0L / static_cast<double>(output_time);
+  const double p_buffer_empty =
+      (1.0L - beta / alpha) /
+      (1.0L - std::pow((beta / alpha), static_cast<double>(buffer_size + 1)));
+  return p_buffer_empty * output_time;
+}
+
 // The first input of InterleaveMany corresponds to the input dataset whose
 // elements are used to create the (derived) input datasets whose elements are
 // interleaved as output.
@@ -119,8 +145,8 @@ class AsyncInterleaveMany : public Node {
         static_cast<double>(OutputTimeForInputs(input_times) -
                             inputs_.front()->OutputTime(input_times)) /
         static_cast<double>(inputs_.size() - 1) / parallelism;
-    return std::max(0LL,
-                    NanosPerElementLocked() + output_time - old_input_time);
+    return ComputeWaitTime(NanosPerElementLocked() + output_time,
+                           old_input_time, parallelism);
   }
 
   int64 ProcessingTimeLocked() const override SHARED_LOCKS_REQUIRED(mu_) {
@@ -202,7 +228,7 @@ class AsyncKnownRatio : public Node {
     if (ratio_ == 0.0) {
       int64 output_time =
           static_cast<double>(NanosPerElementLocked()) / parallelism;
-      return std::max(0LL, output_time - input_times->back());
+      return ComputeWaitTime(output_time, input_times->back(), parallelism);
     }
     int64 old_input_time = input_times->back();
     int64 new_input_time = static_cast<int64>(
@@ -213,7 +239,7 @@ class AsyncKnownRatio : public Node {
     int64 output_time = static_cast<int64>(
         static_cast<double>(NanosPerElementLocked()) / parallelism +
         ratio_ * OutputTimeForInputs(input_times));
-    return std::max(0LL, output_time - old_input_time);
+    return ComputeWaitTime(output_time, old_input_time, parallelism);
   }
 
   int64 ProcessingTimeLocked() const override SHARED_LOCKS_REQUIRED(mu_) {
@@ -354,8 +380,15 @@ std::shared_ptr<Node> Model::AddNode(Node::Factory factory, const string& name,
     output_ = node;
   }
   if (output) {
+    VLOG(3) << "Adding " << node->name() << "(id:" << node->id()
+            << ") as input for " << output->name() << "(id:" << output->id()
+            << ")";
     output->add_input(node);
+  } else {
+    VLOG(3) << "Adding " << node->name() << "(id:" << node->id() << ")";
   }
+  collect_resource_usage_ =
+      collect_resource_usage_ || node->has_tunable_parameters();
   lookup_table_.insert(std::make_pair(name, node));
   return node;
 }
@@ -441,7 +474,7 @@ void Model::RecordElement(const string& name) {
 void Model::RecordStart(const string& name, bool stop_output) {
   tf_shared_lock l(mu_);
   auto node = gtl::FindOrNull(lookup_table_, name);
-  if (node) {
+  if (collect_resource_usage_ && node) {
     int64 now_nanos = Env::Default()->NowNanos();
     if (stop_output && (*node)->output()) {
       (*node)->output()->record_stop(now_nanos);
@@ -453,7 +486,7 @@ void Model::RecordStart(const string& name, bool stop_output) {
 void Model::RecordStop(const string& name, bool start_output) {
   tf_shared_lock l(mu_);
   auto node = gtl::FindOrNull(lookup_table_, name);
-  if (node) {
+  if (collect_resource_usage_ && node) {
     int64 now_nanos = Env::Default()->NowNanos();
     (*node)->record_stop(now_nanos);
     if (start_output && (*node)->output()) {
@@ -465,8 +498,12 @@ void Model::RecordStop(const string& name, bool start_output) {
 void Model::RemoveNode(const string& name) {
   mutex_lock l(mu_);
   auto node = gtl::FindOrNull(lookup_table_, name);
-  if (node && (*node)->output()) {
-    (*node)->output()->remove_input(*node);
+  if (node) {
+    if ((*node)->output()) {
+      (*node)->output()->remove_input(*node);
+    }
+    VLOG(3) << "Removing " << (*node)->name() << "(id:" << (*node)->id() << ")";
+    remove_node_hook_(*node);
   }
   lookup_table_.erase(name);
 }
diff --git a/tensorflow/core/framework/model.h b/tensorflow/core/framework/model.h
index 10059bbfd5a89a3b24ce3daf981408564a5351b2..7fac1753a6332e1db4d01c15e68242ac15b388ca 100644
--- a/tensorflow/core/framework/model.h
+++ b/tensorflow/core/framework/model.h
@@ -34,18 +34,24 @@ namespace tensorflow {
 namespace data {
 namespace model {
 
+// A constant that can be used to enable auto-tuning.
+constexpr int kAutoTune = -1;
+
 // Represents thread-safe state that can be shared between an input pipeline and
 // the performance model.
 struct SharedState {
  public:
   SharedState(int64 value, std::shared_ptr<mutex> mu,
               std::shared_ptr<condition_variable> cond_var)
-      : value(value), mu(std::move(mu)), cond_var(std::move(cond_var)) {}
+      : value(value),
+        mu(std::move(mu)),
+        cond_var(std::move(cond_var)),
+        tunable(value == kAutoTune) {}
 
   int64 value;
   std::shared_ptr<mutex> mu;
   std::shared_ptr<condition_variable> cond_var;
-  bool tunable = false;
+  const bool tunable;
 };
 
 // Represents a parameter.
@@ -136,6 +142,15 @@ class Node {
     return buffered_bytes_;
   }
 
+  // Indicates whether the node has tunable parameters.
+  bool has_tunable_parameters() const LOCKS_EXCLUDED(mu_) {
+    tf_shared_lock l(mu_);
+    for (const auto& pair : parameters_) {
+      if (pair.second->state->tunable) return true;
+    }
+    return false;
+  }
+
   // Returns the unique node ID.
   int64 id() const LOCKS_EXCLUDED(mu_) { return id_; }
 
@@ -295,7 +310,7 @@ class Node {
   std::map<string, std::shared_ptr<Parameter>> parameters_ GUARDED_BY(mu_);
   std::list<std::shared_ptr<Node>> inputs_ GUARDED_BY(mu_);
 
-  // The reference to the output node is not owned so that that deletion of a
+  // The reference to the output node is not owned so that deletion of a
   // node results in recursive deletion of the subtree rooted in the node.
   Node* const output_;
 };
@@ -344,7 +359,22 @@ std::shared_ptr<Node> MakeUnknownNode(Node::Args args);
 // implementation of `DatasetBase` and `DatasetBaseIterator` respectively.
 class Model {
  public:
-  Model() = default;
+  using NodeHook = std::function<void(std::shared_ptr<Node>)>;
+
+  // Creates a new model.
+  //
+  // The `remove_node_hook` argument can be used to specify functionality that
+  // should be invoked before a node is removed from the model. The hook can be
+  // used for dependency injection -- to allow the model to invoke functionality
+  // from modules that it could not depend on statically.
+  Model(NodeHook remove_node_hook)
+      : collect_resource_usage_(false),
+        remove_node_hook_(std::move(remove_node_hook)) {
+    DCHECK(remove_node_hook_ != nullptr);
+  }
+
+  // Indicates whether to collect resource usage.
+  bool collect_resource_usage() const { return collect_resource_usage_; }
 
   // Adds a node with the given name and given output.
   std::shared_ptr<Node> AddNode(Node::Factory factory, const string& name,
@@ -388,6 +418,17 @@ class Model {
   int64 id_counter_ GUARDED_BY(mu_) = 1;
   std::shared_ptr<Node> output_ GUARDED_BY(mu_);
   std::map<string, std::shared_ptr<Node>> lookup_table_ GUARDED_BY(mu_);
+
+  // Indicates whether the modeling framework should collect resource usage
+  // (e.g. CPU, memory). The logic for collecting this information assumes that
+  // the collection is not repeatedly disabled and enabled. As a consequence,
+  // the implementation starts collecting resource usage when it encounters a
+  // tunable parameter (because the information is used for for tuning the value
+  // of the parameter) and never stops.
+  std::atomic<bool> collect_resource_usage_;
+
+  // A hook invoked immediately before a node is removed from the model.
+  const NodeHook remove_node_hook_;
 };
 
 }  // namespace model
diff --git a/tensorflow/core/framework/model_test.cc b/tensorflow/core/framework/model_test.cc
index 90bd570f90cdab2182f3d46e009b2cd972667ef9..1d7f407e180d37a61bfb3191dbd04f9bb1ca60d5 100644
--- a/tensorflow/core/framework/model_test.cc
+++ b/tensorflow/core/framework/model_test.cc
@@ -57,33 +57,36 @@ TEST_P(AsyncInterleaveManyTest, Model) {
   });
   std::vector<int64> input_times(1, input_time);
   async_interleave_many->add_processing_time(100);
-  EXPECT_EQ(100, async_interleave_many->processing_time());
-  EXPECT_EQ(0, async_interleave_many->ProcessingTime());
-  EXPECT_EQ(0, async_interleave_many->OutputTime(&input_times));
+  EXPECT_EQ(async_interleave_many->processing_time(), 100);
+  EXPECT_EQ(async_interleave_many->ProcessingTime(), 0);
+  EXPECT_EQ(async_interleave_many->OutputTime(&input_times), 0);
   async_interleave_many->record_element();
-  EXPECT_EQ(1, async_interleave_many->num_elements());
-  EXPECT_EQ(100, async_interleave_many->ProcessingTime());
-  EXPECT_EQ(std::max(0LL, 100 - input_time),
-            async_interleave_many->OutputTime(&input_times));
+  EXPECT_EQ(async_interleave_many->num_elements(), 1);
+  EXPECT_EQ(async_interleave_many->ProcessingTime(), 100);
+  EXPECT_LE(async_interleave_many->OutputTime(&input_times), 100);
+  EXPECT_GE(async_interleave_many->OutputTime(&input_times), 0);
   source1->add_processing_time(200);
   source2->add_processing_time(300);
-  EXPECT_EQ(100, async_interleave_many->ProcessingTime());
-  EXPECT_EQ(std::max(0LL, 100 - input_time),
-            async_interleave_many->OutputTime(&input_times));
+  EXPECT_EQ(async_interleave_many->ProcessingTime(), 100);
+  EXPECT_LE(async_interleave_many->OutputTime(&input_times), 100);
+  EXPECT_GE(async_interleave_many->OutputTime(&input_times), 0);
   source1->record_element();
   source2->record_element();
-  EXPECT_EQ(100 + 250, async_interleave_many->ProcessingTime());
-  EXPECT_EQ(std::max(0LL, 100 + 250 / parallelism - input_time),
-            async_interleave_many->OutputTime(&input_times));
+  EXPECT_EQ(async_interleave_many->ProcessingTime(), 100 + 250);
+  EXPECT_LE(async_interleave_many->OutputTime(&input_times),
+            100 + 250 / parallelism);
+  EXPECT_GE(async_interleave_many->OutputTime(&input_times), 0);
   async_interleave_many->record_element();
-  EXPECT_EQ(50 + 250, async_interleave_many->ProcessingTime());
-  EXPECT_EQ(std::max(0LL, 50 + 250 / parallelism - input_time),
-            async_interleave_many->OutputTime(&input_times));
+  EXPECT_EQ(async_interleave_many->ProcessingTime(), 50 + 250);
+  EXPECT_LE(async_interleave_many->OutputTime(&input_times),
+            50 + 250 / parallelism);
+  EXPECT_GE(async_interleave_many->OutputTime(&input_times), 0);
 }
 
-INSTANTIATE_TEST_CASE_P(Test, AsyncInterleaveManyTest,
-                        ::testing::Combine(::testing::Values(1, 2),
-                                           ::testing::Values(0, 50, 100, 200)));
+INSTANTIATE_TEST_SUITE_P(Test, AsyncInterleaveManyTest,
+                         ::testing::Combine(::testing::Values(1, 2),
+                                            ::testing::Values(0, 50, 100,
+                                                              200)));
 
 class AsyncKnownRatioTest
     : public ::testing::TestWithParam<std::tuple<int64, int64, int64>> {};
@@ -106,53 +109,58 @@ TEST_P(AsyncKnownRatioTest, Model) {
   async_known_many->add_input(source2);
   std::vector<int64> input_times(1, input_time);
   source1->add_processing_time(100);
-  EXPECT_EQ(0, async_known_many->ProcessingTime());
-  EXPECT_EQ(0, async_known_many->OutputTime(&input_times));
+  EXPECT_EQ(async_known_many->ProcessingTime(), 0);
+  EXPECT_EQ(async_known_many->OutputTime(&input_times), 0);
   source2->add_processing_time(200);
-  EXPECT_EQ(0, async_known_many->ProcessingTime());
-  EXPECT_EQ(0, async_known_many->OutputTime(&input_times));
+  EXPECT_EQ(async_known_many->ProcessingTime(), 0);
+  EXPECT_EQ(async_known_many->OutputTime(&input_times), 0);
   source1->record_element();
-  EXPECT_EQ(num_inputs_per_output * 100, async_known_many->ProcessingTime());
-  EXPECT_EQ(std::max(0LL, num_inputs_per_output * 100 - input_time),
-            async_known_many->OutputTime(&input_times));
+  EXPECT_EQ(async_known_many->ProcessingTime(), num_inputs_per_output * 100);
+  EXPECT_LE(async_known_many->OutputTime(&input_times),
+            num_inputs_per_output * 100);
+  EXPECT_GE(async_known_many->OutputTime(&input_times), 0);
   source2->record_element();
-  EXPECT_EQ(num_inputs_per_output * (100 + 200),
-            async_known_many->ProcessingTime());
-  EXPECT_EQ(std::max(0LL, num_inputs_per_output * (100 + 200) - input_time),
-            async_known_many->OutputTime(&input_times));
+  EXPECT_EQ(async_known_many->ProcessingTime(),
+            num_inputs_per_output * (100 + 200));
+  EXPECT_LE(async_known_many->OutputTime(&input_times),
+            num_inputs_per_output * (100 + 200));
+  EXPECT_GE(async_known_many->OutputTime(&input_times), 0);
   source1->record_element();
-  EXPECT_EQ(num_inputs_per_output * (50 + 200),
-            async_known_many->ProcessingTime());
-  EXPECT_EQ(std::max(0LL, num_inputs_per_output * (50 + 200) - input_time),
-            async_known_many->OutputTime(&input_times));
+  EXPECT_EQ(async_known_many->ProcessingTime(),
+            num_inputs_per_output * (50 + 200));
+  EXPECT_LE(async_known_many->OutputTime(&input_times),
+            num_inputs_per_output * (50 + 200));
+  EXPECT_GE(async_known_many->OutputTime(&input_times), 0);
   source2->record_element();
-  EXPECT_EQ(num_inputs_per_output * (50 + 100),
-            async_known_many->ProcessingTime());
-  EXPECT_EQ(std::max(0LL, num_inputs_per_output * (50 + 100) - input_time),
-            async_known_many->OutputTime(&input_times));
+  EXPECT_EQ(async_known_many->ProcessingTime(),
+            num_inputs_per_output * (50 + 100));
+  EXPECT_LE(async_known_many->OutputTime(&input_times),
+            num_inputs_per_output * (50 + 100));
+  EXPECT_GE(async_known_many->OutputTime(&input_times), 0);
   async_known_many->add_processing_time(128);
-  EXPECT_EQ(num_inputs_per_output * (50 + 100),
-            async_known_many->ProcessingTime());
-  EXPECT_EQ(std::max(0LL, num_inputs_per_output * (50 + 100) - input_time),
-            async_known_many->OutputTime(&input_times));
+  EXPECT_EQ(async_known_many->ProcessingTime(),
+            num_inputs_per_output * (50 + 100));
+  EXPECT_LE(async_known_many->OutputTime(&input_times),
+            num_inputs_per_output * (50 + 100));
+  EXPECT_GE(async_known_many->OutputTime(&input_times), 0);
   async_known_many->record_element();
-  EXPECT_EQ(num_inputs_per_output * (50 + 100) + 128,
-            async_known_many->ProcessingTime());
-  EXPECT_EQ(std::max(0LL, num_inputs_per_output * (50 + 100) +
-                              128 / parallelism - input_time),
-            async_known_many->OutputTime(&input_times));
+  EXPECT_EQ(async_known_many->ProcessingTime(),
+            num_inputs_per_output * (50 + 100) + 128);
+  EXPECT_LE(async_known_many->OutputTime(&input_times),
+            num_inputs_per_output * (50 + 100) + 128 / parallelism);
+  EXPECT_GE(async_known_many->OutputTime(&input_times), 0);
   async_known_many->record_element();
-  EXPECT_EQ(num_inputs_per_output * (50 + 100) + 64,
-            async_known_many->ProcessingTime());
-  EXPECT_EQ(std::max(0LL, num_inputs_per_output * (50 + 100) +
-                              64 / parallelism - input_time),
-            async_known_many->OutputTime(&input_times));
+  EXPECT_EQ(async_known_many->ProcessingTime(),
+            num_inputs_per_output * (50 + 100) + 64);
+  EXPECT_LE(async_known_many->OutputTime(&input_times),
+            num_inputs_per_output * (50 + 100) + 64 / parallelism);
+  EXPECT_GE(async_known_many->OutputTime(&input_times), 0);
 }
 
-INSTANTIATE_TEST_CASE_P(Test, AsyncKnownRatioTest,
-                        ::testing::Combine(::testing::Values(1, 2, 4, 8),
-                                           ::testing::Values(0, 50, 100, 200),
-                                           ::testing::Values(0, 1, 2, 4)));
+INSTANTIATE_TEST_SUITE_P(Test, AsyncKnownRatioTest,
+                         ::testing::Combine(::testing::Values(1, 2, 4, 8),
+                                            ::testing::Values(0, 50, 100, 200),
+                                            ::testing::Values(0, 1, 2, 4)));
 
 TEST(InterleaveManyTest, Model) {
   std::shared_ptr<Node> interleave_many =
@@ -168,24 +176,24 @@ TEST(InterleaveManyTest, Model) {
   interleave_many->add_input(source2);
   std::vector<int64> input_times(1, 0);
   interleave_many->add_processing_time(100);
-  EXPECT_EQ(100, interleave_many->processing_time());
-  EXPECT_EQ(0, interleave_many->ProcessingTime());
-  EXPECT_EQ(0, interleave_many->OutputTime(&input_times));
+  EXPECT_EQ(interleave_many->processing_time(), 100);
+  EXPECT_EQ(interleave_many->ProcessingTime(), 0);
+  EXPECT_EQ(interleave_many->OutputTime(&input_times), 0);
   interleave_many->record_element();
-  EXPECT_EQ(1, interleave_many->num_elements());
-  EXPECT_EQ(100, interleave_many->ProcessingTime());
-  EXPECT_EQ(100, interleave_many->OutputTime(&input_times));
+  EXPECT_EQ(interleave_many->num_elements(), 1);
+  EXPECT_EQ(interleave_many->ProcessingTime(), 100);
+  EXPECT_EQ(interleave_many->OutputTime(&input_times), 100);
   source1->add_processing_time(200);
   source2->add_processing_time(300);
-  EXPECT_EQ(100, interleave_many->ProcessingTime());
-  EXPECT_EQ(100, interleave_many->OutputTime(&input_times));
+  EXPECT_EQ(interleave_many->ProcessingTime(), 100);
+  EXPECT_EQ(interleave_many->OutputTime(&input_times), 100);
   source1->record_element();
   source2->record_element();
-  EXPECT_EQ(350, interleave_many->ProcessingTime());
-  EXPECT_EQ(350, interleave_many->OutputTime(&input_times));
+  EXPECT_EQ(interleave_many->ProcessingTime(), 350);
+  EXPECT_EQ(interleave_many->OutputTime(&input_times), 350);
   interleave_many->record_element();
-  EXPECT_EQ(300, interleave_many->ProcessingTime());
-  EXPECT_EQ(300, interleave_many->OutputTime(&input_times));
+  EXPECT_EQ(interleave_many->ProcessingTime(), 300);
+  EXPECT_EQ(interleave_many->OutputTime(&input_times), 300);
 }
 
 class KnownRatioTest : public ::testing::TestWithParam<int64> {};
@@ -202,59 +210,59 @@ TEST_P(KnownRatioTest, Model) {
   known_many->add_input(source2);
   std::vector<int64> input_times(1, 0);
   source1->add_processing_time(100);
-  EXPECT_EQ(0, known_many->ProcessingTime());
-  EXPECT_EQ(0, known_many->OutputTime(&input_times));
+  EXPECT_EQ(known_many->ProcessingTime(), 0);
+  EXPECT_EQ(known_many->OutputTime(&input_times), 0);
   source2->add_processing_time(200);
-  EXPECT_EQ(0, known_many->ProcessingTime());
-  EXPECT_EQ(0, known_many->OutputTime(&input_times));
+  EXPECT_EQ(known_many->ProcessingTime(), 0);
+  EXPECT_EQ(known_many->OutputTime(&input_times), 0);
   source1->record_element();
-  EXPECT_EQ(num_inputs_per_output * 100, known_many->ProcessingTime());
-  EXPECT_EQ(num_inputs_per_output * 100, known_many->OutputTime(&input_times));
+  EXPECT_EQ(known_many->ProcessingTime(), num_inputs_per_output * 100);
+  EXPECT_EQ(known_many->OutputTime(&input_times), num_inputs_per_output * 100);
   source2->record_element();
-  EXPECT_EQ(num_inputs_per_output * (100 + 200), known_many->ProcessingTime());
-  EXPECT_EQ(num_inputs_per_output * (100 + 200),
-            known_many->OutputTime(&input_times));
+  EXPECT_EQ(known_many->ProcessingTime(), num_inputs_per_output * (100 + 200));
+  EXPECT_EQ(known_many->OutputTime(&input_times),
+            num_inputs_per_output * (100 + 200));
   source1->record_element();
-  EXPECT_EQ(num_inputs_per_output * (50 + 200), known_many->ProcessingTime());
-  EXPECT_EQ(num_inputs_per_output * (50 + 200),
-            known_many->OutputTime(&input_times));
+  EXPECT_EQ(known_many->ProcessingTime(), num_inputs_per_output * (50 + 200));
+  EXPECT_EQ(known_many->OutputTime(&input_times),
+            num_inputs_per_output * (50 + 200));
   source2->record_element();
-  EXPECT_EQ(num_inputs_per_output * (50 + 100), known_many->ProcessingTime());
-  EXPECT_EQ(num_inputs_per_output * (50 + 100),
-            known_many->OutputTime(&input_times));
+  EXPECT_EQ(known_many->ProcessingTime(), num_inputs_per_output * (50 + 100));
+  EXPECT_EQ(known_many->OutputTime(&input_times),
+            num_inputs_per_output * (50 + 100));
   known_many->add_processing_time(128);
-  EXPECT_EQ(num_inputs_per_output * (50 + 100), known_many->ProcessingTime());
-  EXPECT_EQ(num_inputs_per_output * (50 + 100),
-            known_many->OutputTime(&input_times));
+  EXPECT_EQ(known_many->ProcessingTime(), num_inputs_per_output * (50 + 100));
+  EXPECT_EQ(known_many->OutputTime(&input_times),
+            num_inputs_per_output * (50 + 100));
   known_many->record_element();
-  EXPECT_EQ(num_inputs_per_output * (50 + 100) + 128,
-            known_many->ProcessingTime());
-  EXPECT_EQ(num_inputs_per_output * (50 + 100) + 128,
-            known_many->OutputTime(&input_times));
+  EXPECT_EQ(known_many->ProcessingTime(),
+            num_inputs_per_output * (50 + 100) + 128);
+  EXPECT_EQ(known_many->OutputTime(&input_times),
+            num_inputs_per_output * (50 + 100) + 128);
   known_many->record_element();
-  EXPECT_EQ(num_inputs_per_output * (50 + 100) + 64,
-            known_many->ProcessingTime());
-  EXPECT_EQ(num_inputs_per_output * (50 + 100) + 64,
-            known_many->OutputTime(&input_times));
+  EXPECT_EQ(known_many->ProcessingTime(),
+            num_inputs_per_output * (50 + 100) + 64);
+  EXPECT_EQ(known_many->OutputTime(&input_times),
+            num_inputs_per_output * (50 + 100) + 64);
 }
 
-INSTANTIATE_TEST_CASE_P(Test, KnownRatioTest, ::testing::Values(0, 1, 2, 4));
+INSTANTIATE_TEST_SUITE_P(Test, KnownRatioTest, ::testing::Values(0, 1, 2, 4));
 
 TEST(SourceTest, Model) {
   std::shared_ptr<Node> source = model::MakeSourceNode({0, "source", nullptr});
   std::vector<int64> input_times(1, 0);
   source->add_processing_time(100);
-  EXPECT_EQ(100, source->processing_time());
-  EXPECT_EQ(0, source->ProcessingTime());
-  EXPECT_EQ(0, source->OutputTime(&input_times));
+  EXPECT_EQ(source->processing_time(), 100);
+  EXPECT_EQ(source->ProcessingTime(), 0);
+  EXPECT_EQ(source->OutputTime(&input_times), 0);
   source->record_element();
-  EXPECT_EQ(1, source->num_elements());
-  EXPECT_EQ(100, source->ProcessingTime());
-  EXPECT_EQ(100, source->OutputTime(&input_times));
+  EXPECT_EQ(source->num_elements(), 1);
+  EXPECT_EQ(source->ProcessingTime(), 100);
+  EXPECT_EQ(source->OutputTime(&input_times), 100);
   source->record_element();
-  EXPECT_EQ(2, source->num_elements());
-  EXPECT_EQ(50, source->ProcessingTime());
-  EXPECT_EQ(50, source->OutputTime(&input_times));
+  EXPECT_EQ(source->num_elements(), 2);
+  EXPECT_EQ(source->ProcessingTime(), 50);
+  EXPECT_EQ(source->OutputTime(&input_times), 50);
 }
 
 TEST(UnknownRatioTest, Model) {
@@ -268,24 +276,24 @@ TEST(UnknownRatioTest, Model) {
   unknown_many->add_input(source2);
   std::vector<int64> input_times(1, 0);
   unknown_many->add_processing_time(100);
-  EXPECT_EQ(100, unknown_many->processing_time());
-  EXPECT_EQ(0, unknown_many->ProcessingTime());
-  EXPECT_EQ(0, unknown_many->OutputTime(&input_times));
+  EXPECT_EQ(unknown_many->processing_time(), 100);
+  EXPECT_EQ(unknown_many->ProcessingTime(), 0);
+  EXPECT_EQ(unknown_many->OutputTime(&input_times), 0);
   unknown_many->record_element();
-  EXPECT_EQ(1, unknown_many->num_elements());
-  EXPECT_EQ(100, unknown_many->ProcessingTime());
-  EXPECT_EQ(100, unknown_many->OutputTime(&input_times));
+  EXPECT_EQ(unknown_many->num_elements(), 1);
+  EXPECT_EQ(unknown_many->ProcessingTime(), 100);
+  EXPECT_EQ(unknown_many->OutputTime(&input_times), 100);
   source1->add_processing_time(100);
   source2->add_processing_time(200);
-  EXPECT_EQ(100, unknown_many->ProcessingTime());
-  EXPECT_EQ(100, unknown_many->OutputTime(&input_times));
+  EXPECT_EQ(unknown_many->ProcessingTime(), 100);
+  EXPECT_EQ(unknown_many->OutputTime(&input_times), 100);
   source1->record_element();
   source2->record_element();
-  EXPECT_EQ(400, unknown_many->ProcessingTime());
-  EXPECT_EQ(400, unknown_many->OutputTime(&input_times));
+  EXPECT_EQ(unknown_many->ProcessingTime(), 400);
+  EXPECT_EQ(unknown_many->OutputTime(&input_times), 400);
   unknown_many->record_element();
-  EXPECT_EQ(200, unknown_many->ProcessingTime());
-  EXPECT_EQ(200, unknown_many->OutputTime(&input_times));
+  EXPECT_EQ(unknown_many->ProcessingTime(), 200);
+  EXPECT_EQ(unknown_many->OutputTime(&input_times), 200);
 }
 
 TEST(UnknownTest, Model) {
@@ -299,35 +307,35 @@ TEST(UnknownTest, Model) {
   unknown->add_input(source2);
   std::vector<int64> input_times(1, 0);
   source1->add_processing_time(100);
-  EXPECT_EQ(0, unknown->ProcessingTime());
-  EXPECT_EQ(0, unknown->OutputTime(&input_times));
+  EXPECT_EQ(unknown->ProcessingTime(), 0);
+  EXPECT_EQ(unknown->OutputTime(&input_times), 0);
   source2->add_processing_time(100);
-  EXPECT_EQ(0, unknown->ProcessingTime());
-  EXPECT_EQ(0, unknown->OutputTime(&input_times));
+  EXPECT_EQ(unknown->ProcessingTime(), 0);
+  EXPECT_EQ(unknown->OutputTime(&input_times), 0);
   source1->record_element();
-  EXPECT_EQ(100, unknown->ProcessingTime());
-  EXPECT_EQ(100, unknown->OutputTime(&input_times));
+  EXPECT_EQ(unknown->ProcessingTime(), 100);
+  EXPECT_EQ(unknown->OutputTime(&input_times), 100);
   source2->record_element();
-  EXPECT_EQ(200, unknown->ProcessingTime());
-  EXPECT_EQ(200, unknown->OutputTime(&input_times));
+  EXPECT_EQ(unknown->ProcessingTime(), 200);
+  EXPECT_EQ(unknown->OutputTime(&input_times), 200);
   source1->record_element();
-  EXPECT_EQ(150, unknown->ProcessingTime());
-  EXPECT_EQ(150, unknown->OutputTime(&input_times));
+  EXPECT_EQ(unknown->ProcessingTime(), 150);
+  EXPECT_EQ(unknown->OutputTime(&input_times), 150);
   source2->record_element();
-  EXPECT_EQ(100, unknown->ProcessingTime());
-  EXPECT_EQ(100, unknown->OutputTime(&input_times));
+  EXPECT_EQ(unknown->ProcessingTime(), 100);
+  EXPECT_EQ(unknown->OutputTime(&input_times), 100);
   // Unknown node processing time should not affect its ProcessingTime() or
   // OutputTime().
   unknown->add_processing_time(100);
-  EXPECT_EQ(100, unknown->processing_time());
-  EXPECT_EQ(100, unknown->ProcessingTime());
-  EXPECT_EQ(100, unknown->OutputTime(&input_times));
+  EXPECT_EQ(unknown->processing_time(), 100);
+  EXPECT_EQ(unknown->ProcessingTime(), 100);
+  EXPECT_EQ(unknown->OutputTime(&input_times), 100);
   // Unknown node number of elements should not affect its ProcessingTime() or
   // OutputTime().
   unknown->record_element();
-  EXPECT_EQ(1, unknown->num_elements());
-  EXPECT_EQ(100, unknown->ProcessingTime());
-  EXPECT_EQ(100, unknown->OutputTime(&input_times));
+  EXPECT_EQ(unknown->num_elements(), 1);
+  EXPECT_EQ(unknown->ProcessingTime(), 100);
+  EXPECT_EQ(unknown->OutputTime(&input_times), 100);
 }
 
 class TestNode : public model::Node {
@@ -355,35 +363,35 @@ class TestNode : public model::Node {
 TEST(SetterGetterTest, Node) {
   std::shared_ptr<TestNode> node =
       std::make_shared<TestNode>(model::Node::Args{-1, "TestNode", nullptr});
-  EXPECT_EQ(-1, node->id());
-  EXPECT_EQ("TestNode", node->name());
-  EXPECT_EQ(nullptr, node->output());
+  EXPECT_EQ(node->id(), -1);
+  EXPECT_EQ(node->name(), "TestNode");
+  EXPECT_EQ(node->output(), nullptr);
 
-  EXPECT_EQ(0, node->buffered_bytes());
+  EXPECT_EQ(node->buffered_bytes(), 0);
   node->add_buffered_bytes(42);
-  EXPECT_EQ(42, node->buffered_bytes());
+  EXPECT_EQ(node->buffered_bytes(), 42);
 
-  EXPECT_EQ(0, node->processing_time());
+  EXPECT_EQ(node->processing_time(), 0);
   node->record_start(1);
-  EXPECT_EQ(0, node->processing_time());
+  EXPECT_EQ(node->processing_time(), 0);
   node->record_stop(41);
-  EXPECT_EQ(40, node->processing_time());
+  EXPECT_EQ(node->processing_time(), 40);
   node->add_processing_time(2);
-  EXPECT_EQ(42, node->processing_time());
+  EXPECT_EQ(node->processing_time(), 42);
 
   std::shared_ptr<TestNode> input =
       std::make_shared<TestNode>(model::Node::Args{-1, "TestInput", node});
-  EXPECT_EQ(node.get(), input->output());
-  EXPECT_EQ(0, node->inputs().size());
+  EXPECT_EQ(input->output(), node.get());
+  EXPECT_EQ(node->inputs().size(), 0);
   node->add_input(input);
-  EXPECT_EQ(1, node->inputs().size());
-  EXPECT_EQ(input, node->inputs().front());
+  EXPECT_EQ(node->inputs().size(), 1);
+  EXPECT_EQ(node->inputs().front(), input);
   node->remove_input(input);
-  EXPECT_EQ(0, node->inputs().size());
+  EXPECT_EQ(node->inputs().size(), 0);
 
-  EXPECT_EQ(0, node->num_elements());
+  EXPECT_EQ(node->num_elements(), 0);
   node->record_element();
-  EXPECT_EQ(1, node->num_elements());
+  EXPECT_EQ(node->num_elements(), 1);
 }
 
 }  // namespace
diff --git a/tensorflow/core/framework/node_def.proto b/tensorflow/core/framework/node_def.proto
index 0a095f903f9f6b98b3247c547aaa4e21964f003e..73cbc9600c54e82a5e541d88eefcf679d241928c 100644
--- a/tensorflow/core/framework/node_def.proto
+++ b/tensorflow/core/framework/node_def.proto
@@ -60,4 +60,18 @@ message NodeDef {
   // attr's type field.
   // TODO(josh11b): Add some examples here showing best practices.
   map<string, AttrValue> attr = 5;
+
+  message ExperimentalDebugInfo {
+    // Opaque string inserted into error messages created by the runtime.
+    //
+    // This is intended to store the list of names of the nodes from the
+    // original graph that this node was derived. For example if this node, say
+    // C, was result of a fusion of 2 nodes A and B, then 'original_node' would
+    // be {A, B}. This information can be used to map errors originating at the
+    // current node to some top level source code.
+    repeated string original_node_names = 1;
+  };
+
+  // This stores debug information associated with the node.
+  ExperimentalDebugInfo experimental_debug_info = 6;
 };
diff --git a/tensorflow/core/framework/node_def_builder.cc b/tensorflow/core/framework/node_def_builder.cc
index 348a825af91f4c6093f35d9d564f111a971cde18..4808967ca6a1139cccf58ed1897306a5d54b3f1e 100644
--- a/tensorflow/core/framework/node_def_builder.cc
+++ b/tensorflow/core/framework/node_def_builder.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <vector>
 #include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_def_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -37,7 +38,8 @@ void NodeDefBuilder::NodeOut::Reset(StringPiece n, int i, DataType dt) {
 }
 
 NodeDefBuilder::NodeDefBuilder(StringPiece name, StringPiece op_name,
-                               const OpRegistryInterface* op_registry) {
+                               const OpRegistryInterface* op_registry,
+                               const NodeDebugInfo* debug) {
   node_def_.set_name(string(name));
   const Status status = op_registry->LookUpOpDef(string(op_name), &op_def_);
   if (status.ok()) {
@@ -46,6 +48,13 @@ NodeDefBuilder::NodeDefBuilder(StringPiece name, StringPiece op_name,
     errors_.push_back(status.error_message());
     inputs_specified_ = 0;
   }
+  if (debug != nullptr) MergeDebugInfo(*debug, &node_def_);
+}
+
+NodeDefBuilder::NodeDefBuilder(StringPiece name, StringPiece op_name,
+                               const NodeDebugInfo& debug)
+    : NodeDefBuilder(name, op_name) {
+  MergeDebugInfo(debug, &node_def_);
 }
 
 NodeDefBuilder::NodeDefBuilder(StringPiece name, const OpDef* op_def)
diff --git a/tensorflow/core/framework/node_def_builder.h b/tensorflow/core/framework/node_def_builder.h
index ad07ec548003b5218179c75232c9247f3656574e..63d856d16c6e1dfedcfe44ff21b3222c8cc7c172 100644
--- a/tensorflow/core/framework/node_def_builder.h
+++ b/tensorflow/core/framework/node_def_builder.h
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/strings/strcat.h"
@@ -63,7 +64,10 @@ class NodeDefBuilder {
   // specified by calling the methods below.
   // REQUIRES: The OpDef must satisfy ValidateOpDef().
   NodeDefBuilder(StringPiece name, StringPiece op_name,
-                 const OpRegistryInterface* op_registry = OpRegistry::Global());
+                 const OpRegistryInterface* op_registry = OpRegistry::Global(),
+                 const NodeDebugInfo* debug = nullptr);
+  NodeDefBuilder(StringPiece name, StringPiece op_name,
+                 const NodeDebugInfo& debug);
   // REQUIRES: in addition, *op_def must outlive *this.
   NodeDefBuilder(StringPiece name, const OpDef* op_def);
 
diff --git a/tensorflow/core/framework/node_def_util.cc b/tensorflow/core/framework/node_def_util.cc
index 95a787b2df02d48f316653ee5059b4f7e80f73e1..fee52375c139ada0e457efe1247a18d471e8aa46 100644
--- a/tensorflow/core/framework/node_def_util.cc
+++ b/tensorflow/core/framework/node_def_util.cc
@@ -106,13 +106,50 @@ string SummarizeAttrs(const NodeDef& node_def) {
   return SummarizeAttrsHelper(node_def, node_def.device());
 }
 
+string FormatNodeForError(const NodeDebugInfo& debug_info) {
+  return debug_info.original_node_names.empty()
+             ? errors::FormatNodeNameForError(debug_info.name)
+             : errors::FormatNodeNamesForError(debug_info.original_node_names);
+}
+
 string FormatNodeForError(const Node& node) {
-  return FormatNodeDefForError(node.def());
+  return FormatNodeForError(NodeDebugInfo(node));
 }
 
 string FormatNodeDefForError(const NodeDef& node_def) {
-  VLOG(1) << "Error in the node: " << SummarizeNodeDef(node_def);
-  return errors::FormatNodeNameForError(node_def.name());
+  return FormatNodeForError(NodeDebugInfo(node_def));
+}
+
+void GetMergedOriginalNodeNames(const NodeDebugInfo& from,
+                                const NodeDebugInfo& to,
+                                std::set<string>* names) {
+  if (!from.original_node_names.empty()) {
+    names->insert(from.original_node_names.begin(),
+                  from.original_node_names.end());
+  } else {
+    names->insert(from.name);
+  }
+  names->insert(to.original_node_names.begin(), to.original_node_names.end());
+}
+
+void MergeDebugInfo(const NodeDebugInfo& from, Node* to) {
+  std::set<string> names;
+  GetMergedOriginalNodeNames(from, NodeDebugInfo(*to), &names);
+  to->set_original_node_names({names.begin(), names.end()});
+}
+
+void MergeDebugInfo(const NodeDebugInfo& from, NodeDef* to) {
+  std::set<string> names;
+  GetMergedOriginalNodeNames(from, NodeDebugInfo(*to), &names);
+  to->mutable_experimental_debug_info()->clear_original_node_names();
+  if (!names.empty()) {
+    *to->mutable_experimental_debug_info()->mutable_original_node_names() = {
+        names.begin(), names.end()};
+  }
+}
+
+void MergeDebugInfo(const NodeDef& from, NodeDef* to) {
+  MergeDebugInfo(NodeDebugInfo(from), to);
 }
 
 const AttrValue* AttrSlice::Find(StringPiece attr_name) const {
@@ -478,10 +515,13 @@ Status ValidateNodeDef(const NodeDef& node_def, const OpDef& op_def) {
           ". (Check whether your GraphDef-interpreting binary is up to date "
           "with your GraphDef-generating binary.).");
     }
-    TF_RETURN_WITH_CONTEXT_IF_ERROR(
-        ValidateAttrValue(attr.second, *iter->second),
-        "; NodeDef: ", FormatNodeDefForError(node_def), "; ",
-        SummarizeOpDef(op_def));
+    // If attr value is placeholder, do not check it.
+    if (attr.second.placeholder().empty()) {
+      TF_RETURN_WITH_CONTEXT_IF_ERROR(
+          ValidateAttrValue(attr.second, *iter->second),
+          "; NodeDef: ", FormatNodeDefForError(node_def), "; ",
+          SummarizeOpDef(op_def));
+    }
     // Keep track of which attr names have (not) been found in the NodeDef.
     op_attrs.erase(iter);
   }
@@ -660,15 +700,23 @@ Status ValidateExternalNodeDefSyntax(const NodeDef& node_def) {
   return Status::OK();
 }
 
-Status AttachDef(const Status& status, const NodeDef& node_def) {
+Status AttachDef(const Status& status, const NodeDef& node_def,
+                 bool allow_multiple_formatted_node) {
   Status ret = status;
-  errors::AppendToMessage(
-      &ret, strings::StrCat(" [[", FormatNodeDefForError(node_def), "]]"));
+  string node_error;
+  if (!allow_multiple_formatted_node &&
+      status.error_message().find("{{node ") != string::npos) {
+    node_error = node_def.name();
+  } else {
+    node_error = FormatNodeDefForError(node_def);
+  }
+  errors::AppendToMessage(&ret, strings::StrCat(" [[", node_error, "]]"));
   return ret;
 }
 
-Status AttachDef(const Status& status, const Node& node) {
-  return AttachDef(status, node.def());
+Status AttachDef(const Status& status, const Node& node,
+                 bool allow_multiple_formatted_node) {
+  return AttachDef(status, node.def(), allow_multiple_formatted_node);
 }
 
 void AddNodeAttr(StringPiece name, const AttrValue& value, NodeDef* node_def) {
diff --git a/tensorflow/core/framework/node_def_util.h b/tensorflow/core/framework/node_def_util.h
index f682bb15355550622e8bbe384df790f1022bd630..598a3fb601086d34d72fa795eae1b94aab24f33b 100644
--- a/tensorflow/core/framework/node_def_util.h
+++ b/tensorflow/core/framework/node_def_util.h
@@ -29,6 +29,7 @@ limitations under the License.
 namespace tensorflow {
 
 class Node;
+struct NodeDebugInfo;
 
 // We forward declare protos so that kernels don't need to depend on them
 class NodeDef;
@@ -56,6 +57,12 @@ string SummarizeAttrs(const NodeDef& node_def);
 string FormatNodeForError(const Node& node);
 string FormatNodeDefForError(const NodeDef& node_def);
 
+// Merges the original node names from the debug information of 'from' to the
+// debug information of 'to'.
+void MergeDebugInfo(const NodeDebugInfo& from, Node* to);
+void MergeDebugInfo(const NodeDebugInfo& from, NodeDef* to);
+void MergeDebugInfo(const NodeDef& from, NodeDef* to);
+
 typedef protobuf::Map<string, AttrValue> AttrValueMap;
 
 // Adds an attr with name <name> and value <value> to *node_def.
@@ -308,10 +315,14 @@ void AddDefaultsToNodeDef(const OpDef& op_def, NodeDef* node_def);
 // NodeName     = [A-Za-z0-9.], [A-Za-z0-9_./] *
 Status ValidateExternalNodeDefSyntax(const NodeDef& node_def);
 
-// Returns "status" with kernel's NodeDef attached as additional text
-// in the error message.
-Status AttachDef(const Status& status, const NodeDef& node_def);
-Status AttachDef(const Status& status, const Node& node);
+// Returns "status" with formatted NodeDef attached as additional text
+// in the error message. If 'allow_multiple_formatted_node' is false and there
+// is already a formatted NodeDef present in 'status', we simply attach the name
+// of the NodeDef instead of the formatted string.
+Status AttachDef(const Status& status, const NodeDef& node_def,
+                 bool allow_multiple_formatted_node = false);
+Status AttachDef(const Status& status, const Node& node,
+                 bool allow_multiple_formatted_node = false);
 
 // Appends the given prefix and suffix to the original node name in order to
 // make the name unique. If it's an "Enter" node, use the same way to reset
diff --git a/tensorflow/core/framework/node_def_util_test.cc b/tensorflow/core/framework/node_def_util_test.cc
index d9d437024ab0f330f56901dc8da8faae794c61c4..7b2506336968dc94fc65c84bbb43551d7ebd44ea 100644
--- a/tensorflow/core/framework/node_def_util_test.cc
+++ b/tensorflow/core/framework/node_def_util_test.cc
@@ -573,5 +573,29 @@ TEST(FormatNodeForErrorTest, NodeDef) {
   EXPECT_EQ("{{node enter}}", FormatNodeDefForError(node_def));
 }
 
+TEST(AttachDef, AllowMultipleFormattedNode) {
+  NodeDef a;
+  a.set_name("a");
+  NodeDef b;
+  b.set_name("b");
+  Status s = Status(error::CANCELLED, "Error");
+  Status s2 = AttachDef(s, a, true);
+  EXPECT_EQ("Error\n\t [[{{node a}}]]", s2.error_message());
+  Status s3 = AttachDef(s2, b, true);
+  EXPECT_EQ("Error\n\t [[{{node a}}]]\n\t [[{{node b}}]]", s3.error_message());
+}
+
+TEST(AttachDef, DisallowMultipleFormattedNode) {
+  NodeDef a;
+  a.set_name("a");
+  NodeDef b;
+  b.set_name("b");
+  Status s = Status(error::CANCELLED, "Error");
+  Status s2 = AttachDef(s, a, false);
+  EXPECT_EQ("Error\n\t [[{{node a}}]]", s2.error_message());
+  Status s3 = AttachDef(s2, b, false);
+  EXPECT_EQ("Error\n\t [[{{node a}}]]\n\t [[b]]", s3.error_message());
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/op.cc b/tensorflow/core/framework/op.cc
index b8309eafb05251235bfaaa7b5489cac06f0024dc..b29d7ae77f031a9fff0dfa6280a43dba75f4ab71 100644
--- a/tensorflow/core/framework/op.cc
+++ b/tensorflow/core/framework/op.cc
@@ -60,6 +60,21 @@ void OpRegistry::Register(const OpRegistrationDataFactory& op_data_factory) {
 
 Status OpRegistry::LookUp(const string& op_type_name,
                           const OpRegistrationData** op_reg_data) const {
+  {
+    tf_shared_lock l(mu_);
+    if (initialized_) {
+      if (const OpRegistrationData* res =
+              gtl::FindWithDefault(registry_, op_type_name, nullptr)) {
+        *op_reg_data = res;
+        return Status::OK();
+      }
+    }
+  }
+  return LookUpSlow(op_type_name, op_reg_data);
+}
+
+Status OpRegistry::LookUpSlow(const string& op_type_name,
+                              const OpRegistrationData** op_reg_data) const {
   *op_reg_data = nullptr;
   const OpRegistrationData* res = nullptr;
 
diff --git a/tensorflow/core/framework/op.h b/tensorflow/core/framework/op.h
index 81ed5f95f0bf020780f1d71692388885ce702b70..538ce04ef44f591c7090489f7723121ee362e54f 100644
--- a/tensorflow/core/framework/op.h
+++ b/tensorflow/core/framework/op.h
@@ -144,6 +144,9 @@ class OpRegistry : public OpRegistryInterface {
   Status RegisterAlreadyLocked(const OpRegistrationDataFactory& op_data_factory)
       const EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
+  Status LookUpSlow(const string& op_type_name,
+                    const OpRegistrationData** op_reg_data) const;
+
   mutable mutex mu_;
   // Functions in deferred_ may only be called with mu_ held.
   mutable std::vector<OpRegistrationDataFactory> deferred_ GUARDED_BY(mu_);
diff --git a/tensorflow/core/framework/op_def.proto b/tensorflow/core/framework/op_def.proto
index aea2d2bb09a2c2c80ae02b10b1222d6882606c3c..e44ecc9f6236210b3bcb21a4914243741c632d2c 100644
--- a/tensorflow/core/framework/op_def.proto
+++ b/tensorflow/core/framework/op_def.proto
@@ -54,6 +54,10 @@ message OpDef {
   // Description of the output(s).
   repeated ArgDef output_arg = 3;
 
+  // Named control outputs for this operation. Useful only for composite
+  // operations (i.e. functions) which want to name different control outputs.
+  repeated string control_output = 20;
+
   // Description of the graph-construction-time configuration of this
   // Op.  That is to say, this describes the attr fields that will
   // be specified in the NodeDef.
diff --git a/tensorflow/core/framework/op_def_builder.cc b/tensorflow/core/framework/op_def_builder.cc
index 8a9bb6318211ad1537727d7e60945897c4a9a63d..0a62a2e871ab1bfeea4c7cbc14e93173bbc1a3c1 100644
--- a/tensorflow/core/framework/op_def_builder.cc
+++ b/tensorflow/core/framework/op_def_builder.cc
@@ -316,6 +316,14 @@ bool ConsumeInOutTimesType(StringPiece* sp, StringPiece* out) {
       .GetResult(sp, out);
 }
 
+bool ConsumeControlOutName(StringPiece* sp, StringPiece* out) {
+  return Scanner(*sp)
+      .One(Scanner::LETTER)
+      .Any(Scanner::LETTER_DIGIT_UNDERSCORE)
+      .StopCapture()
+      .GetResult(sp, out);
+}
+
 #define VERIFY(expr, ...)                                             \
   do {                                                                \
     if (!(expr)) {                                                    \
@@ -409,6 +417,25 @@ void FinalizeInputOrOutput(StringPiece spec, bool is_output, OpDef* op_def,
 
 #undef VERIFY
 
+string ControlOutError(StringPiece orig, const string& op_name) {
+  return strings::StrCat(" from ControlOutput(\"", orig, "\") for Op ",
+                         op_name);
+}
+
+void FinalizeControlOutput(StringPiece name, OpDef* op_def,
+                           std::vector<string>* errors) {
+  StringPiece orig(name);
+
+  // Parse control output name.
+  StringPiece tmp_name;
+  if (!ConsumeControlOutName(&orig, &tmp_name)) {
+    errors->push_back(strings::StrCat("Trouble parsing 'name:'",
+                                      ControlOutError(orig, op_def->name())));
+  }
+
+  *op_def->add_control_output() = string(tmp_name.data(), tmp_name.size());
+}
+
 int num_leading_spaces(StringPiece s) {
   size_t i = 0;
   while (i < s.size() && s[i] == ' ') {
@@ -545,6 +572,11 @@ OpDefBuilder& OpDefBuilder::Output(string spec) {
   return *this;
 }
 
+OpDefBuilder& OpDefBuilder::ControlOutput(string name) {
+  control_outputs_.push_back(std::move(name));
+  return *this;
+}
+
 #ifndef TF_LEAN_BINARY
 OpDefBuilder& OpDefBuilder::Doc(string text) {
   if (!doc_.empty()) {
@@ -614,6 +646,9 @@ Status OpDefBuilder::Finalize(OpRegistrationData* op_reg_data) const {
   for (StringPiece output : outputs_) {
     FinalizeInputOrOutput(output, true, op_def, &errors);
   }
+  for (StringPiece control_output : control_outputs_) {
+    FinalizeControlOutput(control_output, op_def, &errors);
+  }
   FinalizeDoc(doc_, op_def, &errors);
 
   if (errors.empty()) return Status::OK();
diff --git a/tensorflow/core/framework/op_def_builder.h b/tensorflow/core/framework/op_def_builder.h
index 8077b20598c210d9266c168569f3d9a3a190c097..38d3f5cfc608d19b90b56b648b2ffb6bccbdd8f3 100644
--- a/tensorflow/core/framework/op_def_builder.h
+++ b/tensorflow/core/framework/op_def_builder.h
@@ -28,6 +28,8 @@ limitations under the License.
 
 namespace tensorflow {
 
+class FunctionDefHelper;
+
 namespace shape_inference {
 class InferenceContext;
 }
@@ -150,12 +152,20 @@ class OpDefBuilder {
   Status Finalize(OpRegistrationData* op_reg_data) const;
 
  private:
+  friend class FunctionDefHelper;
+
+  // Adds control output to this OpDefBuilder (and returns *this).
+  // The <name> must be a valid node name (matches regexp
+  // [a-zA-Z][a-zA-Z0-9_]*). Named control output can only exist for functions.
+  OpDefBuilder& ControlOutput(string name);
+
   OpDef* op_def() { return &op_reg_data_.op_def; }
 
   OpRegistrationData op_reg_data_;
   std::vector<string> attrs_;
   std::vector<string> inputs_;
   std::vector<string> outputs_;
+  std::vector<string> control_outputs_;
   string doc_;
   std::vector<string> errors_;
 };
diff --git a/tensorflow/core/framework/op_def_util.cc b/tensorflow/core/framework/op_def_util.cc
index 3597f43d51987b0d46df90ad0db964927f16adf0..9c47ac0f017779fccfa40ab521a161e83fd1e7df 100644
--- a/tensorflow/core/framework/op_def_util.cc
+++ b/tensorflow/core/framework/op_def_util.cc
@@ -114,6 +114,8 @@ Status ValidateAttrValue(const AttrValue& attr_value,
         length = attr_value.list().shape_size();
       } else if (attr.type() == "list(tensor)") {
         length = attr_value.list().tensor_size();
+      } else if (attr.type() == "list(func)") {
+        length = attr_value.list().func_size();
       }
       if (length < attr.minimum()) {
         return errors::InvalidArgument(
@@ -833,25 +835,37 @@ bool OpDefEqual(const OpDef& o1, const OpDef& o2) {
   // Compare it separately here instead of serializing below.
   if (!RepeatedAttrDefEqual(o1.attr(), o2.attr())) return false;
 
-  // Clear attr field, serialize, and compare serialized strings
+  // `control_output` order doesn't matter.
+  std::set<string> control_output1(o1.control_output().begin(),
+                                   o1.control_output().end());
+  std::set<string> control_output2(o2.control_output().begin(),
+                                   o2.control_output().end());
+  if (control_output1 != control_output2) return false;
+
+  // Clear `attr` and `control_output` fields, serialize, and compare serialized
+  // strings.
   OpDef o1_copy = o1;
   OpDef o2_copy = o2;
   o1_copy.clear_attr();
+  o1_copy.clear_control_output();
   o2_copy.clear_attr();
-  string s1, s2;
-  SerializeToStringDeterministic(o1_copy, &s1);
-  SerializeToStringDeterministic(o2_copy, &s2);
-  if (s1 != s2) return false;
-  return true;
+  o2_copy.clear_control_output();
+
+  return AreSerializedProtosEqual(o1_copy, o2_copy);
 }
 
 uint64 OpDefHash(const OpDef& o) {
   uint64 h = RepeatedAttrDefHash(o.attr());
+
+  // Compute deterministic order-independent control outputs hash.
+  std::set<string> control_output(o.control_output().begin(),
+                                  o.control_output().end());
+  for (const auto& co : control_output) h = Hash64Combine(h, Hash64(co));
+
   OpDef o_copy = o;
   o_copy.clear_attr();
-  string s;
-  SerializeToStringDeterministic(o_copy, &s);
-  return Hash64(s.data(), s.size(), h);
+  o_copy.clear_control_output();
+  return DeterministicProtoHash64(o_copy, h);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/op_gen_lib.cc b/tensorflow/core/framework/op_gen_lib.cc
index 505ab547755b46e0ff4af9920df6eb8961a4a9db..92a7038a404d2bf7f5bbf1e643f727f8c3dfc74a 100644
--- a/tensorflow/core/framework/op_gen_lib.cc
+++ b/tensorflow/core/framework/op_gen_lib.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/util/proto/proto_utils.h"
 
 namespace tensorflow {
 
@@ -488,14 +489,21 @@ Status ApiDefMap::LoadFile(Env* env, const string& filename) {
   if (filename.empty()) return Status::OK();
   string contents;
   TF_RETURN_IF_ERROR(ReadFileToString(env, filename, &contents));
-  TF_RETURN_IF_ERROR(LoadApiDef(contents));
+  Status status = LoadApiDef(contents);
+  if (!status.ok()) {
+    // Return failed status annotated with filename to aid in debugging.
+    return Status(status.code(),
+                  strings::StrCat("Error parsing ApiDef file ", filename, ": ",
+                                  status.error_message()));
+  }
   return Status::OK();
 }
 
 Status ApiDefMap::LoadApiDef(const string& api_def_file_contents) {
   const string contents = PBTxtFromMultiline(api_def_file_contents);
   ApiDefs api_defs;
-  protobuf::TextFormat::ParseFromString(contents, &api_defs);
+  TF_RETURN_IF_ERROR(
+      proto_utils::ParseTextFormatFromString(contents, &api_defs));
   for (const auto& api_def : api_defs.op()) {
     // Check if the op definition is loaded. If op definition is not
     // loaded, then we just skip this ApiDef.
diff --git a/tensorflow/core/framework/op_gen_lib_test.cc b/tensorflow/core/framework/op_gen_lib_test.cc
index e0e77c74495d62d0d0d2bc1c75d50fb1963bdcfd..6b43d7dc68d2120b5bc06cb6eaa12ef460188bda 100644
--- a/tensorflow/core/framework/op_gen_lib_test.cc
+++ b/tensorflow/core/framework/op_gen_lib_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_gen_lib.h"
 
 #include "tensorflow/core/framework/op_def.pb.h"
+#include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -39,7 +40,7 @@ constexpr char kTestOpList[] = R"(op {
     version: 123
     explanation: "foo"
   }
-)";
+})";
 
 constexpr char kTestApiDef[] = R"(op {
   graph_op_name: "testop"
@@ -455,6 +456,18 @@ op {
   ASSERT_EQ(tensorflow::error::FAILED_PRECONDITION, status.code());
 }
 
+TEST(OpGenLibTest, ApiDefInvalidSyntax) {
+  const string api_def = R"pb(
+    op { bad_op_name: "testop" }
+  )pb";
+
+  OpList op_list;
+  ApiDefMap api_map(op_list);
+  // Loading with invalid syntax (e.g. unrecognized field name) should fail.
+  auto status = api_map.LoadApiDef(api_def);
+  ASSERT_EQ(tensorflow::error::INVALID_ARGUMENT, status.code());
+}
+
 TEST(OpGenLibTest, ApiDefUpdateDocs) {
   const string op_list1 = R"(op {
   name: "testop"
diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index e3cb4a40ec5503307813d292f4f538fb8577a25b..16ca40c31c73e0cab9cab408d59ac230b95e6cde 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -20,6 +20,9 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include <cstdlib>
+#include <cstring>
+
 #include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/framework/device_attributes.pb.h"
 #include "tensorflow/core/framework/graph.pb_text.h"
@@ -102,7 +105,8 @@ OpKernel::OpKernel(OpKernelConstruction* context,
       graph_def_version_(context->graph_def_version()),
       is_internal_(str_util::StartsWith(type_string(), "_")),
       input_name_map_(context->num_inputs()),
-      output_name_map_(context->num_outputs()) {
+      output_name_map_(context->num_outputs()),
+      cost_estimate_(OpKernel::kInitialCostEstimateCycles) {
   OP_REQUIRES_OK(context,
                  NameRangesForNode(*def_, *context->op_def_, &input_name_map_,
                                    &output_name_map_));
@@ -117,11 +121,21 @@ OpKernel::OpKernel(OpKernelConstruction* context,
 
 OpKernel::~OpKernel() {}
 
+const uint64 OpKernel::kInitialCostEstimateCycles;
+const uint64 OpKernel::kOpIsExpensiveThresholdCycles;
+const uint64 OpKernel::kCostDecay;
+
 const string& OpKernel::name() const { return def_->name(); }
 const string& OpKernel::type_string() const { return def_->op(); }
 const string& OpKernel::requested_device() const { return def_->device(); }
 const string& OpKernel::requested_input(int i) const { return def_->input(i); }
 
+// This static function exists only because device_attributes.pb.h is
+// already included here, and it can't be introduced elsewhere.
+/*static*/ int OpKernel::DeviceNumaNode(const DeviceBase* device) {
+  return device->attributes().locality().numa_node();
+}
+
 Status OpKernel::InputRange(StringPiece input_name, int* start,
                             int* stop) const {
   const auto result = input_name_map_.find(input_name);
@@ -401,7 +415,7 @@ Tensor OpKernelContext::mutable_input(int index, bool lock_held) {
     record_tensor_reference(tensor);
     return tensor;
   } else {
-    mutex_lock l(*input_ref_mutex(index));
+    tf_shared_lock l(*input_ref_mutex(index));
     Tensor& tensor = *((*params_->inputs)[index].tensor);
     record_tensor_reference(tensor);
     return tensor;
@@ -593,7 +607,7 @@ Status OpKernelContext::mutable_input(StringPiece name, Tensor* tensor,
   if (lock_held) {
     *tensor = *(*params_->inputs)[start].tensor;
   } else {
-    mutex_lock l(*input_ref_mutex(start));
+    tf_shared_lock l(*input_ref_mutex(start));
     *tensor = *(*params_->inputs)[start].tensor;
   }
   record_tensor_reference(*tensor);
@@ -987,6 +1001,12 @@ static Status IsProbablySafeToLoad(const string& path) {
 
 void LoadDynamicKernelsInternal() {
   Env* env = Env::Default();
+
+  // Override to allow loading unsafe packages for development.
+  // DO NOT USE UNLESS YOU KNOW WHAT ABI ISSUES YOU CAN ENCOUNTER.
+  bool override_abi_check =
+      strcmp(getenv("TF_REALLY_LOAD_UNSAFE_PACKAGES"), "1") == 0;
+
   string bazel_kernel_dir = io::JoinPath(env->GetRunfilesDir(),
                                          "tensorflow",
                                          "core",
@@ -999,7 +1019,12 @@ void LoadDynamicKernelsInternal() {
       string fullpath = io::JoinPath(bazel_kernel_dir, file);
       if (env->MatchPath(fullpath, dll_spec)) {
         Status s = IsProbablySafeToLoad(fullpath);
-        if (s.ok()) {
+        if (!s.ok() && override_abi_check) {
+          LOG(WARNING) << "Loading UNSAFE library " << fullpath
+                       << " because ABI check override is set: "
+                       << s.error_message();
+        }
+        if (s.ok() || override_abi_check) {
           // TODO(gunan): Store the handles to the opened files.
           void* unused_filehandle;
           TF_CHECK_OK(env->LoadLibrary(fullpath.c_str(), &unused_filehandle));
@@ -1063,6 +1088,11 @@ void OpKernelRegistrar::InitInternal(const KernelDef* kernel_def,
   delete kernel_def;
 }
 
+OpKernel* OpKernelRegistrar::PtrOpKernelFactory::Create(
+    OpKernelConstruction* context) {
+  return (*create_func_)(context);
+}
+
 }  // namespace kernel_factory
 
 namespace {
diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h
index 19a0c5e5be2e8cbb16d55db21d4d425d9add2974..ff0b44650ce5e306c73fc7c1a37d4d6ec2b3e6c2 100644
--- a/tensorflow/core/framework/op_kernel.h
+++ b/tensorflow/core/framework/op_kernel.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_FRAMEWORK_OP_KERNEL_H_
 #define TENSORFLOW_CORE_FRAMEWORK_OP_KERNEL_H_
 
+#include <atomic>
 #include <functional>
 
 #include <utility>
@@ -47,6 +48,7 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/profile_utils/cpu_utils.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -116,10 +118,34 @@ class OpKernel {
   virtual AsyncOpKernel* AsAsync() { return nullptr; }
   virtual const AsyncOpKernel* AsAsync() const { return nullptr; }
 
+  // Initial time (in CPU cycles) we expect an operation to take.  Used to
+  // determine whether an operation should be place in a threadpool.  Operations
+  // start out "expensive".
+  static const uint64 kInitialCostEstimateCycles = 100 * 1000 * 1000;
+  static const uint64 kOpIsExpensiveThresholdCycles = 5000;
+  static const uint64 kCostDecay = 10;
+
   // Returns true iff this op kernel is considered "expensive". The
   // runtime may use this flag to optimize graph execution for example
   // to "inline" inexpensive kernels.
-  virtual bool IsExpensive() { return expensive_; }
+  virtual bool IsExpensive() {
+    return expensive_ && (cost_estimate_.load(std::memory_order_relaxed) >
+                          kOpIsExpensiveThresholdCycles);
+  }
+
+  // Updates the dynamic cost estimate, which is used to determine whether this
+  // op is expensive. The new cost estimate is a weighted average of the old
+  // cost estimate and the latest cost.
+  void UpdateCostEstimate(uint64 elapsed_cycles) {
+    // N.B. Updates to `cost_estimate_` are atomic but unlocked.  Simulataneous
+    // updates may result in one or more updates being ignored.  This does not
+    // affect correctness but may slow down the update frequency.
+    cost_estimate_.store(
+        (kCostDecay - 1) * cost_estimate_.load(std::memory_order_relaxed) /
+                kCostDecay +
+            (elapsed_cycles / kCostDecay),
+        std::memory_order_relaxed);
+  }
 
   // Accessors.
   const NodeDef& def() const { return *def_; }
@@ -171,6 +197,8 @@ class OpKernel {
   // TODO(irving): Move to TensorShapeUtils once !allow_legacy_scalars
   Status MakeShape(const Tensor& shape, TensorShape* out) const;
 
+  static int DeviceNumaNode(const DeviceBase* device);
+
  private:
   const std::unique_ptr<const NodeDef> def_;
   const DataTypeVector input_types_;
@@ -182,6 +210,7 @@ class OpKernel {
   NameRangeMap input_name_map_;
   NameRangeMap output_name_map_;
   bool expensive_;
+  std::atomic_uint_fast64_t cost_estimate_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(OpKernel);
 };
@@ -202,8 +231,6 @@ class AsyncOpKernel : public OpKernel {
   const AsyncOpKernel* AsAsync() const final { return this; }
 
   void Compute(OpKernelContext* context) final;
-
-  bool IsExpensive() override { return true; }
 };
 
 // Wraps a tensor that is held by an Op across calls to Compute(). For
@@ -376,7 +403,9 @@ class OpArgIterator {
   using iterator_category = std::forward_iterator_tag;
   using value_type = ElementType;
   using pointer = ElementType*;
+  using const_pointer = const ElementType*;
   using reference = ElementType&;
+  using const_reference = const ElementType&;
   using difference_type = ptrdiff_t;
 
   OpArgIterator(const ListType* list, int i) : list_(list), i_(i) {}
@@ -405,6 +434,9 @@ class OpArgIterator {
   reference operator*() { return (*list_)[i_]; }
   pointer operator->() { return &(*list_)[i_]; }
 
+  const_reference operator*() const { return (*list_)[i_]; }
+  const_pointer operator->() const { return &(*list_)[i_]; }
+
  private:
   const ListType* const list_;
   int i_;
@@ -493,11 +525,42 @@ struct TensorValue {
 // Used to store partitioned graphs from function-calling ops.
 struct GraphCollector {
   mutex mu;
-  std::vector<GraphDef> graphs GUARDED_BY(mu);
+  std::vector<GraphDef> partitioned_graphs GUARDED_BY(mu);
+  GraphDef raw_graph GUARDED_BY(mu);
+  GraphDef optimized_graph GUARDED_BY(mu);
+
+  bool dirty GUARDED_BY(mu);
+
+  GraphCollector() : dirty(false) {}
+
+  void CollectRawGraph(const GraphDef& graph) {
+    mutex_lock ml(mu);
+    raw_graph.MergeFrom(graph);
+    dirty = true;
+  }
+
+  void CollectOptimizedGraph(const GraphDef& graph) {
+    mutex_lock ml(mu);
+    optimized_graph.MergeFrom(graph);
+    dirty = true;
+  }
+
+  void CollectPartitionedGraph(const GraphDef& graph) {
+    mutex_lock ml(mu);
+    partitioned_graphs.push_back(graph);
+    dirty = true;
+  }
+
+  void ClearGraphs() EXCLUSIVE_LOCKS_REQUIRED(mu) {
+    raw_graph.Clear();
+    optimized_graph.Clear();
+    partitioned_graphs.clear();
+    dirty = false;
+  }
 
-  void CollectGraph(const GraphDef& graph) {
+  bool HasUpdatedGraphs() {
     mutex_lock ml(mu);
-    graphs.push_back(graph);
+    return dirty;
   }
 };
 
@@ -574,6 +637,9 @@ class OpKernelContext {
     // The session state for this op.
     SessionState* session_state = nullptr;
 
+    // Unique session identifier. Can be empty.
+    string session_handle;
+
     // The tensor store for this op.
     TensorStore* tensor_store = nullptr;
 
@@ -611,6 +677,10 @@ class OpKernelContext {
     static const int kNoReservation = -1;
     // Values in [0,...) represent reservations for the indexed output.
     const int* forward_from_array = nullptr;
+
+    // For tracking actively running deferred ops.
+    std::function<void()> inc_num_deferred_ops_function = []() {};
+    std::function<void()> dec_num_deferred_ops_function = []() {};
   };
 
   // params must outlive the OpKernelContext.
@@ -1002,6 +1072,9 @@ class OpKernelContext {
   // An op kernel can access the session state it belongs to.
   SessionState* session_state() const { return params_->session_state; }
 
+  // Unique identifier of the session it belongs to. Can be empty.
+  string session_handle() const { return params_->session_handle; }
+
   // An op kernel can access the tensor store of the run it belongs to.
   TensorStore* tensor_store() const { return params_->tensor_store; }
 
@@ -1134,6 +1207,24 @@ class OpKernelContext {
 
   bool input_is_ref(int index) const;
 
+  // Used by OpKernel implementations to track actively running deferred ops.
+  //
+  // A deferred op is one whose Compute method returns (or whose ComputeAsync
+  // method invokes the callback) when work is scheduled onto a device. At that
+  // point, we don't know when the work will actually complete (or if it has
+  // already completed) on the device. These functions allow the executor to
+  // track the status of deferred ops and act accordingly.
+  //
+  // Deferred OpKernel implementations must use these methods to get two
+  // functions. It then must call these two functions in pairs, before and after
+  // device execution, respectively.
+  TF_MUST_USE_RESULT std::function<void()> inc_num_deferred_ops_function() {
+    return params_->inc_num_deferred_ops_function;
+  }
+  TF_MUST_USE_RESULT std::function<void()> dec_num_deferred_ops_function() {
+    return params_->dec_num_deferred_ops_function;
+  }
+
  private:
   Allocator* get_allocator(AllocatorAttributes attr);
 
@@ -1376,23 +1467,21 @@ class OpKernelRegistrar {
     // Perform the check in the header to allow compile-time optimization
     // to a no-op, allowing the linker to remove the kernel symbols.
     if (kernel_def != nullptr) {
-      struct PtrOpKernelFactory : public OpKernelFactory {
-        explicit PtrOpKernelFactory(
-            OpKernel* (*create_func)(OpKernelConstruction*))
-            : create_func_(create_func) {}
-
-        OpKernel* Create(OpKernelConstruction* context) override {
-          return (*create_func_)(context);
-        }
-
-        OpKernel* (*create_func_)(OpKernelConstruction*);
-      };
       InitInternal(kernel_def, kernel_class_name,
                    absl::make_unique<PtrOpKernelFactory>(create_fn));
     }
   }
 
  private:
+  struct PtrOpKernelFactory : public OpKernelFactory {
+    explicit PtrOpKernelFactory(OpKernel* (*create_func)(OpKernelConstruction*))
+        : create_func_(create_func) {}
+
+    OpKernel* Create(OpKernelConstruction* context) override;
+
+    OpKernel* (*create_func_)(OpKernelConstruction*);
+  };
+
   void InitInternal(const KernelDef* kernel_def, StringPiece kernel_class_name,
                     std::unique_ptr<OpKernelFactory> factory);
 };
diff --git a/tensorflow/core/framework/op_segment.cc b/tensorflow/core/framework/op_segment.cc
index 75ed4a4eaf231839999efa285c88e2bceda61a07..f7e194baeede8deb529aa7d1f4a0ba3ccc44e792 100644
--- a/tensorflow/core/framework/op_segment.cc
+++ b/tensorflow/core/framework/op_segment.cc
@@ -104,7 +104,8 @@ bool OpSegment::ShouldOwnKernel(FunctionLibraryRuntime* lib,
                                 const string& node_op) {
   // OpSegment should not own kernel if the node is stateless, or a function.
   return lib->IsStateful(node_op) &&
-         lib->GetFunctionLibraryDefinition()->Find(node_op) == nullptr;
+         lib->GetFunctionLibraryDefinition()->Find(node_op) == nullptr &&
+         node_op != "PartitionedCall" && node_op != "StatefulPartitionedCall";
 }
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/framework/ops_util.cc b/tensorflow/core/framework/ops_util.cc
index e8cf014ca03457e4673a14765cee5a05746b901a..4e603b9598fc43f894415b9b8aef6f641e484b6a 100644
--- a/tensorflow/core/framework/ops_util.cc
+++ b/tensorflow/core/framework/ops_util.cc
@@ -30,6 +30,9 @@ Eigen::PaddingType BrainPadding2EigenPadding(Padding padding) {
       return Eigen::PADDING_VALID;
     case Padding::SAME:
       return Eigen::PADDING_SAME;
+    case Padding::EXPLICIT:
+      LOG(FATAL) << "Eigen does not have explicit padding enum "  // Crash OK
+                    "value";
   }
   return Eigen::PADDING_SAME;  // Prevent compiler warning about missing return
 }
diff --git a/tensorflow/core/framework/queue_interface.h b/tensorflow/core/framework/queue_interface.h
index 4ca4416c5ac1471247758cd943d52a7c65f7afaf..9395cce1644f7e8fd09cf40a48b2d7a5abb30bb2 100644
--- a/tensorflow/core/framework/queue_interface.h
+++ b/tensorflow/core/framework/queue_interface.h
@@ -85,11 +85,11 @@ class QueueInterface : public ResourceBase {
   virtual Status MatchesNodeDef(const NodeDef& node_def) = 0;
 
   // Returns the number of elements in the queue.
-  virtual int32 size() = 0;
+  virtual int32 size() const = 0;
 
   virtual const DataTypeVector& component_dtypes() const = 0;
 
-  string DebugString() override {
+  string DebugString() const override {
     return strings::StrCat("A Queue of size: ", size());
   }
 
diff --git a/tensorflow/core/framework/reader_base.cc b/tensorflow/core/framework/reader_base.cc
index f84ef0f953cf23e3fb2af210706586f95cfbb8ad..ed4ff240393eab495e04b85d80b25377c578ac1e 100644
--- a/tensorflow/core/framework/reader_base.cc
+++ b/tensorflow/core/framework/reader_base.cc
@@ -241,7 +241,7 @@ Status ReaderBase::RestoreBaseState(const ReaderBaseState& state) {
   num_records_produced_ = state.num_records_produced();
   work_ = state.current_work();
   if (work_started_ < 0 || work_finished_ < 0 || num_records_produced_ < 0) {
-#ifdef __ANDROID__
+#if defined(__ANDROID__) || defined(__EMSCRIPTEN__)
     const string debug_string = "<debug state not available>";
 #else
     const string debug_string = state.DebugString();
@@ -251,7 +251,7 @@ Status ReaderBase::RestoreBaseState(const ReaderBaseState& state) {
         debug_string);
   }
   if (work_started_ > work_finished_) {
-#ifdef __ANDROID__
+#if defined(__ANDROID__) || (__EMSCRIPTEN__)
     const string debug_string = "<debug state not available>";
 #else
     const string debug_string = state.DebugString();
diff --git a/tensorflow/core/framework/reader_interface.h b/tensorflow/core/framework/reader_interface.h
index f894acbe1d5119081f088bb091049342b881f340..e47644cb8f27af63e1a96d9c3d44d84e8a55224d 100644
--- a/tensorflow/core/framework/reader_interface.h
+++ b/tensorflow/core/framework/reader_interface.h
@@ -76,7 +76,7 @@ class ReaderInterface : public ResourceBase {
   // Note: Must Reset on error.
   virtual Status RestoreState(const string& state) = 0;
 
-  string DebugString() override { return "a reader"; }
+  string DebugString() const override { return "a reader"; }
 
  protected:
   virtual ~ReaderInterface() {}
diff --git a/tensorflow/core/framework/rendezvous_test.cc b/tensorflow/core/framework/rendezvous_test.cc
index de148f0bd3474421c1361cf7ae4aa681107aa883..7a777f064c7b517de9f9c1c14648e5ff32ca4b5e 100644
--- a/tensorflow/core/framework/rendezvous_test.cc
+++ b/tensorflow/core/framework/rendezvous_test.cc
@@ -278,6 +278,12 @@ class DummyDeviceContext : public DeviceContext {
   ~DummyDeviceContext() override {}
   int stream_id() const { return stream_id_; }
 
+  void CopyTensorInSameDevice(const Tensor* input_tensor, Device* device,
+                              Tensor* output_tensor,
+                              StatusCallback done) const override {
+    done(Status::OK());
+  }
+
  private:
   const int stream_id_;
 };
diff --git a/tensorflow/core/framework/resource_handle.h b/tensorflow/core/framework/resource_handle.h
index db213669a3f30b3b5587a4d587e2bfb039dacdda..d1f6771bf31e492ac47eb260c7d701d7a6c97b36 100644
--- a/tensorflow/core/framework/resource_handle.h
+++ b/tensorflow/core/framework/resource_handle.h
@@ -67,6 +67,11 @@ class ResourceHandle {
 
   string DebugString() const;
 
+  // GUID for anonymous resources. Resources with this shared_name will have
+  // their shared_name replaced with a GUID at creation time
+  static constexpr const char* ANONYMOUS_NAME =
+      "cd2c89b7-88b7-44c8-ad83-06c2a9158347";
+
  public:
   string device_;
   string container_;
diff --git a/tensorflow/core/framework/resource_mgr.cc b/tensorflow/core/framework/resource_mgr.cc
index 9f3204ab96050a1cc06ab3052741f0044369b83e..6a94ff6642e6f50655083756ae24a2c2b97bc7ec 100644
--- a/tensorflow/core/framework/resource_mgr.cc
+++ b/tensorflow/core/framework/resource_mgr.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <atomic>
+
 #include "tensorflow/core/framework/resource_mgr.h"
 
 #include "tensorflow/core/framework/device_attributes.pb.h"
@@ -26,6 +28,10 @@ limitations under the License.
 #include "tensorflow/core/platform/demangle.h"
 
 namespace tensorflow {
+
+// Used to generate unique names for anonymous variables
+static std::atomic<int64> current_id_;
+
 ResourceHandle MakeResourceHandle(OpKernelContext* ctx, const string& container,
                                   const string& name,
                                   const TypeIndex& type_index) {
@@ -38,7 +44,11 @@ ResourceHandle MakeResourceHandle(OpKernelContext* ctx, const string& container,
     actual_container = ctx->resource_manager()->default_container();
   }
   result.set_container(actual_container);
-  result.set_name(name);
+  if (name == ResourceHandle::ANONYMOUS_NAME) {
+    result.set_name(strings::StrCat("_AnonymousVar", current_id_.fetch_add(1)));
+  } else {
+    result.set_name(name);
+  }
   result.set_hash_code(type_index.hash_code());
   result.set_maybe_type_name(type_index.name());
   return result;
diff --git a/tensorflow/core/framework/resource_mgr.h b/tensorflow/core/framework/resource_mgr.h
index 3195cd2e9dccaaf26ac6111a78acdb7278ea92e7..da547d5829f846ae87857c410d731bcc9457cd3b 100644
--- a/tensorflow/core/framework/resource_mgr.h
+++ b/tensorflow/core/framework/resource_mgr.h
@@ -77,7 +77,7 @@ namespace tensorflow {
 class ResourceBase : public core::RefCounted {
  public:
   // Returns a debug string for *this.
-  virtual string DebugString() = 0;
+  virtual string DebugString() const = 0;
 
   // Returns memory used by this resource.
   virtual int64 MemoryUsed() const { return 0; }
@@ -89,9 +89,17 @@ class ScopedStepContainer {
   // step_id: the unique ID of this step. Doesn't have to be sequential, just
   // has to be unique.
   // cleanup: callback to delete a container of this name.
+  // prefix: optional string prefix to disambiguate step containers.
   ScopedStepContainer(const int64 step_id,
                       std::function<void(const string&)> cleanup)
       : name_(strings::StrCat("__per_step_", step_id)), cleanup_(cleanup) {}
+
+  ScopedStepContainer(const int64 step_id,
+                      std::function<void(const string&)> cleanup,
+                      const string& prefix)
+      : name_(strings::StrCat("__", prefix, "_per_step_", step_id)),
+        cleanup_(cleanup) {}
+
   ~ScopedStepContainer() { cleanup_(name_); }
 
   const string& name() const { return name_; }
@@ -124,14 +132,14 @@ class ResourceMgr {
   //
   // REQUIRES: std::is_base_of<ResourceBase, T>
   // REQUIRES: resource != nullptr
-  template <typename T>
+  template <typename T, bool use_dynamic_cast = false>
   Status Lookup(const string& container, const string& name,
                 T** resource) const TF_MUST_USE_RESULT;
 
   // Similar to Lookup, but looks up multiple resources at once, with only a
   // single lock acquisition.  If containers_and_names[i] is uninitialized
   // then this function does not modify resources[i].
-  template <typename T>
+  template <typename T, bool use_dynamic_cast = false>
   Status LookupMany(absl::Span<std::pair<const string*, const string*> const>
                         containers_and_names,
                     std::vector<std::unique_ptr<T, core::RefCountDeleter>>*
@@ -147,7 +155,7 @@ class ResourceMgr {
   //
   // REQUIRES: std::is_base_of<ResourceBase, T>
   // REQUIRES: resource != nullptr
-  template <typename T>
+  template <typename T, bool use_dynamic_cast = false>
   Status LookupOrCreate(const string& container, const string& name,
                         T** resource,
                         std::function<Status(T**)> creator) TF_MUST_USE_RESULT;
@@ -188,7 +196,7 @@ class ResourceMgr {
   mutable mutex mu_;
   std::unordered_map<string, Container*> containers_ GUARDED_BY(mu_);
 
-  template <typename T>
+  template <typename T, bool use_dynamic_cast = false>
   Status LookupInternal(const string& container, const string& name,
                         T** resource) const
       SHARED_LOCKS_REQUIRED(mu_) TF_MUST_USE_RESULT;
@@ -259,7 +267,7 @@ Status CreateResource(OpKernelContext* ctx, const ResourceHandle& p, T* value);
 //
 // If the lookup is successful, the caller takes the ownership of one ref on
 // `*value`, and must call its `Unref()` method when it has finished using it.
-template <typename T>
+template <typename T, bool use_dynamic_cast = false>
 Status LookupResource(OpKernelContext* ctx, const ResourceHandle& p, T** value);
 
 // Looks up multiple resources pointed by a sequence of resource handles.  If
@@ -429,15 +437,15 @@ Status ResourceMgr::Create(const string& container, const string& name,
   return DoCreate(container, MakeTypeIndex<T>(), name, resource);
 }
 
-template <typename T>
+template <typename T, bool use_dynamic_cast>
 Status ResourceMgr::Lookup(const string& container, const string& name,
                            T** resource) const {
   CheckDeriveFromResourceBase<T>();
   tf_shared_lock l(mu_);
-  return LookupInternal(container, name, resource);
+  return LookupInternal<T, use_dynamic_cast>(container, name, resource);
 }
 
-template <typename T>
+template <typename T, bool use_dynamic_cast>
 Status ResourceMgr::LookupMany(
     absl::Span<std::pair<const string*, const string*> const>
         containers_and_names,
@@ -447,8 +455,9 @@ Status ResourceMgr::LookupMany(
   resources->resize(containers_and_names.size());
   for (size_t i = 0; i < containers_and_names.size(); ++i) {
     T* resource;
-    Status s = LookupInternal(*containers_and_names[i].first,
-                              *containers_and_names[i].second, &resource);
+    Status s = LookupInternal<T, use_dynamic_cast>(
+        *containers_and_names[i].first, *containers_and_names[i].second,
+        &resource);
     if (s.ok()) {
       (*resources)[i].reset(resource);
     }
@@ -456,7 +465,18 @@ Status ResourceMgr::LookupMany(
   return Status::OK();
 }
 
+// Simple wrapper to allow conditional dynamic / static casts.
+template <typename T, bool use_dynamic_cast>
+struct TypeCastFunctor {
+  static T* Cast(ResourceBase* r) { return static_cast<T*>(r); }
+};
+
 template <typename T>
+struct TypeCastFunctor<T, true> {
+  static T* Cast(ResourceBase* r) { return dynamic_cast<T*>(r); }
+};
+
+template <typename T, bool use_dynamic_cast>
 Status ResourceMgr::LookupInternal(const string& container, const string& name,
                                    T** resource) const {
   ResourceBase* found = nullptr;
@@ -464,12 +484,12 @@ Status ResourceMgr::LookupInternal(const string& container, const string& name,
   if (s.ok()) {
     // It's safe to down cast 'found' to T* since
     // typeid(T).hash_code() is part of the map key.
-    *resource = static_cast<T*>(found);
+    *resource = TypeCastFunctor<T, use_dynamic_cast>::Cast(found);
   }
   return s;
 }
 
-template <typename T>
+template <typename T, bool use_dynamic_cast>
 Status ResourceMgr::LookupOrCreate(const string& container, const string& name,
                                    T** resource,
                                    std::function<Status(T**)> creator) {
@@ -478,11 +498,11 @@ Status ResourceMgr::LookupOrCreate(const string& container, const string& name,
   Status s;
   {
     tf_shared_lock l(mu_);
-    s = LookupInternal(container, name, resource);
+    s = LookupInternal<T, use_dynamic_cast>(container, name, resource);
     if (s.ok()) return s;
   }
   mutex_lock l(mu_);
-  s = LookupInternal(container, name, resource);
+  s = LookupInternal<T, use_dynamic_cast>(container, name, resource);
   if (s.ok()) return s;
   TF_RETURN_IF_ERROR(creator(resource));
   s = DoCreate(container, MakeTypeIndex<T>(), name, *resource);
@@ -558,11 +578,12 @@ Status CreateResource(OpKernelContext* ctx, const ResourceHandle& p, T* value) {
   return ctx->resource_manager()->Create(p.container(), p.name(), value);
 }
 
-template <typename T>
+template <typename T, bool use_dynamic_cast>
 Status LookupResource(OpKernelContext* ctx, const ResourceHandle& p,
                       T** value) {
   TF_RETURN_IF_ERROR(internal::ValidateDeviceAndType<T>(ctx, p));
-  return ctx->resource_manager()->Lookup(p.container(), p.name(), value);
+  return ctx->resource_manager()->Lookup<T, use_dynamic_cast>(p.container(),
+                                                              p.name(), value);
 }
 
 template <typename T>
@@ -619,20 +640,31 @@ ResourceHandleOp<T>::ResourceHandleOp(OpKernelConstruction* context)
 
 template <typename T>
 void ResourceHandleOp<T>::Compute(OpKernelContext* ctx) {
-  if (!initialized_.load()) {
-    mutex_lock ml(mutex_);
-    // Checking again to see if another thread has initialized the resource.
+  if (name_ == ResourceHandle::ANONYMOUS_NAME) {
+    AllocatorAttributes attr;
+    attr.set_on_host(true);
+    Tensor handle;
+    OP_REQUIRES_OK(
+        ctx, ctx->allocate_temp(DT_RESOURCE, TensorShape({}), &handle, attr));
+    handle.scalar<ResourceHandle>()() =
+        MakeResourceHandle<T>(ctx, container_, name_);
+    ctx->set_output(0, handle);
+  } else {
     if (!initialized_.load()) {
-      AllocatorAttributes attr;
-      attr.set_on_host(true);
-      OP_REQUIRES_OK(ctx, ctx->allocate_temp(DT_RESOURCE, TensorShape({}),
-                                             &resource_, attr));
-      resource_.scalar<ResourceHandle>()() =
-          MakeResourceHandle<T>(ctx, container_, name_);
-      initialized_.store(true);
+      mutex_lock ml(mutex_);
+      // Checking again to see if another thread has initialized the resource.
+      if (!initialized_.load()) {
+        AllocatorAttributes attr;
+        attr.set_on_host(true);
+        OP_REQUIRES_OK(ctx, ctx->allocate_temp(DT_RESOURCE, TensorShape({}),
+                                               &resource_, attr));
+        resource_.scalar<ResourceHandle>()() =
+            MakeResourceHandle<T>(ctx, container_, name_);
+        initialized_.store(true);
+      }
     }
+    ctx->set_output(0, resource_);
   }
-  ctx->set_output(0, resource_);
 }
 
 template <typename T>
diff --git a/tensorflow/core/framework/resource_mgr_test.cc b/tensorflow/core/framework/resource_mgr_test.cc
index 7c7f0af0ce46abbde5b66facf4d33db47f9773b8..1c785736e60b2f03899924f34a207066582a590e 100644
--- a/tensorflow/core/framework/resource_mgr_test.cc
+++ b/tensorflow/core/framework/resource_mgr_test.cc
@@ -32,7 +32,7 @@ class Resource : public ResourceBase {
   explicit Resource(const string& label) : label_(label) {}
   ~Resource() override {}
 
-  string DebugString() override { return strings::StrCat("R/", label_); }
+  string DebugString() const override { return strings::StrCat("R/", label_); }
 
  private:
   string label_;
@@ -43,7 +43,7 @@ class Other : public ResourceBase {
   explicit Other(const string& label) : label_(label) {}
   ~Other() override {}
 
-  string DebugString() override { return strings::StrCat("O/", label_); }
+  string DebugString() const override { return strings::StrCat("O/", label_); }
 
  private:
   string label_;
@@ -245,7 +245,7 @@ class StubDevice : public DeviceBase {
 // Empty stub resource for testing resource handles.
 class StubResource : public ResourceBase {
  public:
-  string DebugString() override { return ""; }
+  string DebugString() const override { return ""; }
   int value_{0};
 };
 
@@ -305,7 +305,7 @@ TEST(ResourceHandleTest, DifferentDevice) {
 // Other stub resource to test type-checking of resource handles.
 class OtherStubResource : public ResourceBase {
  public:
-  string DebugString() override { return ""; }
+  string DebugString() const override { return ""; }
 };
 
 TEST(ResourceHandleTest, DifferentType) {
diff --git a/tensorflow/core/framework/resource_op_kernel_test.cc b/tensorflow/core/framework/resource_op_kernel_test.cc
index c1e503dc57643d2023d89f317a6c5ff643a3c60b..7a2a87045bf20970a6a996cb9d32b264af0662c7 100644
--- a/tensorflow/core/framework/resource_op_kernel_test.cc
+++ b/tensorflow/core/framework/resource_op_kernel_test.cc
@@ -46,7 +46,7 @@ class StubDevice : public DeviceBase {
 // Stub resource for testing resource op kernel.
 class StubResource : public ResourceBase {
  public:
-  string DebugString() override { return ""; }
+  string DebugString() const override { return ""; }
   int code;
 };
 
diff --git a/tensorflow/core/framework/resource_var.h b/tensorflow/core/framework/resource_var.h
index ff7b3e78a711a717d44e1e2ca307d6fef05243d9..9387b6c23c77dadfd423865b23bc7dc5fdf41672 100644
--- a/tensorflow/core/framework/resource_var.h
+++ b/tensorflow/core/framework/resource_var.h
@@ -20,14 +20,46 @@ limitations under the License.
 
 namespace tensorflow {
 
-// Resource stored by variables in the resource manager
-// (new, resource-style version).
+// Resource stored by variables in the resource manager (new, resource-style
+// version).
+//
+// These variables have a mixed access mode: they can operate on copy-on-write
+// mode (the default) or copy-on-read mode (used only for sparse access).
+//
+// When copy-on-write mode is enabled reading the value of the variable involves
+// grabbing its mutex in shared mode and aliasing the internal tensor as the
+// output of the read operation, increasing its reference count. Writing,
+// conversely, works by, under an exclusive lock, detecting whether there are
+// outstanding aliases of the tensor, using the reference count, copying the
+// tensor if they exist, and writing to either the original or a copy with no
+// outstanding aliases. Sparse operations are not supported in copy-on-write
+// mode.
+//
+// When a variable is accessed sparsely it switches to copy-on-read mode. To
+// switch we need to grab an exclusive lock and might (if there are aliases)
+// need to copy the entire tensor. Once copy-on-read mode is enabled, no tensor
+// is allowed to alias the variable's internal tensor. This means dense reads
+// must return a copy of the variable, done while holding a shared lock. Dense
+// writes do not need to check whether aliases exist, and can always write
+// directly to the buffer without making a copy, while holding an exclusive
+// lock. Sparse reads and sparse writes, on the other hand, can be done under a
+// shared or exclusive mutex (the damage from writes under a shared mutex is
+// limited since no other buffer is allowed to alias the variable's
+// buffer). Using an exclusive mutex disallows concurrent writes and concurrent
+// sparse reads, providing some extra safety at the expense of performance,
+// while shared mutex allow for "hogwild" behavior. Doing sparse writes under a
+// shared mutex prevents them from overlapping with dense writes, which is
+// necessary as dense writes can change the shape the of the tensor.
+//
+// Transitioning a variable from copy-on-read mode to copy-on-write mode is
+// currently not supported. To upgrade a variable from copy-on-write to
+// copy-on-read use `EnsureSparseVariableAccess()`, and then grab the variable's
+// mutex as desired. To access the variable in dense mode grab the mutex either
+// directly or via `MaybeLockVariableInputMutexesInOrder` on all variables being
+// modified and then call `PrepareToUpdateVariable` on them in any order.
 class Var : public ResourceBase {
  public:
   explicit Var(DataType dtype) : tensor_(dtype) {}
-  // Not copyable or movable.
-  Var(const Var&) = delete;
-  Var& operator=(const Var&) = delete;
 
   // When locking multiple variables, the locks must be acquired in order of
   // increasing mu() address.
@@ -35,7 +67,7 @@ class Var : public ResourceBase {
   mutex* mu() { return &mu_; }
   Tensor* tensor() { return &tensor_; }
 
-  string DebugString() override {
+  string DebugString() const override {
     return strings::StrCat(DataTypeString(tensor_.dtype()), "/",
                            tensor_.shape().DebugString());
   }
@@ -48,11 +80,19 @@ class Var : public ResourceBase {
   bool is_initialized = false;  // GUARDED_BY(mu_) but annotalysis doesn't like
                                 // it.
 
+  // Also fake-guarded by mu_. Should be set to True whenever any sparse
+  // operation uses the variable. Once this is true no tensor is allowed to
+  // alias the memory of the variable, and we always copy the variable on
+  // reads. This allows sparse operations to happen with only a shared lock if
+  // so desired.
+  std::atomic<bool> copy_on_read_mode{false};
+
  private:
   mutex mu_;
   Tensor tensor_;
 
   ~Var() override {}
+  TF_DISALLOW_COPY_AND_ASSIGN(Var);
 };
 
 }  //  end namespace tensorflow
diff --git a/tensorflow/core/framework/shape_inference.cc b/tensorflow/core/framework/shape_inference.cc
index 4dcc80680ff7c62b31fb266c0f5cd80a9325fe81..18a278f07ff4e5b07061047021a86411e04e2511 100644
--- a/tensorflow/core/framework/shape_inference.cc
+++ b/tensorflow/core/framework/shape_inference.cc
@@ -14,10 +14,10 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/framework/shape_inference.h"
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/node_def.pb_text.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/scanner.h"
@@ -1259,7 +1259,6 @@ bool InferenceContext::RelaxHandleShapesAndMergeTypes(
     return false;
   }
   std::vector<ShapeAndType> new_values(shapes_and_types.size());
-  bool refined = false;
   for (int i = 0; i < shapes_and_types.size(); ++i) {
     const ShapeAndType& existing = (*to_update)[i];
     if (shapes_and_types[i].dtype == existing.dtype) {
@@ -1269,16 +1268,9 @@ bool InferenceContext::RelaxHandleShapesAndMergeTypes(
         return false;
       } else {
         new_values[i].dtype = shapes_and_types[i].dtype;
-        refined = true;
       }
     }
     Relax(existing.shape, shapes_and_types[i].shape, &new_values[i].shape);
-    if (!existing.shape.SameHandle(new_values[i].shape)) {
-      refined = true;
-    }
-  }
-  if (!refined) {
-    return false;
   }
   to_update->swap(new_values);
   return true;
diff --git a/tensorflow/core/framework/shape_inference.h b/tensorflow/core/framework/shape_inference.h
index e3885b7d9e8a3f746d0cc2121dad71221d4ec06b..bf8b633c0137f856932689aed18456e8946eb778 100644
--- a/tensorflow/core/framework/shape_inference.h
+++ b/tensorflow/core/framework/shape_inference.h
@@ -588,9 +588,9 @@ class InferenceContext {
   // position idx with the specified shapes and types. This requires idx to be
   // in the [0, num_inputs) range.
   //
-  // If the relax is successful and any of the new shapes differs from the old
-  // one, or any of the old dtypes was DT_INVALID, store the new shapes and
-  // return true.  Return false otherwise.
+  // If the relax is successful (sizes are the same, old dtypes match new ones
+  // or are DT_INVALID), then store the relaxed shapes and return true.
+  // Return false otherwise.
   //
   // See 'RelaxInput' function for full details and examples.
   bool RelaxInputHandleShapesAndMergeTypes(
diff --git a/tensorflow/core/framework/stats_aggregator.h b/tensorflow/core/framework/stats_aggregator.h
index af53ed0a3ca64aefe310db3b2d07ce6a18afa181..7c960840d7446889bee1ba22cdbb4af072acd53e 100644
--- a/tensorflow/core/framework/stats_aggregator.h
+++ b/tensorflow/core/framework/stats_aggregator.h
@@ -83,7 +83,7 @@ class StatsAggregatorResource : public ResourceBase {
     return stats_aggregator_;
   }
 
-  string DebugString() { return "StatsAggregatorResource"; }
+  string DebugString() const override { return "StatsAggregatorResource"; }
 
  private:
   const std::shared_ptr<StatsAggregator> stats_aggregator_;
diff --git a/tensorflow/core/framework/step_stats.proto b/tensorflow/core/framework/step_stats.proto
index 67cc9e38459a00394c45bc74b5a966e6128b204a..f8cab135aba799d67183f0978ee1166aba533b99 100644
--- a/tensorflow/core/framework/step_stats.proto
+++ b/tensorflow/core/framework/step_stats.proto
@@ -77,6 +77,8 @@ message NodeExecStats {
 message DeviceStepStats {
   string device = 1;
   repeated NodeExecStats node_stats = 2;
+  // Its key is thread id.
+  map<uint32, string> thread_names = 3;
 }
 
 message StepStats {
diff --git a/tensorflow/core/framework/tensor.cc b/tensorflow/core/framework/tensor.cc
index 7e841489eb35d4ec3d18fe255472107ef9d60efe..ecbffecd66d691e3e1b1722625381665ce61ffcc 100644
--- a/tensorflow/core/framework/tensor.cc
+++ b/tensorflow/core/framework/tensor.cc
@@ -42,6 +42,7 @@ limitations under the License.
 #include "tensorflow/core/framework/variant_tensor_data.h"
 #include "tensorflow/core/lib/core/coding.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -650,14 +651,21 @@ void Tensor::CopyFromInternal(const Tensor& other, const TensorShape& shape) {
   }
 }
 
-void Tensor::UnsafeCopyFromInternal(const Tensor& other, DataType dtype,
-                                    const TensorShape& shape) {
+Status Tensor::BitcastFrom(const Tensor& other, DataType dtype,
+                           const TensorShape& shape) {
   int in_size = DataTypeSize(other.dtype());
   int out_size = DataTypeSize(dtype);
-  CHECK_NE(in_size, 0);
-  CHECK_NE(out_size, 0);
-  CHECK_EQ(shape.num_elements() * out_size,
-           other.shape().num_elements() * in_size);
+  if (in_size == 0) {
+    return errors::InvalidArgument("other tensor has zero-sized data type");
+  }
+  if (out_size == 0) {
+    return errors::InvalidArgument("specified output type is zero-sized");
+  }
+  if (shape.num_elements() * out_size !=
+      other.shape().num_elements() * in_size) {
+    return errors::InvalidArgument(
+        "input and output shapes/data type sizes are not compatible");
+  }
   shape_ = shape;
   shape_.set_data_type(dtype);
   if (buf_ != other.buf_) {
@@ -665,6 +673,7 @@ void Tensor::UnsafeCopyFromInternal(const Tensor& other, DataType dtype,
     buf_ = other.buf_;
     RefIfNonNull(buf_);
   }
+  return Status::OK();
 }
 
 // Notice that buf_ either points to a regular TensorBuffer or a SubBuffer.
@@ -932,10 +941,18 @@ namespace {
 // logic is so simple we can just replicate it here, where it is close to its
 // usage and easy to change later. And there's the extra benefit of not
 // accessing an 'internal' namespace.
-inline const strings::AlphaNum& PrintOneElement(const strings::AlphaNum& a) {
+inline const strings::AlphaNum& PrintOneElement(const strings::AlphaNum& a,
+                                                bool print_v2) {
   return a;
 }
-inline float PrintOneElement(const Eigen::half& h) {
+inline string PrintOneElement(const string& a, bool print_v2) {
+  if (print_v2) {
+    return "\"" + str_util::CEscape(a) + "\"";
+  } else {
+    return str_util::CEscape(a);
+  }
+}
+inline float PrintOneElement(const Eigen::half& h, bool print_v2) {
   return static_cast<float>(h);
 }
 
@@ -957,7 +974,7 @@ void PrintOneDim(int dim_index, const gtl::InlinedVector<int64, 4>& shape,
         return;
       }
       if (i > 0) strings::StrAppend(result, " ");
-      strings::StrAppend(result, PrintOneElement(data[(*data_index)++]));
+      strings::StrAppend(result, PrintOneElement(data[(*data_index)++], false));
     }
     return;
   }
@@ -1000,7 +1017,7 @@ void PrintOneDimV2(int dim_index, const gtl::InlinedVector<int64, 4>& shape,
   // We have recursed beyond all the dimensions into a single element
   // of the tensor.
   if (dim_index == num_dims) {
-    strings::StrAppend(result, PrintOneElement(data[data_index]));
+    strings::StrAppend(result, PrintOneElement(data[data_index], true));
     return;
   }
 
@@ -1048,7 +1065,7 @@ string SummarizeArray(int64 limit, int64 num_elts,
   if (shape.empty()) {
     for (int64 i = 0; i < limit; ++i) {
       if (i > 0) strings::StrAppend(&ret, " ");
-      strings::StrAppend(&ret, PrintOneElement(array[i]));
+      strings::StrAppend(&ret, PrintOneElement(array[i], print_v2));
     }
     if (num_elts > limit) strings::StrAppend(&ret, "...");
     return ret;
@@ -1123,6 +1140,9 @@ string Tensor::SummarizeValue(int64 max_entries, bool print_v2) const {
       // will emit "1 0..." which is more compact.
       return SummarizeArray<bool>(limit, num_elts, shape_, data, print_v2);
       break;
+    case DT_STRING:
+      return SummarizeArray<string>(limit, num_elts, shape_, data, print_v2);
+      break;
     default: {
       // All irregular cases
       string ret;
@@ -1134,9 +1154,6 @@ string Tensor::SummarizeValue(int64 max_entries, bool print_v2) const {
       for (size_t i = 0; i < limit; ++i) {
         if (i > 0) strings::StrAppend(&ret, " ");
         switch (dtype()) {
-          case DT_STRING:
-            strings::StrAppend(&ret, str_util::CEscape(flat<string>()(i)));
-            break;
           case DT_VARIANT: {
             const Variant& v = flat<Variant>()(i);
             strings::StrAppend(&ret, v.DebugString());
@@ -1166,10 +1183,15 @@ bool Tensor::SharesBufferWith(const Tensor& b) const {
          buf_->root_buffer() == b.buf_->root_buffer();
 }
 
-string Tensor::DebugString() const {
+string Tensor::DebugString(int num_values) const {
   return strings::StrCat("Tensor<type: ", DataTypeString(dtype()),
                          " shape: ", shape().DebugString(),
-                         " values: ", SummarizeValue(3), ">");
+                         " values: ", SummarizeValue(num_values), ">");
+}
+
+string Tensor::DeviceSafeDebugString() const {
+  return strings::StrCat("Tensor<type: ", DataTypeString(dtype()),
+                         " shape: ", shape().DebugString(), ">");
 }
 
 void Tensor::FillDescription(TensorDescription* description) const {
diff --git a/tensorflow/core/framework/tensor.h b/tensorflow/core/framework/tensor.h
index 6e03cf9f6f47c89289ffaec507f56d8c734e52a9..6454cb818f2e3e237ca4bc49070399f3fff31dd7 100644
--- a/tensorflow/core/framework/tensor.h
+++ b/tensorflow/core/framework/tensor.h
@@ -45,6 +45,7 @@ class TensorBuffer;
 class TensorCApi;
 class TensorDescription;
 class TensorProto;
+class Var;
 
 namespace batch_util {
 Status CopyElementToSlice(Tensor element, Tensor* parent, int64 index);
@@ -525,7 +526,16 @@ class Tensor {
   string SummarizeValue(int64 max_entries, bool print_v2 = false) const;
 
   /// A human-readable summary of the tensor suitable for debugging.
-  string DebugString() const;
+  // `num_values` is the number of actual data values in the tensor
+  // included in the message. If the tensor might be resident in
+  // GPU/TPU memory use DeviceSafeDebugString instead.
+  string DebugString(int num_values) const;
+  string DebugString() const { return DebugString(3); }
+
+  // Variant of DebugString() that should be used for possibly non-CPU tensors.
+  // If the tensor is not resident on CPU, we can't read its values as
+  // DebugString() does.
+  string DeviceSafeDebugString() const;
 
   /// Fill in the `TensorDescription` proto with metadata about the
   /// tensor that is useful for monitoring and debugging.
@@ -544,12 +554,37 @@ class Tensor {
   /// REQUIRES: `DataTypeCanUseMemcpy(dtype())`.
   StringPiece tensor_data() const;
 
-  /// Copy the other tensor into this tensor and reshape it and reinterpret the
-  /// buffer's datatype.
+  /// Copy the other tensor into this tensor, reshape it and reinterpret the
+  /// buffer's datatype. If Status::OK() is returned, the two tensors now share
+  /// the same underlying storage.
+  ///
+  /// This call requires that the `other` tensor and the given type and shape
+  /// are "compatible" (i.e. they occupy the same number of bytes).
+  ///
+  /// Specifically:
+  ///
+  /// shape.num_elements() * DataTypeSize(type)
+  ///
+  /// must equal
+  ///
+  /// other.num_elements() * DataTypeSize(other.dtype())
+  ///
+  /// In addition, this function requires:
+  ///   * DataTypeSize(other.dtype()) != 0
+  ///   * DataTypeSize(type) != 0
+  ///
+  /// If any of the requirements are not met, errors::InvalidArgument is
+  /// returned.
+  Status BitcastFrom(const Tensor& other, DataType dtype,
+                     const TensorShape& shape);
+
+  /// Like BitcastFrom, but CHECK fails if any preconditions are not met.
   ///
-  /// This tensor shares other's underlying storage.
-  void UnsafeCopyFromInternal(const Tensor&, DataType dtype,
-                              const TensorShape&);
+  /// Deprecated. Use BitcastFrom instead and check the returned Status.
+  void UnsafeCopyFromInternal(const Tensor& other, DataType dtype,
+                              const TensorShape& shape) {
+    TF_CHECK_OK(BitcastFrom(other, dtype, shape));
+  }
 
  private:
   // Returns true if the refcount on buf_ and any possible underlying root
@@ -581,14 +616,19 @@ class Tensor {
   friend class XlaTensor;             // For access to RefCountIsOne().
   friend class XlaTensorBuffer;  // For access to the private constructor taking
                                  // the buffer
+  friend class Var;
   template <typename Device, typename T>
   friend class AssignVariableOp;  // For access to RefCountIsOne().
   template <typename Device, typename T>
   friend Status PrepareToUpdateVariable(
-      OpKernelContext* ctx, Tensor* tensor);  // For access to RefCountIsOne().
+      OpKernelContext* ctx, Tensor* tensor,
+      bool copy_on_read_mode);  // For access to RefCountIsOne().
+  template <typename Device, typename T>
+  friend Status EnsureSparseVariableAccess(
+      OpKernelContext* ctx, Var* var);  // For access to RefCountIsOne().
   friend Status batch_util::CopyElementToSlice(
       Tensor element, Tensor* parent,
-      int64 index);                // For access to RefCountIsOne().
+      int64 index);  // For access to RefCountIsOne().
   friend Status batch_util::MaybeMoveSliceToElement(
       Tensor* parent, Tensor* element,
       int64 index);  // For access to RefCountIsOne().
diff --git a/tensorflow/core/framework/tensor_shape.cc b/tensorflow/core/framework/tensor_shape.cc
index 5e0b976e1736dff6b8a18c7b801cb6d1ef500f11..7158f1925f65483c3087a6bfc480e5647eacb5d6 100644
--- a/tensorflow/core/framework/tensor_shape.cc
+++ b/tensorflow/core/framework/tensor_shape.cc
@@ -15,8 +15,8 @@ limitations under the License.
 
 #include "tensorflow/core/framework/tensor_shape.h"
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
@@ -136,6 +136,89 @@ template <class Shape>
 TensorShapeBase<Shape>::TensorShapeBase(gtl::ArraySlice<int64> dim_sizes) {
   set_tag(REP16);
   set_data_type(DT_INVALID);
+  InitDims(dim_sizes);
+}
+
+// Returns true iff partial is true and val is < 0.
+// REQUIRES: val < kMaxRep16
+// REQUIRES: partial || val >= 0
+static inline bool Set16(bool partial, uint16* dst, int dim, int64 val) {
+  if (partial) {
+    if (val < 0) {
+      dst[dim] = std::numeric_limits<uint16>::max();
+      return true;
+    }
+  } else {
+    CHECK_GE(val, 0);
+  }
+  dst[dim] = val;
+  return false;
+}
+
+template <class Shape>
+void TensorShapeBase<Shape>::InitDims(gtl::ArraySlice<int64> dim_sizes) {
+  DCHECK_EQ(tag(), REP16);
+
+  // Allow sizes that are under kint64max^0.25 so that 4-way multiplication
+  // below cannot overflow.
+  static const uint64 kMaxSmall = 0xd744;
+  static_assert(kMaxSmall * kMaxSmall * kMaxSmall * kMaxSmall <= kint64max,
+                "bad overflow check");
+  bool large_size = false;
+  for (auto s : dim_sizes) {
+    if (s > kMaxSmall) {
+      large_size = true;
+      break;
+    }
+  }
+
+  if (!large_size) {
+    // Every size fits in 16 bits; use fast-paths for dims in {1,2,3,4}.
+    uint16* dst = as16()->dims_;
+    switch (dim_sizes.size()) {
+      case 1: {
+        set_ndims_byte(1);
+        const int64 size = dim_sizes[0];
+        const bool neg = Set16(kIsPartial, dst, 0, size);
+        set_num_elements(neg ? -1 : size);
+        return;
+      }
+      case 2: {
+        set_ndims_byte(2);
+        const int64 size0 = dim_sizes[0];
+        const int64 size1 = dim_sizes[1];
+        bool neg = Set16(kIsPartial, dst, 0, size0);
+        neg |= Set16(kIsPartial, dst, 1, size1);
+        set_num_elements(neg ? -1 : (size0 * size1));
+        return;
+      }
+      case 3: {
+        set_ndims_byte(3);
+        const int64 size0 = dim_sizes[0];
+        const int64 size1 = dim_sizes[1];
+        const int64 size2 = dim_sizes[2];
+        bool neg = Set16(kIsPartial, dst, 0, size0);
+        neg |= Set16(kIsPartial, dst, 1, size1);
+        neg |= Set16(kIsPartial, dst, 2, size2);
+        set_num_elements(neg ? -1 : (size0 * size1 * size2));
+        return;
+      }
+      case 4: {
+        set_ndims_byte(4);
+        const int64 size0 = dim_sizes[0];
+        const int64 size1 = dim_sizes[1];
+        const int64 size2 = dim_sizes[2];
+        const int64 size3 = dim_sizes[3];
+        bool neg = Set16(kIsPartial, dst, 0, size0);
+        neg |= Set16(kIsPartial, dst, 1, size1);
+        neg |= Set16(kIsPartial, dst, 2, size2);
+        neg |= Set16(kIsPartial, dst, 3, size3);
+        set_num_elements(neg ? -1 : (size0 * size1 * size2 * size3));
+        return;
+      }
+    }
+  }
+
   set_ndims_byte(0);
   set_num_elements(1);
   for (int64 s : dim_sizes) {
diff --git a/tensorflow/core/framework/tensor_shape.h b/tensorflow/core/framework/tensor_shape.h
index 625d88ec1bdcdd9765dd64b09a1bad51f7fa3370..3473a441f2cdcc9b6932fcc1e78071ab8b7fa1fd 100644
--- a/tensorflow/core/framework/tensor_shape.h
+++ b/tensorflow/core/framework/tensor_shape.h
@@ -256,6 +256,7 @@ class TensorShapeBase : public TensorShapeRep {
 
  private:
   void RecomputeNumElements();
+  void InitDims(gtl::ArraySlice<int64> dim_sizes);
 
   // True for PartialTensorShape, false for TensorShape
   static constexpr bool kIsPartial =
diff --git a/tensorflow/core/framework/tensor_shape_test.cc b/tensorflow/core/framework/tensor_shape_test.cc
index 6329aa6d8edf3795ed8018b7802661749683fe41..d25652ce81815e636b8f1a188171eec4cedb9689 100644
--- a/tensorflow/core/framework/tensor_shape_test.cc
+++ b/tensorflow/core/framework/tensor_shape_test.cc
@@ -684,6 +684,15 @@ static std::vector<int64> MakeSizes(int arg) {
   return sizes;
 }
 
+static void BM_TensorShape_Init(int iters, int arg) {
+  auto sizes = MakeSizes(arg);
+  while (--iters > 0) {
+    TensorShape shape(sizes);
+    tensorflow::testing::DoNotOptimize(shape.num_elements());
+  }
+}
+BENCHMARK(BM_TensorShape_Init)->Arg(0)->Arg(1)->Arg(2)->Arg(3)->Arg(4);
+
 static void BM_TensorShape_Assign(int iters, int arg) {
   TensorShape s(MakeSizes(arg));
   while (--iters > 0) {
diff --git a/tensorflow/core/framework/tensor_test.cc b/tensorflow/core/framework/tensor_test.cc
index 713f91fe04c6fe498209d88193f6fbb1729ec57c..d4aed387610579dc02a7566fdda44d042d203c35 100644
--- a/tensorflow/core/framework/tensor_test.cc
+++ b/tensorflow/core/framework/tensor_test.cc
@@ -1370,7 +1370,7 @@ TEST(SummarizeValue, STRING) {
   EXPECT_EQ("one two three four five", x.SummarizeValue(16));
   x = MkTensor<string>(DT_STRING, TensorShape({5, 1, 5}),
                        {"one", "two", "three", "four", "five"});
-  EXPECT_EQ("one two three four five one...", x.SummarizeValue(6));
+  EXPECT_EQ("[[one two three four five]][[one...]]...", x.SummarizeValue(6));
 }
 
 TEST(SummarizeValue, INT32_PRINT_V2) {
@@ -1423,11 +1423,16 @@ TEST(SummarizeValue, BOOL_PRINT_V2) {
 TEST(SummarizeValue, STRING_PRINT_V2) {
   Tensor x = MkTensor<string>(DT_STRING, TensorShape({5}),
                               {"one", "two", "three", "four", "five"});
-  EXPECT_EQ("[one two three four five]", x.SummarizeValue(16, true));
-  EXPECT_EQ("[one two three four five]", x.SummarizeValue(-1, true));
-  x = MkTensor<string>(DT_STRING, TensorShape({5, 1, 5}),
+  EXPECT_EQ("[\"one\" \"two\" \"three\" \"four\" \"five\"]",
+            x.SummarizeValue(16, true));
+  EXPECT_EQ("[\"one\" \"two\" \"three\" \"four\" \"five\"]",
+            x.SummarizeValue(-1, true));
+  EXPECT_EQ("[\"one\" \"two\" ... \"four\" \"five\"]",
+            x.SummarizeValue(2, true));
+  x = MkTensor<string>(DT_STRING, TensorShape({2, 2}),
                        {"one", "two", "three", "four", "five"});
-  EXPECT_EQ("[one two three four five one...]", x.SummarizeValue(6, true));
+  EXPECT_EQ("[[\"one\" \"two\"]\n [\"three\" \"four\"]]",
+            x.SummarizeValue(16, true));
 }
 
 void BM_CreateAndDestroy(int iters) {
diff --git a/tensorflow/core/framework/tensor_testutil.h b/tensorflow/core/framework/tensor_testutil.h
index 31630028516a4f7896986220f4ff0bd8f09fd37a..b58292b3b0225e6f2df7710347019a1c6d7bc512 100644
--- a/tensorflow/core/framework/tensor_testutil.h
+++ b/tensorflow/core/framework/tensor_testutil.h
@@ -206,7 +206,7 @@ struct Expector<T, true> {
     const T* b = y.flat<T>().data();
     for (int i = 0; i < size; ++i) {
       EXPECT_TRUE(Near(a[i], b[i], abs_err))
-          << "a = " << a[i] << " b = " << b << " index = " << i;
+          << "a = " << a[i] << " b = " << b[i] << " index = " << i;
     }
   }
 };
diff --git a/tensorflow/core/framework/tensor_util.cc b/tensorflow/core/framework/tensor_util.cc
index 65f6dc1c00b5123287212eae39dc607ad8f68e29..65fb72145f20012ed5217bddcafdf74bfd268cd0 100644
--- a/tensorflow/core/framework/tensor_util.cc
+++ b/tensorflow/core/framework/tensor_util.cc
@@ -15,10 +15,15 @@ limitations under the License.
 
 #include "tensorflow/core/framework/tensor_util.h"
 
+#include <cmath>
 #include <vector>
+
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/variant.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/tensor_coding.h"
+#include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 namespace tensor {
@@ -37,10 +42,10 @@ Tensor DeepCopy(const Tensor& other) {
              other_data.size());
     }
   } else if (other.dtype() == DT_STRING) {
-    tmp.flat<string>() = other.flat<string>();
+    tmp.unaligned_flat<string>() = other.unaligned_flat<string>();
   } else {
     CHECK_EQ(DT_VARIANT, other.dtype());
-    tmp.flat<Variant>() = other.flat<Variant>();
+    tmp.unaligned_flat<Variant>() = other.unaligned_flat<Variant>();
   }
   return tmp;
 }
@@ -175,7 +180,175 @@ void SetTensorProtoShape(std::vector<size_t> shape,
     shape_proto->mutable_dim()->Add()->set_size(dim);
   }
 }
+
+template <typename T>
+bool CompressTensorContent(float min_compression_ratio,
+                           const TensorShape& shape, TensorProto* tensor) {
+  using TypeHelper = internal::TensorProtoHelper<T>;
+  using FieldType = typename internal::TensorProtoHelper<T>::FieldType;
+  const int64 num_tensor_values = shape.num_elements();
+  const int64 num_bytes = tensor->tensor_content().size();
+  const int64 num_raw_values = num_bytes / sizeof(T);
+  if (num_raw_values != num_tensor_values) {
+    // Invalid or too small.
+    return false;
+  }
+  int64 last_offset = num_bytes - 1;
+  int64 prev_offset = last_offset - sizeof(T);
+  // Inspect individual raw bytes sizeof(T) bytes apart in adjacent elements,
+  // starting from the end, to find the last pair of elements that are not
+  // identical.
+  while (prev_offset >= 0) {
+    if (tensor->tensor_content()[prev_offset] !=
+        tensor->tensor_content()[last_offset]) {
+      break;
+    }
+    --last_offset;
+    --prev_offset;
+  }
+  // Round up to the next whole number of element of type T.
+  const int64 new_num_values = last_offset / sizeof(T) + 1;
+  if (new_num_values * sizeof(FieldType) >
+      static_cast<int64>(num_bytes / min_compression_ratio)) {
+    return false;
+  }
+  // Copy values to truncated repeated field.
+  if (sizeof(FieldType) == sizeof(T)) {
+    FieldType* dst_ptr =
+        TypeHelper::AppendUninitialized(new_num_values, tensor);
+    port::CopySubrangeToArray(tensor->tensor_content(), 0,
+                              new_num_values * sizeof(T),
+                              reinterpret_cast<char*>(dst_ptr));
+    tensor->clear_tensor_content();
+  } else if (sizeof(T) > 1) {
+    // Copy raw bytes to temp array first, then cast.
+    gtl::InlinedVector<T, 64> tmp(new_num_values);
+    port::CopySubrangeToArray(tensor->tensor_content(), 0,
+                              new_num_values * sizeof(T),
+                              reinterpret_cast<char*>(tmp.data()));
+    tensor->clear_tensor_content();
+    TypeHelper::AddValues(tmp.data(), tmp.data() + tmp.size(), tensor);
+  } else {
+    // Copy and cast, one byte at a time.
+    for (int64 i = 0; i < new_num_values; ++i) {
+      char c = tensor->tensor_content()[i];
+      TypeHelper::AddValue(static_cast<FieldType>(c), tensor);
+    }
+    tensor->clear_tensor_content();
+  }
+  return true;
+}
+
+template <typename T>
+inline bool PackedValuesNotEqual(T a, T b) {
+  return a != b;
+}
+template <>
+inline bool PackedValuesNotEqual(float a, float b) {
+  return reinterpret_cast<int32_t&>(a) != reinterpret_cast<int32_t&>(b);
+}
+template <>
+inline bool PackedValuesNotEqual(double a, double b) {
+  return reinterpret_cast<int64_t&>(a) != reinterpret_cast<int64_t&>(b);
+}
+
+template <typename T>
+bool CompressRepeatedField(float min_compression_ratio,
+                           const TensorShape& shape, TensorProto* tensor) {
+  using TypeHelper = internal::TensorProtoHelper<T>;
+  using FieldType = typename internal::TensorProtoHelper<T>::FieldType;
+  const int64 num_tensor_values = shape.num_elements();
+  const int64 num_proto_values = TypeHelper::NumValues(*tensor);
+  if (num_proto_values != num_tensor_values) {
+    // Already compressed or invalid.
+    return false;
+  }
+  T prev_value = TypeHelper::GetValue(num_proto_values - 1, *tensor);
+  int64 last_index = 0;
+  for (int64 i = num_proto_values - 2; i >= 0 && last_index == 0; --i) {
+    const T cur_value = TypeHelper::GetValue(i, *tensor);
+    if (PackedValuesNotEqual(cur_value, prev_value)) {
+      last_index = i + 1;
+    }
+    prev_value = cur_value;
+  }
+  const int64 num_truncated_proto_values = last_index + 1;
+  const int64 num_bytes_as_field =
+      num_truncated_proto_values * sizeof(FieldType);
+  const int64 num_bytes_as_tensor_content = num_tensor_values * sizeof(T);
+  const int64 num_bytes_before = num_proto_values * sizeof(FieldType);
+  if (std::min(num_bytes_as_field, num_bytes_as_tensor_content) >
+      static_cast<int64>(num_bytes_before / min_compression_ratio)) {
+    return false;
+  }
+  if (num_bytes_as_field <= num_bytes_as_tensor_content) {
+    TypeHelper::Truncate(num_truncated_proto_values, tensor);
+  } else {
+    gtl::InlinedVector<T, 64> tmp(num_tensor_values);
+    TypeHelper::CopyValues(tmp.begin(), *tensor);
+    TypeHelper::Truncate(0, tensor);
+    port::CopyFromArray(tensor->mutable_tensor_content(),
+                        reinterpret_cast<const char*>(tmp.data()),
+                        num_bytes_as_tensor_content);
+  }
+  return true;
+}
+
+template <typename T>
+bool CompressTensorProtoInPlaceImpl(int64 min_num_elements,
+                                    float min_compression_ratio,
+                                    TensorProto* tensor) {
+  const TensorShape shape(tensor->tensor_shape());
+  const int64 num_tensor_values = shape.num_elements();
+  if (num_tensor_values < min_num_elements) {
+    return false;
+  }
+  if (tensor->tensor_content().empty()) {
+    return CompressRepeatedField<T>(min_compression_ratio, shape, tensor);
+  } else {
+    return CompressTensorContent<T>(min_compression_ratio, shape, tensor);
+  }
+  return true;
+}
+
 }  // namespace internal
 
+#define HANDLE_COMPRESS_CASE(TF_TYPE)                                  \
+  case TF_TYPE:                                                        \
+    return internal::CompressTensorProtoInPlaceImpl<                   \
+        EnumToDataType<TF_TYPE>::Type>(min_num_elements,               \
+                                       min_compression_ratio, tensor); \
+    break
+
+bool CompressTensorProtoInPlace(int64 min_num_elements,
+                                float min_compression_ratio,
+                                TensorProto* tensor) {
+  switch (tensor->dtype()) {
+    HANDLE_COMPRESS_CASE(DT_FLOAT);
+    HANDLE_COMPRESS_CASE(DT_DOUBLE);
+    HANDLE_COMPRESS_CASE(DT_UINT8);
+    HANDLE_COMPRESS_CASE(DT_INT8);
+    HANDLE_COMPRESS_CASE(DT_UINT16);
+    HANDLE_COMPRESS_CASE(DT_INT16);
+    HANDLE_COMPRESS_CASE(DT_UINT32);
+    HANDLE_COMPRESS_CASE(DT_INT32);
+    HANDLE_COMPRESS_CASE(DT_UINT64);
+    HANDLE_COMPRESS_CASE(DT_INT64);
+    HANDLE_COMPRESS_CASE(DT_BOOL);
+    HANDLE_COMPRESS_CASE(DT_QUINT8);
+    HANDLE_COMPRESS_CASE(DT_QINT8);
+    HANDLE_COMPRESS_CASE(DT_QUINT16);
+    HANDLE_COMPRESS_CASE(DT_QINT16);
+    HANDLE_COMPRESS_CASE(DT_QINT32);
+    // TODO(rmlarsen): Add support for complex and half float types.
+    //    HANDLE_COMPRESS_CASE(DT_HALF);
+    //    HANDLE_COMPRESS_CASE(DT_BFLOAT16);
+    default:
+      return false;
+  }
+}
+
+#undef HANDLE_COMPRESS_CASE
+
 }  // namespace tensor
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/tensor_util.h b/tensorflow/core/framework/tensor_util.h
index a7cf600bab9b2d260277b682946467e9c43f745c..6b7a559a8e43db7083a5b44070c3841db863fec4 100644
--- a/tensorflow/core/framework/tensor_util.h
+++ b/tensorflow/core/framework/tensor_util.h
@@ -16,11 +16,14 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_FRAMEWORK_TENSOR_UTIL_H_
 #define TENSORFLOW_CORE_FRAMEWORK_TENSOR_UTIL_H_
 
+#include <algorithm>
+#include <vector>
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/types.h"
 
-#include <vector>
 namespace tensorflow {
 namespace tensor {
 
@@ -60,84 +63,129 @@ namespace internal {
 void SetTensorProtoShape(std::vector<size_t> shape,
                          TensorShapeProto* shape_proto);
 
-// Defines value type dependent methods to manipulate `TensorProto`.
-// Class specializations has to define following methods:
-//   static DataType GetDataType()
-//   static void AddValue(Type value, TensorProto* proto)
 template <typename Type>
-class TensorProtoHelper : public std::false_type {};
+class TensorProtoFieldHelper : public std::false_type {};
+
+#define DEFINE_PROTO_FIELD_HELPER(TYPE, FIELDNAME)                            \
+  template <>                                                                 \
+  class TensorProtoFieldHelper<TYPE> : public std::true_type {                \
+   public:                                                                    \
+    typedef decltype(                                                         \
+        std::declval<TensorProto>().FIELDNAME##_val(0)) FieldType;            \
+    typedef decltype(                                                         \
+        std::declval<TensorProto>().FIELDNAME##_val()) RepeatedFieldType;     \
+    typedef decltype(std::declval<TensorProto>().mutable_##FIELDNAME##_val()) \
+        MutableRepeatedFieldType;                                             \
+    static MutableRepeatedFieldType GetMutableField(TensorProto* proto) {     \
+      return proto->mutable_##FIELDNAME##_val();                              \
+    }                                                                         \
+    static RepeatedFieldType& GetField(const TensorProto& proto) {            \
+      return proto.FIELDNAME##_val();                                         \
+    }                                                                         \
+  }
 
-template <>
-class TensorProtoHelper<string> : public std::true_type {
+DEFINE_PROTO_FIELD_HELPER(float, float);
+DEFINE_PROTO_FIELD_HELPER(double, double);
+DEFINE_PROTO_FIELD_HELPER(int8, int);
+DEFINE_PROTO_FIELD_HELPER(uint8, int);
+DEFINE_PROTO_FIELD_HELPER(int16, int);
+DEFINE_PROTO_FIELD_HELPER(uint16, int);
+DEFINE_PROTO_FIELD_HELPER(int32, int);
+DEFINE_PROTO_FIELD_HELPER(uint32, uint32);
+DEFINE_PROTO_FIELD_HELPER(int64, int64);
+DEFINE_PROTO_FIELD_HELPER(uint64, uint64);
+DEFINE_PROTO_FIELD_HELPER(bool, bool);
+DEFINE_PROTO_FIELD_HELPER(qint8, int);
+DEFINE_PROTO_FIELD_HELPER(quint8, int);
+DEFINE_PROTO_FIELD_HELPER(qint16, int);
+DEFINE_PROTO_FIELD_HELPER(quint16, int);
+DEFINE_PROTO_FIELD_HELPER(qint32, int);
+// TODO(rmlarsen): Add support for complex and half float types.
+// DEFINE_PROTO_FIELD_HELPER(Eigen::hals, half);
+// DEFINE_PROTO_FIELD_HELPER(qint32, half);
+
+#undef DEFINE_PROTO_HELPER
+
+template <typename T>
+class TensorProtoHelper : public std::true_type {
  public:
-  static DataType GetDataType() { return DataType::DT_STRING; }
-  static void AddValue(const string& value, TensorProto* proto) {
-    *proto->mutable_string_val()->Add() = value;
+  using FieldHelper = TensorProtoFieldHelper<T>;
+  using FieldType = typename TensorProtoFieldHelper<T>::FieldType;
+
+  static DataType GetDataType() { return DataTypeToEnum<T>::value; }
+
+  static size_t NumValues(const TensorProto& proto) {
+    return FieldHelper::GetField(proto).size();
   }
-};
 
-template <>
-class TensorProtoHelper<int32> : public std::true_type {
- public:
-  static DataType GetDataType() { return DataType::DT_INT32; }
-  static void AddValue(int32 value, TensorProto* proto) {
-    proto->mutable_int_val()->Add(value);
+  static void AddValue(const T& value, TensorProto* proto) {
+    FieldHelper::GetMutableField(proto)->Add(static_cast<FieldType>(value));
   }
-};
 
-template <>
-class TensorProtoHelper<int64> : public std::true_type {
- public:
-  static DataType GetDataType() { return DataType::DT_INT64; }
-  static void AddValue(int64 value, TensorProto* proto) {
-    proto->mutable_int64_val()->Add(value);
+  static T GetValue(size_t index, const TensorProto& proto) {
+    return static_cast<T>(FieldHelper::GetField(proto).Get(index));
   }
-};
 
-template <>
-class TensorProtoHelper<uint32> : public std::true_type {
- public:
-  static DataType GetDataType() { return DataType::DT_UINT32; }
-  static void AddValue(uint32 value, TensorProto* proto) {
-    proto->mutable_uint32_val()->Add(value);
+  template <typename IterType>
+  static void AddValues(IterType begin, IterType end, TensorProto* proto) {
+    using SrcType = typename std::iterator_traits<IterType>::value_type;
+    size_t n = std::distance(begin, end);
+    FieldType* dst_ptr = AppendUninitialized(n, proto);
+    if (std::is_same<SrcType, FieldType>::value) {
+      std::copy(begin, end, dst_ptr);
+    } else {
+      std::transform(begin, end, dst_ptr, [](const SrcType& x) -> FieldType {
+        return static_cast<FieldType>(x);
+      });
+    }
   }
-};
 
-template <>
-class TensorProtoHelper<uint64> : public std::true_type {
- public:
-  static DataType GetDataType() { return DataType::DT_UINT64; }
-  static void AddValue(uint64 value, TensorProto* proto) {
-    proto->mutable_uint64_val()->Add(value);
+  template <typename IterType>
+  static void CopyValues(IterType dst, const TensorProto& proto) {
+    using DstType = typename std::iterator_traits<IterType>::value_type;
+    auto begin = FieldHelper::GetField(proto).begin();
+    auto end = FieldHelper::GetField(proto).end();
+    if (std::is_same<DstType, FieldType>::value) {
+      std::copy(begin, end, dst);
+    } else {
+      std::transform(begin, end, dst, [](const FieldType& x) -> DstType {
+        return static_cast<DstType>(x);
+      });
+    }
   }
-};
 
-template <>
-class TensorProtoHelper<float> : public std::true_type {
- public:
-  static DataType GetDataType() { return DataType::DT_FLOAT; }
-  static void AddValue(float value, TensorProto* proto) {
-    proto->mutable_float_val()->Add(value);
+  static void Truncate(size_t new_size, TensorProto* proto) {
+    FieldHelper::GetMutableField(proto)->Truncate(new_size);
   }
-};
 
-template <>
-class TensorProtoHelper<double> : public std::true_type {
- public:
-  static DataType GetDataType() { return DataType::DT_DOUBLE; }
-  static void AddValue(double value, TensorProto* proto) {
-    proto->mutable_double_val()->Add(value);
+  static FieldType* AppendUninitialized(size_t n, TensorProto* proto) {
+    auto* field = FieldHelper::GetMutableField(proto);
+    field->Reserve(field->size() + n);
+    return reinterpret_cast<FieldType*>(field->AddNAlreadyReserved(n));
   }
 };
 
+// Specialization for string.
 template <>
-class TensorProtoHelper<bool> : public std::true_type {
+class TensorProtoHelper<string> : public std::true_type {
  public:
-  static DataType GetDataType() { return DataType::DT_BOOL; }
-  static void AddValue(bool value, TensorProto* proto) {
-    proto->mutable_bool_val()->Add(value);
+  static DataType GetDataType() { return DataType::DT_STRING; }
+  static void AddValue(const string& value, TensorProto* proto) {
+    *proto->mutable_string_val()->Add() = value;
+  }
+  template <typename IterType>
+  static void AddValues(IterType begin, IterType end, TensorProto* proto) {
+    for (IterType it = begin; it != end; ++it) {
+      AddValue(*it, proto);
+    }
+  }
+  template <typename IterType>
+  static void CopyToTensorContent(IterType begin, IterType end,
+                                  TensorProto* proto) {
+    AddValues(begin, end, proto);
   }
 };
+
 }  // namespace internal
 
 // Creates a 'TensorProto' with specified shape and values.
@@ -149,15 +197,52 @@ typename std::enable_if<internal::TensorProtoHelper<Type>::value,
 CreateTensorProto(const std::vector<Type>& values,
                   const std::vector<size_t>& shape) {
   TensorProto tensor;
+  TensorShapeProto tensor_shape_proto;
+  internal::SetTensorProtoShape(shape, &tensor_shape_proto);
+  if (TensorShape(tensor_shape_proto).num_elements() != values.size()) {
+    LOG(ERROR) << "Shape and number of values (" << values.size()
+               << ") are incompatible.";
+    return tensor;
+  }
   using TypeHelper = internal::TensorProtoHelper<Type>;
   tensor.set_dtype(TypeHelper::GetDataType());
-  internal::SetTensorProtoShape(shape, tensor.mutable_tensor_shape());
-  for (const auto& value : values) {
-    TypeHelper::AddValue(value, &tensor);
-  }
+  tensor.mutable_tensor_shape()->Swap(&tensor_shape_proto);
+  TypeHelper::AddValues(values.begin(), values.end(), &tensor);
   return tensor;
 }
 
+// Converts values in tensor to run-length encoded compressed form.
+//
+// The elements of a tensor can be stored in a TensorProto in one of the
+// following two forms:
+// 1. As a raw byte string in the field `tensor_content` containing the
+//    serialized in-memory representation of the tensor.
+// 2. As values of a repeated field depending on the datatype, e.g. that
+//    values of a DT_FLOAT tensor would be stored in the repeated field
+//    `float_val`.
+// Storage scheme 2 may use a simple form of run-length encoding to compress
+// data: If the values contains a tail of identical values, the repeated field
+// will be truncated such that the number of values in the repeated field is
+// less than the number of elements implied by the field`tensor_shape`. The
+// original tensor can be recovered by repeating the final value in the repeated
+// field.
+//
+// The TensorProto will be compressed if a) the tensor contains at least
+// min_num_elements elements and b) the compressed tensor proto is would be at
+// most the size of the original tensor proto divided by min_compression_ratio.
+//
+// Returns true if the tensor was compressed.
+bool CompressTensorProtoInPlace(int64 min_num_elements,
+                                float min_compression_ratio,
+                                TensorProto* tensor);
+
+inline bool CompressTensorProtoInPlace(TensorProto* tensor) {
+  static const int64 kDefaultMinNumElements = 64;
+  static const float kDefaultMinCompressionRatio = 2.0f;
+  return CompressTensorProtoInPlace(kDefaultMinNumElements,
+                                    kDefaultMinCompressionRatio, tensor);
+}
+
 }  // namespace tensor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/framework/tensor_util_test.cc b/tensorflow/core/framework/tensor_util_test.cc
index 2b4e1cad2fa24c00f1efc703cd040a105fa68bfe..dbc0ac59f61d4a1b1c4a715f22082d3782f76aed 100644
--- a/tensorflow/core/framework/tensor_util_test.cc
+++ b/tensorflow/core/framework/tensor_util_test.cc
@@ -17,7 +17,11 @@ limitations under the License.
 
 #include <vector>
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/framework/variant_encode_decode.h"
+#include "tensorflow/core/framework/variant_tensor_data.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -145,6 +149,68 @@ TEST(TensorUtil, DeepCopySlice) {
   }
 }
 
+TEST(TensorUtil, DeepCopySliceString) {
+  Tensor x(DT_STRING, TensorShape({10}));
+  x.flat<string>().setConstant("hello");
+
+  // Slice 'x' -- y still refers to the same buffer.
+  Tensor y = x.Slice(3, 7);
+
+  // Do a deep copy of y, which is a slice.
+  Tensor z = tensor::DeepCopy(y);
+
+  // Set x to be different.
+  x.flat<string>().setConstant("goodbye");
+
+  EXPECT_EQ(TensorShape({10}), x.shape());
+  EXPECT_EQ(TensorShape({4}), y.shape());
+  EXPECT_EQ(TensorShape({4}), z.shape());
+  EXPECT_EQ(DT_STRING, x.dtype());
+  EXPECT_EQ(DT_STRING, y.dtype());
+  EXPECT_EQ(DT_STRING, z.dtype());
+
+  // x and y should now all be 'goodbye', but z should be 'hello'.
+  for (int i = 0; i < 10; ++i) {
+    EXPECT_EQ("goodbye", x.flat<string>()(i));
+  }
+  for (int i = 0; i < 4; ++i) {
+    EXPECT_EQ("goodbye", y.unaligned_flat<string>()(i));
+    EXPECT_EQ("hello", z.flat<string>()(i));
+  }
+}
+
+TEST(TensorUtil, DeepCopySliceVariant) {
+  Tensor x(DT_VARIANT, TensorShape({10}));
+  x.flat<Variant>().setConstant(Tensor(42.0f));
+
+  // Slice 'x' -- y still refers to the same buffer.
+  Tensor y = x.Slice(3, 7);
+
+  // Do a deep copy of y, which is a slice.
+  Tensor z = tensor::DeepCopy(y);
+
+  // Set x to be different.
+  x.flat<Variant>().setConstant(Tensor("foo"));
+
+  EXPECT_EQ(TensorShape({10}), x.shape());
+  EXPECT_EQ(TensorShape({4}), y.shape());
+  EXPECT_EQ(TensorShape({4}), z.shape());
+  EXPECT_EQ(DT_VARIANT, x.dtype());
+  EXPECT_EQ(DT_VARIANT, y.dtype());
+  EXPECT_EQ(DT_VARIANT, z.dtype());
+
+  // Each element of x and y should now be a DT_STRING Tensor containing "foo",
+  // but each element of z should be a DT_FLOAT tensor containing 42.0.
+  for (int i = 0; i < 10; ++i) {
+    EXPECT_EQ("foo", x.flat<Variant>()(i).get<Tensor>()->scalar<string>()());
+  }
+  for (int i = 0; i < 4; ++i) {
+    EXPECT_EQ("foo",
+              y.unaligned_flat<Variant>()(i).get<Tensor>()->scalar<string>()());
+    EXPECT_EQ(42.0, z.flat<Variant>()(i).get<Tensor>()->scalar<float>()());
+  }
+}
+
 TEST(TensorUtil, Concat) {
   std::vector<int64> sizes = {1, 4, 5};
   std::vector<Tensor> to_concat;
@@ -366,5 +432,135 @@ TEST(TensorProtoUtil, CreatesBoolTensorProto) {
             "bool_val: false\n");
 }
 
+TEST(TensorProtoUtil, CompressTensorProtoInPlaceTooSmall) {
+  const int kLength = 63;
+  TensorProto tensor_proto =
+      tensor::CreateTensorProto(std::vector<float>(kLength), {kLength});
+  EXPECT_FALSE(tensor::CompressTensorProtoInPlace(&tensor_proto));
+  tensor_proto =
+      tensor::CreateTensorProto(std::vector<int>(kLength), {kLength});
+  EXPECT_FALSE(tensor::CompressTensorProtoInPlace(&tensor_proto));
+  tensor_proto =
+      tensor::CreateTensorProto(std::vector<uint8>(kLength), {kLength});
+  EXPECT_FALSE(tensor::CompressTensorProtoInPlace(&tensor_proto));
+  tensor_proto =
+      tensor::CreateTensorProto(std::vector<bool>(kLength), {kLength});
+  EXPECT_FALSE(tensor::CompressTensorProtoInPlace(&tensor_proto));
+}
+
+TEST(TensorProtoUtil, CompressTensorProtoInPlaceAllEqual) {
+  const int kLength = 64;
+  TensorProto tensor_proto =
+      tensor::CreateTensorProto(std::vector<float>(kLength), {kLength});
+  EXPECT_TRUE(tensor::CompressTensorProtoInPlace(&tensor_proto));
+  EXPECT_EQ(tensor::internal::TensorProtoHelper<float>::NumValues(tensor_proto),
+            1);
+
+  tensor_proto =
+      tensor::CreateTensorProto(std::vector<int>(kLength), {kLength});
+  EXPECT_TRUE(tensor::CompressTensorProtoInPlace(&tensor_proto));
+  EXPECT_EQ(tensor::internal::TensorProtoHelper<int>::NumValues(tensor_proto),
+            1);
+
+  tensor_proto =
+      tensor::CreateTensorProto(std::vector<uint8>(kLength), {kLength});
+  EXPECT_TRUE(tensor::CompressTensorProtoInPlace(&tensor_proto));
+  EXPECT_EQ(tensor::internal::TensorProtoHelper<uint8>::NumValues(tensor_proto),
+            1);
+  tensor_proto =
+      tensor::CreateTensorProto(std::vector<bool>(kLength), {kLength});
+  EXPECT_TRUE(tensor::CompressTensorProtoInPlace(&tensor_proto));
+  EXPECT_EQ(tensor::internal::TensorProtoHelper<bool>::NumValues(tensor_proto),
+            1);
+}
+
+template <typename T>
+std::vector<T> VectorWithConstantTail(int size, int tail_length) {
+  CHECK_LE(tail_length, size);
+  std::vector<T> v(size, T(0));
+  std::iota(v.begin(), v.end() - tail_length, T(1));
+  return v;
+}
+
+template <typename T>
+TensorProto CreateAsProtoTensorContent(int size, int tail_length) {
+  auto values = VectorWithConstantTail<T>(size, tail_length);
+  Tensor tensor(DataTypeToEnum<T>::value, TensorShape({size}));
+  std::copy(values.begin(), values.end(), tensor.flat<T>().data());
+  TensorProto tensor_proto;
+  tensor.AsProtoTensorContent(&tensor_proto);
+  return tensor_proto;
+}
+
+template <typename T>
+TensorProto CreateAsProtoField(int size, int tail_length) {
+  auto values = VectorWithConstantTail<T>(size, tail_length);
+  Tensor tensor(DataTypeToEnum<T>::value, TensorShape({size}));
+  std::copy(values.begin(), values.end(), tensor.flat<T>().data());
+  TensorProto tensor_proto;
+  tensor.AsProtoField(&tensor_proto);
+  return tensor_proto;
+}
+
+template <typename T>
+void CompareTensorValues(const TensorProto& x, const TensorProto& y) {
+  Tensor x_t;
+  EXPECT_TRUE(x_t.FromProto(x));
+  Tensor y_t;
+  EXPECT_TRUE(y_t.FromProto(y));
+  test::ExpectTensorEqual<T>(x_t, y_t);
+}
+
+template <typename T>
+void ConstantTailTest(int64 length, int64 tail_length, bool as_field) {
+  using TensorProtoHelper = tensor::internal::TensorProtoHelper<T>;
+  using FieldType = typename TensorProtoHelper::FieldType;
+  const float kMinCompressionRatio = 2.0;
+  const int64 kMinSize = 64;
+  TensorProto tensor_proto =
+      as_field ? CreateAsProtoField<T>(length, tail_length)
+               : CreateAsProtoTensorContent<T>(length, tail_length);
+  TensorProto original_tensor_proto = tensor_proto;
+  int64 original_size = length * (as_field ? sizeof(FieldType) : sizeof(T));
+  int64 size_as_tensor_content = length * sizeof(T);
+  int64 size_as_field =
+      std::min(length, (length - tail_length + 1)) * sizeof(FieldType);
+  bool will_compress = std::min(size_as_tensor_content, size_as_field) <=
+                       static_cast<int64>(original_size / kMinCompressionRatio);
+
+  EXPECT_EQ(tensor::CompressTensorProtoInPlace(kMinSize, kMinCompressionRatio,
+                                               &tensor_proto),
+            will_compress);
+  if (will_compress) {
+    if (size_as_tensor_content < size_as_field) {
+      EXPECT_EQ(TensorProtoHelper::NumValues(tensor_proto), 0);
+      EXPECT_FALSE(tensor_proto.tensor_content().empty());
+    } else {
+      EXPECT_LE(TensorProtoHelper::NumValues(tensor_proto),
+                (length - tail_length + 1));
+      EXPECT_TRUE(tensor_proto.tensor_content().empty());
+    }
+  }
+  CompareTensorValues<T>(tensor_proto, original_tensor_proto);
+}
+
+TEST(TensorProtoUtil, CompressTensorProtoConstantTail) {
+  const int kLength = 64;
+  for (bool as_field : {true, false}) {
+    for (int tail_length : {0, 1, 2, 32, 33, 63, 64}) {
+      ConstantTailTest<float>(kLength, tail_length, as_field);
+      ConstantTailTest<double>(kLength, tail_length, as_field);
+      ConstantTailTest<int32>(kLength, tail_length, as_field);
+      ConstantTailTest<uint32>(kLength, tail_length, as_field);
+      ConstantTailTest<int64>(kLength, tail_length, as_field);
+      ConstantTailTest<uint64>(kLength, tail_length, as_field);
+      ConstantTailTest<int8>(kLength, tail_length, as_field);
+      ConstantTailTest<uint8>(kLength, tail_length, as_field);
+      ConstantTailTest<int16>(kLength, tail_length, as_field);
+      ConstantTailTest<uint16>(kLength, tail_length, as_field);
+    }
+  }
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/tracking_allocator.cc b/tensorflow/core/framework/tracking_allocator.cc
index 2df402573a58ad3728e03a22d391b32766c49b00..ff454f5847563bb696afecb79eae1743241628a5 100644
--- a/tensorflow/core/framework/tracking_allocator.cc
+++ b/tensorflow/core/framework/tracking_allocator.cc
@@ -152,8 +152,8 @@ int64 TrackingAllocator::AllocationId(const void* ptr) {
   }
 }
 
-void TrackingAllocator::GetStats(AllocatorStats* stats) {
-  allocator_->GetStats(stats);
+absl::optional<AllocatorStats> TrackingAllocator::GetStats() {
+  return allocator_->GetStats();
 }
 
 void TrackingAllocator::ClearStats() { allocator_->ClearStats(); }
diff --git a/tensorflow/core/framework/tracking_allocator.h b/tensorflow/core/framework/tracking_allocator.h
index 5eafce662ec491de2410e5bfdd6e5a69ecaea199..3b45d1cab80f3a82329d19bd9408a2909673de0b 100644
--- a/tensorflow/core/framework/tracking_allocator.h
+++ b/tensorflow/core/framework/tracking_allocator.h
@@ -66,7 +66,7 @@ class TrackingAllocator : public Allocator {
   size_t RequestedSize(const void* ptr) override;
   size_t AllocatedSize(const void* ptr) override;
   int64 AllocationId(const void* ptr) override;
-  void GetStats(AllocatorStats* stats) override;
+  absl::optional<AllocatorStats> GetStats() override;
   void ClearStats() override;
 
   // If the underlying allocator tracks allocation sizes, this returns
diff --git a/tensorflow/core/framework/tracking_allocator_test.cc b/tensorflow/core/framework/tracking_allocator_test.cc
index 2cdc7edd2d1e9f2634a96e85879dc45a53f633cc..554af609866e059bc3002a2c5097664d6b173c92 100644
--- a/tensorflow/core/framework/tracking_allocator_test.cc
+++ b/tensorflow/core/framework/tracking_allocator_test.cc
@@ -44,7 +44,7 @@ class TestableSizeTrackingAllocator : public Allocator {
     EXPECT_NE(size_map_.end(), iter);
     return iter->second;
   }
-  void GetStats(AllocatorStats* stats) override { stats->Clear(); }
+  absl::optional<AllocatorStats> GetStats() override { return absl::nullopt; }
 
  private:
   std::unordered_map<const void*, size_t> size_map_;
@@ -58,7 +58,7 @@ class NoMemoryAllocator : public Allocator {
   }
   void DeallocateRaw(void* ptr) override {}
   bool TracksAllocationSizes() override { return true; }
-  void GetStats(AllocatorStats* stats) override { stats->Clear(); }
+  absl::optional<AllocatorStats> GetStats() override { return absl::nullopt; }
 };
 
 TEST(TrackingAllocatorTest, SimpleNoTracking) {
diff --git a/tensorflow/core/framework/variant_op_registry.cc b/tensorflow/core/framework/variant_op_registry.cc
index ef5b240aeaa8faef08d4c004f0f6d42e9516c48f..b5107a02a7fa2efeebbfc66a8539590727698882 100644
--- a/tensorflow/core/framework/variant_op_registry.cc
+++ b/tensorflow/core/framework/variant_op_registry.cc
@@ -37,57 +37,6 @@ UnaryVariantOpRegistry* UnaryVariantOpRegistry::Global() {
   return global_unary_variant_op_registry;
 }
 
-UnaryVariantOpRegistry::VariantShapeFn* UnaryVariantOpRegistry::GetShapeFn(
-    const TypeIndex& type_index) {
-  auto found = shape_fns.find(type_index);
-  if (found == shape_fns.end()) return nullptr;
-  return &found->second;
-}
-
-void UnaryVariantOpRegistry::RegisterShapeFn(const TypeIndex& type_index,
-                                             const VariantShapeFn& shape_fn) {
-  VariantShapeFn* existing = GetShapeFn(type_index);
-  CHECK_EQ(existing, nullptr)
-      << "Unary VariantShapeFn for type_index: "
-      << port::MaybeAbiDemangle(type_index.name()) << " already registered";
-  shape_fns.insert(std::pair<TypeIndex, VariantShapeFn>(type_index, shape_fn));
-}
-
-Status GetUnaryVariantShape(const Tensor& variant_tensor, TensorShape* shape) {
-  CHECK_EQ(variant_tensor.dtype(), DT_VARIANT);
-  CHECK_EQ(variant_tensor.dims(), 0);
-  const Variant& v = variant_tensor.scalar<Variant>()();
-  UnaryVariantOpRegistry::VariantShapeFn* shape_fn =
-      UnaryVariantOpRegistry::Global()->GetShapeFn(v.TypeId());
-  if (shape_fn == nullptr) {
-    return errors::Internal(
-        "No unary variant shape function found for Variant type_index: ",
-        port::MaybeAbiDemangle(v.TypeId().name()));
-  }
-  return (*shape_fn)(v, shape);
-}
-
-// Add some basic registrations for use by others, e.g., for testing.
-namespace {
-template <typename T>
-Status ScalarShape(const T&, TensorShape* shape) {
-  *shape = TensorShape({});
-  return Status::OK();
-}
-}  // namespace
-
-#define REGISTER_VARIANT_SHAPE_TYPE(T) \
-  REGISTER_UNARY_VARIANT_SHAPE_FUNCTION(T, ScalarShape<T>);
-
-// No encode/shape registered for std::complex<> and Eigen::half
-// objects yet.
-REGISTER_VARIANT_SHAPE_TYPE(int);
-REGISTER_VARIANT_SHAPE_TYPE(float);
-REGISTER_VARIANT_SHAPE_TYPE(bool);
-REGISTER_VARIANT_SHAPE_TYPE(double);
-
-#undef REGISTER_VARIANT_SHAPE_TYPE
-
 UnaryVariantOpRegistry::VariantDecodeFn* UnaryVariantOpRegistry::GetDecodeFn(
     StringPiece type_name) {
   auto found = decode_fns.find(type_name);
@@ -177,6 +126,37 @@ Status VariantDeviceCopy(
   return (*device_copy_fn)(from, to, copy_fn);
 }
 
+namespace {
+template <typename T>
+Status DeviceCopyPrimitiveType(
+    const T& in, T* out,
+    const UnaryVariantOpRegistry::AsyncTensorDeviceCopyFn& copier) {
+  // Dummy copy, we don't actually bother copying to the device and back for
+  // testing.
+  *out = in;
+  return Status::OK();
+}
+}  // namespace
+
+#define REGISTER_VARIANT_DEVICE_COPY_TYPE(T)            \
+  INTERNAL_REGISTER_UNARY_VARIANT_DEVICE_COPY_FUNCTION( \
+      T, VariantDeviceCopyDirection::HOST_TO_DEVICE,    \
+      DeviceCopyPrimitiveType<T>);                      \
+  INTERNAL_REGISTER_UNARY_VARIANT_DEVICE_COPY_FUNCTION( \
+      T, VariantDeviceCopyDirection::DEVICE_TO_HOST,    \
+      DeviceCopyPrimitiveType<T>);                      \
+  INTERNAL_REGISTER_UNARY_VARIANT_DEVICE_COPY_FUNCTION( \
+      T, VariantDeviceCopyDirection::DEVICE_TO_DEVICE,  \
+      DeviceCopyPrimitiveType<T>);
+
+// No zeros_like registered for std::complex<> or Eigen::half objects yet.
+REGISTER_VARIANT_DEVICE_COPY_TYPE(int);
+REGISTER_VARIANT_DEVICE_COPY_TYPE(float);
+REGISTER_VARIANT_DEVICE_COPY_TYPE(double);
+REGISTER_VARIANT_DEVICE_COPY_TYPE(bool);
+
+#undef REGISTER_VARIANT_DEVICE_COPY_TYPE
+
 // Special casing UnaryOpFn per op and per device.
 UnaryVariantOpRegistry::VariantUnaryOpFn* UnaryVariantOpRegistry::GetUnaryOpFn(
     VariantUnaryOp op, StringPiece device, const TypeIndex& type_index) {
diff --git a/tensorflow/core/framework/variant_op_registry.h b/tensorflow/core/framework/variant_op_registry.h
index 7eb37e859f51992cf74a12736f5099839db5e1fd..488a606f6ee4564abaa0113f9886166afc76dacd 100644
--- a/tensorflow/core/framework/variant_op_registry.h
+++ b/tensorflow/core/framework/variant_op_registry.h
@@ -58,7 +58,6 @@ enum VariantDeviceCopyDirection {
 
 class UnaryVariantOpRegistry {
  public:
-  typedef std::function<Status(const Variant& v, TensorShape*)> VariantShapeFn;
   typedef std::function<bool(Variant*)> VariantDecodeFn;
   typedef std::function<Status(OpKernelContext*, const Variant&, Variant*)>
       VariantUnaryOpFn;
@@ -93,13 +92,6 @@ class UnaryVariantOpRegistry {
                                AsyncTensorDeviceCopyFn copy_fn)>
       AsyncVariantDeviceCopyFn;
 
-  // Add a shape lookup function to the registry.
-  void RegisterShapeFn(const TypeIndex& type_index,
-                       const VariantShapeFn& shape_fn);
-
-  // Returns nullptr if no shape function was found for the given TypeIndex.
-  VariantShapeFn* GetShapeFn(const TypeIndex& type_index);
-
   // Add a decode function to the registry.
   void RegisterDecodeFn(const string& type_name,
                         const VariantDecodeFn& decode_fn);
@@ -154,7 +146,6 @@ class UnaryVariantOpRegistry {
     std::size_t operator()(const TypeIndex& x) const { return x.hash_code(); }
   };
 
-  gtl::FlatMap<TypeIndex, VariantShapeFn, TypeIndexHash> shape_fns;
   gtl::FlatMap<StringPiece, VariantDecodeFn, StringPieceHasher> decode_fns;
 
   // Map std::pair<Direction, type_name> to function.
@@ -235,15 +226,6 @@ inline bool operator==(const UnaryVariantOpRegistry::FuncTuple<Op>& lhs,
   return (lhs.op_type_ == rhs.op_type_) && (lhs.device_ == rhs.device_) &&
          (lhs.type_index_ == rhs.type_index_);
 }
-// Gets a TensorShape from a Tensor containing a scalar Variant.
-// Returns an Internal error if the Variant does not have a registered shape
-// function, or if it's a serialized Variant that cannot be decoded.
-//
-// REQUIRES:
-//   variant_tensor.dtype() == DT_VARIANT
-//   variant_tensor.dims() == 0
-//
-Status GetUnaryVariantShape(const Tensor& variant_tensor, TensorShape* shape);
 
 // Decodes the Variant whose data_type has a registered decode
 // function.  Returns an Internal error if the Variant does not have a
@@ -326,29 +308,6 @@ Status BinaryOpVariants(OpKernelContext* ctx, VariantBinaryOp op,
 
 namespace variant_op_registry_fn_registration {
 
-template <typename T>
-class UnaryVariantShapeRegistration {
- public:
-  typedef std::function<Status(const T& t, TensorShape*)> LocalVariantShapeFn;
-
-  UnaryVariantShapeRegistration(const TypeIndex& type_index,
-                                const LocalVariantShapeFn& shape_fn) {
-    const string type_index_name = port::MaybeAbiDemangle(type_index.name());
-    UnaryVariantOpRegistry::Global()->RegisterShapeFn(
-        type_index,
-        [type_index_name, shape_fn](const Variant& v,
-                                    TensorShape* s) -> Status {
-          const T* t = v.get<T>();
-          if (t == nullptr) {
-            return errors::Internal(
-                "VariantShapeFn: Could not access object, type_index: ",
-                type_index_name);
-          }
-          return shape_fn(*t, s);
-        });
-  }
-};
-
 template <typename T>
 class UnaryVariantDecodeRegistration {
  public:
@@ -471,23 +430,6 @@ class UnaryVariantBinaryOpRegistration {
 
 };  // namespace variant_op_registry_fn_registration
 
-// Register a unary shape variant function with the signature:
-//    Status ShapeFn(const T& t, TensorShape* s);
-// to Variants having TypeIndex type_index.
-#define REGISTER_UNARY_VARIANT_SHAPE_FUNCTION(T, shape_function) \
-  REGISTER_UNARY_VARIANT_SHAPE_FUNCTION_UNIQ_HELPER(             \
-      __COUNTER__, T, MakeTypeIndex<T>(), shape_function)
-
-#define REGISTER_UNARY_VARIANT_SHAPE_FUNCTION_UNIQ_HELPER(ctr, T, type_index, \
-                                                          shape_function)     \
-  REGISTER_UNARY_VARIANT_SHAPE_FUNCTION_UNIQ(ctr, T, type_index, shape_function)
-
-#define REGISTER_UNARY_VARIANT_SHAPE_FUNCTION_UNIQ(ctr, T, type_index,         \
-                                                   shape_function)             \
-  static variant_op_registry_fn_registration::UnaryVariantShapeRegistration<T> \
-      register_unary_variant_op_shape_registration_fn_##ctr(type_index,        \
-                                                            shape_function)
-
 // Register a unary decode variant function for the given type.
 #define REGISTER_UNARY_VARIANT_DECODE_FUNCTION(T, type_name) \
   REGISTER_UNARY_VARIANT_DECODE_FUNCTION_UNIQ_HELPER(__COUNTER__, T, type_name)
diff --git a/tensorflow/core/framework/variant_op_registry_test.cc b/tensorflow/core/framework/variant_op_registry_test.cc
index b2443e8676e7b986992fd130d5e162818e5fe075..e1a46ebd59d6ae8503d5ae3b31d4f31c7a6f1be1 100644
--- a/tensorflow/core/framework/variant_op_registry_test.cc
+++ b/tensorflow/core/framework/variant_op_registry_test.cc
@@ -39,13 +39,6 @@ namespace {
 
 struct VariantValue {
   string TypeName() const { return "TEST VariantValue"; }
-  static Status ShapeFn(const VariantValue& v, TensorShape* s) {
-    if (v.early_exit) {
-      return errors::InvalidArgument("early exit!");
-    }
-    *s = TensorShape({-0xdeadbeef});
-    return Status::OK();
-  }
   static Status CPUZerosLikeFn(OpKernelContext* ctx, const VariantValue& v,
                                VariantValue* v_out) {
     if (v.early_exit) {
@@ -89,8 +82,6 @@ struct VariantValue {
   int value;
 };
 
-REGISTER_UNARY_VARIANT_SHAPE_FUNCTION(VariantValue, VariantValue::ShapeFn);
-
 REGISTER_UNARY_VARIANT_DECODE_FUNCTION(VariantValue, "TEST VariantValue");
 
 INTERNAL_REGISTER_UNARY_VARIANT_DEVICE_COPY_FUNCTION(
@@ -113,38 +104,6 @@ REGISTER_UNARY_VARIANT_BINARY_OP_FUNCTION(ADD_VARIANT_BINARY_OP, DEVICE_GPU,
 
 }  // namespace
 
-TEST(VariantOpShapeRegistryTest, TestBasic) {
-  class Blah {};
-  EXPECT_EQ(UnaryVariantOpRegistry::Global()->GetShapeFn(MakeTypeIndex<Blah>()),
-            nullptr);
-
-  auto* shape_fn = UnaryVariantOpRegistry::Global()->GetShapeFn(
-      MakeTypeIndex<VariantValue>());
-  EXPECT_NE(shape_fn, nullptr);
-  TensorShape shape;
-
-  VariantValue vv_early_exit{true /* early_exit */};
-  Variant v = vv_early_exit;
-  Status s0 = (*shape_fn)(v, &shape);
-  EXPECT_FALSE(s0.ok());
-  EXPECT_TRUE(str_util::StrContains(s0.error_message(), "early exit!"));
-
-  VariantValue vv_ok{false /* early_exit */};
-  v = vv_ok;
-  TF_EXPECT_OK((*shape_fn)(v, &shape));
-  EXPECT_EQ(shape, TensorShape({-0xdeadbeef}));
-}
-
-TEST(VariantOpShapeRegistryTest, TestDuplicate) {
-  UnaryVariantOpRegistry registry;
-  UnaryVariantOpRegistry::VariantShapeFn f;
-  class FjFjFj {};
-  const auto kTypeIndex = MakeTypeIndex<FjFjFj>();
-  registry.RegisterShapeFn(kTypeIndex, f);
-  EXPECT_DEATH(registry.RegisterShapeFn(kTypeIndex, f),
-               "FjFjFj already registered");
-}
-
 TEST(VariantOpDecodeRegistryTest, TestBasic) {
   EXPECT_EQ(UnaryVariantOpRegistry::Global()->GetDecodeFn("YOU SHALL NOT PASS"),
             nullptr);
diff --git a/tensorflow/core/framework/variant_tensor_data.cc b/tensorflow/core/framework/variant_tensor_data.cc
index 3e67e4a86405819925f153400340145821cce414..993a8989b708c448653bab374dd25bc907b7bf0c 100644
--- a/tensorflow/core/framework/variant_tensor_data.cc
+++ b/tensorflow/core/framework/variant_tensor_data.cc
@@ -20,14 +20,10 @@ limitations under the License.
 
 namespace tensorflow {
 
-VariantTensorData::VariantTensorData() {}
-
 VariantTensorData::VariantTensorData(VariantTensorDataProto proto) {
   FromProto(std::move(proto));
 }
 
-VariantTensorData::~VariantTensorData() {}
-
 int VariantTensorData::tensors_size() const { return tensors_.size(); }
 
 const Tensor& VariantTensorData::tensors(int index) const {
@@ -43,6 +39,12 @@ Tensor* VariantTensorData::add_tensors() {
   return &(tensors_[tensors_.size() - 1]);
 }
 
+template <typename... TensorConstructorArgs>
+Tensor* VariantTensorData::add_tensor(TensorConstructorArgs&&... args) {
+  tensors_.emplace_back(std::forward<TensorConstructorArgs>(args)...);
+  return &tensors_.back();
+}
+
 void VariantTensorData::ToProto(VariantTensorDataProto* proto) const {
   proto->set_type_name(type_name());
   proto->set_metadata(metadata_);
diff --git a/tensorflow/core/framework/variant_tensor_data.h b/tensorflow/core/framework/variant_tensor_data.h
index 8c69c870345a68a2c5fc5f1f33015c7bb97c123e..d98cf6b5e1fb8c6d541aad2c2127c2ca9033792c 100644
--- a/tensorflow/core/framework/variant_tensor_data.h
+++ b/tensorflow/core/framework/variant_tensor_data.h
@@ -37,11 +37,11 @@ class VariantTensorDataProto;
 // separate so that kernels do not need to depend on protos.
 class VariantTensorData {
  public:
-  VariantTensorData();
+  VariantTensorData() = default;
+
   // TODO(b/118823936): This silently returns if the proto is invalid.
   // Consider calling FromProto explicitly instead.
   VariantTensorData(VariantTensorDataProto proto);
-  ~VariantTensorData();
 
   // Name of the type of objects being serialized.
   const string& type_name() const { return type_name_; }
@@ -68,6 +68,11 @@ class VariantTensorData {
   const std::vector<Tensor>& tensors() const;
   Tensor* add_tensors();
 
+  // A more general version of add_tensors. Parameters are perfectly forwarded
+  // to the constructor of the tensor added here.
+  template <typename... TensorConstructorArgs>
+  Tensor* add_tensor(TensorConstructorArgs&&... args);
+
   // Conversion to and from VariantTensorDataProto
   void ToProto(VariantTensorDataProto* proto) const;
   // This allows optimizations via std::move.
diff --git a/tensorflow/core/framework/variant_test.cc b/tensorflow/core/framework/variant_test.cc
index 08d09de7b845101cd2c9604b2ea44bbe25a94171..8947f93887a78659e2e0a0bcd06cedc1ab733d99 100644
--- a/tensorflow/core/framework/variant_test.cc
+++ b/tensorflow/core/framework/variant_test.cc
@@ -186,7 +186,7 @@ TEST(VariantTest, TensorListTest) {
   x.Encode(&serialized);
 
   Variant y = TensorList();
-  y.Decode(std::move(serialized));
+  y.Decode(serialized);
 
   const TensorList& decoded_vec = *y.get<TensorList>();
   for (int i = 0; i < 4; ++i) {
diff --git a/tensorflow/core/graph/algorithm.cc b/tensorflow/core/graph/algorithm.cc
index 9b4200e0b47ec37ddbef1e375e1955c6ec814caf..5ad1c19dc1a7bbbd087628a41f613d9d44377147 100644
--- a/tensorflow/core/graph/algorithm.cc
+++ b/tensorflow/core/graph/algorithm.cc
@@ -22,25 +22,29 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
-
-void DFS(const Graph& g, const std::function<void(Node*)>& enter,
-         const std::function<void(Node*)>& leave,
-         const NodeComparator& stable_comparator,
-         const EdgeFilter& edge_filter) {
+namespace {
+template <typename T>
+void DFSFromHelper(const Graph& g, gtl::ArraySlice<T> start,
+                   const std::function<void(T)>& enter,
+                   const std::function<void(T)>& leave,
+                   const NodeComparator& stable_comparator,
+                   const EdgeFilter& edge_filter) {
   // Stack of work to do.
   struct Work {
-    Node* node;
+    T node;
     bool leave;  // Are we entering or leaving n?
   };
-  std::vector<Work> stack;
-  stack.push_back(Work{g.source_node(), false});
+  std::vector<Work> stack(start.size());
+  for (int i = 0; i < start.size(); ++i) {
+    stack[i] = Work{start[i], false};
+  }
 
   std::vector<bool> visited(g.num_node_ids(), false);
   while (!stack.empty()) {
     Work w = stack.back();
     stack.pop_back();
 
-    Node* n = w.node;
+    T n = w.node;
     if (w.leave) {
       leave(n);
       continue;
@@ -80,6 +84,23 @@ void DFS(const Graph& g, const std::function<void(Node*)>& enter,
     }
   }
 }
+}  // namespace
+
+void DFS(const Graph& g, const std::function<void(Node*)>& enter,
+         const std::function<void(Node*)>& leave,
+         const NodeComparator& stable_comparator,
+         const EdgeFilter& edge_filter) {
+  DFSFromHelper(g, {g.source_node()}, enter, leave, stable_comparator,
+                edge_filter);
+}
+
+void DFSFrom(const Graph& g, gtl::ArraySlice<const Node*> start,
+             const std::function<void(const Node*)>& enter,
+             const std::function<void(const Node*)>& leave,
+             const NodeComparator& stable_comparator,
+             const EdgeFilter& edge_filter) {
+  DFSFromHelper(g, start, enter, leave, stable_comparator, edge_filter);
+}
 
 void ReverseDFS(const Graph& g, const std::function<void(Node*)>& enter,
                 const std::function<void(Node*)>& leave,
@@ -222,11 +243,12 @@ bool FixupSourceAndSinkEdges(Graph* g) {
   bool changed = false;
   for (Node* n : g->nodes()) {
     if (!n->IsSource() && n->in_edges().empty()) {
-      g->AddControlEdge(g->source_node(), n);
+      g->AddControlEdge(g->source_node(), n,
+                        true /* skip test for duplicates */);
       changed = true;
     }
     if (!n->IsSink() && n->out_edges().empty()) {
-      g->AddControlEdge(n, g->sink_node());
+      g->AddControlEdge(n, g->sink_node(), true /* skip test for duplicates */);
       changed = true;
     }
   }
diff --git a/tensorflow/core/graph/algorithm.h b/tensorflow/core/graph/algorithm.h
index 45f8a29a92d5201af626c77a6aa07daf1a756b6d..3479605df86e37dc52388651d049968d02239e19 100644
--- a/tensorflow/core/graph/algorithm.h
+++ b/tensorflow/core/graph/algorithm.h
@@ -55,6 +55,18 @@ extern void DFS(const Graph& g, const std::function<void(Node*)>& enter,
                 const NodeComparator& stable_comparator = {},
                 const EdgeFilter& edge_filter = {});
 
+// Perform a depth-first-search on g starting at the 'start' nodes.
+// If enter is not empty, calls enter(n) before visiting any children of n.
+// If leave is not empty, calls leave(n) after visiting all children of n.
+// If stable_comparator is set, a stable ordering of visit is achieved by
+// sorting a node's neighbors first before visiting them.
+// If edge_filter is set then ignores edges for which edge_filter returns false.
+extern void DFSFrom(const Graph& g, gtl::ArraySlice<const Node*> start,
+                    const std::function<void(const Node*)>& enter,
+                    const std::function<void(const Node*)>& leave,
+                    const NodeComparator& stable_comparator = {},
+                    const EdgeFilter& edge_filter = {});
+
 // Perform a reverse depth-first-search on g starting at the sink node.
 // If enter is not empty, calls enter(n) before visiting any parents of n.
 // If leave is not empty, calls leave(n) after visiting all parents of n.
diff --git a/tensorflow/core/graph/collective_order.cc b/tensorflow/core/graph/collective_order.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e835259d64e85e929fc42d6bcd5e7d429154183e
--- /dev/null
+++ b/tensorflow/core/graph/collective_order.cc
@@ -0,0 +1,205 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/graph/collective_order.h"
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
+#include "tensorflow/core/graph/algorithm.h"
+
+namespace tensorflow {
+namespace {
+
+// Find all CollectiveReduce nodes and the existing data dependencies between
+// them.
+Status DiscoverDataDependencies(
+    const Graph* graph, std::vector<Node*>* collective_nodes,
+    std::vector<int32>* instance_keys,
+    absl::flat_hash_map<Node*, absl::flat_hash_set<int32>>* data_dependencies) {
+  Status s;
+  // Algorithm: do Reverse DFS starting at sink.  `node_leave` is called when
+  // all parents of `node` have been visited.  At that point,
+  // `data_dependencies[node]` is a list containing `instance_key` of every
+  // `CollectiveReduce` on which `node` has a data dependency.
+  // For this node's children, add all these instance keys.  Also, if this node
+  // is collective, add as a dependency for the children.
+  auto node_leave = [collective_nodes, instance_keys, data_dependencies,
+                     &s](Node* node) {
+    int32 instance_key;
+    bool enter_node =
+        node->IsCollective() && node->type_string() == "CollectiveReduce";
+    if (enter_node) {
+      Status get_attr_status =
+          GetNodeAttr(node->attrs(), "instance_key", &instance_key);
+      s.Update(get_attr_status);
+      collective_nodes->push_back(node);
+      instance_keys->push_back(instance_key);
+      VLOG(2) << "collective node " << node->DebugString();
+    }
+    // Avoid reference invalidation of `node_deps`.
+    data_dependencies->reserve(data_dependencies->size() + 1 +
+                               node->out_edges().size());
+    const auto& node_deps = (*data_dependencies)[node];
+    for (const Edge* out_edge : node->out_edges()) {
+      auto& child_deps = (*data_dependencies)[out_edge->dst()];
+      child_deps.insert(node_deps.begin(), node_deps.end());
+      if (enter_node && s.ok()) {
+        child_deps.insert(instance_key);
+      }
+    }
+  };
+  ReverseDFS(*graph, nullptr, node_leave);
+  return s;
+}
+
+// Given a list of `collective_nodes` and `data_dependencies` between the
+// collective nodes, create control dependencies between concurrent collectives
+// and store in `dependency_edges`.
+// If there exists an edge a -> b then `dependency_edges[a]` contains `b`
+Status CreateControlDependencies(
+    const std::vector<Node*>& collective_nodes,
+    const std::vector<int32>& instance_keys,
+    absl::flat_hash_map<Node*, absl::flat_hash_set<int32>>* data_dependencies,
+    absl::flat_hash_map<Node*, absl::flat_hash_set<Node*>>* dependency_edges) {
+  // If there exists some path a -> ... -> b then `all_paths[a]` contains `b`
+  absl::flat_hash_map<Node*, absl::flat_hash_set<Node*>> all_paths;
+  for (int i = 0; i < collective_nodes.size() - 1; i++) {
+    if (!collective_nodes[i]->IsCollective() ||
+        collective_nodes[i]->type_string() != "CollectiveReduce") {
+      return errors::Internal("Unexpected node ",
+                              collective_nodes[i]->DebugString());
+    }
+    const auto& deps_i = (*data_dependencies)[collective_nodes[i]];
+    for (int j = i + 1; j < collective_nodes.size(); j++) {
+      if (collective_nodes[i]->requested_device() !=
+          collective_nodes[j]->requested_device()) {
+        continue;
+      }
+      if (instance_keys[i] == instance_keys[j]) {
+        return errors::Internal("Unexpected same instance_key ",
+                                instance_keys[i],
+                                " on 2 nodes with the same device ",
+                                collective_nodes[i]->requested_device());
+      }
+      const auto& deps_j = (*data_dependencies)[collective_nodes[j]];
+      if (deps_i.find(instance_keys[j]) == deps_i.end() &&
+          deps_j.find(instance_keys[i]) == deps_j.end()) {
+        int src_idx = instance_keys[i] > instance_keys[j] ? i : j;
+        int dst_idx = instance_keys[i] > instance_keys[j] ? j : i;
+        Node* src_node = collective_nodes[src_idx];
+        Node* dst_node = collective_nodes[dst_idx];
+        VLOG(1) << "Adding control dependency from node " << src_node->name()
+                << " instance " << instance_keys[src_idx] << " to node "
+                << dst_node->name() << " instance " << instance_keys[dst_idx];
+        (*dependency_edges)[src_node].insert(dst_node);
+        auto& src_paths = all_paths[src_node];
+        src_paths.insert(dst_node);
+        for (Node* downstream_node : all_paths[dst_node]) {
+          src_paths.insert(downstream_node);
+        }
+      }
+    }
+  }
+
+  // Prune dependency edges so that if there are edges a -> b, b -> c, and a ->
+  // c, then remove a -> c.  This dependency would be handled naturally during
+  // op scheduling.
+  for (int i = 0; i < collective_nodes.size(); ++i) {
+    Node* node = collective_nodes[i];
+    auto& neighbor_set = (*dependency_edges)[node];
+    std::vector<Node*> neighbor_list(neighbor_set.begin(), neighbor_set.end());
+    // For all n1, n2 in `neighbor_list` if there is a path from n1 -> n2 then
+    // eliminate n2 from `neighbor_set` and `neighbor_list`.  We remove from
+    // `neighbor_list` by replacing with a `nullptr`, hence the `nullptr` checks
+    // below.
+    for (int j = 0; j < neighbor_list.size(); ++j) {
+      Node* n1 = neighbor_list[j];
+      if (n1 == nullptr) continue;
+      auto& n1_paths = all_paths[n1];
+      for (int k = 0; k < neighbor_list.size(); ++k) {
+        Node* n2 = neighbor_list[k];
+        if (j == k || n2 == nullptr) continue;
+        if (n1_paths.find(n2) != n1_paths.end()) {
+          neighbor_set.erase(n2);
+          neighbor_list[k] = nullptr;
+        }
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
+// Insert control dependencies defined by `dependency_edges` in `graph`.  If
+// `order_type` is `kEdges`, insert explicit control edges, else if `order_type`
+// is `kAttrs`, encode depdencies as an attribute on collective node.
+Status InsertControlDependencies(
+    Graph* graph, GraphCollectiveOrder order_type,
+    const absl::flat_hash_map<Node*, absl::flat_hash_set<Node*>>&
+        dependency_edges) {
+  if (order_type == GraphCollectiveOrder::kEdges) {
+    for (const auto& pair : dependency_edges) {
+      Node* src_node = pair.first;
+      for (Node* dst_node : pair.second) {
+        graph->AddControlEdge(src_node, dst_node);
+      }
+    }
+  } else if (order_type == GraphCollectiveOrder::kAttrs) {
+    // `wait_for` is the inverse of `dependency_edges`, i.e. `wait_for[node]`
+    // contains the list of instance keys for which `node` must wait.
+    absl::flat_hash_map<Node*, absl::flat_hash_set<int32>> wait_for;
+    for (const auto& pair : dependency_edges) {
+      int32 src_instance;
+      TF_RETURN_IF_ERROR(
+          GetNodeAttr(pair.first->attrs(), "instance_key", &src_instance));
+      for (Node* dst_node : pair.second) {
+        wait_for[dst_node].insert(src_instance);
+      }
+    }
+    for (const auto& pair : wait_for) {
+      std::vector<int32> wait_for_list(pair.second.begin(), pair.second.end());
+      pair.first->ClearAttr("wait_for");
+      pair.first->AddAttr("wait_for", wait_for_list);
+    }
+  } else {
+    return errors::Internal("Unexpected GraphCollectiveOrder type ",
+                            static_cast<int>(order_type));
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
+Status OrderCollectives(Graph* graph, GraphCollectiveOrder order_type) {
+  // `instance_keys[i]` corresponds to `collective_nodes[i]`
+  std::vector<Node*> collective_nodes;
+  std::vector<int32> instance_keys;
+  // node -> set of collectives on which node depends.
+  absl::flat_hash_map<Node*, absl::flat_hash_set<int32>> data_dependencies;
+  TF_RETURN_IF_ERROR(DiscoverDataDependencies(
+      graph, &collective_nodes, &instance_keys, &data_dependencies));
+
+  if (collective_nodes.empty()) return Status::OK();
+
+  absl::flat_hash_map<Node*, absl::flat_hash_set<Node*>> dependency_edges;
+  // For all pairs of collective nodes n1 and n2 on the same device, if n1 does
+  // not depend on n2 and n2 does not depend on n1, then they are potentially
+  // concurrent.  Create an arbitrary, deterministic ordering between them.
+  TF_RETURN_IF_ERROR(CreateControlDependencies(
+      collective_nodes, instance_keys, &data_dependencies, &dependency_edges));
+
+  return InsertControlDependencies(graph, order_type, dependency_edges);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/graph/collective_order.h b/tensorflow/core/graph/collective_order.h
new file mode 100644
index 0000000000000000000000000000000000000000..67a1427a96635f08d0fbe9f77f92d4d213a93dd8
--- /dev/null
+++ b/tensorflow/core/graph/collective_order.h
@@ -0,0 +1,36 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_GRAPH_COLLECTIVE_ORDER_H_
+#define TENSORFLOW_CORE_GRAPH_COLLECTIVE_ORDER_H_
+
+#include "tensorflow/core/graph/graph.h"
+
+namespace tensorflow {
+
+enum class GraphCollectiveOrder { kNone, kEdges, kAttrs };
+
+// Introduces a deterministic execution order between potentially concurrent
+// CollectiveOps.  This may be used to execute collectives in the same order
+// across all workers in a distributed execution, if all workers are executing
+// the same graph.
+// If `order_type` is `kEdges`, introduce the ordering in the form of explicit
+// control edges between collective graph nodes.  If `order_type` is `kAttrs`,
+// add an attribute to the node which may be used by collective executor to
+// ensure the required ordering.
+Status OrderCollectives(Graph* graph, GraphCollectiveOrder order_type);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPH_COLLECTIVE_ORDER_H_
diff --git a/tensorflow/core/graph/collective_order_test.cc b/tensorflow/core/graph/collective_order_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9a158e5c3fd040ca2242249aec51f701e785a4b6
--- /dev/null
+++ b/tensorflow/core/graph/collective_order_test.cc
@@ -0,0 +1,235 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/graph/collective_order.h"
+
+#include <gmock/gmock.h>
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/graph/graph_def_builder_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+using ::testing::UnorderedElementsAreArray;
+
+REGISTER_OP("TestParams").Output("o: float");
+
+// Verifies that the list of collective nodes in `graph` matches
+// `expected_collective_nodes`, and that the list of control edges between these
+// collective nodes matches `expected_collective_control_edges`.
+void VerifyGraph(const Graph& graph,
+                 const std::vector<string>& expected_collective_nodes,
+                 const std::vector<std::pair<string, string>>&
+                     expected_collective_control_edges) {
+  std::vector<string> actual_collective_nodes;
+  std::vector<std::pair<string, string>> actual_collective_control_edges;
+  for (const Node* src : graph.nodes()) {
+    if (!src->IsCollective()) {
+      continue;
+    }
+    actual_collective_nodes.push_back(src->name());
+    for (const Edge* edge : src->out_edges()) {
+      VLOG(2) << "collective edge " << edge->src()->name() << " -> "
+              << edge->dst()->name();
+      // Add all control edges found except those to `_SINK`.
+      if (!edge->IsControlEdge() || edge->dst()->name() == "_SINK") {
+        continue;
+      }
+      actual_collective_control_edges.emplace_back(src->name(),
+                                                   edge->dst()->name());
+    }
+  }
+  EXPECT_THAT(actual_collective_nodes,
+              UnorderedElementsAreArray(expected_collective_nodes));
+  EXPECT_THAT(actual_collective_control_edges,
+              UnorderedElementsAreArray(expected_collective_control_edges));
+}
+
+// Verifies that the `wait_for` attribute on collective nodes matches
+// `wait_for_map`.
+void VerifyAttrs(
+    const Graph& graph,
+    const std::unordered_map<string, std::vector<int32>> wait_for_map) {
+  for (const Node* node : graph.nodes()) {
+    if (node->IsCollective() ||
+        wait_for_map.find(node->name()) == wait_for_map.end()) {
+      continue;
+    }
+    std::vector<int32> wait_for_actual;
+    TF_EXPECT_OK(GetNodeAttr(node->attrs(), "wait_for", &wait_for_actual));
+    auto wait_for_expected = wait_for_map.at(node->name());
+    EXPECT_THAT(wait_for_actual, UnorderedElementsAreArray(wait_for_expected));
+  }
+}
+
+Node* CollectiveReduceNode(GraphDefBuilder* builder, Node* input,
+                           const string& name, const string& device,
+                           int instance_key) {
+  Node* collective_node =
+      ops::UnaryOp("CollectiveReduce", input,
+                   builder->opts()
+                       .WithName(name)
+                       .WithDevice(device)
+                       .WithAttr("T", DT_FLOAT)
+                       .WithAttr("group_size", 2)
+                       .WithAttr("group_key", 1)
+                       .WithAttr("instance_key", instance_key)
+                       .WithAttr("merge_op", "Add")
+                       .WithAttr("final_op", "Id")
+                       .WithAttr("subdiv_offsets", {1}));
+  return collective_node;
+}
+
+// Initialize the following graph:
+//
+//       (cpu0) (cpu1)
+//         a      b
+//         |      |
+//         c1     c1
+//         |      |
+//         id     id
+//        /  \   /  \
+//       c2  c3 c2  c3
+//
+// Here ci denotes a collective node with `instance_key` i.  `a` and `b` are
+// inputs, `id` is identity node.
+std::unique_ptr<Graph> InitGraph() {
+  GraphDefBuilder builder(GraphDefBuilder::kFailImmediately);
+  const string dev0 = "/job:localhost/replica:0/task:0/device:CPU:0";
+  const string dev1 = "/job:localhost/replica:0/task:0/device:CPU:1";
+  Node* a = ops::SourceOp("TestParams",
+                          builder.opts().WithName("a").WithDevice(dev0));
+  Node* b = ops::SourceOp("TestParams",
+                          builder.opts().WithName("b").WithDevice(dev1));
+  Node* c1_0 = CollectiveReduceNode(&builder, a, "c1_0", dev0, 1);
+  Node* c1_1 = CollectiveReduceNode(&builder, b, "c1_1", dev1, 1);
+  Node* id0 = ops::UnaryOp(
+      "Identity", c1_0,
+      builder.opts().WithName("id0").WithDevice(dev0).WithAttr("T", DT_FLOAT));
+  Node* id1 = ops::UnaryOp(
+      "Identity", c1_1,
+      builder.opts().WithName("id1").WithDevice(dev1).WithAttr("T", DT_FLOAT));
+  CollectiveReduceNode(&builder, id0, "c2_0", dev0, 2);
+  CollectiveReduceNode(&builder, id1, "c2_1", dev1, 2);
+  CollectiveReduceNode(&builder, id0, "c3_0", dev0, 3);
+  CollectiveReduceNode(&builder, id1, "c3_1", dev1, 3);
+
+  std::unique_ptr<Graph> graph = absl::make_unique<Graph>(OpRegistry::Global());
+  Status s = GraphDefBuilderToGraph(builder, graph.get());
+  if (!s.ok()) {
+    LOG(FATAL) << "Error building graph " << s;
+  }
+  return graph;
+}
+
+// Tests that in the graph created by `InitGraph`, exactly 2 control edges are
+// added after calling `OrderCollectives`: c3_0 -> c2_0 and c3_1 -> c2_1.
+TEST(CollectiveOrderTest, SimpleOrder) {
+  std::unique_ptr<Graph> graph = InitGraph();
+  TF_EXPECT_OK(OrderCollectives(graph.get(), GraphCollectiveOrder::kEdges));
+  VerifyGraph(*graph, {"c1_0", "c1_1", "c2_0", "c2_1", "c3_0", "c3_1"},
+              {{"c3_0", "c2_0"}, {"c3_1", "c2_1"}});
+}
+
+TEST(CollectiveOrderTest, SimpleOrderAttr) {
+  std::unique_ptr<Graph> graph = InitGraph();
+  TF_EXPECT_OK(OrderCollectives(graph.get(), GraphCollectiveOrder::kAttrs));
+  VerifyAttrs(*graph, {{"c2_0", {3}}, {"c2_1", {3}}});
+}
+
+// Initialize the following graph:
+//
+//         a
+//         |
+//         c1
+//        /  \
+//       c4  id
+//          /  \
+//         c2  c3
+//
+// Here ci denotes a collective node with `instance_key` i.  `a` is an input,
+// `id` is identity node.
+std::unique_ptr<Graph> InitGraph2() {
+  GraphDefBuilder builder(GraphDefBuilder::kFailImmediately);
+  const string dev0 = "/job:localhost/replica:0/task:0/device:CPU:0";
+  Node* a = ops::SourceOp("TestParams",
+                          builder.opts().WithName("a").WithDevice(dev0));
+  Node* c1 = CollectiveReduceNode(&builder, a, "c1", dev0, 1);
+  CollectiveReduceNode(&builder, c1, "c4", dev0, 4);
+  Node* id = ops::UnaryOp(
+      "Identity", c1,
+      builder.opts().WithName("id").WithDevice(dev0).WithAttr("T", DT_FLOAT));
+  CollectiveReduceNode(&builder, id, "c2", dev0, 2);
+  CollectiveReduceNode(&builder, id, "c3", dev0, 3);
+
+  std::unique_ptr<Graph> graph = absl::make_unique<Graph>(OpRegistry::Global());
+  Status s = GraphDefBuilderToGraph(builder, graph.get());
+  if (!s.ok()) {
+    LOG(FATAL) << "Error building graph " << s;
+  }
+  return graph;
+}
+
+// Tests that in the graph created by `InitGraph2`, we add the following control
+// edges after calling `OrderCollectives`: c4 -> c3, c3 -> c2.  c4->c2 is
+// pruned because it follows from the other two edges.
+TEST(CollectiveOrderTest, SimpleOrder2) {
+  std::unique_ptr<Graph> graph = InitGraph2();
+  TF_EXPECT_OK(OrderCollectives(graph.get(), GraphCollectiveOrder::kEdges));
+  VerifyGraph(*graph, {"c1", "c2", "c3", "c4"}, {{"c4", "c3"}, {"c3", "c2"}});
+}
+
+// Initialize the following graph:
+//
+//         w   x   y   z
+//         |   |   |   |
+//         c1  c2  c3  c4
+//
+std::unique_ptr<Graph> InitGraphForPruning() {
+  GraphDefBuilder builder(GraphDefBuilder::kFailImmediately);
+  const string dev0 = "/job:localhost/replica:0/task:0/device:CPU:0";
+  Node* w = ops::SourceOp("TestParams",
+                          builder.opts().WithName("w").WithDevice(dev0));
+  Node* x = ops::SourceOp("TestParams",
+                          builder.opts().WithName("x").WithDevice(dev0));
+  Node* y = ops::SourceOp("TestParams",
+                          builder.opts().WithName("y").WithDevice(dev0));
+  Node* z = ops::SourceOp("TestParams",
+                          builder.opts().WithName("z").WithDevice(dev0));
+  CollectiveReduceNode(&builder, w, "c1", dev0, 1);
+  CollectiveReduceNode(&builder, x, "c2", dev0, 2);
+  CollectiveReduceNode(&builder, y, "c3", dev0, 3);
+  CollectiveReduceNode(&builder, z, "c4", dev0, 4);
+
+  std::unique_ptr<Graph> graph = absl::make_unique<Graph>(OpRegistry::Global());
+  Status s = GraphDefBuilderToGraph(builder, graph.get());
+  if (!s.ok()) {
+    LOG(FATAL) << "Error building graph " << s;
+  }
+  return graph;
+}
+
+// Tests that in the graph created by `InitGraphForPruning`, we only add c4 ->
+// c3, c3 -> c2, c2 -> c1, and other edges are pruned away.
+TEST(CollectiveOrderTest, Pruning) {
+  std::unique_ptr<Graph> graph = InitGraphForPruning();
+  TF_EXPECT_OK(OrderCollectives(graph.get(), GraphCollectiveOrder::kAttrs));
+  VerifyAttrs(*graph, {{"c3", {4}}, {"c2", {3}}, {"c1", {2}}});
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/graph/control_flow.cc b/tensorflow/core/graph/control_flow.cc
index 8e1e56d29bc474dedf7c0b01dbdf8099ebf86c4d..66237a349796929d17bab473a390e9bba35480ad 100644
--- a/tensorflow/core/graph/control_flow.cc
+++ b/tensorflow/core/graph/control_flow.cc
@@ -59,7 +59,7 @@ Status ValidateControlFlowInfo(const Graph* graph,
           "Invalid loop structure: Mismatched parent frames for \"",
           cf.frame_name, "\": \"", parent->name, "\" vs \"", frame.parent->name,
           "\". The node giving this error: ", FormatNodeForError(*node),
-          "This is an internal bug, please file a bug report with "
+          ". This is an internal bug, please file a bug report with "
           "instructions on how to reproduce the error.");
     }
     if (IsLoopCond(node)) {
diff --git a/tensorflow/core/graph/edgeset.cc b/tensorflow/core/graph/edgeset.cc
index 2e0c67146169d4b0fe3bbb548c70451b2b1907b9..e3b88994b5e24fae7c76137e920bb46f4f01aa29 100644
--- a/tensorflow/core/graph/edgeset.cc
+++ b/tensorflow/core/graph/edgeset.cc
@@ -38,9 +38,8 @@ std::pair<EdgeSet::const_iterator, bool> EdgeSet::insert(value_type value) {
     }
     // array is full. convert to set.
     s = new std::set<const Edge*>;
-    for (int i = 0; i < kInline; i++) {
-      s->insert(static_cast<const Edge*>(ptrs_[i]));
-    }
+    s->insert(reinterpret_cast<const Edge**>(std::begin(ptrs_)),
+              reinterpret_cast<const Edge**>(std::end(ptrs_)));
     ptrs_[0] = this;
     ptrs_[1] = s;
     // fall through.
diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc
index 550e3ef915290c499c904c14e2ca8c5fa7e4a981..3ea222c13c5aa06f708bce61454cef9c24e56c8b 100644
--- a/tensorflow/core/graph/graph.cc
+++ b/tensorflow/core/graph/graph.cc
@@ -216,6 +216,16 @@ void Node::set_requested_device(const string& device) {
   props_->node_def.set_device(device);
 }
 
+void Node::set_original_node_names(const std::vector<string>& names) {
+  MaybeCopyOnWrite();
+  props_->node_def.mutable_experimental_debug_info()
+      ->clear_original_node_names();
+  if (!names.empty()) {
+    *props_->node_def.mutable_experimental_debug_info()
+         ->mutable_original_node_names() = {names.begin(), names.end()};
+  }
+}
+
 Status Node::input_edge(int idx, const Edge** e) const {
   if (idx < 0 || idx >= num_inputs()) {
     return errors::InvalidArgument("Invalid input_edge index: ", idx, ", Node ",
@@ -293,6 +303,16 @@ Status Node::input_tensor(int idx, OutputTensor* t) const {
   return Status::OK();
 }
 
+// NodeDebugInfo
+
+NodeDebugInfo::NodeDebugInfo(const Node& n) : NodeDebugInfo(n.def()) {}
+NodeDebugInfo::NodeDebugInfo(const NodeDef& ndef) : name(ndef.name()) {
+  if (ndef.has_experimental_debug_info()) {
+    const auto& names = ndef.experimental_debug_info().original_node_names();
+    original_node_names.assign(names.begin(), names.end());
+  }
+}
+
 // InputTensor
 
 bool InputTensor::operator==(const InputTensor& other) const {
@@ -555,7 +575,13 @@ Status Graph::AddWhileInputHack(Node* new_src, int new_src_index, Node* dst) {
         dst->DebugString());
   }
   TF_RETURN_IF_ERROR(IsValidOutputTensor(new_src, new_src_index));
-  int dst_index = dst->in_edges().size();
+  // Find the current number of data inputs. We'll add the new edge to the next
+  // missing data input.
+  int dst_index = 0;
+  for (const Edge* edge : dst->in_edges()) {
+    if (edge->IsControlEdge()) continue;
+    ++dst_index;
+  }
   TF_RETURN_IF_ERROR(IsValidInputTensor(dst, dst_index));
   AddEdge(new_src, new_src_index, dst, dst_index);
   dst->MaybeCopyOnWrite();
diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h
index 667eaba24c3341cbafc68c92ac5e9fa23dbe669d..289a3d2a2307280830e23b8b12513e20feccb153 100644
--- a/tensorflow/core/graph/graph.h
+++ b/tensorflow/core/graph/graph.h
@@ -120,6 +120,10 @@ class Node {
   int assigned_device_name_index() const { return assigned_device_name_index_; }
   void set_assigned_device_name_index(int index);
 
+  // Sets 'original_node_names' field of this node's DebugInfo proto to
+  // 'names'.
+  void set_original_node_names(const std::vector<string>& names);
+
   // Read only access to attributes
   AttrSlice attrs() const;
 
@@ -290,6 +294,15 @@ class Node {
   TF_DISALLOW_COPY_AND_ASSIGN(Node);
 };
 
+// Stores debug information associated with the Node.
+struct NodeDebugInfo {
+  const string name;
+  std::vector<string> original_node_names;
+
+  NodeDebugInfo(const Node& n);
+  NodeDebugInfo(const NodeDef& ndef);
+};
+
 // Represents an input of a node, i.e., the `index`-th input to `node`.
 struct InputTensor {
   Node* node;
diff --git a/tensorflow/core/graph/graph_constructor.cc b/tensorflow/core/graph/graph_constructor.cc
index f6d83d5f6fff9be372e512e2ff7b8366201bdd81..ac1b690df315a0086fe00f0a720ecf87534452f2 100644
--- a/tensorflow/core/graph/graph_constructor.cc
+++ b/tensorflow/core/graph/graph_constructor.cc
@@ -35,6 +35,8 @@ limitations under the License.
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/tensor_id.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/strings/scanner.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -268,22 +270,20 @@ class GraphConstructor {
     int gdef_index;
     Node* node;  // nullptr until the NodeDef is converted to a Node.
   };
-  // TODO(vrv): Profile this data structure to see if we should use an
-  // alternative implementation of std::unordered_map.
-  std::unordered_map<StringPiece, NodeInfo, StringPieceHasher> gdef_nodes_;
+  gtl::FlatMap<StringPiece, NodeInfo, StringPieceHasher> gdef_nodes_;
 
   // Prefixes already used in the GraphDef being imported.
-  std::unordered_set<StringPiece, StringPieceHasher> gdef_prefixes_;
+  gtl::FlatSet<StringPiece, StringPieceHasher> gdef_prefixes_;
 
   // Mapping from node name to the existing node in g_.
-  std::unordered_map<StringPiece, Node*, StringPieceHasher> existing_nodes_;
+  gtl::FlatMap<StringPiece, Node*, StringPieceHasher> existing_nodes_;
 
   // Prefixes already used in the graph.
-  std::unordered_set<StringPiece, StringPieceHasher> existing_prefixes_;
+  gtl::FlatSet<StringPiece, StringPieceHasher> existing_prefixes_;
 
   // Imported node names that have been uniquified. The key is the original
   // name, the value is the new unique name.
-  std::unordered_map<string, string> uniquified_names_;
+  gtl::FlatMap<string, string> uniquified_names_;
 
   // Index of NodeDefs in node_defs_ with all inputs already converted. We use a
   // (sorted) set so nodes are created in the order defined in the GraphDef.
@@ -360,7 +360,7 @@ bool NodeNameInValues(const std::vector<string>& control_dependencies,
 // Adds any prefixes of `node_name` (not including the full name itself) to
 // `prefixes`.
 void AddPrefixes(StringPiece node_name,
-                 std::unordered_set<StringPiece, StringPieceHasher>* prefixes) {
+                 gtl::FlatSet<StringPiece, StringPieceHasher>* prefixes) {
   size_t idx = -1;
   while ((idx = node_name.find('/', idx + 1)) != StringPiece::npos) {
     prefixes->insert(node_name.substr(0, idx));
@@ -857,7 +857,7 @@ void GraphConstructor::UpdateUniquifiedColocationNames() {
     for (int i = 0; i < coloc_values.size(); ++i) {
       StringPiece val(coloc_values[i]);
       if (str_util::ConsumePrefix(&val, kColocationGroupPrefix)) {
-        const auto& name_pair = uniquified_names_.find(string(val));
+        auto name_pair = uniquified_names_.find(string(val));
         if (name_pair == uniquified_names_.end()) continue;
         updated = true;
         coloc_values[i] =
diff --git a/tensorflow/core/graph/graph_partition.cc b/tensorflow/core/graph/graph_partition.cc
index 9c640c42a5891b632e18517c848cc9a0c76a0f45..00c7a5b091c0dbfbcf08a3611faaab4d41a08152 100644
--- a/tensorflow/core/graph/graph_partition.cc
+++ b/tensorflow/core/graph/graph_partition.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/memory_types.h"
 #include "tensorflow/core/framework/node_def_builder.h"
@@ -58,22 +59,15 @@ struct DupRecvKey {
   int src_output_slot;       // Edge's src node output slot
   GraphDef* dst_graph;       // Edge's dst node is in this subgraph
   bool recv_output_on_host;  // The output of recv is on host
-};
 
-struct DupRecvKeyHash {
-  size_t operator()(const DupRecvKey& k) const {
-    size_t h = Hash64(reinterpret_cast<const char*>(&k.src_node_id),
-                      sizeof(k.src_node_id), k.src_output_slot);
-    h = Hash64(reinterpret_cast<const char*>(&k.dst_graph), sizeof(k.dst_graph),
-               h);
-    h = Hash64(reinterpret_cast<const char*>(&k.recv_output_on_host),
-               sizeof(k.recv_output_on_host), h);
-    return h;
+  template <typename H>
+  friend H AbslHashValue(H h, const DupRecvKey& c) {
+    return H::combine(std::move(h), c.src_node_id, c.src_output_slot,
+                      reinterpret_cast<std::uintptr_t>(c.dst_graph),
+                      c.recv_output_on_host);
   }
-};
 
-struct DupRecvKeyEq {
-  bool operator()(const DupRecvKey& x, const DupRecvKey& y) const {
+  friend bool operator==(const DupRecvKey& x, const DupRecvKey& y) {
     return (x.src_node_id == y.src_node_id) &&
            (x.src_output_slot == y.src_output_slot) &&
            (x.dst_graph == y.dst_graph) &&
@@ -88,19 +82,26 @@ struct RecvInfo {
   int64 start_time;
 };
 
-typedef std::unordered_map<DupRecvKey, RecvInfo, DupRecvKeyHash, DupRecvKeyEq>
-    DupRecvTable;
+typedef absl::flat_hash_map<DupRecvKey, RecvInfo> DupRecvTable;
 
-struct PairIntHash {
- public:
-  std::size_t operator()(const std::pair<int, int>& x) const {
-    return std::hash<int>()(x.first) ^ std::hash<int>()(x.second);
-  }
-};
 // A map used to store memory types for the inputs/outputs of every node.
 // The key is a pair of ints consisting of a node id and input/output index.
-typedef std::unordered_map<std::pair<int, int>, MemoryType, PairIntHash>
-    MemoryTypeMap;
+// TODO(power): migrate back to std::pair when absl::Hash is fixed for MSVC.
+struct NodePort {
+  int node_id;
+  int index;
+
+  friend bool operator==(const NodePort& x, const NodePort& y) {
+    return x.node_id == y.node_id && x.index == y.index;
+  }
+
+  template <typename H>
+  friend H AbslHashValue(H h, const NodePort& c) {
+    return H::combine(std::move(h), c.node_id, c.index);
+  }
+};
+
+typedef absl::flat_hash_map<NodePort, MemoryType> MemoryTypeMap;
 
 // We collect the following information about the graph before performing
 // graph partitioning.
@@ -209,7 +210,8 @@ NodeDef* AddSend(const PartitionOptions& opts, const GraphInfo& g_info,
   // NOTE(yuanbyu): Only cast for cross-device send/recv.
   if (dtype != cast_dtype && !NeedSameDeviceSendRecv(edge, g_info)) {
     const string cast_op = (host_memory) ? "_HostCast" : "Cast";
-    NodeDefBuilder cast_builder(opts.new_name(src->name()), cast_op);
+    NodeDefBuilder cast_builder(opts.new_name(src->name()), cast_op,
+                                NodeDebugInfo(*src));
     cast_builder.Device(src->assigned_device_name()).Input(send_from);
     if (opts.scheduling_for_recvs) {
       cast_builder.Attr("_start_time", start_time);
@@ -233,7 +235,8 @@ NodeDef* AddSend(const PartitionOptions& opts, const GraphInfo& g_info,
 
   // Add the send node.
   const string send_op = (host_memory) ? "_HostSend" : "_Send";
-  NodeDefBuilder send_builder(opts.new_name(src->name()), send_op);
+  NodeDefBuilder send_builder(opts.new_name(src->name()), send_op,
+                              NodeDebugInfo(*src));
   SetSendRecvAttrs(opts, edge, &send_builder);
   send_builder.Device(src->assigned_device_name()).Input(send_from);
   if (opts.scheduling_for_recvs) {
@@ -268,7 +271,8 @@ NodeDef* AddRecv(const PartitionOptions& opts, const GraphInfo& g_info,
 
   // Add the recv node.
   const string recv_op = (host_memory) ? "_HostRecv" : "_Recv";
-  NodeDefBuilder recv_builder(opts.new_name(src->name()), recv_op);
+  NodeDefBuilder recv_builder(opts.new_name(src->name()), recv_op,
+                              NodeDebugInfo(*src));
   SetSendRecvAttrs(opts, edge, &recv_builder);
   recv_builder.Device(dst->assigned_device_name())
       .Attr("tensor_type", cast_dtype);
@@ -280,7 +284,8 @@ NodeDef* AddRecv(const PartitionOptions& opts, const GraphInfo& g_info,
   // Add the cast node (from cast_dtype to dtype) or an Identity node.
   if (dtype != cast_dtype) {
     const string cast_op = (host_memory) ? "_HostCast" : "Cast";
-    NodeDefBuilder cast_builder(opts.new_name(src->name()), cast_op);
+    NodeDefBuilder cast_builder(opts.new_name(src->name()), cast_op,
+                                NodeDebugInfo(*src));
     cast_builder.Attr("DstT", dtype);
     cast_builder.Device(dst->assigned_device_name())
         .Input(recv->name(), 0, cast_dtype);
@@ -290,7 +295,8 @@ NodeDef* AddRecv(const PartitionOptions& opts, const GraphInfo& g_info,
     return cast;
   } else if (edge->IsControlEdge()) {
     // An Identity is only needed for control edges.
-    NodeDefBuilder id_builder(opts.new_name(src->name()), "Identity");
+    NodeDefBuilder id_builder(opts.new_name(src->name()), "Identity",
+                              NodeDebugInfo(*src));
     id_builder.Device(dst->assigned_device_name())
         .Input(recv->name(), 0, cast_dtype);
     NodeDef* id = gdef->add_node();
@@ -559,10 +565,10 @@ Status BuildMemoryDeviceInfo(const Graph& g, GraphInfo* info) {
 
     int node_id = node->id();
     info->device_types[node_id] = DeviceType(parsed.type);
-    for (size_t i = 0; i < input_memory_types.size(); ++i) {
+    for (int i = 0; i < input_memory_types.size(); ++i) {
       info->input_types[{node_id, i}] = input_memory_types[i];
     }
-    for (size_t i = 0; i < output_memory_types.size(); ++i) {
+    for (int i = 0; i < output_memory_types.size(); ++i) {
       info->output_types[{node_id, i}] = output_memory_types[i];
     }
   }
@@ -982,6 +988,7 @@ Status Partition(const PartitionOptions& opts, Graph* g,
     GraphDef* dst_graph = &(*partitions)[dstp];
     NodeDef* dst_def = dst_graph->add_node();
     *dst_def = dst->def();
+    MergeDebugInfo(NodeDebugInfo(dst->def()), dst_def);
     dst_def->set_device(dst->assigned_device_name());
     dst_def->clear_input();  // Inputs are filled below
     if (opts.need_to_record_start_times) {
diff --git a/tensorflow/core/graph/graph_test.cc b/tensorflow/core/graph/graph_test.cc
index 333c32567fc9b922951b558c86f29087da770894..602578a83a3fcc01dbb61841051da92ffc366144 100644
--- a/tensorflow/core/graph/graph_test.cc
+++ b/tensorflow/core/graph/graph_test.cc
@@ -661,6 +661,10 @@ TEST_F(GraphTest, BuildNodeNameIndex) {
 }
 
 REGISTER_OP("Input").Output("y: float");
+REGISTER_OP("Output")
+    .Input("x: N * float")
+    .Attr("N: int >= 1")
+    .Output("y: float");
 REGISTER_OP("In2Out1").Input("a: float").Input("b: float").Output("y: float");
 REGISTER_OP("In4Out1")
     .Input("a: float")
@@ -713,7 +717,14 @@ GraphDef CreateGraphDef(int num_nodes, int num_edges_per_node) {
     }
     s += strings::Printf("'in%04d' ] } ", rnd.Uniform(kNumInNodes));
   }
-
+  // Add a single sink node. Otherwise a lot of time is spent in
+  // FixupSourceAndSinkEdges().
+  s += strings::Printf("node { name: 'out' op: 'Output' input: [ ");
+  for (int op = 0; op < num_nodes - 1; op++) {
+    s += strings::Printf("'op%05d', ", op);
+  }
+  s += strings::Printf("'op%05d' ], attr: { key: 'N' value { i: %d } } } ",
+                       num_nodes - 1, num_nodes);
   GraphDef graph_def;
   CHECK(protobuf::TextFormat::ParseFromString(s, &graph_def));
   return graph_def;
@@ -799,5 +810,44 @@ BENCHMARK(BM_GraphCreation)->ArgPair(1 << 9, 16);
 BENCHMARK(BM_GraphCreation)->ArgPair(1 << 12, 16);
 BENCHMARK(BM_GraphCreation)->ArgPair(1 << 15, 16);
 
+static void BM_ToGraphDef(int iters, int num_nodes, int num_edges_per_node) {
+  testing::StopTiming();
+  const GraphDef graph_def = CreateGraphDef(num_nodes, num_edges_per_node);
+  const auto registry = OpRegistry::Global();
+  GraphConstructorOptions opts;
+  // Warmup step.
+  Graph graph(registry);
+  TF_CHECK_OK(ConvertGraphDefToGraph(opts, graph_def, &graph));
+  int64 sum = 0;
+  testing::StartTiming();
+  for (int i = 0; i < iters; ++i) {
+    GraphDef graph_def;
+    graph.ToGraphDef(&graph_def);
+    sum += graph_def.node_size();
+  }
+  VLOG(1) << sum;
+  testing::StopTiming();
+}
+BENCHMARK(BM_ToGraphDef)->ArgPair(10, 2);
+BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 6, 2);
+BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 9, 2);
+BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 12, 2);
+BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 15, 2);
+BENCHMARK(BM_ToGraphDef)->ArgPair(10, 4);
+BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 6, 4);
+BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 9, 4);
+BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 12, 4);
+BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 15, 4);
+BENCHMARK(BM_ToGraphDef)->ArgPair(10, 8);
+BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 6, 8);
+BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 9, 8);
+BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 12, 8);
+BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 15, 8);
+BENCHMARK(BM_ToGraphDef)->ArgPair(10, 16);
+BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 6, 16);
+BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 9, 16);
+BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 12, 16);
+BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 15, 16);
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/graph/mkl_graph_util.h b/tensorflow/core/graph/mkl_graph_util.h
index 990b2fe9b04770dc875b949ec3e17c321fe018be..f36ca8c5a843c8f2e5e2860e8416d0533dc940ed 100644
--- a/tensorflow/core/graph/mkl_graph_util.h
+++ b/tensorflow/core/graph/mkl_graph_util.h
@@ -96,7 +96,7 @@ static inline bool IsMklOp(const string& op_name, DataType T) {
 
   // Restrict quantized ops to QUINT8 and QINT8 for now
   if (kernel.find(kMklQuantizedOpLabelPattern) != string::npos) {
-    return (T == DT_QUINT8 || T == DT_QINT8);
+    return (T == DT_QUINT8 || T == DT_QINT8 || T == DT_QINT32);
   }
   // Restrict regular ops to FLOAT
   if (kernel.find(kMklOpLabelPattern) != string::npos) {
diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index 52b46600943b31f4d0205d0eb120cc282c78240f..d5dcd16be7f8d459189774b9bd672b7c5dcdb7b9 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -258,9 +258,16 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     csinfo_.conv3d = "Conv3D";
     csinfo_.conv3d_grad_input = "Conv3DBackpropInputV2";
     csinfo_.conv3d_grad_filter = "Conv3DBackpropFilterV2";
+    csinfo_.depthwise_conv2d = "DepthwiseConv2dNative";
+    csinfo_.depthwise_conv2d_grad_input = "DepthwiseConv2dNativeBackpropInput";
+    csinfo_.depthwise_conv2d_grad_filter =
+        "DepthwiseConv2dNativeBackpropFilter";
     csinfo_.fused_batch_norm = "FusedBatchNorm";
     csinfo_.fused_batch_norm_grad = "FusedBatchNormGrad";
+    csinfo_.fused_conv2d = "_FusedConv2D";
     csinfo_.identity = "Identity";
+    csinfo_.leakyrelu = "LeakyRelu";
+    csinfo_.leakyrelu_grad = "LeakyReluGrad";
     csinfo_.lrn = "LRN";
     csinfo_.lrn_grad = "LRNGrad";
     csinfo_.matmul = "MatMul";
@@ -274,13 +281,16 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     csinfo_.mkl_conv2d_with_bias = "_MklConv2DWithBias";
     csinfo_.mkl_conv2d_grad_filter_with_bias =
         "_MklConv2DBackpropFilterWithBias";
+    csinfo_.mkl_depthwise_conv2d_grad_input =
+        "_MklDepthwiseConv2dNativeBackpropInput";
+    csinfo_.mkl_depthwise_conv2d_grad_filter =
+        "_MklDepthwiseConv2dNativeBackpropFilter";
+    csinfo_.mkl_fused_conv2d = "_MklFusedConv2D";
     csinfo_.mkl_pad_with_conv2d = "_MklPadWithConv2D";
+    csinfo_.mkl_pad_with_fused_conv2d = "_MklPadWithFusedConv2D";
     csinfo_.pad = "Pad";
     csinfo_.pad_with_conv2d = "__MklDummyPadWithConv2D";
-// Temporarily don't convert quantized operators into MKL versions for now.
-// TODO(Intel-tf) Once all the relevant PRs have been merged then remove
-// the ifdef.
-#ifdef INTEL_MKL_QUANTIZED
+    csinfo_.pad_with_fused_conv2d = "__MklDummyPadWithFusedConv2D";
     csinfo_.quantized_avg_pool = "QuantizedAvgPool";
     csinfo_.quantized_concatv2 = "QuantizedConcatV2";
     csinfo_.quantized_conv2d = "QuantizedConv2D";
@@ -302,14 +312,11 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
         "QuantizedConv2DWithBiasSumAndReluAndRequantize";
     csinfo_.quant_conv2d_with_bias_signed_sum_and_relu_and_requantize =
         "QuantizedConv2DWithBiasSignedSumAndReluAndRequantize";
-#endif
     csinfo_.relu = "Relu";
     csinfo_.relu_grad = "ReluGrad";
     csinfo_.relu6 = "Relu6";
     csinfo_.relu6_grad = "Relu6Grad";
-#ifdef INTEL_MKL_QUANTIZED
     csinfo_.requantize = "Requantize";
-#endif
     csinfo_.tanh = "Tanh";
     csinfo_.tanh_grad = "TanhGrad";
     csinfo_.reshape = "Reshape";
@@ -352,9 +359,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
                       CopyAttrsConcatV2, AlwaysRewrite});
     rinfo_.push_back({csinfo_.conv2d,
                       mkl_op_registry::GetMklOpName(csinfo_.conv2d),
-                      CopyAttrsConv, AlwaysRewrite});
+                      CopyAttrsConvCheckConstFilter, AlwaysRewrite});
     rinfo_.push_back({csinfo_.conv2d_with_bias, csinfo_.mkl_conv2d_with_bias,
-                      CopyAttrsConv, AlwaysRewrite});
+                      CopyAttrsConvCheckConstFilter, AlwaysRewrite});
     rinfo_.push_back({csinfo_.conv2d_grad_filter,
                       mkl_op_registry::GetMklOpName(csinfo_.conv2d_grad_filter),
                       CopyAttrsConv, AlwaysRewrite});
@@ -366,13 +373,24 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
                       CopyAttrsConv, AlwaysRewrite});
     rinfo_.push_back({csinfo_.conv3d,
                       mkl_op_registry::GetMklOpName(csinfo_.conv3d),
-                      CopyAttrsConv, AlwaysRewrite});
+                      CopyAttrsConvCheckConstFilter, AlwaysRewrite});
     rinfo_.push_back({csinfo_.conv3d_grad_filter,
                       mkl_op_registry::GetMklOpName(csinfo_.conv3d_grad_filter),
                       CopyAttrsConv, AlwaysRewrite});
     rinfo_.push_back({csinfo_.conv3d_grad_input,
                       mkl_op_registry::GetMklOpName(csinfo_.conv3d_grad_input),
                       CopyAttrsConv, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.depthwise_conv2d,
+                      mkl_op_registry::GetMklOpName(csinfo_.depthwise_conv2d),
+                      CopyAttrsConv2DDepthwiseCheckConstFilter, AlwaysRewrite});
+    rinfo_.push_back(
+        {csinfo_.depthwise_conv2d_grad_input,
+         mkl_op_registry::GetMklOpName(csinfo_.depthwise_conv2d_grad_input),
+         CopyAttrsConv2DDepthwise, AlwaysRewrite});
+    rinfo_.push_back(
+        {csinfo_.depthwise_conv2d_grad_filter,
+         mkl_op_registry::GetMklOpName(csinfo_.depthwise_conv2d_grad_filter),
+         CopyAttrsConv2DDepthwise, AlwaysRewrite});
     rinfo_.push_back({csinfo_.fused_batch_norm,
                       mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm),
                       CopyAttrsFusedBatchNorm, AlwaysRewrite});
@@ -380,6 +398,8 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
         {csinfo_.fused_batch_norm_grad,
          mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm_grad),
          CopyAttrsFusedBatchNorm, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.fused_conv2d, csinfo_.mkl_fused_conv2d,
+                      CopyAttrsFusedConv2D, FusedConv2DRewrite});
     rinfo_.push_back({csinfo_.identity,
                       mkl_op_registry::GetMklOpName(csinfo_.identity),
                       CopyAttrsDataType, AlwaysRewrite});
@@ -388,6 +408,12 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     rinfo_.push_back({csinfo_.lrn_grad,
                       mkl_op_registry::GetMklOpName(csinfo_.lrn_grad),
                       CopyAttrsLRN, LrnGradRewrite});
+    rinfo_.push_back({csinfo_.leakyrelu,
+                      mkl_op_registry::GetMklOpName(csinfo_.leakyrelu),
+                      CopyAttrsLeakyRelu, LeakyReluRewrite});
+    rinfo_.push_back({csinfo_.leakyrelu_grad,
+                      mkl_op_registry::GetMklOpName(csinfo_.leakyrelu_grad),
+                      CopyAttrsLeakyRelu, LeakyReluRewrite});
     rinfo_.push_back({csinfo_.max_pool,
                       mkl_op_registry::GetMklOpName(csinfo_.max_pool),
                       CopyAttrsPooling, NonDepthBatchWisePoolRewrite});
@@ -407,7 +433,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
                       CopyAttrsDataType, AlwaysRewrite});
     rinfo_.push_back({csinfo_.pad_with_conv2d, csinfo_.mkl_pad_with_conv2d,
                       CopyAttrsPadWithConv2D, AlwaysRewrite});
-#ifdef INTEL_MKL_QUANTIZED
+    rinfo_.push_back({csinfo_.pad_with_fused_conv2d,
+                      csinfo_.mkl_pad_with_fused_conv2d,
+                      CopyAttrsPadWithFusedConv2D, AlwaysRewrite});
     rinfo_.push_back({csinfo_.quantized_avg_pool,
                       mkl_op_registry::GetMklOpName(csinfo_.quantized_avg_pool),
                       CopyAttrsQuantizedPooling, AlwaysRewrite});
@@ -463,7 +491,6 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
          mkl_op_registry::GetMklOpName(
              csinfo_.quant_conv2d_with_bias_signed_sum_and_relu_and_requantize),
          CopyAttrsQuantizedConv2D, AlwaysRewrite});
-#endif
     rinfo_.push_back({csinfo_.relu, mkl_op_registry::GetMklOpName(csinfo_.relu),
                       CopyAttrsDataType, AlwaysRewrite});
     rinfo_.push_back({csinfo_.relu_grad,
@@ -475,11 +502,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     rinfo_.push_back({csinfo_.relu6_grad,
                       mkl_op_registry::GetMklOpName(csinfo_.relu6_grad),
                       CopyAttrsDataType, AlwaysRewrite});
-#ifdef INTEL_MKL_QUANTIZED
     rinfo_.push_back({csinfo_.requantize,
                       mkl_op_registry::GetMklOpName(csinfo_.requantize),
                       CopyAttrsRequantize, AlwaysRewrite});
-#endif
     /*
     rinfo_.push_back({csinfo_.tanh,
                       mkl_op_registry::GetMklOpName(csinfo_.tanh),
@@ -517,10 +542,13 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     minfo_.push_back({csinfo_.conv2d_grad_filter, csinfo_.bias_add_grad,
                       csinfo_.conv2d_grad_filter_with_bias,
                       GetConv2DBackpropFilterOrBiasAddGrad});
-    minfo_.push_back(
-        {csinfo_.pad, csinfo_.conv2d, csinfo_.pad_with_conv2d, GetPadOrConv2D});
     // Merge Pad and Conv2d, only if the pad op is "Pad"
     // Doesn't merge if pad op is "PadV2" or "MirrorPad"
+    minfo_.push_back(
+        {csinfo_.pad, csinfo_.conv2d, csinfo_.pad_with_conv2d, GetPadOrConv2D});
+
+    minfo_.push_back({csinfo_.pad, csinfo_.fused_conv2d,
+                      csinfo_.pad_with_fused_conv2d, GetPadOrFusedConv2D});
 
     // The fusion patterns in "finfo_" that show up first will get applied
     // first, for example, graph "A->B->C-D" and finfo_ is {A->B->C to ABC,
@@ -663,9 +691,15 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     string conv3d;
     string conv3d_grad_input;
     string conv3d_grad_filter;
+    string depthwise_conv2d;
+    string depthwise_conv2d_grad_input;
+    string depthwise_conv2d_grad_filter;
     string fused_batch_norm;
     string fused_batch_norm_grad;
+    string fused_conv2d;
     string identity;
+    string leakyrelu;
+    string leakyrelu_grad;
     string lrn;
     string lrn_grad;
     string matmul;
@@ -679,10 +713,15 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     string mkl_conv2d_grad_filter;
     string mkl_conv2d_grad_filter_with_bias;
     string mkl_conv2d_with_bias;
+    string mkl_depthwise_conv2d_grad_input;
+    string mkl_depthwise_conv2d_grad_filter;
+    string mkl_fused_conv2d;
     string mkl_pad_with_conv2d;
+    string mkl_pad_with_fused_conv2d;
     string mul;
     string pad;
     string pad_with_conv2d;
+    string pad_with_fused_conv2d;
     string quantized_avg_pool;
     string quantized_conv2d;
     string quantized_conv2d_with_requantize;
@@ -822,6 +861,12 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     CHECK_NOTNULL(m);
     Node* n = nullptr;
 
+    DataType T_m;
+    TF_CHECK_OK(GetNodeAttr(m->def(), "T", &T_m));
+
+    // Don't try to merge if datatype is not DT_FLOAT
+    if (T_m != DT_FLOAT) return n;
+
     if (m->type_string() == csinfo_.bias_add) {
       // If a is BiasAdd, then Conv2D is 0th input of BiasAdd.
       TF_CHECK_OK(m->input_node(0, &n));
@@ -856,6 +901,12 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     DCHECK(m);
     Node* n = nullptr;
 
+    DataType T_m;
+    TF_CHECK_OK(GetNodeAttr(m->def(), "T", &T_m));
+
+    // Don't try to merge if datatype is not DT_FLOAT
+    if (T_m != DT_FLOAT) return n;
+
     const Node* conv_node;
     if (m->type_string() == csinfo_.pad) {
       // If m is Pad, then Conv2D is the output of Pad.
@@ -896,6 +947,59 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
 
     return n;
   }
+
+  // Find Pad or _FusedConv2D node that can be merged with input node 'm'.
+  // If input 'm' is Pad, then check if there exists _FusedConv2D node that can
+  // be merged with 'm'. If input 'm' is _FusedConv2D, then check if there
+  // exists Pad node that can be merged with 'm'.
+  static Node* GetPadOrFusedConv2D(const Node* m) {
+    DCHECK(m);
+    Node* n = nullptr;
+
+    const Node* conv_node;
+    if (m->type_string() == csinfo_.pad) {
+      // If m is Pad, then _FusedConv2D is the output of Pad.
+      for (const Edge* e : m->out_edges()) {
+        if (!e->IsControlEdge() &&
+            e->dst()->type_string() == csinfo_.fused_conv2d) {
+          n = e->dst();
+          conv_node = n;
+          break;
+        }
+      }
+    } else {
+      DCHECK_EQ(m->type_string(), csinfo_.fused_conv2d);
+      // If m is _FusedConv2D, Go over all input edges
+      // and search for Pad node.
+      for (const Edge* e : m->in_edges()) {
+        if (!e->IsControlEdge() && e->src()->type_string() == csinfo_.pad) {
+          n = e->src();
+          conv_node = m;
+          break;
+        }
+      }
+    }
+    // Check if only VALID type of padding is used or not.
+    if (n != nullptr) {
+      string padding;
+      TF_CHECK_OK(GetNodeAttr(conv_node->def(), "padding", &padding));
+      if (padding != "VALID") {
+        // Then do not merge.
+        n = nullptr;
+        VLOG(1) << "MklLayoutRewritePass: Could match Pad and _FusedConv2D "
+                << "nodes but cannot merge them. Only conv ops with padding "
+                << "type VALID can be merged with Pad op Input node: "
+                << m->DebugString();
+      }
+    } else {
+      VLOG(1) << "MklLayoutRewritePass: Could not find matching "
+              << "Pad and _FusedConv2D node for merging. Input node: "
+              << m->DebugString();
+    }
+
+    return n;
+  }
+
   // Find Conv2DBackpropFilter or BiasAddGrad node that can be merged with input
   // node 'm'. If input 'm' is Conv2DBackpropFilter, then check if there exists
   // BiasAddGrad node that can be merged with 'm'. If input 'm' is BiasAddGrad,
@@ -914,6 +1018,12 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     CHECK_NOTNULL(m);
     Node* n = nullptr;
 
+    DataType T_m;
+    TF_CHECK_OK(GetNodeAttr(m->def(), "T", &T_m));
+
+    // Don't try to merge if datatype is not DT_FLOAT
+    if (T_m != DT_FLOAT) return n;
+
     if (m->type_string() == csinfo_.bias_add_grad) {
       // Get 1st input 'g' of BiasAddGrad.
       Node* g = nullptr;
@@ -1009,7 +1119,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
             e->dst_input() == kPermTensorIndex) {
           // we find the "perm" node, now try to retrieve its value.
           const TensorProto* proto = nullptr;
-          DCHECK(GetNodeAttr(perm_node->def(), "value", &proto).ok());
+          TF_CHECK_OK(GetNodeAttr(perm_node->def(), "value", &proto));
 
           DataType type;
           GetNodeAttr(perm_node->def(), "dtype", &type);
@@ -1142,6 +1252,30 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     return do_rewrite;
   }
 
+  // MKL-DNN's LeakyRelu(feature) = feature          (if feature > 0), or
+  //                                feature * alpha  (otherwise),
+  // while TensorFlow's LeakyRelu(feature) = max(feature, feature * alpha).
+  // These two algorithms are not consistent when alpha > 1,
+  // so we only rewrite LeakyRelu to MKL OP when alpha <= 1.
+  static bool LeakyReluRewrite(const Node* n) {
+    DCHECK(n);
+
+    float alpha;
+    bool has_attr = GetNodeAttr(n->def(), "alpha", &alpha).ok();
+    DCHECK(has_attr);
+
+    // If the alpha of LeakyRelu is less than 1, rewrite the node.
+    // Otherwise eigen node is used instead.
+    if (alpha <= 1) {
+      return true;
+    }
+    VLOG(1) << "LeakyReluRewrite: The model sets alpha is greater than 1 "
+            << "which case is not optimized by Intel MKL, thus using Eigen op"
+            << "for LeakyRelu ";
+
+    return false;
+  }
+
   static bool MaxpoolGradRewrite(const Node* n) {
     CHECK_NOTNULL(n);
     bool do_rewrite = false;
@@ -1174,6 +1308,23 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     return false;
   }
 
+  static bool FusedConv2DRewrite(const Node* n) {
+    // MKL DNN currently doesn't support all fusions that grappler fuses
+    // together with Conv2D (ex. batchnorm). We rewrite _FusedConv2D only if
+    // it includes those we support.
+    DataType T;
+    if (!GetNodeAttr(n->def(), "T", &T).ok() ||
+        !mkl_op_registry::IsMklOp(csinfo_.mkl_fused_conv2d, T)) {
+      return false;
+    }
+
+    std::vector<string> fused_ops;
+    TF_CHECK_OK(GetNodeAttr(n->def(), "fused_ops", &fused_ops));
+    return (fused_ops == std::vector<string>{"BiasAdd"} ||
+            fused_ops == std::vector<string>{"Relu"} ||
+            fused_ops == std::vector<string>{"BiasAdd", "Relu"});
+  }
+
   // Rewrites input node to a new node specified by its matching rewrite info.
   //
   // Method first searches matching rewrite info for input node and then
@@ -1331,17 +1482,35 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
                                 bool change_format = false);
   static void CopyAttrsConv(const Node* orig_node, NodeBuilder* nb,
                             bool change_format = false);
+  static void CopyAttrsConv2DDepthwise(const Node* orig_node, NodeBuilder* nb,
+                                       bool change_format = false);
+  static void CopyAttrsConv2DDepthwiseCheckConstFilter(
+      const Node* orig_node, NodeBuilder* nb, bool change_format = false);
+  static void CopyAttrsConvCheckConstFilter(const Node* orig_node,
+                                            NodeBuilder* nb,
+                                            bool change_format = false);
   static void CopyAttrsDataType(const Node* orig_node, NodeBuilder* nb,
                                 bool change_format = false);
   static void CopyAttrsFusedBatchNorm(const Node* orig_node, NodeBuilder* nb,
                                       bool change_format = false);
+  static void CopyAttrsLeakyRelu(const Node* orig_node, NodeBuilder* nb,
+                                 bool change_format = false);
+  static void CopyAttrsFusedConv2D(const Node* orig_node, NodeBuilder* nb,
+                                   bool change_format = false);
   static void CopyAttrsLRN(const Node* orig_node, NodeBuilder* nb,
                            bool change_format = false);
   static void CopyAttrsPadWithConv2D(const Node* orig_node, NodeBuilder* nb,
                                      bool change_format = false);
+  static void CopyAttrsPadWithFusedConv2D(const Node* orig_node,
+                                          NodeBuilder* nb,
+                                          bool change_format = false);
   static void CopyAttrsFromPadAndConv2D(const Node* orig_node1,
                                         const Node* orig_node2, NodeBuilder* nb,
                                         bool change_format = false);
+  static void CopyAttrsFromPadAndFusedConv2D(const Node* orig_node1,
+                                             const Node* orig_node2,
+                                             NodeBuilder* nb,
+                                             bool change_format = false);
   static void CopyAttrsPooling(const Node* orig_node, NodeBuilder* nb,
                                bool change_format = false);
   static void CopyAttrsQuantizedPooling(const Node* orig_node, NodeBuilder* nb,
@@ -1358,6 +1527,10 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
                              bool change_format = false);
   static void CopyAttrsSplit(const Node* orig_node, NodeBuilder* nb,
                              bool change_format = false);
+  static void CopyFormatAttrsConv(const Node* orig_node, NodeBuilder* nb,
+                                  const std::vector<int32>& strides,
+                                  const std::vector<int32>& dilations,
+                                  bool change_format = false);
 
   // Generate a graph node in graph 'g' representing a dummy Mkl tensor node,
   // using node for original node 'orig_node' and return it in '*out'.
@@ -1554,12 +1727,14 @@ int MklLayoutRewritePass::SetUpContiguousInputs(
     CHECK_NOTNULL(filter_node);
 
     // Now check which nodes receive from filter_node. Filter feeds as
-    // 2nd input (slot 1) of _MklConv2D and _MklConv2DWithBias.
+    // 2nd input (slot 1) of _MklConv2D, _MklConv2DWithBias, and
+    // _MklFusedConv2D.
     for (const Edge* e : filter_node->out_edges()) {
       if ((e->dst()->type_string() == csinfo_.mkl_conv2d ||
-           // add check for mkl_pad_with_conv2d
            e->dst()->type_string() == csinfo_.mkl_pad_with_conv2d ||
-           e->dst()->type_string() == csinfo_.mkl_conv2d_with_bias) &&
+           e->dst()->type_string() == csinfo_.mkl_pad_with_fused_conv2d ||
+           e->dst()->type_string() == csinfo_.mkl_conv2d_with_bias ||
+           e->dst()->type_string() == csinfo_.mkl_fused_conv2d) &&
           e->dst_input() == kConv2DFilterInputSlotIdx
           /* filter is 2nd input of Conv2D and _MklConv2D. */) {
         if (conv2d_node != nullptr) {
@@ -1860,10 +2035,10 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded(
 // Op-specific functions to copy attributes from old node to new node
 //////////////////////////////////////////////////////////////////////////
 
-void MklLayoutRewritePass::CopyAttrsConv(const Node* orig_node, NodeBuilder* nb,
-                                         bool change_format) {
+void MklLayoutRewritePass::CopyAttrsConvCheckConstFilter(const Node* orig_node,
+                                                         NodeBuilder* nb,
+                                                         bool change_format) {
   DataType T;
-  string data_format;
   string padding;
   std::vector<int32> strides;
   std::vector<int32> dilations;
@@ -1874,44 +2049,37 @@ void MklLayoutRewritePass::CopyAttrsConv(const Node* orig_node, NodeBuilder* nb,
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "dilations", &dilations));
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding));
 
+  Node* filter_node = nullptr;
+  orig_node->input_node(1, &filter_node);
+
   // Add attributes to new node.
   nb->Attr("T", T);
   nb->Attr("padding", padding);
+  nb->Attr("is_filter_const", filter_node->IsConstant());
 
-  if (!change_format) {
-    nb->Attr("strides", strides);
-    nb->Attr("dilations", dilations);
+  // Add attributes related to `data_format`.
+  CopyFormatAttrsConv(orig_node, nb, strides, dilations, change_format);
+}
 
-    TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
-    nb->Attr("data_format", data_format);
-  } else {
-    std::vector<int32> new_strides;
-    std::vector<int32> new_dilations;
-    if (strides.size() == 5) {
-      // "strides" and "dilations" also need to be changed according to
-      // "data_format",
-      // in this case, is "NDHWC" to "NCDHW".
-      new_strides = {strides[NDHWC::dim::N], strides[NDHWC::dim::C],
-                     strides[NDHWC::dim::D], strides[NDHWC::dim::H],
-                     strides[NDHWC::dim::W]};
+void MklLayoutRewritePass::CopyAttrsConv(const Node* orig_node, NodeBuilder* nb,
+                                         bool change_format) {
+  DataType T;
+  string padding;
+  std::vector<int32> strides;
+  std::vector<int32> dilations;
 
-      new_dilations = {dilations[NDHWC::dim::N], dilations[NDHWC::dim::C],
-                       dilations[NDHWC::dim::D], dilations[NDHWC::dim::H],
-                       dilations[NDHWC::dim::W]};
-    } else {
-      // "strides" and "dilations" also need to be changed according to
-      // "data_format",
-      // in this case, is "NHWC" to "NCHW".
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "dilations", &dilations));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding));
 
-      new_strides = {strides[NHWC::dim::N], strides[NHWC::dim::C],
-                     strides[NHWC::dim::H], strides[NHWC::dim::W]};
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("padding", padding);
 
-      new_dilations = {dilations[NHWC::dim::N], dilations[NHWC::dim::C],
-                       dilations[NHWC::dim::H], dilations[NHWC::dim::W]};
-    }
-    nb->Attr("strides", new_strides);
-    nb->Attr("dilations", new_dilations);
-  }
+  // Add attributes related to `data_format`.
+  CopyFormatAttrsConv(orig_node, nb, strides, dilations, change_format);
 }
 
 // Used in rinfo when replacing __MklDummyPadWithConv2D by _MklPadWithConv2D
@@ -1936,16 +2104,38 @@ void MklLayoutRewritePass::CopyAttrsPadWithConv2D(const Node* orig_node,
       GetNodeAttr(orig_node->def(), "use_cudnn_on_gpu", &use_cudnn_on_gpu));
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "Tpaddings", &Tpaddings));
 
+  Node* filter_node = nullptr;
+  orig_node->input_node(1, &filter_node);
+
   // Add attributes to new node.
   nb->Attr("T", T);
   nb->Attr("strides", strides);
   nb->Attr("dilations", dilations);
   nb->Attr("padding", padding);
+  nb->Attr("is_filter_const", filter_node->IsConstant());
   nb->Attr("data_format", data_format);
   nb->Attr("use_cudnn_on_gpu", use_cudnn_on_gpu);
   nb->Attr("Tpaddings", Tpaddings);
 }
 
+void MklLayoutRewritePass::CopyAttrsPadWithFusedConv2D(const Node* orig_node,
+                                                       NodeBuilder* nb,
+                                                       bool change_format) {
+  DataType Tpaddings;
+
+  CopyAttrsFusedConv2D(orig_node, nb, change_format);
+
+  // Get attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "Tpaddings", &Tpaddings));
+  // Check if filter is a constant.
+  Node* filter_node = nullptr;
+  orig_node->input_node(1, &filter_node);
+
+  // Add attributes to new node.
+  nb->Attr("Tpaddings", Tpaddings);
+  nb->Attr("is_filter_const", filter_node->IsConstant());
+}
+
 // Used with MergePadWithConv2D
 void MklLayoutRewritePass::CopyAttrsFromPadAndConv2D(const Node* orig_node1,
                                                      const Node* orig_node2,
@@ -1980,6 +2170,93 @@ void MklLayoutRewritePass::CopyAttrsFromPadAndConv2D(const Node* orig_node1,
   nb->Attr("Tpaddings", Tpaddings);
 }
 
+void MklLayoutRewritePass::CopyAttrsFromPadAndFusedConv2D(
+    const Node* fused_conv2d, const Node* pad, NodeBuilder* nb,
+    bool change_format) {
+  DataType T;
+  int num_args;
+  string data_format;
+  string padding;
+  std::vector<int32> strides;
+  std::vector<int32> dilations;
+  float epsilon;
+  std::vector<string> fused_ops;
+  DataType Tpaddings;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(fused_conv2d->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(fused_conv2d->def(), "num_args", &num_args));
+  TF_CHECK_OK(GetNodeAttr(fused_conv2d->def(), "strides", &strides));
+  TF_CHECK_OK(GetNodeAttr(fused_conv2d->def(), "padding", &padding));
+  TF_CHECK_OK(GetNodeAttr(fused_conv2d->def(), "data_format", &data_format));
+  TF_CHECK_OK(GetNodeAttr(fused_conv2d->def(), "dilations", &dilations));
+  TF_CHECK_OK(GetNodeAttr(fused_conv2d->def(), "fused_ops", &fused_ops));
+  TF_CHECK_OK(GetNodeAttr(fused_conv2d->def(), "epsilon", &epsilon));
+  TF_CHECK_OK(GetNodeAttr(pad->def(), "Tpaddings", &Tpaddings));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("num_args", num_args);
+  nb->Attr("strides", strides);
+  nb->Attr("padding", padding);
+  nb->Attr("data_format", data_format);
+  nb->Attr("dilations", dilations);
+  nb->Attr("epsilon", epsilon);
+  nb->Attr("Tpaddings", Tpaddings);
+  nb->Attr("fused_ops", fused_ops);
+}
+
+void MklLayoutRewritePass::CopyAttrsConv2DDepthwise(const Node* orig_node,
+                                                    NodeBuilder* nb,
+                                                    bool change_format) {
+  DataType T;
+  string data_format;
+  string padding;
+  std::vector<int32> strides;
+  std::vector<int32> dilations;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "dilations", &dilations));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("strides", strides);
+  nb->Attr("dilations", dilations);
+  nb->Attr("padding", padding);
+  nb->Attr("data_format", data_format);
+}
+
+void MklLayoutRewritePass::CopyAttrsConv2DDepthwiseCheckConstFilter(
+    const Node* orig_node, NodeBuilder* nb, bool change_format) {
+  DataType T;
+  string data_format;
+  string padding;
+  std::vector<int32> strides;
+  std::vector<int32> dilations;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "dilations", &dilations));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
+
+  Node* filter_node = nullptr;
+  orig_node->input_node(1, &filter_node);
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("strides", strides);
+  nb->Attr("dilations", dilations);
+  nb->Attr("padding", padding);
+  nb->Attr("is_filter_const", filter_node->IsConstant());
+  nb->Attr("data_format", data_format);
+}
+
 void MklLayoutRewritePass::CopyAttrsAddN(const Node* orig_node, NodeBuilder* nb,
                                          bool change_format) {
   DataType T;
@@ -2035,6 +2312,21 @@ void MklLayoutRewritePass::CopyAttrsLRN(const Node* orig_node, NodeBuilder* nb,
   nb->Attr("beta", beta);
 }
 
+void MklLayoutRewritePass::CopyAttrsLeakyRelu(const Node* orig_node,
+                                              NodeBuilder* nb,
+                                              bool change_format) {
+  DataType T;
+  float alpha;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "alpha", &alpha));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("alpha", alpha);
+}
+
 void MklLayoutRewritePass::CopyAttrsPooling(const Node* orig_node,
                                             NodeBuilder* nb,
                                             bool change_format) {
@@ -2107,16 +2399,21 @@ void MklLayoutRewritePass::CopyAttrsQuantizedConv2D(const Node* orig_node,
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides));
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "dilations", &dilations));
 
+  Node* filter_node = nullptr;
+  orig_node->input_node(1, &filter_node);
+
   // Add attributes to new node.
   nb->Attr("Tinput", Tinput);
   nb->Attr("Tfilter", Tfilter);
   nb->Attr("out_type", out_type);
   nb->Attr("padding", padding);
+  nb->Attr("is_filter_const", filter_node->IsConstant());
   nb->Attr("strides", strides);
   nb->Attr("dilations", dilations);
   nb->Attr("T", out_type);  // added "T" for facilitating MklToTf conversion.
   nb->Attr("data_format", data_format);
-  // Requantization attr Tbias
+
+  // Requantization attr Tbias.
   DataType Tbias;
   Status bias_status = GetNodeAttr(orig_node->def(), "Tbias", &Tbias);
   if (bias_status.ToString() == "OK") nb->Attr("Tbias", Tbias);
@@ -2145,6 +2442,7 @@ void MklLayoutRewritePass::CopyAttrsReshape(const Node* orig_node,
   // Get all attributes from old node.
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "Tshape", &Tshape));
+
   // Add attributes to new node.
   nb->Attr("T", T);
   nb->Attr("Tshape", Tshape);
@@ -2158,6 +2456,7 @@ void MklLayoutRewritePass::CopyAttrsSlice(const Node* orig_node,
   // Get all attributes from old node.
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "Index", &Index));
+
   // Add attributes to new node.
   nb->Attr("T", T);
   nb->Attr("Index", Index);
@@ -2180,6 +2479,45 @@ void MklLayoutRewritePass::CopyAttrsSplit(const Node* orig_node,
   nb->Attr("data_format", data_format);
 }
 
+void MklLayoutRewritePass::CopyFormatAttrsConv(
+    const Node* orig_node, NodeBuilder* nb, const std::vector<int32>& strides,
+    const std::vector<int32>& dilations, bool change_format) {
+  string data_format;
+
+  if (!change_format) {
+    nb->Attr("strides", strides);
+    nb->Attr("dilations", dilations);
+
+    TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
+    nb->Attr("data_format", data_format);
+  } else {
+    std::vector<int32> new_strides;
+    std::vector<int32> new_dilations;
+    if (strides.size() == 5) {
+      // `strides` and `dilations` also need to be changed according to
+      // `data_format`. In this case, from `NDHWC` to `NCDHW`.
+      new_strides = {strides[NDHWC::dim::N], strides[NDHWC::dim::C],
+                     strides[NDHWC::dim::D], strides[NDHWC::dim::H],
+                     strides[NDHWC::dim::W]};
+
+      new_dilations = {dilations[NDHWC::dim::N], dilations[NDHWC::dim::C],
+                       dilations[NDHWC::dim::D], dilations[NDHWC::dim::H],
+                       dilations[NDHWC::dim::W]};
+    } else {
+      // `strides` and `dilations` also need to be changed according to
+      // `data_format`. In this case, from `NHWC` to `NCHW`.
+
+      new_strides = {strides[NHWC::dim::N], strides[NHWC::dim::C],
+                     strides[NHWC::dim::H], strides[NHWC::dim::W]};
+
+      new_dilations = {dilations[NHWC::dim::N], dilations[NHWC::dim::C],
+                       dilations[NHWC::dim::H], dilations[NHWC::dim::W]};
+    }
+    nb->Attr("strides", new_strides);
+    nb->Attr("dilations", new_dilations);
+  }
+}
+
 void MklLayoutRewritePass::CopyAttrsConcat(const Node* orig_node,
                                            NodeBuilder* nb,
                                            bool change_format) {
@@ -2234,6 +2572,43 @@ void MklLayoutRewritePass::CopyAttrsFusedBatchNorm(const Node* orig_node,
   nb->Attr("is_training", is_training);
 }
 
+void MklLayoutRewritePass::CopyAttrsFusedConv2D(const Node* orig_node,
+                                                NodeBuilder* nb,
+                                                bool change_format) {
+  DataType T;
+  int num_args;
+  float epsilon;
+  string data_format;
+  string padding;
+  std::vector<int32> strides;
+  std::vector<int32> dilations;
+  std::vector<string> fused_ops;
+
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "num_args", &num_args));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "dilations", &dilations));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "fused_ops", &fused_ops));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "epsilon", &epsilon));
+
+  Node* filter_node = nullptr;
+  orig_node->input_node(1, &filter_node);
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("num_args", num_args);
+  nb->Attr("strides", strides);
+  nb->Attr("padding", padding);
+  nb->Attr("is_filter_const", filter_node->IsConstant());
+  nb->Attr("data_format", data_format);
+  nb->Attr("dilations", dilations);
+  nb->Attr("fused_ops", fused_ops);
+  nb->Attr("epsilon", epsilon);
+}
+
 //////////////////////////////////////////////////////////////////////////
 //           Helper functions related to node merge pass
 //////////////////////////////////////////////////////////////////////////
@@ -2300,7 +2675,7 @@ Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr<Graph>* g,
   std::vector<int32> strides;
   std::vector<int32> dilations;
   string data_format_pred, data_format_succ;
-  bool use_cudnn_on_gnu;
+  bool use_cudnn_on_gpu;
   TF_CHECK_OK(GetNodeAttr(pred->def(), "T", &T_pred));
   TF_CHECK_OK(GetNodeAttr(succ->def(), "T", &T_succ));
   TF_CHECK_OK(GetNodeAttr(pred->def(), "padding", &padding));
@@ -2308,7 +2683,7 @@ Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr<Graph>* g,
   TF_CHECK_OK(GetNodeAttr(pred->def(), "dilations", &dilations));
   TF_CHECK_OK(GetNodeAttr(pred->def(), "data_format", &data_format_pred));
   TF_CHECK_OK(GetNodeAttr(succ->def(), "data_format", &data_format_succ));
-  TF_CHECK_OK(GetNodeAttr(pred->def(), "use_cudnn_on_gpu", &use_cudnn_on_gnu));
+  TF_CHECK_OK(GetNodeAttr(pred->def(), "use_cudnn_on_gpu", &use_cudnn_on_gpu));
   // We check to ensure that data formats of both succ and pred are same.
   // We expect them to be same, so we can enforce this as assert.
   // But assert can be too strict, so we enforce this as a check.
@@ -2364,7 +2739,7 @@ Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr<Graph>* g,
   nb.Input(succ_in[1].first, succ_in[1].second);  // In2 of BiasAdd
 
   // Copy attributes from Conv2D to Conv2DWithBias.
-  CopyAttrsConv(const_cast<const Node*>(pred), &nb);
+  CopyAttrsConvCheckConstFilter(const_cast<const Node*>(pred), &nb);
 
   // Copy the device assigned to old node to new node.
   nb.Device(succ->def().device());
@@ -2433,11 +2808,15 @@ Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr<Graph>* g,
 
 Status MklLayoutRewritePass::MergePadWithConv2D(std::unique_ptr<Graph>* g,
                                                 Node* m, Node* n) {
-  DCHECK(((m->type_string() == csinfo_.pad &&
-           n->type_string() == csinfo_.conv2d)) ||
-         ((n->type_string() == csinfo_.pad &&
-           m->type_string() == csinfo_.conv2d)));
-
+  DCHECK((m->type_string() == csinfo_.pad &&
+          (n->type_string() == csinfo_.conv2d ||
+           n->type_string() == csinfo_.fused_conv2d)) ||
+         (n->type_string() == csinfo_.pad &&
+          (m->type_string() == csinfo_.conv2d ||
+           m->type_string() == csinfo_.fused_conv2d)));
+
+  bool is_fused_conv2d = n->type_string() == csinfo_.fused_conv2d ||
+                         m->type_string() == csinfo_.fused_conv2d;
   // Conv2D is successor node, and Pad predecessor node.
   Node* pred = m->type_string() == csinfo_.pad ? m : n;
   Node* succ = m->type_string() == csinfo_.pad ? n : m;
@@ -2448,18 +2827,14 @@ Status MklLayoutRewritePass::MergePadWithConv2D(std::unique_ptr<Graph>* g,
   std::vector<int32> strides;
   std::vector<int32> dilations;
   string data_format_pred, data_format_succ;
-  bool use_cudnn_on_gnu;
+
   TF_CHECK_OK(GetNodeAttr(pred->def(), "T", &T_pred));
   TF_CHECK_OK(GetNodeAttr(succ->def(), "T", &T_succ));
   TF_CHECK_OK(GetNodeAttr(succ->def(), "padding", &padding));
   TF_CHECK_OK(GetNodeAttr(succ->def(), "strides", &strides));
   TF_CHECK_OK(GetNodeAttr(succ->def(), "dilations", &dilations));
-  // Data format for pad is not available and not necessary, thus
-  // dont need to match data format for Pad
-  TF_CHECK_OK(GetNodeAttr(succ->def(), "data_format", &data_format_succ));
-  TF_CHECK_OK(GetNodeAttr(succ->def(), "use_cudnn_on_gpu", &use_cudnn_on_gnu));
-  // Check if the data types and devices of both succ and pred are the same.
-  // Assert is not used,  because it can be too strict.
+  // Check if the devices of both succ and pred are the same.
+  // Assert is not used because it can be too strict.
   // Don't need to check for data formats because it is not available in Pad.
   if (T_pred != T_succ ||
       pred->assigned_device_name() != succ->assigned_device_name() ||
@@ -2503,29 +2878,45 @@ Status MklLayoutRewritePass::MergePadWithConv2D(std::unique_ptr<Graph>* g,
   }
   DCHECK_EQ(PadDataInputEdges, 2);
 
-  // Conv2D must have 2 data inputs: pad output and Filter
+  // Conv2D must have 2 data inputs: Pad output and Filter
+  // FusedConv2D have 3 data inputs: Pad output, Filter and Args;
   int ConvDataInputEdges = 0;
   for (const Edge* e : succ->in_edges()) {
     if (!e->IsControlEdge()) {
       ConvDataInputEdges++;
     }
   }
-  DCHECK_EQ(ConvDataInputEdges, 2);
+
+  DCHECK_EQ(ConvDataInputEdges, is_fused_conv2d ? 3 : 2);
 
   // We will use the node name of Conv2D as the name of new node
   // Build new node. We use same name as original node, but change the op
   // name.
-  NodeBuilder nb(succ->name(), csinfo_.pad_with_conv2d);
+
+  NodeBuilder nb(succ->name(), is_fused_conv2d ? csinfo_.pad_with_fused_conv2d
+                                               : csinfo_.pad_with_conv2d);
   nb.Input(pred_in[0].first, pred_in[0].second);  // In1 (input data)  of Pad
   // pred_in[1] will be 2nd Tensorflow tensor for Conv2D.
   nb.Input(succ_in[1].first, succ_in[1].second);  // In2 (filter) of conv2d
   // In1 of Conv2D is same as output of Pad.
   // Thus, only need to add In2 of Conv2D
-  nb.Input(pred_in[1].first, pred_in[1].second);  // In2 (paddings) of Pad
 
-  // Copy attributes from Pad and conv2D to PadWithConv2D.
-  CopyAttrsFromPadAndConv2D(const_cast<const Node*>(succ),
-                            const_cast<const Node*>(pred), &nb);
+  if (is_fused_conv2d) {
+    // FusedConv2D has one additional input, args
+    std::vector<NodeBuilder::NodeOut> args;
+    args.emplace_back(succ_in[2].first, succ_in[2].second);
+    nb.Input(gtl::ArraySlice<NodeBuilder::NodeOut>{
+        args});                                     // In3 (args) of FusedConv2D
+    nb.Input(pred_in[1].first, pred_in[1].second);  // In2 (paddings) of Pad
+    // Copy attributes from Pad and FusedConv2D to PadWithFusedConv2D.
+    CopyAttrsFromPadAndFusedConv2D(const_cast<const Node*>(succ),
+                                   const_cast<const Node*>(pred), &nb);
+  } else {
+    nb.Input(pred_in[1].first, pred_in[1].second);  // In2 (paddings) of Pad
+    // Copy attributes from Pad and conv2D to PadWithConv2D.
+    CopyAttrsFromPadAndConv2D(const_cast<const Node*>(succ),
+                              const_cast<const Node*>(pred), &nb);
+  }
 
   // Copy the device assigned to old node to new node.
   nb.Device(succ->def().device());
@@ -2723,10 +3114,12 @@ Status MklLayoutRewritePass::MergeNode(std::unique_ptr<Graph>* g, Node* m,
         m->type_string() == csinfo_.conv2d))) {
     return this->MergeConv2DWithBiasAdd(g, m, n);
   }
-  if (((m->type_string() == csinfo_.pad &&
-        n->type_string() == csinfo_.conv2d)) ||
-      ((n->type_string() == csinfo_.pad &&
-        m->type_string() == csinfo_.conv2d))) {
+  if ((m->type_string() == csinfo_.pad &&
+       (n->type_string() == csinfo_.conv2d ||
+        (n->type_string() == csinfo_.fused_conv2d && FusedConv2DRewrite(n)))) ||
+      (n->type_string() == csinfo_.pad &&
+       (m->type_string() == csinfo_.conv2d ||
+        (m->type_string() == csinfo_.fused_conv2d && FusedConv2DRewrite(m))))) {
     return this->MergePadWithConv2D(g, m, n);
   }
 
@@ -2783,9 +3176,7 @@ Status MklLayoutRewritePass::RewriteNode(std::unique_ptr<Graph>* g,
   // Set the Mkl layer label for this op.
   if (DataTypeIsQuantized(orig_node->input_type(0)) ||
       DataTypeIsQuantized(orig_node->output_type(0))) {
-#ifdef INTEL_MKL_QUANTIZED
     nb.Attr("_kernel", mkl_op_registry::kMklQuantizedOpLabel);
-#endif
   } else {
     nb.Attr("_kernel", mkl_op_registry::kMklOpLabel);
   }
@@ -2839,7 +3230,6 @@ Status MklLayoutRewritePass::RewriteNode(std::unique_ptr<Graph>* g,
 // Current implementation reflects only QuantizedConv2D and its fused Ops.
 const MklLayoutRewritePass::RewriteInfo*
 MklLayoutRewritePass::CheckForQuantizedNodeRewrite(const Node* n) const {
-#ifdef INTEL_MKL_QUANTIZED
   DataType Tinput, Tfilter;
   if (!(GetNodeAttr(n->def(), "Tinput", &Tinput).ok() &&
         GetNodeAttr(n->def(), "Tfilter", &Tfilter).ok())) {
@@ -2853,7 +3243,6 @@ MklLayoutRewritePass::CheckForQuantizedNodeRewrite(const Node* n) const {
       }
     }
   }
-#endif
   return nullptr;
 }
 
@@ -2880,7 +3269,9 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const {
   // names do not match Mkl node names.
   if (n->type_string() != csinfo_.conv2d_with_bias &&
       n->type_string() != csinfo_.pad_with_conv2d &&
+      n->type_string() != csinfo_.pad_with_fused_conv2d &&
       n->type_string() != csinfo_.conv2d_grad_filter_with_bias &&
+      n->type_string() != csinfo_.fused_conv2d &&
       !mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(n->type_string()),
                                 T)) {
     return nullptr;
@@ -2999,8 +3390,9 @@ Status MklLayoutRewritePass::FuseTransposeMklOpTranspose(
   for (const Edge* e : transpose_to_nchw->out_edges()) {
     if (!e->IsControlEdge()) {
       const int kTransposeWithMklOpOutputSlot = 0;
-      DCHECK((*g)->AddEdge(new_node, kTransposeWithMklOpOutputSlot, e->dst(),
-                           e->dst_input()));
+      auto new_edge = (*g)->AddEdge(new_node, kTransposeWithMklOpOutputSlot,
+                                    e->dst(), e->dst_input());
+      DCHECK(new_edge);
     }
   }
 
@@ -3201,7 +3593,6 @@ bool MklLayoutRewritePass::RunPass(std::unique_ptr<Graph>* g) {
 
   DumpGraph("After running MklLayoutRewritePass(NodeMerge)", &**g);
 
-#ifdef ENABLE_TRANSPOSE_OPTIMIZATION
   order.clear();
   GetReversePostOrder(**g, &order);  // This will give us topological sort.
   for (Node* n : order) {
@@ -3223,7 +3614,6 @@ bool MklLayoutRewritePass::RunPass(std::unique_ptr<Graph>* g) {
     }
   }
   DumpGraph("After running MklLayoutRewritePass(NodeFusion)", &**g);
-#endif  // ENABLE_TRANSPOSE_OPTIMIZATION
 
   order.clear();
   GetReversePostOrder(**g, &order);  // This will give us topological sort.
diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc
index 04c4b85d64d63f275a08abb86d7bf3393398dc67..cc4e9c7ca0fbf20652586f8bebb0a045d6adb12c 100644
--- a/tensorflow/core/graph/mkl_layout_pass_test.cc
+++ b/tensorflow/core/graph/mkl_layout_pass_test.cc
@@ -123,6 +123,21 @@ class MklLayoutPassTest : public ::testing::Test {
     return result;
   }
 
+  // Returns the attribute value only from the first node
+  template <typename T>
+  T DoMklLayoutOptimizationPassGetAttrVal(const string& attr,
+                                          const string& node_name) {
+    DoMklLayoutOptimizationPass();
+    T attr_val;
+    for (const Node* n : graph_.nodes()) {
+      if (IncludeNode(n) && n->type_string() == node_name) {
+        TF_CHECK_OK(GetNodeAttr(n->def(), attr, &attr_val));
+        return attr_val;
+      }
+    }
+    return attr_val;
+  }
+
   const string& OriginalGraph() const { return original_; }
 
   Graph graph_;
@@ -133,6 +148,7 @@ REGISTER_OP("Input").Output("o: float").SetIsStateful();
 REGISTER_OP("InputList").Output("o: N * float").Attr("N: int").SetIsStateful();
 REGISTER_OP("HalfInput").Output("o: half").SetIsStateful();
 REGISTER_OP("Int32Input").Output("o: int32").SetIsStateful();
+REGISTER_OP("DoubleInput").Output("o: double").SetIsStateful();
 REGISTER_OP("_MklInput").Output("o: uint8").SetIsStateful();
 REGISTER_OP("_MklInput2")
     .Output("o: uint8")
@@ -142,7 +158,7 @@ REGISTER_OP("Output2").Input("i: float").Input("i1: float").SetIsStateful();
 REGISTER_OP("Output").Input("i: float").SetIsStateful();
 
 /////////////////////////////////////////////////////////////////////
-//  Unit tests related to node merge optiimization
+//  Unit tests related to node merge optimization
 /////////////////////////////////////////////////////////////////////
 
 TEST_F(MklLayoutPassTest, Basic) {
@@ -706,7 +722,7 @@ TEST_F(MklLayoutPassTest, NodeMerge_PadWithConv2D_Negative) {
       "C:control->DMT/_0:control;C:control->DMT/_1:control;"
       "D->E:1;DMT/_0->E:2;DMT/_1->E:3;E->Z;Y->Z:1");
 }
-#ifdef ENABLE_TRANSPOSE_OPTIMIZATION
+
 TEST_F(MklLayoutPassTest, NodeMerge_TransposeConv2DTranspose_Positive) {
   InitGraph(
       "node { name: 'Input0' op: 'Input'}"
@@ -1015,7 +1031,6 @@ TEST_F(MklLayoutPassTest, NodeMerge_TransposeConv2DTranspose_Negative) {
       "Transpose0:control->DMT/"
       "_1:control;Transpose1->Relu;Transpose1:control->DMT/_2:control");
 }
-#endif  // ENABLE_TRANSPOSE_OPTIMIZATION
 
 /////////////////////////////////////////////////////////////////////
 //  Unit tests related to rewriting node to Mkl node
@@ -1044,6 +1059,28 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Basic) {
             "DMT/_1->C:3");
 }
 
+// Test case for the Depthwise FWD pass
+TEST_F(MklLayoutPassTest, NodeRewrite_DepthwiseConv2dNative_Basic) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'DepthwiseConv2dNative'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['B', 'C'] }");
+  EXPECT_EQ(
+      DoMklLayoutOptimizationPass(),
+      "A(Input);B(Input);C(_MklDepthwiseConv2dNative);D(Zeta);DMT/_0(Const);"
+      "DMT/_1(Const)|A->C;A:control->DMT/_0:control;"
+      "A:control->DMT/_1:control;B->C:1;B->D;C->D:1;DMT/_0->C:2;"
+      "DMT/_1->C:3");
+}
+
 // 2 Conv2D Ops in sequence. Both should get transformed and 1st Conv2D will
 // have 2 outputs, both of which will be inputs to next Conv2D.
 TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Positive1) {
@@ -1096,235 +1133,741 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2D_Negative_UnsupportedType) {
             "A->C;B->C:1;B->D;C->D:1");
 }
 
-TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_Positive) {
+// Rewrite test for _FusedConv2D Op with BiasAdd fusion
+TEST_F(MklLayoutPassTest, NodeRewrite_FusedConv2D_Positive1) {
   InitGraph(
       "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'B' op: 'Input'}"
       "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'Conv2DBackpropFilter'"
+      "node { name: 'D' op: '_FusedConv2D'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'num_args'         value { i: 1 } }"
       " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
       " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'fused_ops'        value { list: {s: 'BiasAdd'} } }"
+      " attr { key: 'epsilon'          value { f: 0.001 }}"
       " input: ['A', 'B', 'C']}"
       "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'D'] }");
+      " input: ['D', 'C'] }");
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Int32Input);C(Input);D(_MklConv2DBackpropFilter);"
-            "DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);E(Zeta)|"
-            "A->D;A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;"
-            "A:control->DMT/_2:control;B->D:1;C->D:2;D->E:1;DMT/_0->D:3;"
-            "DMT/_1->D:4;DMT/_2->D:5");
+            "A(Input);B(Input);C(Input);D(_MklFusedConv2D);DMT/_0(Const);"
+            "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D;"
+            "A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;B->D:1;C->D:2;C->E:1;D->E;"
+            "DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
 }
 
-TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradInput_Positive) {
+// Rewrite test for _FusedConv2D Op with Relu fusion
+TEST_F(MklLayoutPassTest, NodeRewrite_FusedConv2D_Positive2) {
   InitGraph(
       "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'B' op: 'Input'}"
       "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'Conv2DBackpropInput'"
+      "node { name: 'D' op: '_FusedConv2D'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'num_args'         value { i: 1 } }"
       " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
       " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
-      " input: ['B', 'A', 'C']}"
+      " attr { key: 'fused_ops'        value { list: {s: 'Relu'} } }"
+      " attr { key: 'epsilon'          value { f: 0.001 }}"
+      " input: ['A', 'B', 'C']}"
       "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'D'] }");
+      " input: ['D', 'C'] }");
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Int32Input);C(Input);D(_MklConv2DBackpropInput);"
-            "DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);E(Zeta)|"
-            "A->D:1;A->E;B->D;B:control->DMT/_0:control;"
-            "B:control->DMT/_1:control;B:control->DMT/_2:control;C->D:2;"
-            "D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
+            "A(Input);B(Input);C(Input);D(_MklFusedConv2D);DMT/_0(Const);"
+            "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D;"
+            "A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;B->D:1;C->D:2;C->E:1;D->E;"
+            "DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
 }
 
-// Check that we never rewrite BiasAddGrad.
-TEST_F(MklLayoutPassTest, NodeRewrite_BiasAddGrad_Positive) {
+// Rewrite test for _FusedConv2D Op with BiasAdd+Relu fusion
+TEST_F(MklLayoutPassTest, NodeRewrite_FusedConv2D_Positive3) {
   InitGraph(
       "node { name: 'A' op: 'Input'}"
       "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Polygamma'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " input: ['A', 'B']}"
-      "node { name: 'D' op: 'Zeta'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['C', 'A']}"
-      "node { name: 'E' op: 'BiasAddGrad'"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: '_FusedConv2D'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'num_args'         value { i: 1 } }"
       " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['D'] }");
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'fused_ops'"
+      "             value { list: {s: 'BiasAdd', s: 'Relu'} } }"
+      " attr { key: 'epsilon'          value { f: 0.001 }}"
+      " input: ['A', 'B', 'C']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['D', 'C'] }");
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Polygamma);D(Zeta);E(BiasAddGrad)|"
-            "A->C;A->D:1;B->C:1;C->D;D->E");
+            "A(Input);B(Input);C(Input);D(_MklFusedConv2D);DMT/_0(Const);"
+            "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D;"
+            "A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;B->D:1;C->D:2;C->E:1;D->E;"
+            "DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
 }
 
-// Check that we never rewrite BiasAddGrad.
-TEST_F(MklLayoutPassTest, NodeRewrite_BiasAddGrad_Positive1) {
+// Rewrite test for _FusedConv2D Op with unsupported fusion
+TEST_F(MklLayoutPassTest, NodeRewrite_FusedConv2D_Negative1) {
   InitGraph(
       "node { name: 'A' op: 'Input'}"
       "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'MatMul'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'transpose_a'      value { b: false } }"
-      " attr { key: 'transpose_b'      value { b: false } }"
-      " input: ['A', 'B']}"
-      "node { name: 'D' op: 'Zeta'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['C', 'A']}"
-      "node { name: 'E' op: 'BiasAddGrad'"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: '_FusedConv2D'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'num_args'         value { i: 1 } }"
       " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['D'] }");
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'fused_ops'        value { list: {s: 'Unsupported'} } }"
+      " attr { key: 'epsilon'          value { f: 0.001 }}"
+      " input: ['A', 'B', 'C']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['D', 'C'] }");
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(MatMul);D(Zeta);E(BiasAddGrad)|"
-            "A->C;A->D:1;B->C:1;C->D;D->E");
+            "A(Input);B(Input);C(Input);D(_FusedConv2D);E(Zeta)|A->D;"
+            "B->D:1;C->D:2;C->E:1;D->E");
 }
 
-// Check that we never rewrite BiasAddGrad.
-TEST_F(MklLayoutPassTest, NodeRewrite_BiasAddGrad_Positive2) {
+// Rewrite test for _FusedConv2D Op with unsupported type
+TEST_F(MklLayoutPassTest, NodeRewrite_FusedConv2D_Negative2) {
   InitGraph(
-      "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'M' op: '_MklInput'}"
-      "node { name: 'N' op: '_MklInput'}"
-      "node { name: 'C' op: '_MklConv2D'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      "node { name: 'A' op: 'DoubleInput'}"
+      "node { name: 'B' op: 'DoubleInput'}"
+      "node { name: 'C' op: 'DoubleInput'}"
+      "node { name: 'D' op: '_FusedConv2D'"
+      " attr { key: 'T'                value { type: DT_DOUBLE } }"
+      " attr { key: 'num_args'         value { i: 1 } }"
       " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
       " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
-      " input: ['A', 'B', 'M', 'N']}"
-      "node { name: 'D' op: 'Zeta'"
-      " attr {key: 'T'                 value { type: DT_FLOAT } }"
-      " input: ['C', 'A']}"
-      "node { name: 'E' op: 'BiasAddGrad'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " input: ['D'] }");
+      " attr { key: 'fused_ops'        value { list: {s: 'BiasAdd'} } }"
+      " attr { key: 'epsilon'          value { f: 0.001 }}"
+      " input: ['A', 'B', 'C']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_DOUBLE } }"
+      " input: ['D', 'C'] }");
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(_MklConv2D);D(Zeta);E(BiasAddGrad);"
-            "M(_MklInput);N(_MklInput)|A->C;A->D:1;B->C:1;C->D;D->E;"
-            "M->C:2;N->C:3");
+            "A(DoubleInput);B(DoubleInput);C(DoubleInput);"
+            "D(_FusedConv2D);E(Zeta)|A->D;B->D:1;C->D:2;C->E:1;D->E");
 }
 
-// Concat Op test: Concat with no Mkl layer feeding it
-TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Basic) {
+// Merge test for PadWithFusedConv2D Op with BiasAdd fusion
+// padding is VALID type
+// A = input(image), B = input(paddings), C = Pad(A, B) = input of conv2D,
+// D = input(filter), E = input(bias), F = _FusedConv2D(C, D, E)
+// G = Zeta(F, E)
+// After layout pass
+// _MklPadWithFusedConv2D(A, D, E, B, DMT/_0, DMT/_1, DMT/_2, DMT/_3)
+TEST_F(MklLayoutPassTest, NodeMerge_PadWithFusedConv2D_Positive1) {
   InitGraph(
-      "node { name: 'A' op: 'Const' "
-      " attr { key: 'dtype' value { type: DT_INT32 } }"
-      " attr { key: 'value' value { "
-      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
-      "    int_val: 0 } } } }"
-      "node { name: 'B' op: 'InputList'"
-      " attr { key: 'N'                value { i: 2 } }}"
-      "node { name: 'C' op: 'Input'}"
-      "node { name: 'D' op: 'Concat'"
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'N'                value { i: 2 } }"
-      " input: ['A', 'B:0', 'B:1']}"
-      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['C', 'D'] }");
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: '_FusedConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'num_args'         value { i: 1 } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'VALID' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'fused_ops'        value { list: {s: 'BiasAdd'} } }"
+      " attr { key: 'epsilon'          value { f: 0.001 }}"
+      " input: ['C', 'D', 'E']}"
+      "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['F', 'E'] }");
   EXPECT_EQ(
       DoMklLayoutOptimizationPass(),
-      "A(Const);B(InputList);C(Input);D(_MklConcat);DMT/_0(Const);"
-      "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D;A:control->DMT/_0:control;"
-      "A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;"
-      "B:1->D:2;C->E;D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
+      "A(Input);B(Int32Input);D(Input);DMT/_0(Const);DMT/_1(Const);DMT/"
+      "_2(Const);DMT/_3(Const);E(Input);F(_MklPadWithFusedConv2D);"
+      "G(Zeta)|A->F;A:control->DMT/_0:control;A:control->DMT/_1:control;"
+      "A:control->DMT/_2:control;A:control->DMT/_3:control;B->F:3;D->F:1;DMT/"
+      "_0->F:4;DMT/_1->F:5;DMT/_2->F:6;DMT/_3->F:7;E->F:2;E->G:1;F->G");
 }
 
-// Concat with 2 Mkl layers feeding it
-TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Input_Mkl) {
+// Merge test for PadWithFusedConv2D Op with BiasAdd+Relu fusion
+// padding is VALID type
+// A = input(image), B = input(paddings), C = Pad(A, B) = input of conv2D,
+// D = input(filter), E = input(bias), F = _FusedConv2D(C, D, E) (With relu)
+// G = Zeta(F, E)
+// After layout pass
+// _MklPadWithFusedConv2D(A, D, E, B, DMT/_0, DMT/_1, DMT/_2, DMT/_3)
+TEST_F(MklLayoutPassTest, NodeMerge_PadWithFusedConv2D_Positive2) {
   InitGraph(
       "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
       "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'Conv2D'"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: '_FusedConv2D'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'num_args'         value { i: 1 } }"
       " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'padding'          value { s: 'VALID' } }"
       " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'fused_ops'"
+      "             value { list: {s: 'BiasAdd', s: 'Relu'} } }"
+      " attr { key: 'epsilon'          value { f: 0.001 }}"
+      " input: ['C', 'D', 'E']}"
+      "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['F', 'E'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);D(Input);DMT/_0(Const);DMT/_1(Const);DMT/"
+            "_2(Const);DMT/_3(Const);E(Input);F(_MklPadWithFusedConv2D);"
+            "G(Zeta)|A->F;A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;A:control->DMT/_3:control;B->F:3;"
+            "D->F:1;DMT/_0->F:4;DMT/_1->F:5;DMT/_2->F:6;DMT/"
+            "_3->F:7;E->F:2;E->G:1;F->G");
+}
+
+// Merge test for PadWithFusedConv2D Op with unsupported fusion
+// padding is VALID type
+// A = input(image), B = input(paddings), C = Pad(A, B) = input of conv2D,
+// D = input(filter), E = input(bias),
+// F = _FusedConv2D(C, D, E) (With Unsupported), G = Zeta(F, E)
+// After layout pass - No merging
+TEST_F(MklLayoutPassTest, NodeMerge_PadWithFusedConv2D_Negative1) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
       " input: ['A', 'B']}"
-      "node { name: 'F' op: 'Conv2D'"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: '_FusedConv2D'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'num_args'         value { i: 1 } }"
       " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
-      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'padding'          value { s: 'VALID' } }"
       " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
-      " input: ['C', 'D']}"
-      "node { name: 'G' op: 'Const' "
-      " attr { key: 'dtype' value { type: DT_INT32 } }"
-      " attr { key: 'value' value { "
-      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
-      "    int_val: 0 } } } }"
-      "node { name: 'H' op: 'Concat'"
-      " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'N'                value { i: 2 } }"
-      " input: ['G', 'E', 'F']}"
-      "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'H'] }");
+      " attr { key: 'fused_ops'        value { list: {s: 'Unsupported'} } }"
+      " attr { key: 'epsilon'          value { f: 0.001 }}"
+      " input: ['C', 'D', 'E']}"
+      "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['F', 'E'] }");
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
-            "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(_MklConv2D);"
-            "F(_MklConv2D);G(Const);H(_MklConcat);I(Zeta)|A->E;A->I;"
-            "A:control->DMT/_0:control;A:control->DMT/_1:control;"
-            "B->E:1;C->F;C:control->DMT/_2:control;C:control->DMT/_3:control;"
-            "D->F:1;DMT/_0->E:2;DMT/_1->E:3;DMT/_2->F:2;DMT/_3->F:3;"
-            "DMT/_4->H:3;E->H:1;E:2->H:4;F->H:2;F:2->H:5;G->H;"
-            "G:control->DMT/_4:control;H->I:1");
+            "A(Input);B(Int32Input);C(Pad);D(Input);E(Input);F(_FusedConv2D);G("
+            "Zeta)|A->C;B->C:1;C->F;D->F:1;E->F:2;E->G:1;F->G");
 }
 
-// Concat with 1 Mkl and 1 non-Mkl layer feeding it
-TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Input_MixedMkl) {
+// Merge test for PadWithFusedConv2D Op with BiasAdd fusion
+// padding is SAME type
+// A = input(image), B = input(paddings), C = Pad(A, B) = input of conv2D,
+// D = input(filter), E = input(bias), F = _FusedConv2D(C,D,E)
+// G = Zeta(F,E)
+// After layout pass - No merging
+TEST_F(MklLayoutPassTest, NodeMerge_PadWithFusedConv2D_Negative2) {
   InitGraph(
       "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
       "node { name: 'D' op: 'Input'}"
-      "node { name: 'E' op: 'Conv2D'"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: '_FusedConv2D'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'num_args'         value { i: 1 } }"
       " attr { key: 'data_format'      value { s: 'NCHW' } }"
-      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
       " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
       " attr { key: 'padding'          value { s: 'SAME' } }"
       " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'fused_ops'        value { list: {s: 'BiasAdd'} } }"
+      " attr { key: 'epsilon'          value { f: 0.001 }}"
+      " input: ['C', 'D', 'E']}"
+      "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['F', 'E'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Pad);D(Input);DMT/_0(Const);DMT/"
+            "_1(Const);DMT/_2(Const);E(Input);F(_MklFusedConv2D);G(Zeta)|A->C;"
+            "B->C:1;C->F;C:control->DMT/_0:control;C:control->DMT/_1:control;"
+            "C:control->DMT/_2:control;D->F:1;DMT/_0->F:3;DMT/_1->F:4;DMT/"
+            "_2->F:5;E->F:2;E->G:1;F->G");
+}
+
+// Merge test for PadWithFusedConv2D Op with BiasAdd+Relu fusion
+// padding is SAME type
+// A = input(image), B = input(paddings), C = Pad(A, B) = input of conv2D,
+// D = input(filter), E = input(bias), F = _FusedConv2D(C,D,E)(With relu)
+// G = Zeta(F,E)
+// After layout pass - No merging
+TEST_F(MklLayoutPassTest, NodeMerge_PadWithFusedConv2D_Negative3) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
       " input: ['A', 'B']}"
-      "node { name: 'F' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['C', 'D']}"
-      "node { name: 'G' op: 'Const' "
-      " attr { key: 'dtype' value { type: DT_INT32 } }"
-      " attr { key: 'value' value { "
-      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
-      "    int_val: 0 } } } }"
-      "node { name: 'H' op: 'Concat'"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: '_FusedConv2D'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
-      " attr { key: 'N'                value { i: 2 } }"
-      " input: ['G', 'E', 'F']}"
-      "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
-      " input: ['A', 'H'] }");
+      " attr { key: 'num_args'         value { i: 1 } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'fused_ops'"
+      "             value { list: {s: 'BiasAdd', s: 'Relu'} } }"
+      " attr { key: 'epsilon'          value { f: 0.001 }}"
+      " input: ['C', 'D', 'E']}"
+      "node { name: 'G' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['F', 'E'] }");
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
-            "DMT/_2(Const);DMT/_3(Const);E(_MklConv2D);F(Zeta);G(Const);"
-            "H(_MklConcat);I(Zeta)|A->E;A->I;A:control->DMT/_0:control;"
-            "A:control->DMT/_1:control;B->E:1;C->F;D->F:1;DMT/_0->E:2;"
-            "DMT/_1->E:3;DMT/_2->H:3;DMT/_3->H:5;E->H:1;E:2->H:4;F->H:2;"
-            "G->H;G:control->DMT/_2:control;G:control->DMT/_3:control;H->I:1");
+            "A(Input);B(Int32Input);C(Pad);D(Input);DMT/_0(Const);DMT/"
+            "_1(Const);DMT/_2(Const);E(Input);F(_MklFusedConv2D);G(Zeta)|A->C;"
+            "B->C:1;C->F;C:control->DMT/_0:control;C:control->DMT/_1:control;"
+            "C:control->DMT/_2:control;D->F:1;DMT/_0->F:3;DMT/_1->F:4;DMT/"
+            "_2->F:5;E->F:2;E->G:1;F->G");
 }
 
-// ConcatV2 Op test: ConcatV2 with no Mkl layer feeding it
-TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Basic) {
+// Tests that there are no duplicate input control edges after merge.
+// If both the merging ops have input control edges from a common op
+// then, the merged op will have only one control edge from that
+// common op. This test only add additional input control edge check
+// based on the previous test NodeMerge_PadWithFusedConv2D_Positive1
+// padding is VALID type
+// A = input(image), X = input, B = input(paddings),
+// C = Pad(A, B) = input of conv2D,
+// D = input(filter), E = input(bias), F = _FusedConv2D(C, D, E)
+// G = Zeta(F, E)
+// X:control->C:control
+// X:control->F:control
+// After layout pass:
+// _MklPadWithFusedConv2D(A, D, B, F, DMT/_0, DMT/_1, DMT/_2, DMT/_3)
+// X:control->E:control (only one control edge)
+TEST_F(MklLayoutPassTest, Input_ControlEdge_PadWithFusedConv2D_Positive) {
+  DCHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
   InitGraph(
-      "node { name: 'A' op: 'Const' "
-      " attr { key: 'dtype' value { type: DT_INT32 } }"
-      " attr { key: 'value' value { "
+      "node { name: 'X' op: 'Input'}"
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: '_FusedConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'num_args'         value { i: 1 } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'VALID' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'fused_ops'        value { list: {s: 'BiasAdd'} } }"
+      " attr { key: 'epsilon'          value { f: 0.001 }}"
+      " input: ['C', 'D', 'E']}"
+      "node { name: 'G' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['F', 'E']}");
+  Node* x = FindNode("X");
+  Node* c = FindNode("C");
+  Node* f = FindNode("F");
+  const Edge* edge = graph_.AddControlEdge(x, c);
+  const Edge* edge_1 = graph_.AddControlEdge(x, f);
+  ASSERT_NE(edge, nullptr);
+  ASSERT_NE(edge_1, nullptr);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);DMT/_3(Const);E(Input);F(_MklPadWithFusedConv2D);"
+            "G(Zeta);X(Input)|A->F;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;A:control->DMT/_2:control;"
+            "A:control->DMT/_3:control;B->F:3;D->F:1;DMT/_0->F:4;"
+            "DMT/_1->F:5;DMT/_2->F:6;DMT/_3->F:7;E->F:2;E->G:1;F->G;"
+            "X:control->F:control");
+}
+
+// ts that there are no duplicate output control edges after merge.
+// If both the merging ops have output control edge to a common op,
+// then after merge, the merged op will have only one control edge
+// to that commom op. This test only add additional output control edge check
+// based on the previous test NodeMerge_PadWithFusedConv2D_Positive1
+// padding is VALID type
+// A = input(image), B = input(paddings), C = Pad(A, B) = input of conv2D,
+// D = input(filter), E = input(bias), F = _FusedConv2D(C, D, E)
+// G = Zeta(F, E), X = input
+// C:control->X:control
+// F:control->X:control
+// After layout pass:
+// _MklPadWithFusedConv2D(A, D, B, F, DMT/_0, DMT/_1, DMT/_2, DMT/_2)
+// F:control->X:control (only one control edge)
+TEST_F(MklLayoutPassTest, Output_ControlEdge_PadWithFusedConv2D_Positive) {
+  DCHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+  InitGraph(
+      "node { name: 'X' op: 'Input'}"
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: '_FusedConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'num_args'         value { i: 1 } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'VALID' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'fused_ops'        value { list: {s: 'BiasAdd'} } }"
+      " attr { key: 'epsilon'          value { f: 0.001 }}"
+      " input: ['C', 'D', 'E']}"
+      "node { name: 'G' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['F', 'E']}");
+  Node* x = FindNode("X");
+  Node* c = FindNode("C");
+  Node* f = FindNode("F");
+  const Edge* edge = graph_.AddControlEdge(c, x);
+  const Edge* edge_1 = graph_.AddControlEdge(f, x);
+  ASSERT_NE(edge, nullptr);
+  ASSERT_NE(edge_1, nullptr);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);D(Input);DMT/_0(Const);DMT/_1(Const);DMT/"
+            "_2(Const);DMT/_3(Const);E(Input);F(_MklPadWithFusedConv2D);"
+            "G(Zeta);X(Input)|A->F;A:control->DMT/_0:control;A:control->DMT/"
+            "_1:control;A:control->DMT/_2:control;A:control->DMT/"
+            "_3:control;B->F:3;D->F:1;DMT/_0->F:4;DMT/_1->F:5;DMT/_2->F:6;DMT/"
+            "_3->F:7;E->F:2;E->G:1;F->G;F:control->X:control");
+}
+
+// Pad + _FusedConv2D with padding is VALID,
+// Input node pointing to both Pad and _FusedConv2D
+// Output of both Pad and _FusedConv2D feeds one node (G as Output2)
+// A = input(as image), B = input(as paddings), C = Pad(A, B)
+// E = input(as bias), F = _FusedConv2D(C, A, E), G = Output(C, F)
+// After layout pass - No merging, since Pad and _FusedConv2D both
+// feed to the same node (Z)
+TEST_F(MklLayoutPassTest, NodeMerge_PadWithFusedConv2D_Common_InOutput) {
+  DCHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'E' op: 'Input'}"
+      "node { name: 'F' op: '_FusedConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'num_args'         value { i: 1 } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'VALID' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'fused_ops'        value { list: {s: 'BiasAdd'} } }"
+      " attr { key: 'epsilon'          value { f: 0.001 }}"
+      " input: ['C', 'A', 'E']}"
+      "node { name: 'G' op: 'Output2'"
+      " input: ['C', 'F']}");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Pad);DMT/_0(Const);DMT/_1(Const);DMT/"
+            "_2(Const);E(Input);F(_MklFusedConv2D);G(Output2)|A->C;A->F:1;B->C:"
+            "1;C->F;C->G;C:control->DMT/_0:control;C:control->DMT/"
+            "_1:control;C:control->DMT/_2:control;DMT/_0->F:3;DMT/_1->F:4;DMT/"
+            "_2->F:5;E->F:2;F->G:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Conv2DBackpropFilter'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A', 'B', 'C']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Input);D(_MklConv2DBackpropFilter);"
+            "DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);E(Zeta)|"
+            "A->D;A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;B->D:1;C->D:2;D->E:1;DMT/_0->D:3;"
+            "DMT/_1->D:4;DMT/_2->D:5");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradInput_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Conv2DBackpropInput'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['B', 'A', 'C']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Input);D(_MklConv2DBackpropInput);"
+            "DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);E(Zeta)|"
+            "A->D:1;A->E;B->D;B:control->DMT/_0:control;"
+            "B:control->DMT/_1:control;B:control->DMT/_2:control;C->D:2;"
+            "D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
+}
+
+TEST_F(MklLayoutPassTest,
+       NodeRewrite_DepthwiseConv2dNativeGradFilter_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'DepthwiseConv2dNativeBackpropFilter'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A', 'B', 'C']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Input);D(_"
+            "MklDepthwiseConv2dNativeBackpropFilter);"
+            "DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);E(Zeta)|"
+            "A->D;A->E;A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;B->D:1;C->D:2;D->E:1;DMT/_0->D:3;"
+            "DMT/_1->D:4;DMT/_2->D:5");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_DepthwiseConv2dNativeGradInput_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'DepthwiseConv2dNativeBackpropInput'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['B', 'A', 'C']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Input);D(_"
+            "MklDepthwiseConv2dNativeBackpropInput);"
+            "DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);E(Zeta)|"
+            "A->D:1;A->E;B->D;B:control->DMT/_0:control;"
+            "B:control->DMT/_1:control;B:control->DMT/_2:control;C->D:2;"
+            "D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
+}
+
+// Check that we never rewrite BiasAddGrad.
+TEST_F(MklLayoutPassTest, NodeRewrite_BiasAddGrad_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Polygamma'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['C', 'A']}"
+      "node { name: 'E' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Polygamma);D(Zeta);E(BiasAddGrad)|"
+            "A->C;A->D:1;B->C:1;C->D;D->E");
+}
+
+// Check that we never rewrite BiasAddGrad.
+TEST_F(MklLayoutPassTest, NodeRewrite_BiasAddGrad_Positive1) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'MatMul'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'transpose_a'      value { b: false } }"
+      " attr { key: 'transpose_b'      value { b: false } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['C', 'A']}"
+      "node { name: 'E' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(MatMul);D(Zeta);E(BiasAddGrad)|"
+            "A->C;A->D:1;B->C:1;C->D;D->E");
+}
+
+// Check that we never rewrite BiasAddGrad.
+TEST_F(MklLayoutPassTest, NodeRewrite_BiasAddGrad_Positive2) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'M' op: '_MklInput'}"
+      "node { name: 'N' op: '_MklInput'}"
+      "node { name: 'C' op: '_MklConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A', 'B', 'M', 'N']}"
+      "node { name: 'D' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['C', 'A']}"
+      "node { name: 'E' op: 'BiasAddGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['D'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(_MklConv2D);D(Zeta);E(BiasAddGrad);"
+            "M(_MklInput);N(_MklInput)|A->C;A->D:1;B->C:1;C->D;D->E;"
+            "M->C:2;N->C:3");
+}
+
+// Concat Op test: Concat with no Mkl layer feeding it
+TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Basic) {
+  InitGraph(
+      "node { name: 'A' op: 'Const' "
+      " attr { key: 'dtype' value { type: DT_INT32 } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'B' op: 'InputList'"
+      " attr { key: 'N'                value { i: 2 } }}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Concat'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'N'                value { i: 2 } }"
+      " input: ['A', 'B:0', 'B:1']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['C', 'D'] }");
+  EXPECT_EQ(
+      DoMklLayoutOptimizationPass(),
+      "A(Const);B(InputList);C(Input);D(_MklConcat);DMT/_0(Const);"
+      "DMT/_1(Const);DMT/_2(Const);E(Zeta)|A->D;A:control->DMT/_0:control;"
+      "A:control->DMT/_1:control;A:control->DMT/_2:control;B->D:1;"
+      "B:1->D:2;C->E;D->E:1;DMT/_0->D:3;DMT/_1->D:4;DMT/_2->D:5");
+}
+
+// Concat with 2 Mkl layers feeding it
+TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Input_Mkl) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A', 'B']}"
+      "node { name: 'F' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['C', 'D']}"
+      "node { name: 'G' op: 'Const' "
+      " attr { key: 'dtype' value { type: DT_INT32 } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'H' op: 'Concat'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'N'                value { i: 2 } }"
+      " input: ['G', 'E', 'F']}"
+      "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'H'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);DMT/_3(Const);DMT/_4(Const);E(_MklConv2D);"
+            "F(_MklConv2D);G(Const);H(_MklConcat);I(Zeta)|A->E;A->I;"
+            "A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "B->E:1;C->F;C:control->DMT/_2:control;C:control->DMT/_3:control;"
+            "D->F:1;DMT/_0->E:2;DMT/_1->E:3;DMT/_2->F:2;DMT/_3->F:3;"
+            "DMT/_4->H:3;E->H:1;E:2->H:4;F->H:2;F:2->H:5;G->H;"
+            "G:control->DMT/_4:control;H->I:1");
+}
+
+// Concat with 1 Mkl and 1 non-Mkl layer feeding it
+TEST_F(MklLayoutPassTest, NodeRewrite_Concat_Input_MixedMkl) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A', 'B']}"
+      "node { name: 'F' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['C', 'D']}"
+      "node { name: 'G' op: 'Const' "
+      " attr { key: 'dtype' value { type: DT_INT32 } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'H' op: 'Concat'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'N'                value { i: 2 } }"
+      " input: ['G', 'E', 'F']}"
+      "node { name: 'I' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'H'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);DMT/_3(Const);E(_MklConv2D);F(Zeta);G(Const);"
+            "H(_MklConcat);I(Zeta)|A->E;A->I;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->E:1;C->F;D->F:1;DMT/_0->E:2;"
+            "DMT/_1->E:3;DMT/_2->H:3;DMT/_3->H:5;E->H:1;E:2->H:4;F->H:2;"
+            "G->H;G:control->DMT/_2:control;G:control->DMT/_3:control;H->I:1");
+}
+
+// ConcatV2 Op test: ConcatV2 with no Mkl layer feeding it
+TEST_F(MklLayoutPassTest, NodeRewrite_ConcatV2_Basic) {
+  InitGraph(
+      "node { name: 'A' op: 'Const' "
+      " attr { key: 'dtype' value { type: DT_INT32 } }"
+      " attr { key: 'value' value { "
       "    tensor { dtype: DT_INT32 tensor_shape { dim { size: 1 } } "
       "    int_val: 0 } } } }"
       "node { name: 'B' op: 'InputList'"
@@ -1461,65 +2004,144 @@ TEST_F(MklLayoutPassTest, NodeRewrite_ReluGrad_Positive) {
 TEST_F(MklLayoutPassTest, NodeRewrite_ReluReluGrad_Positive) {
   InitGraph(
       "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Relu'"
+      "node { name: 'B' op: 'Relu'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'ReluGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'C'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklRelu);C(_MklReluGrad);D(Zeta);DMT/_0(Const);"
+            "DMT/_1(Const)|A->B;A->C;A->D;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->C:1;B:1->C:3;C->D:1;DMT/_0->B:1;"
+            "DMT/_1->C:2");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_Relu6_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Relu6'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(_MklRelu6);C(Zeta);DMT/_0(Const)|A->B;A->C;"
+            "A:control->DMT/_0:control;B->C:1;DMT/_0->B:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_Relu6Grad_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'Relu6Grad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'C'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Input);C(_MklRelu6Grad);D(Zeta);DMT/_0(Const);"
+            "DMT/_1(Const)|A->C;A->D;A:control->DMT/_0:control;"
+            "A:control->DMT/_1:control;B->C:1;C->D:1;DMT/_0->C:2;DMT/_1->C:3");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_Relu6Relu6Grad_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Relu6'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
       " input: ['A'] }"
-      "node { name: 'C' op: 'ReluGrad'"
+      "node { name: 'C' op: 'Relu6Grad'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
       " input: ['A', 'B'] }"
       "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
       " input: ['A', 'C'] }");
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(_MklRelu);C(_MklReluGrad);D(Zeta);DMT/_0(Const);"
+            "A(Input);B(_MklRelu6);C(_MklRelu6Grad);D(Zeta);DMT/_0(Const);"
             "DMT/_1(Const)|A->B;A->C;A->D;A:control->DMT/_0:control;"
             "A:control->DMT/_1:control;B->C:1;B:1->C:3;C->D:1;DMT/_0->B:1;"
             "DMT/_1->C:2");
 }
 
-TEST_F(MklLayoutPassTest, NodeRewrite_Relu6_Positive) {
+TEST_F(MklLayoutPassTest, NodeRewrite_LeakyRelu_Positive) {
   InitGraph(
       "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Relu6'"
+      "node { name: 'B' op: 'LeakyRelu'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'            value { f: 0.1 } }"
       " input: ['A'] }"
       "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
       " input: ['A', 'B'] }");
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(_MklRelu6);C(Zeta);DMT/_0(Const)|A->B;A->C;"
+            "A(Input);B(_MklLeakyRelu);C(Zeta);DMT/_0(Const)|A->B;A->C;"
             "A:control->DMT/_0:control;B->C:1;DMT/_0->B:1");
 }
 
-TEST_F(MklLayoutPassTest, NodeRewrite_Relu6Grad_Positive) {
+TEST_F(MklLayoutPassTest, NodeRewrite_LeakyRelu_Negative) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'LeakyRelu'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'            value { f: 2.0 } }"
+      " input: ['A'] }"
+      "node { name: 'C' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'B'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(LeakyRelu);C(Zeta)|A->B;A->C;B->C:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_LeakyReluGrad_Positive) {
   InitGraph(
       "node { name: 'A' op: 'Input'}"
       "node { name: 'B' op: 'Input'}"
-      "node { name: 'C' op: 'Relu6Grad'"
+      "node { name: 'C' op: 'LeakyReluGrad'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'            value { f: 0.1 } }"
       " input: ['A', 'B'] }"
       "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
       " input: ['A', 'C'] }");
   EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(Input);C(_MklRelu6Grad);D(Zeta);DMT/_0(Const);"
+            "A(Input);B(Input);C(_MklLeakyReluGrad);D(Zeta);DMT/_0(Const);"
             "DMT/_1(Const)|A->C;A->D;A:control->DMT/_0:control;"
             "A:control->DMT/_1:control;B->C:1;C->D:1;DMT/_0->C:2;DMT/_1->C:3");
 }
 
-TEST_F(MklLayoutPassTest, NodeRewrite_Relu6Relu6Grad_Positive) {
+TEST_F(MklLayoutPassTest, NodeRewrite_LeakyReluGrad_Negative) {
   InitGraph(
       "node { name: 'A' op: 'Input'}"
-      "node { name: 'B' op: 'Relu6'"
+      "node { name: 'B' op: 'Input'}"
+      "node { name: 'C' op: 'LeakyReluGrad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'            value { f: 2.0 } }"
+      " input: ['A', 'B'] }"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'C'] }");
+  EXPECT_EQ(
+      DoMklLayoutOptimizationPass(),
+      "A(Input);B(Input);C(LeakyReluGrad);D(Zeta)|A->C;A->D;B->C:1;C->D:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeRewrite_LeakyReluLeakyReluGrad_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'LeakyRelu'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'            value { f: 0.1 } }"
       " input: ['A'] }"
-      "node { name: 'C' op: 'Relu6Grad'"
+      "node { name: 'C' op: 'LeakyReluGrad'"
       " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'alpha'            value { f: 0.1 } }"
       " input: ['A', 'B'] }"
       "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
       " input: ['A', 'C'] }");
-  EXPECT_EQ(DoMklLayoutOptimizationPass(),
-            "A(Input);B(_MklRelu6);C(_MklRelu6Grad);D(Zeta);DMT/_0(Const);"
-            "DMT/_1(Const)|A->B;A->C;A->D;A:control->DMT/_0:control;"
-            "A:control->DMT/_1:control;B->C:1;B:1->C:3;C->D:1;DMT/_0->B:1;"
-            "DMT/_1->C:2");
+  EXPECT_EQ(
+      DoMklLayoutOptimizationPass(),
+      "A(Input);B(_MklLeakyRelu);C(_MklLeakyReluGrad);D(Zeta);DMT/_0(Const);"
+      "DMT/_1(Const)|A->B;A->C;A->D;A:control->DMT/_0:control;"
+      "A:control->DMT/_1:control;B->C:1;B:1->C:3;C->D:1;DMT/_0->B:1;"
+      "DMT/_1->C:2");
 }
 
 TEST_F(MklLayoutPassTest, NodeRewrite_AvgPool_Positive) {
@@ -2096,6 +2718,29 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Conv2DGradFilter_DeviceTest) {
             "A->D;A->E;B->D:1;C->D:2;D->E:1");
 }
 
+TEST_F(MklLayoutPassTest,
+       NodeRewrite_DepthwiseConv2dNativeGradFilter_DeviceTest) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: 'DepthwiseConv2dNativeBackpropFilter'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A', 'B', 'C']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['A', 'D'] }",
+      kGPUDevice);
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Input);D("
+            "DepthwiseConv2dNativeBackpropFilter);E(Zeta)|"
+            "A->D;A->E;B->D:1;C->D:2;D->E:1");
+}
+
 TEST_F(MklLayoutPassTest, NodeRewrite_Relu_DeviceTest) {
   InitGraph(
       "node { name: 'A' op: 'Input'}"
@@ -2304,6 +2949,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Slice_DeviceTest) {
 
 /////////////////////////////////////////////////////////////////////
 //         Post-rewrite fixup pass test
+/////////////////////////////////////////////////////////////////////
 
 TEST_F(MklLayoutPassTest, PostRewriteFixUpPass) {
   InitGraph(
@@ -2334,6 +2980,302 @@ TEST_F(MklLayoutPassTest, PostRewriteFixUpPass) {
 }
 
 /////////////////////////////////////////////////////////////////////
+//         Unit tests related to filter caching.
+//
+// These tests check if the attribute `is_filter_const` is set to true
+// when filter is a constant and false otherwise for various operators
+// such as Conv2D, Conv2DWithBias, Conv3D etc.
+/////////////////////////////////////////////////////////////////////
+
+// Conv2D op where filter is a constant.
+TEST_F(MklLayoutPassTest, Conv2D_FilterCaching_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Const' "  // Filter
+      " attr { key: 'dtype' value { type: DT_FLOAT } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_FLOAT tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['B', 'C'] }");
+  EXPECT_TRUE(DoMklLayoutOptimizationPassGetAttrVal<bool>("is_filter_const",
+                                                          "_MklConv2D"));
+}
+
+// Conv2D op where filter is NOT a constant.
+TEST_F(MklLayoutPassTest, Conv2D_FilterCaching_Negative) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"  // Filter
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['B', 'C'] }");
+  EXPECT_FALSE(DoMklLayoutOptimizationPassGetAttrVal<bool>("is_filter_const",
+                                                           "_MklConv2D"));
+}
+
+// Conv2D + BiasAdd fusion where filter is a constant.
+TEST_F(MklLayoutPassTest, Conv2DWithBias_FilterCaching_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Const'"  // Filter
+      " attr { key: 'dtype' value { type: DT_FLOAT } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_FLOAT tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'BiasAdd'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['C', 'D'] }"
+      "node { name: 'Y' op: 'Input'}"
+      "node { name: 'Z' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['E', 'Y']}");
+  EXPECT_TRUE(DoMklLayoutOptimizationPassGetAttrVal<bool>(
+      "is_filter_const", "_MklConv2DWithBias"));
+}
+
+// Conv2D + BiasAdd fusion where filter is NOT a constant.
+TEST_F(MklLayoutPassTest, Conv2DWithBias_FilterCaching_Negative) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"  // Filter
+      "node { name: 'C' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'BiasAdd'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " input: ['C', 'D'] }"
+      "node { name: 'Y' op: 'Input'}"
+      "node { name: 'Z' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['E', 'Y']}");
+  EXPECT_FALSE(DoMklLayoutOptimizationPassGetAttrVal<bool>(
+      "is_filter_const", "_MklConv2DWithBias"));
+}
+
+// Conv3D op where filter is a constant.
+TEST_F(MklLayoutPassTest, Conv3D_FilterCaching_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Const' "  // Filter
+      " attr { key: 'dtype' value { type: DT_FLOAT } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_FLOAT tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'C' op: 'Conv3D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCDHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1, "
+      "i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1, "
+      "i:1} } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['B', 'C'] }");
+  EXPECT_TRUE(DoMklLayoutOptimizationPassGetAttrVal<bool>("is_filter_const",
+                                                          "_MklConv3D"));
+}
+
+// Conv3D op where filter is NOT a constant.
+TEST_F(MklLayoutPassTest, Conv3D_FilterCaching_Negative) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"  // Filter
+      "node { name: 'C' op: 'Conv3D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCDHW' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1, "
+      "i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1, "
+      "i:1} } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['B', 'C'] }");
+  EXPECT_FALSE(DoMklLayoutOptimizationPassGetAttrVal<bool>("is_filter_const",
+                                                           "_MklConv3D"));
+}
+
+// Pad + Conv2D fusion where filter is a constant.
+TEST_F(MklLayoutPassTest, PadWithConv2D_FilterCaching_Positive) {
+  DCHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Const'"  // Filter
+      " attr { key: 'dtype' value { type: DT_FLOAT } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_FLOAT tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'E' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NHWC' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'VALID' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['C', 'D'] }"
+      "node { name: 'Y' op: 'Input'}"
+      "node { name: 'Z' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['E', 'Y']}");
+  EXPECT_TRUE(DoMklLayoutOptimizationPassGetAttrVal<bool>("is_filter_const",
+                                                          "_MklPadWithConv2D"));
+}
+
+// Pad + Conv2D fusion where filter is NOT a constant.
+TEST_F(MklLayoutPassTest, PadWithConv2D_FilterCaching_Negative) {
+  DCHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"  // Filter
+      "node { name: 'E' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NHWC' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'VALID' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['C', 'D'] }"
+      "node { name: 'Y' op: 'Input'}"
+      "node { name: 'Z' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['E', 'Y']}");
+  EXPECT_FALSE(DoMklLayoutOptimizationPassGetAttrVal<bool>(
+      "is_filter_const", "_MklPadWithConv2D"));
+}
+
+// _FusedConv2D + BiasAdd fusion where filter is a constant.
+TEST_F(MklLayoutPassTest, FusedConv2DWithBias_FilterCaching_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Const'"  // Filter
+      " attr { key: 'dtype' value { type: DT_FLOAT } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_FLOAT tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: '_FusedConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'num_args'         value { i: 1 } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'fused_ops'        value { list: {s: 'BiasAdd'} } }"
+      " attr { key: 'epsilon'          value { f: 0.001 }}"
+      " input: ['A', 'B', 'C']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['D', 'C'] }");
+  EXPECT_TRUE(DoMklLayoutOptimizationPassGetAttrVal<bool>("is_filter_const",
+                                                          "_MklFusedConv2D"));
+}
+
+// _FusedConv2D + BiasAdd fusion where filter is NOT a constant.
+TEST_F(MklLayoutPassTest, FusedConv2DWithBias_FilterCaching_Negative) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"  // Filter
+      "node { name: 'C' op: 'Input'}"
+      "node { name: 'D' op: '_FusedConv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'num_args'         value { i: 1 } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'fused_ops'        value { list: {s: 'BiasAdd'} } }"
+      " attr { key: 'epsilon'          value { f: 0.001 }}"
+      " input: ['A', 'B', 'C']}"
+      "node { name: 'E' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['D', 'C'] }");
+  EXPECT_FALSE(DoMklLayoutOptimizationPassGetAttrVal<bool>("is_filter_const",
+                                                           "_MklFusedConv2D"));
+}
+
+// Depthwise Conv2D op where filter is a constant.
+TEST_F(MklLayoutPassTest, DepthwiseConv2dNative_FilterCaching_Positive) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Const'"  // Filter
+      " attr { key: 'dtype' value { type: DT_FLOAT } }"
+      " attr { key: 'value' value { "
+      "    tensor { dtype: DT_FLOAT tensor_shape { dim { size: 1 } } "
+      "    int_val: 0 } } } }"
+      "node { name: 'C' op: 'DepthwiseConv2dNative'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['B', 'C'] }");
+  EXPECT_TRUE(DoMklLayoutOptimizationPassGetAttrVal<bool>(
+      "is_filter_const", "_MklDepthwiseConv2dNative"));
+}
+
+// Depthwise Conv2D op where filter is NOT a constant.
+TEST_F(MklLayoutPassTest, DepthwiseConv2dNative_FilterCaching_Negative) {
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Input'}"  // Filter
+      "node { name: 'C' op: 'DepthwiseConv2dNative'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NCHW' } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Zeta' attr { key: 'T' value { type: DT_FLOAT } }"
+      " input: ['B', 'C'] }");
+  EXPECT_FALSE(DoMklLayoutOptimizationPassGetAttrVal<bool>(
+      "is_filter_const", "_MklDepthwiseConv2dNative"));
+}
 
 static void BM_MklLayoutRewritePass(int iters, int op_nodes) {
   testing::StopTiming();
diff --git a/tensorflow/core/graph/node_builder.cc b/tensorflow/core/graph/node_builder.cc
index a91e6dd05738ae8242c812970e8bbc4a10c7675a..6dc9a50b98a9b2fefc2a0e66809f528d6fc7567f 100644
--- a/tensorflow/core/graph/node_builder.cc
+++ b/tensorflow/core/graph/node_builder.cc
@@ -38,8 +38,9 @@ NodeBuilder::NodeOut::NodeOut()
     : node(nullptr), error(true), index(0), dt(DT_FLOAT) {}
 
 NodeBuilder::NodeBuilder(StringPiece name, StringPiece op_name,
-                         const OpRegistryInterface* op_registry)
-    : def_builder_(name, op_name, op_registry) {}
+                         const OpRegistryInterface* op_registry,
+                         const NodeDebugInfo* debug)
+    : def_builder_(name, op_name, op_registry, debug) {}
 
 NodeBuilder::NodeBuilder(StringPiece name, const OpDef* op_def)
     : def_builder_(name, op_def) {}
diff --git a/tensorflow/core/graph/node_builder.h b/tensorflow/core/graph/node_builder.h
index b1dc2ae92f14ba4519d98a4c556c1d06e14b6b5d..51e044cd8b2ee7a70dbf197c16925a0b972e9365 100644
--- a/tensorflow/core/graph/node_builder.h
+++ b/tensorflow/core/graph/node_builder.h
@@ -77,7 +77,8 @@ class NodeBuilder {
   // specified by calling the methods below.
   // REQUIRES: The OpDef must satisfy ValidateOpDef().
   NodeBuilder(StringPiece name, StringPiece op_name,
-              const OpRegistryInterface* op_registry = OpRegistry::Global());
+              const OpRegistryInterface* op_registry = OpRegistry::Global(),
+              const NodeDebugInfo* debug = nullptr);
   NodeBuilder(StringPiece name, const OpDef* op_def);
 
   // Create a NodeBuilder from an existing NodeDefBuilder.
diff --git a/tensorflow/core/graph/optimizer_cse.cc b/tensorflow/core/graph/optimizer_cse.cc
index 4073255db3f7cbcd697f3cb2781e04b3b01634c1..19afeb6badbc6c1528a3ea19b8b14eb98296c731 100644
--- a/tensorflow/core/graph/optimizer_cse.cc
+++ b/tensorflow/core/graph/optimizer_cse.cc
@@ -43,6 +43,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/hash/hash.h"
@@ -213,6 +214,7 @@ bool OptimizerCSE::Optimize(
         g_->AddEdge(*candidate, e->src_output(), e->dst(), e->dst_input());
       }
 
+      MergeDebugInfo(NodeDebugInfo(*n), *candidate);
       g_->RemoveNode(n);
       changed = true;
     }
diff --git a/tensorflow/core/graph/validate.cc b/tensorflow/core/graph/validate.cc
index e44eb91d4883f3e8a6ad34e96d8dcd9d9076298b..4487f738c8e97e803618ae483b4551b47fd14c33 100644
--- a/tensorflow/core/graph/validate.cc
+++ b/tensorflow/core/graph/validate.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/core/graph/validate.h"
 
+#include "absl/container/flat_hash_set.h"
+#include "absl/strings/string_view.h"
 #include "tensorflow/core/framework/graph_def_util.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
@@ -113,5 +115,16 @@ Status ValidateGraphHasNoCycle(const Graph& graph) {
   return Status::OK();
 }
 
+Status VerifyNoDuplicateNodeNames(const GraphDef& graph) {
+  absl::flat_hash_set<absl::string_view> nodes;
+  for (const auto& node : graph.node()) {
+    if (nodes.contains(node.name())) {
+      return errors::AlreadyExists("Node already exists: ", node.name());
+    }
+    nodes.insert(node.name());
+  }
+  return Status::OK();
+}
+
 }  // namespace graph
 }  // namespace tensorflow
diff --git a/tensorflow/core/graph/validate.h b/tensorflow/core/graph/validate.h
index 08879dca6037bcab21f4cbf107b3829c1b6600e8..bfb3a25ac91761449b1762fa2125d7758cc8c560 100644
--- a/tensorflow/core/graph/validate.h
+++ b/tensorflow/core/graph/validate.h
@@ -59,6 +59,9 @@ void GetOpListForValidation(
 // be less than the total node count.
 Status ValidateGraphHasNoCycle(const Graph& graph);
 
+// Returns OK if the graph has no duplicate node names.
+Status VerifyNoDuplicateNodeNames(const GraphDef& graph);
+
 }  // namespace graph
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/graph/validate_test.cc b/tensorflow/core/graph/validate_test.cc
index d58cdc3c5baf02f89cff52ef0396816cb00b48a3..f6a0d2614acfe147eb65b75fb843bc84d0b6dbeb 100644
--- a/tensorflow/core/graph/validate_test.cc
+++ b/tensorflow/core/graph/validate_test.cc
@@ -147,5 +147,36 @@ TEST(GetOpListForValidationTest, ShouldStripDocs) {
   EXPECT_TRUE(found_has_docs);
 }
 
+TEST(VerifyNoDuplicateNodeNames, NoDuplicateNodeNames) {
+  const string graph_def_str =
+      "node { name: 'A' op: 'FloatInput' }"
+      "node { name: 'B' op: 'Int32Input' }"
+      "node { "
+      "       name: 'C' op: 'Sum' "
+      "       attr { key: 'T' value { type: DT_FLOAT } }"
+      "       input: ['A', 'B'] "
+      "}";
+  GraphDef graph_def;
+  auto parser = protobuf::TextFormat::Parser();
+  CHECK(parser.MergeFromString(graph_def_str, &graph_def)) << graph_def_str;
+  TF_ASSERT_OK(graph::VerifyNoDuplicateNodeNames(graph_def));
+}
+
+TEST(VerifyNoDuplicateNodeNames, DuplicateNodeNames) {
+  const string graph_def_str =
+      "node { name: 'A' op: 'FloatInput' }"
+      "node { name: 'A' op: 'Int32Input' }"
+      "node { "
+      "       name: 'C' op: 'Sum' "
+      "       attr { key: 'T' value { type: DT_FLOAT } }"
+      "       input: ['A', 'A'] "
+      "}";
+  GraphDef graph_def;
+  auto parser = protobuf::TextFormat::Parser();
+  CHECK(parser.MergeFromString(graph_def_str, &graph_def)) << graph_def_str;
+  EXPECT_EQ(graph::VerifyNoDuplicateNodeNames(graph_def).code(),
+            tensorflow::error::ALREADY_EXISTS);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/BUILD b/tensorflow/core/grappler/BUILD
index f353d789d47030afda5d9680cca8094d48b827f1..77307708fab5c99cb52ad652c72220efc89f8337 100644
--- a/tensorflow/core/grappler/BUILD
+++ b/tensorflow/core/grappler/BUILD
@@ -1,7 +1,6 @@
 licenses(["notice"])  # Apache 2.0
 
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
-load("//tensorflow:tensorflow.bzl", "tf_cuda_library")
+load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_cuda_library")
 
 cc_library(
     name = "op_types",
@@ -27,6 +26,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
     ],
 )
@@ -44,6 +44,7 @@ tf_cc_test(
         "//tensorflow/core:tensor_testutil",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -62,6 +63,35 @@ tf_cuda_library(
     ],
 )
 
+cc_library(
+    name = "graph_topology_view",
+    srcs = ["graph_topology_view.cc"],
+    hdrs = ["graph_topology_view.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":graph_view",
+        "//tensorflow/core:graph",
+        "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+tf_cc_test(
+    name = "graph_topology_view_test",
+    srcs = ["graph_topology_view_test.cc"],
+    deps = [
+        ":graph_topology_view",
+        ":graph_view",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 cc_library(
     name = "graph_view",
     srcs = ["graph_view.cc"],
@@ -176,12 +206,17 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":graph_view",
-        ":grappler_item",
+        ":op_types",
         ":utils",
+        "//tensorflow/core:framework",
         "//tensorflow/core:graph",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
     ],
 )
 
@@ -191,10 +226,15 @@ tf_cc_test(
     deps = [
         ":grappler_item",
         ":mutable_graph_view",
+        ":utils",
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:graph",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
         "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
     ],
 )
diff --git a/tensorflow/core/grappler/clusters/single_machine.cc b/tensorflow/core/grappler/clusters/single_machine.cc
index e4f6bf7c862302a217c122cff726b7ab925cc482..e7c72b80886f53ad4068f048dda99ee7dca92293 100644
--- a/tensorflow/core/grappler/clusters/single_machine.cc
+++ b/tensorflow/core/grappler/clusters/single_machine.cc
@@ -227,14 +227,14 @@ Status SingleMachine::GetPeakMemoryUsage(
 
   device_peak_memory->clear();
   for (Device* device : devices) {
-    AllocatorStats stats;
     auto* allocator = device->GetAllocator(AllocatorAttributes());
     if (!allocator->TracksAllocationSizes()) {
       return Status(error::INVALID_ARGUMENT,
                     "Tracking allocation is not enabled.");
     }
-    allocator->GetStats(&stats);
-    (*device_peak_memory)[device->name()] = stats.max_bytes_in_use;
+    absl::optional<AllocatorStats> stats = allocator->GetStats();
+    (*device_peak_memory)[device->name()] =
+        (stats ? stats->peak_bytes_in_use : 0);
   }
 
   return Status::OK();
diff --git a/tensorflow/core/grappler/clusters/utils_test.cc b/tensorflow/core/grappler/clusters/utils_test.cc
index 3863d62980fb20611285d3efeade1aa998f1a1f3..3cf72fd8170ca271124d59135d592e2db1ba9b67 100644
--- a/tensorflow/core/grappler/clusters/utils_test.cc
+++ b/tensorflow/core/grappler/clusters/utils_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/protobuf/device_properties.pb.h"
@@ -82,12 +83,14 @@ TEST(UtilsTest, GetDeviceInfo) {
 
 #if GOOGLE_CUDA
   // Invalid platform GPU id.
-  GpuIdManager::InsertTfPlatformGpuIdPair(TfGpuId(0), PlatformGpuId(100));
+  TF_ASSERT_OK(
+      GpuIdManager::InsertTfPlatformGpuIdPair(TfGpuId(0), PlatformGpuId(100)));
   properties = GetDeviceInfo(device);
   EXPECT_EQ("UNKNOWN", properties.type());
 
   // Valid platform GPU id.
-  GpuIdManager::InsertTfPlatformGpuIdPair(TfGpuId(1), PlatformGpuId(0));
+  TF_ASSERT_OK(
+      GpuIdManager::InsertTfPlatformGpuIdPair(TfGpuId(1), PlatformGpuId(0)));
   device.id = 1;
   properties = GetDeviceInfo(device);
   EXPECT_EQ("GPU", properties.type());
diff --git a/tensorflow/core/grappler/clusters/virtual_cluster.cc b/tensorflow/core/grappler/clusters/virtual_cluster.cc
index dbd8f26c286f07107a63e9c745c442b171f29aaa..118f74e8b01171e3780317b4ea36750c66a22b98 100644
--- a/tensorflow/core/grappler/clusters/virtual_cluster.cc
+++ b/tensorflow/core/grappler/clusters/virtual_cluster.cc
@@ -67,13 +67,17 @@ Status VirtualCluster::Run(const GraphDef& graph,
                            const std::vector<string>& fetch,
                            RunMetadata* metadata) {
   // Initialize a virtual scheduler to process the graph. Make sure to use
-  // static shape inference to prevent the schedulrer from calling the Run
-  // method on the cluster, and create an infinite loop.
+  // static shape inference to prevent the scheduler from calling the Run
+  // method on the cluster and creating an infinite loop.
   GrapplerItem item;
   item.graph = graph;
   item.feed = feed;
   item.fetch = fetch;
-  VirtualScheduler scheduler(true, this, node_manager_.get());
+  // Note that we do not use aggressive shape inference to preserve unknown
+  // shapes from the input graph.
+  VirtualScheduler scheduler(/*use_static_shapes=*/true,
+                             /*use_aggressive_shape_inference=*/false, this,
+                             node_manager_.get());
   TF_RETURN_IF_ERROR(scheduler.Init(&item));
 
   if (metadata) {
diff --git a/tensorflow/core/grappler/costs/BUILD b/tensorflow/core/grappler/costs/BUILD
index 5090e62b2ccfb00241e2b9c87d1922320646632e..35ca93d9345d30c834c753e9c3ef7b25ca5ed8d5 100644
--- a/tensorflow/core/grappler/costs/BUILD
+++ b/tensorflow/core/grappler/costs/BUILD
@@ -41,8 +41,6 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":utils",
-        "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/strings",
         "//tensorflow/core/grappler/utils:functions",
         "//tensorflow/core/grappler/utils:topological_sort",
         "//tensorflow/core/grappler:mutable_graph_view",
@@ -54,6 +52,7 @@ cc_library(
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:cluster",
+        "//tensorflow/core/grappler/optimizers:evaluation_utils",
     ] + tf_protos_grappler(),
 )
 
@@ -132,9 +131,6 @@ tf_cuda_library(
     name = "utils",
     srcs = ["utils.cc"],
     hdrs = ["utils.h"],
-    cuda_deps = [
-        "@local_config_cuda//cuda:cudnn_header",
-    ],
     visibility = ["//visibility:public"],
     deps = [
         "//third_party/eigen3",
@@ -173,6 +169,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
     ],
 )
 
diff --git a/tensorflow/core/grappler/costs/analytical_cost_estimator.cc b/tensorflow/core/grappler/costs/analytical_cost_estimator.cc
index b7804ffaa5378c67028b39819a07fc00719c9896..5baf306f6fe39e80fc006ed1183eb70aa5fb5180 100644
--- a/tensorflow/core/grappler/costs/analytical_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/analytical_cost_estimator.cc
@@ -104,19 +104,19 @@ AnalyticalCostEstimator::AnalyticalCostEstimator(Cluster* cluster,
                                                  bool use_static_shapes)
     : AnalyticalCostEstimator(
           cluster, absl::make_unique<OpLevelCostEstimator>(),
-          ReadyNodeManagerFactory("FirstReady"), use_static_shapes, nullptr) {}
+          ReadyNodeManagerFactory("FirstReady"), use_static_shapes) {}
 
 AnalyticalCostEstimator::AnalyticalCostEstimator(
     Cluster* cluster, std::unique_ptr<OpLevelCostEstimator> node_estimator,
-    std::unique_ptr<ReadyNodeManager> node_manager, bool use_static_shapes,
-    RunMetadata* run_metadata)
+    std::unique_ptr<ReadyNodeManager> node_manager, bool use_static_shapes)
     : cluster_(cluster),
       node_estimator_(std::move(node_estimator)),
       node_manager_(std::move(node_manager)),
-      use_static_shapes_(use_static_shapes),
-      run_metadata_(run_metadata) {
-  scheduler_ = absl::make_unique<VirtualScheduler>(use_static_shapes_, cluster_,
-                                                   node_manager_.get());
+      use_static_shapes_(use_static_shapes) {
+  // Use aggressive static shape inference to minimize unknown shapes.
+  scheduler_ = absl::make_unique<VirtualScheduler>(
+      use_static_shapes_,
+      /*use_aggressive_shape_inference=*/true, cluster_, node_manager_.get());
 }
 
 Status AnalyticalCostEstimator::Initialize(const GrapplerItem& item) {
@@ -124,9 +124,8 @@ Status AnalyticalCostEstimator::Initialize(const GrapplerItem& item) {
   return Status::OK();
 }
 
-// TODO(b/67607683): unify logic with VirtualCluster logic
 Status AnalyticalCostEstimator::PredictCosts(const GraphDef& optimized_graph,
-                                             CostGraphDef* cost_graph,
+                                             RunMetadata* run_metadata,
                                              Costs* costs) const {
   GrapplerItem item = item_;
   item.graph = optimized_graph;
@@ -138,7 +137,9 @@ Status AnalyticalCostEstimator::PredictCosts(const GraphDef& optimized_graph,
   }
 
   gtl::FlatMap<string, CostGraphDef::Node*> name_to_cost_node;
-  if (cost_graph) {
+  CostGraphDef* cost_graph = nullptr;
+  if (run_metadata) {
+    cost_graph = run_metadata->mutable_cost_graph();
     // TODO(pcma): Clear nodes in cost_graph after we make sure we always pass
     // in an empty cost_graph (a non-empty but incomplete cost_graph will cause
     // problems, e.g., no node_id in cost_graph)
@@ -179,18 +180,13 @@ Status AnalyticalCostEstimator::PredictCosts(const GraphDef& optimized_graph,
     }
   }
 
-  *costs = scheduler_->Summary(run_metadata_);
-  // run_metadata_ gets step_stats and parition_graphs from Summary.
-  // Note that cost_graph could already point to the cost_graph field of
-  // run_metadata_, since both are set by the caller.
-  if (run_metadata_ && cost_graph &&
-      run_metadata_->mutable_cost_graph() != cost_graph)
-    *run_metadata_->mutable_cost_graph() = *cost_graph;
+  // run_metadata gets step_stats and partition_graphs from Summary.
+  *costs = scheduler_->Summary(run_metadata);
 
   if (VLOG_IS_ON(1)) {
     bool verbose = VLOG_IS_ON(2);
-    if (run_metadata_) {
-      VLOG(1) << GetStatsStringFromRunMetadata(*run_metadata_, verbose);
+    if (run_metadata) {
+      VLOG(1) << GetStatsStringFromRunMetadata(*run_metadata, verbose);
     } else {
       RunMetadata run_metadata;
       scheduler_->GenerateRunMetadata(&run_metadata);
diff --git a/tensorflow/core/grappler/costs/analytical_cost_estimator.h b/tensorflow/core/grappler/costs/analytical_cost_estimator.h
index 2629672459c512a22a861bd5c0dfe0207afc38a0..d058ba411527f0c001d59ac4aaa8aeea3d422c77 100644
--- a/tensorflow/core/grappler/costs/analytical_cost_estimator.h
+++ b/tensorflow/core/grappler/costs/analytical_cost_estimator.h
@@ -34,21 +34,16 @@ class Cluster;
 struct GrapplerItem;
 
 // Estimate the cost of running a Grappler item based on the theoretical
-// performance of the hardware that will run the model.
+// performance of the hardware that will run the model. Note that this
+// internally uses aggressive shape inference with static shape inference.
 class AnalyticalCostEstimator : public CostEstimator {
  public:
   // Does not take ownership of cluster.
   AnalyticalCostEstimator(Cluster* cluster, bool use_static_shapes);
-  // Does not take ownership of cluster or run_metadata
-  //
-  // When metadata is provided, step_stats and partition_graphs fields will
-  // always be filled during PredictCosts, and the cost_graph field of metadata
-  // will be filled only when cost_graph is not nullptr when invoking
-  // PredictCosts.
   AnalyticalCostEstimator(Cluster* cluster,
                           std::unique_ptr<OpLevelCostEstimator> node_estimator,
                           std::unique_ptr<ReadyNodeManager> node_manager,
-                          bool use_static_shapes, RunMetadata* run_metadata);
+                          bool use_static_shapes);
   ~AnalyticalCostEstimator() override {}
 
   // Initializes the estimator for the specified grappler item.
@@ -56,10 +51,10 @@ class AnalyticalCostEstimator : public CostEstimator {
   Status Initialize(const GrapplerItem& item) override;
 
   // Predict the performance of each node of the optimized graph and annotate
-  // the CostGraphDef with the corresponding estimates. Also returns the
+  // the RunMetadata with the corresponding estimates. Also returns the
   // expected cost for the whole graph.
-  Status PredictCosts(const GraphDef& optimized_graph, CostGraphDef* cost_graph,
-                      Costs* cost) const override;
+  Status PredictCosts(const GraphDef& optimized_graph,
+                      RunMetadata* run_metadata, Costs* cost) const override;
 
   const VirtualScheduler* GetScheduler() const { return scheduler_.get(); }
 
@@ -70,8 +65,6 @@ class AnalyticalCostEstimator : public CostEstimator {
   std::unique_ptr<ReadyNodeManager> node_manager_;
   bool use_static_shapes_;
   std::unique_ptr<VirtualScheduler> scheduler_;
-
-  RunMetadata* run_metadata_;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc b/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc
index a9a1abfa989c9d8276b6ae263b95e7a71be41c8a..fdc6b79c8298b7051d1c9247431c151827288720 100644
--- a/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc
+++ b/tensorflow/core/grappler/costs/analytical_cost_estimator_test.cc
@@ -16,7 +16,6 @@ limitations under the License.
 #include "tensorflow/core/grappler/costs/virtual_scheduler.h"
 
 #include "tensorflow/cc/ops/standard_ops.h"
-#include "tensorflow/core/framework/cost_graph.pb.h"
 #include "tensorflow/core/grappler/clusters/virtual_cluster.h"
 #include "tensorflow/core/grappler/costs/analytical_cost_estimator.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -98,11 +97,11 @@ TEST_F(AnalyticalCostEstimatorTest, SimpleTest) {
   AnalyticalCostEstimator estimator(cluster_.get(), true);
   TF_ASSERT_OK(estimator.Initialize(item));
 
-  CostGraphDef cost_graph;
+  RunMetadata run_metadata;
   Costs summary;
-  TF_ASSERT_OK(estimator.PredictCosts(item.graph, &cost_graph, &summary));
+  TF_ASSERT_OK(estimator.PredictCosts(item.graph, &run_metadata, &summary));
 
-  EXPECT_EQ(Costs::NanoSeconds(9151), summary.execution_time);
+  EXPECT_EQ(Costs::NanoSeconds(9157), summary.execution_time);
   // Note there are totally 17 nodes (RandomUniform creates 2 nodes), but
   // grappler will not process "label", therefore we have 15 here instead
   EXPECT_EQ(15, summary.num_ops_total);
diff --git a/tensorflow/core/grappler/costs/cost_estimator.h b/tensorflow/core/grappler/costs/cost_estimator.h
index e3b3a36b096da807d05bee50d52a7a5c37884b52..d85ae0b77f923e9c7678eb9d8dd0a9f128ac5846 100644
--- a/tensorflow/core/grappler/costs/cost_estimator.h
+++ b/tensorflow/core/grappler/costs/cost_estimator.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <cmath>
 #include <unordered_map>
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/protobuf/config.pb.h"
 
 namespace tensorflow {
 class GraphDef;
@@ -215,14 +216,14 @@ class CostEstimator {
 
   // Predicts the cost of running the given optimized version of the grappler
   // item.
-  // If a CostGraphDef is passed, it will be populated with detailed information
+  // If a RunMetadata is passed, it will be populated with detailed information
   // about the cost of running each operation of the optimized graph.
   // if a double value is passed, it will be set to a value that reflects the
   // overall cost of running the graph (e.g. the latency of the computation).
   // Returns a status that indicate is the performance could be estimated or
   // not.
   virtual Status PredictCosts(const GraphDef& optimized_graph,
-                              CostGraphDef* cost_graph, Costs* cost) const = 0;
+                              RunMetadata* run_metadata, Costs* cost) const = 0;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index 1df26d94d1fe1ed35765291da6c7d2eae513e713..6907988d08f507b8bc4c4e4c3560f06973a0aed3 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -15,17 +15,12 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 
-#include <limits>
-#include <list>
-#include <queue>
-#include <unordered_map>
-#include <unordered_set>
-#include "absl/memory/memory.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/graph_constructor.h"
@@ -33,19 +28,24 @@ limitations under the License.
 #include "tensorflow/core/grappler/costs/utils.h"
 #include "tensorflow/core/grappler/mutable_graph_view.h"
 #include "tensorflow/core/grappler/op_types.h"
+#include "tensorflow/core/grappler/optimizers/evaluation_utils.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/functions.h"
 #include "tensorflow/core/grappler/utils/topological_sort.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 
 namespace tensorflow {
 namespace grappler {
+
 namespace {
 
 using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
 using shape_inference::ShapeAndType;
 using shape_inference::ShapeHandle;
+using TensorVector = gtl::InlinedVector<TensorValue, 4>;
 
 template <typename Handle>
 struct HashHandle {
@@ -410,6 +410,7 @@ NodeDef MakeConstNodeDefFromShape(InferenceContext* ic,
   return MakeConstNodeDefFromTensorProto(
       ic, MakeTensorProtoFromShape(ic, shape, tensor_as_shape, dtype), dtype);
 }
+
 }  // namespace
 
 // Queue of nodes to process. Nodes can be enqueued in any order, but will be
@@ -419,9 +420,11 @@ NodeDef MakeConstNodeDefFromShape(InferenceContext* ic,
 // information is refined.
 class TopoQueue {
  public:
-  explicit TopoQueue(const std::unordered_map<const NodeDef*, int>& topo_order)
-      : topo_order_(topo_order) {}
+  explicit TopoQueue(const std::vector<const NodeDef*>& topo_order)
+      : topo_order_(TopoOrder(topo_order)) {}
+
   void push(const NodeDef* n) { queue_.emplace(n, topo_order_.at(n)); }
+
   const NodeDef* pop() {
     CHECK(!empty());
     auto it = queue_.begin();
@@ -442,10 +445,99 @@ class TopoQueue {
       return lhs.second < rhs.second;
     }
   };
-  const std::unordered_map<const NodeDef*, int>& topo_order_;
+
+  const std::unordered_map<const NodeDef*, int> TopoOrder(
+      const std::vector<const NodeDef*>& topo_order) const {
+    std::unordered_map<const NodeDef*, int> map;
+    map.reserve(topo_order.size());
+    for (int i = 0; i < topo_order.size(); ++i) {
+      map.emplace(topo_order[i], i);
+    }
+    return map;
+  }
+
+  const std::unordered_map<const NodeDef*, int> topo_order_;
   std::set<NodeAndId, OrderByIdAscending> queue_;
 };
 
+bool IsNumericType(const DataType dtype) {
+  static const gtl::FlatSet<DataType>* const kRealNumberTypes =
+      CHECK_NOTNULL((new gtl::FlatSet<DataType>{
+          // Floating point.
+          DT_BFLOAT16,
+          DT_HALF,
+          DT_FLOAT,
+          DT_DOUBLE,
+          // Int / UInt.
+          DT_INT8,
+          DT_INT16,
+          DT_INT32,
+          DT_INT64,
+          DT_UINT8,
+          DT_UINT16,
+          DT_UINT32,
+          DT_UINT64,
+          // Quantized Int.
+          DT_QINT8,
+          DT_QUINT8,
+          DT_QINT16,
+          DT_QUINT16,
+          DT_QINT32,
+          // Bool.
+          DT_BOOL,
+      }));
+  return kRealNumberTypes->find(dtype) != kRealNumberTypes->end();
+}
+
+bool IsWhiteListedOpTypeForEvaluateNode(const string& op_type) {
+  static const gtl::FlatSet<string>* const kOpTpeWhitelist =
+      CHECK_NOTNULL((new gtl::FlatSet<string>{
+          // Unary arithmetic ops
+          "Floor",
+          "Round",
+          "Sqrt",
+          "Square",
+          "Sign",
+          // Binary arithmetic ops
+          "Add",
+          "Div",
+          "FloorDiv",
+          "FloorMod",
+          "Greater",
+          "GreaterEqual",
+          "Less",
+          "LessEqual",
+          "LogicalAnd",
+          "LogicalNot",
+          "LogicalOr",
+          "Maximum",
+          "Minimum",
+          "Mod",
+          "Mul",
+          "NotEqual",
+          "QuantizedAdd",
+          "QuantizedMul",
+          "SquareDifference",
+          "Sub",
+          "TruncateDiv",
+          "TruncateMod",
+          "RealDiv",
+          // N-ary arithemtic ops
+          "AddN",
+          // Others
+          "StridedSlice",
+          "OnesLike",
+          "ZerosLike",
+          "Concat",
+          "ConcatV2",
+          "Split",
+          "Range",
+          "Fill",
+          "Cast",
+      }));
+  return kOpTpeWhitelist->find(op_type) != kOpTpeWhitelist->end();
+}
+
 // Processes symbolic shapes.
 // Each symbolic shape or dimension is represented by a handle. Unlike the TF
 // shape refiner which creates new handles every time it processes an unknown
@@ -455,10 +547,12 @@ class SymbolicShapeRefiner {
  public:
   explicit SymbolicShapeRefiner(
       const GraphView& graph,
-      const std::unordered_map<string, std::unordered_set<int>>& fed_ports)
+      const std::unordered_map<string, std::unordered_set<int>>& fed_ports,
+      const bool aggressive_shape_inference)
       : graph_(graph),
         function_library_(OpRegistry::Global(), graph.graph()->library()),
-        fed_ports_(fed_ports) {
+        fed_ports_(fed_ports),
+        aggressive_shape_inference_(aggressive_shape_inference) {
     graph_def_version_ = graph.graph()->versions().producer();
     node_to_context_.reserve(graph.graph()->node_size());
   }
@@ -538,8 +632,8 @@ class SymbolicShapeRefiner {
                                           " was not found in the graph.");
       }
 
-      InferenceContext* input_inference_context = GetContext(input_node);
-      if (input_inference_context == nullptr) {
+      InferenceContext* input_ic = GetContext(input_node);
+      if (input_ic == nullptr) {
         return errors::FailedPrecondition(
             "Inference context has not been created for ", input_tensor.node());
       }
@@ -547,8 +641,8 @@ class SymbolicShapeRefiner {
       int output_port_num = input_tensor.index();
       AttrValue attr_output_shape;
       TensorShapeProto proto;
-      const auto& handle = input_inference_context->output(output_port_num);
-      input_inference_context->ShapeHandleToProto(handle, &proto);
+      const auto& handle = input_ic->output(output_port_num);
+      input_ic->ShapeHandleToProto(handle, &proto);
       // There may be dim.size < -1 in SymbolicShapeRefiner. Change those to -1.
       for (int i = 0; i < proto.dim_size(); i++) {
         if (proto.dim(i).size() < -1) {
@@ -600,7 +694,7 @@ class SymbolicShapeRefiner {
     ctx->output_tensor_protos.resize(grappler_function_item.output_size(),
                                      nullptr);
     for (auto const& out_arg : grappler_function_item.outputs()) {
-      if (out_arg.output_tensors.size() > 1) {
+      if (out_arg.output_nodes.size() > 1) {
         // TODO(jmdecker): Handle case of multiple output tensors
         return errors::Unimplemented(
             "Output arguments with multiple output tensors are not yet "
@@ -609,7 +703,7 @@ class SymbolicShapeRefiner {
 
       // It is guaranteed that output_tensors does not contain any control
       // inputs, so port_id >= 0.
-      TensorId out_tensor = ParseTensorName(out_arg.output_tensors[0]);
+      TensorId out_tensor = ParseTensorName(out_arg.output_nodes[0]);
 
       const NodeDef* retnode = gv.GetNode(out_tensor.node());
       if (retnode == nullptr) {
@@ -646,145 +740,132 @@ class SymbolicShapeRefiner {
     return Status::OK();
   }
 
+  // Prepares input shapes/values/handles, then runs shape inference, and
+  // finally sets output shapes/values/handles.
   Status UpdateNode(const NodeDef* node, bool* refined) {
-    NodeContext* node_context = GetNodeContext(node);
-    if (node_context == nullptr) {
+    NodeContext* ctx = GetNodeContext(node);
+    if (ctx == nullptr) {
       TF_RETURN_IF_ERROR(AddNode(node));
-      node_context = CHECK_NOTNULL(GetNodeContext(node));
+      ctx = CHECK_NOTNULL(GetNodeContext(node));
       *refined = true;
     }
 
     // Check if the shapes of the nodes in the fan-in of this node have changed,
     // and if they have, update the node input shapes.
-    InferenceContext* inference_context = node_context->inference_context.get();
-    std::vector<Tensor> const_values(inference_context->num_inputs());
-    std::vector<const Tensor*> input_tensors(inference_context->num_inputs(),
-                                             nullptr);
-    std::vector<ShapeHandle> input_tensors_as_shapes(
-        inference_context->num_inputs());
-    node_context->input_tensor_protos.resize(inference_context->num_inputs(),
-                                             nullptr);
-
-    for (int dst_input = 0; dst_input < inference_context->num_inputs();
-         ++dst_input) {
-      GraphView::InputPort port(node, dst_input);
-      for (const GraphView::OutputPort fanin : graph_.GetFanin(port)) {
-        int src_output = fanin.port_id;
-        const NodeDef* input = fanin.node;
-        NodeContext* c = GetNodeContext(input);
-        if (c == nullptr) {
-          return errors::FailedPrecondition(
-              "Input ", dst_input, " ('", input->name(), "') for '",
-              node->name(),
-              "' was not previously added to SymbolicShapeRefiner.");
-        }
+    InferenceContext* ic = ctx->inference_context.get();
+    std::vector<Tensor> const_values(ic->num_inputs());
+    std::vector<const Tensor*> input_tensors(ic->num_inputs(), nullptr);
+    std::vector<ShapeHandle> input_tensors_as_shapes(ic->num_inputs());
+    ctx->input_tensor_protos.resize(ic->num_inputs(), nullptr);
+
+    for (int dst_input = 0; dst_input < ic->num_inputs(); ++dst_input) {
+      const GraphView::InputPort port(node, dst_input);
+      const GraphView::OutputPort fanin = graph_.GetRegularFanin(port);
+      int src_output = fanin.port_id;
+      const NodeDef* src = fanin.node;
+      NodeContext* src_ctx = GetNodeContext(src);
+      InferenceContext* src_ic = src_ctx->inference_context.get();
+      if (src_ctx == nullptr) {
+        return errors::FailedPrecondition(
+            "Input ", dst_input, " ('", src->name(), "') for '", node->name(),
+            "' was not previously added to SymbolicShapeRefiner.");
+      }
 
-        if (src_output >= c->inference_context->num_outputs())
-          return errors::OutOfRange("src_output = ", src_output,
-                                    ", but num_outputs is only ",
-                                    c->inference_context->num_outputs());
+      if (src_output >= src_ic->num_outputs()) {
+        return errors::OutOfRange("src_output = ", src_output,
+                                  ", but num_outputs is only ",
+                                  src_ic->num_outputs());
+      }
 
-        // Propagate input node's NodeContext info to the current node's
-        // NodeContext:
-        // output_tensor_protos to input_tensor_protos and input_tensors, and
-        // output_tensors_as_shapes to input_tensors_as_shapes.
+      // Propagate input node's NodeContext info to the current node's
+      // NodeContext:
+      // output_tensor_protos to input_tensor_protos and input_tensors, and
+      // output_tensors_as_shapes to input_tensors_as_shapes.
 
-        if (c->output_tensors_as_shapes.size() > src_output) {
-          input_tensors_as_shapes[dst_input] =
-              c->output_tensors_as_shapes[src_output];
-        }
+      if (src_ctx->output_tensors_as_shapes.size() > src_output) {
+        input_tensors_as_shapes[dst_input] =
+            src_ctx->output_tensors_as_shapes[src_output];
+      }
 
-        if (c->output_tensor_protos.size() > src_output) {
-          auto* tensor_proto = c->output_tensor_protos[src_output];
-          if (tensor_proto != nullptr &&
-              const_values[dst_input].FromProto(*tensor_proto)) {
-            input_tensors[dst_input] = &const_values[dst_input];
-            node_context->input_tensor_protos[dst_input] = tensor_proto;
-
-            if (!inference_context->FullyDefined(
-                    input_tensors_as_shapes[dst_input])) {
-              // Shape from a Const is not fully defined when the Const has
-              // value -1 (e.g., Reshape(x, Const(-1)) to reshape an arbitrary
-              // tensor x to a vector).
-              // It's possible that the same Const with -1 is used in many
-              // places, but that doesn't mean the resultant shapes are
-              // identical. e.g., x1 = Reshape(x, c) and y1 = Reshape(y, c),
-              // where c is -1. In this case, shape inference yields both x1 and
-              // y1 as rank 1, size unknown, but still the shapes of x1 and y1
-              // can be different. (even if we use different Const(-1) for x1
-              // and x2, graph optimzier may merge them to single Const through
-              // duplicate removal.)
-              // If we reuse output_tensors_as_shapes to input_tensors_as_shapes
-              // by copying ShapeHandle, they share the same Shape object, and
-              // SymbolicShapeManager, later in InferStatically(), assigns the
-              // same symbolic dim value (unique value < -1); in the above
-              // Reshape example, the shapes of x1 and y1 become, for example,
-              // [-278] and graph optimizer may yield incorrect output 'cause it
-              // assumes x1 and y1 have the same shape.
-              // To prevent this, we re-create a ShapeHandle from the Const
-              // tensor, instead of reusing output_tensors_as_shapes (so that
-              // ShapeHandles of the const fanouts have the same values,
-              // but different Shape objects -- SymbolicShapeManager assigns
-              // different symbol id to each fanout shape).
-              // TODO(dyoon): clean up the way values are propagated.
-              MaybeTensorValueToShape(inference_context,
-                                      const_values[dst_input],
-                                      &input_tensors_as_shapes[dst_input]);
-            }
+      if (src_ctx->output_tensor_protos.size() > src_output) {
+        auto* tensor_proto = src_ctx->output_tensor_protos[src_output];
+        if (tensor_proto != nullptr &&
+            const_values[dst_input].FromProto(*tensor_proto)) {
+          input_tensors[dst_input] = &const_values[dst_input];
+          ctx->input_tensor_protos[dst_input] = tensor_proto;
+
+          if (!ic->FullyDefined(input_tensors_as_shapes[dst_input])) {
+            // Shape from a Const is not fully defined when the Const has
+            // value -1 (e.g., Reshape(x, Const(-1)) to reshape an arbitrary
+            // tensor x to a vector).
+            // It's possible that the same Const with -1 is used in many
+            // places, but that doesn't mean the resultant shapes are
+            // identical. e.g., x1 = Reshape(x, c) and y1 = Reshape(y, c),
+            // where c is -1. In this case, shape inference yields both x1 and
+            // y1 as rank 1, size unknown, but still the shapes of x1 and y1
+            // can be different. (even if we use different Const(-1) for x1
+            // and x2, graph optimzier may merge them to single Const through
+            // duplicate removal.)
+            // If we reuse output_tensors_as_shapes to input_tensors_as_shapes
+            // by copying ShapeHandle, they share the same Shape object, and
+            // SymbolicShapeManager, later in InferStatically(), assigns the
+            // same symbolic dim value (unique value < -1); in the above
+            // Reshape example, the shapes of x1 and y1 become, for example,
+            // [-278] and graph optimizer may yield incorrect output 'cause it
+            // assumes x1 and y1 have the same shape.
+            // To prevent this, we re-create a ShapeHandle from the Const
+            // tensor, instead of reusing output_tensors_as_shapes (so that
+            // ShapeHandles of the const fanouts have the same values,
+            // but different Shape objects -- SymbolicShapeManager assigns
+            // different symbol id to each fanout shape).
+            // TODO(dyoon): clean up the way values are propagated.
+            MaybeTensorValueToShape(ic, const_values[dst_input],
+                                    &input_tensors_as_shapes[dst_input]);
           }
         }
+      }
 
-        DCHECK_GE(dst_input, 0);
-        // NOTE: we check only shape is refined; we do not (yet) check whether
-        // tensor value is refined.
-        if (!*refined && !inference_context->input(dst_input).SameHandle(
-                             c->inference_context->output(src_output))) {
-          *refined = true;
-        }
-        inference_context->SetInput(dst_input,
-                                    c->inference_context->output(src_output));
-
-        if (!*refined &&
-            inference_context->requested_input_tensor_as_partial_shape(
-                dst_input)) {
-          // The input value may have changed. Since we have no way to know if
-          // that's indeed the case, err on the safe side.
-          *refined = true;
-        }
+      // NOTE: we check only shape is refined; we do not (yet) check whether
+      // tensor value is refined.
+      if (!*refined &&
+          !ic->input(dst_input).SameHandle(src_ic->output(src_output))) {
+        *refined = true;
+      }
+      ic->SetInput(dst_input, src_ic->output(src_output));
 
-        // Also propagate handle shape and dtype of edges which are carrying
-        // resource handles.
-        if (node_context->input_types[dst_input] == DT_RESOURCE) {
-          auto* outputs =
-              c->inference_context->output_handle_shapes_and_types(src_output);
-          if (!outputs) continue;
-          auto* inputs =
-              inference_context->input_handle_shapes_and_types(dst_input);
-
-          if (!inputs || !EquivalentShapesAndTypes(*outputs, *inputs)) {
-            *refined = true;
-          }
-          inference_context->set_input_handle_shapes_and_types(dst_input,
-                                                               *outputs);
-        }
+      if (!*refined && ic->requested_input_tensor_as_partial_shape(dst_input)) {
+        // The input value may have changed. Since we have no way to know if
+        // that's indeed the case, err on the safe side.
+        *refined = true;
+      }
+
+      // Also propagate handle shape and dtype of edges which are carrying
+      // resource handles.
+      if (ctx->input_types[dst_input] == DT_RESOURCE) {
+        auto* outputs = src_ic->output_handle_shapes_and_types(src_output);
+        if (!outputs) continue;
+        auto* inputs = ic->input_handle_shapes_and_types(dst_input);
+
+        if (!inputs || !EquivalentShapesAndTypes(*outputs, *inputs))
+          *refined = true;
+        ic->set_input_handle_shapes_and_types(dst_input, *outputs);
       }
     }
 
     // Make sure we schedule the fanout of resources (which have no input)
     // whenever the resources are updated.
-    *refined |= inference_context->num_inputs() == 0;
+    *refined |= ic->num_inputs() == 0;
 
     if (!*refined) {
       // No input shape has changed, we're done.
       return Status::OK();
     }
 
-    node_context->inference_context->set_input_tensors(input_tensors);
-    node_context->inference_context->set_input_tensors_as_shapes(
-        input_tensors_as_shapes);
+    ic->set_input_tensors(input_tensors);
+    ic->set_input_tensors_as_shapes(input_tensors_as_shapes);
 
     // Properly handle function nodes.
-    if (node_context->op_data && node_context->op_data->is_function_op) {
+    if (ctx->op_data && ctx->op_data->is_function_op) {
       // TODO(jmdecker): Detect if the input shapes have changed for this
       // function. Note that when we hit a function call node, refined will be
       // true, as the updates to the call node will have changed, even if it's
@@ -801,7 +882,7 @@ class SymbolicShapeRefiner {
     }
 
     // Update the shapes of the outputs.
-    return InferShapes(*node, node_context);
+    return InferShapes(*node, ctx);
   }
 
   Status SetUnknownShape(const NodeDef* node, int output_port) {
@@ -1011,6 +1092,198 @@ class SymbolicShapeRefiner {
     return dim;
   }
 
+  // Returns true if all the output tensors have known values.
+  bool AllOutputValuesKnown(NodeContext* c) {
+    InferenceContext* ic = c->inference_context.get();
+    if (c->output_tensors_as_shapes.size() < ic->num_outputs() &&
+        c->output_tensor_protos.size() < ic->num_outputs()) {
+      return false;
+    } else {
+      // Checks if we can get output value via either output_tensor_proto or
+      // output_tensors_as_shapes.
+      for (int i = 0; i < ic->num_outputs(); i++) {
+        if (c->output_tensor_protos.size() > i &&
+            c->output_tensor_protos[i] != nullptr) {
+          continue;
+        }
+        if (c->output_tensors_as_shapes.size() > i &&
+            ic->FullyDefined(c->output_tensors_as_shapes[i])) {
+          continue;
+        }
+
+        // Unknown for output[i].
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // Returns true if we can infer output tensors' values -- we know values of
+  // all the input tensors.
+  bool AllInputValuesKnown(NodeContext* c) {
+    InferenceContext* ic = c->inference_context.get();
+
+    // Check inputs are fully defined and values are known.
+    for (int i = 0; i < ic->num_inputs(); i++) {
+      const Tensor* tensor = ic->input_tensor(i);
+      // Note that we don't check c->input_tensor_protos[i], as UpdateNode()
+      // already converted it to ic->input_tensor(i);
+      const ShapeHandle& input_tensors_as_shape =
+          ic->input_tensors_as_shapes()[i];
+      // Either input_tensor is valid or input_tensors_as_shape, which has
+      // value of input tensors as shape format, should be fully defined.
+      if (tensor == nullptr && !ic->FullyDefined(input_tensors_as_shape)) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // Returns true if we want to update output values with running EvaluateNode()
+  // for this op, based on op type, data type, and size.
+  bool ShouldUpdateOutputValues(NodeContext* c, int64 max_size) {
+    InferenceContext* ic = c->inference_context.get();
+
+    // Due to the cost of running EvaluateNode(), we limit only to white listed
+    // op types.
+    if (!IsWhiteListedOpTypeForEvaluateNode(c->op_data->op_def.name())) {
+      return false;
+    }
+
+    // Check input dtypes are number types.
+    for (const auto& input_type : c->input_types) {
+      if (!IsNumericType(input_type)) {
+        return false;
+      }
+    }
+
+    // Check output dtypes are number types.
+    for (const auto& output_type : c->output_types) {
+      if (!IsNumericType(output_type)) {
+        return false;
+      }
+    }
+
+    // Check if the number of elements of each of input tensor is no larger than
+    // the given max size.
+    for (int i = 0; i < ic->num_inputs(); i++) {
+      const Tensor* tensor = ic->input_tensor(i);
+      const ShapeHandle& input_shape_handle = ic->input(i);
+      if (tensor != nullptr) {
+        if (tensor->NumElements() > max_size) {
+          return false;
+        }
+      } else if (ic->Value(ic->NumElements(input_shape_handle)) > max_size) {
+        return false;
+      }
+    }
+
+    // Check if we know the shape of each output tensor, and the number of
+    // elements is larger than the given max size.
+    for (int i = 0; i < ic->num_outputs(); i++) {
+      const ShapeHandle& shape_handle = ic->output(i);
+      if (!ic->FullyDefined(shape_handle) ||
+          ic->Value(ic->NumElements(shape_handle)) > max_size) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // Create input tensors from the NodeConext.
+  void CreateInputTensors(NodeContext* c,
+                          std::vector<Tensor>* input_tensor_vector,
+                          TensorVector* inputs) {
+    InferenceContext* ic = c->inference_context.get();
+    for (int i = 0; i < ic->num_inputs(); i++) {
+      if (ic->input_tensor(i)) {
+        input_tensor_vector->at(i) = *ic->input_tensor(i);
+        inputs->emplace_back(&input_tensor_vector->at(i));
+        // Note that we don't check c->input_tensor_protos[i], as UpdateNode()
+        // already converted it to ic->input_tensor(i);
+      } else {
+        // Create Tensor from input_tensors_as_shapes, and then emplace it
+        // back to inputs.
+        // Note that input_tensors_as_shapes is scalar or vector.
+        const ShapeHandle& shape_handle = ic->input_tensors_as_shapes()[i];
+        const DataType& data_type = c->input_types[i];
+        int32 rank = ic->Rank(shape_handle);
+        if (rank < 1) {
+          input_tensor_vector->emplace_back(Tensor(data_type, {}));
+        } else {
+          input_tensor_vector->emplace_back(Tensor(data_type, {rank}));
+        }
+        auto* tensor = &input_tensor_vector->back();
+        if (data_type == DT_INT32) {
+          auto flat = tensor->flat<int32>();
+          for (int j = 0; j < rank; j++) {
+            int32 dim = ic->Value(ic->Dim(shape_handle, j));
+            flat(j) = dim;
+          }
+        } else {
+          auto flat = tensor->flat<int64>();
+          for (int j = 0; j < rank; j++) {
+            int64 dim = ic->Value(ic->Dim(shape_handle, j));
+            flat(j) = dim;
+          }
+        }
+        inputs->emplace_back(tensor);
+      }
+    }
+  }
+
+  // Run a node to infer output values, and add it to the NodeContext.
+  Status UpdateOutputValues(const NodeDef& node, NodeContext* c) {
+    InferenceContext* ic = c->inference_context.get();
+
+    // Input to EvaluateNode()
+    TensorVector inputs;
+    // Container for temporaily created tensor object.
+    std::vector<Tensor> input_tensor_vector(ic->num_inputs());
+    CreateInputTensors(c, &input_tensor_vector, &inputs);
+
+    // Output for EvaluateNode() and output tensor clean up object.
+    TensorVector outputs;
+    auto outputs_cleanup = gtl::MakeCleanup([&outputs] {
+      for (const auto& output : outputs) {
+        if (output.tensor) {
+          delete output.tensor;
+        }
+      }
+    });
+
+    TF_RETURN_IF_ERROR(EvaluateNode(node, inputs, /*cpu_device=*/nullptr,
+                                    &resource_mgr_, &outputs));
+    c->output_tensors_as_shapes.resize(outputs.size());
+    c->output_tensor_protos.resize(outputs.size(), nullptr);
+    for (int k = 0; k < outputs.size(); k++) {
+      const auto& t = outputs[k];
+      // Override output shape.
+      ShapeHandle output_shape;
+      TF_RETURN_IF_ERROR(
+          ic->MakeShapeFromTensorShape(t->shape(), &output_shape));
+      if (ic->FullyDefined(ic->output(k)) &&
+          !EquivalentShapes(ic->output(k), output_shape)) {
+        LOG(WARNING) << "UpdateOutputValues() -- node: " << node.name()
+                     << ", inferred output shape "
+                     << "doesn't match for k=" << k << ": "
+                     << "ic->output(k): " << ic->DebugString(ic->output(k))
+                     << ", output_shape: " << ic->DebugString(output_shape)
+                     << " -- " << node.DebugString();
+      }
+      ic->set_output(k, output_shape);
+      // Set output_tensors_as_shape.
+      MaybeTensorValueToShape(ic, *t.tensor, &c->output_tensors_as_shapes[k]);
+
+      // Set output_tensor_protos.
+      TensorProto tensor_proto;
+      t->AsProtoTensorContent(&tensor_proto);
+      const_tensors_to_propagate_.push_back(tensor_proto);
+      c->output_tensor_protos[k] = &const_tensors_to_propagate_.back();
+    }
+    return Status::OK();
+  }
+
   Status MaybeUpdateNodeContextOutput(const NodeDef& node, const bool is_fed,
                                       NodeContext* c) {
     // Propagate tensors and shape tensors unless the node is fed.
@@ -1041,7 +1314,7 @@ class SymbolicShapeRefiner {
           // Propagate size value.
           int64 sz = ic->Value(size);
           bool valid = false;
-          if (node.attr().at("T").type() == DT_INT32) {
+          if (node.attr().at("out_type").type() == DT_INT32) {
             if (sz < std::numeric_limits<int32>::max()) {
               const_tensors_to_propagate_.push_back(
                   MakeIntegerScalarTensorProto(DT_INT32, sz));
@@ -1201,6 +1474,19 @@ class SymbolicShapeRefiner {
         }
       }
     }
+
+    if (aggressive_shape_inference_) {
+      // Update output tensor values using EvaluateNode() if we can.
+      // Due to the cost of EvaluateNode(), we run it only for certain op types
+      // (white listed) and small integer tensors.
+
+      const int max_element_size = 17;  // Max up to 4x4 matrix or similar.
+      if (AllOutputValuesKnown(c) || !AllInputValuesKnown(c) ||
+          !ShouldUpdateOutputValues(c, max_element_size)) {
+        return Status::OK();
+      }
+      UpdateOutputValues(node, c).IgnoreError();  // This is optional.
+    }
     return Status::OK();
   }
 
@@ -1327,6 +1613,10 @@ class SymbolicShapeRefiner {
   // may resize and copy the objects into a new buffer, then the existing
   // pointers become dangling pointers.
   std::list<TensorProto> const_tensors_to_propagate_;
+
+  // For more aggressive shape and value inference.
+  bool aggressive_shape_inference_;
+  ResourceMgr resource_mgr_;
 };
 
 // Keep track of shapes and dimensions in a graph.
@@ -1403,45 +1693,52 @@ Status GraphProperties::RelaxEnqueueShapesAndMergeTypes(
 
 // Compute the output shape of the merge node as the union of the available
 // input shapes.
-Status GraphProperties::UpdateMergeNode(SymbolicShapeRefiner* shape_refiner,
-                                        const NodeDef* node,
-                                        bool* new_shapes) const {
-  InferenceContext* c = shape_refiner->GetContext(node);
-  if (!c) {
+Status GraphProperties::UpdateMerge(SymbolicShapeRefiner* shape_refiner,
+                                    const NodeDef* node,
+                                    bool* new_shapes) const {
+  InferenceContext* ic = shape_refiner->GetContext(node);
+  if (!ic) {
     // Now we can run shape inference
     TF_RETURN_IF_ERROR(shape_refiner->AddNode(node));
-    c = CHECK_NOTNULL(shape_refiner->GetContext(node));
+    ic = CHECK_NOTNULL(shape_refiner->GetContext(node));
     *new_shapes = true;
 
     // Infer the shape of the second output once and for all since it never
     // changes.
-    ShapeHandle out1 = c->Scalar();
-    c->set_output(1, out1);
+    ShapeHandle out1 = ic->Scalar();
+    ic->set_output(1, out1);
   }
 
   ShapeHandle out;
+  const std::vector<ShapeAndType>* out_handle = nullptr;
   bool out_initialized = false;
-  for (const GraphView::Edge fanin :
-       shape_refiner->graph().GetFaninEdges(*node, false)) {
-    InferenceContext* in = shape_refiner->GetContext(fanin.src.node);
-    if (!in) {
+  for (const GraphView::Edge fanin : shape_refiner->graph().GetFaninEdges(
+           *node, /*include_controlling_edges=*/false)) {
+    InferenceContext* src_ic = shape_refiner->GetContext(fanin.src.node);
+    if (!src_ic) {
       // Handling a loop for the first time, the back edge won't have any shape
       // info.
       continue;
     }
-    ShapeHandle input = in->output(fanin.src.port_id);
-    CHECK_EQ(fanin.dst.node, node);
-    c->SetInput(fanin.dst.port_id, input);
+    ShapeHandle input = src_ic->output(fanin.src.port_id);
+    ic->SetInput(fanin.dst.port_id, input);
+    auto* input_handle =
+        src_ic->output_handle_shapes_and_types(fanin.src.port_id);
+    if (input_handle)
+      ic->set_input_handle_shapes_and_types(fanin.dst.port_id, *input_handle);
     if (!out_initialized) {
       out_initialized = true;
       out = input;
-      continue;
+      out_handle = input_handle;
+    } else {
+      // Note here only out, not out_handle, is modified.
+      out = shape_refiner->OutputAsUnion(node, 0, input, out);
     }
-    out = shape_refiner->OutputAsUnion(node, 0, input, out);
   }
 
-  if (*new_shapes || !shape_refiner->EquivalentShapes(out, c->output(0))) {
-    c->set_output(0, out);
+  if (*new_shapes || !shape_refiner->EquivalentShapes(out, ic->output(0))) {
+    ic->set_output(0, out);
+    if (out_handle) ic->set_output_handle_shapes_and_types(0, *out_handle);
     *new_shapes = true;
   }
 
@@ -1451,26 +1748,26 @@ Status GraphProperties::UpdateMergeNode(SymbolicShapeRefiner* shape_refiner,
 // Manually propagate the input shape for Enter nodes.
 Status GraphProperties::UpdateEnter(SymbolicShapeRefiner* shape_refiner,
                                     const NodeDef* node, bool* new_shapes) {
-  auto enter_ctx = shape_refiner->GetContext(node);
-  if (!enter_ctx) {
+  InferenceContext* ic = shape_refiner->GetContext(node);
+  if (!ic) {
     TF_RETURN_IF_ERROR(shape_refiner->UpdateNode(node, new_shapes));
-    enter_ctx = shape_refiner->GetContext(node);
+    ic = shape_refiner->GetContext(node);
   }
 
-  GraphView::InputPort inp(node, 0);
-  GraphView::OutputPort fanin = shape_refiner->graph().GetRegularFanin(inp);
+  GraphView::InputPort port(node, 0);
+  GraphView::OutputPort fanin = shape_refiner->graph().GetRegularFanin(port);
 
-  InferenceContext* in = shape_refiner->GetContext(fanin.node);
-  ShapeHandle input = in->output(fanin.port_id);
-  if (!enter_ctx->output(0).SameHandle(input)) {
-    enter_ctx->SetInput(0, input);
-    enter_ctx->set_output(0, input);
+  InferenceContext* src_ic = shape_refiner->GetContext(fanin.node);
+  ShapeHandle input = src_ic->output(fanin.port_id);
+  if (!ic->output(0).SameHandle(input)) {
+    ic->SetInput(0, input);
+    ic->set_output(0, input);
     *new_shapes = true;
   }
-  auto* outputs = in->output_handle_shapes_and_types(fanin.port_id);
+  auto* outputs = src_ic->output_handle_shapes_and_types(fanin.port_id);
   if (outputs) {
-    enter_ctx->set_input_handle_shapes_and_types(0, *outputs);
-    enter_ctx->set_output_handle_shapes_and_types(0, *outputs);
+    ic->set_input_handle_shapes_and_types(0, *outputs);
+    ic->set_output_handle_shapes_and_types(0, *outputs);
     *new_shapes = true;
   }
   return Status::OK();
@@ -1486,7 +1783,7 @@ Status GraphProperties::UpdateShapes(
     TF_RETURN_IF_ERROR(UpdateEnter(shape_refiner, n, new_shapes));
   } else if (IsMerge(*n)) {
     // Properly handle merge nodes.
-    TF_RETURN_IF_ERROR(UpdateMergeNode(shape_refiner, n, new_shapes));
+    TF_RETURN_IF_ERROR(UpdateMerge(shape_refiner, n, new_shapes));
   } else if (IsEnqueue(*n)) {
     // Make sure the shapes of enqueued tensors are propagated to the queue
     // itself.
@@ -1533,8 +1830,8 @@ Status GraphProperties::PropagateShapes(
       TF_RETURN_IF_ERROR(
           UpdateShapes(shape_refiner, resource_handles, n, &updated));
       if (updated) {
-        for (const GraphView::InputPort& fanout :
-             shape_refiner->graph().GetFanouts(*n, false)) {
+        for (const auto& fanout : shape_refiner->graph().GetFanouts(
+                 *n, /*include_controlled_nodes=*/false)) {
           new_shapes->push(fanout.node);
         }
         // Make sure the corresponding queue nodes are (re)processed.
@@ -1559,7 +1856,7 @@ Status GraphProperties::PropagateShapes(
 Status GraphProperties::UpdateQueue(const NodeDef* queue_node,
                                     SymbolicShapeRefiner* shape_refiner,
                                     bool* new_shapes) {
-  auto ctx = shape_refiner->GetNodeContext(queue_node);
+  auto* ctx = shape_refiner->GetNodeContext(queue_node);
   if (!ctx) {
     TF_RETURN_IF_ERROR(shape_refiner->AddNode(queue_node));
     ctx = CHECK_NOTNULL(shape_refiner->GetNodeContext(queue_node));
@@ -1650,7 +1947,8 @@ Status GraphProperties::UpdateEnqueue(
   return Status::OK();
 }
 
-Status GraphProperties::InferStatically(bool assume_valid_feeds) {
+Status GraphProperties::InferStatically(bool assume_valid_feeds,
+                                        bool aggressive_shape_inference) {
   FunctionLibraryDefinition function_library(OpRegistry::Global(),
                                              item_.graph.library());
   std::unordered_map<string, std::unordered_set<int>> fed_ports;
@@ -1709,7 +2007,7 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) {
   }
 
   std::unordered_map<const NodeDef*, const NodeDef*> resource_handles;
-  std::vector<std::pair<const NodeDef*, const NodeDef*>> extra_deps;
+  std::vector<TopologicalDependency> extra_deps;
   for (const auto& resource : resources) {
     for (const NodeDef* src : resource.second.first) {
       resource_handles[src] = resource.first;
@@ -1721,8 +2019,8 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) {
     }
   }
 
-  std::unordered_map<const NodeDef*, int> topo_order;
-  Status s = ComputeTopologicalOrder(item_.graph, &topo_order, &extra_deps);
+  std::vector<const NodeDef*> topo_order;
+  Status s = ComputeTopologicalOrder(item_.graph, extra_deps, &topo_order);
   if (!s.ok()) {
     if (extra_deps.empty()) {
       return s;
@@ -1731,12 +2029,12 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) {
       // order. This will make the shape inference less precise but since this
       // isn't common it's not worth to figure out where to break the loop and
       // do a proper relaxation.
-      TF_RETURN_IF_ERROR(
-          ComputeTopologicalOrder(item_.graph, &topo_order, nullptr));
+      TF_RETURN_IF_ERROR(ComputeTopologicalOrder(item_.graph, &topo_order));
     }
   }
 
-  SymbolicShapeRefiner refiner(graph_view, fed_ports);
+  SymbolicShapeRefiner refiner(graph_view, fed_ports,
+                               aggressive_shape_inference);
 
   TopoQueue new_shapes(topo_order);
   // Also seed the propagation of shapes in the fanout of primary inputs.
diff --git a/tensorflow/core/grappler/costs/graph_properties.h b/tensorflow/core/grappler/costs/graph_properties.h
index fbae1ca5b437c1d73c38da3ef580a9e49e8c84c5..3fcad6eb1b17e0c0239c5daf17bfcf717b5e3305 100644
--- a/tensorflow/core/grappler/costs/graph_properties.h
+++ b/tensorflow/core/grappler/costs/graph_properties.h
@@ -46,7 +46,16 @@ class GraphProperties {
   // However, it can help infer shapes in the fanout of fed nodes (even though
   // the correctness of these shapes can't be guaranteed), so in some cases
   // (such as simulation or scheduling) it makes sense of keep these shapes.
-  Status InferStatically(bool assume_valid_feeds);
+  // aggressive_shape_inference option executes nodes on the host to identify
+  // output values when possible and does other aggressive strategies.
+  // Similar to assuming_valid_feeds, this may cause incorrectness in graph
+  // analyses, but is useful for simulation or scheduling.
+  Status InferStatically(bool assume_valid_feeds,
+                         bool aggressive_shape_inference);
+  Status InferStatically(bool assume_valid_feeds) {
+    return InferStatically(assume_valid_feeds,
+                           /*aggressive_shape_inference=*/false);
+  }
   // Infer the shape by running the graph on the specified cluster and recording
   // the shapes of the processed tensors.
   Status InferDynamically(Cluster* cluster);
@@ -102,8 +111,8 @@ class GraphProperties {
 
   // Update the output shapes of a Merge node, and enqueue its fanout in
   // new_shapes if needed.
-  Status UpdateMergeNode(SymbolicShapeRefiner* shape_refiner,
-                         const NodeDef* node, bool* new_shapes) const;
+  Status UpdateMerge(SymbolicShapeRefiner* shape_refiner, const NodeDef* node,
+                     bool* new_shapes) const;
   // Process the Enter node, and enqueue its fanout in new_shapes if needed.
   static Status UpdateEnter(SymbolicShapeRefiner* shape_refiner,
                             const NodeDef* node, bool* new_shapes);
diff --git a/tensorflow/core/grappler/costs/graph_properties_test.cc b/tensorflow/core/grappler/costs/graph_properties_test.cc
index 5aae773994c3136b3f41b2ae7934073cbb1daf98..fa6b05bd154b824893e3baf1ec3f64192ab00e4f 100644
--- a/tensorflow/core/grappler/costs/graph_properties_test.cc
+++ b/tensorflow/core/grappler/costs/graph_properties_test.cc
@@ -304,7 +304,6 @@ TEST_F(GraphPropertiesTest, ReadVariableOpAfterEnter) {
                   .Input("Enter", 0, DT_RESOURCE)
                   .Finalize(item.graph.add_node()));
 
-  // LOG(INFO) << item.graph.DebugString();
   GraphProperties properties(item);
   TF_CHECK_OK(properties.InferStatically(false));
   const auto props = properties.GetOutputProperties("ReadVariableOpAfterEnter");
@@ -342,6 +341,44 @@ TEST_F(GraphPropertiesTest, VarHandles) {
   EXPECT_EQ(7, prop.shape().dim(1).size());
 }
 
+TEST_F(GraphPropertiesTest, WhileLoopWithVarHandleOpInput) {
+  // Test graph is first generated in python using:
+  /*
+    i0 = tf.constant(0)
+    v = tf.get_variable(initializer=i0, name='loop_var', use_resource=True)
+    def cond(i, x):
+      return i < 3
+    def body(i, x):
+      return i + 1, x + x
+    v, y = tf.while_loop(cond, body, loop_vars=[v, tf.constant(1)])
+  */
+  // and then modified by hand such that the ReadVariableOp is inside the loop
+  // body instead of outside the while loop (which is the case when constructed
+  // using the python API), such that we have the following pattern: VarHandleOp
+  // -> Enter -> Switch -> ReadVariableOp -> other parts of loop body. Note
+  // DT_RESOURCE is passed all the way until ReadVariableOp.
+  GrapplerItem item;
+  string filename = io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataPath,
+                                 "while_loop_var_handle_op.pbtxt");
+  TF_CHECK_OK(ReadGraphDefFromFile(filename, &item.graph));
+  GraphProperties properties(item);
+  TF_CHECK_OK(properties.InferStatically(false));
+
+  std::vector<string> resource_nodes{
+      "loop_var",       "while/Enter",         "while/Merge", "while/Switch",
+      "while/Identity", "while/NextIteration", "while/Exit"};
+  for (const string& node : resource_nodes) {
+    const auto props = properties.GetOutputProperties(node);
+    EXPECT_GE(props.size(), 1);  // Merge has 2 outputs.
+    EXPECT_EQ("resource: []", PropToString(props[0]));
+  }
+
+  // After ReadVariableOp, the shape should be recovered.
+  const auto props = properties.GetOutputProperties("while/ReadVariableOp");
+  EXPECT_EQ(1, props.size());
+  EXPECT_EQ("int32: []", PropToString(props[0]));
+}
+
 TEST_F(GraphPropertiesTest, QueueWithOnlyDequeue_NoShapeAttr) {
   tensorflow::Scope root = tensorflow::Scope::NewRootScope();
   auto q1 = ops::FIFOQueue(root.WithOpName("Queue1"), {DataType::DT_FLOAT});
@@ -938,6 +975,52 @@ TEST_F(GraphPropertiesTest, IdentityPassingShape) {
   EXPECT_EQ("float: [5,5]", PropToString(out_prop0));
 }
 
+TEST_F(GraphPropertiesTest, SkippingValueInferenceForLargeTensors) {
+  // When using aggressive_shape_inference, we run EvaluateNode() for
+  // whitelisted ops and small input / output tensors. For instance, Fill op is
+  // evaluated and produces output tensor value if output tensor size is smal
+  // (currently, fewer than 17 elements); otherwise we don't run EvalauteNode().
+  // This is to avoid wasting time and memory for producing huge tensors (e.g.,
+  // initializing a large table using Fill.
+  {
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output a = ops::Const(s.WithOpName("a"), 4, {2});  // 4x4
+    Output b = ops::Const(s.WithOpName("const"), 0.1f, {});
+    // Shape described by a is small; expect output values of Fill op.
+    Output c = ops::Fill(s.WithOpName("fill"), a, b);
+
+    GrapplerItem item;
+    TF_CHECK_OK(s.ToGraphDef(&item.graph));
+    GraphProperties properties(item);
+    TF_CHECK_OK(properties.InferStatically(
+        /*assume_valid_feeds=*/false,
+        /*aggressive_shape_inference=*/true));
+    const auto out_props = properties.GetOutputProperties("fill");
+    const OpInfo::TensorProperties out_prop0 = out_props[0];
+    EXPECT_EQ("float: [4,4]", PropToString(out_prop0));
+    EXPECT_TRUE(out_prop0.has_value());
+  }
+  {
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output a = ops::Const(s.WithOpName("a"), 1000, {4});  // 1000x1000x1000x1000
+    Output b = ops::Const(s.WithOpName("const"), 0.1f, {});
+    // Shape described by a is huge; in that case we skip value inference.
+    // Otherwise, it'd be too much overhead.
+    Output c = ops::Fill(s.WithOpName("fill"), a, b);
+
+    GrapplerItem item;
+    TF_CHECK_OK(s.ToGraphDef(&item.graph));
+    GraphProperties properties(item);
+    TF_CHECK_OK(properties.InferStatically(
+        /*assume_valid_feeds=*/false,
+        /*aggressive_shape_inference=*/true));
+    const auto out_props = properties.GetOutputProperties("fill");
+    const OpInfo::TensorProperties out_prop0 = out_props[0];
+    EXPECT_EQ("float: [1000,1000,1000,1000]", PropToString(out_prop0));
+    EXPECT_FALSE(out_prop0.has_value());
+  }
+}
+
 TEST_F(GraphPropertiesTest, PackWithConstInput) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output a = ops::Const(s.WithOpName("a"), 1, {});
@@ -1625,6 +1708,91 @@ TEST_F(GraphPropertiesTest, StridedSlicesOfShapes) {
   EXPECT_EQ(shape_a.dim(1).size(), shape_o2.dim(0).size());
 }
 
+TEST_F(GraphPropertiesTest, StridedSliceOfShapeWithShrinkAxisMask) {
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+  Output placeholder =
+      ops::Placeholder(scope.WithOpName("input_placeholder"), DT_FLOAT,
+                       ops::Placeholder::Shape(TensorShape({5, 480, 40, 1})));
+  auto input_shape = ops::Shape(scope.WithOpName("input_shape"), placeholder);
+
+  Output begin = ops::Const(scope.WithOpName("begin"), {0}, {1});
+  Output end = ops::Const(scope.WithOpName("end"), {3}, {1});
+  Output stride = ops::Const(scope.WithOpName("stride"), {1}, {1});
+
+  Output slice =
+      ops::StridedSlice(scope.WithOpName("slice"), input_shape, begin, end,
+                        stride, ops::StridedSlice::ShrinkAxisMask(1));
+
+  GrapplerItem item;
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+
+  // Without aggresive shape inference, it cannot infer output value of
+  // StridedSlice with ShrinkAxisMask.
+  {
+    GraphProperties properties(item);
+    TF_CHECK_OK(properties.InferStatically(
+        /*assume_valid_feeds=*/false,
+        /*aggressive_shape_inference=*/false));
+    EXPECT_FALSE(properties.GetOutputProperties("slice").at(0).has_value());
+  }
+
+  // InferStatically with aggresive shape inference can infer output value of
+  // StridedSlice with ShrinkAxisMask.
+  {
+    GraphProperties properties(item);
+    TF_CHECK_OK(properties.InferStatically(
+        /*assume_valid_feeds=*/false,
+        /*aggressive_shape_inference=*/true));
+    EXPECT_TRUE(properties.GetOutputProperties("slice").at(0).has_value());
+    const auto slice_value =
+        properties.GetOutputProperties("slice").at(0).value();
+    ExpectTensorValues({5}, slice_value);
+  }
+}
+
+TEST_F(GraphPropertiesTest, ValuePropagationThroughArithmeticOps) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output a = ops::Const(s.WithOpName("a"), {5, 7}, {2});
+  Output b = ops::Const(s.WithOpName("b"), {8, 8}, {2});
+  Output c = ops::Const(s.WithOpName("c"), {2, 2}, {2});
+
+  Output a1 = ops::OnesLike(s.WithOpName("a1"), a);
+  Output a_plus_one = ops::Add(s.WithOpName("a_plus_one"), a, a1);
+  Output a_plus_a = ops::Add(s.WithOpName("a_plus_a"), a, a);
+  Output b_plus_2a = ops::Add(s.WithOpName("b_plus_2a"), b, a_plus_a);
+  Output c_plus_b_plus_2a =
+      ops::Add(s.WithOpName("c_plus_b_plus_2a"), c, b_plus_2a);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  GraphProperties properties(item);
+  TF_CHECK_OK(properties.InferStatically(
+      /*assume_valid_feeds=*/false,
+      /*aggressive_shape_inference=*/true));
+
+  // Check output shapes and values.
+  const auto& a_plus_one_prop = properties.GetOutputProperties("a_plus_one")[0];
+  EXPECT_EQ("int32: [2]", PropToString(a_plus_one_prop));
+  EXPECT_TRUE(a_plus_one_prop.has_value());
+  ExpectTensorValues({6, 8}, a_plus_one_prop.value());
+
+  const auto& a_plus_a_prop = properties.GetOutputProperties("a_plus_a")[0];
+  EXPECT_EQ("int32: [2]", PropToString(a_plus_a_prop));
+  EXPECT_TRUE(a_plus_a_prop.has_value());
+  ExpectTensorValues({10, 14}, a_plus_a_prop.value());
+
+  const auto& b_plus_2a_prop = properties.GetOutputProperties("b_plus_2a")[0];
+  EXPECT_EQ("int32: [2]", PropToString(b_plus_2a_prop));
+  EXPECT_TRUE(b_plus_2a_prop.has_value());
+  ExpectTensorValues({18, 22}, b_plus_2a_prop.value());
+
+  const auto& c_plus_b_plus_2a_prop =
+      properties.GetOutputProperties("c_plus_b_plus_2a")[0];
+  EXPECT_EQ("int32: [2]", PropToString(c_plus_b_plus_2a_prop));
+  EXPECT_TRUE(c_plus_b_plus_2a_prop.has_value());
+  ExpectTensorValues({20, 24}, c_plus_b_plus_2a_prop.value());
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/graph_properties_testdata/large_function_graph.pbtxt b/tensorflow/core/grappler/costs/graph_properties_testdata/large_function_graph.pbtxt
index 415c347a1d2d563099490b780e10008508259027..d4e23e901a46a8524c2b2ef7d2311b9cf48850e7 100644
--- a/tensorflow/core/grappler/costs/graph_properties_testdata/large_function_graph.pbtxt
+++ b/tensorflow/core/grappler/costs/graph_properties_testdata/large_function_graph.pbtxt
@@ -511,6 +511,13 @@ library {
           s: "VALID"
         }
       }
+      attr {
+        key: "explicit_paddings"
+        value {
+          list {
+          }
+        }
+      }
       attr {
         key: "strides"
         value {
diff --git a/tensorflow/core/grappler/costs/graph_properties_testdata/while_loop_var_handle_op.pbtxt b/tensorflow/core/grappler/costs/graph_properties_testdata/while_loop_var_handle_op.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..048b8a23dca5b7c10b1b2e131863e1f0665fdc73
--- /dev/null
+++ b/tensorflow/core/grappler/costs/graph_properties_testdata/while_loop_var_handle_op.pbtxt
@@ -0,0 +1,291 @@
+node {
+  name: "Const"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value { type: DT_INT32 }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {}
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "loop_var"
+  op: "VarHandleOp"
+  attr {
+    key: "_class"
+    value { list { s: "loc:@loop_var" } }
+  }
+  attr {
+    key: "container"
+    value { s: "" }
+  }
+  attr {
+    key: "dtype"
+    value { type: DT_INT32 }
+  }
+  attr {
+    key: "shape"
+    value { shape {} }
+  }
+  attr {
+    key: "shared_name"
+    value { s: "loop_var" }
+  }
+}
+node {
+  name: "Const_1"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value { type: DT_INT32 }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {}
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "while/Enter"
+  op: "Enter"
+  input: "loop_var"
+  attr {
+    key: "T"
+    value { type: DT_RESOURCE }
+  }
+  attr {
+    key: "frame_name"
+    value { s: "while/while_context" }
+  }
+  attr {
+    key: "is_constant"
+    value { b: false }
+  }
+  attr {
+    key: "parallel_iterations"
+    value { i: 10 }
+  }
+}
+node {
+  name: "while/Enter_1"
+  op: "Enter"
+  input: "Const_1"
+  attr {
+    key: "T"
+    value { type: DT_INT32 }
+  }
+  attr {
+    key: "frame_name"
+    value { s: "while/while_context" }
+  }
+  attr {
+    key: "is_constant"
+    value { b: false }
+  }
+  attr {
+    key: "parallel_iterations"
+    value { i: 10 }
+  }
+}
+node {
+  name: "while/Merge"
+  op: "Merge"
+  input: "while/Enter"
+  input: "while/NextIteration"
+  attr {
+    key: "N"
+    value { i: 2 }
+  }
+  attr {
+    key: "T"
+    value { type: DT_RESOURCE }
+  }
+}
+node {
+  name: "while/Merge_1"
+  op: "Merge"
+  input: "while/Enter_1"
+  input: "while/NextIteration_1"
+  attr {
+    key: "N"
+    value { i: 2 }
+  }
+  attr {
+    key: "T"
+    value { type: DT_INT32 }
+  }
+}
+node {
+  name: "while/Less/y"
+  op: "Const"
+  input: "^while/Merge"
+  attr {
+    key: "dtype"
+    value { type: DT_INT32 }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {}
+        int_val: 3
+      }
+    }
+  }
+}
+node {
+  name: "while/Less"
+  op: "Less"
+  input: "while/Merge"
+  input: "while/Less/y"
+  attr {
+    key: "T"
+    value { type: DT_INT32 }
+  }
+}
+node { name: "while/LoopCond" op: "LoopCond" input: "while/Less" }
+node {
+  name: "while/Switch"
+  op: "Switch"
+  input: "while/Merge"
+  input: "while/LoopCond"
+  attr {
+    key: "T"
+    value { type: DT_RESOURCE }
+  }
+  attr {
+    key: "_class"
+    value { list { s: "loc:@while/Merge" } }
+  }
+}
+node {
+  name: "while/Switch_1"
+  op: "Switch"
+  input: "while/Merge_1"
+  input: "while/LoopCond"
+  attr {
+    key: "T"
+    value { type: DT_INT32 }
+  }
+  attr {
+    key: "_class"
+    value { list { s: "loc:@while/Merge_1" } }
+  }
+}
+node {
+  name: "while/Identity"
+  op: "Identity"
+  input: "while/Switch:1"
+  attr {
+    key: "T"
+    value { type: DT_RESOURCE }
+  }
+}
+node {
+  name: "while/ReadVariableOp"
+  op: "ReadVariableOp"
+  input: "while/Identity"
+  attr {
+    key: "dtype"
+    value { type: DT_INT32 }
+  }
+}
+node {
+  name: "while/Identity_1"
+  op: "Identity"
+  input: "while/Switch_1:1"
+  attr {
+    key: "T"
+    value { type: DT_INT32 }
+  }
+}
+node {
+  name: "while/add/y"
+  op: "Const"
+  input: "^while/ReadVariableOp"
+  attr {
+    key: "dtype"
+    value { type: DT_INT32 }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {}
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "while/add"
+  op: "Add"
+  input: "while/ReadVariableOp"
+  input: "while/add/y"
+  attr {
+    key: "T"
+    value { type: DT_INT32 }
+  }
+}
+node {
+  name: "while/add_1"
+  op: "Add"
+  input: "while/Identity_1"
+  input: "while/Identity_1"
+  attr {
+    key: "T"
+    value { type: DT_INT32 }
+  }
+}
+node {
+  name: "while/NextIteration"
+  op: "NextIteration"
+  input: "while/Identity"
+  attr {
+    key: "T"
+    value { type: DT_RESOURCE }
+  }
+}
+node {
+  name: "while/NextIteration_1"
+  op: "NextIteration"
+  input: "while/add_1"
+  attr {
+    key: "T"
+    value { type: DT_INT32 }
+  }
+}
+node {
+  name: "while/Exit"
+  op: "Exit"
+  input: "while/Switch"
+  attr {
+    key: "T"
+    value { type: DT_RESOURCE }
+  }
+}
+node {
+  name: "while/Exit_1"
+  op: "Exit"
+  input: "while/Switch_1"
+  attr {
+    key: "T"
+    value { type: DT_INT32 }
+  }
+}
+versions { producer: 27 }
diff --git a/tensorflow/core/grappler/costs/measuring_cost_estimator.cc b/tensorflow/core/grappler/costs/measuring_cost_estimator.cc
index 833205ac6f12a73d96c93455bb355ee511d6700a..088ce566580c4f23c9927adfb927fbf9afd34017 100644
--- a/tensorflow/core/grappler/costs/measuring_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/measuring_cost_estimator.cc
@@ -51,8 +51,12 @@ Status MeasuringCostEstimator::Initialize(const GrapplerItem& item) {
 }
 
 Status MeasuringCostEstimator::PredictCosts(const GraphDef& optimized_graph,
-                                            CostGraphDef* cost_graph,
+                                            RunMetadata* run_metadata,
                                             Costs* costs) const {
+  CostGraphDef* cost_graph = nullptr;
+  if (run_metadata) {
+    cost_graph = run_metadata->mutable_cost_graph();
+  }
   const bool running_simulation = (cluster_->type() == "virtual");
 
   std::vector<double> times(measurement_steps_);
diff --git a/tensorflow/core/grappler/costs/measuring_cost_estimator.h b/tensorflow/core/grappler/costs/measuring_cost_estimator.h
index 3e741c91997403e7eae438d2dd72c9a70da9316a..67145f5241ef8a5c101d5305889ff5fee823cceb 100644
--- a/tensorflow/core/grappler/costs/measuring_cost_estimator.h
+++ b/tensorflow/core/grappler/costs/measuring_cost_estimator.h
@@ -54,12 +54,12 @@ class MeasuringCostEstimator : public CostEstimator {
   // This implementation always returns OK.
   Status Initialize(const GrapplerItem& item) override;
 
-  // Runs the optimized version of the graph on the cluster, measure
-  // the runtimes of each operation, and annotated the CostGraphDef
-  // with the corresponding measurements.
+  // Runs the optimized version of the graph on the cluster, measures
+  // the runtimes of each operation, and annotates the CostGraphDef of
+  // RunMetadata with the corresponding measurements.
   // Returns the average latency for the whole graph.
-  Status PredictCosts(const GraphDef& optimized_graph, CostGraphDef* cost_graph,
-                      Costs* overall_cost) const override;
+  Status PredictCosts(const GraphDef& optimized_graph,
+                      RunMetadata* run_metadata, Costs* cost) const override;
 
  private:
   Cluster* cluster_;  // Not owned.
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
index 0e55209238555deb88d69ba97fc4df8cb11d3677..11877d87513ec8f2965a0e3a7fc7a11e0a551728 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
@@ -27,7 +27,6 @@ namespace tensorflow {
 namespace grappler {
 
 constexpr int kOpsPerMac = 2;
-constexpr char kConst[] = "Const";
 constexpr char kGuaranteeConst[] = "GuaranteeConst";
 constexpr char kConv2d[] = "Conv2D";
 constexpr char kConv2dBackpropFilter[] = "Conv2DBackpropFilter";
@@ -50,10 +49,9 @@ constexpr char kSqueeze[] = "Squeeze";
 constexpr char kRecv[] = "_Recv";
 constexpr char kSend[] = "_Send";
 constexpr char kBatchMatMul[] = "BatchMatMul";
-constexpr char kVariable[] = "Variable";
-constexpr char kVariableV2[] = "VariableV2";
 constexpr char kRank[] = "Rank";
 constexpr char kShape[] = "Shape";
+constexpr char kShapeN[] = "ShapeN";
 constexpr char kSize[] = "Size";
 constexpr char kStopGradient[] = "StopGradient";
 constexpr char kPreventGradient[] = "PreventGradient";
@@ -66,31 +64,39 @@ constexpr char kAvgPool[] = "AvgPool";
 constexpr char kAvgPoolGrad[] = "AvgPoolGrad";
 constexpr char kFusedBatchNorm[] = "FusedBatchNorm";
 constexpr char kFusedBatchNormGrad[] = "FusedBatchNormGrad";
+constexpr char kQuantizedMatMul[] = "QuantizedMatMul";
 constexpr char kQuantizedMatMulV2[] = "QuantizedMatMulV2";
+// Persistent ops.
+constexpr char kConst[] = "Const";
+constexpr char kVariable[] = "Variable";
+constexpr char kVariableV2[] = "VariableV2";
+constexpr char kAutoReloadVariable[] = "AutoReloadVariable";
+constexpr char kVarHandleOp[] = "VarHandleOp";
+constexpr char kReadVariableOp[] = "ReadVariableOp";
 
 static const Costs::Duration kMinComputeTime(1);
 
 namespace {
 
-string GetDataFormat(const OpInfo& op_features) {
+string GetDataFormat(const OpInfo& op_info) {
   string data_format = "NHWC";  // Default format.
-  if (op_features.attr().find("data_format") != op_features.attr().end()) {
-    data_format = op_features.attr().at("data_format").s();
+  if (op_info.attr().find("data_format") != op_info.attr().end()) {
+    data_format = op_info.attr().at("data_format").s();
   }
   return data_format;
 }
 
-string GetFilterFormat(const OpInfo& op_features) {
+string GetFilterFormat(const OpInfo& op_info) {
   string filter_format = "HWIO";  // Default format.
-  if (op_features.attr().find("filter_format") != op_features.attr().end()) {
-    filter_format = op_features.attr().at("filter_format").s();
+  if (op_info.attr().find("filter_format") != op_info.attr().end()) {
+    filter_format = op_info.attr().at("filter_format").s();
   }
   return filter_format;
 }
 
-Padding GetPadding(const OpInfo& op_features) {
-  if (op_features.attr().find("padding") != op_features.attr().end() &&
-      op_features.attr().at("padding").s() == "VALID") {
+Padding GetPadding(const OpInfo& op_info) {
+  if (op_info.attr().find("padding") != op_info.attr().end() &&
+      op_info.attr().at("padding").s() == "VALID") {
     return Padding::VALID;
   }
   return Padding::SAME;  // Default padding.
@@ -107,11 +113,11 @@ bool IsTraining(const OpInfo& op_info) {
 // TODO(dyoon): support non-4D tensors in the c ost functions of convolution
 // related ops (Conv, Pool, BatchNorm, and their backprops) and the related
 // helper functions.
-std::vector<int64> GetStrides(const OpInfo& op_features) {
-  if (op_features.attr().find("strides") != op_features.attr().end()) {
-    const auto strides = op_features.attr().at("strides").list().i();
-    CHECK(strides.size() == 4) << "Attr strides is not a length-4 vector: "
-                               << op_features.DebugString();
+std::vector<int64> GetStrides(const OpInfo& op_info) {
+  if (op_info.attr().find("strides") != op_info.attr().end()) {
+    const auto strides = op_info.attr().at("strides").list().i();
+    CHECK(strides.size() == 4)
+        << "Attr strides is not a length-4 vector: " << op_info.DebugString();
     return {strides[0], strides[1], strides[2], strides[3]};
   }
   return {1, 1, 1, 1};
@@ -238,6 +244,7 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
       {kMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
       {kSparseMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
       {kBatchMatMul, wrap(&OpLevelCostEstimator::PredictBatchMatMul)},
+      {kQuantizedMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
       {kQuantizedMatMulV2, wrap(&OpLevelCostEstimator::PredictMatMul)},
 
       {kNoOp, wrap(&OpLevelCostEstimator::PredictNoOp)},
@@ -258,12 +265,9 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
       {kRecv, wrap(&OpLevelCostEstimator::PredictIdentity)},
       {kSend, wrap(&OpLevelCostEstimator::PredictIdentity)},
 
-      {kConst, wrap(&OpLevelCostEstimator::PredictVariable)},
-      {kVariable, wrap(&OpLevelCostEstimator::PredictVariable)},
-      {kVariableV2, wrap(&OpLevelCostEstimator::PredictVariable)},
-
       {kRank, wrap(&OpLevelCostEstimator::PredictMetadata)},
       {kShape, wrap(&OpLevelCostEstimator::PredictMetadata)},
+      {kShapeN, wrap(&OpLevelCostEstimator::PredictMetadata)},
       {kSize, wrap(&OpLevelCostEstimator::PredictMetadata)},
       {kMaxPool, wrap(&OpLevelCostEstimator::PredictMaxPool)},
       {kMaxPoolGrad, wrap(&OpLevelCostEstimator::PredictMaxPoolGrad)},
@@ -274,6 +278,11 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
        wrap(&OpLevelCostEstimator::PredictFusedBatchNormGrad)},
   };
 
+  persistent_ops_ = {
+      kConst,       kVariable,       kVariableV2, kAutoReloadVariable,
+      kVarHandleOp, kReadVariableOp,
+  };
+
 #define EIGEN_COST(X) Eigen::internal::functor_traits<Eigen::internal::X>::Cost
 
   // Quantize = apply min and max bounds, multiply by scale factor and round.
@@ -288,6 +297,12 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
       {"Atan", EIGEN_COST(scalar_atan_op<float>)},
       {"Atan2", EIGEN_COST(scalar_quotient_op<float>) +
                     EIGEN_COST(scalar_atan_op<float>)},
+      // For now, we use Eigen cost model for float to int16 cast as an example
+      // case; Eigen cost model is zero when src and dst types are identical,
+      // and it uses AddCost (1) when different. We may implement a separate
+      // cost functions for cast ops, using the actual input and output types.
+      {"Cast", Eigen::internal::functor_traits<
+                   Eigen::internal::scalar_cast_op<float, int16>>::Cost},
       {"Ceil", EIGEN_COST(scalar_ceil_op<float>)},
       {"Cos", EIGEN_COST(scalar_cos_op<float>)},
       {"Dequantize", EIGEN_COST(scalar_product_op<float>)},
@@ -353,23 +368,27 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
 }
 
 Costs OpLevelCostEstimator::PredictCosts(const OpContext& op_context) const {
-  const auto& op_features = op_context.op_info;
-  auto it = device_cost_impl_.find(op_features.op());
-  if (it == device_cost_impl_.end()) {
-    if (elementwise_ops_.find(op_features.op()) != elementwise_ops_.end()) {
-      return PredictCwiseOp(op_context);
-    }
+  const auto& op_info = op_context.op_info;
+  auto it = device_cost_impl_.find(op_info.op());
+  if (it != device_cost_impl_.end()) {
+    std::function<Costs(const OpContext&)> estimator = it->second;
+    Costs costs = estimator(op_context);
+    VLOG(1) << "Operation " << op_info.op() << " takes "
+            << costs.execution_time.count() << " ns.";
+    return costs;
+  }
 
-    VLOG(1) << "Missing accurate estimator for op: " << op_features.op();
+  if (persistent_ops_.find(op_info.op()) != persistent_ops_.end()) {
+    return PredictVariable(op_context);
+  }
 
-    return PredictCostOfAnUnknownOp(op_context);
+  if (elementwise_ops_.find(op_info.op()) != elementwise_ops_.end()) {
+    return PredictCwiseOp(op_context);
   }
 
-  std::function<Costs(const OpContext&)> estimator = it->second;
-  Costs costs = estimator(op_context);
-  VLOG(1) << "Operation " << op_features.op() << " takes "
-          << costs.execution_time.count() << " ns.";
-  return costs;
+  VLOG(1) << "Missing accurate estimator for op: " << op_info.op();
+
+  return PredictCostOfAnUnknownOp(op_context);
 }
 
 DeviceInfo OpLevelCostEstimator::GetDeviceInfo(
@@ -424,39 +443,38 @@ DeviceInfo OpLevelCostEstimator::GetDeviceInfo(
 }
 
 Costs OpLevelCostEstimator::PredictCwiseOp(const OpContext& op_context) const {
-  const auto& op_features = op_context.op_info;
+  const auto& op_info = op_context.op_info;
   bool found_unknown_shapes = false;
   // For unary or binary element-wise operations, op count is the element count
   // of any input. We use the count for the largest input here to be more robust
   // in case that the shape is unknown or partially known for other input.
-  int64 op_count =
-      CalculateLargestInputCount(op_features, &found_unknown_shapes);
+  int64 op_count = CalculateLargestInputCount(op_info, &found_unknown_shapes);
   // If output shape is available, try use the element count calcuated from
   // that.
-  if (op_features.outputs_size() > 0) {
-    op_count =
-        std::max(op_count, CalculateTensorElementCount(op_features.outputs(0),
-                                                       &found_unknown_shapes));
+  if (op_info.outputs_size() > 0) {
+    op_count = std::max(
+        op_count,
+        CalculateTensorElementCount(op_info.outputs(0), &found_unknown_shapes));
   }
   // For binary ops, calculate the output shape possibly resulting from
   // broadcasting.
-  if (op_features.inputs_size() >= 2) {
-    op_count = std::max(op_count,
-                        CwiseOutputElementCount(op_features.inputs(0).shape(),
-                                                op_features.inputs(1).shape()));
+  if (op_info.inputs_size() >= 2) {
+    op_count =
+        std::max(op_count, CwiseOutputElementCount(op_info.inputs(0).shape(),
+                                                   op_info.inputs(1).shape()));
   }
 
   int op_cost = 1;
   bool is_known_elementwise_op = false;
-  auto it = elementwise_ops_.find(op_features.op());
+  auto it = elementwise_ops_.find(op_info.op());
   if (it != elementwise_ops_.end()) {
     op_cost = it->second;
     is_known_elementwise_op = true;
   } else {
-    LOG(WARNING) << "Not a cwise op: " << op_features.op();
+    LOG(WARNING) << "Not a cwise op: " << op_info.op();
   }
 
-  Costs costs = PredictOpCountBasedCost(op_count * op_cost, op_features);
+  Costs costs = PredictOpCountBasedCost(op_count * op_cost, op_info);
   if (found_unknown_shapes || !is_known_elementwise_op) {
     costs.inaccurate = true;
   }
@@ -521,8 +539,10 @@ Costs OpLevelCostEstimator::PredictOpCountBasedCost(
                       device_info.intermediate_write_gb_per_sec)
           : 0;
 
-  Costs::NanoSeconds intermediate_memory_cost(intermediate_read_time +
-                                              intermediate_write_time);
+  Costs::NanoSeconds intermediate_memory_cost =
+      compute_memory_overlap_
+          ? std::max(intermediate_read_time, intermediate_write_time)
+          : (intermediate_read_time + intermediate_write_time);
   VLOG(1) << "Op:" << op_info.op() << " Size (KB):" << (total_io_bytes) / 1e3
           << " Intermediate Memory Time (ns):"
           << intermediate_memory_cost.count();
@@ -536,17 +556,17 @@ Costs OpLevelCostEstimator::PredictOpCountBasedCost(
 }
 
 int64 OpLevelCostEstimator::CountConv2DOperations(
-    const OpInfo& op_features, bool* found_unknown_shapes) const {
-  return CountConv2DOperations(op_features, nullptr, found_unknown_shapes);
+    const OpInfo& op_info, bool* found_unknown_shapes) const {
+  return CountConv2DOperations(op_info, nullptr, found_unknown_shapes);
 }
 
 // Helper to translate the positional arguments into named fields.
 OpLevelCostEstimator::ConvolutionDimensions
 OpLevelCostEstimator::ConvolutionDimensionsFromInputs(
     const TensorShapeProto& original_image_shape,
-    const TensorShapeProto& original_filter_shape, const OpInfo& op_features,
+    const TensorShapeProto& original_filter_shape, const OpInfo& op_info,
     bool* found_unknown_shapes) {
-  VLOG(2) << "op features: " << op_features.DebugString();
+  VLOG(2) << "op features: " << op_info.DebugString();
   VLOG(2) << "Original image shape: " << original_image_shape.DebugString();
   VLOG(2) << "Original filter shape: " << original_filter_shape.DebugString();
   auto image_shape =
@@ -557,7 +577,7 @@ OpLevelCostEstimator::ConvolutionDimensionsFromInputs(
   VLOG(2) << "Filter shape: " << filter_shape.DebugString();
 
   int x_index, y_index, channel_index;
-  const string& data_format = GetDataFormat(op_features);
+  const string& data_format = GetDataFormat(op_info);
   if (data_format == "NCHW") {
     x_index = 2;
     y_index = 3;
@@ -568,7 +588,7 @@ OpLevelCostEstimator::ConvolutionDimensionsFromInputs(
     y_index = 2;
     channel_index = 3;
   }
-  const string& filter_format = GetFilterFormat(op_features);
+  const string& filter_format = GetFilterFormat(op_info);
   int filter_x_index, filter_y_index, in_channel_index, out_channel_index;
   if (filter_format == "HWIO") {
     filter_x_index = 0;
@@ -588,8 +608,8 @@ OpLevelCostEstimator::ConvolutionDimensionsFromInputs(
   int64 iz = image_shape.dim(channel_index).size();
   int64 kx = filter_shape.dim(filter_x_index).size();
   int64 ky = filter_shape.dim(filter_y_index).size();
-  std::vector<int64> strides = GetStrides(op_features);
-  const auto padding = GetPadding(op_features);
+  std::vector<int64> strides = GetStrides(op_info);
+  const auto padding = GetPadding(op_info);
   int64 sx = strides[x_index];
   int64 sy = strides[y_index];
   int64 ox = GetOutputSize(ix, kx, sx, padding);
@@ -617,14 +637,13 @@ OpLevelCostEstimator::ConvolutionDimensionsFromInputs(
 }
 
 int64 OpLevelCostEstimator::CountConv2DOperations(
-    const OpInfo& op_features, ConvolutionDimensions* conv_info,
+    const OpInfo& op_info, ConvolutionDimensions* conv_info,
     bool* found_unknown_shapes) const {
-  DCHECK(op_features.op() == kConv2d ||
-         op_features.op() == kDepthwiseConv2dNative)
+  DCHECK(op_info.op() == kConv2d || op_info.op() == kDepthwiseConv2dNative)
       << "Invalid Operation: not Conv2D nor DepthwiseConv2dNative";
 
   ConvolutionDimensions conv_dims = ConvolutionDimensionsFromInputs(
-      op_features.inputs(0).shape(), op_features.inputs(1).shape(), op_features,
+      op_info.inputs(0).shape(), op_info.inputs(1).shape(), op_info,
       found_unknown_shapes);
 
   //  in DepthwiseConv2dNative conv_dims.oz is actually the channel depth
@@ -635,7 +654,7 @@ int64 OpLevelCostEstimator::CountConv2DOperations(
   int64 ops = conv_dims.batch;
   ops *= conv_dims.ox * conv_dims.oy;
   ops *= conv_dims.kx * conv_dims.ky;
-  if (op_features.op() == kConv2d) {
+  if (op_info.op() == kConv2d) {
     ops *= conv_dims.iz * conv_dims.oz;
   } else {
     // To ensure output tensor dims to be correct for DepthwiseConv2DNative,
@@ -652,32 +671,32 @@ int64 OpLevelCostEstimator::CountConv2DOperations(
 }
 
 int64 OpLevelCostEstimator::CountMatMulOperations(
-    const OpInfo& op_features, bool* found_unknown_shapes) const {
-  return CountMatMulOperations(op_features, nullptr, found_unknown_shapes);
+    const OpInfo& op_info, bool* found_unknown_shapes) const {
+  return CountMatMulOperations(op_info, nullptr, found_unknown_shapes);
 }
 
 // TODO(nishantpatil): Create separate estimator for Sparse Matmul
 int64 OpLevelCostEstimator::CountMatMulOperations(
-    const OpInfo& op_features, MatMulDimensions* mat_mul,
+    const OpInfo& op_info, MatMulDimensions* mat_mul,
     bool* found_unknown_shapes) const {
   double ops = 0;
 
-  if (op_features.inputs_size() < 2) {
-    LOG(ERROR) << "Need 2 inputs but got " << op_features.inputs_size();
+  if (op_info.inputs_size() < 2) {
+    LOG(ERROR) << "Need 2 inputs but got " << op_info.inputs_size();
     // TODO(pcma): Try to separate invalid inputs from unknown shapes
     *found_unknown_shapes = true;
     return 0;
   }
 
-  auto& a_matrix = op_features.inputs(0);
-  auto& b_matrix = op_features.inputs(1);
+  auto& a_matrix = op_info.inputs(0);
+  auto& b_matrix = op_info.inputs(1);
 
   bool transpose_a = false;
   bool transpose_b = false;
 
   double m_dim, n_dim, k_dim, k_dim_b = 0;
 
-  for (const auto& item : op_features.attr()) {
+  for (const auto& item : op_info.attr()) {
     VLOG(1) << "Key:" << item.first
             << " Value:" << SummarizeAttrValue(item.second);
     if (item.first == "transpose_a" && item.second.b() == true)
@@ -729,23 +748,23 @@ int64 OpLevelCostEstimator::CountMatMulOperations(
 }
 
 int64 OpLevelCostEstimator::CountBatchMatMulOperations(
-    const OpInfo& op_features, bool* found_unknown_shapes) const {
-  if (op_features.op() != kBatchMatMul) {
-    LOG(ERROR) << "Invalid Operation: " << op_features.op();
+    const OpInfo& op_info, bool* found_unknown_shapes) const {
+  if (op_info.op() != kBatchMatMul) {
+    LOG(ERROR) << "Invalid Operation: " << op_info.op();
     // TODO(pcma): Try to separate invalid inputs from unknown shapes
     *found_unknown_shapes = true;
     return 0;
   }
-  if (op_features.inputs_size() != 2) {
-    LOG(ERROR) << "Expected 2 inputs but got " << op_features.inputs_size();
+  if (op_info.inputs_size() != 2) {
+    LOG(ERROR) << "Expected 2 inputs but got " << op_info.inputs_size();
     // TODO(pcma): Try to separate invalid inputs from unknown shapes
     *found_unknown_shapes = true;
     return 0;
   }
 
   double ops = 0;
-  const auto& a_input = op_features.inputs(0);
-  const auto& b_input = op_features.inputs(1);
+  const auto& a_input = op_info.inputs(0);
+  const auto& b_input = op_info.inputs(1);
 
   // BatchMatMul requires inputs of at least matrix shape (rank 2).
   // The two most minor dimensions of each input are matrices that
@@ -795,24 +814,24 @@ int64 OpLevelCostEstimator::CountBatchMatMulOperations(
 
   // Build the MatMul. Note that values are ignored here since we are just
   // counting ops (e.g. only shapes matter).
-  OpInfo matmul_op_features;
-  matmul_op_features.set_op("MatMul");
+  OpInfo matmul_op_info;
+  matmul_op_info.set_op("MatMul");
 
   AttrValue transpose_a;
   transpose_a.set_b(false);
-  if (op_features.attr().find("adj_x") != op_features.attr().end()) {
-    transpose_a.set_b(op_features.attr().at("adj_x").b());
+  if (op_info.attr().find("adj_x") != op_info.attr().end()) {
+    transpose_a.set_b(op_info.attr().at("adj_x").b());
   }
-  (*matmul_op_features.mutable_attr())["transpose_a"] = transpose_a;
+  (*matmul_op_info.mutable_attr())["transpose_a"] = transpose_a;
 
   AttrValue transpose_b;
   transpose_b.set_b(false);
-  if (op_features.attr().find("adj_y") != op_features.attr().end()) {
-    transpose_b.set_b(op_features.attr().at("adj_y").b());
+  if (op_info.attr().find("adj_y") != op_info.attr().end()) {
+    transpose_b.set_b(op_info.attr().at("adj_y").b());
   }
-  (*matmul_op_features.mutable_attr())["transpose_b"] = transpose_b;
+  (*matmul_op_info.mutable_attr())["transpose_b"] = transpose_b;
 
-  OpInfo::TensorProperties* a_matrix = matmul_op_features.add_inputs();
+  OpInfo::TensorProperties* a_matrix = matmul_op_info.add_inputs();
   a_matrix->set_dtype(a_input.dtype());
   TensorShapeProto* a_matrix_shape = a_matrix->mutable_shape();
   for (int i = std::max(0, a_input_shape.dim_size() - matrix_rank);
@@ -820,7 +839,7 @@ int64 OpLevelCostEstimator::CountBatchMatMulOperations(
     *(a_matrix_shape->add_dim()) = a_input_shape.dim(i);
   }
 
-  OpInfo::TensorProperties* b_matrix = matmul_op_features.add_inputs();
+  OpInfo::TensorProperties* b_matrix = matmul_op_info.add_inputs();
   b_matrix->set_dtype(b_input.dtype());
   TensorShapeProto* b_matrix_shape = b_matrix->mutable_shape();
   for (int i = std::max(0, b_input_shape.dim_size() - matrix_rank);
@@ -830,7 +849,7 @@ int64 OpLevelCostEstimator::CountBatchMatMulOperations(
 
   for (int i = 0; i < num_matmuls; ++i) {
     bool matmul_unknown_shapes = false;
-    ops += CountMatMulOperations(matmul_op_features, &matmul_unknown_shapes);
+    ops += CountMatMulOperations(matmul_op_info, &matmul_unknown_shapes);
     *found_unknown_shapes |= matmul_unknown_shapes;
   }
   return ops;
@@ -888,16 +907,16 @@ bool GetTensorShapeProtoFromTensorProto(const TensorProto& tensor_proto,
 
 // TODO(cliffy): Dedup this method and CountConv2DBackpropFilterOperations.
 int64 OpLevelCostEstimator::CountConv2DBackpropInputOperations(
-    const OpInfo& op_features, ConvolutionDimensions* returned_conv_dims,
+    const OpInfo& op_info, ConvolutionDimensions* returned_conv_dims,
     bool* found_unknown_shapes) const {
   int64 ops = 0;
 
-  DCHECK(op_features.op() == kConv2dBackpropInput ||
-         op_features.op() == kDepthwiseConv2dNativeBackpropInput)
+  DCHECK(op_info.op() == kConv2dBackpropInput ||
+         op_info.op() == kDepthwiseConv2dNativeBackpropInput)
       << "Invalid Operation: not kConv2dBackpropInput nor"
          "kDepthwiseConv2dNativeBackpropInput";
 
-  if (op_features.inputs_size() < 2) {
+  if (op_info.inputs_size() < 2) {
     // TODO(pcma): Try to separate invalid inputs from unknown shapes
     *found_unknown_shapes = true;
     return ops;
@@ -905,12 +924,12 @@ int64 OpLevelCostEstimator::CountConv2DBackpropInputOperations(
 
   TensorShapeProto input_shape;
   bool shape_found = false;
-  if (op_features.inputs(0).has_value()) {
-    const TensorProto& value = op_features.inputs(0).value();
+  if (op_info.inputs(0).has_value()) {
+    const TensorProto& value = op_info.inputs(0).value();
     shape_found = GetTensorShapeProtoFromTensorProto(value, &input_shape);
   }
-  if (!shape_found && op_features.outputs_size() == 1) {
-    input_shape = op_features.outputs(0).shape();
+  if (!shape_found && op_info.outputs_size() == 1) {
+    input_shape = op_info.outputs(0).shape();
     shape_found = true;
   }
   if (!shape_found) {
@@ -923,13 +942,12 @@ int64 OpLevelCostEstimator::CountConv2DBackpropInputOperations(
   }
 
   ConvolutionDimensions conv_dims = ConvolutionDimensionsFromInputs(
-      input_shape, op_features.inputs(1).shape(), op_features,
-      found_unknown_shapes);
+      input_shape, op_info.inputs(1).shape(), op_info, found_unknown_shapes);
 
   ops = conv_dims.batch;
   ops *= conv_dims.ox * conv_dims.oy;
   ops *= conv_dims.kx * conv_dims.ky;
-  if (op_features.op() == kConv2dBackpropInput) {
+  if (op_info.op() == kConv2dBackpropInput) {
     ops *= conv_dims.iz * conv_dims.oz;
   } else {
     // conv_dims always use forward path definition regardless
@@ -938,7 +956,7 @@ int64 OpLevelCostEstimator::CountConv2DBackpropInputOperations(
   }
   ops *= kOpsPerMac;
 
-  VLOG(1) << "Operations for" << op_features.op() << "  " << ops;
+  VLOG(1) << "Operations for" << op_info.op() << "  " << ops;
 
   if (returned_conv_dims != nullptr) {
     *returned_conv_dims = conv_dims;
@@ -947,23 +965,23 @@ int64 OpLevelCostEstimator::CountConv2DBackpropInputOperations(
 }
 
 int64 OpLevelCostEstimator::CountConv2DBackpropFilterOperations(
-    const OpInfo& op_features, ConvolutionDimensions* returned_conv_dims,
+    const OpInfo& op_info, ConvolutionDimensions* returned_conv_dims,
     bool* found_unknown_shapes) const {
   int64 ops = 0;
 
-  DCHECK(op_features.op() == kConv2dBackpropFilter ||
-         op_features.op() == kDepthwiseConv2dNativeBackpropFilter)
+  DCHECK(op_info.op() == kConv2dBackpropFilter ||
+         op_info.op() == kDepthwiseConv2dNativeBackpropFilter)
       << "Invalid Operation: not kConv2dBackpropFilter nor"
          "kDepthwiseConv2dNativeBackpropFilter";
 
   TensorShapeProto filter_shape;
   bool shape_found = false;
-  if (op_features.inputs_size() >= 2 && op_features.inputs(1).has_value()) {
-    const TensorProto& value = op_features.inputs(1).value();
+  if (op_info.inputs_size() >= 2 && op_info.inputs(1).has_value()) {
+    const TensorProto& value = op_info.inputs(1).value();
     shape_found = GetTensorShapeProtoFromTensorProto(value, &filter_shape);
   }
-  if (!shape_found && op_features.outputs_size() == 1) {
-    filter_shape = op_features.outputs(0).shape();
+  if (!shape_found && op_info.outputs_size() == 1) {
+    filter_shape = op_info.outputs(0).shape();
     shape_found = true;
   }
   if (!shape_found) {
@@ -975,19 +993,18 @@ int64 OpLevelCostEstimator::CountConv2DBackpropFilterOperations(
     *found_unknown_shapes = true;
   }
 
-  if (op_features.inputs_size() < 1) {
+  if (op_info.inputs_size() < 1) {
     // TODO(pcma): Try to separate invalid inputs from unknown shapes
     *found_unknown_shapes = true;
     return ops;
   }
   ConvolutionDimensions conv_dims = ConvolutionDimensionsFromInputs(
-      op_features.inputs(0).shape(), filter_shape, op_features,
-      found_unknown_shapes);
+      op_info.inputs(0).shape(), filter_shape, op_info, found_unknown_shapes);
 
   ops = conv_dims.batch;
   ops *= conv_dims.ox * conv_dims.oy;
   ops *= conv_dims.kx * conv_dims.ky;
-  if (op_features.op() == kConv2dBackpropFilter) {
+  if (op_info.op() == kConv2dBackpropFilter) {
     ops *= conv_dims.iz * conv_dims.oz;
   } else {
     // conv_dims always use forward path definition regardless
@@ -995,7 +1012,7 @@ int64 OpLevelCostEstimator::CountConv2DBackpropFilterOperations(
     ops *= conv_dims.oz;
   }
   ops *= kOpsPerMac;
-  VLOG(1) << "Operations for" << op_features.op() << "  " << ops;
+  VLOG(1) << "Operations for" << op_info.op() << "  " << ops;
 
   if (returned_conv_dims != nullptr) {
     *returned_conv_dims = conv_dims;
@@ -1026,9 +1043,9 @@ int64 OpLevelCostEstimator::CalculateTensorSize(
 }
 
 int64 OpLevelCostEstimator::CalculateInputSize(
-    const OpInfo& op_features, bool* found_unknown_shapes) const {
+    const OpInfo& op_info, bool* found_unknown_shapes) const {
   int64 total_input_size = 0;
-  for (auto& input : op_features.inputs()) {
+  for (auto& input : op_info.inputs()) {
     int64 input_size = CalculateTensorSize(input, found_unknown_shapes);
     total_input_size += input_size;
     VLOG(1) << "Input Size: " << input_size
@@ -1038,9 +1055,9 @@ int64 OpLevelCostEstimator::CalculateInputSize(
 }
 
 int64 OpLevelCostEstimator::CalculateLargestInputCount(
-    const OpInfo& op_features, bool* found_unknown_shapes) const {
+    const OpInfo& op_info, bool* found_unknown_shapes) const {
   int64 largest_input_count = 0;
-  for (auto& input : op_features.inputs()) {
+  for (auto& input : op_info.inputs()) {
     int64 input_count =
         CalculateTensorElementCount(input, found_unknown_shapes);
     if (input_count > largest_input_count) {
@@ -1053,10 +1070,10 @@ int64 OpLevelCostEstimator::CalculateLargestInputCount(
 }
 
 int64 OpLevelCostEstimator::CalculateOutputSize(
-    const OpInfo& op_features, bool* found_unknown_shapes) const {
+    const OpInfo& op_info, bool* found_unknown_shapes) const {
   int64 total_output_size = 0;
   // use float as default for calculations
-  for (const auto& output : op_features.outputs()) {
+  for (const auto& output : op_info.outputs()) {
     DataType dt = output.dtype();
     const auto& original_output_shape = output.shape();
     int64 output_size = DataTypeSize(BaseType(dt));
@@ -1074,10 +1091,10 @@ int64 OpLevelCostEstimator::CalculateOutputSize(
 }
 
 Costs OpLevelCostEstimator::PredictConv2D(const OpContext& op_context) const {
-  const auto& op_features = op_context.op_info;
+  const auto& op_info = op_context.op_info;
   bool found_unknown_shapes = false;
   auto costs = PredictOpCountBasedCost(
-      CountConv2DOperations(op_features, &found_unknown_shapes), op_features);
+      CountConv2DOperations(op_info, &found_unknown_shapes), op_info);
   costs.inaccurate = found_unknown_shapes;
   costs.num_ops_with_unknown_shapes = found_unknown_shapes;
   return costs;
@@ -1085,12 +1102,12 @@ Costs OpLevelCostEstimator::PredictConv2D(const OpContext& op_context) const {
 
 Costs OpLevelCostEstimator::PredictConv2DBackpropInput(
     const OpContext& op_context) const {
-  const auto& op_features = op_context.op_info;
+  const auto& op_info = op_context.op_info;
   bool found_unknown_shapes = false;
   auto costs =
       PredictOpCountBasedCost(CountConv2DBackpropInputOperations(
-                                  op_features, nullptr, &found_unknown_shapes),
-                              op_features);
+                                  op_info, nullptr, &found_unknown_shapes),
+                              op_info);
   costs.inaccurate = found_unknown_shapes;
   costs.num_ops_with_unknown_shapes = found_unknown_shapes;
   return costs;
@@ -1098,12 +1115,12 @@ Costs OpLevelCostEstimator::PredictConv2DBackpropInput(
 
 Costs OpLevelCostEstimator::PredictConv2DBackpropFilter(
     const OpContext& op_context) const {
-  const auto& op_features = op_context.op_info;
+  const auto& op_info = op_context.op_info;
   bool found_unknown_shapes = false;
   auto costs =
       PredictOpCountBasedCost(CountConv2DBackpropFilterOperations(
-                                  op_features, nullptr, &found_unknown_shapes),
-                              op_features);
+                                  op_info, nullptr, &found_unknown_shapes),
+                              op_info);
   costs.inaccurate = found_unknown_shapes;
   costs.num_ops_with_unknown_shapes = found_unknown_shapes;
   return costs;
@@ -1198,26 +1215,26 @@ Costs OpLevelCostEstimator::PredictFusedConv2DBiasActivation(
 }
 
 Costs OpLevelCostEstimator::PredictMatMul(const OpContext& op_context) const {
-  const auto& op_features = op_context.op_info;
+  const auto& op_info = op_context.op_info;
   bool found_unknown_shapes = false;
   auto costs = PredictOpCountBasedCost(
-      CountMatMulOperations(op_features, &found_unknown_shapes), op_features);
+      CountMatMulOperations(op_info, &found_unknown_shapes), op_info);
   costs.inaccurate = found_unknown_shapes;
   costs.num_ops_with_unknown_shapes = found_unknown_shapes;
   return costs;
 }
 
 Costs OpLevelCostEstimator::PredictNoOp(const OpContext& op_context) const {
-  const auto& op_features = op_context.op_info;
-  VLOG(1) << "Op:" << op_features.op() << " Execution Time 0 (ns)";
+  const auto& op_info = op_context.op_info;
+  VLOG(1) << "Op:" << op_info.op() << " Execution Time 0 (ns)";
   return Costs::ZeroCosts();
 }
 
 Costs OpLevelCostEstimator::PredictIdentity(const OpContext& op_context) const {
-  const auto& op_features = op_context.op_info;
-  VLOG(1) << "Op:" << op_features.op() << " Execution Time 0 (ns)";
+  const auto& op_info = op_context.op_info;
+  VLOG(1) << "Op:" << op_info.op() << " Execution Time 0 (ns)";
   Costs result = Costs::ZeroCosts();
-  result.max_memory = CalculateOutputSize(op_features, &result.inaccurate);
+  result.max_memory = CalculateOutputSize(op_info, &result.inaccurate);
   result.num_ops_with_unknown_shapes = result.inaccurate;
   // Assign the minimum amount of time we can represent to the identity op since
   // it tends to be really cheap.
@@ -1227,34 +1244,32 @@ Costs OpLevelCostEstimator::PredictIdentity(const OpContext& op_context) const {
 }
 
 Costs OpLevelCostEstimator::PredictVariable(const OpContext& op_context) const {
-  const auto& op_features = op_context.op_info;
-  VLOG(1) << "Op:" << op_features.op() << " Execution Time 0 (ns)";
+  const auto& op_info = op_context.op_info;
+  VLOG(1) << "Op:" << op_info.op() << " Execution Time 0 (ns)";
   Costs result = Costs::ZeroCosts();
-  result.persistent_memory =
-      CalculateOutputSize(op_features, &result.inaccurate);
+  result.persistent_memory = CalculateOutputSize(op_info, &result.inaccurate);
   result.num_ops_with_unknown_shapes = result.inaccurate;
 
   result.compute_time = kMinComputeTime;
-  result.execution_time = result.execution_time;
+  result.execution_time = result.compute_time;
   return result;
 }
 
 Costs OpLevelCostEstimator::PredictBatchMatMul(
     const OpContext& op_context) const {
-  const auto& op_features = op_context.op_info;
+  const auto& op_info = op_context.op_info;
   bool found_unknown_shapes = false;
   Costs costs = PredictOpCountBasedCost(
-      CountBatchMatMulOperations(op_features, &found_unknown_shapes),
-      op_features);
+      CountBatchMatMulOperations(op_info, &found_unknown_shapes), op_info);
   costs.inaccurate = found_unknown_shapes;
   costs.num_ops_with_unknown_shapes = found_unknown_shapes;
   return costs;
 }
 
 Costs OpLevelCostEstimator::PredictMetadata(const OpContext& op_context) const {
-  const auto& op_features = op_context.op_info;
+  const auto& op_info = op_context.op_info;
   Costs costs = Costs::ZeroCosts();
-  costs.max_memory = CalculateOutputSize(op_features, &costs.inaccurate);
+  costs.max_memory = CalculateOutputSize(op_info, &costs.inaccurate);
   costs.num_ops_with_unknown_shapes = costs.inaccurate;
   // Metadata operations are so cheap we assume they take the minimum amount of
   // time we can represent (1 ns).
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.h b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
index 84dd9213f773b538db71f0999c7ffd0b34e1881c..ace8fb218c75886b8bd215a0b04c98ef89f1fed6 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.h
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
@@ -16,10 +16,6 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_COSTS_OP_LEVEL_COST_ESTIMATOR_H_
 #define TENSORFLOW_CORE_GRAPPLER_COSTS_OP_LEVEL_COST_ESTIMATOR_H_
 
-#include <functional>
-#include <map>
-#include <string>
-
 #include "tensorflow/core/grappler/costs/cost_estimator.h"
 #include "tensorflow/core/grappler/costs/op_context.h"
 #include "tensorflow/core/grappler/costs/op_performance_data.pb.h"
@@ -79,24 +75,23 @@ class OpLevelCostEstimator {
     int64 sy;         // Stride y.
     Padding padding;  // SAME or VALID.
   };
-  int64 CountConv2DOperations(const OpInfo& op_features,
+  int64 CountConv2DOperations(const OpInfo& op_info,
                               bool* found_unknown_shapes) const;
-  int64 CountConv2DOperations(const OpInfo& op_features,
+  int64 CountConv2DOperations(const OpInfo& op_info,
                               ConvolutionDimensions* conv_info,
                               bool* found_unknown_shapes) const;
-  int64 CountMatMulOperations(const OpInfo& op_features,
+  int64 CountMatMulOperations(const OpInfo& op_info,
                               bool* found_unknown_shapes) const;
-  int64 CountMatMulOperations(const OpInfo& op_features,
-                              MatMulDimensions* mat_mul,
+  int64 CountMatMulOperations(const OpInfo& op_info, MatMulDimensions* mat_mul,
                               bool* found_unknown_shapes) const;
-  int64 CountBatchMatMulOperations(const OpInfo& op_features,
+  int64 CountBatchMatMulOperations(const OpInfo& op_info,
                                    bool* found_unknown_shapes) const;
-  int64 CountConv2DBackpropInputOperations(const OpInfo& op_features,
-                                           ConvolutionDimensions* conv_info,
-                                           bool* found_unknown_shapes) const;
-  int64 CountConv2DBackpropFilterOperations(const OpInfo& op_features,
-                                            ConvolutionDimensions* conv_info,
-                                            bool* found_unknown_shapes) const;
+  int64 CountConv2DBackpropInputOperations(
+      const OpInfo& op_info, ConvolutionDimensions* returned_conv_dims,
+      bool* found_unknown_shapes) const;
+  int64 CountConv2DBackpropFilterOperations(
+      const OpInfo& op_info, ConvolutionDimensions* returned_conv_dims,
+      bool* found_unknown_shapes) const;
 
   // Calculate the element count of an input/output tensor.
   int64 CalculateTensorElementCount(const OpInfo::TensorProperties& tensor,
@@ -108,17 +103,17 @@ class OpLevelCostEstimator {
 
   // Calculate the element count of the largest
   // input of specified TensorFlow op.
-  int64 CalculateLargestInputCount(const OpInfo& op_features,
+  int64 CalculateLargestInputCount(const OpInfo& op_info,
                                    bool* found_unknown_shapes) const;
 
   // Calculate the total size in bytes of the all
   // the inputs of specified TensorFlow op.
-  int64 CalculateInputSize(const OpInfo& op_features,
+  int64 CalculateInputSize(const OpInfo& op_info,
                            bool* found_unknown_shapes) const;
 
   // Calculate the total size in bytes of the all
   // the outputs of specified TensorFlow op.
-  int64 CalculateOutputSize(const OpInfo& op_features,
+  int64 CalculateOutputSize(const OpInfo& op_info,
                             bool* found_unknown_shapes) const;
 
   // This family of routines predicts the costs to
@@ -198,6 +193,7 @@ class OpLevelCostEstimator {
   // If true, assume compute and memory overlap; hence, the op cost is max of
   // compute_time and memory_time, insteaf of sum of those two.
   bool compute_memory_overlap_;
+  std::set<string> persistent_ops_;
 
  private:
   friend class OpLevelCostEstimatorTest;
@@ -205,4 +201,5 @@ class OpLevelCostEstimator {
 
 }  // end namespace grappler
 }  // end namespace tensorflow
+
 #endif  // TENSORFLOW_CORE_GRAPPLER_COSTS_OP_LEVEL_COST_ESTIMATOR_H_
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
index c9ce63a8ef2aa301f690cec16fcd03fb83309c7c..04c6ada2bf690a8d528eac60e0984873aeba2b5c 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
@@ -28,9 +28,34 @@ namespace tensorflow {
 namespace grappler {
 
 namespace {
+
+// TODO(dyoon): Consider to use this Test class for all the test cases, and then
+// remove friend in the OpLevelCostEstimator class header.
+class TestOpLevelCostEstimator : public OpLevelCostEstimator {
+ public:
+  TestOpLevelCostEstimator() {
+    compute_memory_overlap_ = true;
+    device_info_ = DeviceInfo();
+  }
+  ~TestOpLevelCostEstimator() override {}
+
+  void SetDeviceInfo(const DeviceInfo& device_info) {
+    device_info_ = device_info;
+  }
+
+  void SetComputeMemoryOverlap(bool value) { compute_memory_overlap_ = value; }
+
+ protected:
+  DeviceInfo GetDeviceInfo(const DeviceProperties& device) const override {
+    return device_info_;
+  }
+
+  DeviceInfo device_info_;
+};
+
 // Wrangles the minimum number of proto fields to set up a matrix.
-void DescribeMatrix(int rows, int columns, OpInfo* op_features) {
-  auto input = op_features->add_inputs();
+void DescribeMatrix(int rows, int columns, OpInfo* op_info) {
+  auto input = op_info->add_inputs();
   auto shape = input->mutable_shape();
   auto shape_rows = shape->add_dim();
   shape_rows->set_size(rows);
@@ -39,8 +64,8 @@ void DescribeMatrix(int rows, int columns, OpInfo* op_features) {
   input->set_dtype(DT_FLOAT);
 }
 
-void SetCpuDevice(OpInfo* op_features) {
-  auto device = op_features->mutable_device();
+void SetCpuDevice(OpInfo* op_info) {
+  auto device = op_info->mutable_device();
   device->set_type("CPU");
   device->set_num_cores(10);
   device->set_bandwidth(10000000);  // 10000000 KB/s = 10 GB/s
@@ -413,15 +438,14 @@ class OpLevelCostEstimatorTest : public ::testing::Test {
     return estimator_.PredictCosts(op_context);
   }
 
-  int64 CountMatMulOperations(const OpInfo& op_features,
+  int64 CountMatMulOperations(const OpInfo& op_info,
                               bool* found_unknown_shapes) const {
-    return estimator_.CountMatMulOperations(op_features, found_unknown_shapes);
+    return estimator_.CountMatMulOperations(op_info, found_unknown_shapes);
   }
 
-  int64 CountBatchMatMulOperations(const OpInfo& op_features,
+  int64 CountBatchMatMulOperations(const OpInfo& op_info,
                                    bool* found_unknown_shapes) const {
-    return estimator_.CountBatchMatMulOperations(op_features,
-                                                 found_unknown_shapes);
+    return estimator_.CountBatchMatMulOperations(op_info, found_unknown_shapes);
   }
 
   void SetComputeMemoryOverlap(bool value) {
@@ -475,6 +499,26 @@ class OpLevelCostEstimatorTest : public ::testing::Test {
   OpLevelCostEstimator estimator_;
 };
 
+TEST_F(OpLevelCostEstimatorTest, TestPersistentOpCosts) {
+  OpContext op_context;
+  SetCpuDevice(&op_context.op_info);
+  std::unordered_set<string> persisent_ops = {
+      "Const",       "Variable",       "VariableV2", "AutoReloadVariable",
+      "VarHandleOp", "ReadVariableOp",
+  };
+  // Minmum cost for all persistent ops.
+  for (const auto& op : persisent_ops) {
+    op_context.op_info.set_op(op);
+    auto cost = estimator_.PredictCosts(op_context);
+    EXPECT_EQ(Costs::Duration(0), cost.memory_time);
+    EXPECT_EQ(Costs::Duration(1), cost.compute_time);
+    EXPECT_EQ(Costs::Duration(1), cost.execution_time);
+    EXPECT_EQ(1, cost.num_ops_total);
+    EXPECT_FALSE(cost.inaccurate);
+    EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
+  }
+}
+
 TEST_F(OpLevelCostEstimatorTest, TestGatherCosts) {
   OpContext op_context;
   SetCpuDevice(&op_context.op_info);
@@ -712,6 +756,16 @@ TEST_F(OpLevelCostEstimatorTest, ReluExecutionTime) {
   EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
 }
 
+TEST_F(OpLevelCostEstimatorTest, CastExecutionTime) {
+  auto cost = PredictCosts(DescribeUnaryOp("Cast", 1000));
+  EXPECT_EQ(Costs::Duration(800), cost.memory_time);
+  EXPECT_EQ(Costs::Duration(100), cost.compute_time);
+  EXPECT_EQ(Costs::Duration(900), cost.execution_time);
+  EXPECT_EQ(1, cost.num_ops_total);
+  EXPECT_FALSE(cost.inaccurate);
+  EXPECT_EQ(0, cost.num_ops_with_unknown_shapes);
+}
+
 TEST_F(OpLevelCostEstimatorTest, UnknownOrPartialShape) {
   {
     auto cost = PredictCosts(DescribeMatMul(2, 4, 7, 7));
@@ -936,7 +990,7 @@ TEST_F(OpLevelCostEstimatorTest, PredictMaxPoolGrad) {
   };
 
   {
-    // Typical 3xz3 window with 2x2 stride.
+    // Typical 3x3 window with 2x2 stride.
     auto costs = predict_max_pool_grad(10, 20, 384, 3, 2, "SAME");
     EXPECT_EQ(Costs::Duration(1996800), costs.execution_time);
     EXPECT_EQ(Costs::Duration(614400), costs.compute_time);
@@ -977,7 +1031,7 @@ TEST_F(OpLevelCostEstimatorTest, PredictAvgPool) {
   };
 
   {
-    // Typical 3xz3 window with 2x2 stride.
+    // Typical 3x3 window with 2x2 stride.
     auto costs = predict_avg_pool(10, 20, 384, 3, 2, "SAME");
     EXPECT_EQ(Costs::Duration(1113600), costs.execution_time);
     EXPECT_EQ(Costs::Duration(345600), costs.compute_time);
@@ -1199,5 +1253,59 @@ TEST_F(OpLevelCostEstimatorTest, MaybeGetMinimumShape) {
     ExpectTensorShape({10, 20}, y);
   }
 }
+
+TEST_F(OpLevelCostEstimatorTest, IntermediateRdWrBandwidth) {
+  TestOpLevelCostEstimator estimator;
+
+  // Compute limited.
+  estimator.SetDeviceInfo(DeviceInfo(/*gigaops=*/1,
+                                     /*gb_per_sec=*/1));
+  estimator.SetComputeMemoryOverlap(true);
+  auto cost = estimator.PredictCosts(
+      DescribeConvolution(16, 19, 19, 48, 48, 5, 5, 256));
+  EXPECT_EQ(Costs::Duration(3548774400), cost.execution_time);
+  EXPECT_EQ(cost.execution_time, cost.compute_time);
+
+  estimator.SetComputeMemoryOverlap(false);
+  cost = estimator.PredictCosts(
+      DescribeConvolution(16, 19, 19, 48, 48, 5, 5, 256));
+  EXPECT_EQ(Costs::Duration(3551112192), cost.execution_time);
+  EXPECT_EQ(cost.execution_time, cost.compute_time + cost.memory_time +
+                                     cost.intermediate_memory_time);
+
+  // Memory limited.
+  estimator.SetDeviceInfo(DeviceInfo(/*gigaops=*/99999,
+                                     /*gb_per_sec=*/1));
+  estimator.SetComputeMemoryOverlap(true);
+  cost = estimator.PredictCosts(
+      DescribeConvolution(16, 19, 19, 48, 48, 5, 5, 256));
+  EXPECT_EQ(Costs::Duration(2337792), cost.execution_time);
+  EXPECT_EQ(cost.execution_time, cost.memory_time);
+
+  estimator.SetComputeMemoryOverlap(false);
+  cost = estimator.PredictCosts(
+      DescribeConvolution(16, 19, 19, 48, 48, 5, 5, 256));
+  EXPECT_EQ(Costs::Duration(2373281), cost.execution_time);
+  EXPECT_EQ(cost.execution_time, cost.compute_time + cost.memory_time +
+                                     cost.intermediate_memory_time);
+
+  // Intermediate memory bandwidth limited.
+  estimator.SetDeviceInfo(DeviceInfo(/*gigaops=*/99999,
+                                     /*gb_per_sec=*/9999,
+                                     /*intermediate_read_gb_per_sec=*/1,
+                                     /*intermediate_write_gb_per_sec=*/1));
+  estimator.SetComputeMemoryOverlap(true);
+  cost = estimator.PredictCosts(
+      DescribeConvolution(16, 19, 19, 48, 48, 5, 5, 256));
+  EXPECT_EQ(Costs::Duration(2337792), cost.execution_time);
+  EXPECT_EQ(cost.execution_time, cost.intermediate_memory_time);
+
+  estimator.SetComputeMemoryOverlap(false);
+  cost = estimator.PredictCosts(
+      DescribeConvolution(16, 19, 19, 48, 48, 5, 5, 256));
+  EXPECT_EQ(Costs::Duration(2373515), cost.execution_time);
+  EXPECT_EQ(cost.execution_time, cost.compute_time + cost.memory_time +
+                                     cost.intermediate_memory_time);
+}
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/utils.cc b/tensorflow/core/grappler/costs/utils.cc
index 7d868a3679e5b3d5759fdd951e726cfe7af3babf..d45bb14e07072fff1742e243f6b0bc15b51c62c6 100644
--- a/tensorflow/core/grappler/costs/utils.cc
+++ b/tensorflow/core/grappler/costs/utils.cc
@@ -20,12 +20,6 @@ limitations under the License.
 
 #include "third_party/eigen3/Eigen/Core"
 
-#if GOOGLE_CUDA
-#include "cuda/include/cuda.h"
-#include "cuda/include/cuda_runtime_api.h"
-#include "cuda/include/cudnn.h"
-#endif
-
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
 #include "tensorflow/core/framework/allocation_description.pb.h"
diff --git a/tensorflow/core/grappler/costs/virtual_placer.cc b/tensorflow/core/grappler/costs/virtual_placer.cc
index 8f5f16e4904002cfb5b2e0e6df4a3103e8114a7e..146eecf5bcbbbccb5fcdfef7170cc442e96f7c4c 100644
--- a/tensorflow/core/grappler/costs/virtual_placer.cc
+++ b/tensorflow/core/grappler/costs/virtual_placer.cc
@@ -87,6 +87,7 @@ VirtualPlacer::VirtualPlacer(const Cluster* cluster) {
       default_device_name_ = devices_.begin()->first;  // Any device.
     }
   }
+  VLOG(3) << "default device name: " << default_device_name_;
 
   // Scan the device names from the cluster, and if there is one job name used,
   // use it for canonical device name.
@@ -102,14 +103,15 @@ VirtualPlacer::VirtualPlacer(const Cluster* cluster) {
       }
     }
   }
-  // If there is only  type of job name in all the devices in the cluster, use
-  // that one as default job name; otherwise, use localhost.
+  // If there is only one type of job name in all the devices in the cluster,
+  // use that one as default job name; otherwise, use localhost.
   // TODO(dyoon): this should be improved, especially when the cluster is
   // composed of multiple worker, PS, and other types of jobs.
   if (job_names_from_cluster.size() == 1) {
     auto it = job_names_from_cluster.begin();
     default_job_name_lowercase_ = *it;
   }
+  VLOG(3) << "default job name: " << default_job_name_lowercase_;
 }
 
 const DeviceProperties& VirtualPlacer::get_device(const NodeDef& node) const {
diff --git a/tensorflow/core/grappler/costs/virtual_placer.h b/tensorflow/core/grappler/costs/virtual_placer.h
index fee5ce0f510014988656f418b857a73b8d68b807..e17ece7c1a840f66d335c205d3fa759965bc2b52 100644
--- a/tensorflow/core/grappler/costs/virtual_placer.h
+++ b/tensorflow/core/grappler/costs/virtual_placer.h
@@ -16,7 +16,6 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_PLACER_H_
 #define TENSORFLOW_CORE_GRAPPLER_COSTS_VIRTUAL_PLACER_H_
 
-#include <unordered_map>
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/protobuf/device_properties.pb.h"
 
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.cc b/tensorflow/core/grappler/costs/virtual_scheduler.cc
index ae5200b359232153f96c9ffa21a505d2a056d55d..0aac0348b512d2e8040a9ac1337ceb9c12a09206 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.cc
@@ -15,8 +15,6 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/costs/virtual_scheduler.h"
 
-#include <math.h>
-
 #include "tensorflow/core/framework/allocation_description.pb.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
@@ -38,6 +36,12 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
+// Optional attribute name for Switch op as a vector of int that tells
+// which branch the Switch output is taken on every round of execution.
+// We use this side information, if provided, for scheduling ops after Switch
+// correctly (e.g., While loop).
+constexpr char kOutputSlots[] = "_output_slot_vector";
+
 Costs CombineCosts(const Costs& left, const Costs& right) {
   CHECK_NE(left.max_memory, kMemoryUnknown);
   CHECK_NE(left.max_per_op_buffers, kMemoryUnknown);
@@ -306,43 +310,25 @@ ReadyNodeManager* VirtualScheduler::ReadyNodeManagerFactory(
   LOG(FATAL) << "Not a valid ready node manager: " << ready_node_manager;
 }
 
-VirtualScheduler::VirtualScheduler(const GrapplerItem* grappler_item,
-                                   const bool use_static_shapes,
-                                   Cluster* cluster,
-                                   ReadyNodeManager* ready_nodes)
-    : ready_nodes_(ready_nodes),
-      graph_costs_(Costs::ZeroCosts()),
-      graph_properties_(new GraphProperties(*grappler_item)),
-      cluster_(cluster),
-      grappler_item_(grappler_item),
-      use_static_shapes_(use_static_shapes),
-      placer_(cluster) {
-  graph_costs_.num_ops_total = 0;
-  initialized_ = false;
-}
-
 VirtualScheduler::VirtualScheduler(const bool use_static_shapes,
+                                   const bool use_aggressive_shape_inference,
                                    Cluster* cluster,
                                    ReadyNodeManager* ready_nodes)
     : ready_nodes_(ready_nodes),
       graph_costs_(Costs::ZeroCosts()),
       cluster_(cluster),
       use_static_shapes_(use_static_shapes),
+      use_aggressive_shape_inference_(use_aggressive_shape_inference),
       placer_(cluster) {
   graph_costs_.num_ops_total = 0;
   initialized_ = false;
+  track_mem_usage_snapshot_ = VLOG_IS_ON(1);
 }
 
 Status VirtualScheduler::Init(const GrapplerItem* item) {
   grappler_item_ = item;
   graph_properties_ = absl::make_unique<GraphProperties>(*item);
 
-  return Init();
-}
-
-// TODO(pcma): Merge with Init(const GrapplerItem* item) when this
-// deprecated API is deleted
-Status VirtualScheduler::Init() {
   initialized_ = false;
 
   // Clear all internal states so that the VirtualScheduler is reusable for
@@ -366,7 +352,8 @@ Status VirtualScheduler::Init() {
 
   // Construct graph properties.
   if (use_static_shapes_) {
-    TF_RETURN_IF_ERROR(graph_properties_->InferStatically(true));
+    TF_RETURN_IF_ERROR(graph_properties_->InferStatically(
+        true, use_aggressive_shape_inference_));
   } else {
     TF_RETURN_IF_ERROR(graph_properties_->InferDynamically(cluster_));
   }
@@ -400,6 +387,8 @@ Status VirtualScheduler::Init() {
     name_to_node[node->name()] = node;
   }
 
+  // Traverse the graph to check if the graph is annotated with Switch outputs.
+  // Also record _Send nodes.
   // TODO(dyoon): Instead of identifying _Send node here manually, add _Send
   // to _Recv as control dependency when creating GrapplerItem.
   std::unordered_map<string, const NodeDef*> name_to_send;
@@ -408,6 +397,11 @@ Status VirtualScheduler::Init() {
       const auto& attr = node.attr();
       name_to_send[attr.at("tensor_name").s()] = &node;
     }
+
+    if (IsSwitch(node)) {
+      const auto& attr = node.attr();
+      if (attr.count(kOutputSlots) > 0) switch_outputs_annotated_ = true;
+    }
   }
 
   // To reuse _Recv ops.
@@ -562,7 +556,7 @@ void VirtualScheduler::MaybeUpdateInputOutput(const NodeDef* node) {
       inputs.push_back(control_message);
       outputs.push_back(control_message);
     } else {
-      auto output_properties =
+      const auto& output_properties =
           graph_properties_->GetOutputProperties(NodeName(input_source_name));
       // Like with HasInputProperties, if a node does not have output
       // properties, it's likely it was pruned during the shape inference run.
@@ -769,6 +763,82 @@ Costs& VirtualScheduler::FindOrCreateZero(const string& op_name,
   return it->second;
 }
 
+// Check Switch outputs in updated MetaGraphDef, add corresponding nodes to
+// ready queue.
+// Fallback to add all outputs if fail to find the actual output.
+bool VirtualScheduler::AddSwitchOutputsToReadyQueue(
+    const NodeDef* node, int curr_iter, const Costs::Duration& curr_time) {
+  if (node->attr().count(kOutputSlots) == 0) return false;
+
+  auto& node_state = node_map_[node];
+  const auto& slot_vector = node->attr().at(kOutputSlots);
+  if (slot_vector.list().i_size() <= curr_iter) {
+    // Sometimes we encounter infinite loop. Fall back to add all outputs.
+    return false;
+  }
+
+  int slot = slot_vector.list().i(curr_iter);
+  for (const auto& port_num_output_pair : node_state.outputs) {
+    if (port_num_output_pair.first != slot) continue;
+
+    for (auto* output_node : port_num_output_pair.second) {
+      auto& output_state = node_map_[output_node];
+      output_state.num_inputs_ready++;
+      // Execute a node as soon as all its inputs are ready. Merge nodes
+      // are special since they run as soon as one of their inputs becomes
+      // available.
+      if (output_state.num_inputs_ready == output_state.inputs.size() ||
+          IsMerge(*output_node)) {
+        // This output node is now ready.
+        output_state.time_ready = curr_time;
+        ready_nodes_->AddNode(output_node);
+        VLOG(3) << "Node " << node->name() << " iter " << curr_iter << "/"
+                << slot_vector.list().i_size() << " Add Switch output " << slot
+                << ": " << output_node->name();
+      }
+    }
+    return true;
+  }
+
+  return false;
+}
+
+void VirtualScheduler::AddOutputNodesToReadyQueue(
+    const NodeDef* node, const Costs::Duration& curr_time) {
+  auto& node_state = node_map_[node];
+  int curr_iter = node_state.num_executed_times;
+  ++node_state.num_executed_times;
+
+  if (switch_outputs_annotated_) {
+    // If the graph is annotated with StepStats, reset num_inputs_ready so we
+    // can schedule the node multiple times.
+    node_state.num_inputs_ready = 0;
+
+    // For Switch node, get output branch from updated MetaGraphDef.
+    if (IsSwitch(*node) &&
+        AddSwitchOutputsToReadyQueue(node, curr_iter, curr_time))
+      return;
+  }
+
+  // Increment num_inputs_ready of the output nodes and maybe add to ready
+  // nodes.
+  for (const auto& port_num_output_pair : node_state.outputs) {
+    for (auto* output_node : port_num_output_pair.second) {
+      auto& output_state = node_map_[output_node];
+      output_state.num_inputs_ready++;
+      // Execute a node as soon as all its inputs are ready. Merge nodes are
+      // special since they run as soon as one of their inputs becomes
+      // available.
+      if (output_state.num_inputs_ready == output_state.inputs.size() ||
+          IsMerge(*output_node)) {
+        // This output node is now ready.
+        output_state.time_ready = curr_time;
+        ready_nodes_->AddNode(output_node);
+      }
+    }
+  }
+}
+
 bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
   // Update graph_costs_ and per-op costs.
   graph_costs_ = CombineCosts(graph_costs_, node_costs);
@@ -778,13 +848,16 @@ bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
   auto& op_cost = FindOrCreateZero(op_name, &op_to_cost_);
   op_cost = CombineCosts(op_cost, node_costs);
 
-  // Also keep track of op counts and costs per op (with their shapes).
-  OpContext op_context = GetCurrNode();
-  string node_description = GetOpDescription(op_context.op_info);
-  op_counts_[node_description] += 1;
-  op_costs_[node_description] =
-      std::make_pair(node_costs.execution_time.asMicroSeconds().count(),
-                     !node_costs.inaccurate);
+  if (VLOG_IS_ON(2)) {
+    // Also keep track of op counts and costs per op (with their shapes).
+    OpContext op_context = GetCurrNode();
+
+    string node_description = GetOpDescription(op_context.op_info);
+    op_counts_[node_description] += 1;
+    op_costs_[node_description] =
+        std::make_pair(node_costs.execution_time.asMicroSeconds().count(),
+                       !node_costs.inaccurate);
+  }
 
   // Update node and device states.
   auto& node_state = node_map_[node];
@@ -793,6 +866,10 @@ bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
   // Node is scheduled when the device is available AND all the inputs are
   // ready; hence, time_scheduled is time_ready if time_ready > device curr
   // time.
+  // TODO(andiryxu): Current node_state result only records the last execution.
+  // With annotated MetaGraph we can schedule a node for multiple times.
+  // Refine NodeState structure accordingly, e.g. record time_scheduled in a
+  // vector.
   node_state.time_scheduled =
       std::max(device.GetCurrTime(), node_state.time_ready);
   // Override device curr time with the time_scheduled.
@@ -826,22 +903,8 @@ bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
           << ", scheduled: " << node_state.time_scheduled.count()
           << ", finished: " << node_state.time_finished.count();
 
-  // Increment num_inputs_ready of the output nodes and maybe add to ready nodes
-  for (const auto& port_num_output_pair : node_state.outputs) {
-    for (auto* output_node : port_num_output_pair.second) {
-      auto& output_state = node_map_[output_node];
-      output_state.num_inputs_ready++;
-      // Execute a node as soon as all its inputs are ready. Merge nodes are
-      // special since they run as soon as one of their inputs becomes
-      // available.
-      if (output_state.num_inputs_ready == output_state.inputs.size() ||
-          IsMerge(*output_node)) {
-        // This output node is now ready.
-        output_state.time_ready = curr_time;
-        ready_nodes_->AddNode(output_node);
-      }
-    }
-  }
+  // Check outputs, add ready nodes to queue.
+  AddOutputNodesToReadyQueue(node, curr_time);
 
   // Increment num_outputs_executed of the input nodes and maybe update memory.
   for (const auto& input_port : node_state.inputs) {
@@ -868,7 +931,10 @@ bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
     // check max memory usage.
     if (device.memory_usage > device.max_memory_usage) {
       device.max_memory_usage = device.memory_usage;
-      device.mem_usage_snapshot_at_peak = device.nodes_in_memory;
+
+      if (track_mem_usage_snapshot_) {
+        device.mem_usage_snapshot_at_peak = device.nodes_in_memory;
+      }
     }
   }
 
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.h b/tensorflow/core/grappler/costs/virtual_scheduler.h
index 6a835f32d16d0850c06891f656b2bec910e26b78..d96371bcab5db2d3ef730bf1eec8fe7f733bf4f6 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.h
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.h
@@ -70,11 +70,15 @@ struct NodeState {
   // Each output port uses up memory space from time_scheduled to its
   // time_no_references.
 
+  // How many times this node has been executed, e.g. in a while loop.
+  int num_executed_times;
+
   NodeState() {
     num_inputs_ready = 0;
     time_ready = Costs::Duration::max();
     time_scheduled = Costs::Duration::max();
     time_finished = Costs::Duration::max();
+    num_executed_times = 0;
     // Note that num_outputs_executed and time_no_references are not initialized
     // here, since we don't know the size (i.e., # outputs for this node).
   }
@@ -256,16 +260,9 @@ std::unique_ptr<ReadyNodeManager> ReadyNodeManagerFactory(
 // dependencies, device, etc.
 class VirtualScheduler {
  public:
-  // TODO(pcma): Modify power_analyzer.cc to use new API's.
-  // DEPRECATED
-  VirtualScheduler(const GrapplerItem* grappler_item,
-                   const bool use_static_shapes, Cluster* cluster,
-                   ReadyNodeManager* ready_nodes);
-  // DEPRECATED
-  Status Init();
-
   // Does not take ownership of cluster or ready_nodes.
-  VirtualScheduler(bool use_static_shapes, Cluster* cluster,
+  VirtualScheduler(const bool use_static_shapes,
+                   const bool use_aggressive_shape_inference, Cluster* cluster,
                    ReadyNodeManager* ready_nodes);
   // Initializes the scheduler for the specific grappler item.
   // Should be called immediately after the c'tor or when the scheduler will be
@@ -305,6 +302,8 @@ class VirtualScheduler {
     return &node_map_;
   }
 
+  void enable_mem_usage_tracking() { track_mem_usage_snapshot_ = true; }
+
  private:
   // Constants.
   const string kAttrInputSrc = "input_source_";
@@ -328,6 +327,10 @@ class VirtualScheduler {
                           std::map<string, Costs>* op_cost);
   float Round2(const float x) const;
   bool IsPersistentNode(const NodeDef* node) const;
+  bool AddSwitchOutputsToReadyQueue(const NodeDef* node, int curr_iter,
+                                    const Costs::Duration& curr_time);
+  void AddOutputNodesToReadyQueue(const NodeDef* node,
+                                  const Costs::Duration& curr_time);
 
   // Scheduler states:
   ReadyNodeManager* ready_nodes_;  // Not owned.
@@ -356,6 +359,12 @@ class VirtualScheduler {
   const GrapplerItem* grappler_item_;  // Not owned.
   bool use_static_shapes_;
   bool initialized_;
+  bool track_mem_usage_snapshot_;
+  const bool use_aggressive_shape_inference_;
+
+  // Whether the input graph includes Switch nodes annotated with output slots
+  // information.
+  bool switch_outputs_annotated_ = false;
 
   VirtualPlacer placer_;  // owned.
 };
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
index 0a695458e17a576ecda631b576d4ace4aa947dbc..128cb986f11ba4f4bb13583cb293183194e1c744 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler_test.cc
@@ -30,8 +30,13 @@ namespace grappler {
 // Class for testing virtual scheduler.
 class TestVirtualScheduler : public VirtualScheduler {
  public:
-  TestVirtualScheduler(const bool use_static_shapes, Cluster* cluster)
-      : VirtualScheduler(use_static_shapes, cluster, &ready_node_manager_) {}
+  TestVirtualScheduler(const bool use_static_shapes,
+                       const bool use_aggressive_shape_inference,
+                       Cluster* cluster)
+      : VirtualScheduler(use_static_shapes, use_aggressive_shape_inference,
+                         cluster, &ready_node_manager_) {
+    enable_mem_usage_tracking();
+  }
 
   FRIEND_TEST(VirtualSchedulerTest, MemoryUsage);
   FRIEND_TEST(VirtualSchedulerTest, ControlDependency);
@@ -66,7 +71,8 @@ class VirtualSchedulerTest : public ::testing::Test {
     devices[kCPU1] = cpu_device;
     cluster_ = absl::make_unique<VirtualCluster>(devices);
     scheduler_ = absl::make_unique<TestVirtualScheduler>(
-        /* use_static_shapes = */ true, cluster_.get());
+        /*use_static_shapes=*/true,
+        /*use_aggressive_shape_inference=*/true, cluster_.get());
   }
 
   NodeDef node1_, node2_, node3_, node4_, node5_, node6_;
@@ -867,6 +873,439 @@ versions {
     grappler_item_->fetch = {"while/Exit", "while/Exit_1"};
   }
 
+  // A simple while loop strengthened with Switch outputs.
+  void CreateGrapplerItemWithLoopSwitchOutputs() {
+    // Test graph produced in python using:
+    /*
+      with tf.Graph().as_default():
+      i0 = tf.constant(0)
+      m0 = tf.ones([2, 2])
+      c = lambda i, m: i < 10
+      b = lambda i, m: [i+1, tf.concat([m, m], axis=0)]
+      r = tf.while_loop(
+      c, b, loop_vars=[i0, m0],
+      shape_invariants=[i0.get_shape(), tf.TensorShape([None, 2])])
+      with open('/tmp/graph.pbtxt', 'w') as f:
+      f.write(str(tf.get_default_graph().as_graph_def()))
+    */
+    const string gdef_ascii = R"EOF(
+node {
+  name: "Const"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "ones"
+  op: "Const"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 2
+          }
+          dim {
+            size: 2
+          }
+        }
+        float_val: 1.0
+      }
+    }
+  }
+}
+node {
+  name: "while/Enter"
+  op: "Enter"
+  input: "Const"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "while/while/"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "while/Enter_1"
+  op: "Enter"
+  input: "ones"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "frame_name"
+    value {
+      s: "while/while/"
+    }
+  }
+  attr {
+    key: "is_constant"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "parallel_iterations"
+    value {
+      i: 10
+    }
+  }
+}
+node {
+  name: "while/Merge"
+  op: "Merge"
+  input: "while/Enter"
+  input: "while/NextIteration"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/Merge_1"
+  op: "Merge"
+  input: "while/Enter_1"
+  input: "while/NextIteration_1"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "while/Less/y"
+  op: "Const"
+  input: "^while/Merge"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 10
+      }
+    }
+  }
+}
+node {
+  name: "while/Less"
+  op: "Less"
+  input: "while/Merge"
+  input: "while/Less/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/LoopCond"
+  op: "LoopCond"
+  input: "while/Less"
+}
+node {
+  name: "while/Switch"
+  op: "Switch"
+  input: "while/Merge"
+  input: "while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@while/Merge"
+      }
+    }
+  }
+  attr {
+    key: "_output_slot_vector"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 0
+      }
+    }
+  }
+}
+node {
+  name: "while/Switch_1"
+  op: "Switch"
+  input: "while/Merge_1"
+  input: "while/LoopCond"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@while/Merge_1"
+      }
+    }
+  }
+  attr {
+    key: "_output_slot_vector"
+    value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 0
+      }
+    }
+  }
+}
+node {
+  name: "while/Identity"
+  op: "Identity"
+  input: "while/Switch:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/Identity_1"
+  op: "Identity"
+  input: "while/Switch_1:1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "while/add/y"
+  op: "Const"
+  input: "^while/Identity"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 1
+      }
+    }
+  }
+}
+node {
+  name: "while/add"
+  op: "Add"
+  input: "while/Identity"
+  input: "while/add/y"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/concat/axis"
+  op: "Const"
+  input: "^while/Identity"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "while/concat"
+  op: "ConcatV2"
+  input: "while/Identity_1"
+  input: "while/Identity_1"
+  input: "while/concat/axis"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "Tidx"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/NextIteration"
+  op: "NextIteration"
+  input: "while/add"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/NextIteration_1"
+  op: "NextIteration"
+  input: "while/concat"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+node {
+  name: "while/Exit"
+  op: "Exit"
+  input: "while/Switch"
+  attr {
+    key: "T"
+    value {
+      type: DT_INT32
+    }
+  }
+}
+node {
+  name: "while/Exit_1"
+  op: "Exit"
+  input: "while/Switch_1"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+}
+versions {
+  producer: 21
+}
+  )EOF";
+
+    grappler_item_.reset(new GrapplerItem);
+    CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii,
+                                                &grappler_item_->graph));
+    grappler_item_->id = "test_graph";
+    grappler_item_->fetch = {"while/Exit", "while/Exit_1"};
+  }
+
+  // Create a FusedBatchNorm op that has multiple output ports.
   void CreateGrapplerItemWithInterDeviceTransfers() {
     tensorflow::Scope s = tensorflow::Scope::NewRootScope().WithDevice(kCPU0);
 
@@ -1940,6 +2379,89 @@ TEST_F(VirtualSchedulerTest, WhileLoop) {
   ValidateDependencyChain(start_times, {"while/Switch_1", "while/Exit_1"});
 }
 
+TEST_F(VirtualSchedulerTest, WhileLoopWithSwitchOutputs) {
+  // Init.
+  CreateGrapplerItemWithLoopSwitchOutputs();
+  InitScheduler();
+
+  // Runs the scheduler.
+  RunScheduler("");
+
+  RunMetadata metadata;
+  scheduler_->Summary(&metadata);
+
+  // Nodes in topological order:
+  // * const, ones
+  // * while/Enter, while/Enter_1
+  // * while/Merge, while/Merge_1
+  // * while/Less/y
+  // * while/Less
+  // * while/LoopCond
+  // * while/Switch, while/Switch_1
+  // * while/Identity, while/Identity_1, while/Exit, while/Exit_1
+  // * while/add/y, while/concat/axis
+  // * while/add, while/concat
+  // * while/NextIteration, while/NextIteration_1
+
+  int num_next_iteration = 0;
+  int num_next_iteration_1 = 0;
+  int num_exit = 0;
+  int num_exit_1 = 0;
+  int64 next_iter_start_micro;
+  int64 next_iter_1_start_micro;
+  int64 exit_start_micro;
+  int64 exit_1_start_micro;
+
+  std::unordered_map<string, int64> start_times;
+  for (const auto& device_step_stats : metadata.step_stats().dev_stats()) {
+    for (const auto& stats : device_step_stats.node_stats()) {
+      start_times[stats.node_name()] = stats.all_start_micros();
+      if (stats.node_name() == "while/NextIteration") {
+        ++num_next_iteration;
+        next_iter_start_micro = stats.all_start_micros();
+      } else if (stats.node_name() == "while/NextIteration_1") {
+        ++num_next_iteration_1;
+        next_iter_1_start_micro = stats.all_start_micros();
+      } else if (stats.node_name() == "while/Exit") {
+        ++num_exit;
+        exit_start_micro = stats.all_start_micros();
+      } else if (stats.node_name() == "while/Exit_1") {
+        ++num_exit_1;
+        exit_1_start_micro = stats.all_start_micros();
+      }
+    }
+  }
+
+  // Makes sure we run the loop body for ten times.
+  EXPECT_EQ(10, num_next_iteration);
+  EXPECT_EQ(10, num_next_iteration_1);
+  EXPECT_EQ(1, num_exit);
+  EXPECT_EQ(1, num_exit_1);
+
+  // Start times of while/NextIteration and while/NextIteration_1 should be
+  // different, so should be those of while/Exit and while/Exit_1.
+  EXPECT_NE(next_iter_start_micro, next_iter_1_start_micro);
+  EXPECT_NE(exit_start_micro, exit_1_start_micro);
+
+  // Checks dependency among the nodes; no matter what scheduling mechanism we
+  // use, the scheduled ops should follow these dependency chains.
+  // We have to break the loop into two parts, identified by Switch outputs.
+  ValidateDependencyChain(
+      start_times,
+      {"Const", "while/Enter", "while/Merge", "while/Less/y", "while/Less",
+       "while/LoopCond", "while/Switch", "while/Exit"});
+  ValidateDependencyChain(start_times, {"while/Identity", "while/add/y",
+                                        "while/add", "while/NextIteration"});
+  ValidateDependencyChain(
+      start_times, {"ones", "while/Enter_1", "while/Merge_1", "while/Switch_1",
+                    "while/Exit_1"});
+  ValidateDependencyChain(start_times, {"while/Identity_1", "while/concat",
+                                        "while/NextIteration_1"});
+  ValidateDependencyChain(
+      start_times, {"while/Identity", "while/concat/axis", "while/concat"});
+  ValidateDependencyChain(start_times, {"while/Identity", "while/add"});
+}
+
 TEST_F(VirtualSchedulerTest, InterDeviceTransfer) {
   // Init.
   CreateGrapplerItemWithInterDeviceTransfers();
diff --git a/tensorflow/core/grappler/devices.cc b/tensorflow/core/grappler/devices.cc
index 3268697671b9ba47e489d5037af9a7267353b448..ddde6a504e0c490dee5312ad717d3dba68b184aa 100644
--- a/tensorflow/core/grappler/devices.cc
+++ b/tensorflow/core/grappler/devices.cc
@@ -47,9 +47,13 @@ int GetNumAvailableGPUs() {
       }
     }
   }
-#endif  // GOOGLE_CUDA
   LOG(INFO) << "Number of eligible GPUs (core count >= 8): "
             << num_eligible_gpus;
+#else
+  LOG(INFO) << "Number of eligible GPUs (core count >= 8): "
+            << num_eligible_gpus
+            << " (Note: TensorFlow was not compiled with CUDA support)";
+#endif  // GOOGLE_CUDA
   return num_eligible_gpus;
 }
 
diff --git a/tensorflow/core/grappler/graph_topology_view.cc b/tensorflow/core/grappler/graph_topology_view.cc
new file mode 100644
index 0000000000000000000000000000000000000000..79e2f9a92fda7bbbc9018f678aeee2b95d763ffc
--- /dev/null
+++ b/tensorflow/core/grappler/graph_topology_view.cc
@@ -0,0 +1,191 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/graph_topology_view.h"
+
+#include <algorithm>
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/inlined_vector.h"
+#include "absl/strings/string_view.h"
+#include "absl/types/optional.h"
+#include "absl/types/span.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+
+namespace tensorflow {
+namespace grappler {
+
+namespace {
+
+template <typename T>
+inline void SortAndRemoveDuplicates(T* v) {
+  std::sort(v->begin(), v->end());
+  v->erase(std::unique(v->begin(), v->end()), v->end());
+}
+
+}  // namespace
+
+Status GraphTopologyView::InitializeFromGraph(
+    const GraphDef& graph,
+    const absl::Span<const GraphView::Edge> ephemeral_edges) {
+  if (graph_ != nullptr) {
+    return errors::InvalidArgument("GraphTopologyView is already initialized.");
+  }
+
+  graph_ = &graph;
+  num_nodes_ = graph.node_size();
+  index_to_node_name_.resize(num_nodes_);
+  node_name_to_index_.rehash(num_nodes_);
+  fanins_.resize(num_nodes_);
+  fanouts_.resize(num_nodes_);
+
+  // Build map from name to index and vice versa.
+  for (int node_idx = 0; node_idx < num_nodes_; ++node_idx) {
+    const NodeDef& node = graph.node(node_idx);
+    node_name_to_index_.emplace(node.name(), node_idx);
+    index_to_node_name_.emplace_back(node.name());
+  }
+
+  // 1. Add ephemeral edges to the adjacency lists.
+  for (const GraphView::Edge& edge : ephemeral_edges) {
+    const auto src = node_name_to_index_.find(edge.src.node->name());
+    const bool valid_src = src != node_name_to_index_.end();
+
+    if (!valid_src) {
+      const string error_message =
+          absl::StrCat("Non-existent src node: ", edge.src.node->name());
+      if (skip_invalid_edges_) {
+        VLOG(0) << "Skip error: " << error_message;
+      } else {
+        return errors::InvalidArgument(error_message);
+      }
+    }
+
+    const auto dst = node_name_to_index_.find(edge.dst.node->name());
+    const bool valid_dst = dst != node_name_to_index_.end();
+
+    if (!valid_dst) {
+      const string error_message =
+          absl::StrCat("Non-existent dst node: ", edge.dst.node->name());
+      if (skip_invalid_edges_) {
+        VLOG(0) << "Skip error: " << error_message;
+      } else {
+        return errors::InvalidArgument(error_message);
+      }
+    }
+
+    if (valid_dst && valid_src) {
+      const int src_idx = src->second;
+      const int dst_idx = dst->second;
+      fanins_[dst_idx].push_back(src_idx);
+      fanouts_[src_idx].push_back(dst_idx);
+    }
+  }
+
+  // 2. Add graph edges to the adjacency lists.
+  for (int node_idx = 0; node_idx < num_nodes_; ++node_idx) {
+    const NodeDef& node = graph.node(node_idx);
+    fanins_[node_idx].reserve(node.input_size());
+
+    for (const string& input : node.input()) {
+      TensorId tensor = ParseTensorName(input);
+      const auto it = node_name_to_index_.find(tensor.node());
+      const bool valid_input = it != node_name_to_index_.end();
+
+      if (!valid_input) {
+        const string error_message = absl::StrCat("Non-existent input ", input,
+                                                  " in node ", node.name());
+        if (skip_invalid_edges_) {
+          VLOG(3) << "Skip error: " << error_message;
+        } else {
+          return errors::InvalidArgument(error_message);
+        }
+      }
+
+      if (valid_input) {
+        const int input_idx = it->second;
+        fanins_[node_idx].push_back(input_idx);
+        fanouts_[input_idx].push_back(node_idx);
+      }
+    }
+
+    // Dedup the input list while it's still hot in cache.
+    SortAndRemoveDuplicates(&fanins_[node_idx]);
+  }
+
+  // Dedup outputs for all the graph nodes.
+  for (int node_idx = 0; node_idx < num_nodes_; ++node_idx) {
+    SortAndRemoveDuplicates(&fanouts_[node_idx]);
+  }
+
+  return Status::OK();
+}
+
+Status GraphTopologyView::InitializeFromGraph(const GraphDef& graph) {
+  return InitializeFromGraph(graph, absl::Span<GraphView::Edge>());
+}
+
+bool GraphTopologyView::HasNode(const absl::string_view node_name) const {
+  DCHECK(is_initialized()) << "GraphTopologyView is not initialized";
+  const auto it = node_name_to_index_.find(node_name);
+  return it != node_name_to_index_.end();
+}
+
+const NodeDef* GraphTopologyView::GetNode(
+    const absl::string_view node_name) const {
+  DCHECK(is_initialized()) << "GraphTopologyView is not initialized";
+  const auto it = node_name_to_index_.find(node_name);
+  return it == node_name_to_index_.end() ? nullptr : &graph_->node(it->second);
+}
+
+const NodeDef* GraphTopologyView::GetNode(int node_idx) const {
+  DCHECK(is_initialized()) << "GraphTopologyView is not initialized";
+  DCHECK(node_idx >= 0 && node_idx < num_nodes_) << "node_idx is out of range";
+  return &graph_->node(node_idx);
+}
+
+const absl::optional<int> GraphTopologyView::GetNodeIndex(
+    const absl::string_view node_name) const {
+  DCHECK(is_initialized()) << "GraphTopologyView is not initialized";
+  const auto it = node_name_to_index_.find(node_name);
+  DCHECK(it != node_name_to_index_.end()) << "Node doesn't exist in a graph";
+  return it == node_name_to_index_.end() ? absl::nullopt
+                                         : absl::make_optional(it->second);
+}
+
+const absl::optional<int> GraphTopologyView::GetNodeIndex(
+    const NodeDef& node) const {
+  return GetNodeIndex(node.name());
+}
+
+const absl::InlinedVector<int, 4>& GraphTopologyView::GetFanin(
+    int node_idx) const {
+  DCHECK(is_initialized()) << "GraphTopologyView is not initialized";
+  const bool is_valid_node_idx = node_idx >= 0 && node_idx < num_nodes_;
+  DCHECK(is_valid_node_idx) << "node_idx is out of range";
+  return is_valid_node_idx ? fanins_[node_idx] : empty_fanin_;
+}
+
+const absl::InlinedVector<int, 2>& GraphTopologyView::GetFanout(
+    int node_idx) const {
+  DCHECK(is_initialized()) << "GraphTopologyView is not initialized";
+  const bool is_valid_node_idx = node_idx >= 0 && node_idx < num_nodes_;
+  DCHECK(is_valid_node_idx) << "node_idx is out of range";
+  return is_valid_node_idx ? fanouts_[node_idx] : empty_fanout_;
+}
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/graph_topology_view.h b/tensorflow/core/grappler/graph_topology_view.h
new file mode 100644
index 0000000000000000000000000000000000000000..c40d0093b9063f4e0dadaa6c607154fdbb4986ab
--- /dev/null
+++ b/tensorflow/core/grappler/graph_topology_view.h
@@ -0,0 +1,111 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_GRAPH_TOPOLOGY_VIEW_H_
+#define TENSORFLOW_CORE_GRAPPLER_GRAPH_TOPOLOGY_VIEW_H_
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/inlined_vector.h"
+#include "absl/strings/string_view.h"
+#include "absl/types/optional.h"
+#include "absl/types/span.h"
+#include "tensorflow/core/graph/tensor_id.h"
+#include "tensorflow/core/grappler/graph_view.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// GraphTopologyView is a helper class to simplify `node-to-node` connectivity
+// traversals. Regular `GraphView` simplifies `tensor-to-tensor` traversals:
+// connections between output tensors and inputs of a consumer nodes. For the
+// topology view we are focused on nodes connected to nodes, and it's irrelevant
+// if this connection is formed by one or multiple individual tensors.
+//
+// Example:
+//   a = Placeholder(..)
+//   b = Placeholder(..)
+//   c = AddN([a, a, b])
+//
+// GraphView edges:         [a:0 -> c:0, a:0 -> c:1, b:0 -> c:3]
+// GraphTopologyView edges: [a -> c, b -> c]
+//
+// GraphView is used for exploring single node fanins and fanouts, and
+// GraphTopologyView is focused on efficient full graph traversals (computing
+// graph node properties from transitive fanouts, etc...).
+class GraphTopologyView {
+ public:
+  GraphTopologyView() = default;
+  explicit GraphTopologyView(bool skip_invalid_edges)
+      : skip_invalid_edges_(skip_invalid_edges) {}
+
+  // Initialize graph topology view from the graph. It's possible to pass
+  // additional edges that do not exist in a graph, but must be respected when
+  // computing graph topology. Example: Tensorflow runtime allows concurrent
+  // execution of dequeue/enqueue ops from the same queue resource, but we might
+  // want to enforce ordering between them for the purpose of graph analysis.
+  Status InitializeFromGraph(const GraphDef& graph,
+                             absl::Span<const GraphView::Edge> ephemeral_edges);
+  Status InitializeFromGraph(const GraphDef& graph);
+
+  bool is_initialized() const { return graph_ != nullptr; }
+  int num_nodes() const { return num_nodes_; }
+  const GraphDef* graph() const { return graph_; }
+
+  // Returns true iff the node exists in the underlying graph.
+  bool HasNode(absl::string_view node_name) const;
+
+  // Finds a node by name or returns `nullptr` if it's not in the graph.
+  const NodeDef* GetNode(absl::string_view node_name) const;
+  // Returns a node corresponding to the given node index.
+  const NodeDef* GetNode(int node_idx) const;
+
+  // Returns a node index for the given node name, if the name exists in the
+  // underlying graph. Otherwise returns empty optional.
+  const absl::optional<int> GetNodeIndex(absl::string_view node_name) const;
+  // Returns a node index for the given node, if the node belongs to the
+  // underlying graph. Otherwise returns empty optional.
+  const absl::optional<int> GetNodeIndex(const NodeDef& node) const;
+
+  // Returns all the node indexes that are in the direct fanin of the given
+  // node. If the `node_idx` is outside of [0, num_nodes_) returns empty vector.
+  const absl::InlinedVector<int, 4>& GetFanin(int node_idx) const;
+  // Returns all the node indexes that are in the direct fanout of the given
+  // node. If the `node_idx` is outside of [0, num_nodes_) returns empty vector.
+  const absl::InlinedVector<int, 2>& GetFanout(int node_idx) const;
+
+ private:
+  // If true, all invalid edges and inputs (srd, dst or input node not found in
+  // a graph) will be skipped, otherwise initialization will fail with error.
+  bool skip_invalid_edges_ = false;
+
+  // WARN: `graph_` must outlive this object and graph nodes must not be
+  // destructed, because node names captured with absl::string_view.
+  const GraphDef* graph_ = nullptr;  // do not own
+  int num_nodes_ = 0;
+  std::vector<absl::string_view> index_to_node_name_;
+  absl::flat_hash_map<absl::string_view, int> node_name_to_index_;
+  std::vector<absl::InlinedVector<int, 4>> fanins_;   // node_idx->input nodes
+  std::vector<absl::InlinedVector<int, 2>> fanouts_;  // node_idx->output nodes
+
+  // We need a valid reference to return from GetFanin/GetFanout if the
+  // `node_idx` argument is outside of the [0, num_nodes_) range.
+  absl::InlinedVector<int, 4> empty_fanin_;
+  absl::InlinedVector<int, 2> empty_fanout_;
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_GRAPH_TOPOLOGY_VIEW_H_
diff --git a/tensorflow/core/grappler/graph_topology_view_test.cc b/tensorflow/core/grappler/graph_topology_view_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..36d3a2017cc5ef965a26b0bdbbbdde441fb633db
--- /dev/null
+++ b/tensorflow/core/grappler/graph_topology_view_test.cc
@@ -0,0 +1,117 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/graph_topology_view.h"
+
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+
+class GraphTopologyViewTest : public ::testing::Test {
+ protected:
+  using NodeConfig = std::pair<string, std::vector<string>>;
+
+  static GraphDef CreateGraph(const std::vector<NodeConfig>& nodes) {
+    GraphDef graph;
+
+    for (const NodeConfig& node : nodes) {
+      const auto& node_name = node.first;
+      const auto& node_inputs = node.second;
+
+      NodeDef node_def;
+      node_def.set_name(node_name);
+      for (const string& input : node_inputs) {
+        node_def.add_input(input);
+      }
+
+      *graph.add_node() = std::move(node_def);
+    }
+
+    return graph;
+  }
+};
+
+TEST_F(GraphTopologyViewTest, SimpleGraph) {
+  const GraphDef graph = CreateGraph({
+      {"a", {}},          // idx: 0
+      {"b", {}},          // idx: 1
+      {"c", {"a", "b"}},  // idx: 2
+      {"d", {"a", "c"}},  // idx: 3
+  });
+
+  GraphTopologyView graph_view;
+  TF_CHECK_OK(graph_view.InitializeFromGraph(graph));
+
+  EXPECT_TRUE(graph_view.is_initialized());
+
+  const NodeDef* a_by_name = graph_view.GetNode("a");
+  const NodeDef* a_by_idx = graph_view.GetNode(0);
+  ASSERT_TRUE(a_by_name);
+  ASSERT_TRUE(a_by_idx);
+  EXPECT_EQ(a_by_name, a_by_idx);
+
+  const NodeDef* b_by_name = graph_view.GetNode("b");
+  const NodeDef* b_by_idx = graph_view.GetNode(1);
+  ASSERT_TRUE(b_by_name);
+  ASSERT_TRUE(b_by_idx);
+  EXPECT_EQ(b_by_name, b_by_idx);
+
+  const absl::optional<int> b_idx = graph_view.GetNodeIndex(*b_by_name);
+  ASSERT_TRUE(b_idx.has_value());
+  EXPECT_EQ(b_idx.value(), 1);
+
+  const absl::optional<int> c_idx = graph_view.GetNodeIndex("c");
+  ASSERT_TRUE(c_idx.has_value());
+  EXPECT_EQ(c_idx.value(), 2);
+
+  using Fanin = absl::InlinedVector<int, 4>;
+  EXPECT_EQ(graph_view.GetFanin(0), Fanin());
+  EXPECT_EQ(graph_view.GetFanin(1), Fanin());
+  EXPECT_EQ(graph_view.GetFanin(2), Fanin({0, 1}));
+  EXPECT_EQ(graph_view.GetFanin(3), Fanin({0, 2}));
+
+  using Fanout = absl::InlinedVector<int, 2>;
+  EXPECT_EQ(graph_view.GetFanout(0), Fanout({2, 3}));
+  EXPECT_EQ(graph_view.GetFanout(1), Fanout({2}));
+  EXPECT_EQ(graph_view.GetFanout(2), Fanout({3}));
+  EXPECT_EQ(graph_view.GetFanout(3), Fanout());
+}
+
+TEST_F(GraphTopologyViewTest, GraphWithALoop) {
+  const GraphDef graph = CreateGraph({
+      {"a", {}},               // idx: 0
+      {"b", {}},               // idx: 1
+      {"c", {"a", "b", "d"}},  // idx: 2 <<<--- 'c' and 'd' have a loop
+      {"d", {"a", "c"}},       // idx: 3
+  });
+
+  GraphTopologyView graph_view;
+  TF_CHECK_OK(graph_view.InitializeFromGraph(graph));
+  EXPECT_TRUE(graph_view.is_initialized());
+
+  using Fanin = absl::InlinedVector<int, 4>;
+  EXPECT_EQ(graph_view.GetFanin(2), Fanin({0, 1, 3}));
+  EXPECT_EQ(graph_view.GetFanin(3), Fanin({0, 2}));
+
+  using Fanout = absl::InlinedVector<int, 2>;
+  EXPECT_EQ(graph_view.GetFanout(2), Fanout({3}));
+  EXPECT_EQ(graph_view.GetFanout(3), Fanout({2}));
+}
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/graph_view.cc b/tensorflow/core/grappler/graph_view.cc
index ba9d2eb32181940bc430771db281c6cea8cb48c4..be9b9c36c71c6f8282862de85a211358fa826186 100644
--- a/tensorflow/core/grappler/graph_view.cc
+++ b/tensorflow/core/grappler/graph_view.cc
@@ -66,28 +66,27 @@ int OpInputPortIdToArgId(const NodeDef& node, const OpDef& op, int port_id) {
 bool HasSingleFanoutNode(const GraphView& graph_view, const NodeDef* node,
                          int port) {
   const auto output = GraphView::OutputPort(node, port);
-  const auto fanout = graph_view.GetFanout(output);
-  return fanout.size() <= 1;
+  return graph_view.GetFanout(output).size() <= 1;
 }
 
 bool HasFanouts(const GraphView& graph_view, const NodeDef* node, int port) {
   const auto output = GraphView::OutputPort(node, port);
-  const auto fanout = graph_view.GetFanout(output);
-  return !fanout.empty();
+  return !graph_view.GetFanout(output).empty();
 }
 
-bool NoControlFanin(const GraphView& graph_view, const NodeDef* node) {
-  const auto control_port = GraphView::InputPort(node, -1);
-  return graph_view.GetFanin(control_port).empty();
+bool HasControlFanin(const GraphView& graph_view, const NodeDef* node) {
+  const auto control_port = GraphView::InputPort(node, Graph::kControlSlot);
+  return !graph_view.GetFanin(control_port).empty();
 }
 
-bool NoControlFanout(const GraphView& graph_view, const NodeDef* node) {
-  const auto control_port = GraphView::OutputPort(node, -1);
-  return graph_view.GetFanout(control_port).empty();
+bool HasControlFanout(const GraphView& graph_view, const NodeDef* node) {
+  const auto control_port = GraphView::OutputPort(node, Graph::kControlSlot);
+  return !graph_view.GetFanout(control_port).empty();
 }
 
-bool NoControlFaninOrFanout(const GraphView& graph_view, const NodeDef* node) {
-  return NoControlFanin(graph_view, node) && NoControlFanout(graph_view, node);
+bool HasControlFaninOrFanout(const GraphView& graph_view, const NodeDef* node) {
+  return HasControlFanin(graph_view, node) ||
+         HasControlFanout(graph_view, node);
 }
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/graph_view.h b/tensorflow/core/grappler/graph_view.h
index 0a47b2256583f35e6ef413b50fdc8eea2bdc978d..63c58a0aede059c6def5eca322ce3c491ea709b7 100644
--- a/tensorflow/core/grappler/graph_view.h
+++ b/tensorflow/core/grappler/graph_view.h
@@ -111,40 +111,52 @@ class GraphViewInternal {
 
   GraphDefT* graph() const { return graph_; }
 
-  // Find a node by name or return `nullptr` if it's not in a graph view.
+  // Finds a node by name or return `nullptr` if it's not in the graph view.
   NodeDefT* GetNode(absl::string_view node_name) const {
     return gtl::FindWithDefault(nodes_, node_name, nullptr);
   }
 
-  // Get the specified input port. Note that the special '-1' port_id can be
+  // Checks if a node by name is in the graph view.
+  bool HasNode(absl::string_view node_name) const {
+    return GetNode(node_name) != nullptr;
+  }
+
+  // Gets the specified input port. Note that the special '-1' port_id can be
   // used to access the controlling nodes (i.e. the nodes connected to node_name
   // through an incoming control dependency).
   InputPort GetInputPort(absl::string_view node_name, int port_id) const {
     return InputPort(GetNode(node_name), port_id);
   }
 
-  // Get the specified output port. Note that the special '-1' port_id can be
+  // Gets the specified output port. Note that the special '-1' port_id can be
   // used to access the controlled nodes (i.e. the nodes connected to node_name
   // through an outgoing control dependency).
   OutputPort GetOutputPort(absl::string_view node_name, int port_id) const {
     return OutputPort(GetNode(node_name), port_id);
   }
 
-  // Get the input (resp. output) port(s) in the immediate fanout (resp. fanin)
-  // of an output (resp. input) port.
+  // Gets the input port(s) in the immediate fanout of an output port.
   const absl::flat_hash_set<InputPort>& GetFanout(
       const OutputPort& port) const {
     return gtl::FindWithDefault(fanouts_, port, fanout_not_found_value_);
   }
 
+  // Gets the output port(s) in the immediate fanin of an input port.
   absl::flat_hash_set<OutputPort> GetFanin(const InputPort& port) const {
-    if (port.port_id >= 0) return {GetRegularFanin(port)};
+    if (port.port_id >= 0) {
+      OutputPort regular_fanin = GetRegularFanin(port);
+      if (regular_fanin.node == nullptr) {
+        return {};
+      }
+      return {regular_fanin};
+    }
 
     // Collect fanin for the control input.
     absl::flat_hash_set<OutputPort> result;
-    for (int i = port.node->input_size() - 1; i >= 0; --i) {
+    const int first_control_port =
+        gtl::FindWithDefault(max_regular_input_port_, port.node, -1) + 1;
+    for (int i = first_control_port; i < port.node->input_size(); ++i) {
       TensorId tensor_id = ParseTensorName(port.node->input(i));
-      if (tensor_id.index() >= 0) break;  // we reached regular inputs
 
       auto it = nodes_.find(tensor_id.node());
       if (it != nodes_.end()) result.emplace(it->second, tensor_id.index());
@@ -153,27 +165,53 @@ class GraphViewInternal {
   }
 
   // Special case: regular (i.e. non-control) input ports can only have one
-  // fanin.
+  // fanin. If port.port_id is out of range or is a control dependency, then an
+  // empty OutputPort is returned.
   const OutputPort GetRegularFanin(const InputPort& port) const {
-    DCHECK_GE(port.port_id, 0);
-    if (port.port_id < 0) return OutputPort();
+    if (port.port_id < 0 ||
+        port.port_id >
+            gtl::FindWithDefault(max_regular_input_port_, port.node, -1)) {
+      return OutputPort();
+    }
 
     TensorId tensor_id = ParseTensorName(port.node->input(port.port_id));
     return GetOutputPort(tensor_id.node(), tensor_id.index());
   }
 
-  // Get all the input (resp. output) ports in the immediate fanout (resp
-  // fanin) of a node. Include the controlling nodes iff
-  // include_controlling_nodes is true.
+  // Checks if a tensor id is a fanin of the node.
+  bool HasFanin(const NodeDefT& node, const TensorId& fanin) const {
+    int end = node.input_size();
+    if (end == 0 || fanin.index() < -1) {
+      return false;
+    }
+
+    const int num_regular_fanins =
+        gtl::FindWithDefault(max_regular_input_port_, &node, -1) + 1;
+    int start = 0;
+    if (fanin.index() > -1) {
+      end = num_regular_fanins;
+    } else {
+      start = num_regular_fanins;
+    }
+    for (int i = start; i < end; ++i) {
+      if (ParseTensorName(node.input(i)) == fanin) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  // Gets all the input ports in the immediate fanout of a node. Include the
+  // controlled nodes iff include_controlled_nodes is true.
   absl::flat_hash_set<InputPort> GetFanouts(
-      const NodeDef& node, bool include_controlled_nodes) const {
+      const NodeDefT& node, bool include_controlled_nodes) const {
     absl::flat_hash_set<InputPort> result;
 
     OutputPort port;
     port.node = const_cast<NodeDefT*>(&node);
     const int first_port_id = include_controlled_nodes ? -1 : 0;
     const int last_port_id =
-        gtl::FindWithDefault(max_regular_output_port_, port.node, -1);
+        gtl::FindWithDefault(max_regular_output_port_, &node, -1);
 
     for (int i = first_port_id; i <= last_port_id; ++i) {
       port.port_id = i;
@@ -185,12 +223,17 @@ class GraphViewInternal {
     return result;
   }
 
+  // Gets all the output ports in the immediate fanin of a node. Include the
+  // controlling nodes iff include_controlling_nodes is true.
   absl::flat_hash_set<OutputPort> GetFanins(
-      const NodeDef& node, bool include_controlling_nodes) const {
+      const NodeDefT& node, bool include_controlling_nodes) const {
     absl::flat_hash_set<OutputPort> result;
-    for (int i = 0; i < node.input_size(); ++i) {
+    const int max_input_port =
+        include_controlling_nodes
+            ? node.input_size() - 1
+            : gtl::FindWithDefault(max_regular_input_port_, &node, -1);
+    for (int i = 0; i <= max_input_port; ++i) {
       TensorId tensor_id = ParseTensorName(node.input(i));
-      if (tensor_id.index() < 0 && !include_controlling_nodes) break;
 
       auto it = nodes_.find(tensor_id.node());
       if (it != nodes_.end()) result.emplace(it->second, tensor_id.index());
@@ -198,29 +241,25 @@ class GraphViewInternal {
     return result;
   }
 
-  // Get the number of ports in the immediate fanin of a node. Count the
+  // Gets the number of ports in the immediate fanin of a node. Count the
   // controlling nodes iff include_controlling_nodes is true.
-  int NumFanins(const NodeDef& node, bool include_controlling_nodes) const {
-    int count = 0;
-    for (const string& input : node.input()) {
-      if (!include_controlling_nodes && IsControlInput(input)) {
-        break;
-      }
-      count += 1;
+  int NumFanins(const NodeDefT& node, bool include_controlling_nodes) const {
+    if (include_controlling_nodes) {
+      return node.input_size();
     }
-    return count;
+    return gtl::FindWithDefault(max_regular_input_port_, &node, -1) + 1;
   }
 
-  // Get the number of ports in the immediate fanout of a node. Count the
-  // controlling nodes iff include_controlling_nodes is true.
-  int NumFanouts(const NodeDef& node, bool include_controlling_nodes) const {
+  // Gets the number of ports in the immediate fanout of a node. Count the
+  // controlled nodes iff include_controlled_nodes is true.
+  int NumFanouts(const NodeDefT& node, bool include_controlled_nodes) const {
     int count = 0;
 
     OutputPort port;
     port.node = const_cast<NodeDefT*>(&node);
-    const int first_port_id = include_controlling_nodes ? -1 : 0;
+    const int first_port_id = include_controlled_nodes ? -1 : 0;
     const int last_port_id =
-        gtl::FindWithDefault(max_regular_output_port_, port.node, -1);
+        gtl::FindWithDefault(max_regular_output_port_, &node, -1);
 
     for (int i = first_port_id; i <= last_port_id; ++i) {
       port.port_id = i;
@@ -231,10 +270,10 @@ class GraphViewInternal {
     return count;
   }
 
-  // Get all the edges in the immediate fanout (resp fanin) of a node.
-  // Include the control edges iff include_controlling_edges is true.
+  // Gets all the edges in the immediate fanout of a node. Include the
+  // controlled edges iff include_controlled_edges is true.
   absl::flat_hash_set<Edge> GetFanoutEdges(
-      const NodeDef& node, bool include_controlled_edges) const {
+      const NodeDefT& node, bool include_controlled_edges) const {
     absl::flat_hash_set<Edge> result;
 
     OutputPort port;
@@ -248,25 +287,29 @@ class GraphViewInternal {
       auto it = fanouts_.find(port);
       if (it != fanouts_.end()) {
         for (auto itr = it->second.begin(); itr != it->second.end(); ++itr) {
-          result.emplace(/*src*/ OutputPort(const_cast<NodeDefT*>(&node), i),
-                         /*dst*/ *itr);
+          result.emplace(/*src=*/port, /*dst=*/*itr);
         }
       }
     }
     return result;
   }
 
+  // Gets all the edges in the immediate fanin of a node. Include the
+  // controlling edges iff include_controlling_edges is true.
   absl::flat_hash_set<Edge> GetFaninEdges(
-      const NodeDef& node, bool include_controlling_edges) const {
+      const NodeDefT& node, bool include_controlling_edges) const {
     absl::flat_hash_set<Edge> result;
-    for (int i = 0; i < node.input_size(); ++i) {
+    const int max_input_port =
+        include_controlling_edges
+            ? node.input_size() - 1
+            : gtl::FindWithDefault(max_regular_input_port_, &node, -1);
+    for (int i = 0; i <= max_input_port; ++i) {
       TensorId tensor_id = ParseTensorName(node.input(i));
-      if (tensor_id.index() < 0 && !include_controlling_edges) break;
 
       auto it = nodes_.find(tensor_id.node());
       if (it != nodes_.end()) {
-        result.emplace(/*src*/ OutputPort(it->second, tensor_id.index()),
-                       /*dst*/ InputPort(const_cast<NodeDefT*>(&node), i));
+        result.emplace(/*src=*/OutputPort(it->second, tensor_id.index()),
+                       /*dst=*/InputPort(const_cast<NodeDefT*>(&node), i));
       }
     }
     return result;
@@ -275,14 +318,24 @@ class GraphViewInternal {
  protected:
   explicit GraphViewInternal(GraphDefT* graph) : graph_(graph) {}
 
+  Status AddUniqueNode(NodeDefT* node) {
+    auto inserted = nodes_.emplace(node->name(), node);
+    return inserted.second
+               ? Status::OK()
+               : errors::InvalidArgument("Non unique node name detected: ",
+                                         node->name());
+  }
+
+  // TODO(ezhulenev): Remove this function.
   void AddUniqueNodeOrDie(NodeDefT* node) {
-    auto result = nodes_.emplace(node->name(), node);
-    // TODO(ezhulenev): Replace CHECK with factory method returning
-    // absl::StatusOr (when available).
-    CHECK(result.second) << "Non unique node name detected: " << node->name();
+    Status st = AddUniqueNode(node);
+    CHECK(st.ok()) << st.error_message();
   }
 
+  // TODO(lyandy): Checks for self loops, Switch control dependencies, fanins
+  // exist, and all regular fanins come before controlling fanins.
   void AddFanouts(NodeDefT* node) {
+    int max_input_port = -1;
     for (int i = 0; i < node->input_size(); ++i) {
       TensorId tensor_id = ParseTensorName(node->input(i));
       OutputPort output(nodes_[tensor_id.node()], tensor_id.index());
@@ -290,11 +343,15 @@ class GraphViewInternal {
       if (output.port_id < 0) {
         fanouts_[output].emplace(node, -1);
       } else {
+        max_input_port = i;
         max_regular_output_port_[output.node] =
             std::max(max_regular_output_port_[output.node], output.port_id);
         fanouts_[output].emplace(node, i);
       }
     }
+    if (max_input_port > -1) {
+      max_regular_input_port_[node] = max_input_port;
+    }
   }
 
   // Access to the mutable internal state for MutableGraphView.
@@ -304,7 +361,11 @@ class GraphViewInternal {
     return fanouts_;
   }
 
-  absl::flat_hash_map<const NodeDef*, int>& max_regular_output_port() {
+  absl::flat_hash_map<const NodeDefT*, int>& max_regular_input_port() {
+    return max_regular_input_port_;
+  }
+
+  absl::flat_hash_map<const NodeDefT*, int>& max_regular_output_port() {
     return max_regular_output_port_;
   }
 
@@ -317,10 +378,13 @@ class GraphViewInternal {
   // A mapping from the output port to all inputs that read from it.
   absl::flat_hash_map<OutputPort, absl::flat_hash_set<InputPort>> fanouts_;
 
+  // Keep a maximum index of input tensors of the node.
+  absl::flat_hash_map<const NodeDefT*, int> max_regular_input_port_;
+
   // Keep a maximum index of tensor fetched from the node. It doesn't guarantee
   // that all tensors in the [0, max_regular_output_port] range are actually
   // fetched by other nodes.
-  absl::flat_hash_map<const NodeDef*, int> max_regular_output_port_;
+  absl::flat_hash_map<const NodeDefT*, int> max_regular_output_port_;
 
   // If the node has no fanouts at given output port (output tensor consumers)
   // we return a reference to this set from `GetFanout` (we can't construct new
@@ -348,10 +412,12 @@ bool HasSingleFanoutNode(const GraphView& graph_view, const NodeDef* node,
 
 // Returns true if node has at least one fanout node at given output port.
 bool HasFanouts(const GraphView& graph_view, const NodeDef* node, int port = 0);
-
-bool NoControlFanin(const GraphView& graph_view, const NodeDef* node);
-bool NoControlFanout(const GraphView& graph_view, const NodeDef* node);
-bool NoControlFaninOrFanout(const GraphView& graph_view, const NodeDef* node);
+// Returns true if the node has at least one input control dependency.
+bool HasControlFanin(const GraphView& graph_view, const NodeDef* node);
+// Returns true if the node has at least one output control dependency.
+bool HasControlFanout(const GraphView& graph_view, const NodeDef* node);
+// Returns true if the node has at least one input or output control dependency.
+bool HasControlFaninOrFanout(const GraphView& graph_view, const NodeDef* node);
 
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/graph_view_test.cc b/tensorflow/core/grappler/graph_view_test.cc
index cbf859a4a99d7c434a4a65185c8962ea539c1aed..839057065b4e3f13dc55b9c0a7ddcfd94a165376 100644
--- a/tensorflow/core/grappler/graph_view_test.cc
+++ b/tensorflow/core/grappler/graph_view_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -42,26 +43,24 @@ TEST_F(GraphViewTest, OpPortIdToArgIdShapeN) {
 
   const OpDef* a_op_def = nullptr;
   const OpDef* b_op_def = nullptr;
-  EXPECT_TRUE(
-      OpRegistry::Global()->LookUpOpDef(a_node_def.op(), &a_op_def).ok());
-  EXPECT_TRUE(
-      OpRegistry::Global()->LookUpOpDef(b_node_def.op(), &b_op_def).ok());
+  TF_EXPECT_OK(OpRegistry::Global()->LookUpOpDef(a_node_def.op(), &a_op_def));
+  TF_EXPECT_OK(OpRegistry::Global()->LookUpOpDef(b_node_def.op(), &b_op_def));
 
   // Const has 0 inputs, 1 output.
-  EXPECT_EQ(-1, OpInputPortIdToArgId(a_node_def, *a_op_def, 0));
-  EXPECT_EQ(0, OpOutputPortIdToArgId(a_node_def, *a_op_def, 0));
-  EXPECT_EQ(-1, OpOutputPortIdToArgId(a_node_def, *a_op_def, 1));
+  EXPECT_EQ(OpInputPortIdToArgId(a_node_def, *a_op_def, 0), -1);
+  EXPECT_EQ(OpOutputPortIdToArgId(a_node_def, *a_op_def, 0), 0);
+  EXPECT_EQ(OpOutputPortIdToArgId(a_node_def, *a_op_def, 1), -1);
 
   // ShapeN has N=3 inputs and outputs.
-  EXPECT_EQ(0, OpInputPortIdToArgId(b_node_def, *b_op_def, 0));
-  EXPECT_EQ(0, OpInputPortIdToArgId(b_node_def, *b_op_def, 1));
-  EXPECT_EQ(0, OpInputPortIdToArgId(b_node_def, *b_op_def, 2));
-  EXPECT_EQ(-1, OpInputPortIdToArgId(b_node_def, *b_op_def, 3));
-  EXPECT_EQ(0, OpOutputPortIdToArgId(b_node_def, *b_op_def, 0));
-  EXPECT_EQ(0, OpOutputPortIdToArgId(b_node_def, *b_op_def, 1));
-  EXPECT_EQ(0, OpOutputPortIdToArgId(b_node_def, *b_op_def, 2));
-  EXPECT_EQ(-1, OpOutputPortIdToArgId(b_node_def, *b_op_def, 3));
-  EXPECT_EQ(-1, OpOutputPortIdToArgId(b_node_def, *b_op_def, 4));
+  EXPECT_EQ(OpInputPortIdToArgId(b_node_def, *b_op_def, 0), 0);
+  EXPECT_EQ(OpInputPortIdToArgId(b_node_def, *b_op_def, 1), 0);
+  EXPECT_EQ(OpInputPortIdToArgId(b_node_def, *b_op_def, 2), 0);
+  EXPECT_EQ(OpInputPortIdToArgId(b_node_def, *b_op_def, 3), -1);
+  EXPECT_EQ(OpOutputPortIdToArgId(b_node_def, *b_op_def, 0), 0);
+  EXPECT_EQ(OpOutputPortIdToArgId(b_node_def, *b_op_def, 1), 0);
+  EXPECT_EQ(OpOutputPortIdToArgId(b_node_def, *b_op_def, 2), 0);
+  EXPECT_EQ(OpOutputPortIdToArgId(b_node_def, *b_op_def, 3), -1);
+  EXPECT_EQ(OpOutputPortIdToArgId(b_node_def, *b_op_def, 4), -1);
 }
 
 TEST_F(GraphViewTest, OpPortIdToArgIdSparseSplit) {
@@ -76,22 +75,21 @@ TEST_F(GraphViewTest, OpPortIdToArgIdSparseSplit) {
 
     const NodeDef& b_node_def = *graph_view.GetNode("b");
     const OpDef* b_op_def = nullptr;
-    EXPECT_TRUE(
-        OpRegistry::Global()->LookUpOpDef(b_node_def.op(), &b_op_def).ok());
+    TF_EXPECT_OK(OpRegistry::Global()->LookUpOpDef(b_node_def.op(), &b_op_def));
 
     // We have 4 inputs.
-    EXPECT_EQ(0, OpInputPortIdToArgId(b_node_def, *b_op_def, 0));
-    EXPECT_EQ(1, OpInputPortIdToArgId(b_node_def, *b_op_def, 1));
-    EXPECT_EQ(2, OpInputPortIdToArgId(b_node_def, *b_op_def, 2));
-    EXPECT_EQ(3, OpInputPortIdToArgId(b_node_def, *b_op_def, 3));
-    EXPECT_EQ(-1, OpInputPortIdToArgId(b_node_def, *b_op_def, 4));
+    EXPECT_EQ(OpInputPortIdToArgId(b_node_def, *b_op_def, 0), 0);
+    EXPECT_EQ(OpInputPortIdToArgId(b_node_def, *b_op_def, 1), 1);
+    EXPECT_EQ(OpInputPortIdToArgId(b_node_def, *b_op_def, 2), 2);
+    EXPECT_EQ(OpInputPortIdToArgId(b_node_def, *b_op_def, 3), 3);
+    EXPECT_EQ(OpInputPortIdToArgId(b_node_def, *b_op_def, 4), -1);
 
     for (int port_id = 0; port_id <= num_splits * 3; ++port_id) {
       int arg_id = -1;
       if (port_id < num_splits * 3) {
         arg_id = port_id / num_splits;
       }
-      EXPECT_EQ(arg_id, OpOutputPortIdToArgId(b_node_def, *b_op_def, port_id));
+      EXPECT_EQ(OpOutputPortIdToArgId(b_node_def, *b_op_def, port_id), arg_id);
     }
   }
 }
@@ -110,18 +108,17 @@ TEST_F(GraphViewTest, ParseSingleExample) {
   const NodeDef& c_node_def = *graph_view.GetNode("c");
 
   const OpDef* c_op_def = nullptr;
-  EXPECT_TRUE(
-      OpRegistry::Global()->LookUpOpDef(c_node_def.op(), &c_op_def).ok());
-
-  EXPECT_EQ(0, OpOutputPortIdToArgId(c_node_def, *c_op_def, 0));
-  EXPECT_EQ(0, OpOutputPortIdToArgId(c_node_def, *c_op_def, 1));
-  EXPECT_EQ(1, OpOutputPortIdToArgId(c_node_def, *c_op_def, 2));
-  EXPECT_EQ(1, OpOutputPortIdToArgId(c_node_def, *c_op_def, 3));
-  EXPECT_EQ(2, OpOutputPortIdToArgId(c_node_def, *c_op_def, 4));
-  EXPECT_EQ(2, OpOutputPortIdToArgId(c_node_def, *c_op_def, 5));
-  EXPECT_EQ(3, OpOutputPortIdToArgId(c_node_def, *c_op_def, 6));
-  EXPECT_EQ(3, OpOutputPortIdToArgId(c_node_def, *c_op_def, 7));
-  EXPECT_EQ(-1, OpOutputPortIdToArgId(c_node_def, *c_op_def, 8));
+  TF_EXPECT_OK(OpRegistry::Global()->LookUpOpDef(c_node_def.op(), &c_op_def));
+
+  EXPECT_EQ(OpOutputPortIdToArgId(c_node_def, *c_op_def, 0), 0);
+  EXPECT_EQ(OpOutputPortIdToArgId(c_node_def, *c_op_def, 1), 0);
+  EXPECT_EQ(OpOutputPortIdToArgId(c_node_def, *c_op_def, 2), 1);
+  EXPECT_EQ(OpOutputPortIdToArgId(c_node_def, *c_op_def, 3), 1);
+  EXPECT_EQ(OpOutputPortIdToArgId(c_node_def, *c_op_def, 4), 2);
+  EXPECT_EQ(OpOutputPortIdToArgId(c_node_def, *c_op_def, 5), 2);
+  EXPECT_EQ(OpOutputPortIdToArgId(c_node_def, *c_op_def, 6), 3);
+  EXPECT_EQ(OpOutputPortIdToArgId(c_node_def, *c_op_def, 7), 3);
+  EXPECT_EQ(OpOutputPortIdToArgId(c_node_def, *c_op_def, 8), -1);
 }
 
 TEST_F(GraphViewTest, BasicGraph) {
@@ -132,26 +129,26 @@ TEST_F(GraphViewTest, BasicGraph) {
   GraphView graph(&item.graph);
 
   GraphView::InputPort input = graph.GetInputPort("AddN", 0);
-  EXPECT_EQ("AddN", input.node->name());
-  EXPECT_EQ(0, input.port_id);
+  EXPECT_EQ(input.node->name(), "AddN");
+  EXPECT_EQ(input.port_id, 0);
   GraphView::OutputPort fanin = graph.GetRegularFanin(input);
-  EXPECT_EQ("Square", fanin.node->name());
-  EXPECT_EQ(0, fanin.port_id);
+  EXPECT_EQ(fanin.node->name(), "Square");
+  EXPECT_EQ(fanin.port_id, 0);
 
   input = graph.GetInputPort("AddN", 1);
-  EXPECT_EQ("AddN", input.node->name());
-  EXPECT_EQ(1, input.port_id);
+  EXPECT_EQ(input.node->name(), "AddN");
+  EXPECT_EQ(input.port_id, 1);
   fanin = graph.GetRegularFanin(input);
-  EXPECT_EQ("Square_1", fanin.node->name());
-  EXPECT_EQ(0, fanin.port_id);
+  EXPECT_EQ(fanin.node->name(), "Square_1");
+  EXPECT_EQ(fanin.port_id, 0);
 
   GraphView::OutputPort output = graph.GetOutputPort("AddN", 0);
-  EXPECT_EQ("AddN", output.node->name());
-  EXPECT_EQ(0, output.port_id);
-  EXPECT_EQ(2, graph.GetFanout(output).size());
+  EXPECT_EQ(output.node->name(), "AddN");
+  EXPECT_EQ(output.port_id, 0);
+  EXPECT_EQ(graph.GetFanout(output).size(), 2);
   for (auto fanout : graph.GetFanout(output)) {
     if (fanout.node->name() == "AddN_2" || fanout.node->name() == "AddN_3") {
-      EXPECT_EQ(0, fanout.port_id);
+      EXPECT_EQ(fanout.port_id, 0);
     } else {
       // Invalid fanout
       EXPECT_FALSE(true);
@@ -159,7 +156,7 @@ TEST_F(GraphViewTest, BasicGraph) {
   }
 
   const NodeDef* add_node = graph.GetNode("AddN");
-  EXPECT_NE(nullptr, add_node);
+  EXPECT_NE(add_node, nullptr);
 
   absl::flat_hash_set<string> fanouts;
   absl::flat_hash_set<string> expected_fanouts = {"AddN_2:0", "AddN_3:0"};
@@ -190,44 +187,106 @@ TEST_F(GraphViewTest, ControlDependencies) {
   GraphView graph(&item.graph);
 
   GraphView::OutputPort output = graph.GetOutputPort("a", -1);
-  EXPECT_EQ("a", output.node->name());
-  EXPECT_EQ(-1, output.port_id);
+  EXPECT_EQ(output.node->name(), "a");
+  EXPECT_EQ(output.port_id, -1);
   auto fanout = graph.GetFanout(output);
-  EXPECT_EQ(1, fanout.size());
-  EXPECT_EQ("d", (*fanout.begin()).node->name());
-  EXPECT_EQ(-1, (*fanout.begin()).port_id);
+  EXPECT_EQ(fanout.size(), 1);
+  EXPECT_EQ((*fanout.begin()).node->name(), "d");
+  EXPECT_EQ((*fanout.begin()).port_id, -1);
 
   output = graph.GetOutputPort("a", 0);
-  EXPECT_EQ("a", output.node->name());
-  EXPECT_EQ(0, output.port_id);
+  EXPECT_EQ(output.node->name(), "a");
+  EXPECT_EQ(output.port_id, 0);
   fanout = graph.GetFanout(output);
-  EXPECT_EQ(1, fanout.size());
-  EXPECT_EQ("b", (*fanout.begin()).node->name());
-  EXPECT_EQ(0, (*fanout.begin()).port_id);
+  EXPECT_EQ(fanout.size(), 1);
+  EXPECT_EQ((*fanout.begin()).node->name(), "b");
+  EXPECT_EQ((*fanout.begin()).port_id, 0);
 
   GraphView::InputPort input = graph.GetInputPort("d", -1);
-  EXPECT_EQ("d", input.node->name());
-  EXPECT_EQ(-1, input.port_id);
+  EXPECT_EQ(input.node->name(), "d");
+  EXPECT_EQ(input.port_id, -1);
   auto fanin = graph.GetFanin(input);
-  EXPECT_EQ(1, fanin.size());
-  EXPECT_EQ("a", (*fanin.begin()).node->name());
-  EXPECT_EQ(-1, (*fanin.begin()).port_id);
+  EXPECT_EQ(fanin.size(), 1);
+  EXPECT_EQ((*fanin.begin()).node->name(), "a");
+  EXPECT_EQ((*fanin.begin()).port_id, -1);
 
   input = graph.GetInputPort("d", 0);
-  EXPECT_EQ("d", input.node->name());
-  EXPECT_EQ(0, input.port_id);
+  EXPECT_EQ(input.node->name(), "d");
+  EXPECT_EQ(input.port_id, 0);
   fanin = graph.GetFanin(input);
-  EXPECT_EQ(1, fanin.size());
-  EXPECT_EQ("b", (*fanin.begin()).node->name());
-  EXPECT_EQ(0, (*fanin.begin()).port_id);
+  EXPECT_EQ(fanin.size(), 1);
+  EXPECT_EQ((*fanin.begin()).node->name(), "b");
+  EXPECT_EQ((*fanin.begin()).port_id, 0);
 
   input = graph.GetInputPort("d", 1);
-  EXPECT_EQ("d", input.node->name());
-  EXPECT_EQ(1, input.port_id);
+  EXPECT_EQ(input.node->name(), "d");
+  EXPECT_EQ(input.port_id, 1);
   fanin = graph.GetFanin(input);
-  EXPECT_EQ(1, fanin.size());
-  EXPECT_EQ("c", (*fanin.begin()).node->name());
-  EXPECT_EQ(0, (*fanin.begin()).port_id);
+  EXPECT_EQ(fanin.size(), 1);
+  EXPECT_EQ((*fanin.begin()).node->name(), "c");
+  EXPECT_EQ((*fanin.begin()).port_id, 0);
+}
+
+TEST_F(GraphViewTest, HasNode) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output a = ops::Const(s.WithOpName("a"), 0.0f, {10, 10});
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  GraphView graph(&item.graph);
+
+  EXPECT_EQ(graph.HasNode("a"), true);
+  EXPECT_EQ(graph.HasNode("b"), false);
+}
+
+TEST_F(GraphViewTest, HasFanin) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output a = ops::Const(s.WithOpName("a"), 0.0f, {10, 10});
+  Output b = ops::Square(s.WithOpName("b"), {a});
+  Output c = ops::Sqrt(s.WithOpName("c"), {b});
+  Output d = ops::AddN(s.WithOpName("d").WithControlDependencies(a), {b, c});
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  GraphView graph(&item.graph);
+
+  const NodeDef* d_node = graph.GetNode("d");
+  EXPECT_NE(d_node, nullptr);
+
+  EXPECT_EQ(graph.HasFanin(*d_node, {"a", Graph::kControlSlot}), true);
+  EXPECT_EQ(graph.HasFanin(*d_node, {"a", 0}), false);
+  EXPECT_EQ(graph.HasFanin(*d_node, {"b", 0}), true);
+  EXPECT_EQ(graph.HasFanin(*d_node, {"b", Graph::kControlSlot}), false);
+  EXPECT_EQ(graph.HasFanin(*d_node, {"c", 0}), true);
+  EXPECT_EQ(graph.HasFanin(*d_node, {"c", Graph::kControlSlot}), false);
+}
+
+TEST_F(GraphViewTest, GetRegularFaninPortOutOfBounds) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output a = ops::Const(s.WithOpName("a"), 0.0f, {10, 10});
+  Output b = ops::Square(s.WithOpName("b"), {});
+  Output c = ops::Sqrt(s.WithOpName("c"), {b});
+  Output d = ops::AddN(s.WithOpName("d").WithControlDependencies(a), {b, c});
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  GraphView graph(&item.graph);
+
+  const NodeDef* b_node = graph.GetNode("b");
+  EXPECT_NE(b_node, nullptr);
+  const NodeDef* c_node = graph.GetNode("c");
+  EXPECT_NE(c_node, nullptr);
+  const NodeDef* d_node = graph.GetNode("d");
+  EXPECT_NE(d_node, nullptr);
+
+  auto d_output_0 = graph.GetRegularFanin({d_node, 0});
+  EXPECT_EQ(d_output_0, GraphView::OutputPort(b_node, 0));
+  auto d_output_1 = graph.GetRegularFanin({d_node, 1});
+  EXPECT_EQ(d_output_1, GraphView::OutputPort(c_node, 0));
+  auto d_output_2 = graph.GetRegularFanin({d_node, 2});
+  EXPECT_EQ(d_output_2, GraphView::OutputPort());
+  auto d_output_control = graph.GetRegularFanin({d_node, Graph::kControlSlot});
+  EXPECT_EQ(d_output_control, GraphView::OutputPort());
 }
 
 }  // namespace
diff --git a/tensorflow/core/grappler/grappler_item.cc b/tensorflow/core/grappler/grappler_item.cc
index 74bde67f198f8c6d31273861cf9b35537909447c..bc95c9cf72ab06ce8f3ed0126ad42f62cfe2ace7 100644
--- a/tensorflow/core/grappler/grappler_item.cc
+++ b/tensorflow/core/grappler/grappler_item.cc
@@ -43,7 +43,7 @@ GrapplerItem GrapplerItem::WithGraph(GraphDef&& graph_def) const {
   item.save_restore_loc_tensor = save_restore_loc_tensor;
   item.queue_runners = queue_runners;
   item.devices_ = devices_;
-  item.allowed_optimizations_ = allowed_optimizations_;
+  item.optimization_options_ = optimization_options_;
   item.graph.Swap(&graph_def);
   return item;
 }
@@ -114,6 +114,18 @@ std::unordered_set<string> GrapplerItem::NodesToPreserve() const {
       result.insert(NodeName(queue_runner.cancel_op_name()));
     }
   }
+
+  // Tensorflow functions do not prune stateful or dataset-output ops from
+  // the function body (see PruneFunctionBody in common_runtime/function.cc).
+  if (!optimization_options_.allow_pruning_stateful_and_dataset_ops) {
+    FunctionLibraryDefinition fn_library(OpRegistry::Global(), graph.library());
+    for (const NodeDef& node : graph.node()) {
+      if (IsStateful(node, &fn_library) || IsDataset(node)) {
+        result.insert(node.name());
+      }
+    }
+  }
+
   return result;
 }
 
@@ -166,13 +178,13 @@ Status GrapplerItem::InferDevicesFromGraph() {
 
 void GrapplerItem::ClearDevices() { devices_.clear(); }
 
-const GrapplerItem::AllowedOptimizations& GrapplerItem::allowed_optimizations()
+const GrapplerItem::OptimizationOptions& GrapplerItem::optimization_options()
     const {
-  return allowed_optimizations_;
+  return optimization_options_;
 }
 
-GrapplerItem::AllowedOptimizations& GrapplerItem::allowed_optimizations() {
-  return allowed_optimizations_;
+GrapplerItem::OptimizationOptions& GrapplerItem::optimization_options() {
+  return optimization_options_;
 }
 
 std::vector<const NodeDef*> ComputeTransitiveFanin(
diff --git a/tensorflow/core/grappler/grappler_item.h b/tensorflow/core/grappler/grappler_item.h
index 9051542988c4261aacb5fc25c8e6e2f1d35adfa0..57949b322d61273d607b50c27d995db79cbc9391 100644
--- a/tensorflow/core/grappler/grappler_item.h
+++ b/tensorflow/core/grappler/grappler_item.h
@@ -81,18 +81,23 @@ struct GrapplerItem {
   // fetch nodes, keep_ops, init_ops.
   std::unordered_set<string> NodesToPreserve() const;
 
-  // Restrict types of optimizations that are allowed for this GrapplerItem.
-  struct AllowedOptimizations {
+  struct OptimizationOptions {
     // Is it allowed to add nodes to the graph that do not have registered
     // gradient function.
-    bool non_differentiable_rewrites = true;
-    // By default we are not allowed to inline ops with side effects into the
-    // main graph, because we can't guarantee that after pruning these ops will
-    // be executed. However if we are optimizing a function library (see
-    // meta_optimizer.cc) and a graph was instantiated by a function definition,
-    // we can do that, because functions guarantee that all side effects will be
-    // executed (see function_optimizer.cc for details).
-    bool inline_ops_with_side_effects = false;
+    bool allow_non_differentiable_rewrites = true;
+
+    // Tensorflow function execution semantics is slightly different from the
+    // main Tensorflow graph, and we need to make sure that we do not change it
+    // by running Grappler optimizer passes. One main difference is that
+    // functions do not prune ops with side-effects and dataset-output ops (see
+    // PruneFunctionBody in common_runtime/function.cc).
+    bool allow_pruning_stateful_and_dataset_ops = true;
+
+    // If true Grappler will optimize the main graph, and also all functions in
+    // the graph function library (function can't be polymorphic, it can't have
+    // undefined type parameters in the function signature, or placeholder
+    // attributes in the function body).
+    bool optimize_function_library = true;
   };
 
   const std::unordered_set<string>& devices() const;
@@ -109,8 +114,8 @@ struct GrapplerItem {
   // Clears a set of available devices.
   void ClearDevices();
 
-  const AllowedOptimizations& allowed_optimizations() const;
-  AllowedOptimizations& allowed_optimizations();
+  const OptimizationOptions& optimization_options() const;
+  OptimizationOptions& optimization_options();
 
  private:
   // TODO(ezhulenev) Make GrapplerItem a class and hide all public data members.
@@ -121,7 +126,7 @@ struct GrapplerItem {
   // Example of a fully defined name: "/job:work/replica:1/task:1/device:CPU:0"
   std::unordered_set<string> devices_;
 
-  AllowedOptimizations allowed_optimizations_;
+  OptimizationOptions optimization_options_;
 };
 
 // Return the transitive fanin of a set of terminal nodes.
diff --git a/tensorflow/core/grappler/grappler_item_builder.cc b/tensorflow/core/grappler/grappler_item_builder.cc
index 9224ee7849211f849c3655d6faea18dcc32b8e17..fc55fb5b3d2f905fc0fab837a9345b7e396acd13 100644
--- a/tensorflow/core/grappler/grappler_item_builder.cc
+++ b/tensorflow/core/grappler/grappler_item_builder.cc
@@ -103,7 +103,12 @@ Status OptimizeGraph(const GraphDef& graph_def_arg, GraphDef* output_graph_def,
 
   // Instantiate all variables for function library runtime creation.
   std::vector<std::unique_ptr<Device>> devices;
-  TF_RETURN_IF_ERROR(DeviceFactory::AddDevices(
+  // Only CPU device is used so instead of calling DeviceFactory::AddDevices()
+  // with dummy session config, which will conflict with user defined options
+  // and create unwanted devices, call cpu_factory->CreateDevices() to get CPU
+  // only devices.
+  DeviceFactory* cpu_factory = DeviceFactory::GetFactory("CPU");
+  TF_RETURN_IF_ERROR(cpu_factory->CreateDevices(
       options, "/job:localhost/replica:0/task:0", &devices));
   Device* cpu_device = devices[0].get();
   std::unique_ptr<DeviceMgr> dvc_mgr(new DeviceMgr(std::move(devices)));
diff --git a/tensorflow/core/grappler/inputs/BUILD b/tensorflow/core/grappler/inputs/BUILD
index ffa204028cca828147810c99277fdcd9cb05f5ee..286c30cd356baf408bb227236d9369f81ab8b1ad 100644
--- a/tensorflow/core/grappler/inputs/BUILD
+++ b/tensorflow/core/grappler/inputs/BUILD
@@ -49,7 +49,11 @@ cc_library(
     deps = [
         ":input_yielder",
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:functional_ops_op_lib",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:no_op_op_lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:sendrecv_ops_op_lib",
         "//tensorflow/core:tensorflow",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/kernels:aggregate_ops",
diff --git a/tensorflow/core/grappler/mutable_graph_view.cc b/tensorflow/core/grappler/mutable_graph_view.cc
index 1a4754153bca9bb7ee019b9b9ea67e6ce3cb5f89..6a5e60edccee09f861a66d3ec94e7f7a5297031d 100644
--- a/tensorflow/core/grappler/mutable_graph_view.cc
+++ b/tensorflow/core/grappler/mutable_graph_view.cc
@@ -14,14 +14,405 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/mutable_graph_view.h"
+
+#include <algorithm>
+#include <utility>
+
+#include "absl/container/flat_hash_map.h"
 #include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
+#include "absl/strings/string_view.h"
 #include "absl/strings/substitute.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/tensor_id.h"
+#include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 namespace grappler {
 
+namespace {
+
+bool IsTensorIdPortValid(const TensorId& tensor_id) {
+  return tensor_id.index() >= Graph::kControlSlot;
+}
+
+bool IsTensorIdRegular(const TensorId& tensor_id) {
+  return tensor_id.index() > Graph::kControlSlot;
+}
+
+bool IsTensorIdControlling(const TensorId& tensor_id) {
+  return tensor_id.index() == Graph::kControlSlot;
+}
+
+bool IsOutputPortControlling(const MutableGraphView::OutputPort& port) {
+  return port.port_id == Graph::kControlSlot;
+}
+
+// Determines if node is an Identity where it's first regular input is a Switch
+// node.
+bool IsIdentityConsumingSwitch(const MutableGraphView& graph,
+                               const NodeDef& node) {
+  if ((IsIdentity(node) || IsIdentityNSingleInput(node)) &&
+      node.input_size() > 0) {
+    TensorId tensor_id = ParseTensorName(node.input(0));
+    if (IsTensorIdControlling(tensor_id)) {
+      return false;
+    }
+
+    NodeDef* input_node = graph.GetNode(tensor_id.node());
+    return IsSwitch(*input_node);
+  }
+  return false;
+}
+
+// Determines if node input can be deduped by regular inputs when used as a
+// control dependency. Specifically, if a node is an Identity that leads to a
+// Switch node, when used as a control dependency, that control dependency
+// should not be deduped even though the same node is used as a regular input.
+bool CanDedupControlWithRegularInput(const MutableGraphView& graph,
+                                     const NodeDef& control_node) {
+  return !IsIdentityConsumingSwitch(graph, control_node);
+}
+
+// Determines if node input can be deduped by regular inputs when used as a
+// control dependency. Specifically, if a node is an Identity that leads to a
+// Switch node, when used as a control dependency, that control dependency
+// should not be deduped even though the same node is used as a regular input.
+bool CanDedupControlWithRegularInput(const MutableGraphView& graph,
+                                     absl::string_view control_node_name) {
+  NodeDef* control_node = graph.GetNode(control_node_name);
+  DCHECK(control_node != nullptr)
+      << "Didn't find a node for control dependency: " << control_node_name;
+  return CanDedupControlWithRegularInput(graph, *control_node);
+}
+
+bool HasRegularFaninNode(const MutableGraphView& graph, const NodeDef& node,
+                         absl::string_view fanin_node_name) {
+  const int num_regular_fanins =
+      graph.NumFanins(node, /*include_controlling_nodes=*/false);
+  for (int i = 0; i < num_regular_fanins; ++i) {
+    if (ParseTensorName(node.input(i)).node() == fanin_node_name) {
+      return true;
+    }
+  }
+  return false;
+}
+
+using FanoutsMap =
+    absl::flat_hash_map<MutableGraphView::OutputPort,
+                        absl::flat_hash_set<MutableGraphView::InputPort>>;
+
+void SwapControlledFanoutInputs(const MutableGraphView& graph,
+                                const FanoutsMap::iterator& control_fanouts,
+                                absl::string_view to_node_name) {
+  absl::string_view from_node_name(control_fanouts->first.node->name());
+  string control = TensorIdToString({to_node_name, Graph::kControlSlot});
+  for (const auto& control_fanout : control_fanouts->second) {
+    const int start = graph.NumFanins(*control_fanout.node,
+                                      /*include_controlling_nodes=*/false);
+    for (int i = start; i < control_fanout.node->input_size(); ++i) {
+      TensorId tensor_id = ParseTensorName(control_fanout.node->input(i));
+      if (tensor_id.node() == from_node_name) {
+        control_fanout.node->set_input(i, control);
+        break;
+      }
+    }
+  }
+}
+
+void SwapRegularFanoutInputs(FanoutsMap* fanouts, NodeDef* from_node,
+                             absl::string_view to_node_name, int max_port) {
+  MutableGraphView::OutputPort port;
+  port.node = from_node;
+  for (int i = 0; i <= max_port; ++i) {
+    port.port_id = i;
+    auto it = fanouts->find(port);
+    if (it == fanouts->end()) {
+      continue;
+    }
+    string input = TensorIdToString({to_node_name, i});
+    for (const auto& fanout : it->second) {
+      fanout.node->set_input(fanout.port_id, input);
+    }
+  }
+}
+
+using MaxOutputPortsMap = absl::flat_hash_map<const NodeDef*, int>;
+
+void SwapFanoutInputs(const MutableGraphView& graph, FanoutsMap* fanouts,
+                      MaxOutputPortsMap* max_output_ports, NodeDef* from_node,
+                      NodeDef* to_node) {
+  auto from_control_fanouts = fanouts->find({from_node, Graph::kControlSlot});
+  if (from_control_fanouts != fanouts->end()) {
+    SwapControlledFanoutInputs(graph, from_control_fanouts, to_node->name());
+  }
+  auto to_control_fanouts = fanouts->find({to_node, Graph::kControlSlot});
+  if (to_control_fanouts != fanouts->end()) {
+    SwapControlledFanoutInputs(graph, to_control_fanouts, from_node->name());
+  }
+  auto from_max_port = max_output_ports->find(from_node);
+  if (from_max_port != max_output_ports->end()) {
+    SwapRegularFanoutInputs(fanouts, from_node, to_node->name(),
+                            from_max_port->second);
+  }
+  auto to_max_port = max_output_ports->find(to_node);
+  if (to_max_port != max_output_ports->end()) {
+    SwapRegularFanoutInputs(fanouts, to_node, from_node->name(),
+                            to_max_port->second);
+  }
+}
+
+void SwapFanoutsMapValues(FanoutsMap* fanouts,
+                          const MutableGraphView::OutputPort& from_port,
+                          const FanoutsMap::iterator& from_fanouts,
+                          const MutableGraphView::OutputPort& to_port,
+                          const FanoutsMap::iterator& to_fanouts) {
+  const bool from_exists = from_fanouts != fanouts->end();
+  const bool to_exists = to_fanouts != fanouts->end();
+
+  if (from_exists && to_exists) {
+    std::swap(from_fanouts->second, to_fanouts->second);
+  } else if (from_exists) {
+    fanouts->emplace(to_port, std::move(from_fanouts->second));
+    fanouts->erase(from_port);
+  } else if (to_exists) {
+    fanouts->emplace(from_port, std::move(to_fanouts->second));
+    fanouts->erase(to_port);
+  }
+}
+
+void SwapRegularFanoutsAndMaxPortValues(FanoutsMap* fanouts,
+                                        MaxOutputPortsMap* max_output_ports,
+                                        NodeDef* from_node, NodeDef* to_node) {
+  auto from_max_port = max_output_ports->find(from_node);
+  auto to_max_port = max_output_ports->find(to_node);
+  bool from_exists = from_max_port != max_output_ports->end();
+  bool to_exists = to_max_port != max_output_ports->end();
+
+  auto forward_fanouts = [fanouts](NodeDef* from, NodeDef* to, int start,
+                                   int end) {
+    for (int i = start; i <= end; ++i) {
+      MutableGraphView::OutputPort from_port(from, i);
+      auto from_fanouts = fanouts->find(from_port);
+      if (from_fanouts != fanouts->end()) {
+        MutableGraphView::OutputPort to_port(to, i);
+        fanouts->emplace(to_port, std::move(from_fanouts->second));
+        fanouts->erase(from_port);
+      }
+    }
+  };
+
+  if (from_exists && to_exists) {
+    const int from = from_max_port->second;
+    const int to = to_max_port->second;
+    const int shared = std::min(from, to);
+    for (int i = 0; i <= shared; ++i) {
+      MutableGraphView::OutputPort from_port(from_node, i);
+      auto from_fanouts = fanouts->find(from_port);
+      MutableGraphView::OutputPort to_port(to_node, i);
+      auto to_fanouts = fanouts->find(to_port);
+      SwapFanoutsMapValues(fanouts, from_port, from_fanouts, to_port,
+                           to_fanouts);
+    }
+    if (to > from) {
+      forward_fanouts(to_node, from_node, shared + 1, to);
+    } else if (from > to) {
+      forward_fanouts(from_node, to_node, shared + 1, from);
+    }
+
+    std::swap(from_max_port->second, to_max_port->second);
+  } else if (from_exists) {
+    forward_fanouts(from_node, to_node, 0, from_max_port->second);
+
+    max_output_ports->emplace(to_node, from_max_port->second);
+    max_output_ports->erase(from_node);
+  } else if (to_exists) {
+    forward_fanouts(to_node, from_node, 0, to_max_port->second);
+
+    max_output_ports->emplace(from_node, to_max_port->second);
+    max_output_ports->erase(to_node);
+  }
+}
+
+bool HasFanoutValue(const FanoutsMap& fanouts, const FanoutsMap::iterator& it) {
+  return it != fanouts.end() && !it->second.empty();
+}
+
+Status MutationError(absl::string_view function_name, absl::string_view params,
+                     absl::string_view msg) {
+  return errors::InvalidArgument(absl::Substitute(
+      "MutableGraphView::$0($1) error: $2.", function_name, params, msg));
+}
+
+using ErrorHandler = std::function<Status(absl::string_view)>;
+
+ErrorHandler UpdateFanoutsError(absl::string_view from_node_name,
+                                absl::string_view to_node_name) {
+  return [from_node_name, to_node_name](absl::string_view msg) {
+    string params = absl::Substitute("from_node_name='$0', to_node_name='$1'",
+                                     from_node_name, to_node_name);
+    return MutationError("UpdateFanouts", params, msg);
+  };
+}
+
+Status CheckFaninIsRegular(const TensorId& fanin, ErrorHandler handler) {
+  if (!IsTensorIdRegular(fanin)) {
+    return handler(absl::Substitute("fanin '$0' must be a regular tensor id",
+                                    fanin.ToString()));
+  }
+  return Status::OK();
+}
+
+Status CheckFaninIsValid(const TensorId& fanin, ErrorHandler handler) {
+  if (!IsTensorIdPortValid(fanin)) {
+    return handler(absl::Substitute("fanin '$0' must be a valid tensor id",
+                                    fanin.ToString()));
+  }
+  return Status::OK();
+}
+
+Status CheckAddingFaninToSelf(absl::string_view node_name,
+                              const TensorId& fanin, ErrorHandler handler) {
+  if (node_name == fanin.node()) {
+    return handler(
+        absl::Substitute("can't add fanin '$0' to self", fanin.ToString()));
+  }
+  return Status::OK();
+}
+
+Status CheckRemovingFaninFromSelf(absl::string_view node_name,
+                                  const TensorId& fanin, ErrorHandler handler) {
+  if (node_name == fanin.node()) {
+    return handler(absl::Substitute("can't remove fanin '$0' from self",
+                                    fanin.ToString()));
+  }
+  return Status::OK();
+}
+
+string NodeMissingErrorMsg(absl::string_view node_name) {
+  return absl::Substitute("node '$0' was not found", node_name);
+}
+
+Status CheckNodeExists(absl::string_view node_name, NodeDef* node,
+                       ErrorHandler handler) {
+  if (node == nullptr) {
+    return handler(NodeMissingErrorMsg(node_name));
+  }
+  return Status::OK();
+}
+
+Status CheckPortRange(int port, int min, int max, ErrorHandler handler) {
+  if (port < min || port > max) {
+    if (max < min) {
+      return handler("no available ports as node has no regular fanins");
+    }
+    return handler(
+        absl::Substitute("port must be in range [$0, $1]", min, max));
+  }
+  return Status::OK();
+}
+
+string SwapNodeNamesSwitchControlErrorMsg(absl::string_view node_name) {
+  return absl::Substitute(
+      "can't swap node name '$0' as it will become a Switch control dependency",
+      node_name);
+}
+
+string GeneratedNameForIdentityConsumingSwitch(
+    const MutableGraphView::OutputPort& fanin) {
+  return AddPrefixToNodeName(
+      absl::StrCat(fanin.node->name(), "_", fanin.port_id),
+      kMutableGraphViewCtrl);
+}
+
+}  // namespace
+
+void MutableGraphView::AddAndDedupFanouts(NodeDef* node) {
+  // TODO(lyandy): Checks for self loops, Switch control dependencies, fanins
+  // exist, and all regular fanins come before controlling fanins.
+  absl::flat_hash_set<absl::string_view> fanins;
+  absl::flat_hash_set<absl::string_view> controlling_fanins;
+  int max_input_port = -1;
+  int pos = 0;
+  const int last_idx = node->input_size() - 1;
+  int last_pos = last_idx;
+  while (pos <= last_pos) {
+    TensorId tensor_id = ParseTensorName(node->input(pos));
+    absl::string_view input_node_name = tensor_id.node();
+    bool is_control_input = IsTensorIdControlling(tensor_id);
+    bool can_dedup_control_with_regular_input =
+        CanDedupControlWithRegularInput(*this, input_node_name);
+    bool can_dedup_control =
+        is_control_input && (can_dedup_control_with_regular_input ||
+                             (!can_dedup_control_with_regular_input &&
+                              controlling_fanins.contains(input_node_name)));
+    if (!gtl::InsertIfNotPresent(&fanins, input_node_name) &&
+        can_dedup_control) {
+      node->mutable_input()->SwapElements(pos, last_pos);
+      --last_pos;
+    } else {
+      OutputPort output(nodes()[input_node_name], tensor_id.index());
+
+      if (is_control_input) {
+        fanouts()[output].emplace(node, Graph::kControlSlot);
+      } else {
+        max_input_port = pos;
+        max_regular_output_port()[output.node] =
+            std::max(max_regular_output_port()[output.node], output.port_id);
+        fanouts()[output].emplace(node, pos);
+      }
+      ++pos;
+    }
+    if (is_control_input) {
+      controlling_fanins.insert(input_node_name);
+    }
+  }
+
+  if (last_pos < last_idx) {
+    node->mutable_input()->DeleteSubrange(last_pos + 1, last_idx - last_pos);
+  }
+
+  if (max_input_port > -1) {
+    max_regular_input_port()[node] = max_input_port;
+  }
+}
+
+void MutableGraphView::UpdateMaxRegularOutputPortForRemovedFanin(
+    const OutputPort& fanin,
+    const absl::flat_hash_set<InputPort>& fanin_fanouts) {
+  int max_port = max_regular_output_port()[fanin.node];
+  if (!fanin_fanouts.empty() || max_port != fanin.port_id) {
+    return;
+  }
+  bool updated_max_port = false;
+  for (int i = fanin.port_id - 1; i >= 0; --i) {
+    OutputPort fanin_port(fanin.node, i);
+    if (!fanouts()[fanin_port].empty()) {
+      max_regular_output_port()[fanin.node] = i;
+      updated_max_port = true;
+      break;
+    }
+  }
+  if (!updated_max_port) {
+    max_regular_output_port().erase(fanin.node);
+  }
+}
+
+void MutableGraphView::UpdateMaxRegularOutputPortForAddedFanin(
+    const OutputPort& fanin) {
+  if (max_regular_output_port()[fanin.node] < fanin.port_id) {
+    max_regular_output_port()[fanin.node] = fanin.port_id;
+  }
+}
+
 const absl::flat_hash_set<MutableGraphView::InputPort>&
 MutableGraphView::GetFanout(const GraphView::OutputPort& port) const {
   return GetFanout(MutableGraphView::OutputPort(const_cast<NodeDef*>(port.node),
@@ -46,30 +437,309 @@ NodeDef* MutableGraphView::AddNode(NodeDef&& node) {
 
   AddUniqueNodeOrDie(node_in_graph);
 
-  AddFanouts(node_in_graph);
+  AddAndDedupFanouts(node_in_graph);
   return node_in_graph;
 }
 
-void MutableGraphView::UpdateFanouts(absl::string_view from_node,
-                                     absl::string_view to_node) {
-  NodeDef* from_node_ptr = GetNode(from_node);
-  NodeDef* to_node_ptr = GetNode(to_node);
-  if (from_node_ptr && to_node_ptr) {
-    UpdateFanouts(from_node_ptr, to_node_ptr);
-  } else if (!from_node_ptr) {
-    LOG(WARNING) << absl::Substitute(
-        "Can't update fanouts from '$0' to '$1', from node was not found.",
-        from_node, to_node);
-  } else {
-    LOG(WARNING) << absl::Substitute(
-        "Can't update fanouts from '$0' to '$1', to node was not found.",
-        from_node, to_node);
+Status MutableGraphView::AddSubgraph(GraphDef&& subgraph) {
+  // 1. Add all new functions and check that functions with the same name
+  // have identical definition.
+  const int function_size = subgraph.library().function_size();
+  if (function_size > 0) {
+    absl::flat_hash_map<absl::string_view, const FunctionDef*> graph_fdefs;
+    for (const FunctionDef& fdef : graph()->library().function()) {
+      graph_fdefs.emplace(fdef.signature().name(), &fdef);
+    }
+
+    for (FunctionDef& fdef : *subgraph.mutable_library()->mutable_function()) {
+      const auto graph_fdef = graph_fdefs.find(fdef.signature().name());
+
+      if (graph_fdef == graph_fdefs.end()) {
+        VLOG(3) << "Add new function definition: " << fdef.signature().name();
+        graph()->mutable_library()->add_function()->Swap(&fdef);
+      } else {
+        if (!FunctionDefsEqual(fdef, *graph_fdef->second)) {
+          return MutationError(
+              "AddSubgraph",
+              absl::Substitute("function_size=$0", function_size),
+              absl::StrCat(
+                  "Found different function definition with the same name: ",
+                  fdef.signature().name()));
+        }
+      }
+    }
   }
+
+  // 2. Add all nodes to the underlying graph.
+  int node_size_before = graph()->node_size();
+
+  for (NodeDef& node : *subgraph.mutable_node()) {
+    auto* node_in_graph = graph()->add_node();
+    node_in_graph->Swap(&node);
+    TF_RETURN_IF_ERROR(AddUniqueNode(node_in_graph));
+  }
+
+  // TODO(ezhulenev, lyandy): Right now AddAndDedupFanouts do not check that
+  // fanins actually exists in the graph, and there is already TODO for that.
+
+  for (int i = node_size_before; i < graph()->node_size(); ++i) {
+    NodeDef* node = graph()->mutable_node(i);
+    AddAndDedupFanouts(node);
+  }
+
+  return Status::OK();
 }
 
-void MutableGraphView::UpdateFanouts(NodeDef* from_node, NodeDef* to_node) {
-  VLOG(0) << absl::Substitute("Update fanouts from '$0' to '$1'.",
+Status MutableGraphView::UpdateNode(
+    absl::string_view node_name, absl::string_view op, absl::string_view device,
+    absl::Span<const std::pair<string, AttrValue>> attrs) {
+  auto error_status = [node_name, op, device, attrs](absl::string_view msg) {
+    std::vector<string> attr_strs;
+    attr_strs.reserve(attrs.size());
+    for (const auto& attr : attrs) {
+      string attr_str = absl::Substitute("('$0', $1)", attr.first,
+                                         attr.second.ShortDebugString());
+      attr_strs.push_back(attr_str);
+    }
+    string params =
+        absl::Substitute("node_name='$0', op='$1', device='$2', attrs={$3}",
+                         node_name, op, device, absl::StrJoin(attr_strs, ", "));
+    return MutationError("UpdateNodeOp", params, msg);
+  };
+
+  NodeDef* node = GetNode(node_name);
+  TF_RETURN_IF_ERROR(CheckNodeExists(node_name, node, error_status));
+
+  MutableGraphView::OutputPort control_port(node, Graph::kControlSlot);
+  auto control_fanouts = GetFanout(control_port);
+  if (op == "Switch" && !control_fanouts.empty()) {
+    return error_status(
+        "can't change node op to Switch when node drives a control dependency "
+        "(alternatively, we could add the identity node needed, but it seems "
+        "like an unlikely event and probably a mistake)");
+  }
+
+  if (node->device() != device) {
+    node->set_device(string(device));
+  }
+  node->mutable_attr()->clear();
+  for (const auto& attr : attrs) {
+    (*node->mutable_attr())[attr.first] = attr.second;
+  }
+
+  if (node->op() == op) {
+    return Status::OK();
+  }
+
+  node->set_op(string(op));
+
+  if (CanDedupControlWithRegularInput(*this, *node)) {
+    for (const auto& control_fanout : control_fanouts) {
+      if (HasRegularFaninNode(*this, *control_fanout.node, node->name())) {
+        RemoveControllingFaninInternal(control_fanout.node, node);
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
+Status MutableGraphView::UpdateNodeName(absl::string_view from_node_name,
+                                        absl::string_view to_node_name,
+                                        bool update_fanouts) {
+  auto error_status = [from_node_name, to_node_name,
+                       update_fanouts](absl::string_view msg) {
+    string params = absl::Substitute(
+        "from_node_name='$0', to_node_name='$1', update_fanouts=$2",
+        from_node_name, to_node_name, update_fanouts);
+    return MutationError("UpdateNodeName", params, msg);
+  };
+
+  NodeDef* node = GetNode(from_node_name);
+  TF_RETURN_IF_ERROR(CheckNodeExists(from_node_name, node, error_status));
+
+  if (node->name() == to_node_name) {
+    return Status::OK();
+  }
+  if (HasNode(to_node_name)) {
+    return error_status(
+        "can't update node name because new node name is in use");
+  }
+  auto max_output_port = max_regular_output_port().find(node);
+  const bool has_max_output_port =
+      max_output_port != max_regular_output_port().end();
+  auto control_fanouts = fanouts().find({node, Graph::kControlSlot});
+
+  if (update_fanouts) {
+    SwapControlledFanoutInputs(*this, control_fanouts, to_node_name);
+    if (has_max_output_port) {
+      SwapRegularFanoutInputs(&fanouts(), node, to_node_name,
+                              max_output_port->second);
+    }
+  } else if (has_max_output_port ||
+             HasFanoutValue(fanouts(), control_fanouts)) {
+    return error_status("can't update node name because node has fanouts");
+  }
+
+  nodes().erase(node->name());
+  node->set_name(string(to_node_name));
+  nodes().emplace(node->name(), node);
+  return Status::OK();
+}
+
+Status MutableGraphView::SwapNodeNames(absl::string_view from_node_name,
+                                       absl::string_view to_node_name,
+                                       bool update_fanouts) {
+  auto error_status = [from_node_name, to_node_name,
+                       update_fanouts](absl::string_view msg) {
+    string params = absl::Substitute(
+        "from_node_name='$0', to_node_name='$1', update_fanouts=$2",
+        from_node_name, to_node_name, update_fanouts);
+    return MutationError("SwapNodeNames", params, msg);
+  };
+
+  NodeDef* from_node = GetNode(from_node_name);
+  TF_RETURN_IF_ERROR(CheckNodeExists(from_node_name, from_node, error_status));
+  if (from_node_name == to_node_name) {
+    return Status::OK();
+  }
+  NodeDef* to_node = GetNode(to_node_name);
+  TF_RETURN_IF_ERROR(CheckNodeExists(to_node_name, to_node, error_status));
+
+  auto swap_names = [this, from_node, to_node]() {
+    nodes().erase(from_node->name());
+    nodes().erase(to_node->name());
+    std::swap(*from_node->mutable_name(), *to_node->mutable_name());
+    nodes().emplace(from_node->name(), from_node);
+    nodes().emplace(to_node->name(), to_node);
+  };
+
+  if (update_fanouts) {
+    SwapFanoutInputs(*this, &fanouts(), &max_regular_output_port(), from_node,
+                     to_node);
+    swap_names();
+    return Status::OK();
+  }
+
+  bool from_is_switch = IsSwitch(*from_node);
+  MutableGraphView::OutputPort to_control(to_node, Graph::kControlSlot);
+  auto to_control_fanouts = fanouts().find(to_control);
+  if (from_is_switch && HasFanoutValue(fanouts(), to_control_fanouts)) {
+    return error_status(SwapNodeNamesSwitchControlErrorMsg(from_node_name));
+  }
+
+  bool to_is_switch = IsSwitch(*to_node);
+  MutableGraphView::OutputPort from_control(from_node, Graph::kControlSlot);
+  auto from_control_fanouts = fanouts().find(from_control);
+  if (to_is_switch && HasFanoutValue(fanouts(), from_control_fanouts)) {
+    return error_status(SwapNodeNamesSwitchControlErrorMsg(to_node_name));
+  }
+
+  // Swap node names.
+  swap_names();
+
+  // Swap controlling fanouts.
+  SwapFanoutsMapValues(&fanouts(), from_control, from_control_fanouts,
+                       to_control, to_control_fanouts);
+
+  // Swap regular fanouts.
+  SwapRegularFanoutsAndMaxPortValues(&fanouts(), &max_regular_output_port(),
+                                     from_node, to_node);
+
+  // Update fanins to remove self loops.
+  auto update_fanins = [this](NodeDef* node, absl::string_view old_node_name) {
+    for (int i = 0; i < node->input_size(); ++i) {
+      TensorId tensor_id = ParseTensorName(node->input(i));
+      if (tensor_id.node() == node->name()) {
+        const int idx = tensor_id.index();
+        const int node_idx =
+            IsTensorIdControlling(tensor_id) ? Graph::kControlSlot : i;
+
+        MutableGraphView::OutputPort from_fanin(node, idx);
+        absl::flat_hash_set<InputPort>* from_fanouts = &fanouts()[from_fanin];
+        from_fanouts->erase({node, node_idx});
+        UpdateMaxRegularOutputPortForRemovedFanin(from_fanin, *from_fanouts);
+
+        MutableGraphView::OutputPort to_fanin(nodes().at(old_node_name), idx);
+        fanouts()[to_fanin].insert({node, node_idx});
+        UpdateMaxRegularOutputPortForAddedFanin(to_fanin);
+        node->set_input(i, TensorIdToString({old_node_name, idx}));
+      }
+    }
+  };
+  update_fanins(from_node, to_node->name());
+  update_fanins(to_node, from_node->name());
+
+  // Dedup control dependencies.
+  auto dedup_control_fanouts =
+      [this](NodeDef* node, const FanoutsMap::iterator& control_fanouts) {
+        if (CanDedupControlWithRegularInput(*this, *node) &&
+            control_fanouts != fanouts().end()) {
+          for (const auto& control_fanout : control_fanouts->second) {
+            if (HasRegularFaninNode(*this, *control_fanout.node,
+                                    node->name())) {
+              RemoveControllingFaninInternal(control_fanout.node, node);
+            }
+          }
+        }
+      };
+  auto dedup_switch_control = [this, dedup_control_fanouts](NodeDef* node) {
+    OutputPort port;
+    port.node = node;
+    const int max_port =
+        gtl::FindWithDefault(max_regular_output_port(), node, -1);
+    for (int i = 0; i <= max_port; ++i) {
+      port.port_id = i;
+      auto it = fanouts().find(port);
+      if (it == fanouts().end()) {
+        continue;
+      }
+      for (const auto& fanout : it->second) {
+        auto fanout_controls =
+            fanouts().find({fanout.node, Graph::kControlSlot});
+        dedup_control_fanouts(fanout.node, fanout_controls);
+      }
+    }
+  };
+
+  if (!from_is_switch) {
+    if (to_is_switch) {
+      dedup_switch_control(from_node);
+    } else {
+      dedup_control_fanouts(from_node, from_control_fanouts);
+    }
+  }
+  if (!to_is_switch) {
+    if (from_is_switch) {
+      dedup_switch_control(to_node);
+    } else {
+      dedup_control_fanouts(to_node, to_control_fanouts);
+    }
+  }
+
+  return Status::OK();
+}
+
+Status MutableGraphView::UpdateFanouts(absl::string_view from_node_name,
+                                       absl::string_view to_node_name) {
+  NodeDef* from_node = GetNode(from_node_name);
+  TF_RETURN_IF_ERROR(
+      CheckNodeExists(from_node_name, from_node,
+                      UpdateFanoutsError(from_node_name, to_node_name)));
+  NodeDef* to_node = GetNode(to_node_name);
+  TF_RETURN_IF_ERROR(CheckNodeExists(
+      to_node_name, to_node, UpdateFanoutsError(from_node_name, to_node_name)));
+
+  return UpdateFanoutsInternal(from_node, to_node);
+}
+
+Status MutableGraphView::UpdateFanoutsInternal(NodeDef* from_node,
+                                               NodeDef* to_node) {
+  VLOG(2) << absl::Substitute("Update fanouts from '$0' to '$1'.",
                               from_node->name(), to_node->name());
+  if (from_node == to_node) {
+    return Status::OK();
+  }
 
   // Update internal state with the new output_port->input_port edge.
   const auto add_edge = [this](const OutputPort& output_port,
@@ -83,6 +753,32 @@ void MutableGraphView::UpdateFanouts(NodeDef* from_node, NodeDef* to_node) {
     fanouts()[output_port].erase(input_port);
   };
 
+  // For the control fanouts we do not know the input index in a NodeDef,
+  // so we have to traverse all control inputs.
+
+  auto control_fanouts =
+      GetFanout(GraphView::OutputPort(from_node, Graph::kControlSlot));
+
+  bool to_node_is_switch = IsSwitch(*to_node);
+  for (const InputPort& control_port : control_fanouts) {
+    // Node can't be control dependency of itself.
+    if (control_port.node == to_node) continue;
+
+    // Can't add Switch node as a control dependency.
+    if (to_node_is_switch) {
+      // Trying to add a Switch as a control dependency, which if allowed will
+      // make the graph invalid.
+      return UpdateFanoutsError(from_node->name(), to_node->name())(
+          absl::Substitute("can't update fanouts to node '$0' as it will "
+                           "become a Switch control dependency",
+                           to_node->name()));
+    }
+
+    NodeDef* node = control_port.node;
+    RemoveControllingFaninInternal(node, from_node);
+    AddFaninInternal(node, {to_node, Graph::kControlSlot});
+  }
+
   // First we update regular fanouts. For the regular fanouts
   // `input_port:port_id` is the input index in NodeDef.
 
@@ -101,87 +797,816 @@ void MutableGraphView::UpdateFanouts(NodeDef* from_node, NodeDef* to_node) {
     // AddAndUpdateFanoutsWithoutSelfLoops test for an example).
     if (input_port.node == to_node) {
       keep_max_regular_output_port =
-          std::max(keep_max_regular_output_port, input_port.port_id);
+          std::max(keep_max_regular_output_port, output_port.port_id);
       continue;
     }
 
     // Update input at destination node.
     input_port.node->set_input(
         input_port.port_id,
-        output_port.port_id == 0
-            ? to_node->name()
-            : absl::StrCat(to_node->name(), ":", output_port.port_id));
+        TensorIdToString({to_node->name(), output_port.port_id}));
 
     // Remove old edge between the `from_node` and the fanout node.
     remove_edge(output_port, input_port);
     // Add an edge between the `to_node` and new fanout node.
     add_edge(OutputPort(to_node, output_port.port_id), input_port);
+    // Dedup control dependency.
+    if (CanDedupControlWithRegularInput(*this, *to_node)) {
+      RemoveControllingFaninInternal(input_port.node, to_node);
+    }
   }
 
-  // For the control fanouts we do not know the input index in a NodeDef,
-  // so we have to traverse all control inputs.
+  // Because we update all regular fanouts of `from_node`, we can just copy
+  // the value `num_regular_outputs`.
+  max_regular_output_port()[to_node] = max_regular_output_port()[from_node];
 
-  auto control_fanouts =
-      GetFanout(GraphView::OutputPort(from_node, Graph::kControlSlot));
-  if (control_fanouts.empty()) return;
+  // Check if all fanouts were updated to read from the `to_node`.
+  if (keep_max_regular_output_port >= 0) {
+    max_regular_output_port()[from_node] = keep_max_regular_output_port;
+  } else {
+    max_regular_output_port().erase(from_node);
+  }
 
-  const string from_control_input = absl::StrCat("^", from_node->name());
-  const string to_control_input = absl::StrCat("^", to_node->name());
+  return Status::OK();
+}
 
-  for (const InputPort& control_port : control_fanouts) {
-    // Node can't be control dependency of itself.
-    if (control_port.node == to_node) continue;
+bool MutableGraphView::AddFaninInternal(NodeDef* node,
+                                        const OutputPort& fanin) {
+  int num_regular_fanins =
+      NumFanins(*node, /*include_controlling_nodes=*/false);
+  bool input_is_control = IsOutputPortControlling(fanin);
+  bool can_dedup_control_with_regular_input =
+      CanDedupControlWithRegularInput(*this, *fanin.node);
+  // Don't add duplicate control dependencies.
+  if (input_is_control) {
+    const int start =
+        can_dedup_control_with_regular_input ? 0 : num_regular_fanins;
+    for (int i = start; i < node->input_size(); ++i) {
+      if (ParseTensorName(node->input(i)).node() == fanin.node->name()) {
+        return false;
+      }
+    }
+  }
 
-    // Find and update input corresponding to control dependency.
-    NodeDef* node = control_port.node;
-    for (int i = node->input_size() - 1; i >= 0; --i) {
-      const string& input = node->input(i);
-      if (!IsControlInput(input)) break;  // we reached regular inputs
-      if (input == from_control_input) {
-        node->set_input(i, to_control_input);
+  InputPort input;
+  input.node = node;
+  input.port_id = input_is_control ? Graph::kControlSlot : num_regular_fanins;
+
+  node->add_input(TensorIdToString({fanin.node->name(), fanin.port_id}));
+  if (!input_is_control) {
+    const int last_node_input = node->input_size() - 1;
+    // If there are control dependencies in node, move newly inserted fanin to
+    // be before such control dependencies.
+    if (num_regular_fanins < last_node_input) {
+      node->mutable_input()->SwapElements(last_node_input, num_regular_fanins);
+    }
+  }
+
+  fanouts()[fanin].insert(input);
+  if (max_regular_output_port()[fanin.node] < fanin.port_id) {
+    max_regular_output_port()[fanin.node] = fanin.port_id;
+  }
+
+  // Update max input port and dedup control dependencies.
+  if (!input_is_control) {
+    max_regular_input_port()[node] = num_regular_fanins;
+    if (can_dedup_control_with_regular_input) {
+      RemoveControllingFaninInternal(node, fanin.node);
+    }
+  }
+
+  return true;
+}
+
+Status MutableGraphView::AddRegularFanin(absl::string_view node_name,
+                                         const TensorId& fanin) {
+  auto error_status = [node_name, fanin](absl::string_view msg) {
+    string params = absl::Substitute("node_name='$0', fanin='$1'", node_name,
+                                     fanin.ToString());
+    return MutationError("AddRegularFanin", params, msg);
+  };
+
+  TF_RETURN_IF_ERROR(CheckFaninIsRegular(fanin, error_status));
+  TF_RETURN_IF_ERROR(CheckAddingFaninToSelf(node_name, fanin, error_status));
+  NodeDef* node = GetNode(node_name);
+  TF_RETURN_IF_ERROR(CheckNodeExists(node_name, node, error_status));
+  NodeDef* fanin_node = GetNode(fanin.node());
+  TF_RETURN_IF_ERROR(CheckNodeExists(fanin.node(), fanin_node, error_status));
+
+  AddFaninInternal(node, {fanin_node, fanin.index()});
+  return Status::OK();
+}
+
+Status MutableGraphView::AddRegularFaninByPort(absl::string_view node_name,
+                                               int port,
+                                               const TensorId& fanin) {
+  auto error_status = [node_name, port, fanin](absl::string_view msg) {
+    string params = absl::Substitute("node_name='$0', port=$1, fanin='$2'",
+                                     node_name, port, fanin.ToString());
+    return MutationError("AddRegularFaninByPort", params, msg);
+  };
+
+  TF_RETURN_IF_ERROR(CheckFaninIsRegular(fanin, error_status));
+  TF_RETURN_IF_ERROR(CheckAddingFaninToSelf(node_name, fanin, error_status));
+  NodeDef* node = GetNode(node_name);
+  TF_RETURN_IF_ERROR(CheckNodeExists(node_name, node, error_status));
+  const int num_regular_fanins =
+      NumFanins(*node, /*include_controlling_nodes=*/false);
+  TF_RETURN_IF_ERROR(
+      CheckPortRange(port, /*min=*/0, num_regular_fanins, error_status));
+  NodeDef* fanin_node = GetNode(fanin.node());
+  TF_RETURN_IF_ERROR(CheckNodeExists(fanin.node(), fanin_node, error_status));
+
+  const int last_node_input = node->input_size();
+  node->add_input(TensorIdToString(fanin));
+  node->mutable_input()->SwapElements(num_regular_fanins, last_node_input);
+  for (int i = num_regular_fanins - 1; i >= port; --i) {
+    TensorId tensor_id = ParseTensorName(node->input(i));
+    OutputPort fanin_port(nodes()[tensor_id.node()], tensor_id.index());
+    absl::flat_hash_set<InputPort>* fanouts_set = &fanouts()[fanin_port];
+    fanouts_set->erase({node, i});
+    fanouts_set->insert({node, i + 1});
+    node->mutable_input()->SwapElements(i, i + 1);
+  }
+
+  OutputPort fanin_port(fanin_node, fanin.index());
+  fanouts()[fanin_port].insert({node, port});
+  UpdateMaxRegularOutputPortForAddedFanin(fanin_port);
+
+  max_regular_input_port()[node] = num_regular_fanins;
+  if (CanDedupControlWithRegularInput(*this, *fanin_node)) {
+    RemoveControllingFaninInternal(node, fanin_node);
+  }
+
+  return Status::OK();
+}
+
+NodeDef* MutableGraphView::GetControllingFaninToAdd(absl::string_view node_name,
+                                                    const OutputPort& fanin,
+                                                    string* error_msg) {
+  if (!IsSwitch(*fanin.node)) {
+    return fanin.node;
+  } else {
+    if (IsOutputPortControlling(fanin)) {
+      // Can't add a Switch node control dependency.
+      TensorId tensor_id(fanin.node->name(), fanin.port_id);
+      *error_msg = absl::Substitute(
+          "can't add fanin '$0' as it will become a Switch control dependency",
+          tensor_id.ToString());
+      return nullptr;
+    }
+    // We can't anchor control dependencies directly on the switch node: unlike
+    // other nodes only one of the outputs of the switch node will be generated
+    // when the switch node is executed, and we need to make sure the control
+    // dependency is only triggered when the corresponding output is triggered.
+    // We start by looking for an identity node connected to the output of the
+    // switch node, and use it to anchor the control dependency.
+    for (const auto& fanout : GetFanout(fanin)) {
+      if (IsIdentity(*fanout.node) || IsIdentityNSingleInput(*fanout.node)) {
+        if (fanout.node->name() == node_name) {
+          *error_msg =
+              absl::Substitute("can't add found fanin '$0' to self",
+                               AsControlDependency(fanout.node->name()));
+          return nullptr;
+        }
+        return fanout.node;
       }
     }
 
-    // Remove old edge between the `from_node` and the fanout node.
-    remove_edge(OutputPort(from_node, Graph::kControlSlot), control_port);
-    // Add an edge between the `to_node` and new fanout node.
-    add_edge(OutputPort(to_node, Graph::kControlSlot), control_port);
+    // No node found, check if node to be created is itself.
+    if (GeneratedNameForIdentityConsumingSwitch(fanin) == node_name) {
+      *error_msg = absl::Substitute("can't add generated fanin '$0' to self",
+                                    AsControlDependency(string(node_name)));
+    }
   }
+  return nullptr;
+}
 
-  // Because we update all regular fanouts of `from_node`, we can just copy
-  // the value `num_regular_outputs`.
-  max_regular_output_port()[to_node] = max_regular_output_port()[from_node];
+NodeDef* MutableGraphView::GetOrCreateIdentityConsumingSwitch(
+    const OutputPort& fanin) {
+  // We haven't found an existing node where we can anchor the control
+  // dependency: add a new identity node.
+  string identity_name = GeneratedNameForIdentityConsumingSwitch(fanin);
+  NodeDef* identity_node = GetNode(identity_name);
+  if (identity_node == nullptr) {
+    NodeDef new_node;
+    new_node.set_name(identity_name);
+    new_node.set_op("Identity");
+    new_node.set_device(fanin.node->device());
+    (*new_node.mutable_attr())["T"].set_type(fanin.node->attr().at("T").type());
+    new_node.add_input(TensorIdToString({fanin.node->name(), fanin.port_id}));
+    identity_node = AddNode(std::move(new_node));
+  }
+  return identity_node;
+}
 
-  // Check if all fanouts were updated to read from the `to_node`.
-  if (keep_max_regular_output_port >= 0) {
-    max_regular_output_port()[from_node] = keep_max_regular_output_port;
+Status MutableGraphView::AddControllingFanin(absl::string_view node_name,
+                                             const TensorId& fanin) {
+  auto error_status = [node_name, fanin](absl::string_view msg) {
+    string params = absl::Substitute("node_name='$0', fanin='$1'", node_name,
+                                     fanin.ToString());
+    return MutationError("AddControllingFanin", params, msg);
+  };
+
+  TF_RETURN_IF_ERROR(CheckFaninIsValid(fanin, error_status));
+  TF_RETURN_IF_ERROR(CheckAddingFaninToSelf(node_name, fanin, error_status));
+  NodeDef* node = GetNode(node_name);
+  TF_RETURN_IF_ERROR(CheckNodeExists(node_name, node, error_status));
+  NodeDef* fanin_node = GetNode(fanin.node());
+  TF_RETURN_IF_ERROR(CheckNodeExists(fanin.node(), fanin_node, error_status));
+
+  OutputPort fanin_port(fanin_node, fanin.index());
+
+  string error_msg = "";
+  NodeDef* control_node = GetControllingFaninToAdd(
+      node_name, {fanin_node, fanin.index()}, &error_msg);
+  if (!error_msg.empty()) {
+    return error_status(error_msg);
+  }
+  if (control_node == nullptr) {
+    control_node = GetOrCreateIdentityConsumingSwitch(fanin_port);
+  }
+  AddFaninInternal(node, {control_node, Graph::kControlSlot});
+
+  return Status::OK();
+}
+
+bool MutableGraphView::RemoveRegularFaninInternal(NodeDef* node,
+                                                  const OutputPort& fanin) {
+  auto remove_input = [this, node](const OutputPort& fanin_port,
+                                   int node_input_port, bool update_max_port) {
+    InputPort input(node, node_input_port);
+
+    absl::flat_hash_set<InputPort>* fanouts_set = &fanouts()[fanin_port];
+    fanouts_set->erase(input);
+    if (update_max_port) {
+      UpdateMaxRegularOutputPortForRemovedFanin(fanin_port, *fanouts_set);
+    }
+    return fanouts_set;
+  };
+
+  auto mutable_inputs = node->mutable_input();
+  bool modified = false;
+  const int num_regular_fanins =
+      NumFanins(*node, /*include_controlling_nodes=*/false);
+  int i;
+  int curr_pos = 0;
+  for (i = 0; i < num_regular_fanins; ++i) {
+    TensorId tensor_id = ParseTensorName(node->input(i));
+    if (tensor_id.node() == fanin.node->name() &&
+        tensor_id.index() == fanin.port_id) {
+      remove_input(fanin, i, /*update_max_port=*/true);
+      modified = true;
+    } else if (modified) {
+      // Regular inputs will need to have their ports updated.
+      OutputPort fanin_port(nodes()[tensor_id.node()], tensor_id.index());
+      auto fanouts_set = remove_input(fanin_port, i, /*update_max_port=*/false);
+      fanouts_set->insert({node, curr_pos});
+      // Shift inputs to be retained.
+      mutable_inputs->SwapElements(i, curr_pos);
+      ++curr_pos;
+    } else {
+      // Skip inputs to be retained until first modification.
+      ++curr_pos;
+    }
+  }
+
+  if (modified) {
+    const int last_regular_input_port = curr_pos - 1;
+    if (last_regular_input_port < 0) {
+      max_regular_input_port().erase(node);
+    } else {
+      max_regular_input_port()[node] = last_regular_input_port;
+    }
+    if (curr_pos < i) {
+      // Remove fanins from node inputs.
+      mutable_inputs->DeleteSubrange(curr_pos, i - curr_pos);
+    }
+  }
+
+  return modified;
+}
+
+Status MutableGraphView::RemoveRegularFanin(absl::string_view node_name,
+                                            const TensorId& fanin) {
+  auto error_status = [node_name, fanin](absl::string_view msg) {
+    string params = absl::Substitute("node_name='$0', fanin='$1'", node_name,
+                                     fanin.ToString());
+    return MutationError("RemoveRegularFanin", params, msg);
+  };
+
+  TF_RETURN_IF_ERROR(CheckFaninIsRegular(fanin, error_status));
+  TF_RETURN_IF_ERROR(
+      CheckRemovingFaninFromSelf(node_name, fanin, error_status));
+  NodeDef* node = GetNode(node_name);
+  TF_RETURN_IF_ERROR(CheckNodeExists(node_name, node, error_status));
+  NodeDef* fanin_node = GetNode(fanin.node());
+  TF_RETURN_IF_ERROR(CheckNodeExists(fanin.node(), fanin_node, error_status));
+
+  RemoveRegularFaninInternal(node, {fanin_node, fanin.index()});
+  return Status::OK();
+}
+
+Status MutableGraphView::RemoveRegularFaninByPort(absl::string_view node_name,
+                                                  int port) {
+  auto error_status = [node_name, port](absl::string_view msg) {
+    string params =
+        absl::Substitute("node_name='$0', port=$1", node_name, port);
+    return MutationError("RemoveRegularFaninByPort", params, msg);
+  };
+
+  NodeDef* node = GetNode(node_name);
+  TF_RETURN_IF_ERROR(CheckNodeExists(node_name, node, error_status));
+  const int last_regular_fanin_port =
+      gtl::FindWithDefault(max_regular_input_port(), node, -1);
+  TF_RETURN_IF_ERROR(
+      CheckPortRange(port, /*min=*/0, last_regular_fanin_port, error_status));
+
+  TensorId tensor_id = ParseTensorName(node->input(port));
+  OutputPort fanin_port(nodes()[tensor_id.node()], tensor_id.index());
+  fanouts()[fanin_port].erase({node, port});
+  auto mutable_inputs = node->mutable_input();
+  for (int i = port + 1; i <= last_regular_fanin_port; ++i) {
+    TensorId tensor_id = ParseTensorName(node->input(i));
+    OutputPort fanin_port(nodes()[tensor_id.node()], tensor_id.index());
+    absl::flat_hash_set<InputPort>* fanouts_set = &fanouts()[fanin_port];
+    fanouts_set->erase({node, i});
+    fanouts_set->insert({node, i - 1});
+    mutable_inputs->SwapElements(i - 1, i);
+  }
+  const int last_node_input = node->input_size() - 1;
+  if (last_regular_fanin_port < last_node_input) {
+    mutable_inputs->SwapElements(last_regular_fanin_port, last_node_input);
+  }
+  mutable_inputs->RemoveLast();
+
+  const int updated_last_regular_input_port = last_regular_fanin_port - 1;
+  if (updated_last_regular_input_port < 0) {
+    max_regular_input_port().erase(node);
   } else {
-    max_regular_output_port().erase(from_node);
+    max_regular_input_port()[node] = updated_last_regular_input_port;
+  }
+
+  return Status::OK();
+}
+
+bool MutableGraphView::RemoveControllingFaninInternal(NodeDef* node,
+                                                      NodeDef* fanin_node) {
+  for (int i = node->input_size() - 1; i >= 0; --i) {
+    TensorId tensor_id = ParseTensorName(node->input(i));
+    if (tensor_id.index() > Graph::kControlSlot) {
+      break;
+    }
+    if (tensor_id.node() == fanin_node->name()) {
+      fanouts()[{fanin_node, Graph::kControlSlot}].erase(
+          {node, Graph::kControlSlot});
+      node->mutable_input()->SwapElements(i, node->input_size() - 1);
+      node->mutable_input()->RemoveLast();
+      return true;
+    }
+  }
+  return false;
+}
+
+Status MutableGraphView::RemoveControllingFanin(
+    absl::string_view node_name, absl::string_view fanin_node_name) {
+  auto error_status = [node_name, fanin_node_name](absl::string_view msg) {
+    string params = absl::Substitute("node_name='$0', fanin_node_name='$1'",
+                                     node_name, fanin_node_name);
+    return MutationError("RemoveControllingFanin", params, msg);
+  };
+
+  TF_RETURN_IF_ERROR(CheckRemovingFaninFromSelf(
+      node_name, {fanin_node_name, Graph::kControlSlot}, error_status));
+  NodeDef* node = GetNode(node_name);
+  TF_RETURN_IF_ERROR(CheckNodeExists(node_name, node, error_status));
+  NodeDef* fanin_node = GetNode(fanin_node_name);
+  TF_RETURN_IF_ERROR(
+      CheckNodeExists(fanin_node_name, fanin_node, error_status));
+
+  RemoveControllingFaninInternal(node, fanin_node);
+  return Status::OK();
+}
+
+Status MutableGraphView::RemoveAllFanins(absl::string_view node_name,
+                                         bool keep_controlling_fanins) {
+  NodeDef* node = GetNode(node_name);
+  if (node == nullptr) {
+    string params =
+        absl::Substitute("node_name='$0', keep_controlling_fanins=$1",
+                         node_name, keep_controlling_fanins);
+    return MutationError("RemoveAllFanins", params,
+                         NodeMissingErrorMsg(node_name));
+  }
+
+  if (node->input().empty()) {
+    return Status::OK();
+  }
+
+  const int num_regular_fanins =
+      NumFanins(*node, /*include_controlling_nodes=*/false);
+  RemoveFaninsInternal(node, keep_controlling_fanins);
+  if (keep_controlling_fanins) {
+    if (num_regular_fanins == 0) {
+      return Status::OK();
+    } else if (num_regular_fanins < node->input_size()) {
+      node->mutable_input()->DeleteSubrange(0, num_regular_fanins);
+    } else {
+      node->clear_input();
+    }
+  } else {
+    node->clear_input();
   }
+  return Status::OK();
 }
 
-void MutableGraphView::DeleteNodes(const std::set<string>& nodes_to_delete) {
-  for (const string& node_name_to_delete : nodes_to_delete)
-    RemoveFanouts(nodes().at(node_name_to_delete));
-  for (const string& node_name_to_delete : nodes_to_delete)
+Status MutableGraphView::UpdateFanin(absl::string_view node_name,
+                                     const TensorId& from_fanin,
+                                     const TensorId& to_fanin) {
+  auto error_status = [node_name, from_fanin, to_fanin](absl::string_view msg) {
+    string params =
+        absl::Substitute("node_name='$0', from_fanin='$1', to_fanin='$2'",
+                         node_name, from_fanin.ToString(), to_fanin.ToString());
+    return MutationError("UpdateFanin", params, msg);
+  };
+
+  TF_RETURN_IF_ERROR(CheckFaninIsValid(from_fanin, error_status));
+  TF_RETURN_IF_ERROR(CheckFaninIsValid(to_fanin, error_status));
+  NodeDef* node = GetNode(node_name);
+  TF_RETURN_IF_ERROR(CheckNodeExists(node_name, node, error_status));
+  NodeDef* from_fanin_node = GetNode(from_fanin.node());
+  TF_RETURN_IF_ERROR(
+      CheckNodeExists(from_fanin.node(), from_fanin_node, error_status));
+  NodeDef* to_fanin_node = GetNode(to_fanin.node());
+  TF_RETURN_IF_ERROR(
+      CheckNodeExists(to_fanin.node(), to_fanin_node, error_status));
+
+  // When replacing a non control dependency fanin with a control dependency, or
+  // vice versa, remove and add, so ports can be updated properly in fanout(s).
+  bool to_fanin_is_control = IsTensorIdControlling(to_fanin);
+  if (to_fanin_is_control && IsSwitch(*to_fanin_node)) {
+    // Can't add Switch node as a control dependency.
+    return error_status(
+        absl::Substitute("can't update to fanin '$0' as it will become a "
+                         "Switch control dependency",
+                         to_fanin.ToString()));
+  }
+  if (node_name == from_fanin.node() || node_name == to_fanin.node()) {
+    return error_status("can't update fanin to or from self");
+  }
+
+  if (from_fanin == to_fanin) {
+    return Status::OK();
+  }
+
+  bool from_fanin_is_control = IsTensorIdControlling(from_fanin);
+  if (from_fanin_is_control || to_fanin_is_control) {
+    bool modified = false;
+    if (from_fanin_is_control) {
+      modified |= RemoveControllingFaninInternal(node, from_fanin_node);
+    } else {
+      modified |= RemoveRegularFaninInternal(
+          node, {from_fanin_node, from_fanin.index()});
+    }
+    if (modified) {
+      AddFaninInternal(node, {to_fanin_node, to_fanin.index()});
+    }
+    return Status::OK();
+  }
+
+  // In place mutation of regular fanins, requires no shifting of ports.
+  string to_fanin_string = TensorIdToString(to_fanin);
+  const int num_regular_fanins =
+      NumFanins(*node, /*include_controlling_nodes=*/false);
+  bool modified = false;
+  absl::flat_hash_set<InputPort>* from_fanin_port_fanouts = nullptr;
+  absl::flat_hash_set<InputPort>* to_fanin_port_fanouts = nullptr;
+  for (int i = 0; i < num_regular_fanins; ++i) {
+    if (ParseTensorName(node->input(i)) == from_fanin) {
+      InputPort input(node, i);
+      if (from_fanin_port_fanouts == nullptr) {
+        OutputPort from_fanin_port(from_fanin_node, from_fanin.index());
+        from_fanin_port_fanouts = &fanouts()[from_fanin_port];
+      }
+      from_fanin_port_fanouts->erase(input);
+
+      if (to_fanin_port_fanouts == nullptr) {
+        OutputPort to_fanin_port(to_fanin_node, to_fanin.index());
+        to_fanin_port_fanouts = &fanouts()[to_fanin_port];
+      }
+      to_fanin_port_fanouts->insert(input);
+
+      node->set_input(i, to_fanin_string);
+      modified = true;
+    }
+  }
+
+  // Dedup control dependencies and update max regular output ports.
+  if (modified) {
+    UpdateMaxRegularOutputPortForRemovedFanin(
+        {from_fanin_node, from_fanin.index()}, *from_fanin_port_fanouts);
+    if (max_regular_output_port()[to_fanin_node] < to_fanin.index()) {
+      max_regular_output_port()[to_fanin_node] = to_fanin.index();
+    }
+    if (CanDedupControlWithRegularInput(*this, *to_fanin_node)) {
+      RemoveControllingFaninInternal(node, to_fanin_node);
+    }
+  }
+
+  return Status::OK();
+}
+
+Status MutableGraphView::UpdateRegularFaninByPort(absl::string_view node_name,
+                                                  int port,
+                                                  const TensorId& fanin) {
+  auto error_status = [node_name, port, fanin](absl::string_view msg) {
+    string params = absl::Substitute("node_name='$0', port=$1, fanin='$2'",
+                                     node_name, port, fanin.ToString());
+    return MutationError("UpdateRegularFaninByPort", params, msg);
+  };
+
+  TF_RETURN_IF_ERROR(CheckFaninIsRegular(fanin, error_status));
+  TF_RETURN_IF_ERROR(CheckAddingFaninToSelf(node_name, fanin, error_status));
+  NodeDef* node = GetNode(node_name);
+  TF_RETURN_IF_ERROR(CheckNodeExists(node_name, node, error_status));
+  const int last_regular_fanin_port =
+      gtl::FindWithDefault(max_regular_input_port(), node, -1);
+  TF_RETURN_IF_ERROR(
+      CheckPortRange(port, /*min=*/0, last_regular_fanin_port, error_status));
+  NodeDef* fanin_node = GetNode(fanin.node());
+  TF_RETURN_IF_ERROR(CheckNodeExists(fanin.node(), fanin_node, error_status));
+
+  TensorId tensor_id = ParseTensorName(node->input(port));
+  if (tensor_id == fanin) {
+    return Status::OK();
+  }
+
+  InputPort input(node, port);
+  OutputPort from_fanin_port(nodes()[tensor_id.node()], tensor_id.index());
+  absl::flat_hash_set<InputPort>* from_fanouts = &fanouts()[from_fanin_port];
+  from_fanouts->erase(input);
+  UpdateMaxRegularOutputPortForRemovedFanin(from_fanin_port, *from_fanouts);
+
+  OutputPort to_fanin_port(fanin_node, fanin.index());
+  fanouts()[to_fanin_port].insert(input);
+  UpdateMaxRegularOutputPortForAddedFanin(to_fanin_port);
+
+  node->set_input(port, TensorIdToString(fanin));
+
+  if (CanDedupControlWithRegularInput(*this, *fanin_node)) {
+    RemoveControllingFaninInternal(node, fanin_node);
+  }
+
+  return Status::OK();
+}
+
+Status MutableGraphView::SwapRegularFaninsByPorts(absl::string_view node_name,
+                                                  int from_port, int to_port) {
+  auto error_status = [node_name, from_port, to_port](absl::string_view msg) {
+    string params = absl::Substitute("node_name='$0', from_port=$1, to_port=$2",
+                                     node_name, from_port, to_port);
+    return MutationError("SwapRegularFaninsByPorts", params, msg);
+  };
+
+  NodeDef* node = GetNode(node_name);
+  TF_RETURN_IF_ERROR(CheckNodeExists(node_name, node, error_status));
+  const int last_regular_fanin_port =
+      gtl::FindWithDefault(max_regular_input_port(), node, -1);
+  TF_RETURN_IF_ERROR(CheckPortRange(from_port, /*min=*/0,
+                                    last_regular_fanin_port, error_status));
+  TF_RETURN_IF_ERROR(CheckPortRange(to_port, /*min=*/0, last_regular_fanin_port,
+                                    error_status));
+
+  if (from_port == to_port) {
+    return Status::OK();
+  }
+  TensorId from_fanin = ParseTensorName(node->input(from_port));
+  TensorId to_fanin = ParseTensorName(node->input(to_port));
+  if (from_fanin == to_fanin) {
+    return Status::OK();
+  }
+
+  InputPort from_input(node, from_port);
+  InputPort to_input(node, to_port);
+  NodeDef* from_fanin_node = GetNode(from_fanin.node());
+  absl::flat_hash_set<InputPort>* from_fanouts =
+      &fanouts()[{from_fanin_node, from_fanin.index()}];
+  from_fanouts->erase(from_input);
+  from_fanouts->insert(to_input);
+  NodeDef* to_fanin_node = GetNode(to_fanin.node());
+  absl::flat_hash_set<InputPort>* to_fanouts =
+      &fanouts()[{to_fanin_node, to_fanin.index()}];
+  to_fanouts->erase(to_input);
+  to_fanouts->insert(from_input);
+
+  node->mutable_input()->SwapElements(from_port, to_port);
+
+  return Status::OK();
+}
+
+Status MutableGraphView::UpdateAllRegularFaninsToControlling(
+    absl::string_view node_name) {
+  auto error_status = [node_name](absl::string_view msg) {
+    string params = absl::Substitute("node_name='$0'", node_name);
+    return MutationError("UpdateAllRegularFaninsToControlling", params, msg);
+  };
+
+  NodeDef* node = GetNode(node_name);
+  TF_RETURN_IF_ERROR(CheckNodeExists(node_name, node, error_status));
+
+  const int num_regular_fanins =
+      NumFanins(*node, /*include_controlling_nodes=*/false);
+  std::vector<OutputPort> regular_fanins;
+  regular_fanins.reserve(num_regular_fanins);
+  std::vector<NodeDef*> controlling_fanins;
+  controlling_fanins.reserve(num_regular_fanins);
+
+  // Get all regular fanins and derive controlling fanins.
+  for (int i = 0; i < num_regular_fanins; ++i) {
+    TensorId tensor_id = ParseTensorName(node->input(i));
+    OutputPort fanin_port(nodes()[tensor_id.node()], tensor_id.index());
+
+    string error_msg = "";
+    NodeDef* control_node =
+        GetControllingFaninToAdd(node_name, fanin_port, &error_msg);
+    if (!error_msg.empty()) {
+      return error_status(error_msg);
+    }
+
+    regular_fanins.push_back(fanin_port);
+    controlling_fanins.push_back(control_node);
+  }
+
+  // Replace regular fanins with controlling fanins and dedup.
+  int pos = 0;
+  InputPort input_port(node, Graph::kControlSlot);
+  absl::flat_hash_set<absl::string_view> controls;
+  for (int i = 0; i < num_regular_fanins; ++i) {
+    OutputPort fanin_port = regular_fanins[i];
+    NodeDef* control = controlling_fanins[i];
+    if (control == nullptr) {
+      control = GetOrCreateIdentityConsumingSwitch(fanin_port);
+    }
+    fanouts()[fanin_port].erase({node, i});
+    if (controls.contains(control->name())) {
+      continue;
+    }
+    controls.insert(control->name());
+    node->set_input(pos, AsControlDependency(control->name()));
+    fanouts()[{control, Graph::kControlSlot}].insert(input_port);
+    ++pos;
+  }
+
+  // Shift existing controlling fanins and dedup.
+  for (int i = num_regular_fanins; i < node->input_size(); ++i) {
+    TensorId tensor_id = ParseTensorName(node->input(i));
+    if (controls.contains(tensor_id.node())) {
+      continue;
+    }
+    controls.insert(tensor_id.node());
+    node->mutable_input()->SwapElements(pos, i);
+    ++pos;
+  }
+
+  // Remove duplicate controls and leftover regular fanins.
+  node->mutable_input()->DeleteSubrange(pos, node->input_size() - pos);
+  max_regular_input_port().erase(node);
+
+  return Status::OK();
+}
+
+Status MutableGraphView::CheckNodesCanBeDeleted(
+    const absl::flat_hash_set<string>& nodes_to_delete) {
+  std::vector<string> missing_nodes;
+  std::vector<string> nodes_with_fanouts;
+  for (const string& node_name_to_delete : nodes_to_delete) {
+    NodeDef* node = GetNode(node_name_to_delete);
+    if (node == nullptr) {
+      // Can't delete missing node.
+      missing_nodes.push_back(node_name_to_delete);
+      continue;
+    }
+    const int max_port = gtl::FindWithDefault(max_regular_output_port(), node,
+                                              Graph::kControlSlot);
+    for (int i = Graph::kControlSlot; i <= max_port; ++i) {
+      auto it = fanouts().find({node, i});
+      bool has_retained_fanout = false;
+      if (it != fanouts().end()) {
+        for (const auto& fanout : it->second) {
+          // Check if fanouts are of nodes to be deleted, and if so, they can be
+          // ignored, as they will be removed also.
+          if (!nodes_to_delete.contains(fanout.node->name())) {
+            // Removing node will leave graph in an invalid state.
+            has_retained_fanout = true;
+            break;
+          }
+        }
+      }
+      if (has_retained_fanout) {
+        nodes_with_fanouts.push_back(node_name_to_delete);
+        break;
+      }
+    }
+  }
+
+  // Error message can get quite long, so we only show the first 5 node names.
+  auto sort_and_sample = [](std::vector<string>* s) {
+    constexpr int kMaxNodeNames = 5;
+    std::sort(s->begin(), s->end());
+    if (s->size() > kMaxNodeNames) {
+      return absl::StrCat(
+          absl::StrJoin(s->begin(), s->begin() + kMaxNodeNames, ", "), ", ...");
+    }
+    return absl::StrJoin(*s, ", ");
+  };
+
+  if (!missing_nodes.empty()) {
+    VLOG(2) << absl::Substitute("Attempting to delete missing node(s) [$0].",
+                                sort_and_sample(&missing_nodes));
+  }
+  if (!nodes_with_fanouts.empty()) {
+    std::vector<string> input_node_names(nodes_to_delete.begin(),
+                                         nodes_to_delete.end());
+    string params = absl::Substitute("nodes_to_delete={$0}",
+                                     sort_and_sample(&input_node_names));
+    string error_msg =
+        absl::Substitute("can't delete node(s) with retained fanouts(s) [$0]",
+                         sort_and_sample(&nodes_with_fanouts));
+    return MutationError("DeleteNodes", params, error_msg);
+  }
+
+  return Status::OK();
+}
+
+Status MutableGraphView::DeleteNodes(
+    const absl::flat_hash_set<string>& nodes_to_delete) {
+  TF_RETURN_IF_ERROR(CheckNodesCanBeDeleted(nodes_to_delete));
+
+  // Find nodes in internal state and delete.
+  for (const string& node_name_to_delete : nodes_to_delete) {
+    NodeDef* node = GetNode(node_name_to_delete);
+    if (node != nullptr) {
+      RemoveFaninsInternal(node, /*keep_controlling_fanins=*/false);
+      RemoveFanoutsInternal(node);
+    }
+  }
+  for (const string& node_name_to_delete : nodes_to_delete) {
     nodes().erase(node_name_to_delete);
-  EraseNodesFromGraph(nodes_to_delete, graph());
+  }
+
+  // Find nodes in graph and delete by partitioning into nodes to retain and
+  // nodes to delete based on input set of nodes to delete by name.
+  // TODO(lyandy): Use a node name->idx hashmap if this is a performance
+  // bottleneck.
+  int pos = 0;
+  const int last_idx = graph()->node_size() - 1;
+  int last_pos = last_idx;
+  while (pos <= last_pos) {
+    if (nodes_to_delete.contains(graph()->node(pos).name())) {
+      graph()->mutable_node()->SwapElements(pos, last_pos);
+      --last_pos;
+    } else {
+      ++pos;
+    }
+  }
+  if (last_pos < last_idx) {
+    graph()->mutable_node()->DeleteSubrange(last_pos + 1, last_idx - last_pos);
+  }
+
+  return Status::OK();
 }
 
-void MutableGraphView::RemoveFanouts(NodeDef* deleted_node) {
+void MutableGraphView::RemoveFaninsInternal(NodeDef* deleted_node,
+                                            bool keep_controlling_fanins) {
   for (int i = 0; i < deleted_node->input_size(); ++i) {
     TensorId tensor_id = ParseTensorName(deleted_node->input(i));
+    bool is_control = IsTensorIdControlling(tensor_id);
+    if (keep_controlling_fanins && is_control) {
+      break;
+    }
     OutputPort fanin(nodes()[tensor_id.node()], tensor_id.index());
 
     InputPort input;
     input.node = deleted_node;
-    if (tensor_id.index() < 0)
-      input.port_id = Graph::kControlSlot;
-    else
-      input.port_id = i;
+    input.port_id = is_control ? Graph::kControlSlot : i;
+
+    auto it = fanouts().find(fanin);
+    if (it != fanouts().end()) {
+      absl::flat_hash_set<InputPort>* fanouts_set = &it->second;
+      fanouts_set->erase(input);
+      UpdateMaxRegularOutputPortForRemovedFanin(fanin, *fanouts_set);
+    }
+  }
+  max_regular_input_port().erase(deleted_node);
+}
 
-    fanouts()[fanin].erase(input);
+void MutableGraphView::RemoveFanoutsInternal(NodeDef* deleted_node) {
+  const int max_port =
+      gtl::FindWithDefault(max_regular_output_port(), deleted_node, -1);
+  for (int i = Graph::kControlSlot; i <= max_port; ++i) {
+    fanouts().erase({deleted_node, i});
   }
+  max_regular_output_port().erase(deleted_node);
 }
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/mutable_graph_view.h b/tensorflow/core/grappler/mutable_graph_view.h
index 355dd6c491763e96b509ce42977e2cf0f5db2eb5..a09c147be6c1a044d558c202e2047ae6a5d12916 100644
--- a/tensorflow/core/grappler/mutable_graph_view.h
+++ b/tensorflow/core/grappler/mutable_graph_view.h
@@ -16,11 +16,26 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_MUTABLE_GRAPH_VIEW_H_
 #define TENSORFLOW_CORE_GRAPPLER_MUTABLE_GRAPH_VIEW_H_
 
+#include <set>
+#include <string>
+
+#include "absl/container/flat_hash_set.h"
+#include "absl/strings/string_view.h"
+#include "absl/types/span.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/tensor_id.h"
 #include "tensorflow/core/grappler/graph_view.h"
+#include "tensorflow/core/grappler/op_types.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 namespace grappler {
 
+const char kMutableGraphViewCtrl[] = "ConstantFoldingCtrl";
+
 // A utility class to simplify the traversal of a GraphDef that, unlike
 // GraphView, supports updating the graph.  Note that you should not modify the
 // graph separately, because the view will get out of sync.
@@ -29,7 +44,7 @@ class MutableGraphView : public internal::GraphViewInternal<GraphDef, NodeDef> {
  public:
   explicit MutableGraphView(GraphDef* graph) : GraphViewInternal(graph) {
     for (NodeDef& node : *graph->mutable_node()) AddUniqueNodeOrDie(&node);
-    for (NodeDef& node : *graph->mutable_node()) AddFanouts(&node);
+    for (NodeDef& node : *graph->mutable_node()) AddAndDedupFanouts(&node);
   }
 
   // Lookup fanouts/fanins using immutable ports.
@@ -48,40 +63,266 @@ class MutableGraphView : public internal::GraphViewInternal<GraphDef, NodeDef> {
   // node in graph.
   NodeDef* AddNode(NodeDef&& node);
 
-  // Updates all fanouts (input ports fetching output tensors) from `from_node`
-  // to the `to_node`, including control dependencies.
+  // Adds all nodes from the `subgraph` to the underlying graph and updates the
+  // view. `subgraph` doesn't have to be a valid graph definition on it's own,
+  // it can have edges to the nodes that are not in it, however after adding
+  // it to the underlying graph, final graph must be valid.
+  //
+  // If subgraph function library is not empty, all new functions will be added
+  // to the graph. Functions that appear with the same name in both subgraph and
+  // the graph represented by *this, must have identical function definitions.
+  //
+  // IMPORTANT: All nodes and functions of the given subgraph moved into the
+  // underlying graph, which leaves subgraph in valid but undefined state.
+  Status AddSubgraph(GraphDef&& subgraph);
+
+  // Updates node `node_name` op, device, and attributes. This will clear any
+  // existing attributes. If it is not possible to update the node or if the
+  // node does not exist, an error will be returned and nothing will be modified
+  // in the graph.
+  Status UpdateNode(absl::string_view node_name, absl::string_view op,
+                    absl::string_view device,
+                    absl::Span<const std::pair<string, AttrValue>> attrs);
+
+  // Updates node `from_node_name` name to `to_node_name`. If `to_node_name` is
+  // in use, node `from_node_name` does not exist, or node `from_node_name` has
+  // fanouts and `update_fanouts` is set to false, an error will be returned and
+  // nothing will be modified in the graph.
+  Status UpdateNodeName(absl::string_view from_node_name,
+                        absl::string_view to_node_name, bool update_fanouts);
+
+  // Swap node names `from_node_name` and `to_node_name`. Self loops of one node
+  // are removed by updating the inputs introducing self loops to use the other
+  // node's name. Setting `update_fanouts` to false will exclude other fanouts
+  // from having their inputs updated, but inputs introducing self loops will
+  // always be updated regardless of `update_fanouts.
+  //
+  // Example:
+  //   1. foo(other:3, bar:2, ^bar)
+  //   2. bar(foo:3, other:1, foo:1, ^foo)
+  //   3. other(foo:5, bar:6)
+  //
+  // After calling SwapNodeNames("foo", "bar", false):
+  //   1. bar(other:3, foo:2, ^foo)
+  //   2. foo(bar:3, other:1, bar:1, ^bar)
+  //   3. other(foo:5, bar:6)
+  //
+  // After calling SwapNodeNames("foo", "bar", true):
+  //   1. bar(other:3, foo:2, ^foo)
+  //   2. foo(bar:3, other:1, bar:1, ^bar)
+  //   3. other(bar:5, foo:6)
   //
-  // Example: We have 2 nodes that use `bar` node output tensors as inputs:
-  //   1. foo1(bar:0, bar:1, other:0, ^bar)
+  // If it is not possible to swap node names (i.e. nodes do not exist or Switch
+  // control dependency may be introduced), an error will be returned and
+  // nothing will be modified in the graph.
+  Status SwapNodeNames(absl::string_view from_node_name,
+                       absl::string_view to_node_name, bool update_fanouts);
+
+  // Updates all fanouts (input ports fetching output tensors) from
+  // `from_node_name` to the `to_node_name`, including control dependencies.
+  //
+  // Example: We have 3 nodes that use `bar` node output tensors as inputs:
+  //   1. foo1(bar:0, bar:1, other:0)
   //   2. foo2(bar:1, other:1)
+  //   3. foo3(other:2, ^bar)
   //
-  // After calling ForwardOutputs(bar, new_bar):
-  //   1. foo1(new_bar:0, new_bar:1, other:0, ^new_bar)
+  // After calling UpdateFanouts(bar, new_bar):
+  //   1. foo1(new_bar:0, new_bar:1, other:0)
   //   2. foo2(new_bar:1, other:1)
-  void UpdateFanouts(absl::string_view from_node, absl::string_view to_node);
+  //   3. foo3(other:2, ^new_bar)
+  Status UpdateFanouts(absl::string_view from_node_name,
+                       absl::string_view to_node_name);
+
+  // Adds regular fanin `fanin` to node `node_name`. If the node or fanin do not
+  // exist in the graph, nothing will be modified in the graph. Otherwise fanin
+  // will be added after existing non control dependency fanins. Control
+  // dependencies will be deduped. To add control dependencies, use
+  // AddControllingFanin.
+  Status AddRegularFanin(absl::string_view node_name, const TensorId& fanin);
+
+  // Adds regular fanin `fanin` to node `node_name` at port `port`. If the node
+  // or fanin do not exist in the graph, nothing will be modified in the graph.
+  // Otherwise fanin will be inserted at port `port`. Control dependencies will
+  // be deduped. To add control dependencies, use AddControllingFanin.
+  //
+  // If the port is not a valid port (less than 0 or greater than the number of
+  // regular fanins), this will result in an error and the node will not be
+  // modified.
+  Status AddRegularFaninByPort(absl::string_view node_name, int port,
+                               const TensorId& fanin);
+
+  // Adds control dependency `fanin` to the target node named `node_name`. To
+  // add regular fanins, use AddRegularFanin.
+  //
+  // Case 1: If the fanin is not a Switch node, the control dependency is simply
+  // added to the target node:
+  //
+  //   fanin -^> target node.
+  //
+  // Case 2: If the fanin is a Switch node, we cannot anchor a control
+  // dependency on it, because unlike other nodes, only one of its outputs will
+  // be generated when the node is activated. In this case, we try to find an
+  // Identity/IdentityN node in the fanout of the relevant port of the Switch
+  // and add it as a fanin to the target node. If no such Identity/IdentityN
+  // node can be found, a new Identity node will be created. In both cases, we
+  // end up with:
+  //
+  //   fanin -> Identity{N} -^> target node.
+  //
+  // If the control dependency being added is redundant (control dependency
+  // already exists or control dependency can be deduped from regular fanins),
+  // this will not result in an error and the node will not be modified.
+  Status AddControllingFanin(absl::string_view node_name,
+                             const TensorId& fanin);
+
+  // Removes regular fanin `fanin` from node `node_name`. If the node or fanin
+  // do not exist in the graph, nothing will be modified in the graph. If there
+  // are multiple inputs that match the fanin, all of them will be removed. To
+  // remove controlling fanins, use RemoveControllingFanin.
+  //
+  // If the fanin being removed doesn't exist in the node's inputs, this will
+  // not result in an error and the node will not be modified.
+  Status RemoveRegularFanin(absl::string_view node_name, const TensorId& fanin);
+
+  // Removes regular fanin at port `port` from node `node_name`. If the node
+  // does not exist in the graph, nothing will be modified in the graph.
+  // To remove controlling fanins, use RemoveControllingFanin.
+  //
+  // If the port is not a valid port (less than 0 or greater than the last index
+  // of the regular fanins), this will result in an error and the node will not
+  // be modified.
+  Status RemoveRegularFaninByPort(absl::string_view node_name, int port);
+
+  // Removes control dependency `fanin_node_name` from the target node named
+  // `node_name`. If the node or fanin do not exist in the graph, nothing will
+  // be modified in the graph. To remove regular fanins, use RemoveRegualrFanin.
+  //
+  // If the fanin being removed doesn't exist in the node's inputs, this will
+  // not result in an error and the node will not be modified.
+  Status RemoveControllingFanin(absl::string_view node_name,
+                                absl::string_view fanin_node_name);
+
+  // Removes all fanins from node `node_name`. Control dependencies will be
+  // retained if keep_controlling_fanins is true.
+  //
+  // If no fanins are removed, this will not result in an error and the node
+  // will not be modified.
+  Status RemoveAllFanins(absl::string_view node_name,
+                         bool keep_controlling_fanins);
+
+  // Replaces all fanins `from_fanin` with `to_fanin` in node `node_name`. If
+  // the fanins or node do not exist, nothing will be modified in the graph.
+  // Control dependencies will be deduped.
+  //
+  // If the fanin being updated doesn't exist in the node's inputs, this will
+  // not result in an error and the node will not be modified.
+  Status UpdateFanin(absl::string_view node_name, const TensorId& from_fanin,
+                     const TensorId& to_fanin);
+
+  // Replaces fanin at port `port` in node `node_name` with fanin `fanin`. If
+  // the fanins or node do not exist, nothing will be modified in the graph.
+  // Control dependencies will be deduped.
+  //
+  // If the port is not a valid port (less than 0 or greater than the last index
+  // of the regular fanins), this will result in an error and the node will not
+  // be modified.
+  Status UpdateRegularFaninByPort(absl::string_view node_name, int port,
+                                  const TensorId& fanin);
+
+  // Swaps fanins at ports `from_port` and `to_port` in node `node_name`. If the
+  // node does not exist, nothing will be modified in the graph.
+  //
+  // If the ports are not a valid port (less than 0 or greater than the last
+  // index of the regular fanins), this will result in an error and the node
+  // will not be modified.
+  Status SwapRegularFaninsByPorts(absl::string_view node_name, int from_port,
+                                  int to_port);
+
+  // Updates all regular fanins to equivalent controlling fanins. If it is not
+  // possible, an error will be returned and nothing will be modified in the
+  // graph.
+  Status UpdateAllRegularFaninsToControlling(absl::string_view node_name);
 
-  // Deletes nodes from the graph.
-  void DeleteNodes(const std::set<string>& nodes_to_delete);
+  // Deletes nodes from the graph. If a node can't be safely removed,
+  // specifically if a node still has fanouts, an error will be returned. Nodes
+  // that can't be found are ignored.
+  Status DeleteNodes(const absl::flat_hash_set<string>& nodes_to_delete);
 
  private:
+  // Adds fanouts for fanins of node to graph, while deduping control
+  // dependencies from existing control dependencies and regular fanins. Note,
+  // node inputs will be mutated if control dependencies can be deduped.
+  void AddAndDedupFanouts(NodeDef* node);
+
+  // Finds next output port smaller than fanin.port_id and update. The
+  // max_regular_output_port is only updated if fanin.port_id is the same as the
+  // current max_regular_output_port and if the fanouts set is empty. If there
+  // are no regular outputs, max_regular_output_port will be erased.
+  void UpdateMaxRegularOutputPortForRemovedFanin(
+      const OutputPort& fanin,
+      const absl::flat_hash_set<InputPort>& fanin_fanouts);
+
+  // Updates max regular output port for newly added fanin by checking the
+  // current max and updating if the newly added fanin is of a larger port.
+  void UpdateMaxRegularOutputPortForAddedFanin(const OutputPort& fanin);
+
   // Updates all fanouts (input ports fetching output tensors) from `from_node`
   // to the `to_node`, including control dependencies.
   //
-  // Example: We have 2 nodes that use `bar` node output tensors as inputs:
-  //   1. foo1(bar:0, bar:1, other:0, ^bar)
+  // Example: We have 3 nodes that use `bar` node output tensors as inputs:
+  //   1. foo1(bar:0, bar:1, other:0)
   //   2. foo2(bar:1, other:1)
+  //   3. foo3(other:2, ^bar)
   //
-  // After calling ForwardOutputs(bar, new_bar):
-  //   1. foo1(new_bar:0, new_bar:1, other:0, ^new_bar)
+  // After calling UpdateFanouts(bar, new_bar):
+  //   1. foo1(new_bar:0, new_bar:1, other:0)
   //   2. foo2(new_bar:1, other:1)
+  //   3. foo3(other:2, ^new_bar)
   //
   // IMPORTANT: If `from_node` or `to_node` is not in the underlying graph, the
   // behavior is undefined.
-  void UpdateFanouts(NodeDef* from_node, NodeDef* to_node);
+  Status UpdateFanoutsInternal(NodeDef* from_node, NodeDef* to_node);
+
+  // Adds fanin to node. If fanin is a control dependency, existing control
+  // dependencies will be checked first before adding. Otherwise fanin will be
+  // added after existing non control dependency inputs.
+  bool AddFaninInternal(NodeDef* node, const OutputPort& fanin);
+
+  // Finds control dependency node to be used based on fanin. If fanin is not a
+  // Switch node, fanin.node is simply returned. Otherwise this will try to find
+  // a candidate Identity node consuming fanin, as the control dependency. If it
+  // is not possible or will introduce a self loop, an error message will be
+  // set. If nullptr is returned with no error
+  // GetOrCreateIdentityConsumingSwitch should be called to generate the new
+  // Identity node.
+  NodeDef* GetControllingFaninToAdd(absl::string_view node_name,
+                                    const OutputPort& fanin, string* error_msg);
+
+  // Finds a generated Identity node consuming Switch node `fanin.node` at port
+  // `fanin.port_id`. If such a node does not exist, a new Identity node will be
+  // created.
+  NodeDef* GetOrCreateIdentityConsumingSwitch(const OutputPort& fanin);
+
+  // Removes all instances of regular fanin `fanin` from node `node`.
+  bool RemoveRegularFaninInternal(NodeDef* node, const OutputPort& fanin);
+
+  // Removes controlling fanin `fanin_node` from node if such controlling fanin
+  // exists.
+  bool RemoveControllingFaninInternal(NodeDef* node, NodeDef* fanin_node);
+
+  // Checks if nodes to be deleted are missing or have any fanouts that will
+  // remain in the graph. If node is removed in either case, the graph will
+  // enter an invalid state.
+  Status CheckNodesCanBeDeleted(
+      const absl::flat_hash_set<string>& nodes_to_delete);
+
+  // Removes fanins of the deleted node from internal state. Control
+  // dependencies are retained iff keep_controlling_fanins is true.
+  void RemoveFaninsInternal(NodeDef* deleted_node,
+                            bool keep_controlling_fanins);
 
-  // Remove fanouts of the deleted node from internal state (including control
-  // dependencies).
-  void RemoveFanouts(NodeDef* deleted_node);
+  // Removes fanouts of the deleted node from internal state.
+  void RemoveFanoutsInternal(NodeDef* deleted_node);
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/mutable_graph_view_test.cc b/tensorflow/core/grappler/mutable_graph_view_test.cc
index c1b3f8c01cf3dbb570d64845fb7097d1b309fc30..07818d1f526b5b7d7897fd5db2c561b5d90965c7 100644
--- a/tensorflow/core/grappler/mutable_graph_view_test.cc
+++ b/tensorflow/core/grappler/mutable_graph_view_test.cc
@@ -14,10 +14,16 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/mutable_graph_view.h"
+#include "absl/strings/substitute.h"
+#include "absl/types/span.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/graph/tensor_id.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
+#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -25,6 +31,723 @@ namespace grappler {
 namespace {
 
 using ::tensorflow::test::function::NDef;
+using FDH = FunctionDefHelper;
+
+void CompareNodeFanins(const MutableGraphView& graph, NodeDef* node,
+                       absl::Span<const string> fanins) {
+  ASSERT_EQ(node->input_size(), fanins.size());
+  for (int i = 0; i < node->input_size(); ++i) {
+    TensorId tensor_id = ParseTensorName(fanins[i]);
+    EXPECT_EQ(ParseTensorName(node->input(i)), tensor_id);
+    int port;
+    if (tensor_id.index() == Graph::kControlSlot) {
+      port = Graph::kControlSlot;
+    } else {
+      port = i;
+    }
+    MutableGraphView::InputPort input_port(node, port);
+    MutableGraphView::OutputPort output_port =
+        graph.GetOutputPort(tensor_id.node(), tensor_id.index());
+    EXPECT_TRUE(graph.GetFanin(input_port).contains(output_port));
+    EXPECT_TRUE(graph.GetFanout(output_port).contains(input_port));
+  }
+}
+
+void CompareNodeFanouts(const MutableGraphView& graph, NodeDef* node,
+                        absl::Span<const string> fanouts) {
+  auto node_fanouts =
+      graph.GetFanouts(*node, /*include_controlled_nodes=*/true);
+  EXPECT_EQ(node_fanouts.size(), fanouts.size());
+  for (const string& fanout : fanouts) {
+    TensorId tensor_id = ParseTensorName(fanout);
+    MutableGraphView::InputPort input_port(graph.GetNode(tensor_id.node()),
+                                           tensor_id.index());
+    EXPECT_TRUE(node_fanouts.contains(input_port));
+  }
+}
+
+void CheckNode(const MutableGraphView& graph, absl::string_view node_name,
+               absl::string_view op, absl::string_view device,
+               absl::Span<const std::pair<string, FDH::AttrValueWrapper>> attrs,
+               absl::Span<const string> fanins,
+               absl::Span<const string> fanouts) {
+  NodeDef* node = graph.GetNode(node_name);
+  ASSERT_NE(node, nullptr);
+  EXPECT_EQ(node->op(), op);
+  EXPECT_EQ(node->device(), device);
+  EXPECT_EQ(node->attr_size(), attrs.size());
+  for (const auto& attr : attrs) {
+    auto it = node->attr().find(attr.first);
+    ASSERT_NE(it, node->attr().end());
+    EXPECT_TRUE(AreAttrValuesEqual(it->second, attr.second.proto));
+  }
+  CompareNodeFanins(graph, node, fanins);
+  CompareNodeFanouts(graph, node, fanouts);
+}
+
+void CheckGraph(const MutableGraphView& mutable_graph) {
+  GraphView immutable_graph(mutable_graph.graph());
+  EXPECT_EQ(mutable_graph.graph()->node_size(),
+            immutable_graph.graph()->node_size());
+  EXPECT_EQ(mutable_graph.graph(), immutable_graph.graph());
+
+  auto check_edges =
+      [](const absl::flat_hash_set<MutableGraphView::Edge>& mutable_edges,
+         const absl::flat_hash_set<GraphView::Edge>& immutable_edges) {
+        EXPECT_EQ(mutable_edges.size(), immutable_edges.size());
+        for (const auto& fanin_edge : mutable_edges) {
+          GraphView::Edge immutable_edge(
+              {fanin_edge.src.node, fanin_edge.src.port_id},
+              {fanin_edge.dst.node, fanin_edge.dst.port_id});
+          EXPECT_TRUE(immutable_edges.contains(immutable_edge));
+        }
+      };
+
+  // Check graph connectivity.
+  for (auto& node : *mutable_graph.graph()->mutable_node()) {
+    EXPECT_EQ(&node, immutable_graph.GetNode(node.name()));
+
+    auto mutable_fanins =
+        mutable_graph.GetFanins(node, /*include_controlling_nodes=*/true);
+    auto immutable_fanins =
+        immutable_graph.GetFanins(node, /*include_controlling_nodes=*/true);
+    EXPECT_EQ(mutable_fanins.size(), immutable_fanins.size());
+    for (const auto& fanin : mutable_fanins) {
+      GraphView::OutputPort immutable_fanin(fanin.node, fanin.port_id);
+      EXPECT_TRUE(immutable_fanins.contains(immutable_fanin));
+    }
+
+    auto mutable_fanouts =
+        mutable_graph.GetFanouts(node, /*include_controlled_nodes=*/true);
+    auto immutable_fanouts =
+        immutable_graph.GetFanouts(node, /*include_controlled_nodes=*/true);
+    EXPECT_EQ(mutable_fanouts.size(), immutable_fanouts.size());
+    for (const auto& fanout : mutable_fanouts) {
+      GraphView::InputPort immutable_fanout(fanout.node, fanout.port_id);
+      EXPECT_TRUE(immutable_fanouts.contains(immutable_fanout));
+    }
+
+    auto mutable_fanin_edges =
+        mutable_graph.GetFaninEdges(node, /*include_controlling_edges=*/true);
+    auto immutable_fanin_edges =
+        immutable_graph.GetFaninEdges(node, /*include_controlling_edges=*/true);
+    check_edges(mutable_fanin_edges, immutable_fanin_edges);
+
+    auto mutable_fanout_edges =
+        mutable_graph.GetFanoutEdges(node, /*include_controlled_edges=*/true);
+    auto immutable_fanout_edges =
+        immutable_graph.GetFanoutEdges(node, /*include_controlled_edges=*/true);
+    check_edges(mutable_fanout_edges, immutable_fanout_edges);
+  }
+}
+
+TEST(MutableGraphViewTest, AddSubgraph) {
+  GraphDef graph_def = test::function::GDef(
+      {
+          NDef("foo", "NotImportant", {}, {}),
+          NDef("bar", "NotImportant", {}, {}),
+          NDef("baz", "NotImportant", {"foo", "bar"}),
+      },
+      /*funcs=*/{});
+  MutableGraphView graph(&graph_def);
+
+  // `s/bar` node has inputs that are valid only if we add subgraph into the
+  // original graph.
+  GraphDef subgraph = test::function::GDef(
+      {
+          NDef("s/n0", "NotImportant", {}, {}),
+          NDef("s/n1", "NotImportant", {"bar", "s/n0"}, {}),
+      },
+      /*funcs=*/{});
+
+  TF_EXPECT_OK(graph.AddSubgraph(std::move(subgraph)));
+
+  // Fanins and fanouts must be updated for the nodes of the original graph, and
+  // added subgraph.
+  CheckNode(graph, "bar", "NotImportant", "", {}, {}, {"baz:1", "s/n1"});
+  CheckNode(graph, "s/n1", "NotImportant", "", {}, {"bar", "s/n0"}, {});
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, AddSubgraphAndAddFunction) {
+  GraphDef graph_def;
+  MutableGraphView graph(&graph_def);
+
+  FunctionDef x_times_two = test::function::XTimesTwo();
+  GraphDef subgraph = test::function::GDef({}, {x_times_two});
+
+  TF_EXPECT_OK(graph.AddSubgraph(std::move(subgraph)));
+  EXPECT_EQ(graph_def.library().function_size(), 1);
+}
+
+TEST(MutableGraphViewTest, AddSubgraphAndSkipSameFunction) {
+  FunctionDef x_times_two = test::function::XTimesTwo();
+
+  GraphDef graph_def = test::function::GDef({}, {x_times_two});
+  MutableGraphView graph(&graph_def);
+
+  GraphDef subgraph = test::function::GDef({}, {x_times_two});
+
+  TF_EXPECT_OK(graph.AddSubgraph(std::move(subgraph)));
+  EXPECT_EQ(graph_def.library().function_size(), 1);
+}
+
+TEST(MutableGraphViewTest, AddSubgraphAndFailIfFunctionDifferent) {
+  FunctionDef x_times_four = test::function::XTimesFour();
+  x_times_four.mutable_signature()->set_name("XTimesTwo");
+
+  GraphDef graph_def = test::function::GDef({}, {x_times_four});
+  MutableGraphView graph(&graph_def);
+
+  FunctionDef x_times_two = test::function::XTimesTwo();
+  GraphDef subgraph = test::function::GDef({}, {x_times_two});
+
+  Status status = graph.AddSubgraph(std::move(subgraph));
+  EXPECT_FALSE(status.ok());
+  EXPECT_EQ(status.error_message(),
+            "MutableGraphView::AddSubgraph(function_size=1) error: Found "
+            "different function definition with the same name: XTimesTwo.");
+}
+
+TEST(MutableGraphViewTest, UpdateNodeNoDedupControlDependency) {
+  constexpr char kDevice[] = "/device:foo:0";
+  GraphDef graph_def = test::function::GDef(
+      {NDef("bar_1", "Switch", {}, {}), NDef("bar_2", "Identity", {"bar_1:1"}),
+       NDef("other", "NotImportant", {}, {}),
+       NDef("foo_1", "NotImportant", {"bar_2", "other", "bar_2:1", "^bar_2"}),
+       NDef("foo_2", "NotImportant", {"other:1", "bar_2:2", "^bar_2"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  AttrValue list_value;
+  list_value.mutable_list()->add_type(DT_FLOAT);
+  TF_EXPECT_OK(
+      graph.UpdateNode("bar_2", "IdentityN", kDevice, {{"T", list_value}}));
+
+  CheckNode(graph, "bar_1", "Switch", "", {}, {}, {"bar_2"});
+  CheckNode(graph, "bar_2", "IdentityN", kDevice, {{"T", list_value}},
+            {"bar_1:1"}, {"foo_1", "foo_1:2", "^foo_1", "foo_2:1", "^foo_2"});
+  CheckNode(graph, "other", "NotImportant", "", {}, {}, {"foo_1:1", "foo_2"});
+  CheckNode(graph, "foo_1", "NotImportant", "", {},
+            {"bar_2", "other", "bar_2:1", "^bar_2"}, {});
+  CheckNode(graph, "foo_2", "NotImportant", "", {},
+            {"other:1", "bar_2:2", "^bar_2"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, UpdateNodeDedupControlDependency) {
+  constexpr char kDevice[] = "/device:foo:0";
+  GraphDef graph_def = test::function::GDef(
+      {NDef("bar_1", "Switch", {}, {}), NDef("bar_2", "Identity", {"bar_1:1"}),
+       NDef("other", "NotImportant", {}, {}),
+       NDef("foo_1", "NotImportant", {"bar_2", "other", "bar_2:1", "^bar_2"}),
+       NDef("foo_2", "NotImportant", {"other:1", "bar_2:2", "^bar_2"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.UpdateNode("bar_2", "NotImportant", kDevice, {}));
+
+  CheckNode(graph, "bar_1", "Switch", "", {}, {}, {"bar_2"});
+  CheckNode(graph, "bar_2", "NotImportant", kDevice, {}, {"bar_1:1"},
+            {"foo_1", "foo_1:2", "foo_2:1"});
+  CheckNode(graph, "other", "NotImportant", "", {}, {}, {"foo_1:1", "foo_2"});
+  CheckNode(graph, "foo_1", "NotImportant", "", {},
+            {"bar_2", "other", "bar_2:1"}, {});
+  CheckNode(graph, "foo_2", "NotImportant", "", {}, {"other:1", "bar_2:2"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, UpdateNodeSwitchNoControlDependency) {
+  constexpr char kDevice[] = "/device:foo:0";
+  GraphDef graph_def =
+      test::function::GDef({NDef("foo", "NotImportant", {}, {}),
+                            NDef("bar", "NotImportant", {"foo:1"})},
+                           /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.UpdateNode("foo", "Switch", kDevice, {}));
+
+  CheckNode(graph, "foo", "Switch", kDevice, {}, {}, {"bar"});
+  CheckNode(graph, "bar", "NotImportant", "", {}, {"foo:1"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, UpdateNodeSwitchControlDependency) {
+  constexpr char kDevice[] = "/device:foo:0";
+  GraphDef graph_def =
+      test::function::GDef({NDef("foo", "NotImportant", {}, {}),
+                            NDef("bar", "NotImportant", {"^foo"})},
+                           /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  AttrValue attr;
+  attr.set_type(DT_FLOAT);
+  Status s = graph.UpdateNode("foo", "Switch", kDevice, {{"T", attr}});
+  EXPECT_FALSE(s.ok());
+  string expected_msg =
+      "MutableGraphView::UpdateNodeOp(node_name='foo', op='Switch', "
+      "device='/device:foo:0', attrs={('T', type: DT_FLOAT)}) error: can't "
+      "change node op to Switch when node drives a control dependency "
+      "(alternatively, we could add the identity node needed, but it seems "
+      "like an unlikely event and probably a mistake).";
+  EXPECT_EQ(s.error_message(), expected_msg);
+
+  CheckNode(graph, "foo", "NotImportant", "", {}, {}, {"^bar"});
+  CheckNode(graph, "bar", "NotImportant", "", {}, {"^foo"}, {});
+
+  CheckGraph(graph);
+}
+
+absl::flat_hash_map<string, std::vector<string>> GetNodeInputsFromGraph(
+    const GraphDef& graph, absl::string_view node_to_exclude) {
+  absl::flat_hash_map<string, std::vector<string>> node_inputs;
+  for (const auto& node : graph.node()) {
+    if (node.name() == node_to_exclude) {
+      continue;
+    }
+    node_inputs[node.name()] =
+        std::vector<string>(node.input().begin(), node.input().end());
+  }
+  return node_inputs;
+}
+
+void CheckUnmodifiedNodeFanins(
+    const GraphDef& graph, absl::string_view node_to_exclude,
+    const absl::flat_hash_map<string, std::vector<string>>&
+        unmodified_node_inputs) {
+  for (const auto& node : graph.node()) {
+    if (node.name() == node_to_exclude) {
+      continue;
+    }
+    auto it = unmodified_node_inputs.find(node.name());
+    ASSERT_NE(it, unmodified_node_inputs.end());
+    ASSERT_EQ(it->second.size(), node.input_size());
+    for (int i = 0; i < node.input_size(); ++i) {
+      EXPECT_EQ(node.input(i), it->second[i]);
+    }
+  }
+}
+
+void TestUpdateNodeName(absl::string_view from_node_name, bool node_exists,
+                        absl::string_view to_node_name, bool update_fanouts,
+                        bool success, const string& error_msg,
+                        absl::Span<const string> expected_fanins) {
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {"a"}),
+       NDef("c", "NotImportant", {}, {})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  NodeDef* node = graph.GetNode(from_node_name);
+  if (node_exists) {
+    EXPECT_NE(node, nullptr);
+  } else {
+    EXPECT_EQ(node, nullptr);
+  }
+
+  absl::flat_hash_map<string, std::vector<string>> unmodified_node_inputs =
+      GetNodeInputsFromGraph(graph_def, from_node_name);
+
+  Status s = graph.UpdateNodeName(from_node_name, to_node_name, update_fanouts);
+  EXPECT_EQ(s.ok(), success);
+  string updated_node_name;
+  if (success) {
+    updated_node_name = string(to_node_name);
+  } else {
+    updated_node_name = string(from_node_name);
+    EXPECT_EQ(s.error_message(), error_msg);
+  }
+  if (node_exists) {
+    EXPECT_EQ(node->name(), updated_node_name);
+    CompareNodeFanins(graph, node, expected_fanins);
+  }
+
+  CheckUnmodifiedNodeFanins(graph_def, updated_node_name,
+                            unmodified_node_inputs);
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, UpdateNodeName) {
+  string error_msg;
+  // Node has no fanouts.
+  TestUpdateNodeName("b", /*node_exists=*/true, "d", /*update_fanouts=*/false,
+                     /*success=*/true, error_msg, {"a"});
+  // Node has fanouts and rename to self.
+  TestUpdateNodeName("b", /*node_exists=*/true, "b", /*update_fanouts=*/false,
+                     /*success=*/true, error_msg, {"a"});
+  // Node has no fanouts and rename to self.
+  TestUpdateNodeName("a", /*node_exists=*/true, "a", /*update_fanouts=*/false,
+                     /*success=*/true, error_msg, {});
+
+  // New node name is in use.
+  error_msg =
+      "MutableGraphView::UpdateNodeName(from_node_name='c', to_node_name='b', "
+      "update_fanouts=false) error: can't update node name because new node "
+      "name is in use.";
+  TestUpdateNodeName("c", /*node_exists=*/true, "b", /*update_fanouts=*/false,
+                     /*success=*/false, error_msg, {});
+  error_msg =
+      "MutableGraphView::UpdateNodeName(from_node_name='a', to_node_name='b', "
+      "update_fanouts=true) error: can't update node name because new node "
+      "name is in use.";
+  TestUpdateNodeName("a", /*node_exists=*/true, "b", /*update_fanouts=*/true,
+                     /*success=*/false, error_msg, {});
+  // Node has fanouts.
+  error_msg =
+      "MutableGraphView::UpdateNodeName(from_node_name='a', to_node_name='d', "
+      "update_fanouts=false) error: can't update node name because node has "
+      "fanouts.";
+  TestUpdateNodeName("a", /*node_exists=*/true, "d", /*update_fanouts=*/false,
+                     /*success=*/false, error_msg, {});
+  // Node does not exist.
+  error_msg =
+      "MutableGraphView::UpdateNodeName(from_node_name='d', to_node_name='e', "
+      "update_fanouts=false) error: node 'd' was not found.";
+  TestUpdateNodeName("d", /*node_exists=*/false, "e", /*update_fanouts=*/false,
+                     /*success=*/false, error_msg, {});
+  error_msg =
+      "MutableGraphView::UpdateNodeName(from_node_name='d', to_node_name='e', "
+      "update_fanouts=true) error: node 'd' was not found.";
+  TestUpdateNodeName("d", /*node_exists=*/false, "e", /*update_fanouts=*/true,
+                     /*success=*/false, error_msg, {});
+}
+
+TEST(MutableGraphViewTest, UpdateNodeNameWithFanouts) {
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {"a:2"}),
+       NDef("c", "NotImportant", {"b", "^a"}),
+       NDef("d", "NotImportant", {"^b", "^a"}),
+       NDef("e", "NotImportant", {"b:2", "c:4", "b:1", "^a"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.UpdateNodeName("b", "f", /*update_fanouts=*/true));
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"f", "^c", "^d", "^e"});
+  CheckNode(graph, "f", "NotImportant", "", {}, {"a:2"},
+            {"c", "^d", "e", "e:2"});
+  CheckNode(graph, "c", "NotImportant", "", {}, {"f", "^a"}, {"e:1"});
+  CheckNode(graph, "d", "NotImportant", "", {}, {"^f", "^a"}, {});
+  CheckNode(graph, "e", "NotImportant", "", {}, {"f:2", "c:4", "f:1", "^a"},
+            {});
+
+  CheckGraph(graph);
+}
+
+GraphDef SimpleSwapNodeNamesMutationGraph() {
+  return test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("switch_1", "Switch", {"a"}),
+       NDef("identity_1", "Identity", {"switch_1:1"}),
+       NDef("b", "NotImportant", {}, {}), NDef("switch_2", "Switch", {"b"}),
+       NDef("identity_2", "Identity", {"switch_2:0"}),
+       NDef("foo_1", "NotImportant", {"identity_1", "^identity_1"}),
+       NDef("foo_2", "NotImportant", {"identity_2", "^identity_2"})},
+      /*funcs=*/{});
+}
+
+void TestSwapNodeNames(bool update_fanouts) {
+  GraphDef graph_def = SimpleSwapNodeNamesMutationGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.SwapNodeNames("foo_1", "foo_2", update_fanouts));
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"switch_1"});
+  CheckNode(graph, "switch_1", "Switch", "", {}, {"a"}, {"identity_1"});
+  CheckNode(graph, "identity_1", "Identity", "", {}, {"switch_1:1"},
+            {"foo_2", "^foo_2"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {}, {"switch_2"});
+  CheckNode(graph, "switch_2", "Switch", "", {}, {"b"}, {"identity_2"});
+  CheckNode(graph, "identity_2", "Identity", "", {}, {"switch_2:0"},
+            {"foo_1", "^foo_1"});
+  CheckNode(graph, "foo_2", "NotImportant", "", {},
+            {"identity_1", "^identity_1"}, {});
+  CheckNode(graph, "foo_1", "NotImportant", "", {},
+            {"identity_2", "^identity_2"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphView, SwapNodeNames) {
+  TestSwapNodeNames(/*update_fanouts=*/false);
+  TestSwapNodeNames(/*update_fanouts=*/true);
+}
+
+void TestSwapNodeNamesWithSameNames(bool update_fanouts) {
+  GraphDef graph_def = SimpleSwapNodeNamesMutationGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.SwapNodeNames("identity_1", "identity_1", update_fanouts));
+
+  // No changes to graph.
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"switch_1"});
+  CheckNode(graph, "switch_1", "Switch", "", {}, {"a"}, {"identity_1"});
+  CheckNode(graph, "identity_1", "Identity", "", {}, {"switch_1:1"},
+            {"foo_1", "^foo_1"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {}, {"switch_2"});
+  CheckNode(graph, "switch_2", "Switch", "", {}, {"b"}, {"identity_2"});
+  CheckNode(graph, "identity_2", "Identity", "", {}, {"switch_2:0"},
+            {"foo_2", "^foo_2"});
+  CheckNode(graph, "foo_1", "NotImportant", "", {},
+            {"identity_1", "^identity_1"}, {});
+  CheckNode(graph, "foo_2", "NotImportant", "", {},
+            {"identity_2", "^identity_2"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphView, SwapNodeNamesSameName) {
+  TestSwapNodeNamesWithSameNames(/*update_fanouts=*/false);
+  TestSwapNodeNamesWithSameNames(/*update_fanouts=*/true);
+}
+
+TEST(MutableGraphView, SwapNodeNamesBetweenSwitches) {
+  GraphDef graph_def = SimpleSwapNodeNamesMutationGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(
+      graph.SwapNodeNames("switch_1", "switch_2", /*update_fanouts=*/false));
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"switch_2"});
+  CheckNode(graph, "switch_2", "Switch", "", {}, {"a"}, {"identity_2"});
+  CheckNode(graph, "identity_1", "Identity", "", {}, {"switch_1:1"},
+            {"foo_1", "^foo_1"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {}, {"switch_1"});
+  CheckNode(graph, "switch_1", "Switch", "", {}, {"b"}, {"identity_1"});
+  CheckNode(graph, "identity_2", "Identity", "", {}, {"switch_2:0"},
+            {"foo_2", "^foo_2"});
+  CheckNode(graph, "foo_1", "NotImportant", "", {},
+            {"identity_1", "^identity_1"}, {});
+  CheckNode(graph, "foo_2", "NotImportant", "", {},
+            {"identity_2", "^identity_2"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphView, SwapNodeNamesBetweenSwitchesAndUpdateFanouts) {
+  GraphDef graph_def = SimpleSwapNodeNamesMutationGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(
+      graph.SwapNodeNames("switch_1", "switch_2", /*update_fanouts=*/true));
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"switch_2"});
+  CheckNode(graph, "switch_2", "Switch", "", {}, {"a"}, {"identity_1"});
+  CheckNode(graph, "identity_1", "Identity", "", {}, {"switch_2:1"},
+            {"foo_1", "^foo_1"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {}, {"switch_1"});
+  CheckNode(graph, "switch_1", "Switch", "", {}, {"b"}, {"identity_2"});
+  CheckNode(graph, "identity_2", "Identity", "", {}, {"switch_1:0"},
+            {"foo_2", "^foo_2"});
+  CheckNode(graph, "foo_1", "NotImportant", "", {},
+            {"identity_1", "^identity_1"}, {});
+  CheckNode(graph, "foo_2", "NotImportant", "", {},
+            {"identity_2", "^identity_2"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphView, SwapNodeNamesSwitchAndNonSwitch) {
+  GraphDef graph_def = SimpleSwapNodeNamesMutationGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.SwapNodeNames("a", "switch_1", /*update_fanouts=*/false));
+
+  // Dedup controls and fix self loop.
+  CheckNode(graph, "switch_1", "NotImportant", "", {}, {}, {"a", "identity_1"});
+  CheckNode(graph, "a", "Switch", "", {}, {"switch_1"}, {});
+  CheckNode(graph, "identity_1", "Identity", "", {}, {"switch_1:1"}, {"foo_1"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {}, {"switch_2"});
+  CheckNode(graph, "switch_2", "Switch", "", {}, {"b"}, {"identity_2"});
+  CheckNode(graph, "identity_2", "Identity", "", {}, {"switch_2:0"},
+            {"foo_2", "^foo_2"});
+  CheckNode(graph, "foo_1", "NotImportant", "", {}, {"identity_1"}, {});
+  CheckNode(graph, "foo_2", "NotImportant", "", {},
+            {"identity_2", "^identity_2"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphView, SwapNodeNamesSwitchAndNonSwitchAndUpdateFanouts) {
+  GraphDef graph_def = SimpleSwapNodeNamesMutationGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.SwapNodeNames("a", "switch_1", /*update_fanouts=*/true));
+
+  CheckNode(graph, "switch_1", "NotImportant", "", {}, {}, {"a"});
+  CheckNode(graph, "a", "Switch", "", {}, {"switch_1"}, {"identity_1"});
+  CheckNode(graph, "identity_1", "Identity", "", {}, {"a:1"},
+            {"foo_1", "^foo_1"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {}, {"switch_2"});
+  CheckNode(graph, "switch_2", "Switch", "", {}, {"b"}, {"identity_2"});
+  CheckNode(graph, "identity_2", "Identity", "", {}, {"switch_2:0"},
+            {"foo_2", "^foo_2"});
+  CheckNode(graph, "foo_1", "NotImportant", "", {},
+            {"identity_1", "^identity_1"}, {});
+  CheckNode(graph, "foo_2", "NotImportant", "", {},
+            {"identity_2", "^identity_2"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphView, SwapNodeNamesNonSwitchAndSwitch) {
+  GraphDef graph_def = SimpleSwapNodeNamesMutationGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.SwapNodeNames("switch_2", "b", /*update_fanouts=*/false));
+
+  // Dedup controls and fix self loop.
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"switch_1"});
+  CheckNode(graph, "switch_1", "Switch", "", {}, {"a"}, {"identity_1"});
+  CheckNode(graph, "identity_1", "Identity", "", {}, {"switch_1:1"},
+            {"foo_1", "^foo_1"});
+  CheckNode(graph, "switch_2", "NotImportant", "", {}, {}, {"b", "identity_2"});
+  CheckNode(graph, "b", "Switch", "", {}, {"switch_2"}, {});
+  CheckNode(graph, "identity_2", "Identity", "", {}, {"switch_2:0"}, {"foo_2"});
+  CheckNode(graph, "foo_1", "NotImportant", "", {},
+            {"identity_1", "^identity_1"}, {});
+  CheckNode(graph, "foo_2", "NotImportant", "", {}, {"identity_2"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphView, SwapNodeNamesNonSwitchAndSwitchAndUpdateFanouts) {
+  GraphDef graph_def = SimpleSwapNodeNamesMutationGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.SwapNodeNames("switch_2", "b", /*update_fanouts=*/true));
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"switch_1"});
+  CheckNode(graph, "switch_1", "Switch", "", {}, {"a"}, {"identity_1"});
+  CheckNode(graph, "identity_1", "Identity", "", {}, {"switch_1:1"},
+            {"foo_1", "^foo_1"});
+  CheckNode(graph, "switch_2", "NotImportant", "", {}, {}, {"b"});
+  CheckNode(graph, "b", "Switch", "", {}, {"switch_2"}, {"identity_2"});
+  CheckNode(graph, "identity_2", "Identity", "", {}, {"b:0"},
+            {"foo_2", "^foo_2"});
+  CheckNode(graph, "foo_1", "NotImportant", "", {},
+            {"identity_1", "^identity_1"}, {});
+  CheckNode(graph, "foo_2", "NotImportant", "", {},
+            {"identity_2", "^identity_2"}, {});
+
+  CheckGraph(graph);
+}
+
+void TestSwapNodeNamesSimpleSelfLoop(bool update_fanouts) {
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {"b:7"}), NDef("b", "NotImportant", {"a:10"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.SwapNodeNames("a", "b", update_fanouts));
+
+  // No self loops.
+  CheckNode(graph, "a", "NotImportant", "", {}, {"b:10"}, {"b:0"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {"a:7"}, {"a:0"});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphView, SwapNodeNamesSelfLoops) {
+  TestSwapNodeNamesSimpleSelfLoop(/*update_fanouts=*/false);
+  TestSwapNodeNamesSimpleSelfLoop(/*update_fanouts=*/true);
+}
+
+void TestSwapNodeNamesError(absl::string_view from_node_name,
+                            absl::string_view to_node_name, bool update_fanouts,
+                            const string& error_msg) {
+  GraphDef graph_def = SimpleSwapNodeNamesMutationGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  Status s = graph.SwapNodeNames(from_node_name, to_node_name, update_fanouts);
+  EXPECT_EQ(s.ok(), false);
+  EXPECT_EQ(s.error_message(), error_msg);
+
+  // No changes to graph.
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"switch_1"});
+  CheckNode(graph, "switch_1", "Switch", "", {}, {"a"}, {"identity_1"});
+  CheckNode(graph, "identity_1", "Identity", "", {}, {"switch_1:1"},
+            {"foo_1", "^foo_1"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {}, {"switch_2"});
+  CheckNode(graph, "switch_2", "Switch", "", {}, {"b"}, {"identity_2"});
+  CheckNode(graph, "identity_2", "Identity", "", {}, {"switch_2:0"},
+            {"foo_2", "^foo_2"});
+  CheckNode(graph, "foo_1", "NotImportant", "", {},
+            {"identity_1", "^identity_1"}, {});
+  CheckNode(graph, "foo_2", "NotImportant", "", {},
+            {"identity_2", "^identity_2"}, {});
+
+  CheckGraph(graph);
+}
+
+// TODO(lyandy): add tests with update_fanouts == true.
+TEST(MutableGraphView, SwapNodeNamesError) {
+  string error_msg;
+  // Missing nodes.
+  error_msg =
+      "MutableGraphView::SwapNodeNames(from_node_name='foo_3', "
+      "to_node_name='foo_2', update_fanouts=false) error: node 'foo_3' was not "
+      "found.";
+  TestSwapNodeNamesError("foo_3", "foo_2", /*update_fanouts=*/false, error_msg);
+  error_msg =
+      "MutableGraphView::SwapNodeNames(from_node_name='foo_3', "
+      "to_node_name='foo_2', update_fanouts=true) error: node 'foo_3' was not "
+      "found.";
+  TestSwapNodeNamesError("foo_3", "foo_2", /*update_fanouts=*/true, error_msg);
+  error_msg =
+      "MutableGraphView::SwapNodeNames(from_node_name='foo_1', "
+      "to_node_name='foo_4', update_fanouts=false) error: node 'foo_4' was not "
+      "found.";
+  TestSwapNodeNamesError("foo_1", "foo_4", /*update_fanouts=*/false, error_msg);
+  error_msg =
+      "MutableGraphView::SwapNodeNames(from_node_name='foo_1', "
+      "to_node_name='foo_4', update_fanouts=true) error: node 'foo_4' was not "
+      "found.";
+  TestSwapNodeNamesError("foo_1", "foo_4", /*update_fanouts=*/true, error_msg);
+  error_msg =
+      "MutableGraphView::SwapNodeNames(from_node_name='foo_5', "
+      "to_node_name='foo_6', update_fanouts=false) error: node 'foo_5' was not "
+      "found.";
+  TestSwapNodeNamesError("foo_5", "foo_6", /*update_fanouts=*/false, error_msg);
+  error_msg =
+      "MutableGraphView::SwapNodeNames(from_node_name='foo_5', "
+      "to_node_name='foo_6', update_fanouts=true) error: node 'foo_5' was not "
+      "found.";
+  TestSwapNodeNamesError("foo_5", "foo_6", /*update_fanouts=*/true, error_msg);
+
+  // Switch control dependencies.
+  error_msg =
+      "MutableGraphView::SwapNodeNames(from_node_name='switch_2', "
+      "to_node_name='identity_1', update_fanouts=false) error: can't swap node "
+      "name 'switch_2' as it will become a Switch control dependency.";
+  TestSwapNodeNamesError("switch_2", "identity_1", /*update_fanouts=*/false,
+                         error_msg);
+  error_msg =
+      "MutableGraphView::SwapNodeNames(from_node_name='identity_2', "
+      "to_node_name='switch_1', update_fanouts=false) error: can't swap node "
+      "name 'switch_1' as it will become a Switch control dependency.";
+  TestSwapNodeNamesError("identity_2", "switch_1", /*update_fanouts=*/false,
+                         error_msg);
+}
 
 TEST(MutableGraphViewTest, AddAndUpdateFanouts) {
   // Actual node.op() is not important in this test.
@@ -32,114 +755,2245 @@ TEST(MutableGraphViewTest, AddAndUpdateFanouts) {
       {NDef("bar", "NotImportant", {}, {}),
        NDef("other", "NotImportant", {}, {}),
        NDef("foo_1", "NotImportant", {"bar", "other", "bar:1", "^bar"}),
-       NDef("foo_2", "NotImportant", {"other:1", "bar:2", "^bar"})},
-      /* empty function library */ {});
+       NDef("foo_2", "NotImportant", {"other:1", "bar:2", "^bar"}),
+       NDef("foo_3", "NotImportant", {"other:2", "^bar"})},
+      /*funcs=*/{});
 
   MutableGraphView graph(&graph_def);
 
   NodeDef* new_bar = graph.AddNode(NDef("new_bar", "NotImportant", {}, {}));
-  NodeDef* bar = graph.GetNode("bar");
-
-  graph.UpdateFanouts(bar->name(), new_bar->name());
-
-  // Fanout nodes must have their inputs updated.
-  NodeDef* foo_1 = graph.GetNode("foo_1");
-  ASSERT_NE(foo_1, nullptr);
-  ASSERT_EQ(foo_1->input_size(), 4);
-  EXPECT_EQ(foo_1->input(0), "new_bar");
-  EXPECT_EQ(foo_1->input(1), "other");
-  EXPECT_EQ(foo_1->input(2), "new_bar:1");
-  EXPECT_EQ(foo_1->input(3), "^new_bar");
-
-  NodeDef* foo_2 = graph.GetNode("foo_2");
-  ASSERT_NE(foo_2, nullptr);
-  ASSERT_EQ(foo_2->input_size(), 3);
-  EXPECT_EQ(foo_2->input(0), "other:1");
-  EXPECT_EQ(foo_2->input(1), "new_bar:2");
-  EXPECT_EQ(foo_2->input(2), "^new_bar");
-
-  // And fanouts mapping must be also updated for both nodes.
-  bool include_control_fanouts = true;
-  auto old_node_fanouts = graph.GetFanouts(*bar, include_control_fanouts);
-  auto new_node_fanouts = graph.GetFanouts(*new_bar, include_control_fanouts);
-
-  EXPECT_TRUE(old_node_fanouts.empty());
-  EXPECT_EQ(new_node_fanouts.count(MutableGraphView::InputPort(foo_1, 0)), 1);
-  EXPECT_EQ(new_node_fanouts.count(MutableGraphView::InputPort(foo_1, 2)), 1);
-  EXPECT_EQ(new_node_fanouts.count(MutableGraphView::InputPort(foo_1, -1)), 1);
-  EXPECT_EQ(new_node_fanouts.count(MutableGraphView::InputPort(foo_2, 1)), 1);
-  EXPECT_EQ(new_node_fanouts.count(MutableGraphView::InputPort(foo_2, -1)), 1);
+
+  TF_EXPECT_OK(graph.UpdateFanouts("bar", new_bar->name()));
+
+  // Fanins and fanouts must be updated.
+  CheckNode(graph, "bar", "NotImportant", "", {}, {}, {});
+  CheckNode(graph, "other", "NotImportant", "", {}, {},
+            {"foo_1:1", "foo_2", "foo_3"});
+  CheckNode(graph, "foo_1", "NotImportant", "", {},
+            {"new_bar", "other", "new_bar:1"}, {});
+  CheckNode(graph, "foo_2", "NotImportant", "", {}, {"other:1", "new_bar:2"},
+            {});
+  CheckNode(graph, "foo_3", "NotImportant", "", {}, {"other:2", "^new_bar"},
+            {});
+  CheckNode(graph, "new_bar", "NotImportant", "", {}, {},
+            {"foo_1:0", "foo_1:2", "foo_2:1", "^foo_3"});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, AddAndUpdateFanoutsKeepControls) {
+  GraphDef graph_def = test::function::GDef(
+      {NDef("bar_1", "Switch", {}, {}), NDef("bar_2", "Identity", {"bar_1:1"}),
+       NDef("other", "NotImportant", {}, {}),
+       NDef("foo_1", "NotImportant", {"bar_2", "other", "bar_2:1", "^bar_2"}),
+       NDef("foo_2", "NotImportant", {"other:1", "bar_2:2", "^bar_2"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  NodeDef* new_bar = graph.AddNode(NDef("new_bar", "Identity", {"bar_1:2"}));
+
+  TF_EXPECT_OK(graph.UpdateFanouts("bar_2", new_bar->name()));
+
+  // Fanins and fanouts must be updated.
+  CheckNode(graph, "bar_1", "Switch", "", {}, {}, {"bar_2", "new_bar"});
+  CheckNode(graph, "bar_2", "Identity", "", {}, {"bar_1:1"}, {});
+  CheckNode(graph, "other", "NotImportant", "", {}, {}, {"foo_1:1", "foo_2"});
+  CheckNode(graph, "foo_1", "NotImportant", "", {},
+            {"new_bar", "other", "new_bar:1", "^new_bar"}, {});
+  CheckNode(graph, "foo_2", "NotImportant", "", {},
+            {"other:1", "new_bar:2", "^new_bar"}, {});
+  CheckNode(graph, "new_bar", "Identity", "", {}, {"bar_1:2"},
+            {"foo_1", "foo_1:2", "^foo_1", "foo_2:1", "^foo_2"});
+
+  CheckGraph(graph);
 }
 
 TEST(MutableGraphViewTest, AddAndUpdateFanoutsWithoutSelfLoops) {
   // Actual node.op() is not important in this test.
   GraphDef graph_def =
       test::function::GDef({NDef("bar", "NotImportant", {}, {}),
-                            NDef("foo", "NotImportant", {"bar", "^bar"})},
-                           /* empty function library */ {});
+                            NDef("foo_1", "NotImportant", {"bar", "^bar"}),
+                            NDef("foo_2", "NotImportant", {"^bar"})},
+                           /*funcs=*/{});
 
   MutableGraphView graph(&graph_def);
 
   // `new_bar` reads the output of an original `bar` node.
   NodeDef* new_bar = graph.AddNode(NDef("new_bar", "NewBar", {"bar"}, {}));
-  NodeDef* bar = graph.GetNode("bar");
 
-  graph.UpdateFanouts("bar", new_bar->name());
+  TF_EXPECT_OK(graph.UpdateFanouts("bar", new_bar->name()));
+
+  // Fanins and fanouts must be updated.
+  CheckNode(graph, "bar", "NotImportant", "", {}, {}, {"new_bar"});
+  CheckNode(graph, "foo_1", "NotImportant", "", {}, {"new_bar"}, {});
+  CheckNode(graph, "foo_2", "NotImportant", "", {}, {"^new_bar"}, {});
+  CheckNode(graph, "new_bar", "NewBar", "", {}, {"bar"}, {"foo_1", "^foo_2"});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, UpdateFanoutsToSwitchWithControlFromSwitch) {
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "Switch", {}, {}),
+       NDef("c", "NotImportant", {}, {}), NDef("d", "NotImportant", {}, {}),
+       NDef("e", "NotImportant", {"c", "b", "^a", "^d"})},
+      /*funcs=*/{});
 
-  // Foo node must read from `new_bar`.
-  NodeDef* foo = graph.GetNode("foo");
-  ASSERT_NE(foo, nullptr);
-  ASSERT_EQ(foo->input_size(), 2);
-  EXPECT_EQ(foo->input(0), "new_bar");
-  EXPECT_EQ(foo->input(1), "^new_bar");
+  MutableGraphView graph(&graph_def);
 
-  // And the `new_bar` should read from the original `bar`.
-  ASSERT_EQ(new_bar->input_size(), 1);
-  ASSERT_EQ(new_bar->input(0), "bar");
+  Status s = graph.UpdateFanouts("a", "b");
+  EXPECT_FALSE(s.ok());
+  string expected_msg =
+      "MutableGraphView::UpdateFanouts(from_node_name='a', to_node_name='b') "
+      "error: can't update fanouts to node 'b' as it will become a Switch "
+      "control dependency.";
+  EXPECT_EQ(s.error_message(), expected_msg);
+  s = graph.UpdateFanouts("d", "b");
+  EXPECT_FALSE(s.ok());
+  expected_msg =
+      "MutableGraphView::UpdateFanouts(from_node_name='d', to_node_name='b') "
+      "error: can't update fanouts to node 'b' as it will become a Switch "
+      "control dependency.";
+  EXPECT_EQ(s.error_message(), expected_msg);
 
-  // And fanouts mapping must be also updated for both nodes.
-  bool include_control_fanouts = true;
-  auto bar_fanouts = graph.GetFanouts(*bar, include_control_fanouts);
-  auto new_bar_fanouts = graph.GetFanouts(*new_bar, include_control_fanouts);
+  EXPECT_EQ(graph.graph()->node_size(), 5);
 
-  EXPECT_EQ(bar_fanouts.size(), 1);
-  EXPECT_EQ(bar_fanouts.count(MutableGraphView::InputPort(new_bar, 0)), 1);
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"^e"});
+  CheckNode(graph, "b", "Switch", "", {}, {}, {"e:1"});
+  CheckNode(graph, "c", "NotImportant", "", {}, {}, {"e:0"});
+  CheckNode(graph, "d", "NotImportant", "", {}, {}, {"^e"});
+  CheckNode(graph, "e", "NotImportant", "", {}, {"c", "b", "^a", "^d"}, {});
 
-  EXPECT_EQ(new_bar_fanouts.size(), 2);
-  EXPECT_EQ(new_bar_fanouts.count(MutableGraphView::InputPort(foo, 0)), 1);
-  EXPECT_EQ(new_bar_fanouts.count(MutableGraphView::InputPort(foo, -1)), 1);
+  CheckGraph(graph);
 }
 
-TEST(MutableGraphViewTest, DeleteNodes) {
+TEST(MutableGraphViewTest, UpdateFanoutsToSwitchWithNoControlFromSwitch) {
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "Switch", {}, {}),
+       NDef("c", "NotImportant", {}, {}), NDef("d", "NotImportant", {}, {}),
+       NDef("e", "NotImportant", {"c", "b", "^a", "^d"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.UpdateFanouts("c", "b"));
+
+  EXPECT_EQ(graph.graph()->node_size(), 5);
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"^e"});
+  CheckNode(graph, "b", "Switch", "", {}, {}, {"e:0", "e:1"});
+  CheckNode(graph, "c", "NotImportant", "", {}, {}, {});
+  CheckNode(graph, "d", "NotImportant", "", {}, {}, {"^e"});
+  CheckNode(graph, "e", "NotImportant", "", {}, {"b", "b", "^a", "^d"}, {});
+
+  CheckGraph(graph);
+}
+
+GraphDef SimpleMutateFaninGraph() {
   // Actual node.op() is not important in this test.
   GraphDef graph_def = test::function::GDef(
-      {NDef("bar", "NotImportant", {}, {}),
-       NDef("other", "NotImportant", {}, {}),
-       NDef("foo_1", "NotImportant", {"bar", "other", "bar:1", "^bar"}),
-       NDef("foo_2", "NotImportant", {"other:1", "bar:2", "^bar"})},
-      /* empty function library */ {});
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {}, {}),
+       NDef("c", "NotImportant", {}, {}), NDef("d", "NotImportant", {}, {}),
+       NDef("foo_1", "NotImportant", {"a"}),
+       NDef("foo_2", "NotImportant", {"b", "^a", "^c"}),
+       NDef("foo_3", "NotImportant", {"b", "a:1", "a:1"}),
+       NDef("foo_4", "NotImportant", {"a", "b:2", "b:2", "^c", "^d"}),
+       NDef("foo_5", "NotImportant", {}),
+       NDef("foo_6", "NotImportant", {"^a", "^b"})},
+      /*funcs=*/{});
+  return graph_def;
+}
+
+void TestAddRegularFanin(absl::string_view node_name, bool node_exists,
+                         const TensorId& fanin_to_add, bool success,
+                         const string& error_msg,
+                         absl::Span<const string> expected_fanins) {
+  GraphDef graph_def = SimpleMutateFaninGraph();
 
   MutableGraphView graph(&graph_def);
 
-  EXPECT_NE(graph.GetNode("foo_1"), nullptr);
-  graph.DeleteNodes({"foo_1"});
+  NodeDef* node = graph.GetNode(node_name);
+  if (node_exists) {
+    EXPECT_NE(node, nullptr);
+  } else {
+    EXPECT_EQ(node, nullptr);
+  }
 
-  EXPECT_EQ(graph.GetNode("foo_1"), nullptr);
+  absl::flat_hash_map<string, std::vector<string>> unmodified_node_inputs =
+      GetNodeInputsFromGraph(graph_def, node_name);
+
+  Status s = graph.AddRegularFanin(node_name, fanin_to_add);
+  EXPECT_EQ(s.ok(), success);
+  if (!success) {
+    EXPECT_EQ(s.error_message(), error_msg);
+  }
+  if (node_exists) {
+    CompareNodeFanins(graph, node, expected_fanins);
+  }
+
+  CheckUnmodifiedNodeFanins(graph_def, node_name, unmodified_node_inputs);
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, AddRegularFanin) {
+  string error_msg;
+  // Add input to node with 1 input 0 controls.
+  TestAddRegularFanin("foo_1", /*node_exists=*/true, {"b", 1}, /*success=*/true,
+                      error_msg, {"a", "b:1"});
+  // Add input to node with multiple inputs and 0 controls.
+  TestAddRegularFanin("foo_3", /*node_exists=*/true, {"b", 2}, /*success=*/true,
+                      error_msg, {"b", "a:1", "a:1", "b:2"});
+  // Add input to node with 1 input multiple controls.
+  TestAddRegularFanin("foo_2", /*node_exists=*/true, {"a", 0}, /*success=*/true,
+                      error_msg, {"b", "a", "^c"});
+  // Add input to node with multiple inputs and controls.
+  TestAddRegularFanin("foo_4", /*node_exists=*/true, {"a", 1}, /*success=*/true,
+                      error_msg, {"a", "b:2", "b:2", "a:1", "^d", "^c"});
+  // Add input to node with 0 inputs 0 controls.
+  TestAddRegularFanin("foo_5", /*node_exists=*/true, {"a", 1}, /*success=*/true,
+                      error_msg, {"a:1"});
+  // Add input to node with 0 inputs multiple controls.
+  TestAddRegularFanin("foo_6", /*node_exists=*/true, {"c", 1}, /*success=*/true,
+                      error_msg, {"c:1", "^b", "^a"});
+
+  // Add control to node with 1 input 0 controls.
+  error_msg =
+      "MutableGraphView::AddRegularFanin(node_name='foo_1', fanin='^b') error: "
+      "fanin '^b' must be a regular tensor id.";
+  TestAddRegularFanin("foo_1", /*node_exists=*/true, {"b", Graph::kControlSlot},
+                      /*success=*/false, error_msg, {"a"});
+  // Add control to node with multiple inputs and 0 controls.
+  error_msg =
+      "MutableGraphView::AddRegularFanin(node_name='foo_3', fanin='^c') error: "
+      "fanin '^c' must be a regular tensor id.";
+  TestAddRegularFanin("foo_3", /*node_exists=*/true, {"c", Graph::kControlSlot},
+                      /*success=*/false, error_msg, {"b", "a:1", "a:1"});
+  // Add control to node with 1 input multiple controls.
+  error_msg =
+      "MutableGraphView::AddRegularFanin(node_name='foo_2', fanin='^d') error: "
+      "fanin '^d' must be a regular tensor id.";
+  TestAddRegularFanin("foo_2", /*node_exists=*/true, {"d", Graph::kControlSlot},
+                      /*success=*/false, error_msg, {"b", "^a", "^c"});
+  // Add control to node with multiple input multiple controls.
+  error_msg =
+      "MutableGraphView::AddRegularFanin(node_name='foo_4', fanin='^a') error: "
+      "fanin '^a' must be a regular tensor id.";
+  TestAddRegularFanin("foo_4", /*node_exists=*/true, {"a", Graph::kControlSlot},
+                      /*success=*/false, error_msg,
+                      {"a", "b:2", "b:2", "^c", "^d"});
+  // Add control to node with 0 inputs 0 controls.
+  error_msg =
+      "MutableGraphView::AddRegularFanin(node_name='foo_5', fanin='^a') error: "
+      "fanin '^a' must be a regular tensor id.";
+  TestAddRegularFanin("foo_5", /*node_exists=*/true, {"a", Graph::kControlSlot},
+                      /*success=*/false, error_msg, {});
+  // Add control to node with 0 inputs multiple controls.
+  error_msg =
+      "MutableGraphView::AddRegularFanin(node_name='foo_6', fanin='^c') error: "
+      "fanin '^c' must be a regular tensor id.";
+  TestAddRegularFanin("foo_6", /*node_exists=*/true, {"c", Graph::kControlSlot},
+                      /*success=*/false, error_msg, {"^a", "^b"});
+  // Add control to node with control that already exists.
+  error_msg =
+      "MutableGraphView::AddRegularFanin(node_name='foo_2', fanin='^a') error: "
+      "fanin '^a' must be a regular tensor id.";
+  TestAddRegularFanin("foo_2", /*node_exists=*/true, {"a", Graph::kControlSlot},
+                      /*success=*/false, error_msg, {"b", "^a", "^c"});
+
+  // Add fanin to node where node is missing.
+  error_msg =
+      "MutableGraphView::AddRegularFanin(node_name='foo_missing', fanin='a:0') "
+      "error: node 'foo_missing' was not found.";
+  TestAddRegularFanin("foo_missing", /*node_exists=*/false, {"a", 0},
+                      /*success=*/false, error_msg, {});
+  // Add fanin to node where fanin is missing.
+  error_msg =
+      "MutableGraphView::AddRegularFanin(node_name='foo_1', "
+      "fanin='bar_missing:0') error: node 'bar_missing' was not found.";
+  TestAddRegularFanin("foo_1", /*node_exists=*/true, {"bar_missing", 0},
+                      /*success=*/false, error_msg, {"a"});
+  // Add fanin to node where node and fanin are missing.
+  error_msg =
+      "MutableGraphView::AddRegularFanin(node_name='foo_missing', "
+      "fanin='bar_missing:0') error: node 'foo_missing' was not found.";
+  TestAddRegularFanin("foo_missing", /*node_exists=*/false, {"bar_missing", 0},
+                      /*success=*/false, error_msg, {});
+  // Add control fanin to node where node and fanin are missing.
+  error_msg =
+      "MutableGraphView::AddRegularFanin(node_name='foo_missing', "
+      "fanin='^bar_missing') error: fanin '^bar_missing' must be a regular "
+      "tensor id.";
+  TestAddRegularFanin("foo_missing", /*node_exists=*/false,
+                      {"bar_missing", Graph::kControlSlot},
+                      /*success=*/false, error_msg, {});
+
+  // Add self to create cycle.
+  error_msg =
+      "MutableGraphView::AddRegularFanin(node_name='foo_6', fanin='foo_6:2') "
+      "error: can't add fanin 'foo_6:2' to self.";
+  TestAddRegularFanin("foo_6", /*node_exists=*/true, {"foo_6", 2},
+                      /*success=*/false, error_msg, {"^a", "^b"});
+}
+
+void TestAddRegularFaninByPort(absl::string_view node_name, bool node_exists,
+                               int port, const TensorId& fanin_to_add,
+                               bool success, const string& error_msg,
+                               absl::Span<const string> expected_fanins) {
+  GraphDef graph_def = SimpleMutateFaninGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  NodeDef* node = graph.GetNode(node_name);
+  if (node_exists) {
+    EXPECT_NE(node, nullptr);
+  } else {
+    EXPECT_EQ(node, nullptr);
+  }
+
+  absl::flat_hash_map<string, std::vector<string>> unmodified_node_inputs =
+      GetNodeInputsFromGraph(graph_def, node_name);
+
+  Status s = graph.AddRegularFaninByPort(node_name, port, fanin_to_add);
+  EXPECT_EQ(s.ok(), success);
+  if (!success) {
+    EXPECT_EQ(s.error_message(), error_msg);
+  }
+  if (node_exists) {
+    CompareNodeFanins(graph, node, expected_fanins);
+  }
+
+  CheckUnmodifiedNodeFanins(graph_def, node_name, unmodified_node_inputs);
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, AddRegularFaninByPort) {
+  string error_msg;
+  // Add input at start to node with some inputs and no controls.
+  TestAddRegularFaninByPort("foo_3", /*node_exists=*/true, /*port=*/0, {"d", 2},
+                            /*success=*/true, error_msg,
+                            {"d:2", "b", "a:1", "a:1"});
+  // Add input at end to node with some inputs and no controls.
+  TestAddRegularFaninByPort("foo_3", /*node_exists=*/true, /*port=*/3, {"d", 2},
+                            /*success=*/true, error_msg,
+                            {"b", "a:1", "a:1", "d:2"});
+  // Add input in middle to node with some inputs and no controls.
+  TestAddRegularFaninByPort("foo_3", /*node_exists=*/true, /*port=*/2, {"d", 2},
+                            /*success=*/true, error_msg,
+                            {"b", "a:1", "d:2", "a:1"});
+  // Add input at start to node with some inputs and some controls.
+  TestAddRegularFaninByPort("foo_2", /*node_exists=*/true, /*port=*/0, {"d", 2},
+                            /*success=*/true, error_msg,
+                            {"d:2", "b", "^c", "^a"});
+  // Add input at end to node with some inputs and some controls.
+  TestAddRegularFaninByPort("foo_2", /*node_exists=*/true, /*port=*/1, {"d", 2},
+                            /*success=*/true, error_msg,
+                            {"b", "d:2", "^c", "^a"});
+  // Add input in middle to node with some inputs and some controls, and dedup
+  // controls.
+  TestAddRegularFaninByPort("foo_4", /*node_exists=*/true, /*port=*/2, {"d", 2},
+                            /*success=*/true, error_msg,
+                            {"a", "b:2", "d:2", "b:2", "^c"});
+  // Add input to node with no inputs and no controls.
+  TestAddRegularFaninByPort("foo_5", /*node_exists=*/true, /*port=*/0, {"d", 2},
+                            /*success=*/true, error_msg, {"d:2"});
+  // Add input to node with no inputs and some controls.
+  TestAddRegularFaninByPort("foo_6", /*node_exists=*/true, /*port=*/0, {"d", 2},
+                            /*success=*/true, error_msg, {"d:2", "^b", "^a"});
+  // Add fanin should dedup control.
+  TestAddRegularFaninByPort("foo_6", /*node_exists=*/true, /*port=*/0, {"b", 2},
+                            /*success=*/true, error_msg, {"b:2", "^a"});
+
+  // Add controlling fanin.
+  error_msg =
+      "MutableGraphView::AddRegularFaninByPort(node_name='foo_4', port=2, "
+      "fanin='^d') error: fanin '^d' must be a regular tensor id.";
+  TestAddRegularFaninByPort(
+      "foo_4", /*node_exists=*/true, /*port=*/2, {"d", Graph::kControlSlot},
+      /*success=*/false, error_msg, {"a", "b:2", "b:2", "^c", "^d"});
+
+  // Add fanin at out of bounds port.
+  error_msg =
+      "MutableGraphView::AddRegularFaninByPort(node_name='foo_5', port=-1, "
+      "fanin='d:2') error: port must be in range [0, 0].";
+  TestAddRegularFaninByPort("foo_5", /*node_exists=*/true, /*port=*/-1,
+                            {"d", 2},
+                            /*success=*/false, error_msg, {});
+  error_msg =
+      "MutableGraphView::AddRegularFaninByPort(node_name='foo_5', port=1, "
+      "fanin='d:2') error: port must be in range [0, 0].";
+  TestAddRegularFaninByPort("foo_5", /*node_exists=*/true, /*port=*/1, {"d", 2},
+                            /*success=*/false, error_msg, {});
+  error_msg =
+      "MutableGraphView::AddRegularFaninByPort(node_name='foo_6', port=-1, "
+      "fanin='d:2') error: port must be in range [0, 0].";
+  TestAddRegularFaninByPort("foo_6", /*node_exists=*/true, /*port=*/-1,
+                            {"d", 2},
+                            /*success=*/false, error_msg, {"^a", "^b"});
+  error_msg =
+      "MutableGraphView::AddRegularFaninByPort(node_name='foo_6', port=1, "
+      "fanin='d:2') error: port must be in range [0, 0].";
+  TestAddRegularFaninByPort("foo_6", /*node_exists=*/true, /*port=*/1, {"d", 2},
+                            /*success=*/false, error_msg, {"^a", "^b"});
+  error_msg =
+      "MutableGraphView::AddRegularFaninByPort(node_name='foo_4', port=-1, "
+      "fanin='d:2') error: port must be in range [0, 3].";
+  TestAddRegularFaninByPort(
+      "foo_4", /*node_exists=*/true, /*port=*/-1, {"d", 2},
+      /*success=*/false, error_msg, {"a", "b:2", "b:2", "^c", "^d"});
+  error_msg =
+      "MutableGraphView::AddRegularFaninByPort(node_name='foo_4', port=4, "
+      "fanin='d:2') error: port must be in range [0, 3].";
+  TestAddRegularFaninByPort("foo_4", /*node_exists=*/true, /*port=*/4, {"d", 2},
+                            /*success=*/false, error_msg,
+                            {"a", "b:2", "b:2", "^c", "^d"});
+
+  // Add fanin to node where node is missing.
+  error_msg =
+      "MutableGraphView::AddRegularFaninByPort(node_name='foo_missing', "
+      "port=0, fanin='a:0') error: node 'foo_missing' was not found.";
+  TestAddRegularFaninByPort("foo_missing", /*node_exists=*/false, /*port=*/0,
+                            {"a", 0},
+                            /*success=*/false, error_msg, {});
+  // Add fanin to node where fanin is missing.
+  error_msg =
+      "MutableGraphView::AddRegularFaninByPort(node_name='foo_1', port=0, "
+      "fanin='bar_missing:0') error: node 'bar_missing' was not found.";
+  TestAddRegularFaninByPort("foo_1", /*node_exists=*/true, /*port=*/0,
+                            {"bar_missing", 0},
+                            /*success=*/false, error_msg, {"a"});
+  // Add fanin to node where node and fanin are missing.
+  error_msg =
+      "MutableGraphView::AddRegularFaninByPort(node_name='foo_missing', "
+      "port=0, fanin='bar_missing:0') error: node 'foo_missing' was not found.";
+  TestAddRegularFaninByPort("foo_missing", /*node_exists=*/false, /*port=*/0,
+                            {"bar_missing", 0},
+                            /*success=*/false, error_msg, {});
+
+  // Add self to create cycle.
+  error_msg =
+      "MutableGraphView::AddRegularFaninByPort(node_name='foo_6', port=0, "
+      "fanin='foo_6:2') error: can't add fanin 'foo_6:2' to self.";
+  TestAddRegularFaninByPort("foo_6", /*node_exists=*/true, /*port=*/0,
+                            {"foo_6", 2},
+                            /*success=*/false, error_msg, {"^a", "^b"});
+}
+
+void CheckFanoutRemoved(const MutableGraphView& graph, const TensorId& fanin,
+                        absl::string_view node_name) {
+  MutableGraphView::OutputPort output_port =
+      graph.GetOutputPort(fanin.node(), fanin.index());
+  auto fanouts = graph.GetFanout(output_port);
+  for (auto fanout : fanouts) {
+    EXPECT_NE(fanout.node->name(), fanin.node());
+  }
+}
+
+void TestRemoveRegularFanin(absl::string_view node_name, bool node_exists,
+                            const TensorId& fanin_to_remove, bool success,
+                            const string& error_msg,
+                            absl::Span<const string> expected_fanins) {
+  GraphDef graph_def = SimpleMutateFaninGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  NodeDef* node = graph.GetNode(node_name);
+  if (node_exists) {
+    EXPECT_NE(nullptr, node);
+  } else {
+    EXPECT_EQ(nullptr, node);
+  }
+
+  absl::flat_hash_map<string, std::vector<string>> unmodified_node_inputs =
+      GetNodeInputsFromGraph(graph_def, node_name);
+
+  Status s = graph.RemoveRegularFanin(node_name, fanin_to_remove);
+  EXPECT_EQ(s.ok(), success);
+  if (!success) {
+    EXPECT_EQ(s.error_message(), error_msg);
+  }
+  if (node_exists) {
+    CompareNodeFanins(graph, node, expected_fanins);
+    if (success) {
+      CheckFanoutRemoved(graph, fanin_to_remove, node_name);
+    }
+  }
+
+  CheckUnmodifiedNodeFanins(graph_def, node_name, unmodified_node_inputs);
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, RemoveRegularFanin) {
+  string error_msg;
+  // Remove input from node with 1 input 0 controls.
+  TestRemoveRegularFanin("foo_1", /*node_exists=*/true, {"a", 0},
+                         /*success=*/true, error_msg, {});
+  // Remove input from node with multiple inputs and 0 controls.
+  TestRemoveRegularFanin("foo_3", /*node_exists=*/true, {"a", 1},
+                         /*success=*/true, error_msg, {"b"});
+  // Remove input from node with 1 input multiple controls.
+  TestRemoveRegularFanin("foo_2", /*node_exists=*/true, {"b", 0},
+                         /*success=*/true, error_msg, {"^a", "^c"});
+  // Remove input from node with multiple inputs and controls.
+  TestRemoveRegularFanin("foo_4", /*node_exists=*/true, {"b", 2},
+                         /*success=*/true, error_msg, {"a", "^c", "^d"});
+  // Remove input from node with multiple inputs and controls, and results in
+  // shifting of ports.
+  TestRemoveRegularFanin("foo_4", /*node_exists=*/true, {"a", 0},
+                         /*success=*/true, error_msg,
+                         {"b:2", "b:2", "^c", "^d"});
+
+  // Remove control from node with 1 input multiple controls.
+  error_msg =
+      "MutableGraphView::RemoveRegularFanin(node_name='foo_2', fanin='^a') "
+      "error: fanin '^a' must be a regular tensor id.";
+  TestRemoveRegularFanin("foo_2", /*node_exists=*/true,
+                         {"a", Graph::kControlSlot},
+                         /*success=*/false, error_msg, {"b", "^a", "^c"});
+  // Remove control from node with multiple input multiple controls.
+  error_msg =
+      "MutableGraphView::RemoveRegularFanin(node_name='foo_4', fanin='^d') "
+      "error: fanin '^d' must be a regular tensor id.";
+  TestRemoveRegularFanin(
+      "foo_4", /*node_exists=*/true, {"d", Graph::kControlSlot},
+      /*success=*/false, error_msg, {"a", "b:2", "b:2", "^c", "^d"});
+  // Remove control from node with 0 inputs multiple controls.
+  error_msg =
+      "MutableGraphView::RemoveRegularFanin(node_name='foo_6', fanin='^a') "
+      "error: fanin '^a' must be a regular tensor id.";
+  TestRemoveRegularFanin("foo_6", /*node_exists=*/true,
+                         {"a", Graph::kControlSlot},
+                         /*success=*/false, error_msg, {"^a", "^b"});
+
+  // Remove input from node with 0 inputs 0 controls.
+  error_msg = "";
+  TestRemoveRegularFanin("foo_5", /*node_exists=*/true, {"a", 1},
+                         /*success=*/true, error_msg, {});
+  // Remove input from node with 0 inputs multiple controls.
+  TestRemoveRegularFanin("foo_6", /*node_exists=*/true, {"a", 1},
+                         /*success=*/true, error_msg, {"^a", "^b"});
+
+  // Remove control from node with 1 input 0 controls.
+  error_msg =
+      "MutableGraphView::RemoveRegularFanin(node_name='foo_1', fanin='^b') "
+      "error: fanin '^b' must be a regular tensor id.";
+  TestRemoveRegularFanin("foo_1", /*node_exists=*/true,
+                         {"b", Graph::kControlSlot},
+                         /*success=*/false, error_msg, {"a"});
+  // Remove control from node with multiple inputs and 0 controls.
+  error_msg =
+      "MutableGraphView::RemoveRegularFanin(node_name='foo_3', fanin='^c') "
+      "error: fanin '^c' must be a regular tensor id.";
+  TestRemoveRegularFanin("foo_3", /*node_exists=*/true,
+                         {"c", Graph::kControlSlot},
+                         /*success=*/false, error_msg, {"b", "a:1", "a:1"});
+  // Remove control from node with 0 inputs 0 controls.
+  error_msg =
+      "MutableGraphView::RemoveRegularFanin(node_name='foo_5', fanin='^a') "
+      "error: fanin '^a' must be a regular tensor id.";
+  TestRemoveRegularFanin("foo_5", /*node_exists=*/true,
+                         {"a", Graph::kControlSlot},
+                         /*success=*/false, error_msg, {});
+
+  // Remove fanin from node where node is missing.
+  error_msg =
+      "MutableGraphView::RemoveRegularFanin(node_name='foo_missing', "
+      "fanin='a:0') error: node 'foo_missing' was not found.";
+  TestRemoveRegularFanin("foo_missing", /*node_exists=*/false, {"a", 0},
+                         /*success=*/false, error_msg, {});
+  // Remove fanin from node where fanin is missing.
+  error_msg =
+      "MutableGraphView::RemoveRegularFanin(node_name='foo_1', "
+      "fanin='bar_missing:0') error: node 'bar_missing' was not found.";
+  TestRemoveRegularFanin("foo_1", /*node_exists=*/true, {"bar_missing", 0},
+                         /*success=*/false, error_msg, {"a"});
+  // Remove fanin from node where node and fanin are missing.
+  error_msg =
+      "MutableGraphView::RemoveRegularFanin(node_name='foo_missing', "
+      "fanin='bar_missing:0') error: node 'foo_missing' was not found.";
+  TestRemoveRegularFanin("foo_missing", /*node_exists=*/false,
+                         {"bar_missing", 0}, /*success=*/false, error_msg, {});
+  // Remove control from node where node and fanin are missing.
+  error_msg =
+      "MutableGraphView::RemoveRegularFanin(node_name='foo_missing', "
+      "fanin='^bar_missing') error: fanin '^bar_missing' must be a regular "
+      "tensor id.";
+  TestRemoveRegularFanin("foo_missing", /*node_exists=*/false,
+                         {"bar_missing", Graph::kControlSlot},
+                         /*success=*/false, error_msg, {});
+
+  // Remove self.
+  error_msg =
+      "MutableGraphView::RemoveRegularFanin(node_name='foo_6', "
+      "fanin='foo_6:2') error: can't remove fanin 'foo_6:2' from self.";
+  TestRemoveRegularFanin("foo_6", /*node_exists=*/true, {"foo_6", 2},
+                         /*success=*/false, error_msg, {"^a", "^b"});
+}
+
+void TestRemoveRegularFaninByPort(absl::string_view node_name, bool node_exists,
+                                  int port, bool success,
+                                  const string& error_msg,
+                                  absl::Span<const string> expected_fanins) {
+  GraphDef graph_def = SimpleMutateFaninGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  NodeDef* node = graph.GetNode(node_name);
+  if (node_exists) {
+    EXPECT_NE(nullptr, node);
+  } else {
+    EXPECT_EQ(nullptr, node);
+  }
+
+  absl::flat_hash_map<string, std::vector<string>> unmodified_node_inputs =
+      GetNodeInputsFromGraph(graph_def, node_name);
+
+  Status s = graph.RemoveRegularFaninByPort(node_name, port);
+  EXPECT_EQ(s.ok(), success);
+  if (!success) {
+    EXPECT_EQ(s.error_message(), error_msg);
+  }
+  if (node_exists) {
+    CompareNodeFanins(graph, node, expected_fanins);
+  }
+
+  CheckUnmodifiedNodeFanins(graph_def, node_name, unmodified_node_inputs);
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, RemoveRegularFaninByPort) {
+  string error_msg;
+  // Remove input at start of node with some inputs and no controls.
+  TestRemoveRegularFaninByPort("foo_3", /*node_exists=*/true, /*port=*/0,
+                               /*success=*/true, error_msg, {"a:1", "a:1"});
+  // Remove input at end of node with some inputs and no controls.
+  TestRemoveRegularFaninByPort("foo_3", /*node_exists=*/true, /*port=*/2,
+                               /*success=*/true, error_msg, {"b", "a:1"});
+  // Remove input in middle of node with some inputs and no controls.
+  TestRemoveRegularFaninByPort("foo_3", /*node_exists=*/true, /*port=*/1,
+                               /*success=*/true, error_msg, {"b", "a:1"});
+  // Remove input at start of node with some inputs and some controls.
+  TestRemoveRegularFaninByPort("foo_4", /*node_exists=*/true, /*port=*/0,
+                               /*success=*/true, error_msg,
+                               {"b:2", "b:2", "^d", "^c"});
+  // Remove input at end of node with some inputs and some controls.
+  TestRemoveRegularFaninByPort("foo_4", /*node_exists=*/true, /*port=*/2,
+                               /*success=*/true, error_msg,
+                               {"a", "b:2", "^d", "^c"});
+  // Remove input in middle of node with some inputs and some controls.
+  TestRemoveRegularFaninByPort("foo_4", /*node_exists=*/true, /*port=*/1,
+                               /*success=*/true, error_msg,
+                               {"a", "b:2", "^d", "^c"});
+
+  // Remove input from node with no inputs and no controls.
+  error_msg =
+      "MutableGraphView::RemoveRegularFaninByPort(node_name='foo_5', port=0) "
+      "error: no available ports as node has no regular fanins.";
+  TestRemoveRegularFaninByPort("foo_5", /*node_exists=*/true, /*port=*/0,
+                               /*success=*/false, error_msg, {});
+  // Remove input from node with no inputs and some controls.
+  error_msg =
+      "MutableGraphView::RemoveRegularFaninByPort(node_name='foo_6', port=1) "
+      "error: no available ports as node has no regular fanins.";
+  TestRemoveRegularFaninByPort("foo_6", /*node_exists=*/true, /*port=*/1,
+                               /*success=*/false, error_msg, {"^a", "^b"});
+
+  // Remove fanin at out of bounds port.
+  error_msg =
+      "MutableGraphView::RemoveRegularFaninByPort(node_name='foo_3', port=-1) "
+      "error: port must be in range [0, 2].";
+  TestRemoveRegularFaninByPort("foo_3", /*node_exists=*/true, /*port=*/-1,
+                               /*success=*/false, error_msg,
+                               {"b", "a:1", "a:1"});
+  error_msg =
+      "MutableGraphView::RemoveRegularFaninByPort(node_name='foo_3', port=3) "
+      "error: port must be in range [0, 2].";
+  TestRemoveRegularFaninByPort("foo_3", /*node_exists=*/true, /*port=*/3,
+                               /*success=*/false, error_msg,
+                               {"b", "a:1", "a:1"});
+  error_msg =
+      "MutableGraphView::RemoveRegularFaninByPort(node_name='foo_4', port=-1) "
+      "error: port must be in range [0, 2].";
+  TestRemoveRegularFaninByPort("foo_4", /*node_exists=*/true, /*port=*/-1,
+                               /*success=*/false, error_msg,
+                               {"a", "b:2", "b:2", "^c", "^d"});
+  error_msg =
+      "MutableGraphView::RemoveRegularFaninByPort(node_name='foo_4', port=3) "
+      "error: port must be in range [0, 2].";
+  TestRemoveRegularFaninByPort("foo_4", /*node_exists=*/true, /*port=*/3,
+                               /*success=*/false, error_msg,
+                               {"a", "b:2", "b:2", "^c", "^d"});
+
+  // Remove fanin from node where node is missing.
+  error_msg =
+      "MutableGraphView::RemoveRegularFaninByPort(node_name='foo_missing', "
+      "port=0) error: node 'foo_missing' was not found.";
+  TestRemoveRegularFaninByPort("foo_missing", /*node_exists=*/false, /*port=*/0,
+                               /*success=*/false, error_msg, {});
+}
+
+void TestRemoveAllFanins(absl::string_view node_name, bool node_exists,
+                         bool keep_controlling_nodes, bool success,
+                         const string& error_msg,
+                         absl::Span<const string> expected_fanins) {
+  GraphDef graph_def = SimpleMutateFaninGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  NodeDef* node = graph.GetNode(node_name);
+  absl::flat_hash_set<string> fanin_strings;
+  if (node_exists) {
+    EXPECT_NE(node, nullptr);
+    fanin_strings.insert(node->input().begin(), node->input().end());
+  } else {
+    EXPECT_EQ(node, nullptr);
+  }
+
+  absl::flat_hash_map<string, std::vector<string>> unmodified_node_inputs =
+      GetNodeInputsFromGraph(graph_def, node_name);
+
+  Status s = graph.RemoveAllFanins(node_name, keep_controlling_nodes);
+  EXPECT_EQ(s.ok(), success);
+  if (!success) {
+    EXPECT_EQ(s.error_message(), error_msg);
+  }
+  if (node_exists) {
+    CompareNodeFanins(graph, node, expected_fanins);
+    if (success) {
+      TensorId tensor_id;
+      auto retained_inputs = absl::flat_hash_set<string>(node->input().begin(),
+                                                         node->input().end());
+      for (const string& fanin : fanin_strings) {
+        if (!retained_inputs.contains(fanin)) {
+          tensor_id = ParseTensorName(fanin);
+          CheckFanoutRemoved(graph, tensor_id, node_name);
+        }
+      }
+    }
+  }
+
+  CheckUnmodifiedNodeFanins(graph_def, node_name, unmodified_node_inputs);
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, RemoveAllFanins) {
+  string error_msg;
+  // Remove all fanins from node with no control dependencies.
+  TestRemoveAllFanins("foo_3", /*node_exists=*/true,
+                      /*keep_controlling_nodes=*/false,
+                      /*success=*/true, error_msg, {});
+  // Remove all fanins from node with control dependencies.
+  TestRemoveAllFanins("foo_4", /*node_exists=*/true,
+                      /*keep_controlling_nodes=*/false,
+                      /*success=*/true, error_msg, {});
+
+  // Remove all fanins from node with no control dependencies and preserve
+  // control dependencies.
+  TestRemoveAllFanins("foo_3", /*node_exists=*/true,
+                      /*keep_controlling_nodes=*/true,
+                      /*success=*/true, error_msg, {});
+  // Remove all fanins from node with control dependencies and preserve control
+  // dependencies.
+  TestRemoveAllFanins("foo_4", /*node_exists=*/true,
+                      /*keep_controlling_nodes=*/true,
+                      /*success=*/true, error_msg, {"^c", "^d"});
+
+  // Remove all fanins from node with no fanins.
+  TestRemoveAllFanins("foo_5", /*node_exists=*/true,
+                      /*keep_controlling_nodes=*/false,
+                      /*success=*/true, error_msg, {});
+  TestRemoveAllFanins("foo_5", /*node_exists=*/true,
+                      /*keep_controlling_nodes=*/true,
+                      /*success=*/true, error_msg, {});
+
+  // Remove all fanins from node with only control dependencies.
+  TestRemoveAllFanins("foo_6", /*node_exists=*/true,
+                      /*keep_controlling_nodes=*/false,
+                      /*success=*/true, error_msg, {});
+  TestRemoveAllFanins("foo_6", /*node_exists=*/true,
+                      /*keep_controlling_nodes=*/true,
+                      /*success=*/true, error_msg, {"^a", "^b"});
+
+  // Remove all fanins from node where node is missing.
+  error_msg =
+      "MutableGraphView::RemoveAllFanins(node_name='foo_missing', "
+      "keep_controlling_fanins=false) error: node 'foo_missing' was not found.";
+  TestRemoveAllFanins("foo_missing", /*node_exists=*/false,
+                      /*keep_controlling_nodes=*/false,
+                      /*success=*/false, error_msg, {});
+  error_msg =
+      "MutableGraphView::RemoveAllFanins(node_name='foo_missing', "
+      "keep_controlling_fanins=true) error: node 'foo_missing' was not found.";
+  TestRemoveAllFanins("foo_missing", /*node_exists=*/false,
+                      /*keep_controlling_nodes=*/true,
+                      /*success=*/false, error_msg, {});
+}
+
+void TestUpdateFanin(absl::string_view node_name, bool node_exists,
+                     const TensorId& from_fanin, const TensorId& to_fanin,
+                     bool success, const string& error_msg,
+                     absl::Span<const string> expected_fanins) {
+  GraphDef graph_def = SimpleMutateFaninGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  NodeDef* node = graph.GetNode(node_name);
+  if (node_exists) {
+    EXPECT_NE(node, nullptr);
+  } else {
+    EXPECT_EQ(node, nullptr);
+  }
+
+  absl::flat_hash_map<string, std::vector<string>> unmodified_node_inputs =
+      GetNodeInputsFromGraph(graph_def, node_name);
+
+  Status s = graph.UpdateFanin(node_name, from_fanin, to_fanin);
+  EXPECT_EQ(s.ok(), success);
+  if (!success) {
+    EXPECT_EQ(s.error_message(), error_msg);
+  }
+  if (node_exists) {
+    CompareNodeFanins(graph, node, expected_fanins);
+    if (success) {
+      CheckFanoutRemoved(graph, from_fanin, node_name);
+    }
+  }
+
+  CheckUnmodifiedNodeFanins(graph_def, node_name, unmodified_node_inputs);
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, UpdateFanin) {
+  string error_msg;
+  // Update fanin from non control to non control.
+  TestUpdateFanin("foo_4", /*node_exists=*/true, {"b", 2}, {"b", 3},
+                  /*success=*/true, error_msg, {"a", "b:3", "b:3", "^c", "^d"});
+  // Update fanin from non control to control.
+  TestUpdateFanin("foo_4", /*node_exists=*/true, {"b", 2},
+                  {"b", Graph::kControlSlot},
+                  /*success=*/true, error_msg, {"a", "^c", "^d", "^b"});
+  // Update fanin from control to non control.
+  TestUpdateFanin(
+      "foo_4", /*node_exists=*/true, {"d", Graph::kControlSlot}, {"d", 1},
+      /*success=*/true, error_msg, {"a", "b:2", "b:2", "d:1", "^c"});
+  // Update fanin from control to control.
+  TestUpdateFanin("foo_4", /*node_exists=*/true, {"c", Graph::kControlSlot},
+                  {"b", Graph::kControlSlot}, /*success=*/true, error_msg,
+                  {"a", "b:2", "b:2", "^d"});
+  // Update fanin from control to existing control.
+  TestUpdateFanin("foo_4", /*node_exists=*/true, {"c", Graph::kControlSlot},
+                  {"d", Graph::kControlSlot}, /*success=*/true, error_msg,
+                  {"a", "b:2", "b:2", "^d"});
+
+  // Update fanin of node where from and to fanins are the same.
+  TestUpdateFanin("foo_1", /*node_exists=*/true, {"a", -1}, {"a", -1},
+                  /*success=*/true, error_msg, {"a"});
+  TestUpdateFanin("foo_1", /*node_exists=*/true, {"a", 0}, {"a", 0},
+                  /*success=*/true, error_msg, {"a"});
+  TestUpdateFanin("foo_1", /*node_exists=*/true, {"a", 1}, {"a", 1},
+                  /*success=*/true, error_msg, {"a"});
+
+  // Update fanin of node where node is missing.
+  error_msg =
+      "MutableGraphView::UpdateFanin(node_name='foo_missing', "
+      "from_fanin='a:0', to_fanin='a:1') error: node 'foo_missing' was not "
+      "found.";
+  TestUpdateFanin("foo_missing", /*node_exists=*/false, {"a", 0}, {"a", 1},
+                  /*success=*/false, error_msg, {});
+  // Update fanin of node where from fanin is missing.
+  error_msg =
+      "MutableGraphView::UpdateFanin(node_name='foo_1', "
+      "from_fanin='from_bar_missing:0', to_fanin='a:1') error: node "
+      "'from_bar_missing' was not found.";
+  TestUpdateFanin("foo_1", /*node_exists=*/true, {"from_bar_missing", 0},
+                  {"a", 1},
+                  /*success=*/false, error_msg, {"a"});
+  // Update fanin of node where to fanin is missing.
+  error_msg =
+      "MutableGraphView::UpdateFanin(node_name='foo_1', from_fanin='a:0', "
+      "to_fanin='to_bar_missing:1') error: node 'to_bar_missing' was not "
+      "found.";
+  TestUpdateFanin("foo_1", /*node_exists=*/true, {"a", 0},
+                  {"to_bar_missing", 1}, /*success=*/false, error_msg, {"a"});
+  // Update fanin of node where from/to fanins and node are missing.
+  error_msg =
+      "MutableGraphView::UpdateFanin(node_name='foo_missing', "
+      "from_fanin='from_bar_missing:0', to_fanin='to_bar_missing:1') error: "
+      "node 'foo_missing' was not found.";
+  TestUpdateFanin("foo_missing", /*node_exists=*/false, {"from_bar_missing", 0},
+                  {"to_bar_missing", 1},
+                  /*success=*/false, error_msg, {});
+  // Update fanin of node where from fanin is invalid.
+  error_msg =
+      "MutableGraphView::UpdateFanin(node_name='foo_1', from_fanin='a:-2', "
+      "to_fanin='a:0') error: fanin 'a:-2' must be a valid tensor id.";
+  TestUpdateFanin("foo_1", /*node_exists=*/true, {"a", -2}, {"a", 0},
+                  /*success=*/false, error_msg, {"a"});
+  // Update fanin of node where to fanin is invalid.
+  error_msg =
+      "MutableGraphView::UpdateFanin(node_name='foo_1', from_fanin='a:0', "
+      "to_fanin='a:-2') error: fanin 'a:-2' must be a valid tensor id.";
+  TestUpdateFanin("foo_1", /*node_exists=*/true, {"a", 0}, {"a", -2},
+                  /*success=*/false, error_msg, {"a"});
+  // Update fanin of node where from/to fanins are invalid and missing and node
+  // is missing.
+  error_msg =
+      "MutableGraphView::UpdateFanin(node_name='foo_missing', "
+      "from_fanin='from_bar_missing:-2', to_fanin='to_bar_missing:-3') error: "
+      "fanin 'from_bar_missing:-2' must be a valid tensor id.";
+  TestUpdateFanin("foo_missing", /*node_exists=*/false,
+                  {"from_bar_missing", -2}, {"to_bar_missing", -3},
+                  /*success=*/false, error_msg, {});
+
+  // Update to self to create cycle.
+  error_msg =
+      "MutableGraphView::UpdateFanin(node_name='foo_4', from_fanin='b:2', "
+      "to_fanin='foo_4:3') error: can't update fanin to or from self.";
+  TestUpdateFanin("foo_4", /*node_exists=*/true, {"b", 2}, {"foo_4", 3},
+                  /*success=*/false, error_msg,
+                  {"a", "b:2", "b:2", "^c", "^d"});
+  error_msg =
+      "MutableGraphView::UpdateFanin(node_name='foo_4', from_fanin='b:2', "
+      "to_fanin='^foo_4') error: can't update fanin to or from self.";
+  TestUpdateFanin(
+      "foo_4", /*node_exists=*/true, {"b", 2}, {"foo_4", Graph::kControlSlot},
+      /*success=*/false, error_msg, {"a", "b:2", "b:2", "^c", "^d"});
+  error_msg =
+      "MutableGraphView::UpdateFanin(node_name='foo_4', from_fanin='^c', "
+      "to_fanin='foo_4:4') error: can't update fanin to or from self.";
+  TestUpdateFanin(
+      "foo_4", /*node_exists=*/true, {"c", Graph::kControlSlot}, {"foo_4", 4},
+      /*success=*/false, error_msg, {"a", "b:2", "b:2", "^c", "^d"});
+  error_msg =
+      "MutableGraphView::UpdateFanin(node_name='foo_4', from_fanin='^c', "
+      "to_fanin='^foo_4') error: can't update fanin to or from self.";
+  TestUpdateFanin("foo_4", /*node_exists=*/true, {"c", Graph::kControlSlot},
+                  {"foo_4", Graph::kControlSlot}, /*success=*/false, error_msg,
+                  {"a", "b:2", "b:2", "^c", "^d"});
+}
+
+void TestUpdateFaninFromFaninToNodeAsSwitchControl(const TensorId& fanin) {
+  string tensor_id_str = TensorIdToString(fanin);
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "Switch", {}, {}),
+       NDef("c", "NotImportant", {tensor_id_str})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  Status s = graph.UpdateFanin("c", fanin, {"b", Graph::kControlSlot});
+  EXPECT_FALSE(s.ok());
+  string expected_msg = absl::Substitute(
+      "MutableGraphView::UpdateFanin(node_name='c', from_fanin='$0', "
+      "to_fanin='^b') error: can't update to fanin '^b' as it will become a "
+      "Switch control dependency.",
+      fanin.ToString());
+  EXPECT_EQ(s.error_message(), expected_msg);
+
+  EXPECT_EQ(graph.graph()->node_size(), 3);
+
+  string fanout = IsControlInput(fanin) ? AsControlDependency("c") : "c";
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {fanout});
+  CheckNode(graph, "b", "Switch", "", {}, {}, {});
+  CheckNode(graph, "c", "NotImportant", "", {}, {tensor_id_str}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, UpdateFaninToNodeAsSwitchControl) {
+  TestUpdateFaninFromFaninToNodeAsSwitchControl({"a", 0});
+  TestUpdateFaninFromFaninToNodeAsSwitchControl({"a", 1});
+  TestUpdateFaninFromFaninToNodeAsSwitchControl({"a", Graph::kControlSlot});
+}
+
+void TestUpdateRegularFaninByPort(absl::string_view node_name, bool node_exists,
+                                  int port, const TensorId& fanin, bool success,
+                                  const string& error_msg,
+                                  absl::Span<const string> expected_fanins) {
+  GraphDef graph_def = SimpleMutateFaninGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  NodeDef* node = graph.GetNode(node_name);
+  if (node_exists) {
+    EXPECT_NE(node, nullptr);
+  } else {
+    EXPECT_EQ(node, nullptr);
+  }
+
+  absl::flat_hash_map<string, std::vector<string>> unmodified_node_inputs =
+      GetNodeInputsFromGraph(graph_def, node_name);
+
+  Status s = graph.UpdateRegularFaninByPort(node_name, port, fanin);
+  EXPECT_EQ(s.ok(), success);
+  if (!success) {
+    EXPECT_EQ(s.error_message(), error_msg);
+  }
+  if (node_exists) {
+    CompareNodeFanins(graph, node, expected_fanins);
+  }
+
+  CheckUnmodifiedNodeFanins(graph_def, node_name, unmodified_node_inputs);
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, UpdateRegularFaninByPort) {
+  string error_msg;
+  // Update input at start to node with some inputs and no controls.
+  TestUpdateRegularFaninByPort(
+      "foo_3", /*node_exists=*/true, /*port=*/0, {"d", 2},
+      /*success=*/true, error_msg, {"d:2", "a:1", "a:1"});
+  // Update input at end to node with some inputs and no controls.
+  TestUpdateRegularFaninByPort(
+      "foo_3", /*node_exists=*/true, /*port=*/2, {"d", 2},
+      /*success=*/true, error_msg, {"b", "a:1", "d:2"});
+  // Update input in middle to node with some inputs and no controls.
+  TestUpdateRegularFaninByPort(
+      "foo_3", /*node_exists=*/true, /*port=*/1, {"d", 2},
+      /*success=*/true, error_msg, {"b", "d:2", "a:1"});
+  // Update input at start to node with some inputs and some controls, and dedup
+  // controls.
+  TestUpdateRegularFaninByPort(
+      "foo_4", /*node_exists=*/true, /*port=*/0, {"d", 2},
+      /*success=*/true, error_msg, {"d:2", "b:2", "b:2", "^c"});
+  // Update input at end to node with some inputs and some controls, and dedup
+  // controls.
+  TestUpdateRegularFaninByPort(
+      "foo_4", /*node_exists=*/true, /*port=*/2, {"d", 2},
+      /*success=*/true, error_msg, {"a", "b:2", "d:2", "^c"});
+  // Update input in middle to node with some inputs and some controls and
+  // dedup controls.
+  TestUpdateRegularFaninByPort(
+      "foo_4", /*node_exists=*/true, /*port=*/1, {"d", 2},
+      /*success=*/true, error_msg, {"a", "d:2", "b:2", "^c"});
+
+  // Update input to controlling fanin.
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_4', port=1, "
+      "fanin='^d') error: fanin '^d' must be a regular tensor id.";
+  TestUpdateRegularFaninByPort(
+      "foo_4", /*node_exists=*/true, /*port=*/1, {"d", Graph::kControlSlot},
+      /*success=*/false, error_msg, {"a", "b:2", "b:2", "^c", "^d"});
+
+  // Update fanin at out of bounds port.
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_5', port=-1, "
+      "fanin='d:2') error: no available ports as node has no regular fanins.";
+  TestUpdateRegularFaninByPort("foo_5", /*node_exists=*/true, /*port=*/-1,
+                               {"d", 2},
+                               /*success=*/false, error_msg, {});
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_5', port=0, "
+      "fanin='d:2') error: no available ports as node has no regular fanins.";
+  TestUpdateRegularFaninByPort("foo_5", /*node_exists=*/true, /*port=*/0,
+                               {"d", 2},
+                               /*success=*/false, error_msg, {});
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_5', port=1, "
+      "fanin='d:2') error: no available ports as node has no regular fanins.";
+  TestUpdateRegularFaninByPort("foo_5", /*node_exists=*/true, /*port=*/1,
+                               {"d", 2},
+                               /*success=*/false, error_msg, {});
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_6', port=-1, "
+      "fanin='d:2') error: no available ports as node has no regular fanins.";
+  TestUpdateRegularFaninByPort("foo_6", /*node_exists=*/true, /*port=*/-1,
+                               {"d", 2},
+                               /*success=*/false, error_msg, {"^a", "^b"});
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_6', port=0, "
+      "fanin='d:2') error: no available ports as node has no regular fanins.";
+  TestUpdateRegularFaninByPort("foo_6", /*node_exists=*/true, /*port=*/0,
+                               {"d", 2},
+                               /*success=*/false, error_msg, {"^a", "^b"});
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_6', port=1, "
+      "fanin='d:2') error: no available ports as node has no regular fanins.";
+  TestUpdateRegularFaninByPort("foo_6", /*node_exists=*/true, /*port=*/1,
+                               {"d", 2},
+                               /*success=*/false, error_msg, {"^a", "^b"});
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_3', port=-1, "
+      "fanin='d:2') error: port must be in range [0, 2].";
+  TestUpdateRegularFaninByPort(
+      "foo_3", /*node_exists=*/true, /*port=*/-1, {"d", 2},
+      /*success=*/false, error_msg, {"b", "a:1", "a:1"});
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_3', port=3, "
+      "fanin='d:2') error: port must be in range [0, 2].";
+  TestUpdateRegularFaninByPort(
+      "foo_3", /*node_exists=*/true, /*port=*/3, {"d", 2},
+      /*success=*/false, error_msg, {"b", "a:1", "a:1"});
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_4', port=-1, "
+      "fanin='d:2') error: port must be in range [0, 2].";
+  TestUpdateRegularFaninByPort(
+      "foo_4", /*node_exists=*/true, /*port=*/-1, {"d", 2},
+      /*success=*/false, error_msg, {"a", "b:2", "b:2", "^c", "^d"});
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_4', port=3, "
+      "fanin='d:2') error: port must be in range [0, 2].";
+  TestUpdateRegularFaninByPort(
+      "foo_4", /*node_exists=*/true, /*port=*/3, {"d", 2},
+      /*success=*/false, error_msg, {"a", "b:2", "b:2", "^c", "^d"});
+
+  // Update fanin to node where node is missing.
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_missing', "
+      "port=0, fanin='a:0') error: node 'foo_missing' was not found.";
+  TestUpdateRegularFaninByPort("foo_missing", /*node_exists=*/false,
+                               /*port=*/0, {"a", 0},
+                               /*success=*/false, error_msg, {});
+  // Update fanin to node where fanin is missing.
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_1', port=0, "
+      "fanin='bar_missing:0') error: node 'bar_missing' was not "
+      "found.";
+  TestUpdateRegularFaninByPort("foo_1", /*node_exists=*/true, /*port=*/0,
+                               {"bar_missing", 0},
+                               /*success=*/false, error_msg, {"a"});
+  // Update fanin to node where node and fanin are missing.
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_missing', "
+      "port=0, fanin='bar_missing:0') error: node 'foo_missing' was not found.";
+  TestUpdateRegularFaninByPort("foo_missing", /*node_exists=*/false,
+                               /*port=*/0, {"bar_missing", 0},
+                               /*success=*/false, error_msg, {});
+
+  // Update self to create cycle.
+  error_msg =
+      "MutableGraphView::UpdateRegularFaninByPort(node_name='foo_6', port=0, "
+      "fanin='foo_6:2') error: can't add fanin 'foo_6:2' to self.";
+  TestUpdateRegularFaninByPort("foo_6", /*node_exists=*/true, /*port=*/0,
+                               {"foo_6", 2},
+                               /*success=*/false, error_msg, {"^a", "^b"});
+}
+
+void TestSwapRegularFaninsByPorts(absl::string_view node_name, bool node_exists,
+                                  int from_port, int to_port, bool success,
+                                  const string& error_msg,
+                                  absl::Span<const string> expected_fanins) {
+  GraphDef graph_def = SimpleMutateFaninGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  NodeDef* node = graph.GetNode(node_name);
+  if (node_exists) {
+    EXPECT_NE(node, nullptr);
+  } else {
+    EXPECT_EQ(node, nullptr);
+  }
+
+  absl::flat_hash_map<string, std::vector<string>> unmodified_node_inputs =
+      GetNodeInputsFromGraph(graph_def, node_name);
+
+  Status s = graph.SwapRegularFaninsByPorts(node_name, from_port, to_port);
+  EXPECT_EQ(s.ok(), success);
+  if (!success) {
+    EXPECT_EQ(s.error_message(), error_msg);
+  }
+  if (node_exists) {
+    CompareNodeFanins(graph, node, expected_fanins);
+  }
+
+  CheckUnmodifiedNodeFanins(graph_def, node_name, unmodified_node_inputs);
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, SwapRegularFaninsByPorts) {
+  string error_msg;
+  // Swapping first and last regular fanins
+  TestSwapRegularFaninsByPorts("foo_3", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/2, /*success=*/true, error_msg,
+                               {"a:1", "a:1", "b"});
+  TestSwapRegularFaninsByPorts("foo_3", /*node_exists=*/true, /*from_port=*/2,
+                               /*to_port=*/0, /*success=*/true, error_msg,
+                               {"a:1", "a:1", "b"});
+  // Swapping first and last regular fanins, in node with controls.
+  TestSwapRegularFaninsByPorts("foo_4", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/2, /*success=*/true, error_msg,
+                               {"b:2", "b:2", "a", "^c", "^d"});
+  TestSwapRegularFaninsByPorts("foo_4", /*node_exists=*/true, /*from_port=*/2,
+                               /*to_port=*/0, /*success=*/true, error_msg,
+                               {"b:2", "b:2", "a", "^c", "^d"});
+  // Swapping middle regular fanin.
+  TestSwapRegularFaninsByPorts("foo_3", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/1, /*success=*/true, error_msg,
+                               {"a:1", "b", "a:1"});
+  TestSwapRegularFaninsByPorts("foo_3", /*node_exists=*/true, /*from_port=*/1,
+                               /*to_port=*/0, /*success=*/true, error_msg,
+                               {"a:1", "b", "a:1"});
+  // Swapping middle regular fanin, in node with controls.
+  TestSwapRegularFaninsByPorts("foo_4", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/1, /*success=*/true, error_msg,
+                               {"b:2", "a", "b:2", "^c", "^d"});
+  TestSwapRegularFaninsByPorts("foo_4", /*node_exists=*/true, /*from_port=*/1,
+                               /*to_port=*/0, /*success=*/true, error_msg,
+                               {"b:2", "a", "b:2", "^c", "^d"});
+  // Swapping same port.
+  TestSwapRegularFaninsByPorts("foo_4", /*node_exists=*/true, /*from_port=*/1,
+                               /*to_port=*/1, /*success=*/true, error_msg,
+                               {"a", "b:2", "b:2", "^c", "^d"});
+  // Swapping same fanin but different port.
+  TestSwapRegularFaninsByPorts("foo_4", /*node_exists=*/true, /*from_port=*/1,
+                               /*to_port=*/2, /*success=*/true, error_msg,
+                               {"a", "b:2", "b:2", "^c", "^d"});
+
+  // Swaping fanins at out of bounds ports.
+  // Node with no regular fanins and no controls.
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_5', "
+      "from_port=-1, to_port=0) error: no available ports as node has no "
+      "regular fanins.";
+  TestSwapRegularFaninsByPorts("foo_5", /*node_exists=*/true, /*from_port=*/-1,
+                               /*to_port=*/0, /*success=*/false, error_msg, {});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_5', "
+      "from_port=0, to_port=-1) error: no available ports as node has no "
+      "regular fanins.";
+  TestSwapRegularFaninsByPorts("foo_5", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/-1, /*success=*/false, error_msg,
+                               {});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_5', "
+      "from_port=0, to_port=0) error: no available ports as node has no "
+      "regular fanins.";
+  TestSwapRegularFaninsByPorts("foo_5", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/0, /*success=*/false, error_msg, {});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_5', "
+      "from_port=0, to_port=1) error: no available ports as node has no "
+      "regular fanins.";
+  TestSwapRegularFaninsByPorts("foo_5", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/1, /*success=*/false, error_msg, {});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_5', "
+      "from_port=1, to_port=0) error: no available ports as node has no "
+      "regular fanins.";
+  TestSwapRegularFaninsByPorts("foo_5", /*node_exists=*/true, /*from_port=*/1,
+                               /*to_port=*/0, /*success=*/false, error_msg, {});
+  // Node with no regular fanins and some controls.
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_6', "
+      "from_port=-1, to_port=0) error: no available ports as node has no "
+      "regular fanins.";
+  TestSwapRegularFaninsByPorts("foo_6", /*node_exists=*/true, /*from_port=*/-1,
+                               /*to_port=*/0, /*success=*/false, error_msg,
+                               {"^a", "^b"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_6', "
+      "from_port=0, to_port=-1) error: no available ports as node has no "
+      "regular fanins.";
+  TestSwapRegularFaninsByPorts("foo_6", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/-1, /*success=*/false, error_msg,
+                               {"^a", "^b"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_6', "
+      "from_port=0, to_port=0) error: no available ports as node has no "
+      "regular fanins.";
+  TestSwapRegularFaninsByPorts("foo_6", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/0, /*success=*/false, error_msg,
+                               {"^a", "^b"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_6', "
+      "from_port=0, to_port=1) error: no available ports as node has no "
+      "regular fanins.";
+  TestSwapRegularFaninsByPorts("foo_6", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/1, /*success=*/false, error_msg,
+                               {"^a", "^b"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_6', "
+      "from_port=1, to_port=0) error: no available ports as node has no "
+      "regular fanins.";
+  TestSwapRegularFaninsByPorts("foo_6", /*node_exists=*/true, /*from_port=*/1,
+                               /*to_port=*/0, /*success=*/false, error_msg,
+                               {"^a", "^b"});
+  // Node with regular fanins and no controls.
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_3', "
+      "from_port=-1, to_port=0) error: port must be in range [0, 2].";
+  TestSwapRegularFaninsByPorts("foo_3", /*node_exists=*/true, /*from_port=*/-1,
+                               /*to_port=*/0, /*success=*/false, error_msg,
+                               {"b", "a:1", "a:1"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_3', "
+      "from_port=0, to_port=-1) error: port must be in range [0, 2].";
+  TestSwapRegularFaninsByPorts("foo_3", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/-1, /*success=*/false, error_msg,
+                               {"b", "a:1", "a:1"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_3', "
+      "from_port=0, to_port=3) error: port must be in range [0, 2].";
+  TestSwapRegularFaninsByPorts("foo_3", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/3, /*success=*/false, error_msg,
+                               {"b", "a:1", "a:1"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_3', "
+      "from_port=3, to_port=0) error: port must be in range [0, 2].";
+  TestSwapRegularFaninsByPorts("foo_3", /*node_exists=*/true, /*from_port=*/3,
+                               /*to_port=*/0, /*success=*/false, error_msg,
+                               {"b", "a:1", "a:1"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_3', "
+      "from_port=-1, to_port=3) error: port must be in range [0, 2].";
+  TestSwapRegularFaninsByPorts("foo_3", /*node_exists=*/true, /*from_port=*/-1,
+                               /*to_port=*/3, /*success=*/false, error_msg,
+                               {"b", "a:1", "a:1"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_3', "
+      "from_port=3, to_port=-1) error: port must be in range [0, 2].";
+  TestSwapRegularFaninsByPorts("foo_3", /*node_exists=*/true, /*from_port=*/3,
+                               /*to_port=*/-1, /*success=*/false, error_msg,
+                               {"b", "a:1", "a:1"});
+  // Node with regular fanins and controls.
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_4', "
+      "from_port=-1, to_port=0) error: port must be in range [0, 2].";
+  TestSwapRegularFaninsByPorts("foo_4", /*node_exists=*/true, /*from_port=*/-1,
+                               /*to_port=*/0, /*success=*/false, error_msg,
+                               {"a", "b:2", "b:2", "^c", "^d"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_4', "
+      "from_port=0, to_port=-1) error: port must be in range [0, 2].";
+  TestSwapRegularFaninsByPorts("foo_4", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/-1, /*success=*/false, error_msg,
+                               {"a", "b:2", "b:2", "^c", "^d"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_4', "
+      "from_port=0, to_port=3) error: port must be in range [0, 2].";
+  TestSwapRegularFaninsByPorts("foo_4", /*node_exists=*/true, /*from_port=*/0,
+                               /*to_port=*/3, /*success=*/false, error_msg,
+                               {"a", "b:2", "b:2", "^c", "^d"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_4', "
+      "from_port=3, to_port=0) error: port must be in range [0, 2].";
+  TestSwapRegularFaninsByPorts("foo_4", /*node_exists=*/true, /*from_port=*/3,
+                               /*to_port=*/0, /*success=*/false, error_msg,
+                               {"a", "b:2", "b:2", "^c", "^d"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_4', "
+      "from_port=-1, to_port=3) error: port must be in range [0, 2].";
+  TestSwapRegularFaninsByPorts("foo_4", /*node_exists=*/true, /*from_port=*/-1,
+                               /*to_port=*/3, /*success=*/false, error_msg,
+                               {"a", "b:2", "b:2", "^c", "^d"});
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_4', "
+      "from_port=3, to_port=-1) error: port must be in range [0, 2].";
+  TestSwapRegularFaninsByPorts("foo_4", /*node_exists=*/true, /*from_port=*/3,
+                               /*to_port=*/-1, /*success=*/false, error_msg,
+                               {"a", "b:2", "b:2", "^c", "^d"});
+
+  // Swapping fanin to node where node is missing.
+  error_msg =
+      "MutableGraphView::SwapRegularFaninsByPorts(node_name='foo_missing', "
+      "from_port=0, to_port=1) error: node 'foo_missing' was not found.";
+  TestSwapRegularFaninsByPorts("foo_missing", /*node_exists=*/false,
+                               /*from_port=*/0, /*to_port=*/1,
+                               /*success=*/false, error_msg, {});
+}
+
+TEST(MutableGraphViewTest, DedupControllingFaninsOnGraphInit) {
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {}, {}),
+       NDef("c", "Switch", {}, {}), NDef("d", "Identity", {"c:1"}),
+       NDef("foo_1", "IdentityN", {"a", "b:1", "^b"}),
+       NDef("foo_2", "IdentityN", {"a", "^b", "^b"}),
+       NDef("foo_3", "IdentityN", {"a", "b:1", "^b", "^b"}),
+       NDef("foo_4", "IdentityN", {"a:2", "b:1", "^b", "^b", "^a", "^a"}),
+       NDef("foo_5", "NotImportant", {"a:2", "b:1", "^b", "^b", "^a", "^a"}),
+       NDef("foo_6", "Identity", {"d", "^d"}),
+       NDef("foo_7", "NotImportant",
+            {"a:3", "b:2", "d", "^d", "^d", "^a", "^b", "^a", "^b"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  EXPECT_EQ(graph.graph()->node_size(), 11);
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {},
+            {"foo_1", "foo_2", "foo_3", "foo_4", "foo_5", "foo_7"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {},
+            {"foo_1:1", "^foo_2", "foo_3:1", "foo_4:1", "foo_5:1", "foo_7:1"});
+  CheckNode(graph, "c", "Switch", "", {}, {}, {"d"});
+  CheckNode(graph, "d", "Identity", "", {}, {"c:1"},
+            {"foo_6", "^foo_6", "foo_7:2", "^foo_7"});
+  CheckNode(graph, "foo_1", "IdentityN", "", {}, {"a", "b:1"}, {});
+  CheckNode(graph, "foo_2", "IdentityN", "", {}, {"a", "^b"}, {});
+  CheckNode(graph, "foo_3", "IdentityN", "", {}, {"a", "b:1"}, {});
+  CheckNode(graph, "foo_4", "IdentityN", "", {}, {"a:2", "b:1"}, {});
+  CheckNode(graph, "foo_5", "NotImportant", "", {}, {"a:2", "b:1"}, {});
+  CheckNode(graph, "foo_6", "Identity", "", {}, {"d", "^d"}, {});
+  CheckNode(graph, "foo_7", "NotImportant", "", {}, {"a:3", "b:2", "d", "^d"},
+            {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, DedupControllingFaninsOnAddFanin) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {"^a"}),
+       NDef("c", "NotImportant", {"a:1"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.AddRegularFanin("b", {"a", 2}));
+  CheckNode(graph, "b", "NotImportant", "", {}, {"a:2"}, {});
+
+  TF_EXPECT_OK(graph.AddControllingFanin("c", {"a", Graph::kControlSlot}));
+  CheckNode(graph, "c", "NotImportant", "", {}, {"a:1"}, {});
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"b:0", "c:0"});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, NoDedupControllingFaninsOnAddFanin) {
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "Switch", {}, {}), NDef("b", "Identity", {"a:1"}),
+       NDef("c", "", {}, {}), NDef("d", "", {}, {})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.AddRegularFanin("c", {"b", 2}));
+  CheckNode(graph, "c", "", "", {}, {"b:2"}, {});
+  TF_EXPECT_OK(graph.AddControllingFanin("c", {"b", Graph::kControlSlot}));
+  CheckNode(graph, "c", "", "", {}, {"b:2", "^b"}, {});
+  TF_EXPECT_OK(graph.AddControllingFanin("c", {"b", Graph::kControlSlot}));
+  CheckNode(graph, "c", "", "", {}, {"b:2", "^b"}, {});
+  TF_EXPECT_OK(graph.AddRegularFanin("c", {"b", 2}));
+  CheckNode(graph, "c", "", "", {}, {"b:2", "b:2", "^b"}, {});
+
+  TF_EXPECT_OK(graph.AddControllingFanin("d", {"b", Graph::kControlSlot}));
+  CheckNode(graph, "d", "", "", {}, {"^b"}, {});
+  TF_EXPECT_OK(graph.AddControllingFanin("d", {"b", Graph::kControlSlot}));
+  CheckNode(graph, "d", "", "", {}, {"^b"}, {});
+
+  CheckNode(graph, "a", "Switch", "", {}, {}, {"b"});
+  CheckNode(graph, "b", "Identity", "", {}, {"a:1"},
+            {"c:0", "c:1", "^c", "^d"});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, DedupControllingFaninsOnAddFaninByPort) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def =
+      test::function::GDef({NDef("a", "NotImportant", {}, {}),
+                            NDef("b", "NotImportant", {"c", "^a"}),
+                            NDef("c", "NotImportant", {"a:1"})},
+                           /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.AddRegularFaninByPort("b", 0, {"a", 2}));
+  CheckNode(graph, "b", "NotImportant", "", {}, {"a:2", "c"}, {});
+
+  TF_EXPECT_OK(graph.AddControllingFanin("c", {"a", Graph::kControlSlot}));
+  CheckNode(graph, "c", "NotImportant", "", {}, {"a:1"}, {"b:1"});
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"b:0", "c:0"});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, NoDedupControllingFaninsOnAddFaninByPort) {
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "Switch", {}, {}), NDef("b", "Identity", {"a:1"}),
+       NDef("c", "", {}, {}), NDef("d", "", {"c:2"}, {})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.AddRegularFaninByPort("d", 1, {"b", 2}));
+  CheckNode(graph, "d", "", "", {}, {"c:2", "b:2"}, {});
+  TF_EXPECT_OK(graph.AddControllingFanin("d", {"b", Graph::kControlSlot}));
+  CheckNode(graph, "d", "", "", {}, {"c:2", "b:2", "^b"}, {});
+  TF_EXPECT_OK(graph.AddRegularFaninByPort("d", 0, {"b", 2}));
+  CheckNode(graph, "d", "", "", {}, {"b:2", "c:2", "b:2", "^b"}, {});
+
+  CheckNode(graph, "a", "Switch", "", {}, {}, {"b:0"});
+  CheckNode(graph, "b", "Identity", "", {}, {"a:1"}, {"d:0", "d:2", "^d"});
+  CheckNode(graph, "c", "", "", {}, {}, {"d:1"});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, DedupControllingFaninsOnUpdateFanin) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {}, {}),
+       NDef("c", "NotImportant", {"a:1", "^b"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.UpdateFanin("c", {"a", 1}, {"b", 2}));
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {});
+  CheckNode(graph, "b", "NotImportant", "", {}, {}, {"c"});
+  CheckNode(graph, "c", "NotImportant", "", {}, {"b:2"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, NoDedupControllingFaninsOnUpdateFanin) {
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "Switch", {}, {}), NDef("b", "Identity", {"a:1"}),
+       NDef("c", "Identity", {"a:2"}), NDef("d", "NotImportant", {"c", "^b"}),
+       NDef("e", "NotImportant", {"b", "^c"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.UpdateFanin("d", {"b", Graph::kControlSlot},
+                                 {"c", Graph::kControlSlot}));
+  CheckNode(graph, "d", "NotImportant", "", {}, {"c", "^c"}, {});
+
+  TF_EXPECT_OK(graph.UpdateFanin("e", {"b", 0}, {"c", 3}));
+  CheckNode(graph, "e", "NotImportant", "", {}, {"c:3", "^c"}, {});
+
+  TF_EXPECT_OK(graph.UpdateFanin("e", {"c", 3}, {"c", Graph::kControlSlot}));
+  CheckNode(graph, "e", "NotImportant", "", {}, {"^c"}, {});
+
+  CheckNode(graph, "a", "Switch", "", {}, {}, {"b:0", "c:0"});
+  CheckNode(graph, "b", "Identity", "", {}, {"a:1"}, {});
+  CheckNode(graph, "c", "Identity", "", {}, {"a:2"}, {"d:0", "^d", "^e"});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, DedupControllingFaninsOnUpdateFaninByPort) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {}, {}),
+       NDef("c", "NotImportant", {"a:1", "^b"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.UpdateRegularFaninByPort("c", 0, {"b", 2}));
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {});
+  CheckNode(graph, "b", "NotImportant", "", {}, {}, {"c"});
+  CheckNode(graph, "c", "NotImportant", "", {}, {"b:2"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, NoDedupControllingFaninsOnUpdateFaninByPort) {
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "Switch", {}, {}), NDef("b", "Identity", {"a:1"}),
+       NDef("c", "Identity", {"a:2"}), NDef("d", "NotImportant", {"c", "^b"}),
+       NDef("e", "NotImportant", {"b", "^c"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.UpdateRegularFaninByPort("d", 0, {"b", 1}));
+  CheckNode(graph, "d", "NotImportant", "", {}, {"b:1", "^b"}, {});
+
+  TF_EXPECT_OK(graph.UpdateRegularFaninByPort("e", 0, {"c", 2}));
+  CheckNode(graph, "e", "NotImportant", "", {}, {"c:2", "^c"}, {});
+
+  CheckNode(graph, "a", "Switch", "", {}, {}, {"b:0", "c:0"});
+  CheckNode(graph, "b", "Identity", "", {}, {"a:1"}, {"d:0", "^d"});
+  CheckNode(graph, "c", "Identity", "", {}, {"a:2"}, {"e:0", "^e"});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, UpdateMaxRegularOutputPortOnAddFanin) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {"a:1"}),
+       NDef("c", "NotImportant", {"^b"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.AddRegularFanin("c", {"a", 3}));
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"b", "c"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {"a:1"}, {"^c"});
+  CheckNode(graph, "c", "NotImportant", "", {}, {"a:3", "^b"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, UpdateMaxRegularOutputPortOnRemoveFanin) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {"a:1"}),
+       NDef("c", "NotImportant", {"a:2"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.RemoveRegularFanin("c", {"a", 2}));
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"b"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {"a:1"}, {});
+  CheckNode(graph, "c", "NotImportant", "", {}, {}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, KeepMaxRegularOutputPortOnRemoveFanin) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {"a:1"}),
+       NDef("c", "NotImportant", {"a:2"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.RemoveRegularFanin("b", {"a", 1}));
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"c"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {}, {});
+  CheckNode(graph, "c", "NotImportant", "", {}, {"a:2"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, UpdateMaxRegularOutputPortOnUpdateFanin) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {"a:1"}),
+       NDef("c", "NotImportant", {"a:2"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.UpdateFanin("c", {"a", 2}, {"b", 3}));
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"b"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {"a:1"}, {"c"});
+  CheckNode(graph, "c", "NotImportant", "", {}, {"b:3"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, AddControllingFaninMissing) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {}, {})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+  // Missing fanin.
+  Status s = graph.AddControllingFanin("a", {"c", Graph::kControlSlot});
+  EXPECT_FALSE(s.ok());
+  string expected_msg =
+      "MutableGraphView::AddControllingFanin(node_name='a', fanin='^c') error: "
+      "node 'c' was not found.";
+  EXPECT_EQ(s.error_message(), expected_msg);
+  // Missing node.
+  s = graph.AddControllingFanin("d", {"a", Graph::kControlSlot});
+  EXPECT_FALSE(s.ok());
+  expected_msg =
+      "MutableGraphView::AddControllingFanin(node_name='d', fanin='^a') error: "
+      "node 'd' was not found.";
+  EXPECT_EQ(s.error_message(), expected_msg);
+  // Missing node and fanin.
+  s = graph.AddControllingFanin("c", {"d", Graph::kControlSlot});
+  EXPECT_FALSE(s.ok());
+  expected_msg =
+      "MutableGraphView::AddControllingFanin(node_name='c', fanin='^d') error: "
+      "node 'c' was not found.";
+  EXPECT_EQ(s.error_message(), expected_msg);
+
+  ASSERT_EQ(graph.graph()->node_size(), 2);
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {});
+  CheckNode(graph, "b", "NotImportant", "", {}, {}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, AddControllingFaninExistingControl) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {}, {})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+  TF_EXPECT_OK(graph.AddControllingFanin("a", {"b", Graph::kControlSlot}));
+  TF_EXPECT_OK(graph.AddControllingFanin("a", {"b", Graph::kControlSlot}));
+
+  ASSERT_EQ(graph.graph()->node_size(), 2);
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {"^b"}, {});
+  CheckNode(graph, "b", "NotImportant", "", {}, {}, {"^a"});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, AddControllingFaninNotSwitch) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {}, {})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+  TF_EXPECT_OK(graph.AddControllingFanin("a", {"b", 2}));
+  TF_EXPECT_OK(graph.AddControllingFanin("a", {"b", 2}));
+
+  ASSERT_EQ(graph.graph()->node_size(), 2);
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {"^b"}, {});
+  CheckNode(graph, "b", "NotImportant", "", {}, {}, {"^a"});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, AddControllingFaninSwitch) {
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "Switch", {}, {})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  Status s = graph.AddControllingFanin("a", {"b", Graph::kControlSlot});
+  EXPECT_FALSE(s.ok());
+  string expected_msg =
+      "MutableGraphView::AddControllingFanin(node_name='a', fanin='^b') error: "
+      "can't add fanin '^b' as it will become a Switch control dependency.";
+  EXPECT_EQ(s.error_message(), expected_msg);
+
+  ASSERT_EQ(graph.graph()->node_size(), 2);
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {});
+  CheckNode(graph, "b", "Switch", "", {}, {}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, AddControllingFaninSwitchWithIdentity) {
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("switch", "Switch", {}, {}),
+       NDef("identity", "Identity", {"switch"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.AddControllingFanin("a", {"switch", 0}));
+  TF_EXPECT_OK(graph.AddControllingFanin("a", {"switch", 0}));
+
+  ASSERT_EQ(graph.graph()->node_size(), 3);
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {"^identity"}, {});
+  CheckNode(graph, "switch", "Switch", "", {}, {}, {"identity"});
+  CheckNode(graph, "identity", "Identity", "", {}, {"switch"}, {"^a"});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, AddControllingFaninSwitchWithNoExistingIdentity) {
+  constexpr char kDevice[] = "/device:foo:0";
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}),
+       NDef("switch", "Switch", {}, {{"T", DT_FLOAT}}, kDevice)},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.AddControllingFanin("a", {"switch", 0}));
+  TF_EXPECT_OK(graph.AddControllingFanin("a", {"switch", 0}));
+
+  ASSERT_EQ(graph.graph()->node_size(), 3);
+
+  CheckNode(graph, "a", "NotImportant", "", {},
+            {"^ConstantFoldingCtrl/switch_0"}, {});
+  CheckNode(graph, "switch", "Switch", kDevice, {{"T", DT_FLOAT}}, {},
+            {"ConstantFoldingCtrl/switch_0"});
+  CheckNode(graph, "ConstantFoldingCtrl/switch_0", "Identity", kDevice,
+            {{"T", DT_FLOAT}}, {"switch"}, {"^a"});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, AddControllingFaninSwitchWithExistingAddedIdentity) {
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("switch", "Switch", {}, {}),
+       NDef("ConstantFoldingCtrl/switch_0", "Identity", {"switch"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.AddControllingFanin("a", {"switch", 0}));
+  TF_EXPECT_OK(graph.AddControllingFanin("a", {"switch", 0}));
+
+  ASSERT_EQ(graph.graph()->node_size(), 3);
+
+  CheckNode(graph, "a", "NotImportant", "", {},
+            {"^ConstantFoldingCtrl/switch_0"}, {});
+  CheckNode(graph, "switch", "Switch", "", {}, {},
+            {"ConstantFoldingCtrl/switch_0"});
+  CheckNode(graph, "ConstantFoldingCtrl/switch_0", "Identity", "", {},
+            {"switch"}, {"^a"});
+
+  CheckGraph(graph);
+}
+
+void TestAddControllingFaninSelfLoops(absl::string_view node_name,
+                                      const TensorId& fanin,
+                                      const string& error_msg) {
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}),
+       NDef("b", "Switch", {}, {{"T", DT_FLOAT}}),
+       NDef("c", "Identity", {"b:0"}), NDef("d", "Identity", {"b:1"}),
+       NDef("e", "NotImportant", {"^a"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  Status s = graph.AddControllingFanin(node_name, fanin);
+  EXPECT_FALSE(s.ok());
+  EXPECT_EQ(s.error_message(), error_msg);
+
+  EXPECT_EQ(graph.graph()->node_size(), 5);
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"^e"});
+  CheckNode(graph, "b", "Switch", "", {{"T", DT_FLOAT}}, {}, {"c", "d"});
+  CheckNode(graph, "c", "Identity", "", {}, {"b"}, {});
+  CheckNode(graph, "d", "Identity", "", {}, {"b:1"}, {});
+  CheckNode(graph, "e", "NotImportant", "", {}, {"^a"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, AddControllingFaninSelfLoops) {
+  string error_msg =
+      "MutableGraphView::AddControllingFanin(node_name='a', fanin='^a') error: "
+      "can't add fanin '^a' to self.";
+  TestAddControllingFaninSelfLoops("a", {"a", Graph::kControlSlot}, error_msg);
+
+  // Adding Switch control dependency to Identity consumer. Node `c` is
+  // consuming `b:0`, so adding `b:0` as a control dependency, because it is a
+  // Switch, should trigger a lookup of outputs. As `c` is a consumer and an
+  // Identity, this will introduce a self loop, so no control dependency should
+  // be added.
+  error_msg =
+      "MutableGraphView::AddControllingFanin(node_name='c', fanin='b:0') "
+      "error: can't add found fanin '^c' to self.";
+  TestAddControllingFaninSelfLoops("c", {"b", 0}, error_msg);
+
+  // Adding Switch control dependency to Identity consumer. Node `d` is
+  // consuming `b:1`, so adding `b:1` as a control dependency, because it is a
+  // Switch, should trigger a lookup of outputs. As `d` is a consumer and an
+  // Identity, this will introduce a self loop, so no control dependency should
+  // be added.
+  error_msg =
+      "MutableGraphView::AddControllingFanin(node_name='d', fanin='b:1') "
+      "error: can't add found fanin '^d' to self.";
+  TestAddControllingFaninSelfLoops("d", {"b", 1}, error_msg);
+}
+
+TEST(MutableGraphViewTest, AddControllingFaninSelfLoopsGeneratedIdentity) {
+  GraphDef graph_def =
+      test::function::GDef({NDef("a", "NotImportant", {}, {}),
+                            NDef("b", "Switch", {}, {{"T", DT_FLOAT}}),
+                            NDef("c", "NotImportant", {}),
+                            NDef("ConstantFoldingCtrl/b_1", "Identity", {})},
+                           /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  // Adding Switch control dependency to Identity node of the same name as a
+  // generated Identity node for pinning the control dependency. Because there
+  // are no consumers of `b:1`, there will be an attempt to generate an Identity
+  // node, with name `ConstantFoldingCtrl/b_1`. As the input node is of the same
+  // name, we will introduce a self loop, so no control dependency should be
+  // added.
+  Status s = graph.AddControllingFanin("ConstantFoldingCtrl/b_1", {"b", 1});
+  EXPECT_FALSE(s.ok());
+  string expected_msg =
+      "MutableGraphView::AddControllingFanin(node_name='ConstantFoldingCtrl/"
+      "b_1', fanin='b:1') error: can't add generated fanin "
+      "'^ConstantFoldingCtrl/b_1' to self.";
+  EXPECT_EQ(s.error_message(), expected_msg);
+
+  EXPECT_EQ(graph.graph()->node_size(), 4);
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {});
+  CheckNode(graph, "b", "Switch", "", {{"T", DT_FLOAT}}, {}, {});
+  CheckNode(graph, "c", "NotImportant", "", {}, {}, {});
+  CheckNode(graph, "ConstantFoldingCtrl/b_1", "Identity", "", {}, {}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, RemoveControllingFaninMissing) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {}, {}),
+       NDef("c", "NotImportant", {}, {}),
+       NDef("d", "NotImportant", {"^a", "^b"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.RemoveControllingFanin("d", "c"));
+
+  ASSERT_EQ(graph.graph()->node_size(), 4);
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"^d"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {}, {"^d"});
+  CheckNode(graph, "c", "NotImportant", "", {}, {}, {});
+  CheckNode(graph, "d", "NotImportant", "", {}, {"^a", "^b"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, RemoveControllingFaninExisting) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {}, {}),
+       NDef("c", "NotImportant", {}, {}),
+       NDef("d", "NotImportant", {"^a", "^b", "^c"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.RemoveControllingFanin("d", "a"));
+  TF_EXPECT_OK(graph.RemoveControllingFanin("d", "a"));
+
+  ASSERT_EQ(graph.graph()->node_size(), 4);
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {});
+  CheckNode(graph, "b", "NotImportant", "", {}, {}, {"^d"});
+  CheckNode(graph, "c", "NotImportant", "", {}, {}, {"^d"});
+  CheckNode(graph, "d", "NotImportant", "", {}, {"^c", "^b"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, RemoveControllingFaninOnRegularFanin) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {"a"}),
+       NDef("c", "NotImportant", {"a", "b"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.RemoveControllingFanin("c", "a"));
+  TF_EXPECT_OK(graph.RemoveControllingFanin("c", "b"));
+
+  ASSERT_EQ(graph.graph()->node_size(), 3);
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"b", "c"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {"a"}, {"c:1"});
+  CheckNode(graph, "c", "NotImportant", "", {}, {"a", "b"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, RemoveControllingFaninSelfLoop) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {"a"}),
+       NDef("c", "NotImportant", {"a", "b"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  Status s = graph.RemoveControllingFanin("c", "c");
+  EXPECT_FALSE(s.ok());
+  string expected_msg =
+      "MutableGraphView::RemoveControllingFanin(node_name='c', "
+      "fanin_node_name='c') error: can't remove fanin '^c' from "
+      "self.";
+  EXPECT_EQ(s.error_message(), expected_msg);
+
+  ASSERT_EQ(graph.graph()->node_size(), 3);
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"b", "c"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {"a"}, {"c:1"});
+  CheckNode(graph, "c", "NotImportant", "", {}, {"a", "b"}, {});
+
+  CheckGraph(graph);
+}
+
+void TestUpdateAllRegularFaninsToControlling(
+    absl::string_view node_name, bool node_exists, bool success,
+    const string& error_msg, absl::Span<const string> expected_fanins) {
+  constexpr char kDevice[] = "/device:foo:0";
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}),
+       NDef("switch", "Switch", {}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("b", "NotImportant", {"switch:1"}, {}),
+       NDef("ConstantFoldingCtrl/switch_1", "Identity", {"switch:1"},
+            {{"T", DT_FLOAT}}, kDevice),
+       NDef("c", "NotImportant", {"a", "^b"}, {}),
+       NDef("d", "NotImportant", {"b", "c"}, {}),
+       NDef("e", "NotImportant", {"^d"}, {})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  NodeDef* node = graph.GetNode(node_name);
+  if (node_exists) {
+    EXPECT_NE(node, nullptr);
+  } else {
+    EXPECT_EQ(node, nullptr);
+  }
+
+  absl::flat_hash_map<string, std::vector<string>> unmodified_node_inputs =
+      GetNodeInputsFromGraph(graph_def, node_name);
+
+  Status s = graph.UpdateAllRegularFaninsToControlling(node_name);
+  EXPECT_EQ(s.ok(), success);
+  if (!success) {
+    EXPECT_EQ(s.error_message(), error_msg);
+  }
+  if (node_exists) {
+    CompareNodeFanins(graph, node, expected_fanins);
+  }
+
+  CheckUnmodifiedNodeFanins(graph_def, node_name, unmodified_node_inputs);
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, UpdateAllRegularFaninsToControlling) {
+  string error_msg;
+  // Nodes with some regular fanins and some controls.
+  TestUpdateAllRegularFaninsToControlling("a", /*node_exists=*/true,
+                                          /*success=*/true, error_msg, {});
+  TestUpdateAllRegularFaninsToControlling("c", /*node_exists=*/true,
+                                          /*success=*/true, error_msg,
+                                          {"^a", "^b"});
+  TestUpdateAllRegularFaninsToControlling("d", /*node_exists=*/true,
+                                          /*success=*/true, error_msg,
+                                          {"^b", "^c"});
+  TestUpdateAllRegularFaninsToControlling("e", /*node_exists=*/true,
+                                          /*success=*/true, error_msg, {"^d"});
+
+  // Use existing Identity to pin control dependency of Switch.
+  TestUpdateAllRegularFaninsToControlling("b", /*node_exists=*/true,
+                                          /*success=*/true, error_msg,
+                                          {"^ConstantFoldingCtrl/switch_1"});
+
+  // Missing node.
+  error_msg =
+      "MutableGraphView::UpdateAllRegularFaninsToControlling(node_name='f') "
+      "error: node 'f' was not found.";
+  TestUpdateAllRegularFaninsToControlling("f", /*node_exists=*/false,
+                                          /*success=*/false, error_msg, {});
+
+  // Error in getting controlling fanin.
+  error_msg =
+      "MutableGraphView::UpdateAllRegularFaninsToControlling(node_name='"
+      "ConstantFoldingCtrl/switch_1') error: can't add found fanin "
+      "'^ConstantFoldingCtrl/switch_1' to self.";
+  TestUpdateAllRegularFaninsToControlling("ConstantFoldingCtrl/switch_1",
+                                          /*node_exists=*/true,
+                                          /*success=*/false, error_msg,
+                                          {"switch:1"});
+}
+
+TEST(MutableGraphViewTest, UpdateAllRegularFaninsToControllingConsumingSwitch) {
+  constexpr char kDevice[] = "/device:foo:0";
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}),
+       NDef("switch", "Switch", {}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("b", "NotImportant", {"switch:1"}, {})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  TF_EXPECT_OK(graph.UpdateAllRegularFaninsToControlling("b"));
+
+  EXPECT_EQ(graph.graph()->node_size(), 4);
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {});
+  CheckNode(graph, "switch", "Switch", kDevice, {{"T", DT_FLOAT}}, {},
+            {"ConstantFoldingCtrl/switch_1"});
+  CheckNode(graph, "b", "NotImportant", "", {},
+            {"^ConstantFoldingCtrl/switch_1"}, {});
+  CheckNode(graph, "ConstantFoldingCtrl/switch_1", "Identity", kDevice,
+            {{"T", DT_FLOAT}}, {"switch:1"}, {"^b"});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, DeleteNodes) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {NDef("bar", "NotImportant", {}, {}),
+       NDef("other", "NotImportant", {}, {}),
+       NDef("foo_1", "NotImportant", {"bar", "other", "bar:1", "^bar"}),
+       NDef("foo_2", "NotImportant", {"other:1", "bar:2", "^bar"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
+
+  EXPECT_NE(graph.GetNode("foo_1"), nullptr);
+  TF_EXPECT_OK(graph.DeleteNodes({"foo_1"}));
+
+  EXPECT_EQ(graph.graph()->node_size(), 3);
+  EXPECT_EQ(graph.GetNode("foo_1"), nullptr);
+
+  CheckNode(graph, "bar", "NotImportant", "", {}, {}, {"foo_2:1"});
+  CheckNode(graph, "other", "NotImportant", "", {}, {}, {"foo_2"});
+  CheckNode(graph, "foo_2", "NotImportant", "", {}, {"other:1", "bar:2"}, {});
+
+  CheckGraph(graph);
+}
+
+GraphDef SimpleDeleteNodeGraph() {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {"a:2"}),
+       NDef("c", "NotImportant", {"a:5", "^b"}), NDef("d", "NotImportant", {}),
+       NDef("e", "NotImportant", {"d:2"}),
+       NDef("f", "NotImportant", {"d:3", "^e"})},
+      /*funcs=*/{});
+  return graph_def;
+}
+
+TEST(MutableGraphViewTest, DeleteNodesWithFanoutsBeingDeleted) {
+  GraphDef graph_def = SimpleDeleteNodeGraph();
+
+  MutableGraphView graph(&graph_def);
+  EXPECT_NE(graph.GetNode("a"), nullptr);
+  EXPECT_NE(graph.GetNode("b"), nullptr);
+  EXPECT_NE(graph.GetNode("c"), nullptr);
+  TF_EXPECT_OK(graph.DeleteNodes({"c", "a", "b"}));
+
+  EXPECT_EQ(graph.graph()->node_size(), 3);
+  EXPECT_EQ(graph.GetNode("a"), nullptr);
+  EXPECT_EQ(graph.GetNode("b"), nullptr);
+  EXPECT_EQ(graph.GetNode("c"), nullptr);
+
+  CheckNode(graph, "d", "NotImportant", "", {}, {}, {"e", "f"});
+  CheckNode(graph, "e", "NotImportant", "", {}, {"d:2"}, {"^f"});
+  CheckNode(graph, "f", "NotImportant", "", {}, {"d:3", "^e"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, DeleteMissingNodes) {
+  GraphDef graph_def = SimpleDeleteNodeGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  EXPECT_EQ(graph.GetNode("g"), nullptr);
+  EXPECT_EQ(graph.GetNode("h"), nullptr);
+  TF_EXPECT_OK(graph.DeleteNodes({"g", "h"}));
+
+  EXPECT_EQ(graph.graph()->node_size(), 6);
+  EXPECT_EQ(graph.GetNode("g"), nullptr);
+  EXPECT_EQ(graph.GetNode("h"), nullptr);
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"b", "c"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {"a:2"}, {"^c"});
+  CheckNode(graph, "c", "NotImportant", "", {}, {"a:5", "^b"}, {});
+  CheckNode(graph, "d", "NotImportant", "", {}, {}, {"e", "f"});
+  CheckNode(graph, "e", "NotImportant", "", {}, {"d:2"}, {"^f"});
+  CheckNode(graph, "f", "NotImportant", "", {}, {"d:3", "^e"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, DeleteMissingNodesAndNodesWithFanoutsBeingDeleted) {
+  GraphDef graph_def = SimpleDeleteNodeGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  EXPECT_NE(graph.GetNode("d"), nullptr);
+  EXPECT_NE(graph.GetNode("e"), nullptr);
+  EXPECT_NE(graph.GetNode("f"), nullptr);
+  TF_EXPECT_OK(graph.DeleteNodes({"d", "e", "f", "g", "h"}));
+
+  EXPECT_EQ(graph.graph()->node_size(), 3);
+  EXPECT_EQ(graph.GetNode("d"), nullptr);
+  EXPECT_EQ(graph.GetNode("e"), nullptr);
+  EXPECT_EQ(graph.GetNode("f"), nullptr);
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"b", "c"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {"a:2"}, {"^c"});
+  CheckNode(graph, "c", "NotImportant", "", {}, {"a:5", "^b"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, DeleteNodesWithError) {
+  GraphDef graph_def = SimpleDeleteNodeGraph();
+
+  MutableGraphView graph(&graph_def);
+
+  Status s = graph.DeleteNodes({"b", "a"});
+  EXPECT_FALSE(s.ok());
+  string error_msg =
+      "MutableGraphView::DeleteNodes(nodes_to_delete={a, b}) error: can't "
+      "delete node(s) with retained fanouts(s) [a, b].";
+  EXPECT_EQ(s.error_message(), error_msg);
+
+  EXPECT_EQ(graph.graph()->node_size(), 6);
+
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"b", "c"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {"a:2"}, {"^c"});
+  CheckNode(graph, "c", "NotImportant", "", {}, {"a:5", "^b"}, {});
+  CheckNode(graph, "d", "NotImportant", "", {}, {}, {"e", "f"});
+  CheckNode(graph, "e", "NotImportant", "", {}, {"d:2"}, {"^f"});
+  CheckNode(graph, "f", "NotImportant", "", {}, {"d:3", "^e"}, {});
+
+  CheckGraph(graph);
+}
+
+TEST(MutableGraphViewTest, DeleteNodesWithLargeError) {
+  // Actual node.op() is not important in this test.
+  GraphDef graph_def = test::function::GDef(
+      {NDef("a", "NotImportant", {}, {}), NDef("b", "NotImportant", {"a:2"}),
+       NDef("c", "NotImportant", {"^b"}), NDef("d", "NotImportant", {"c:6"}),
+       NDef("e", "NotImportant", {"d:2"}),
+       NDef("f", "NotImportant", {"d:3", "^e"}),
+       NDef("g", "NotImportant", {"f"}), NDef("h", "NotImportant", {"a"}),
+       NDef("i", "NotImportant", {"b"}), NDef("j", "NotImportant", {"c"}),
+       NDef("k", "NotImportant", {"d"}), NDef("l", "NotImportant", {"e"}),
+       NDef("m", "NotImportant", {"f"})},
+      /*funcs=*/{});
+
+  MutableGraphView graph(&graph_def);
 
-  NodeDef* bar = graph.GetNode("bar");
-  NodeDef* other = graph.GetNode("other");
-  NodeDef* foo_2 = graph.GetNode("foo_2");
+  Status s = graph.DeleteNodes({"a", "b", "c", "d", "e", "f"});
+  EXPECT_FALSE(s.ok());
+  string error_msg =
+      "MutableGraphView::DeleteNodes(nodes_to_delete={a, b, c, d, e, ...}) "
+      "error: can't delete node(s) with retained fanouts(s) [a, b, c, d, e, "
+      "...].";
+  EXPECT_EQ(s.error_message(), error_msg);
 
-  bool include_control_fanouts = true;
-  auto bar_fanouts = graph.GetFanouts(*bar, include_control_fanouts);
-  auto other_fanouts = graph.GetFanouts(*other, include_control_fanouts);
+  EXPECT_EQ(graph.graph()->node_size(), 13);
 
-  EXPECT_EQ(bar_fanouts.size(), 2);
-  EXPECT_EQ(bar_fanouts.count(MutableGraphView::InputPort(foo_2, 1)), 1);
-  EXPECT_EQ(bar_fanouts.count(MutableGraphView::InputPort(foo_2, -1)), 1);
+  CheckNode(graph, "a", "NotImportant", "", {}, {}, {"b", "h"});
+  CheckNode(graph, "b", "NotImportant", "", {}, {"a:2"}, {"^c", "i"});
+  CheckNode(graph, "c", "NotImportant", "", {}, {"^b"}, {"d", "j"});
+  CheckNode(graph, "d", "NotImportant", "", {}, {"c:6"}, {"e", "f", "k"});
+  CheckNode(graph, "e", "NotImportant", "", {}, {"d:2"}, {"^f", "l"});
+  CheckNode(graph, "f", "NotImportant", "", {}, {"d:3", "^e"}, {"g", "m"});
+  CheckNode(graph, "g", "NotImportant", "", {}, {"f"}, {});
+  CheckNode(graph, "h", "NotImportant", "", {}, {"a"}, {});
+  CheckNode(graph, "i", "NotImportant", "", {}, {"b"}, {});
+  CheckNode(graph, "j", "NotImportant", "", {}, {"c"}, {});
+  CheckNode(graph, "k", "NotImportant", "", {}, {"d"}, {});
+  CheckNode(graph, "l", "NotImportant", "", {}, {"e"}, {});
+  CheckNode(graph, "m", "NotImportant", "", {}, {"f"}, {});
 
-  EXPECT_EQ(other_fanouts.size(), 1);
-  EXPECT_EQ(other_fanouts.count(MutableGraphView::InputPort(foo_2, 0)), 1);
+  CheckGraph(graph);
 }
 
 }  // namespace
diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc
index 38fc1fff329eda5b80bb771442f2c543bd27e85d..59400dc479b70a14f9af2443bcd9bfd7fe2f14c9 100644
--- a/tensorflow/core/grappler/op_types.cc
+++ b/tensorflow/core/grappler/op_types.cc
@@ -47,6 +47,12 @@ bool IsAnyDiv(const NodeDef& node) {
          node.op() == "FloorDiv" || node.op() == "TruncateDiv";
 }
 
+bool IsAnyMaxPool(const NodeDef& node) {
+  const auto& op = node.op();
+  return op == "MaxPool" || op == "MaxPoolV2" || op == "MaxPool3D" ||
+         op == "MaxPoolWithArgmax" || op == "FractionalMaxPool";
+}
+
 bool IsApproximateEqual(const NodeDef& node) {
   return node.op() == "ApproximateEqual";
 }
@@ -164,18 +170,13 @@ bool IsDiv(const NodeDef& node) { return node.op() == "Div"; }
 bool IsElementWiseMonotonic(const NodeDef& node, bool* is_non_decreasing) {
   static const gtl::FlatSet<string>* const kMonotonicNonDecreasingOps =
       CHECK_NOTNULL((new gtl::FlatSet<string>{
-          "Asinh", "Atanh",   "Ceil",  "Elu",  "Erf",  "Exp",   "Expm1",
-          "Floor", "Log",     "Log1p", "Relu", "Relu", "Relu6", "Rint",
-          "Selu",  "Sigmoid", "Sign",  "Sinh", "Sqrt", "Tanh",
+          "Acosh", "Asin", "Asinh",    "Atan",     "Atanh", "Ceil",
+          "Elu",   "Erf",  "Exp",      "Expm1",    "Floor", "Log",
+          "Log1p", "Relu", "Relu6",    "Rint",     "Selu",  "Sigmoid",
+          "Sign",  "Sinh", "Softsign", "Softplus", "Sqrt",  "Tanh",
       }));
   static const gtl::FlatSet<string>* const kMonotonicNonIncreasingOps =
-      CHECK_NOTNULL((new gtl::FlatSet<string>{
-          "Inv",
-          "Reciprocal",
-          "Erfc",
-          "Rsqrt",
-          "Neg",
-      }));
+      CHECK_NOTNULL((new gtl::FlatSet<string>{"Acos", "Erfc", "Neg", "Rsqrt"}));
   if (kMonotonicNonDecreasingOps->count(node.op()) > 0) {
     if (is_non_decreasing) {
       *is_non_decreasing = true;
@@ -247,6 +248,11 @@ bool IsIdentityNSingleInput(const NodeDef& node) {
          node.attr().at("T").list().type_size() == 1;
 }
 
+bool IsIf(const NodeDef& node) {
+  const auto& op = node.op();
+  return op == "If" || op == "StatelessIf";
+}
+
 bool IsIgamma(const NodeDef& node) { return node.op() == "Igamma"; }
 
 bool IsIgammac(const NodeDef& node) { return node.op() == "Igammac"; }
@@ -273,8 +279,8 @@ bool IsLogicalOr(const NodeDef& node) { return node.op() == "LogicalOr"; }
 
 bool IsMatMul(const NodeDef& node) {
   const auto& op = node.op();
-  return op == "MatMul" || op == "BatchMatMul" || op == "QuantizedMatMul" ||
-         op == "SparseMatMul";
+  return op == "MatMul" || op == "BatchMatMul" || op == "SparseMatMul" ||
+         IsQuantizedMatMul(node);
 }
 
 bool IsMax(const NodeDef& node) { return node.op() == "Max"; }
@@ -315,6 +321,8 @@ bool IsNextIteration(const NodeDef& node) {
   return op == "NextIteration" || op == "RefNextIteration";
 }
 
+bool IsOnesLike(const NodeDef& node) { return node.op() == "OnesLike"; }
+
 bool IsPack(const NodeDef& node) { return node.op() == "Pack"; }
 
 bool IsPad(const NodeDef& node) {
@@ -336,10 +344,16 @@ bool IsPolygamma(const NodeDef& node) { return node.op() == "Polygamma"; }
 
 bool IsPow(const NodeDef& node) { return node.op() == "Pow"; }
 
-bool IsPrint(const NodeDef& node) { return node.op() == "Print"; }
+bool IsPrint(const NodeDef& node) {
+  return node.op() == "Print" || node.op() == "PrintV2";
+}
 
 bool IsProd(const NodeDef& node) { return node.op() == "Prod"; }
 
+bool IsQuantizedMatMul(const NodeDef& node) {
+  return node.op() == "QuantizedMatMul" || node.op() == "QuantizedMatMulV2";
+}
+
 bool IsQueue(const NodeDef& node) {
   return str_util::EndsWith(node.op(), "QueueV2");
 }
@@ -350,6 +364,10 @@ bool IsRandomShuffle(const NodeDef& node) {
 
 bool IsRank(const NodeDef& node) { return node.op() == "Rank"; }
 
+bool IsReadVariableOp(const NodeDef& node) {
+  return node.op() == "ReadVariableOp";
+}
+
 bool IsReal(const NodeDef& node) { return node.op() == "Real"; }
 
 bool IsRealDiv(const NodeDef& node) { return node.op() == "RealDiv"; }
@@ -524,6 +542,13 @@ bool IsVariable(const NodeDef& node) {
          op == "VarHandleOp" || op == "ReadVariableOp";
 }
 
+bool IsWhile(const NodeDef& node) {
+  const auto& op = node.op();
+  return op == "While" || op == "StatelessWhile";
+}
+
+bool IsZerosLike(const NodeDef& node) { return node.op() == "ZerosLike"; }
+
 bool IsZeta(const NodeDef& node) { return node.op() == "Zeta"; }
 
 namespace {
@@ -551,6 +576,29 @@ bool MaybeHasRefInput(const NodeDef& node) {
   return false;
 }
 
+bool IsDataset(const NodeDef& node) {
+  const string& op = node.op();
+  // See `GetNodeClassForOp` in core/graph/graph.cc.
+  return op == "IteratorGetNext" || op == "IteratorGetNextSync" ||
+         op == "DatasetToSingleElement" || op == "ReduceDataset";
+}
+
+bool IsStateful(const NodeDef node, const OpRegistryInterface* op_registry) {
+  const OpDef* op_def = nullptr;
+  const string& op_name = node.op();
+  Status status = op_registry->LookUpOpDef(op_name, &op_def);
+  if (!status.ok()) {
+    LOG(WARNING) << "Failed to lookup OpDef for " << op_name
+                 << ". Error: " << status.error_message();
+    return false;
+  }
+  return op_def->is_stateful();
+}
+
+bool IsStateful(const NodeDef node) {
+  return IsStateful(node, OpRegistry::Global());
+}
+
 bool IsFreeOfSideEffect(const NodeDef& node,
                         const OpRegistryInterface* op_registry) {
   // Placeholders must be preserved to keep the graph feedable.
@@ -696,7 +744,6 @@ bool IsUnaryElementWise(const NodeDef& node) {
           "Asin",
           "Asinh",
           "Atan",
-          "Atan2",
           "Atanh",
           "Ceil",
           "ComplexAbs",
diff --git a/tensorflow/core/grappler/op_types.h b/tensorflow/core/grappler/op_types.h
index 67897e8512d7dc6e4774c066297674629dd4f714..bc1bb33772d8c8b664aeef280c779206f6b4e5fa 100644
--- a/tensorflow/core/grappler/op_types.h
+++ b/tensorflow/core/grappler/op_types.h
@@ -28,6 +28,7 @@ bool IsAll(const NodeDef& node);
 bool IsAngle(const NodeDef& node);
 bool IsAny(const NodeDef& node);
 bool IsAnyDiv(const NodeDef& node);
+bool IsAnyMaxPool(const NodeDef& node);
 bool IsApproximateEqual(const NodeDef& node);
 bool IsAvgPoolGrad(const NodeDef& node);
 bool IsAssert(const NodeDef& node);
@@ -75,6 +76,7 @@ bool IsHistogramSummary(const NodeDef& node);
 bool IsIdentity(const NodeDef& node);
 bool IsIdentityN(const NodeDef& node);
 bool IsIdentityNSingleInput(const NodeDef& node);
+bool IsIf(const NodeDef& node);
 bool IsIgamma(const NodeDef& node);
 bool IsIgammac(const NodeDef& node);
 bool IsImag(const NodeDef& node);
@@ -99,10 +101,12 @@ bool IsMod(const NodeDef& node);
 bool IsMul(const NodeDef& node);
 bool IsMatMul(const NodeDef& node);
 bool IsNextIteration(const NodeDef& node);
+bool IsOnesLike(const NodeDef& node);
 bool IsPack(const NodeDef& node);
 bool IsPad(const NodeDef& node);
 bool IsPack(const NodeDef& node);
 bool IsPartitionedCall(const NodeDef& node);
+bool IsQuantizedMatMul(const NodeDef& node);
 bool IsNeg(const NodeDef& node);
 bool IsNoOp(const NodeDef& node);
 bool IsNotEqual(const NodeDef& node);
@@ -114,6 +118,7 @@ bool IsPow(const NodeDef& node);
 bool IsQueue(const NodeDef& node);
 bool IsRandomShuffle(const NodeDef& node);
 bool IsRank(const NodeDef& node);
+bool IsReadVariableOp(const NodeDef& node);
 bool IsReal(const NodeDef& node);
 bool IsRealDiv(const NodeDef& node);
 bool IsRelu(const NodeDef& node);
@@ -167,6 +172,8 @@ bool IsTruncateDiv(const NodeDef& node);
 bool IsTruncateMod(const NodeDef& node);
 bool IsUnpack(const NodeDef& node);
 bool IsVariable(const NodeDef& node);
+bool IsWhile(const NodeDef& node);
+bool IsZerosLike(const NodeDef& node);
 bool IsZeta(const NodeDef& node);
 
 // Return true if the op is an aggregation (e.g. Add, AddN).
@@ -181,6 +188,14 @@ bool IsCommutative(const NodeDef& node);
 // value.
 bool IsPersistent(const NodeDef& node);
 
+// Returns true if the node belongs to the NC_DATASET class (see graph/graph.h).
+bool IsDataset(const NodeDef& node);
+
+// Returns true if the node op is marked as stateful, or if it was not found in
+// op_registry.
+bool IsStateful(const NodeDef node, const OpRegistryInterface* op_registry);
+bool IsStateful(const NodeDef node);  // use OpRegistry::Global()
+
 bool IsFreeOfSideEffect(const NodeDef& node,
                         const OpRegistryInterface* op_registry);
 bool IsFreeOfSideEffect(const NodeDef& node);  // use OpRegistry::Global()
diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index 79578cb3ce0733bcfce1a382414c20881879e3e3..cdf6180ff5fd3a7da3f1b3cfeb0905ee06e8b54b 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -1,9 +1,7 @@
 licenses(["notice"])  # Apache 2.0
 
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_kernel_library")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
-load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
-load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
 
 # Platform specific build config
 load(
@@ -103,6 +101,7 @@ cc_library(
         "//tensorflow/core/grappler/costs:graph_properties",
         "//tensorflow/core/grappler/utils:symbolic_shapes",
         "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -110,6 +109,9 @@ tf_cc_test(
     name = "constant_folding_test",
     srcs = ["constant_folding_test.cc"],
     shard_count = 5,
+    # Running cuda on cpu will trigger tests guarded by GOOGLE_CUDA but NCHW
+    # won't be available, which result in test failures. So disable that.
+    tags = ["no_cuda_on_cpu_tap"],
     deps = [
         ":constant_folding",
         ":dependency_optimizer",
@@ -145,12 +147,16 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:graph_topology_view",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/utils:functions",
+        "//tensorflow/core/grappler/utils:traversal",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
     ],
@@ -177,6 +183,7 @@ tf_cuda_cc_test(
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/utils:grappler_test",
+        "@com_google_absl//absl/algorithm:container",
     ],
 )
 
@@ -252,12 +259,32 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:graph_topology_view",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/costs:graph_properties",
         "//tensorflow/core/grappler/utils:symbolic_shapes",
         "//tensorflow/core/grappler/utils:topological_sort",
+        "//tensorflow/core/grappler/utils:traversal",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
+    ],
+)
+
+cc_library(
+    name = "arithmetic_optimizer_test_utils",
+    testonly = 1,
+    hdrs = [
+        "arithmetic_optimizer_test_utils.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":arithmetic_optimizer",
+        ":constant_folding",
+        ":model_pruner",
+        "//tensorflow/core:test",
+        "//tensorflow/core/grappler/utils:grappler_test",
     ],
 )
 
@@ -267,7 +294,7 @@ tf_cuda_cc_test(
     srcs = ["arithmetic_optimizer_test.cc"],
     deps = [
         ":arithmetic_optimizer",
-        ":constant_folding",
+        ":arithmetic_optimizer_test_utils",
         ":model_pruner",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:cc_ops_internal",
@@ -282,7 +309,6 @@ tf_cuda_cc_test(
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
-        "//tensorflow/core/grappler/utils:grappler_test",
     ],
 )
 
@@ -297,14 +323,19 @@ cc_library(
         ":constant_folding",
         ":graph_optimizer",
         "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/costs:graph_properties",
         "//tensorflow/core/grappler/utils:topological_sort",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -409,6 +440,7 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:graph_topology_view",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core/grappler:op_types",
@@ -507,9 +539,9 @@ cc_library(
         ":custom_graph_optimizer_registry",
         ":debug_stripper",
         ":dependency_optimizer",
-        ":experimental_implementation_selector",
         ":function_optimizer",
         ":graph_optimizer",
+        ":implementation_selector",
         ":layout_optimizer",
         ":loop_optimizer",
         ":memory_optimizer",
@@ -524,10 +556,12 @@ cc_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler/clusters:virtual_cluster",
         "//tensorflow/core/grappler/utils:colocation",
         "//tensorflow/core/grappler/utils:functions",
         "//tensorflow/core/grappler/utils:topological_sort",
-        "@com_google_absl//absl/container:flat_hash_set",
+        "//tensorflow/core/grappler/verifiers:graph_verifier",
+        "//tensorflow/core/grappler/verifiers:structure_verifier",
         "@com_google_absl//absl/strings",
     ],
 )
@@ -540,6 +574,7 @@ tf_cuda_cc_test(
         ":custom_graph_optimizer_registry",
         ":meta_optimizer",
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:tensorflow",
@@ -606,16 +641,18 @@ cc_library(
         ":constant_folding",
         ":evaluation_utils",
         ":graph_optimizer",
-        "//tensorflow/core:core_cpu_lib",
+        "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:graph_topology_view",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler/costs:graph_properties",
         "//tensorflow/core/grappler/utils:frame",
+        "//tensorflow/core/grappler/utils:traversal",
         "@com_google_absl//absl/container:flat_hash_set",
     ],
 )
@@ -674,7 +711,7 @@ tf_cc_test(
     ],
 )
 
-cc_library(
+tf_kernel_library(
     name = "remapper",
     srcs = ["remapper.cc"],
     hdrs = [
@@ -691,6 +728,7 @@ cc_library(
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/costs:graph_properties",
+        "//tensorflow/core/grappler/utils:symbolic_shapes",
         "//tensorflow/core/grappler/utils:topological_sort",
         "@com_google_absl//absl/container:flat_hash_set",
     ],
@@ -702,6 +740,7 @@ tf_cuda_cc_test(
     deps = [
         ":remapper",
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:framework",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
@@ -758,7 +797,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":graph_optimizer",
-        "//tensorflow/core:core_cpu_lib",
+        "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
@@ -848,9 +887,9 @@ tf_cc_test(
 )
 
 cc_library(
-    name = "experimental_implementation_selector",
-    srcs = ["experimental_implementation_selector.cc"],
-    hdrs = ["experimental_implementation_selector.h"],
+    name = "implementation_selector",
+    srcs = ["implementation_selector.cc"],
+    hdrs = ["implementation_selector.h"],
     deps = [
         ":custom_graph_optimizer",
         ":custom_graph_optimizer_registry",
@@ -866,14 +905,14 @@ cc_library(
 )
 
 tf_cc_test(
-    name = "experimental_implementation_selector_test",
+    name = "implementation_selector_test",
     size = "small",
-    srcs = ["experimental_implementation_selector_test.cc"],
+    srcs = ["implementation_selector_test.cc"],
     deps = [
         ":custom_graph_optimizer",
         ":custom_graph_optimizer_registry",
-        ":experimental_implementation_selector",
         ":function_api_info",
+        ":implementation_selector",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index d35c00f29ecb1c1acedb41c29f08d20decf6476e..902cb3fdd84465d79d52e496315597c533e69252 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -22,6 +22,8 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/framework/node_def.pb.h"
@@ -31,6 +33,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
+#include "tensorflow/core/grappler/graph_topology_view.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
@@ -38,6 +41,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/symbolic_shapes.h"
 #include "tensorflow/core/grappler/utils/topological_sort.h"
+#include "tensorflow/core/grappler/utils/traversal.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/hash/hash.h"
@@ -2717,21 +2721,27 @@ class OptimizeMaxOrMinOfMonotonicStage : public ArithmeticOptimizerStage {
   ~OptimizeMaxOrMinOfMonotonicStage() override = default;
 
   bool IsSupported(const NodeDef* node) const override {
-    return IsMax(*node) || IsMin(*node);
+    return IsMax(*node) || IsMin(*node) || IsAnyMaxPool(*node);
   }
 
   Status TrySimplify(NodeDef* reduction_node,
                      string* simplified_node_name) override {
+    if (IsInPreserveSet(*reduction_node)) {
+      return Status::OK();
+    }
     NodeDef* inner_function;
     TF_RETURN_IF_ERROR(GetInputNode(reduction_node->input(0), &inner_function));
     // Optimize only if:
     // 0. inner_function is not in the preserve set,
     // 1. inner_function's Op is element-wise monotonic
     // 2. inner_function's output is not being consumed elsewhere.
+    // 3. is monotonic increasing if reduction_node is a pooling operation
+    //    since we don't have MinPool operations.
     bool is_non_decreasing = false;
     if (!IsInPreserveSet(*inner_function) &&
         IsElementWiseMonotonic(*inner_function, &is_non_decreasing) &&
-        ctx().node_map->GetOutputs(inner_function->name()).size() == 1) {
+        ctx().node_map->GetOutputs(inner_function->name()).size() == 1 &&
+        (is_non_decreasing || !IsAnyMaxPool(*reduction_node))) {
       // Swap the first inputs of the inner function Op & the reduction Op.
       NodeDef* inner_input;
       TF_RETURN_IF_ERROR(GetInputNode(inner_function->input(0), &inner_input));
@@ -3232,13 +3242,17 @@ class UniqueNodes {
   }
 
  private:
-  uint64 ComputeSignature(const NodeDef& node) const;
+  uint64 ComputeSignature(const NodeDef& node);
   bool SameNode(const NodeDef& node1, const NodeDef& node2) const;
 
-  std::unordered_map<uint64, std::vector<NodeDef*>> rep_;
+  absl::flat_hash_map<uint64, std::vector<NodeDef*>> rep_;
+  absl::flat_hash_map<const NodeDef*, uint64> memoized_signatures_;
 };
 
-uint64 UniqueNodes::ComputeSignature(const NodeDef& node) const {
+uint64 UniqueNodes::ComputeSignature(const NodeDef& node) {
+  auto it = memoized_signatures_.find(&node);
+  if (it != memoized_signatures_.end()) return it->second;
+
   uint64 h = Hash64(node.op());
   h = Hash64Combine(Hash64(node.device()), h);
 
@@ -3252,6 +3266,7 @@ uint64 UniqueNodes::ComputeSignature(const NodeDef& node) const {
     h = Hash64CombineUnordered(Hash64(attr.first), h);
     h = Hash64CombineUnordered(FastAttrValueHash(attr.second), h);
   }
+  memoized_signatures_.emplace(&node, h);
   return h;
 }
 
@@ -3272,31 +3287,29 @@ bool UniqueNodes::SameNode(const NodeDef& node1, const NodeDef& node2) const {
   // Compare inputs.
   if (IsCommutative(node1)) {
     std::vector<string> inputs1(node1.input().begin(), node1.input().end());
-    std::vector<string> inputs2(node2.input().begin(), node2.input().end());
     std::sort(inputs1.begin(), inputs1.end());
+    std::vector<string> inputs2(node2.input().begin(), node2.input().end());
     std::sort(inputs2.begin(), inputs2.end());
     return inputs1 == inputs2;
   } else {
-    std::vector<string> regular_inputs1;
-    std::vector<string> regular_inputs2;
-    std::vector<string> ctrl_inputs1;
-    std::vector<string> ctrl_inputs2;
-    for (int index = 0; index < node1.input_size(); ++index) {
+    // The order or ordinary inputs matters.
+    int index = 0;
+    for (; index < node1.input_size(); ++index) {
       if (IsControlInput(node1.input(index))) {
-        ctrl_inputs1.push_back(node1.input(index));
-        ctrl_inputs2.push_back(node2.input(index));
-      } else {
-        regular_inputs1.push_back(node1.input(index));
-        regular_inputs2.push_back(node2.input(index));
+        break;
+      } else if (node1.input(index) != node2.input(index)) {
+        return false;
       }
     }
-    if (regular_inputs1 != regular_inputs2) {
-      return false;
-    }
-    std::sort(ctrl_inputs1.begin(), ctrl_inputs1.end());
-    std::sort(ctrl_inputs2.begin(), ctrl_inputs2.end());
-    if (ctrl_inputs1 != ctrl_inputs2) {
-      return false;
+    // The order of control inputs does not matter.
+    if (index < node1.input_size()) {
+      std::vector<string> ctrl_inputs1(node1.input().begin() + index,
+                                       node1.input().end());
+      std::sort(ctrl_inputs1.begin(), ctrl_inputs1.end());
+      std::vector<string> ctrl_inputs2(node2.input().begin() + index,
+                                       node2.input().end());
+      std::sort(ctrl_inputs2.begin(), ctrl_inputs2.end());
+      return ctrl_inputs1 != ctrl_inputs2;
     }
   }
 
@@ -3313,25 +3326,6 @@ bool UniqueNodes::SameNode(const NodeDef& node1, const NodeDef& node2) const {
   return true;
 }
 
-namespace {
-
-bool FeedsInPlaceOp(const SimpleGraphView& graph_view, const NodeDef& node) {
-  const std::unordered_set<string> op_types_to_traverse = {
-      node.op(),    "Identity", "IdentityN", "Reshape",
-      "ExpandDims", "Enter",    "Switch",    "Merge"};
-  int node_idx = graph_view.index(node.name());
-  std::set<int> node_fanout;
-  graph_view.DepthFirstSearch(op_types_to_traverse, node_idx, &node_fanout);
-  for (int fanout : node_fanout) {
-    if (ModifiesInputsInPlace(graph_view.graph()->node(fanout))) {
-      return true;
-    }
-  }
-  return false;
-}
-
-}  // namespace
-
 bool ArithmeticOptimizer::CanDedup(const NodeDef& node) const {
   if (nodes_to_preserve_.find(node.name()) != nodes_to_preserve_.end()) {
     return false;
@@ -3342,31 +3336,48 @@ bool ArithmeticOptimizer::CanDedup(const NodeDef& node) const {
   if (node.device().find("SPU") != string::npos) {
     return false;
   }
-  // Workaround for Assert mistakenly being labeled as stateful.
-  if (IsAssert(node)) {
+  // Workaround for Assert and Print mistakenly being labeled as stateful.
+  if (IsAssert(node) || IsPrint(node)) {
     return true;
   }
   return IsFreeOfSideEffect(node);
 }
 
 void ArithmeticOptimizer::DedupComputations() {
-  bool stop = true;
-  SimpleGraphView graph_view;
-  if (!graph_view.Initialize(*optimized_graph_).ok()) {
-    LOG(WARNING) << "Failed to build SimpleGraphView.";
+  GraphTopologyView graph_view;
+  if (!graph_view.InitializeFromGraph(*optimized_graph_).ok()) {
+    LOG(WARNING) << "Failed to initialize GraphTopologyView.";
     return;
   }
-  std::set<int> duplicates;
+
+  const absl::flat_hash_set<string> ops_to_traverse = {
+      "Identity", "IdentityN", "Reshape", "ExpandDims",
+      "Enter",    "Switch",    "Merge"};
+
   // Populate feed_inplace_op;
-  std::unordered_set<NodeDef*> feeds_inplace_op;
-  for (int i = 0; i < optimized_graph_->node_size(); ++i) {
-    if (FeedsInPlaceOp(graph_view, optimized_graph_->node(i))) {
-      feeds_inplace_op.insert(optimized_graph_->mutable_node(i));
+  absl::flat_hash_set<const NodeDef*> feeds_inplace_op;
+
+  for (const NodeDef& root : optimized_graph_->node()) {
+    if (feeds_inplace_op.find(&root) != feeds_inplace_op.end()) continue;
+
+    if (ModifiesInputsInPlace(root)) {
+      const auto is_continue_traversal = [&](const NodeDef* node) -> bool {
+        return node->op() == root.op() || ops_to_traverse.count(node->op()) > 0;
+      };
+
+      DfsTraversal(graph_view, {&root}, TraversalDirection::kFollowInputs,
+                   DfsPredicates::Advance(is_continue_traversal),
+                   DfsCallbacks::PreOrder([&](const NodeDef* node) {
+                     feeds_inplace_op.insert(node);
+                   }));
     }
   }
+
+  bool stop = true;
+  std::set<int> duplicates;
+  UniqueNodes nodes;
   do {
     stop = true;
-    UniqueNodes nodes;
     for (int i = 0; i < optimized_graph_->node_size(); ++i) {
       if (duplicates.find(i) != duplicates.end()) {
         continue;
@@ -3571,7 +3582,7 @@ Status ArithmeticOptimizer::Optimize(Cluster* /*cluster*/,
 
   // Disable restricted graph rewrites.
   options_.unary_ops_composition &=
-      item.allowed_optimizations().non_differentiable_rewrites;
+      item.optimization_options().allow_non_differentiable_rewrites;
 
   if (options_.dedup_computations) {
     DedupComputations();
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
index 35d22898f6c15afd63df8b6136fad1f346172cd5..277833462298b8286fe30fb12454c64058b9a8a0 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
@@ -20,10 +20,9 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
-#include "tensorflow/core/grappler/optimizers/constant_folding.h"
+#include "tensorflow/core/grappler/optimizers/arithmetic_optimizer_test_utils.h"
 #include "tensorflow/core/grappler/optimizers/model_pruner.h"
 #include "tensorflow/core/grappler/utils.h"
-#include "tensorflow/core/grappler/utils/grappler_test.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -92,211 +91,6 @@ void VerifyGraphsMatch(const GraphDef& original_graph,
 }
 }  // namespace
 
-class ArithmeticOptimizerTest : public GrapplerTest {
- protected:
-  // Optimize a graph using ArithmeticOptimizer and prune all the nodes that no
-  // longer have any output consumers.
-  void OptimizeAndPrune(ArithmeticOptimizer* optimizer, GrapplerItem* item,
-                        GraphDef* output) {
-    TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output));
-    item->graph.Swap(output);
-    output->Clear();
-    TF_EXPECT_OK(ModelPruner().Optimize(nullptr, *item, output));
-  }
-
-  // Run ArithmeticOptimizer twice to make sure the rewrite is idempotent.
-  void OptimizeTwice(ArithmeticOptimizer* optimizer, GrapplerItem* item,
-                     GraphDef* output) {
-    TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output));
-    item->graph.Swap(output);
-    output->Clear();
-    TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output));
-  }
-
-  // Run ArithmeticOptimizer twice to make sure the rewrite is idempotent.
-  // Optionally run a constant folding pass before pruning.
-  void OptimizeTwiceAndPrune(ArithmeticOptimizer* optimizer, GrapplerItem* item,
-                             GraphDef* output, bool const_folding = false) {
-    TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output));
-
-    item->graph.Swap(output);
-    output->Clear();
-    TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output));
-
-    if (const_folding) {
-      item->graph.Swap(output);
-      output->Clear();
-      TF_EXPECT_OK(ConstantFolding(/*cpu_device=*/nullptr)
-                       .Optimize(nullptr, *item, output));
-    }
-
-    item->graph.Swap(output);
-    output->Clear();
-    TF_EXPECT_OK(ModelPruner().Optimize(nullptr, *item, output));
-  }
-
-  // TODO(ezhulenev): Make private. After migration to stages each test
-  // should explicitly enable required optimization for tests isolation
-  void DisableAllStages(ArithmeticOptimizer* optimizer) {
-    ArithmeticOptimizer::ArithmeticOptimizerOptions options;
-    options.dedup_computations = false;
-    options.combine_add_to_addn = false;
-    options.convert_sqrt_div_to_rsqrt_mul = false;
-    options.convert_pow = false;
-    options.convert_log1p = false;
-    options.optimize_max_or_min_of_monotonic = false;
-    options.fold_conjugate_into_transpose = false;
-    options.fold_multiply_into_conv = false;
-    options.fold_transpose_into_matmul = false;
-    options.hoist_common_factor_out_of_aggregation = false;
-    options.hoist_cwise_unary_chains = false;
-    options.minimize_broadcasts = false;
-    options.remove_identity_transpose = false;
-    options.remove_involution = false;
-    options.remove_idempotent = false;
-    options.remove_redundant_bitcast = false;
-    options.remove_redundant_cast = false;
-    options.remove_redundant_reshape = false;
-    options.remove_negation = false;
-    options.remove_logical_not = false;
-    options.reorder_cast_like_and_value_preserving = false;
-    options.replace_mul_with_square = false;
-    options.simplify_aggregation = false;
-    options.unary_ops_composition = false;
-    optimizer->options_ = options;
-  }
-
-  void DisableAddToAddNCombining(ArithmeticOptimizer* optimizer) {
-    optimizer->options_.combine_add_to_addn = false;
-  }
-
-  void EnableOnlyAddToAddNCombining(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.combine_add_to_addn = true;
-  }
-
-  void EnableOnlyFoldConjugateIntoTranspose(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.fold_conjugate_into_transpose = true;
-  }
-
-  void EnableOnlyFoldMultipleIntoConv(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.fold_multiply_into_conv = true;
-  }
-
-  void EnableOnlyFoldTransposeIntoMatMul(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.fold_transpose_into_matmul = true;
-  }
-
-  void EnableOnlyHoistCommonFactor(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.hoist_common_factor_out_of_aggregation = true;
-  }
-
-  void EnableOnlyMinimizeBroadcasts(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.minimize_broadcasts = true;
-  }
-
-  void EnableOnlyRemoveIdentityTranspose(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.remove_identity_transpose = true;
-  }
-
-  void EnableOnlyRemoveInvolution(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.remove_involution = true;
-  }
-
-  void EnableOnlyRemoveRedundantBitcast(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.remove_redundant_bitcast = true;
-  }
-
-  void EnableOnlyRemoveRedundantCast(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.remove_redundant_cast = true;
-  }
-
-  void EnableOnlyRemoveRedundantReshape(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.remove_redundant_reshape = true;
-  }
-
-  void EnableOnlyRemoveNegation(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.remove_negation = true;
-  }
-
-  void EnableOnlyReorderCastAndTranspose(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.reorder_cast_like_and_value_preserving = true;
-  }
-
-  void EnableOnlyReplaceMulWithSquare(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.replace_mul_with_square = true;
-  }
-
-  void EnableOnlyHoistCWiseUnaryChains(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.hoist_cwise_unary_chains = true;
-  }
-
-  void EnableOnlySqrtDivToRsqrtMul(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.convert_sqrt_div_to_rsqrt_mul = true;
-  }
-
-  void EnableOnlyConvertPow(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.convert_pow = true;
-  }
-
-  void EnableOnlyRemoveIdempotent(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.remove_idempotent = true;
-  }
-
-  void EnableOnlyRemoveLogicalNot(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.remove_logical_not = true;
-  }
-
-  void EnableOnlySimplifyAggregation(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.simplify_aggregation = true;
-  }
-
-  void EnableOnlyLog1p(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.convert_log1p = true;
-  }
-
-  void EnableOnlyOptimizeMaxOrMinOfMonotonic(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.optimize_max_or_min_of_monotonic = true;
-  }
-
-  void EnableOnlyExpm1(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.convert_expm1 = true;
-  }
-
-  void EnableOnlyUnaryOpsComposition(ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.unary_ops_composition = true;
-  }
-
-  void EnableOnlyRemoveStackStridedSliceSameAxis(
-      ArithmeticOptimizer* optimizer) {
-    DisableAllStages(optimizer);
-    optimizer->options_.remove_stack_strided_slice_same_axis = true;
-  }
-};
-
 TEST_F(ArithmeticOptimizerTest, NoOp) {
   // This trivial graph is so basic there's nothing to optimize.
   TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"});
@@ -3490,6 +3284,35 @@ TEST_F(ArithmeticOptimizerTest,
   VerifyGraphsMatch(item.graph, output, __LINE__);
 }
 
+TEST_F(ArithmeticOptimizerTest,
+       OptimizeMaxOrMinOfMonotonicElementWiseDoNotChangeFetchNodeReduction) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto x = ops::Const(s.WithOpName("x"), {2, 3}, {1, 2});
+  Output reshape = ops::Reshape(s.WithOpName("reshape"), x, {-1});
+  Output y = ops::Neg(s.WithOpName("y"), reshape);
+  Output z = ops::Max(s.WithOpName("z"), y, {0});
+
+  GrapplerItem item;
+  item.fetch = {"z"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+  ASSERT_EQ(1, tensors_expected.size());
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyOptimizeMaxOrMinOfMonotonic(&optimizer);
+  OptimizeTwice(&optimizer, &item, &output);
+
+  // Should be a NoOp since we are not allowed to change the output of fetch
+  // nodes.
+  VerifyGraphsMatch(item.graph, output, __LINE__);
+
+  auto tensors = EvaluateNodes(output, item.fetch);
+  ASSERT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<int>(tensors[0], tensors_expected[0]);
+  test::ExpectTensorEqual<int>(tensors[0], Tensor(-2));
+}
+
 TEST_F(ArithmeticOptimizerTest,
        OptimizeMaxOrMinOfMonotonicElementWiseNonIncreasing) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
@@ -3532,6 +3355,75 @@ TEST_F(ArithmeticOptimizerTest,
   EXPECT_EQ(2, required_node_count);
 }
 
+TEST_F(ArithmeticOptimizerTest,
+       OptimizeMaxOrMinOfMonotonicElementWiseNonIncreasingDoNotChangeMaxPool) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto x = ops::Const(s.WithOpName("x"), 1.5f, {3, 3, 3, 1});
+  Output neg = ops::Neg(s.WithOpName("neg"), x);
+  Output max_pool = ops::MaxPool(s.WithOpName("max_pool"), neg, {1, 2, 2, 1},
+                                 {1, 2, 2, 1}, "VALID");
+
+  GrapplerItem item;
+  item.fetch = {"max_pool"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+  ASSERT_EQ(1, tensors_expected.size());
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyOptimizeMaxOrMinOfMonotonic(&optimizer);
+  OptimizeTwice(&optimizer, &item, &output);
+
+  // Should be a NoOp
+  VerifyGraphsMatch(item.graph, output, __LINE__);
+
+  auto tensors = EvaluateNodes(output, item.fetch);
+  ASSERT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
+}
+
+TEST_F(ArithmeticOptimizerTest, OptimizeMaxOrMinOfMonotonicElementWiseMaxPool) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto x = ops::Const(s.WithOpName("x"), 1.5f, {3, 3, 3, 1});
+  Output sqrt = ops::Sqrt(s.WithOpName("sqrt"), x);
+  Output max_pool = ops::MaxPool(s.WithOpName("max_pool"), sqrt, {1, 2, 2, 1},
+                                 {1, 2, 2, 1}, "VALID");
+  Output final_out = ops::Identity(s.WithOpName("final_out"), max_pool);
+
+  GrapplerItem item;
+  item.fetch = {"final_out"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+  EXPECT_EQ(1, tensors_expected.size());
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyOptimizeMaxOrMinOfMonotonic(&optimizer);
+  OptimizeAndPrune(&optimizer, &item, &output);
+  auto tensors = EvaluateNodes(output, item.fetch);
+  EXPECT_EQ(1, tensors.size());
+
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
+  EXPECT_EQ(item.graph.node_size(), output.node_size());
+  // Check if the inputs are switched
+  int required_node_count = 0;
+  for (int i = 0; i < output.node_size(); ++i) {
+    const NodeDef& node = output.node(i);
+    if (node.name() == "sqrt") {
+      EXPECT_EQ("Sqrt", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("max_pool", node.input(0));
+      ++required_node_count;
+    } else if (node.name() == "max_pool") {
+      EXPECT_EQ("MaxPool", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      ++required_node_count;
+    }
+  }
+  EXPECT_EQ(2, required_node_count);
+}
+
 TEST_F(ArithmeticOptimizerTest, UnaryOpsComposition) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
 
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test_utils.h b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..94d0adc60923b0b41147891f843b98af76477653
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test_utils.h
@@ -0,0 +1,236 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_ARITHMETIC_OPTIMIZER_TEST_UTILS_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_ARITHMETIC_OPTIMIZER_TEST_UTILS_H_
+
+#include "tensorflow/core/grappler/optimizers/arithmetic_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/constant_folding.h"
+#include "tensorflow/core/grappler/optimizers/model_pruner.h"
+#include "tensorflow/core/grappler/utils/grappler_test.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace tensorflow {
+namespace grappler {
+
+class ArithmeticOptimizerTest : public GrapplerTest {
+ protected:
+  // Optimize a graph using ArithmeticOptimizer and prune all the nodes that no
+  // longer have any output consumers.
+  void OptimizeAndPrune(ArithmeticOptimizer* optimizer, GrapplerItem* item,
+                        GraphDef* output) {
+    TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output));
+    item->graph.Swap(output);
+    output->Clear();
+    TF_EXPECT_OK(ModelPruner().Optimize(nullptr, *item, output));
+  }
+
+  // Run ArithmeticOptimizer twice to make sure the rewrite is idempotent.
+  void OptimizeTwice(ArithmeticOptimizer* optimizer, GrapplerItem* item,
+                     GraphDef* output) {
+    TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output));
+    item->graph.Swap(output);
+    output->Clear();
+    TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output));
+  }
+
+  // Run ArithmeticOptimizer twice to make sure the rewrite is idempotent.
+  // Optionally run a constant folding pass before pruning.
+  void OptimizeTwiceAndPrune(ArithmeticOptimizer* optimizer, GrapplerItem* item,
+                             GraphDef* output, bool const_folding = false) {
+    TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output));
+
+    item->graph.Swap(output);
+    output->Clear();
+    TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output));
+
+    if (const_folding) {
+      item->graph.Swap(output);
+      output->Clear();
+      TF_EXPECT_OK(ConstantFolding(/*cpu_device=*/nullptr)
+                       .Optimize(nullptr, *item, output));
+    }
+
+    item->graph.Swap(output);
+    output->Clear();
+    TF_EXPECT_OK(ModelPruner().Optimize(nullptr, *item, output));
+  }
+
+  // TODO(ezhulenev): Make private. After migration to stages each test
+  // should explicitly enable required optimization for tests isolation
+  void DisableAllStages(ArithmeticOptimizer* optimizer) {
+    ArithmeticOptimizer::ArithmeticOptimizerOptions options;
+    options.dedup_computations = false;
+    options.combine_add_to_addn = false;
+    options.convert_sqrt_div_to_rsqrt_mul = false;
+    options.convert_pow = false;
+    options.convert_log1p = false;
+    options.optimize_max_or_min_of_monotonic = false;
+    options.fold_conjugate_into_transpose = false;
+    options.fold_multiply_into_conv = false;
+    options.fold_transpose_into_matmul = false;
+    options.hoist_common_factor_out_of_aggregation = false;
+    options.hoist_cwise_unary_chains = false;
+    options.minimize_broadcasts = false;
+    options.remove_identity_transpose = false;
+    options.remove_involution = false;
+    options.remove_idempotent = false;
+    options.remove_redundant_bitcast = false;
+    options.remove_redundant_cast = false;
+    options.remove_redundant_reshape = false;
+    options.remove_negation = false;
+    options.remove_logical_not = false;
+    options.reorder_cast_like_and_value_preserving = false;
+    options.replace_mul_with_square = false;
+    options.simplify_aggregation = false;
+    options.unary_ops_composition = false;
+    optimizer->options_ = options;
+  }
+
+  void DisableAddToAddNCombining(ArithmeticOptimizer* optimizer) {
+    optimizer->options_.combine_add_to_addn = false;
+  }
+
+  void EnableOnlyAddToAddNCombining(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.combine_add_to_addn = true;
+  }
+
+  void EnableOnlyFoldConjugateIntoTranspose(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.fold_conjugate_into_transpose = true;
+  }
+
+  void EnableOnlyFoldMultipleIntoConv(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.fold_multiply_into_conv = true;
+  }
+
+  void EnableOnlyFoldTransposeIntoMatMul(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.fold_transpose_into_matmul = true;
+  }
+
+  void EnableOnlyHoistCommonFactor(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.hoist_common_factor_out_of_aggregation = true;
+  }
+
+  void EnableOnlyMinimizeBroadcasts(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.minimize_broadcasts = true;
+  }
+
+  void EnableOnlyRemoveIdentityTranspose(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.remove_identity_transpose = true;
+  }
+
+  void EnableOnlyRemoveInvolution(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.remove_involution = true;
+  }
+
+  void EnableOnlyRemoveRedundantBitcast(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.remove_redundant_bitcast = true;
+  }
+
+  void EnableOnlyRemoveRedundantCast(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.remove_redundant_cast = true;
+  }
+
+  void EnableOnlyRemoveRedundantReshape(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.remove_redundant_reshape = true;
+  }
+
+  void EnableOnlyRemoveNegation(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.remove_negation = true;
+  }
+
+  void EnableOnlyReorderCastAndTranspose(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.reorder_cast_like_and_value_preserving = true;
+  }
+
+  void EnableOnlyReplaceMulWithSquare(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.replace_mul_with_square = true;
+  }
+
+  void EnableOnlyHoistCWiseUnaryChains(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.hoist_cwise_unary_chains = true;
+  }
+
+  void EnableOnlySqrtDivToRsqrtMul(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.convert_sqrt_div_to_rsqrt_mul = true;
+  }
+
+  void EnableOnlyConvertPow(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.convert_pow = true;
+  }
+
+  void EnableOnlyRemoveIdempotent(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.remove_idempotent = true;
+  }
+
+  void EnableOnlyRemoveLogicalNot(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.remove_logical_not = true;
+  }
+
+  void EnableOnlySimplifyAggregation(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.simplify_aggregation = true;
+  }
+
+  void EnableOnlyLog1p(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.convert_log1p = true;
+  }
+
+  void EnableOnlyOptimizeMaxOrMinOfMonotonic(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.optimize_max_or_min_of_monotonic = true;
+  }
+
+  void EnableOnlyExpm1(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.convert_expm1 = true;
+  }
+
+  void EnableOnlyUnaryOpsComposition(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.unary_ops_composition = true;
+  }
+
+  void EnableOnlyRemoveStackStridedSliceSameAxis(
+      ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.remove_stack_strided_slice_same_axis = true;
+  }
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_ARITHMETIC_OPTIMIZER_TEST_UTILS_H_
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index 5e3e5d6af9a7dd435a15f83e94434de0c25ed7aa..37fa7d9a3e3cb35032ed0003fb45bfaa70fc3309 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -17,6 +17,10 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
 
+#include <cmath>
+
+#include "absl/strings/string_view.h"
+#include "absl/strings/substitute.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/function.pb.h"
@@ -34,6 +38,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/evaluation_utils.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/symbolic_shapes.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
@@ -167,6 +172,55 @@ bool HasTPUAttributes(const NodeDef& node) {
   return false;
 }
 
+template <typename T>
+bool PackedValuesNotEqual(T a, T b) {
+  return a != b;
+}
+
+template <>
+bool PackedValuesNotEqual(float a, float b) {
+  return reinterpret_cast<int32_t&>(a) != reinterpret_cast<int32_t&>(b);
+}
+
+template <>
+bool PackedValuesNotEqual(double a, double b) {
+  return reinterpret_cast<int64_t&>(a) != reinterpret_cast<int64_t&>(b);
+}
+
+float QuantizedTypeMinAsFloat(DataType data_type) {
+  switch (data_type) {
+    case DT_QINT8:
+      return Eigen::NumTraits<qint8>::lowest();
+    case DT_QUINT8:
+      return Eigen::NumTraits<quint8>::lowest();
+    case DT_QINT16:
+      return Eigen::NumTraits<qint16>::lowest();
+    case DT_QUINT16:
+      return Eigen::NumTraits<quint16>::lowest();
+    case DT_QINT32:
+      return Eigen::NumTraits<qint32>::lowest();
+    default:
+      return 0.0f;
+  }
+}
+
+float QuantizedTypeMaxAsFloat(DataType data_type) {
+  switch (data_type) {
+    case DT_QINT8:
+      return Eigen::NumTraits<qint8>::highest();
+    case DT_QUINT8:
+      return Eigen::NumTraits<quint8>::highest();
+    case DT_QINT16:
+      return Eigen::NumTraits<qint16>::highest();
+    case DT_QUINT16:
+      return Eigen::NumTraits<quint16>::highest();
+    case DT_QINT32:
+      return Eigen::NumTraits<qint32>::highest();
+    default:
+      return 0.0f;
+  }
+}
+
 }  // namespace
 
 ConstantFolding::ConstantFolding(RewriterConfig::Toggle opt_level,
@@ -716,6 +770,61 @@ Status ConstantFolding::MaterializeReductionIndices(
   return Status::OK();
 }
 
+Status ConstantFolding::MaterializeConstantValuedNode(
+    NodeDef* node, const GraphProperties& properties) {
+  // Nodes that generate constant-valued outputs can be represented compactly in
+  // compressed format, regardless of their shape.
+  const std::vector<OpInfo::TensorProperties>& output_props =
+      properties.GetOutputProperties(node->name());
+  if (output_props.size() != 1) return Status::OK();
+  const auto& output_shape = output_props[0].shape();
+  if (!PartialTensorShape(output_shape).IsFullyDefined()) {
+    return Status::OK();
+  }
+  if (IsFill(*node)) {
+    const auto output_dtype = output_props[0].dtype();
+    NodeDef* input_node = nullptr;
+    for (int i = 0; i < 2; ++i) {
+      input_node = node_map_->GetNode(NodeName(node->input(i)));
+      if (input_node == nullptr || !IsReallyConstant(*input_node)) {
+        return Status::OK();
+      }
+    }
+    TF_RETURN_IF_ERROR(CheckAttrExists(*input_node, "value"));
+    const TensorProto& input_tensor = input_node->attr().at("value").tensor();
+    // TODO(rmlarsen): Handle the case where the value is stored in
+    // tensor_content.
+    if (!input_tensor.tensor_content().empty()) {
+      return Status::OK();
+    }
+    TensorProto* tensor = (*node->mutable_attr())["value"].mutable_tensor();
+    // Copy the input tensor to the fill node, set the output shape, and
+    // change the nodd type to Const.
+    *tensor = input_tensor;
+    *(tensor->mutable_tensor_shape()) = output_shape;
+    (*node->mutable_attr())["dtype"].set_type(output_dtype);
+    node->mutable_attr()->erase("T");
+    node->mutable_attr()->erase("index_type");
+    node->set_op("Const");
+    for (int i = 0; i < 2; i++) {
+      // Change inputs to a control inputs.
+      const string ctrl_dep = AsControlDependency(node->input(i));
+      node_map_->UpdateInput(node->name(), node->input(i), ctrl_dep);
+      node->set_input(i, ctrl_dep);
+    }
+    graph_modified_ = true;
+  } else {
+    double value =
+        (IsZerosLike(*node) ? 0.0 : (IsOnesLike(*node) ? 1.0 : -1.0));
+    bool success = false;
+    if (value >= 0) {
+      TF_RETURN_IF_ERROR(ReplaceOperationWithConstant(
+          value, properties, output_shape, node, graph_, &success));
+    }
+  }
+  return Status::OK();
+}
+
 Status ConstantFolding::MaterializeConstants(
     const GraphProperties& properties) {
   const int node_count = graph_->node_size();
@@ -726,6 +835,8 @@ Status ConstantFolding::MaterializeConstants(
       TF_RETURN_IF_ERROR(MaterializeBroadcastGradientArgs(node, properties));
     } else if (IsReduction(node)) {
       TF_RETURN_IF_ERROR(MaterializeReductionIndices(&node, properties));
+    } else if (IsFill(node) || IsZerosLike(node) || IsOnesLike(node)) {
+      TF_RETURN_IF_ERROR(MaterializeConstantValuedNode(&node, properties));
     }
   }
   return Status::OK();
@@ -751,6 +862,12 @@ bool ConstantFolding::IsFoldable(const NodeDef& node) const {
   if (ModifiesFrameInfo(node)) {
     return false;
   }
+
+  // Removing LoopCond nodes can screw up the partitioner.
+  if (node.op() == "LoopCond") {
+    return false;
+  }
+
   // Skip constants, they're already folded
   if (IsConstant(node)) {
     return false;
@@ -864,6 +981,11 @@ Status CreateConstantTensorAttrValue(DataType type, double value,
       SET_TENSOR_VAL_CASE(DT_UINT16, int32, int);
       SET_TENSOR_VAL_CASE(DT_INT8, int32, int);
       SET_TENSOR_VAL_CASE(DT_UINT8, int32, int);
+      SET_TENSOR_VAL_CASE(DT_QINT32, int32, int);
+      SET_TENSOR_VAL_CASE(DT_QINT16, int32, int);
+      SET_TENSOR_VAL_CASE(DT_QUINT16, int32, int);
+      SET_TENSOR_VAL_CASE(DT_QINT8, int32, int);
+      SET_TENSOR_VAL_CASE(DT_QUINT8, int32, int);
       SET_TENSOR_VAL_CASE(DT_BOOL, bool, bool);
     default:
       return errors::InvalidArgument("Unsupported type: ", type);
@@ -891,12 +1013,48 @@ DataType GetDataTypeFromNodeOrProps(const NodeDef& node,
   return dtype;
 }
 
+// Checks whether the shape of the const input of the Mul op is valid to perform
+// the MulConvPushDown optimization.
+bool IsValidConstShapeForMulConvPushDown(
+    const string& data_format, const TensorShapeProto& filter_shape,
+    const TensorShapeProto& mul_const_input_shape) {
+  // If the const is a scalar, or it has fewer or same number of dimensions
+  // than the filter and it only has single element, the optimization should
+  // work.
+  if (mul_const_input_shape.dim_size() <= data_format.size() &&
+      TensorShape(mul_const_input_shape).num_elements() == 1) {
+    return true;
+  }
+
+  // Otherwise, check the eligibility according to data format.
+  if (data_format == "NHWC" || data_format == "NDHWC") {
+    TensorShapeProto new_filter_shape;
+    if (!ShapeAfterBroadcast(filter_shape, mul_const_input_shape,
+                             &new_filter_shape)) {
+      return false;
+    }
+    if (!ShapesSymbolicallyEqual(filter_shape, new_filter_shape)) {
+      return false;
+    }
+    // Only the last dimension could be larger than one, since broadcasting over
+    // the last dimension (the output channel) will result in invalid filter.
+    for (int i = 0; i < mul_const_input_shape.dim_size() - 1; ++i) {
+      if (mul_const_input_shape.dim(i).size() > 1) return false;
+    }
+    return true;
+  } else if (data_format == "NCHW" || data_format == "NCDHW") {
+    // TODO(laigd): support NCHW and NCDHW (b/111214513).
+    return false;
+  }
+  return false;
+}
+
 }  // namespace
 
 // static
 Status ConstantFolding::CreateNodeDef(const string& name,
-                                      const TensorValue& tensor,
-                                      NodeDef* node) {
+                                      const TensorValue& tensor, NodeDef* node,
+                                      size_t original_size) {
   node->set_name(name);
   node->set_op("Const");
 
@@ -911,29 +1069,28 @@ Status ConstantFolding::CreateNodeDef(const string& name,
   // Use the packed representation whenever possible to avoid generating large
   // graphdefs. Moreover, avoid repeating the last values if they're equal.
   if (tensor->NumElements() > 4) {
-#define POPULATE_TENSOR_PROTO(tensor, t, TYPE, NAME)                  \
-  {                                                                   \
-    const TYPE* val_ptr = tensor->flat<TYPE>().data();                \
-    TYPE last = *val_ptr;                                             \
-    int64 last_index = 0;                                             \
-    for (int64 i = 0; i < tensor->NumElements(); ++i) {               \
-      TYPE cur = *val_ptr++;                                          \
-      if (cur != last) {                                              \
-        last = cur;                                                   \
-        last_index = i;                                               \
-      }                                                               \
-    }                                                                 \
-    if (last_index < kint32max) {                                     \
-      optimized = true;                                               \
-      encoded_size = (last_index + 1) * sizeof(NAME);                 \
-      t->mutable_##NAME##_val()->Reserve(last_index + 1);             \
-      t->mutable_##NAME##_val()->AddNAlreadyReserved(last_index + 1); \
-      val_ptr = tensor->flat<TYPE>().data();                          \
-      for (int64 i = 0; i <= last_index; ++i) {                       \
-        t->set_##NAME##_val(i, *val_ptr++);                           \
-      }                                                               \
-    }                                                                 \
-  }                                                                   \
+#define POPULATE_TENSOR_PROTO(tensor, t, TYPE, NAME)                      \
+  {                                                                       \
+    const auto* val_ptr = tensor->flat<TYPE>().data();                    \
+    auto last = *val_ptr;                                                 \
+    int64 last_index = 0;                                                 \
+    for (int64 i = 0; i < tensor->NumElements(); ++i) {                   \
+      TYPE cur = *val_ptr++;                                              \
+      if (PackedValuesNotEqual(cur, last)) {                              \
+        last = cur;                                                       \
+        last_index = i;                                                   \
+      }                                                                   \
+    }                                                                     \
+    if (last_index < kint32max) {                                         \
+      optimized = true;                                                   \
+      encoded_size = (last_index + 1) * sizeof(NAME);                     \
+      t->mutable_##NAME##_val()->Reserve(last_index + 1);                 \
+      const auto* src_ptr = tensor->flat<TYPE>().data();                  \
+      auto* dst_ptr =                                                     \
+          t->mutable_##NAME##_val()->AddNAlreadyReserved(last_index + 1); \
+      std::copy(src_ptr, src_ptr + last_index + 1, dst_ptr);              \
+    }                                                                     \
+  }                                                                       \
   break
 
     switch (tensor->dtype()) {
@@ -969,16 +1126,19 @@ Status ConstantFolding::CreateNodeDef(const string& name,
     t->set_dtype(tensor->dtype());
     tensor->shape().AsProto(t->mutable_tensor_shape());
   } else {
+    // DT_HALF, DT_BFLOAT16, DT_QINT32, DT_QINT16, DT_QUINT16, DT_QINT8,
+    // DT_QUINT8
     tensor->AsProtoTensorContent(t);
     encoded_size = t->tensor_content().size();
   }
   node->mutable_attr()->insert({"value", attr_tensor});
 
-  if (encoded_size < 10 * 1024 * 1024) {
-    return Status::OK();
+  if (encoded_size > original_size && encoded_size >= 10 * 1024 * 1024) {
+    return errors::InvalidArgument(
+        strings::StrCat("Can't fold ", name, ", its size would be too large (",
+                        encoded_size, " >= ", 10 * 1024 * 1024, " bytes)"));
   }
-  return errors::InvalidArgument(
-      strings::StrCat("Can't fold ", name, ", its size would be too large"));
+  return Status::OK();
 }
 
 Status ConstantFolding::EvaluateNode(const NodeDef& node,
@@ -1004,6 +1164,7 @@ Status ConstantFolding::EvaluateOneFoldable(const NodeDef& node,
     }
   });
 
+  size_t total_inputs_size = 0;
   for (const auto& input : node.input()) {
     const TensorId input_tensor = ParseTensorName(input);
     if (input_tensor.index() < 0) {
@@ -1021,6 +1182,7 @@ Status ConstantFolding::EvaluateOneFoldable(const NodeDef& node,
     Tensor* value = new Tensor(raw_val.dtype(), raw_val.tensor_shape());
     CHECK(value->FromProto(raw_val));
     inputs.emplace_back(value);
+    total_inputs_size += value->TotalBytes();
   }
 
   TF_RETURN_IF_ERROR(EvaluateNode(node, inputs, &output_tensors));
@@ -1035,7 +1197,8 @@ Status ConstantFolding::EvaluateOneFoldable(const NodeDef& node,
       node_name = strings::StrCat(node_name, "-", i);
     }
     if (output_tensors[i].tensor) {
-      Status s = CreateNodeDef(node_name, output_tensors[i], &outputs->at(i));
+      Status s = CreateNodeDef(node_name, output_tensors[i], &outputs->at(i),
+                               total_inputs_size);
       if (!s.ok()) {
         *result_too_large = true;
         return s;
@@ -1049,98 +1212,103 @@ Status ConstantFolding::EvaluateOneFoldable(const NodeDef& node,
   return Status::OK();
 }
 
-Status ConstantFolding::FoldNode(NodeDef* node, GraphDef* output_graph,
-                                 bool* result_too_large) {
-  if (IsMerge(*node)) {
-    // Merge nodes are special, in the sense that they execute as soon as one of
-    // their input is ready. We can therefore fold a merge node iff it has at
-    // least one constant input without control dependency.
-    // We still need to ensure that the nodes in the fanin of the merge node are
-    // scheduled. We'll therefore add a control dependency from the merge node
-    // to the folded constant. We end up with:
-    //  * the merge node and its inputs are preserved as is
-    //  * a new constant node C1, driven by the merge node through a control
-    //  dependency, initialized to the value of the folded input
-    //  * a new constant node C2, driven by the merge node through a control
-    //  dependency, initialized to the index of the folded input
-    //  * the fanout of the merge nodes is rewired to be driven by either C1 or
-    //  C2.
-    for (int input_index = 0; input_index < node->input_size(); ++input_index) {
-      const auto& input = node->input(input_index);
-      if (IsControlInput(input)) {
-        // Try the next input.
-        continue;
-      }
-      NodeDef* input_node = node_map_->GetNode(input);
-      if (!IsReallyConstant(*input_node)) {
-        continue;
-      }
-      bool valid_input = true;
-      for (const string& fanin_of_input : input_node->input()) {
-        if (IsControlInput(fanin_of_input)) {
-          valid_input = false;
-          break;
-        }
-      }
-      if (!valid_input) {
-        // Try the next input
-        continue;
+Status ConstantFolding::FoldMergeNode(NodeDef* node, GraphDef* output_graph) {
+  // Merge nodes are special, in the sense that they execute as soon as one of
+  // their input is ready. We can therefore fold a merge node iff it has at
+  // least one constant input without control dependency.
+  // We still need to ensure that the nodes in the fanin of the merge node are
+  // scheduled. We'll therefore add a control dependency from the merge node
+  // to the folded constant. We end up with:
+  //  * the merge node and its inputs are preserved as is
+  //  * a new constant node C1, driven by the merge node through a control
+  //  dependency, initialized to the value of the folded input
+  //  * a new constant node C2, driven by the merge node through a control
+  //  dependency, initialized to the index of the folded input
+  //  * the fanout of the merge nodes is rewired to be driven by either C1 or
+  //  C2.
+  for (int input_index = 0; input_index < node->input_size(); ++input_index) {
+    const auto& input = node->input(input_index);
+    if (IsControlInput(input)) {
+      // Try the next input.
+      continue;
+    }
+    NodeDef* input_node = node_map_->GetNode(input);
+    if (!IsReallyConstant(*input_node)) {
+      continue;
+    }
+    bool valid_input = true;
+    for (const string& fanin_of_input : input_node->input()) {
+      if (IsControlInput(fanin_of_input)) {
+        valid_input = false;
+        break;
       }
+    }
+    if (!valid_input) {
+      // Try the next input
+      continue;
+    }
 
-      string const_out_name = OptimizedNodeName(*node, "_const");
-      string const_index_name = OptimizedNodeName(*node, "_index");
-      if (node_map_->GetNode(const_out_name) ||
-          node_map_->GetNode(const_index_name)) {
-        // Intended name already exists.
-        return errors::AlreadyExists(
-            strings::StrCat(const_out_name, " or ", const_index_name,
-                            " already present in the graph"));
-      }
-
-      NodeDef* const_out = output_graph->add_node();
-      *const_out = *input_node;
-      const_out->set_name(const_out_name);
-      const_out->set_device(node->device());
-      *const_out->add_input() = AsControlDependency(*node);
-      node_map_->AddNode(const_out->name(), const_out);
-      node_map_->AddOutput(node->name(), const_out->name());
-
-      NodeDef* const_index = output_graph->add_node();
-      const_index->set_op("Const");
-      Tensor index(DT_INT32, TensorShape({}));
-      index.flat<int32>()(0) = input_index;
-      (*const_index->mutable_attr())["dtype"].set_type(DT_INT32);
-      index.AsProtoTensorContent(
-          (*const_index->mutable_attr())["value"].mutable_tensor());
-      const_index->set_name(const_index_name);
-      const_index->set_device(node->device());
-      *const_index->add_input() = AsControlDependency(*node);
-      node_map_->AddNode(const_index->name(), const_index);
-      node_map_->AddOutput(node->name(), const_index->name());
-
-      auto outputs = node_map_->GetOutputs(node->name());
-      for (NodeDef* output : outputs) {
-        for (int i = 0; i < output->input_size(); i++) {
-          int port;
-          string node_name = ParseNodeName(output->input(i), &port);
-          if (node_name == node->name()) {
-            if (port == 0) {
-              *output->mutable_input(i) = const_out->name();
-              node_map_->AddOutput(const_out->name(), output->name());
-            } else if (port == 1) {
-              *output->mutable_input(i) = const_index->name();
-              node_map_->AddOutput(const_index->name(), output->name());
-            } else {
-              // This is a control dependency (or an invalid edge since the
-              // merge node has only 2 inputs): preserve them.
-            }
+    string const_out_name = OptimizedNodeName(*node, "_const");
+    string const_index_name = OptimizedNodeName(*node, "_index");
+    if (node_map_->GetNode(const_out_name) ||
+        node_map_->GetNode(const_index_name)) {
+      // Intended name already exists.
+      return errors::AlreadyExists(
+          strings::StrCat(const_out_name, " or ", const_index_name,
+                          " already present in the graph"));
+    }
+
+    NodeDef* const_out = output_graph->add_node();
+    *const_out = *input_node;
+    const_out->set_name(const_out_name);
+    const_out->set_device(node->device());
+    *const_out->add_input() = AsControlDependency(*node);
+    node_map_->AddNode(const_out->name(), const_out);
+    node_map_->AddOutput(node->name(), const_out->name());
+
+    NodeDef* const_index = output_graph->add_node();
+    const_index->set_op("Const");
+    Tensor index(DT_INT32, TensorShape({}));
+    index.flat<int32>()(0) = input_index;
+    (*const_index->mutable_attr())["dtype"].set_type(DT_INT32);
+    index.AsProtoTensorContent(
+        (*const_index->mutable_attr())["value"].mutable_tensor());
+    const_index->set_name(const_index_name);
+    const_index->set_device(node->device());
+    *const_index->add_input() = AsControlDependency(*node);
+    node_map_->AddNode(const_index->name(), const_index);
+    node_map_->AddOutput(node->name(), const_index->name());
+
+    auto outputs = node_map_->GetOutputs(node->name());
+    for (NodeDef* output : outputs) {
+      for (int i = 0; i < output->input_size(); i++) {
+        int port;
+        string node_name = ParseNodeName(output->input(i), &port);
+        if (node_name == node->name()) {
+          if (port == 0) {
+            *output->mutable_input(i) = const_out->name();
+            node_map_->AddOutput(const_out->name(), output->name());
+          } else if (port == 1) {
+            *output->mutable_input(i) = const_index->name();
+            node_map_->AddOutput(const_index->name(), output->name());
+          } else {
+            // This is a control dependency (or an invalid edge since the
+            // merge node has only 2 inputs): preserve them.
           }
         }
       }
-      return Status::OK();
     }
     return Status::OK();
   }
+  return Status::OK();
+}
+
+Status ConstantFolding::FoldNode(NodeDef* node, GraphDef* output_graph,
+                                 bool* result_too_large) {
+  *result_too_large = false;
+  if (IsMerge(*node)) {
+    return FoldMergeNode(node, output_graph);
+  }
 
   std::vector<NodeDef> const_nodes;
   TF_RETURN_IF_ERROR(
@@ -1385,7 +1553,8 @@ bool ConstantFolding::IsOnes(const NodeDef& node) const {
   if (feed_nodes_.find(node.name()) != feed_nodes_.end()) {
     return false;
   }
-  if (node.op() == "OnesLike") return true;
+  if (IsOnesLike(node)) return true;
+  if (IsZerosLike(node)) return false;
   if (node.op() == "Fill") {
     NodeDef* values = node_map_->GetNode(NodeName(node.input(1)));
     return values != nullptr && IsOnes(*values);
@@ -1407,6 +1576,11 @@ bool ConstantFolding::IsOnes(const NodeDef& node) const {
     IS_ONES_CASE(DT_INT16);
     IS_ONES_CASE(DT_INT32);
     IS_ONES_CASE(DT_INT64);
+    IS_ONES_CASE(DT_QINT32);
+    IS_ONES_CASE(DT_QINT16);
+    IS_ONES_CASE(DT_QUINT16);
+    IS_ONES_CASE(DT_QINT8);
+    IS_ONES_CASE(DT_QUINT8);
     default:
       VLOG(1) << "Unsupported type " << DataTypeString(dtype);
       return false;
@@ -1418,7 +1592,8 @@ bool ConstantFolding::IsZeros(const NodeDef& node) const {
   if (feed_nodes_.find(node.name()) != feed_nodes_.end()) {
     return false;
   }
-  if (node.op() == "ZerosLike") return true;
+  if (IsOnesLike(node)) return false;
+  if (IsZerosLike(node)) return true;
   if (node.op() == "Fill") {
     NodeDef* values = node_map_->GetNode(NodeName(node.input(1)));
     return values != nullptr && IsZeros(*values);
@@ -1440,6 +1615,11 @@ bool ConstantFolding::IsZeros(const NodeDef& node) const {
     IS_ZEROS_CASE(DT_INT16);
     IS_ZEROS_CASE(DT_INT32);
     IS_ZEROS_CASE(DT_INT64);
+    IS_ZEROS_CASE(DT_QINT32);
+    IS_ZEROS_CASE(DT_QINT16);
+    IS_ZEROS_CASE(DT_QUINT16);
+    IS_ZEROS_CASE(DT_QINT8);
+    IS_ZEROS_CASE(DT_QUINT8);
     default:
       VLOG(1) << "Unsupported type " << DataTypeString(dtype);
       return false;
@@ -1552,6 +1732,7 @@ Status ConstantFolding::ReplaceOperationWithConstant(
     node->set_input(i, ctrl_dep);
   }
   *success = true;
+  graph_modified_ = true;
   return Status::OK();
 }
 
@@ -1691,12 +1872,12 @@ Status ConstantFolding::SimplifyNode(bool use_shape_info, NodeDef* node,
     return Status::OK();
   }
 
-  if (ConstantPushDown(node)) {
+  if (ConstantPushDown(optimized_graph, node)) {
     graph_modified_ = true;
     return Status::OK();
   }
 
-  if (MulConvPushDown(node, *properties)) {
+  if (MulConvPushDown(optimized_graph, node, *properties)) {
     graph_modified_ = true;
     return Status::OK();
   }
@@ -2448,6 +2629,7 @@ Status ConstantFolding::SimplifyArithmeticOperations(
   *success = false;
   const bool is_mul = IsMul(*node) || IsLogicalAnd(*node);
   const bool is_matmul = IsMatMul(*node);
+  const bool is_quantized_matmul = IsQuantizedMatMul(*node);
   const bool is_add = IsAdd(*node) || IsBiasAdd(*node) || IsLogicalOr(*node);
   const bool is_sub = IsSub(*node);
   const bool is_any_div = IsAnyDiv(*node);
@@ -2542,6 +2724,10 @@ Status ConstantFolding::SimplifyArithmeticOperations(
         if (!replace_op_status.ok()) {
           return replace_op_status;
         } else if (replace_succeed) {
+          if (is_quantized_matmul) {
+            TF_RETURN_IF_ERROR(
+                AddQuantizedMatMulMinMaxOutConstNodes(node, optimized_graph));
+          }
           *success = true;
           return Status::OK();
         }
@@ -2606,7 +2792,8 @@ bool ConstantFolding::ReduceDivToReciprocalMul(GraphDef* optimized_graph,
   return false;
 }
 
-bool ConstantFolding::ConstantPushDown(NodeDef* node) {
+bool ConstantFolding::ConstantPushDown(GraphDef* optimized_graph,
+                                       NodeDef* node) {
   // Consider the transformation
   //
   //                      +                +       = parent
@@ -2674,9 +2861,10 @@ bool ConstantFolding::ConstantPushDown(NodeDef* node) {
       // edge. We can replace such a control edge with a control edge from A
       // to C.
       CHECK(MaybeRemoveControlInput(op_child_node->name(), const_child_node,
-                                    graph_, node_map_.get()));
-      NodeDef* other_leaf = left_leaf_is_constant ? left_leaf : right_leaf;
-      MaybeAddControlInput(other_leaf->name(), const_child_node, graph_,
+                                    optimized_graph, node_map_.get()));
+      string other_leaf_input = left_leaf_is_constant ? op_child_node->input(0)
+                                                      : op_child_node->input(1);
+      MaybeAddControlInput(other_leaf_input, const_child_node, optimized_graph,
                            node_map_.get());
     }
 
@@ -2693,7 +2881,7 @@ bool ConstantFolding::ConstantPushDown(NodeDef* node) {
   return false;
 }
 
-bool ConstantFolding::MulConvPushDown(NodeDef* node,
+bool ConstantFolding::MulConvPushDown(GraphDef* optimized_graph, NodeDef* node,
                                       const GraphProperties& properties) {
   // Push down multiplication on ConvND.
   //                       *                  ConvND
@@ -2703,115 +2891,110 @@ bool ConstantFolding::MulConvPushDown(NodeDef* node,
   //                 X  C1                       C1  C2
   //
   // where C1 and C2 are constants and X is non-constant.
-  if (IsMul(*node) && NumNonControlInputs(*node) == 2) {
-    NodeDef* mul_left_child = node_map_->GetNode(node->input(0));
-    NodeDef* mul_right_child = node_map_->GetNode(node->input(1));
-    // One child must be constant, and the second must be Conv op.
-    const bool left_child_is_constant = IsReallyConstant(*mul_left_child);
-    const bool right_child_is_constant = IsReallyConstant(*mul_right_child);
-    if (!left_child_is_constant && !right_child_is_constant) {
-      return false;
-    }
-    NodeDef* conv_node =
-        left_child_is_constant ? mul_right_child : mul_left_child;
-    if (!IsConv2D(*conv_node) && !IsConv3D(*conv_node)) {
-      return false;
-    }
-    if (node->device() != mul_left_child->device() ||
-        node->device() != mul_right_child->device()) {
-      return false;
-    }
-
-    // Make sure that it is safe to change the value of the convolution
-    // output.
-    if (conv_node->input_size() < 2 ||
-        NumNonControlOutputs(*conv_node, *node_map_) > 1 ||
-        nodes_to_preserve_.find(conv_node->name()) !=
-            nodes_to_preserve_.end()) {
-      return false;
-    }
-
-    // Identify the nodes to swap.
-    NodeDef* conv_left_child = node_map_->GetNode(conv_node->input(0));
-    NodeDef* conv_right_child = node_map_->GetNode(conv_node->input(1));
-    const bool conv_left_is_constant = IsReallyConstant(*conv_left_child);
-    const bool conv_right_is_constant = IsReallyConstant(*conv_right_child);
-    if (!conv_left_is_constant && !conv_right_is_constant) {
-      // At least one of the convolution inputs should be constant.
-      return false;
-    }
-    if (conv_left_is_constant && conv_right_is_constant) {
-      // Leverage regular constant folding to handle this.
-      return false;
-    }
-    const auto& mul_props = properties.GetOutputProperties(node->name());
-    const auto& conv_props = properties.GetOutputProperties(conv_node->name());
-    if (mul_props.empty() || conv_props.empty()) {
-      return false;
-    }
-    const auto& mul_shape = mul_props[0].shape();
-    const auto& conv_shape = conv_props[0].shape();
-    if (!ShapesSymbolicallyEqual(mul_shape, conv_shape)) {
-      return false;
-    }
+  if (!IsMul(*node) || NumNonControlInputs(*node) != 2) return false;
+
+  NodeDef* mul_left_child = node_map_->GetNode(node->input(0));
+  NodeDef* mul_right_child = node_map_->GetNode(node->input(1));
+  // One child must be constant, and the second must be Conv op.
+  const bool left_child_is_constant = IsReallyConstant(*mul_left_child);
+  const bool right_child_is_constant = IsReallyConstant(*mul_right_child);
+  if (!left_child_is_constant && !right_child_is_constant) {
+    return false;
+  }
+  NodeDef* conv_node =
+      left_child_is_constant ? mul_right_child : mul_left_child;
+  if (!IsConv2D(*conv_node) && !IsConv3D(*conv_node)) {
+    return false;
+  }
+  if (node->device() != mul_left_child->device() ||
+      node->device() != mul_right_child->device()) {
+    return false;
+  }
 
-    const auto& input_props = properties.GetInputProperties(conv_node->name());
-    if (input_props.size() < 2) {
-      return false;
-    }
-    const auto& filter_shape = input_props[1].shape();
+  // Make sure that it is safe to change the value of the convolution
+  // output.
+  if (conv_node->input_size() < 2 ||
+      NumNonControlOutputs(*conv_node, *node_map_) > 1 ||
+      nodes_to_preserve_.find(conv_node->name()) != nodes_to_preserve_.end()) {
+    return false;
+  }
 
-    NodeDef* const_node =
-        left_child_is_constant ? mul_left_child : mul_right_child;
-    const auto& const_props =
-        properties.GetOutputProperties(const_node->name());
-    if (const_props.empty()) {
-      return false;
-    }
-    const auto& const_shape = const_props[0].shape();
+  // Identify the nodes to swap.
+  NodeDef* conv_left_child = node_map_->GetNode(conv_node->input(0));
+  NodeDef* conv_right_child = node_map_->GetNode(conv_node->input(1));
+  const bool conv_left_is_constant = IsReallyConstant(*conv_left_child);
+  const bool conv_right_is_constant = IsReallyConstant(*conv_right_child);
+  if (!conv_left_is_constant && !conv_right_is_constant) {
+    // At least one of the convolution inputs should be constant.
+    return false;
+  }
+  if (conv_left_is_constant && conv_right_is_constant) {
+    // Leverage regular constant folding to handle this.
+    return false;
+  }
+  const auto& mul_props = properties.GetOutputProperties(node->name());
+  const auto& conv_props = properties.GetOutputProperties(conv_node->name());
+  if (mul_props.empty() || conv_props.empty()) {
+    return false;
+  }
+  const auto& mul_shape = mul_props[0].shape();
+  const auto& conv_shape = conv_props[0].shape();
+  if (!ShapesSymbolicallyEqual(mul_shape, conv_shape)) {
+    return false;
+  }
 
-    TensorShapeProto new_filter_shape;
-    if (!ShapeAfterBroadcast(filter_shape, const_shape, &new_filter_shape)) {
-      return false;
-    }
-    if (!ShapesSymbolicallyEqual(filter_shape, new_filter_shape)) {
-      return false;
-    }
+  const auto& input_props = properties.GetInputProperties(conv_node->name());
+  if (input_props.size() < 2) {
+    return false;
+  }
+  const auto& filter_shape = input_props[1].shape();
 
-    string mul_new_name =
-        AddPrefixToNodeName("merged_input", conv_node->name());
-    if (node_map_->NodeExists(mul_new_name)) {
-      return false;
-    }
-    // Make sure we don't introduce loops in the graph by removing control
-    // dependencies from the conv2d node to c2.
-    NodeDef* conv_const_node =
-        conv_left_is_constant ? conv_left_child : conv_right_child;
-    if (MaybeRemoveControlInput(conv_node->name(), const_node, graph_,
-                                node_map_.get())) {
-      // Add a control dep from c1 to c2 to ensure c2 is in the right frame
-      *const_node->add_input() = AsControlDependency(*conv_const_node);
-    }
-
-    conv_node->set_name(node->name());
-    node->set_name(mul_new_name);
-    if (conv_left_is_constant) {
-      node_map_->UpdateInput(conv_node->name(), node->input(0), mul_new_name);
-      conv_node->set_input(0, mul_new_name);
-    } else {
-      node_map_->UpdateInput(conv_node->name(), node->input(1), mul_new_name);
-      conv_node->set_input(1, mul_new_name);
-    }
-    if (left_child_is_constant) {
-      node->set_input(1, conv_const_node->name());
-    } else {
-      node->set_input(0, conv_const_node->name());
-    }
-    node_map_->AddNode(mul_new_name, node);
+  NodeDef* const_node =
+      left_child_is_constant ? mul_left_child : mul_right_child;
+  const auto& const_props = properties.GetOutputProperties(const_node->name());
+  if (const_props.empty()) {
+    return false;
+  }
+  const auto& const_shape = const_props[0].shape();
+  if (!IsValidConstShapeForMulConvPushDown(
+          conv_node->attr().at("data_format").s(), filter_shape, const_shape)) {
+    return false;
+  }
 
-    return true;
+  string mul_new_name = AddPrefixToNodeName("merged_input", conv_node->name());
+  if (node_map_->NodeExists(mul_new_name)) {
+    return false;
   }
-  return false;
+  // Make sure we don't introduce loops in the graph by removing control
+  // dependencies from the conv2d node to c2.
+  string conv_const_input =
+      conv_left_is_constant ? conv_node->input(0) : conv_node->input(1);
+  if (MaybeRemoveControlInput(conv_node->name(), const_node, optimized_graph,
+                              node_map_.get())) {
+    // Add a control dep from c1 to c2 to ensure c2 is in the right frame
+    MaybeAddControlInput(conv_const_input, const_node, optimized_graph,
+                         node_map_.get());
+  }
+
+  conv_node->set_name(node->name());
+  node->set_name(mul_new_name);
+  if (conv_left_is_constant) {
+    node_map_->UpdateInput(conv_node->name(), node->input(0), mul_new_name);
+    conv_node->set_input(0, mul_new_name);
+  } else {
+    node_map_->UpdateInput(conv_node->name(), node->input(1), mul_new_name);
+    conv_node->set_input(1, mul_new_name);
+  }
+  NodeDef* conv_const_node =
+      conv_left_is_constant ? conv_left_child : conv_right_child;
+  if (left_child_is_constant) {
+    node->set_input(1, conv_const_node->name());
+  } else {
+    node->set_input(0, conv_const_node->name());
+  }
+  node_map_->AddNode(mul_new_name, node);
+
+  return true;
 }
 
 bool ConstantFolding::PartialConstPropThroughIdentityN(NodeDef* node) {
@@ -3112,6 +3295,65 @@ bool ConstantFolding::MergeConcat(const GraphProperties& properties,
   return true;
 }
 
+Status ConstantFolding::AddQuantizedMatMulMinMaxOutConstNodes(
+    NodeDef* node, GraphDef* optimized_graph) {
+  auto add_quantized_out = [this, node, optimized_graph](
+                               const string& out_const_name, int index) {
+    NodeDef* out_node = optimized_graph->add_node();
+    Tensor value(DT_FLOAT, TensorShape({}));
+    const bool is_min = index == 1;
+    const DataType type_attr = node->attr().at("dtype").type();
+
+    value.flat<float>()(0) = is_min ? QuantizedTypeMinAsFloat(type_attr)
+                                    : QuantizedTypeMaxAsFloat(type_attr);
+    TF_RETURN_IF_ERROR(
+        CreateNodeDef(out_const_name, TensorValue(&value), out_node));
+    node_map_->AddNode(out_const_name, out_node);
+    out_node->set_device(node->device());
+
+    // Copy all inputs from node.
+    out_node->mutable_input()->CopyFrom(node->input());
+    for (const string& input : out_node->input()) {
+      node_map_->AddOutput(NodeName(input), out_const_name);
+    }
+
+    // Update output nodes consuming node:index to new const node.
+    string old_input = absl::StrCat(node->name(), ":", index);
+    int old_node_count = 0;
+    auto outputs = node_map_->GetOutputs(node->name());
+    for (const auto& output : outputs) {
+      for (int i = 0; i < output->input_size(); ++i) {
+        if (output->input(i) == old_input) {
+          output->set_input(i, out_const_name);
+          node_map_->AddOutput(out_const_name, output->name());
+        } else if (NodeName(output->input(i)) == node->name()) {
+          ++old_node_count;
+        }
+      }
+      if (old_node_count == 0) {
+        node_map_->RemoveOutput(node->name(), output->name());
+      }
+    }
+
+    return Status::OK();
+  };
+  const string min_out_const_name =
+      OptimizedNodeName(*node, "-quantized_matmul_min_out");
+  const string max_out_const_name =
+      OptimizedNodeName(*node, "-quantized_matmul_max_out");
+  if (node_map_->GetNode(min_out_const_name) == nullptr &&
+      node_map_->GetNode(max_out_const_name) == nullptr) {
+    TF_RETURN_IF_ERROR(add_quantized_out(min_out_const_name, 1));
+    TF_RETURN_IF_ERROR(add_quantized_out(max_out_const_name, 2));
+  } else {
+    return errors::Internal(absl::Substitute(
+        "Can't create Const for QuantizedMatMul min_out/max_out of "
+        "node '$0' because of node name conflict",
+        node->name()));
+  }
+  return Status::OK();
+}
+
 Status ConstantFolding::RunOptimizationPass(Cluster* cluster,
                                             const GrapplerItem& item,
                                             GraphDef* optimized_graph) {
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.h b/tensorflow/core/grappler/optimizers/constant_folding.h
index 0b778882d7d4d89d83de5d6bd5a6f9c827cf5bf8..418176c8932639f4f8bbef8f636c33b56d36f1c2 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.h
+++ b/tensorflow/core/grappler/optimizers/constant_folding.h
@@ -35,8 +35,10 @@ const char kConstantFoldingCtrl[] = "ConstantFoldingCtrl";
 // Constant folding optimization for a graph.
 class ConstantFolding : public GraphOptimizer {
  public:
+  // The size limit will only be considered if the newly created node is greater
+  // than original_size (optional).
   static Status CreateNodeDef(const string& name, const TensorValue& tensor,
-                              NodeDef* node);
+                              NodeDef* node, size_t original_size = 0);
   static string AddControlDependency(const string& input_name, GraphDef* graph,
                                      NodeMap* node_map);
 
@@ -65,8 +67,10 @@ class ConstantFolding : public GraphOptimizer {
                                           const GraphProperties& properties);
   Status MaterializeReductionIndices(NodeDef* node,
                                      const GraphProperties& properties);
-
+  Status MaterializeConstantValuedNode(NodeDef* node,
+                                       const GraphProperties& properties);
   Status MaterializeConstants(const GraphProperties& properties);
+
   bool IsFoldable(const NodeDef& node) const;
 
   Status EvaluateNode(const NodeDef& node,
@@ -76,6 +80,7 @@ class ConstantFolding : public GraphOptimizer {
   Status EvaluateOneFoldable(const NodeDef& node, std::vector<NodeDef>* outputs,
                              bool* result_too_large);
 
+  Status FoldMergeNode(NodeDef* node, GraphDef* output_graph);
   Status FoldNode(NodeDef* node, GraphDef* output_graph,
                   bool* result_too_large);
 
@@ -124,11 +129,12 @@ class ConstantFolding : public GraphOptimizer {
 
   // Pushes down constants on '+' and '*' operators if applicable. Returns true
   // the transformation applied successfully.
-  bool ConstantPushDown(NodeDef* node);
+  bool ConstantPushDown(GraphDef* optimized_graph, NodeDef* node);
 
   // Aggregate constants present around a conv operator. Returns true if the
   // transformation was applied successfully.
-  bool MulConvPushDown(NodeDef* node, const GraphProperties& properties);
+  bool MulConvPushDown(GraphDef* optimized_graph, NodeDef* node,
+                       const GraphProperties& properties);
 
   // Strength reduces floating point division by a constant Div(x, const) to
   // multiplication by the reciprocal Mul(x, Reciprocal(const)).
@@ -230,6 +236,9 @@ class ConstantFolding : public GraphOptimizer {
   bool MergeConcat(const GraphProperties& properties, bool use_shape_info,
                    GraphDef* optimized_graph, NodeDef* node);
 
+  Status AddQuantizedMatMulMinMaxOutConstNodes(NodeDef* node,
+                                               GraphDef* optimized_graph);
+
   // Points to an externally provided device or to owned_device_;
   RewriterConfig::Toggle opt_level_;
   DeviceBase* cpu_device_;
diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index 192f48272f9ed08b2b6424f3c8e33d1afafdb56d..76e149d0ae3c5766205e42f6e2486a825d382b66 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -119,6 +119,100 @@ class ConstantFoldingTest : public GrapplerTest {
       }
     }
   }
+
+  void MulConvPushDownTest(const TensorShape& input_shape,
+                           const TensorShape& filter_shape,
+                           const TensorShape& mul_const_input_shape,
+                           const bool use_3d_conv, const char* padding,
+                           const char* data_format, const bool expect_folded) {
+    // Tests if the following rewrite is performed:
+    //
+    //         *                       Conv2D
+    //        / \                       / \
+    //       c  Conv2D        -->      x  (c * filter)
+    //           / \
+    //          x  filter
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+    Tensor filter_values(DT_FLOAT, filter_shape);
+    for (int i = 0; i < filter_values.NumElements(); ++i) {
+      filter_values.flat<float>()(i) = std::sqrt(static_cast<float>(i));
+    }
+    Output filter =
+        ops::Const(s.WithOpName("filter"), Input::Initializer(filter_values));
+
+    Output input = ops::Placeholder(s.WithOpName("x"), DT_FLOAT,
+                                    ops::Placeholder::Shape(input_shape));
+
+    Output conv;
+    if (use_3d_conv) {
+      conv = ops::Conv3D(s.WithOpName("conv"), input, filter, {1, 1, 1, 1, 1},
+                         padding, ops::Conv3D::DataFormat(data_format));
+    } else {
+      conv = ops::Conv2D(s.WithOpName("conv"), input, filter, {1, 1, 1, 1},
+                         padding, ops::Conv2D::DataFormat(data_format));
+    }
+    Tensor mul_const_input(DT_FLOAT, mul_const_input_shape);
+    for (int i = 0; i < mul_const_input.NumElements(); ++i) {
+      mul_const_input.flat<float>()(i) = static_cast<float>(i + 3);
+    }
+    Output c =
+        ops::Const(s.WithOpName("c"), Input::Initializer(mul_const_input));
+    Output mul = ops::Mul(s.WithOpName("mul"), c, conv);
+
+    GrapplerItem item;
+    TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+    ConstantFolding optimizer(/*cpu_device=*/nullptr);
+    GraphDef output;
+    Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
+    TF_EXPECT_OK(status);
+
+    EXPECT_EQ(5, output.node_size());
+    int found = 0;
+    if (expect_folded) {
+      for (const auto& node : output.node()) {
+        if (node.name() == "mul") {
+          found++;
+          EXPECT_EQ(use_3d_conv ? "Conv3D" : "Conv2D", node.op());
+          EXPECT_EQ(2, node.input_size());
+          EXPECT_EQ("x", node.input(0));
+          EXPECT_EQ("conv/merged_input", node.input(1));
+        } else if (node.name() == "conv/merged_input") {
+          found++;
+          EXPECT_EQ("Const", node.op());
+          EXPECT_EQ(0, node.input_size());
+        }
+      }
+    } else {
+      for (const auto& node : output.node()) {
+        if (node.name() == "mul") {
+          found++;
+          EXPECT_EQ("Mul", node.op());
+          EXPECT_EQ(2, node.input_size());
+          EXPECT_EQ("c", node.input(0));
+          EXPECT_EQ("conv", node.input(1));
+        } else if (node.name() == "conv") {
+          found++;
+          EXPECT_EQ(use_3d_conv ? "Conv3D" : "Conv2D", node.op());
+          EXPECT_EQ(2, node.input_size());
+          EXPECT_EQ("x", node.input(0));
+          EXPECT_EQ("filter", node.input(1));
+        }
+      }
+    }
+    EXPECT_EQ(2, found);
+
+    // Check that const folded multiplication node has the expected value.
+    std::vector<string> fetch = {"mul"};
+    Tensor value(DT_FLOAT, input_shape);
+    for (int i = 0; i < value.NumElements(); ++i) {
+      value.flat<float>()(i) = i;
+    }
+    auto actual = EvaluateNodes(output, fetch, {{"x", value}});
+    auto expected = EvaluateNodes(item.graph, fetch, {{"x", value}});
+    test::ExpectTensorEqual<float>(expected[0], actual[0]);
+  }
 };
 
 TEST_F(ConstantFoldingTest, SimpleFolding) {
@@ -242,73 +336,147 @@ TEST_F(ConstantFoldingTest, AddTree) {
   }
 }
 
-TEST_F(ConstantFoldingTest, ConvPushDownTest) {
-  // Tests if the following rewrite is performed:
-  //
-  //         *                       Conv2D
-  //        / \                       / \
-  //       c  Conv2D        -->      x  (c * filter)
-  //           / \
-  //          x  filter
-  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+TEST_F(ConstantFoldingTest, MulConvPushDownTest_Conv2D_ScalarConst) {
+  for (string data_format : {
+         "NHWC",
+#if GOOGLE_CUDA
+             "NCHW"
+#endif  // GOOGLE_CUDA
+       }) {
+    MulConvPushDownTest(
+        /*input_shape=*/data_format == "NHWC" ? TensorShape{4, 10, 10, 3}
+                                              : TensorShape{4, 3, 10, 10},
+        /*filter_shape=*/{2, 2, 3, 5},
+        /*mul_const_input_shape=*/{},
+        /*use_3d_conv=*/false,
+        /*padding=*/"VALID", data_format.c_str(),
+        /*expect_folded=*/true);
+  }
+}
 
-  int input_depth = 3;
-  int filter_count = 5;
-  int filter_size = 2;
-  TensorShape filter_shape(
-      {filter_size, filter_size, input_depth, filter_count});
-  Tensor filter_values(DT_FLOAT, filter_shape);
-  for (int i = 0; i < filter_values.NumElements(); ++i) {
-    filter_values.flat<float>()(i) = std::sqrt(static_cast<float>(i));
-  }
-  Output filter =
-      ops::Const(s.WithOpName("filter"), Input::Initializer(filter_values));
-
-  int batch_size = 4;
-  int input_dim = 10;
-  TensorShape input_shape({batch_size, input_dim, input_dim, input_depth});
-  Output input = ops::Placeholder(s.WithOpName("x"), DT_FLOAT,
-                                  ops::Placeholder::Shape(input_shape));
-
-  Output conv =
-      ops::Conv2D(s.WithOpName("conv"), input, filter, {1, 1, 1, 1}, "VALID");
-  Output c = ops::Const(s.WithOpName("c"), 3.0f, {1});
-  Output mul = ops::Mul(s.WithOpName("mul"), c, conv);
+TEST_F(ConstantFoldingTest, MulConvPushDownTest_Conv2D_SingletonConst) {
+  for (string data_format : {
+         "NHWC",
+#if GOOGLE_CUDA
+             "NCHW"
+#endif  // GOOGLE_CUDA
+       }) {
+    for (auto mul_const_input_shape :
+         {TensorShape{1}, TensorShape{1, 1, 1, 1}}) {
+      MulConvPushDownTest(
+          /*input_shape=*/data_format == "NHWC" ? TensorShape{4, 10, 10, 3}
+                                                : TensorShape{4, 3, 10, 10},
+          /*filter_shape=*/{2, 2, 3, 5}, mul_const_input_shape,
+          /*use_3d_conv=*/false,
+          /*padding=*/"VALID", data_format.c_str(),
+          /*expect_folded=*/true);
+    }
+  }
+}
 
-  GrapplerItem item;
-  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+TEST_F(ConstantFoldingTest,
+       MulConvPushDownTest_Conv2D_SingletonConst_ShapeMismatch) {
+  for (string data_format : {
+         "NHWC",
+#if GOOGLE_CUDA
+             "NCHW"
+#endif  // GOOGLE_CUDA
+       }) {
+    MulConvPushDownTest(
+        /*input_shape=*/data_format == "NHWC" ? TensorShape{4, 10, 10, 3}
+                                              : TensorShape{4, 3, 10, 10},
+        /*filter_shape=*/{2, 2, 3, 5},
+        /*mul_const_input_shape=*/{1, 1, 1, 1, 1},
+        /*use_3d_conv=*/false,
+        /*padding=*/"VALID", data_format.c_str(),
+        /*expect_folded=*/false);
+  }
+}
 
-  ConstantFolding optimizer(/*cpu_device=*/nullptr);
-  GraphDef output;
-  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
-  TF_EXPECT_OK(status);
+TEST_F(ConstantFoldingTest, MulConvPushDownTest_Conv2D_3x1x3Const) {
+  for (auto data_format : {
+         "NHWC",
+#if GOOGLE_CUDA
+             "NCHW"
+#endif  // GOOGLE_CUDA
+       }) {
+    MulConvPushDownTest(
+        /*input_shape=*/{3, 3, 3, 3},
+        /*filter_shape=*/{3, 3, 3, 3},
+        /*mul_const_input_shape=*/{3, 1, 3},
+        /*use_3d_conv=*/false,
+        /*padding=*/"SAME", data_format,
+        /*expect_folded=*/false);
+  }
+}
 
-  EXPECT_EQ(5, output.node_size());
-  int found = 0;
-  for (const auto& node : output.node()) {
-    if (node.name() == "mul") {
-      found++;
-      EXPECT_EQ("Conv2D", node.op());
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("x", node.input(0));
-      EXPECT_EQ("conv/merged_input", node.input(1));
-    } else if (node.name() == "conv/merged_input") {
-      found++;
-      EXPECT_EQ("Const", node.op());
-      EXPECT_EQ(0, node.input_size());
-    }
+TEST_F(ConstantFoldingTest, MulConvPushDownTest_Conv2D_NHWC_VectorLikeConst) {
+  for (auto mul_const_input_shape :
+       {TensorShape{3}, TensorShape{1, 3}, TensorShape{1, 1, 1, 3}}) {
+    MulConvPushDownTest(
+        /*input_shape=*/{3, 3, 3, 3},
+        /*filter_shape=*/{3, 3, 3, 3}, mul_const_input_shape,
+        /*use_3d_conv=*/false,
+        /*padding=*/"SAME",
+        /*data_format=*/"NHWC",
+        /*expect_folded=*/true);
   }
-  EXPECT_EQ(2, found);
+}
 
-  // Check that const folded multiplication node has the expected value.
-  std::vector<string> fetch = {"mul"};
-  Tensor value(DT_FLOAT, input_shape);
-  for (int i = 0; i < value.NumElements(); ++i) {
-    value.flat<float>()(i) = i;
+#if GOOGLE_CUDA
+TEST_F(ConstantFoldingTest, MulConvPushDownTest_Conv2D_NCHW_VectorLikeConst) {
+  for (auto mul_const_input_shape :
+       {TensorShape{3}, TensorShape{3, 1, 1}, TensorShape{1, 3, 1, 1}}) {
+    MulConvPushDownTest(
+        /*input_shape=*/{3, 3, 3, 3},
+        /*filter_shape=*/{3, 3, 3, 3}, mul_const_input_shape,
+        /*use_3d_conv=*/false,
+        /*padding=*/"SAME",
+        /*data_format=*/"NCHW",
+        // TODO(laigd): optimization should happen in this case.
+        /*expect_folded=*/false);
   }
-  auto actual = EvaluateNodes(output, fetch, {{"x", value}});
-  auto expected = EvaluateNodes(item.graph, fetch, {{"x", value}});
-  test::ExpectTensorEqual<float>(expected[0], actual[0]);
+}
+#endif  // GOOGLE_CUDA
+
+TEST_F(ConstantFoldingTest, MulConvPushDownTest_Conv2D_3x1Const) {
+  for (auto data_format : {
+         "NHWC",
+#if GOOGLE_CUDA
+             "NCHW"
+#endif  // GOOGLE_CUDA
+       }) {
+    MulConvPushDownTest(
+        /*input_shape=*/{3, 3, 3, 3},
+        /*filter_shape=*/{3, 3, 3, 3},
+        /*mul_const_input_shape=*/{3, 1},
+        /*use_3d_conv=*/false,
+        /*padding=*/"SAME", data_format,
+        /*expect_folded=*/false);
+  }
+}
+
+TEST_F(ConstantFoldingTest, MulConvPushDownTest_Conv3D_NDHWC_1x1x3Const) {
+  MulConvPushDownTest(
+      /*input_shape=*/{3, 3, 3, 3, 3},
+      /*filter_shape=*/{3, 3, 3, 3, 3},
+      /*mul_const_input_shape=*/{1, 1, 3},
+      /*use_3d_conv=*/true,
+      /*padding=*/"SAME",
+      /*data_format=*/"NDHWC",
+      /*expect_folded=*/true);
+}
+
+TEST_F(ConstantFoldingTest, MulConvPushDownTest_Conv3D_NCDHW_3x1x1x1Const) {
+  MulConvPushDownTest(
+      /*input_shape=*/{3, 3, 3, 3, 3},
+      /*filter_shape=*/{3, 3, 3, 3, 3},
+      /*mul_const_input_shape=*/{3, 1, 1, 1},
+      /*use_3d_conv=*/true,
+      /*padding=*/"SAME",
+      /*data_format=*/"NDHWC",
+      // TODO(laigd): optimization should happen in this case.
+      /*expect_folded=*/false);
 }
 
 TEST_F(ConstantFoldingTest, NeutralElement) {
@@ -378,7 +546,7 @@ TEST_F(ConstantFoldingTest, NeutralElement) {
     const string ones_name = strings::StrCat("ones", suffix);
     const string ctrl_zeros_name = strings::StrCat("^zeros", suffix);
     const string ctrl_ones_name = strings::StrCat("^ones", suffix);
-    EXPECT_EQ(27, output.node_size());
+    EXPECT_EQ(const_type == kFill ? 31 : 27, output.node_size());
     for (int i = 0; i < output.node_size(); ++i) {
       const NodeDef& node = output.node(i);
       const string& name = node.name();
@@ -1601,7 +1769,7 @@ TEST_F(ConstantFoldingTest, SplitRemoval) {
   AddNode("split_dim", "Const", {}, {}, &want);
   AddNode("s1", "Identity", {"in1", AsControlDependency("split_dim")}, {},
           &want);
-  AddNode("s2", "Split", {"in2", "split_dim"}, {}, &want);
+  AddNode("s2", "Split", {"split_dim", "in2"}, {}, &want);
   AddNode("out", "Add", {"s1", "s2"}, {}, &want);
 
   CompareGraphs(want, got);
@@ -3466,6 +3634,88 @@ TEST_F(ConstantFoldingCastConstTest, CastConstFolding) {
   }
 }
 
+TEST_F(ConstantFoldingTest, MaterializeConstantValuedNode) {
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+
+  Output x =
+      ops::Placeholder(scope.WithOpName("x"), DT_FLOAT,
+                       ops::Placeholder::Shape(TensorShape({1, 2, 3, 4})));
+  Output ones_like = ops::OnesLike(scope.WithOpName("ones_like"), x);
+  Output zeros_like = ops::ZerosLike(scope.WithOpName("zeros_like"), x);
+  Output fill = ops::Fill(scope.WithOpName("fill"), {4, 3, 2, 1}, 42);
+
+  GrapplerItem item;
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+  item.fetch = {"ones_like", "zeros_like", "fill"};
+  auto x_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({1, 2, 3, 4}));
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, {{"x", x_t}});
+
+  ConstantFolding optimizer(/*opt_level=*/RewriterConfig::AGGRESSIVE,
+                            /*cpu_device=*/nullptr);
+  GraphDef output;
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(output.node_size(), 6);
+  for (const auto& node : output.node()) {
+    if (node.name() != "x") {
+      EXPECT_EQ(node.op(), "Const");
+    }
+    if (node.name() == "ones_like" || node.name() == "zeros_like") {
+      ASSERT_EQ(node.input_size(), 1);
+      EXPECT_EQ(node.input(0), "^x");
+    }
+    if (node.name() == "fill") {
+      ASSERT_EQ(node.input_size(), 2);
+      EXPECT_EQ(node.input(0)[0], '^');
+      EXPECT_EQ(node.input(1)[0], '^');
+    }
+  }
+  auto tensors = EvaluateNodes(output, item.fetch, {{"x", x_t}});
+  ASSERT_EQ(item.fetch.size(), tensors.size());
+  ASSERT_EQ(tensors_expected.size(), tensors.size());
+  for (int i = 0; i < tensors.size(); i++) {
+    if (item.fetch[i] == "fill") {
+      test::ExpectTensorEqual<int>(tensors_expected[i], tensors[i]);
+    } else {
+      test::ExpectTensorEqual<float>(tensors_expected[i], tensors[i]);
+    }
+  }
+}
+
+TEST_F(ConstantFoldingTest, BitcastDenormalFloats) {
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+
+  Tensor x_t(DT_INT64, TensorShape({2, 2}));
+  x_t.flat<int64>()(0) = 9223372036854775807L;
+  x_t.flat<int64>()(1) = 1L;
+  x_t.flat<int64>()(2) = 9223372036854775807L;
+  x_t.flat<int64>()(3) = 1L;
+  Output x = ops::Const(scope.WithOpName("x"), x_t);
+  Output y = ops::Bitcast(scope.WithOpName("y"), x, DT_FLOAT);
+  Output z = ops::Bitcast(scope.WithOpName("z"), y, DT_INT64);
+
+  GrapplerItem item;
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+  item.fetch = {"z"};
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, {});
+
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
+  GraphDef output;
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  ASSERT_EQ(output.node_size(), 1);
+  const NodeDef& node = output.node(0);
+  EXPECT_EQ(node.name(), "z");
+  EXPECT_EQ(node.op(), "Const");
+
+  auto tensors = EvaluateNodes(output, item.fetch, {});
+  ASSERT_EQ(tensors.size(), 1);
+  ASSERT_EQ(tensors_expected.size(), 1);
+  test::ExpectTensorEqual<int64>(tensors[0], tensors_expected[0]);
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/BUILD b/tensorflow/core/grappler/optimizers/data/BUILD
index 7593023ff4d649c623db9be98ac52ef6b799219f..5f060789889b230070f0ead6df16c24d43755cd4 100644
--- a/tensorflow/core/grappler/optimizers/data/BUILD
+++ b/tensorflow/core/grappler/optimizers/data/BUILD
@@ -3,16 +3,43 @@ licenses(["notice"])  # Apache 2.0
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_protos_all")
 
+package(default_visibility = [
+    "//tensorflow/core/grappler/optimizers/data:__subpackages__",
+    "//tensorflow/core/kernels/data:__pkg__",
+    "//tensorflow/core/kernels/data/experimental:__pkg__",
+])
+
+cc_library(
+    name = "data",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":filter_fusion",
+        ":hoist_random_uniform",
+        ":latency_all_edges",
+        ":make_numa_aware",
+        ":make_sloppy",
+        ":map_and_batch_fusion",
+        ":map_and_filter_fusion",
+        ":map_fusion",
+        ":map_parallelization",
+        ":map_vectorization",
+        ":meta_optimizer",
+        ":noop_elimination",
+        ":shuffle_and_repeat_fusion",
+    ],
+)
+
 cc_library(
     name = "filter_fusion",
     srcs = ["filter_fusion.cc"],
     hdrs = [
         "filter_fusion.h",
     ],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_utils",
         ":fusion_utils",
+        ":optimizer_base",
+        "@com_google_absl//absl/container:flat_hash_set",
         "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core:lib",
         "//tensorflow/core/grappler:grappler_item",
@@ -20,16 +47,15 @@ cc_library(
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:cluster",
         "//tensorflow/core/grappler/utils:topological_sort",
-        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
         "//tensorflow/core:lib_internal",
     ] + tf_protos_all(),
+    alwayslink = 1,
 )
 
 tf_cc_test(
     name = "filter_fusion_test",
     srcs = ["filter_fusion_test.cc"],
-    visibility = ["//visibility:public"],
     deps = [
         ":filter_fusion",
         ":graph_test_utils",
@@ -48,7 +74,6 @@ cc_library(
     hdrs = [
         "fusion_utils.h",
     ],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_utils",
         ":function_utils",
@@ -60,6 +85,7 @@ cc_library(
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/kernels:functional_ops",
         "//tensorflow/core/kernels:control_flow_ops",
+        "//tensorflow/core:control_flow_ops_op_lib",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
         "//tensorflow/core:lib_internal",
     ] + tf_protos_all(),
@@ -68,7 +94,6 @@ cc_library(
 tf_cc_test(
     name = "fusion_utils_test",
     srcs = ["fusion_utils_test.cc"],
-    visibility = ["//visibility:public"],
     deps = [
         ":function_utils",
         ":fusion_utils",
@@ -87,7 +112,6 @@ cc_library(
     hdrs = [
         "function_utils.h",
     ],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_utils",
         "//tensorflow/core:framework",
@@ -101,19 +125,19 @@ cc_library(
 tf_cc_test(
     name = "function_utils_test",
     srcs = ["function_utils_test.cc"],
-    visibility = ["//visibility:public"],
     deps = [
         ":function_utils",
+        ":graph_utils",
         "//tensorflow/core:framework",
+        "//tensorflow/core:ops",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
         "//tensorflow/core/grappler:grappler_item",
-        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
         "//tensorflow/core/kernels:cast_op",
         "//tensorflow/tools/graph_transforms:transform_utils",
-    ],
+    ] + tf_protos_all(),
 )
 
 cc_library(
@@ -122,7 +146,6 @@ cc_library(
     hdrs = [
         "graph_utils.h",
     ],
-    visibility = ["//visibility:public"],
     deps = [
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
@@ -136,7 +159,6 @@ cc_library(
 tf_cc_test(
     name = "graph_utils_test",
     srcs = ["graph_utils_test.cc"],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_utils",
         "//tensorflow/core:core_cpu",
@@ -145,7 +167,6 @@ tf_cc_test(
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
         "//tensorflow/core/grappler:grappler_item",
-        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
     ],
 )
@@ -157,7 +178,6 @@ cc_library(
     hdrs = [
         "graph_test_utils.h",
     ],
-    visibility = ["//visibility:public"],
     deps = [
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -174,26 +194,26 @@ cc_library(
     hdrs = [
         "hoist_random_uniform.h",
     ],
-    visibility = ["//visibility:public"],
     deps = [
         ":function_utils",
         ":graph_utils",
+        ":optimizer_base",
+        "@com_google_absl//absl/container:flat_hash_set",
         "//tensorflow/core:lib",
         "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:cluster",
-        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
         "//tensorflow/core:lib_internal",
     ] + tf_protos_all(),
+    alwayslink = 1,
 )
 
 tf_cc_test(
     name = "hoist_random_uniform_test",
     srcs = ["hoist_random_uniform_test.cc"],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_test_utils",
         ":graph_utils",
@@ -212,42 +232,54 @@ cc_library(
     hdrs = [
         "latency_all_edges.h",
     ],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_utils",
+        ":optimizer_base",
         "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core:lib",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:cluster",
-        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
     ] + tf_protos_all(),
+    alwayslink = 1,
+)
+
+tf_cc_test(
+    name = "latency_all_edges_test",
+    srcs = ["latency_all_edges_test.cc"],
+    deps = [
+        ":graph_utils",
+        ":latency_all_edges",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/grappler:grappler_item",
+    ],
 )
 
 cc_library(
     name = "make_numa_aware",
     srcs = ["make_numa_aware.cc"],
     hdrs = ["make_numa_aware.h"],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_utils",
+        ":optimizer_base",
+        "@com_google_absl//absl/container:flat_hash_set",
         "//tensorflow/core/grappler:mutable_graph_view",
-        "//tensorflow/core:lib",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
-        "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:cluster",
-        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
     ] + tf_protos_all(),
+    alwayslink = 1,
 )
 
 tf_cc_test(
     name = "make_numa_aware_test",
     srcs = ["make_numa_aware_test.cc"],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_test_utils",
         ":graph_utils",
@@ -264,21 +296,20 @@ cc_library(
     name = "make_sloppy",
     srcs = ["make_sloppy.cc"],
     hdrs = ["make_sloppy.h"],
-    visibility = ["//visibility:public"],
     deps = [
+        ":optimizer_base",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler/clusters:cluster",
-        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
     ] + tf_protos_all(),
+    alwayslink = 1,
 )
 
 tf_cc_test(
     name = "make_sloppy_test",
     srcs = ["make_sloppy_test.cc"],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_test_utils",
         ":graph_utils",
@@ -297,24 +328,24 @@ cc_library(
     hdrs = [
         "map_and_batch_fusion.h",
     ],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_utils",
+        ":optimizer_base",
+        "@com_google_absl//absl/container:flat_hash_set",
         "//tensorflow/core:lib",
         "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:cluster",
-        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
     ] + tf_protos_all(),
+    alwayslink = 1,
 )
 
 tf_cc_test(
     name = "map_and_batch_fusion_test",
     srcs = ["map_and_batch_fusion_test.cc"],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_utils",
         ":map_and_batch_fusion",
@@ -331,27 +362,27 @@ cc_library(
     hdrs = [
         "map_and_filter_fusion.h",
     ],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_utils",
         ":fusion_utils",
+        ":optimizer_base",
+        "@com_google_absl//absl/container:flat_hash_set",
         "//tensorflow/core:lib",
         "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:cluster",
-        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
         "//tensorflow/core/grappler/utils:topological_sort",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
         "//tensorflow/core:lib_internal",
     ] + tf_protos_all(),
+    alwayslink = 1,
 )
 
 tf_cc_test(
     name = "map_and_filter_fusion_test",
     srcs = ["map_and_filter_fusion_test.cc"],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_test_utils",
         ":graph_utils",
@@ -370,10 +401,11 @@ cc_library(
     hdrs = [
         "map_fusion.h",
     ],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_utils",
         ":fusion_utils",
+        ":optimizer_base",
+        "@com_google_absl//absl/container:flat_hash_set",
         "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core:lib",
         "//tensorflow/core/grappler:grappler_item",
@@ -381,20 +413,20 @@ cc_library(
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:cluster",
         "//tensorflow/core/grappler/utils:topological_sort",
-        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
         "//tensorflow/core:lib_internal",
     ] + tf_protos_all(),
+    alwayslink = 1,
 )
 
 tf_cc_test(
     name = "map_fusion_test",
     srcs = ["map_fusion_test.cc"],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_test_utils",
         ":graph_utils",
         ":map_fusion",
+        "//tensorflow/core:control_flow_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
@@ -410,25 +442,24 @@ cc_library(
     hdrs = [
         "map_parallelization.h",
     ],
-    visibility = ["//visibility:public"],
     deps = [
+        ":function_utils",
         ":graph_utils",
+        ":optimizer_base",
+        "@com_google_absl//absl/container:flat_hash_set",
         "//tensorflow/core/grappler:mutable_graph_view",
-        "//tensorflow/core:lib",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:cluster",
-        "//tensorflow/core/grappler/utils:topological_sort",
-        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
     ] + tf_protos_all(),
+    alwayslink = 1,
 )
 
 tf_cc_test(
     name = "map_parallelization_test",
     srcs = ["map_parallelization_test.cc"],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_test_utils",
         ":graph_utils",
@@ -447,62 +478,111 @@ cc_library(
     hdrs = [
         "map_vectorization.h",
     ],
-    visibility = ["//visibility:public"],
     deps = [
         ":function_utils",
         ":graph_utils",
+        ":optimizer_base",
         ":vectorization_utils",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:cluster",
-        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
         "//tensorflow/core:lib_internal",
     ] + tf_protos_all(),
+    alwayslink = 1,
 )
 
 tf_cc_test(
     name = "map_vectorization_test",
     srcs = ["map_vectorization_test.cc"],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_utils",
         ":map_vectorization",
+        "//tensorflow/core:array_ops_op_lib",
+        "//tensorflow/core:bitwise_ops_op_lib",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:direct_session",
         "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:spectral_ops_op_lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
         "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler/utils:topological_sort",
+        "//tensorflow/core/kernels:math",
+        "//tensorflow/core/kernels/data",
     ],
 )
 
+cc_library(
+    name = "meta_optimizer",
+    srcs = ["meta_optimizer.cc"],
+    hdrs = ["meta_optimizer.h"],
+    deps = [
+        "@com_google_absl//absl/container:flat_hash_map",
+        "//tensorflow/core/grappler/clusters:cluster",
+        "//tensorflow/core/grappler/optimizers:arithmetic_optimizer",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
+        "//tensorflow/core/grappler/optimizers:dependency_optimizer",
+        "//tensorflow/core/grappler/optimizers:function_optimizer",
+        "//tensorflow/core/grappler/optimizers:model_pruner",
+        "//tensorflow/core/grappler/optimizers:shape_optimizer",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:ptr_util",
+    ] + tf_protos_all(),
+    alwayslink = 1,
+)
+
+cc_library(
+    name = "rebatch",
+    srcs = ["rebatch.cc"],
+    hdrs = ["rebatch.h"],
+    deps = [
+        ":graph_utils",
+        ":optimizer_base",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "//tensorflow/core:framework",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:mutable_graph_view",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
+        "//tensorflow/core/grappler/utils:functions",
+        "//tensorflow/core:lib",
+    ] + tf_protos_all(),
+    alwayslink = 1,
+)
+
 cc_library(
     name = "noop_elimination",
     srcs = ["noop_elimination.cc"],
     hdrs = [
         "noop_elimination.h",
     ],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_utils",
+        ":optimizer_base",
+        "@com_google_absl//absl/container:flat_hash_set",
         "//tensorflow/core:lib",
         "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:cluster",
-        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
     ] + tf_protos_all(),
+    alwayslink = 1,
 )
 
 tf_cc_test(
     name = "noop_elimination_test",
     srcs = ["noop_elimination_test.cc"],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_utils",
         ":noop_elimination",
@@ -513,30 +593,42 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "optimizer_base",
+    srcs = ["optimizer_base.cc"],
+    hdrs = [
+        "optimizer_base.h",
+    ],
+    deps = [
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
+    ],
+)
+
 cc_library(
     name = "shuffle_and_repeat_fusion",
     srcs = ["shuffle_and_repeat_fusion.cc"],
     hdrs = [
         "shuffle_and_repeat_fusion.h",
     ],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_utils",
+        ":optimizer_base",
+        "@com_google_absl//absl/container:flat_hash_set",
         "//tensorflow/core:lib",
         "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:cluster",
-        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
     ] + tf_protos_all(),
+    alwayslink = 1,
 )
 
 tf_cc_test(
     name = "shuffle_and_repeat_fusion_test",
     srcs = ["shuffle_and_repeat_fusion_test.cc"],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_utils",
         ":shuffle_and_repeat_fusion",
@@ -547,47 +639,12 @@ tf_cc_test(
     ],
 )
 
-cc_library(
-    name = "data",
-    visibility = ["//visibility:public"],
-    deps = [
-        ":filter_fusion",
-        ":hoist_random_uniform",
-        ":latency_all_edges",
-        ":make_numa_aware",
-        ":make_sloppy",
-        ":map_and_batch_fusion",
-        ":map_and_filter_fusion",
-        ":map_fusion",
-        ":map_parallelization",
-        ":map_vectorization",
-        ":noop_elimination",
-        ":shuffle_and_repeat_fusion",
-    ],
-    alwayslink = 1,
-)
-
-tf_cc_test(
-    name = "latency_all_edges_test",
-    srcs = ["latency_all_edges_test.cc"],
-    deps = [
-        ":graph_utils",
-        ":latency_all_edges",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-        "//tensorflow/core:testlib",
-        "//tensorflow/core/grappler:grappler_item",
-    ],
-)
-
 cc_library(
     name = "vectorization_utils",
     srcs = ["vectorization_utils.cc"],
     hdrs = [
         "vectorization_utils.h",
     ],
-    visibility = ["//visibility:public"],
     deps = [
         ":function_utils",
         ":graph_utils",
@@ -608,7 +665,6 @@ cc_library(
 tf_cc_test(
     name = "vectorization_utils_test",
     srcs = ["vectorization_utils_test.cc"],
-    visibility = ["//visibility:public"],
     deps = [
         ":graph_utils",
         ":function_utils",
@@ -620,16 +676,27 @@ tf_cc_test(
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
         "//tensorflow/core/grappler:grappler_item",
-        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
         "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
         # For ops we need registered
         "//tensorflow/core/kernels/data:dataset_ops",
+        "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core/kernels:cast_op",
         "//tensorflow/core/kernels:cwise_op",
         "//tensorflow/core/kernels:logging_ops",
+        "//tensorflow/core:logging_ops_op_lib",
         "//tensorflow/core/kernels:math",
+        "//tensorflow/core:spectral_ops_op_lib",
+        "//tensorflow/core:bitwise_ops_op_lib",
+        "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core/kernels:nn",
+        "//tensorflow/core:sendrecv_ops_op_lib",
+        "//tensorflow/core:no_op_op_lib",
+        "//tensorflow/core:nn_ops_op_lib",
+        "//tensorflow/core:mkl_nn_ops_op_lib",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core/kernels:parsing",
+        "//tensorflow/core:parsing_ops_op_lib",
         "//tensorflow/tools/graph_transforms:transform_utils",
     ] + tf_protos_all(),
 )
diff --git a/tensorflow/core/grappler/optimizers/data/filter_fusion.cc b/tensorflow/core/grappler/optimizers/data/filter_fusion.cc
index 89b568ecf161cda08f1b71b369c3edb1d43f2a7f..7a20b8042bf27b4151e7063dad1e2b188ca2d3a4 100644
--- a/tensorflow/core/grappler/optimizers/data/filter_fusion.cc
+++ b/tensorflow/core/grappler/optimizers/data/filter_fusion.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/data/filter_fusion.h"
 
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
@@ -57,14 +58,16 @@ NodeDef MakeFusedFilterNode(const NodeDef& first_filter_node,
 
 }  // namespace
 
-Status FilterFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
-                              GraphDef* output) {
+Status FilterFusion::OptimizeAndCollectStats(Cluster* cluster,
+                                             const GrapplerItem& item,
+                                             GraphDef* output,
+                                             OptimizationStats* stats) {
   GraphDef sorted_old_graph = item.graph;
   TF_RETURN_IF_ERROR(TopologicalSort(&sorted_old_graph));
   *output = sorted_old_graph;
 
   MutableGraphView graph(output);
-  std::set<string> nodes_to_delete;
+  absl::flat_hash_set<string> nodes_to_delete;
   FunctionLibraryDefinition function_library(OpRegistry::Global(),
                                              output->library());
 
@@ -109,7 +112,8 @@ Status FilterFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
     const auto* fused_filter_node = graph.AddNode(MakeFusedFilterNode(
         *first_filter_node, *second_filter_node, *fused_predicate, &graph));
 
-    graph.UpdateFanouts(second_filter_node->name(), fused_filter_node->name());
+    TF_RETURN_IF_ERROR(graph.UpdateFanouts(second_filter_node->name(),
+                                           fused_filter_node->name()));
 
     // TODO(prazek): we should run some optimizations on the fused filter
     // functions, or make sure that optimization passes run after filter
@@ -119,9 +123,10 @@ Status FilterFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
     // they are not used anymore.
     nodes_to_delete.insert(first_filter_node->name());
     nodes_to_delete.insert(second_filter_node->name());
+    stats->num_changes++;
   }
 
-  graph.DeleteNodes(nodes_to_delete);
+  TF_RETURN_IF_ERROR(graph.DeleteNodes(nodes_to_delete));
   return Status::OK();
 }
 
@@ -132,5 +137,5 @@ void FilterFusion::Feedback(Cluster* cluster, const GrapplerItem& item,
 
 REGISTER_GRAPH_OPTIMIZER_AS(FilterFusion, "filter_fusion");
 
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/filter_fusion.h b/tensorflow/core/grappler/optimizers/data/filter_fusion.h
index 91a0364a46121aefbd7140ef5fc0a72291c5bf82..ac0326c0ec24bea74d0473ef8ca2fb95cb97e4c8 100644
--- a/tensorflow/core/grappler/optimizers/data/filter_fusion.h
+++ b/tensorflow/core/grappler/optimizers/data/filter_fusion.h
@@ -16,13 +16,13 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_FILTER_FUSION_H_
 #define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_FILTER_FUSION_H_
 
-#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h"
 
 namespace tensorflow {
 namespace grappler {
 
 // This optimization fuses filter transformations.
-class FilterFusion : public CustomGraphOptimizer {
+class FilterFusion : public TFDataOptimizerBase {
  public:
   FilterFusion() = default;
   ~FilterFusion() override = default;
@@ -34,14 +34,15 @@ class FilterFusion : public CustomGraphOptimizer {
     return Status::OK();
   }
 
-  Status Optimize(Cluster* cluster, const GrapplerItem& item,
-                  GraphDef* output) override;
+  Status OptimizeAndCollectStats(Cluster* cluster, const GrapplerItem& item,
+                                 GraphDef* output,
+                                 OptimizationStats* stats) override;
 
   void Feedback(Cluster* cluster, const GrapplerItem& item,
                 const GraphDef& optimize_output, double result) override;
 };
 
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace grappler
+}  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_FILTER_FUSION_H_
diff --git a/tensorflow/core/grappler/optimizers/data/function_utils.cc b/tensorflow/core/grappler/optimizers/data/function_utils.cc
index 311df15bc2728a57a66e58cbe3217d3cf03e44dd..20536910db12607bcef9155d739251648696a0c7 100644
--- a/tensorflow/core/grappler/optimizers/data/function_utils.cc
+++ b/tensorflow/core/grappler/optimizers/data/function_utils.cc
@@ -171,6 +171,57 @@ void SetUniqueFunctionNodeName(StringPiece prefix, FunctionDef* function,
   node->set_name(std::move(name));
 }
 
-}  // end namespace function_utils
-}  // end namespace grappler
-}  // end namespace tensorflow
+bool IsFunctionStateful(const FunctionLibraryDefinition& library,
+                        const FunctionDef& function_def, bool skip_assert) {
+  if (!function_def.signature().is_stateful()) return false;
+
+  for (const NodeDef& node_def : function_def.node_def()) {
+    if (IsNodeStateful(library, node_def, skip_assert)) return true;
+  }
+  return false;
+}
+
+bool IsNodeStateful(const FunctionLibraryDefinition& library,
+                    const NodeDef& node, bool skip_assert) {
+  const OpDef* op_def;
+  Status s = OpRegistry::Global()->LookUpOpDef(node.op(), &op_def);
+
+  if (!s.ok()) return true;
+
+  if (!op_def->is_stateful()) return false;
+
+  if (skip_assert && op_def->name() == "Assert") {
+    return false;
+  }
+
+  if (op_def->name() == "If") {
+    const FunctionDef* then_func =
+        library.Find(node.attr().at("then_branch").func().name());
+    const FunctionDef* else_func =
+        library.Find(node.attr().at("else_branch").func().name());
+    if ((then_func != nullptr &&
+         !IsFunctionStateful(library, *then_func, skip_assert)) &&
+        (else_func != nullptr &&
+         !IsFunctionStateful(library, *else_func, skip_assert))) {
+      return false;
+    }
+  }
+
+  if (op_def->name() == "While") {
+    const FunctionDef* cond_func =
+        library.Find(node.attr().at("cond").func().name());
+    const FunctionDef* body_func =
+        library.Find(node.attr().at("body").func().name());
+    if ((cond_func != nullptr &&
+         !IsFunctionStateful(library, *cond_func, skip_assert)) &&
+        (body_func != nullptr &&
+         !IsFunctionStateful(library, *body_func, skip_assert))) {
+      return false;
+    }
+  }
+  return true;
+}
+
+}  // namespace function_utils
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/function_utils.h b/tensorflow/core/grappler/optimizers/data/function_utils.h
index d4ce824652beaca77198a87a6fcb5c342a35b4b1..79271e8ad0c330318ed4538c46158967758e5747 100644
--- a/tensorflow/core/grappler/optimizers/data/function_utils.h
+++ b/tensorflow/core/grappler/optimizers/data/function_utils.h
@@ -101,6 +101,22 @@ int FindFunctionNodeWithOp(StringPiece op, const FunctionDef& function);
 void SetUniqueFunctionNodeName(StringPiece prefix, FunctionDef* function,
                                NodeDef* node);
 
+// Checks if the function is stateful by checking the function graph for
+// stateful ops. Because the "If" and "While" ops are conservatively marked as
+// stateful, the check recurses into their graph to determine whether they are
+// actually stateful. The `skip_assert` argument determines whether the "Assert"
+// op should be treated as stateful or not.
+bool IsFunctionStateful(const FunctionLibraryDefinition& library,
+                        const FunctionDef& function_def,
+                        bool skip_assert = false);
+
+// Checks if the node is stateful. Because the "If" or "While" ops are
+// conservatively marked as stateful, the check recurses into their graph to
+// determine whether they are actually stateful. The `skip_assert` argument
+// determines whether the "Assert" op  should be treated as stateful or not.
+bool IsNodeStateful(const FunctionLibraryDefinition& library,
+                    const NodeDef& node, bool skip_assert = false);
+
 }  // end namespace function_utils
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/function_utils_test.cc b/tensorflow/core/grappler/optimizers/data/function_utils_test.cc
index 3739e20eb1444fa24ec5553b8a133d8d96c5d714..8ae0cde4cd1ba20c8259ae9ac7e7a767f7b542e4 100644
--- a/tensorflow/core/grappler/optimizers/data/function_utils_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/function_utils_test.cc
@@ -16,6 +16,8 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/data/function_utils.h"
 
 #include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
@@ -158,6 +160,692 @@ TEST(FunctionUtilsTest, AddNodeToFunctionDef) {
   }
 }
 
+// Graph containing function with "If" and "Assert" Op.
+/*
+  @eager_function.defun
+  def test_function():
+    pred = constant_op.constant(True)
+
+    def fn1():
+      return control_flow_ops.no_op()
+
+    def fn2():
+      return control_flow_ops.Assert(False, ["Wrong branch!!!"])
+
+    return control_flow_ops.cond(pred, fn1, fn2)
+
+  r = test_function()
+*/
+// Following proto is generated in python using the above code block, to
+// regenerate get the graph_def from the default graph/specified graph for the
+// code block (e.g ops.get_default_graph.as_graph_def()).
+constexpr char kCondGraphProto[] = R"proto(
+  node {
+    name: "StatefulPartitionedCall"
+    op: "StatefulPartitionedCall"
+    attr {
+      key: "Tin"
+      value { list {} }
+    }
+    attr {
+      key: "Tout"
+      value { list { type: DT_BOOL } }
+    }
+    attr {
+      key: "_gradient_op_type"
+      value { s: "PartitionedCall-20" }
+    }
+    attr {
+      key: "config"
+      value { s: "" }
+    }
+    attr {
+      key: "config_proto"
+      value { s: "" }
+    }
+    attr {
+      key: "executor_type"
+      value { s: "" }
+    }
+    attr {
+      key: "f"
+      value { func { name: "__inference_test_function_19" } }
+    }
+  }
+  library {
+    function {
+      signature {
+        name: "cond_true_3"
+        input_arg { name: "identity_const" type: DT_BOOL }
+        output_arg { name: "identity_1" type: DT_BOOL }
+      }
+      node_def { name: "NoOp" op: "NoOp" }
+      node_def {
+        name: "Identity"
+        op: "Identity"
+        input: "identity_const"
+        input: "^NoOp"
+        attr {
+          key: "T"
+          value { type: DT_BOOL }
+        }
+      }
+      node_def {
+        name: "Identity_1"
+        op: "Identity"
+        input: "Identity:output:0"
+        attr {
+          key: "T"
+          value { type: DT_BOOL }
+        }
+      }
+      ret { key: "identity_1" value: "Identity_1:output:0" }
+    }
+    function {
+      signature {
+        name: "cond_false_4"
+        input_arg { name: "identity_const" type: DT_BOOL }
+        output_arg { name: "identity_1" type: DT_BOOL }
+        is_stateful: true
+      }
+      node_def {
+        name: "Assert/Const"
+        op: "Const"
+        attr {
+          key: "dtype"
+          value { type: DT_STRING }
+        }
+        attr {
+          key: "value"
+          value {
+            tensor {
+              dtype: DT_STRING
+              tensor_shape {}
+              string_val: "Wrong branch!!!"
+            }
+          }
+        }
+      }
+      node_def {
+        name: "Assert/Assert/condition"
+        op: "Const"
+        attr {
+          key: "dtype"
+          value { type: DT_BOOL }
+        }
+        attr {
+          key: "value"
+          value {
+            tensor {
+              dtype: DT_BOOL
+              tensor_shape {}
+              bool_val: false
+            }
+          }
+        }
+      }
+      node_def {
+        name: "Assert/Assert/data_0"
+        op: "Const"
+        attr {
+          key: "dtype"
+          value { type: DT_STRING }
+        }
+        attr {
+          key: "value"
+          value {
+            tensor {
+              dtype: DT_STRING
+              tensor_shape {}
+              string_val: "Wrong branch!!!"
+            }
+          }
+        }
+      }
+      node_def {
+        name: "Assert/Assert"
+        op: "Assert"
+        input: "Assert/Assert/condition:output:0"
+        input: "Assert/Assert/data_0:output:0"
+        attr {
+          key: "T"
+          value { list { type: DT_STRING } }
+        }
+        attr {
+          key: "summarize"
+          value { i: 3 }
+        }
+      }
+      node_def {
+        name: "Identity"
+        op: "Identity"
+        input: "identity_const"
+        input: "^Assert/Assert"
+        attr {
+          key: "T"
+          value { type: DT_BOOL }
+        }
+      }
+      node_def {
+        name: "Identity_1"
+        op: "Identity"
+        input: "Identity:output:0"
+        input: "^Assert/Assert"
+        attr {
+          key: "T"
+          value { type: DT_BOOL }
+        }
+      }
+      ret { key: "identity_1" value: "Identity_1:output:0" }
+    }
+    function {
+      signature {
+        name: "__inference_test_function_19"
+        output_arg { name: "identity" type: DT_BOOL }
+        is_stateful: true
+      }
+      node_def {
+        name: "Const"
+        op: "Const"
+        attr {
+          key: "dtype"
+          value { type: DT_BOOL }
+        }
+        attr {
+          key: "value"
+          value {
+            tensor {
+              dtype: DT_BOOL
+              tensor_shape {}
+              bool_val: true
+            }
+          }
+        }
+      }
+      node_def {
+        name: "cond"
+        op: "If"
+        input: "Const:output:0"
+        input: "Const:output:0"
+        attr {
+          key: "Tcond"
+          value { type: DT_BOOL }
+        }
+        attr {
+          key: "Tin"
+          value { list { type: DT_BOOL } }
+        }
+        attr {
+          key: "Tout"
+          value { list { type: DT_BOOL } }
+        }
+        attr {
+          key: "_lower_using_switch_merge"
+          value { b: true }
+        }
+        attr {
+          key: "else_branch"
+          value { func { name: "cond_false_4" } }
+        }
+        attr {
+          key: "output_shapes"
+          value { list { shape {} } }
+        }
+        attr {
+          key: "then_branch"
+          value { func { name: "cond_true_3" } }
+        }
+      }
+      node_def {
+        name: "cond/Identity"
+        op: "Identity"
+        input: "cond:output:0"
+        attr {
+          key: "T"
+          value { type: DT_BOOL }
+        }
+      }
+      node_def {
+        name: "Identity"
+        op: "Identity"
+        input: "cond/Identity:output:0"
+        input: "^cond"
+        attr {
+          key: "T"
+          value { type: DT_BOOL }
+        }
+      }
+      ret { key: "identity" value: "Identity:output:0" }
+    }
+  }
+  versions { producer: 27 min_consumer: 12 })proto";
+
+// Graph containing function with "While" Op in python.
+/*
+  @eager_function.defun
+  def test_function():
+    return control_flow_ops.while_loop(
+        lambda i: i < 3, lambda i: i + 1, [0], maximum_iterations=1)
+
+  r = test_function()
+*/
+// Following proto is generated in python using the above code block, to
+// regenerate get the graph_def from the default graph/specified graph for the
+// code block (e.g ops.get_default_graph.as_graph_def()).
+constexpr char kWhileGraphProto[] = R"proto(
+  node {
+    name: "StatefulPartitionedCall"
+    op: "StatefulPartitionedCall"
+    attr {
+      key: "Tin"
+      value { list {} }
+    }
+    attr {
+      key: "Tout"
+      value { list { type: DT_INT32 } }
+    }
+    attr {
+      key: "_gradient_op_type"
+      value { s: "PartitionedCall-35" }
+    }
+    attr {
+      key: "config"
+      value { s: "" }
+    }
+    attr {
+      key: "config_proto"
+      value { s: "" }
+    }
+    attr {
+      key: "executor_type"
+      value { s: "" }
+    }
+    attr {
+      key: "f"
+      value { func { name: "__inference_test_function_34" } }
+    }
+  }
+  library {
+    function {
+      signature {
+        name: "while_body_5"
+        input_arg { name: "while_loop_counter" type: DT_INT32 }
+        input_arg { name: "const" type: DT_INT32 }
+        input_arg { name: "maximum_iterations" type: DT_INT32 }
+        output_arg { name: "identity" type: DT_INT32 }
+        output_arg { name: "identity_1" type: DT_INT32 }
+        output_arg { name: "identity_2" type: DT_INT32 }
+      }
+      node_def {
+        name: "add/y"
+        op: "Const"
+        attr {
+          key: "dtype"
+          value { type: DT_INT32 }
+        }
+        attr {
+          key: "value"
+          value {
+            tensor {
+              dtype: DT_INT32
+              tensor_shape {}
+              int_val: 1
+            }
+          }
+        }
+      }
+      node_def {
+        name: "add"
+        op: "Add"
+        input: "const"
+        input: "add/y:output:0"
+        attr {
+          key: "T"
+          value { type: DT_INT32 }
+        }
+      }
+      node_def {
+        name: "add_1/y"
+        op: "Const"
+        attr {
+          key: "dtype"
+          value { type: DT_INT32 }
+        }
+        attr {
+          key: "value"
+          value {
+            tensor {
+              dtype: DT_INT32
+              tensor_shape {}
+              int_val: 1
+            }
+          }
+        }
+      }
+      node_def {
+        name: "add_1"
+        op: "Add"
+        input: "while_loop_counter"
+        input: "add_1/y:output:0"
+        attr {
+          key: "T"
+          value { type: DT_INT32 }
+        }
+      }
+      node_def {
+        name: "Identity"
+        op: "Identity"
+        input: "add_1:z:0"
+        attr {
+          key: "T"
+          value { type: DT_INT32 }
+        }
+      }
+      node_def {
+        name: "Identity_1"
+        op: "Identity"
+        input: "add:z:0"
+        attr {
+          key: "T"
+          value { type: DT_INT32 }
+        }
+      }
+      node_def {
+        name: "Identity_2"
+        op: "Identity"
+        input: "maximum_iterations"
+        attr {
+          key: "T"
+          value { type: DT_INT32 }
+        }
+      }
+      ret { key: "identity" value: "Identity:output:0" }
+      ret { key: "identity_1" value: "Identity_1:output:0" }
+      ret { key: "identity_2" value: "Identity_2:output:0" }
+    }
+    function {
+      signature {
+        name: "__inference_test_function_34"
+        output_arg { name: "identity" type: DT_INT32 }
+        is_stateful: true
+      }
+      node_def {
+        name: "maximum_iterations"
+        op: "Const"
+        attr {
+          key: "dtype"
+          value { type: DT_INT32 }
+        }
+        attr {
+          key: "value"
+          value {
+            tensor {
+              dtype: DT_INT32
+              tensor_shape {}
+              int_val: 1
+            }
+          }
+        }
+      }
+      node_def {
+        name: "Const"
+        op: "Const"
+        attr {
+          key: "dtype"
+          value { type: DT_INT32 }
+        }
+        attr {
+          key: "value"
+          value {
+            tensor {
+              dtype: DT_INT32
+              tensor_shape {}
+              int_val: 0
+            }
+          }
+        }
+      }
+      node_def {
+        name: "while/loop_counter"
+        op: "Const"
+        attr {
+          key: "dtype"
+          value { type: DT_INT32 }
+        }
+        attr {
+          key: "value"
+          value {
+            tensor {
+              dtype: DT_INT32
+              tensor_shape {}
+              int_val: 0
+            }
+          }
+        }
+      }
+      node_def {
+        name: "while"
+        op: "While"
+        input: "while/loop_counter:output:0"
+        input: "Const:output:0"
+        input: "maximum_iterations:output:0"
+        attr {
+          key: "T"
+          value { list { type: DT_INT32 type: DT_INT32 type: DT_INT32 } }
+        }
+        attr {
+          key: "_lower_using_switch_merge"
+          value { b: true }
+        }
+        attr {
+          key: "body"
+          value { func { name: "while_body_5" } }
+        }
+        attr {
+          key: "cond"
+          value { func { name: "while_cond_4" } }
+        }
+        attr {
+          key: "output_shapes"
+          value {
+            list {
+              shape {}
+              shape {}
+              shape {}
+            }
+          }
+        }
+      }
+      node_def {
+        name: "while/Identity"
+        op: "Identity"
+        input: "while:output:0"
+        attr {
+          key: "T"
+          value { type: DT_INT32 }
+        }
+      }
+      node_def {
+        name: "while/Identity_1"
+        op: "Identity"
+        input: "while:output:1"
+        attr {
+          key: "T"
+          value { type: DT_INT32 }
+        }
+      }
+      node_def {
+        name: "while/Identity_2"
+        op: "Identity"
+        input: "while:output:2"
+        attr {
+          key: "T"
+          value { type: DT_INT32 }
+        }
+      }
+      node_def {
+        name: "Identity"
+        op: "Identity"
+        input: "while/Identity_1:output:0"
+        input: "^while"
+        attr {
+          key: "T"
+          value { type: DT_INT32 }
+        }
+      }
+      ret { key: "identity" value: "Identity:output:0" }
+    }
+    function {
+      signature {
+        name: "while_cond_4"
+        input_arg { name: "while_loop_counter" type: DT_INT32 }
+        input_arg { name: "const" type: DT_INT32 }
+        input_arg { name: "less_maximum_iterations" type: DT_INT32 }
+        output_arg { name: "identity" type: DT_BOOL }
+      }
+      node_def {
+        name: "Less"
+        op: "Less"
+        input: "while_loop_counter"
+        input: "less_maximum_iterations"
+        attr {
+          key: "T"
+          value { type: DT_INT32 }
+        }
+      }
+      node_def {
+        name: "Less_1/y"
+        op: "Const"
+        attr {
+          key: "dtype"
+          value { type: DT_INT32 }
+        }
+        attr {
+          key: "value"
+          value {
+            tensor {
+              dtype: DT_INT32
+              tensor_shape {}
+              int_val: 3
+            }
+          }
+        }
+      }
+      node_def {
+        name: "Less_1"
+        op: "Less"
+        input: "const"
+        input: "Less_1/y:output:0"
+        attr {
+          key: "T"
+          value { type: DT_INT32 }
+        }
+      }
+      node_def {
+        name: "LogicalAnd"
+        op: "LogicalAnd"
+        input: "Less:z:0"
+        input: "Less_1:z:0"
+      }
+      node_def {
+        name: "Identity"
+        op: "Identity"
+        input: "LogicalAnd:z:0"
+        attr {
+          key: "T"
+          value { type: DT_BOOL }
+        }
+      }
+      ret { key: "identity" value: "Identity:output:0" }
+    }
+  }
+  versions { producer: 27 min_consumer: 12 })proto";
+
+// TODO(shivaniagrawal): split the test into multiple tests for better
+// readability and add full coverage i.e. add/separate out the tests for all
+// branches of IsNodeStateful and IsFunctionStateful:
+// - test for IsNodeStateful for Cond that has a stateful branch
+// - test for IsNodeStateful for Cond that does not have a stateful branches
+// - test for IsNodeStateful for While that has a stateful branch
+// - test for IsNodeStateful for While that does not have a stateful branches
+// - test for IsNodeStateful for Assert
+// - test for IsNodeStateful for a stateful op
+// - test for IsNodeStateful for a stateless op
+//
+// - test for IsFunctionStateful for a function that contains a Cond
+// - test for IsFunctionStateful for a function that contains a While
+// - test for IsFunctionStateful for a function that contains an Assert (and no
+//   other stateful op)
+// - test for IsFunctionStateful for a function that contains a stateful op
+//   other than Assert
+// - test for IsFunctionStateful for a function that does not contain a stateful
+//   op
+
+TEST(FunctionUtilsTest, IsFunctionStateful) {
+  GraphDef graph_def;
+  MutableGraphView graph(&graph_def);
+
+  NodeDef* nodeA = graph_utils::AddNode("", "A", {}, {}, &graph);
+  FunctionDef* function = graph_def.mutable_library()->add_function();
+  *function = test::function::XTimesTwo();
+
+  FunctionLibraryDefinition lib_def(OpRegistry::Global(),
+                                    *graph_def.mutable_library());
+
+  EXPECT_FALSE(IsFunctionStateful(lib_def, *function));
+
+  // Op "A" is not a registered Op.
+  EXPECT_TRUE(IsNodeStateful(lib_def, *nodeA));
+
+  // Get graph_def for the graph `kCondGraphProto`, graph with function
+  // containing "If" and "Assert" Op.
+
+  GraphDef graph_def_cond;
+  protobuf::TextFormat::ParseFromString(kCondGraphProto, &graph_def_cond);
+  FunctionLibraryDefinition cond_lib(OpRegistry::Global(),
+                                     graph_def_cond.library());
+
+  const FunctionDef* no_op_fnc = cond_lib.Find("cond_true_3");
+
+  EXPECT_FALSE(IsFunctionStateful(cond_lib, *no_op_fnc));
+  EXPECT_FALSE(IsFunctionStateful(cond_lib, *no_op_fnc, true));
+
+  const FunctionDef* assert_func = cond_lib.Find("cond_false_4");
+
+  EXPECT_TRUE(IsFunctionStateful(cond_lib, *assert_func));
+  EXPECT_FALSE(IsFunctionStateful(cond_lib, *assert_func, true));
+
+  EXPECT_TRUE(ContainsFunctionNodeWithOp("Const", *assert_func));
+  EXPECT_TRUE(ContainsFunctionNodeWithOp("Assert", *assert_func));
+
+  for (auto node : assert_func->node_def()) {
+    if (node.op() == "Const") {
+      EXPECT_FALSE(IsNodeStateful(lib_def, node));
+    }
+    if (node.op() == "Assert") {
+      EXPECT_TRUE(IsNodeStateful(lib_def, node));
+      EXPECT_FALSE(IsNodeStateful(lib_def, node, true));
+    }
+  }
+
+  const FunctionDef* cond_func = cond_lib.Find("__inference_test_function_19");
+
+  EXPECT_TRUE(IsFunctionStateful(cond_lib, *cond_func));
+  EXPECT_FALSE(IsFunctionStateful(cond_lib, *cond_func, true));
+
+  // Get graph def for the graph `kWhileGraphProto`, graph with function
+  // containing "While" Op.
+
+  GraphDef graph_def_while;
+  protobuf::TextFormat::ParseFromString(kWhileGraphProto, &graph_def_while);
+
+  FunctionLibraryDefinition while_lib(OpRegistry::Global(),
+                                      graph_def_while.library());
+  const FunctionDef* while_function =
+      while_lib.Find("__inference_test_function_34");
+  EXPECT_FALSE(IsFunctionStateful(while_lib, *while_function));
+  EXPECT_FALSE(IsFunctionStateful(while_lib, *while_function, true));
+}
 }  // namespace
 }  // namespace function_utils
 }  // namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/data/fusion_utils.cc b/tensorflow/core/grappler/optimizers/data/fusion_utils.cc
index b3bfee138ffd9254e4a28bf87906b543defb95bc..d5308ad31a87f3cb0d129721af899c52787de3f2 100644
--- a/tensorflow/core/grappler/optimizers/data/fusion_utils.cc
+++ b/tensorflow/core/grappler/optimizers/data/fusion_utils.cc
@@ -471,6 +471,6 @@ FunctionDef* FuseFunctions(
   return fused_function;
 }
 
-}  // end namespace fusion_utils
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace fusion_utils
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/graph_test_utils.cc b/tensorflow/core/grappler/optimizers/data/graph_test_utils.cc
index 9d8b388a3a8bca1fb560e5acc94d50f3d82ed30d..82ca0146b97c2503371042bc070611cefbc40678 100644
--- a/tensorflow/core/grappler/optimizers/data/graph_test_utils.cc
+++ b/tensorflow/core/grappler/optimizers/data/graph_test_utils.cc
@@ -42,7 +42,7 @@ NodeDef MakeMapAndBatchNode(StringPiece name, StringPiece input_node_name,
                             StringPiece function_name) {
   return test::function::NDef(
       name, "ExperimentalMapAndBatchDataset",
-      {string(input_node_name), "", string(batch_size_node_name),
+      {string(input_node_name), string(batch_size_node_name),
        string(num_parallel_calls_node_name), string(drop_remainder_node_name)},
       {{"f", FunctionDefHelper::FunctionRef(string(function_name))},
        {"Targuments", {}},
@@ -68,7 +68,7 @@ NodeDef MakeParallelInterleaveNode(StringPiece name,
                                    StringPiece function_name, bool sloppy) {
   return test::function::NDef(
       name, "ParallelInterleaveDatasetV2",
-      {string(input_node_name), "", string(cycle_length_node_name),
+      {string(input_node_name), string(cycle_length_node_name),
        string(block_length_node_name), string(num_parallel_calls_node_name)},
       {
           {"f", FunctionDefHelper::FunctionRef(string(function_name))},
@@ -107,6 +107,6 @@ NodeDef MakeParseExampleNode(StringPiece name, StringPiece input_node_name,
       });
 }
 
-}  // end namespace graph_tests_utils
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace graph_tests_utils
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/graph_test_utils.h b/tensorflow/core/grappler/optimizers/data/graph_test_utils.h
index a2707ee7b7f3888212f2402617d2063f1feb9c8d..3750e2d5cce66a6644eea69cac7531efb308d055 100644
--- a/tensorflow/core/grappler/optimizers/data/graph_test_utils.h
+++ b/tensorflow/core/grappler/optimizers/data/graph_test_utils.h
@@ -56,8 +56,8 @@ NodeDef MakeParseExampleNode(StringPiece name, StringPiece input_node_name,
                              StringPiece num_parallel_calls_node_name,
                              bool sloppy);
 
-}  // end namespace graph_tests_utils
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace graph_tests_utils
+}  // namespace grappler
+}  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_GRAPH_TEST_UTILS_H_
diff --git a/tensorflow/core/grappler/optimizers/data/graph_utils.cc b/tensorflow/core/grappler/optimizers/data/graph_utils.cc
index 90208c1fba6b089f57b303827cf1327ad43bf736..7bcc12c3a4c3cb92df8557d2a3d194397ccc3782 100644
--- a/tensorflow/core/grappler/optimizers/data/graph_utils.cc
+++ b/tensorflow/core/grappler/optimizers/data/graph_utils.cc
@@ -232,6 +232,13 @@ NodeDef* GetInputNode(const NodeDef& node, const MutableGraphView& graph) {
   return graph.GetRegularFanin(input_port).node;
 }
 
+NodeDef* GetInputNode(const NodeDef& node, const MutableGraphView& graph,
+                      int64 i) {
+  if (node.input_size() <= i) return nullptr;
+  MutableGraphView::InputPort input_port = graph.GetInputPort(node.name(), i);
+  return graph.GetRegularFanin(input_port).node;
+}
+
 void SetUniqueGraphNodeName(StringPiece prefix, GraphDef* graph,
                             NodeDef* node) {
   string name = string(prefix);
@@ -293,6 +300,6 @@ Status EnsureNodeNamesUnique(Graph* g) {
 
   return Status::OK();
 }
-}  // end namespace graph_utils
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace graph_utils
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/graph_utils.h b/tensorflow/core/grappler/optimizers/data/graph_utils.h
index d130fee2047e5be49857dea6ac6489f93088aa50..22298cc311b2659a8fcc4556bc147ee775c1c6cf 100644
--- a/tensorflow/core/grappler/optimizers/data/graph_utils.h
+++ b/tensorflow/core/grappler/optimizers/data/graph_utils.h
@@ -108,6 +108,10 @@ int FindGraphNodeWithOp(StringPiece op, const GraphDef& graph);
 // Gets the 0th input to a node in the graph.
 NodeDef* GetInputNode(const NodeDef& node, const MutableGraphView& graph);
 
+// Gets the ith input to a node in the graph.
+NodeDef* GetInputNode(const NodeDef& node, const MutableGraphView& graph,
+                      int64 i);
+
 // Returns the list of indices of all nodes with the given op or empty list if
 // no such node exists.
 std::vector<int> FindAllGraphNodesWithOp(const string& op,
@@ -140,8 +144,8 @@ void ConcatAttributeList(const string& attribute_name, const NodeDef& first,
 // and renaming nodes does not mutate any edges.
 Status EnsureNodeNamesUnique(Graph* g);
 
-}  // end namespace graph_utils
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace graph_utils
+}  // namespace grappler
+}  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_GRAPH_UTILS_H_
diff --git a/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc b/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc
index 5c0f03dca8774d64395c8bc0f2c1334a45bfe9dc..879cecd13d3e27249dca09d386200bc0a106ee65 100644
--- a/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc
@@ -109,7 +109,7 @@ TEST(GraphUtilsTest, ContainsGraphNodeWithName) {
   AddNode("A", "OpA", {}, {}, &graph);
   EXPECT_TRUE(ContainsGraphNodeWithName("A", *graph.graph()));
 
-  graph.DeleteNodes({"A"});
+  EXPECT_TRUE(graph.DeleteNodes({"A"}).ok());
   EXPECT_TRUE(!ContainsGraphNodeWithName("A", *graph.graph()));
 }
 
@@ -131,7 +131,7 @@ TEST(GraphUtilsTest, ContainsNodeWithOp) {
   AddNode("A", "OpA", {}, {}, &graph);
   EXPECT_TRUE(ContainsNodeWithOp("OpA", *graph.graph()));
 
-  graph.DeleteNodes({"A"});
+  EXPECT_TRUE(graph.DeleteNodes({"A"}).ok());
   EXPECT_TRUE(!ContainsNodeWithOp("OpA", *graph.graph()));
 }
 
@@ -143,7 +143,7 @@ TEST(GraphUtilsTest, FindGraphNodeWithName) {
   AddNode("A", "OpA", {}, {}, &graph);
   EXPECT_NE(FindGraphNodeWithName("A", *graph.graph()), -1);
 
-  graph.DeleteNodes({"A"});
+  EXPECT_TRUE(graph.DeleteNodes({"A"}).ok());
   EXPECT_EQ(FindGraphNodeWithName("A", *graph.graph()), -1);
 }
 
@@ -164,10 +164,10 @@ TEST(GraphUtilsTest, FindGraphNodeWithOp) {
 
   AddNode("A", "OpA", {}, {}, &graph);
   AddNode("B", "OpB", {"A"}, {}, &graph);
-  AddNode("A2", "OpA", {"B"}, {}, &graph);
+  AddNode("A2", "OpA", {"A"}, {}, &graph);
   EXPECT_EQ(FindGraphNodeWithOp("OpA", *graph.graph()), 0);
 
-  graph.DeleteNodes({"B"});
+  EXPECT_TRUE(graph.DeleteNodes({"B"}).ok());
   EXPECT_EQ(FindGraphNodeWithOp("OpB", *graph.graph()), -1);
   EXPECT_EQ(FindGraphNodeWithName("A2", *graph.graph()), 1);
 }
@@ -186,7 +186,7 @@ TEST(GraphUtilsTest, FindAllGraphNodesWithOp) {
   EXPECT_EQ(result_indices.at(0), 0);
   EXPECT_EQ(result_indices.at(1), 2);
 
-  graph.DeleteNodes({"A2"});
+  EXPECT_TRUE(graph.DeleteNodes({"A2"}).ok());
   std::vector<int> result_indices_new =
       FindAllGraphNodesWithOp("OpA", *graph.graph());
   EXPECT_EQ(result_indices_new.size(), 1);
@@ -201,7 +201,7 @@ TEST(GraphUtilsTest, SetUniqueGraphNodeName) {
   NodeDef* node2 = AddNode("", "A", {}, {}, &graph);
   EXPECT_NE(node1->name(), node2->name());
 
-  graph.DeleteNodes({node1->name()});
+  EXPECT_TRUE(graph.DeleteNodes({node1->name()}).ok());
   NodeDef* node3 = AddNode("", "A", {}, {}, &graph);
   EXPECT_NE(node2->name(), node3->name());
 }
@@ -228,6 +228,21 @@ TEST(GraphUtilsTest, GetInputNode) {
   EXPECT_EQ(GetInputNode(*node1, graph), nullptr);
 }
 
+TEST(GraphUtilsTest, GetIthInputNode) {
+  GraphDef graph_def;
+  MutableGraphView graph(&graph_def);
+
+  NodeDef* node1 = AddNode("", "A", {}, {}, &graph);
+  NodeDef* node2 = AddNode("", "A", {}, {}, &graph);
+  NodeDef* node3 = AddNode("", "A", {node1->name(), node2->name()}, {}, &graph);
+
+  EXPECT_EQ(GetInputNode(*node3, graph), node1);
+  EXPECT_EQ(GetInputNode(*node3, graph, 1), node2);
+  EXPECT_EQ(GetInputNode(*node3, graph, 0), node1);
+  EXPECT_EQ(GetInputNode(*node3, graph, 2), nullptr);
+  EXPECT_EQ(GetInputNode(*node1, graph), nullptr);
+}
+
 TEST(GraphUtilsTest, EnsureNodeNamesUnique) {
   Graph g(OpRegistry::Global());
 
diff --git a/tensorflow/core/grappler/optimizers/data/hoist_random_uniform.cc b/tensorflow/core/grappler/optimizers/data/hoist_random_uniform.cc
index 60755256d83d74287748125e18ccd8a63a1b4759..e29b620140236aa8852d7bd36799b99ce62c1f0d 100644
--- a/tensorflow/core/grappler/optimizers/data/hoist_random_uniform.cc
+++ b/tensorflow/core/grappler/optimizers/data/hoist_random_uniform.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/data/hoist_random_uniform.h"
 
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/op_def.pb.h"
@@ -173,7 +174,7 @@ const FunctionDef* MakeLessStatefulFunction(const FunctionDef& map_function,
   return stateless_function;
 }
 // This function returns true if function is stateful and has single
-// RandomUniform op and no other stateful ops except Assert.
+// RandomUniform op and no other stateful ops except Assert and If/While.
 // `is_stateful_after_hoisting` is set to true if RandomUniform is the only
 // stateful op and hoisting can be performed.
 bool CanHoistRandomUniform(const FunctionDef& map_function,
@@ -188,10 +189,10 @@ bool CanHoistRandomUniform(const FunctionDef& map_function,
   for (const auto& node : map_function.node_def()) {
     const OpDef* op_def;
     TF_CHECK_OK(library.LookUpOpDef(node.op(), &op_def));
-    // Skip stateless nodes and assert, as it does not actually have a state.
     if (!op_def->is_stateful()) continue;
 
-    if (op_def->name() == "Assert") {
+    if (!function_utils::IsNodeStateful(library, node, true)) {
+      // Skip ops that are marked stateful but are in fact not stateful.
       have_other_stateful_ops = true;
       continue;
     }
@@ -220,12 +221,14 @@ int NumberOfPlaceholders(const NodeDef& map_node) {
 
 }  // namespace
 
-Status HoistRandomUniform::Optimize(Cluster* cluster, const GrapplerItem& item,
-                                    GraphDef* output) {
+Status HoistRandomUniform::OptimizeAndCollectStats(Cluster* cluster,
+                                                   const GrapplerItem& item,
+                                                   GraphDef* output,
+                                                   OptimizationStats* stats) {
   *output = item.graph;
 
   MutableGraphView graph(output);
-  std::set<string> nodes_to_delete;
+  absl::flat_hash_set<string> nodes_to_delete;
   FunctionLibraryDefinition function_library(OpRegistry::Global(),
                                              item.graph.library());
 
@@ -266,14 +269,16 @@ Status HoistRandomUniform::Optimize(Cluster* cluster, const GrapplerItem& item,
     const auto* stateless_map = graph.AddNode(
         MakeStatelessMap(*map_node, *zip_node, *stateless_func, &graph));
 
-    graph.UpdateFanouts(map_node->name(), stateless_map->name());
+    TF_RETURN_IF_ERROR(
+        graph.UpdateFanouts(map_node->name(), stateless_map->name()));
 
     // TODO(b/116285210): we could also remove map functions from library if
     // they are not used anymore.
     nodes_to_delete.insert(map_node->name());
+    stats->num_changes++;
   }
 
-  graph.DeleteNodes(nodes_to_delete);
+  TF_RETURN_IF_ERROR(graph.DeleteNodes(nodes_to_delete));
   return Status::OK();
 }
 
@@ -285,5 +290,5 @@ void HoistRandomUniform::Feedback(Cluster* cluster, const GrapplerItem& item,
 
 REGISTER_GRAPH_OPTIMIZER_AS(HoistRandomUniform, "hoist_random_uniform");
 
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/hoist_random_uniform.h b/tensorflow/core/grappler/optimizers/data/hoist_random_uniform.h
index d1bcf6782d60b6a41482730b9d7ec9f2c4b43119..94db9f72a453e5567d493434682c5d2e8d59cf82 100644
--- a/tensorflow/core/grappler/optimizers/data/hoist_random_uniform.h
+++ b/tensorflow/core/grappler/optimizers/data/hoist_random_uniform.h
@@ -16,7 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_HOIST_RANDOM_UNIFORM_H_
 #define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_HOIST_RANDOM_UNIFORM_H_
 
-#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -30,7 +30,7 @@ namespace grappler {
 // `stateless_random_uniform`.
 // TODO(prazek): for now only `RandomUniform` is handled, but we could handle
 // `RandomUniformInt` similarly.
-class HoistRandomUniform : public CustomGraphOptimizer {
+class HoistRandomUniform : public TFDataOptimizerBase {
  public:
   HoistRandomUniform() = default;
   ~HoistRandomUniform() override = default;
@@ -42,14 +42,15 @@ class HoistRandomUniform : public CustomGraphOptimizer {
     return Status::OK();
   }
 
-  Status Optimize(Cluster* cluster, const GrapplerItem& item,
-                  GraphDef* output) override;
+  Status OptimizeAndCollectStats(Cluster* cluster, const GrapplerItem& item,
+                                 GraphDef* output,
+                                 OptimizationStats* stats) override;
 
   void Feedback(Cluster* cluster, const GrapplerItem& item,
                 const GraphDef& optimize_output, double result) override;
 };
 
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace grappler
+}  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_HOIST_RANDOM_UNIFORM_H_
diff --git a/tensorflow/core/grappler/optimizers/data/latency_all_edges.cc b/tensorflow/core/grappler/optimizers/data/latency_all_edges.cc
index 52b4b785a3d09ca7f3bec3373d9dd1c8de444a87..9bff0685ba061fb090309b4179fcb9f4419ddb8b 100644
--- a/tensorflow/core/grappler/optimizers/data/latency_all_edges.cc
+++ b/tensorflow/core/grappler/optimizers/data/latency_all_edges.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
 #include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
 
@@ -36,8 +37,8 @@ constexpr char kInsertOpName[] = "ExperimentalLatencyStatsDataset";
 NodeDef MakeLatencyNode(const NodeDef& node, MutableGraphView* graph) {
   NodeDef new_node;
   new_node.set_op(kInsertOpName);
-  graph_utils::SetUniqueGraphNodeName(
-      strings::StrCat(kInsertOpName, "_generated"), graph->graph(), &new_node);
+  graph_utils::SetUniqueGraphNodeName(strings::StrCat(kInsertOpName),
+                                      graph->graph(), &new_node);
   // Set the input of LatencyDataset node as `node`
   new_node.add_input(node.name());
 
@@ -63,8 +64,10 @@ NodeDef MakeLatencyNode(const NodeDef& node, MutableGraphView* graph) {
 
 }  // namespace
 
-Status LatencyAllEdges::Optimize(Cluster* cluster, const GrapplerItem& item,
-                                 GraphDef* output) {
+Status LatencyAllEdges::OptimizeAndCollectStats(Cluster* cluster,
+                                                const GrapplerItem& item,
+                                                GraphDef* output,
+                                                OptimizationStats* stats) {
   *output = item.graph;
   MutableGraphView graph(output);
 
@@ -72,10 +75,7 @@ Status LatencyAllEdges::Optimize(Cluster* cluster, const GrapplerItem& item,
   // TODO(shivaniagrawal): Add Op to return Latency for the particular Op than
   // for the edge (e2 - e1?).
   for (const NodeDef& node : item.graph.node()) {
-    if (node.op().rfind("Dataset") != node.op().size() - strlen("Dataset") ||
-        node.attr().empty() ||
-        node.name().rfind("_generated") ==
-            node.name().size() - strlen("_generated")) {
+    if (!str_util::EndsWith(node.op(), "Dataset") || node.attr().empty()) {
       // TODO(b/111805951): Replace this with non-approximate way to check if
       // node corresponds to a `Dataset` op.
       continue;
@@ -86,18 +86,11 @@ Status LatencyAllEdges::Optimize(Cluster* cluster, const GrapplerItem& item,
     if (fanout.size() > 1) {
       LOG(WARNING) << node.name() << " has fanout size " << fanout.size();
       continue;
-    } else {  // fanout will have size 0 for last dataset node in the pipeline.
-      if (fanout.size() == 1) {
-        NodeDef* output_node = (*(fanout.begin())).node;
-        if (output_node->name().rfind("_generated") ==
-            output_node->name().size() - strlen("_generated")) {
-          continue;
-        }
-      }
     }
-
+    // fanout will have size 0 for last dataset node in the pipeline.
     NodeDef* latency_node = graph.AddNode(MakeLatencyNode(node, &graph));
-    graph.UpdateFanouts(node.name(), latency_node->name());
+    TF_RETURN_IF_ERROR(graph.UpdateFanouts(node.name(), latency_node->name()));
+    stats->num_changes++;
   }
   return Status::OK();
 }
@@ -109,5 +102,5 @@ void LatencyAllEdges::Feedback(Cluster* cluster, const GrapplerItem& item,
 
 REGISTER_GRAPH_OPTIMIZER_AS(LatencyAllEdges, "latency_all_edges");
 
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/latency_all_edges.h b/tensorflow/core/grappler/optimizers/data/latency_all_edges.h
index f6c71a9ec7d8c9c98a5d4e58894f11b35e7b8772..313d108286b7595f2370ef2f9276353e9ef7e58f 100644
--- a/tensorflow/core/grappler/optimizers/data/latency_all_edges.h
+++ b/tensorflow/core/grappler/optimizers/data/latency_all_edges.h
@@ -16,12 +16,12 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_LATENCY_ALL_EDGES_H_
 #define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_LATENCY_ALL_EDGES_H_
 
-#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h"
 
 namespace tensorflow {
 namespace grappler {
 
-class LatencyAllEdges : public CustomGraphOptimizer {
+class LatencyAllEdges : public TFDataOptimizerBase {
  public:
   LatencyAllEdges() = default;
   ~LatencyAllEdges() override = default;
@@ -33,14 +33,15 @@ class LatencyAllEdges : public CustomGraphOptimizer {
     return Status::OK();
   }
 
-  Status Optimize(Cluster* cluster, const GrapplerItem& item,
-                  GraphDef* output) override;
+  Status OptimizeAndCollectStats(Cluster* cluster, const GrapplerItem& item,
+                                 GraphDef* output,
+                                 OptimizationStats* stats) override;
 
   void Feedback(Cluster* cluster, const GrapplerItem& item,
                 const GraphDef& optimize_output, double result) override;
 };
 
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace grappler
+}  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_LATENCY_ALL_EDGES_H_
diff --git a/tensorflow/core/grappler/optimizers/data/latency_all_edges_test.cc b/tensorflow/core/grappler/optimizers/data/latency_all_edges_test.cc
index d428d04a66659cd3b961428e3762ea3ab81ad69e..426c1dca5bb2c112d47b440a672b5a720a994cdf 100644
--- a/tensorflow/core/grappler/optimizers/data/latency_all_edges_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/latency_all_edges_test.cc
@@ -30,9 +30,9 @@ TEST(LatencyAllEdgesTest, AddLatenciesAfterTensorMapPrefetch) {
   using test::function::NDef;
   GrapplerItem item;
   NodeDef component_node =
-      NDef("component_nodes", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}});
+      NDef("component_node", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}});
   NodeDef from_tensor_node =
-      NDef("from_tensor_nodes", "TensorDataset", {"component_nodes"},
+      NDef("from_tensor_node", "TensorDataset", {"component_node"},
            {{"Toutput_types", {}}, {"output_shapes", {}}});
 
   NodeDef captured_input_node = NDef("captured_input_node", "Const", {},
diff --git a/tensorflow/core/grappler/optimizers/data/make_numa_aware.cc b/tensorflow/core/grappler/optimizers/data/make_numa_aware.cc
index 72c27a1d4afb8f3766a1f7c56ade37b1e161a039..221f4c252583c6f29aba4d22920a60a75568115f 100644
--- a/tensorflow/core/grappler/optimizers/data/make_numa_aware.cc
+++ b/tensorflow/core/grappler/optimizers/data/make_numa_aware.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/data/make_numa_aware.h"
 
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
 #include "tensorflow/core/grappler/grappler_item.h"
@@ -37,20 +38,23 @@ NodeDef MakeNumaAwareNode(const NodeDef& node, MutableGraphView* graph) {
 
 }  // namespace
 
-Status MakeNumaAware::Optimize(Cluster* cluster, const GrapplerItem& item,
-                               GraphDef* output) {
+Status MakeNumaAware::OptimizeAndCollectStats(Cluster* cluster,
+                                              const GrapplerItem& item,
+                                              GraphDef* output,
+                                              OptimizationStats* stats) {
   *output = item.graph;
   MutableGraphView graph(output);
-  std::set<string> nodes_to_delete;
+  absl::flat_hash_set<string> nodes_to_delete;
 
   for (const NodeDef& node : item.graph.node()) {
     if (node.op() != "ExperimentalMapAndBatchDataset") continue;
 
     auto* numa_node = graph.AddNode(MakeNumaAwareNode(node, &graph));
-    graph.UpdateFanouts(node.name(), numa_node->name());
+    TF_RETURN_IF_ERROR(graph.UpdateFanouts(node.name(), numa_node->name()));
     nodes_to_delete.insert(node.name());
+    stats->num_changes++;
   }
-  graph.DeleteNodes(nodes_to_delete);
+  TF_RETURN_IF_ERROR(graph.DeleteNodes(nodes_to_delete));
   return Status::OK();
 }
 
diff --git a/tensorflow/core/grappler/optimizers/data/make_numa_aware.h b/tensorflow/core/grappler/optimizers/data/make_numa_aware.h
index 48a7d8145f0157c6cea1633edb68d9ee3ee08de1..81dbb31e6d55c3a8f86be945afcef588efe2d6e3 100644
--- a/tensorflow/core/grappler/optimizers/data/make_numa_aware.h
+++ b/tensorflow/core/grappler/optimizers/data/make_numa_aware.h
@@ -16,12 +16,12 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAKE_NUMA_AWARE_H_
 #define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAKE_NUMA_AWARE_H_
 
-#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h"
 
 namespace tensorflow {
 namespace grappler {
 
-class MakeNumaAware : public CustomGraphOptimizer {
+class MakeNumaAware : public TFDataOptimizerBase {
  public:
   MakeNumaAware() = default;
   ~MakeNumaAware() override = default;
@@ -33,8 +33,9 @@ class MakeNumaAware : public CustomGraphOptimizer {
     return Status::OK();
   }
 
-  Status Optimize(Cluster* cluster, const GrapplerItem& item,
-                  GraphDef* output) override;
+  Status OptimizeAndCollectStats(Cluster* cluster, const GrapplerItem& item,
+                                 GraphDef* output,
+                                 OptimizationStats* stats) override;
 
   void Feedback(Cluster* cluster, const GrapplerItem& item,
                 const GraphDef& optimize_output, double result) override {}
diff --git a/tensorflow/core/grappler/optimizers/data/make_sloppy.cc b/tensorflow/core/grappler/optimizers/data/make_sloppy.cc
index 1cfaef3ffb270cc338aaaef601f5f6037740112e..1de0c46427aa7812329aa657fd2c1f0611655ad3 100644
--- a/tensorflow/core/grappler/optimizers/data/make_sloppy.cc
+++ b/tensorflow/core/grappler/optimizers/data/make_sloppy.cc
@@ -25,8 +25,10 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
-Status MakeSloppy::Optimize(Cluster* cluster, const GrapplerItem& item,
-                            GraphDef* output) {
+Status MakeSloppy::OptimizeAndCollectStats(Cluster* cluster,
+                                           const GrapplerItem& item,
+                                           GraphDef* output,
+                                           OptimizationStats* stats) {
   *output = item.graph;
   MutableGraphView graph(output);
 
@@ -35,6 +37,7 @@ Status MakeSloppy::Optimize(Cluster* cluster, const GrapplerItem& item,
         node.op() == "ParallelMapDataset" ||
         node.op() == "ParseExampleDataset") {
       (*node.mutable_attr())["sloppy"].set_b(true);
+      stats->num_changes++;
     }
   }
   return Status::OK();
diff --git a/tensorflow/core/grappler/optimizers/data/make_sloppy.h b/tensorflow/core/grappler/optimizers/data/make_sloppy.h
index 9dcab1038de3f6c39c4db4954903465bc0a6146d..cf42e841989da351c7203da6d01dac9c398c0cc9 100644
--- a/tensorflow/core/grappler/optimizers/data/make_sloppy.h
+++ b/tensorflow/core/grappler/optimizers/data/make_sloppy.h
@@ -16,12 +16,12 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAKE_SLOPPY_H_
 #define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAKE_SLOPPY_H_
 
-#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h"
 
 namespace tensorflow {
 namespace grappler {
 
-class MakeSloppy : public CustomGraphOptimizer {
+class MakeSloppy : public TFDataOptimizerBase {
  public:
   MakeSloppy() = default;
   ~MakeSloppy() override = default;
@@ -33,8 +33,9 @@ class MakeSloppy : public CustomGraphOptimizer {
     return Status::OK();
   }
 
-  Status Optimize(Cluster* cluster, const GrapplerItem& item,
-                  GraphDef* output) override;
+  Status OptimizeAndCollectStats(Cluster* cluster, const GrapplerItem& item,
+                                 GraphDef* output,
+                                 OptimizationStats* stats) override;
 
   void Feedback(Cluster* cluster, const GrapplerItem& item,
                 const GraphDef& optimize_output, double result) override {}
diff --git a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc
index 84c4d82f6a38dd81e88374c6ce6a7a6082451a38..5d26d1abe48fa9cc9217b34d3990b306d3f6a494 100644
--- a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.h"
 
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
@@ -98,11 +99,13 @@ NodeDef MakeMapAndBatchNode(const NodeDef& map_node, const NodeDef& batch_node,
 
 }  // namespace
 
-Status MapAndBatchFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
-                                   GraphDef* output) {
+Status MapAndBatchFusion::OptimizeAndCollectStats(Cluster* cluster,
+                                                  const GrapplerItem& item,
+                                                  GraphDef* output,
+                                                  OptimizationStats* stats) {
   *output = item.graph;
   MutableGraphView graph(output);
-  std::set<string> nodes_to_delete;
+  absl::flat_hash_set<string> nodes_to_delete;
   for (const NodeDef& node : item.graph.node()) {
     if (node.op() != "BatchDataset" && node.op() != "BatchDatasetV2") {
       continue;
@@ -120,14 +123,16 @@ Status MapAndBatchFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
 
     auto* new_node =
         graph.AddNode(MakeMapAndBatchNode(*map_node, batch_node, &graph));
-    graph.UpdateFanouts(batch_node.name(), new_node->name());
+    TF_RETURN_IF_ERROR(
+        graph.UpdateFanouts(batch_node.name(), new_node->name()));
 
     // Mark the `Map` and `Batch` nodes for removal.
     nodes_to_delete.insert(map_node->name());
     nodes_to_delete.insert(batch_node.name());
+    stats->num_changes++;
   }
 
-  graph.DeleteNodes(nodes_to_delete);
+  TF_RETURN_IF_ERROR(graph.DeleteNodes(nodes_to_delete));
   return Status::OK();
 }
 
@@ -139,5 +144,5 @@ void MapAndBatchFusion::Feedback(Cluster* cluster, const GrapplerItem& item,
 
 REGISTER_GRAPH_OPTIMIZER_AS(MapAndBatchFusion, "map_and_batch_fusion");
 
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.h b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.h
index 2c64831105295391f77e7e8be554b25fa85a5779..ef3a218bf340d96e9b95eb0175d5cb6167c5a208 100644
--- a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.h
+++ b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.h
@@ -16,12 +16,12 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_AND_BATCH_FUSION_H_
 #define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_AND_BATCH_FUSION_H_
 
-#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h"
 
 namespace tensorflow {
 namespace grappler {
 
-class MapAndBatchFusion : public CustomGraphOptimizer {
+class MapAndBatchFusion : public TFDataOptimizerBase {
  public:
   MapAndBatchFusion() = default;
   ~MapAndBatchFusion() override = default;
@@ -33,14 +33,15 @@ class MapAndBatchFusion : public CustomGraphOptimizer {
     return Status::OK();
   }
 
-  Status Optimize(Cluster* cluster, const GrapplerItem& item,
-                  GraphDef* output) override;
+  Status OptimizeAndCollectStats(Cluster* cluster, const GrapplerItem& item,
+                                 GraphDef* output,
+                                 OptimizationStats* stats) override;
 
   void Feedback(Cluster* cluster, const GrapplerItem& item,
                 const GraphDef& optimize_output, double result) override;
 };
 
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace grappler
+}  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_AND_BATCH_FUSION_H_
diff --git a/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.cc b/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.cc
index 233d7968c8965a5ec2389aa297da72a9708b9257..e257683b35d7ca8a60d0dc7324ffd5ad7f270175 100644
--- a/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.h"
 
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
@@ -92,8 +93,10 @@ NodeDef MakeFilterByLastComponentNode(const NodeDef& fused_map_node,
 
 }  // namespace
 
-Status MapAndFilterFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
-                                    GraphDef* output) {
+Status MapAndFilterFusion::OptimizeAndCollectStats(Cluster* cluster,
+                                                   const GrapplerItem& item,
+                                                   GraphDef* output,
+                                                   OptimizationStats* stats) {
   GraphDef sorted_old_graph = item.graph;
   TF_RETURN_IF_ERROR(TopologicalSort(&sorted_old_graph));
   // TODO(prazek): We might have some problems with performance if we copy
@@ -101,7 +104,7 @@ Status MapAndFilterFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
   *output = sorted_old_graph;
 
   MutableGraphView graph(output);
-  std::set<string> nodes_to_delete;
+  absl::flat_hash_set<string> nodes_to_delete;
   FunctionLibraryDefinition function_library(OpRegistry::Global(),
                                              item.graph.library());
   auto get_map_node = [](const NodeDef& node) -> const NodeDef* {
@@ -155,16 +158,18 @@ Status MapAndFilterFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
     const auto* filter_by_component = graph.AddNode(
         MakeFilterByLastComponentNode(*fused_maps, *filter_node, &graph));
 
-    graph.UpdateFanouts(filter_node->name(), filter_by_component->name());
+    TF_RETURN_IF_ERROR(
+        graph.UpdateFanouts(filter_node->name(), filter_by_component->name()));
     TF_RETURN_IF_ERROR(function_library.AddFunctionDef(*fused_function));
 
     // TODO(prazek): we could also remove functions from library if they are not
     // used anymore.
     nodes_to_delete.insert(map_node->name());
     nodes_to_delete.insert(filter_node->name());
+    stats->num_changes++;
   }
 
-  graph.DeleteNodes(nodes_to_delete);
+  TF_RETURN_IF_ERROR(graph.DeleteNodes(nodes_to_delete));
   return Status::OK();
 }
 
@@ -176,5 +181,5 @@ void MapAndFilterFusion::Feedback(Cluster* cluster, const GrapplerItem& item,
 
 REGISTER_GRAPH_OPTIMIZER_AS(MapAndFilterFusion, "map_and_filter_fusion");
 
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.h b/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.h
index ba25ca0591043989b97c62a7adb32eeeb193694e..8b3c95d37c109e2752c80b3696462d06a0797680 100644
--- a/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.h
+++ b/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.h
@@ -16,7 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_AND_FILTER_FUSION_H_
 #define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_AND_FILTER_FUSION_H_
 
-#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -26,7 +26,7 @@ namespace grappler {
 // component. The FilterDataset is transformed to FilterByLastComponent - a
 // custom kernel that filters elements based on a value of the boolean
 // component.
-class MapAndFilterFusion : public CustomGraphOptimizer {
+class MapAndFilterFusion : public TFDataOptimizerBase {
  public:
   MapAndFilterFusion() = default;
   ~MapAndFilterFusion() override = default;
@@ -38,14 +38,15 @@ class MapAndFilterFusion : public CustomGraphOptimizer {
     return Status::OK();
   }
 
-  Status Optimize(Cluster* cluster, const GrapplerItem& item,
-                  GraphDef* output) override;
+  Status OptimizeAndCollectStats(Cluster* cluster, const GrapplerItem& item,
+                                 GraphDef* output,
+                                 OptimizationStats* stats) override;
 
   void Feedback(Cluster* cluster, const GrapplerItem& item,
                 const GraphDef& optimize_output, double result) override;
 };
 
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace grappler
+}  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_AND_FILTER_FUSION_H_
diff --git a/tensorflow/core/grappler/optimizers/data/map_fusion.cc b/tensorflow/core/grappler/optimizers/data/map_fusion.cc
index 6b8015f96a29ac2fa2de3871a678a1b82efb12ff..ce41f7069cc5d54287ba6c8d546e57ca7293de8b 100644
--- a/tensorflow/core/grappler/optimizers/data/map_fusion.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_fusion.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/data/map_fusion.h"
 
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
@@ -77,14 +78,16 @@ NodeDef MakeFusedNode(const NodeDef& parent_map_node, const NodeDef& map_node,
 
 }  // namespace
 
-Status MapFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
-                           GraphDef* output) {
+Status MapFusion::OptimizeAndCollectStats(Cluster* cluster,
+                                          const GrapplerItem& item,
+                                          GraphDef* output,
+                                          OptimizationStats* stats) {
   GraphDef sorted_old_graph = item.graph;
   TF_RETURN_IF_ERROR(TopologicalSort(&sorted_old_graph));
   *output = sorted_old_graph;
 
   MutableGraphView graph(output);
-  std::set<string> nodes_to_delete;
+  absl::flat_hash_set<string> nodes_to_delete;
   FunctionLibraryDefinition function_library(OpRegistry::Global(),
                                              item.graph.library());
 
@@ -130,7 +133,8 @@ Status MapFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
     const auto* fused_maps_node = graph.AddNode(
         MakeFusedNode(*parent_map_node, *map_node, *fused_function, &graph));
 
-    graph.UpdateFanouts(map_node->name(), fused_maps_node->name());
+    TF_RETURN_IF_ERROR(
+        graph.UpdateFanouts(map_node->name(), fused_maps_node->name()));
 
     // TODO(prazek): we should run some optimizations on the fused map
     // functions, or make sure that optimization passes run after map
@@ -141,9 +145,10 @@ Status MapFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
     // they are not used anymore.
     nodes_to_delete.insert(parent_map_node->name());
     nodes_to_delete.insert(map_node->name());
+    stats->num_changes++;
   }
 
-  graph.DeleteNodes(nodes_to_delete);
+  TF_RETURN_IF_ERROR(graph.DeleteNodes(nodes_to_delete));
   return Status::OK();
 }
 
@@ -154,5 +159,5 @@ void MapFusion::Feedback(Cluster* cluster, const GrapplerItem& item,
 
 REGISTER_GRAPH_OPTIMIZER_AS(MapFusion, "map_fusion");
 
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/map_fusion.h b/tensorflow/core/grappler/optimizers/data/map_fusion.h
index a6a06592b80823458ee6ae3b655aecacbdfbb93b..c9960c721789002daeeea91f5fbbfe0dc9f30968 100644
--- a/tensorflow/core/grappler/optimizers/data/map_fusion.h
+++ b/tensorflow/core/grappler/optimizers/data/map_fusion.h
@@ -16,13 +16,13 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_FUSION_H_
 #define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_FUSION_H_
 
-#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h"
 
 namespace tensorflow {
 namespace grappler {
 
 // This optimization fuses map transformations by merging their map functions.
-class MapFusion : public CustomGraphOptimizer {
+class MapFusion : public TFDataOptimizerBase {
  public:
   MapFusion() = default;
   ~MapFusion() override = default;
@@ -34,14 +34,15 @@ class MapFusion : public CustomGraphOptimizer {
     return Status::OK();
   }
 
-  Status Optimize(Cluster* cluster, const GrapplerItem& item,
-                  GraphDef* output) override;
+  Status OptimizeAndCollectStats(Cluster* cluster, const GrapplerItem& item,
+                                 GraphDef* output,
+                                 OptimizationStats* stats) override;
 
   void Feedback(Cluster* cluster, const GrapplerItem& item,
                 const GraphDef& optimize_output, double result) override;
 };
 
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace grappler
+}  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_FUSION_H_
diff --git a/tensorflow/core/grappler/optimizers/data/map_parallelization.cc b/tensorflow/core/grappler/optimizers/data/map_parallelization.cc
index 8e49f908a77288c8e99b62706578d86a272ab682..90dd885c7fc75954e4207876ac154bec0e9d3093 100644
--- a/tensorflow/core/grappler/optimizers/data/map_parallelization.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_parallelization.cc
@@ -15,12 +15,14 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/data/map_parallelization.h"
 
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/mutable_graph_view.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
+#include "tensorflow/core/grappler/optimizers/data/function_utils.h"
 #include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
 #include "tensorflow/core/grappler/utils.h"
 
@@ -28,33 +30,21 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
-bool CanParallelize(const FunctionDef& function,
-                    const FunctionLibraryDefinition& library) {
-  if (!function.signature().is_stateful()) return true;
-
-  for (const auto& node : function.node_def()) {
-    const OpDef* op_def;
-    TF_CHECK_OK(library.LookUpOpDef(node.op(), &op_def));
-    // Assert is marked as stateful, but it does not have any state (except
-    // changing io).  Similarly to CUDA, we do not give guarantee that the
-    // assert operation that would fail would be the first one, so that we can
-    // parallelize it.
-    if (op_def->is_stateful() && op_def->name() != "Assert") return false;
-  }
-
-  return true;
-}
-
-NodeDef MakeParallelMap(const NodeDef& map_node, MutableGraphView* graph) {
-  NodeDef parallel_map = map_node;
-  graph_utils::SetUniqueGraphNodeName("parallel_map", graph->graph(),
+constexpr char kMapDataset[] = "MapDataset";
+constexpr char kParallelMapDataset[] = "ParallelMapDataset";
+constexpr int kAutotune = -1;
+
+NodeDef MakeParallelMap(const string& name, MutableGraphView* graph) {
+  // The inputs of the node to be parallelized could be changed by the
+  // optimization pass, so we need to look it up in the modified graph.
+  int index = graph_utils::FindGraphNodeWithName(name, *graph->graph());
+  DCHECK_NE(index, -1) << "Failed to find node " << name
+                       << " in the optimized graph.";
+  NodeDef parallel_map = graph->graph()->node(index);
+  graph_utils::SetUniqueGraphNodeName(kParallelMapDataset, graph->graph(),
                                       &parallel_map);
-  parallel_map.set_op("ParallelMapDataset");
-  // TODO(b/114475558): We want to set `num_parallel_calls` to a special value,
-  // so that dynamic tunning will pick the optimal value at runtime. Because
-  // this feature is not yet implemented, we set it to 2, which is the smallest
-  // value that introduces parallelism.
-  auto* num_parallel_calls = graph_utils::AddScalarConstNode(2, graph);
+  parallel_map.set_op(kParallelMapDataset);
+  auto* num_parallel_calls = graph_utils::AddScalarConstNode(kAutotune, graph);
   parallel_map.add_input(num_parallel_calls->name());
 
   return parallel_map;
@@ -62,15 +52,17 @@ NodeDef MakeParallelMap(const NodeDef& map_node, MutableGraphView* graph) {
 
 }  // namespace
 
-Status MapParallelization::Optimize(Cluster* cluster, const GrapplerItem& item,
-                                    GraphDef* output) {
+Status MapParallelization::OptimizeAndCollectStats(Cluster* cluster,
+                                                   const GrapplerItem& item,
+                                                   GraphDef* output,
+                                                   OptimizationStats* stats) {
   *output = item.graph;
   MutableGraphView graph(output);
-  std::set<string> nodes_to_delete;
+  absl::flat_hash_set<string> nodes_to_delete;
   FunctionLibraryDefinition function_library(OpRegistry::Global(),
                                              item.graph.library());
   auto get_map_node = [](const NodeDef& node) -> const NodeDef* {
-    if (node.op() == "MapDataset") return &node;
+    if (node.op() == kMapDataset) return &node;
     return nullptr;
   };
 
@@ -80,14 +72,18 @@ Status MapParallelization::Optimize(Cluster* cluster, const GrapplerItem& item,
 
     auto* function =
         function_library.Find(map_node->attr().at("f").func().name());
-    if (!CanParallelize(*function, function_library)) continue;
+    if (function_utils::IsFunctionStateful(function_library, *function, true))
+      continue;
 
-    auto* parallel_map = graph.AddNode(MakeParallelMap(*map_node, &graph));
-    graph.UpdateFanouts(map_node->name(), parallel_map->name());
+    auto* parallel_map =
+        graph.AddNode(MakeParallelMap(map_node->name(), &graph));
+    TF_RETURN_IF_ERROR(
+        graph.UpdateFanouts(map_node->name(), parallel_map->name()));
     nodes_to_delete.insert(map_node->name());
+    stats->num_changes++;
   }
 
-  graph.DeleteNodes(nodes_to_delete);
+  TF_RETURN_IF_ERROR(graph.DeleteNodes(nodes_to_delete));
   return Status::OK();
 }
 
@@ -99,5 +95,5 @@ void MapParallelization::Feedback(Cluster* cluster, const GrapplerItem& item,
 
 REGISTER_GRAPH_OPTIMIZER_AS(MapParallelization, "map_parallelization");
 
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/map_parallelization.h b/tensorflow/core/grappler/optimizers/data/map_parallelization.h
index ac9cf7e12af344da2079637db9f3c51012c5ccd5..8e71dadcb858bbee5f94a3e51038350e46f542ce 100644
--- a/tensorflow/core/grappler/optimizers/data/map_parallelization.h
+++ b/tensorflow/core/grappler/optimizers/data/map_parallelization.h
@@ -16,13 +16,13 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_PARALLELIZATION_H_
 #define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_PARALLELIZATION_H_
 
-#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h"
 
 namespace tensorflow {
 namespace grappler {
 
 // This optimization parallelizes MapDataset when function is stateless.
-class MapParallelization : public CustomGraphOptimizer {
+class MapParallelization : public TFDataOptimizerBase {
  public:
   MapParallelization() = default;
   ~MapParallelization() override = default;
@@ -34,14 +34,15 @@ class MapParallelization : public CustomGraphOptimizer {
     return Status::OK();
   }
 
-  Status Optimize(Cluster* cluster, const GrapplerItem& item,
-                  GraphDef* output) override;
+  Status OptimizeAndCollectStats(Cluster* cluster, const GrapplerItem& item,
+                                 GraphDef* output,
+                                 OptimizationStats* stats) override;
 
   void Feedback(Cluster* cluster, const GrapplerItem& item,
                 const GraphDef& optimize_output, double result) override;
 };
 
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace grappler
+}  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_PARALLELIZATION_H_
diff --git a/tensorflow/core/grappler/optimizers/data/map_vectorization.cc b/tensorflow/core/grappler/optimizers/data/map_vectorization.cc
index 3401dcc6f23bae1b2e77d5ea18a94f382fee4fb8..5c8f7805330d94c0cd9d568a549744c9a645a855 100644
--- a/tensorflow/core/grappler/optimizers/data/map_vectorization.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_vectorization.cc
@@ -16,8 +16,10 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/data/map_vectorization.h"
 #include "tensorflow/core/grappler/optimizers/data/vectorization_utils.h"
 
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/tensor.pb.h"  // NOLINT
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
@@ -28,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/data/function_utils.h"
 #include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/platform/protobuf.h"
 
@@ -35,6 +38,21 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
+constexpr char kCastOp[] = "Cast";
+constexpr char kRealDivOp[] = "RealDiv";
+constexpr char kSubOp[] = "Sub";
+constexpr char kMulOp[] = "Mul";
+constexpr char kAddOp[] = "Add";
+constexpr char kEqualOp[] = "Equal";
+constexpr char kCeilOp[] = "Ceil";
+constexpr char kBatchOp[] = "BatchDataset";
+constexpr char kBatchV2Op[] = "BatchDatasetV2";
+constexpr char kExperimentalMapAndBatchOp[] = "ExperimentalMapAndBatchDataset";
+constexpr char kMapOp[] = "MapDataset";
+constexpr char kParallelMapOp[] = "ParallelMapDataset";
+constexpr char kChooseFastestOp[] = "ExperimentalChooseFastestDataset";
+constexpr int kAutotune = -1;
+
 // Returns a FunctionDef containing a MapDefun op that wraps the original
 // function.
 FunctionDef* CreateMapDefunWrapper(const NodeDef& map_node,
@@ -100,7 +118,6 @@ FunctionDef* AddVectorizedFunction(const NodeDef& map_node,
   const NodeDef& map_defun_node = vectorized_func->node_def(0);
   DCHECK_EQ(map_defun_node.op(), "MapDefun");
 
-  // TODO(b/116285210): Unreferenced functions should get cleaned up later
   FunctionDef* result;
   Status s = vectorization_utils::VectorizeMapDefun(
       *vectorized_func, map_defun_node, library, &result);
@@ -120,6 +137,7 @@ bool IsOutputShapesFullyDefined(const NodeDef& node) {
   const auto& shapes = shapes_attr->list().shape();
 
   for (const TensorShapeProto& shape : shapes) {
+    if (shape.unknown_rank()) return false;
     for (const auto& dim : shape.dim()) {
       if (dim.size() == -1) {
         return false;
@@ -129,34 +147,68 @@ bool IsOutputShapesFullyDefined(const NodeDef& node) {
   return true;
 }
 
-bool IsStatefulFn(const FunctionLibraryDefinition& library,
-                  const FunctionDef& function_def) {
-  for (const NodeDef& node_def : function_def.node_def()) {
-    const OpDef* op_def;
-    Status s = library.LookUpOpDef(node_def.op(), &op_def);
-    if (!s.ok() || op_def->is_stateful()) {
-      return true;
-    }
+// Returns a mapping from input names to the [start, end) indices of the input
+// in the node's input list.
+Status GetInputMap(const NodeDef& node, NameRangeMap* result) {
+  const OpRegistrationData* op_reg_data;  // Owned by global op registry
+  TF_RETURN_IF_ERROR(OpRegistry::Global()->LookUp(node.op(), &op_reg_data));
+
+  return NameRangesForNode(node, op_reg_data->op_def, result,
+                           /*outputs=*/nullptr);
+}
+
+Status CopyInputs(StringPiece input_name, const NameRangeMap& input_map,
+                  const NodeDef& from, NodeDef* to) {
+  const auto* range = gtl::FindOrNull(input_map, input_name);
+  if (range == nullptr) {
+    return errors::Internal(
+        "Failed to copy inputs: did not find inputs with name: ", input_name,
+        ", in node with name: ", from.name());
+  }
+  for (int i = range->first; i < range->second; ++i) {
+    to->add_input(from.input(i));
   }
-  return false;
+
+  return Status::OK();
 }
 
-NodeDef MakeNewBatchNode(const NodeDef& old_batch_node,
-                         const NodeDef& input_node,
-                         const FunctionDef& vectorized_func,
-                         MutableGraphView* graph) {
+Status GetInputNodeName(StringPiece input_name, const NameRangeMap& input_map,
+                        const NodeDef& node, string* result) {
+  const auto* range = gtl::FindOrNull(input_map, input_name);
+  if (range == nullptr) {
+    return errors::Internal(
+        "Failed to get input node name: did not find input with name: ",
+        input_name, ", in node with name: ", node.name());
+  }
+  if (range->second - range->first > 1) {
+    return errors::Internal("Tried to get single input name for a list input.");
+  }
+  *result = node.input(range->first);
+  return Status::OK();
+}
+
+Status AddNewBatchNode(const NodeDef& old_batch_node, const NodeDef& input_node,
+                       const FunctionDef& vectorized_func,
+                       MutableGraphView* graph, NodeDef** new_batch_node) {
   NodeDef batch_node;
-  batch_node.set_op(old_batch_node.op());
+  batch_node.set_op(old_batch_node.op() == kBatchOp ? kBatchOp : kBatchV2Op);
   graph_utils::SetUniqueGraphNodeName(batch_node.op(), graph->graph(),
                                       &batch_node);
 
   // Set the `input_dataset` input argument
   batch_node.add_input(input_node.name());
-  // Set the `batch_size` input_argument
-  batch_node.add_input(old_batch_node.input(1));
-  if (batch_node.op() == "BatchDatasetV2") {
-    // Set the `drop_remainder` input argument
-    batch_node.add_input(old_batch_node.input(2));
+
+  NameRangeMap input_map;
+  TF_RETURN_IF_ERROR(GetInputMap(old_batch_node, &input_map));
+
+  // Set the `batch_size` input argument
+  TF_RETURN_IF_ERROR(
+      CopyInputs("batch_size", input_map, old_batch_node, &batch_node));
+
+  // Set the `drop_remainder` input argument
+  if (batch_node.op() != kBatchOp) {
+    TF_RETURN_IF_ERROR(
+        CopyInputs("drop_remainder", input_map, old_batch_node, &batch_node));
   }
 
   // Set attrs
@@ -166,34 +218,227 @@ NodeDef MakeNewBatchNode(const NodeDef& old_batch_node,
   }
   (*batch_node.mutable_attr())["output_types"] = output_types;
 
+  // It is safe to assume that input_node has the "output_shapes" attr here,
+  // because earlier we checked that the input node has fully defined output
+  // shapes.
   auto& output_shapes_attr = (*batch_node.mutable_attr())["output_shapes"];
   const auto& input_shapes =
       input_node.attr().at("output_shapes").list().shape();
-  int64 batch_size =
-      old_batch_node.attr().at("output_shapes").list().shape()[0].dim(0).size();
+
+  int64 batch_size = -1;
+  for (const auto& shape :
+       old_batch_node.attr().at("output_shapes").list().shape()) {
+    if (!shape.unknown_rank()) {
+      batch_size = shape.dim(0).size();
+      break;
+    }
+  }
+
   for (size_t i = 0; i < input_shapes.size(); ++i) {
+    // Note: We already checked earlier that input shapes are all fully defined.
     TensorShapeProto* shape = output_shapes_attr.mutable_list()->add_shape();
     TensorShapeProto_Dim* dim = shape->add_dim();
     dim->set_size(batch_size);
     shape->MergeFrom(input_shapes.Get(i));
   }
-  return batch_node;
+
+  *new_batch_node = graph->AddNode(std::move(batch_node));
+  return Status::OK();
 }
 
-NodeDef MakeNewMapNode(const NodeDef& old_map_node,
-                       const NodeDef& old_batch_node,
-                       const NodeDef& new_batch_node,
-                       const FunctionDef& vectorized_func,
+NodeDef* AddCastNode(const string& input, DataType src_t, DataType dst_t,
+                     MutableGraphView* graph) {
+  NodeDef cast_node;
+  cast_node.set_op(kCastOp);
+  cast_node.add_input(input);
+  graph_utils::SetUniqueGraphNodeName(cast_node.op(), graph->graph(),
+                                      &cast_node);
+  AddNodeAttr("SrcT", src_t, &cast_node);
+  AddNodeAttr("DstT", dst_t, &cast_node);
+
+  return graph->AddNode(std::move(cast_node));
+}
+
+NodeDef* AddEqualityNode(const string& input_x, const string& input_y,
+                         DataType t, MutableGraphView* graph) {
+  NodeDef equal_node;
+  equal_node.set_op(kEqualOp);
+  equal_node.add_input(input_x);
+  equal_node.add_input(input_y);
+  graph_utils::SetUniqueGraphNodeName(equal_node.op(), graph->graph(),
+                                      &equal_node);
+  AddNodeAttr("T", t, &equal_node);
+
+  return graph->AddNode(std::move(equal_node));
+}
+
+NodeDef* AddCeilNode(const string& input, MutableGraphView* graph) {
+  NodeDef ceil_node;
+  ceil_node.set_op(kCeilOp);
+  graph_utils::SetUniqueGraphNodeName(ceil_node.op(), graph->graph(),
+                                      &ceil_node);
+  AddNodeAttr("T", DT_FLOAT, &ceil_node);
+  ceil_node.add_input(input);
+
+  return graph->AddNode(std::move(ceil_node));
+}
+
+NodeDef* AddBinaryNode(const string& input_x, const string& input_y,
+                       const string& op, DataType type,
+                       MutableGraphView* graph) {
+  NodeDef node;
+  node.set_op(op);
+  node.add_input(input_x);
+  node.add_input(input_y);
+  graph_utils::SetUniqueGraphNodeName(op, graph->graph(), &node);
+  AddNodeAttr("T", type, &node);
+
+  return graph->AddNode(std::move(node));
+}
+
+NodeDef* AddIntAddNode(const string& input_x, const string& input_y,
                        MutableGraphView* graph) {
+  return AddBinaryNode(input_x, input_y, kAddOp, DT_INT32, graph);
+}
+
+NodeDef* AddFloatDivNode(const string& input_x, const string& input_y,
+                         MutableGraphView* graph) {
+  return AddBinaryNode(input_x, input_y, kRealDivOp, DT_FLOAT, graph);
+}
+
+NodeDef* AddIntSubNode(const string& input_x, const string& input_y,
+                       MutableGraphView* graph) {
+  return AddBinaryNode(input_x, input_y, kSubOp, DT_INT32, graph);
+}
+
+NodeDef* AddIntMulNode(const string& input_x, const string& input_y,
+                       MutableGraphView* graph) {
+  return AddBinaryNode(input_x, input_y, kMulOp, DT_INT32, graph);
+}
+
+// Create a new node for the num_parallel_calls input argument according to the
+// following formula:
+//
+// Let N = old num_parallel_calls, N' = new num_parallel_calls, and B =
+// batch_size.
+//     N' = ceil(N // B) * (1 - (N == -1)) + N * (N == -1)
+//
+// i.e. N' = -1 if N = -1 (autotune)
+//      N' = ceil(N // B) otherwise.
+// Note that "ceil" is necessary so N' != 0.
+//
+// For non-autotune values of `num_parallel_call`, we divide it by `batch_size`
+// to limit memory consumption by the map buffer.
+//
+// TODO(rachelim): Evaluate the performance of other potential transformations
+// to `num_parallel_calls`:
+//   1) use the autotune value (i.e. -1)
+//   2) use the original value
+Status MakeNumParallelCallsInput(const NodeDef& old_map_node,
+                                 const NodeDef& old_batch_node,
+                                 const NameRangeMap& input_map,
+                                 MutableGraphView* graph, string* result) {
+  string num_parallel_calls_name;
+  TF_RETURN_IF_ERROR(GetInputNodeName("num_parallel_calls", input_map,
+                                      old_map_node, &num_parallel_calls_name));
+
+  NodeDef* float_num_parallel_calls;
+  NodeDef* float_batch_size;
+  NodeDef* bool_is_autotune;
+
+  // Cast the old num_parallel_calls and batch_size arguments to DT_FLOAT before
+  // dividing.
+  if (old_map_node.op() == kExperimentalMapAndBatchOp) {
+    auto autotune_val =
+        graph_utils::AddScalarConstNode(static_cast<int64>(kAutotune), graph);
+    bool_is_autotune = AddEqualityNode(
+        autotune_val->name(), num_parallel_calls_name, DT_INT64, graph);
+
+    float_num_parallel_calls =
+        AddCastNode(num_parallel_calls_name, DT_INT64, DT_FLOAT, graph);
+
+    string batch_size_name;
+    TF_RETURN_IF_ERROR(GetInputNodeName("batch_size", input_map, old_map_node,
+                                        &batch_size_name));
+
+    float_batch_size = AddCastNode(batch_size_name, DT_INT64, DT_FLOAT, graph);
+  } else {
+    auto autotune_val =
+        graph_utils::AddScalarConstNode(static_cast<int>(kAutotune), graph);
+    bool_is_autotune = AddEqualityNode(
+        autotune_val->name(), num_parallel_calls_name, DT_INT32, graph);
+
+    float_num_parallel_calls =
+        AddCastNode(num_parallel_calls_name, DT_INT32, DT_FLOAT, graph);
+
+    float_batch_size =
+        AddCastNode(old_batch_node.input(1), DT_INT64, DT_FLOAT, graph);
+  }
+
+  // Divide
+  auto div_node = AddFloatDivNode(float_num_parallel_calls->name(),
+                                  float_batch_size->name(), graph);
+
+  // Ceil
+  auto float_ceil_node = AddCeilNode(div_node->name(), graph);
+
+  // Cast back to DT_INT32
+  auto int_ceil_node =
+      AddCastNode(float_ceil_node->name(), DT_FLOAT, DT_INT32, graph);
+
+  // is_autotune = int(num_parallel_calls == -1)
+  auto int_is_autotune =
+      AddCastNode(bool_is_autotune->name(), DT_BOOL, DT_INT32, graph);
+
+  // is_not_autotune = 1 - is_autotune
+  auto int_is_not_autotune =
+      AddIntSubNode(graph_utils::AddScalarConstNode(1, graph)->name(),
+                    int_is_autotune->name(), graph);
+
+  auto mul_1 =
+      AddIntMulNode(int_ceil_node->name(), int_is_not_autotune->name(), graph);
+
+  NodeDef* mul_2;
+  if (old_map_node.op() == kExperimentalMapAndBatchOp) {
+    auto int_num_parallel_calls =
+        AddCastNode(num_parallel_calls_name, DT_INT64, DT_INT32, graph);
+    mul_2 = AddIntMulNode(int_num_parallel_calls->name(),
+                          int_is_autotune->name(), graph);
+  } else {
+    mul_2 =
+        AddIntMulNode(num_parallel_calls_name, int_is_autotune->name(), graph);
+  }
+
+  auto add_node = AddIntAddNode(mul_1->name(), mul_2->name(), graph);
+
+  *result = add_node->name();
+  return Status::OK();
+}
+
+Status AddNewMapNode(const NodeDef& old_map_node, const NodeDef& old_batch_node,
+                     const NodeDef& new_batch_node,
+                     const FunctionDef& vectorized_func,
+                     MutableGraphView* graph, NodeDef** new_map_node) {
   NodeDef map_node;
-  map_node.set_op(old_map_node.op());
+  map_node.set_op(old_map_node.op() == kMapOp ? kMapOp : kParallelMapOp);
   graph_utils::SetUniqueGraphNodeName(map_node.op(), graph->graph(), &map_node);
 
   // Set the `input_dataset` input argument
   map_node.add_input(new_batch_node.name());
-  for (int i = 1; i < old_map_node.input_size(); i++) {
-    // Set the `other_arguments` and `num_parallel_calls` input arguments
-    map_node.add_input(old_map_node.input(i));
+
+  NameRangeMap input_map;
+  TF_RETURN_IF_ERROR(GetInputMap(old_map_node, &input_map));
+
+  // Set the `other_arguments` input argument
+  TF_RETURN_IF_ERROR(
+      CopyInputs("other_arguments", input_map, old_map_node, &map_node));
+
+  // Set the `num_parallel_calls` input argument
+  if (old_map_node.op() != kMapOp) {
+    string num_parallel_calls;
+    TF_RETURN_IF_ERROR(MakeNumParallelCallsInput(
+        old_map_node, old_batch_node, input_map, graph, &num_parallel_calls));
+    map_node.add_input(std::move(num_parallel_calls));
   }
 
   // Set attrs
@@ -206,71 +451,134 @@ NodeDef MakeNewMapNode(const NodeDef& old_map_node,
   }
 
   (*map_node.mutable_attr())["use_inter_op_parallelism"].set_b(true);
-
-  return map_node;
+  *new_map_node = graph->AddNode(std::move(map_node));
+  return Status::OK();
 }
 
-}  // namespace
+Status AddNewChooseFastestNode(gtl::ArraySlice<NodeDef> input_nodes,
+                               MutableGraphView* graph,
+                               NodeDef** new_choose_fastest_node) {
+  NodeDef choose_fastest_node;
+  choose_fastest_node.set_op(kChooseFastestOp);
+  graph_utils::SetUniqueGraphNodeName(choose_fastest_node.op(), graph->graph(),
+                                      &choose_fastest_node);
+
+  // Set the `input_datasets` input argument.
+  for (const auto& node_def : input_nodes) {
+    choose_fastest_node.add_input(node_def.name());
+  }
+  AddNodeAttr("N", static_cast<int>(input_nodes.size()), &choose_fastest_node);
+  AddNodeAttr("num_experiments", 10, &choose_fastest_node);
 
-Status MapVectorization::Optimize(Cluster* cluster, const GrapplerItem& item,
-                                  GraphDef* output) {
-  *output = item.graph;
-  MutableGraphView graph(output);
-  std::set<string> nodes_to_delete;
+  for (auto key : {"output_shapes", "output_types"}) {
+    graph_utils::CopyAttribute(key, input_nodes[0], &choose_fastest_node);
+  }
 
-  for (const NodeDef& node : item.graph.node()) {
-    // Find Map->Batch nodes.
-    // TODO(rachelim): Optimize MapAndBatchDataset[V2] as well.
-    if (node.op() != "BatchDataset" && node.op() != "BatchDatasetV2") {
-      continue;
-    }
+  *new_choose_fastest_node = graph->AddNode(std::move(choose_fastest_node));
+  return Status::OK();
+}
 
-    const NodeDef& batch_node(node);
-    NodeDef* node2 = graph_utils::GetInputNode(batch_node, graph);
-    if (node2->op() != "MapDataset" && node2->op() != "ParallelMapDataset") {
-      continue;
+// Given an input pipeline graph and a query node, tries to the node to the
+// 'batch' node in a input_dataset->map->batch pattern, or the 'map_and_batch'
+// node in an input_dataset->map_and_batch pattern.
+bool FindMapAndBatchPattern(const MutableGraphView& graph, const NodeDef& node,
+                            const FunctionLibraryDefinition& function_library,
+                            const NodeDef** batch_node_output,
+                            const NodeDef** map_node_output,
+                            const NodeDef** input_node_output,
+                            const FunctionDef** map_fn_output) {
+  const FunctionDef*& map_fn = *map_fn_output;
+  const NodeDef*& batch_node = *batch_node_output;
+  const NodeDef*& map_node = *map_node_output;
+  const NodeDef*& input_node = *input_node_output;
+
+  if (node.op() == kExperimentalMapAndBatchOp) {
+    batch_node = &node;
+    map_node = &node;
+  } else if (node.op() == kBatchOp || node.op() == kBatchV2Op) {
+    batch_node = &node;
+    map_node = graph_utils::GetInputNode(*batch_node, graph);
+    if (map_node->op() != kMapOp && map_node->op() != kParallelMapOp) {
+      return false;
     }
+    if (!IsOutputShapesFullyDefined(*map_node)) {
+      // If any of the map func outputs have an unknown shape, don't
+      // optimize, so that batching errors surface as before.
+      VLOG(1) << "Cannot vectorize dataset.map().batch() because the map "
+                 "dataset does not have fully defined output shapes.";
+      return false;
+    }
+  } else {
+    return false;
+  }
+
+  // Input to the map node
+  input_node = graph_utils::GetInputNode(*map_node, graph);
+  DCHECK_NE(input_node, nullptr);
+
+  if (!IsOutputShapesFullyDefined(*input_node)) {
+    // If any of the inputs have an unknown shape, don't optimize, since
+    // inputs might not be batchable.
+    VLOG(1) << "Cannot vectorize dataset.map().batch() because the input "
+               "dataset does not have fully defined output shapes.";
+    return false;
+  }
+
+  map_fn = function_library.Find(map_node->attr().at("f").func().name());
+
+  if (function_utils::IsFunctionStateful(function_library, *map_fn)) {
+    VLOG(1) << "Cannot vectorize dataset.map().batch() because the map "
+               "function is stateful.";
+    return false;
+  }
 
-    // Use a more descriptive variable name now that we know the node type.
-    NodeDef* map_node = node2;
-    // Input to the map node
-    NodeDef* input_node = graph_utils::GetInputNode(*map_node, graph);
-    CHECK_NOTNULL(input_node);
+  return true;
+}
 
-    FunctionDefLibrary* library = output->mutable_library();
+}  // namespace
 
+Status MapVectorization::OptimizeAndCollectStats(Cluster* cluster,
+                                                 const GrapplerItem& item,
+                                                 GraphDef* output,
+                                                 OptimizationStats* stats) {
+  *output = item.graph;
+  MutableGraphView graph(output);
+  absl::flat_hash_set<string> nodes_to_delete;
+
+  FunctionDefLibrary* library = output->mutable_library();
+
+  for (const NodeDef& node : item.graph.node()) {
     FunctionLibraryDefinition function_library(OpRegistry::Global(), *library);
-    const FunctionDef* orig_func =
-        function_library.Find(map_node->attr().at("f").func().name());
-
-    // Check that this is a valid optimization.
-    if (!IsOutputShapesFullyDefined(*input_node) ||
-        !IsOutputShapesFullyDefined(*map_node) ||
-        IsStatefulFn(function_library, *orig_func)) {
-      // 1. If any of the inputs have an unknown shape, don't optimize, since
-      // inputs might not be batchable.
-      // 2. If any of the map func outputs have an unknown shape, don't
-      // optimize, so that batching errors surface as before.
-      // 3. If the function is stateful, don't vectorize it.
+    const NodeDef* map_node;
+    const NodeDef* batch_node;
+    const NodeDef* input_node;
+    const FunctionDef* map_func;
+    if (!FindMapAndBatchPattern(graph, node, function_library, &batch_node,
+                                &map_node, &input_node, &map_func)) {
       continue;
     }
 
     FunctionDef* vectorized_func =
-        AddVectorizedFunction(*map_node, *orig_func, library);
+        AddVectorizedFunction(*map_node, *map_func, library);
     CHECK_NOTNULL(vectorized_func);
 
-    auto* new_batch_node = graph.AddNode(
-        MakeNewBatchNode(batch_node, *input_node, *vectorized_func, &graph));
+    NodeDef* new_batch_node;
+    TF_RETURN_IF_ERROR(AddNewBatchNode(
+        *batch_node, *input_node, *vectorized_func, &graph, &new_batch_node));
+
+    NodeDef* new_map_node;
+    TF_RETURN_IF_ERROR(AddNewMapNode(*map_node, *batch_node, *new_batch_node,
+                                     *vectorized_func, &graph, &new_map_node));
 
-    auto* new_map_node = graph.AddNode(MakeNewMapNode(
-        *map_node, batch_node, *new_batch_node, *vectorized_func, &graph));
-    graph.UpdateFanouts(batch_node.name(), new_map_node->name());
+    NodeDef* new_choose_fastest_node;
+    TF_RETURN_IF_ERROR(AddNewChooseFastestNode(
+        {*new_map_node, *batch_node}, &graph, &new_choose_fastest_node));
 
-    // Mark the `Map` and `Batch` nodes for removal.
-    nodes_to_delete.insert(map_node->name());
-    nodes_to_delete.insert(batch_node.name());
+    // Make output of Batch point to ChooseFastest instead.
+    TF_RETURN_IF_ERROR(graph.UpdateFanouts(batch_node->name(),
+                                           new_choose_fastest_node->name()));
+    stats->num_changes++;
   }
-  graph.DeleteNodes(nodes_to_delete);
   return Status::OK();
 }
 
@@ -282,5 +590,5 @@ void MapVectorization::Feedback(Cluster* cluster, const GrapplerItem& item,
 
 REGISTER_GRAPH_OPTIMIZER_AS(MapVectorization, "map_vectorization");
 
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/map_vectorization.h b/tensorflow/core/grappler/optimizers/data/map_vectorization.h
index cc56a8ee5e4e2d0b180047da5368c82ac719ddc1..88ec9cfec627637b305a41c87ffda9a8e0b8955a 100644
--- a/tensorflow/core/grappler/optimizers/data/map_vectorization.h
+++ b/tensorflow/core/grappler/optimizers/data/map_vectorization.h
@@ -16,12 +16,29 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_VECTORIZATION_H_
 #define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_VECTORIZATION_H_
 
-#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h"
 
 namespace tensorflow {
 namespace grappler {
 
-class MapVectorization : public CustomGraphOptimizer {
+// This optimizer rewrites dataset.map(map_fn, ...).batch(...) and
+// dataset.apply(tf.data.experimental.map_and_batch(map_fn, ...)) patterns in an
+// input pipeline. It vectorizes the map_fn, such that this segment can be
+// rewritten as dataset.batch().map(vectorized_map_fn). This is more performant
+// when the map_fn is cheap, because it amortizes the cost of running a map
+// function over a larger batch.
+//
+// From:
+//      input --> map --> batch --> output
+//              (or map_and_batch)
+//
+// To:
+//      input --> map --> batch --------+
+//        |     (or map_and_batch)      |
+//        |                             v
+//        +-----> batch --> map --> choose_fastest --> output
+//
+class MapVectorization : public TFDataOptimizerBase {
  public:
   MapVectorization() = default;
   ~MapVectorization() override = default;
@@ -33,14 +50,15 @@ class MapVectorization : public CustomGraphOptimizer {
     return Status::OK();
   }
 
-  Status Optimize(Cluster* cluster, const GrapplerItem& item,
-                  GraphDef* output) override;
+  Status OptimizeAndCollectStats(Cluster* cluster, const GrapplerItem& item,
+                                 GraphDef* output,
+                                 OptimizationStats* stats) override;
 
   void Feedback(Cluster* cluster, const GrapplerItem& item,
                 const GraphDef& optimize_output, double result) override;
 };
 
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace grappler
+}  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_VECTORIZATION_H_
diff --git a/tensorflow/core/grappler/optimizers/data/map_vectorization_test.cc b/tensorflow/core/grappler/optimizers/data/map_vectorization_test.cc
index f4faf415496f306cb9ced961c1a8c12e11cb167c..30385c7d45687e5d15c0afabead30553dcada2be 100644
--- a/tensorflow/core/grappler/optimizers/data/map_vectorization_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_vectorization_test.cc
@@ -17,195 +17,545 @@ limitations under the License.
 
 #include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
+#include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session.h"
 
 namespace tensorflow {
 namespace grappler {
 namespace {
 
-using test::function::GDef;
+constexpr char kConstOp[] = "Const";
+constexpr char kRangeOp[] = "RangeDataset";
+constexpr char kBatchOp[] = "BatchDataset";
+constexpr char kBatchV2Op[] = "BatchDatasetV2";
+constexpr char kExperimentalMapAndBatchOp[] = "ExperimentalMapAndBatchDataset";
+constexpr char kMapOp[] = "MapDataset";
+constexpr char kParallelMapOp[] = "ParallelMapDataset";
+constexpr char kChooseFastestOp[] = "ExperimentalChooseFastestDataset";
+constexpr char kAttrNameF[] = "f";
+constexpr char kAttrNameTarguments[] = "Targuments";
+constexpr char kAttrNameOutputTypes[] = "output_types";
+constexpr char kAttrNameOutputShapes[] = "output_shapes";
+constexpr char kAttrNameInterOpParallelism[] = "use_inter_op_parallelism";
+constexpr char kAttrNamePreserveCardinality[] = "preserve_cardinality";
+constexpr char kAttrNameSloppy[] = "sloppy";
+constexpr char kAttrNameValue[] = "value";
+constexpr char kAttrNameDtype[] = "dtype";
+
 using test::function::NDef;
 
-NodeDef MakeMapNodeHelper(StringPiece name, StringPiece input_node_name,
-                          StringPiece function_name, StringPiece map_op_name,
-                          gtl::ArraySlice<PartialTensorShape> output_shapes,
-                          gtl::ArraySlice<DataType> output_types) {
-  return test::function::NDef(
-      name, map_op_name, {string(input_node_name)},
-      {{"f", FunctionDefHelper::FunctionRef(string(function_name))},
-       {"Targuments", {}},
-       {"output_shapes", output_shapes},
-       {"output_types", output_types}});
-}
-
-NodeDef MakeMapNode(StringPiece name, StringPiece input_node_name,
-                    StringPiece function_name,
-                    gtl::ArraySlice<PartialTensorShape> output_shapes,
-                    gtl::ArraySlice<DataType> output_types) {
-  return MakeMapNodeHelper(name, input_node_name, function_name, "MapDataset",
-                           output_shapes, output_types);
-}
-
-NodeDef MakeBatchNode(StringPiece name, StringPiece input_node_name,
-                      StringPiece input_batch_size_name,
-                      gtl::ArraySlice<PartialTensorShape> output_shapes,
-                      gtl::ArraySlice<DataType> output_types) {
-  return NDef(
-      name, "BatchDataset",
-      {string(input_node_name), string(input_batch_size_name)},
-      {{"output_types", output_types}, {"output_shapes", output_shapes}});
-}
-
-NodeDef MakeBatchV2Node(StringPiece name, StringPiece input_node_name,
-                        StringPiece input_batch_size_name,
-                        StringPiece input_drop_remainder_name,
-                        gtl::ArraySlice<PartialTensorShape> output_shapes,
-                        gtl::ArraySlice<DataType> output_types) {
-  return NDef(
-      name, "BatchDatasetV2",
-      {string(input_node_name), string(input_batch_size_name),
-       string(input_drop_remainder_name)},
-      {{"output_types", output_types}, {"output_shapes", output_shapes}});
-}
-
-NodeDef MakeRangeNode(StringPiece name, gtl::ArraySlice<string> inputs) {
-  return NDef(name, "RangeDataset", inputs,
-              {{"output_shapes", gtl::ArraySlice<TensorShape>({{}})},
-               {"output_types", gtl::ArraySlice<DataType>({DT_INT64})}});
-}
-
-TEST(MapVectorizationTest, VectorizeMapWithBatch) {
+// Adds a simple vectorizable map function that is akin to
+// dataset.map(lambda x: tf.identity(x))
+FunctionDef* AddMapFn(MutableGraphView* graph) {
+  FunctionDef* map_fn = graph->graph()->mutable_library()->add_function();
+  *map_fn = FunctionDefHelper::Create(
+      /*function_name=*/"map_fn",
+      /*in_def=*/{"x: int64"},
+      /*out_def=*/{"res: int64"},
+      /*attr_def=*/{},
+      /*node_def=*/{{{"node"}, "Identity", {"x"}, {{"T", DT_INT64}}}},
+      /*ret_def=*/{{"res", "node:output"}});
+
+  return map_fn;
+}
+
+NodeDef* AddMapNode(MutableGraphView* graph, const string& input_dataset,
+                    const string& map_fn, int num_parallel_calls = 0) {
+  NodeDef result;
+  if (num_parallel_calls) {
+    auto num_parallel_calls_node =
+        graph_utils::AddScalarConstNode(num_parallel_calls, graph);
+    result =
+        NDef(/*name=*/"map", /*op=*/kParallelMapOp,
+             /*inputs=*/{input_dataset, num_parallel_calls_node->name()},
+             /*attrs=*/
+             {{kAttrNameF, FunctionDefHelper::FunctionRef(map_fn)},
+              {kAttrNameTarguments, gtl::ArraySlice<DataType>({})},
+              {kAttrNameOutputTypes, gtl::ArraySlice<DataType>({DT_INT64})},
+              {kAttrNameOutputShapes, gtl::ArraySlice<TensorShape>({{}})},
+              {kAttrNameInterOpParallelism, false},
+              {kAttrNameSloppy, true},
+              {kAttrNamePreserveCardinality, true}});
+  } else {
+    result =
+        NDef(/*name=*/"map", /*op=*/kMapOp,
+             /*inputs=*/{input_dataset},
+             /*attrs=*/
+             {{kAttrNameF, FunctionDefHelper::FunctionRef(map_fn)},
+              {kAttrNameTarguments, gtl::ArraySlice<DataType>({})},
+              {kAttrNameOutputTypes, gtl::ArraySlice<DataType>({DT_INT64})},
+              {kAttrNameOutputShapes, gtl::ArraySlice<TensorShape>({{}})},
+              {kAttrNameInterOpParallelism, false},
+              {kAttrNamePreserveCardinality, true}});
+  }
+
+  graph_utils::SetUniqueGraphNodeName(result.name(), graph->graph(), &result);
+  return graph->AddNode(std::move(result));
+}
+
+NodeDef* AddBatchNode(MutableGraphView* graph, const string& input_dataset,
+                      bool v2 = false, int64 batch_size = 10) {
+  NodeDef result;
+  auto batch_size_node = graph_utils::AddScalarConstNode(batch_size, graph);
+
+  if (v2) {
+    // BatchDatasetV2
+    auto drop_remainder = graph_utils::AddScalarConstNode(true, graph);
+    result = NDef(
+        /*name=*/"batch", /*op=*/kBatchV2Op,
+        /*inputs=*/
+        {input_dataset, batch_size_node->name(), drop_remainder->name()},
+        /*attrs=*/
+        {{kAttrNameOutputTypes, gtl::ArraySlice<DataType>({DT_INT64})},
+         {kAttrNameOutputShapes, gtl::ArraySlice<TensorShape>({{10, 1}})}});
+  } else {
+    result =
+        NDef(/*name=*/"batch", /*op=*/kBatchOp,
+             /*inputs=*/{input_dataset, batch_size_node->name()},
+             /*attrs=*/
+             {{kAttrNameOutputTypes, gtl::ArraySlice<DataType>({DT_INT64})},
+              {kAttrNameOutputShapes,
+               gtl::ArraySlice<PartialTensorShape>({{v2 ? 10 : -1, 1}})}});
+  }
+
+  graph_utils::SetUniqueGraphNodeName(result.name(), graph->graph(), &result);
+  return graph->AddNode(std::move(result));
+}
+
+NodeDef* AddRangeNode(MutableGraphView* graph) {
+  auto start = graph_utils::AddScalarConstNode(static_cast<int64>(0), graph);
+  auto stop = graph_utils::AddScalarConstNode(static_cast<int64>(10), graph);
+  auto step = graph_utils::AddScalarConstNode(static_cast<int64>(1), graph);
+
+  NodeDef result =
+      NDef(/*name=*/"range", /*op=*/kRangeOp,
+           /*inputs=*/{start->name(), stop->name(), step->name()},
+           /*attrs=*/
+           {{kAttrNameOutputShapes, gtl::ArraySlice<TensorShape>({{}})},
+            {kAttrNameOutputTypes, gtl::ArraySlice<DataType>({DT_INT64})}});
+
+  graph_utils::SetUniqueGraphNodeName(result.name(), graph->graph(), &result);
+  return graph->AddNode(std::move(result));
+}
+
+void CheckNotVectorized(const GraphDef& output, const string& map_op,
+                        const string& batch_op, const string& map_input_name) {
+  ASSERT_EQ(graph_utils::FindAllGraphNodesWithOp(map_op, output).size(), 1);
+  ASSERT_EQ(graph_utils::FindAllGraphNodesWithOp(batch_op, output).size(), 1);
+  const NodeDef& map_node =
+      output.node(graph_utils::FindGraphNodeWithOp(map_op, output));
+  const NodeDef& batch_node =
+      output.node(graph_utils::FindGraphNodeWithOp(batch_op, output));
+  EXPECT_EQ(map_node.input(0), map_input_name);
+  EXPECT_EQ(batch_node.input(0), map_node.name());
+}
+
+void CheckBranch(const GraphDef& graph, string input_name,
+                 gtl::ArraySlice<string> ops, const string& terminal_input) {
+  for (int i = 0, size = ops.size(); i < size; ++i) {
+    const NodeDef& input_node =
+        graph.node(graph_utils::FindGraphNodeWithName(input_name, graph));
+    EXPECT_EQ(input_node.op(), ops[size - i - 1]);
+    input_name = input_node.input(0);
+  }
+  EXPECT_EQ(input_name, terminal_input);
+}
+
+// Checks that a graph has undergone the map_vectorization transformation
+// successfully, whereby the new graph has the shape:
+//
+//    input_node --> new batch --> new map -------+
+//         |                                      |
+//         |                                      v
+//         +-------> old map --> old batch ---> choose_fastest
+//
+void CheckVectorized(const GraphDef& output, const string& map_op,
+                     const string& batch_op, const string& map_input_name,
+                     bool fused = false) {
+  ASSERT_EQ(graph_utils::FindAllGraphNodesWithOp(map_op, output).size(), 2);
+  ASSERT_EQ(graph_utils::FindAllGraphNodesWithOp(batch_op, output).size(), 2);
+  ASSERT_EQ(
+      graph_utils::FindAllGraphNodesWithOp(kChooseFastestOp, output).size(), 1);
+  const NodeDef& choose_fastest_node =
+      output.node(graph_utils::FindGraphNodeWithOp(kChooseFastestOp, output));
+
+  // Branch 0: vectorized
+  CheckBranch(output, choose_fastest_node.input(0), {batch_op, map_op},
+              map_input_name);
+
+  // Branch 1: original
+  CheckBranch(output, choose_fastest_node.input(1), {map_op, batch_op},
+              map_input_name);
+
+  const NodeDef& vectorized_map_node = output.node(
+      graph_utils::FindGraphNodeWithName(choose_fastest_node.input(0), output));
+  // Check that the function is actually vectorized.
+  // The vectorization of the identity function is itself.
+  string function_name =
+      vectorized_map_node.attr().at(kAttrNameF).func().name();
+  int found =
+      graph_utils::FindGraphFunctionWithName(function_name, output.library());
+  ASSERT_NE(found, -1);
+  const auto& function = output.library().function(found);
+  EXPECT_EQ(function.node_def(0).op(), "Identity");
+}
+
+// Checks that a graph has undergone the map_vectorization transformation
+// successfully, whereby the new graph has the shape:
+//
+//    input_node --> new batch -> new map --------+
+//         |                                      |
+//         |                                      v
+//         +-------> old map_and_batch ---> choose_fastest
+//
+void CheckVectorizedFused(const GraphDef& output,
+                          const string& map_input_name) {
+  ASSERT_EQ(graph_utils::FindAllGraphNodesWithOp(kParallelMapOp, output).size(),
+            1);
+  ASSERT_EQ(graph_utils::FindAllGraphNodesWithOp(kBatchV2Op, output).size(), 1);
+  ASSERT_EQ(
+      graph_utils::FindAllGraphNodesWithOp(kExperimentalMapAndBatchOp, output)
+          .size(),
+      1);
+  ASSERT_EQ(
+      graph_utils::FindAllGraphNodesWithOp(kChooseFastestOp, output).size(), 1);
+  const NodeDef& choose_fastest_node =
+      output.node(graph_utils::FindGraphNodeWithOp(kChooseFastestOp, output));
+
+  // Branch 0: vectorized
+  CheckBranch(output, choose_fastest_node.input(0),
+              {kBatchV2Op, kParallelMapOp}, map_input_name);
+
+  // Branch 1: original
+  CheckBranch(output, choose_fastest_node.input(1),
+              {kExperimentalMapAndBatchOp}, map_input_name);
+
+  const NodeDef& vectorized_map_node = output.node(
+      graph_utils::FindGraphNodeWithName(choose_fastest_node.input(0), output));
+  // Check that the function is actually vectorized.
+  // The vectorization of the identity function is itself.
+  string function_name =
+      vectorized_map_node.attr().at(kAttrNameF).func().name();
+  int found =
+      graph_utils::FindGraphFunctionWithName(function_name, output.library());
+  ASSERT_NE(found, -1);
+  const auto& function = output.library().function(found);
+  EXPECT_EQ(function.node_def(0).op(), "Identity");
+}
+
+class MapThenBatchTest
+    : public ::testing::TestWithParam<std::tuple<int, bool>> {};
+
+TEST_P(MapThenBatchTest, IsVectorized) {
+  int num_parallel_calls = std::get<0>(GetParam());
+  bool use_batch_v2 = std::get<1>(GetParam());
   GrapplerItem item;
-  item.graph = GDef(
-      {NDef("start", "Const", {}, {{"value", 0}, {"dtype", DT_INT32}}),
-       NDef("stop", "Const", {}, {{"value", 10}, {"dtype", DT_INT32}}),
-       NDef("step", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
-       NDef("batch_size", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
-       MakeRangeNode("range", {"start", "stop", "step"}),
-       MakeMapNode("map", "range", "XTimesTwo", {{}}, {DT_INT32}),
-       MakeBatchNode("batch", "map", "batch_size", {{-1}}, {DT_INT32})},
-      // FunctionLib
-      {
-          test::function::XTimesTwo(),
-      });
+  MutableGraphView graph(&item.graph);
+  auto range_node = AddRangeNode(&graph);
+  auto map_fn = AddMapFn(&graph);
+  auto map_node = AddMapNode(&graph, range_node->name(),
+                             map_fn->signature().name(), num_parallel_calls);
+  auto batch_node = AddBatchNode(&graph, map_node->name(), use_batch_v2);
   MapVectorization optimizer;
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+  CheckVectorized(output, map_node->op(), batch_node->op(), range_node->name());
+}
 
-  EXPECT_EQ(graph_utils::FindAllGraphNodesWithOp("MapDataset", output).size(),
-            1);
-  EXPECT_EQ(graph_utils::FindAllGraphNodesWithOp("BatchDataset", output).size(),
-            1);
-  const NodeDef& map_node =
-      output.node(graph_utils::FindGraphNodeWithOp("MapDataset", output));
-  const NodeDef& batch_node =
-      output.node(graph_utils::FindGraphNodeWithOp("BatchDataset", output));
-  EXPECT_EQ(map_node.input(0), batch_node.name());
-  EXPECT_EQ(batch_node.input(0), "range");
+INSTANTIATE_TEST_SUITE_P(MapThenBatchTest, MapThenBatchTest,
+                         ::testing::Combine(::testing::Values(0, 12),
+                                            ::testing::Bool()));
+
+NodeDef* AddMapAndBatchNode(MutableGraphView* graph,
+                            const string& input_dataset, const string& map_fn,
+                            int64 batch_size = 10,
+                            int64 num_parallel_calls = 12) {
+  auto batch_size_node = graph_utils::AddScalarConstNode(batch_size, graph);
+  auto num_parallel_calls_node =
+      graph_utils::AddScalarConstNode(num_parallel_calls, graph);
+  auto drop_remainder = graph_utils::AddScalarConstNode(true, graph);
+
+  NodeDef result =
+      NDef(/*name=*/"map_and_batch",
+           /*op=*/kExperimentalMapAndBatchOp,
+           /*inputs=*/
+           {input_dataset, batch_size_node->name(),
+            num_parallel_calls_node->name(), drop_remainder->name()},
+           /*attrs=*/
+           {{kAttrNameF, FunctionDefHelper::FunctionRef(map_fn)},
+            {kAttrNameTarguments, gtl::ArraySlice<DataType>({})},
+            {kAttrNameOutputTypes, gtl::ArraySlice<DataType>({DT_INT64})},
+            {kAttrNameOutputShapes,
+             gtl::ArraySlice<PartialTensorShape>({{10, 1}})}});
+
+  graph_utils::SetUniqueGraphNodeName(result.name(), graph->graph(), &result);
+  return graph->AddNode(std::move(result));
 }
 
-TEST(MapVectorizationTest, VectorizeMapWithBatchV2) {
+TEST(MapVectorizationTest, VectorizeExperimentalMapAndBatch) {
   GrapplerItem item;
-  item.graph = GDef(
-      {NDef("start", "Const", {}, {{"value", 0}, {"dtype", DT_INT32}}),
-       NDef("stop", "Const", {}, {{"value", 10}, {"dtype", DT_INT32}}),
-       NDef("step", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
-       NDef("batch_size", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
-       NDef("drop_remainder", "Const", {},
-            {{"value", false}, {"dtype", DT_BOOL}}),
-       MakeRangeNode("range", {"start", "stop", "step"}),
-       MakeMapNode("map", "range", "XTimesTwo", {{}}, {DT_INT32}),
-       MakeBatchV2Node("batch", "map", "batch_size", "drop_remainder", {{-1}},
-                       {DT_INT32})},
-      // FunctionLib
-      {
-          test::function::XTimesTwo(),
-      });
+  MutableGraphView graph(&item.graph);
+  auto range_node = AddRangeNode(&graph);
+  auto map_fn = AddMapFn(&graph);
+  auto map_and_batch_node = AddMapAndBatchNode(&graph, range_node->name(),
+                                               map_fn->signature().name());
+  ASSERT_NE(map_and_batch_node, nullptr);
+
   MapVectorization optimizer;
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+  CheckVectorizedFused(output, "range");
+}
 
-  EXPECT_EQ(graph_utils::FindAllGraphNodesWithOp("MapDataset", output).size(),
-            1);
-  EXPECT_EQ(
-      graph_utils::FindAllGraphNodesWithOp("BatchDatasetV2", output).size(), 1);
-  const NodeDef& map_node =
-      output.node(graph_utils::FindGraphNodeWithOp("MapDataset", output));
-  const NodeDef& batch_node =
-      output.node(graph_utils::FindGraphNodeWithOp("BatchDatasetV2", output));
-  EXPECT_EQ(map_node.input(0), batch_node.name());
-  EXPECT_EQ(batch_node.input(0), "range");
+void EvaluateNodes(const GraphDef& graph,
+                   const std::vector<string>& output_tensor_names,
+                   std::vector<Tensor>* output_tensors) {
+  std::unique_ptr<Session> session(NewSession(SessionOptions()));
+  TF_CHECK_OK(session->Create(graph));
+  TF_CHECK_OK(session->Run({}, output_tensor_names, {}, output_tensors));
+}
+
+void CheckNumParallelCalls(const GraphDef& output,
+                           int expected_num_parallel_calls) {
+  // Run the graph to see that the new num_parallel_calls is computed correctly.
+  const NodeDef& choose_fastest_node =
+      output.node(graph_utils::FindGraphNodeWithOp(kChooseFastestOp, output));
+  const NodeDef& vectorized_map_node = output.node(
+      graph_utils::FindGraphNodeWithName(choose_fastest_node.input(0), output));
+  const string& num_parallel_calls = vectorized_map_node.input(1);
+  std::vector<Tensor> output_tensors;
+  EvaluateNodes(output, {num_parallel_calls}, &output_tensors);
+
+  test::ExpectTensorEqual<int>(
+      output_tensors.at(0),
+      Tensor(static_cast<int32>(expected_num_parallel_calls)));
+}
+
+struct TestStruct {
+  int original_num_parallel_calls;
+  int batch_size;
+  int expected_num_parallel_calls;
+};
+
+class NumParallelCallsTest : public ::testing::TestWithParam<TestStruct> {};
+
+TEST_P(NumParallelCallsTest, TestCorrectNumParallelCalls) {
+  auto params = GetParam();
+
+  GrapplerItem item;
+  MutableGraphView graph(&item.graph);
+  auto range_node = AddRangeNode(&graph);
+  auto map_fn = AddMapFn(&graph);
+  auto map_node =
+      AddMapNode(&graph, range_node->name(), map_fn->signature().name(),
+                 params.original_num_parallel_calls);
+  auto batch_node = AddBatchNode(&graph, map_node->name(), /*v2=*/true,
+                                 /*batch_size=*/params.batch_size);
+  MapVectorization optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+  CheckVectorized(output, map_node->op(), batch_node->op(), range_node->name());
+
+  CheckNumParallelCalls(output, params.expected_num_parallel_calls);
 }
 
-TEST(MapVectorizationTest, VectorizeWithUndefinedOutputShape) {
+TEST_P(NumParallelCallsTest, TestCorrectNumParallelCallsFused) {
+  auto params = GetParam();
+
   GrapplerItem item;
-  item.graph = GDef(
-      {NDef("batch_size", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
-       NDef("input", "InputDataset", {},
-            {{"output_types", gtl::ArraySlice<DataType>({DT_INT32})}}),
-       MakeMapNode("map", "input", "XTimesTwo", {{}}, {DT_INT32}),
-       MakeBatchNode("batch", "map", "batch_size", {{-1}}, {DT_INT32})},
-      // FunctionLib
-      {
-          test::function::XTimesTwo(),
-      });
+  MutableGraphView graph(&item.graph);
+  auto range_node = AddRangeNode(&graph);
+  auto map_fn = AddMapFn(&graph);
+  auto map_and_batch_node =
+      AddMapAndBatchNode(&graph, range_node->name(), map_fn->signature().name(),
+                         params.batch_size, params.original_num_parallel_calls);
+  ASSERT_NE(map_and_batch_node, nullptr);
+
   MapVectorization optimizer;
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+  CheckVectorizedFused(output, range_node->name());
+
+  CheckNumParallelCalls(output, params.expected_num_parallel_calls);
 }
 
-TEST(MapVectorizationTest, VectorizeWithUndefinedOutputTypes) {
+INSTANTIATE_TEST_SUITE_P(
+    NumParallelCalls, NumParallelCallsTest,
+    ::testing::Values(TestStruct({1, 1, 1}), TestStruct({2, 10, 1}),
+                      TestStruct({4, 3, 2}), TestStruct({10, 1, 10}),
+                      TestStruct({-1, 1, -1}), TestStruct({-1, 10, -1})));
+
+class ChainedMapAndBatchTest
+    : public ::testing::TestWithParam<std::tuple<bool, bool>> {};
+
+// Tests:
+// 1) map.batch.map.batch
+// 2) map.batch.map_and_batch
+// 3) map_and_batch.map.batch
+// 4) map_and_batch.map_and_batch
+TEST_P(ChainedMapAndBatchTest, IsVectorized) {
+  GrapplerItem item;
+  MutableGraphView graph(&item.graph);
+  auto input_node = AddRangeNode(&graph);
+
+  auto map_fn = AddMapFn(&graph);
+
+  auto make_map_and_batch = [&graph, map_fn](NodeDef* input, bool fuse) {
+    if (fuse) {
+      return AddMapAndBatchNode(&graph, input->name(),
+                                map_fn->signature().name());
+    }
+    auto map_node =
+        AddMapNode(&graph, input->name(), map_fn->signature().name(), true);
+    auto batch_node = AddBatchNode(&graph, map_node->name(), true);
+    return batch_node;
+  };
+
+  bool fuse_0 = std::get<0>(GetParam());
+  bool fuse_1 = std::get<1>(GetParam());
+  auto map_and_batch_0 = make_map_and_batch(input_node, fuse_0);
+  auto map_and_batch_1 = make_map_and_batch(map_and_batch_0, fuse_1);
+  ASSERT_NE(map_and_batch_1, nullptr);
+
+  MapVectorization optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+  TF_ASSERT_OK(TopologicalSort(&output));
+
+  std::vector<int> choose_fastest_nodes =
+      graph_utils::FindAllGraphNodesWithOp(kChooseFastestOp, output);
+  ASSERT_EQ(choose_fastest_nodes.size(), 2);
+
+  std::vector<string> fused_sequence({kExperimentalMapAndBatchOp});
+  std::vector<string> unfused_sequence({kParallelMapOp, kBatchV2Op});
+  const NodeDef& range_node =
+      output.node(graph_utils::FindGraphNodeWithOp(kRangeOp, output));
+  const NodeDef& choose_fastest_0 = output.node(choose_fastest_nodes[0]);
+  CheckBranch(output, choose_fastest_0.input(0), {kBatchV2Op, kParallelMapOp},
+              range_node.name());
+  CheckBranch(output, choose_fastest_0.input(1),
+              fuse_0 ? fused_sequence : unfused_sequence, range_node.name());
+
+  const NodeDef& choose_fastest_1 = output.node(choose_fastest_nodes[1]);
+  CheckBranch(output, choose_fastest_1.input(0), {kBatchV2Op, kParallelMapOp},
+              choose_fastest_0.name());
+  CheckBranch(output, choose_fastest_1.input(1),
+              fuse_1 ? fused_sequence : unfused_sequence,
+              choose_fastest_0.name());
+}
+
+INSTANTIATE_TEST_SUITE_P(ChainedMapAndBatchTest, ChainedMapAndBatchTest,
+                         ::testing::Combine(::testing::Bool(),
+                                            ::testing::Bool()));
+
+// Not all dataset types have "output_shapes" and "output_types"
+// attrs defined. Add a generic input node which may not have these attrs
+// defined.
+NodeDef* AddArbitraryInputNode(MutableGraphView* graph,
+                               std::vector<PartialTensorShape>* output_shapes,
+                               std::vector<DataType>* output_types) {
+  std::vector<std::pair<string, FunctionDefHelper::AttrValueWrapper>> attrs;
+  if (output_shapes) {
+    attrs.push_back({kAttrNameOutputShapes, *output_shapes});
+  }
+  if (output_types) {
+    attrs.push_back({kAttrNameOutputTypes, *output_types});
+  }
+
+  NodeDef result = NDef(/*name=*/"input", /*op=*/"InputDataset",
+                        /*inputs=*/{},
+                        /*attrs=*/attrs);
+
+  graph_utils::SetUniqueGraphNodeName(result.name(), graph->graph(), &result);
+  return graph->AddNode(std::move(result));
+}
+
+TEST(MapVectorizationTest, VectorizeWithUndefinedOutputShapes) {
+  // Tests that the optimization doesn't break when the input to MapDataset
+  // doesn't have an output_shapes attr defined. In this case, the map and
+  // batch swap does not occur.
   GrapplerItem item;
-  item.graph = GDef(
-      {NDef("batch_size", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
-       NDef("input", "InputDataset", {},
-            {{"output_shapes", gtl::ArraySlice<TensorShape>({{}})}}),
-       MakeMapNode("map", "input", "XTimesTwo", {{}}, {DT_INT32}),
-       MakeBatchNode("batch", "map", "batch_size", {{-1}}, {DT_INT32})},
-      // FunctionLib
-      {
-          test::function::XTimesTwo(),
-      });
+  MutableGraphView graph(&item.graph);
+  std::vector<DataType> input_types({DT_INT64});
+  auto input_node = AddArbitraryInputNode(&graph, nullptr, &input_types);
+  auto map_fn = AddMapFn(&graph);
+  auto map_node =
+      AddMapNode(&graph, input_node->name(), map_fn->signature().name());
+  auto batch_node = AddBatchNode(&graph, map_node->name());
   MapVectorization optimizer;
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+  CheckNotVectorized(output, map_node->op(), batch_node->op(),
+                     input_node->name());
 }
 
-TEST(MapVectorizationTest, VectorizeWithFullyDefinedFunction) {
+TEST(MapVectorizationTest, VectorizeWithUnknownRank) {
+  // Tests that the optimization doesn't break when the input to MapDataset
+  // has components with unknown rank. In this case, the optimization does not
+  // occur.
   GrapplerItem item;
-  item.graph = GDef(
-      {NDef("start", "Const", {}, {{"value", 0}, {"dtype", DT_INT32}}),
-       NDef("stop", "Const", {}, {{"value", 10}, {"dtype", DT_INT32}}),
-       NDef("step", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
-       NDef("batch_size", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
-       MakeRangeNode("range", {"start", "stop", "step"}),
-       MakeMapNode("map", "range", "Func", {{}}, {DT_INT32}),
-       MakeBatchNode("batch", "map", "batch_size", {{-1}}, {DT_INT32})},
-      // FunctionLib
-      {FunctionDefHelper::Create(
-          "Func", {"x: int64", "y: int64"}, {"res: int64", "res2: int64"}, {},
-          {{{"o"}, "Mul", {"x", "x"}, {{"T", DT_INT64}}}},
-          {{"res", "o:z"}, {"res2", "o:z"}})});
+  MutableGraphView graph(&item.graph);
+  std::vector<PartialTensorShape> input_shapes({{}});
+  std::vector<DataType> input_types({DT_INT64});
+  auto input_node = AddArbitraryInputNode(&graph, &input_shapes, &input_types);
+  auto map_fn = AddMapFn(&graph);
+  auto map_node =
+      AddMapNode(&graph, input_node->name(), map_fn->signature().name());
+  auto batch_node = AddBatchNode(&graph, map_node->name());
   MapVectorization optimizer;
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+  CheckNotVectorized(output, map_node->op(), batch_node->op(),
+                     input_node->name());
+}
 
-  EXPECT_EQ(graph_utils::FindAllGraphNodesWithOp("MapDataset", output).size(),
-            1);
-  EXPECT_EQ(graph_utils::FindAllGraphNodesWithOp("BatchDataset", output).size(),
-            1);
-  const NodeDef& map_node =
-      output.node(graph_utils::FindGraphNodeWithOp("MapDataset", output));
-  const NodeDef& batch_node =
-      output.node(graph_utils::FindGraphNodeWithOp("BatchDataset", output));
-  EXPECT_EQ(map_node.input(0), batch_node.name());
-  EXPECT_EQ(batch_node.input(0), "range");
+TEST(MapVectorizationTest, VectorizeWithUnknownDim) {
+  // Tests that the optimization doesn't break when the input to MapDataset
+  // has components with unknown dimensions. In this case, the optimization does
+  // not occur.
+  GrapplerItem item;
+  MutableGraphView graph(&item.graph);
+  std::vector<PartialTensorShape> input_shapes({{-1, 2}});
+  std::vector<DataType> input_types({DT_INT64});
+  auto input_node = AddArbitraryInputNode(&graph, &input_shapes, &input_types);
+  auto map_fn = AddMapFn(&graph);
+  auto map_node =
+      AddMapNode(&graph, input_node->name(), map_fn->signature().name());
+  auto batch_node = AddBatchNode(&graph, map_node->name());
+  MapVectorization optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+  CheckNotVectorized(output, map_node->op(), batch_node->op(),
+                     input_node->name());
+}
+
+TEST(MapVectorizationTest, VectorizeWithUndefinedOutputTypes) {
+  // Tests that the optimization doesn't break when the input doesn't have
+  // an output_types attr defined. The output_types of the input node, even
+  // if not present, can be inferred from the map function input signature.
+  GrapplerItem item;
+  MutableGraphView graph(&item.graph);
+  std::vector<PartialTensorShape> input_shapes({{1}});
+  auto input_node = AddArbitraryInputNode(&graph, &input_shapes, nullptr);
+  auto map_fn = AddMapFn(&graph);
+  auto map_node =
+      AddMapNode(&graph, input_node->name(), map_fn->signature().name());
+  auto batch_node = AddBatchNode(&graph, map_node->name());
+  MapVectorization optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+  CheckVectorized(output, map_node->op(), batch_node->op(), input_node->name());
 }
 
+// TODO(rachelim): Add test that has a polymorphic function.
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/data/meta_optimizer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..584759f85d468157bbda142a5ebf654d264753cf
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/meta_optimizer.cc
@@ -0,0 +1,125 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/data/meta_optimizer.h"
+
+#include "tensorflow/core/grappler/clusters/cluster.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/optimizers/arithmetic_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
+#include "tensorflow/core/grappler/optimizers/dependency_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/function_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/model_pruner.h"
+#include "tensorflow/core/grappler/optimizers/shape_optimizer.h"
+#include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+namespace grappler {
+
+Status TFDataMetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
+                                     GraphDef* output) {
+  // Stores the optimized item so far.
+  GrapplerItem optimized_item = item;
+
+  // Perform optimizations in a meaningful order.
+  for (const auto& optimization :
+       {"noop_elimination",
+        "shuffle_and_repeat_fusion",
+        "map_fusion",
+        "filter_fusion",
+        "map_and_filter_fusion",
+        "hoist_random_uniform",
+        "map_parallelization",
+        "map_and_batch_fusion",
+        "map_vectorization",
+        "make_numa_aware",
+        "latency_all_edges",
+        "make_sloppy",
+        "pruning",
+        "function",
+        "shape",
+        "arithmetic",
+        "dependency"}) {
+    TF_RETURN_IF_ERROR(
+        ApplyOptimization(optimization, cluster, &optimized_item));
+  }
+
+  // Store the final result of all the optimizations in `output`.
+  output->Swap(&optimized_item.graph);
+  return Status::OK();
+}
+
+Status TFDataMetaOptimizer::ApplyOptimization(const string& name,
+                                              Cluster* cluster,
+                                              GrapplerItem* item) const {
+  GRAPPLER_RETURN_IF_DEADLINE_EXCEEDED();
+
+  const auto* optimizer = gtl::FindOrNull(enabled_optimizers_, name);
+  if (!optimizer) {
+    return Status::OK();
+  }
+
+  GraphDef result;
+  (*optimizer)->set_deadline_usec(this->deadline_usec());
+  TF_RETURN_IF_ERROR((*optimizer)->Optimize(cluster, *item, &result));
+  item->graph.Swap(&result);
+
+  return Status::OK();
+}
+
+Status TFDataMetaOptimizer::Init(
+    const tensorflow::RewriterConfig_CustomGraphOptimizer* config) {
+  if (!config) return Status::OK();
+
+  // Initialize custom tf.data optimizers based on config.
+  auto& optimizers = config->parameter_map().at("optimizers").list().s();
+  for (const auto& optimizer_name : optimizers) {
+    auto optimizer =
+        CustomGraphOptimizerRegistry::CreateByNameOrNull(optimizer_name);
+    if (optimizer) {
+      // None of our data optimizers implement a meaningful Init function.
+      // This returns an error in case any of them does.
+      TF_RETURN_IF_ERROR(optimizer->Init());
+      enabled_optimizers_[optimizer_name] = std::move(optimizer);
+    } else {
+      // This should never happen.
+      return errors::Internal(
+          "Tried to register a dataset optimizer that doesn't exist: ",
+          optimizer_name);
+    }
+  }
+
+  // Initialize standard grappler optimizers.
+  enabled_optimizers_["pruning"] = MakeUnique<ModelPruner>();
+  enabled_optimizers_["function"] =
+      MakeUnique<FunctionOptimizer>(RewriterConfig::ON);
+  enabled_optimizers_["shape"] = MakeUnique<ShapeOptimizer>();
+  enabled_optimizers_["arithmetic"] = MakeUnique<ArithmeticOptimizer>();
+  enabled_optimizers_["dependency"] = MakeUnique<DependencyOptimizer>();
+
+  return Status::OK();
+}
+
+void TFDataMetaOptimizer::Feedback(Cluster* cluster, const GrapplerItem& item,
+                                   const GraphDef& optimize_output,
+                                   double result) {
+  // no-op
+}
+
+REGISTER_GRAPH_OPTIMIZER_AS(TFDataMetaOptimizer, "tf_data_meta_optimizer");
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/meta_optimizer.h b/tensorflow/core/grappler/optimizers/data/meta_optimizer.h
new file mode 100644
index 0000000000000000000000000000000000000000..b65e7027777b165737b444106897c0bb97778450
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/meta_optimizer.h
@@ -0,0 +1,56 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_META_OPTIMIZER_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_META_OPTIMIZER_H_
+
+#include "absl/container/flat_hash_map.h"
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// This optimizer performs tf.data-specific optimizations by invoking
+// other optimizers.
+class TFDataMetaOptimizer : public CustomGraphOptimizer {
+ public:
+  TFDataMetaOptimizer() = default;
+  ~TFDataMetaOptimizer() override = default;
+
+  string name() const override { return "tf_data_meta_optimizer"; };
+
+  Status Init(
+      const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override;
+
+  Status Optimize(Cluster* cluster, const GrapplerItem& item,
+                  GraphDef* output) override;
+
+  void Feedback(Cluster* cluster, const GrapplerItem& item,
+                const GraphDef& optimize_output, double result) override;
+
+ private:
+  absl::flat_hash_map<string, std::unique_ptr<GraphOptimizer>>
+      enabled_optimizers_;
+
+  // Applies an optimization with the specified name on `item`, and stores
+  // the result in `item.graph`
+  Status ApplyOptimization(const string& name, Cluster* cluster,
+                           GrapplerItem* item) const;
+};
+
+}  // namespace grappler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_META_OPTIMIZER_H_
diff --git a/tensorflow/core/grappler/optimizers/data/noop_elimination.cc b/tensorflow/core/grappler/optimizers/data/noop_elimination.cc
index bd405c8329464793ee42757bc7ee1a3f34826bd9..851bbbdc1a28b91742bbfef3e98a4562b340a6c0 100644
--- a/tensorflow/core/grappler/optimizers/data/noop_elimination.cc
+++ b/tensorflow/core/grappler/optimizers/data/noop_elimination.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/data/noop_elimination.h"
 
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
@@ -70,21 +71,24 @@ bool IsNoOp(const NodeDef& node, const MutableGraphView& graph) {
 
 }  // namespace
 
-Status NoOpElimination::Optimize(Cluster* cluster, const GrapplerItem& item,
-                                 GraphDef* output) {
+Status NoOpElimination::OptimizeAndCollectStats(Cluster* cluster,
+                                                const GrapplerItem& item,
+                                                GraphDef* output,
+                                                OptimizationStats* stats) {
   *output = item.graph;
   MutableGraphView graph(output);
-  std::set<string> nodes_to_delete;
+  absl::flat_hash_set<string> nodes_to_delete;
   for (const NodeDef& node : item.graph.node()) {
     if (!IsNoOp(node, graph)) continue;
 
     NodeDef* const parent = graph_utils::GetInputNode(node, graph);
-    graph.UpdateFanouts(node.name(), parent->name());
+    TF_RETURN_IF_ERROR(graph.UpdateFanouts(node.name(), parent->name()));
 
     nodes_to_delete.insert(node.name());
+    stats->num_changes++;
   }
 
-  graph.DeleteNodes(nodes_to_delete);
+  TF_RETURN_IF_ERROR(graph.DeleteNodes(nodes_to_delete));
   return Status::OK();
 }
 
@@ -95,5 +99,5 @@ void NoOpElimination::Feedback(Cluster* cluster, const GrapplerItem& item,
 
 REGISTER_GRAPH_OPTIMIZER_AS(NoOpElimination, "noop_elimination");
 
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/noop_elimination.h b/tensorflow/core/grappler/optimizers/data/noop_elimination.h
index a65fccd882b782d4c6ead5ef9cb15e2cebd05e6f..11d86ad2a388da852cd4495b23277d6aecc143b6 100644
--- a/tensorflow/core/grappler/optimizers/data/noop_elimination.h
+++ b/tensorflow/core/grappler/optimizers/data/noop_elimination.h
@@ -16,14 +16,14 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_NOOP_ELIMINATION_H_
 #define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_NOOP_ELIMINATION_H_
 
-#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h"
 
 namespace tensorflow {
 namespace grappler {
 
 // This class eliminates tf.data transformations such as `take(n)` (for n < 0),
 // `skip(0)`, `repeat(1)`, or `prefetch(0)`.
-class NoOpElimination : public CustomGraphOptimizer {
+class NoOpElimination : public TFDataOptimizerBase {
  public:
   NoOpElimination() = default;
   ~NoOpElimination() override = default;
@@ -35,14 +35,15 @@ class NoOpElimination : public CustomGraphOptimizer {
     return Status::OK();
   }
 
-  Status Optimize(Cluster* cluster, const GrapplerItem& item,
-                  GraphDef* output) override;
+  Status OptimizeAndCollectStats(Cluster* cluster, const GrapplerItem& item,
+                                 GraphDef* output,
+                                 OptimizationStats* stats) override;
 
   void Feedback(Cluster* cluster, const GrapplerItem& item,
                 const GraphDef& optimize_output, double result) override;
 };
 
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace grappler
+}  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_NOOP_ELIMINATION_H_
diff --git a/tensorflow/core/grappler/optimizers/data/optimizer_base.cc b/tensorflow/core/grappler/optimizers/data/optimizer_base.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7fc0da357953906be87b02b2da10795b6e668cba
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/optimizer_base.cc
@@ -0,0 +1,34 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h"
+
+#include "tensorflow/core/common_runtime/metrics.h"
+
+namespace tensorflow {
+namespace grappler {
+
+Status TFDataOptimizerBase::Optimize(Cluster* cluster, const GrapplerItem& item,
+                                     GraphDef* output) {
+  OptimizationStats stats;
+  Status s = OptimizeAndCollectStats(cluster, item, output, &stats);
+  if (s.ok() && stats.num_changes > 0) {
+    metrics::RecordTFDataOptimization(name(), stats.num_changes);
+  }
+  return s;
+}
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/optimizer_base.h b/tensorflow/core/grappler/optimizers/data/optimizer_base.h
new file mode 100644
index 0000000000000000000000000000000000000000..45af5a4b7d4dcea9f3a1d6e31a8f8f10880f9d0b
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/optimizer_base.h
@@ -0,0 +1,47 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_OPTIMIZER_BASE_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_OPTIMIZER_BASE_H_
+
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// A base class for tf.data optimizers.
+class TFDataOptimizerBase : public CustomGraphOptimizer {
+ public:
+  struct OptimizationStats {
+    // Identifies the number of independent graph changes for an optimization.
+    int64 num_changes = 0;
+  };
+
+  TFDataOptimizerBase() = default;
+  ~TFDataOptimizerBase() override = default;
+
+  Status Optimize(Cluster* cluster, const GrapplerItem& item,
+                  GraphDef* output) final;
+
+  virtual Status OptimizeAndCollectStats(Cluster* cluster,
+                                         const GrapplerItem& item,
+                                         GraphDef* output,
+                                         OptimizationStats* stats) = 0;
+};
+
+}  // namespace grappler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_OPTIMIZER_BASE_H_
diff --git a/tensorflow/core/grappler/optimizers/data/rebatch.cc b/tensorflow/core/grappler/optimizers/data/rebatch.cc
new file mode 100644
index 0000000000000000000000000000000000000000..14c3931267598e2a8c9914f3f20cdec54fc5eb5e
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/rebatch.cc
@@ -0,0 +1,313 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/data/rebatch.h"
+
+#include "absl/container/flat_hash_map.h"
+#include "tensorflow/core/framework/attr_value_util.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/versions.pb.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
+#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
+#include "tensorflow/core/grappler/utils/functions.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+namespace grappler {
+
+Status RebatchOptimizer::Init(
+    const tensorflow::RewriterConfig_CustomGraphOptimizer* config) {
+  if (!config) return Status::OK();
+
+  num_workers_ = config->parameter_map().at("num_workers").i();
+  return Status::OK();
+}
+
+namespace {
+
+constexpr char kCastOp[] = "Cast";
+constexpr char kRealDivOp[] = "RealDiv";
+constexpr char kConstOp[] = "Const";
+
+constexpr std::array<const char*, 5> kBatchDatasetOps = {
+    "BatchDataset",
+    "BatchDatasetV2",
+    "ExperimentalMapAndBatchDataset",
+    "PaddedBatchDataset",
+    "PaddedBatchDatasetV2"
+};
+
+constexpr std::array<const char*, 2> kMultipleInputsDatasetOps = {
+    "ConcatenateDataset",
+    "ZipDataset"
+};
+
+constexpr std::array<const char*, 17> kPassThroughOps = {
+    "CacheDataset",
+    "FilterDataset",
+    "FilterByLastComponentDataset",
+    "Identity",
+    "MapDataset",
+    "ModelDataset",
+    "OptimizeDataset",
+    "ParallelMapDataset",
+    "PrefetchDataset",
+    "ReduceDataset",
+    "RepeatDataset",
+    "ShardDataset",
+    "ShuffleAndRepeatDataset",
+    "ShuffleDataset",
+    "SkipDataset",
+    "TakeDataset",
+    "WindowDataset"
+};
+
+constexpr std::array<const char*, 3> kFuncDatasetOps = {
+    "FlatMapDataset",
+    "InterleaveDataset",
+    "ParallelInterleaveDatasetV2"
+};
+
+constexpr std::array<const char*, 9> kSourceDatasetOps = {
+    "FixedLengthRecordDataset",
+    "FixedLengthRecordDatasetV2",
+    "GeneratorDataset",
+    "RangeDataset",
+    "SparseTensorsSliceDataset",
+    "TensorDataset",
+    "TensorSliceDataset",
+    "TextLineDataset",
+    "TFRecordDataset"
+};
+
+NodeDef* AddCastNode(const string& input, DataType src_t, DataType dst_t,
+                     MutableGraphView* graph) {
+  NodeDef cast_node;
+  cast_node.set_op(kCastOp);
+  cast_node.add_input(input);
+  graph_utils::SetUniqueGraphNodeName(cast_node.op(), graph->graph(),
+                                      &cast_node);
+  AddNodeAttr("SrcT", src_t, &cast_node);
+  AddNodeAttr("DstT", dst_t, &cast_node);
+
+  return graph->AddNode(std::move(cast_node));
+}
+
+NodeDef* AddBinaryNode(const string& input_x, const string& input_y,
+                       const string& op, DataType type,
+                       MutableGraphView* graph) {
+  NodeDef node;
+  node.set_op(op);
+  node.add_input(input_x);
+  node.add_input(input_y);
+  graph_utils::SetUniqueGraphNodeName(op, graph->graph(), &node);
+  AddNodeAttr("T", type, &node);
+
+  return graph->AddNode(std::move(node));
+}
+
+NodeDef* AddFloatDivNode(const string& input_x, const string& input_y,
+                         MutableGraphView* graph) {
+  return AddBinaryNode(input_x, input_y, kRealDivOp, DT_FLOAT, graph);
+}
+
+template <std::size_t SIZE>
+bool IsDatasetNodeOfType(const NodeDef& node,
+                         const std::array<const char*, SIZE>& arr) {
+  for (const auto& dataset_op_name : arr) {
+    if (node.op() == dataset_op_name) return true;
+  }
+  return false;
+}
+
+// Given a "batch" dataset node, modifies the batch_size input to divide the
+// current batch size by num_workers.
+Status MutateBatchSize(const NodeDef& node, int64 num_workers,
+                       MutableGraphView* graph) {
+  // TODO(rohanj): Fix up the output_shapes attribute as well. For this Dataset
+  // as well as all the downstream datasets.
+  // For all the batching datasets the batch_size is input number 1.
+  NodeDef* batch_size_node = graph_utils::GetInputNode(node, *graph, 1);
+  // By the time this optimization is run, the batch_size is computed and
+  // is a constant.
+  if (batch_size_node->op() != kConstOp) {
+    return errors::Internal("Batch size node should be a Const. Obtained: ",
+                            batch_size_node->op(), " instead.");
+  }
+  Tensor batch_size_tensor;
+  TF_RETURN_IF_ERROR(
+      GetNodeAttr(*batch_size_node, "value", &batch_size_tensor));
+  if (!TensorShapeUtils::IsScalar(batch_size_tensor.shape())) {
+    return errors::Internal("Batch size node shape should be scalar");
+  }
+  int64 batch_size = batch_size_tensor.scalar<int64>()();
+  if (batch_size % num_workers != 0) {
+    return errors::InvalidArgument(
+        "Batch size: ", batch_size,
+        " is not divisible by num_workers: ", num_workers);
+  }
+  batch_size /= num_workers;
+  NodeDef* new_batch_size_node =
+      graph_utils::AddScalarConstNode<int64>(batch_size, graph);
+  // We don't call UpdateFanouts here because CSE elimination might lead to
+  // multiple nodes sharing the same batch size constant node. This is also
+  // why we don't delete batch_size_node as well.
+  TF_RETURN_IF_ERROR(graph->UpdateRegularFaninByPort(
+      node.name(), 1, {new_batch_size_node->name(), 0}));
+  return Status::OK();
+}
+
+// There is one Sink node at least that is added to the end of the graph. We
+// find that node and return it. It is possible that there are multiple
+// Identity ops from the final Dataset op to that Sink node, but the recursive
+// graph traversal handles that.
+Status FindSinkNode(const GraphDef& graph_def, NodeDef* sink_node) {
+  absl::flat_hash_map<string, int> all_node_names;
+  absl::flat_hash_map<string, int> node_input_map;
+  for (int i = 0; i < graph_def.node_size(); ++i) {
+    all_node_names.insert_or_assign(graph_def.node(i).name(), i);
+    node_input_map.insert_or_assign(graph_def.node(i).name(), 0);
+  }
+  // Counts how many graph nodes is this node the input to. Candidate sink
+  // nodes are ones which are inputs into zero nodes.
+  for (const NodeDef& node : graph_def.node()) {
+    for (const string& input_name : node.input()) {
+      node_input_map[input_name]++;
+    }
+  }
+  for (const auto& it : node_input_map) {
+    if (it.second == 0) {
+      const NodeDef& sink_graph_node = graph_def.node(all_node_names[it.first]);
+      // Sometimes the searching surfaces Arg nodes in function cases that
+      // have no input. This check rejects those.
+      if (sink_graph_node.input_size() == 0) {
+        continue;
+      }
+      *sink_node = sink_graph_node;
+      return Status::OK();
+    }
+  }
+  return errors::InvalidArgument("Failed to find a sink node");
+}
+
+Status OptimizeGraph(const GrapplerItem& item, int64 num_workers,
+                     GraphDef* output);
+
+// Helper function that starts from a node in the graph and recurses into its
+// inputs trying to find a BatchDataset type operation to modify. During the
+// recursion it handles four kinds of cases.
+// 1. BatchDataset type ops: Mutates the batch_size input node and stops.
+// 2. Zip / Concatenate dataset ops: Recurses into all inputs to these ops
+//      as they are datasets themselves.
+// 3. Core dataset ops + Identity op: Recurses into first input parameter.
+// 4. FlatMap type mapping dataset ops: Recurses into the function definition.
+Status RecursivelyHandleOp(const NodeDef& node, int64 num_workers,
+                           FunctionLibraryDefinition* flib,
+                           MutableGraphView* graph) {
+  if (IsDatasetNodeOfType(node, kBatchDatasetOps)) {
+    return MutateBatchSize(node, num_workers, graph);
+  } else if (IsDatasetNodeOfType(node, kMultipleInputsDatasetOps)) {
+    // For all multiple input datasets, all inputs are datasets themselves.
+    for (int i = 0; i < node.input_size(); ++i) {
+      NodeDef* input_node = graph_utils::GetInputNode(node, *graph, i);
+      TF_RETURN_IF_ERROR(
+          RecursivelyHandleOp(*input_node, num_workers, flib, graph));
+    }
+  } else if (IsDatasetNodeOfType(node, kPassThroughOps)) {
+    // For all the dataset ops that are pass through, the input dataset is
+    // input 0.
+    NodeDef* input_node = graph_utils::GetInputNode(node, *graph, 0);
+    TF_RETURN_IF_ERROR(
+        RecursivelyHandleOp(*input_node, num_workers, flib, graph));
+  } else if (IsDatasetNodeOfType(node, kFuncDatasetOps)) {
+    const string func_name = node.attr().at("f").func().name();
+    const FunctionDef* fdef = flib->Find(func_name);
+    GrapplerFunctionItem f_item;
+    TF_RETURN_IF_ERROR(MakeGrapplerFunctionItem(
+        *fdef, *flib, graph->graph()->versions().producer(), &f_item));
+    GraphDef optimized_func_graph;
+    Status s = OptimizeGraph(f_item, num_workers, &optimized_func_graph);
+    if (s.ok()) {
+      // Function body optimization might have created new specialized
+      // functions for each instantiation context. Add them to the library.
+      for (const FunctionDef& func_def :
+           optimized_func_graph.library().function()) {
+        if (flib->Find(func_def.signature().name()) == nullptr) {
+          TF_RETURN_IF_ERROR(flib->AddFunctionDef(func_def));
+        }
+      }
+
+      // Convert optimized graph back to FunctionDef.
+      FunctionDef optimized_func;
+      f_item.SwapFunctionBody(std::move(optimized_func_graph));
+      TF_RETURN_IF_ERROR(MakeFunctionDef(f_item, *flib, &optimized_func));
+
+      // Replace optimized function with a new FunctionDef.
+      TF_RETURN_IF_ERROR(flib->ReplaceFunction(func_name, optimized_func));
+    }
+  } else if (IsDatasetNodeOfType(node, kSourceDatasetOps)) {
+    return errors::InvalidArgument(
+        "Reached a source dataset: ", node.op(),
+        " without encountering a batch transformation.");
+  } else {
+    return errors::InvalidArgument("Encountered an unsupported op: ",
+                                   node.op());
+  }
+  return Status::OK();
+}
+
+// Helper function that given a GrapplerItem generates a mutated graph def
+// with the batch size changed. The GrapplerItem could be generated from the
+// main graph or could be a function graph.
+Status OptimizeGraph(const GrapplerItem& item, int64 num_workers,
+                     GraphDef* output) {
+  *output = item.graph;
+  MutableGraphView graph(output);
+
+  FunctionLibraryDefinition flib(OpRegistry::Global(), item.graph.library());
+
+  NodeDef sink_node;
+  TF_RETURN_IF_ERROR(FindSinkNode(item.graph, &sink_node));
+  TF_RETURN_IF_ERROR(
+      RecursivelyHandleOp(sink_node, num_workers, &flib, &graph));
+  *output->mutable_library() = flib.ToProto();
+  return Status::OK();
+}
+
+}  // anonymous namespace
+
+Status RebatchOptimizer::OptimizeAndCollectStats(Cluster* cluster,
+                                                 const GrapplerItem& item,
+                                                 GraphDef* output,
+                                                 OptimizationStats* stats) {
+  *output = item.graph;
+  MutableGraphView graph(output);
+
+  TF_RETURN_IF_ERROR(OptimizeGraph(item, num_workers_, output));
+  stats->num_changes++;
+  return Status::OK();
+}
+
+void RebatchOptimizer::Feedback(Cluster* cluster, const GrapplerItem& item,
+                                const GraphDef& optimize_output,
+                                double result) {}
+
+REGISTER_GRAPH_OPTIMIZER_AS(RebatchOptimizer, "tf_data_rebatcher");
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/rebatch.h b/tensorflow/core/grappler/optimizers/data/rebatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..29a610002645b9dd88d8a278f68094b2121697ac
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/rebatch.h
@@ -0,0 +1,52 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_REBATCH_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_REBATCH_H_
+
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/grappler/mutable_graph_view.h"
+#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// This optimizer changes the batch size of the output dataset by dividing the
+// current batch size by parameter `num_workers`. Currently, this works only
+// for very simple pipelines with a single BatchDatasetV2 transformation.
+class RebatchOptimizer : public TFDataOptimizerBase {
+ public:
+  RebatchOptimizer() = default;
+  ~RebatchOptimizer() override = default;
+
+  string name() const override { return "tf_data_rebatcher"; }
+
+  Status Init(
+      const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override;
+
+  Status OptimizeAndCollectStats(Cluster* cluster, const GrapplerItem& item,
+                                 GraphDef* output,
+                                 OptimizationStats* stats) override;
+
+  void Feedback(Cluster* cluster, const GrapplerItem& item,
+                const GraphDef& optimize_output, double result) override;
+
+ private:
+  int64 num_workers_;
+};
+
+}  // namespace grappler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_REBATCH_H_
diff --git a/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.cc b/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.cc
index d9af78d38cd590f5eecefe4d70c7e45dd94985c0..0563460b29505f1b054a57624d470c4e642bea2f 100644
--- a/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.cc
+++ b/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.h"
 
-#include "tensorflow/core/framework/attr_value.pb.h"
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
 #include "tensorflow/core/grappler/grappler_item.h"
@@ -34,12 +34,12 @@ constexpr char kFusedOpName[] = "ShuffleAndRepeatDataset";
 
 }  // namespace
 
-Status ShuffleAndRepeatFusion::Optimize(Cluster* cluster,
-                                        const GrapplerItem& item,
-                                        GraphDef* output) {
+Status ShuffleAndRepeatFusion::OptimizeAndCollectStats(
+    Cluster* cluster, const GrapplerItem& item, GraphDef* output,
+    OptimizationStats* stats) {
   *output = item.graph;
   MutableGraphView graph(output);
-  std::set<string> nodes_to_delete;
+  absl::flat_hash_set<string> nodes_to_delete;
 
   auto make_shuffle_and_repeat_node = [&output](const NodeDef& shuffle_node,
                                                 const NodeDef& repeat_node) {
@@ -86,14 +86,16 @@ Status ShuffleAndRepeatFusion::Optimize(Cluster* cluster,
 
     NodeDef* shuffle_and_repeat_node =
         graph.AddNode(make_shuffle_and_repeat_node(shuffle_node, repeat_node));
-    graph.UpdateFanouts(repeat_node.name(), shuffle_and_repeat_node->name());
+    TF_RETURN_IF_ERROR(graph.UpdateFanouts(repeat_node.name(),
+                                           shuffle_and_repeat_node->name()));
 
     // Mark the `Shuffle` and `Repeat` nodes for removal.
     nodes_to_delete.insert(shuffle_node.name());
     nodes_to_delete.insert(repeat_node.name());
+    stats->num_changes++;
   }
 
-  graph.DeleteNodes(nodes_to_delete);
+  TF_RETURN_IF_ERROR(graph.DeleteNodes(nodes_to_delete));
   return Status::OK();
 }
 
@@ -107,5 +109,5 @@ void ShuffleAndRepeatFusion::Feedback(Cluster* cluster,
 REGISTER_GRAPH_OPTIMIZER_AS(ShuffleAndRepeatFusion,
                             "shuffle_and_repeat_fusion");
 
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.h b/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.h
index c8fa53edce38531671aa481c1dffbc5b8a28046b..3738d141c3a582fb9b214686a58e36e8869cea4e 100644
--- a/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.h
+++ b/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.h
@@ -16,12 +16,12 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_SHUFFLE_AND_REPEAT_FUSION_H_
 #define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_SHUFFLE_AND_REPEAT_FUSION_H_
 
-#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/data/optimizer_base.h"
 
 namespace tensorflow {
 namespace grappler {
 
-class ShuffleAndRepeatFusion : public CustomGraphOptimizer {
+class ShuffleAndRepeatFusion : public TFDataOptimizerBase {
  public:
   ShuffleAndRepeatFusion() = default;
   ~ShuffleAndRepeatFusion() override = default;
@@ -33,14 +33,15 @@ class ShuffleAndRepeatFusion : public CustomGraphOptimizer {
     return Status::OK();
   }
 
-  Status Optimize(Cluster* cluster, const GrapplerItem& item,
-                  GraphDef* output) override;
+  Status OptimizeAndCollectStats(Cluster* cluster, const GrapplerItem& item,
+                                 GraphDef* output,
+                                 OptimizationStats* stats) override;
 
   void Feedback(Cluster* cluster, const GrapplerItem& item,
                 const GraphDef& optimize_output, double result) override;
 };
 
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace grappler
+}  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_SHUFFLE_AND_REPEAT_FUSION_H_
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry_test.cc b/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry_test.cc
index 0eee91f241a8e3c09b93a159c93addb43e749b02..0f34d2b7ebe59244a9b02b5209732fc830cd6729 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry_test.cc
@@ -14,7 +14,6 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/optimizers/data/vectorization/vectorizer_registry.h"
-#include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/platform/test.h"
 
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization_utils.cc b/tensorflow/core/grappler/optimizers/data/vectorization_utils.cc
index 60c557d557e31173135cf9639efbf345a586faa1..1969ff00e4ae5147f183e1230986b4d2b4620fc7 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization_utils.cc
+++ b/tensorflow/core/grappler/optimizers/data/vectorization_utils.cc
@@ -46,7 +46,7 @@ namespace {
 // Describes a tensor with its operation Node and output position
 typedef std::pair<Node*, int> TensorDesc;
 
-const char* const kRetValOp = "_Retval";
+constexpr char kRetValOp[] = "_Retval";
 
 void ReplaceEdgeSources(const TensorDesc& old_src, const TensorDesc& new_src,
                         Graph* graph) {
@@ -415,6 +415,10 @@ Status Vectorization::Initialize(const FunctionDef& outer_scope,
 // NodeBuilder
 Status Vectorization::StackTensor(WrappedTensor* unstacked,
                                   TensorDesc* result) {
+  if (unstacked->node->output_type(unstacked->output_index) == DT_VARIANT) {
+    // TODO(b/124069171): "ExpandDims" doesn't work with Variant tensors.
+    return errors::Unimplemented("Cannot stack tensor with Variant type.");
+  }
   // Note that all these nodes are necessary as the size of the batch may not be
   // constant.
   if (unstacked->stacked) {
@@ -643,6 +647,6 @@ Status VectorizeMapDefun(const FunctionDef& outer_scope,
   return Vectorization(lib).Vectorize(outer_scope, map_defun_node, result);
 }
 
-}  // end namespace vectorization_utils
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace vectorization_utils
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/vectorization_utils.h b/tensorflow/core/grappler/optimizers/data/vectorization_utils.h
index bd7d3909003d0b32938d939fbf87b809b4aed0dd..f5183fd4ff905baf3ba52dc1a1bae53928603657 100644
--- a/tensorflow/core/grappler/optimizers/data/vectorization_utils.h
+++ b/tensorflow/core/grappler/optimizers/data/vectorization_utils.h
@@ -90,8 +90,8 @@ Status VectorizeMapDefun(const FunctionDef& outer_scope,
                          const NodeDef& map_defun_node, FunctionDefLibrary* lib,
                          FunctionDef** result);
 
-}  // end namespace vectorization_utils
-}  // end namespace grappler
-}  // end namespace tensorflow
+}  // namespace vectorization_utils
+}  // namespace grappler
+}  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_VECTORIZATION_UTILS_H_
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
index 7fee3ae9d51bcdb234945a6000985fb5531000a0..2dfa5e99d6af91cd6b7786fda111be2f6259db35 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
@@ -15,13 +15,13 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/dependency_optimizer.h"
 
-#include <unordered_map>
-#include <unordered_set>
-
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/graph/tensor_id.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/mutable_graph_view.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
 #include "tensorflow/core/grappler/utils.h"
@@ -38,20 +38,15 @@ namespace grappler {
 
 namespace {
 
-bool RemoveInput(NodeDef* node, const string& input, NodeMap* node_map) {
-  bool removed_input = false;
-  int pos = 0;
-  while (pos < node->input_size()) {
-    if (node->input(pos) == input) {
-      node->mutable_input()->SwapElements(pos, node->input_size() - 1);
-      node->mutable_input()->RemoveLast();
-      node_map->RemoveOutput(NodeName(input), node->name());
-      removed_input = true;
-    } else {
-      ++pos;
-    }
+// Builds a map from the &graph->node(i) to i.
+absl::flat_hash_map<const NodeDef*, int> BuildNodeToIdx(const GraphDef& graph) {
+  // Set up &node -> index map.
+  absl::flat_hash_map<const NodeDef*, int> node_to_idx;
+  for (int i = 0; i < graph.node_size(); ++i) {
+    const NodeDef& node = graph.node(i);
+    node_to_idx[&node] = i;
   }
-  return removed_input;
+  return node_to_idx;
 }
 
 }  // namespace
@@ -68,7 +63,9 @@ bool DependencyOptimizer::SafeToRemoveIdentity(const NodeDef& node) const {
     // The output values of this node may be needed.
     return false;
   }
-  const NodeDef* input = node_map_->GetNode(NodeName(node.input(0)));
+  MutableGraphView::OutputPort port = graph_view_->GetRegularFanin(
+      MutableGraphView::InputPort(const_cast<NodeDef*>(&node), 0));
+  NodeDef* input = port.node;
   CHECK(input != nullptr) << "node = " << node.name()
                           << " input = " << node.input(0);
   // Don't remove Identity nodes corresponding to Variable reads or following
@@ -77,22 +74,28 @@ bool DependencyOptimizer::SafeToRemoveIdentity(const NodeDef& node) const {
     return false;
   } else if (IsSwitch(*input)) {
     // Don't turn Identity nodes following Switch into NoOp or remove them
-    // if it requires anchoring a control dependencies the Switch node, which
+    // if it requires anchoring a control dependencies to the Switch node, which
     // is not valid.
-    if (str_util::StartsWith(node.name(), kConstantFoldingCtrl)) {
-      // TODO(rmlarsen): Try to remove this artificial contraint.
+    MutableGraphView::OutputPort control_port(const_cast<NodeDef*>(&node),
+                                              Graph::kControlSlot);
+    auto control_fanouts = graph_view_->GetFanout(control_port);
+    if (!control_fanouts.empty()) {
       return false;
     }
   }
-  for (auto consumer : node_map_->GetOutputs(node.name())) {
-    if (node.input_size() > 1 && IsMerge(*consumer)) {
+  bool node_has_multiple_inputs =
+      graph_view_->NumFanins(node, /*include_controlling_nodes=*/true) > 1;
+
+  auto fanouts =
+      graph_view_->GetFanouts(node, /*include_controlled_nodes=*/true);
+  for (auto fanout : fanouts) {
+    if (node_has_multiple_inputs && IsMerge(*fanout.node)) {
       return false;
     }
     if (IsSwitch(*input)) {
-      for (const string& consumer_input : consumer->input()) {
-        if (consumer_input == AsControlDependency(node.name())) {
-          return false;
-        }
+      if (graph_view_->HasFanin(*fanout.node,
+                                {node.name(), Graph::kControlSlot})) {
+        return false;
       }
     }
   }
@@ -116,7 +119,7 @@ bool DependencyOptimizer::SafeToConvertToNoOp(const NodeDef& node) const {
   if (!status.ok() || op_def->output_arg_size() == 0) {
     return false;
   }
-  const std::unordered_set<string> do_not_rewrite_ops{
+  const absl::flat_hash_set<string> do_not_rewrite_ops{
       "Assert",     "CheckNumerics",         "_Retval",
       "_Arg",       "_ParallelConcatUpdate", "TPUExecute",
       "TPUCompile", "ControlTrigger"};
@@ -126,7 +129,7 @@ bool DependencyOptimizer::SafeToConvertToNoOp(const NodeDef& node) const {
   if (!SafeToRemoveIdentity(node)) {
     return false;
   }
-  if (NumNonControlOutputs(node, *node_map_) > 0) {
+  if (graph_view_->NumFanouts(node, /*include_controlled_nodes=*/false) > 0) {
     // The output values of this node may be needed.
     return false;
   }
@@ -134,61 +137,61 @@ bool DependencyOptimizer::SafeToConvertToNoOp(const NodeDef& node) const {
 }
 
 int DependencyOptimizer::NumEdgesIfBypassed(
-    const NodeDef& node, const std::vector<NodeDef*>& output_nodes) const {
+    const NodeDef& node, int num_controlling_fanins,
+    const absl::flat_hash_set<MutableGraphView::Edge>& fanin_edges,
+    const absl::flat_hash_set<MutableGraphView::Edge>& fanout_edges,
+    int num_unique_fanout_nodes) const {
   const bool is_multi_input_identity_n =
       IsIdentityN(node) && !IsIdentityNSingleInput(node);
-  const int num_outputs = output_nodes.size();
-  const int num_inputs = node.input_size();
+  const int num_fanins = fanin_edges.size();
 
   if (is_multi_input_identity_n) {
     // multi-input identity_n with input/output control dependencies will likely
     // increase number of edges after optimization.
-    int num_edges_if_bypassed(0);
-    for (string input_node_name : node.input()) {
-      if (IsControlInput(input_node_name)) {
-        num_edges_if_bypassed += num_outputs;
+    int num_edges_if_bypassed = 0;
+    int num_non_controlling_fanins = num_fanins - num_controlling_fanins;
+    num_edges_if_bypassed += num_non_controlling_fanins;
+    num_edges_if_bypassed += num_controlling_fanins * num_unique_fanout_nodes;
+
+    for (const auto& fanout : fanout_edges) {
+      if (fanout.dst.port_id == Graph::kControlSlot) {
+        num_edges_if_bypassed += num_fanins;
       } else {
         ++num_edges_if_bypassed;
       }
     }
-
-    for (auto consumer : output_nodes) {
-      for (int j = 0; j < consumer->input_size(); ++j) {
-        const TensorId consumer_input = ParseTensorName(consumer->input(j));
-        if (consumer_input.node() == node.name()) {
-          if (IsControlInput(consumer_input)) {
-            num_edges_if_bypassed += num_inputs;
-          } else {
-            ++num_edges_if_bypassed;
-          }
-        }
-      }
-    }
     return num_edges_if_bypassed;
   } else {
-    return num_inputs * num_outputs;
+    return num_fanins * num_unique_fanout_nodes;
   }
 }
 
 bool DependencyOptimizer::BypassingNodeIsBeneficial(
-    const NodeDef& node, const std::vector<NodeDef*>& input_nodes,
-    const std::vector<NodeDef*>& output_nodes) const {
+    const NodeDef& node, int num_controlling_fanins,
+    const absl::flat_hash_set<MutableGraphView::Edge>& fanin_edges,
+    const absl::flat_hash_set<MutableGraphView::Edge>& fanout_edges) const {
   const bool is_identity = IsIdentity(node) || IsIdentityNSingleInput(node);
   const bool is_multi_input_identity_n =
       IsIdentityN(node) && !IsIdentityNSingleInput(node);
-  const int num_outputs = output_nodes.size();
-  const int num_inputs = node.input_size();
+  const int num_fanins = fanin_edges.size();
+  absl::flat_hash_set<NodeDef*> unique_fanout_nodes;
+  for (const auto& fanout_edge : fanout_edges) {
+    unique_fanout_nodes.insert(fanout_edge.dst.node);
+  }
+  const int num_unique_fanout_nodes = unique_fanout_nodes.size();
 
-  if (NumEdgesIfBypassed(node, output_nodes) > num_inputs + num_outputs) {
+  if (NumEdgesIfBypassed(node, num_controlling_fanins, fanin_edges,
+                         fanout_edges, num_unique_fanout_nodes) >
+      num_fanins + num_unique_fanout_nodes) {
     return false;
   }
 
   // Make sure that we don't increase the number of edges that cross
   // device boundaries.
-  if ((num_inputs == 1 && num_outputs > 1 &&
-       input_nodes[0]->device() != node.device()) ||
-      (num_inputs > 1 && num_outputs == 1 &&
-       output_nodes[0]->device() != node.device())) {
+  if ((num_fanins == 1 && num_unique_fanout_nodes > 1 &&
+       fanin_edges.begin()->src.node->device() != node.device()) ||
+      (num_fanins > 1 && num_unique_fanout_nodes == 1 &&
+       fanout_edges.begin()->dst.node->device() != node.device())) {
     return false;
   }
 
@@ -197,114 +200,90 @@ bool DependencyOptimizer::BypassingNodeIsBeneficial(
   // cost before and after.
   const string& node_dev = node.device();
   int num_cross_in = 0;
-  for (NodeDef* input_node : input_nodes) {
-    num_cross_in += static_cast<int>(input_node->device() != node_dev);
+  for (const auto& fanin : fanin_edges) {
+    num_cross_in += static_cast<int>(fanin.src.node->device() != node_dev);
   }
   int num_cross_out = 0;
-  for (NodeDef* output_node : output_nodes) {
-    num_cross_out += static_cast<int>(output_node->device() != node_dev);
-  }
-
-  if ((is_identity || is_multi_input_identity_n) && num_cross_in > 0 &&
-      num_cross_out > 0) {
-    // This identity node follows a device crossing, so it might be
-    // following a _Recv node after partioning. Do not remove such nodes,
-    // unless they only have consumers on the same device as themselves.
-    return false;
+  for (const auto& fanout : unique_fanout_nodes) {
+    num_cross_out += static_cast<int>(fanout->device() != node_dev);
   }
 
   // Make sure we do not increase the number of device crossings.
   const int num_cross_before = num_cross_in + num_cross_out;
   int num_cross_after = 0;
-  for (NodeDef* input_node : input_nodes) {
-    for (NodeDef* output_node : output_nodes) {
+  for (const auto& fanin : fanin_edges) {
+    for (const auto& fanout : unique_fanout_nodes) {
       num_cross_after +=
-          static_cast<int>(input_node->device() != output_node->device());
+          static_cast<int>(fanin.src.node->device() != fanout->device());
     }
   }
   if (num_cross_after > num_cross_before) {
     return false;
   }
+
+  if ((is_identity || is_multi_input_identity_n) && num_cross_in > 0 &&
+      num_cross_out > 0 && num_cross_after > 0) {
+    // This identity node follows a device crossing, so it might be
+    // following a _Recv node after partioning. Do not remove such nodes,
+    // unless they only have consumers on the same device as themselves.
+    return false;
+  }
+
   return true;
 }
 
-void DependencyOptimizer::OptimizeNode(int node_idx,
-                                       SetVector<int>* nodes_to_simplify,
-                                       std::set<int>* nodes_to_delete) {
-  NodeDef* node = optimized_graph_->mutable_node(node_idx);
+Status DependencyOptimizer::OptimizeNode(
+    const string& node_name, SetVector<string>* nodes_to_simplify,
+    absl::flat_hash_set<string>* nodes_to_delete) {
+  NodeDef* node = graph_view_->GetNode(node_name);
   const bool is_noop = IsNoOp(*node);
   const bool is_identity = IsIdentity(*node) || IsIdentityNSingleInput(*node);
   const bool is_multi_input_identity =
       IsIdentityN(*node) && !IsIdentityNSingleInput(*node);
-  const string node_name = node->name();
-  // Constant nodes with no input control dependency are always executed early,
-  // so we can prune all their output control dependencies.
-  if (IsConstant(*node) && node->input_size() == 0) {
-    const std::set<NodeDef*> output_nodes = node_map_->GetOutputs(node_name);
-    for (NodeDef* fanout : output_nodes) {
-      bool optimize_fanout = false;
-      bool data_connection = false;
-      for (int i = fanout->input_size() - 1; i >= 0; --i) {
-        const TensorId input_tensor = ParseTensorName(fanout->input(i));
-        if (input_tensor.node() == node_name) {
-          if (input_tensor.index() < 0) {
-            fanout->mutable_input()->SwapElements(i, fanout->input_size() - 1);
-            fanout->mutable_input()->RemoveLast();
-            optimize_fanout = true;
-          } else {
-            data_connection = true;
-          }
-        }
-      }
-      if (optimize_fanout) {
-        nodes_to_simplify->PushBack(node_to_idx_[fanout]);
-        if (!data_connection) {
-          node_map_->RemoveOutput(node_name, fanout->name());
-        }
-      }
+  // WARNING: This is a strong assumption based on the executor behavior that
+  // constant nodes with no input control dependency are always executed early.
+  // In this case we then can prune all their output control dependencies.
+  if (IsConstant(*node) &&
+      graph_view_->NumFanins(*node, /*include_controlling_nodes=*/true) == 0) {
+    MutableGraphView::OutputPort control_port(node, Graph::kControlSlot);
+    auto control_fanouts = graph_view_->GetFanout(control_port);
+    for (const auto& fanout : control_fanouts) {
+      TF_RETURN_IF_ERROR(
+          graph_view_->RemoveControllingFanin(fanout.node->name(), node_name));
+      nodes_to_simplify->PushBack(fanout.node->name());
     }
-    if (node_map_->GetOutputs(node_name).empty() && fetch_nodes_known_ &&
+
+    if (graph_view_->NumFanouts(*node, /*include_controlled_nodes=*/true) ==
+            0 &&
+        fetch_nodes_known_ &&
         nodes_to_preserve_.find(node_name) == nodes_to_preserve_.end()) {
       // Mark the node for deletion.
-      nodes_to_delete->insert(node_to_idx_[node]);
+      nodes_to_delete->insert(node_name);
     }
-    return;
+    return Status::OK();
   }
 
   // Change ops that only have control dependencies as outputs to NoOps.
   if (!is_noop && SafeToConvertToNoOp(*node)) {
-    VLOG(1) << "***** Replacing  " << node_name << " (" << node->op()
+    VLOG(1) << "***** Replacing " << node_name << " (" << node->op()
             << ") with NoOp.";
     // The outputs of this node are not consumed. Replace its inputs with
     // control dependencies and replace the op itself with the NoOp op.
-    std::unordered_set<string> ctrl_inputs;
-    int pos = 0;
-    while (pos < node->input_size()) {
-      const string old_input = node->input(pos);
-      if (IsControlInput(old_input)) {
-        if (!ctrl_inputs.insert(old_input).second) {
-          // We found a duplicate control input. Remove it.
-          node->mutable_input()->SwapElements(pos, node->input_size() - 1);
-          node->mutable_input()->RemoveLast();
-        } else {
-          ++pos;
-        }
-        continue;
-      }
-      // Replace a normal input with a control input.
-      const string ctrl_input = ConstantFolding::AddControlDependency(
-          old_input, optimized_graph_, node_map_.get());
-      ctrl_inputs.insert(ctrl_input);
-      node->set_input(pos, ctrl_input);
-      node_map_->UpdateInput(node_name, old_input, ctrl_input);
-      const NodeDef* old_input_node = node_map_->GetNode(old_input);
-      nodes_to_simplify->PushBack(node_to_idx_[old_input_node]);
-      ++pos;
+    const int num_regular_fanins =
+        graph_view_->NumFanins(*node, /*include_controlling_nodes=*/false);
+    absl::flat_hash_set<string> regular_fanin_names;
+    for (int i = 0; i < num_regular_fanins; ++i) {
+      regular_fanin_names.emplace(ParseTensorName(node->input(i)).node());
+    }
+    TF_RETURN_IF_ERROR(
+        graph_view_->UpdateAllRegularFaninsToControlling(node_name));
+    TF_RETURN_IF_ERROR(
+        graph_view_->UpdateNode(node_name, "NoOp", node->device(), {}));
+    for (const string& regular_fanin_name : regular_fanin_names) {
+      nodes_to_simplify->PushBack(regular_fanin_name);
     }
-    node->set_op("NoOp");
-    node->clear_attr();
-    nodes_to_simplify->PushBack(node_to_idx_[node]);
-    return;
+    nodes_to_simplify->PushBack(node_name);
+    return Status::OK();
   }
 
   // Remove NoOp nodes if the product of their fan-in and fan-out is less than
@@ -357,154 +336,131 @@ void DependencyOptimizer::OptimizeNode(int node_idx,
 
   if (is_noop || ((is_identity || is_multi_input_identity) &&
                   SafeToRemoveIdentity(*node))) {
-    const auto& output_node_set = node_map_->GetOutputs(node_name);
-    const std::vector<NodeDef*> output_nodes(output_node_set.begin(),
-                                             output_node_set.end());
-    const int num_inputs = node->input_size();
-    std::vector<NodeDef*> input_nodes;
-    for (int i = 0; i < num_inputs; ++i) {
-      NodeDef* input_node = node_map_->GetNode(node->input(i));
-      if (input_node == nullptr) {
-        LOG(ERROR) << "Invalid input " << node->input(i);
-        return;
+    auto fanin_edges =
+        graph_view_->GetFaninEdges(*node, /*include_controlling_edges=*/true);
+    std::vector<NodeDef*> controlling_fanins;
+    controlling_fanins.reserve(fanin_edges.size());
+    for (const auto& fanin_edge : fanin_edges) {
+      if (fanin_edge.src.port_id == Graph::kControlSlot) {
+        controlling_fanins.push_back(fanin_edge.src.node);
       }
-      input_nodes.push_back(input_node);
     }
-
-    if (!BypassingNodeIsBeneficial(*node, input_nodes, output_nodes)) {
-      return;
+    auto fanout_edges =
+        graph_view_->GetFanoutEdges(*node, /*include_controlled_edges=*/true);
+    if (!BypassingNodeIsBeneficial(*node, controlling_fanins.size(),
+                                   fanin_edges, fanout_edges)) {
+      return Status::OK();
     }
 
     VLOG(1) << "***** Rerouting input around\n" << node->DebugString();
-    // Now remove the node and re-wire its inputs to its outputs.
-    for (auto consumer : output_nodes) {
-      bool updated_consumer = false;
-      VLOG(1) << "consumer before:\n" << consumer->DebugString();
-      for (int i = 0; i < num_inputs; ++i) {
-        const NodeDef* input = input_nodes[i];
-        // Forward dependency from input to consumer if it doesn't already
-        // depend on it.
-        if ((is_identity && i == 0) ||
-            (is_multi_input_identity && !IsControlInput(node->input(i)))) {
-          // Replace regular input from Identity node.
-          string new_input;
-          const string& input_to_forward = node->input(i);
-          CHECK(!IsControlInput(input_to_forward));
-          for (int j = 0; j < consumer->input_size(); ++j) {
-            const TensorId old_input = ParseTensorName(consumer->input(j));
-            if (old_input.node() == node_name) {
-              if (old_input.index() == i) {
-                // Regular input
-                new_input = input_to_forward;
-                node_map_->UpdateInput(consumer->name(), old_input.ToString(),
-                                       new_input);
-                consumer->set_input(j, new_input);
-              } else if (old_input.index() == -1) {
-                // Control dependency
-                new_input = AsControlDependency(NodeName(input_to_forward));
-                node_map_->UpdateInput(consumer->name(), old_input.ToString(),
-                                       new_input);
-                consumer->set_input(j, new_input);
-              }
-            }
-          }
-          updated_consumer = true;
-        } else {
-          // Forward dependency from input to consumer if it doesn't already
-          // depend on it.
-          if (node_map_->GetOutputs(input->name()).count(consumer) == 0) {
-            consumer->add_input(AsControlDependency(input->name()));
-            node_map_->AddOutput(input->name(), consumer->name());
-            nodes_to_simplify->PushBack(node_to_idx_[input]);
-            updated_consumer = true;
-          }
-        }
+
+    absl::flat_hash_set<NodeDef*> processed_nodes;
+    for (const auto& fanout_edge : fanout_edges) {
+      NodeDef* consumer = fanout_edge.dst.node;
+      const int src_port = fanout_edge.src.port_id;
+      if ((is_identity && src_port == 0) ||
+          (is_multi_input_identity && src_port > Graph::kControlSlot)) {
+        // Identity regular fanins.
+        const string& input_to_forwards = node->input(src_port);
+        TF_RETURN_IF_ERROR(graph_view_->UpdateRegularFaninByPort(
+            consumer->name(), fanout_edge.dst.port_id,
+            ParseTensorName(input_to_forwards)));
+      } else if (is_identity || is_multi_input_identity) {
+        // Identity control dependency.
+        // TODO(lyandy): Handle IdentityN properly here by adding all regular
+        // fanins as controlling fanins.
+        const string& node_first_input = node->input(0);
+        TF_RETURN_IF_ERROR(graph_view_->UpdateFanin(
+            consumer->name(), {node_name, Graph::kControlSlot},
+            {ParseTensorName(node_first_input).node(), Graph::kControlSlot}));
+      } else {
+        // NoOp.
+        TF_RETURN_IF_ERROR(
+            graph_view_->RemoveControllingFanin(consumer->name(), node_name));
       }
-      // Remove dependency on node from consumer.
-      updated_consumer |= RemoveInput(consumer, AsControlDependency(node_name),
-                                      node_map_.get());
-      if (updated_consumer) {
-        nodes_to_simplify->PushBack(node_to_idx_[consumer]);
+      processed_nodes.insert(consumer);
+      nodes_to_simplify->PushBack(consumer->name());
+    }
+    for (const auto& processed_node : processed_nodes) {
+      // Forward dependency from input to consumer if it doesn't already
+      // depend on it.
+      for (const auto& controlling_fanin : controlling_fanins) {
+        TF_RETURN_IF_ERROR(graph_view_->AddControllingFanin(
+            processed_node->name(),
+            {controlling_fanin->name(), Graph::kControlSlot}));
+        nodes_to_simplify->PushBack(controlling_fanin->name());
       }
-      VLOG(1) << "consumer after:\n" << consumer->DebugString();
     }
-    node_map_->RemoveOutputs(node_name);
+
     if (fetch_nodes_known_ &&
         nodes_to_preserve_.find(node_name) == nodes_to_preserve_.end()) {
-      // Mark the node for deletion.
-      nodes_to_delete->insert(node_idx);
-
       // Disconnect the node from its inputs to enable further optimizations.
-      node_map_->RemoveInputs(node_name);
-      node->clear_input();
+      TF_RETURN_IF_ERROR(graph_view_->RemoveAllFanins(
+          node_name, /*keep_controlling_fanins=*/false));
+      // Mark the node for deletion.
+      nodes_to_delete->insert(node_name);
     }
   }
-}
-
-void DependencyOptimizer::CleanControlInputs() {
-  for (int i = 0; i < optimized_graph_->node_size(); ++i) {
-    DedupControlInputs(optimized_graph_->mutable_node(i));
-  }
+  return Status::OK();
 }
 
 Status DependencyOptimizer::OptimizeDependencies() {
-  SetVector<int> nodes_to_simplify;
-  std::set<int> nodes_to_delete;
-  for (int i = 0; i < optimized_graph_->node_size(); ++i) {
-    const NodeDef& node = optimized_graph_->node(i);
+  SetVector<string> nodes_to_simplify;
+  absl::flat_hash_set<string> nodes_to_delete;
+  for (int i = 0; i < graph_view_->graph()->node_size(); ++i) {
+    const NodeDef& node = graph_view_->graph()->node(i);
     if (IsNoOp(node) || IsIdentity(node) || IsIdentityN(node) ||
         IsConstant(node) || SafeToConvertToNoOp(node)) {
-      nodes_to_simplify.PushBack(i);
+      nodes_to_simplify.PushBack(node.name());
     }
   }
   while (!nodes_to_simplify.Empty()) {
-    int node_to_simplify = nodes_to_simplify.PopBack();
+    string node_to_simplify = nodes_to_simplify.PopBack();
     // Discard nodes that were marked for deletion already.
     while (nodes_to_delete.find(node_to_simplify) != nodes_to_delete.end()) {
       node_to_simplify = nodes_to_simplify.PopBack();
     }
-    OptimizeNode(node_to_simplify, &nodes_to_simplify, &nodes_to_delete);
+    TF_RETURN_IF_ERROR(
+        OptimizeNode(node_to_simplify, &nodes_to_simplify, &nodes_to_delete));
   }
 
   if (fetch_nodes_known_) {
     VLOG(1) << "Deleted " << nodes_to_delete.size() << " out of "
-            << optimized_graph_->node_size() << " nodes.";
-    EraseNodesFromGraph(nodes_to_delete, optimized_graph_);
-    node_map_.reset(new NodeMap(optimized_graph_));
-    BuildNodeToIdx();
+            << graph_view_->graph()->node_size() << " nodes.";
+    TF_RETURN_IF_ERROR(graph_view_->DeleteNodes(nodes_to_delete));
   }
   return Status::OK();
 }
 
 Status DependencyOptimizer::TransitiveReduction() {
   // PRECONDITION: optimized_graph_ must be sorted topologically.
-  const int num_nodes = optimized_graph_->node_size();
+  GraphDef* graph = graph_view_->graph();
+  auto node_to_idx = BuildNodeToIdx(*graph);
+  const int num_nodes = graph->node_size();
   // Set up a compressed version of the graph to save a constant factor in the
   // expensive algorithm below. Also cache the set of control outputs and the
   // highest index of a target of any control output from each node.
   int num_controls = 0;
   std::vector<gtl::InlinedVector<int, 4>> inputs(num_nodes);
-  std::vector<gtl::InlinedVector<std::pair<int, int>, 2>> control_outputs(
-      num_nodes);
+  std::vector<gtl::InlinedVector<int, 2>> control_outputs(num_nodes);
   for (int node_idx = 0; node_idx < num_nodes; ++node_idx) {
-    const NodeDef& node = optimized_graph_->node(node_idx);
+    const NodeDef& node = graph->node(node_idx);
     if (ModifiesFrameInfo(node) || !HasOpDef(node)) {
       // Ignore function nodes and nodes that modify frame info.
       continue;
     }
-    for (int input_slot = 0; input_slot < node.input_size(); ++input_slot) {
-      const string& input = node.input(input_slot);
-      const NodeDef* input_node = node_map_->GetNode(input);
+    for (const string& input : node.input()) {
+      const NodeDef* input_node = graph_view_->GetNode(NodeName(input));
       if (ModifiesFrameInfo(*input_node) || IsMerge(*input_node)) {
         // Ignore edges from nodes that modify frame info and from Merge nodes,
         // because we cannot know which of it's input paths executes.
         continue;
       }
-      const int input_node_idx = node_to_idx_[input_node];
+      const int input_node_idx = node_to_idx[input_node];
       inputs[node_idx].push_back(input_node_idx);
       if (IsControlInput(input)) {
         ++num_controls;
-        control_outputs[input_node_idx].emplace_back(node_idx, input_slot);
+        control_outputs[input_node_idx].emplace_back(node_idx);
       }
     }
   }
@@ -519,14 +475,12 @@ Status DependencyOptimizer::TransitiveReduction() {
   // such that when we swap them out so we don't clobber the
   // node(target).input() repeated field.
   typedef std::pair<int, int> InputSlotAndSource;
-  std::unordered_map<
-      int, std::set<InputSlotAndSource, std::greater<InputSlotAndSource>>>
-      control_edges_to_remove;
+  absl::flat_hash_map<int, absl::flat_hash_set<int>> control_edges_to_remove;
   for (int source = 0; source < num_nodes; ++source) {
     int highest_control_target = -1;
     for (const auto& control_output : control_outputs[source]) {
-      if (control_output.first > highest_control_target) {
-        highest_control_target = control_output.first;
+      if (control_output > highest_control_target) {
+        highest_control_target = control_output;
       }
     }
     if (highest_control_target <= source) {
@@ -556,26 +510,21 @@ Status DependencyOptimizer::TransitiveReduction() {
     // longer than 1, there exists an alternate path, and we can eliminate the
     // redundant direct control dependency.
     for (const auto& control_output : control_outputs[source]) {
-      const int target = control_output.first;
+      const int target = control_output;
       if (longest_distance[target] > 1) {
-        const int input_slot = control_output.second;
-        control_edges_to_remove[target].emplace(input_slot, source);
+        control_edges_to_remove[target].emplace(source);
       }
     }
   }
 
   for (const auto& it : control_edges_to_remove) {
     const int target = it.first;
-    NodeDef* target_node = optimized_graph_->mutable_node(target);
-    for (const InputSlotAndSource& slot_and_source : it.second) {
-      const int input_slot = slot_and_source.first;
-      const int source = slot_and_source.second;
-      const NodeDef& source_node = optimized_graph_->node(source);
-      CHECK_LT(input_slot, target_node->input_size());
-      target_node->mutable_input()->SwapElements(input_slot,
-                                                 target_node->input_size() - 1);
-      node_map_->RemoveOutput(source_node.name(), target_node->name());
-      target_node->mutable_input()->RemoveLast();
+    const NodeDef& target_node = graph->node(target);
+    const string target_node_name = target_node.name();
+    for (const int& source : it.second) {
+      const NodeDef& source_node = graph->node(source);
+      TF_RETURN_IF_ERROR(graph_view_->RemoveControllingFanin(
+          target_node_name, source_node.name()));
       ++num_controls_removed;
     }
   }
@@ -584,26 +533,17 @@ Status DependencyOptimizer::TransitiveReduction() {
   return Status::OK();
 }
 
-void DependencyOptimizer::BuildNodeToIdx() {
-  // Set up &node -> index map.
-  node_to_idx_.clear();
-  for (int i = 0; i < optimized_graph_->node_size(); ++i) {
-    const NodeDef& node = optimized_graph_->node(i);
-    node_to_idx_[&node] = i;
-  }
-}
-
 // Suppose there are cross-device control inputs to node C from multiple nodes
 // that are located on another device, e.g., we have control edges:
 // A->C, B->C
 // where A and B are on device X and C is on device Y.
 // We can reduce cross-device communication by introducing an intermediate
 // NoOp node C' on device X and rewriting the control edges to:
-// A->C', B->C', C' -> C
-void DependencyOptimizer::GroupCrossDeviceControlEdges() {
-  const int num_nodes = optimized_graph_->node_size();
+// A->C', B->C', C'->C
+Status DependencyOptimizer::GroupCrossDeviceControlEdges() {
+  const int num_nodes = graph_view_->graph()->node_size();
   for (int i = 0; i < num_nodes; ++i) {
-    NodeDef* node = optimized_graph_->mutable_node(i);
+    NodeDef* node = graph_view_->graph()->mutable_node(i);
     if (node->device().empty()) continue;
 
     // Creates new noop nodes for devices on which multiple control inputs are
@@ -614,88 +554,71 @@ void DependencyOptimizer::GroupCrossDeviceControlEdges() {
     // that device.
     std::map<string, NodeDef*> noops;
     int num_noops = 0;
-    for (int j = 0; j < node->input_size(); ++j) {
-      if (IsControlInput(node->input(j))) {
-        const NodeDef* input = node_map_->GetNode(node->input(j));
-        if (input != nullptr && !input->device().empty() &&
-            input->device() != node->device()) {
-          auto emplace_result = noops.emplace(input->device(), nullptr);
-          if (!emplace_result.second &&
-              emplace_result.first->second == nullptr) {
-            // This is the second cross-device control input from the same
-            // device. Creates an intermediate noop node on that device.
-            string group_name;
-            NodeDef* noop;
-            // Creates a fresh node name; there may be conflicting names from
-            // a previous iteration of the optimizer.
-            do {
-              group_name = AddPrefixToNodeName(
-                  node->name(),
-                  strings::StrCat("GroupCrossDeviceControlEdges_", num_noops));
-              noop = node_map_->GetNode(group_name);
-              ++num_noops;
-            } while (noop != nullptr);
-            noop = optimized_graph_->add_node();
-            noop->set_name(group_name);
-            noop->set_device(input->device());
-            noop->set_op("NoOp");
-            node_map_->AddNode(noop->name(), noop);
-            emplace_result.first->second = noop;
-          }
+    auto controlling_fanins = graph_view_->GetFanin(
+        MutableGraphView::InputPort(node, Graph::kControlSlot));
+    for (const auto& controlling_fanin : controlling_fanins) {
+      const NodeDef* fanin_node = controlling_fanin.node;
+      if (!fanin_node->device().empty() &&
+          fanin_node->device() != node->device()) {
+        auto emplace_result = noops.emplace(fanin_node->device(), nullptr);
+        if (!emplace_result.second && emplace_result.first->second == nullptr) {
+          // This is the second cross-device control input from the same
+          // device. Creates an intermediate noop node on that device.
+          string group_name;
+          NodeDef* noop;
+          // Creates a fresh node name; there may be conflicting names from
+          // a previous iteration of the optimizer.
+          do {
+            group_name = AddPrefixToNodeName(
+                node->name(),
+                strings::StrCat("GroupCrossDeviceControlEdges_", num_noops));
+            noop = graph_view_->GetNode(group_name);
+            ++num_noops;
+          } while (noop != nullptr);
+          NodeDef new_node;
+          new_node.set_name(group_name);
+          new_node.set_device(fanin_node->device());
+          new_node.set_op("NoOp");
+          emplace_result.first->second =
+              graph_view_->AddNode(std::move(new_node));
         }
       }
     }
 
     // Reroute existing control edges to go via the newly introduced NoOp nodes.
-    int pos = 0;
-    while (pos < node->input_size()) {
-      const string& input_name = node->input(pos);
-      if (IsControlInput(input_name)) {
-        NodeDef* input = node_map_->GetNode(input_name);
-        if (input == nullptr) {
-          ++pos;
-        } else {
-          auto it = noops.find(input->device());
-          if (it == noops.end() || it->second == nullptr) {
-            ++pos;
-          } else {
-            node->mutable_input()->SwapElements(pos, node->input_size() - 1);
-            node->mutable_input()->RemoveLast();
-            it->second->add_input(AsControlDependency(*input));
-            node_map_->UpdateOutput(input_name, node->name(),
-                                    it->second->name());
-          }
-        }
-      } else {
-        ++pos;
+    for (const auto& controlling_fanin : controlling_fanins) {
+      auto it = noops.find(controlling_fanin.node->device());
+      if (it != noops.end() && it->second != nullptr) {
+        TF_RETURN_IF_ERROR(graph_view_->RemoveControllingFanin(
+            node->name(), controlling_fanin.node->name()));
+        TF_RETURN_IF_ERROR(graph_view_->AddControllingFanin(
+            it->second->name(),
+            {controlling_fanin.node->name(), Graph::kControlSlot}));
       }
     }
     for (const auto& entry : noops) {
       if (entry.second) {
-        node->add_input(AsControlDependency(*entry.second));
-        node_map_->AddOutput(entry.second->name(), node->name());
+        TF_RETURN_IF_ERROR(graph_view_->AddControllingFanin(
+            node->name(), {entry.second->name(), Graph::kControlSlot}));
       }
     }
   }
+  return Status::OK();
 }
 
 Status DependencyOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
                                      GraphDef* optimized_graph) {
-  optimized_graph_ = optimized_graph;
-  *optimized_graph_ = item.graph;
+  *optimized_graph = item.graph;
   nodes_to_preserve_ = item.NodesToPreserve();
   fetch_nodes_known_ = !item.fetch.empty();
-  CleanControlInputs();
+  graph_view_.reset(new MutableGraphView(optimized_graph));
 
   const int num_iterations = 2;
   for (int iteration = 0; iteration < num_iterations; ++iteration) {
     GRAPPLER_RETURN_IF_DEADLINE_EXCEEDED();
     Status topo_sort_status;
     // Perform topological sort to prepare the graph for transitive reduction.
-    topo_sort_status = TopologicalSort(optimized_graph_);
-    // Set up index-based graph datastructures to speed up analysis steps below.
-    node_map_.reset(new NodeMap(optimized_graph_));
-    BuildNodeToIdx();
+    topo_sort_status = TopologicalSort(optimized_graph);
 
     if (topo_sort_status.ok()) {
       // Remove redundant control dependencies.
@@ -709,10 +632,7 @@ Status DependencyOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
     // nodes.
     TF_RETURN_IF_ERROR(OptimizeDependencies());
 
-    // Dedup control inputs.
-    CleanControlInputs();
-
-    GroupCrossDeviceControlEdges();
+    TF_RETURN_IF_ERROR(GroupCrossDeviceControlEdges());
   }
 
   return Status::OK();
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.h b/tensorflow/core/grappler/optimizers/dependency_optimizer.h
index 7b032673fb3456a724d8021a5dcebc8b4c957ba8..a60e7a352272748d3b2bb7163e73e96a4f46338d 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.h
@@ -17,6 +17,10 @@ limitations under the License.
 #define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DEPENDENCY_OPTIMIZER_H_
 
 #include <unordered_set>
+
+#include "absl/container/flat_hash_set.h"
+#include "absl/strings/string_view.h"
+#include "tensorflow/core/grappler/mutable_graph_view.h"
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
@@ -30,8 +34,7 @@ namespace grappler {
 class DependencyOptimizer : public GraphOptimizer {
  public:
   DependencyOptimizer() {}
-  explicit DependencyOptimizer(RewriterConfig::Toggle opt_level)
-      : opt_level_(opt_level) {}
+  explicit DependencyOptimizer(RewriterConfig::Toggle opt_level) {}
   ~DependencyOptimizer() override {}
 
   string name() const override { return "dependency_optimizer"; };
@@ -46,24 +49,25 @@ class DependencyOptimizer : public GraphOptimizer {
   // Returns true if bypassing node does not increase the number of edges or
   // number of edges crossing a device boundary.
   bool BypassingNodeIsBeneficial(
-      const NodeDef& node, const std::vector<NodeDef*>& input_nodes,
-      const std::vector<NodeDef*>& output_nodes) const;
-  int NumEdgesIfBypassed(const NodeDef& node,
-                         const std::vector<NodeDef*>& output_nodes) const;
+      const NodeDef& node, int num_controlling_fanins,
+      const absl::flat_hash_set<MutableGraphView::Edge>& fanin_edges,
+      const absl::flat_hash_set<MutableGraphView::Edge>& fanout_edges) const;
+  int NumEdgesIfBypassed(
+      const NodeDef& node, int num_controlling_fanins,
+      const absl::flat_hash_set<MutableGraphView::Edge>& fanin_edges,
+      const absl::flat_hash_set<MutableGraphView::Edge>& fanout_edges,
+      int num_unique_fanout_nodes) const;
   // Returns true if node is not an Identity node or if it is an Identity
   // that is safe to remove.
   bool SafeToRemoveIdentity(const NodeDef& node) const;
   // Returns true if it is safe to convert node to NoOp.
   bool SafeToConvertToNoOp(const NodeDef& node) const;
-  // Removes all duplicate control dependencies.
-  void CleanControlInputs();
-  // Builds a map from the &optimized_graph_->node(i) to i.
-  void BuildNodeToIdx();
-  // Tries to optimize the node with the given index, possibly additional
+  // Tries to optimize the node with the given node name, possibly additional
   // optimizations by inserting nodes in nodes_to_simplify, and pruning nodes by
   // inserting them in nodes_to_delete.
-  void OptimizeNode(int node_idx, SetVector<int>* nodes_to_simplify,
-                    std::set<int>* nodes_to_delete);
+  Status OptimizeNode(const string& node_name,
+                      SetVector<string>* nodes_to_simplify,
+                      absl::flat_hash_set<string>* nodes_to_delete);
   // Eliminates redundant control dependencies by computing the transitive
   // reduction of the graph.
   Status TransitiveReduction();
@@ -71,14 +75,11 @@ class DependencyOptimizer : public GraphOptimizer {
   Status OptimizeDependencies();
   // Replaces multiple cross-device control edges from the same device with a
   // single control edge.
-  void GroupCrossDeviceControlEdges();
+  Status GroupCrossDeviceControlEdges();
 
-  RewriterConfig::Toggle opt_level_;
   bool fetch_nodes_known_;
   std::unordered_set<string> nodes_to_preserve_;
-  std::unique_ptr<NodeMap> node_map_;
-  std::unordered_map<const NodeDef*, int> node_to_idx_;
-  GraphDef* optimized_graph_;  // Not owned.
+  std::unique_ptr<MutableGraphView> graph_view_;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
index 8d70d9d5c73690e87d84cf941c749948e47ace26..80a0189b302366c420fc26bb8dc3ac469839e3b0 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
@@ -41,13 +41,32 @@ void VerifyGraphsEqual(const GraphDef& original_graph,
     const NodeDef& optimized = optimized_graph.node(i);
     EXPECT_EQ(original.name(), optimized.name()) << func;
     EXPECT_EQ(original.op(), optimized.op()) << func;
-    EXPECT_EQ(original.input_size(), optimized.input_size()) << func;
+    ASSERT_EQ(original.input_size(), optimized.input_size()) << func;
     for (int j = 0; j < original.input_size(); ++j) {
       EXPECT_EQ(original.input(j), optimized.input(j)) << func;
     }
   }
 }
 
+bool NodeHasControllingFanins(const NodeDef& node,
+                              const absl::flat_hash_set<string>& expected) {
+  absl::flat_hash_set<string> actual;
+  for (const string& fanin : node.input()) {
+    if (IsControlInput(fanin)) {
+      actual.insert(fanin);
+    }
+  }
+  if (actual.size() != expected.size()) {
+    return false;
+  }
+  for (const auto& expected_fanin : expected) {
+    if (!actual.contains(expected_fanin)) {
+      return false;
+    }
+  }
+  return true;
+}
+
 TEST_F(DependencyOptimizerTest, NoOp) {
   // This trivial graph is so basic there's nothing to optimize.
   TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"});
@@ -89,12 +108,12 @@ TEST_F(DependencyOptimizerTest, DependenciesDrivenByConstants) {
   TF_EXPECT_OK(status);
 
   // The 'z' node should have been optimized away leaving only 5 nodes.
-  EXPECT_EQ(5, output.node_size());
+  EXPECT_EQ(output.node_size(), 5);
 
-  for (const NodeDef& node : item.graph.node()) {
+  for (const NodeDef& node : output.node()) {
     if (node.name() == "id1" || node.name() == "id2") {
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("add", node.input(0));
+      ASSERT_EQ(node.input_size(), 1);
+      EXPECT_EQ(node.input(0), "add");
     }
   }
 }
@@ -123,30 +142,30 @@ TEST_F(DependencyOptimizerTest, ChangeToNoop) {
   status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
-  EXPECT_EQ(item.graph.node_size(), output.node_size());
+  EXPECT_EQ(output.node_size(), item.graph.node_size());
   int found = 0;
-  for (int i = 0; i < item.graph.node_size(); ++i) {
-    const NodeDef& node = item.graph.node(i);
+  for (int i = 0; i < output.node_size(); ++i) {
+    const NodeDef& node = output.node(i);
     // "add" should get turned into a NoOp and removed.
-    EXPECT_NE("add", node.name());
+    EXPECT_NE(node.name(), "add");
     if (node.name() == "id1") {
-      EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("x", node.input(0));
-      EXPECT_EQ("^y", node.input(1));
+      EXPECT_EQ(node.op(), "Identity");
+      ASSERT_EQ(node.input_size(), 2);
+      EXPECT_EQ(node.input(0), "x");
+      EXPECT_EQ(node.input(1), "^y");
       ++found;
     } else if (node.name() == "id2") {
-      EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("y", node.input(0));
-      EXPECT_EQ("^x", node.input(1));
+      EXPECT_EQ(node.op(), "Identity");
+      ASSERT_EQ(node.input_size(), 2);
+      EXPECT_EQ(node.input(0), "y");
+      EXPECT_EQ(node.input(1), "^x");
       ++found;
     }
   }
-  EXPECT_EQ(2, found);
+  EXPECT_EQ(found, 2);
 }
 
-TEST_F(DependencyOptimizerTest, ChangeToNoop_RepeatedInput) {
+TEST_F(DependencyOptimizerTest, ChangeToNoopRepeatedInput) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output x = ops::RandomUniform(s.WithOpName("x"), {1, 2}, DT_FLOAT);
   Output add = ops::Add(s.WithOpName("add"), x, x);
@@ -164,25 +183,24 @@ TEST_F(DependencyOptimizerTest, ChangeToNoop_RepeatedInput) {
   item.graph.Swap(&output);
   status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
-  LOG(INFO) << output.DebugString();
 
-  EXPECT_EQ(item.graph.node_size(), output.node_size());
+  EXPECT_EQ(output.node_size(), item.graph.node_size());
   int found = 0;
   for (int i = 0; i < item.graph.node_size(); ++i) {
     const NodeDef& node = item.graph.node(i);
     // "add" should get turned into a NoOp and removed.
-    EXPECT_NE("add", node.name());
+    EXPECT_NE(node.name(), "add");
     if (node.name() == "id1") {
-      EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ(node.op(), "Identity");
+      EXPECT_EQ(node.input_size(), 1);
+      EXPECT_EQ(node.input(0), "x");
       ++found;
     }
   }
-  EXPECT_EQ(1, found);
+  EXPECT_EQ(found, 1);
 }
 
-TEST_F(DependencyOptimizerTest, ChangeToNoop_SwitchIdentity) {
+TEST_F(DependencyOptimizerTest, ChangeToNoopSwitchIdentity) {
   // This tests that we don't try to repeatedly add Identity nodes
   // with names like "ConstantFoldingCtrl/foo/bar/switch_$port" when
   // multiple nodes reading the same output of a Switch node get
@@ -220,23 +238,23 @@ TEST_F(DependencyOptimizerTest, ChangeToNoop_SwitchIdentity) {
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
-  EXPECT_EQ(item.graph.node_size() - 1, output.node_size());
+  EXPECT_EQ(output.node_size(), item.graph.node_size() - 1);
   for (int i = 0; i < output.node_size(); ++i) {
     const NodeDef& node = output.node(i);
     // "neg" should be eliminated.
-    EXPECT_NE("neg", node.name());
+    EXPECT_NE(node.name(), "neg");
     // A control dep from "^ConstantFoldingCtrl/switch_1"
     // should be attached to "c1".
     if (node.name() == "c1") {
-      EXPECT_EQ("Const", node.op());
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("^ConstantFoldingCtrl/switch_1", node.input(0));
+      EXPECT_EQ(node.op(), "Const");
+      ASSERT_EQ(node.input_size(), 1);
+      EXPECT_EQ(node.input(0), "^ConstantFoldingCtrl/switch_1");
     }
   }
 }
 
 // TODO(rmlarsen): Add test to make sure we skip Switch and Merge.
-TEST_F(DependencyOptimizerTest, ChangeToNoop_NoFetch) {
+TEST_F(DependencyOptimizerTest, ChangeToNoopNoFetch) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output x = ops::RandomUniform(s.WithOpName("x"), {1, 2}, DT_FLOAT);
   Output y = ops::RandomUniform(s.WithOpName("y"), {1, 2}, DT_FLOAT);
@@ -258,7 +276,7 @@ TEST_F(DependencyOptimizerTest, ChangeToNoop_NoFetch) {
   VerifyGraphsEqual(item.graph, output, __FUNCTION__);
 }
 
-TEST_F(DependencyOptimizerTest, RemoveNoOps_EmptyInputOrOutput) {
+TEST_F(DependencyOptimizerTest, RemoveNoOpsEmptyInputOrOutput) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output x = ops::RandomUniform(s, {1, 2}, DT_FLOAT);
   auto noop1 = ops::NoOp(s);
@@ -278,18 +296,18 @@ TEST_F(DependencyOptimizerTest, RemoveNoOps_EmptyInputOrOutput) {
   status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
-  EXPECT_EQ(item.graph.node_size(), output.node_size());
+  EXPECT_EQ(output.node_size(), item.graph.node_size());
   for (const NodeDef& node : output.node()) {
     if (node.name() == "NoOp" || node.name() == "NoOp_1") {
       EXPECT_EQ(0, node.input_size());
     } else if (node.name() == "Identity") {
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("RandomUniform", node.input(0));
+      ASSERT_EQ(node.input_size(), 1);
+      EXPECT_EQ(node.input(0), "RandomUniform");
     }
   }
 }
 
-TEST_F(DependencyOptimizerTest, RemoveNoOps_DeviceBoundaries) {
+TEST_F(DependencyOptimizerTest, RemoveNoOpsDeviceBoundaries) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output x = ops::RandomUniform(s.WithOpName("x").WithDevice("/CPU:0"), {1, 2},
                                 DT_FLOAT);
@@ -324,7 +342,7 @@ TEST_F(DependencyOptimizerTest, RemoveNoOps_DeviceBoundaries) {
   VerifyGraphsEqual(item.graph, output, __FUNCTION__);
 }
 
-TEST_F(DependencyOptimizerTest, RemoveIdentityOps_DeviceBoundaries) {
+TEST_F(DependencyOptimizerTest, RemoveIdentityOpsDeviceBoundaries) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output x = ops::RandomUniform(s.WithOpName("x").WithDevice("/CPU:0"), {1, 2},
                                 DT_FLOAT);
@@ -356,7 +374,34 @@ TEST_F(DependencyOptimizerTest, RemoveIdentityOps_DeviceBoundaries) {
   VerifyGraphsEqual(item.graph, output, __FUNCTION__);
 }
 
-TEST_F(DependencyOptimizerTest, RemoveNoOps_SingleInputOrOutput) {
+TEST_F(DependencyOptimizerTest, RemoveIdentityOpsIdenticalDevices) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x = ops::RandomUniform(s.WithOpName("x").WithDevice("/CPU:0"), {1, 2},
+                                DT_FLOAT);
+  auto id_a = ops::Identity(s.WithOpName("id_a").WithDevice("/CPU:1"), x);
+  Output id =
+      ops::Identity(s.WithControlDependencies(id_a).WithDevice("/CPU:0"), id_a);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch.push_back("Identity");
+
+  DependencyOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(output.node_size(), item.graph.node_size() - 1);
+  for (const NodeDef& node : output.node()) {
+    EXPECT_NE(node.name(), "id_a");
+    if (node.name() == "Identity") {
+      ASSERT_EQ(node.input_size(), 1);
+      EXPECT_EQ(node.input(0), "x");
+    }
+  }
+}
+
+TEST_F(DependencyOptimizerTest, RemoveNoOpsSingleInputOrOutput) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output x = ops::RandomUniform(s.WithOpName("x"), {1, 2}, DT_FLOAT);
   Output y = ops::RandomUniform(s.WithOpName("y"), {1, 2}, DT_FLOAT);
@@ -383,15 +428,17 @@ TEST_F(DependencyOptimizerTest, RemoveNoOps_SingleInputOrOutput) {
   status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
-  EXPECT_EQ(item.graph.node_size(), output.node_size());
+  EXPECT_EQ(output.node_size(), item.graph.node_size());
   for (const NodeDef& node : output.node()) {
     if (node.name() == "NoOp" || node.name() == "NoOp_1") {
-      EXPECT_EQ(0, node.input_size());
+      EXPECT_EQ(node.input_size(), 0);
     } else if (node.name() == "Identity") {
-      EXPECT_EQ("x", node.input(0));
+      ASSERT_EQ(node.input_size(), 1);
+      EXPECT_EQ(node.input(0), "x");
     } else if (node.name() == "Identity_1") {
-      EXPECT_EQ("y", node.input(0));
-      EXPECT_EQ("^x", node.input(1));
+      ASSERT_EQ(node.input_size(), 2);
+      EXPECT_EQ(node.input(0), "y");
+      EXPECT_EQ(node.input(1), "^x");
     }
   }
 }
@@ -436,48 +483,46 @@ TEST_F(DependencyOptimizerTest, RemoveIdentity) {
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
-  EXPECT_EQ(item.graph.node_size() - 3, output.node_size());
+  EXPECT_EQ(output.node_size(), item.graph.node_size() - 3);
   int found = 0;
   for (const NodeDef& node : output.node()) {
-    EXPECT_NE("id_a", node.name());
-    EXPECT_NE("id_b", node.name());
-    EXPECT_NE("id_c", node.name());
+    EXPECT_NE(node.name(), "id_a");
+    EXPECT_NE(node.name(), "id_b");
+    EXPECT_NE(node.name(), "id_c");
     if (node.name() == "a_a" || node.name() == "a_b") {
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("x", node.input(0));
+      ASSERT_EQ(node.input_size(), 1);
+      EXPECT_EQ(node.input(0), "x");
       ++found;
     }
     if (node.name() == "a_c" || node.name() == "a_d") {
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("z", node.input(0));
-      EXPECT_EQ("^x", node.input(1));
+      ASSERT_EQ(node.input_size(), 2);
+      EXPECT_EQ(node.input(0), "z");
+      EXPECT_EQ(node.input(1), "^x");
       ++found;
     }
     if (node.name() == "b_a") {
-      EXPECT_EQ(3, node.input_size());
-      EXPECT_EQ("x", node.input(0));
-      EXPECT_EQ("^y", node.input(1));
-      EXPECT_EQ("^z", node.input(2));
+      ASSERT_EQ(node.input_size(), 3);
+      EXPECT_EQ(node.input(0), "x");
+      EXPECT_TRUE(NodeHasControllingFanins(node, {"^y", "^z"}));
       ++found;
     }
     if (node.name() == "c_a") {
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("x", node.input(0));
-      EXPECT_EQ("^y", node.input(1));
+      ASSERT_EQ(node.input_size(), 2);
+      EXPECT_EQ(node.input(0), "x");
+      EXPECT_EQ(node.input(1), "^y");
       ++found;
     }
     if (node.name() == "c_b") {
-      EXPECT_EQ(3, node.input_size());
-      EXPECT_EQ("z", node.input(0));
-      EXPECT_EQ("^x", node.input(1));
-      EXPECT_EQ("^y", node.input(2));
+      ASSERT_EQ(node.input_size(), 3);
+      EXPECT_EQ(node.input(0), "z");
+      EXPECT_TRUE(NodeHasControllingFanins(node, {"^x", "^y"}));
       ++found;
     }
   }
   EXPECT_EQ(found, 7);
 }
 
-TEST_F(DependencyOptimizerTest, RemoveIdentity_RepeatedInputs) {
+TEST_F(DependencyOptimizerTest, RemoveIdentityRepeatedInputs) {
   // Corner cases with repeated inputs.
   tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
   ops::Variable x(scope.WithOpName("x"), {}, DT_BOOL);
@@ -503,35 +548,35 @@ TEST_F(DependencyOptimizerTest, RemoveIdentity_RepeatedInputs) {
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
-  EXPECT_EQ(item.graph.node_size() - 1, output.node_size());
+  EXPECT_EQ(output.node_size(), item.graph.node_size() - 1);
   int found = 0;
   for (const NodeDef& node : output.node()) {
-    EXPECT_NE("id0", node.name());
+    EXPECT_NE(node.name(), "id0");
     if (node.name() == "or0") {
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("switch:1", node.input(0));
-      EXPECT_EQ("switch:1", node.input(1));
+      ASSERT_EQ(node.input_size(), 2);
+      EXPECT_EQ(node.input(0), "switch:1");
+      EXPECT_EQ(node.input(1), "switch:1");
       ++found;
     }
     if (node.name() == "or1") {
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("switch:1", node.input(0));
-      EXPECT_EQ("y", node.input(1));
+      ASSERT_EQ(node.input_size(), 2);
+      EXPECT_EQ(node.input(0), "switch:1");
+      EXPECT_EQ(node.input(1), "y");
       ++found;
     }
     if (node.name() == "or2") {
       // or1 should be unchanged.
-      EXPECT_EQ(3, node.input_size());
-      EXPECT_EQ("y", node.input(0));
-      EXPECT_EQ("y", node.input(1));
-      EXPECT_EQ("^id1", node.input(2));
+      ASSERT_EQ(node.input_size(), 3);
+      EXPECT_EQ(node.input(0), "y");
+      EXPECT_EQ(node.input(1), "y");
+      EXPECT_EQ(node.input(2), "^id1");
       ++found;
     }
   }
   EXPECT_EQ(found, 3);
 }
 
-TEST_F(DependencyOptimizerTest, Transitive_Reduction_Simple) {
+TEST_F(DependencyOptimizerTest, TransitiveReductionSimple) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output c = ops::Const(s.WithOpName("c"), {1.0f, 2.0f}, {1, 2});
   Output x = ops::Square(s.WithOpName("x"), c);
@@ -546,13 +591,13 @@ TEST_F(DependencyOptimizerTest, Transitive_Reduction_Simple) {
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
-  EXPECT_EQ(4, output.node_size());
-  EXPECT_EQ("neg2", output.node(3).name());
-  EXPECT_EQ(1, output.node(3).input_size());
-  EXPECT_EQ("neg1", output.node(3).input(0));
+  ASSERT_EQ(output.node_size(), 4);
+  EXPECT_EQ(output.node(3).name(), "neg2");
+  ASSERT_EQ(output.node(3).input_size(), 1);
+  EXPECT_EQ(output.node(3).input(0), "neg1");
 }
 
-TEST_F(DependencyOptimizerTest, ChangeToNoop_Identity) {
+TEST_F(DependencyOptimizerTest, ChangeToNoopIdentity) {
   tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
   ops::Variable v_in(scope.WithOpName("v_in"), {3}, DT_FLOAT);
   Output id_after_var = ops::Identity(scope.WithOpName("id_after_var"), v_in);
@@ -583,18 +628,18 @@ TEST_F(DependencyOptimizerTest, ChangeToNoop_Identity) {
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
-  EXPECT_EQ(item.graph.node_size() - 2, output.node_size());
+  EXPECT_EQ(output.node_size(), item.graph.node_size() - 2);
   bool found = false;
   for (int i = 0; i < output.node_size(); ++i) {
     const NodeDef& node = output.node(i);
     // "id0" and "id1" but neither "ConstantFoldingCtrl/switch_1",
     // "id_after_var, nor "id2"" should be eliminated.
-    EXPECT_NE("id0", node.name());
-    EXPECT_NE("id1", node.name());
+    EXPECT_NE(node.name(), "id0");
+    EXPECT_NE(node.name(), "id1");
     if (node.name() == "c1") {
-      EXPECT_EQ("Const", node.op());
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("^ConstantFoldingCtrl/switch_1", node.input(0));
+      EXPECT_EQ(node.op(), "Const");
+      ASSERT_EQ(node.input_size(), 1);
+      EXPECT_EQ(node.input(0), "^ConstantFoldingCtrl/switch_1");
       found = true;
     }
   }
@@ -624,17 +669,17 @@ TEST_F(DependencyOptimizerTest, IdentityInputs) {
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
-  EXPECT_EQ(6, output.node_size());
-  EXPECT_EQ("out1", output.node(4).name());
-  EXPECT_EQ(1, output.node(4).input_size());
-  EXPECT_EQ("s", output.node(4).input(0));
+  ASSERT_EQ(output.node_size(), 6);
+  EXPECT_EQ(output.node(4).name(), "out2");
+  ASSERT_EQ(output.node(4).input_size(), 1);
+  EXPECT_EQ(output.node(4).input(0), "s:1");
 
-  EXPECT_EQ("out2", output.node(5).name());
-  EXPECT_EQ(1, output.node(5).input_size());
-  EXPECT_EQ("s:1", output.node(5).input(0));
+  EXPECT_EQ(output.node(5).name(), "out1");
+  ASSERT_EQ(output.node(5).input_size(), 1);
+  EXPECT_EQ(output.node(5).input(0), "s");
 }
 
-TEST_F(DependencyOptimizerTest, RemoveIdentityN_SwitchInput) {
+TEST_F(DependencyOptimizerTest, RemoveIdentityNSwitchInput) {
   tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
   Output b = ops::Placeholder(scope.WithOpName("b"), DT_BOOL);
   Output x = ops::RandomUniform(scope.WithOpName("x"), {1, 2}, DT_FLOAT);
@@ -661,27 +706,27 @@ TEST_F(DependencyOptimizerTest, RemoveIdentityN_SwitchInput) {
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
-  EXPECT_EQ(8, output.node_size());
+  ASSERT_EQ(output.node_size(), 8);
 
   auto out1_node = output.node(7);
-  EXPECT_EQ("out1", out1_node.name());
-  EXPECT_EQ(1, out1_node.input_size());
-  EXPECT_EQ("s", out1_node.input(0));
+  EXPECT_EQ(out1_node.name(), "out1");
+  ASSERT_EQ(out1_node.input_size(), 1);
+  EXPECT_EQ(out1_node.input(0), "s");
 
-  auto out2_node = output.node(4);
-  EXPECT_EQ("out2", out2_node.name());
-  EXPECT_EQ(1, out2_node.input_size());
-  EXPECT_EQ("s:1", out2_node.input(0));
+  auto out2_node = output.node(6);
+  EXPECT_EQ(out2_node.name(), "out2");
+  ASSERT_EQ(out2_node.input_size(), 1);
+  EXPECT_EQ(out2_node.input(0), "s:1");
 
   auto out3_node = output.node(5);
-  EXPECT_EQ("out3", out3_node.name());
-  EXPECT_EQ(1, out3_node.input_size());
-  EXPECT_EQ("s", out3_node.input(0));
-
-  auto out4_node = output.node(6);
-  EXPECT_EQ("out4", out4_node.name());
-  EXPECT_EQ(1, out4_node.input_size());
-  EXPECT_EQ("s:1", out4_node.input(0));
+  EXPECT_EQ(out3_node.name(), "out3");
+  ASSERT_EQ(out3_node.input_size(), 1);
+  EXPECT_EQ(out3_node.input(0), "s");
+
+  auto out4_node = output.node(4);
+  EXPECT_EQ(out4_node.name(), "out4");
+  ASSERT_EQ(out4_node.input_size(), 1);
+  EXPECT_EQ(out4_node.input(0), "s:1");
 }
 
 TEST_F(DependencyOptimizerTest, DoNotRemoveIdentityNWithControlDependency) {
@@ -704,11 +749,11 @@ TEST_F(DependencyOptimizerTest, DoNotRemoveIdentityNWithControlDependency) {
   Status status = optimizer.Optimize(nullptr, item, &optimized_graph_def);
   TF_EXPECT_OK(status);
 
-  EXPECT_EQ(6, optimized_graph_def.node_size());
+  EXPECT_EQ(optimized_graph_def.node_size(), 6);
 }
 
 TEST_F(DependencyOptimizerTest,
-       Identity_DeviceCrossing_ConsumerOnDifferentDevice) {
+       IdentityDeviceCrossingConsumerOnDifferentDevice) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output x_on_1 =
       ops::Const(s.WithOpName("x_on_1").WithDevice("/gpu:1"), {1.0f}, {});
@@ -730,7 +775,7 @@ TEST_F(DependencyOptimizerTest,
   VerifyGraphsEqual(item.graph, output, __FUNCTION__);
 }
 
-TEST_F(DependencyOptimizerTest, Identity_DeviceCrossing_ConsumerOnSameDevice) {
+TEST_F(DependencyOptimizerTest, IdentityDeviceCrossingConsumerOnSameDevice) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output x_on_1 =
       ops::Const(s.WithOpName("x_on_1").WithDevice("/gpu:1"), {1.0f}, {});
@@ -748,12 +793,14 @@ TEST_F(DependencyOptimizerTest, Identity_DeviceCrossing_ConsumerOnSameDevice) {
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
-  LOG(INFO) << output.DebugString();
-  EXPECT_EQ(3, output.node_size());
+
+  EXPECT_EQ(output.node_size(), 3);
   for (const auto& node : output.node()) {
-    EXPECT_NE("x_on_2", node.name());
+    EXPECT_NE(node.name(), "x_on_2");
     if (node.name() == "result") {
-      EXPECT_EQ("x_on_1", node.input(0));
+      ASSERT_EQ(node.input_size(), 2);
+      EXPECT_EQ(node.input(0), "x_on_1");
+      EXPECT_EQ(node.input(1), "one_on_2");
     }
   }
 }
@@ -781,25 +828,25 @@ TEST_F(DependencyOptimizerTest, RemoveGreaterEqualWithNoOp) {
   for (const NodeDef& node : output.node()) {
     if (node.name() == "x") {
       count++;
-      EXPECT_EQ("Placeholder", node.op());
-      EXPECT_EQ(0, node.input_size());
+      EXPECT_EQ(node.op(), "Placeholder");
+      EXPECT_EQ(node.input_size(), 0);
     } else if (node.name() == "y") {
       count++;
-      EXPECT_EQ("Placeholder", node.op());
-      EXPECT_EQ(0, node.input_size());
+      EXPECT_EQ(node.op(), "Placeholder");
+      EXPECT_EQ(node.input_size(), 0);
     } else if (node.name() == "GreaterEqual") {
       count++;
     } else if (node.name() == "NoOp") {
       count++;
     } else if (node.name() == "z") {
       count++;
-      EXPECT_EQ("Add", node.op());
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("x", node.input(0));
-      EXPECT_EQ("y", node.input(1));
+      EXPECT_EQ(node.op(), "Add");
+      ASSERT_EQ(node.input_size(), 2);
+      EXPECT_EQ(node.input(0), "x");
+      EXPECT_EQ(node.input(1), "y");
     }
   }
-  EXPECT_EQ(3, count);
+  EXPECT_EQ(count, 3);
 }
 
 TEST_F(DependencyOptimizerTest, GroupCrossDeviceControlDeps) {
diff --git a/tensorflow/core/grappler/optimizers/function_api_info.cc b/tensorflow/core/grappler/optimizers/function_api_info.cc
index 497ad6032ea80b22e5b5e2b23b2860b7c99fc57b..9f6352f1f2efa4b299dff163858ad5b4c88b41b8 100644
--- a/tensorflow/core/grappler/optimizers/function_api_info.cc
+++ b/tensorflow/core/grappler/optimizers/function_api_info.cc
@@ -29,10 +29,10 @@ FunctionApiInfo::~FunctionApiInfo() {}
 Status FunctionApiInfo::Init(const FunctionDef& function_def) {
   function_type_ = FunctionApiInfo::FunctionType::INFERENCE;
   for (const auto& attr : function_def.attr()) {
-    if (attr.first == "experimental_api_preferred_device") {
+    if (attr.first == "api_preferred_device") {
       preferred_device_ = attr.second.s();
     }
-    if (attr.first == "experimental_api_implements") {
+    if (attr.first == "api_implements") {
       interface_name_ = attr.second.s();
     }
     if (attr.first == "forward_function_name") {
diff --git a/tensorflow/core/grappler/optimizers/function_api_info.h b/tensorflow/core/grappler/optimizers/function_api_info.h
index 9a5f548951f0931e98fbe4074f7bbd9aacab0c6e..ffa53a7d8d94e29a1e3b6e214a18903e98f47cda 100644
--- a/tensorflow/core/grappler/optimizers/function_api_info.h
+++ b/tensorflow/core/grappler/optimizers/function_api_info.h
@@ -80,6 +80,8 @@ class FunctionLibraryApiInfo {
       const string& function_name, std::vector<string>* other_functions) const;
 
   const FunctionApiInfo* GetApiInfo(const string& function_name) const;
+  bool empty() const { return func_info_.empty(); }
+  std::size_t size() const { return func_info_.size(); }
 
  private:
   // Map between function name to function details.
diff --git a/tensorflow/core/grappler/optimizers/function_api_info_test.cc b/tensorflow/core/grappler/optimizers/function_api_info_test.cc
index b683d26b32f04759b658e9e0704f1b6b661fe178..9bb517faa31f1e347810ed8884b6a2c16b26104b 100644
--- a/tensorflow/core/grappler/optimizers/function_api_info_test.cc
+++ b/tensorflow/core/grappler/optimizers/function_api_info_test.cc
@@ -58,9 +58,9 @@ void PopulateFunction(const string& name, const string& api_interface_name,
 
   auto* func_attr = func_def->mutable_attr();
   if (!api_interface_name.empty())
-    (*func_attr)["experimental_api_implements"].set_s(api_interface_name);
+    (*func_attr)["api_implements"].set_s(api_interface_name);
   if (!preferred_device.empty())
-    (*func_attr)["experimental_api_preferred_device"].set_s(preferred_device);
+    (*func_attr)["api_preferred_device"].set_s(preferred_device);
   if (!forward_function_name.empty())
     (*func_attr)["forward_function_name"].set_s(forward_function_name);
   if (!backward_function_name.empty())
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc
index 8beebb90496005dea556ec90de24072a6e6fd9b6..b722e6b3328342cd01a8a55d5f809bf04218fd5b 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc
@@ -15,10 +15,11 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/function_optimizer.h"
 
-#include <unordered_map>
 #include <vector>
 
 #include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "absl/memory/memory.h"
 #include "absl/strings/str_replace.h"
 #include "absl/strings/substitute.h"
@@ -26,6 +27,8 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/lower_if_while.h"
+#include "tensorflow/core/common_runtime/optimization_registry.h"
 #include "tensorflow/core/common_runtime/placer.h"
 #include "tensorflow/core/common_runtime/process_function_library_runtime.h"
 #include "tensorflow/core/framework/attr_value_util.h"
@@ -38,11 +41,13 @@ limitations under the License.
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/tensor_id.h"
+#include "tensorflow/core/grappler/graph_topology_view.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/mutable_graph_view.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/functions.h"
+#include "tensorflow/core/grappler/utils/traversal.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 
 namespace tensorflow {
@@ -65,6 +70,14 @@ constexpr char kFuncAttrName[] = "f";
 
 constexpr char kNoInlineAttr[] = "_noinline";
 
+// Name of the node that will have control edges from function input nodes, and
+// also used as a new destination for incoming control edges.
+constexpr char kInputsReadyNodeName[] = "inputs_ready";
+
+// Name of the node that will have control edges from function output nodes, and
+// also used as a new source of outgoing control edges.
+constexpr char kOutputsReadyNodeName[] = "outputs_ready";
+
 bool AttrIsTrue(const FunctionDef& func, const string& attr) {
   return func.attr().count(attr) != 0 && func.attr().at(attr).b();
 }
@@ -163,10 +176,10 @@ struct FunctionSpecializationSignature {
 
   string func_name;
   bool is_in_fetch_set;
-  gtl::FlatSet<OutputPort> active_outputs;
-  std::unordered_map<string, DataType> type_parameters;
-  std::unordered_map<string, AttrValue> body_parameters;
-  std::unordered_map<InputPort, string> const_inputs;
+  absl::flat_hash_set<OutputPort> active_outputs;
+  absl::flat_hash_map<string, DataType> type_parameters;
+  absl::flat_hash_map<string, AttrValue> body_parameters;
+  absl::flat_hash_map<InputPort, string> const_inputs;
 
   bool operator==(const FunctionSpecializationSignature& other) const {
     bool equals = func_name == other.func_name &&
@@ -189,48 +202,45 @@ struct FunctionSpecializationSignature {
     return true;
   }
 
-  // TODO(ezhulenev): Migrate to AbslHashValue.
-  // TODO(ezhulenev): Optimize performance by computing hashes of unordered
-  // values first, and then compute a hash of sorted hashes.
-  struct Hash {
-    uint64 operator()(FunctionSpecializationSignature const& s) const {
-      uint64 h = Hash64(s.func_name);
-      h = Hash64Combine(std::hash<bool>()(s.is_in_fetch_set), h);
-
-      // Use std::set/std::map for deterministic iteration order.
-
-      std::set<OutputPort> active_outputs(s.active_outputs.begin(),
-                                          s.active_outputs.end());
-      for (const auto& active_output : active_outputs) {
-        h = Hash64Combine(std::hash<int>()(active_output), h);
-      }
-
-      std::map<string, DataType> types(s.type_parameters.begin(),
-                                       s.type_parameters.end());
-      for (const auto& pair : types) {
-        AttrValue attr_value;
-        attr_value.set_type(pair.second);
-        h = Hash64Combine(Hash64(pair.first), h);
-        h = Hash64Combine(AttrValueHash(attr_value), h);
-      }
-
-      std::map<string, AttrValue> body(s.body_parameters.begin(),
-                                       s.body_parameters.end());
-      for (const auto& pair : body) {
-        h = Hash64Combine(Hash64(pair.first), h);
-        h = Hash64Combine(FastAttrValueHash(pair.second), h);
-      }
-
-      std::map<InputPort, string> inputs(s.const_inputs.begin(),
-                                         s.const_inputs.end());
-      for (const auto& pair : inputs) {
-        h = Hash64Combine(std::hash<int>()(pair.first), h);
-        h = Hash64Combine(Hash64(pair.second), h);
-      }
-
-      return h;
-    }
-  };
+  template <typename H>
+  friend H AbslHashValue(H h, const FunctionSpecializationSignature& s) {
+    H base = H::combine(std::move(h), s.func_name, s.is_in_fetch_set);
+
+    // First pre-compute hashes for all values in collections with
+    // non-deterministic iteration order.
+    std::vector<uint64> hashes;
+    hashes.reserve(s.active_outputs.size()         //
+                   + s.type_parameters.size() * 2  //
+                   + s.body_parameters.size() * 2  //
+                   + s.const_inputs.size() * 2);
+
+    absl::c_transform(s.active_outputs, std::back_inserter(hashes),
+                      hash<OutputPort>());
+
+    using TypeParam = std::pair<const string, DataType>;
+    absl::c_for_each(s.type_parameters, [&hashes](const TypeParam& type_param) {
+      AttrValue attr_value;
+      attr_value.set_type(type_param.second);
+      hashes.push_back(Hash64(type_param.first));
+      hashes.push_back(AttrValueHash(attr_value));
+    });
+
+    using BodyParam = std::pair<const string, AttrValue>;
+    absl::c_for_each(s.body_parameters, [&hashes](const BodyParam& body_param) {
+      hashes.push_back(Hash64(body_param.first));
+      hashes.push_back(FastAttrValueHash(body_param.second));
+    });
+
+    using ConstInput = std::pair<const InputPort, string>;
+    absl::c_for_each(s.const_inputs, [&hashes](const ConstInput& const_input) {
+      hashes.push_back(hash<InputPort>()(const_input.first));
+      hashes.push_back(Hash64(const_input.second));
+    });
+
+    // Combine all pre-computed hashes in a deterministic order.
+    absl::c_sort(hashes);
+    return H::combine_contiguous(std::move(base), hashes.data(), hashes.size());
+  }
 };
 
 struct FunctionSpecialization {
@@ -238,39 +248,39 @@ struct FunctionSpecialization {
   // True if the function caller node is in GrapplerItem fetch set.
   bool is_in_fetch_set;
   // Names of the tensors that were pushed down into the function body.
-  gtl::FlatSet<string> const_inputs;
+  absl::flat_hash_set<string> const_inputs;
   // Control dependencies of pushed down const inputs have to be attached to
   // function caller node.
-  gtl::FlatSet<string> control_deps;
+  absl::flat_hash_set<string> control_deps;
   // Output tensors (ports) that consumed by other nodes in the graph or in a
   // GrapplerItem fetch set.
-  gtl::FlatSet<int> active_outputs;
+  absl::flat_hash_set<int> active_outputs;
   // Mapping from original function output port to the output port of
   // specialized function. If function specialization changes the number of
   // function outputs it's required to update all node consumers.
   std::vector<std::pair<int, int>> output_mapping;
 };
 
+// Function optimizer context initialized once for each optimization pass, and
+// it uses the latest available graph (for the first iteration it will be the
+// GrapplerItem.graph, for next iterations it will be the output of previous
+// function optimizer pass).
 class FunctionOptimizerContext {
  public:
-  explicit FunctionOptimizerContext(RewriterConfig::Toggle opt_level,
-                                    const GrapplerItem& item)
-      : grappler_item_id_(item.id),
-        graph_version_(item.graph.versions().producer()),
+  explicit FunctionOptimizerContext(const GrapplerItem& item,
+                                    RewriterConfig::Toggle opt_level,
+                                    const GraphDef& graph)
+      : item_(&item),
         opt_level_(opt_level),
-        allowed_optimizations_(item.allowed_optimizations()),
-        function_library_(OpRegistry::Global(), item.graph.library()),
-        available_device_names_(item.devices().begin(), item.devices().end()),
-        graph_view_(&item.graph) {
-    InitializeTrulyConstNodes(item);
-    InitializeFetchNodes(item);
-  }
+        function_library_(OpRegistry::Global(), graph.library()),
+        truly_const_nodes_(InferTrulyConstNodes(item, graph)),
+        graph_view_(&graph) {}
 
-  const RewriterConfig::Toggle opt_level() const { return opt_level_; }
+  const GrapplerItem& item() const { return *item_; }
 
-  const GrapplerItem::AllowedOptimizations& allowed_optimizations() const {
-    return allowed_optimizations_;
-  }
+  const int graph_version() const { return item_->graph.versions().producer(); }
+
+  RewriterConfig::Toggle opt_level() const { return opt_level_; }
 
   const FunctionLibraryDefinition& function_library() const {
     return function_library_;
@@ -285,25 +295,22 @@ class FunctionOptimizerContext {
     return flr_;
   }
 
-  const gtl::FlatMap<SafeTensorId, SafeTensorId, SafeTensorId::Hasher>&
+  const absl::flat_hash_map<SafeTensorId, SafeTensorId, SafeTensorId::Hasher>&
   tensor_mapping() const {
     return tensor_mapping_;
   }
 
-  const gtl::FlatMap<string, std::vector<string>>& control_overrides() const {
+  const absl::flat_hash_map<string, std::vector<string>>& control_overrides()
+      const {
     return control_overrides_;
   }
 
   const GraphView& graph_view() const { return graph_view_; }
 
-  const string& grappler_item_id() const { return grappler_item_id_; }
-
-  const gtl::FlatSet<string>& fetch_tensors() const { return fetch_tensors_; }
-
   const DeviceSet* devices() const {
     // Create fake devices lazily only if we need a DeviceSet.
-    if (available_devices_.empty() && !available_device_names_.empty()) {
-      for (const string& name : available_device_names_) {
+    if (available_devices_.empty() && !item_->devices().empty()) {
+      for (const string& name : item_->devices()) {
         auto device = absl::make_unique<FakeDevice>(name);
         available_device_set_.AddDevice(device.get());
         available_devices_.push_back(std::move(device));
@@ -313,7 +320,9 @@ class FunctionOptimizerContext {
   }
 
   bool IsFetchNode(const string& node_name) const {
-    return fetch_nodes_.find(node_name) != fetch_nodes_.end();
+    return absl::c_any_of(item_->fetch, [&](const string& fetch) {
+      return ParseTensorName(fetch).node() == node_name;
+    });
   }
 
   bool IsTrulyConst(const string& name) const {
@@ -335,6 +344,11 @@ class FunctionOptimizerContext {
   }
 
   void AddTensorMapping(const SafeTensorId& from, const SafeTensorId& to) {
+    DCHECK(from.index() != Graph::kControlSlot)
+        << "Tensor mapping must be from regular tensor";
+    DCHECK(to.index() != Graph::kControlSlot)
+        << "Tensor mapping must be to regular tensor";
+
     auto inserted = tensor_mapping_.insert({from, to});
     DCHECK(inserted.second)
         << "Failed to insert duplicated tensor mapping: "
@@ -349,8 +363,7 @@ class FunctionOptimizerContext {
       if (from_idx != to_idx) {
         SafeTensorId from_tensor(func_node, from_idx);
         SafeTensorId to_tensor(func_node, to_idx);
-        auto inserted = tensor_mapping_.insert({from_tensor, to_tensor});
-        DCHECK(inserted.second);
+        AddTensorMapping(from_tensor, to_tensor);
       }
     }
   }
@@ -364,24 +377,21 @@ class FunctionOptimizerContext {
   }
 
  private:
-  void InitializeTrulyConstNodes(const GrapplerItem& item) {
-    gtl::FlatSet<string> feed_nodes;
+  static absl::flat_hash_map<string, const NodeDef*> InferTrulyConstNodes(
+      const GrapplerItem& item, const GraphDef& graph) {
+    absl::flat_hash_set<absl::string_view> feed_nodes;
     for (const auto& feed : item.feed) {
-      feed_nodes.insert(NodeName(feed.first));
+      feed_nodes.insert(feed.first);
     }
 
-    for (const NodeDef& node : item.graph.node()) {
-      if (IsConstant(node) && feed_nodes.count(node.name()) == 0) {
-        truly_const_nodes_[node.name()] = &node;
+    absl::flat_hash_map<string, const NodeDef*> const_nodes;
+    for (const NodeDef& node : graph.node()) {
+      if (IsConstant(node) && !feed_nodes.contains(node.name())) {
+        const_nodes[node.name()] = &node;
       }
     }
-  }
 
-  void InitializeFetchNodes(const GrapplerItem& item) {
-    for (const string& fetch : item.fetch) {
-      fetch_tensors_.insert(fetch);
-      fetch_nodes_.insert(NodeName(fetch));
-    }
+    return const_nodes;
   }
 
   void InitializeFunctionLibraryRuntime() {
@@ -393,16 +403,16 @@ class FunctionOptimizerContext {
       OptimizerOptions optimizer_opts;
       optimizer_opts.set_do_function_inlining(true);
       process_flr_.reset(new ProcessFunctionLibraryRuntime(
-          device_mgr_.get(), env, graph_version_, &function_library_,
-          optimizer_opts));
+          device_mgr_.get(), env, item_->graph.versions().producer(),
+          &function_library_, optimizer_opts));
       flr_ = process_flr_->GetFLR(device_mgr_->ListDevices()[0]->name());
     }
   }
 
-  const string grappler_item_id_;
-  const int graph_version_;
-  const RewriterConfig::Toggle opt_level_;
-  const GrapplerItem::AllowedOptimizations allowed_optimizations_;
+  const GrapplerItem* item_;  // must outlive this object
+  RewriterConfig::Toggle opt_level_;
+
+  // Function library constructed from current graph.
   FunctionLibraryDefinition function_library_;
 
   // These fields initialized lazily only if needed.
@@ -410,28 +420,20 @@ class FunctionOptimizerContext {
   std::unique_ptr<ProcessFunctionLibraryRuntime> process_flr_;
   FunctionLibraryRuntime* flr_ = nullptr;
 
-  // Fully defined names of the devices available to the GrapplerItem.
-  const gtl::FlatSet<string> available_device_names_;
-
   // List of available `FakedDevices` (lazily initialized, see devices()).
   mutable std::vector<std::unique_ptr<Device>> available_devices_;
 
   // DeviceSet of fake devices (`FakeDevice`) constructed from
-  // available_devices_ (lazily initialized).
+  // item_.devices() (lazily initialized).
   mutable DeviceSet available_device_set_;
 
   // Nodes that are Const and not in feed.
-  std::unordered_map<string, const NodeDef*> truly_const_nodes_;
+  absl::flat_hash_map<string, const NodeDef*> truly_const_nodes_;
   // Specialized functions.
-  std::unordered_map<FunctionSpecializationSignature,
-                     const FunctionSpecialization,
-                     FunctionSpecializationSignature::Hash>
+  absl::flat_hash_map<FunctionSpecializationSignature,
+                      const FunctionSpecialization>
       specialized_functions_;
 
-  // GrapplerItem.fetch is a vector of tensors.
-  gtl::FlatSet<string> fetch_tensors_;  // format: node_name:port
-  gtl::FlatSet<string> fetch_nodes_;    // format: node_name
-
   // After function inlining and specialization, the optimized graph might be in
   // invalid state, nodes can read from non-existing function call nodes that
   // were inlined, or they can read from output index that is no longer valid
@@ -439,7 +441,7 @@ class FunctionOptimizerContext {
   //
   // Tensor mapping that has to be applied to the graph after all functions
   // optimizations (invalidated tensor id -> optimized graph tensor id).
-  gtl::FlatMap<SafeTensorId, SafeTensorId, SafeTensorId::Hasher>
+  absl::flat_hash_map<SafeTensorId, SafeTensorId, SafeTensorId::Hasher>
       tensor_mapping_;
 
   // When we inline a function into the optimized graph, we no longer have the
@@ -448,7 +450,7 @@ class FunctionOptimizerContext {
   // to all side-effectful ops inside the function body.
   //
   // Invalidated function call node name -> Inlined side-effectful nodes
-  gtl::FlatMap<string, std::vector<string>> control_overrides_;
+  absl::flat_hash_map<string, std::vector<string>> control_overrides_;
 
   // Use graph view to find active outputs of the function caller nodes.
   GraphView graph_view_;
@@ -472,10 +474,10 @@ const FunctionDef* FindFunctionCall(const FunctionOptimizerContext& ctx,
   return ctx.function_library().Find(node.op());
 }
 
-gtl::FlatSet<int> GetActiveOutputs(const NodeDef& node,
-                                   const FunctionOptimizerContext& ctx,
-                                   int size_hint = 0) {
-  gtl::FlatSet<int> active_outputs;
+absl::flat_hash_set<int> GetActiveOutputs(const NodeDef& node,
+                                          const FunctionOptimizerContext& ctx,
+                                          int size_hint = 0) {
+  absl::flat_hash_set<int> active_outputs;
   active_outputs.reserve(static_cast<size_t>(size_hint));
 
   // 1. Output can be consumed by the other graph node.
@@ -486,9 +488,11 @@ gtl::FlatSet<int> GetActiveOutputs(const NodeDef& node,
   }
 
   // 2. Or it can be in a fetch set.
-  for (const string& fetch_tensor : ctx.fetch_tensors()) {
-    int port = NodePositionIfSameNode(fetch_tensor, node.name());
-    if (port >= 0) active_outputs.insert(port);
+  for (const string& fetch : ctx.item().fetch) {
+    TensorId fetch_tensor = ParseTensorName(fetch);
+    if (fetch_tensor.node() == node.name()) {
+      active_outputs.insert(fetch_tensor.index());
+    }
   }
 
   return active_outputs;
@@ -508,7 +512,7 @@ bool HasUnusedOutputs(const NodeDef& func_node, const FunctionDef& func,
   // number of output args is the same as number of possible function caller
   // node outputs.
   int num_outputs = func.signature().output_arg_size();
-  const gtl::FlatSet<int> active_outputs =
+  const absl::flat_hash_set<int> active_outputs =
       GetActiveOutputs(func_node, ctx, /*size_hind*/ num_outputs);
 
   return active_outputs.size() != num_outputs;
@@ -519,7 +523,7 @@ bool HasUnusedOutputs(const NodeDef& func_node, const FunctionDef& func,
 FunctionDefLibrary PruneFunctionLibrary(const FunctionLibraryDefinition& flib,
                                         const GraphDef& optimized_graph) {
   FunctionLibraryDefinition pruned_flib =
-      ReachableFunctionLibraryDefinition(flib, optimized_graph);
+      flib.ReachableDefinitions(optimized_graph);
 
   int pruned_functions = static_cast<int>(pruned_flib.num_functions()) -
                          static_cast<int>(flib.num_functions());
@@ -534,8 +538,8 @@ FunctionDefLibrary PruneFunctionLibrary(const FunctionLibraryDefinition& flib,
 Status PushDownConstInputs(const NodeDef& func_node,
                            const FunctionOptimizerContext& ctx,
                            GrapplerFunctionItem* item,
-                           gtl::FlatSet<string>* const_inputs,
-                           gtl::FlatSet<string>* control_deps) {
+                           absl::flat_hash_set<string>* const_inputs,
+                           absl::flat_hash_set<string>* control_deps) {
   // Record node control dependencies in the control_deps set.
   const auto record_control_deps = [&](const NodeDef* const_input) {
     for (int i = const_input->input_size() - 1; i >= 0; --i) {
@@ -585,7 +589,7 @@ void RemovePushedDownConstInputs(const FunctionSpecialization& specialization,
 
   // Attach control dependencies of pushed down const input to the caller node.
   if (!specialization.control_deps.empty()) {
-    gtl::FlatSet<string> existing_control_deps;
+    absl::flat_hash_set<string> existing_control_deps;
 
     for (const string& input : keep_inputs) {
       existing_control_deps.insert(AsControlDependency(NodeName(input)));
@@ -746,14 +750,12 @@ Status InitializeFunctionSpecializationSignature(
 string SpecializedFunctionName(const FunctionOptimizerContext& ctx,
                                const FunctionDef& func,
                                const NodeDef& func_node) {
-  return absl::Substitute("$0_specialized_for_$1_at_$2",
-                          func.signature().name(),
-                          absl::StrReplaceAll(func_node.name(), {{"/", "_"}}),
-                          ctx.grappler_item_id());
+  return absl::Substitute(
+      "$0_specialized_for_$1_at_$2", func.signature().name(),
+      absl::StrReplaceAll(func_node.name(), {{"/", "_"}}), ctx.item().id);
 }
 
 Status SpecializeFunction(const NodeDef& func_node, const FunctionDef& func,
-                          const int graph_def_version,
                           FunctionOptimizerContext* ctx,
                           GraphDef* optimized_graph) {
   VLOG(2) << "Specialize function call: " << SummarizeNodeDef(func_node);
@@ -792,13 +794,13 @@ Status SpecializeFunction(const NodeDef& func_node, const FunctionDef& func,
   // Make a GrapplerFunctionItem and convert it back to FunctionDef after
   // pushing all constant inputs into the function body.
   GrapplerFunctionItem item;
-  TF_RETURN_IF_ERROR(MakeGrapplerFunctionItem(func, func_instantiation_attr,
-                                              flib, graph_def_version, &item));
+  TF_RETURN_IF_ERROR(MakeGrapplerFunctionItem(
+      func, func_instantiation_attr, flib, ctx->graph_version(), &item));
 
   // Push const inputs into the function body, and keep track of their control
   // dependencies.
-  gtl::FlatSet<string> const_inputs;
-  gtl::FlatSet<string> control_deps;
+  absl::flat_hash_set<string> const_inputs;
+  absl::flat_hash_set<string> control_deps;
   TF_RETURN_IF_ERROR(PushDownConstInputs(func_node, *ctx, &item, &const_inputs,
                                          &control_deps));
 
@@ -806,8 +808,17 @@ Status SpecializeFunction(const NodeDef& func_node, const FunctionDef& func,
   // update outputs for the fetch nodes, so we just skip them.
   std::vector<std::pair<int, int>> output_mapping;
   if (!signature.is_in_fetch_set) {
-    TF_RETURN_IF_ERROR(
-        RemoveUnusedOutputs(signature.active_outputs, &item, &output_mapping));
+    int num_func_outputs = 0;
+    for (const auto& out_arg : item.outputs()) {
+      num_func_outputs += out_arg.output_nodes.size();
+    }
+
+    absl::flat_hash_set<int> remove;
+    for (int i = 0; i < num_func_outputs; ++i) {
+      if (!signature.active_outputs.count(i)) remove.insert(i);
+    }
+
+    TF_RETURN_IF_ERROR(RemoveFunctionOutputs(remove, &item, &output_mapping));
   }
 
   // TODO(ezhulenev): Push down known input shapes.
@@ -962,8 +973,10 @@ NodeDef InlinedFunctionInputsNode(const NodeDef& func_node,
 
 // Create an IdentityN node to hook the function outputs to: this ensures that
 // the function body is fully evaluated before its fanout gets scheduled.
-NodeDef InlinedFunctionOutputsNode(const NodeDef& func_node,
-                                   const GrapplerFunctionItem& item) {
+NodeDef InlinedFunctionOutputsNode(
+    const NodeDef& func_node, const GrapplerFunctionItem& item,
+    const absl::flat_hash_map<absl::string_view, absl::string_view>
+        output_tensors) {
   NodeDef outputs;
   outputs.set_name(func_node.name());
   outputs.set_op("IdentityN");
@@ -972,7 +985,8 @@ NodeDef InlinedFunctionOutputsNode(const NodeDef& func_node,
       (*outputs.mutable_attr())["T"].mutable_list();
 
   for (const OutputArgExpansion& output_arg : item.outputs()) {
-    for (const string& output_tensor : output_arg.output_tensors) {
+    for (const string& output_node : output_arg.output_nodes) {
+      const absl::string_view output_tensor = output_tensors.at(output_node);
       type_list->add_type(output_arg.data_type);
       outputs.add_input(strings::StrCat(func_node.name(), "/", output_tensor));
     }
@@ -983,7 +997,6 @@ NodeDef InlinedFunctionOutputsNode(const NodeDef& func_node,
 
 Status InlineDirectFunctionCall(const NodeDef& func_node,
                                 const FunctionDef& func,
-                                const int graph_def_version,
                                 const FunctionOptimizerContext& ctx,
                                 GraphDef* optimized_graph) {
   VLOG(2) << "Inline direct function call: " << SummarizeNodeDef(func_node);
@@ -995,7 +1008,7 @@ Status InlineDirectFunctionCall(const NodeDef& func_node,
   GrapplerFunctionItem item;
   Status item_status = MakeGrapplerFunctionItem(func, func_instantiation_attr,
                                                 ctx.function_library(),
-                                                graph_def_version, &item);
+                                                ctx.graph_version(), &item);
 
   if (!item_status.ok()) {
     return errors::InvalidArgument("Failed to inline function ", func_node.op(),
@@ -1004,29 +1017,51 @@ Status InlineDirectFunctionCall(const NodeDef& func_node,
   }
 
   // Mapping from input placeholder name to function input position.
-  int idx = 0;
-  std::unordered_map<string, int> input_placeholders_idx;
+  absl::flat_hash_map<absl::string_view, int> input_placeholders_idx;
   for (const InputArgExpansion& input_arg : item.inputs()) {
     for (const string& placeholder : input_arg.placeholders) {
-      input_placeholders_idx[placeholder] = idx++;
+      const int idx = input_placeholders_idx.size();
+      input_placeholders_idx[placeholder] = idx;
+    }
+  }
+
+  // Bypass identity nodes added to the graph in place of function outputs.
+  absl::flat_hash_set<absl::string_view> output_nodes;
+  for (const OutputArgExpansion& output_arg : item.outputs()) {
+    for (const string& output_node : output_arg.output_nodes) {
+      output_nodes.insert(output_node);
     }
   }
 
+  // For each function output value we added an identity node that reads the
+  // tensor from one of the function body nodes. When we inline function into
+  // the main graph we want to bypass these nodes, so we keep a mapping from
+  // 'output node name' -> 'output tensor name'.
+  absl::flat_hash_map<absl::string_view, absl::string_view> output_tensors;
+
   // Hook inlined function inputs to IdentityN node.
   NodeDef* func_inputs = optimized_graph->add_node();
   *func_inputs = InlinedFunctionInputsNode(func_node, item);
 
   for (NodeDef& func_body_node : *item.mutable_function_body().mutable_node()) {
-    if (item.IsInputPlaceholder(func_body_node.name())) {
-      // Turn input placeholders into identity nodes.
+    const string& node_name = func_body_node.name();
+
+    // Skip output identity node, and update a mapping to the output tensor.
+    if (IsIdentity(func_body_node) && output_nodes.count(node_name)) {
+      output_tensors.emplace(node_name, func_body_node.input(0));
+      continue;
+    }
+
+    // Turn placeholders added in place of input arguments into identity nodes.
+    const auto input_placeholder_idx = input_placeholders_idx.find(node_name);
+    if (input_placeholder_idx != input_placeholders_idx.end()) {
       CHECK_EQ(0, func_body_node.input_size());
       func_body_node.set_op("Identity");
       (*func_body_node.mutable_attr())["T"] = func_body_node.attr().at("dtype");
       func_body_node.mutable_attr()->erase("dtype");
       func_body_node.mutable_attr()->erase("shape");
-      int input_idx = input_placeholders_idx[func_body_node.name()];
-      func_body_node.add_input(
-          strings::StrCat(func_inputs->name(), ":", input_idx));
+      func_body_node.add_input(strings::StrCat(func_inputs->name(), ":",
+                                               input_placeholder_idx->second));
     } else {
       // Update the input names if any.
       for (string& input : *func_body_node.mutable_input()) {
@@ -1050,41 +1085,16 @@ Status InlineDirectFunctionCall(const NodeDef& func_node,
     // Make sure the node is placed.
     func_body_node.set_device(func_node.device());
 
-    // Move the function body node to the optimized graph.
-    const auto move_node_to_optimized_graph = [&]() {
-      // Annotate the node with the function attributes.
-      for (const auto& attr : func.attr()) {
-        func_body_node.mutable_attr()->insert(attr);
-      }
-      // Move the node to the main graph.
-      optimized_graph->add_node()->Swap(&func_body_node);
-    };
-
-    // Check if a body node is itself a function call and can be inlined.
-    const FunctionDef* func_body_node_func =
-        FindFunctionCall(ctx, func_body_node);
-
-    if (func_body_node_func != nullptr) {
-      Status inlinable = IsInlinableDirectFunctionCall(
-          ctx, *func_body_node_func, func_body_node);
-      if (inlinable.ok()) {
-        TF_RETURN_IF_ERROR(
-            InlineDirectFunctionCall(func_body_node, *func_body_node_func,
-                                     graph_def_version, ctx, optimized_graph));
-      } else {
-        VLOG(2) << "Can't inline nested direct function call: "
-                << inlinable.error_message();
-        move_node_to_optimized_graph();
-      }
-
-    } else {
-      move_node_to_optimized_graph();
-    }
+    // Move the node to the main graph.
+    optimized_graph->add_node()->Swap(&func_body_node);
   }
 
+  DCHECK(output_tensors.size() == item.output_size())
+      << "Each function output must be mapped to an output tensor";
+
   // Hook inlined function outputs to IdentityN node.
   NodeDef* func_outputs = optimized_graph->add_node();
-  *func_outputs = InlinedFunctionOutputsNode(func_node, item);
+  *func_outputs = InlinedFunctionOutputsNode(func_node, item, output_tensors);
 
   return Status::OK();
 }
@@ -1134,12 +1144,35 @@ Status InlineSymbolicGradient(const NodeDef& node,
   TF_RETURN_IF_ERROR(
       ConvertGraphDefToGraph(graph_ctor_opts, graph_def, &graph));
 
-  // Recursively inline the functions until there is nothing more to inline. We
-  // should at least expand one function.
-  int counter = 0;
-  while (counter < 50 && ExpandInlineFunctions(
-                             ctx->mutable_function_library_runtime(), &graph)) {
-    ++counter;
+  FunctionLibraryRuntime* flr = ctx->mutable_function_library_runtime();
+
+  // 1. Inline symbolic gradient node.
+  const bool expanded = ExpandInlineFunctions(flr, &graph);
+  DCHECK(expanded) << "Didn't expand SymbolicGradient op";
+
+  // TODO(ezhulenev): InlineFunctionBody in common_runtime/function silently
+  // fails to inline function into the graph, and leaves the graph unmodified.
+  // We check that graph has our symbolic gradient inlined, otherwise we return
+  // a error.
+  const auto is_symbolic_gradient_op = [&](const Node* node) {
+    return node->name() == inlined->name() &&
+           node->type_string() == "SymbolicGradient";
+  };
+  for (Node* node : graph.nodes()) {
+    if (is_symbolic_gradient_op(node)) {
+      return errors::Internal("Failed to inline symbolic gradient node: ",
+                              SummarizeNode(*node));
+    }
+  }
+
+  // 2. Recursively inline nested function calls.
+  int iteration = 0;
+  while (ExpandInlineFunctions(flr, &graph)) {
+    if (++iteration >= 50) {
+      VLOG(2) << "Break symbolic gradient inlining loop at iteration #"
+              << iteration;
+      break;
+    }
   }
 
   GraphDef inlined_graph_def;
@@ -1196,12 +1229,26 @@ Status InlineSymbolicGradient(const NodeDef& node,
 // dependency tracking via input/output control edges, and we relax some of the
 // constraints that we have for direct function call inlining.
 //
-// "When a `PartitionedCallOp` function has a resource (DT_RESOURCE data type)
-// input argument it "captures" the mutable resource.  This is implemented by
-// automatically adding a incoming control edge from the previous side-effectful
-// op touching that resource, and an outgoing control edge to the next
-// side-effectful op using the same resource. This serializes the mutations of
-// the resource to make graph execution deterministic.
+// Automatic control dependency rules:
+//
+// 1) "When a `PartitionedCallOp` function has a resource (DT_RESOURCE data
+//    type) input argument it "captures" the mutable resource.  This is
+//    implemented by automatically adding a incoming control edge from the
+//    previous side-effectful op touching that resource, and an outgoing control
+//    edge to the next side-effectful op using the same resource. This
+//    serializes the mutations of the resource to make graph execution
+//    deterministic.
+//
+// 2) All stateful ops inside a function body are guaranteed to execute in
+//    program order, this is achieved by adding control edges between stateful
+//    ops at graph construction time.
+//
+// 3) Furthermore, all ops accepting the same resource as an input are
+//    guaranteed to run in program order. This is also done by adding control
+//    edges at graph construction time. The last op touching the resource
+//    will have an outgoing control edge to all function return nodes, which
+//    will guarantee that all side effects to the resource will happen before
+//    function completion.
 //
 // Function call inlining must preserve side effect visibility:
 //
@@ -1210,17 +1257,106 @@ Status InlineSymbolicGradient(const NodeDef& node,
 // 2) All side effects to the captured resources, that happened inside function
 //    body, must be visible to every op/function using that resource after the
 //    function call completed.
-
-// To guarantee that these properties are preserved after inlining we do:
 //
-// 1) Forward all input control dependencies from the function call node to the
-//    inlined function inputs (Identity nodes).
-// 2) Each side-effectful op inside function body adds itself as a control
-//    dependency to all the nodes in output control set of function call node.
+// To guarantee that these properties are preserved after inlining we:
+//
+// 1) Create "input_control" NoOp. Function call node incoming control edges
+//    will be forwarded *to* this node. Function inputs (Identity nodes) will
+//    have a control edge *from* this node. If function has no inputs, by
+//    construction it must have nodes without inputs in the function body, and
+//    in this case these nodes will have a control edge *from* this node.
+
+// 2) Create "output_control" NoOp. All nodes that have incoming control edge
+//    *from* the function call node, will be forwarded to this node. Function
+//    outputs (Identity nodes) will have a control edge *to* this node. This
+//    will guarantee that nodes that have control dependency on the function
+//    call, will observe all side-effects (guaranteed by graph construction with
+//    automatic control dependencies tracking).
 //
-// We do not add any other control dependencies to/from function body nodes,
-// because they are pure functions of input tensors, and can be freely
-// reordered.
+// If after function instantiation we find a stateful or a dataset op inside
+// the function body, that is not reachable from any of the function outputs (or
+// if the function has no outputs), we do not inline it, because we can't
+// guarantee that these nodes will be executed in correct order (or executed at
+// all) after inlining.
+//
+// We do not try to add any extra control edges to make sure that all
+// side-effectful nodes will be executed, that should be handled at graph
+// construction time.
+
+struct MaybeDeadOutput {
+  const NodeDef* dead_tensor_src;
+  const NodeDef* output_node_dst;
+};
+
+// Finds all function outputs that might return a dead tensor. This can happen
+// if there is no `Merge` node on the path from the `Switch` node, to the
+// function output.
+Status MaybeDeadOutputs(const FunctionOptimizerContext& ctx,
+                        const GrapplerFunctionItem& item,
+                        std::vector<MaybeDeadOutput>* maybe_dead) {
+  VLOG(3) << "Find function outputs that might return dead tensors: item.id="
+          << item.id;
+  DCHECK(maybe_dead->empty()) << "Input argument must be an empty vector";
+
+  std::vector<const NodeDef*> dead_tensor_srcs;
+  for (const NodeDef& node : item.graph.node()) {
+    if (IsSwitch(node)) {
+      VLOG(4) << "Add dead tensors source. Switch node: " << node.name();
+      dead_tensor_srcs.push_back(&node);
+      continue;
+    }
+
+    // Regular (aka 'direct') function call can also produce dead tensors if
+    // the function body has mergeless switches.
+    const FunctionDef* func = ctx.function_library().Find(node.op());
+    if (func != nullptr) {
+      GrapplerFunctionItem func_item;
+      TF_RETURN_IF_ERROR(MakeGrapplerFunctionItem(
+          *func, FunctionInstantiationAttributes(*func, node),
+          ctx.function_library(), ctx.graph_version(), &func_item));
+
+      std::vector<MaybeDeadOutput> func_dead_outputs;
+      TF_RETURN_IF_ERROR(MaybeDeadOutputs(ctx, func_item, &func_dead_outputs));
+
+      if (!func_dead_outputs.empty()) {
+        VLOG(4) << "Add dead tensors source. Function call: " << node.op()
+                << " node=" << node.name();
+        dead_tensor_srcs.push_back(&node);
+      }
+    }
+  }
+
+  // If we do not have dead tensor sources in the function body, it's
+  // guaranteed that all output tensors can't become dead.
+  if (dead_tensor_srcs.empty()) return Status::OK();
+
+  // Names of the function body nodes that return function output values.
+  absl::flat_hash_set<absl::string_view> output_nodes;
+  for (const auto& output_expansion : item.outputs()) {
+    for (const auto& output_node : output_expansion.output_nodes) {
+      output_nodes.insert(output_node);
+    }
+  }
+
+  GraphTopologyView topology_view;
+  TF_RETURN_IF_ERROR(topology_view.InitializeFromGraph(item.graph));
+
+  for (const NodeDef* dead_tensor_src : dead_tensor_srcs) {
+    DfsTraversal(topology_view, {dead_tensor_src},
+                 TraversalDirection::kFollowOutputs,
+                 // Stop traversal when reached first `Merge` node.
+                 DfsPredicates::Advance(
+                     [](const NodeDef* node) { return !IsMerge(*node); }),
+                 // If we reached output node, add MaybeDeadOutput edge.
+                 DfsCallbacks::PreOrder([&](const NodeDef* node) {
+                   if (output_nodes.find(node->name()) != output_nodes.end()) {
+                     maybe_dead->push_back({dead_tensor_src, node});
+                   }
+                 }));
+  }
+
+  return Status::OK();
+}
 
 // Returns `Status::OK()` iff `node` is an indirect function call of `func`, and
 // we know how to inline it into the main graph, otherwise returns and error
@@ -1256,29 +1392,163 @@ Status IsInlinableIndirectFunctionCall(const FunctionOptimizerContext& ctx,
         SummarizeNodeDef(func_node));
   }
 
-  // We can't inline functions with `Switch` nodes in the function body, because
-  // they might have dead tensors as a function output argument (we need all
-  // intermediate tensors to compute the function gradient). `PartitionedCallOp`
-  // invokes functions with `allow_dead_tensors = true` to reset dead flag,
-  // and return default initialized tensors instead of a dead tensors.
-  // TODO(ezhulenev): Do the liveness analysis and add
-  // `IdentitytWithResurrection` nodes after all potentially dead output
-  // tensors?
-  if (absl::c_any_of(func.node_def(), IsSwitch)) {
-    return errors::FailedPrecondition(
-        "Can't inline function with `Switch` nodes in the function body: ",
-        SummarizeNodeDef(func_node));
+  return Status::OK();
+}
+
+// Checks that all side-effects will be executed in well defined order. We do it
+// by checking if there is a path from stateful/dataset ops to one of the output
+// nodes.
+Status CheckThatSideEffectsWillExecute(
+    const FunctionOptimizerContext& ctx,
+    const GraphTopologyView& graph_topo_view,
+    const absl::flat_hash_set<string> output_nodes) {
+  // In aggressive mode we just print a warning for side-effectful nodes that
+  // might not be executed after inlining.
+  const bool aggressive = ctx.opt_level() == RewriterConfig::AGGRESSIVE;
+
+  for (const NodeDef& func_body_node : graph_topo_view.graph()->node()) {
+    const bool node_must_execute =
+        IsDataset(func_body_node) ||
+        IsStateful(func_body_node, &ctx.function_library());
+
+    // If op has DT_RESOURCE argument it will be marked as stateful, though if
+    // it only reads from that resource, it's allowed to prune it, because it
+    // can't produce any visible side-effects.
+    const bool read_only = IsReadVariableOp(func_body_node);
+
+    if (read_only || !node_must_execute) continue;
+
+    VLOG(3) << "Check that node " << func_body_node.name()
+            << " will execute after inlining.";
+    bool will_execute = false;
+
+    // Check if we reached one of the output nodes.
+    const auto callbacks = DfsCallbacks::PreOrder([&](const NodeDef* node) {
+      if (output_nodes.contains(node->name())) {
+        VLOG(4) << "Found a path to output node: " << node->name();
+        will_execute = true;
+      }
+    });
+
+    // Stop if we already proved that node will execute.
+    const auto predicates = DfsPredicates::Enter(
+        [&](const NodeDef* node) { return !will_execute; });
+
+    DfsTraversal(graph_topo_view, {&func_body_node},
+                 TraversalDirection::kFollowOutputs, predicates, callbacks);
+
+    if (!will_execute) {
+      const string error_message = absl::StrCat(
+          "Can't guarantee execution of a side-effectful node, that is not "
+          "reachable from function outputs. Function body node: ",
+          SummarizeNodeDef(func_body_node));
+
+      if (aggressive) {
+        LOG(WARNING) << error_message;
+      } else {
+        return errors::Internal(error_message);
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
+Status PlaceInlinedFunctionBody(
+    const NodeDef& func_node, const GrapplerFunctionItem& item,
+    const absl::flat_hash_map<absl::string_view, int>& input_placeholders_idx,
+    FunctionOptimizerContext* ctx, GraphDef* placed_graph_def) {
+  // Control flow lowering and Placer works with a Graph object.
+  std::unique_ptr<Graph> func_body_graph =
+      absl::make_unique<Graph>(ctx->function_library());
+
+  GraphConstructorOptions opts;
+  TF_RETURN_IF_ERROR(
+      ConvertGraphDefToGraph(opts, item.graph, func_body_graph.get()));
+
+  // ------------------------------------------------------------------------ //
+  // Grappler receives the graph after PRE_PLACEMENT, Placer, and POST_PLACEMENT
+  // passes, so each node has a valid device assignment. Also V2 control
+  // flow ops (functional If and While) should have been lowered to V1 control
+  // flow (Switch and Merge nodes). To keep the graph valid for execution we
+  // must assign device to every inlined graph node, and also lower the control
+  // flow.
+
+  GraphOptimizationPassOptions opt_options;
+  opt_options.graph = &func_body_graph;
+  opt_options.flib_def = ctx->mutable_function_library();
+
+  // TODO(ezhulenev): Should we run full PRE_PLACEMENT pass here? And
+  // POST_PLACEMENT after placer?
+  LowerIfWhilePass pass;
+  TF_RETURN_IF_ERROR(pass.Run(opt_options));
+
+  // ------------------------------------------------------------------------ //
+  // Before placing the function body nodes we pin input placeholders to the
+  // same device as their corresponding input nodes.
+
+  for (Node* func_body_node : func_body_graph->nodes()) {
+    const auto input_placeholder_idx =
+        input_placeholders_idx.find(func_body_node->name());
+
+    if (input_placeholder_idx != input_placeholders_idx.end()) {
+      const int input_idx = input_placeholder_idx->second;
+      const GraphView::OutputPort output_port =
+          ctx->graph_view().GetRegularFanin({&func_node, input_idx});
+
+      VLOG(3) << "Pin inlined function input node '" << func_body_node->name()
+              << "' to the '" << output_port.node->device() << "' device.";
+      func_body_node->set_requested_device(output_port.node->device());
+    }
   }
 
+  // ------------------------------------------------------------------------ //
+  // After placing nodes corresponding to the function inputs, we need to assign
+  // device placements to all other function body nodes.
+
+  const DeviceSet* devices = ctx->devices();
+
+  if (devices->devices().empty()) {
+    // If there are no devices available for placer, we just put all nodes to
+    // the same device as a function caller node. This can happen if Grappler is
+    // running "offline", without active runtime session, for example as a part
+    // of a batch job for graph analysis/optimization.
+    VLOG(3) << "Assign function call node device to all function body nodes. "
+            << "Device: " << func_node.device();
+    for (Node* func_body_node : func_body_graph->nodes()) {
+      func_body_node->set_requested_device(func_node.device());
+    }
+  } else {
+    // If we are running in an active runtime session, Grappler will get the
+    // graph after initial placing is done, and we should have devices for the
+    // placer.
+    VLOG(3) << "Run placer for instantiated function body. Devices: ["
+            << absl::StrJoin(
+                   devices->devices(), ", ",
+                   [](string* out, const Device* d) { out->append(d->name()); })
+            << "]";
+
+    // Use function caller node device as a default for placer.
+    const Device* default_device =
+        devices->FindDeviceByName(func_node.device());
+
+    Placer placer(func_body_graph.get(), devices,
+                  nullptr /* No session options */, default_device);
+    TF_RETURN_IF_ERROR(placer.Run());
+  }
+
+  // Convert Graph back to the placed GraphDef.
+  func_body_graph->ToGraphDef(placed_graph_def);
+
   return Status::OK();
 }
 
 Status InlineIndirectFunctionCall(const NodeDef& func_node,
                                   const FunctionDef& func,
-                                  const int graph_def_version,
                                   FunctionOptimizerContext* ctx,
                                   GraphDef* optimized_graph) {
   VLOG(2) << "Inline indirect function call: " << SummarizeNodeDef(func_node);
+  VLOG(4) << "Inlined function definition: " << DebugString(func);
   TF_RETURN_IF_ERROR(IsInlinableIndirectFunctionCall(*ctx, func, func_node));
 
   const AttrSlice func_instantiation_attr =
@@ -1287,7 +1557,7 @@ Status InlineIndirectFunctionCall(const NodeDef& func_node,
   GrapplerFunctionItem item;
   Status item_status = MakeGrapplerFunctionItem(func, func_instantiation_attr,
                                                 ctx->function_library(),
-                                                graph_def_version, &item);
+                                                ctx->graph_version(), &item);
 
   if (!item_status.ok()) {
     return errors::InvalidArgument("Failed to inline function ", func_node.op(),
@@ -1295,6 +1565,26 @@ Status InlineIndirectFunctionCall(const NodeDef& func_node,
                                    ". Error: ", item_status.error_message());
   }
 
+  // `PartitionedCallOp` invokes functions with `allow_dead_tensors = true` to
+  // reset dead flag, and return default initialized tensors instead of a dead
+  // tensors. There is no way to express this in a regular Tensorflow graph, so
+  // we choose not to inline if a function can have dead tensors as an output
+  // position. In practice `mergeless switches` should not exists in a function
+  // body, because tf-eager will only use v2 control flow ops.
+  std::vector<MaybeDeadOutput> maybe_dead_outputs;
+  TF_RETURN_IF_ERROR(MaybeDeadOutputs(*ctx, item, &maybe_dead_outputs));
+  if (!maybe_dead_outputs.empty()) {
+    struct MaybeDeadOutputFormatter {
+      void operator()(string* out, const MaybeDeadOutput& md) const {
+        absl::StrAppend(out, SummarizeNodeDef(*md.dead_tensor_src));
+      }
+    };
+    return errors::FailedPrecondition(
+        "Can't inline function with dead outputs. Dead tensor sources (size = ",
+        maybe_dead_outputs.size(), "): ",
+        absl::StrJoin(maybe_dead_outputs, "\n", MaybeDeadOutputFormatter()));
+  }
+
   GraphView::InputPort control_input_port =
       ctx->graph_view().GetInputPort(func_node.name(), Graph::kControlSlot);
   GraphView::OutputPort control_output_port =
@@ -1328,111 +1618,155 @@ Status InlineIndirectFunctionCall(const NodeDef& func_node,
     inputs.push_back(tensor_id);
   }
 
-  // If we have a node inside the function body without inputs (e.g. Const), we
-  // must attach a control dependency to it, to make sure that if a function
-  // call happens inside a loop, the node will be evaluated in correct frame.
-  //
-  // If the function call node has no inputs and no control dependencies, it
-  // means that it can't be a function call inside a loop, and we can safely
-  // insert that node without inputs into the main graph.
-  //
-  // TODO(ezhulenev): Use FrameMap (see grappler/utils/frame.h) to find out if
-  // the function is called inside a loop.
-  std::vector<string> empty_inputs_hook;
-  if (!item.inputs().empty()) {
-    const InputArgExpansion& arg0 = item.inputs()[0];
-    DCHECK(!arg0.placeholders.empty());
-    empty_inputs_hook.push_back(AsControlDependency(AddPrefixToNodeName(
-        arg0.placeholders[0], /*prefix=*/func_node.name())));
-  } else if (!happens_before.empty()) {
-    empty_inputs_hook.push_back(AsControlDependency(happens_before[0]));
-  }
-
   // Mapping from input placeholder name to function input position.
-  int idx = 0;
   absl::flat_hash_map<absl::string_view, int> input_placeholders_idx;
   for (const InputArgExpansion& input_arg : item.inputs()) {
     for (const string& placeholder : input_arg.placeholders) {
-      input_placeholders_idx[placeholder] = idx++;
+      const int idx = input_placeholders_idx.size();
+      input_placeholders_idx[placeholder] = idx;
     }
   }
 
   const string prefix = strings::StrCat(func_node.name(), "/");
 
   // ------------------------------------------------------------------------ //
-  // First we need to assign device placements to all function body nodes.
+  // For each function output value we added an identity node that reads the
+  // tensor from one of the function body nodes. When we inline function into
+  // the main graph we want to bypass these nodes, so we keep a mapping from
+  // 'output node name' -> 'output tensor name'.
+  absl::flat_hash_map<string, string> output_tensors;
 
-  GraphDef placed_graph_def;
-
-  const DeviceSet* devices = ctx->devices();
+  // Unique names of nodes producing tensors in `output_tensors`.
+  absl::flat_hash_set<string> output_tensors_nodes;
 
-  if (devices->devices().empty()) {
-    // If there are no devices available for placer, we just put all nodes to
-    // the same device as a function caller node. This can happen if Grappler is
-    // running "offline", without active runtime session, for example as a part
-    // of a batch job for graph analysis/optimization.
-    VLOG(3) << "Assign function call node device to all function body nodes. "
-            << "Device: " << func_node.device();
-    placed_graph_def = item.mutable_function_body();
-    for (NodeDef& node : *placed_graph_def.mutable_node()) {
-      node.set_device(func_node.device());
+  // Identity nodes added to the function body in place of function outputs.
+  absl::flat_hash_set<string> output_nodes;
+  for (const OutputArgExpansion& output_arg : item.outputs()) {
+    for (const string& output_node : output_arg.output_nodes) {
+      output_nodes.insert(output_node);
     }
-  } else {
-    // If we are running in an active runtime session, Grappler will get the
-    // graph after initial placing is done, and we should have devices for the
-    // placer.
-    VLOG(3) << "Run placer for instantiated function body. Devices: ["
-            << absl::StrJoin(
-                   devices->devices(), ", ",
-                   [](string* out, const Device* d) { out->append(d->name()); })
-            << "]";
+  }
 
-    // Construct a Graph object from the instantiated function body.
-    GraphConstructorOptions opts;
-    Graph graph(ctx->function_library());
-    TF_RETURN_IF_ERROR(
-        ConvertGraphDefToGraph(opts, item.function_body(), &graph));
+  for (const NodeDef& func_body_node : item.graph.node()) {
+    const string& node_name = func_body_node.name();
 
-    // Use function caller node device as a default for placer.
-    const Device* default_device =
-        devices->FindDeviceByName(func_node.device());
+    if (IsIdentity(func_body_node) && output_nodes.count(node_name)) {
+      const string& output_tensor = func_body_node.input(0);
+      output_tensors.emplace(node_name, output_tensor);
 
-    Placer placer(&graph, devices, nullptr, /* No session options */
-                  default_device);
-    TF_RETURN_IF_ERROR(placer.Run());
+      SafeTensorId tensor_id = ParseTensorName(output_tensor);
+      output_tensors_nodes.insert(tensor_id.node());
+    }
+  }
+
+  // ------------------------------------------------------------------------ //
+  // IMPORTANT: Actual inputs will be added to the following nodes at the very
+  // last stage, because we don't want to have invalid edges in a function body
+  // graph (control edges that depend on the nodes in the "outer" optimized
+  // graph).
+
+  // If one of the function inputs is a dead tensor, we must not execute any of
+  // the function body nodes, and let the dead tensor flag propagate through the
+  // inlined function body. We add NoOp inputs_ready node, and add control edges
+  // to it from all input nodes. Inlined function arguments (Identity nodes)
+  // will have a control dependency on it.
+  //
+  // If the function call node has incoming control edges, we will update them
+  // to use this node as destination, to ensure side-effects execution order.
+  NodeDef* inputs_ready_node = nullptr;
+  if (func_node.input_size() > 0) {
+    inputs_ready_node = item.graph.add_node();
+    inputs_ready_node->set_op("NoOp");
+    inputs_ready_node->set_name(kInputsReadyNodeName);
+  }
+
+  // All nodes that have control edge from the function call node, will be
+  // updated to have a control edge from 'outputs_ready_node`. This node will
+  // have control edges from all function outputs. This a "barrier" that
+  // guarantees that all function side effects were executed, and it will also
+  // allow to propagate deadness flag (if there is a deadness mismatch between
+  // output nodes).
+  NodeDef* outputs_ready_node = nullptr;
+  if (item.output_size() > 0 || !happens_after.empty()) {
+    outputs_ready_node = item.graph.add_node();
+    outputs_ready_node->set_op("NoOp");
+    outputs_ready_node->set_name(kOutputsReadyNodeName);
+  }
 
-    // Convert Graph back to the GraphDef.
-    graph.ToGraphDef(&placed_graph_def);
+  // ------------------------------------------------------------------------ //
+  // If we have a node inside the function body without inputs (e.g. Const), we
+  // must attach a control dependency to it, to make sure that if a function
+  // call happens inside a loop, the node will be evaluated in correct frame.
+  //
+  // If the function call node has no inputs and no control dependencies, it
+  // means that it can't be a function call inside a loop, and we can safely
+  // insert that node without inputs into the main graph.
+  //
+  // TODO(ezhulenev): Use FrameMap (see grappler/utils/frame.h) to find out if
+  // the function is called inside a loop.
+  std::vector<string> empty_inputs_hook;
+  if (inputs_ready_node != nullptr) {
+    empty_inputs_hook.push_back(inputs_ready_node->name());
   }
 
+  // ------------------------------------------------------------------------ //
+  // Grappler called after PRE_PLACEMENT and PLACEMENT passes, so we have to
+  // make sure that after inlining all nodes will have valid device assignment.
+
+  GraphDef placed_graph_def;
+  TF_RETURN_IF_ERROR(PlaceInlinedFunctionBody(
+      func_node, item, input_placeholders_idx, ctx, &placed_graph_def));
+
   // ------------------------------------------------------------------------ //
   // After all nodes placed we need to prepare them for inlining into the
   // optimized graph: turn placeholders into identities, update nodes
   // connectivity, etc...
 
+  const auto inlined_node_name = [&func_node](const string& name) -> string {
+    return AddPrefixToNodeName(name, /*prefix=*/func_node.name());
+  };
+
   for (NodeDef& func_body_node : *placed_graph_def.mutable_node()) {
-    if (item.IsInputPlaceholder(func_body_node.name())) {
-      // Turn input placeholders into identity node.
+    const string& node_name = func_body_node.name();
+
+    // Turn placeholders added in place of input arguments into identity nodes.
+    const auto input_placeholder_idx = input_placeholders_idx.find(node_name);
+    if (input_placeholder_idx != input_placeholders_idx.end()) {
       DCHECK_EQ(0, func_body_node.input_size());
       func_body_node.set_op("Identity");
       (*func_body_node.mutable_attr())["T"] = func_body_node.attr().at("dtype");
       func_body_node.mutable_attr()->erase("dtype");
       func_body_node.mutable_attr()->erase("shape");
-      int input_idx = input_placeholders_idx[func_body_node.name()];
-      func_body_node.add_input(strings::StrCat(inputs[input_idx].ToString()));
-
-      // All side effects must happen before inputs can start executing.
-      for (const string& hb_node : happens_before) {
-        func_body_node.add_input(AsControlDependency(hb_node));
+      const int input_idx = input_placeholder_idx->second;
+      func_body_node.add_input(inputs[input_idx].ToString());
+
+      // Add a control dependency on 'inputs_ready' node, to guarantee that all
+      // inputs are alive and all side-effects executed before function body.
+      if (inputs_ready_node) {
+        func_body_node.add_input(
+            AsControlDependency(inlined_node_name(inputs_ready_node->name())));
       }
-
     } else {
       // Update inputs of the regular function body nodes.
       for (string& input : *func_body_node.mutable_input()) {
-        input = AddPrefixToNodeName(input, /*prefix=*/func_node.name());
+        input = inlined_node_name(input);
       }
-      if (func_body_node.input_size() == 0 && !empty_inputs_hook.empty()) {
-        *func_body_node.add_input() = empty_inputs_hook[0];
+
+      // Check if we need to ensure node execution in correct loop frame.
+      bool node_needs_empty_inputs_hook =
+          // We have a node to hook and node has no inputs.
+          !empty_inputs_hook.empty() && func_body_node.input_size() == 0 &&
+          // Inputs ready node will always have edge from main graph. If
+          // function call has no regular and control inputs, we will not add
+          // inputs_ready node to the function body graph.
+          node_name != kInputsReadyNodeName &&
+          // Outputs ready node might not have any inputs (in case function has
+          // no outputs), so we must make sure it's executed in correct frame.
+          (node_name != kOutputsReadyNodeName || item.output_size() == 0);
+
+      if (node_needs_empty_inputs_hook) {
+        *func_body_node.add_input() =
+            AsControlDependency(inlined_node_name(empty_inputs_hook[0]));
       }
     }
 
@@ -1450,96 +1784,201 @@ Status InlineIndirectFunctionCall(const NodeDef& func_node,
     AddDefaultsToNodeDef(*op_def, &func_body_node);
   }
 
-  // Construct a graph view for the preprocessed function body graph.
-  GraphView placed_graph_view(&placed_graph_def);
+  // ------------------------------------------------------------------------ //
+  // Check that after inlining all side-effects will be executed in well defined
+  // order. We do it by checking if there is a path from stateful/dataset ops to
+  // one of the output nodes.
+
+  // Because we rename all the nodes before inlining, we need a copy of
+  // output_nodes with a new names.
+  absl::flat_hash_set<string> inlined_output_nodes;
+  for (const string& output_node : output_nodes) {
+    inlined_output_nodes.insert(inlined_node_name(output_node));
+  }
+  const auto is_inlined_output_node = [&](const NodeDef& node) -> bool {
+    return inlined_output_nodes.find(node.name()) != inlined_output_nodes.end();
+  };
 
-  // Keep track of side-effectful ops inside function body. Each outgoing
-  // control edge from the function call node, must be replaced with control
-  // edges from inlined side-effectful ops.
-  std::vector<string> side_effectful_nodes;
+  // Construct a graph topology view for DFS traversals (skip invalid edges for
+  // input nodes connected to nodes in the optimized graph).
+  GraphTopologyView placed_topo_view(/*skip_invalid_edges=*/true);
+  TF_RETURN_IF_ERROR(placed_topo_view.InitializeFromGraph(placed_graph_def));
+  TF_RETURN_IF_ERROR(CheckThatSideEffectsWillExecute(*ctx, placed_topo_view,
+                                                     inlined_output_nodes));
 
-  // We have to make sure that all side-effectful nodes inside a function body
-  // will be executed after function inlining.
-  for (NodeDef& func_body_node : *placed_graph_def.mutable_node()) {
-    if (!IsFreeOfSideEffect(func_body_node, &ctx->function_library())) {
-      int num_fanouts = placed_graph_view.NumFanouts(
-          func_body_node, /*include_controlling_nodes=*/true);
-
-      // If the node doesn't have any outgoing edges and we do not have any
-      // nodes in the `happens_after` set, we can't inline a function and
-      // guarantee that side-effects will be executed. The only exception if we
-      // do function library optimization, and the GrapplerItem was constructed
-      // for the function body, because functions have strict semantics.
-
-      if (num_fanouts == 0 && happens_after.empty() &&
-          !ctx->allowed_optimizations().inline_ops_with_side_effects) {
-        return errors::Internal(
-            "Can't inline a function with a side-effectful op with empty "
-            "fanouts and empty output control edge set. Function body node: ",
-            SummarizeNodeDef(func_body_node));
+  // ------------------------------------------------------------------------ //
+  // Move all the nodes to the optimized graph after successful preprocessing.
+
+  if (inputs_ready_node != nullptr) {
+    string inlined_node = inlined_node_name(inputs_ready_node->name());
+    absl::optional<int> node_idx = placed_topo_view.GetNodeIndex(inlined_node);
+
+    absl::flat_hash_set<string> input_nodes;
+    for (const string& input : func_node.input()) {
+      const SafeTensorId tensor = ParseTensorName(input);
+      if (input_nodes.insert(tensor.node()).second) {
+        placed_graph_def.mutable_node(*node_idx)->add_input(
+            AsControlDependency(tensor.node()));
       }
+    }
+  }
+
+  if (outputs_ready_node != nullptr) {
+    string inlined_node = inlined_node_name(outputs_ready_node->name());
+    absl::optional<int> node_idx = placed_topo_view.GetNodeIndex(inlined_node);
 
-      side_effectful_nodes.push_back(func_body_node.name());
+    // Add control edges from all nodes producing output tensors.
+    for (const string& node_name : output_tensors_nodes) {
+      placed_graph_def.mutable_node(*node_idx)->add_input(
+          AsControlDependency(inlined_node_name(node_name)));
     }
+
+    // Forward all control dependencies in the optimized graph to the new node.
+    ctx->AddControlOverrides(func_node, {inlined_node});
   }
 
-  // Move all the nodes to the optimized graph after successful preprocessing.
   for (NodeDef& func_body_node : *placed_graph_def.mutable_node()) {
+    // Skip output identity nodes.
+    if (IsIdentity(func_body_node) && is_inlined_output_node(func_body_node))
+      continue;
+
     optimized_graph->add_node()->Swap(&func_body_node);
   }
 
-  // TODO(ezhulenev): Inline nested indirect function calls.
-
   // Indirect function call is fully inlined into the optimized graph, and we do
   // not copy the original function call node, so we have to setup tensor
   // mapping from old output tensors, to the outputs of inlined nodes.
   int output_idx = 0;
   for (const OutputArgExpansion& output : item.outputs()) {
-    for (const string& output_tensor : output.output_tensors) {
+    for (const string& output_node : output.output_nodes) {
+      const string& output_tensor = output_tensors.at(output_node);
+
       const SafeTensorId from_tensor(func_node.name(), output_idx++);
-      const SafeTensorId to_tensor = ParseTensorName(
-          AddPrefixToNodeName(output_tensor, /*prefix=*/func_node.name()));
-      ctx->AddTensorMapping(from_tensor, to_tensor);
+      const SafeTensorId to_tensor = ParseTensorName(output_tensor);
+
+      const SafeTensorId inlined_to_tensor =
+          SafeTensorId(absl::StrCat(func_node.name(), "/", to_tensor.node()),
+                       to_tensor.index());
+
+      ctx->AddTensorMapping(from_tensor, inlined_to_tensor);
     }
   }
 
-  // After inlining we'll have to forward all control dependencies from function
-  // call node to all side-effectful ops inside function body.
-  ctx->AddControlOverrides(func_node, side_effectful_nodes);
-
   VLOG(3) << "Successfully inlined indirect function call: "
           << SummarizeNodeDef(func_node);
+
   return Status::OK();
 }
 
-}  // namespace
+// Restores graph invariants after function specialization and inlining: all
+// inputs must be connected to valid nodes.
+Status RestoreGraphInvariants(const FunctionOptimizerContext& ctx,
+                              GraphDef* optimized_graph) {
+  // After function specialization and inlining graph might be in invalid
+  // state, and some nodes can read tensors that do not exists anymore in the
+  // optimized graph: function call node was fully inlined into the graph, or
+  // output index was invalidated by the output pruning.
 
-Status FunctionOptimizer::Optimize(Cluster*, const GrapplerItem& item,
-                                   GraphDef* optimized_graph) {
-  // Nothing to do here.
-  if (item.graph.library().function_size() == 0) {
-    *optimized_graph = item.graph;
-    return Status::OK();
+  if (!ctx.tensor_mapping().empty()) {
+    for (NodeDef& node : *optimized_graph->mutable_node()) {
+      for (int idx = 0; idx < node.input_size(); ++idx) {
+        TensorId input_tensor = ParseTensorName(node.input(idx));
+        if (input_tensor.index() == Graph::kControlSlot) break;
+
+        auto mapping = ctx.tensor_mapping().find(input_tensor);
+        if (mapping != ctx.tensor_mapping().end()) {
+          node.set_input(idx, mapping->second.ToString());
+        }
+      }
+    }
   }
 
-  FunctionOptimizerContext ctx(opt_level_, item);
+  // Function inlining instantiates function body directly into the optimized
+  // graph, and we might end up with control dependencies to the nodes that no
+  // longer exist in a graph. We need to apply control overrides to all
+  // invalidated nodes, and rewire control dependencies to the control outputs
+  // node (it's also possible to rewrite singe control edge into multiple edges
+  // to inlined side-effectful nodes).
+
+  if (!ctx.control_overrides().empty()) {
+    for (NodeDef& node : *optimized_graph->mutable_node()) {
+      // Keep track of new control inputs to the node.
+      absl::flat_hash_set<string> add_ctrl_inputs;
+
+      // Remove all invalidated control inputs.
+      for (int idx = 0; idx < node.input_size(); /* see below */) {
+        // TODO(ezhulenev): Use non-allocating TensorId after migrating
+        // `control_overrides()` to absl::flat_hash_set.
+        SafeTensorId input_tensor = ParseTensorName(node.input(idx));
+
+        auto overrides = ctx.control_overrides().find(input_tensor.node());
+        if (overrides != ctx.control_overrides().end()) {
+          // If this happens it's a bug in the function inlining.
+          if (input_tensor.index() != Graph::kControlSlot) {
+            return errors::Internal(
+                "Illegal input edge from inlined function call node");
+          }
+          // Remove control dependency to the inlined function call node.
+          node.mutable_input()->SwapElements(idx, node.input_size() - 1);
+          node.mutable_input()->RemoveLast();
+
+          // Keep track of all overrides.
+          for (const string& override : overrides->second) {
+            add_ctrl_inputs.insert(AsControlDependency(override));
+          }
+        } else {
+          // Go to the next input only if the current one was not invalidated,
+          // otherwise we need to check the swapped input as well.
+          ++idx;
+        }
+      }
+
+      // Add overrides to the node inputs.
+      for (const string& ctrl_input : add_ctrl_inputs) {
+        node.add_input(ctrl_input);
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
+}  // namespace
+
+Status FunctionOptimizer::RunFunctionOptimizerPass(
+    const GrapplerItem& item, const GraphDef& graph, const int iteration,
+    std::unordered_set<string>* skip_nodes, GraphDef* optimized_graph,
+    bool* graph_has_unoptimized_function_calls) const {
+  VLOG(3) << absl::Substitute(
+      "Run function optimizer pass (iteration = $0): grappler_item_id = $1",
+      iteration, item.id);
+
+  FunctionOptimizerContext ctx(item, opt_level_, graph);
 
   bool inline_gradients = options_.enable_symbolic_gradient_inlining;
   bool inline_func = options_.enable_function_inlining;
   bool specialize_func = options_.enable_function_specialization;
 
-  for (const NodeDef& node : item.graph.node()) {
+  for (const NodeDef& node : graph.node()) {
     // Each node optimization can modify optimized graph only by adding new
     // nodes, we can check node size to make sure that graph was not modified.
     const int num_nodes_before = optimized_graph->node_size();
     const auto is_graph_modified = [&]() {
       int num_nodes = optimized_graph->node_size();
-      CHECK_GE(num_nodes, num_nodes_before) << "Nodes should not be removed";
+      DCHECK_GE(num_nodes, num_nodes_before) << "Nodes should not be removed";
       return num_nodes > num_nodes_before;
     };
 
-    // Add a copy of an input graph node to the optimized graph.
-    const auto add_node_copy = [&]() { *optimized_graph->add_node() = node; };
+    // Copy node from the `graph` to the `optimized_graph`.
+    const auto copy_node = [&]() { *optimized_graph->add_node() = node; };
+
+    // If we already failed to optimize this node during one of the previous
+    // passes, we just give up, and do not try on more time.
+    if (skip_nodes->find(node.name()) != skip_nodes->end()) {
+      VLOG(3) << "Skip optimization for node: " << node.name();
+      copy_node();
+      continue;
+    }
 
 // Skip errors if optimized graph was not modified before error happened.
 #define TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(...)                     \
@@ -1549,7 +1988,8 @@ Status FunctionOptimizer::Optimize(Cluster*, const GrapplerItem& item,
       return _status;                                              \
     if (TF_PREDICT_FALSE(!_status.ok() && !is_graph_modified())) { \
       VLOG(3) << "Skip error: " << _status.error_message();        \
-      add_node_copy();                                             \
+      skip_nodes->insert(node.name());                             \
+      copy_node();                                                 \
     }                                                              \
   } while (0)
 
@@ -1567,6 +2007,9 @@ Status FunctionOptimizer::Optimize(Cluster*, const GrapplerItem& item,
         TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(
             InlineSymbolicGradient(node, &ctx, optimized_graph));
         continue;
+      } else {
+        VLOG(2) << "Skip SymbolicGradient inlining: function=" << f_name;
+        skip_nodes->insert(node.name());
       }
     }
 
@@ -1579,7 +2022,6 @@ Status FunctionOptimizer::Optimize(Cluster*, const GrapplerItem& item,
 
     if (func != nullptr) {
       const string& func_name = func->signature().name();
-      const int graph_def_version = item.graph.versions().producer();
 
       const bool is_direct_func = IsDirectFunctionCall(*func, node);
       const bool is_indirect_func = IsIndirectFunctionCall(*func, node);
@@ -1588,11 +2030,12 @@ Status FunctionOptimizer::Optimize(Cluster*, const GrapplerItem& item,
       if (inline_func && is_direct_func) {
         Status inlinable = IsInlinableDirectFunctionCall(ctx, *func, node);
         if (inlinable.ok()) {
-          TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(InlineDirectFunctionCall(
-              node, *func, graph_def_version, ctx, optimized_graph));
+          TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(
+              InlineDirectFunctionCall(node, *func, ctx, optimized_graph));
           continue;
         } else {
           VLOG(2) << inlinable.error_message();
+          skip_nodes->insert(node.name());
         }
       }
 
@@ -1600,11 +2043,12 @@ Status FunctionOptimizer::Optimize(Cluster*, const GrapplerItem& item,
       if (inline_func && is_indirect_func) {
         Status inlinable = IsInlinableIndirectFunctionCall(ctx, *func, node);
         if (inlinable.ok()) {
-          TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(InlineIndirectFunctionCall(
-              node, *func, graph_def_version, &ctx, optimized_graph));
+          TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(
+              InlineIndirectFunctionCall(node, *func, &ctx, optimized_graph));
           continue;
         } else {
           VLOG(2) << inlinable.error_message();
+          skip_nodes->insert(node.name());
         }
       }
 
@@ -1621,95 +2065,95 @@ Status FunctionOptimizer::Optimize(Cluster*, const GrapplerItem& item,
         // TODO(ezhulenev): Specialize function call if input has a known shape.
         // Specialize function body for its instantiation attributes and inputs.
         TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(
-            SpecializeFunction(node, *func, item.graph.versions().producer(),
-                               &ctx, optimized_graph));
+            SpecializeFunction(node, *func, &ctx, optimized_graph));
         continue;
+      } else {
+        VLOG(2) << "Skip function specialization: " << func->signature().name();
+        skip_nodes->insert(node.name());
       }
     }
 
     // ---------------------------------------------------------------------- //
     // If we reached this point, node was not handled by any of the stages
-    // (inline, specialize), simply add a copy to the graph.
-    add_node_copy();
+    // (inline, specialize), simply copy the node to the optimized graph.
+    copy_node();
 
 #undef TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED
   }
 
-  // After function specialization and inlining graph might be in invalid
-  // state, and some nodes can read tensors that do not exists anymore in the
-  // optimized graph: function call node was fully inlined into the graph, or
-  // output index was invalidated by the output pruning.
+  TF_RETURN_IF_ERROR(RestoreGraphInvariants(ctx, optimized_graph));
 
-  if (!ctx.tensor_mapping().empty()) {
-    for (NodeDef& node : *optimized_graph->mutable_node()) {
-      for (int idx = 0; idx < node.input_size(); ++idx) {
-        TensorId input_tensor = ParseTensorName(node.input(idx));
-        if (input_tensor.index() == Graph::kControlSlot) break;
+  // Preserve the graph version.
+  *optimized_graph->mutable_versions() = graph.versions();
 
-        auto mapping = ctx.tensor_mapping().find(input_tensor);
-        if (mapping != ctx.tensor_mapping().end()) {
-          node.set_input(idx, mapping->second.ToString());
-        }
-      }
+  // Prune unreachable function from the library.
+  if (options_.enable_trim_function_library) {
+    *optimized_graph->mutable_library() =
+        PruneFunctionLibrary(ctx.function_library(), *optimized_graph);
+  } else {
+    *optimized_graph->mutable_library() = ctx.function_library().ToProto();
+  }
+
+  // Before returning we check if after single optimization pass we have more
+  // unoptimized function calls.
+  *graph_has_unoptimized_function_calls = false;
+  for (const NodeDef& node : optimized_graph->node()) {
+    // Check if we can inline symbolic gradient.
+    if (IsSymbolicGradient(node) && inline_gradients &&
+        skip_nodes->count(node.name()) == 0) {
+      *graph_has_unoptimized_function_calls = true;
+      break;
     }
-  }
 
-  // Function inlining instantiates function body directly into the optimized
-  // graph, and we might end up with control dependencies to the nodes that no
-  // longer exist in a graph. We need to apply control overrides to all
-  // invalidated nodes, and rewire control dependencies to the inlined
-  // side-effectful function body nodes.
+    // Check if after inlining we have unoptimized function calls.
+    const FunctionDef* func = FindFunctionCall(ctx, node);
+    if (func != nullptr && !MarkedSpecialized(*func) &&
+        skip_nodes->count(node.name()) == 0) {
+      *graph_has_unoptimized_function_calls = true;
+      break;
+    }
+  }
 
-  // TODO(ezhulenev): With nested function call inlining, single pass over
-  // `control_overrides` might not bring the graph into a valid state,
-  // continue until it converges and all invalidated control dependencies
-  // removed.
+  return Status::OK();
+}
 
-  if (!ctx.control_overrides().empty()) {
-    for (NodeDef& node : *optimized_graph->mutable_node()) {
-      // Keep track of new control inputs to the node.
-      gtl::FlatSet<string> add_ctrl_inputs;
+Status FunctionOptimizer::Optimize(Cluster*, const GrapplerItem& item,
+                                   GraphDef* optimized_graph) {
+  // Nothing to do here.
+  if (item.graph.library().function_size() == 0) {
+    *optimized_graph = item.graph;
+    return Status::OK();
+  }
 
-      // Remove all invalidated control inputs.
-      for (int idx = 0; idx < node.input_size(); /* see below */) {
-        // TODO(ezhulenev): Use non-allocating TensorId after migrating
-        // `control_overrides()` to absl::flat_hash_set.
-        SafeTensorId input_tensor = ParseTensorName(node.input(idx));
+  // Do not retry failed function inlining or specialization.
+  std::unordered_set<string> skip_nodes;
+  bool graph_has_unoptimized_function_calls = false;
 
-        auto overrides = ctx.control_overrides().find(input_tensor.node());
-        if (overrides != ctx.control_overrides().end()) {
-          // If this happens it's a bug in the function inlining.
-          if (input_tensor.index() != Graph::kControlSlot) {
-            return errors::Internal(
-                "Illegal input edge from inlined function call node");
-          }
-          // Remove control dependency to the inlined function call node.
-          node.mutable_input()->SwapElements(idx, node.input_size() - 1);
-          node.mutable_input()->RemoveLast();
+  // We'll keep running function optimizer pass until we inlined and optimized
+  // all function call nodes.
+  int iteration = 0;
+  constexpr int kMaxIterations = 50;
 
-          // Keep track of all overrides.
-          for (const string& override : overrides->second) {
-            add_ctrl_inputs.insert(AsControlDependency(override));
-          }
-        } else {
-          // Go to the next input only if the current one was not invalidated,
-          // otherwise we need to check the swapped input as well.
-          ++idx;
-        }
-      }
+  // 1. Run first optimizer pass with GrapplerItem.graph.
+  TF_RETURN_IF_ERROR(RunFunctionOptimizerPass(
+      item, item.graph, 0, &skip_nodes, optimized_graph,
+      &graph_has_unoptimized_function_calls));
 
-      // Add overrides to the node inputs.
-      for (const string& ctrl_input : add_ctrl_inputs) {
-        node.add_input(ctrl_input);
-      }
+  // 2. If after function inlining we have unoptimized function calls, we have
+  // to run function optimization pass one more time.
+  while (graph_has_unoptimized_function_calls) {
+    if (iteration++ > kMaxIterations) {
+      VLOG(1) << "Break function optimizer loop at iteration #" << iteration;
+      break;
     }
-  }
 
-  *optimized_graph->mutable_versions() = item.graph.versions();
-  *optimized_graph->mutable_library() =
-      options_.enable_trim_function_library
-          ? PruneFunctionLibrary(ctx.function_library(), *optimized_graph)
-          : ctx.function_library().ToProto();
+    GraphDef workspace_graph;
+    workspace_graph.Swap(optimized_graph);
+
+    TF_RETURN_IF_ERROR(RunFunctionOptimizerPass(
+        item, workspace_graph, iteration, &skip_nodes, optimized_graph,
+        &graph_has_unoptimized_function_calls));
+  }
 
   return Status::OK();
 }
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.h b/tensorflow/core/grappler/optimizers/function_optimizer.h
index 4352555064c43c8db40157ace2fca9479907df8e..ab90281509fc1f4a80a82bd6e1ab830e22200838 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.h
@@ -48,6 +48,16 @@ class FunctionOptimizer : public GraphOptimizer {
     bool enable_trim_function_library = true;
   };
 
+  // Runs a single function optimizer pass over the `graph`. All nodes that are
+  // not function calls will be copied from the `graph` to the
+  // `optimized_graph`. Function call nodes inlined or specialized, and
+  // instantiated function body or specialized function call nodes will be added
+  // to the `optimized_graph`.
+  Status RunFunctionOptimizerPass(
+      const GrapplerItem& item, const GraphDef& graph, const int iteration,
+      std::unordered_set<string>* skip_nodes, GraphDef* optimized_graph,
+      bool* graph_has_unoptimized_function_calls) const;
+
   RewriterConfig::Toggle opt_level_;
   FunctionOptimizerOptions options_;
 };
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
index c971eec3f4dae5cc3457ad802700ee4f3086eb90..2787d9d5253ebc0e1f8179549d1613fdf9301c96 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
@@ -14,6 +14,8 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/optimizers/function_optimizer.h"
+
+#include "absl/algorithm/container.h"
 #include "tensorflow/cc/ops/functional_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/function_testlib.h"
@@ -660,7 +662,7 @@ TEST_F(FunctionOptimizerTest, InlineSymbolicGradient_IdentityFunc) {
   test::ExpectTensorEqual<float>(expected[0], optimized[0]);
 }
 
-TEST_F(FunctionOptimizerTest, InlineSymbolicGradient_NoInlineFunc) {
+TEST_F(FunctionOptimizerTest, InlineSymbolicGradientNoInlineFunc) {
   FunctionOptimizer optimizer(RewriterConfig::ON);
 
   FunctionDef func = FunctionDefHelper::Define(
@@ -734,9 +736,13 @@ TEST_F(FunctionOptimizerTest, InlineIndirectFunctionSimpleFunction) {
        NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
 
        // Function must be inlined and all nodes placed on a valid device.
-       NDef("c/x", "Identity", {"a:0"}, {{"T", DT_FLOAT}}, kDevice),
-       NDef("c/y", "Identity", {"b:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("c/inputs_ready", "NoOp", {"^a", "^b"}, {}, kDevice),
+       NDef("c/x", "Identity", {"a:0", "^c/inputs_ready"}, {{"T", DT_FLOAT}},
+            kDevice),
+       NDef("c/y", "Identity", {"b:0", "^c/inputs_ready"}, {{"T", DT_FLOAT}},
+            kDevice),
        NDef("c/mul", "Mul", {"c/x", "c/y"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("c/outputs_ready", "NoOp", {"^c/mul"}, {}, kDevice),
 
        NDef("d", "Identity", {"c/mul:0"}, {{"T", DT_FLOAT}}, kDevice)},
       // Function library.
@@ -831,36 +837,51 @@ TEST_F(FunctionOptimizerTest, InlineIndirectFunctionWithControlDependencies) {
             kDevice),
 
        // Function body of a first function call inlined into the graph.
-       NDef("f1/x", "Identity", {"a:0", "^init_v"}, {{"T", DT_FLOAT}}, kDevice),
-       NDef("f1/y", "Identity", {"b:0", "^init_v"}, {{"T", DT_FLOAT}}, kDevice),
-       NDef("f1/v", "Identity", {"v:0", "^init_v"}, {{"T", DT_RESOURCE}},
+       NDef("f1/inputs_ready", "NoOp", {"^a", "^b", "^v", "^init_v"}, {},
+            kDevice),
+
+       NDef("f1/x", "Identity", {"a:0", "^f1/inputs_ready"}, {{"T", DT_FLOAT}},
+            kDevice),
+       NDef("f1/y", "Identity", {"b:0", "^f1/inputs_ready"}, {{"T", DT_FLOAT}},
             kDevice),
-       NDef("f1/one", "Const", {"^f1/x"},
+       NDef("f1/v", "Identity", {"v:0", "^f1/inputs_ready"},
+            {{"T", DT_RESOURCE}}, kDevice),
+
+       NDef("f1/one", "Const", {"^f1/inputs_ready"},
             {{"dtype", DT_FLOAT}, {"value", kOne}}, kDevice),
        NDef("f1/add", "AssignAddVariableOp", {"f1/v", "f1/one"},
             {{"dtype", DT_FLOAT}}, kDevice),
        NDef("f1/mul", "Mul", {"f1/x", "f1/y", "^f1/add"}, {{"T", DT_FLOAT}},
             kDevice),
 
+       NDef("f1/outputs_ready", "NoOp", {"^f1/mul"}, {}, kDevice),
+
        // Function body of a second function call also inlined into the graph,
        // and input nodes read directly from the inlined nodes of the first
        // function call.
-       NDef("f2/x", "Identity", {"f1/mul:0", "^f1/add"}, {{"T", DT_FLOAT}},
-            kDevice),
-       NDef("f2/y", "Identity", {"f1/mul:0", "^f1/add"}, {{"T", DT_FLOAT}},
+       NDef("f2/inputs_ready", "NoOp", {"^v", "^f1/outputs_ready"}, {},
             kDevice),
-       NDef("f2/v", "Identity", {"v:0", "^f1/add"}, {{"T", DT_RESOURCE}},
-            kDevice),
-       NDef("f2/one", "Const", {"^f2/x"},
+
+       NDef("f2/x", "Identity", {"f1/mul:0", "^f2/inputs_ready"},
+            {{"T", DT_FLOAT}}, kDevice),
+       NDef("f2/y", "Identity", {"f1/mul:0", "^f2/inputs_ready"},
+            {{"T", DT_FLOAT}}, kDevice),
+       NDef("f2/v", "Identity", {"v:0", "^f2/inputs_ready"},
+            {{"T", DT_RESOURCE}}, kDevice),
+
+       NDef("f2/one", "Const", {"^f2/inputs_ready"},
             {{"dtype", DT_FLOAT}, {"value", kOne}}, kDevice),
        NDef("f2/add", "AssignAddVariableOp", {"f2/v", "f2/one"},
             {{"dtype", DT_FLOAT}}, kDevice),
        NDef("f2/mul", "Mul", {"f2/x", "f2/y", "^f2/add"}, {{"T", DT_FLOAT}},
             kDevice),
 
+       NDef("f2/outputs_ready", "NoOp", {"^f2/mul"}, {}, kDevice),
+
        // Return values read directly from inlined nodes.
        NDef("out_1", "Identity", {"f2/mul:0"}, {{"T", DT_FLOAT}}, kDevice),
-       NDef("out_2", "ReadVariableOp", {"v", "^f1/add", "^f2/add"},
+       NDef("out_2", "ReadVariableOp",
+            {"v", "^f1/outputs_ready", "^f2/outputs_ready"},
             {{"dtype", DT_FLOAT}}, kDevice)},
 
       // Function library.
@@ -924,10 +945,15 @@ TEST_F(FunctionOptimizerTest, InlineIndirectFunctionWithDevicePlacement) {
       {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, cpu0),
        NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, cpu1),
 
-       // Function must be inlined and `mul` node placed on a requested device.
-       NDef("c/x", "Identity", {"a:0"}, {{"T", DT_FLOAT}}, cpu1),
-       NDef("c/y", "Identity", {"b:0"}, {{"T", DT_FLOAT}}, cpu1),
+       // Function must be inlined and `mul` node placed on a requested device,
+       // and input `Identity` nodes must be colocated with their source nodes.
+       NDef("c/inputs_ready", "NoOp", {"^a", "^b"}, {}, cpu0),
+       NDef("c/x", "Identity", {"a:0", "^c/inputs_ready"}, {{"T", DT_FLOAT}},
+            cpu0),
+       NDef("c/y", "Identity", {"b:0", "^c/inputs_ready"}, {{"T", DT_FLOAT}},
+            cpu1),
        NDef("c/mul", "Mul", {"c/x", "c/y"}, {{"T", DT_FLOAT}}, cpu1),
+       NDef("c/outputs_ready", "NoOp", {"^c/mul"}, {}, cpu0),
 
        NDef("d", "Identity", {"c/mul:0"}, {{"T", DT_FLOAT}}, cpu0)},
       // Function library.
@@ -936,7 +962,8 @@ TEST_F(FunctionOptimizerTest, InlineIndirectFunctionWithDevicePlacement) {
   CompareGraphs(expected, optimized_graph);
 }
 
-TEST_F(FunctionOptimizerTest, InlineIndirectFunctionWithoutSideEffects) {
+TEST_F(FunctionOptimizerTest,
+       InlineIndirectFunctionWithControlDependencyAndNoSideEffects) {
   using test::function::NDef;
   using FDH = FunctionDefHelper;
 
@@ -994,16 +1021,24 @@ TEST_F(FunctionOptimizerTest, InlineIndirectFunctionWithoutSideEffects) {
        NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
 
        // Function body of a first function call inlined into the graph.
-       NDef("f1/x", "Identity", {"a:0"}, {{"T", DT_FLOAT}}, kDevice),
-       NDef("f1/y", "Identity", {"b:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("f1/inputs_ready", "NoOp", {"^a", "^b"}, {}, kDevice),
+       NDef("f1/x", "Identity", {"a:0", "^f1/inputs_ready"}, {{"T", DT_FLOAT}},
+            kDevice),
+       NDef("f1/y", "Identity", {"b:0", "^f1/inputs_ready"}, {{"T", DT_FLOAT}},
+            kDevice),
        NDef("f1/mul", "Mul", {"f1/x", "f1/y"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("f1/outputs_ready", "NoOp", {"^f1/mul"}, {}, kDevice),
 
        // Function body of a second function call also inlined into the graph,
        // and input nodes read directly from the inlined nodes of the first
        // function call, and control dependency edge removed.
-       NDef("f2/x", "Identity", {"f1/mul:0"}, {{"T", DT_FLOAT}}, kDevice),
-       NDef("f2/y", "Identity", {"f1/mul:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("f2/inputs_ready", "NoOp", {"^f1/outputs_ready"}, {}, kDevice),
+       NDef("f2/x", "Identity", {"f1/mul:0", "^f2/inputs_ready"},
+            {{"T", DT_FLOAT}}, kDevice),
+       NDef("f2/y", "Identity", {"f1/mul:0", "^f2/inputs_ready"},
+            {{"T", DT_FLOAT}}, kDevice),
        NDef("f2/mul", "Mul", {"f2/x", "f2/y"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("f2/outputs_ready", "NoOp", {"^f2/mul"}, {}, kDevice),
 
        // Return directly from inlined node of f2.
        NDef("out", "Identity", {"f2/mul:0"}, {{"T", DT_FLOAT}}, kDevice)},
@@ -1024,6 +1059,371 @@ TEST_F(FunctionOptimizerTest, InlineIndirectFunctionWithoutSideEffects) {
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
 
+TEST_F(FunctionOptimizerTest, InlineIndirectFunctionDoNotInlineDeadOutputs) {
+  using test::function::NDef;
+  using FDH = FunctionDefHelper;
+
+  FunctionOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+
+  // Function output can be dead.
+  FunctionDef dead_outputs = FunctionDefHelper::Create(
+      "DeadOutputs", {"x:T", "cond:bool"}, {"z:T"}, {"T: {float, double}"},
+      {
+          {{"switch"}, "Switch", {"x", "cond"}, {{"T", "$T"}}},
+          {{"if_false"}, "Identity", {"switch:output_false:0"}, {{"T", "$T"}}},
+          {{"if_true"}, "Identity", {"switch:output_true:0"}, {{"T", "$T"}}},
+      },
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "if_false:output:0"}});
+
+  // Simple proxy functions that calls DeadOutputs from the function body.
+  FunctionDef proxy_func = FunctionDefHelper::Create(
+      "Proxy", {"x:T", "cond:bool"}, {"z:T"}, {"T: {float, double}"},
+      {{{"dead"}, "DeadOutputs", {"x", "cond"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "dead:z:0"}});
+
+  // Build a graph to compute:
+  //   a: float
+  //   b: bool
+  //   fn0 = DeadOutputs(x, b)
+  //   fn1 = Proxy(x, b)
+  //   out0 = Identity(fn0)
+  //   out1 = Identity(fn1)
+  //   return [out0, out1]
+  //
+  GrapplerItem item;
+  item.fetch = {"out0", "out1"};
+  item.graph = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("b", "Placeholder", {}, {{"dtype", DT_BOOL}}, kDevice),
+
+       NDef("fn0", "PartitionedCall", {"a", "b"},
+            {{"Tin", DataTypeSlice{DT_FLOAT, DT_BOOL}},
+             {"Tout", DataTypeSlice{DT_FLOAT}},
+             {"f", FDH::FunctionRef("DeadOutputs", {{"T", DT_FLOAT}})}},
+            kDevice),
+
+       NDef("fn1", "PartitionedCall", {"a", "b"},
+            {{"Tin", DataTypeSlice{DT_FLOAT, DT_BOOL}},
+             {"Tout", DataTypeSlice{DT_FLOAT}},
+             {"f", FDH::FunctionRef("Proxy", {{"T", DT_FLOAT}})}},
+            kDevice),
+
+       NDef("out0", "Identity", {"fn0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("out1", "Identity", {"fn1"}, {{"T", DT_FLOAT}}, kDevice)},
+      // Function library.
+      {dead_outputs, proxy_func});
+
+  GraphDef optimized_graph;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &optimized_graph));
+
+  GraphDef expected = item.graph;
+  CompareGraphs(expected, optimized_graph);
+
+  const Tensor one = test::AsScalar<float>(1.0);
+  item.feed.emplace_back("a", one);
+  item.feed.emplace_back("b", test::AsScalar<bool>(false));
+
+  auto tensors = EvaluateFetchNodes(item);
+  ASSERT_EQ(tensors.size(), 2);
+  test::ExpectTensorEqual<float>(tensors[0], one);
+  test::ExpectTensorEqual<float>(tensors[1], one);
+}
+
+TEST_F(FunctionOptimizerTest, InlineIndirectFunctionWithMergedDeadTensors) {
+  using test::function::NDef;
+  using FDH = FunctionDefHelper;
+
+  FunctionOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+
+  // Function output can't be dead because it goes through the Merge node.
+  FunctionDef no_dead_outputs = FunctionDefHelper::Create(
+      "NoDeadOutputs", {"x:T", "cond:bool"}, {"z:T"}, {"T: {float, double}"},
+      {
+          {{"switch"}, "Switch", {"x", "cond"}, {{"T", "$T"}}},
+          {{"if_false"}, "Identity", {"switch:output_false:0"}, {{"T", "$T"}}},
+          {{"if_true"}, "Identity", {"switch:output_true:0"}, {{"T", "$T"}}},
+          {{"merge"},
+           "Merge",
+           {"if_false:output:0", "if_true:output:0"},
+           {{"T", "$T"}, {"N", 2}}},
+      },
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "merge:output:0"}});
+
+  // Build a graph to compute:
+  //   a: float
+  //   b: bool
+  //   d = DeadOutputs(x, b)
+  //   out = Identity(d)
+  //   return out
+  //
+  GrapplerItem item;
+  item.fetch = {"out"};
+  item.graph = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("b", "Placeholder", {}, {{"dtype", DT_BOOL}}, kDevice),
+
+       NDef("fn", "PartitionedCall", {"a", "b"},
+            {{"Tin", DataTypeSlice{DT_FLOAT, DT_BOOL}},
+             {"Tout", DataTypeSlice{DT_FLOAT}},
+             {"f", FDH::FunctionRef("NoDeadOutputs", {{"T", DT_FLOAT}})}},
+            kDevice),
+
+       NDef("out", "Identity", {"fn"}, {{"T", DT_FLOAT}}, kDevice)},
+      // Function library.
+      {no_dead_outputs});
+
+  GraphDef optimized_graph;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &optimized_graph));
+
+  GraphDef expected = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("b", "Placeholder", {}, {{"dtype", DT_BOOL}}, kDevice),
+
+       // Function body of a first function call inlined into the graph.
+       NDef("fn/inputs_ready", "NoOp", {"^a", "^b"}, {}, kDevice),
+       NDef("fn/x", "Identity", {"a:0", "^fn/inputs_ready"}, {{"T", DT_FLOAT}},
+            kDevice),
+       NDef("fn/cond", "Identity", {"b:0", "^fn/inputs_ready"},
+            {{"T", DT_BOOL}}, kDevice),
+       NDef("fn/switch", "Switch", {"fn/x:0", "fn/cond:0"}, {{"T", DT_FLOAT}},
+            kDevice),
+       NDef("fn/if_false", "Identity", {"fn/switch:0"}, {{"T", DT_FLOAT}},
+            kDevice),
+       NDef("fn/if_true", "Identity", {"fn/switch:1"}, {{"T", DT_FLOAT}},
+            kDevice),
+       NDef("fn/merge", "Merge", {"fn/if_false:0", "fn/if_true:0"},
+            {{"T", DT_FLOAT}, {"N", 2}}, kDevice),
+       NDef("fn/outputs_ready", "NoOp", {"^fn/merge"}, {}, kDevice),
+
+       // Return directly from inlined node.
+       NDef("out", "Identity", {"fn/merge:0"}, {{"T", DT_FLOAT}}, kDevice)},
+
+      // Function library.
+      {no_dead_outputs});
+
+  CompareGraphs(expected, optimized_graph);
+
+  const Tensor one = test::AsScalar<float>(1.0);
+  item.feed.emplace_back("a", one);
+  item.feed.emplace_back("b", test::AsScalar<bool>(false));
+
+  auto tensors_expected = EvaluateFetchNodes(item);
+  ASSERT_EQ(tensors_expected.size(), 1);
+
+  GrapplerItem optimized = item.WithGraph(std::move(optimized_graph));
+  auto tensors = EvaluateFetchNodes(optimized);
+  ASSERT_EQ(tensors.size(), 1);
+
+  test::ExpectTensorEqual<float>(tensors[0], tensors_expected[0]);
+}
+
+TEST_F(FunctionOptimizerTest, InlineIndirectFunctionWithNestedFunctionCall) {
+  using test::function::NDef;
+  using FDH = FunctionDefHelper;
+
+  FunctionOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+
+  FunctionDef mul_func = FunctionDefHelper::Create(
+      "MyMul", {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"mul"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "mul:z:0"}});
+
+  // `Square` implemented in terms of PartitionedCall to `MyMul`.
+  FunctionDef square_func = FunctionDefHelper::Create(
+      "MySquare", {"x:T"}, {"output:T"}, {"T: {float, double}"},
+      {{{"square"},
+        "PartitionedCall",
+        {"x", "x"},
+        {{"Tin", DataTypeSlice{DT_FLOAT, DT_FLOAT}},
+         {"Tout", DataTypeSlice{DT_FLOAT}},
+         {"f", FDH::FunctionRef("MyMul", {{"T", DT_FLOAT}})}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"output", "square:output:0"}});
+
+  // Build a graph to compute:
+  //   b = Square(a)
+  //   c = Identity(b)
+  //   return c
+  GrapplerItem item;
+  item.fetch = {"c"};
+  item.graph = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("b", "PartitionedCall", {"a"},
+            {{"Tin", DataTypeSlice{DT_FLOAT}},
+             {"Tout", DataTypeSlice{DT_FLOAT}},
+             {"f", FDH::FunctionRef("MySquare", {{"T", DT_FLOAT}})}},
+            kDevice),
+       NDef("c", "Identity", {"b"}, {{"T", DT_FLOAT}}, kDevice)},
+      /* Function library */
+      {mul_func, square_func});
+
+  GraphDef optimized_graph;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &optimized_graph));
+
+  GraphDef expected = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+
+       // Inlined inputs of `b` node.
+       NDef("b/inputs_ready", "NoOp", {"^a"}, {}, kDevice),
+       NDef("b/x", "Identity", {"a:0", "^b/inputs_ready"}, {{"T", DT_FLOAT}},
+            kDevice),
+
+       // Inlined inputs of `square` node inside inlined `MySquare` function.
+       NDef("b/square/inputs_ready", "NoOp", {"^b/x"}, {}, kDevice),
+       NDef("b/square/x", "Identity", {"b/x:0", "^b/square/inputs_ready"},
+            {{"T", DT_FLOAT}}, kDevice),
+       NDef("b/square/y", "Identity", {"b/x:0", "^b/square/inputs_ready"},
+            {{"T", DT_FLOAT}}, kDevice),
+
+       // Inlined mul node from the `MyMul` function.
+       NDef("b/square/mul", "Mul", {"b/square/x", "b/square/y"},
+            {{"T", DT_FLOAT}}, kDevice),
+
+       NDef("b/square/outputs_ready", "NoOp", {"^b/square/mul"}, {}, kDevice),
+       NDef("b/outputs_ready", "NoOp", {"^b/square/outputs_ready"}, {},
+            kDevice),
+
+       NDef("c", "Identity", {"b/square/mul:0"}, {{"T", DT_FLOAT}}, kDevice)},
+      // Function library.
+      {mul_func});
+
+  CompareGraphs(expected, optimized_graph);
+
+  Tensor three = test::AsScalar<float>(3.0f);
+  item.feed.emplace_back("a", three);
+
+  GrapplerItem optimized = item.WithGraph(std::move(optimized_graph));
+  auto tensors_expected = EvaluateFetchNodes(item);
+  auto tensors = EvaluateFetchNodes(optimized);
+  ASSERT_EQ(tensors_expected.size(), 1);
+  ASSERT_EQ(tensors.size(), tensors_expected.size());
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+}
+
+TEST_F(FunctionOptimizerTest, InlineIndirectFunctionWithFunctionalControlFlow) {
+  using test::function::NDef;
+  using FDH = FunctionDefHelper;
+
+  FunctionOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+
+  FunctionDef add_func = FunctionDefHelper::Create(
+      "MyAdd", {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"add"}, "Add", {"x", "y"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "add:z:0"}});
+
+  FunctionDef mul_func = FunctionDefHelper::Create(
+      "MyMul", {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"mul"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "mul:z:0"}});
+
+  // Compute: return cond ? a + b : a * b
+  FunctionDef add_or_mul_func = FunctionDefHelper::Create(
+      "AddOrMul", {"cond:bool", "x:float", "y:float"}, {"z:float"}, {},
+      {
+          {{"if_node"},
+           "If",
+           {"cond", "x", "y"},
+           {
+               {"Tcond", DT_BOOL},
+               {"Tin", DataTypeSlice{DT_FLOAT, DT_FLOAT}},
+               {"Tout", DataTypeSlice{DT_FLOAT}},
+               {"then_branch", FDH::FunctionRef("MyAdd", {{"T", DT_FLOAT}})},
+               {"else_branch", FDH::FunctionRef("MyMul", {{"T", DT_FLOAT}})},
+               {"_lower_using_switch_merge", true},
+           }},
+      },
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "if_node:output:0"}});
+
+  // Build a computation graph for:
+  //   is_add: bool
+  //   a: float
+  //   b: float
+  //   c = AddOrMul(is_add, a, b)  # is_add ? a + b : a * b
+  //   d = Identity(c)
+  //   return d
+
+  // c = MyMul(a, b)
+  GrapplerItem item;
+  item.fetch = {"d"};
+  item.graph = test::function::GDef(
+      {NDef("is_add", "Placeholder", {}, {{"dtype", DT_BOOL}}, kDevice),
+       NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+
+       NDef("c", "PartitionedCall", {"is_add", "a", "b"},
+            {{"Tin", DataTypeSlice{DT_BOOL, DT_FLOAT, DT_FLOAT}},
+             {"Tout", DataTypeSlice{DT_FLOAT}},
+             {"f", FDH::FunctionRef("AddOrMul")}},
+            kDevice),
+
+       NDef("d", "Identity", {"c"}, {{"T", DT_FLOAT}}, kDevice)},
+      // Function library.
+      {add_or_mul_func, add_func, mul_func});
+
+  GraphDef optimized_graph;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &optimized_graph));
+
+  const auto count_nodes_with_op = [&](const string& op) {
+    return absl::c_count_if(optimized_graph.node(), [&](const NodeDef& node) {
+      return node.op() == op;
+    });
+  };
+
+  // All `PartitionedCall` nodes in the optimized graph must be inlined, and
+  // `If` node must be lowered to `Switch` and `Merge` nodes.
+  EXPECT_EQ(count_nodes_with_op("PartitionedCall"), 0);
+  EXPECT_EQ(count_nodes_with_op("If"), 0);
+  EXPECT_EQ(count_nodes_with_op("Switch"), 3);
+  EXPECT_EQ(count_nodes_with_op("Merge"), 1);
+
+  GrapplerItem optimized = item.WithGraph(std::move(optimized_graph));
+
+  Tensor one = test::AsScalar<float>(1.0);
+  Tensor two = test::AsScalar<float>(2.0);
+  Tensor three = test::AsScalar<float>(3.0);
+
+  const auto feed_args = [&](bool is_add) {
+    std::vector<std::pair<string, Tensor>> feed;
+    feed.emplace_back("a", one);
+    feed.emplace_back("b", two);
+    feed.emplace_back("is_add", test::AsScalar<bool>(is_add));
+    return feed;
+  };
+
+  {  // Check 'is_add == true': a + b
+    item.feed = feed_args(true);
+    optimized.feed = feed_args(true);
+
+    auto tensors_expected = EvaluateFetchNodes(item);
+    ASSERT_EQ(tensors_expected.size(), 1);
+    test::ExpectTensorEqual<float>(tensors_expected[0], three);
+
+    auto tensors = EvaluateFetchNodes(optimized);
+    ASSERT_EQ(tensors.size(), tensors_expected.size());
+    test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+  }
+
+  {  // Check 'is_add == false': a * b
+    item.feed = feed_args(false);
+    optimized.feed = feed_args(false);
+
+    auto tensors_expected = EvaluateFetchNodes(item);
+    ASSERT_EQ(tensors_expected.size(), 1);
+    test::ExpectTensorEqual<float>(tensors_expected[0], two);
+
+    auto tensors = EvaluateFetchNodes(optimized);
+    ASSERT_EQ(tensors.size(), tensors_expected.size());
+    test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+  }
+}
+
 TEST_F(FunctionOptimizerTest, SpecializeFunctionXTimesTwo) {
   using test::function::NDef;
 
diff --git a/tensorflow/core/grappler/optimizers/graph_optimizer.h b/tensorflow/core/grappler/optimizers/graph_optimizer.h
index e587a2b2af74cb417ac58f672a4cc5526335d0a8..44dfe0de7890f09feb0b2cbfc450ddb9e37fc3cd 100644
--- a/tensorflow/core/grappler/optimizers/graph_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/graph_optimizer.h
@@ -39,7 +39,7 @@ class GraphOptimizer {
   // Routine called to allow an algorithm to propose a rewritten graph
   // for the graph, feeds and fetches in "item" to run more efficiently
   // on "cluster".
-  // Returns true iff it managed to generate a solution, false otherwise.
+  // Returns an error status if it failed to generate a solution.
   virtual Status Optimize(Cluster* cluster, const GrapplerItem& item,
                           GraphDef* optimized_graph) = 0;
 
diff --git a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
index 99fcb31523800c76b8c413da92576fc16092f588..19dc2c8ad95ad86b9843406468163dfba5944f88 100644
--- a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
+++ b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
@@ -238,10 +238,10 @@ class GraphOptimizerStagePipeline {
         // Each stage must be "error safe" (just like exception safe). In
         // case of any error it must leave optimized graph unmodified.
         if (!stage_status.ok()) {
-          LOG(WARNING) << "Failed to run optimizer " << stage->optimizer_name()
-                       << ", stage " << stage->stage_name() << " node "
-                       << node->name()
-                       << ". Error: " << stage_status.error_message();
+          VLOG(2) << "Failed to run optimizer " << stage->optimizer_name()
+                  << ", stage " << stage->stage_name() << " node "
+                  << node->name()
+                  << ". Error: " << stage_status.error_message();
         }
         if (break_predicate_(*result)) return true;
       }
diff --git a/tensorflow/core/grappler/optimizers/experimental_implementation_selector.cc b/tensorflow/core/grappler/optimizers/implementation_selector.cc
similarity index 93%
rename from tensorflow/core/grappler/optimizers/experimental_implementation_selector.cc
rename to tensorflow/core/grappler/optimizers/implementation_selector.cc
index 75ad8bffefd8aa00bb1ba88c10ed9b1170a0d25f..a370bf9934e8b6eb057d9ead6558b5ecf57edaef 100644
--- a/tensorflow/core/grappler/optimizers/experimental_implementation_selector.cc
+++ b/tensorflow/core/grappler/optimizers/implementation_selector.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/grappler/optimizers/experimental_implementation_selector.h"
+#include "tensorflow/core/grappler/optimizers/implementation_selector.h"
 
 #include <string>
 
@@ -101,14 +101,14 @@ Status UpdateNodeDef(NodeDef* node_def, const string& funcName,
   return Status::OK();
 }
 
-Status ExperimentalImplementationSelector::LoadFunctions(
+Status ImplementationSelector::LoadFunctions(
     const GraphDef& graph) {
   lib_info_.reset(new FunctionLibraryApiInfo);
   TF_RETURN_IF_ERROR(lib_info_->Init(graph.library()));
   return Status::OK();
 }
 
-Status ExperimentalImplementationSelector::MaybeOptimizeFunctionCall(
+Status ImplementationSelector::MaybeOptimizeFunctionCall(
     NodeDef* node_def) const {
   // There are two ways of calling functions:
   //  1. By specifying an op name as a function name, or
@@ -170,12 +170,16 @@ Status ExperimentalImplementationSelector::MaybeOptimizeFunctionCall(
   return Status::OK();
 }
 
-Status ExperimentalImplementationSelector::SelectImplementation(
+Status ImplementationSelector::SelectImplementation(
     GraphDef* graph) const {
   if (!graph->has_library()) {
     VLOG(2) << "Skipping graph since it does not have function def";
     return Status::OK();
   }
+  if (lib_info_->empty()) {
+    VLOG(2) << "Skipping optimization since lib_info is empty";
+    return Status::OK();
+  }
 
   for (int k = 0; k < graph->node_size(); ++k)
     TF_RETURN_IF_ERROR(MaybeOptimizeFunctionCall(graph->mutable_node(k)));
@@ -183,9 +187,9 @@ Status ExperimentalImplementationSelector::SelectImplementation(
   return Status::OK();
 }
 
-Status ExperimentalImplementationSelector::Optimize(Cluster* cluster,
-                                                    const GrapplerItem& item,
-                                                    GraphDef* optimized_graph) {
+Status ImplementationSelector::Optimize(Cluster* cluster,
+                                        const GrapplerItem& item,
+                                        GraphDef* optimized_graph) {
   *optimized_graph = item.graph;
   TF_RETURN_IF_ERROR(LoadFunctions(*optimized_graph));
   return SelectImplementation(optimized_graph);
diff --git a/tensorflow/core/grappler/optimizers/experimental_implementation_selector.h b/tensorflow/core/grappler/optimizers/implementation_selector.h
similarity index 80%
rename from tensorflow/core/grappler/optimizers/experimental_implementation_selector.h
rename to tensorflow/core/grappler/optimizers/implementation_selector.h
index 82f7473a14ec9b20492ac7acef3b72e919040ece..c206d21640b4816d2af46b0581eb410564aca175 100644
--- a/tensorflow/core/grappler/optimizers/experimental_implementation_selector.h
+++ b/tensorflow/core/grappler/optimizers/implementation_selector.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_EXPERIMENTAL_IMPLEMENTATION_SELECTOR_H_
-#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_EXPERIMENTAL_IMPLEMENTATION_SELECTOR_H_
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_IMPLEMENTATION_SELECTOR_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_IMPLEMENTATION_SELECTOR_H_
 
 #include <string>
 
@@ -33,7 +33,6 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
-// -- EXPERIMENTAL --
 // This transformation replaces function calls by the appropriate function
 // definition based on properties of the runtime system. For instance,
 // we may choose one implementation over another if we have a GPU with
@@ -45,12 +44,12 @@ namespace grappler {
 //
 // For instance, the python code might specify:
 // @Defun(tf.float32,
-//        experimental_api_implements='plus_one',
-//        experimental_api_preferred_device='GPU')
+//        api_implements='plus_one',
+//        api_preferred_device='GPU')
 // def plus_one_gpu(x): return x + 1.0
 //
 // @Defun(tf.float32,
-//        experimental_api_implements='plus_one')
+//        api_implements='plus_one')
 // def plus_one_reference_implementation(x): return x + 1.0
 // input = tf.constant(2.0, dtype=tf.float32)
 //
@@ -62,21 +61,21 @@ namespace grappler {
 // `plus_one_reference_implementation` based on the availability of the GPU.
 //
 // Available annotations:
-//  - experimental_api_implements(string): all functions mapping to the same
+//  - api_implements(string): all functions mapping to the same
 //    string can be interchanged. For now, all functions must have the same
 //    signature and overloads are not allowed. Defuns within defuns are
 //    allowed.
-//  - experimental_api_preferred_device(string): sets which device is preferred.
-class ExperimentalImplementationSelector : public CustomGraphOptimizer {
+//  - api_preferred_device(string): sets which device is preferred.
+class ImplementationSelector : public CustomGraphOptimizer {
  public:
-  ExperimentalImplementationSelector() = default;
-  ~ExperimentalImplementationSelector() override = default;
+  ImplementationSelector() = default;
+  ~ImplementationSelector() override = default;
   Status Init(
       const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override {
     return Status::OK();
   }
   string name() const override {
-    return "experimental_implementation_selector";
+    return "implementation_selector";
   }
 
   // This call is not thread-safe.
@@ -106,10 +105,10 @@ class ExperimentalImplementationSelector : public CustomGraphOptimizer {
 
   std::unique_ptr<FunctionLibraryApiInfo> lib_info_;
 
-  TF_DISALLOW_COPY_AND_ASSIGN(ExperimentalImplementationSelector);
+  TF_DISALLOW_COPY_AND_ASSIGN(ImplementationSelector);
 };
 
 }  // namespace grappler
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_EXPERIMENTAL_IMPLEMENTATION_SELECTOR_H_
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_IMPLEMENTATION_SELECTOR_H_
diff --git a/tensorflow/core/grappler/optimizers/experimental_implementation_selector_test.cc b/tensorflow/core/grappler/optimizers/implementation_selector_test.cc
similarity index 82%
rename from tensorflow/core/grappler/optimizers/experimental_implementation_selector_test.cc
rename to tensorflow/core/grappler/optimizers/implementation_selector_test.cc
index e330835e9bc4fea33928e376a3fd98ebe34a74ee..e2f58964a2a089a0cfda57449f288925ed71d858 100644
--- a/tensorflow/core/grappler/optimizers/experimental_implementation_selector_test.cc
+++ b/tensorflow/core/grappler/optimizers/implementation_selector_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/grappler/optimizers/experimental_implementation_selector.h"
+#include "tensorflow/core/grappler/optimizers/implementation_selector.h"
 
 #include <algorithm>
 #include <memory>
@@ -38,15 +38,14 @@ namespace {
 constexpr char CpuDevice[] = "/device:CPU:0";
 constexpr char GpuDevice[] = "/device:GPU:0";
 
-class ExperimentalImplementationSelectorTest : public GrapplerTest {};
+class ImplementationSelectorTest : public GrapplerTest {};
 
-TEST_F(ExperimentalImplementationSelectorTest, NoUpdate) {
+TEST_F(ImplementationSelectorTest, NoUpdate) {
   TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {CpuDevice});
   GrapplerItem item;
   CHECK(fake_input.NextItem(&item));
 
-  std::unique_ptr<CustomGraphOptimizer> optimizer(
-      new ExperimentalImplementationSelector);
+  std::unique_ptr<CustomGraphOptimizer> optimizer(new ImplementationSelector);
   ASSERT_NE(nullptr, optimizer);
   TF_ASSERT_OK(optimizer->Init());
 
@@ -58,19 +57,19 @@ TEST_F(ExperimentalImplementationSelectorTest, NoUpdate) {
   EXPECT_EQ(item.graph.node_size(), output.node_size());
 }
 
-TEST_F(ExperimentalImplementationSelectorTest, SwapImplementation) {
+TEST_F(ImplementationSelectorTest, SwapImplementation) {
   using test::function::NDef;
   auto cpu_def = test::function::XTimesTwo();
   auto* func_attr = cpu_def.mutable_attr();
-  (*func_attr)["experimental_api_implements"].set_s("times_two");
-  (*func_attr)["experimental_api_preferred_device"].set_s("CPU");
+  (*func_attr)["api_implements"].set_s("times_two");
+  (*func_attr)["api_preferred_device"].set_s("CPU");
 
   auto gpu_def = test::function::XAddX();
   auto* func2_attr = gpu_def.mutable_attr();
-  (*func2_attr)["experimental_api_implements"].set_s("times_two");
-  (*func2_attr)["experimental_api_preferred_device"].set_s("GPU");
+  (*func2_attr)["api_implements"].set_s("times_two");
+  (*func2_attr)["api_preferred_device"].set_s("GPU");
 
-  ExperimentalImplementationSelector optimizer;
+  ImplementationSelector optimizer;
   GraphDef output;
   GrapplerItem item;
   item.graph = test::function::GDef(
@@ -96,19 +95,19 @@ TEST_F(ExperimentalImplementationSelectorTest, SwapImplementation) {
   }
 }
 
-TEST_F(ExperimentalImplementationSelectorTest, SwapImplementationEval) {
+TEST_F(ImplementationSelectorTest, SwapImplementationEval) {
   using test::function::NDef;
   auto cpu_def = test::function::XTimesTwo();
   auto* func_attr = cpu_def.mutable_attr();
-  (*func_attr)["experimental_api_implements"].set_s("random_boost");
-  (*func_attr)["experimental_api_preferred_device"].set_s("CPU");
+  (*func_attr)["api_implements"].set_s("random_boost");
+  (*func_attr)["api_preferred_device"].set_s("CPU");
 
   auto gpu_def = test::function::XTimesFour();
   auto* func2_attr = gpu_def.mutable_attr();
-  (*func2_attr)["experimental_api_implements"].set_s("random_boost");
-  (*func2_attr)["experimental_api_preferred_device"].set_s("GPU");
+  (*func2_attr)["api_implements"].set_s("random_boost");
+  (*func2_attr)["api_preferred_device"].set_s("GPU");
 
-  ExperimentalImplementationSelector optimizer;
+  ImplementationSelector optimizer;
   GraphDef output;
   GrapplerItem item;
   item.graph = test::function::GDef(
@@ -133,7 +132,7 @@ TEST_F(ExperimentalImplementationSelectorTest, SwapImplementationEval) {
                                  test::AsScalar<float>(2.0f));
 }
 
-TEST_F(ExperimentalImplementationSelectorTest, SwapImplementationWithGradient) {
+TEST_F(ImplementationSelectorTest, SwapImplementationWithGradient) {
   using test::function::NDef;
   using FDH = FunctionDefHelper;
   // boost_1 returns the doubled input and a const as the internal state, the
@@ -146,8 +145,8 @@ TEST_F(ExperimentalImplementationSelectorTest, SwapImplementationWithGradient) {
       /* Mapping between function returns and function node outputs. */
       {{"z", "boost:z:0"}, {"s", "one:output:0"}});
   auto* boost_1_attr = boost_1.mutable_attr();
-  (*boost_1_attr)["experimental_api_implements"].set_s("random_boost");
-  (*boost_1_attr)["experimental_api_preferred_device"].set_s("CPU");
+  (*boost_1_attr)["api_implements"].set_s("random_boost");
+  (*boost_1_attr)["api_preferred_device"].set_s("CPU");
   (*boost_1_attr)["backward_function_name"].set_s("BoostCpuGradient");
 
   FunctionDef boost_1_gradient = FDH::Create(
@@ -157,8 +156,8 @@ TEST_F(ExperimentalImplementationSelectorTest, SwapImplementationWithGradient) {
       /* Mapping between function returns and function node outputs. */
       {{"dx", "grad:z:0"}});
   auto* boost_1_grad_attr = boost_1_gradient.mutable_attr();
-  (*boost_1_grad_attr)["experimental_api_implements"].set_s("random_boost");
-  (*boost_1_grad_attr)["experimental_api_preferred_device"].set_s("CPU");
+  (*boost_1_grad_attr)["api_implements"].set_s("random_boost");
+  (*boost_1_grad_attr)["api_preferred_device"].set_s("CPU");
   (*boost_1_grad_attr)["forward_function_name"].set_s("BoostCpu");
 
   // boost_2 return the input * 4, and with two extra internal states.
@@ -171,8 +170,8 @@ TEST_F(ExperimentalImplementationSelectorTest, SwapImplementationWithGradient) {
       /* Mapping between function returns and function node outputs. */
       {{"z", "boost:z:0"}, {"s1", "one:output:0"}, {"s2", "two:output:0"}});
   auto* boost_2_attr = boost_2_func.mutable_attr();
-  (*boost_2_attr)["experimental_api_implements"].set_s("random_boost");
-  (*boost_2_attr)["experimental_api_preferred_device"].set_s("GPU");
+  (*boost_2_attr)["api_implements"].set_s("random_boost");
+  (*boost_2_attr)["api_preferred_device"].set_s("GPU");
   (*boost_2_attr)["backward_function_name"].set_s("BoostGpuGradient");
 
   FunctionDef boost_2_gradient = FDH::Create(
@@ -182,8 +181,8 @@ TEST_F(ExperimentalImplementationSelectorTest, SwapImplementationWithGradient) {
       /* Mapping between function returns and function node outputs. */
       {{"dx", "grad:z:0"}});
   auto* boost_2_grad_attr = boost_2_gradient.mutable_attr();
-  (*boost_2_grad_attr)["experimental_api_implements"].set_s("random_boost");
-  (*boost_2_grad_attr)["experimental_api_preferred_device"].set_s("GPU");
+  (*boost_2_grad_attr)["api_implements"].set_s("random_boost");
+  (*boost_2_grad_attr)["api_preferred_device"].set_s("GPU");
   (*boost_2_grad_attr)["forward_function_name"].set_s("BoostGpu");
 
   // Define the forward function with f = boost2 function but with CPU device.
@@ -203,7 +202,7 @@ TEST_F(ExperimentalImplementationSelectorTest, SwapImplementationWithGradient) {
             {"f", FDH::FunctionRef("Boost2Gradient")}},
            CpuDevice);
 
-  ExperimentalImplementationSelector optimizer;
+  ImplementationSelector optimizer;
   GraphDef output;
   GrapplerItem item;
   item.graph = test::function::GDef(
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
index 8f25a1c8c1c48281fb44c01a142348863836d5aa..e9b706a58371cad72ef4b0652bc86364d7c4f5c0 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
@@ -503,6 +503,7 @@ class NodeProcessor : public GraphProcessor {
       UpdateAttrKSize();
       UpdateAttrStrides();
       UpdateAttrDilations();
+      UpdateAttrExplicitPaddings();
       UpdateAttrShape();
       TF_RETURN_IF_ERROR(AddLayoutTransposeToInputs());
       TF_RETURN_IF_ERROR(AddLayoutTransposeToOutputs());
@@ -753,6 +754,28 @@ class NodeProcessor : public GraphProcessor {
     }
   }
 
+  void UpdateAttrExplicitPaddings() {
+    if (node_->attr().find("explicit_paddings") != node_->attr().end()) {
+      auto list = node_->mutable_attr()->at("explicit_paddings").mutable_list();
+      int size = list->i_size();
+      if (size == 8) {
+        int64 height_before = list->i(2);
+        int64 height_after = list->i(3);
+        int64 width_before = list->i(4);
+        int64 width_after = list->i(5);
+        list->set_i(2, 0);
+        list->set_i(3, 0);
+        list->set_i(4, height_before);
+        list->set_i(5, height_after);
+        list->set_i(6, width_before);
+        list->set_i(7, width_after);
+      } else if (size != 0) {
+        LOG(ERROR) << "Cannot handle explicit_paddings attribute of size "
+                   << size;
+      }
+    }
+  }
+
   void UpdateAttrDataFormat() {
     if (node_->attr().find("data_format") != node_->attr().end()) {
       if (node_->attr().at("data_format").s().compare("NHWC") == 0) {
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
index 20e47c1b26b173c18eefd01ba7bdb87781a4c59b..eb2a8e87dde605d7a5867ca84f1c5260c42077e4 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/layout_optimizer.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/grappler/clusters/single_machine.h"
@@ -80,8 +81,13 @@ class LayoutOptimizerTest : public GrapplerTest {
     Output filter =
         ops::Const(s->WithOpName("Filter"), Input::Initializer(filter_data));
 
+    ops::Conv2D::Attrs attrs;
+    if (padding == "EXPLICIT") {
+      attrs = attrs.ExplicitPaddings({0, 0, 1, 2, 3, 4, 0, 0});
+    }
+
     Output conv = ops::Conv2D(s->WithOpName("Conv2D").WithDevice(device), input,
-                              filter, {1, stride, stride, 1}, padding);
+                              filter, {1, stride, stride, 1}, padding, attrs);
     return conv;
   }
 
@@ -100,6 +106,28 @@ class LayoutOptimizerTest : public GrapplerTest {
     int input_depth = 3;
     int filter_count = 2;
     int stride = 1;
+    int dilation = dilated ? 2 : 1;
+    int64 padding_top = 1;
+    int64 padding_bottom = 2;
+    int64 padding_left = 3;
+    int64 padding_right = 4;
+    int64 output_height;
+    int64 output_width;
+    Padding padding_enum;
+    if (padding == "SAME") {
+      padding_enum = SAME;
+    } else if (padding == "VALID") {
+      padding_enum = VALID;
+    } else {
+      CHECK_EQ(padding, "EXPLICIT");
+      padding_enum = EXPLICIT;
+    }
+    TF_CHECK_OK(GetWindowedOutputSizeVerboseV2(
+        input_height, filter_size, dilation, stride, padding_enum,
+        &output_height, &padding_top, &padding_bottom));
+    TF_CHECK_OK(GetWindowedOutputSizeVerboseV2(
+        input_width, filter_size, dilation, stride, padding_enum, &output_width,
+        &padding_left, &padding_right));
     TensorShape input_sizes_shape({4});
     Tensor input_data(DT_INT32, input_sizes_shape);
     test::FillValues<int>(&input_data,
@@ -112,8 +140,6 @@ class LayoutOptimizerTest : public GrapplerTest {
     Output filter =
         ops::Variable(s->WithOpName("Filter"), filter_shape, DT_FLOAT);
 
-    int output_height = input_height;
-    int output_width = input_width;
     TensorShape output_shape(
         {batch_size, output_height, output_width, filter_count});
     Tensor output_data(DT_FLOAT, output_shape);
@@ -124,10 +150,21 @@ class LayoutOptimizerTest : public GrapplerTest {
     Output conv_backprop_input;
     Output input_sizes_i =
         ops::Identity(s->WithOpName("InputSizesIdentity"), input_sizes);
-    ops::Conv2DBackpropInput::Attrs attrs;
-    if (dilated) {
-      attrs = attrs.Dilations({1, 2, 2, 1});
+    std::vector<int> dilations{1, dilation, dilation, 1};
+    std::vector<int> explicit_paddings;
+    if (padding == "EXPLICIT") {
+      explicit_paddings = {0,
+                           0,
+                           static_cast<int>(padding_top),
+                           static_cast<int>(padding_bottom),
+                           static_cast<int>(padding_left),
+                           static_cast<int>(padding_right),
+                           0,
+                           0};
     }
+    auto attrs =
+        ops::Conv2DBackpropInput::Attrs().Dilations(dilations).ExplicitPaddings(
+            explicit_paddings);
     if (const_input_size) {
       conv_backprop_input = ops::Conv2DBackpropInput(
           s->WithOpName("Conv2DBackpropInput"), input_sizes, filter, output,
@@ -186,7 +223,7 @@ class LayoutOptimizerTest : public GrapplerTest {
 
 TEST_F(LayoutOptimizerTest, Conv2DBackpropInput) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  auto conv = SimpleConv2DBackpropInput(&s, 7, 2, "SAME");
+  auto conv = SimpleConv2DBackpropInput(&s, 7, 2, "EXPLICIT");
   Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
@@ -306,6 +343,19 @@ TEST_F(LayoutOptimizerTest, NotEqualSizeWithValidPadding) {
   EXPECT_TRUE(node_map.GetNode("Conv2D-0-TransposeNHWCToNCHW-LayoutOptimizer"));
 }
 
+TEST_F(LayoutOptimizerTest, ExplicitPadding) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "EXPLICIT");
+  Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  EXPECT_TRUE(node_map.GetNode("Conv2D-0-TransposeNHWCToNCHW-LayoutOptimizer"));
+}
+
 TEST_F(LayoutOptimizerTest, Pad) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   auto conv = SimpleConv2D(&s, 4, 2, "VALID");
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer.cc b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
index 36064738408c744db53cb9e95645d6a2968b1746..54776e7f80c32ec1de70c1f132d0ebf31d50a72e 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
@@ -30,12 +30,14 @@ limitations under the License.
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/grappler/graph_topology_view.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/mutable_graph_view.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
 #include "tensorflow/core/grappler/optimizers/evaluation_utils.h"
 #include "tensorflow/core/grappler/utils/frame.h"
+#include "tensorflow/core/grappler/utils/traversal.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
@@ -451,16 +453,29 @@ Status LoopInvariantNodeMotionOptimizer::Optimize() {
 }
 
 std::vector<int> GetStackPushNodesToConvert(
-    const SimpleGraphView& graph_view,
+    const GraphTopologyView& graph_view,
     const std::unordered_set<string>& nodes_to_preserve, int stack_node_idx) {
   VLOG(1) << "Stack node: " << graph_view.graph()->node(stack_node_idx).name();
+
   const std::unordered_set<string> op_types_to_traverse(
       {"Stack", "StackV2", "Enter", "RefEnter", "Switch", "RefSwitch",
        "Identity", "RefIdentity"});
+  const auto is_op_to_traverse = [&](const NodeDef* node) -> bool {
+    return op_types_to_traverse.find(node->op()) != op_types_to_traverse.end();
+  };
+
   std::vector<int> nodes_to_convert;
-  std::set<int> fanout;
-  graph_view.DepthFirstSearch(op_types_to_traverse, stack_node_idx, &fanout);
-  for (int fanout_idx : fanout) {
+  std::vector<int> fanouts;
+
+  DfsTraversal(graph_view, {graph_view.GetNode(stack_node_idx)},
+               TraversalDirection::kFollowOutputs,
+               DfsPredicates::Advance(is_op_to_traverse),
+               DfsCallbacks::PreOrder([&](const NodeDef* node) {
+                 const absl::optional<int> idx = graph_view.GetNodeIndex(*node);
+                 fanouts.push_back(idx.value());
+               }));
+
+  for (int fanout_idx : fanouts) {
     const NodeDef& fanout_node = graph_view.graph()->node(fanout_idx);
     VLOG(1) << "Fanout " << fanout_idx << " : " << fanout_node.name();
     if (IsStackPushOp(fanout_node)) {
@@ -468,13 +483,12 @@ std::vector<int> GetStackPushNodesToConvert(
       // happen when the graph we have contains only the forward pass for a loop
       // (as when the forward and backward passes are split across different
       // functions).
-      if (graph_view.has_node(fanout_node.input(0))) {
-        const NodeDef* stack_node =
-            &graph_view.node(graph_view.index(fanout_node.input(0)));
+      if (graph_view.HasNode(fanout_node.input(0))) {
+        const NodeDef* stack_node = graph_view.GetNode(fanout_node.input(0));
         while (stack_node->op() != "Stack" && stack_node->op() != "StackV2" &&
                stack_node->input_size() > 0 &&
-               graph_view.has_node(stack_node->input(0))) {
-          stack_node = &graph_view.node(graph_view.index(stack_node->input(0)));
+               graph_view.HasNode(stack_node->input(0))) {
+          stack_node = graph_view.GetNode(stack_node->input(0));
         }
         if (nodes_to_preserve.find(stack_node->name()) ==
             nodes_to_preserve.end()) {
@@ -488,7 +502,7 @@ std::vector<int> GetStackPushNodesToConvert(
                    op_types_to_traverse.end()) {
       continue;
     } else if (!IsStackPopOp(fanout_node) ||
-               (!graph_view.outputs(fanout_idx).empty() ||
+               (!graph_view.GetFanout(fanout_idx).empty() ||
                 nodes_to_preserve.find(fanout_node.name()) !=
                     nodes_to_preserve.end())) {
       // The node is either a stack pop with consumers or something unexpected
@@ -497,14 +511,16 @@ std::vector<int> GetStackPushNodesToConvert(
       break;
     }
   }
+
   return nodes_to_convert;
 }
 
 Status RemoveStackOps(const std::unordered_set<string>& nodes_to_preserve,
                       GraphDef* optimized_graph) {
   NodeMap node_map(optimized_graph);
-  SimpleGraphView graph_view;
-  TF_RETURN_IF_ERROR(graph_view.Initialize(*optimized_graph));
+  GraphTopologyView graph_view;
+  TF_RETURN_IF_ERROR(graph_view.InitializeFromGraph(*optimized_graph));
+
   for (int node_idx = 0; node_idx < optimized_graph->node_size(); ++node_idx) {
     if (IsStackOp(optimized_graph->node(node_idx))) {
       for (int push_node_idx : GetStackPushNodesToConvert(
@@ -565,8 +581,19 @@ Status EvaluateBoolOpForConstantOperands(const NodeDef& op_node,
   return Status::OK();
 }
 
+// TODO(lyandy): Consolidate with ConstantFolding implementation.
+bool IsReallyConstant(const NodeDef& node,
+                      const absl::flat_hash_set<string>& feed_nodes) {
+  if (!IsConstant(node)) {
+    return false;
+  }
+  // If the node is fed it's not constant anymore.
+  return feed_nodes.find(node.name()) == feed_nodes.end();
+}
+
 Status CheckForDeadFanout(const MutableGraphView& view,
                           const NodeDef& switch_node, const NodeMap& node_map,
+                          const absl::flat_hash_set<string>& feed_nodes,
                           DeviceBase* cpu_device, ResourceMgr* resource_mgr,
                           bool* has_dead_fanout, int* dead_fanout) {
   *has_dead_fanout = false;
@@ -575,7 +602,7 @@ Status CheckForDeadFanout(const MutableGraphView& view,
       view.GetRegularFanin(switch_loopcond_port).node;
 
   // CASE 1: Control is a constant.
-  if (IsConstant(*switch_predicate)) {
+  if (IsReallyConstant(*switch_predicate, feed_nodes)) {
     Tensor selector;
     CHECK(selector.FromProto(switch_predicate->attr().at("value").tensor()));
     *has_dead_fanout = true;
@@ -614,7 +641,7 @@ Status CheckForDeadFanout(const MutableGraphView& view,
     if (IsMerge(*node)) {
       merge_node = node;
     }
-    if (IsConstant(*node)) {
+    if (IsReallyConstant(*node, feed_nodes)) {
       constant_ctrl_input = node;
       constant_index = i;
     }
@@ -630,7 +657,7 @@ Status CheckForDeadFanout(const MutableGraphView& view,
     if (IsEnter(*node)) {
       enter_node = node;
     }
-    if (IsConstant(*node)) {
+    if (IsReallyConstant(*node, feed_nodes)) {
       constant_init_node = node;
     }
   }
@@ -638,7 +665,7 @@ Status CheckForDeadFanout(const MutableGraphView& view,
     if (constant_init_node != nullptr) return Status::OK();
     for (const auto& input : enter_node->input()) {
       NodeDef* node = node_map.GetNode(input);
-      if (IsConstant(*node)) {
+      if (IsReallyConstant(*node, feed_nodes)) {
         constant_init_node = node;
       }
     }
@@ -694,8 +721,12 @@ Status LoopOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
     // TODO(srjoglekar): Figure out if we can optimize NodeMap creations across
     // optimizer passes.
     NodeMap node_map(optimized_graph);
-    TF_RETURN_IF_ERROR(
-        RemoveDeadBranches(item.NodesToPreserve(), node_map, optimized_graph));
+    absl::flat_hash_set<string> feed_nodes;
+    for (const auto& feed : item.feed) {
+      feed_nodes.insert(NodeName(feed.first));
+    }
+    TF_RETURN_IF_ERROR(RemoveDeadBranches(item.NodesToPreserve(), node_map,
+                                          feed_nodes, optimized_graph));
   }
 
   return Status::OK();
@@ -703,7 +734,8 @@ Status LoopOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
 
 Status LoopOptimizer::RemoveDeadBranches(
     const std::unordered_set<string>& nodes_to_preserve,
-    const NodeMap& node_map, GraphDef* optimized_graph) {
+    const NodeMap& node_map, const absl::flat_hash_set<string>& feed_nodes,
+    GraphDef* optimized_graph) {
   std::unordered_set<const NodeDef*> dead_nodes;
   std::unordered_map<NodeDef*, std::set<int>> dead_merge_inputs;
   // TODO(bsteiner): also rewrite switches as identity. For now we just record
@@ -721,9 +753,9 @@ Status LoopOptimizer::RemoveDeadBranches(
 
     int dead_fanout;
     bool has_dead_fanout;
-    TF_RETURN_IF_ERROR(CheckForDeadFanout(view, node, node_map, cpu_device_,
-                                          resource_mgr_.get(), &has_dead_fanout,
-                                          &dead_fanout));
+    TF_RETURN_IF_ERROR(CheckForDeadFanout(view, node, node_map, feed_nodes,
+                                          cpu_device_, resource_mgr_.get(),
+                                          &has_dead_fanout, &dead_fanout));
     if (!has_dead_fanout) {
       continue;
     }
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer.h b/tensorflow/core/grappler/optimizers/loop_optimizer.h
index d467237a9a704a81a0ecc1da71531868c7f3a49b..7fa1976f348391438d62ce51fb9b8f06f34e15a2 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer.h
@@ -60,7 +60,9 @@ class LoopOptimizer : public GraphOptimizer {
   };
 
   Status RemoveDeadBranches(const std::unordered_set<string>& nodes_to_preserve,
-                            const NodeMap& node_map, GraphDef* optimized_graph);
+                            const NodeMap& node_map,
+                            const absl::flat_hash_set<string>& feed_nodes,
+                            GraphDef* optimized_graph);
 
   RewriterConfig::Toggle opt_level_;
   DeviceBase* cpu_device_;
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc b/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc
index 587767c23c370ca1f747fc5b4e2bfa4cba3ae10d..db4494d42e774bf2db3791564d062141e00d1609 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc
@@ -504,11 +504,11 @@ void VerifyGraphsEqual(const GraphDef& original_graph,
   for (int i = 0; i < original_graph.node_size(); ++i) {
     const NodeDef& original = original_graph.node(i);
     const NodeDef& optimized = optimized_graph.node(i);
-    EXPECT_EQ(original.name(), optimized.name()) << func;
-    EXPECT_EQ(original.op(), optimized.op()) << func;
-    EXPECT_EQ(original.input_size(), optimized.input_size()) << func;
+    EXPECT_EQ(optimized.name(), original.name()) << func;
+    EXPECT_EQ(optimized.op(), original.op()) << func;
+    ASSERT_EQ(optimized.input_size(), original.input_size()) << func;
     for (int j = 0; j < original.input_size(); ++j) {
-      EXPECT_EQ(original.input(j), optimized.input(j)) << func;
+      EXPECT_EQ(optimized.input(j), original.input(j)) << func;
     }
   }
 }
@@ -528,7 +528,7 @@ TEST_F(LoopOptimizerTest, NoOp) {
   VerifyGraphsEqual(item.graph, output, __FUNCTION__);
 }
 
-TEST_F(LoopOptimizerTest, RemovePush_NoOp) {
+TEST_F(LoopOptimizerTest, RemovePushNoOp) {
   GrapplerItem item;
   GraphDef& graph = item.graph;
   AddSimpleNode("c", "Const", {}, &graph);
@@ -557,7 +557,7 @@ TEST_F(LoopOptimizerTest, RemovePush_NoOp) {
   VerifyGraphsEqual(item.graph, output, __FUNCTION__);
 }
 
-TEST_F(LoopOptimizerTest, RemovePush_NoPopButStackLives) {
+TEST_F(LoopOptimizerTest, RemovePushNoPopButStackLives) {
   GrapplerItem item;
   GraphDef& graph = item.graph;
   AddSimpleNode("c", "Const", {}, &graph);
@@ -609,32 +609,32 @@ TEST_F(LoopOptimizerTest, RemovePushWithoutMatchingPop) {
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
-  EXPECT_EQ(13, output.node_size());
+  EXPECT_EQ(output.node_size(), 13);
   for (int i = 0; i < output.node_size(); ++i) {
     const NodeDef& node = output.node(i);
     if (node.name() == "push1") {
-      EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("c", node.input(0));
-      EXPECT_EQ("^stack1", node.input(1));
+      EXPECT_EQ(node.op(), "Identity");
+      ASSERT_EQ(node.input_size(), 2);
+      EXPECT_EQ(node.input(0), "c");
+      EXPECT_EQ(node.input(1), "^stack1");
     } else if (node.name() == "push2") {
-      EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("enter_c", node.input(0));
-      EXPECT_EQ("^enter_stack2", node.input(1));
+      EXPECT_EQ(node.op(), "Identity");
+      ASSERT_EQ(node.input_size(), 2);
+      EXPECT_EQ(node.input(0), "enter_c");
+      EXPECT_EQ(node.input(1), "^enter_stack2");
     } else if (node.name() == "push3") {
-      EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("c", node.input(0));
-      EXPECT_EQ("^stack3", node.input(1));
+      EXPECT_EQ(node.op(), "Identity");
+      ASSERT_EQ(node.input_size(), 2);
+      EXPECT_EQ(node.input(0), "c");
+      EXPECT_EQ(node.input(1), "^stack3");
     } else {
       const NodeDef& orig_node = item.graph.node(i);
-      EXPECT_EQ(orig_node.ShortDebugString(), node.ShortDebugString());
+      EXPECT_EQ(node.ShortDebugString(), orig_node.ShortDebugString());
     }
   }
 }
 
-TEST_F(LoopOptimizerTest, RemoveDeadBranches_ConstantCondition) {
+TEST_F(LoopOptimizerTest, RemoveDeadBranchesConstantCondition) {
   Scope scope = Scope::NewRootScope();
   Output v_in = ops::Variable(scope.WithOpName("v_in"), {3}, DT_FLOAT);
 
@@ -691,57 +691,57 @@ TEST_F(LoopOptimizerTest, RemoveDeadBranches_ConstantCondition) {
 
   for (const NodeDef& node : output.node()) {
     // These nodes should have been pruned
-    EXPECT_NE("Square1", node.name());
-    EXPECT_NE("Sqrt2", node.name());
-    EXPECT_NE("m5", node.name());
-    EXPECT_NE("m7", node.name());
+    EXPECT_NE(node.name(), "Square1");
+    EXPECT_NE(node.name(), "Sqrt2");
+    EXPECT_NE(node.name(), "m5");
+    EXPECT_NE(node.name(), "m7");
 
     if (node.name() == "m1") {
       // sqrt1 is dead
-      EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("square1", node.input(0));
+      EXPECT_EQ(node.op(), "Identity");
+      ASSERT_EQ(node.input_size(), 1);
+      EXPECT_EQ(node.input(0), "square1");
     } else if (node.name() == "m2") {
       // both inputs are alive
-      EXPECT_EQ("Merge", node.op());
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("v_in", node.input(0));
-      EXPECT_EQ("square1", node.input(1));
+      EXPECT_EQ(node.op(), "Merge");
+      ASSERT_EQ(node.input_size(), 2);
+      EXPECT_EQ(node.input(0), "v_in");
+      EXPECT_EQ(node.input(1), "square1");
     } else if (node.name() == "m3") {
       // sqrt1 is dead
-      EXPECT_EQ("Identity", node.op());
-      EXPECT_EQ(1, node.input_size());
-      EXPECT_EQ("v_in", node.input(0));
+      EXPECT_EQ(node.op(), "Identity");
+      ASSERT_EQ(node.input_size(), 1);
+      EXPECT_EQ(node.input(0), "v_in");
     } else if (node.name() == "m4") {
       // both inputs are alive
-      EXPECT_EQ("Merge", node.op());
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("square1", node.input(0));
-      EXPECT_EQ("sqrt2", node.input(1));
+      EXPECT_EQ(node.op(), "Merge");
+      ASSERT_EQ(node.input_size(), 2);
+      EXPECT_EQ(node.input(0), "square1");
+      EXPECT_EQ(node.input(1), "sqrt2");
     } else if (node.name() == "m6") {
       // both inputs are alive and the control dependency can get triggered
-      EXPECT_EQ("Merge", node.op());
-      EXPECT_EQ(3, node.input_size());
-      EXPECT_EQ("v_in", node.input(0));
-      EXPECT_EQ("square1", node.input(1));
-      EXPECT_EQ("^sqrt2", node.input(2));
+      EXPECT_EQ(node.op(), "Merge");
+      ASSERT_EQ(node.input_size(), 3);
+      EXPECT_EQ(node.input(0), "v_in");
+      EXPECT_EQ(node.input(1), "square1");
+      EXPECT_EQ(node.input(2), "^sqrt2");
     } else if (node.name() == "m8") {
       // The node is to be preserved because of a fetch
-      EXPECT_EQ("Merge", node.op());
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("id1", node.input(0));
-      EXPECT_EQ("id2", node.input(1));
+      EXPECT_EQ(node.op(), "Merge");
+      ASSERT_EQ(node.input_size(), 2);
+      EXPECT_EQ(node.input(0), "id1");
+      EXPECT_EQ(node.input(1), "id2");
     } else if (node.name() == "m9") {
       // The node is to be preserved because of a fetch
-      EXPECT_EQ("Merge", node.op());
-      EXPECT_EQ(2, node.input_size());
-      EXPECT_EQ("id3", node.input(0));
-      EXPECT_EQ("id4", node.input(1));
+      EXPECT_EQ(node.op(), "Merge");
+      ASSERT_EQ(2, node.input_size());
+      EXPECT_EQ(node.input(0), "id3");
+      EXPECT_EQ(node.input(1), "id4");
     }
   }
 }
 
-TEST_F(LoopOptimizerTest, RemoveDeadBranches_FullyRemoveDeadBranches) {
+TEST_F(LoopOptimizerTest, RemoveDeadBranchesFullyRemoveDeadBranches) {
   const string gdef_ascii = R"EOF(
 node {
   name: "episodicreplaybuffer_add_readvariableop_resource"
@@ -1153,7 +1153,7 @@ versions {
       << "Merge node was deleted, but it shouldn't have been.";
 }
 
-TEST_F(LoopOptimizerTest, RemoveDeadBranches_ZeroIterWhile) {
+TEST_F(LoopOptimizerTest, RemoveDeadBranchesZeroIterWhile) {
   const string gdef_ascii = R"EOF(
 node {
   name: "Const"
@@ -1358,15 +1358,15 @@ versions {
   CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii, &item.graph));
   item.fetch = {"while/Exit"};
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
-  EXPECT_EQ(1, tensors_expected.size());
+  ASSERT_EQ(tensors_expected.size(), 1);
 
   LoopOptimizer optimizer(RewriterConfig::AGGRESSIVE, nullptr);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_CHECK_OK(status);
   auto tensors_got = EvaluateNodes(output, item.fetch);
-  EXPECT_EQ(1, tensors_got.size());
-  test::ExpectTensorEqual<int32>(tensors_expected[0], tensors_got[0]);
+  ASSERT_EQ(tensors_got.size(), 1);
+  test::ExpectTensorEqual<int32>(tensors_got[0], tensors_expected[0]);
 
   int nodes_present = 0;
   for (const NodeDef& node : output.node()) {
@@ -1382,7 +1382,200 @@ versions {
     }
     ++nodes_present;
   }
-  EXPECT_EQ(8, nodes_present);
+  EXPECT_EQ(nodes_present, 8);
+}
+
+TEST_F(LoopOptimizerTest, RemoveDeadBranchesConstantFeed) {
+  const string gdef_ascii = R"EOF(
+node {
+  name: "Const"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        string_val: "I\'m a value!"
+      }
+    }
+  }
+}
+node {
+  name: "cond/Switch_1"
+  op: "Switch"
+  input: "Const"
+  input: "Const_1"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "_class"
+    value {
+      list {
+        s: "loc:@Const"
+      }
+    }
+  }
+}
+node {
+  name: "Const_1"
+  op: "Const"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_BOOL
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_BOOL
+        tensor_shape {
+        }
+        bool_val: true
+      }
+    }
+  }
+}
+node {
+  name: "cond/Switch"
+  op: "Switch"
+  input: "Const_1"
+  input: "Const_1"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+}
+node {
+  name: "cond/switch_t"
+  op: "Identity"
+  input: "cond/Switch:1"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_BOOL
+    }
+  }
+}
+node {
+  name: "cond/Const"
+  op: "Const"
+  input: "^cond/switch_t"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_STRING
+        tensor_shape {
+          dim {
+            size: 1
+          }
+        }
+        string_val: ""
+      }
+    }
+  }
+}
+node {
+  name: "cond/Merge"
+  op: "Merge"
+  input: "cond/Switch_1"
+  input: "cond/Const"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "N"
+    value {
+      i: 2
+    }
+  }
+  attr {
+    key: "T"
+    value {
+      type: DT_STRING
+    }
+  }
+}
+node {
+  name: "Identity"
+  op: "Identity"
+  input: "cond/Merge"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_STRING
+    }
+  }
+}
+library {
+}
+versions {
+  producer: 27
+}
+  )EOF";
+
+  GrapplerItem item;
+  CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii, &item.graph));
+  item.fetch = {"Identity"};
+  Tensor feed_tensor(DT_BOOL, {});
+  feed_tensor.flat<bool>()(0) = false;
+  item.feed.push_back({"Const_1", feed_tensor});
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+  ASSERT_EQ(tensors_expected.size(), 1);
+
+  LoopOptimizer optimizer(RewriterConfig::AGGRESSIVE, nullptr);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_CHECK_OK(status);
+  auto tensors_got = EvaluateNodes(output, item.fetch);
+  ASSERT_EQ(tensors_got.size(), 1);
+  test::ExpectTensorEqual<string>(tensors_got[0], tensors_expected[0]);
+
+  EXPECT_EQ(output.node_size(), 8);
+
+  // No rewrite because branch has a constant feed node.
+  bool found = false;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "cond/Merge") {
+      EXPECT_EQ(node.op(), "Merge");
+      ASSERT_EQ(node.input_size(), 2);
+      EXPECT_EQ(node.input(0), "cond/Switch_1");
+      EXPECT_EQ(node.input(1), "cond/Const");
+      found = true;
+      break;
+    }
+  }
+  EXPECT_TRUE(found);
 }
 
 }  // namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer.cc b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
index 227c2bb8b0f3d3e6809f65f3b3716270b0c2c6e5..b50d50f84245a5910ccf9cde5166465f4d9e9310 100644
--- a/tensorflow/core/grappler/optimizers/memory_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/costs/graph_memory.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/costs/utils.h"
+#include "tensorflow/core/grappler/graph_topology_view.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/mutable_graph_view.h"
 #include "tensorflow/core/grappler/op_types.h"
@@ -188,13 +189,14 @@ std::vector<RecomputedSubGraph> GetOpGroupsToRecompute(
       }
     }
     // Recompute only nodes which eventually feed into a target node.
-    connected_subgraph(node_map,
-                       true,   // Collect inputs
-                       false,  // Collect outputs
-                       [&unpruned_recompute_nodes](const NodeDef& node) {
-                         return unpruned_recompute_nodes.count(&node) != 0;
-                       },
-                       &current_recomputation.recomputed_source_nodes);
+    connected_subgraph(
+        node_map,
+        true,   // Collect inputs
+        false,  // Collect outputs
+        [&unpruned_recompute_nodes](const NodeDef& node) {
+          return unpruned_recompute_nodes.count(&node) != 0;
+        },
+        &current_recomputation.recomputed_source_nodes);
     if (current_recomputation.target_nodes.empty()) {
       continue;
     }
@@ -498,6 +500,16 @@ bool SchedulingPass(Cluster* cluster, GrapplerItem* item) {
   // Look for AddN nodes (and equivalent) and record input names.
   MutableGraphView view(&item->graph);
 
+  // It's ok to use immutable GraphTopologyView here, because we do not destroy
+  // any of the nodes in the underlying graph, we only add new nodes.
+  GraphTopologyView graph_topology;
+  Status initialized_topology = graph_topology.InitializeFromGraph(item->graph);
+  if (!initialized_topology.ok()) {
+    VLOG(1) << "Failed to initialize graph topology view: "
+            << initialized_topology.error_message();
+    return false;
+  }
+
   std::unordered_map<string, std::unordered_set<NodeDef*>> addn_list;
   for (NodeDef& node : *item->graph.mutable_node()) {
     if (!IsAddN(node) && node.op() != "AccumulateNV2") {
@@ -579,12 +591,11 @@ bool SchedulingPass(Cluster* cluster, GrapplerItem* item) {
 
     // Compute a topological ordering for the node fanin.
     std::unordered_map<const NodeDef*, int> topo_order;
-    ReverseDfs(view, {node}, nullptr,
-               [&topo_order](const NodeDef* n) {
-                 int topo_index = topo_order.size();
-                 topo_order[n] = topo_index;
-               },
-               nullptr);
+    DfsTraversal(graph_topology, {node}, TraversalDirection::kFollowInputs,
+                 DfsCallbacks::PostOrder([&topo_order](const NodeDef* n) {
+                   int topo_index = static_cast<int>(topo_order.size());
+                   topo_order[n] = topo_index;
+                 }));
 
     std::vector<int> input_topo_index;
 
@@ -702,6 +713,13 @@ Status BuildSwapPair(NodeDef* node, int input_to_swap,
                      const std::unordered_map<string, const NodeDef*>& name_map,
                      GraphDef* graph,
                      std::pair<NodeDef*, NodeDef*>* swap_pair) {
+  string task, device;
+  if (!DeviceNameUtils::SplitDeviceName(node->device(), &task, &device) ||
+      !str_util::StrContains(device, DEVICE_GPU)) {
+    return errors::InvalidArgument("Can't swap input ", input_to_swap,
+                                   " of node ", node->name(),
+                                   " since it is not on GPU");
+  }
   const OpDef* op_def;
   TF_RETURN_IF_ERROR(OpRegistry::Global()->LookUpOpDef(node->op(), &op_def));
   DataType input_type;
@@ -1252,46 +1270,55 @@ Status RelaxAllocatorConstraints(GraphDef* optimized_graph) {
     return Status::OK();
   }
 
-  std::unordered_set<int> optimized_nodes;
-  SimpleGraphView graph_view;
-  TF_RETURN_IF_ERROR(graph_view.Initialize(*optimized_graph));
+  GraphTopologyView graph_view;
+  TF_RETURN_IF_ERROR(graph_view.InitializeFromGraph(*optimized_graph));
+  std::unordered_set<const NodeDef*> optimized_nodes;
+
   for (int i : assign_nodes) {
-    if (optimized_nodes.find(i) == optimized_nodes.end()) {
-      const NodeDef& assign_node = optimized_graph->node(i);
-      optimized_nodes.insert(i);
-      std::vector<int> assign_nodes_in_fanout;
-      assign_nodes_in_fanout.push_back(i);
-      std::set<int> transitive_fanout;
-      graph_view.DepthFirstSearch(std::unordered_set<string>{}, i,
-                                  &transitive_fanout);
+    const NodeDef& assign_node = optimized_graph->node(i);
+
+    if (optimized_nodes.find(&assign_node) == optimized_nodes.end()) {
+      std::vector<const NodeDef*> assign_nodes_in_fanout;
+      optimized_nodes.insert(&assign_node);
+      assign_nodes_in_fanout.push_back(&assign_node);
+
+      std::vector<const NodeDef*> transitive_fanout;
+      DfsTraversal(graph_view, {graph_view.GetNode(i)},
+                   TraversalDirection::kFollowOutputs,
+                   DfsCallbacks::PreOrder([&](const NodeDef* node) {
+                     transitive_fanout.push_back(node);
+                   }));
+
       bool relax_constraint = true;
       // If all nodes in the transitive fanout are on the same device as the
       // assign node, there is no need to allocate the output in pinned memory.
-      for (int fanout : transitive_fanout) {
-        const NodeDef& fanout_node = optimized_graph->node(fanout);
+      for (const NodeDef* fanout_node : transitive_fanout) {
+        // const NodeDef& fanout_node = optimized_graph->node(fanout);
         if (relax_constraint &&
-            (IsSend(fanout_node) ||
-             CrossesTaskOrCpuGpuBoundary(fanout_node, assign_node))) {
+            (IsSend(*fanout_node) ||
+             CrossesTaskOrCpuGpuBoundary(*fanout_node, assign_node))) {
           relax_constraint = false;
           break;
         }
-        if (optimized_nodes.find(fanout) == optimized_nodes.end() &&
-            IsAssign(fanout_node)) {
-          assign_nodes_in_fanout.push_back(fanout);
+        if (optimized_nodes.find(fanout_node) == optimized_nodes.end() &&
+            IsAssign(*fanout_node)) {
+          assign_nodes_in_fanout.push_back(fanout_node);
         }
       }
 
       if (relax_constraint) {
-        for (int assign_idx : assign_nodes_in_fanout) {
+        for (const NodeDef* assign_node_in_fanout : assign_nodes_in_fanout) {
           // If all devices match in fanout of node(i) then, by transitivity,
           // they must also match in the fanout of other assign nodes
           // in the fanout of node(i), so we can process them here,
           // and save computing their transitive fanout later.
-          optimized_nodes.insert(assign_idx);
+          optimized_nodes.insert(assign_node_in_fanout);
 
           // Set an attribute telling AssignOp to ignore allocator constraints.
+          const absl::optional<int> assign_node_idx =
+              graph_view.GetNodeIndex(*assign_node_in_fanout);
           NodeDef* assign_node_to_relax =
-              optimized_graph->mutable_node(assign_idx);
+              optimized_graph->mutable_node(assign_node_idx.value());
           (*assign_node_to_relax
                 ->mutable_attr())["_grappler_relax_allocator_constraints"]
               .set_b(true);
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index 572cc41d765f5b0e285bbff3ff600c15fbed1431..36d68a7b0fbd256290c030043d2ddb5e471c6147 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -18,14 +18,16 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/versions.pb.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/grappler/clusters/virtual_cluster.h"
 #include "tensorflow/core/grappler/optimizers/arithmetic_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/auto_parallel.h"
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
 #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
 #include "tensorflow/core/grappler/optimizers/debug_stripper.h"
 #include "tensorflow/core/grappler/optimizers/dependency_optimizer.h"
-#include "tensorflow/core/grappler/optimizers/experimental_implementation_selector.h"
 #include "tensorflow/core/grappler/optimizers/function_optimizer.h"
+#include "tensorflow/core/grappler/optimizers/implementation_selector.h"
 #include "tensorflow/core/grappler/optimizers/layout_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/loop_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/memory_optimizer.h"
@@ -37,6 +39,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/utils/colocation.h"
 #include "tensorflow/core/grappler/utils/functions.h"
 #include "tensorflow/core/grappler/utils/topological_sort.h"
+#include "tensorflow/core/grappler/verifiers/structure_verifier.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/util/dump_graph.h"
@@ -121,7 +124,8 @@ std::unique_ptr<GraphOptimizer> MetaOptimizer::MakeNewOptimizer(
   MK_OPT("scoped_allocator",
          new ScopedAllocatorOptimizer(cfg_.scoped_allocator_optimization(),
                                       cfg_.scoped_allocator_opts()));
-  MK_OPT("small_op", new PinToHostOptimizer(cfg_.pin_to_host_optimization()));
+  MK_OPT("pin_to_host",
+         new PinToHostOptimizer(cfg_.pin_to_host_optimization()));
 
   return std::unique_ptr<GraphOptimizer>();
 }
@@ -144,6 +148,9 @@ Status MetaOptimizer::InitializeOptimizers(
   if (!cfg_.disable_model_pruning()) {
     optimizers->push_back(MakeUnique<ModelPruner>());
   }
+  if (cfg_.implementation_selector() != RewriterConfig::OFF) {
+    optimizers->push_back(MakeUnique<ImplementationSelector>());
+  }
   if (cfg_.function_optimization() != RewriterConfig::OFF) {
     optimizers->push_back(
         MakeUnique<FunctionOptimizer>(cfg_.function_optimization()));
@@ -161,7 +168,7 @@ Status MetaOptimizer::InitializeOptimizers(
   if (cfg_.remapping() != RewriterConfig::OFF) {
     optimizers->push_back(MakeUnique<Remapper>(cfg_.remapping()));
   }
-  if (cfg_.pin_to_host_optimization() == RewriterConfig::ON) {
+  if (cfg_.pin_to_host_optimization() != RewriterConfig::OFF) {
     optimizers->push_back(MakeUnique<PinToHostOptimizer>());
   }
   if (cfg_.arithmetic_optimization() != RewriterConfig::OFF) {
@@ -237,18 +244,10 @@ Status MetaOptimizer::InitializeCustomGraphOptimizers(
         pre_initialized_optimizers.end()) {
       continue;
     }
-    // Initialize the ExperimentalImplementationSelector here instead of
-    // CustomizeOptimizer registry, due the static link issue in TensorRT for
-    // double registry.
-    // TODO(laigd): Remove this hack and change it back to use the registry once
-    // the duplicate static import issue is fixed.
-    std::unique_ptr<CustomGraphOptimizer> custom_optimizer;
-    if (optimizer_config.name() == "ExperimentalImplementationSelector") {
-      custom_optimizer.reset(new ExperimentalImplementationSelector());
-    } else {
-      custom_optimizer = CustomGraphOptimizerRegistry::CreateByNameOrNull(
-          optimizer_config.name());
-    }
+
+    auto custom_optimizer = CustomGraphOptimizerRegistry::CreateByNameOrNull(
+        optimizer_config.name());
+
     if (custom_optimizer) {
       VLOG(2) << "Registered custom configurable graph optimizer: "
               << optimizer_config.name();
@@ -282,6 +281,20 @@ MetaOptimizer::GetCustomGraphOptimizerConfig(const string& name) const {
   return nullptr;
 }
 
+void MetaOptimizer::InitializeVerifiers(
+    std::vector<std::unique_ptr<GraphVerifier>>* inter_optimizer_verifiers,
+    std::vector<std::unique_ptr<GraphVerifier>>* post_optimization_verifiers)
+    const {
+  if (cfg_.inter_optimizer_verifier_config().structure_verifier() ==
+      VerifierConfig::ON) {
+    inter_optimizer_verifiers->push_back(MakeUnique<StructureVerifier>());
+  }
+  if (cfg_.post_optimization_verifier_config().structure_verifier() ==
+      VerifierConfig::ON) {
+    post_optimization_verifiers->push_back(MakeUnique<StructureVerifier>());
+  }
+}
+
 #define RUN_OPTIMIZER_OR_RETURN_IF_ERROR(optimizer)                            \
   {                                                                            \
     const Status status = RunOptimizer(optimizer, cluster, &optimized_item,    \
@@ -312,6 +325,23 @@ Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
     TF_RETURN_IF_ERROR(InitializeOptimizersByName(&optimizers));
   }
 
+  // Initialize the configured verifiers.
+  std::vector<std::unique_ptr<GraphVerifier>> inter_optimizer_verifiers;
+  std::vector<std::unique_ptr<GraphVerifier>> post_optimization_verifiers;
+  InitializeVerifiers(&inter_optimizer_verifiers, &post_optimization_verifiers);
+  if (inter_optimizer_verifiers.empty()) {
+    VLOG(2) << "No inter optimizer verifiers have been configured";
+  } else {
+    VLOG(2) << inter_optimizer_verifiers.size()
+            << " inter optimizer verifiers have been configured";
+  }
+  if (post_optimization_verifiers.empty()) {
+    VLOG(2) << "No post optimization verifiers have been configured";
+  } else {
+    VLOG(2) << post_optimization_verifiers.size()
+            << " post optimization verifiers have been configured";
+  }
+
   VLOG(2) << "Optimize GrapplerItem: item.id=" << item.id
           << " num_optimizers=" << optimizers.size()
           << ", num nodes = " << item.graph.node_size();
@@ -342,6 +372,12 @@ Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
     }
 
     VLOG(4) << "Starting optimization iteration " << iteration;
+    if (VLOG_IS_ON(4)) {
+      DumpGraphDefToFile(
+          strings::StrCat("before_MetaOptimizer_iteration_", iteration, "_",
+                          reinterpret_cast<uintptr_t>(optimized_graph)),
+          *optimized_graph);
+    }
     for (const auto& optimizer : optimizers) {
       GRAPPLER_RETURN_IF_DEADLINE_EXCEEDED();
       // Some optimizers can run only once.
@@ -356,6 +392,28 @@ Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
         continue;
       }
       RUN_OPTIMIZER_OR_RETURN_IF_ERROR(optimizer.get());
+
+      if (VLOG_IS_ON(4)) {
+        DumpGraphDefToFile(
+            strings::StrCat("after_MetaOptimizer_iteration_", iteration, "_",
+                            optimizer->name(), "_",
+                            reinterpret_cast<uintptr_t>(optimized_graph)),
+            *optimized_graph);
+      }
+      for (const auto& verifier : inter_optimizer_verifiers) {
+        // TODO(ashwinm): Need to enforce verification_deadline.
+        TF_RETURN_IF_ERROR(verifier->Verify(*optimized_graph));
+      }
+    }
+    if (VLOG_IS_ON(4)) {
+      DumpGraphDefToFile(
+          strings::StrCat("after_MetaOptimizer_iteration_", iteration, "_",
+                          reinterpret_cast<uintptr_t>(optimized_graph)),
+          *optimized_graph);
+    }
+    // TODO(ashwinm): Need to enforce verification_deadline.
+    for (const auto& verifier : post_optimization_verifiers) {
+      TF_RETURN_IF_ERROR(verifier->Verify(*optimized_graph));
     }
   }
 
@@ -425,6 +483,14 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   VLOG(1) << "Starting optimization for grappler item: " << item.id;
   optimization_results_.clear();
 
+  // Constructs a FunctionLibraryDefinition with functions that are reachable
+  // from the nodes of the graph.
+  const auto minimized_flib =
+      [](const GraphDef& graph) -> FunctionLibraryDefinition {
+    return FunctionLibraryDefinition(OpRegistry::Global(), graph.library())
+        .ReachableDefinitions(graph);
+  };
+
   // 0. Original graph might contain a huge function library, that is mostly
   // unused. This library copied over by each individual Grappler optimizer,
   // which adds a huge overhead. Before starting optimization passes we just
@@ -434,11 +500,7 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   GraphDef trimmed_graph;  // do not copy graph with a potentially huge library
   *trimmed_graph.mutable_node() = item.graph.node();
   *trimmed_graph.mutable_versions() = item.graph.versions();
-  *trimmed_graph.mutable_library() =
-      grappler::ReachableFunctionLibraryDefinition(
-          FunctionLibraryDefinition(OpRegistry::Global(), item.graph.library()),
-          item.graph)
-          .ToProto();
+  *trimmed_graph.mutable_library() = minimized_flib(item.graph).ToProto();
 
   GrapplerItem trimmed_item = item.WithGraph(std::move(trimmed_graph));
 
@@ -464,16 +526,16 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   if (IsTPUGraphDef(*optimized_graph)) {
     VLOG(2) << "Skipping optimizing funcs for TPU graphs";
     if (VLOG_IS_ON(1)) {
-      DumpGraphDefToFile("after_MetaOptimizer", *optimized_graph);
+      DumpGraphDefToFile(
+          strings::StrCat("after_MetaOptimizer_",
+                          reinterpret_cast<uintptr_t>(optimized_graph)),
+          *optimized_graph);
     }
     return Status::OK();
   }
 
   // 2. Optimize functions reachable from the optimized graph.
-  FunctionLibraryDefinition flib = ReachableFunctionLibraryDefinition(
-      FunctionLibraryDefinition(OpRegistry::Global(),
-                                optimized_graph->library()),
-      *optimized_graph);
+  FunctionLibraryDefinition flib = minimized_flib(*optimized_graph);
 
   // Find functions for which we might need to compute a gradient at runtime.
   absl::flat_hash_set<string> differentiable_functions;
@@ -486,7 +548,8 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
 
   // Optimize each function only once.
   absl::flat_hash_set<string> optimized_funcs;
-  bool optimize_function_library = true;
+  bool optimize_function_library =
+      item.optimization_options().optimize_function_library;
 
   while (optimize_function_library) {
     optimize_function_library = false;
@@ -524,7 +587,8 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
       // can't perform non-differentiable rewrites.
       if (differentiable_functions.find(func_name) !=
           differentiable_functions.end()) {
-        func_item.allowed_optimizations().non_differentiable_rewrites = false;
+        func_item.optimization_options().allow_non_differentiable_rewrites =
+            false;
       }
 
       // Function item is allowed to use all devices from the main graph.
@@ -533,9 +597,12 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
         VLOG(3) << added_devices.error_message();
       }
 
-      // We can safely inline nested function calls with side-effectful ops into
-      // the function body (see function_optimizer.cc for details).
-      func_item.allowed_optimizations().inline_ops_with_side_effects = true;
+      // We are not allowed to prune certain types of ops from the graph
+      // instantiated by the function definition, because we must guarantee
+      // function execution semantics wrt side effects (see
+      // function_optimizer.cc).
+      func_item.optimization_options().allow_pruning_stateful_and_dataset_ops =
+          false;
 
       // Optimize function body graph.
       GraphDef optimized_func_graph;
@@ -570,7 +637,10 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
           << " functions: " << str_util::Join(optimized_funcs, ", ");
 
   if (VLOG_IS_ON(1)) {
-    DumpGraphDefToFile("after_MetaOptimizer", *optimized_graph);
+    DumpGraphDefToFile(
+        strings::StrCat("after_MetaOptimizer_",
+                        reinterpret_cast<uintptr_t>(optimized_graph)),
+        *optimized_graph);
   }
   return Status::OK();
 }
@@ -607,7 +677,7 @@ bool MetaOptimizerEnabled(const ConfigProto& cfg) {
          rewrite_cfg.memory_optimization() != RewriterConfig::NO_MEM_OPT ||
          rewrite_cfg.debug_stripper() == RewriterConfig::ON ||
          rewrite_cfg.scoped_allocator_optimization() == RewriterConfig::ON ||
-         rewrite_cfg.pin_to_host_optimization() == RewriterConfig::ON ||
+         rewrite_cfg.pin_to_host_optimization() != RewriterConfig::OFF ||
          !rewrite_cfg.optimizers().empty() ||
          !rewrite_cfg.custom_optimizers().empty();
 }
@@ -625,5 +695,86 @@ Status RunMetaOptimizer(const GrapplerItem& item, const ConfigProto& cfg,
   return status;
 }
 
+Status OptimizeGraph(
+    std::vector<string> ret_node_names, std::vector<string> keep_node_names,
+    FunctionLibraryDefinition* flib, const DeviceSet& device_set,
+    Device* cpu_device, const ConfigProto& config_proto,
+    const string& grappler_item_id,
+    const GrapplerItem::OptimizationOptions& optimization_options,
+    std::unique_ptr<tensorflow::Graph>* g) {
+  if (!tensorflow::grappler::MetaOptimizerEnabled(config_proto)) {
+    return Status::OK();
+  }
+
+  tensorflow::grappler::GrapplerItem item;
+  item.id = grappler_item_id;
+  item.optimization_options() = optimization_options;
+
+  // Add all available devices so that inlined function can be placed.
+  for (const Device* d : device_set.devices()) {
+    Status added_device = item.AddDevice(d->name());
+    if (!added_device.ok()) VLOG(3) << added_device.error_message();
+  }
+
+  // Add fetches so that the graph can be pruned.
+  item.fetch.swap(ret_node_names);
+
+  // Add noes that can't be removed from the graph.
+  item.keep_ops = std::move(keep_node_names);
+
+  (*g)->ToGraphDef(&item.graph);
+
+  if (flib) {
+    *item.graph.mutable_library() = flib->ToProto();
+  }
+
+  tensorflow::GraphDef out_graph;
+
+  tensorflow::grappler::VirtualCluster cluster(&device_set);
+
+  // TODO(nareshmodi): Consider adding and using the more generic GraphOptions
+  // proto (which also contain the OptimizerOptions).
+  TF_RETURN_IF_ERROR(tensorflow::grappler::RunMetaOptimizer(
+      item, config_proto, cpu_device, &cluster, &out_graph));
+
+  std::unique_ptr<tensorflow::Graph> optimized_graph(
+      new tensorflow::Graph(OpRegistry::Global()));
+  TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(GraphConstructorOptions(),
+                                            out_graph, optimized_graph.get()));
+
+  // Copy optimized functions back to the overlay lib.
+  if (flib) {
+    for (const FunctionDef& fdef : out_graph.library().function()) {
+      const string& func_name = fdef.signature().name();
+      if (flib->Contains(func_name)) {
+        TF_RETURN_IF_ERROR(flib->ReplaceFunction(func_name, fdef));
+      } else {
+        TF_RETURN_IF_ERROR(flib->AddFunctionDef(fdef));
+      }
+    }
+  }
+
+  *g = std::move(optimized_graph);
+
+  // The graph conversion sets the requested device names but not the
+  // assigned device names. However, since at this point the graph is
+  // placed TF expects an assigned device name for every node. Therefore
+  // we copy the requested device into the assigned device field.
+  for (Node* node : (*g)->nodes()) {
+    if (node->IsOp() && node->assigned_device_name().empty()) {
+      if (node->requested_device().empty()) {
+        return errors::Internal(
+            "Either placer did not place the node or Grappler did not "
+            "copy the assigned device. Contact Grappler team since latter "
+            "is more likely. Node=",
+            node->name(), " Graph: ", (*g)->ToGraphDefDebug().DebugString());
+      }
+      node->set_assigned_device_name(node->requested_device());
+    }
+  }
+
+  return Status::OK();
+}
+
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.h b/tensorflow/core/grappler/optimizers/meta_optimizer.h
index a06da4394e4b8a4d8e75855a0a432114f7d7fcb3..b8f0c8e6ff56e2f497144417082d88916b3362ec 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.h
@@ -16,12 +16,17 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_META_OPTIMIZER_H_
 #define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_META_OPTIMIZER_H_
 
+#include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
+#include "tensorflow/core/grappler/verifiers/graph_verifier.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
+#include "tensorflow/core/protobuf/verifier_config.pb.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -60,6 +65,12 @@ class MetaOptimizer : public GraphOptimizer {
   const RewriterConfig::CustomGraphOptimizer* GetCustomGraphOptimizerConfig(
       const string& name) const;
 
+  // Initialiaze active verifiers from the RewriterConfig toggles.
+  void InitializeVerifiers(
+      std::vector<std::unique_ptr<GraphVerifier>>* inter_optimizer_verifiers,
+      std::vector<std::unique_ptr<GraphVerifier>>* post_optimization_verifiers)
+      const;
+
   // Run optimization pass over a single GrapplerItem. Meta optimizer might run
   // multiple such passes: 1) for the main graph 2) for the function library
   Status OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
@@ -99,6 +110,32 @@ Status RunMetaOptimizer(const GrapplerItem& item, const ConfigProto& cfg,
                         DeviceBase* cpu_device, Cluster* cluster,
                         GraphDef* optimized_graph);
 
+// Wrapper around RunMetaOptimizer convenient for optimizing
+// function graphs.
+//
+// Runs grappler optimizations on `g` based on `config_proto`.
+// `ret_node_names`: a vector of node names whose outputs are returned,
+//    aka fetches. when `g` represent a function, these are _Retval nodes.
+// `lib`: function library to use with `g`.
+// `device_set`: the set of devices that graph can refer to.
+// `cpu_device`: the CPU device.
+// `config_proto`: Grapper configuration.
+// `grappler_item_id': Grappler item id (e.g. optimized function name).
+// `optimization_options`: Grappler optimization constraints that are known only
+//    at runtime.
+//
+// **g is a graph constructed based on the runtime library 'lib'.
+// OptimizeGraph mutates **g extensively and replaces '*g' with a
+// complete copy. Therefore, the caller should not keep any references
+// to nodes *g.
+Status OptimizeGraph(
+    std::vector<string> ret_node_names, std::vector<string> keep_node_names,
+    FunctionLibraryDefinition* lib, const DeviceSet& device_set,
+    Device* cpu_device, const ConfigProto& config_proto,
+    const string& grappler_item_id,
+    const GrapplerItem::OptimizationOptions& optimization_options,
+    std::unique_ptr<tensorflow::Graph>* g);
+
 }  // namespace grappler
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
index 12db5d6ca9b001fa04e42e6d228fe6289d87726e..0970134ed2b88f2ddd4e25962604aba666733e85 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
@@ -15,9 +15,11 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
 
+#include "absl/strings/match.h"
 #include "absl/strings/substitute.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
@@ -27,7 +29,9 @@ limitations under the License.
 #include "tensorflow/core/grappler/utils/grappler_test.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/protobuf/config.pb.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -87,12 +91,12 @@ REGISTER_GRAPH_OPTIMIZER(TestOptimizerWithParams);
 // Record various properties of the GrapplerItems passed for optimization.
 class GrapplerItemPropertiesAccumulator : public CustomGraphOptimizer {
  public:
-  static void SetAllowedOptimizations(
-      gtl::FlatMap<string, GrapplerItem::AllowedOptimizations>*
-          allowed_optimizations) {
-    allowed_optimizations_ = allowed_optimizations;
+  static void SetOptimizationOptions(
+      gtl::FlatMap<string, GrapplerItem::OptimizationOptions>*
+          optimization_options) {
+    optimization_options_ = optimization_options;
   }
-  static void ResetAllowedOptimizations() { allowed_optimizations_ = nullptr; }
+  static void ResetOptimizationOptions() { optimization_options_ = nullptr; }
 
   GrapplerItemPropertiesAccumulator() {}
   string name() const override {
@@ -107,8 +111,8 @@ class GrapplerItemPropertiesAccumulator : public CustomGraphOptimizer {
   Status Optimize(Cluster* cluster, const GrapplerItem& item,
                   GraphDef* optimized_graph) override {
     *optimized_graph = item.graph;
-    if (allowed_optimizations_) {
-      allowed_optimizations_->insert({item.id, item.allowed_optimizations()});
+    if (optimization_options_) {
+      optimization_options_->insert({item.id, item.optimization_options()});
     }
     return Status::OK();
   }
@@ -117,12 +121,12 @@ class GrapplerItemPropertiesAccumulator : public CustomGraphOptimizer {
                 const GraphDef& optimized_graph, double result) override {}
 
  private:
-  static gtl::FlatMap<string, GrapplerItem::AllowedOptimizations>*
-      allowed_optimizations_;
+  static gtl::FlatMap<string, GrapplerItem::OptimizationOptions>*
+      optimization_options_;
 };
 
-gtl::FlatMap<string, GrapplerItem::AllowedOptimizations>*
-    GrapplerItemPropertiesAccumulator::allowed_optimizations_;
+gtl::FlatMap<string, GrapplerItem::OptimizationOptions>*
+    GrapplerItemPropertiesAccumulator::optimization_options_;
 
 REGISTER_GRAPH_OPTIMIZER(GrapplerItemPropertiesAccumulator);
 
@@ -231,7 +235,7 @@ TEST_F(MetaOptimizerTest, RunToggleOptimizersAndCustomGraphOptimizerTwice) {
 TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) {
   using test::function::NDef;
 
-  // Enable ony function optimization.
+  // Enable only function optimization.
   ConfigProto config_proto;
   auto& rewriter_config =
       *config_proto.mutable_graph_options()->mutable_rewrite_options();
@@ -254,13 +258,13 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) {
   FunctionDef mul_func = FunctionDefHelper::Create(
       "MyMul", {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"},
       {{{"mul"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
-      /* Mapping between function returns and function node outputs. */
+      /*ret_def=*/
       {{"z", "mul:z:0"}});
 
   FunctionDef square_func = FunctionDefHelper::Create(
       "MySquare", {"x:T"}, {"z:T"}, {"T: {float, double}"},
       {{{"my_mul"}, "MyMul", {"x", "x"}, {{"T", "$T"}}}},
-      /* Mapping between function returns and function node outputs. */
+      /*ret_def=*/
       {{"z", "my_mul:z:0"}});
   (*square_func.mutable_attr())["_noinline"].set_b(true);
 
@@ -268,7 +272,7 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) {
       "MyQuadratic", {"x:T"}, {"z:T"}, {"T: {float, double}"},
       {{{"square"}, "MySquare", {"x"}, {{"T", "$T"}}},
        {{"quadratic"}, "MySquare", {"square:z"}, {{"T", "$T"}}}},
-      /* Mapping between function returns and function node outputs. */
+      /*ret_def=*/
       {{"z", "quadratic:z:0"}});
   (*quadratic_func.mutable_attr())["_noinline"].set_b(true);
 
@@ -290,7 +294,7 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) {
        // Forward outputs
        NDef("out_s", "Identity", {"square:0"}, {{"T", DT_FLOAT}}, kDevice),
        NDef("out_q", "Identity", {"quadratic:0"}, {{"T", DT_INT32}}, kDevice)},
-      // FunctionLib
+      /*funcs=*/
       {mul_func, square_func, quadratic_func});
 
   GraphDef output;
@@ -300,7 +304,7 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) {
                                            output.library());
 
   // Specialized and optimized functions should be added to the graph.
-  EXPECT_EQ(6, optimized_flib.num_functions());
+  EXPECT_EQ(5, optimized_flib.num_functions());
 
   // Get a specialized function name.
   const auto specialized_name = [](const string& fn, const string& node,
@@ -314,25 +318,22 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) {
       specialized_name("MyQuadratic", "quadratic", "tf_graph");
 
   // MySquare should be specialized and optimized for 3 instantiations:
-  //   1. 'square' node in the main graph
-  //   2. 'square' node in the MyQuadratic specialization (not in a fetch set)
-  //   3. 'quadratic' node in the MyQuadratic specialization (is in a fetch set)
+  //   1.  'square' node in the main graph
+  //   2.  'square' node in the MyQuadratic specialization
+  //   3*. 'quadratic' node in the MyQuadratic specialization
+  //        has identical instantiation context to #2
 
   const string optimized_1 = specialized_name("MySquare", "square", "tf_graph");
   const string optimized_2 =
       specialized_name("MySquare", "square", optimized_0);
-  const string optimized_3 =
-      specialized_name("MySquare", "quadratic", optimized_0);
 
   const FunctionDef* optimized_func_0 = optimized_flib.Find(optimized_0);
   const FunctionDef* optimized_func_1 = optimized_flib.Find(optimized_1);
   const FunctionDef* optimized_func_2 = optimized_flib.Find(optimized_2);
-  const FunctionDef* optimized_func_3 = optimized_flib.Find(optimized_3);
 
   ASSERT_NE(optimized_func_0, nullptr);
   ASSERT_NE(optimized_func_1, nullptr);
   ASSERT_NE(optimized_func_2, nullptr);
-  ASSERT_NE(optimized_func_3, nullptr);
 
   // Graph should call optimized function.
   int count = 0;
@@ -351,13 +352,13 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) {
     if (node.name() == "square" && ++count) {
       EXPECT_EQ(optimized_2, node.op());
     } else if (node.name() == "quadratic" && ++count) {
-      EXPECT_EQ(optimized_3, node.op());
+      EXPECT_EQ(optimized_2, node.op());
     }
   }
   EXPECT_EQ(2, count);
 
-  const std::vector<const FunctionDef*> optimized_funcs = {
-      optimized_func_1, optimized_func_2, optimized_func_3};
+  const std::vector<const FunctionDef*> optimized_funcs = {optimized_func_1,
+                                                           optimized_func_2};
 
   // MyMul should be inlined into all optimized versions of MySquare.
   for (const FunctionDef* optimized_func : optimized_funcs) {
@@ -403,6 +404,97 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) {
   test::ExpectTensorEqual<int>(tensors_expected[1], tensors[1]);
 }
 
+TEST_F(MetaOptimizerTest, OptimizeFunctionLibraryPruneUnusedOutputs) {
+  using test::function::NDef;
+
+  ConfigProto config_proto;
+  MetaOptimizer optimizer(nullptr, config_proto);
+
+  // MyMul computes x*y three times and has three output values.
+  FunctionDef my_mul = FunctionDefHelper::Create(
+      "MyMul", {"x:T", "y:T"}, {"z0:T", "z1:T", "z2:T"}, {"T: {float, int32}"},
+      {{{"output0"}, "Mul", {"x", "y"}, {{"T", "$T"}}},
+       {{"output1"}, "Mul", {"x", "y"}, {{"T", "$T"}}},
+       {{"output2"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
+      /*ret_def=*/
+      {{"z0", "output0:z:0"}, {"z1", "output1:z:0"}, {"z2", "output2:z:0"}});
+
+  // Call MyMyl and forward all three outputs.
+  FunctionDef my_fwd = FunctionDefHelper::Create(
+      "Fwd", {"x:T", "y:T"}, {"z0:T", "z1:T", "z2:T"}, {"T: {float, int32}"},
+      {{{"output"}, "MyMul", {"x", "y"}, {{"T", "$T"}}}},
+      /*ret_def=*/
+      {{"z0", "output:z0:0"}, {"z1", "output:z1:0"}, {"z2", "output:z2:0"}});
+
+  // Mark both functions as `_noinline` to trigger specialization.
+  (*my_mul.mutable_attr())["_noinline"].set_b(true);
+  (*my_fwd.mutable_attr())["_noinline"].set_b(true);
+  /*funcs=*/
+  std::vector<FunctionDef> function_library = {my_mul, my_fwd};
+
+  // Tensorflow graph:
+  //   a = Placeholder[T=float]
+  //   b = Placeholder[T=float]
+  //   fwd = Fwd(a, b)
+  //
+  // Fetch fwd:2 via Identity node.
+  GrapplerItem item;
+  item.id = "tf_graph";
+  item.fetch = {"ret"};
+  item.graph = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("fwd", "Fwd", {"a", "b"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("ret", "Identity", {"fwd:2"}, {{"T", DT_FLOAT}}, kDevice)},
+      function_library);
+
+  GraphDef output;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  FunctionLibraryDefinition optimized_flib(OpRegistry::Global(),
+                                           output.library());
+
+  // Specialized functions should be added to the graph.
+  EXPECT_EQ(3, optimized_flib.num_functions());
+
+  // Expected names of the specialized functions.
+  const string specialized_my_fwd = "Fwd_specialized_for_fwd_at_tf_graph";
+  const string specialized_my_mul =
+      absl::StrCat("MyMul_specialized_for_output_at_", specialized_my_fwd);
+
+  // Specialized MyMul should have just one output argument.
+  FunctionDef expected_my_mul = FunctionDefHelper::Create(
+      specialized_my_mul, {"x:float", "y:float"}, {"z2:float"}, {},
+      {{{"output2"}, "Mul", {"x", "y"}, {{"T", DT_FLOAT}}}},
+      /*ret_def=*/
+      {{"z2", "output2:z:0"}});
+
+  // Specialized Fwd should also have just one output argument.
+  FunctionDef expected_my_fwd = FunctionDefHelper::Create(
+      specialized_my_fwd, {"x:float", "y:float"}, {"z2:float"}, {},
+      {{{"output"}, specialized_my_mul, {"x", "y"}, {{"T", DT_FLOAT}}}},
+      /*ret_def=*/
+      {{"z2", "output:z2:0"}});
+
+  const FunctionDef* my_mul_spec = optimized_flib.Find(specialized_my_mul);
+  const FunctionDef* my_fwd_spec = optimized_flib.Find(specialized_my_fwd);
+
+  ASSERT_NE(my_mul_spec, nullptr);
+  ASSERT_NE(my_fwd_spec, nullptr);
+
+  CompareFunctions(expected_my_mul, *my_mul_spec);
+  CompareFunctions(expected_my_fwd, *my_fwd_spec);
+
+  item.feed.emplace_back("a", test::AsScalar<float>(2.0f));
+  item.feed.emplace_back("b", test::AsScalar<float>(4.0f));
+  auto tensors_expected = EvaluateFetchNodes(item);
+
+  GrapplerItem optimized = item.WithGraph(std::move(output));
+  auto tensors = EvaluateFetchNodes(optimized);
+
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+}
+
 TEST_F(MetaOptimizerTest, OptimizeFunctionLibraryPruneFunctionBody) {
   using test::function::NDef;
 
@@ -425,7 +517,7 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibraryPruneFunctionBody) {
       "MyFunc", {"x:T", "y:T"}, {"z1:T", "z2:T"}, {"T: {float, double}"},
       {{{"mul1"}, "Mul", {"x", "y"}, {{"T", "$T"}}},
        {{"mul2"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
-      /* Mapping between function returns and function node outputs. */
+      /*ret_def=*/
       {{"z1", "mul1:z:0"}, {"z2", "mul2:z:0"}});
   (*my_func.mutable_attr())["_noinline"].set_b(true);
 
@@ -449,7 +541,7 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibraryPruneFunctionBody) {
        // Read outputs of function call nodes
        NDef("out_fn1", "Identity", {"fn1:0"}, {{"T", DT_FLOAT}}, kDevice),
        NDef("out_fn2", "Identity", {"fn2:1"}, {{"T", DT_FLOAT}}, kDevice)},
-      // FunctionLib
+      /*funcs=*/
       {my_func});
 
   GraphDef output;
@@ -515,10 +607,9 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibraryWithRestrictions) {
 
   // We will record what type of optimizations meta optimizer allows for each
   // GrapplerItem (main graph and graphs for each function).
-  gtl::FlatMap<string, GrapplerItem::AllowedOptimizations>
-      allowed_optimizations;
-  GrapplerItemPropertiesAccumulator::SetAllowedOptimizations(
-      &allowed_optimizations);
+  gtl::FlatMap<string, GrapplerItem::OptimizationOptions> optimization_options;
+  GrapplerItemPropertiesAccumulator::SetOptimizationOptions(
+      &optimization_options);
 
   // Just record properties of optimized Grappler items.
   ConfigProto config_proto;
@@ -532,17 +623,17 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibraryWithRestrictions) {
   MetaOptimizer optimizer(nullptr, config_proto);
 
   // Define simple function library with two identical mul functions.
-  FunctionDef mul_func_1 = FunctionDefHelper::Create(
-      "MyMul1", {"x:float", "y:float"}, {"z:float"}, {},
-      {{{"mul"}, "Mul", {"x", "y"}, {}}},
-      /* Mapping between function returns and function node outputs. */
-      {{"z", "mul:z:0"}});
-
-  FunctionDef mul_func_2 = FunctionDefHelper::Create(
-      "MyMul2", {"x:float", "y:float"}, {"z:float"}, {},
-      {{{"mul"}, "Mul", {"x", "y"}, {}}},
-      /* Mapping between function returns and function node outputs. */
-      {{"z", "mul:z:0"}});
+  FunctionDef mul_func_1 =
+      FunctionDefHelper::Create("MyMul1", {"x:float", "y:float"}, {"z:float"},
+                                {}, {{{"mul"}, "Mul", {"x", "y"}, {}}},
+                                /*ret_def=*/
+                                {{"z", "mul:z:0"}});
+
+  FunctionDef mul_func_2 =
+      FunctionDefHelper::Create("MyMul2", {"x:float", "y:float"}, {"z:float"},
+                                {}, {{{"mul"}, "Mul", {"x", "y"}, {}}},
+                                /*ret_def=*/
+                                {{"z", "mul:z:0"}});
 
   // Tensorflow graph:
   //
@@ -568,7 +659,7 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibraryWithRestrictions) {
              {"Tin", DataTypeSlice{DT_FLOAT}},
              {"Tout", DataTypeSlice{DT_FLOAT, DT_FLOAT}}},
             kDevice)},
-      // FunctionLib
+      /*funcs=*/
       {mul_func_1, mul_func_2});
   item.fetch = {"mul_1", "mul_2", "dx"};
 
@@ -577,22 +668,23 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibraryWithRestrictions) {
 
   // Our custom optimizer must be called for the main graph and for the two
   // functions.
-  ASSERT_EQ(allowed_optimizations.size(), 3);
-
-  auto allowed_optimizations_main =
-      gtl::FindOrNull(allowed_optimizations, "main");
-  ASSERT_NE(allowed_optimizations_main, nullptr);
-  EXPECT_TRUE(allowed_optimizations_main->non_differentiable_rewrites);
-
-  auto allowed_optimizations_my_mul_1 =
-      gtl::FindOrNull(allowed_optimizations, "MyMul1");
-  ASSERT_NE(allowed_optimizations_my_mul_1, nullptr);
-  EXPECT_TRUE(allowed_optimizations_my_mul_1->non_differentiable_rewrites);
-
-  auto allowed_optimizations_my_mul_2 =
-      gtl::FindOrNull(allowed_optimizations, "MyMul2");
-  ASSERT_NE(allowed_optimizations_my_mul_2, nullptr);
-  EXPECT_FALSE(allowed_optimizations_my_mul_2->non_differentiable_rewrites);
+  ASSERT_EQ(optimization_options.size(), 3);
+
+  auto optimization_options_main =
+      gtl::FindOrNull(optimization_options, "main");
+  ASSERT_NE(optimization_options_main, nullptr);
+  EXPECT_TRUE(optimization_options_main->allow_non_differentiable_rewrites);
+
+  auto optimization_options_my_mul_1 =
+      gtl::FindOrNull(optimization_options, "MyMul1");
+  ASSERT_NE(optimization_options_my_mul_1, nullptr);
+  EXPECT_TRUE(optimization_options_my_mul_1->allow_non_differentiable_rewrites);
+
+  auto optimization_options_my_mul_2 =
+      gtl::FindOrNull(optimization_options, "MyMul2");
+  ASSERT_NE(optimization_options_my_mul_2, nullptr);
+  EXPECT_FALSE(
+      optimization_options_my_mul_2->allow_non_differentiable_rewrites);
 }
 
 class SleepingOptimizer : public CustomGraphOptimizer {
@@ -660,6 +752,191 @@ TEST_F(MetaOptimizerTest, OptimizerDoesNotTimeOut) {
   EXPECT_EQ(item.graph.node_size() + 1, output.node_size());
 }
 
+TEST_F(MetaOptimizerTest, RunPostOptimizationVerifiersOnValidGraph) {
+  TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"});
+  GrapplerItem item;
+  CHECK(fake_input.NextItem(&item));
+
+  ConfigProto config_proto;
+  auto& post_optimization_verifier_config =
+      *config_proto.mutable_graph_options()
+           ->mutable_rewrite_options()
+           ->mutable_post_optimization_verifier_config();
+  post_optimization_verifier_config.set_structure_verifier(VerifierConfig::ON);
+
+  MetaOptimizer optimizer(nullptr, config_proto);
+  GraphDef output;
+  const Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+}
+
+TEST_F(MetaOptimizerTest, RunInterOptimizerVerifiersOnValidGraph) {
+  TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"});
+  GrapplerItem item;
+  CHECK(fake_input.NextItem(&item));
+
+  ConfigProto config_proto;
+  auto& inter_optimizer_verifier_config =
+      *config_proto.mutable_graph_options()
+           ->mutable_rewrite_options()
+           ->mutable_inter_optimizer_verifier_config();
+  inter_optimizer_verifier_config.set_structure_verifier(VerifierConfig::ON);
+
+  MetaOptimizer optimizer(nullptr, config_proto);
+  GraphDef output;
+  const Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+}
+
+TEST_F(MetaOptimizerTest, RunPostOptimizationVerifiersOnInvalidGraph) {
+  using test::function::NDef;
+  using FDH = FunctionDefHelper;
+
+  gtl::FlatMap<string, GrapplerItem::OptimizationOptions> optimization_options;
+  GrapplerItemPropertiesAccumulator::SetOptimizationOptions(
+      &optimization_options);
+
+  // Define simple function library with two identical mul functions.
+  FunctionDef mul_func_1 =
+      FunctionDefHelper::Create("MyMul1", {"x:float", "y:float"}, {"z:float"},
+                                {}, {{{"mul"}, "Mul", {"x", "y"}, {}}},
+                                /*ret_def=*/
+                                {{"z", "mul:z:0"}});
+
+  FunctionDef mul_func_2 =
+      FunctionDefHelper::Create("MyMul2", {"x:float", "y:float"}, {"z:float"},
+                                {}, {{{"mul"}, "Mul", {"x", "y"}, {}}},
+                                /*ret_def=*/
+                                {{"z", "mul:z:0"}});
+
+  // Tensorflow graph:
+  //
+  //   x0 = tf.Placeholder(tf.float);
+  //   x1 = tf.Placeholder(tf.float);
+  //   dy = tf.Placeholder(tf.float);
+  //
+  //   mul_1 = MyMul1(x0, x1);
+  //   mul_2 = MyMul2(x0, x1);
+  //   dx = SymbolicGradient({x0, x1, dy}, f=MyMul2)
+  GrapplerItem item;
+  item.id = "main";
+  item.graph = test::function::GDef(
+      {NDef("x0", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("x1", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("dy", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       // Calls into function library
+       NDef("mul_1", "MyMul1", {"x0", "x1"}, {}, kDevice),
+       NDef("mul_2", "MyMul2", {"x0", "x1"}, {}, kDevice),
+       // Symbolic gradient of a MyMul2
+       NDef("dx", "SymbolicGradient", {"x0", "x1", "dy"},
+            {{"f", FDH::FunctionRef("MyMul2", {})},
+             {"Tin", DataTypeSlice{DT_FLOAT}},
+             {"Tout", DataTypeSlice{DT_FLOAT, DT_FLOAT}}},
+            kDevice)},
+      /*funcs=*/
+      {mul_func_1, mul_func_2});
+  item.fetch = {"mul_1", "mul_2", "dx"};
+
+  GraphDef output;
+
+  // Call Optimize with post optimization verifiers.
+  ConfigProto config_proto;
+  auto& rewriter_config =
+      *config_proto.mutable_graph_options()->mutable_rewrite_options();
+
+  rewriter_config.set_meta_optimizer_iterations(RewriterConfig::TWO);
+  rewriter_config.add_optimizers("GrapplerItemPropertiesAccumulator");
+  rewriter_config.set_min_graph_nodes(-1);
+  auto& post_optimization_verifier_config =
+      *config_proto.mutable_graph_options()
+           ->mutable_rewrite_options()
+           ->mutable_post_optimization_verifier_config();
+  post_optimization_verifier_config.set_structure_verifier(VerifierConfig::ON);
+
+  MetaOptimizer optimizer_with_post_verifiers(nullptr, config_proto);
+  Status status =
+      optimizer_with_post_verifiers.Optimize(nullptr, item, &output);
+  EXPECT_EQ(status.code(), errors::Code::INVALID_ARGUMENT);
+  EXPECT_TRUE(absl::StrContains(
+      status.error_message(),
+      "NodeDef expected inputs 'float' do not match 3 inputs specified"));
+}
+
+TEST_F(MetaOptimizerTest, RunInterOptimizerVerifiersOnInvalidGraph) {
+  using test::function::NDef;
+  using FDH = FunctionDefHelper;
+
+  gtl::FlatMap<string, GrapplerItem::OptimizationOptions> optimization_options;
+  GrapplerItemPropertiesAccumulator::SetOptimizationOptions(
+      &optimization_options);
+
+  // Define simple function library with two identical mul functions.
+  FunctionDef mul_func_1 =
+      FunctionDefHelper::Create("MyMul1", {"x:float", "y:float"}, {"z:float"},
+                                {}, {{{"mul"}, "Mul", {"x", "y"}, {}}},
+                                /*ret_def=*/
+                                {{"z", "mul:z:0"}});
+
+  FunctionDef mul_func_2 =
+      FunctionDefHelper::Create("MyMul2", {"x:float", "y:float"}, {"z:float"},
+                                {}, {{{"mul"}, "Mul", {"x", "y"}, {}}},
+                                /*ret_def=*/
+                                {{"z", "mul:z:0"}});
+
+  // Tensorflow graph:
+  //
+  //   x0 = tf.Placeholder(tf.float);
+  //   x1 = tf.Placeholder(tf.float);
+  //   dy = tf.Placeholder(tf.float);
+  //
+  //   mul_1 = MyMul1(x0, x1);
+  //   mul_2 = MyMul2(x0, x1);
+  //   dx = SymbolicGradient({x0, x1, dy}, f=MyMul2)
+  GrapplerItem item;
+  item.id = "main";
+  item.graph = test::function::GDef(
+      {NDef("x0", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("x1", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("dy", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("x1", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       // Calls into function library
+       NDef("mul_1", "MyMul1", {"x0", "x1"}, {}, kDevice),
+       NDef("mul_2", "MyMul2", {"x0", "x1"}, {}, kDevice),
+       // Symbolic gradient of a MyMul2
+       NDef("dx", "SymbolicGradient", {"x0", "x1", "dy"},
+            {{"f", FDH::FunctionRef("MyMul2", {})},
+             {"Tin", DataTypeSlice{DT_FLOAT}},
+             {"Tout", DataTypeSlice{DT_FLOAT, DT_FLOAT}}},
+            kDevice)},
+      /*funcs=*/
+      {mul_func_1, mul_func_2});
+  item.fetch = {"mul_1", "mul_2", "dx"};
+
+  GraphDef output;
+
+  // Call Optimize with post optimization verifiers.
+  ConfigProto config_proto;
+  // Call Optimize with inter optimizer verifiers.
+  auto& rewriter_config =
+      *config_proto.mutable_graph_options()->mutable_rewrite_options();
+  rewriter_config.set_meta_optimizer_iterations(RewriterConfig::TWO);
+  rewriter_config.add_optimizers("GrapplerItemPropertiesAccumulator");
+  rewriter_config.set_min_graph_nodes(-1);
+  auto& inter_optimizer_verifier_config =
+      *config_proto.mutable_graph_options()
+           ->mutable_rewrite_options()
+           ->mutable_inter_optimizer_verifier_config();
+  inter_optimizer_verifier_config.set_structure_verifier(VerifierConfig::ON);
+
+  MetaOptimizer optimizer_with_inter_verifiers(nullptr, config_proto);
+  Status status =
+      optimizer_with_inter_verifiers.Optimize(nullptr, item, &output);
+  EXPECT_EQ(status.code(), errors::Code::INVALID_ARGUMENT);
+  EXPECT_TRUE(absl::StrContains(
+      status.error_message(),
+      "NodeDef expected inputs 'float' do not match 3 inputs specified"));
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.h b/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.h
index d557a03463f2b9c0355def1da9bde38a1d51f27f..44f26461c0e1445bc198eace681c6c4c8493c38b 100644
--- a/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/pin_to_host_optimizer.h
@@ -38,9 +38,8 @@ string TryFindHostDevice(const gtl::FlatSet<string>& devices,
 // gpu->gpu->gpu may have been better/faster. We should probably fix this.
 class PinToHostOptimizer : public GraphOptimizer {
  public:
-  PinToHostOptimizer() : opt_level_(RewriterConfig::DEFAULT) {}
-  explicit PinToHostOptimizer(RewriterConfig::Toggle opt_level)
-      : opt_level_(opt_level) {}
+  PinToHostOptimizer() {}
+  explicit PinToHostOptimizer(RewriterConfig::Toggle opt_level) {}
 
   ~PinToHostOptimizer() override {}
 
@@ -51,9 +50,6 @@ class PinToHostOptimizer : public GraphOptimizer {
 
   void Feedback(Cluster* cluster, const GrapplerItem& item,
                 const GraphDef& optimized_graph, double result) override {}
-
- private:
-  RewriterConfig::Toggle opt_level_;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/remapper.cc b/tensorflow/core/grappler/optimizers/remapper.cc
index 3fb3f2b0ec75d1a628445a2f5e4d58e7a498c893..193772fcda23378850485db105fc2d3ebef1d8ab 100644
--- a/tensorflow/core/grappler/optimizers/remapper.cc
+++ b/tensorflow/core/grappler/optimizers/remapper.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/symbolic_shapes.h"
 #include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/platform/logging.h"
 
@@ -60,17 +61,30 @@ struct RemapperContext {
 
 // FusedBatchNorm that can be replaced with a cheaper set of primitives.
 struct FusedBatchNorm {
+  FusedBatchNorm() = default;
+  explicit FusedBatchNorm(const NodeDef* fused_batch_norm)
+      : fused_batch_norm(fused_batch_norm) {}
+
   const NodeDef* fused_batch_norm = nullptr;
 };
 
 // Conv2D node followed by a BiasAdd.
 struct Conv2DWithBiasAdd {
+  Conv2DWithBiasAdd() = default;
+  Conv2DWithBiasAdd(const NodeDef* conv2d, const NodeDef* bias_add)
+      : conv2d(conv2d), bias_add(bias_add) {}
+
   const NodeDef* conv2d = nullptr;
   const NodeDef* bias_add = nullptr;
 };
 
 // Conv2D node followed by a BiasAdd and Relu.
 struct Conv2DWithBiasAddAndRelu {
+  Conv2DWithBiasAddAndRelu() = default;
+  Conv2DWithBiasAddAndRelu(const NodeDef* conv2d, const NodeDef* bias_add,
+                           const NodeDef* relu)
+      : conv2d(conv2d), bias_add(bias_add), relu(relu) {}
+
   const NodeDef* conv2d = nullptr;
   const NodeDef* bias_add = nullptr;
   const NodeDef* relu = nullptr;
@@ -78,6 +92,11 @@ struct Conv2DWithBiasAddAndRelu {
 
 // Conv2D node followed by a Squeeze and BiasAdd.
 struct Conv2DWithSqueezeAndBiasAdd {
+  Conv2DWithSqueezeAndBiasAdd() = default;
+  Conv2DWithSqueezeAndBiasAdd(const NodeDef* conv2d, const NodeDef* squeeze,
+                              const NodeDef* bias_add)
+      : conv2d(conv2d), squeeze(squeeze), bias_add(bias_add) {}
+
   const NodeDef* conv2d = nullptr;
   const NodeDef* squeeze = nullptr;
   const NodeDef* bias_add = nullptr;
@@ -85,6 +104,11 @@ struct Conv2DWithSqueezeAndBiasAdd {
 
 // Conv2D node followed by a FusedBatchNorm.
 struct Conv2DWithBatchNorm {
+  Conv2DWithBatchNorm() = default;
+  Conv2DWithBatchNorm(const NodeDef* conv2d, const NodeDef* fused_batch_norm,
+                      float epsilon = 0.0)
+      : conv2d(conv2d), fused_batch_norm(fused_batch_norm), epsilon(epsilon) {}
+
   const NodeDef* conv2d = nullptr;
   const NodeDef* fused_batch_norm = nullptr;
   float epsilon = 0.0;
@@ -92,16 +116,23 @@ struct Conv2DWithBatchNorm {
 
 // Conv2D node followed by a FusedBatchNorm and Relu.
 struct Conv2DWithBatchNormAndRelu {
+  Conv2DWithBatchNormAndRelu() = default;
+  Conv2DWithBatchNormAndRelu(const NodeDef* conv2d,
+                             const NodeDef* fused_batch_norm,
+                             const NodeDef* relu, float epsilon = 0.0)
+      : conv2d(conv2d),
+        fused_batch_norm(fused_batch_norm),
+        relu(relu),
+        epsilon(epsilon) {}
+
   const NodeDef* conv2d = nullptr;
   const NodeDef* fused_batch_norm = nullptr;
   const NodeDef* relu = nullptr;
   float epsilon = 0.0;
 };
 
-bool IsFloatOrDoubleDataType(const NodeDef* node,
-                             const string& type_attr = "T") {
-  DataType dtype = GetDataTypeFromAttr(*node, type_attr);
-  return dtype == DT_FLOAT || dtype == DT_DOUBLE;
+bool IsInPreserveSet(const RemapperContext& ctx, const NodeDef* node) {
+  return ctx.nodes_to_preserve.count(node->name()) > 0;
 }
 
 bool HaveSameDataType(const NodeDef* lhs, const NodeDef* rhs,
@@ -119,91 +150,165 @@ bool HasDataType(const NodeDef* node, const DataType& expected,
   return dtype == expected;
 }
 
-bool IsInPreserveSet(const RemapperContext& ctx, const NodeDef* node) {
-  return ctx.nodes_to_preserve.count(node->name()) > 0;
+bool IsCpuCompatibleDataType(const NodeDef* node,
+                             const string& type_attr = "T") {
+  DataType dtype = GetDataTypeFromAttr(*node, type_attr);
+  return dtype == DT_FLOAT || dtype == DT_DOUBLE;
+}
+
+bool IsGpuCompatibleDataType(const NodeDef* node,
+                             const string& type_attr = "T") {
+  DataType dtype = GetDataTypeFromAttr(*node, type_attr);
+  return dtype == DT_FLOAT;
+}
+
+bool IsCpuCompatibleDataFormat(const NodeDef* conv2d) {
+  DCHECK(IsConv2D(*conv2d)) << "Expected Conv2D op";
+  const string& data_format = conv2d->attr().at(kDataFormat).s();
+  return data_format == "NHWC";
+}
+
+bool IsGpuCompatibleDataFormat(const NodeDef* conv2d) {
+  DCHECK(IsConv2D(*conv2d)) << "Expected Conv2D op";
+  const string& data_format = conv2d->attr().at(kDataFormat).s();
+  return data_format == "NHWC" || data_format == "NCHW";
 }
 
-bool FindConv2DWithBias(const RemapperContext& ctx, const NodeDef* node,
-                        Conv2DWithBiasAdd* matched) {
+bool IsCpuCompatibleConv2D(const NodeDef* conv2d) {
+  DCHECK(IsConv2D(*conv2d)) << "Expected Conv2D op";
+  return NodeIsOnCpu(conv2d) && IsCpuCompatibleDataType(conv2d) &&
+         IsCpuCompatibleDataFormat(conv2d);
+}
+
+bool IsGpuCompatibleConv2D(const NodeDef* conv2d) {
+  DCHECK(IsConv2D(*conv2d)) << "Expected Conv2D op";
+  return NodeIsOnGpu(conv2d) && IsGpuCompatibleDataType(conv2d) &&
+         IsGpuCompatibleDataFormat(conv2d);
+}
+
+// Checks if we can rewrite a pattern to the `_FusedConv2D` on CPU device.
+template <typename Pattern>
+bool IsCpuCompatible(const Pattern& matched) {
+  return IsCpuCompatibleConv2D(matched.conv2d);
+}
+
+// Checks if we can rewrite a pattern to the `_FusedConv2D` on GPU device.
+bool IsGpuCompatible(const RemapperContext& ctx,
+                     const Conv2DWithBiasAddAndRelu& matched) {
+  const std::vector<OpInfo::TensorProperties>& input_props =
+      ctx.graph_properties.GetInputProperties(matched.conv2d->name());
+  const TensorShapeProto& filter_shape =
+      input_props.size() >= 2 ? input_props[1].shape() : TensorShapeProto();
+
+  // FusedConv2D on GPU with 1x1 convolution is marginally faster than
+  // in-graph computation in micro benchmarks (see kernels/conv_ops_test.cc),
+  // and significantly slower in large scale benchmarks.
+  bool is_spatial_conv = Rank(filter_shape) == 4 &&          //
+                         IsKnown(filter_shape.dim(1)) &&     //
+                         IsKnown(filter_shape.dim(2)) &&     //
+                         filter_shape.dim(1).size() != 1 &&  //
+                         filter_shape.dim(2).size() != 1;
+
+  return is_spatial_conv && IsGpuCompatibleConv2D(matched.conv2d);
+}
+bool IsGpuCompatible(const RemapperContext& ctx,
+                     const Conv2DWithBiasAdd& matched) {
+  return false;
+}
+bool IsGpuCompatible(const RemapperContext& ctx,
+                     const Conv2DWithSqueezeAndBiasAdd& matched) {
+  return false;
+}
+
+// Returns true if the given pattern is supported on the assigned device.
+template <typename Pattern>
+bool IsDeviceCompatible(const RemapperContext& ctx, Pattern& matched) {
+  return IsCpuCompatible(matched) || IsGpuCompatible(ctx, matched);
+}
+
+bool FindConv2DWithBias(const RemapperContext& ctx, const NodeDef* bias_add,
+                        Conv2DWithBiasAdd* matched,
+                        bool check_device_compatible = true) {
   if (!EigenSupportsContractionOutputKernel()) return false;
 
   // Root of the pattern must be a BiasAdd.
-  if (!node) return false;
-  if (!IsBiasAdd(*node)) return false;
-  if (!NodeIsOnCpu(node)) return false;
-  if (!IsFloatOrDoubleDataType(node)) return false;
-  if (!NoControlFaninOrFanout(ctx.graph_view, node)) return false;
+  if (bias_add == nullptr || !IsBiasAdd(*bias_add) ||
+      HasControlFaninOrFanout(ctx.graph_view, bias_add))
+    return false;
 
-  // Input to the BiasAdd must be a Conv2D in NHWC format.
-  const auto input_port = GraphView::InputPort(node, 0);
+  // Input to the BiasAdd must be a Conv2D.
+  const auto input_port = GraphView::InputPort(bias_add, 0);
   const auto conv2d = ctx.graph_view.GetRegularFanin(input_port);
-  if (!conv2d.node) return false;
-  if (!IsConv2D(*conv2d.node)) return false;
-  if (conv2d.node->attr().at(kDataFormat).s() != "NHWC") return false;
-  if (!NodeIsOnCpu(conv2d.node)) return false;
-  if (!HaveSameDataType(node, conv2d.node)) return false;
-  if (!NoControlFaninOrFanout(ctx.graph_view, conv2d.node)) return false;
-  if (!HasSingleFanoutNode(ctx.graph_view, conv2d.node)) return false;
-  if (IsInPreserveSet(ctx, conv2d.node)) return false;
+
+  if (!conv2d.node || !IsConv2D(*conv2d.node) ||
+      !HaveSameDataType(bias_add, conv2d.node) ||
+      HasControlFaninOrFanout(ctx.graph_view, conv2d.node) ||
+      !HasSingleFanoutNode(ctx.graph_view, conv2d.node) ||
+      IsInPreserveSet(ctx, conv2d.node))
+    return false;
+
+  // Check that data type and data format are supported on assigned device.
+  const Conv2DWithBiasAdd pattern{conv2d.node, bias_add};
+  if (check_device_compatible && !IsDeviceCompatible(ctx, pattern)) {
+    return false;
+  }
 
   // We successfully found a Conv2D+BiasAdd pattern.
-  matched->conv2d = conv2d.node;
-  matched->bias_add = node;
+  *matched = pattern;
 
   return true;
 }
 
-bool FindConv2DWithBiasAndRelu(const RemapperContext& ctx, const NodeDef* node,
+bool FindConv2DWithBiasAndRelu(const RemapperContext& ctx, const NodeDef* relu,
                                Conv2DWithBiasAddAndRelu* matched) {
   if (!EigenSupportsContractionOutputKernel()) return false;
 
   // Root of the pattern must be a Relu.
-  if (!node) return false;
-  if (!IsRelu(*node)) return false;
-  if (!NodeIsOnCpu(node)) return false;
-  if (!IsFloatOrDoubleDataType(node)) return false;
-  if (!NoControlFaninOrFanout(ctx.graph_view, node)) return false;
+  if (!relu || !IsRelu(*relu) || HasControlFaninOrFanout(ctx.graph_view, relu))
+    return false;
 
   // And input to Relu must match Conv2DWithBiasAdd pattern.
-  const auto input_port = GraphView::InputPort(node, 0);
+  const auto input_port = GraphView::InputPort(relu, 0);
   const auto bias_add = ctx.graph_view.GetRegularFanin(input_port);
 
   Conv2DWithBiasAdd base;
-  if (!FindConv2DWithBias(ctx, bias_add.node, &base)) return false;
-  if (!HasSingleFanoutNode(ctx.graph_view, base.bias_add)) return false;
-  if (!HaveSameDataType(node, base.bias_add)) return false;
-  if (IsInPreserveSet(ctx, base.bias_add)) return false;
+  if (!FindConv2DWithBias(ctx, bias_add.node, &base,
+                          /*check_device_compatible=*/false) ||
+      !HasSingleFanoutNode(ctx.graph_view, base.bias_add) ||
+      !HaveSameDataType(relu, base.bias_add) ||
+      IsInPreserveSet(ctx, base.bias_add))
+    return false;
+
+  // Check that data type and data format are supported on assigned device.
+  const Conv2DWithBiasAddAndRelu pattern{base.conv2d, base.bias_add, relu};
+  if (!IsDeviceCompatible(ctx, pattern)) return false;
 
   // We successfully found a Conv2D+BiasAdd+Relu pattern.
-  matched->conv2d = base.conv2d;
-  matched->bias_add = base.bias_add;
-  matched->relu = node;
+  *matched = pattern;
 
   return true;
 }
 
 bool FindConv2DWithSqueezeAndBias(const RemapperContext& ctx,
-                                  const NodeDef* node,
+                                  const NodeDef* bias_add,
                                   Conv2DWithSqueezeAndBiasAdd* matched) {
   if (!EigenSupportsContractionOutputKernel()) return false;
 
   // Root of the pattern must be a BiasAdd.
-  if (node == nullptr) return false;
-  if (node->op() != "BiasAdd") return false;
-  if (!NodeIsOnCpu(node)) return false;
-  if (!IsFloatOrDoubleDataType(node)) return false;
-  if (!NoControlFaninOrFanout(ctx.graph_view, node)) return false;
+  if (!bias_add || !IsBiasAdd(*bias_add) ||
+      HasControlFaninOrFanout(ctx.graph_view, bias_add))
+    return false;
 
   // Input to the BiasAdd must be a Squeeze.
-  const auto bias_input_port = GraphView::InputPort(node, 0);
+  const auto bias_input_port = GraphView::InputPort(bias_add, 0);
   const auto squeeze = ctx.graph_view.GetRegularFanin(bias_input_port);
-  if (squeeze.node == nullptr) return false;
-  if (squeeze.node->op() != "Squeeze") return false;
-  if (!NodeIsOnCpu(squeeze.node)) return false;
-  if (!HaveSameDataType(node, squeeze.node, "T")) return false;
-  if (!NoControlFaninOrFanout(ctx.graph_view, squeeze.node)) return false;
-  if (!HasSingleFanoutNode(ctx.graph_view, squeeze.node)) return false;
-  if (IsInPreserveSet(ctx, squeeze.node)) return false;
+
+  if (!squeeze.node || !IsSqueeze(*squeeze.node) ||
+      !HaveSameDataType(bias_add, squeeze.node, "T") ||
+      HasControlFaninOrFanout(ctx.graph_view, squeeze.node) ||
+      !HasSingleFanoutNode(ctx.graph_view, squeeze.node) ||
+      IsInPreserveSet(ctx, squeeze.node))
+    return false;
 
   // Squeeze must not squeeze output channel dimension.
   std::vector<int32> dims;
@@ -212,67 +317,72 @@ bool FindConv2DWithSqueezeAndBias(const RemapperContext& ctx,
     if (dim == 3) return false;
   }
 
-  // Input to the Squeeze must be a Conv2D in NHWC format.
+  // Input to the Squeeze must be a Conv2D.
   const auto squeeze_input_port = GraphView::InputPort(squeeze.node, 0);
   const auto conv2d = ctx.graph_view.GetRegularFanin(squeeze_input_port);
-  if (conv2d.node == nullptr) return false;
-  if (conv2d.node->op() != "Conv2D") return false;
-  if (conv2d.node->attr().at("data_format").s() != "NHWC") return false;
-  if (!NodeIsOnCpu(conv2d.node)) return false;
-  if (!HaveSameDataType(node, conv2d.node, "T")) return false;
-  if (!NoControlFaninOrFanout(ctx.graph_view, conv2d.node)) return false;
-  if (!HasSingleFanoutNode(ctx.graph_view, conv2d.node)) return false;
-  if (IsInPreserveSet(ctx, conv2d.node)) return false;
+
+  if (!conv2d.node || !IsConv2D(*conv2d.node) ||
+      !HaveSameDataType(bias_add, conv2d.node, "T") ||
+      HasControlFaninOrFanout(ctx.graph_view, conv2d.node) ||
+      !HasSingleFanoutNode(ctx.graph_view, conv2d.node) ||
+      IsInPreserveSet(ctx, conv2d.node))
+    return false;
+
+  // Check that data type and data format are supported on assigned device.
+  const Conv2DWithSqueezeAndBiasAdd pattern{conv2d.node, squeeze.node,
+                                            bias_add};
+  if (!IsDeviceCompatible(ctx, pattern)) return false;
 
   // We successfully found a Conv2D+Squeeze+BiasAdd pattern.
-  matched->conv2d = conv2d.node;
-  matched->squeeze = squeeze.node;
-  matched->bias_add = node;
+  *matched = pattern;
 
   return true;
 }
 
-bool FindConv2DWithBatchNorm(const RemapperContext& ctx, const NodeDef* node,
+bool FindConv2DWithBatchNorm(const RemapperContext& ctx,
+                             const NodeDef* batch_norm,
                              Conv2DWithBatchNorm* matched) {
   if (!EigenSupportsContractionOutputKernel()) return false;
 
   // Root of the pattern must be a FusedBatchNorm or a FusedBatchNormV2.
-  if (node == nullptr) return false;
-  if (!IsFusedBatchNorm(*node)) return false;
-  if (!NodeIsOnCpu(node)) return false;
-  if (!HasDataType(node, DT_FLOAT)) return false;
+  if (!batch_norm || !IsFusedBatchNorm(*batch_norm)) return false;
 
   // V2 has a separate data type for the scale/offset/mean/variance inputs.
-  if (node->op() == "FusedBatchNormV2" && !HasDataType(node, DT_FLOAT, "U"))
+  if (batch_norm->op() == "FusedBatchNormV2" &&
+      !HasDataType(batch_norm, DT_FLOAT, "U"))
     return false;
 
   // Check that batch normalization is in inference mode.
-  const auto& attr = node->attr();
+  const auto& attr = batch_norm->attr();
   if (attr.count(kIsTraining) > 0 && attr.at(kIsTraining).b()) return false;
 
   // Check that only 0th output is consumed by other nodes.
-  if (!NoControlFaninOrFanout(ctx.graph_view, node)) return false;
-  if (HasFanouts(ctx.graph_view, node, 1)) return false;  // batch_mean
-  if (HasFanouts(ctx.graph_view, node, 2)) return false;  // batch_variance
-  if (HasFanouts(ctx.graph_view, node, 3)) return false;  // reserve_space_1
-  if (HasFanouts(ctx.graph_view, node, 4)) return false;  // reserve_space_2
+  if (HasControlFaninOrFanout(ctx.graph_view, batch_norm) ||
+      HasFanouts(ctx.graph_view, batch_norm, 1) ||  // batch_mean
+      HasFanouts(ctx.graph_view, batch_norm, 2) ||  // batch_variance
+      HasFanouts(ctx.graph_view, batch_norm, 3) ||  // reserve_space_1
+      HasFanouts(ctx.graph_view, batch_norm, 4))    // reserve_space_2
+    return false;
 
-  // Input to the FusedBatchNorm must be a Conv2D in NHWC format.
-  const auto input_port = GraphView::InputPort(node, 0);
+  // Input to the FusedBatchNorm must be a Conv2D.
+  const auto input_port = GraphView::InputPort(batch_norm, 0);
   const auto conv2d = ctx.graph_view.GetRegularFanin(input_port);
-  if (conv2d.node == nullptr) return false;
-  if (!IsConv2D(*conv2d.node)) return false;
-  if (conv2d.node->attr().at(kDataFormat).s() != "NHWC") return false;
-  if (!NodeIsOnCpu(conv2d.node)) return false;
-  if (!HaveSameDataType(node, conv2d.node)) return false;
-  if (!NoControlFaninOrFanout(ctx.graph_view, conv2d.node)) return false;
-  if (!HasSingleFanoutNode(ctx.graph_view, conv2d.node)) return false;
-  if (IsInPreserveSet(ctx, conv2d.node)) return false;
+
+  if (!conv2d.node || !IsConv2D(*conv2d.node) ||               //
+      !NodeIsOnCpu(conv2d.node) ||                             //
+      !HaveSameDataType(batch_norm, conv2d.node) ||            //
+      !IsCpuCompatibleDataType(conv2d.node) ||                 //
+      !IsCpuCompatibleDataFormat(conv2d.node) ||               //
+      HasControlFaninOrFanout(ctx.graph_view, conv2d.node) ||  //
+      !HasSingleFanoutNode(ctx.graph_view, conv2d.node) ||     //
+      IsInPreserveSet(ctx, conv2d.node))
+    return false;
 
   // We successfully found a Conv2D+FusedBatchNorm pattern.
   matched->conv2d = conv2d.node;
-  matched->fused_batch_norm = node;
-  if (!GetNodeAttr(*node, "epsilon", &matched->epsilon).ok()) return false;
+  matched->fused_batch_norm = batch_norm;
+  if (!GetNodeAttr(*batch_norm, "epsilon", &matched->epsilon).ok())
+    return false;
 
   return true;
 }
@@ -283,21 +393,19 @@ bool FindConv2DWithBatchNormAndRelu(const RemapperContext& ctx,
   if (!EigenSupportsContractionOutputKernel()) return false;
 
   // Root of the pattern must be a Relu.
-  if (node == nullptr) return false;
-  if (!IsRelu(*node)) return false;
-  if (!NodeIsOnCpu(node)) return false;
-  if (!IsFloatOrDoubleDataType(node)) return false;
-  if (!NoControlFaninOrFanout(ctx.graph_view, node)) return false;
+  if (!node || !IsRelu(*node) || HasControlFaninOrFanout(ctx.graph_view, node))
+    return false;
 
   // And input to Relu must match Conv2DWithBatchNorm pattern.
   const auto input_port = GraphView::InputPort(node, 0);
   const auto batch_norm = ctx.graph_view.GetRegularFanin(input_port);
 
   Conv2DWithBatchNorm base;
-  if (!FindConv2DWithBatchNorm(ctx, batch_norm.node, &base)) return false;
-  if (!HasSingleFanoutNode(ctx.graph_view, base.fused_batch_norm)) return false;
-  if (!HaveSameDataType(node, base.fused_batch_norm)) return false;
-  if (IsInPreserveSet(ctx, base.fused_batch_norm)) return false;
+  if (!FindConv2DWithBatchNorm(ctx, batch_norm.node, &base) ||
+      !HasSingleFanoutNode(ctx.graph_view, base.fused_batch_norm) ||
+      !HaveSameDataType(node, base.fused_batch_norm) ||
+      IsInPreserveSet(ctx, base.fused_batch_norm))
+    return false;
 
   // We successfully found a Conv2D+FusedBatchNorm+Relu pattern.
   matched->conv2d = base.conv2d;
@@ -355,9 +463,7 @@ bool FindFusedBatchNorm(const RemapperContext& ctx, const NodeDef* node,
   return true;
 }
 
-void CopyConv2DAttributes(const NodeDef* conv2d, NodeDef* fused_conv2d,
-                          const std::vector<string>& fused_ops = {},
-                          int num_args = 1, float epsilon = 0.0) {
+void CopyConv2DAttributes(const NodeDef* conv2d, NodeDef* fused_conv2d) {
   auto* attr = fused_conv2d->mutable_attr();
   auto src_attr = conv2d->attr();
 
@@ -366,53 +472,66 @@ void CopyConv2DAttributes(const NodeDef* conv2d, NodeDef* fused_conv2d,
   (*attr)["padding"] = src_attr.at("padding");
   (*attr)["dilations"] = src_attr.at("dilations");
   (*attr)["data_format"] = src_attr.at("data_format");
+  (*attr)["use_cudnn_on_gpu"] = src_attr.at("use_cudnn_on_gpu");
+}
 
-  auto* fused_ops_attr = (*attr)["fused_ops"].mutable_list();
-  for (const string& fused_op : fused_ops) {
-    fused_ops_attr->add_s(fused_op);
-  }
-
+void SetFusedConv2DAttributes(
+    NodeDef* fused_conv2d, const absl::Span<const absl::string_view> fused_ops,
+    int num_args = 1, float epsilon = 0.0) {
+  auto* attr = fused_conv2d->mutable_attr();
+  SetAttrValue(fused_ops, &(*attr)["fused_ops"]);
   SetAttrValue(num_args, &(*attr)["num_args"]);
-  // Required only for FusedBatchNorm.
-  SetAttrValue(epsilon, &(*attr)["epsilon"]);
+  SetAttrValue(epsilon, &(*attr)["epsilon"]);  // required only for BatchNorm
 }
 
 void AddFusedConv2DNode(
-    const Conv2DWithBiasAdd& matched, GraphDef* optimized_graph,
+    const RemapperContext& ctx, const Conv2DWithBiasAdd& matched,
+    GraphDef* optimized_graph,
     absl::flat_hash_set<const NodeDef*>* invalidated_nodes) {
-  VLOG(2) << "Fuse Conv2D with BiasAdd: bias_add=" << matched.bias_add->name()
+  DCHECK(IsDeviceCompatible(ctx, matched))
+      << "Unsupported fused Conv2D pattern";
+
+  VLOG(2) << "Fuse Conv2D with BiasAdd: "
+          << " bias_add=" << matched.bias_add->name()
           << " conv2d=" << matched.conv2d->name();
 
   NodeDef* fused_conv2d = optimized_graph->add_node();
-  fused_conv2d->set_name(matched.bias_add->name());
   fused_conv2d->set_op(kFusedConv2D);
-  fused_conv2d->set_device(matched.bias_add->device());
+  fused_conv2d->set_name(matched.bias_add->name());
+  fused_conv2d->set_device(matched.conv2d->device());
   fused_conv2d->add_input(matched.conv2d->input(0));    // 0: input
   fused_conv2d->add_input(matched.conv2d->input(1));    // 1: filter
   fused_conv2d->add_input(matched.bias_add->input(1));  // 2: bias
 
-  CopyConv2DAttributes(matched.conv2d, fused_conv2d, {"BiasAdd"});
+  CopyConv2DAttributes(matched.conv2d, fused_conv2d);
+  SetFusedConv2DAttributes(fused_conv2d, {"BiasAdd"});
 
   invalidated_nodes->insert(matched.bias_add);
   invalidated_nodes->insert(matched.conv2d);
 }
 
 void AddFusedConv2DNode(
-    const Conv2DWithBiasAddAndRelu& matched, GraphDef* optimized_graph,
+    const RemapperContext& ctx, const Conv2DWithBiasAddAndRelu& matched,
+    GraphDef* optimized_graph,
     absl::flat_hash_set<const NodeDef*>* invalidated_nodes) {
-  VLOG(2) << "Fuse Conv2D with BiasAdd and Relu: relu=" << matched.relu->name()
+  DCHECK(IsDeviceCompatible(ctx, matched))
+      << "Unsupported fused Conv2D pattern";
+
+  VLOG(2) << "Fuse Conv2D with BiasAdd and Relu: "
+          << " relu=" << matched.relu->name()
           << " bias_add=" << matched.bias_add->name()
           << " conv2d=" << matched.conv2d->name();
 
   NodeDef* fused_conv2d = optimized_graph->add_node();
   fused_conv2d->set_name(matched.relu->name());
   fused_conv2d->set_op(kFusedConv2D);
-  fused_conv2d->set_device(matched.relu->device());
+  fused_conv2d->set_device(matched.conv2d->device());
   fused_conv2d->add_input(matched.conv2d->input(0));    // 0: input
   fused_conv2d->add_input(matched.conv2d->input(1));    // 1: filter
   fused_conv2d->add_input(matched.bias_add->input(1));  // 2: bias
 
-  CopyConv2DAttributes(matched.conv2d, fused_conv2d, {"BiasAdd", "Relu"});
+  CopyConv2DAttributes(matched.conv2d, fused_conv2d);
+  SetFusedConv2DAttributes(fused_conv2d, {"BiasAdd", "Relu"});
 
   invalidated_nodes->insert(matched.relu);
   invalidated_nodes->insert(matched.bias_add);
@@ -420,8 +539,12 @@ void AddFusedConv2DNode(
 }
 
 void AddFusedConv2DNode(
-    const Conv2DWithSqueezeAndBiasAdd& matched, GraphDef* optimized_graph,
+    const RemapperContext& ctx, const Conv2DWithSqueezeAndBiasAdd& matched,
+    GraphDef* optimized_graph,
     absl::flat_hash_set<const NodeDef*>* invalidated_nodes) {
+  DCHECK(IsDeviceCompatible(ctx, matched))
+      << "Unsupported fused Conv2D pattern";
+
   VLOG(2) << "Fuse Conv2D with Squeeze and BiasAdd: "
           << " bias_add=" << matched.bias_add->name()
           << " squeeze=" << matched.squeeze->name()
@@ -431,13 +554,14 @@ void AddFusedConv2DNode(
   // has single consumer (only the squeeze node).
   NodeDef* fused_conv2d = optimized_graph->add_node();
   fused_conv2d->set_name(matched.conv2d->name());
-  fused_conv2d->set_op("_FusedConv2D");
+  fused_conv2d->set_op(kFusedConv2D);
   fused_conv2d->set_device(matched.conv2d->device());
   fused_conv2d->add_input(matched.conv2d->input(0));    // 0: input
   fused_conv2d->add_input(matched.conv2d->input(1));    // 1: filter
   fused_conv2d->add_input(matched.bias_add->input(1));  // 2: bias
 
-  CopyConv2DAttributes(matched.conv2d, fused_conv2d, {"BiasAdd"});
+  CopyConv2DAttributes(matched.conv2d, fused_conv2d);
+  SetFusedConv2DAttributes(fused_conv2d, {"BiasAdd"});
 
   // Replace BiasAdd node with a Squeeze.
   NodeDef* remapped_squeeze = optimized_graph->add_node();
@@ -460,7 +584,7 @@ void AddFusedConv2DNode(
   NodeDef* fused_conv2d = optimized_graph->add_node();
   fused_conv2d->set_name(matched.fused_batch_norm->name());
   fused_conv2d->set_op(kFusedConv2D);
-  fused_conv2d->set_device(matched.fused_batch_norm->device());
+  fused_conv2d->set_device(matched.conv2d->device());
   fused_conv2d->add_input(matched.conv2d->input(0));            // 0: input
   fused_conv2d->add_input(matched.conv2d->input(1));            // 1: filter
   fused_conv2d->add_input(matched.fused_batch_norm->input(1));  // 2: scale
@@ -468,8 +592,9 @@ void AddFusedConv2DNode(
   fused_conv2d->add_input(matched.fused_batch_norm->input(3));  // 4: mean
   fused_conv2d->add_input(matched.fused_batch_norm->input(4));  // 5: variance
 
-  CopyConv2DAttributes(matched.conv2d, fused_conv2d, {"FusedBatchNorm"},
-                       /*num_args*/ 4, /*epsilon*/ matched.epsilon);
+  CopyConv2DAttributes(matched.conv2d, fused_conv2d);
+  SetFusedConv2DAttributes(fused_conv2d, {"FusedBatchNorm"},
+                           /*num_args=*/4, /*epsilon=*/matched.epsilon);
 
   invalidated_nodes->insert(matched.fused_batch_norm);
   invalidated_nodes->insert(matched.conv2d);
@@ -486,7 +611,7 @@ void AddFusedConv2DNode(
   NodeDef* fused_conv2d = optimized_graph->add_node();
   fused_conv2d->set_name(matched.relu->name());
   fused_conv2d->set_op(kFusedConv2D);
-  fused_conv2d->set_device(matched.fused_batch_norm->device());
+  fused_conv2d->set_device(matched.conv2d->device());
   fused_conv2d->add_input(matched.conv2d->input(0));            // 0: input
   fused_conv2d->add_input(matched.conv2d->input(1));            // 1: filter
   fused_conv2d->add_input(matched.fused_batch_norm->input(1));  // 2: scale
@@ -494,8 +619,9 @@ void AddFusedConv2DNode(
   fused_conv2d->add_input(matched.fused_batch_norm->input(3));  // 4: mean
   fused_conv2d->add_input(matched.fused_batch_norm->input(4));  // 5: variance
 
-  CopyConv2DAttributes(matched.conv2d, fused_conv2d, {"FusedBatchNorm", "Relu"},
-                       /*num_args*/ 4, /*epsilon*/ matched.epsilon);
+  CopyConv2DAttributes(matched.conv2d, fused_conv2d);
+  SetFusedConv2DAttributes(fused_conv2d, {"FusedBatchNorm", "Relu"},
+                           /*num_args=*/4, /*epsilon=*/matched.epsilon);
 
   invalidated_nodes->insert(matched.relu);
   invalidated_nodes->insert(matched.fused_batch_norm);
@@ -679,21 +805,25 @@ Status Remapper::Optimize(Cluster* /*cluster*/, const GrapplerItem& item,
 
     // Remap Conv2D+BiasAdd into the _FusedConv2D.
     if (FindConv2DWithBias(ctx, &node, &conv2d_with_bias)) {
-      AddFusedConv2DNode(conv2d_with_bias, optimized_graph, &invalidated_nodes);
+      AddFusedConv2DNode(ctx, conv2d_with_bias, optimized_graph,
+                         &invalidated_nodes);
       continue;
     }
 
     // Remap Conv2D+BiasAdd+Relu into the _FusedConv2D.
     if (FindConv2DWithBiasAndRelu(ctx, &node, &conv2d_with_bias_and_relu)) {
-      AddFusedConv2DNode(conv2d_with_bias_and_relu, optimized_graph,
+      AddFusedConv2DNode(ctx, conv2d_with_bias_and_relu, optimized_graph,
                          &invalidated_nodes);
       continue;
     }
 
+// TODO(penporn):
+// Remove this once TF-MKL supports _FusedConv2D with these operations.
+#ifndef INTEL_MKL
     // Remap Conv2D+Squeeze+BiasAdd into the _FusedConv2D+Squeeze.
     if (FindConv2DWithSqueezeAndBias(ctx, &node,
                                      &conv2d_with_squeeze_and_bias)) {
-      AddFusedConv2DNode(conv2d_with_squeeze_and_bias, optimized_graph,
+      AddFusedConv2DNode(ctx, conv2d_with_squeeze_and_bias, optimized_graph,
                          &invalidated_nodes);
       continue;
     }
@@ -712,6 +842,7 @@ Status Remapper::Optimize(Cluster* /*cluster*/, const GrapplerItem& item,
                          &invalidated_nodes);
       continue;
     }
+#endif  // !INTEL_MKL
 
     // Infer properties lazily in case they are not needed.
     if (!ctx.inferred_graph_properties && IsFusedBatchNormCandidate(node)) {
diff --git a/tensorflow/core/grappler/optimizers/remapper.h b/tensorflow/core/grappler/optimizers/remapper.h
index c18413e4e72bb970e1e15bca25fcc6316c5ac327..804338f4d21eeb3d48f64a933386caa114640ea6 100644
--- a/tensorflow/core/grappler/optimizers/remapper.h
+++ b/tensorflow/core/grappler/optimizers/remapper.h
@@ -26,7 +26,7 @@ namespace grappler {
 // nodes to decrease the amount of operations needed to perform a computation.
 class Remapper : public GraphOptimizer {
  public:
-  explicit Remapper(RewriterConfig::Toggle opt_level) : opt_level_(opt_level) {}
+  explicit Remapper(RewriterConfig::Toggle opt_level) {}
 
   ~Remapper() override {}
 
@@ -37,9 +37,6 @@ class Remapper : public GraphOptimizer {
 
   void Feedback(Cluster* cluster, const GrapplerItem& item,
                 const GraphDef& optimized_graph, double result) override;
-
- private:
-  RewriterConfig::Toggle opt_level_;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/shape_optimizer.h b/tensorflow/core/grappler/optimizers/shape_optimizer.h
index b7f84a1e5dbe7dd1e2d21e3752522b3f237e2d7c..d9c1fefb194ce0fe2be921d17c9aaa782aa4ee39 100644
--- a/tensorflow/core/grappler/optimizers/shape_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/shape_optimizer.h
@@ -30,9 +30,8 @@ namespace grappler {
 // information.
 class ShapeOptimizer : public GraphOptimizer {
  public:
-  ShapeOptimizer() : opt_level_(RewriterConfig::ON) {}
-  explicit ShapeOptimizer(RewriterConfig::Toggle opt_level)
-      : opt_level_(opt_level) {}
+  ShapeOptimizer() {}
+  explicit ShapeOptimizer(RewriterConfig::Toggle opt_level) {}
 
   ~ShapeOptimizer() override {}
 
@@ -43,9 +42,6 @@ class ShapeOptimizer : public GraphOptimizer {
 
   void Feedback(Cluster* cluster, const GrapplerItem& item,
                 const GraphDef& optimized_graph, double result) override;
-
- private:
-  RewriterConfig::Toggle opt_level_;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/utils.cc b/tensorflow/core/grappler/utils.cc
index 29775442629dd5a56776f2d0005f9ba50c2da84b..7d4dfb052071ce374f7361eaed19f2e94daf64e9 100644
--- a/tensorflow/core/grappler/utils.cc
+++ b/tensorflow/core/grappler/utils.cc
@@ -20,6 +20,8 @@ limitations under the License.
 #include <queue>
 #include <vector>
 
+#include "absl/strings/match.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/node_def_util.h"
@@ -38,7 +40,7 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 template <typename T>
-bool SafeSetScalarTensorValue(double value, Tensor* tensor) {
+bool SafeSetDoubleScalarTensorValue(double value, Tensor* tensor) {
   using RealType = typename Eigen::NumTraits<T>::Real;
   if (value > static_cast<double>(Eigen::NumTraits<RealType>::highest()) ||
       value < static_cast<double>(Eigen::NumTraits<RealType>::lowest())) {
@@ -48,6 +50,17 @@ bool SafeSetScalarTensorValue(double value, Tensor* tensor) {
   return true;
 }
 
+template <typename T>
+bool SafeSetIntScalarTensorValue(int value, Tensor* tensor) {
+  using RealType = typename Eigen::NumTraits<T>::Real;
+  if (value > static_cast<int>(Eigen::NumTraits<RealType>::highest()) ||
+      value < static_cast<int>(Eigen::NumTraits<RealType>::lowest())) {
+    return false;
+  }
+  tensor->flat<T>()(0) = static_cast<T>(value);
+  return true;
+}
+
 // Is 'node' an operator that consumes only the shape of its input, not the
 // data itself?
 // TODO(ezhulenev): move to op_types.h. Requires to break circular dependency.
@@ -144,11 +157,16 @@ void NodeMap::UpdateOutput(const string& node_name,
   outputs.insert(nodes_[NodeName(new_output_name)]);
 }
 
+string TensorIdToString(const TensorId& tensor_id) {
+  return tensor_id.index() == 0 ? string(tensor_id.node())
+                                : tensor_id.ToString();
+}
+
 bool IsSameInput(const string& name1, const string& name2) {
   if (name1 == name2) return true;
   TensorId tensor1 = ParseTensorName(name1);
   TensorId tensor2 = ParseTensorName(name2);
-  return tensor1.node() == tensor2.node() && tensor1.index() == tensor2.index();
+  return tensor1 == tensor2;
 }
 
 bool IsControlInput(const string& name) {
@@ -161,10 +179,10 @@ string AddPrefixToNodeName(const string& name, const string& prefix,
                            const string& delimiter) {
   if (!name.empty()) {
     if (name[0] == '^') {
-      return strings::StrCat("^", prefix, delimiter, name.substr(1));
+      return absl::StrCat("^", prefix, delimiter, name.substr(1));
     }
   }
-  return strings::StrCat(prefix, delimiter, name);
+  return absl::StrCat(prefix, delimiter, name);
 }
 
 string AddPrefixToNodeName(const string& name, const string& prefix) {
@@ -188,20 +206,26 @@ bool ExecuteWithTimeout(std::function<void()> fn, const int64 timeout_in_ms,
 }
 
 string AsControlDependency(const NodeDef& node) {
-  return strings::StrCat("^", node.name());
+  return absl::StrCat("^", node.name());
 }
 
 string AsControlDependency(const string& node_name) {
   CHECK(!node_name.empty());
   return (!node_name.empty() && node_name[0] == '^')
              ? node_name
-             : strings::StrCat("^", node_name);
+             : absl::StrCat("^", node_name);
 }
 
 bool NodeIsOnCpu(const NodeDef* node) {
   string task, device;
   return DeviceNameUtils::SplitDeviceName(node->device(), &task, &device) &&
-         str_util::StartsWith(device, DEVICE_CPU);
+         absl::StartsWith(device, DEVICE_CPU);
+}
+
+bool NodeIsOnGpu(const NodeDef* node) {
+  string task, device;
+  return DeviceNameUtils::SplitDeviceName(node->device(), &task, &device) &&
+         absl::StartsWith(device, DEVICE_GPU);
 }
 
 int NumOutputs(const NodeDef& node, GraphDef* graph) {
@@ -397,152 +421,50 @@ void EraseNodesFromGraph(const std::set<string>& nodes_to_delete,
   EraseNodesFromGraphImpl(nodes_idx_to_delete, graph);
 }
 
-Status SimpleGraphView::Initialize(
-    const GraphDef& graph,
-    const std::vector<std::pair<const NodeDef*, const NodeDef*>>*
-        extra_dependencies,
-    bool dedup_inputs, bool dedup_outputs) {
-  graph_ = &graph;
-  const int num_nodes = graph.node_size();
-  inputs_.clear();
-  inputs_.resize(num_nodes);
-  outputs_.clear();
-  outputs_.resize(num_nodes);
-  name_to_index_.clear();
-  name_to_index_.reserve(num_nodes);
-  index_to_name_.clear();
-  index_to_name_.reserve(num_nodes);
-
-  // Build map from name to index and vice versa.
-  for (int node_idx = 0; node_idx < num_nodes; ++node_idx) {
-    const NodeDef& node = graph.node(node_idx);
-    name_to_index_.emplace(node.name(), node_idx);
-    index_to_name_.push_back(node.name());
-  }
-
-  if (extra_dependencies) {
-    for (const auto& dep : *extra_dependencies) {
-      auto itr_src = name_to_index_.find(dep.first->name());
-      if (itr_src == name_to_index_.end()) {
-        return errors::InvalidArgument("Non-existent src ", dep.first->name());
-      }
-      auto itr_tgt = name_to_index_.find(dep.second->name());
-      if (itr_tgt == name_to_index_.end()) {
-        return errors::InvalidArgument("Non-existent tgt ", dep.second->name());
-      }
-      const int src_idx = itr_src->second;
-      const int tgt_idx = itr_tgt->second;
-      inputs_[tgt_idx].push_back(src_idx);
-      outputs_[src_idx].push_back(tgt_idx);
-    }
-  }
-
-  // Build forward and reverse adjacency lists.
-  for (int node_idx = 0; node_idx < num_nodes; ++node_idx) {
-    const NodeDef& node = graph.node(node_idx);
-    inputs_[node_idx].reserve(node.input_size());
-    for (const string& input : node.input()) {
-      auto it = name_to_index_.find(NodeName(input));
-      if (it == name_to_index_.end()) {
-        return errors::InvalidArgument("Non-existent input ", input,
-                                       " for node ", node.name());
-      }
-      const int input_idx = it->second;
-      inputs_[node_idx].push_back(input_idx);
-      outputs_[input_idx].push_back(node_idx);
-    }
-    if (dedup_inputs) {
-      // Dedup the input list while it's still hot in cache.
-      STLSortAndRemoveDuplicates(&inputs_[node_idx]);
-    }
-  }
-
-  // Dedup outputs.
-  if (dedup_outputs) {
-    for (int node_idx = 0; node_idx < num_nodes; ++node_idx) {
-      STLSortAndRemoveDuplicates(&outputs_[node_idx]);
-    }
-  }
-  return Status::OK();
-}
-
-void SimpleGraphView::DepthFirstSearch(
-    const std::unordered_set<string>& op_types_to_traverse, int root_node,
-    std::set<int>* nodes_found) const {
-  nodes_found->clear();
-  const string& op_type = graph_->node(root_node).op();
-  if (!op_types_to_traverse.empty() &&
-      op_types_to_traverse.find(op_type) == op_types_to_traverse.end()) {
-    return;
-  }
-  std::vector<int> stack;
-  stack.reserve(32);
-  stack.push_back(root_node);
-  while (!stack.empty()) {
-    const int node_idx = stack.back();
-    stack.pop_back();
-    nodes_found->insert(node_idx);
-    const string& op_type = graph_->node(node_idx).op();
-    if (op_types_to_traverse.empty() ||
-        op_types_to_traverse.find(op_type) != op_types_to_traverse.end()) {
-      for (auto output_idx : this->outputs(node_idx)) {
-        if (nodes_found->find(output_idx) == nodes_found->end()) {
-          stack.push_back(output_idx);
-        }
-      }
-    }
-  }
-}
-
-string SimpleGraphView::PrintToString() const {
-  string str;
-  for (int i = 0; i < num_nodes(); ++i) {
-    strings::StrAppend(&str, "Node ", i, "'", node_name(i), "'\n", "Inputs: [");
-    for (int input : inputs(i)) {
-      strings::StrAppend(&str, input, " '", node_name(input), "', ");
-    }
-    strings::StrAppend(&str, "]\n", "Outputs: [");
-    for (int j = 0; j < outputs(i).size(); ++j) {
-      const int output = outputs(i)[j];
-      if (j > 0) {
-        strings::StrAppend(&str, ", ");
-      }
-      strings::StrAppend(&str, output, " '", node_name(output), "'");
-    }
-    strings::StrAppend(&str, "]\n");
-  }
-  return str;
-}
+#define HANDLE_DOUBLE_CASE(DTYPE)                                     \
+  case DTYPE:                                                         \
+    if (!SafeSetDoubleScalarTensorValue<EnumToDataType<DTYPE>::Type>( \
+            static_cast<double>(value), tensor)) {                    \
+      return errors::InvalidArgument("Cannot store value ", value,    \
+                                     " in tensor of type " #DTYPE);   \
+    }                                                                 \
+    break
 
-#define HANDLE_CASE(DTYPE)                                          \
-  case DTYPE:                                                       \
-    if (!SafeSetScalarTensorValue<EnumToDataType<DTYPE>::Type>(     \
-            static_cast<double>(value), tensor)) {                  \
-      return errors::InvalidArgument("Cannot store value ", value,  \
-                                     " in tensor of type " #DTYPE); \
-    }                                                               \
+#define HANDLE_INT_CASE(DTYPE)                                               \
+  case DTYPE:                                                                \
+    if (!SafeSetIntScalarTensorValue<EnumToDataType<DTYPE>::Type>(value,     \
+                                                                  tensor)) { \
+      return errors::InvalidArgument("Cannot store value ", value,           \
+                                     " in tensor of type " #DTYPE);          \
+    }                                                                        \
     break
 
 Status SetTensorValue(DataType dtype, int value, Tensor* tensor) {
   // TODO(rmlarsen): Support more general shapes.
+  // TODO(lyandy): Change `value` to be int64 once int64 -> qint32 is supported.
   if (tensor->NumElements() != 1) {
     return errors::InvalidArgument(
         "Expected scalar tensor, got num_elements = ", tensor->NumElements());
   }
   switch (dtype) {
-    HANDLE_CASE(DT_HALF);
-    HANDLE_CASE(DT_BFLOAT16);
-    HANDLE_CASE(DT_BOOL);
-    HANDLE_CASE(DT_FLOAT);
-    HANDLE_CASE(DT_DOUBLE);
-    HANDLE_CASE(DT_UINT8);
-    HANDLE_CASE(DT_INT8);
-    HANDLE_CASE(DT_UINT16);
-    HANDLE_CASE(DT_INT16);
-    HANDLE_CASE(DT_INT32);
-    HANDLE_CASE(DT_INT64);
-    HANDLE_CASE(DT_COMPLEX64);
-    HANDLE_CASE(DT_COMPLEX128);
+    HANDLE_DOUBLE_CASE(DT_HALF);
+    HANDLE_DOUBLE_CASE(DT_BFLOAT16);
+    HANDLE_DOUBLE_CASE(DT_BOOL);
+    HANDLE_DOUBLE_CASE(DT_FLOAT);
+    HANDLE_DOUBLE_CASE(DT_DOUBLE);
+    HANDLE_DOUBLE_CASE(DT_UINT8);
+    HANDLE_DOUBLE_CASE(DT_INT8);
+    HANDLE_DOUBLE_CASE(DT_UINT16);
+    HANDLE_DOUBLE_CASE(DT_INT16);
+    HANDLE_DOUBLE_CASE(DT_INT32);
+    HANDLE_DOUBLE_CASE(DT_INT64);
+    HANDLE_DOUBLE_CASE(DT_COMPLEX64);
+    HANDLE_DOUBLE_CASE(DT_COMPLEX128);
+    HANDLE_INT_CASE(DT_QINT8);
+    HANDLE_INT_CASE(DT_QUINT8);
+    HANDLE_INT_CASE(DT_QINT16);
+    HANDLE_INT_CASE(DT_QUINT16);
+    HANDLE_INT_CASE(DT_QINT32);
     default:
       return errors::InvalidArgument("Unsupported type ",
                                      DataTypeString(dtype));
diff --git a/tensorflow/core/grappler/utils.h b/tensorflow/core/grappler/utils.h
index b1e2d4e9cb5bbe15508695595de4e00f7313c401..9053ae4c07dae96c96bac416cf9e175c88462c33 100644
--- a/tensorflow/core/grappler/utils.h
+++ b/tensorflow/core/grappler/utils.h
@@ -100,6 +100,10 @@ class SetVector {
   std::vector<T> vector_;
 };
 
+// Returns formatted string from TensorId specific to grappler. Specifically,
+// for the 0 port (first output), only the node name is returned.
+string TensorIdToString(const TensorId& tensor_id);
+
 // True iff 'name' refers to a control inputs, i.e. a node name prefixed with
 // the ^ character.
 bool IsControlInput(const string& name);
@@ -238,6 +242,9 @@ string AsControlDependency(const string& node);
 // Returns true if the node is assigned to run on CPU device.
 bool NodeIsOnCpu(const NodeDef* node);
 
+// Returns true if the node is assigned to run on GPU device.
+bool NodeIsOnGpu(const NodeDef* node);
+
 // Returns the number of outputs of a node according to its OpDef. Note that
 // some of the outputs may be unconnected.
 int NumOutputs(const NodeDef& node, GraphDef* graph);
@@ -298,68 +305,6 @@ void EraseNodesFromGraph(std::vector<int>&& nodes_to_delete, GraphDef* graph);
 void EraseNodesFromGraph(const std::set<string>& nodes_to_delete,
                          GraphDef* graph);
 
-class SimpleGraphView {
- public:
-  // Build a graph view for the specified graphdef.
-  Status Initialize(const GraphDef& graph) {
-    return Initialize(graph, nullptr, true, true);
-  }
-  // Build a graph view for the specified graphdef augmented with the additional
-  // edges specified in 'extra_dependencies' if any. Note that
-  // extra_dependencies can be null.
-  Status Initialize(
-      const GraphDef& graph,
-      const std::vector<std::pair<const NodeDef*, const NodeDef*>>*
-          extra_dependencies) {
-    return Initialize(graph, extra_dependencies, true, true);
-  }
-  Status Initialize(
-      const GraphDef& graph,
-      const std::vector<std::pair<const NodeDef*, const NodeDef*>>*
-          extra_dependencies,
-      bool dedup_inputs, bool dedup_outputs);
-
-  const GraphDef* graph() const { return graph_; }
-  inline int num_nodes() const { return index_to_name_.size(); }
-  inline bool has_node(const string& node_name) const {
-    return name_to_index_.find(node_name) != name_to_index_.end();
-  }
-  inline const int index(const string& node_name) const {
-    const auto& it = name_to_index_.find(node_name);
-    DCHECK(it != name_to_index_.end());
-    return it == name_to_index_.end() ? -1 : it->second;
-  }
-  inline const NodeDef& node(int node_idx) const {
-    return graph_->node(node_idx);
-  }
-  inline const string& node_name(int node_idx) const {
-    return index_to_name_[node_idx];
-  }
-  inline const gtl::InlinedVector<int, 4>& inputs(int node_idx) const {
-    return inputs_[node_idx];
-  }
-  inline const gtl::InlinedVector<int, 2>& outputs(int node_idx) const {
-    return outputs_[node_idx];
-  }
-
-  // Traverse the graph starting at `node_idx`, collecting indices of nodes
-  // visited in nodes_found. If a node has an op in `op_types_to_traverse`, the
-  // walk continues to its children. It is assumed that *graph_ was not modified
-  // after the call to Initialize().
-  // If `op_types_to_traverse` is empty the DFS will traverse any node type.
-  void DepthFirstSearch(const std::unordered_set<string>& op_types_to_traverse,
-                        int node_idx, std::set<int>* nodes_found) const;
-
-  string PrintToString() const;
-
- private:
-  const GraphDef* graph_;  // Not owned.
-  std::vector<string> index_to_name_;
-  gtl::FlatMap<string, int> name_to_index_;
-  std::vector<gtl::InlinedVector<int, 4>> inputs_;
-  std::vector<gtl::InlinedVector<int, 2>> outputs_;
-};
-
 }  // end namespace grappler
 }  // end namespace tensorflow
 
diff --git a/tensorflow/core/grappler/utils/BUILD b/tensorflow/core/grappler/utils/BUILD
index c0f19d3828ac1581a937531318ff62875fbf3bc7..1fd0a02b65e3a212780b6fdabadce98833b3ebda 100644
--- a/tensorflow/core/grappler/utils/BUILD
+++ b/tensorflow/core/grappler/utils/BUILD
@@ -48,8 +48,11 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:graph_topology_view",
+        "//tensorflow/core/grappler:graph_view",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
+        "@com_google_absl//absl/types:span",
     ],
 )
 
@@ -58,10 +61,11 @@ tf_cc_test(
     srcs = ["topological_sort_test.cc"],
     deps = [
         ":topological_sort",
-        "//tensorflow/core:lib_proto_parsing",
+        "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -101,8 +105,7 @@ cc_library(
     deps = [
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core/grappler:graph_view",
-        "//tensorflow/core/grappler:mutable_graph_view",
+        "//tensorflow/core/grappler:graph_topology_view",
         "@com_google_absl//absl/container:flat_hash_map",
     ],
 )
@@ -116,6 +119,8 @@ tf_cc_test(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -138,6 +143,7 @@ cc_library(
         "//tensorflow/core:test",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
+        "@com_google_absl//absl/algorithm:container",
     ],
 )
 
@@ -173,6 +179,10 @@ cc_library(
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/strings",
     ],
 )
@@ -191,6 +201,7 @@ tf_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
+        "@com_google_absl//absl/container:flat_hash_map",
     ],
 )
 
diff --git a/tensorflow/core/grappler/utils/functions.cc b/tensorflow/core/grappler/utils/functions.cc
index 57863a71f35f176e3935e2121f5650a58c72d642..2ec9794b68aad4b322e280eda033b26d7e592913 100644
--- a/tensorflow/core/grappler/utils/functions.cc
+++ b/tensorflow/core/grappler/utils/functions.cc
@@ -14,8 +14,9 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/grappler/utils/functions.h"
 
-#include <unordered_map>
-
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
+#include "absl/strings/str_cat.h"
 #include "absl/strings/substitute.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/function.h"
@@ -28,7 +29,6 @@ limitations under the License.
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
-#include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/strings/scanner.h"
 
 namespace tensorflow {
@@ -76,16 +76,6 @@ Status ResolveFunctionBodyNodeAttrPlaceholders(
 
 }  // namespace
 
-FunctionLibraryDefinition ReachableFunctionLibraryDefinition(
-    const FunctionLibraryDefinition& flib, const GraphDef& graph) {
-  return flib.ReachableDefinitions(graph);
-}
-
-FunctionLibraryDefinition ReachableFunctionLibraryDefinition(
-    const FunctionLibraryDefinition& flib, const FunctionDef& func) {
-  return flib.ReachableDefinitions(func);
-}
-
 void GrapplerFunctionConnectivity::RegisterInputArgExpansion(
     InputArgExpansion input_arg_expansion) {
   string input_name = input_arg_expansion.input_name;
@@ -94,7 +84,7 @@ void GrapplerFunctionConnectivity::RegisterInputArgExpansion(
   for (int i = 0; i < placeholders.size(); ++i) {
     const string& placeholder = input_arg_expansion.placeholders[i];
     input_arg_placeholders_.insert(
-        {placeholder, InputArgPlaceholder{input_name, /*input_position=*/i}});
+        {placeholder, InputArgPlaceholder{input_name, /*input_index=*/i}});
   }
   input_arg_expansions_.insert(
       {std::move(input_name), std::move(input_arg_expansion)});
@@ -193,7 +183,7 @@ Status GrapplerFunctionConnectivity::ExpandFunctionDefInput(
           // If position is not defined expand node output range
           for (int i = output_range.first; i < output_range.second; ++i) {
             graph_def_inputs->push_back(
-                i == 0 ? node_name : strings::StrCat(node_name, ":", i));
+                i == 0 ? node_name : absl::StrCat(node_name, ":", i));
           }
         } else {
           if (position > (output_range.second - output_range.first)) {
@@ -203,7 +193,7 @@ Status GrapplerFunctionConnectivity::ExpandFunctionDefInput(
           }
           int pos = output_range.first + position;
           graph_def_inputs->push_back(
-              pos == 0 ? node_name : strings::StrCat(node_name, ":", pos));
+              pos == 0 ? node_name : absl::StrCat(node_name, ":", pos));
         }
 
         return Status::OK();
@@ -232,39 +222,39 @@ Status GrapplerFunctionConnectivity::ExpandNodeInputs(
 
 Status GrapplerFunctionConnectivity::AsFunctionDefInput(
     const string& graph_def_input, string* func_def_input) const {
-  using gtl::FindOrNull;
-
   if (IsControlInput(graph_def_input)) {
     *func_def_input = graph_def_input;
     return Status::OK();
   }
 
-  int position;
-  string node_name = ParseNodeName(graph_def_input, &position);
-  CHECK_GE(position, 0);
+  const TensorId tensor = ParseTensorName(graph_def_input);
+  DCHECK_GE(tensor.index(), 0);
+
+  const absl::string_view node_name = tensor.node();
+  const int index = tensor.index();
 
   // Check if it's an input arg placeholder
-  if (position == 0) {
-    const InputArgPlaceholder* placeholder =
-        FindOrNull(input_arg_placeholders_, node_name);
-    if (placeholder != nullptr) {
-      *func_def_input = strings::StrCat(placeholder->input_name, ":",
-                                        placeholder->input_position);
+  if (tensor.index() == 0) {
+    const auto is_input_placeholder = input_arg_placeholders_.find(node_name);
+    if (is_input_placeholder != input_arg_placeholders_.end()) {
+      const InputArgPlaceholder& placeholder = is_input_placeholder->second;
+      *func_def_input =
+          absl::StrCat(placeholder.input_name, ":", placeholder.input_index);
       return Status::OK();
     }
   }
 
   // It must be output from one of the function body nodes
-  const tensorflow::NameRangeMap* outputs_range_map =
-      FindOrNull(function_body_outputs_, node_name);
-  if (outputs_range_map != nullptr) {
-    for (const auto& el : *outputs_range_map) {
+  const auto is_body_output = function_body_outputs_.find(tensor.node());
+  if (is_body_output != function_body_outputs_.end()) {
+    const tensorflow::NameRangeMap& outputs_range_map = is_body_output->second;
+
+    for (const auto& el : outputs_range_map) {
       const auto& output_name = el.first;
       const auto& output_range = el.second;
-      if (position >= output_range.first && position < output_range.second) {
-        int pos = position - output_range.first;
-        *func_def_input =
-            strings::StrCat(node_name, ":", output_name, ":", pos);
+      if (index >= output_range.first && index < output_range.second) {
+        int pos = index - output_range.first;
+        *func_def_input = absl::StrCat(node_name, ":", output_name, ":", pos);
         return Status::OK();
       }
     }
@@ -321,15 +311,14 @@ GrapplerFunctionItem::GrapplerFunctionItem(
     string func_name, string description, AttrSlice func_attr,
     std::vector<InputArgExpansion> input_arg_expansions,
     std::vector<OutputArgExpansion> output_arg_expansions,
-    std::vector<string> keep_nodes, const int graph_def_version,
+    std::vector<ControlOutput> control_outputs, const int graph_def_version,
     const bool is_stateful, GraphDef&& function_body)
     : description_(std::move(description)),
-      func_attr_(std::move(func_attr)),
+      func_attr_(func_attr),
       input_arg_expansions_(std::move(input_arg_expansions)),
       output_arg_expansions_(std::move(output_arg_expansions)),
+      control_outputs_(std::move(control_outputs)),
       is_stateful_(is_stateful) {
-  // Move assign GrapplerItem members.
-  keep_ops = std::move(keep_nodes);
   id = std::move(func_name);
   graph = std::move(function_body);
 
@@ -338,15 +327,22 @@ GrapplerFunctionItem::GrapplerFunctionItem(
   for (const InputArgExpansion& input_arg : input_arg_expansions_) {
     for (const string& placeholder : input_arg.placeholders) {
       feed.push_back({placeholder, Tensor()});
-      input_arg_placeholders_.insert(placeholder);
     }
   }
   // Fill the fetch nodes with outputs.
   for (const OutputArgExpansion& output_arg : output_arg_expansions_) {
-    for (const string& output_tensor : output_arg.output_tensors) {
-      fetch.push_back(output_tensor);
+    for (const string& output_node : output_arg.output_nodes) {
+      fetch.push_back(output_node);
     }
   }
+  // We must keep all control output nodes.
+  for (const ControlOutput& control_output : control_outputs_) {
+    keep_ops.push_back(control_output.node_name);
+  }
+
+  // Tensorflow functions execution semantics is different from the main graph,
+  // and we need to preserve it when we do graph optimizations.
+  optimization_options().allow_pruning_stateful_and_dataset_ops = false;
 }
 
 const string& GrapplerFunctionItem::description() const { return description_; }
@@ -363,11 +359,6 @@ const std::size_t GrapplerFunctionItem::input_size() const {
   return input_arg_expansions_.size();
 }
 
-bool GrapplerFunctionItem::IsInputPlaceholder(const string& node_name) const {
-  return input_arg_placeholders_.find(node_name) !=
-         input_arg_placeholders_.end();
-}
-
 const std::vector<OutputArgExpansion>& GrapplerFunctionItem::outputs() const {
   return output_arg_expansions_;
 }
@@ -380,6 +371,15 @@ const std::size_t GrapplerFunctionItem::output_size() const {
   return output_arg_expansions_.size();
 }
 
+const std::vector<ControlOutput>& GrapplerFunctionItem::control_outputs()
+    const {
+  return control_outputs_;
+}
+
+const std::size_t GrapplerFunctionItem::control_output_size() const {
+  return control_outputs_.size();
+}
+
 const AttrSlice& GrapplerFunctionItem::func_attr() const { return func_attr_; }
 
 const GraphDef& GrapplerFunctionItem::function_body() const { return graph; }
@@ -422,7 +422,7 @@ bool IsParametrized(const FunctionDef& func) {
 
 Status InstantiationTypeParameters(
     const FunctionDef& func, const AttrSlice& func_instantiation_attr,
-    std::unordered_map<string, DataType>* type_parameters) {
+    absl::flat_hash_map<string, DataType>* type_parameters) {
   if (!type_parameters->empty()) {
     return errors::InvalidArgument("Type parameters output map must be empty");
   }
@@ -450,7 +450,7 @@ Status InstantiationTypeParameters(
 
 Status InstantiationBodyParameters(
     const FunctionDef& func, const AttrSlice& func_instantiation_attr,
-    std::unordered_map<string, AttrValue>* body_parameters) {
+    absl::flat_hash_map<string, AttrValue>* body_parameters) {
   if (!body_parameters->empty()) {
     return errors::InvalidArgument("Body parameters output map must be empty");
   }
@@ -510,8 +510,7 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
 
   // Function body shares the library with the graph that instantiated it. We do
   // not need a full copy of the function library, just the reachable subset.
-  *function_body.mutable_library() =
-      ReachableFunctionLibraryDefinition(flib, func).ToProto();
+  *function_body.mutable_library() = flib.ReachableDefinitions(func).ToProto();
 
   VLOG(3) << absl::Substitute(
       "Deleted $0 unreachable functions from the Grappler function item "
@@ -521,12 +520,18 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
 
   // TODO(ezhulenev): support functions with tensor sequence inputs/outputs
 
-  // Make sure that there is no tensor sequences in outputs
+  // Make sure that there are no tensor lists in inputs or outputs.
+  for (const OpDef::ArgDef& input : signature.input_arg()) {
+    if (!input.type_list_attr().empty() || !input.number_attr().empty()) {
+      return errors::InvalidArgument(
+          "Inputs with lists of tensors are not supported. Input: ",
+          input.name());
+    }
+  }
   for (const OpDef::ArgDef& output : signature.output_arg()) {
     if (!output.type_list_attr().empty() || !output.number_attr().empty()) {
       return errors::InvalidArgument(
-          "Outputs with sequence of tensors are not supported. Unsupported "
-          "output: ",
+          "Outputs with lists of tensors are not supported. Output: ",
           output.name());
     }
   }
@@ -536,13 +541,6 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
 
   // For each input argument create a placeholder in function body.
   for (const OpDef::ArgDef& input : signature.input_arg()) {
-    if (!input.type_list_attr().empty() || !input.number_attr().empty()) {
-      return errors::InvalidArgument(
-          "Inputs with sequence of tensors are not supported. Unsupported "
-          "input: ",
-          input.name());
-    }
-
     DataType input_data_type;
     TF_RETURN_IF_ERROR(instantiation.GetArgType(input, &input_data_type));
 
@@ -561,9 +559,25 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
     inputs.push_back(std::move(input_expansion));
   }
 
-  std::vector<string> keep_nodes;
-  // Add all function nodes to the function body
+  // Keep names of all nodes in the function body to guarantee that we do not
+  // add an identity with a duplicate name.
+  absl::flat_hash_set<absl::string_view> func_body_nodes;
+
+  // Generate unique output node name: "${out_arg_name}_output_node_${index}".
+  const auto output_node_name = [&func_body_nodes](const OpDef::ArgDef& out,
+                                                   int index) -> string {
+    string name = absl::StrCat(out.name(), "_output_node_", index);
+    int i = 1;
+    while (func_body_nodes.find(name) != func_body_nodes.end()) {
+      name = absl::StrCat(out.name(), "_output_node_", index, "_", i++);
+    }
+    return name;
+  };
+
+  // Add all function nodes to the function body.
   for (const NodeDef& func_def_node : func.node_def()) {
+    func_body_nodes.insert(func_def_node.name());
+
     NodeDef* new_node = function_body.add_node();
     *new_node = func_def_node;
 
@@ -577,11 +591,6 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
     // Register node output range in a function connectivity.
     TF_RETURN_IF_ERROR(RegisterFunctionBodyOutputs(*registration, func_def_node,
                                                    &connectivity));
-
-    // Ops with side effects must be preserved in a function body.
-    if (!IsFreeOfSideEffect(func_def_node)) {
-      keep_nodes.push_back(func_def_node.name());
-    }
   }
 
   // Rewrite inputs to use GraphDef format
@@ -591,8 +600,13 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
 
   std::vector<OutputArgExpansion> outputs;
   outputs.reserve(signature.output_arg_size());
-  // Add function outputs
+
+  // For each function output argument we create an Identity node in the
+  // function body, that reads output tensor from the function body node.
   for (const OpDef::ArgDef& out : signature.output_arg()) {
+    DataType output_data_type;
+    TF_RETURN_IF_ERROR(instantiation.GetArgType(out, &output_data_type));
+
     std::vector<string> output_tensors;
     auto ret = func.ret().find(out.name());
     TF_RETURN_IF_ERROR(
@@ -602,23 +616,40 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
             // Otherwise output must be one of the function inputs
             : connectivity.ExpandFunctionDefInput(out.name(), &output_tensors));
 
-    DataType output_data_type;
-    TF_RETURN_IF_ERROR(instantiation.GetArgType(out, &output_data_type));
+    absl::InlinedVector<string, 1> output_nodes;
+    for (int i = 0; i < output_tensors.size(); ++i) {
+      const string& output_tensor = output_tensors[i];
+
+      NodeDef* identity = function_body.add_node();
+      identity->set_name(output_node_name(out, i));
+      identity->set_op("Identity");
+      (*identity->mutable_attr())["T"].set_type(output_data_type);
+      identity->add_input(output_tensor);
+
+      output_nodes.push_back(identity->name());
+    }
 
     OutputArgExpansion output{/*output_name=*/out.name(),
                               /*data_type=*/output_data_type,
                               /*is_ref=*/out.is_ref(),
-                              /*output_tensors=*/std::move(output_tensors)};
+                              /*output_nodes=*/std::move(output_nodes)};
     outputs.push_back(std::move(output));
   }
 
-  bool is_stateful = signature.is_stateful();
+  // Control outputs ensure that all side-effectful nodes in the function body
+  // will execute, even if they are not required to compute regular output args.
+  std::vector<ControlOutput> control_outputs;
+  control_outputs.reserve(func.control_ret_size());
+  for (const auto& control_ret : func.control_ret()) {
+    control_outputs.push_back({control_ret.first, control_ret.second});
+  }
 
   *item = GrapplerFunctionItem(
-      /*func_name=*/signature.name(), /*description=*/signature.description(),
+      /*func_name=*/signature.name(),
+      /*description=*/signature.description(),
       /*func_attr=*/AttrSlice(&func.attr()), std::move(inputs),
-      std::move(outputs), std::move(keep_nodes), graph_def_version, is_stateful,
-      std::move(function_body));
+      std::move(outputs), std::move(control_outputs), graph_def_version,
+      signature.is_stateful(), std::move(function_body));
   return Status::OK();
 }
 
@@ -645,7 +676,7 @@ Status RegisterGrapplerFunctionConnectivity(
   return Status::OK();
 }
 
-Status ReplaceInputWithConst(const NodeDef& input_const, int input_position,
+Status ReplaceInputWithConst(const NodeDef& input_const, int input_index,
                              GrapplerFunctionItem* item) {
   if (!IsConstant(input_const)) {
     return errors::InvalidArgument("Input node ", input_const.name(),
@@ -657,7 +688,7 @@ Status ReplaceInputWithConst(const NodeDef& input_const, int input_position,
   // Find input arg expansion and input placeholder position in it for the
   // given function input position.
   InputArgExpansion* input_arg_expansion = nullptr;
-  int placeholder_idx = input_position;
+  int placeholder_idx = input_index;
 
   for (InputArgExpansion& input : inputs) {
     if (placeholder_idx < input.placeholders.size()) {
@@ -668,14 +699,12 @@ Status ReplaceInputWithConst(const NodeDef& input_const, int input_position,
   }
 
   if (input_arg_expansion == nullptr) {
-    return errors::InvalidArgument(
-        "Input placeholder not found: input_position=", input_position,
-        " function=", item->id);
+    return errors::InvalidArgument("Input placeholder not found: input_index=",
+                                   input_index, " function=", item->id);
   }
 
   // Delete placeholder from input expansion.
   string placeholder_name = input_arg_expansion->placeholders[placeholder_idx];
-  item->input_arg_placeholders_.erase(placeholder_name);
   input_arg_expansion->placeholders.erase(
       input_arg_expansion->placeholders.begin() + placeholder_idx);
 
@@ -699,43 +728,46 @@ Status ReplaceInputWithConst(const NodeDef& input_const, int input_position,
   return Status::OK();
 }
 
-Status RemoveUnusedOutputs(const gtl::FlatSet<int>& active_outputs,
-                           GrapplerFunctionItem* item,
-                           std::vector<std::pair<int, int>>* output_mapping) {
+Status RemoveFunctionOutputs(const absl::flat_hash_set<int>& remove_outputs,
+                             GrapplerFunctionItem* item,
+                             std::vector<std::pair<int, int>>* output_mapping) {
   DCHECK(output_mapping->empty());
 
-  // Do some sanity checking of the active outputs positions.
-  for (int active_output : active_outputs) {
-    if (active_output < 0 || active_output >= item->output_size()) {
+  // Code below assumes that we do not support tensor list outputs and there is
+  // a 1-to-1 mapping between output tensor and output argument expansion.
+  for (const OutputArgExpansion& out_arg : item->outputs()) {
+    DCHECK(out_arg.output_nodes.size() == 1)
+        << "Output arg expansion must have single output";
+  }
+
+  // Do some sanity checking of the removed outputs positions.
+  for (int remove_output : remove_outputs) {
+    if (remove_output < 0 || remove_output >= item->output_size()) {
       return errors::InvalidArgument(
-          "Active output position is out of bound: active_output=",
-          active_output, " num_output_args=", item->output_size());
+          "Function output index is out of bound: index=", remove_output,
+          " max_output_index=", item->output_size());
     }
   }
 
-  gtl::FlatSet<const OutputArgExpansion*> unused_output_args;
-
-  const auto is_unused_output_arg = [&](const OutputArgExpansion& output) {
-    return unused_output_args.find(&output) != unused_output_args.end();
+  absl::flat_hash_set<const OutputArgExpansion*> remove_output_args;
+  const auto is_remove_output_arg = [&](const OutputArgExpansion& output) {
+    return remove_output_args.find(&output) != remove_output_args.end();
   };
 
   for (int i = 0; i < item->output_size(); ++i) {
     const OutputArgExpansion& output = item->output(i);
-    DCHECK(output.output_tensors.size() == 1)
-        << "Output arg expansion must have single tensor";
-
-    if (active_outputs.find(i) == active_outputs.end()) {
-      VLOG(3) << "Remove unused output: output_name=" << output.output_name
-              << " output_position=" << i;
-      unused_output_args.insert(&output);
-    } else if (!unused_output_args.empty()) {
+    if (remove_outputs.find(i) != remove_outputs.end()) {
+      VLOG(3) << "Remove functions output: output_name=" << output.output_name
+              << "(index = " << i << ")";
+      remove_output_args.insert(&output);
+    } else if (!remove_output_args.empty()) {
       // Add output mapping only if output position changed.
-      output_mapping->push_back({i, i - unused_output_args.size()});
+      output_mapping->push_back({i, i - remove_output_args.size()});
     }
   }
 
   auto& o = item->output_arg_expansions_;
-  o.erase(std::remove_if(o.begin(), o.end(), is_unused_output_arg), o.end());
+  o.erase(std::remove_if(o.begin(), o.end(), is_remove_output_arg), o.end());
 
   return Status::OK();
 }
@@ -747,6 +779,55 @@ Status MakeFunctionDef(const GrapplerFunctionItem& item,
   func->mutable_signature()->set_description(item.description());
   func->mutable_signature()->set_is_stateful(item.is_stateful());
 
+  // Keep track of placeholders that were added to the graph in place of
+  // expanded function input arguments.
+  absl::flat_hash_set<absl::string_view> input_placeholders;
+  for (const InputArgExpansion& input_arg : item.inputs()) {
+    for (const string& placeholder : input_arg.placeholders) {
+      input_placeholders.insert(placeholder);
+    }
+  }
+
+  // Keep track of identity nodes that were added to the graph in place of
+  // expanded function output arguments.
+  absl::flat_hash_set<absl::string_view> output_nodes;
+  for (const OutputArgExpansion& output_arg : item.outputs()) {
+    for (const string& output_node : output_arg.output_nodes) {
+      output_nodes.insert(output_node);
+    }
+  }
+
+  // If the output identity node was not modified by any optimizer, we can
+  // bypass it and returns the function value from its input.
+  absl::flat_hash_map<absl::string_view, string> output_tensors;
+  for (const NodeDef& func_body_node : item.function_body().node()) {
+    if (!IsIdentity(func_body_node)) continue;
+
+    const string& node_name = func_body_node.name();
+    if (output_nodes.find(node_name) != output_nodes.end()) {
+      // Grappler optimizers might optimize nodes in the fanin of the output
+      // node, and forward their control dependencies. We can't express control
+      // dependencies in a function signature, so we have to keep the node.
+      if (func_body_node.input_size() == 1) {
+        VLOG(3) << "Bypass function output node: " << node_name << " -> "
+                << func_body_node.input(0);
+        output_tensors.emplace(node_name, func_body_node.input(0));
+      } else {
+        VLOG(3) << "Keep function output node: " << node_name;
+      }
+    }
+  }
+
+  // Return output tensor name (input of the output node) if it's safe to bypass
+  // output node, otherwise returns the output node name.
+  const auto output_tensor =
+      [&output_tensors](const OutputArgExpansion& output_arg) -> const string& {
+    const string& output_node = output_arg.output_nodes[0];
+    const auto is_output_tensor = output_tensors.find(output_node);
+    return is_output_tensor == output_tensors.end() ? output_node
+                                                    : is_output_tensor->second;
+  };
+
   // Build a GrapplerFunctionConnectivity from inputs and new function body.
   GrapplerFunctionConnectivity connectivity;
   TF_RETURN_IF_ERROR(
@@ -754,8 +835,8 @@ Status MakeFunctionDef(const GrapplerFunctionItem& item,
 
   // Add function input arguments.
   for (const InputArgExpansion& input_arg : item.inputs()) {
-    CHECK(input_arg.placeholders.size() == 1)  // do some sanity checking
-        << "Inputs of tensor sequences are not supported";
+    DCHECK(input_arg.placeholders.size() == 1)  // do some sanity checking
+        << "Inputs of tensor lists are not supported";
 
     OpDef::ArgDef arg_def;
     arg_def.set_name(input_arg.input_name);
@@ -766,8 +847,8 @@ Status MakeFunctionDef(const GrapplerFunctionItem& item,
 
   // Add function output arguments.
   for (const OutputArgExpansion& output_arg : item.outputs()) {
-    CHECK(output_arg.output_tensors.size() == 1)  // do some sanity checking
-        << "Outputs of tensor sequences are not supported";
+    DCHECK(output_arg.output_nodes.size() == 1)  // do some sanity checking
+        << "Outputs of tensor lists are not supported";
 
     OpDef::ArgDef arg_def;
     arg_def.set_name(output_arg.output_name);
@@ -775,11 +856,16 @@ Status MakeFunctionDef(const GrapplerFunctionItem& item,
     arg_def.set_is_ref(output_arg.is_ref);
     *func->mutable_signature()->add_output_arg() = arg_def;
 
-    string ret;
-    for (const string& output_tensor : output_arg.output_tensors) {
-      TF_RETURN_IF_ERROR(connectivity.AsFunctionDefInput(output_tensor, &ret));
-      (*func->mutable_ret())[output_arg.output_name] = ret;
-    }
+    TF_RETURN_IF_ERROR(connectivity.AsFunctionDefInput(
+        output_tensor(output_arg),
+        &(*func->mutable_ret())[output_arg.output_name]));
+  }
+
+  // Add function control outputs.
+  for (const ControlOutput& control_out : item.control_outputs()) {
+    func->mutable_control_ret()->insert(
+        {control_out.output_name, control_out.node_name});
+    *func->mutable_signature()->add_control_output() = control_out.output_name;
   }
 
   // Copy function definition specific attributes.
@@ -790,12 +876,16 @@ Status MakeFunctionDef(const GrapplerFunctionItem& item,
   }
 
   // Copy function body nodes to the FunctionDef and update input format
-  for (const NodeDef& func_body_node : item.function_body().node()) {
-    // Do not copy input placeholders
-    if (item.IsInputPlaceholder(func_body_node.name())) continue;
+  for (const NodeDef& func_node : item.function_body().node()) {
+    const string& name = func_node.name();
+
+    // Do not copy input placeholders.
+    if (IsPlaceholder(func_node) && input_placeholders.count(name)) continue;
+    // Do not copy output nodes that we bypassed.
+    if (IsIdentity(func_node) && output_tensors.count(name)) continue;
 
     NodeDef* func_def_node = func->add_node_def();
-    *func_def_node = func_body_node;
+    *func_def_node = func_node;
     TF_RETURN_IF_ERROR(connectivity.AsFunctionDefNode(func_def_node));
   }
 
diff --git a/tensorflow/core/grappler/utils/functions.h b/tensorflow/core/grappler/utils/functions.h
index 038cf5f527e0f32cc10e123bb0cab357e5902463..d450f6a41fcf926def615c34b4acc725fae5b3d7 100644
--- a/tensorflow/core/grappler/utils/functions.h
+++ b/tensorflow/core/grappler/utils/functions.h
@@ -18,7 +18,10 @@ limitations under the License.
 
 #include <memory>
 #include <string>
-#include <unordered_map>
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
+#include "absl/container/inlined_vector.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/function.pb.h"
@@ -30,12 +33,20 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
-// Returns a copy of FunctionLibraryDefinition with subset of functions that are
-// reachable from the nodes of the graph.
-FunctionLibraryDefinition ReachableFunctionLibraryDefinition(
-    const FunctionLibraryDefinition& flib, const GraphDef& graph);
-FunctionLibraryDefinition ReachableFunctionLibraryDefinition(
-    const FunctionLibraryDefinition& flib, const FunctionDef& func);
+// WARNING(ezhulenev): Currently we do not support functions with inputs or
+// outputs instantiated into multiple tensors. This can happen if the
+// input/output type is 'T*N' or 'list(type)'. This is enforced by multiple
+// checks across this file and also function_optimizer.cc. InputArgExpansion and
+// OutputArgExpansion already support lists of tensors, but that's pretty much
+// it, all other code is written with assumption that expansions are always of
+// size 1. MakeGrapplerFunctionItem will gracefully fail with Status error.
+//
+// This is a low priority feature, because in practice we don't see a lot (any
+// at all?) functions with such arguments. Tensorflow-Eager always produces
+// functions with plain input/output arguments.
+
+// TODO(ezhulenev): Support inputs and outputs of type 'T*N'.
+// TODO(ezhulenev): Support inputs and outputs of type 'list(type)'.
 
 // Depending on the function instantiation attributes, input argument to the
 // function might be a single tensor, list of tensors of the same type, or a
@@ -44,30 +55,29 @@ FunctionLibraryDefinition ReachableFunctionLibraryDefinition(
 // InputArgExpansion keeps track of the placeholders that were added to the
 // function body in place of function inputs and a resolved input data type.
 struct InputArgExpansion {
-  // TODO(ezhulenev): Add support for functions with tensor sequence inputs of
-  // different data types.
-  // TODO(ezhulenev): Support type parametrized inputs?
-  string input_name;                 // name of the function input argument
-  DataType data_type;                // input data type
-  bool is_ref;                       // if true, inputs are required to be refs
-  std::vector<string> placeholders;  // names of placeholder nodes in the
-                                     // function body
+  string input_name;
+  DataType data_type;
+  bool is_ref;
+  absl::InlinedVector<string, 1> placeholders;
 };
 
 // Depending on the function instantiation attributes, output argument is mapped
 // to one or more outputs of one of the function body nodes.
 //
-// OutputArgExpansion keeps mapping from a function output arg to the output
-// tensors of a function body nodes and a resolved output data type
+// OutputArgExpansion keeps track of the Identity nodes that were added to the
+// function body to forward output tensors. Adding these output nodes allows
+// nested function inlining and specialization (see function optimizer).
 struct OutputArgExpansion {
-  // TODO(ezhulenev): Add support for functions with tensor sequence outputs of
-  // different data types.
-  // TODO(ezhulenev): Support type parametrized outputs?
-  string output_name;                  // name of the function output argument
-  DataType data_type;                  // output data type
-  bool is_ref;                         // if true, outputs are refs
-  std::vector<string> output_tensors;  // names of output tensor from the
-                                       // function body nodes
+  string output_name;
+  DataType data_type;
+  bool is_ref;
+  absl::InlinedVector<string, 1> output_nodes;
+};
+
+// A mapping from control output name to node name in function body graph.
+struct ControlOutput {
+  string output_name;
+  string node_name;
 };
 
 // FunctionDef uses different connectivity encoding for the function body nodes,
@@ -81,44 +91,46 @@ class GrapplerFunctionConnectivity {
   void RegisterFunctionBodyOutputs(const string& node_name,
                                    tensorflow::NameRangeMap&& outputs);
 
-  // Expand input encoded in FunctionDef format (name[:output][:position]) into
+  // Expands input encoded in FunctionDef format (name[:output][:position]) into
   // multiple inputs in GraphDef format (name[:position]).
   Status ExpandFunctionDefInput(const string& func_def_input,
                                 std::vector<string>* graph_def_inputs) const;
 
-  // Update Node inputs from FunctionDef to GraphDef format.
+  // Updates Node inputs from FunctionDef to GraphDef format.
   Status ExpandNodeInputs(NodeDef* function_body_node) const;
 
   // When expanding inputs in function def format, single input might be
   // expanded into multiple tensors. When converting back to the function def
   // format from graph def format, it's always a 1-to-1 relationship.
-  // FunctionDef built from GrapplerFunctionItem is always specialized to it's
+  // FunctionDef built from GrapplerFunctionItem is always specialized to its
   // instantiation attributes and length of input args (and node def outputs) is
   // known.
 
-  // Map from GraphDef input format to FunctionDef input format using registered
-  // input arg expansion and function body outputs.
+  // Converts input name from GraphDef format (name[:position]) to the
+  // FunctionDef input format (name[:output][:position]) using registered input
+  // arg expansion and function body outputs.
   Status AsFunctionDefInput(const string& graph_def_input,
                             string* func_def_input) const;
 
-  // Update Node inputs from GraphDef to FunctionDef format.
+  // Updates Node inputs from GraphDef to FunctionDef format.
   Status AsFunctionDefNode(NodeDef* function_body_node) const;
 
  private:
   // Mapping from input name to input arg expansion.
-  std::unordered_map<string, InputArgExpansion> input_arg_expansions_;
+  absl::flat_hash_map<string, InputArgExpansion> input_arg_expansions_;
   // Mapping from function body node name to output names range map.
-  std::unordered_map<string, tensorflow::NameRangeMap> function_body_outputs_;
+  absl::flat_hash_map<string, tensorflow::NameRangeMap> function_body_outputs_;
 
+  // For each placeholder added to the function instantiation graph, we keep a
+  // mapping back to the function input argument name and index.
   struct InputArgPlaceholder {
-    string input_name;   // Name of the function input argument.
-    int input_position;  // Index of a tensor in the function input argument
-                         // expansion, it can be greater than `0` if input
-                         // argument is a list of tensors (aka list(type)).
+    string input_name;  // Name of the function input argument.
+    int input_index;    // Index of a tensor in the function input argument
+                        // expansion, it can be greater than `0` if input
+                        // argument is a list of tensors (aka list(type)).
   };
-
   // Mapping from input arg placeholder to the function input tensor.
-  std::unordered_map<string, InputArgPlaceholder> input_arg_placeholders_;
+  absl::flat_hash_map<string, InputArgPlaceholder> input_arg_placeholders_;
 };
 
 // Get Function type attributes using attributes of a node that instantiated
@@ -147,8 +159,6 @@ class GrapplerFunctionItem : public GrapplerItem {
 
   const string& description() const;
 
-  bool IsInputPlaceholder(const string& node_name) const;
-
   const std::vector<InputArgExpansion>& inputs() const;
   const InputArgExpansion& input(int i) const;
   const std::size_t input_size() const;
@@ -157,6 +167,9 @@ class GrapplerFunctionItem : public GrapplerItem {
   const OutputArgExpansion& output(int i) const;
   const std::size_t output_size() const;
 
+  const std::vector<ControlOutput>& control_outputs() const;
+  const std::size_t control_output_size() const;
+
   const AttrSlice& func_attr() const;
   const GraphDef& function_body() const;
   GraphDef& mutable_function_body();
@@ -171,16 +184,17 @@ class GrapplerFunctionItem : public GrapplerItem {
                                          GrapplerFunctionItem*);
   friend Status ReplaceInputWithConst(const NodeDef&, int,
                                       GrapplerFunctionItem*);
-  friend Status RemoveUnusedOutputs(
-      const gtl::FlatSet<int>& active_outputs, GrapplerFunctionItem* item,
-      std::vector<std::pair<int, int>>* output_mapping);
+  friend Status RemoveFunctionOutputs(const absl::flat_hash_set<int>&,
+                                      GrapplerFunctionItem*,
+                                      std::vector<std::pair<int, int>>*);
 
   GrapplerFunctionItem(string func_name, string description,
                        AttrSlice func_attr,
                        std::vector<InputArgExpansion> input_arg_expansions,
                        std::vector<OutputArgExpansion> output_arg_expansions,
-                       std::vector<string> keep_nodes, int graph_def_version,
-                       bool is_stateful, GraphDef&& function_body);
+                       std::vector<ControlOutput> control_outputs,
+                       int graph_def_version, bool is_stateful,
+                       GraphDef&& function_body);
 
   string description_;
   AttrSlice func_attr_;  // Attributes specific to function definition that
@@ -188,17 +202,16 @@ class GrapplerFunctionItem : public GrapplerItem {
 
   std::vector<InputArgExpansion> input_arg_expansions_;
   std::vector<OutputArgExpansion> output_arg_expansions_;
+  std::vector<ControlOutput> control_outputs_;
 
-  std::set<string> input_arg_placeholders_;
-
-  bool is_stateful_;
+  bool is_stateful_ = false;
 };
 
 // Check if function input/output types are fully defined only at instantiation
-// time (parametrized by it's instantiation node).
+// time (parametrized by its instantiation node).
 bool HasParametrizedType(const FunctionDef& func);
 
-// Check if a function body is parametrized by it's instantiation node. Function
+// Check if a function body is parametrized by its instantiation node. Function
 // body is parametrized, if it has at least one node with a 'placeholder'
 // attribute.
 bool HasParametrizedBody(const FunctionDef& func);
@@ -210,14 +223,14 @@ bool IsParametrized(const FunctionDef& func);
 // caller node. Return error if type can't be resolved.
 Status InstantiationTypeParameters(
     const FunctionDef& func, const AttrSlice& func_instantiation_attr,
-    std::unordered_map<string, DataType>* type_parameters);
+    absl::flat_hash_map<string, DataType>* type_parameters);
 
 // Resolve function instantiation body parameters (values for the function body
 // attr placeholders) from the attributes of the caller node. Return error if
 // type can't be resolved.
 Status InstantiationBodyParameters(
     const FunctionDef& func, const AttrSlice& func_instantiation_attr,
-    std::unordered_map<string, AttrValue>* body_parameters);
+    absl::flat_hash_map<string, AttrValue>* body_parameters);
 
 // Register GrapplerFunctionItem input arg expansion and function body outputs
 // in the GrapplerFunctionConnectivity. Use function library definition to
@@ -227,18 +240,19 @@ Status RegisterGrapplerFunctionConnectivity(
     GrapplerFunctionConnectivity* connectivity);
 
 // Replace one of the function inputs with a constant.
-Status ReplaceInputWithConst(const NodeDef& input_const, int input_position,
+Status ReplaceInputWithConst(const NodeDef& input_const, int input_index,
                              GrapplerFunctionItem* item);
 
-// Remove function output arguments that do not have any active outputs (output
-// tensor connected to other node inputs or in a fetch set). Active outputs uses
-// GraphDef output position encoding, and multiple active outputs could
-// potentially be connected to the same output argument (in case of tensor list
-// outputs). Add output mapping for all active outputs that changed it's output
-// position (std::pair<old position, new position>).
-Status RemoveUnusedOutputs(const gtl::FlatSet<int>& active_outputs,
-                           GrapplerFunctionItem* item,
-                           std::vector<std::pair<int, int>>* output_mapping);
+// Removes outputs from instantiated grappler function item. Function node
+// outputs use GraphDef output index encoding, and multiple outputs might belong
+// to the same output argument expansion (in case of tensor list outputs). For
+// all active function outputs that changed its output index, this function adds
+// an output mapping (std::pair<old index, new index>).
+Status RemoveFunctionOutputs(const absl::flat_hash_set<int>& remove_outputs,
+                             GrapplerFunctionItem* item,
+                             std::vector<std::pair<int, int>>* output_mapping);
+
+// TODO(ezhulenev, b/120103818): Add RemoveFunctionInputs.
 
 // Make a GrapplerFunctionItem from the function definition and function
 // instantiation attributes (caller node attributes). Returns error if the given
@@ -253,7 +267,7 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
 // fully defined (no type or body parametrization).
 // TODO(ezhulenev): Support parametrized functions without fully defined
 // instantiation attributes? Do we ever want to optimize parametrized function
-// without specializing it to it's instantiation attributes (at least types)?
+// without specializing it to its instantiation attributes (at least types)?
 Status MakeGrapplerFunctionItem(const FunctionDef& func,
                                 const FunctionLibraryDefinition& flib,
                                 int graph_def_version,
diff --git a/tensorflow/core/grappler/utils/functions_test.cc b/tensorflow/core/grappler/utils/functions_test.cc
index 8639dec05a1eb8aa7afcadc20ee9f8949bfeae14..813e6a318cf69db536bb6859f1937a3366d03d70 100644
--- a/tensorflow/core/grappler/utils/functions_test.cc
+++ b/tensorflow/core/grappler/utils/functions_test.cc
@@ -14,6 +14,8 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/utils/functions.h"
+
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/framework/node_def.pb.h"
@@ -77,7 +79,7 @@ TEST_F(FunctionsTest, InstantiationParameters) {
   func_instantiation_attr["B"].set_type(DT_INT32);
   func_instantiation_attr["C"].set_type(DT_DOUBLE);
 
-  std::unordered_map<string, DataType> type_parameters;
+  absl::flat_hash_map<string, DataType> type_parameters;
   TF_EXPECT_OK(InstantiationTypeParameters(
       func, AttrSlice(&func_instantiation_attr), &type_parameters));
 
@@ -86,7 +88,7 @@ TEST_F(FunctionsTest, InstantiationParameters) {
   EXPECT_EQ(DT_INT32, type_parameters["B"]);
   EXPECT_EQ(DT_DOUBLE, type_parameters["C"]);
 
-  std::unordered_map<string, AttrValue> body_parameters;
+  absl::flat_hash_map<string, AttrValue> body_parameters;
   TF_EXPECT_OK(InstantiationBodyParameters(
       func, AttrSlice(&func_instantiation_attr), &body_parameters));
 
@@ -247,15 +249,16 @@ TEST_F(FunctionsTest, FromSimpleFunctionDef) {
                                         flib, TF_GRAPH_DEF_VERSION, &item));
 
   EXPECT_EQ("XTimesTwo", item.id);
-  EXPECT_EQ(4, item.function_body().node_size());
+  EXPECT_EQ(5, item.function_body().node_size());
 
   EXPECT_EQ(1, item.input_size());
   EXPECT_EQ("x", item.input(0).input_name);
-  EXPECT_EQ(std::vector<string>{"x"}, item.input(0).placeholders);
+  ASSERT_EQ(1, item.input(0).placeholders.size());
+  EXPECT_EQ("x", item.input(0).placeholders[0]);
 
   EXPECT_EQ(1, item.output_size());
   EXPECT_EQ("y", item.output(0).output_name);
-  EXPECT_EQ("y", item.output(0).output_tensors[0]);
+  EXPECT_EQ("y_output_node_0", item.output(0).output_nodes[0]);
 
   int count = 0;
   for (const NodeDef &node : item.function_body().node()) {
@@ -277,9 +280,13 @@ TEST_F(FunctionsTest, FromSimpleFunctionDef) {
       EXPECT_EQ(2, node.input_size());
       EXPECT_EQ("x", node.input(0));
       EXPECT_EQ("scale", node.input(1));
+    } else if (node.name() == "y_output_node_0" && ++count) {
+      EXPECT_EQ("Identity", node.op());
+      ASSERT_EQ(1, node.input_size());
+      EXPECT_EQ("y", node.input(0));
     }
   }
-  EXPECT_EQ(4, count);
+  EXPECT_EQ(5, count);
 }
 
 TEST_F(FunctionsTest, FromFunctionDefWithMultiOutputNodes) {
@@ -324,7 +331,7 @@ TEST_F(FunctionsTest, FromFunctionDefWithMultiOutputNodes) {
                                         flib, TF_GRAPH_DEF_VERSION, &item));
 
   EXPECT_EQ("SubGrad", item.id);
-  EXPECT_EQ(12, item.function_body().node_size());
+  EXPECT_EQ(14, item.function_body().node_size());
 
   ASSERT_EQ(3, item.input_size());
   EXPECT_EQ("x", item.input(0).input_name);
@@ -332,8 +339,8 @@ TEST_F(FunctionsTest, FromFunctionDefWithMultiOutputNodes) {
   EXPECT_EQ("dz", item.input(2).input_name);
 
   ASSERT_EQ(2, item.output_size());
-  EXPECT_EQ("dx", item.output(0).output_tensors[0]);
-  EXPECT_EQ("dy", item.output(1).output_tensors[0]);
+  EXPECT_EQ("dx_output_node_0", item.output(0).output_nodes[0]);
+  EXPECT_EQ("dy_output_node_0", item.output(1).output_nodes[0]);
 
   int count = 0;
   for (const NodeDef &node : item.function_body().node()) {
@@ -357,9 +364,17 @@ TEST_F(FunctionsTest, FromFunctionDefWithMultiOutputNodes) {
       EXPECT_EQ(2, node.input_size());
       EXPECT_EQ("gy", node.input(0));
       EXPECT_EQ("rx:1", node.input(1));
+    } else if (node.name() == "dx_output_node_0" && ++count) {
+      EXPECT_EQ("Identity", node.op());
+      ASSERT_EQ(1, node.input_size());
+      EXPECT_EQ("dx", node.input(0));
+    } else if (node.name() == "dy_output_node_0" && ++count) {
+      EXPECT_EQ("Identity", node.op());
+      ASSERT_EQ(1, node.input_size());
+      EXPECT_EQ("dy", node.input(0));
     }
   }
-  EXPECT_EQ(6, count);
+  EXPECT_EQ(8, count);
 }
 
 TEST_F(FunctionsTest, FromFunctionDefWithNestedFuncs) {
@@ -470,7 +485,7 @@ TEST_F(FunctionsTest, FromFunctionDefWithOutputMappings) {
                                         flib, TF_GRAPH_DEF_VERSION, &item));
 
   EXPECT_EQ(1, item.output_size());
-  EXPECT_EQ("Exp", item.output(0).output_tensors[0]);
+  EXPECT_EQ("out_output_node_0", item.output(0).output_nodes[0]);
 
   int count = 0;
   for (const NodeDef &node : item.function_body().node()) {
@@ -486,9 +501,13 @@ TEST_F(FunctionsTest, FromFunctionDefWithOutputMappings) {
       EXPECT_EQ("Exp", node.op());
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("Linear_func", node.input(0));
+    } else if (node.name() == "out_output_node_0" && ++count) {
+      EXPECT_EQ("Identity", node.op());
+      ASSERT_EQ(1, node.input_size());
+      EXPECT_EQ("Exp", node.input(0));
     }
   }
-  EXPECT_EQ(3, count);
+  EXPECT_EQ(4, count);
 }
 
 TEST_F(FunctionsTest, FromFunctionDefWithInputForwarding) {
@@ -515,27 +534,44 @@ TEST_F(FunctionsTest, FromFunctionDefWithInputForwarding) {
                                         flib, TF_GRAPH_DEF_VERSION, &item));
 
   EXPECT_EQ("ForwardInputs", item.id);
-  EXPECT_EQ(5, item.function_body().node_size());
+  EXPECT_EQ(8, item.function_body().node_size());
 
   EXPECT_EQ(3, item.output_size());
-  EXPECT_EQ("in0", item.output(0).output_tensors[0]);
-  EXPECT_EQ("arg2", item.output(1).output_tensors[0]);
-  EXPECT_EQ("arg3", item.output(2).output_tensors[0]);
+  EXPECT_EQ("out0_output_node_0", item.output(0).output_nodes[0]);
+  EXPECT_EQ("arg2_output_node_0", item.output(1).output_nodes[0]);
+  EXPECT_EQ("arg3_output_node_0", item.output(2).output_nodes[0]);
 
   int count = 0;
+
+  const auto is_arg_placeholder = [](const string &name) {
+    return name == "in0" || name == "in1" || name == "arg2" || name == "arg3" ||
+           name == "arg4";
+  };
+
   for (const NodeDef &node : item.function_body().node()) {
-    EXPECT_TRUE(node.name() == "in0" || node.name() == "in1" ||
-                node.name() == "arg2" || node.name() == "arg3" ||
-                node.name() == "arg4");
-    count++;
-    EXPECT_EQ("Placeholder", node.op());
-    if (node.name() == "arg3") {
-      EXPECT_EQ(DT_INT32, node.attr().at("dtype").type());
-    } else {
-      EXPECT_EQ(DT_FLOAT, node.attr().at("dtype").type());
+    if (is_arg_placeholder(node.name()) && node.op() == "Placeholder") {
+      count++;
+      if (node.name() == "arg3") {
+        EXPECT_EQ(DT_INT32, node.attr().at("dtype").type());
+      } else {
+        EXPECT_EQ(DT_FLOAT, node.attr().at("dtype").type());
+      }
+      continue;
+    }
+
+    EXPECT_EQ("Identity", node.op());
+    ASSERT_EQ(1, node.input_size());
+    EXPECT_TRUE(is_arg_placeholder(node.input(0)));
+
+    if (node.name() == "out0_output_node_0" && ++count) {
+      EXPECT_EQ("in0", node.input(0));
+    } else if (node.name() == "arg2_output_node_0" && ++count) {
+      EXPECT_EQ("arg2", node.input(0));
+    } else if (node.name() == "arg3_output_node_0" && ++count) {
+      EXPECT_EQ("arg3", node.input(0));
     }
   }
-  EXPECT_EQ(5, count);
+  EXPECT_EQ(8, count);
 }
 
 TEST_F(FunctionsTest, FromFunctionDefWithoutInput) {
@@ -564,16 +600,22 @@ TEST_F(FunctionsTest, FromFunctionDefWithoutInput) {
 
   EXPECT_EQ(0, item.input_size());
   EXPECT_EQ(1, item.output_size());
-  EXPECT_EQ("o", item.output(0).output_tensors[0]);
+  EXPECT_EQ("o_output_node_0", item.output(0).output_nodes[0]);
+  EXPECT_EQ(3, item.function_body().node_size());
 
-  EXPECT_EQ(2, item.function_body().node_size());
   const NodeDef &two = item.function_body().node(0);
   EXPECT_EQ("two", two.name());
   EXPECT_EQ(0, two.input_size());
+
   const NodeDef &cast = item.function_body().node(1);
   EXPECT_EQ("o", cast.name());
   EXPECT_EQ(1, cast.input_size());
   EXPECT_EQ("two", cast.input(0));
+
+  const NodeDef &retval = item.function_body().node(2);
+  EXPECT_EQ("o_output_node_0", retval.name());
+  EXPECT_EQ(1, retval.input_size());
+  EXPECT_EQ("o", retval.input(0));
 }
 
 TEST_F(FunctionsTest, FromFunctionDefWithSideEffectfulOps) {
@@ -599,8 +641,41 @@ TEST_F(FunctionsTest, FromFunctionDefWithSideEffectfulOps) {
   EXPECT_EQ(3, item.function_body().node_size());
   EXPECT_EQ(1, item.input_size());
   EXPECT_EQ(0, item.output_size());
+
+  const auto &opts = item.optimization_options();
+  EXPECT_FALSE(opts.allow_pruning_stateful_and_dataset_ops);
+}
+
+TEST_F(FunctionsTest, FromFunctionDefWithControlOutputs) {
+  const Tensor kOne = test::AsScalar<float>(1.0);
+  FunctionDef func = FunctionDefHelper::Create(
+      "WithControlOutputs", /*in_def=*/{"x: Ref(float)"}, /*out_def=*/{}, {},
+      {
+          {{"one"}, "Const", {}, {{"value", kOne}, {"dtype", DT_FLOAT}}},
+          {{"update"}, "AssignAdd", {"x", "one:output:0"}, {{"T", DT_FLOAT}}},
+      },
+      {}, {{"side_effects", "update"}});
+
+  protobuf::Map<string, AttrValue> func_instantiation_attr;
+  FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
+
+  GrapplerFunctionItem item;
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func,
+                                        AttrSlice(&func_instantiation_attr),
+                                        flib, TF_GRAPH_DEF_VERSION, &item));
+
+  EXPECT_EQ("WithControlOutputs", item.id);
+  EXPECT_EQ(3, item.function_body().node_size());
+  EXPECT_EQ(1, item.input_size());
+  EXPECT_EQ(0, item.output_size());
+
   ASSERT_EQ(1, item.keep_ops.size());
   EXPECT_EQ("update", item.keep_ops[0]);
+
+  ASSERT_EQ(1, item.control_output_size());
+  const ControlOutput &ctrl = item.control_outputs()[0];
+  EXPECT_EQ("side_effects", ctrl.output_name);
+  EXPECT_EQ("update", ctrl.node_name);
 }
 
 TEST_F(FunctionsTest, MakeFunctionDef) {
@@ -673,7 +748,7 @@ TEST_F(FunctionsTest, ReplaceInputWithConst) {
   EXPECT_EQ(2, item.input_size());
   EXPECT_EQ(1, item.output_size());
 
-  ASSERT_EQ(3, item.function_body().node_size());
+  ASSERT_EQ(4, item.function_body().node_size());
 
   const NodeDef &input_x = item.function_body().node(0);
   const NodeDef &input_y = item.function_body().node(1);
@@ -747,8 +822,9 @@ TEST_F(FunctionsTest, SwapFunctionBodyAndMakeFunctionDef) {
       {{"z", "output:z:0"}});
 
   GraphDef id_func_body = test::function::GDef(
-      {/* pass input to output through identity */
-       NDef("output", "Identity", {"x"}, {{"T", "float"}})});
+      {/* Read and return input argument through Identity node. */
+       NDef("read_x", "Identity", {"x"}, {{"T", "float"}}),
+       NDef("z_output_node_0", "Identity", {"read_x"}, {{"T", "float"}})});
 
   protobuf::Map<string, AttrValue> func_instantiation_attr;
   func_instantiation_attr["T"].set_type(DT_FLOAT);
@@ -771,29 +847,26 @@ TEST_F(FunctionsTest, SwapFunctionBodyAndMakeFunctionDef) {
   // Check that graph body was updated.
   int count = 0;
   for (const NodeDef &node : specialized.node_def()) {
-    if (node.name() == "output" && ++count) {
+    if (node.name() == "read_x" && ++count) {
       EXPECT_EQ("Identity", node.op());
       EXPECT_EQ("x:0", node.input(0));
     }
   }
   EXPECT_EQ(1, count);
 
-  // And return tensor mapping was updated with a new output name (z->output).
-  EXPECT_EQ("output:output:0", (*specialized.mutable_ret())["z"]);
+  // And return tensor mapping was updated with a new output name (z->read_x).
+  EXPECT_EQ("read_x:output:0", (*specialized.mutable_ret())["z"]);
 }
 
 TEST_F(FunctionsTest, FunctionDefGrapplerFunctionItemRoundTrip) {
-  FunctionDef func = FunctionDefHelper::Define(
-      // Name
-      "DoNothing",
-      // Args
-      {"i: int32"},
-      // Return values
-      {"o: int32"},
-      // Attr def
-      {},
-      // Nodes
-      {{{"o"}, "Identity", {"i"}, {{"T", DT_INT32}}}});
+  FunctionDef func = FunctionDefHelper::Create(
+      "DoNothing", /*in_def=*/{"i: int32"}, /*out_def*/ {"o: int32"},
+      /*attr_def*/ {},
+      {
+          {{"id"}, "Identity", {"i"}, {{"T", DT_INT32}}},
+      },
+      /*ret_def=*/{{"o", "id:output:0"}},
+      /*control_ret_def=*/{{"must_execute", "id"}});
 
   constexpr char description[] = "This is a helpful description.";
   func.mutable_signature()->set_description(description);
diff --git a/tensorflow/core/grappler/utils/grappler_test.cc b/tensorflow/core/grappler/utils/grappler_test.cc
index 576494cad55e22ba8457f30d0ea79b53f6f5de78..3a0eec68d1c6adc4236ab2e0e79c8cb66a19b098 100644
--- a/tensorflow/core/grappler/utils/grappler_test.cc
+++ b/tensorflow/core/grappler/utils/grappler_test.cc
@@ -14,7 +14,11 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/utils/grappler_test.h"
+
 #include <memory>
+
+#include "absl/algorithm/container.h"
+#include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
@@ -23,6 +27,46 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
+namespace {
+void CompareGraphNodes(protobuf::RepeatedPtrField<NodeDef>* want,
+                       protobuf::RepeatedPtrField<NodeDef>* got) {
+  auto comparator = [](const NodeDef& n1, const NodeDef& n2) -> bool {
+    return n1.name() < n2.name();
+  };
+
+  std::sort(want->begin(), want->end(), comparator);
+  std::sort(got->begin(), got->end(), comparator);
+
+  ASSERT_EQ(want->size(), got->size());
+
+  for (int i = 0; i < want->size(); ++i) {
+    NodeDef& want_node = (*want)[i];
+    NodeDef& got_node = (*got)[i];
+
+    EXPECT_EQ(want_node.op(), got_node.op());
+    EXPECT_EQ(want_node.name(), got_node.name());
+    EXPECT_EQ(want_node.device(), got_node.device());
+    ASSERT_EQ(want_node.input_size(), got_node.input_size());
+
+    // Order of control dependencies doesn't matter, so we sort them first.
+    const auto is_control = [](const string& input) -> bool {
+      return ParseTensorName(input).index() < 0;
+    };
+
+    auto want_inputs = want_node.mutable_input();
+    auto got_inputs = got_node.mutable_input();
+    std::sort(absl::c_find_if(*want_inputs, is_control), want_inputs->end());
+    std::sort(absl::c_find_if(*got_inputs, is_control), got_inputs->end());
+
+    for (int j = 0; j < want_node.input_size(); ++j) {
+      const TensorId want_tensor = ParseTensorName(want_node.input(j));
+      const TensorId got_tensor = ParseTensorName(got_node.input(j));
+      EXPECT_EQ(want_tensor.ToString(), got_tensor.ToString());
+    }
+  }
+}
+}  // namespace
+
 GrapplerTest::GrapplerTest() {
   // Turn off all the automatic optimizations to ensure that we run the graph
   // exactly as it is given to us. This ensures that we can compare the results
@@ -37,6 +81,7 @@ GrapplerTest::GrapplerTest() {
   cfg->set_debug_stripper(RewriterConfig::OFF);
   cfg->set_dependency_optimization(RewriterConfig::OFF);
   cfg->set_function_optimization(RewriterConfig::OFF);
+  cfg->set_implementation_selector(RewriterConfig::OFF);
   cfg->set_layout_optimizer(RewriterConfig::OFF);
   cfg->set_loop_optimization(RewriterConfig::OFF);
   cfg->set_pin_to_host_optimization(RewriterConfig::OFF);
@@ -94,34 +139,35 @@ NodeDef* GrapplerTest::AddNode(
 }
 
 void GrapplerTest::CompareGraphs(GraphDef want, GraphDef got) const {
-  auto comparator = [](const NodeDef& n1, const NodeDef& n2) -> bool {
-    return n1.name() < n2.name();
-  };
-  std::sort(want.mutable_node()->begin(), want.mutable_node()->end(),
-            comparator);
-  std::sort(got.mutable_node()->begin(), got.mutable_node()->end(), comparator);
+  CompareGraphNodes(want.mutable_node(), got.mutable_node());
+}
 
-  for (int i = 0; i < want.node_size(); ++i) {
-    std::sort(want.mutable_node(i)->mutable_input()->begin(),
-              want.mutable_node(i)->mutable_input()->end());
-  }
-  for (int i = 0; i < got.node_size(); ++i) {
-    std::sort(got.mutable_node(i)->mutable_input()->begin(),
-              got.mutable_node(i)->mutable_input()->end());
-  }
+void GrapplerTest::CompareFunctions(FunctionDef want, FunctionDef got) const {
+  CompareGraphNodes(want.mutable_node_def(), got.mutable_node_def());
+}
 
-  ASSERT_EQ(want.node_size(), got.node_size());
-  for (int i = 0; i < want.node_size(); ++i) {
-    EXPECT_EQ(want.node(i).op(), got.node(i).op());
-    EXPECT_EQ(want.node(i).name(), got.node(i).name());
-    EXPECT_EQ(want.node(i).device(), got.node(i).device());
+void GrapplerTest::CompareNodes(const NodeDef& want, const NodeDef& got) const {
+  EXPECT_EQ(want.name(), got.name());
+  EXPECT_EQ(want.op(), got.op());
 
-    ASSERT_EQ(want.node(i).input_size(), got.node(i).input_size());
-    for (int j = 0; j < want.node(i).input_size(); ++j) {
-      const TensorId want_tensor = ParseTensorName(want.node(i).input(j));
-      const TensorId got_tensor = ParseTensorName(got.node(i).input(j));
-      EXPECT_EQ(want_tensor.ToString(), got_tensor.ToString());
-    }
+  std::vector<string> want_inputs(want.input().begin(), want.input().end());
+  std::vector<string> got_inputs(got.input().begin(), got.input().end());
+  EXPECT_EQ(want_inputs, got_inputs);
+
+  const auto attr_name = [](const std::pair<const string, AttrValue>& attr) {
+    return attr.first;
+  };
+
+  std::vector<string> want_attrs;
+  std::vector<string> got_attrs;
+  absl::c_transform(want.attr(), std::back_inserter(want_attrs), attr_name);
+  absl::c_transform(got.attr(), std::back_inserter(got_attrs), attr_name);
+  absl::c_sort(want_attrs);
+  absl::c_sort(got_attrs);
+  EXPECT_EQ(want_attrs, got_attrs);
+
+  for (const string& attr : want_attrs) {
+    EXPECT_TRUE(AreAttrValuesEqual(want.attr().at(attr), got.attr().at(attr)));
   }
 }
 
diff --git a/tensorflow/core/grappler/utils/grappler_test.h b/tensorflow/core/grappler/utils/grappler_test.h
index 0cfd740dcbe15e0571bc159858c0ed33c2071cb8..26c1db37405a48a7252f388a3e659b8d07c569ae 100644
--- a/tensorflow/core/grappler/utils/grappler_test.h
+++ b/tensorflow/core/grappler/utils/grappler_test.h
@@ -49,13 +49,32 @@ class GrapplerTest : public ::testing::Test {
                    const std::vector<std::pair<string, AttrValue>>& attributes,
                    GraphDef* graph) const;
 
+  // Checks if two graphs are equal. Both graphs must have the same set of nodes
+  // with the same inputs and attributes. Nodes can be in different order.
+  //
+  // NOTE: This function uses EXPECT/ASSERT macros to check node properties
+  // equality, and adds all failuires to the current test.
   void CompareGraphs(GraphDef want, GraphDef got) const;
 
-  // Check if node 'src' is directly connected to the input($position) of 'dst'.
+  // Checks if two nodes have the same name, op, inputs and attributes.
+  //
+  // NOTE: This function uses EXPECT/ASSERT macros to check node properties
+  // equality, and adds all failuires to the current test.
+  void CompareNodes(const NodeDef& want, const NodeDef& got) const;
+
+  // Checks if two functions are equal. Both functions must have the same set of
+  // nodes with the same inputs and attributes. Nodes can be in different order.
+  //
+  // NOTE: This function uses EXPECT/ASSERT macros to check node properties
+  // equality, and adds all failures to the current test.
+  void CompareFunctions(FunctionDef want, FunctionDef got) const;
+
+  // Checks if node 'src' is directly connected to the input($position) of
+  // 'dst'.
   bool IsNodesDirectlyConnected(const NodeMap& node_map, const string& src,
                                 const string& dst, int position = 0);
 
-  // Count nodes of the given op-type in a graph.
+  // Counts nodes of the given op-type in a graph.
   int CountOpNodes(const GraphDef& graph, const string& op);
 
   // Get a random tensor with given shape.
diff --git a/tensorflow/core/grappler/utils/topological_sort.cc b/tensorflow/core/grappler/utils/topological_sort.cc
index 63ca92c69e1c11a90e7870f1509228d90239fa72..a6d0f5037bb35cbbb909cbb4049153f0d1013c64 100644
--- a/tensorflow/core/grappler/utils/topological_sort.cc
+++ b/tensorflow/core/grappler/utils/topological_sort.cc
@@ -14,10 +14,15 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/utils/topological_sort.h"
+
 #include <algorithm>
 #include <deque>
 #include <unordered_map>
+
+#include "absl/types/span.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/graph_topology_view.h"
+#include "tensorflow/core/grappler/graph_view.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -25,27 +30,46 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
+namespace {
+
+std::vector<GraphView::Edge> MakeEphemeralEdges(
+    const absl::Span<const TopologicalDependency> extra_dependencies) {
+  std::vector<GraphView::Edge> ephemeral_edges;
+  ephemeral_edges.reserve(extra_dependencies.size());
+  for (const auto& dep : extra_dependencies) {
+    ephemeral_edges.emplace_back(
+        GraphView::OutputPort(dep.from, Graph::kControlSlot),
+        GraphView::InputPort(dep.to, Graph::kControlSlot));
+  }
+  return ephemeral_edges;
+}
+
 // Kahn's algorithm is implemented.
 // For details, see https://en.wikipedia.org/wiki/Topological_sorting
 Status ComputeTopologicalOrder(
-    const GraphDef& graph, std::vector<int>* ready_nodes,
-    const std::vector<std::pair<const NodeDef*, const NodeDef*>>*
-        extra_dependencies) {
-  SimpleGraphView graph_view;
-  TF_RETURN_IF_ERROR(graph_view.Initialize(graph, extra_dependencies));
+    const GraphDef& graph,
+    const absl::Span<const TopologicalDependency> extra_dependencies,
+    std::vector<int>* ready_nodes) {
+  GraphTopologyView graph_view;
+  TF_RETURN_IF_ERROR(graph_view.InitializeFromGraph(
+      graph, MakeEphemeralEdges(extra_dependencies)));
+
+  // Keep track of how many inputs are ready for the given node.
+  std::vector<int> num_ready_inputs(graph.node_size(), 0);
 
-  ready_nodes->reserve(graph_view.num_nodes());
+  // We'll push index of ready nodes to this output vector.
+  ready_nodes->reserve(graph.node_size());
 
   int front = 0;
   int back = 0;
-  std::vector<int> num_ready_inputs(graph_view.num_nodes(), 0);
-  for (int i = 0; i < graph_view.num_nodes(); i++) {
-    if (graph_view.inputs(i).empty()) {
+
+  for (int i = 0; i < graph.node_size(); i++) {
+    if (graph_view.GetFanin(i).empty()) {
       ready_nodes->push_back(i);
       back++;
     }
     if (IsMerge(graph.node(i))) {
-      for (int input : graph_view.inputs(i)) {
+      for (int input : graph_view.GetFanin(i)) {
         if (IsNextIteration(graph.node(input))) {
           num_ready_inputs[i]++;
         }
@@ -55,9 +79,9 @@ Status ComputeTopologicalOrder(
 
   while (front != back) {
     int ready_node = (*ready_nodes)[front];
-    for (int fanout : graph_view.outputs(ready_node)) {
+    for (int fanout : graph_view.GetFanout(ready_node)) {
       ++num_ready_inputs[fanout];
-      if (num_ready_inputs[fanout] == graph_view.inputs(fanout).size()) {
+      if (num_ready_inputs[fanout] == graph_view.GetFanin(fanout).size()) {
         ready_nodes->push_back(fanout);
         ++back;
       }
@@ -72,23 +96,32 @@ Status ComputeTopologicalOrder(
   return Status::OK();
 }
 
+}  // namespace
+
 Status ComputeTopologicalOrder(
-    const GraphDef& graph, std::unordered_map<const NodeDef*, int>* topo_order,
-    const std::vector<std::pair<const NodeDef*, const NodeDef*>>*
-        extra_dependencies) {
+    const GraphDef& graph,
+    const absl::Span<const TopologicalDependency> extra_dependencies,
+    std::vector<const NodeDef*>* topo_order) {
   std::vector<int> ready_nodes;
   TF_RETURN_IF_ERROR(
-      ComputeTopologicalOrder(graph, &ready_nodes, extra_dependencies));
-  topo_order->reserve(graph.node_size());
-  for (int i = 0; i < ready_nodes.size(); ++i) {
-    (*topo_order)[&graph.node(ready_nodes[i])] = i;
+      ComputeTopologicalOrder(graph, extra_dependencies, &ready_nodes));
+
+  topo_order->reserve(ready_nodes.size());
+  for (int ready_node_idx : ready_nodes) {
+    topo_order->emplace_back(&graph.node(ready_node_idx));
   }
+
   return Status::OK();
 }
 
+Status ComputeTopologicalOrder(const GraphDef& graph,
+                               std::vector<const NodeDef*>* topo_order) {
+  return ComputeTopologicalOrder(graph, {}, topo_order);
+}
+
 Status ReversedTopologicalSort(GraphDef* graph) {
   std::vector<int> ready_nodes;
-  TF_RETURN_IF_ERROR(ComputeTopologicalOrder(*graph, &ready_nodes, nullptr));
+  TF_RETURN_IF_ERROR(ComputeTopologicalOrder(*graph, {}, &ready_nodes));
   std::reverse(ready_nodes.begin(), ready_nodes.end());
   PermuteNodesInPlace(graph, &ready_nodes, /*invert_permutation=*/true);
   return Status::OK();
@@ -96,7 +129,7 @@ Status ReversedTopologicalSort(GraphDef* graph) {
 
 Status TopologicalSort(GraphDef* graph) {
   std::vector<int> ready_nodes;
-  TF_RETURN_IF_ERROR(ComputeTopologicalOrder(*graph, &ready_nodes, nullptr));
+  TF_RETURN_IF_ERROR(ComputeTopologicalOrder(*graph, {}, &ready_nodes));
   PermuteNodesInPlace(graph, &ready_nodes, /*invert_permutation=*/true);
   return Status::OK();
 }
diff --git a/tensorflow/core/grappler/utils/topological_sort.h b/tensorflow/core/grappler/utils/topological_sort.h
index b8cf897a321877bc73946907aa11b8b2c20255e9..dd4208dfff3b28f2b55f71e0cf369b655d6f8c09 100644
--- a/tensorflow/core/grappler/utils/topological_sort.h
+++ b/tensorflow/core/grappler/utils/topological_sort.h
@@ -16,22 +16,40 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_GRAPPLER_UTILS_TOPOLOGICAL_SORT_H_
 #define TENSORFLOW_CORE_GRAPPLER_UTILS_TOPOLOGICAL_SORT_H_
 
+#include "absl/types/span.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
 namespace grappler {
 
-// Compute a topological ordering for the graph nodes.
+// TODO(ezhulenev, b/121379902): We should be consistent with GraphTopologyView
+// and use `GraphView::Edge` to pass extra dependencies.
+struct TopologicalDependency {
+  TopologicalDependency(const NodeDef* from, const NodeDef* to)
+      : from(from), to(to) {}
+  const NodeDef* from;
+  const NodeDef* to;
+};
+
+// Computes a topological ordering for the graph nodes and outputs nodes in the
+// topological order to the `topo_order` output argument.
+//
+// It's possible to pass additional edges that do not exists in a graph, but
+// must be respected when computing graph topological order. Example: Tensorflow
+// runtime allows concurrent execution of dequeue/enqueue ops from the same
+// queue resource, but we might want to enforce ordering between them.
 Status ComputeTopologicalOrder(
-    const GraphDef& graph, std::unordered_map<const NodeDef*, int>* topo_order,
-    const std::vector<std::pair<const NodeDef*, const NodeDef*>>*
-        extra_dependencies);
+    const GraphDef& graph,
+    absl::Span<const TopologicalDependency> extra_dependencies,
+    std::vector<const NodeDef*>* topo_order);
+Status ComputeTopologicalOrder(const GraphDef& graph,
+                               std::vector<const NodeDef*>* topo_order);
 
-// Sort a graph in topological order.
+// Sorts a graph in topological order.
 Status TopologicalSort(GraphDef* graph);
 
-// Sort a graph in topological order and reverse it.
+// Sorts a graph in topological order and reverse it.
 Status ReversedTopologicalSort(GraphDef* graph);
 
 }  // namespace grappler
diff --git a/tensorflow/core/grappler/utils/topological_sort_test.cc b/tensorflow/core/grappler/utils/topological_sort_test.cc
index 48b7eb50bd9f2a4867e68291588d2e5c11a0c5c2..3868183c62d0dbdb09a65996b9de79b7a6001ca3 100644
--- a/tensorflow/core/grappler/utils/topological_sort_test.cc
+++ b/tensorflow/core/grappler/utils/topological_sort_test.cc
@@ -14,79 +14,94 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/utils/topological_sort.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/random/philox_random.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
 
 namespace tensorflow {
 namespace grappler {
-namespace {
 
 class TopologicalSortTest : public ::testing::Test {
  protected:
-  static NodeDef CreateNode(const string& name,
-                            const std::vector<string>& inputs) {
-    return CreateNode(name, "", inputs);
-  }
-  static NodeDef CreateNode(const string& name, const string& op,
-                            const std::vector<string>& inputs) {
-    NodeDef node;
-    node.set_name(name);
-    if (!op.empty()) {
-      node.set_op(op);
+  struct NodeConfig {
+    NodeConfig(string name, std::vector<string> inputs)
+        : name(std::move(name)), inputs(std::move(inputs)) {}
+    NodeConfig(string name, string op, std::vector<string> inputs)
+        : name(std::move(name)), op(std::move(op)), inputs(std::move(inputs)) {}
+
+    string name;
+    string op;
+    std::vector<string> inputs;
+  };
+
+  static GraphDef CreateGraph(const std::vector<NodeConfig>& nodes) {
+    GraphDef graph;
+
+    for (const NodeConfig& node : nodes) {
+      NodeDef node_def;
+      node_def.set_name(node.name);
+      node_def.set_op(node.op);
+      for (const string& input : node.inputs) {
+        node_def.add_input(input);
+      }
+      *graph.add_node() = std::move(node_def);
     }
-    for (const string& input : inputs) {
-      node.add_input(input);
-    }
-    return node;
+
+    return graph;
   }
 };
 
 TEST_F(TopologicalSortTest, NoLoop) {
-  GraphDef graph;
-  *graph.add_node() = CreateNode("2", {"5"});
-  *graph.add_node() = CreateNode("0", {"5", "4"});
-  *graph.add_node() = CreateNode("1", {"4", "3"});
-  *graph.add_node() = CreateNode("3", {"2"});
-  *graph.add_node() = CreateNode("5", {});
-  *graph.add_node() = CreateNode("4", {});
-
-  std::unordered_map<const NodeDef*, int> topo_order;
-  TF_EXPECT_OK(ComputeTopologicalOrder(graph, &topo_order, nullptr));
+  GraphDef graph = CreateGraph({
+      {"2", {"5"}},       //
+      {"0", {"5", "4"}},  //
+      {"1", {"4", "3"}},  //
+      {"3", {"2"}},       //
+      {"5", {}},          //
+      {"4", {}}           //
+  });
+
+  std::vector<const NodeDef*> topo_order;
+  TF_EXPECT_OK(ComputeTopologicalOrder(graph, &topo_order));
 
   const std::vector<string> order = {"5", "4", "2", "0", "3", "1"};
-  for (const auto& topo : topo_order) {
-    const string& node_name = topo.first->name();
-    const int topo_order = topo.second;
-    std::cout << "Node " << node_name << " at order " << topo_order
-              << std::endl;
-    EXPECT_EQ(node_name, order[topo_order]);
+
+  ASSERT_EQ(topo_order.size(), order.size());
+  for (int i = 0; i < topo_order.size(); ++i) {
+    const NodeDef* node = topo_order[i];
+    EXPECT_EQ(node->name(), order[i]);
   }
 
   TF_EXPECT_OK(TopologicalSort(&graph));
-  for (int i = 0; i < order.size(); i++) {
+  for (int i = 0; i < topo_order.size(); i++) {
     EXPECT_EQ(graph.node(i).name(), order[i]);
   }
 }
 
 TEST_F(TopologicalSortTest, WithLoop) {
-  GraphDef graph;
-  // Create a loop
-  *graph.add_node() = CreateNode("2", "Merge", {"1", "5"});
-  *graph.add_node() = CreateNode("3", "Switch", {"2"});
-  *graph.add_node() = CreateNode("4", "Identity", {"3"});
-  *graph.add_node() = CreateNode("5", "NextIteration", {"4"});
-  *graph.add_node() = CreateNode("1", {});
-
-  std::unordered_map<const NodeDef*, int> topo_order;
-  TF_EXPECT_OK(ComputeTopologicalOrder(graph, &topo_order, nullptr));
+  GraphDef graph = CreateGraph({
+      // Graph with a loop.
+      {"2", "Merge", {"1", "5"}},     //
+      {"3", "Switch", {"2"}},         //
+      {"4", "Identity", {"3"}},       //
+      {"5", "NextIteration", {"4"}},  //
+      {"1", {}}                       //
+  });
+
+  std::vector<const NodeDef*> topo_order;
+  TF_EXPECT_OK(ComputeTopologicalOrder(graph, &topo_order));
 
   const std::vector<string> order = {"1", "2", "3", "4", "5"};
-  for (const auto& topo : topo_order) {
-    const string& node_name = topo.first->name();
-    const int topo_order = topo.second;
-    EXPECT_EQ(node_name, order[topo_order]);
+
+  ASSERT_EQ(topo_order.size(), order.size());
+  for (int i = 0; i < topo_order.size(); ++i) {
+    const NodeDef* node = topo_order[i];
+    EXPECT_EQ(node->name(), order[i]);
   }
 
   TF_EXPECT_OK(TopologicalSort(&graph));
@@ -96,12 +111,13 @@ TEST_F(TopologicalSortTest, WithLoop) {
 }
 
 TEST_F(TopologicalSortTest, WithIllegalLoop) {
-  GraphDef graph;
   // A loop without Merge and NextIteration is illegal and the original node
   // order and graph will be preserved.
-  *graph.add_node() = CreateNode("2", {"1", "3"});
-  *graph.add_node() = CreateNode("3", {"2"});
-  *graph.add_node() = CreateNode("1", {});
+  GraphDef graph = CreateGraph({
+      {"2", {"1", "3"}},  //
+      {"3", {"2"}},       //
+      {"1", {}}           //
+  });
 
   EXPECT_FALSE(TopologicalSort(&graph).ok());
   std::vector<string> order = {"2", "3", "1"};
@@ -111,9 +127,10 @@ TEST_F(TopologicalSortTest, WithIllegalLoop) {
 }
 
 TEST_F(TopologicalSortTest, DuplicatedInputs) {
-  GraphDef graph;
-  *graph.add_node() = CreateNode("2", {"1", "1"});
-  *graph.add_node() = CreateNode("1", {});
+  GraphDef graph = CreateGraph({
+      {"2", {"1", "1"}},  //
+      {"1", {}}           //
+  });
 
   TF_EXPECT_OK(TopologicalSort(&graph));
   std::vector<string> order = {"1", "2"};
@@ -123,12 +140,13 @@ TEST_F(TopologicalSortTest, DuplicatedInputs) {
 }
 
 TEST_F(TopologicalSortTest, Idempotent) {
-  GraphDef graph;
-  *graph.add_node() = CreateNode("1", {});
-  *graph.add_node() = CreateNode("2", {});
-  *graph.add_node() = CreateNode("3", {"1", "2"});
-  *graph.add_node() = CreateNode("4", {"1", "3"});
-  *graph.add_node() = CreateNode("5", {"2", "3"});
+  GraphDef graph = CreateGraph({
+      {"1", {}},          //
+      {"2", {}},          //
+      {"3", {"1", "2"}},  //
+      {"4", {"1", "3"}},  //
+      {"5", {"2", "3"}}   //
+  });
 
   TF_EXPECT_OK(TopologicalSort(&graph));
   std::vector<string> order = {"1", "2", "3", "4", "5"};
@@ -136,7 +154,7 @@ TEST_F(TopologicalSortTest, Idempotent) {
     EXPECT_EQ(graph.node(i).name(), order[i]);
   }
 
-  // Run topo sort again to verify that it is idenpotent.
+  // Run topo sort again to verify that it is idempotent.
   TF_EXPECT_OK(TopologicalSort(&graph));
   for (int i = 0; i < order.size(); i++) {
     EXPECT_EQ(graph.node(i).name(), order[i]);
@@ -144,35 +162,81 @@ TEST_F(TopologicalSortTest, Idempotent) {
 }
 
 TEST_F(TopologicalSortTest, ExtraDependencies) {
-  GraphDef graph;
-  *graph.add_node() = CreateNode("2", {"5"});
-  *graph.add_node() = CreateNode("0", {"5", "4"});
-  *graph.add_node() = CreateNode("1", {"4", "3"});
-  *graph.add_node() = CreateNode("3", {"2"});
-  *graph.add_node() = CreateNode("5", {});
-  *graph.add_node() = CreateNode("4", {});
+  GraphDef graph = CreateGraph({
+      {"2", {"5"}},       //
+      {"0", {"5", "4"}},  //
+      {"1", {"4", "3"}},  //
+      {"3", {"2"}},       //
+      {"5", {}},          //
+      {"4", {}}           //
+  });
 
   // Add an edge from 4 to 5.
-  std::vector<std::pair<const NodeDef*, const NodeDef*>> extra_dependencies;
-  extra_dependencies.emplace_back(&graph.node(5), &graph.node(4));
-
-  std::unordered_map<const NodeDef*, int> topo_order;
-  TF_EXPECT_OK(
-      ComputeTopologicalOrder(graph, &topo_order, &extra_dependencies));
-
-  const std::vector<string> order = {"4", "5", "2", "0", "3", "1"};
-  for (const auto& topo : topo_order) {
-    const string& node_name = topo.first->name();
-    const int topo_order = topo.second;
-    EXPECT_EQ(node_name, order[topo_order]);
+  std::vector<TopologicalDependency> extra_dependencies;
+  extra_dependencies.push_back({&graph.node(5), &graph.node(4)});
+
+  std::vector<const NodeDef*> topo_order;
+  TF_EXPECT_OK(ComputeTopologicalOrder(graph, extra_dependencies, &topo_order));
+
+  const std::vector<string> valid_order_1 = {"4", "5", "2", "0", "3", "1"};
+  const std::vector<string> valid_order_2 = {"4", "5", "0", "2", "3", "1"};
+
+  ASSERT_EQ(topo_order.size(), valid_order_1.size());
+
+  std::vector<string> computed_order(6, "");
+  for (int i = 0; i < topo_order.size(); ++i) {
+    const NodeDef* node = topo_order[i];
+    computed_order[i] = node->name();
   }
+  EXPECT_TRUE(computed_order == valid_order_1 ||
+              computed_order == valid_order_2);
 
-  // Add an edge from 0 to 4. This will create a loop
-  extra_dependencies.emplace_back(&graph.node(1), &graph.node(5));
+  // Add an edge from `0` to `4`. This will create a loop.
+  extra_dependencies.push_back({&graph.node(1), &graph.node(5)});
   EXPECT_FALSE(
-      ComputeTopologicalOrder(graph, &topo_order, &extra_dependencies).ok());
+      ComputeTopologicalOrder(graph, extra_dependencies, &topo_order).ok());
+}
+
+static void BM_ComputeTopologicalOrder(int iters, int size) {
+  testing::StopTiming();
+
+  random::PhiloxRandom philox(0x12345);
+  random::SimplePhilox rnd(&philox);
+
+  string prefix = "long_node_name_prefix_to_measure_string_copy_overhead";
+
+  GraphDef graph;
+  for (int i = 0; i < size; ++i) {
+    const string name = absl::StrCat(prefix, i);
+    const uint32 num_inputs = rnd.Uniform(std::min(i, 5));
+
+    NodeDef node;
+    node.set_name(name);
+    for (int n = 0; n < num_inputs; ++n) {
+      const uint32 input_node = rnd.Uniform(i);
+      node.add_input(absl::StrCat(prefix, input_node));
+    }
+
+    *graph.add_node() = std::move(node);
+  }
+
+  testing::StartTiming();
+  std::vector<const NodeDef*> topo_order;
+  for (int i = 0; i < iters; i++) {
+    topo_order.clear();
+    Status st = ComputeTopologicalOrder(graph, &topo_order);
+    CHECK(st.ok()) << "Failed to compute topological order";
+  }
+  testing::StopTiming();
 }
+BENCHMARK(BM_ComputeTopologicalOrder)
+    ->Arg(10)
+    ->Arg(100)
+    ->Arg(1000)
+    ->Arg(10000)
+    ->Arg(25000)
+    ->Arg(50000)
+    ->Arg(100000);
 
-}  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/traversal.cc b/tensorflow/core/grappler/utils/traversal.cc
index 6952277568676baf5812a20c4c743356eeedd40a..c602e8c0e47723b4e6ad68431e5b08b8314d1c95 100644
--- a/tensorflow/core/grappler/utils/traversal.cc
+++ b/tensorflow/core/grappler/utils/traversal.cc
@@ -17,89 +17,109 @@ limitations under the License.
 
 #include "absl/container/flat_hash_map.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/graph_topology_view.h"
 
 namespace tensorflow {
 namespace grappler {
 
 namespace {
 
-template <typename GraphViewType>
-void ReverseDfsInternal(
-    const GraphViewType& graph_view, const std::vector<const NodeDef*>& from,
-    const std::function<void(const NodeDef*)>& pre_order,
-    const std::function<void(const NodeDef*)>& post_order,
-    const std::function<void(const NodeDef*, const NodeDef*)>& on_back_edge) {
-  // Stack of work to do.
-  struct StackElem {
-    const NodeDef* node;
-    bool children_visited;
-    const NodeDef* src;
-  };
-  std::vector<StackElem> stack;
+struct DfsStackElem {
+  DfsStackElem(int node, bool children_visited, int src)
+      : node(node), children_visited(children_visited), src(src) {}
+  explicit DfsStackElem(int node) : DfsStackElem(node, false, -1) {}
 
+  // Index of the node in the graph ∊ [0, num_nodes).
+  int node;
+  // `True` if visited all the input/output nodes (pushed all input/output nodes
+  // to the stack).
+  bool children_visited;
+  // Index of the node in the graph, from which we entered the `node`.
+  int src;
+};
+
+enum class NodeState { kNotVisited, kVisiting, kDone };
+
+}  // namespace
+
+void DfsTraversal(const GraphTopologyView& graph_view,
+                  const absl::Span<const NodeDef* const> from,
+                  const TraversalDirection direction,
+                  const DfsPredicates& predicates,
+                  const DfsCallbacks& callbacks) {
+  std::vector<DfsStackElem> stack;
   stack.reserve(from.size());
+
   for (const NodeDef* node : from) {
-    stack.push_back(StackElem{node, false});
+    const absl::optional<int> node_idx = graph_view.GetNodeIndex(*node);
+    DCHECK(node_idx.has_value()) << "Illegal start node: " << node->name();
+    if (node_idx.has_value()) {
+      stack.emplace_back(node_idx.value());
+    }
   }
 
-  enum NodeState { NOT_VISITED = 0, VISITING = 1, DONE = 2 };
-  absl::flat_hash_map<const NodeDef*, NodeState> node_state;
+  absl::flat_hash_map<int, NodeState> node_state;
   while (!stack.empty()) {
-    StackElem w = stack.back();
+    DfsStackElem w = stack.back();
     stack.pop_back();
 
+    NodeState& state = node_state[w.node];
+    if (state == NodeState::kDone) continue;
+
+    // Skip nodes that we should not enter.
+    if (predicates.enter && !predicates.enter(graph_view.GetNode(w.node))) {
+      state = NodeState::kDone;
+      continue;
+    }
+
+    // We've processed all the children of this node.
     if (w.children_visited) {
-      // We've processed all the children of this node
-      node_state[w.node] = DONE;
-      if (post_order) {
-        post_order(w.node);
+      state = NodeState::kDone;
+      if (callbacks.post_order) {
+        callbacks.post_order(graph_view.GetNode(w.node));
       }
       continue;
     }
 
-    auto& rslt = node_state[w.node];
-    if (rslt == DONE) {
-      continue;
-    } else if (rslt == VISITING) {
-      // Loop detected
-      if (on_back_edge) {
-        on_back_edge(w.src, w.node);
+    // Loop detected.
+    if (state == NodeState::kVisiting) {
+      if (callbacks.on_back_edge) {
+        callbacks.on_back_edge(graph_view.GetNode(w.src),
+                               graph_view.GetNode(w.node));
       }
       continue;
     }
-    rslt = VISITING;
-    if (pre_order) {
-      pre_order(w.node);
+
+    state = NodeState::kVisiting;
+    if (callbacks.pre_order) {
+      callbacks.pre_order(graph_view.GetNode(w.node));
     }
 
     // Enqueue the node again with the children_visited flag set to true.
-    stack.push_back(StackElem{w.node, true, w.src});
+    stack.emplace_back(w.node, true, w.src);
 
-    // Now enqueue the node children.
-    for (const auto fanin : graph_view.GetFanins(*w.node, true)) {
-      stack.push_back(StackElem{fanin.node, false, w.node});
+    // Check if we can continue traversal from the current node.
+    if (predicates.advance && !predicates.advance(graph_view.GetNode(w.node))) {
+      continue;
     }
-  }
-}
-
-}  // namespace
 
-void ReverseDfs(
-    const GraphView& graph_view, const std::vector<const NodeDef*>& from,
-    const std::function<void(const NodeDef*)>& pre_order,
-    const std::function<void(const NodeDef*)>& post_order,
-    const std::function<void(const NodeDef*, const NodeDef*)>& on_back_edge) {
-  ReverseDfsInternal<GraphView>(graph_view, from, pre_order, post_order,
-                                on_back_edge);
+    // Now enqueue the fanin/fanout nodes.
+    if (direction == TraversalDirection::kFollowInputs) {
+      for (const int fanin : graph_view.GetFanin(w.node)) {
+        stack.emplace_back(fanin, false, w.node);
+      }
+    } else {
+      for (const int fanout : graph_view.GetFanout(w.node)) {
+        stack.emplace_back(fanout, false, w.node);
+      }
+    }
+  }
 }
 
-void ReverseDfs(
-    const MutableGraphView& graph_view, const std::vector<const NodeDef*>& from,
-    const std::function<void(const NodeDef*)>& pre_order,
-    const std::function<void(const NodeDef*)>& post_order,
-    const std::function<void(const NodeDef*, const NodeDef*)>& on_back_edge) {
-  ReverseDfsInternal<MutableGraphView>(graph_view, from, pre_order, post_order,
-                                       on_back_edge);
+void DfsTraversal(const GraphTopologyView& graph_view,
+                  const absl::Span<const NodeDef* const> from,
+                  TraversalDirection direction, const DfsCallbacks& callbacks) {
+  DfsTraversal(graph_view, from, direction, {}, callbacks);
 }
 
 }  // namespace grappler
diff --git a/tensorflow/core/grappler/utils/traversal.h b/tensorflow/core/grappler/utils/traversal.h
index 5b7737f97eb1f8ee56efd599d6216dc4e472febd..5c9dada4933ff803c9f53fec44f74104daec11f6 100644
--- a/tensorflow/core/grappler/utils/traversal.h
+++ b/tensorflow/core/grappler/utils/traversal.h
@@ -17,29 +17,85 @@ limitations under the License.
 #define TENSORFLOW_CORE_GRAPPLER_UTILS_TRAVERSAL_H_
 
 #include <functional>
-#include "tensorflow/core/grappler/graph_view.h"
-#include "tensorflow/core/grappler/mutable_graph_view.h"
+
+#include "tensorflow/core/grappler/graph_topology_view.h"
 
 namespace tensorflow {
 namespace grappler {
 
-// Traverse the graph in reverse dfs order, starting from the list of nodes
-// specified in the 'from' argument. The pre_order and post_order functors will
-// be called on each reachable node (including the 'from' nodes) in pre and post
-// order. If loops are found, the on_back_edge functor will be called on the
+enum class TraversalDirection { kFollowInputs, kFollowOutputs };
+
+// Encapsulate DFS callbacks that will be called during the graph traversal.
+//
+// If non-empty, the `pre_order` and `post_order` functors will be called on
+// each reachable node (including the `from` nodes) in pre and post order. If
+// loops are found, the `on_back_edge` functor will be called on the
 // corresponding back edges. Moreover, the pre and post order will assume that
 // these back edges will be cut.
-void ReverseDfs(
-    const GraphView& graph_view, const std::vector<const NodeDef*>& from,
-    const std::function<void(const NodeDef*)>& pre_order,
-    const std::function<void(const NodeDef*)>& post_order,
-    const std::function<void(const NodeDef*, const NodeDef*)>& on_back_edge);
-
-void ReverseDfs(
-    const MutableGraphView& graph_view, const std::vector<const NodeDef*>& from,
-    const std::function<void(const NodeDef*)>& pre_order,
-    const std::function<void(const NodeDef*)>& post_order,
-    const std::function<void(const NodeDef*, const NodeDef*)>& on_back_edge);
+struct DfsCallbacks {
+  DfsCallbacks() = default;
+  DfsCallbacks(std::function<void(const NodeDef*)> pre,
+               std::function<void(const NodeDef*)> post,
+               std::function<void(const NodeDef*, const NodeDef*)> back_edge)
+      : pre_order(std::move(pre)),
+        post_order(std::move(post)),
+        on_back_edge(std::move(back_edge)) {}
+
+  static DfsCallbacks PreOrder(std::function<void(const NodeDef*)> pre) {
+    return DfsCallbacks(std::move(pre), nullptr, nullptr);
+  }
+
+  static DfsCallbacks PostOrder(std::function<void(const NodeDef*)> post) {
+    return DfsCallbacks(nullptr, std::move(post), nullptr);
+  }
+
+  std::function<void(const NodeDef*)> pre_order;
+  std::function<void(const NodeDef*)> post_order;
+  std::function<void(const NodeDef*, const NodeDef*)> on_back_edge;
+};
+
+// Encapsulate DFS predicates for traversing the graph.
+//
+// The `enter` predicate decides if traversal should enter the node, and the
+// `advance` predicate decides if the traversal should follow inputs/outputs
+// from the node.
+//
+// If predicates are empty (default initialized), it's assumed that we can enter
+// into any node and advance from any node respectively.
+struct DfsPredicates {
+  DfsPredicates() = default;
+  DfsPredicates(std::function<bool(const NodeDef*)> enter,
+                std::function<bool(const NodeDef*)> advance)
+      : enter(std::move(enter)), advance(std::move(advance)) {}
+
+  static DfsPredicates Enter(std::function<bool(const NodeDef*)> enter) {
+    return DfsPredicates(std::move(enter), nullptr);
+  }
+
+  static DfsPredicates Advance(std::function<bool(const NodeDef*)> advance) {
+    return DfsPredicates(nullptr, std::move(advance));
+  }
+
+  std::function<bool(const NodeDef*)> enter;
+  std::function<bool(const NodeDef*)> advance;
+};
+
+// Traverse the graph in DFS order in the given direction, starting from the
+// list of nodes specified in the `from` argument. Use `predicates` to decide if
+// traversal should enter/advance to/from the graph node. These predicates also
+// applied to the `from` nodes. Call corresponding callbacks for each visited
+// node.
+void DfsTraversal(const GraphTopologyView& graph_view,
+                  absl::Span<const NodeDef* const> from,
+                  TraversalDirection direction, const DfsPredicates& predicates,
+                  const DfsCallbacks& callbacks);
+
+// Traverse the graph in DFS order in the given direction, starting from the
+// list of nodes specified in the `from` argument. Call corresponding callbacks
+// for each visited node.
+void DfsTraversal(const GraphTopologyView& graph_view,
+                  absl::Span<const NodeDef* const> from,
+                  TraversalDirection direction, const DfsCallbacks& callbacks);
 
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/traversal_test.cc b/tensorflow/core/grappler/utils/traversal_test.cc
index c040477a08970436cb07f6bb87c30e47b6b72525..7b36d328e938473333bd79044b7e953a2f25e17c 100644
--- a/tensorflow/core/grappler/utils/traversal_test.cc
+++ b/tensorflow/core/grappler/utils/traversal_test.cc
@@ -15,101 +15,222 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/utils/traversal.h"
 
-#include "tensorflow/core/lib/strings/strcat.h"
+#include "absl/strings/str_cat.h"
+#include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
 namespace grappler {
+
 namespace {
+using ::tensorflow::test::function::NDef;
+
+DfsCallbacks MkCallbacks(std::vector<string>* pre_order,
+                         std::vector<string>* post_order,
+                         std::vector<string>* back_edges) {
+  return {[pre_order](const NodeDef* n) { pre_order->push_back(n->name()); },
+          [post_order](const NodeDef* n) { post_order->push_back(n->name()); },
+          [back_edges](const NodeDef* src, const NodeDef* dst) {
+            back_edges->push_back(absl::StrCat(src->name(), "->", dst->name()));
+          }};
+}
+
+TEST(TraversalTest, OutputsDfsNoLoop) {
+  const string op = "OpIsNotImportantInThisTest";
+
+  GraphDef graph = ::tensorflow::test::function::GDef(  //
+      {NDef("2", op, {"5"}, {}),                        //
+       NDef("0", op, {"5", "4"}, {}),                   //
+       NDef("1", op, {"4", "3"}, {}),                   //
+       NDef("3", op, {"2"}, {}),                        //
+       NDef("5", op, {}, {}),                           //
+       NDef("4", op, {}, {})},                          //
+      /*funcs=*/{});
 
-class TraversalTest : public ::testing::Test {
- protected:
-  static NodeDef CreateNode(const string& name,
-                            const std::vector<string>& inputs) {
-    return CreateNode(name, "", inputs);
-  }
-  static NodeDef CreateNode(const string& name, const string& op,
-                            const std::vector<string>& inputs) {
-    NodeDef node;
-    node.set_name(name);
-    if (!op.empty()) {
-      node.set_op(op);
-    }
-    for (const string& input : inputs) {
-      node.add_input(input);
-    }
-    return node;
-  }
-};
-
-TEST_F(TraversalTest, ReverseDfsNoLoop) {
-  GraphDef graph;
-  *graph.add_node() = CreateNode("2", {"5"});
-  *graph.add_node() = CreateNode("0", {"5", "4"});
-  *graph.add_node() = CreateNode("1", {"4", "3"});
-  *graph.add_node() = CreateNode("3", {"2"});
-  *graph.add_node() = CreateNode("5", {});
-  *graph.add_node() = CreateNode("4", {});
+  std::vector<const NodeDef*> start_nodes = {&graph.node(4), &graph.node(5)};
+
+  std::vector<string> pre_order;
+  std::vector<string> post_order;
+  std::vector<string> back_edges;
+
+  GraphTopologyView graph_view;
+  TF_CHECK_OK(graph_view.InitializeFromGraph(graph));
+  DfsTraversal(graph_view, start_nodes, TraversalDirection::kFollowOutputs,
+               MkCallbacks(&pre_order, &post_order, &back_edges));
+
+  const std::vector<string> expected_pre = {"4", "1", "0", "5", "2", "3"};
+  const std::vector<string> expected_post = {"1", "0", "4", "3", "2", "5"};
+
+  EXPECT_EQ(pre_order, expected_pre);
+  EXPECT_EQ(post_order, expected_post);
+  EXPECT_TRUE(back_edges.empty());
+}
+
+TEST(TraversalTest, InputsDfsNoLoop) {
+  const string op = "OpIsNotImportantInThisTest";
+
+  GraphDef graph = ::tensorflow::test::function::GDef(  //
+      {NDef("2", op, {"5"}, {}),                        //
+       NDef("0", op, {"5", "4"}, {}),                   //
+       NDef("1", op, {"4", "3"}, {}),                   //
+       NDef("3", op, {"2"}, {}),                        //
+       NDef("5", op, {}, {}),                           //
+       NDef("4", op, {}, {})},                          //
+      /*funcs=*/{});
 
   std::vector<const NodeDef*> start_nodes = {&graph.node(1), &graph.node(2)};
+
   std::vector<string> pre_order;
   std::vector<string> post_order;
-  bool found_back_edge = false;
-  ReverseDfs(
-      GraphView(&graph), start_nodes,
-      [&pre_order](const NodeDef* n) { pre_order.push_back(n->name()); },
-      [&post_order](const NodeDef* n) { post_order.push_back(n->name()); },
-      [&found_back_edge](const NodeDef*, const NodeDef*) {
-        found_back_edge = true;
-      });
-
-  // Pre/Post order traversals are non deterministic because a node fanin is an
-  // absl::flat_hash_set with non deterministic traversal order.
-  using ValidTraversal = std::pair<std::vector<string>, std::vector<string>>;
-
-  std::set<ValidTraversal> valid_traversals = {
-      // pre_order                     post_order
-      {{"1", "4", "3", "2", "5", "0"}, {"4", "5", "2", "3", "1", "0"}},
-      {{"1", "3", "2", "5", "4", "0"}, {"5", "2", "3", "4", "1", "0"}}};
-
-  EXPECT_EQ(valid_traversals.count({pre_order, post_order}), 1);
-  EXPECT_FALSE(found_back_edge);
+  std::vector<string> back_edges;
+
+  GraphTopologyView graph_view;
+  TF_CHECK_OK(graph_view.InitializeFromGraph(graph));
+  DfsTraversal(graph_view, start_nodes, TraversalDirection::kFollowInputs,
+               MkCallbacks(&pre_order, &post_order, &back_edges));
+
+  const std::vector<string> expected_pre = {"1", "4", "3", "2", "5", "0"};
+  const std::vector<string> expected_post = {"4", "5", "2", "3", "1", "0"};
+
+  EXPECT_EQ(pre_order, expected_pre);
+  EXPECT_EQ(post_order, expected_post);
+  EXPECT_TRUE(back_edges.empty());
 }
 
-TEST_F(TraversalTest, ReverseDfsWithLoop) {
-  GraphDef graph;
-  // Create a loop
-  *graph.add_node() = CreateNode("2", "Merge", {"1", "5"});
-  *graph.add_node() = CreateNode("3", "Switch", {"2"});
-  *graph.add_node() = CreateNode("4", "Identity", {"3"});
-  *graph.add_node() = CreateNode("5", "NextIteration", {"4"});
-  *graph.add_node() = CreateNode("1", "Enter", {});
-  *graph.add_node() = CreateNode("6", "Exit", {"3"});
+TEST(TraversalTest, InputsDfsWithLoop) {
+  // Graph with a loop.
+  GraphDef graph = ::tensorflow::test::function::GDef(  //
+      {NDef("2", "Merge", {"1", "5"}, {}),              //
+       NDef("3", "Switch", {"2"}, {}),                  //
+       NDef("4", "Identity", {"3"}, {}),                //
+       NDef("5", "NextIteration", {"4"}, {}),           //
+       NDef("1", "Enter", {}, {}),                      //
+       NDef("6", "Exit", {"3"}, {})},                   //
+      /*funcs=*/{});
 
   std::vector<const NodeDef*> start_nodes = {&graph.node(5)};
+
   std::vector<string> pre_order;
   std::vector<string> post_order;
   std::vector<string> back_edges;
-  ReverseDfs(
-      GraphView(&graph), start_nodes,
-      [&pre_order](const NodeDef* n) { pre_order.push_back(n->name()); },
-      [&post_order](const NodeDef* n) { post_order.push_back(n->name()); },
-      [&back_edges](const NodeDef* src, const NodeDef* dst) {
-        back_edges.push_back(strings::StrCat(src->name(), "->", dst->name()));
-      });
-
-  // Pre/Post order traversals are non deterministic because a node fanin is an
-  // absl::flat_hash_set with non deterministic traversal order.
-  using ValidTraversal = std::pair<std::vector<string>, std::vector<string>>;
-
-  std::set<ValidTraversal> valid_traversals = {
-      // pre_order                     post_order
-      {{"6", "3", "2", "4", "5", "1"}, {"5", "4", "1", "2", "3", "6"}},
-      {{"6", "3", "2", "1", "5", "4"}, {"1", "4", "5", "2", "3", "6"}},
-      {{"6", "3", "2", "5", "4", "1"}, {"4", "5", "1", "2", "3", "6"}}};
-
-  EXPECT_EQ(valid_traversals.count({pre_order, post_order}), 1);
-  EXPECT_EQ(std::vector<string>({"4->3"}), back_edges);
+
+  GraphTopologyView graph_view;
+  TF_CHECK_OK(graph_view.InitializeFromGraph(graph));
+  DfsTraversal(graph_view, start_nodes, TraversalDirection::kFollowInputs,
+               MkCallbacks(&pre_order, &post_order, &back_edges));
+
+  const std::vector<string> expected_pre = {"6", "3", "2", "1", "5", "4"};
+  const std::vector<string> expected_post = {"1", "4", "5", "2", "3", "6"};
+  const std::vector<string> expected_edges = {"4->3"};
+
+  EXPECT_EQ(pre_order, expected_pre);
+  EXPECT_EQ(post_order, expected_post);
+  EXPECT_EQ(back_edges, expected_edges);
+}
+
+TEST(TraversalTest, OutputDfsWithLoop) {
+  // Graph with a loop.
+  GraphDef graph = ::tensorflow::test::function::GDef(  //
+      {NDef("2", "Merge", {"1", "5"}, {}),              //
+       NDef("3", "Switch", {"2"}, {}),                  //
+       NDef("4", "Identity", {"3"}, {}),                //
+       NDef("5", "NextIteration", {"4"}, {}),           //
+       NDef("1", "Enter", {}, {}),                      //
+       NDef("6", "Exit", {"3"}, {})},                   //
+      /*funcs=*/{});
+
+  std::vector<const NodeDef*> start_nodes = {&graph.node(0)};
+
+  std::vector<string> pre_order;
+  std::vector<string> post_order;
+  std::vector<string> back_edges;
+
+  GraphTopologyView graph_view;
+  TF_CHECK_OK(graph_view.InitializeFromGraph(graph));
+  DfsTraversal(graph_view, start_nodes, TraversalDirection::kFollowOutputs,
+               MkCallbacks(&pre_order, &post_order, &back_edges));
+
+  const std::vector<string> expected_pre = {"2", "3", "6", "4", "5"};
+  const std::vector<string> expected_post = {"6", "5", "4", "3", "2"};
+  const std::vector<string> expected_edges = {"5->2"};
+
+  EXPECT_EQ(pre_order, expected_pre);
+  EXPECT_EQ(post_order, expected_post);
+  EXPECT_EQ(back_edges, expected_edges);
+}
+
+TEST(TraversalTest, DfsWithEnterPredicate) {
+  const string op = "OpIsNotImportantInThisTest";
+
+  GraphDef graph = ::tensorflow::test::function::GDef(  //
+      {NDef("1", op, {}, {}),                           //       2 -> 3
+       NDef("2", op, {"1"}, {}),                        // 1 -> /      \ -> 6
+       NDef("3", op, {"2"}, {}),                        //      \      /
+       NDef("4", op, {"1"}, {}),                        //       4 -> 5
+       NDef("5", op, {"4"}, {}),                        //
+       NDef("6", op, {"3", "5"}, {})},                  //
+      /*funcs=*/{});
+
+  // Do not enter the nodes '2' and '3'.
+  const auto enter = [](const NodeDef* node) {
+    return node->name() != "2" && node->name() != "3";
+  };
+
+  std::vector<const NodeDef*> start_nodes = {&graph.node(0)};
+
+  std::vector<string> pre_order;
+  std::vector<string> post_order;
+  std::vector<string> back_edges;
+
+  GraphTopologyView graph_view;
+  TF_CHECK_OK(graph_view.InitializeFromGraph(graph));
+  DfsTraversal(graph_view, start_nodes, TraversalDirection::kFollowOutputs,
+               DfsPredicates::Enter(enter),
+               MkCallbacks(&pre_order, &post_order, &back_edges));
+
+  const std::vector<string> expected_pre = {"1", "4", "5", "6"};
+  const std::vector<string> expected_post = {"6", "5", "4", "1"};
+
+  EXPECT_EQ(pre_order, expected_pre);
+  EXPECT_EQ(post_order, expected_post);
+  EXPECT_TRUE(back_edges.empty());
+}
+
+TEST(TraversalTest, DfsWithAdvancePredicate) {
+  const string op = "OpIsNotImportantInThisTest";
+
+  GraphDef graph = ::tensorflow::test::function::GDef(  //
+      {NDef("1", op, {}, {}),                           //       2 -> 3
+       NDef("2", op, {"1"}, {}),                        // 1 -> /      \ -> 6
+       NDef("3", op, {"2"}, {}),                        //      \      /
+       NDef("4", op, {"1"}, {}),                        //       4 -> 5
+       NDef("5", op, {"4"}, {}),                        //
+       NDef("6", op, {"3", "5"}, {})},                  //
+      {} /* empty function library*/);
+
+  // Do not advance from the nodes '2' and '3'.
+  const auto advance = [](const NodeDef* node) {
+    return node->name() != "2" && node->name() != "3";
+  };
+
+  std::vector<const NodeDef*> start_nodes = {&graph.node(0)};
+
+  std::vector<string> pre_order;
+  std::vector<string> post_order;
+  std::vector<string> back_edges;
+
+  GraphTopologyView graph_view;
+  TF_CHECK_OK(graph_view.InitializeFromGraph(graph));
+  DfsTraversal(graph_view, start_nodes, TraversalDirection::kFollowOutputs,
+               DfsPredicates::Advance(advance),
+               MkCallbacks(&pre_order, &post_order, &back_edges));
+
+  const std::vector<string> expected_pre = {"1", "4", "5", "6", "2"};
+  const std::vector<string> expected_post = {"6", "5", "4", "2", "1"};
+
+  EXPECT_EQ(pre_order, expected_pre);
+  EXPECT_EQ(post_order, expected_post);
+  EXPECT_TRUE(back_edges.empty());
 }
 
 }  // namespace
diff --git a/tensorflow/core/grappler/utils_test.cc b/tensorflow/core/grappler/utils_test.cc
index e993391b51bfe882a1e662f220ace0542db4ffba..e30b1c5b730a2c67101b9b6364b414ea2f7003d8 100644
--- a/tensorflow/core/grappler/utils_test.cc
+++ b/tensorflow/core/grappler/utils_test.cc
@@ -18,6 +18,8 @@ limitations under the License.
 #include <unistd.h>
 #include <limits>
 #include <memory>
+
+#include "absl/strings/substitute.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
@@ -124,56 +126,56 @@ class UtilsTest : public ::testing::Test {
 };
 
 TEST_F(UtilsTest, NodeName) {
-  EXPECT_EQ("abc", NodeName("abc"));
-  EXPECT_EQ("abc", NodeName("^abc"));
-  EXPECT_EQ("abc", NodeName("abc:0"));
-  EXPECT_EQ("abc", NodeName("^abc:0"));
-
-  EXPECT_EQ("abc/def", NodeName("abc/def"));
-  EXPECT_EQ("abc/def", NodeName("^abc/def"));
-  EXPECT_EQ("abc/def", NodeName("abc/def:1"));
-  EXPECT_EQ("abc/def", NodeName("^abc/def:1"));
-
-  EXPECT_EQ("abc/def0", NodeName("abc/def0"));
-  EXPECT_EQ("abc/def0", NodeName("^abc/def0"));
-  EXPECT_EQ("abc/def0", NodeName("abc/def0:0"));
-  EXPECT_EQ("abc/def0", NodeName("^abc/def0:0"));
-
-  EXPECT_EQ("abc/def_0", NodeName("abc/def_0"));
-  EXPECT_EQ("abc/def_0", NodeName("^abc/def_0"));
-  EXPECT_EQ("abc/def_0", NodeName("abc/def_0:3"));
-  EXPECT_EQ("abc/def_0", NodeName("^abc/def_0:3"));
-
-  EXPECT_EQ("abc/def_0", NodeName("^abc/def_0:3214"));
+  EXPECT_EQ(NodeName("abc"), "abc");
+  EXPECT_EQ(NodeName("^abc"), "abc");
+  EXPECT_EQ(NodeName("abc:0"), "abc");
+  EXPECT_EQ(NodeName("^abc:0"), "abc");
+
+  EXPECT_EQ(NodeName("abc/def"), "abc/def");
+  EXPECT_EQ(NodeName("^abc/def"), "abc/def");
+  EXPECT_EQ(NodeName("abc/def:1"), "abc/def");
+  EXPECT_EQ(NodeName("^abc/def:1"), "abc/def");
+
+  EXPECT_EQ(NodeName("abc/def0"), "abc/def0");
+  EXPECT_EQ(NodeName("^abc/def0"), "abc/def0");
+  EXPECT_EQ(NodeName("abc/def0:0"), "abc/def0");
+  EXPECT_EQ(NodeName("^abc/def0:0"), "abc/def0");
+
+  EXPECT_EQ(NodeName("abc/def_0"), "abc/def_0");
+  EXPECT_EQ(NodeName("^abc/def_0"), "abc/def_0");
+  EXPECT_EQ(NodeName("abc/def_0:3"), "abc/def_0");
+  EXPECT_EQ(NodeName("^abc/def_0:3"), "abc/def_0");
+
+  EXPECT_EQ(NodeName("^abc/def_0:3214"), "abc/def_0");
 }
 
 TEST_F(UtilsTest, NodePosition) {
-  EXPECT_EQ(2, NodePosition("abc:2"));
-  EXPECT_EQ(123, NodePosition("abc:123"));
-  EXPECT_EQ(-1, NodePosition("^abc:123"));
-  EXPECT_EQ(-1, NodePosition("^abc"));
-  EXPECT_EQ(0, NodePosition(""));
+  EXPECT_EQ(NodePosition("abc:2"), 2);
+  EXPECT_EQ(NodePosition("abc:123"), 123);
+  EXPECT_EQ(NodePosition("^abc:123"), -1);
+  EXPECT_EQ(NodePosition("^abc"), -1);
+  EXPECT_EQ(NodePosition(""), 0);
 }
 
 TEST_F(UtilsTest, NodePositionIfSameNode) {
-  EXPECT_EQ(-2, NodePositionIfSameNode(":123", ""));
-  EXPECT_EQ(-2, NodePositionIfSameNode(":", ""));
-  EXPECT_EQ(-2, NodePositionIfSameNode("", ""));
-  EXPECT_EQ(123, NodePositionIfSameNode("abc:123", "abc"));
-  EXPECT_EQ(-1, NodePositionIfSameNode("^abc", "abc"));
-  EXPECT_EQ(-1, NodePositionIfSameNode("^abc:123", "abc"));
-  EXPECT_EQ(-2, NodePositionIfSameNode("abc", "xyz"));
-  EXPECT_EQ(-2, NodePositionIfSameNode("abc", "abc/xyz"));
-  EXPECT_EQ(-2, NodePositionIfSameNode("abc/xyz", "abc"));
-  EXPECT_EQ(-2, NodePositionIfSameNode("abc:123", "xyz"));
-  EXPECT_EQ(-2, NodePositionIfSameNode("^abc", "xyz"));
-  EXPECT_EQ(-2, NodePositionIfSameNode("^abc:123", "xyz"));
+  EXPECT_EQ(NodePositionIfSameNode(":123", ""), -2);
+  EXPECT_EQ(NodePositionIfSameNode(":", ""), -2);
+  EXPECT_EQ(NodePositionIfSameNode("", ""), -2);
+  EXPECT_EQ(NodePositionIfSameNode("abc:123", "abc"), 123);
+  EXPECT_EQ(NodePositionIfSameNode("^abc", "abc"), -1);
+  EXPECT_EQ(NodePositionIfSameNode("^abc:123", "abc"), -1);
+  EXPECT_EQ(NodePositionIfSameNode("abc", "xyz"), -2);
+  EXPECT_EQ(NodePositionIfSameNode("abc", "abc/xyz"), -2);
+  EXPECT_EQ(NodePositionIfSameNode("abc/xyz", "abc"), -2);
+  EXPECT_EQ(NodePositionIfSameNode("abc:123", "xyz"), -2);
+  EXPECT_EQ(NodePositionIfSameNode("^abc", "xyz"), -2);
+  EXPECT_EQ(NodePositionIfSameNode("^abc:123", "xyz"), -2);
 }
 
 TEST_F(UtilsTest, AddNodeNamePrefix) {
-  EXPECT_EQ("OPTIMIZED/abc", AddPrefixToNodeName("abc", "OPTIMIZED"));
-  EXPECT_EQ("^OPTIMIZED/abc", AddPrefixToNodeName("^abc", "OPTIMIZED"));
-  EXPECT_EQ("OPTIMIZED/", AddPrefixToNodeName("", "OPTIMIZED"));
+  EXPECT_EQ(AddPrefixToNodeName("abc", "OPTIMIZED"), "OPTIMIZED/abc");
+  EXPECT_EQ(AddPrefixToNodeName("^abc", "OPTIMIZED"), "^OPTIMIZED/abc");
+  EXPECT_EQ(AddPrefixToNodeName("", "OPTIMIZED"), "OPTIMIZED/");
 }
 
 TEST_F(UtilsTest, ExecuteWithTimeout) {
@@ -204,17 +206,17 @@ TEST_F(UtilsTest, ExecuteWithTimeout) {
 
 TEST_F(UtilsTest, NumOutputs) {
   GraphDef graph;
-  EXPECT_EQ(2, NumOutputs(CreateConcatOffsetNode(), &graph));
-  EXPECT_EQ(5, NumOutputs(CreateFusedBatchNormNode(), &graph));
-  EXPECT_EQ(1, NumOutputs(CreateDequeueNode(), &graph));
+  EXPECT_EQ(NumOutputs(CreateConcatOffsetNode(), &graph), 2);
+  EXPECT_EQ(NumOutputs(CreateFusedBatchNormNode(), &graph), 5);
+  EXPECT_EQ(NumOutputs(CreateDequeueNode(), &graph), 1);
 }
 
 TEST_F(UtilsTest, AsControlDependency) {
   NodeDef node;
   node.set_name("foo");
-  EXPECT_EQ("^foo", AsControlDependency(node));
-  EXPECT_EQ("^foo", AsControlDependency(node.name()));
-  EXPECT_EQ("^foo", AsControlDependency("^foo"));
+  EXPECT_EQ(AsControlDependency(node), "^foo");
+  EXPECT_EQ(AsControlDependency(node.name()), "^foo");
+  EXPECT_EQ(AsControlDependency("^foo"), "^foo");
 }
 
 TEST_F(UtilsTest, GetTailOfChain) {
@@ -233,22 +235,23 @@ TEST_F(UtilsTest, GetTailOfChain) {
   GraphDef graph;
   TF_CHECK_OK(s.ToGraphDef(&graph));
 
-  ASSERT_EQ("c0", graph.node(0).name());
-  ASSERT_EQ("c1", graph.node(1).name());
-  ASSERT_EQ("neg0", graph.node(2).name());
-  ASSERT_EQ("neg1", graph.node(3).name());
-  ASSERT_EQ("neg2", graph.node(4).name());
-  ASSERT_EQ("id1", graph.node(5).name());
-  ASSERT_EQ("id2", graph.node(6).name());
-  ASSERT_EQ("noop", graph.node(7).name());
+  ASSERT_EQ(graph.node_size(), 8);
+  ASSERT_EQ(graph.node(0).name(), "c0");
+  ASSERT_EQ(graph.node(1).name(), "c1");
+  ASSERT_EQ(graph.node(2).name(), "neg0");
+  ASSERT_EQ(graph.node(3).name(), "neg1");
+  ASSERT_EQ(graph.node(4).name(), "neg2");
+  ASSERT_EQ(graph.node(5).name(), "id1");
+  ASSERT_EQ(graph.node(6).name(), "id2");
+  ASSERT_EQ(graph.node(7).name(), "noop");
 
   NodeMap node_map(&graph);
   auto is_neg = [&](const NodeDef& node) { return node.op() == "Neg"; };
   // We walk backwards, starting as "id1", so tail should be "neg1".
   NodeDef* tail = GetTailOfChain(graph.node(5), node_map,
                                  /*follow_control_input=*/false, is_neg);
-  EXPECT_NE(tail, nullptr);
-  EXPECT_EQ("neg1", tail->name());
+  ASSERT_NE(tail, nullptr);
+  EXPECT_EQ(tail->name(), "neg1");
 
   // We stop at branching nodes, so tail should be "neg2".
   auto is_neg_and_non_branching = [&](const NodeDef& node) {
@@ -257,22 +260,22 @@ TEST_F(UtilsTest, GetTailOfChain) {
   tail =
       GetTailOfChain(graph.node(5), node_map,
                      /*follow_control_input=*/false, is_neg_and_non_branching);
-  EXPECT_NE(tail, nullptr);
-  EXPECT_EQ("neg2", tail->name());
+  ASSERT_NE(tail, nullptr);
+  EXPECT_EQ(tail->name(), "neg2");
 
   // We walk backwards, starting from "noop", also following control inputs,
   // so tail should be "neg0".
   tail = GetTailOfChain(graph.node(7), node_map,
                         /*follow_control_input=*/true, is_neg);
-  EXPECT_NE(tail, nullptr);
-  EXPECT_EQ("neg0", tail->name());
+  ASSERT_NE(tail, nullptr);
+  EXPECT_EQ(tail->name(), "neg0");
 
   // We walk backwards, starting from "noop", not following control inputs,
   // so tail should be "noop" itself.
   tail = GetTailOfChain(graph.node(7), node_map,
                         /*follow_control_input=*/false, is_neg);
-  EXPECT_NE(tail, nullptr);
-  EXPECT_EQ("noop", tail->name());
+  ASSERT_NE(tail, nullptr);
+  EXPECT_EQ(tail->name(), "noop");
 }
 
 TEST_F(UtilsTest, DedupControlInputs) {
@@ -280,40 +283,40 @@ TEST_F(UtilsTest, DedupControlInputs) {
   foo.set_name("foo");
   foo.add_input("bar");
   DedupControlInputs(&foo);
-  EXPECT_EQ(1, foo.input_size());
-  EXPECT_EQ("bar", foo.input(0));
+  ASSERT_EQ(foo.input_size(), 1);
+  EXPECT_EQ(foo.input(0), "bar");
 
   foo.set_input(0, "^bar");
   DedupControlInputs(&foo);
-  EXPECT_EQ(1, foo.input_size());
-  EXPECT_EQ("^bar", foo.input(0));
+  ASSERT_EQ(foo.input_size(), 1);
+  EXPECT_EQ(foo.input(0), "^bar");
 
   foo.set_input(0, "bar");
   foo.add_input("bar");
   DedupControlInputs(&foo);
-  EXPECT_EQ(2, foo.input_size());
-  EXPECT_EQ("bar", foo.input(0));
-  EXPECT_EQ("bar", foo.input(1));
+  ASSERT_EQ(foo.input_size(), 2);
+  EXPECT_EQ(foo.input(0), "bar");
+  EXPECT_EQ(foo.input(1), "bar");
 
   foo.set_input(1, "^bar");
   DedupControlInputs(&foo);
-  EXPECT_EQ(1, foo.input_size());
-  EXPECT_EQ("bar", foo.input(0));
+  ASSERT_EQ(foo.input_size(), 1);
+  EXPECT_EQ(foo.input(0), "bar");
 
   foo.set_input(0, "^bar");
   foo.add_input("^bar");
   DedupControlInputs(&foo);
-  EXPECT_EQ(1, foo.input_size());
-  EXPECT_EQ("^bar", foo.input(0));
+  ASSERT_EQ(foo.input_size(), 1);
+  EXPECT_EQ(foo.input(0), "^bar");
 
   foo.set_input(0, "bar");
   foo.add_input("gnu");
   foo.add_input("^bar");
   foo.add_input("^gnu");
   DedupControlInputs(&foo);
-  EXPECT_EQ(2, foo.input_size());
-  EXPECT_EQ("bar", foo.input(0));
-  EXPECT_EQ("gnu", foo.input(1));
+  ASSERT_EQ(foo.input_size(), 2);
+  EXPECT_EQ(foo.input(0), "bar");
+  EXPECT_EQ(foo.input(1), "gnu");
 }
 
 TEST_F(UtilsTest, NumNonControlOutputs) {
@@ -347,14 +350,14 @@ TEST_F(UtilsTest, NumNonControlOutputs) {
   NodeMap node_map(&graph);
 
   const NodeDef* add_node = node_map.GetNode("add");
-  ASSERT_TRUE(add_node != nullptr);
+  ASSERT_NE(add_node, nullptr);
 
   // [a, b] are only non-control inputs
-  EXPECT_EQ(2, NumNonControlInputs(*add_node));
+  EXPECT_EQ(NumNonControlInputs(*add_node), 2);
   // [sqrt, shape] are non control outputs
-  EXPECT_EQ(2, NumNonControlOutputs(*add_node, node_map));
+  EXPECT_EQ(NumNonControlOutputs(*add_node, node_map), 2);
   // sqrt is the only data output
-  EXPECT_EQ(1, NumNonControlDataOutputs(*add_node, node_map));
+  EXPECT_EQ(NumNonControlDataOutputs(*add_node, node_map), 1);
 }
 
 TEST(CheckAttrExists, All) {
@@ -464,6 +467,107 @@ TEST_F(UtilsTest, SetTensorValueBFloat16IntMin) {
       Tensor(bfloat16(std::numeric_limits<int>::min())), t);
 }
 
+TEST_F(UtilsTest, TensorIdToString) {
+  EXPECT_EQ(TensorIdToString({"foo", -1}), "^foo");
+  EXPECT_EQ(TensorIdToString({"foo", 0}), "foo");
+  EXPECT_EQ(TensorIdToString({"foo", 1}), "foo:1");
+  EXPECT_EQ(TensorIdToString({"foo", 2}), "foo:2");
+}
+
+template <typename T>
+void TestSetTensorValue(DataType type, int val, bool success,
+                        absl::string_view error_msg) {
+  Tensor t(type, TensorShape({}));
+  Status s = SetTensorValue(t.dtype(), val, &t);
+  EXPECT_EQ(s.ok(), success);
+  if (s.ok()) {
+    test::ExpectTensorEqual<T>(Tensor(static_cast<T>(val)), t);
+  } else {
+    EXPECT_EQ(s.error_message(), error_msg);
+  }
+}
+
+TEST(SetTensorValueTest, Quantized) {
+  auto int_min_error = [](DataType type) {
+    return absl::Substitute(
+        "Cannot store value -2147483648 in tensor of type $0",
+        DataType_Name(type));
+  };
+  auto int_max_error = [](DataType type) {
+    return absl::Substitute(
+        "Cannot store value 2147483647 in tensor of type $0",
+        DataType_Name(type));
+  };
+  const int kMinInt = std::numeric_limits<int>::min();
+  const int kMaxInt = std::numeric_limits<int>::max();
+
+  TestSetTensorValue<qint8>(DT_QINT8, -8, /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint8>(DT_QINT8, 0, /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint8>(DT_QINT8, 8, /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint8>(DT_QINT8, std::numeric_limits<qint8>::min(),
+                            /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint8>(DT_QINT8, std::numeric_limits<qint8>::max(),
+                            /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint8>(DT_QINT8, kMinInt, /*success=*/false,
+                            int_min_error(DT_QINT8));
+  TestSetTensorValue<qint8>(DT_QINT8, kMaxInt, /*success=*/false,
+                            int_max_error(DT_QINT8));
+
+  TestSetTensorValue<quint8>(
+      DT_QUINT8, -8, /*success=*/false,
+      /*error_msg=*/"Cannot store value -8 in tensor of type DT_QUINT8");
+  TestSetTensorValue<quint8>(DT_QUINT8, 0, /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<quint8>(DT_QUINT8, 8, /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<quint8>(DT_QUINT8, std::numeric_limits<quint8>::min(),
+                             /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<quint8>(DT_QUINT8, std::numeric_limits<quint8>::max(),
+                             /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<quint8>(DT_QUINT8, kMinInt, /*success=*/false,
+                             int_min_error(DT_QUINT8));
+  TestSetTensorValue<quint8>(DT_QUINT8, kMaxInt, /*success=*/false,
+                             int_max_error(DT_QUINT8));
+
+  TestSetTensorValue<qint16>(DT_QINT16, -8, /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint16>(DT_QINT16, 0, /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint16>(DT_QINT16, 8, /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint16>(DT_QINT16, std::numeric_limits<qint16>::min(),
+                             /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint16>(DT_QINT16, std::numeric_limits<qint16>::max(),
+                             /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint16>(DT_QINT16, kMinInt, /*success=*/false,
+                             int_min_error(DT_QINT16));
+  TestSetTensorValue<qint16>(DT_QINT16, kMaxInt, /*success=*/false,
+                             int_max_error(DT_QINT16));
+
+  TestSetTensorValue<quint16>(
+      DT_QUINT16, -8, /*success=*/false,
+      /*error_msg=*/"Cannot store value -8 in tensor of type DT_QUINT16");
+  TestSetTensorValue<quint16>(DT_QUINT16, 0, /*success=*/true,
+                              /*error_msg=*/"");
+  TestSetTensorValue<quint16>(DT_QUINT16, 8, /*success=*/true,
+                              /*error_msg=*/"");
+  TestSetTensorValue<quint16>(DT_QUINT16, std::numeric_limits<quint16>::min(),
+                              /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<quint16>(DT_QUINT16, std::numeric_limits<quint16>::max(),
+                              /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<quint16>(DT_QUINT16, kMinInt, /*success=*/false,
+                              int_min_error(DT_QUINT16));
+  TestSetTensorValue<quint16>(DT_QUINT16, kMaxInt, /*success=*/false,
+                              int_max_error(DT_QUINT16));
+
+  TestSetTensorValue<qint32>(DT_QINT32, -8, /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint32>(DT_QINT32, 0, /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint32>(DT_QINT32, 8, /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint32>(DT_QINT32, std::numeric_limits<qint32>::min(),
+                             /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint32>(DT_QINT32, std::numeric_limits<qint32>::max(),
+                             /*success=*/true, /*error_msg=*/"");
+  TestSetTensorValue<qint32>(DT_QINT32, kMinInt, /*success=*/true,
+                             /*error_msg=*/"");
+  TestSetTensorValue<qint32>(DT_QINT32, kMaxInt, /*success=*/true,
+                             /*error_msg=*/"");
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/verifiers/BUILD b/tensorflow/core/grappler/verifiers/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..e3e1538b00c5ca446deea5859771286f45736c6d
--- /dev/null
+++ b/tensorflow/core/grappler/verifiers/BUILD
@@ -0,0 +1,50 @@
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+
+cc_library(
+    name = "graph_verifier",
+    hdrs = [
+        "graph_verifier.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:graph",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+cc_library(
+    name = "structure_verifier",
+    srcs = ["structure_verifier.cc"],
+    hdrs = [
+        "structure_verifier.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":graph_verifier",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler/utils:topological_sort",
+    ],
+)
+
+tf_cc_test(
+    name = "structure_verifier_test",
+    srcs = ["structure_verifier_test.cc"],
+    deps = [
+        ":structure_verifier",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
+        "@com_google_absl//absl/strings",
+    ],
+)
diff --git a/tensorflow/core/grappler/verifiers/graph_verifier.h b/tensorflow/core/grappler/verifiers/graph_verifier.h
new file mode 100644
index 0000000000000000000000000000000000000000..10fd201eadcfd33709c0e7d2540528ad895b3358
--- /dev/null
+++ b/tensorflow/core/grappler/verifiers/graph_verifier.h
@@ -0,0 +1,55 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_VERIFIERS_GRAPH_VERIFIER_H_
+#define TENSORFLOW_CORE_GRAPPLER_VERIFIERS_GRAPH_VERIFIER_H_
+
+#include <string>
+#include <vector>
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// An abstract interface for verifying a graph.
+// This will be used to implement specific verifiers to verify that a grappler
+// transformed graph is valid.
+// Some examples of specific verifiers are:
+// 1. A general structural verifier that verifies that the specified graph has
+//    a valid structure that meets the specification of what it means to be
+//      a valid TensorFlow graph.
+// 2. A backend specific verifier that verifies that the specified graph,
+//     generated after a grappler transformation to convert the input TensorFlow
+//     graph to a corresponding backend graph, is a valid graph in the
+//     specification of the backend.
+class GraphVerifier {
+ public:
+  GraphVerifier() {}
+  virtual ~GraphVerifier() {}
+
+  // A name for the verifier.
+  virtual string name() const = 0;
+
+  // Implement an algorithm to verify the specified graph.
+  // The return value is a Status that represents a concatenation of Status of
+  // each verification step.
+  virtual Status Verify(const GraphDef& graph) = 0;
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_VERIFIERS_GRAPH_VERIFIER_H_
diff --git a/tensorflow/core/grappler/verifiers/structure_verifier.cc b/tensorflow/core/grappler/verifiers/structure_verifier.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2b438b56c4d2063aca9c4fcaf707c617067b71ed
--- /dev/null
+++ b/tensorflow/core/grappler/verifiers/structure_verifier.cc
@@ -0,0 +1,49 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/verifiers/structure_verifier.h"
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/graph/validate.h"
+#include "tensorflow/core/grappler/utils/topological_sort.h"
+#include "tensorflow/core/grappler/verifiers/graph_verifier.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// TODO(ashwinm): Expand this to add more structural checks.
+Status StructureVerifier::Verify(const GraphDef& graph) {
+  StatusGroup status_group;
+
+  FunctionLibraryDefinition function_library(OpRegistry::Global(),
+                                             graph.library());
+  status_group.Update(tensorflow::graph::ValidateGraphDefAgainstOpRegistry(
+      graph, function_library));
+  status_group.Update(tensorflow::graph::VerifyNoDuplicateNodeNames(graph));
+
+  std::vector<const NodeDef*> topo_order;
+  status_group.Update(ComputeTopologicalOrder(graph, &topo_order));
+  return status_group.as_status();
+}
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/verifiers/structure_verifier.h b/tensorflow/core/grappler/verifiers/structure_verifier.h
new file mode 100644
index 0000000000000000000000000000000000000000..ab719f1214eebb624d50a814ce437ffe3957304d
--- /dev/null
+++ b/tensorflow/core/grappler/verifiers/structure_verifier.h
@@ -0,0 +1,43 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_VERIFIERS_STRUCTURE_VERIFIER_H_
+#define TENSORFLOW_CORE_GRAPPLER_VERIFIERS_STRUCTURE_VERIFIER_H_
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/grappler/verifiers/graph_verifier.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// Verifies the structure of a graph to ensure it is valid.
+class StructureVerifier : public GraphVerifier {
+ public:
+  StructureVerifier() {}
+  ~StructureVerifier() override {}
+
+  string name() const override { return "structure_verifier"; };
+
+  Status Verify(const GraphDef& graph) override;
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_VERIFIERS_STRUCTURE_VERIFIER_H_
diff --git a/tensorflow/core/grappler/verifiers/structure_verifier_test.cc b/tensorflow/core/grappler/verifiers/structure_verifier_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d3b1d0646d9b336cd8a70d5b44bf33eed9f8432c
--- /dev/null
+++ b/tensorflow/core/grappler/verifiers/structure_verifier_test.cc
@@ -0,0 +1,116 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+
+#include "absl/strings/match.h"
+#include "tensorflow/cc/ops/parsing_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/grappler/verifiers/structure_verifier.h"
+#include "tensorflow/core/lib/core/error_codes.pb.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+class StructureVerifierTest : public ::testing::Test {
+ protected:
+  StructureVerifierTest() { verifier_.reset(new StructureVerifier()); }
+  void SetGraph(const string& gdef_ascii) {
+    CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii, &graph_));
+  }
+  GraphDef graph_;
+  std::unique_ptr<StructureVerifier> verifier_;
+};
+
+Status Scalars(shape_inference::InferenceContext* c) {
+  for (int i = 0; i < c->num_outputs(); ++i) {
+    c->set_output(i, c->Scalar());
+  }
+  return Status::OK();
+}
+
+REGISTER_OP("TestParams").Output("o: float").SetShapeFn(Scalars);
+REGISTER_OP("TestInput")
+    .Output("a: float")
+    .Output("b: float")
+    .SetShapeFn(Scalars);
+REGISTER_OP("TestMul")
+    .Input("a: float")
+    .Input("b: float")
+    .Output("o: float")
+    .SetShapeFn(Scalars);
+
+TEST_F(StructureVerifierTest, ValidGraphs) {
+  // With scope, ops gets registered automatically.
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output a = ops::Const(s.WithOpName("a"), 0.0f, {10, 10});
+  ops::ShapeN b(s.WithOpName("b"), {a, a, a});
+
+  GraphDef graph;
+  TF_CHECK_OK(s.ToGraphDef(&graph));
+  TF_EXPECT_OK(verifier_->Verify(graph));
+
+  // With graphdef directly, relies on REGISTER_OP to register ops
+  SetGraph(
+      "node { name: 'W1' op: 'TestParams' }"
+      "node { name: 'input' op: 'TestInput' }"
+      "node { name: 't1' op: 'TestMul' input: [ 'W1', 'input:1' ] }");
+
+  TF_EXPECT_OK(verifier_->Verify(graph_));
+}
+
+TEST_F(StructureVerifierTest, OpNotRegistered) {
+  SetGraph(
+      "node { name: 'input' op: 'OpNotRegistered' }"
+      "node { name: 't1' op: 'TestMul' input: [ 'input:0', 't2' ] }"
+      "node { name: 't2' op: 'TestMul' input: [ 'input:1', 't1' ] }");
+  Status status = verifier_->Verify(graph_);
+  EXPECT_EQ(status.code(), errors::Code::NOT_FOUND);
+  EXPECT_TRUE(
+      absl::StrContains(status.error_message(), "Op type not registered"));
+}
+
+TEST_F(StructureVerifierTest, DuplicateNodeNames) {
+  SetGraph(
+      "node { name: 'A' op: 'TestParams' }"
+      "node { name: 'A' op: 'TestInput' }");
+  Status status = verifier_->Verify(graph_);
+  EXPECT_EQ(status.code(), errors::Code::ALREADY_EXISTS);
+  EXPECT_TRUE(
+      absl::StrContains(status.error_message(), "Node already exists:"));
+}
+
+TEST_F(StructureVerifierTest, GraphWithInvalidCycle) {
+  SetGraph(
+      "node { name: 'input' op: 'TestInput' }"
+      "node { name: 't1' op: 'TestMul' input: [ 'input:0', 't2' ] }"
+      "node { name: 't2' op: 'TestMul' input: [ 'input:1', 't1' ] }");
+  Status status = verifier_->Verify(graph_);
+  EXPECT_EQ(status.code(), errors::Code::INVALID_ARGUMENT);
+  EXPECT_TRUE(
+      absl::StrContains(status.error_message(),
+                        "The graph couldn't be sorted in topological order"));
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 0e5d8d765a6bfde3a0e187c0b386174d3b20a098..d176d3eca166faaf578e40246e1f98973ba82c30 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -33,6 +33,7 @@ load(
     "if_android",
     "if_not_windows",
     "tf_cc_binary",
+    "tf_cc_shared_object",
     "tf_cc_test",
     "tf_cc_test_mkl",
     "tf_cc_tests",
@@ -48,7 +49,6 @@ load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_tests")
 load(
     "//tensorflow/core:platform/default/build_config.bzl",
     "tf_kernel_tests_linkstatic",
-    "tf_proto_library",
 )
 load(
     "//tensorflow/core:platform/default/build_config_root.bzl",
@@ -61,6 +61,7 @@ load(
     "mkl_deps",
 )
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
+load("//tensorflow:tensorflow.bzl", "if_nccl")
 
 config_setting(
     # Add "--define tensorflow_xsmm=1" to your build command to use libxsmm for
@@ -94,13 +95,14 @@ config_setting(
 )
 
 config_setting(
-    # Add "--define tensorflow_mkldnn_contraction_kernel=1" to your build command to use mkldnn
+    # Add "--define tensorflow_mkldnn_contraction_kernel=0" to your build command to disable mkldnn
     # sgemm in Eigen tensor contractions (matrix multiplications and convolutions). The mkldnn
     # kernels are generated at runtime and use avx/avx2/fma/avx512 based on cpu status registers
-    # (https://en.wikipedia.org/wiki/CPUID).
-    name = "mkldnn_contraction_kernel",
+    # (https://en.wikipedia.org/wiki/CPUID). Default Eigen contraction kernel is
+    # Eigen::internal::gebp_kernel (general block-panel kernel).
+    name = "no_mkldnn_contraction_kernel",
     values = {
-        "define": "tensorflow_mkldnn_contraction_kernel=1",
+        "define": "tensorflow_mkldnn_contraction_kernel=0",
     },
 )
 
@@ -137,7 +139,11 @@ tf_kernel_library(
         "slice_op.h",
         "strided_slice_op.h",
         "strided_slice_op_impl.h",
-        "strided_slice_op_gpu.cu.cc",
+        "strided_slice_op_gpu_impl.h",
+        "strided_slice_op_gpu_int.cu.cc",
+        "strided_slice_op_gpu_complex.cu.cc",
+        "strided_slice_op_gpu_bool.cu.cc",
+        "strided_slice_op_gpu_number_types.cu.cc",
     ],
     deps = [
         ":bounds_check",
@@ -151,14 +157,63 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "clustering_ops",
+    prefix = "clustering_ops",
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "clustering_ops_test",
+    srcs = ["clustering_ops_test.cc"],
+    deps = [
+        ":clustering_ops",
+        "//tensorflow/core:clustering_ops_op_lib",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_kernel_library(
     name = "collective_ops",
+    srcs = if_nccl([
+        "collective_nccl_reducer.h",
+        "collective_nccl_reducer.cc",
+    ]),
     prefix = "collective_ops",
     deps = [
-        "//tensorflow/core:collective_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+    ] + if_nccl([
+        "@local_config_nccl//:nccl",
+        "//tensorflow/core/nccl:nccl_lib",
+    ]),
+)
+
+tf_cuda_cc_test(
+    name = "collective_nccl_reducer_test",
+    size = "small",
+    srcs = ["collective_nccl_reducer_test.cc"],
+    tags = tf_cuda_tests_tags() + ["no_cuda_on_cpu_tap"],
+    deps = [
+        "//tensorflow/core:all_kernels",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
     ],
 )
 
@@ -220,7 +275,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:nn_ops_op_lib",
         "//third_party/eigen3",
     ],
     alwayslink = 1,
@@ -311,10 +365,62 @@ tf_kernel_library(
         "//tensorflow/core/nccl:nccl_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:gpu_headers_lib",
-        "//tensorflow/core:nccl_ops_op_lib",
     ]),
 )
 
+cc_library(
+    name = "sparse_utils",
+    srcs = [
+        "sparse_utils.cc",
+    ],
+    hdrs = ["sparse_utils.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_lite",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_cc_test(
+    name = "sparse_utils_test",
+    srcs = ["sparse_utils_test.cc"],
+    deps = [
+        ":sparse_utils",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_lite",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "@com_google_absl//absl/base:core_headers",
+    ],
+)
+
+cc_library(
+    name = "tensor_flag_utils",
+    srcs = [
+        "tensor_flag_utils.cc",
+    ],
+    hdrs = ["tensor_flag_utils.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_lite",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+tf_cc_test(
+    name = "tensor_flag_utils_test",
+    srcs = ["tensor_flag_utils_test.cc"],
+    deps = [
+        ":tensor_flag_utils",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_lite",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "@com_google_absl//absl/base:core_headers",
+    ],
+)
+
 tf_cuda_library(
     name = "ops_testutil",
     testonly = 1,
@@ -457,12 +563,11 @@ cc_library(
     name = "batch_kernels",
     srcs = ["batch_kernels.cc"],
     deps = [
-        "//tensorflow/core:batch_ops_op_lib",
+        ":concat_lib_hdrs",
+        ":ops_util_hdrs",
+        ":split_lib_hdrs",
         "//tensorflow/core:framework_headers_lib",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core/kernels:concat_lib_hdrs",
-        "//tensorflow/core/kernels:ops_util_hdrs",
-        "//tensorflow/core/kernels:split_lib_hdrs",
         "//tensorflow/core/kernels/batching_util:periodic_function_dynamic",
         "//tensorflow/core/kernels/batching_util:shared_batch_scheduler_hdrs",
     ],
@@ -544,13 +649,10 @@ cc_library(
     ],
 )
 
-cc_library(
+alias(
     name = "bounds_check",
-    hdrs = ["bounds_check.h"],
+    actual = "//tensorflow/core:framework_bounds_check",
     visibility = [":friends"],
-    deps = [
-        "//tensorflow/core:framework_bounds_check",
-    ],
 )
 
 # Private support libraries ---------------------------------------------------
@@ -578,12 +680,13 @@ cc_library(
 # tensor contractions (small matrix multiplication kernel used to multiple together
 # blocks of the original tensors).
 #
-# 0) Default contraction kernel is Eigen::internal::gebp_kernel.
-#
-# 1) --define tensorflow_mkldnn_contraction_kernel=1
+# 1) Default:
 #    Use Mkldnn single threaded sgemm. The mkldnn kernels are generated at runtime and
 #    use avx/avx2/fma/avx512 based on cpu status registers (https://en.wikipedia.org/wiki/CPUID).
 #
+# 2) Eigen: --define tensorflow_mkldnn_contraction_kernel=0 (disable mkldnn)
+#    Use Eigen contraction kernel: Eigen::internal::gebp_kernel.
+#
 # If you use `tensor.contract(other_tensor)` in your code, you must include additional header
 # to get the benefit of custom contraction kernel:
 #
@@ -592,19 +695,28 @@ cc_library(
 #   #endif
 cc_library(
     name = "eigen_contraction_kernel",
+    srcs = ["eigen_contraction_kernel.cc"],
     hdrs = ["eigen_contraction_kernel.h"],
     defines = select({
-        ":mkldnn_contraction_kernel": [
+        "//tensorflow:android": [],
+        "//tensorflow:arm": [],
+        "//tensorflow:ios": [],
+        "//tensorflow:linux_ppc64le": [],
+        ":no_mkldnn_contraction_kernel": [],
+        "//conditions:default": [
             "TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL",
             "TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL",
         ],
-        "//conditions:default": [],
     }),
     deps = [
         "//third_party/eigen3",
     ] + select({
-        ":mkldnn_contraction_kernel": ["@mkl_dnn//:mkldnn_single_threaded"],
-        "//conditions:default": [],
+        "//tensorflow:android": [],
+        "//tensorflow:arm": [],
+        "//tensorflow:ios": [],
+        "//tensorflow:linux_ppc64le": [],
+        ":no_mkldnn_contraction_kernel": [],
+        "//conditions:default": ["@mkl_dnn//:mkldnn_single_threaded"],
     }),
 )
 
@@ -644,6 +756,26 @@ cc_header_only_library(
     deps = [":image_resizer_state"],
 )
 
+cc_library(
+    name = "sampling_kernels",
+    srcs = ["sampling_kernels.cc"],
+    hdrs = ["sampling_kernels.h"],
+    visibility = ["//visibility:private"],
+    deps = ["//tensorflow/core:lib"],
+)
+
+tf_cc_test(
+    name = "sampling_kernels_test",
+    srcs = ["sampling_kernels_test.cc"],
+    deps = [
+        ":sampling_kernels",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
 # OpKernel libraries ----------------------------------------------------------
 
 ARRAY_DEPS = [
@@ -654,7 +786,6 @@ ARRAY_DEPS = [
     ":ops_util",
     ":transpose_functor",
     "//tensorflow/core:array_grad",
-    "//tensorflow/core:array_ops_op_lib",
     "//tensorflow/core:core_cpu",
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
@@ -683,7 +814,6 @@ tf_kernel_library(
     deps = [
         "//tensorflow/core:framework_headers_lib",
         "//tensorflow/core:lib",
-        "//tensorflow/core:set_ops_op_lib",
         "//third_party/eigen3",
     ],
 )
@@ -981,7 +1111,16 @@ tf_kernel_library(
     hdrs = ["tile_functor.h"],
     gpu_srcs = [
         "tile_functor.h",
-        "tile_functor_gpu.cu.cc",
+        "tile_functor_gpu.h",
+        "tile_functor_gpu_bool.cu.cc",
+        "tile_functor_gpu_complex64.cu.cc",
+        "tile_functor_gpu_complex128.cu.cc",
+        "tile_functor_gpu_double.cu.cc",
+        "tile_functor_gpu_float.cu.cc",
+        "tile_functor_gpu_half.cu.cc",
+        "tile_functor_gpu_int16.cu.cc",
+        "tile_functor_gpu_int32.cu.cc",
+        "tile_functor_gpu_int64.cu.cc",
     ],
     prefix = "tile_ops",
     deps = ARRAY_DEPS,
@@ -1047,7 +1186,6 @@ tf_kernel_library(
     srcs = ["ragged_gather_op.cc"],
     deps = [
         "//tensorflow/core:framework",
-        "//tensorflow/core:ragged_array_ops_op_lib",
     ],
 )
 
@@ -1056,13 +1194,12 @@ tf_cc_test(
     size = "small",
     srcs = ["ragged_gather_op_test.cc"],
     deps = [
+        ":ops_testutil",
         ":ragged_gather_op",
         "//tensorflow/core:framework",
-        "//tensorflow/core:ragged_array_ops_op_lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
-        "//tensorflow/core/kernels:ops_testutil",
     ],
 )
 
@@ -1071,7 +1208,6 @@ tf_kernel_library(
     srcs = ["ragged_range_op.cc"],
     deps = [
         "//tensorflow/core:framework",
-        "//tensorflow/core:ragged_math_ops_op_lib",
     ],
 )
 
@@ -1079,13 +1215,12 @@ tf_cc_test(
     name = "ragged_range_op_test",
     srcs = ["ragged_range_op_test.cc"],
     deps = [
+        ":ops_testutil",
         ":ragged_range_op",
         "//tensorflow/core:framework",
-        "//tensorflow/core:ragged_math_ops_op_lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
-        "//tensorflow/core/kernels:ops_testutil",
     ],
 )
 
@@ -1094,7 +1229,6 @@ tf_kernel_library(
     srcs = ["ragged_tensor_to_sparse_kernel.cc"],
     deps = [
         "//tensorflow/core:framework",
-        "//tensorflow/core:ragged_conversion_ops_op_lib",
     ],
 )
 
@@ -1103,14 +1237,13 @@ tf_cc_test(
     size = "small",
     srcs = ["ragged_tensor_to_sparse_kernel_test.cc"],
     deps = [
+        ":ops_testutil",
         ":ragged_tensor_to_sparse_kernel",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:ragged_conversion_ops_op_lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
-        "//tensorflow/core/kernels:ops_testutil",
     ],
 )
 
@@ -1119,13 +1252,12 @@ tf_kernel_library(
     srcs = ["cudnn_rnn_ops.cc"],
     visibility = ["//visibility:public"],
     deps = [
+        ":bounds_check_lib",
         ":gpu_util_hdrs",
-        "//tensorflow/core:cudnn_rnn_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:stream_executor",
-        "//tensorflow/core/kernels:bounds_check_lib",
         "//third_party/eigen3",
         "@farmhash_archive//:farmhash",
     ],
@@ -1197,7 +1329,6 @@ tf_cuda_cc_test(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:math_ops_op_lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
@@ -1261,7 +1392,7 @@ tf_cc_test(
     }),
 )
 
-tf_cc_test(
+tf_cuda_cc_test(
     name = "conv_ops_test",
     size = "medium",
     srcs = ["conv_ops_test.cc"],
@@ -1280,6 +1411,7 @@ tf_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
+        "@com_google_absl//absl/algorithm:container",
     ],
 )
 
@@ -1721,7 +1853,6 @@ tf_kernel_library(
     prefix = "candidate_sampler_ops",
     deps = [
         ":range_sampler",
-        "//tensorflow/core:candidate_sampling_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
     ],
@@ -1755,7 +1886,6 @@ tf_kernel_library(
     name = "control_flow_ops",
     prefix = "control_flow_ops",
     deps = [
-        "//tensorflow/core:control_flow_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
     ],
@@ -1767,7 +1897,6 @@ tf_kernel_library(
     deps = [
         ":bounds_check",
         ":ops_util",
-        "//tensorflow/core:ctc_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core/util/ctc:ctc_beam_search_lib",
@@ -1848,7 +1977,6 @@ DATA_FLOW_DEPS = [
     ":typed_queue",
     "//third_party/eigen3",
     "//tensorflow/core:core_cpu",
-    "//tensorflow/core:data_flow_ops_op_lib",
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
     "//tensorflow/core:lib_internal",
@@ -1913,7 +2041,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:scoped_allocator_ops_op_lib",
     ],
 )
 
@@ -1932,7 +2059,6 @@ tf_cuda_cc_test(
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:math_ops_op_lib",
         "//tensorflow/core:proto_text",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
@@ -1980,7 +2106,6 @@ tf_kernel_library(
 DYNAMIC_DEPS = [
     ":bounds_check",
     "//tensorflow/core:core_cpu",
-    "//tensorflow/core:data_flow_ops_op_lib",
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
     "//tensorflow/core:lib_internal",
@@ -2013,7 +2138,6 @@ LOOKUP_DEPS = [
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
     "//tensorflow/core:lib_internal",
-    "//tensorflow/core:lookup_ops_op_lib",
 ]
 
 tf_kernel_library(
@@ -2028,6 +2152,16 @@ tf_kernel_library(
     deps = LOOKUP_DEPS,
 )
 
+cc_library(
+    name = "string_view_variant_wrapper",
+    hdrs = ["string_view_variant_wrapper.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
 cc_library(
     name = "checkpoint_ops",
     deps = [
@@ -2042,7 +2176,6 @@ tf_kernel_library(
     deps = [
         ":lookup_table_init_op",
         ":lookup_table_op",
-        "//tensorflow/core:checkpoint_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//third_party/eigen3",
@@ -2053,7 +2186,6 @@ tf_kernel_library(
     name = "load_and_remap_matrix_op",
     srcs = ["load_and_remap_matrix_op.cc"],
     deps = [
-        "//tensorflow/core:checkpoint_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
@@ -2191,15 +2323,13 @@ tf_kernel_library(
         ":bounds_check",
         ":dense_update_functor",
         ":gather_functor",
-        ":mutex_ops",
         ":scatter_functor",
-        ":state",
         ":training_op_helpers",
         ":variable_ops",
+        "//tensorflow/core:core_cpu_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:resource_variable_ops_op_lib",
         "@com_google_absl//absl/strings",
     ],
 )
@@ -2214,9 +2344,9 @@ tf_kernel_library(
     ],
     deps = [
         ":concat_lib",
+        ":fill_functor",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:list_ops_op_lib",
         "//third_party/eigen3",
     ],
 )
@@ -2227,7 +2357,6 @@ tf_kernel_library(
     deps = [
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:user_ops_op_lib",
     ],
 )
 
@@ -2250,7 +2379,6 @@ tf_kernel_library(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
-        "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//third_party/eigen3",
@@ -2263,13 +2391,14 @@ tf_kernel_library(
     deps = [
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
-        "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler/clusters:virtual_cluster",
         "//tensorflow/core/grappler/optimizers:meta_optimizer",
         "//tensorflow/core/grappler/utils:functions",
+        "//tensorflow/stream_executor:stream",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -2295,6 +2424,7 @@ cc_library(
         ":resize_bilinear_op",
         ":resize_nearest_neighbor_op",
         ":sample_distorted_bounding_box_op",
+        ":scale_and_translate_op",
     ],
 )
 
@@ -2305,7 +2435,6 @@ IMAGE_DEPS = [
     "//third_party/eigen3",
     "//tensorflow/core:framework",
     "//tensorflow/core:gif_internal",
-    "//tensorflow/core:image_ops_op_lib",
     "//tensorflow/core:jpeg_internal",
     "//tensorflow/core:lib",
     "//tensorflow/core:lib_internal",
@@ -2397,6 +2526,12 @@ tf_kernel_library(
     deps = IMAGE_DEPS,
 )
 
+tf_kernel_library(
+    name = "scale_and_translate_op",
+    prefix = "scale_and_translate_op",
+    deps = IMAGE_DEPS + [":sampling_kernels"],
+)
+
 tf_kernel_library(
     name = "random_crop_op",
     prefix = "random_crop_op",
@@ -2438,7 +2573,6 @@ tf_kernel_library(
     prefix = "encode_wav_op",
     deps = [
         ":bounds_check",
-        "//tensorflow/core:audio_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
@@ -2450,7 +2584,6 @@ tf_kernel_library(
     name = "decode_wav_op",
     prefix = "decode_wav_op",
     deps = [
-        "//tensorflow/core:audio_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
@@ -2473,6 +2606,7 @@ tf_cc_tests(
         ":eigen_helpers",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -2482,8 +2616,12 @@ tf_cc_test(
     name = "eigen_mkldnn_contraction_kernel_test",
     size = "small",
     srcs = select({
-        ":mkldnn_contraction_kernel": ["eigen_mkldnn_contraction_kernel_test.cc"],
-        "//conditions:default": [],
+        "//tensorflow:android": [],
+        "//tensorflow:arm": [],
+        "//tensorflow:ios": [],
+        "//tensorflow:linux_ppc64le": [],
+        ":no_mkldnn_contraction_kernel": [],
+        "//conditions:default": ["eigen_mkldnn_contraction_kernel_test.cc"],
     }),
     tags = ["mkldnn_contraction_kernel"],
     deps = [
@@ -2547,6 +2685,7 @@ tf_cc_tests(
         "resize_bicubic_op_test.cc",
         "resize_bilinear_op_test.cc",
         "resize_nearest_neighbor_op_test.cc",
+        "scale_and_translate_op_test.cc",
     ],
     linkopts = select({
         "//tensorflow:darwin": ["-headerpad_max_install_names"],
@@ -2556,6 +2695,7 @@ tf_cc_tests(
         ":image",
         ":ops_testutil",
         ":ops_util",
+        ":sampling_kernels",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -2635,7 +2775,6 @@ cc_library(
 IO_DEPS = [
     ":ops_util",
     "//tensorflow/core:framework",
-    "//tensorflow/core:io_ops_op_lib",
     "//tensorflow/core:lib",
     "//tensorflow/core:lib_internal",
     "//tensorflow/core:protos_all_cc",
@@ -2679,7 +2818,6 @@ SAVE_RESTORE_DEPS = [
     ":bounds_check_lib",
     ":save_restore_tensor",
     "//tensorflow/core:framework",
-    "//tensorflow/core:io_ops_op_lib",
     "//tensorflow/core:lib",
     "//tensorflow/core:lib_internal",
     "//tensorflow/core:protos_all_cc",
@@ -2767,6 +2905,7 @@ cc_library(
         ":self_adjoint_eig_op",
         ":self_adjoint_eig_v2_op",
         ":svd_op",
+        ":tridiagonal_solve_op",
     ],
 )
 
@@ -2798,7 +2937,6 @@ LINALG_DEPS = [
     "//third_party/eigen3",
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
-    "//tensorflow/core:linalg_ops_op_lib",
 ] + if_cuda([
     ":cuda_solvers",
     ":transpose_functor",
@@ -2885,6 +3023,12 @@ tf_kernel_library(
     ]),
 )
 
+tf_kernel_library(
+    name = "tridiagonal_solve_op",
+    srcs = ["tridiagonal_solve_op.cc"],
+    deps = LINALG_DEPS,
+)
+
 tf_kernel_library(
     name = "qr_op",
     prefix = "qr_op",
@@ -2942,7 +3086,6 @@ LOGGING_DEPS = [
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
     "//tensorflow/core:lib_internal",
-    "//tensorflow/core:logging_ops_op_lib",
     "//tensorflow/core:protos_all_cc",
 ]
 
@@ -3014,7 +3157,6 @@ tf_kernel_library(
         ":bounds_check",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:manip_ops_op_lib",
         "//third_party/eigen3",
     ],
 )
@@ -3046,7 +3188,6 @@ MATH_DEPS = [
     "//tensorflow/core:lib",
     "//tensorflow/core:lib_internal",
     "//tensorflow/core:math_grad",
-    "//tensorflow/core:math_ops_op_lib",
     "//third_party/eigen3",
 ]
 
@@ -3064,7 +3205,7 @@ tf_kernel_library(
         "//conditions:default": [],
     }),
     prefix = "sparse_matmul_op",
-    deps = MATH_DEPS + select({
+    deps = MATH_DEPS + [":eigen_contraction_kernel"] + select({
         ":xsmm": [
             "@libxsmm_archive//:xsmm_avx",
         ],
@@ -3089,6 +3230,7 @@ cc_library(
         ":fft_ops",
         ":histogram_op",
         ":matmul_op",
+        ":nextafter_op",
         ":population_count_op",
         ":reduction_ops",
         ":scan_ops",
@@ -3156,7 +3298,13 @@ tf_kernel_library(
 tf_kernel_library(
     name = "cwise_op",
     prefix = "cwise_op",
-    deps = MATH_DEPS + ["//tensorflow/core:bitwise_ops_op_lib"],
+    deps = MATH_DEPS,
+)
+
+tf_kernel_library(
+    name = "nextafter_op",
+    prefix = "nextafter_op",
+    deps = MATH_DEPS + [":cwise_op"],
 )
 
 tf_kernel_library(
@@ -3169,7 +3317,6 @@ tf_kernel_library(
     name = "fft_ops",
     prefix = "fft_ops",
     deps = MATH_DEPS + [
-        "//tensorflow/core:spectral_ops_op_lib",
     ] + if_cuda([
         "//tensorflow/core/platform/default/build_config:cufft_plugin",
     ]),
@@ -3218,7 +3365,15 @@ tf_kernel_library(
 
 tf_kernel_library(
     name = "scan_ops",
-    prefix = "scan_ops",
+    srcs = ["scan_ops.cc"],
+    hdrs = ["scan_ops.h"],
+    gpu_srcs = [
+        "scan_ops.h",
+        "scan_ops_gpu.h",
+        "scan_ops_gpu_double.cu.cc",
+        "scan_ops_gpu_float.cu.cc",
+        "scan_ops_gpu_half.cu.cc",
+    ],
     deps = MATH_DEPS + if_cuda(["@cub_archive//:cub"]),
 )
 
@@ -3368,10 +3523,7 @@ tf_cuda_cc_test(
         ":quantized_ops",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:client_session",
-        "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core:framework",
-        "//tensorflow/core:math_ops_op_lib",
-        "//tensorflow/core:nn_ops_op_lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
@@ -3484,27 +3636,6 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
-    name = "shape_op_test",
-    srcs = ["shape_op_test.cc"],
-    deps = [
-        ":array",
-        ":ops_testutil",
-        ":ops_util",
-        "//tensorflow/cc:cc_ops",
-        "//tensorflow/cc:client_session",
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:direct_session",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:ops",
-        "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-        "//tensorflow/core:testlib",
-    ],
-)
-
 tf_cuda_cc_test(
     name = "sparse_matmul_op_test",
     size = "small",
@@ -3624,11 +3755,12 @@ tf_kernel_library(
         ":image_resizer_state",
         ":fill_functor",
         ":ops_util",
+        "@com_google_absl//absl/base:dynamic_annotations",
+        "@com_google_absl//absl/strings",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:nn_ops_op_lib",
     ] + select({
         ":xsmm_convolutions": [
             "@libxsmm_archive//:xsmm_avx",
@@ -3642,7 +3774,15 @@ tf_kernel_library(
 
 tf_kernel_library(
     name = "depthwise_conv_op",
-    prefix = "depthwise_conv_op",
+    srcs = ["depthwise_conv_op.cc"],
+    hdrs = ["depthwise_conv_op.h"],
+    gpu_srcs = [
+        "depthwise_conv_op.h",
+        "depthwise_conv_op_gpu.h",
+        "depthwise_conv_op_gpu_double.cu.cc",
+        "depthwise_conv_op_gpu_float.cu.cc",
+        "depthwise_conv_op_gpu_half.cu.cc",
+    ],
     deps = [
         ":bounds_check",
         ":conv_ops",
@@ -3650,7 +3790,6 @@ tf_kernel_library(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:nn_ops_op_lib",
     ] + if_cuda([
         "@cub_archive//:cub",
         "@local_config_cuda//cuda:cudnn_header",
@@ -3670,7 +3809,6 @@ tf_kernel_library(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:nn_ops_op_lib",
     ] + if_cuda([
         "@local_config_cuda//cuda:cudnn_header",
     ]),
@@ -3712,16 +3850,13 @@ NN_DEPS = [
     ":bounds_check",
     ":conv_2d",
     ":eigen_contraction_kernel",
-    ":fused_batch_norm_util_gpu",
     ":ops_util",
-    ":pooling_ops",
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
     "//tensorflow/core:lib_internal",
     "//tensorflow/core:nn_grad",
-    "//tensorflow/core:nn_ops_op_lib",
     "//third_party/eigen3",
-] + if_mkl(["//tensorflow/core:mkl_nn_ops_op_lib"])
+]
 
 tf_kernel_library(
     name = "batch_norm_op",
@@ -3741,6 +3876,8 @@ tf_kernel_library(
     deps = NN_DEPS + if_cuda([
         ":reduction_ops",
         "@cub_archive//:cub",
+        "//tensorflow/core:stream_executor",
+        "//tensorflow/stream_executor/cuda:cuda_stream",
     ]),
 )
 
@@ -3793,7 +3930,21 @@ tf_kernel_library(
 
 tf_kernel_library(
     name = "topk_op",
-    prefix = "topk_op",
+    srcs = ["topk_op.cc"],
+    hdrs = ["topk_op.h"],
+    gpu_srcs = [
+        "topk_op.h",
+        "topk_op_gpu.h",
+        "topk_op_gpu_double.cu.cc",
+        "topk_op_gpu_float.cu.cc",
+        "topk_op_gpu_half.cu.cc",
+        "topk_op_gpu_int64.cu.cc",
+        "topk_op_gpu_int32.cu.cc",
+        "topk_op_gpu_int16.cu.cc",
+        "topk_op_gpu_uint16.cu.cc",
+        "topk_op_gpu_int8.cu.cc",
+        "topk_op_gpu_uint8.cu.cc",
+    ],
     deps = NN_DEPS + if_cuda(["@cub_archive//:cub"]),
 )
 
@@ -3841,7 +3992,6 @@ tf_kernel_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_grad",
-        "//tensorflow/core:nn_ops_op_lib",
     ] + if_cuda(["@cub_archive//:cub"]),
 )
 
@@ -3893,6 +4043,7 @@ tf_cuda_cc_test(
         ":nn",
         ":ops_testutil",
         ":ops_util",
+        ":pooling_ops",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:cc_ops_internal",
         "//tensorflow/core:core_cpu",
@@ -3949,7 +4100,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:nn_ops_op_lib",
         "//tensorflow/core:stream_executor",
         "//third_party/eigen3",
     ],
@@ -3971,19 +4121,6 @@ tf_kernel_library(
     alwayslink = 1,
 )
 
-tf_kernel_library(
-    name = "fused_batch_norm_util",
-    gpu_srcs = [
-        "fused_batch_norm_op.h",
-        "fused_batch_norm_op.cu.cc",
-    ],
-    deps = [
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//third_party/eigen3",
-    ],
-)
-
 cc_library(
     name = "pooling_ops_hdrs",
     hdrs = [
@@ -4006,7 +4143,6 @@ tf_kernel_library(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:nn_ops_op_lib",
         "//third_party/eigen3",
     ],
 )
@@ -4088,7 +4224,6 @@ cc_library(
 PARSING_DEPS = [
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
-    "//tensorflow/core:parsing_ops_op_lib",
     "//tensorflow/core:proto_text",
     "//tensorflow/core:protos_all_cc",
 ]
@@ -4157,7 +4292,6 @@ RANDOM_OPS_DEPS = [
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
     "//tensorflow/core:lib_internal",
-    "//tensorflow/core:random_ops_op_lib",
 ]
 
 tf_kernel_library(
@@ -4188,6 +4322,30 @@ tf_cuda_cc_test(
     ],
 )
 
+tf_kernel_library(
+    name = "stateful_random_ops",
+    prefix = "stateful_random_ops",
+    deps = [
+        ":bounds_check",
+        ":dense_update_functor",
+        ":gather_functor",
+        ":mutex_ops",
+        ":random_op",
+        ":resource_variable_ops",
+        ":scatter_functor",
+        ":state",
+        ":training_op_helpers",
+        ":variable_ops",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:variant",
+    ],
+)
+
 tf_kernel_library(
     name = "stateless_random_ops",
     prefix = "stateless_random_ops",
@@ -4196,7 +4354,6 @@ tf_kernel_library(
         ":random_op",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:stateless_random_ops_op_lib",
     ],
 )
 
@@ -4211,8 +4368,6 @@ cc_library(
 REQUIRED_DEPS = [
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
-    "//tensorflow/core:no_op_op_lib",
-    "//tensorflow/core:sendrecv_ops_op_lib",
 ]
 
 tf_kernel_library(
@@ -4273,7 +4428,6 @@ cc_library(
 SPARSE_DEPS = [
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
-    "//tensorflow/core:sparse_ops_op_lib",
 ]
 
 tf_kernel_library(
@@ -4405,7 +4559,10 @@ tf_kernel_library(
     deps = SPARSE_DEPS + [
         ":bounds_check",
         "//third_party/eigen3",
-    ],
+    ] + if_cuda([
+        ":reduction_ops",
+        "@cub_archive//:cub",
+    ]),
 )
 
 tf_kernel_library(
@@ -4518,7 +4675,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:sdca_ops_op_lib",
         "//third_party/eigen3",
         "@farmhash_archive//:farmhash",
     ],
@@ -4530,6 +4686,7 @@ cc_library(
     srcs = ["sdca_internal.cc"],
     hdrs = ["sdca_internal.h"],
     deps = [
+        ":eigen_contraction_kernel",
         ":loss_updaters",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -4557,7 +4714,6 @@ STATE_DEPS = [
     "//third_party/eigen3",
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
-    "//tensorflow/core:state_ops_op_lib",
 ] + if_sycl(["//tensorflow/core:sycl_runtime"])
 
 tf_kernel_library(
@@ -4695,7 +4851,6 @@ STRING_DEPS = [
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
     "//tensorflow/core:lib_internal",
-    "//tensorflow/core:string_ops_op_lib",
 ]
 
 tf_kernel_library(
@@ -4721,6 +4876,8 @@ tf_cc_test(
     size = "small",
     srcs = ["string_format_op_test.cc"],
     deps = [
+        ":ops_testutil",
+        ":ops_util",
         ":string_format_op",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
@@ -4729,8 +4886,6 @@ tf_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
-        "//tensorflow/core/kernels:ops_testutil",
-        "//tensorflow/core/kernels:ops_util",
     ],
 )
 
@@ -4763,6 +4918,8 @@ tf_cc_test(
     size = "small",
     srcs = ["regex_replace_op_test.cc"],
     deps = [
+        ":ops_testutil",
+        ":ops_util",
         ":regex_replace_op",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
@@ -4771,8 +4928,6 @@ tf_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
-        "//tensorflow/core/kernels:ops_testutil",
-        "//tensorflow/core/kernels:ops_util",
     ],
 )
 
@@ -4787,6 +4942,8 @@ tf_cc_test(
     size = "small",
     srcs = ["string_split_op_test.cc"],
     deps = [
+        ":ops_testutil",
+        ":ops_util",
         ":string_split_op",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
@@ -4795,8 +4952,6 @@ tf_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
-        "//tensorflow/core/kernels:ops_testutil",
-        "//tensorflow/core/kernels:ops_util",
     ],
 )
 
@@ -4817,6 +4972,8 @@ tf_cc_test(
     size = "small",
     srcs = ["substr_op_test.cc"],
     deps = [
+        ":ops_testutil",
+        ":ops_util",
         ":substr_op",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
@@ -4826,8 +4983,6 @@ tf_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
-        "//tensorflow/core/kernels:ops_testutil",
-        "//tensorflow/core/kernels:ops_util",
     ],
 )
 
@@ -4846,7 +5001,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:string_ops_op_lib",
         "//third_party/eigen3",
         "//third_party/icu/data:conversion_data",
         "@icu//:common",
@@ -4868,7 +5022,6 @@ tf_kernel_library(
         ":variable_ops",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:training_ops_op_lib",
         "//third_party/eigen3",
     ],
 )
@@ -4896,11 +5049,14 @@ tf_kernel_library(
         ":random_op",
         ":random_ops",
         ":stateless_random_ops",
+        "//third_party/eigen3",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//third_party/eigen3",
-    ],
+    ] + if_cuda([
+        ":reduction_ops",
+        "@cub_archive//:cub",
+    ]),
 )
 
 tf_cuda_cc_test(
@@ -4927,7 +5083,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:random_ops_op_lib",
     ],
 )
 
@@ -4955,7 +5110,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:random_ops_op_lib",
     ],
 )
 
@@ -5057,7 +5211,6 @@ tf_kernel_library(
     prefix = "spectrogram_op",
     deps = [
         ":spectrogram",
-        "//tensorflow/core:audio_ops_op_lib",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -5175,7 +5328,6 @@ tf_kernel_library(
     prefix = "mfcc_op",
     deps = [
         ":mfcc",
-        "//tensorflow/core:audio_ops_op_lib",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -5225,7 +5377,6 @@ filegroup(
     srcs = [
         "avgpooling_op.h",
         "batch_util.h",
-        "bounds_check.h",
         "cwise_ops.h",
         "cwise_ops_common.h",
         "cwise_ops_gradients.h",
@@ -5266,7 +5417,6 @@ filegroup(
         "assign_op.h",
         "bias_op.cc",
         "bias_op.h",
-        "bounds_check.h",
         "cast_op.cc",
         "cast_op.h",
         "cast_op_impl.h",
@@ -5412,6 +5562,7 @@ filegroup(
         "gemm_functors.h",
         "image_resizer_state.h",
         "initializable_lookup_table.h",
+        "logging_ops.h",
         "lookup_table_init_op.h",
         "lookup_table_op.h",
         "lookup_util.h",
@@ -5562,6 +5713,7 @@ filegroup(
         "decode_bmp_op.cc",
         "depthtospace_op.cc",
         "dynamic_stitch_op.cc",
+        "fft_ops.cc",
         "in_topk_op.cc",
         "initializable_lookup_table.cc",
         "logging_ops.cc",
@@ -5754,6 +5906,7 @@ filegroup(
             "mkl_*",
             "xsmm_*",
             "cwise_ops_sycl_common.h",
+            "nextafter_op.cc",
         ] + ANDROID_TEXTUAL_HDRS,
     ),
     visibility = ["//visibility:public"],
@@ -5864,12 +6017,9 @@ tf_kernel_library(
         ":ops_util",
         ":pooling_ops",
         ":quantization_utils",
-        "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:math_ops_op_lib",
-        "//tensorflow/core:nn_ops_op_lib",
         "//third_party/eigen3",
         "@gemmlowp",
     ],
@@ -6355,6 +6505,30 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test_mkl(
+    name = "mkl_quantized_concat_op_test",
+    size = "small",
+    srcs = ["mkl_quantized_concat_op_test.cc"],
+    deps = [
+        ":mkl_concat_op",
+        ":ops_testutil",
+        ":ops_util",
+        ":quantization_utils",
+        ":quantized_ops",
+        "//tensorflow/core:array_ops_op_lib",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:mkl_array_ops_op_lib",
+        "//tensorflow/core:nn_ops_op_lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_cc_test(
     name = "quantized_batch_norm_op_test",
     size = "small",
@@ -6439,7 +6613,6 @@ tf_kernel_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:remote_fused_graph_ops_op_lib",
     ],
 )
 
@@ -6479,6 +6652,7 @@ cc_library(
     srcs = ["remote_fused_graph_execute_op_test_utils.cc"],
     hdrs = ["remote_fused_graph_execute_op_test_utils.h"],
     deps = [
+        ":cwise_op",
         ":remote_fused_graph_execute_utils",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:ops",
@@ -6486,7 +6660,6 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:testlib",
-        "//tensorflow/core/kernels:cwise_op",
     ],
 )
 
@@ -6497,6 +6670,7 @@ tf_cc_test(
         "remote_fused_graph_execute_utils_test.cc",
     ],
     deps = [
+        ":cwise_op",
         ":remote_fused_graph_execute_op_test_utils",
         ":remote_fused_graph_execute_utils",
         "//tensorflow/cc:cc_ops",
@@ -6512,7 +6686,6 @@ tf_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
-        "//tensorflow/core/kernels:cwise_op",
     ],
 )
 
@@ -6594,8 +6767,6 @@ tf_mkl_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:mkl_nn_ops_op_lib",
-        "//tensorflow/core:nn_ops_op_lib",
     ] + mkl_deps(),
 )
 
@@ -6603,6 +6774,7 @@ tf_cc_test_mkl(
     name = "mkl_conv_ops_test",
     size = "small",
     srcs = ["mkl_conv_ops_test.cc"],
+    linkstatic = 1,  # Fixes dyld error on MacOS.
     deps = [
         ":ops_testutil",
         ":ops_util",
@@ -6629,8 +6801,6 @@ tf_mkl_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:mkl_nn_ops_op_lib",
-        "//tensorflow/core:nn_ops_op_lib",
     ] + mkl_deps(),
 )
 
@@ -6645,8 +6815,6 @@ tf_mkl_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:mkl_nn_ops_op_lib",
-        "//tensorflow/core:nn_ops_op_lib",
     ] + mkl_deps(),
 )
 
@@ -6665,8 +6833,6 @@ tf_mkl_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:mkl_nn_ops_op_lib",
-        "//tensorflow/core:nn_ops_op_lib",
     ] + mkl_deps(),
 )
 
@@ -6680,8 +6846,6 @@ tf_mkl_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:mkl_nn_ops_op_lib",
-        "//tensorflow/core:nn_ops_op_lib",
         "//third_party/eigen3",
     ] + mkl_deps(),
 )
@@ -6696,8 +6860,6 @@ tf_mkl_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:mkl_nn_ops_op_lib",
-        "//tensorflow/core:nn_ops_op_lib",
         "//third_party/eigen3",
     ] + mkl_deps(),
 )
@@ -6755,6 +6917,65 @@ tf_mkl_kernel_library(
     deps = NN_DEPS + mkl_deps() + [":cwise_op"],
 )
 
+tf_mkl_kernel_library(
+    name = "mkl_requantize_ops",
+    srcs = [
+        "mkl_requantization_range_per_channel_op.cc",
+        "mkl_requantize_per_channel_op.cc",
+    ],
+    hdrs = [
+        "meta_support.h",
+        "no_op.h",
+        "reference_gemm.h",
+    ],
+    deps = if_mkl(
+        [
+            ":concat_lib_hdrs",
+            ":conv_ops",
+            ":cwise_op",
+            ":eigen_helpers",
+            ":image_resizer_state",
+            ":ops_util",
+            ":pooling_ops",
+            ":quantization_utils",
+            ":transpose_functor",
+            "//third_party/eigen3",
+            "@gemmlowp",
+            "@mkl_dnn",
+            "//tensorflow/core:core_cpu",
+            "//tensorflow/core:framework",
+            "//tensorflow/core:lib",
+            "//third_party/mkl:intel_binary_blob",
+        ],
+    ),
+)
+
+tf_cc_test_mkl(
+    name = "mkl_requantize_ops_test",
+    size = "small",
+    srcs = ["mkl_requantize_ops_test.cc"],
+    linkstatic = 1,  # Fixes dyld error on MacOS.
+    deps = [
+        ":mkl_requantize_ops",
+        ":ops_testutil",
+        ":ops_util",
+        ":quantization_utils",
+        ":quantized_ops",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:array_ops_op_lib",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:tensorflow",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_cc_test_mkl(
     name = "mkl_fused_ops_test",
     size = "small",
@@ -6835,14 +7056,13 @@ tf_kernel_library(
     name = "summary_kernels",
     srcs = ["summary_kernels.cc"],
     deps = [
-        "//tensorflow/contrib/tensorboard/db:schema",
-        "//tensorflow/contrib/tensorboard/db:summary_db_writer",
-        "//tensorflow/contrib/tensorboard/db:summary_file_writer",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:summary_ops_op_lib",
         "//tensorflow/core/lib/db:sqlite",
+        "//tensorflow/core/summary:schema",
+        "//tensorflow/core/summary:summary_db_writer",
+        "//tensorflow/core/summary:summary_file_writer",
     ],
 )
 
@@ -6852,13 +7072,13 @@ tf_kernel_library(
         "decode_proto_op.cc",
     ],
     deps = [
-        "//tensorflow/core:decode_proto_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core/util/proto:decode",
         "//tensorflow/core/util/proto:descriptors",
         "//tensorflow/core/util/proto:proto_utils",
         "//third_party/eigen3",
+        "@com_google_absl//absl/container:flat_hash_map",
     ],
 )
 
@@ -6866,7 +7086,6 @@ tf_kernel_library(
     name = "encode_proto_op",
     srcs = ["encode_proto_op.cc"],
     deps = [
-        "//tensorflow/core:encode_proto_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core/util/proto:descriptors",
@@ -6884,7 +7103,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:rpc_ops_op_lib",
         "//tensorflow/core/util/rpc:call_container",
         "//tensorflow/core/util/rpc:rpc_factory",
         "//tensorflow/core/util/rpc:rpc_factory_registry",
@@ -6897,7 +7115,6 @@ tf_kernel_library(
     srcs = ["unicode_script_op.cc"],
     deps = [
         "//tensorflow/core:framework",
-        "//tensorflow/core:string_ops_op_lib",
         "@icu//:common",
     ],
 )
@@ -6940,3 +7157,31 @@ cc_header_only_library(
         ":cwise_lib",
     ],
 )
+
+# Library to link with when compiling the quantize and dequantize kernels directly,
+# e.g. for selective registration.
+cc_header_only_library(
+    name = "quantize_and_dequantize_op_hdrs",
+    deps = [
+        ":quantize_and_dequantize_op",
+    ],
+)
+
+cc_library(
+    name = "kernel_platform_strings",
+    srcs = ["kernel_platform_strings.h"],
+    deps = [
+        "//tensorflow/core:platform_strings",
+    ],
+    alwayslink = 1,
+)
+
+# Shared object that links all the kernels TF needs.
+tf_cc_shared_object(
+    name = "libtfkernel_all_kernels.so",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":kernel_platform_strings",
+        "//tensorflow/core:all_kernels_impl",
+    ],
+)
diff --git a/tensorflow/core/kernels/adjust_contrast_op.cc b/tensorflow/core/kernels/adjust_contrast_op.cc
index 47e10f56dfa682d97b04b78cd0e5f9a536081025..1aef0060b0c35a9cc1b451f4a579c90bc31fbaaf 100644
--- a/tensorflow/core/kernels/adjust_contrast_op.cc
+++ b/tensorflow/core/kernels/adjust_contrast_op.cc
@@ -189,11 +189,11 @@ class AdjustContrastOpV2Base : public OpKernel {
                          const ComputeOptions& options) = 0;
 };
 
-template <typename Device>
+template <typename Device, typename T>
 class AdjustContrastOpv2;
 
 template <>
-class AdjustContrastOpv2<CPUDevice> : public AdjustContrastOpV2Base {
+class AdjustContrastOpv2<CPUDevice, float> : public AdjustContrastOpV2Base {
  public:
   explicit AdjustContrastOpv2(OpKernelConstruction* context)
       : AdjustContrastOpV2Base(context) {}
@@ -378,23 +378,32 @@ class AdjustContrastOpv2<CPUDevice> : public AdjustContrastOpV2Base {
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("AdjustContrastv2").Device(DEVICE_CPU),
-                        AdjustContrastOpv2<CPUDevice>);
+REGISTER_KERNEL_BUILDER(
+    Name("AdjustContrastv2").Device(DEVICE_CPU).TypeConstraint<float>("T"),
+    AdjustContrastOpv2<CPUDevice, float>);
 
 #if GOOGLE_CUDA
 // Forward declarations of the function specializations for GPU (to prevent
 // building the GPU versions here, they will be built compiling _gpu.cu.cc).
 namespace functor {
-template <>
-void AdjustContrastv2<GPUDevice>::operator()(
-    const GPUDevice& d, typename TTypes<float, 4>::ConstTensor input,
-    typename TTypes<float>::ConstScalar contrast_factor,
-    typename TTypes<float, 4>::Tensor output);
-extern template struct AdjustContrastv2<GPUDevice>;
+
+#define DECLARE_GPU_SPEC(T)                                         \
+  template <>                                                       \
+  void AdjustContrastv2<GPUDevice, T>::operator()(                  \
+      const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input, \
+      typename TTypes<float>::ConstScalar contrast_factor,          \
+      typename TTypes<T, 4>::Tensor output);                        \
+  extern template struct AdjustContrastv2<GPUDevice, T>;
+
+DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(Eigen::half);
+
+#undef DECLARE_GPU_SPEC
+
 }  // namespace functor
 
-template <>
-class AdjustContrastOpv2<GPUDevice> : public AdjustContrastOpV2Base {
+template <typename T>
+class AdjustContrastOpv2<GPUDevice, T> : public AdjustContrastOpV2Base {
  public:
   explicit AdjustContrastOpv2(OpKernelConstruction* context)
       : AdjustContrastOpV2Base(context) {}
@@ -403,20 +412,27 @@ class AdjustContrastOpv2<GPUDevice> : public AdjustContrastOpV2Base {
                  const ComputeOptions& options) override {
     const int64 shape[4] = {options.batch, options.height, options.width,
                             options.channels};
-    functor::AdjustContrastv2<GPUDevice>()(
-        context->eigen_device<GPUDevice>(),
-        options.input->shaped<float, 4>(shape), options.factor->scalar<float>(),
-        options.output->shaped<float, 4>(shape));
+    functor::AdjustContrastv2<GPUDevice, T>()(
+        context->eigen_device<GPUDevice>(), options.input->shaped<T, 4>(shape),
+        options.factor->scalar<float>(), options.output->shaped<T, 4>(shape));
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("AdjustContrastv2").Device(DEVICE_GPU),
-                        AdjustContrastOpv2<GPUDevice>);
+#define REGISTER_GPU(T)                                                   \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("AdjustContrastv2").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+      AdjustContrastOpv2<GPUDevice, T>);
+
+REGISTER_GPU(float)
+REGISTER_GPU(Eigen::half)
+
+#undef REGISTER_GPU
+
 #endif  // GOOGLE_CUDA
 
 #ifdef TENSORFLOW_USE_SYCL
 template <>
-class AdjustContrastOpv2<SYCLDevice> : public AdjustContrastOpV2Base {
+class AdjustContrastOpv2<SYCLDevice, float> : public AdjustContrastOpV2Base {
  public:
   explicit AdjustContrastOpv2(OpKernelConstruction* context)
       : AdjustContrastOpV2Base(context) {}
@@ -431,8 +447,9 @@ class AdjustContrastOpv2<SYCLDevice> : public AdjustContrastOpV2Base {
         options.output->shaped<float, 4>(shape));
   }
 };
-REGISTER_KERNEL_BUILDER(Name("AdjustContrastv2").Device(DEVICE_SYCL),
-                        AdjustContrastOpv2<SYCLDevice>);
+REGISTER_KERNEL_BUILDER(
+    Name("AdjustContrastv2").Device(DEVICE_SYCL).TypeConstraint<float>("T"),
+    AdjustContrastOpv2<SYCLDevice, float>);
 #endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/adjust_contrast_op.h b/tensorflow/core/kernels/adjust_contrast_op.h
index f4a53c2ef9ca77eaa634a9a090cc98f93d179806..3e501bccee3315f15cf8009f5e04aa00d706da5c 100644
--- a/tensorflow/core/kernels/adjust_contrast_op.h
+++ b/tensorflow/core/kernels/adjust_contrast_op.h
@@ -87,11 +87,11 @@ struct AdjustContrast {
 };
 
 // Functor used by AdjustContrastOpv2 to do the computations.
-template <typename Device>
+template <typename Device, typename T>
 struct AdjustContrastv2 {
-  void operator()(const Device& d, typename TTypes<float, 4>::ConstTensor input,
+  void operator()(const Device& d, typename TTypes<T, 4>::ConstTensor input,
                   typename TTypes<float>::ConstScalar contrast_factor,
-                  typename TTypes<float, 4>::Tensor output) {
+                  typename TTypes<T, 4>::Tensor output) {
     const int batch = input.dimension(0);
     const int height = input.dimension(1);
     const int width = input.dimension(2);
@@ -138,15 +138,19 @@ struct AdjustContrastv2 {
 #endif
     Eigen::Sizes<1, 1, 1, 1> scalar;
     float num_reduced_coeffs = height * width;
-    output.device(d) =
-        (input.shuffle(reduced_dims_first).sum(reduction_axis).eval() /
-         num_reduced_coeffs)
-            .reshape(reshape_dims)
-            .broadcast(broadcast_dims);
+    output.device(d) = (input.template cast<float>()
+                            .shuffle(reduced_dims_first)
+                            .sum(reduction_axis)
+                            .eval() /
+                        num_reduced_coeffs)
+                           .template cast<T>()
+                           .reshape(reshape_dims)
+                           .broadcast(broadcast_dims);
     auto contrast_factor_tensor =
         contrast_factor.reshape(scalar).broadcast(scalar_broadcast);
-    auto adjusted = (input - output) * contrast_factor_tensor;
-    output.device(d) += adjusted;
+    auto adjusted =
+        (input - output).template cast<float>() * contrast_factor_tensor;
+    output.device(d) += adjusted.template cast<T>();
   }
 };
 
diff --git a/tensorflow/core/kernels/adjust_contrast_op_gpu.cu.cc b/tensorflow/core/kernels/adjust_contrast_op_gpu.cu.cc
index a451bfe29c76d0710e97cbe2b98a9837332014e5..1a1c2a4e1ee99cffbf5c18d849a97c36767829ee 100644
--- a/tensorflow/core/kernels/adjust_contrast_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/adjust_contrast_op_gpu.cu.cc
@@ -26,7 +26,8 @@ namespace tensorflow {
 typedef Eigen::GpuDevice GPUDevice;
 
 // this is for v2
-template struct functor::AdjustContrastv2<GPUDevice>;
+template struct functor::AdjustContrastv2<GPUDevice, float>;
+template struct functor::AdjustContrastv2<GPUDevice, Eigen::half>;
 
 // these are for v1
 template struct functor::AdjustContrast<GPUDevice, uint8>;
diff --git a/tensorflow/core/kernels/adjust_hsv_gpu.cu.h b/tensorflow/core/kernels/adjust_hsv_gpu.cu.h
index 49df5ae296b3e2a213c436d0e4656757c49cb16e..dede7d249da978dc71214c06dd1c542f78db751e 100644
--- a/tensorflow/core/kernels/adjust_hsv_gpu.cu.h
+++ b/tensorflow/core/kernels/adjust_hsv_gpu.cu.h
@@ -91,11 +91,10 @@ inline __device__ RgbTuple hsv2rgb_cuda(const float h, const float s,
   return tuple;
 }
 
-template <bool AdjustHue, bool AdjustSaturation, bool AdjustV>
+template <bool AdjustHue, bool AdjustSaturation, bool AdjustV, typename T>
 __global__ void adjust_hsv_nhwc(const int64 number_elements,
-                                const float* const __restrict__ input,
-                                float* const output,
-                                const float* const hue_delta,
+                                const T* const __restrict__ input,
+                                T* const output, const float* const hue_delta,
                                 const float* const saturation_scale,
                                 const float* const value_scale) {
   // multiply by 3 since we're dealing with contiguous RGB bytes for each pixel
@@ -111,7 +110,9 @@ __global__ void adjust_hsv_nhwc(const int64 number_elements,
     output[idx + 2] = input[idx + 2];
     return;
   }
-  const HsvTuple hsv = rgb2hsv_cuda(input[idx], input[idx + 1], input[idx + 2]);
+  const HsvTuple hsv = rgb2hsv_cuda(static_cast<float>(input[idx]),
+                                    static_cast<float>(input[idx + 1]),
+                                    static_cast<float>(input[idx + 2]));
   float new_h = hsv.h;
   float new_s = hsv.s;
   float new_v = hsv.v;
@@ -134,9 +135,9 @@ __global__ void adjust_hsv_nhwc(const int64 number_elements,
     new_v = hsv.v * scale;
   }
   const RgbTuple rgb = hsv2rgb_cuda(new_h, new_s, new_v);
-  output[idx] = rgb.r;
-  output[idx + 1] = rgb.g;
-  output[idx + 2] = rgb.b;
+  output[idx] = static_cast<T>(rgb.r);
+  output[idx + 1] = static_cast<T>(rgb.g);
+  output[idx + 2] = static_cast<T>(rgb.b);
 }
 
 }  // namespace internal
diff --git a/tensorflow/core/kernels/adjust_hue_op.cc b/tensorflow/core/kernels/adjust_hue_op.cc
index 52dec94305d3c8558013861a44524609ad6eed7a..06de5ea3fb69c811f3057ff6829b22466a31f64a 100644
--- a/tensorflow/core/kernels/adjust_hue_op.cc
+++ b/tensorflow/core/kernels/adjust_hue_op.cc
@@ -82,7 +82,7 @@ class AdjustHueOpBase : public OpKernel {
   }
 };
 
-template <class Device>
+template <class Device, typename T>
 class AdjustHueOp;
 
 namespace internal {
@@ -196,7 +196,7 @@ static void hv_range_to_rgb(float h, float v_min, float v_max, float* r,
 }  // namespace internal
 
 template <>
-class AdjustHueOp<CPUDevice> : public AdjustHueOpBase {
+class AdjustHueOp<CPUDevice, float> : public AdjustHueOpBase {
  public:
   explicit AdjustHueOp(OpKernelConstruction* context)
       : AdjustHueOpBase(context) {}
@@ -245,12 +245,13 @@ class AdjustHueOp<CPUDevice> : public AdjustHueOpBase {
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("AdjustHue").Device(DEVICE_CPU),
-                        AdjustHueOp<CPUDevice>);
+REGISTER_KERNEL_BUILDER(
+    Name("AdjustHue").Device(DEVICE_CPU).TypeConstraint<float>("T"),
+    AdjustHueOp<CPUDevice, float>);
 
 #if GOOGLE_CUDA
-template <>
-class AdjustHueOp<GPUDevice> : public AdjustHueOpBase {
+template <typename T>
+class AdjustHueOp<GPUDevice, T> : public AdjustHueOpBase {
  public:
   explicit AdjustHueOp(OpKernelConstruction* context)
       : AdjustHueOpBase(context) {}
@@ -265,17 +266,24 @@ class AdjustHueOp<GPUDevice> : public AdjustHueOpBase {
     const auto stream = device.stream();
     OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
     if (number_of_elements > 0) {
-      const float* input_data = input->flat<float>().data();
+      const T* input_data = input->flat<T>().data();
       const float* delta_h = delta->flat<float>().data();
-      float* const output_data = output->flat<float>().data();
-      functor::AdjustHueGPU()(&device, number_of_elements, input_data, delta_h,
-                              output_data);
+      T* const output_data = output->flat<T>().data();
+      functor::AdjustHueGPU<T>()(&device, number_of_elements, input_data,
+                                 delta_h, output_data);
     }
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("AdjustHue").Device(DEVICE_GPU),
-                        AdjustHueOp<GPUDevice>);
+#define REGISTER_GPU(T)                                            \
+  REGISTER_KERNEL_BUILDER(                                         \
+      Name("AdjustHue").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+      AdjustHueOp<GPUDevice, T>);
+
+REGISTER_GPU(float)
+REGISTER_GPU(Eigen::half)
+
+#undef REGISTER_GPU
 
 #endif
 
diff --git a/tensorflow/core/kernels/adjust_hue_op.h b/tensorflow/core/kernels/adjust_hue_op.h
index 983a4072bfa2ee5f44a1c5e1e1050ffa5aea5de7..6d6699de3fbcdd4e2b83f0c2a77a36422aa8e24b 100644
--- a/tensorflow/core/kernels/adjust_hue_op.h
+++ b/tensorflow/core/kernels/adjust_hue_op.h
@@ -27,10 +27,11 @@ typedef Eigen::GpuDevice GPUDevice;
 
 namespace functor {
 
+template <typename T>
 struct AdjustHueGPU {
   void operator()(GPUDevice* device, const int64 number_of_elements,
-                  const float* const input, const float* const delta,
-                  float* const output);
+                  const T* const input, const float* const delta,
+                  T* const output);
 };
 
 }  // namespace functor
diff --git a/tensorflow/core/kernels/adjust_hue_op_gpu.cu.cc b/tensorflow/core/kernels/adjust_hue_op_gpu.cu.cc
index a4fe5f755cafb6f30a28e87ea7febf0535c68a70..c30085269c07e2bdeae70a8729261596faeb6344 100644
--- a/tensorflow/core/kernels/adjust_hue_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/adjust_hue_op_gpu.cu.cc
@@ -24,19 +24,25 @@ namespace tensorflow {
 
 namespace functor {
 
-void AdjustHueGPU::operator()(GPUDevice* device, const int64 number_of_elements,
-                              const float* const input,
-                              const float* const delta, float* const output) {
+template <typename T>
+void AdjustHueGPU<T>::operator()(GPUDevice* device,
+                                 const int64 number_of_elements,
+                                 const T* const input, const float* const delta,
+                                 T* const output) {
   const auto stream = device->stream();
   const CudaLaunchConfig config =
       GetCudaLaunchConfig(number_of_elements, *device);
   const int threads_per_block = config.thread_per_block;
   const int block_count =
       (number_of_elements + threads_per_block - 1) / threads_per_block;
-  internal::adjust_hsv_nhwc<true, false, false>
+  internal::adjust_hsv_nhwc<true, false, false, T>
       <<<block_count, threads_per_block, 0, stream>>>(
           number_of_elements, input, output, delta, nullptr, nullptr);
 }
+
+template struct AdjustHueGPU<float>;
+template struct AdjustHueGPU<Eigen::half>;
+
 }  // namespace functor
 }  // namespace tensorflow
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/adjust_saturation_op.cc b/tensorflow/core/kernels/adjust_saturation_op.cc
index f0c6ae499d4c209ef1556890e87f63085de7ea75..98264c4a1de75f7308c0b55e3d77a2dff88ebb49 100644
--- a/tensorflow/core/kernels/adjust_saturation_op.cc
+++ b/tensorflow/core/kernels/adjust_saturation_op.cc
@@ -81,7 +81,7 @@ class AdjustSaturationOpBase : public OpKernel {
   }
 };
 
-template <class Device>
+template <class Device, typename T>
 class AdjustSaturationOp;
 
 namespace internal {
@@ -173,7 +173,7 @@ static void hsv_to_rgb(float h, float s, float v, float* r, float* g,
 }  // namespace internal
 
 template <>
-class AdjustSaturationOp<CPUDevice> : public AdjustSaturationOpBase {
+class AdjustSaturationOp<CPUDevice, float> : public AdjustSaturationOpBase {
  public:
   explicit AdjustSaturationOp(OpKernelConstruction* context)
       : AdjustSaturationOpBase(context) {}
@@ -193,8 +193,8 @@ class AdjustSaturationOp<CPUDevice> : public AdjustSaturationOpBase {
         *context->device()->tensorflow_cpu_worker_threads();
     Shard(worker_threads.num_threads, worker_threads.workers, channel_count,
           kCostPerChannel,
-          [channel_count, &input_data, &output_data, scale_h](
-              int64 start_channel, int64 end_channel) {
+          [&input_data, &output_data, scale_h](int64 start_channel,
+                                               int64 end_channel) {
             const float* p = input_data.data() + start_channel * kChannelSize;
             float* q = output_data.data() + start_channel * kChannelSize;
             for (int i = start_channel; i < end_channel; i++) {
@@ -211,12 +211,13 @@ class AdjustSaturationOp<CPUDevice> : public AdjustSaturationOpBase {
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("AdjustSaturation").Device(DEVICE_CPU),
-                        AdjustSaturationOp<CPUDevice>);
+REGISTER_KERNEL_BUILDER(
+    Name("AdjustSaturation").Device(DEVICE_CPU).TypeConstraint<float>("T"),
+    AdjustSaturationOp<CPUDevice, float>);
 
 #if GOOGLE_CUDA
-template <>
-class AdjustSaturationOp<GPUDevice> : public AdjustSaturationOpBase {
+template <typename T>
+class AdjustSaturationOp<GPUDevice, T> : public AdjustSaturationOpBase {
  public:
   explicit AdjustSaturationOp(OpKernelConstruction* context)
       : AdjustSaturationOpBase(context) {}
@@ -231,17 +232,24 @@ class AdjustSaturationOp<GPUDevice> : public AdjustSaturationOpBase {
     const auto stream = device.stream();
     OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
     if (number_of_elements > 0) {
-      const float* input_data = input->flat<float>().data();
+      const T* input_data = input->flat<T>().data();
       const float* scale_data = scale->flat<float>().data();
-      float* const output_data = output->flat<float>().data();
-      functor::AdjustSaturationGPU()(&device, number_of_elements, input_data,
-                                     scale_data, output_data);
+      T* const output_data = output->flat<T>().data();
+      functor::AdjustSaturationGPU<T>()(&device, number_of_elements, input_data,
+                                        scale_data, output_data);
     }
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("AdjustSaturation").Device(DEVICE_GPU),
-                        AdjustSaturationOp<GPUDevice>);
+#define REGISTER_GPU(T)                                                   \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("AdjustSaturation").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+      AdjustSaturationOp<GPUDevice, T>);
+
+REGISTER_GPU(float)
+REGISTER_GPU(Eigen::half)
+
+#undef REGISTER_GPU
 
 #endif
 
diff --git a/tensorflow/core/kernels/adjust_saturation_op.h b/tensorflow/core/kernels/adjust_saturation_op.h
index fd28ba536f2f4e13079a0b7ed9f4097bb10e629e..c21ce4e3608827df08c76d608fb88a5b5b99a3da 100644
--- a/tensorflow/core/kernels/adjust_saturation_op.h
+++ b/tensorflow/core/kernels/adjust_saturation_op.h
@@ -27,10 +27,11 @@ typedef Eigen::GpuDevice GPUDevice;
 
 namespace functor {
 
+template <typename T>
 struct AdjustSaturationGPU {
   void operator()(GPUDevice* device, const int64 number_of_elements,
-                  const float* const input, const float* const scale,
-                  float* const output);
+                  const T* const input, const float* const scale,
+                  T* const output);
 };
 
 }  // namespace functor
diff --git a/tensorflow/core/kernels/adjust_saturation_op_gpu.cu.cc b/tensorflow/core/kernels/adjust_saturation_op_gpu.cu.cc
index 37cfb26a47b01ca15cdb6287243a16490bb34bfb..6c70490d469fa8dbdc425f9e57b42acda14f5a58 100644
--- a/tensorflow/core/kernels/adjust_saturation_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/adjust_saturation_op_gpu.cu.cc
@@ -24,21 +24,26 @@ namespace tensorflow {
 
 namespace functor {
 
-void AdjustSaturationGPU::operator()(GPUDevice* device,
-                                     const int64 number_of_elements,
-                                     const float* const input,
-                                     const float* const scale,
-                                     float* const output) {
+template <typename T>
+void AdjustSaturationGPU<T>::operator()(GPUDevice* device,
+                                        const int64 number_of_elements,
+                                        const T* const input,
+                                        const float* const scale,
+                                        T* const output) {
   const auto stream = device->stream();
   const CudaLaunchConfig config =
       GetCudaLaunchConfig(number_of_elements, *device);
   const int threads_per_block = config.thread_per_block;
   const int block_count =
       (number_of_elements + threads_per_block - 1) / threads_per_block;
-  internal::adjust_hsv_nhwc<false, true, false>
+  internal::adjust_hsv_nhwc<false, true, false, T>
       <<<block_count, threads_per_block, 0, stream>>>(
           number_of_elements, input, output, nullptr, scale, nullptr);
 }
+
+template struct AdjustSaturationGPU<float>;
+template struct AdjustSaturationGPU<Eigen::half>;
+
 }  // namespace functor
 }  // namespace tensorflow
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/aggregate_ops.cc b/tensorflow/core/kernels/aggregate_ops.cc
index 150e8fe6379fd2a41778e94df793ba45ef0d309e..edf6d3e61e0dc4297ad330fbe43086fce0607088 100644
--- a/tensorflow/core/kernels/aggregate_ops.cc
+++ b/tensorflow/core/kernels/aggregate_ops.cc
@@ -179,20 +179,7 @@ class AddNOp<Device, Variant> : public OpKernel {
               i, " has shape: ", ctx->input(i).shape().DebugString(), "."));
     }
 
-    TensorShape common_shape;
-    OP_REQUIRES_OK(ctx, GetUnaryVariantShape(ctx->input(0), &common_shape));
-    // Step 2: access all variants and ensure shapes match.
-    for (int i = 1; i < num; ++i) {
-      TensorShape check_shape;
-      OP_REQUIRES_OK(ctx, GetUnaryVariantShape(ctx->input(i), &check_shape));
-      OP_REQUIRES(ctx, common_shape == check_shape,
-                  errors::InvalidArgument(
-                      "AddN of Variants of differing shapes; inputs[0] shape: ",
-                      common_shape.DebugString(), ", inputs[", i,
-                      "] shape: ", check_shape.DebugString()));
-    }
-
-    // Step 3: attempt to add using
+    // Step 2: attempt to add using
     //   BinaryOpVariants(ADD_VARIANT_BINARY_OP, ...)
     //   For the output create a default-constructed variant object.
     // TODO(ebrevdo): Perform summation in a tree-structure.
diff --git a/tensorflow/core/kernels/argmax_op.cc b/tensorflow/core/kernels/argmax_op.cc
index c731b64993b3a6cebfb46eca9221ca28b729e845..778f818a61a54ec1aa78b93a8f5b8e61755a341f 100644
--- a/tensorflow/core/kernels/argmax_op.cc
+++ b/tensorflow/core/kernels/argmax_op.cc
@@ -25,13 +25,13 @@ limitations under the License.
 
 #include <memory>
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 
diff --git a/tensorflow/core/kernels/attention_ops.cc b/tensorflow/core/kernels/attention_ops.cc
index ce2fce92e4ee8cbd7bdc578d92103a5bd5da0629..f555c0fd67968cbbe98ae1e27908374d41aab1ab 100644
--- a/tensorflow/core/kernels/attention_ops.cc
+++ b/tensorflow/core/kernels/attention_ops.cc
@@ -34,7 +34,31 @@ class ExtractGlimpseOp : public OpKernel {
   explicit ExtractGlimpseOp(OpKernelConstruction* context) : OpKernel(context) {
     OP_REQUIRES_OK(context, context->GetAttr("normalized", &normalized_));
     OP_REQUIRES_OK(context, context->GetAttr("centered", &centered_));
-    OP_REQUIRES_OK(context, context->GetAttr("uniform_noise", &uniform_noise_));
+    bool uniform_noise = false;
+    string noise;
+    OP_REQUIRES_OK(context, context->GetAttr("uniform_noise", &uniform_noise));
+    OP_REQUIRES_OK(context, context->GetAttr("noise", &noise));
+    OP_REQUIRES(context,
+                !(uniform_noise && (!noise.empty() && noise != "uniform")),
+                errors::InvalidArgument("The uniform_noise and noise could not "
+                                        "be specified at the same time"));
+    if (noise.empty()) {
+      noise_ = uniform_noise ? Eigen::ExtractGlimpsesNoiseMode::UNIFORM
+                             : Eigen::ExtractGlimpsesNoiseMode::GAUSSIAN;
+    } else {
+      OP_REQUIRES(context,
+                  noise == "uniform" || noise == "gaussian" || noise == "zero",
+                  errors::InvalidArgument(
+                      "The noise could only be uniform, gaussian, or zero, got",
+                      noise));
+      if (noise == "uniform") {
+        noise_ = Eigen::ExtractGlimpsesNoiseMode::UNIFORM;
+      } else if (noise == "gaussian") {
+        noise_ = Eigen::ExtractGlimpsesNoiseMode::GAUSSIAN;
+      } else {
+        noise_ = Eigen::ExtractGlimpsesNoiseMode::ZERO;
+      }
+    }
   }
 
   // Expect input tensor of rank 4 with dimensions (batch_size, height, width,
@@ -98,13 +122,13 @@ class ExtractGlimpseOp : public OpKernel {
         context->eigen_cpu_device()) =
         Eigen::ExtractGlimpses(input.tensor<float, 4>().swap_layout(),
                                output_width, output_height, offset_vec,
-                               normalized_, centered_, uniform_noise_);
+                               normalized_, centered_, noise_);
   }
 
  private:
   bool normalized_;
   bool centered_;
-  bool uniform_noise_;
+  Eigen::ExtractGlimpsesNoiseMode noise_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("ExtractGlimpse").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/barrier_ops.cc b/tensorflow/core/kernels/barrier_ops.cc
index aa9123582210bdf31993e9d8c58ba90cc02acc5e..89d742c2dafcfd593f0166816d28ec65cb9ac9f9 100644
--- a/tensorflow/core/kernels/barrier_ops.cc
+++ b/tensorflow/core/kernels/barrier_ops.cc
@@ -247,7 +247,6 @@ class Barrier : public ResourceBase {
           keys = t[1];
           values.insert(values.begin(), t.begin() + 2, t.end());
           callback(indices, keys, values);
-          return;
         });
   }
 
@@ -300,7 +299,7 @@ class Barrier : public ResourceBase {
     ready_queue_->Unref();
   }
 
-  string DebugString() override { return "A barrier"; }
+  string DebugString() const override { return "A barrier"; }
 
  protected:
   template <typename T>
@@ -509,7 +508,7 @@ class BarrierOpKernel : public AsyncOpKernel {
     Barrier* barrier = nullptr;
     OP_REQUIRES_OK_ASYNC(ctx, GetResourceFromContext(ctx, "handle", &barrier),
                          callback);
-    ComputeAsync(ctx, barrier, [this, callback, barrier]() {
+    ComputeAsync(ctx, barrier, [callback, barrier]() {
       barrier->Unref();
       callback();
     });
@@ -618,7 +617,6 @@ class TakeManyOp : public BarrierOpKernel {
             values_output.set(i, values[i]);
           }
           callback();
-          return;
         });
   }
 
diff --git a/tensorflow/core/kernels/batch_kernels.cc b/tensorflow/core/kernels/batch_kernels.cc
index 35ddda0ec04da6f3b6f11606ecb019e38698c6d7..338f61ff6642cbc604bb77dfe1908fe28b9fc142 100644
--- a/tensorflow/core/kernels/batch_kernels.cc
+++ b/tensorflow/core/kernels/batch_kernels.cc
@@ -233,7 +233,7 @@ class BatchResource : public ResourceBase {
     return Status::OK();
   }
 
-  string DebugString() final { return "BatchResource"; }
+  string DebugString() const final { return "BatchResource"; }
 
   // Ingests data from one invocation of the batch op. The data is enqueued to
   // be combined with others into a batch, asynchronously.
@@ -720,8 +720,7 @@ class BatchFunctionKernel : public AsyncOpKernel {
 
   void ComputeAsync(OpKernelContext* c, DoneCallback done) final {
     BatchResource* br;
-    std::function<Status(BatchResource * *r)> creator = [this,
-                                                         c](BatchResource** r) {
+    std::function<Status(BatchResource**)> creator = [this](BatchResource** r) {
       std::unique_ptr<BatchResource> new_resource;
       TF_RETURN_IF_ERROR(
           BatchResource::Create(num_batch_threads_, max_batch_size_,
@@ -801,16 +800,15 @@ class BatchKernel : public AsyncOpKernel {
 
   void ComputeAsync(OpKernelContext* c, DoneCallback done) final {
     BatchResource* br;
-    std::function<Status(BatchResource * *r)> creator =
-        [this](BatchResource** r) {
-          std::unique_ptr<BatchResource> new_resource;
-          TF_RETURN_IF_ERROR(BatchResource::Create(
-              num_batch_threads_, max_batch_size_, batch_timeout_micros_,
-              max_enqueued_batches_, allowed_batch_sizes_, kInvalidHandle,
-              &new_resource));
-          *r = new_resource.release();
-          return Status::OK();
-        };
+    std::function<Status(BatchResource**)> creator = [this](BatchResource** r) {
+      std::unique_ptr<BatchResource> new_resource;
+      TF_RETURN_IF_ERROR(BatchResource::Create(
+          num_batch_threads_, max_batch_size_, batch_timeout_micros_,
+          max_enqueued_batches_, allowed_batch_sizes_, kInvalidHandle,
+          &new_resource));
+      *r = new_resource.release();
+      return Status::OK();
+    };
     OP_REQUIRES_OK_ASYNC(c,
                          c->resource_manager()->LookupOrCreate(
                              container_, shared_name_, &br, creator),
@@ -878,7 +876,7 @@ class UnbatchResource : public ResourceBase {
     timeout_enforcer_ = nullptr;
   }
 
-  string DebugString() final { return "UnbatchResource"; }
+  string DebugString() const final { return "UnbatchResource"; }
 
   Status Compute(OpKernelContext* context, AsyncOpKernel::DoneCallback done) {
     const Tensor& data_t = context->input(0);
@@ -1066,7 +1064,7 @@ class UnbatchKernel : public AsyncOpKernel {
 
   void ComputeAsync(OpKernelContext* c, DoneCallback done) final {
     UnbatchResource* ubr;
-    std::function<Status(UnbatchResource * *r)> creator =
+    std::function<Status(UnbatchResource**)> creator =
         [this](UnbatchResource** r) {
           *r = new UnbatchResource(timeout_micros_);
           return Status::OK();
@@ -1094,7 +1092,7 @@ class UnbatchGradResource : public ResourceBase {
  public:
   UnbatchGradResource() {}
 
-  string DebugString() final { return "UnbatchGradResource"; }
+  string DebugString() const final { return "UnbatchGradResource"; }
 
   // Flushes the information for one batch, given its context and done
   // callback. Clears all information about it from the available_tensors_.
@@ -1252,8 +1250,8 @@ class UnbatchGradKernel : public AsyncOpKernel {
 
   void ComputeAsync(OpKernelContext* c, DoneCallback done) final {
     UnbatchGradResource* ubr;
-    std::function<Status(UnbatchGradResource * *r)> creator =
-        [this](UnbatchGradResource** r) {
+    std::function<Status(UnbatchGradResource**)> creator =
+        [](UnbatchGradResource** r) {
           *r = new UnbatchGradResource();
           return Status::OK();
         };
diff --git a/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h b/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
index 656b6ced6de00933cfe8db7dadd1a56ade212758..bef73b0574fc684f6970e705a3b95ed54e41a369 100644
--- a/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
+++ b/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
@@ -125,6 +125,10 @@ class AdaptiveSharedBatchScheduler
     int max_batch_size = 1000;
     // Maximum number of enqueued (i.e. non-scheduled) batches.
     int max_enqueued_batches = 10;
+    // Amount of time non-full batches must wait before becoming schedulable.
+    // A non-zero value can improve performance by limiting the scheduling of
+    // nearly empty batches.
+    int64 batch_timeout_micros = 0;
   };
 
   using BatchProcessor = std::function<void(std::unique_ptr<Batch<TaskType>>)>;
@@ -267,8 +271,11 @@ class ASBSQueue : public BatchScheduler<TaskType> {
 template <typename TaskType>
 class ASBSBatch : public Batch<TaskType> {
  public:
-  ASBSBatch(ASBSQueue<TaskType>* queue, int64 creation_time_micros)
-      : queue_(queue), creation_time_micros_(creation_time_micros) {}
+  ASBSBatch(ASBSQueue<TaskType>* queue, int64 creation_time_micros,
+            int64 batch_timeout_micros)
+      : queue_(queue),
+        creation_time_micros_(creation_time_micros),
+        schedulable_time_micros_(creation_time_micros + batch_timeout_micros) {}
 
   ~ASBSBatch() override {}
 
@@ -276,9 +283,12 @@ class ASBSBatch : public Batch<TaskType> {
 
   int64 creation_time_micros() const { return creation_time_micros_; }
 
+  int64 schedulable_time_micros() const { return schedulable_time_micros_; }
+
  private:
   ASBSQueue<TaskType>* queue_;
   const int64 creation_time_micros_;
+  const int64 schedulable_time_micros_;
   TF_DISALLOW_COPY_AND_ASSIGN(ASBSBatch);
 };
 }  // namespace internal
@@ -377,7 +387,12 @@ void AdaptiveSharedBatchScheduler<TaskType>::AddBatch(
     bool also_schedule_closed_batch) {
   mutex_lock l(mu_);
   batches_.push_back(batch);
-  MaybeScheduleNextBatch();
+  // Maybe schedule this batch once it becomes schedulable.
+  GetEnv()->SchedClosureAfter(
+      batch->schedulable_time_micros() - batch->creation_time_micros(), [this] {
+        mutex_lock l(mu_);
+        MaybeScheduleNextBatch();
+      });
   if (also_schedule_closed_batch) {
     MaybeScheduleClosedBatch();
   }
@@ -400,21 +415,22 @@ void AdaptiveSharedBatchScheduler<TaskType>::MaybeScheduleNextBatch() {
           in_flight_batches_limit_ - in_flight_batches_) {
     return;
   }
-  auto best_it = batches_.begin();
-  double best_score =
-      (*best_it)->creation_time_micros() -
-      options_.full_batch_scheduling_boost_micros * (*best_it)->size() /
-          static_cast<double>((*best_it)->queue()->max_task_size());
-  for (auto it = batches_.begin() + 1; it != batches_.end(); it++) {
+  auto best_it = batches_.end();
+  double best_score;
+  int64 now_micros = GetEnv()->NowMicros();
+  for (auto it = batches_.begin(); it != batches_.end(); it++) {
+    if ((*it)->schedulable_time_micros() > now_micros) continue;
     const double score =
         (*it)->creation_time_micros() -
         options_.full_batch_scheduling_boost_micros * (*it)->size() /
             static_cast<double>((*it)->queue()->max_task_size());
-    if (score < best_score) {
+    if (best_it == batches_.end() || score < best_score) {
       best_score = score;
       best_it = it;
     }
   }
+  // No schedulable batches.
+  if (best_it == batches_.end()) return;
   const internal::ASBSBatch<TaskType>* batch = *best_it;
   batches_.erase(best_it);
   // Queue may destroy itself after ReleaseBatch is called.
@@ -552,7 +568,8 @@ Status ASBSQueue<TaskType>::Schedule(std::unique_ptr<TaskType>* task) {
     if (!current_batch_) {
       num_enqueued_batches_++;
       current_batch_ = new_batch =
-          new ASBSBatch<TaskType>(this, scheduler_->GetEnv()->NowMicros());
+          new ASBSBatch<TaskType>(this, scheduler_->GetEnv()->NowMicros(),
+                                  options_.batch_timeout_micros);
     }
     current_batch_->AddTask(std::move(*task));
     num_enqueued_tasks_++;
diff --git a/tensorflow/core/kernels/bias_op.cc b/tensorflow/core/kernels/bias_op.cc
index d4f4b43d63b90c22abbbe82263b09353912010c8..074f64a634aa83509df6e633f1fb0153ef9d6393 100644
--- a/tensorflow/core/kernels/bias_op.cc
+++ b/tensorflow/core/kernels/bias_op.cc
@@ -19,11 +19,11 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/bias_op.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/util/tensor_format.h"
 
 #if GOOGLE_CUDA
@@ -153,13 +153,13 @@ class BiasOp : public BinaryOp<T> {
               bias.tensor<T, 1>().reshape(four_dims).broadcast(broad_cast_dims);
         } break;
         case 5: {
-          Eigen::DSizes<int32, 5> four_dims(1, channel, 1, 1, 1);
+          Eigen::DSizes<int32, 5> five_dims(1, channel, 1, 1, 1);
           Eigen::DSizes<int32, 5> broad_cast_dims(batch, 1, height, width,
                                                   depth);
           const Device& d = context->eigen_device<Device>();
           output->tensor<T, 5>().device(d) =
               input.tensor<T, 5>() +
-              bias.tensor<T, 1>().reshape(four_dims).broadcast(broad_cast_dims);
+              bias.tensor<T, 1>().reshape(five_dims).broadcast(broad_cast_dims);
         } break;
         default:
           OP_REQUIRES(context, false,
@@ -269,28 +269,24 @@ class BiasGradOp : public OpKernel {
       output->template flat<T>().setZero();
     } else {
       // Added by intel_tf to support NCHW on CPU regardless of MKL used or not.
-      // TODO(yongtang): Add 3/4/5 dimensional data support for NCHW format.
       if (data_format_ == FORMAT_NCHW) {
-        OP_REQUIRES(context, output_backprop.dims() == 4,
-                    errors::InvalidArgument(
-                        "NCHW format supports only 4D input/output tensor."));
-        Eigen::DSizes<Eigen::Index, 4> four_dims(batch, channel, height, width);
+        Eigen::DSizes<Eigen::Index, 3> three_dims(batch, channel,
+                                                  height * width * depth);
 #ifdef EIGEN_HAS_INDEX_LIST
         using idx0 = Eigen::type2index<0>;
         using idx2 = Eigen::type2index<2>;
-        using idx3 = Eigen::type2index<3>;
-        Eigen::IndexList<idx0, idx2, idx3> reduction_axes;
+        Eigen::IndexList<idx0, idx2> reduction_axes;
 #else
-        Eigen::array<Eigen::Index, 3> reduction_axes = {0, 2, 3};
+        Eigen::array<Eigen::Index, 2> reduction_axes = {0, 2};
 #endif
         output->template flat<T>().device(context->eigen_device<Device>()) =
             output_backprop.flat<T>()
                 .template cast<typename AccumulatorType<T>::type>()
-                .reshape(four_dims)
+                .reshape(three_dims)
                 .sum(reduction_axes)
                 .template cast<T>();  // End of code by intel_tf.
       } else {
-        Eigen::DSizes<Eigen::Index, 2> two_dims(batch * height * width,
+        Eigen::DSizes<Eigen::Index, 2> two_dims(batch * height * width * depth,
                                                 channel);
 #ifdef EIGEN_HAS_INDEX_LIST
         Eigen::IndexList<Eigen::type2index<0> > reduction_axis;
@@ -496,21 +492,21 @@ class BiasGradOp<GPUDevice, T> : public OpKernel {
 
   void ComputeWithCustomKernel(OpKernelContext* context,
                                const Tensor& output_backprop, int32 batch,
-                               int32 width, int32 height, int32 channel,
-                               Tensor* output) {
+                               int32 width, int32 height, int32 depth,
+                               int32 channel, Tensor* output) {
     BiasGradGPU<T>::compute(context->template eigen_device<Device>(),
                             output_backprop.template flat<T>().data(),
                             output->flat<T>().data(), batch, width, height,
-                            channel, data_format_);
+                            depth, channel, data_format_);
   }
 
   void ComputeWithReduceSum(OpKernelContext* context,
                             const Tensor& output_backprop, int32 batch,
-                            int32 width, int32 height, int32 channel,
-                            Tensor* output) {
+                            int32 width, int32 height, int32 depth,
+                            int32 channel, Tensor* output) {
     if (data_format_ == FORMAT_NCHW) {
       int32 row_count = batch * channel;
-      int32 col_count = height * width;
+      int32 col_count = height * width * depth;
       Tensor temp_grad_outputs;
       // For 'NCHW' format, we perform reduction twice: first HW, then N.
       TensorShape temp_grad_output_shape{row_count, col_count};
@@ -528,7 +524,7 @@ class BiasGradOp<GPUDevice, T> : public OpKernel {
                                      row_count, col_count);
     } else {
       // For 'NHWC', we simply apply reduction once on NHW.
-      int32 row_count = batch * height * width;
+      int32 row_count = batch * height * width * depth;
       int32 col_count = channel;
       BiasGradGPU<T>::DoColReduction(
           context, const_cast<T*>(output->flat<T>().data()),
@@ -561,7 +557,7 @@ class BiasGradOp<GPUDevice, T> : public OpKernel {
     int device_id = stream->parent()->device_ordinal();
     DataType dtype = output_backprop.dtype();
     BiasAddParams bias_parameters = {
-        {batch, height * width, channel},
+        {batch, height * width * depth, channel},
         data_format_,
         dtype,
         device_id,
@@ -576,7 +572,7 @@ class BiasGradOp<GPUDevice, T> : public OpKernel {
       stream->InitTimer(&timer);
       stream->ThenStartTimer(&timer);
       ComputeWithCustomKernel(context, output_backprop, batch, width, height,
-                              channel, output);
+                              depth, channel, output);
       stream->ThenStopTimer(&timer);
       uint64 elapsed_microseconds = timer.Microseconds();
       VLOG(1) << "BiasAddGrad " << bias_parameters.ToString()
@@ -589,7 +585,7 @@ class BiasGradOp<GPUDevice, T> : public OpKernel {
       // Try reduction and profile.
       stream->ThenStartTimer(&timer);
       ComputeWithReduceSum(context, output_backprop, batch, width, height,
-                           channel, output);
+                           depth, channel, output);
       stream->ThenStopTimer(&timer);
 
       elapsed_microseconds = timer.Microseconds();
@@ -610,11 +606,11 @@ class BiasGradOp<GPUDevice, T> : public OpKernel {
     // Choose the best algorithm based on autotune results.
     if (algo_config.get_mode() == BiasAddGradGPUMode::kReduction) {
       ComputeWithReduceSum(context, output_backprop, batch, width, height,
-                           channel, output);
+                           depth, channel, output);
     } else {
       // Default to the customized kernel.
       ComputeWithCustomKernel(context, output_backprop, batch, width, height,
-                              channel, output);
+                              depth, channel, output);
     }
   }
 
diff --git a/tensorflow/core/kernels/bias_op_gpu.cu.cc b/tensorflow/core/kernels/bias_op_gpu.cu.cc
index 24fea8a8e6f10cea4f74e743c8aa2c6bfb49313f..006fa1dc712f7c06953f70e278fedaa3504bfcce 100644
--- a/tensorflow/core/kernels/bias_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/bias_op_gpu.cu.cc
@@ -195,10 +195,10 @@ __global__ void BiasGradNCHW_SharedAtomics(const T* output_backprop,
 template <typename T>
 void BiasGradGPU<T>::compute(const GPUDevice& d, const T* output_backprop,
                              T* bias_backprop, int32 batch, int32 height,
-                             int32 width, int32 channel,
+                             int32 width, int32 depth, int32 channel,
                              TensorFormat data_format) {
   const int32 bias_size = channel;
-  const int32 image_size = height * width;
+  const int32 image_size = height * width * depth;
   const int32 total_count = batch * bias_size * image_size;
   if (total_count == 0) {
     return;
diff --git a/tensorflow/core/kernels/bias_op_gpu.h b/tensorflow/core/kernels/bias_op_gpu.h
index a0b2ce4f9b34b0b343de3d09374b07d554c57d15..372a403e6872dcfb0c41b0dafe5be045c3388054 100644
--- a/tensorflow/core/kernels/bias_op_gpu.h
+++ b/tensorflow/core/kernels/bias_op_gpu.h
@@ -39,7 +39,7 @@ template <typename T>
 struct BiasGradGPU {
   static void compute(const GPUDevice& device, const T* output_backprop,
                       T* bias_backprop, int32 batch, int32 height, int32 width,
-                      int32 channel, TensorFormat data_format);
+                      int32 depth, int32 channel, TensorFormat data_format);
 
   static void DoRowReduction(OpKernelContext* context, T* output,
                              const T* input, int rows, int cols);
diff --git a/tensorflow/core/kernels/bitcast_op.cc b/tensorflow/core/kernels/bitcast_op.cc
index f602cfa428a555970f35b4057c46641a3ba156dd..02c8808809e10b777d37c08be0ff907eb923c3c7 100644
--- a/tensorflow/core/kernels/bitcast_op.cc
+++ b/tensorflow/core/kernels/bitcast_op.cc
@@ -45,8 +45,7 @@ class BitcastOp : public OpKernel {
                 in_size_ >= out_size_ ||
                     (input_tensor.dims() > 0 &&
                      input_tensor.dim_size(input_tensor.dims() - 1) ==
-                         out_size_ / in_size_) ||
-                    input_tensor.dim_size(input_tensor.dims()) == -1,
+                         out_size_ / in_size_),
                 errors::InvalidArgument(
                     "Cannot bitcast from ", DataTypeString(input_data_type_),
                     " to ", DataTypeString(output_data_type_), ": shape ",
@@ -59,8 +58,9 @@ class BitcastOp : public OpKernel {
     }
     Tensor output_tensor;
 
-    output_tensor.UnsafeCopyFromInternal(input_tensor, output_data_type_,
-                                         adjusted_shape);
+    OP_REQUIRES_OK(context,
+                   output_tensor.BitcastFrom(input_tensor, output_data_type_,
+                                             adjusted_shape));
     context->set_output(0, output_tensor);
   }
 
diff --git a/tensorflow/core/kernels/boosted_trees/BUILD b/tensorflow/core/kernels/boosted_trees/BUILD
index 8f2c2dbe8a778353dff5e0b8823ac99de68282df..285cded181cb2014e50f96c957290d642fcb6810 100644
--- a/tensorflow/core/kernels/boosted_trees/BUILD
+++ b/tensorflow/core/kernels/boosted_trees/BUILD
@@ -31,7 +31,6 @@ tf_kernel_library(
     deps = [
         ":resource_ops",
         ":resources",
-        "//tensorflow/core:boosted_trees_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
@@ -60,7 +59,6 @@ tf_kernel_library(
     srcs = ["resource_ops.cc"],
     deps = [
         ":resources",
-        "//tensorflow/core:boosted_trees_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_cc",
@@ -72,7 +70,6 @@ tf_kernel_library(
     srcs = ["stats_ops.cc"],
     deps = [
         ":tree_helper",
-        "//tensorflow/core:boosted_trees_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
     ],
@@ -84,7 +81,6 @@ tf_kernel_library(
     deps = [
         ":resources",
         ":tree_helper",
-        "//tensorflow/core:boosted_trees_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_cc",
@@ -95,7 +91,6 @@ tf_kernel_library(
     name = "quantile_ops",
     srcs = ["quantile_ops.cc"],
     deps = [
-        "//tensorflow/core:boosted_trees_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core/kernels/boosted_trees/quantiles:weighted_quantiles",
diff --git a/tensorflow/core/kernels/boosted_trees/boosted_trees.proto b/tensorflow/core/kernels/boosted_trees/boosted_trees.proto
index 4e9bab3e21f9f240d32e78a1a489033a693caa73..3aa3bb84b9b973878127fd7db1f2d652f591a34d 100644
--- a/tensorflow/core/kernels/boosted_trees/boosted_trees.proto
+++ b/tensorflow/core/kernels/boosted_trees/boosted_trees.proto
@@ -52,6 +52,16 @@ message BucketizedSplit {
   // the rule feature <= threshold.
   int32 feature_id = 1;
   int32 threshold = 2;
+  // If feature column is multivalent, this holds the index of the dimension
+  // for the split. Defaults to 0.
+  int32 dimension_id = 5;
+  enum DefaultDirection {
+    // Left is the default direction.
+    DEFAULT_LEFT = 0;
+    DEFAULT_RIGHT = 1;
+  }
+  // default direction for missing values.
+  DefaultDirection default_direction = 6;
 
   // Node children indexing into a contiguous
   // vector of nodes starting from the root.
diff --git a/tensorflow/core/kernels/boosted_trees/prediction_ops.cc b/tensorflow/core/kernels/boosted_trees/prediction_ops.cc
index 4ae26fb95b1bb47db6a9462670df08f1bb4e171e..04d3359a90056ad3a4f0cd168f30601c75b59b19 100644
--- a/tensorflow/core/kernels/boosted_trees/prediction_ops.cc
+++ b/tensorflow/core/kernels/boosted_trees/prediction_ops.cc
@@ -113,8 +113,7 @@ class BoostedTreesTrainingPredictOp : public OpKernel {
       output_tree_ids.setConstant(latest_tree);
       auto do_work = [&resource, &batch_bucketized_features, &cached_tree_ids,
                       &cached_node_ids, &output_partial_logits,
-                      &output_node_ids, batch_size,
-                      latest_tree](int32 start, int32 end) {
+                      &output_node_ids, latest_tree](int32 start, int32 end) {
         for (int32 i = start; i < end; ++i) {
           int32 tree_id = cached_tree_ids(i);
           int32 node_id = cached_node_ids(i);
@@ -129,7 +128,9 @@ class BoostedTreesTrainingPredictOp : public OpKernel {
             // Logic in the loop adds the cached node value again if it is a
             // leaf. If it is not a leaf anymore we need to subtract the old
             // node's value. The following logic handles both of these cases.
-            partial_tree_logit -= resource->node_value(tree_id, node_id);
+            const auto& node_logits = resource->node_value(tree_id, node_id);
+            DCHECK_EQ(node_logits.size(), 1);
+            partial_tree_logit -= node_logits[0];
           } else {
             // No cache exists, start from the very first node.
             node_id = 0;
@@ -137,7 +138,9 @@ class BoostedTreesTrainingPredictOp : public OpKernel {
           float partial_all_logit = 0.0;
           while (true) {
             if (resource->is_leaf(tree_id, node_id)) {
-              partial_tree_logit += resource->node_value(tree_id, node_id);
+              const auto& leaf_logits = resource->node_value(tree_id, node_id);
+              DCHECK_EQ(leaf_logits.size(), 1);
+              partial_tree_logit += leaf_logits[0];
 
               // Tree is done
               partial_all_logit +=
@@ -187,9 +190,6 @@ class BoostedTreesPredictOp : public OpKernel {
                                              &num_bucketized_features_));
     OP_REQUIRES_OK(context,
                    context->GetAttr("logits_dimension", &logits_dimension_));
-    OP_REQUIRES(context, logits_dimension_ == 1,
-                errors::InvalidArgument(
-                    "Currently only one dimensional outputs are supported."));
   }
 
   void Compute(OpKernelContext* const context) override {
@@ -225,18 +225,20 @@ class BoostedTreesPredictOp : public OpKernel {
     }
 
     const int32 last_tree = resource->num_trees() - 1;
-
     auto do_work = [&resource, &batch_bucketized_features, &output_logits,
-                    batch_size, last_tree](int32 start, int32 end) {
+                    last_tree, this](int32 start, int32 end) {
       for (int32 i = start; i < end; ++i) {
-        float tree_logit = 0.0;
+        std::vector<float> tree_logits(logits_dimension_, 0.0);
         int32 tree_id = 0;
         int32 node_id = 0;
         while (true) {
           if (resource->is_leaf(tree_id, node_id)) {
-            tree_logit += resource->GetTreeWeight(tree_id) *
-                          resource->node_value(tree_id, node_id);
-
+            const float tree_weight = resource->GetTreeWeight(tree_id);
+            const auto& leaf_logits = resource->node_value(tree_id, node_id);
+            DCHECK_EQ(leaf_logits.size(), logits_dimension_);
+            for (int32 j = 0; j < logits_dimension_; ++j) {
+              tree_logits[j] += tree_weight * leaf_logits[j];
+            }
             // Stop if it was the last tree.
             if (tree_id == last_tree) {
               break;
@@ -249,7 +251,9 @@ class BoostedTreesPredictOp : public OpKernel {
                                           batch_bucketized_features);
           }
         }
-        output_logits(i, 0) = tree_logit;
+        for (int32 j = 0; j < logits_dimension_; ++j) {
+          output_logits(i, j) = tree_logits[j];
+        }
       }
     };
     // 10 is the magic number. The actual number might depend on (the number of
@@ -329,13 +333,14 @@ class BoostedTreesExampleDebugOutputsOp : public OpKernel {
     // path. Note: feature_ids has one less value than logits_path because the
     // first value of each logit path will be the bias.
     auto do_work = [&resource, &batch_bucketized_features, &output_debug_info,
-                    batch_size, last_tree](int32 start, int32 end) {
+                    last_tree](int32 start, int32 end) {
       for (int32 i = start; i < end; ++i) {
         // Proto to store debug outputs, per example.
         boosted_trees::DebugOutput example_debug_info;
         // Initial bias prediction. E.g., prediction based off training mean.
-        float tree_logit =
-            resource->GetTreeWeight(0) * resource->node_value(0, 0);
+        const auto& tree_logits = resource->node_value(0, 0);
+        DCHECK_EQ(tree_logits.size(), 1);
+        float tree_logit = resource->GetTreeWeight(0) * tree_logits[0];
         example_debug_info.add_logits_path(tree_logit);
         int32 node_id = 0;
         int32 tree_id = 0;
@@ -358,8 +363,9 @@ class BoostedTreesExampleDebugOutputsOp : public OpKernel {
             // Get logit after split.
             node_id = resource->next_node(tree_id, node_id, i,
                                           batch_bucketized_features);
-            tree_logit = resource->GetTreeWeight(tree_id) *
-                         resource->node_value(tree_id, node_id);
+            const auto& tree_logits = resource->node_value(tree_id, node_id);
+            DCHECK_EQ(tree_logits.size(), 1);
+            tree_logit = resource->GetTreeWeight(tree_id) * tree_logits[0];
             // Output logit incorporates sum of leaf logits from prior trees.
             example_debug_info.add_logits_path(tree_logit + past_trees_logit);
           }
diff --git a/tensorflow/core/kernels/boosted_trees/quantiles/quantile_stream_resource.h b/tensorflow/core/kernels/boosted_trees/quantiles/quantile_stream_resource.h
index 1c31724272ab11a20ac6f72edd87a86105dd643e..965bf2c924c8791578c5f069e40d2d748e5f3978 100644
--- a/tensorflow/core/kernels/boosted_trees/quantiles/quantile_stream_resource.h
+++ b/tensorflow/core/kernels/boosted_trees/quantiles/quantile_stream_resource.h
@@ -37,15 +37,15 @@ class BoostedTreesQuantileStreamResource : public ResourceBase {
         epsilon_(epsilon),
         num_streams_(num_streams),
         max_elements_(max_elements) {
-          streams_.reserve(num_streams_);
-          boundaries_.reserve(num_streams_);
-          for (int64 idx = 0; idx < num_streams; ++idx) {
-            streams_.push_back(QuantileStream(epsilon, max_elements));
-            boundaries_.push_back(std::vector<float>());
-          }
-        }
-
-  string DebugString() override { return "QuantileStreamResource"; }
+    streams_.reserve(num_streams_);
+    boundaries_.reserve(num_streams_);
+    for (int64 idx = 0; idx < num_streams; ++idx) {
+      streams_.push_back(QuantileStream(epsilon, max_elements));
+      boundaries_.push_back(std::vector<float>());
+    }
+  }
+
+  string DebugString() const override { return "QuantileStreamResource"; }
 
   tensorflow::mutex* mutex() { return &mu_; }
 
diff --git a/tensorflow/core/kernels/boosted_trees/resources.cc b/tensorflow/core/kernels/boosted_trees/resources.cc
index 2798722536271380697539dca4d83ca865051da6..5ab9f97992ce5390b7bd2ae774d13c0ed06ee639 100644
--- a/tensorflow/core/kernels/boosted_trees/resources.cc
+++ b/tensorflow/core/kernels/boosted_trees/resources.cc
@@ -31,7 +31,7 @@ BoostedTreesEnsembleResource::BoostedTreesEnsembleResource()
           protobuf::Arena::CreateMessage<boosted_trees::TreeEnsemble>(
               &arena_)) {}
 
-string BoostedTreesEnsembleResource::DebugString() {
+string BoostedTreesEnsembleResource::DebugString() const {
   return strings::StrCat("TreeEnsemble[size=", tree_ensemble_->trees_size(),
                          "]");
 }
@@ -82,15 +82,38 @@ int32 BoostedTreesEnsembleResource::next_node(
   return -1;
 }
 
-float BoostedTreesEnsembleResource::node_value(const int32 tree_id,
-                                               const int32 node_id) const {
+std::vector<float> BoostedTreesEnsembleResource::node_value(
+    const int32 tree_id, const int32 node_id) const {
   DCHECK_LT(tree_id, tree_ensemble_->trees_size());
   DCHECK_LT(node_id, tree_ensemble_->trees(tree_id).nodes_size());
   const auto& node = tree_ensemble_->trees(tree_id).nodes(node_id);
   if (node.node_case() == boosted_trees::Node::kLeaf) {
-    return node.leaf().scalar();
+    // TODO(crawles): only use vector leaf even if # logits=1.
+    if (node.leaf().has_vector()) {
+      std::vector<float> leaf_values;
+      const auto& leaf_value_vector = node.leaf().vector();
+      const int size = leaf_value_vector.value_size();
+      leaf_values.reserve(size);
+      for (int i = 0; i < size; ++i) {
+        leaf_values.push_back(leaf_value_vector.value(i));
+      }
+      return leaf_values;
+    } else {
+      return {node.leaf().scalar()};
+    }
   } else {
-    return node.metadata().original_leaf().scalar();
+    if (node.metadata().original_leaf().has_vector()) {
+      std::vector<float> node_values;
+      const auto& leaf_value_vector = node.metadata().original_leaf().vector();
+      const int size = leaf_value_vector.value_size();
+      node_values.reserve(size);
+      for (int i = 0; i < size; ++i) {
+        node_values.push_back(leaf_value_vector.value(i));
+      }
+      return node_values;
+    } else {
+      return {node.metadata().original_leaf().scalar()};
+    }
   }
 }
 
@@ -452,15 +475,18 @@ void BoostedTreesEnsembleResource::RecursivelyDoPostPrunePreparation(
 
     // Change node back into leaf.
     *node->mutable_leaf() = node_metadata.original_leaf();
-    const float parent_value = node_value(tree_id, node_id);
+    const auto& parent_values = node_value(tree_id, node_id);
+    DCHECK_EQ(parent_values.size(), 1);
+    const float parent_value = parent_values[0];
 
     // Save the old values of weights of children.
     (*nodes_meta)[left_id].first = node_id;
-    (*nodes_meta)[left_id].second = parent_value - node_value(tree_id, left_id);
+    (*nodes_meta)[left_id].second =
+        parent_value - node_value(tree_id, left_id)[0];
 
     (*nodes_meta)[right_id].first = node_id;
     (*nodes_meta)[right_id].second =
-        parent_value - node_value(tree_id, right_id);
+        parent_value - node_value(tree_id, right_id)[0];
 
     // Clear gain for leaf node.
     node->clear_metadata();
diff --git a/tensorflow/core/kernels/boosted_trees/resources.h b/tensorflow/core/kernels/boosted_trees/resources.h
index f961ed38142709b01ba009a4d8fb3dab2fe757c4..34a35f173c338964632b62536f21175137e9b371 100644
--- a/tensorflow/core/kernels/boosted_trees/resources.h
+++ b/tensorflow/core/kernels/boosted_trees/resources.h
@@ -48,7 +48,7 @@ class BoostedTreesEnsembleResource : public StampedResource {
  public:
   BoostedTreesEnsembleResource();
 
-  string DebugString() override;
+  string DebugString() const override;
 
   bool InitFromSerialized(const string& serialized, const int64 stamp_token);
 
@@ -68,7 +68,7 @@ class BoostedTreesEnsembleResource : public StampedResource {
       const int32 tree_id, const int32 node_id, const int32 index_in_batch,
       const std::vector<TTypes<int32>::ConstVec>& bucketized_features) const;
 
-  float node_value(const int32 tree_id, const int32 node_id) const;
+  std::vector<float> node_value(const int32 tree_id, const int32 node_id) const;
 
   void set_node_value(const int32 tree_id, const int32 node_id,
                       const float logits);
diff --git a/tensorflow/core/kernels/boosted_trees/training_ops.cc b/tensorflow/core/kernels/boosted_trees/training_ops.cc
index 973cdec13a368ff95ae3185695507c62c173675c..7c025b34b982f410ac3585855a6e14f3b99f5e2f 100644
--- a/tensorflow/core/kernels/boosted_trees/training_ops.cc
+++ b/tensorflow/core/kernels/boosted_trees/training_ops.cc
@@ -288,7 +288,9 @@ class BoostedTreesCenterBiasOp : public OpKernel {
       ensemble_resource->AddNewTreeWithLogits(kLayerByLayerTreeWeight, logits);
       current_bias = logits;
     } else {
-      current_bias = ensemble_resource->node_value(0, 0);
+      const auto& current_biases = ensemble_resource->node_value(0, 0);
+      DCHECK_EQ(current_biases.size(), 1);
+      current_bias = current_biases[0];
       continue_centering =
           std::abs(logits / current_bias) > kMinDeltaForCenterBias;
       current_bias += logits;
diff --git a/tensorflow/core/kernels/broadcast_to_op.cc b/tensorflow/core/kernels/broadcast_to_op.cc
index 2810925bbcd645f60af0e6025a74043cd45f21e7..8c4341335fbae66249259b69a9693b8bcf6073f0 100644
--- a/tensorflow/core/kernels/broadcast_to_op.cc
+++ b/tensorflow/core/kernels/broadcast_to_op.cc
@@ -15,13 +15,17 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif  // GOOGLE_CUDA
 
+#include "tensorflow/core/kernels/broadcast_to_op.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/broadcast_to_op.h"
+#include "tensorflow/core/util/bcast.h"
 
 namespace tensorflow {
 
@@ -43,12 +47,42 @@ class BroadcastToOp : public OpKernel {
     OP_REQUIRES_OK(ctx,
                    ctx->op_kernel().MakeShape(shape_tensor, &output_shape));
 
+    // Handle copy.
+    if (output_shape == input_shape) {
+      ctx->set_output(0, input_tensor);
+      return;
+    }
+
+    OP_REQUIRES(ctx, input_shape.dims() <= output_shape.dims(),
+                errors::InvalidArgument(
+                    "Rank of input (", input_shape.dims(),
+                    ") must be no greater than rank of output shape (",
+                    output_shape.dims(), ")."));
+
     Tensor* output_tensor = nullptr;
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, output_shape, &output_tensor));
-
-    const Device& d = ctx->eigen_device<Device>();
-    functor::BroadcastTo<Device, T>()(d, ctx, *output_tensor, output_shape,
-                                      input_tensor, input_shape);
+    // Handle empty case.
+    if (output_shape.num_elements() == 0) {
+      return;
+    }
+
+    // Handle broadcast from Scalar.
+    const Device& device = ctx->eigen_device<Device>();
+    if (input_shape.dims() == 0) {
+      functor::FillFunctor<Device, T>()(device, output_tensor->flat<T>(),
+                                        input_tensor.scalar<T>());
+      return;
+    }
+
+    BCast bcast(BCast::FromShape(input_shape), BCast::FromShape(output_shape),
+                /*fewer_dims_optimization=*/true);
+    OP_REQUIRES(ctx, bcast.IsValid(),
+                errors::InvalidArgument(
+                    "Incompatible shapes: ", input_shape.DebugString(), " vs. ",
+                    output_shape.DebugString()));
+
+    functor::BroadcastTo<Device, T>()(device, ctx, *output_tensor, output_shape,
+                                      input_tensor, input_shape, bcast);
   }
 };
 
@@ -65,12 +99,12 @@ TF_CALL_ALL_TYPES(REGISTER_KERNEL);
 #if GOOGLE_CUDA
 
 namespace functor {
-#define DECLARE_GPU_TEMPLATE(Type)                              \
-  template <>                                                   \
-  void BroadcastTo<GPUDevice, Type>::operator()(                \
-      const GPUDevice& d, OpKernelContext* ctx, Tensor& output, \
-      const TensorShape& output_shape, const Tensor& input,     \
-      const TensorShape& input_shape);                          \
+#define DECLARE_GPU_TEMPLATE(Type)                               \
+  template <>                                                    \
+  void BroadcastTo<GPUDevice, Type>::operator()(                 \
+      const GPUDevice& d, OpKernelContext* ctx, Tensor& output,  \
+      const TensorShape& output_shape, const Tensor& input,      \
+      const TensorShape& input_shape, const BCast& bcast) const; \
   extern template struct BroadcastTo<GPUDevice, Type>;
 
 TF_CALL_GPU_ALL_TYPES(DECLARE_GPU_TEMPLATE);
diff --git a/tensorflow/core/kernels/broadcast_to_op.h b/tensorflow/core/kernels/broadcast_to_op.h
index bc11c5f914bfcbcbbc4445cace7126717f3d8d2d..6ae860c2b2995f1a9bb5f47ad40b4546923801a6 100644
--- a/tensorflow/core/kernels/broadcast_to_op.h
+++ b/tensorflow/core/kernels/broadcast_to_op.h
@@ -23,196 +23,81 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/kernels/fill_functor.h"
+#include "tensorflow/core/util/bcast.h"
 
 namespace tensorflow {
 
 namespace functor {
 
-#define BROADCAST_SHAPE(NDIMS, input_shape, output_shape)                 \
-  auto reshape = AsEigenDSizesWithPrefix<NDIMS>(input_shape);             \
-  auto broadcast = output_shape.AsEigenDSizes<NDIMS>();                   \
-  auto reshape_32bit = AsEigenDSizesWithPrefix<NDIMS, int>(input_shape);  \
-  auto broadcast_32bit = output_shape.AsEigenDSizes<NDIMS, int>();        \
-  if (input_shape.dims() > 0) {                                           \
-    for (int i = 0; i < NDIMS; i++) {                                     \
-      if (reshape[i] != broadcast[i]) {                                   \
-        OP_REQUIRES(                                                      \
-            ctx, ((reshape[i] != 0) && (broadcast[i] % reshape[i] == 0)), \
-            errors::InvalidArgument("invalid shape to broadcast from ",   \
-                                    input_shape.DebugString(), " to ",    \
-                                    output_shape.DebugString()));         \
-        broadcast[i] = broadcast[i] / reshape[i];                         \
-      } else {                                                            \
-        broadcast[i] = 1;                                                 \
-      }                                                                   \
-      if (can_use_32bit) {                                                \
-        broadcast_32bit[i] = static_cast<int>(broadcast[i]);              \
-      }                                                                   \
-    }                                                                     \
+template <typename Device, typename T>
+struct BroadcastTo {
+  template <int NDIMS>
+  void DoBCast32Bit(const Device &device, typename TTypes<T, NDIMS>::Tensor out,
+                    typename TTypes<T, NDIMS>::ConstTensor in,
+                    const typename Eigen::array<int, NDIMS> &bcast) const {
+    To32Bit(out).device(device) = To32Bit(in).broadcast(bcast);
   }
 
-#define HANDLE_BROADCAST_FROM_SCALAR()                              \
-  if (std::is_same<Eigen::GpuDevice, Device>::value) {              \
-    FillFunctor<Device, T>()(d, output_tensor.flat<T>(),            \
-                             input_tensor.scalar<T>());             \
-  } else {                                                          \
-    output.device(d) = output.constant(input_tensor.scalar<T>()()); \
+  template <int NDIMS>
+  void DoBCast(
+      const Device &device, typename TTypes<T, NDIMS>::Tensor out,
+      typename TTypes<T, NDIMS>::ConstTensor in,
+      const typename Eigen::array<Eigen::DenseIndex, NDIMS> &bcast) const {
+    out.device(device) = in.broadcast(bcast);
   }
 
-#define HANDLE_BROADCAST_CASE(dim_i)                                        \
-  case dim_i: {                                                             \
-    if (can_use_32bit) {                                                    \
-      auto input = input_tensor.tensor<T, dim_i>();                         \
-      To32Bit(output).device(d) =                                           \
-          To32Bit(input).reshape(reshape_32bit).broadcast(broadcast_32bit); \
-    } else {                                                                \
-      auto input = input_tensor.tensor<T, dim_i>();                         \
-      output.device(d) = input.reshape(reshape).broadcast(broadcast);       \
-    }                                                                       \
-  } break
-
-template <typename Device, typename T>
-struct BroadcastTo {
-  void operator()(const Device &d, OpKernelContext *ctx, Tensor &output_tensor,
-                  const TensorShape &output_shape, const Tensor &input_tensor,
-                  const TensorShape &input_shape) {
-    if (output_shape.num_elements() == 0) {
-      return;
-    }
-    if (output_shape == input_shape) {
-      output_tensor.flat<T>().device(d) = input_tensor.flat<T>();
-      return;
-    }
-
+  template <int NDIMS>
+  void ReshapeAndBCast(const Device &device, Tensor &output_tensor,
+                       const Tensor &input_tensor, const BCast &bcast) const {
     const bool can_use_32bit = std::is_same<Eigen::GpuDevice, Device>::value &&
                                output_tensor.NumElements() < kint32max &&
                                input_tensor.NumElements() < kint32max;
+    if (can_use_32bit) {
+      DoBCast32Bit<NDIMS>(
+          device, output_tensor.template shaped<T, NDIMS>(bcast.result_shape()),
+          input_tensor.template shaped<T, NDIMS>(bcast.x_reshape()),
+          BCast::ToIndexArrayType<int, NDIMS>(bcast.x_bcast()));
+    } else {
+      DoBCast<NDIMS>(
+          device, output_tensor.template shaped<T, NDIMS>(bcast.result_shape()),
+          input_tensor.template shaped<T, NDIMS>(bcast.x_reshape()),
+          BCast::ToIndexArrayType<Eigen::DenseIndex, NDIMS>(bcast.x_bcast()));
+    }
+  }
 
-    switch (output_shape.dims()) {
-      case 0: {
-        if (input_shape.dims() > 0) {
-          ctx->CtxFailure(errors::InvalidArgument(
-              "invalid shape to broadcast from ", input_shape.DebugString(),
-              " to ", output_shape.DebugString()));
-          break;
-        }
-        output_tensor.scalar<T>().device(d) = input_tensor.scalar<T>();
+  // PRECONDITION: rank(input_shape) > 0 &&
+  //               rank(input_shape) <= rank(output_shape)  &&
+  //               output_shape.num_elements() > 0.
+  void operator()(const Device &device, OpKernelContext *ctx,
+                  Tensor &output_tensor, const TensorShape &output_shape,
+                  const Tensor &input_tensor, const TensorShape &input_shape,
+                  const BCast &bcast) const {
+    const int ndims = bcast.y_reshape().size();
+    switch (ndims) {
+      case 1:
+        ReshapeAndBCast<1>(device, output_tensor, input_tensor, bcast);
+        break;
+      case 2:
+        ReshapeAndBCast<2>(device, output_tensor, input_tensor, bcast);
+        break;
+      case 3:
+        ReshapeAndBCast<3>(device, output_tensor, input_tensor, bcast);
+        break;
+      case 4:
+        ReshapeAndBCast<4>(device, output_tensor, input_tensor, bcast);
+        break;
+      case 5:
+        ReshapeAndBCast<5>(device, output_tensor, input_tensor, bcast);
         break;
-      }
-      case 1: {
-        BROADCAST_SHAPE(1, input_shape, output_shape);
-
-        auto output = output_tensor.tensor<T, 1>();
-        switch (input_shape.dims()) {
-          case 0: {
-            HANDLE_BROADCAST_FROM_SCALAR();
-          } break;
-            HANDLE_BROADCAST_CASE(1);
-          default:
-            ctx->CtxFailure(errors::InvalidArgument(
-                "invalid shape to broadcast from ", input_shape.DebugString(),
-                " to ", output_shape.DebugString()));
-            break;
-        }
-      } break;
-      case 2: {
-        BROADCAST_SHAPE(2, input_shape, output_shape);
-        auto output = output_tensor.tensor<T, 2>();
-        switch (input_shape.dims()) {
-          case 0: {
-            HANDLE_BROADCAST_FROM_SCALAR();
-          } break;
-            HANDLE_BROADCAST_CASE(1);
-            HANDLE_BROADCAST_CASE(2);
-          default:
-            ctx->CtxFailure(errors::InvalidArgument(
-                "invalid shape to broadcast from ", input_shape.DebugString(),
-                " to ", output_shape.DebugString()));
-            break;
-        }
-      } break;
-      case 3: {
-        BROADCAST_SHAPE(3, input_shape, output_shape);
-        auto output = output_tensor.tensor<T, 3>();
-        switch (input_shape.dims()) {
-          case 0: {
-            HANDLE_BROADCAST_FROM_SCALAR();
-          } break;
-            HANDLE_BROADCAST_CASE(1);
-            HANDLE_BROADCAST_CASE(2);
-            HANDLE_BROADCAST_CASE(3);
-          default:
-            ctx->CtxFailure(errors::InvalidArgument(
-                "invalid shape to broadcast from ", input_shape.DebugString(),
-                " to ", output_shape.DebugString()));
-            break;
-        }
-      } break;
-      case 4: {
-        BROADCAST_SHAPE(4, input_shape, output_shape);
-        auto output = output_tensor.tensor<T, 4>();
-        switch (input_shape.dims()) {
-          case 0: {
-            HANDLE_BROADCAST_FROM_SCALAR();
-          } break;
-            HANDLE_BROADCAST_CASE(1);
-            HANDLE_BROADCAST_CASE(2);
-            HANDLE_BROADCAST_CASE(3);
-            HANDLE_BROADCAST_CASE(4);
-          default:
-            ctx->CtxFailure(errors::InvalidArgument(
-                "invalid shape to broadcast from ", input_shape.DebugString(),
-                " to ", output_shape.DebugString()));
-            break;
-        }
-      } break;
-      case 5: {
-        BROADCAST_SHAPE(5, input_shape, output_shape);
-        auto output = output_tensor.tensor<T, 5>();
-        switch (input_shape.dims()) {
-          case 0: {
-            HANDLE_BROADCAST_FROM_SCALAR();
-          } break;
-            HANDLE_BROADCAST_CASE(1);
-            HANDLE_BROADCAST_CASE(2);
-            HANDLE_BROADCAST_CASE(3);
-            HANDLE_BROADCAST_CASE(4);
-            HANDLE_BROADCAST_CASE(5);
-          default:
-            ctx->CtxFailure(errors::InvalidArgument(
-                "invalid shape to broadcast from ", input_shape.DebugString(),
-                " to ", output_shape.DebugString()));
-            break;
-        }
-      } break;
       default:
-        ctx->CtxFailure(errors::InvalidArgument(
-            "invalid shape to broadcast from ", input_shape.DebugString(),
-            " to ", output_shape.DebugString()));
+        ctx->SetStatus(errors::Unimplemented(
+            "Broadcast between ", input_shape.DebugString(), " and ",
+            output_shape.DebugString(), " is not supported yet."));
         break;
     }
   }
-
- private:
-  template <int NDIMS, typename DimType = Eigen::DenseIndex>
-  Eigen::DSizes<DimType, NDIMS> AsEigenDSizesWithPrefix(
-      const TensorShape &shape) const {
-    Eigen::DSizes<DimType, NDIMS> dsizes;
-    for (int d = 0; d < NDIMS - shape.dims(); d++) {
-      dsizes[d] = 1;
-    }
-    for (int d = NDIMS - shape.dims(); d < NDIMS; d++) {
-      dsizes[d] =
-          static_cast<DimType>(shape.dim_size(d - (NDIMS - shape.dims())));
-    }
-    return dsizes;
-  }
 };
 
-#undef BROADCAST_SHAPE
-#undef HANDLE_BROADCAST_FROM_SCALAR
-#undef HANDLE_BROADCAST_CASE
-
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/cast_op.cc b/tensorflow/core/kernels/cast_op.cc
index 3a72567655c09c7091bc917e0af9f20725f38287..5306c77102ebf70cdbcbae847d4386829ee3526b 100644
--- a/tensorflow/core/kernels/cast_op.cc
+++ b/tensorflow/core/kernels/cast_op.cc
@@ -99,9 +99,9 @@ void CastOpBase::Compute(OpKernelContext* ctx) {
   } else {
     Tensor in;
     if (external_src_dtype_ != src_dtype_) {
-      // If the type is a quantized type we need to do an UnsafeCopyFromInternal
-      // since the src_dtype_ is different from external_src_type_.
-      in.UnsafeCopyFromInternal(inp, src_dtype_, inp.shape());
+      // If the type is a quantized type we need to do a bitcast since the
+      // src_dtype_ is different from external_src_type_.
+      OP_REQUIRES_OK(ctx, in.BitcastFrom(inp, src_dtype_, inp.shape()));
     } else {
       in = inp;
     }
diff --git a/tensorflow/contrib/factorization/kernels/clustering_ops.cc b/tensorflow/core/kernels/clustering_ops.cc
similarity index 99%
rename from tensorflow/contrib/factorization/kernels/clustering_ops.cc
rename to tensorflow/core/kernels/clustering_ops.cc
index 025534d540bb82cdb87bb2977d08dfa4f02f1bc8..7e1a1fdcd2d2f9a5d4ea5228497b515ff65e3791 100644
--- a/tensorflow/contrib/factorization/kernels/clustering_ops.cc
+++ b/tensorflow/core/kernels/clustering_ops.cc
@@ -392,7 +392,7 @@ class NearestNeighborsOp : public OpKernel {
       for (; start < limit; ++start) {
         const int64 start_row = num_points * start / num_units;
         const int64 limit_row = num_points * (start + 1) / num_units;
-        CHECK_LE(limit_row, num_points);
+        DCHECK_LE(limit_row, num_points);
         const int64 num_rows = limit_row - start_row;
         auto points_shard = points.middleRows(start_row, num_rows);
         const Eigen::VectorXf points_half_squared_norm =
@@ -430,7 +430,7 @@ class NearestNeighborsOp : public OpKernel {
       const Eigen::Ref<const Eigen::VectorXf>& centers_half_squared_norm,
       const Eigen::Ref<MatrixXi64RowMajor>& nearest_center_indices,
       const Eigen::Ref<MatrixXfRowMajor>& nearest_center_distances) {
-    CHECK_LE(k, centers.rows());
+    DCHECK_LE(k, centers.rows());
     if (centers.rows() <= kNearestNeighborsCentersMaxBlockSize) {
       FindKNearestCentersOneBlock(k, points, points_half_squared_norm, centers,
                                   centers_half_squared_norm,
@@ -451,7 +451,7 @@ class NearestNeighborsOp : public OpKernel {
       const Eigen::Ref<const Eigen::VectorXf>& centers_half_squared_norm,
       Eigen::Ref<MatrixXi64RowMajor> nearest_center_indices,
       Eigen::Ref<MatrixXfRowMajor> nearest_center_distances) {
-    CHECK_LE(k, centers.rows());
+    DCHECK_LE(k, centers.rows());
     const int64 num_points = points.rows();
     const MatrixXfRowMajor inner_product = points * centers.transpose();
     // Find nearest neighbors.
@@ -500,8 +500,8 @@ class NearestNeighborsOp : public OpKernel {
       Eigen::Ref<MatrixXfRowMajor> nearest_center_distances) {
     const int64 num_points = points.rows();
     const int64 num_centers = centers.rows();
-    CHECK_LE(k, num_centers);
-    CHECK_GT(num_centers, kNearestNeighborsCentersMaxBlockSize);
+    DCHECK_LE(k, num_centers);
+    DCHECK_GT(num_centers, kNearestNeighborsCentersMaxBlockSize);
     // Store nearest neighbors with first block of centers directly into the
     // output matrices.
     int64 out_k = std::min(k, kNearestNeighborsCentersMaxBlockSize);
diff --git a/tensorflow/contrib/factorization/kernels/clustering_ops_test.cc b/tensorflow/core/kernels/clustering_ops_test.cc
similarity index 100%
rename from tensorflow/contrib/factorization/kernels/clustering_ops_test.cc
rename to tensorflow/core/kernels/clustering_ops_test.cc
diff --git a/tensorflow/core/kernels/collective_nccl_reducer.cc b/tensorflow/core/kernels/collective_nccl_reducer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c5e6f06c6578d1a6dc777b39e8e04aa963b5aecd
--- /dev/null
+++ b/tensorflow/core/kernels/collective_nccl_reducer.cc
@@ -0,0 +1,206 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/kernels/collective_nccl_reducer.h"
+
+#ifdef GOOGLE_CUDA
+
+#include "tensorflow/core/common_runtime/collective_util.h"
+#include "tensorflow/core/nccl/nccl_manager.h"
+
+namespace tensorflow {
+namespace {
+string NcclCollectiveKey(const string& exec_key, int step_id) {
+  return strings::StrCat(exec_key, ":", step_id);
+}
+}  // namespace
+
+NcclReducer::NcclReducer() : col_ctx_(nullptr), col_params_(nullptr) {}
+
+Status NcclReducer::InitializeCollectiveParams(CollectiveParams* col_params) {
+  if (col_params->instance.type != REDUCTION_COLLECTIVE ||
+      col_params->instance.impl_details.collective_name != "NcclReduce") {
+    return errors::Internal("Unexpected collective type ",
+                            col_params->instance.type, " expected ",
+                            REDUCTION_COLLECTIVE, "; or collective name ",
+                            col_params->instance.impl_details.collective_name,
+                            " expected NcclReduce");
+  } else {
+    return Status::OK();
+  }
+}
+
+Status NcclReducer::InitializeCollectiveContext(CollectiveContext* col_ctx) {
+  col_ctx_ = col_ctx;
+  col_params_ = &col_ctx->col_params;
+  return collective_util::InitializeDeviceAndLocality(
+      col_ctx->dev_mgr, col_ctx->device_name, &col_ctx->device,
+      &col_ctx->device_locality);
+}
+
+Status NcclReducer::InitializeInstanceBeforeGroupDiscovery(
+    CollectiveParams* col_params) {
+  if (col_params->default_rank == 0 && col_params->group.num_tasks > 1) {
+    col_params->instance.communicator_key =
+        NcclManager::instance()->GenerateCommunicatorKey();
+  }
+  return Status::OK();
+}
+
+Status ReductionOp(const string& merge_op, ncclRedOp_t* reduction_op) {
+  if (merge_op == "Add") {
+    *reduction_op = ncclSum;
+    return Status::OK();
+  } else if (merge_op == "Mul") {
+    *reduction_op = ncclProd;
+    return Status::OK();
+  } else {
+    return errors::Internal("Expected merge_op to be either Add or Mul, found ",
+                            merge_op);
+  }
+}
+
+void NcclReducer::Run(StatusCallback done) {
+  ncclRedOp_t reduction_op;
+  Status s = ReductionOp(col_params_->merge_op->type_string(), &reduction_op);
+  if (!s.ok()) {
+    done(s);
+    return;
+  }
+
+  Tensor group_size;
+  Notification group_size_ready;
+  Status group_size_status;
+  if (col_params_->final_op) {
+    // Create an on-device scalar value from group_size_.
+    // TODO(ayushd, tucker): avoid this copy by either reusing across
+    // invocations or providing the scalar to the kernel in host memory.
+    Tensor group_size_val(col_ctx_->output->dtype(), TensorShape({}));
+    switch (col_ctx_->output->dtype()) {
+      case DT_FLOAT:
+        group_size_val.scalar<float>()() = col_params_->group.group_size;
+        break;
+      case DT_DOUBLE:
+        group_size_val.scalar<double>()() = col_params_->group.group_size;
+        break;
+      case DT_INT32:
+        group_size_val.scalar<int32>()() = col_params_->group.group_size;
+        break;
+      case DT_INT64:
+        group_size_val.scalar<int64>()() = col_params_->group.group_size;
+        break;
+      default:
+        done(errors::Internal("Unsupported type ", col_ctx_->output->dtype()));
+        return;
+    }
+    group_size = Tensor(
+        col_ctx_->device->GetAllocator(col_ctx_->op_ctx->input_alloc_attr(0)),
+        col_ctx_->output->dtype(), TensorShape({}));
+    DeviceContext* op_dev_ctx = col_ctx_->op_ctx->op_device_context();
+    // Enqueue copy on gpu stream.
+    op_dev_ctx->CopyCPUTensorToDevice(
+        &group_size_val, col_ctx_->device, &group_size,
+        [&group_size_ready, &group_size_status](const Status& s) {
+          group_size_status = s;
+          group_size_ready.Notify();
+        });
+  } else {
+    group_size_ready.Notify();
+  }
+
+  Notification nccl_done;
+  Status nccl_status;
+  auto* compute_stream = col_ctx_->op_ctx->op_device_context()->stream();
+  auto* gpu_info = col_ctx_->op_ctx->device()->tensorflow_gpu_device_info();
+  // `AddToAllReduce` performs consistency checks for the NCCL call and enqueues
+  // the `Participant` struct locally.  When all local participants with this
+  // `nccl_collective_key` have called `AddToAllReduce` and
+  // `SignalMultiNodeReady`, all devices at this worker are ready to process
+  // this NCCL op.
+  //
+  // The `NcclManager` uses a dedicated CUDA stream for NCCL kernels.  At this
+  // point, it synchronizes the NCCL stream with the compute stream, and then
+  // enqueues the NCCL kernel on the NCCL stream.
+  const int num_global_devices = col_params_->group.group_size;
+  const int num_local_devices = col_params_->instance.num_devices_per_task.at(
+      col_params_->instance.task_names[col_params_->default_rank]);
+  const string nccl_collective_key =
+      NcclCollectiveKey(col_ctx_->exec_key, col_ctx_->step_id);
+  auto done_callback = [&nccl_done, &nccl_status](const Status& s) {
+    nccl_status = s;
+    nccl_done.Notify();
+  };
+  auto participant = absl::make_unique<NcclManager::Participant>(
+      compute_stream->parent(), compute_stream, gpu_info->event_mgr,
+      gpu_info->gpu_id, col_ctx_->input, col_ctx_->output,
+      col_params_->default_rank, std::move(done_callback));
+  VLOG(1) << "NcclReducer calling NcclManager::AddToAllReduce num_tasks "
+          << col_params_->group.num_tasks << " current task "
+          << col_params_->instance.task_names[col_params_->default_rank]
+          << " num local devices " << num_local_devices
+          << " num global devices " << num_global_devices << " device "
+          << col_ctx_->device_name << " instance "
+          << col_params_->instance.instance_key;
+  NcclManager::instance()->AddToAllReduce(
+      std::move(participant),
+      {nccl_collective_key, num_local_devices, num_global_devices,
+       col_params_->instance.communicator_key},
+      reduction_op);
+
+  // NOTE(ayushd): We need to synchronize NCCL launches across nodes to prevent
+  // deadlocks.  In the current implementation, we define a deterministic
+  // sequential launch order between potentially concurrent collective instances
+  // by introducing control information during static graph analysis in
+  // graph/collective_order.cc.  This can be either in the form of explicit
+  // control edges or via `wait_for` attribute on the collective op.
+  //
+  // The other end of the design spectrum would have a distinguished node
+  // dynamically signal the next collective to launch to all other participants.
+  // This has higher degree of runtime coordination, but it may be able to
+  // achieve better performance if the (arbitrary) static execution order
+  // assigned in the first approach turns out to not be good from a scheduling
+  // perspective.  e.g. consider a graph in which c1, c2, and c3 are three
+  // concurrent collective instances, and the static ordering assigns c1 -> c2
+  // -> c3.  In practice, it could turn out that c3 is always ready to execute
+  // before c1 or c2.
+  //
+  // `WaitForDependencies` may block if the collective instances on which this
+  // op depends have not yet launched.  When this function returns, this op is
+  // ready to go.
+  col_ctx_->col_exec->WaitForDependencies(*col_params_);
+  NcclManager::instance()->SignalMultiNodeReady(nccl_collective_key);
+  // When all devices at this worker have called `SignalMultiNodeReady`, the
+  // `NcclManager` will enqueue the NCCL kernel on the NCCL stream.  Thus the
+  // implementation of `Launched` keeps track of the number of devices that have
+  // launched.
+  col_ctx_->col_exec->Launched(*col_params_);
+
+  // Wait for nccl op and group_size copy to succeed, then do final_op.
+  group_size_ready.WaitForNotification();
+  nccl_done.WaitForNotification();
+  Status final_status =
+      group_size_status.ok() ? nccl_status : group_size_status;
+  if (final_status.ok() && col_params_->final_op) {
+    final_status = collective_util::ComputeBinOp(
+        col_ctx_->op_ctx, col_ctx_->op_params, col_ctx_->device,
+        col_params_->final_op.get(), col_ctx_->output, &group_size);
+  }
+  done(final_status);
+}
+
+REGISTER_COLLECTIVE(NcclReduce, NcclReducer);
+
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/collective_nccl_reducer.h b/tensorflow/core/kernels/collective_nccl_reducer.h
new file mode 100644
index 0000000000000000000000000000000000000000..dc70b280c5dc9eb9da72667d459ea727945d7e8a
--- /dev/null
+++ b/tensorflow/core/kernels/collective_nccl_reducer.h
@@ -0,0 +1,49 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_KERNELS_COLLECTIVE_NCCL_REDUCER_H_
+#define TENSORFLOW_CORE_KERNELS_COLLECTIVE_NCCL_REDUCER_H_
+
+#include "tensorflow/core/framework/collective.h"
+
+namespace tensorflow {
+#ifdef GOOGLE_CUDA
+
+class NcclReducer : public CollectiveImplementationInterface {
+ public:
+  NcclReducer();
+  ~NcclReducer() override = default;
+
+  // No-op for this collective implementation.
+  Status InitializeCollectiveParams(CollectiveParams* col_params) override;
+
+  // Initializes the device objects and device localities.
+  Status InitializeCollectiveContext(CollectiveContext* col_ctx) override;
+
+  // Initialize nccl communicator key.
+  Status InitializeInstanceBeforeGroupDiscovery(
+      CollectiveParams* col_params) override;
+
+  // Hands off all reduce to NcclManager.
+  void Run(StatusCallback done) override;
+
+ private:
+  CollectiveContext* col_ctx_;          // Not owned
+  const CollectiveParams* col_params_;  // Not owned
+};
+
+#endif  // GOOGLE_CUDA
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_COLLECTIVE_NCCL_REDUCER_H_
diff --git a/tensorflow/core/kernels/collective_nccl_reducer_test.cc b/tensorflow/core/kernels/collective_nccl_reducer_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..26c92f1f7433e34cf4e3789dcd480f8822147891
--- /dev/null
+++ b/tensorflow/core/kernels/collective_nccl_reducer_test.cc
@@ -0,0 +1,332 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/collective_nccl_reducer.h"
+
+#include <algorithm>
+#include "absl/memory/memory.h"
+#include "tensorflow/core/common_runtime/base_collective_executor.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/device_resolver_local.h"
+#include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/common_runtime/test_collective_executor_mgr.h"
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/public/version.h"
+
+namespace tensorflow {
+static constexpr int kStepId = 10;
+
+std::unique_ptr<OpKernel> GetKernel(const NodeDef& node, DeviceBase* device) {
+  Status status;
+  std::unique_ptr<OpKernel> k = CreateOpKernel(
+      DEVICE_GPU, device, device->GetAllocator(AllocatorAttributes()), node,
+      TF_GRAPH_DEF_VERSION, &status);
+  if (!status.ok()) LOG(FATAL) << status;
+  return k;
+}
+
+std::unique_ptr<OpKernel> GetAdd(DeviceBase* device) {
+  NodeDef node_def;
+  NodeDefBuilder builder("add_node", "Add");
+  TF_CHECK_OK(builder.Attr("T", DT_FLOAT)
+                  .Input(FakeInput(DT_FLOAT))
+                  .Input(FakeInput(DT_FLOAT))
+                  .Finalize(&node_def));
+  return GetKernel(node_def, device);
+}
+
+std::unique_ptr<OpKernel> GetDiv(DeviceBase* device) {
+  NodeDef node_def;
+  NodeDefBuilder builder("add_node", "Div");
+  TF_CHECK_OK(builder.Attr("T", DT_FLOAT)
+                  .Input(FakeInput(DT_FLOAT))
+                  .Input(FakeInput(DT_FLOAT))
+                  .Finalize(&node_def));
+  return GetKernel(node_def, device);
+}
+
+class NcclReducerTest : public ::testing::Test {
+ protected:
+  ~NcclReducerTest() override {
+    if (col_exec_) col_exec_->Unref();
+  }
+
+  void InitGPUDevices() {
+    std::vector<std::unique_ptr<Device>> all_devices;
+    SessionOptions session_options;
+    session_options.config.mutable_gpu_options()
+        ->set_per_process_gpu_memory_fraction(0.1);
+    session_options.env = Env::Default();
+    Status s = DeviceFactory::GetFactory(DEVICE_GPU)
+                   ->AddDevices(session_options, "", &all_devices);
+    TF_CHECK_OK(s);
+    for (std::unique_ptr<Device>& d : all_devices) {
+      if (d->device_type() == "GPU") {
+        gpus_.emplace_back(std::move(d));
+      }
+    }
+  }
+
+  void Init(int num_ranks) {
+    setenv("NCCL_DEBUG", "INFO", 1 /* replace */);
+    setenv("NCCL_LAUNCH_MODE", "PARALLEL", 1 /* replace */);
+    InitGPUDevices();
+    std::vector<std::unique_ptr<Device>> local_devices;
+    std::vector<string> device_names;
+    for (int rank = 0; rank < num_ranks; ++rank) {
+      if (rank < gpus_.size()) {
+        local_devices.emplace_back(std::move(gpus_[rank]));
+      }
+    }
+    int num_gpus = local_devices.size();
+    for (const auto& device : local_devices) {
+      device_names.push_back(device->name());
+      VLOG(2) << device->name();
+    }
+    if (!dev_mgr_) dev_mgr_.reset(new DeviceMgr(std::move(local_devices)));
+    col_exec_ = new BaseCollectiveExecutor(
+        &col_exec_mgr_, /*remote_access=*/nullptr, kStepId, dev_mgr_.get(),
+        /*gpu_ring_order=*/nullptr);
+
+    // Initialize collective params.
+    col_params_.name = "test_nccl_collective_op";
+    const int group_key = 5;
+    col_params_.group.group_key = group_key;
+    col_params_.group.device_type = DEVICE_GPU;
+    col_params_.group.group_size = num_ranks;
+    const int instance_key = 23;
+    col_params_.instance.instance_key = instance_key;
+    col_params_.instance.type = REDUCTION_COLLECTIVE;
+    col_params_.instance.data_type = DT_FLOAT;
+    col_params_.instance.impl_details.collective_name = "NcclReduce";
+    const string task_name = "/job:worker/replica:0/task:0";
+    col_params_.instance.num_devices_per_task[task_name] = num_ranks;
+    for (int rank = 0; rank < num_ranks; ++rank) {
+      col_params_.instance.device_names.push_back(
+          device_names[rank % num_gpus]);
+      col_params_.instance.task_names.push_back(task_name);
+    }
+    for (int rank = 0; rank < num_ranks; ++rank) {
+      instances_.push_back(absl::make_unique<DeviceInstance>(
+          rank, col_params_.instance.device_names[rank], this));
+    }
+  }
+
+  void Reduce() {
+    int done = 0;
+    mutex done_mu;
+    condition_variable done_cv;
+    for (const auto& instance : instances_) {
+      DeviceInstance* di = instance.get();
+      SchedClosure([di, &done, &done_mu, &done_cv] {
+        di->DoReduce();
+        mutex_lock l(done_mu);
+        ++done;
+        done_cv.notify_all();
+      });
+    }
+
+    mutex_lock l(done_mu);
+    while (done < instances_.size()) done_cv.wait(l);
+  }
+
+  void RunTest(int num_ranks, int tensor_length) {
+    Init(num_ranks);
+    std::vector<float> expected(tensor_length, 0.0);
+    for (int rank = 0; rank < num_ranks; ++rank) {
+      DeviceInstance* instance = instances_[rank].get();
+      instance->InitTensor(DT_FLOAT, TensorShape({tensor_length}),
+                           [&expected, rank](Tensor* t) {
+                             for (size_t i = 0; i < t->NumElements(); ++i) {
+                               float value = pow(10, rank) * i;
+                               t->flat<float>()(i) = value;
+                               expected[i] += value;
+                             }
+                           });
+    }
+    Reduce();
+    // Confirm that every rank computed the same correct value.
+    for (int i = 0; i < tensor_length; ++i) {
+      expected[i] /= num_ranks;
+    }
+    for (int rank = 0; rank < instances_.size(); ++rank) {
+      TF_ASSERT_OK(instances_[rank]->status_);
+      Tensor* dev_tensor = &instances_[rank]->tensor_;
+      Tensor actual(DT_FLOAT, TensorShape({tensor_length}));
+      Notification note;
+      Device* dev = instances_[rank]->device_;
+      auto* dev_info = dev->tensorflow_gpu_device_info();
+      dev_info->default_context->CopyDeviceTensorToCPU(
+          dev_tensor, /*tensor_name=*/"", dev, &actual,
+          [&note](const Status&) { note.Notify(); });
+      note.WaitForNotification();
+      for (int i = 0; i < tensor_length; ++i) {
+        EXPECT_FLOAT_EQ(expected[i], actual.template flat<float>()(i))
+            << "Mismatch at rank " << rank << " index " << i;
+      }
+    }
+  }
+
+  std::unique_ptr<OpKernel> GetCollectiveReduce(const CollectiveParams& params,
+                                                Tensor* input,
+                                                DeviceBase* device) {
+    mutex_lock l(mu_);
+    NodeDef node_def;
+    NodeDefBuilder builder(
+        strings::StrCat("collective_reduce_", reduce_counter_++),
+        "CollectiveReduce");
+    TF_CHECK_OK(
+        builder.Attr("T", params.instance.data_type)
+            .Attr("merge_op", "Add")
+            .Attr("final_op", "Div")
+            .Attr("group_size", params.group.group_size)
+            .Attr("group_key", params.group.group_key)
+            .Attr("instance_key", params.instance.instance_key)
+            .Attr("subdiv_offsets", params.instance.impl_details.subdiv_offsets)
+            .Input(FakeInput(params.instance.data_type))
+            .Finalize(&node_def));
+    return GetKernel(node_def, device);
+  }
+
+  class DeviceInstance {
+   public:
+    DeviceInstance(int rank, const string& device_name, NcclReducerTest* parent)
+        : parent_(parent), device_name_(device_name), rank_(rank) {
+      TF_CHECK_OK(parent_->dev_mgr_->LookupDevice(device_name_, &device_))
+          << "Could not find device " << device_name_ << " existing devices "
+          << parent_->dev_mgr_->DebugString();
+      col_params_.name = parent_->col_params_.name;
+      col_params_.default_rank = rank;
+      col_params_.group.group_key = parent_->col_params_.group.group_key;
+      col_params_.group.device_type = parent_->col_params_.group.device_type;
+      col_params_.group.group_size = parent_->col_params_.group.group_size;
+      col_params_.instance = parent->col_params_.instance;
+    }
+
+    void InitTensor(DataType dtype, const TensorShape& shape,
+                    const std::function<void(Tensor*)>& init_f) {
+      tensor_ =
+          Tensor(device_->GetAllocator(AllocatorAttributes()), dtype, shape);
+      Tensor cpu_tensor(dtype, shape);
+      init_f(&cpu_tensor);
+      VLOG(2) << "cpu_tensor " << cpu_tensor.DebugString();
+      auto* dev_info = device_->tensorflow_gpu_device_info();
+      Notification note;
+      dev_info->default_context->CopyCPUTensorToDevice(
+          &cpu_tensor, device_, &tensor_,
+          [&note](const Status&) { note.Notify(); });
+      note.WaitForNotification();
+    }
+
+    void DoReduce() {
+      col_params_.merge_op = GetAdd(device_);
+      col_params_.final_op = GetDiv(device_);
+
+      // Prepare an OpKernelContext.
+      OpKernelContext::Params op_params;
+      op_params.step_id = kStepId;
+      op_params.device = device_;
+      gtl::InlinedVector<TensorValue, 4> inputs;
+      inputs.push_back(TensorValue(&tensor_));
+      op_params.inputs = &inputs;
+      gtl::InlinedVector<AllocatorAttributes, 4> input_aa(
+          {AllocatorAttributes()});
+      op_params.input_alloc_attrs = &input_aa;
+      gtl::InlinedVector<DeviceContext*, 4> input_dc;
+      DeviceContext* dev_ctx = nullptr;
+      auto* dev_info = device_->tensorflow_gpu_device_info();
+      if (dev_info) {
+        dev_ctx = dev_info->default_context;
+        dev_ctx->Ref();
+      } else {
+        dev_ctx = new DeviceContext;
+      }
+      input_dc.push_back(dev_ctx);
+      op_params.input_device_contexts = &input_dc;
+      op_params.op_device_context = dev_ctx;
+      int forward_from = 0;
+      op_params.forward_from_array = &forward_from;
+      AllocatorAttributes generic_alloc_attr;
+      op_params.output_attr_array = &generic_alloc_attr;
+      std::unique_ptr<OpKernel> op =
+          parent_->GetCollectiveReduce(col_params_, &tensor_, device_);
+      op_params.op_kernel = op.get();
+      OpKernelContext ctx(&op_params, 1);
+
+      // We never actually execute the kernel, so we need to do the output
+      // allocation it would do, ourselves.
+      Tensor* output_tensor_ptr = nullptr;
+      TF_CHECK_OK(ctx.forward_input_or_allocate_output({0}, 0, tensor_.shape(),
+                                                       &output_tensor_ptr));
+      CHECK_EQ(output_tensor_ptr, ctx.mutable_output(0));
+
+      // Prepare a NcclReducer instance.
+      string exec_key =
+          strings::StrCat(col_params_.instance.instance_key, ":0:0");
+      NcclReducer reducer;
+      CollectiveContext col_ctx(parent_->col_exec_, parent_->dev_mgr_.get(),
+                                &ctx, &op_params, col_params_, exec_key,
+                                kStepId, &tensor_, &tensor_);
+      TF_CHECK_OK(reducer.InitializeCollectiveContext(&col_ctx));
+
+      // Run the all-reduce.
+      reducer.Run([this](Status s) { status_ = s; });
+      if (status_.ok()) {
+        CHECK(tensor_.CopyFrom(*ctx.mutable_output(0), tensor_.shape()));
+      }
+
+      dev_ctx->Unref();
+    }
+
+    NcclReducerTest* parent_;
+    string device_name_;
+    int rank_;
+    Tensor tensor_;
+    Device* device_;
+    CollectiveParams col_params_;
+    Status status_;
+  };
+
+  std::vector<std::unique_ptr<tensorflow::Device>> gpus_;
+  TestCollectiveExecutorMgr col_exec_mgr_;
+  CollectiveExecutor* col_exec_;
+  std::unique_ptr<DeviceMgr> dev_mgr_;
+  std::vector<std::unique_ptr<DeviceInstance>> instances_;
+  CollectiveParams col_params_;
+  mutex mu_;
+  int32 reduce_counter_ GUARDED_BY(mu_) = 0;
+};
+
+TEST_F(NcclReducerTest, Test2Dev16Len) { RunTest(2, 16); }
+TEST_F(NcclReducerTest, Test4Dev16Len) { RunTest(4, 16); }
+TEST_F(NcclReducerTest, Test8Dev16Len) { RunTest(8, 16); }
+TEST_F(NcclReducerTest, Test8Dev128Len) { RunTest(8, 128); }
+TEST_F(NcclReducerTest, Test8Dev1045991Len) { RunTest(8, 1048576); }
+
+}  // namespace tensorflow
+
+#endif
diff --git a/tensorflow/core/kernels/collective_ops.cc b/tensorflow/core/kernels/collective_ops.cc
index 82e2913b64afca2e0fc8c64d1c6e366f3a2d307e..23356283bb52dc4ab7f61193211072e6f95fb1f4 100644
--- a/tensorflow/core/kernels/collective_ops.cc
+++ b/tensorflow/core/kernels/collective_ops.cc
@@ -43,16 +43,21 @@ class CollectiveOpKernel : public AsyncOpKernel {
       // Call in a blockable thread because it's not guaranteed that
       // this call cannot block.
       c->env()->SchedClosure([this, c, done, col_exec]() {
-        col_exec->CompleteParamsAsync(c->device()->name(), &col_params_,
-                                      c->cancellation_manager(),
-                                      [this, c, done](const Status& s) {
-                                        if (s.ok()) {
-                                          ComputeAsync(c, done);
-                                        } else {
-                                          c->SetStatus(s);
-                                          done();
-                                        }
-                                      });
+        VLOG(1) << "CollectiveOpKernel CompleteParams for collective "
+                << col_params_.name << " device " << c->device()->name()
+                << " group " << col_params_.group.group_key << " instance "
+                << col_params_.instance.instance_key;
+        col_exec->CompleteParamsAsync(
+            c->device()->name(), &col_params_, c->cancellation_manager(),
+            [this, c, done](const Status& s) {
+              if (s.ok()) {
+                col_params_.instance.impl_details.dependencies = dependencies_;
+                ComputeAsync(c, done);
+              } else {
+                c->SetStatus(s);
+                done();
+              }
+            });
       });
       return false;
     }
@@ -60,8 +65,60 @@ class CollectiveOpKernel : public AsyncOpKernel {
   }
 
   CollectiveParams col_params_;
+  std::vector<int32> dependencies_;
 };
 
+class CollectiveGatherOpKernel : public CollectiveOpKernel {
+ public:
+  explicit CollectiveGatherOpKernel(OpKernelConstruction* c)
+      : CollectiveOpKernel(c) {
+    col_params_.instance.type = GATHER_COLLECTIVE;
+    OP_REQUIRES_OK(c, c->GetAttr("group_size", &col_params_.group.group_size));
+    OP_REQUIRES_OK(c, c->GetAttr("group_key", &col_params_.group.group_key));
+    OP_REQUIRES_OK(
+        c, c->GetAttr("instance_key", &col_params_.instance.instance_key));
+    OP_REQUIRES_OK(c, c->GetAttr("T", &col_params_.instance.data_type));
+    OP_REQUIRES_OK(c, c->GetAttr("shape", &col_params_.instance.shape));
+    const NodeDef& real_node = c->def();
+    col_params_.name = strings::StrCat(real_node.name(), ": Gather");
+    col_params_.group.device_type = c->device_type();
+  }
+
+  void ComputeAsync(OpKernelContext* c, DoneCallback done) override {
+    CollectiveExecutor* col_exec = c->collective_executor();
+    OP_REQUIRES_ASYNC(
+        c, col_exec,
+        errors::Internal(
+            "Failed to get CollectiveExecutor from OpKernelContext for Op ",
+            col_params_.name),
+        done);
+    // Allocate output on the first pass through this function.  This must be
+    // done immediately, while we're still in the executor thread.  Otherwise
+    // the memory is not guaranteed to be unused by any concurrently executing
+    // GPU kernel.
+    if (c->mutable_output(0) == nullptr) {
+      // Allocate the output tensor.
+      Tensor* output = nullptr;
+      OP_REQUIRES_OK_ASYNC(
+          c, c->allocate_output(0, col_params_.instance.shape, &output), done);
+    }
+    if (!CanProceedWithCompute(c, col_exec, done)) return;
+    auto actual_done = [c, done](const Status& s) {
+      OP_REQUIRES_OK_ASYNC(c, s, done);
+      done();
+    };
+    col_exec->ExecuteAsync(c, col_params_, GetCollectiveKey(c), actual_done);
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(CollectiveGatherOpKernel);
+};
+
+REGISTER_KERNEL_BUILDER(Name("CollectiveGather").Device(DEVICE_CPU),
+                        CollectiveGatherOpKernel);
+REGISTER_KERNEL_BUILDER(Name("CollectiveGather").Device(DEVICE_GPU),
+                        CollectiveGatherOpKernel);
+
 class CollectiveReduceOpKernel : public CollectiveOpKernel {
  public:
   explicit CollectiveReduceOpKernel(OpKernelConstruction* c)
@@ -87,6 +144,7 @@ class CollectiveReduceOpKernel : public CollectiveOpKernel {
                     "final_op must be one of {\"Id\", \"Div\"} but got ",
                     final_op_name));
     OP_REQUIRES_OK(c, c->GetAttr("T", &col_params_.instance.data_type));
+    OP_REQUIRES_OK(c, c->GetAttr("wait_for", &dependencies_));
 
     const NodeDef& real_node = c->def();
     col_params_.name = strings::StrCat(real_node.name(), ": Reduce(",
@@ -146,10 +204,18 @@ class CollectiveReduceOpKernel : public CollectiveOpKernel {
       col_params_.instance.shape = c->input(0).shape();
     }
     if (!CanProceedWithCompute(c, col_exec, done)) return;
-    auto actual_done = [c, col_exec, done](const Status& s) {
+
+    int32 instance_key = col_params_.instance.instance_key;
+    auto actual_done = [c, instance_key, done](const Status& s) {
       OP_REQUIRES_OK_ASYNC(c, s, done);
       done();
+      VLOG(1) << "CollectiveReduceKernel ExecuteAsync done for device "
+              << c->device()->name() << " instance " << instance_key;
     };
+    VLOG(1) << "CollectiveReduceKernel ExecuteAsync start for collective "
+            << col_params_.name << " device " << c->device()->name()
+            << " group " << col_params_.group.group_key << " instance "
+            << instance_key;
     col_exec->ExecuteAsync(c, col_params_, GetCollectiveKey(c), actual_done);
   }
 
@@ -208,10 +274,17 @@ class CollectiveBcastSendOpKernel : public CollectiveOpKernel {
                          " does not match shape of input"),
         done);
 
-    auto actual_done = [c, col_exec, done](const Status& s) {
+    int32 instance_key = col_params_.instance.instance_key;
+    auto actual_done = [c, instance_key, done](const Status& s) {
       OP_REQUIRES_OK_ASYNC(c, s, done);
       done();
+      VLOG(1) << "CollectiveBcastSendOpKernel ExecuteAsync done for device "
+              << c->device()->name() << " instance " << instance_key;
     };
+    VLOG(1) << "CollectiveBcastSendOpKernel ExecuteAsync start for collective "
+            << col_params_.name << " device " << c->device()->name()
+            << " group " << col_params_.group.group_key << " instance "
+            << instance_key;
     col_exec->ExecuteAsync(c, col_params_, GetCollectiveKey(c), actual_done);
   }
 
@@ -263,10 +336,17 @@ class CollectiveBcastRecvOpKernel : public CollectiveOpKernel {
     }
     if (!CanProceedWithCompute(c, col_exec, done)) return;
 
-    auto actual_done = [c, col_exec, done](const Status& s) {
+    int32 instance_key = col_params_.instance.instance_key;
+    auto actual_done = [c, instance_key, done](const Status& s) {
       OP_REQUIRES_OK_ASYNC(c, s, done);
       done();
+      VLOG(1) << "CollectiveBcastRecvOpKernel ExecuteAsync done for device "
+              << c->device()->name() << " instance " << instance_key;
     };
+    VLOG(1) << "CollectiveBcastRecvOpKernel ExecuteAsync start for collective "
+            << col_params_.name << " device " << c->device()->name()
+            << " group " << col_params_.group.group_key << " instance "
+            << instance_key;
     col_exec->ExecuteAsync(c, col_params_, GetCollectiveKey(c), actual_done);
   }
 
diff --git a/tensorflow/core/kernels/concat_lib_gpu.cc b/tensorflow/core/kernels/concat_lib_gpu.cc
index 93e392d3032405ea848bd2f147653c9a5c7a1818..853d7c3133d48a5ec4690403eab16d2eaa776700 100644
--- a/tensorflow/core/kernels/concat_lib_gpu.cc
+++ b/tensorflow/core/kernels/concat_lib_gpu.cc
@@ -115,7 +115,9 @@ void ConcatGPU(
 TF_CALL_GPU_NUMBER_TYPES(REGISTER);
 TF_CALL_complex64(REGISTER);
 TF_CALL_complex128(REGISTER);
+TF_CALL_int32(REGISTER);  // Needed for TensorLists.
 TF_CALL_int64(REGISTER);
+TF_CALL_int16(REGISTER);
 TF_CALL_bfloat16(REGISTER);
 TF_CALL_bool(REGISTER);
 TF_CALL_uint8(REGISTER);
diff --git a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
index a561d918bd36f711d1b813dfb533ec6d690af8ee..ae828b5bf48a4b657211cdb7efd62559ad364037 100644
--- a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
+++ b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
@@ -201,7 +201,9 @@ void ConcatGPUImpl(const Eigen::GpuDevice& gpu_device,
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPUCONCAT32);
 TF_CALL_complex64(REGISTER_GPUCONCAT32);
 TF_CALL_complex128(REGISTER_GPUCONCAT32);
+TF_CALL_int32(REGISTER_GPUCONCAT32);  // Needed for TensorLists.
 TF_CALL_int64(REGISTER_GPUCONCAT32);
+TF_CALL_int16(REGISTER_GPUCONCAT32);
 TF_CALL_uint8(REGISTER_GPUCONCAT32);
 REGISTER_GPUCONCAT32(bfloat16);
 REGISTER_GPUCONCAT32(bool);
@@ -209,7 +211,9 @@ REGISTER_GPUCONCAT32(bool);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPUCONCAT64);
 TF_CALL_complex64(REGISTER_GPUCONCAT64);
 TF_CALL_complex128(REGISTER_GPUCONCAT64);
+TF_CALL_int32(REGISTER_GPUCONCAT64);  // Needed for TensorLists.
 TF_CALL_int64(REGISTER_GPUCONCAT64);
+TF_CALL_int16(REGISTER_GPUCONCAT64);
 TF_CALL_uint8(REGISTER_GPUCONCAT64);
 REGISTER_GPUCONCAT64(bfloat16);
 REGISTER_GPUCONCAT64(bool);
@@ -217,7 +221,9 @@ REGISTER_GPUCONCAT64(bool);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU32);
 TF_CALL_complex64(REGISTER_GPU32);
 TF_CALL_complex128(REGISTER_GPU32);
+TF_CALL_int32(REGISTER_GPU32);  // Needed for TensorLists.
 TF_CALL_int64(REGISTER_GPU32);
+TF_CALL_int16(REGISTER_GPU32);
 TF_CALL_uint8(REGISTER_GPU32);
 REGISTER_GPU32(bfloat16);
 REGISTER_GPU32(bool);
@@ -225,7 +231,9 @@ REGISTER_GPU32(bool);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU64);
 TF_CALL_complex64(REGISTER_GPU64);
 TF_CALL_complex128(REGISTER_GPU64);
+TF_CALL_int32(REGISTER_GPU64);  // Needed for TensorLists.
 TF_CALL_int64(REGISTER_GPU64);
+TF_CALL_int16(REGISTER_GPU64);
 TF_CALL_uint8(REGISTER_GPU64);
 REGISTER_GPU64(bfloat16);
 REGISTER_GPU64(bool);
diff --git a/tensorflow/core/kernels/concat_op.cc b/tensorflow/core/kernels/concat_op.cc
index ff6298351761c84bedd117e125f53b2166cd104f..72d8b45dd96b912f3d94f4c0f0495c82de53e4d4 100644
--- a/tensorflow/core/kernels/concat_op.cc
+++ b/tensorflow/core/kernels/concat_op.cc
@@ -18,16 +18,16 @@ limitations under the License.
 #include <limits>
 #include <vector>
 
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/conditional_accumulator_base.h b/tensorflow/core/kernels/conditional_accumulator_base.h
index 4a5ec6f0fb3c7272dd0684da3ce56e787848dd7d..2618ffbb099cd1619de826f6b0e4e5ae20982197 100644
--- a/tensorflow/core/kernels/conditional_accumulator_base.h
+++ b/tensorflow/core/kernels/conditional_accumulator_base.h
@@ -68,7 +68,7 @@ class ConditionalAccumulatorBase : public ResourceBase {
 
   const DataType& dtype() const { return dtype_; }
 
-  string DebugString() override { return "A conditional accumulator"; }
+  string DebugString() const override { return "A conditional accumulator"; }
 
   // SetGlobalStep is a modifier method for current_global_step.
   // It returns an InvalidArgument error if the new_global_step is less than
diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc
index 426c404f4388d4366dec4cec84c01accb5ec6cd6..5ff428dd312c6935adc56a0dbcdef76b77cb287b 100644
--- a/tensorflow/core/kernels/constant_op.cc
+++ b/tensorflow/core/kernels/constant_op.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/constant_op.h"
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -32,7 +33,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/variant_op_registry.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/fill_functor.h"
 #include "tensorflow/core/platform/macros.h"
 
@@ -47,7 +47,7 @@ namespace {
 std::unique_ptr<const NodeDef> StripTensorDataFromNodeDef(
     OpKernelConstruction* ctx) {
 #ifndef __ANDROID__
-  DCHECK_EQ(NodeDef::descriptor()->field_count(), 5)
+  DCHECK_EQ(NodeDef::descriptor()->field_count(), 6)
       << "The NodeDef format has changed, and the attr-stripping code may need "
       << "to be updated.";
 #endif
@@ -61,6 +61,7 @@ std::unique_ptr<const NodeDef> StripTensorDataFromNodeDef(
   // attrs that affect the cardinality of list-typed inputs and outputs, so it
   // is safe to drop other attrs from the NodeDef.
   AddNodeAttr("dtype", ctx->output_type(0), ret);
+  MergeDebugInfo(original, ret);
   return std::unique_ptr<const NodeDef>(ret);
 }
 
@@ -261,7 +262,8 @@ class ZerosLikeOp : public OpKernel {
       const Variant& v = input.scalar<Variant>()();
       // DT_VARIANT tensors must be allocated on CPU since they wrap C++
       // objects which can not be efficiently represented in GPU memory.
-      Tensor out(cpu_allocator(), DT_VARIANT, TensorShape({}));
+      int numa_node = DeviceNumaNode(ctx->device());
+      Tensor out(cpu_allocator(numa_node), DT_VARIANT, TensorShape({}));
       Variant* out_v = &(out.scalar<Variant>()());
       OP_REQUIRES_OK(ctx, UnaryOpVariant<Device>(
                               ctx, ZEROS_LIKE_VARIANT_UNARY_OP, v, out_v));
diff --git a/tensorflow/core/kernels/control_flow_ops.cc b/tensorflow/core/kernels/control_flow_ops.cc
index 36def4a53065e2c6ac68a8b67818096012104753..c0981805bbe8ec102aecbe6e019596f73ecf97e7 100644
--- a/tensorflow/core/kernels/control_flow_ops.cc
+++ b/tensorflow/core/kernels/control_flow_ops.cc
@@ -267,6 +267,7 @@ TF_CALL_QUANTIZED_TYPES(REGISTER_GPU_REF_KERNEL);
 REGISTER_GPU_KERNEL(bool);
 REGISTER_GPU_REF_KERNEL(bool);
 REGISTER_GPU_KERNEL(uint64);
+TF_CALL_variant(REGISTER_GPU_KERNEL);
 
 #undef REGISTER_GPU_KERNEL
 #undef REGISTER_GPU_REF_KERNEL
@@ -365,6 +366,7 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_REF_KERNEL);
 REGISTER_GPU_KERNEL(bool);
 REGISTER_GPU_REF_KERNEL(bool);
+TF_CALL_variant(REGISTER_GPU_KERNEL);
 
 #undef REGISTER_GPU_KERNEL
 #undef REGISTER_GPU_REF_KERNEL
@@ -460,6 +462,7 @@ TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_REF_KERNEL);
 REGISTER_GPU_KERNEL(bool);
 REGISTER_GPU_REF_KERNEL(bool);
+TF_CALL_variant(REGISTER_GPU_KERNEL);
 
 #undef REGISTER_GPU_KERNEL
 #undef REGISTER_GPU_REF_KERNEL
@@ -514,6 +517,7 @@ REGISTER_SYCL_HOST_KERNEL(string);
 
 REGISTER_GPU_HOST_KERNEL(int32);
 REGISTER_GPU_HOST_KERNEL(string);
+REGISTER_GPU_HOST_KERNEL(ResourceHandle);
 
 #undef REGISTER_GPU_HOST_KERNEL
 
@@ -540,6 +544,7 @@ REGISTER_KERNEL_BUILDER(Name("RefNextIteration").Device(DEVICE_CPU),
 
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_KERNEL);
 REGISTER_GPU_KERNEL(bool);
+TF_CALL_variant(REGISTER_GPU_KERNEL);
 
 #undef REGISTER_GPU_KERNEL
 
@@ -562,6 +567,7 @@ REGISTER_GPU_KERNEL(bool);
 
 REGISTER_GPU_HOST_KERNEL(int32);
 REGISTER_GPU_HOST_KERNEL(string);
+REGISTER_GPU_HOST_KERNEL(ResourceHandle);
 
 #undef REGISTER_GPU_HOST_KERNEL
 
diff --git a/tensorflow/core/kernels/conv_grad_filter_ops.cc b/tensorflow/core/kernels/conv_grad_filter_ops.cc
index bc30da40991b56adc136bbe6115db16c00a04666..5eaddec76843cea6085700ce98abd36b844990bf 100644
--- a/tensorflow/core/kernels/conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_filter_ops.cc
@@ -102,6 +102,7 @@ struct LaunchConv2DBackpropFilterOp<CPUDevice, T> {
                   const Tensor& out_backprop, const Tensor& input,
                   int row_dilation, int col_dilation, int row_stride,
                   int col_stride, const Padding& padding,
+                  const std::vector<int64>& explicit_paddings,
                   Tensor* filter_backprop, TensorFormat data_format) {
     const CPUDevice& d = ctx->eigen_device<CPUDevice>();
     functor::SpatialConvolutionBackwardFilter<CPUDevice, T>()(
@@ -180,120 +181,6 @@ struct LaunchXsmmBackwardFilter<CPUDevice, float> {
 };
 #endif
 
-template <typename Device, class T>
-class Conv2DFastBackpropFilterOp : public OpKernel {
- public:
-  explicit Conv2DFastBackpropFilterOp(OpKernelConstruction* context)
-      : OpKernel(context) {
-    string data_format;
-    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
-    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
-                errors::InvalidArgument("Invalid data format"));
-    OP_REQUIRES(context, data_format_ == FORMAT_NHWC,
-                errors::InvalidArgument(
-                    "Conv2DFastBackpropFilterOp only supports NHWC."));
-    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
-    OP_REQUIRES(context, strides_.size() == 4,
-                errors::InvalidArgument("Sliding window strides field must "
-                                        "specify 4 dimensions"));
-    OP_REQUIRES(
-        context, (strides_[0] == 1 && strides_[3] == 1),
-        errors::InvalidArgument("Current implementation does not yet support "
-                                "strides in the batch and depth dimensions."));
-    OP_REQUIRES(context, strides_[1] > 0 && strides_[2] > 0,
-                errors::InvalidArgument(
-                    "Row and column strides should be larger than 0."));
-    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
-    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
-    OP_REQUIRES(context, dilations_.size() == 4,
-                errors::InvalidArgument("Sliding window dilations field must "
-                                        "specify 4 dimensions"));
-    OP_REQUIRES(context, (dilations_[0] == 1 && dilations_[3] == 1),
-                errors::InvalidArgument(
-                    "Current implementation does not yet support "
-                    "dilations in the batch and depth dimensions."));
-    // TODO(yangzihao): Add a CPU implementation for dilated convolution.
-    OP_REQUIRES(context, (dilations_[1] == 1 && dilations_[2] == 1),
-                errors::InvalidArgument(
-                    "Current Eigen and libxsmm implementations do not "
-                    "yet support dilation rates larger than 1."));
-  }
-
-  void Compute(OpKernelContext* context) override {
-    const Tensor& input = context->input(0);
-    const Tensor& filter_sizes = context->input(1);
-    const Tensor& out_backprop = context->input(2);
-    OP_REQUIRES(
-        context, TensorShapeUtils::IsVector(filter_sizes.shape()),
-        errors::InvalidArgument(
-            "Conv2DBackpropFilter: filter_sizes input must be 1-dim, not ",
-            filter_sizes.dims()));
-    TensorShape filter_shape;
-    OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape(
-                                filter_sizes.vec<int32>(), &filter_shape));
-
-    ConvBackpropDimensions dims;
-    OP_REQUIRES_OK(
-        context,
-        ConvBackpropComputeDimensions(
-            type_string(), /*num_spatial_dims=*/2, input.shape(), filter_shape,
-            out_backprop.shape(), strides_, padding_, data_format_, &dims));
-
-    Tensor* filter_backprop = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, filter_shape, &filter_backprop));
-
-    // If there is nothing to compute, return.
-    if (filter_shape.num_elements() == 0) {
-      return;
-    }
-
-#if defined TENSORFLOW_USE_LIBXSMM_CONVOLUTIONS && \
-    defined TENSORFLOW_USE_LIBXSMM_BACKWARD_CONVOLUTIONS
-    int64 pad_top, pad_bottom;
-    int64 pad_left, pad_right;
-    OP_REQUIRES_OK(
-        context,
-        GetWindowedOutputSizeVerbose(
-            dims.spatial_dims[0].input_size, dims.spatial_dims[0].filter_size,
-            dims.spatial_dims[0].stride, padding_,
-            &dims.spatial_dims[0].output_size, &pad_top, &pad_bottom));
-    OP_REQUIRES_OK(
-        context,
-        GetWindowedOutputSizeVerbose(
-            dims.spatial_dims[1].input_size, dims.spatial_dims[1].filter_size,
-            dims.spatial_dims[1].stride, padding_,
-            &dims.spatial_dims[1].output_size, &pad_left, &pad_right));
-
-    if (pad_left == pad_right && pad_top == pad_bottom) {
-      if (LaunchXsmmBackwardFilter<Device, T>()(
-              context, context->eigen_device<Device>(), input.tensor<T, 4>(),
-              filter_backprop->tensor<T, 4>(), out_backprop.tensor<T, 4>(),
-              dims.spatial_dims[0].input_size, dims.spatial_dims[1].input_size,
-              static_cast<int>(dims.spatial_dims[0].stride),
-              static_cast<int>(dims.spatial_dims[1].stride),
-              static_cast<int>(pad_top), static_cast<int>(pad_left),
-              data_format_)) {
-        return;
-      }
-    }
-#endif
-
-    LaunchConv2DBackpropFilterOp<Device, T>()(
-        context, false, false, out_backprop, input,
-        /*row_dilation=*/1, /*col_dilation=*/1, dims.spatial_dims[0].stride,
-        dims.spatial_dims[1].stride, padding_, filter_backprop, data_format_);
-  }
-
- private:
-  std::vector<int32> dilations_;
-  std::vector<int32> strides_;
-  Padding padding_;
-  TensorFormat data_format_;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(Conv2DFastBackpropFilterOp);
-};
-
 // Based on implementation written by Yangqing Jia (jiayq).
 template <typename Device, class T>
 class Conv2DCustomBackpropFilterOp : public OpKernel {
@@ -319,6 +206,15 @@ class Conv2DCustomBackpropFilterOp : public OpKernel {
                 errors::InvalidArgument(
                     "Row and column strides should be larger than 0."));
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+    OP_REQUIRES(
+        context, padding_ != Padding::EXPLICIT,
+        errors::Unimplemented("Current CPU implementation does not support "
+                              "EXPLICIT padding yet."));
+    std::vector<int64> explicit_paddings;
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("explicit_paddings", &explicit_paddings));
+    OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings,
+                                              /*num_dims=*/4, data_format_));
     OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
     OP_REQUIRES(context, dilations_.size() == 4,
                 errors::InvalidArgument("Sliding window dilations field must "
@@ -517,12 +413,7 @@ class Conv2DCustomBackpropFilterOp : public OpKernel {
                               .Device(DEVICE_CPU)                             \
                               .Label("custom")                                \
                               .TypeConstraint<T>("T"),                        \
-                          Conv2DCustomBackpropFilterOp<CPUDevice, T>);        \
-  REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter")                        \
-                              .Device(DEVICE_CPU)                             \
-                              .Label("eigen_tensor")                          \
-                              .TypeConstraint<T>("T"),                        \
-                          Conv2DFastBackpropFilterOp<CPUDevice, T>);
+                          Conv2DCustomBackpropFilterOp<CPUDevice, T>);
 
 TF_CALL_half(REGISTER_CPU_KERNELS);
 TF_CALL_float(REGISTER_CPU_KERNELS);
@@ -587,6 +478,10 @@ class Conv2DSlowBackpropFilterOp : public OpKernel {
     use_cudnn_ &= CanUseCudnn();
     cudnn_use_autotune_ = CudnnUseAutotune();
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("explicit_paddings", &explicit_paddings_));
+    OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings_,
+                                              /*num_dims=*/4, data_format_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -626,13 +521,14 @@ class Conv2DSlowBackpropFilterOp : public OpKernel {
 
     launcher_(context, use_cudnn_, cudnn_use_autotune_, out_backprop, input,
               dilation_rows, dilation_cols, stride_rows, stride_cols, padding_,
-              filter_backprop, data_format_);
+              explicit_paddings_, filter_backprop, data_format_);
   }
 
  private:
   std::vector<int32> dilations_;
   std::vector<int32> strides_;
   Padding padding_;
+  std::vector<int64> explicit_paddings_;
   bool use_cudnn_;
   TensorFormat data_format_;
   LaunchConv2DBackpropFilterOp<Device, T> launcher_;
@@ -646,7 +542,8 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
     OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
     const Tensor& out_backprop, const Tensor& input, int row_dilation,
     int col_dilation, int row_stride, int col_stride, const Padding& padding,
-    Tensor* filter_backprop, TensorFormat data_format) {
+    const std::vector<int64>& explicit_paddings, Tensor* filter_backprop,
+    TensorFormat data_format) {
   using se::dnn::AlgorithmConfig;
   using se::dnn::AlgorithmDesc;
   using se::dnn::ProfileResult;
@@ -661,35 +558,33 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
   TensorShape filter_shape = filter_backprop->shape();
 
   ConvBackpropDimensions dims;
-  OP_REQUIRES_OK(ctx, ConvBackpropComputeDimensionsV2(
-                          "Conv2DSlowBackpropFilter", /*num_spatial_dims=*/2,
-                          input.shape(), filter_shape, out_backprop.shape(),
-                          dilations, strides, padding, data_format, &dims));
-
-  // TODO(yangzihao): The padding computations should be done in
-  // GetWindowedOutputSize() functions.
-  const int padding_rows =
-      (padding == VALID)
-          ? 0
-          : std::max<int>(0, (dims.spatial_dims[0].output_size - 1) *
-                                     dims.spatial_dims[0].stride +
-                                 (dims.spatial_dims[0].filter_size - 1) *
-                                     dims.spatial_dims[0].dilation +
-                                 1 - dims.spatial_dims[0].input_size);
-  const int padding_cols =
-      (padding == VALID)
-          ? 0
-          : std::max<int>(0, (dims.spatial_dims[1].output_size - 1) *
-                                     dims.spatial_dims[1].stride +
-                                 (dims.spatial_dims[1].filter_size - 1) *
-                                     dims.spatial_dims[1].dilation +
-                                 1 - dims.spatial_dims[1].input_size);
-
-  // TODO(zhengxq): cuDNN only supports equal padding on both sides, so only
-  // calling it when that is true. Remove this check when (if?) cuDNN starts
-  // supporting different padding.
-  bool rows_odd = (padding_rows % 2 != 0);
-  bool cols_odd = (padding_cols % 2 != 0);
+  OP_REQUIRES_OK(
+      ctx, ConvBackpropComputeDimensionsV2(
+               "Conv2DSlowBackpropFilter", /*num_spatial_dims=*/2,
+               input.shape(), filter_shape, out_backprop.shape(), dilations,
+               strides, padding, explicit_paddings, data_format, &dims));
+
+  int64 padding_top = -1, padding_bottom = -1;
+  int64 padding_left = -1, padding_right = -1;
+  if (padding == EXPLICIT) {
+    GetExplicitPaddingForDim(explicit_paddings, data_format, 'H', &padding_top,
+                             &padding_bottom);
+    GetExplicitPaddingForDim(explicit_paddings, data_format, 'W', &padding_left,
+                             &padding_right);
+  }
+  int64 expected_out_rows, expected_out_cols;
+  // The function is guaranteed to succeed because we checked the output and
+  // padding was valid earlier.
+  TF_CHECK_OK(GetWindowedOutputSizeVerboseV2(
+      dims.spatial_dims[0].input_size, dims.spatial_dims[0].filter_size,
+      row_dilation, row_stride, padding, &expected_out_rows, &padding_top,
+      &padding_bottom));
+  DCHECK_EQ(dims.spatial_dims[0].output_size, expected_out_rows);
+  TF_CHECK_OK(GetWindowedOutputSizeVerboseV2(
+      dims.spatial_dims[1].input_size, dims.spatial_dims[1].filter_size,
+      col_dilation, col_stride, padding, &expected_out_cols, &padding_left,
+      &padding_right));
+  DCHECK_EQ(dims.spatial_dims[1].output_size, expected_out_cols);
 
   auto* stream = ctx->op_device_context()->stream();
   OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));
@@ -711,7 +606,7 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
       dims.spatial_dims[0].filter_size == 1 &&
       dims.spatial_dims[1].filter_size == 1 && !is_grouped_convolution &&
       dims.spatial_dims[0].stride == 1 && dims.spatial_dims[1].stride == 1 &&
-      data_format == FORMAT_NHWC) {
+      data_format == FORMAT_NHWC && (padding == VALID || padding == SAME)) {
     const uint64 m = dims.in_depth;
     const uint64 k = dims.batch_size * dims.spatial_dims[0].input_size *
                      dims.spatial_dims[1].input_size;
@@ -779,31 +674,43 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
     return;
   }
 
+  const int64 common_padding_rows = std::min(padding_top, padding_bottom);
+  const int64 common_padding_cols = std::min(padding_left, padding_right);
   Tensor compatible_input;
-  if (rows_odd || cols_odd) {
-    // If a padding dimension is odd, we have one more element on the right
-    // side or the bottom side. This is unsupported in cudnn. Therefore,
-    // we pad that extra element and make it compatible.
+  if (padding_top != padding_bottom || padding_left != padding_right) {
+    // Pad the input in the same way we did during the forward pass, so that
+    // cuDNN receives the same input during the backward pass function as it did
+    // during the forward pass function.
+    const int64 padding_rows_diff = std::abs(padding_bottom - padding_top);
+    const int64 padding_cols_diff = std::abs(padding_right - padding_left);
+    const int64 new_in_rows =
+        dims.spatial_dims[0].input_size + padding_rows_diff;
+    const int64 new_in_cols =
+        dims.spatial_dims[1].input_size + padding_cols_diff;
+    const int64 input_pad_top = padding_top - common_padding_rows;
+    const int64 input_pad_bottom = padding_bottom - common_padding_rows;
+    const int64 input_pad_left = padding_left - common_padding_cols;
+    const int64 input_pad_right = padding_right - common_padding_cols;
     OP_REQUIRES_OK(
         ctx, ctx->allocate_temp(
                  DataTypeToEnum<T>::value,
-                 ShapeFromFormat(data_format, dims.batch_size,
-                                 dims.spatial_dims[0].input_size + rows_odd,
-                                 dims.spatial_dims[1].input_size + cols_odd,
-                                 dims.in_depth),
+                 ShapeFromFormat(data_format, dims.batch_size, new_in_rows,
+                                 new_in_cols, dims.in_depth),
                  &compatible_input));
 
     functor::PadInput<GPUDevice, T, int, 4>()(
         ctx->template eigen_device<GPUDevice>(), To32Bit(input.tensor<T, 4>()),
-        {{0, 0}}, {{rows_odd, cols_odd}},
+        {{static_cast<int>(input_pad_top), static_cast<int>(input_pad_left)}},
+        {{static_cast<int>(input_pad_bottom),
+          static_cast<int>(input_pad_right)}},
         To32Bit(compatible_input.tensor<T, 4>()), data_format);
   } else {
     compatible_input = input;
   }
 
-  CHECK(padding_rows >= 0 && padding_cols >= 0)
-      << "Negative row or col paddings: (" << padding_rows << ", "
-      << padding_cols << ")";
+  CHECK(common_padding_rows >= 0 && common_padding_cols >= 0)  // Crash OK
+      << "Negative row or col paddings: (" << common_padding_rows << ", "
+      << common_padding_cols << ")";
   se::dnn::BatchDescriptor input_desc;
   input_desc.set_count(dims.batch_size)
       .set_height(GetTensorDim(compatible_input, data_format, 'H'))
@@ -826,8 +733,8 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
       .set_horizontal_dilation_rate(dims.spatial_dims[1].dilation)
       .set_vertical_filter_stride(dims.spatial_dims[0].stride)
       .set_horizontal_filter_stride(dims.spatial_dims[1].stride)
-      .set_zero_padding_height(padding_rows / 2)
-      .set_zero_padding_width(padding_cols / 2)
+      .set_zero_padding_height(common_padding_rows)
+      .set_zero_padding_width(common_padding_cols)
       .set_group_count(dims.in_depth / filter_shape.dim_size(2));
 
   // NOTE(zhengxq):
@@ -903,7 +810,7 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
   auto input_ptr = AsDeviceMemory(transformed_input.template flat<T>().data(),
                                   transformed_input.template flat<T>().size());
 
-  static int64 ConvolveBackwardFilterScratchSize = GetCudnnWorkspaceLimit(
+  static int64 ConvolveBackwardFilterScratchSize = GetDnnWorkspaceLimit(
       "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32  // 4GB by default
   );
   int device_id = stream->parent()->device_ordinal();
@@ -922,8 +829,8 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
         dims.spatial_dims[1].dilation}},   // dilation_cols
       {{dims.spatial_dims[0].stride,       // stride_rows
         dims.spatial_dims[1].stride}},     // stride_cols
-      {{padding_rows,                      // padding_rows
-        padding_cols}},                    // padding_cols
+      {{common_padding_rows,               // padding_rows
+        common_padding_cols}},             // padding_cols
       dtype,                               // tensor datatype
       device_id,                           // device_id
   };
@@ -939,8 +846,8 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
     for (auto profile_algorithm : algorithms) {
       // TODO(zhengxq): profile each algorithm multiple times to better
       // accuracy.
-      CudnnScratchAllocator scratch_allocator(ConvolveBackwardFilterScratchSize,
-                                              ctx);
+      DnnScratchAllocator scratch_allocator(ConvolveBackwardFilterScratchSize,
+                                            ctx);
       ProfileResult profile_result;
       bool cudnn_launch_status =
           stream
@@ -977,8 +884,7 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
     AutoTuneConvBwdFilter::GetInstance()->Insert(conv_parameters,
                                                  algorithm_config);
   }
-  CudnnScratchAllocator scratch_allocator(ConvolveBackwardFilterScratchSize,
-                                          ctx);
+  DnnScratchAllocator scratch_allocator(ConvolveBackwardFilterScratchSize, ctx);
   bool cudnn_launch_status =
       stream
           ->ThenConvolveBackwardFilterWithAlgorithm(
@@ -1063,6 +969,7 @@ REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter")
                         Conv2DSlowBackpropFilterOp<GPUDevice, Eigen::half>);
 
 // To be used inside depthwise_conv_grad_op.cc.
+// TODO(reedwm): Move this and the definition to depthwise_conv_grad_op.cc.
 template struct LaunchConv2DBackpropFilterOp<GPUDevice, float>;
 template struct LaunchConv2DBackpropFilterOp<GPUDevice, Eigen::half>;
 template struct LaunchConv2DBackpropFilterOp<GPUDevice, double>;
diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc
index e06af15f2fc5558e9810c3da525fbf3cb385e893..a988f63a6c629303fe30abe994e8941b99ac5d4d 100644
--- a/tensorflow/core/kernels/conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_input_ops.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include <algorithm>
 #include <vector>
 
+#include "absl/base/dynamic_annotations.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -106,8 +107,9 @@ struct LaunchConv2DBackpropInputOp<CPUDevice, T> {
   void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
                   const Tensor& out_backprop, const Tensor& filter,
                   int row_dilation, int col_dilation, int row_stride,
-                  int col_stride, const Padding& padding, Tensor* in_backprop,
-                  TensorFormat data_format) {
+                  int col_stride, const Padding& padding,
+                  const std::vector<int64>& explicit_paddings,
+                  Tensor* in_backprop, TensorFormat data_format) {
     const CPUDevice& d = ctx->eigen_device<CPUDevice>();
     functor::SpatialConvolutionBackwardInput<CPUDevice, T>()(
         d, in_backprop->tensor<T, 4>(), filter.tensor<T, 4>(),
@@ -183,120 +185,79 @@ struct LaunchXsmmBackwardInputConvolution<CPUDevice, float> {
 };
 #endif
 
-template <typename Device, class T>
-class Conv2DFastBackpropInputOp : public OpKernel {
- public:
-  explicit Conv2DFastBackpropInputOp(OpKernelConstruction* context)
-      : OpKernel(context) {
-    string data_format;
-    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
-    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
-                errors::InvalidArgument("Invalid data format"));
-    OP_REQUIRES(context, data_format_ == FORMAT_NHWC,
-                errors::InvalidArgument(
-                    "Eigen Conv2DFastBackpropInputOp only supports NHWC."));
-    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
-    OP_REQUIRES(context, strides_.size() == 4,
-                errors::InvalidArgument("Sliding window strides field must "
-                                        "specify 4 dimensions"));
-    OP_REQUIRES(
-        context, (strides_[0] == 1 && strides_[3] == 1),
-        errors::InvalidArgument("Current implementation does not yet support "
-                                "strides in the batch and depth dimensions."));
-    OP_REQUIRES(context, strides_[1] > 0 && strides_[2] > 0,
-                errors::InvalidArgument(
-                    "Row and column strides should be larger than 0."));
-    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
-    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
-    OP_REQUIRES(context, dilations_.size() == 4,
-                errors::InvalidArgument("Sliding window dilations field must "
-                                        "specify 4 dimensions"));
-    OP_REQUIRES(context, (dilations_[0] && dilations_[3]),
-                errors::InvalidArgument(
-                    "Current implementation does not yet support "
-                    "dilations in the batch and depth dimensions."));
-    // TODO(yangzihao): Add a CPU implementation for dilated convolution.
-    OP_REQUIRES(context, (dilations_[1] == 1 && dilations_[2] == 1),
-                errors::InvalidArgument(
-                    "Current Eigen and libxsmm implementations do not "
-                    "yet support dilation rates larger than 1."));
+template <typename T>
+struct Conv2DCustomBackpropInputMatMulFunctor {
+  using MatrixMap = Eigen::Map<
+      Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
+  using ConstMatrixMap = Eigen::Map<
+      const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
+
+  void operator()(OpKernelContext* ctx, const T* out_data, const T* filter_data,
+                  const int filter_total_size, const int output_image_size,
+                  const int dims_out_depth, T* im2col_buf) {
+    // Compute gradient into 'im2col_buf'.
+    MatrixMap C(im2col_buf, output_image_size, filter_total_size);
+
+    ConstMatrixMap A(out_data, output_image_size, dims_out_depth);
+    ConstMatrixMap B(filter_data, filter_total_size, dims_out_depth);
+
+    C.noalias() = A * B.transpose();
   }
+};
 
-  void Compute(OpKernelContext* context) override {
-    const Tensor& input_sizes = context->input(0);
-    const Tensor& filter = context->input(1);
-    const Tensor& out_backprop = context->input(2);
-    OP_REQUIRES(
-        context, TensorShapeUtils::IsVector(input_sizes.shape()),
-        errors::InvalidArgument(
-            "Conv2DBackpropInput: input_sizes input must be 1-dim, not ",
-            input_sizes.dims()));
-    TensorShape input_shape;
-    OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape(
-                                input_sizes.vec<int32>(), &input_shape));
-
-    ConvBackpropDimensions dims;
-    OP_REQUIRES_OK(context,
-                   ConvBackpropComputeDimensions(
-                       "Conv2DFastBackpropInput", /*num_spatial_dims=*/2,
-                       input_shape, filter.shape(), out_backprop.shape(),
-                       strides_, padding_, data_format_, &dims));
-
-    Tensor* in_backprop = nullptr;
-    OP_REQUIRES_OK(context,
-                   context->allocate_output(0, input_shape, &in_backprop));
-
-    // If there is nothing to compute, return.
-    if (input_shape.num_elements() == 0) {
-      return;
-    }
-
-#if defined TENSORFLOW_USE_LIBXSMM_CONVOLUTIONS && \
-    defined TENSORFLOW_USE_LIBXSMM_BACKWARD_CONVOLUTIONS
-    int64 pad_top, pad_bottom;
-    int64 pad_left, pad_right;
-    OP_REQUIRES_OK(
-        context,
-        GetWindowedOutputSizeVerbose(
-            dims.spatial_dims[0].input_size, dims.spatial_dims[0].filter_size,
-            dims.spatial_dims[0].stride, padding_,
-            &dims.spatial_dims[0].output_size, &pad_top, &pad_bottom));
-    OP_REQUIRES_OK(
-        context,
-        GetWindowedOutputSizeVerbose(
-            dims.spatial_dims[1].input_size, dims.spatial_dims[1].filter_size,
-            dims.spatial_dims[1].stride, padding_,
-            &dims.spatial_dims[1].output_size, &pad_left, &pad_right));
-
-    if (pad_left == pad_right && pad_top == pad_bottom) {
-      if (LaunchXsmmBackwardInputConvolution<Device, T>()(
-              context, context->eigen_device<Device>(),
-              in_backprop->tensor<T, 4>(), filter.tensor<T, 4>(),
-              out_backprop.tensor<T, 4>(), dims.spatial_dims[0].input_size,
-              dims.spatial_dims[1].input_size,
-              static_cast<int>(dims.spatial_dims[0].stride),
-              static_cast<int>(dims.spatial_dims[1].stride),
-              static_cast<int>(pad_top), static_cast<int>(pad_left),
-              data_format_)) {
-        return;
-      }
-    }
-#endif
+#if defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)
+template <>
+struct Conv2DCustomBackpropInputMatMulFunctor<float> {
+  using T = float;
+
+  void operator()(OpKernelContext* ctx, const T* out_data, const T* filter_data,
+                  const int filter_total_size, const int output_image_size,
+                  const int dims_out_depth, T* im2col_buf) {
+    // Inputs are in RowMajor order, we "cheat" by swapping the LHS and RHS:
+    //   RowMajor: C   = A   * B
+    //   ColMajor: C^T = B^T * A^T
+    //
+    // Dimension names:
+    //   out_image_size    -> ois
+    //   filter_total_size -> fts
+    //   dims_out_depth    -> dod
+    //
+    // RowMajor:
+    //   im2col      = out_data    * filter_data^T
+    //   [ois x fts] = [ois x dod] * [fts x dod]^T
+    //
+    // ColMajor:
+    //   im2col^T    = filter_data *  out_data^T
+    //   [fts x ois] = [fts x dod] * [dod x ois]*
+
+    const int m = filter_total_size;
+    const int n = output_image_size;
+    const int k = dims_out_depth;  // contraction dim
+
+    const char transposeA = 'T';  // sgemm(A) == filter_data
+    const char transposeB = 'N';  // sgemm(B) == out_data
+
+    const int ldA = dims_out_depth;
+    const int ldB = dims_out_depth;
+    const int ldC = filter_total_size;
+
+    const float alpha = 1.0;
+    const float beta = 0.0;
+
+    // mkldnn_sgemm code can't be instrumented with msan.
+    ANNOTATE_MEMORY_IS_INITIALIZED(
+        im2col_buf, filter_total_size * output_image_size * sizeof(T));
+
+    mkldnn_status_t st =
+        mkldnn_sgemm(&transposeA, &transposeB, &m, &n, &k, &alpha, filter_data,
+                     &ldA, out_data, &ldB, &beta, im2col_buf, &ldC);
 
-    LaunchConv2DBackpropInputOp<Device, T>()(
-        context, false, false, out_backprop, filter,
-        /*row_dilation=*/1, /*col_dilation=*/1, dims.spatial_dims[0].stride,
-        dims.spatial_dims[1].stride, padding_, in_backprop, data_format_);
+    OP_REQUIRES(
+        ctx, st == 0,
+        errors::Internal("Failed to call mkldnn_sgemm. Error code: ", st));
   }
-
- private:
-  std::vector<int32> dilations_;
-  std::vector<int32> strides_;
-  Padding padding_;
-  TensorFormat data_format_;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(Conv2DFastBackpropInputOp);
 };
+#endif
 
 // Based on implementation written by Yangqing Jia (jiayq).
 template <typename Device, class T>
@@ -336,6 +297,15 @@ class Conv2DCustomBackpropInputOp : public OpKernel {
                 errors::InvalidArgument(
                     "Current libxsmm and customized CPU implementations do "
                     "not yet support dilation rates larger than 1."));
+    OP_REQUIRES(
+        context, padding_ != Padding::EXPLICIT,
+        errors::Unimplemented("Current CPU implementation does not support "
+                              "EXPLICIT padding yet."));
+    std::vector<int64> explicit_paddings;
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("explicit_paddings", &explicit_paddings));
+    OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings,
+                                              /*num_dims=*/4, data_format_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -522,21 +492,14 @@ class Conv2DCustomBackpropInputOp : public OpKernel {
         input_backprop_data += input_offset;
       }
     } else {
-      typedef Eigen::Map<
-          Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
-          MatrixMap;
-      typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic,
-                                             Eigen::RowMajor>>
-          ConstMatrixMap;
-
       for (int image_id = 0; image_id < dims.batch_size;
            image_id += shard_size) {
         const int shard_limit =
             std::min(static_cast<int>(shard_size),
                      static_cast<int>(dims.batch_size) - image_id);
 
-        auto shard = [&dims, &pad_top, &pad_left, &pad_bottom, &pad_right,
-                      &output_image_size, &filter_total_size,
+        auto shard = [&context, &dims, &pad_top, &pad_left, &pad_bottom,
+                      &pad_right, &output_image_size, &filter_total_size,
                       &input_backprop_data, &col_buffer_data,
                       &out_backprop_data, &filter_data, &input_offset,
                       &output_offset, &size_C](int64 start, int64 limit) {
@@ -545,13 +508,9 @@ class Conv2DCustomBackpropInputOp : public OpKernel {
             T* input_data = input_backprop_data + shard_id * input_offset;
             const T* out_data = out_backprop_data + shard_id * output_offset;
 
-            // Compute gradient into 'im2col_buf'.
-            MatrixMap C(im2col_buf, output_image_size, filter_total_size);
-
-            ConstMatrixMap A(out_data, output_image_size, dims.out_depth);
-            ConstMatrixMap B(filter_data, filter_total_size, dims.out_depth);
-
-            C.noalias() = A * B.transpose();
+            Conv2DCustomBackpropInputMatMulFunctor<T>()(
+                context, out_data, filter_data, filter_total_size,
+                output_image_size, dims.out_depth, im2col_buf);
 
             Col2im<T>(im2col_buf, dims.in_depth,
                       dims.spatial_dims[0].input_size,
@@ -588,12 +547,7 @@ class Conv2DCustomBackpropInputOp : public OpKernel {
                               .Device(DEVICE_CPU)                            \
                               .Label("custom")                               \
                               .TypeConstraint<T>("T"),                       \
-                          Conv2DCustomBackpropInputOp<CPUDevice, T>);        \
-  REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")                        \
-                              .Device(DEVICE_CPU)                            \
-                              .Label("eigen_tensor")                         \
-                              .TypeConstraint<T>("T"),                       \
-                          Conv2DFastBackpropInputOp<CPUDevice, T>);
+                          Conv2DCustomBackpropInputOp<CPUDevice, T>);
 
 TF_CALL_half(REGISTER_CPU_KERNELS);
 TF_CALL_float(REGISTER_CPU_KERNELS);
@@ -661,6 +615,16 @@ class Conv2DSlowBackpropInputOp : public OpKernel {
     use_cudnn_ &= CanUseCudnn();
     cudnn_use_autotune_ = CudnnUseAutotune();
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+    if (!std::is_same<Device, GPUDevice>::value) {
+      OP_REQUIRES(
+          context, padding_ != Padding::EXPLICIT,
+          errors::Unimplemented("Current CPU implementation does not support "
+                                "EXPLICIT padding yet."));
+    }
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("explicit_paddings", &explicit_paddings_));
+    OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings_,
+                                              /*num_dims=*/4, data_format_));
   }
 
   void Compute(OpKernelContext* context) override {
@@ -694,13 +658,14 @@ class Conv2DSlowBackpropInputOp : public OpKernel {
 
     launcher_(context, use_cudnn_, cudnn_use_autotune_, out_backprop, filter,
               dilation_rows, dilation_cols, stride_rows, stride_cols, padding_,
-              in_backprop, data_format_);
+              explicit_paddings_, in_backprop, data_format_);
   }
 
  private:
   std::vector<int32> dilations_;
   std::vector<int32> strides_;
   Padding padding_;
+  std::vector<int64> explicit_paddings_;
   bool use_cudnn_;
   TensorFormat data_format_;
   LaunchConv2DBackpropInputOp<Device, T> launcher_;
@@ -714,7 +679,8 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
     OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
     const Tensor& out_backprop, const Tensor& filter, int row_dilation,
     int col_dilation, int row_stride, int col_stride, const Padding& padding,
-    Tensor* in_backprop, TensorFormat data_format) {
+    const std::vector<int64>& explicit_paddings, Tensor* in_backprop,
+    TensorFormat data_format) {
   using se::dnn::AlgorithmConfig;
   using se::dnn::AlgorithmDesc;
   using se::dnn::ProfileResult;
@@ -731,35 +697,33 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
 
   const TensorShape& filter_shape = filter.shape();
   ConvBackpropDimensions dims;
-  OP_REQUIRES_OK(ctx, ConvBackpropComputeDimensionsV2(
-                          "Conv2DSlowBackpropInput", /*num_spatial_dims=*/2,
-                          input_shape, filter_shape, out_backprop.shape(),
-                          dilations, strides, padding, data_format, &dims));
-
-  // TODO(yangzihao): The padding computations should be done in
-  // GetWindowedOutputSize() functions.
-  const int padding_rows =
-      (padding == VALID)
-          ? 0
-          : std::max<int>(0, (dims.spatial_dims[0].output_size - 1) *
-                                     dims.spatial_dims[0].stride +
-                                 (dims.spatial_dims[0].filter_size - 1) *
-                                     dims.spatial_dims[0].dilation +
-                                 1 - dims.spatial_dims[0].input_size);
-  const int padding_cols =
-      (padding == VALID)
-          ? 0
-          : std::max<int>(0, (dims.spatial_dims[1].output_size - 1) *
-                                     dims.spatial_dims[1].stride +
-                                 (dims.spatial_dims[1].filter_size - 1) *
-                                     dims.spatial_dims[1].dilation +
-                                 1 - dims.spatial_dims[1].input_size);
-
-  // TODO(keveman): cuDNN only supports equal padding on both sides, so only
-  // calling it when that is true. Remove this check when (if?) cuDNN starts
-  // supporting different padding.
-  bool rows_odd = (padding_rows % 2 != 0);
-  bool cols_odd = (padding_cols % 2 != 0);
+  OP_REQUIRES_OK(
+      ctx, ConvBackpropComputeDimensionsV2(
+               "Conv2DSlowBackpropInput", /*num_spatial_dims=*/2, input_shape,
+               filter_shape, out_backprop.shape(), dilations, strides, padding,
+               explicit_paddings, data_format, &dims));
+
+  int64 padding_top = -1, padding_bottom = -1;
+  int64 padding_left = -1, padding_right = -1;
+  if (padding == EXPLICIT) {
+    GetExplicitPaddingForDim(explicit_paddings, data_format, 'H', &padding_top,
+                             &padding_bottom);
+    GetExplicitPaddingForDim(explicit_paddings, data_format, 'W', &padding_left,
+                             &padding_right);
+  }
+  int64 expected_out_rows, expected_out_cols;
+  // The function is guaranteed to succeed because we checked the output and
+  // padding was valid earlier.
+  TF_CHECK_OK(GetWindowedOutputSizeVerboseV2(
+      dims.spatial_dims[0].input_size, dims.spatial_dims[0].filter_size,
+      row_dilation, row_stride, padding, &expected_out_rows, &padding_top,
+      &padding_bottom));
+  DCHECK_EQ(dims.spatial_dims[0].output_size, expected_out_rows);
+  TF_CHECK_OK(GetWindowedOutputSizeVerboseV2(
+      dims.spatial_dims[1].input_size, dims.spatial_dims[1].filter_size,
+      col_dilation, col_stride, padding, &expected_out_cols, &padding_left,
+      &padding_right));
+  DCHECK_EQ(dims.spatial_dims[1].output_size, expected_out_cols);
 
   auto* stream = ctx->op_device_context()->stream();
   OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));
@@ -779,7 +743,7 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
   if (dims.spatial_dims[0].filter_size == 1 &&
       dims.spatial_dims[1].filter_size == 1 && !is_grouped_convolution &&
       dims.spatial_dims[0].stride == 1 && dims.spatial_dims[1].stride == 1 &&
-      data_format == FORMAT_NHWC) {
+      data_format == FORMAT_NHWC && (padding == VALID || padding == SAME)) {
     // 1x1 filter, so call cublas directly.
     const uint64 m = dims.batch_size * dims.spatial_dims[0].input_size *
                      dims.spatial_dims[1].input_size;
@@ -841,22 +805,28 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
     return;
   }
 
+  const int64 common_padding_rows = std::min(padding_top, padding_bottom);
+  const int64 common_padding_cols = std::min(padding_left, padding_right);
   TensorShape compatible_input_shape;
-  if (rows_odd || cols_odd) {
-    // If a padding dimension is odd, we have one more element on the right
-    // side or the bottom side. This is unsupported in cudnn. Therefore,
-    // we pad that extra element and make it compatible.
+  if (padding_top != padding_bottom || padding_left != padding_right) {
+    // Pad the input in the same way we did during the forward pass, so that
+    // cuDNN receives the same input during the backward pass function as it did
+    // during the forward pass function.
+    const int64 padding_rows_diff = std::abs(padding_bottom - padding_top);
+    const int64 padding_cols_diff = std::abs(padding_right - padding_left);
+    const int64 new_in_rows =
+        dims.spatial_dims[0].input_size + padding_rows_diff;
+    const int64 new_in_cols =
+        dims.spatial_dims[1].input_size + padding_cols_diff;
     compatible_input_shape = ShapeFromFormat(
-        data_format, dims.batch_size,
-        dims.spatial_dims[0].input_size + rows_odd,
-        dims.spatial_dims[1].input_size + cols_odd, dims.in_depth);
+        data_format, dims.batch_size, new_in_rows, new_in_cols, dims.in_depth);
   } else {
     compatible_input_shape = input_shape;
   }
 
-  CHECK(padding_rows >= 0 && padding_cols >= 0)
-      << "Negative row or col paddings: (" << padding_rows << ", "
-      << padding_cols << ")";
+  CHECK(common_padding_rows >= 0 && common_padding_cols >= 0)  // Crash OK
+      << "Negative row or col paddings: (" << common_padding_rows << ", "
+      << common_padding_cols << ")";
   se::dnn::BatchDescriptor input_desc;
   input_desc.set_count(dims.batch_size)
       .set_height(GetTensorDim(compatible_input_shape, data_format, 'H'))
@@ -879,8 +849,8 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
       .set_horizontal_dilation_rate(dims.spatial_dims[1].dilation)
       .set_vertical_filter_stride(dims.spatial_dims[0].stride)
       .set_horizontal_filter_stride(dims.spatial_dims[1].stride)
-      .set_zero_padding_height(padding_rows / 2)
-      .set_zero_padding_width(padding_cols / 2)
+      .set_zero_padding_height(common_padding_rows)
+      .set_zero_padding_width(common_padding_cols)
       .set_group_count(dims.in_depth / filter_shape.dim_size(2));
 
   // NOTE(keveman):
@@ -951,10 +921,10 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
       AsDeviceMemory(pre_transformed_in_backprop.template flat<T>().data(),
                      pre_transformed_in_backprop.template flat<T>().size());
 
-  static int64 ConvolveBackwardDataScratchSize = GetCudnnWorkspaceLimit(
+  static int64 ConvolveBackwardDataScratchSize = GetDnnWorkspaceLimit(
       "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32  // 4GB by default
   );
-  CudnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize, ctx);
+  DnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize, ctx);
   int device_id = stream->parent()->device_ordinal();
   DataType dtype = out_backprop.dtype();
   ConvParameters conv_parameters = {
@@ -971,8 +941,8 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
         dims.spatial_dims[1].dilation}},   // dilation_cols
       {{dims.spatial_dims[0].stride,       // stride_rows
         dims.spatial_dims[1].stride}},     // stride_cols
-      {{padding_rows,                      // padding_rows
-        padding_cols}},                    // padding_cols
+      {{common_padding_rows,               // padding_rows
+        common_padding_cols}},             // padding_cols
       dtype,                               // tensor data type
       device_id,                           // device_id
   };
@@ -988,8 +958,8 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
     for (auto profile_algorithm : algorithms) {
       // TODO(zhengxq): profile each algorithm multiple times to better
       // accuracy.
-      CudnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize,
-                                              ctx);
+      DnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize,
+                                            ctx);
       ProfileResult profile_result;
       bool cudnn_launch_status =
           stream
@@ -1041,7 +1011,7 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
     return;
   }
 
-  if (rows_odd || cols_odd) {
+  if (padding_top != padding_bottom || padding_left != padding_right) {
     Tensor in_backprop_remove_padding;
     OP_REQUIRES_OK(
         ctx, ctx->allocate_temp(
@@ -1053,12 +1023,18 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
                                  GetTensorDim(input_shape, data_format, 'C')),
                  &in_backprop_remove_padding));
 
-    // Remove the padding for odd rows or cols.
+    // Remove the padding that was added to the input shape above.
+    const int64 input_pad_top = padding_top - common_padding_rows;
+    const int64 input_pad_bottom = padding_bottom - common_padding_rows;
+    const int64 input_pad_left = padding_left - common_padding_cols;
+    const int64 input_pad_right = padding_right - common_padding_cols;
     functor::PadInput<GPUDevice, T, int, 4>()(
         ctx->template eigen_device<GPUDevice>(),
         To32Bit(const_cast<const Tensor&>(pre_transformed_in_backprop)
                     .tensor<T, 4>()),
-        {{0, 0}}, {{-rows_odd, -cols_odd}},
+        {{static_cast<int>(-input_pad_top), static_cast<int>(-input_pad_left)}},
+        {{static_cast<int>(-input_pad_bottom),
+          static_cast<int>(-input_pad_right)}},
         To32Bit(in_backprop_remove_padding.tensor<T, 4>()), FORMAT_NCHW);
 
     pre_transformed_in_backprop = in_backprop_remove_padding;
@@ -1136,6 +1112,7 @@ REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
                         Conv2DSlowBackpropInputOp<GPUDevice, Eigen::half>);
 
 // To be used inside depthwise_conv_grad_op.cc.
+// TODO(reedwm): Move this and the definition to depthwise_conv_grad_op.cc.
 template struct LaunchConv2DBackpropInputOp<GPUDevice, float>;
 template struct LaunchConv2DBackpropInputOp<GPUDevice, Eigen::half>;
 template struct LaunchConv2DBackpropInputOp<GPUDevice, double>;
diff --git a/tensorflow/core/kernels/conv_grad_ops.cc b/tensorflow/core/kernels/conv_grad_ops.cc
index 507720c998d752f7157be5340445693bf8849173..9ceb51062e832a2e59455d71a0115e98896ef276 100644
--- a/tensorflow/core/kernels/conv_grad_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_ops.cc
@@ -52,24 +52,23 @@ int ConvBackpropDimensions::SpatialPadding(const Padding& padding,
                                        1 - input_size(dim)));
 }
 
-// The V2 version computes windowed output size with arbitrary dilation_rate,
-// while the original version only handles the cases where dilation_rates equal
-// to 1.
-Status ConvBackpropExtractAndVerifyDimensionV2(
+namespace {
+
+Status ConvBackpropExtractAndVerifyDimension(
     StringPiece label, const TensorShape& input_shape,
     const TensorShape& filter_shape, const TensorShape& output_shape,
     const gtl::ArraySlice<int32>& dilations, const std::vector<int32>& strides,
-    Padding padding, int spatial_dim, int filter_spatial_dim,
-    ConvBackpropSpatialDimension* dim) {
+    Padding padding, int64 padding_before, int64 padding_after, int spatial_dim,
+    int filter_spatial_dim, ConvBackpropSpatialDimension* dim) {
   dim->input_size = input_shape.dim_size(spatial_dim);
   dim->filter_size = filter_shape.dim_size(filter_spatial_dim);
   dim->output_size = output_shape.dim_size(spatial_dim);
   dim->stride = strides[spatial_dim];
   dim->dilation = dilations[spatial_dim];
-  int64 out_size = 0, pad_size = 0;
-  TF_RETURN_IF_ERROR(GetWindowedOutputSizeV2(dim->input_size, dim->filter_size,
-                                             dim->dilation, dim->stride,
-                                             padding, &out_size, &pad_size));
+  int64 out_size = 0;
+  TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerboseV2(
+      dim->input_size, dim->filter_size, dim->dilation, dim->stride, padding,
+      &out_size, &padding_before, &padding_after));
   if (dim->output_size != out_size) {
     return errors::InvalidArgument(
         label, ": Size of out_backprop doesn't match computed: ", "actual = ",
@@ -82,7 +81,7 @@ Status ConvBackpropExtractAndVerifyDimensionV2(
   int64 effective_filter_size = (dim->filter_size - 1) * dim->dilation + 1;
   dim->expanded_output_size = (dim->output_size - 1) * dim->stride + 1;
   const auto padded_out_size = dim->input_size + effective_filter_size - 1;
-  dim->pad_before = effective_filter_size - 1 - pad_size;
+  dim->pad_before = effective_filter_size - 1 - padding_before;
   dim->pad_after =
       padded_out_size - dim->expanded_output_size - dim->pad_before;
   VLOG(2) << label << ": expanded_out = " << dim->expanded_output_size
@@ -94,22 +93,14 @@ Status ConvBackpropExtractAndVerifyDimensionV2(
   return Status::OK();
 }
 
-Status ConvBackpropExtractAndVerifyDimension(
-    StringPiece label, const TensorShape& input_shape,
-    const TensorShape& filter_shape, const TensorShape& output_shape,
-    const std::vector<int32>& strides, Padding padding, int spatial_dim,
-    int filter_spatial_dim, ConvBackpropSpatialDimension* dim) {
-  static constexpr std::array<int32, 5> one_dilations = {{1, 1, 1, 1, 1}};
-  return ConvBackpropExtractAndVerifyDimensionV2(
-      label, input_shape, filter_shape, output_shape, one_dilations, strides,
-      padding, spatial_dim, filter_spatial_dim, dim);
-}
+}  // namespace
 
 Status ConvBackpropComputeDimensionsV2(
     StringPiece label, int num_spatial_dims, const TensorShape& input_shape,
     const TensorShape& filter_shape, const TensorShape& out_backprop_shape,
     const gtl::ArraySlice<int32>& dilations, const std::vector<int32>& strides,
-    Padding padding, TensorFormat data_format, ConvBackpropDimensions* dims) {
+    Padding padding, absl::Span<const int64> explicit_paddings,
+    TensorFormat data_format, ConvBackpropDimensions* dims) {
   // The + 2 in the following line is for the batch and feature dimensions.
   const int num_dims = num_spatial_dims + 2;
   if (input_shape.dims() != num_dims) {
@@ -152,9 +143,15 @@ Status ConvBackpropComputeDimensionsV2(
   dims->spatial_dims.resize(num_spatial_dims);
   for (int i = 0; i < num_spatial_dims; ++i) {
     int image_dim = GetTensorSpatialDimIndex(num_dims, data_format, i);
-    TF_RETURN_IF_ERROR(ConvBackpropExtractAndVerifyDimensionV2(
+    int64 padding_before = -1, padding_after = -1;
+    if (padding == EXPLICIT) {
+      padding_before = explicit_paddings[2 * image_dim];
+      padding_after = explicit_paddings[2 * image_dim + 1];
+    }
+    TF_RETURN_IF_ERROR(ConvBackpropExtractAndVerifyDimension(
         label, input_shape, filter_shape, out_backprop_shape, dilations,
-        strides, padding, image_dim, i, &dims->spatial_dims[i]));
+        strides, padding, padding_before, padding_after, image_dim, i,
+        &dims->spatial_dims[i]));
   }
   return Status::OK();
 }
@@ -169,7 +166,8 @@ Status ConvBackpropComputeDimensions(StringPiece label, int num_spatial_dims,
   static constexpr std::array<int32, 5> one_dilations = {{1, 1, 1, 1, 1}};
   return ConvBackpropComputeDimensionsV2(
       label, num_spatial_dims, input_shape, filter_shape, out_backprop_shape,
-      one_dilations, strides, padding, data_format, dims);
+      one_dilations, strides, padding, /*explicit_paddings=*/{}, data_format,
+      dims);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/conv_grad_ops.h b/tensorflow/core/kernels/conv_grad_ops.h
index 9551959463bf1f32010b436671ff7eed1daa9d82..173f92806f911edf6dca043510b1fd9b36a0a66f 100644
--- a/tensorflow/core/kernels/conv_grad_ops.h
+++ b/tensorflow/core/kernels/conv_grad_ops.h
@@ -176,8 +176,9 @@ struct LaunchConv2DBackpropInputOp {
   void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
                   const Tensor& out_backprop, const Tensor& filter,
                   int row_dilation, int col_dilation, int row_stride,
-                  int col_stride, const Padding& padding, Tensor* in_backprop,
-                  TensorFormat data_format);
+                  int col_stride, const Padding& padding,
+                  const std::vector<int64>& explicit_paddings,
+                  Tensor* in_backprop, TensorFormat data_format);
 };
 
 template <typename Device, typename T>
@@ -186,6 +187,7 @@ struct LaunchConv2DBackpropFilterOp {
                   const Tensor& out_backprop, const Tensor& input,
                   int row_dilation, int col_dilation, int row_stride,
                   int col_stride, const Padding& padding,
+                  const std::vector<int64>& explicit_paddings,
                   Tensor* filter_backprop, TensorFormat data_format);
 };
 
@@ -195,7 +197,8 @@ struct LaunchConv2DBackpropInputOp<Eigen::GpuDevice, T> {
   void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
                   const Tensor& input, const Tensor& filter, int row_dilation,
                   int col_dilation, int row_stride, int col_stride,
-                  const Padding& padding, Tensor* output,
+                  const Padding& padding,
+                  const std::vector<int64>& explicit_paddings, Tensor* output,
                   TensorFormat data_format);
 };
 
@@ -205,6 +208,7 @@ struct LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T> {
                   const Tensor& out_backprop, const Tensor& input,
                   int row_dilation, int col_dilation, int row_stride,
                   int col_stride, const Padding& padding,
+                  const std::vector<int64>& explicit_paddings,
                   Tensor* filter_backprop, TensorFormat data_format);
 };
 #endif  // GOOGLE_CUDA
@@ -217,6 +221,8 @@ struct ConvBackpropSpatialDimension {
   int64 output_size;
   int64 stride;
   int64 dilation;
+
+  // Output size after scaling by the stride.
   int64 expanded_output_size;
 
   // Number of padding elements to be added before/after this dimension of
@@ -248,7 +254,7 @@ struct ConvBackpropDimensions {
 
 // Common code between implementations of Conv?DBackpropInput and
 // Conv?DBackpropFilter. Verifies that the dimensions all match, and computes
-// sizes/padding for the spatial dimensions.
+// sizes/padding for the spatial dimensions. Does not support explicit padding.
 Status ConvBackpropComputeDimensions(StringPiece label, int num_spatial_dims,
                                      const TensorShape& input_shape,
                                      const TensorShape& filter_shape,
@@ -257,13 +263,15 @@ Status ConvBackpropComputeDimensions(StringPiece label, int num_spatial_dims,
                                      Padding padding, TensorFormat data_format,
                                      ConvBackpropDimensions* dims);
 
-// The V2 version computes the same outputs with arbitrary dilation rate.
+// The V2 version computes the same outputs with arbitrary dilation rate and
+// supports explicit padding.
 // TODO(b/67112639): Merge V2 versions and the original versions eventually.
 Status ConvBackpropComputeDimensionsV2(
     StringPiece label, int num_spatial_dims, const TensorShape& input_shape,
     const TensorShape& filter_shape, const TensorShape& out_backprop_shape,
     const gtl::ArraySlice<int32>& dilations, const std::vector<int32>& strides,
-    Padding padding, TensorFormat data_format, ConvBackpropDimensions* dims);
+    Padding padding, absl::Span<const int64> explicit_paddings,
+    TensorFormat data_format, ConvBackpropDimensions* dims);
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_KERNELS_CONV_GRAD_OPS_H_
diff --git a/tensorflow/core/kernels/conv_grad_ops_3d.cc b/tensorflow/core/kernels/conv_grad_ops_3d.cc
index e4c49efea0bd87fdbaa3fbdad3d5612d6b4f8a82..ca46da6ba38044b50aa6299b82f9b9cacd87bb4c 100644
--- a/tensorflow/core/kernels/conv_grad_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_grad_ops_3d.cc
@@ -1152,11 +1152,11 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
     }
 
     ConvBackpropDimensions dims;
-    OP_REQUIRES_OK(context,
-                   ConvBackpropComputeDimensionsV2(
-                       "Conv3DBackpropInputOp", /*num_spatial_dims=*/3,
-                       input_shape, filter_shape, out_backprop_shape, dilation_,
-                       stride_, padding_, data_format_, &dims));
+    OP_REQUIRES_OK(context, ConvBackpropComputeDimensionsV2(
+                                "Conv3DBackpropInputOp", /*num_spatial_dims=*/3,
+                                input_shape, filter_shape, out_backprop_shape,
+                                dilation_, stride_, padding_,
+                                /*explicit_paddings=*/{}, data_format_, &dims));
 
     Tensor* in_backprop;
     OP_REQUIRES_OK(context,
@@ -1333,7 +1333,7 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
         AsDeviceMemory(pre_transformed_in_backprop.template flat<T>().data(),
                        pre_transformed_in_backprop.template flat<T>().size());
 
-    static int64 ConvolveBackwardDataScratchSize = GetCudnnWorkspaceLimit(
+    static int64 ConvolveBackwardDataScratchSize = GetDnnWorkspaceLimit(
         "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32);  // 4GB by default
 
     const int device_id = stream->parent()->device_ordinal();
@@ -1368,8 +1368,8 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
       for (auto profile_algorithm : algorithms) {
         // TODO(zhengxq): profile each algorithm multiple times to better
         // accuracy.
-        CudnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize,
-                                                context);
+        DnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize,
+                                              context);
         ProfileResult profile_result;
         bool cudnn_launch_status =
             stream
@@ -1405,8 +1405,8 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
       AutoTuneConv3dBwdData::GetInstance()->Insert(conv_parameters,
                                                    algorithm_config);
     }
-    CudnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize,
-                                            context);
+    DnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize,
+                                          context);
     bool cudnn_launch_status =
         stream
             ->ThenConvolveBackwardDataWithAlgorithm(
@@ -1537,11 +1537,12 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
     }
 
     ConvBackpropDimensions dims;
-    OP_REQUIRES_OK(context,
-                   ConvBackpropComputeDimensionsV2(
-                       "Conv3DBackpropFilterOp", /*num_spatial_dims=*/3,
-                       input_shape, filter_shape, out_backprop_shape, dilation_,
-                       stride_, padding_, data_format_, &dims));
+    OP_REQUIRES_OK(
+        context,
+        ConvBackpropComputeDimensionsV2(
+            "Conv3DBackpropFilterOp", /*num_spatial_dims=*/3, input_shape,
+            filter_shape, out_backprop_shape, dilation_, stride_, padding_,
+            /*explicit_paddings=*/{}, data_format_, &dims));
 
     Tensor* filter_backprop;
     OP_REQUIRES_OK(context,
@@ -1739,7 +1740,7 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
         AsDeviceMemory(transformed_input.template flat<T>().data(),
                        transformed_input.template flat<T>().size());
 
-    static int64 ConvolveBackwardFilterScratchSize = GetCudnnWorkspaceLimit(
+    static int64 ConvolveBackwardFilterScratchSize = GetDnnWorkspaceLimit(
         "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32);  // 4GB by default
 
     const int device_id = stream->parent()->device_ordinal();
@@ -1774,8 +1775,8 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
       for (auto profile_algorithm : algorithms) {
         // TODO(zhengxq): profile each algorithm multiple times to better
         // accuracy.
-        CudnnScratchAllocator scratch_allocator(
-            ConvolveBackwardFilterScratchSize, context);
+        DnnScratchAllocator scratch_allocator(ConvolveBackwardFilterScratchSize,
+                                              context);
         ProfileResult profile_result;
         bool cudnn_launch_status =
             stream
@@ -1812,8 +1813,8 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
       AutoTuneConv3dBwdFilter::GetInstance()->Insert(conv_parameters,
                                                      algorithm_config);
     }
-    CudnnScratchAllocator scratch_allocator(ConvolveBackwardFilterScratchSize,
-                                            context);
+    DnnScratchAllocator scratch_allocator(ConvolveBackwardFilterScratchSize,
+                                          context);
     bool cudnn_launch_status =
         stream
             ->ThenConvolveBackwardFilterWithAlgorithm(
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
index 74857fc2078dc3ee5e17959fc32febcdcb38a689..979c76dc3c99c950ff5d5062e3ee79d448c40fcf 100644
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -28,13 +28,13 @@ limitations under the License.
 #include <map>
 #include <vector>
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/conv_2d.h"
 #include "tensorflow/core/kernels/deep_conv2d.h"
 #include "tensorflow/core/kernels/ops_util.h"
@@ -122,7 +122,8 @@ struct LaunchConv2DOp<CPUDevice, T> {
   void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
                   const Tensor& input, const Tensor& filter, int row_dilation,
                   int col_dilation, int row_stride, int col_stride,
-                  const Padding& padding, Tensor* output,
+                  const Padding& padding,
+                  const std::vector<int64>& explicit_paddings, Tensor* output,
                   TensorFormat data_format) {
     if (data_format != FORMAT_NHWC) {
       ctx->SetStatus(
@@ -130,6 +131,11 @@ struct LaunchConv2DOp<CPUDevice, T> {
                                 "NHWC tensor format for now."));
       return;
     }
+    // TODO(reedwm): Enable explicit padding on the CPU.
+    OP_REQUIRES(
+        ctx, padding != Padding::EXPLICIT,
+        errors::Unimplemented("Generic conv implementation does not support "
+                              "EXPLICIT padding yet."));
     const int64 in_depth = GetTensorDim(input, data_format, 'C');
     OP_REQUIRES(ctx, in_depth == filter.dim_size(2),
                 errors::Unimplemented("Generic conv implementation does not "
@@ -274,6 +280,10 @@ Status InitConv2DParameters(const OpKernelConstruction* context,
   TF_RETURN_IF_ERROR(context->GetAttr("dilations", &params->dilations));
   TF_RETURN_IF_ERROR(context->GetAttr("strides", &params->strides));
   TF_RETURN_IF_ERROR(context->GetAttr("padding", &params->padding));
+  if (context->HasAttr("explicit_paddings")) {
+    TF_RETURN_IF_ERROR(
+        context->GetAttr("explicit_paddings", &params->explicit_paddings));
+  }
   string data_format_string;
   TF_RETURN_IF_ERROR(context->GetAttr("data_format", &data_format_string));
   TF_REQUIRES(FormatFromString(data_format_string, &params->data_format),
@@ -313,6 +323,10 @@ Status InitConv2DParameters(const OpKernelConstruction* context,
       dilation_h > 0 && dilation_w > 0,
       errors::InvalidArgument("Dilated rates should be larger than 0."));
 
+  TF_RETURN_IF_ERROR(CheckValidPadding(params->padding,
+                                       params->explicit_paddings,
+                                       /*num_dims=*/4, data_format));
+
   return Status::OK();
 }
 
@@ -381,14 +395,22 @@ Status ComputeConv2DDimension(const Conv2DParameters& params,
   const int dilation_cols =
       GetTensorDim(params.dilations, params.data_format, 'W');
 
+  int64 pad_rows_before, pad_rows_after, pad_cols_before, pad_cols_after;
+  if (params.padding == Padding::EXPLICIT) {
+    GetExplicitPaddingForDim(params.explicit_paddings, params.data_format, 'H',
+                             &pad_rows_before, &pad_rows_after);
+    GetExplicitPaddingForDim(params.explicit_paddings, params.data_format, 'W',
+                             &pad_cols_before, &pad_cols_after);
+  }
+
   // Compute windowed output sizes for rows and columns.
-  int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
-  TF_RETURN_IF_ERROR(GetWindowedOutputSizeV2(
+  int64 out_rows = 0, out_cols = 0;
+  TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerboseV2(
       input_rows, filter_rows, dilation_rows, stride_rows, params.padding,
-      &out_rows, &pad_rows));
-  TF_RETURN_IF_ERROR(GetWindowedOutputSizeV2(
+      &out_rows, &pad_rows_before, &pad_rows_after));
+  TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerboseV2(
       input_cols, filter_cols, dilation_cols, stride_cols, params.padding,
-      &out_cols, &pad_cols));
+      &out_cols, &pad_cols_before, &pad_cols_after));
 
   dimensions->batch = batch;
   dimensions->input_rows = input_rows;
@@ -404,8 +426,10 @@ Status ComputeConv2DDimension(const Conv2DParameters& params,
   dimensions->dilation_cols = dilation_cols;
   dimensions->out_rows = out_rows;
   dimensions->out_cols = out_cols;
-  dimensions->pad_rows = pad_rows;
-  dimensions->pad_cols = pad_cols;
+  dimensions->pad_rows_before = pad_rows_before;
+  dimensions->pad_rows_after = pad_rows_after;
+  dimensions->pad_cols_before = pad_cols_before;
+  dimensions->pad_cols_after = pad_cols_after;
 
   return Status::OK();
 }
@@ -463,33 +487,35 @@ class Conv2DOp : public BinaryOp<T> {
     }
 
 #ifdef TENSORFLOW_USE_LIBXSMM_CONVOLUTIONS
-    if (LaunchXsmmConvOp<Device, T>::Run(
+    if (params_.padding != EXPLICIT &&
+        LaunchXsmmConvOp<Device, T>::Run(
             context, input, filter, dimensions.batch, dimensions.input_rows,
             dimensions.input_cols, dimensions.in_depth, dimensions.filter_rows,
-            dimensions.filter_cols, dimensions.pad_rows, dimensions.pad_cols,
-            dimensions.out_rows, dimensions.out_cols, dimensions.out_depth,
-            dimensions.dilation_rows, dimensions.dilation_cols,
-            dimensions.stride_rows, dimensions.stride_cols, output,
-            params_.data_format)) {
+            dimensions.filter_cols, dimensions.pad_rows_before,
+            dimensions.pad_cols_before, dimensions.out_rows,
+            dimensions.out_cols, dimensions.out_depth, dimensions.dilation_rows,
+            dimensions.dilation_cols, dimensions.stride_rows,
+            dimensions.stride_cols, output, params_.data_format)) {
       return;
     }
 #endif
 
-    if (LaunchDeepConvOp<Device, T>::Run(
+    if (params_.padding != EXPLICIT &&
+        LaunchDeepConvOp<Device, T>::Run(
             context, input, filter, dimensions.batch, dimensions.input_rows,
             dimensions.input_cols, dimensions.in_depth, dimensions.filter_rows,
-            dimensions.filter_cols, dimensions.pad_rows, dimensions.pad_cols,
-            dimensions.out_rows, dimensions.out_cols, dimensions.out_depth,
-            dimensions.dilation_rows, dimensions.dilation_cols,
-            dimensions.stride_rows, dimensions.stride_cols, output,
-            params_.data_format)) {
+            dimensions.filter_cols, dimensions.pad_rows_before,
+            dimensions.pad_cols_before, dimensions.out_rows,
+            dimensions.out_cols, dimensions.out_depth, dimensions.dilation_rows,
+            dimensions.dilation_cols, dimensions.stride_rows,
+            dimensions.stride_cols, output, params_.data_format)) {
       return;
     }
 
     launcher_(context, use_cudnn_, cudnn_use_autotune_, input, filter,
               dimensions.dilation_rows, dimensions.dilation_cols,
               dimensions.stride_rows, dimensions.stride_cols, params_.padding,
-              output, params_.data_format);
+              params_.explicit_paddings, output, params_.data_format);
   }
 
  private:
@@ -521,8 +547,8 @@ template struct LaunchConv2DOp<CPUDevice, float>;
 template struct LaunchConv2DOp<CPUDevice, double>;
 
 #if GOOGLE_CUDA
-int64 GetCudnnWorkspaceLimit(const string& envvar_in_mb,
-                             int64 default_value_in_bytes) {
+int64 GetDnnWorkspaceLimit(const string& envvar_in_mb,
+                           int64 default_value_in_bytes) {
   const char* workspace_limit_in_mb_str = getenv(envvar_in_mb.c_str());
   if (workspace_limit_in_mb_str != nullptr &&
       strcmp(workspace_limit_in_mb_str, "") != 0) {
@@ -551,7 +577,8 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
     OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
     const Tensor& input_param, const Tensor& filter, int row_dilation,
     int col_dilation, int row_stride, int col_stride, const Padding& padding,
-    Tensor* output, TensorFormat data_format) {
+    const std::vector<int64>& explicit_paddings, Tensor* output,
+    TensorFormat data_format) {
   using se::dnn::AlgorithmConfig;
   using se::dnn::AlgorithmDesc;
   using se::dnn::ProfileResult;
@@ -580,7 +607,8 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
   bool is_grouped_convolution = patch_depths != in_depths;
   if (patch_rows == 1 && patch_cols == 1 && !is_grouped_convolution &&
       row_dilation == 1 && col_dilation == 1 && row_stride == 1 &&
-      col_stride == 1 && data_format == FORMAT_NHWC) {
+      col_stride == 1 && data_format == FORMAT_NHWC &&
+      (padding == VALID || padding == SAME)) {
     // 1x1 filter, so call cublas directly.
     const uint64 m = in_batch * in_rows * in_cols;
     const uint64 k = patch_depths;
@@ -634,49 +662,78 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
     return;
   }
 
-  int padding_rows = 0;
-  int padding_cols = 0;
   const int64 out_batch = GetTensorDim(*output, data_format, 'N');
   const int64 out_rows = GetTensorDim(*output, data_format, 'H');
   const int64 out_cols = GetTensorDim(*output, data_format, 'W');
   const int64 out_depths = GetTensorDim(*output, data_format, 'C');
-  if (padding == SAME) {
-    // Total padding on rows and cols is
-    // Pr = (R' - 1) * S + (Kr - 1) * Dr + 1 - R
-    // Pc = (C' - 1) * S + (Kc - 1) * Dc + 1 - C
-    // where (R', C') are output dimensions, (R, C) are input dimensions, S
-    // is stride, (Dr, Dc) are dilations, (Kr, Kc) are filter dimensions.
-    // We pad Pr/2 on the left and Pr - Pr/2 on the right, Pc/2 on the top
-    // and Pc - Pc/2 on the bottom.  When Pr or Pc is odd, this means
-    // we pad more on the right and bottom than on the top and left.
-    padding_rows =
-        std::max<int>(0, (out_rows - 1) * row_stride +
-                             (patch_rows - 1) * row_dilation + 1 - in_rows);
-    padding_cols =
-        std::max<int>(0, (out_cols - 1) * col_stride +
-                             (patch_cols - 1) * col_dilation + 1 - in_cols);
-    const bool rows_odd = (padding_rows % 2 != 0);
-    const bool cols_odd = (padding_cols % 2 != 0);
-    if (rows_odd || cols_odd) {
-      Tensor transformed_input;
-      int64 new_in_rows = in_rows + rows_odd;
-      int64 new_in_cols = in_cols + cols_odd;
-      OP_REQUIRES_OK(
-          ctx,
-          ctx->allocate_temp(DataTypeToEnum<T>::value,
-                             ShapeFromFormat(data_format, in_batch, new_in_rows,
-                                             new_in_cols, in_depths),
-                             &transformed_input));
-
-      functor::PadInput<GPUDevice, T, int, 4>()(
-          ctx->eigen_device<GPUDevice>(), To32Bit(input_param.tensor<T, 4>()),
-          {{0, 0}}, {{rows_odd, cols_odd}},
-          To32Bit(transformed_input.tensor<T, 4>()), data_format);
-
-      input = transformed_input;
-      in_rows = new_in_rows;
-      in_cols = new_in_cols;
+  int64 padding_top = -1, padding_bottom = -1;
+  int64 padding_left = -1, padding_right = -1;
+  if (padding == EXPLICIT) {
+    GetExplicitPaddingForDim(explicit_paddings, data_format, 'H', &padding_top,
+                             &padding_bottom);
+    GetExplicitPaddingForDim(explicit_paddings, data_format, 'W', &padding_left,
+                             &padding_right);
+  }
+  int64 out_rows_check, out_cols_check;
+  Status status = GetWindowedOutputSizeVerboseV2(
+      in_rows, patch_rows, row_dilation, row_stride, padding, &out_rows_check,
+      &padding_top, &padding_bottom);
+  // The status is guaranteed to be OK because we checked the output and padding
+  // was valid earlier.
+  TF_CHECK_OK(status);
+  DCHECK_EQ(out_rows, out_rows_check);
+  status = GetWindowedOutputSizeVerboseV2(in_cols, patch_cols, col_dilation,
+                                          col_stride, padding, &out_cols_check,
+                                          &padding_left, &padding_right);
+  TF_CHECK_OK(status);
+  DCHECK_EQ(out_cols, out_cols_check);
+
+  const int64 common_padding_rows = std::min(padding_top, padding_bottom);
+  const int64 common_padding_cols = std::min(padding_left, padding_right);
+  if (padding_top != padding_bottom || padding_left != padding_right) {
+    // cuDNN only supports padding the same amount on the left and right sides,
+    // and on the top and bottom sides. So we manually create a new padded
+    // input tensor such that we can pass it to cuDNN.
+
+    // TODO(reedwm): In some cases, we can avoid an allocation even if the two
+    // padding sides are different. For example, if the input is 2x2, the filter
+    // is 1x1, the stride is 2, and the padding is (1, 0, 1, 0), the result is
+    // equivalent to as if the padding is (1, 1, 1, 1). Changing the padding in
+    // such a way would allow us to avoid the allocation.
+    Tensor transformed_input;
+    const int64 padding_rows_diff = std::abs(padding_bottom - padding_top);
+    const int64 padding_cols_diff = std::abs(padding_right - padding_left);
+    const int64 new_in_rows = in_rows + padding_rows_diff;
+    const int64 new_in_cols = in_cols + padding_cols_diff;
+    OP_REQUIRES_OK(ctx, ctx->allocate_temp(
+                            DataTypeToEnum<T>::value,
+                            ShapeFromFormat(data_format, in_batch, new_in_rows,
+                                            new_in_cols, in_depths),
+                            &transformed_input));
+
+    const int64 input_pad_top = padding_top - common_padding_rows;
+    const int64 input_pad_bottom = padding_bottom - common_padding_rows;
+    const int64 input_pad_left = padding_left - common_padding_cols;
+    const int64 input_pad_right = padding_right - common_padding_cols;
+    bool in_bounds =
+        FastBoundsCheck(input_pad_top, std::numeric_limits<int>::max()) &&
+        FastBoundsCheck(input_pad_bottom, std::numeric_limits<int>::max()) &&
+        FastBoundsCheck(input_pad_left, std::numeric_limits<int>::max()) &&
+        FastBoundsCheck(input_pad_right, std::numeric_limits<int>::max());
+    if (!in_bounds) {
+      ctx->SetStatus(errors::InvalidArgument("Padding is too large."));
+      return;
     }
+    functor::PadInput<GPUDevice, T, int, 4>()(
+        ctx->eigen_device<GPUDevice>(), To32Bit(input_param.tensor<T, 4>()),
+        {{static_cast<int>(input_pad_top), static_cast<int>(input_pad_left)}},
+        {{static_cast<int>(input_pad_bottom),
+          static_cast<int>(input_pad_right)}},
+        To32Bit(transformed_input.tensor<T, 4>()), data_format);
+
+    input = transformed_input;
+    in_rows = new_in_rows;
+    in_cols = new_in_cols;
   }
 
   if (data_format == FORMAT_NHWC) {
@@ -698,9 +755,9 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
     }
   }
 
-  CHECK(padding_rows >= 0 && padding_cols >= 0)
-      << "Negative row or col paddings: (" << padding_rows << ", "
-      << padding_cols << ")";
+  CHECK(common_padding_rows >= 0 && common_padding_cols >= 0)  // Crash OK
+      << "Negative row or col paddings: (" << common_padding_rows << ", "
+      << common_padding_cols << ")";
   se::dnn::BatchDescriptor input_desc;
   input_desc.set_count(in_batch)
       .set_feature_map_count(in_depths)
@@ -723,8 +780,8 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
       .set_horizontal_dilation_rate(col_dilation)
       .set_vertical_filter_stride(row_stride)
       .set_horizontal_filter_stride(col_stride)
-      .set_zero_padding_height(padding_rows / 2)
-      .set_zero_padding_width(padding_cols / 2)
+      .set_zero_padding_height(common_padding_rows)
+      .set_zero_padding_width(common_padding_cols)
       .set_group_count(in_depths / patch_depths);
 
   Tensor transformed_filter;
@@ -759,7 +816,7 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
       AsDeviceMemory(transformed_output.template flat<T>().data(),
                      transformed_output.template flat<T>().size());
 
-  static int64 ConvolveScratchSize = GetCudnnWorkspaceLimit(
+  static int64 ConvolveScratchSize = GetDnnWorkspaceLimit(
       // default value is in bytes despite the name of the environment variable
       "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32  // 4GB
   );
@@ -767,23 +824,23 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
   int device_id = stream->parent()->device_ordinal();
   DataType dtype = input.dtype();
   ConvParameters conv_parameters = {
-      in_batch,          // batch
-      in_depths,         // in_depths
-      {{in_rows,         // in_rows
-        in_cols}},       // in_cols
-      FORMAT_NCHW,       // compute_data_format
-      out_depths,        // out_depths
-      {{patch_rows,      // filter_rows
-        patch_cols,      // filter_cols
-        patch_depths}},  // filter_depths
-      {{row_dilation,    // dilation_rows
-        col_dilation}},  // dilation_cols
-      {{row_stride,      // stride_rows
-        col_stride}},    // stride_cols
-      {{padding_rows,    // padding_rows
-        padding_cols}},  // padding_cols
-      dtype,             // tensor datatype
-      device_id,         // device_id
+      in_batch,                 // batch
+      in_depths,                // in_depths
+      {{in_rows,                // in_rows
+        in_cols}},              // in_cols
+      FORMAT_NCHW,              // compute_data_format
+      out_depths,               // out_depths
+      {{patch_rows,             // filter_rows
+        patch_cols,             // filter_cols
+        patch_depths}},         // filter_depths
+      {{row_dilation,           // dilation_rows
+        col_dilation}},         // dilation_cols
+      {{row_stride,             // stride_rows
+        col_stride}},           // stride_cols
+      {{common_padding_rows,    // padding_rows
+        common_padding_cols}},  // padding_cols
+      dtype,                    // tensor datatype
+      device_id,                // device_id
   };
   AlgorithmConfig algorithm_config;
   if (cudnn_use_autotune &&
@@ -803,7 +860,7 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
     for (auto profile_algorithm : algorithms) {
       // TODO(zhengxq): profile each algorithm multiple times to better
       // accuracy.
-      CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
+      DnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
       ProfileResult profile_result;
       bool cudnn_launch_status =
           stream
@@ -841,7 +898,7 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
     AutoTuneConv::GetInstance()->Insert(conv_parameters, algorithm_config);
   }
 
-  CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
+  DnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
   bool cudnn_launch_status =
       stream
           ->ThenConvolveWithAlgorithm(input_desc, input_ptr, filter_desc,
diff --git a/tensorflow/core/kernels/conv_ops.h b/tensorflow/core/kernels/conv_ops.h
index 7ec878e0b2fc6eaae2a89610a9f8491689705f0c..ccd24fcdd4c5e4945f2daf6461727e6038b4dd32 100644
--- a/tensorflow/core/kernels/conv_ops.h
+++ b/tensorflow/core/kernels/conv_ops.h
@@ -36,7 +36,8 @@ struct LaunchConv2DOp {
   void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
                   const Tensor& input, const Tensor& filter, int row_dilation,
                   int col_dilation, int row_stride, int col_stride,
-                  const Padding& padding, Tensor* output,
+                  const Padding& padding,
+                  const std::vector<int64>& explicit_paddings, Tensor* output,
                   TensorFormat data_format);
 };
 
@@ -46,7 +47,8 @@ struct LaunchConv2DOp<Eigen::GpuDevice, T> {
   void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
                   const Tensor& input, const Tensor& filter, int row_dilation,
                   int col_dilation, int row_stride, int col_stride,
-                  const Padding& padding, Tensor* output,
+                  const Padding& padding,
+                  const std::vector<int64>& explicit_paddings, Tensor* output,
                   TensorFormat data_format);
 };
 #endif  // GOOGLE_CUDA
@@ -63,7 +65,7 @@ struct Im2ColBufferResource : public ResourceBase {
   // the buffer memory held by this resource.
   mutex mu;
   T* data;
-  string DebugString() { return "Im2ColBufferResource"; }
+  string DebugString() const { return "Im2ColBufferResource"; }
 };
 
 // Convolution parameters specified by Op attributes.
@@ -72,6 +74,7 @@ struct Conv2DParameters {
   std::vector<int32> strides;
   Padding padding;
   TensorFormat data_format;
+  std::vector<int64> explicit_paddings;
 };
 
 // Convolution dimensions inferred from parameters, input and filter tensors.
@@ -94,8 +97,10 @@ struct Conv2DDimensions {
 
   int64 out_rows;
   int64 out_cols;
-  int64 pad_rows;
-  int64 pad_cols;
+  int64 pad_rows_before;
+  int64 pad_rows_after;
+  int64 pad_cols_before;
+  int64 pad_cols_after;
 };
 
 // Initializes and validates Conv2D parameters configured by OpKernel
@@ -105,7 +110,7 @@ Status InitConv2DParameters(const OpKernelConstruction* context,
 
 // Computes and validates convolutions dimensions from Conv2D parameters. If
 // parameters are valid, dimensions will be updated with derived convolution
-// dimensions, otherwise error will be returned.
+// dimensions, otherwise an error will be returned.
 Status ComputeConv2DDimension(const Conv2DParameters& params,
                               const Tensor& input, const Tensor& filter,
                               Conv2DDimensions* dimensions);
diff --git a/tensorflow/core/kernels/conv_ops_3d.cc b/tensorflow/core/kernels/conv_ops_3d.cc
index f20ac93b5a01cf2dbd1c53ce55c832727f49979f..5a59e20cc27cb7fe7b6fc6d9fdd160f2e3c4a983 100644
--- a/tensorflow/core/kernels/conv_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_ops_3d.cc
@@ -407,7 +407,7 @@ struct LaunchConvOp<GPUDevice, T> {
         AsDeviceMemory(transformed_output.template flat<T>().data(),
                        transformed_output.template flat<T>().size());
 
-    static int64 ConvolveScratchSize = GetCudnnWorkspaceLimit(
+    static int64 ConvolveScratchSize = GetDnnWorkspaceLimit(
         "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32);  // 4GB by default
 
     int device_id = stream->parent()->device_ordinal();
@@ -450,7 +450,7 @@ struct LaunchConvOp<GPUDevice, T> {
       for (auto profile_algorithm : algorithms) {
         // TODO(zhengxq): profile each algorithm multiple times to better
         // accuracy.
-        CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
+        DnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
         ProfileResult profile_result;
         bool cudnn_launch_status =
             stream
@@ -486,7 +486,7 @@ struct LaunchConvOp<GPUDevice, T> {
       AutoTuneConv3d::GetInstance()->Insert(conv_parameters, algorithm_config);
     }
 
-    CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
+    DnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
     bool cudnn_launch_status =
         stream
             ->ThenConvolveWithAlgorithm(input_desc, input_ptr, filter_desc,
diff --git a/tensorflow/core/kernels/conv_ops_fused.cc b/tensorflow/core/kernels/conv_ops_fused.cc
index 798a7325cd25494d8b12447c86f4883ca038c8ca..9c807c3375bf76dfcce731029b93fbdbf0cd907a 100644
--- a/tensorflow/core/kernels/conv_ops_fused.cc
+++ b/tensorflow/core/kernels/conv_ops_fused.cc
@@ -22,29 +22,78 @@ limitations under the License.
 //
 // Kernels for convolutions fused with image transformations (resize and mirror
 // padding) defined in `conv_ops_fused_image_transform.cc`.
+//
+// For the CPU device we implement fusion with an Eigen tensor contraction
+// output kernel. For the GPU device we rely on CuDNN primitives.
+//
+// NOTE: GPU only supports fusion of Conv2D + BiasAdd + <optional Relu>.
 
+#define USE_EIGEN_TENSOR
 #define EIGEN_USE_THREADS
 
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif  // GOOGLE_CUDA
+
 #include <string>
 #include <vector>
+
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
+#include "absl/strings/substitute.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/conv_2d.h"
 #include "tensorflow/core/kernels/conv_ops.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/util/tensor_format.h"
+#include "tensorflow/core/util/use_cudnn.h"
+
+#if GOOGLE_CUDA
+#include "cuda/include/cudnn.h"
+#include "tensorflow/core/kernels/conv_ops_gpu.h"
+#include "tensorflow/core/platform/stream_executor.h"
+#endif  // GOOGLE_CUDA
 
 namespace tensorflow {
-namespace {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+namespace {
+// Supported Conv2D fusions. Not all of them supported on all type of devices.
+enum class FusedComputationType {
+  // NOTE(ezhulenev): CuDNN `cudnnConvolutionBiasActivationForward` supports
+  // identity activation function, it in theory should allow to fuse convolution
+  // with BiasAdd, but in practice it doesn't work, cuDNN ignores this parameter
+  // and always does Relu activation.
+  kBiasAdd,                // CPU
+  kBiasAddWithRelu,        // CPU and GPU
+  kFusedBatchNorm,         // CPU only
+  kFusedBatchNormWithRelu  // CPU only
+};
+
+// We have to pass around additional arguments for all possible fusion types.
+struct FusedComputationArgs {
+  float epsilon = 0.0;  // Used by `FusedBatchNorm` fusion only
+};
+
+template <typename Device, typename T>
+struct LaunchFusedConv2DOp {
+  void operator()(OpKernelContext* context, bool use_cudnn,
+                  bool cudnn_use_autotune, const Tensor& input,
+                  const Tensor& filter, FusedComputationType fusion,
+                  const FusedComputationArgs& fusion_args,
+                  const Conv2DParameters& params,
+                  const Conv2DDimensions& dimensions, Tensor* output);
+};
 
 // Type aliases for the unaligned tensors (tensor maps) used in output kernels.
 template <typename T>
-struct OutputTypes {
+struct Unaligned {
   // There is no guarantee that the output block passed to the output kernel
   // will be aligned.
 
@@ -99,8 +148,8 @@ struct Relu {
 // depends only on a channel value (e.g. add channel bias).
 
 // Output kernel that fuses BiasAdd operation into the output of tensor
-// contraction + any other transformation defined by Transform.
-template <typename T, typename Transform = Identity>
+// contraction + activation function defined by Activation.
+template <typename T, typename Activation = Identity>
 struct BiasAddOutputKernel {
   explicit BiasAddOutputKernel(const T* bias_data) : bias_data(bias_data) {}
 
@@ -112,13 +161,13 @@ struct BiasAddOutputKernel {
     DCHECK(params.swapped_arguments);
 
     const T* bias_base = bias_data + i;
-    typename OutputTypes<T>::ConstTensor bias(bias_base, num_rows);
+    typename Unaligned<T>::ConstTensor bias(bias_base, num_rows);
 
     for (int col = 0; col < num_cols; ++col) {
       T* output_base = &output_mapper(0, col);
-      typename OutputTypes<T>::Tensor output(output_base, num_rows);
+      typename Unaligned<T>::Tensor output(output_base, num_rows);
       const auto expr = output + bias;
-      output = Transform::template apply<decltype(expr)>(expr);
+      output = Activation::template apply<decltype(expr)>(expr);
     }
   }
 
@@ -127,8 +176,8 @@ struct BiasAddOutputKernel {
 };
 
 // Output kernel that fuses FusedBatchNorm operation into the output of tensor
-// contraction + any other transformation defined by Transform.
-template <typename T, typename Transform = Identity>
+// contraction + activation function defined by Activation.
+template <typename T, typename Activation = Identity>
 struct FusedBatchNormOutputKernel {
   FusedBatchNormOutputKernel(T epsilon, const T* scaling_factor_data,
                              const T* offset_data, const T* estimated_mean_data)
@@ -148,19 +197,19 @@ struct FusedBatchNormOutputKernel {
     const T* offset_base = offset_data + i;
     const T* mean_base = estimated_mean_data + i;
 
-    typename OutputTypes<T>::ConstTensor scaling_factor(scaling_factor_base,
-                                                        num_rows);
-    typename OutputTypes<T>::ConstTensor offset(offset_base, num_rows);
-    typename OutputTypes<T>::ConstTensor mean(mean_base, num_rows);
+    typename Unaligned<T>::ConstTensor scaling_factor(scaling_factor_base,
+                                                      num_rows);
+    typename Unaligned<T>::ConstTensor offset(offset_base, num_rows);
+    typename Unaligned<T>::ConstTensor mean(mean_base, num_rows);
 
     for (int col = 0; col < num_cols; ++col) {
       T* output_base = &output_mapper(0, col);
-      typename OutputTypes<T>::Tensor output(output_base, num_rows);
+      typename Unaligned<T>::Tensor output(output_base, num_rows);
 
       auto scaled = (output - mean) * scaling_factor;
       auto shifted = scaled + offset;
 
-      output = Transform::template apply<decltype(shifted)>(shifted);
+      output = Activation::template apply<decltype(shifted)>(shifted);
     }
   }
 
@@ -182,16 +231,18 @@ using WithFusedBatchNorm = FusedBatchNormOutputKernel<T>;
 template <typename T>
 using WithFusedBatchNormAndRelu = FusedBatchNormOutputKernel<T, Relu>;
 
+// This is CPU-only implementation that uses Eigen contraction output kernels.
+//
 // Dispatch 2D convolution to the appropriate primitive operation:
 //   (1) MatMul for the case of 1x1 convolution.
 //   (2) MatMul for the case when filter size equals to the input size.
 //   (3) General spatial 2D convolution for all other cases.
 template <typename T>
-class LaunchConv2DWithOutputKernel {
+class LaunchFusedConv2DWithOutputKernel {
  public:
-  LaunchConv2DWithOutputKernel(int row_stride, int col_stride,      //
-                               int row_dilation, int col_dilation,  //
-                               Padding padding)
+  LaunchFusedConv2DWithOutputKernel(int row_stride, int col_stride,      //
+                                    int row_dilation, int col_dilation,  //
+                                    Padding padding)
       : row_stride_(row_stride),
         col_stride_(col_stride),
         row_dilation_(row_dilation),
@@ -251,118 +302,29 @@ class LaunchConv2DWithOutputKernel {
   const Padding padding_;
 };
 
-}  // namespace
-
-// Conv2D op with fused output kernels. Supports only CPUDevice.
 template <typename T>
-class FusedConv2DOp : public OpKernel {
- public:
-  explicit FusedConv2DOp(OpKernelConstruction* context) : OpKernel(context) {
-    OP_REQUIRES_OK(context, InitConv2DParameters(context, &params_));
-
-    // 'fused_ops' and 'num_args' attributes are specified by the Grappler
-    // Remapper optimizer.
-
-    std::vector<string> fused_ops;
-    OP_REQUIRES_OK(context, context->GetAttr("fused_ops", &fused_ops));
-    OP_REQUIRES(context, !fused_ops.empty(),
-                errors::InvalidArgument(
-                    "Fused Conv2D must have at least one fused op."));
-
-    int num_args;
-    OP_REQUIRES_OK(context, context->GetAttr("num_args", &num_args));
-
-    // TODO(ezhulenev): Add support for fusion element-wise op chains defined
-    // at runtime, e.g. Relu+Sqrt+Tanh+etc...
-
-    // Match combination of fused ops to one of the supported fusions.
-    if (FusedOpsMatches(fused_ops, {"BiasAdd"})) {
-      fused_computation_ = FusedComputationType::kBiasAdd;
-    } else if (FusedOpsMatches(fused_ops, {"BiasAdd", "Relu"})) {
-      fused_computation_ = FusedComputationType::kBiasAddWithRelu;
-    } else if (FusedOpsMatches(fused_ops, {"FusedBatchNorm"})) {
-      fused_computation_ = FusedComputationType::kFusedBatchNorm;
-    } else if (FusedOpsMatches(fused_ops, {"FusedBatchNorm", "Relu"})) {
-      fused_computation_ = FusedComputationType::kFusedBatchNormWithRelu;
-    } else {
-      OP_REQUIRES(context, false,
-                  errors::Unimplemented("Fusion is not implemented: [",
-                                        str_util::Join(fused_ops, ","), "]"));
-    }
-
-    // Depending on a picked fusion type validate fusion-specific arguments.
-
-    if (fused_computation_ == FusedComputationType::kBiasAdd ||
-        fused_computation_ == FusedComputationType::kBiasAddWithRelu) {
-      OP_REQUIRES(context, num_args == 1,
-                  errors::InvalidArgument(
-                      "Fused Conv2D must have one extra argument: bias."));
-    }
-
-    if (fused_computation_ == FusedComputationType::kFusedBatchNorm ||
-        fused_computation_ == FusedComputationType::kFusedBatchNormWithRelu) {
-      OP_REQUIRES(
-          context, num_args == 4,
-          errors::InvalidArgument("Fused FusedBatchNorm must have four extra "
-                                  "arguments: scale, offset, mean, variance."));
-      OP_REQUIRES_OK(context, context->GetAttr("epsilon", &epsilon_));
-    }
-  }
-
-  void Compute(OpKernelContext* context) override {
-    // Input tensor is of the following dimensions:
-    // [ batch, in_rows, in_cols, in_depth ]
-    const Tensor& input = context->input(0);
-
-    // Input filter is of the following dimensions:
-    // [ filter_rows, filter_cols, in_depth, out_depth]
-    const Tensor& filter = context->input(1);
-
-    Conv2DDimensions dimensions;
-    OP_REQUIRES_OK(context,
-                   ComputeConv2DDimension(params_, input, filter, &dimensions));
-
-    TensorShape out_shape = ShapeFromFormat(
-        params_.data_format, dimensions.batch, dimensions.out_rows,
-        dimensions.out_cols, dimensions.out_depth);
-
-    // Output tensor is of the following dimensions:
-    // [ in_batch, out_rows, out_cols, out_depth ]
-    Tensor* output = nullptr;
-    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
-
-    VLOG(2) << "FusedConv2DWithBias: in_depth = " << dimensions.in_depth
-            << ", patch_depth = " << dimensions.patch_depth
-            << ", input_cols = " << dimensions.input_cols
-            << ", filter_cols = " << dimensions.filter_cols
-            << ", input_rows = " << dimensions.input_rows
-            << ", filter_rows = " << dimensions.filter_rows
-            << ", stride_rows = " << dimensions.stride_rows
-            << ", stride_cols = " << dimensions.stride_cols
-            << ", dilation_rows = " << dimensions.dilation_rows
-            << ", dilation_cols = " << dimensions.dilation_cols
-            << ", out_depth = " << dimensions.out_depth;
-
-    // If there is nothing to compute, return.
-    if (out_shape.num_elements() == 0) {
-      return;
-    }
-
-    OP_REQUIRES(context, params_.data_format == FORMAT_NHWC,
-                errors::Unimplemented("Fused conv implementation only supports "
-                                      "NHWC tensor format for now."));
+struct LaunchFusedConv2DOp<CPUDevice, T> {
+  void operator()(OpKernelContext* context, bool use_cudnn,
+                  bool cudnn_use_autotune, const Tensor& input,
+                  const Tensor& filter, const FusedComputationType fusion,
+                  const FusedComputationArgs& fusion_args,
+                  const Conv2DParameters& params,
+                  const Conv2DDimensions& dimensions, Tensor* output) {
     OP_REQUIRES(context, dimensions.in_depth == filter.dim_size(2),
                 errors::Unimplemented("Fused conv implementation does not "
                                       "support grouped convolutions for now."));
+    OP_REQUIRES(context, params.data_format == FORMAT_NHWC,
+                errors::Unimplemented("Fused conv implementation only supports "
+                                      "NHWC tensor format for now."));
 
     BiasAddArgs bias_add;
     FusedBatchNormArgs fused_batch_norm;
 
-    LaunchConv2DWithOutputKernel<T> conv2d(
+    LaunchFusedConv2DWithOutputKernel<T> conv2d(
         dimensions.stride_rows, dimensions.stride_cols,
-        dimensions.dilation_rows, dimensions.dilation_cols, params_.padding);
+        dimensions.dilation_rows, dimensions.dilation_cols, params.padding);
 
-    switch (fused_computation_) {
+    switch (fusion) {
       case FusedComputationType::kBiasAdd:
         OP_REQUIRES_OK(context, InitBiasAddArgs(context, &bias_add));
         conv2d(WithBiasAdd<T>(bias_add.bias_add_data), context, input, filter,
@@ -377,8 +339,9 @@ class FusedConv2DOp : public OpKernel {
 
       case FusedComputationType::kFusedBatchNorm:
         OP_REQUIRES_OK(context,
-                       InitFusedBatchNormArgs(context, &fused_batch_norm));
-        conv2d(WithFusedBatchNorm<T>(epsilon_,
+                       InitFusedBatchNormArgs(context, fusion_args.epsilon,
+                                              &fused_batch_norm));
+        conv2d(WithFusedBatchNorm<T>(fusion_args.epsilon,
                                      fused_batch_norm.scaling_factor.data(),
                                      fused_batch_norm.offset_data,
                                      fused_batch_norm.estimated_mean_data),
@@ -387,9 +350,10 @@ class FusedConv2DOp : public OpKernel {
 
       case FusedComputationType::kFusedBatchNormWithRelu:
         OP_REQUIRES_OK(context,
-                       InitFusedBatchNormArgs(context, &fused_batch_norm));
+                       InitFusedBatchNormArgs(context, fusion_args.epsilon,
+                                              &fused_batch_norm));
         conv2d(WithFusedBatchNormAndRelu<T>(
-                   epsilon_, fused_batch_norm.scaling_factor.data(),
+                   fusion_args.epsilon, fused_batch_norm.scaling_factor.data(),
                    fused_batch_norm.offset_data,
                    fused_batch_norm.estimated_mean_data),
                context, input, filter, output);
@@ -398,11 +362,6 @@ class FusedConv2DOp : public OpKernel {
   }
 
  private:
-  bool FusedOpsMatches(const std::vector<string>& fused_ops,
-                       const std::vector<string>& expected) const {
-    return fused_ops == expected;
-  }
-
   struct BiasAddArgs {
     const T* bias_add_data = nullptr;
   };
@@ -438,7 +397,7 @@ class FusedConv2DOp : public OpKernel {
     return Status::OK();
   }
 
-  Status InitFusedBatchNormArgs(OpKernelContext* context,
+  Status InitFusedBatchNormArgs(OpKernelContext* context, float epsilon,
                                 FusedBatchNormArgs* args) const {
     const Tensor& scale = context->input(2);
     const Tensor& offset = context->input(3);
@@ -466,44 +425,590 @@ class FusedConv2DOp : public OpKernel {
 
     // Precompute scaling factor once for all output blocks (kernels).
     args->scaling_factor =
-        (estimated_variance.flat<T>() + static_cast<T>(epsilon_)).rsqrt() *
+        (estimated_variance.flat<T>() + static_cast<T>(epsilon)).rsqrt() *
         scale.flat<T>();
 
     return Status::OK();
   }
 
 #undef TF_REQUIRES
+};
 
-  // Element-wise ops applied to the result of Conv2D.
-  // TODO(ezhulenev): Add support for runtime-defined op chains.
-  enum class FusedComputationType {
-    kBiasAdd,
-    kBiasAddWithRelu,
-    kFusedBatchNorm,
-    kFusedBatchNormWithRelu
-  };
+#if GOOGLE_CUDA
+
+// Encapsulate the default shape information that is used by the convolution
+// operation, and add an activation mode for the fusion.
+class FusedConvParameters : public ConvParameters {
+ public:
+  FusedConvParameters(const ConvParameters& base,
+                      const se::dnn::ActivationMode activation_mode)
+      : ConvParameters(base), activation_mode_(activation_mode) {}
+
+  string ToString() const {
+    return absl::StrCat(ConvParameters::ToString(), ", ", activation_mode_);
+  }
+
+ private:
+  friend bool operator==(const FusedConvParameters& lhs,
+                         const FusedConvParameters& rhs);
+
+  using ParameterDataType =
+      std::tuple<ConvParameters::ParameterDataType, se::dnn::ActivationMode>;
+
+  ParameterDataType get_data_as_tuple() const {
+    return std::make_tuple(ConvParameters::get_data_as_tuple(),
+                           activation_mode_);
+  }
+
+  se::dnn::ActivationMode activation_mode_;
+};
+
+bool operator==(const FusedConvParameters& lhs,
+                const FusedConvParameters& rhs) {
+  return lhs.get_data_as_tuple() == rhs.get_data_as_tuple();
+}
+
+bool operator!=(const FusedConvParameters& lhs,
+                const FusedConvParameters& rhs) {
+  return !(lhs == rhs);
+}
+
+// A dummy type to group forward convolution autotune results together.
+struct FusedConvAutoTuneGroup {
+  static string name() { return "FusedConv"; }
+};
+
+using AutoTuneFusedConv =
+    AutoTuneSingleton<FusedConvAutoTuneGroup, FusedConvParameters,
+                      se::dnn::AlgorithmConfig>;
+
+int64 ConvolveScratchSize() {
+  static int64 convolve_scratch_size = GetDnnWorkspaceLimit(
+      // default value is in bytes despite the name of the environment variable
+      "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32  // 4GB
+  );
+  return convolve_scratch_size;
+}
+
+// Finds the best convolutiun algorithm for the given ConvLaunch (cuda
+// convolution on the stream) and parameters, by running all possible
+// algorithms and measuring execution time.
+// TODO(ezhulenev): Move it to conv_ops_gpu.h and share with conv_ops.cc.
+template <typename T, typename ConvLaunch>
+Status FindBestConvolveAlgorithm(const FusedConvParameters& params,
+                                 const ConvLaunch launch,
+                                 OpKernelContext* context, se::Stream* stream,
+                                 se::dnn::AlgorithmConfig* algorithm_config) {
+  // Check if we already have an algorithm selected for the given parameters.
+  if (AutoTuneFusedConv::GetInstance()->Find(params, algorithm_config)) {
+    return Status::OK();
+  }
+
+  // Find all candidate algorithms.
+  std::vector<se::dnn::AlgorithmDesc> algorithms;
+  if (!stream->parent()->GetConvolveAlgorithms(
+          params.ShouldIncludeWinogradNonfusedAlgo<T>(stream->parent()),
+          &algorithms)) {
+    return errors::Unknown(
+        "Failed to get convolution algorithm. This is probably "
+        "because cuDNN failed to initialize, so try looking to "
+        "see if a warning log message was printed above.");
+  }
+
+  se::dnn::ProfileResult best_result;
+  se::dnn::ProfileResult best_result_no_scratch;
+
+  for (auto profile_algorithm : algorithms) {
+    DnnScratchAllocator scratch_allocator(ConvolveScratchSize(), context);
+    se::dnn::ProfileResult profile_result;
+
+    bool cudnn_launch_status =
+        launch(se::dnn::AlgorithmConfig(profile_algorithm), &scratch_allocator,
+               &profile_result);
+
+    if (cudnn_launch_status && profile_result.is_valid()) {
+      if (profile_result.elapsed_time_in_ms() <
+          best_result.elapsed_time_in_ms()) {
+        best_result = profile_result;
+      }
+      if (scratch_allocator.TotalByteSize() == 0 &&
+          profile_result.elapsed_time_in_ms() <
+              best_result_no_scratch.elapsed_time_in_ms()) {
+        best_result_no_scratch = profile_result;
+      }
+    }
+  }
+
+  if (!best_result.is_valid() && !best_result_no_scratch.is_valid()) {
+    return errors::NotFound("No algorithm worked!");
+  }
+  if (best_result.is_valid()) {
+    algorithm_config->set_algorithm(best_result.algorithm());
+  }
+  if (best_result_no_scratch.is_valid()) {
+    algorithm_config->set_algorithm_no_scratch(
+        best_result_no_scratch.algorithm());
+  }
+
+  AutoTuneFusedConv::GetInstance()->Insert(params, *algorithm_config);
+  return Status::OK();
+}
+
+template <typename T>
+struct LaunchFusedConv2DOp<GPUDevice, T> {
+  void operator()(OpKernelContext* context, bool use_cudnn,
+                  bool cudnn_use_autotune, const Tensor& input_param,
+                  const Tensor& filter, FusedComputationType fusion,
+                  const FusedComputationArgs& fusion_args,
+                  const Conv2DParameters& params,
+                  const Conv2DDimensions& dimensions, Tensor* output) {
+    OP_REQUIRES(
+        context,
+        params.data_format == FORMAT_NHWC || params.data_format == FORMAT_NCHW,
+        errors::Unimplemented("Fused conv implementation only supports "
+                              "NHWC and HCHW tensor formats for now."));
+
+    auto* stream = context->op_device_context()->stream();
+    OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
+    OP_REQUIRES(
+        context, use_cudnn,
+        errors::Unimplemented("FusedConv2D for GPU is not currently supported "
+                              "without cudnn"));
+
+    OP_REQUIRES(
+        context, fusion == FusedComputationType::kBiasAddWithRelu,
+        errors::Unimplemented("FusedConv2D implementation only supports "
+                              "fusing with `BiasAdd + Relu` for now."));
+
+    Tensor input = input_param;
+
+    const int64 in_batch = GetTensorDim(input, params.data_format, 'N');
+    int64 in_rows = GetTensorDim(input, params.data_format, 'H');
+    int64 in_cols = GetTensorDim(input, params.data_format, 'W');
+    const int64 in_depths = GetTensorDim(input, params.data_format, 'C');
+
+    const int64 patch_rows = filter.dim_size(0);
+    const int64 patch_cols = filter.dim_size(1);
+    const int64 patch_depths = filter.dim_size(2);
+
+    int64 padding_rows = 0;
+    int64 padding_cols = 0;
+    const int64 out_batch = GetTensorDim(*output, params.data_format, 'N');
+    const int64 out_rows = GetTensorDim(*output, params.data_format, 'H');
+    const int64 out_cols = GetTensorDim(*output, params.data_format, 'W');
+    const int64 out_depths = GetTensorDim(*output, params.data_format, 'C');
+
+    // Bias of the following dimensions: [ output_depth ]
+    const Tensor& bias = context->input(2);
+    OP_REQUIRES(context, bias.dims() == 1,
+                errors::InvalidArgument("bias must be 1-dimensional",
+                                        bias.shape().DebugString()));
+    OP_REQUIRES(context, bias.dim_size(0) == out_depths,
+                errors::InvalidArgument("bias depth must be equal to out depth",
+                                        bias.shape().DebugString()));
+
+    if (params.padding == SAME) {
+      // Total padding on rows and cols is
+      // Pr = (R' - 1) * S + (Kr - 1) * Dr + 1 - R
+      // Pc = (C' - 1) * S + (Kc - 1) * Dc + 1 - C
+      // where (R', C') are output dimensions, (R, C) are input dimensions, S
+      // is stride, (Dr, Dc) are dilations, (Kr, Kc) are filter dimensions.
+      // We pad Pr/2 on the left and Pr - Pr/2 on the right, Pc/2 on the top
+      // and Pc - Pc/2 on the bottom.  When Pr or Pc is odd, this means
+      // we pad more on the right and bottom than on the top and left.
+      padding_rows = std::max<int>(
+          0, (out_rows - 1) * dimensions.stride_rows +
+                 (patch_rows - 1) * dimensions.dilation_rows + 1 - in_rows);
+      padding_cols = std::max<int>(
+          0, (out_cols - 1) * dimensions.stride_cols +
+                 (patch_cols - 1) * dimensions.dilation_cols + 1 - in_cols);
+      const bool rows_odd = (padding_rows % 2 != 0);
+      const bool cols_odd = (padding_cols % 2 != 0);
+      if (rows_odd || cols_odd) {
+        Tensor transformed_input;
+        int64 new_in_rows = in_rows + rows_odd;
+        int64 new_in_cols = in_cols + cols_odd;
+        OP_REQUIRES_OK(context,
+                       context->allocate_temp(
+                           DataTypeToEnum<T>::value,
+                           ShapeFromFormat(params.data_format, in_batch,
+                                           new_in_rows, new_in_cols, in_depths),
+                           &transformed_input));
+
+        functor::PadInput<GPUDevice, T, int, 4>()(
+            context->eigen_device<GPUDevice>(),
+            To32Bit(input_param.tensor<T, 4>()), {{0, 0}},
+            {{rows_odd, cols_odd}}, To32Bit(transformed_input.tensor<T, 4>()),
+            params.data_format);
+
+        input = transformed_input;
+        in_rows = new_in_rows;
+        in_cols = new_in_cols;
+      }
+    }
+
+    if (params.data_format == FORMAT_NHWC) {
+      // Convert the input tensor from NHWC to NCHW.
+      TensorShape nchw_shape =
+          ShapeFromFormat(FORMAT_NCHW, in_batch, in_rows, in_cols, in_depths);
+      if (in_depths > 1) {
+        Tensor transformed_input;
+        OP_REQUIRES_OK(context,
+                       context->allocate_temp(DataTypeToEnum<T>::value,
+                                              nchw_shape, &transformed_input));
+        functor::NHWCToNCHW<GPUDevice, T, 4>()(
+            context->eigen_device<GPUDevice>(),
+            const_cast<const Tensor&>(input).tensor<T, 4>(),
+            transformed_input.tensor<T, 4>());
+        input = transformed_input;
+      } else {
+        // If depth <= 1, then just reshape.
+        CHECK(input.CopyFrom(input, nchw_shape));  // Crash OK
+      }
+    }
+
+    CHECK(padding_rows >= 0) << "Negative padding rows";  // Crash OK
+    CHECK(padding_cols >= 0) << "Negative padding cols";  // Crash OK
+
+    se::dnn::ActivationMode dnn_activation_mode;
+    switch (fusion) {
+      case FusedComputationType::kBiasAddWithRelu:
+        dnn_activation_mode = se::dnn::ActivationMode::kRelu;
+        break;
+      default:
+        LOG(FATAL) << "Unsupported fusion type";  // Crash OK
+    }
+
+    se::dnn::BatchDescriptor input_desc;
+    input_desc.set_count(in_batch)
+        .set_feature_map_count(in_depths)
+        .set_height(in_rows)
+        .set_width(in_cols)
+        .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+    se::dnn::FilterDescriptor filter_desc;
+    filter_desc.set_input_filter_height(patch_rows)
+        .set_input_filter_width(patch_cols)
+        .set_input_feature_map_count(patch_depths)
+        .set_output_feature_map_count(filter.dim_size(3));
+    se::dnn::BatchDescriptor bias_desc;
+    bias_desc.set_count(1)
+        .set_height(1)
+        .set_width(1)
+        .set_feature_map_count(out_depths)
+        .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+    se::dnn::ConvolutionDescriptor conv_desc;
+    conv_desc.set_vertical_dilation_rate(dimensions.dilation_rows)
+        .set_horizontal_dilation_rate(dimensions.dilation_cols)
+        .set_vertical_filter_stride(dimensions.stride_rows)
+        .set_horizontal_filter_stride(dimensions.stride_cols)
+        .set_zero_padding_height(padding_rows / 2)
+        .set_zero_padding_width(padding_cols / 2)
+        .set_group_count(in_depths / patch_depths);
+    se::dnn::BatchDescriptor output_desc;
+    output_desc.set_count(out_batch)
+        .set_height(out_rows)
+        .set_width(out_cols)
+        .set_feature_map_count(out_depths)
+        .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+
+    Tensor transformed_filter;
+    OP_REQUIRES_OK(context,
+                   context->allocate_temp(
+                       DataTypeToEnum<T>::value,
+                       TensorShape({filter.dim_size(3), filter.dim_size(2),
+                                    filter.dim_size(0), filter.dim_size(1)}),
+                       &transformed_filter));
+    functor::TransformFilter<GPUDevice, T, int, 4>()(
+        context->eigen_device<GPUDevice>(), FORMAT_OIHW,
+        To32Bit(filter.tensor<T, 4>()),
+        To32Bit(transformed_filter.tensor<T, 4>()));
+
+    Tensor transformed_output;
+    if (params.data_format == FORMAT_NHWC) {
+      // Only allocate temporary memory when a layout transformation is needed.
+      OP_REQUIRES_OK(context,
+                     context->allocate_temp(
+                         DataTypeToEnum<T>::value,
+                         ShapeFromFormat(FORMAT_NCHW, out_batch, out_rows,
+                                         out_cols, out_depths),
+                         &transformed_output));
+    } else {
+      transformed_output = *output;
+    }
+
+    const auto tensor_on_device = [](const Tensor& t) -> se::DeviceMemory<T> {
+      return AsDeviceMemory(t.template flat<T>().data(),
+                            t.template flat<T>().size());
+    };
+
+    se::DeviceMemory<T> input_ptr = tensor_on_device(input);
+    se::DeviceMemory<T> filter_ptr = tensor_on_device(transformed_filter);
+    se::DeviceMemory<T> bias_ptr = tensor_on_device(bias);
+    se::DeviceMemory<T> output_ptr = tensor_on_device(transformed_output);
+
+    // We do not use side inputs, so we can safely pass nullptr.
+    se::DeviceMemory<T> side_input_ptr =
+        AsDeviceMemory(static_cast<T*>(nullptr), 0);
+
+    int device_id = stream->parent()->device_ordinal();
+    DataType dtype = input.dtype();
+    FusedConvParameters conv_parameters = {
+        {
+            in_batch,                      // batch
+            in_depths,                     // in_depths
+            {{in_rows,                     // in_rows
+              in_cols}},                   // in_cols
+            FORMAT_NCHW,                   // compute_data_format
+            out_depths,                    // out_depths
+            {{patch_rows,                  // filter_rows
+              patch_cols,                  // filter_cols
+              patch_depths}},              // filter_depths
+            {{dimensions.dilation_rows,    // dilation_rows
+              dimensions.dilation_cols}},  // dilation_cols
+            {{dimensions.stride_rows,      // stride_rows
+              dimensions.stride_cols}},    // stride_cols
+            {{padding_rows,                // padding_rows
+              padding_cols}},              // padding_cols
+            dtype,                         // tensor datatype
+            device_id,                     // device_id
+        },
+        dnn_activation_mode  // activation_mode
+    };
+
+    // Launch fused convolution with given parameters and scratch allocator.
+    // Record profile result into `profile_result` if it's not nullptr.
+    const auto launch = [&](se::dnn::AlgorithmConfig algorithm_config,
+                            DnnScratchAllocator* scratch_allocator,
+                            se::dnn::ProfileResult* profile_result) -> bool {
+      return stream
+          ->ThenFusedConvolveWithAlgorithm(
+              input_desc, input_ptr,                     // input
+              /*conv_input_scale=*/1.0,                  // input_scale
+              filter_desc, filter_ptr,                   // filter
+              conv_desc,                                 // conv
+              side_input_ptr, /*side_input_scale=*/0.0,  // side_input
+              bias_desc, bias_ptr,                       // bias
+              dnn_activation_mode,                       // activation
+              output_desc, &output_ptr,                  // output
+              scratch_allocator, algorithm_config, profile_result)
+          .ok();
+    };
+
+    se::dnn::AlgorithmConfig algorithm_config;
+    if (cudnn_use_autotune) {
+      OP_REQUIRES_OK(context, FindBestConvolveAlgorithm<T>(
+                                  conv_parameters, launch, context, stream,
+                                  &algorithm_config));
+    }
+
+    DnnScratchAllocator scratch_allocator(ConvolveScratchSize(), context);
+    bool cudnn_launch_status = launch(algorithm_config, &scratch_allocator,
+                                      /*profile_result=*/nullptr);
+    OP_REQUIRES(
+        context, cudnn_launch_status,
+        errors::Internal(absl::Substitute(
+            "cuDNN launch failure: input shape($0) filter shape($1)",
+            input.shape().DebugString(), filter.shape().DebugString())));
+
+    // Convert the output tensor back from NCHW to NHWC.
+    if (params.data_format == FORMAT_NHWC) {
+      functor::NCHWToNHWC<GPUDevice, T, 4>()(
+          context->eigen_device<GPUDevice>(),
+          const_cast<const Tensor&>(transformed_output).tensor<T, 4>(),
+          output->tensor<T, 4>());
+    }
+  }
+};
+
+#endif  // GOOGLE_CUDA
+
+}  // namespace
+
+template <typename Device, typename T>
+class FusedConv2DOp : public OpKernel {
+ public:
+  explicit FusedConv2DOp(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(context, InitConv2DParameters(context, &params_));
+
+    OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu", &use_cudnn_));
+    use_cudnn_ &= CanUseCudnn();
+    cudnn_use_autotune_ = CudnnUseAutotune();
+
+    // 'fused_ops' and 'num_args' attributes are specified by the Grappler
+    // Remapper optimizer (see grappler/optimizers/remapper.cc).
+
+    std::vector<string> fused_ops;
+    OP_REQUIRES_OK(context, context->GetAttr("fused_ops", &fused_ops));
+    OP_REQUIRES(context, !fused_ops.empty(),
+                errors::InvalidArgument(
+                    "Fused Conv2D must have at least one fused op."));
+
+    int num_args;
+    OP_REQUIRES_OK(context, context->GetAttr("num_args", &num_args));
+
+    // TODO(ezhulenev): Add support for fusion element-wise op chains defined
+    // at runtime, e.g. Relu+Sqrt+Tanh+etc.
+
+    // Match combination of fused ops to one of the supported fusions.
+    if (FusedOpsMatchAndSupportedOnDevice(fused_ops, {"BiasAdd"},
+                                          /*cpu_only=*/true)) {
+      fused_computation_ = FusedComputationType::kBiasAdd;
+    } else if (FusedOpsMatchAndSupportedOnDevice(fused_ops, {"BiasAdd", "Relu"},
+                                                 /*cpu_only=*/false)) {
+      fused_computation_ = FusedComputationType::kBiasAddWithRelu;
+    } else if (FusedOpsMatchAndSupportedOnDevice(fused_ops, {"FusedBatchNorm"},
+                                                 /*cpu_only=*/true)) {
+      fused_computation_ = FusedComputationType::kFusedBatchNorm;
+    } else if (FusedOpsMatchAndSupportedOnDevice(fused_ops,
+                                                 {"FusedBatchNorm", "Relu"},
+                                                 /*cpu_only=*/true)) {
+      fused_computation_ = FusedComputationType::kFusedBatchNormWithRelu;
+    } else {
+      OP_REQUIRES(context, false,
+                  errors::Unimplemented("Fusion is not implemented: [",
+                                        absl::StrJoin(fused_ops, ","), "]"));
+    }
+
+    // Depending on a picked fusion type validate fusion-specific arguments.
+
+    if (fused_computation_ == FusedComputationType::kBiasAdd ||
+        fused_computation_ == FusedComputationType::kBiasAddWithRelu) {
+      OP_REQUIRES(context, num_args == 1,
+                  errors::InvalidArgument(
+                      "Fused Conv2D must have one extra argument: bias."));
+    }
+
+    if (fused_computation_ == FusedComputationType::kFusedBatchNorm ||
+        fused_computation_ == FusedComputationType::kFusedBatchNormWithRelu) {
+      OP_REQUIRES(
+          context, num_args == 4,
+          errors::InvalidArgument("Fused FusedBatchNorm must have four extra "
+                                  "arguments: scale, offset, mean, variance."));
+      OP_REQUIRES_OK(context, context->GetAttr("epsilon", &epsilon_));
+    }
+  }
+
+  void Compute(OpKernelContext* context) override {
+    // Input tensor is of the following dimensions:
+    // [ batch, in_rows, in_cols, in_depth ]
+    const Tensor& input = context->input(0);
+
+    // Input filter is of the following dimensions:
+    // [ filter_rows, filter_cols, in_depth, out_depth]
+    const Tensor& filter = context->input(1);
+
+    Conv2DDimensions dimensions;
+    OP_REQUIRES_OK(context,
+                   ComputeConv2DDimension(params_, input, filter, &dimensions));
+
+    TensorShape out_shape = ShapeFromFormat(
+        params_.data_format, dimensions.batch, dimensions.out_rows,
+        dimensions.out_cols, dimensions.out_depth);
+
+    // Output tensor is of the following dimensions:
+    // [ in_batch, out_rows, out_cols, out_depth ]
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
+
+    VLOG(2) << "FusedConv2D: in_depth = " << dimensions.in_depth
+            << ", patch_depth = " << dimensions.patch_depth
+            << ", input_cols = " << dimensions.input_cols
+            << ", filter_cols = " << dimensions.filter_cols
+            << ", input_rows = " << dimensions.input_rows
+            << ", filter_rows = " << dimensions.filter_rows
+            << ", stride_rows = " << dimensions.stride_rows
+            << ", stride_cols = " << dimensions.stride_cols
+            << ", dilation_rows = " << dimensions.dilation_rows
+            << ", dilation_cols = " << dimensions.dilation_cols
+            << ", out_depth = " << dimensions.out_depth;
+
+    // If there is nothing to compute, return.
+    if (out_shape.num_elements() == 0) {
+      return;
+    }
+
+    FusedComputationArgs args;
+    args.epsilon = epsilon_;
+
+    LaunchFusedConv2DOp<Device, T>()(context, use_cudnn_, cudnn_use_autotune_,
+                                     input, filter, fused_computation_, args,
+                                     params_, dimensions, output);
+  }
+
+ private:
+  bool FusedOpsMatchAndSupportedOnDevice(const std::vector<string>& fused_ops,
+                                         const std::vector<string>& expected,
+                                         bool cpu_only) const {
+    if (std::is_same<Device, GPUDevice>::value && cpu_only) {
+      return false;
+    }
+    return fused_ops == expected;
+  }
 
   Conv2DParameters params_;
+  bool use_cudnn_;
+  bool cudnn_use_autotune_;
+
   FusedComputationType fused_computation_;
 
-  // FusedBatchNorm attributes.
-  float epsilon_;
+  float epsilon_;  // Used only in FusedBatchNorm fusion
 
   TF_DISALLOW_COPY_AND_ASSIGN(FusedConv2DOp);
 };
 
-#define REGISTER_FUSED_CONV2D(T)                                      \
+// Registration of the CPU implementations.
+#define REGISTER_FUSED_CPU_CONV2D(T)                                  \
   REGISTER_KERNEL_BUILDER(                                            \
       Name("_FusedConv2D").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
-      FusedConv2DOp<T>);
+      FusedConv2DOp<CPUDevice, T>);
 
 // If we're using the alternative GEMM-based implementation of Conv2D for the
 // CPU implementation, don't register this EigenTensor-based version.
 // TODO(b/119765980): Upgrade upstream Eigen to set `m_can_use_xsmm=false` for
 // contractions with non-default contraction output kernels.
 #if !defined(USE_GEMM_FOR_CONV) && !defined(EIGEN_USE_LIBXSMM)
-TF_CALL_float(REGISTER_FUSED_CONV2D);
-TF_CALL_double(REGISTER_FUSED_CONV2D);
+TF_CALL_float(REGISTER_FUSED_CPU_CONV2D);
+TF_CALL_double(REGISTER_FUSED_CPU_CONV2D);
 #endif  // !USE_GEMM_FOR_CONV
 
+#undef REGISTER_FUSED_CPU_CONV2D
+
+#if GOOGLE_CUDA
+
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T)                                              \
+  template <>                                                            \
+  void TransformFilter<GPUDevice, T, int, 4>::operator()(                \
+      const GPUDevice& d, FilterTensorFormat dst_filter_format,          \
+      typename TTypes<T, 4, int>::ConstTensor in,                        \
+      typename TTypes<T, 4, int>::Tensor out);                           \
+  extern template struct TransformFilter<GPUDevice, T, int, 4>;          \
+  template <>                                                            \
+  void PadInput<GPUDevice, T, int, 4>::operator()(                       \
+      const GPUDevice& d, typename TTypes<T, 4, int>::ConstTensor in,    \
+      const std::array<int, 2>& padding_left,                            \
+      const std::array<int, 2>& padding_right,                           \
+      typename TTypes<T, 4, int>::Tensor out, TensorFormat data_format); \
+  extern template struct PadInput<GPUDevice, T, int, 4>
+
+DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(Eigen::half);
+DECLARE_GPU_SPEC(double);
+#undef DECLARE_GPU_SPEC
+}  // namespace functor
+
+// Registration of the GPU implementations.
+#define REGISTER_FUSED_GPU_CONV2D(T)                                  \
+  REGISTER_KERNEL_BUILDER(                                            \
+      Name("_FusedConv2D").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
+      FusedConv2DOp<GPUDevice, T>);
+
+TF_CALL_float(REGISTER_FUSED_GPU_CONV2D);
+TF_CALL_double(REGISTER_FUSED_GPU_CONV2D);
+
+#undef REGISTER_FUSED_GPU_CONV2D
+
+#endif  // GOOGLE_CUDA
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/conv_ops_fused_image_transform.cc b/tensorflow/core/kernels/conv_ops_fused_image_transform.cc
index 7be1de29c951dca16085e35587d02eeeec01354f..0542216a23d7a24c33d7600b155ec4dc6a92ae04 100644
--- a/tensorflow/core/kernels/conv_ops_fused_image_transform.cc
+++ b/tensorflow/core/kernels/conv_ops_fused_image_transform.cc
@@ -21,6 +21,7 @@ limitations under the License.
 
 #include <string>
 #include <vector>
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -29,7 +30,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/conv_2d.h"
 #include "tensorflow/core/kernels/conv_ops.h"
 #include "tensorflow/core/kernels/gemm_functors.h"
diff --git a/tensorflow/core/kernels/conv_ops_gpu.h b/tensorflow/core/kernels/conv_ops_gpu.h
index 21d135decdd459fc9bb6551f00ee5b6f546d2540..7a67658c4d88b9a5dc66635527f97719773e6f83 100644
--- a/tensorflow/core/kernels/conv_ops_gpu.h
+++ b/tensorflow/core/kernels/conv_ops_gpu.h
@@ -27,19 +27,19 @@ limitations under the License.
 
 namespace tensorflow {
 
-// Get the Cudnn workspace limit from the environment variable, which is in MB.
+// Get the Dnn workspace limit from the environment variable, which is in MB.
 // Return the workspace memory limit in bytes. If no value is set, return the
 // default value.
-int64 GetCudnnWorkspaceLimit(const string& envvar_in_mb,
-                             int64 default_value_in_bytes);
+int64 GetDnnWorkspaceLimit(const string& envvar_in_mb,
+                           int64 default_value_in_bytes);
 
 // A class to provide scratch-space allocator for Stream-Executor Cudnn
 // callback. TensorFlow is responsible for releasing the temporary buffers after
 // the kernel finishes.
-class CudnnScratchAllocator : public se::ScratchAllocator {
+class DnnScratchAllocator : public se::ScratchAllocator {
  public:
-  virtual ~CudnnScratchAllocator() {}
-  CudnnScratchAllocator(int64 memory_limit, OpKernelContext* context)
+  virtual ~DnnScratchAllocator() {}
+  DnnScratchAllocator(int64 memory_limit, OpKernelContext* context)
       : memory_limit_(memory_limit), total_byte_size_(0), context_(context) {}
   int64 GetMemoryLimitInBytes(se::Stream* stream) override {
     return memory_limit_;
diff --git a/tensorflow/core/kernels/conv_ops_test.cc b/tensorflow/core/kernels/conv_ops_test.cc
index bf98acdecfd1a3b8a946648c105f0d313f2296ab..a4cd67804ed11148e511b0695d82e71df12aa8ad 100644
--- a/tensorflow/core/kernels/conv_ops_test.cc
+++ b/tensorflow/core/kernels/conv_ops_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "absl/algorithm/container.h"
 #include "tensorflow/cc/ops/const_op.h"
 #include "tensorflow/cc/ops/image_ops.h"
 #include "tensorflow/cc/ops/nn_ops.h"
@@ -182,7 +183,7 @@ class FusedResizePadConvOpTest : public OpsTestBase {
                                bool resize_align_corners,
                                const string& pad_mode, int stride,
                                const string& padding, DataType dtype) {
-    auto root = tensorflow::Scope::NewRootScope();
+    Scope root = tensorflow::Scope::NewRootScope();
     using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
 
     Tensor input_data(DT_FLOAT,
@@ -243,7 +244,7 @@ class FusedResizePadConvOpTest : public OpsTestBase {
                                       int filter_count, const string& pad_mode,
                                       int stride, const string& padding,
                                       DataType dtype) {
-    auto root = tensorflow::Scope::NewRootScope();
+    Scope root = tensorflow::Scope::NewRootScope();
     using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
 
     Tensor input_data(DT_FLOAT,
@@ -544,28 +545,59 @@ class FusedConv2DOpTest : public OpsTestBase {
       const Tensor& mean_data, const Tensor& variance_data, Tensor* out)>;
 
   // Runs a Tensorflow graph defined by the root scope, and fetches the result
-  // of 'fetch' node into the output Tensor.
+  // of 'fetch' node into the output Tensor. Optional `fetch_node` parameter
+  // allows to define a fetch node directly using a NodeDef for the ops that are
+  // not supported by the C++ Api.
   void RunAndFetch(const tensorflow::Scope& root, const string& fetch,
-                   Tensor* output) {
+                   Tensor* output, bool allow_gpu_device,
+                   const NodeDef* fetch_node = nullptr) {
     tensorflow::GraphDef graph;
     TF_ASSERT_OK(root.ToGraphDef(&graph));
 
-    // `FusedConv2D` is available only on CPU, and in this test we don't want to
-    // compare GPU vs CPU numbers, so place all nodes on CPU.
-    for (NodeDef& mutable_node : *graph.mutable_node()) {
-      mutable_node.set_device("/device:CPU:0");
+    if (fetch_node) {
+      *graph.add_node() = *fetch_node;
     }
 
-    // Disable Grappler constant folding for the test graphs.
+    // We really want to make sure that graph executed exactly as we passed it
+    // to the session, so we disable various optimizations.
     tensorflow::SessionOptions session_options;
+
+    // Disable common runtime constant folding.
+    session_options.config.mutable_graph_options()
+        ->mutable_optimizer_options()
+        ->set_opt_level(OptimizerOptions::L0);
+
+    // Disable Grappler optimizations for tests.
     tensorflow::RewriterConfig* cfg =
         session_options.config.mutable_graph_options()
             ->mutable_rewrite_options();
     cfg->set_constant_folding(tensorflow::RewriterConfig::OFF);
+    cfg->set_layout_optimizer(tensorflow::RewriterConfig::OFF);
+    cfg->set_remapping(tensorflow::RewriterConfig::OFF);
 
     std::unique_ptr<tensorflow::Session> session(
         tensorflow::NewSession(session_options));
 
+    std::vector<DeviceAttributes> available_devices;
+    TF_ASSERT_OK(session->ListDevices(&available_devices))
+        << "Failed to get available session devices";
+
+    // Check if session has an available GPU device.
+    const bool has_gpu_device =
+        absl::c_any_of(available_devices, [](const DeviceAttributes& device) {
+          return device.device_type() == DEVICE_GPU;
+        });
+
+    // Some of the `FusedConv2D` fusion types are implemented only for CPU, and
+    // in this test we don't want to compare GPU vs CPU numbers, so place all
+    // nodes on CPU in this case.
+    const bool place_all_on_gpu = allow_gpu_device && has_gpu_device;
+
+    const string device = place_all_on_gpu ? "/device:GPU:0" : "/device:CPU:0";
+    for (NodeDef& mutable_node : *graph.mutable_node()) {
+      mutable_node.set_device(device);
+    }
+
     TF_ASSERT_OK(session->Create(graph));
 
     std::vector<Tensor> unfused_tensors;
@@ -576,41 +608,41 @@ class FusedConv2DOpTest : public OpsTestBase {
 
   void RunConv2DWithBias(const Tensor& input_data, const Tensor& filter_data,
                          const Tensor& bias_data, Tensor* output,
-                         int stride = 1) {
-    auto root = tensorflow::Scope::NewRootScope();
+                         bool allow_gpu_device = false, int stride = 1) {
+    Scope root = tensorflow::Scope::NewRootScope();
 
-    auto conv = ops::Conv2D(
+    ops::Conv2D conv = ops::Conv2D(
         root.WithOpName("conv"),
         ops::Const(root.WithOpName("input"), Input::Initializer(input_data)),
         ops::Const(root.WithOpName("filter"), Input::Initializer(filter_data)),
         {1, stride, stride, 1}, "SAME");
 
-    auto with_bias = ops::BiasAdd(
+    ops::BiasAdd with_bias = ops::BiasAdd(
         root.WithOpName("with_bias"), conv,
         ops::Const(root.WithOpName("bias"), Input::Initializer(bias_data)));
 
-    RunAndFetch(root, "with_bias", output);
+    RunAndFetch(root, "with_bias", output, allow_gpu_device);
   }
 
   void RunConv2DWithBiasAndRelu(const Tensor& input_data,
                                 const Tensor& filter_data,
                                 const Tensor& bias_data, Tensor* output,
-                                int stride = 1) {
-    auto root = tensorflow::Scope::NewRootScope();
+                                bool allow_gpu_device = false, int stride = 1) {
+    Scope root = tensorflow::Scope::NewRootScope();
 
-    auto conv = ops::Conv2D(
+    ops::Conv2D conv = ops::Conv2D(
         root.WithOpName("conv"),
         ops::Const(root.WithOpName("input"), Input::Initializer(input_data)),
         ops::Const(root.WithOpName("filter"), Input::Initializer(filter_data)),
         {1, stride, stride, 1}, "SAME");
 
-    auto with_bias = ops::BiasAdd(
+    ops::BiasAdd with_bias = ops::BiasAdd(
         root.WithOpName("with_bias"), conv,
         ops::Const(root.WithOpName("bias"), Input::Initializer(bias_data)));
 
-    auto with_relu = ops::Relu(root.WithOpName("with_relu"), with_bias);
+    ops::Relu with_relu = ops::Relu(root.WithOpName("with_relu"), with_bias);
 
-    RunAndFetch(root, "with_relu", output);
+    RunAndFetch(root, "with_relu", output, allow_gpu_device);
   }
 
   void RunConv2DWithBatchNorm(const Tensor& input_data,
@@ -619,10 +651,10 @@ class FusedConv2DOpTest : public OpsTestBase {
                               const Tensor& offset_data,
                               const Tensor& mean_data,
                               const Tensor& variance_data, Tensor* output,
-                              int stride = 1) {
-    auto root = tensorflow::Scope::NewRootScope();
+                              bool allow_gpu_device = false, int stride = 1) {
+    Scope root = tensorflow::Scope::NewRootScope();
 
-    auto conv = ops::Conv2D(
+    ops::Conv2D conv = ops::Conv2D(
         root.WithOpName("conv"),
         ops::Const(root.WithOpName("input"), Input::Initializer(input_data)),
         ops::Const(root.WithOpName("filter"), Input::Initializer(filter_data)),
@@ -631,7 +663,7 @@ class FusedConv2DOpTest : public OpsTestBase {
     ops::FusedBatchNorm::Attrs attr;
     attr = attr.IsTraining(false);
 
-    auto with_fused_batch_norm = ops::FusedBatchNorm(
+    ops::FusedBatchNorm with_fused_batch_norm = ops::FusedBatchNorm(
         root.WithOpName("with_fused_batch_norm"), conv,
         ops::Const(root.WithOpName("scale"), Input::Initializer(scale_data)),
         ops::Const(root.WithOpName("offset"), Input::Initializer(offset_data)),
@@ -639,19 +671,17 @@ class FusedConv2DOpTest : public OpsTestBase {
         ops::Const(root.WithOpName("var"), Input::Initializer(variance_data)),
         attr);
 
-    RunAndFetch(root, "with_fused_batch_norm", output);
+    RunAndFetch(root, "with_fused_batch_norm", output, allow_gpu_device);
   }
 
-  void RunConv2DWithBatchNormAndRelu(const Tensor& input_data,
-                                     const Tensor& filter_data,
-                                     const Tensor& scale_data,
-                                     const Tensor& offset_data,
-                                     const Tensor& mean_data,
-                                     const Tensor& variance_data,
-                                     Tensor* output, int stride = 1) {
-    auto root = tensorflow::Scope::NewRootScope();
+  void RunConv2DWithBatchNormAndRelu(
+      const Tensor& input_data, const Tensor& filter_data,
+      const Tensor& scale_data, const Tensor& offset_data,
+      const Tensor& mean_data, const Tensor& variance_data, Tensor* output,
+      bool allow_gpu_device = false, int stride = 1) {
+    Scope root = tensorflow::Scope::NewRootScope();
 
-    auto conv = ops::Conv2D(
+    ops::Conv2D conv = ops::Conv2D(
         root.WithOpName("conv"),
         ops::Const(root.WithOpName("input"), Input::Initializer(input_data)),
         ops::Const(root.WithOpName("filter"), Input::Initializer(filter_data)),
@@ -660,7 +690,7 @@ class FusedConv2DOpTest : public OpsTestBase {
     ops::FusedBatchNorm::Attrs attr;
     attr = attr.IsTraining(false);
 
-    auto with_fused_batch_norm = ops::FusedBatchNorm(
+    ops::FusedBatchNorm with_fused_batch_norm = ops::FusedBatchNorm(
         root.WithOpName("with_fused_batch_norm"), conv,
         ops::Const(root.WithOpName("scale"), Input::Initializer(scale_data)),
         ops::Const(root.WithOpName("offset"), Input::Initializer(offset_data)),
@@ -668,39 +698,47 @@ class FusedConv2DOpTest : public OpsTestBase {
         ops::Const(root.WithOpName("var"), Input::Initializer(variance_data)),
         attr);
 
-    auto with_relu =
+    ops::Relu with_relu =
         ops::Relu(root.WithOpName("with_relu"), with_fused_batch_norm.y);
 
-    RunAndFetch(root, "with_relu", output);
+    RunAndFetch(root, "with_relu", output, allow_gpu_device);
   }
 
-  void RunFusedConv2DOp(const Tensor& image, const Tensor& filter,
-                        const std::vector<Tensor>& args,
+  void RunFusedConv2DOp(const Tensor& input_data, const Tensor& filter_data,
+                        const std::vector<Tensor>& args_data,
                         const std::vector<string>& fused_ops, Tensor* output,
-                        int stride = 1) {
+                        bool allow_gpu_device = false, int stride = 1) {
+    Scope root = tensorflow::Scope::NewRootScope();
+
     DataType dtype = DataTypeToEnum<T>::v();
-    int num_args = static_cast<int>(args.size());
+    int num_args = static_cast<int>(args_data.size());
 
-    TF_EXPECT_OK(NodeDefBuilder("fused_conv_op", "_FusedConv2D")
-                     .Input(FakeInput(dtype))
-                     .Input(FakeInput(dtype))
+    Output input =
+        ops::Const(root.WithOpName("input"), Input::Initializer(input_data));
+    Output filter =
+        ops::Const(root.WithOpName("filter"), Input::Initializer(filter_data));
+
+    std::vector<NodeDefBuilder::NodeOut> args;
+    for (int i = 0; i < num_args; ++i) {
+      Output arg = ops::Const(root.WithOpName(absl::StrCat("arg", i)),
+                              Input::Initializer(args_data[i]));
+      args.emplace_back(arg.name(), 0, dtype);
+    }
+
+    NodeDef fused_conv2d;
+    TF_EXPECT_OK(NodeDefBuilder("fused_conv", "_FusedConv2D")
+                     .Input({input.name(), 0, dtype})
+                     .Input({filter.name(), 0, dtype})
+                     .Input(args)
                      .Attr("num_args", num_args)
-                     .Input(FakeInput(num_args, dtype))
                      .Attr("T", dtype)
                      .Attr("strides", {1, stride, stride, 1})
                      .Attr("padding", "SAME")
                      .Attr("fused_ops", fused_ops)
-                     .Finalize(node_def()));
+                     .Finalize(&fused_conv2d));
 
-    TF_EXPECT_OK(InitOp());
-
-    AddInputFromArray<T>(image.shape(), image.flat<T>());
-    AddInputFromArray<T>(filter.shape(), filter.flat<T>());
-    for (const Tensor& arg : args)
-      AddInputFromArray<T>(arg.shape(), arg.flat<T>());
-    TF_ASSERT_OK(RunOpKernel());
-
-    *output = *GetOutput(0);
+    RunAndFetch(root, fused_conv2d.name(), output, allow_gpu_device,
+                &fused_conv2d);
   }
 
   void VerifyBiasAddTensorsNear(int depth, int image_width, int image_height,
@@ -732,13 +770,14 @@ class FusedConv2DOpTest : public OpsTestBase {
     ASSERT_EQ(conv_2d.dtype(), fused_conv_2d.dtype());
     ASSERT_EQ(conv_2d.shape(), fused_conv_2d.shape());
 
-    // NOTE(ezhulenev): When filter size is equal to the input image size, we
-    // effectevily do element-wise product and full sum reduction, and these
-    // operations intoroduce higher than "normal" numerical errors.
+    // NOTE(intel-tf): When filter_size is equal to the input image size,
+    // conv2d essentially is element-wise multiplication followed by
+    // a full sum reduction, which causes larger numerical error
+    // than usual cases.
     if (image_width == filter_size && image_height == filter_size) {
-      test::ExpectTensorNear<T>(conv_2d, fused_conv_2d, 1e-3);
+      test::ExpectClose(conv_2d, fused_conv_2d, /*atol=*/1e-4);
     } else {
-      test::ExpectClose(conv_2d, fused_conv_2d);
+      test::ExpectClose(conv_2d, fused_conv_2d, /*atol=*/1e-6);
     }
   }
 
@@ -781,13 +820,14 @@ class FusedConv2DOpTest : public OpsTestBase {
     ASSERT_EQ(conv_2d.dtype(), fused_conv_2d.dtype());
     ASSERT_EQ(conv_2d.shape(), fused_conv_2d.shape());
 
-    // NOTE(ezhulenev): When filter size is equal to the input image size, we
-    // effectevily do element-wise product and full sum reduction, and these
-    // operations intoroduce higher than "normal" numerical errors.
+    // NOTE(intel-tf): When filter_size is equal to the input image size,
+    // conv2d essentially is element-wise multiplication followed by
+    // a full sum reduction, which causes larger numerical error
+    // than usual cases.
     if (image_width == filter_size && image_height == filter_size) {
-      test::ExpectTensorNear<T>(conv_2d, fused_conv_2d, 1e-3);
+      test::ExpectClose(conv_2d, fused_conv_2d, /*atol=*/1e-4);
     } else {
-      test::ExpectClose(conv_2d, fused_conv_2d);
+      test::ExpectClose(conv_2d, fused_conv_2d, /*atol=*/1e-6);
     }
   }
 
@@ -825,14 +865,15 @@ class FusedConv2DOpTest : public OpsTestBase {
     const BiasAddGraphRunner run_default =
         [this](const Tensor& input_data, const Tensor& filter_data,
                const Tensor& bias_data, Tensor* out) {
-          RunConv2DWithBiasAndRelu(input_data, filter_data, bias_data, out);
+          RunConv2DWithBiasAndRelu(input_data, filter_data, bias_data, out,
+                                   /*allow_gpu_device=*/true);
         };
 
     const BiasAddGraphRunner run_fused =
         [this](const Tensor& input_data, const Tensor& filter_data,
                const Tensor& bias_data, Tensor* out) {
           RunFusedConv2DOp(input_data, filter_data, {bias_data},
-                           {"BiasAdd", "Relu"}, out);
+                           {"BiasAdd", "Relu"}, out, /*allow_gpu_device=*/true);
         };
 
     VerifyBiasAddTensorsNear(depth, image_width, image_height,
@@ -911,8 +952,8 @@ class FusedConv2DWithBiasOpTest : public FusedConv2DOpTest<T> {};
 template <typename T>
 class FusedConv2DWithBatchNormOpTest : public FusedConv2DOpTest<T> {};
 
-TYPED_TEST_CASE_P(FusedConv2DWithBiasOpTest);
-TYPED_TEST_CASE_P(FusedConv2DWithBatchNormOpTest);
+TYPED_TEST_SUITE_P(FusedConv2DWithBiasOpTest);
+TYPED_TEST_SUITE_P(FusedConv2DWithBatchNormOpTest);
 
 // -------------------------------------------------------------------------- //
 // Conv2D + BiasAdd + {Relu}                                                  //
@@ -994,29 +1035,29 @@ TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, SpatialConvolutionAndRelu) {
   this->VerifyConv2DWithBatchNormAndRelu(filter_size, filter_count);
 }
 
-REGISTER_TYPED_TEST_CASE_P(FusedConv2DWithBiasOpTest,    //
-                           OneByOneConvolution,          //
-                           ImageSizeConvolution,         //
-                           SpatialConvolution,           //
-                           OneByOneConvolutionAndRelu,   //
-                           ImageSizeConvolutionAndRelu,  //
-                           SpatialConvolutionAndRelu);
-
-REGISTER_TYPED_TEST_CASE_P(FusedConv2DWithBatchNormOpTest,  //
-                           OneByOneConvolution,             //
-                           ImageSizeConvolution,            //
-                           SpatialConvolution,              //
-                           OneByOneConvolutionAndRelu,      //
-                           ImageSizeConvolutionAndRelu,     //
-                           SpatialConvolutionAndRelu);
+REGISTER_TYPED_TEST_SUITE_P(FusedConv2DWithBiasOpTest,    //
+                            OneByOneConvolution,          //
+                            ImageSizeConvolution,         //
+                            SpatialConvolution,           //
+                            OneByOneConvolutionAndRelu,   //
+                            ImageSizeConvolutionAndRelu,  //
+                            SpatialConvolutionAndRelu);
+
+REGISTER_TYPED_TEST_SUITE_P(FusedConv2DWithBatchNormOpTest,  //
+                            OneByOneConvolution,             //
+                            ImageSizeConvolution,            //
+                            SpatialConvolution,              //
+                            OneByOneConvolutionAndRelu,      //
+                            ImageSizeConvolutionAndRelu,     //
+                            SpatialConvolutionAndRelu);
 
 using FusedBiasAddDataTypes = ::testing::Types<float, double>;
-INSTANTIATE_TYPED_TEST_CASE_P(Test, FusedConv2DWithBiasOpTest,
-                              FusedBiasAddDataTypes);
+INSTANTIATE_TYPED_TEST_SUITE_P(Test, FusedConv2DWithBiasOpTest,
+                               FusedBiasAddDataTypes);
 
 using FusedBatchNormDataTypes = ::testing::Types<float>;
-INSTANTIATE_TYPED_TEST_CASE_P(Test, FusedConv2DWithBatchNormOpTest,
-                              FusedBatchNormDataTypes);
+INSTANTIATE_TYPED_TEST_SUITE_P(Test, FusedConv2DWithBatchNormOpTest,
+                               FusedBatchNormDataTypes);
 
 ////////////////////////////////////////////////////////////////////////////////
 // Performance benchmarks for the FusedConv2DWithBiasOp.                      //
@@ -1455,4 +1496,38 @@ BM_FusedConv2DWithBatchNormAndRelu(16, 32, 32, 128, 3, 3, 1024, cpu,
 BM_FusedConv2DWithBatchNormAndRelu(32, 32, 32, 128, 3, 3, 1024, cpu,
                                    "3x3 /b 32");
 
+#if GOOGLE_CUDA
+// -------------------------------------------------------------------------- //
+// 1x1 Convolution
+// -------------------------------------------------------------------------- //
+
+BM_Conv2D(8, 32, 32, 128, 1, 1, 1024, gpu, "1x1 /b 8");
+BM_Conv2D(16, 32, 32, 128, 1, 1, 1024, gpu, "1x1 /b 16");
+BM_Conv2D(32, 32, 32, 128, 1, 1, 1024, gpu, "1x1 /b 32");
+
+BM_Conv2DWithBiasAndRelu(8, 32, 32, 128, 1, 1, 1024, gpu, "1x1 /b 8");
+BM_Conv2DWithBiasAndRelu(16, 32, 32, 128, 1, 1, 1024, gpu, "1x1 /b 16");
+BM_Conv2DWithBiasAndRelu(32, 32, 32, 128, 1, 1, 1024, gpu, "1x1 /b 32");
+
+BM_FusedConv2DWithBiasAndRelu(8, 32, 32, 128, 1, 1, 1024, gpu, "1x1 /b 8");
+BM_FusedConv2DWithBiasAndRelu(16, 32, 32, 128, 1, 1, 1024, gpu, "1x1 /b 16");
+BM_FusedConv2DWithBiasAndRelu(32, 32, 32, 128, 1, 1, 1024, gpu, "1x1 /b 32");
+
+// -------------------------------------------------------------------------- //
+// 3x3 Convolution
+// -------------------------------------------------------------------------- //
+
+BM_Conv2D(8, 32, 32, 128, 3, 3, 1024, gpu, "3x3 /b 8");
+BM_Conv2D(16, 32, 32, 128, 3, 3, 1024, gpu, "3x3 /b 16");
+BM_Conv2D(32, 32, 32, 128, 3, 3, 1024, gpu, "3x3 /b 32");
+
+BM_Conv2DWithBiasAndRelu(8, 32, 32, 128, 3, 3, 1024, gpu, "3x3 /b 8");
+BM_Conv2DWithBiasAndRelu(16, 32, 32, 128, 3, 3, 1024, gpu, "3x3 /b 16");
+BM_Conv2DWithBiasAndRelu(32, 32, 32, 128, 3, 3, 1024, gpu, "3x3 /b 32");
+
+BM_FusedConv2DWithBiasAndRelu(8, 32, 32, 128, 3, 3, 1024, gpu, "3x3 /b 8");
+BM_FusedConv2DWithBiasAndRelu(16, 32, 32, 128, 3, 3, 1024, gpu, "3x3 /b 16");
+BM_FusedConv2DWithBiasAndRelu(32, 32, 32, 128, 3, 3, 1024, gpu, "3x3 /b 32");
+#endif
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/conv_ops_using_gemm.cc b/tensorflow/core/kernels/conv_ops_using_gemm.cc
index af0a9fa82ee5778fa9e18cea59cf759fa468224f..05df9e0207e505bfd5b9a3bc9c5b7b2c90a0fa30 100644
--- a/tensorflow/core/kernels/conv_ops_using_gemm.cc
+++ b/tensorflow/core/kernels/conv_ops_using_gemm.cc
@@ -49,6 +49,7 @@ limitations under the License.
 #include <string.h>
 #include <map>
 #include <vector>
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -57,7 +58,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/conv_ops.h"
 #include "tensorflow/core/kernels/gemm_functors.h"
 #include "tensorflow/core/kernels/image_resizer_state.h"
diff --git a/tensorflow/core/kernels/crop_and_resize_op.cc b/tensorflow/core/kernels/crop_and_resize_op.cc
index 99d01b4db6bac68d890d93ac55bea576f43a5994..838cedd7a4aeeee4b1871bf4c64bbc0c871fdac9 100644
--- a/tensorflow/core/kernels/crop_and_resize_op.cc
+++ b/tensorflow/core/kernels/crop_and_resize_op.cc
@@ -23,11 +23,11 @@ limitations under the License.
 #include <string>
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/core/kernels/crop_and_resize_op_test.cc b/tensorflow/core/kernels/crop_and_resize_op_test.cc
index 6921020d09e94fa7b99d7ca6cb95c82274b2e4c0..0eadf4c1714f6987f0a91c153f59d56ce0254014 100644
--- a/tensorflow/core/kernels/crop_and_resize_op_test.cc
+++ b/tensorflow/core/kernels/crop_and_resize_op_test.cc
@@ -423,7 +423,7 @@ TEST_F(CropAndResizeOpTest, TestWithSharding) {
   //  ... (altogether 999 lines)
   //  0, 1, 2, ..., 998
   AddInput<float>(TensorShape({1, kLength, kLength, 1}),
-                  [kLength](int i) -> float { return i % kLength; });
+                  [](int i) -> float { return i % kLength; });
   AddInputFromArray<float>(TensorShape({2, 4}),
                            {0, 0, 0.5, 0.5, 0.5, 0.5, 1, 1});
   AddInputFromArray<int32>(TensorShape({2}), {0, 0});
@@ -437,15 +437,15 @@ TEST_F(CropAndResizeOpTest, TestWithSharding) {
   //  ... (altogether 500 lines)
   //  0, 1, 2, ..., 499
   Tensor result1(allocator(), DT_FLOAT, TensorShape({1, kHalf, kHalf, 1}));
-  test::FillFn<float>(&result1, [kHalf](int i) -> float { return i % kHalf; });
+  test::FillFn<float>(&result1, [](int i) -> float { return i % kHalf; });
 
   // Result 2:
   //  499, 500, 501, ..., 998
   //  ... (altogether 500 lines)
   //  499, 500, 501, ..., 998
   Tensor result2(allocator(), DT_FLOAT, TensorShape({1, kHalf, kHalf, 1}));
-  test::FillFn<float>(
-      &result2, [kHalf](int i) -> float { return i % kHalf + kHalf - 1; });
+  test::FillFn<float>(&result2,
+                      [](int i) -> float { return i % kHalf + kHalf - 1; });
 
   // Expected result is the concat of the two tensors.
   Tensor expected(allocator(), DT_FLOAT, TensorShape({2, kHalf, kHalf, 1}));
diff --git a/tensorflow/core/kernels/ctc_loss_op.cc b/tensorflow/core/kernels/ctc_loss_op.cc
index fb375ee4b351e4d15c234f9290ecc8780b096c32..aa68e105addab65cdc3ad468547e6e1273834077 100644
--- a/tensorflow/core/kernels/ctc_loss_op.cc
+++ b/tensorflow/core/kernels/ctc_loss_op.cc
@@ -15,10 +15,10 @@ limitations under the License.
 
 // See docs in ../ops/ctc_ops.cc.
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/util/ctc/ctc_loss_calculator.h"
diff --git a/tensorflow/core/kernels/cuda_solvers.cc b/tensorflow/core/kernels/cuda_solvers.cc
index a59baaa96fc73cc442287dfb4550bc2f6932956b..82d92388d401af176d6a555f4f0e51af84caef11 100644
--- a/tensorflow/core/kernels/cuda_solvers.cc
+++ b/tensorflow/core/kernels/cuda_solvers.cc
@@ -643,6 +643,50 @@ static inline Status GesvdImpl(
 
 TF_CALL_LAPACK_TYPES_NO_COMPLEX(GESVD_INSTANCE);
 
+template <typename Scalar, typename BufSizeFnT, typename SolverFnT>
+static inline Status GesvdjBatchedImpl(BufSizeFnT bufsize, SolverFnT solver,
+                                       CudaSolver* cuda_solver,
+                                       OpKernelContext* context,
+                                       cusolverDnHandle_t cusolver_dn_handle,
+                                       cusolverEigMode_t jobz, int m, int n,
+                                       Scalar* A, int lda, Scalar* S, Scalar* U,
+                                       int ldu, Scalar* V, int ldv,
+                                       int* dev_lapack_info, int batch_size) {
+  mutex_lock lock(handle_map_mutex);
+  /* Get amount of workspace memory required. */
+  int lwork;
+  /* Default parameters for gesvdj and gesvdjBatched. */
+  gesvdjInfo_t svdj_info;
+  TF_RETURN_IF_CUSOLVER_ERROR(cusolverDnCreateGesvdjInfo(&svdj_info));
+  TF_RETURN_IF_CUSOLVER_ERROR(bufsize(
+      cusolver_dn_handle, jobz, m, n, CUDAComplex(A), lda, S, CUDAComplex(U),
+      ldu, CUDAComplex(V), ldv, &lwork, svdj_info, batch_size));
+  /* Allocate device memory for workspace. */
+  auto dev_workspace =
+      cuda_solver->GetScratchSpace<Scalar>(lwork, "", /* on_host */ false);
+  TF_RETURN_IF_CUSOLVER_ERROR(solver(
+      cusolver_dn_handle, jobz, m, n, CUDAComplex(A), lda, S, CUDAComplex(U),
+      ldu, CUDAComplex(V), ldv, CUDAComplex(dev_workspace.mutable_data()),
+      lwork, dev_lapack_info, svdj_info, batch_size));
+  TF_RETURN_IF_CUSOLVER_ERROR(cusolverDnDestroyGesvdjInfo(svdj_info));
+  return Status::OK();
+}
+
+#define GESVDJBATCHED_INSTANCE(Scalar, type_prefix)                            \
+  template <>                                                                  \
+  Status CudaSolver::GesvdjBatched<Scalar>(                                    \
+      cusolverEigMode_t jobz, int m, int n, Scalar* dev_A, int lda,            \
+      Scalar* dev_S, Scalar* dev_U, int ldu, Scalar* dev_V, int ldv,           \
+      int* dev_lapack_info, int batch_size) {                                  \
+    return GesvdjBatchedImpl(DN_BUFSIZE_FN(gesvdjBatched, type_prefix),        \
+                             DN_SOLVER_FN(gesvdjBatched, type_prefix), this,   \
+                             context_, cusolver_dn_handle_, jobz, m, n, dev_A, \
+                             lda, dev_S, dev_U, ldu, dev_V, ldv,               \
+                             dev_lapack_info, batch_size);                     \
+  }
+
+TF_CALL_LAPACK_TYPES_NO_COMPLEX(GESVDJBATCHED_INSTANCE);
+
 //=============================================================================
 // Wrappers of cuBlas computational methods begin here.
 //
@@ -692,8 +736,8 @@ static inline Status GetrsBatchedImpl(
     SolverFnT solver, CudaSolver* cuda_solver, OpKernelContext* context,
     cublasHandle_t cublas_handle, cublasOperation_t trans, int n, int nrhs,
     const Scalar* const host_a_dev_ptrs[], int lda, const int* dev_pivots,
-    const Scalar* const host_b_dev_ptrs[], int ldb,
-    DeviceLapackInfo* dev_lapack_info, int batch_size) {
+    const Scalar* const host_b_dev_ptrs[], int ldb, int* host_lapack_info,
+    int batch_size) {
   mutex_lock lock(handle_map_mutex);
   using CudaScalar = typename CUDAComplexT<Scalar>::type;
   ScratchSpace<uint8> dev_a_dev_ptrs =
@@ -714,7 +758,7 @@ static inline Status GetrsBatchedImpl(
       cublas_handle, trans, n, nrhs,
       reinterpret_cast<const CudaScalar* const*>(dev_a_dev_ptrs.data()), lda,
       dev_pivots, reinterpret_cast<CudaScalar**>(dev_b_dev_ptrs.mutable_data()),
-      ldb, dev_lapack_info->mutable_data(), batch_size));
+      ldb, host_lapack_info, batch_size));
   return Status::OK();
 }
 
@@ -723,13 +767,13 @@ static inline Status GetrsBatchedImpl(
   Status CudaSolver::GetrsBatched(                                             \
       cublasOperation_t trans, int n, int nrhs,                                \
       const Scalar* const host_a_dev_ptrs[], int lda, const int* dev_pivots,   \
-      const Scalar* const host_b_dev_ptrs[], int ldb,                          \
-      DeviceLapackInfo* dev_lapack_info, int batch_size) {                     \
+      const Scalar* const host_b_dev_ptrs[], int ldb, int* host_lapack_info,   \
+      int batch_size) {                                                        \
     return GetrsBatchedImpl(reinterpret_cast<getrs_##type_prefix*>(            \
                                 BLAS_SOLVER_FN(getrsBatched, type_prefix)),    \
                             this, context_, cublas_handle_, trans, n, nrhs,    \
                             host_a_dev_ptrs, lda, dev_pivots, host_b_dev_ptrs, \
-                            ldb, dev_lapack_info, batch_size);                 \
+                            ldb, host_lapack_info, batch_size);                \
   }
 
 TF_CALL_LAPACK_TYPES(GETRS_BATCHED_INSTANCE);
diff --git a/tensorflow/core/kernels/cuda_solvers.h b/tensorflow/core/kernels/cuda_solvers.h
index 2c30d036df71f917f7e302141f577a49ed4c5112..fa8b4e241556afef82537db118706ebd35539987 100644
--- a/tensorflow/core/kernels/cuda_solvers.h
+++ b/tensorflow/core/kernels/cuda_solvers.h
@@ -235,13 +235,14 @@ class CudaSolver {
                       int batch_size) TF_MUST_USE_RESULT;
 
   // Batched linear solver using LU factorization from getrfBatched.
-  // See:
+  // Notice that lapack_info is returned on the host, as opposed to
+  // most of the other functions that return it on the device. See:
   // http://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-getrsbatched
   template <typename Scalar>
   Status GetrsBatched(cublasOperation_t trans, int n, int nrhs,
                       const Scalar* const dev_Aarray[], int lda,
                       const int* devIpiv, const Scalar* const dev_Barray[],
-                      int ldb, DeviceLapackInfo* dev_lapack_info,
+                      int ldb, int* host_lapack_info,
                       int batch_size) TF_MUST_USE_RESULT;
 
   // Computes matrix inverses for a batch of small matrices. Uses the outputs
@@ -311,6 +312,11 @@ class CudaSolver {
   Status Gesvd(signed char jobu, signed char jobvt, int m, int n, Scalar* dev_A,
                int lda, Scalar* dev_S, Scalar* dev_U, int ldu, Scalar* dev_VT,
                int ldvt, int* dev_lapack_info) TF_MUST_USE_RESULT;
+  template <typename Scalar>
+  Status GesvdjBatched(cusolverEigMode_t jobz, int m, int n, Scalar* dev_A,
+                       int lda, Scalar* dev_S, Scalar* dev_U, int ldu,
+                       Scalar* dev_V, int ldv, int* dev_lapack_info,
+                       int batch_size);
 
  private:
   OpKernelContext* context_;  // not owned.
diff --git a/tensorflow/core/kernels/cudnn_rnn_ops.cc b/tensorflow/core/kernels/cudnn_rnn_ops.cc
index fbd702ef14ed2b810b9cb08679baf8688ca58d9c..196494cbcf8b7f4f670599241d5bdbb1c29c7cd1 100644
--- a/tensorflow/core/kernels/cudnn_rnn_ops.cc
+++ b/tensorflow/core/kernels/cudnn_rnn_ops.cc
@@ -105,6 +105,12 @@ class CudnnRNNForwardOpV2;
 template <typename Device, typename T>
 class CudnnRNNBackwardOpV2;
 
+template <typename Device, typename T>
+class CudnnRNNForwardOpV3;
+
+template <typename Device, typename T>
+class CudnnRNNBackwardOpV3;
+
 enum class TFRNNInputMode {
   kRNNLinearInput = 0,
   kRNNSkipInput = 1,
@@ -142,13 +148,13 @@ uint64 HashList(const std::vector<int>& list) {
 class CudnnRnnParameters {
  public:
   CudnnRnnParameters(int num_layers, int input_size, int num_units,
-                     int seq_length, int batch_size, int dir_count,
+                     int max_seq_length, int batch_size, int dir_count,
                      bool has_dropout, bool is_training, RnnMode rnn_mode,
                      TFRNNInputMode rnn_input_mode, DataType dtype)
       : num_layers_(num_layers),
         input_size_(input_size),
         num_units_(num_units),
-        seq_length_(seq_length),
+        seq_length_(max_seq_length),
         batch_size_(batch_size),
         dir_count_(dir_count),
         has_dropout_(has_dropout),
@@ -156,10 +162,11 @@ class CudnnRnnParameters {
         rnn_mode_(rnn_mode),
         rnn_input_mode_(rnn_input_mode),
         dtype_(dtype) {
-    hash_code_ = HashList(
-        {num_layers, input_size, num_units, seq_length, batch_size, dir_count,
-         static_cast<int>(has_dropout), static_cast<int>(is_training),
-         static_cast<int>(rnn_mode), static_cast<int>(rnn_input_mode), dtype});
+    hash_code_ =
+        HashList({num_layers, input_size, num_units, max_seq_length, batch_size,
+                  dir_count, static_cast<int>(has_dropout),
+                  static_cast<int>(is_training), static_cast<int>(rnn_mode),
+                  static_cast<int>(rnn_input_mode), dtype});
   }
 
   bool operator==(const CudnnRnnParameters& other) const {
@@ -493,7 +500,7 @@ struct CudnnRnnModelShapes {
   int input_size;
   int num_units;
   int dir_count;
-  int seq_length;
+  int max_seq_length;
   int batch_size;
   TensorShape input_shape;
   TensorShape output_shape;
@@ -505,9 +512,10 @@ struct CudnnRnnModelShapes {
   }
   string DebugString() const {
     return strings::Printf(
-        "[num_layers, input_size, num_units, dir_count, seq_length, "
+        "[num_layers, input_size, num_units, dir_count, max_seq_length, "
         "batch_size]: [%d, %d, %d, %d, %d, %d] ",
-        num_layers, input_size, num_units, dir_count, seq_length, batch_size);
+        num_layers, input_size, num_units, dir_count, max_seq_length,
+        batch_size);
   }
 };
 
@@ -565,7 +573,7 @@ Status ExtractForwardInput(OpKernelContext* context,
   if ((*input)->dims() != 3) {
     return errors::InvalidArgument("RNN input must be a 3-D vector.");
   }
-  model_shapes->seq_length = (*input)->dim_size(0);
+  model_shapes->max_seq_length = (*input)->dim_size(0);
   model_shapes->batch_size = (*input)->dim_size(1);
   model_shapes->input_size = (*input)->dim_size(2);
   model_shapes->input_shape = (*input)->shape();
@@ -597,17 +605,31 @@ Status ExtractForwardInput(OpKernelContext* context,
     }
   }
   model_shapes->output_shape =
-      TensorShape({model_shapes->seq_length, model_shapes->batch_size,
+      TensorShape({model_shapes->max_seq_length, model_shapes->batch_size,
                    model_shapes->dir_count * model_shapes->num_units});
   return Status::OK();
 }
 
+// Extract and checks the sequence_lengths, forward input tensors,
+// parameters, and shapes from the OpKernelContext.
+Status ExtractForwardInput(OpKernelContext* context,
+                           const CudnnModelTypes& model_types,
+                           const Tensor** input, const Tensor** input_h,
+                           const Tensor** input_c, const Tensor** params,
+                           CudnnRnnModelShapes* model_shapes,
+                           const Tensor** sequence_lengths) {
+  TF_RETURN_IF_ERROR(context->input("sequence_lengths", sequence_lengths));
+  return ExtractForwardInput(context, model_types, input, input_h, input_c,
+                             params, model_shapes);
+}
+
 template <typename T>
 Status CreateForwardAndBackwardIODescriptors(
     OpKernelContext* context, const CudnnRnnModelShapes& model_shapes,
     std::unique_ptr<RnnSequenceTensorDescriptor>* input_desc,
     std::unique_ptr<RnnStateTensorDescriptor>* state_desc,
-    std::unique_ptr<RnnSequenceTensorDescriptor>* output_desc) {
+    std::unique_ptr<RnnSequenceTensorDescriptor>* output_desc,
+    const absl::Span<const int>& seq_lengths) {
   StreamExecutor* executor = context->op_device_context()->stream()->parent();
   se::dnn::DataType data_type = ToDataType<T>::value;
 
@@ -616,11 +638,19 @@ Status CreateForwardAndBackwardIODescriptors(
   const TensorShape& output_shape = model_shapes.output_shape;
 
   DCHECK_EQ(input_shape.dims(), 3);
-  auto input_desc_s = executor->createRnnSequenceTensorDescriptor(
-      input_shape.dim_size(0), input_shape.dim_size(1), input_shape.dim_size(2),
-      data_type);
-  TF_RETURN_IF_ERROR(input_desc_s.status());
-  *input_desc = input_desc_s.ConsumeValueOrDie();
+  if (seq_lengths.data() != nullptr) {
+    auto input_desc_s = executor->createRnnSequenceTensorDescriptor(
+        input_shape.dim_size(0), input_shape.dim_size(1),
+        input_shape.dim_size(2), seq_lengths, data_type);
+    TF_RETURN_IF_ERROR(input_desc_s.status());
+    *input_desc = input_desc_s.ConsumeValueOrDie();
+  } else {
+    auto input_desc_s = executor->createRnnSequenceTensorDescriptor(
+        input_shape.dim_size(0), input_shape.dim_size(1),
+        input_shape.dim_size(2), data_type);
+    TF_RETURN_IF_ERROR(input_desc_s.status());
+    *input_desc = input_desc_s.ConsumeValueOrDie();
+  }
 
   DCHECK_EQ(hidden_state_shape.dims(), 3);
   auto hidden_state_desc_s = executor->createRnnStateTensorDescriptor(
@@ -630,11 +660,20 @@ Status CreateForwardAndBackwardIODescriptors(
   *state_desc = hidden_state_desc_s.ConsumeValueOrDie();
 
   DCHECK_EQ(output_shape.dims(), 3);
-  auto output_desc_s = executor->createRnnSequenceTensorDescriptor(
-      output_shape.dim_size(0), output_shape.dim_size(1),
-      output_shape.dim_size(2), data_type);
-  TF_RETURN_IF_ERROR(output_desc_s.status());
-  *output_desc = output_desc_s.ConsumeValueOrDie();
+  if (seq_lengths.data() != nullptr) {
+    auto output_desc_s = executor->createRnnSequenceTensorDescriptor(
+        output_shape.dim_size(0), output_shape.dim_size(1),
+        output_shape.dim_size(2), seq_lengths, data_type);
+    TF_RETURN_IF_ERROR(output_desc_s.status());
+    *output_desc = output_desc_s.ConsumeValueOrDie();
+  } else {
+    auto output_desc_s = executor->createRnnSequenceTensorDescriptor(
+        output_shape.dim_size(0), output_shape.dim_size(1),
+        output_shape.dim_size(2), data_type);
+    TF_RETURN_IF_ERROR(output_desc_s.status());
+    *output_desc = output_desc_s.ConsumeValueOrDie();
+  }
+
   return Status::OK();
 }
 
@@ -648,6 +687,7 @@ Status DoForward(OpKernelContext* context, const RnnDescriptor& rnn_desc,
                  const bool is_training,
                  /* forward outputs, outputs of the function */
                  Tensor* output, Tensor* output_h, Tensor* output_c,
+                 const Tensor* sequence_lengths,
                  ScratchAllocator* reserve_space_allocator,
                  ScratchAllocator* workspace_allocator,
                  ProfileResult* output_profile_result) {
@@ -655,8 +695,14 @@ Status DoForward(OpKernelContext* context, const RnnDescriptor& rnn_desc,
   std::unique_ptr<RnnStateTensorDescriptor> state_desc;
   std::unique_ptr<RnnSequenceTensorDescriptor> output_desc;
 
+  absl::Span<const int> seq_lengths;
+  if (sequence_lengths != nullptr) {
+    seq_lengths = absl::Span<const int>(
+        sequence_lengths->template flat<int>().data(), model_shapes.batch_size);
+  }
   TF_RETURN_IF_ERROR(CreateForwardAndBackwardIODescriptors<T>(
-      context, model_shapes, &input_desc, &state_desc, &output_desc));
+      context, model_shapes, &input_desc, &state_desc, &output_desc,
+      seq_lengths));
 
   auto input_data = AsDeviceMemory<T>(input);
   auto input_h_data = AsDeviceMemory<T>(input_h);
@@ -664,6 +710,7 @@ Status DoForward(OpKernelContext* context, const RnnDescriptor& rnn_desc,
   if (model_types.HasInputC()) {
     input_c_data = AsDeviceMemory<T>(input_c);
   }
+
   auto params_data = AsDeviceMemory<T>(params);
   auto output_data = AsDeviceMemory<T>(output);
   auto output_h_data = AsDeviceMemory<T>(output_h);
@@ -696,21 +743,28 @@ Status DoBackward(
     /* forward inputs */
     const Tensor* input, const Tensor* input_h, const Tensor* input_c,
     const Tensor* params,
-    /* forward outptus */
+    /* forward outputs */
     const Tensor* output, const Tensor* output_h, const Tensor* output_c,
     /* backprop inputs */
     const Tensor* output_backprop, const Tensor* output_h_backprop,
     const Tensor* output_c_backprop, const Tensor* reserve_space,
     /* backprop outputs, output of the function */
     Tensor* input_backprop, Tensor* input_h_backprop, Tensor* input_c_backprop,
-    Tensor* params_backprop, ScratchAllocator* workspace_allocator,
+    Tensor* params_backprop, const Tensor* sequence_lengths,
+    ScratchAllocator* workspace_allocator,
     ProfileResult* output_profile_result) {
   std::unique_ptr<RnnSequenceTensorDescriptor> input_desc;
   std::unique_ptr<RnnStateTensorDescriptor> state_desc;
   std::unique_ptr<RnnSequenceTensorDescriptor> output_desc;
 
+  absl::Span<const int> seq_lengths;
+  if (sequence_lengths != nullptr) {
+    seq_lengths = absl::Span<const int>(
+        sequence_lengths->template flat<int>().data(), model_shapes.batch_size);
+  }
   TF_RETURN_IF_ERROR(CreateForwardAndBackwardIODescriptors<T>(
-      context, model_shapes, &input_desc, &state_desc, &output_desc));
+      context, model_shapes, &input_desc, &state_desc, &output_desc,
+      seq_lengths));
 
   auto input_data = AsDeviceMemory<T>(input);
   auto input_h_data = AsDeviceMemory<T>(input_h);
@@ -1162,22 +1216,31 @@ class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
 
   void Compute(OpKernelContext* context) override {
     AlgorithmConfig algo_config;
-    ComputeAndReturnAlgorithm(context, &algo_config);
+    ComputeAndReturnAlgorithm(context, &algo_config, false);
   }
 
  protected:
   virtual void ComputeAndReturnAlgorithm(OpKernelContext* context,
-                                         AlgorithmConfig* output_algo_config) {
+                                         AlgorithmConfig* output_algo_config,
+                                         bool var_seq_lengths) {
     CHECK_NE(output_algo_config, nullptr);
 
     const Tensor* input = nullptr;
     const Tensor* input_h = nullptr;
     const Tensor* input_c = nullptr;
     const Tensor* params = nullptr;
+    const Tensor* sequence_lengths = nullptr;
     CudnnRnnModelShapes model_shapes;
-    OP_REQUIRES_OK(context,
-                   ExtractForwardInput(context, model_types(), &input, &input_h,
+    if (var_seq_lengths) {
+      OP_REQUIRES_OK(
+          context, ExtractForwardInput(context, model_types(), &input, &input_h,
+                                       &input_c, &params, &model_shapes,
+                                       &sequence_lengths));
+    } else {
+      OP_REQUIRES_OK(
+          context, ExtractForwardInput(context, model_types(), &input, &input_h,
                                        &input_c, &params, &model_shapes));
+    }
     RnnInputMode input_mode;
     OP_REQUIRES_OK(context,
                    ToRNNInputMode(rnn_input_mode(), model_shapes.num_units,
@@ -1215,11 +1278,19 @@ class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
           context, GetCachedRnnDescriptor<T>(context, model_shapes, input_mode,
                                              *output_algo_config,
                                              &rnn_state_cache_, &rnn_desc_ptr));
-      launch_status = DoForward<T>(
-          context, *rnn_desc_ptr, model_types(), model_shapes, input, input_h,
-          input_c, params, is_training_, output, output_h, output_c,
-          &reserve_space_allocator, &workspace_allocator,
-          /*output_profile_result=*/nullptr);
+      if (var_seq_lengths) {
+        launch_status = DoForward<T>(
+            context, *rnn_desc_ptr, model_types(), model_shapes, input, input_h,
+            input_c, params, is_training_, output, output_h, output_c,
+            sequence_lengths, &reserve_space_allocator, &workspace_allocator,
+            /*output_profile_result=*/nullptr);
+      } else {
+        launch_status = DoForward<T>(
+            context, *rnn_desc_ptr, model_types(), model_shapes, input, input_h,
+            input_c, params, is_training_, output, output_h, output_c, nullptr,
+            &reserve_space_allocator, &workspace_allocator,
+            /*output_profile_result=*/nullptr);
+      }
     }
     OP_REQUIRES_OK(context, launch_status);
   }
@@ -1301,7 +1372,7 @@ class CudnnRNNForwardOpV2<GPUDevice, T>
   void Compute(OpKernelContext* context) override {
     AlgorithmConfig best_algo_config;
     CudnnRNNForwardOp<GPUDevice, T>::ComputeAndReturnAlgorithm(
-        context, &best_algo_config);
+        context, &best_algo_config, false);
     if (!context->status().ok()) {
       return;
     }
@@ -1354,7 +1425,7 @@ class CudnnRNNForwardOpV2<GPUDevice, T>
     const auto& modeltypes = model_types();
     CudnnRnnParameters rnn_params(
         model_shapes.num_layers, model_shapes.input_size,
-        model_shapes.num_units, model_shapes.seq_length,
+        model_shapes.num_units, model_shapes.max_seq_length,
         model_shapes.batch_size, model_shapes.dir_count,
         /*has_dropout=*/std::abs(dropout()) > 1e-8, is_training(),
         modeltypes.rnn_mode, modeltypes.rnn_input_mode, input->dtype());
@@ -1421,7 +1492,7 @@ class CudnnRNNForwardOpV2<GPUDevice, T>
       CudnnRnnAllocatorInTemp<uint8> workspace_allocator(context);
       status = DoForward<T>(
           context, *rnn_desc, model_types(), model_shapes, input, input_h,
-          input_c, params, is_training(), output, output_h, output_c,
+          input_c, params, is_training(), output, output_h, output_c, nullptr,
           &reserve_space_allocator, &workspace_allocator, &fwd_profile_result);
       if (!status.ok()) {
         continue;
@@ -1435,7 +1506,8 @@ class CudnnRNNForwardOpV2<GPUDevice, T>
             input_c, params, output, output_h, output_c, &output_backprop,
             &output_h_backprop, &output_c_backprop, &reserve_space,
             &input_backprop, &input_h_backprop, &input_c_backprop,
-            &params_backprop, &workspace_allocator, &bak_profile_result);
+            &params_backprop, nullptr, &workspace_allocator,
+            &bak_profile_result);
         if (!status.ok()) {
           continue;
         }
@@ -1480,6 +1552,50 @@ TF_CALL_float(REGISTER_GPU);
 TF_CALL_double(REGISTER_GPU);
 #undef REGISTER_GPU
 
+template <typename T>
+class CudnnRNNForwardOpV3<GPUDevice, T>
+    : public CudnnRNNForwardOp<GPUDevice, T> {
+ private:
+  using CudnnRNNForwardOp<GPUDevice, T>::is_training;
+  using CudnnRNNKernelCommon::CreateRnnDescriptor;
+  using CudnnRNNKernelCommon::dropout;
+  using CudnnRNNKernelCommon::HasInputC;
+  using CudnnRNNKernelCommon::model_types;
+
+ public:
+  explicit CudnnRNNForwardOpV3(OpKernelConstruction* context)
+      : CudnnRNNForwardOp<GPUDevice, T>(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    AlgorithmConfig best_algo_config;
+    CudnnRNNForwardOp<GPUDevice, T>::ComputeAndReturnAlgorithm(
+        context, &best_algo_config, true);
+    if (!context->status().ok()) {
+      return;
+    }
+
+    Tensor* output_host_reserved = nullptr;
+    // TODO: Current V3 only uses the default standard algorithm to process
+    // batches with variable sequences and the inputs should be padded.
+    // Autotune is not supported yet.
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(4, {}, &output_host_reserved));
+  }
+};
+
+#define REGISTER_GPU(T)                                       \
+  REGISTER_KERNEL_BUILDER(Name("CudnnRNNV3")                  \
+                              .Device(DEVICE_GPU)             \
+                              .HostMemory("sequence_lengths") \
+                              .HostMemory("host_reserved")    \
+                              .TypeConstraint<T>("T"),        \
+                          CudnnRNNForwardOpV3<GPUDevice, T>);
+
+TF_CALL_half(REGISTER_GPU);
+TF_CALL_float(REGISTER_GPU);
+TF_CALL_double(REGISTER_GPU);
+#undef REGISTER_GPU
+
 // Run the backward operation of the RNN model.
 template <typename T>
 class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
@@ -1488,14 +1604,27 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
       : CudnnRNNKernelCommon(context) {}
 
   void Compute(OpKernelContext* context) override {
+    ComputeImpl(context, false);
+  }
+
+ protected:
+  virtual void ComputeImpl(OpKernelContext* context, bool var_seq_lengths) {
     const Tensor* input = nullptr;
     const Tensor* input_h = nullptr;
     const Tensor* input_c = nullptr;
     const Tensor* params = nullptr;
+    const Tensor* sequence_lengths = nullptr;
     CudnnRnnModelShapes model_shapes;
-    OP_REQUIRES_OK(context,
-                   ExtractForwardInput(context, model_types(), &input, &input_h,
+    if (var_seq_lengths) {
+      OP_REQUIRES_OK(
+          context, ExtractForwardInput(context, model_types(), &input, &input_h,
+                                       &input_c, &params, &model_shapes,
+                                       &sequence_lengths));
+    } else {
+      OP_REQUIRES_OK(
+          context, ExtractForwardInput(context, model_types(), &input, &input_h,
                                        &input_c, &params, &model_shapes));
+    }
     RnnInputMode input_mode;
     OP_REQUIRES_OK(context,
                    ToRNNInputMode(rnn_input_mode(), model_shapes.num_units,
@@ -1536,12 +1665,22 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
           context, GetCachedRnnDescriptor<T>(context, model_shapes, input_mode,
                                              algo_config, &rnn_state_cache_,
                                              &rnn_desc_ptr));
-      launch_status = DoBackward<T>(
-          context, *rnn_desc_ptr, model_types(), model_shapes, input, input_h,
-          input_c, params, output, output_h, output_c, output_backprop,
-          output_h_backprop, output_c_backprop, reserve_space, input_backprop,
-          input_h_backprop, input_c_backprop, params_backprop,
-          &workspace_allocator, /*output_profile_result=*/nullptr);
+      if (var_seq_lengths) {
+        launch_status = DoBackward<T>(
+            context, *rnn_desc_ptr, model_types(), model_shapes, input, input_h,
+            input_c, params, output, output_h, output_c, output_backprop,
+            output_h_backprop, output_c_backprop, reserve_space, input_backprop,
+            input_h_backprop, input_c_backprop, params_backprop,
+            sequence_lengths, &workspace_allocator,
+            /*output_profile_result=*/nullptr);
+      } else {
+        launch_status = DoBackward<T>(
+            context, *rnn_desc_ptr, model_types(), model_shapes, input, input_h,
+            input_c, params, output, output_h, output_c, output_backprop,
+            output_h_backprop, output_c_backprop, reserve_space, input_backprop,
+            input_h_backprop, input_c_backprop, params_backprop, nullptr,
+            &workspace_allocator, /*output_profile_result=*/nullptr);
+      }
     }
     OP_REQUIRES_OK(context, launch_status);
   }
@@ -1685,6 +1824,31 @@ TF_CALL_float(REGISTER_GPU);
 TF_CALL_double(REGISTER_GPU);
 #undef REGISTER_GPU
 
+template <typename T>
+class CudnnRNNBackwardOpV3<GPUDevice, T>
+    : public CudnnRNNBackwardOp<GPUDevice, T> {
+ public:
+  explicit CudnnRNNBackwardOpV3(OpKernelConstruction* context)
+      : CudnnRNNBackwardOp<GPUDevice, T>(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    CudnnRNNBackwardOp<GPUDevice, T>::ComputeImpl(context, true);
+  }
+};
+
+#define REGISTER_GPU(T)                                       \
+  REGISTER_KERNEL_BUILDER(Name("CudnnRNNBackpropV3")          \
+                              .Device(DEVICE_GPU)             \
+                              .HostMemory("sequence_lengths") \
+                              .HostMemory("host_reserved")    \
+                              .TypeConstraint<T>("T"),        \
+                          CudnnRNNBackwardOpV3<GPUDevice, T>);
+
+TF_CALL_half(REGISTER_GPU);
+TF_CALL_float(REGISTER_GPU);
+TF_CALL_double(REGISTER_GPU);
+#undef REGISTER_GPU
+
 // TODO(zhengxq): Add the conversion of Cudnn RNN Params from and to
 // its canonical form.
 
diff --git a/tensorflow/core/kernels/cwise_op_arg.cc b/tensorflow/core/kernels/cwise_op_arg.cc
index 62ffa0718ff5287167c702841ff00511da4866b5..ea659facdc4eb5605ad6327e3c073c47eefedeec 100644
--- a/tensorflow/core/kernels/cwise_op_arg.cc
+++ b/tensorflow/core/kernels/cwise_op_arg.cc
@@ -26,9 +26,7 @@ namespace tensorflow {
 REGISTER_COMPLEX(CPU, float, complex64);
 REGISTER_COMPLEX(CPU, double, complex128);
 
-// TODO: Enable GPU support for angle op after resolving
-// build failures on GPU (See #10643 for context).
-#if 0 && GOOGLE_CUDA
+#if GOOGLE_CUDA
 REGISTER_COMPLEX(GPU, float, complex64);
 REGISTER_COMPLEX(GPU, double, complex128);
 #endif
diff --git a/tensorflow/core/kernels/cwise_op_gpu_arg.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_arg.cu.cc
index 9b3f8200bd77d3179700c1abcc0b9a74484f3f52..34028e936e483035c1d410502252261b3e424ec9 100644
--- a/tensorflow/core/kernels/cwise_op_gpu_arg.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_arg.cu.cc
@@ -13,9 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// TODO: Enable GPU support for angle op after resolving
-// build failures on GPU (See #10643 for context).
-#if 0 && GOOGLE_CUDA
+#if GOOGLE_CUDA
 
 #include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
 
diff --git a/tensorflow/core/kernels/cwise_op_gpu_igammas_double.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_igammas_double.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2bcc7aa8855c47aa164caeb0c6bd82dd10306432
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_gpu_igammas_double.cu.cc
@@ -0,0 +1,29 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+#include "tensorflow/core/kernels/cwise_ops_gpu_gradients.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_BINARY1(igamma, double);
+DEFINE_BINARY1(igamma_grad_a, double);
+DEFINE_BINARY1(igammac, double);
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_gpu_igammas.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_igammas_float.cu.cc
similarity index 88%
rename from tensorflow/core/kernels/cwise_op_gpu_igammas.cu.cc
rename to tensorflow/core/kernels/cwise_op_gpu_igammas_float.cu.cc
index 508a47deda81d6182e2c16e83d54bbfa5c97f3fb..e6412216e93379beba3bceeaa2b165f48bd64d0f 100644
--- a/tensorflow/core/kernels/cwise_op_gpu_igammas.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_igammas_float.cu.cc
@@ -20,9 +20,9 @@ limitations under the License.
 
 namespace tensorflow {
 namespace functor {
-DEFINE_BINARY2(igamma, float, double);
-DEFINE_BINARY2(igamma_grad_a, float, double);
-DEFINE_BINARY2(igammac, float, double);
+DEFINE_BINARY1(igamma, float);
+DEFINE_BINARY1(igamma_grad_a, float);
+DEFINE_BINARY1(igammac, float);
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/cwise_op_gpu_mul.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_mul.cu.cc
index 539f07b0d68321a0f9a33b76aca78bd9e38ce6e9..f4059b2b137ae16dfeed199aae26895f74d39133 100644
--- a/tensorflow/core/kernels/cwise_op_gpu_mul.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_mul.cu.cc
@@ -21,6 +21,7 @@ namespace tensorflow {
 namespace functor {
 DEFINE_BINARY11(mul, Eigen::half, float, double, uint8, int8, uint16, int16,
                 int32, int64, complex64, complex128);
+DEFINE_BINARY2(mul_no_nan, float, double);
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/cwise_op_mul_1.cc b/tensorflow/core/kernels/cwise_op_mul_1.cc
index cff0407b83a4bafd27573325615322f92e594d46..93d6e38f54ee1867c0ea8c8e673c22925b4d607d 100644
--- a/tensorflow/core/kernels/cwise_op_mul_1.cc
+++ b/tensorflow/core/kernels/cwise_op_mul_1.cc
@@ -19,6 +19,8 @@ namespace tensorflow {
 
 REGISTER6(BinaryOp, CPU, "Mul", functor::mul, float, Eigen::half, double, uint8,
           int32, bfloat16);
+REGISTER2(BinaryOp, CPU, "MulNoNan", functor::mul_no_nan, float, double);
+
 #if defined(__ANDROID_TYPES_SLIM__)
 // We only register the first type when we have multi-argument calls in the
 // case where we're trying to reduce executable size, but it turns out that the
@@ -39,6 +41,7 @@ REGISTER_KERNEL_BUILDER(Name("Mul")
                             .HostMemory("z")
                             .TypeConstraint<int32>("T"),
                         BinaryOp<CPUDevice, functor::mul<int32>>);
+REGISTER2(BinaryOp, GPU, "MulNoNan", functor::mul_no_nan, float, double);
 #endif
 
 #ifdef TENSORFLOW_USE_SYCL
diff --git a/tensorflow/core/kernels/cwise_op_neg.cc b/tensorflow/core/kernels/cwise_op_neg.cc
index a136769b912718a5749273050a2226da3fa9e3cf..bb7d22e4dd4b101ff6d695834b881bda872cda9f 100644
--- a/tensorflow/core/kernels/cwise_op_neg.cc
+++ b/tensorflow/core/kernels/cwise_op_neg.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER7(UnaryOp, CPU, "Neg", functor::neg, float, Eigen::half, double, int32,
-          complex64, int64, complex128);
+REGISTER8(UnaryOp, CPU, "Neg", functor::neg, float, Eigen::half, double, int32,
+          complex64, int64, complex128, bfloat16);
 
 #ifdef TENSORFLOW_USE_SYCL
 REGISTER3(UnaryOp, SYCL, "Neg", functor::neg, float, double, int64);
diff --git a/tensorflow/core/kernels/cwise_op_select.cc b/tensorflow/core/kernels/cwise_op_select.cc
index dd4e4ea547e7738b76796c0e8d174602645b83df..3b51563ca288413b389f938c9ff9810a71c09fd5 100644
--- a/tensorflow/core/kernels/cwise_op_select.cc
+++ b/tensorflow/core/kernels/cwise_op_select.cc
@@ -19,8 +19,8 @@ limitations under the License.
 #define EIGEN_USE_GPU
 #endif  // GOOGLE_CUDA
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 #include "tensorflow/core/platform/prefetch.h"
 
diff --git a/tensorflow/core/kernels/cwise_ops.h b/tensorflow/core/kernels/cwise_ops.h
index abfb4a039cf85a14d8cfcd5acf96d35175cf8c95..a6a6a7c74f9d1fc1361546148b9425b87fc48698 100644
--- a/tensorflow/core/kernels/cwise_ops.h
+++ b/tensorflow/core/kernels/cwise_ops.h
@@ -22,22 +22,41 @@ limitations under the License.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_types.h"
 #include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 
 namespace Eigen {
 namespace internal {
 
+#if GOOGLE_CUDA
+template <>
+struct scalar_arg_op<std::complex<float>> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_arg_op)
+  typedef typename Eigen::NumTraits<std::complex<float>>::Real result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const float operator()(
+      const std::complex<float>& a) const {
+    return ::atan2f(a.imag(), a.real());
+  }
+};
+
+template <>
+struct scalar_arg_op<std::complex<double>> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_arg_op)
+  typedef typename Eigen::NumTraits<std::complex<double>>::Real result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const double operator()(
+      const std::complex<double>& a) const {
+    return ::atan2(a.imag(), a.real());
+  }
+};
+#endif
+
+#if EIGEN_HAS_CXX11_MATH == 0
 template <typename T>
 struct scalar_asinh_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_asinh_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& a) const {
-#if EIGEN_HAS_CXX11_MATH
-    return numext::asinh(a);
-#else
     return std::asinh(a);
-#endif  // EIGEN_HAS_CXX11_MATH
   }
 };
 template <typename T>
@@ -49,11 +68,7 @@ template <typename T>
 struct scalar_acosh_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_acosh_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& a) const {
-#if EIGEN_HAS_CXX11_MATH
-    return numext::acosh(a);
-#else
     return std::acosh(a);
-#endif  // EIGEN_HAS_CXX11_MATH
   }
 };
 template <typename T>
@@ -65,35 +80,14 @@ template <typename T>
 struct scalar_atanh_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_atanh_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& a) const {
-#if EIGEN_HAS_CXX11_MATH
-    return numext::atanh(a);
-#else
     return std::atanh(a);
-#endif  // EIGEN_HAS_CXX11_MATH
   }
 };
 template <typename T>
 struct functor_traits<scalar_atanh_op<T>> {
   enum { Cost = 5 * NumTraits<T>::MulCost, PacketAccess = false };
 };
-
-// TODO(rmlarsen): This is a workaround for upstream change
-// https://bitbucket.org/eigen/eigen/commits/f339468d04d0f87caeb6cab9aef568627e9f6ea9
-// that renamed scalar_binary_pow_op to scalar_pow_op and deleted the unary
-// version of the latter. Remove once we upgrade to Eigen 3.3.
-template <typename Scalar, typename Exponent>
-struct scalar_binary_pow_op_google {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_binary_pow_op_google)
-  EIGEN_DEVICE_FUNC inline Scalar operator()(const Scalar& a,
-                                             const Exponent& b) const {
-    return numext::pow(a, b);
-  }
-};
-
-template <typename Scalar, typename Exponent>
-struct functor_traits<scalar_binary_pow_op_google<Scalar, Exponent>> {
-  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
-};
+#endif
 
 template <typename Scalar, typename Exponent>
 struct safe_scalar_binary_pow_op {
@@ -153,24 +147,49 @@ struct functor_traits<safe_div_or_mod_op<T, DivOrMod>> {
   };
 };
 
-template <typename T>
-struct div_no_nan_op {
-  EIGEN_EMPTY_STRUCT_CTOR(div_no_nan_op)
+template <typename T, typename Binary>
+struct no_nan_op {
+  EIGEN_EMPTY_STRUCT_CTOR(no_nan_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& a,
                                                            const T& b) const {
     if (b != 0) {
-      return scalar_quotient_op<T>()(a, b);
+      return Binary()(a, b);
     } else {
-      return 0;
+      return T(0);
     }
   }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet
+  packetOp(const Packet& a, const Packet& b) const {
+    const Packet mask = pcmp_eq(b, pzero(b));
+    const Packet quotient = Binary().packetOp(a, b);
+    return pandnot(quotient, mask);
+  }
+};
+
+template <typename T>
+struct div_no_nan_op : public no_nan_op<T, scalar_quotient_op<T>> {
+  EIGEN_EMPTY_STRUCT_CTOR(div_no_nan_op)
 };
 
 template <typename T>
 struct functor_traits<div_no_nan_op<T>> {
   enum {
     Cost = functor_traits<scalar_quotient_op<T>>::Cost + NumTraits<T>::AddCost,
-    PacketAccess = false,
+    PacketAccess = true,
+  };
+};
+
+template <typename T>
+struct mul_no_nan_op : public no_nan_op<T, scalar_product_op<T>> {
+  EIGEN_EMPTY_STRUCT_CTOR(mul_no_nan_op)
+};
+
+template <typename T>
+struct functor_traits<mul_no_nan_op<T>> {
+  enum {
+    Cost = functor_traits<scalar_product_op<T>>::Cost + NumTraits<T>::AddCost,
+    PacketAccess = true,
   };
 };
 
@@ -797,6 +816,9 @@ struct mul : base<T, Eigen::internal::scalar_product_op<T>> {
   static const bool use_bcast_optimization = true;
 };
 
+template <typename T>
+struct mul_no_nan : base<T, Eigen::internal::mul_no_nan_op<T>> {};
+
 template <typename T>
 struct div : base<T, Eigen::internal::scalar_quotient_op<T>> {};
 
@@ -843,7 +865,7 @@ template <typename T>
 struct floor_div_real : base<T, Eigen::internal::google_floor_div_real<T>> {};
 
 template <typename T>
-struct pow : base<T, Eigen::internal::scalar_binary_pow_op_google<T, T>> {};
+struct pow : base<T, Eigen::internal::scalar_pow_op<T, T>> {};
 
 template <typename T>
 struct safe_pow : base<T, Eigen::internal::safe_scalar_binary_pow_op<T, T>> {
diff --git a/tensorflow/core/kernels/cwise_ops_common.h b/tensorflow/core/kernels/cwise_ops_common.h
index f77d7238aff2a47d418389b3e9f23155ba782cb1..07fe44778aca03bf267619d9db469290f669a9c2 100644
--- a/tensorflow/core/kernels/cwise_ops_common.h
+++ b/tensorflow/core/kernels/cwise_ops_common.h
@@ -264,7 +264,8 @@ class UnaryVariantOp : public OpKernel {
     const Variant& v = inp.scalar<Variant>()();
     Variant v_out;
     OP_REQUIRES_OK(ctx, UnaryOpVariant<Device>(ctx, OpEnum, v, &v_out));
-    Tensor out(cpu_allocator(), DT_VARIANT, TensorShape());
+    int numa_node = DeviceNumaNode(ctx->device());
+    Tensor out(cpu_allocator(numa_node), DT_VARIANT, TensorShape());
     out.scalar<Variant>()() = std::move(v_out);
     ctx->set_output(0, std::move(out));
   }
diff --git a/tensorflow/core/kernels/cwise_ops_test.cc b/tensorflow/core/kernels/cwise_ops_test.cc
index 696d5840e8ce39c1bf210b54b9f28ae83cf232c7..acf7cc289933c2d42644faf63f58ec6af53957c9 100644
--- a/tensorflow/core/kernels/cwise_ops_test.cc
+++ b/tensorflow/core/kernels/cwise_ops_test.cc
@@ -45,6 +45,7 @@ int ColsFromArg(int arg) { return (arg % kRows); }
 #define BM_UNARY(DEVICE, FUNC, T, TYPE)                              \
   void BM_##DEVICE##_##FUNC##_##TYPE(int iters, int num) {           \
     const int64 tot = static_cast<int64>(iters) * num;               \
+    testing::UseRealTime();                                          \
     testing::ItemsProcessed(tot);                                    \
     testing::BytesProcessed(tot * sizeof(T));                        \
     test::Benchmark(#DEVICE, Unary<T>(#FUNC, num, TYPE)).Run(iters); \
@@ -100,6 +101,7 @@ Graph* BinaryScalar(int num, const string& func) {
 #define BM_BINARY_SCALAR(DEVICE, FUNC)                             \
   void BM_##DEVICE##_##FUNC##_scalar(int iters, int num) {         \
     const int64 tot = static_cast<int64>(iters) * num;             \
+    testing::UseRealTime();                                        \
     testing::ItemsProcessed(tot);                                  \
     testing::BytesProcessed(tot * sizeof(float));                  \
     test::Benchmark(#DEVICE, BinaryScalar(num, #FUNC)).Run(iters); \
@@ -125,6 +127,15 @@ BM_BINARY_SCALAR(gpu, Add);
 #ifdef TENSORFLOW_USE_SYCL
 BM_BINARY_SCALAR(sycl, Add);
 #endif  // TENSORFLOW_USE_SYCL
+
+BM_BINARY_SCALAR(cpu, DivNoNan);
+#if GOOGLE_CUDA
+BM_BINARY_SCALAR(gpu, DivNoNan);
+#endif  // GOOGLE_CUDA
+#ifdef TENSORFLOW_USE_SYCL
+BM_BINARY_SCALAR(sycl, DivNoNan);
+#endif  // TENSORFLOW_USE_SYCL
+
 #undef BM_BINARY_SCALAR
 
 template <class T>
@@ -146,6 +157,7 @@ Graph* BiasAdd(int rows, int cols, DataType type) {
     const int rows = RowsFromArg(arg);                                         \
     const int cols = ColsFromArg(arg);                                         \
     const int64 tot = static_cast<int64>(iters) * rows * cols;                 \
+    testing::UseRealTime();                                                    \
     testing::ItemsProcessed(tot);                                              \
     testing::BytesProcessed(tot * sizeof(C_TYPE));                             \
     test::Benchmark(#DEVICE, BiasAdd<C_TYPE>(rows, cols, TF_TYPE)).Run(iters); \
@@ -197,6 +209,7 @@ Graph* BiasAddGrad(int rows, int cols, int channels, DataType type,
     const int rows = RowsFromArg(arg);                                         \
     const int cols = ColsFromArg(arg);                                         \
     const int64 tot = static_cast<int64>(iters) * rows * cols * channels;      \
+    testing::UseRealTime();                                                    \
     testing::ItemsProcessed(tot);                                              \
     testing::BytesProcessed(tot * sizeof(C_TYPE));                             \
     test::Benchmark(#DEVICE, BiasAddGrad<C_TYPE>(rows, cols, channels,         \
@@ -259,6 +272,7 @@ Graph* BcastAdd(int rows, int cols, int dim) {
     const int rows = RowsFromArg(arg);                             \
     const int cols = ColsFromArg(arg);                             \
     const int64 tot = static_cast<int64>(iters) * rows * cols;     \
+    testing::UseRealTime();                                        \
     testing::ItemsProcessed(tot);                                  \
     testing::BytesProcessed(tot * sizeof(float));                  \
     test::Benchmark(#DEVICE, BcastAdd(rows, cols, 0)).Run(iters);  \
@@ -285,6 +299,7 @@ BM_BCAST_ADD_ROW_ALL(sycl);
     const int rows = RowsFromArg(arg);                             \
     const int cols = ColsFromArg(arg);                             \
     const int64 tot = static_cast<int64>(iters) * rows * cols;     \
+    testing::UseRealTime();                                        \
     testing::ItemsProcessed(tot);                                  \
     testing::BytesProcessed(tot * sizeof(float));                  \
     test::Benchmark(#DEVICE, BcastAdd(rows, cols, 1)).Run(iters);  \
@@ -311,6 +326,7 @@ BM_BCAST_ADD_COL_ALL(sycl);
     const int rows = RowsFromArg(arg);                                 \
     const int cols = ColsFromArg(arg);                                 \
     const int64 tot = static_cast<int64>(iters) * rows * cols;         \
+    testing::UseRealTime();                                            \
     testing::ItemsProcessed(tot);                                      \
     testing::BytesProcessed(tot * sizeof(float));                      \
     test::Benchmark(#DEVICE, BcastAdd(rows, cols, 2)).Run(iters);      \
@@ -338,6 +354,7 @@ BM_BCAST_ADD_CROSS_RC_ALL(sycl);
     const int rows = RowsFromArg(arg);                                 \
     const int cols = ColsFromArg(arg);                                 \
     const int64 tot = static_cast<int64>(iters) * rows * cols;         \
+    testing::UseRealTime();                                            \
     testing::ItemsProcessed(tot);                                      \
     testing::BytesProcessed(tot * sizeof(float));                      \
     test::Benchmark(#DEVICE, BcastAdd(rows, cols, 3)).Run(iters);      \
diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
index e2ab77632da4830f63d63c95c6ace5465fb46b9e..d22bf0dc57cd54b04d7178eb8bf557a8c25c549e 100644
--- a/tensorflow/core/kernels/data/BUILD
+++ b/tensorflow/core/kernels/data/BUILD
@@ -9,8 +9,8 @@ licenses(["notice"])  # Apache 2.0
 
 load(
     "//tensorflow:tensorflow.bzl",
-    "tf_kernel_library",
     "tf_cc_test",
+    "tf_kernel_library",
 )
 
 # TODO(mrry): Remove this empty forwarding library.
@@ -21,6 +21,27 @@ cc_library(
     deps = ["//tensorflow/core:framework"],
 )
 
+cc_library(
+    name = "dataset_test_base",
+    testonly = 1,
+    srcs = ["dataset_test_base.cc"],
+    hdrs = ["dataset_test_base.h"],
+    deps = [
+        ":dataset_utils",
+        ":iterator_ops",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:tensor_testutil",
+        "//tensorflow/core:test",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/kernels:ops_testutil",
+    ],
+)
+
 cc_library(
     name = "dataset_utils",
     srcs = ["dataset_utils.cc"],
@@ -39,6 +60,7 @@ tf_cc_test(
     srcs = ["dataset_utils_test.cc"],
     deps = [
         ":dataset_utils",
+        "//tensorflow/core:framework",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
     ],
@@ -54,7 +76,6 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/kernels:variable_ops",
     ],
 )
@@ -76,13 +97,18 @@ tf_cc_test(
     srcs = ["single_threaded_executor_test.cc"],
     deps = [
         ":single_threaded_executor",
+        "//tensorflow/core:bitwise_ops_op_lib",
+        "//tensorflow/core:control_flow_ops_op_lib",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:math_ops_op_lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:random_ops_op_lib",
+        "//tensorflow/core:spectral_ops_op_lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
@@ -117,6 +143,17 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "shard_dataset_op",
+    srcs = ["shard_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
 tf_kernel_library(
     name = "window_dataset_op",
     srcs = ["window_dataset_op.cc"],
@@ -179,6 +216,28 @@ tf_kernel_library(
     ],
 )
 
+tf_cc_test(
+    name = "map_dataset_op_test",
+    size = "small",
+    srcs = ["map_dataset_op_test.cc"],
+    deps = [
+        ":dataset_test_base",
+        ":dataset_utils",
+        ":iterator_ops",
+        ":map_dataset_op",
+        ":range_dataset_op",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/kernels:cwise_op",
+        "//tensorflow/core/kernels:function_ops",
+    ],
+)
+
 cc_library(
     name = "parallel_map_iterator",
     srcs = ["parallel_map_iterator.cc"],
@@ -189,7 +248,6 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:protos_all_cc",
     ],
 )
 
@@ -343,6 +401,23 @@ tf_kernel_library(
     ],
 )
 
+tf_cc_test(
+    name = "range_dataset_op_test",
+    size = "small",
+    srcs = ["range_dataset_op_test.cc"],
+    deps = [
+        ":dataset_test_base",
+        ":dataset_utils",
+        ":iterator_ops",
+        ":range_dataset_op",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:ptr_util",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_kernel_library(
     name = "shuffle_dataset_op",
     srcs = ["shuffle_dataset_op.cc"],
@@ -365,6 +440,25 @@ tf_kernel_library(
     ],
 )
 
+tf_cc_test(
+    name = "sparse_tensor_slice_dataset_op_test",
+    size = "small",
+    srcs = ["sparse_tensor_slice_dataset_op_test.cc"],
+    deps = [
+        ":dataset_test_base",
+        ":dataset_utils",
+        ":iterator_ops",
+        ":sparse_tensor_slice_dataset_op",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_kernel_library(
     name = "tensor_dataset_op",
     srcs = ["tensor_dataset_op.cc"],
@@ -379,12 +473,32 @@ tf_kernel_library(
     name = "tensor_slice_dataset_op",
     srcs = ["tensor_slice_dataset_op.cc"],
     deps = [
+        ":dataset_utils",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:graph",
     ],
 )
 
+tf_cc_test(
+    name = "tensor_slice_dataset_op_test",
+    size = "small",
+    srcs = ["tensor_slice_dataset_op_test.cc"],
+    deps = [
+        ":dataset_test_base",
+        ":dataset_utils",
+        ":iterator_ops",
+        ":tensor_slice_dataset_op",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_kernel_library(
     name = "zip_dataset_op",
     srcs = ["zip_dataset_op.cc"],
@@ -411,6 +525,7 @@ tf_kernel_library(
     name = "reader_dataset_ops",
     srcs = ["reader_dataset_ops.cc"],
     deps = [
+        "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -481,16 +596,15 @@ tf_kernel_library(
     ],
 )
 
-tf_kernel_library(
-    name = "optimize_dataset_op",
-    srcs = ["optimize_dataset_op.cc"],
+cc_library(
+    name = "graph_rewrite_dataset",
+    srcs = ["graph_rewrite_dataset.cc"],
+    hdrs = ["graph_rewrite_dataset.h"],
     deps = [
+        ":dataset_utils",
         "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
-        "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core/grappler:graph_view",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:grappler_item_builder",
         "//tensorflow/core/grappler/clusters:virtual_cluster",
@@ -501,6 +615,19 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "optimize_dataset_op",
+    srcs = ["optimize_dataset_op.cc"],
+    deps = [
+        ":graph_rewrite_dataset",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
 tf_kernel_library(
     name = "model_dataset_op",
     srcs = ["model_dataset_op.cc"],
@@ -508,8 +635,10 @@ tf_kernel_library(
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -552,6 +681,7 @@ tf_kernel_library(
         ":range_dataset_op",
         ":reader_dataset_ops",
         ":repeat_dataset_op",
+        ":shard_dataset_op",
         ":shuffle_dataset_op",
         ":skip_dataset_op",
         ":sparse_tensor_slice_dataset_op",
@@ -560,6 +690,8 @@ tf_kernel_library(
         ":tensor_slice_dataset_op",
         ":window_dataset_op",
         ":zip_dataset_op",
+        "//tensorflow/core:array_ops_op_lib",
+        "//tensorflow/core:nn_ops_op_lib",
         "//tensorflow/core/kernels/data/experimental:dataset_kernels",
     ],
 )
diff --git a/tensorflow/core/kernels/data/batch_dataset_op.cc b/tensorflow/core/kernels/data/batch_dataset_op.cc
index 1f8d2bdbae897e471113375150935b69e47f6d84..f9ce0d9642dce1972bb94a2668b344e5f050d345 100644
--- a/tensorflow/core/kernels/data/batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/batch_dataset_op.cc
@@ -79,8 +79,8 @@ class BatchDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(new Iterator(
-          Iterator::Params{this, strings::StrCat(prefix, "::Batch")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::Batch")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/cache_dataset_ops.cc b/tensorflow/core/kernels/data/cache_dataset_ops.cc
index f00b38e732a7835896a275d14507e75eade05fa1..343157de6fea3df5fb7ada416f81f95534f76e1c 100644
--- a/tensorflow/core/kernels/data/cache_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/cache_dataset_ops.cc
@@ -69,8 +69,8 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new FileIterator({this, strings::StrCat(prefix, "::FileCache")}));
+      return absl::make_unique<FileIterator>(
+          FileIterator::Params{this, strings::StrCat(prefix, "::FileCache")});
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -325,7 +325,7 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
           }
           filename_ = strings::StrCat(dataset()->filename_, "_", shard_id_);
           lockfile_ = strings::StrCat(filename_, ".lockfile");
-          writer_.reset(new BundleWriter(dataset()->env_, filename_));
+          writer_ = absl::make_unique<BundleWriter>(dataset()->env_, filename_);
           return Status::OK();
         }
 
@@ -385,7 +385,8 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
             // conditions are not met since BundleWriter's constructor creates
             // new temp files which can delete the temp files created by a
             // BundleWriter in another Session.
-            writer_.reset(new BundleWriter(dataset()->env_, filename_));
+            writer_ =
+                absl::make_unique<BundleWriter>(dataset()->env_, filename_);
             lockfile_created_ = true;
             return Status::OK();
           }
@@ -537,12 +538,14 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
         // `FileReaderIterator` and seek to the `cur_index`.
         switch (mode_) {
           case Mode::read:
-            iterator_.reset(new FileReaderIterator(
-                {dataset(), strings::StrCat(prefix(), "Impl")}));
+            iterator_ = absl::make_unique<FileReaderIterator>(
+                FileReaderIterator::Params{dataset(),
+                                           strings::StrCat(prefix(), "Impl")});
             break;
           case Mode::write:
-            iterator_.reset(new FileWriterIterator(
-                {dataset(), strings::StrCat(prefix(), "Impl")}));
+            iterator_ = absl::make_unique<FileWriterIterator>(
+                FileWriterIterator::Params{dataset(),
+                                           strings::StrCat(prefix(), "Impl")});
         }
       }
 
@@ -573,8 +576,8 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new MemoryIterator({this, strings::StrCat(prefix, "::MemoryCache")}));
+      return absl::make_unique<MemoryIterator>(MemoryIterator::Params{
+          this, strings::StrCat(prefix, "::MemoryCache")});
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -614,7 +617,9 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
      public:
       MemoryCache() = default;
 
-      string DebugString() override { return "CacheDataset::MemoryCache"; }
+      string DebugString() const override {
+        return "CacheDataset::MemoryCache";
+      }
 
       // Marks the cache as completed.
       void Complete() {
@@ -931,12 +936,16 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
       void InitializeIterator() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
         switch (mode_) {
           case Mode::read:
-            iterator_.reset(new MemoryReaderIterator(
-                {dataset(), strings::StrCat(prefix(), "Impl")}, cache_));
+            iterator_ = absl::make_unique<MemoryReaderIterator>(
+                MemoryReaderIterator::Params{dataset(),
+                                             strings::StrCat(prefix(), "Impl")},
+                cache_);
             break;
           case Mode::write:
-            iterator_.reset(new MemoryWriterIterator(
-                {dataset(), strings::StrCat(prefix(), "Impl")}, cache_));
+            iterator_ = absl::make_unique<MemoryWriterIterator>(
+                MemoryWriterIterator::Params{dataset(),
+                                             strings::StrCat(prefix(), "Impl")},
+                cache_);
         }
       }
 
diff --git a/tensorflow/core/kernels/data/captured_function.cc b/tensorflow/core/kernels/data/captured_function.cc
index 973b6b06048fb715d9fd32791223cda21751b1c8..99b745b4c45c115b065fced39f7f206c240cf5ed 100644
--- a/tensorflow/core/kernels/data/captured_function.cc
+++ b/tensorflow/core/kernels/data/captured_function.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/notification.h"
-#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -114,8 +113,8 @@ Status CapturedFunction::Create(
   OpInputList inputs;
   TF_RETURN_IF_ERROR(ctx->input_list(argument, &inputs));
   std::vector<Tensor> arguments(inputs.begin(), inputs.end());
-  *out_function = WrapUnique(new CapturedFunction(func, std::move(arguments),
-                                                  use_inter_op_parallelism));
+  *out_function = absl::WrapUnique(new CapturedFunction(
+      func, std::move(arguments), use_inter_op_parallelism));
   return Status::OK();
 }
 
@@ -144,8 +143,10 @@ Status CapturedFunction::Instantiate(
     ret_types.push_back(ret_type);
   }
 
-  instantiated_captured_function->reset(new InstantiatedCapturedFunction(
-      lib, f_handle, std::move(ret_types), *ctx->runner(), this));
+  *instantiated_captured_function =
+      absl::WrapUnique<InstantiatedCapturedFunction>(
+          new InstantiatedCapturedFunction(lib, f_handle, std::move(ret_types),
+                                           *ctx->runner(), this));
   return Status::OK();
 }
 
@@ -422,11 +423,11 @@ void InstantiatedCapturedFunction::RunAsync(
   // (such as queue kernels) that depend on the non-nullness of
   // `OpKernelContext::cancellation_manager()`, but additional effort
   // will be required to plumb it through the `IteratorContext`.
-  CancellationManager* c_mgr = new CancellationManager;
+  CancellationManager* c_mgr = new CancellationManager();
   f_opts.cancellation_manager = c_mgr;
   std::shared_ptr<SimpleStepStatsCollector> stats_collector;
   if (ctx->model() || ctx->stats_aggregator()) {
-    stats_collector = MakeUnique<SimpleStepStatsCollector>();
+    stats_collector = absl::make_unique<SimpleStepStatsCollector>();
   }
   f_opts.stats_collector = stats_collector.get();
 
diff --git a/tensorflow/core/kernels/data/concatenate_dataset_op.cc b/tensorflow/core/kernels/data/concatenate_dataset_op.cc
index 066b2c9aef4faaf23981b207e46c301e99360119..1d7c3a65d5cd5dca5398999073c84af1f2d5c29f 100644
--- a/tensorflow/core/kernels/data/concatenate_dataset_op.cc
+++ b/tensorflow/core/kernels/data/concatenate_dataset_op.cc
@@ -63,8 +63,8 @@ class ConcatenateDatasetOp : public BinaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::Concatenate")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::Concatenate")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/dataset_test_base.cc b/tensorflow/core/kernels/data/dataset_test_base.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e163dbaa3d7f64d636d249be294b70f3704e8c31
--- /dev/null
+++ b/tensorflow/core/kernels/data/dataset_test_base.cc
@@ -0,0 +1,213 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/data/dataset_test_base.h"
+
+namespace tensorflow {
+namespace data {
+
+Status DatasetOpsTestBase::ExpectEqual(const Tensor& a, const Tensor& b) {
+  EXPECT_EQ(a.dtype(), b.dtype());
+  switch (a.dtype()) {
+#define CASE(type)                       \
+  case DataTypeToEnum<type>::value:      \
+    test::ExpectTensorEqual<type>(a, b); \
+    break;
+    TF_CALL_NUMBER_TYPES(CASE);
+    TF_CALL_string(CASE);
+    // TODO(feihugis): figure out how to support variant tensors.
+#undef CASE
+    default:
+      return errors::Internal("Unsupported dtype", a.dtype());
+  }
+  return Status::OK();
+}
+
+Status DatasetOpsTestBase::CreateOpKernel(
+    const NodeDef& node_def, std::unique_ptr<OpKernel>* op_kernel) {
+  Status status;
+  *op_kernel =
+      tensorflow::CreateOpKernel(device_type_, device_.get(), allocator_,
+                                 node_def, TF_GRAPH_DEF_VERSION, &status);
+  return status;
+}
+
+Status DatasetOpsTestBase::CreateDataset(OpKernel* kernel,
+                                         OpKernelContext* context,
+                                         DatasetBase** const dataset) {
+  TF_RETURN_IF_ERROR(RunOpKernel(kernel, context));
+  // Assume that DatasetOp has only one output.
+  DCHECK_EQ(context->num_outputs(), 1);
+  TF_RETURN_IF_ERROR(GetDatasetFromContext(context, 0, dataset));
+  return Status::OK();
+}
+
+Status DatasetOpsTestBase::CreateIteratorContext(
+    OpKernelContext* const op_context,
+    std::unique_ptr<IteratorContext>* iterator_context) {
+  IteratorContext::Params params(op_context);
+  function_handle_cache_ = absl::make_unique<FunctionHandleCache>(flr_);
+  params.function_handle_cache = function_handle_cache_.get();
+  *iterator_context = absl::make_unique<IteratorContext>(params);
+  return Status::OK();
+}
+
+Status DatasetOpsTestBase::GetDatasetFromContext(OpKernelContext* context,
+                                                 int output_index,
+                                                 DatasetBase** const dataset) {
+  Tensor* output = context->mutable_output(output_index);
+  Status status = GetDatasetFromVariantTensor(*output, dataset);
+  (*dataset)->Ref();
+  return status;
+}
+
+Status DatasetOpsTestBase::InitThreadPool(int thread_num) {
+  if (thread_num < 1) {
+    return errors::InvalidArgument(
+        "The `thread_num` argument should be positive but got: ", thread_num);
+  }
+  thread_pool_ = absl::make_unique<thread::ThreadPool>(
+      Env::Default(), ThreadOptions(), "inter_op", thread_num);
+  return Status::OK();
+}
+
+Status DatasetOpsTestBase::InitFunctionLibraryRuntime(
+    const std::vector<FunctionDef>& flib, int cpu_num) {
+  if (cpu_num < 1) {
+    return errors::InvalidArgument(
+        "The `cpu_num` argument should be positive but got: ", cpu_num);
+  }
+  SessionOptions options;
+  auto* device_count = options.config.mutable_device_count();
+  device_count->insert({"CPU", cpu_num});
+  std::vector<std::unique_ptr<Device>> devices;
+  TF_RETURN_IF_ERROR(DeviceFactory::AddDevices(
+      options, "/job:localhost/replica:0/task:0", &devices));
+  device_mgr_ = absl::make_unique<DeviceMgr>(std::move(devices));
+
+  FunctionDefLibrary proto;
+  for (const auto& fdef : flib) *(proto.add_function()) = fdef;
+  lib_def_ =
+      absl::make_unique<FunctionLibraryDefinition>(OpRegistry::Global(), proto);
+
+  OptimizerOptions opts;
+  pflr_ = absl::make_unique<ProcessFunctionLibraryRuntime>(
+      device_mgr_.get(), Env::Default(), TF_GRAPH_DEF_VERSION, lib_def_.get(),
+      opts, thread_pool_.get(), nullptr /* cluster_flr */);
+  flr_ = pflr_->GetFLR("/job:localhost/replica:0/task:0/cpu:0");
+  if (thread_pool_ == nullptr) {
+    runner_ = [](std::function<void()> fn) { fn(); };
+  } else {
+    runner_ = [this](std::function<void()> fn) {
+      thread_pool_->Schedule(std::move(fn));
+    };
+  }
+  return Status::OK();
+}
+
+Status DatasetOpsTestBase::RunOpKernel(OpKernel* op_kernel,
+                                       OpKernelContext* context) {
+  device_->Compute(op_kernel, context);
+  return context->status();
+}
+
+Status DatasetOpsTestBase::CreateOpKernelContext(
+    OpKernel* kernel, gtl::InlinedVector<TensorValue, 4>* inputs,
+    std::unique_ptr<OpKernelContext>* context) {
+  params_ = absl::make_unique<OpKernelContext::Params>();
+  params_->device = device_.get();
+  params_->resource_manager = device_->resource_manager();
+  params_->frame_iter = FrameAndIter(0, 0);
+  params_->inputs = inputs;
+  params_->op_kernel = kernel;
+  params_->function_library = flr_;
+  params_->runner = &runner_;
+  step_container_ =
+      absl::make_unique<ScopedStepContainer>(0, [](const string&) {});
+  params_->step_container = step_container_.get();
+  checkpoint::TensorSliceReaderCacheWrapper slice_reader_cache_wrapper;
+  slice_reader_cache_ =
+      absl::make_unique<checkpoint::TensorSliceReaderCacheWrapper>();
+  params_->slice_reader_cache = slice_reader_cache_.get();
+
+  // Set the allocator attributes for the outputs.
+  allocator_attrs_.clear();
+  for (int index = 0; index < params_->op_kernel->num_outputs(); index++) {
+    AllocatorAttributes attr;
+    const bool on_host =
+        (params_->op_kernel->output_memory_types()[index] == HOST_MEMORY);
+    attr.set_on_host(on_host);
+    allocator_attrs_.emplace_back(attr);
+  }
+  params_->output_attr_array = gtl::vector_as_array(&allocator_attrs_);
+
+  *context = absl::make_unique<OpKernelContext>(params_.get());
+  return Status::OK();
+}
+
+Status DatasetOpsTestBase::CreateSerializationContext(
+    std::unique_ptr<SerializationContext>* context) {
+  SerializationContext::Params params;
+  params.flib_def = lib_def_.get();
+  *context = absl::make_unique<SerializationContext>(params);
+  return Status::OK();
+}
+
+Status DatasetOpsTestBase::CheckOpKernelInput(
+    const OpKernel& kernel, const gtl::InlinedVector<TensorValue, 4>& inputs) {
+  if (kernel.input_types().size() != inputs.size()) {
+    return errors::Internal("The number of input elements should be ",
+                            kernel.input_types().size(),
+                            ", but got: ", inputs.size());
+  }
+  return Status::OK();
+}
+
+Status DatasetOpsTestBase::AddDatasetInput(
+    gtl::InlinedVector<TensorValue, 4>* inputs, DataTypeVector input_types,
+    DataType dtype, const TensorShape& shape) {
+  if (input_types.size() < inputs->size()) {
+    return errors::InvalidArgument("Adding more inputs than types: ",
+                                   inputs->size(), " vs. ", input_types.size());
+  }
+  bool is_ref = IsRefType(input_types[inputs->size()]);
+  std::unique_ptr<Tensor> input =
+      absl::make_unique<Tensor>(allocator_, dtype, shape);
+
+  if (is_ref) {
+    DataType expected_dtype = RemoveRefType(input_types[inputs->size()]);
+    if (expected_dtype != dtype) {
+      return errors::InvalidArgument("The input data type is ", dtype,
+                                     " , but expected: ", expected_dtype);
+    }
+    inputs->push_back({&lock_for_refs_, input.get()});
+  } else {
+    if (input_types[inputs->size()] != dtype) {
+      return errors::InvalidArgument(
+          "The input data type is ", dtype,
+          " , but expected: ", input_types[inputs->size()]);
+    }
+    inputs->push_back({nullptr, input.get()});
+  }
+
+  // TODO(jsimsa): Figure out how to avoid using a member variable to garbage
+  // collect the inputs.
+  tensors_.push_back(std::move(input));
+
+  return Status::OK();
+}
+
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/dataset_test_base.h b/tensorflow/core/kernels/data/dataset_test_base.h
new file mode 100644
index 0000000000000000000000000000000000000000..9c3a62c693028c064f19948ee782bb0bc7c2ccf9
--- /dev/null
+++ b/tensorflow/core/kernels/data/dataset_test_base.h
@@ -0,0 +1,193 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_DATASET_TEST_BASE_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_DATASET_TEST_BASE_H_
+
+#include <vector>
+
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/function_handle_cache.h"
+#include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/framework/variant_tensor_data.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
+#include "tensorflow/core/kernels/data/iterator_ops.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+namespace data {
+
+// Helpful functions to test Dataset op kernels.
+class DatasetOpsTestBase : public ::testing::Test {
+ public:
+  DatasetOpsTestBase()
+      : device_(DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0")),
+        device_type_(DEVICE_CPU) {
+    allocator_ = device_->GetAllocator(AllocatorAttributes());
+  }
+
+  ~DatasetOpsTestBase() {}
+
+  // The method validates whether the two tensors have the same shape, dtype,
+  // and value.
+  static Status ExpectEqual(const Tensor& a, const Tensor& b);
+
+  // Creates a tensor with the specified dtype, shape, and value.
+  template <typename T>
+  static Tensor CreateTensor(TensorShape input_shape,
+                             const gtl::ArraySlice<T>& input_data) {
+    Tensor tensor(DataTypeToEnum<T>::value, input_shape);
+    test::FillValues<T>(&tensor, input_data);
+    return tensor;
+  }
+
+  // Creates a new op kernel based on the node definition.
+  Status CreateOpKernel(const NodeDef& node_def,
+                        std::unique_ptr<OpKernel>* op_kernel);
+
+  // Creates a new dataset.
+  Status CreateDataset(OpKernel* kernel, OpKernelContext* context,
+                       DatasetBase** const dataset);
+
+  // Creates a new RangeDataset op kernel. `T` specifies the output dtype of the
+  // op kernel.
+  template <typename T>
+  Status CreateRangeDatasetOpKernel(
+      StringPiece node_name, std::unique_ptr<OpKernel>* range_op_kernel) {
+    DataTypeVector dtypes({tensorflow::DataTypeToEnum<T>::value});
+    std::vector<PartialTensorShape> shapes({{}});
+    NodeDef node_def = test::function::NDef(
+        node_name, "RangeDataset", {"start", "stop", "step"},
+        {{"output_types", dtypes}, {"output_shapes", shapes}});
+
+    TF_RETURN_IF_ERROR(CreateOpKernel(node_def, range_op_kernel));
+    return Status::OK();
+  }
+
+  // Creates a new RangeDataset dataset. `T` specifies the output dtype of the
+  // RangeDataset op kernel.
+  template <typename T>
+  Status CreateRangeDataset(int64 start, int64 end, int64 step,
+                            StringPiece node_name,
+                            DatasetBase** range_dataset) {
+    std::unique_ptr<OpKernel> range_kernel;
+    TF_RETURN_IF_ERROR(CreateRangeDatasetOpKernel<T>(node_name, &range_kernel));
+    gtl::InlinedVector<TensorValue, 4> range_inputs;
+    TF_RETURN_IF_ERROR(AddDatasetInputFromArray<int64>(
+        &range_inputs, range_kernel->input_types(), TensorShape({}), {start}));
+    TF_RETURN_IF_ERROR(AddDatasetInputFromArray<int64>(
+        &range_inputs, range_kernel->input_types(), TensorShape({}), {end}));
+    TF_RETURN_IF_ERROR(AddDatasetInputFromArray<int64>(
+        &range_inputs, range_kernel->input_types(), TensorShape({}), {step}));
+    std::unique_ptr<OpKernelContext> range_context;
+    TF_RETURN_IF_ERROR(CreateOpKernelContext(range_kernel.get(), &range_inputs,
+                                             &range_context));
+    TF_RETURN_IF_ERROR(CheckOpKernelInput(*range_kernel, range_inputs));
+    TF_RETURN_IF_ERROR(RunOpKernel(range_kernel.get(), range_context.get()));
+    TF_RETURN_IF_ERROR(
+        GetDatasetFromContext(range_context.get(), 0, range_dataset));
+    return Status::OK();
+  }
+
+  // Fetches the dataset from the operation context.
+  Status GetDatasetFromContext(OpKernelContext* context, int output_index,
+                               DatasetBase** const dataset);
+
+ protected:
+  // Creates a thread pool for parallel tasks.
+  Status InitThreadPool(int thread_num);
+
+  // Initializes the runtime for computing the dataset operation and registers
+  // the input function definitions. `InitThreadPool()' needs to be called
+  // before this method if we want to run the tasks in parallel.
+  Status InitFunctionLibraryRuntime(const std::vector<FunctionDef>& flib,
+                                    int cpu_num);
+
+  // Runs an operation producing outputs.
+  Status RunOpKernel(OpKernel* op_kernel, OpKernelContext* context);
+
+  // Checks that the size of `inputs` matches the requirement of the op kernel.
+  Status CheckOpKernelInput(const OpKernel& kernel,
+                            const gtl::InlinedVector<TensorValue, 4>& inputs);
+
+  // Creates a new context for running the dataset operation.
+  Status CreateOpKernelContext(OpKernel* kernel,
+                               gtl::InlinedVector<TensorValue, 4>* inputs,
+                               std::unique_ptr<OpKernelContext>* context);
+
+  // Creates a new iterator context for iterating the dataset.
+  Status CreateIteratorContext(
+      OpKernelContext* const op_context,
+      std::unique_ptr<IteratorContext>* iterator_context);
+
+  // Creates a new serialization context for serializing the dataset and
+  // iterator.
+  Status CreateSerializationContext(
+      std::unique_ptr<SerializationContext>* context);
+
+  // Adds an arrayslice of data into the input vector. `input_types` describes
+  // the required data type for each input tensor. `shape` and `data` describes
+  // the shape and values of the current input tensor. `T` specifies the dtype
+  // of the input data.
+  template <typename T>
+  Status AddDatasetInputFromArray(gtl::InlinedVector<TensorValue, 4>* inputs,
+                                  DataTypeVector input_types,
+                                  const TensorShape& shape,
+                                  const gtl::ArraySlice<T>& data) {
+    TF_RETURN_IF_ERROR(
+        AddDatasetInput(inputs, input_types, DataTypeToEnum<T>::v(), shape));
+    test::FillValues<T>(inputs->back().tensor, data);
+    return Status::OK();
+  }
+
+ private:
+  // Adds an empty tensor with the specified dtype and shape to the input
+  // vector.
+  Status AddDatasetInput(gtl::InlinedVector<TensorValue, 4>* inputs,
+                         DataTypeVector input_types, DataType dtype,
+                         const TensorShape& shape);
+
+ protected:
+  std::unique_ptr<Device> device_;
+  DeviceType device_type_;
+  Allocator* allocator_;  // Owned by `AllocatorFactoryRegistry`.
+  std::vector<AllocatorAttributes> allocator_attrs_;
+  std::unique_ptr<ScopedStepContainer> step_container_;
+
+  std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
+  FunctionLibraryRuntime* flr_;  // Owned by `pflr_`.
+  std::unique_ptr<FunctionHandleCache> function_handle_cache_;
+  std::function<void(std::function<void()>)> runner_;
+  std::unique_ptr<DeviceMgr> device_mgr_;
+  std::unique_ptr<FunctionLibraryDefinition> lib_def_;
+  std::unique_ptr<OpKernelContext::Params> params_;
+  std::unique_ptr<checkpoint::TensorSliceReaderCacheWrapper>
+      slice_reader_cache_;
+  std::unique_ptr<thread::ThreadPool> thread_pool_;
+  std::vector<std::unique_ptr<Tensor>> tensors_;  // Owns tensors.
+  mutex lock_for_refs_;  // Used as the Mutex for inputs added as refs.
+};
+
+}  // namespace data
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_DATASET_TEST_BASE_H_
diff --git a/tensorflow/core/kernels/data/dataset_utils.cc b/tensorflow/core/kernels/data/dataset_utils.cc
index 4d92d314d3d207d12310bb744b5601ad922bc570..9def8c916186dc5c7d2b260a9df8e0430d9aa43a 100644
--- a/tensorflow/core/kernels/data/dataset_utils.cc
+++ b/tensorflow/core/kernels/data/dataset_utils.cc
@@ -141,5 +141,125 @@ Status VerifyShapesCompatible(const std::vector<PartialTensorShape>& expected,
   return Status::OK();
 }
 
+namespace {
+
+constexpr char kDelimiter[] = "@@";
+
+}  // namespace
+
+VariantTensorDataReader::VariantTensorDataReader(
+    const tensorflow::VariantTensorData* data)
+    : data_(data) {
+  string metadata;
+  data_->get_metadata(&metadata);
+  auto keys = str_util::Split(metadata, kDelimiter, str_util::SkipEmpty());
+  for (size_t i = 0; i < keys.size(); ++i) {
+    map_[keys[i]] = i;
+  }
+}
+
+Status VariantTensorDataReader::ReadScalar(StringPiece key, int64* val) {
+  return ReadScalarInternal(key, val);
+}
+
+Status VariantTensorDataReader::ReadScalar(StringPiece key, string* val) {
+  return ReadScalarInternal(key, val);
+}
+
+Status VariantTensorDataReader::ReadTensor(StringPiece key, Tensor* val) {
+  return ReadTensorInternal(key, val);
+}
+
+bool VariantTensorDataReader::Contains(StringPiece key) {
+  return map_.find(string(key)) != map_.end();
+}
+
+template <typename T>
+Status VariantTensorDataReader::ReadScalarInternal(StringPiece key, T* val) {
+  if (map_.find(string(key)) == map_.end()) {
+    return errors::NotFound(key);
+  }
+  *val = data_->tensors(map_[string(key)]).scalar<T>()();
+  return Status::OK();
+}
+
+Status VariantTensorDataReader::ReadTensorInternal(StringPiece key,
+                                                   Tensor* val) {
+  if (map_.find(string(key)) == map_.end()) {
+    return errors::NotFound(key);
+  }
+  *val = data_->tensors(map_[string(key)]);
+  return Status::OK();
+}
+
+Status VariantTensorDataWriter::WriteScalar(StringPiece key, const int64 val) {
+  return WriteScalarInternal(key, val);
+}
+
+Status VariantTensorDataWriter::WriteScalar(StringPiece key,
+                                            const string& val) {
+  return WriteScalarInternal(key, val);
+}
+
+Status VariantTensorDataWriter::WriteTensor(StringPiece key,
+                                            const Tensor& val) {
+  return WriteTensorInternal(key, val);
+}
+
+Status VariantTensorDataWriter::Flush() {
+  string metadata;
+  for (size_t i = 0; i < keys_.size(); ++i) {
+    strings::StrAppend(&metadata, kDelimiter, keys_[i]);
+  }
+  data_->set_metadata(metadata);
+  return Status::OK();
+}
+
+template <typename T>
+Status VariantTensorDataWriter::WriteScalarInternal(StringPiece key,
+                                                    const T& val) {
+  Tensor val_t = Tensor(DataTypeToEnum<T>::v(), TensorShape({}));
+  val_t.scalar<T>()() = val;
+  return WriteTensorInternal(key, val_t);
+}
+
+Status VariantTensorDataWriter::WriteTensorInternal(StringPiece key,
+                                                    const Tensor& val) {
+  DCHECK_EQ(key.find(kDelimiter), string::npos);
+  keys_.push_back(string(key));
+  *(data_->add_tensors()) = val;
+  return Status::OK();
+}
+
+Status AddToFunctionLibrary(FunctionLibraryDefinition* base,
+                            const FunctionLibraryDefinition& to_add) {
+  for (const auto& fn : to_add.ListFunctionNames()) {
+    if (auto found = base->Find(fn)) {
+      if (!OpDefEqual(found->signature(), to_add.Find(fn)->signature())) {
+        return errors::InvalidArgument("Cannot add function '", fn,
+                                       "' because a different function with "
+                                       "the same signature already exists.");
+      }
+      TF_RETURN_IF_ERROR(base->RemoveFunction(fn));
+    }
+  }
+  return base->AddLibrary(to_add);
+}
+
+Status AddToFunctionLibrary(FunctionLibraryDefinition* base,
+                            const FunctionDefLibrary& to_add) {
+  for (const auto& fd : to_add.function()) {
+    if (auto found = base->Find(fd.signature().name())) {
+      if (!OpDefEqual(found->signature(), fd.signature())) {
+        return errors::InvalidArgument("Cannot add function '",
+                                       fd.signature().name(),
+                                       "' because a different function with "
+                                       "the same signature already exists.");
+      }
+      TF_RETURN_IF_ERROR(base->RemoveFunction(fd.signature().name()));
+    }
+  }
+  return base->AddLibrary(to_add);
+}
 }  // namespace data
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/dataset_utils.h b/tensorflow/core/kernels/data/dataset_utils.h
index 23a3d93ed160c95099a5c8ddb237b4c055a1845c..d85e87ca098bece338aed6c1addc0f52030bdcc6 100644
--- a/tensorflow/core/kernels/data/dataset_utils.h
+++ b/tensorflow/core/kernels/data/dataset_utils.h
@@ -57,6 +57,54 @@ Status VerifyTypesMatch(const DataTypeVector& expected,
 Status VerifyShapesCompatible(const std::vector<PartialTensorShape>& expected,
                               const std::vector<PartialTensorShape>& received);
 
+// Helper class for reading data from a VariantTensorData object.
+class VariantTensorDataReader : public IteratorStateReader {
+ public:
+  explicit VariantTensorDataReader(const VariantTensorData* data);
+
+  // Returns OK iff the initialization was successful.
+  Status ReadScalar(StringPiece key, int64* val) override;
+  Status ReadScalar(StringPiece key, string* val) override;
+  Status ReadTensor(StringPiece key, Tensor* val) override;
+  bool Contains(StringPiece key) override;
+
+ private:
+  template <typename T>
+  Status ReadScalarInternal(StringPiece key, T* val);
+  Status ReadTensorInternal(StringPiece key, Tensor* val);
+
+  std::map<string, size_t> map_;
+  const VariantTensorData* data_;  // Not owned.
+};
+
+// Helper class for writing data to a VariantTensorData object.
+class VariantTensorDataWriter : public IteratorStateWriter {
+ public:
+  // Does not take ownership of data.
+  explicit VariantTensorDataWriter(VariantTensorData* data) : data_(data) {}
+  Status WriteScalar(StringPiece key, const int64 val) override;
+  Status WriteScalar(StringPiece key, const string& val) override;
+  Status WriteTensor(StringPiece key, const Tensor& val) override;
+
+  // Writes the metadata to `data_`.
+  Status Flush();
+
+ private:
+  template <typename T>
+  Status WriteScalarInternal(StringPiece key, const T& val);
+  Status WriteTensorInternal(StringPiece key, const Tensor& val);
+
+  VariantTensorData* data_;
+  std::vector<string> keys_;
+};
+
+// Adds the functions in `to_add` to `base`. If a function with a matching
+// signature already exists in `base`, replaces it with the function from
+// `to_add`.
+Status AddToFunctionLibrary(FunctionLibraryDefinition* base,
+                            const FunctionLibraryDefinition& to_add);
+Status AddToFunctionLibrary(FunctionLibraryDefinition* base,
+                            const FunctionDefLibrary& to_add);
 }  // namespace data
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/data/dataset_utils_test.cc b/tensorflow/core/kernels/data/dataset_utils_test.cc
index 43295b8ebb8f9df2acae8e17162f2d307dd4d9c5..bddd2d455e55f6e2656953034a27eb351128f534 100644
--- a/tensorflow/core/kernels/data/dataset_utils_test.cc
+++ b/tensorflow/core/kernels/data/dataset_utils_test.cc
@@ -14,14 +14,17 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/kernels/data/dataset_utils.h"
-
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
 namespace data {
 namespace {
 
-TEST(DatasetUtils, ComputeMoveVector) {
+TEST(DatasetUtilsTest, ComputeMoveVector) {
   struct TestCase {
     std::vector<int> indices;
     std::vector<bool> expected;
@@ -41,6 +44,125 @@ TEST(DatasetUtils, ComputeMoveVector) {
   }
 }
 
+TEST(DatasetUtilsTest, VariantTensorDataRoundtrip) {
+  VariantTensorData data;
+  VariantTensorDataWriter writer(&data);
+  TF_ASSERT_OK(writer.WriteScalar("Int64", 24));
+  Tensor input_tensor(DT_FLOAT, {1});
+  input_tensor.flat<float>()(0) = 2.0f;
+  TF_ASSERT_OK(writer.WriteTensor("Tensor", input_tensor));
+  TF_ASSERT_OK(writer.Flush());
+
+  VariantTensorDataReader reader(&data);
+  int64 val_int64;
+  TF_ASSERT_OK(reader.ReadScalar("Int64", &val_int64));
+  EXPECT_EQ(val_int64, 24);
+  Tensor val_tensor;
+  TF_ASSERT_OK(reader.ReadTensor("Tensor", &val_tensor));
+  EXPECT_EQ(input_tensor.NumElements(), val_tensor.NumElements());
+  EXPECT_EQ(input_tensor.flat<float>()(0), val_tensor.flat<float>()(0));
+}
+
+TEST(DatasetUtilsTest, VariantTensorDataNonExistentKey) {
+  VariantTensorData data;
+  strings::StrAppend(&data.metadata_, "key1", "@@");
+  data.tensors_.push_back(Tensor(DT_INT64, {1}));
+  VariantTensorDataReader reader(&data);
+  int64 val_int64;
+  string val_string;
+  Tensor val_tensor;
+  EXPECT_EQ(error::NOT_FOUND,
+            reader.ReadScalar("NonExistentKey", &val_int64).code());
+  EXPECT_EQ(error::NOT_FOUND,
+            reader.ReadScalar("NonExistentKey", &val_string).code());
+  EXPECT_EQ(error::NOT_FOUND,
+            reader.ReadTensor("NonExistentKey", &val_tensor).code());
+}
+
+TEST(DatasetUtilsTest, AddToFunctionLibrary) {
+  auto make_fn_a = [](const string& fn_name) {
+    return FunctionDefHelper::Create(
+        /*function_name=*/fn_name,
+        /*in_def=*/{"arg: int64"},
+        /*out_def=*/{"ret: int64"},
+        /*attr_def=*/{},
+        /*node_def=*/{{{"node"}, "Identity", {"arg"}, {{"T", DT_INT64}}}},
+        /*ret_def=*/{{"ret", "node:output:0"}});
+  };
+
+  auto make_fn_b = [](const string& fn_name) {
+    return FunctionDefHelper::Create(
+        /*function_name=*/fn_name,
+        /*in_def=*/{"arg: int64"},
+        /*out_def=*/{"ret: int64"},
+        /*attr_def=*/{},
+        /*node_def=*/
+        {{{"node"}, "Identity", {"arg"}, {{"T", DT_INT64}}},
+         {{"node2"}, "Identity", {"node:output:0"}, {{"T", DT_INT64}}}},
+        /*ret_def=*/{{"ret", "node2:output:0"}});
+  };
+
+  FunctionDefLibrary fdef_base;
+  *fdef_base.add_function() = make_fn_a("0");
+  *fdef_base.add_function() = make_fn_a("1");
+  *fdef_base.add_function() = make_fn_a("2");
+
+  FunctionDefLibrary fdef_to_add;
+  *fdef_to_add.add_function() = make_fn_b("0");  // Override
+  *fdef_to_add.add_function() = make_fn_a("1");  // Do nothing
+  *fdef_to_add.add_function() = make_fn_b("3");  // Add new function
+
+  FunctionLibraryDefinition flib_0(OpRegistry::Global(), fdef_base);
+  TF_ASSERT_OK(AddToFunctionLibrary(&flib_0, fdef_to_add));
+
+  FunctionLibraryDefinition flib_1(OpRegistry::Global(), fdef_base);
+  FunctionLibraryDefinition flib_to_add(OpRegistry::Global(), fdef_to_add);
+  TF_ASSERT_OK(AddToFunctionLibrary(&flib_1, flib_to_add));
+
+  for (const auto& flib : {flib_0, flib_1}) {
+    EXPECT_TRUE(FunctionDefsEqual(*flib.Find("0"), make_fn_b("0")));
+    EXPECT_TRUE(FunctionDefsEqual(*flib.Find("1"), make_fn_a("1")));
+    EXPECT_TRUE(FunctionDefsEqual(*flib.Find("2"), make_fn_a("2")));
+    EXPECT_TRUE(FunctionDefsEqual(*flib.Find("3"), make_fn_b("3")));
+  }
+}
+
+TEST(DatasetUtilsTest, AddToFunctionLibraryWithConflictingSignatures) {
+  FunctionDefLibrary fdef_base;
+  *fdef_base.add_function() = FunctionDefHelper::Create(
+      /*function_name=*/"0",
+      /*in_def=*/{"arg: int64"},
+      /*out_def=*/{"ret: int64"},
+      /*attr_def=*/{},
+      /*node_def=*/{},
+      /*ret_def=*/{{"ret", "arg"}});
+
+  FunctionDefLibrary fdef_to_add;
+  *fdef_to_add.add_function() = FunctionDefHelper::Create(
+      /*function_name=*/"0",
+      /*in_def=*/{"arg: int64"},
+      /*out_def=*/{"ret: int64", "ret2: int64"},
+      /*attr_def=*/{},
+      /*node_def=*/{},
+      /*ret_def=*/{{"ret", "arg"}, {"ret2", "arg"}});
+
+  FunctionLibraryDefinition flib_0(OpRegistry::Global(), fdef_base);
+  Status s = AddToFunctionLibrary(&flib_0, fdef_to_add);
+  EXPECT_EQ(error::Code::INVALID_ARGUMENT, s.code());
+  EXPECT_EQ(
+      "Cannot add function '0' because a different function with the same "
+      "signature already exists.",
+      s.error_message());
+
+  FunctionLibraryDefinition flib_1(OpRegistry::Global(), fdef_base);
+  FunctionLibraryDefinition flib_to_add(OpRegistry::Global(), fdef_to_add);
+  s = AddToFunctionLibrary(&flib_1, flib_to_add);
+  EXPECT_EQ(error::Code::INVALID_ARGUMENT, s.code());
+  EXPECT_EQ(
+      "Cannot add function '0' because a different function with the same "
+      "signature already exists.",
+      s.error_message());
+}
 }  // namespace
 }  // namespace data
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/experimental/BUILD b/tensorflow/core/kernels/data/experimental/BUILD
index 7433303f77671cbf67a6365fb1d552edc7b471e0..4f59e63e37c71d48f250cb15bfae5e256753b914 100644
--- a/tensorflow/core/kernels/data/experimental/BUILD
+++ b/tensorflow/core/kernels/data/experimental/BUILD
@@ -119,11 +119,13 @@ tf_kernel_library(
     name = "map_and_batch_dataset_op",
     srcs = ["map_and_batch_dataset_op.cc"],
     deps = [
+        "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:nn_ops_op_lib",
         "//tensorflow/core/kernels:inplace_ops",
         "//tensorflow/core/kernels/data:captured_function",
         "//tensorflow/core/kernels/data:dataset_utils",
@@ -142,6 +144,18 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "choose_fastest_dataset_op",
+    srcs = ["choose_fastest_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:experimental_dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels/data:dataset",
+    ],
+)
+
 tf_kernel_library(
     name = "non_serializable_dataset_op",
     srcs = ["non_serializable_dataset_op.cc"],
@@ -156,11 +170,13 @@ tf_kernel_library(
     name = "numa_map_and_batch_dataset_op",
     srcs = ["numa_map_and_batch_dataset_op.cc"],
     deps = [
+        "//tensorflow/core:array_ops_op_lib",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:experimental_dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:nn_ops_op_lib",
         "//tensorflow/core/kernels:inplace_ops",
         "//tensorflow/core/kernels/data:captured_function",
         "@com_google_absl//absl/memory",
@@ -187,6 +203,7 @@ tf_kernel_library(
     deps = [
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
+        "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core/kernels/data:parallel_map_iterator",
     ],
 )
@@ -214,6 +231,21 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "rebatch_dataset_op",
+    srcs = ["rebatch_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler/optimizers/data:rebatch",
+        "//tensorflow/core/kernels/data:graph_rewrite_dataset",
+    ],
+)
+
 tf_kernel_library(
     name = "scan_dataset_op",
     srcs = ["scan_dataset_op.cc"],
@@ -293,6 +325,20 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "take_while_dataset_op",
+    srcs = ["take_while_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels/data:captured_function",
+        "//tensorflow/core/kernels/data:dataset_utils",
+    ],
+)
+
 tf_kernel_library(
     name = "to_tf_record_op",
     srcs = ["to_tf_record_op.cc"],
@@ -342,6 +388,7 @@ tf_kernel_library(
     name = "dataset_kernels",
     deps = [
         ":assert_next_dataset_op",
+        ":choose_fastest_dataset_op",
         ":csv_dataset_op",
         ":dense_to_sparse_batch_dataset_op",
         ":directed_interleave_dataset_op",
@@ -358,6 +405,7 @@ tf_kernel_library(
         ":parse_example_dataset_op",
         ":prefetching_kernels",
         ":random_dataset_op",
+        ":rebatch_dataset_op",
         ":scan_dataset_op",
         ":set_stats_aggregator_dataset_op",
         ":sleep_dataset_op",
@@ -365,6 +413,7 @@ tf_kernel_library(
         ":sql_dataset_op",
         ":stats_aggregator_ops",
         ":stats_dataset_ops",
+        ":take_while_dataset_op",
         ":threadpool_dataset_op",
         ":to_tf_record_op",
         ":unbatch_dataset_op",
diff --git a/tensorflow/core/kernels/data/experimental/assert_next_dataset_op.cc b/tensorflow/core/kernels/data/experimental/assert_next_dataset_op.cc
index 3e87f484b940b336ed68099df7427250a4304207..eb547133609a828f770ff5dc0acc1559f25eb3d2 100644
--- a/tensorflow/core/kernels/data/experimental/assert_next_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/assert_next_dataset_op.cc
@@ -61,8 +61,8 @@ class AssertNextDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::Assert")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::AssertNext")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/experimental/choose_fastest_dataset_op.cc b/tensorflow/core/kernels/data/experimental/choose_fastest_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e63208f26a93d2a6b5fc265355226acb71b01bd0
--- /dev/null
+++ b/tensorflow/core/kernels/data/experimental/choose_fastest_dataset_op.cc
@@ -0,0 +1,351 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/histogram/histogram.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+static const double kPercentile = 90.0;
+
+class ChooseFastestDatasetOp : public DatasetOpKernel {
+ public:
+  explicit ChooseFastestDatasetOp(OpKernelConstruction* ctx)
+      : DatasetOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("num_experiments", &num_experiments_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+  }
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override {
+    OpInputList input_list;
+    OP_REQUIRES_OK(ctx, ctx->input_list("input_datasets", &input_list));
+    OP_REQUIRES(
+        ctx, input_list.size() > 1,
+        errors::InvalidArgument(
+            "ChooseFastestDataset must have at least two input datasets."));
+
+    std::vector<DatasetBase*> inputs;
+    inputs.reserve(input_list.size());
+    for (const auto& tensor : input_list) {
+      DatasetBase* input;
+      OP_REQUIRES_OK(ctx, GetDatasetFromVariantTensor(tensor, &input));
+      inputs.push_back(input);
+    }
+
+    for (size_t i = 1, num_inputs = inputs.size(); i < num_inputs; ++i) {
+      OP_REQUIRES(
+          ctx, inputs[i]->output_dtypes() == output_types_,
+          errors::InvalidArgument(
+              "All inputs to ChooseFastestDataset "
+              "must have the same output types. Input ",
+              i, " has output types: ",
+              DataTypeVectorString(inputs[i]->output_dtypes()),
+              ". Expected: ", DataTypeVectorString(output_types_), "."));
+    }
+
+    // Merge the output shapes of all the input datasets, returning an
+    // error if any of them are incompatible.
+    for (size_t i = 1, num_inputs = inputs.size(); i < num_inputs; ++i) {
+      OP_REQUIRES(
+          ctx, inputs[i]->output_shapes().size() == output_shapes_.size(),
+          errors::InvalidArgument(
+              "All inputs to ChooseFastestDataset must have compatible outputs."
+              " Input ",
+              i, " has ", inputs[i]->output_shapes().size(),
+              " components. Expected to have ", output_shapes_.size(),
+              " components."));
+      for (size_t j = 0, num_components = output_shapes_.size();
+           j < num_components; ++j) {
+        PartialTensorShape result;
+        OP_REQUIRES(ctx,
+                    output_shapes_[j]
+                        .MergeWith(inputs[i]->output_shapes().at(j), &result)
+                        .ok(),
+                    errors::InvalidArgument(
+                        "All inputs to ChooseFastestDataset must have "
+                        "compatible output shapes. Component ",
+                        j, " of input ", i,
+                        " has shape: ", inputs[i]->output_shapes().at(j),
+                        ". Expected to be compatible with shape: ",
+                        output_shapes_[j], "."));
+        output_shapes_[j] = std::move(result);
+      }
+    }
+
+    int64 cardinality = inputs[0]->Cardinality();
+    for (size_t i = 1, num_inputs = inputs.size(); i < num_inputs; ++i) {
+      if (cardinality == kUnknownCardinality) {
+        cardinality = inputs[i]->Cardinality();
+      } else {
+        OP_REQUIRES(
+            ctx,
+            inputs[i]->Cardinality() == cardinality ||
+                inputs[i]->Cardinality() == kUnknownCardinality,
+            errors::InvalidArgument(
+                "All inputs to ChooseFastestDataset must have compatible "
+                "cardinalities. Input ",
+                i, " has cardinality: ", inputs[i]->Cardinality(),
+                ", while all prior inputs have cardinality: ", cardinality,
+                "."));
+      }
+    }
+    *output = new Dataset(ctx, std::move(inputs), output_types_, output_shapes_,
+                          cardinality, num_experiments_);
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, std::vector<DatasetBase*> inputs,
+            const DataTypeVector& output_types,
+            const std::vector<PartialTensorShape>& output_shapes,
+            int64 cardinality, int64 num_experiments)
+        : DatasetBase(DatasetContext(ctx)),
+          inputs_(std::move(inputs)),
+          output_types_(output_types),
+          output_shapes_(output_shapes),
+          cardinality_(cardinality),
+          num_experiments_(num_experiments) {
+      for (auto input : inputs_) {
+        input->Ref();
+      }
+    }
+
+    ~Dataset() override {
+      for (auto input : inputs_) {
+        input->Unref();
+      }
+    }
+
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
+        const string& prefix) const override {
+      return absl::make_unique<ChooseFastestIterator>(
+          ChooseFastestIterator::Params{
+              this, strings::StrCat(prefix, "::ChooseFastest")});
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return output_types_;
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return output_shapes_;
+    }
+
+    string DebugString() const override {
+      return "ChooseFastestDatasetOp::Dataset";
+    }
+
+    int64 Cardinality() const override { return cardinality_; }
+
+   protected:
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      std::vector<Node*> input_nodes;
+      input_nodes.reserve(inputs_.size());
+      for (const auto& input : inputs_) {
+        Node* input_node;
+        TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input, &input_node));
+        input_nodes.push_back(input_node);
+      }
+      AttrValue num_experiments_attr;
+      b->BuildAttrValue(num_experiments_, &num_experiments_attr);
+      return b->AddDataset(
+          this, {}, {std::make_pair(0, input_nodes)},
+          {std::make_pair("num_experiments", std::move(num_experiments_attr))},
+          output);
+    }
+
+   private:
+    class ChooseFastestIterator : public DatasetIterator<Dataset> {
+     public:
+      explicit ChooseFastestIterator(const Params& params)
+          : DatasetIterator<Dataset>(params),
+            histograms_(dataset()->inputs_.size()) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        mutex_lock l(mu_);
+        input_impls_.resize(dataset()->inputs_.size());
+        for (size_t i = 0, num_inputs = dataset()->inputs_.size();
+             i < num_inputs; ++i) {
+          TF_RETURN_IF_ERROR(dataset()->inputs_[i]->MakeIterator(
+              ctx, strings::StrCat(prefix(), "_", i), &input_impls_[i]));
+        }
+        return Status::OK();
+      }
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+
+        // The first num_experiments_ iterations, we fire up a thread for
+        // each input that calls its GetNext function and records the time
+        // taken. We only return when all the threads have completed.
+        if (experiment_counter_ < dataset()->num_experiments_) {
+          experiment_counter_++;
+          std::vector<ThreadInfo> threads = StartThreads(ctx);
+          for (const auto& thread : threads) {
+            thread.result->notification.WaitForNotification();
+          }
+
+          *out_tensors = std::move(threads[0].result->out_tensors);
+          *end_of_sequence = threads[0].result->end_of_sequence;
+
+          if (experiment_counter_ == dataset()->num_experiments_) {
+            SelectFastestInputIndex();
+          }
+          return threads[0].result->status;
+        }
+        return input_impls_[fastest_index_]->GetNext(ctx, out_tensors,
+                                                     end_of_sequence);
+      }
+
+     protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args), /*ratio=*/1);
+      }
+
+      // TODO(rachelim): Save and restore histogram state as well. Currently,
+      // if an iterator is saved and restored, the histograms start recording
+      // from scratch.
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        if (input_impls_.empty()) {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("input_impls_empty"), ""));
+        } else {
+          for (auto& input_impl : input_impls_) {
+            TF_RETURN_IF_ERROR(SaveInput(writer, input_impl));
+          }
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("experiment_counter"),
+                                               experiment_counter_));
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("fastest_index"), fastest_index_));
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        if (reader->Contains(full_name("input_impls_empty"))) {
+          input_impls_.clear();
+        } else {
+          DCHECK_EQ(input_impls_.size(), dataset()->inputs_.size());
+          for (auto& input_impl : input_impls_) {
+            TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl));
+          }
+        }
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("experiment_counter"),
+                                              &experiment_counter_));
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("fastest_index"), &fastest_index_));
+        return Status::OK();
+      }
+
+     private:
+      struct InvocationResult {
+        Notification notification;
+        Status status;
+        bool end_of_sequence;
+        std::vector<Tensor> out_tensors;
+      };
+
+      struct ThreadInfo {
+        std::unique_ptr<InvocationResult> result;
+        std::unique_ptr<Thread> thread;
+      };
+
+      std::vector<std::unique_ptr<IteratorBase>> input_impls_;
+      // For tracking the time taken for each input's iterations.
+      std::vector<histogram::Histogram> histograms_;
+
+      mutex mu_;
+      int64 experiment_counter_ GUARDED_BY(mu_) = 0;
+      int64 fastest_index_ = -1;
+
+      std::vector<ThreadInfo> StartThreads(IteratorContext* ctx)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        std::vector<ThreadInfo> threads(dataset()->inputs_.size());
+        for (size_t i = 0, num_inputs = dataset()->inputs_.size();
+             i < num_inputs; ++i) {
+          threads[i].result = absl::make_unique<InvocationResult>();
+          threads[i].thread.reset(ctx->env()->StartThread(
+              {}, strings::StrCat("tf_data_merge_", i),
+              std::bind(&ChooseFastestIterator::RunnerThread, this, ctx,
+                        threads[i].result.get(), i)));
+        }
+        return threads;
+      }
+
+      void RunnerThread(IteratorContext* ctx, InvocationResult* result, int i) {
+        int64 start = Env::Default()->NowNanos();
+        Status s = input_impls_[i]->GetNext(ctx, &result->out_tensors,
+                                            &result->end_of_sequence);
+        histograms_[i].Add(
+            static_cast<double>(Env::Default()->NowNanos() - start));
+
+        result->status = s;
+        result->notification.Notify();
+      }
+
+      // Select the fastest input to use based on the histograms of timings
+      // of the completed threads. The input with the best 90th percentile
+      // iteration time is selected.
+      void SelectFastestInputIndex() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        fastest_index_ = 0;
+
+        double best_percentile = histograms_[0].Percentile(kPercentile);
+        for (size_t i = 1, num_inputs = histograms_.size(); i < num_inputs;
+             ++i) {
+          double percentile = histograms_[i].Percentile(kPercentile);
+          if (percentile <= best_percentile) {
+            best_percentile = percentile;
+            fastest_index_ = i;
+          }
+        }
+      }
+    };  // class Iterator
+
+    const std::vector<DatasetBase*> inputs_;
+    const DataTypeVector output_types_;
+    const std::vector<PartialTensorShape> output_shapes_;
+    const int64 cardinality_;
+    const int64 num_experiments_;
+  };  // class Dataset
+
+  int64 num_experiments_;
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
+};  // class ChooseFastestDatasetOp
+
+// Register the kernel implementation for ChooseFastestDataset.
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalChooseFastestDataset").Device(DEVICE_CPU),
+    ChooseFastestDatasetOp);
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/experimental/csv_dataset_op.cc b/tensorflow/core/kernels/data/experimental/csv_dataset_op.cc
index f6f58fc430b41d05bccdc413c00151130bf7d36d..4435c2a131316be0b3b36fd246e40726fbd8c4bb 100644
--- a/tensorflow/core/kernels/data/experimental/csv_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/csv_dataset_op.cc
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-
-// See docs in ../ops/parsing_ops.cc.
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/op.h"
@@ -159,8 +157,8 @@ class CSVDatasetOp : public DatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::CSV")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::CSV")});
     }
 
     const DataTypeVector& output_dtypes() const override { return out_type_; }
diff --git a/tensorflow/core/kernels/data/experimental/dense_to_sparse_batch_dataset_op.cc b/tensorflow/core/kernels/data/experimental/dense_to_sparse_batch_dataset_op.cc
index 97e64dd7444e93660afa6defa31314c909a31c7b..31f081a72773e5d0df6f51781d3f42694a986df5 100644
--- a/tensorflow/core/kernels/data/experimental/dense_to_sparse_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/dense_to_sparse_batch_dataset_op.cc
@@ -96,8 +96,8 @@ class DenseToSparseBatchDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(new Iterator(
-          {this, strings::StrCat(prefix, "::DenseToSparseBatch")}));
+      return absl::make_unique<Iterator>(typename Iterator::Params{
+          this, strings::StrCat(prefix, "::DenseToSparseBatch")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/experimental/directed_interleave_dataset_op.cc b/tensorflow/core/kernels/data/experimental/directed_interleave_dataset_op.cc
index d8bb696167a7971ac21db4b449508946a0c7f11b..f55718a006436d0b7253607964dd44ab04690884 100644
--- a/tensorflow/core/kernels/data/experimental/directed_interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/directed_interleave_dataset_op.cc
@@ -93,8 +93,8 @@ class DirectedInterleaveDatasetOp : public DatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(new Iterator(
-          {this, strings::StrCat(prefix, "::DirectedInterleave")}));
+      return absl::make_unique<Iterator>(Iterator::Params{
+          this, strings::StrCat(prefix, "::DirectedInterleave")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/experimental/group_by_reducer_dataset_op.cc b/tensorflow/core/kernels/data/experimental/group_by_reducer_dataset_op.cc
index 1c298cfdd6a3a39aabd81cb5226e03b1c3e3de63..56159593a9c8e789ae47e874825943c54e816c24 100644
--- a/tensorflow/core/kernels/data/experimental/group_by_reducer_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/group_by_reducer_dataset_op.cc
@@ -90,8 +90,8 @@ class GroupByReducerDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::GroupByReducer")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::GroupByReducer")});
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -119,25 +119,25 @@ class GroupByReducerDatasetOp : public UnaryDatasetOpKernel {
       std::vector<Node*> key_func_other_arguments_node;
       DataTypeVector key_func_other_arguments_types;
       TF_RETURN_IF_ERROR(OtherArgumentsNodeAndType(
-          b, captured_key_func_, &key_func_other_arguments_node,
+          ctx, b, captured_key_func_, &key_func_other_arguments_node,
           &key_func_other_arguments_types));
 
       std::vector<Node*> init_func_other_arguments_node;
       DataTypeVector init_func_other_arguments_types;
       TF_RETURN_IF_ERROR(OtherArgumentsNodeAndType(
-          b, captured_init_func_, &init_func_other_arguments_node,
+          ctx, b, captured_init_func_, &init_func_other_arguments_node,
           &init_func_other_arguments_types));
 
       std::vector<Node*> reduce_func_other_arguments_node;
       DataTypeVector reduce_func_other_arguments_types;
       TF_RETURN_IF_ERROR(OtherArgumentsNodeAndType(
-          b, captured_reduce_func_, &reduce_func_other_arguments_node,
+          ctx, b, captured_reduce_func_, &reduce_func_other_arguments_node,
           &reduce_func_other_arguments_types));
 
       std::vector<Node*> finalize_func_other_arguments_node;
       DataTypeVector finalize_func_other_arguments_types;
       TF_RETURN_IF_ERROR(OtherArgumentsNodeAndType(
-          b, captured_finalize_func_, &finalize_func_other_arguments_node,
+          ctx, b, captured_finalize_func_, &finalize_func_other_arguments_node,
           &finalize_func_other_arguments_types));
 
       AttrValue key_func;
@@ -406,7 +406,7 @@ class GroupByReducerDatasetOp : public UnaryDatasetOpKernel {
     }
 
     Status OtherArgumentsNodeAndType(
-        DatasetGraphDefBuilder* b,
+        SerializationContext* ctx, DatasetGraphDefBuilder* b,
         const std::unique_ptr<CapturedFunction>& captured_func,
         std::vector<Node*>* other_arguments_node,
         DataTypeVector* other_arguments_types) const {
@@ -414,7 +414,13 @@ class GroupByReducerDatasetOp : public UnaryDatasetOpKernel {
       other_arguments_types->reserve(captured_func->captured_inputs().size());
       for (const Tensor& t : captured_func->captured_inputs()) {
         Node* node;
-        TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        DatasetBase* input;
+        Status s = GetDatasetFromVariantTensor(t, &input);
+        if (s.ok()) {
+          TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input, &node));
+        } else {
+          TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        }
         other_arguments_node->emplace_back(node);
         other_arguments_types->emplace_back(t.dtype());
       }
diff --git a/tensorflow/core/kernels/data/experimental/group_by_window_dataset_op.cc b/tensorflow/core/kernels/data/experimental/group_by_window_dataset_op.cc
index 98603d5a732c8143db61535e6704d6a7b214413c..49122807b28aae48b77c1ead3be1f9e4021730ec 100644
--- a/tensorflow/core/kernels/data/experimental/group_by_window_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/group_by_window_dataset_op.cc
@@ -89,8 +89,8 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::GroupByWindow")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::GroupByWindow")});
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -117,20 +117,21 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
       std::vector<Node*> key_func_other_arguments_node;
       DataTypeVector key_func_other_arguments_types;
       TF_RETURN_IF_ERROR(OtherArgumentsNodeAndType(
-          b, captured_key_func_, &key_func_other_arguments_node,
+          ctx, b, captured_key_func_, &key_func_other_arguments_node,
           &key_func_other_arguments_types));
 
       std::vector<Node*> reduce_func_other_arguments_node;
       DataTypeVector reduce_func_other_arguments_types;
       TF_RETURN_IF_ERROR(OtherArgumentsNodeAndType(
-          b, captured_reduce_func_, &reduce_func_other_arguments_node,
+          ctx, b, captured_reduce_func_, &reduce_func_other_arguments_node,
           &reduce_func_other_arguments_types));
 
       std::vector<Node*> window_size_func_other_arguments_node;
       DataTypeVector window_size_func_other_arguments_types;
-      TF_RETURN_IF_ERROR(OtherArgumentsNodeAndType(
-          b, captured_window_size_func_, &window_size_func_other_arguments_node,
-          &window_size_func_other_arguments_types));
+      TF_RETURN_IF_ERROR(
+          OtherArgumentsNodeAndType(ctx, b, captured_window_size_func_,
+                                    &window_size_func_other_arguments_node,
+                                    &window_size_func_other_arguments_types));
 
       AttrValue key_func;
       b->BuildAttrValue(key_func_, &key_func);
@@ -490,7 +491,7 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
     };
 
     Status OtherArgumentsNodeAndType(
-        DatasetGraphDefBuilder* b,
+        SerializationContext* ctx, DatasetGraphDefBuilder* b,
         const std::unique_ptr<CapturedFunction>& captured_func,
         std::vector<Node*>* other_arguments_node,
         DataTypeVector* other_arguments_types) const {
@@ -498,7 +499,13 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
       other_arguments_types->reserve(captured_func->captured_inputs().size());
       for (const Tensor& t : captured_func->captured_inputs()) {
         Node* node;
-        TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        DatasetBase* input;
+        Status s = GetDatasetFromVariantTensor(t, &input);
+        if (s.ok()) {
+          TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input, &node));
+        } else {
+          TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        }
         other_arguments_node->emplace_back(node);
         other_arguments_types->emplace_back(t.dtype());
       }
diff --git a/tensorflow/core/kernels/data/experimental/ignore_errors_dataset_op.cc b/tensorflow/core/kernels/data/experimental/ignore_errors_dataset_op.cc
index d445d9c8094eec5c9a2bff9c45e2dc28e264d096..5e07bdb32ebac39d8ea0f8b987c77615ccb7028e 100644
--- a/tensorflow/core/kernels/data/experimental/ignore_errors_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/ignore_errors_dataset_op.cc
@@ -45,8 +45,8 @@ class IgnoreErrorsDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::IgnoreErrors")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::IgnoreErrors")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/experimental/indexed_dataset_op.cc b/tensorflow/core/kernels/data/experimental/indexed_dataset_op.cc
index a07eaebdf9d645fba51945d7bd3e79b72b5e5dc2..e75e6e4b80bce5dd286ed297c1d645adcdc37a4b 100644
--- a/tensorflow/core/kernels/data/experimental/indexed_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/indexed_dataset_op.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_mgr.h"
@@ -106,7 +105,7 @@ class MaterializedDatasetResource : public ResourceBase {
       const std::vector<PartialTensorShape>& output_shapes)
       : output_dtypes_(output_dtypes), output_shapes_(output_shapes) {}
 
-  string DebugString() override {
+  string DebugString() const override {
     return "Materialized IndexedDataset resource";
   }
 
@@ -424,7 +423,7 @@ class IdentityIndexedDatasetOp : public IndexedDatasetOpKernel {
 
     Status MaterializeDataset(
         std::shared_ptr<MaterializedIndexedDataset>* materialized) override {
-      materialized->reset(new Materialized(this));
+      (*materialized) = std::make_shared<Materialized>(this);
       return Status::OK();
     }
 
@@ -441,8 +440,8 @@ class IdentityIndexedDatasetOp : public IndexedDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(new Iterator(
-          {this, strings::StrCat(prefix, "::IdentityIndexedDataset")}));
+      return absl::make_unique<Iterator>(Iterator::Params{
+          this, strings::StrCat(prefix, "::IdentityIndexedDataset")});
     }
 
     string DebugString() const override {
diff --git a/tensorflow/core/kernels/data/experimental/lmdb_dataset_op.cc b/tensorflow/core/kernels/data/experimental/lmdb_dataset_op.cc
index 6248eb775e481cc5f6940b5c2131d4c963186af5..cf900f133612e3af5b06a8f599558b3caa5ff47a 100644
--- a/tensorflow/core/kernels/data/experimental/lmdb_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/lmdb_dataset_op.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-
 #include <sys/stat.h>
 
 #include "tensorflow/core/framework/dataset.h"
@@ -52,8 +51,8 @@ class LMDBDatasetOp : public DatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::LMDB")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::LMDB")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc
index d86c3a1a63dff8c9b0c4c1ea9bfbced6e3ddbf7e..77c1749632340a97e58e39a592b374f2ac6148bf 100644
--- a/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <utility>
 
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/metrics.h"
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/stats_aggregator.h"
@@ -32,14 +33,15 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/tracing.h"
-#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
 namespace {
 
+constexpr char kDatasetName[] = "MapAndBatch";
+
 // Maximum number of batch results to buffer.
-const int64 kMaxBatchResults = 16;
+constexpr int64 kMaxBatchResults = 16;
 
 // See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
@@ -71,9 +73,10 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
     int64 num_parallel_calls;
     OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "num_parallel_calls",
                                             &num_parallel_calls));
-    OP_REQUIRES(ctx, num_parallel_calls > 0 || num_parallel_calls == kAutoTune,
-                errors::InvalidArgument(
-                    "num_parallel_calls must be greater than zero."));
+    OP_REQUIRES(
+        ctx, num_parallel_calls > 0 || num_parallel_calls == model::kAutoTune,
+        errors::InvalidArgument(
+            "num_parallel_calls must be greater than zero."));
 
     bool drop_remainder;
     OP_REQUIRES_OK(ctx,
@@ -126,6 +129,10 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       };
     }
 
+    if (num_parallel_calls == model::kAutoTune) {
+      metrics::RecordTFDataAutotune(kDatasetName);
+    }
+
     *output = new Dataset(ctx, input, func_, batch_size, num_parallel_calls,
                           drop_remainder, output_types_, output_shapes_,
                           std::move(captured_func), &ctx->eigen_cpu_device(),
@@ -162,8 +169,8 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return MakeUnique<Iterator>(
-          Iterator::Params{this, strings::StrCat(prefix, "::MapAndBatch")},
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::", kDatasetName)},
           map_func_);
     }
 
@@ -209,7 +216,13 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       other_arguments.reserve(captured_func_->captured_inputs().size());
       for (const Tensor& t : captured_func_->captured_inputs()) {
         Node* node;
-        TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        DatasetBase* input;
+        Status s = GetDatasetFromVariantTensor(t, &input);
+        if (s.ok()) {
+          TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input, &node));
+        } else {
+          TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        }
         other_arguments.emplace_back(node);
         other_arguments_types.emplace_back(t.dtype());
       }
@@ -252,7 +265,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
                                             params.dataset->batch_size_)) {
         std::vector<string> components =
             str_util::Split(params.prefix, "::", str_util::SkipEmpty());
-        prefix_end_ = components.back();
+        key_prefix_ = components.back();
       }
 
       ~Iterator() override {
@@ -268,9 +281,8 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
 
       Status Initialize(IteratorContext* ctx) override {
         mutex_lock l(*mu_);
-        if (num_parallel_calls_->value == kAutoTune) {
+        if (num_parallel_calls_->value == model::kAutoTune) {
           num_parallel_calls_->value = ctx->runner_threadpool_size();
-          num_parallel_calls_->tunable = true;
         }
         TF_RETURN_IF_ERROR(
             dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
@@ -391,8 +403,9 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         const auto& stats_aggregator = ctx->stats_aggregator();
         if (stats_aggregator) {
           stats_aggregator->AddScalar(
-              strings::StrCat(prefix_end_, "::active_parallel_calls"),
-              static_cast<float>(num_calls_));
+              strings::StrCat(key_prefix_, "::thread_utilization"),
+              static_cast<float>(num_calls_) /
+                  static_cast<float>(num_parallel_calls_->value));
         }
         cond_var_->notify_all();
       }
@@ -437,7 +450,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
               result->UpdateStatus(allocate_status, offset);
             } else {
               for (size_t i = 0; i < return_values->size(); ++i) {
-                const Tensor& tensor = return_values->at(i);
+                Tensor& tensor = return_values->at(i);
                 Tensor* batch = &(result->output)[i];
                 if (tensor.NumElements() !=
                     (batch->NumElements() / batch->dim_size(0))) {
@@ -455,8 +468,8 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
                 // TODO(mrry): Add a version of DoParallelConcat that allows us
                 // to move `tensor` where possible, to speed up string tensor
                 // batching.
-                Status copy_status =
-                    batch_util::CopyElementToSlice(tensor, batch, offset);
+                Status copy_status = batch_util::CopyElementToSlice(
+                    std::move(tensor), batch, offset);
                 if (!copy_status.ok()) {
                   result->UpdateStatus(copy_status, offset);
                   break;
@@ -631,18 +644,13 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
               num_calls_++;
             }
           }
-          const std::shared_ptr<StatsAggregator>& stats_aggregator =
-              ctx->stats_aggregator();
+          const auto& stats_aggregator = ctx->stats_aggregator();
           if (stats_aggregator) {
             mutex_lock l(*mu_);
-            // TODO(shivaniagrawal): add `parallel_calls_utilization` in the
-            // monitoring code or as histogram at fixed time intervals.
-            stats_aggregator->AddScalar(
-                strings::StrCat(prefix_end_, "::active_parallel_calls"),
-                static_cast<float>(num_calls_));
             stats_aggregator->AddScalar(
-                strings::StrCat(prefix_end_, "::num_parallel_calls"),
-                static_cast<float>(num_parallel_calls_->value));
+                strings::StrCat(key_prefix_, "::thread_utilization"),
+                static_cast<float>(num_calls_) /
+                    static_cast<float>(num_parallel_calls_->value));
           }
           for (const auto& call : new_calls) {
             CallFunction(ctx, call.first, call.second);
@@ -797,7 +805,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       int64 waiting_ GUARDED_BY(*mu_) = 0;
       // Identifies the maximum number of batch results to store.
       int64 max_batch_results_ GUARDED_BY(*mu_);
-      string prefix_end_;
+      string key_prefix_;
       std::unique_ptr<InstantiatedCapturedFunction> instantiated_captured_func_;
     };
 
diff --git a/tensorflow/core/kernels/data/experimental/matching_files_dataset_op.cc b/tensorflow/core/kernels/data/experimental/matching_files_dataset_op.cc
index aa27a13416d093dd19475b97b51ac28489d4d177..381b9691d1434fc5f1f60d694d06f9accfa15829 100644
--- a/tensorflow/core/kernels/data/experimental/matching_files_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/matching_files_dataset_op.cc
@@ -61,8 +61,8 @@ class MatchingFilesDatasetOp : public DatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::MatchingFiles")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::MatchingFiles")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/experimental/non_serializable_dataset_op.cc b/tensorflow/core/kernels/data/experimental/non_serializable_dataset_op.cc
index 61811ea14eddc9f40987e12ce6343268da24a503..9ca8e33b946c9415d6908606d8165cccadd57258 100644
--- a/tensorflow/core/kernels/data/experimental/non_serializable_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/non_serializable_dataset_op.cc
@@ -53,8 +53,8 @@ class NonSerializableDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::NonSerializable")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::NonSerializable")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/experimental/numa_map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/experimental/numa_map_and_batch_dataset_op.cc
index 46233942f066de8fe799a958f164f8afa30e49ef..643b6460e8a838e5e9d6f35e789dc0a82e4f7cc5 100644
--- a/tensorflow/core/kernels/data/experimental/numa_map_and_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/numa_map_and_batch_dataset_op.cc
@@ -32,7 +32,6 @@ limitations under the License.
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/numa.h"
 #include "tensorflow/core/platform/tracing.h"
-#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -76,9 +75,10 @@ class NumaMapAndBatchDatasetOp : public UnaryDatasetOpKernel {
     int64 num_parallel_calls;
     OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "num_parallel_calls",
                                             &num_parallel_calls));
-    OP_REQUIRES(ctx, num_parallel_calls > 0 || num_parallel_calls == kAutoTune,
-                errors::InvalidArgument(
-                    "num_parallel_calls must be greater than zero."));
+    OP_REQUIRES(
+        ctx, num_parallel_calls > 0 || num_parallel_calls == model::kAutoTune,
+        errors::InvalidArgument(
+            "num_parallel_calls must be greater than zero."));
 
     bool drop_remainder;
     OP_REQUIRES_OK(ctx,
@@ -120,8 +120,8 @@ class NumaMapAndBatchDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::NumaMapAndBatch")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::NumaMapAndBatch")});
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -168,7 +168,13 @@ class NumaMapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       other_arguments.reserve(captured_func_->captured_inputs().size());
       for (const Tensor& t : captured_func_->captured_inputs()) {
         Node* node;
-        TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        DatasetBase* input;
+        Status s = GetDatasetFromVariantTensor(t, &input);
+        if (s.ok()) {
+          TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input, &node));
+        } else {
+          TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        }
         other_arguments.emplace_back(node);
         other_arguments_types.emplace_back(t.dtype());
       }
@@ -214,9 +220,8 @@ class NumaMapAndBatchDatasetOp : public UnaryDatasetOpKernel {
 
       Status Initialize(IteratorContext* ctx) override {
         mutex_lock l(*mu_);
-        if (num_parallel_calls_->value == kAutoTune) {
+        if (num_parallel_calls_->value == model::kAutoTune) {
           num_parallel_calls_->value = ctx->runner_threadpool_size();
-          num_parallel_calls_->tunable = true;
         }
         TF_RETURN_IF_ERROR(
             dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
@@ -315,7 +320,7 @@ class NumaMapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         }
         workers_.resize(num_workers);
         for (size_t i = 0; i < num_workers; ++i) {
-          workers_[i] = MakeUnique<NumaWorkerBlock>(this);
+          workers_[i] = absl::make_unique<NumaWorkerBlock>(this);
           TF_RETURN_IF_ERROR(
               workers_[i]->manager.Restore(ctx, reader, this, i));
         }
diff --git a/tensorflow/core/kernels/data/experimental/parallel_interleave_dataset_op.cc b/tensorflow/core/kernels/data/experimental/parallel_interleave_dataset_op.cc
index 0230f90aba1c849483da5f8d7297c44c8a1174de..f6d522078dda68d52bd0722613ecdcfdd314faf1 100644
--- a/tensorflow/core/kernels/data/experimental/parallel_interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/parallel_interleave_dataset_op.cc
@@ -26,7 +26,6 @@ limitations under the License.
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/random/random.h"
-#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -113,8 +112,8 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(new Iterator(
-          {this, strings::StrCat(prefix, "::ParallelInterleave")}));
+      return absl::make_unique<Iterator>(Iterator::Params{
+          this, strings::StrCat(prefix, "::ParallelInterleave")});
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -154,7 +153,13 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
       other_arguments.reserve(captured_func_->captured_inputs().size());
       for (const Tensor& t : captured_func_->captured_inputs()) {
         Node* node;
-        TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        DatasetBase* input;
+        Status s = GetDatasetFromVariantTensor(t, &input);
+        if (s.ok()) {
+          TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input, &node));
+        } else {
+          TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        }
         other_arguments.emplace_back(node);
         other_arguments_types.emplace_back(t.dtype());
       }
diff --git a/tensorflow/core/kernels/data/experimental/parse_example_dataset_op.cc b/tensorflow/core/kernels/data/experimental/parse_example_dataset_op.cc
index ea99a8b32c5a945f30945369ef2ed4f4b6725887..00574057344507fe158d36c210e61f15bf92845e 100644
--- a/tensorflow/core/kernels/data/experimental/parse_example_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/parse_example_dataset_op.cc
@@ -183,8 +183,8 @@ class ParseExampleDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      std::unique_ptr<ParallelMapFunctor> parse_example_functor(
-          new ParseExampleFunctor(this));
+      std::unique_ptr<ParallelMapFunctor> parse_example_functor =
+          absl::make_unique<ParseExampleFunctor>(this);
       return NewParallelMapIterator(
           {this, strings::StrCat(prefix, "::ParseExample")}, input_,
           std::move(parse_example_functor), num_parallel_calls_, sloppy_,
diff --git a/tensorflow/core/kernels/data/experimental/random_dataset_op.cc b/tensorflow/core/kernels/data/experimental/random_dataset_op.cc
index 6d85cd5c450640a0042add2ead26836433166ade..114bb6a856c90f559e2db48ba68b9d249ef75b2e 100644
--- a/tensorflow/core/kernels/data/experimental/random_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/random_dataset_op.cc
@@ -56,8 +56,8 @@ class RandomDatasetOp : public DatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::Random")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::Random")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/experimental/rebatch_dataset_op.cc b/tensorflow/core/kernels/data/experimental/rebatch_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0397ca01c4ee058bce8079c83c787a4f38f4f578
--- /dev/null
+++ b/tensorflow/core/kernels/data/experimental/rebatch_dataset_op.cc
@@ -0,0 +1,101 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/kernels/data/graph_rewrite_dataset.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+constexpr char kOptimizerName[] = "tf_data_rebatcher";
+
+class RebatchDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit RebatchDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx),
+        graph_def_version_(ctx->graph_def_version()) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+  }
+
+ protected:
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    int64 num_workers;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "num_workers", &num_workers));
+    OP_REQUIRES(
+        ctx, num_workers > 0,
+        errors::InvalidArgument("num_workers must be greater than zero."));
+
+    Dataset* dataset =
+        new Dataset(ctx, input, num_workers, output_types_, output_shapes_);
+    Status s = dataset->Optimize(ctx);
+    if (s.ok()) {
+      *output = dataset;
+    } else {
+      dataset->Unref();
+      OP_REQUIRES_OK(ctx, s);
+    }
+  }
+
+ private:
+  class Dataset : public GraphRewriteDataset {
+   public:
+    Dataset(OpKernelContext* ctx, const DatasetBase* input,
+            const int64 num_workers, const DataTypeVector& output_types,
+            const std::vector<PartialTensorShape>& output_shapes)
+        : GraphRewriteDataset(ctx, input, output_types, output_shapes),
+          num_workers_(num_workers) {}
+
+    string DebugString() const override { return "RebatchDatasetOp::Dataset"; }
+
+   private:
+    bool ShouldOptimizeFunctions() override {
+      // We only want to optimize functions for some particular datasets like
+      // FlatMapDataset, InterleaveDataset etc. So we disable generalized
+      // function optimization and explicitly handle function modifications
+      // for those datasets in the rewrite.
+      return false;
+    }
+
+    RewriterConfig CreateGrapplerRewriteConfig() override {
+      RewriterConfig rewriter_config;
+      rewriter_config.set_fail_on_optimizer_errors(true);
+      rewriter_config.add_optimizers(kOptimizerName);
+      rewriter_config.set_meta_optimizer_iterations(
+          RewriterConfig_NumIterationsType_ONE);
+      auto custom_optimizer = rewriter_config.add_custom_optimizers();
+      custom_optimizer->set_name(kOptimizerName);
+      AttrValue num_workers_attr;
+      num_workers_attr.set_i(num_workers_);
+      (*custom_optimizer->mutable_parameter_map())["num_workers"] =
+          num_workers_attr;
+      return rewriter_config;
+    }
+
+    const int64 num_workers_;
+  };
+
+  const int graph_def_version_;
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("ExperimentalRebatchDataset").Device(DEVICE_CPU),
+                        RebatchDatasetOp);
+
+}  // anonymous namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/experimental/scan_dataset_op.cc b/tensorflow/core/kernels/data/experimental/scan_dataset_op.cc
index 0d9a629a27f907fca2214a574db1ea0074a9ed2e..55e22c1cac6090574d52a4dd154feae2701c6dab 100644
--- a/tensorflow/core/kernels/data/experimental/scan_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/scan_dataset_op.cc
@@ -84,8 +84,8 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::Scan")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::Scan")});
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -119,7 +119,13 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
       other_arguments_types.reserve(captured_func_->captured_inputs().size());
       for (const Tensor& t : captured_func_->captured_inputs()) {
         Node* node;
-        TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        DatasetBase* input;
+        Status s = GetDatasetFromVariantTensor(t, &input);
+        if (s.ok()) {
+          TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input, &node));
+        } else {
+          TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        }
         other_arguments.emplace_back(node);
         other_arguments_types.emplace_back(t.dtype());
       }
@@ -180,6 +186,8 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
 
         Status s = instantiated_captured_func_->Run(ctx, std::move(args),
                                                     &state_and_output);
+        DCHECK(state_and_output.size() <=
+               dataset()->state_types_.size() + output_dtypes().size());
         if (s.ok()) {
           state_.clear();
           size_t i = 0;
diff --git a/tensorflow/core/kernels/data/experimental/set_stats_aggregator_dataset_op.cc b/tensorflow/core/kernels/data/experimental/set_stats_aggregator_dataset_op.cc
index fe128005faca9bd986e7c85600f7f871ebb97a25..67bb1e160b9b125c28f2fda0cdde30c12b957382 100644
--- a/tensorflow/core/kernels/data/experimental/set_stats_aggregator_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/set_stats_aggregator_dataset_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <memory>
+
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/stats_aggregator.h"
@@ -114,8 +115,8 @@ class SetStatsAggregatorDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(new Iterator(
-          {this, strings::StrCat(prefix, "::SetStatsAggregator")}));
+      return absl::make_unique<Iterator>(Iterator::Params{
+          this, strings::StrCat(prefix, "::SetStatsAggregator")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/experimental/sleep_dataset_op.cc b/tensorflow/core/kernels/data/experimental/sleep_dataset_op.cc
index d2fb8ac4f33b1e844bb39cc70a47ccb15424ace7..9d63690622d5a50b22ac48c85f56f90952ae2dcc 100644
--- a/tensorflow/core/kernels/data/experimental/sleep_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/sleep_dataset_op.cc
@@ -15,7 +15,6 @@ limitations under the License.
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -55,7 +54,7 @@ class SleepDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return MakeUnique<Iterator>(
+      return absl::make_unique<Iterator>(
           Iterator::Params{this, strings::StrCat(prefix, "::Sleep")});
     }
 
diff --git a/tensorflow/core/kernels/data/experimental/sliding_window_dataset_op.cc b/tensorflow/core/kernels/data/experimental/sliding_window_dataset_op.cc
index 1ce4fbd3136d7fbd245fbb920ff658c4eae794c6..c5851eaf86b654c62457759c7835d4f274c4b9ee 100644
--- a/tensorflow/core/kernels/data/experimental/sliding_window_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/sliding_window_dataset_op.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-
 #include <deque>
 #include <vector>
 
@@ -86,8 +85,8 @@ class SlidingWindowDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(new Iterator(
-          Iterator::Params{this, strings::StrCat(prefix, "::Slide")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::Slide")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/experimental/sql_dataset_op.cc b/tensorflow/core/kernels/data/experimental/sql_dataset_op.cc
index c16d8ed02ccdfb01a41ff9206a003f4a8c04a667..84f6fba36d197ffaa991ec7f381bdb15aebcd8d3 100644
--- a/tensorflow/core/kernels/data/experimental/sql_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/sql_dataset_op.cc
@@ -91,8 +91,8 @@ class SqlDatasetOp : public DatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::Sql")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::Sql")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/experimental/stats_aggregator_ops.cc b/tensorflow/core/kernels/data/experimental/stats_aggregator_ops.cc
index 894465e1814cf93b02ecbbb053494d4c032fe243..1d1b788b6c12b1f68cea494b1e269869bb57d648 100644
--- a/tensorflow/core/kernels/data/experimental/stats_aggregator_ops.cc
+++ b/tensorflow/core/kernels/data/experimental/stats_aggregator_ops.cc
@@ -108,8 +108,8 @@ class StatsAggregatorHandleOp
  private:
   Status CreateResource(StatsAggregatorResource** ret) override
       EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-    *ret = new StatsAggregatorResource(
-        std::unique_ptr<StatsAggregator>(new StatsAggregatorImpl));
+    *ret =
+        new StatsAggregatorResource(absl::make_unique<StatsAggregatorImpl>());
     return Status::OK();
   }
 
diff --git a/tensorflow/core/kernels/data/experimental/stats_dataset_ops.cc b/tensorflow/core/kernels/data/experimental/stats_dataset_ops.cc
index 1961f25df846e8773bf6b0266d089c9d3bac355b..be5fa4c789ba842952b01ecb256ffa57629d5afa 100644
--- a/tensorflow/core/kernels/data/experimental/stats_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/experimental/stats_dataset_ops.cc
@@ -12,9 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-
-#include "tensorflow/core/example/example.pb.h"
-#include "tensorflow/core/example/feature.pb.h"
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/stats_aggregator.h"
@@ -63,8 +60,8 @@ class LatencyStatsDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::LatencyStats")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::LatencyStats")});
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -173,8 +170,8 @@ class BytesProducedStatsDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(new Iterator(
-          {this, strings::StrCat(prefix, "::BytesProducedStats")}));
+      return absl::make_unique<Iterator>(Iterator::Params{
+          this, strings::StrCat(prefix, "::BytesProducedStats")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/experimental/take_while_dataset_op.cc b/tensorflow/core/kernels/data/experimental/take_while_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3a6f70e504ec09007ac21808b1747e299d2b150d
--- /dev/null
+++ b/tensorflow/core/kernels/data/experimental/take_while_dataset_op.cc
@@ -0,0 +1,250 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <iterator>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/captured_function.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+// See documentation in ../../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class TakeWhileDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  using LoopIteratorPredicate =
+      std::function<Status(IteratorContext*, InstantiatedCapturedFunction*,
+                           std::vector<Tensor>&, bool*)>;
+
+  explicit TakeWhileDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("predicate", &func_));
+  }
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    std::unique_ptr<CapturedFunction> captured_func;
+    OP_REQUIRES_OK(ctx, CapturedFunction::Create(func_, ctx, "other_arguments",
+                                                 &captured_func));
+
+    std::vector<int> indices;
+    OP_REQUIRES_OK(ctx, ComputeShortCircuitIndices(ctx, func_, &indices));
+    OP_REQUIRES(
+        ctx, indices.size() <= 1,
+        errors::InvalidArgument("`predicate` has more than one return value."));
+
+    LoopIteratorPredicate loop_pred;
+    if (indices.empty()) {
+      loop_pred = [](IteratorContext* ctx,
+                     InstantiatedCapturedFunction* inst_captured_func,
+                     const std::vector<Tensor>& args, bool* end_of_sequence) {
+        std::vector<Tensor> result;
+        TF_RETURN_IF_ERROR(
+            inst_captured_func->RunWithBorrowedArgs(ctx, args, &result));
+
+        if (result.size() != 1 || result[0].dtype() != DT_BOOL ||
+            result[0].NumElements() != 1) {
+          return errors::InvalidArgument(
+              "`predicate` must returns a scalar bool tensor.");
+        }
+        *end_of_sequence = !result[0].scalar<bool>()();
+        return Status::OK();
+      };
+    } else {
+      loop_pred = [indices](IteratorContext* ctx,
+                            InstantiatedCapturedFunction* inst_captured_func,
+                            const std::vector<Tensor>& args,
+                            bool* end_of_sequence) {
+        const Tensor& predicate = args[indices[0]];
+        if (predicate.dtype() != DT_BOOL || predicate.NumElements() != 1) {
+          return errors::InvalidArgument(
+              "`predicate` must returns a scalar bool tensor.");
+        }
+        *end_of_sequence = !predicate.scalar<bool>()();
+        return Status::OK();
+      };
+    }
+    *output = new Dataset(ctx, input, func_, std::move(captured_func),
+                          std::move(loop_pred));
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, const DatasetBase* input,
+            const NameAttrList& func,
+            std::unique_ptr<CapturedFunction> captured_func,
+            LoopIteratorPredicate loop_pred)
+        : DatasetBase(DatasetContext(ctx)),
+          input_(input),
+          func_(func),
+          captured_func_(std::move(captured_func)),
+          loop_pred_(std::move(loop_pred)) {
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
+        const string& prefix) const override {
+      return MakeUnique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::TakeWhile")},
+          loop_pred_);
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return input_->output_dtypes();
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return input_->output_shapes();
+    }
+
+    string DebugString() const override {
+      return "TakeWhileDatasetOp::Dataset";
+    }
+
+    int64 Cardinality() const override { return kUnknownCardinality; }
+
+   protected:
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      TF_RETURN_IF_ERROR(b->AddFunction(ctx, func_.name()));
+      Node* input_node;
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_node));
+
+      std::vector<Node*> other_arguments;
+      other_arguments.reserve(captured_func_->captured_inputs().size());
+      DataTypeVector other_arguments_types;
+      other_arguments_types.reserve(captured_func_->captured_inputs().size());
+      for (const Tensor& t : captured_func_->captured_inputs()) {
+        Node* node;
+        DatasetBase* input;
+        Status s = GetDatasetFromVariantTensor(t, &input);
+        if (s.ok()) {
+          TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input, &node));
+        } else {
+          TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        }
+        other_arguments.emplace_back(node);
+        other_arguments_types.emplace_back(t.dtype());
+      }
+      AttrValue f_attr;
+      b->BuildAttrValue(func_, &f_attr);
+
+      AttrValue other_arguments_types_attr;
+      b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
+
+      TF_RETURN_IF_ERROR(b->AddDataset(
+          this, {std::make_pair(0, input_node)},
+          {std::make_pair(1, other_arguments)},
+          {std::make_pair("predicate", f_attr),
+           std::make_pair("Targuments", other_arguments_types_attr)},
+          output));
+      return Status::OK();
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params, LoopIteratorPredicate loop_pred)
+          : DatasetIterator<Dataset>(params), loop_pred_(loop_pred) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        TF_RETURN_IF_ERROR(
+            dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
+        return dataset()->captured_func_->Instantiate(
+            ctx, &instantiated_captured_func_);
+      }
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        {
+          tf_shared_lock l(mu_);
+          if (!input_impl_) {
+            *end_of_sequence = true;
+            return Status::OK();
+          }
+          TF_RETURN_IF_ERROR(
+              input_impl_->GetNext(ctx, out_tensors, end_of_sequence));
+        }
+        if (*end_of_sequence) {
+          mutex_lock l(mu_);
+          input_impl_.reset();
+          return Status::OK();
+        }
+        return loop_pred_(ctx, instantiated_captured_func_.get(), *out_tensors,
+                          end_of_sequence);
+      }
+
+     protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args),
+                                         /*ratio=*/1);
+      }
+
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        if (input_impl_)
+          TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
+        else
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("input_impls_empty"), ""));
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        if (reader->Contains(full_name("input_impls_empty")))
+          input_impl_.reset();
+        else
+          TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
+        return Status::OK();
+      }
+
+     private:
+      mutex mu_;
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+      std::unique_ptr<InstantiatedCapturedFunction> instantiated_captured_func_;
+      const LoopIteratorPredicate loop_pred_;
+    };
+
+    const DatasetBase* const input_;
+    const NameAttrList func_;
+    const std::unique_ptr<CapturedFunction> captured_func_;
+    const LoopIteratorPredicate loop_pred_;
+  };
+
+  NameAttrList func_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("ExperimentalTakeWhileDataset").Device(DEVICE_CPU),
+                        TakeWhileDatasetOp);
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc b/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc
index 8ae45ed5c9d9fe199ef392a1430f359172ec5c73..e8fd051a8272c79619f3fba5bfaca7826bd486ea 100644
--- a/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc
@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-
 #include <memory>
+
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/util/ptr_util.h"
 #include "tensorflow/core/util/work_sharder.h"
 
 namespace tensorflow {
@@ -51,7 +50,7 @@ class ThreadPoolResource : public ResourceBase {
 
   int32 NumThreads() { return thread_pool_.NumThreads(); }
 
-  string DebugString() override { return "ThreadPoolResource"; }
+  string DebugString() const override { return "ThreadPoolResource"; }
 
  private:
   thread::ThreadPool thread_pool_;
@@ -99,8 +98,9 @@ class ThreadPoolHandleOp : public OpKernel {
                                   EXCLUSIVE_LOCKS_REQUIRED(mu_) {
                                     *ret = new ThreadPoolResource(
                                         ctx->env(), {}, display_name_,
-                                        num_threads_, max_intra_op_parallelism_,
-                                        false /* low_latency_hint */);
+                                        num_threads_,
+                                        /*low_latency_hint=*/false,
+                                        max_intra_op_parallelism_);
                                     return Status::OK();
                                   }));
       initialized_ = true;
@@ -154,8 +154,8 @@ class ThreadPoolDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::ThreadPool")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::ThreadPool")});
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -261,8 +261,8 @@ class MaxIntraOpParallelismDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(new Iterator(
-          {this, strings::StrCat(prefix, "::MaxIntraOpParallelism")}));
+      return absl::make_unique<Iterator>(Iterator::Params{
+          this, strings::StrCat(prefix, "::MaxIntraOpParallelism")});
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -362,7 +362,7 @@ class PrivateThreadPoolDatasetOp : public UnaryDatasetOpKernel {
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
           num_threads_(num_threads) {
-      thread_pool_ = MakeUnique<thread::ThreadPool>(
+      thread_pool_ = absl::make_unique<thread::ThreadPool>(
           ctx->env(), ThreadOptions{}, "data_private_threadpool", num_threads,
           /*low_latency_hint=*/false);
       input_->Ref();
@@ -372,8 +372,8 @@ class PrivateThreadPoolDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::PrivateThreadPool")}));
+      return absl::make_unique<Iterator>(Iterator::Params{
+          this, strings::StrCat(prefix, "::PrivateThreadPool")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/experimental/to_tf_record_op.cc b/tensorflow/core/kernels/data/experimental/to_tf_record_op.cc
index 7728baf1507c6cec2b44f41561f2ab3d04a80cc8..6cf6198432b68fe241e413b8472a2b69bbae314d 100644
--- a/tensorflow/core/kernels/data/experimental/to_tf_record_op.cc
+++ b/tensorflow/core/kernels/data/experimental/to_tf_record_op.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/function_handle_cache.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -59,18 +58,18 @@ class ToTFRecordOp : public AsyncOpKernel {
       std::unique_ptr<WritableFile> file;
       OP_REQUIRES_OK_ASYNC(ctx, ctx->env()->NewWritableFile(filename, &file),
                            done);
-      std::unique_ptr<io::RecordWriter> writer;
-      writer.reset(new io::RecordWriter(
-          file.get(), io::RecordWriterOptions::CreateRecordWriterOptions(
-                          compression_type)));
+      std::unique_ptr<io::RecordWriter> writer =
+          absl::make_unique<io::RecordWriter>(
+              file.get(), io::RecordWriterOptions::CreateRecordWriterOptions(
+                              compression_type));
 
       DatasetBase* dataset;
       OP_REQUIRES_OK_ASYNC(
           ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset), done);
       std::unique_ptr<IteratorBase> iterator;
       IteratorContext::Params params(ctx);
-      std::unique_ptr<FunctionHandleCache> function_handle_cache(
-          new FunctionHandleCache(params.lib));
+      std::unique_ptr<FunctionHandleCache> function_handle_cache =
+          absl::make_unique<FunctionHandleCache>(params.lib);
       params.function_handle_cache = function_handle_cache.get();
       IteratorContext iter_ctx(std::move(params));
 
diff --git a/tensorflow/core/kernels/data/experimental/unbatch_dataset_op.cc b/tensorflow/core/kernels/data/experimental/unbatch_dataset_op.cc
index 2626ec3ed7250b725650a76b8674e0a76ebc638f..cb26fd3e43d7c143cdc716a0ee5a4d172c98149f 100644
--- a/tensorflow/core/kernels/data/experimental/unbatch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/unbatch_dataset_op.cc
@@ -58,8 +58,8 @@ class UnbatchDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::Unbatch")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::Unbatch")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/experimental/unique_dataset_op.cc b/tensorflow/core/kernels/data/experimental/unique_dataset_op.cc
index 23dd9ff612db61829dcbae65eb3566131d032efc..57865c45fc078c663d4ea43c5ee9a6642bdc51b6 100644
--- a/tensorflow/core/kernels/data/experimental/unique_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/unique_dataset_op.cc
@@ -58,8 +58,8 @@ class UniqueDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::Unique")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::Unique")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/filter_by_component_dataset_op.cc b/tensorflow/core/kernels/data/filter_by_component_dataset_op.cc
index 784f9872860fee0f929dcf4c529c17fbb15e2bc6..3b9b319ea9442c024ec22fee601085b42614836d 100644
--- a/tensorflow/core/kernels/data/filter_by_component_dataset_op.cc
+++ b/tensorflow/core/kernels/data/filter_by_component_dataset_op.cc
@@ -64,8 +64,8 @@ class FilterByLastComponentDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<Iterator>(new Iterator(
-          {this, strings::StrCat(prefix, "::FilterByLastComponent")}));
+      return absl::make_unique<Iterator>(Iterator::Params{
+          this, strings::StrCat(prefix, "::FilterByLastComponent")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/filter_dataset_op.cc b/tensorflow/core/kernels/data/filter_dataset_op.cc
index b8b657d3433422731d10a00ae6498c2f802669dd..483d42c8092356ed9fedb70222c7dc96001874b4 100644
--- a/tensorflow/core/kernels/data/filter_dataset_op.cc
+++ b/tensorflow/core/kernels/data/filter_dataset_op.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -109,7 +108,7 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return MakeUnique<Iterator>(
+      return absl::make_unique<Iterator>(
           Iterator::Params{this, strings::StrCat(prefix, "::Filter")},
           filter_pred_);
     }
@@ -137,7 +136,13 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
       other_arguments.reserve(captured_func_->captured_inputs().size());
       for (const Tensor& t : captured_func_->captured_inputs()) {
         Node* node;
-        TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        DatasetBase* input;
+        Status s = GetDatasetFromVariantTensor(t, &input);
+        if (s.ok()) {
+          TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input, &node));
+        } else {
+          TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        }
         other_arguments.emplace_back(node);
         other_arguments_types.emplace_back(t.dtype());
       }
diff --git a/tensorflow/core/kernels/data/flat_map_dataset_op.cc b/tensorflow/core/kernels/data/flat_map_dataset_op.cc
index 3846334622bf48ecb5e62464f22c2fa3e7c4adc4..3f01ac556998750d02b299a76c8f81c60262f190 100644
--- a/tensorflow/core/kernels/data/flat_map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/flat_map_dataset_op.cc
@@ -67,8 +67,8 @@ class FlatMapDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::FlatMap")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::FlatMap")});
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -95,7 +95,13 @@ class FlatMapDatasetOp : public UnaryDatasetOpKernel {
       other_arguments.reserve(captured_func_->captured_inputs().size());
       for (const Tensor& t : captured_func_->captured_inputs()) {
         Node* node;
-        TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        DatasetBase* input;
+        Status s = GetDatasetFromVariantTensor(t, &input);
+        if (s.ok()) {
+          TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input, &node));
+        } else {
+          TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        }
         other_arguments.emplace_back(node);
         other_arguments_types.emplace_back(t.dtype());
       }
diff --git a/tensorflow/core/kernels/data/generator_dataset_op.cc b/tensorflow/core/kernels/data/generator_dataset_op.cc
index 48697ec6c8f05c438badedbc3234dbb1110c7088..3469743af63a4d9480de3ed9160c43a650b71410 100644
--- a/tensorflow/core/kernels/data/generator_dataset_op.cc
+++ b/tensorflow/core/kernels/data/generator_dataset_op.cc
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/core/kernels/data/generator_dataset_op.h"
+
 #include <iterator>
 #include <vector>
 
-#include "tensorflow/core/kernels/data/generator_dataset_op.h"
-
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/data/captured_function.h"
@@ -44,8 +44,8 @@ class GeneratorDatasetOp::Dataset : public DatasetBase {
 
   std::unique_ptr<IteratorBase> MakeIteratorInternal(
       const string& prefix) const override {
-    return std::unique_ptr<IteratorBase>(
-        new Iterator({this, strings::StrCat(prefix, "::Generator")}));
+    return absl::make_unique<Iterator>(
+        Iterator::Params{this, strings::StrCat(prefix, "::Generator")});
   }
 
   const DataTypeVector& output_dtypes() const override { return output_types_; }
@@ -71,7 +71,7 @@ class GeneratorDatasetOp::Dataset : public DatasetBase {
         : DatasetIterator<Dataset>(params) {}
 
     ~Iterator() override {
-      if (!finalized_) {
+      if (!finalized_ && initialized_) {
         std::vector<Tensor> ignored;
         Status s =
             instantiated_finalize_func_->RunInstantiated(state_, &ignored);
diff --git a/tensorflow/core/kernels/data/graph_rewrite_dataset.cc b/tensorflow/core/kernels/data/graph_rewrite_dataset.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cd8026607e73ccf42d43ddb9a1d595778879478c
--- /dev/null
+++ b/tensorflow/core/kernels/data/graph_rewrite_dataset.cc
@@ -0,0 +1,250 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/kernels/data/graph_rewrite_dataset.h"
+
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
+#include "tensorflow/core/protobuf/meta_graph.pb.h"
+#include "tensorflow/core/protobuf/rewriter_config.pb.h"
+
+namespace tensorflow {
+namespace data {
+
+GraphRewriteDataset::~GraphRewriteDataset() {
+  input_->Unref();
+  if (optimized_input_) {
+    optimized_input_->Unref();
+  }
+}
+
+Status GraphRewriteDataset::Optimize(OpKernelContext* ctx) {
+  GraphDefBuilder b;
+  DatasetGraphDefBuilder db(&b);
+  Node* input_node = nullptr;
+  SerializationContext::Params params;
+  std::vector<std::pair<string, Tensor>> input_list;
+  params.flib_def = ctx->function_library()->GetFunctionLibraryDefinition();
+  params.input_list = &input_list;
+  params.optimization_only = true;
+  SerializationContext serialization_ctx(params);
+  TF_RETURN_IF_ERROR(
+      db.AddInputDataset(&serialization_ctx, input_, &input_node));
+  string output_node = input_node->name();
+
+  GraphDef graph_def;
+  TF_RETURN_IF_ERROR(b.ToGraphDef(&graph_def));
+  VLOG(3) << "Before optimization: " << graph_def.DebugString();
+
+  TF_RETURN_IF_ERROR(ApplyOptimizations(ctx, &graph_def, &output_node));
+  VLOG(3) << "After optimization: " << graph_def.DebugString();
+
+  // Instantiate the optimized input pipeline by running the optimized graph
+  // using the optimized function library.
+  TF_RETURN_IF_ERROR(ctx->function_library()->Clone(&flib_def_, &pflr_, &lib_));
+
+  // Create a FunctionHandleCache.
+  function_handle_cache_ = absl::make_unique<FunctionHandleCache>(lib_);
+
+  // Some functions may have been modified without having their names
+  // changed (for example, nested dataset graphs from FlatMap or
+  // Interleave).
+  TF_RETURN_IF_ERROR(
+      AddToFunctionLibrary(flib_def_.get(), graph_def.library()));
+
+  Graph graph(OpRegistry::Global());
+  TF_RETURN_IF_ERROR(ImportGraphDef({}, graph_def, &graph, nullptr));
+  std::vector<Tensor> outputs;
+  GraphRunner graph_runner(ctx->function_library()->device());
+
+  TF_RETURN_IF_ERROR(
+      graph_runner.Run(&graph, lib_, input_list, {output_node}, &outputs));
+  TF_RETURN_IF_ERROR(
+      GetDatasetFromVariantTensor(outputs[0], &optimized_input_));
+  optimized_input_->Ref();
+  return Status::OK();
+}
+
+Status GraphRewriteDataset::AsGraphDefInternal(SerializationContext* ctx,
+                                               DatasetGraphDefBuilder* b,
+                                               Node** output) const {
+  SerializationContext::Params params;
+  // The optimized input needs access to the newly optimized functions when
+  // it is serialized. Here, we use the optimized function library for
+  // serialization, which is the union of the function library from the
+  // OpKernelContext at dataset creation time and newly optimized functions.
+  // This includes all functions that optimized_input_ may use.
+  params.flib_def = flib_def_.get();
+  params.input_list = ctx->input_list();
+  params.optimization_only = ctx->optimization_only();
+  SerializationContext optimized_ctx(params);
+
+  // We only serialize the optimized dataset to avoid re-running
+  // optimizations when the input pipeline is restored from a checkpoint.
+  TF_RETURN_IF_ERROR(
+      b->AddInputDataset(&optimized_ctx, optimized_input_, output));
+  return Status::OK();
+}
+
+namespace {
+void AddFakeSinks(FunctionDef* function_def) {
+  int counter = 0;
+  for (const auto& output : function_def->signature().output_arg()) {
+    NodeDef* node = function_def->add_node_def();
+    tensorflow::grappler::function_utils::SetUniqueFunctionNodeName(
+        strings::StrCat("FakeSink", counter++), function_def, node);
+    node->set_op("Identity");
+    node->add_input(function_def->ret().at(output.name()));
+    (*node->mutable_attr())["T"].set_type(output.type());
+
+    (*function_def->mutable_ret())[output.name()] =
+        strings::StrCat(node->name(), ":output:0");
+  }
+}
+
+void RemoveFakeSinks(FunctionDef* function_def) {
+  // Map from identity node names to their input tensor strings
+  std::map<string, string> identity_map;
+  for (const auto& node : function_def->node_def()) {
+    if (node.op() == "Identity" && node.input_size() == 1) {
+      identity_map[node.name()] = node.input(0);
+    }
+  }
+  for (const auto& output_arg : function_def->signature().output_arg()) {
+    const string& tensor = function_def->ret().at(output_arg.name());
+    const string& output_node = tensor.substr(0, tensor.find(':'));
+    if (identity_map.find(output_node) != identity_map.end()) {
+      (*function_def->mutable_ret())[output_arg.name()] =
+          identity_map.at(output_node);
+    }
+  }
+}
+}  // anonymous namespace
+
+Status GraphRewriteDataset::ApplyOptimizations(OpKernelContext* ctx,
+                                               GraphDef* graph_def,
+                                               string* output_node) {
+  // Add an identity node as the fetch node, otherwise we might get
+  // 'placeholder is both fed and fetched' errors in some cases when using
+  // input list with placeholder dataset nodes.
+  NodeDef* node = graph_def->mutable_node()->Add();
+  tensorflow::grappler::graph_utils::SetUniqueGraphNodeName("Sink", graph_def,
+                                                            node);
+  node->set_op("Identity");
+  node->add_input(*output_node);
+  (*node->mutable_attr())["T"].set_type(DT_VARIANT);
+  *output_node = node->name();
+
+  // Add fake sink node to graph and functions to allow rewriting the actual
+  // sink nodes.
+  // TODO(b/118820916): When MetaOptimizer adds provisions for function
+  // retvals to be optimizable, we will no longer need this.
+  for (auto& function_def : *graph_def->mutable_library()->mutable_function()) {
+    AddFakeSinks(&function_def);
+  }
+
+  // Create metagraph.
+  MetaGraphDef meta_graph_def;
+  (*meta_graph_def.mutable_graph_def()) = *graph_def;
+
+  // Grappler determines fetch ops from collection 'train_op'.
+  CollectionDef collection_def;
+  auto node_list = collection_def.mutable_node_list();
+  node_list->add_value(*output_node);
+  (*meta_graph_def.mutable_collection_def())["train_op"] = collection_def;
+
+  // Create Grappler item.
+  tensorflow::grappler::ItemConfig item_config;
+  item_config.apply_optimizations = true;
+  std::unique_ptr<tensorflow::grappler::GrapplerItem> grappler_item =
+      tensorflow::grappler::GrapplerItemFromMetaGraphDef(
+          "graph", meta_graph_def, item_config);
+  grappler_item->optimization_options().optimize_function_library =
+      ShouldOptimizeFunctions();
+  std::unordered_map<string, tensorflow::DeviceProperties> device_map;
+  tensorflow::grappler::VirtualCluster cluster(device_map);
+
+  // Run data optimizer using grappler's meta optimizer.
+  tensorflow::ConfigProto config;
+  *config.mutable_graph_options()->mutable_rewrite_options() =
+      CreateGrapplerRewriteConfig();
+  TF_RETURN_IF_ERROR(tensorflow::grappler::RunMetaOptimizer(
+      *grappler_item, config, ctx->device(), &cluster, graph_def));
+
+  // Remove fake sinks after optimizations are done.
+  // TODO(b/118820916): When MetaOptimizer adds provisions for function
+  // retvals to be optimizable, we will no longer need this.
+  for (auto& function_def : *graph_def->mutable_library()->mutable_function()) {
+    RemoveFakeSinks(&function_def);
+  }
+
+  return Status::OK();
+}
+
+class GraphRewriteDataset::Iterator
+    : public DatasetIterator<GraphRewriteDataset> {
+ public:
+  explicit Iterator(const Params& params)
+      : DatasetIterator<GraphRewriteDataset>(params) {}
+
+  Status Initialize(IteratorContext* ctx) override {
+    IteratorContext::Params params(ctx);
+    params.lib = dataset()->lib_;
+    params.function_handle_cache = dataset()->function_handle_cache_.get();
+    return dataset()->optimized_input_->MakeIterator(
+        IteratorContext(std::move(params)), prefix(), &input_impl_);
+  }
+
+  Status GetNextInternal(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                         bool* end_of_sequence) override {
+    IteratorContext::Params params(ctx);
+    params.lib = dataset()->lib_;
+    params.function_handle_cache = dataset()->function_handle_cache_.get();
+    return input_impl_->GetNext(IteratorContext(std::move(params)), out_tensors,
+                                end_of_sequence);
+  }
+
+ protected:
+  std::shared_ptr<model::Node> CreateNode(
+      IteratorContext* ctx, model::Node::Args args) const override {
+    return model::MakeKnownRatioNode(std::move(args),
+                                     /*ratio=*/1);
+  }
+
+  Status SaveInternal(IteratorStateWriter* writer) override {
+    TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
+    return Status::OK();
+  }
+
+  Status RestoreInternal(IteratorContext* ctx,
+                         IteratorStateReader* reader) override {
+    TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
+    return Status::OK();
+  }
+
+ private:
+  std::unique_ptr<IteratorBase> input_impl_;
+};
+
+std::unique_ptr<IteratorBase> GraphRewriteDataset::MakeIteratorInternal(
+    const string& prefix) const {
+  // We do not add a token for this dataset to the prefix. The
+  // prefix is used to identify checkpoint elements and since this
+  // dataset is excluded from the checkpoint, adding a token
+  // here would result in invalid checkpoint identifiers.
+  return absl::make_unique<Iterator>(Iterator::Params{this, prefix});
+}
+
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/graph_rewrite_dataset.h b/tensorflow/core/kernels/data/graph_rewrite_dataset.h
new file mode 100644
index 0000000000000000000000000000000000000000..856fcd3ea727f1223a06783b78af0efc41935516
--- /dev/null
+++ b/tensorflow/core/kernels/data/graph_rewrite_dataset.h
@@ -0,0 +1,95 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_GRAPH_REWRITE_DATASET_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_GRAPH_REWRITE_DATASET_H_
+
+#include "tensorflow/core/common_runtime/graph_runner.h"
+#include "tensorflow/core/common_runtime/process_function_library_runtime.h"
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/function_handle_cache.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/grappler/clusters/virtual_cluster.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/grappler_item_builder.h"
+#include "tensorflow/core/grappler/optimizers/data/function_utils.h"
+#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
+#include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
+
+namespace tensorflow {
+namespace data {
+
+class GraphRewriteDataset : public DatasetBase {
+ public:
+  GraphRewriteDataset(OpKernelContext* ctx, const DatasetBase* input,
+                      const DataTypeVector& output_types,
+                      const std::vector<PartialTensorShape>& output_shapes)
+      : DatasetBase(DatasetContext(ctx)),
+        optimized_input_(nullptr),
+        input_(input),
+        output_types_(output_types),
+        output_shapes_(output_shapes) {
+    input_->Ref();
+  }
+
+  ~GraphRewriteDataset() override;
+
+  // Runs Grappler to transform the input dataset into optimized_input_
+  // dataset.
+  Status Optimize(OpKernelContext* ctx);
+
+  std::unique_ptr<IteratorBase> MakeIteratorInternal(
+      const string& prefix) const override;
+
+  const DataTypeVector& output_dtypes() const override { return output_types_; }
+
+  const std::vector<PartialTensorShape>& output_shapes() const override {
+    return output_shapes_;
+  }
+
+  int64 Cardinality() const override { return input_->Cardinality(); }
+
+ protected:
+  Status AsGraphDefInternal(SerializationContext* ctx,
+                            DatasetGraphDefBuilder* b,
+                            Node** output) const override;
+
+ private:
+  class Iterator;
+
+  // Create a Grappler RewriteConfig proto that defines the list of
+  // optimizations to be run by the Grappler Meta Optimizer.
+  virtual RewriterConfig CreateGrapplerRewriteConfig() = 0;
+
+  // Option specifying whether we want to optimize the function library as well.
+  virtual bool ShouldOptimizeFunctions() { return true; }
+
+  Status ApplyOptimizations(OpKernelContext* ctx, GraphDef* graph_def,
+                            string* output_node);
+
+  DatasetBase* optimized_input_;
+  FunctionLibraryRuntime* lib_ = nullptr;
+  std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_ = nullptr;
+  std::unique_ptr<FunctionLibraryDefinition> flib_def_ = nullptr;
+  std::unique_ptr<FunctionHandleCache> function_handle_cache_ = nullptr;
+  const DatasetBase* input_;
+  const DataTypeVector output_types_;
+  const std::vector<PartialTensorShape> output_shapes_;
+};
+
+}  // namespace data
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_GRAPH_REWRITE_DATASET_H_
diff --git a/tensorflow/core/kernels/data/interleave_dataset_op.cc b/tensorflow/core/kernels/data/interleave_dataset_op.cc
index 54e3645612cd3905f1338fe59ab8caf0ca8941eb..69310bcff23d56414c5f689339b528b326429c9f 100644
--- a/tensorflow/core/kernels/data/interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/data/interleave_dataset_op.cc
@@ -89,8 +89,8 @@ class InterleaveDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::Interleave")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::Interleave")});
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -121,7 +121,13 @@ class InterleaveDatasetOp : public UnaryDatasetOpKernel {
       other_arguments.reserve(captured_func_->captured_inputs().size());
       for (const Tensor& t : captured_func_->captured_inputs()) {
         Node* node;
-        TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        DatasetBase* input;
+        Status s = GetDatasetFromVariantTensor(t, &input);
+        if (s.ok()) {
+          TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input, &node));
+        } else {
+          TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        }
         other_arguments.emplace_back(node);
         other_arguments_types.emplace_back(t.dtype());
       }
diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc
index d5b4bfa5c5e23cc6948f680ba7f49c23447464a5..7e23ca58ce7bdf65ab66675b42e2d840ab1702c9 100644
--- a/tensorflow/core/kernels/data/iterator_ops.cc
+++ b/tensorflow/core/kernels/data/iterator_ops.cc
@@ -20,7 +20,6 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/threadpool_device.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/function_handle_cache.h"
-#include "tensorflow/core/framework/iterator.pb.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/resource_op_kernel.h"
 #include "tensorflow/core/framework/stats_aggregator.h"
@@ -60,8 +59,8 @@ class IteratorResource : public ResourceBase {
                    std::unique_ptr<ProcessFunctionLibraryRuntime> pflr,
                    FunctionLibraryRuntime* lib)
       : device_mgr_(std::move(device_mgr)),
-        iterator_state_(
-            new State(std::move(flib_def), std::move(pflr), lib, nullptr)),
+        iterator_state_(std::make_shared<State>(
+            std::move(flib_def), std::move(pflr), lib, nullptr /* iterator */)),
         output_dtypes_(output_dtypes),
         output_shapes_(output_shapes) {}
 
@@ -100,7 +99,17 @@ class IteratorResource : public ResourceBase {
       captured_state = iterator_state_;
     }
     if (captured_state) {
-      return captured_state->iterator->Save(ctx, writer);
+      SerializationContext::Params params;
+      // The iterator state may contain functions that are not present
+      // in ctx's function library. Namely, an iterator may be restored from
+      // a serialized iterator with a modified function library (for example, as
+      // a result of OptimizeDataset). These modified functions are needed
+      // to serialize the iterator again.
+      params.flib_def = captured_state->flib_def.get();
+      params.input_list = ctx->input_list();
+      params.optimization_only = ctx->optimization_only();
+      SerializationContext ctx_with_functions(params);
+      return captured_state->iterator->Save(&ctx_with_functions, writer);
     } else {
       return errors::FailedPrecondition(
           "Save() failed because the iterator has not been initialized. "
@@ -135,9 +144,16 @@ class IteratorResource : public ResourceBase {
     std::unique_ptr<FunctionLibraryDefinition> flib_def(nullptr);
     std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(nullptr);
     TF_RETURN_IF_ERROR(ctx->function_library()->Clone(&flib_def, &pflr, &lib));
-    TF_RETURN_IF_ERROR(flib_def->AddLibrary(graph_def.library()));
-    std::unique_ptr<State> new_state(new State(
-        std::move(flib_def), std::move(pflr), lib, nullptr /* iterator */));
+
+    // Some function names may be duplicated (for example, if the serialized
+    // graph has an optimized function that retains its original name). We
+    // override functions in flib_def in the event of conflict. It is
+    // safe to assume that any node in the serialized graph is referring to the
+    // serialized function when there is a conflict.
+    TF_RETURN_IF_ERROR(
+        AddToFunctionLibrary(flib_def.get(), graph_def.library()));
+    std::unique_ptr<State> new_state = absl::make_unique<State>(
+        std::move(flib_def), std::move(pflr), lib, nullptr /* iterator */);
 
     TF_RETURN_IF_ERROR(
         graph_runner.Run(&graph, new_state->lib, {}, {output_node}, &outputs));
@@ -181,10 +197,10 @@ class IteratorResource : public ResourceBase {
     std::shared_ptr<State> new_state;
     {
       tf_shared_lock l(mu_);
-      new_state.reset(new State(iterator_state_->flib_def,
-                                iterator_state_->pflr, iterator_state_->lib,
-                                nullptr /* function_handle_cache */,
-                                nullptr /* iterator */));
+      new_state = std::make_shared<State>(
+          iterator_state_->flib_def, iterator_state_->pflr,
+          iterator_state_->lib, nullptr /* function_handle_cache */,
+          nullptr /* iterator */);
     }
 
     // Ensure that the iterator has access to all functions in the current
@@ -209,8 +225,8 @@ class IteratorResource : public ResourceBase {
       new_state->lib = lib;
     }
 
-    new_state->function_handle_cache.reset(
-        new FunctionHandleCache(new_state->lib));
+    new_state->function_handle_cache =
+        absl::make_unique<FunctionHandleCache>(new_state->lib);
     // Create new iterator.
     std::unique_ptr<IteratorBase> iterator;
     IteratorContext::Params params(ctx);
@@ -230,7 +246,7 @@ class IteratorResource : public ResourceBase {
     return Status::OK();
   }
 
-  string DebugString() override { return "Iterator resource"; }
+  string DebugString() const override { return "Iterator resource"; }
 
   const DataTypeVector& output_dtypes() const { return output_dtypes_; }
 
@@ -277,124 +293,6 @@ class IteratorResource : public ResourceBase {
 
 namespace {
 
-// Helper class for reading data from a VariantTensorData object.
-class VariantTensorDataReader : public IteratorStateReader {
- public:
-  explicit VariantTensorDataReader(const VariantTensorData* data)
-      : data_(data) {
-    PreProcess();
-  }
-
-  // Returns OK iff the initialization was successful, i.e.,
-  // pre-processing did not have errors.
-  Status status() const { return status_; }
-
-  Status ReadScalar(StringPiece key, int64* val) override {
-    return ReadScalarInternal(key, val);
-  }
-
-  Status ReadScalar(StringPiece key, string* val) override {
-    return ReadScalarInternal(key, val);
-  }
-
-  Status ReadTensor(StringPiece key, Tensor* val) override {
-    return ReadTensorInternal(key, val);
-  }
-
-  bool Contains(StringPiece key) override {
-    return map_.find(string(key)) != map_.end();
-  }
-
- private:
-  void PreProcess() {
-    string metadata;
-    data_->get_metadata(&metadata);
-    IteratorStateMetadata proto;
-    if (!proto.ParseFromString(metadata)) {
-      status_ = errors::Internal("Error parsing IteratorStateMetadata.");
-      return;
-    }
-    size_t num_entries = proto.keys_size();
-    CHECK_EQ(num_entries, data_->tensors_size());
-    for (size_t i = 0; i < num_entries; i++) {
-      map_[proto.keys(i)] = i;
-    }
-  }
-
-  template <typename T>
-  Status ReadScalarInternal(StringPiece key, T* val) {
-    if (map_.find(string(key)) == map_.end()) {
-      return errors::NotFound(key);
-    }
-    *val = data_->tensors(map_[string(key)]).scalar<T>()();
-    return Status::OK();
-  }
-
-  Status ReadTensorInternal(StringPiece key, Tensor* val) {
-    if (map_.find(string(key)) == map_.end()) {
-      return errors::NotFound(key);
-    }
-    *val = data_->tensors(map_[string(key)]);
-    return Status::OK();
-  }
-
-  std::map<string, size_t> map_;
-  const VariantTensorData* data_;  // Not owned.
-  Status status_;
-};
-
-// Helper class for writing data to a VariantTensorData object.
-class VariantTensorDataWriter : public IteratorStateWriter {
- public:
-  // Does not take ownership of data.
-  explicit VariantTensorDataWriter(VariantTensorData* data) : data_(data) {}
-
-  Status WriteScalar(StringPiece key, const int64 val) override {
-    return WriteScalarInternal(key, val);
-  }
-
-  Status WriteScalar(StringPiece key, const string& val) override {
-    return WriteScalarInternal(key, val);
-  }
-
-  Status WriteTensor(StringPiece key, const Tensor& val) override {
-    return WriteTensorInternal(key, val);
-  }
-
-  // Writes the metadata to `data_`.
-  Status Flush() {
-    string metadata;
-    if (!metadata_proto_.SerializeToString(&metadata)) {
-      return errors::Internal("Unable to serialize IteratorStateMetadata.");
-    }
-    data_->set_metadata(metadata);
-    return Status::OK();
-  }
-
- private:
-  template <typename T>
-  Status WriteScalarInternal(StringPiece key, const T& val) {
-    Tensor val_t = Tensor(DataTypeToEnum<T>::v(), TensorShape({}));
-    val_t.scalar<T>()() = val;
-    return WriteTensorInternal(key, val_t);
-  }
-
-  Status WriteTensorInternal(StringPiece key, const Tensor& val) {
-    // Write key to the metadata proto. This gets written to `data_`
-    // when `Flush()` is called. We do this lazily to avoid multiple
-    // serialization calls.
-    metadata_proto_.add_keys(string(key));
-
-    // Update tensors.
-    *(data_->add_tensors()) = val;
-    return Status::OK();
-  }
-
-  VariantTensorData* data_;
-  // TODO(srbs): Set the version string.
-  IteratorStateMetadata metadata_proto_;
-};
-
 // Wrapper for encoding/decoding the iterator state stored in a Variant tensor.
 // The get() method returns an IteratorStateReader which can be used
 // to restore iterator state.
@@ -433,7 +331,7 @@ class IteratorStateVariant {
     SerializationContext::Params params;
     params.flib_def = ctx->function_library()->GetFunctionLibraryDefinition();
     SerializationContext serialization_ctx(params);
-    data_.reset(new VariantTensorData());
+    data_ = absl::make_unique<VariantTensorData>();
     data_->set_type_name(TypeName());
     VariantTensorDataWriter writer(data_.get());
     TF_RETURN_IF_ERROR(iterator_resource->Save(&serialization_ctx, &writer));
@@ -446,25 +344,20 @@ class IteratorStateVariant {
     if (data.type_name() != TypeName()) {
       return false;
     }
-    std::unique_ptr<VariantTensorData> tensor_data(new VariantTensorData);
+    std::unique_ptr<VariantTensorData> tensor_data =
+        absl::make_unique<VariantTensorData>();
     std::swap(*tensor_data, data);
-    std::unique_ptr<VariantTensorDataReader> reader(
-        new VariantTensorDataReader(tensor_data.get()));
-    status_ = reader->status();
-    if (!status_.ok()) {
-      return false;
-    }
+    std::unique_ptr<VariantTensorDataReader> reader =
+        absl::make_unique<VariantTensorDataReader>(tensor_data.get());
     data_ = std::move(tensor_data);
     reader_ = std::move(reader);
     return true;
   }
   IteratorStateReader* get() { return reader_.get(); }
-  Status status() const { return status_; }
   string DebugString() const {
     if (data_) {
-      return strings::StrCat("IteratorStateVariant<",
-                             "data: ", data_->DebugString(),
-                             " status: ", status_.ToString(), ">");
+      return strings::StrCat("IteratorStateVariant<", data_->DebugString(),
+                             ">");
     } else {
       return strings::StrCat("IteratorStateVariant<empty>");
     }
@@ -472,7 +365,6 @@ class IteratorStateVariant {
 
  private:
   std::unique_ptr<IteratorStateReader> reader_;
-  Status status_;
   std::unique_ptr<VariantTensorData> data_;
 };
 
@@ -583,12 +475,12 @@ FunctionLibraryRuntime* IteratorHandleOp::CreatePrivateFLR(
   *device_mgr = absl::make_unique<DeviceMgr>(RenamedDevice::NewRenamedDevice(
       ctx->device()->name(), down_cast<Device*>(ctx->device()),
       false /* owns_underlying */, false /* isolate_session_state */));
-  flib_def->reset(new FunctionLibraryDefinition(
-      *ctx->function_library()->GetFunctionLibraryDefinition()));
-  pflr->reset(new ProcessFunctionLibraryRuntime(
+  *flib_def = absl::make_unique<FunctionLibraryDefinition>(
+      *ctx->function_library()->GetFunctionLibraryDefinition());
+  *pflr = absl::make_unique<ProcessFunctionLibraryRuntime>(
       device_mgr->get(), ctx->env(), graph_def_version_, flib_def->get(),
-      {} /* TODO(mrry): OptimizerOptions? */,
-      nullptr /* TODO(mrry): ClusterFLR */));
+      OptimizerOptions{} /* TODO(mrry): OptimizerOptions? */,
+      nullptr /* TODO(mrry): ClusterFLR */);
 
   return (*pflr)->GetFLR(ctx->device()->name());
 }
@@ -676,9 +568,12 @@ class ToSingleElementOp : public AsyncOpKernel {
           ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset), done);
       std::unique_ptr<IteratorBase> iterator;
       IteratorContext::Params params(ctx);
-      std::unique_ptr<FunctionHandleCache> function_handle_cache(
-          new FunctionHandleCache(params.lib));
+      std::unique_ptr<FunctionHandleCache> function_handle_cache =
+          absl::make_unique<FunctionHandleCache>(params.lib);
       params.function_handle_cache = function_handle_cache.get();
+      std::unique_ptr<ResourceMgr> resource_mgr =
+          absl::make_unique<ResourceMgr>();
+      params.resource_mgr = resource_mgr.get();
       IteratorContext iter_ctx(std::move(params));
 
       OP_REQUIRES_OK_ASYNC(
@@ -689,7 +584,7 @@ class ToSingleElementOp : public AsyncOpKernel {
       // NOTE(jsimsa): We must destroy the iterator before calling `done()`, to
       // avoid destruction races.
       IteratorBase* raw_iterator = iterator.release();
-      auto cleanup = gtl::MakeCleanup([ctx, raw_iterator, done] {
+      auto cleanup = gtl::MakeCleanup([raw_iterator, done] {
         delete raw_iterator;
         done();
       });
@@ -764,9 +659,12 @@ class ReduceDatasetOp : public AsyncOpKernel {
           done);
 
       IteratorContext::Params params(ctx);
-      std::unique_ptr<FunctionHandleCache> function_handle_cache(
-          new FunctionHandleCache(params.lib));
+      std::unique_ptr<FunctionHandleCache> function_handle_cache =
+          absl::make_unique<FunctionHandleCache>(params.lib);
       params.function_handle_cache = function_handle_cache.get();
+      std::unique_ptr<ResourceMgr> resource_mgr =
+          absl::make_unique<ResourceMgr>();
+      params.resource_mgr = resource_mgr.get();
       IteratorContext iter_ctx(std::move(params));
       std::unique_ptr<InstantiatedCapturedFunction> instantiated_captured_func;
       OP_REQUIRES_OK_ASYNC(
@@ -1086,78 +984,58 @@ void IteratorGetNextSyncOp::Compute(OpKernelContext* ctx) {
   }
 }
 
-namespace {
+void IteratorGetNextAsOptionalOp::ComputeAsync(OpKernelContext* ctx,
+                                               DoneCallback done) {
+  IteratorResource* iterator;
+  OP_REQUIRES_OK_ASYNC(
+      ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &iterator), done);
+  // The call to `iterator->GetNext()` may block and depend on an
+  // inter-op thread pool thread, so we issue the call from the
+  // owned thread pool.
+  background_worker_.Schedule(std::bind(
+      [this, ctx, iterator](DoneCallback done) {
+        std::vector<Tensor> components;
+        bool end_of_sequence = false;
 
-class IteratorGetNextAsOptionalOp : public AsyncOpKernel {
- public:
-  explicit IteratorGetNextAsOptionalOp(OpKernelConstruction* ctx)
-      : AsyncOpKernel(ctx),
-        background_worker_(ctx->env(),
-                           "tf_data_iterator_get_next_as_optional") {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
-  }
+        Status s = iterator->GetNext(IteratorContext(ctx), &components,
+                                     &end_of_sequence);
+        // NOTE(mrry): We must unref the iterator before calling `done()`, to
+        // avoid destruction races.
+        iterator->Unref();
 
-  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
-    IteratorResource* iterator;
-    OP_REQUIRES_OK_ASYNC(
-        ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &iterator), done);
-    // The call to `iterator->GetNext()` may block and depend on an
-    // inter-op thread pool thread, so we issue the call from the
-    // owned thread pool.
-    background_worker_.Schedule(std::bind(
-        [this, ctx, iterator](DoneCallback done) {
-          std::vector<Tensor> components;
-          bool end_of_sequence = false;
-
-          Status s = iterator->GetNext(IteratorContext(ctx), &components,
-                                       &end_of_sequence);
-          // NOTE(mrry): We must unref the iterator before calling `done()`, to
-          // avoid destruction races.
-          iterator->Unref();
-
-          if (!s.ok()) {
-            ctx->SetStatus(s);
-          } else if (end_of_sequence) {
-            OP_REQUIRES_OK_ASYNC(ctx, WriteOptionalNoneToOutput(ctx, 0), done);
-          } else {
-            for (int i = 0; i < components.size(); ++i) {
-              OP_REQUIRES_ASYNC(
-                  ctx, components[i].dtype() == output_types_[i],
-                  errors::InvalidArgument(
-                      "The given optional does not match the expected type for "
-                      "component ",
-                      i, ". Expected: ", DataTypeString(output_types_[i]),
-                      ". Actual: ", DataTypeString(components[i].dtype()), "."),
-                  done);
-              OP_REQUIRES_ASYNC(
-                  ctx,
-                  output_shapes_[i].IsCompatibleWith(components[i].shape()),
-                  errors::InvalidArgument(
-                      "The given optional does not match the expected shape "
-                      "for component ",
-                      i, ". Expected: ", output_shapes_[i].DebugString(),
-                      ". Actual: ", components[i].shape().DebugString(), "."),
-                  done);
-            }
-
-            OP_REQUIRES_OK_ASYNC(
-                ctx,
-                WriteOptionalWithValueToOutput(ctx, 0, std::move(components)),
+        if (!s.ok()) {
+          ctx->SetStatus(s);
+        } else if (end_of_sequence) {
+          OP_REQUIRES_OK_ASYNC(ctx, WriteOptionalNoneToOutput(ctx, 0), done);
+        } else {
+          for (int i = 0; i < components.size(); ++i) {
+            OP_REQUIRES_ASYNC(
+                ctx, components[i].dtype() == output_types_[i],
+                errors::InvalidArgument(
+                    "The given optional does not match the expected type for "
+                    "component ",
+                    i, ". Expected: ", DataTypeString(output_types_[i]),
+                    ". Actual: ", DataTypeString(components[i].dtype()), "."),
+                done);
+            OP_REQUIRES_ASYNC(
+                ctx, output_shapes_[i].IsCompatibleWith(components[i].shape()),
+                errors::InvalidArgument(
+                    "The given optional does not match the expected shape "
+                    "for component ",
+                    i, ". Expected: ", output_shapes_[i].DebugString(),
+                    ". Actual: ", components[i].shape().DebugString(), "."),
                 done);
           }
-          done();
-        },
-        std::move(done)));
-  }
-
- private:
-  BackgroundWorker background_worker_;
-  DataTypeVector output_types_;
-  std::vector<PartialTensorShape> output_shapes_;
-};
 
-}  // namespace
+          OP_REQUIRES_OK_ASYNC(
+              ctx,
+              WriteOptionalWithValueToOutput(ctx, 0, std::move(components)),
+              done);
+        }
+        done();
+      },
+      std::move(done)));
+}
 
 void IteratorToStringHandleOp::Compute(OpKernelContext* ctx) {
   const Tensor& resource_handle_t = ctx->input(0);
@@ -1271,12 +1149,10 @@ class DeserializeIteratorOp : public OpKernel {
     OP_REQUIRES(ctx, wrapper != nullptr,
                 errors::InvalidArgument(
                     "DeserializeIteratorOp: Unable to parse variant tensor."));
-    OP_REQUIRES_OK(ctx, wrapper->status());
     OP_REQUIRES_OK(ctx, iterator_resource->Restore(ctx, wrapper->get()));
   }
 };
 
-
 REGISTER_KERNEL_BUILDER(Name("Iterator").Device(DEVICE_CPU), IteratorHandleOp);
 REGISTER_KERNEL_BUILDER(Name("IteratorV2").Device(DEVICE_CPU).Priority(2),
                         IteratorHandleOp);
diff --git a/tensorflow/core/kernels/data/iterator_ops.h b/tensorflow/core/kernels/data/iterator_ops.h
index cd72269859044e6efd97a10ad43bc00c90df7d7d..7d769d365e9aa8d6952a9a8cdb461bc63957d031 100644
--- a/tensorflow/core/kernels/data/iterator_ops.h
+++ b/tensorflow/core/kernels/data/iterator_ops.h
@@ -19,6 +19,8 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/kernels/ops_util.h"
 
 namespace tensorflow {
@@ -115,6 +117,24 @@ class IteratorGetNextOp : public AsyncOpKernel {
   BackgroundWorker background_worker_;
 };
 
+class IteratorGetNextAsOptionalOp : public AsyncOpKernel {
+ public:
+  explicit IteratorGetNextAsOptionalOp(OpKernelConstruction* ctx)
+      : AsyncOpKernel(ctx),
+        background_worker_(ctx->env(),
+                           "tf_data_iterator_get_next_as_optional") {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+  }
+
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override;
+
+ private:
+  BackgroundWorker background_worker_;
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
+};
+
 class IteratorGetNextSyncOp : public OpKernel {
  public:
   explicit IteratorGetNextSyncOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
diff --git a/tensorflow/core/kernels/data/map_dataset_op.cc b/tensorflow/core/kernels/data/map_dataset_op.cc
index fc6e93a81cb47372fa023a2f793d35008ab830c8..e516d7791bfdf2ac40805553dfb5a3afef64d802 100644
--- a/tensorflow/core/kernels/data/map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_dataset_op.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include "tensorflow/core/kernels/data/captured_function.h"
 #include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/lib/random/random.h"
-#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -120,7 +119,7 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return MakeUnique<Iterator>(
+      return absl::make_unique<Iterator>(
           Iterator::Params{this, strings::StrCat(prefix, "::Map")}, map_func_);
     }
 
@@ -139,7 +138,6 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
                               Node** output) const override {
-      TF_RETURN_IF_ERROR(b->AddFunction(ctx, func_.name()));
       Node* input_graph_node = nullptr;
       TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
 
@@ -149,7 +147,13 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
       other_arguments.reserve(captured_func_->captured_inputs().size());
       for (const Tensor& t : captured_func_->captured_inputs()) {
         Node* node;
-        TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        DatasetBase* input;
+        Status s = GetDatasetFromVariantTensor(t, &input);
+        if (s.ok()) {
+          TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input, &node));
+        } else {
+          TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        }
         other_arguments.emplace_back(node);
         other_arguments_types.emplace_back(t.dtype());
       }
diff --git a/tensorflow/core/kernels/data/map_dataset_op_test.cc b/tensorflow/core/kernels/data/map_dataset_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f9c1cf493647ad4649218dfc24f293e2845f1d22
--- /dev/null
+++ b/tensorflow/core/kernels/data/map_dataset_op_test.cc
@@ -0,0 +1,534 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/function_handle_cache.h"
+#include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/framework/variant_tensor_data.h"
+#include "tensorflow/core/kernels/data/dataset_test_base.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
+#include "tensorflow/core/kernels/data/iterator_ops.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+constexpr char kNodeName[] = "map_dataset";
+constexpr char kOpName[] = "MapDataset";
+
+class MapDatasetOpTest : public DatasetOpsTestBase {
+ protected:
+  // Creates a new MapDataset op kernel. The `input_dataset` parameter should be
+  // same with the node name of the input dataset for the method
+  // `CreateMapDatasetContext()`. `T` specifies the output dtype of MapDataset.
+  template <typename T>
+  Status CreateMapDatasetOpKernel(const string& input_dataset,
+                                  const string& func_name,
+                                  std::unique_ptr<OpKernel>* map_kernel) {
+    FunctionDefHelper::AttrValueWrapper func =
+        FunctionDefHelper::FunctionRef(func_name, {{"T", DT_INT64}});
+
+    map_node_def_ = test::function::NDef(
+        kNodeName, kOpName, {input_dataset},
+        {{"f", func},
+         {"Targuments", {}},
+         {"output_shapes", gtl::ArraySlice<TensorShape>{{}}},
+         {"output_types",
+          gtl::ArraySlice<DataType>{tensorflow::DataTypeToEnum<T>::value}},
+         {"use_inter_op_parallelism", true},
+         {"preserve_cardinality", false}});
+    TF_CHECK_OK(CreateOpKernel(map_node_def_, map_kernel));
+    return Status::OK();
+  }
+
+  // Creates a new MapDataset op kernel context.
+  Status CreateMapDatasetContext(
+      DatasetBase* const input_dataset, OpKernel* const map_kernel,
+      std::unique_ptr<OpKernelContext>* map_context) {
+    map_inputs_.clear();
+    // Save the input dataset into a variant tensor as the input of MapDataset.
+    Tensor dataset_tensor(DT_VARIANT, TensorShape({}));
+    TF_RETURN_IF_ERROR(
+        StoreDatasetInVariantTensor(input_dataset, &dataset_tensor));
+    Variant variant = dataset_tensor.scalar<Variant>()();
+    TF_RETURN_IF_ERROR(AddDatasetInputFromArray<Variant>(
+        &map_inputs_, map_kernel->input_types(), TensorShape({}), {variant}));
+    input_dataset->Ref();
+    TF_RETURN_IF_ERROR(
+        CreateOpKernelContext(map_kernel, &map_inputs_, map_context));
+    TF_RETURN_IF_ERROR(CheckOpKernelInput(*map_kernel, map_inputs_));
+    return Status::OK();
+  }
+
+ private:
+  NodeDef map_node_def_;
+  gtl::InlinedVector<TensorValue, 4> map_inputs_;
+};
+
+struct GetNextTestParams {
+  explicit GetNextTestParams(int64 input_start, int64 input_end,
+                             int64 input_step, string input_func_name,
+                             std::vector<int64> input_expected_values,
+                             std::vector<FunctionDef> input_func_lib)
+      : start(input_start),
+        end(input_end),
+        step(input_step),
+        func_name(std::move(input_func_name)),
+        expected_values(std::move(input_expected_values)),
+        func_lib(std::move(input_func_lib)) {}
+
+  int64 start;
+  int64 end;
+  int64 step;
+  string func_name;
+  std::vector<int64> expected_values;
+  std::vector<FunctionDef> func_lib;
+};
+
+struct DatasetGetNextTest : MapDatasetOpTest,
+                            ::testing::WithParamInterface<GetNextTestParams> {};
+
+TEST_P(DatasetGetNextTest, GetNext) {
+  int thread_num = 2, cpu_num = 2;
+  GetNextTestParams test_params = GetParam();
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime(test_params.func_lib, cpu_num));
+
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(CreateRangeDataset<int64>(test_params.start, test_params.end,
+                                         test_params.step, "range",
+                                         &range_dataset));
+  core::ScopedUnref scored_unref_range_dataset(range_dataset);
+
+  std::unique_ptr<OpKernel> map_kernel;
+  TF_ASSERT_OK(CreateMapDatasetOpKernel<int64>(
+      range_dataset->name(), test_params.func_name, &map_kernel));
+  std::unique_ptr<OpKernelContext> map_context;
+  TF_ASSERT_OK(
+      CreateMapDatasetContext(range_dataset, map_kernel.get(), &map_context));
+  DatasetBase* map_dataset;
+  TF_ASSERT_OK(
+      CreateDataset(map_kernel.get(), map_context.get(), &map_dataset));
+  core::ScopedUnref scored_unref_map_dataset(map_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_context;
+  TF_ASSERT_OK(CreateIteratorContext(map_context.get(), &iterator_context));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      map_dataset->MakeIterator(iterator_context.get(), "Iterator", &iterator));
+  bool end_of_sequence = false;
+  std::vector<Tensor> out_tensors;
+  while (!end_of_sequence) {
+    TF_EXPECT_OK(iterator->GetNext(iterator_context.get(), &out_tensors,
+                                   &end_of_sequence));
+  }
+
+  EXPECT_EQ(out_tensors.size(), test_params.expected_values.size());
+  for (size_t i = 0; i < out_tensors.size(); ++i) {
+    int64 actual_value = out_tensors[i].flat<int64>()(0);
+    int64 expect_value = test_params.expected_values[i];
+    EXPECT_EQ(actual_value, expect_value);
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(
+    MapDatasetOpTest, DatasetGetNextTest,
+    ::testing::Values(
+        GetNextTestParams(
+            0, 10, 3, "XTimesTwo", std::vector<int64>{0, 6, 12, 18},
+            std::vector<FunctionDef>{test::function::XTimesTwo()}),
+        GetNextTestParams(0, 10, 3, "XAddX", std::vector<int64>{0, 6, 12, 18},
+                          std::vector<FunctionDef>{test::function::XAddX()}),
+        GetNextTestParams(
+            10, 0, -3, "XTimesFour", std::vector<int64>{40, 28, 16, 4},
+            std::vector<FunctionDef>{test::function::XTimesTwo(),
+                                     test::function::XTimesFour()})));
+
+TEST_F(MapDatasetOpTest, DatasetName) {
+  int thread_num = 2, cpu_num = 2;
+  int64 start = 0, end = 10, step = 1;
+  FunctionDef func_def = test::function::XTimesTwo();
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({func_def}, cpu_num));
+
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(
+      CreateRangeDataset<int64>(start, end, step, "range", &range_dataset));
+  core::ScopedUnref scored_unref_range_dataset(range_dataset);
+
+  std::unique_ptr<OpKernel> map_kernel;
+  TF_ASSERT_OK(CreateMapDatasetOpKernel<int64>(
+      range_dataset->name(), func_def.signature().name(), &map_kernel));
+  std::unique_ptr<OpKernelContext> map_context;
+  TF_ASSERT_OK(
+      CreateMapDatasetContext(range_dataset, map_kernel.get(), &map_context));
+  DatasetBase* map_dataset;
+  TF_ASSERT_OK(
+      CreateDataset(map_kernel.get(), map_context.get(), &map_dataset));
+  core::ScopedUnref scored_unref_map_dataset(map_dataset);
+
+  EXPECT_EQ(map_dataset->name(), kOpName);
+}
+
+TEST_F(MapDatasetOpTest, DatasetOutputDtypes) {
+  int thread_num = 2, cpu_num = 2;
+  int64 start = 0, end = 10, step = 1;
+  FunctionDef func_def = test::function::XTimesTwo();
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({func_def}, cpu_num));
+
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(
+      CreateRangeDataset<int64>(start, end, step, "range", &range_dataset));
+  core::ScopedUnref scored_unref_range_dataset(range_dataset);
+
+  std::unique_ptr<OpKernel> map_kernel;
+  TF_ASSERT_OK(CreateMapDatasetOpKernel<int64>(
+      range_dataset->name(), func_def.signature().name(), &map_kernel));
+  std::unique_ptr<OpKernelContext> map_context;
+  TF_ASSERT_OK(
+      CreateMapDatasetContext(range_dataset, map_kernel.get(), &map_context));
+  DatasetBase* map_dataset;
+  TF_ASSERT_OK(
+      CreateDataset(map_kernel.get(), map_context.get(), &map_dataset));
+  core::ScopedUnref scored_unref_map_dataset(map_dataset);
+
+  DataTypeVector expected_dtypes({DT_INT64});
+  EXPECT_EQ(map_dataset->output_dtypes(), expected_dtypes);
+}
+
+TEST_F(MapDatasetOpTest, DatasetOutputShapes) {
+  int thread_num = 2, cpu_num = 2;
+  int64 start = 0, end = 10, step = 1;
+  FunctionDef func_def = test::function::XTimesTwo();
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({func_def}, cpu_num));
+
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(
+      CreateRangeDataset<int64>(start, end, step, "range", &range_dataset));
+  core::ScopedUnref scored_unref_range_dataset(range_dataset);
+
+  std::unique_ptr<OpKernel> map_kernel;
+  TF_ASSERT_OK(CreateMapDatasetOpKernel<int64>(
+      range_dataset->name(), func_def.signature().name(), &map_kernel));
+  std::unique_ptr<OpKernelContext> map_context;
+  TF_ASSERT_OK(
+      CreateMapDatasetContext(range_dataset, map_kernel.get(), &map_context));
+  DatasetBase* map_dataset;
+  TF_ASSERT_OK(
+      CreateDataset(map_kernel.get(), map_context.get(), &map_dataset));
+  core::ScopedUnref scored_unref_map_dataset(map_dataset);
+
+  std::vector<PartialTensorShape> expected_shapes({PartialTensorShape({})});
+  EXPECT_EQ(map_dataset->output_shapes().size(), expected_shapes.size());
+  for (int i = 0; i < map_dataset->output_shapes().size(); ++i) {
+    EXPECT_TRUE(
+        map_dataset->output_shapes()[i].IsIdenticalTo(expected_shapes[i]));
+  }
+}
+
+struct CardinalityTestParams {
+  explicit CardinalityTestParams(int64 input_start, int64 input_end,
+                                 int64 input_step,
+                                 int input_expected_cardinality)
+      : start(input_start),
+        end(input_end),
+        step(input_step),
+        expected_cardinality(input_expected_cardinality) {}
+
+  int64 start;
+  int64 end;
+  int64 step;
+  int expected_cardinality;
+};
+
+struct DatasetCardinalityTest
+    : MapDatasetOpTest,
+      ::testing::WithParamInterface<CardinalityTestParams> {};
+
+TEST_P(DatasetCardinalityTest, Cardinality) {
+  int thread_num = 2, cpu_num = 2;
+  CardinalityTestParams test_params = GetParam();
+  FunctionDef func_def = test::function::XTimesTwo();
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({func_def}, cpu_num));
+
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(CreateRangeDataset<int64>(test_params.start, test_params.end,
+                                         test_params.step, "range",
+                                         &range_dataset));
+  core::ScopedUnref scored_unref_range_dataset(range_dataset);
+
+  std::unique_ptr<OpKernel> map_kernel;
+  TF_ASSERT_OK(CreateMapDatasetOpKernel<int64>(
+      range_dataset->name(), func_def.signature().name(), &map_kernel));
+  std::unique_ptr<OpKernelContext> map_context;
+  TF_ASSERT_OK(
+      CreateMapDatasetContext(range_dataset, map_kernel.get(), &map_context));
+  DatasetBase* map_dataset;
+  TF_ASSERT_OK(
+      CreateDataset(map_kernel.get(), map_context.get(), &map_dataset));
+  core::ScopedUnref scored_unref_map_dataset(map_dataset);
+
+  EXPECT_EQ(map_dataset->Cardinality(), test_params.expected_cardinality);
+}
+
+INSTANTIATE_TEST_CASE_P(MapDatasetOpTest, DatasetCardinalityTest,
+                        ::testing::Values(CardinalityTestParams(0, 10, 1, 10),
+                                          CardinalityTestParams(0, 10, 3, 4),
+                                          CardinalityTestParams(10, 0, -3, 4)));
+
+TEST_F(MapDatasetOpTest, DatasetSave) {
+  int thread_num = 2, cpu_num = 2;
+  int64 start = 0, end = 10, step = 1;
+  FunctionDef func_def = test::function::XTimesTwo();
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({func_def}, cpu_num));
+
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(
+      CreateRangeDataset<int64>(start, end, step, "range", &range_dataset));
+  core::ScopedUnref scored_unref_range_dataset(range_dataset);
+
+  std::unique_ptr<OpKernel> map_kernel;
+  TF_ASSERT_OK(CreateMapDatasetOpKernel<int64>(
+      range_dataset->name(), func_def.signature().name(), &map_kernel));
+  std::unique_ptr<OpKernelContext> map_context;
+  TF_ASSERT_OK(
+      CreateMapDatasetContext(range_dataset, map_kernel.get(), &map_context));
+  DatasetBase* map_dataset;
+  TF_ASSERT_OK(
+      CreateDataset(map_kernel.get(), map_context.get(), &map_dataset));
+  core::ScopedUnref scored_unref_map_dataset(map_dataset);
+
+  std::unique_ptr<SerializationContext> serialization_context;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_context));
+  VariantTensorData data;
+  VariantTensorDataWriter writer(&data);
+  TF_ASSERT_OK(map_dataset->Save(serialization_context.get(), &writer));
+  TF_ASSERT_OK(writer.Flush());
+}
+
+TEST_F(MapDatasetOpTest, IteratorOutputDtypes) {
+  int64 start = 0, end = 10, step = 1;
+  int thread_num = 2, cpu_num = 2;
+  FunctionDef func_def = test::function::XTimesTwo();
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({func_def}, cpu_num));
+
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(
+      CreateRangeDataset<int64>(start, end, step, "range", &range_dataset));
+  core::ScopedUnref scored_unref_range_dataset(range_dataset);
+
+  std::unique_ptr<OpKernel> map_kernel;
+  TF_ASSERT_OK(CreateMapDatasetOpKernel<int64>(
+      range_dataset->name(), func_def.signature().name(), &map_kernel));
+  std::unique_ptr<OpKernelContext> map_context;
+  TF_ASSERT_OK(
+      CreateMapDatasetContext(range_dataset, map_kernel.get(), &map_context));
+  DatasetBase* map_dataset;
+  TF_ASSERT_OK(
+      CreateDataset(map_kernel.get(), map_context.get(), &map_dataset));
+  core::ScopedUnref scored_unref_map_dataset(map_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_context;
+  TF_ASSERT_OK(CreateIteratorContext(map_context.get(), &iterator_context));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      map_dataset->MakeIterator(iterator_context.get(), "Iterator", &iterator));
+  DataTypeVector expected_dtypes({DT_INT64});
+  EXPECT_EQ(iterator->output_dtypes(), expected_dtypes);
+}
+
+TEST_F(MapDatasetOpTest, IteratorOutputShapes) {
+  int64 start = 0, end = 10, step = 1;
+  int thread_num = 2, cpu_num = 2;
+  FunctionDef func_def = test::function::XTimesTwo();
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({func_def}, cpu_num));
+
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(
+      CreateRangeDataset<int64>(start, end, step, "range", &range_dataset));
+  core::ScopedUnref scored_unref_range_dataset(range_dataset);
+
+  std::unique_ptr<OpKernel> map_kernel;
+  TF_ASSERT_OK(CreateMapDatasetOpKernel<int64>(
+      range_dataset->name(), func_def.signature().name(), &map_kernel));
+  std::unique_ptr<OpKernelContext> map_context;
+  TF_ASSERT_OK(
+      CreateMapDatasetContext(range_dataset, map_kernel.get(), &map_context));
+  DatasetBase* map_dataset;
+  TF_ASSERT_OK(
+      CreateDataset(map_kernel.get(), map_context.get(), &map_dataset));
+  core::ScopedUnref scored_unref_map_dataset(map_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_context;
+  TF_ASSERT_OK(CreateIteratorContext(map_context.get(), &iterator_context));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      map_dataset->MakeIterator(iterator_context.get(), "Iterator", &iterator));
+
+  std::vector<PartialTensorShape> expected_shapes({PartialTensorShape({})});
+  EXPECT_EQ(iterator->output_shapes().size(), expected_shapes.size());
+  for (int i = 0; i < map_dataset->output_shapes().size(); ++i) {
+    EXPECT_TRUE(iterator->output_shapes()[i].IsIdenticalTo(expected_shapes[i]));
+  }
+}
+
+TEST_F(MapDatasetOpTest, IteratorOutputPrefix) {
+  int64 start = 0, end = 10, step = 1;
+  int thread_num = 2, cpu_num = 2;
+  FunctionDef func_def = test::function::XTimesTwo();
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({func_def}, cpu_num));
+
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(
+      CreateRangeDataset<int64>(start, end, step, "range", &range_dataset));
+  core::ScopedUnref scored_unref_range_dataset(range_dataset);
+
+  std::unique_ptr<OpKernel> map_kernel;
+  TF_ASSERT_OK(CreateMapDatasetOpKernel<int64>(
+      range_dataset->name(), func_def.signature().name(), &map_kernel));
+  std::unique_ptr<OpKernelContext> map_context;
+  TF_ASSERT_OK(
+      CreateMapDatasetContext(range_dataset, map_kernel.get(), &map_context));
+  DatasetBase* map_dataset;
+  TF_ASSERT_OK(
+      CreateDataset(map_kernel.get(), map_context.get(), &map_dataset));
+  core::ScopedUnref scored_unref_map_dataset(map_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_context;
+  TF_ASSERT_OK(CreateIteratorContext(map_context.get(), &iterator_context));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      map_dataset->MakeIterator(iterator_context.get(), "Iterator", &iterator));
+
+  EXPECT_EQ(iterator->prefix(), "Iterator::Map");
+}
+
+struct RoundtripTestParams {
+  explicit RoundtripTestParams(int64 input_start, int64 input_end,
+                               int64 input_step, int input_breakpoint,
+                               int64 input_expected_value,
+                               string input_func_name,
+                               std::vector<FunctionDef> input_func_lib)
+      : start(input_start),
+        end(input_end),
+        step(input_step),
+        breakpoint(input_breakpoint),
+        expected_value(input_expected_value),
+        func_name(std::move(input_func_name)),
+        func_lib(std::move(input_func_lib)) {}
+
+  int64 start;
+  int64 end;
+  int64 step;
+  int breakpoint;
+  int64 expected_value;
+  string func_name;
+  std::vector<FunctionDef> func_lib;
+};
+
+struct IteratorRoundtripTest
+    : MapDatasetOpTest,
+      ::testing::WithParamInterface<RoundtripTestParams> {};
+
+TEST_P(IteratorRoundtripTest, Roundtrip) {
+  int thread_num = 2, cpu_num = 2;
+  RoundtripTestParams test_params = GetParam();
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime(test_params.func_lib, cpu_num));
+
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(CreateRangeDataset<int64>(test_params.start, test_params.end,
+                                         test_params.step, "range",
+                                         &range_dataset));
+  core::ScopedUnref scored_unref_range_dataset(range_dataset);
+
+  std::unique_ptr<OpKernel> map_kernel;
+  TF_ASSERT_OK(CreateMapDatasetOpKernel<int64>(
+      range_dataset->name(), test_params.func_name, &map_kernel));
+  std::unique_ptr<OpKernelContext> map_context;
+  TF_ASSERT_OK(
+      CreateMapDatasetContext(range_dataset, map_kernel.get(), &map_context));
+  DatasetBase* map_dataset;
+  TF_ASSERT_OK(
+      CreateDataset(map_kernel.get(), map_context.get(), &map_dataset));
+  core::ScopedUnref scored_unref_map_dataset(map_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_context;
+  TF_ASSERT_OK(CreateIteratorContext(map_context.get(), &iterator_context));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      map_dataset->MakeIterator(iterator_context.get(), "Iterator", &iterator));
+
+  std::vector<Tensor> out_tensors;
+  bool end_of_sequence = false;
+  for (int i = 0; i < test_params.breakpoint; i++) {
+    TF_EXPECT_OK(iterator->GetNext(iterator_context.get(), &out_tensors,
+                                   &end_of_sequence));
+  }
+
+  std::unique_ptr<SerializationContext> serialization_context;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_context));
+  VariantTensorData data;
+  VariantTensorDataWriter writer(&data);
+  TF_ASSERT_OK(iterator->Save(serialization_context.get(), &writer));
+  TF_ASSERT_OK(writer.Flush());
+  VariantTensorDataReader reader(&data);
+  TF_ASSERT_OK(iterator->Restore(iterator_context.get(), &reader));
+  TF_EXPECT_OK(iterator->GetNext(iterator_context.get(), &out_tensors,
+                                 &end_of_sequence));
+  EXPECT_EQ(out_tensors.back().flat<int64>()(0), test_params.expected_value);
+}
+
+INSTANTIATE_TEST_CASE_P(
+    MapDatasetOpTest, IteratorRoundtripTest,
+    ::testing::Values(RoundtripTestParams(0, 10, 2, 0, 0, "XTimesTwo",
+                                          std::vector<FunctionDef>{
+                                              test::function::XTimesTwo()}),
+                      RoundtripTestParams(0, 10, 2, 4, 16, "XAddX",
+                                          std::vector<FunctionDef>{
+                                              test::function::XAddX()}),
+                      RoundtripTestParams(0, 10, 2, 6, 32, "XTimesFour",
+                                          std::vector<FunctionDef>{
+                                              test::function::XTimesTwo(),
+                                              test::function::XTimesFour()})));
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/map_defun_op.cc b/tensorflow/core/kernels/data/map_defun_op.cc
index 705b0393de09e7117457370dcf9fcdef37142109..f1be942a633c347c6c3156e6a27e4deb79be4298 100644
--- a/tensorflow/core/kernels/data/map_defun_op.cc
+++ b/tensorflow/core/kernels/data/map_defun_op.cc
@@ -92,7 +92,7 @@ class MapDefunOp : public AsyncOpKernel {
       // We use a different cancellation manager each time the function is run
       // to avoid the race condition between a function run error and other
       // functions being cancelled as a result.
-      CancellationManager* c_mgr = new CancellationManager;
+      CancellationManager* c_mgr = new CancellationManager();
       CancellationToken token = parent_mgr->get_cancellation_token();
       const bool success = parent_mgr->RegisterCallback(
           token, [c_mgr]() { c_mgr->StartCancel(); });
@@ -258,6 +258,7 @@ class MapDefunOp : public AsyncOpKernel {
             "output: ",
             index);
       }
+      Tensor* out;
       {  // Locking scope
         mutex_lock l(compute_opts_->mu);
         if (!compute_opts_->output_shapes.at(index).IsCompatibleWith(
@@ -272,15 +273,15 @@ class MapDefunOp : public AsyncOpKernel {
           // this index. Store the shape and allocate the output accordingly.
           compute_opts_->output_shapes.at(index) = val.shape();
 
-          Tensor* out = nullptr;
           TensorShape actual_shape = val.shape();
           actual_shape.InsertDim(0, compute_opts_->batch_size);
           TF_RETURN_IF_ERROR(
               compute_opts_->output.allocate(index, actual_shape, &out));
+        } else {
+          out = (compute_opts_->output)[index];
         }
-        return batch_util::CopyElementToSlice(
-            val, (compute_opts_->output)[index], iter_);
       }
+      return batch_util::CopyElementToSlice(val, out, iter_);
     }
 
    private:
diff --git a/tensorflow/core/kernels/data/model_dataset_op.cc b/tensorflow/core/kernels/data/model_dataset_op.cc
index 069d61d80d4f00eecdd77356626d7278c0842445..20254234e9da492d5b5faad502e092e15d993a91 100644
--- a/tensorflow/core/kernels/data/model_dataset_op.cc
+++ b/tensorflow/core/kernels/data/model_dataset_op.cc
@@ -13,17 +13,20 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "absl/memory/memory.h"
+#include "tensorflow/core/common_runtime/metrics.h"
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
 namespace {
 
-const int kOptimizationPeriodThresholdMs = 60 * EnvTime::kSecondsToMicros;
+constexpr int kOptimizationPeriodThresholdMs = 60 * EnvTime::kSecondsToMicros;
 
 class ModelDatasetOp : public UnaryDatasetOpKernel {
  public:
@@ -38,7 +41,7 @@ class ModelDatasetOp : public UnaryDatasetOpKernel {
  private:
   class Dataset : public DatasetBase {
    public:
-    explicit Dataset(OpKernelContext* ctx, const DatasetBase* input)
+    Dataset(OpKernelContext* ctx, const DatasetBase* input)
         : DatasetBase(DatasetContext(ctx)), input_(input) {
       input_->Ref();
     }
@@ -47,8 +50,8 @@ class ModelDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::Model")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::Model")});
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -76,8 +79,12 @@ class ModelDatasetOp : public UnaryDatasetOpKernel {
     class Iterator : public DatasetIterator<Dataset> {
      public:
       explicit Iterator(const Params& params)
-          : DatasetIterator<Dataset>(params),
-            model_(std::make_shared<model::Model>()) {}
+          : DatasetIterator<Dataset>(params) {
+        auto remove_node_hook = [](std::shared_ptr<model::Node> node) {
+          metrics::RecordTFDataElements(node->name(), node->num_elements());
+        };
+        model_ = std::make_shared<model::Model>(std::move(remove_node_hook));
+      }
 
       ~Iterator() override {
         // Signal the optimize thread to terminate it. We will then join that
@@ -131,7 +138,8 @@ class ModelDatasetOp : public UnaryDatasetOpKernel {
       Status EnsureOptimizeThreadStarted(IteratorContext* ctx)
           EXCLUSIVE_LOCKS_REQUIRED(mu_) {
         if (!optimize_thread_) {
-          std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
+          std::shared_ptr<IteratorContext> new_ctx =
+              std::make_shared<IteratorContext>(*ctx);
           optimize_thread_.reset(ctx->env()->StartThread(
               {}, "tf_data_model",
               [this, new_ctx]() { OptimizeThread(new_ctx); }));
diff --git a/tensorflow/core/kernels/data/multi_device_iterator_ops.cc b/tensorflow/core/kernels/data/multi_device_iterator_ops.cc
index ba2125a66eb98985ebd0ae8f55bfc239997ad6df..d78ed6006c0207425a1aaf822b3a79406a1002fe 100644
--- a/tensorflow/core/kernels/data/multi_device_iterator_ops.cc
+++ b/tensorflow/core/kernels/data/multi_device_iterator_ops.cc
@@ -59,7 +59,7 @@ class MultiDeviceIterator : public ResourceBase {
     DCHECK(lib_ != nullptr);
   }
 
-  string DebugString() override {
+  string DebugString() const override {
     return strings::StrCat("MultiDeviceIterator for ", devices_.size(),
                            " devices");
   }
@@ -81,9 +81,8 @@ class MultiDeviceIterator : public ResourceBase {
     ++incarnation_id_;
     *incarnation_id = incarnation_id_;
 
-    multi_device_buffer_.reset(
-        new MultiDeviceBuffer(devices_.size(), max_buffer_size, incarnation_id_,
-                              std::move(iterator)));
+    multi_device_buffer_ = absl::make_unique<MultiDeviceBuffer>(
+        devices_.size(), max_buffer_size, incarnation_id_, std::move(iterator));
     return Status::OK();
   }
 
@@ -152,7 +151,7 @@ class MultiDeviceIterator : public ResourceBase {
     void Reset() LOCKS_EXCLUDED(mu_) {
       {
         mutex_lock l(mu_);
-        if (!background_thread_finished_) {
+        if (background_thread_ && !background_thread_finished_) {
           cancelled_ = true;
           // Wake up the background thread.
           for (int i = 0; i < size_; ++i) {
@@ -217,10 +216,11 @@ class MultiDeviceIterator : public ResourceBase {
     void EnsureBackgroundThreadStarted(IteratorContext* ctx)
         EXCLUSIVE_LOCKS_REQUIRED(mu_) {
       if (!background_thread_) {
-        background_thread_.reset(ctx->env()->StartThread(
+        auto ctx_copy = std::make_shared<IteratorContext>(*ctx);
+        background_thread_ = absl::WrapUnique<Thread>(ctx->env()->StartThread(
             {}, "tf_data_multi_device_iterator",
             std::bind(&MultiDeviceIterator::MultiDeviceBuffer::BackgroundThread,
-                      this, new IteratorContext(*ctx))));
+                      this, std::move(ctx_copy))));
       }
     }
 
@@ -258,12 +258,11 @@ class MultiDeviceIterator : public ResourceBase {
       }
     }
 
-    void BackgroundThread(IteratorContext* ctx) {
+    void BackgroundThread(std::shared_ptr<IteratorContext> ctx) {
       {
         mutex_lock l(mu_);
         background_thread_started_ = true;
       }
-      std::unique_ptr<IteratorContext> cleanup(ctx);
       int shard_to_fetch = 0;
       while (true) {
         HostBufferElement elem;
@@ -284,8 +283,8 @@ class MultiDeviceIterator : public ResourceBase {
           }
         }
 
-        elem.status =
-            host_iterator_->GetNext(ctx, &elem.value, &elem.end_of_sequence);
+        elem.status = host_iterator_->GetNext(ctx.get(), &elem.value,
+                                              &elem.end_of_sequence);
 
         if (elem.status.ok() && elem.end_of_sequence) {
           end_of_iterator = true;
@@ -360,6 +359,9 @@ class MultiDeviceIterator : public ResourceBase {
   std::unique_ptr<MultiDeviceBuffer> multi_device_buffer_ GUARDED_BY(mu_);
 };
 
+// Used to generate unique names for anonymous multi device iterators.
+static std::atomic<int64> current_id_;
+
 // Just creates a MultiDeviceIterator and returns it.
 class MultiDeviceIteratorHandleOp : public OpKernel {
  public:
@@ -389,6 +391,8 @@ class MultiDeviceIteratorHandleOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* context) override LOCKS_EXCLUDED(mu_) {
+    string unique_name = cinfo_.name();
+    string container_name = cinfo_.container();
     {
       mutex_lock l(mu_);
       if (resource_ == nullptr) {
@@ -397,37 +401,51 @@ class MultiDeviceIteratorHandleOp : public OpKernel {
         std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(nullptr);
         OP_REQUIRES_OK(context, context->function_library()->Clone(
                                     &flib_def, &pflr, &lib));
-        std::unique_ptr<FunctionHandleCache> function_handle_cache(
-            new FunctionHandleCache(lib));
+        std::unique_ptr<FunctionHandleCache> function_handle_cache =
+            absl::make_unique<FunctionHandleCache>(lib);
         ResourceMgr* mgr = context->resource_manager();
         OP_REQUIRES_OK(context, cinfo_.Init(mgr, def()));
 
         MultiDeviceIterator* resource;
-        OP_REQUIRES_OK(context,
-                       mgr->LookupOrCreate<MultiDeviceIterator>(
-                           cinfo_.container(), cinfo_.name(), &resource,
-                           [this, lib, &flib_def, &pflr,
-                            &function_handle_cache](MultiDeviceIterator** ret)
-                               EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-                                 *ret = new MultiDeviceIterator(
-                                     output_types_, output_shapes_, devices_,
-                                     std::move(flib_def), std::move(pflr), lib,
-                                     std::move(function_handle_cache));
-                                 return Status::OK();
-                               }));
-
-        Status s = VerifyResource(resource);
-        if (TF_PREDICT_FALSE(!s.ok())) {
-          resource->Unref();
-          context->SetStatus(s);
-          return;
-        }
 
-        resource_ = resource;
+        if (name_ == ResourceHandle::ANONYMOUS_NAME) {
+          unique_name = strings::StrCat("_AnonymousMultiDeviceIterator",
+                                        current_id_.fetch_add(1));
+          container_name = "AnonymousMultiDeviceIterator";
+          resource = new MultiDeviceIterator(
+              output_types_, output_shapes_, devices_, std::move(flib_def),
+              std::move(pflr), lib, std::move(function_handle_cache));
+          // NOTE: `mgr->Create()` transfers the one reference on `resource` to
+          // `mgr`.
+          OP_REQUIRES_OK(context, mgr->Create<MultiDeviceIterator>(
+                                      container_name, unique_name, resource));
+        } else {
+          unique_name = cinfo_.name();
+          container_name = cinfo_.container();
+          OP_REQUIRES_OK(context,
+                         mgr->LookupOrCreate<MultiDeviceIterator>(
+                             container_name, unique_name, &resource,
+                             [this, lib, &flib_def, &pflr,
+                              &function_handle_cache](MultiDeviceIterator** ret)
+                                 EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+                                   *ret = new MultiDeviceIterator(
+                                       output_types_, output_shapes_, devices_,
+                                       std::move(flib_def), std::move(pflr),
+                                       lib, std::move(function_handle_cache));
+                                   return Status::OK();
+                                 }));
+          Status s = VerifyResource(resource);
+          if (TF_PREDICT_FALSE(!s.ok())) {
+            resource->Unref();
+            context->SetStatus(s);
+            return;
+          }
+          resource_ = resource;
+        }
       }
     }
     OP_REQUIRES_OK(context, MakeResourceHandleToOutput(
-                                context, 0, cinfo_.container(), cinfo_.name(),
+                                context, 0, container_name, unique_name,
                                 MakeTypeIndex<MultiDeviceIterator>()));
   }
 
diff --git a/tensorflow/core/kernels/data/optimize_dataset_op.cc b/tensorflow/core/kernels/data/optimize_dataset_op.cc
index 9c50d8050a82397f1578ab3f577ef5ad77f81767..17094e3001738becdbc3bf4d98aaaa6a9917d054 100644
--- a/tensorflow/core/kernels/data/optimize_dataset_op.cc
+++ b/tensorflow/core/kernels/data/optimize_dataset_op.cc
@@ -14,32 +14,19 @@ limitations under the License.
 ==============================================================================*/
 #include <map>
 
-#include "tensorflow/core/common_runtime/device_mgr.h"
-#include "tensorflow/core/common_runtime/graph_runner.h"
-#include "tensorflow/core/common_runtime/process_function_library_runtime.h"
 #include "tensorflow/core/framework/dataset.h"
-#include "tensorflow/core/framework/device_base.h"
-#include "tensorflow/core/framework/function_handle_cache.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/graph/graph_constructor.h"
-#include "tensorflow/core/graph/graph_def_builder.h"
-#include "tensorflow/core/grappler/clusters/virtual_cluster.h"
-#include "tensorflow/core/grappler/graph_view.h"
-#include "tensorflow/core/grappler/grappler_item.h"
-#include "tensorflow/core/grappler/grappler_item_builder.h"
-#include "tensorflow/core/grappler/optimizers/data/function_utils.h"
-#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
-#include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
-#include "tensorflow/core/lib/core/refcount.h"
+#include "tensorflow/core/kernels/data/graph_rewrite_dataset.h"
 #include "tensorflow/core/lib/random/random.h"
-#include "tensorflow/core/protobuf/meta_graph.pb.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
 
 namespace tensorflow {
 namespace data {
 namespace {
 
+constexpr char kOptimizerName[] = "tf_data_meta_optimizer";
+
 // See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 class OptimizeDatasetOp : public UnaryDatasetOpKernel {
@@ -69,286 +56,35 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
   }
 
  private:
-  class Dataset : public DatasetBase {
+  class Dataset : public GraphRewriteDataset {
    public:
     Dataset(OpKernelContext* ctx, const DatasetBase* input,
             const std::vector<string>& optimizations,
             const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes)
-        : DatasetBase(DatasetContext(ctx)),
-          optimized_input_(nullptr),
-          input_(input),
-          optimizations_(optimizations),
-          output_types_(output_types),
-          output_shapes_(output_shapes) {
-      input_->Ref();
-    }
-
-    ~Dataset() override {
-      input_->Unref();
-      if (optimized_input_) {
-        optimized_input_->Unref();
-      }
-    }
-
-    std::unique_ptr<IteratorBase> MakeIteratorInternal(
-        const string& prefix) const override {
-      // We do not add a token for the optimization dataset to the prefix. The
-      // prefix is used to identify checkpoint elements and since the
-      // optimization dataset is excluded from the checkpoint, adding a token
-      // here would result in invalid checkpoint identifiers.
-      return std::unique_ptr<IteratorBase>(new Iterator({this, prefix}));
-    }
-
-    Status Optimize(OpKernelContext* ctx) {
-      GraphDefBuilder b;
-      DatasetGraphDefBuilder db(&b);
-      Node* input_node = nullptr;
-      SerializationContext::Params params;
-      std::vector<std::pair<string, Tensor>> input_list;
-      params.flib_def = ctx->function_library()->GetFunctionLibraryDefinition();
-      params.input_list = &input_list;
-      params.optimization_only = true;
-      SerializationContext serialization_ctx(params);
-      TF_RETURN_IF_ERROR(
-          db.AddInputDataset(&serialization_ctx, input_, &input_node));
-      string output_node = input_node->name();
-
-      GraphDef graph_def;
-      TF_RETURN_IF_ERROR(b.ToGraphDef(&graph_def));
-      VLOG(3) << "Before optimization: " << graph_def.DebugString();
-
-      TF_RETURN_IF_ERROR(ApplyOptimizations(ctx, &graph_def, &output_node));
-      VLOG(3) << "After optimization: " << graph_def.DebugString();
-
-      // Instantiate the optimized input pipeline by running the optimized graph
-      // using the optimized function library.
-      TF_RETURN_IF_ERROR(
-          ctx->function_library()->Clone(&flib_def_, &pflr_, &lib_));
-
-      // Create a FunctionHandleCache.
-      function_handle_cache_.reset(new FunctionHandleCache(lib_));
-
-      // Some functions may have been modified without having their names
-      // changed (for example, nested dataset graphs from FlatMap or
-      // Interleave). To avoid name conflicts, we remove these functions from
-      // flib_def_ before adding the optimized function library.
-      for (const FunctionDef& fd : graph_def.library().function()) {
-        if (flib_def_->Find(fd.signature().name()) != nullptr) {
-          TF_RETURN_IF_ERROR(flib_def_->RemoveFunction(fd.signature().name()));
-        }
-      }
-      TF_RETURN_IF_ERROR(flib_def_->AddLibrary(graph_def.library()));
-
-      Graph graph(OpRegistry::Global());
-      TF_RETURN_IF_ERROR(ImportGraphDef({}, graph_def, &graph, nullptr));
-      std::vector<Tensor> outputs;
-      GraphRunner graph_runner(ctx->function_library()->device());
-
-      TF_RETURN_IF_ERROR(
-          graph_runner.Run(&graph, lib_, input_list, {output_node}, &outputs));
-      TF_RETURN_IF_ERROR(
-          GetDatasetFromVariantTensor(outputs[0], &optimized_input_));
-      optimized_input_->Ref();
-      return Status::OK();
-    }
-
-    const DataTypeVector& output_dtypes() const override {
-      return output_types_;
-    }
-    const std::vector<PartialTensorShape>& output_shapes() const override {
-      return output_shapes_;
-    }
+        : GraphRewriteDataset(ctx, input, output_types, output_shapes),
+          optimizations_(optimizations) {}
 
     string DebugString() const override { return "OptimizeDatasetOp::Dataset"; }
 
-    int64 Cardinality() const override { return input_->Cardinality(); }
-
-   protected:
-    Status AsGraphDefInternal(SerializationContext* ctx,
-                              DatasetGraphDefBuilder* b,
-                              Node** output) const override {
-      // We only serialize the optimized dataset to avoid re-running
-      // optimizations when the input pipeline is restored from a checkpoint.
-      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, optimized_input_, output));
-      return Status::OK();
-    }
-
    private:
-    class Iterator : public DatasetIterator<Dataset> {
-     public:
-      explicit Iterator(const Params& params)
-          : DatasetIterator<Dataset>(params) {}
-
-      Status Initialize(IteratorContext* ctx) override {
-        IteratorContext::Params params(ctx);
-        params.lib = dataset()->lib_;
-        params.function_handle_cache = dataset()->function_handle_cache_.get();
-        return dataset()->optimized_input_->MakeIterator(
-            IteratorContext(std::move(params)), prefix(), &input_impl_);
-      }
-
-      Status GetNextInternal(IteratorContext* ctx,
-                             std::vector<Tensor>* out_tensors,
-                             bool* end_of_sequence) override {
-        IteratorContext::Params params(ctx);
-        params.lib = dataset()->lib_;
-        params.function_handle_cache = dataset()->function_handle_cache_.get();
-        return input_impl_->GetNext(IteratorContext(std::move(params)),
-                                    out_tensors, end_of_sequence);
-      }
-
-     protected:
-      std::shared_ptr<model::Node> CreateNode(
-          IteratorContext* ctx, model::Node::Args args) const override {
-        return model::MakeKnownRatioNode(std::move(args),
-                                         /*ratio=*/1);
-      }
-
-      Status SaveInternal(IteratorStateWriter* writer) override {
-        TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
-        return Status::OK();
-      }
-
-      Status RestoreInternal(IteratorContext* ctx,
-                             IteratorStateReader* reader) override {
-        TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
-        return Status::OK();
-      }
-
-     private:
-      std::unique_ptr<IteratorBase> input_impl_;
-    };
-
-    void AddFakeSinks(FunctionDef* function_def) {
-      int counter = 0;
-      for (const auto& output : function_def->signature().output_arg()) {
-        NodeDef* node = function_def->add_node_def();
-        tensorflow::grappler::function_utils::SetUniqueFunctionNodeName(
-            strings::StrCat("FakeSink", counter++), function_def, node);
-        node->set_op("Identity");
-        node->add_input(function_def->ret().at(output.name()));
-        (*node->mutable_attr())["T"].set_type(output.type());
-
-        (*function_def->mutable_ret())[output.name()] =
-            strings::StrCat(node->name(), ":output:0");
+    RewriterConfig CreateGrapplerRewriteConfig() override {
+      RewriterConfig rewriter_config;
+      rewriter_config.add_optimizers(kOptimizerName);
+      rewriter_config.set_meta_optimizer_iterations(
+          RewriterConfig_NumIterationsType_ONE);
+      auto custom_optimizer = rewriter_config.add_custom_optimizers();
+      custom_optimizer->set_name(kOptimizerName);
+      auto* custom_optimizations_list =
+          (*custom_optimizer->mutable_parameter_map())["optimizers"]
+              .mutable_list();
+      for (const auto& opt : optimizations_) {
+        custom_optimizations_list->add_s(opt);
       }
+      return rewriter_config;
     }
 
-    void RemoveFakeSinks(FunctionDef* function_def) {
-      // Map from identity node names to their input tensor strings
-      std::map<string, string> identity_map;
-      for (const auto& node : function_def->node_def()) {
-        if (node.op() == "Identity" && node.input_size() == 1) {
-          identity_map[node.name()] = node.input(0);
-        }
-      }
-      for (const auto& output_arg : function_def->signature().output_arg()) {
-        const string& tensor = function_def->ret().at(output_arg.name());
-        const string& output_node = tensor.substr(0, tensor.find(':'));
-        if (identity_map.find(output_node) != identity_map.end()) {
-          (*function_def->mutable_ret())[output_arg.name()] =
-              identity_map.at(output_node);
-        }
-      }
-    }
-
-    Status ApplyOptimizations(OpKernelContext* ctx, GraphDef* graph_def,
-                              string* output_node) {
-      // Add an identity node as the fetch node, otherwise we might get
-      // 'placeholder is both fed and fetched' errors in some cases when using
-      // input list with placeholder dataset nodes.
-      NodeDef* node = graph_def->mutable_node()->Add();
-      tensorflow::grappler::graph_utils::SetUniqueGraphNodeName(
-          "Sink", graph_def, node);
-      node->set_op("Identity");
-      node->add_input(*output_node);
-      (*node->mutable_attr())["T"].set_type(DT_VARIANT);
-      *output_node = node->name();
-
-      // Add fake sink node to graph and functions to allow rewriting the actual
-      // sink nodes.
-      // TODO(b/118820916): When MetaOptimizer adds provisions for function
-      // retvals to be optimizable, we will no longer need this.
-      for (auto& function_def :
-           *graph_def->mutable_library()->mutable_function()) {
-        AddFakeSinks(&function_def);
-      }
-
-      // Create metagraph.
-      MetaGraphDef meta_graph_def;
-      (*meta_graph_def.mutable_graph_def()) = *graph_def;
-
-      // Grappler determines fetch ops from collection 'train_op'.
-      CollectionDef collection_def;
-      auto node_list = collection_def.mutable_node_list();
-      node_list->add_value(*output_node);
-      (*meta_graph_def.mutable_collection_def())["train_op"] = collection_def;
-
-      // Create Grappler item.
-      tensorflow::ConfigProto config;
-      RewriterConfig& rewriter_config =
-          *config.mutable_graph_options()->mutable_rewrite_options();
-      for (const string& optimization : optimizations_) {
-        rewriter_config.add_optimizers(optimization);
-      }
-      // If no optimizations were specified, supply a non-existent
-      // optimization to prevent Grappler from applying the default set of
-      // optimizations as some of them do not work out of the box at the
-      // moment (e.g. because we have no cost model for dataset ops).
-      if (optimizations_.empty()) {
-        rewriter_config.add_optimizers("non-existent");
-      } else {
-        // If we apply custom dataset optimizers, explicitly trigger a subset of
-        // standard grappler optimizations to further optimize modified dataset
-        // graphs (e.g. performing constant folding on merged functions,
-        // removing unused graph nodes)
-        // TODO(b/118175421): This should be part of the tf.data optimization
-        // pass manager.
-        // TODO(b/120437209): Apply `constfold` optimization when it is fixed.
-        for (const auto& optimizer :
-             {"pruning", "function", "shape", "arithmetic", "dependency"}) {
-          rewriter_config.add_optimizers(optimizer);
-        }
-      }
-      tensorflow::grappler::ItemConfig item_config;
-      item_config.apply_optimizations = true;
-      std::unique_ptr<tensorflow::grappler::GrapplerItem> grappler_item =
-          tensorflow::grappler::GrapplerItemFromMetaGraphDef(
-              "graph", meta_graph_def, item_config);
-      std::unordered_map<string, tensorflow::DeviceProperties> device_map;
-      tensorflow::grappler::VirtualCluster cluster(device_map);
-
-      // Run optimizer.
-      if (VLOG_IS_ON(2)) {
-        LOG(INFO) << "Performing the following optimizations:";
-        for (const string& optimization : optimizations_) {
-          LOG(INFO) << "  " << optimization;
-        }
-      }
-      TF_RETURN_IF_ERROR(tensorflow::grappler::RunMetaOptimizer(
-          *grappler_item, config, ctx->device(), &cluster, graph_def));
-
-      // Remove fake sinks after optimizations are done.
-      // TODO(b/118820916): When MetaOptimizer adds provisions for function
-      // retvals to be optimizable, we will no longer need this.
-      for (auto& function_def :
-           *graph_def->mutable_library()->mutable_function()) {
-        RemoveFakeSinks(&function_def);
-      }
-
-      return Status::OK();
-    }
-
-    DatasetBase* optimized_input_;
-    FunctionLibraryRuntime* lib_ = nullptr;
-    std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_ = nullptr;
-    std::unique_ptr<FunctionLibraryDefinition> flib_def_ = nullptr;
-    std::unique_ptr<FunctionHandleCache> function_handle_cache_ = nullptr;
-    const DatasetBase* input_;
     const std::vector<string> optimizations_;
-    const DataTypeVector output_types_;
-    const std::vector<PartialTensorShape> output_shapes_;
   };
 
   const int graph_def_version_;
diff --git a/tensorflow/core/kernels/data/optional_ops.cc b/tensorflow/core/kernels/data/optional_ops.cc
index d8a7f21c5f99c6d99e506847e00cabc6bd49168f..473dbebd3062486de3cd48764ed45d9a059832d9 100644
--- a/tensorflow/core/kernels/data/optional_ops.cc
+++ b/tensorflow/core/kernels/data/optional_ops.cc
@@ -23,133 +23,6 @@ namespace tensorflow {
 namespace data {
 namespace {
 
-class OptionalNoneOp : public OpKernel {
- public:
-  explicit OptionalNoneOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
-
-  void Compute(OpKernelContext* ctx) override {
-    OP_REQUIRES_OK(ctx, WriteOptionalNoneToOutput(ctx, 0));
-  }
-};
-
-class OptionalFromValueOp : public OpKernel {
- public:
-  explicit OptionalFromValueOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
-
-  void Compute(OpKernelContext* ctx) override {
-    OpInputList components_input;
-    OP_REQUIRES_OK(ctx, ctx->input_list("components", &components_input));
-    std::vector<Tensor> components(components_input.begin(),
-                                   components_input.end());
-    OP_REQUIRES_OK(
-        ctx, WriteOptionalWithValueToOutput(ctx, 0, std::move(components)));
-  }
-};
-
-class OptionalHasValueOp : public OpKernel {
- public:
-  explicit OptionalHasValueOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
-
-  void Compute(OpKernelContext* ctx) override {
-    const Tensor* optional_input;
-    OP_REQUIRES_OK(ctx, ctx->input("optional", &optional_input));
-    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(optional_input->shape()),
-                errors::InvalidArgument(
-                    "Input to OptionalHasValue must be a scalar tensor "
-                    "containing an OptionalVariant object."));
-    const OptionalVariant* optional =
-        optional_input->scalar<Variant>()().get<OptionalVariant>();
-    OP_REQUIRES(
-        ctx, optional != nullptr,
-        errors::InvalidArgument(
-            "Input to OptionalHasValue must be an OptionalVariant object."));
-    Tensor* result;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, {}, &result));
-    result->scalar<bool>()() = optional->has_value();
-  }
-};
-
-class OptionalGetValueOp : public OpKernel {
- public:
-  explicit OptionalGetValueOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
-    OP_REQUIRES(
-        ctx, output_shapes_.size() == output_types_.size(),
-        errors::InvalidArgument(
-            "output_types and output_shapes must be same length, got:\n",
-            "output_types: ", output_types_.size(), "\n",
-            "output_shapes: ", output_shapes_.size()));
-  }
-
-  void Compute(OpKernelContext* ctx) override {
-    const Tensor* optional_input;
-    OP_REQUIRES_OK(ctx, ctx->input("optional", &optional_input));
-    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(optional_input->shape()),
-                errors::InvalidArgument(
-                    "Input to OptionalHasValue must be a scalar tensor "
-                    "containing an OptionalVariant object."));
-    const OptionalVariant* optional =
-        optional_input->scalar<Variant>()().get<OptionalVariant>();
-    OP_REQUIRES(
-        ctx, optional != nullptr,
-        errors::InvalidArgument(
-            "Input to OptionalHasValue must be an OptionalVariant object."));
-    OP_REQUIRES(
-        ctx, optional->has_value(),
-        errors::InvalidArgument("The given optional does not have a value."));
-    const auto& components = optional->get_values();
-    OP_REQUIRES(ctx, components.size() == output_types_.size(),
-                errors::InvalidArgument(
-                    "The given optional has ", components.size(),
-                    " components, expected ", output_types_.size()));
-    for (int i = 0; i < components.size(); ++i) {
-      OP_REQUIRES(
-          ctx, components[i].dtype() == output_types_[i],
-          errors::InvalidArgument(
-              "The given optional does not match the expected type for "
-              "component ",
-              i, ". Expected: ", DataTypeString(output_types_[i]),
-              ". Actual: ", DataTypeString(components[i].dtype()), "."));
-      OP_REQUIRES(ctx,
-                  output_shapes_[i].IsCompatibleWith(components[i].shape()),
-                  errors::InvalidArgument(
-                      "The given optional does not match the expected shape "
-                      "for component ",
-                      i, ". Expected: ", output_shapes_[i].DebugString(),
-                      ". Actual: ", components[i].shape().DebugString(), "."));
-      ctx->set_output(i, components[i]);
-    }
-  }
-
- private:
-  DataTypeVector output_types_;
-  std::vector<PartialTensorShape> output_shapes_;
-};
-
-REGISTER_KERNEL_BUILDER(Name("OptionalNone").Device(DEVICE_CPU).Priority(2),
-                        OptionalNoneOp);
-REGISTER_KERNEL_BUILDER(Name("OptionalNone").Device(DEVICE_GPU).Priority(1),
-                        OptionalNoneOp);
-REGISTER_KERNEL_BUILDER(
-    Name("OptionalFromValue").Device(DEVICE_CPU).Priority(2),
-    OptionalFromValueOp);
-REGISTER_KERNEL_BUILDER(
-    Name("OptionalFromValue").Device(DEVICE_GPU).Priority(1),
-    OptionalFromValueOp);
-
-REGISTER_KERNEL_BUILDER(Name("OptionalHasValue").Device(DEVICE_CPU).Priority(2),
-                        OptionalHasValueOp);
-REGISTER_KERNEL_BUILDER(Name("OptionalHasValue")
-                            .Device(DEVICE_GPU)
-                            .HostMemory("has_value")
-                            .Priority(1),
-                        OptionalHasValueOp);
-REGISTER_KERNEL_BUILDER(Name("OptionalGetValue").Device(DEVICE_CPU).Priority(2),
-                        OptionalGetValueOp);
-REGISTER_KERNEL_BUILDER(Name("OptionalGetValue").Device(DEVICE_GPU).Priority(1),
-                        OptionalGetValueOp);
-
 static Status OptionalDeviceCopy(
     const OptionalVariant& from, OptionalVariant* to,
     const UnaryVariantOpRegistry::AsyncTensorDeviceCopyFn& copy) {
@@ -159,9 +32,13 @@ static Status OptionalDeviceCopy(
     to_values.reserve(from_values.size());
     for (const Tensor& t : from_values) {
       if (DMAHelper::CanUseDMA(&t) || t.dtype() == DT_VARIANT) {
-        Tensor tmp(t.dtype());
-        TF_RETURN_IF_ERROR(copy(t, &tmp));
-        to_values.push_back(std::move(tmp));
+        // NOTE(skyewm): we're careful to make sure the lifetime of the 'to'
+        // Tensor passed to `copy` (i.e. to_values.back()) is the same as the
+        // returned 'to' OptionalVariant. This is because `copy` may spawn async
+        // callbacks that don't run until after this function returns and access
+        // the 'to' Tensor (e.g. BaseGPUDevice::MaybeCopyTensorToGPU).
+        to_values.emplace_back(t.dtype());
+        TF_RETURN_IF_ERROR(copy(t, &to_values.back()));
       } else {
         to_values.push_back(t);
       }
@@ -186,6 +63,75 @@ REGISTER_UNARY_VARIANT_DECODE_FUNCTION(OptionalVariant,
 
 }  // namespace
 
+void OptionalNoneOp::Compute(OpKernelContext* ctx) {
+  OP_REQUIRES_OK(ctx, WriteOptionalNoneToOutput(ctx, 0));
+}
+
+void OptionalFromValueOp::Compute(OpKernelContext* ctx) {
+  OpInputList components_input;
+  OP_REQUIRES_OK(ctx, ctx->input_list("components", &components_input));
+  std::vector<Tensor> components(components_input.begin(),
+                                 components_input.end());
+  OP_REQUIRES_OK(ctx,
+                 WriteOptionalWithValueToOutput(ctx, 0, std::move(components)));
+}
+
+void OptionalHasValueOp::Compute(OpKernelContext* ctx) {
+  const Tensor* optional_input;
+  OP_REQUIRES_OK(ctx, ctx->input("optional", &optional_input));
+  OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(optional_input->shape()),
+              errors::InvalidArgument(
+                  "Input to OptionalHasValue must be a scalar tensor "
+                  "containing an OptionalVariant object."));
+  const OptionalVariant* optional =
+      optional_input->scalar<Variant>()().get<OptionalVariant>();
+  OP_REQUIRES(
+      ctx, optional != nullptr,
+      errors::InvalidArgument(
+          "Input to OptionalHasValue must be an OptionalVariant object."));
+  Tensor* result;
+  OP_REQUIRES_OK(ctx, ctx->allocate_output(0, {}, &result));
+  result->scalar<bool>()() = optional->has_value();
+}
+
+void OptionalGetValueOp::Compute(OpKernelContext* ctx) {
+  const Tensor* optional_input;
+  OP_REQUIRES_OK(ctx, ctx->input("optional", &optional_input));
+  OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(optional_input->shape()),
+              errors::InvalidArgument(
+                  "Input to OptionalHasValue must be a scalar tensor "
+                  "containing an OptionalVariant object."));
+  const OptionalVariant* optional =
+      optional_input->scalar<Variant>()().get<OptionalVariant>();
+  OP_REQUIRES(
+      ctx, optional != nullptr,
+      errors::InvalidArgument(
+          "Input to OptionalHasValue must be an OptionalVariant object."));
+  OP_REQUIRES(
+      ctx, optional->has_value(),
+      errors::InvalidArgument("The given optional does not have a value."));
+  const auto& components = optional->get_values();
+  OP_REQUIRES(
+      ctx, components.size() == output_types_.size(),
+      errors::InvalidArgument("The given optional has ", components.size(),
+                              " components, expected ", output_types_.size()));
+  for (int i = 0; i < components.size(); ++i) {
+    OP_REQUIRES(ctx, components[i].dtype() == output_types_[i],
+                errors::InvalidArgument(
+                    "The given optional does not match the expected type for "
+                    "component ",
+                    i, ". Expected: ", DataTypeString(output_types_[i]),
+                    ". Actual: ", DataTypeString(components[i].dtype()), "."));
+    OP_REQUIRES(ctx, output_shapes_[i].IsCompatibleWith(components[i].shape()),
+                errors::InvalidArgument(
+                    "The given optional does not match the expected shape "
+                    "for component ",
+                    i, ". Expected: ", output_shapes_[i].DebugString(),
+                    ". Actual: ", components[i].shape().DebugString(), "."));
+    ctx->set_output(i, components[i]);
+  }
+}
+
 Status WriteOptionalWithValueToOutput(OpKernelContext* ctx, int output_index,
                                       std::vector<Tensor> value) {
   OptionalVariant v(std::move(value));
@@ -209,6 +155,33 @@ Status WriteOptionalNoneToOutput(OpKernelContext* ctx, int output_index) {
   return Status::OK();
 }
 
+namespace {
+
+REGISTER_KERNEL_BUILDER(Name("OptionalNone").Device(DEVICE_CPU).Priority(2),
+                        OptionalNoneOp);
+REGISTER_KERNEL_BUILDER(Name("OptionalNone").Device(DEVICE_GPU).Priority(1),
+                        OptionalNoneOp);
+REGISTER_KERNEL_BUILDER(
+    Name("OptionalFromValue").Device(DEVICE_CPU).Priority(2),
+    OptionalFromValueOp);
+REGISTER_KERNEL_BUILDER(
+    Name("OptionalFromValue").Device(DEVICE_GPU).Priority(1),
+    OptionalFromValueOp);
+
+REGISTER_KERNEL_BUILDER(Name("OptionalHasValue").Device(DEVICE_CPU).Priority(2),
+                        OptionalHasValueOp);
+REGISTER_KERNEL_BUILDER(Name("OptionalHasValue")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("has_value")
+                            .Priority(1),
+                        OptionalHasValueOp);
+REGISTER_KERNEL_BUILDER(Name("OptionalGetValue").Device(DEVICE_CPU).Priority(2),
+                        OptionalGetValueOp);
+REGISTER_KERNEL_BUILDER(Name("OptionalGetValue").Device(DEVICE_GPU).Priority(1),
+                        OptionalGetValueOp);
+
+}  // namespace
+
 REGISTER_UNARY_VARIANT_UNARY_OP_FUNCTION(ZEROS_LIKE_VARIANT_UNARY_OP,
                                          DEVICE_CPU, OptionalVariant,
                                          OptionalZerosLike<CPUDevice>);
@@ -217,12 +190,5 @@ REGISTER_UNARY_VARIANT_BINARY_OP_FUNCTION(ADD_VARIANT_BINARY_OP, DEVICE_CPU,
                                           OptionalVariant,
                                           OptionalBinaryAdd<CPUDevice>);
 
-Status OptionalShape(const OptionalVariant& x, TensorShape* s) {
-  *s = TensorShape({});
-  return Status::OK();
-}
-
-REGISTER_UNARY_VARIANT_SHAPE_FUNCTION(OptionalVariant, OptionalShape);
-
 }  // namespace data
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/optional_ops.h b/tensorflow/core/kernels/data/optional_ops.h
index ef14e843115da0c37d79c6be13b8064c78c072d5..24eb1b81d903b391d413cbfc9b10499c84125a40 100644
--- a/tensorflow/core/kernels/data/optional_ops.h
+++ b/tensorflow/core/kernels/data/optional_ops.h
@@ -44,8 +44,9 @@ class OptionalVariant {
 
   // Create an `OptionalVariant` with the actual value given by the tuple of
   // tensors in `values`.
-  explicit OptionalVariant(std::vector<Tensor> values)
-      : values_(new std::vector<Tensor>(std::move(values))) {}
+  explicit OptionalVariant(std::vector<Tensor> values) {
+    values_ = std::make_shared<std::vector<Tensor>>(std::move(values));
+  }
 
   OptionalVariant(const OptionalVariant& other) : values_(other.values_) {}
 
@@ -79,7 +80,7 @@ class OptionalVariant {
       return false;
     }
     if (has_value) {
-      values_.reset(new std::vector<Tensor>(data.tensors()));
+      values_ = std::make_shared<std::vector<Tensor>>(data.tensors());
     } else {
       values_.reset();
     }
@@ -151,6 +152,47 @@ Status OptionalBinaryAdd(OpKernelContext* ctx, const OptionalVariant& a,
   return Status::OK();
 }
 
+class OptionalNoneOp : public OpKernel {
+ public:
+  explicit OptionalNoneOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override;
+};
+
+class OptionalFromValueOp : public OpKernel {
+ public:
+  explicit OptionalFromValueOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override;
+};
+
+class OptionalHasValueOp : public OpKernel {
+ public:
+  explicit OptionalHasValueOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override;
+};
+
+class OptionalGetValueOp : public OpKernel {
+ public:
+  explicit OptionalGetValueOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+    OP_REQUIRES(
+        ctx, output_shapes_.size() == output_types_.size(),
+        errors::InvalidArgument(
+            "output_types and output_shapes must be same length, got:\n",
+            "output_types: ", output_types_.size(), "\n",
+            "output_shapes: ", output_shapes_.size()));
+  }
+
+  void Compute(OpKernelContext* ctx) override;
+
+ private:
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
+};
+
 }  // namespace data
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/data/padded_batch_dataset_op.cc b/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
index 0fff4c53706269538f770889744e21fffcae3601..41ea36263c7e6a8bc0190d84247612e591f9d1b9 100644
--- a/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
@@ -135,8 +135,8 @@ class PaddedBatchDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::PaddedBatch")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::PaddedBatch")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
index 2f6d91e863401ca4cc56187a9423ae406b5f651a..ddd81d4596ee216c1abd6a17ec94d86c3d41e18c 100644
--- a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
@@ -14,24 +14,29 @@ limitations under the License.
 ==============================================================================*/
 #include <atomic>
 #include <deque>
+#include <memory>
 #include <utility>
 
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/metrics.h"
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/stats_aggregator.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/kernels/data/captured_function.h"
 #include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/random/random.h"
-#include "tensorflow/core/util/ptr_util.h"
+#include "tensorflow/core/platform/cpu_info.h"
 
 namespace tensorflow {
 namespace data {
 namespace {
 
+constexpr char kDatasetName[] = "ParallelInterleaveV2";
+
 // See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
@@ -43,12 +48,7 @@ namespace {
 //
 // Furthermore, this class favors modularity over extended functionality. In
 // particular, it refrains from implementing configurable buffering of output
-// elements and prefetching of input iterators, relying on other parts of
-// tf.data to provide this functionality if necessary.
-//
-// The above design choices were made with automated optimizations in mind,
-// isolating the degree of parallelism as the single tunable knob of this
-// implementation.
+// elements and prefetching of input iterators.
 class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
  public:
   explicit ParallelInterleaveDatasetOp(OpKernelConstruction* ctx)
@@ -76,9 +76,10 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
     int64 num_parallel_calls;
     OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "num_parallel_calls",
                                             &num_parallel_calls));
-    OP_REQUIRES(ctx, num_parallel_calls > 0 || num_parallel_calls == kAutoTune,
-                errors::InvalidArgument(
-                    "num_parallel_calls must be greater than zero."));
+    OP_REQUIRES(
+        ctx, num_parallel_calls > 0 || num_parallel_calls == model::kAutoTune,
+        errors::InvalidArgument(
+            "num_parallel_calls must be greater than zero."));
     OP_REQUIRES(
         ctx, num_parallel_calls <= cycle_length,
         errors::InvalidArgument(
@@ -89,6 +90,10 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
         ctx, CapturedFunction::Create(interleave_func_, ctx, "other_arguments",
                                       &captured_func));
 
+    if (num_parallel_calls == model::kAutoTune) {
+      metrics::RecordTFDataAutotune(kDatasetName);
+    }
+
     *output =
         new Dataset(ctx, input, interleave_func_, std::move(captured_func),
                     cycle_length, block_length, num_parallel_calls, sloppy_,
@@ -121,9 +126,9 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return MakeUnique<ParallelInterleaveIterator>(
+      return absl::make_unique<ParallelInterleaveIterator>(
           ParallelInterleaveIterator::Params{
-              this, strings::StrCat(prefix, "::ParallelInterleaveV2")},
+              this, strings::StrCat(prefix, "::", kDatasetName)},
           sloppy_);
     }
 
@@ -159,7 +164,13 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
       other_arguments.reserve(captured_func_->captured_inputs().size());
       for (const Tensor& t : captured_func_->captured_inputs()) {
         Node* node;
-        TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        DatasetBase* input;
+        Status s = GetDatasetFromVariantTensor(t, &input);
+        if (s.ok()) {
+          TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input, &node));
+        } else {
+          TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        }
         other_arguments.emplace_back(node);
         other_arguments_types.emplace_back(t.dtype());
       }
@@ -187,24 +198,22 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
    private:
     class ParallelInterleaveIterator : public DatasetIterator<Dataset> {
      public:
-      explicit ParallelInterleaveIterator(const Params& params, bool sloppy)
+      ParallelInterleaveIterator(const Params& params, bool sloppy)
           : DatasetIterator<Dataset>(params),
             mu_(std::make_shared<mutex>()),
             cond_var_(std::make_shared<condition_variable>()),
             num_parallel_calls_(std::make_shared<model::SharedState>(
                 params.dataset->num_parallel_calls_, mu_, cond_var_)),
             sloppy_(sloppy),
-            args_list_(params.dataset->cycle_length_),
             current_elements_(params.dataset->cycle_length_),
-            element_in_use_(params.dataset->cycle_length_, false),
-            thread_pool_(new thread::ThreadPool(
+            thread_pool_(absl::make_unique<thread::ThreadPool>(
                 Env::Default(), ThreadOptions(),
                 "data_parallel_interleave_worker_pool",
-                dataset()->cycle_length_ /* num_threads */,
+                port::NumSchedulableCPUs() /* num_threads */,
                 false /* low_latency_hint */)) {
         std::vector<string> components =
             str_util::Split(params.prefix, "::", str_util::SkipEmpty());
-        prefix_end_ = components.back();
+        key_prefix_ = components.back();
       }
 
       ~ParallelInterleaveIterator() override {
@@ -220,9 +229,8 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
 
       Status Initialize(IteratorContext* ctx) override {
         mutex_lock l(*mu_);
-        if (num_parallel_calls_->value == kAutoTune) {
+        if (num_parallel_calls_->value == model::kAutoTune) {
           num_parallel_calls_->value = dataset()->cycle_length_;
-          num_parallel_calls_->tunable = true;
         }
         TF_RETURN_IF_ERROR(
             dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
@@ -233,27 +241,20 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
                              bool* end_of_sequence) override {
-        std::shared_ptr<InvocationResult> result;
-        do {
-          result.reset();
-          {
-            mutex_lock l(*mu_);
-            EnsureRunnerThreadStarted(ctx);
-            while (ShouldWait(&result)) {
-              RecordStop(ctx);
-              cond_var_->wait(l);
-              RecordStart(ctx);
-            }
-            if (!result) {
-              *end_of_sequence = true;
-              return Status::OK();
-            }
+        std::shared_ptr<Result> result;
+        {
+          mutex_lock l(*mu_);
+          EnsureThreadsStarted(ctx);
+          while (!Consume(&result)) {
+            RecordStop(ctx);
+            cond_var_->wait(l);
+            RecordStart(ctx);
           }
-          RecordStop(ctx);
-          result->notification.WaitForNotification();
-          RecordStart(ctx);
-        } while (result->skip);
-
+        }
+        if (!result) {
+          *end_of_sequence = true;
+          return Status::OK();
+        }
         if (result->status.ok()) {
           *out_tensors = std::move(result->return_values);
           RecordBufferDequeue(ctx, *out_tensors);
@@ -268,7 +269,7 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
         return model::MakeAsyncInterleaveManyNode(
             std::move(args),
             {model::MakeParameter("parallelism", num_parallel_calls_, /*min=*/1,
-                                  /*max=*/dataset()->cycle_length_)});
+                                  /*max=*/port::NumSchedulableCPUs())});
       }
 
       Status SaveInternal(IteratorStateWriter* writer) override {
@@ -277,37 +278,22 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
         while (num_calls_ > 0) {
           cond_var_->wait(l);
         }
-        CHECK_EQ(num_calls_, 0);
+        DCHECK_EQ(num_calls_, 0);
         TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
-        TF_RETURN_IF_ERROR(writer->WriteScalar(
-            full_name("invocation_results.size"), invocation_results_.size()));
-        for (size_t i = 0; i < invocation_results_.size(); i++) {
-          std::shared_ptr<InvocationResult> result = invocation_results_[i];
-          TF_RETURN_IF_ERROR(WriteStatusLocked(writer, i, result->status));
-          TF_RETURN_IF_ERROR(writer->WriteScalar(
-              full_name(strings::StrCat("invocation_results[", i, "].size")),
-              result->return_values.size()));
-          for (size_t j = 0; j < result->return_values.size(); j++) {
-            TF_RETURN_IF_ERROR(writer->WriteTensor(
-                full_name(
-                    strings::StrCat("invocation_results[", i, "][", j, "]")),
-                result->return_values[j]));
-          }
-          if (result->skip) {
-            TF_RETURN_IF_ERROR(writer->WriteScalar(
-                full_name(strings::StrCat("invocation_results[", i, "].skip")),
-                ""));
-          }
-        }
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("block_index"), block_index_));
         TF_RETURN_IF_ERROR(
             writer->WriteScalar(full_name("cycle_index"), cycle_index_));
         if (end_of_input_) {
           TF_RETURN_IF_ERROR(
               writer->WriteScalar(full_name("end_of_input"), ""));
         }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("element_id_counter"),
+                                               element_id_counter_));
         TF_RETURN_IF_ERROR(
             writer->WriteScalar(full_name("num_open"), num_open_));
         TF_RETURN_IF_ERROR(WriteCurrentElements(writer));
+        TF_RETURN_IF_ERROR(WriteFutureElements(writer));
         return Status::OK();
       }
 
@@ -315,258 +301,393 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
                              IteratorStateReader* reader) override {
         mutex_lock l(*mu_);
         TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
-        int64 invocation_results_size;
-        TF_RETURN_IF_ERROR(reader->ReadScalar(
-            full_name("invocation_results.size"), &invocation_results_size));
-        for (size_t i = 0; i < invocation_results_size; i++) {
-          std::shared_ptr<InvocationResult> result(new InvocationResult());
-          invocation_results_.push_back(result);
-          TF_RETURN_IF_ERROR(ReadStatusLocked(reader, i, &result->status));
-          size_t num_return_values;
-          {
-            int64 size;
-            TF_RETURN_IF_ERROR(reader->ReadScalar(
-                full_name(strings::StrCat("invocation_results[", i, "].size")),
-                &size));
-            num_return_values = static_cast<size_t>(size);
-            if (num_return_values != size) {
-              return errors::InvalidArgument(strings::StrCat(
-                  full_name(
-                      strings::StrCat("invocation_results[", i, "].size")),
-                  ": ", size, " is not a valid value of type size_t."));
-            }
-          }
-          result->return_values.reserve(num_return_values);
-          for (size_t j = 0; j < num_return_values; j++) {
-            result->return_values.emplace_back();
-            TF_RETURN_IF_ERROR(
-                reader->ReadTensor(full_name(strings::StrCat(
-                                       "invocation_results[", i, "][", j, "]")),
-                                   &result->return_values.back()));
-          }
-          result->skip = reader->Contains(
-              full_name(strings::StrCat("invocation_results[", i, "].skip")));
-          result->notification.Notify();
-        }
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("block_index"), &block_index_));
         TF_RETURN_IF_ERROR(
             reader->ReadScalar(full_name("cycle_index"), &cycle_index_));
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("element_id_counter"),
+                                              &element_id_counter_));
         if (reader->Contains(full_name("end_of_input"))) end_of_input_ = true;
         TF_RETURN_IF_ERROR(
             reader->ReadScalar(full_name("num_open"), &num_open_));
         TF_RETURN_IF_ERROR(ReadCurrentElements(ctx, reader));
+        TF_RETURN_IF_ERROR(ReadFutureElements(ctx, reader));
         return Status::OK();
       }
 
      private:
-      struct InvocationResult {
-        Notification notification;  // used for coordination with the consumer
-        Status status;              // the invocation status
-        std::vector<Tensor> return_values;  // the invocation result values
-        bool skip;  // if set the result should be skipped
+      // Represents the result of fetching an element from a dataset.
+      struct Result {
+        Status status;
+        std::vector<Tensor> return_values;
+        // Indicates whether the result is ready to be consumed.
+        bool is_ready = false;
       };
 
-      void EnsureRunnerThreadStarted(IteratorContext* ctx)
+      // The interleave transformation repeatedly inputs elements, applies the
+      // user-provided function to transform the input elements to datasets, and
+      // interleaves the elements of these datasets as its output.
+      //
+      // This structure represents an input element and derived state.
+      struct Element {
+        // Unique identifier, needed to support checkpointing.
+        int64 id;
+        // The actual input element.
+        std::vector<Tensor> inputs;
+        // Iterator created from the input element.
+        std::unique_ptr<IteratorBase> iterator;
+        mutex mu;
+        // Buffer for storing the outputs of `iterator`.
+        std::deque<std::shared_ptr<Result>> results GUARDED_BY(mu);
+        // Indicates whether the element is used by a worker thread.
+        bool in_use = false;
+      };
+
+      // Advances the position in the interleave cycle to the next cycle
+      // element.
+      void AdvanceToNextInCycle() EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
+        block_index_ = 0;
+        cycle_index_ = (cycle_index_ + 1) % dataset()->cycle_length_;
+      }
+
+      // Advances the position in the interleave cycle by one.
+      void AdvancePosition() EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
+        ++block_index_;
+        if (block_index_ == dataset()->block_length_) {
+          AdvanceToNextInCycle();
+        }
+      }
+
+      // Consumes a result (if available), returning an indication of whether
+      // a result is available. If `true` is returned, `result` either
+      // points to a valid result or is null if end of input has been reached.
+      bool Consume(std::shared_ptr<Result>* result)
           EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
-        if (!runner_thread_) {
-          std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
-          runner_thread_.reset(ctx->env()->StartThread(
-              {}, "tf_data_parallel_interleave_runner",
-              [this, new_ctx]() { RunnerThread(new_ctx); }));
+        if (!sloppy_) {
+          return ConsumeHelper(result);
+        }
+        // If we are allowed to be sloppy (i.e. return results out of order),
+        // try to find an element in the cycle that has a result available.
+        for (int i = 0; i < dataset()->cycle_length_; ++i) {
+          if (ConsumeHelper(result)) {
+            return true;
+          }
+          AdvanceToNextInCycle();
         }
+        return false;
       }
 
-      // Fetches up to `results.size()` outputs from the cycle element at
-      // position `cycle_index`.
+      bool ConsumeHelper(std::shared_ptr<Result>* result)
+          EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
+        while (true) {
+          std::shared_ptr<Element> element = current_elements_[cycle_index_];
+          if (element) {
+            mutex_lock l(element->mu);
+            if (!element->results.empty()) {
+              if (element->results.front()->is_ready) {
+                // We found a result.
+                std::swap(*result, element->results.front());
+                element->results.pop_front();
+                AdvancePosition();
+                cond_var_->notify_all();
+                return true;
+              } else {
+                // Wait for the result to become ready.
+                return false;
+              }
+            } else if (!element->iterator) {
+              // We reached the end of input for this element. Reset
+              // it and move on to the next cycle element.
+              current_elements_[cycle_index_].reset();
+              AdvanceToNextInCycle();
+              cond_var_->notify_all();
+              continue;
+            } else {
+              // Wait for the iterator to produce a result.
+              return false;
+            }
+          } else {
+            if (!future_elements_.empty() || !end_of_input_) {
+              // Wait for an element to be created.
+              return false;
+            }
+            // No new elements will be created; try to find a
+            // non-empty element in the cycle.
+            for (int i = 0; i < dataset()->cycle_length_; ++i) {
+              AdvanceToNextInCycle();
+              if (current_elements_[cycle_index_]) {
+                break;
+              }
+            }
+            if (current_elements_[cycle_index_]) {
+              continue;
+            }
+            // End of input has been reached.
+            return true;
+          }
+        }
+      }
+
+      // Manages current cycle elements, creating new iterators as needed and
+      // asynchronously fetching results from existing iterators.
       //
-      // If end of input is encountered, the `skip` field of the invocation
-      // result is used to identify results that should be skipped.
-      void FetchOutputs(
-          const std::shared_ptr<IteratorContext>& ctx, int64 cycle_index,
-          const std::vector<std::shared_ptr<InvocationResult>>& results)
-          LOCKS_EXCLUDED(*mu_) {
+      // This method runs in the `current_elements_manager_` background thread.
+      void CurrentElementsManager(const std::shared_ptr<IteratorContext>& ctx) {
         RecordStart(ctx.get());
         auto cleanup = gtl::MakeCleanup([this, ctx] { RecordStop(ctx.get()); });
-        bool end_of_input = false;
-        for (auto& result : results) {
-          if (!end_of_input) {
-            result->status = current_elements_[cycle_index]->GetNext(
-                ctx.get(), &result->return_values, &end_of_input);
+        auto busy = [this]() EXCLUSIVE_LOCKS_REQUIRED(*mu_) -> bool {
+          const bool has_more_elements =
+              !future_elements_.empty() || !end_of_input_;
+          const int block_length = dataset()->block_length_;
+          bool all_elements_busy = true;
+          for (auto& element : current_elements_) {
+            if (!element) {
+              if (has_more_elements) {
+                all_elements_busy = false;
+                break;
+              }
+            } else {
+              mutex_lock l(element->mu);
+              if (!element->in_use && element->iterator &&
+                  element->results.size() < block_length) {
+                all_elements_busy = false;
+                break;
+              }
+            }
           }
-          if (end_of_input) {
-            result->skip = true;
+          return all_elements_busy || num_calls_ >= num_parallel_calls_->value;
+        };
+        while (true) {
+          mutex_lock l(*mu_);
+
+          // Wait until this thread is cancelled, the end of input has been
+          // reached.
+          while (!cancelled_ && (!end_of_input_ || num_open_ > 0) && busy()) {
+            RecordStop(ctx.get());
+            cond_var_->wait(l);
+            RecordStart(ctx.get());
           }
-          RecordBufferEnqueue(ctx.get(), result->return_values);
-          {
-            mutex_lock l(*mu_);
-            result->notification.Notify();
-            cond_var_->notify_all();
+
+          if (cancelled_ ||
+              (future_elements_.empty() && end_of_input_ && num_open_ == 0)) {
+            return;
+          }
+
+          for (int i = 0; i < dataset()->cycle_length_; ++i) {
+            int idx = (cycle_index_ + i) % dataset()->cycle_length_;
+            if (!current_elements_[idx]) {
+              if (!future_elements_.empty()) {
+                current_elements_[idx] = std::move(future_elements_.back());
+                future_elements_.pop_back();
+              } else {
+                current_elements_[idx] = MakeElement(ctx);
+                if (!current_elements_[idx]) {
+                  continue;
+                }
+              }
+            }
+            std::shared_ptr<Element> element = current_elements_[idx];
+            if (!element->in_use && element->iterator) {
+              int64 num_results;
+              {
+                mutex_lock l(element->mu);
+                num_results =
+                    dataset()->block_length_ - element->results.size();
+              }
+              if (num_results > 0) {
+                num_calls_++;
+                element->in_use = true;
+                thread_pool_->Schedule(
+                    std::bind(&ParallelInterleaveIterator::FetchResults, this,
+                              ctx, std::move(element), num_results));
+              }
+            }
+          }
+          const auto& stats_aggregator = ctx->stats_aggregator();
+          if (stats_aggregator) {
+            stats_aggregator->AddScalar(
+                strings::StrCat(key_prefix_, "::thread_utilization"),
+                static_cast<float>(num_calls_) /
+                    static_cast<float>(num_parallel_calls_->value));
           }
-          if (!result->status.ok()) {
+          cond_var_->notify_all();
+        }
+      }
+
+      void EnsureThreadsStarted(IteratorContext* ctx)
+          EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
+        if (!current_elements_manager_) {
+          auto new_ctx = std::make_shared<IteratorContext>(*ctx);
+          current_elements_manager_ =
+              absl::WrapUnique<Thread>(ctx->env()->StartThread(
+                  {}, "tf_data_parallel_interleave_current",
+                  [this, new_ctx]() { CurrentElementsManager(new_ctx); }));
+        }
+        if (!future_elements_manager_) {
+          auto new_ctx = std::make_shared<IteratorContext>(*ctx);
+          future_elements_manager_ =
+              absl::WrapUnique<Thread>(ctx->env()->StartThread(
+                  {}, "tf_data_parallel_interleave_future",
+                  [this, new_ctx]() { FutureElementsManager(new_ctx); }));
+        }
+      }
+
+      // Fetches up to `dataset()->block_length_` results from `element`.
+      void FetchResults(const std::shared_ptr<IteratorContext>& ctx,
+                        const std::shared_ptr<Element>& element,
+                        int64 num_results) LOCKS_EXCLUDED(*mu_) {
+        RecordStart(ctx.get());
+        auto cleanup = gtl::MakeCleanup([this, ctx] { RecordStop(ctx.get()); });
+        bool end_of_input = false;
+        for (int64 i = 0; i < num_results; ++i) {
+          auto result = std::make_shared<Result>();
+          result->status = element->iterator->GetNext(
+              ctx.get(), &result->return_values, &end_of_input);
+          if (end_of_input) {
             break;
           }
+          RecordBufferEnqueue(ctx.get(), result->return_values);
+          mutex_lock l(*mu_);
+          mutex_lock l2(element->mu);
+          element->results.push_back(result);
+          result->is_ready = true;
+          cond_var_->notify_all();
         }
 
-        // Release the ownership of the cycle element iterator, closing the
-        // iterator if end of input was encountered.
+        mutex_lock l(*mu_);
+        // Release the ownership of the cycle element iterator.
+        element->in_use = false;
         if (end_of_input) {
-          current_elements_[cycle_index].reset();
+          // Close the iterator if end of input was encountered.
+          element->iterator.reset();
+          element->inputs.clear();
+          --num_open_;
         }
-        mutex_lock l(*mu_);
-        element_in_use_[cycle_index] = false;
-        num_calls_--;
+        --num_calls_;
         const auto& stats_aggregator = ctx->stats_aggregator();
         if (stats_aggregator) {
           stats_aggregator->AddScalar(
-              strings::StrCat(prefix_end_, "::active_parallel_calls"),
-              static_cast<float>(num_calls_));
-        }
-        if (end_of_input) {
-          args_list_[cycle_index].clear();
-          num_open_--;
+              strings::StrCat(key_prefix_, "::thread_utilization"),
+              static_cast<float>(num_calls_) /
+                  static_cast<float>(num_parallel_calls_->value));
         }
         cond_var_->notify_all();
       }
 
-      // Method responsible for 1) creating iterators out of input elements, 2)
-      // determining the order in which elements are fetched from the iterators,
-      // and 3) scheduling the fetching of the elements to a threadpool.
+      // Manages futures cycle elements, creating new iterators as needed and
+      // asynchronously fetching results from existing iterators.
       //
-      // This method runs in the `runner_thread` background thread.
-      void RunnerThread(const std::shared_ptr<IteratorContext>& ctx) {
+      // This method runs in the `future_elements_manager_` background thread.
+      void FutureElementsManager(const std::shared_ptr<IteratorContext>& ctx) {
         RecordStart(ctx.get());
         auto cleanup = gtl::MakeCleanup([this, ctx] { RecordStop(ctx.get()); });
         auto busy = [this]() EXCLUSIVE_LOCKS_REQUIRED(*mu_) -> bool {
-          return element_in_use_[cycle_index_] ||
-                 num_calls_ >= num_parallel_calls_->value ||
-                 invocation_results_.size() >=
-                     dataset()->cycle_length_ * dataset()->block_length_;
+          // TODO(jsimsa): Autotune the buffer size.
+          return num_calls_ >= num_parallel_calls_->value ||
+                 future_elements_.size() >= 2 * dataset()->cycle_length_;
         };
         while (true) {
           mutex_lock l(*mu_);
+
           // Wait until this thread is cancelled, the end of input has been
           // reached, or the cycle element at the `cycle_index_` position is
-          // not in use and there is space in the `invocation_results_` queue.
-          while (!cancelled_ && (!end_of_input_ || num_open_ > 0) && busy()) {
+          // not in use.
+          while (!cancelled_ && !end_of_input_ && busy()) {
             RecordStop(ctx.get());
             cond_var_->wait(l);
             RecordStart(ctx.get());
           }
 
-          if (cancelled_ || (end_of_input_ && num_open_ == 0)) {
+          if (cancelled_ || end_of_input_) {
             return;
           }
 
-          while ((!end_of_input_ || num_open_ > 0) && !busy()) {
-            if (!current_elements_[cycle_index_]) {
-              // Try to create a new iterator from the next input element.
-              Status status = input_impl_->GetNext(
-                  ctx.get(), &args_list_[cycle_index_], &end_of_input_);
-              if (!status.ok()) {
-                invocation_results_.emplace_back(new InvocationResult());
-                std::shared_ptr<InvocationResult>& result =
-                    invocation_results_.back();
-                result->status.Update(status);
-                result->notification.Notify();
-                break;
-              }
-              if (!end_of_input_) {
-                Status status = MakeIteratorFromInputElement(
-                    ctx.get(), args_list_[cycle_index_], cycle_index_,
-                    *instantiated_captured_func_, prefix(),
-                    &current_elements_[cycle_index_]);
-                if (!status.ok()) {
-                  invocation_results_.emplace_back(new InvocationResult());
-                  std::shared_ptr<InvocationResult>& result =
-                      invocation_results_.back();
-                  result->status.Update(status);
-                  result->notification.Notify();
-                  break;
-                }
-                ++num_open_;
-              }
+          while (!end_of_input_ && !busy()) {
+            std::shared_ptr<Element> element = MakeElement(ctx);
+            if (!element) {
+              break;
             }
-            if (current_elements_[cycle_index_]) {
-              // Pre-allocate invocation results for outputs to be fetched
-              // and then fetch the outputs asynchronously.
-              std::vector<std::shared_ptr<InvocationResult>> results;
-              results.reserve(dataset()->block_length_);
-              for (int i = 0; i < dataset()->block_length_; ++i) {
-                invocation_results_.emplace_back(new InvocationResult());
-                results.push_back(invocation_results_.back());
-              }
-              num_calls_++;
-              element_in_use_[cycle_index_] = true;
-              thread_pool_->Schedule(
-                  std::bind(&ParallelInterleaveIterator::FetchOutputs, this,
-                            ctx, cycle_index_, std::move(results)));
+            future_elements_.push_front(element);
+            if (!element->iterator) {
+              continue;
             }
-            cycle_index_ = (cycle_index_ + 1) % dataset()->cycle_length_;
+            ++num_calls_;
+            element->in_use = true;
+            thread_pool_->Schedule(
+                std::bind(&ParallelInterleaveIterator::FetchResults, this, ctx,
+                          std::move(element), dataset()->block_length_));
           }
           const auto& stats_aggregator = ctx->stats_aggregator();
           if (stats_aggregator) {
-            // TODO(shivaniagrawal): add `parallel_calls_utilization` in the
-            // monitoring code or as histogram at fixed time intervals.
             stats_aggregator->AddScalar(
-                strings::StrCat(prefix_end_, "::active_parallel_calls"),
-                static_cast<float>(num_calls_));
-            stats_aggregator->AddScalar(
-                strings::StrCat(prefix_end_, "::num_parallel_calls"),
-                static_cast<float>(num_parallel_calls_->value));
+                strings::StrCat(key_prefix_, "::thread_utilization"),
+                static_cast<float>(num_calls_) /
+                    static_cast<float>(num_parallel_calls_->value));
           }
           cond_var_->notify_all();
         }
       }
 
-      // Determines whether the caller needs to wait for a result. Upon
-      // returning false, `result` will either be NULL if end of input has been
-      // reached or point to the result.
-      bool ShouldWait(std::shared_ptr<InvocationResult>* result)
+      // Creates a new element.
+      std::shared_ptr<Element> MakeElement(
+          const std::shared_ptr<IteratorContext>& ctx)
           EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
-        if (sloppy_) {
-          for (auto it = invocation_results_.begin();
-               it != invocation_results_.end(); ++it) {
-            if ((*it)->notification.HasBeenNotified()) {
-              std::swap(*result, *it);
-              invocation_results_.erase(it);
-              cond_var_->notify_all();
-              return false;
-            }
+        auto element = std::make_shared<Element>();
+        element->id = element_id_counter_++;
+        Status status =
+            input_impl_->GetNext(ctx.get(), &element->inputs, &end_of_input_);
+        if (!status.ok()) {
+          auto result = std::make_shared<Result>();
+          result->is_ready = true;
+          result->status = status;
+          mutex_lock l(element->mu);
+          element->results.push_back(std::move(result));
+          return element;
+        }
+        if (!end_of_input_) {
+          Status status = MakeIteratorFromInputElement(
+              ctx.get(), element->inputs, element->id,
+              *instantiated_captured_func_, prefix(), &element->iterator);
+          if (!status.ok()) {
+            auto result = std::make_shared<Result>();
+            result->is_ready = true;
+            result->status = status;
+            mutex_lock l(element->mu);
+            element->results.push_back(std::move(result));
+            return element;
           }
-          return !invocation_results_.empty() ||
-                 (!end_of_input_ || num_open_ > 0);
+          ++num_open_;
         } else {
-          if (!invocation_results_.empty()) {
-            std::swap(*result, invocation_results_.front());
-            invocation_results_.pop_front();
-            cond_var_->notify_all();
-            return false;
-          }
-          return (!end_of_input_ || num_open_ > 0);
+          element.reset();
         }
+        return element;
       }
 
-      Status WriteStatusLocked(IteratorStateWriter* writer, size_t index,
+      Status WriteStatusLocked(IteratorStateWriter* writer,
+                               const string& key_prefix, size_t idx,
                                const Status& status)
           EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
         TF_RETURN_IF_ERROR(writer->WriteScalar(
-            CodeKey(index), static_cast<int64>(status.code())));
+            CodeKey(key_prefix, idx), static_cast<int64>(status.code())));
         if (!status.ok()) {
-          TF_RETURN_IF_ERROR(writer->WriteScalar(ErrorMessageKey(index),
-                                                 status.error_message()));
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              ErrorMessageKey(key_prefix, idx), status.error_message()));
         }
         return Status::OK();
       }
 
-      Status ReadStatusLocked(IteratorStateReader* reader, size_t index,
+      Status ReadStatusLocked(IteratorStateReader* reader,
+                              const string& key_prefix, size_t idx,
                               Status* status) EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
         int64 code_int;
-        TF_RETURN_IF_ERROR(reader->ReadScalar(CodeKey(index), &code_int));
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(CodeKey(key_prefix, idx), &code_int));
         error::Code code = static_cast<error::Code>(code_int);
 
         if (code != error::Code::OK) {
           string error_message;
-          TF_RETURN_IF_ERROR(
-              reader->ReadScalar(ErrorMessageKey(index), &error_message));
+          TF_RETURN_IF_ERROR(reader->ReadScalar(
+              ErrorMessageKey(key_prefix, idx), &error_message));
           *status = Status(code, error_message);
         } else {
           *status = Status::OK();
@@ -574,58 +695,178 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
         return Status::OK();
       }
 
-      string CodeKey(size_t index) {
+      string CodeKey(const string& key_prefix, size_t idx) {
         return full_name(
-            strings::StrCat("invocation_results[", index, "].code"));
+            strings::StrCat(key_prefix, ".results[", idx, "].code"));
       }
 
-      string ErrorMessageKey(size_t index) {
+      string ErrorMessageKey(const string& key_prefix, size_t idx) {
         return full_name(
-            strings::StrCat("invocation_results[", index, "].error_message"));
+            strings::StrCat(key_prefix, ".results[", idx, "].error_message"));
+      }
+
+      Status WriteElement(std::shared_ptr<Element> element, int idx,
+                          const string& key_prefix, IteratorStateWriter* writer)
+          EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
+        if (element->iterator) {
+          TF_RETURN_IF_ERROR(SaveInput(writer, element->iterator));
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat(key_prefix, "[", idx, "].id")),
+              element->id));
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat(key_prefix, "[", idx, "].inputs.size")),
+              element->inputs.size()));
+          for (int i = 0; i < element->inputs.size(); i++) {
+            TF_RETURN_IF_ERROR(writer->WriteTensor(
+                full_name(
+                    strings::StrCat(key_prefix, "[", idx, "].inputs[", i, "]")),
+                element->inputs[i]));
+          }
+        }
+        mutex_lock l(element->mu);
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name(strings::StrCat(key_prefix, "[", idx, "].results.size")),
+            element->results.size()));
+        for (size_t i = 0; i < element->results.size(); i++) {
+          std::shared_ptr<Result> result = element->results[i];
+          TF_RETURN_IF_ERROR(WriteStatusLocked(
+              writer, strings::StrCat(key_prefix, "[", idx, "]"), i,
+              result->status));
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat(key_prefix, "[", idx, "].results[", i,
+                                        "].size")),
+              result->return_values.size()));
+          for (size_t j = 0; j < result->return_values.size(); j++) {
+            TF_RETURN_IF_ERROR(writer->WriteTensor(
+                full_name(strings::StrCat(key_prefix, "[", idx, "].results[", i,
+                                          "][", j, "]")),
+                result->return_values[j]));
+          }
+          if (result->is_ready) {
+            TF_RETURN_IF_ERROR(writer->WriteScalar(
+                full_name(strings::StrCat(key_prefix, "[", idx, "].results[", i,
+                                          "].is_ready")),
+                ""));
+          }
+        }
+        return Status::OK();
       }
 
       Status WriteCurrentElements(IteratorStateWriter* writer)
           EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name("current_elements.size"), current_elements_.size()));
         for (int idx = 0; idx < current_elements_.size(); idx++) {
           if (current_elements_[idx]) {
-            TF_RETURN_IF_ERROR(SaveInput(writer, current_elements_[idx]));
-            TF_RETURN_IF_ERROR(writer->WriteScalar(
-                full_name(strings::StrCat("args_size[", idx, "]")),
-                args_list_[idx].size()));
-            for (int i = 0; i < args_list_[idx].size(); i++) {
-              TF_RETURN_IF_ERROR(writer->WriteTensor(
-                  full_name(strings::StrCat("args_list_[", idx, "][", i, "]")),
-                  args_list_[idx][i]));
-            }
+            TF_RETURN_IF_ERROR(WriteElement(current_elements_[idx], idx,
+                                            "current_elements", writer));
+          }
+        }
+        return Status::OK();
+      }
+
+      Status WriteFutureElements(IteratorStateWriter* writer)
+          EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name("future_elements.size"), future_elements_.size()));
+        for (int idx = 0; idx < future_elements_.size(); idx++) {
+          if (future_elements_[idx]) {
+            TF_RETURN_IF_ERROR(WriteElement(future_elements_[idx], idx,
+                                            "future_elements", writer));
+          }
+        }
+        return Status::OK();
+      }
+
+      Status ReadElement(IteratorContext* ctx, IteratorStateReader* reader,
+                         int idx, const string& key_prefix,
+                         std::shared_ptr<Element>* out)
+          EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
+        if (!reader->Contains(full_name(
+                strings::StrCat(key_prefix, "[", idx, "].results.size")))) {
+          return Status::OK();
+        }
+        auto element = std::make_shared<Element>();
+        mutex_lock l(element->mu);
+        int64 results_size;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name(strings::StrCat(key_prefix, "[", idx, "].results.size")),
+            &results_size));
+        element->results.resize(results_size);
+        for (size_t i = 0; i < results_size; i++) {
+          auto result = std::make_shared<Result>();
+          TF_RETURN_IF_ERROR(ReadStatusLocked(
+              reader, strings::StrCat(key_prefix, "[", idx, "]"), i,
+              &result->status));
+          int64 num_return_values;
+          TF_RETURN_IF_ERROR(reader->ReadScalar(
+              full_name(strings::StrCat(key_prefix, "[", idx, "].results[", i,
+                                        "].size")),
+              &num_return_values));
+          result->return_values.reserve(num_return_values);
+          for (size_t j = 0; j < num_return_values; j++) {
+            result->return_values.emplace_back();
+            TF_RETURN_IF_ERROR(reader->ReadTensor(
+                full_name(strings::StrCat(key_prefix, "[", idx, "].results[", i,
+                                          "][", j, "]")),
+                &result->return_values.back()));
           }
+          result->is_ready = reader->Contains(full_name(strings::StrCat(
+              key_prefix, "[", idx, "].results[", i, "].is_ready")));
+          element->results[i] = std::move(result);
+        }
+        if (!reader->Contains(full_name(
+                strings::StrCat(key_prefix, "[", idx, "].inputs.size")))) {
+          element->iterator.reset();
+          *out = std::move(element);
+          return Status::OK();
+        }
+        int64 inputs_size;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name(strings::StrCat(key_prefix, "[", idx, "].inputs.size")),
+            &inputs_size));
+        element->inputs.resize(inputs_size);
+        for (int i = 0; i < inputs_size; i++) {
+          TF_RETURN_IF_ERROR(reader->ReadTensor(
+              full_name(
+                  strings::StrCat(key_prefix, "[", idx, "].inputs[", i, "]")),
+              &element->inputs[i]));
         }
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name(strings::StrCat(key_prefix, "[", idx, "].id")),
+            &element->id));
+        TF_RETURN_IF_ERROR(MakeIteratorFromInputElement(
+            ctx, element->inputs, element->id,
+            *instantiated_captured_func_.get(), prefix(), &element->iterator));
+        TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, element->iterator));
+        *out = std::move(element);
         return Status::OK();
       }
 
       Status ReadCurrentElements(IteratorContext* ctx,
                                  IteratorStateReader* reader)
           EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
+        int64 size;
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("current_elements.size"), &size));
+        DCHECK_EQ(current_elements_.size(), size);
         for (int idx = 0; idx < current_elements_.size(); idx++) {
-          if (reader->Contains(
-                  full_name(strings::StrCat("args_size[", idx, "]")))) {
-            int64 args_size;
-            TF_RETURN_IF_ERROR(reader->ReadScalar(
-                full_name(strings::StrCat("args_size[", idx, "]")),
-                &args_size));
-            args_list_[idx].resize(args_size);
-            for (int i = 0; i < args_size; i++) {
-              TF_RETURN_IF_ERROR(reader->ReadTensor(
-                  full_name(strings::StrCat("args_list_[", idx, "][", i, "]")),
-                  &args_list_[idx][i]));
-            }
-            TF_RETURN_IF_ERROR(MakeIteratorFromInputElement(
-                ctx, args_list_[idx], idx, *instantiated_captured_func_.get(),
-                prefix(), &current_elements_[idx]));
-            TF_RETURN_IF_ERROR(
-                RestoreInput(ctx, reader, current_elements_[idx]));
-          } else {
-            current_elements_[idx].reset();
-          }
+          TF_RETURN_IF_ERROR(ReadElement(ctx, reader, idx, "current_elements",
+                                         &current_elements_[idx]));
+        }
+        return Status::OK();
+      }
+
+      Status ReadFutureElements(IteratorContext* ctx,
+                                IteratorStateReader* reader)
+          EXCLUSIVE_LOCKS_REQUIRED(*mu_) {
+        int64 size;
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("future_elements.size"), &size));
+        future_elements_.resize(size);
+        for (int idx = 0; idx < future_elements_.size(); idx++) {
+          TF_RETURN_IF_ERROR(ReadElement(ctx, reader, idx, "future_elements",
+                                         &future_elements_[idx]));
         }
         return Status::OK();
       }
@@ -634,12 +875,11 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
       // the worker threads.
       const std::shared_ptr<mutex> mu_;
 
-      // Used for coordination between the main thread, the runner thread, and
-      // the worker threads. In particular, the runner thread should only
-      // schedule new calls when the number of in-flight calls is less than the
-      // user specified level of parallelism, there are slots available in the
-      // `invocation_results_` buffer, the current cycle element is not in use,
-      // and there are elements left to be fetched.
+      // Used for coordination between the main thread, the manager threads, and
+      // the threadpool threads. In particular, the managers thread should only
+      // schedule new calls into the threadpool when the number of in-flight
+      // calls is less than the user specified level of parallelism and there
+      // are slots available in the element `results` buffer.
       const std::shared_ptr<condition_variable> cond_var_;
 
       // Identifies the maximum number of parallel calls.
@@ -651,24 +891,17 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
       // Iterator for input elements.
       std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(*mu_);
 
-      // Identifies current cycle element.
-      int64 cycle_index_ = 0;
-
-      // Arguments for creating an iterator for cycle elements.
-      std::vector<std::vector<Tensor>> args_list_ GUARDED_BY(*mu_);
-
-      // Iterators for the current cycle elements. Concurrent access is
-      // protected by `element_in_use_`.
-      std::vector<std::unique_ptr<IteratorBase>> current_elements_;
+      // Identifies position in the interleave cycle.
+      int64 block_index_ GUARDED_BY(*mu_) = 0;
+      int64 cycle_index_ GUARDED_BY(*mu_) = 0;
 
-      // Identifies cycle elements that are in use by worker threads.
-      std::vector<bool> element_in_use_ GUARDED_BY(*mu_);
+      // Elements of the current interleave cycle.
+      std::vector<std::shared_ptr<Element>> current_elements_ GUARDED_BY(*mu_);
 
-      // Buffer for storing the invocation results.
-      std::deque<std::shared_ptr<InvocationResult>> invocation_results_
-          GUARDED_BY(*mu_);
+      // Elements to be used in the interleave cycle in the future.
+      std::deque<std::shared_ptr<Element>> future_elements_ GUARDED_BY(*mu_);
 
-      // Identifies whether end of input has been reached.
+      // Identifies whether the global end of input has been reached.
       bool end_of_input_ GUARDED_BY(*mu_) = false;
 
       // Identifies the number of open iterators.
@@ -678,11 +911,13 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
       int64 num_calls_ GUARDED_BY(*mu_) = 0;
 
       std::unique_ptr<thread::ThreadPool> thread_pool_;
-      std::unique_ptr<Thread> runner_thread_ GUARDED_BY(*mu_);
+      std::unique_ptr<Thread> current_elements_manager_ GUARDED_BY(*mu_);
+      std::unique_ptr<Thread> future_elements_manager_ GUARDED_BY(*mu_);
+      int64 element_id_counter_ GUARDED_BY(*mu_) = 0;
 
-      // Identifies whether background activity should be cancelled.
+      // Identifies whether background threads should be cancelled.
       bool cancelled_ GUARDED_BY(*mu_) = false;
-      string prefix_end_;
+      string key_prefix_;
       std::unique_ptr<InstantiatedCapturedFunction> instantiated_captured_func_;
     };
 
diff --git a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
index 5ac81c187c4f3338785d49b47c232be1f8d1e185..34f341d1d12c02c3900cba2741a5cd38f2b73e9c 100644
--- a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
@@ -15,6 +15,7 @@ limitations under the License.
 #include <deque>
 
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/metrics.h"
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -31,6 +32,8 @@ namespace {
 // See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
+constexpr char kDatasetName[] = "ParallelMap";
+
 class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
  public:
   explicit ParallelMapDatasetOp(OpKernelConstruction* ctx)
@@ -51,9 +54,10 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
     int32 num_parallel_calls;
     OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "num_parallel_calls",
                                             &num_parallel_calls));
-    OP_REQUIRES(ctx, num_parallel_calls > 0 || num_parallel_calls == kAutoTune,
-                errors::InvalidArgument(
-                    "num_parallel_calls must be greater than zero."));
+    OP_REQUIRES(
+        ctx, num_parallel_calls > 0 || num_parallel_calls == model::kAutoTune,
+        errors::InvalidArgument(
+            "num_parallel_calls must be greater than zero."));
 
     std::unique_ptr<CapturedFunction> captured_func;
     OP_REQUIRES_OK(ctx, CapturedFunction::Create(func_, ctx, "other_arguments",
@@ -63,6 +67,10 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
     std::vector<int> indices;
     OP_REQUIRES_OK(ctx, ComputeShortCircuitIndices(ctx, func_, &indices));
 
+    if (num_parallel_calls == model::kAutoTune) {
+      metrics::RecordTFDataAutotune(kDatasetName);
+    }
+
     *output =
         new Dataset(ctx, input, func_, num_parallel_calls, output_types_,
                     output_shapes_, use_inter_op_parallelism_, sloppy_,
@@ -101,12 +109,13 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
         const string& prefix) const override {
       std::unique_ptr<ParallelMapFunctor> parallel_map_functor(nullptr);
       if (indices_.empty()) {
-        parallel_map_functor.reset(new ParallelMapDatasetFunctor(this));
+        parallel_map_functor =
+            absl::make_unique<ParallelMapDatasetFunctor>(this);
       } else {
-        parallel_map_functor.reset(new ShortCircuitFunctor(this));
+        parallel_map_functor = absl::make_unique<ShortCircuitFunctor>(this);
       }
       return NewParallelMapIterator(
-          {this, strings::StrCat(prefix, "::ParallelMap")}, input_,
+          {this, strings::StrCat(prefix, "::", kDatasetName)}, input_,
           std::move(parallel_map_functor), num_parallel_calls_, sloppy_,
           preserve_cardinality_);
     }
@@ -140,7 +149,13 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
       other_arguments.reserve(captured_func_->captured_inputs().size());
       for (const Tensor& t : captured_func_->captured_inputs()) {
         Node* node;
-        TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        DatasetBase* input;
+        Status s = GetDatasetFromVariantTensor(t, &input);
+        if (s.ok()) {
+          TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input, &node));
+        } else {
+          TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        }
         other_arguments.emplace_back(node);
         other_arguments_types.emplace_back(t.dtype());
       }
diff --git a/tensorflow/core/kernels/data/parallel_map_iterator.cc b/tensorflow/core/kernels/data/parallel_map_iterator.cc
index b97f69250056fbf80c1cf866192a320861b70770..be91de12fe74a39919fa68bd12d60d9c9ac04ac2 100644
--- a/tensorflow/core/kernels/data/parallel_map_iterator.cc
+++ b/tensorflow/core/kernels/data/parallel_map_iterator.cc
@@ -24,7 +24,6 @@ limitations under the License.
 #include "tensorflow/core/framework/stats_aggregator.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/platform/cpu_info.h"
-#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -60,7 +59,7 @@ class ParallelMapIterator : public DatasetBaseIterator {
         preserve_cardinality_(params.preserve_cardinality) {
     std::vector<string> components =
         str_util::Split(base_params.prefix, "::", str_util::SkipEmpty());
-    prefix_end_ = components.back();
+    key_prefix_ = components.back();
   }
 
   ~ParallelMapIterator() override {
@@ -76,9 +75,8 @@ class ParallelMapIterator : public DatasetBaseIterator {
 
   Status Initialize(IteratorContext* ctx) override {
     mutex_lock l(*mu_);
-    if (num_parallel_calls_->value == kAutoTune) {
+    if (num_parallel_calls_->value == model::kAutoTune) {
       num_parallel_calls_->value = ctx->runner_threadpool_size();
-      num_parallel_calls_->tunable = true;
     }
     TF_RETURN_IF_ERROR(
         input_dataset_->MakeIterator(ctx, prefix(), &input_impl_));
@@ -208,8 +206,9 @@ class ParallelMapIterator : public DatasetBaseIterator {
     const auto& stats_aggregator = ctx->stats_aggregator();
     if (stats_aggregator) {
       stats_aggregator->AddScalar(
-          strings::StrCat(prefix_end_, "::active_parallel_calls"),
-          static_cast<float>(num_calls_));
+          strings::StrCat(key_prefix_, "::thread_utilization"),
+          static_cast<float>(num_calls_) /
+              static_cast<float>(num_parallel_calls_->value));
     }
     RecordBufferEnqueue(ctx.get(), result->return_values);
     result->notification.Notify();
@@ -301,14 +300,10 @@ class ParallelMapIterator : public DatasetBaseIterator {
         }
         const auto& stats_aggregator = ctx->stats_aggregator();
         if (stats_aggregator) {
-          // TODO(shivaniagrawal): add `parallel_calls_utilization` in the
-          // monitoring code or as histogram at fixed time intervals.
-          stats_aggregator->AddScalar(
-              strings::StrCat(prefix_end_, "::active_parallel_calls"),
-              static_cast<float>(num_calls_));
           stats_aggregator->AddScalar(
-              strings::StrCat(prefix_end_, "::num_parallel_calls"),
-              static_cast<float>(num_parallel_calls_->value));
+              strings::StrCat(key_prefix_, "::thread_utilization"),
+              static_cast<float>(num_calls_) /
+                  static_cast<float>(num_parallel_calls_->value));
         }
         cond_var_->notify_all();
       }
@@ -404,7 +399,7 @@ class ParallelMapIterator : public DatasetBaseIterator {
       GUARDED_BY(*mu_);
   std::unique_ptr<Thread> runner_thread_ GUARDED_BY(*mu_);
   bool cancelled_ GUARDED_BY(*mu_) = false;
-  string prefix_end_;
+  string key_prefix_;
 };
 
 }  // namespace
@@ -414,7 +409,7 @@ std::unique_ptr<IteratorBase> NewParallelMapIterator(
     const DatasetBase* input_dataset,
     std::unique_ptr<ParallelMapFunctor> parallel_map_functor,
     int32 num_parallel_calls, bool sloppy, bool preserve_cardinality) {
-  return MakeUnique<ParallelMapIterator>(
+  return absl::make_unique<ParallelMapIterator>(
       params, input_dataset,
       ParallelMapIterator::Params{std::move(parallel_map_functor),
                                   num_parallel_calls, sloppy,
diff --git a/tensorflow/core/kernels/data/prefetch_dataset_op.cc b/tensorflow/core/kernels/data/prefetch_dataset_op.cc
index 08d6de4bf9a654d433e3cb6dddd6ab0cc1435136..f0e835a27c9775aadad107ca1f274275cc44f622 100644
--- a/tensorflow/core/kernels/data/prefetch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/prefetch_dataset_op.cc
@@ -16,6 +16,7 @@ limitations under the License.
 
 #include <deque>
 
+#include "tensorflow/core/common_runtime/metrics.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/stats_aggregator.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -29,6 +30,8 @@ namespace data {
 // See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
+constexpr char kDatasetName[] = "Prefetch";
+
 class PrefetchDatasetOp::Dataset : public DatasetBase {
  public:
   Dataset(OpKernelContext* ctx, const DatasetBase* input, int64 buffer_size)
@@ -42,8 +45,8 @@ class PrefetchDatasetOp::Dataset : public DatasetBase {
 
   std::unique_ptr<IteratorBase> MakeIteratorInternal(
       const string& prefix) const override {
-    return std::unique_ptr<IteratorBase>(
-        new Iterator({this, strings::StrCat(prefix, "::Prefetch")}));
+    return absl::make_unique<Iterator>(
+        Iterator::Params{this, strings::StrCat(prefix, "::", kDatasetName)});
   }
 
   const DataTypeVector& output_dtypes() const override {
@@ -266,8 +269,9 @@ class PrefetchDatasetOp::Dataset : public DatasetBase {
     Status EnsurePrefetchThreadStarted(IteratorContext* ctx)
         EXCLUSIVE_LOCKS_REQUIRED(mu_) {
       if (!prefetch_thread_) {
-        std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
-        prefetch_thread_.reset(ctx->env()->StartThread(
+        std::shared_ptr<IteratorContext> new_ctx =
+            std::make_shared<IteratorContext>(*ctx);
+        prefetch_thread_ = absl::WrapUnique<Thread>(ctx->env()->StartThread(
             {}, "tf_data_prefetch",
             [this, new_ctx]() { PrefetchThread(new_ctx); }));
       }
@@ -391,6 +395,10 @@ void PrefetchDatasetOp::MakeDataset(OpKernelContext* ctx, DatasetBase* input,
               buffer_size >= 0 || buffer_size == PrefetchAutotuner::kAutoTune,
               errors::InvalidArgument("buffer_size must be >= 0"));
 
+  if (buffer_size == PrefetchAutotuner::kAutoTune) {
+    metrics::RecordTFDataAutotune(kDatasetName);
+  }
+
   *output = new Dataset(ctx, input, buffer_size);
 }
 
diff --git a/tensorflow/core/kernels/data/range_dataset_op.cc b/tensorflow/core/kernels/data/range_dataset_op.cc
index 580702f741814b6bd86cab2d537b3ad49b4f6177..87390ad512fcbf0481a0f5c4241d864d0c99cee6 100644
--- a/tensorflow/core/kernels/data/range_dataset_op.cc
+++ b/tensorflow/core/kernels/data/range_dataset_op.cc
@@ -53,8 +53,8 @@ class RangeDatasetOp : public DatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::Range")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::Range")});
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -64,7 +64,7 @@ class RangeDatasetOp : public DatasetOpKernel {
 
     const std::vector<PartialTensorShape>& output_shapes() const override {
       static std::vector<PartialTensorShape>* shapes =
-          new std::vector<PartialTensorShape>({{}});
+          new std::vector<PartialTensorShape>({PartialTensorShape({})});
       return *shapes;
     }
 
diff --git a/tensorflow/core/kernels/data/range_dataset_op_test.cc b/tensorflow/core/kernels/data/range_dataset_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0bbc09a212811b2d8209d1862aed289c4d23dff8
--- /dev/null
+++ b/tensorflow/core/kernels/data/range_dataset_op_test.cc
@@ -0,0 +1,421 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/framework/variant_tensor_data.h"
+#include "tensorflow/core/kernels/data/dataset_test_base.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
+#include "tensorflow/core/kernels/data/iterator_ops.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+constexpr char kOpName[] = "RangeDataset";
+
+class RangeDatasetOpTest : public DatasetOpsTestBase {
+ protected:
+  // Creates a new RangeDataset op kernel context.
+  Status CreateRangeDatasetContext(
+      int64 start, int64 end, int64 step, OpKernel* const range_kernel,
+      std::unique_ptr<OpKernelContext>* range_context) {
+    inputs_.clear();
+    TF_RETURN_IF_ERROR(AddDatasetInputFromArray<int64>(
+        &inputs_, range_kernel->input_types(), TensorShape({}), {start}));
+    TF_RETURN_IF_ERROR(AddDatasetInputFromArray<int64>(
+        &inputs_, range_kernel->input_types(), TensorShape({}), {end}));
+    TF_RETURN_IF_ERROR(AddDatasetInputFromArray<int64>(
+        &inputs_, range_kernel->input_types(), TensorShape({}), {step}));
+
+    TF_RETURN_IF_ERROR(
+        CreateOpKernelContext(range_kernel, &inputs_, range_context));
+    TF_RETURN_IF_ERROR(CheckOpKernelInput(*range_kernel, inputs_));
+    return Status::OK();
+  }
+
+ private:
+  gtl::InlinedVector<TensorValue, 4> inputs_;
+};
+
+struct GetNextTestParams {
+  explicit GetNextTestParams(int64 input_start, int64 input_end,
+                             int64 input_step)
+      : start(input_start), end(input_end), step(input_step) {}
+
+  int64 start;
+  int64 end;
+  int64 step;
+};
+
+struct DatasetGetNextTest : RangeDatasetOpTest,
+                            ::testing::WithParamInterface<GetNextTestParams> {};
+
+TEST_P(DatasetGetNextTest, GetNext) {
+  int thread_num = 2, cpu_num = 2;
+  GetNextTestParams params = GetParam();
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  std::unique_ptr<OpKernel> range_kernel;
+  TF_ASSERT_OK(CreateRangeDatasetOpKernel<int64>("range", &range_kernel));
+  std::unique_ptr<OpKernelContext> range_context;
+  TF_ASSERT_OK(CreateRangeDatasetContext(params.start, params.end, params.step,
+                                         range_kernel.get(), &range_context));
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(
+      CreateDataset(range_kernel.get(), range_context.get(), &range_dataset));
+  core::ScopedUnref scored_unref(range_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_context;
+  TF_ASSERT_OK(CreateIteratorContext(range_context.get(), &iterator_context));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(range_dataset->MakeIterator(iterator_context.get(), "Iterator",
+                                           &iterator));
+
+  bool end_of_sequence = false;
+  std::vector<Tensor> out_tensors;
+  while (!end_of_sequence) {
+    TF_EXPECT_OK(iterator->GetNext(iterator_context.get(), &out_tensors,
+                                   &end_of_sequence));
+  }
+  std::vector<int> expected_values;
+  for (int i = params.start; (params.end - i) * params.step > 0;
+       i = i + params.step) {
+    expected_values.reserve(1);
+    expected_values.emplace_back(i);
+  }
+  EXPECT_EQ(out_tensors.size(), expected_values.size());
+  for (size_t i = 0; i < out_tensors.size(); ++i) {
+    int64 actual_value = out_tensors[i].flat<int64>()(0);
+    int64 expect_value = expected_values[i];
+    EXPECT_EQ(actual_value, expect_value);
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(RangeDatasetOpTest, DatasetGetNextTest,
+                        ::testing::Values(GetNextTestParams(0, 10, 1),
+                                          GetNextTestParams(0, 10, 3),
+                                          GetNextTestParams(10, 0, -1),
+                                          GetNextTestParams(10, 0, -3)));
+
+TEST_F(RangeDatasetOpTest, DatasetName) {
+  int64 start = 0, end = 10, step = 1;
+  int thread_num = 2, cpu_num = 2;
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  std::unique_ptr<OpKernel> range_kernel;
+  TF_ASSERT_OK(CreateRangeDatasetOpKernel<int64>("range", &range_kernel));
+  std::unique_ptr<OpKernelContext> range_context;
+  TF_ASSERT_OK(CreateRangeDatasetContext(start, end, step, range_kernel.get(),
+                                         &range_context));
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(
+      CreateDataset(range_kernel.get(), range_context.get(), &range_dataset));
+  core::ScopedUnref scored_unref(range_dataset);
+
+  EXPECT_EQ(range_dataset->name(), kOpName);
+}
+
+TEST_F(RangeDatasetOpTest, DatasetOutputDtypes) {
+  int64 start = 0, end = 10, step = 1;
+  int thread_num = 2, cpu_num = 2;
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  std::unique_ptr<OpKernel> range_kernel;
+  TF_ASSERT_OK(CreateRangeDatasetOpKernel<int64>("range", &range_kernel));
+  std::unique_ptr<OpKernelContext> range_context;
+  TF_ASSERT_OK(CreateRangeDatasetContext(start, end, step, range_kernel.get(),
+                                         &range_context));
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(
+      CreateDataset(range_kernel.get(), range_context.get(), &range_dataset));
+  core::ScopedUnref scored_unref(range_dataset);
+
+  DataTypeVector expected_dtypes({DT_INT64});
+  EXPECT_EQ(range_dataset->output_dtypes(), expected_dtypes);
+}
+
+TEST_F(RangeDatasetOpTest, DatasetOutputShapes) {
+  int64 start = 0, end = 10, step = 1;
+  int thread_num = 2, cpu_num = 2;
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  std::unique_ptr<OpKernel> range_kernel;
+  TF_ASSERT_OK(CreateRangeDatasetOpKernel<int64>("range", &range_kernel));
+  std::unique_ptr<OpKernelContext> range_context;
+  TF_ASSERT_OK(CreateRangeDatasetContext(start, end, step, range_kernel.get(),
+                                         &range_context));
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(
+      CreateDataset(range_kernel.get(), range_context.get(), &range_dataset));
+  core::ScopedUnref scored_unref(range_dataset);
+
+  std::vector<PartialTensorShape> expected_shapes({PartialTensorShape({})});
+  EXPECT_EQ(range_dataset->output_shapes().size(), expected_shapes.size());
+  for (int i = 0; i < range_dataset->output_shapes().size(); ++i) {
+    EXPECT_TRUE(
+        range_dataset->output_shapes()[i].IsIdenticalTo(expected_shapes[i]));
+  }
+}
+
+struct CardinalityTestParams {
+  explicit CardinalityTestParams(int64 input_start, int64 input_end,
+                                 int64 input_step,
+                                 int input_expected_cardinality)
+      : start(input_start),
+        end(input_end),
+        step(input_step),
+        expected_cardinality(input_expected_cardinality) {}
+
+  int64 start;
+  int64 end;
+  int64 step;
+  int expected_cardinality;
+};
+
+struct DatasetCardinalityTest
+    : RangeDatasetOpTest,
+      ::testing::WithParamInterface<CardinalityTestParams> {};
+
+TEST_P(DatasetCardinalityTest, Cardinality) {
+  int thread_num = 2, cpu_num = 2;
+  CardinalityTestParams params = GetParam();
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  std::unique_ptr<OpKernel> range_kernel;
+  TF_ASSERT_OK(CreateRangeDatasetOpKernel<int64>("range", &range_kernel));
+  std::unique_ptr<OpKernelContext> range_context;
+  TF_ASSERT_OK(CreateRangeDatasetContext(params.start, params.end, params.step,
+                                         range_kernel.get(), &range_context));
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(
+      CreateDataset(range_kernel.get(), range_context.get(), &range_dataset));
+  core::ScopedUnref scored_unref(range_dataset);
+
+  EXPECT_EQ(range_dataset->Cardinality(), params.expected_cardinality);
+}
+
+INSTANTIATE_TEST_CASE_P(RangeDatasetOpTest, DatasetCardinalityTest,
+                        ::testing::Values(CardinalityTestParams(0, 10, 1, 10),
+                                          CardinalityTestParams(0, 10, 3, 4),
+                                          CardinalityTestParams(10, 0, -3, 4)));
+
+TEST_F(RangeDatasetOpTest, DatasetSave) {
+  int64 thread_num = 2, cpu_num = 2;
+  int start = 0, end = 10, step = 1;
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  std::unique_ptr<OpKernel> range_kernel;
+  TF_ASSERT_OK(CreateRangeDatasetOpKernel<int64>("range", &range_kernel));
+  std::unique_ptr<OpKernelContext> range_context;
+  TF_ASSERT_OK(CreateRangeDatasetContext(start, end, step, range_kernel.get(),
+                                         &range_context));
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(
+      CreateDataset(range_kernel.get(), range_context.get(), &range_dataset));
+  core::ScopedUnref scored_unref(range_dataset);
+
+  std::unique_ptr<SerializationContext> serialization_context;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_context));
+
+  VariantTensorData data;
+  VariantTensorDataWriter writer(&data);
+  TF_ASSERT_OK(range_dataset->Save(serialization_context.get(), &writer));
+  TF_ASSERT_OK(writer.Flush());
+}
+
+TEST_F(RangeDatasetOpTest, IteratorOutputDtypes) {
+  int64 start = 0, end = 10, step = 1;
+  int thread_num = 2, cpu_num = 2;
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  std::unique_ptr<OpKernel> range_kernel;
+  TF_ASSERT_OK(CreateRangeDatasetOpKernel<int64>("range", &range_kernel));
+  std::unique_ptr<OpKernelContext> range_context;
+  TF_ASSERT_OK(CreateRangeDatasetContext(start, end, step, range_kernel.get(),
+                                         &range_context));
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(
+      CreateDataset(range_kernel.get(), range_context.get(), &range_dataset));
+  core::ScopedUnref scored_unref(range_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_context;
+  TF_ASSERT_OK(CreateIteratorContext(range_context.get(), &iterator_context));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(range_dataset->MakeIterator(iterator_context.get(), "Iterator",
+                                           &iterator));
+
+  DataTypeVector expected_dtypes({DT_INT64});
+  EXPECT_EQ(iterator->output_dtypes(), expected_dtypes);
+}
+
+TEST_F(RangeDatasetOpTest, IteratorOutputShapes) {
+  int64 start = 0, end = 10, step = 1;
+  int thread_num = 2, cpu_num = 2;
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  std::unique_ptr<OpKernel> range_kernel;
+  TF_ASSERT_OK(CreateRangeDatasetOpKernel<int64>("range", &range_kernel));
+  std::unique_ptr<OpKernelContext> range_context;
+  TF_ASSERT_OK(CreateRangeDatasetContext(start, end, step, range_kernel.get(),
+                                         &range_context));
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(
+      CreateDataset(range_kernel.get(), range_context.get(), &range_dataset));
+  core::ScopedUnref scored_unref(range_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_context;
+  TF_ASSERT_OK(CreateIteratorContext(range_context.get(), &iterator_context));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(range_dataset->MakeIterator(iterator_context.get(), "Iterator",
+                                           &iterator));
+
+  std::vector<PartialTensorShape> expected_shapes({PartialTensorShape({})});
+  EXPECT_EQ(iterator->output_shapes().size(), expected_shapes.size());
+  for (int i = 0; i < range_dataset->output_shapes().size(); ++i) {
+    EXPECT_TRUE(iterator->output_shapes()[i].IsIdenticalTo(expected_shapes[i]));
+  }
+}
+
+TEST_F(RangeDatasetOpTest, IteratorOutputPrefix) {
+  int64 start = 0, end = 10, step = 1;
+  int thread_num = 2, cpu_num = 2;
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  std::unique_ptr<OpKernel> range_kernel;
+  TF_ASSERT_OK(CreateRangeDatasetOpKernel<int64>("range", &range_kernel));
+  std::unique_ptr<OpKernelContext> range_context;
+  TF_ASSERT_OK(CreateRangeDatasetContext(start, end, step, range_kernel.get(),
+                                         &range_context));
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(
+      CreateDataset(range_kernel.get(), range_context.get(), &range_dataset));
+  core::ScopedUnref scored_unref(range_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_context;
+  TF_ASSERT_OK(CreateIteratorContext(range_context.get(), &iterator_context));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(range_dataset->MakeIterator(iterator_context.get(), "Iterator",
+                                           &iterator));
+
+  EXPECT_EQ(iterator->prefix(), "Iterator::Range");
+}
+
+struct RoundtripTestParams {
+  explicit RoundtripTestParams(int64 input_start, int64 input_end,
+                               int64 input_step, int input_breakpoint)
+      : start(input_start),
+        end(input_end),
+        step(input_step),
+        breakpoint(input_breakpoint) {}
+
+  int64 start;
+  int64 end;
+  int64 step;
+  int breakpoint;
+};
+
+struct IteratorRoundtripTest
+    : RangeDatasetOpTest,
+      ::testing::WithParamInterface<RoundtripTestParams> {};
+
+TEST_P(IteratorRoundtripTest, Roundtrip) {
+  int thread_num = 2, cpu_num = 2;
+  RoundtripTestParams params = GetParam();
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  std::unique_ptr<OpKernel> range_kernel;
+  TF_ASSERT_OK(CreateRangeDatasetOpKernel<int64>("range", &range_kernel));
+  std::unique_ptr<OpKernelContext> range_context;
+  TF_ASSERT_OK(CreateRangeDatasetContext(params.start, params.end, params.step,
+                                         range_kernel.get(), &range_context));
+  DatasetBase* range_dataset;
+  TF_ASSERT_OK(
+      CreateDataset(range_kernel.get(), range_context.get(), &range_dataset));
+  core::ScopedUnref scored_unref(range_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_context;
+  TF_ASSERT_OK(CreateIteratorContext(range_context.get(), &iterator_context));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(range_dataset->MakeIterator(iterator_context.get(), "Iterator",
+                                           &iterator));
+
+  std::vector<Tensor> out_tensors;
+  bool end_of_sequence = false;
+  int64 cur_val = params.start - params.step;
+  for (int i = 0; i < params.breakpoint; i++) {
+    if (!end_of_sequence) {
+      TF_EXPECT_OK(iterator->GetNext(iterator_context.get(), &out_tensors,
+                                     &end_of_sequence));
+      cur_val = ((params.end - cur_val - params.step) * params.step > 0)
+                    ? cur_val + params.step
+                    : cur_val;
+    }
+  }
+
+  std::unique_ptr<SerializationContext> serialization_context;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_context));
+  VariantTensorData data;
+  VariantTensorDataWriter writer(&data);
+  TF_ASSERT_OK(iterator->Save(serialization_context.get(), &writer));
+  TF_ASSERT_OK(writer.Flush());
+  VariantTensorDataReader reader(&data);
+  TF_ASSERT_OK(iterator->Restore(iterator_context.get(), &reader));
+  TF_EXPECT_OK(iterator->GetNext(iterator_context.get(), &out_tensors,
+                                 &end_of_sequence));
+  int64 expect_next = ((params.end - cur_val - params.step) * params.step > 0)
+                          ? cur_val + params.step
+                          : cur_val;
+  EXPECT_EQ(out_tensors.back().flat<int64>()(0), expect_next);
+}
+
+INSTANTIATE_TEST_CASE_P(
+    RangeDatasetOpTest, IteratorRoundtripTest,
+    ::testing::Values(
+        RoundtripTestParams(0, 10, 2, 0),    // unused_iterator
+        RoundtripTestParams(0, 10, 2, 4),    // fully_used_iterator_increase
+        RoundtripTestParams(10, 0, -2, 4),   // fully_used_iterator_decrease
+        RoundtripTestParams(0, 10, 2, 6)));  // exhausted_iterator
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/reader_dataset_ops.cc b/tensorflow/core/kernels/data/reader_dataset_ops.cc
index 971fd2a43685197892ad0fb3cd37e3709cd144c1..cbc987dc161241e267ae680dcb2db71f2b68a159 100644
--- a/tensorflow/core/kernels/data/reader_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/reader_dataset_ops.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/core/common_runtime/metrics.h"
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -29,6 +30,8 @@ namespace {
 // See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following ops.
 
+constexpr char kTextLineDatasetName[] = "TextLine";
+
 class TextLineDatasetOp : public DatasetOpKernel {
  public:
   using DatasetOpKernel::DatasetOpKernel;
@@ -91,8 +94,8 @@ class TextLineDatasetOp : public DatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::TextLine")}));
+      return absl::make_unique<Iterator>(Iterator::Params{
+          this, strings::StrCat(prefix, "::", kTextLineDatasetName)});
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -142,6 +145,8 @@ class TextLineDatasetOp : public DatasetOpKernel {
 
             if (s.ok()) {
               // Produce the line as output.
+              metrics::RecordTFDataBytesRead(kTextLineDatasetName,
+                                             line_contents.size());
               out_tensors->emplace_back(ctx->allocator({}), DT_STRING,
                                         TensorShape({}));
               out_tensors->back().scalar<string>()() = std::move(line_contents);
@@ -221,20 +226,20 @@ class TextLineDatasetOp : public DatasetOpKernel {
         // Actually move on to next file.
         TF_RETURN_IF_ERROR(env->NewRandomAccessFile(
             dataset()->filenames_[current_file_index_], &file_));
-        input_stream_.reset(
-            new io::RandomAccessInputStream(file_.get(), false));
+        input_stream_ =
+            absl::make_unique<io::RandomAccessInputStream>(file_.get(), false);
 
         if (dataset()->use_compression_) {
-          zlib_input_stream_.reset(new io::ZlibInputStream(
+          zlib_input_stream_ = absl::make_unique<io::ZlibInputStream>(
               input_stream_.get(), dataset()->options_.input_buffer_size,
-              dataset()->options_.input_buffer_size, dataset()->options_));
-          buffered_input_stream_.reset(new io::BufferedInputStream(
+              dataset()->options_.input_buffer_size, dataset()->options_);
+          buffered_input_stream_ = absl::make_unique<io::BufferedInputStream>(
               zlib_input_stream_.get(), dataset()->options_.input_buffer_size,
-              false));
+              false);
         } else {
-          buffered_input_stream_.reset(new io::BufferedInputStream(
+          buffered_input_stream_ = absl::make_unique<io::BufferedInputStream>(
               input_stream_.get(), dataset()->options_.input_buffer_size,
-              false));
+              false);
         }
         return Status::OK();
       }
@@ -268,9 +273,12 @@ class TextLineDatasetOp : public DatasetOpKernel {
 REGISTER_KERNEL_BUILDER(Name("TextLineDataset").Device(DEVICE_CPU),
                         TextLineDatasetOp);
 
+constexpr char kFixedLengthRecordDatasetName[] = "FixedLengthRecord";
+
 class FixedLengthRecordDatasetOp : public DatasetOpKernel {
  public:
   using DatasetOpKernel::DatasetOpKernel;
+
   explicit FixedLengthRecordDatasetOp(OpKernelConstruction* ctx)
       : DatasetOpKernel(ctx),
         op_version_(ctx->def().op() == "FixedLengthRecordDataset" ? 1 : 2) {}
@@ -344,11 +352,14 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel {
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       if (compression_type_.empty()) {
-        return std::unique_ptr<IteratorBase>(new UncompressedIterator(
-            {this, strings::StrCat(prefix, "::FixedLengthRecord")}));
+        return absl::make_unique<UncompressedIterator>(
+            UncompressedIterator::Params{
+                this,
+                strings::StrCat(prefix, "::", kFixedLengthRecordDatasetName)});
       } else {
-        return std::unique_ptr<IteratorBase>(new CompressedIterator(
-            {this, strings::StrCat(prefix, "::FixedLengthRecord")}));
+        return absl::make_unique<CompressedIterator>(CompressedIterator::Params{
+            this,
+            strings::StrCat(prefix, "::", kFixedLengthRecordDatasetName)});
       }
     }
 
@@ -410,6 +421,9 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel {
               string record;
               TF_RETURN_IF_ERROR(
                   input_buffer_->ReadNBytes(dataset()->record_bytes_, &record));
+              metrics::RecordTFDataBytesRead(kFixedLengthRecordDatasetName,
+                                             dataset()->record_bytes_);
+
               // Produce the record as output.
               Tensor record_tensor(ctx->allocator({}), DT_STRING, {});
               record_tensor.scalar<string>()() = record;
@@ -452,8 +466,8 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel {
           }
           TF_RETURN_IF_ERROR(ctx->env()->NewRandomAccessFile(
               dataset()->filenames_[current_file_index_], &file_));
-          input_buffer_.reset(
-              new io::InputBuffer(file_.get(), dataset()->buffer_size_));
+          input_buffer_ = absl::make_unique<io::InputBuffer>(
+              file_.get(), dataset()->buffer_size_);
           TF_RETURN_IF_ERROR(
               input_buffer_->SkipNBytes(dataset()->header_bytes_));
         } while (true);
@@ -495,8 +509,8 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel {
           file_pos_limit_ = file_size - dataset()->footer_bytes_;
           TF_RETURN_IF_ERROR(ctx->env()->NewRandomAccessFile(
               dataset()->filenames_[current_file_index_], &file_));
-          input_buffer_.reset(
-              new io::InputBuffer(file_.get(), dataset()->buffer_size_));
+          input_buffer_ = absl::make_unique<io::InputBuffer>(
+              file_.get(), dataset()->buffer_size_);
           TF_RETURN_IF_ERROR(input_buffer_->Seek(current_pos));
         }
 
@@ -531,6 +545,9 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel {
                 string record;
                 TF_RETURN_IF_ERROR(buffered_input_stream_->ReadNBytes(
                     dataset()->record_bytes_, &record));
+                metrics::RecordTFDataBytesRead(kFixedLengthRecordDatasetName,
+                                               dataset()->record_bytes_);
+
                 // Produce the record as output.
                 Tensor record_tensor(ctx->allocator({}), DT_STRING, {});
                 record_tensor.scalar<string>()() = std::move(record);
@@ -543,6 +560,8 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel {
               Status s = buffered_input_stream_->ReadNBytes(
                   dataset()->record_bytes_, &record);
               if (s.ok()) {
+                metrics::RecordTFDataBytesRead(kFixedLengthRecordDatasetName,
+                                               dataset()->record_bytes_);
                 lookahead_cache_.append(record);
                 record = lookahead_cache_.substr(0, dataset()->record_bytes_);
                 lookahead_cache_ =
@@ -612,13 +631,14 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel {
                 dataset()->compression_type_ == "ZLIB"
                     ? io::ZlibCompressionOptions::DEFAULT()
                     : io::ZlibCompressionOptions::GZIP();
-            file_stream_.reset(new io::RandomAccessInputStream(file_.get()));
-            buffered_input_stream_.reset(new io::ZlibInputStream(
+            file_stream_ =
+                absl::make_unique<io::RandomAccessInputStream>(file_.get());
+            buffered_input_stream_ = absl::make_unique<io::ZlibInputStream>(
                 file_stream_.get(), dataset()->buffer_size_,
-                dataset()->buffer_size_, zlib_options));
+                dataset()->buffer_size_, zlib_options);
           } else {
-            buffered_input_stream_.reset(new io::BufferedInputStream(
-                file_.get(), dataset()->buffer_size_));
+            buffered_input_stream_ = absl::make_unique<io::BufferedInputStream>(
+                file_.get(), dataset()->buffer_size_);
           }
           TF_RETURN_IF_ERROR(
               buffered_input_stream_->SkipNBytes(dataset()->header_bytes_));
@@ -672,10 +692,11 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel {
               dataset()->compression_type_ == "ZLIB"
                   ? io::ZlibCompressionOptions::DEFAULT()
                   : io::ZlibCompressionOptions::GZIP();
-          file_stream_.reset(new io::RandomAccessInputStream(file_.get()));
-          buffered_input_stream_.reset(new io::ZlibInputStream(
+          file_stream_ =
+              absl::make_unique<io::RandomAccessInputStream>(file_.get());
+          buffered_input_stream_ = absl::make_unique<io::ZlibInputStream>(
               file_stream_.get(), dataset()->buffer_size_,
-              dataset()->buffer_size_, zlib_options));
+              dataset()->buffer_size_, zlib_options);
           lookahead_cache_.clear();
           TF_RETURN_IF_ERROR(buffered_input_stream_->SkipNBytes(
               current_pos - dataset()->footer_bytes_));
@@ -714,6 +735,8 @@ REGISTER_KERNEL_BUILDER(Name("FixedLengthRecordDataset").Device(DEVICE_CPU),
 REGISTER_KERNEL_BUILDER(Name("FixedLengthRecordDatasetV2").Device(DEVICE_CPU),
                         FixedLengthRecordDatasetOp);
 
+constexpr char kTFRecordDatasetName[] = "TFRecord";
+
 class TFRecordDatasetOp : public DatasetOpKernel {
  public:
   using DatasetOpKernel::DatasetOpKernel;
@@ -763,8 +786,8 @@ class TFRecordDatasetOp : public DatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::TFRecord")}));
+      return absl::make_unique<Iterator>(Iterator::Params{
+          this, strings::StrCat(prefix, "::", kTFRecordDatasetName)});
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -813,6 +836,9 @@ class TFRecordDatasetOp : public DatasetOpKernel {
             Status s =
                 reader_->ReadRecord(&out_tensors->back().scalar<string>()());
             if (s.ok()) {
+              metrics::RecordTFDataBytesRead(
+                  kTFRecordDatasetName,
+                  out_tensors->back().scalar<string>()().size());
               *end_of_sequence = false;
               return Status::OK();
             }
@@ -885,8 +911,8 @@ class TFRecordDatasetOp : public DatasetOpKernel {
         const string& next_filename =
             dataset()->filenames_[current_file_index_];
         TF_RETURN_IF_ERROR(env->NewRandomAccessFile(next_filename, &file_));
-        reader_.reset(
-            new io::SequentialRecordReader(file_.get(), dataset()->options_));
+        reader_ = absl::make_unique<io::SequentialRecordReader>(
+            file_.get(), dataset()->options_);
         return Status::OK();
       }
 
diff --git a/tensorflow/core/kernels/data/repeat_dataset_op.cc b/tensorflow/core/kernels/data/repeat_dataset_op.cc
index 8100f2695b6ee529da252b7b012a7c87ebb0a670..ef507ffdd1de28f78e7112fbb1c2198e9876d922 100644
--- a/tensorflow/core/kernels/data/repeat_dataset_op.cc
+++ b/tensorflow/core/kernels/data/repeat_dataset_op.cc
@@ -51,14 +51,14 @@ class RepeatDatasetOp : public UnaryDatasetOpKernel {
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       if (count_ < 0) {
-        return std::unique_ptr<IteratorBase>(new ForeverIterator(
-            {this, strings::StrCat(prefix, "::ForeverRepeat")}));
+        return absl::make_unique<ForeverIterator>(ForeverIterator::Params{
+            this, strings::StrCat(prefix, "::ForeverRepeat")});
       } else if (count_ == 0) {
-        return std::unique_ptr<IteratorBase>(new EmptyIterator(
-            {this, strings::StrCat(prefix, "::EmptyRepeat")}));
+        return absl::make_unique<EmptyIterator>(EmptyIterator::Params{
+            this, strings::StrCat(prefix, "::EmptyRepeat")});
       } else {
-        return std::unique_ptr<IteratorBase>(new FiniteIterator(
-            {this, strings::StrCat(prefix, "::FiniteRepeat")}));
+        return absl::make_unique<FiniteIterator>(FiniteIterator::Params{
+            this, strings::StrCat(prefix, "::FiniteRepeat")});
       }
     }
 
diff --git a/tensorflow/core/kernels/data/shard_dataset_op.cc b/tensorflow/core/kernels/data/shard_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9bb64911aa802c6639229be689237db7296558f4
--- /dev/null
+++ b/tensorflow/core/kernels/data/shard_dataset_op.cc
@@ -0,0 +1,195 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/util/batch_util.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+// See documentation in ../../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class ShardDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit ShardDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx) {}
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    int64 index = 0;
+    int64 num_shards = 0;
+
+    OP_REQUIRES_OK(ctx,
+                   ParseScalarArgument<int64>(ctx, "num_shards", &num_shards));
+    OP_REQUIRES(
+        ctx, num_shards > 0,
+        errors::InvalidArgument("Number of shards must be greater than zero "
+                                "(currently num_shards = ",
+                                num_shards, ")."));
+
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "index", &index));
+    OP_REQUIRES(
+        ctx, index >= 0 && index < num_shards,
+        errors::InvalidArgument("Index must be between 0 and ", num_shards - 1,
+                                " (currently index = ", index, ")."));
+
+    *output = new Dataset(ctx, num_shards, index, input);
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, int64 num_shards, int64 index,
+            const DatasetBase* input)
+        : DatasetBase(DatasetContext(ctx)),
+          num_shards_(num_shards),
+          index_(index),
+          input_(input) {
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
+        const string& prefix) const override {
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::Shard")});
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return input_->output_dtypes();
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return input_->output_shapes();
+    }
+
+    string DebugString() const override {
+      return strings::StrCat("ShardDatasetOp(", num_shards_, ", ", index_,
+                             ")::Dataset");
+    }
+
+    int64 Cardinality() const override {
+      int64 n = input_->Cardinality();
+      if (n == kInfiniteCardinality || n == kUnknownCardinality) {
+        return n;
+      }
+      return n / num_shards_ + (index_ < n % num_shards_ ? 1 : 0);
+    }
+
+   protected:
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
+      Node* num_shards = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(num_shards_, &num_shards));
+      Node* index = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(index_, &index));
+      TF_RETURN_IF_ERROR(
+          b->AddDataset(this, {input_graph_node, num_shards, index}, output));
+      return Status::OK();
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params), next_index_(0) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+      }
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+
+        if (!input_impl_) {
+          *end_of_sequence = true;
+          return Status::OK();
+        }
+
+        std::vector<Tensor> result;
+        do {
+          result.clear();
+          TF_RETURN_IF_ERROR(
+              input_impl_->GetNext(ctx, &result, end_of_sequence));
+          if (*end_of_sequence) {
+            input_impl_.reset();
+            return Status::OK();
+          }
+        } while ((next_index_++ % dataset()->num_shards_) != dataset()->index_);
+
+        *out_tensors = std::move(result);
+        return Status::OK();
+      }
+
+     protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args),
+                                         dataset()->num_shards_);
+      }
+
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        if (!input_impl_) {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("input_impl_empty"), ""));
+        } else {
+          TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("next_index"), next_index_));
+        }
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        if (!reader->Contains(full_name("input_impl_empty"))) {
+          TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
+          TF_RETURN_IF_ERROR(
+              reader->ReadScalar(full_name("next_index"), &next_index_));
+        } else {
+          input_impl_.reset();
+        }
+        return Status::OK();
+      }
+
+     private:
+      mutex mu_;
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+      int64 next_index_ GUARDED_BY(mu_);
+    };
+
+    const int64 num_shards_;
+    const int64 index_;
+    const DatasetBase* const input_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("ShardDataset").Device(DEVICE_CPU),
+                        ShardDatasetOp);
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/shuffle_dataset_op.cc b/tensorflow/core/kernels/data/shuffle_dataset_op.cc
index 7134793e26da82e39f53ac21030a9e56e16e26ab..e0c435718ac46ee9af1ce404e2bdfa0ba31c3044 100644
--- a/tensorflow/core/kernels/data/shuffle_dataset_op.cc
+++ b/tensorflow/core/kernels/data/shuffle_dataset_op.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "tensorflow/core/lib/random/philox_random.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/lib/random/random_distributions.h"
-#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace data {
@@ -31,6 +30,8 @@ namespace {
 
 const int64 kLogIntervalMicros = 10 * 1000000;  // 10 seconds.
 
+const int64 kMaxEpochsInBuffer = 3;
+
 // See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
@@ -78,8 +79,9 @@ class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
             num_elements_(0),
             parent_generator_(seed, seed2),
             generator_(&parent_generator_) {
-        buffer_.reset(new std::vector<Tensor>[params.dataset->buffer_size_]);
-        slices_.push_back(MakeUnique<Slice>(0, 0));
+        buffer_ = absl::make_unique<std::vector<Tensor>[]>(
+            params.dataset->buffer_size_);
+        slices_.push_back(absl::make_unique<Slice>(0, 0));
       }
 
       Status GetNextInternal(IteratorContext* ctx,
@@ -122,7 +124,7 @@ class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
             }
             epoch_++;
             int64 n = slices_.back()->end;
-            slices_.push_back(MakeUnique<Slice>(n, n));
+            slices_.push_back(absl::make_unique<Slice>(n, n));
             TF_RETURN_IF_ERROR(this->dataset()->input_->MakeIterator(
                 ctx, this->prefix(), &input_impl_));
           }
@@ -135,6 +137,14 @@ class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
           } else {
             input_impl_.reset();
           }
+          if (slices_.size() > kMaxEpochsInBuffer) {
+            // When the elements stored in `buffer_` span more than
+            // `kMaxEpochsInBuffer` epochs, we do not fill the buffer further to
+            // conserve memory. This means that the upper bound on the size of
+            // `buffer_` is `kMaxEpochsInBuffer * cardinality(input_dataset) +
+            // 1`.
+            break;
+          }
         }
         if (num_log_entries > 0) {
           LOG(INFO) << "Shuffle buffer filled.";
@@ -263,7 +273,8 @@ class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
               reader->ReadScalar(this->full_name("slices_size"), &temp));
           slices_size = static_cast<size_t>(temp);
         }
-        buffer_.reset(new std::vector<Tensor>[this->dataset()->buffer_size_]);
+        buffer_ = absl::make_unique<std::vector<Tensor>[]>(
+            this->dataset()->buffer_size_);
         for (size_t i = 0; i < slices_size; ++i) {
           int64 start;
           TF_RETURN_IF_ERROR(reader->ReadScalar(
@@ -271,7 +282,7 @@ class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
           int64 end;
           TF_RETURN_IF_ERROR(reader->ReadScalar(
               this->full_name(strings::StrCat("slices_end_", i)), &end));
-          slices_.push_back(MakeUnique<Slice>(start, end));
+          slices_.push_back(absl::make_unique<Slice>(start, end));
           for (size_t j = start; j < end; ++j) {
             size_t index = j % this->dataset()->buffer_size_;
             int64 list_size;
@@ -389,8 +400,9 @@ class ShuffleDatasetOp : public ShuffleDatasetOpBase {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(new Iterator(
-          {this, strings::StrCat(prefix, "::Shuffle")}, seed_, seed2_));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::Shuffle")}, seed_,
+          seed2_);
     }
 
    protected:
@@ -402,7 +414,7 @@ class ShuffleDatasetOp : public ShuffleDatasetOpBase {
             parent_generator_(seed, seed2),
             generator_(&parent_generator_) {}
 
-      string DebugString() override {
+      string DebugString() const override {
         return "ReshufflingDataset::RandomSeedGenerator";
       }
 
@@ -568,9 +580,11 @@ class ShuffleDatasetOp : public ShuffleDatasetOpBase {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new ShuffleDatasetBase::Iterator<ShuffleDatasetBase>(
-              {this, strings::StrCat(prefix, "::Shuffle")}, seed_, seed2_));
+      return absl::make_unique<
+          ShuffleDatasetBase::Iterator<ShuffleDatasetBase>>(
+          ShuffleDatasetBase::Iterator<ShuffleDatasetBase>::Params{
+              this, strings::StrCat(prefix, "::Shuffle")},
+          seed_, seed2_);
     }
 
    protected:
@@ -653,10 +667,11 @@ class ShuffleAndRepeatDatasetOp : public ShuffleDatasetOpBase {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new ShuffleDatasetBase::Iterator<ShuffleDatasetBase>(
-              {this, strings::StrCat(prefix, "::ShuffleAndRepeat")}, seed_,
-              seed2_));
+      return absl::make_unique<
+          ShuffleDatasetBase::Iterator<ShuffleDatasetBase>>(
+          ShuffleDatasetBase::Iterator<ShuffleDatasetBase>::Params{
+              this, strings::StrCat(prefix, "::ShuffleAndRepeat")},
+          seed_, seed2_);
     }
 
    protected:
diff --git a/tensorflow/core/kernels/data/single_threaded_executor.cc b/tensorflow/core/kernels/data/single_threaded_executor.cc
index 89e3881037666299f093ed7423b62c9741ca5dd9..aab4bfe6403e7c34db44d63dc746fa5385da8c74 100644
--- a/tensorflow/core/kernels/data/single_threaded_executor.cc
+++ b/tensorflow/core/kernels/data/single_threaded_executor.cc
@@ -376,8 +376,8 @@ static SingleThreadedExecutorRegistrar registrar;
 Status NewSingleThreadedExecutor(const LocalExecutorParams& params,
                                  std::unique_ptr<const Graph> graph,
                                  Executor** executor) {
-  std::unique_ptr<SingleThreadedExecutorImpl> impl(
-      new SingleThreadedExecutorImpl(params));
+  std::unique_ptr<SingleThreadedExecutorImpl> impl =
+      absl::make_unique<SingleThreadedExecutorImpl>(params);
   TF_RETURN_IF_ERROR(impl->Initialize(*graph));
   *executor = impl.release();
   return Status::OK();
diff --git a/tensorflow/core/kernels/data/single_threaded_executor_test.cc b/tensorflow/core/kernels/data/single_threaded_executor_test.cc
index 7bb51fb8b53d59789f2d1efad04f4ffdf39587e4..df669e53d388957ced8a6863aaa3de4504cec66f 100644
--- a/tensorflow/core/kernels/data/single_threaded_executor_test.cc
+++ b/tensorflow/core/kernels/data/single_threaded_executor_test.cc
@@ -139,7 +139,7 @@ Rendezvous::ParsedKey Key(const string& sender, const uint64 incarnation,
 
 TEST_F(ExecutorTest, SimpleAdd) {
   // c = a + b
-  std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+  std::unique_ptr<Graph> g = absl::make_unique<Graph>(OpRegistry::Global());
   auto in0 = test::graph::Arg(g.get(), 0, DT_FLOAT);
   auto in1 = test::graph::Arg(g.get(), 0, DT_FLOAT);
   auto tmp = test::graph::Add(g.get(), in0, in1);
@@ -163,7 +163,7 @@ TEST_F(ExecutorTest, SelfAdd) {
   //
   // b <- v10
   // All nodes are executed by one thread.
-  std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+  std::unique_ptr<Graph> g = absl::make_unique<Graph>(OpRegistry::Global());
   auto v = test::graph::Arg(g.get(), 0, DT_FLOAT);
   const int N = 10;
   for (int i = 1; i <= N; ++i) {
@@ -219,7 +219,7 @@ void BuildTree(int N, Graph* g) {
 }
 
 TEST_F(ExecutorTest, RandomTree) {
-  std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+  std::unique_ptr<Graph> g = absl::make_unique<Graph>(OpRegistry::Global());
   BuildTree(4096, g.get());
   Create(std::move(g));
   FunctionCallFrame call_frame({DT_FLOAT}, {DT_FLOAT});
@@ -231,7 +231,7 @@ TEST_F(ExecutorTest, RandomTree) {
 }
 
 TEST_F(ExecutorTest, OpError) {
-  std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
+  std::unique_ptr<Graph> g = absl::make_unique<Graph>(OpRegistry::Global());
   auto zero = test::graph::Constant(g.get(), V(0.0));
   auto inf = test::graph::Unary(g.get(), "Reciprocal", zero);
   auto check = test::graph::CheckNumerics(g.get(), inf, "message");
diff --git a/tensorflow/core/kernels/data/skip_dataset_op.cc b/tensorflow/core/kernels/data/skip_dataset_op.cc
index e321066a715d180f0791c9afdfa947560a0fd9ce..5b85a10edf1f6438feab485a77ad684f0442e67c 100644
--- a/tensorflow/core/kernels/data/skip_dataset_op.cc
+++ b/tensorflow/core/kernels/data/skip_dataset_op.cc
@@ -50,11 +50,11 @@ class SkipDatasetOp : public UnaryDatasetOpKernel {
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       if (count_ < 0) {
-        return std::unique_ptr<IteratorBase>(
-            new EmptyIterator({this, strings::StrCat(prefix, "::EmptySkip")}));
+        return absl::make_unique<EmptyIterator>(EmptyIterator::Params{
+            this, strings::StrCat(prefix, "::EmptySkip")});
       } else {
-        return std::unique_ptr<IteratorBase>(new FiniteIterator(
-            {this, strings::StrCat(prefix, "::FiniteSkip")}));
+        return absl::make_unique<FiniteIterator>(FiniteIterator::Params{
+            this, strings::StrCat(prefix, "::FiniteSkip")});
       }
     }
 
diff --git a/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op.cc b/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op.cc
index be105f8170b8fff79c0c60a76a699a6ee6ba13f9..d8d7cd204d0f00a2e25ce9e36d1d6234d8c7b1d1 100644
--- a/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op.cc
+++ b/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op.cc
@@ -41,8 +41,8 @@ class Dataset : public DatasetBase {
 
   std::unique_ptr<IteratorBase> MakeIteratorInternal(
       const string& prefix) const override {
-    return std::unique_ptr<IteratorBase>(
-        new Iterator({this, strings::StrCat(prefix, "::SparseTensorSlice")}));
+    return absl::make_unique<Iterator>(typename Iterator::Params{
+        this, strings::StrCat(prefix, "::SparseTensorSlice")});
   }
 
   const DataTypeVector& output_dtypes() const override { return dtypes_; }
diff --git a/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op_test.cc b/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d1d01f792fd222ac26e294b98d504d837400aa6c
--- /dev/null
+++ b/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op_test.cc
@@ -0,0 +1,519 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/framework/variant_tensor_data.h"
+#include "tensorflow/core/kernels/data/dataset_test_base.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
+#include "tensorflow/core/kernels/data/iterator_ops.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+constexpr char kNodeName[] = "sparse_tensor_slice_dataset";
+constexpr char kOpName[] = "SparseTensorSliceDataset";
+
+class SparseTensorSliceDatasetOpTest : public DatasetOpsTestBase {
+ protected:
+  // Creates a new SparseTensorSliceDataset op kernel.
+  Status CreateSparseTensorSliceDatasetKernel(
+      DataType tvalues, std::unique_ptr<OpKernel> *op_kernel) {
+    node_def_ = test::function::NDef(kNodeName, kOpName,
+                                     {"indices", "values", "dense_shape"},
+                                     {{"Tvalues", tvalues}});
+    TF_RETURN_IF_ERROR(CreateOpKernel(node_def_, op_kernel));
+    return Status::OK();
+  }
+
+  // Creates a new SparseTensorSliceDataset op kernel context.
+  Status CreateSparseTensorSliceDatasetContext(
+      OpKernel *const op_kernel, gtl::InlinedVector<TensorValue, 4> *inputs,
+      std::unique_ptr<OpKernelContext> *context) {
+    TF_RETURN_IF_ERROR(CheckOpKernelInput(*op_kernel, *inputs));
+    TF_RETURN_IF_ERROR(CreateOpKernelContext(op_kernel, inputs, context));
+    return Status::OK();
+  }
+
+ private:
+  NodeDef node_def_;
+};
+
+struct SparseTensorParam {
+  Tensor indices;
+  Tensor values;
+  Tensor dense_shape;
+};
+
+struct TestParam {
+  SparseTensorParam input_sparse_tensor;
+  std::vector<SparseTensorParam> expected_outputs;
+  std::vector<int> breakpoints;
+} TestCases[] = {
+    {{{DatasetOpsTestBase::CreateTensor<int64>({2, 2}, {0, 0, 1, 1})},
+      {DatasetOpsTestBase::CreateTensor<int32>({2}, {888, 999})},
+      {DatasetOpsTestBase::CreateTensor<int64>({2}, {2, 2})}},
+     {{{DatasetOpsTestBase::CreateTensor<int64>({1, 1}, {0})},
+       {DatasetOpsTestBase::CreateTensor<int32>({1}, {888})},
+       {DatasetOpsTestBase::CreateTensor<int64>({1}, {2})}},
+      {{DatasetOpsTestBase::CreateTensor<int64>({1, 1}, {1})},
+       {DatasetOpsTestBase::CreateTensor<int32>({1}, {999})},
+       {DatasetOpsTestBase::CreateTensor<int64>({1}, {2})}}},
+     {0, 1, 2}},  // 2-D sparse tensor
+    {{{DatasetOpsTestBase::CreateTensor<int64>({2, 3}, {0, 0, 0, 1, 1, 1})},
+      {DatasetOpsTestBase::CreateTensor<double>({2}, {888.0, 999.0})},
+      {DatasetOpsTestBase::CreateTensor<int64>({3}, {2, 2, 2})}},
+     {{{DatasetOpsTestBase::CreateTensor<int64>({1, 2}, {0, 0})},
+       {DatasetOpsTestBase::CreateTensor<double>({1}, {888.0})},
+       {DatasetOpsTestBase::CreateTensor<int64>({2}, {2, 2})}},
+      {{DatasetOpsTestBase::CreateTensor<int64>({1, 2}, {1, 1})},
+       {DatasetOpsTestBase::CreateTensor<double>({1}, {999.0})},
+       {DatasetOpsTestBase::CreateTensor<int64>({2}, {2, 2})}}},
+     {0, 1, 2}},  // 3-D sparse tensor
+    {{{DatasetOpsTestBase::CreateTensor<int64>({2, 4},
+                                               {0, 0, 0, 0, 1, 1, 1, 1})},
+      {DatasetOpsTestBase::CreateTensor<string>({2}, {"a", "b"})},
+      {DatasetOpsTestBase::CreateTensor<int64>({4}, {3, 2, 2, 2})}},
+     {{{DatasetOpsTestBase::CreateTensor<int64>({1, 3}, {0, 0, 0})},
+       {DatasetOpsTestBase::CreateTensor<string>({1}, {"a"})},
+       {DatasetOpsTestBase::CreateTensor<int64>({3}, {2, 2, 2})}},
+      {{DatasetOpsTestBase::CreateTensor<int64>({1, 3}, {1, 1, 1})},
+       {DatasetOpsTestBase::CreateTensor<string>({1}, {"b"})},
+       {DatasetOpsTestBase::CreateTensor<int64>({3}, {2, 2, 2})}},
+      {{DatasetOpsTestBase::CreateTensor<int64>({0, 3}, {})},
+       {DatasetOpsTestBase::CreateTensor<string>({0}, {})},
+       {DatasetOpsTestBase::CreateTensor<int64>({3}, {2, 2, 2})}}},
+     {0, 1, 3}},  // 4-D sparse tensor
+    {{{DatasetOpsTestBase::CreateTensor<int64>({2, 5},
+                                               {0, 0, 0, 0, 0, 1, 1, 1, 1, 1})},
+      {DatasetOpsTestBase::CreateTensor<int32>({2}, {888, 999})},
+      {DatasetOpsTestBase::CreateTensor<int64>({5}, {3, 2, 2, 2, 2})}},
+     {{{DatasetOpsTestBase::CreateTensor<int64>({1, 4}, {0, 0, 0, 0})},
+       {DatasetOpsTestBase::CreateTensor<int32>({1}, {888})},
+       {DatasetOpsTestBase::CreateTensor<int64>({4}, {2, 2, 2, 2})}},
+      {{DatasetOpsTestBase::CreateTensor<int64>({1, 4}, {1, 1, 1, 1})},
+       {DatasetOpsTestBase::CreateTensor<int32>({1}, {999})},
+       {DatasetOpsTestBase::CreateTensor<int64>({4}, {2, 2, 2, 2})}},
+      {{DatasetOpsTestBase::CreateTensor<int64>({0, 4}, {})},
+       {DatasetOpsTestBase::CreateTensor<int32>({0}, {})},
+       {DatasetOpsTestBase::CreateTensor<int64>({4}, {2, 2, 2, 2})}}},
+     {0, 1, 3}}  // 5-D sparse tensor
+
+};
+
+struct DatasetGetNextTest : SparseTensorSliceDatasetOpTest,
+                            ::testing::WithParamInterface<TestParam> {};
+
+TEST_P(DatasetGetNextTest, GetNext) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  SparseTensorParam input_sparse_tensor = GetParam().input_sparse_tensor;
+  std::vector<SparseTensorParam> expected_outputs = GetParam().expected_outputs;
+  DataType tvalues = input_sparse_tensor.values.dtype();
+  gtl::InlinedVector<TensorValue, 4> inputs = {
+      &input_sparse_tensor.indices, &input_sparse_tensor.values,
+      &input_sparse_tensor.dense_shape};
+
+  std::unique_ptr<OpKernel> dataset_kernel;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetKernel(tvalues, &dataset_kernel));
+  std::unique_ptr<OpKernelContext> dataset_kernel_ctx;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetContext(
+      dataset_kernel.get(), &inputs, &dataset_kernel_ctx));
+  DatasetBase *dataset;
+  TF_ASSERT_OK(
+      CreateDataset(dataset_kernel.get(), dataset_kernel_ctx.get(), &dataset));
+  core::ScopedUnref scoped_unref(dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(CreateIteratorContext(dataset_kernel_ctx.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+  bool end_of_sequence = false;
+  std::vector<Tensor> out_tensors;
+  int cur_slice = 0;
+  while (!end_of_sequence) {
+    TF_EXPECT_OK(
+        iterator->GetNext(iterator_ctx.get(), &out_tensors, &end_of_sequence));
+    if (!end_of_sequence) {
+      TF_EXPECT_OK(
+          ExpectEqual(out_tensors[0], expected_outputs[cur_slice].indices));
+      TF_EXPECT_OK(
+          ExpectEqual(out_tensors[1], expected_outputs[cur_slice].values));
+      TF_EXPECT_OK(
+          ExpectEqual(out_tensors[2], expected_outputs[cur_slice].dense_shape));
+      cur_slice++;
+    }
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(SparseTensorSliceDatasetOpTest, DatasetGetNextTest,
+                        ::testing::ValuesIn(TestCases));
+
+TEST_F(SparseTensorSliceDatasetOpTest, DatasetName) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  int N = 2;
+  const int NDIM = 2;
+  Tensor indices = CreateTensor<int64>(TensorShape({N, NDIM}), {0, 0, 1, 1});
+  Tensor values = CreateTensor<int32>(TensorShape({N}), {888, 999});
+  Tensor dense_shape = CreateTensor<int64>(TensorShape({NDIM}), {5, 5});
+  gtl::InlinedVector<TensorValue, 4> inputs = {&indices, &values, &dense_shape};
+
+  std::unique_ptr<OpKernel> dataset_kernel;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetKernel(DT_INT32, &dataset_kernel));
+  std::unique_ptr<OpKernelContext> dataset_kernel_ctx;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetContext(
+      dataset_kernel.get(), &inputs, &dataset_kernel_ctx));
+  DatasetBase *dataset;
+  TF_ASSERT_OK(
+      CreateDataset(dataset_kernel.get(), dataset_kernel_ctx.get(), &dataset));
+  core::ScopedUnref scoped_unref(dataset);
+
+  EXPECT_EQ(dataset->name(), kOpName);
+}
+
+struct DatasetOutputDtypesTest : SparseTensorSliceDatasetOpTest,
+                                 ::testing::WithParamInterface<TestParam> {};
+
+TEST_P(DatasetOutputDtypesTest, DatasetOutputDtypes) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  SparseTensorParam input_sparse_tensor = GetParam().input_sparse_tensor;
+  std::vector<SparseTensorParam> expected_outputs = GetParam().expected_outputs;
+  DataType tvalues = input_sparse_tensor.values.dtype();
+  gtl::InlinedVector<TensorValue, 4> inputs = {
+      &input_sparse_tensor.indices, &input_sparse_tensor.values,
+      &input_sparse_tensor.dense_shape};
+
+  std::unique_ptr<OpKernel> dataset_kernel;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetKernel(tvalues, &dataset_kernel));
+  std::unique_ptr<OpKernelContext> dataset_kernel_ctx;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetContext(
+      dataset_kernel.get(), &inputs, &dataset_kernel_ctx));
+  DatasetBase *dataset;
+  TF_ASSERT_OK(
+      CreateDataset(dataset_kernel.get(), dataset_kernel_ctx.get(), &dataset));
+  core::ScopedUnref scoped_unref(dataset);
+
+  DataTypeVector expected_output_dtypes = {
+      expected_outputs[0].indices.dtype(), expected_outputs[0].values.dtype(),
+      expected_outputs[0].dense_shape.dtype()};
+  TF_EXPECT_OK(
+      VerifyTypesMatch(dataset->output_dtypes(), expected_output_dtypes));
+}
+
+INSTANTIATE_TEST_CASE_P(SparseTensorDatasetSliceOpTest, DatasetOutputDtypesTest,
+                        ::testing::ValuesIn(TestCases));
+
+struct DatasetOutputShapesTest : SparseTensorSliceDatasetOpTest,
+                                 ::testing::WithParamInterface<TestParam> {};
+
+TEST_P(DatasetOutputShapesTest, DatasetOutputShapes) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  SparseTensorParam input_sparse_tensor = GetParam().input_sparse_tensor;
+  std::vector<SparseTensorParam> expected_outputs = GetParam().expected_outputs;
+  DataType tvalues = input_sparse_tensor.values.dtype();
+  gtl::InlinedVector<TensorValue, 4> inputs = {
+      &input_sparse_tensor.indices, &input_sparse_tensor.values,
+      &input_sparse_tensor.dense_shape};
+
+  std::unique_ptr<OpKernel> dataset_kernel;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetKernel(tvalues, &dataset_kernel));
+  std::unique_ptr<OpKernelContext> dataset_kernel_ctx;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetContext(
+      dataset_kernel.get(), &inputs, &dataset_kernel_ctx));
+  DatasetBase *dataset;
+  TF_ASSERT_OK(
+      CreateDataset(dataset_kernel.get(), dataset_kernel_ctx.get(), &dataset));
+  core::ScopedUnref scoped_unref(dataset);
+
+  std::vector<PartialTensorShape> expected_output_shapes = {
+      expected_outputs[0].indices.shape(), expected_outputs[0].values.shape(),
+      expected_outputs[0].dense_shape.shape()};
+  TF_EXPECT_OK(
+      VerifyShapesCompatible(dataset->output_shapes(), expected_output_shapes));
+}
+
+INSTANTIATE_TEST_CASE_P(SparseTensorDatasetSliceOpTest, DatasetOutputShapesTest,
+                        ::testing::ValuesIn(TestCases));
+
+struct DatasetCardinalityTest : SparseTensorSliceDatasetOpTest,
+                                ::testing::WithParamInterface<TestParam> {};
+
+TEST_P(DatasetCardinalityTest, Cardinality) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  SparseTensorParam input_sparse_tensor = GetParam().input_sparse_tensor;
+  std::vector<SparseTensorParam> expected_outputs = GetParam().expected_outputs;
+  DataType tvalues = input_sparse_tensor.values.dtype();
+  gtl::InlinedVector<TensorValue, 4> inputs = {
+      &input_sparse_tensor.indices, &input_sparse_tensor.values,
+      &input_sparse_tensor.dense_shape};
+
+  std::unique_ptr<OpKernel> dataset_kernel;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetKernel(tvalues, &dataset_kernel));
+  std::unique_ptr<OpKernelContext> dataset_kernel_ctx;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetContext(
+      dataset_kernel.get(), &inputs, &dataset_kernel_ctx));
+  DatasetBase *dataset;
+  TF_ASSERT_OK(
+      CreateDataset(dataset_kernel.get(), dataset_kernel_ctx.get(), &dataset));
+  core::ScopedUnref scoped_unref(dataset);
+
+  EXPECT_EQ(dataset->Cardinality(), expected_outputs.size());
+}
+
+INSTANTIATE_TEST_CASE_P(SparseTensorDatasetSliceOpTest, DatasetCardinalityTest,
+                        ::testing::ValuesIn(TestCases));
+
+TEST_F(SparseTensorSliceDatasetOpTest, DatasetSave) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  int N = 2;
+  const int NDIM = 2;
+  Tensor indices = CreateTensor<int64>(TensorShape({N, NDIM}), {0, 0, 1, 1});
+  Tensor values = CreateTensor<int32>(TensorShape({N}), {888, 999});
+  Tensor dense_shape = CreateTensor<int64>(TensorShape({NDIM}), {5, 5});
+  gtl::InlinedVector<TensorValue, 4> inputs = {&indices, &values, &dense_shape};
+
+  std::unique_ptr<OpKernel> dataset_kernel;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetKernel(DT_INT32, &dataset_kernel));
+  std::unique_ptr<OpKernelContext> dataset_kernel_ctx;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetContext(
+      dataset_kernel.get(), &inputs, &dataset_kernel_ctx));
+  DatasetBase *dataset;
+  TF_ASSERT_OK(
+      CreateDataset(dataset_kernel.get(), dataset_kernel_ctx.get(), &dataset));
+  core::ScopedUnref scoped_unref(dataset);
+
+  std::unique_ptr<SerializationContext> serialization_ctx;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_ctx));
+  VariantTensorData data;
+  VariantTensorDataWriter writer(&data);
+  TF_ASSERT_OK(dataset->Save(serialization_ctx.get(), &writer));
+  TF_ASSERT_OK(writer.Flush());
+}
+
+struct IteratorOutputDtypesTest : SparseTensorSliceDatasetOpTest,
+                                  ::testing::WithParamInterface<TestParam> {};
+
+TEST_P(IteratorOutputDtypesTest, IteratorOutputDtypes) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  SparseTensorParam input_sparse_tensor = GetParam().input_sparse_tensor;
+  std::vector<SparseTensorParam> expected_outputs = GetParam().expected_outputs;
+  DataType tvalues = input_sparse_tensor.values.dtype();
+  gtl::InlinedVector<TensorValue, 4> inputs = {
+      &input_sparse_tensor.indices, &input_sparse_tensor.values,
+      &input_sparse_tensor.dense_shape};
+
+  std::unique_ptr<OpKernel> dataset_kernel;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetKernel(tvalues, &dataset_kernel));
+  std::unique_ptr<OpKernelContext> dataset_kernel_ctx;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetContext(
+      dataset_kernel.get(), &inputs, &dataset_kernel_ctx));
+  DatasetBase *dataset;
+  TF_ASSERT_OK(
+      CreateDataset(dataset_kernel.get(), dataset_kernel_ctx.get(), &dataset));
+  core::ScopedUnref scoped_unref(dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(CreateIteratorContext(dataset_kernel_ctx.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+  DataTypeVector expected_output_dtypes = {
+      expected_outputs[0].indices.dtype(), expected_outputs[0].values.dtype(),
+      expected_outputs[0].dense_shape.dtype()};
+  TF_EXPECT_OK(
+      VerifyTypesMatch(iterator->output_dtypes(), expected_output_dtypes));
+}
+
+INSTANTIATE_TEST_CASE_P(SparseTensorSliceDatasetOpTest,
+                        IteratorOutputDtypesTest,
+                        ::testing::ValuesIn(TestCases));
+
+struct IteratorOutputShapesTest : SparseTensorSliceDatasetOpTest,
+                                  ::testing::WithParamInterface<TestParam> {};
+
+TEST_P(IteratorOutputShapesTest, IteratorOutputShapes) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  SparseTensorParam input_sparse_tensor = GetParam().input_sparse_tensor;
+  std::vector<SparseTensorParam> expected_outputs = GetParam().expected_outputs;
+  DataType tvalues = input_sparse_tensor.values.dtype();
+  gtl::InlinedVector<TensorValue, 4> inputs = {
+      &input_sparse_tensor.indices, &input_sparse_tensor.values,
+      &input_sparse_tensor.dense_shape};
+
+  std::unique_ptr<OpKernel> dataset_kernel;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetKernel(tvalues, &dataset_kernel));
+  std::unique_ptr<OpKernelContext> dataset_kernel_ctx;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetContext(
+      dataset_kernel.get(), &inputs, &dataset_kernel_ctx));
+  DatasetBase *dataset;
+  TF_ASSERT_OK(
+      CreateDataset(dataset_kernel.get(), dataset_kernel_ctx.get(), &dataset));
+  core::ScopedUnref scoped_unref(dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(CreateIteratorContext(dataset_kernel_ctx.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+  std::vector<PartialTensorShape> expected_output_shapes = {
+      expected_outputs[0].indices.shape(), expected_outputs[0].values.shape(),
+      expected_outputs[0].dense_shape.shape()};
+  TF_EXPECT_OK(VerifyShapesCompatible(iterator->output_shapes(),
+                                      expected_output_shapes));
+}
+
+INSTANTIATE_TEST_CASE_P(SparseTensorSliceDatasetOpTest,
+                        IteratorOutputShapesTest,
+                        ::testing::ValuesIn(TestCases));
+
+TEST_F(SparseTensorSliceDatasetOpTest, IteratorOutputPrefix) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  int N = 2;
+  const int NDIM = 2;
+  Tensor indices = CreateTensor<int64>(TensorShape({N, NDIM}), {0, 0, 1, 1});
+  Tensor values = CreateTensor<int32>(TensorShape({N}), {888, 999});
+  Tensor dense_shape = CreateTensor<int64>(TensorShape({NDIM}), {5, 5});
+  gtl::InlinedVector<TensorValue, 4> inputs = {&indices, &values, &dense_shape};
+
+  std::unique_ptr<OpKernel> dataset_kernel;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetKernel(DT_INT32, &dataset_kernel));
+  std::unique_ptr<OpKernelContext> dataset_kernel_ctx;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetContext(
+      dataset_kernel.get(), &inputs, &dataset_kernel_ctx));
+  DatasetBase *dataset;
+  TF_ASSERT_OK(
+      CreateDataset(dataset_kernel.get(), dataset_kernel_ctx.get(), &dataset));
+  core::ScopedUnref scoped_unref(dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(CreateIteratorContext(dataset_kernel_ctx.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+  EXPECT_EQ(iterator->prefix(), strings::StrCat("Iterator::SparseTensorSlice"));
+}
+
+struct IteratorRoundtripTest : SparseTensorSliceDatasetOpTest,
+                               ::testing::WithParamInterface<TestParam> {};
+
+TEST_P(IteratorRoundtripTest, Roundtrip) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  SparseTensorParam input_sparse_tensor = GetParam().input_sparse_tensor;
+  std::vector<SparseTensorParam> expected_outputs = GetParam().expected_outputs;
+  std::vector<int> breakpoints = GetParam().breakpoints;
+  DataType tvalues = input_sparse_tensor.values.dtype();
+  gtl::InlinedVector<TensorValue, 4> inputs = {
+      &input_sparse_tensor.indices, &input_sparse_tensor.values,
+      &input_sparse_tensor.dense_shape};
+
+  std::unique_ptr<OpKernel> dataset_kernel;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetKernel(tvalues, &dataset_kernel));
+  std::unique_ptr<OpKernelContext> dataset_kernel_ctx;
+  TF_ASSERT_OK(CreateSparseTensorSliceDatasetContext(
+      dataset_kernel.get(), &inputs, &dataset_kernel_ctx));
+  DatasetBase *dataset;
+  TF_ASSERT_OK(
+      CreateDataset(dataset_kernel.get(), dataset_kernel_ctx.get(), &dataset));
+  core::ScopedUnref scoped_unref(dataset);
+
+  std::unique_ptr<IteratorContext> iterator_ctx;
+  TF_ASSERT_OK(CreateIteratorContext(dataset_kernel_ctx.get(), &iterator_ctx));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(
+      dataset->MakeIterator(iterator_ctx.get(), "Iterator", &iterator));
+
+  std::unique_ptr<SerializationContext> serialization_ctx;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_ctx));
+
+  int cur_iteration = 0;
+  bool end_of_sequence = false;
+  int64 num_slices = input_sparse_tensor.dense_shape.dim_size(0);
+  std::vector<Tensor> out_tensors;
+
+  for (int breakpoint : breakpoints) {
+    while (cur_iteration < breakpoint) {
+      TF_EXPECT_OK(iterator->GetNext(iterator_ctx.get(), &out_tensors,
+                                     &end_of_sequence));
+      cur_iteration++;
+    }
+
+    if (breakpoint == 0) {
+      EXPECT_FALSE(end_of_sequence);
+    } else if (breakpoint <= num_slices) {
+      for (int i = 0; i < out_tensors.size(); ++i) {
+        TF_EXPECT_OK(ExpectEqual(out_tensors[0],
+                                 expected_outputs[cur_iteration - 1].indices));
+        TF_EXPECT_OK(ExpectEqual(out_tensors[1],
+                                 expected_outputs[cur_iteration - 1].values));
+        TF_EXPECT_OK(ExpectEqual(
+            out_tensors[2], expected_outputs[cur_iteration - 1].dense_shape));
+      }
+    } else {
+      EXPECT_TRUE(end_of_sequence);
+    }
+
+    VariantTensorData data;
+    VariantTensorDataWriter writer(&data);
+    TF_ASSERT_OK(iterator->Save(serialization_ctx.get(), &writer));
+    TF_ASSERT_OK(writer.Flush());
+    VariantTensorDataReader reader(&data);
+    TF_ASSERT_OK(iterator->Restore(iterator_ctx.get(), &reader));
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(SparseTensorSliceDatasetOpTest, IteratorRoundtripTest,
+                        ::testing::ValuesIn(TestCases));
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/take_dataset_op.cc b/tensorflow/core/kernels/data/take_dataset_op.cc
index 0a3d5869534ddad9f7ed295171d8deefc2154107..0dd0c0c80de194c60aa7d268cb40317d722956c4 100644
--- a/tensorflow/core/kernels/data/take_dataset_op.cc
+++ b/tensorflow/core/kernels/data/take_dataset_op.cc
@@ -50,11 +50,11 @@ class TakeDatasetOp : public UnaryDatasetOpKernel {
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       if (count_ == 0) {
-        return std::unique_ptr<IteratorBase>(
-            new EmptyIterator({this, strings::StrCat(prefix, "::EmptyTake")}));
+        return absl::make_unique<EmptyIterator>(EmptyIterator::Params{
+            this, strings::StrCat(prefix, "::EmptyTake")});
       } else {
-        return std::unique_ptr<IteratorBase>(new FiniteIterator(
-            {this, strings::StrCat(prefix, "::FiniteTake")}));
+        return absl::make_unique<FiniteIterator>(FiniteIterator::Params{
+            this, strings::StrCat(prefix, "::FiniteTake")});
       }
     }
 
diff --git a/tensorflow/core/kernels/data/tensor_dataset_op.cc b/tensorflow/core/kernels/data/tensor_dataset_op.cc
index 98c23f23b202dee580fb89f5473f69c61d57c640..a44dbd0d4d436e3eb85adbe9db6dc39bde0419e8 100644
--- a/tensorflow/core/kernels/data/tensor_dataset_op.cc
+++ b/tensorflow/core/kernels/data/tensor_dataset_op.cc
@@ -50,8 +50,8 @@ class TensorDatasetOp : public DatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::FromTensor")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::FromTensor")});
     }
 
     const DataTypeVector& output_dtypes() const override { return dtypes_; }
diff --git a/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
index 4ba2bde718a6351ff13bc17cf14ae5c60332c6ca..97a1ec402f2abff8627c65e14d2af39e0693afaa 100644
--- a/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
+++ b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/util/batch_util.h"
 
 namespace tensorflow {
@@ -28,7 +29,10 @@ namespace {
 class TensorSliceDatasetOp : public DatasetOpKernel {
  public:
   explicit TensorSliceDatasetOp(OpKernelConstruction* ctx)
-      : DatasetOpKernel(ctx) {}
+      : DatasetOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("Toutput_types", &output_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+  }
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override {
     OpInputList inputs;
@@ -50,6 +54,10 @@ class TensorSliceDatasetOp : public DatasetOpKernel {
               "All components must have the same size in the 0th dimension"));
     }
     *output = new Dataset(ctx, std::move(components));
+    OP_REQUIRES_OK(ctx,
+                   VerifyTypesMatch((*output)->output_dtypes(), output_types_));
+    OP_REQUIRES_OK(ctx, VerifyShapesCompatible((*output)->output_shapes(),
+                                               output_shapes_));
   }
 
  private:
@@ -71,8 +79,8 @@ class TensorSliceDatasetOp : public DatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::TensorSlice")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::TensorSlice")});
     }
 
     const DataTypeVector& output_dtypes() const override { return dtypes_; }
@@ -170,6 +178,9 @@ class TensorSliceDatasetOp : public DatasetOpKernel {
     DataTypeVector dtypes_;
     std::vector<PartialTensorShape> shapes_;
   };
+
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("TensorSliceDataset").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/data/tensor_slice_dataset_op_test.cc b/tensorflow/core/kernels/data/tensor_slice_dataset_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b0e09b45454c258e0e6e500c6bdd2ad580819276
--- /dev/null
+++ b/tensorflow/core/kernels/data/tensor_slice_dataset_op_test.cc
@@ -0,0 +1,630 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/framework/variant_tensor_data.h"
+#include "tensorflow/core/kernels/data/dataset_test_base.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
+#include "tensorflow/core/kernels/data/iterator_ops.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+constexpr char kNodeName[] = "tensor_slice_dataset";
+constexpr char kOpName[] = "TensorSliceDataset";
+
+class TensorSliceDatasetOpTest : public DatasetOpsTestBase {
+ protected:
+  // Creates a new TensorSliceDataset op kernel.
+  Status CreateTensorSliceDatasetKernel(
+      DataTypeVector dtypes, std::vector<PartialTensorShape> shapes,
+      std::unique_ptr<OpKernel> *tensor_dataset_kernel) {
+    std::vector<string> components;
+    components.reserve(dtypes.size());
+    for (int i = 0; i < dtypes.size(); i++) {
+      components.emplace_back(strings::StrCat("component_", i));
+    }
+
+    node_def_ = test::function::NDef(
+        kNodeName, kOpName, components,
+        {{"Toutput_types", dtypes}, {"output_shapes", shapes}});
+    TF_RETURN_IF_ERROR(CreateOpKernel(node_def_, tensor_dataset_kernel));
+    return Status::OK();
+  }
+
+  // Creates a new TensorSliceDataset op kernel context.
+  Status CreateTensorSliceDatasetContext(
+      OpKernel *const tensor_dataset_kernel,
+      gtl::InlinedVector<TensorValue, 4> *inputs,
+      std::unique_ptr<OpKernelContext> *context) {
+    TF_RETURN_IF_ERROR(CheckOpKernelInput(*tensor_dataset_kernel, *inputs));
+    TF_RETURN_IF_ERROR(
+        CreateOpKernelContext(tensor_dataset_kernel, inputs, context));
+    return Status::OK();
+  }
+
+ private:
+  NodeDef node_def_;
+};
+
+struct TestParam {
+  std::vector<Tensor> components;
+  std::vector<Tensor> expected_outputs;
+  std::vector<int> breakpoints;
+} TestCases[] = {
+    // A single tuple of tensors.
+    {{{DatasetOpsTestBase::CreateTensor<int64>(TensorShape({2}), {1, 2}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape({2, 2}),
+                                               {1, 2, 3, 4}),
+       DatasetOpsTestBase::CreateTensor<double>(TensorShape({2, 1}),
+                                                {37.0, 38.0}),
+       DatasetOpsTestBase::CreateTensor<string>(TensorShape({2, 1}),
+                                                {"a", "b"})}},  // components
+     {{DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {1}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape({2}), {1, 2}),
+       DatasetOpsTestBase::CreateTensor<double>(TensorShape({1}), {37.0}),
+       DatasetOpsTestBase::CreateTensor<string>(TensorShape({1}), {"a"}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape({}), {2}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape({2}), {3, 4}),
+       DatasetOpsTestBase::CreateTensor<double>(TensorShape({1}), {38.0}),
+       DatasetOpsTestBase::CreateTensor<string>(TensorShape({1}),
+                                                {"b"})}},  // expected_outputs
+     {{0, 1, 3}}},                                         //  breakpoints
+    // Nested tensors
+    {{{DatasetOpsTestBase::CreateTensor<Variant>(
+           TensorShape({2, 1}),
+           {DatasetOpsTestBase::CreateTensor<double>(TensorShape({2, 2}),
+                                                     {1.0, 2.0, 3.0, 4.0}),
+            DatasetOpsTestBase::CreateTensor<double>(TensorShape({2, 2}),
+                                                     {5.0, 6.0, 7.0, 8.0})}),
+       DatasetOpsTestBase::CreateTensor<Variant>(
+           TensorShape({2, 1}), {DatasetOpsTestBase::CreateTensor<string>(
+                                     TensorShape({1, 2}), {"a", "b"}),
+                                 DatasetOpsTestBase::CreateTensor<string>(
+                                     TensorShape({1, 2}), {"c", "d"})}),
+       DatasetOpsTestBase::CreateTensor<int64>(
+           TensorShape({2, 3}), {1, 2, 3, 4, 5, 6})}},  // components
+     {{DatasetOpsTestBase::CreateTensor<Variant>(
+           TensorShape({1}), {DatasetOpsTestBase::CreateTensor<double>(
+                                 TensorShape({2, 2}), {1.0, 2.0, 3.0, 4.0})}),
+       DatasetOpsTestBase::CreateTensor<Variant>(
+           TensorShape({1}), {DatasetOpsTestBase::CreateTensor<string>(
+                                 TensorShape({1, 2}), {"a", "b"})}),
+       DatasetOpsTestBase::CreateTensor<int64>(TensorShape({3}), {1, 2, 3}),
+       DatasetOpsTestBase::CreateTensor<Variant>(
+           TensorShape({1}), {DatasetOpsTestBase::CreateTensor<double>(
+                                 TensorShape({2, 2}), {5.0, 6.0, 7.0, 8.0})}),
+       DatasetOpsTestBase::CreateTensor<Variant>(
+           TensorShape({1}), {DatasetOpsTestBase::CreateTensor<string>(
+                                 TensorShape({1, 2}), {"c", "d"})}),
+       DatasetOpsTestBase::CreateTensor<int64>(
+           TensorShape({3}), {4, 5, 6})}},  // expected_outputs
+     {{0, 1, 2}}}                           // breakpoints
+};
+
+struct DatasetGetNextTest : TensorSliceDatasetOpTest,
+                            ::testing::WithParamInterface<TestParam> {};
+
+TEST_P(DatasetGetNextTest, GetNext) {
+  int thread_num = 2, cpu_num = 2;
+  std::vector<Tensor> components = GetParam().components;
+  std::vector<Tensor> expected_outputs = GetParam().expected_outputs;
+  size_t num_tensors_per_slice = components.size();
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  DataTypeVector dtypes;
+  std::vector<PartialTensorShape> shapes;
+  gtl::InlinedVector<TensorValue, 4> inputs;
+  for (auto &component : components) {
+    inputs.push_back(&component);
+    dtypes.push_back(component.dtype());
+  }
+  for (int i = 0; i < num_tensors_per_slice; ++i) {
+    shapes.emplace_back(expected_outputs[i].shape());
+  }
+
+  std::unique_ptr<OpKernel> tensor_slice_dataset_kernel;
+  TF_ASSERT_OK(CreateTensorSliceDatasetKernel(dtypes, shapes,
+                                              &tensor_slice_dataset_kernel));
+  std::unique_ptr<OpKernelContext> tensor_slice_dataset_context;
+  TF_ASSERT_OK(
+      CreateTensorSliceDatasetContext(tensor_slice_dataset_kernel.get(),
+                                      &inputs, &tensor_slice_dataset_context));
+  DatasetBase *tensor_slice_dataset;
+  TF_ASSERT_OK(CreateDataset(tensor_slice_dataset_kernel.get(),
+                             tensor_slice_dataset_context.get(),
+                             &tensor_slice_dataset));
+  core::ScopedUnref scored_unref(tensor_slice_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_context;
+  TF_ASSERT_OK(CreateIteratorContext(tensor_slice_dataset_context.get(),
+                                     &iterator_context));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(tensor_slice_dataset->MakeIterator(iterator_context.get(),
+                                                  "Iterator", &iterator));
+  bool end_of_sequence = false;
+  std::vector<Tensor> out_tensors;
+  int cur_slice = 0;
+
+  while (!end_of_sequence) {
+    TF_EXPECT_OK(iterator->GetNext(iterator_context.get(), &out_tensors,
+                                   &end_of_sequence));
+    for (int i = 0; i < out_tensors.size(); ++i) {
+      EXPECT_LT(i + num_tensors_per_slice * cur_slice, expected_outputs.size());
+      if (out_tensors[i].dtype() == DT_VARIANT) {
+        // Currently `ExpectEqual()` does not support the variant tensor
+        // yet, so we manually cast the variant to numeric/string tensor.
+        const Tensor *output = out_tensors[i].scalar<Variant>()().get<Tensor>();
+        const Tensor *expected_output =
+            expected_outputs[i + num_tensors_per_slice * cur_slice]
+                .scalar<Variant>()()
+                .get<Tensor>();
+        TF_EXPECT_OK(ExpectEqual(*output, *expected_output));
+      } else {
+        TF_EXPECT_OK(ExpectEqual(
+            out_tensors[i],
+            expected_outputs[i + num_tensors_per_slice * cur_slice]));
+      }
+    }
+    out_tensors.clear();
+    cur_slice++;
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(TensorDatasetSliceOpTest, DatasetGetNextTest,
+                        ::testing::ValuesIn(TestCases));
+
+TEST_F(TensorSliceDatasetOpTest, DatasetName) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  Tensor t1 = CreateTensor<int64>(TensorShape({2, 2}), {1, 2, 3, 4});
+  Tensor t2 = CreateTensor<int64>(TensorShape({2, 2}), {5, 6, 7, 8});
+  gtl::InlinedVector<TensorValue, 4> inputs = {&t1, &t2};
+  DataTypeVector dtypes({DT_INT64, DT_INT64});
+  std::vector<PartialTensorShape> shapes = {PartialTensorShape({2}),
+                                            PartialTensorShape({2})};
+  std::unique_ptr<OpKernel> tensor_slice_dataset_kernel;
+  TF_ASSERT_OK(CreateTensorSliceDatasetKernel(dtypes, shapes,
+                                              &tensor_slice_dataset_kernel));
+  std::unique_ptr<OpKernelContext> tensor_slice_dataset_context;
+  TF_ASSERT_OK(
+      CreateTensorSliceDatasetContext(tensor_slice_dataset_kernel.get(),
+                                      &inputs, &tensor_slice_dataset_context));
+  DatasetBase *tensor_slice_dataset;
+  TF_ASSERT_OK(CreateDataset(tensor_slice_dataset_kernel.get(),
+                             tensor_slice_dataset_context.get(),
+                             &tensor_slice_dataset));
+  core::ScopedUnref scored_unref(tensor_slice_dataset);
+
+  EXPECT_EQ(tensor_slice_dataset->name(), kOpName);
+}
+
+struct DatasetOutputDtypesTest : TensorSliceDatasetOpTest,
+                                 ::testing::WithParamInterface<TestParam> {};
+
+TEST_P(DatasetOutputDtypesTest, DatasetOutputDtypes) {
+  int thread_num = 2, cpu_num = 2;
+  std::vector<Tensor> components = GetParam().components;
+  std::vector<Tensor> expected_outputs = GetParam().expected_outputs;
+  size_t num_tensors_per_slice = components.size();
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  DataTypeVector dtypes;
+  std::vector<PartialTensorShape> shapes;
+  gtl::InlinedVector<TensorValue, 4> inputs;
+  for (auto &component : components) {
+    inputs.emplace_back(&component);
+    dtypes.emplace_back(component.dtype());
+  }
+  for (int i = 0; i < num_tensors_per_slice; ++i) {
+    shapes.emplace_back(expected_outputs[i].shape());
+  }
+
+  std::unique_ptr<OpKernel> tensor_slice_dataset_kernel;
+  TF_ASSERT_OK(CreateTensorSliceDatasetKernel(dtypes, shapes,
+                                              &tensor_slice_dataset_kernel));
+  std::unique_ptr<OpKernelContext> tensor_slice_dataset_context;
+  TF_ASSERT_OK(
+      CreateTensorSliceDatasetContext(tensor_slice_dataset_kernel.get(),
+                                      &inputs, &tensor_slice_dataset_context));
+  DatasetBase *tensor_slice_dataset;
+  TF_ASSERT_OK(CreateDataset(tensor_slice_dataset_kernel.get(),
+                             tensor_slice_dataset_context.get(),
+                             &tensor_slice_dataset));
+  core::ScopedUnref scored_unref(tensor_slice_dataset);
+
+  const DataTypeVector produced_output_dtypes =
+      tensor_slice_dataset->output_dtypes();
+  EXPECT_EQ(produced_output_dtypes.size(), num_tensors_per_slice);
+  for (int i = 0; i < num_tensors_per_slice; ++i) {
+    EXPECT_EQ(produced_output_dtypes[i], expected_outputs[i].dtype());
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(TensorDatasetSliceOpTest, DatasetOutputDtypesTest,
+                        ::testing::ValuesIn(TestCases));
+
+struct DatasetOutputShapesTest : TensorSliceDatasetOpTest,
+                                 ::testing::WithParamInterface<TestParam> {};
+
+TEST_P(DatasetOutputShapesTest, DatasetOutputShapes) {
+  int thread_num = 2, cpu_num = 2;
+  std::vector<Tensor> components = GetParam().components;
+  std::vector<Tensor> expected_outputs = GetParam().expected_outputs;
+  size_t num_tensors_per_slice = components.size();
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  DataTypeVector dtypes;
+  std::vector<PartialTensorShape> shapes;
+  gtl::InlinedVector<TensorValue, 4> inputs;
+  for (auto &component : components) {
+    inputs.emplace_back(&component);
+    dtypes.emplace_back(component.dtype());
+  }
+  for (int i = 0; i < num_tensors_per_slice; ++i) {
+    shapes.emplace_back(expected_outputs[i].shape());
+  }
+  std::unique_ptr<OpKernel> tensor_slice_dataset_kernel;
+  TF_ASSERT_OK(CreateTensorSliceDatasetKernel(dtypes, shapes,
+                                              &tensor_slice_dataset_kernel));
+  std::unique_ptr<OpKernelContext> tensor_slice_dataset_context;
+  TF_ASSERT_OK(
+      CreateTensorSliceDatasetContext(tensor_slice_dataset_kernel.get(),
+                                      &inputs, &tensor_slice_dataset_context));
+  DatasetBase *tensor_slice_dataset;
+  TF_ASSERT_OK(CreateDataset(tensor_slice_dataset_kernel.get(),
+                             tensor_slice_dataset_context.get(),
+                             &tensor_slice_dataset));
+  core::ScopedUnref scored_unref(tensor_slice_dataset);
+
+  const std::vector<PartialTensorShape> produced_output_shapes =
+      tensor_slice_dataset->output_shapes();
+  std::vector<PartialTensorShape> expected_output_shapes;
+  EXPECT_EQ(produced_output_shapes.size(), num_tensors_per_slice);
+  for (int i = 0; i < num_tensors_per_slice; ++i) {
+    EXPECT_TRUE(
+        produced_output_shapes[i].IsIdenticalTo(expected_outputs[i].shape()));
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(TensorDatasetSliceOpTest, DatasetOutputShapesTest,
+                        ::testing::ValuesIn(TestCases));
+
+struct DatasetCardinalityTest : TensorSliceDatasetOpTest,
+                                ::testing::WithParamInterface<TestParam> {};
+
+TEST_P(DatasetCardinalityTest, Cardinality) {
+  int thread_num = 2, cpu_num = 2;
+  std::vector<Tensor> components = GetParam().components;
+  std::vector<Tensor> expected_outputs = GetParam().expected_outputs;
+  size_t num_tensors_per_slice = components.size();
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  DataTypeVector dtypes;
+  std::vector<PartialTensorShape> shapes;
+  gtl::InlinedVector<TensorValue, 4> inputs;
+  for (auto &component : components) {
+    inputs.emplace_back(&component);
+    dtypes.emplace_back(component.dtype());
+  }
+  for (int i = 0; i < num_tensors_per_slice; ++i) {
+    shapes.emplace_back(expected_outputs[i].shape());
+  }
+  std::unique_ptr<OpKernel> tensor_slice_dataset_kernel;
+  TF_ASSERT_OK(CreateTensorSliceDatasetKernel(dtypes, shapes,
+                                              &tensor_slice_dataset_kernel));
+  std::unique_ptr<OpKernelContext> tensor_slice_dataset_context;
+  TF_ASSERT_OK(
+      CreateTensorSliceDatasetContext(tensor_slice_dataset_kernel.get(),
+                                      &inputs, &tensor_slice_dataset_context));
+  DatasetBase *tensor_slice_dataset;
+  TF_ASSERT_OK(CreateDataset(tensor_slice_dataset_kernel.get(),
+                             tensor_slice_dataset_context.get(),
+                             &tensor_slice_dataset));
+  core::ScopedUnref scored_unref(tensor_slice_dataset);
+
+  EXPECT_EQ(tensor_slice_dataset->Cardinality(), inputs[0].tensor->dim_size(0));
+}
+
+INSTANTIATE_TEST_CASE_P(TensorDatasetSliceOpTest, DatasetCardinalityTest,
+                        ::testing::ValuesIn(TestCases));
+
+TEST_F(TensorSliceDatasetOpTest, DatasetSave) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  Tensor t1 = CreateTensor<int64>(TensorShape({2, 2}), {1, 2, 3, 4});
+  Tensor t2 = CreateTensor<int64>(TensorShape({2, 2}), {5, 6, 7, 8});
+  gtl::InlinedVector<TensorValue, 4> inputs = {&t1, &t2};
+  DataTypeVector dtypes({DT_INT64, DT_INT64});
+  std::vector<PartialTensorShape> shapes = {PartialTensorShape({2}),
+                                            PartialTensorShape({2})};
+  std::unique_ptr<OpKernel> tensor_slice_dataset_kernel;
+  TF_ASSERT_OK(CreateTensorSliceDatasetKernel(dtypes, shapes,
+                                              &tensor_slice_dataset_kernel));
+  std::unique_ptr<OpKernelContext> tensor_slice_dataset_context;
+  TF_ASSERT_OK(
+      CreateTensorSliceDatasetContext(tensor_slice_dataset_kernel.get(),
+                                      &inputs, &tensor_slice_dataset_context));
+  DatasetBase *tensor_slice_dataset;
+  TF_ASSERT_OK(CreateDataset(tensor_slice_dataset_kernel.get(),
+                             tensor_slice_dataset_context.get(),
+                             &tensor_slice_dataset));
+  core::ScopedUnref scored_unref(tensor_slice_dataset);
+
+  std::unique_ptr<SerializationContext> serialization_context;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_context));
+  VariantTensorData data;
+  VariantTensorDataWriter writer(&data);
+  TF_ASSERT_OK(
+      tensor_slice_dataset->Save(serialization_context.get(), &writer));
+  TF_ASSERT_OK(writer.Flush());
+}
+
+struct IteratorOutputDtypesTest : TensorSliceDatasetOpTest,
+                                  ::testing::WithParamInterface<TestParam> {};
+
+TEST_P(IteratorOutputDtypesTest, IteratorOutputDtypes) {
+  int thread_num = 2, cpu_num = 2;
+  std::vector<Tensor> components = GetParam().components;
+  std::vector<Tensor> expected_outputs = GetParam().expected_outputs;
+  size_t num_tensors_per_slice = components.size();
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  DataTypeVector dtypes;
+  std::vector<PartialTensorShape> shapes;
+  gtl::InlinedVector<TensorValue, 4> inputs;
+  for (auto &component : components) {
+    inputs.emplace_back(&component);
+    dtypes.emplace_back(component.dtype());
+  }
+  for (int i = 0; i < num_tensors_per_slice; ++i) {
+    shapes.emplace_back(expected_outputs[i].shape());
+  }
+
+  std::unique_ptr<OpKernel> tensor_slice_dataset_kernel;
+  TF_ASSERT_OK(CreateTensorSliceDatasetKernel(dtypes, shapes,
+                                              &tensor_slice_dataset_kernel));
+  std::unique_ptr<OpKernelContext> tensor_slice_dataset_context;
+  TF_ASSERT_OK(
+      CreateTensorSliceDatasetContext(tensor_slice_dataset_kernel.get(),
+                                      &inputs, &tensor_slice_dataset_context));
+  DatasetBase *tensor_slice_dataset;
+  TF_ASSERT_OK(CreateDataset(tensor_slice_dataset_kernel.get(),
+                             tensor_slice_dataset_context.get(),
+                             &tensor_slice_dataset));
+  core::ScopedUnref scored_unref(tensor_slice_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_context;
+  TF_ASSERT_OK(CreateIteratorContext(tensor_slice_dataset_context.get(),
+                                     &iterator_context));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(tensor_slice_dataset->MakeIterator(iterator_context.get(),
+                                                  "Iterator", &iterator));
+  const DataTypeVector produced_output_dtypes = iterator->output_dtypes();
+
+  EXPECT_EQ(produced_output_dtypes.size(), num_tensors_per_slice);
+  for (int i = 0; i < num_tensors_per_slice; ++i) {
+    EXPECT_EQ(produced_output_dtypes[i], expected_outputs[i].dtype());
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(TensorDatasetSliceOpTest, IteratorOutputDtypesTest,
+                        ::testing::ValuesIn(TestCases));
+
+struct IteratorOutputShapesTest : TensorSliceDatasetOpTest,
+                                  ::testing::WithParamInterface<TestParam> {};
+
+TEST_P(IteratorOutputShapesTest, IteratorOutputShapes) {
+  int thread_num = 2, cpu_num = 2;
+  std::vector<Tensor> components = GetParam().components;
+  std::vector<Tensor> expected_outputs = GetParam().expected_outputs;
+  size_t num_tensors_per_slice = components.size();
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  DataTypeVector dtypes;
+  std::vector<PartialTensorShape> shapes;
+  gtl::InlinedVector<TensorValue, 4> inputs;
+  for (auto &component : components) {
+    inputs.emplace_back(&component);
+    dtypes.emplace_back(component.dtype());
+  }
+  for (int i = 0; i < num_tensors_per_slice; ++i) {
+    shapes.emplace_back(expected_outputs[i].shape());
+  }
+
+  std::unique_ptr<OpKernel> tensor_slice_dataset_kernel;
+  TF_ASSERT_OK(CreateTensorSliceDatasetKernel(dtypes, shapes,
+                                              &tensor_slice_dataset_kernel));
+  std::unique_ptr<OpKernelContext> tensor_slice_dataset_context;
+  TF_ASSERT_OK(
+      CreateTensorSliceDatasetContext(tensor_slice_dataset_kernel.get(),
+                                      &inputs, &tensor_slice_dataset_context));
+  DatasetBase *tensor_slice_dataset;
+  TF_ASSERT_OK(CreateDataset(tensor_slice_dataset_kernel.get(),
+                             tensor_slice_dataset_context.get(),
+                             &tensor_slice_dataset));
+  core::ScopedUnref scored_unref(tensor_slice_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_context;
+  TF_ASSERT_OK(CreateIteratorContext(tensor_slice_dataset_context.get(),
+                                     &iterator_context));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(tensor_slice_dataset->MakeIterator(iterator_context.get(),
+                                                  "Iterator", &iterator));
+  const std::vector<PartialTensorShape> produced_output_shapes =
+      iterator->output_shapes();
+  EXPECT_EQ(produced_output_shapes.size(), num_tensors_per_slice);
+  for (int i = 0; i < num_tensors_per_slice; ++i) {
+    EXPECT_TRUE(
+        produced_output_shapes[i].IsIdenticalTo(expected_outputs[i].shape()));
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(TensorDatasetSliceOpTest, IteratorOutputShapesTest,
+                        ::testing::ValuesIn(TestCases));
+
+TEST_F(TensorSliceDatasetOpTest, IteratorOutputPrefix) {
+  int thread_num = 2, cpu_num = 2;
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  Tensor t1 = CreateTensor<int64>(TensorShape({2, 2}), {1, 2, 3, 4});
+  Tensor t2 = CreateTensor<int64>(TensorShape({2, 2}), {5, 6, 7, 8});
+  gtl::InlinedVector<TensorValue, 4> inputs = {&t1, &t2};
+  DataTypeVector dtypes({DT_INT64, DT_INT64});
+  std::vector<PartialTensorShape> shapes = {PartialTensorShape({2}),
+                                            PartialTensorShape({2})};
+  std::unique_ptr<OpKernel> tensor_slice_dataset_kernel;
+  TF_ASSERT_OK(CreateTensorSliceDatasetKernel(dtypes, shapes,
+                                              &tensor_slice_dataset_kernel));
+  std::unique_ptr<OpKernelContext> tensor_slice_dataset_context;
+  TF_ASSERT_OK(
+      CreateTensorSliceDatasetContext(tensor_slice_dataset_kernel.get(),
+                                      &inputs, &tensor_slice_dataset_context));
+  DatasetBase *tensor_slice_dataset;
+  TF_ASSERT_OK(CreateDataset(tensor_slice_dataset_kernel.get(),
+                             tensor_slice_dataset_context.get(),
+                             &tensor_slice_dataset));
+  core::ScopedUnref scored_unref(tensor_slice_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_context;
+  TF_ASSERT_OK(CreateIteratorContext(tensor_slice_dataset_context.get(),
+                                     &iterator_context));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(tensor_slice_dataset->MakeIterator(iterator_context.get(),
+                                                  "Iterator", &iterator));
+  EXPECT_EQ(iterator->prefix(), "Iterator::TensorSlice");
+}
+
+struct IteratorRoundtripTest : TensorSliceDatasetOpTest,
+                               ::testing::WithParamInterface<TestParam> {};
+
+TEST_P(IteratorRoundtripTest, Roundtrip) {
+  int thread_num = 2, cpu_num = 2;
+  std::vector<Tensor> components = GetParam().components;
+  std::vector<Tensor> expected_outputs = GetParam().expected_outputs;
+  std::vector<int> breakpoints = GetParam().breakpoints;
+  size_t num_tensors_per_slice = components.size();
+
+  TF_ASSERT_OK(InitThreadPool(thread_num));
+  TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num));
+
+  DataTypeVector dtypes;
+  std::vector<PartialTensorShape> shapes;
+  gtl::InlinedVector<TensorValue, 4> inputs;
+  for (auto &component : components) {
+    inputs.emplace_back(&component);
+    dtypes.emplace_back(component.dtype());
+  }
+  for (int i = 0; i < num_tensors_per_slice; ++i) {
+    shapes.emplace_back(expected_outputs[i].shape());
+  }
+
+  std::unique_ptr<OpKernel> tensor_slice_dataset_kernel;
+  TF_ASSERT_OK(CreateTensorSliceDatasetKernel(dtypes, shapes,
+                                              &tensor_slice_dataset_kernel));
+  std::unique_ptr<OpKernelContext> tensor_slice_dataset_context;
+  TF_ASSERT_OK(
+      CreateTensorSliceDatasetContext(tensor_slice_dataset_kernel.get(),
+                                      &inputs, &tensor_slice_dataset_context));
+  DatasetBase *tensor_slice_dataset;
+  TF_ASSERT_OK(CreateDataset(tensor_slice_dataset_kernel.get(),
+                             tensor_slice_dataset_context.get(),
+                             &tensor_slice_dataset));
+  core::ScopedUnref scored_unref(tensor_slice_dataset);
+
+  std::unique_ptr<IteratorContext> iterator_context;
+  TF_ASSERT_OK(CreateIteratorContext(tensor_slice_dataset_context.get(),
+                                     &iterator_context));
+  std::unique_ptr<IteratorBase> iterator;
+  TF_ASSERT_OK(tensor_slice_dataset->MakeIterator(iterator_context.get(),
+                                                  "Iterator", &iterator));
+  std::unique_ptr<SerializationContext> serialization_context;
+  TF_ASSERT_OK(CreateSerializationContext(&serialization_context));
+
+  int cur_iteration = 0;
+  bool end_of_sequence = false;
+  int64 num_slices = inputs[0].tensor->dim_size(0);
+  std::vector<Tensor> out_tensors;
+
+  for (int breakpoint : breakpoints) {
+    while (cur_iteration < breakpoint) {
+      TF_EXPECT_OK(iterator->GetNext(iterator_context.get(), &out_tensors,
+                                     &end_of_sequence));
+      cur_iteration++;
+    }
+
+    if (breakpoint == 0) {
+      EXPECT_FALSE(end_of_sequence);
+    } else if (breakpoint <= num_slices) {
+      for (int i = 0; i < out_tensors.size(); ++i) {
+        if (out_tensors[i].dtype() == DT_VARIANT) {
+          const Tensor *output =
+              out_tensors[i].scalar<Variant>()().get<Tensor>();
+          const Tensor *expected_output =
+              expected_outputs[i + num_tensors_per_slice * (cur_iteration - 1)]
+                  .scalar<Variant>()()
+                  .get<Tensor>();
+          TF_EXPECT_OK(ExpectEqual(*output, *expected_output));
+        } else {
+          TF_EXPECT_OK(ExpectEqual(
+              out_tensors[i], expected_outputs[i + num_tensors_per_slice *
+                                                       (cur_iteration - 1)]));
+        }
+      }
+    } else {
+      EXPECT_TRUE(end_of_sequence);
+    }
+
+    VariantTensorData data;
+    VariantTensorDataWriter writer(&data);
+    TF_ASSERT_OK(iterator->Save(serialization_context.get(), &writer));
+    TF_ASSERT_OK(writer.Flush());
+    VariantTensorDataReader reader(&data);
+    TF_ASSERT_OK(iterator->Restore(iterator_context.get(), &reader));
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(TensorDatasetSliceOpTest, IteratorRoundtripTest,
+                        ::testing::ValuesIn(TestCases));
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/window_dataset.cc b/tensorflow/core/kernels/data/window_dataset.cc
index c295631550aa008ccbf1abee0a91b27d64a6ba35..dc27702f1efb6f53cabe1fdf305e7e715aa51180 100644
--- a/tensorflow/core/kernels/data/window_dataset.cc
+++ b/tensorflow/core/kernels/data/window_dataset.cc
@@ -31,8 +31,8 @@ class WindowDataset : public DatasetBase {
 
   std::unique_ptr<IteratorBase> MakeIteratorInternal(
       const string& prefix) const override {
-    return std::unique_ptr<IteratorBase>(
-        new Iterator({this, strings::StrCat(prefix, "::Window")}));
+    return absl::make_unique<Iterator>(
+        Iterator::Params{this, strings::StrCat(prefix, "::Window")});
   }
 
   const DataTypeVector& output_dtypes() const override { return output_types_; }
diff --git a/tensorflow/core/kernels/data/window_dataset_op.cc b/tensorflow/core/kernels/data/window_dataset_op.cc
index ae13ae5da8d4c093bdb4d6e168584bda234e4502..0b24c1189148a8d2133dc33dcba7c80324620589 100644
--- a/tensorflow/core/kernels/data/window_dataset_op.cc
+++ b/tensorflow/core/kernels/data/window_dataset_op.cc
@@ -78,8 +78,8 @@ class WindowDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(new Iterator(
-          Iterator::Params{this, strings::StrCat(prefix, "::Window")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::Window")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/data/zip_dataset_op.cc b/tensorflow/core/kernels/data/zip_dataset_op.cc
index 1760e63a9e1c6b6262c19baa8354052d7d73fd3c..cdc2969fc20573fa705dc2ba7a44955e6e062fd4 100644
--- a/tensorflow/core/kernels/data/zip_dataset_op.cc
+++ b/tensorflow/core/kernels/data/zip_dataset_op.cc
@@ -62,8 +62,8 @@ class ZipDatasetOp : public DatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::Zip")}));
+      return absl::make_unique<Iterator>(
+          Iterator::Params{this, strings::StrCat(prefix, "::Zip")});
     }
 
     const DataTypeVector& output_dtypes() const override {
diff --git a/tensorflow/core/kernels/decode_bmp_op.cc b/tensorflow/core/kernels/decode_bmp_op.cc
index ae451be7e21a119a309a74c3312eee4b24256248..c75fc94bc1c73ce5435271c7940e0779b1be3127 100644
--- a/tensorflow/core/kernels/decode_bmp_op.cc
+++ b/tensorflow/core/kernels/decode_bmp_op.cc
@@ -16,13 +16,13 @@ limitations under the License.
 // See docs in ../ops/image_ops.cc
 
 #include <memory>
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/types.pb.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 
diff --git a/tensorflow/core/kernels/decode_csv_op.cc b/tensorflow/core/kernels/decode_csv_op.cc
index 6bfb5bd5bc0ae50797080ca3540133b0081f0b13..ba6369533adbbde30f3661a3d8577936de1038fa 100644
--- a/tensorflow/core/kernels/decode_csv_op.cc
+++ b/tensorflow/core/kernels/decode_csv_op.cc
@@ -145,7 +145,7 @@ class DecodeCSVOp : public OpKernel {
               output[f]->flat<float>()(i) = record_defaults[f].flat<float>()(0);
             } else {
               float value;
-              OP_REQUIRES(ctx, strings::safe_strtof(fields[f].c_str(), &value),
+              OP_REQUIRES(ctx, strings::safe_strtof(fields[f], &value),
                           errors::InvalidArgument(
                               "Field ", f, " in record ", i,
                               " is not a valid float: ", fields[f]));
@@ -165,7 +165,7 @@ class DecodeCSVOp : public OpKernel {
                   record_defaults[f].flat<double>()(0);
             } else {
               double value;
-              OP_REQUIRES(ctx, strings::safe_strtod(fields[f].c_str(), &value),
+              OP_REQUIRES(ctx, strings::safe_strtod(fields[f], &value),
                           errors::InvalidArgument(
                               "Field ", f, " in record ", i,
                               " is not a valid double: ", fields[f]));
diff --git a/tensorflow/core/kernels/decode_proto_op.cc b/tensorflow/core/kernels/decode_proto_op.cc
index b54e1ea8ac233f1ca48a65e8e1b7e547643a45a2..06dc766794caf71f3792460f5d6e4b39864d3266 100644
--- a/tensorflow/core/kernels/decode_proto_op.cc
+++ b/tensorflow/core/kernels/decode_proto_op.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_types.h"
@@ -625,8 +626,37 @@ class DecodeProtoOp : public OpKernel {
     // Gather the field descriptors and check that requested output types match.
     int field_index = 0;
     std::vector<const FieldDescriptor*> field_descs;
+    std::vector<const FieldDescriptor*> exts;
+    absl::flat_hash_map<string, const FieldDescriptor*> ext_name_to_field;
+    std::vector<const FieldDescriptor*>::iterator ext_it = exts.begin();
     for (const string& name : field_names) {
       auto fd = message_desc->FindFieldByName(name);
+      if (fd == nullptr) {
+        // If field can't be found in original message, try to find a matching
+        // extension (by its full_name). First check a hashmap for a matching
+        // extension, and if not found, then iterate through available
+        // extensions to find a match (updating the hashmap while iterating.)
+        auto lookup_result = ext_name_to_field.find(name);
+        if (lookup_result != ext_name_to_field.end()) {
+          fd = lookup_result->second;
+        } else {
+          if (ext_it == exts.begin()) {
+            desc_pool->FindAllExtensions(message_desc, &exts);
+            ext_it = exts.begin();
+          }
+          while (ext_it != exts.end()) {
+            auto ext_name = (*ext_it)->full_name();
+            auto ext_field = *ext_it;
+            ++ext_it;
+
+            ext_name_to_field.insert({ext_name, ext_field});
+            if (ext_name == name) {
+              fd = ext_field;
+              break;
+            }
+          }
+        }
+      }
       OP_REQUIRES(context, fd != nullptr,
                   errors::InvalidArgument("Unknown field: ", name,
                                           " in message type ", message_type));
diff --git a/tensorflow/core/kernels/decode_raw_op.cc b/tensorflow/core/kernels/decode_raw_op.cc
index eaef5a6097ff5a7235caba37edf6ef94d5860931..3dd019c3d203c63f055113bb992eb1f542e838ae 100644
--- a/tensorflow/core/kernels/decode_raw_op.cc
+++ b/tensorflow/core/kernels/decode_raw_op.cc
@@ -110,6 +110,8 @@ REGISTER(uint8);
 REGISTER(int16);
 REGISTER(int8);
 REGISTER(int64);
+REGISTER(complex64);
+REGISTER(complex128);
 
 #undef REGISTER
 
diff --git a/tensorflow/core/kernels/depthwise_conv_grad_op.cc b/tensorflow/core/kernels/depthwise_conv_grad_op.cc
index da3bdb475e274d73751e22334628e3431023b9e4..ab98cacd1a117022444386b9a718e173d68fa99d 100644
--- a/tensorflow/core/kernels/depthwise_conv_grad_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_grad_op.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <algorithm>
 #include <cmath>
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -25,7 +26,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/conv_grad_ops.h"
 #include "tensorflow/core/kernels/depthwise_conv_op.h"
 #include "tensorflow/core/kernels/ops_util.h"
@@ -633,7 +633,8 @@ class DepthwiseConv2dNativeBackpropInputOp : public OpKernel {
       // conv is supported.
       launcher_(context, use_cudnn_, cudnn_use_autotune_, out_backprop,
                 reshaped_filter, /*row_dilation=*/1, /*col_dilation=*/1,
-                stride_, stride_, padding_, in_backprop, data_format_);
+                stride_, stride_, padding_, /*explicit_paddings=*/{},
+                in_backprop, data_format_);
       return;
     }
 
@@ -1115,7 +1116,8 @@ class DepthwiseConv2dNativeBackpropFilterOp : public OpKernel {
       // conv is supported.
       launcher_(context, use_cudnn_, cudnn_use_autotune_, out_backprop, input,
                 /*row_dilation=*/1, /*col_dilation=*/1, stride_, stride_,
-                padding_, &reshaped_filter, data_format_);
+                padding_, /*explicit_paddings=*/{}, &reshaped_filter,
+                data_format_);
       return;
     }
 
diff --git a/tensorflow/core/kernels/depthwise_conv_op.cc b/tensorflow/core/kernels/depthwise_conv_op.cc
index f0902fdba6921b46fd7a0d0adb16e470ed83f65c..11c2b31633dd2186c729c725c4cda5816447954d 100644
--- a/tensorflow/core/kernels/depthwise_conv_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <cmath>
 #include <type_traits>
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -26,7 +27,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/conv_ops.h"
 #include "tensorflow/core/kernels/depthwise_conv_op.h"
 #include "tensorflow/core/kernels/ops_util.h"
@@ -404,7 +404,8 @@ class DepthwiseConv2dNativeOp : public BinaryOp<T> {
       // conv is supported.
       launcher_(context, use_cudnn_, cudnn_use_autotune_, input,
                 reshaped_filter, /*row_dilation=*/1, /*col_dilation=*/1,
-                stride_, stride_, padding_, output, data_format_);
+                stride_, stride_, padding_, /*explicit_paddings=*/{}, output,
+                data_format_);
       return;
     }
 
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.h
similarity index 98%
rename from tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
rename to tensorflow/core/kernels/depthwise_conv_op_gpu.h
index e811968d277ba3594341a59e8d6262cac637e602..098853e68430d425143d16ff2e8edbb9877f8e23 100644
--- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.h
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#ifndef TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_GPU_H_
+#define TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_GPU_H_
+
 #if GOOGLE_CUDA
 #define EIGEN_USE_GPU
 
@@ -38,7 +41,7 @@ using Eigen::GpuDevice;
 
 // Returns whether depthwise convolution forward or backward input pass can be
 // performed using the faster ('Small') variant of the kernel.
-EIGEN_DEVICE_FUNC bool CanLaunchDepthwiseConv2dGPUSmall(
+inline EIGEN_DEVICE_FUNC bool CanLaunchDepthwiseConv2dGPUSmall(
     const DepthwiseArgs& args) {
   return args.depth_multiplier == 1 && args.stride == 1 && args.in_rows <= 32 &&
          args.in_cols <= 32 && args.in_rows == args.out_rows &&
@@ -51,7 +54,7 @@ EIGEN_DEVICE_FUNC bool CanLaunchDepthwiseConv2dGPUSmall(
 
 // Returns whether depthwise convolution backward filter pass can be performed
 // using the faster ('Small') variant of the kernel.
-EIGEN_DEVICE_FUNC bool CanLaunchDepthwiseConv2dBackpropFilterGPUSmall(
+inline EIGEN_DEVICE_FUNC bool CanLaunchDepthwiseConv2dBackpropFilterGPUSmall(
     const DepthwiseArgs& args, const int block_height) {
   return args.depth_multiplier == 1 && args.stride == 1 && args.in_rows <= 32 &&
          args.in_cols <= 32 && args.in_rows == args.out_rows &&
@@ -652,13 +655,12 @@ struct PseudoHalfType<Eigen::half> {
 };
 }  // namespace detail
 
-namespace {
 // Maps to float if T is __half, and to T otherwise.
 template <typename T>
 using PseudoHalfType = typename detail::PseudoHalfType<T>::Type;
 
 // Returns whether the context's GPU supports efficient fp16 math.
-bool HasFastHalfMath(OpKernelContext* ctx) {
+inline bool HasFastHalfMath(OpKernelContext* ctx) {
   int major, minor;
   ctx->op_device_context()
       ->stream()
@@ -669,7 +671,6 @@ bool HasFastHalfMath(OpKernelContext* ctx) {
   // GPUs before sm_53 don't support fp16 math, and sm_61's fp16 math is slow.
   return cuda_arch >= 530 && cuda_arch != 610;
 }
-}  // namespace
 
 template <typename T, DepthwiseConv2dDirection kDirection,
           int kKnownFilterWidth, int kKnownFilterHeight, int kBlockDepth,
@@ -808,10 +809,6 @@ void LaunchDepthwiseConvOp<GpuDevice, T>::operator()(OpKernelContext* ctx,
   }
 }
 
-template struct LaunchDepthwiseConvOp<GpuDevice, Eigen::half>;
-template struct LaunchDepthwiseConvOp<GpuDevice, float>;
-template struct LaunchDepthwiseConvOp<GpuDevice, double>;
-
 // A Cuda kernel to compute the depthwise convolution backprop w.r.t. input.
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
           int kKnownDepthMultiplier>
@@ -1030,10 +1027,6 @@ void LaunchDepthwiseConvBackpropInputOp<GpuDevice, T>::operator()(
   }
 }
 
-template struct LaunchDepthwiseConvBackpropInputOp<GpuDevice, Eigen::half>;
-template struct LaunchDepthwiseConvBackpropInputOp<GpuDevice, float>;
-template struct LaunchDepthwiseConvBackpropInputOp<GpuDevice, double>;
-
 // A Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
 template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
           int kKnownDepthMultiplier>
@@ -1803,9 +1796,7 @@ void LaunchDepthwiseConvBackpropFilterOp<GpuDevice, T>::operator()(
                  ctx, args, out_backprop, input, filter_backprop, data_format));
   }
 }
-
-template struct LaunchDepthwiseConvBackpropFilterOp<GpuDevice, Eigen::half>;
-template struct LaunchDepthwiseConvBackpropFilterOp<GpuDevice, float>;
-template struct LaunchDepthwiseConvBackpropFilterOp<GpuDevice, double>;
 }  // namespace tensorflow
 #endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_GPU_H_
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu_double.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu_double.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..073e7cf269844a7b355019493dad3d9287c00bf5
--- /dev/null
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu_double.cu.cc
@@ -0,0 +1,30 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/depthwise_conv_op.h"
+#include "tensorflow/core/kernels/depthwise_conv_op_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+
+template struct LaunchDepthwiseConvOp<GpuDevice, double>;
+template struct LaunchDepthwiseConvBackpropInputOp<GpuDevice, double>;
+template struct LaunchDepthwiseConvBackpropFilterOp<GpuDevice, double>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu_float.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu_float.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4b0e15e4766713130e86224dc9f255fe8ecead81
--- /dev/null
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu_float.cu.cc
@@ -0,0 +1,30 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/depthwise_conv_op.h"
+#include "tensorflow/core/kernels/depthwise_conv_op_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+
+template struct LaunchDepthwiseConvOp<GpuDevice, float>;
+template struct LaunchDepthwiseConvBackpropInputOp<GpuDevice, float>;
+template struct LaunchDepthwiseConvBackpropFilterOp<GpuDevice, float>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu_half.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu_half.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2db9fa4dff5bf58cb52d44c3c044ba4fc34d6d9f
--- /dev/null
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu_half.cu.cc
@@ -0,0 +1,30 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/depthwise_conv_op.h"
+#include "tensorflow/core/kernels/depthwise_conv_op_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+
+template struct LaunchDepthwiseConvOp<GpuDevice, Eigen::half>;
+template struct LaunchDepthwiseConvBackpropInputOp<GpuDevice, Eigen::half>;
+template struct LaunchDepthwiseConvBackpropFilterOp<GpuDevice, Eigen::half>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/dynamic_partition_op.cc b/tensorflow/core/kernels/dynamic_partition_op.cc
index 572d04ae2c464d493508d494ba325a33eb92d4c1..95af19c4c4818abced194f7553e8bb79c777a998 100644
--- a/tensorflow/core/kernels/dynamic_partition_op.cc
+++ b/tensorflow/core/kernels/dynamic_partition_op.cc
@@ -16,11 +16,11 @@ limitations under the License.
 // See docs in ../ops/data_flow_ops.cc.
 
 #include <vector>
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/util/util.h"
 
diff --git a/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
index e7882acc80e3c2383f3a3c208175d16dd8c092ab..59f687bf9c0247be2528c79d0a1ef3dbb3fb7d35 100644
--- a/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc
@@ -40,11 +40,11 @@ limitations under the License.
 #include "third_party/cub/iterator/constant_input_iterator.cuh"
 #include "third_party/cub/thread/thread_operators.cuh"
 #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/fill_functor.h"
 #include "tensorflow/core/kernels/gather_functor_gpu.cu.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
diff --git a/tensorflow/core/kernels/dynamic_stitch_op.cc b/tensorflow/core/kernels/dynamic_stitch_op.cc
index f21f2acf2622a56cc3d6f58d259f79788a314dfb..5b8845b675d1264c07f0a6096460ea9edf62a3e4 100644
--- a/tensorflow/core/kernels/dynamic_stitch_op.cc
+++ b/tensorflow/core/kernels/dynamic_stitch_op.cc
@@ -15,10 +15,10 @@ limitations under the License.
 
 // See docs in ../ops/data_flow_ops.cc.
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 
 #ifdef GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/eigen_attention.h b/tensorflow/core/kernels/eigen_attention.h
index 4d86f9deb9902a64764e29ca0371bb68ad4f3370..c5158e65d8af4b9e721eb54ce5414023b06ef6a4 100644
--- a/tensorflow/core/kernels/eigen_attention.h
+++ b/tensorflow/core/kernels/eigen_attention.h
@@ -20,6 +20,13 @@ limitations under the License.
 
 namespace Eigen {
 
+// Noise mode used when padding.
+enum ExtractGlimpsesNoiseMode {
+  UNIFORM = 0,
+  GAUSSIAN = 1,
+  ZERO = 2,
+};
+
 /** ExtractGlimpses
  * \ingroup CXX11_NeuralNetworks_Module
  *
@@ -43,18 +50,19 @@ namespace Eigen {
  * for width and height which will be equal to the requested glimpse size.
  */
 namespace {
+
 template <typename Index>
 struct GlimpseExtractionOp {
   GlimpseExtractionOp(const Index width, const Index height,
                       const std::vector<IndexPair<float> >& offsets,
                       const bool normalized, const bool centered,
-                      const bool uniform_noise)
+                      const ExtractGlimpsesNoiseMode noise)
       : width_(width),
         height_(height),
         offsets_(offsets),
         normalized_(normalized),
         centered_(centered),
-        uniform_noise_(uniform_noise) {}
+        noise_(noise) {}
 
   template <typename Input>
   DSizes<Index, 4> dimensions(const Input& input) const {
@@ -144,64 +152,73 @@ struct GlimpseExtractionOp {
       slice_extent[2] = std::min<Index>(input_height, slice_extent[2]);
 
       if (partial_overlap) {
-        if (uniform_noise_) {
-          // Initialize the glimpse with uniform noise.
-          typedef typename internal::remove_const<
-              typename internal::traits<Input>::Scalar>::type Scalar;
-          TensorFixedSize<Scalar, Sizes<> > mini;
-          mini.device(device) = input.template chip<3>(i).minimum();
-          TensorFixedSize<float, Sizes<> > range;
-          range.device(device) = (input.template chip<3>(i).maximum() - mini)
-                                     .template cast<float>();
-
-          DSizes<Index, 3> glimpse_size(num_channels, width_, height_);
-          TensorMap<Tensor<float, 3> > tmp(NULL, glimpse_size);
-          output.template chip<3>(i).device(device) =
-              mini.reshape(Sizes<1, 1, 1>()).broadcast(glimpse_size) +
-              (tmp.random(unigen) *
-               range.reshape(Sizes<1, 1, 1>()).broadcast(glimpse_size))
-                  .template cast<Scalar>();
-        } else {
-          // Initialize the glimpse with white noise: compute the mean and sigma
-          // of each channel, and use them to shape the gaussian.
-          DSizes<Index, 2> glimpse_size(width_, height_);
-          DSizes<Index, 2> input_size(input_width, input_height);
-          typedef typename internal::remove_const<
-              typename internal::traits<Input>::Scalar>::type Scalar;
-
-          for (int j = 0; j < num_channels; ++j) {
-            TensorFixedSize<Scalar, Sizes<> > mean;
-            mean.device(device) = input.template chip<3>(i)
-                                      .template chip<0>(j)
-                                      .template cast<float>()
-                                      .mean();
-            TensorFixedSize<float, Sizes<> > sigma;
-            sigma.device(device) =
-                (input.template chip<3>(i)
-                     .template chip<0>(j)
-                     .template cast<float>() -
-                 mean.reshape(Sizes<1, 1>()).broadcast(input_size))
-                    .square()
-                    .mean()
-                    .sqrt();
+        switch (noise_) {
+          case ZERO: {
+            // Initialize the glimpse with zero noise.
+            output.template chip<3>(i).device(device) =
+                output.template chip<3>(i).constant(0);
+          } break;
+          case UNIFORM: {
+            // Initialize the glimpse with uniform noise.
+            typedef typename internal::remove_const<
+                typename internal::traits<Input>::Scalar>::type Scalar;
             TensorFixedSize<Scalar, Sizes<> > mini;
-            mini.device(device) =
-                input.template chip<3>(i).template chip<0>(j).minimum();
-            TensorFixedSize<float, Sizes<> > maxi;
-            maxi.device(device) =
-                input.template chip<3>(i).template chip<0>(j).maximum();
-
-            TensorMap<Tensor<float, 2> > tmp(NULL, glimpse_size);
-            output.template chip<3>(i).template chip<0>(j).device(device) =
-                (mean.reshape(Sizes<1, 1>()).broadcast(glimpse_size) +
-                 (tmp.random(gen) *
-                  sigma.reshape(Sizes<1, 1>()).broadcast(glimpse_size))
-                     .template cast<Scalar>())
-                    .cwiseMin(
-                        maxi.reshape(Sizes<1, 1>()).broadcast(glimpse_size))
-                    .cwiseMax(
-                        mini.reshape(Sizes<1, 1>()).broadcast(glimpse_size));
-          }
+            mini.device(device) = input.template chip<3>(i).minimum();
+            TensorFixedSize<float, Sizes<> > range;
+            range.device(device) = (input.template chip<3>(i).maximum() - mini)
+                                       .template cast<float>();
+
+            DSizes<Index, 3> glimpse_size(num_channels, width_, height_);
+            TensorMap<Tensor<float, 3> > tmp(NULL, glimpse_size);
+            output.template chip<3>(i).device(device) =
+                mini.reshape(Sizes<1, 1, 1>()).broadcast(glimpse_size) +
+                (tmp.random(unigen) *
+                 range.reshape(Sizes<1, 1, 1>()).broadcast(glimpse_size))
+                    .template cast<Scalar>();
+          } break;
+          case GAUSSIAN: {
+            // Initialize the glimpse with white noise: compute the mean and
+            // sigma
+            // of each channel, and use them to shape the gaussian.
+            DSizes<Index, 2> glimpse_size(width_, height_);
+            DSizes<Index, 2> input_size(input_width, input_height);
+            typedef typename internal::remove_const<
+                typename internal::traits<Input>::Scalar>::type Scalar;
+
+            for (int j = 0; j < num_channels; ++j) {
+              TensorFixedSize<Scalar, Sizes<> > mean;
+              mean.device(device) = input.template chip<3>(i)
+                                        .template chip<0>(j)
+                                        .template cast<float>()
+                                        .mean();
+              TensorFixedSize<float, Sizes<> > sigma;
+              sigma.device(device) =
+                  (input.template chip<3>(i)
+                       .template chip<0>(j)
+                       .template cast<float>() -
+                   mean.reshape(Sizes<1, 1>()).broadcast(input_size))
+                      .square()
+                      .mean()
+                      .sqrt();
+              TensorFixedSize<Scalar, Sizes<> > mini;
+              mini.device(device) =
+                  input.template chip<3>(i).template chip<0>(j).minimum();
+              TensorFixedSize<float, Sizes<> > maxi;
+              maxi.device(device) =
+                  input.template chip<3>(i).template chip<0>(j).maximum();
+
+              TensorMap<Tensor<float, 2> > tmp(NULL, glimpse_size);
+              output.template chip<3>(i).template chip<0>(j).device(device) =
+                  (mean.reshape(Sizes<1, 1>()).broadcast(glimpse_size) +
+                   (tmp.random(gen) *
+                    sigma.reshape(Sizes<1, 1>()).broadcast(glimpse_size))
+                       .template cast<Scalar>())
+                      .cwiseMin(
+                          maxi.reshape(Sizes<1, 1>()).broadcast(glimpse_size))
+                      .cwiseMax(
+                          mini.reshape(Sizes<1, 1>()).broadcast(glimpse_size));
+            }
+          } break;
         }
 
         // Copy the part of the glimpse that cover the input image if any.
@@ -225,7 +242,7 @@ struct GlimpseExtractionOp {
   const std::vector<IndexPair<float> > offsets_;
   const bool normalized_;
   const bool centered_;
-  const bool uniform_noise_;
+  const ExtractGlimpsesNoiseMode noise_;
 };
 }  // namespace
 
@@ -233,12 +250,12 @@ template <typename Input>
 EIGEN_ALWAYS_INLINE static const TensorCustomUnaryOp<
     const GlimpseExtractionOp<typename internal::traits<Input>::Index>,
     const Input>
-ExtractGlimpses(const Input& input,
-                const typename internal::traits<Input>::Index width,
-                const typename internal::traits<Input>::Index height,
-                const std::vector<IndexPair<float> >& offsets,
-                const bool normalized = true, const bool centered = true,
-                const bool uniform_noise = true) {
+ExtractGlimpses(
+    const Input& input, const typename internal::traits<Input>::Index width,
+    const typename internal::traits<Input>::Index height,
+    const std::vector<IndexPair<float> >& offsets, const bool normalized = true,
+    const bool centered = true,
+    const ExtractGlimpsesNoiseMode noise = ExtractGlimpsesNoiseMode::UNIFORM) {
   EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == ColMajor,
                       YOU_MADE_A_PROGRAMMING_MISTAKE);
   EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 4,
@@ -246,7 +263,7 @@ ExtractGlimpses(const Input& input,
 
   typedef typename internal::traits<Input>::Index Index;
   const GlimpseExtractionOp<Index> op(width, height, offsets, normalized,
-                                      centered, uniform_noise);
+                                      centered, noise);
   return input.customOp(op);
 }
 
diff --git a/tensorflow/core/kernels/eigen_contraction_kernel.cc b/tensorflow/core/kernels/eigen_contraction_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..da42001781757e200d90108182905cb6b65ec0e3
--- /dev/null
+++ b/tensorflow/core/kernels/eigen_contraction_kernel.cc
@@ -0,0 +1,55 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/eigen_contraction_kernel.h"
+
+#include <mutex>  // NOLINT(build/c++11)
+
+// We need a pair of compile time and runtime flags to disable compilation of
+// custom contraction kernels for unsupported architectures (e.g. Android,
+// iOS, ARM and PPC CPUs, etc...), and to be able to fallback on default Eigen
+// matrix multiplication at runtime.
+//
+// It's not allowed to use absl flags library in Tensorflow, so we have to pass
+// the configuration through the environment variable.
+//
+// Example:
+//   bazel test --test_env=TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL=false //test
+
+#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
+
+namespace Eigen {
+namespace internal {
+
+// TODO(ezhulenev): This is a temporary workaround for disabling custom kernels
+// at runtime in tests. We should always rely on compile time flags for that.
+// Example: ... --test_env=TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL=false //test
+bool UseCustomContractionKernels() {
+  static bool use_custom_contraction_kernel = true;
+
+  static std::once_flag initialized;
+  std::call_once(initialized, [&] {
+    char* flag = std::getenv("TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL");
+    if (flag && (strcmp(flag, "false") == 0 || strcmp(flag, "0") == 0)) {
+      use_custom_contraction_kernel = false;
+    }
+  });
+
+  return use_custom_contraction_kernel;
+}
+
+}  // namespace internal
+}  // namespace Eigen
+#endif
diff --git a/tensorflow/core/kernels/eigen_contraction_kernel.h b/tensorflow/core/kernels/eigen_contraction_kernel.h
index 66e93a83af2e5a7aa40818067638bfdde8dd42c9..4089eec59ee8ccf03679f77d02c1f57d60155a06 100644
--- a/tensorflow/core/kernels/eigen_contraction_kernel.h
+++ b/tensorflow/core/kernels/eigen_contraction_kernel.h
@@ -33,11 +33,20 @@ limitations under the License.
 //   #endif
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+#if defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)
 #include "mkldnn.h"
+#endif
 
 namespace Eigen {
 namespace internal {
 
+#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
+// Returns `true` iff we can use custom contraction kernels. This is a runtime
+// check, that uses environment variables.
+bool UseCustomContractionKernels();
+#endif  // TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL
+
 // Enabled by build option: "--define tensorflow_mkldnn_contraction_kernel=1"
 #if defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)
 
@@ -170,6 +179,13 @@ class TensorContractionBlocking<float, float, float, StorageIndex,
                                                      num_threads);
     }
 
+    // If dimensions do not pass basic sanity checks return immediately.
+    if (kc_ <= 0 || mc_ <= 0 || nc_ <= 0) return;
+
+    // If we are using default Eigen gebp kernel there is no need to adjust the
+    // block sizes for MKL-DNN.
+    if (!UseCustomContractionKernels()) return;
+
     // 2. And refine them to work well with mkldnn sgemm.
     mc_ = (std::min)(
         m, Eigen::divup(static_cast<StorageIndex>(mc_ * kScaleM), kUnrollM) *
@@ -181,7 +197,8 @@ class TensorContractionBlocking<float, float, float, StorageIndex,
     // We split Kth dimensions in roughly equal slices.
     StorageIndex target_k_slices =
         (std::max)(StorageIndex(1), Eigen::divup(k, kc_));
-    StorageIndex packet_size = 8;
+    StorageIndex packet_size = internal::packet_traits<Scalar>::size;
+    if (packet_size < 8) packet_size = 8;
     StorageIndex target_bk =
         Eigen::divup(k / target_k_slices, packet_size) * packet_size;
     kc_ = (std::min)(k, target_bk);
@@ -211,23 +228,52 @@ struct TensorContractionKernel<float, float, float, StorageIndex, OutputMapper,
                                      typename RhsMapper::SubMapper, ColMajor>;
   using GemmKernel = mkldnn_gemm_kernel<Scalar, StorageIndex, OutputMapper>;
 
+  // Fallback on default Eigen pack and GEBP kernel if custom contraction
+  // kernels disabled at runtime.
+  using EigenLhsPacker =
+      gemm_pack_lhs<Scalar, StorageIndex, typename LhsMapper::SubMapper,
+                    Traits::mr, Traits::LhsProgress,
+                    typename Traits::LhsPacket4Packing, ColMajor>;
+  using EigenRhsPacker =
+      gemm_pack_rhs<Scalar, StorageIndex, typename RhsMapper::SubMapper,
+                    Traits::nr, ColMajor>;
+  using GebpKernel =
+      gebp_kernel<Scalar, Scalar, StorageIndex, OutputMapper, Traits::mr,
+                  Traits::nr,
+                  /*ConjugateLhs*/ false, /*ConjugateRhs*/ false>;
+
   EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void packLhs(
       Scalar* lhsBlock, const typename LhsMapper::SubMapper& data_mapper,
       const StorageIndex depth, const StorageIndex rows) {
-    LhsPacker()(lhsBlock, data_mapper, rows, depth);
+    if (UseCustomContractionKernels()) {
+      LhsPacker()(lhsBlock, data_mapper, rows, depth);
+    } else {
+      EigenLhsPacker()(lhsBlock, data_mapper, depth, rows, /*stride*/ 0,
+                       /*offset*/ 0);
+    }
   }
 
   EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void packRhs(
       Scalar* rhsBlock, const typename RhsMapper::SubMapper& data_mapper,
       const StorageIndex depth, const StorageIndex cols) {
-    RhsPacker()(rhsBlock, data_mapper, depth, cols);
+    if (UseCustomContractionKernels()) {
+      RhsPacker()(rhsBlock, data_mapper, depth, cols);
+    } else {
+      EigenRhsPacker()(rhsBlock, data_mapper, depth, cols);
+    }
   }
 
   EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void invoke(
       const OutputMapper& output_mapper, const Scalar* lhsBlock,
       const Scalar* rhsBlock, const StorageIndex rows, const StorageIndex depth,
       const StorageIndex cols, const Scalar alpha) {
-    GemmKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha);
+    if (UseCustomContractionKernels()) {
+      GemmKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha);
+    } else {
+      GebpKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha,
+                   /*strideA*/ -1, /*strideB*/ -1,
+                   /*offsetA*/ 0, /*offsetB*/ 0);
+    }
   }
 };
 
diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions.h b/tensorflow/core/kernels/eigen_spatial_convolutions.h
index 25c735d080e1cef54b7c8cd87d25eb31612192b3..8b198139400a6d2ce2795f9ef0b5793114a78e0b 100644
--- a/tensorflow/core/kernels/eigen_spatial_convolutions.h
+++ b/tensorflow/core/kernels/eigen_spatial_convolutions.h
@@ -871,11 +871,9 @@ struct gemm_pack_rhs<
             const bool pad_col2 = dm2.padCol(c);
             const bool pad_col3 = dm3.padCol(c);
 
-            // We can squeeze reads along the `row` and `depth` dimensions if
-            // the row stride is `1`, which means that `row` and `depth`
-            // dimensions are contiguous (two innermost dimensions).
-            if (rhs.rowStride() == 1 &&                                //
-                !pad_col0 && !pad_col1 && !pad_col2 && !pad_col3 &&    //
+            // Check if we can squeeze reads along the `row` and `depth`
+            // dimensions (two innermost dimensions).
+            if (!pad_col0 && !pad_col1 && !pad_col2 && !pad_col3 &&    //
                 !dm0.padRow(start_row) && !dm0.padRow(max_row - 1) &&  //
                 !dm1.padRow(start_row) && !dm1.padRow(max_row - 1) &&  //
                 !dm2.padRow(start_row) && !dm2.padRow(max_row - 1) &&  //
@@ -1685,8 +1683,6 @@ EIGEN_DEVICE_FUNC
     kernel_dims[0] = kernelChannels * kernelRows * kernelCols;
     kernel_dims[1] = kernelFilters;
   }
-  // TODO(yangke): choose() is defined in TensorContraction.h -- consider
-  // moving it to somewhere more "common".
   return choose(
       Cond<internal::traits<Input>::Layout == ColMajor>(),
       kernel.reshape(kernel_dims)
diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc b/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc
index 22f71d62602cc984c0337f728298f7483c35bed9..920e648972bef4b37e15eb2c6dcee313b7cd26da 100644
--- a/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc
+++ b/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc
@@ -14,6 +14,8 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/kernels/eigen_spatial_convolutions.h"
+
+#include "absl/strings/str_cat.h"
 #include "tensorflow/core/kernels/eigen_cuboid_convolution.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
@@ -1540,22 +1542,188 @@ static void PackRhsHelper(int iters,
     pack_rhs(packed.data() + packed_offset, sub_mapper, depth, cols);
   }
   tensorflow::testing::StopTiming();
+  tensorflow::testing::SetLabel(
+      absl::StrCat("patch: ", patch_rows, "x", patch_cols, " D", patch_depth,
+                   "; num_patches=", num_patches, " patch_size=", patch_size,
+                   " num_inputs=", num_inputs));
+}
+
+static void PackLhsHelper(int iters,
+                          /* Input dimensions: */
+                          int input_depth,
+                          /* Filter (kernel) dimensions: */
+                          int filter_count, int filter_cols, int filter_rows,
+                          /* Block dimensions: */
+                          Index block_rows, Index block_cols) {
+  // Set random seed for benchmark repeatability.
+  srand(12345);
+
+  eigen_assert(block_rows <= filter_count);
+  eigen_assert(block_cols <= input_depth * filter_rows * filter_cols);
+
+  tensorflow::testing::UseRealTime();
+  tensorflow::testing::StopTiming();
+
+  using Dimensions = Eigen::DSizes<Eigen::Index, 4>;
+
+  // Default Eigen::Tensor layout is column major, so we configure dimensions
+  // starting from the inner most (`filter count` aka `kernel filers`).
+  Dimensions filter_dims(filter_count, filter_rows, filter_cols, input_depth);
+
+  static const int packet_size = Eigen::internal::packet_traits<float>::size;
+
+  // We are going to reshape filter into 2D tensor.
+  using NewDimension = Eigen::DSizes<Index, 2>;
+
+  // Contraction dimensions.
+  using nocontract_t = Eigen::array<Eigen::Index, 1>;
+  using contract_t = Eigen::array<Eigen::Index, 1>;
+
+  // Input to the ReshapeOp. It is the tensorflow TTypes<float>::Tensor
+  // with ColMajor layout, instead of RowMajor. But that doesn't make any
+  // difference, because TensorContraction swaps LHS with RHS for row major
+  // inputs, and contraction mapper always works with column major data.
+  using ArgType = TensorMap<Tensor<float, 4>, Eigen::Aligned>;
+
+  using Evaluator =
+      TensorEvaluator<const TensorReshapingOp<NewDimension, ArgType>,
+                      Eigen::DefaultDevice>;
+
+  using InputMapper = Eigen::internal::TensorContractionInputMapper<
+      float, Index, Eigen::internal::Lhs, Evaluator,  //
+      nocontract_t, contract_t,                       //
+      packet_size,                                    //
+      /*inner_dim_contiguous*/ true,                  //
+      /*inner_dim_reordered*/ false,                  //
+      /*Alignment*/ 0>;
+
+  using SubMapper = Eigen::internal::TensorContractionSubMapper<
+      float, Index, Eigen::internal::Lhs, Evaluator,  //
+      nocontract_t, contract_t,                       //
+      packet_size,                                    //
+      /*inner_dim_contiguous*/ true,                  //
+      /*inner_dim_reordered*/ false,                  //
+      /*Alignment*/ 0>;
+
+#if defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)
+  using PackLhsImpl = Eigen::internal::mkldnn_gemm_pack<float, Eigen::Index,
+                                                        SubMapper, ColMajor>;
+#else
+  using Traits = typename Eigen::internal::gebp_traits<float, float>;
+  using PackLhsImpl =
+      Eigen::internal::gemm_pack_lhs<float, Eigen::Index, SubMapper,      //
+                                     Traits::mr,                          //
+                                     Traits::LhsProgress,                 //
+                                     typename Traits::LhsPacket4Packing,  //
+                                     ColMajor>;
+#endif
+
+  Eigen::DefaultDevice device;
+
+  // We will reshape kernel into 2D tensor.
+  NewDimension reshape_dims;
+  reshape_dims[0] = filter_count;
+  reshape_dims[1] = input_depth * filter_rows * filter_cols;
+
+  // We are going to contract along the 'in_depth * filter_rows * filter_cols`.
+  nocontract_t nocontract_dim = {0};
+  contract_t contract_dim = {1};
+
+  // These values computed using the algorithm in TensorContraction.h, with
+  // 'nocontract_dim' and 'contract_dim' values specified above.
+  nocontract_t nocontract_strides = {1};
+  contract_t contract_strides = {filter_count};
+  nocontract_t i_strides = {1};
+  contract_t k_strides = {1};
+
+  // We use tensor of the same dimensions to store packed data.
+  Tensor<float, 4> packed(filter_dims);
+
+  // We generate multiple filter tensors, around 512mb in total size to measure
+  // realistic workload when input data in not in L1-L3 cache.
+  size_t input_bytes = filter_dims.TotalSize() * sizeof(float);
+  size_t mem_size_bytes = 1024 * 1024 * 512;
+  size_t num_filters =
+      std::max(static_cast<size_t>(1), mem_size_bytes / input_bytes);
+
+  std::vector<Tensor<float, 4>> filters;
+  std::vector<Evaluator> evaluators;
+  std::vector<InputMapper> input_mappers;
+
+  for (int i = 0; i < num_filters; ++i) {
+    filters.emplace_back(filter_dims);
+    filters[i].setRandom();
+
+    ArgType tensor_map(filters[i].data(), filter_dims);
+
+    const auto reshape_op =
+        TensorReshapingOp<NewDimension, ArgType>(tensor_map, reshape_dims);
+
+    evaluators.emplace_back(reshape_op, device);
+
+    input_mappers.emplace_back(evaluators[i], nocontract_strides, i_strides,
+                               contract_strides, k_strides);
+  }
+
+  PackLhsImpl pack_lhs;
+
+  const Index packed_total_size = filter_dims.TotalSize();
+
+  // Round up row/col/memory offsets to make them multiple of packet size.
+  const auto round_up = [](const Index idx) {
+    return (idx / packet_size) * packet_size;
+  };
+
+  // Block rows is in the [0, filter_count) range.
+  // Block cols is in the [0, filter_rows * filter_cols * input_depth) range.
+
+  const Index max_row = filter_count;
+  const Index max_col = filter_rows * filter_cols * input_depth;
+
+  tensorflow::testing::StartTiming();
+  for (int i = 0; i < iters; ++i) {
+    int filter_idx =
+        num_filters == 1 ? 1 : internal::random<int>(0, num_filters - 1);
+
+    Index row_offset = round_up(internal::random<Index>(0, max_row - 10));
+    Index col_offset = round_up(internal::random<Index>(0, max_col - 10));
+
+    Index rows = std::min(block_rows, max_row - row_offset);
+    Index cols = std::min(block_cols, max_col - col_offset);
+
+    // Write packed data to random memory location to emulate cold caches.
+    Index packed_offset = round_up(
+        internal::random<Index>(0, packed_total_size - rows * cols - 1));
+
+    SubMapper sub_mapper =
+        input_mappers[filter_idx].getSubMapper(row_offset, col_offset);
 
-  std::ostringstream stringStream;
-  stringStream << "patch: " << patch_rows << "x" << patch_cols << " D"
-               << patch_depth << "; num_patches=" << num_patches
-               << " patch_size=" << patch_size << " num_inputs=" << num_inputs;
-  tensorflow::testing::SetLabel(stringStream.str());
+// NOTE: Eigen gemm_pack_lhs accepts contraction depth (k-th dimension) as a
+// first argument (aka block cols). MKL-DNN pack is generic for lhs and rhs
+// and accepts block rows and cols in the same order for lhs and rhs.
+#if defined(TENSORFLOW_USE_MKLDNN_CONTRACTION_KERNEL)
+    pack_lhs(packed.data() + packed_offset, sub_mapper, rows, cols);
+#else
+    pack_lhs(packed.data() + packed_offset, sub_mapper, cols, rows);
+#endif
+  }
+  tensorflow::testing::StopTiming();
+  tensorflow::testing::SetLabel(absl::StrCat(
+      "filter: count=", filter_count, " dims=", filter_rows, "x", filter_cols,
+      "; input: depth=", input_depth, "; num_filers=", num_filters));
 }
 
 // -------------------------------------------------------------------------- //
-// Macro argumentnames:
+// Pack RHS
+//
+// Macro argument names:
 //    N: batch size
 //    H: height
 //    W: width
 //    C: input channels
 //   FC: filter channles
 //   FH: filter height
+//   FW: filter width
 //   SH: stride in height dimensions
 //   SW: stride in width dimensions
 //   BR: block rows
@@ -1563,16 +1731,16 @@ static void PackRhsHelper(int iters,
 
 #define BM_CONCAT(a, b) a##b
 
-#define BM_NAME(prefix, N, H, W, C, FC, FH, FW, SH, SW, BR, BC)           \
+#define BM_RHS_NAME(prefix, N, H, W, C, FC, FH, FW, SH, SW, BR, BC)       \
   BM_CONCAT(BM_##prefix##_##N##_##H##x##W##_IC##C##_FC##FC##_##FH##x##FW, \
             _s##SH##x##SW##_B##BR##x##BC)
 
-#define BM_PackRhs(N, H, W, C, FC, FH, FW, SH, SW, BR, BC)         \
-  static void BM_NAME(PackRhs, N, H, W, C, FC, FH, FW, SH, SW, BR, \
-                      BC)(int iters) {                             \
-    PackRhsHelper(iters, N, H, W, C, FC, FH, FW, SH, SW, BR, BC);  \
-  }                                                                \
-  BENCHMARK(BM_NAME(PackRhs, N, H, W, C, FC, FH, FW, SH, SW, BR, BC))
+#define BM_PackRhs(N, H, W, C, FC, FH, FW, SH, SW, BR, BC)             \
+  static void BM_RHS_NAME(PackRhs, N, H, W, C, FC, FH, FW, SH, SW, BR, \
+                          BC)(int iters) {                             \
+    PackRhsHelper(iters, N, H, W, C, FC, FH, FW, SH, SW, BR, BC);      \
+  }                                                                    \
+  BENCHMARK(BM_RHS_NAME(PackRhs, N, H, W, C, FC, FH, FW, SH, SW, BR, BC))
 
 // Number of input channel (input depth) it equal to the number of patch
 // channels (patch depth).
@@ -1645,4 +1813,37 @@ BM_PackRhs(/*batch*/ 32,        //
            /*filter*/ 3, 3,     //
            /*stride*/ 2, 2,     //
            /*block*/ 36, 432);
+
+// -------------------------------------------------------------------------- //
+// Pack LHS
+//
+// Macro argument names:
+//    C: input channels
+//   FC: filter channels
+//   FH: filter height
+//   FW: filter width
+//   BR: block rows
+//   BC: block cols
+
+#define BM_LHS_NAME(prefix, C, FC, FH, FW, BR, BC) \
+  BM_CONCAT(BM_##prefix##_##C##_FC##FC##_##FH##x##FW, _B##BR##x##BC)
+
+#define BM_PackLhs(C, FC, FH, FW, BR, BC)                              \
+  static void BM_LHS_NAME(PackLhs, C, FC, FH, FW, BR, BC)(int iters) { \
+    PackLhsHelper(iters, C, FC, FH, FW, BR, BC);                       \
+  }                                                                    \
+  BENCHMARK(BM_LHS_NAME(PackLhs, C, FC, FH, FW, BR, BC))
+
+// Number of input channel (input depth) it equal to the number of patch
+// channels (patch depth).
+
+BM_PackLhs(/*input channels*/ 128,    //
+           /*filter channels*/ 1024,  //
+           /*filter dims*/ 3, 3,      //
+           /*block*/ 256, 56);
+
+BM_PackLhs(/*input channels*/ 128,    //
+           /*filter channels*/ 1024,  //
+           /*filter dims*/ 3, 3,      //
+           /*block*/ 56, 256);
 }  // namespace Eigen
diff --git a/tensorflow/core/kernels/encode_jpeg_op.cc b/tensorflow/core/kernels/encode_jpeg_op.cc
index 1a5b0f2b675a85ba2c1dbf0356c3e42b03db22b4..e80404a437523862bfe6b8c2961b11cc00bd4426 100644
--- a/tensorflow/core/kernels/encode_jpeg_op.cc
+++ b/tensorflow/core/kernels/encode_jpeg_op.cc
@@ -16,12 +16,12 @@ limitations under the License.
 // See docs in ../ops/image_ops.cc
 
 #include <memory>
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/jpeg/jpeg_mem.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/core/kernels/encode_png_op.cc b/tensorflow/core/kernels/encode_png_op.cc
index 8fcda25e692f9aa550ddbb17a4f5cef8ba570b83..cb9a1660a7d059bebaaadea8cc309f74ab974948 100644
--- a/tensorflow/core/kernels/encode_png_op.cc
+++ b/tensorflow/core/kernels/encode_png_op.cc
@@ -16,12 +16,12 @@ limitations under the License.
 // See docs in ../ops/image_ops.cc
 
 #include <memory>
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/png/png_io.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/core/kernels/encode_wav_op.cc b/tensorflow/core/kernels/encode_wav_op.cc
index aed095076b92cdef60e217c610fa4c11eb4717ec..082f9a74ae1e36f22ed206c3049dbfd40ac55a48 100644
--- a/tensorflow/core/kernels/encode_wav_op.cc
+++ b/tensorflow/core/kernels/encode_wav_op.cc
@@ -15,12 +15,12 @@ limitations under the License.
 
 // See docs in ../ops/audio_ops.cc
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/wav/wav_io.h"
 
diff --git a/tensorflow/core/kernels/extract_image_patches_op.cc b/tensorflow/core/kernels/extract_image_patches_op.cc
index 68631d14dbc4af5553e02a7e3d622c3772a95eb5..9306eccf9f018f66cc22a7d88050a20814e46f15 100644
--- a/tensorflow/core/kernels/extract_image_patches_op.cc
+++ b/tensorflow/core/kernels/extract_image_patches_op.cc
@@ -20,11 +20,11 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/extract_image_patches_op.h"
 #include <vector>
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/core/kernels/extract_jpeg_shape_op.cc b/tensorflow/core/kernels/extract_jpeg_shape_op.cc
index 60d798af56737c6abb322a971b31ae596ea96ec6..ab424595c1a6e5c26f26aae9dc3768cf2bf15c9b 100644
--- a/tensorflow/core/kernels/extract_jpeg_shape_op.cc
+++ b/tensorflow/core/kernels/extract_jpeg_shape_op.cc
@@ -16,12 +16,12 @@ limitations under the License.
 // See docs in ../ops/image_ops.cc
 
 #include <memory>
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/jpeg/jpeg_mem.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/core/kernels/extract_volume_patches_op.cc b/tensorflow/core/kernels/extract_volume_patches_op.cc
index 52cd078a3512bcfae13539f1e95ef66c4adf8a03..8107bca7d18633f45e747b5175eca1e11f2cc6fe 100644
--- a/tensorflow/core/kernels/extract_volume_patches_op.cc
+++ b/tensorflow/core/kernels/extract_volume_patches_op.cc
@@ -26,11 +26,11 @@ when rates are to be added.
 
 #include "tensorflow/core/kernels/extract_volume_patches_op.h"
 #include <vector>
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/core/kernels/fifo_queue.h b/tensorflow/core/kernels/fifo_queue.h
index 697ee81c39b194e29c03f3583f0aa727778ef316..4d3a7c197125613c662c97044d6964695ab92b0e 100644
--- a/tensorflow/core/kernels/fifo_queue.h
+++ b/tensorflow/core/kernels/fifo_queue.h
@@ -49,7 +49,7 @@ class FIFOQueue : public TypedQueue<std::deque<PersistentTensor> > {
                       CallbackWithTuple callback) override;
   Status MatchesNodeDef(const NodeDef& node_def) override;
 
-  int32 size() override {
+  int32 size() const override {
     mutex_lock lock(mu_);
     return queues_[0].size();
   }
diff --git a/tensorflow/core/kernels/fill_functor.cc b/tensorflow/core/kernels/fill_functor.cc
index 7090417dfdb2d7e433025b1a0f1cdeb5eece10a8..9c4c0487f09dff86efa833475ea685c30b1ac915 100644
--- a/tensorflow/core/kernels/fill_functor.cc
+++ b/tensorflow/core/kernels/fill_functor.cc
@@ -51,6 +51,11 @@ DEFINE_SETZERO_CPU(uint16);
 DEFINE_SETZERO_CPU(int16);
 DEFINE_SETZERO_CPU(int32);
 DEFINE_SETZERO_CPU(int64);
+DEFINE_SETZERO_CPU(quint8);
+DEFINE_SETZERO_CPU(qint8);
+DEFINE_SETZERO_CPU(quint16);
+DEFINE_SETZERO_CPU(qint16);
+DEFINE_SETZERO_CPU(qint32);
 DEFINE_SETZERO_CPU(complex64);
 DEFINE_SETZERO_CPU(complex128);
 DEFINE_SETZERO_CPU(Variant);
diff --git a/tensorflow/core/kernels/fill_functor.cu.cc b/tensorflow/core/kernels/fill_functor.cu.cc
index 050c95cf40d4b29bde66b6b6e72b1b48a7199965..d4c92586897da1ead541a98f5d721a9c18d235b9 100644
--- a/tensorflow/core/kernels/fill_functor.cu.cc
+++ b/tensorflow/core/kernels/fill_functor.cu.cc
@@ -88,9 +88,16 @@ struct SetZeroFunctor<GPUDevice, T> {
   }
 };
 
+template <>
+void SetZeroFunctor<GPUDevice, Variant>::operator()(
+    const GPUDevice& d, typename TTypes<Variant>::Flat out) {
+  // TODO(b/123028789): Implement this.
+}
+
 #define DEFINE_SETZERO_GPU(T) template struct SetZeroFunctor<GPUDevice, T>;
 TF_CALL_NUMBER_TYPES(DEFINE_SETZERO_GPU);
 TF_CALL_bool(DEFINE_SETZERO_GPU);
+TF_CALL_variant(DEFINE_SETZERO_GPU);
 #undef DEFINE_SETZERO_GPU
 
 // Partial specialization of FillFunctor<Device=GPUDevice, T>.
diff --git a/tensorflow/core/kernels/function_ops.cc b/tensorflow/core/kernels/function_ops.cc
index 90f94ee4a06519eca064abf9b1e0d60f1f181188..88a8a523e4780045c81f495959b157e44fe709dc 100644
--- a/tensorflow/core/kernels/function_ops.cc
+++ b/tensorflow/core/kernels/function_ops.cc
@@ -130,6 +130,7 @@ REGISTER_KERNEL_BUILDER(
       Name(kRetOp).Device(DEVICE_GPU).TypeConstraint<type>("T"), RetvalOp);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER)
 TF_CALL_QUANTIZED_TYPES(REGISTER)
+REGISTER(Variant)
 TF_CALL_bool(REGISTER) REGISTER_KERNEL_BUILDER(Name(kRetOp)
                                                    .Device(DEVICE_GPU)
                                                    .HostMemory("input")
@@ -137,6 +138,7 @@ TF_CALL_bool(REGISTER) REGISTER_KERNEL_BUILDER(Name(kRetOp)
                                                RetvalOp);
 REGISTER_KERNEL_BUILDER(
     Name(kDeviceRetOp).Device(DEVICE_GPU).TypeConstraint<int32>("T"), RetvalOp);
+
 REGISTER_KERNEL_BUILDER(Name(kRetOp)
                             .Device(DEVICE_GPU)
                             .TypeConstraint<ResourceHandle>("T")
diff --git a/tensorflow/core/kernels/functional_ops.cc b/tensorflow/core/kernels/functional_ops.cc
index 5ecb203cbc7296d75f6a0a68a2189d7bf018c7fe..246a6ce04d97a5dec54f2d0b44da7e278d703908 100644
--- a/tensorflow/core/kernels/functional_ops.cc
+++ b/tensorflow/core/kernels/functional_ops.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 
 namespace tensorflow {
@@ -120,6 +121,7 @@ void SetRunOptions(OpKernelContext* ctx, FunctionLibraryRuntime::Options* opts,
     opts->stats_collector = ctx->stats_collector();
   }
   opts->runner = ctx->runner();
+  opts->step_container = ctx->step_container();
 }
 
 class IfOp : public AsyncOpKernel {
@@ -210,6 +212,98 @@ class IfOp : public AsyncOpKernel {
   };
 };
 
+class CaseOp : public AsyncOpKernel {
+ public:
+  explicit CaseOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
+    auto lib = ctx->function_library();
+    OP_REQUIRES(ctx, lib != nullptr, errors::Internal("No function library"));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("branches", &branch_funcs_));
+  }
+
+  ~CaseOp() override {}
+
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
+    auto lib = ctx->function_library();
+    OP_REQUIRES_ASYNC(ctx, lib != nullptr,
+                      errors::Internal("No function library"), done);
+
+    // TODO(b/37549631): Because this op has `SetIsStateful()` in its op
+    // registration, this kernel may be shared by multiple subgraphs, which have
+    // different associated `FunctionLibraryRuntime` objects and hence different
+    // `FHandle` namespaces. So we must call Instantiate() to make sure we get
+    // the correct function handles with respect to `lib`. Note the underlying
+    // `lib->Instantiate()` caches the created function handles, so calling
+    // `Instantiate()` repeatedly on the same `lib` and function is cheap.
+    std::vector<FHandle> branch_handles(branch_funcs_.size());
+    for (int i = 0; i < branch_funcs_.size(); i++) {
+      OP_REQUIRES_OK_ASYNC(
+          ctx, Instantiate(lib, branch_funcs_[i], &branch_handles[i]), done);
+    }
+
+    const Tensor& branch_index = ctx->input(0);
+    OP_REQUIRES_ASYNC(ctx, TensorShapeUtils::IsScalar(branch_index.shape()),
+                      errors::InvalidArgument("branch_index must be scalar"),
+                      done);
+    int32 branch = branch_index.scalar<int32>()();
+    (new State(this, ctx, branch, branch_handles, done))->Start();
+  }
+
+ private:
+  std::vector<NameAttrList> branch_funcs_;
+
+  class State {
+   public:
+    State(CaseOp* kernel, OpKernelContext* ctx, int branch,
+          std::vector<FHandle> branch_handles, DoneCallback done)
+        : kernel_(kernel),
+          ctx_(ctx),
+          branch_(branch),
+          branch_handles_(branch_handles),
+          done_(std::move(done)),
+          lib_(CHECK_NOTNULL(ctx_->function_library())) {
+      SetRunOptions(ctx_, &opts_, true /* always_collect_stats */);
+      for (int i = 1; i < ctx_->num_inputs(); ++i) {
+        args_.push_back(ctx_->input(i));
+      }
+    }
+
+    ~State() {}
+
+    void Start() {
+      int branch = branch_;
+      // The last branch is the default branch.
+      if (branch < 0 || branch >= branch_handles_.size()) {
+        branch = branch_handles_.size() - 1;
+      }
+      rets_.clear();
+      lib_->Run(
+          // Evaluate one of the branch.
+          opts_, branch_handles_[branch], args_, &rets_,
+          // Done callback
+          [this](Status s) {
+            if (s.ok()) {
+              s = SetOutputs(kernel_, ctx_, rets_);
+            }
+            ctx_->SetStatus(s);
+            DoneCallback captured_done(std::move(done_));
+            delete this;
+            captured_done();
+          });
+    }
+
+   private:
+    CaseOp* const kernel_;
+    OpKernelContext* const ctx_;
+    const int branch_;
+    std::vector<FHandle> branch_handles_;
+    DoneCallback done_;
+    FunctionLibraryRuntime* const lib_;
+    FunctionLibraryRuntime::Options opts_;
+    TensorVec args_;
+    TensorVec rets_;
+  };
+};
+
 // TODO(drpng): remove this.
 REGISTER_KERNEL_BUILDER(Name("_If").Device(DEVICE_CPU), IfOp);
 REGISTER_KERNEL_BUILDER(Name("_If").Device(DEVICE_GPU).HostMemory("cond"),
@@ -218,6 +312,10 @@ REGISTER_KERNEL_BUILDER(Name("_If").Device(DEVICE_GPU).HostMemory("cond"),
 REGISTER_KERNEL_BUILDER(Name("If").Device(DEVICE_CPU), IfOp);
 REGISTER_KERNEL_BUILDER(Name("If").Device(DEVICE_GPU).HostMemory("cond"), IfOp);
 
+REGISTER_KERNEL_BUILDER(Name("Case").Device(DEVICE_CPU), CaseOp);
+REGISTER_KERNEL_BUILDER(
+    Name("Case").Device(DEVICE_GPU).HostMemory("branch_index"), CaseOp);
+
 REGISTER_KERNEL_BUILDER(Name("StatelessIf").Device(DEVICE_CPU), IfOp);
 REGISTER_KERNEL_BUILDER(
     Name("StatelessIf").Device(DEVICE_GPU).HostMemory("cond"), IfOp);
diff --git a/tensorflow/core/kernels/fused_batch_norm_op.cc b/tensorflow/core/kernels/fused_batch_norm_op.cc
index dbd3bb05dbf1a310ea9c5a5b1003474e33825133..48b339508b50c835a7aa86306bf3dca758a819f1 100644
--- a/tensorflow/core/kernels/fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/fused_batch_norm_op.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #if GOOGLE_CUDA
 #define EIGEN_USE_GPU
 #include "tensorflow/core/kernels/conv_2d.h"
-#include "tensorflow/core/kernels/conv_ops_gpu.h"
 #include "tensorflow/core/util/stream_executor_util.h"
 #endif
 
diff --git a/tensorflow/core/kernels/fuzzing/BUILD b/tensorflow/core/kernels/fuzzing/BUILD
index 2d8b734535c964bf4162838baa8ad65af4790423..3c3e9bfa2e0a6f3f94c9c679994021929f9df489 100644
--- a/tensorflow/core/kernels/fuzzing/BUILD
+++ b/tensorflow/core/kernels/fuzzing/BUILD
@@ -8,11 +8,8 @@ cc_library(
     name = "fuzz_session",
     hdrs = ["fuzz_session.h"],
     deps = [
-        "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:scope",
         "//tensorflow/core:core_cpu",
-        "//tensorflow/core:direct_session",
-        "//tensorflow/core:ops",
         "//tensorflow/core:tensorflow",
     ],
 )
@@ -68,3 +65,11 @@ tf_ops_fuzz_target_lib("decode_json_example")
 tf_oss_fuzz_corpus("decode_json_example")
 
 tf_oss_fuzz_dict("decode_json_example")
+
+tf_ops_fuzz_target_lib("check_numerics")
+
+tf_ops_fuzz_target_lib("one_hot")
+
+tf_ops_fuzz_target_lib("scatter_nd")
+
+tf_oss_fuzz_corpus("scatter_nd")
diff --git a/tensorflow/core/kernels/fuzzing/check_numerics_fuzz.cc b/tensorflow/core/kernels/fuzzing/check_numerics_fuzz.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2258a094d973e8e10f9ce6d1868d6b9913c41a17
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/check_numerics_fuzz.cc
@@ -0,0 +1,50 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/cc/ops/array_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
+
+namespace tensorflow {
+namespace fuzzing {
+
+class FuzzCheckNumerics : public FuzzSession {
+  void BuildGraph(const Scope& scope) override {
+    auto input =
+        tensorflow::ops::Placeholder(scope.WithOpName("input"), DT_FLOAT);
+    auto prefix = "Error: ";
+    (void)tensorflow::ops::CheckNumerics(scope.WithOpName("output"), input,
+                                         prefix);
+  }
+
+  void FuzzImpl(const uint8_t* data, size_t size) override {
+    size_t ratio = sizeof(float) / sizeof(uint8_t);
+    size_t num_floats = size / ratio;
+    const float* float_data = reinterpret_cast<const float*>(data);
+
+    Tensor input_tensor(tensorflow::DT_FLOAT,
+                        TensorShape({static_cast<int64>(num_floats)}));
+    auto flat_tensor = input_tensor.flat<float>();
+    for (size_t i = 0; i < num_floats; i++) {
+      flat_tensor(i) = float_data[i];
+    }
+    RunInputs({{"input", input_tensor}});
+  }
+};
+
+STANDARD_TF_FUZZ_FUNCTION(FuzzCheckNumerics);
+
+}  // end namespace fuzzing
+}  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/fuzzing/corpus/scatter_nd/5b0e5f8d2990c3cac80fa792ba141c43 b/tensorflow/core/kernels/fuzzing/corpus/scatter_nd/5b0e5f8d2990c3cac80fa792ba141c43
new file mode 100644
index 0000000000000000000000000000000000000000..d1239633c843b1b8fd64d232604a3d61e9eb07dc
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/scatter_nd/5b0e5f8d2990c3cac80fa792ba141c43 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/scatter_nd/5b61fa3a30dd267828f12d9ea2b2a191 b/tensorflow/core/kernels/fuzzing/corpus/scatter_nd/5b61fa3a30dd267828f12d9ea2b2a191
new file mode 100644
index 0000000000000000000000000000000000000000..1bd0905cdd6efab2b8450e6cb03f1d15ffae9993
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/scatter_nd/5b61fa3a30dd267828f12d9ea2b2a191 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/scatter_nd/8bc8b7d8beb3483c48158739791e56b0 b/tensorflow/core/kernels/fuzzing/corpus/scatter_nd/8bc8b7d8beb3483c48158739791e56b0
new file mode 100644
index 0000000000000000000000000000000000000000..65a6d0083ee72a2920014fbe252970bff43ca75d
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/scatter_nd/8bc8b7d8beb3483c48158739791e56b0 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/scatter_nd/d2ef31d47578e9de8323bb0e4806f1be b/tensorflow/core/kernels/fuzzing/corpus/scatter_nd/d2ef31d47578e9de8323bb0e4806f1be
new file mode 100644
index 0000000000000000000000000000000000000000..c6948b6a25f2c1a4fa6de401aaeb681be9a8dbd2
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/scatter_nd/d2ef31d47578e9de8323bb0e4806f1be differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/scatter_nd/e2791edcf2c8d9f4af3678a75d43a3e4 b/tensorflow/core/kernels/fuzzing/corpus/scatter_nd/e2791edcf2c8d9f4af3678a75d43a3e4
new file mode 100644
index 0000000000000000000000000000000000000000..0e8a48e21096eb7b4f4642f754c18728e575e396
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/scatter_nd/e2791edcf2c8d9f4af3678a75d43a3e4 differ
diff --git a/tensorflow/core/kernels/fuzzing/decode_compressed_fuzz.cc b/tensorflow/core/kernels/fuzzing/decode_compressed_fuzz.cc
index 0a56f4b63f4574d3a6fc62a5d770915255b93bf3..b9fc014b868801fd0fe7299802bbc72cfa141102 100644
--- a/tensorflow/core/kernels/fuzzing/decode_compressed_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/decode_compressed_fuzz.cc
@@ -22,7 +22,7 @@ namespace fuzzing {
 class FuzzDecodeCompressed : public FuzzStringInputOp {
   void BuildGraph(const Scope& scope) override {
     auto input =
-        tensorflow::ops::Placeholder(scope.WithOpName("input1"), DT_STRING);
+        tensorflow::ops::Placeholder(scope.WithOpName("input"), DT_STRING);
     auto d1 = tensorflow::ops::DecodeCompressed(
         scope.WithOpName("d1"), input,
         tensorflow::ops::DecodeCompressed::CompressionType(""));
diff --git a/tensorflow/core/kernels/fuzzing/encode_jpeg_fuzz.cc b/tensorflow/core/kernels/fuzzing/encode_jpeg_fuzz.cc
index f5dd47a052cd098937d66394ed04c66831ee5972..09d196147c86556a3277c96dcf1a3677acb5fca0 100644
--- a/tensorflow/core/kernels/fuzzing/encode_jpeg_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/encode_jpeg_fuzz.cc
@@ -52,8 +52,7 @@ class FuzzEncodeJpeg : public FuzzSession {
     for (size_t i = 0; i < actual_pixels; i++) {
       flat_tensor(i) = data[i];
     }
-    // TODO(b/32704451): Don't just ignore the ::tensorflow::Status object!
-    RunOneInput(input_tensor).IgnoreError();
+    RunInputs({{"input", input_tensor}});
   }
 };
 
diff --git a/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc b/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc
index 5b029bf5ec0f20bb160ff7d0091d6a7fd3a627ed..f72dfb39b31ef058e85e6c8e7e71de22d5e288c9 100644
--- a/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc
@@ -30,7 +30,7 @@ class FuzzExampleProtoFastParsing : public FuzzSession {
   void BuildGraph(const Scope& scope) final {
     using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
     // The serialized proto.
-    auto input = Placeholder(scope.WithOpName("input1"), DT_STRING);
+    auto input = Placeholder(scope.WithOpName("input"), DT_STRING);
 
     auto in_expanded = ExpandDims(scope, input, Const<int>(scope, 0));
 
@@ -53,8 +53,7 @@ class FuzzExampleProtoFastParsing : public FuzzSession {
     Tensor input_tensor(tensorflow::DT_STRING, TensorShape({}));
     input_tensor.scalar<string>()() =
         string(reinterpret_cast<const char*>(data), size);
-    // TODO(b/32704451): Don't just ignore the ::tensorflow::Status object!
-    RunOneInput(input_tensor).IgnoreError();
+    RunInputs({{"input", input_tensor}});
   }
 };
 
diff --git a/tensorflow/core/kernels/fuzzing/fuzz_session.h b/tensorflow/core/kernels/fuzzing/fuzz_session.h
index 57d562ddf43142e47e5d52e4c0dfbbcbbb4bdfe0..4b036b181de127ca996251b538b983971ff12172 100644
--- a/tensorflow/core/kernels/fuzzing/fuzz_session.h
+++ b/tensorflow/core/kernels/fuzzing/fuzz_session.h
@@ -35,11 +35,11 @@ limitations under the License.
 #endif
 
 // Standard builder for hooking one placeholder to one op.
-#define SINGLE_INPUT_OP_BUILDER(dtype, opName)                           \
-  void BuildGraph(const Scope& scope) override {                         \
-    auto op_node =                                                       \
-        tensorflow::ops::Placeholder(scope.WithOpName("input1"), dtype); \
-    (void)tensorflow::ops::opName(scope.WithOpName("output"), op_node);  \
+#define SINGLE_INPUT_OP_BUILDER(dtype, opName)                          \
+  void BuildGraph(const Scope& scope) override {                        \
+    auto op_node =                                                      \
+        tensorflow::ops::Placeholder(scope.WithOpName("input"), dtype); \
+    (void)tensorflow::ops::opName(scope.WithOpName("output"), op_node); \
   }
 
 namespace tensorflow {
@@ -61,7 +61,7 @@ namespace fuzzing {
 //   SINGLE_INPUT_OP_BUILDER(DT_INT8, Identity);
 //   void FuzzImpl(const uint8_t* data, size_t size) {
 //      ... convert data and size to a Tensor, pass it to:
-//      RunOneInput(input_tensor);
+//      RunInputs({{"input", input_tensor}});
 //
 class FuzzSession {
  public:
@@ -107,15 +107,18 @@ class FuzzSession {
   }
 
   // Runs the TF session by pulling on the "output" node, attaching
-  // the supplied input_tensor to the "input1" node, and discarding
+  // the supplied input_tensor to the input node(s), and discarding
   // any returned output.
-  Status RunOneInput(const Tensor& input_tensor) {
-    return session_->Run({{"input1", input_tensor}}, {}, {"output"}, nullptr);
+  // Note: We are ignoring Status from Run here since fuzzers don't need to
+  // check it (as that will slow them down and printing/logging is useless).
+  void RunInputs(const std::vector<std::pair<string, Tensor> >& inputs) {
+    RunInputsWithStatus(inputs).IgnoreError();
   }
 
-  Status RunTwoInputs(const Tensor& input1, const Tensor& input2) {
-    return session_->Run({{"input1", input1}, {"input2", input2}}, {},
-                         {"output"}, nullptr);
+  // Same as RunInputs but don't ignore status
+  Status RunInputsWithStatus(
+      const std::vector<std::pair<string, Tensor> >& inputs) {
+    return session_->Run(inputs, {}, {"output"}, nullptr);
   }
 
   // Dispatches to FuzzImpl;  small amount of sugar to keep the code
@@ -144,8 +147,7 @@ class FuzzStringInputOp : public FuzzSession {
     Tensor input_tensor(tensorflow::DT_STRING, TensorShape({}));
     input_tensor.scalar<string>()() =
         string(reinterpret_cast<const char*>(data), size);
-    // TODO(b/32704451): Don't just ignore the ::tensorflow::Status object!
-    RunOneInput(input_tensor).IgnoreError();
+    RunInputs({{"input", input_tensor}});
   }
 };
 
diff --git a/tensorflow/core/kernels/fuzzing/identity_fuzz.cc b/tensorflow/core/kernels/fuzzing/identity_fuzz.cc
index 5c3fc4a2795430d1f8f269f42131e882106db7b0..4c1049d381b458f674cbc8f20e5b64649ff53b22 100644
--- a/tensorflow/core/kernels/fuzzing/identity_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/identity_fuzz.cc
@@ -30,9 +30,9 @@ class FuzzIdentity : public FuzzSession {
       flat_tensor(i) = data[i];
     }
 
-    Status s = RunOneInput(input_tensor);
     // Note:  For many ops, we don't care about this success -- but when
     // testing to make sure the harness actually works, it's useful.
+    Status s = RunInputsWithStatus({{"input", input_tensor}});
     if (!s.ok()) {
       LOG(ERROR) << "Execution failed: " << s.error_message();
     }
diff --git a/tensorflow/core/kernels/fuzzing/one_hot_fuzz.cc b/tensorflow/core/kernels/fuzzing/one_hot_fuzz.cc
new file mode 100644
index 0000000000000000000000000000000000000000..85cbe51ba8bd10ef904d8b27e566c0353118a3c4
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/one_hot_fuzz.cc
@@ -0,0 +1,78 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/cc/ops/array_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
+
+namespace tensorflow {
+namespace fuzzing {
+
+class FuzzOneHot : public FuzzSession {
+  void BuildGraph(const Scope& scope) override {
+    auto input =
+        tensorflow::ops::Placeholder(scope.WithOpName("input"), DT_UINT8);
+    auto depth =
+        tensorflow::ops::Placeholder(scope.WithOpName("depth"), DT_INT32);
+    auto on = tensorflow::ops::Placeholder(scope.WithOpName("on"), DT_UINT8);
+    auto off = tensorflow::ops::Placeholder(scope.WithOpName("off"), DT_UINT8);
+    (void)tensorflow::ops::OneHot(scope.WithOpName("output"), input, depth, on,
+                                  off);
+  }
+
+  void FuzzImpl(const uint8_t* data, size_t size) override {
+    int64 input_size;
+    int32 depth;
+    uint8 on, off;
+    const uint8_t* input_data;
+
+    if (size > 3) {
+      depth = static_cast<int32>(data[0]);
+      on = data[1];
+      off = data[2];
+      input_size = static_cast<int64>(size - 3);
+      input_data = data + 3;
+    } else {
+      depth = 1;
+      on = 1;
+      off = 0;
+      input_size = static_cast<int64>(size);
+      input_data = data;
+    }
+
+    Tensor input_tensor(tensorflow::DT_UINT8, TensorShape({input_size}));
+    Tensor depth_tensor(tensorflow::DT_INT32, TensorShape({}));
+    Tensor on_tensor(tensorflow::DT_UINT8, TensorShape({}));
+    Tensor off_tensor(tensorflow::DT_UINT8, TensorShape({}));
+
+    auto flat_tensor = input_tensor.flat<uint8>();
+    for (size_t i = 0; i < input_size; i++) {
+      flat_tensor(i) = input_data[i];
+    }
+    depth_tensor.scalar<int32>()() = depth;
+    on_tensor.scalar<uint8>()() = on;
+    off_tensor.scalar<uint8>()() = off;
+
+    RunInputs({{"input", input_tensor},
+               {"depth", depth_tensor},
+               {"on", on_tensor},
+               {"off", off_tensor}});
+  }
+};
+
+STANDARD_TF_FUZZ_FUNCTION(FuzzOneHot);
+
+}  // end namespace fuzzing
+}  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc b/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc
index ab6812c5f1534426da15fbe73a282ddf21d02931..0ce4206fc3c329beeeb6bf5f43eea77aebb0c8ab 100644
--- a/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc
@@ -25,7 +25,7 @@ class FuzzParseTensor : public FuzzSession {
   void BuildGraph(const Scope& scope) final {
     using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
     // The serialized proto.
-    auto input = Placeholder(scope.WithOpName("input1"), DT_STRING);
+    auto input = Placeholder(scope.WithOpName("input"), DT_STRING);
 
     (void)ParseTensor(scope.WithOpName("output"), input, DT_FLOAT);
   }
@@ -62,8 +62,7 @@ class FuzzParseTensor : public FuzzSession {
     // Now we can do the actual fuzz implementation
     Tensor input_tensor(tensorflow::DT_STRING, TensorShape({}));
     input_tensor.scalar<string>()() = as_string;
-    // TODO(b/32704451): Don't just ignore the ::tensorflow::Status object!
-    RunOneInput(input_tensor).IgnoreError();
+    RunInputs({{"input", input_tensor}});
   }
 };
 
diff --git a/tensorflow/core/kernels/fuzzing/scatter_nd_fuzz.cc b/tensorflow/core/kernels/fuzzing/scatter_nd_fuzz.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dc5e143cdf01ba8dbf6a820b9693dea69b29fb5e
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/scatter_nd_fuzz.cc
@@ -0,0 +1,136 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <vector>
+
+#include "tensorflow/cc/ops/array_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/kernels/fuzzing/fuzz_session.h"
+
+namespace tensorflow {
+namespace fuzzing {
+
+class FuzzScatterNd : public FuzzSession {
+  void BuildGraph(const Scope& scope) override {
+    auto indices =
+        tensorflow::ops::Placeholder(scope.WithOpName("indices"), DT_INT32);
+    auto updates =
+        tensorflow::ops::Placeholder(scope.WithOpName("updates"), DT_INT32);
+    auto shape =
+        tensorflow::ops::Placeholder(scope.WithOpName("shape"), DT_INT32);
+    (void)tensorflow::ops::ScatterNd(scope.WithOpName("output"), indices,
+                                     updates, shape);
+  }
+
+  void FuzzImpl(const uint8_t* data, size_t size) override {
+    // This op's runtime is heavily determined by the shape of the tensor
+    // arguments and almost not at all by the values of those tensors. Hence,
+    // the fuzzing data here is only used to determine the shape of the
+    // arguments and the output and the data of these tensors is just a constant
+    // value. Furthermore, the shape of the updates_tensor tensor is fully
+    // determined by the contents of the shape_tensor and the shape of the
+    // indices_tensor. Rather than using random values for the
+    // updates_tensor.shape and getting most of the fuzz runs stopped in the
+    // check, it's better to just create a proper update_tensor.
+    if (size < 1) {
+      return;
+    }
+
+    // First element of the data buffer gives the number of dimensions of the
+    // shape tensor.
+    size_t i;
+    size_t data_ix = 0;
+    size_t shape_dims = 1 + (data[data_ix++] % kMaxShapeDims);
+    Tensor shape_tensor(tensorflow::DT_INT32,
+                        TensorShape({static_cast<int64>(shape_dims)}));
+
+    // Check that we have enough elements left for the shape tensor
+    if (data_ix + shape_dims >= size) {
+      return;  // not enough elements, no fuzz
+    }
+
+    // Subsequent elements give the contents of the shape tensor.
+    // To not get out of memory, reduce all dimensions to at most kMaxDim
+    auto flat_shape = shape_tensor.flat<int32>();
+    for (i = 0; i < shape_dims; i++) {
+      flat_shape(i) = data[data_ix++] % kMaxDim;
+    }
+
+    // Next, we have to fill in the indices tensor. Take the next element from
+    // the buffer to represent the rank of this tensor.
+    if (data_ix >= size) {
+      return;
+    }
+    size_t indices_rank = 1 + (data[data_ix++] % kMaxIndicesRank);
+
+    // Now, read the dimensions of the indices_tensor
+    if (data_ix + indices_rank >= size) {
+      return;
+    }
+    std::vector<int64> indices_dims;
+    size_t num_indices = 1;
+    for (i = 0; i < indices_rank; i++) {
+      // Modulo kMaxDim to not request too much memory
+      int64 dim = data[data_ix++] % kMaxDim;
+      num_indices *= dim;
+      indices_dims.push_back(dim);
+    }
+    Tensor indices_tensor(tensorflow::DT_INT32, TensorShape(indices_dims));
+
+    // Rest of the buffer is used to fill in the indices_tensor
+    auto flat_indices = indices_tensor.flat<int32>();
+    for (i = 0; i < num_indices && data_ix < size; i++) {
+      flat_indices(i) = data[data_ix++];
+    }
+    for (; i < num_indices; i++) {
+      flat_indices(i) = 0;  // ensure that indices_tensor has all values
+    }
+
+    // Given the values in the shape_tensor and the dimensions of the
+    // indices_tensor, the shape of updates_tensor is fixed.
+    num_indices = 1;
+    std::vector<int64> updates_dims;
+    for (i = 0; i < indices_rank - 1; i++) {
+      updates_dims.push_back(indices_dims[i]);
+      num_indices *= indices_dims[i];
+    }
+    int64 last = indices_dims[indices_rank - 1];
+    for (i = last; i < shape_dims; i++) {
+      updates_dims.push_back(flat_shape(i));
+      num_indices *= flat_shape(i);
+    }
+    Tensor updates_tensor(tensorflow::DT_INT32, TensorShape(updates_dims));
+
+    // We don't care about the values in the updates_tensor, make them all be 1
+    auto flat_updates = updates_tensor.flat<int32>();
+    for (i = 0; i < num_indices; i++) {
+      flat_updates(i) = 1;
+    }
+
+    RunInputs({{"indices", indices_tensor},
+               {"updates", updates_tensor},
+               {"shape", shape_tensor}});
+  }
+
+ private:
+  const size_t kMaxShapeDims = 5;
+  const size_t kMaxIndicesRank = 3;
+  const size_t kMaxDim = 10;
+};
+
+STANDARD_TF_FUZZ_FUNCTION(FuzzScatterNd);
+
+}  // end namespace fuzzing
+}  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc b/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc
index 2564f8ed0303d1c80bad32181507eb678b18345b..4dbb6a71160e4c4921aec0992624f197f50963ea 100644
--- a/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc
@@ -22,16 +22,16 @@ namespace fuzzing {
 class FuzzStringSplit : public FuzzSession {
   void BuildGraph(const Scope& scope) override {
     auto input =
-        tensorflow::ops::Placeholder(scope.WithOpName("input1"), DT_STRING);
-    auto delimeter =
-        tensorflow::ops::Placeholder(scope.WithOpName("input2"), DT_STRING);
+        tensorflow::ops::Placeholder(scope.WithOpName("input"), DT_STRING);
+    auto delimiter =
+        tensorflow::ops::Placeholder(scope.WithOpName("delimiter"), DT_STRING);
     (void)tensorflow::ops::StringSplit(scope.WithOpName("output"), input,
-                                       delimeter);
+                                       delimiter);
   }
 
   void FuzzImpl(const uint8_t* data, size_t size) final {
     Tensor input_tensor(tensorflow::DT_STRING, TensorShape({}));
-    Tensor delimeter_tensor(tensorflow::DT_STRING, TensorShape({}));
+    Tensor delimiter_tensor(tensorflow::DT_STRING, TensorShape({}));
 
     if (size > 0) {
       // The spec for split is that the delimeter should be 0 or 1 characters.
@@ -42,14 +42,13 @@ class FuzzStringSplit : public FuzzSession {
       if (delim_len > size) {
         delim_len = size - 1;
       }
-      delimeter_tensor.scalar<string>()() =
+      delimiter_tensor.scalar<string>()() =
           string(reinterpret_cast<const char*>(data), delim_len);
       input_tensor.scalar<string>()() = string(
           reinterpret_cast<const char*>(data + delim_len), size - delim_len);
-    }
 
-    // TODO(b/32704451): Don't just ignore the ::tensorflow::Status object!
-    RunTwoInputs(input_tensor, delimeter_tensor).IgnoreError();
+      RunInputs({{"input", input_tensor}, {"delimiter", delimiter_tensor}});
+    }
   }
 };
 
diff --git a/tensorflow/core/kernels/fuzzing/string_split_v2_fuzz.cc b/tensorflow/core/kernels/fuzzing/string_split_v2_fuzz.cc
index 787bccc15ba3987edc64056bdad091d382b07500..f7e3da804375a6576f479a88593ddb3d457f98f6 100644
--- a/tensorflow/core/kernels/fuzzing/string_split_v2_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/string_split_v2_fuzz.cc
@@ -22,9 +22,9 @@ namespace fuzzing {
 class FuzzStringSplitV2 : public FuzzSession {
   void BuildGraph(const Scope& scope) override {
     auto input =
-        tensorflow::ops::Placeholder(scope.WithOpName("input1"), DT_STRING);
+        tensorflow::ops::Placeholder(scope.WithOpName("input"), DT_STRING);
     auto separator =
-        tensorflow::ops::Placeholder(scope.WithOpName("input2"), DT_STRING);
+        tensorflow::ops::Placeholder(scope.WithOpName("separator"), DT_STRING);
     (void)tensorflow::ops::StringSplitV2(scope.WithOpName("output"),
                                                input, separator);
   }
@@ -50,9 +50,9 @@ class FuzzStringSplitV2 : public FuzzSession {
           string(reinterpret_cast<const char*>(data), sep_len);
       input_tensor.scalar<string>()() = string(
           reinterpret_cast<const char*>(data + sep_len), size - sep_len);
-    }
 
-    RunTwoInputs(input_tensor, separator_tensor).IgnoreError();
+      RunInputs({{"input", input_tensor}, {"separator", separator_tensor}});
+    }
   }
 
  private:
diff --git a/tensorflow/core/kernels/gather_functor.h b/tensorflow/core/kernels/gather_functor.h
index 7710cf93d61eeebf25a71d99e92b6b3e9ce237c9..93bdebc00e17abb702236453c220ada1e330c5cb 100644
--- a/tensorflow/core/kernels/gather_functor.h
+++ b/tensorflow/core/kernels/gather_functor.h
@@ -18,11 +18,11 @@ limitations under the License.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/type_traits.h"
 #include "tensorflow/core/framework/variant.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/platform/prefetch.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/work_sharder.h"
diff --git a/tensorflow/core/kernels/gather_nd_op.cc b/tensorflow/core/kernels/gather_nd_op.cc
index e50b7fe3bf7fb7a32820ec6f95421cb90b506c0a..58867a34bc2361daceb99edd9a6396fe22e5b856 100644
--- a/tensorflow/core/kernels/gather_nd_op.cc
+++ b/tensorflow/core/kernels/gather_nd_op.cc
@@ -17,10 +17,10 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 
 #include "tensorflow/core/kernels/gather_nd_op.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mem.h"
diff --git a/tensorflow/core/kernels/gather_nd_op.h b/tensorflow/core/kernels/gather_nd_op.h
index 003badb74da3512124490d054cf78fad75c2404c..77c0d7717ee97c5a5a130e38c89b17d20fc8acc9 100644
--- a/tensorflow/core/kernels/gather_nd_op.h
+++ b/tensorflow/core/kernels/gather_nd_op.h
@@ -18,8 +18,8 @@ limitations under the License.
 // Functor definition for GatherOp, must be compilable by nvcc.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/kernels/gather_nd_op_cpu_impl.h b/tensorflow/core/kernels/gather_nd_op_cpu_impl.h
index 1c78de253e702f5e546467bbed0758c24dbe0443..cf9817dc3060be9e9325d04637e89e147ce143c1 100644
--- a/tensorflow/core/kernels/gather_nd_op_cpu_impl.h
+++ b/tensorflow/core/kernels/gather_nd_op_cpu_impl.h
@@ -22,10 +22,10 @@ limitations under the License.
 
 #include <atomic>
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/gather_nd_op.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mem.h"
diff --git a/tensorflow/core/kernels/gather_op.cc b/tensorflow/core/kernels/gather_op.cc
index 5795f68889e2393451c5cfae2fd29f14e8f9adce..b26f0a7528df979041869fa327c3c4d890eb58df 100644
--- a/tensorflow/core/kernels/gather_op.cc
+++ b/tensorflow/core/kernels/gather_op.cc
@@ -15,12 +15,12 @@ limitations under the License.
 
 // See docs in ../ops/array_ops.cc.
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/variant.h"
 #include "tensorflow/core/framework/variant_encode_decode.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/gather_functor.h"
 #include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/core/kernels/hexagon/BUILD b/tensorflow/core/kernels/hexagon/BUILD
index 87d36f22d719ade68d17c6f4a2e6dc2deeef9e45..a85de34ac262906aa0bbe2adc600505eb76dcedd 100644
--- a/tensorflow/core/kernels/hexagon/BUILD
+++ b/tensorflow/core/kernels/hexagon/BUILD
@@ -24,11 +24,18 @@ tf_cc_test(
     deps = [
         ":graph_transferer",
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:array_ops_op_lib",
+        "//tensorflow/core:bitwise_ops_op_lib",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:direct_session",
+        "//tensorflow/core:functional_ops_op_lib",
         "//tensorflow/core:lib",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:mkl_nn_ops_op_lib",
+        "//tensorflow/core:nn_ops_op_lib",
         "//tensorflow/core:ops",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:remote_fused_graph_ops_op_lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
diff --git a/tensorflow/core/kernels/image_resizer_state.h b/tensorflow/core/kernels/image_resizer_state.h
index 1d4fa1a7db11d28268063055143ccfcbc966ec5c..8078c7036a040c937f7d9d47cc259e677b391c03 100644
--- a/tensorflow/core/kernels/image_resizer_state.h
+++ b/tensorflow/core/kernels/image_resizer_state.h
@@ -28,12 +28,12 @@ limitations under the License.
 #include <array>
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/in_topk_op.cc b/tensorflow/core/kernels/in_topk_op.cc
index c37055239c28e0ab243ea30b05b2c8af0905766c..506091f76ec69f1f092b8fe0c67ea46deb851510 100644
--- a/tensorflow/core/kernels/in_topk_op.cc
+++ b/tensorflow/core/kernels/in_topk_op.cc
@@ -18,10 +18,10 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/kernel_platform_strings.h b/tensorflow/core/kernels/kernel_platform_strings.h
new file mode 100644
index 0000000000000000000000000000000000000000..9bf40c30a56577ebe21d4a4ba9bf371e30803f79
--- /dev/null
+++ b/tensorflow/core/kernels/kernel_platform_strings.h
@@ -0,0 +1,25 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Generate platform strings for libtfkernel-*
+
+#ifndef TENSORFLOW_CORE_KERNELS_KERNEL_PLATFORM_STRINGS_H_
+#define TENSORFLOW_CORE_KERNELS_KERNEL_PLATFORM_STRINGS_H_
+
+#include "tensorflow/core/platform/platform_strings.h"
+
+TF_PLATFORM_STRINGS()
+
+#endif  // TENSORFLOW_CORE_KERNELS_KERNEL_PLATFORM_STRINGS_H_
diff --git a/tensorflow/core/kernels/list_kernels.cc b/tensorflow/core/kernels/list_kernels.cc
index 42fad1d4b053f84a7f5eaae4382f0a090ba628da..9f090524abd05fedbfac26282c65424cff845165 100644
--- a/tensorflow/core/kernels/list_kernels.cc
+++ b/tensorflow/core/kernels/list_kernels.cc
@@ -79,12 +79,10 @@ static Status TensorListDeviceCopy(
   to->max_num_elements = from.max_num_elements;
   to->tensors.reserve(from.tensors.size());
   for (const Tensor& t : from.tensors) {
-    Tensor tmp(t.dtype());
-    // Do not copy uninitialized tensors.
+    to->tensors.emplace_back(t.dtype());
     if (t.dtype() != DT_INVALID) {
-      TF_RETURN_IF_ERROR(copy(t, &tmp));
+      TF_RETURN_IF_ERROR(copy(t, &to->tensors.back()));
     }
-    to->tensors.push_back(tmp);
   }
   return Status::OK();
 }
@@ -99,13 +97,6 @@ REGISTER_LIST_COPY(VariantDeviceCopyDirection::DEVICE_TO_DEVICE);
 
 REGISTER_UNARY_VARIANT_DECODE_FUNCTION(TensorList, TensorList::kTypeName);
 
-Status TensorListShape(const TensorList& t, TensorShape* s) {
-  *s = TensorShape({});
-  return Status::OK();
-}
-
-REGISTER_UNARY_VARIANT_SHAPE_FUNCTION(TensorList, TensorListShape);
-
 bool TensorList::Decode(const VariantTensorData& data) {
   // TODO(srbs): Change the signature to Decode(VariantTensorData data) so
   // that we do not have to copy each tensor individually below. This would
@@ -155,6 +146,7 @@ Status TensorShapeFromTensor(const Tensor& t, PartialTensorShape* out) {
   if (t.shape() == TensorShape({})) {
     if ((t.dtype() == DT_INT32 && t.scalar<int32>()() == -1) ||
         (t.dtype() == DT_INT64 && t.scalar<int64>()() == -1)) {
+      *out = PartialTensorShape();
       return Status::OK();
     }
     return errors::InvalidArgument(
@@ -173,6 +165,57 @@ Status TensorShapeFromTensor(const Tensor& t, PartialTensorShape* out) {
       DataTypeString(t.dtype()));
 }
 
+Status GetElementShapeFromInput(OpKernelContext* c,
+                                const TensorList& tensor_list, int index,
+                                PartialTensorShape* element_shape) {
+  TF_RETURN_IF_ERROR(TensorShapeFromTensor(c->input(index), element_shape));
+  // Check that `element_shape` and `tensor_list.element_shape` are
+  // compatible and store the merged shape in `element_shape`.
+  PartialTensorShape tmp = *element_shape;
+  TF_RETURN_IF_ERROR(tmp.MergeWith(tensor_list.element_shape, element_shape));
+  return Status::OK();
+}
+
+Status GetInputList(OpKernelContext* c, int index, const TensorList** list) {
+  if (!TensorShapeUtils::IsScalar(c->input(index).shape())) {
+    return errors::InvalidArgument("Input list must be a scalar saw: ",
+                                   c->input(index).shape().DebugString());
+  }
+  const TensorList* l = c->input(index).scalar<Variant>()().get<TensorList>();
+  if (l == nullptr) {
+    return errors::InvalidArgument(
+        "Input handle is not a list. Saw: '",
+        c->input(index).scalar<Variant>()().DebugString(), "'");
+  }
+  *list = l;
+  return Status::OK();
+}
+
+Status ForwardInputOrCreateNewList(OpKernelContext* c, int32 input_index,
+                                   int32 output_index,
+                                   const TensorList& input_list,
+                                   TensorList** output_list) {
+  // Attempt to forward the input tensor to the output if possible.
+  AllocatorAttributes attr;
+  attr.set_on_host(true);
+  std::unique_ptr<Tensor> maybe_output =
+      c->forward_input(input_index, output_index, DT_VARIANT, TensorShape{},
+                       c->input_memory_type(input_index), attr);
+  Tensor* output_tensor;
+  if (maybe_output != nullptr) {
+    // Woohoo, forwarding succeeded!
+    output_tensor = maybe_output.get();
+  } else {
+    // If forwarding is not possible allocate a new output tensor and copy
+    // the `input_list` to it.
+    TF_RETURN_IF_ERROR(
+        c->allocate_output(output_index, {}, &output_tensor, attr));
+    output_tensor->scalar<Variant>()() = input_list;
+  }
+  *output_list = output_tensor->scalar<Variant>()().get<TensorList>();
+  return Status::OK();
+}
+
 class EmptyTensorList : public OpKernel {
  public:
   explicit EmptyTensorList(OpKernelConstruction* ctx) : OpKernel(ctx) {
@@ -234,11 +277,8 @@ class TensorListPushBack : public OpKernel {
                                         " but tried to append ",
                                         DataTypeString(input.dtype())));
 
-    const TensorList* l = c->input(0).scalar<Variant>()().get<TensorList>();
-    OP_REQUIRES(c, l != nullptr,
-                errors::InvalidArgument(
-                    "Input handle is not a list. Saw: '",
-                    c->input(0).scalar<Variant>()().DebugString(), "'"));
+    const TensorList* l = nullptr;
+    OP_REQUIRES_OK(c, GetInputList(c, 0, &l));
     OP_REQUIRES(c, l->element_shape.IsCompatibleWith(input.shape()),
                 errors::InvalidArgument(
                     "Tried to append a tensor with incompatible shape to a "
@@ -259,14 +299,9 @@ class TensorListPushBack : public OpKernel {
                                   " max_num_elements: ", l->max_num_elements));
     }
 
-    TensorList output;
-    output = *l;
-    output.tensors.push_back(input);
-    Tensor* result;
-    AllocatorAttributes attr;
-    attr.set_on_host(true);
-    OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape{}, &result, attr));
-    result->scalar<Variant>()() = std::move(output);
+    TensorList* output_list = nullptr;
+    OP_REQUIRES_OK(c, ForwardInputOrCreateNewList(c, 0, 0, *l, &output_list));
+    output_list->tensors.push_back(input);
   }
 
  private:
@@ -289,12 +324,8 @@ class TensorListLength : public OpKernel {
   ~TensorListLength() override {}
 
   void Compute(OpKernelContext* c) override {
-    const TensorList* l = c->input(0).scalar<Variant>()().get<TensorList>();
-    OP_REQUIRES(
-        c, l != nullptr,
-        errors::InvalidArgument(
-            "TensorListLength received a variant which is not a list. Saw: '",
-            c->input(0).scalar<Variant>()().DebugString(), "'"));
+    const TensorList* l = nullptr;
+    OP_REQUIRES_OK(c, GetInputList(c, 0, &l));
     Tensor* result;
     OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape{}, &result));
     result->scalar<int32>()() = l->tensors.size();
@@ -317,15 +348,8 @@ class TensorListElementShape : public OpKernel {
   explicit TensorListElementShape(OpKernelConstruction* c) : OpKernel(c) {}
 
   void Compute(OpKernelContext* c) override {
-    OP_REQUIRES(
-        c, c->input(0).shape().num_elements() == 1,
-        errors::InvalidArgument("List tensors are supposed to be scalars."));
-    const TensorList* l = c->input(0).scalar<Variant>()().get<TensorList>();
-    OP_REQUIRES(c, l != nullptr,
-                errors::InvalidArgument(
-                    "TensorListElementShape received a variant which is not a "
-                    "list. Saw: '",
-                    c->input(0).scalar<Variant>()().DebugString(), "'"));
+    const TensorList* l = nullptr;
+    OP_REQUIRES_OK(c, GetInputList(c, 0, &l));
     Tensor* result;
     if (l->element_shape.unknown_rank()) {
       OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape({}), &result));
@@ -351,63 +375,6 @@ class TensorListElementShape : public OpKernel {
 REGISTER_KERNEL_BUILDER(Name("TensorListElementShape").Device(DEVICE_CPU),
                         TensorListElementShape);
 
-#if GOOGLE_CUDA
-
-REGISTER_KERNEL_BUILDER(Name("TensorListElementShape")
-                            .Device(DEVICE_GPU)
-                            .HostMemory("element_shape"),
-                        TensorListElementShape);
-
-#endif  // GOOGLE_CUDA
-
-class TensorListPopBack : public OpKernel {
- public:
-  explicit TensorListPopBack(OpKernelConstruction* c) : OpKernel(c) {
-    OP_REQUIRES_OK(c, c->GetAttr("element_dtype", &element_dtype_));
-  }
-
-  ~TensorListPopBack() override {}
-
-  void Compute(OpKernelContext* c) override {
-    const TensorList* l = c->input(0).scalar<Variant>()().get<TensorList>();
-    OP_REQUIRES(c, l != nullptr,
-                errors::InvalidArgument(
-                    "Input handle is not a list. Saw: '",
-                    c->input(0).scalar<Variant>()().DebugString(), "'"));
-    OP_REQUIRES(c, element_dtype_ == l->element_dtype,
-                errors::InvalidArgument("Invalid data types; op elements ",
-                                        DataTypeString(element_dtype_),
-                                        " but list elements ",
-                                        DataTypeString(l->element_dtype)));
-
-    OP_REQUIRES(c, !l->tensors.empty(),
-                errors::InvalidArgument("Trying to pop from an empty list."));
-
-    c->set_output(1, l->tensors.back());
-    TensorList output;
-    output = *l;
-    output.tensors.pop_back();
-    Tensor* result;
-    AllocatorAttributes attr;
-    attr.set_on_host(true);
-    OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape{}, &result, attr));
-    result->scalar<Variant>()() = std::move(output);
-  }
-
- private:
-  DataType element_dtype_;
-};
-
-REGISTER_KERNEL_BUILDER(Name("TensorListPopBack").Device(DEVICE_CPU),
-                        TensorListPopBack);
-
-#if GOOGLE_CUDA
-
-REGISTER_KERNEL_BUILDER(Name("TensorListPopBack").Device(DEVICE_GPU),
-                        TensorListPopBack);
-
-#endif  // GOOGLE_CUDA
-
 class TensorListReserve : public OpKernel {
  public:
   explicit TensorListReserve(OpKernelConstruction* c) : OpKernel(c) {
@@ -445,57 +412,58 @@ REGISTER_KERNEL_BUILDER(Name("TensorListReserve")
                         TensorListReserve);
 
 #endif  // GOOGLE_CUDA
-
-class TensorListGetItem : public OpKernel {
+class TensorListResize : public OpKernel {
  public:
-  explicit TensorListGetItem(OpKernelConstruction* c) : OpKernel(c) {
-    OP_REQUIRES_OK(c, c->GetAttr("element_dtype", &element_dtype_));
-  }
+  explicit TensorListResize(OpKernelConstruction* c) : OpKernel(c) {}
 
   void Compute(OpKernelContext* c) override {
+    const TensorList* input_list = nullptr;
+    OP_REQUIRES_OK(c, GetInputList(c, 0, &input_list));
+    int32 size = c->input(1).scalar<int32>()();
     OP_REQUIRES(
-        c, c->input(0).shape().num_elements() == 1,
-        errors::InvalidArgument("List tensors are supposed to be scalars."));
-    const TensorList* l = c->input(0).scalar<Variant>()().get<TensorList>();
-    OP_REQUIRES(c, l != nullptr,
-                errors::InvalidArgument(
-                    "Input handle is not a list. Saw: '",
-                    c->input(0).scalar<Variant>()().DebugString(), "'"));
-    OP_REQUIRES(c, element_dtype_ == l->element_dtype,
-                errors::InvalidArgument("Invalid data types; op elements ",
-                                        DataTypeString(element_dtype_),
-                                        " but list elements ",
-                                        DataTypeString(l->element_dtype)));
-    int32 index = c->input(1).scalar<int32>()();
-    OP_REQUIRES(c, index < l->tensors.size(),
-                errors::InvalidArgument("Trying to access element ", index,
-                                        " in a list with ", l->tensors.size(),
-                                        " elements."));
-    c->set_output(0, l->tensors[index]);
-  }
+        c, size >= 0,
+        errors::InvalidArgument(
+            "TensorListSlice expects size to be non-negative. Got: ", size));
 
- private:
-  DataType element_dtype_;
+    AllocatorAttributes attr;
+    attr.set_on_host(true);
+    std::unique_ptr<Tensor> maybe_result = c->forward_input(
+        0, 0, DT_VARIANT, TensorShape{}, c->input_memory_type(0), attr);
+    if (maybe_result != nullptr) {
+      maybe_result->scalar<Variant>()().get<TensorList>()->tensors.resize(
+          size, Tensor(DT_INVALID));
+    } else {
+      Tensor* result;
+      OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape{}, &result, attr));
+      TensorList output_list;
+      output_list.element_shape = input_list->element_shape;
+      output_list.element_dtype = input_list->element_dtype;
+      output_list.max_num_elements = input_list->max_num_elements;
+      if (size > input_list->tensors.size()) {
+        output_list.tensors.insert(output_list.tensors.begin(),
+                                   input_list->tensors.begin(),
+                                   input_list->tensors.end());
+        // Add DT_INVALID tensors to the end of the list if the requested size
+        // is larger than the list length.
+        output_list.tensors.resize(size, Tensor(DT_INVALID));
+      } else {
+        output_list.tensors.insert(output_list.tensors.begin(),
+                                   input_list->tensors.begin(),
+                                   input_list->tensors.begin() + size);
+      }
+      result->scalar<Variant>()() = std::move(output_list);
+    }
+  }
 };
 
-REGISTER_KERNEL_BUILDER(Name("TensorListGetItem").Device(DEVICE_CPU),
-                        TensorListGetItem);
+REGISTER_KERNEL_BUILDER(Name("TensorListResize").Device(DEVICE_CPU),
+                        TensorListResize);
 
 #if GOOGLE_CUDA
 
-#define REGISTER_TENSOR_LIST_GET_ITEM_GPU(T)                      \
-  REGISTER_KERNEL_BUILDER(Name("TensorListGetItem")               \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_GPU)                 \
-                              .HostMemory("index"),               \
-                          TensorListGetItem);
-
-TF_CALL_GPU_NUMBER_TYPES(REGISTER_TENSOR_LIST_GET_ITEM_GPU);
-TF_CALL_complex64(REGISTER_TENSOR_LIST_GET_ITEM_GPU);
-TF_CALL_complex128(REGISTER_TENSOR_LIST_GET_ITEM_GPU);
-TF_CALL_int64(REGISTER_TENSOR_LIST_GET_ITEM_GPU);
-REGISTER_TENSOR_LIST_GET_ITEM_GPU(bfloat16)
-#undef REGISTER_TENSOR_LIST_GET_ITEM_GPU
+REGISTER_KERNEL_BUILDER(
+    Name("TensorListResize").Device(DEVICE_GPU).HostMemory("size"),
+    TensorListResize);
 
 #endif  // GOOGLE_CUDA
 
@@ -506,11 +474,8 @@ class TensorListSetItem : public OpKernel {
   }
 
   void Compute(OpKernelContext* c) override {
-    const TensorList* l = c->input(0).scalar<Variant>()().get<TensorList>();
-    OP_REQUIRES(c, l != nullptr,
-                errors::InvalidArgument(
-                    "Input handle is not a list. Saw: '",
-                    c->input(0).scalar<Variant>()().DebugString(), "'"));
+    const TensorList* l = nullptr;
+    OP_REQUIRES_OK(c, GetInputList(c, 0, &l));
     OP_REQUIRES(c, element_dtype_ == l->element_dtype,
                 errors::InvalidArgument("Invalid data types; op elements ",
                                         DataTypeString(element_dtype_),
@@ -528,14 +493,9 @@ class TensorListSetItem : public OpKernel {
                     "list index. Item element shape: ",
                     value.shape().DebugString(),
                     " list shape: ", l->element_shape.DebugString()));
-    TensorList output;
-    output = *l;
-    output.tensors[index] = value;
-    Tensor* result;
-    AllocatorAttributes attr;
-    attr.set_on_host(true);
-    OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape{}, &result, attr));
-    result->scalar<Variant>()() = std::move(output);
+    TensorList* output_list = nullptr;
+    OP_REQUIRES_OK(c, ForwardInputOrCreateNewList(c, 0, 0, *l, &output_list));
+    output_list->tensors[index] = value;
   }
 
  private:
@@ -557,6 +517,7 @@ REGISTER_KERNEL_BUILDER(Name("TensorListSetItem").Device(DEVICE_CPU),
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_TENSOR_LIST_SET_ITEM_GPU);
 TF_CALL_complex64(REGISTER_TENSOR_LIST_SET_ITEM_GPU);
 TF_CALL_complex128(REGISTER_TENSOR_LIST_SET_ITEM_GPU);
+TF_CALL_int32(REGISTER_TENSOR_LIST_SET_ITEM_GPU);
 TF_CALL_int64(REGISTER_TENSOR_LIST_SET_ITEM_GPU);
 REGISTER_TENSOR_LIST_SET_ITEM_GPU(bfloat16)
 #undef REGISTER_TENSOR_LIST_SET_ITEM_GPU
@@ -656,69 +617,68 @@ REGISTER_KERNEL_BUILDER(Name("TensorListConcatLists").Device(DEVICE_GPU),
 
 #endif  // GOOGLE_CUDA
 
-#define REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(T)               \
-  REGISTER_KERNEL_BUILDER(Name("TensorListPushBackBatch")         \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_CPU),                \
+#define REGISTER_TENSOR_LIST_OPS_CPU(T)                                    \
+  REGISTER_KERNEL_BUILDER(Name("TensorListStack")                          \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_CPU),                         \
+                          TensorListStack<CPUDevice, T>)                   \
+  REGISTER_KERNEL_BUILDER(Name("TensorListGather")                         \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_CPU),                         \
+                          TensorListGather<CPUDevice, T>)                  \
+  REGISTER_KERNEL_BUILDER(Name("TensorListConcat")                         \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_CPU),                         \
+                          TensorListConcat<CPUDevice, T>)                  \
+  REGISTER_KERNEL_BUILDER(Name("TensorListConcatV2")                       \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_CPU),                         \
+                          TensorListConcat<CPUDevice, T>)                  \
+  REGISTER_KERNEL_BUILDER(Name("TensorListGetItem")                        \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_CPU),                         \
+                          TensorListGetItem<CPUDevice, T>)                 \
+  REGISTER_KERNEL_BUILDER(Name("TensorListPopBack")                        \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_CPU),                         \
+                          TensorListPopBack<CPUDevice, T>)                 \
+  REGISTER_KERNEL_BUILDER(Name("TensorListFromTensor")                     \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_CPU),                         \
+                          TensorListFromTensor<CPUDevice, T>)              \
+  REGISTER_KERNEL_BUILDER(Name("TensorListScatter")                        \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_CPU),                         \
+                          TensorListScatter<CPUDevice, T>)                 \
+  REGISTER_KERNEL_BUILDER(Name("TensorListScatterV2")                      \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_CPU),                         \
+                          TensorListScatter<CPUDevice, T>)                 \
+  REGISTER_KERNEL_BUILDER(Name("TensorListScatterIntoExistingList")        \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_CPU),                         \
+                          TensorListScatterIntoExistingList<CPUDevice, T>) \
+  REGISTER_KERNEL_BUILDER(Name("TensorListSplit")                          \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_CPU),                         \
+                          TensorListSplit<CPUDevice, T>)                   \
+  REGISTER_KERNEL_BUILDER(Name("TensorListPushBackBatch")                  \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_CPU),                         \
                           TensorListPushBackBatch<CPUDevice, T>)
 
-TF_CALL_ALL_TYPES(REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU);
-REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(quint8);
-REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(qint8);
-REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(quint16);
-REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(qint16);
-REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(qint32);
-REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(bfloat16);
-
-#undef REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU
-
-#define REGISTER_TENSOR_LIST_STACK_CPU(T)                         \
-  REGISTER_KERNEL_BUILDER(Name("TensorListStack")                 \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_CPU),                \
-                          TensorListStack<CPUDevice, T>)          \
-  REGISTER_KERNEL_BUILDER(Name("TensorListGather")                \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_CPU),                \
-                          TensorListGather<CPUDevice, T>)         \
-  REGISTER_KERNEL_BUILDER(Name("TensorListConcat")                \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_CPU),                \
-                          TensorListConcat<CPUDevice, T>)
-
-TF_CALL_POD_STRING_TYPES(REGISTER_TENSOR_LIST_STACK_CPU);
-REGISTER_TENSOR_LIST_STACK_CPU(quint8);
-REGISTER_TENSOR_LIST_STACK_CPU(qint8);
-REGISTER_TENSOR_LIST_STACK_CPU(quint16);
-REGISTER_TENSOR_LIST_STACK_CPU(qint16);
-REGISTER_TENSOR_LIST_STACK_CPU(qint32);
-REGISTER_TENSOR_LIST_STACK_CPU(bfloat16);
+TF_CALL_POD_STRING_TYPES(REGISTER_TENSOR_LIST_OPS_CPU);
+REGISTER_TENSOR_LIST_OPS_CPU(quint8);
+REGISTER_TENSOR_LIST_OPS_CPU(qint8);
+REGISTER_TENSOR_LIST_OPS_CPU(quint16);
+REGISTER_TENSOR_LIST_OPS_CPU(qint16);
+REGISTER_TENSOR_LIST_OPS_CPU(qint32);
+REGISTER_TENSOR_LIST_OPS_CPU(bfloat16);
+REGISTER_TENSOR_LIST_OPS_CPU(Variant);
 
-#undef REGISTER_TENSOR_LIST_STACK_CPU
+#undef REGISTER_TENSOR_LIST_OPS_CPU
 
-#define REGISTER_TENSOR_LIST_FROM_TENSOR_CPU(T)                   \
-  REGISTER_KERNEL_BUILDER(Name("TensorListFromTensor")            \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_CPU),                \
-                          TensorListFromTensor<CPUDevice, T>)     \
-  REGISTER_KERNEL_BUILDER(Name("TensorListScatter")               \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_CPU),                \
-                          TensorListScatter<CPUDevice, T>)        \
-  REGISTER_KERNEL_BUILDER(Name("TensorListSplit")                 \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_CPU),                \
-                          TensorListSplit<CPUDevice, T>)
-
-TF_CALL_POD_STRING_TYPES(REGISTER_TENSOR_LIST_FROM_TENSOR_CPU);
-REGISTER_TENSOR_LIST_FROM_TENSOR_CPU(quint8);
-REGISTER_TENSOR_LIST_FROM_TENSOR_CPU(qint8);
-REGISTER_TENSOR_LIST_FROM_TENSOR_CPU(quint16);
-REGISTER_TENSOR_LIST_FROM_TENSOR_CPU(qint16);
-REGISTER_TENSOR_LIST_FROM_TENSOR_CPU(qint32);
-REGISTER_TENSOR_LIST_FROM_TENSOR_CPU(bfloat16);
-
-#undef REGISTER_TENSOR_LIST_FROM_TENSOR_CPU
+#define REGISTER_TENSOR_LIST_OPS_CPU(T)
 
 REGISTER_UNARY_VARIANT_BINARY_OP_FUNCTION(ADD_VARIANT_BINARY_OP, DEVICE_CPU,
                                           TensorList,
diff --git a/tensorflow/core/kernels/list_kernels.cu.cc b/tensorflow/core/kernels/list_kernels.cu.cc
index 23f552642cac273cf53b25a6d43e1e6ca23ea0cc..9922a92dec39708bff2ef3566b9e264cd5e73f00 100644
--- a/tensorflow/core/kernels/list_kernels.cu.cc
+++ b/tensorflow/core/kernels/list_kernels.cu.cc
@@ -36,73 +36,90 @@ namespace tensorflow {
 
 typedef Eigen::GpuDevice GPUDevice;
 
-#define REGISTER_TENSOR_LIST_STACK_GPU(T)                         \
-  REGISTER_KERNEL_BUILDER(Name("TensorListStack")                 \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_GPU),                \
-                          TensorListStack<GPUDevice, T>)          \
-  REGISTER_KERNEL_BUILDER(Name("TensorListGather")                \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_GPU)                 \
-                              .HostMemory("indices"),             \
-                          TensorListGather<GPUDevice, T>)         \
-  REGISTER_KERNEL_BUILDER(Name("TensorListConcat")                \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_GPU)                 \
-                              .HostMemory("lengths"),             \
-                          TensorListConcat<GPUDevice, T>)
-
-TF_CALL_GPU_NUMBER_TYPES(REGISTER_TENSOR_LIST_STACK_GPU);
-REGISTER_TENSOR_LIST_STACK_GPU(bfloat16);
-TF_CALL_complex64(REGISTER_TENSOR_LIST_STACK_GPU);
-TF_CALL_complex128(REGISTER_TENSOR_LIST_STACK_GPU);
-TF_CALL_int64(REGISTER_TENSOR_LIST_STACK_GPU);
-REGISTER_TENSOR_LIST_STACK_GPU(bool);
-
-#undef REGISTER_TENSOR_LIST_STACK_GPU
-
-#define REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU(T)               \
-  REGISTER_KERNEL_BUILDER(Name("TensorListPushBackBatch")         \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_GPU),                \
-                          TensorListPushBackBatch<GPUDevice, T>)
-
-TF_CALL_GPU_NUMBER_TYPES(REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU);
-REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU(bfloat16);
-TF_CALL_complex64(REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU);
-TF_CALL_complex128(REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU);
-TF_CALL_int64(REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU);
-REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU(bool);
-
-#undef REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU
-
-#define REGISTER_TENSOR_LIST_FROM_TENSOR_GPU(T)                   \
-  REGISTER_KERNEL_BUILDER(Name("TensorListFromTensor")            \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_GPU)                 \
-                              .HostMemory("element_shape"),       \
-                          TensorListFromTensor<GPUDevice, T>)     \
-  REGISTER_KERNEL_BUILDER(Name("TensorListScatter")               \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_GPU)                 \
-                              .HostMemory("element_shape")        \
-                              .HostMemory("indices"),             \
-                          TensorListScatter<GPUDevice, T>)        \
-  REGISTER_KERNEL_BUILDER(Name("TensorListSplit")                 \
-                              .TypeConstraint<T>("element_dtype") \
-                              .Device(DEVICE_GPU)                 \
-                              .HostMemory("element_shape")        \
-                              .HostMemory("lengths"),             \
+#define REGISTER_TENSOR_LIST_OPS_GPU(T)                                    \
+  REGISTER_KERNEL_BUILDER(Name("TensorListStack")                          \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_GPU)                          \
+                              .HostMemory("element_shape"),                \
+                          TensorListStack<GPUDevice, T>)                   \
+  REGISTER_KERNEL_BUILDER(Name("TensorListGather")                         \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_GPU)                          \
+                              .HostMemory("indices")                       \
+                              .HostMemory("element_shape"),                \
+                          TensorListGather<GPUDevice, T>)                  \
+  REGISTER_KERNEL_BUILDER(Name("TensorListGetItem")                        \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_GPU)                          \
+                              .HostMemory("index")                         \
+                              .HostMemory("element_shape"),                \
+                          TensorListGetItem<GPUDevice, T>)                 \
+  REGISTER_KERNEL_BUILDER(Name("TensorListPopBack")                        \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_GPU)                          \
+                              .HostMemory("element_shape"),                \
+                          TensorListPopBack<GPUDevice, T>)                 \
+  REGISTER_KERNEL_BUILDER(Name("TensorListConcat")                         \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_GPU)                          \
+                              .HostMemory("lengths"),                      \
+                          TensorListConcat<GPUDevice, T>)                  \
+  REGISTER_KERNEL_BUILDER(Name("TensorListConcatV2")                       \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_GPU)                          \
+                              .HostMemory("leading_dims")                  \
+                              .HostMemory("element_shape")                 \
+                              .HostMemory("lengths"),                      \
+                          TensorListConcat<GPUDevice, T>)                  \
+  REGISTER_KERNEL_BUILDER(Name("TensorListPushBackBatch")                  \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_GPU),                         \
+                          TensorListPushBackBatch<GPUDevice, T>)           \
+  REGISTER_KERNEL_BUILDER(Name("TensorListFromTensor")                     \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_GPU)                          \
+                              .HostMemory("element_shape"),                \
+                          TensorListFromTensor<GPUDevice, T>)              \
+  REGISTER_KERNEL_BUILDER(Name("TensorListScatter")                        \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_GPU)                          \
+                              .HostMemory("element_shape")                 \
+                              .HostMemory("indices"),                      \
+                          TensorListScatter<GPUDevice, T>)                 \
+  REGISTER_KERNEL_BUILDER(Name("TensorListScatterV2")                      \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_GPU)                          \
+                              .HostMemory("element_shape")                 \
+                              .HostMemory("num_elements")                  \
+                              .HostMemory("indices"),                      \
+                          TensorListScatter<GPUDevice, T>)                 \
+  REGISTER_KERNEL_BUILDER(Name("TensorListScatterIntoExistingList")        \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_GPU)                          \
+                              .HostMemory("indices"),                      \
+                          TensorListScatterIntoExistingList<GPUDevice, T>) \
+  REGISTER_KERNEL_BUILDER(Name("TensorListSplit")                          \
+                              .TypeConstraint<T>("element_dtype")          \
+                              .Device(DEVICE_GPU)                          \
+                              .HostMemory("element_shape")                 \
+                              .HostMemory("lengths"),                      \
                           TensorListSplit<GPUDevice, T>)
 
-TF_CALL_GPU_NUMBER_TYPES(REGISTER_TENSOR_LIST_FROM_TENSOR_GPU);
-REGISTER_TENSOR_LIST_FROM_TENSOR_GPU(bfloat16);
-TF_CALL_complex64(REGISTER_TENSOR_LIST_FROM_TENSOR_GPU);
-TF_CALL_complex128(REGISTER_TENSOR_LIST_FROM_TENSOR_GPU);
-TF_CALL_int64(REGISTER_TENSOR_LIST_FROM_TENSOR_GPU);
-REGISTER_TENSOR_LIST_FROM_TENSOR_GPU(bool);
-
-#undef REGISTER_TENSOR_LIST_FROM_TENSOR_GPU
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_TENSOR_LIST_OPS_GPU);
+REGISTER_TENSOR_LIST_OPS_GPU(bfloat16);
+TF_CALL_complex64(REGISTER_TENSOR_LIST_OPS_GPU);
+TF_CALL_complex128(REGISTER_TENSOR_LIST_OPS_GPU);
+TF_CALL_int32(REGISTER_TENSOR_LIST_OPS_GPU);
+TF_CALL_int64(REGISTER_TENSOR_LIST_OPS_GPU);
+REGISTER_TENSOR_LIST_OPS_GPU(bool);
+
+#undef REGISTER_TENSOR_LIST_OPS_GPU
+
+REGISTER_KERNEL_BUILDER(Name("TensorListPopBack")
+                            .TypeConstraint<Variant>("element_dtype")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("element_shape"),
+                        TensorListPopBack<GPUDevice, Variant>)
 
 REGISTER_UNARY_VARIANT_BINARY_OP_FUNCTION(ADD_VARIANT_BINARY_OP, DEVICE_GPU,
                                           TensorList,
diff --git a/tensorflow/core/kernels/list_kernels.h b/tensorflow/core/kernels/list_kernels.h
index 686679474c40dc922683786cdfe65ffb3fbc03e2..682ea15caf94ccdca1e49e7721d3c227f0f1a4fc 100644
--- a/tensorflow/core/kernels/list_kernels.h
+++ b/tensorflow/core/kernels/list_kernels.h
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/framework/variant.h"
 #include "tensorflow/core/framework/variant_op_registry.h"
 #include "tensorflow/core/kernels/concat_lib.h"
+#include "tensorflow/core/kernels/fill_functor.h"
 #include "tensorflow/core/lib/core/coding.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
@@ -65,6 +66,17 @@ struct TensorList {
 
 Status TensorShapeFromTensor(const Tensor& t, PartialTensorShape* out);
 
+Status GetElementShapeFromInput(OpKernelContext* c,
+                                const TensorList& tensor_list, int index,
+                                PartialTensorShape* element_shape);
+
+Status GetInputList(OpKernelContext* c, int index, const TensorList** list);
+
+Status ForwardInputOrCreateNewList(OpKernelContext* c, int32 input_index,
+                                   int32 output_index,
+                                   const TensorList& input_list,
+                                   TensorList** output_list);
+
 template <typename Device, typename T>
 class TensorListStack : public OpKernel {
  public:
@@ -75,27 +87,14 @@ class TensorListStack : public OpKernel {
     OP_REQUIRES_OK(c, c->GetAttr("num_elements", &num_elements_));
   }
 
-  ~TensorListStack() {}
-
   void Compute(OpKernelContext* c) override {
-    const TensorList* tensor_list =
-        c->input(0).scalar<Variant>()().get<TensorList>();
-    OP_REQUIRES(c, tensor_list != nullptr,
-                errors::InvalidArgument(
-                    "Input handle is not a list. Saw: '",
-                    c->input(0).scalar<Variant>()().DebugString(), "'"));
+    const TensorList* tensor_list = nullptr;
+    OP_REQUIRES_OK(c, GetInputList(c, 0, &tensor_list));
     OP_REQUIRES(
         c, element_dtype_ == tensor_list->element_dtype,
         errors::InvalidArgument(
             "Invalid data types; op elements ", DataTypeString(element_dtype_),
             " but list elements ", DataTypeString(tensor_list->element_dtype)));
-    OP_REQUIRES(
-        c,
-        !tensor_list->tensors.empty() ||
-            tensor_list->element_shape.IsFullyDefined(),
-        errors::InvalidArgument("Tried to stack elements of a empty ",
-                                "list with non-fully-defined shape: ",
-                                tensor_list->element_shape.DebugString()));
     if (num_elements_ != -1) {
       OP_REQUIRES(c, tensor_list->tensors.size() == num_elements_,
                   errors::InvalidArgument(
@@ -103,37 +102,66 @@ class TensorListStack : public OpKernel {
                       " elements but got a list with ",
                       tensor_list->tensors.size(), " elements."));
     }
-    // Compute the shape of the output tensor.
-    // If `element_shape` is fully-defined it gets used. It is assumed that all
-    // element tensors have the same shape.
-    // If `element_shape` is not fully-defined the shape of the first element
-    // tensor is used and it is checked that all other tensors have the same
-    // shape.
-    TensorShape resulting_shape;
-    if (!tensor_list->element_shape.AsTensorShape(&resulting_shape)) {
-      const Tensor& t = tensor_list->tensors[0];
-      resulting_shape = t.shape();
-      for (int i = 1; i < tensor_list->tensors.size(); ++i) {
+    PartialTensorShape partial_element_shape;
+    OP_REQUIRES_OK(c, GetElementShapeFromInput(c, *tensor_list, 1,
+                                               &partial_element_shape));
+    OP_REQUIRES(
+        c,
+        partial_element_shape.IsFullyDefined() || !tensor_list->tensors.empty(),
+        errors::InvalidArgument("Tried to stack elements of an empty ",
+                                "list with non-fully-defined element_shape: ",
+                                partial_element_shape.DebugString()));
+
+    // Check that `element_shape` input tensor is compatible with the shapes of
+    // element tensors.
+    if (!tensor_list->element_shape.IsFullyDefined()) {
+      for (int i = 0; i < tensor_list->tensors.size(); ++i) {
         const Tensor& t = tensor_list->tensors[i];
-        OP_REQUIRES(c, t.shape() == resulting_shape,
-                    errors::InvalidArgument(
-                        "Tried to stack tensors with unequal shapes: ",
-                        resulting_shape.DebugString(), " vs ",
-                        t.shape().DebugString()));
+        if (t.dtype() != DT_INVALID) {
+          PartialTensorShape tmp = partial_element_shape;
+          OP_REQUIRES_OK(c, tmp.MergeWith(t.shape(), &partial_element_shape));
+        }
       }
     }
-    resulting_shape.InsertDim(0, tensor_list->tensors.size());
+
+    // Compute the shape of the output tensor by pre-pending the leading dim to
+    // the element_shape.
+    TensorShape element_shape;
+    OP_REQUIRES(c, partial_element_shape.AsTensorShape(&element_shape),
+                errors::InvalidArgument(
+                    "Tried to stack list which only contains uninitialized ",
+                    "tensors and has a non-fully-defined element_shape: ",
+                    partial_element_shape.DebugString()));
+    TensorShape output_shape = element_shape;
+    output_shape.InsertDim(0, tensor_list->tensors.size());
     Tensor* output;
-    OP_REQUIRES_OK(c, c->allocate_output(0, resulting_shape, &output));
+    OP_REQUIRES_OK(c, c->allocate_output(0, output_shape, &output));
     if (output->NumElements() == 0) {
       return;
     }
 
     ConstMatrixVector inputs_flat;
     inputs_flat.reserve(tensor_list->tensors.size());
+    Tensor zeros;
     for (const auto& t : tensor_list->tensors) {
-      inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
-          t.shaped<T, 2>({1, t.NumElements()})));
+      if (t.dtype() != DT_INVALID) {
+        inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
+            t.shaped<T, 2>({1, t.NumElements()})));
+      } else {
+        if (!zeros.NumElements()) {
+          AllocatorAttributes attr;
+          if (element_dtype_ == DT_VARIANT) {
+            attr.set_on_host(true);
+          }
+          OP_REQUIRES_OK(
+              c, c->allocate_temp(element_dtype_, element_shape, &zeros, attr));
+          functor::SetZeroFunctor<Device, T>()(c->eigen_device<Device>(),
+                                               zeros.flat<T>());
+        }
+        inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
+            const_cast<const Tensor&>(zeros).shaped<T, 2>(
+                {1, zeros.NumElements()})));
+      }
     }
     auto output_flat = output->shaped<T, 2>({1, output->NumElements()});
 
@@ -151,6 +179,122 @@ class TensorListStack : public OpKernel {
   DataType element_dtype_;
 };
 
+template <typename Device, typename T>
+class TensorListGetItem : public OpKernel {
+ public:
+  explicit TensorListGetItem(OpKernelConstruction* c) : OpKernel(c) {
+    OP_REQUIRES_OK(c, c->GetAttr("element_dtype", &element_dtype_));
+  }
+
+  void Compute(OpKernelContext* c) override {
+    const TensorList* l = nullptr;
+    OP_REQUIRES_OK(c, GetInputList(c, 0, &l));
+    OP_REQUIRES(c, element_dtype_ == l->element_dtype,
+                errors::InvalidArgument("Invalid data types; op elements ",
+                                        DataTypeString(element_dtype_),
+                                        " but list elements ",
+                                        DataTypeString(l->element_dtype)));
+    int32 index = c->input(1).scalar<int32>()();
+    OP_REQUIRES(c, index < l->tensors.size(),
+                errors::InvalidArgument("Trying to access element ", index,
+                                        " in a list with ", l->tensors.size(),
+                                        " elements."));
+    if (l->tensors[index].dtype() != DT_INVALID) {
+      c->set_output(0, l->tensors[index]);
+    } else {
+      PartialTensorShape partial_element_shape;
+      OP_REQUIRES_OK(
+          c, GetElementShapeFromInput(c, *l, 2, &partial_element_shape));
+      TensorShape element_shape;
+      // If l->element_shape and the element_shape input are both not fully
+      // defined, try to infer the shape from other list elements. This requires
+      // that all initialized list elements have the same shape.
+      // NOTE(srbs): This might be a performance bottleneck since we are
+      // iterating over the entire list here. This is necessary for feature
+      // parity with TensorArray.read. TensorArray has a mode in which all
+      // elements are required to be of the same shape, TensorList does not.
+      // In that mode TensorArray sets the array's element_shape on the first
+      // write call. We could do something similar here if needed.
+      if (!partial_element_shape.IsFullyDefined()) {
+        for (const Tensor& t : l->tensors) {
+          if (t.dtype() != DT_INVALID) {
+            PartialTensorShape tmp = partial_element_shape;
+            OP_REQUIRES_OK(c, tmp.MergeWith(t.shape(), &partial_element_shape));
+          }
+        }
+      }
+      OP_REQUIRES(
+          c, partial_element_shape.AsTensorShape(&element_shape),
+          errors::InvalidArgument("Trying to read an uninitialized tensor but ",
+                                  "element_shape is not fully defined: ",
+                                  partial_element_shape.DebugString(),
+                                  " and no list element is set."));
+      Tensor* result;
+      AllocatorAttributes attr;
+      if (element_dtype_ == DT_VARIANT) {
+        attr.set_on_host(true);
+      }
+      OP_REQUIRES_OK(c, c->allocate_output(0, element_shape, &result, attr));
+      functor::SetZeroFunctor<Device, T>()(c->eigen_device<Device>(),
+                                           result->flat<T>());
+    }
+  }
+
+ private:
+  DataType element_dtype_;
+};
+
+template <typename Device, typename T>
+class TensorListPopBack : public OpKernel {
+ public:
+  explicit TensorListPopBack(OpKernelConstruction* c) : OpKernel(c) {
+    OP_REQUIRES_OK(c, c->GetAttr("element_dtype", &element_dtype_));
+  }
+
+  void Compute(OpKernelContext* c) override {
+    const TensorList* l = nullptr;
+    OP_REQUIRES_OK(c, GetInputList(c, 0, &l));
+    OP_REQUIRES(c, element_dtype_ == l->element_dtype,
+                errors::InvalidArgument("Invalid data types; op elements ",
+                                        DataTypeString(element_dtype_),
+                                        " but list elements ",
+                                        DataTypeString(l->element_dtype)));
+
+    OP_REQUIRES(c, !l->tensors.empty(),
+                errors::InvalidArgument("Trying to pop from an empty list."));
+
+    const Tensor& t = l->tensors.back();
+    if (t.dtype() != DT_INVALID) {
+      c->set_output(1, t);
+    } else {
+      PartialTensorShape partial_element_shape;
+      OP_REQUIRES_OK(
+          c, GetElementShapeFromInput(c, *l, 1, &partial_element_shape));
+      TensorShape element_shape;
+      OP_REQUIRES(
+          c, partial_element_shape.AsTensorShape(&element_shape),
+          errors::InvalidArgument("Trying to read an uninitialized tensor but ",
+                                  "element_shape is not fully defined.",
+                                  partial_element_shape.DebugString()));
+      Tensor* result;
+      AllocatorAttributes attr;
+      if (element_dtype_ == DT_VARIANT) {
+        attr.set_on_host(true);
+      }
+      OP_REQUIRES_OK(c, c->allocate_output(1, element_shape, &result, attr));
+      functor::SetZeroFunctor<Device, T>()(c->eigen_device<Device>(),
+                                           result->flat<T>());
+    }
+
+    TensorList* output_list = nullptr;
+    OP_REQUIRES_OK(c, ForwardInputOrCreateNewList(c, 0, 0, *l, &output_list));
+    output_list->tensors.pop_back();
+  }
+
+ private:
+  DataType element_dtype_;
+};
+
 template <typename Device, typename T>
 class TensorListConcat : public OpKernel {
  public:
@@ -158,74 +302,106 @@ class TensorListConcat : public OpKernel {
       std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>;
   explicit TensorListConcat(OpKernelConstruction* c) : OpKernel(c) {
     OP_REQUIRES_OK(c, c->GetAttr("element_dtype", &element_dtype_));
+    // TODO(skyewm): the HasAttr check can be removed once the
+    // element_shape_except_first_dim attr has been checked in for 2 weeks
+    // (around 1/14/2019).
+    if (c->HasAttr("element_shape")) {
+      PartialTensorShape element_shape;
+      OP_REQUIRES_OK(c, c->GetAttr("element_shape", &element_shape));
+      if (!element_shape.unknown_rank()) {
+        element_shape_except_first_dim_ = PartialTensorShape(
+            gtl::ArraySlice<int64>(element_shape.dim_sizes()).subspan(1));
+      }
+    }
   }
 
-  ~TensorListConcat() {}
-
   void Compute(OpKernelContext* c) override {
     // Check that the input Variant tensor is indeed a TensorList and has the
     // correct element type.
-    const TensorList* tensor_list =
-        c->input(0).scalar<Variant>()().get<TensorList>();
-    OP_REQUIRES(c, tensor_list != nullptr,
-                errors::InvalidArgument(
-                    "Input handle is not a list. Saw: '",
-                    c->input(0).scalar<Variant>()().DebugString(), "'"));
+    const TensorList* tensor_list = nullptr;
+    OP_REQUIRES_OK(c, GetInputList(c, 0, &tensor_list));
     OP_REQUIRES(
         c, element_dtype_ == tensor_list->element_dtype,
         errors::InvalidArgument(
             "Invalid data types; op elements ", DataTypeString(element_dtype_),
             " but list elements ", DataTypeString(tensor_list->element_dtype)));
-    // If the TensorList is empty, its element_shape must be fully defined
-    // except for the first dimension.
-    PartialTensorShape shape_except_first_dim;
-    if (!tensor_list->element_shape.unknown_rank()) {
-      OP_REQUIRES(c, tensor_list->element_shape.dims() >= 1,
+    // The leading dimension of all list elements if they are all the same.
+    // This is used as the leading dim of uninitialized tensors in the list
+    // if leading_dims is not provided.
+    int64 first_dim = -1;
+    if (c->num_inputs() > 1) {
+      // TensorListConcatV2
+      PartialTensorShape element_shape;
+      OP_REQUIRES_OK(
+          c, GetElementShapeFromInput(c, *tensor_list, 1, &element_shape));
+      OP_REQUIRES(c, element_shape.unknown_rank() || element_shape.dims() >= 1,
                   errors::InvalidArgument(
                       "Concat requires elements to be at least vectors, ",
                       "found scalars instead."));
-      shape_except_first_dim = PartialTensorShape(
-          gtl::ArraySlice<int64>(tensor_list->element_shape.dim_sizes())
-              .subspan(1));
+      // Split `element_shape` into `first_dim` and
+      // `element_shape_except_first_dim_`.
+      first_dim = element_shape.dim_size(0);
+      element_shape_except_first_dim_ = element_shape;
+      element_shape_except_first_dim_.RemoveDim(0);
     }
+    // If the TensorList is empty, element_shape_except_first_dim_ must be fully
+    // defined.
     OP_REQUIRES(c,
                 !tensor_list->tensors.empty() ||
-                    shape_except_first_dim.IsFullyDefined(),
+                    element_shape_except_first_dim_.IsFullyDefined(),
                 errors::InvalidArgument(
                     "All except the first dimension must be fully defined ",
                     "when concating an empty tensor list. element_shape: ",
-                    tensor_list->element_shape.DebugString()));
-    // 1. Compute the shape of the output tensor.
-    // If `shape_except_first_dim` is fully-defined we just prepend the leading
-    // dim to it. Otherwise we use the shape of the first element tensor and
-    // check to make sure shapes of all tensors are compatible.
-    TensorShape output_shape;
-    if (!shape_except_first_dim.AsTensorShape(&output_shape)) {
-      const Tensor& element_tensor = tensor_list->tensors[0];
-      OP_REQUIRES(
-          c, TensorShapeUtils::IsVectorOrHigher(element_tensor.shape()),
-          errors::InvalidArgument("Concat saw a scalar shape at index ", 0,
-                                  " but requires at least vectors."));
-      output_shape =
-          TensorShape(gtl::ArraySlice<int64>(element_tensor.shape().dim_sizes())
-                          .subspan(1));
-      for (int i = 1; i < tensor_list->tensors.size(); ++i) {
-        const Tensor& element_tensor = tensor_list->tensors[i];
-        OP_REQUIRES(
-            c, TensorShapeUtils::IsVectorOrHigher(element_tensor.shape()),
-            errors::InvalidArgument("Concat saw a scalar shape at index ", i,
-                                    " but requires at least vectors."));
-        TensorShape actual_shape(
-            gtl::ArraySlice<int64>(element_tensor.shape().dim_sizes())
-                .subspan(1));
-        OP_REQUIRES(c, actual_shape.dim_sizes() == output_shape.dim_sizes(),
-                    errors::InvalidArgument(
-                        "Tried to concat tensors with unequal shapes: ",
-                        output_shape.DebugString(), " vs ",
-                        actual_shape.DebugString()));
+                    element_shape_except_first_dim_.DebugString()));
+    // 1. Check that `element_shape_except_first_dim_` input tensor is
+    //    compatible with the shapes of element tensors.
+    // 2. Check that the elements have the same shape except the first dim.
+    // 3. If `first_dim` is known, check that it is compatible with the leading
+    //    dims of all elements.
+    // 4. If `first_dim` is unknown (-1), check whether all initialized
+    //    elements have the same leading dim and if so set `first_dim` to that
+    //    value.
+    if (!tensor_list->element_shape.IsFullyDefined()) {
+      bool check_dim = (first_dim == -1);
+      int64 inferred_first_dim = first_dim;
+      for (int i = 0; i < tensor_list->tensors.size(); ++i) {
+        const Tensor& t = tensor_list->tensors[i];
+        if (t.dtype() != DT_INVALID) {
+          PartialTensorShape tmp = element_shape_except_first_dim_;
+          OP_REQUIRES(
+              c, TensorShapeUtils::IsVectorOrHigher(t.shape()),
+              errors::InvalidArgument("Concat saw a scalar shape at index ", i,
+                                      " but requires at least vectors."));
+          TensorShape shape_except_first_dim = TensorShape(
+              gtl::ArraySlice<int64>(t.shape().dim_sizes()).subspan(1));
+          OP_REQUIRES_OK(c, tmp.MergeWith(shape_except_first_dim,
+                                          &element_shape_except_first_dim_));
+          OP_REQUIRES(c, first_dim == -1 || first_dim == t.shape().dim_size(0),
+                      errors::InvalidArgument(
+                          "First entry of element_shape input does not match ",
+                          "the first dim of list element at index: ", i,
+                          " Expected: ", first_dim,
+                          " Actual: ", t.shape().dim_size(0)));
+          if (check_dim) {
+            if (inferred_first_dim == -1) {
+              inferred_first_dim = t.shape().dim_size(0);
+            } else if (inferred_first_dim != t.shape().dim_size(0)) {
+              inferred_first_dim = -1;
+              check_dim = false;
+            }
+          }
+        }
       }
+      first_dim = inferred_first_dim;
     }
-    // 2. Build the lengths_tensor and leading dim of the output tensor by
+    TensorShape output_shape;
+    OP_REQUIRES(
+        c, element_shape_except_first_dim_.AsTensorShape(&output_shape),
+        errors::InvalidArgument(
+            "Trying to concat list with only uninitialized tensors ",
+            "but element_shape_except_first_dim_ is not fully defined: ",
+            element_shape_except_first_dim_.DebugString()));
+    // Build the lengths_tensor and leading dim of the output tensor by
     // iterating over all element tensors.
     Tensor* lengths_tensor = nullptr;
     OP_REQUIRES_OK(
@@ -236,13 +412,36 @@ class TensorListConcat : public OpKernel {
     auto lengths_tensor_vec = lengths_tensor->vec<int64>();
     int64 leading_dim = 0;
     for (size_t i = 0; i < tensor_list->tensors.size(); i++) {
-      int64 dim = tensor_list->tensors[i].shape().dim_size(0);
+      int64 dim;
+      if (tensor_list->tensors[i].dtype() != DT_INVALID) {
+        dim = tensor_list->tensors[i].shape().dim_size(0);
+      } else {
+        // If leading_dims is not provided or does not contain an entry for
+        // index i use the inferred `first_dim` if set.
+        if ((c->num_inputs() <= 2 || i >= c->input(2).NumElements()) &&
+            first_dim != -1) {
+          dim = first_dim;
+        } else {
+          OP_REQUIRES(c, c->num_inputs() > 2,
+                      errors::InvalidArgument(
+                          "Concating lists with uninitialized tensors is not ",
+                          "supported in this version of TensorListConcat. ",
+                          "Consider updating your GraphDef to run the newer ",
+                          "version."));
+          OP_REQUIRES(c, i < c->input(2).NumElements(),
+                      errors::InvalidArgument(
+                          "List contains uninitialized tensor at index ", i,
+                          " but leading_dims has only ",
+                          c->input(2).NumElements(), " elements."));
+          dim = c->input(2).vec<int64>()(i);
+        }
+      }
       leading_dim += dim;
       lengths_tensor_vec(i) = dim;
     }
     output_shape.InsertDim(0, leading_dim);
     Tensor* output;
-    // 3. Allocate the output tensor and fill it up with the concated element
+    // Allocate the output tensor and fill it up with the concated element
     // tensors.
     OP_REQUIRES_OK(c, c->allocate_output(0, output_shape, &output));
     if (output->NumElements() == 0) {
@@ -251,9 +450,31 @@ class TensorListConcat : public OpKernel {
 
     ConstMatrixVector inputs_flat;
     inputs_flat.reserve(tensor_list->tensors.size());
-    for (const auto& element_tensor : tensor_list->tensors) {
-      inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
-          element_tensor.shaped<T, 2>({1, element_tensor.NumElements()})));
+    // Store the zeros tensors in a vector to prevent them from being GC'ed till
+    // concat is complete.
+    std::vector<Tensor> zeros_vec;
+    for (int i = 0; i < tensor_list->tensors.size(); i++) {
+      const Tensor& element_tensor = tensor_list->tensors[i];
+      if (element_tensor.dtype() != DT_INVALID) {
+        inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
+            element_tensor.shaped<T, 2>({1, element_tensor.NumElements()})));
+      } else {
+        AllocatorAttributes attr;
+        if (element_dtype_ == DT_VARIANT) {
+          attr.set_on_host(true);
+        }
+        TensorShape element_shape = output_shape;
+        element_shape.set_dim(0, lengths_tensor_vec(i));
+        zeros_vec.emplace_back();
+        Tensor& zeros = zeros_vec.back();
+        OP_REQUIRES_OK(
+            c, c->allocate_temp(element_dtype_, element_shape, &zeros, attr));
+        functor::SetZeroFunctor<Device, T>()(c->eigen_device<Device>(),
+                                             zeros.flat<T>());
+        inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
+            const_cast<const Tensor&>(zeros).shaped<T, 2>(
+                {1, zeros.NumElements()})));
+      }
     }
     auto output_flat = output->shaped<T, 2>({1, output->NumElements()});
 
@@ -268,6 +489,7 @@ class TensorListConcat : public OpKernel {
 
  private:
   DataType element_dtype_;
+  PartialTensorShape element_shape_except_first_dim_;
 };
 
 template <typename Device, typename T>
@@ -355,59 +577,55 @@ class TensorListGather : public OpKernel {
   }
 
   void Compute(OpKernelContext* c) override {
-    const TensorList* tensor_list =
-        c->input(0).scalar<Variant>()().get<TensorList>();
-    OP_REQUIRES(c, tensor_list != nullptr,
-                errors::InvalidArgument(
-                    "Input handle is not a list. Saw: '",
-                    c->input(0).scalar<Variant>()().DebugString(), "'"));
+    const TensorList* tensor_list = nullptr;
+    OP_REQUIRES_OK(c, GetInputList(c, 0, &tensor_list));
     OP_REQUIRES(
         c, element_dtype_ == tensor_list->element_dtype,
         errors::InvalidArgument(
             "Invalid data types; op elements ", DataTypeString(element_dtype_),
             " but list elements ", DataTypeString(tensor_list->element_dtype)));
-    Tensor indices = c->input(1);
+    const Tensor& indices = c->input(1);
+    PartialTensorShape partial_element_shape;
+    OP_REQUIRES_OK(c, GetElementShapeFromInput(c, *tensor_list, 2,
+                                               &partial_element_shape));
     OP_REQUIRES(
-        c,
-        indices.NumElements() > 0 ||
-            tensor_list->element_shape.IsFullyDefined(),
+        c, partial_element_shape.IsFullyDefined() || indices.NumElements() > 0,
         errors::InvalidArgument("Tried to gather 0-elements from "
                                 "a list with non-fully-defined shape: ",
-                                tensor_list->element_shape.DebugString()));
-    // Compute the shape of the output tensor.
-    // If `element_shape` is fully-defined it gets used. It is assumed that all
-    // requested tensors have the same shape.
-    // If `element_shape` is not fully-defined the shape of the first requested
-    // tensor is used and it is checked that all other tensors have the same
-    // shape.
-    TensorShape resulting_shape;
-    if (!tensor_list->element_shape.AsTensorShape(&resulting_shape)) {
-      const int i = indices.flat<int32>()(0);
-      OP_REQUIRES(
-          c, i < tensor_list->tensors.size(),
-          errors::InvalidArgument("Index ", i, " out o range; list only has ",
-                                  tensor_list->tensors.size(), " elements."));
-      const Tensor& t = tensor_list->tensors[i];
-      resulting_shape = t.shape();
-      for (int index = 1; index < indices.NumElements(); ++index) {
+                                partial_element_shape.DebugString()));
+
+    // Check that `element_shape` input tensor is compatible with the shapes of
+    // element tensors.
+    if (!tensor_list->element_shape.IsFullyDefined()) {
+      for (int index = 0; index < indices.NumElements(); ++index) {
         const int i = indices.flat<int32>()(index);
         const Tensor& t = tensor_list->tensors[i];
-        OP_REQUIRES(c, t.shape() == resulting_shape,
-                    errors::InvalidArgument(
-                        "Tried to gather elements with unequal shapes: ",
-                        resulting_shape.DebugString(), " vs ",
-                        t.shape().DebugString()));
+        if (t.dtype() != DT_INVALID) {
+          PartialTensorShape tmp = partial_element_shape;
+          OP_REQUIRES_OK(c, tmp.MergeWith(t.shape(), &partial_element_shape));
+        }
       }
     }
-    resulting_shape.InsertDim(0, indices.NumElements());
+
+    // Compute the shape of the output tensor by pre-pending the leading dim to
+    // the element_shape.
+    TensorShape element_shape;
+    OP_REQUIRES(
+        c, partial_element_shape.AsTensorShape(&element_shape),
+        errors::InvalidArgument("Tried to gather uninitialized tensors from a ",
+                                "list with non-fully-defined element_shape: ",
+                                partial_element_shape.DebugString()));
+    TensorShape output_shape = element_shape;
+    output_shape.InsertDim(0, indices.NumElements());
     Tensor* output;
-    OP_REQUIRES_OK(c, c->allocate_output(0, resulting_shape, &output));
+    OP_REQUIRES_OK(c, c->allocate_output(0, output_shape, &output));
     if (output->NumElements() == 0) {
       return;
     }
 
     ConstMatrixVector inputs_flat;
-    inputs_flat.reserve(tensor_list->tensors.size());
+    inputs_flat.reserve(indices.NumElements());
+    Tensor zeros;
     for (int index = 0; index < indices.NumElements(); ++index) {
       const int i = indices.flat<int32>()(index);
       OP_REQUIRES(
@@ -415,8 +633,24 @@ class TensorListGather : public OpKernel {
           errors::InvalidArgument("Index ", i, " out o range; list only has ",
                                   tensor_list->tensors.size(), " elements."));
       const Tensor& t = tensor_list->tensors[i];
-      inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
-          t.shaped<T, 2>({1, t.NumElements()})));
+      if (t.dtype() != DT_INVALID) {
+        inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
+            t.shaped<T, 2>({1, t.NumElements()})));
+      } else {
+        if (!zeros.NumElements()) {
+          AllocatorAttributes attr;
+          if (element_dtype_ == DT_VARIANT) {
+            attr.set_on_host(true);
+          }
+          OP_REQUIRES_OK(
+              c, c->allocate_temp(element_dtype_, element_shape, &zeros, attr));
+          functor::SetZeroFunctor<Device, T>()(c->eigen_device<Device>(),
+                                               zeros.flat<T>());
+        }
+        inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
+            const_cast<const Tensor&>(zeros).shaped<T, 2>(
+                {1, zeros.NumElements()})));
+      }
     }
     auto output_flat = output->shaped<T, 2>({1, output->NumElements()});
 
@@ -478,6 +712,81 @@ class TensorListFromTensor : public OpKernel {
   }
 };
 
+// Scatters values in `value` into `list`. Assumes that `indices` are valid.
+template <typename Device, typename T>
+Status Scatter(OpKernelContext* c, const Tensor& value, const Tensor& indices,
+               TensorList* list) {
+  for (int index = 0; index < indices.NumElements(); ++index) {
+    const int i = indices.flat<int32>()(index);
+    Tensor tmp = value.Slice(index, index + 1);
+    TensorShape tmp_shape = tmp.shape();
+    tmp_shape.RemoveDim(0);
+    if (!tmp.CopyFrom(tmp, tmp_shape)) {
+      return errors::Unknown("Unexpected shape error.");
+    }
+    // TODO(apassos) maybe not always align; but weird compiler bugs seem to
+    // prevent this.
+    Tensor aligned;
+    TF_RETURN_IF_ERROR(c->allocate_temp(tmp.dtype(), tmp.shape(), &aligned));
+    // TODO(apassos) do all slices in a single kernel invocation instead of
+    // many small ones.
+    aligned.flat<T>().device(c->eigen_device<Device>()) =
+        tmp.unaligned_flat<T>();
+    std::swap(list->tensors[i], aligned);
+  }
+  return Status::OK();
+}
+
+template <typename Device, typename T>
+class TensorListScatterIntoExistingList : public OpKernel {
+ public:
+  TensorListScatterIntoExistingList(OpKernelConstruction* c) : OpKernel(c) {}
+
+  void Compute(OpKernelContext* c) override {
+    const TensorList* l = nullptr;
+    OP_REQUIRES_OK(c, GetInputList(c, 0, &l));
+    const Tensor& input_tensor = c->input(1);
+    const Tensor& indices = c->input(2);
+
+    // Check that inputs are valid.
+    OP_REQUIRES(c, input_tensor.dtype() == l->element_dtype,
+                errors::InvalidArgument(
+                    "Invalid data types; input tensor type: ",
+                    DataTypeString(input_tensor.dtype()),
+                    " list element_type: ", DataTypeString(l->element_dtype)));
+    OP_REQUIRES(c, TensorShapeUtils::IsVectorOrHigher(input_tensor.shape()),
+                errors::InvalidArgument(
+                    "Tensor must be at least a vector, but saw shape: ",
+                    input_tensor.shape().DebugString()));
+    OP_REQUIRES(c, TensorShapeUtils::IsVector(indices.shape()),
+                errors::InvalidArgument(
+                    "Expected indices to be a vector, but received shape: ",
+                    indices.shape().DebugString()));
+    OP_REQUIRES(
+        c, indices.NumElements() == input_tensor.shape().dim_size(0),
+        errors::InvalidArgument(
+            "Expected len(indices) == tensor.shape[0], but saw: ",
+            indices.NumElements(), " vs. ", input_tensor.shape().dim_size(0)));
+
+    // Resize the list if needed to accommodate all indices.
+    TensorList* output_list = nullptr;
+    OP_REQUIRES_OK(c, ForwardInputOrCreateNewList(c, 0, 0, *l, &output_list));
+    const auto indices_vec = indices.vec<int32>();
+    int32 max_index =
+        (indices.NumElements() == 0)
+            ? -1
+            : *std::max_element(indices_vec.data(),
+                                indices_vec.data() + indices.NumElements());
+    if (max_index + 1 > output_list->tensors.size()) {
+      output_list->tensors.resize(max_index + 1);
+    }
+
+    // Scatter the values.
+    OP_REQUIRES_OK(c,
+                   Scatter<Device, T>(c, input_tensor, indices, output_list));
+  }
+};
+
 template <typename Device, typename T>
 class TensorListScatter : public OpKernel {
  public:
@@ -491,6 +800,13 @@ class TensorListScatter : public OpKernel {
     Tensor indices = c->input(1);
     PartialTensorShape element_shape;
     OP_REQUIRES_OK(c, TensorShapeFromTensor(c->input(2), &element_shape));
+    // TensorListScatterV2 passes the num_elements input, TensorListScatter does
+    // not.
+    int num_elements = c->num_inputs() >= 4 ? c->input(3).scalar<int>()() : -1;
+    OP_REQUIRES(c, num_elements >= -1,
+                errors::InvalidArgument(
+                    "TensorListScatter expects num_elements >= -1, found: ",
+                    num_elements));
     TensorList output_list;
     const Tensor& input_tensor = c->input(0);
     output_list.element_dtype = input_tensor.dtype();
@@ -505,28 +821,36 @@ class TensorListScatter : public OpKernel {
                     "Specified a list with shape ", element_shape.DebugString(),
                     " from a tensor with shape ", output_shape.DebugString()));
     output_list.element_shape = element_shape;
-    output_list.tensors.reserve(indices.NumElements());
-    for (int index = 0; index < indices.NumElements(); ++index) {
-      const int i = indices.flat<int32>()(index);
-      OP_REQUIRES(c, i < input_tensor.shape().dim_size(0),
-                  errors::InvalidArgument(
-                      "Trying to scatter index ", i, " from tensor with ",
-                      input_tensor.shape().dim_size(0), " rows."));
-      Tensor tmp = input_tensor.Slice(i, i + 1);
-      TensorShape tmp_shape = tmp.shape();
-      tmp_shape.RemoveDim(0);
-      OP_REQUIRES(c, tmp.CopyFrom(tmp, tmp_shape),
-                  errors::Unknown("Unexpected shape error."));
-      // TODO(apassos) maybe not always align; but weird compiler bugs seem to
-      // prevent this.
-      Tensor aligned;
-      OP_REQUIRES_OK(c, c->allocate_temp(tmp.dtype(), tmp.shape(), &aligned));
-      // TODO(apassos) do all slices in a single kernel invocation instead of
-      // many small ondes.
-      aligned.flat<T>().device(c->eigen_device<Device>()) =
-          tmp.unaligned_flat<T>();
-      output_list.tensors.push_back(aligned);
+
+    OP_REQUIRES(c, indices.NumElements() == input_tensor.shape().dim_size(0),
+                errors::InvalidArgument(
+                    "Invalid number of rows in input tensor. Expected: ",
+                    indices.NumElements(),
+                    " Actual: ", input_tensor.shape().dim_size(0)));
+
+    // Validate indices and resize output_list.tensors to fit the highest index.
+    {
+      int highest_index = -1;
+      for (int index = 0; index < indices.NumElements(); ++index) {
+        const int i = indices.flat<int32>()(index);
+        OP_REQUIRES(
+            c, i >= 0,
+            errors::InvalidArgument(
+                "Indices in TensorListScatter must all be non-negative."));
+        OP_REQUIRES(c, num_elements == -1 || i < num_elements,
+                    errors::InvalidArgument(
+                        "TensorListScatter: Trying to scatter at index ", i,
+                        " in list with size ", num_elements));
+        if (i > highest_index) {
+          highest_index = i;
+        }
+      }
+      output_list.tensors.resize(std::max(highest_index + 1, num_elements),
+                                 Tensor(DT_INVALID));
     }
+
+    OP_REQUIRES_OK(c,
+                   Scatter<Device, T>(c, input_tensor, indices, &output_list));
     output_tensor->scalar<Variant>()() = std::move(output_list);
   }
 };
@@ -589,8 +913,6 @@ class TensorListPushBackBatch : public OpKernel {
     OP_REQUIRES_OK(c, c->GetAttr("element_dtype", &element_dtype_));
   }
 
-  ~TensorListPushBackBatch() override {}
-
   void Compute(OpKernelContext* c) override {
     const Tensor& input = c->input(1);
     OP_REQUIRES(c, element_dtype_ == input.dtype(),
diff --git a/tensorflow/core/kernels/logging_ops.cc b/tensorflow/core/kernels/logging_ops.cc
index 2599340d78a5308cbd63338db84e569f12541a4b..e611ae28b9a21d297cac179f24d343a4e5248ec9 100644
--- a/tensorflow/core/kernels/logging_ops.cc
+++ b/tensorflow/core/kernels/logging_ops.cc
@@ -13,7 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/kernels/logging_ops.h"
+
 #include <iostream>
+
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_split.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -48,6 +51,22 @@ Status AppendStringToFile(const std::string& fname, StringPiece data,
 
 }  // namespace
 
+namespace logging {
+
+typedef std::vector<void (*)(const char*)> Listeners;
+
+Listeners* GetListeners() {
+  static Listeners* listeners = new Listeners;
+  return listeners;
+}
+
+bool RegisterListener(void (*listener)(const char*)) {
+  GetListeners()->push_back(listener);
+  return true;
+}
+
+}  // end namespace logging
+
 class AssertOp : public OpKernel {
  public:
   explicit AssertOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
@@ -157,7 +176,12 @@ class PrintV2Op : public OpKernel {
       OP_REQUIRES_OK(ctx, AppendStringToFile(file_path_, msg, ctx->env()));
       return;
     }
-    if (output_stream_ == "stdout") {
+    auto listeners = logging::GetListeners();
+    if (!listeners->empty()) {
+      for (auto& listener : *listeners) {
+        listener(msg.c_str());
+      }
+    } else if (output_stream_ == "stdout") {
       std::cout << msg << std::endl;
     } else if (output_stream_ == "stderr") {
       std::cerr << msg << std::endl;
diff --git a/tensorflow/core/kernels/logging_ops.h b/tensorflow/core/kernels/logging_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..92a8d63409478e7a0c162ae84361f7e2215aea46
--- /dev/null
+++ b/tensorflow/core/kernels/logging_ops.h
@@ -0,0 +1,33 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_LOGGING_OPS_H_
+#define TENSORFLOW_CORE_KERNELS_LOGGING_OPS_H_
+
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_split.h"
+
+namespace tensorflow {
+
+namespace logging {
+
+// Register a listener method to call on any printed messages.
+// Returns true if it is successfully registered.
+bool RegisterListener(void (*listener)(const char*));
+
+}  // namespace logging
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_LOGGING_OPS_H_
diff --git a/tensorflow/core/kernels/lookup_table_op.h b/tensorflow/core/kernels/lookup_table_op.h
index 9451247f2684892f4666f77128d5721be9a2baa7..b046401c0ae397682a7e0e780e15c9c9f75a7524 100644
--- a/tensorflow/core/kernels/lookup_table_op.h
+++ b/tensorflow/core/kernels/lookup_table_op.h
@@ -16,12 +16,12 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_KERNELS_LOOKUP_TABLE_OP_H_
 #define TENSORFLOW_CORE_KERNELS_LOOKUP_TABLE_OP_H_
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/lookup_interface.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/lookup_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
diff --git a/tensorflow/core/kernels/lookup_tables/BUILD b/tensorflow/core/kernels/lookup_tables/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..a25660e987ab80de58cee05551a98d0f00ea4268
--- /dev/null
+++ b/tensorflow/core/kernels/lookup_tables/BUILD
@@ -0,0 +1,89 @@
+# Description:
+#   OpKernels and resource templates for lookup tables.
+
+package(
+    default_visibility = [
+        "//tensorflow:__subpackages__",
+        "//tensorflow:internal",
+    ],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
+
+cc_library(
+    name = "resource_interface_templates",
+    hdrs = ["resource_interface_templates.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+cc_library(
+    name = "op_kernel_templates",
+    hdrs = ["op_kernel_templates.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/kernels:tensor_flag_utils",
+        "//third_party/eigen3",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/meta:type_traits",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+tf_kernel_library(
+    name = "fingerprint64_map_op_kernels",
+    srcs = [
+        "fingerprint64_map_op_kernels.cc",
+    ],
+    deps = [
+        ":op_kernel_templates",
+        ":resource_interface_templates",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+tf_kernel_library(
+    name = "flat_hash_map_op_kernels",
+    srcs = [
+        "flat_hash_map_op_kernels.cc",
+    ],
+    deps = [
+        ":op_kernel_templates",
+        ":resource_interface_templates",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/kernels:tensor_flag_utils",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+tf_kernel_library(
+    name = "generic_table_op_kernels",
+    srcs = [
+        "generic_table_op_kernels.cc",
+    ],
+    deps = [
+        ":op_kernel_templates",
+        ":resource_interface_templates",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/kernels:string_view_variant_wrapper",
+        "@com_google_absl//absl/strings",
+    ],
+)
diff --git a/tensorflow/core/kernels/lookup_tables/fingerprint64_map_op_kernels.cc b/tensorflow/core/kernels/lookup_tables/fingerprint64_map_op_kernels.cc
new file mode 100644
index 0000000000000000000000000000000000000000..36274bc6b63c6efd871f360f4234133360cf8fd1
--- /dev/null
+++ b/tensorflow/core/kernels/lookup_tables/fingerprint64_map_op_kernels.cc
@@ -0,0 +1,124 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "absl/strings/string_view.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/kernels/lookup_tables/op_kernel_templates.h"
+#include "tensorflow/core/kernels/lookup_tables/resource_interface_templates.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/fingerprint.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace tensorflow {
+namespace tables {
+
+// Map x -> (Fingerprint64(x) % num_oov_buckets) + offset.
+// num_oov_buckets and offset are node attributes provided at construction
+// time.
+template <typename KeyType, typename ValueType>
+class Fingerprint64Map final
+    : public virtual LookupInterface<ValueType*, const KeyType&>,
+      public virtual LookupWithPrefetchInterface<absl::Span<ValueType>,
+                                                 absl::Span<const KeyType>> {
+ public:
+  using key_type = KeyType;
+
+  Fingerprint64Map(int64 num_oov_buckets, int64 offset)
+      : num_oov_buckets_(num_oov_buckets), offset_(offset) {}
+
+  Status Lookup(const KeyType& key_to_find, ValueType* value) const override {
+    *value = LookupHelper(key_to_find);
+    return Status::OK();
+  }
+
+  Status Lookup(absl::Span<const KeyType> keys, absl::Span<ValueType> values,
+                int64 prefetch_lookahead) const override {
+    if (ABSL_PREDICT_FALSE(keys.size() != values.size())) {
+      return errors::InvalidArgument(
+          "keys and values do not have the same number of elements (found ",
+          keys.size(), " vs ", values.size(), ").");
+    }
+    for (size_t i = 0; i < keys.size(); ++i) {
+      values[i] = LookupHelper(keys[i]);
+    }
+    return Status::OK();
+  }
+
+  mutex* GetMutex() const override { return nullptr; }
+
+  string DebugString() const override { return __PRETTY_FUNCTION__; }
+
+ private:
+  ABSL_ATTRIBUTE_ALWAYS_INLINE ValueType
+  LookupHelper(const KeyType& key_to_find) const {
+    // This can cause a downcast.
+    return static_cast<ValueType>(Fingerprint64(key_to_find) %
+                                  num_oov_buckets_) +
+           offset_;
+  }
+
+  const int64 num_oov_buckets_;
+  const int64 offset_;
+  TF_DISALLOW_COPY_AND_ASSIGN(Fingerprint64Map);
+};
+
+template <typename Fingerprint64Map>
+struct Fingerprint64MapFactory {
+  struct Functor {
+    using resource_type = Fingerprint64Map;
+
+    static Status AllocateContainer(OpKernelContext* ctx, OpKernel* kernel,
+                                    Fingerprint64Map** container) {
+      int64 num_oov_buckets;
+      int64 offset;
+      TF_RETURN_IF_ERROR(
+          GetNodeAttr(kernel->def(), "num_oov_buckets", &num_oov_buckets));
+      TF_RETURN_IF_ERROR(GetNodeAttr(kernel->def(), "offset", &offset));
+      *container = new Fingerprint64Map(num_oov_buckets, offset);
+      return Status::OK();
+    }
+  };
+};
+
+template <typename KeyType, typename ValueType>
+using ResourceOp = ResourceConstructionOp<
+    typename Fingerprint64MapFactory<
+        Fingerprint64Map<KeyType, ValueType>>::Functor,
+    // These are the aliases.
+    LookupInterface<ValueType*, const KeyType&>,
+    LookupWithPrefetchInterface<absl::Span<ValueType>,
+                                absl::Span<const KeyType>>>;
+
+#define REGISTER_STRING_KERNEL(ValueType)                     \
+  REGISTER_KERNEL_BUILDER(                                    \
+      Name("Fingerprint64Map")                                \
+          .Device(DEVICE_CPU)                                 \
+          .TypeConstraint<Variant>("heterogeneous_key_dtype") \
+          .TypeConstraint<ValueType>("table_value_dtype"),    \
+      ResourceOp<absl::string_view, ValueType>);              \
+  REGISTER_KERNEL_BUILDER(                                    \
+      Name("Fingerprint64Map")                                \
+          .Device(DEVICE_CPU)                                 \
+          .TypeConstraint<string>("heterogeneous_key_dtype")  \
+          .TypeConstraint<ValueType>("table_value_dtype"),    \
+      ResourceOp<string, ValueType>);
+
+REGISTER_STRING_KERNEL(int32);
+REGISTER_STRING_KERNEL(int64);
+
+#undef REGISTER_STRING_KERNEL
+
+}  // namespace tables
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/lookup_tables/flat_hash_map_op_kernels.cc b/tensorflow/core/kernels/lookup_tables/flat_hash_map_op_kernels.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9c37ca87cea58d6bd72cc2b71c9fd934eae64081
--- /dev/null
+++ b/tensorflow/core/kernels/lookup_tables/flat_hash_map_op_kernels.cc
@@ -0,0 +1,275 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <type_traits>
+#include "absl/base/attributes.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/memory/memory.h"
+#include "absl/strings/string_view.h"
+#include "absl/types/span.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/lookup_tables/op_kernel_templates.h"
+#include "tensorflow/core/kernels/lookup_tables/resource_interface_templates.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/fingerprint.h"
+
+namespace tensorflow {
+namespace tables {
+
+using errors::InvalidArgument;
+
+// absl::flat_hash_map<HeterogeneousKeyType, ValueType> backed table with inline
+// fallback to x -> (Fingerprint64(x) % num_oov_buckets) + offset when looked
+// up keys are not in the flat_hash_map. Inlining the fallback table turns out
+// to be quite efficient in comparison to virtual dispatch for the fallback
+// lookup.
+template <typename ValueType>
+class StaticStringFlatHashMap final
+    : public virtual LookupInterface<ValueType*, const absl::string_view&>,
+      public virtual LookupInterface<ValueType*, const string&>,
+      public virtual LookupWithPrefetchInterface<
+          absl::Span<ValueType>, absl::Span<const absl::string_view>>,
+      public virtual LookupWithPrefetchInterface<absl::Span<ValueType>,
+                                                 absl::Span<const string>>,
+      public virtual KeyValueTableInitializerInterface<
+          absl::Span<const ValueType>, absl::Span<const absl::string_view>>,
+      public virtual KeyValueTableInitializerInterface<
+          absl::Span<const ValueType>, absl::Span<const string>>,
+      public virtual SizeInterface {
+ public:
+  using value_type = ValueType;
+
+  StaticStringFlatHashMap(bool enable_synchronization, int64 num_oov_buckets)
+      : num_oov_buckets_(num_oov_buckets) {
+    if (enable_synchronization) {
+      mutex_ = absl::make_unique<mutex>();
+    }
+  }
+
+  Status Initialize(absl::Span<const absl::string_view> keys,
+                    absl::Span<const ValueType> values) override {
+    if (ABSL_PREDICT_FALSE(keys.size() != values.size())) {
+      return errors::InvalidArgument(
+          "keys and values do not have the same number of elements (found ",
+          keys.size(), " vs ", values.size(), ").");
+    }
+
+    table_.reserve(table_.size() + keys.size());
+    for (size_t i = 0; i < keys.size(); ++i) {
+      table_.insert_or_assign(string(keys[i]), values[i]);
+    }
+    return Status::OK();
+  }
+
+  Status Initialize(absl::Span<const string> keys,
+                    absl::Span<const ValueType> values) override {
+    if (ABSL_PREDICT_FALSE(keys.size() != values.size())) {
+      return errors::InvalidArgument(
+          "keys and values do not have the same number of elements (found ",
+          keys.size(), " vs ", values.size(), ").");
+    }
+
+    table_.reserve(table_.size() + keys.size());
+    for (size_t i = 0; i < keys.size(); ++i) {
+      table_.insert_or_assign(keys[i], values[i]);
+    }
+    return Status::OK();
+  }
+
+  Status Lookup(const absl::string_view& key, ValueType* value) const override {
+    *value = LookupHelper(key);
+    return Status::OK();
+  }
+
+  Status Lookup(const string& key, ValueType* value) const override {
+    *value = LookupHelper(key);
+    return Status::OK();
+  }
+
+  // keys and values are guaranteed to have the same size by convention.
+  Status Lookup(absl::Span<const absl::string_view> keys,
+                absl::Span<ValueType> values,
+                int64 prefetch_lookahead) const override {
+    const auto keys_size = keys.size();
+    if (prefetch_lookahead <= 0 || prefetch_lookahead >= keys_size) {
+      for (size_t i = 0; i < keys_size; ++i) {
+        values[i] = LookupHelper(keys[i]);
+      }
+    } else {
+      for (size_t i = 0; i < keys_size; ++i) {
+        if (i + prefetch_lookahead < keys.size()) {
+          table_.prefetch(keys[i + prefetch_lookahead]);
+        }
+        values[i] = LookupHelper(keys[i]);
+      }
+    }
+    return Status::OK();
+  }
+
+  // keys and values are guaranteed to have the same size by convention.
+  Status Lookup(absl::Span<const string> keys, absl::Span<ValueType> values,
+                int64 prefetch_lookahead) const override {
+    const auto keys_size = keys.size();
+    if (prefetch_lookahead <= 0 || prefetch_lookahead >= keys_size) {
+      for (size_t i = 0; i < keys_size; ++i) {
+        values[i] = LookupHelper(keys[i]);
+      }
+    } else {
+      for (size_t i = 0; i < keys_size; ++i) {
+        if (i + prefetch_lookahead < keys.size()) {
+          table_.prefetch(keys[i + prefetch_lookahead]);
+        }
+        values[i] = LookupHelper(keys[i]);
+      }
+    }
+    return Status::OK();
+  }
+
+  uint64 Size() const override { return table_.size(); }
+
+  mutex* GetMutex() const override { return mutex_.get(); }
+
+  string DebugString() const override { return __PRETTY_FUNCTION__; }
+
+ private:
+  template <typename T>
+  ABSL_ATTRIBUTE_ALWAYS_INLINE ValueType
+  LookupHelper(const T& key_to_find) const {
+    auto it = table_.find(key_to_find);
+    if (it != table_.end()) {
+      return it->second;
+    } else {
+      return static_cast<ValueType>(Fingerprint64(key_to_find) %
+                                    num_oov_buckets_) +
+             StaticStringFlatHashMap::Size();
+    }
+  }
+
+  const int64 num_oov_buckets_;
+  std::unique_ptr<mutex> mutex_;
+  // The underlying table.
+  absl::flat_hash_map<string, ValueType> table_;
+  TF_DISALLOW_COPY_AND_ASSIGN(StaticStringFlatHashMap);
+};
+
+// Used to allocate StaticStringFlatHashMap objects via the AllocateContainer
+// method.
+template <typename StaticStringFlatHashMap>
+struct StaticStringFlatHashMapFactory {
+  struct Functor {
+    using resource_type = StaticStringFlatHashMap;
+
+    template <typename StaticStringFlatHashMapBase>
+    static Status AllocateContainer(OpKernelContext* ctx, OpKernel* kernel,
+                                    StaticStringFlatHashMapBase** container) {
+      OpInputList table_int64_args;
+      TF_RETURN_IF_ERROR(
+          ctx->input_list("table_int64_args", &table_int64_args));
+      const size_t variadic_arg_size = table_int64_args.size();
+      if (ABSL_PREDICT_FALSE(variadic_arg_size != 2)) {
+        return errors::InvalidArgument(
+            "table_int64_args should have 2 elements (found ",
+            variadic_arg_size,
+            "). Set the first element to 1 to enable synchronized table use "
+            "and to 0 otherwise. The second element should be "
+            "num_oov_buckets.");
+      }
+
+      const bool enable_synchronization = ctx->input(0).scalar<int64>()() != 0;
+      const int64 num_oov_buckets = ctx->input(1).scalar<int64>()();
+      if (ABSL_PREDICT_FALSE(num_oov_buckets <= 0)) {
+        return errors::InvalidArgument(
+            "num_oov_buckets must be positive. Found: ", num_oov_buckets);
+      }
+      auto* non_virtual_container =
+          new StaticStringFlatHashMap(enable_synchronization, num_oov_buckets);
+      *container = non_virtual_container;
+      const Tensor& keys = ctx->input(table_int64_args.size());
+      const Tensor& values = ctx->input(table_int64_args.size() + 1);
+      if (keys.NumElements() == 0) {
+        return Status::OK();
+      } else if (keys.dtype() == DT_STRING) {
+        return Functor::Initialize(
+            keys.flat<string>(),
+            values.flat<typename StaticStringFlatHashMap::value_type>(),
+            non_virtual_container);
+      } else if (keys.dtype() == DT_VARIANT) {
+        auto keys_flat = keys.flat<Variant>();
+        if (keys_flat(0).get<absl::string_view>() == nullptr) {
+          return errors::InvalidArgument(
+              "Variant keys tensor must have subtype absl::string_view.");
+        }
+        return Functor::Initialize(
+            keys.flat<Variant>(),
+            values.flat<typename StaticStringFlatHashMap::value_type>(),
+            non_virtual_container);
+      }
+      return errors::InvalidArgument(
+          "keys tensor must have type DT_STRING or type DT_VARIANT with "
+          "subtype absl::string_view.");
+    }
+
+    static Status Initialize(
+        const absl::Span<const string> keys,
+        const absl::Span<const typename StaticStringFlatHashMap::value_type>
+            values,
+        StaticStringFlatHashMap* container) {
+      return container->Initialize(keys, values);
+    }
+
+    static Status Initialize(
+        const absl::Span<const Variant> keys,
+        const absl::Span<const typename StaticStringFlatHashMap::value_type>
+            values,
+        StaticStringFlatHashMap* container) {
+      std::vector<typename absl::string_view> keys_vec;
+      keys_vec.reserve(keys.size());
+      for (size_t i = 0; i < keys.size(); ++i) {
+        keys_vec.push_back(*keys[i].get<absl::string_view>());
+      }
+      return container->Initialize(keys_vec, values);
+    }
+  };
+};
+
+template <typename ValueType>
+using ResourceOp = ResourceConstructionOp<
+    typename StaticStringFlatHashMapFactory<
+        StaticStringFlatHashMap<ValueType>>::Functor,
+    // These are the aliases.
+    LookupInterface<ValueType*, const absl::string_view&>,
+    LookupWithPrefetchInterface<absl::Span<ValueType>,
+                                absl::Span<const absl::string_view>>,
+    LookupInterface<ValueType*, const string&>,
+    LookupWithPrefetchInterface<absl::Span<ValueType>,
+                                absl::Span<const string>>,
+    SizeInterface>;
+
+#define REGISTER_STRING_KERNEL(table_value_dtype)                  \
+  REGISTER_KERNEL_BUILDER(                                         \
+      Name("StaticStringFlatHashMap")                              \
+          .Device(DEVICE_CPU)                                      \
+          .TypeConstraint<Variant>("heterogeneous_key_dtype")      \
+          .TypeConstraint<table_value_dtype>("table_value_dtype"), \
+      ResourceOp<table_value_dtype>);
+
+REGISTER_STRING_KERNEL(int32);
+REGISTER_STRING_KERNEL(int64);
+
+#undef REGISTER_STRING_KERNEL
+
+}  // namespace tables
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/lookup_tables/generic_table_op_kernels.cc b/tensorflow/core/kernels/lookup_tables/generic_table_op_kernels.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9bb29afd19a3fd2b03171e6a3d97555e34d3b35b
--- /dev/null
+++ b/tensorflow/core/kernels/lookup_tables/generic_table_op_kernels.cc
@@ -0,0 +1,227 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <type_traits>
+#include "absl/strings/string_view.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/kernels/lookup_tables/op_kernel_templates.h"
+#include "tensorflow/core/kernels/lookup_tables/resource_interface_templates.h"
+#include "tensorflow/core/kernels/string_view_variant_wrapper.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+namespace tables {
+
+template <typename KeyType, typename ValueType>
+struct TensorInsertFactory {
+  class Functor {
+   public:
+    // If KeyType is not 'valid' then use the value it wraps as the table key
+    // type.
+    using resource_type = InsertOrAssignInterface<
+        absl::Span<const ValueType>,
+        typename absl::conditional_t<
+            IsValidDataType<KeyType>::value, absl::Span<const KeyType>,
+            absl::Span<const typename KeyType::value_type>>>;
+
+    static Status TensorInsert(const Tensor& keys, const Tensor& values,
+                               resource_type* table) {
+      if (keys.NumElements() != values.NumElements()) {
+        return errors::InvalidArgument(
+            "OpKernel tried to map keys vector of size ", keys.NumElements(),
+            " to values vector of size ", values.NumElements());
+      }
+      return TensorInsertHelper(keys, values, table);
+    }
+
+   private:
+    // keys and *values arguments to TensorInsert must have the same number of
+    // elements. This is guaranteed above.
+
+    // 'Simple' types below are types which are natively supported in TF.
+    // Non-variant KeyType which is the same as Container::key_type.
+    // No need to static_cast.
+    template <typename SfinaeArg = KeyType>
+    static absl::enable_if_t<IsValidDataType<SfinaeArg>::value, Status>
+    TensorInsertHelper(const Tensor& keys, const Tensor& values,
+                       resource_type* table) {
+      return table->InsertOrAssign(keys.flat<KeyType>(),
+                                   values.flat<ValueType>());
+    }
+
+    // Variant KeyType; the wrapped type is convertible to
+    // Container::key_type.
+    template <typename VariantSubType = KeyType>
+    static absl::enable_if_t<!IsValidDataType<VariantSubType>::value, Status>
+    TensorInsertHelper(const Tensor& keys, const Tensor& values,
+                       resource_type* table) {
+      const auto keys_flat = keys.flat<Variant>();
+      std::vector<typename VariantSubType::value_type> keys_vec;
+      keys_vec.reserve(keys_flat.size());
+      for (size_t i = 0; i < keys_flat.size(); ++i) {
+        keys_vec.emplace_back(
+            *keys_flat(i).get<typename VariantSubType::value_type>());
+      }
+      return table->InsertOrAssign(keys_vec, values.flat<ValueType>());
+    }
+  };
+};
+
+template <typename KeyType, typename ValueType>
+using InsertOp = LookupTableInsertOp<
+    typename TensorInsertFactory<KeyType, ValueType>::Functor>;
+
+template <typename KeyType, typename ValueType>
+struct TensorLookupFactory {
+  class Functor {
+   public:
+    // If KeyType is not 'valid' then use the value it wraps as the table key
+    // type.
+    using resource_type = LookupWithPrefetchInterface<
+        absl::Span<ValueType>,
+        typename absl::conditional_t<
+            IsValidDataType<KeyType>::value, absl::Span<const KeyType>,
+            absl::Span<const typename KeyType::value_type>>>;
+
+    static Status TensorLookup(const resource_type& table, const Tensor& keys,
+                               const int64 prefetch_lookahead,
+                               const int64 num_keys_per_thread,
+                               thread::ThreadPool* threadpool, Tensor* values) {
+      if (keys.NumElements() != values->NumElements()) {
+        return errors::InvalidArgument(
+            "OpKernel tried to map keys vector of size ", keys.NumElements(),
+            " to values vector of size ", values->NumElements());
+      }
+      return TensorLookupHelper(table, keys, prefetch_lookahead,
+                                num_keys_per_thread, threadpool, values);
+    }
+
+   private:
+    // keys and *values arguments to TensorLookup must have the same number of
+    // elements. This is guaranteed above.
+
+    // 'Simple' types below are types which are natively supported in TF.
+    template <typename SfinaeArg = KeyType>
+    static absl::enable_if_t<IsValidDataType<SfinaeArg>::value, Status>
+    TensorLookupHelper(const resource_type& table, const Tensor& keys,
+                       const int64 prefetch_lookahead,
+                       const int64 num_keys_per_thread,
+                       thread::ThreadPool* threadpool, Tensor* values) {
+      const auto keys_flat = keys.flat<KeyType>();
+      auto key_span = absl::MakeSpan(keys_flat);
+      auto value_span = absl::MakeSpan(values->flat<ValueType>().data(),
+                                       values->NumElements());
+      return MultithreadedTensorLookup(table, prefetch_lookahead,
+                                       num_keys_per_thread, key_span,
+                                       value_span, threadpool);
+    }
+
+    // Non-simple KeyType. We'll try an implicit conversion to
+    // Container::key_type.
+    template <typename VariantSubType = KeyType>
+    static absl::enable_if_t<!IsValidDataType<VariantSubType>::value, Status>
+    TensorLookupHelper(const resource_type& table, const Tensor& keys,
+                       const int64 prefetch_lookahead,
+                       const int64 num_keys_per_thread,
+                       thread::ThreadPool* threadpool, Tensor* values) {
+      const auto keys_flat = keys.flat<Variant>();
+      std::vector<typename VariantSubType::value_type> keys_vec;
+      const auto keys_size = keys_flat.size();
+      keys_vec.reserve(keys_size);
+      for (size_t i = 0; i < keys_size; ++i) {
+        keys_vec.emplace_back(*keys_flat(i).get<VariantSubType>()->get());
+      }
+      absl::Span<const typename VariantSubType::value_type> key_span(keys_vec);
+      auto value_span = absl::MakeSpan(values->flat<ValueType>().data(),
+                                       values->NumElements());
+      return MultithreadedTensorLookup(table, prefetch_lookahead,
+                                       num_keys_per_thread, key_span,
+                                       value_span, threadpool);
+    }
+
+    // Wrapper around table.BatchLookup which permits sharding across cores.
+    template <typename K, typename V>
+    static Status MultithreadedTensorLookup(const resource_type& table,
+                                            int64 prefetch_lookahead,
+                                            int64 num_keys_per_thread, K keys,
+                                            V values,
+                                            thread::ThreadPool* threadpool) {
+      mutex temp_mutex;  // Protect status.
+      Status status;
+      auto lookup_keys = [&](int64 begin, int64 end) {
+        auto temp_status = table.Lookup(keys.subspan(begin, end - begin),
+                                        values.subspan(begin, end - begin),
+                                        prefetch_lookahead);
+        if (ABSL_PREDICT_FALSE(!temp_status.ok())) {
+          mutex_lock lock(temp_mutex);
+          status.Update(temp_status);
+        }
+      };
+      threadpool->TransformRangeConcurrently(
+          num_keys_per_thread /* block_size */, keys.size(), lookup_keys);
+      return status;
+    }
+  };
+};
+
+template <typename KeyType, typename ValueType>
+using LookupOp = LookupTableFindOp<
+    typename TensorLookupFactory<KeyType, ValueType>::Functor>;
+
+struct TableSizeFunctor {
+  using resource_type = SizeInterface;
+
+  static Status Size(const SizeInterface& table, uint64* size) {
+    *size = table.Size();
+    return Status::OK();
+  }
+};
+
+#define REGISTER_STRING_KERNEL(table_value_dtype)                     \
+  REGISTER_KERNEL_BUILDER(                                            \
+      Name("LookupTableInsertOrAssignOp")                             \
+          .Device(DEVICE_CPU)                                         \
+          .TypeConstraint<string>("insert_key_tensor_dtype")          \
+          .TypeConstraint<table_value_dtype>("table_value_dtype"),    \
+      InsertOp<string, table_value_dtype>);                           \
+  REGISTER_KERNEL_BUILDER(                                            \
+      Name("LookupTableInsertOrAssignOp")                             \
+          .Device(DEVICE_CPU)                                         \
+          .TypeConstraint<Variant>("insert_key_tensor_dtype")         \
+          .TypeConstraint<table_value_dtype>("table_value_dtype"),    \
+      InsertOp<StringViewVariantWrapper, table_value_dtype>);         \
+  REGISTER_KERNEL_BUILDER(                                            \
+      Name("LookupTableFindOp")                                       \
+          .Device(DEVICE_CPU)                                         \
+          .TypeConstraint<string>("lookup_key_tensor_dtype")          \
+          .TypeConstraint<table_value_dtype>("table_value_dtype"),    \
+      LookupOp<string, table_value_dtype>);                           \
+  REGISTER_KERNEL_BUILDER(                                            \
+      Name("LookupTableFindOp")                                       \
+          .Device(DEVICE_CPU)                                         \
+          .TypeConstraint<Variant>("lookup_key_tensor_dtype")         \
+          .TypeConstraint<table_value_dtype>("table_value_dtype"),    \
+      LookupOp<StringViewVariantWrapper, table_value_dtype>);         \
+  REGISTER_KERNEL_BUILDER(Name("ContainerSizeOp").Device(DEVICE_CPU), \
+                          ContainerSizeOp<TableSizeFunctor>);
+
+REGISTER_STRING_KERNEL(int32);
+REGISTER_STRING_KERNEL(int64);
+
+#undef REGISTER_STRING_KERNEL
+
+}  // namespace tables
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/lookup_tables/op_kernel_templates.h b/tensorflow/core/kernels/lookup_tables/op_kernel_templates.h
new file mode 100644
index 0000000000000000000000000000000000000000..d767ca0661e1fad285729b6b68683395908b4096
--- /dev/null
+++ b/tensorflow/core/kernels/lookup_tables/op_kernel_templates.h
@@ -0,0 +1,448 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_LOOKUP_TABLES_OP_KERNEL_TEMPLATES_H_
+#define TENSORFLOW_CORE_KERNELS_LOOKUP_TABLES_OP_KERNEL_TEMPLATES_H_
+
+#include <cstddef>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include "absl/base/thread_annotations.h"
+#include "absl/meta/type_traits.h"
+#include "absl/types/span.h"
+#include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/variant.h"
+#include "tensorflow/core/kernels/tensor_flag_utils.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/refcount.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace tables {
+
+// Create resources of type ResourceType and AliasesToRegister using
+// Functor::AllocateContainer(OpKernelConstruction*, OpKernel*,
+// ResourceType**). ResourceType = Functor::resource_type.
+// No-op for resources which have already been created.
+template <typename Functor, typename... AliasesToRegister>
+class ResourceConstructionOp : public OpKernel {
+ public:
+  explicit ResourceConstructionOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx), table_handle_set_(false) {
+    OP_REQUIRES_OK(
+        ctx, ctx->GetAttr("use_node_name_sharing", &use_node_name_sharing_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    mutex_lock l(mu_);
+
+    if (!table_handle_set_) {
+      OP_REQUIRES_OK(ctx, cinfo_.Init(ctx->resource_manager(), def(),
+                                      use_node_name_sharing_));
+    }
+
+    auto creator = [ctx,
+                    this](ResourceType** ret) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+      ResourceType* resource = nullptr;
+      auto status = Functor::AllocateContainer(ctx, this, &resource);
+      if (ABSL_PREDICT_FALSE(!status.ok())) {
+        // Ideally resource is non-null only if status is OK but we try
+        // to compensate here.
+        if (resource != nullptr) {
+          resource->Unref();
+        }
+        return status;
+      }
+      if (ctx->track_allocations()) {
+        ctx->record_persistent_memory_allocation(resource->MemoryUsed());
+      }
+      *ret = resource;
+      return Status::OK();
+    };
+
+    // Register the ResourceType alias.
+    ResourceType* resource = nullptr;
+    core::ScopedUnref unref_me(resource);
+    OP_REQUIRES_OK(
+        ctx,
+        cinfo_.resource_manager()->template LookupOrCreate<ResourceType, true>(
+            cinfo_.container(), cinfo_.name(), &resource, creator));
+
+    // Put a handle to resource in the output tensor (the other aliases will
+    // have the same handle).
+    Tensor* handle;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &handle));
+    handle->scalar<ResourceHandle>()() = MakeResourceHandle<ResourceType>(
+        ctx, cinfo_.container(), cinfo_.name());
+    table_handle_set_ = true;
+
+    // Create other alias resources.
+    Status status;
+    int dummy[sizeof...(AliasesToRegister)] = {
+        (status.Update(RegisterAlias<AliasesToRegister>(resource)), 0)...};
+    (void)dummy;
+    OP_REQUIRES_OK(ctx, status);
+  }
+
+  ~ResourceConstructionOp() override {
+    // If the table object was not shared, delete it.
+    if (table_handle_set_ && cinfo_.resource_is_private_to_kernel()) {
+      if (!cinfo_.resource_manager()
+               ->template Delete<ResourceType>(cinfo_.container(),
+                                               cinfo_.name())
+               .ok()) {
+        // Do nothing; the resource may have been deleted by session resets.
+      }
+      // Attempt to delete other resource aliases.
+      Status dummy_status;
+      int dummy[sizeof...(AliasesToRegister)] = {
+          (dummy_status.Update(DeleteAlias<AliasesToRegister>()), 0)...};
+      (void)dummy;
+    }
+  }
+
+ private:
+  using ResourceType = typename Functor::resource_type;
+  template <typename T>
+  Status RegisterAlias(ResourceType* resource) {
+    auto creator = [resource](T** ret) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+      *ret = resource;
+      return Status::OK();
+    };
+
+    T* alias_resource = nullptr;
+    core::ScopedUnref unref_me(alias_resource);
+    return cinfo_.resource_manager()->template LookupOrCreate<T, true>(
+        cinfo_.container(), cinfo_.name(), &alias_resource, creator);
+  }
+
+  template <typename T>
+  Status DeleteAlias() {
+    return cinfo_.resource_manager()->template Delete<T>(cinfo_.container(),
+                                                         cinfo_.name());
+  }
+
+  mutex mu_;
+  bool table_handle_set_ GUARDED_BY(mu_);
+  ContainerInfo cinfo_;
+  bool use_node_name_sharing_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(ResourceConstructionOp);
+};
+
+// Create resources of type ContainerBase using the static method
+// Functor::AllocateContainer(OpKernelConstruction*, OpKernel*,
+// FallbackTableBaseType*, ContainerBase**)
+// If the resource has already been created it will be looked up.
+// Container must decrease the reference count of the FallbackTableBaseType*
+// constructor argument before its destructor completes.
+template <typename Functor, typename... AliasesToRegister>
+class TableWithFallbackConstructionOp : public OpKernel {
+ public:
+  explicit TableWithFallbackConstructionOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx), table_handle_set_(false) {
+    OP_REQUIRES_OK(
+        ctx, ctx->GetAttr("use_node_name_sharing", &use_node_name_sharing_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    OpInputList table_int64_args;
+    OP_REQUIRES_OK(ctx, ctx->input_list("table_int64_args", &table_int64_args));
+    if (ctx->num_inputs() == table_int64_args.size()) {
+      ctx->SetStatus(errors::InvalidArgument(
+          "Expected op to have a resource input after the table_int64_args "
+          "input but no such input found."));
+      return;
+    }
+
+    // Look up the fallback table.
+    FallbackTableBaseType* fallback_table = nullptr;
+    {
+      const Tensor& table_handle = ctx->input(table_int64_args.size());
+      ResourceHandle handle(table_handle.scalar<ResourceHandle>()());
+      OP_REQUIRES_OK(
+          ctx, ctx->resource_manager()->Lookup<FallbackTableBaseType, true>(
+                   handle.container(), handle.name(), &fallback_table));
+    }
+    mutex_lock l(mu_);
+
+    if (!table_handle_set_) {
+      OP_REQUIRES_OK(ctx, cinfo_.Init(ctx->resource_manager(), def(),
+                                      use_node_name_sharing_));
+    }
+
+    auto creator = [ctx, this, fallback_table](
+                       ResourceType** ret) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+      // container construction logic can't be merged with
+      // ResourceConstructionOp because Container constructor requires an
+      // input which can only be constructed if the resource manager
+      // internal lock is not already held.
+      ResourceType* resource = nullptr;
+      auto status =
+          Functor::AllocateContainer(ctx, this, fallback_table, &resource);
+      if (ABSL_PREDICT_FALSE(!status.ok())) {
+        // Ideally resource is non-null only if status is OK but we try
+        // to compensate here.
+        if (resource != nullptr) {
+          resource->Unref();
+        }
+        return status;
+      }
+      if (ctx->track_allocations()) {
+        ctx->record_persistent_memory_allocation(resource->MemoryUsed());
+      }
+      *ret = resource;
+      return Status::OK();
+    };
+
+    // Register the ResourceType alias.
+    ResourceType* table = nullptr;
+    core::ScopedUnref unref_me(table);
+    OP_REQUIRES_OK(
+        ctx,
+        cinfo_.resource_manager()->template LookupOrCreate<ResourceType, true>(
+            cinfo_.container(), cinfo_.name(), &table, creator));
+
+    // Put a handle to resource in the output tensor (the other aliases will
+    // have the same handle).
+    Tensor* handle;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &handle));
+    handle->scalar<ResourceHandle>()() = MakeResourceHandle<ResourceType>(
+        ctx, cinfo_.container(), cinfo_.name());
+    table_handle_set_ = true;
+
+    // Create other alias resources.
+    Status status;
+    int dummy[sizeof...(AliasesToRegister)] = {
+        (status.Update(RegisterAlias<AliasesToRegister>(table)), 0)...};
+    (void)dummy;
+    OP_REQUIRES_OK(ctx, status);
+  }
+
+  ~TableWithFallbackConstructionOp() override {
+    // If the table object was not shared, delete it.
+    if (table_handle_set_ && cinfo_.resource_is_private_to_kernel()) {
+      if (!cinfo_.resource_manager()
+               ->template Delete<ResourceType>(cinfo_.container(),
+                                               cinfo_.name())
+               .ok()) {
+        // Do nothing; the resource may have been deleted by session resets.
+      }
+      // Attempt to delete other resource aliases.
+      Status dummy_status;
+      int dummy[sizeof...(AliasesToRegister)] = {
+          (dummy_status.Update(DeleteAlias<AliasesToRegister>()), 0)...};
+      (void)dummy;
+    }
+  }
+
+ private:
+  using ResourceType = typename Functor::resource_type;
+  using FallbackTableBaseType = typename Functor::fallback_table_type;
+
+  template <typename T>
+  Status RegisterAlias(ResourceType* resource) {
+    auto creator = [resource](T** ret) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+      *ret = resource;
+      return Status::OK();
+    };
+
+    T* alias_resource = nullptr;
+    core::ScopedUnref unref_me(alias_resource);
+    return cinfo_.resource_manager()->template LookupOrCreate<T, true>(
+        cinfo_.container(), cinfo_.name(), &alias_resource, creator);
+  }
+
+  template <typename T>
+  Status DeleteAlias() {
+    return cinfo_.resource_manager()->template Delete<T>(cinfo_.container(),
+                                                         cinfo_.name());
+  }
+
+  mutex mu_;
+  bool table_handle_set_ GUARDED_BY(mu_);
+  ContainerInfo cinfo_;
+  bool use_node_name_sharing_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(TableWithFallbackConstructionOp);
+};
+
+// Lookup a table of type ResourceAlias and insert the passed in keys and
+// values tensors using Functor::TensorInsert(keys, values, table).
+template <typename Functor,
+          typename ResourceAlias = typename Functor::resource_type>
+class LookupTableInsertOp : public OpKernel {
+ public:
+  explicit LookupTableInsertOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    OpInputList table_int64_args;
+    OP_REQUIRES_OK(ctx, ctx->input_list("table_int64_args", &table_int64_args));
+    const size_t tensor_index_offset = table_int64_args.size();
+    // Business logic for checking tensor shapes, etc, is delegated to the
+    // Functor.
+    const Tensor& keys = ctx->input(tensor_index_offset + 1);
+    const Tensor& values = ctx->input(tensor_index_offset + 2);
+
+    const Tensor& table_handle = ctx->input(tensor_index_offset);
+    ResourceHandle handle(table_handle.scalar<ResourceHandle>()());
+    ResourceAlias* table;
+    core::ScopedUnref unref_me(table);
+    OP_REQUIRES_OK(ctx, ctx->resource_manager()->Lookup<ResourceAlias, true>(
+                            handle.container(), handle.name(), &table));
+
+    int memory_used_before = 0;
+    if (ctx->track_allocations()) {
+      memory_used_before = table->MemoryUsed();
+    }
+    auto* mutex = table->GetMutex();
+    if (mutex != nullptr) {
+      mutex_lock lock(*mutex);
+      OP_REQUIRES_OK(ctx, Functor::TensorInsert(keys, values, table));
+    } else {
+      OP_REQUIRES_OK(ctx, Functor::TensorInsert(keys, values, table));
+    }
+    if (ctx->track_allocations()) {
+      ctx->record_persistent_memory_allocation(table->MemoryUsed() -
+                                               memory_used_before);
+    }
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(LookupTableInsertOp);
+};
+
+// Lookup a table of type ResourceAlias and look up the passed in keys using
+// Functor::TensorLookup(
+//     table, keys, prefetch_lookahead, num_keys_per_thread, threadpool, out).
+template <typename Functor,
+          typename ResourceAlias = typename Functor::resource_type>
+class LookupTableFindOp : public OpKernel {
+ public:
+  explicit LookupTableFindOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    OpInputList table_int64_args;
+    {
+      auto status = ctx->input_list("table_int64_args", &table_int64_args);
+      if (ABSL_PREDICT_FALSE(!status.ok())) {
+        ctx->SetStatus(status);
+        return;
+      }
+    }
+    // We lookup tensors using positional indices because that's more
+    // efficient than looking up their string names.
+    const Tensor& prefetch_lookahead_t = ctx->input(0);
+    const size_t tensor_index_offset = table_int64_args.size();
+    const Tensor& keys = ctx->input(tensor_index_offset + 1);
+    const Tensor& num_threads = ctx->input(tensor_index_offset + 2);
+
+    TensorShape output_shape = keys.shape();
+    Tensor* out;
+    {
+      auto status = ctx->allocate_output(0, output_shape, &out);
+      if (ABSL_PREDICT_FALSE(!status.ok())) {
+        ctx->SetStatus(status);
+        return;
+      }
+    }
+
+    int64 num_threads_scalar;
+    if (TensorShapeUtils::IsScalar(num_threads.shape())) {
+      num_threads_scalar = num_threads.template scalar<int64>()();
+    } else {
+      // Scans through rows of num_threads and returns second entry of first
+      // row whose first entry is <= the number of keys to process.
+      // This allows the user to control parallelism as a function of
+      // the number of keys to lookup.
+      num_threads_scalar = tensor_flag_utils::FindConfigValueForKey<int64, int>(
+          num_threads.template matrix<int64>(), keys.dim_size(0));
+    }
+    const int64 num_keys_per_thread =
+        num_threads_scalar > 0
+            ? std::max(1ll, keys.dim_size(0) / num_threads_scalar)
+            : keys.dim_size(0);
+
+    const int64 prefetch_lookahead = prefetch_lookahead_t.scalar<int64>()();
+
+    const Tensor& table_handle = ctx->input(tensor_index_offset);
+    ResourceHandle handle(table_handle.scalar<ResourceHandle>()());
+    ResourceAlias* table;
+    core::ScopedUnref unref_me(table);
+    OP_REQUIRES_OK(ctx, ctx->resource_manager()->Lookup<ResourceAlias, true>(
+                            handle.container(), handle.name(), &table));
+
+    auto* mutex = table->GetMutex();
+    auto* threadpool = ctx->device()->tensorflow_cpu_worker_threads()->workers;
+    if (mutex != nullptr) {
+      // There are many subtle problems with using reader locks so we opt for a
+      // writer lock here.
+      mutex_lock lock(*mutex);
+      OP_REQUIRES_OK(
+          ctx, Functor::TensorLookup(*table, keys, prefetch_lookahead,
+                                     num_keys_per_thread, threadpool, out));
+    } else {
+      OP_REQUIRES_OK(
+          ctx, Functor::TensorLookup(*table, keys, prefetch_lookahead,
+                                     num_keys_per_thread, threadpool, out));
+    }
+  }
+};
+
+// Lookup a container of type ResourceAlias and return its size using
+// Functor::Size(container, &size).
+template <typename Functor,
+          typename ResourceAlias = typename Functor::resource_type>
+class ContainerSizeOp : public OpKernel {
+ public:
+  explicit ContainerSizeOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& container_handle = ctx->input(0);
+    ResourceHandle handle(container_handle.scalar<ResourceHandle>()());
+    ResourceAlias* container;
+    core::ScopedUnref unref_me(container);
+    OP_REQUIRES_OK(ctx, ctx->resource_manager()->Lookup<ResourceAlias, true>(
+                            handle.container(), handle.name(), &container));
+
+    Tensor* out;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &out));
+
+    auto* mutex = container->GetMutex();
+    if (mutex != nullptr) {
+      tf_shared_lock lock(*mutex);
+      OP_REQUIRES_OK(ctx, Functor::Size(*container, &out->scalar<uint64>()()));
+    } else {
+      OP_REQUIRES_OK(ctx, Functor::Size(*container, &out->scalar<uint64>()()));
+    }
+  }
+};
+
+}  // namespace tables
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_LOOKUP_TABLES_OP_KERNEL_TEMPLATES_H_
diff --git a/tensorflow/core/kernels/lookup_tables/resource_interface_templates.h b/tensorflow/core/kernels/lookup_tables/resource_interface_templates.h
new file mode 100644
index 0000000000000000000000000000000000000000..7331fb400a4734db19a262503dffa38fb0f71466
--- /dev/null
+++ b/tensorflow/core/kernels/lookup_tables/resource_interface_templates.h
@@ -0,0 +1,99 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_LOOKUP_TABLES_RESOURCE_INTERFACE_TEMPLATES_H_
+#define TENSORFLOW_CORE_KERNELS_LOOKUP_TABLES_RESOURCE_INTERFACE_TEMPLATES_H_
+
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+namespace tables {
+
+// Interface for resources with mutable state.
+class SynchronizedInterface : public virtual ResourceBase {
+ public:
+  // Return value should be used to synchronize read/write access to
+  // all public methods. If null, no synchronization is needed.
+  virtual mutex* GetMutex() const = 0;
+};
+
+// Interface for containers which support batch lookups.
+template <typename ValueType, typename... KeyContext>
+class InsertOrAssignInterface : public virtual SynchronizedInterface {
+ public:
+  using value_type = ValueType;
+
+  // Stores each KV pair {keys[i], values[i]} in the underlying map, overriding
+  // pre-existing pairs which have equivalent keys.
+  // keys and values should have the same size.
+  virtual Status InsertOrAssign(KeyContext... key_context,
+                                ValueType values) = 0;
+};
+
+// Interface for containers which support lookups.
+template <typename ValueType, typename... KeyContext>
+class LookupInterface : public virtual SynchronizedInterface {
+ public:
+  using value_type = ValueType;
+
+  // Lookup the values for keys and store them in values.
+  // prefetch_lookahead is used to prefetch the key at index
+  // i + prefetch_lookahead at the ith iteration of the implemented loop.
+  // keys and values must have the same size.
+  virtual Status Lookup(KeyContext... key_context, ValueType values) const = 0;
+};
+
+// Interface for containers which support lookups with prefetching.
+template <typename ValueType, typename... KeyContext>
+class LookupWithPrefetchInterface : public virtual SynchronizedInterface {
+ public:
+  using value_type = ValueType;
+
+  // Lookup the values for keys and store them in values.
+  // prefetch_lookahead is used to prefetch the key at index
+  // i + prefetch_lookahead at the ith iteration of the implemented loop.
+  // keys and values must have the same size.
+  virtual Status Lookup(KeyContext... key_context, ValueType values,
+                        int64 prefetch_lookahead) const = 0;
+};
+
+// Interface for containers with size concepts.
+// Implementations must guarantee thread-safety when GetMutex is used to
+// synchronize method access.
+class SizeInterface : public virtual SynchronizedInterface {
+ public:
+  // Returns the number of elements in the container.
+  virtual uint64 Size() const = 0;
+};
+
+// Interface for tables which can be initialized from key and value arguments.
+template <typename ValueType, typename... KeyContext>
+class KeyValueTableInitializerInterface : public virtual SynchronizedInterface {
+ public:
+  using value_type = ValueType;
+
+  // Lookup the values for keys and store them in values.
+  // prefetch_lookahead is used to prefetch the key at index
+  // i + prefetch_lookahead at the ith iteration of the implemented loop.
+  // keys and values must have the same size.
+  virtual Status Initialize(KeyContext... key_context, ValueType values) = 0;
+};
+
+}  // namespace tables
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_LOOKUP_TABLES_RESOURCE_INTERFACE_TEMPLATES_H_
diff --git a/tensorflow/core/kernels/lookup_util.cc b/tensorflow/core/kernels/lookup_util.cc
index 30fe4b077a368fe7c272e3ea570100923b104c75..c3b80f04ed2e3dfe71550bfd6ccf87595343b1c1 100644
--- a/tensorflow/core/kernels/lookup_util.cc
+++ b/tensorflow/core/kernels/lookup_util.cc
@@ -163,7 +163,7 @@ class TextFileLineIterator
 
   int64 total_size() const override {
     if (vocab_size_ == -1) {
-      int64 new_size;
+      int64 new_size = -1;
       Status status = GetNumLinesInTextFile(env_, filename_, &new_size);
       if (!status.ok()) {
         LOG(WARNING) << "Unable to get line count: " << status;
diff --git a/tensorflow/core/kernels/lrn_op.cc b/tensorflow/core/kernels/lrn_op.cc
index f405ca3c58cfffc8422dcdd65e66c7fd12784519..ba30432e21a12d66c69217bec0c75660a0ae83ec 100644
--- a/tensorflow/core/kernels/lrn_op.cc
+++ b/tensorflow/core/kernels/lrn_op.cc
@@ -19,10 +19,10 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 
diff --git a/tensorflow/core/kernels/lrn_op_test.cc b/tensorflow/core/kernels/lrn_op_test.cc
index 5d8c5c21ca21f097cb5030b43e288765ae384eaf..496c697ac3fbbc4c06a4c24f9521eba3c0cfeb23 100644
--- a/tensorflow/core/kernels/lrn_op_test.cc
+++ b/tensorflow/core/kernels/lrn_op_test.cc
@@ -102,7 +102,7 @@ TEST_F(LRNFloatTest, Depth96) {
                    .Finalize(node_def()));
   TF_ASSERT_OK(InitOp());
   AddInput<float>(TensorShape({1, 1, 1, 96}),
-                  [this](int i) -> float { return i + 1; });
+                  [](int i) -> float { return i + 1; });
   TF_ASSERT_OK(RunOpKernel());
   auto actual = GetOutput(0)->tensor<float, 4>();
 
@@ -138,7 +138,7 @@ TEST_F(LRNFloatTest, Depth16) {
                    .Finalize(node_def()));
   TF_ASSERT_OK(InitOp());
   AddInput<float>(TensorShape({1, 1, 1, 16}),
-                  [this](int i) -> float { return i + 1; });
+                  [](int i) -> float { return i + 1; });
   TF_ASSERT_OK(RunOpKernel());
   auto actual = GetOutput(0)->tensor<float, 4>();
 
diff --git a/tensorflow/core/kernels/map_stage_op.cc b/tensorflow/core/kernels/map_stage_op.cc
index dd89597369bce0dcfd8ae8ad7e2bfc47d8ae2817..27a8696e54647e14eda209c36b7b49c1d171d3bc 100644
--- a/tensorflow/core/kernels/map_stage_op.cc
+++ b/tensorflow/core/kernels/map_stage_op.cc
@@ -480,7 +480,7 @@ class StagingMap : public ResourceBase {
     return map_.size();
   }
 
-  string DebugString() override { return "StagingMap"; }
+  string DebugString() const override { return "StagingMap"; }
 };
 
 template <bool Ordered>
diff --git a/tensorflow/core/kernels/matrix_solve_op.cc b/tensorflow/core/kernels/matrix_solve_op.cc
index 169f3dae76d2fb6d0515d22648a9047657af0032..f3919a16aa50694fa5e05eb2cc421f1dd3f378a1 100644
--- a/tensorflow/core/kernels/matrix_solve_op.cc
+++ b/tensorflow/core/kernels/matrix_solve_op.cc
@@ -214,9 +214,12 @@ class MatrixSolveOpGpu : public AsyncOpKernel {
     auto input_copy_ptrs = solver->GetScratchSpace<uint8>(
         sizeof(Scalar*) * batch_size, "input_copt_ptrs",
         /* on_host */ true);
-    if (n / batch_size <= 128) {
-      // For small matrices or large batch sizes, we use the batched
-      // interface from cuBlas.
+    const int kMaxMatrixSizeToBatchSizeRatio = 128;
+    const bool use_batched_solver =
+        n <= kMaxMatrixSizeToBatchSizeRatio * batch_size;
+    if (use_batched_solver) {
+      // For small matrices or large batch sizes, we use the batched interface
+      // from cuBlas.
       const Scalar** input_copy_ptrs_base =
           reinterpret_cast<const Scalar**>(input_copy_ptrs.mutable_data());
       for (int batch = 0; batch < batch_size; ++batch) {
@@ -230,8 +233,8 @@ class MatrixSolveOpGpu : public AsyncOpKernel {
                                &dev_info.back(), batch_size),
           done);
     } else {
-      // For small batch sizes we use the non-batched interface from cuSolver,
-      // which is much faster for large matrices.
+      // For small batch sizes or large matrices, we use the non-batched
+      // interface from cuSolver, which is much faster for large matrices.
       dev_info.push_back(solver->GetDeviceLapackInfo(batch_size, "getrf"));
       for (int batch = 0; batch < batch_size; ++batch) {
         OP_REQUIRES_OK_ASYNC(
@@ -279,11 +282,7 @@ class MatrixSolveOpGpu : public AsyncOpKernel {
         /* on_host */ true);
     auto transposed_rhs_reshaped =
         transposed_rhs.template flat_inner_dims<Scalar, 3>();
-    // TODO(rmlarsen): Enable the following branch when I figure
-    // out why it causes a segfault.
-    if (false && n / batch_size <= 128) {
-      dev_info.push_back(
-          solver->GetDeviceLapackInfo(batch_size, "GetrsBatched"));
+    if (use_batched_solver) {
       const Scalar** input_copy_ptrs_base =
           reinterpret_cast<const Scalar**>(input_copy_ptr_array.mutable_data());
       const Scalar** transposed_rhs_ptrs_base =
@@ -293,13 +292,20 @@ class MatrixSolveOpGpu : public AsyncOpKernel {
         input_copy_ptrs_base[batch] = &input_copy_reshaped(batch, 0, 0);
         transposed_rhs_ptrs_base[batch] = &transposed_rhs_reshaped(batch, 0, 0);
       }
+      int host_info = 0;
       OP_REQUIRES_OK_ASYNC(
           context,
           solver->GetrsBatched(adjoint_ ? CUBLAS_OP_C : CUBLAS_OP_T, n, nrhs,
                                input_copy_ptrs_base, n, pivots_mat.data(),
-                               transposed_rhs_ptrs_base, n, &dev_info.back(),
+                               transposed_rhs_ptrs_base, n, &host_info,
                                batch_size),
           done);
+      OP_REQUIRES_ASYNC(
+          context, host_info == 0,
+          errors::InvalidArgument("The ", -host_info,
+                                  "'th argument to cublas*getrsBatched had "
+                                  "an illegal value."),
+          done);
     } else {
       dev_info.push_back(solver->GetDeviceLapackInfo(batch_size, "getrs"));
       for (int batch = 0; batch < batch_size; ++batch) {
diff --git a/tensorflow/core/kernels/maxpooling_op.cc b/tensorflow/core/kernels/maxpooling_op.cc
index 507fc9983776d2fd54ca66cc70aa7695886b4b5e..d24cb1cc92d59ad100ffec20262fc69888fa770c 100644
--- a/tensorflow/core/kernels/maxpooling_op.cc
+++ b/tensorflow/core/kernels/maxpooling_op.cc
@@ -22,13 +22,13 @@ limitations under the License.
 #include <vector>
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/conv_2d.h"
 #include "tensorflow/core/kernels/eigen_pooling.h"
 #include "tensorflow/core/kernels/ops_util.h"
@@ -41,6 +41,7 @@ limitations under the License.
 #include "tensorflow/core/util/use_cudnn.h"
 
 #if GOOGLE_CUDA
+#include "cuda/include/cudnn.h"
 #include "tensorflow/core/kernels/maxpooling_op_gpu.h"
 #include "tensorflow/core/kernels/pooling_ops_common_gpu.h"
 #include "tensorflow/core/platform/stream_executor.h"
@@ -1134,11 +1135,18 @@ class MaxPoolingNoMaskOp<GPUDevice, T> : public OpKernel {
                 errors::InvalidArgument(
                     "qint8 should be used with data_format NCHW_VECT_C."));
 
+#if CUDNN_VERSION >= 7300
+    if (use_dnn_) {
+      DnnPoolingOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, ksize_,
+                               stride_, padding_, data_format_, tensor_in,
+                               out_shape, propagate_nans_);
+#else
     // These is_int8x4 checks avoid linker errors for missing qint8 kernels.
     if (!is_int8x4 && use_dnn_ && data_format_ == FORMAT_NCHW) {
       DnnPoolingOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, ksize_,
                                stride_, padding_, data_format_, tensor_in,
                                out_shape, propagate_nans_);
+#endif
     } else {
       Tensor* output = nullptr;
       OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
index 56d0340547a891fe4929bd6a36a72c5e03d1d1e0..f28811ffa4d740e6733b33189a0228bea2428b19 100644
--- a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
@@ -390,7 +390,6 @@ bool MaxPoolForwardNoMask_NCHW_VECT_C::operator()(
       0, d.stream()>>>(output_size, bottom_data, height, width, channels,
                        pooled_height, pooled_width, kernel_h, kernel_w,
                        stride_h, stride_w, pad_t, pad_l, top_data);
-  d.synchronize();
   return d.ok();
 }
 
diff --git a/tensorflow/core/kernels/meta_support.cc b/tensorflow/core/kernels/meta_support.cc
index 39e60c9fcef174a4f9e2271600ed847f4e769625..44f2997e182a912476aeab86f1158845b5f1118e 100644
--- a/tensorflow/core/kernels/meta_support.cc
+++ b/tensorflow/core/kernels/meta_support.cc
@@ -54,7 +54,7 @@ class Scratch : public ResourceBase {
 
   uint8_t* buffer() { return scratch_32_aligned_; }
 
-  string DebugString() { return "MetaGemmScratchResource"; }
+  string DebugString() const override { return "MetaGemmScratchResource"; }
 
  private:
   std::unique_ptr<uint8_t> scratch_;
diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc
index f0278caee6b95269b77185d409de67a7441c5ff3..d8fbb83940a08a1f3989bc3debb02585a32f3c01 100644
--- a/tensorflow/core/kernels/mkl_concat_op.cc
+++ b/tensorflow/core/kernels/mkl_concat_op.cc
@@ -18,19 +18,21 @@ limitations under the License.
 
 #include "mkldnn.hpp"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/concat_lib.h"
+#include "tensorflow/core/kernels/concat_lib_cpu.h"
+#include "tensorflow/core/kernels/quantization_utils.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/mkl_util.h"
 
 using mkldnn::concat;
 using mkldnn::stream;
-#include "tensorflow/core/util/mkl_util.h"
 
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
@@ -226,8 +228,50 @@ class MklConcatOp : public OpKernel {
       // format and avoid calling eigen version.
       if (!are_all_tf_inputs && !are_all_mkl_inputs) invoke_eigen = true;
 
+      OpInputList input_mins, input_maxes;
+      if (std::is_same<T, qint8>::value || std::is_same<T, quint8>::value) {
+        // MKL-DNN concat does not support input tensors that have different
+        // ranges. Check if the ranges of the all input tensors are the same.
+        // If not, forward it to Eigen implementation.
+
+        OP_REQUIRES_OK(context, context->input_list("input_mins", &input_mins));
+        OP_REQUIRES(context, (input_mins.size() == N),
+                    errors::InvalidArgument(
+                        "QuantizedConcatOp : Expected mins input list length ",
+                        input_mins.size(), " to equal values length ", N));
+
+        OP_REQUIRES_OK(context,
+                       context->input_list("input_maxes", &input_maxes));
+        OP_REQUIRES(context, (input_maxes.size() == N),
+                    errors::InvalidArgument(
+                        "QuantizedConcatOp : Expected maxes input list length ",
+                        input_maxes.size(), " to equal values length ", N));
+        float input_min = input_mins[0].flat<float>()(0);
+        float input_max = input_maxes[0].flat<float>()(0);
+        const float eps = 1.0e-6;
+        for (int i = 1; i < N; ++i) {
+          float min = input_mins[i].flat<float>()(0);
+          float max = input_maxes[i].flat<float>()(0);
+
+          if (fabs(input_min - min) > eps || fabs(input_max - max) > eps) {
+            invoke_eigen = true;
+            break;
+          }
+        }
+      }
+
       // Call Eigen library
       if (invoke_eigen) {
+        // MKL-DNN quantized concat does not support input tensors with
+        // different ranges.
+        // TODO (mabuzain): Add quantized version of CallEigen() to support
+        // this case.
+        OP_REQUIRES(
+            context,
+            (!std::is_same<T, qint8>::value && !std::is_same<T, quint8>::value),
+            errors::Unimplemented("MKL DNN quantized concat does not "
+                                  "support input tensors that have "
+                                  "different ranges"));
         CallEigenVersion(context, input_tensors, mkl_input_shapes);
         return;
       }
@@ -374,6 +418,23 @@ class MklConcatOp : public OpKernel {
       std::vector<primitive> net;
       net.push_back(concat_op);
       stream(stream::kind::eager).submit(net).wait();
+
+      // For quantized concat, min and max outputs are also computed.
+      if (std::is_same<T, qint8>::value || std::is_same<T, quint8>::value) {
+        Tensor* output_min = nullptr;
+        Tensor* output_max = nullptr;
+        MklDnnShape output_min_mkl_shape, output_max_mkl_shape;
+        output_min_mkl_shape.SetMklTensor(false);
+        output_max_mkl_shape.SetMklTensor(false);
+        AllocateOutputSetMklShape(context, 1, &output_min, {},
+                                  output_min_mkl_shape);
+        AllocateOutputSetMklShape(context, 2, &output_max, {},
+                                  output_max_mkl_shape);
+        // All input tensors should have the same range, just use the
+        // first one
+        output_min->flat<float>()(0) = input_mins[0].flat<float>()(0);
+        output_max->flat<float>()(0) = input_maxes[0].flat<float>()(0);
+      }
     } catch (mkldnn::error& e) {
       string error_msg = "Status: " + std::to_string(e.status) +
                          ", message: " + string(e.message) + ", in file " +
@@ -423,7 +484,7 @@ class MklConcatOp : public OpKernel {
         output_tensor->flat<uint8>().size() * sizeof(uint8));
   }
 
-  // This method finds the most commom format accross all MKL inputs
+  // This method finds the most commom format across all MKL inputs
   // Inputs:
   //   1. input_shapes: shapes of input (MKL) tensors.
   //   2. concat_dim: concat dimension.
@@ -490,6 +551,20 @@ class MklConcatOp : public OpKernel {
 
 TF_CALL_float(REGISTER_MKL_CPU);
 
+REGISTER_KERNEL_BUILDER(Name("_MklQuantizedConcatV2")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<quint8>("T")
+                            .HostMemory("axis")
+                            .Label(mkl_op_registry::kMklQuantizedOpLabel),
+                        MklConcatOp<CPUDevice, quint8, NAME_IS_AXIS>)
+
+REGISTER_KERNEL_BUILDER(Name("_MklQuantizedConcatV2")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<qint8>("T")
+                            .HostMemory("axis")
+                            .Label(mkl_op_registry::kMklQuantizedOpLabel),
+                        MklConcatOp<CPUDevice, qint8, NAME_IS_AXIS>)
+
 #undef REGISTER_CONCAT_MKL
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
index c1b182be4a4f755bc975563cb3767d7c0079fd7f..47b2a43ed9212f5a58cdaa07b15f8aec44ee7b0f 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
@@ -59,17 +59,20 @@ struct MklConvBwdFilterParams {
   memory::dims padding_right;
   padding_kind padding;
 
-  MklConvBwdFilterParams(memory::dims src_dims,
-    memory::dims diff_filter_dims, memory::dims diff_bias_dims,
-    memory::dims diff_dst_dims, memory::dims strides,
-    memory::dims dilations, memory::dims padding_left,
-    memory::dims padding_right, padding_kind padding) :
-      src_dims(src_dims), diff_filter_dims(diff_filter_dims),
-      diff_bias_dims(diff_bias_dims), diff_dst_dims(diff_dst_dims),
-      strides(strides), dilations(dilations),
-      padding_left(padding_left), padding_right(padding_right),
-      padding(padding) {
-  }
+  MklConvBwdFilterParams(memory::dims src_dims, memory::dims diff_filter_dims,
+                         memory::dims diff_bias_dims,
+                         memory::dims diff_dst_dims, memory::dims strides,
+                         memory::dims dilations, memory::dims padding_left,
+                         memory::dims padding_right, padding_kind padding)
+      : src_dims(src_dims),
+        diff_filter_dims(diff_filter_dims),
+        diff_bias_dims(diff_bias_dims),
+        diff_dst_dims(diff_dst_dims),
+        strides(strides),
+        dilations(dilations),
+        padding_left(padding_left),
+        padding_right(padding_right),
+        padding(padding) {}
 };
 
 template <typename T>
@@ -93,7 +96,7 @@ class MklConvBwdFilterPrimitive : public MklPrimitive {
   //   diff_bias_data:   output data buffer of diff_bias
   //   diff_dst_data:    input data buffer of diff_dst
   void Execute(const T* src_data, const T* diff_filter_data,
-      const T* diff_bias_data, const T* diff_dst_data) {
+               const T* diff_bias_data, const T* diff_dst_data) {
     context_.src_mem->set_data_handle(
         static_cast<void*>(const_cast<T*>(src_data)));
     context_.diff_filter_mem->set_data_handle(
@@ -116,8 +119,8 @@ class MklConvBwdFilterPrimitive : public MklPrimitive {
   //   src_data:         input data buffer of src
   //   diff_filter_data: output data buffer of diff_filter
   //   diff_dst_data:    input data buffer of diff_dst
-  void Execute(const T* src_data,
-      const T* diff_filter_data, const T* diff_dst_data) {
+  void Execute(const T* src_data, const T* diff_filter_data,
+               const T* diff_dst_data) {
     context_.src_mem->set_data_handle(
         static_cast<void*>(const_cast<T*>(src_data)));
     context_.diff_filter_mem->set_data_handle(
@@ -133,9 +136,7 @@ class MklConvBwdFilterPrimitive : public MklPrimitive {
     return;
   }
 
-  memory::format GetSrcMemoryFormat() const {
-    return context_.src_fmt;
-  }
+  memory::format GetSrcMemoryFormat() const { return context_.src_fmt; }
 
   memory::format GetDiffDstMemoryFormat() const {
     return context_.diff_dst_fmt;
@@ -185,37 +186,42 @@ class MklConvBwdFilterPrimitive : public MklPrimitive {
     std::shared_ptr<mkldnn::stream> bwd_filter_stream;
     std::vector<mkldnn::primitive> bwd_filter_primitives;
 
-    ConvBwdFilterContext() :
-        src_fmt(memory::format::any),
-        diff_dst_fmt(memory::format::any),
-        diff_filter_fmt(memory::format::any),
-        src_mem(nullptr), diff_filter_mem(nullptr),
-        diff_bias_mem(nullptr), diff_dst_mem(nullptr),
-        bwd_filter_desc(nullptr), fwd_desc(nullptr), fwd_pd(nullptr),
-        src_md(nullptr), diff_filter_md(nullptr),
-        diff_bias_md(nullptr), diff_dst_md(nullptr),
-        bwd_filter_stream(nullptr) {
-    }
+    ConvBwdFilterContext()
+        : src_fmt(memory::format::any),
+          diff_dst_fmt(memory::format::any),
+          diff_filter_fmt(memory::format::any),
+          src_mem(nullptr),
+          diff_filter_mem(nullptr),
+          diff_bias_mem(nullptr),
+          diff_dst_mem(nullptr),
+          bwd_filter_desc(nullptr),
+          fwd_desc(nullptr),
+          fwd_pd(nullptr),
+          src_md(nullptr),
+          diff_filter_md(nullptr),
+          diff_bias_md(nullptr),
+          diff_dst_md(nullptr),
+          bwd_filter_stream(nullptr) {}
   };
 
   // Setup Conv2d backward filter (weights) primitives.
   void Setup(const MklConvBwdFilterParams& convBwdFilterDims) {
     // create memory descriptors for convolution data w/ no specified format
-    context_.src_md.reset(new memory::desc({convBwdFilterDims.src_dims},
-        MklDnnType<T>(), memory::format::any));
+    context_.src_md.reset(new memory::desc(
+        {convBwdFilterDims.src_dims}, MklDnnType<T>(), memory::format::any));
 
-    context_.diff_dst_md.reset(new memory::desc(
-        {convBwdFilterDims.diff_dst_dims},
-        MklDnnType<T>(), memory::format::any));
+    context_.diff_dst_md.reset(
+        new memory::desc({convBwdFilterDims.diff_dst_dims}, MklDnnType<T>(),
+                         memory::format::any));
 
-    context_.diff_filter_md.reset(new memory::desc(
-        {convBwdFilterDims.diff_filter_dims},
-        MklDnnType<T>(), memory::format::any));
+    context_.diff_filter_md.reset(
+        new memory::desc({convBwdFilterDims.diff_filter_dims}, MklDnnType<T>(),
+                         memory::format::any));
 
     if (!convBwdFilterDims.diff_bias_dims.empty())
-      context_.diff_bias_md.reset(new memory::desc(
-          {convBwdFilterDims.diff_bias_dims},
-          MklDnnType<T>(), memory::format::x));
+      context_.diff_bias_md.reset(
+          new memory::desc({convBwdFilterDims.diff_bias_dims}, MklDnnType<T>(),
+                           memory::format::x));
 
     // create a convolution
     if (!convBwdFilterDims.diff_bias_dims.empty()) {
@@ -226,8 +232,7 @@ class MklConvBwdFilterPrimitive : public MklPrimitive {
           convBwdFilterDims.padding_left, convBwdFilterDims.padding_right,
           convBwdFilterDims.padding));
     } else {
-      context_.bwd_filter_desc.reset(
-          new convolution_backward_weights::desc(
+      context_.bwd_filter_desc.reset(new convolution_backward_weights::desc(
           convolution_direct, *context_.src_md, *context_.diff_filter_md,
           *context_.diff_dst_md, convBwdFilterDims.strides,
           convBwdFilterDims.dilations, convBwdFilterDims.padding_left,
@@ -236,18 +241,18 @@ class MklConvBwdFilterPrimitive : public MklPrimitive {
 
     // create fwd primitive_desc
     context_.fwd_desc.reset(new convolution_forward::desc(
-        prop_kind::forward, convolution_direct,
-        *context_.src_md, *context_.diff_filter_md, *context_.diff_dst_md,
-        convBwdFilterDims.strides,
-        convBwdFilterDims.dilations, convBwdFilterDims.padding_left,
-        convBwdFilterDims.padding_right, convBwdFilterDims.padding));
+        prop_kind::forward, convolution_direct, *context_.src_md,
+        *context_.diff_filter_md, *context_.diff_dst_md,
+        convBwdFilterDims.strides, convBwdFilterDims.dilations,
+        convBwdFilterDims.padding_left, convBwdFilterDims.padding_right,
+        convBwdFilterDims.padding));
     context_.fwd_pd.reset(new convolution_forward::primitive_desc(
         *context_.fwd_desc, cpu_engine_));
 
     // create backward conv primitive_desc
     context_.bwd_filter_pd.reset(
         new convolution_backward_weights::primitive_desc(
-        *context_.bwd_filter_desc, cpu_engine_, *context_.fwd_pd));
+            *context_.bwd_filter_desc, cpu_engine_, *context_.fwd_pd));
 
     // store the expected memory format
     auto bwd_filter_pd = context_.bwd_filter_pd.get();
@@ -259,25 +264,28 @@ class MklConvBwdFilterPrimitive : public MklPrimitive {
         bwd_filter_pd->diff_dst_primitive_desc().desc().data.format);
 
     // create memory primitive based on dummy data
-    context_.src_mem.reset(new memory(
-        bwd_filter_pd->src_primitive_desc(), DummyData));
-    context_.diff_filter_mem.reset(new memory(
-        bwd_filter_pd->diff_weights_primitive_desc(), DummyData));
-    context_.diff_dst_mem.reset(new memory(
-        bwd_filter_pd->diff_dst_primitive_desc(), DummyData));
+    context_.src_mem.reset(
+        new memory(bwd_filter_pd->src_primitive_desc(), DummyData));
+    context_.diff_filter_mem.reset(
+        new memory(bwd_filter_pd->diff_weights_primitive_desc(), DummyData));
+    context_.diff_dst_mem.reset(
+        new memory(bwd_filter_pd->diff_dst_primitive_desc(), DummyData));
 
     // create convolution primitive and add it to net
     if (!convBwdFilterDims.diff_bias_dims.empty()) {
-      context_.diff_bias_mem.reset(new memory(
-          {{{convBwdFilterDims.diff_bias_dims}, MklDnnType<T>(),
-          memory::format::x}, cpu_engine_}, DummyData));
+      context_.diff_bias_mem.reset(
+          new memory({{{convBwdFilterDims.diff_bias_dims},
+                       MklDnnType<T>(),
+                       memory::format::x},
+                      cpu_engine_},
+                     DummyData));
       context_.conv_bwd_filter.reset(new convolution_backward_weights(
           *context_.bwd_filter_pd, *context_.src_mem, *context_.diff_dst_mem,
           *context_.diff_filter_mem, *context_.diff_bias_mem));
     } else {
       context_.conv_bwd_filter.reset(new convolution_backward_weights(
-          *context_.bwd_filter_pd, *context_.src_mem,
-          *context_.diff_dst_mem, *context_.diff_filter_mem));
+          *context_.bwd_filter_pd, *context_.src_mem, *context_.diff_dst_mem,
+          *context_.diff_filter_mem));
     }
 
     context_.bwd_filter_primitives.push_back(*context_.conv_bwd_filter);
@@ -298,13 +306,13 @@ class MklConvBwdFilterPrimitiveFactory : public MklPrimitiveFactory<T> {
       conv_bwd_filter = new MklConvBwdFilterPrimitive<T>(convBwdFilterDims);
     } else {
       // look into the pool for reusable primitive
-      conv_bwd_filter = dynamic_cast<MklConvBwdFilterPrimitive<T>*> (
-        MklConvBwdFilterPrimitiveFactory<T>::GetInstance().GetConvBwdFilter(
-            convBwdFilterDims));
+      conv_bwd_filter = dynamic_cast<MklConvBwdFilterPrimitive<T>*>(
+          MklConvBwdFilterPrimitiveFactory<T>::GetInstance().GetConvBwdFilter(
+              convBwdFilterDims));
 
-     if (conv_bwd_filter == nullptr) {
-       conv_bwd_filter = new MklConvBwdFilterPrimitive<T>(convBwdFilterDims);
-       MklConvBwdFilterPrimitiveFactory<T>::GetInstance().SetConvBwdFilter(
+      if (conv_bwd_filter == nullptr) {
+        conv_bwd_filter = new MklConvBwdFilterPrimitive<T>(convBwdFilterDims);
+        MklConvBwdFilterPrimitiveFactory<T>::GetInstance().SetConvBwdFilter(
             convBwdFilterDims, conv_bwd_filter);
       }
     }
@@ -349,12 +357,12 @@ class MklConvBwdFilterPrimitiveFactory : public MklPrimitiveFactory<T> {
   }
 };
 
-template <typename Device, class T, bool biasEnabled>
+template <typename Device, class T, bool bias_enabled, bool is_depthwise>
 class MklConvCustomBackpropFilterOp
-    : public MklConvBackpropCommonOp<Device, T> {
+    : public MklConvBackpropCommonOp<Device, T, is_depthwise> {
  public:
   explicit MklConvCustomBackpropFilterOp(OpKernelConstruction* context)
-      : MklConvBackpropCommonOp<Device, T>(context) {}
+      : MklConvBackpropCommonOp<Device, T, is_depthwise>(context) {}
 
   ~MklConvCustomBackpropFilterOp() {}
 
@@ -365,7 +373,7 @@ class MklConvCustomBackpropFilterOp
       MklDnnData<T> diff_filter(&cpu_engine_);  // output
 
       // This flag indicates Conv2D or Conv3D
-      bool isConv2D = (this->strides_.size() == 4);
+      bool is_conv2d = (this->strides_.size() == 4);
 
       // Input tensors
       const int kInputIdx = 0, kFilterIdx = 1, kOutbpropIdx = 2;
@@ -396,8 +404,8 @@ class MklConvCustomBackpropFilterOp
           diff_dst_tf_shape.num_elements() == 0) {
         MklDnnShape diff_filter_mkl_shape;
         diff_filter_mkl_shape.SetMklTensor(false);
-        TensorShape diff_filter_tf_shape = GetOutputTfShape(
-            src_tf_shape, filter_tf_shape, diff_dst_tf_shape);
+        TensorShape diff_filter_tf_shape =
+            GetOutputTfShape(src_tf_shape, filter_tf_shape, diff_dst_tf_shape);
         const int kOutputIdx = 0;
         AllocateOutputSetMklShape(context, kOutputIdx, &diff_filter_tensor,
                                   diff_filter_tf_shape, diff_filter_mkl_shape);
@@ -414,20 +422,20 @@ class MklConvCustomBackpropFilterOp
       // By default, all dims are in MKL order. Only dims in TF order
       // are those with prefix tf_order.
       memory::dims diff_dst_dims, fwd_src_dims, fwd_filter_dims;
-      memory::dims padding_left, padding_right, dilations,
-          strides, fwd_dst_dims;
+      memory::dims padding_left, padding_right, dilations, strides,
+          fwd_dst_dims;
       memory::dims fwd_dst_dims_tf_order;
 
       // Get forward convolution parameters.
       MklDnnConvUtil conv_utl(context, this->strides_, this->padding_,
-          this->data_format_, this->dilations_);
+                              this->data_format_, this->dilations_);
       conv_utl.GetConvFwdSizesInMklOrder(
           src_tf_shape, filter_tf_shape, &fwd_src_dims, &fwd_filter_dims,
-          &strides, &dilations, &fwd_dst_dims_tf_order,
-          &fwd_dst_dims, &padding_left, &padding_right);
+          &strides, &dilations, &fwd_dst_dims_tf_order, &fwd_dst_dims,
+          &padding_left, &padding_right, false, is_depthwise);
       if (!context->status().ok()) return;
 
-      auto tf_fmt = isConv2D
+      auto tf_fmt = is_conv2d
                         ? TFDataFormatToMklDnnDataFormat(this->data_format_)
                         : TFDataFormatToMklDnn3DDataFormat(this->data_format_);
 
@@ -439,26 +447,27 @@ class MklConvCustomBackpropFilterOp
       conv_utl.GetInputSizeInMklOrder(diff_dst_tf_shape, &diff_dst_dims);
       if (!context->status().ok()) return;
 
-      auto diff_dst_md = diff_dst_mkl_shape.IsMklTensor()
-                       ? diff_dst_mkl_shape.GetMklLayout()
-                       : memory::desc(diff_dst_dims,
-                           MklDnnType<T>(), tf_fmt);
+      auto diff_dst_md =
+          diff_dst_mkl_shape.IsMklTensor()
+              ? diff_dst_mkl_shape.GetMklLayout()
+              : memory::desc(diff_dst_dims, MklDnnType<T>(), tf_fmt);
 
       memory::dims diff_bias_dims = {};
       int64 depth = 0;
-      if (biasEnabled) {
+      if (bias_enabled) {
         TensorShape obp_tf_shape = GetTfShape(context, 2);
         depth = (this->data_format_ == FORMAT_NCHW)
                     ? obp_tf_shape.dim_size(1)
-                    : obp_tf_shape.dim_size(isConv2D ? 3 : 4);
+                    : obp_tf_shape.dim_size(is_conv2d ? 3 : 4);
         diff_bias_dims = {static_cast<int>(depth)};
       }
       for (int i = 0; i < dilations.size(); i++) dilations[i] -= 1;
 
       MklConvBwdFilterPrimitive<T>* conv_bwd_filter = nullptr;
-      MklConvBwdFilterParams convBwdFilterDims(fwd_src_dims, fwd_filter_dims,
-          diff_bias_dims, diff_dst_dims, strides, dilations, padding_left,
-          padding_right, TFPaddingToMklDnnPadding(this->padding_));
+      MklConvBwdFilterParams convBwdFilterDims(
+          fwd_src_dims, fwd_filter_dims, diff_bias_dims, diff_dst_dims, strides,
+          dilations, padding_left, padding_right,
+          TFPaddingToMklDnnPadding(this->padding_));
 
       // MKL DNN allocates large buffers when a conv gradient filter primtive is
       // created. So we don't cache conv backward primitives when the env
@@ -475,14 +484,38 @@ class MklConvCustomBackpropFilterOp
       MklDnnShape diff_filter_mkl_shape;
       diff_filter_mkl_shape.SetMklTensor(false);
 
-      if (isConv2D) {
-        // Conv2D: output_dims_mkl_order is in OIHW format.
-        TensorShape diff_filter_tf_shape({bwd_output_dims[MklDnnDims::Dim_H],
-                                          bwd_output_dims[MklDnnDims::Dim_W],
-                                          bwd_output_dims[MklDnnDims::Dim_I],
-                                          bwd_output_dims[MklDnnDims::Dim_O]});
-        AllocateOutputSetMklShape(context, 0, &diff_filter_tensor,
-                                  diff_filter_tf_shape, diff_filter_mkl_shape);
+      if (is_conv2d) {
+        if (!is_depthwise) {
+          // Conv2D: output_dims_mkl_order is in OIHW format.
+          TensorShape diff_filter_tf_shape(
+              {bwd_output_dims[MklDnnDims::Dim_H],
+               bwd_output_dims[MklDnnDims::Dim_W],
+               bwd_output_dims[MklDnnDims::Dim_I],
+               bwd_output_dims[MklDnnDims::Dim_O]});
+          AllocateOutputSetMklShape(context, 0, &diff_filter_tensor,
+                                    diff_filter_tf_shape,
+                                    diff_filter_mkl_shape);
+        } else {
+          // Depthwise Conv2d: bwd_output_dims is GOIHW format
+          //                  | TensorFlow       | MKLDNN
+          // ----------------------------------------------------------------
+          // filter_out_depth | depth_multiplier | depth_multiplier *
+          //                  |                  | group_count
+          // ----------------------------------------------------------------
+          // filter_in_depth  | in_depth         | in_depth / group_count
+          // For depthwise convolution, we have group_count == in_depth.
+          // So here G = original I, and I = 1.
+          // And the GOIHW is mkldnn format, here we try to extract the TF
+          // format, TF format is HWIO, as G = original I, so here is HWGO.
+          TensorShape diff_filter_tf_shape(
+              {bwd_output_dims[MklDnnFilterGroupDims::MKL_GROUP_FILTER_DIM_H],
+               bwd_output_dims[MklDnnFilterGroupDims::MKL_GROUP_FILTER_DIM_W],
+               bwd_output_dims[MklDnnFilterGroupDims::MKL_GROUP_FILTER_DIM_G],
+               bwd_output_dims[MklDnnFilterGroupDims::MKL_GROUP_FILTER_DIM_O]});
+          AllocateOutputSetMklShape(context, 0, &diff_filter_tensor,
+                                    diff_filter_tf_shape,
+                                    diff_filter_mkl_shape);
+        }
       } else {
         // Conv3D: output_dims_mkl_order is in OIDHW format.
         TensorShape diff_filter_tf_shape(
@@ -496,38 +529,36 @@ class MklConvCustomBackpropFilterOp
       }
 
       Tensor* diff_bias_tensor = nullptr;
-      if (biasEnabled) {
+      if (bias_enabled) {
         TensorShape diff_bias_shape({depth});
         AllocateBiasGradTensor(context, diff_bias_shape, &diff_bias_tensor);
       }
 
       // check if src and diff_dst need reorder
-      T *src_data = nullptr;
+      T* src_data = nullptr;
       if (fwd_src_md.data.format != conv_bwd_filter->GetSrcMemoryFormat()) {
         src.SetUsrMem(fwd_src_md, &src_tensor);
         src.CheckReorderToOpMem(bwd_filter_pd->src_primitive_desc());
         src_data = static_cast<T*>(src.GetOpMem().get_data_handle());
       } else {
-        src_data = static_cast<T*>(const_cast<T*>(
-            src_tensor.flat<T>().data()));
+        src_data = static_cast<T*>(const_cast<T*>(src_tensor.flat<T>().data()));
       }
 
-      T *diff_dst_data = nullptr;
+      T* diff_dst_data = nullptr;
       if (diff_dst_md.data.format !=
           conv_bwd_filter->GetDiffDstMemoryFormat()) {
         diff_dst.SetUsrMem(diff_dst_md, &diff_dst_tensor);
         diff_dst.CheckReorderToOpMem(bwd_filter_pd->diff_dst_primitive_desc());
-        diff_dst_data = static_cast<T*>(
-            diff_dst.GetOpMem().get_data_handle());
+        diff_dst_data = static_cast<T*>(diff_dst.GetOpMem().get_data_handle());
       } else {
-        diff_dst_data = static_cast<T*>(const_cast<T*>(
-            diff_dst_tensor.flat<T>().data()));
+        diff_dst_data =
+            static_cast<T*>(const_cast<T*>(diff_dst_tensor.flat<T>().data()));
       }
 
       // For backward filter, convert diff_filter back to Tensorflow layout
       // Here we prepare to reorder op memory back to user memory
       bool diff_filter_reorder_required = false;
-      T *diff_filter_data = nullptr;
+      T* diff_filter_data = nullptr;
       if (GetOutputFormat(tf_fmt) !=
           conv_bwd_filter->GetDiffFilterMemoryFormat()) {
         // Allocate diff filter tensor as Tensorflow layout
@@ -535,18 +566,18 @@ class MklConvCustomBackpropFilterOp
                               diff_filter_tensor);
         diff_filter_reorder_required = true;
         diff_filter.PrepareReorderToUserMemIfReq(
-                bwd_filter_pd->diff_weights_primitive_desc());
-        diff_filter_data = static_cast<T*>(
-                            diff_filter.GetOpMem().get_data_handle());
+            bwd_filter_pd->diff_weights_primitive_desc());
+        diff_filter_data =
+            static_cast<T*>(diff_filter.GetOpMem().get_data_handle());
       } else {
-        diff_filter_data = static_cast<T*>(const_cast<T*>(
-                            diff_filter_tensor->flat<T>().data()));
+        diff_filter_data = static_cast<T*>(
+            const_cast<T*>(diff_filter_tensor->flat<T>().data()));
       }
 
       // Execute convolution filter bwd
-      if (biasEnabled) {
-        T* diff_bias_data = static_cast<T*>(const_cast<T*>(
-                         diff_bias_tensor->flat<T>().data()));
+      if (bias_enabled) {
+        T* diff_bias_data =
+            static_cast<T*>(const_cast<T*>(diff_bias_tensor->flat<T>().data()));
         conv_bwd_filter->Execute(src_data, diff_filter_data, diff_bias_data,
                                  diff_dst_data);
       } else {
@@ -598,7 +629,9 @@ class MklConvCustomBackpropFilterOp
     TensorShape filter_tf_shape;
     CHECK_EQ(TensorShapeUtils::IsVector(filter_tensor.shape()), true);
     CHECK_EQ(TensorShapeUtils::MakeShape(filter_tensor.vec<int32>(),
-             &filter_tf_shape).ok(), true);
+                                         &filter_tf_shape)
+                 .ok(),
+             true);
     return filter_tf_shape;
   }
 
@@ -619,10 +652,12 @@ class MklConvCustomBackpropFilterOp
   }
 
   // Output layout is Tensorflow's filter layout
-  //   Conv2D: HWIO;  Conv3D: DHWIO
+  //   Conv2D: HWIO;  Conv3D: DHWIO; Depthwise Conv: HWIGO
   memory::format GetOutputFormat(const memory::format data_format) {
-    return (this->strides_.size() == 4) ? memory::format::hwio
-                                        : memory::format::dhwio;
+    return is_depthwise
+               ? memory::format::hwigo
+               : ((this->strides_.size() == 4) ? memory::format::hwio
+                                               : memory::format::dhwio);
   }
 
   // Allocate output tensor.
@@ -659,32 +694,41 @@ class MklConvCustomBackpropFilterOp
 
     MklDnnShape bias_grad_mkl_shape;
     bias_grad_mkl_shape.SetMklTensor(false);
-    AllocateOutputSetMklShape(context, 1, bias_grad_tensor,
-        bias_grad_shape, bias_grad_mkl_shape);
+    AllocateOutputSetMklShape(context, 1, bias_grad_tensor, bias_grad_shape,
+                              bias_grad_mkl_shape);
   }
 };
 
-#define REGISTER_MKL_FILTER_KERNELS(T)                                         \
-  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropFilter")                     \
-                              .Device(DEVICE_CPU)                              \
-                              .TypeConstraint<T>("T")                          \
-                              .Label(mkl_op_registry::kMklOpLabel),            \
-                          MklConvCustomBackpropFilterOp<CPUDevice, T, false>); \
-  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropFilterWithBias")             \
-                              .Device(DEVICE_CPU)                              \
-                              .TypeConstraint<T>("T")                          \
-                              .Label(mkl_op_registry::kMklOpLabel),            \
-                          MklConvCustomBackpropFilterOp<CPUDevice, T, true>);  \
-  REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DBackpropFilterWithBias")       \
-                              .Device(DEVICE_CPU)                              \
-                              .TypeConstraint<T>("T")                          \
-                              .Label(mkl_op_registry::kMklOpLabel),            \
-                          MklDummyOp<CPUDevice, T>);                           \
-  REGISTER_KERNEL_BUILDER(Name("_MklConv3DBackpropFilterV2")                   \
-                              .Device(DEVICE_CPU)                              \
-                              .TypeConstraint<T>("T")                          \
-                              .Label(mkl_op_registry::kMklOpLabel),            \
-                          MklConvCustomBackpropFilterOp<CPUDevice, T, false>);
+#define REGISTER_MKL_FILTER_KERNELS(T)                                   \
+  REGISTER_KERNEL_BUILDER(                                               \
+      Name("_MklConv2DBackpropFilter")                                   \
+          .Device(DEVICE_CPU)                                            \
+          .TypeConstraint<T>("T")                                        \
+          .Label(mkl_op_registry::kMklOpLabel),                          \
+      MklConvCustomBackpropFilterOp<CPUDevice, T, false, false>);        \
+  REGISTER_KERNEL_BUILDER(                                               \
+      Name("_MklConv2DBackpropFilterWithBias")                           \
+          .Device(DEVICE_CPU)                                            \
+          .TypeConstraint<T>("T")                                        \
+          .Label(mkl_op_registry::kMklOpLabel),                          \
+      MklConvCustomBackpropFilterOp<CPUDevice, T, true, false>);         \
+  REGISTER_KERNEL_BUILDER(                                               \
+      Name("_MklDepthwiseConv2dNativeBackpropFilter")                    \
+          .Device(DEVICE_CPU)                                            \
+          .TypeConstraint<T>("T")                                        \
+          .Label(mkl_op_registry::kMklOpLabel),                          \
+      MklConvCustomBackpropFilterOp<CPUDevice, T, false, true>);         \
+  REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DBackpropFilterWithBias") \
+                              .Device(DEVICE_CPU)                        \
+                              .TypeConstraint<T>("T")                    \
+                              .Label(mkl_op_registry::kMklOpLabel),      \
+                          MklDummyOp<CPUDevice, T>);                     \
+  REGISTER_KERNEL_BUILDER(                                               \
+      Name("_MklConv3DBackpropFilterV2")                                 \
+          .Device(DEVICE_CPU)                                            \
+          .TypeConstraint<T>("T")                                        \
+          .Label(mkl_op_registry::kMklOpLabel),                          \
+      MklConvCustomBackpropFilterOp<CPUDevice, T, false, false>);
 
 TF_CALL_float(REGISTER_MKL_FILTER_KERNELS);
 #undef REGISTER_MKL_FILTER_KERNELS
diff --git a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
index 786a30bb10dcf464b5768160714238c0d5730e96..4e955df5fe9e551ec9aadc21b466dc3810784760 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
@@ -61,16 +61,18 @@ struct MklConvBwdInputParams {
   memory::dims padding_right;
   padding_kind padding;
 
-  MklConvBwdInputParams(memory::dims diff_src_dims,
-    memory::dims filter_dims, memory::dims diff_dst_dims,
-    memory::dims strides, memory::dims dilations,
-    memory::dims padding_left, memory::dims padding_right,
-    padding_kind padding) :
-      diff_src_dims(diff_src_dims), filter_dims(filter_dims),
-      diff_dst_dims(diff_dst_dims), strides(strides),
-      dilations(dilations), padding_left(padding_left),
-      padding_right(padding_right), padding(padding) {
-  }
+  MklConvBwdInputParams(memory::dims diff_src_dims, memory::dims filter_dims,
+                        memory::dims diff_dst_dims, memory::dims strides,
+                        memory::dims dilations, memory::dims padding_left,
+                        memory::dims padding_right, padding_kind padding)
+      : diff_src_dims(diff_src_dims),
+        filter_dims(filter_dims),
+        diff_dst_dims(diff_dst_dims),
+        strides(strides),
+        dilations(dilations),
+        padding_left(padding_left),
+        padding_right(padding_right),
+        padding(padding) {}
 };
 
 template <typename T>
@@ -93,8 +95,8 @@ class MklConvBwdInputPrimitive : public MklPrimitive {
   //   filter_data:   input data buffer of filter (weights)
   //   diff_dst_data: input data buffer of dst
   // Bias does not matter here
-  void Execute(const T* diff_src_data,
-      const T* filter_data, const T* diff_dst_data) {
+  void Execute(const T* diff_src_data, const T* filter_data,
+               const T* diff_dst_data) {
     context_.diff_src_mem->set_data_handle(
         static_cast<T*>(const_cast<T*>(diff_src_data)));
     context_.filter_mem->set_data_handle(
@@ -111,9 +113,7 @@ class MklConvBwdInputPrimitive : public MklPrimitive {
     return;
   }
 
-  memory::format GetFilterMemoryFormat() const {
-    return context_.filter_fmt;
-  }
+  memory::format GetFilterMemoryFormat() const { return context_.filter_fmt; }
 
   memory::format GetDiffDstMemoryFormat() const {
     return context_.diff_dst_fmt;
@@ -155,27 +155,33 @@ class MklConvBwdInputPrimitive : public MklPrimitive {
     std::shared_ptr<mkldnn::stream> bwd_input_stream;
     std::vector<mkldnn::primitive> bwd_input_primitives;
 
-    ConvBwdInputContext() :
-        filter_fmt(memory::format::any), diff_dst_fmt(memory::format::any),
-        diff_src_mem(nullptr), filter_mem(nullptr), diff_dst_mem(nullptr),
-        bwd_input_pd(nullptr), conv_bwd_input(nullptr),
-        bwd_input_desc(nullptr), fwd_desc(nullptr), fwd_pd(nullptr),
-        diff_src_md(nullptr), filter_md(nullptr), diff_dst_md(nullptr),
-        bwd_input_stream(nullptr) {
-    }
+    ConvBwdInputContext()
+        : filter_fmt(memory::format::any),
+          diff_dst_fmt(memory::format::any),
+          diff_src_mem(nullptr),
+          filter_mem(nullptr),
+          diff_dst_mem(nullptr),
+          bwd_input_pd(nullptr),
+          conv_bwd_input(nullptr),
+          bwd_input_desc(nullptr),
+          fwd_desc(nullptr),
+          fwd_pd(nullptr),
+          diff_src_md(nullptr),
+          filter_md(nullptr),
+          diff_dst_md(nullptr),
+          bwd_input_stream(nullptr) {}
   };
 
   void Setup(const MklConvBwdInputParams& convBwdInputDims) {
     // create memory descriptors for convolution data w/ no specified format
-    context_.diff_src_md.reset(new memory::desc(
-        {convBwdInputDims.diff_src_dims},
-        MklDnnType<T>(), memory::format::any));
+    context_.diff_src_md.reset(
+        new memory::desc({convBwdInputDims.diff_src_dims}, MklDnnType<T>(),
+                         memory::format::any));
     context_.filter_md.reset(new memory::desc(
-        {convBwdInputDims.filter_dims},
-        MklDnnType<T>(), memory::format::any));
-    context_.diff_dst_md.reset(new memory::desc(
-        {convBwdInputDims.diff_dst_dims},
-        MklDnnType<T>(), memory::format::any));
+        {convBwdInputDims.filter_dims}, MklDnnType<T>(), memory::format::any));
+    context_.diff_dst_md.reset(
+        new memory::desc({convBwdInputDims.diff_dst_dims}, MklDnnType<T>(),
+                         memory::format::any));
 
     // create convolution primitives
     context_.bwd_input_desc.reset(new convolution_backward_data::desc(
@@ -184,9 +190,9 @@ class MklConvBwdInputPrimitive : public MklPrimitive {
         convBwdInputDims.dilations, convBwdInputDims.padding_left,
         convBwdInputDims.padding_right, convBwdInputDims.padding));
 
-    context_.fwd_desc.reset(new convolution_forward::desc(prop_kind::forward,
-        convolution_direct, *context_.diff_src_md, *context_.filter_md,
-        *context_.diff_dst_md, convBwdInputDims.strides,
+    context_.fwd_desc.reset(new convolution_forward::desc(
+        prop_kind::forward, convolution_direct, *context_.diff_src_md,
+        *context_.filter_md, *context_.diff_dst_md, convBwdInputDims.strides,
         convBwdInputDims.dilations, convBwdInputDims.padding_left,
         convBwdInputDims.padding_right, convBwdInputDims.padding));
 
@@ -194,8 +200,7 @@ class MklConvBwdInputPrimitive : public MklPrimitive {
         *context_.fwd_desc, cpu_engine_));
 
     // create backward conv prim desc
-    context_.bwd_input_pd.reset(
-        new convolution_backward_data::primitive_desc(
+    context_.bwd_input_pd.reset(new convolution_backward_data::primitive_desc(
         *context_.bwd_input_desc, cpu_engine_, *context_.fwd_pd));
 
     // create memory primitive based on dummy data
@@ -207,15 +212,21 @@ class MklConvBwdInputPrimitive : public MklPrimitive {
         context_.bwd_input_pd.get()->diff_dst_primitive_desc(), DummyData));
 
     // store the expected memory format
-    context_.filter_fmt = static_cast<memory::format>(
-     context_.bwd_input_pd.get()->weights_primitive_desc().desc().data.format);
-    context_.diff_dst_fmt = static_cast<memory::format>(
-     context_.bwd_input_pd.get()->diff_dst_primitive_desc().desc().data.format);
+    context_.filter_fmt =
+        static_cast<memory::format>(context_.bwd_input_pd.get()
+                                        ->weights_primitive_desc()
+                                        .desc()
+                                        .data.format);
+    context_.diff_dst_fmt =
+        static_cast<memory::format>(context_.bwd_input_pd.get()
+                                        ->diff_dst_primitive_desc()
+                                        .desc()
+                                        .data.format);
 
     // create convolution primitive and add it to net
     context_.conv_bwd_input.reset(new convolution_backward_data(
-        *context_.bwd_input_pd, *context_.diff_dst_mem,
-        *context_.filter_mem, *context_.diff_src_mem));
+        *context_.bwd_input_pd, *context_.diff_dst_mem, *context_.filter_mem,
+        *context_.diff_src_mem));
 
     context_.bwd_input_primitives.push_back(*context_.conv_bwd_input);
   }
@@ -284,11 +295,12 @@ class MklConvBwdInputPrimitiveFactory : public MklPrimitiveFactory<T> {
   }
 };
 
-template <typename Device, class T>
-class MklConvCustomBackpropInputOp : public MklConvBackpropCommonOp<Device, T> {
+template <typename Device, class T, bool is_depthwise>
+class MklConvCustomBackpropInputOp
+    : public MklConvBackpropCommonOp<Device, T, is_depthwise> {
  public:
   explicit MklConvCustomBackpropInputOp(OpKernelConstruction* context)
-      : MklConvBackpropCommonOp<Device, T>(context) {}
+      : MklConvBackpropCommonOp<Device, T, is_depthwise>(context) {}
 
   ~MklConvCustomBackpropInputOp() {}
 
@@ -298,7 +310,7 @@ class MklConvCustomBackpropInputOp : public MklConvBackpropCommonOp<Device, T> {
       MklDnnData<T> diff_dst(&cpu_engine);
 
       // This flag indicate Conv2D or Conv3D
-      bool isConv2D = (this->strides_.size() == 4);
+      bool is_conv2d = (this->strides_.size() == 4);
 
       // Input tensors
       const int kInputIdx = 0, kFilterIdx = 1, kOutbpropIdx = 2;
@@ -311,8 +323,7 @@ class MklConvCustomBackpropInputOp : public MklConvBackpropCommonOp<Device, T> {
       GetMklShape(context, kFilterIdx, &filter_mkl_shape);
       GetMklShape(context, kOutbpropIdx, &diff_dst_mkl_shape);
       // Allow operator-specific sanity checking of shapes.
-      ValidateMklShapes(src_mkl_shape, filter_mkl_shape,
-                        diff_dst_mkl_shape);
+      ValidateMklShapes(src_mkl_shape, filter_mkl_shape, diff_dst_mkl_shape);
 
       // Allow operator-specific generation of shapes.
       // E.g., ConvBackpropFilter gets filter as filter_sizes. It is a
@@ -330,11 +341,11 @@ class MklConvCustomBackpropInputOp : public MklConvBackpropCommonOp<Device, T> {
           diff_dst_tf_shape.num_elements() == 0) {
         MklDnnShape diff_src_mkl_shape;
         diff_src_mkl_shape.SetMklTensor(false);
-        TensorShape diff_src_tf_shape = GetOutputTfShape(
-            src_tf_shape, filter_tf_shape, diff_dst_tf_shape);
+        TensorShape diff_src_tf_shape =
+            GetOutputTfShape(src_tf_shape, filter_tf_shape, diff_dst_tf_shape);
         const int kOutputIdx = 0;
         AllocateOutputSetMklShape(context, kOutputIdx, &diff_src_tensor,
-                       diff_src_tf_shape, diff_src_mkl_shape);
+                                  diff_src_tf_shape, diff_src_mkl_shape);
         CHECK_NOTNULL(diff_src_tensor);
 
         // if output tensor has more than 0 elements, we need to 0 them out.
@@ -353,40 +364,44 @@ class MklConvCustomBackpropInputOp : public MklConvBackpropCommonOp<Device, T> {
 
       // Get forward convolution parameters.
       MklDnnConvUtil conv_utl(context, this->strides_, this->padding_,
-          this->data_format_, this->dilations_);
+                              this->data_format_, this->dilations_);
       conv_utl.GetConvFwdSizesInMklOrder(
           src_tf_shape, filter_tf_shape, &fwd_src_dims, &fwd_filter_dims,
           &strides, &dilations, &fwd_output_dims_tf_order, &fwd_output_dims,
-          &padding_left, &padding_right);
+          &padding_left, &padding_right, false, is_depthwise);
       if (!context->status().ok()) return;
 
       // Create Convolution forward descriptor since Convolution backward
       // API needs it. For that, we first need to create input, filter
       // and output memory descriptors.
-      auto tf_fmt = isConv2D
+      auto tf_fmt = is_conv2d
                         ? TFDataFormatToMklDnnDataFormat(this->data_format_)
                         : TFDataFormatToMklDnn3DDataFormat(this->data_format_);
 
       // If filter is in MKL layout, then simply grab filter layout;
       // otherwise, construct filter in TF layout.
       // For TF layout, filter is in HWIO format.
-      auto fwd_filter_md = filter_mkl_shape.IsMklTensor()
-                               ? filter_mkl_shape.GetMklLayout()
-                               : memory::desc(fwd_filter_dims, MklDnnType<T>(),
-                                              isConv2D ? memory::format::hwio
-                                                       : memory::format::dhwio);
+      auto fwd_filter_md =
+          filter_mkl_shape.IsMklTensor()
+              ? filter_mkl_shape.GetMklLayout()
+              : memory::desc(fwd_filter_dims, MklDnnType<T>(),
+                             is_depthwise
+                                 ? memory::hwigo
+                                 : (is_conv2d ? memory::format::hwio
+                                              : memory::format::dhwio));
 
       conv_utl.GetInputSizeInMklOrder(diff_dst_tf_shape, &diff_dst_dims);
       if (!context->status().ok()) return;
-      auto diff_dst_md = diff_dst_mkl_shape.IsMklTensor()
-                       ? diff_dst_mkl_shape.GetMklLayout()
-                       : memory::desc(diff_dst_dims,
-                           MklDnnType<T>(), tf_fmt);
+      auto diff_dst_md =
+          diff_dst_mkl_shape.IsMklTensor()
+              ? diff_dst_mkl_shape.GetMklLayout()
+              : memory::desc(diff_dst_dims, MklDnnType<T>(), tf_fmt);
       for (int i = 0; i < dilations.size(); i++) dilations[i] -= 1;
 
       MklConvBwdInputPrimitive<T>* conv_bwd_input = nullptr;
-      MklConvBwdInputParams convBwdInputDims(fwd_src_dims, fwd_filter_dims,
-          diff_dst_dims, strides, dilations, padding_left, padding_right,
+      MklConvBwdInputParams convBwdInputDims(
+          fwd_src_dims, fwd_filter_dims, diff_dst_dims, strides, dilations,
+          padding_left, padding_right,
           TFPaddingToMklDnnPadding(this->padding_));
 
       // We don't cache those primitves if the env variable
@@ -396,8 +411,8 @@ class MklConvCustomBackpropInputOp : public MklConvBackpropCommonOp<Device, T> {
       //   1. Legacy CPU without AVX512/AVX2, or
       //   2. 1x1 convolution with stride != 1
       bool do_not_cache = MklPrimitiveFactory<T>::IsPrimitiveMemOptEnabled() &&
-                   (MklPrimitiveFactory<T>::IsLegacyPlatform() ||
-                    IsConv1x1StrideNot1(fwd_filter_dims, strides));
+                          (MklPrimitiveFactory<T>::IsLegacyPlatform() ||
+                           IsConv1x1StrideNot1(fwd_filter_dims, strides));
       conv_bwd_input = MklConvBwdInputPrimitiveFactory<T>::Get(convBwdInputDims,
                                                                do_not_cache);
       auto bwd_input_pd = conv_bwd_input->GetPrimitiveDesc();
@@ -411,14 +426,14 @@ class MklConvCustomBackpropInputOp : public MklConvBackpropCommonOp<Device, T> {
       diff_src_mkl_shape.SetMklLayout(&diff_src_pd);
       diff_src_mkl_shape.SetElemType(MklDnnType<T>());
       diff_src_mkl_shape.SetTfLayout(bwd_diff_src_dims.size(),
-          bwd_diff_src_dims, bwd_diff_src_format);
+                                     bwd_diff_src_dims, bwd_diff_src_format);
       TensorShape diff_src_tf_shape;
       diff_src_tf_shape.AddDim(diff_src_pd.get_size() / sizeof(T));
-      AllocateOutputSetMklShape(context, 0, &diff_src_tensor,
-          diff_src_tf_shape, diff_src_mkl_shape);
+      AllocateOutputSetMklShape(context, 0, &diff_src_tensor, diff_src_tf_shape,
+                                diff_src_mkl_shape);
 
-      T *diff_src_data = static_cast<T*>(const_cast<T*>(
-          diff_src_tensor->flat<T>().data()));
+      T* diff_src_data =
+          static_cast<T*>(const_cast<T*>(diff_src_tensor->flat<T>().data()));
 
       // check if filter and diff_dst need reorder
       T* filter_data = nullptr;
@@ -428,19 +443,18 @@ class MklConvCustomBackpropInputOp : public MklConvBackpropCommonOp<Device, T> {
         filter.CheckReorderToOpMem(bwd_input_pd->weights_primitive_desc());
         filter_data = static_cast<T*>(filter.GetOpMem().get_data_handle());
       } else {
-        filter_data = static_cast<T*>(const_cast<T*>(
-                       filter_tensor.flat<T>().data()));
+        filter_data =
+            static_cast<T*>(const_cast<T*>(filter_tensor.flat<T>().data()));
       }
 
       T* diff_dst_data = nullptr;
       if (diff_dst_md.data.format != conv_bwd_input->GetDiffDstMemoryFormat()) {
         diff_dst.SetUsrMem(diff_dst_md, &diff_dst_tensor);
         diff_dst.CheckReorderToOpMem(bwd_input_pd->diff_dst_primitive_desc());
-        diff_dst_data = static_cast<T*>(
-                         diff_dst.GetOpMem().get_data_handle());
+        diff_dst_data = static_cast<T*>(diff_dst.GetOpMem().get_data_handle());
       } else {
-        diff_dst_data = static_cast<T*>(const_cast<T*>(
-                         diff_dst_tensor.flat<T>().data()));
+        diff_dst_data =
+            static_cast<T*>(const_cast<T*>(diff_dst_tensor.flat<T>().data()));
       }
 
       // execute convolution input bwd
@@ -543,18 +557,22 @@ class MklConvCustomBackpropInputOp : public MklConvBackpropCommonOp<Device, T> {
   }
 };
 
-#define REGISTER_MKL_CPU_KERNELS(T)                                    \
-  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropInput")              \
-                              .Device(DEVICE_CPU)                      \
-                              .TypeConstraint<T>("T")                  \
-                              .Label(mkl_op_registry::kMklOpLabel),    \
-                          MklConvCustomBackpropInputOp<CPUDevice, T>); \
-  REGISTER_KERNEL_BUILDER(Name("_MklConv3DBackpropInputV2")            \
-                              .Device(DEVICE_CPU)                      \
-                              .TypeConstraint<T>("T")                  \
-                              .Label(mkl_op_registry::kMklOpLabel),    \
-                          MklConvCustomBackpropInputOp<CPUDevice, T>);
-
+#define REGISTER_MKL_CPU_KERNELS(T)                                           \
+  REGISTER_KERNEL_BUILDER(Name("_MklConv2DBackpropInput")                     \
+                              .Device(DEVICE_CPU)                             \
+                              .TypeConstraint<T>("T")                         \
+                              .Label(mkl_op_registry::kMklOpLabel),           \
+                          MklConvCustomBackpropInputOp<CPUDevice, T, false>); \
+  REGISTER_KERNEL_BUILDER(Name("_MklConv3DBackpropInputV2")                   \
+                              .Device(DEVICE_CPU)                             \
+                              .TypeConstraint<T>("T")                         \
+                              .Label(mkl_op_registry::kMklOpLabel),           \
+                          MklConvCustomBackpropInputOp<CPUDevice, T, false>); \
+  REGISTER_KERNEL_BUILDER(Name("_MklDepthwiseConv2dNativeBackpropInput")      \
+                              .Device(DEVICE_CPU)                             \
+                              .TypeConstraint<T>("T")                         \
+                              .Label(mkl_op_registry::kMklOpLabel),           \
+                          MklConvCustomBackpropInputOp<CPUDevice, T, true>);
 TF_CALL_float(REGISTER_MKL_CPU_KERNELS);
 #undef REGISTER_MKL_CPU_KERNELS
 
diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc
index 6e4fbf55c5f78158ffa811f4823d0086fb382d88..0134cc2235623d796ab9b858fa49506eaabfc8db 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_ops.cc
@@ -21,13 +21,14 @@ limitations under the License.
 #include <map>
 #include <vector>
 
+#include "absl/strings/str_join.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/mkl_conv_ops.h"
 #include "tensorflow/core/kernels/mkl_quantized_conv_ops.h"
 #include "tensorflow/core/kernels/no_op.h"
@@ -91,6 +92,9 @@ struct MklConvFwdParams {
         padding_left(padding_left),
         padding_right(padding_right) {}
 };
+
+typedef mkldnn::convolution_forward::primitive_desc ConvFwdPd;
+
 // With quantization, input, filter, and output can have different types
 // so we use differnt template parameter for each type
 template <typename T, typename Tinput, typename Tfilter, typename Tbias,
@@ -100,7 +104,7 @@ class MklConvFwdPrimitive : public MklPrimitive {
   explicit MklConvFwdPrimitive(const MklConvFwdParams& convFwdDims)
       : cpu_engine_(engine::cpu, 0) {
     context_.fwd_stream.reset(new stream(stream::kind::eager));
-    // create conv primitive
+    // Create conv primitive
     if (context_.conv_fwd == nullptr) {
       Setup(convFwdDims);
     }
@@ -125,7 +129,7 @@ class MklConvFwdPrimitive : public MklPrimitive {
         static_cast<void*>(const_cast<Toutput*>(dst_data)));
     context_.fwd_stream->submit(context_.fwd_primitives);
 
-    // after exec, set data handle back
+    // After exec, set data handle back
     context_.src_mem->set_data_handle(DummyData);
     context_.filter_mem->set_data_handle(DummyData);
     context_.bias_mem->set_data_handle(DummyData);
@@ -148,7 +152,7 @@ class MklConvFwdPrimitive : public MklPrimitive {
         static_cast<void*>(const_cast<Toutput*>(dst_data)));
     context_.fwd_stream->submit(context_.fwd_primitives);
 
-    // after execution, set data handle back
+    // After execution, set data handle back
     context_.src_mem->set_data_handle(DummyData);
     context_.filter_mem->set_data_handle(DummyData);
     context_.dst_mem->set_data_handle(DummyData);
@@ -158,15 +162,14 @@ class MklConvFwdPrimitive : public MklPrimitive {
 
   memory::format GetFilterMemoryFormat() const { return context_.filter_fmt; }
 
-  std::shared_ptr<mkldnn::convolution_forward::primitive_desc>
-  GetPrimitiveDesc() const {
+  std::shared_ptr<ConvFwdPd> GetPrimitiveDesc() const {
     return context_.fwd_pd;
   }
 
  private:
   // Primitive reuse context for Conv2D Fwd op
   struct ConvFwdContext {
-    // expected memory format for this primitive instance
+    // Expected memory format for this primitive instance
     memory::format src_fmt;
     memory::format filter_fmt;
 
@@ -176,17 +179,17 @@ class MklConvFwdPrimitive : public MklPrimitive {
     std::shared_ptr<mkldnn::memory> bias_mem;
     std::shared_ptr<mkldnn::memory> dst_mem;
 
-    // desc & prmitive desc
+    // Desc & prmitive desc
     std::shared_ptr<mkldnn::convolution_forward::desc> fwd_desc;
 
-    // memory desc
+    // Memory desc
     std::shared_ptr<mkldnn::memory::desc> src_md;
     std::shared_ptr<mkldnn::memory::desc> filter_md;
     std::shared_ptr<mkldnn::memory::desc> bias_md;
     std::shared_ptr<mkldnn::memory::desc> dst_md;
 
-    // convolution primitive
-    std::shared_ptr<mkldnn::convolution_forward::primitive_desc> fwd_pd;
+    // Convolution primitive
+    std::shared_ptr<ConvFwdPd> fwd_pd;
     std::shared_ptr<mkldnn::primitive> conv_fwd;
 
     std::shared_ptr<mkldnn::stream> fwd_stream;
@@ -209,7 +212,7 @@ class MklConvFwdPrimitive : public MklPrimitive {
   };
 
   void Setup(const MklConvFwdParams& convFwdDims) {
-    // create memory descriptors for convolution data w/ no specified format
+    // Create memory descriptors for convolution data w/ no specified format
     context_.src_md.reset(new memory::desc(
         {convFwdDims.src_dims}, MklDnnType<Tinput>(), memory::format::any));
 
@@ -223,7 +226,7 @@ class MklConvFwdPrimitive : public MklPrimitive {
       context_.bias_md.reset(new memory::desc(
           {convFwdDims.bias_dims}, MklDnnType<Tbias>(), memory::format::any));
 
-    // create a convolution
+    // Create a convolution
     if (!convFwdDims.bias_dims.empty()) {
       context_.fwd_desc.reset(new convolution_forward::desc(
           prop_kind::forward, convolution_direct, *context_.src_md,
@@ -238,8 +241,7 @@ class MklConvFwdPrimitive : public MklPrimitive {
           convFwdDims.padding_right, padding_kind::zero));
     }
 
-    context_.fwd_pd.reset(new convolution_forward::primitive_desc(
-        *context_.fwd_desc, cpu_engine_));
+    context_.fwd_pd.reset(new ConvFwdPd(*context_.fwd_desc, cpu_engine_));
 
     // Check if there is any fusions as post-ops
     auto const& post_op_params = convFwdDims.post_op_params;
@@ -270,21 +272,20 @@ class MklConvFwdPrimitive : public MklPrimitive {
         }
       }
       post_ops_attr.set_post_ops(post_ops);
-      context_.fwd_pd.reset(new convolution_forward::primitive_desc(
-          *context_.fwd_desc, post_ops_attr, cpu_engine_));
+      context_.fwd_pd.reset(
+          new ConvFwdPd(*context_.fwd_desc, post_ops_attr, cpu_engine_));
     } else {
-      context_.fwd_pd.reset(new convolution_forward::primitive_desc(
-          *context_.fwd_desc, cpu_engine_));
+      context_.fwd_pd.reset(new ConvFwdPd(*context_.fwd_desc, cpu_engine_));
     }
 
-    // store the expected memory format
+    // Store the expected memory format
     context_.src_fmt = static_cast<mkldnn::memory::format>(
         context_.fwd_pd.get()->src_primitive_desc().desc().data.format);
 
     context_.filter_fmt = static_cast<mkldnn::memory::format>(
         context_.fwd_pd.get()->weights_primitive_desc().desc().data.format);
 
-    // create memory primitive based on dummy data
+    // Create memory primitive based on dummy data
     context_.src_mem.reset(
         new memory(context_.fwd_pd.get()->src_primitive_desc(), DummyData));
     context_.filter_mem.reset(
@@ -292,7 +293,7 @@ class MklConvFwdPrimitive : public MklPrimitive {
     context_.dst_mem.reset(
         new memory(context_.fwd_pd.get()->dst_primitive_desc(), DummyData));
 
-    // create convolution primitive and add it to net
+    // Create convolution primitive and add it to net
     if (!convFwdDims.bias_dims.empty()) {
       context_.bias_mem.reset(new memory(
           {{{convFwdDims.bias_dims}, MklDnnType<T>(), memory::format::x},
@@ -323,11 +324,12 @@ class MklConvFwdPrimitiveFactory : public MklPrimitiveFactory<T> {
       const MklConvFwdParams& convFwdDims, bool do_not_cache) {
     MklConvFwdPrimitive<T, Tinput, Tfilter, Tbias, Toutput>* conv_fwd = nullptr;
 
-    if (do_not_cache) { /* Always create new primitive */
+    if (do_not_cache) {
+      // Always create a new primitive
       conv_fwd = new MklConvFwdPrimitive<T, Tinput, Tfilter, Tbias, Toutput>(
           convFwdDims);
     } else {
-      // try to find a suitable one in pool
+      // Try to find a suitable one in pool
       conv_fwd = dynamic_cast<
           MklConvFwdPrimitive<T, Tinput, Tfilter, Tbias, Toutput>*>(
           MklConvFwdPrimitiveFactory<T, Tinput, Tfilter, Tbias,
@@ -411,7 +413,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 
 // For now, MKL-ML is default. So making MKL-DNN not a default choice.
 #ifdef INTEL_MKL_ML_ONLY
-template <typename Device, typename T, bool biasEnabled>
+template <typename Device, typename T, bool bias_enabled>
 class MklConvOp : public OpKernel {
  public:
   ~MklConvOp() {}
@@ -447,7 +449,7 @@ class MklConvOp : public OpKernel {
     CHECK(!mkl_filter_shape.IsMklTensor())
         << "Conv filter should not be in MKL Layout";
 
-    if (biasEnabled) {
+    if (bias_enabled) {
       const Tensor& bias = MklGetInput(context, 2);
       OP_REQUIRES(context, bias.dims() == 1,
                   errors::InvalidArgument("bias must be 1-dimensional: ",
@@ -595,14 +597,14 @@ class MklConvOp : public OpKernel {
     mkl_context.filter_strides[2] = filter.dim_size(3);  // in_depth
     mkl_context.filter_strides[3] = 1;                   // out_depth
 
-    if (biasEnabled) {
+    if (bias_enabled) {
       const Tensor& bias = MklGetInput(context, 2);
       mkl_context.bias_sizes[0] = {static_cast<size_t>(bias.dim_size(0))};
       mkl_context.bias_strides[0] = {1};
     }
 
     // Create Convolution Primitive
-    if (biasEnabled) {
+    if (bias_enabled) {
       CHECK_EQ(
           dnnConvolutionCreateForwardBias_F32(
               &mkl_context.prim_fwd, nullptr, dnnAlgorithmConvolutionDirect,
@@ -713,7 +715,7 @@ class MklConvOp : public OpKernel {
                                    filter_strides),
                E_SUCCESS);
 
-      if (biasEnabled) {
+      if (bias_enabled) {
         CHECK_EQ(dnnLayoutCreate_F32(&lt_bias, 1, bias_sizes, bias_strides),
                  E_SUCCESS);
       }
@@ -794,7 +796,7 @@ class MklConvOp : public OpKernel {
       conv_res[dnnResourceFilter] =
           (mkl_convert_filter) ? mkl_buf_convert_filter : mkl_buf_filter;
 
-      if (biasEnabled) {
+      if (bias_enabled) {
         const Tensor& bias = MklGetInput(context, 2);
         void* mkl_buf_bias =
             const_cast<void*>(static_cast<const void*>(bias.flat<T>().data()));
@@ -825,7 +827,7 @@ class MklConvOp : public OpKernel {
       dnnDelete_F32(prim_fwd);
       if (!input_in_mkl_format) dnnLayoutDelete_F32(lt_input);
       dnnLayoutDelete_F32(lt_filter);
-      if (biasEnabled) dnnLayoutDelete_F32(lt_bias);
+      if (bias_enabled) dnnLayoutDelete_F32(lt_bias);
     }
   } MklConv2DOpContext;
 
@@ -851,7 +853,7 @@ REGISTER_KERNEL_BUILDER(Name("_MklConv2DWithBias")
 // Base class for convolution forward operations
 template <typename Device, typename Tinput, typename Tfilter, typename Tbias,
           typename Toutput, typename Ttemp_output, typename Tpadding,
-          bool biasEnabled, bool padEnabled>
+          bool bias_enabled, bool pad_enabled, bool is_depthwise>
 class MklConvOp : public OpKernel {
  public:
   ~MklConvOp() {}
@@ -874,6 +876,9 @@ class MklConvOp : public OpKernel {
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+    is_filter_const_ = false;
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("is_filter_const", &is_filter_const_));
 
     if (strides_.size() == 4) {
       OP_REQUIRES(context, dilations_.size() == 4,
@@ -915,6 +920,10 @@ class MklConvOp : public OpKernel {
       const Tensor& src_tensor = MklGetInput(context, kInputIndex_Src);
       const Tensor& filter_tensor = MklGetInput(context, kInputIndex_Filter);
 
+      // Data from persistent (cached) filter tensor
+      const Tensor& cached_filter_data_tensor =
+          *cached_filter_data_ptensor_.AccessTensor(context);
+
       MklDnnShape src_mkl_shape, filter_mkl_shape;
       GetMklShape(context, kInputIndex_Src, &src_mkl_shape);
       GetMklShape(context, kInputIndex_Filter, &filter_mkl_shape);
@@ -930,7 +939,7 @@ class MklConvOp : public OpKernel {
       memory::dims dst_dims_tf_order, dst_dims_mkl_order;
 
       // If pad with conv2d fusion is enabled
-      if (padEnabled) {
+      if (fuse_pad_) {
         PadWithConvFusion(context, padding_left, padding_right);
       }
 
@@ -942,7 +951,7 @@ class MklConvOp : public OpKernel {
       conv_utl.GetConvFwdSizesInMklOrder(
           src_tf_shape, filter_tf_shape, &src_dims, &filter_dims, &strides,
           &dilations, &dst_dims_tf_order, &dst_dims_mkl_order, &padding_left,
-          &padding_right, padEnabled);
+          &padding_right, fuse_pad_, is_depthwise);
       if (!context->status().ok()) return;
 
       // Check for corner case - if there is nothing to compute, return.
@@ -956,11 +965,9 @@ class MklConvOp : public OpKernel {
         AllocateOutputSetMklShape(context, kOutputIndex_Dst, &dst_tensor,
                                   src_tf_shape, dst_mkl_shape);
 
-        // MklConv2D/3D also outputs converted filter
-        // as 2nd output of Conv2D/3D.
+        // MklConv2D/3D also outputs converted filter as 2nd output.
         filter_mkl_shape.SetMklTensor(false);
         Tensor* output_filter_tensor = nullptr;
-        // MklConv2D also outputs converted filter as 2nd output.
         if (typeid(Tinput) == typeid(float) &&
             typeid(Tfilter) == typeid(float) &&
             typeid(Toutput) == typeid(float)) {
@@ -972,23 +979,38 @@ class MklConvOp : public OpKernel {
         return;
       }
 
-      bool isConv2D = (strides_.size() == 4);
-      // TODO(Intel-tf) Add check to make sure padEnabled is true only for 2D
-      if (!isConv2D) {
+      bool is_conv2d = (strides_.size() == 4);
+
+      if (!is_conv2d) {
         OP_REQUIRES(
-            context, !padEnabled,
+            context, !pad_enabled,
+            errors::InvalidArgument("Pad + Conv fusion only works for 2D"));
+      }
+
+      // TODO 3-D support for Depthwise is not there
+      if (is_depthwise) {
+        OP_REQUIRES(context, is_conv2d,
+                    errors::InvalidArgument(
+                        "Only 2D convolution is supported for depthwise."));
+      }
+
+      // TODO(Intel-tf) Add check to make sure pad_enabled is true only for 2D
+      if (!is_conv2d) {
+        OP_REQUIRES(
+            context, !fuse_pad_,
             errors::InvalidArgument("Pad+Conv fusion only works for 2D"));
       }
       // Create memory for user data.
       // Describe how the inputs and outputs of Convolution look like. Also
       // specify buffers containing actual input and output data.
-      auto tf_fmt = isConv2D ? TFDataFormatToMklDnnDataFormat(data_format_)
-                             : TFDataFormatToMklDnn3DDataFormat(data_format_);
-
-      // If input is in MKL layout, then simply grab input layout; otherwise,
-      // construct input Tf layout. For TF layout, although input shape
-      // (src_dims) required is in MKL-DNN order, the layout is Tensorflow's
-      // layout depending on data format:
+      auto tf_fmt = is_conv2d ? TFDataFormatToMklDnnDataFormat(data_format_)
+                              : TFDataFormatToMklDnn3DDataFormat(data_format_);
+
+      // If input is in MKL layout, then simply grab the layout; otherwise,
+      // construct TF layout for input.
+      // For constructing TF layout for input, although input shape (src_dims)
+      // is required to be in MKL-DNN order, the input layout is actually in
+      // TF layout depending on the data format:
       //     Conv2D: NHWC or NCHW
       //     Conv3D: NDHWC or NCDHW
       auto src_md = src_mkl_shape.IsMklTensor()
@@ -997,61 +1019,55 @@ class MklConvOp : public OpKernel {
       src.SetUsrMem(src_md, &src_tensor);
 
       // Although filter shape (filter_dims) required is in MKL-DNN order,
-      // the layout is Tensorflow's layout (HWIO).
-      auto filter_md = filter_mkl_shape.IsMklTensor()  // Should NEVER be true
-                           ? filter_mkl_shape.GetMklLayout()
-                           : memory::desc(filter_dims, MklDnnType<Tfilter>(),
-                                          isConv2D ? memory::format::hwio
-                                                   : memory::format::dhwio);
+      // the layout is Tensorflow's layout (HWIO) and (HWIGO) for
+      // depthwise/group convolutions.
+
+      auto filter_format = is_conv2d ? (is_depthwise ? memory::format::hwigo
+                                                     : memory::format::hwio)
+                                     : memory::format::dhwio;
+
+      DCHECK(!filter_mkl_shape.IsMklTensor());
+      auto filter_md =
+          filter_mkl_shape.IsMklTensor()
+              ? filter_mkl_shape.GetMklLayout()
+              : memory::desc(filter_dims, MklDnnType<Tfilter>(), filter_format);
       filter.SetUsrMem(filter_md, &filter_tensor);
-      // MKLDNN dilation starts from 0.
-      for (int i = 0; i < dilations.size(); i++) dilations[i] -= 1;
 
-      // In some cases, primitve descriptor includes potentialy large buffers,
-      // we don't cache those primitves if the env variable
-      // TF_MKL_OPTIMIZE_PRIMITIVE_MEMUSE is true. MKL DNN allocates buffers
-      // in the following cases
+      // MKLDNN dilations start from 0.
+      for (int i = 0; i < dilations.size(); ++i) --dilations[i];
+
+      // In some cases, primitive descriptor could potentially contain
+      // large buffers. As a result, we don't cache these primitives if the
+      // environment variable `TF_MKL_OPTIMIZE_PRIMITIVE_MEMUSE` is set to True.
+      // MKL-DNN allocates buffers in the following cases:
       //   1. Legacy CPU without AVX512/AVX2, or
-      //   2. 1x1 convolution with stride != 1
+      //   2. 1x1 convolution with strides != 1
       bool do_not_cache =
           MklPrimitiveFactory<Tinput>::IsPrimitiveMemOptEnabled() &&
           (src_dims[MklDnnDims::Dim_N] > kSmallBatchSize) &&
           (MklPrimitiveFactory<Tinput>::IsLegacyPlatform() ||
            IsConv1x1StrideNot1(filter_dims, strides));
 
-      // get a conv2d fwd from primitive pool
+      // Get a conv2d fwd from primitive pool
       MklConvFwdPrimitive<float, Tinput, Tfilter, Tbias, Ttemp_output>*
           conv_fwd = nullptr;
-      if (biasEnabled) {
-        memory::dims bias_dims = {};
+      memory::dims bias_dims = {};
+      if (fuse_biasadd_) {
         conv_utl.GetBiasSizeInMklOrder(kInputIndex_Bias, &bias_dims);
-        MklConvFwdParams convFwdDims(src_dims, filter_dims, bias_dims,
-                                     dst_dims_mkl_order, strides, dilations,
-                                     padding_left, padding_right);
-
-        // TODO(mdfaijul):  Extend the basic parameters for data types and
-        // fusions
-        this->ExtendConvFwdParams(context, convFwdDims);
-
-        conv_fwd = MklConvFwdPrimitiveFactory<float, Tinput, Tfilter, Tbias,
-                                              Ttemp_output>::Get(convFwdDims,
-                                                                 do_not_cache);
-      } else {
-        MklConvFwdParams convFwdDims(src_dims, filter_dims, NONE_DIMS,
-                                     dst_dims_mkl_order, strides, dilations,
-                                     padding_left, padding_right);
+      }
+      MklConvFwdParams convFwdDims(
+          src_dims, filter_dims, fuse_biasadd_ ? bias_dims : NONE_DIMS,
+          dst_dims_mkl_order, strides, dilations, padding_left, padding_right);
 
-        // Extend the basic parameters for data types and fusions
-        this->ExtendConvFwdParams(context, convFwdDims);
+      // TODO(mdfaijul): Extend the basic parameters for data types and fusions
+      this->ExtendConvFwdParams(context, convFwdDims);
 
-        conv_fwd = MklConvFwdPrimitiveFactory<float, Tinput, Tfilter, Tbias,
-                                              Ttemp_output>::Get(convFwdDims,
-                                                                 do_not_cache);
-      }
+      conv_fwd = MklConvFwdPrimitiveFactory<float, Tinput, Tfilter, Tbias,
+                                            Ttemp_output>::Get(convFwdDims,
+                                                               do_not_cache);
 
-      // allocate output tensors output_tensor and filter_out_tensor
-      std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_fwd_pd =
-          conv_fwd->GetPrimitiveDesc();
+      // Allocate output tensors `output_tensor` and `filter_out_tensor`
+      std::shared_ptr<ConvFwdPd> conv_fwd_pd = conv_fwd->GetPrimitiveDesc();
       AllocateOutputTensor(context, *conv_fwd_pd, dst_dims_mkl_order, tf_fmt,
                            &dst_tensor);
       Tensor* filter_out_tensor = nullptr;
@@ -1065,9 +1081,10 @@ class MklConvOp : public OpKernel {
       Ttemp_output* dst_data =
           reinterpret_cast<Ttemp_output*>(dst_tensor->flat<Toutput>().data());
 
-      // check whether src/filter need reorder
+      // Check whether src and filter need to be reordered
       Tinput* src_data = nullptr;
       if (src_md.data.format != conv_fwd->GetSrcMemoryFormat()) {
+        // Reorder src
         src.SetUsrMem(src_md, &src_tensor);
         src.CheckReorderToOpMem(conv_fwd_pd.get()->src_primitive_desc());
         src_data = static_cast<Tinput*>(src.GetOpMem().get_data_handle());
@@ -1075,26 +1092,44 @@ class MklConvOp : public OpKernel {
         src_data = static_cast<Tinput*>(
             const_cast<Tinput*>(src_tensor.flat<Tinput>().data()));
       }
+
       Tfilter* filter_data = nullptr;
       if (filter_md.data.format != conv_fwd->GetFilterMemoryFormat()) {
-        filter.SetUsrMem(filter_md, &filter_tensor);
-        if (filter_out_tensor == nullptr) {
-          filter.CheckReorderToOpMem(
-              conv_fwd_pd.get()->weights_primitive_desc());
-        } else {
-          filter.CheckReorderToOpMem(
-              conv_fwd_pd.get()->weights_primitive_desc(),
-              filter.GetTensorBuffer(filter_out_tensor));
+        bool is_filter_cached = false;
+        // If filter is a constant, we can avoid the conversion of filter from
+        // Tensorflow format to MKL format by caching the filter when it is
+        // converted for the first time. This cached filter can then be reused
+        // in subsequent iterations.
+        if (is_filter_const_) {
+          if (IsFilterCacheEmpty(context)) {
+            // Cache filter if it is not already cached.
+            CacheFilter(context, conv_fwd_pd, filter_data, filter_tensor,
+                        filter, filter_md);
+          }
+          filter_data =
+              GetCachedFilter(context, conv_fwd->GetFilterMemoryFormat());
+          is_filter_cached = (filter_data != nullptr);
+        }
+        if (!is_filter_cached) {
+          filter.SetUsrMem(filter_md, &filter_tensor);
+          if (filter_out_tensor == nullptr) {
+            filter.CheckReorderToOpMem(
+                conv_fwd_pd.get()->weights_primitive_desc());
+          } else {
+            filter.CheckReorderToOpMem(
+                conv_fwd_pd.get()->weights_primitive_desc(),
+                filter.GetTensorBuffer(filter_out_tensor));
+          }
+          filter_data =
+              static_cast<Tfilter*>(filter.GetOpMem().get_data_handle());
         }
-        filter_data =
-            static_cast<Tfilter*>(filter.GetOpMem().get_data_handle());
       } else {
         filter_data = static_cast<Tfilter*>(
             const_cast<Tfilter*>(filter_tensor.flat<Tfilter>().data()));
       }
 
-      // execute convolution
-      if (biasEnabled) {
+      // Execute convolution
+      if (fuse_biasadd_) {
         const Tensor& bias_tensor = MklGetInput(context, kInputIndex_Bias);
         Tbias* bias_data =
             this->GetBiasHandle(context, conv_fwd_pd, bias_tensor);
@@ -1103,7 +1138,7 @@ class MklConvOp : public OpKernel {
         conv_fwd->Execute(src_data, filter_data, dst_data);
       }
 
-      // delete primitive since it is not cached.
+      // Delete primitive since it is not cached.
       if (do_not_cache) delete conv_fwd;
     } catch (mkldnn::error& e) {
       string error_msg = tensorflow::strings::StrCat(
@@ -1117,22 +1152,26 @@ class MklConvOp : public OpKernel {
 
   void PadWithConvFusion(OpKernelContext* context, memory::dims& padding_left,
                          memory::dims& padding_right) {
-    const Tensor& paddings_tf = MklGetInput(context, 2);
+    const Tensor& paddings_tf = MklGetInput(context, input_index_pad_);
     OP_REQUIRES(context, paddings_tf.dims() == 2,
                 errors::InvalidArgument("paddings must be 2-dimensional: ",
                                         paddings_tf.shape().DebugString()));
-    Tpadding* paddings = nullptr;
-    // To get individual pad, need to flatten the tensor
-    paddings = static_cast<Tpadding*>(
+
+    // Flatten tensor to get individual paddings.
+    Tpadding* paddings = static_cast<Tpadding*>(
         const_cast<Tpadding*>(paddings_tf.flat<Tpadding>().data()));
-    // For NHWC format:
-    // paddings[0], paddings[1], paddings[6], paddings[7] should be zero
-    // if the paddings_tf is [ [0, 0] [1,2] [3,4] [0,0] ]
-    // paddings = {0, 0, 1, 2, 3, 4, 0, 0} ; flat method is row major
-    // then, values are: top = 1, bottom =2, left=3, right=4
-    // For NCHW format:
-    // paddings[0], paddings[1], paddings[2], paddings[3] should be zero
-    // similar explanation as NHWC format will apply.
+
+    // If the data format is NHWC, indices 0, 1, 6 and 7 of paddings(_tf)
+    // will be zero.
+    // Example:
+    // paddings_tf = [ [0, 0] [1, 2] [3, 4] [0, 0] ],
+    // flat method = row-major, then:
+    // paddings = {0, 0, 1, 2, 3, 4, 0, 0}.
+    // Hence, the values are: top = 1, bottom = 2, left = 3, right = 4.
+    //
+    // Similarly, if the data format is NCHW, indices 0, 1, 2 and 3 of
+    // paddings(_tf) will be zero.
+    // i.e. for the above example, paddings = {0, 0, 0, 0, 1, 2, 3, 4}.
     int64 pad_top, pad_left;
     int64 pad_bottom, pad_right;
     string data_format = ToString(data_format_);
@@ -1147,6 +1186,7 @@ class MklConvOp : public OpKernel {
       pad_left = paddings[6];
       pad_right = paddings[7];
     }
+
     // Create padding arrays for MKL DNN convolutions.
     // MKL-DNN uses asymetric padding.
     padding_left = {static_cast<int>(pad_top), static_cast<int>(pad_left)};
@@ -1154,6 +1194,17 @@ class MklConvOp : public OpKernel {
   }
 
  protected:
+  void set_fuse_biasadd(bool fuse_biasadd) { fuse_biasadd_ = fuse_biasadd; }
+  void set_fuse_relu(bool fuse_relu) { fuse_relu_ = fuse_relu; }
+  void set_fuse_pad(bool fuse_pad) {
+    fuse_pad_ = fuse_pad;
+    // In PadwithFusedConv OP, pad is the fourth index.
+    input_index_pad_ = 3;
+  }
+
+  // This method is for the base class MklConvOp, which handles the
+  // floating point implementation of Conv. The quantized conv implementations
+  // will use overidden versions of this method.
   virtual void ExtendConvFwdParams(OpKernelContext* context,
                                    MklConvFwdParams& params) {
     // Create a string from data types of input, filter, bias, and output.
@@ -1161,27 +1212,28 @@ class MklConvOp : public OpKernel {
     params.dtypes.append(typeid(Tfilter).name());
     params.dtypes.append(typeid(Tbias).name());
     params.dtypes.append(typeid(Toutput).name());
+
+    // Add fusions as post ops
+    // NOTE: Fusion of BiasAdd is handled directly inside MklConvOp by
+    // checking `fuse_biasadd_` flag.
+    if (fuse_relu_) params.post_op_params.push_back({"relu", {1.0, 0.0, 0.0}});
   }
 
-  virtual Tbias* GetBiasHandle(
-      OpKernelContext* context,
-      std::shared_ptr<mkldnn::convolution_forward::primitive_desc>&
-          conv2d_fwd_pd,
-      const Tensor& bias_tensor) {
-    if (biasEnabled) {
+  virtual Tbias* GetBiasHandle(OpKernelContext* context,
+                               std::shared_ptr<ConvFwdPd>& conv2d_fwd_pd,
+                               const Tensor& bias_tensor) {
+    if (fuse_biasadd_) {
       return static_cast<Tbias*>(
           const_cast<Tbias*>(bias_tensor.flat<Tbias>().data()));
-    } else {
-      return nullptr;
     }
+    return nullptr;
   }
 
-  // Allocate output tensor.
-  virtual void AllocateOutputTensor(
-      OpKernelContext* context,
-      const convolution_forward::primitive_desc& conv_prim_desc,
-      const memory::dims& output_dims_mkl_order,
-      memory::format output_tf_format, Tensor** output_tensor) {
+  virtual void AllocateOutputTensor(OpKernelContext* context,
+                                    const ConvFwdPd& conv_prim_desc,
+                                    const memory::dims& output_dims_mkl_order,
+                                    memory::format output_tf_format,
+                                    Tensor** output_tensor) {
     CHECK_NOTNULL(output_tensor);
     auto dst_pd = conv_prim_desc.dst_primitive_desc();
 
@@ -1212,18 +1264,53 @@ class MklConvOp : public OpKernel {
  private:
   std::vector<int32> strides_;
   std::vector<int32> dilations_;
+  bool is_filter_const_;
+  mutex mu_;
   Padding padding_;
   TensorFormat data_format_;
+  PersistentTensor cached_filter_data_ptensor_ GUARDED_BY(mu_);
+  PersistentTensor cached_filter_md_ptensor_ GUARDED_BY(mu_);
+
+  // Initialize to values the template is instantiated with
+  bool fuse_biasadd_ = bias_enabled;
+  bool fuse_relu_ = false;
+  bool fuse_pad_ = pad_enabled;
+
+  int input_index_pad_ = 2;
+
   const int kInputIndex_Src = 0, kInputIndex_Filter = 1, kInputIndex_Bias = 2;
-  const int kInputIndex_Pad = 2;
   const int kOutputIndex_Dst = 0, kOutputIndex_Filter = 1;
   const int kDilationH = 0, kDilationW = 1;
 
-  // Allocate filter output tensor.
-  void AllocateFilterOutputTensor(
-      OpKernelContext* context,
-      const convolution_forward::primitive_desc& conv_prim_desc,
-      const memory::dims& filter_dims_tf_order, Tensor** filter_tensor) {
+  // Allocate persistent tensors for cached filter data and
+  // cached filter memory descriptor (data format)
+  void AllocatePersistentTensor(OpKernelContext* context,
+                                const ConvFwdPd& conv_prim_desc,
+                                Tensor** filter_tensor) {
+    DCHECK(filter_tensor);
+    TensorShape filter_tf_shape;
+    filter_tf_shape.AddDim(
+        (conv_prim_desc.weights_primitive_desc().get_size() / sizeof(Tfilter)));
+    OP_REQUIRES_OK(context, context->allocate_persistent(
+                                DataTypeToEnum<Tfilter>::value, filter_tf_shape,
+                                &cached_filter_data_ptensor_, filter_tensor));
+
+    Tensor* second_tensor = nullptr;
+    TensorShape filter_mkl_format;
+    filter_mkl_format.AddDim(
+        sizeof(conv_prim_desc.weights_primitive_desc().desc().data.format) /
+        sizeof(DT_INT32));
+    OP_REQUIRES_OK(context, context->allocate_persistent(
+                                DT_INT32, filter_mkl_format,
+                                &cached_filter_md_ptensor_, &second_tensor));
+    second_tensor->scalar<int32>()() =
+        conv_prim_desc.weights_primitive_desc().desc().data.format;
+  }
+
+  void AllocateFilterOutputTensor(OpKernelContext* context,
+                                  const ConvFwdPd& conv_prim_desc,
+                                  const memory::dims& filter_dims_tf_order,
+                                  Tensor** filter_tensor) {
     CHECK_NOTNULL(filter_tensor);
     auto filter_pd = conv_prim_desc.weights_primitive_desc();
 
@@ -1246,12 +1333,14 @@ class MklConvOp : public OpKernel {
     AllocateOutputSetMklShape(context, kOutputIndex_Filter, filter_tensor,
                               filter_tf_shape, filter_mkl_shape);
   }
+
   // Prepare and execute net - checks for input and output reorders.
-  void PrepareAndExecuteNet(
-      const convolution_forward::primitive_desc& conv_prim_desc,
-      MklDnnData<Tinput>* src, MklDnnData<Tfilter>* filter,
-      MklDnnData<Tbias>* bias, MklDnnData<Toutput>* output,
-      Tensor* filter_out_tensor) {
+  void PrepareAndExecuteNet(const ConvFwdPd& conv_prim_desc,
+                            MklDnnData<Tinput>* src,
+                            MklDnnData<Tfilter>* filter,
+                            MklDnnData<Tbias>* bias,
+                            MklDnnData<Toutput>* output,
+                            Tensor* filter_out_tensor) {
     CHECK_NOTNULL(filter_out_tensor);
 
     // Create reorders between user layout and MKL layout if it is needed and
@@ -1267,12 +1356,12 @@ class MklConvOp : public OpKernel {
     // Create convolution primitive and add it to net.
     std::vector<primitive> net;
     if (bias) {
-      DCHECK(biasEnabled);
+      DCHECK(fuse_biasadd_);
       net.push_back(convolution_forward(conv_prim_desc, src->GetOpMem(),
                                         filter->GetOpMem(), bias->GetOpMem(),
                                         output->GetOpMem()));
     } else {
-      DCHECK(!biasEnabled);
+      DCHECK(!fuse_biasadd_);
       net.push_back(convolution_forward(conv_prim_desc, src->GetOpMem(),
                                         filter->GetOpMem(),
                                         output->GetOpMem()));
@@ -1280,15 +1369,124 @@ class MklConvOp : public OpKernel {
 
     stream(stream::kind::eager).submit(net).wait();
   }
+
+  // LOCKS_EXCLUDED annotation ensures that the lock (mu_) cannot
+  // be acquired before entering the function, since it is acquired
+  // inside the function.
+  inline bool IsFilterCacheEmpty(OpKernelContext* context) LOCKS_EXCLUDED(mu_) {
+    tf_shared_lock lock(mu_);
+    const Tensor& cached_filter_data_tensor =
+        *cached_filter_data_ptensor_.AccessTensor(context);
+    return (cached_filter_data_tensor.NumElements() == 0);
+  }
+
+  // Cache the converted filter in a persistent tensor.
+  // Only one thread can execute this method at any given time.
+  void CacheFilter(OpKernelContext* context,
+                   const std::shared_ptr<ConvFwdPd>& conv_fwd_pd,
+                   Tfilter* filter_data, const Tensor& filter_tensor,
+                   MklDnnData<Tfilter>& filter, const memory::desc& filter_md)
+      LOCKS_EXCLUDED(mu_) {
+    mutex_lock lock(mu_);
+    const Tensor& cached_filter_data_tensor =
+        *cached_filter_data_ptensor_.AccessTensor(context);
+
+    // If filter is already cached, there's nothing to do.
+    if (cached_filter_data_tensor.NumElements() > 0) {
+      return;
+    }
+
+    // Otherwise, cache filter
+    filter.SetUsrMem(filter_md, &filter_tensor);
+    filter.CheckReorderToOpMem(conv_fwd_pd.get()->weights_primitive_desc());
+    filter_data = static_cast<Tfilter*>(filter.GetOpMem().get_data_handle());
+
+    Tensor* filter_tensor_ptr = nullptr;
+    AllocatePersistentTensor(context, *conv_fwd_pd, &filter_tensor_ptr);
+    void* cached_filter_data = filter.GetTensorBuffer(filter_tensor_ptr);
+    size_t cached_filter_data_size =
+        filter.GetOpMem().get_primitive_desc().get_size();
+    memcpy(cached_filter_data, filter_data, cached_filter_data_size);
+  }
+
+  Tfilter* GetCachedFilter(OpKernelContext* context,
+                           const memory::format& filter_mf)
+      LOCKS_EXCLUDED(mu_) {
+    tf_shared_lock lock(mu_);
+    const Tensor& cached_filter_data =
+        *cached_filter_data_ptensor_.AccessTensor(context);
+    const Tensor& cached_filter_md =
+        *cached_filter_md_ptensor_.AccessTensor(context);
+
+    // Check if the memory descriptor of the cached weights is same as
+    // filter_mf. If so, we can used the cached weights; otherwise
+    // return NULL.
+    // TODO (bhavanis): Do we need to cast filter_mf before the check?
+    if (cached_filter_md.scalar<int32>().size() &&
+        cached_filter_md.scalar<int32>()() == filter_mf) {
+      return static_cast<Tfilter*>(
+          const_cast<Tfilter*>(cached_filter_data.flat<Tfilter>().data()));
+    }
+    return nullptr;
+  }
+};
+
+// Base class for fused convolution forward operations
+template <typename Device, typename Tinput, typename Tfilter, typename Tbias,
+          typename Toutput, typename Ttemp_output, typename Tpadding,
+          bool pad_enabled>
+class MklFusedConvOp
+    : public MklConvOp<Device, Tinput, Tfilter, Tbias, Toutput, Ttemp_output,
+                       Tpadding, false, false, false> {
+ public:
+  explicit MklFusedConvOp(OpKernelConstruction* context)
+      : MklConvOp<Device, Tinput, Tfilter, Tbias, Toutput, Ttemp_output,
+                  Tpadding, false, false, false>(context) {
+    // Since we came here through the registration of _MklFusedConv2D, get
+    // all information from 'fused_ops' and 'num_args'
+    std::vector<string> fused_ops;
+    OP_REQUIRES_OK(context, context->GetAttr("fused_ops", &fused_ops));
+
+    int num_args;
+    OP_REQUIRES_OK(context, context->GetAttr("num_args", &num_args));
+    OP_REQUIRES(context, !fused_ops.empty(),
+                errors::InvalidArgument(
+                    "Fused Conv2D must have at least one fused op."));
+
+    if (fused_ops == std::vector<string>{"BiasAdd"}) {
+      this->set_fuse_biasadd(true);
+      OP_REQUIRES(context, num_args == 1,
+                  errors::InvalidArgument(
+                      "Fused Conv2D must have one extra argument: bias."));
+    } else if (fused_ops == std::vector<string>{"Relu"}) {
+      this->set_fuse_relu(true);
+    } else if (fused_ops == std::vector<string>{"BiasAdd", "Relu"}) {
+      this->set_fuse_biasadd(true);
+      this->set_fuse_relu(true);
+      OP_REQUIRES(context, num_args == 1,
+                  errors::InvalidArgument(
+                      "Fused Conv2D must have one extra argument: bias."));
+    } else {
+      OP_REQUIRES(context, false,
+                  errors::Unimplemented("Fusion is not implemented: [",
+                                        str_util::Join(fused_ops, ","), "]"));
+    }
+
+    if (pad_enabled) {
+      this->set_fuse_pad(true);
+    }
+  }
+
+  virtual ~MklFusedConvOp() {}
 };
 
-// We create new class for each verison of Quantized Convolution and inherit
+// We create new class for each version of Quantized Convolution and inherit
 // from the FP32 version of the base class
 template <typename Device, typename Tbias, typename Toutput,
-          typename Ttemp_output, bool biasEnabled>
+          typename Ttemp_output, bool bias_enabled>
 class MklQuantizedConv2DOp
     : public MklConvOp<Device, quint8, qint8, Tbias, Toutput, Ttemp_output,
-                       int32, biasEnabled, false> {
+                       int32, bias_enabled, false, false> {
  public:
   virtual ~MklQuantizedConv2DOp() {
     if (this->input_bias_ != nullptr) {
@@ -1304,16 +1502,22 @@ class MklQuantizedConv2DOp
 
   explicit MklQuantizedConv2DOp(OpKernelConstruction* context)
       : MklConvOp<Device, quint8, qint8, Tbias, Toutput, Ttemp_output, int32,
-                  biasEnabled, false>(context) {}
+                  bias_enabled, false, false>(context) {
+    bool is_filter_const;
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("is_filter_const", &is_filter_const));
+    OP_REQUIRES(context, is_filter_const,
+                errors::InvalidArgument("Filter must be a constant"));
+  }
 
   void Compute(OpKernelContext* context) override {
     // Compute int32 output tensor
     MklConvOp<Device, quint8, qint8, Tbias, Toutput, Ttemp_output, int32,
-              biasEnabled, false>::Compute(context);
+              bias_enabled, false, false>::Compute(context);
 
     // Compute additional outputs: min/max scalars.
     int bias_index_offset;
-    bias_index_offset = biasEnabled ? 1 : 0;
+    bias_index_offset = bias_enabled ? 1 : 0;
 
     const float min_input =
         context->input(2 + bias_index_offset).flat<float>()(0);
@@ -1328,9 +1532,9 @@ class MklQuantizedConv2DOp
     float max_output_value;
     if (std::is_same<Toutput, quint8>::value ||
         std::is_same<Toutput, qint8>::value) {
-      // This is the case the convolution and requantization are fused.
+      // This is the case when convolution and requantization are fused.
       // min_freezed_output and max_freezed_output are the actual range
-      // for the output
+      // of the output.
       min_output_value = context->input(6 + bias_index_offset).flat<float>()(0);
       max_output_value = context->input(7 + bias_index_offset).flat<float>()(0);
     } else {
@@ -1356,14 +1560,14 @@ class MklQuantizedConv2DOp
   void ExtendConvFwdParams(OpKernelContext* context,
                            MklConvFwdParams& params) override {
     MklConvOp<Device, quint8, qint8, Tbias, Toutput, Ttemp_output, int32,
-              biasEnabled, false>::ExtendConvFwdParams(context, params);
+              bias_enabled, false, false>::ExtendConvFwdParams(context, params);
 
     // When the output type is quint8, the output data id requantized
     // into quint8. A post_op "output_scale" is added to do the conversion.
     if (std::is_same<Toutput, quint8>::value ||
         std::is_same<Toutput, qint8>::value) {
       int bias_index_offset;
-      bias_index_offset = biasEnabled ? 1 : 0;
+      bias_index_offset = bias_enabled ? 1 : 0;
 
       const float min_input =
           context->input(2 + bias_index_offset).flat<float>()(0);
@@ -1399,12 +1603,11 @@ class MklQuantizedConv2DOp
     }
   }
 
-  Tbias* GetBiasHandle(
-      OpKernelContext* context,
-      std::shared_ptr<mkldnn::convolution_forward::primitive_desc>& conv_fwd_pd,
-      const Tensor& bias_tensor) override {
+  Tbias* GetBiasHandle(OpKernelContext* context,
+                       std::shared_ptr<ConvFwdPd>& conv_fwd_pd,
+                       const Tensor& bias_tensor) override {
     int bias_index_offset;
-    bias_index_offset = biasEnabled ? 1 : 0;
+    bias_index_offset = bias_enabled ? 1 : 0;
 
     const float min_input =
         context->input(2 + bias_index_offset).flat<float>()(0);
@@ -1416,7 +1619,7 @@ class MklQuantizedConv2DOp
         context->input(5 + bias_index_offset).flat<float>()(0);
 
     std::vector<mkldnn::primitive> net;
-    if (biasEnabled) {
+    if (bias_enabled) {
       if (std::is_same<Tbias, qint32>::value) {
         return static_cast<Tbias*>(
             const_cast<Tbias*>(bias_tensor.flat<Tbias>().data()));
@@ -1451,31 +1654,31 @@ class MklQuantizedConv2DOp
 };
 
 template <typename Device, typename Tbias, typename Toutput,
-          typename Ttemp_output, bool biasEnabled>
+          typename Ttemp_output, bool bias_enabled>
 class MklQuantizedConv2DReluOp
     : public MklQuantizedConv2DOp<Device, Tbias, Toutput, Ttemp_output,
-                                  biasEnabled> {
+                                  bias_enabled> {
  public:
   virtual ~MklQuantizedConv2DReluOp() {}
 
   explicit MklQuantizedConv2DReluOp(OpKernelConstruction* context)
-      : MklQuantizedConv2DOp<Device, Tbias, Toutput, Ttemp_output, biasEnabled>(
-            context) {}
+      : MklQuantizedConv2DOp<Device, Tbias, Toutput, Ttemp_output,
+                             bias_enabled>(context) {}
 
  protected:
   void ExtendConvFwdParams(OpKernelContext* context,
                            MklConvFwdParams& params) override {
     MklQuantizedConv2DOp<Device, Tbias, Toutput, Ttemp_output,
-                         biasEnabled>::ExtendConvFwdParams(context, params);
+                         bias_enabled>::ExtendConvFwdParams(context, params);
     params.post_op_params.push_back({"relu", {1.0, 0.0, 0.0}});
   }
 };
 
 template <typename Device, typename Tbias, typename Toutput,
-          typename Ttemp_output, bool biasEnabled>
+          typename Ttemp_output, bool bias_enabled>
 class MklQuantizedConv2DSumReluOp
     : public MklQuantizedConv2DOp<Device, Tbias, Toutput, Ttemp_output,
-                                  biasEnabled> {
+                                  bias_enabled> {
  public:
   virtual ~MklQuantizedConv2DSumReluOp() {
     if (this->summand_ != nullptr) {
@@ -1490,14 +1693,14 @@ class MklQuantizedConv2DSumReluOp
   }
 
   explicit MklQuantizedConv2DSumReluOp(OpKernelConstruction* context)
-      : MklQuantizedConv2DOp<Device, Tbias, Toutput, Ttemp_output, biasEnabled>(
-            context) {}
+      : MklQuantizedConv2DOp<Device, Tbias, Toutput, Ttemp_output,
+                             bias_enabled>(context) {}
 
  protected:
   void ExtendConvFwdParams(OpKernelContext* context,
                            MklConvFwdParams& params) override {
     MklQuantizedConv2DOp<Device, Tbias, Toutput, Ttemp_output,
-                         biasEnabled>::ExtendConvFwdParams(context, params);
+                         bias_enabled>::ExtendConvFwdParams(context, params);
     // Calculate the scale (beta in mkldnn api term) for sum
     if (std::is_same<Toutput, quint8>::value) {
       int summand_idx = context->num_inputs() / 2 - 1 - 2;
@@ -1505,7 +1708,7 @@ class MklQuantizedConv2DSumReluOp
       bool summand_condition =
           (summand_type == DT_QINT8) || (summand_type == DT_QUINT8);
       CHECK((summand_condition));
-      int bias_index_offset = biasEnabled ? 1 : 0;
+      int bias_index_offset = bias_enabled ? 1 : 0;
       const float min_freezed_output =
           context->input(6 + bias_index_offset).flat<float>()(0);
       const float max_freezed_output =
@@ -1531,12 +1734,11 @@ class MklQuantizedConv2DSumReluOp
     params.post_op_params.push_back({"relu", {1.0, 0.0, 0.0}});
   }
 
-  // Allocate output tensor.
-  void AllocateOutputTensor(
-      OpKernelContext* context,
-      const convolution_forward::primitive_desc& conv_prim_desc,
-      const memory::dims& output_dims_mkl_order,
-      memory::format output_tf_format, Tensor** output_tensor) override {
+  void AllocateOutputTensor(OpKernelContext* context,
+                            const ConvFwdPd& conv_prim_desc,
+                            const memory::dims& output_dims_mkl_order,
+                            memory::format output_tf_format,
+                            Tensor** output_tensor) override {
     int summand_idx = context->num_inputs() / 2 - 1;
     float reorder_sum_scale = 1.0;
     if (std::is_same<Toutput, quint8>::value) {
@@ -1551,7 +1753,8 @@ class MklQuantizedConv2DSumReluOp
       auto dst_md = summand_mkl_shape.GetMklLayout();
       if (summand_mkl_shape.IsMklTensor()) {
         if (summand_type == DT_QINT8) {
-          summand.UnsafeCopyFromInternal(summand, DT_QUINT8, summand.shape());
+          OP_REQUIRES_OK(context, summand.BitcastFrom(summand, DT_QUINT8,
+                                                      summand.shape()));
           dst_md.data.data_type =
               static_cast<mkldnn_data_type_t>(MklDnnType<Toutput>());
           summand_mkl_shape.SetMklLayout(&dst_md);
@@ -1568,10 +1771,10 @@ class MklQuantizedConv2DSumReluOp
     }
     // TODO(mdfaijul): Add cleaner code for non-mkl tensor
     MklConvOp<Device, quint8, qint8, Tbias, Toutput, Ttemp_output, int32,
-              biasEnabled, false>::AllocateOutputTensor(context, conv_prim_desc,
-                                                        output_dims_mkl_order,
-                                                        output_tf_format,
-                                                        output_tensor);
+              bias_enabled, false,
+              false>::AllocateOutputTensor(context, conv_prim_desc,
+                                           output_dims_mkl_order,
+                                           output_tf_format, output_tensor);
     const Tensor& summand = MklGetInput(context, summand_idx);
     if (summand.dtype() != DT_FLOAT)
       TF_CHECK_OK(Status(error::Code::FAILED_PRECONDITION,
@@ -1579,7 +1782,7 @@ class MklQuantizedConv2DSumReluOp
     MklDnnShape summand_mkl_shape;
     GetMklShape(context, summand_idx, &summand_mkl_shape);
     // We need to compute scale for the summand
-    int bias_index_offset = biasEnabled ? 1 : 0;
+    int bias_index_offset = bias_enabled ? 1 : 0;
     const float min_input =
         context->input(2 + bias_index_offset).flat<float>()(0);
     const float max_input =
@@ -1846,13 +2049,13 @@ REGISTER_KERNEL_BUILDER(
                               .TypeConstraint<T>("T")                      \
                               .Label(mkl_op_registry::kMklOpLabel),        \
                           MklConvOp<CPUDevice, float, float, float, float, \
-                                    float, int32, false, false>);          \
+                                    float, int32, false, false, false>);   \
   REGISTER_KERNEL_BUILDER(Name("_MklConv2DWithBias")                       \
                               .Device(DEVICE_CPU)                          \
                               .TypeConstraint<T>("T")                      \
                               .Label(mkl_op_registry::kMklOpLabel),        \
                           MklConvOp<CPUDevice, float, float, float, float, \
-                                    float, int32, true, false>);           \
+                                    float, int32, true, false, false>);    \
   REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DWithBias")                 \
                               .Device(DEVICE_CPU)                          \
                               .TypeConstraint<T>("T")                      \
@@ -1864,14 +2067,14 @@ REGISTER_KERNEL_BUILDER(
                               .TypeConstraint<int32>("Tpaddings")          \
                               .Label(mkl_op_registry::kMklOpLabel),        \
                           MklConvOp<CPUDevice, float, float, float, float, \
-                                    float, int32, false, true>);           \
+                                    float, int32, false, true, false>);    \
   REGISTER_KERNEL_BUILDER(Name("_MklPadWithConv2D")                        \
                               .Device(DEVICE_CPU)                          \
                               .TypeConstraint<T>("T")                      \
                               .TypeConstraint<int64>("Tpaddings")          \
                               .Label(mkl_op_registry::kMklOpLabel),        \
                           MklConvOp<CPUDevice, float, float, float, float, \
-                                    float, int64, false, true>);           \
+                                    float, int64, false, true, false>);    \
   REGISTER_KERNEL_BUILDER(Name("__MklDummyPadWithConv2D")                  \
                               .Device(DEVICE_CPU)                          \
                               .TypeConstraint<T>("T")                      \
@@ -1881,6 +2084,48 @@ REGISTER_KERNEL_BUILDER(
 
 TF_CALL_float(REGISTER_MKL_CPU_2D);
 
+#define REGISTER_MKL_CPU_2D_DEPTHWISE(T)                                   \
+  REGISTER_KERNEL_BUILDER(Name("_MklDepthwiseConv2dNative")                \
+                              .Device(DEVICE_CPU)                          \
+                              .TypeConstraint<float>("T")                  \
+                              .Label(mkl_op_registry::kMklOpLabel),        \
+                          MklConvOp<CPUDevice, float, float, float, float, \
+                                    float, int32, false, false, true>);
+
+TF_CALL_float(REGISTER_MKL_CPU_2D_DEPTHWISE);
+
+// Note we are registering _MklFusedConv2D.
+// We check the fused_ops attributes to decide if bias is enabled or not.
+#define REGISTER_MKL_CPU_2D_FUSED(T)                                \
+  REGISTER_KERNEL_BUILDER(                                          \
+      Name("_MklFusedConv2D")                                       \
+          .Device(DEVICE_CPU)                                       \
+          .TypeConstraint<T>("T")                                   \
+          .Label(mkl_op_registry::kMklOpLabel),                     \
+      MklFusedConvOp<CPUDevice, T, T, T, T, T, int32, false>);      \
+  REGISTER_KERNEL_BUILDER(                                          \
+      Name("_MklPadWithFusedConv2D")                                \
+          .Device(DEVICE_CPU)                                       \
+          .TypeConstraint<int32>("Tpaddings")                       \
+          .TypeConstraint<T>("T")                                   \
+          .Label(mkl_op_registry::kMklOpLabel),                     \
+      MklFusedConvOp<CPUDevice, T, T, T, T, T, int32, true>);       \
+  REGISTER_KERNEL_BUILDER(                                          \
+      Name("_MklPadWithFusedConv2D")                                \
+          .Device(DEVICE_CPU)                                       \
+          .TypeConstraint<T>("T")                                   \
+          .TypeConstraint<int64>("Tpaddings")                       \
+          .Label(mkl_op_registry::kMklOpLabel),                     \
+      MklFusedConvOp<CPUDevice, T, T, T, T, T, int64, true>);       \
+  REGISTER_KERNEL_BUILDER(Name("__MklDummyPadWithFusedConv2D")      \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<T>("T")               \
+                              .TypeConstraint<int32>("Tpaddings")   \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklDummyOp<CPUDevice, T>);
+
+TF_CALL_float(REGISTER_MKL_CPU_2D_FUSED);
+
 // Register 3D operations
 #define REGISTER_MKL_CPU_3D(T)                  \
   REGISTER_KERNEL_BUILDER(                      \
@@ -1888,7 +2133,7 @@ TF_CALL_float(REGISTER_MKL_CPU_2D);
           .Device(DEVICE_CPU)                   \
           .TypeConstraint<T>("T")               \
           .Label(mkl_op_registry::kMklOpLabel), \
-      MklConvOp<CPUDevice, T, T, T, T, T, int32, false, false>);
+      MklConvOp<CPUDevice, T, T, T, T, T, int32, false, false, false>);
 TF_CALL_float(REGISTER_MKL_CPU_3D);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/mkl_conv_ops.h b/tensorflow/core/kernels/mkl_conv_ops.h
index e61c20dea9f8c3f8749c302f88a46233dab270b7..c12a4ff0f0c48d5b15c03eb9ee98985930463845 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.h
+++ b/tensorflow/core/kernels/mkl_conv_ops.h
@@ -21,13 +21,13 @@ limitations under the License.
 #include <vector>
 
 #include "mkldnn.hpp"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_slice.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/conv_grad_ops.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -58,7 +58,7 @@ class MklDnnConvUtil {
  public:
   MklDnnConvUtil(OpKernelContext* context, const std::vector<int32>& strides,
                  Padding pad, TensorFormat fm,
-                 const std::vector<int32>& dilations)
+                 const std::vector<int32>& dilations, bool is_depthwise = false)
       : context_(context),
         strides_(strides),
         dilations_(dilations),
@@ -185,7 +185,8 @@ class MklDnnConvUtil {
   // TODO(nhasabni): Add similar function for input and filter in MklShape.
   virtual inline void GetFilterSizeInMklOrder(const TensorShape& input_shape,
                                               const TensorShape& filter_shape,
-                                              memory::dims* filter_dims) {
+                                              memory::dims* filter_dims,
+                                              bool is_depthwise) {
     CHECK_NOTNULL(filter_dims);
 
     OP_REQUIRES(context_, filter_shape.dims() == strides_.size(),
@@ -210,20 +211,37 @@ class MklDnnConvUtil {
                       input_depth, " vs ", filter_shape.dim_size(2)));
 
       // TF filter is always in (rows, cols, in_depth, out_depth) order.
-      int filter_rows = static_cast<int>(filter_shape.dim_size(0));
-      int filter_cols = static_cast<int>(filter_shape.dim_size(1));
-      int in_depth = static_cast<int>(filter_shape.dim_size(2));
-      int out_depth = static_cast<int>(filter_shape.dim_size(3));
-
-      // MKL-DNN always needs filter in OIHW format.
+      int filter_rows =
+          static_cast<int>(filter_shape.dim_size(TF_2DFILTER_DIM_H));
+      int filter_cols =
+          static_cast<int>(filter_shape.dim_size(TF_2DFILTER_DIM_W));
+      int filter_in_depth =
+          static_cast<int>(filter_shape.dim_size(TF_2DFILTER_DIM_I));
+      int filter_out_depth =
+          static_cast<int>(filter_shape.dim_size(TF_2DFILTER_DIM_O));
+      // MKL-DNN always needs filter in OIHW format for regular convolutions
+      // and GOIHW for grouped/depthwise convolutions,
       // OIHW = (out_depth, in_depth, rows, cols)
-      std::vector<int> mkldnn_sizes(4, -1);
-      mkldnn_sizes[MklDnnDims::Dim_O] = out_depth;
-      mkldnn_sizes[MklDnnDims::Dim_I] = in_depth;
-      mkldnn_sizes[MklDnnDims::Dim_H] = filter_rows;
-      mkldnn_sizes[MklDnnDims::Dim_W] = filter_cols;
-
-      *filter_dims = mkldnn_sizes;
+      // GOIHW = (group, out_depth, in_depth, rows, cols)
+      // Specifically for depthwise G=filter_indepth, O=filter_outdepth, I=1
+      if (is_depthwise) {
+        std::vector<int> mkldnn_sizes(5, -1);
+        mkldnn_sizes[MKL_GROUP_FILTER_DIM_G] = filter_in_depth;
+        mkldnn_sizes[MKL_GROUP_FILTER_DIM_O] = filter_out_depth;
+        mkldnn_sizes[MKL_GROUP_FILTER_DIM_I] = 1;
+        mkldnn_sizes[MKL_GROUP_FILTER_DIM_H] = filter_rows;
+        mkldnn_sizes[MKL_GROUP_FILTER_DIM_W] = filter_cols;
+
+        *filter_dims = mkldnn_sizes;
+      } else {
+        std::vector<int> mkldnn_sizes(4, -1);
+        mkldnn_sizes[MklDnnDims::Dim_O] = filter_out_depth;
+        mkldnn_sizes[MklDnnDims::Dim_I] = filter_in_depth;
+        mkldnn_sizes[MklDnnDims::Dim_H] = filter_rows;
+        mkldnn_sizes[MklDnnDims::Dim_W] = filter_cols;
+
+        *filter_dims = mkldnn_sizes;
+      }
     } else {  // Conv3D
       OP_REQUIRES(context_, input_depth == filter_shape.dim_size(3),
                   errors::InvalidArgument(
@@ -231,17 +249,22 @@ class MklDnnConvUtil {
                       input_depth, " vs ", filter_shape.dim_size(3)));
 
       // TF filter is always in (planes, rows, cols, in_depth, out_depth) order.
-      int filter_planes = static_cast<int>(filter_shape.dim_size(0));
-      int filter_rows = static_cast<int>(filter_shape.dim_size(1));
-      int filter_cols = static_cast<int>(filter_shape.dim_size(2));
-      int in_depth = static_cast<int>(filter_shape.dim_size(3));
-      int out_depth = static_cast<int>(filter_shape.dim_size(4));
+      int filter_planes =
+          static_cast<int>(filter_shape.dim_size(TF_3DFILTER_DIM_P));
+      int filter_rows =
+          static_cast<int>(filter_shape.dim_size(TF_3DFILTER_DIM_H));
+      int filter_cols =
+          static_cast<int>(filter_shape.dim_size(TF_3DFILTER_DIM_W));
+      int filter_in_depth =
+          static_cast<int>(filter_shape.dim_size(TF_3DFILTER_DIM_I));
+      int filter_out_depth =
+          static_cast<int>(filter_shape.dim_size(TF_3DFILTER_DIM_O));
 
       // MKL-DNN always needs filter in OIDHW format.
       // OIDHW = (out_depth, in_depth, planes, rows, cols)
       std::vector<int> mkldnn_sizes(5, -1);
-      mkldnn_sizes[MklDnnDims3D::Dim3d_O] = out_depth;
-      mkldnn_sizes[MklDnnDims3D::Dim3d_I] = in_depth;
+      mkldnn_sizes[MklDnnDims3D::Dim3d_O] = filter_out_depth;
+      mkldnn_sizes[MklDnnDims3D::Dim3d_I] = filter_in_depth;
       mkldnn_sizes[MklDnnDims3D::Dim3d_D] = filter_planes;
       mkldnn_sizes[MklDnnDims3D::Dim3d_H] = filter_rows;
       mkldnn_sizes[MklDnnDims3D::Dim3d_W] = filter_cols;
@@ -256,10 +279,12 @@ class MklDnnConvUtil {
   // checks are returned in context's status.
   virtual inline void GetFilterSizeInMklOrder(size_t src_index,
                                               size_t filter_index,
-                                              memory::dims* filter_dims) {
+                                              memory::dims* filter_dims,
+                                              bool is_depthwise) {
     CHECK_NOTNULL(filter_dims);
     GetFilterSizeInMklOrder(GetTfShape(context_, src_index),
-                            GetTfShape(context_, filter_index), filter_dims);
+                            GetTfShape(context_, filter_index), filter_dims,
+                            is_depthwise);
   }
 
   // Calculate Bias size for 2D or 3D Convolution. Function does not
@@ -288,15 +313,16 @@ class MklDnnConvUtil {
       const TensorShape& input_shape, const TensorShape& filter_shape,
       const memory::dims& strides, const memory::dims& dilations,
       memory::dims* output_dims_tf_order, memory::dims* output_dims_mkl_order,
-      memory::dims* pad_l, memory::dims* pad_r, bool padEnabled = false) {
+      memory::dims* pad_l, memory::dims* pad_r, bool pad_enabled = false,
+      bool is_depthwise = false) {
     CHECK_NOTNULL(output_dims_tf_order);
     CHECK_NOTNULL(output_dims_mkl_order);
     CHECK_NOTNULL(pad_l);
     CHECK_NOTNULL(pad_r);
 
-    bool isConv2D = (strides_.size() == 4);
+    bool is_conv2d = (strides_.size() == 4);
     int input_planes, input_rows, input_cols;
-    if (isConv2D) {
+    if (is_conv2d) {
       input_rows = GetTensorDim(input_shape, data_format_, 'H');
       input_cols = GetTensorDim(input_shape, data_format_, 'W');
     } else {
@@ -315,18 +341,18 @@ class MklDnnConvUtil {
     //    Third dimension: cols/width.
 
     int filter_planes, filter_rows, filter_cols;
-    if (isConv2D) {
-      filter_rows = filter_shape.dim_size(0);
-      filter_cols = filter_shape.dim_size(1);
+    if (is_conv2d) {
+      filter_rows = filter_shape.dim_size(TF_2DFILTER_DIM_H);
+      filter_cols = filter_shape.dim_size(TF_2DFILTER_DIM_W);
     } else {
-      filter_planes = filter_shape.dim_size(0);
-      filter_rows = filter_shape.dim_size(1);
-      filter_cols = filter_shape.dim_size(2);
+      filter_planes = filter_shape.dim_size(TF_3DFILTER_DIM_P);
+      filter_rows = filter_shape.dim_size(TF_3DFILTER_DIM_H);
+      filter_cols = filter_shape.dim_size(TF_3DFILTER_DIM_W);
     }
 
     int stride_planes, stride_rows, stride_cols;
     int dilation_planes, dilation_rows, dilation_cols;
-    if (isConv2D) {
+    if (is_conv2d) {
       // Conv2D stride is a vector of 2 elements: {s_r, s_c}
       stride_rows = strides[0];
       stride_cols = strides[1];
@@ -344,23 +370,46 @@ class MklDnnConvUtil {
 
     // Output batch is same as input batch.
     int out_batch = GetTensorDim(input_shape, data_format_, 'N');
+    int out_depth;
 
-    // Output depth is same as last dimension for filter.
-    int out_depth = filter_shape.dim_size(isConv2D ? 3 : 4);
+    // TODO add support for 3-D Depthwise
+
+    // Output depth is same as last dimension for filters for regular
+    // convolutions. For depthwise it is in_depth * channel_multiplier.
+    // The channel_multiplier is the last dimension of TF filter for
+    // depthwise convolutions.
+    if (is_depthwise) {
+      out_depth = (filter_shape.dim_size(TF_2DFILTER_DIM_I) *
+                   filter_shape.dim_size(TF_2DFILTER_DIM_O));
+    } else {
+      out_depth = filter_shape.dim_size(
+          is_conv2d ? static_cast<int>(TF_2DFILTER_DIM_O)
+                    : static_cast<int>(TF_3DFILTER_DIM_O));
+    }
 
     int64 out_rows = 0, out_cols = 0, out_planes = 0;
     int64 pad_top = 0, pad_bottom = 0, pad_left, pad_right;
     int64 pad_D1, pad_D2;
 
-    if (isConv2D) {
+    if (is_conv2d) {
+      Padding padding_type;
+      if (pad_enabled) {
+        padding_type = Padding::EXPLICIT;
+        pad_top = static_cast<int64>((*pad_l)[0]);
+        pad_left = static_cast<int64>((*pad_l)[1]);
+        pad_bottom = static_cast<int64>((*pad_r)[0]);
+        pad_right = static_cast<int64>((*pad_r)[1]);
+      } else {
+        padding_type = padding_;
+      }
       OP_REQUIRES_OK(context_,
                      GetWindowedOutputSizeVerboseV2(
                          input_rows, filter_rows, dilation_rows, stride_rows,
-                         padding_, &out_rows, &pad_top, &pad_bottom));
+                         padding_type, &out_rows, &pad_top, &pad_bottom));
       OP_REQUIRES_OK(context_,
                      GetWindowedOutputSizeVerboseV2(
                          input_cols, filter_cols, dilation_cols, stride_cols,
-                         padding_, &out_cols, &pad_left, &pad_right));
+                         padding_type, &out_cols, &pad_left, &pad_right));
     } else {
       OP_REQUIRES_OK(context_, GetWindowedOutputSizeVerbose(
                                    input_planes, filter_planes, stride_planes,
@@ -373,26 +422,12 @@ class MklDnnConvUtil {
                                    padding_, &out_cols, &pad_left, &pad_right));
     }
 
-    if (isConv2D) {
-      // Conv + pad fusion is enabled only for 2D
-      // If padEnabled, i.e., pad and conv op are fused, then
+    if (is_conv2d) {
+      // Conv + pad fusion is enabled only for 2D.
+      // If pad_enabled, i.e., pad and conv op are fused, then
       // all pads are already passed from pad op through
-      // *pad_l and *pad_r
-      if (padEnabled) {
-        pad_top = static_cast<int64>((*pad_l)[0]);
-        pad_left = static_cast<int64>((*pad_l)[1]);
-        pad_bottom = static_cast<int64>((*pad_r)[0]);
-        pad_right = static_cast<int64>((*pad_r)[1]);
-        // update the out_rows and out_cols based on all
-        // sides of the pads coming from pad op.
-        out_rows = out_rows + (pad_top + pad_bottom) / stride_rows;
-        out_cols = out_cols + (pad_left + pad_right) / stride_cols;
-      }
-      // Handle padding. MKL-DNN uses asymetric padding.
-      // But, if padEnabled, i.e., pad and conv op are fused,
-      // then, *pad_l and *pad_r are already set from pad op.
-      // In that case they need not set here.
-      else {
+      // *pad_l and *pad_r and they don't need to be set here.
+      if (!pad_enabled) {
         *pad_l = {static_cast<int>(pad_top), static_cast<int>(pad_left)};
         *pad_r = {static_cast<int>(pad_bottom), static_cast<int>(pad_right)};
       }
@@ -408,14 +443,14 @@ class MklDnnConvUtil {
     //     Conv3D: NDHWC or NCDHW
     // MKL-DNN uses asymetric padding.
     TensorShape out_shape =
-        isConv2D
+        is_conv2d
             ? ShapeFromFormat(data_format_, out_batch, out_rows, out_cols,
                               out_depth)
             : ShapeFromFormat(data_format_, out_batch,
                               {{out_planes, out_rows, out_cols}}, out_depth);
     *output_dims_tf_order = TFShapeToMklDnnDims(out_shape);
 
-    if (isConv2D) {
+    if (is_conv2d) {
       // For Conv2D, MKL-DNN always needs output in NCHW format.
       std::vector<int> mkldnn_sizes(4, -1);
       mkldnn_sizes[MklDnnDims::Dim_N] = out_batch;
@@ -442,7 +477,7 @@ class MklDnnConvUtil {
       size_t src_index, size_t filter_index, const memory::dims& strides,
       const memory::dims& dilations, memory::dims* output_dims_tf_order,
       memory::dims* output_dims_mkl_order, memory::dims* pad_l,
-      memory::dims* pad_r) {
+      memory::dims* pad_r, bool is_depthwise) {
     CHECK_NOTNULL(output_dims_tf_order);
     CHECK_NOTNULL(output_dims_mkl_order);
     CHECK_NOTNULL(pad_l);
@@ -465,7 +500,8 @@ class MklDnnConvUtil {
 
     GetOutputAndPadSizeInMklOrder(input_tf_shape, filter_tf_shape, strides,
                                   dilations, output_dims_tf_order,
-                                  output_dims_mkl_order, pad_l, pad_r);
+                                  output_dims_mkl_order, pad_l, pad_r,
+                                  is_depthwise);
   }
 
   // Wrapper function to calculate input, filter, and output sizes of
@@ -481,7 +517,8 @@ class MklDnnConvUtil {
       memory::dims* input_dims, memory::dims* filter_dims,
       memory::dims* strides, memory::dims* dilations,
       memory::dims* output_dims_tf_order, memory::dims* output_dims_mkl_order,
-      memory::dims* pad_l, memory::dims* pad_r, bool padEnabled = false) {
+      memory::dims* pad_l, memory::dims* pad_r, bool pad_enabled = false,
+      bool is_depthwise = false) {
     CHECK_NOTNULL(input_dims);
     CHECK_NOTNULL(filter_dims);
     CHECK_NOTNULL(strides);
@@ -493,13 +530,14 @@ class MklDnnConvUtil {
 
     GetInputSizeInMklOrder(input_shape, input_dims);
     if (!context_->status().ok()) return;
-    GetFilterSizeInMklOrder(input_shape, filter_shape, filter_dims);
+    GetFilterSizeInMklOrder(input_shape, filter_shape, filter_dims,
+                            is_depthwise);
     if (!context_->status().ok()) return;
     GetStridesInMklOrder(strides);
     GetDilationsInMklOrder(dilations);
     GetOutputAndPadSizeInMklOrder(
         input_shape, filter_shape, *strides, *dilations, output_dims_tf_order,
-        output_dims_mkl_order, pad_l, pad_r, padEnabled);
+        output_dims_mkl_order, pad_l, pad_r, pad_enabled, is_depthwise);
     if (!context_->status().ok()) return;
   }
 };
@@ -508,7 +546,7 @@ class MklDnnConvUtil {
 ///  Common class that implements ConvBackpropFilter and Input
 /////////////////////////////////////////////////////////////////////
 
-template <typename Device, class T>
+template <typename Device, class T, bool is_depthwise>
 class MklConvBackpropCommonOp : public OpKernel {
  public:
   ~MklConvBackpropCommonOp() {}
@@ -521,28 +559,38 @@ class MklConvBackpropCommonOp : public OpKernel {
     OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
     int stride_n = GetTensorDim(strides_, data_format_, 'N');
     int stride_c = GetTensorDim(strides_, data_format_, 'C');
+    const int64 stride_h = GetTensorDim(strides_, data_format_, 'H');
+    const int64 stride_w = GetTensorDim(strides_, data_format_, 'W');
     OP_REQUIRES(
         context, (stride_n == 1 && stride_c == 1),
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
-    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
 
-    if (strides_.size() == 4) {
-      // Check Conv2D dilations
-      OP_REQUIRES(context, dilations_.size() == 4,
-                  errors::InvalidArgument("Sliding window dilations field must "
-                                          "specify 4 dimensions"));
-      int dilation_n = GetTensorDim(dilations_, data_format_, 'N');
-      int dilation_c = GetTensorDim(dilations_, data_format_, 'C');
-      int dilation_h = GetTensorDim(dilations_, data_format_, 'H');
-      int dilation_w = GetTensorDim(dilations_, data_format_, 'W');
-      OP_REQUIRES(context, (dilation_n == 1 && dilation_c == 1),
-                  errors::InvalidArgument(
-                      "Current implementation does not yet support "
-                      "dilations in the batch and depth dimensions."));
-      OP_REQUIRES(
-          context, dilation_h > 0 && dilation_w > 0,
-          errors::InvalidArgument("Dilated rates should be larger than 0."));
+    // Depthwise Convolution doesn't have dilation parameter
+    if (!is_depthwise) {
+      OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations_));
+      if (strides_.size() == 4) {
+        // Check Conv2D dilations
+        OP_REQUIRES(
+            context, dilations_.size() == 4,
+            errors::InvalidArgument("Sliding window dilations field must "
+                                    "specify 4 dimensions"));
+        int dilation_n = GetTensorDim(dilations_, data_format_, 'N');
+        int dilation_c = GetTensorDim(dilations_, data_format_, 'C');
+        int dilation_h = GetTensorDim(dilations_, data_format_, 'H');
+        int dilation_w = GetTensorDim(dilations_, data_format_, 'W');
+        OP_REQUIRES(context, (dilation_n == 1 && dilation_c == 1),
+                    errors::InvalidArgument(
+                        "Current implementation does not yet support "
+                        "dilations in the batch and depth dimensions."));
+        OP_REQUIRES(
+            context, dilation_h > 0 && dilation_w > 0,
+            errors::InvalidArgument("Dilated rates should be larger than 0."));
+      }
+    } else {
+      // Set dilations as 1 for depthwise conv
+      // for future support to align with Tensorflow
+      dilations_ = {1, 1, 1, 1};
     }
 
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
diff --git a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
index 2ec6c8fa897464be4dba35a5446b8452d12a40d8..1ae42a0d0d74ef7e2e12fe7427cadfc043774c70 100644
--- a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
@@ -13,678 +13,25 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #ifdef INTEL_MKL
-
+#include "mkldnn.hpp"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/util/mkl_util.h"
 #include "tensorflow/core/util/tensor_format.h"
 
-#ifndef INTEL_MKL_ML_ONLY
-#include "mkldnn.hpp"
 using mkldnn::batch_normalization_backward;
 using mkldnn::batch_normalization_forward;
 using mkldnn::prop_kind;
 using mkldnn::stream;
 using mkldnn::use_global_stats;
 using mkldnn::use_scale_shift;
-#else
-#include "mkl_dnn.h"
-#include "mkl_dnn_types.h"
-#endif
-
-#include "tensorflow/core/util/mkl_util.h"
-// TODO(inteltf) Address comments from PR 8968.
 
 namespace tensorflow {
 using CPUDevice = Eigen::ThreadPoolDevice;
 
-#ifdef INTEL_MKL_ML_ONLY
-
-template <typename Device, typename T>
-class MklFusedBatchNormOp : public OpKernel {
- public:
-  explicit MklFusedBatchNormOp(OpKernelConstruction* context)
-      : OpKernel(context) {
-    float epsilon;
-    OP_REQUIRES_OK(context, context->GetAttr("epsilon", &epsilon));
-    epsilon_ = T(epsilon);
-    string tensor_format;
-    OP_REQUIRES_OK(context, context->GetAttr("data_format", &tensor_format));
-    OP_REQUIRES(context, FormatFromString(tensor_format, &tensor_format_),
-                errors::InvalidArgument("Invalid data format"));
-    OP_REQUIRES_OK(context, context->GetAttr("is_training", &is_training_));
-  }
-
-  void Compute(OpKernelContext* context) override {
-    MklFusedBatchNormOpContext mkl_context;
-    const Tensor& input = MklGetInput(context, 0);
-    const Tensor& scale = MklGetInput(context, 1);
-    const Tensor& shift = MklGetInput(context, 2);
-    const Tensor& est_mean = MklGetInput(context, 3);
-    const Tensor& est_variance = MklGetInput(context, 4);
-
-    GetMklShape(context, 0, &(mkl_context.mkl_shape_input_shape));
-    bool input_in_mkl_format = mkl_context.mkl_shape_input_shape.IsMklTensor();
-
-    if (!input_in_mkl_format) {
-      OP_REQUIRES(context, input.dims() == 4,
-                  errors::InvalidArgument("input must be 4-dimensional",
-                                          input.shape().DebugString()));
-    }
-    OP_REQUIRES(context, scale.dims() == 1,
-                errors::InvalidArgument("scale must be 1-dimensional",
-                                        scale.shape().DebugString()));
-    OP_REQUIRES(context, shift.dims() == 1,
-                errors::InvalidArgument("offset must be 1-dimensional",
-                                        shift.shape().DebugString()));
-    OP_REQUIRES(context, est_mean.dims() == 1,
-                errors::InvalidArgument("estimated_mean must be 1-dimensional",
-                                        est_mean.shape().DebugString()));
-
-    OP_REQUIRES(
-        context, est_variance.dims() == 1,
-        errors::InvalidArgument("estimated_variance must be 1-dimensional",
-                                est_variance.shape().DebugString()));
-
-    if (is_training_) {
-      OP_REQUIRES(context, est_mean.dim_size(0) == 0,
-                  errors::InvalidArgument("estimated_mean empty for training",
-                                          est_mean.shape().DebugString()));
-      OP_REQUIRES(context, est_variance.dim_size(0) == 0,
-                  errors::InvalidArgument(
-                      "estimated_variance must be empty for training",
-                      est_variance.shape().DebugString()));
-    }
-
-    unsigned int flag_batch_norm =
-        is_training_ ? dnnUseScaleShift
-                     : (dnnUseInputMeanVariance | dnnUseScaleShift);
-
-    mkl_context.MklExtractParams(context, tensor_format_);
-
-    // Create layout only for input data as it is used in Op primitive.
-    mkl_context.MklCreateInputLayout(context);
-
-    // Create Op primitive.
-    CHECK_EQ(dnnBatchNormalizationCreateForward_v2_F32(
-                 &(mkl_context.mkl_prim_batchnorm), nullptr,
-                 mkl_context.mkl_lt_input, static_cast<float>(epsilon_),
-                 flag_batch_norm),
-             E_SUCCESS);
-
-    // Temporary tensors with buffers for the context inputs, if
-    // conversion to MKL-Op specific layouts are required. It is assumed here
-    // that TF's 1D tensors (scale, shift, est_mean, and est_variance) won't
-    // require any conversion.
-    // Since scale-shift is combined in MKL, a buffer is required.
-    Tensor mkl_tmp_input_buf_tensor, mkl_tmp_scale_shift_buf_tensor;
-    mkl_context.MklPrepareContextInputs(context, &mkl_tmp_input_buf_tensor,
-                                        &mkl_tmp_scale_shift_buf_tensor);
-
-    // Output data in MKL layout
-    Tensor* output = nullptr;
-    TensorShape tf_shape_output;
-    MklShape mkl_shape_output;
-    mkl_shape_output.SetMklTensor(true);
-    mkl_shape_output.SetMklLayout(mkl_context.mkl_prim_batchnorm,
-                                  dnnResourceDst);
-    mkl_shape_output.SetTfLayout(mkl_context.mkl_params.in_dim,
-                                 mkl_context.mkl_params.in_sizes,
-                                 mkl_context.mkl_params.in_strides);
-    mkl_shape_output.SetTfDimOrder(mkl_context.mkl_params.in_dim,
-                                   tensor_format_);
-    tf_shape_output.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
-                               mkl_shape_output.GetMklLayout())) /
-                           sizeof(T));
-    AllocateOutputSetMklShape(context, 0, &output, tf_shape_output,
-                              mkl_shape_output);
-    mkl_context.mkl_res_batchnorm[dnnResourceDst] =
-        static_cast<void*>(output->flat<T>().data());
-
-    // Batch mean in TF layout
-    Tensor* batch_mean = nullptr;
-    MklShape mkl_shape_batch_mean;
-    mkl_shape_batch_mean.SetMklTensor(false);
-    AllocateOutputSetMklShape(context, 1, &batch_mean, scale.shape(),
-                              mkl_shape_batch_mean);
-    // Batch variance in TF layout
-    Tensor* batch_variance = nullptr;
-    MklShape mkl_shape_batch_variance;
-    mkl_shape_batch_variance.SetMklTensor(false);
-    AllocateOutputSetMklShape(context, 2, &batch_variance, scale.shape(),
-                              mkl_shape_batch_variance);
-    // If training mode, set dnnResourceMean and dnnResourceVariance to
-    // output tensors for batch mean and variance.
-    // Otherwise, set dnnResourceMean and dnnResourceVariance to
-    // estimated mean and variance.
-    if (is_training_)
-      mkl_context.MklSetMeanVariance(*batch_mean, *batch_variance);
-    else
-      mkl_context.MklSetMeanVariance(est_mean, est_variance);
-
-    // Now that all resources are set, it is ready for dnnExecute
-    CHECK_EQ(dnnExecute_F32(mkl_context.mkl_prim_batchnorm,
-                            mkl_context.mkl_res_batchnorm),
-             E_SUCCESS);
-
-    // Mean and variance (without Bessel's correction) saved for backward
-    // computation to serve as pre-computed mean and variance.
-    Tensor* saved_mean = nullptr;
-    MklShape mkl_shape_saved_mean;
-    mkl_shape_saved_mean.SetMklTensor(false);
-    AllocateOutputSetMklShape(context, 3, &saved_mean, scale.shape(),
-                              mkl_shape_saved_mean);
-    std::memcpy(
-        reinterpret_cast<char*>(saved_mean->flat<float>().data()),
-        reinterpret_cast<char*>(mkl_context.mkl_res_batchnorm[dnnResourceMean]),
-        scale.NumElements() * sizeof(float));
-    Tensor* saved_variance = nullptr;
-    MklShape mkl_shape_saved_variance;
-    mkl_shape_saved_variance.SetMklTensor(false);
-    AllocateOutputSetMklShape(context, 4, &saved_variance, scale.shape(),
-                              mkl_shape_saved_variance);
-    std::memcpy(reinterpret_cast<char*>(saved_variance->flat<float>().data()),
-                reinterpret_cast<char*>(
-                    mkl_context.mkl_res_batchnorm[dnnResourceVariance]),
-                scale.NumElements() * sizeof(float));
-
-    // Bessel's correction on variance, if training mode is on
-    if (is_training_) {
-      float* p_var = static_cast<float*>(batch_variance->flat<T>().data());
-      auto depth = mkl_context.mkl_params.depth;
-      size_t orig_size = mkl_context.mkl_params.in_sizes[0] *
-                         mkl_context.mkl_params.in_sizes[1] *
-                         mkl_context.mkl_params.in_sizes[3];
-      size_t adjust_size = orig_size - 1;
-      float adjust_factor = (static_cast<float>(orig_size)) / adjust_size;
-      for (int i = 0; i < depth; i++) p_var[i] = adjust_factor * p_var[i];
-    }
-
-    mkl_context.MklCleanup();
-  }
-
- private:
-  T epsilon_;
-  TensorFormat tensor_format_;
-  bool is_training_;
-
-  // Structure containing all info for MklOp
-  typedef struct {
-    // Parameters used for input and output layouts
-    struct MklBatchNormParams {
-      // BatchNormOp src and
-      size_t in_dim;
-      size_t in_sizes[4];
-      size_t in_strides[4];
-      size_t depth;  // Batch normalization is done for per channel.
-    } mkl_params;
-
-    MklShape mkl_shape_input_shape;
-
-    // MKL primitive and resources for BatchNormOp
-    dnnPrimitive_t mkl_prim_batchnorm = nullptr;
-    void* mkl_res_batchnorm[dnnResourceNumber];
-
-    // MKL layouts for inputs in the context
-    dnnLayout_t mkl_lt_input = nullptr;
-
-    void MklCleanup() {
-      bool input_in_mkl_format = mkl_shape_input_shape.IsMklTensor();
-      if (!input_in_mkl_format) dnnLayoutDelete_F32(mkl_lt_input);
-      if (mkl_prim_batchnorm != nullptr) dnnDelete_F32(mkl_prim_batchnorm);
-    }
-
-    void MklExtractParams(OpKernelContext* context,
-                          const TensorFormat& tensor_format) {
-      const Tensor& input = MklGetInput(context, 0);
-      bool input_in_mkl_format = mkl_shape_input_shape.IsMklTensor();
-      mkl_params.in_dim = input_in_mkl_format
-                              ? mkl_shape_input_shape.GetDimension()
-                              : input.dims();
-      mkl_params.in_sizes[0] = static_cast<size_t>(
-          input_in_mkl_format ? mkl_shape_input_shape.GetSizes()[0]
-                              : GetTensorDim(input, tensor_format, 'W'));
-      mkl_params.in_sizes[1] = static_cast<size_t>(
-          input_in_mkl_format ? mkl_shape_input_shape.GetSizes()[1]
-                              : GetTensorDim(input, tensor_format, 'H'));
-      mkl_params.in_sizes[2] = static_cast<size_t>(
-          input_in_mkl_format ? mkl_shape_input_shape.GetSizes()[2]
-                              : GetTensorDim(input, tensor_format, 'C'));
-      mkl_params.in_sizes[3] = static_cast<size_t>(
-          input_in_mkl_format ? mkl_shape_input_shape.GetSizes()[3]
-                              : GetTensorDim(input, tensor_format, 'N'));
-      mkl_params.depth = mkl_params.in_sizes[2];
-      GetStridesFromSizes(tensor_format, mkl_params.in_strides,
-                          mkl_params.in_sizes);
-    }
-
-    void MklCreateInputLayout(OpKernelContext* context) {
-      const Tensor& input = MklGetInput(context, 0);
-      bool input_in_mkl_format = mkl_shape_input_shape.IsMklTensor();
-      if (input_in_mkl_format) {
-        mkl_lt_input =
-            static_cast<dnnLayout_t>(mkl_shape_input_shape.GetCurLayout());
-      } else {
-        CHECK_EQ(
-            dnnLayoutCreate_F32(&mkl_lt_input, mkl_params.in_dim,
-                                mkl_params.in_sizes, mkl_params.in_strides),
-            E_SUCCESS);
-      }
-    }
-    void MklPrepareContextInputs(OpKernelContext* context,
-                                 Tensor* mkl_tmp_input_buf_tensor,
-                                 Tensor* mkl_tmp_scale_shift_buf_tensor) {
-      bool mkl_convert_input;
-      dnnPrimitive_t mkl_prim_convert_input = nullptr;
-      dnnLayout_t mkl_lt_internal_input = nullptr;
-      void* mkl_buf_converted_input = nullptr;
-      // Compare with internal layouts and convert if needed
-      const Tensor& input = MklGetInput(context, 0);
-      void* mkl_buf_input =
-          const_cast<void*>(static_cast<const void*>(input.flat<T>().data()));
-      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(
-                   &mkl_lt_internal_input, mkl_prim_batchnorm, dnnResourceSrc),
-               E_SUCCESS);
-      mkl_convert_input =
-          !dnnLayoutCompare_F32(mkl_lt_internal_input, mkl_lt_input);
-      if (mkl_convert_input) {
-        CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input, mkl_lt_input,
-                                         mkl_lt_internal_input),
-                 E_SUCCESS);
-        AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, mkl_lt_internal_input,
-                       &mkl_buf_converted_input);
-        CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_input, mkl_buf_input,
-                                          mkl_buf_converted_input),
-                 E_SUCCESS);
-        dnnDelete_F32(mkl_prim_convert_input);
-      }
-      dnnLayoutDelete_F32(mkl_lt_internal_input);
-      mkl_res_batchnorm[dnnResourceSrc] =
-          (mkl_convert_input) ? mkl_buf_converted_input : mkl_buf_input;
-
-      // scale-shift layout is created from primitive. So no conversion
-      // is needed, however, a buffer has to be allocated.
-      dnnLayout_t mkl_lt_scale_shift = nullptr;
-      void* mkl_buf_scale_shift = nullptr;
-      CHECK_EQ(
-          dnnLayoutCreateFromPrimitive_F32(
-              &mkl_lt_scale_shift, mkl_prim_batchnorm, dnnResourceScaleShift),
-          E_SUCCESS);
-      AllocTmpBuffer(context, mkl_tmp_scale_shift_buf_tensor,
-                     mkl_lt_scale_shift, &mkl_buf_scale_shift);
-      // Fill the scale-shift buffer with data, presumably buffer is 2D array
-      const Tensor& scale = MklGetInput(context, 1);
-      const Tensor& shift = MklGetInput(context, 2);
-      float* buf_scale_shift = static_cast<float*>(mkl_buf_scale_shift);
-      float* buf_scale = const_cast<float*>(
-          static_cast<const float*>(scale.flat<float>().data()));
-      float* buf_shift = const_cast<float*>(
-          static_cast<const float*>(shift.flat<float>().data()));
-      auto depth = mkl_params.depth;
-      for (int i = 0; i < depth; i++) {
-        buf_scale_shift[i] = buf_scale[i];
-        buf_scale_shift[i + depth] = buf_shift[i];
-      }
-      mkl_res_batchnorm[dnnResourceScaleShift] = mkl_buf_scale_shift;
-    }
-
-    inline void MklSetMeanVariance(const Tensor& mean, const Tensor& variance) {
-      mkl_res_batchnorm[dnnResourceMean] = const_cast<void*>(
-          static_cast<const void*>(mean.flat<float>().data()));
-      mkl_res_batchnorm[dnnResourceVariance] = const_cast<void*>(
-          static_cast<const void*>(variance.flat<float>().data()));
-    }
-  } MklFusedBatchNormOpContext;
-};
-
-template <typename Device, typename T>
-class MklFusedBatchNormGradOp : public OpKernel {
- public:
-  explicit MklFusedBatchNormGradOp(OpKernelConstruction* context)
-      : OpKernel(context) {
-    float epsilon;
-    OP_REQUIRES_OK(context, context->GetAttr("epsilon", &epsilon));
-    epsilon_ = T(epsilon);
-    string tensor_format;
-    OP_REQUIRES_OK(context, context->GetAttr("data_format", &tensor_format));
-    OP_REQUIRES(context, FormatFromString(tensor_format, &tensor_format_),
-                errors::InvalidArgument("Invalid data format"));
-  }
-
-  void Compute(OpKernelContext* context) override {
-    MklFusedBatchNormGradOpContext mkl_context;
-
-    const Tensor& out_backprop = MklGetInput(context, 0);
-    const Tensor& input = MklGetInput(context, 1);
-    const Tensor& scale = MklGetInput(context, 2);
-    const Tensor& saved_mean = MklGetInput(context, 3);
-    const Tensor& saved_var = MklGetInput(context, 4);
-
-    // Here scale, mean, and variance are 1D and considered
-    // those having same layout in MKL and TF
-    GetMklShape(context, 0, &(mkl_context.mkl_shape_out_backprop));
-    GetMklShape(context, 1, &(mkl_context.mkl_shape_input_shape));
-
-    bool input_in_mkl_format = mkl_context.mkl_shape_input_shape.IsMklTensor();
-    bool out_backprop_in_mkl_format =
-        mkl_context.mkl_shape_out_backprop.IsMklTensor();
-    if (!out_backprop_in_mkl_format) {
-      OP_REQUIRES(context, out_backprop.dims() == 4,
-                  errors::InvalidArgument("input must be 4-dimensional",
-                                          out_backprop.shape().DebugString()));
-    }
-    if (!input_in_mkl_format) {
-      OP_REQUIRES(context, input.dims() == 4,
-                  errors::InvalidArgument("input must be 4-dimensional",
-                                          input.shape().DebugString()));
-    }
-    OP_REQUIRES(context, scale.dims() == 1,
-                errors::InvalidArgument("scale must be 1-dimensional",
-                                        scale.shape().DebugString()));
-    OP_REQUIRES(context, saved_mean.dims() == 1,
-                errors::InvalidArgument("saved mean must be 1-dimensional",
-                                        saved_mean.shape().DebugString()));
-    OP_REQUIRES(context, saved_var.dims() == 1,
-                errors::InvalidArgument("saved variance must be 1-dimensional",
-                                        saved_var.shape().DebugString()));
-
-    mkl_context.MklExtractParams(context, tensor_format_);
-
-    mkl_context.MklCreateInputLayout(context);
-
-    unsigned int flag_batch_norm_grad = dnnUseScaleShift;
-
-    // Create Backward Op primitive.
-    CHECK_EQ(dnnBatchNormalizationCreateBackward_v2_F32(
-                 &(mkl_context.mkl_prim_batchnorm_bwd), nullptr,
-                 mkl_context.mkl_lt_input, static_cast<float>(epsilon_),
-                 flag_batch_norm_grad),
-             E_SUCCESS);
-
-    // Temporary tensors and their buffers if conversion is required
-    Tensor mkl_tmp_input_buf_tensor, mkl_tmp_outbackprop_buf_tensor,
-        mkl_tmp_scaleshift_buf_tensor;
-    mkl_context.MklPrepareContextInputs(context, &mkl_tmp_input_buf_tensor,
-                                        &mkl_tmp_outbackprop_buf_tensor,
-                                        &mkl_tmp_scaleshift_buf_tensor);
-
-    // Allocate tensor for grad w.r.t. input(x)
-    Tensor* in_backprop = nullptr;
-    TensorShape tf_shape_in_backprop;
-    MklShape mkl_shape_in_backprop;
-    mkl_shape_in_backprop.SetMklTensor(true);
-    mkl_shape_in_backprop.SetMklLayout(mkl_context.mkl_prim_batchnorm_bwd,
-                                       dnnResourceDiffSrc);
-    mkl_shape_in_backprop.SetTfLayout(mkl_context.mkl_params.in_dims,
-                                      mkl_context.mkl_params.in_sizes,
-                                      mkl_context.mkl_params.in_strides);
-    mkl_shape_in_backprop.SetTfDimOrder(mkl_context.mkl_params.in_dims,
-                                        tensor_format_);
-    tf_shape_in_backprop.AddDim(
-        dnnLayoutGetMemorySize_F32(
-            static_cast<dnnLayout_t>(mkl_shape_in_backprop.GetMklLayout())) /
-        sizeof(T));
-    AllocateOutputSetMklShape(context, 0, &in_backprop, tf_shape_in_backprop,
-                              mkl_shape_in_backprop);
-    mkl_context.mkl_res_batchnorm_bwd[dnnResourceDiffSrc] =
-        static_cast<void*>(in_backprop->flat<T>().data());
-
-    // grad_scale and grad_shift are combined together in MKL
-    // So create a single temporary buffer for those.
-    // Also set dnnResourceDiffScaleShift to the temporary buffer
-    Tensor mkl_tmp_grad_scale_shift_buf_tensor;
-    mkl_context.MklPrepareGradScaleShift(context,
-                                         &mkl_tmp_grad_scale_shift_buf_tensor);
-
-    // All dnn resources are set now, ready to execute
-    CHECK_EQ(dnnExecute_F32(mkl_context.mkl_prim_batchnorm_bwd,
-                            mkl_context.mkl_res_batchnorm_bwd),
-             E_SUCCESS);
-
-    // Now separate out scale and shift grad and copy to individual tensors
-    const TensorShape& tf_shape_scale_shift = scale.shape();
-    // Allocate tensor for grad w.r.t. scale (beta)
-    Tensor* scale_backprop = nullptr;
-    MklShape mkl_shape_scale_backprop;
-    AllocateOutputSetMklShape(context, 1, &scale_backprop, tf_shape_scale_shift,
-                              mkl_shape_scale_backprop);
-
-    // Allocate tensor for grad w.r.t. shift(gamma)
-    Tensor* shift_backprop = nullptr;
-    MklShape mkl_shape_shift_backprop;
-    AllocateOutputSetMklShape(context, 2, &shift_backprop, tf_shape_scale_shift,
-                              mkl_shape_shift_backprop);
-
-    // copy scale and shift grads to tensors
-    float* mkl_buf_scale_shift = const_cast<float*>(static_cast<const float*>(
-        mkl_tmp_grad_scale_shift_buf_tensor.flat<T>().data()));
-    float* tf_buf_scale = const_cast<float*>(
-        static_cast<const float*>(scale_backprop->flat<T>().data()));
-    float* tf_buf_shift = const_cast<float*>(
-        static_cast<const float*>(shift_backprop->flat<T>().data()));
-    auto depth = mkl_context.mkl_params.depth;
-    for (int i = 0; i < depth; i++) {
-      tf_buf_scale[i] = mkl_buf_scale_shift[i];
-      tf_buf_shift[i] = mkl_buf_scale_shift[i + depth];
-    }
-
-    // Two placeholders for estimated_mean and estimated_variance, which are
-    // used for inference and thus not needed here for gradient computation.
-    Tensor* placeholder_1 = nullptr;
-    MklShape mkl_shape_placeholder_1;
-    AllocateOutputSetMklShape(context, 3, &placeholder_1, TensorShape({}),
-                              mkl_shape_placeholder_1);
-    Tensor* placeholder_2 = nullptr;
-    MklShape mkl_shape_placeholder_2;
-    AllocateOutputSetMklShape(context, 4, &placeholder_2, TensorShape({}),
-                              mkl_shape_placeholder_2);
-
-    mkl_context.MklCleanup();
-  }
-
- private:
-  T epsilon_;
-  TensorFormat tensor_format_;
-
-  // Structure containing all info for MklOp
-  typedef struct {
-    // Parameters used for input and output layouts
-    struct MklBatchNormParams {
-      // BatchNormOp src and
-      size_t in_dims;
-      size_t in_sizes[4];
-      size_t in_strides[4];
-      size_t depth;  // Batch normalization is done for per channel.
-    } mkl_params;
-
-    MklShape mkl_shape_out_backprop;
-    MklShape mkl_shape_input_shape;
-
-    // MKL primitive and resources for BatchNormOp
-    dnnPrimitive_t mkl_prim_batchnorm_bwd = nullptr;
-    void* mkl_res_batchnorm_bwd[dnnResourceNumber];
-
-    // MKL layouts for inputs in the context
-    dnnLayout_t mkl_lt_out_backprop = nullptr;
-    dnnLayout_t mkl_lt_input = nullptr;
-
-    void MklCleanup() {
-      bool input_in_mkl_format = mkl_shape_input_shape.IsMklTensor();
-      bool out_backprop_in_mkl_format = mkl_shape_out_backprop.IsMklTensor();
-      if (!input_in_mkl_format) dnnLayoutDelete_F32(mkl_lt_input);
-      if (!out_backprop_in_mkl_format) dnnLayoutDelete_F32(mkl_lt_out_backprop);
-
-      dnnDelete_F32(mkl_prim_batchnorm_bwd);
-    }
-
-    void MklExtractParams(OpKernelContext* context,
-                          const TensorFormat& tensor_format) {
-      const Tensor& input = MklGetInput(context, 1);
-      bool input_in_mkl_format = mkl_shape_input_shape.IsMklTensor();
-      mkl_params.in_dims = input_in_mkl_format
-                               ? mkl_shape_input_shape.GetDimension()
-                               : input.dims();
-      mkl_params.in_sizes[0] = static_cast<size_t>(
-          input_in_mkl_format ? mkl_shape_input_shape.GetSizes()[0]
-                              : GetTensorDim(input, tensor_format, 'W'));
-      mkl_params.in_sizes[1] = static_cast<size_t>(
-          input_in_mkl_format ? mkl_shape_input_shape.GetSizes()[1]
-                              : GetTensorDim(input, tensor_format, 'H'));
-      mkl_params.in_sizes[2] = static_cast<size_t>(
-          input_in_mkl_format ? mkl_shape_input_shape.GetSizes()[2]
-                              : GetTensorDim(input, tensor_format, 'C'));
-      mkl_params.in_sizes[3] = static_cast<size_t>(
-          input_in_mkl_format ? mkl_shape_input_shape.GetSizes()[3]
-                              : GetTensorDim(input, tensor_format, 'N'));
-      mkl_params.depth = mkl_params.in_sizes[2];
-      GetStridesFromSizes(tensor_format, mkl_params.in_strides,
-                          mkl_params.in_sizes);
-    }
-
-    void MklCreateInputLayout(OpKernelContext* context) {
-      const Tensor& input = MklGetInput(context, 0);
-      bool input_in_mkl_format = mkl_shape_input_shape.IsMklTensor();
-      if (input_in_mkl_format) {
-        mkl_lt_input =
-            static_cast<dnnLayout_t>(mkl_shape_input_shape.GetCurLayout());
-      } else {
-        CHECK_EQ(
-            dnnLayoutCreate_F32(&mkl_lt_input, mkl_params.in_dims,
-                                mkl_params.in_sizes, mkl_params.in_strides),
-            E_SUCCESS);
-      }
-
-      bool out_backprop_in_mkl_format = mkl_shape_out_backprop.IsMklTensor();
-      if (out_backprop_in_mkl_format) {
-        mkl_lt_out_backprop =
-            static_cast<dnnLayout_t>(mkl_shape_out_backprop.GetCurLayout());
-      } else {
-        CHECK_EQ(
-            dnnLayoutCreate_F32(&mkl_lt_out_backprop, mkl_params.in_dims,
-                                mkl_params.in_sizes, mkl_params.in_strides),
-            E_SUCCESS);
-      }
-    }
-
-    void MklPrepareContextInputs(OpKernelContext* context,
-                                 Tensor* mkl_tmp_input_buf_tensor,
-                                 Tensor* mkl_tmp_outbackprop_buf_tensor,
-                                 Tensor* mkl_tmp_scaleshift_buf_tensor) {
-      bool mkl_convert_input;
-      dnnPrimitive_t mkl_prim_convert_input = nullptr;
-      dnnLayout_t mkl_lt_internal_input = nullptr;
-      void* mkl_buf_converted_input = nullptr;
-      // Compare with internal layouts and convert if needed
-      const Tensor& input = MklGetInput(context, 1);
-      void* mkl_buf_input =
-          const_cast<void*>(static_cast<const void*>(input.flat<T>().data()));
-      CHECK_EQ(
-          dnnLayoutCreateFromPrimitive_F32(
-              &mkl_lt_internal_input, mkl_prim_batchnorm_bwd, dnnResourceSrc),
-          E_SUCCESS);
-      mkl_convert_input =
-          !dnnLayoutCompare_F32(mkl_lt_internal_input, mkl_lt_input);
-      if (mkl_convert_input) {
-        CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input, mkl_lt_input,
-                                         mkl_lt_internal_input),
-                 E_SUCCESS);
-        AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, mkl_lt_internal_input,
-                       &mkl_buf_converted_input);
-        CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_input, mkl_buf_input,
-                                          mkl_buf_converted_input),
-                 E_SUCCESS);
-        dnnDelete_F32(mkl_prim_convert_input);
-      }
-      dnnLayoutDelete_F32(mkl_lt_internal_input);
-      mkl_res_batchnorm_bwd[dnnResourceSrc] =
-          (mkl_convert_input) ? mkl_buf_converted_input : mkl_buf_input;
-
-      bool mkl_convert_out_backprop;
-      dnnPrimitive_t mkl_prim_convert_out_backprop = nullptr;
-      dnnLayout_t mkl_lt_internal_out_backprop = nullptr;
-      void* mkl_buf_converted_out_backprop = nullptr;
-      // Compare with internal layouts and convert if needed
-      const Tensor& out_backprop = MklGetInput(context, 0);
-      void* mkl_buf_out_backprop = const_cast<void*>(
-          static_cast<const void*>(out_backprop.flat<T>().data()));
-      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_out_backprop,
-                                                mkl_prim_batchnorm_bwd,
-                                                dnnResourceDiffDst),
-               E_SUCCESS);
-      mkl_convert_out_backprop = !dnnLayoutCompare_F32(
-          mkl_lt_internal_out_backprop, mkl_lt_out_backprop);
-      if (mkl_convert_out_backprop) {
-        CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_out_backprop,
-                                         mkl_lt_out_backprop,
-                                         mkl_lt_internal_out_backprop),
-                 E_SUCCESS);
-        AllocTmpBuffer(context, mkl_tmp_outbackprop_buf_tensor,
-                       mkl_lt_internal_out_backprop,
-                       &mkl_buf_converted_out_backprop);
-        CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_out_backprop,
-                                          mkl_buf_out_backprop,
-                                          mkl_buf_converted_out_backprop),
-                 E_SUCCESS);
-        dnnDelete_F32(mkl_prim_convert_out_backprop);
-      }
-      dnnLayoutDelete_F32(mkl_lt_internal_out_backprop);
-      mkl_res_batchnorm_bwd[dnnResourceDiffDst] =
-          (mkl_convert_out_backprop) ? mkl_buf_converted_out_backprop
-                                     : mkl_buf_out_backprop;
-
-      // Set dnnResourceMean and dnnResourceVariance
-      const Tensor& saved_mean = MklGetInput(context, 3);
-      const Tensor& saved_var = MklGetInput(context, 4);
-      void* mkl_buf_saved_mean = const_cast<void*>(
-          static_cast<const void*>(saved_mean.flat<T>().data()));
-      void* mkl_buf_saved_var = const_cast<void*>(
-          static_cast<const void*>(saved_var.flat<T>().data()));
-      mkl_res_batchnorm_bwd[dnnResourceMean] = mkl_buf_saved_mean;
-      mkl_res_batchnorm_bwd[dnnResourceVariance] = mkl_buf_saved_var;
-
-      // Set dnnResourceScaleShift
-      // Note backward Op needs only current values of scale parameters,
-      // shift parameters could be garbage and won't be used
-      const Tensor& scale = MklGetInput(context, 2);
-      dnnLayout_t mkl_lt_scale_shift = nullptr;
-      void* mkl_buf_scale_shift = nullptr;
-      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_scale_shift,
-                                                mkl_prim_batchnorm_bwd,
-                                                dnnResourceScaleShift),
-               E_SUCCESS);
-      AllocTmpBuffer(context, mkl_tmp_scaleshift_buf_tensor, mkl_lt_scale_shift,
-                     &mkl_buf_scale_shift);
-      float* pscale =
-          const_cast<float*>(static_cast<const float*>(scale.flat<T>().data()));
-      float* pscale_shift = static_cast<float*>(mkl_buf_scale_shift);
-      auto depth = mkl_params.depth;
-      for (int i = 0; i < depth; i++) pscale_shift[i] = pscale[i];
-      mkl_res_batchnorm_bwd[dnnResourceScaleShift] = mkl_buf_scale_shift;
-      dnnLayoutDelete_F32(mkl_lt_scale_shift);
-    }
-
-    void MklPrepareGradScaleShift(OpKernelContext* context,
-                                  Tensor* mkl_tmp_grad_scale_shift_buf_tensor) {
-      dnnLayout_t mkl_lt_grad_scaleshift = nullptr;
-      void* mkl_buf_grad_scaleshift = nullptr;
-      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_grad_scaleshift,
-                                                mkl_prim_batchnorm_bwd,
-                                                dnnResourceDiffScaleShift),
-               E_SUCCESS);
-      AllocTmpBuffer(context, mkl_tmp_grad_scale_shift_buf_tensor,
-                     mkl_lt_grad_scaleshift, &mkl_buf_grad_scaleshift);
-      mkl_res_batchnorm_bwd[dnnResourceDiffScaleShift] =
-          mkl_buf_grad_scaleshift;
-      dnnLayoutDelete_F32(mkl_lt_grad_scaleshift);
-    }
-  } MklFusedBatchNormGradOpContext;
-};
-#endif
-
-#ifndef INTEL_MKL_ML_ONLY
-
 struct MklBatchNormFwdParams {
   memory::dims src_dims;
   int depth;
@@ -1765,8 +1112,6 @@ class MklFusedBatchNormGradOp : public OpKernel {
   memory::dims GetMeanVarianceDims() { return memory::dims({1, depth_}); }
 };
 
-#endif
-
 #define REGISTER_MKL_CPU(T)                                         \
   REGISTER_KERNEL_BUILDER(Name("_MklFusedBatchNorm")                \
                               .Device(DEVICE_CPU)                   \
diff --git a/tensorflow/core/kernels/mkl_fused_ops_test.cc b/tensorflow/core/kernels/mkl_fused_ops_test.cc
index 991fb080934883e05e38e91207a111256b885b82..288515de0bcbb9a940cf3e0c790a308762904482 100644
--- a/tensorflow/core/kernels/mkl_fused_ops_test.cc
+++ b/tensorflow/core/kernels/mkl_fused_ops_test.cc
@@ -32,17 +32,21 @@ limitations under the License.
 
 namespace tensorflow {
 
-// Helper class for converting MKL tesnors to TF tensors and comparing to
+// Helper class for converting MKL tensors to TF tensors and comparing to
 // expected values
 
 static const uint8 dummy_tensor[] = {0, 0, 0, 0, 0, 0, 0, 0};
 static const TensorShape dummy_shape({8});
 
-class ConvMklToTF : public OpsTestBase {
+using BiasAddGraphRunner =
+    std::function<void(const Tensor& input_data, const Tensor& filter_data,
+                       const Tensor& bias_data, Tensor* out)>;
+
+template <typename T>
+class CommonTestUtilities : public OpsTestBase {
  public:
-  template <typename T>
-  void ConvertAndCompare(DataType dtype, const Tensor& first,
-                         const Tensor& second, const Tensor& expected) {
+  void PerformConversion(DataType dtype, const Tensor& tensor,
+                         const Tensor& mkl_meta_tensor, Tensor* output) {
     // Create an MKL to TF conversion node and execute it
     TF_EXPECT_OK(NodeDefBuilder("mkl_to_tf_op", "_MklToTf")
                      .Input(FakeInput(dtype))     // Input
@@ -51,16 +55,255 @@ class ConvMklToTF : public OpsTestBase {
                      .Attr("_kernel", "MklOp")
                      .Finalize(node_def()));
     TF_EXPECT_OK(InitOp());
-    AddInputFromArray<T>(first.shape(), first.flat<T>());
-    AddInputFromArray<uint8>(second.shape(), second.flat<uint8>());
+    AddInputFromArray<T>(tensor.shape(), tensor.flat<T>());
+    AddInputFromArray<uint8>(mkl_meta_tensor.shape(),
+                             mkl_meta_tensor.flat<uint8>());
     TF_ASSERT_OK(RunOpKernel());
 
-    const Tensor& output = *GetOutput(0);
+    *output = *GetOutput(0);
+  }
+
+  // Runs a Tensorflow graph defined by the root scope, and fetches the result
+  // of 'fetch' node into the output Tensor.
+  static void RunAndFetch(const tensorflow::Scope& root, const string& fetch,
+                          Tensor* output) {
+    tensorflow::GraphDef graph;
+    TF_ASSERT_OK(root.ToGraphDef(&graph));
+
+    std::unique_ptr<tensorflow::Session> session(
+        tensorflow::NewSession(tensorflow::SessionOptions()));
+    TF_ASSERT_OK(session->Create(graph));
+
+    std::vector<Tensor> unfused_tensors;
+    TF_ASSERT_OK(session->Run({}, {fetch}, {}, &unfused_tensors));
+
+    *output = unfused_tensors[0];
+  }
+
+  void ConvertAndCompare(DataType dtype, const Tensor& tensor,
+                         const Tensor& mkl_meta_tensor,
+                         const Tensor& expected) {
+    Tensor output;
+    PerformConversion(dtype, tensor, mkl_meta_tensor, &output);
     test::ExpectTensorNear<T>(expected, output, 1e-5);
   }
-  void TestBody(){};
+  void TestBody() {}
+
+  static void VerifyBiasAddTensorsClose(int depth, int image_width,
+                                        int image_height, int image_batch_count,
+                                        int filter_size, int filter_count,
+                                        const BiasAddGraphRunner& run_default,
+                                        const BiasAddGraphRunner& run_fused) {
+    DataType dtype = DataTypeToEnum<T>::v();
+
+    Tensor image(dtype, {image_batch_count, image_height, image_width, depth});
+    image.flat<T>() = image.flat<T>().setRandom();
+
+    Tensor filter(dtype, {filter_size, filter_size, depth, filter_count});
+    filter.flat<T>() = filter.flat<T>().setRandom();
+
+    const int bias_size = filter_count;
+    Tensor bias(dtype, {bias_size});
+    bias.flat<T>() = bias.flat<T>().setRandom();
+
+    Tensor conv_2d;
+    Tensor fused_conv_2d;
+
+    run_default(image, filter, bias, &conv_2d);
+    run_fused(image, filter, bias, &fused_conv_2d);
+
+    ASSERT_EQ(conv_2d.dtype(), fused_conv_2d.dtype());
+    ASSERT_EQ(conv_2d.shape(), fused_conv_2d.shape());
+
+    test::ExpectClose(conv_2d, fused_conv_2d);
+  }
 };
 
+// Testing MKL's fused convolution ops
+
+template <typename T>
+class MklFusedConv2DOpTest : public OpsTestBase {
+ protected:
+  static constexpr int kDepth = 3;
+  static constexpr int kImageWidth = 32;
+  static constexpr int kImageHeight = 32;
+  static constexpr int kImageBatchCount = 8;
+
+  void RunConv2DWithBias(const Tensor& input_data, const Tensor& filter_data,
+                         const Tensor& bias_data, Tensor* output,
+                         int stride = 1) {
+    auto root = tensorflow::Scope::NewRootScope();
+
+    auto conv = ops::Conv2D(
+        root.WithOpName("conv"),
+        ops::Const(root.WithOpName("input"), Input::Initializer(input_data)),
+        ops::Const(root.WithOpName("filter"), Input::Initializer(filter_data)),
+        {1, stride, stride, 1}, "SAME");
+
+    auto with_bias = ops::BiasAdd(
+        root.WithOpName("with_bias"), conv,
+        ops::Const(root.WithOpName("bias"), Input::Initializer(bias_data)));
+
+    CommonTestUtilities<T>::RunAndFetch(root, "with_bias", output);
+  }
+
+  void RunConv2DWithBiasAndRelu(const Tensor& input_data,
+                                const Tensor& filter_data,
+                                const Tensor& bias_data, Tensor* output,
+                                int stride = 1) {
+    auto root = tensorflow::Scope::NewRootScope();
+
+    auto conv = ops::Conv2D(
+        root.WithOpName("conv"),
+        ops::Const(root.WithOpName("input"), Input::Initializer(input_data)),
+        ops::Const(root.WithOpName("filter"), Input::Initializer(filter_data)),
+        {1, stride, stride, 1}, "SAME");
+
+    auto with_bias = ops::BiasAdd(
+        root.WithOpName("with_bias"), conv,
+        ops::Const(root.WithOpName("bias"), Input::Initializer(bias_data)));
+
+    auto with_relu = ops::Relu(root.WithOpName("with_relu"), with_bias);
+
+    CommonTestUtilities<T>::RunAndFetch(root, "with_relu", output);
+  }
+
+  void RunMklFusedConv2DOp(const Tensor& image, const Tensor& filter,
+                           const std::vector<Tensor>& args,
+                           const std::vector<string>& fused_ops, Tensor* output,
+                           int stride = 1) {
+    DataType dtype = DataTypeToEnum<T>::v();
+    int num_args = static_cast<int>(args.size());
+
+    TF_EXPECT_OK(NodeDefBuilder("fused_conv_op", "_MklFusedConv2D")
+                     .Input(FakeInput(dtype))
+                     .Input(FakeInput(dtype))
+                     .Input(FakeInput(num_args, dtype))
+                     .Input(FakeInput(DT_UINT8))
+                     .Input(FakeInput(DT_UINT8))
+                     .Input(FakeInput(num_args, DT_UINT8))
+                     .Attr("T", dtype)
+                     .Attr("num_args", num_args)
+                     .Attr("strides", {1, stride, stride, 1})
+                     .Attr("padding", "SAME")
+                     .Attr("fused_ops", fused_ops)
+                     .Attr("_kernel", "MklOp")
+                     .Finalize(node_def()));
+
+    TF_EXPECT_OK(InitOp());
+
+    AddInputFromArray<T>(image.shape(), image.flat<T>());
+    AddInputFromArray<T>(filter.shape(), filter.flat<T>());
+    for (const Tensor& arg : args)
+      AddInputFromArray<T>(arg.shape(), arg.flat<T>());
+    AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+    AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+    for (const Tensor& arg : args)
+      AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+    TF_ASSERT_OK(RunOpKernel());
+
+    // Compare output to expected results
+    const Tensor& output_tensor = *GetOutput(0);
+    // Index 2 will need to be changed if the number of outputs produced
+    // by MklConv2D change.
+    const Tensor& output_meta_tensor = *GetOutput(2);
+    CommonTestUtilities<T> test_util;
+    test_util.PerformConversion(dtype, output_tensor, output_meta_tensor,
+                                output);
+  }
+
+  // Verifies that computing Conv2D+BiasAdd in a graph is identical to
+  // FusedConv2D.
+  void VerifyConv2DWithBias(int filter_size, int filter_count,
+                            int depth = kDepth, int image_width = kImageWidth,
+                            int image_height = kImageHeight,
+                            int image_batch_count = kImageBatchCount) {
+    const BiasAddGraphRunner run_default =
+        [this](const Tensor& input_data, const Tensor& filter_data,
+               const Tensor& bias_data, Tensor* out) {
+          RunConv2DWithBias(input_data, filter_data, bias_data, out);
+        };
+
+    const BiasAddGraphRunner run_fused =
+        [this](const Tensor& input_data, const Tensor& filter_data,
+               const Tensor& bias_data, Tensor* out) {
+          RunMklFusedConv2DOp(input_data, filter_data, {bias_data}, {"BiasAdd"},
+                              out);
+        };
+
+    CommonTestUtilities<T>::VerifyBiasAddTensorsClose(
+        depth, image_width, image_height, image_batch_count, filter_size,
+        filter_count, run_default, run_fused);
+  }
+
+  // Verifies that computing Conv2D+BiasAdd+Relu in a graph is identical to
+  // FusedConv2D.
+  void VerifyConv2DWithBiasAndRelu(int filter_size, int filter_count,
+                                   int depth = kDepth,
+                                   int image_width = kImageWidth,
+                                   int image_height = kImageHeight,
+                                   int image_batch_count = kImageBatchCount) {
+    const BiasAddGraphRunner run_default =
+        [this](const Tensor& input_data, const Tensor& filter_data,
+               const Tensor& bias_data, Tensor* out) {
+          RunConv2DWithBiasAndRelu(input_data, filter_data, bias_data, out);
+        };
+
+    const BiasAddGraphRunner run_fused =
+        [this](const Tensor& input_data, const Tensor& filter_data,
+               const Tensor& bias_data, Tensor* out) {
+          RunMklFusedConv2DOp(input_data, filter_data, {bias_data},
+                              {"BiasAdd", "Relu"}, out);
+        };
+
+    CommonTestUtilities<T>::VerifyBiasAddTensorsClose(
+        depth, image_width, image_height, image_batch_count, filter_size,
+        filter_count, run_default, run_fused);
+  }
+};
+
+template <typename T>
+class MklFusedConv2DWithBiasOpTest : public MklFusedConv2DOpTest<T> {};
+
+TYPED_TEST_CASE_P(MklFusedConv2DWithBiasOpTest);
+
+// -------------------------------------------------------------------------- //
+// Conv2D + BiasAdd + {Relu}                                                  //
+// -------------------------------------------------------------------------- //
+
+TYPED_TEST_P(MklFusedConv2DWithBiasOpTest, OneByOneConvolution) {
+  const int filter_size = 1;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBias(filter_size, filter_count);
+}
+
+TYPED_TEST_P(MklFusedConv2DWithBiasOpTest, SpatialConvolution) {
+  const int filter_size = 3;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBias(filter_size, filter_count);
+}
+
+TYPED_TEST_P(MklFusedConv2DWithBiasOpTest, OneByOneConvolutionAndRelu) {
+  const int filter_size = 1;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBiasAndRelu(filter_size, filter_count);
+}
+
+TYPED_TEST_P(MklFusedConv2DWithBiasOpTest, SpatialConvolutionAndRelu) {
+  const int filter_size = 3;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBiasAndRelu(filter_size, filter_count);
+}
+
+REGISTER_TYPED_TEST_CASE_P(MklFusedConv2DWithBiasOpTest,  //
+                           OneByOneConvolution,           //
+                           SpatialConvolution,            //
+                           OneByOneConvolutionAndRelu,    //
+                           SpatialConvolutionAndRelu);
+
+using MklFusedBiasAddDataTypes = ::testing::Types<float>;
+INSTANTIATE_TYPED_TEST_CASE_P(Test, MklFusedConv2DWithBiasOpTest,
+                              MklFusedBiasAddDataTypes);
 // Testing fusion of pad and convolution
 
 class FusedPadConvOpTest : public OpsTestBase {
@@ -98,8 +341,8 @@ class FusedPadConvOpTest : public OpsTestBase {
     // Compare output to expected results
     const Tensor& first = *GetOutput(0);
     const Tensor& second = *GetOutput(2);
-    ConvMklToTF conv_comp;
-    conv_comp.ConvertAndCompare<T>(dtype, first, second, expected);
+    CommonTestUtilities<T> test_util;
+    test_util.ConvertAndCompare(dtype, first, second, expected);
   }
 };
 
@@ -158,5 +401,295 @@ TEST_F(FusedPadConvOpTest, PaddingConvTestNchw) {
 
   Run<float>(DT_FLOAT, image, filter, padding, expected, "NCHW");
 }
+
+class FilterCacheTest : public OpsTestBase {
+ public:
+  template <typename T>
+  void Run(DataType dtype, Tensor& image, Tensor& filter, Tensor& expected,
+           const bool is_filter_const) {
+    const int stride = 1;
+
+    TF_EXPECT_OK(NodeDefBuilder("conv2d_filter_cache", "_MklConv2D")
+                     .Input(FakeInput(dtype))     // Input
+                     .Input(FakeInput(dtype))     // Filter
+                     .Input(FakeInput(DT_UINT8))  // MKl second tensor
+                     .Input(FakeInput(DT_UINT8))  // MKl second tensor
+                     .Attr("padding", "VALID")
+                     .Attr("data_format", "NHWC")
+                     .Attr("is_filter_const", is_filter_const)
+                     .Attr("T", dtype)
+                     .Attr("strides", {1, stride, stride, 1})
+                     .Attr("_kernel", "MklOp")
+                     .Finalize(node_def()));
+    TF_EXPECT_OK(InitOp());
+
+    // Setting up inputs and execute
+    AddInputFromArray<T>(image.shape(), image.flat<T>());
+    AddInputFromArray<T>(filter.shape(), filter.flat<T>());
+    AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+    AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+
+    TF_ASSERT_OK(RunOpKernel());
+
+    // Compare outputs to expected results
+    const Tensor& output = *GetOutput(0);
+    const Tensor& output_layout = *GetOutput(2);
+    CommonTestUtilities<T> conv_comp;
+    conv_comp.ConvertAndCompare(dtype, output, output_layout, expected);
+
+    // TODO(bhavanis): For now, we rely on internal performance tests to
+    // determine if filter data is being cached and reused.
+    // However, we still need to add a check here to determine if this is
+    // still the case by inspecting the contents of the persistent tensor.
+    TF_ASSERT_OK(RunOpKernel());
+
+    // Compare output to expected results
+    const Tensor& output_new = *GetOutput(0);
+    const Tensor& output_layout_new = *GetOutput(2);
+    CommonTestUtilities<T> conv_comp_new;
+    conv_comp_new.ConvertAndCompare(dtype, output_new, output_layout_new,
+                                    expected);
+  }
+};
+
+TEST_F(FilterCacheTest, Conv2DFilterCacheTest) {
+  const int depth = 1;
+  const int image_width = 4;
+  const int image_height = 3;
+  const int image_batch_count = 1;
+  Tensor image(DT_FLOAT, {image_batch_count, image_height, image_width, depth});
+  test::FillValues<float>(&image, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+
+  const int filter_size = 3;
+  const int filter_count = 1;
+  Tensor filter(DT_FLOAT, {filter_size, filter_size, depth, filter_count});
+  test::FillValues<float>(&filter, {1, 4, 7, 2, 5, 8, 3, 6, 9});
+
+  Tensor expected(DT_FLOAT, TensorShape({1, 1, 2, 1}));
+  test::FillValues<float>(&expected, {312, 357});
+
+  Run<float>(DT_FLOAT, image, filter, expected, true);
+}
+
+// Testing fusion of pad and fusedconv2d
+template <typename T>
+class MklPadWithFusedConv2DOpTest : public OpsTestBase {
+ protected:
+  static constexpr int kDepth = 3;
+  static constexpr int kImageWidth = 30;
+  static constexpr int kImageHeight = 28;
+  static constexpr int kImageBatchCount = 8;
+
+  // 0: top pad, 1: bottom pad, 2: left pad, 3: right pad
+  int padding_list_[4];
+
+  // Verifies that computing Pad+Conv2D+BiasAdd in a graph is identical to
+  // FusedConv2D.
+  void VerifyPadAndConv2DWithBias(int filter_size, int filter_count,
+                                  int depth = kDepth,
+                                  int image_width = kImageWidth,
+                                  int image_height = kImageHeight,
+                                  int image_batch_count = kImageBatchCount) {
+    const BiasAddGraphRunner run_default = [this](const Tensor& input_data,
+                                                  const Tensor& filter_data,
+                                                  const Tensor& bias_data,
+                                                  Tensor* out) {
+      RunMklPadWithFusedConv2DAndBias(input_data, filter_data, bias_data, out);
+    };
+
+    const BiasAddGraphRunner run_fused =
+        [this](const Tensor& input_data, const Tensor& filter_data,
+               const Tensor& bias_data, Tensor* out) {
+          RunMklFusedConv2DWithPadOp(input_data, filter_data, {bias_data},
+                                     {"BiasAdd"}, out);
+        };
+
+    CommonTestUtilities<T>::VerifyBiasAddTensorsClose(
+        depth, image_width, image_height, image_batch_count, filter_size,
+        filter_count, run_default, run_fused);
+  }
+
+  // Verifies that computing Pad+Conv2D+BiasAdd+Relu in a graph is identical to
+  // FusedConv2D.
+  void VerifyPadAndConv2DWithBiasRelu(
+      int filter_size, int filter_count, int depth = kDepth,
+      int image_width = kImageWidth, int image_height = kImageHeight,
+      int image_batch_count = kImageBatchCount) {
+    const BiasAddGraphRunner run_default =
+        [this](const Tensor& input_data, const Tensor& filter_data,
+               const Tensor& bias_data, Tensor* out) {
+          RunMklPadWithFusedConv2DAndBiasRelu(input_data, filter_data,
+                                              bias_data, out);
+        };
+
+    const BiasAddGraphRunner run_fused =
+        [this](const Tensor& input_data, const Tensor& filter_data,
+               const Tensor& bias_data, Tensor* out) {
+          RunMklFusedConv2DWithPadOp(input_data, filter_data, {bias_data},
+                                     {"BiasAdd", "Relu"}, out);
+        };
+
+    CommonTestUtilities<T>::VerifyBiasAddTensorsClose(
+        depth, image_width, image_height, image_batch_count, filter_size,
+        filter_count, run_default, run_fused);
+  }
+
+  void RunMklPadWithFusedConv2DAndBias(const Tensor& input_data,
+                                       const Tensor& filter_data,
+                                       const Tensor& bias_data, Tensor* output,
+                                       int stride = 1) {
+    auto root = tensorflow::Scope::NewRootScope();
+
+    // FusedConv2D only supports NHWC format so we use NHWC here.
+    auto padding = ops::Const(root.WithOpName("padding"),
+                              {0, 0, padding_list_[0], padding_list_[1],
+                               padding_list_[2], padding_list_[3], 0, 0},
+                              {4, 2});
+    auto pad = ops::Pad(
+        root.WithOpName("pad"),
+        ops::Const(root.WithOpName("input"), Input::Initializer(input_data)),
+        padding);
+
+    auto conv = ops::Conv2D(
+        root.WithOpName("conv"), pad,
+        ops::Const(root.WithOpName("filter"), Input::Initializer(filter_data)),
+        {1, stride, stride, 1}, "VALID");
+
+    auto with_bias = ops::BiasAdd(
+        root.WithOpName("with_bias"), conv,
+        ops::Const(root.WithOpName("bias"), Input::Initializer(bias_data)));
+
+    CommonTestUtilities<T>::RunAndFetch(root, "with_bias", output);
+  }
+
+  void RunMklPadWithFusedConv2DAndBiasRelu(const Tensor& input_data,
+                                           const Tensor& filter_data,
+                                           const Tensor& bias_data,
+                                           Tensor* output, int stride = 1) {
+    auto root = tensorflow::Scope::NewRootScope();
+
+    // FusedConv2D only supports NHWC format so we use NHWC here.
+    auto padding = ops::Const(root.WithOpName("padding"),
+                              {0, 0, padding_list_[0], padding_list_[1],
+                               padding_list_[2], padding_list_[3], 0, 0},
+                              {4, 2});
+    auto pad = ops::Pad(
+        root.WithOpName("pad"),
+        ops::Const(root.WithOpName("input"), Input::Initializer(input_data)),
+        padding);
+
+    auto conv = ops::Conv2D(
+        root.WithOpName("conv"), pad,
+        ops::Const(root.WithOpName("filter"), Input::Initializer(filter_data)),
+        {1, stride, stride, 1}, "VALID");
+
+    auto with_bias = ops::BiasAdd(
+        root.WithOpName("with_bias"), conv,
+        ops::Const(root.WithOpName("bias"), Input::Initializer(bias_data)));
+
+    auto with_relu = ops::Relu(root.WithOpName("with_relu"), with_bias);
+
+    CommonTestUtilities<T>::RunAndFetch(root, "with_relu", output);
+  }
+
+  void RunMklFusedConv2DWithPadOp(const Tensor& image, const Tensor& filter,
+                                  const std::vector<Tensor>& args,
+                                  const std::vector<string>& fused_ops,
+                                  Tensor* output, int stride = 1) {
+    DataType dtype = DataTypeToEnum<T>::v();
+    const int num_args = static_cast<int>(args.size());
+    Tensor padding(DT_INT32, {4, 2});
+    test::FillValues<int32>(
+        &padding, {0, 0, padding_list_[0], padding_list_[1], padding_list_[2],
+                   padding_list_[3], 0, 0});
+
+    TF_EXPECT_OK(NodeDefBuilder("pad_fused_conv_op", "_MklPadWithFusedConv2D")
+                     .Input(FakeInput(dtype))
+                     .Input(FakeInput(dtype))
+                     .Input(FakeInput(num_args, dtype))
+                     .Input(FakeInput(DT_INT32))
+                     .Input(FakeInput(DT_UINT8))
+                     .Input(FakeInput(DT_UINT8))
+                     .Input(FakeInput(num_args, DT_UINT8))
+                     .Input(FakeInput(DT_UINT8))
+                     .Attr("T", dtype)
+                     .Attr("num_args", num_args)
+                     .Attr("strides", {1, stride, stride, 1})
+                     .Attr("padding", "VALID")
+                     .Attr("fused_ops", fused_ops)
+                     .Attr("_kernel", "MklOp")
+                     .Finalize(node_def()));
+
+    TF_EXPECT_OK(InitOp());
+
+    AddInputFromArray<T>(image.shape(), image.flat<T>());
+    AddInputFromArray<T>(filter.shape(), filter.flat<T>());
+    for (const Tensor& arg : args)
+      AddInputFromArray<T>(arg.shape(), arg.flat<T>());
+    AddInputFromArray<int32>(padding.shape(), padding.flat<int32>());
+    // Add MKL meta input for input, filter, pad and agrs.
+    for (int i = 0; i < args.size() + 3; ++i)
+      AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+    TF_ASSERT_OK(RunOpKernel());
+
+    // Compare output to expected results
+    const Tensor& output_tensor = *GetOutput(0);
+    // Index 2 will need to be changed if the number of outputs produced
+    // by MklConv2D change.
+    const Tensor& output_meta_tensor = *GetOutput(2);
+    CommonTestUtilities<T> test_util;
+    test_util.PerformConversion(dtype, output_tensor, output_meta_tensor,
+                                output);
+  }
+
+ public:
+  void SetPaddingList(int top, int bottom, int left, int right) {
+    padding_list_[0] = top;
+    padding_list_[1] = bottom;
+    padding_list_[2] = left;
+    padding_list_[3] = right;
+  }
+};
+
+TYPED_TEST_CASE_P(MklPadWithFusedConv2DOpTest);
+
+TYPED_TEST_P(MklPadWithFusedConv2DOpTest, WithBiasAndRoundPad) {
+  const int filter_size = 1;
+  const int filter_count = 12;
+  this->SetPaddingList(2, 2, 1, 1);
+  this->VerifyPadAndConv2DWithBias(filter_size, filter_count);
+}
+
+TYPED_TEST_P(MklPadWithFusedConv2DOpTest, WithBiasAndPartialPad) {
+  const int filter_size = 1;
+  const int filter_count = 12;
+  this->SetPaddingList(4, 0, 2, 0);
+  this->VerifyPadAndConv2DWithBias(filter_size, filter_count);
+}
+
+TYPED_TEST_P(MklPadWithFusedConv2DOpTest, WithBiasReluAndRoundPad) {
+  const int filter_size = 1;
+  const int filter_count = 12;
+  this->SetPaddingList(2, 2, 1, 1);
+  this->VerifyPadAndConv2DWithBiasRelu(filter_size, filter_count);
+}
+
+TYPED_TEST_P(MklPadWithFusedConv2DOpTest, WithBiasReluAndPartialPad) {
+  const int filter_size = 1;
+  const int filter_count = 12;
+  this->SetPaddingList(4, 0, 2, 0);
+  this->VerifyPadAndConv2DWithBiasRelu(filter_size, filter_count);
+}
+
+REGISTER_TYPED_TEST_CASE_P(MklPadWithFusedConv2DOpTest,  //
+                           WithBiasAndRoundPad,          //
+                           WithBiasAndPartialPad,        //
+                           WithBiasReluAndRoundPad,      //
+                           WithBiasReluAndPartialPad);
+
+using MklPadWithFusedConv2DDataTypes = ::testing::Types<float>;
+INSTANTIATE_TYPED_TEST_CASE_P(Test, MklPadWithFusedConv2DOpTest,
+                              MklPadWithFusedConv2DDataTypes);
+
 }  // namespace tensorflow
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_lrn_op.cc b/tensorflow/core/kernels/mkl_lrn_op.cc
index 4d46abb0a4dd232ef13c8b6b0547b0779af1f98f..bc52127b942375c89cea832e3013684687374cb6 100644
--- a/tensorflow/core/kernels/mkl_lrn_op.cc
+++ b/tensorflow/core/kernels/mkl_lrn_op.cc
@@ -24,10 +24,10 @@ limitations under the License.
 #include <vector>
 #include "mkldnn.hpp"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/util/mkl_util.h"
diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.cc b/tensorflow/core/kernels/mkl_pooling_ops_common.cc
index dc84d3941e78a2232041b2dbcf83bf3545982dee..a8d1dffd4e52c8e9a16a0a82cf8c31be9cb628e9 100644
--- a/tensorflow/core/kernels/mkl_pooling_ops_common.cc
+++ b/tensorflow/core/kernels/mkl_pooling_ops_common.cc
@@ -19,8 +19,8 @@ limitations under the License.
 #include <limits>
 #include <vector>
 #include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/mkl_quantized_concat_op_test.cc b/tensorflow/core/kernels/mkl_quantized_concat_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fc68480bbe8b9ed509309a16df2b805fe02e20f1
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_quantized_concat_op_test.cc
@@ -0,0 +1,234 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if defined(INTEL_MKL) && defined(ENABLE_MKL)
+
+#define EIGEN_USE_THREADS
+
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/quantization_utils.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+
+using test::graph::Constant;
+
+static const uint8 dummy_tensor[] = {0, 0, 0, 0, 0, 0, 0, 0};
+static const TensorShape dummy_shape({8});
+
+// Helper class for converting MKL tensors to TF tensors and comparing to
+// expected values
+
+class ConvMklToTF : public OpsTestBase {
+ public:
+  template <typename T>
+  void ConvertMKL2TF(DataType dtype, const Tensor& first, const Tensor& second,
+                     Tensor& output) {
+    // Create an MKL to TF conversion node and execute it
+    TF_EXPECT_OK(NodeDefBuilder("mkl_to_tf_op", "_MklToTf")
+                     .Input(FakeInput(dtype))     // Input
+                     .Input(FakeInput(DT_UINT8))  // MKL second tensor
+                     .Attr("T", dtype)
+                     .Attr("_kernel", "MklOp")
+                     .Finalize(node_def()));
+    TF_EXPECT_OK(InitOp());
+    AddInputFromArray<T>(first.shape(), first.flat<T>());
+    AddInputFromArray<uint8>(second.shape(), second.flat<uint8>());
+    TF_ASSERT_OK(RunOpKernel());
+
+    output = *GetOutput(0);
+  }
+  void TestBody(){};
+};
+
+class QuantizedConcatTest : public OpsTestBase {
+ protected:
+  QuantizedConcatTest() {}
+
+  void TestSmall8Bit(float first_min, float first_max, float second_min,
+                     float second_max);
+  void TestSecondDim8Bit(float first_min, float first_max, float second_min,
+                         float second_max);
+};
+
+TEST_F(QuantizedConcatTest, Small8BitSameRange) {
+  // Range for both is the same, so impl can use memcpy.
+  TestSmall8Bit(0.0f, 255.0f, 0.0f, 255.0f);
+}
+
+void QuantizedConcatTest::TestSmall8Bit(float first_min, float first_max,
+                                        float second_min, float second_max) {
+  TF_ASSERT_OK(NodeDefBuilder("quantized_concat_op", "_MklQuantizedConcatV2")
+                   .Input(FakeInput(2, DT_QUINT8))
+                   .Input(FakeInput(DT_INT32))
+                   .Input(FakeInput(2, DT_FLOAT))
+                   .Input(FakeInput(2, DT_FLOAT))
+                   .Input(FakeInput(2, DT_UINT8))  // MKL second tensor
+                   .Input(FakeInput(DT_UINT8))     // MKL second tensor
+                   .Input(FakeInput(2, DT_UINT8))  // MKL second tensor
+                   .Input(FakeInput(2, DT_UINT8))  // MKL second tensor
+                   .Attr("N", 2)
+                   .Attr("T", DataTypeToEnum<quint8>::v())
+                   .Attr("Tidx", DT_INT32)
+                   .Attr("_kernel", "QuantizedMklOp")
+                   .Finalize(node_def()));
+  TF_ASSERT_OK(InitOp());
+  const int first_batch = 2;
+  const int first_height = 2;
+  const int first_width = 3;
+  const int first_depth = 1;
+  Tensor first_float(DT_FLOAT,
+                     {first_batch, first_height, first_width, first_depth});
+  test::FillValues<float>(&first_float,
+                          {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  Tensor first_quantized =
+      FloatTensorToQuantized<quint8>(first_float, first_min, first_max);
+
+  const int second_batch = 2;
+  const int second_height = 2;
+  const int second_width = 3;
+  const int second_depth = 1;
+  Tensor second_float(
+      DT_FLOAT, {second_batch, second_height, second_width, second_depth});
+  test::FillValues<float>(&second_float,
+                          {13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24});
+  Tensor second_quantized =
+      FloatTensorToQuantized<quint8>(second_float, second_min, second_max);
+
+  const int expected_batch = first_batch + second_batch;
+  Tensor expected_float(
+      DT_FLOAT, {expected_batch, first_height, first_width, first_depth});
+  test::FillValues<float>(&expected_float,
+                          {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+                           13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24});
+
+  AddInputFromArray<quint8>(first_quantized.shape(),
+                            first_quantized.flat<quint8>());
+  AddInputFromArray<quint8>(second_quantized.shape(),
+                            second_quantized.flat<quint8>());
+  AddInputFromArray<int32>(TensorShape({}), {0});
+  AddInputFromArray<float>(TensorShape({}), {first_min});
+  AddInputFromArray<float>(TensorShape({}), {second_min});
+  AddInputFromArray<float>(TensorShape({}), {first_max});
+  AddInputFromArray<float>(TensorShape({}), {second_max});
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  TF_ASSERT_OK(RunOpKernel());
+  const Tensor& output_quantized = *GetOutput(0);
+  const float output_min = GetOutput(1)->flat<float>()(0);
+  const float output_max = GetOutput(2)->flat<float>()(0);
+  Tensor output_float =
+      QuantizedTensorToFloat<quint8>(output_quantized, output_min, output_max);
+  test::ExpectTensorNear<float>(expected_float, output_float, 0.2);
+}
+
+TEST_F(QuantizedConcatTest, SecondDim8BitSameRange) {
+  TestSecondDim8Bit(-10.0f, 150.0f, -10.0f, 150.0f);
+}
+
+void QuantizedConcatTest::TestSecondDim8Bit(float first_min, float first_max,
+                                            float second_min,
+                                            float second_max) {
+  TF_ASSERT_OK(NodeDefBuilder("quantized_concat_op", "_MklQuantizedConcatV2")
+                   .Input(FakeInput(2, DT_QUINT8))
+                   .Input(FakeInput(DT_INT32))
+                   .Input(FakeInput(2, DT_FLOAT))
+                   .Input(FakeInput(2, DT_FLOAT))
+                   .Input(FakeInput(2, DT_UINT8))  // MKL second tensor
+                   .Input(FakeInput(DT_UINT8))     // MKL second tensor
+                   .Input(FakeInput(2, DT_UINT8))  // MKL second tensor
+                   .Input(FakeInput(2, DT_UINT8))  // MKL second tensor
+                   .Attr("N", 2)
+                   .Attr("T", DataTypeToEnum<quint8>::v())
+                   .Attr("Tidx", DT_INT32)
+                   .Attr("_kernel", "QuantizedMklOp")
+                   .Finalize(node_def()));
+  TF_ASSERT_OK(InitOp());
+  const int first_batch = 2;
+  const int first_height = 2;
+  const int first_width = 3;
+  const int first_depth = 1;
+  Tensor first_float(DT_FLOAT,
+                     {first_batch, first_height, first_width, first_depth});
+  test::FillValues<float>(&first_float,
+                          {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  Tensor first_quantized =
+      FloatTensorToQuantized<quint8>(first_float, first_min, first_max);
+
+  const int second_batch = 2;
+  const int second_height = 2;
+  const int second_width = 3;
+  const int second_depth = 1;
+
+  Tensor second_float(
+      DT_FLOAT, {second_batch, second_height, second_width, second_depth});
+  test::FillValues<float>(&second_float,
+                          {13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24});
+  Tensor second_quantized =
+      FloatTensorToQuantized<quint8>(second_float, second_min, second_max);
+
+  const int expected_height = first_height + second_height;
+  Tensor expected_float(
+      DT_FLOAT, {first_batch, expected_height, first_width, first_depth});
+  test::FillValues<float>(&expected_float,
+                          {1, 2, 3, 4,  5,  6,  13, 14, 15, 16, 17, 18,
+                           7, 8, 9, 10, 11, 12, 19, 20, 21, 22, 23, 24});
+
+  AddInputFromArray<quint8>(first_quantized.shape(),
+                            first_quantized.flat<quint8>());
+  AddInputFromArray<quint8>(second_quantized.shape(),
+                            second_quantized.flat<quint8>());
+  AddInputFromArray<int32>(TensorShape({}), {1});
+  AddInputFromArray<float>(TensorShape({}), {first_min});
+  AddInputFromArray<float>(TensorShape({}), {second_min});
+  AddInputFromArray<float>(TensorShape({}), {first_max});
+  AddInputFromArray<float>(TensorShape({}), {second_max});
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  TF_ASSERT_OK(RunOpKernel());
+  const Tensor& output_quantized = *GetOutput(0);
+  const float output_min = GetOutput(1)->flat<float>()(0);
+  const float output_max = GetOutput(2)->flat<float>()(0);
+  Tensor output_float =
+      QuantizedTensorToFloat<quint8>(output_quantized, output_min, output_max);
+  // Using the same error tolerance as in Eigen QuantizedConcat test
+  test::ExpectTensorNear<float>(expected_float, output_float, 1.0);
+}
+
+}  // namespace tensorflow
+
+#endif  // INTEL_MKL && ENABLE_MKL
diff --git a/tensorflow/core/kernels/mkl_quantized_conv_ops.h b/tensorflow/core/kernels/mkl_quantized_conv_ops.h
index 10825f696253cc6d38bbdee1e6b660d494c34088..fef2d837cf27a0854ffc34ad3d1b60831a776fbc 100644
--- a/tensorflow/core/kernels/mkl_quantized_conv_ops.h
+++ b/tensorflow/core/kernels/mkl_quantized_conv_ops.h
@@ -24,8 +24,13 @@ limitations under the License.
 namespace tensorflow {
 template <class T>
 float MklFloatForOneQuantizedLevel(float range_min, float range_max) {
-  const int64 highest = static_cast<int64>(Eigen::NumTraits<T>::highest());
-  const int64 lowest = static_cast<int64>(Eigen::NumTraits<T>::lowest());
+  int64 highest = static_cast<int64>(Eigen::NumTraits<T>::highest());
+  int64 lowest = static_cast<int64>(Eigen::NumTraits<T>::lowest());
+
+  // Adjusting for having a symmetric range.
+  // for example: for 8-bit [-127, 127] as opposed to [-128, 127].
+  if (lowest < -highest) ++lowest;
+
   const float float_for_one_quantized_level =
       (range_max - range_min) / (highest - lowest);
   return float_for_one_quantized_level;
@@ -48,6 +53,35 @@ void MklQuantizationRangeForMultiplication(float min_a, float max_a,
   *min_c = c_float_for_one_quant_level * c_lowest;
   *max_c = c_float_for_one_quant_level * c_highest;
 }
+
+template <class T1, class T2, class T3>
+void MklQuantizationRangeForMultiplication(float min_a, float max_a,
+                                           const Tensor& min_b_vector,
+                                           const Tensor& max_b_vector,
+                                           Tensor** min_c_vector,
+                                           Tensor** max_c_vector) {
+  DCHECK(min_b_vector.NumElements() == (*min_c_vector)->NumElements());
+  DCHECK(max_b_vector.NumElements() == (*max_c_vector)->NumElements());
+  size_t n_channel = min_b_vector.NumElements();
+  const int64 c_highest = static_cast<int64>(Eigen::NumTraits<T3>::highest());
+  const int64 c_lowest = static_cast<int64>(Eigen::NumTraits<T3>::lowest());
+  const float* min_b = min_b_vector.flat<float>().data();
+  const float* max_b = max_b_vector.flat<float>().data();
+  float* min_c = (*min_c_vector)->flat<float>().data();
+  float* max_c = (*max_c_vector)->flat<float>().data();
+#pragma omp parallel for
+  for (size_t n = 0; n < n_channel; ++n) {
+    float a_float_for_one_quant_level =
+        MklFloatForOneQuantizedLevel<T1>(min_a, max_a);
+    float b_float_for_one_quant_level =
+        MklFloatForOneQuantizedLevel<T2>(min_b[n], max_b[n]);
+    float c_float_for_one_quant_level =
+        a_float_for_one_quant_level * b_float_for_one_quant_level;
+    min_c[n] = c_float_for_one_quant_level * c_lowest;
+    max_c[n] = c_float_for_one_quant_level * c_highest;
+  }
+}
+
 }  // namespace tensorflow
 
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc
index 708213648b48e2dfbbfe9a63851428aa97c72b64..19585969993d6eaf16b62f7abcf01fdefae3fad4 100644
--- a/tensorflow/core/kernels/mkl_relu_op.cc
+++ b/tensorflow/core/kernels/mkl_relu_op.cc
@@ -16,15 +16,14 @@ limitations under the License.
 // See docs in ../ops/nn_ops.cc.
 #ifdef INTEL_MKL
 
+#include "mkldnn.hpp"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/errors.h"
-
-#ifndef INTEL_MKL_ML_ONLY
-#include "mkldnn.hpp"
+#include "tensorflow/core/util/mkl_util.h"
 
 using mkldnn::algorithm;
 using mkldnn::eltwise_bounded_relu;
@@ -36,16 +35,9 @@ using mkldnn::prop_kind;
 using mkldnn::relu_backward;
 using mkldnn::relu_forward;
 using mkldnn::stream;
-#else
-#include "mkl_dnn.h"
-#include "mkl_dnn_types.h"
-#endif
-#include "tensorflow/core/util/mkl_util.h"
 
 namespace tensorflow {
 
-#ifndef INTEL_MKL_ML_ONLY
-
 template <typename T>
 class MklEltwiseFwdParams {
  public:
@@ -204,7 +196,7 @@ class MklEltwiseFwdPrimitiveFactory : public MklPrimitiveFactory<T> {
   ~MklEltwiseFwdPrimitiveFactory() {}
 
   static string CreateKey(const MklEltwiseFwdParams<T>& fwdParams,
-                               memory::format src_fmt) {
+                          memory::format src_fmt) {
     string prefix = "eltwise_fwd";
     FactoryKeyCreator key_creator;
     key_creator.AddAsKey(prefix);
@@ -422,8 +414,8 @@ class MklEltwiseBwdPrimitiveFactory : public MklPrimitiveFactory<T> {
 
  private:
   static string CreateKey(const MklEltwiseBwdParams<T>& bwdParams,
-                               const memory::format& src_fmt,
-                               const memory::format& diff_dst_fmt) {
+                          const memory::format& src_fmt,
+                          const memory::format& diff_dst_fmt) {
     string prefix = "eltwise_bwd";
     FactoryKeyCreator key_creator;
     key_creator.AddAsKey(prefix);
@@ -451,335 +443,8 @@ class MklEltwiseBwdPrimitiveFactory : public MklPrimitiveFactory<T> {
   }
 };
 
-#endif
-
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
-struct MklReluHelpers {
-  static void ValidateSameSizeHelper(OpKernelContext* context, const Tensor& g,
-                                     const Tensor& a) {
-    OP_REQUIRES(context, a.IsSameSize(g),
-                errors::InvalidArgument("g and a must be the same size"));
-  }
-  static bool ValidateSameSize(OpKernelContext* context, const Tensor& g,
-                               const Tensor& a) {
-    ValidateSameSizeHelper(context, g, a);
-    return context->status().ok();
-  }
-};
-
-#ifdef INTEL_MKL_ML_ONLY
-
-template <typename Device, typename T>
-class MklReluOp : public OpKernel {
- public:
-  ~MklReluOp() {}
-
-  explicit MklReluOp(OpKernelConstruction* context) : OpKernel(context) {}
-
-  void Compute(OpKernelContext* context) override {
-    MklReluOpContext mkl_context;
-
-    const Tensor& input = MklGetInput(context, 0);
-    GetMklShape(context, 0, &mkl_context.input_shape);
-    void* user_i = static_cast<void*>(const_cast<T*>(input.flat<T>().data()));
-    bool input_in_mkl_format = mkl_context.input_shape.IsMklTensor();
-
-    if (!input_in_mkl_format && !input.dims()) {  // handle the case of a scalar
-      const TensorShape& o_shape = input.shape();
-      Tensor* out_tensor = nullptr;
-      mkl_context.output_shape.SetMklTensor(false);
-      AllocateOutputSetMklShape(context, 0, &out_tensor, o_shape,
-                                mkl_context.output_shape);
-      void* out_o = static_cast<void*>(out_tensor->flat<T>().data());
-      (static_cast<T*>(out_o))[0] =
-          std::max((static_cast<T*>(user_i))[0], static_cast<T>(0));
-      return;
-    }
-
-    // Generate size, stride for input if input is in MKL format.
-    if (input_in_mkl_format) {
-      mkl_context.in_dims = mkl_context.input_shape.GetDimension();
-      mkl_context.in_sizes = new size_t[mkl_context.in_dims];
-      mkl_context.in_strides = new size_t[mkl_context.in_dims];
-      for (int i = 0; i < mkl_context.in_dims; i++) {
-        mkl_context.in_sizes[i] = mkl_context.input_shape.GetSizes()[i];
-        mkl_context.in_strides[i] = mkl_context.input_shape.GetStrides()[i];
-      }
-    } else {
-      mkl_context.in_dims = input.dims();
-      mkl_context.in_sizes = new size_t[mkl_context.in_dims];
-      mkl_context.in_strides = new size_t[mkl_context.in_dims];
-      for (int i = 0; i < mkl_context.in_dims; i++) {
-        mkl_context.in_sizes[i] = input.dim_size((mkl_context.in_dims - 1) - i);
-      }
-      mkl_context.in_strides[0] = 1;
-      for (int i = 1; i < mkl_context.in_dims; i++) {
-        mkl_context.in_strides[i] =
-            mkl_context.in_strides[i - 1] * mkl_context.in_sizes[i - 1];
-      }
-    }
-
-    float negative_slope = 0.0;
-    mkl_context.MklCreateInputLayouts(context);
-    CHECK_EQ(dnnReLUCreateForward_F32(&mkl_context.prim_relu_fwd, NULL,
-                                      mkl_context.lt_input, negative_slope),
-             E_SUCCESS);
-
-    Tensor* output = nullptr;
-
-    if (input_in_mkl_format) {
-      TensorShape tf_shape;
-      mkl_context.output_shape.SetMklTensor(true);
-      mkl_context.output_shape.SetMklLayout(mkl_context.prim_relu_fwd,
-                                            dnnResourceDst);
-      mkl_context.output_shape.SetTfLayout(
-          mkl_context.in_dims, mkl_context.in_sizes, mkl_context.in_strides);
-      mkl_context.output_shape.SetTfDimOrder(
-          mkl_context.in_dims, mkl_context.input_shape.GetTfToMklDimMap());
-      tf_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
-                          mkl_context.output_shape.GetMklLayout())) /
-                      sizeof(T));
-      AllocateOutputSetMklShape(context, 0, &output, tf_shape,
-                                mkl_context.output_shape);
-    } else {
-      const TensorShape& o_shape = input.shape();
-      mkl_context.output_shape.SetMklTensor(false);
-      AllocateOutputSetMklShape(context, 0, &output, o_shape,
-                                mkl_context.output_shape);
-    }
-
-    void* user_o = static_cast<void*>(const_cast<T*>(output->flat<T>().data()));
-
-    mkl_context.relu_res[dnnResourceDst] = user_o;
-    mkl_context.relu_res[dnnResourceSrc] = user_i;
-    CHECK_EQ(dnnExecute_F32(mkl_context.prim_relu_fwd, mkl_context.relu_res),
-             E_SUCCESS);
-    mkl_context.MklCleanup();
-  }
-
- private:
-  typedef struct {
-    int in_dims;
-    size_t* in_sizes;
-    size_t* in_strides;
-    MklShape input_shape, output_shape;
-    dnnPrimitive_t prim_relu_fwd = nullptr;
-    void* relu_res[dnnResourceNumber];
-    dnnLayout_t lt_input = nullptr;
-
-    void MklCleanup() {
-      bool input_in_mkl_format = input_shape.IsMklTensor();
-      if (!input_in_mkl_format) {
-        dnnLayoutDelete_F32(lt_input);
-        free(in_sizes);
-        free(in_strides);
-      }
-      dnnDelete_F32(prim_relu_fwd);
-    }
-
-    void MklCreateInputLayouts(OpKernelContext* context) {
-      bool input_in_mkl_format = input_shape.IsMklTensor();
-      if (!input_in_mkl_format) {
-        CHECK_EQ(dnnLayoutCreate_F32(&lt_input, in_dims, in_sizes, in_strides),
-                 E_SUCCESS);
-      } else {
-        lt_input = static_cast<dnnLayout_t>(input_shape.GetCurLayout());
-      }
-    }
-  } MklReluOpContext;
-};
-
-template <typename Device, typename T>
-class MklReluGradOp : public OpKernel {
- public:
-  ~MklReluGradOp() {}
-
-  explicit MklReluGradOp(OpKernelConstruction* context) : OpKernel(context) {}
-
-  void Compute(OpKernelContext* context) override;
-
- private:
-  typedef struct {
-    int in_dims;
-    size_t* in_sizes;
-    size_t* in_strides;
-    MklShape input_shape, grad_shape, output_shape;
-    void* relu_res[dnnResourceNumber];
-    dnnPrimitive_t prim_relu_bwd;
-    dnnLayout_t lt_input, lt_grad;
-
-    void MklPrepareReluGradInputs(OpKernelContext* context,
-                                  Tensor* mkl_tmp_input_buf_tensor) {
-      const Tensor& g = MklGetInput(context, 0);
-      const Tensor& a = MklGetInput(context, 1);
-      void* buf_input = static_cast<void*>(const_cast<T*>(a.flat<T>().data()));
-      void* mkl_buffer_convert = nullptr;
-
-      dnnPrimitive_t cv_input_to_grad = nullptr;
-
-      // if input and grad are not in the same layout,
-      // do a conversion between them.
-      if (!dnnLayoutCompare_F32(lt_input, lt_grad)) {
-        AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, lt_grad,
-                       &mkl_buffer_convert);
-        CHECK_EQ(dnnConversionCreate_F32(&cv_input_to_grad, lt_input, lt_grad),
-                 E_SUCCESS);
-        CHECK_EQ(dnnConversionExecute_F32(cv_input_to_grad, buf_input,
-                                          mkl_buffer_convert),
-                 E_SUCCESS);
-        relu_res[dnnResourceSrc] = mkl_buffer_convert;
-        dnnDelete_F32(cv_input_to_grad);
-      } else {
-        relu_res[dnnResourceSrc] = buf_input;
-      }
-
-      void* buf_grad = static_cast<void*>(const_cast<T*>(g.flat<T>().data()));
-      relu_res[dnnResourceDiffDst] = buf_grad;
-    }
-
-    void MklCreateInputLayouts(OpKernelContext* context) {
-      bool grad_is_mkl = grad_shape.IsMklTensor();
-      bool input_is_mkl = input_shape.IsMklTensor();
-      if (!input_is_mkl) {
-        CHECK_EQ(dnnLayoutCreate_F32(&lt_input, in_dims, in_sizes, in_strides),
-                 E_SUCCESS);
-      } else {
-        lt_input = static_cast<dnnLayout_t>(input_shape.GetCurLayout());
-      }
-
-      if (!grad_is_mkl) {
-        CHECK_EQ(dnnLayoutCreate_F32(&lt_grad, in_dims, in_sizes, in_strides),
-                 E_SUCCESS);
-      } else {
-        lt_grad = static_cast<dnnLayout_t>(grad_shape.GetCurLayout());
-      }
-    }
-
-    void MklCleanup() {
-      bool grad_is_mkl = grad_shape.IsMklTensor();
-      bool input_is_mkl = input_shape.IsMklTensor();
-      dnnDelete_F32(prim_relu_bwd);
-      if (!input_is_mkl) {
-        dnnLayoutDelete_F32(lt_input);
-        free(in_sizes);
-        free(in_strides);
-      }
-      if (!grad_is_mkl) {
-        dnnLayoutDelete_F32(lt_grad);
-      }
-    }
-  } MklReluGradOpContext;
-};
-
-template <typename Device, typename T>
-void MklReluGradOp<Device, T>::Compute(OpKernelContext* context) {
-  MklReluGradOpContext mkl_context;
-  const Tensor& g = MklGetInput(context, 0);
-  const Tensor& a = MklGetInput(context, 1);
-
-  void* user_i = static_cast<void*>(const_cast<T*>(a.flat<T>().data()));
-  void* user_g = static_cast<void*>(const_cast<T*>(g.flat<T>().data()));
-
-  GetMklShape(context, 0, &mkl_context.grad_shape);
-  GetMklShape(context, 1, &mkl_context.input_shape);
-
-  bool grad_is_mkl = mkl_context.grad_shape.IsMklTensor();
-  bool input_is_mkl = mkl_context.input_shape.IsMklTensor();
-  if (!input_is_mkl && !grad_is_mkl &&
-      !MklReluHelpers::ValidateSameSize(context, g, a))
-    return;
-  Tensor* output = nullptr;
-
-  if (!input_is_mkl && !grad_is_mkl && !a.dims()) {
-    // handle the scalar case
-    const TensorShape& g_shape = g.shape();
-    mkl_context.output_shape.SetMklTensor(false);
-    AllocateOutputSetMklShape(context, 0, &output, g_shape,
-                              mkl_context.output_shape);
-
-    void* out_o = static_cast<void*>(output->flat<T>().data());
-    (static_cast<T*>(out_o))[0] =
-        (static_cast<T*>(user_g))[0] * ((static_cast<T*>(user_i))[0] > 0);
-    return;
-  }
-
-  // generate size, stride for input if input/grad is in mkl format.
-  if (grad_is_mkl || input_is_mkl) {
-    const MklShape* tmp_mkl_shape =
-        (grad_is_mkl) ? &mkl_context.grad_shape : &mkl_context.input_shape;
-
-    mkl_context.in_dims = tmp_mkl_shape->GetDimension();
-    mkl_context.in_strides = new size_t[mkl_context.in_dims];
-    mkl_context.in_sizes = new size_t[mkl_context.in_dims];
-    for (int i = 0; i < mkl_context.in_dims; i++) {
-      mkl_context.in_sizes[i] = tmp_mkl_shape->GetSizes()[i];
-      mkl_context.in_strides[i] = tmp_mkl_shape->GetStrides()[i];
-    }
-  } else {
-    mkl_context.in_dims = g.dims();
-    mkl_context.in_strides = new size_t[mkl_context.in_dims];
-    mkl_context.in_sizes = new size_t[mkl_context.in_dims];
-
-    for (int i = 0; i < mkl_context.in_dims; i++) {
-      mkl_context.in_sizes[i] = g.dim_size((mkl_context.in_dims - 1) - i);
-    }
-    mkl_context.in_strides[0] = 1;
-    for (int i = 1; i < mkl_context.in_dims; i++) {
-      mkl_context.in_strides[i] =
-          mkl_context.in_strides[i - 1] * mkl_context.in_sizes[i - 1];
-    }
-  }
-
-  mkl_context.MklCreateInputLayouts(context);
-  float negative_slope = 0.0;
-  CHECK_EQ(dnnReLUCreateBackward_F32(&mkl_context.prim_relu_bwd, NULL,
-                                     mkl_context.lt_grad, mkl_context.lt_grad,
-                                     negative_slope),
-           E_SUCCESS);
-  Tensor mkl_tmp_input_buf_tensor;
-  mkl_context.MklPrepareReluGradInputs(context, &mkl_tmp_input_buf_tensor);
-
-  if (input_is_mkl ||
-      grad_is_mkl) { /*if  grad or input are mkl leave it in mkl*/
-    TensorShape tf_shape;
-    mkl_context.output_shape.SetMklTensor(true);
-    mkl_context.output_shape.SetMklLayout(mkl_context.prim_relu_bwd,
-                                          dnnResourceDiffSrc);
-    mkl_context.output_shape.SetTfLayout(
-        mkl_context.in_dims, mkl_context.in_sizes, mkl_context.in_strides);
-    // if input_is_mkl or grad_is_mkl, then we copy strides and sizes from mkl
-    // shape of one that is in mkl layout.
-    if (grad_is_mkl == true) {
-      mkl_context.output_shape.SetTfDimOrder(
-          mkl_context.in_dims, mkl_context.grad_shape.GetTfToMklDimMap());
-    } else {
-      mkl_context.output_shape.SetTfDimOrder(
-          mkl_context.in_dims, mkl_context.input_shape.GetTfToMklDimMap());
-    }
-
-    tf_shape.AddDim(dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
-                        mkl_context.output_shape.GetMklLayout())) /
-                    sizeof(T));
-    AllocateOutputSetMklShape(context, 0, &output, tf_shape,
-                              mkl_context.output_shape);
-  } else {
-    const TensorShape& o_shape = g.shape();
-    mkl_context.output_shape.SetMklTensor(false);
-    AllocateOutputSetMklShape(context, 0, &output, o_shape,
-                              mkl_context.output_shape);
-  }
-
-  mkl_context.relu_res[dnnResourceDiffSrc] =
-      static_cast<void*>(output->flat<T>().data());
-
-  CHECK_EQ(dnnExecute_F32(mkl_context.prim_relu_bwd, mkl_context.relu_res),
-           E_SUCCESS);
-  mkl_context.MklCleanup();
-}
-
-#else  // INTEL_MKL_ML_ONLY
-
 template <typename Device, typename T, algorithm alg_kind>
 class MklReluOpBase : public OpKernel {
  public:
@@ -856,9 +521,9 @@ class MklReluOpBase : public OpKernel {
 
       Tensor* dst_tensor = nullptr;
       OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
-                                      {static_cast<const int>(src_index)},
-                                      static_cast<const int>(dst_index),
-                                      tf_shape_dst, &dst_tensor));
+                                  {static_cast<const int>(src_index)},
+                                  static_cast<const int>(dst_index),
+                                  tf_shape_dst, &dst_tensor));
       AllocateOutputSetMklShape(context, dst_index, dnn_shape_dst);
 
       T* dst_data = dst_tensor->flat<T>().data();
@@ -867,18 +532,19 @@ class MklReluOpBase : public OpKernel {
       eltwise_fwd->Execute(src_data, dst_data);
     } catch (mkldnn::error& e) {
       string error_msg = "Status: " + std::to_string(e.status) +
-                         ", message: " + string(e.message) +
-                         ", in file " + string(__FILE__) + ":" +
-                         std::to_string(__LINE__);
-      OP_REQUIRES_OK(context,
-                     errors::Aborted("Operation received an exception:",
-                        error_msg));
+                         ", message: " + string(e.message) + ", in file " +
+                         string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          context,
+          errors::Aborted("Operation received an exception:", error_msg));
     }
   }
 
  private:
   engine cpu_engine = engine(engine::cpu, 0);
   std::shared_ptr<relu_forward::primitive_desc> relu_fwd_pd;
+
+ protected:
   float alpha_;
   float beta_;
 };
@@ -947,11 +613,11 @@ class MklReluGradOpBase : public OpKernel {
         auto diff_dst_tf_data_format =
             MklDnnDataFormatToTFDataFormat(diff_dst_mkl_data_format);
 
-        src_dims = (src_tensor.dims() == 4) 
-                 ? TFShapeToMklDnnDimsInNCHW(src_tensor.shape(),
-                                             diff_dst_tf_data_format)
-                 : TFShapeToMklDnnDimsInNCDHW(src_tensor.shape(),
-                                              diff_dst_tf_data_format);
+        src_dims = (src_tensor.dims() == 4)
+                       ? TFShapeToMklDnnDimsInNCHW(src_tensor.shape(),
+                                                   diff_dst_tf_data_format)
+                       : TFShapeToMklDnnDimsInNCDHW(src_tensor.shape(),
+                                                    diff_dst_tf_data_format);
         src_md =
             memory::desc(src_dims, MklDnnType<T>(), diff_dst_mkl_data_format);
       } else {
@@ -1001,8 +667,7 @@ class MklReluGradOpBase : public OpKernel {
       // allocate diff_src tensor
       MklDnnShape dnn_shape_diff_src;
       TensorShape tf_shape_diff_src;
-      if (dnn_shape_src.IsMklTensor() ||
-              dnn_shape_diff_dst.IsMklTensor()) {
+      if (dnn_shape_src.IsMklTensor() || dnn_shape_diff_dst.IsMklTensor()) {
         auto diff_src_pd = eltwise_bwd_pd->diff_src_primitive_desc();
         dnn_shape_diff_src.SetMklTensor(true);
         dnn_shape_diff_src.SetMklLayout(&diff_src_pd);
@@ -1012,9 +677,10 @@ class MklReluGradOpBase : public OpKernel {
                                          dnn_shape_src.GetSizesAsMklDnnDims(),
                                          dnn_shape_src.GetTfDataFormat());
         } else {
-          dnn_shape_diff_src.SetTfLayout(dnn_shape_diff_dst.GetDimension(),
-                                 dnn_shape_diff_dst.GetSizesAsMklDnnDims(),
-                                 dnn_shape_diff_dst.GetTfDataFormat());
+          dnn_shape_diff_src.SetTfLayout(
+              dnn_shape_diff_dst.GetDimension(),
+              dnn_shape_diff_dst.GetSizesAsMklDnnDims(),
+              dnn_shape_diff_dst.GetTfDataFormat());
         }
         tf_shape_diff_src.AddDim(diff_src_pd.get_size() / sizeof(T));
       } else {
@@ -1045,6 +711,8 @@ class MklReluGradOpBase : public OpKernel {
  private:
   engine cpu_engine = engine(engine::cpu, 0);
   std::shared_ptr<relu_forward::primitive_desc> relu_fwd_pd;
+
+ protected:
   float alpha_;
   float beta_;
 };
@@ -1312,13 +980,89 @@ class MklRelu6GradOp
     T* out_o = diff_src_tensor->flat<T>().data();
     T* user_i = const_cast<T*>(src_tensor.flat<T>().data());
     T* user_g = const_cast<T*>(diff_dst_tensor.flat<T>().data());
-    out_o[0] = user_g[0] * user_i[0] > 0 &&
-               (user_i[0] < static_cast<T>(RELU6_UPPER_BOUND));
+    out_o[0] = user_g[0] * (user_i[0] > 0 &&
+                            (user_i[0] < static_cast<T>(RELU6_UPPER_BOUND)));
+    return;
+  }
+};
+
+template <typename Device, typename T>
+class MklLeakyReluOp : public MklReluOpBase<Device, T, eltwise_relu> {
+ public:
+  ~MklLeakyReluOp() {}
+
+  explicit MklLeakyReluOp(OpKernelConstruction* context)
+      : MklReluOpBase<Device, T, eltwise_relu>(context, 0.0f, 0.0f) {
+    float alpha;
+    OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha));
+    OP_REQUIRES(
+        context, alpha <= 1,
+        errors::InvalidArgument("MKL LeakyRelu only supports alpha <= 1. "
+                                "alpha is: ",
+                                alpha));
+
+    this->alpha_ = alpha;
+  }
+
+  virtual void Compute_Scalar(OpKernelContext* context) {
+    const size_t src_index = 0;  // index of src input tensor
+    const size_t dst_index = 0;  // index of dst output tensor
+    const Tensor& src_tensor = MklGetInput(context, src_index);
+    MklDnnShape dnn_shape_src;
+    GetMklShape(context, src_index, &dnn_shape_src);
+
+    Tensor* dst_tensor = nullptr;
+    T* user_i = const_cast<T*>(src_tensor.flat<T>().data());
+    MklDnnShape dnn_shape_dst;
+    dnn_shape_dst.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, dst_index, &dst_tensor,
+                              src_tensor.shape(), dnn_shape_dst);
+    T* out_o = dst_tensor->flat<T>().data();
+    out_o[0] = user_i[0] >= 0 ? user_i[0] : user_i[0] * this->alpha_;
     return;
   }
 };
 
-#endif
+template <typename Device, typename T>
+class MklLeakyReluGradOp : public MklReluGradOpBase<Device, T, eltwise_relu> {
+ public:
+  ~MklLeakyReluGradOp() {}
+
+  explicit MklLeakyReluGradOp(OpKernelConstruction* context)
+      : MklReluGradOpBase<Device, T, eltwise_relu>(context, 0.0f, 0.0f) {
+    float alpha;
+    OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha));
+    OP_REQUIRES(
+        context, alpha <= 1,
+        errors::InvalidArgument("MKL LeakyRelu only supports alpha <= 1. "
+                                "alpha is: ",
+                                alpha));
+
+    this->alpha_ = alpha;
+  }
+
+  virtual void Compute_Scalar(OpKernelContext* context) {
+    const size_t diff_dst_index = 0;  // index of diff_dst input tensor
+    const size_t src_index = 1;       // index of src input tensor
+    const size_t diff_src_index = 0;  // index of diff_src output tensor
+    const Tensor& src_tensor = MklGetInput(context, src_index);
+    const Tensor& diff_dst_tensor = MklGetInput(context, diff_dst_index);
+    Tensor* diff_src_tensor = nullptr;
+
+    MklDnnShape dnn_shape_diff_dst;
+    GetMklShape(context, diff_dst_index, &dnn_shape_diff_dst);
+
+    MklDnnShape dnn_shape_diff_src;
+    dnn_shape_diff_src.SetMklTensor(false);
+    AllocateOutputSetMklShape(context, diff_src_index, &diff_src_tensor,
+                              diff_dst_tensor.shape(), dnn_shape_diff_src);
+    T* out_o = diff_src_tensor->flat<T>().data();
+    T* user_i = const_cast<T*>(src_tensor.flat<T>().data());
+    T* user_g = const_cast<T*>(diff_dst_tensor.flat<T>().data());
+    out_o[0] = user_i[0] >= 0 ? user_g[0] : user_g[0] * this->alpha_;
+    return;
+  }
+};
 
 // register dnn kernels for supported operations and supported types
 #define REGISTER_RELU_MKL_SUPPORTED_KERNELS_TYPES(type)             \
@@ -1334,8 +1078,6 @@ class MklRelu6GradOp
                           MklReluGradOp<CPUDevice, type>);
 TF_CALL_float(REGISTER_RELU_MKL_SUPPORTED_KERNELS_TYPES);
 
-#ifndef INTEL_MKL_ML_ONLY
-
 // register dnn kernels for supported operations and supported types
 #define REGISTER_ELU_MKL_SUPPORTED_KERNELS_TYPES(type)              \
   REGISTER_KERNEL_BUILDER(Name("_MklElu")                           \
@@ -1376,7 +1118,18 @@ TF_CALL_float(REGISTER_TANH_MKL_SUPPORTED_KERNELS_TYPES);
                           MklRelu6GradOp<CPUDevice, type>);
 TF_CALL_float(REGISTER_RELU6_MKL_SUPPORTED_KERNELS_TYPES);
 
-#endif
+#define REGISTER_LeakyRelu_MKL_SUPPORTED_KERNELS_TYPES(type)        \
+  REGISTER_KERNEL_BUILDER(Name("_MklLeakyRelu")                     \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<type>("T")            \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklLeakyReluOp<CPUDevice, type>);         \
+  REGISTER_KERNEL_BUILDER(Name("_MklLeakyReluGrad")                 \
+                              .Device(DEVICE_CPU)                   \
+                              .TypeConstraint<type>("T")            \
+                              .Label(mkl_op_registry::kMklOpLabel), \
+                          MklLeakyReluGradOp<CPUDevice, type>);
+TF_CALL_float(REGISTER_LeakyRelu_MKL_SUPPORTED_KERNELS_TYPES);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/mkl_requantization_range_per_channel_op.cc b/tensorflow/core/kernels/mkl_requantization_range_per_channel_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..767a6f1c3976d335bfd660f3a6990c03805843ba
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_requantization_range_per_channel_op.cc
@@ -0,0 +1,124 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/array_ops.cc.
+#ifdef INTEL_MKL
+#define EIGEN_USE_THREADS
+
+#include <math.h>
+#include <limits>
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/type_traits.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/meta_support.h"
+#include "tensorflow/core/kernels/no_op.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/util/mkl_util.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+class MklRequantizationRangePerChannelOp : public OpKernel {
+ public:
+  explicit MklRequantizationRangePerChannelOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("clip_value_max", &clip_value_max_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& input = ctx->input(kInputTensorIndex);
+    const Tensor& input_min = ctx->input(kInputMinIndex);
+    const Tensor& input_max = ctx->input(kInputMaxIndex);
+
+    const size_t depth = input_max.NumElements();
+    OP_REQUIRES(
+        ctx, input_min.dim_size(0) == depth,
+        errors::InvalidArgument("input_min has incorrect size, expected ",
+                                depth, " was ", input_min.dim_size(0)));
+    OP_REQUIRES(
+        ctx, input_max.dim_size(0) == depth,
+        errors::InvalidArgument("input_max has incorrect size, expected ",
+                                depth, " was ", input_max.dim_size(0)));
+
+    const float* input_min_data = input_min.flat<float>().data();
+    const float* input_max_data = input_max.flat<float>().data();
+    std::vector<float> ranges(depth);
+    bool is_non_negative = true;
+    Eigen::array<int, 2> shuffling({1, 0});
+    auto input_matrix = input.flat_inner_dims<qint32>();
+
+    // TODO: verify performance of not transposing and finding the min max
+    // directly from input_matrix vs the one presented below of transposing and
+    // using the transposed matrix as the transposing operation in itself might
+    // be more costly.
+    // Note that this operation is a calibration step for quantization and will
+    // cease to exist in the final inference graph(will exist as a const node).
+    auto transposed_input = input_matrix.shuffle(shuffling);
+
+    // Find the ranges of each channel in parallel.
+    float out_min_max = std::numeric_limits<float>::min();
+#pragma omp parallel for reduction(max : out_min_max)
+    for (size_t i = 0; i < depth; ++i) {
+      Eigen::Tensor<qint32, 0, Eigen::RowMajor> min =
+          transposed_input.chip<0>(i).minimum();
+      Eigen::Tensor<qint32, 0, Eigen::RowMajor> max =
+          transposed_input.chip<0>(i).maximum();
+      const int32_t min_per_channel = min();
+      const int32_t max_per_channel = max();
+      const int32_t abs_max =
+          std::max(std::abs(min_per_channel), std::abs(max_per_channel));
+      float scale =
+          std::max(std::abs(input_min_data[i]), std::abs(input_max_data[i]));
+      ranges[i] =
+          scale * static_cast<float>(abs_max) / static_cast<float>(1L << 31);
+      if (min_per_channel < 0) is_non_negative = false;
+
+      // Thread-local out_min_max.
+      out_min_max = std::max(out_min_max, ranges[i]);
+    }
+    // All local out_min_max gets max-reduced into one global out_min_max at
+    // the end of the loop by specifying reduction(max:out_min_max) along with
+    // omp parallel for.
+
+    // Fixing max to clip_value_max_ (example 6.0 to support relu6)
+    if (out_min_max > clip_value_max_) out_min_max = clip_value_max_;
+
+    Tensor* output_min = nullptr;
+    Tensor* output_max = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(kOutputMinIndex, {}, &output_min));
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(kOutputMaxIndex, {}, &output_max));
+    output_min->flat<float>()(0) = is_non_negative ? 0.0f : -out_min_max;
+    output_max->flat<float>()(0) = out_min_max;
+  }
+
+ private:
+  float clip_value_max_ = std::numeric_limits<float>::infinity();
+  const int kInputTensorIndex = 0;
+  const int kInputMinIndex = 1;
+  const int kInputMaxIndex = 2;
+  const int kOutputMinIndex = 0;
+  const int kOutputMaxIndex = 1;
+};
+
+REGISTER_KERNEL_BUILDER(Name("RequantizationRangePerChannel")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<qint32>("T"),
+                        MklRequantizationRangePerChannelOp);
+}  // namespace tensorflow
+#endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_requantize_ops_test.cc b/tensorflow/core/kernels/mkl_requantize_ops_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9961462754f4c2378f7e46931d7878ca283278a5
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_requantize_ops_test.cc
@@ -0,0 +1,300 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#if defined(INTEL_MKL) && defined(ENABLE_MKL)
+
+#include <cmath>
+
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+
+class MklRequantizatedOpsTest : public OpsTestBase {};
+
+class MklRequantizatedOpsTestHelper : public OpsTestBase {
+ public:
+  void Setup(Tensor &input_tensor_qint32, float &range_weights_ch1,
+             float &range_weights_ch2);
+  void TestBody() {}
+};
+
+void MklRequantizatedOpsTestHelper::Setup(Tensor &input_tensor_qint32,
+                                          float &range_weights_ch1,
+                                          float &range_weights_ch2) {
+  // Step 1: Input range assumptions
+  // -------------------------------
+  // Assume input tensor T (NHWC) in FP32 has range [0, 5.0]   size nt*ht*wt*ct
+  // Assume input filter W (NHWC) with 2 output channels of    size nw*ht*wt*2
+  // logically,   filter W has 2 channels W1 and W2 each of    size nw*ht*wt*1
+  // Assume input filter W1(NHWC) in FP32 has range [-2.0, 2.0]size nw*ht*wt*1
+  // Assume input filter W2(NHWC) in FP32 has range [-3.0, 3.0]size nw*ht*wt*1
+
+  // Step 2: Quantization details (per channel)
+  // ------------------------------------------
+  // T and W are quantized using a quantize op.
+  // The input tensor T (NHWC) is quantized to unsigned int8.
+  // Hence T's max value is mapped to ((2^8-1) = 255).
+  // The input filter W (NHWC) is quantized to signed int8.
+  // Hence W's max value is mapped to ((2^7)-1 = 127)).
+
+  // Range of quantized T  in uint8[0  , 255] maps to orig T  in FP32[0   , 5.0]
+  // Range of quantized W1 in int8[-127, 127] maps to orig W1 in FP32[-2.0, 2.0]
+  // Range of quantized W2 in int8[-127, 127] maps to orig W2 in FP32[-3.0, 3.0]
+
+  // Hence the resolution of quantized T will be 5.0/255
+  // Hence the resolution of quantized W1 will be 2.0/127
+  // Hence the resolution of quantized W2 will be 3.0/127
+
+  // Step 3: Assumption of quantizedconv on quantized input&weights(per channel)
+  // ---------------------------------------------------------------------------
+  // The input T and weights W1 (or W2) will be convolved.
+  // The output tensor T is in int32 whose range is [-2^31, 2^31).
+  // For simplicity and symmetry, we truncate the above range to (-2^31, 2^31).
+  // The range of convolved T*W1 is ((2^31)-1) * 5.0/255 * 2.0/127 = 663110.59
+  // So the range of convolved T*W1 in int32(-2^31, 2^31) that maps to
+  // orig T range in FP32[0, 5.0] * [-2.0, 2.0] is [-663110.59, 663110.59].
+
+  // The range of convolved T*W2 is (2^31-1) * 5.0/255 * 3.0/127 = 994665.88
+  // So the range of convolved T*W2 in int32(-2^31, 2^31) that maps to
+  // orig T range in FP32 [0, 5.0] * [-3.0, 3.0]  is [-994665.88, 994665.88]
+
+  // Step 4: Assumption output above is fed to requantization_range_perchannel
+  // --------------------------------------------------------------------------
+  // Here we recalculate the new range for convolved T*W so that we
+  // make good use in int8 quantization from int32 to int8.
+
+  // We assume the above operations are performed and use these values above
+  // as ranges for requantization_range_perchannel_op.
+  range_weights_ch1 = 663110.59;  // For W1 channel
+  range_weights_ch2 = 994665.88;  // For W2 Channel
+
+  // We Fill the input tensor T qint32 with arbitrary int32 values
+  test::FillValues<qint32>(
+      &input_tensor_qint32,
+      {-1000, -2000,  2000,   4000,   -3000,  -6000,  4000,   8000,
+       5000,  10000,  -6000,  -12000, 7000,   14000,  8000,   16000,
+       9000,  -18000, -10000, -20000, 11000,  22000,  -12000, -24000,
+       13000, 26000,  14000,  28000,  -15000, -30000, 16000,  32000});
+
+  // Step 5: Define and run requantization_range_perchannel
+  // -------------------------------------------------------
+  // See test RequantizationRangePerChannelTest_Basic and/or
+  // test RequantizationRangePerChannelTest_ClipMax
+}
+
+// Tests the RequantizationRangePerChannel op wherein the range
+// of the weights is calculated per channel.
+TEST_F(MklRequantizatedOpsTest, RequantizationRangePerChannelTest_Basic) {
+  // Let us set up the tensor and inputs before we run this op.
+  float clip_value_max = static_cast<float>((1L << 31) - 1);
+  float range_weights_ch1 = 0.0;
+  float range_weights_ch2 = 0.0;
+
+  // Create the input tensor
+  const int input_height = 4;
+  const int input_width = 4;
+  const int input_channels = 2;
+
+  // Define the shape of T.
+  Tensor input_tensor_qint32(DT_QINT32,
+                             {1, input_height, input_width, input_channels});
+
+  // Explanation and setup prior to this op. Fill T and populate range values.
+  MklRequantizatedOpsTestHelper helper;
+  helper.Setup(input_tensor_qint32, range_weights_ch1, range_weights_ch2);
+
+  // Step 5: Define and run requantization_range_perchannel
+  // -------------------------------------------------------
+  // Define, create and initialize the op in question.
+  TF_ASSERT_OK(NodeDefBuilder("requantization_range_per_channel",
+                              "RequantizationRangePerChannel")
+                   .Input(FakeInput(DT_QINT32))
+                   .Input(FakeInput(DT_FLOAT))
+                   .Input(FakeInput(DT_FLOAT))
+                   .Attr("T", DataTypeToEnum<qint32>::v())
+                   .Attr("clip_value_max", clip_value_max)
+                   .Finalize(node_def()));
+  TF_ASSERT_OK(InitOp());
+
+  // Add the input nodes to the op.
+  AddInputFromArray<qint32>(input_tensor_qint32.shape(),
+                            input_tensor_qint32.flat<qint32>());
+
+  // Calculate the min and max from the ranges
+  float ch1_min = -range_weights_ch1;
+  float ch1_max = range_weights_ch1;
+  float ch2_min = -range_weights_ch2;
+  float ch2_max = range_weights_ch2;
+
+  // Add the perchannel range Nodes to the op.
+  AddInputFromArray<float>(TensorShape({input_channels}), {ch1_min, ch2_min});
+  AddInputFromArray<float>(TensorShape({input_channels}), {ch1_max, ch2_max});
+
+  // Run the kernel
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Step 6: Verify output and store values to test requantize_perchannel
+  // --------------------------------------------------------------------
+
+  // Verify the Expected Outputs
+  const float output_min = GetOutput(0)->flat<float>()(0);
+  const float output_max = GetOutput(1)->flat<float>()(0);
+  EXPECT_NEAR(-14.8217, output_min, 0.002);
+  EXPECT_NEAR(14.8217, output_max, 0.002);
+
+  // Output range is made use in RequantizePerChannelTest_Basic
+}
+
+TEST_F(MklRequantizatedOpsTest, RequantizationRangePerChannelTest_ClipMax) {
+  // Let us setup the tensor and inputs before we run this op.
+  float clip_value_max = 6;  // Can be used as 6 for Relu 6 activations.
+  float range_weights_ch1 = 0.0;
+  float range_weights_ch2 = 0.0;
+
+  // Create the input tensor
+  const int input_height = 4;
+  const int input_width = 4;
+  const int input_channels = 2;
+
+  // define and input tensor T shape.
+  Tensor input_tensor_qint32(DT_QINT32,
+                             {1, input_height, input_width, input_channels});
+
+  // Explanation and setup prior to this op. Fill T and populate range values.
+  MklRequantizatedOpsTestHelper helper;
+  helper.Setup(input_tensor_qint32, range_weights_ch1, range_weights_ch2);
+
+  // Step 5: Define and run requantization_range_perchannel
+  // -------------------------------------------------------
+  // Define, create and initialize the op in question.
+  TF_ASSERT_OK(NodeDefBuilder("requantization_range_per_channel",
+                              "RequantizationRangePerChannel")
+                   .Input(FakeInput(DT_QINT32))
+                   .Input(FakeInput(DT_FLOAT))
+                   .Input(FakeInput(DT_FLOAT))
+                   .Attr("T", DataTypeToEnum<qint32>::v())
+                   .Attr("clip_value_max", clip_value_max)
+                   .Finalize(node_def()));
+  TF_ASSERT_OK(InitOp());
+
+  // Add the input nodes to the op.
+  AddInputFromArray<qint32>(input_tensor_qint32.shape(),
+                            input_tensor_qint32.flat<qint32>());
+
+  // Calculate the min and max from the ranges
+  float ch1_min = -range_weights_ch1;
+  float ch1_max = range_weights_ch1;
+  float ch2_min = -range_weights_ch2;
+  float ch2_max = range_weights_ch2;
+
+  // Add the perchannel range nodes to the op.
+  AddInputFromArray<float>(TensorShape({input_channels}), {ch1_min, ch2_min});
+  AddInputFromArray<float>(TensorShape({input_channels}), {ch1_max, ch2_max});
+
+  // Run the kernel
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Step 6: Verify output and store values to test requantize_perchannel
+  // --------------------------------------------------------------------
+
+  // Verify the expected outputs
+  const float output_min = GetOutput(0)->flat<float>()(0);
+  const float output_max = GetOutput(1)->flat<float>()(0);
+  EXPECT_NEAR(-6.0, output_min, 0.002);  // Values are aligned with clip_value.
+  EXPECT_NEAR(6.0, output_max, 0.002);   // Values are aligned with clip_value.
+}
+
+TEST_F(MklRequantizatedOpsTest, RequantizePerChannelTest_Basic) {
+  // Let us setup the tensor and inputs before we run this op.
+  float range_weights_ch1 = 0.0;
+  float range_weights_ch2 = 0.0;
+
+  // Create the input tensor
+  const int input_height = 4;
+  const int input_width = 4;
+  const int input_channels = 2;
+
+  // define an input tensor T shape.
+  Tensor input_tensor_qint32(DT_QINT32,
+                             {1, input_height, input_width, input_channels});
+
+  // Explanation and setup prior to this op. Fill T and populate range values.
+  MklRequantizatedOpsTestHelper helper;
+  helper.Setup(input_tensor_qint32, range_weights_ch1, range_weights_ch2);
+
+  // Step 7: Define and run requantize_perchannel
+  // --------------------------------------------
+  // The output of requantization_range_op_per_channel which calculated the
+  // new ranges of int8 is fed to the requantize per channel op.
+  // Here the values of convolved T*W is converted from int32 to int8.
+
+  TF_ASSERT_OK(NodeDefBuilder("requantize_per_channel", "RequantizePerChannel")
+                   .Input(FakeInput(DT_QINT32))
+                   .Input(FakeInput(DT_FLOAT))
+                   .Input(FakeInput(DT_FLOAT))
+                   .Input(FakeInput(DT_FLOAT))
+                   .Input(FakeInput(DT_FLOAT))
+                   .Attr("T", DataTypeToEnum<qint32>::v())
+                   .Attr("out_type", DataTypeToEnum<qint8>::v())
+                   .Finalize(node_def()));
+  TF_ASSERT_OK(InitOp());
+
+  // Add the input Nodes to the op.
+  AddInputFromArray<qint32>(input_tensor_qint32.shape(),
+                            input_tensor_qint32.flat<qint32>());
+
+  // Calculate the min and max from the ranges
+  float ch1_min = -range_weights_ch1;
+  float ch1_max = range_weights_ch1;
+  float ch2_min = -range_weights_ch2;
+  float ch2_max = range_weights_ch2;
+
+  // Add the perchannel range nodes to the op.
+  AddInputFromArray<float>(TensorShape({input_channels}), {ch1_min, ch2_min});
+  AddInputFromArray<float>(TensorShape({input_channels}), {ch1_max, ch2_max});
+
+  // Calculate the min and max from Step 6 above
+  // in RequantizationRangePerChannelTest_Basic
+  float range_op_output_min = -14.8217;
+  float range_op_output_max = 14.8217;
+
+  // Add the requested_min and requested_max stored from Step 6.
+  AddInputFromArray<float>(TensorShape({1}), {range_op_output_min});
+  AddInputFromArray<float>(TensorShape({1}), {range_op_output_max});
+
+  // Run the kernel
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Verify the output with the expected output
+  Tensor output = *GetOutput(0);
+  const float output_min = GetOutput(1)->flat<float>()(0);
+  const float output_max = GetOutput(2)->flat<float>()(0);
+  EXPECT_NEAR(range_op_output_min, output_min, 0.002);
+  EXPECT_NEAR(range_op_output_max, output_max, 0.002);
+}
+
+}  // namespace tensorflow
+#endif  // INTEL_MKL && ENABLE_MKL
diff --git a/tensorflow/core/kernels/mkl_requantize_per_channel_op.cc b/tensorflow/core/kernels/mkl_requantize_per_channel_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b5c1a01f8311e81fa2f0dd0945569f0b6980b0be
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_requantize_per_channel_op.cc
@@ -0,0 +1,172 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/array_ops.cc.
+
+#ifdef INTEL_MKL
+#define EIGEN_USE_THREADS
+#include <math.h>
+
+#include "mkldnn.hpp"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/type_traits.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/meta_support.h"
+#include "tensorflow/core/kernels/no_op.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/util/mkl_util.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+template <typename Device, typename Toutput>
+class MklRequantizePerChannelOp : public OpKernel {
+ public:
+  explicit MklRequantizePerChannelOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("out_type", &out_type_));
+    OP_REQUIRES(ctx, out_type_ == DT_QINT8 || out_type_ == DT_QUINT8,
+                errors::InvalidArgument(
+                    "out_type must be qint8 or quint8, but got: " + out_type_));
+  }
+  virtual ~MklRequantizePerChannelOp() {}
+  void Compute(OpKernelContext* ctx) override {
+    try {
+      const Tensor& input = ctx->input(kInputTensorIndex);
+      const Tensor& input_min_vec = ctx->input(kInputMinVecIndex);
+      float* input_min_vec_data = (float*)const_cast<void*>(
+          static_cast<const void*>(input_min_vec.flat<float>().data()));
+      const Tensor& input_max_vec = ctx->input(kInputMaxVecIndex);
+      float* input_max_vec_data = (float*)const_cast<void*>(
+          static_cast<const void*>(input_max_vec.flat<float>().data()));
+
+      const Tensor& input_requested_min = ctx->input(this->kRequestMinIndex);
+      const float input_requested_min_float =
+          input_requested_min.flat<float>()(0);
+      const Tensor& input_requested_max = ctx->input(this->kRequestMaxIndex);
+      const float input_requested_max_float =
+          input_requested_max.flat<float>()(0);
+
+      size_t depth = input_min_vec.NumElements();
+      OP_REQUIRES(
+          ctx, input_min_vec.dim_size(0) == depth,
+          errors::InvalidArgument("input_min has incorrect size, expected ",
+                                  depth, " was ", input_min_vec.dim_size(0)));
+      OP_REQUIRES(
+          ctx, input_max_vec.dim_size(0) == depth,
+          errors::InvalidArgument("input_max has incorrect size, expected ",
+                                  depth, " was ", input_max_vec.dim_size(0)));
+
+      if (out_type_ == DT_QINT8) DCHECK(input_requested_min_float < 0.0f);
+
+      const float factor = (out_type_ == DT_QINT8) ? 127.0f : 255.0f;
+      const float requested_min_max =
+          std::max(std::abs(input_requested_min_float),
+                   std::abs(input_requested_max_float));
+      Tensor* output = nullptr;
+      OP_REQUIRES_OK(ctx, ctx->allocate_output(kOutputTensorIndex,
+                                               input.shape(), &output));
+
+      std::vector<float> scales(depth);
+      for (int i = 0; i < depth; ++i) {
+        float min_max_from_vec = std::max(std::abs(input_min_vec_data[i]),
+                                          std::abs(input_max_vec_data[i]));
+        scales[i] = factor * (min_max_from_vec / requested_min_max /
+                              static_cast<float>(1L << 31));
+      }
+
+      mkldnn::primitive_attr reorder_attr;
+      reorder_attr.set_output_scales(2, scales);
+
+      memory::dims dims_mkl_order =
+          TFShapeToMklDnnDimsInNCHW(input.shape(), FORMAT_NHWC);
+      memory::desc input_md = memory::desc(dims_mkl_order, MklDnnType<qint32>(),
+                                           memory::format::nhwc);
+      memory::desc output_md =
+          (out_type_ == DT_QINT8)
+              ? memory::desc(dims_mkl_order, MklDnnType<qint8>(),
+                             memory::format::nhwc)
+              : memory::desc(dims_mkl_order, MklDnnType<quint8>(),
+                             memory::format::nhwc);
+
+      memory::primitive_desc input_pd =
+          memory::primitive_desc(input_md, cpu_engine_);
+      memory::primitive_desc output_pd =
+          memory::primitive_desc(output_md, cpu_engine_);
+
+      void* input_buf =
+          static_cast<void*>(const_cast<qint32*>(input.flat<qint32>().data()));
+      void* output_buf;
+      if (out_type_ == DT_QINT8) {
+        output_buf = static_cast<void*>(
+            const_cast<qint8*>(output->flat<qint8>().data()));
+      } else {
+        output_buf = static_cast<void*>(
+            const_cast<quint8*>(output->flat<quint8>().data()));
+      }
+
+      std::unique_ptr<memory> input_mem_prim_(new memory(input_pd, input_buf));
+      std::unique_ptr<memory> output_mem_prim_(
+          new memory(output_pd, output_buf));
+
+      mkldnn::reorder::primitive_desc reorder_pd =
+          mkldnn::reorder::primitive_desc(input_pd, output_pd, reorder_attr);
+      std::vector<mkldnn::primitive> net;
+      net.push_back(
+          mkldnn::reorder(reorder_pd, *input_mem_prim_, *output_mem_prim_));
+      stream(stream::kind::eager).submit(net).wait();
+
+      Tensor* output_min = nullptr;
+      Tensor* output_max = nullptr;
+      OP_REQUIRES_OK(ctx,
+                     ctx->allocate_output(kOutputMinIndex, {}, &output_min));
+      OP_REQUIRES_OK(ctx,
+                     ctx->allocate_output(kOutputMaxIndex, {}, &output_max));
+
+      output_min->flat<float>()(0) = input_requested_min_float;
+      output_max->flat<float>()(0) = input_requested_max_float;
+    } catch (mkldnn::error& e) {
+      string error_msg = "Status: " + std::to_string(e.status) +
+                         ", message: " + std::string(e.message) + ", in file " +
+                         std::string(__FILE__) + ":" + std::to_string(__LINE__);
+      OP_REQUIRES_OK(
+          ctx, errors::Aborted("Operation received an exception:", error_msg));
+    }
+  }
+
+ private:
+  const int kInputTensorIndex = 0;
+  const int kInputMinVecIndex = 1;
+  const int kInputMaxVecIndex = 2;
+  const int kRequestMinIndex = 3;
+  const int kRequestMaxIndex = 4;
+  const int kOutputTensorIndex = 0;
+  const int kOutputMinIndex = 1;
+  const int kOutputMaxIndex = 2;
+  DataType out_type_;
+  engine cpu_engine_ = engine(engine::cpu, 0);
+};
+
+REGISTER_KERNEL_BUILDER(Name("RequantizePerChannel")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<qint32>("T")
+                            .TypeConstraint<qint8>("out_type"),
+                        MklRequantizePerChannelOp<CPUDevice, qint8>);
+
+}  // namespace tensorflow
+#endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_slice_op.cc b/tensorflow/core/kernels/mkl_slice_op.cc
index 85cabeb92b69653787ebeebd2eae4f17017063bc..e2cbeec2d2831b0dd18e325af71489ef7d8c03bc 100644
--- a/tensorflow/core/kernels/mkl_slice_op.cc
+++ b/tensorflow/core/kernels/mkl_slice_op.cc
@@ -59,9 +59,10 @@ gtl::InlinedVector<int64, 4> IntTensorToInt64Vec(const Tensor& tensor) {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 
 // A version of SharedValidation (slice_op.h) written for input that is in
-// either Mkl layout or Tensorflow layout.
-// A shared code to validate input shapes and check for identity, which is not dependent on the type of T.
-// We do this to reduce code size by not duplicating all this for all T (float, double, int32, etc.)
+// either Mkl layout or Tensorflow layout. A shared code to validate input
+// shapes and check for identity, which is not dependent on the type of T.
+// We do this to reduce code size by not duplicating all this for all T
+// (float, double, int32, etc.)
 static void ValidateMklInputs(OpKernelContext* context, bool* is_identity,
                               gtl::InlinedVector<int64, 4>* begin,
                               gtl::InlinedVector<int64, 4>* size) {
@@ -157,13 +158,156 @@ static void CheckCommonCasesForMklInputs(OpKernelContext* context,
   }
 }
 
+// This structure aggregates multiple inputs to Slice methods.
+struct MklSliceParams {
+  // Parameters from & to represents memory pointing to reorder.
+  const memory* from;
+  const memory* to;
+
+  // Parameters begin_dims & size_dims represents offset and length
+  // passed to view primitive.
+  memory::dims begin_dims;
+  memory::dims size_dims;
+
+  MklSliceParams(const memory* from, const memory* to, memory::dims begin_dims,
+                 memory::dims size_dims)
+      : from(from), to(to), begin_dims(begin_dims), size_dims(size_dims) {}
+};
+
+// This implements the shared interface of Slice reorders.
+template <typename T>
+class MklSlicePrimitive : public MklPrimitive {
+ public:
+  explicit MklSlicePrimitive(const MklSliceParams& sliceParams) {
+    context_.slice_stream.reset(new stream(stream::kind::eager));
+    Setup(sliceParams);
+  }
+
+  ~MklSlicePrimitive() {}
+
+  void Execute(const MklSliceParams& sliceParams) {
+    context_.src_mem->set_data_handle(sliceParams.from->get_data_handle());
+    context_.dst_mem->set_data_handle(sliceParams.to->get_data_handle());
+    context_.slice_stream->submit(context_.slice_primitives);
+
+    // We should set it back to DummyData so as to make the primitive
+    // in cache pool stateless. Otherwise, if the result for previous
+    // iteration is kept, problems of current iteration won't be
+    // thrown immediately, and wrong data would be reused.
+    context_.src_mem->set_data_handle(DummyData);
+    context_.dst_mem->set_data_handle(DummyData);
+    return;
+  }
+
+  std::shared_ptr<primitive> GetPrimitive() { return context_.reorder_prim; }
+
+ private:
+  struct SliceContext {
+    std::shared_ptr<mkldnn::memory> src_mem;
+    std::shared_ptr<mkldnn::memory> dst_mem;
+    std::shared_ptr<primitive> reorder_prim;
+    std::shared_ptr<reorder::primitive_desc> reorder_pd;
+    std::shared_ptr<view::primitive_desc> view_pd;
+    std::shared_ptr<mkldnn::stream> slice_stream;
+    std::vector<mkldnn::primitive> slice_primitives;
+    SliceContext()
+        : src_mem(nullptr), dst_mem(nullptr), reorder_prim(nullptr) {}
+  } context_;
+
+  engine cpu_engine_ = engine(engine::cpu, 0);
+
+  void Setup(const MklSliceParams& sliceParams) {
+    // Actually, this DummyData will not be used in computation,
+    // because the real data will be filled before real execution.
+    context_.src_mem.reset(
+        new memory({sliceParams.from->get_primitive_desc().desc(), cpu_engine_},
+                   DummyData));
+    context_.dst_mem.reset(new memory(
+        {sliceParams.to->get_primitive_desc().desc(), cpu_engine_}, DummyData));
+    auto src_pd = context_.src_mem->get_primitive_desc();
+    auto dst_pd = context_.dst_mem->get_primitive_desc();
+    context_.view_pd =
+        std::make_shared<view::primitive_desc>(view::primitive_desc(
+            src_pd, sliceParams.size_dims, sliceParams.begin_dims));
+    context_.reorder_pd =
+        std::make_shared<reorder::primitive_desc>(reorder::primitive_desc(
+            context_.view_pd->dst_primitive_desc(), dst_pd));
+    context_.reorder_prim = std::make_shared<mkldnn::reorder>(
+        reorder(*context_.reorder_pd, *context_.src_mem, *context_.dst_mem));
+    context_.slice_primitives.push_back(*context_.reorder_prim);
+  }
+};
+
+template <typename T>
+class MklSlicePrimitiveFactory : public MklPrimitiveFactory<T> {
+ public:
+  static MklSlicePrimitive<T>* Get(const MklSliceParams& sliceParams) {
+    auto reorderPrim = static_cast<MklSlicePrimitive<T>*>(
+        MklSlicePrimitiveFactory<T>::GetInstance().GetReorder(sliceParams));
+    if (reorderPrim == nullptr) {
+      reorderPrim = new MklSlicePrimitive<T>(sliceParams);
+      MklSlicePrimitiveFactory<T>::GetInstance().SetReorder(sliceParams,
+                                                            reorderPrim);
+    }
+    return reorderPrim;
+  }
+
+  static MklSlicePrimitiveFactory& GetInstance() {
+    static MklSlicePrimitiveFactory instance_;
+    return instance_;
+  }
+
+ private:
+  MklSlicePrimitiveFactory() {}
+  ~MklSlicePrimitiveFactory() {}
+
+  static string CreateKey(const MklSliceParams& sliceParams) {
+    string prefix = "reorder";
+    FactoryKeyCreator key_creator;
+    auto const& from_desc = sliceParams.from->get_primitive_desc().desc().data;
+    auto const& to_desc = sliceParams.to->get_primitive_desc().desc().data;
+    const int kIdxFirstStride = 0;
+    memory::dims from_dims(from_desc.dims, &from_desc.dims[from_desc.ndims]);
+    memory::dims to_dims(to_desc.dims, &to_desc.dims[to_desc.ndims]);
+    memory::dims from_strides(
+        from_desc.layout_desc.blocking.strides[kIdxFirstStride],
+        &from_desc.layout_desc.blocking
+             .strides[kIdxFirstStride][from_desc.ndims]);
+    memory::dims to_strides(
+        to_desc.layout_desc.blocking.strides[kIdxFirstStride],
+        &to_desc.layout_desc.blocking.strides[kIdxFirstStride][to_desc.ndims]);
+    key_creator.AddAsKey(prefix);
+    key_creator.AddAsKey(static_cast<int>(from_desc.format));
+    key_creator.AddAsKey(static_cast<int>(from_desc.data_type));
+    key_creator.AddAsKey(from_dims);
+    key_creator.AddAsKey(from_strides);
+    key_creator.AddAsKey(static_cast<int>(to_desc.format));
+    key_creator.AddAsKey(static_cast<int>(to_desc.data_type));
+    key_creator.AddAsKey(to_dims);
+    key_creator.AddAsKey(to_strides);
+    key_creator.AddAsKey(sliceParams.begin_dims);
+    key_creator.AddAsKey(sliceParams.size_dims);
+    return key_creator.GetKey();
+  }
+
+  MklPrimitive* GetReorder(const MklSliceParams& sliceParams) {
+    string key = CreateKey(sliceParams);
+    return this->GetOp(key);
+  }
+
+  void SetReorder(const MklSliceParams& sliceParams, MklPrimitive* op) {
+    string key = CreateKey(sliceParams);
+    this->SetOp(key, op);
+  }
+};
+
 // MKL-DNN implementation of Slice
 template <typename Device, typename T>
-class MklDnnSliceOp : public OpKernel {
+class MklSliceOp : public OpKernel {
  public:
-  explicit MklDnnSliceOp(OpKernelConstruction* context) : OpKernel(context) {}
+  explicit MklSliceOp(OpKernelConstruction* context) : OpKernel(context) {}
 
-  ~MklDnnSliceOp() {}
+  ~MklSliceOp() {}
 
   void Compute(OpKernelContext* context) override {
     gtl::InlinedVector<int64, 4> begin;
@@ -179,17 +323,17 @@ class MklDnnSliceOp : public OpKernel {
     if (begin.size() >= 8) {
       OP_REQUIRES(
           context, false,
-          errors::Unimplemented("MklDnnSliceOp : Unhandled input dimensions"));
+          errors::Unimplemented("MklSliceOp : Unhandled input dimensions"));
     }
 
-    ComputeMklDnnSlice(context, begin, size);
+    ComputeMklSlice(context, begin, size);
   }
 
  private:
   // Slice op implemented using MKL-DNN APIs.
-  void ComputeMklDnnSlice(OpKernelContext* context,
-                          const gtl::InlinedVector<int64, 4>& begin,
-                          const gtl::InlinedVector<int64, 4>& size) {
+  void ComputeMklSlice(OpKernelContext* context,
+                       const gtl::InlinedVector<int64, 4>& begin,
+                       const gtl::InlinedVector<int64, 4>& size) {
     try {
       // MKL-DNN API usage below is guided by description at:
       //  https://github.com/01org/mkl-dnn/issues/69
@@ -200,16 +344,15 @@ class MklDnnSliceOp : public OpKernel {
       // probably change the format). Then your steps are:
       //
       // 1. create memory primitive descriptor in_mem_pd and memory primitive
-      //    in_mem_p for the entire source data.
-      // 2. create view primitive descriptor in_submem_pd based on in_mem_pd,
-      //    initial offsets, and sub-sizes
-      // 3. create memory primitive descriptor out_mem_pd and memory primitive
+      //    in_mem_p for the entire source data. create view primitive
+      //    descriptor in_submem_pd based on in_mem_pd, initial offsets,
+      //    and sub-sizes
+      // 2. create memory primitive descriptor out_mem_pd and memory primitive
       //    out_mem_p for the output (the logical sizes should match sub-sizes
-      //    used in step 2, but the format might be arbitrary)
-      // 4. create reorder primitive descriptor reorder_pd based on in_submem_pd
-      //    and out_mem_pd
-      // 5. create reorder primitive itself based on reorder_pd, in_mem_p, and
-      //    out_mem_p.
+      //    used in step 1, but the format might be arbitrary)
+      // 3. create reorder primitive descriptor reorder_pd based on in_submem_pd
+      //    and out_mem_pd. create reorder primitive itself based on reorder_pd,
+      //    in_mem_p, and out_mem_p.
       //
       // Please notice that there is no view primitive. There is only view
       // primitive descriptor. And the reorder uses source memory as input but
@@ -268,32 +411,24 @@ class MklDnnSliceOp : public OpKernel {
         src.SetUsrMem(input_md, &input_tensor);
       }
 
-      // Step 2 - create view primitive descriptor
-      auto view_pd =
-          view::primitive_desc(src.GetUsrMemPrimDesc(), size_dims, begin_dims)
-              .dst_primitive_desc();
+      // Step 2 - Create memory for output.
       auto output_strides = CalculateTFStrides(size_dims);
       auto output_md =
           MklDnnData<T>::CreateBlockedMemDesc(size_dims, output_strides);
       auto output_pd = memory::primitive_desc(output_md, cpu_engine);
-
-      // Step 3 - Create memory for output. If input is in MklDnn layout, then
-      // output is also in MklDnn layout. Otherwise, output is in Tensorflow
-      // layout.
       AllocateOutputTensor(context, input_mkl_shape, &output_pd, size_dims,
                            &output_tensor, &output_mkl_shape);
       DCHECK(output_tensor);
       DCHECK_EQ(input_mkl_shape.IsMklTensor(), output_mkl_shape.IsMklTensor());
       output.SetUsrMem(output_md, output_tensor);
 
-      std::vector<primitive> net;
-      // Step 4 - create reorder primitive desc between view_pd and output_pd.
-      auto reorder_pd =
-          reorder::primitive_desc(view_pd, output.GetUsrMemPrimDesc());
-      // Step 5 - create reorder primitive itself.
-      net.push_back(reorder(reorder_pd, *src.GetUsrMem(), *output.GetUsrMem()));
-      // Execute the reorder primitive.
-      stream(stream::kind::eager).submit(net).wait();
+      // Step 3 - create reorder primitive.
+      MklSliceParams sliceParams(src.GetUsrMem(), output.GetUsrMem(),
+                                 begin_dims, size_dims);
+      MklSlicePrimitive<T>* reorder_prim =
+          MklSlicePrimitiveFactory<T>::Get(sliceParams);
+      // Execute slice reorder.
+      reorder_prim->Execute(sliceParams);
     } catch (mkldnn::error& e) {
       string error_msg = "Status: " + std::to_string(e.status) + ", message: " +
                          string(e.message) + ", in file " + string(__FILE__) +
@@ -347,7 +482,7 @@ class MklDnnSliceOp : public OpKernel {
                               .HostMemory("begin")                  \
                               .HostMemory("size")                   \
                               .Label(mkl_op_registry::kMklOpLabel), \
-                          MklDnnSliceOp<CPUDevice, type>);
+                          MklSliceOp<CPUDevice, type>);
 
 TF_CALL_float(REGISTER_MKL_SLICE);
 #undef REGISTER_MKL_SLICE
diff --git a/tensorflow/core/kernels/mkl_softmax_op.cc b/tensorflow/core/kernels/mkl_softmax_op.cc
index 094129ae3efe87e070f8a27c8584f67c927bbec3..dc3ae3d93471e3af78da63a3fcbaa51644163aa2 100644
--- a/tensorflow/core/kernels/mkl_softmax_op.cc
+++ b/tensorflow/core/kernels/mkl_softmax_op.cc
@@ -50,8 +50,6 @@ class MklSoftmaxOp : public OpKernel {
       // src_tensor now points to the 0-th input of global data struct "context"
       size_t src_idx = 0;
       const Tensor& src_tensor = MklGetInput(context, src_idx);
-      const int input_dims = src_tensor.dims();
-
       // Add: get MklShape
       MklDnnShape src_mkl_shape;
       GetMklShape(context, src_idx, &src_mkl_shape);
@@ -61,15 +59,27 @@ class MklSoftmaxOp : public OpKernel {
       auto src_tf_shape = src_mkl_shape.IsMklTensor()
                               ? src_mkl_shape.GetTfShape()
                               : src_tensor.shape();
+      const int input_dims = src_tf_shape.dims();
       auto src_dims = TFShapeToMklDnnDims(src_tf_shape);
-      auto output_dims = src_dims;
+      memory::dims output_dims;
+      int axis;
+      if (src_mkl_shape.IsMklTensor()) {
+        axis = 1;
+        output_dims = src_mkl_shape.GetSizesAsMklDnnDims();
+      } else {
+        axis = input_dims - 1;
+        output_dims = src_dims;
+      }
       memory::format layout_type;
       // In MKL, data format passed to mkl softmax op depends on dimension of
       // the input tensor. Here "x" data format in MKL is used for 1 dim tensor,
       // "nc" for 2 dim tensor, "tnc" for 3 dim tensor, "nchw" for 4 dim tensor,
-      // and "ncdhw" for 5 dim tensor. Each of the simbols has the following
+      // and "ncdhw" for 5 dim tensor. Each of the symbols has the following
       // meaning: n = batch, c = channels, t = sequence length, h = height, w =
-      // width, d = depth
+      // width, d = depth. When src tensor is MKL, layout_type here is only used
+      // for setting TF layout type of output tensor. When input is TF Tensor,
+      // layout here is no special sense. We use axis to define on which
+      // dimension to do softmax.
       switch (input_dims) {
         case 1:
           layout_type = memory::format::x;
@@ -81,13 +91,22 @@ class MklSoftmaxOp : public OpKernel {
           layout_type = memory::format::tnc;
           break;
         case 4:
-          layout_type = memory::format::nchw;
+          if (src_mkl_shape.IsMklTensor()) {
+            layout_type = memory::format::nhwc;
+          } else {
+            layout_type = memory::format::nchw;
+          }
           break;
         case 5:
-          layout_type = memory::format::ncdhw;
+          if (src_mkl_shape.IsMklTensor()) {
+            layout_type = memory::format::ndhwc;
+          } else {
+            layout_type = memory::format::ncdhw;
+          }
           break;
         default:
-          OP_REQUIRES_OK(context, errors::Aborted("Input dims must be <= 5 and >=1"));
+          OP_REQUIRES_OK(context,
+                         errors::Aborted("Input dims must be <= 5 and >=1"));
           return;
       }
       // Create softmax memory for src, dst: both are defined in mkl_util.h,
@@ -99,25 +118,17 @@ class MklSoftmaxOp : public OpKernel {
       // construct input Tf layout. For TF layout, although input shape
       // (src_dims) required is in MKL-DNN order, the layout is Tensorflow's
       // layout
-      auto src_md =
-          src_mkl_shape.IsMklTensor()
-              ? src_mkl_shape.GetMklLayout()
-              : memory::desc(src_dims, MklDnnType<T>(), layout_type);
-
-      // src: setting memory descriptor and op memory descriptor
-      // Basically following two functions maps the TF "src_tensor" to mkl
-      // tensor object "src"
+      auto src_md = src_mkl_shape.IsMklTensor()
+                        ? src_mkl_shape.GetMklLayout()
+                        : memory::desc(src_dims, MklDnnType<T>(), layout_type);
+
+      // src: setting memory descriptor
       // following functions are in mkl_util.h
-      // data format is "nc" for src and dst; since the src and dst buffer is
-      // always in 2D shape
       src.SetUsrMem(src_md, &src_tensor);
-      src.SetOpMemDesc(src_dims, layout_type);
 
       // creating a memory descriptor
-      // passing outermost dim as default axis, where the softmax is applied
-      int axis = input_dims - 1;
       auto softmax_fwd_desc = softmax_forward::desc(prop_kind::forward_scoring,
-                                                    src.GetOpMemDesc(), axis);
+                                                    src.GetUsrMemDesc(), axis);
       auto softmax_fwd_pd =
           softmax_forward::primitive_desc(softmax_fwd_desc, cpu_engine);
 
diff --git a/tensorflow/core/kernels/multinomial_op_gpu.cu.cc b/tensorflow/core/kernels/multinomial_op_gpu.cu.cc
index 5cc5877cceb19320023423d35a352c5ba3db13e2..62e38694c8fbe97eb09ccfdca3aa608ec89211ac 100644
--- a/tensorflow/core/kernels/multinomial_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/multinomial_op_gpu.cu.cc
@@ -22,9 +22,10 @@ limitations under the License.
 #include <assert.h>
 #include <stdio.h>
 
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/kernels/random_op.h"
+#include "tensorflow/core/kernels/reduction_gpu_kernels.cu.h"
+#include "tensorflow/core/kernels/reduction_ops_common.h"
 #include "tensorflow/core/lib/random/philox_random.h"
 #include "tensorflow/core/lib/random/random_distributions.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
@@ -67,7 +68,6 @@ struct MultinomialFunctor<GPUDevice, T, OutputType> {
                                                  noises.size(), Dist());
 
 #if defined(EIGEN_HAS_INDEX_LIST)
-    Eigen::IndexList<Eigen::type2index<2>> kTwo;
     Eigen::IndexList<int, int, int> bsc;
     bsc.set(0, batch_size);
     bsc.set(1, num_samples);
@@ -80,7 +80,6 @@ struct MultinomialFunctor<GPUDevice, T, OutputType> {
     Eigen::IndexList<Eigen::type2index<1>, int, Eigen::type2index<1>> oso;
     oso.set(1, num_samples);
 #else
-    Eigen::array<int, 1> kTwo{2};
     Eigen::array<int, 3> bsc{batch_size, num_samples, num_classes};
     Eigen::array<int, 3> boc{batch_size, 1, num_classes};
     Eigen::array<int, 3> oso{1, num_samples, 1};
@@ -98,7 +97,14 @@ struct MultinomialFunctor<GPUDevice, T, OutputType> {
         ((-((To32Bit(noises) + 2e-30f).log())).log());
 
     // Max-reduce along classes for each (batch, sample).
-    To32Bit(maxima).device(d) = To32Bit(scores).reshape(bsc).maximum(kTwo);
+    typedef const Eigen::array<TTypes<float>::Tensor::Index, 1>& ReductionAxes;
+    Constants<GPUDevice> constants;
+    cub::Max op;
+    functor::ReduceImpl<float, cub::Max, float*, const float*, ReductionAxes>(
+        /*ctx=*/ctx, /*out=*/maxima.data(), /*in=*/scores.data(), /*in_rank=*/2,
+        /*in_dim0=*/batch_size * num_samples,
+        /*in_dim1=*/num_classes, /*in_dim2=*/1, /*out_rank=*/1,
+        /*reduction_axes=*/constants.kOne, /*Op=*/op);
 
     // Necessary for atomicMax() inside the kernel.
     output.device(d) = output.constant(0LL);
diff --git a/tensorflow/core/kernels/mutex_ops.cc b/tensorflow/core/kernels/mutex_ops.cc
index ddb7a606c1a7f0264c7c4a9cbb2f97095d9fee01..b06845f13a4d0f8688618c22d12a223e6264f767 100644
--- a/tensorflow/core/kernels/mutex_ops.cc
+++ b/tensorflow/core/kernels/mutex_ops.cc
@@ -45,7 +45,9 @@ class Mutex : public ResourceBase {
     VLOG(2) << "Creating mutex with name " << name << ": " << this;
   }
 
-  string DebugString() override { return strings::StrCat("Mutex ", name_); }
+  string DebugString() const override {
+    return strings::StrCat("Mutex ", name_);
+  }
 
   class LockReleaser {
    public:
@@ -240,10 +242,24 @@ class ConsumeMutexLockOp : public OpKernel {
 
 REGISTER_KERNEL_BUILDER(Name("MutexLock").Device(DEVICE_CPU), MutexLockOp);
 
-REGISTER_KERNEL_BUILDER(Name("MutexV2").Device(DEVICE_CPU),
+REGISTER_KERNEL_BUILDER(Name("MutexLock")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("mutex_lock")
+                            .HostMemory("mutex"),
+                        MutexLockOp);
+
+REGISTER_KERNEL_BUILDER(
+    Name("MutexV2").Device(DEVICE_CPU).HostMemory("resource"),
+    ResourceHandleOp<Mutex>);
+
+REGISTER_KERNEL_BUILDER(Name("MutexV2").Device(DEVICE_GPU),
                         ResourceHandleOp<Mutex>);
 
 REGISTER_KERNEL_BUILDER(Name("ConsumeMutexLock").Device(DEVICE_CPU),
                         ConsumeMutexLockOp);
 
+REGISTER_KERNEL_BUILDER(
+    Name("ConsumeMutexLock").Device(DEVICE_GPU).HostMemory("mutex_lock"),
+    ConsumeMutexLockOp);
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/nccl_ops.cc b/tensorflow/core/kernels/nccl_ops.cc
index 6fdeb224781ed6dbf2cdf63c82037eb04a722cc6..d3bdebfc873650f22e89afe702ead6fc15daed89 100644
--- a/tensorflow/core/kernels/nccl_ops.cc
+++ b/tensorflow/core/kernels/nccl_ops.cc
@@ -91,9 +91,10 @@ class NcclAllReduceOpKernel : public NcclReduceOpBase {
       : NcclReduceOpBase(c) {}
 
   void ComputeAsync(OpKernelContext* c, DoneCallback done) override {
-    const Tensor* in_t = &c->input(0);
-    Tensor* out_t;
-    OP_REQUIRES_OK_ASYNC(c, c->allocate_output(0, in_t->shape(), &out_t), done);
+    const Tensor* input = &c->input(0);
+    Tensor* output;
+    OP_REQUIRES_OK_ASYNC(c, c->allocate_output(0, input->shape(), &output),
+                         done);
 
     auto actual_done = [c, done](Status s) {
       OP_REQUIRES_OK_ASYNC(c, s, done);
@@ -102,10 +103,17 @@ class NcclAllReduceOpKernel : public NcclReduceOpBase {
 
     auto* compute_stream = c->op_device_context()->stream();
     auto* gpu_info = c->device()->tensorflow_gpu_device_info();
+    auto participant = absl::make_unique<NcclManager::Participant>(
+        compute_stream->parent(), compute_stream, gpu_info->event_mgr,
+        gpu_info->gpu_id, input, output, /*global_rank=*/-1,
+        std::move(actual_done));
     NcclManager::instance()->AddToAllReduce(
-        num_devices(), GetCollectiveKey(c), reduction_op(),
-        compute_stream->parent(), gpu_info->gpu_id, gpu_info->event_mgr,
-        compute_stream, in_t, out_t, std::move(actual_done));
+        std::move(participant),
+        {GetCollectiveKey(c),
+         /*num_local_devices=*/num_devices(),
+         /*num_global_devices=*/num_devices(),
+         /*communicator_key=*/""},
+        reduction_op());
   }
 };
 REGISTER_KERNEL_BUILDER(Name("NcclAllReduce").Device(DEVICE_GPU),
@@ -127,10 +135,17 @@ class NcclReduceSendKernel : public NcclReduceOpBase {
 
     auto* compute_stream = c->op_device_context()->stream();
     auto* gpu_info = c->device()->tensorflow_gpu_device_info();
+    auto participant = absl::make_unique<NcclManager::Participant>(
+        compute_stream->parent(), compute_stream, gpu_info->event_mgr,
+        gpu_info->gpu_id, &c->input(0), /*output=*/nullptr, /*global_rank=*/-1,
+        std::move(actual_done));
     NcclManager::instance()->AddReduceSend(
-        num_devices(), GetCollectiveKey(c), reduction_op(),
-        compute_stream->parent(), gpu_info->gpu_id, gpu_info->event_mgr,
-        compute_stream, &c->input(0), std::move(actual_done));
+        std::move(participant),
+        {GetCollectiveKey(c),
+         /*num_local_devices=*/num_devices(),
+         /*num_global_devices=*/num_devices(),
+         /*communicator_key=*/""},
+        reduction_op());
   }
 };
 REGISTER_KERNEL_BUILDER(Name("_NcclReduceSend").Device(DEVICE_GPU),
@@ -145,9 +160,10 @@ class NcclReduceRecvKernel : public NcclReduceOpBase {
       : NcclReduceOpBase(c) {}
 
   void ComputeAsync(OpKernelContext* c, DoneCallback done) override {
-    const Tensor& in_t = c->input(0);
-    Tensor* out_t;
-    OP_REQUIRES_OK_ASYNC(c, c->allocate_output(0, in_t.shape(), &out_t), done);
+    const Tensor* input = &c->input(0);
+    Tensor* output;
+    OP_REQUIRES_OK_ASYNC(c, c->allocate_output(0, input->shape(), &output),
+                         done);
 
     auto actual_done = [c, done](Status s) {
       OP_REQUIRES_OK_ASYNC(c, s, done);
@@ -156,10 +172,17 @@ class NcclReduceRecvKernel : public NcclReduceOpBase {
 
     auto* compute_stream = c->op_device_context()->stream();
     auto* gpu_info = c->device()->tensorflow_gpu_device_info();
+    auto participant = absl::make_unique<NcclManager::Participant>(
+        compute_stream->parent(), compute_stream, gpu_info->event_mgr,
+        gpu_info->gpu_id, input, output, /*global_rank=*/-1,
+        std::move(actual_done));
     NcclManager::instance()->AddReduceRecv(
-        num_devices(), GetCollectiveKey(c), reduction_op(),
-        compute_stream->parent(), gpu_info->gpu_id, gpu_info->event_mgr,
-        compute_stream, &in_t, out_t, std::move(actual_done));
+        std::move(participant),
+        {GetCollectiveKey(c),
+         /*num_local_devices=*/num_devices(),
+         /*num_global_devices=*/num_devices(),
+         /*communicator_key=*/""},
+        reduction_op());
   }
 
  private:
@@ -184,10 +207,15 @@ class NcclBroadcastSendKernel : public NcclAsyncOpBase {
 
     auto* compute_stream = c->op_device_context()->stream();
     auto* gpu_info = c->device()->tensorflow_gpu_device_info();
-    NcclManager::instance()->AddBroadcastSend(
-        num_devices(), GetCollectiveKey(c), compute_stream->parent(),
-        gpu_info->gpu_id, gpu_info->event_mgr, compute_stream, &c->input(0),
+    auto participant = absl::make_unique<NcclManager::Participant>(
+        compute_stream->parent(), compute_stream, gpu_info->event_mgr,
+        gpu_info->gpu_id, &c->input(0), /*output=*/nullptr, /*global_rank=*/-1,
         std::move(actual_done));
+    NcclManager::instance()->AddBroadcastSend(
+        std::move(participant), {GetCollectiveKey(c),
+                                 /*num_local_devices=*/num_devices(),
+                                 /*num_global_devices=*/num_devices(),
+                                 /*communicator_key=*/""});
   }
 };
 REGISTER_KERNEL_BUILDER(Name("_NcclBroadcastSend").Device(DEVICE_GPU),
@@ -206,8 +234,8 @@ class NcclBroadcastRecvKernel : public NcclAsyncOpBase {
     TensorShape shape;
     OP_REQUIRES_OK_ASYNC(
         c, TensorShapeUtils::MakeShape(shape_t.vec<int32>(), &shape), done);
-    Tensor* out_t;
-    OP_REQUIRES_OK_ASYNC(c, c->allocate_output(0, shape, &out_t), done);
+    Tensor* output;
+    OP_REQUIRES_OK_ASYNC(c, c->allocate_output(0, shape, &output), done);
 
     auto actual_done = [c, done](Status s) {
       OP_REQUIRES_OK_ASYNC(c, s, done);
@@ -216,10 +244,15 @@ class NcclBroadcastRecvKernel : public NcclAsyncOpBase {
 
     auto* compute_stream = c->op_device_context()->stream();
     auto* gpu_info = c->device()->tensorflow_gpu_device_info();
-    NcclManager::instance()->AddBroadcastRecv(
-        num_devices(), GetCollectiveKey(c), compute_stream->parent(),
-        gpu_info->gpu_id, gpu_info->event_mgr, compute_stream, out_t,
+    auto participant = absl::make_unique<NcclManager::Participant>(
+        compute_stream->parent(), compute_stream, gpu_info->event_mgr,
+        gpu_info->gpu_id, /*input=*/nullptr, output, /*global_rank=*/-1,
         std::move(actual_done));
+    NcclManager::instance()->AddBroadcastSend(
+        std::move(participant), {GetCollectiveKey(c),
+                                 /*num_local_devices=*/num_devices(),
+                                 /*num_global_devices=*/num_devices(),
+                                 /*communicator_key=*/""});
   }
 };
 REGISTER_KERNEL_BUILDER(
diff --git a/tensorflow/core/kernels/neon/BUILD b/tensorflow/core/kernels/neon/BUILD
index 313d40c082b3e334a01ba97eaf4449e1940b013a..6665152e3e3c7592cda8e0a09dd75d4b2409d6c4 100644
--- a/tensorflow/core/kernels/neon/BUILD
+++ b/tensorflow/core/kernels/neon/BUILD
@@ -24,7 +24,6 @@ tf_kernel_library(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:nn_ops_op_lib",
         "//tensorflow/core/kernels:bounds_check",
         "//tensorflow/core/kernels:ops_util",
         "@gemmlowp",
diff --git a/tensorflow/core/kernels/neon/neon_depthwise_conv_op.cc b/tensorflow/core/kernels/neon/neon_depthwise_conv_op.cc
index 0e820bbb6208ae9c13ac2fb33f67590b9e66ba7e..b218f62ddd9a02026bd654fd76dd2223152da9a8 100644
--- a/tensorflow/core/kernels/neon/neon_depthwise_conv_op.cc
+++ b/tensorflow/core/kernels/neon/neon_depthwise_conv_op.cc
@@ -19,6 +19,7 @@ limitations under the License.
 
 #define GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK
 #include "public/gemmlowp.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -26,7 +27,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/neon/depthwiseconv_float.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status.h"
diff --git a/tensorflow/core/kernels/nextafter_op.cc b/tensorflow/core/kernels/nextafter_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6166a1053f32c0b0b7fba4ceda69ad3126346f65
--- /dev/null
+++ b/tensorflow/core/kernels/nextafter_op.cc
@@ -0,0 +1,37 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/nextafter_op.h"
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+
+REGISTER2(BinaryOp, CPU, "NextAfter", functor::nextafter, float, double);
+
+#if TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNEL(TYPE)                                     \
+  REGISTER_KERNEL_BUILDER(                                             \
+      Name("NextAfter").Device(DEVICE_SYCL).TypeConstraint<TYPE>("T"), \
+      BinaryOp<SYCLDevice, functor::nextafter<TYPE>>);
+REGISTER_SYCL_KERNEL(float);
+REGISTER_SYCL_KERNEL(double);
+#undef REGISTER_SYCL_KERNEL
+#endif  // TENSORFLOW_USE_SYCL
+
+#if GOOGLE_CUDA
+REGISTER2(BinaryOp, GPU, "NextAfter", functor::nextafter, float, double);
+#endif  // GOOGLE_CUDA
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/nextafter_op.h b/tensorflow/core/kernels/nextafter_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..64374980f2d5aec7c2d5a9011f14280cd6c394ed
--- /dev/null
+++ b/tensorflow/core/kernels/nextafter_op.h
@@ -0,0 +1,40 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_NEXTAFTER_OP_H_
+#define TENSORFLOW_CORE_KERNELS_NEXTAFTER_OP_H_
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/kernels/cwise_ops.h"
+
+namespace tensorflow {
+namespace functor {
+
+template <typename T>
+struct nextafter_op {
+  EIGEN_EMPTY_STRUCT_CTOR(nextafter_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T operator()(const T& x1,
+                                                           const T& x2) const {
+    return std::nextafter(x1, x2);
+  }
+};
+
+template <typename T>
+struct nextafter : base<T, nextafter_op<T>> {};
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_NEXTAFTER_OP_H_
diff --git a/tensorflow/core/kernels/nextafter_op_gpu.cu.cc b/tensorflow/core/kernels/nextafter_op_gpu.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d2321c6a882c425f9851cb59a48e5b4c5aed9cb5
--- /dev/null
+++ b/tensorflow/core/kernels/nextafter_op_gpu.cu.cc
@@ -0,0 +1,29 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+#include "tensorflow/core/kernels/nextafter_op.h"
+
+namespace tensorflow {
+namespace functor {
+
+DEFINE_BINARY2(nextafter, float, double);
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/non_max_suppression_op.cc b/tensorflow/core/kernels/non_max_suppression_op.cc
index 37f615abd97044caa7703837714840b8d451d420..482b227ccdc8316cf336eb1f4761c6c866da7399 100644
--- a/tensorflow/core/kernels/non_max_suppression_op.cc
+++ b/tensorflow/core/kernels/non_max_suppression_op.cc
@@ -24,12 +24,12 @@ limitations under the License.
 #include <vector>
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/platform/logging.h"
@@ -74,6 +74,34 @@ static inline void ParseAndCheckBoxSizes(OpKernelContext* context,
               errors::InvalidArgument("boxes must have 4 columns"));
 }
 
+static inline void CheckCombinedNMSScoreSizes(OpKernelContext* context,
+                                              int num_boxes,
+                                              const Tensor& scores) {
+  // The shape of 'scores' is [batch_size, num_boxes, num_classes]
+  OP_REQUIRES(context, scores.dims() == 3,
+              errors::InvalidArgument("scores must be 3-D",
+                                      scores.shape().DebugString()));
+  OP_REQUIRES(context, scores.dim_size(1) == num_boxes,
+              errors::InvalidArgument("scores has incompatible shape"));
+}
+
+static inline void ParseAndCheckCombinedNMSBoxSizes(OpKernelContext* context,
+                                                    const Tensor& boxes,
+                                                    int* num_boxes,
+                                                    const int num_classes) {
+  // The shape of 'boxes' is [batch_size, num_boxes, q, 4]
+  OP_REQUIRES(context, boxes.dims() == 4,
+              errors::InvalidArgument("boxes must be 4-D",
+                                      boxes.shape().DebugString()));
+
+  bool box_check = boxes.dim_size(2) == 1 || boxes.dim_size(2) == num_classes;
+  OP_REQUIRES(context, box_check,
+              errors::InvalidArgument(
+                  "third dimension of boxes must be either 1 or num classes"));
+  *num_boxes = boxes.dim_size(1);
+  OP_REQUIRES(context, boxes.dim_size(3) == 4,
+              errors::InvalidArgument("boxes must have 4 columns"));
+}
 // Return intersection-over-union overlap between boxes i and j
 template <typename T>
 static inline bool IOUGreaterThanThreshold(
@@ -195,6 +223,216 @@ void DoNonMaxSuppressionOp(
   std::copy_n(selected.begin(), selected.size(), output_indices_data.data());
 }
 
+void BatchedNonMaxSuppressionOp(
+    OpKernelContext* context, const Tensor& inp_boxes, const Tensor& inp_scores,
+    int num_boxes, const int max_size_per_class, const int total_size_per_batch,
+    const float score_threshold, const float iou_threshold,
+    bool pad_per_class = false) {
+  int q = inp_boxes.dim_size(2);
+  int num_classes = inp_scores.dim_size(2);
+  const int num_batches = inp_boxes.dim_size(0);
+
+  // Default clip window of [0, 0, 1, 1] if none specified
+  std::vector<float> clip_window{0, 0, 1, 1};
+
+  // [num_batches, per_batch_size * 4]
+  std::vector<std::vector<float>> nmsed_boxes(num_batches);
+  // [num_batches, per_batch_size]
+  std::vector<std::vector<float>> nmsed_scores(num_batches);
+  // [num_batches, per_batch_size]
+  std::vector<std::vector<float>> nmsed_classes(num_batches);
+  // [num_batches]
+  std::vector<int> final_valid_detections;
+
+  int per_batch_size = total_size_per_batch;
+
+  // perform non_max_suppression operation for each batch independently
+  for (int batch = 0; batch < num_batches; ++batch) {
+    // dims of per_batch_boxes [num_boxes, q, 4]
+    Tensor per_batch_boxes = inp_boxes.Slice(batch, batch + 1);
+    // dims of per_batch_scores [num_boxes, num_classes]
+    Tensor per_batch_scores = inp_scores.Slice(batch, batch + 1);
+
+    struct ResultCandidate {
+      int box_index;
+      float score;
+      int class_idx;
+      float box_coord[4];
+    };
+
+    std::vector<ResultCandidate> result_candidate_vec;
+
+    float* scores_data = per_batch_scores.unaligned_flat<float>().data();
+    float* boxes_data = per_batch_boxes.unaligned_flat<float>().data();
+
+    // Iterate through all classes
+    for (int class_idx = 0; class_idx < num_classes; ++class_idx) {
+      std::vector<float> class_scores_data;
+      class_scores_data.reserve(num_boxes);
+      std::vector<float> class_boxes_data;
+      class_boxes_data.reserve(num_boxes * 4);
+
+      for (int box = 0; box < num_boxes; ++box) {
+        // Get the scores per class
+        // class_scores_data dim is [num_boxes].
+        class_scores_data.push_back(scores_data[box * num_classes + class_idx]);
+        for (int cid = 0; cid < 4; ++cid) {
+          if (q > 1) {
+            // Get the boxes per class. class_boxes_data dims is [num_boxes, 4]
+            class_boxes_data.push_back(
+                boxes_data[(box * q + class_idx) * 4 + cid]);
+          } else {
+            class_boxes_data.push_back(boxes_data[box * 4 + cid]);
+          }
+        }
+      }
+
+      // Copy class_boxes_data to a tensor
+      TensorShape boxesShape({num_boxes, 4});
+      Tensor boxes(per_batch_boxes.dtype(), boxesShape);
+      std::copy_n(class_boxes_data.begin(), class_boxes_data.size(),
+                  boxes.unaligned_flat<float>().data());
+
+      const int size_per_class = std::min(max_size_per_class, num_boxes);
+      // Do NMS, get the candidate indices of form vector<int>
+      // Data structure for selection candidate in NMS.
+      struct Candidate {
+        int box_index;
+        float score;
+      };
+      auto cmp = [](const Candidate bs_i, const Candidate bs_j) {
+        return bs_i.score > bs_j.score;
+      };
+      std::vector<Candidate> candidate_vector;
+      for (int i = 0; i < class_scores_data.size(); ++i) {
+        if (class_scores_data[i] > score_threshold) {
+          candidate_vector.emplace_back(Candidate({i, class_scores_data[i]}));
+        }
+      }
+
+      std::vector<int> selected;
+      std::vector<float> selected_boxes;
+      Candidate next_candidate;
+
+      std::sort(candidate_vector.begin(), candidate_vector.end(), cmp);
+      const Tensor const_boxes = boxes;
+      typename TTypes<float, 2>::ConstTensor boxes_data =
+          const_boxes.tensor<float, 2>();
+      int candidate_idx = 0;
+      while (selected.size() < size_per_class &&
+             candidate_idx < candidate_vector.size()) {
+        next_candidate = candidate_vector[candidate_idx++];
+
+        // Overlapping boxes are likely to have similar scores,
+        // therefore we iterate through the previously selected boxes backwards
+        // in order to see if `next_candidate` should be suppressed.
+        bool should_select = true;
+        for (int j = selected.size() - 1; j >= 0; --j) {
+          if (IOUGreaterThanThreshold(boxes_data, next_candidate.box_index,
+                                      selected[j], iou_threshold)) {
+            should_select = false;
+            break;
+          }
+        }
+
+        if (should_select) {
+          selected.push_back(next_candidate.box_index);
+          // Add the selected box to the result candidate. Sorted by score
+          int id = next_candidate.box_index;
+          ResultCandidate rc = {next_candidate.box_index,
+                                next_candidate.score,
+                                class_idx,
+                                {boxes_data(id, 0), boxes_data(id, 1),
+                                 boxes_data(id, 2), boxes_data(id, 3)}};
+          result_candidate_vec.push_back(rc);
+        }
+      }
+    }
+
+    auto rc_cmp = [](const ResultCandidate rc_i, const ResultCandidate rc_j) {
+      return rc_i.score > rc_j.score;
+    };
+    std::sort(result_candidate_vec.begin(), result_candidate_vec.end(), rc_cmp);
+
+    int max_detections = 0;
+    // If pad_per_class is false, we always pad to max_total_size
+    if (!pad_per_class) {
+      max_detections =
+          std::min((int)result_candidate_vec.size(), total_size_per_batch);
+      per_batch_size = total_size_per_batch;
+    } else {
+      per_batch_size =
+          std::min(total_size_per_batch, max_size_per_class * num_classes);
+      max_detections =
+          std::min(per_batch_size, (int)result_candidate_vec.size());
+    }
+
+    final_valid_detections.push_back(max_detections);
+
+    int curr_total_size = max_detections;
+    int result_idx = 0;
+    // Pick the top max_detections values
+    while (curr_total_size > 0 && result_idx < result_candidate_vec.size()) {
+      ResultCandidate next_candidate = result_candidate_vec[result_idx++];
+      // Add to final output vectors
+      nmsed_boxes[batch].push_back(
+          std::max(std::min(next_candidate.box_coord[0], clip_window[2]),
+                   clip_window[0]));
+      nmsed_boxes[batch].push_back(
+          std::max(std::min(next_candidate.box_coord[1], clip_window[3]),
+                   clip_window[1]));
+      nmsed_boxes[batch].push_back(
+          std::max(std::min(next_candidate.box_coord[2], clip_window[2]),
+                   clip_window[0]));
+      nmsed_boxes[batch].push_back(
+          std::max(std::min(next_candidate.box_coord[3], clip_window[3]),
+                   clip_window[1]));
+      nmsed_scores[batch].push_back(next_candidate.score);
+      nmsed_classes[batch].push_back(next_candidate.class_idx);
+      curr_total_size--;
+    }
+
+    nmsed_boxes[batch].resize(per_batch_size * 4, 0);
+    nmsed_scores[batch].resize(per_batch_size, 0);
+    nmsed_classes[batch].resize(per_batch_size, 0);
+  }
+
+  Tensor* nmsed_boxes_t = nullptr;
+  TensorShape boxes_shape({num_batches, per_batch_size, 4});
+  OP_REQUIRES_OK(context,
+                 context->allocate_output(0, boxes_shape, &nmsed_boxes_t));
+  auto nmsed_boxes_flat = nmsed_boxes_t->template flat<float>();
+
+  Tensor* nmsed_scores_t = nullptr;
+  TensorShape scores_shape({num_batches, per_batch_size});
+  OP_REQUIRES_OK(context,
+                 context->allocate_output(1, scores_shape, &nmsed_scores_t));
+  auto nmsed_scores_flat = nmsed_scores_t->template flat<float>();
+
+  Tensor* nmsed_classes_t = nullptr;
+  OP_REQUIRES_OK(context,
+                 context->allocate_output(2, scores_shape, &nmsed_classes_t));
+  auto nmsed_classes_flat = nmsed_classes_t->template flat<float>();
+
+  Tensor* valid_detections_t = nullptr;
+  TensorShape valid_detections_shape({num_batches});
+  OP_REQUIRES_OK(context, context->allocate_output(3, valid_detections_shape,
+                                                   &valid_detections_t));
+  auto valid_detections_flat = valid_detections_t->template flat<int>();
+
+  for (int i = 0; i < num_batches; ++i) {
+    valid_detections_flat(i) = final_valid_detections[i];
+    for (int j = 0; j < per_batch_size; ++j) {
+      nmsed_scores_flat(i * per_batch_size + j) = nmsed_scores[i][j];
+      nmsed_classes_flat(i * per_batch_size + j) = nmsed_classes[i][j];
+      for (int k = 0; k < 4; ++k) {
+        nmsed_boxes_flat(i * per_batch_size * 4 + j * 4 + k) =
+            nmsed_boxes[i][j * 4 + k];
+      }
+    }
+  }
+}
+
 }  // namespace
 
 template <typename Device>
@@ -435,6 +673,74 @@ class NonMaxSuppressionWithOverlapsOp : public OpKernel {
   }
 };
 
+template <typename Device>
+class CombinedNonMaxSuppressionOp : public OpKernel {
+ public:
+  explicit CombinedNonMaxSuppressionOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("pad_per_class", &pad_per_class_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    // boxes: [batch_size, num_anchors, q, 4]
+    const Tensor& boxes = context->input(0);
+    // scores: [batch_size, num_anchors, num_classes]
+    const Tensor& scores = context->input(1);
+    OP_REQUIRES(
+        context, (boxes.dim_size(0) == scores.dim_size(0)),
+        errors::InvalidArgument("boxes and scores must have same batch size"));
+
+    // max_output_size: scalar
+    const Tensor& max_output_size = context->input(2);
+    OP_REQUIRES(
+        context, TensorShapeUtils::IsScalar(max_output_size.shape()),
+        errors::InvalidArgument("max_size_per_class must be 0-D, got shape ",
+                                max_output_size.shape().DebugString()));
+    const int max_size_per_class = max_output_size.scalar<int>()();
+    // max_total_size: scalar
+    const Tensor& max_total_size = context->input(3);
+    OP_REQUIRES(
+        context, TensorShapeUtils::IsScalar(max_total_size.shape()),
+        errors::InvalidArgument("max_total_size must be 0-D, got shape ",
+                                max_total_size.shape().DebugString()));
+    const int max_total_size_per_batch = max_total_size.scalar<int>()();
+    OP_REQUIRES(context, max_total_size_per_batch > 0,
+                errors::InvalidArgument("max_total_size must be > 0"));
+    // iou_threshold: scalar
+    const Tensor& iou_threshold = context->input(4);
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(iou_threshold.shape()),
+                errors::InvalidArgument("iou_threshold must be 0-D, got shape ",
+                                        iou_threshold.shape().DebugString()));
+    const float iou_threshold_val = iou_threshold.scalar<float>()();
+
+    // score_threshold: scalar
+    const Tensor& score_threshold = context->input(5);
+    OP_REQUIRES(
+        context, TensorShapeUtils::IsScalar(score_threshold.shape()),
+        errors::InvalidArgument("score_threshold must be 0-D, got shape ",
+                                score_threshold.shape().DebugString()));
+    const float score_threshold_val = score_threshold.scalar<float>()();
+
+    OP_REQUIRES(context, iou_threshold_val >= 0 && iou_threshold_val <= 1,
+                errors::InvalidArgument("iou_threshold must be in [0, 1]"));
+    int num_boxes = 0;
+    const int num_classes = scores.dim_size(2);
+    ParseAndCheckCombinedNMSBoxSizes(context, boxes, &num_boxes, num_classes);
+    CheckCombinedNMSScoreSizes(context, num_boxes, scores);
+
+    if (!context->status().ok()) {
+      return;
+    }
+    BatchedNonMaxSuppressionOp(context, boxes, scores, num_boxes,
+                               max_size_per_class, max_total_size_per_batch,
+                               score_threshold_val, iou_threshold_val,
+                               pad_per_class_);
+  }
+
+ private:
+  bool pad_per_class_;
+};
+
 REGISTER_KERNEL_BUILDER(Name("NonMaxSuppression").Device(DEVICE_CPU),
                         NonMaxSuppressionOp<CPUDevice>);
 
@@ -466,4 +772,7 @@ REGISTER_KERNEL_BUILDER(
     Name("NonMaxSuppressionWithOverlaps").Device(DEVICE_CPU),
     NonMaxSuppressionWithOverlapsOp<CPUDevice>);
 
+REGISTER_KERNEL_BUILDER(Name("CombinedNonMaxSuppression").Device(DEVICE_CPU),
+                        CombinedNonMaxSuppressionOp<CPUDevice>);
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/non_max_suppression_op_test.cc b/tensorflow/core/kernels/non_max_suppression_op_test.cc
index c321849f405f5ff966f530ce6ada1c8925ccf1d4..242e41b2652f6200b7d326f4845d14a58e61f9ea 100644
--- a/tensorflow/core/kernels/non_max_suppression_op_test.cc
+++ b/tensorflow/core/kernels/non_max_suppression_op_test.cc
@@ -861,4 +861,471 @@ TEST_F(NonMaxSuppressionWithOverlapsOpTest, TestEmptyInput) {
   test::ExpectTensorEqual<int>(expected, *GetOutput(0));
 }
 
+class CombinedNonMaxSuppressionOpTest : public OpsTestBase {
+ protected:
+  void MakeOp(bool pad_per_class = false) {
+    TF_EXPECT_OK(NodeDefBuilder("combined_non_max_suppression_op",
+                                "CombinedNonMaxSuppression")
+                     .Input(FakeInput(DT_FLOAT))
+                     .Input(FakeInput(DT_FLOAT))
+                     .Input(FakeInput(DT_INT32))
+                     .Input(FakeInput(DT_INT32))
+                     .Input(FakeInput(DT_FLOAT))
+                     .Input(FakeInput(DT_FLOAT))
+                     .Attr("pad_per_class", pad_per_class)
+                     .Finalize(node_def()));
+    TF_EXPECT_OK(InitOp());
+  }
+};
+
+TEST_F(CombinedNonMaxSuppressionOpTest, TestEmptyInput) {
+  MakeOp();
+  AddInputFromArray<float>(TensorShape({0, 0, 0, 4}), {});
+  AddInputFromArray<float>(TensorShape({0, 0, 0}), {});
+  AddInputFromArray<int>(TensorShape({}), {30});
+  AddInputFromArray<int>(TensorShape({}), {10});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  AddInputFromArray<float>(TensorShape({}), {0.0f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // boxes
+  Tensor expected_boxes(allocator(), DT_FLOAT, TensorShape({0, 10, 4}));
+  test::FillValues<float>(&expected_boxes, {});
+  test::ExpectTensorEqual<float>(expected_boxes, *GetOutput(0));
+
+  // scores
+  Tensor expected_scores(allocator(), DT_FLOAT, TensorShape({0, 10}));
+  test::FillValues<float>(&expected_scores, {});
+  test::ExpectTensorEqual<float>(expected_scores, *GetOutput(1));
+
+  // classes
+  Tensor expected_classes(allocator(), DT_FLOAT, TensorShape({0, 10}));
+  test::FillValues<float>(&expected_classes, {});
+  test::ExpectTensorEqual<float>(expected_classes, *GetOutput(2));
+
+  // valid
+  Tensor expected_valid_d(allocator(), DT_INT32, TensorShape({0}));
+  test::FillValues<int>(&expected_valid_d, {});
+  test::ExpectTensorEqual<int>(expected_valid_d, *GetOutput(3));
+}
+
+TEST_F(CombinedNonMaxSuppressionOpTest, TestSelectFromThreeClusters) {
+  MakeOp();
+  AddInputFromArray<float>(
+      TensorShape({1, 6, 1, 4}),
+      {0, 0,    0.1, 0.1, 0, 0.01f, 0.1, 0.11f, 0, -0.01, 0.1, 0.09f,
+       0, 0.11, 0.1, 0.2, 0, 0.12f, 0.1, 0.21f, 0, 0.3,   1,   0.4});
+  AddInputFromArray<float>(TensorShape({1, 6, 1}),
+                           {.9f, .75f, .6f, .95f, .5f, .3f});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  AddInputFromArray<float>(TensorShape({}), {0.0f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // boxes
+  Tensor expected_boxes(allocator(), DT_FLOAT, TensorShape({1, 3, 4}));
+  test::FillValues<float>(&expected_boxes,
+                          {0, 0.11, 0.1, 0.2, 0, 0, 0.1, 0.1, 0, 0.3, 1, 0.4});
+  test::ExpectTensorEqual<float>(expected_boxes, *GetOutput(0));
+  // scores
+  Tensor expected_scores(allocator(), DT_FLOAT, TensorShape({1, 3}));
+  test::FillValues<float>(&expected_scores, {0.95, 0.9, 0.3});
+  test::ExpectTensorEqual<float>(expected_scores, *GetOutput(1));
+  // classes
+  Tensor expected_classes(allocator(), DT_FLOAT, TensorShape({1, 3}));
+  test::FillValues<float>(&expected_classes, {0, 0, 0});
+  test::ExpectTensorEqual<float>(expected_classes, *GetOutput(2));
+  // valid
+  Tensor expected_valid_d(allocator(), DT_INT32, TensorShape({1}));
+  test::FillValues<int>(&expected_valid_d, {3});
+  test::ExpectTensorEqual<int>(expected_valid_d, *GetOutput(3));
+}
+
+TEST_F(CombinedNonMaxSuppressionOpTest,
+       TestSelectFromThreeClustersWithScoreThreshold) {
+  MakeOp();
+  AddInputFromArray<float>(
+      TensorShape({1, 6, 1, 4}),
+      {0, 0,    0.1, 0.1, 0, 0.01f, 0.1, 0.11f, 0, -0.01, 0.1, 0.09f,
+       0, 0.11, 0.1, 0.2, 0, 0.12f, 0.1, 0.21f, 0, 0.3,   1,   0.4});
+  AddInputFromArray<float>(TensorShape({1, 6, 1}),
+                           {.9f, .75f, .6f, .95f, .5f, .3f});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  AddInputFromArray<float>(TensorShape({}), {0.4f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // boxes
+  Tensor expected_boxes(allocator(), DT_FLOAT, TensorShape({1, 3, 4}));
+  test::FillValues<float>(&expected_boxes,
+                          {0, 0.11, 0.1, 0.2, 0, 0, 0.1, 0.1, 0, 0, 0, 0});
+  test::ExpectTensorEqual<float>(expected_boxes, *GetOutput(0));
+  // scores
+  Tensor expected_scores(allocator(), DT_FLOAT, TensorShape({1, 3}));
+  test::FillValues<float>(&expected_scores, {0.95, 0.9, 0});
+  test::ExpectTensorEqual<float>(expected_scores, *GetOutput(1));
+  // classes
+  Tensor expected_classes(allocator(), DT_FLOAT, TensorShape({1, 3}));
+  test::FillValues<float>(&expected_classes, {0, 0, 0});
+  test::ExpectTensorEqual<float>(expected_classes, *GetOutput(2));
+  // valid
+  Tensor expected_valid_d(allocator(), DT_INT32, TensorShape({1}));
+  test::FillValues<int>(&expected_valid_d, {2});
+  test::ExpectTensorEqual<int>(expected_valid_d, *GetOutput(3));
+}
+
+TEST_F(CombinedNonMaxSuppressionOpTest,
+       TestSelectFromThreeClustersWithScoreThresholdZeroScores) {
+  MakeOp();
+  AddInputFromArray<float>(
+      TensorShape({1, 6, 1, 4}),
+      {0, 0,    0.1, 0.1, 0, 0.01f, 0.1, 0.11f, 0, -0.01, 0.1, 0.09f,
+       0, 0.11, 0.1, 0.2, 0, 0.12f, 0.1, 0.21f, 0, 0.3,   1,   0.4});
+  AddInputFromArray<float>(TensorShape({1, 6, 1}),
+                           {.1f, 0, 0, .3f, .2f, -5.0f});
+  // If we ask for more boxes than we actually expect to get back;
+  // should still only get 2 boxes back.
+  AddInputFromArray<int>(TensorShape({}), {4});
+  AddInputFromArray<int>(TensorShape({}), {5});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  AddInputFromArray<float>(TensorShape({}), {-3.0f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // boxes
+  Tensor expected_boxes(allocator(), DT_FLOAT, TensorShape({1, 5, 4}));
+  test::FillValues<float>(
+      &expected_boxes,
+      {
+          0, 0.11, 0.1, 0.2, 0, 0, 0.1, 0.1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      });
+  test::ExpectTensorEqual<float>(expected_boxes, *GetOutput(0));
+  // scores
+  Tensor expected_scores(allocator(), DT_FLOAT, TensorShape({1, 5}));
+  test::FillValues<float>(&expected_scores, {0.3, 0.1, 0, 0, 0});
+  test::ExpectTensorEqual<float>(expected_scores, *GetOutput(1));
+  // classes
+  Tensor expected_classes(allocator(), DT_FLOAT, TensorShape({1, 5}));
+  test::FillValues<float>(&expected_classes, {0, 0, 0, 0, 0});
+  test::ExpectTensorEqual<float>(expected_classes, *GetOutput(2));
+  // valid
+  Tensor expected_valid_d(allocator(), DT_INT32, TensorShape({1}));
+  test::FillValues<int>(&expected_valid_d, {2});
+  test::ExpectTensorEqual<int>(expected_valid_d, *GetOutput(3));
+}
+
+TEST_F(CombinedNonMaxSuppressionOpTest, TestSelectSingleBox) {
+  MakeOp();
+  AddInputFromArray<float>(TensorShape({1, 1, 1, 4}), {0, 0, 1, 1});
+  AddInputFromArray<float>(TensorShape({1, 1, 1}), {.9f});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<int>(TensorShape({}), {1});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  AddInputFromArray<float>(TensorShape({}), {0.0f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // boxes
+  Tensor expected_boxes(allocator(), DT_FLOAT, TensorShape({1, 1, 4}));
+  test::FillValues<float>(&expected_boxes, {0, 0, 1, 1});
+  test::ExpectTensorEqual<float>(expected_boxes, *GetOutput(0));
+  // scores
+  Tensor expected_scores(allocator(), DT_FLOAT, TensorShape({1, 1}));
+  test::FillValues<float>(&expected_scores, {0.9});
+  test::ExpectTensorEqual<float>(expected_scores, *GetOutput(1));
+  // classes
+  Tensor expected_classes(allocator(), DT_FLOAT, TensorShape({1, 1}));
+  test::FillValues<float>(&expected_classes, {0});
+  test::ExpectTensorEqual<float>(expected_classes, *GetOutput(2));
+  // valid
+  Tensor expected_valid_d(allocator(), DT_INT32, TensorShape({1}));
+  test::FillValues<int>(&expected_valid_d, {1});
+  test::ExpectTensorEqual<int>(expected_valid_d, *GetOutput(3));
+}
+
+TEST_F(CombinedNonMaxSuppressionOpTest,
+       TestSelectFromTwoBatchesWithScoreThreshold) {
+  MakeOp();
+  AddInputFromArray<float>(
+      TensorShape({2, 6, 1, 4}),
+      {0, 0,    0.1, 0.1, 0, 0.01f, 0.1, 0.11f, 0, -0.01, 0.1, 0.09f,
+       0, 0.11, 0.1, 0.2, 0, 0.12f, 0.1, 0.21f, 0, 0.3,   1,   0.4,
+       0, 0,    0.2, 0.2, 0, 0.02f, 0.2, 0.22f, 0, -0.02, 0.2, 0.19f,
+       0, 0.21, 0.2, 0.3, 0, 0.22f, 0.2, 0.31f, 0, 0.4,   1,   0.5});
+  AddInputFromArray<float>(
+      TensorShape({2, 6, 1}),
+      {.9f, .75f, .6f, .95f, .5f, .3f, .9f, .75f, .6f, .95f, .5f, .3f});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  AddInputFromArray<float>(TensorShape({}), {0.4f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // boxes
+  Tensor expected_boxes(allocator(), DT_FLOAT, TensorShape({2, 3, 4}));
+  test::FillValues<float>(&expected_boxes,
+                          {0, 0.11, 0.1, 0.2, 0, 0, 0.1, 0.1, 0, 0, 0, 0,
+                           0, 0.21, 0.2, 0.3, 0, 0, 0.2, 0.2, 0, 0, 0, 0});
+  test::ExpectTensorEqual<float>(expected_boxes, *GetOutput(0));
+  // scores
+  Tensor expected_scores(allocator(), DT_FLOAT, TensorShape({2, 3}));
+  test::FillValues<float>(&expected_scores, {0.95, 0.9, 0, 0.95, 0.9, 0});
+  test::ExpectTensorEqual<float>(expected_scores, *GetOutput(1));
+  // classes
+  Tensor expected_classes(allocator(), DT_FLOAT, TensorShape({2, 3}));
+  test::FillValues<float>(&expected_classes, {0, 0, 0, 0, 0, 0});
+  test::ExpectTensorEqual<float>(expected_classes, *GetOutput(2));
+  // valid
+  Tensor expected_valid_d(allocator(), DT_INT32, TensorShape({2}));
+  test::FillValues<int>(&expected_valid_d, {2, 2});
+  test::ExpectTensorEqual<int>(expected_valid_d, *GetOutput(3));
+}
+
+TEST_F(CombinedNonMaxSuppressionOpTest, TestSelectFromTwoBatchesTwoClasses) {
+  MakeOp();
+  AddInputFromArray<float>(
+      TensorShape({2, 6, 1, 4}),
+      {0, 0,    0.1, 0.1, 0, 0.01f, 0.1, 0.11f, 0, -0.01, 0.1, 0.09f,
+       0, 0.11, 0.1, 0.2, 0, 0.12f, 0.1, 0.21f, 0, 0.3,   1,   0.4,
+       0, 0,    0.2, 0.2, 0, 0.02f, 0.2, 0.22f, 0, -0.02, 0.2, 0.19f,
+       0, 0.21, 0.2, 0.3, 0, 0.22f, 0.2, 0.31f, 0, 0.4,   1,   0.5});
+  AddInputFromArray<float>(TensorShape({2, 6, 2}),
+                           {0.1f, 0.9f, 0.75f, 0.8f, 0.6f, 0.3f, 0.95f, 0.1f,
+                            0.5f, 0.5f, 0.3f,  0.1f, 0.1f, 0.9f, 0.75f, 0.8f,
+                            0.6f, 0.3f, 0.95f, 0.1f, 0.5f, 0.5f, 0.3f,  0.1f});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  AddInputFromArray<float>(TensorShape({}), {0.0f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // boxes
+  Tensor expected_boxes(allocator(), DT_FLOAT, TensorShape({2, 3, 4}));
+  test::FillValues<float>(
+      &expected_boxes,
+      {0, 0.11, 0.1, 0.2, 0, 0, 0.1, 0.1, 0, 0.01f, 0.1, 0.11f,
+       0, 0.21, 0.2, 0.3, 0, 0, 0.2, 0.2, 0, 0.02f, 0.2, 0.22f});
+  test::ExpectTensorEqual<float>(expected_boxes, *GetOutput(0));
+  // scores
+  Tensor expected_scores(allocator(), DT_FLOAT, TensorShape({2, 3}));
+  test::FillValues<float>(&expected_scores, {0.95, 0.9, 0.75, 0.95, 0.9, 0.75});
+  test::ExpectTensorEqual<float>(expected_scores, *GetOutput(1));
+  // classes
+  Tensor expected_classes(allocator(), DT_FLOAT, TensorShape({2, 3}));
+  test::FillValues<float>(&expected_classes, {0, 1, 0, 0, 1, 0});
+  test::ExpectTensorEqual<float>(expected_classes, *GetOutput(2));
+  // valid
+  Tensor expected_valid_d(allocator(), DT_INT32, TensorShape({2}));
+  test::FillValues<int>(&expected_valid_d, {3, 3});
+  test::ExpectTensorEqual<int>(expected_valid_d, *GetOutput(3));
+}
+
+TEST_F(CombinedNonMaxSuppressionOpTest,
+       TestSelectFromTwoBatchesTwoClassesWithScoreThreshold) {
+  MakeOp();
+  AddInputFromArray<float>(
+      TensorShape({2, 6, 1, 4}),
+      {0, 0,    0.1, 0.1, 0, 0.01f, 0.1, 0.11f, 0, -0.01, 0.1, 0.09f,
+       0, 0.11, 0.1, 0.2, 0, 0.12f, 0.1, 0.21f, 0, 0.3,   1,   0.4,
+       0, 0,    0.2, 0.2, 0, 0.02f, 0.2, 0.22f, 0, -0.02, 0.2, 0.19f,
+       0, 0.21, 0.2, 0.3, 0, 0.22f, 0.2, 0.31f, 0, 0.4,   1,   0.5});
+  AddInputFromArray<float>(TensorShape({2, 6, 2}),
+                           {0.1f, 0.9f, 0.75f, 0.8f, 0.6f, 0.3f, 0.95f, 0.1f,
+                            0.5f, 0.5f, 0.3f,  0.1f, 0.1f, 0.9f, 0.75f, 0.8f,
+                            0.6f, 0.3f, 0.95f, 0.1f, 0.5f, 0.5f, 0.3f,  0.1f});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  AddInputFromArray<float>(TensorShape({}), {0.8f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // boxes
+  Tensor expected_boxes(allocator(), DT_FLOAT, TensorShape({2, 3, 4}));
+  test::FillValues<float>(&expected_boxes,
+                          {0, 0.11, 0.1, 0.2, 0, 0, 0.1, 0.1, 0, 0, 0, 0,
+                           0, 0.21, 0.2, 0.3, 0, 0, 0.2, 0.2, 0, 0, 0, 0});
+  test::ExpectTensorEqual<float>(expected_boxes, *GetOutput(0));
+  // scores
+  Tensor expected_scores(allocator(), DT_FLOAT, TensorShape({2, 3}));
+  test::FillValues<float>(&expected_scores, {0.95, 0.9, 0, 0.95, 0.9, 0});
+  test::ExpectTensorEqual<float>(expected_scores, *GetOutput(1));
+  // classes
+  Tensor expected_classes(allocator(), DT_FLOAT, TensorShape({2, 3}));
+  test::FillValues<float>(&expected_classes, {0, 1, 0, 0, 1, 0});
+  test::ExpectTensorEqual<float>(expected_classes, *GetOutput(2));
+  // valid
+  Tensor expected_valid_d(allocator(), DT_INT32, TensorShape({2}));
+  test::FillValues<int>(&expected_valid_d, {2, 2});
+  test::ExpectTensorEqual<int>(expected_valid_d, *GetOutput(3));
+}
+
+TEST_F(CombinedNonMaxSuppressionOpTest,
+       TestSelectFromTwoBatchesTwoClassesWithScoreThresholdPaddedTotalSize) {
+  MakeOp(true);
+  AddInputFromArray<float>(
+      TensorShape({2, 6, 1, 4}),
+      {0, 0,    0.1, 0.1, 0, 0.01f, 0.1, 0.11f, 0, -0.01, 0.1, 0.09f,
+       0, 0.11, 0.1, 0.2, 0, 0.12f, 0.1, 0.21f, 0, 0.3,   1,   0.4,
+       0, 0,    0.2, 0.2, 0, 0.02f, 0.2, 0.22f, 0, -0.02, 0.2, 0.19f,
+       0, 0.21, 0.2, 0.3, 0, 0.22f, 0.2, 0.31f, 0, 0.4,   1,   0.5});
+  AddInputFromArray<float>(TensorShape({2, 6, 2}),
+                           {0.1f, 0.9f, 0.75f, 0.8f, 0.6f, 0.3f, 0.95f, 0.1f,
+                            0.5f, 0.5f, 0.3f,  0.1f, 0.1f, 0.9f, 0.75f, 0.8f,
+                            0.6f, 0.3f, 0.95f, 0.1f, 0.5f, 0.5f, 0.3f,  0.1f});
+  AddInputFromArray<int>(TensorShape({}), {10});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  AddInputFromArray<float>(TensorShape({}), {0.8f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // boxes
+  Tensor expected_boxes(allocator(), DT_FLOAT, TensorShape({2, 3, 4}));
+  test::FillValues<float>(&expected_boxes,
+                          {0, 0.11, 0.1, 0.2, 0, 0, 0.1, 0.1, 0, 0, 0, 0,
+                           0, 0.21, 0.2, 0.3, 0, 0, 0.2, 0.2, 0, 0, 0, 0});
+  test::ExpectTensorEqual<float>(expected_boxes, *GetOutput(0));
+  // scores
+  Tensor expected_scores(allocator(), DT_FLOAT, TensorShape({2, 3}));
+  test::FillValues<float>(&expected_scores, {0.95, 0.9, 0, 0.95, 0.9, 0});
+  test::ExpectTensorEqual<float>(expected_scores, *GetOutput(1));
+  // classes
+  Tensor expected_classes(allocator(), DT_FLOAT, TensorShape({2, 3}));
+  test::FillValues<float>(&expected_classes, {0, 1, 0, 0, 1, 0});
+  test::ExpectTensorEqual<float>(expected_classes, *GetOutput(2));
+  // valid
+  Tensor expected_valid_d(allocator(), DT_INT32, TensorShape({2}));
+  test::FillValues<int>(&expected_valid_d, {2, 2});
+  test::ExpectTensorEqual<int>(expected_valid_d, *GetOutput(3));
+}
+
+TEST_F(CombinedNonMaxSuppressionOpTest,
+       TestSelectFromTwoBatchesTwoClassesWithScoreThresholdPaddedPerClass) {
+  MakeOp(true);
+  AddInputFromArray<float>(
+      TensorShape({2, 6, 1, 4}),
+      {0, 0,    0.1, 0.1, 0, 0.01f, 0.1, 0.11f, 0, -0.01, 0.1, 0.09f,
+       0, 0.11, 0.1, 0.2, 0, 0.12f, 0.1, 0.21f, 0, 0.3,   1,   0.4,
+       0, 0,    0.2, 0.2, 0, 0.02f, 0.2, 0.22f, 0, -0.02, 0.2, 0.19f,
+       0, 0.21, 0.2, 0.3, 0, 0.22f, 0.2, 0.31f, 0, 0.4,   1,   0.5});
+  AddInputFromArray<float>(TensorShape({2, 6, 2}),
+                           {0.1f, 0.9f, 0.75f, 0.8f, 0.6f, 0.3f, 0.95f, 0.1f,
+                            0.5f, 0.5f, 0.3f,  0.1f, 0.1f, 0.9f, 0.75f, 0.8f,
+                            0.6f, 0.3f, 0.95f, 0.1f, 0.5f, 0.5f, 0.3f,  0.1f});
+  AddInputFromArray<int>(TensorShape({}), {2});
+  AddInputFromArray<int>(TensorShape({}), {50});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  AddInputFromArray<float>(TensorShape({}), {0.8f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // boxes
+  Tensor expected_boxes(allocator(), DT_FLOAT, TensorShape({2, 4, 4}));
+  test::FillValues<float>(
+      &expected_boxes,
+      {0, 0.11, 0.1, 0.2, 0, 0, 0.1, 0.1, 0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0.21, 0.2, 0.3, 0, 0, 0.2, 0.2, 0, 0, 0, 0, 0, 0, 0, 0});
+  test::ExpectTensorEqual<float>(expected_boxes, *GetOutput(0));
+  // scores
+  Tensor expected_scores(allocator(), DT_FLOAT, TensorShape({2, 4}));
+  test::FillValues<float>(&expected_scores, {0.95, 0.9, 0, 0, 0.95, 0.9, 0, 0});
+  test::ExpectTensorEqual<float>(expected_scores, *GetOutput(1));
+  // classes
+  Tensor expected_classes(allocator(), DT_FLOAT, TensorShape({2, 4}));
+  test::FillValues<float>(&expected_classes, {0, 1, 0, 0, 0, 1, 0, 0});
+  test::ExpectTensorEqual<float>(expected_classes, *GetOutput(2));
+  // valid
+  Tensor expected_valid_d(allocator(), DT_INT32, TensorShape({2}));
+  test::FillValues<int>(&expected_valid_d, {2, 2});
+  test::ExpectTensorEqual<int>(expected_valid_d, *GetOutput(3));
+}
+
+TEST_F(CombinedNonMaxSuppressionOpTest,
+       TestSelectFromTwoBatchesTwoClassesTotalSize) {
+  MakeOp();
+  AddInputFromArray<float>(
+      TensorShape({2, 6, 1, 4}),
+      {0, 0,    0.1, 0.1, 0, 0.01f, 0.1, 0.11f, 0, -0.01, 0.1, 0.09f,
+       0, 0.11, 0.1, 0.2, 0, 0.12f, 0.1, 0.21f, 0, 0.3,   1,   0.4,
+       0, 0,    0.2, 0.2, 0, 0.02f, 0.2, 0.22f, 0, -0.02, 0.2, 0.19f,
+       0, 0.21, 0.2, 0.3, 0, 0.22f, 0.2, 0.31f, 0, 0.4,   1,   0.5});
+  AddInputFromArray<float>(TensorShape({2, 6, 2}),
+                           {0.1f, 0.9f, 0.75f, 0.8f, 0.6f, 0.3f, 0.95f, 0.1f,
+                            0.5f, 0.5f, 0.3f,  0.1f, 0.1f, 0.9f, 0.75f, 0.8f,
+                            0.6f, 0.3f, 0.95f, 0.1f, 0.5f, 0.5f, 0.3f,  0.1f});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  // Total size per batch is more than size per class
+  AddInputFromArray<int>(TensorShape({}), {5});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  AddInputFromArray<float>(TensorShape({}), {0.1f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // boxes
+  Tensor expected_boxes(allocator(), DT_FLOAT, TensorShape({2, 5, 4}));
+  test::FillValues<float>(
+      &expected_boxes, {0,   0.11,  0.1, 0.2,   0,   0,     0.1, 0.1, 0, 0.01f,
+                        0.1, 0.11f, 0,   0.12f, 0.1, 0.21f, 0,   0.3, 1, 0.4,
+                        0,   0.21,  0.2, 0.3,   0,   0,     0.2, 0.2, 0, 0.02f,
+                        0.2, 0.22f, 0,   0.22f, 0.2, 0.31f, 0,   0.4, 1, 0.5});
+  test::ExpectTensorEqual<float>(expected_boxes, *GetOutput(0));
+  // scores
+  Tensor expected_scores(allocator(), DT_FLOAT, TensorShape({2, 5}));
+  test::FillValues<float>(
+      &expected_scores, {0.95, 0.9, 0.75, 0.5, 0.3, 0.95, 0.9, 0.75, 0.5, 0.3});
+  test::ExpectTensorEqual<float>(expected_scores, *GetOutput(1));
+  // classes
+  Tensor expected_classes(allocator(), DT_FLOAT, TensorShape({2, 5}));
+  test::FillValues<float>(&expected_classes, {0, 1, 0, 1, 0, 0, 1, 0, 1, 0});
+  test::ExpectTensorEqual<float>(expected_classes, *GetOutput(2));
+  // valid
+  Tensor expected_valid_d(allocator(), DT_INT32, TensorShape({2}));
+  test::FillValues<int>(&expected_valid_d, {5, 5});
+  test::ExpectTensorEqual<int>(expected_valid_d, *GetOutput(3));
+}
+
+TEST_F(CombinedNonMaxSuppressionOpTest,
+       TestSelectFromTwoBatchesTwoClassesForBoxesAndScores) {
+  MakeOp();
+  AddInputFromArray<float>(
+      TensorShape({2, 6, 2, 4}),
+      // batch 0, box1 of class 1 should get selected
+      {0, 0, 0.1, 0.1, 0, 0, 0.1, 0.1, 0, 0.01f, 0.1, 0.11f, 0, 0.6f, 0.1, 0.7f,
+       0, -0.01, 0.1, 0.09f, 0, -0.01, 0.1, 0.09f, 0, 0.11, 0.1, 0.2, 0, 0.11,
+       0.1, 0.2, 0, 0.12f, 0.1, 0.21f, 0, 0.12f, 0.1, 0.21f, 0, 0.3, 1, 0.4, 0,
+       0.3, 1, 0.4,
+       // batch 1, box1 of class 0 should get selected
+       0, 0, 0.2, 0.2, 0, 0, 0.2, 0.2, 0, 0.02f, 0.2, 0.22f, 0, 0.02f, 0.2,
+       0.22f, 0, -0.02, 0.2, 0.19f, 0, -0.02, 0.2, 0.19f, 0, 0.21, 0.2, 0.3, 0,
+       0.21, 0.2, 0.3, 0, 0.22f, 0.2, 0.31f, 0, 0.22f, 0.2, 0.31f, 0, 0.4, 1,
+       0.5, 0, 0.4, 1, 0.5});
+
+  AddInputFromArray<float>(TensorShape({2, 6, 2}),
+                           {0.1f, 0.9f, 0.75f, 0.8f, 0.6f, 0.3f, 0.95f, 0.1f,
+                            0.5f, 0.5f, 0.3f,  0.1f, 0.1f, 0.9f, 0.75f, 0.8f,
+                            0.6f, 0.3f, 0.95f, 0.1f, 0.5f, 0.5f, 0.3f,  0.1f});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<int>(TensorShape({}), {3});
+  AddInputFromArray<float>(TensorShape({}), {.5f});
+  AddInputFromArray<float>(TensorShape({}), {0.0f});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // boxes
+  Tensor expected_boxes(allocator(), DT_FLOAT, TensorShape({2, 3, 4}));
+  test::FillValues<float>(
+      &expected_boxes,
+      {0, 0.11, 0.1, 0.2, 0, 0, 0.1, 0.1, 0, 0.6f,  0.1, 0.7f,
+       0, 0.21, 0.2, 0.3, 0, 0, 0.2, 0.2, 0, 0.02f, 0.2, 0.22f});
+  test::ExpectTensorEqual<float>(expected_boxes, *GetOutput(0));
+  // scores
+  Tensor expected_scores(allocator(), DT_FLOAT, TensorShape({2, 3}));
+  test::FillValues<float>(&expected_scores, {0.95, 0.9, 0.8, 0.95, 0.9, 0.75});
+  test::ExpectTensorEqual<float>(expected_scores, *GetOutput(1));
+  // classes
+  Tensor expected_classes(allocator(), DT_FLOAT, TensorShape({2, 3}));
+  test::FillValues<float>(&expected_classes, {0, 1, 1, 0, 1, 0});
+  test::ExpectTensorEqual<float>(expected_classes, *GetOutput(2));
+  // valid
+  Tensor expected_valid_d(allocator(), DT_INT32, TensorShape({2}));
+  test::FillValues<int>(&expected_valid_d, {3, 3});
+  test::ExpectTensorEqual<int>(expected_valid_d, *GetOutput(3));
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/pack_op.cc b/tensorflow/core/kernels/pack_op.cc
index 5645275cfa98eb820b7d1e885b18894bfab17e49..18ed1ea26ac8a63c4716bfdc7197641be522ea7c 100644
--- a/tensorflow/core/kernels/pack_op.cc
+++ b/tensorflow/core/kernels/pack_op.cc
@@ -158,7 +158,8 @@ REGISTER_PACK(string);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
 TF_CALL_bfloat16(REGISTER_GPU);
 TF_CALL_int64(REGISTER_GPU);
-REGISTER_GPU(bool);
+TF_CALL_int16(REGISTER_GPU);
+TF_CALL_bool(REGISTER_GPU);
 #undef REGISTER_GPU
 
 // A special GPU kernel for int32.
diff --git a/tensorflow/core/kernels/pad_op.cc b/tensorflow/core/kernels/pad_op.cc
index 3b9133ed7e2c210aab3488d667f0c2e543207fcf..691430ebaff5a99ccb103c5f5a80263d15f24b6a 100644
--- a/tensorflow/core/kernels/pad_op.cc
+++ b/tensorflow/core/kernels/pad_op.cc
@@ -322,6 +322,7 @@ namespace functor {
 
 TF_CALL_GPU_ALL_TYPES(DECLARE_GPU_SPECS);
 TF_CALL_int8(DECLARE_GPU_SPECS);
+TF_CALL_uint8(DECLARE_GPU_SPECS);
 }  // namespace functor
 
 // Registration of the GPU implementations.
@@ -355,6 +356,7 @@ TF_CALL_int8(DECLARE_GPU_SPECS);
 
 TF_CALL_GPU_ALL_TYPES(REGISTER_GPU_KERNEL);
 TF_CALL_int8(REGISTER_GPU_KERNEL);
+TF_CALL_uint8(REGISTER_GPU_KERNEL);
 
 // A special GPU kernel for int32.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
diff --git a/tensorflow/core/kernels/pad_op_gpu.cu.cc b/tensorflow/core/kernels/pad_op_gpu.cu.cc
index 00ec44adc284099b3fed644d4742af8d07ae13e1..0cd8ef17ba2be995c719dccb5b3a104f9bd09f68 100644
--- a/tensorflow/core/kernels/pad_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/pad_op_gpu.cu.cc
@@ -41,6 +41,7 @@ typedef Eigen::GpuDevice GPUDevice;
 
 TF_CALL_GPU_ALL_TYPES(DEFINE_GPU_SPECS);
 TF_CALL_int8(DEFINE_GPU_SPECS);
+TF_CALL_uint8(DEFINE_GPU_SPECS);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/partitioned_function_ops.cc b/tensorflow/core/kernels/partitioned_function_ops.cc
index ba51db219ec5528d1dd98f744e70c5cd2cf6c6f8..4866efef9dbdd5fe097817f1906620a689fba120 100644
--- a/tensorflow/core/kernels/partitioned_function_ops.cc
+++ b/tensorflow/core/kernels/partitioned_function_ops.cc
@@ -12,34 +12,25 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "absl/strings/match.h"
 #include "tensorflow/core/common_runtime/function.h"
-#include "tensorflow/core/common_runtime/optimization_registry.h"
-#include "tensorflow/core/common_runtime/placer.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
 #include "tensorflow/core/framework/function.h"
-#include "tensorflow/core/framework/graph_to_functiondef.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/graph/graph.h"
-#include "tensorflow/core/graph/graph_constructor.h"
-#include "tensorflow/core/graph/graph_partition.h"
-#include "tensorflow/core/grappler/clusters/virtual_cluster.h"
-#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
-#include "tensorflow/core/grappler/utils/functions.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
 #include "tensorflow/core/util/ptr_util.h"
-#include "tensorflow/core/util/reffed_status_callback.h"
 
 #if GOOGLE_CUDA
 #include "tensorflow/stream_executor/stream.h"
 #endif  // GOOGLE_CUDA
 
 namespace tensorflow {
-typedef FunctionLibraryRuntime::Handle FHandle;
-
 namespace {
 // A `PartitionedCallOp` asynchronously executes a function, potentially across
 // multiple devices but within a single process. The kernel places and
@@ -77,7 +68,15 @@ class PartitionedCallOp : public AsyncOpKernel {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("executor_type", &executor_type_));
   }
 
-  ~PartitionedCallOp() override {}
+  ~PartitionedCallOp() override {
+    for (const auto& it : handles_) {
+      Status status = it.first->ReleaseHandle(it.second);
+      if (!status.ok()) {
+        LOG(INFO) << "Ignoring error while destructing PartitionedCallOp: "
+                  << status.ToString();
+      }
+    }
+  }
 
   void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
     FunctionLibraryRuntime* lib = ctx->function_library();
@@ -85,9 +84,6 @@ class PartitionedCallOp : public AsyncOpKernel {
                       errors::Internal("No function library is provided."),
                       done);
 
-    OpInputList args;
-    OP_REQUIRES_OK_ASYNC(ctx, ctx->input_list("args", &args), done);
-
     // The function body's graph is placed and partitioned the first time
     // `ComputeAsync` is invoked; every subsequent invocation calls each
     // of the function shards yielded by partitioning.
@@ -97,524 +93,173 @@ class PartitionedCallOp : public AsyncOpKernel {
     // Inputs and outputs are pinned to the local device, for simplicity.
     //
     // TODO(akshayka): Support re-sharding the function on subsequent calls,
-    // via, e.g., virtual device annotations and a list of device names supplied
-    // through an attribute.
+    // via, e.g., virtual device annotations and a list of device names
+    // supplied through an attribute.
     //
     // TODO(akshayka): Add a fastpath for functions that execute on a single
     // device.
+    FunctionLibraryRuntime::Handle handle;
+    // If we are instantiating the function, we can efficiently extract the
+    // inputs while instantiating. Else, we extract them separately below.
+    std::vector<Tensor> inputs;
+    bool inputs_extracted = false;
     {
       mutex_lock l(mu_);
-      if (function_handles_.find(lib) == function_handles_.end()) {
-        // TODO(b/37549631): Because this kernel may correspond to a stateful
-        // op, it may be shared by multiple subgraphs, which in turn may have
-        // different `FunctionLibraryRuntime` objects and therefore different
-        // `FHandle` namespaces. As such, we partition on a per-FLR basis.
-        FunctionLibraryRuntime::InstantiateOptions opts;
-        FHandle handle;
-        OP_REQUIRES_OK_ASYNC(
-            ctx,
-            lib->Instantiate(func_.name(), AttrSlice(&func_.attr()), opts,
-                             &handle),
-            done);
-        const FunctionBody* fbody = lib->GetFunctionBody(handle);
-        OP_REQUIRES_ASYNC(ctx, fbody != nullptr,
-                          errors::Internal("Could not find handle ", handle),
-                          done);
-        OP_REQUIRES_ASYNC(
-            ctx, args.size() == fbody->arg_nodes.size(),
-            errors::InvalidArgument(
-                "Wrong number of arguments to the op; function expects ",
-                fbody->arg_nodes.size(), " but PartitionedCall received ",
-                args.size()),
-            done);
-        // We need to pass global op_registry as default_registry when creating
-        // graph. So that graph optimization passes can lookup all possible ops
-        // by name.
-        auto graph = tensorflow::MakeUnique<Graph>(fbody->graph->flib_def());
-        FunctionLibraryDefinition global_flib(OpRegistry::Global(), {});
-        TF_CHECK_OK(graph->AddFunctionLibrary(global_flib.ToProto()));
-        CopyGraph(*fbody->graph, graph.get());
-        OP_REQUIRES_OK_ASYNC(ctx, PinResourceArgs(graph.get(), args), done);
-
-        DeviceSet device_set;
-        for (auto d : lib->device_mgr()->ListDevices()) {
-          device_set.AddDevice(d);
-        }
-
-        // The FunctionLibraryRuntime's library cannot be mutated from within
-        // an OpKernel, so functions are instantiated in an overlay library.
-        OP_REQUIRES_ASYNC(
-            ctx, overlay_libs_.find(lib) == overlay_libs_.end(),
-            errors::Internal("Found an overlay library but did not "
-                             "find cached function partitions; "
-                             "this indicates a bug."),
-            done);
-        // We do not need a full function library in the overlay, we just keep a
-        // subset that is reachable from the instantiated function.
-        FunctionLibraryDefinition* overlay_lib = new FunctionLibraryDefinition(
-            grappler::ReachableFunctionLibraryDefinition(
-                *lib->GetFunctionLibraryDefinition(), fbody->fdef));
-        overlay_libs_.emplace(lib, overlay_lib);
-
-        GraphOptimizationPassOptions optimization_options;
-        // TODO(akshayka): Thread SessionOptions (if any) into this kernel, or
-        // make it possible to specify the relevant options via attributes.
-        SessionOptions session_options;
-        session_options.env = ctx->env();
-        optimization_options.session_options = &session_options;
-        optimization_options.graph = &graph;
-        optimization_options.flib_def = overlay_lib;
-        optimization_options.device_set = &device_set;
-        OP_REQUIRES_OK_ASYNC(
-            ctx,
-            OptimizationPassRegistry::Global()->RunGrouping(
-                OptimizationPassRegistry::PRE_PLACEMENT, optimization_options),
-            done);
-
-        // Make the FunctionLibraryRuntime's device the default device if
-        // nothing else is hard coded. This allows the same function definition
-        // to be specialized to different devices depending on the
-        // PartitionedCallOp's device.
-        Placer placer(graph.get(), &device_set,
-                      nullptr, /* No session options */
-                      lib->device() /* Default device */);
-        OP_REQUIRES_OK_ASYNC(ctx, placer.Run(), done);
-        OP_REQUIRES_OK_ASYNC(
-            ctx,
-            OptimizationPassRegistry::Global()->RunGrouping(
-                OptimizationPassRegistry::POST_PLACEMENT, optimization_options),
-            done);
-
-        Device* cpu_device;
-        OP_REQUIRES_OK_ASYNC(
-            ctx, lib->device_mgr()->LookupDevice("CPU:0", &cpu_device), done);
-
-        // Run grappler passes on the graph. It is possible that these are
-        // optimized by the graph executor already.
-        OP_REQUIRES_OK_ASYNC(ctx,
-                             OptimizeGraph(ctx, fbody->ret_nodes, overlay_lib,
-                                           device_set, cpu_device, &graph),
-                             done);
-
-        OP_REQUIRES_OK_ASYNC(
-            ctx,
-            OptimizationPassRegistry::Global()->RunGrouping(
-                OptimizationPassRegistry::POST_REWRITE_FOR_EXEC,
-                optimization_options),
-            done);
-
-        std::unordered_map<string, std::unique_ptr<Graph>> subgraphs;
-        OP_REQUIRES_OK_ASYNC(
-            ctx, PartitionHelper(device_set, std::move(graph), &subgraphs),
-            done);
-        if (ctx->graph_collector() != nullptr) {
-          for (const auto& pair : subgraphs) {
-            GraphDef def;
-            pair.second->ToGraphDef(&def);
-            ctx->graph_collector()->CollectGraph(def);
-          }
-        }
-        optimization_options.graph = nullptr;
-        optimization_options.device_set = nullptr;
-        optimization_options.partition_graphs = &subgraphs;
-        OP_REQUIRES_OK_ASYNC(ctx,
-                             OptimizationPassRegistry::Global()->RunGrouping(
-                                 OptimizationPassRegistry::POST_PARTITIONING,
-                                 optimization_options),
+      auto it = handles_.find(lib);
+      if (it == handles_.end()) {
+        OP_REQUIRES_OK_ASYNC(ctx, Instantiate(lib, ctx, &inputs, &handle),
                              done);
+        inputs_extracted = true;
+        handles_[lib] = handle;
+      } else {
+        handle = it->second;
+      }
+    }
 
-        auto handles = tensorflow::MakeUnique<gtl::FlatMap<string, FHandle>>();
-        for (const auto& pair : subgraphs) {
-          // TODO(akshayka): Fail gracefully if the set of devices corresponds
-          // to more than one address space.
-          const string& target = pair.first;
-          const auto& subgraph = pair.second;
-          OP_REQUIRES_OK_ASYNC(
-              ctx, UpdateArgAndRetMetadata(target, subgraph.get()), done);
-          FunctionDef shard;
-          string unique_name = UniquifyFunctionName(overlay_lib, func_.name());
-          OP_REQUIRES_OK_ASYNC(
-              ctx, GraphToFunctionDef(*subgraph, unique_name, &shard), done);
-          OP_REQUIRES_OK_ASYNC(ctx, overlay_lib->AddFunctionDef(shard), done);
-          FunctionLibraryRuntime::InstantiateOptions opts;
-          opts.executor_type = executor_type_;
-          opts.target = target;
-          opts.overlay_lib = overlay_lib;
-          FHandle handle;
-          OP_REQUIRES_OK_ASYNC(
-              ctx,
-              lib->Instantiate(unique_name, AttrSlice(&shard.attr()), opts,
-                               &handle),
-              done);
-          handles->emplace(target, handle);
-        }
-
-        function_handles_.emplace(lib, std::move(handles));
+    if (!inputs_extracted) {
+      OpInputList args;
+      OP_REQUIRES_OK_ASYNC(ctx, ctx->input_list("args", &args), done);
+      inputs.reserve(args.size());
+      for (const Tensor& tensor : args) {
+        inputs.push_back(tensor);
       }
     }
-    ExecuteFunctions(lib, ctx, args, std::move(done));
+
+    RunFunction(handle, inputs, lib, ctx, done);
   }
 
  private:
-  typedef std::pair<string, FHandle> DeviceAndFHandle;
-  typedef std::pair<std::vector<int>, std::vector<int>> ArgAndRetIndices;
-  typedef std::pair<std::vector<AllocatorAttributes>,
-                    std::vector<AllocatorAttributes>>
-      ArgAndRetAllocAttrs;
-
-  // Pins each arg that emits a `DT_RESOURCE` tensor to the device on which the
-  // corresponding resource lives. This ensures that the Placer assigns ops that
-  // access these resources to the appropriate devices.
-  Status PinResourceArgs(Graph* graph, const OpInputList& args) {
-    for (Node* node : graph->op_nodes()) {
-      string node_type = node->type_string();
-      if (node_type == FunctionLibraryDefinition::kArgOp) {
-        const AttrValue* attr_value;
-        TF_RETURN_IF_ERROR(node->attrs().Find("index", &attr_value));
-        int index = attr_value->i();
-        TF_RETURN_IF_ERROR(node->attrs().Find("T", &attr_value));
-        DataType dtype = attr_value->type();
-        if (dtype != args[index].dtype()) {
-          return errors::InvalidArgument("For argument ", index, " expected ",
-                                         DataTypeString(dtype), " tensor, got ",
-                                         DataTypeString(args[index].dtype()),
-                                         " instead.");
-        }
-        if (dtype == DT_RESOURCE) {
-          const ResourceHandle& handle = args[index].flat<ResourceHandle>()(0);
-          node->set_assigned_device_name(handle.device());
-        }
-      }
+  Status FillOutputDevices(const FunctionLibraryRuntime& lib,
+                           const Device& cpu_device, AttrSlice attrs,
+                           FunctionLibraryRuntime::InstantiateOptions* opts) {
+    const FunctionLibraryDefinition* flib = lib.GetFunctionLibraryDefinition();
+    const FunctionDef* fdef = flib->Find(func_.name());
+    if (fdef == nullptr) {
+      return errors::NotFound("Failed for find definiton for function \"",
+                              func_.name(), "\"");
     }
-    return Status::OK();
-  }
 
-  // Partitions `graph` and populates `subgraphs` with the partitions.
-  Status PartitionHelper(
-      const DeviceSet& device_set, std::unique_ptr<Graph> graph,
-      std::unordered_map<string, std::unique_ptr<Graph>>* subgraphs) {
-    PartitionOptions partition_options;
-    partition_options.node_to_loc = [](const Node* node) {
-      // TODO(akshayka): To better support the distributed case, first split
-      // the graph by worker (e.g,. using the master session's
-      // `SplitByWorker` policy), and then recursively partition the
-      // per-worker shards at the remote worker(s).
-      return node->assigned_device_name();
-    };
-    int64 edge_name_counter = 0;
-    partition_options.new_name = [&edge_name_counter](const string& prefix) {
-      return strings::StrCat(prefix, "/_", ++edge_name_counter);
-    };
-    partition_options.get_incarnation =
-        [&device_set](const string& name) -> int64 {
-      const Device* d = device_set.FindDeviceByName(name);
-      if (d == nullptr) {
-        return PartitionOptions::kIllegalIncarnation;
-      } else {
-        return d->attributes().incarnation();
+    bool is_type_list;
+    for (const OpDef::ArgDef& ret_def : fdef->signature().output_arg()) {
+      DataTypeVector dtypes;
+      TF_RETURN_IF_ERROR(ArgNumType(attrs, ret_def, &is_type_list, &dtypes));
+      for (DataType dtype : dtypes) {
+        if (MTypeFromDType(dtype) == HOST_MEMORY) {
+          opts->output_devices.push_back(cpu_device.name());
+        } else {
+          opts->output_devices.push_back(opts->target);
+        }
       }
-    };
-    partition_options.control_flow_added = false;
-    std::unordered_map<string, GraphDef> partitions;
-    TF_RETURN_IF_ERROR(Partition(partition_options, graph.get(), &partitions));
-
-    VLOG(3) << "Partitioned function '" << func_.name() << "', yielding "
-            << partitions.size() << " shards.";
-
-    for (const auto& partition : partitions) {
-      std::unique_ptr<Graph> subgraph(new Graph(graph->flib_def()));
-      FunctionLibraryDefinition global_flib(OpRegistry::Global(), {});
-      TF_CHECK_OK(subgraph->AddFunctionLibrary(global_flib.ToProto()));
-      GraphConstructorOptions opts;
-      opts.allow_internal_ops = true;
-      opts.expect_device_spec = true;
-      const string& device = partition.first;
-      const GraphDef& graph_def = partition.second;
-      TF_RETURN_IF_ERROR(
-          ConvertGraphDefToGraph(opts, graph_def, subgraph.get()));
-      subgraphs->emplace(device, std::move(subgraph));
     }
-
     return Status::OK();
   }
 
-  // Each subgraph produced by partitioning the function body contains a subset
-  // of the original `Arg` and `Retval` nodes. This function performs
-  // bookkeeping to track which `Arg` and `Retval` nodes were placed on a
-  // particular device / subgraph.
-  //
-  // More specifically, this function
-  //  (1) rewrites the indices of the `Arg` and `Retval` nodes placed on a
-  //      particular device,
-  //  (2) records the subsets of `Arg` and `Retval` nodes assigned to the
-  //      device, and
-  //  (3) records which `Arg` and `Retval` nodes live in host memory.
-  Status UpdateArgAndRetMetadata(const string& device, Graph* subgraph) {
-    ArgAndRetIndices indices;
-    std::vector<int>* arg_indices = &indices.first;
-    std::vector<int>* ret_indices = &indices.second;
-    std::vector<std::pair<Node*, int>> arg_nodes;
-    std::vector<std::pair<Node*, int>> ret_nodes;
-    const AttrValue* attr_value;
+  Status Instantiate(FunctionLibraryRuntime* lib, OpKernelContext* ctx,
+                     std::vector<Tensor>* inputs,
+                     FunctionLibraryRuntime::Handle* handle) {
+    grappler::GrapplerItem::OptimizationOptions optimization_options;
+
+    // Tensorflow 2.0 in eager mode with automatic control dependencies will
+    // prune all nodes that are not in the transitive fanin of the fetch nodes.
+    // However because the function will be executed via FunctionLibraryRuntime,
+    // and current function implementation does not prune stateful and dataset
+    // ops, we rely on Grappler to do the correct graph pruning.
+    optimization_options.allow_pruning_stateful_and_dataset_ops = true;
+
+    // All the nested function calls will be executed and optimized via
+    // PartitionedCallOp, there is no need to optimize functions now.
+    optimization_options.optimize_function_library = false;
+
+    FunctionLibraryRuntime::InstantiateOptions opts;
+    opts.target = lib->device()->name();
+    opts.is_multi_device_function = true;
+    opts.optimize_graph_fn =
+        std::bind(grappler::OptimizeGraph, std::placeholders::_1,
+                  std::placeholders::_2, std::placeholders::_3,
+                  std::placeholders::_4, std::placeholders::_5, config_proto_,
+                  func_.name(), optimization_options, std::placeholders::_6);
+    opts.graph_collector = ctx->graph_collector();
+    opts.executor_type = executor_type_;
 
-    // Find the Arg and Retval nodes, along with their corresponding indices
-    // in the original function.
-    for (Node* node : subgraph->op_nodes()) {
-      string node_type = node->type_string();
-      if (node_type == FunctionLibraryDefinition::kArgOp) {
-        TF_RETURN_IF_ERROR(node->attrs().Find("index", &attr_value));
-        int index = attr_value->i();
-        arg_indices->push_back(index);
-        arg_nodes.push_back(std::make_pair(node, index));
-      } else if (node_type == FunctionLibraryDefinition::kRetOp) {
-        TF_RETURN_IF_ERROR(node->attrs().Find("index", &attr_value));
-        int index = attr_value->i();
-        ret_indices->push_back(index);
-        ret_nodes.push_back(std::make_pair(node, index));
+    OpInputList args;
+    TF_RETURN_IF_ERROR(ctx->input_list("args", &args));
+    Device* cpu_device;
+    TF_RETURN_IF_ERROR(lib->device_mgr()->LookupDevice("CPU:0", &cpu_device));
+
+    inputs->reserve(args.size());
+    for (const Tensor& tensor : args) {
+      inputs->push_back(tensor);
+      DataType dtype = tensor.dtype();
+      if (dtype == DT_RESOURCE) {
+        const ResourceHandle& handle = tensor.flat<ResourceHandle>()(0);
+        opts.input_devices.push_back(handle.device());
+      } else if (MTypeFromDType(dtype) == HOST_MEMORY) {
+        opts.input_devices.push_back(cpu_device->name());
+      } else {
+        opts.input_devices.push_back(opts.target);
       }
     }
 
-    for (int i = 0; i < arg_nodes.size(); ++i) {
-      Node* arg = arg_nodes[i].first;
-      arg->AddAttr("index", i);
-      TF_RETURN_IF_ERROR(arg->attrs().Find("T", &attr_value));
-      AllocatorAttributes alloc_attr;
-      DataType type = attr_value->type();
-      if (MTypeFromDType(type) == HOST_MEMORY) {
-        alloc_attr.set_on_host(true);
-      }
-      arg_and_ret_alloc_attrs_[device].first.push_back(alloc_attr);
-    }
-    for (int i = 0; i < ret_nodes.size(); ++i) {
-      Node* ret = ret_nodes[i].first;
-      ret->AddAttr("index", i);
-      TF_RETURN_IF_ERROR(ret->attrs().Find("T", &attr_value));
-      AllocatorAttributes alloc_attr;
-      DataType type = attr_value->type();
-      if (MTypeFromDType(type) == HOST_MEMORY) {
-        alloc_attr.set_on_host(true);
-      }
-      arg_and_ret_alloc_attrs_[device].second.push_back(alloc_attr);
-    }
+    TF_RETURN_IF_ERROR(
+        FillOutputDevices(*lib, *cpu_device, AttrSlice(&func_.attr()), &opts));
 
-    // If this kernel execution corresponds to a StatefulPartitionedCallOp,
-    // `arg_and_ret_indices_` might have been populated by a previous
-    // invocation.
-    if (arg_and_ret_indices_.find(device) == arg_and_ret_indices_.end()) {
-      arg_and_ret_indices_.emplace(device, indices);
-    }
+    TF_RETURN_IF_ERROR(
+        lib->Instantiate(func_.name(), AttrSlice(&func_.attr()), opts, handle));
     return Status::OK();
   }
 
-  std::vector<Tensor> GetArgsForIndices(const std::vector<int>& indices,
-                                        const OpInputList& arguments) {
-    std::vector<Tensor> args;
-    args.reserve(indices.size());
-    for (int i : indices) {
-      args.push_back(arguments[i]);
-    }
-    return args;
-  }
-
-  void ExecuteFunctions(FunctionLibraryRuntime* lib, OpKernelContext* ctx,
-                        const OpInputList& op_args, DoneCallback done)
-      LOCKS_EXCLUDED(mu_) {
-    const gtl::FlatMap<string, FHandle>* handles;
-    {
-      mutex_lock l(mu_);
-      handles = function_handles_[lib].get();
-    }
-    if (handles->empty()) {
-      // Trivial case where the function body is empty.
-      ctx->SetStatus(Status::OK());
-      done();
-      return;
-    }
-
-    const string& local_device_name = lib->device()->name();
-    FunctionLibraryRuntime::Options opts;
-    opts.step_id = ctx->step_id();
-    opts.step_container = ctx->step_container();
-    opts.cancellation_manager = ctx->cancellation_manager();
-    opts.stats_collector = ctx->stats_collector();
-    // TODO(akshayka): Consider selecting a runner on a per-device basis, i.e.,
-    // using device-specific threadpools when available.
-    opts.runner = ctx->runner();
-    opts.source_device = local_device_name;
-    opts.allow_dead_tensors = true;
+  void RunFunction(FunctionLibraryRuntime::Handle handle,
+                   const std::vector<Tensor>& inputs,
+                   FunctionLibraryRuntime* lib, OpKernelContext* ctx,
+                   DoneCallback done) {
+    FunctionLibraryRuntime::Options run_opts;
+    run_opts.step_id = ctx->step_id();
+    run_opts.step_container = ctx->step_container();
+    run_opts.cancellation_manager = ctx->cancellation_manager();
+    run_opts.stats_collector = ctx->stats_collector();
+    run_opts.collective_executor = ctx->collective_executor();
+    // TODO(akshayka): Consider selecting a runner on a per-device basis,
+    // i.e., using device-specific threadpools when available.
+    run_opts.runner = ctx->runner();
+    run_opts.source_device = lib->device()->name();
+    run_opts.allow_dead_tensors = true;
     // TODO(akshayka): Accommodate the multiple-worker scenario by adding the
     // constructed rendezvous to a rendezvous manager.
     Rendezvous* rendez = new IntraProcessRendezvous(lib->device_mgr());
-    opts.rendezvous = rendez;
-
-    StatusCallback callback = std::bind(
-        [](Rendezvous* rendez, DoneCallback& done, const Status& status) {
-          rendez->Unref();
-          done();
-        },
-        rendez, std::move(done), std::placeholders::_1);
-    auto* refcounted_done = new ReffedStatusCallback(std::move(callback));
-    for (int i = 0; i < handles->size(); ++i) {
-      refcounted_done->Ref();
-    }
-
-    for (const auto& pair : *handles) {
-      const string& target = pair.first;
-      FHandle handle = pair.second;
-      VLOG(3) << "Running function shard on device " << target;
-      ArgAndRetIndices indices = arg_and_ret_indices_[target];
-      ArgAndRetAllocAttrs alloc_attrs = arg_and_ret_alloc_attrs_[target];
-      const std::vector<int>& arg_indices = indices.first;
-      const std::vector<int>& ret_indices = indices.second;
-      opts.args_alloc_attrs = alloc_attrs.first;
-      opts.rets_alloc_attrs = alloc_attrs.second;
-      if (target == local_device_name) {
-        opts.remote_execution = false;
-        std::vector<Tensor> args = GetArgsForIndices(arg_indices, op_args);
-        std::vector<Tensor>* rets = new std::vector<Tensor>;
-        lib->Run(
-            opts, handle, args, rets,
-            [rets, ret_indices, refcounted_done, ctx](const Status& status) {
-              if (!status.ok()) {
-                VLOG(3) << "Local execution failed: " << status;
-                ctx->SetStatus(status);
-              } else {
-                for (int i = 0; i < rets->size(); ++i) {
-                  ctx->set_output(ret_indices[i], (*rets)[i]);
-                }
-              }
-              delete rets;
-              VLOG(3) << "Finished local execution.";
-              refcounted_done->Unref();
-            });
-      } else {
-        opts.remote_execution = true;
-        std::vector<Tensor> args = GetArgsForIndices(arg_indices, op_args);
-        std::vector<Tensor>* rets = new std::vector<Tensor>;
-        lib->Run(
-            opts, handle, args, rets,
-            [rets, ret_indices, refcounted_done, ctx](const Status& status) {
-              if (!status.ok()) {
-                VLOG(3) << "Remote execution failed: " << status;
-                ctx->SetStatus(status);
-              } else {
-                for (int i = 0; i < rets->size(); ++i) {
-                  ctx->set_output(ret_indices[i], (*rets)[i]);
-                }
-              }
-              delete rets;
-              VLOG(3) << "Finished remote execution.";
-              refcounted_done->Unref();
-            });
-      }
-    }
-    refcounted_done->Unref();
-  }
-
-  string UniquifyFunctionName(const FunctionLibraryDefinition* function_library,
-                              const string& name) {
-    for (;; ++suffix_) {
-      const string candidate = strings::StrCat(name, "_", suffix_);
-      if (function_library->Find(candidate) == nullptr) {
-        return candidate;
-      }
-    }
-  }
-
-  Status OptimizeGraph(OpKernelContext* ctx,
-                       const gtl::InlinedVector<Node*, 4>& ret_nodes,
-                       FunctionLibraryDefinition* flib,
-                       const DeviceSet& device_set, Device* cpu_device,
-                       std::unique_ptr<Graph>* graph) {
-    if (!tensorflow::grappler::MetaOptimizerEnabled(config_proto_)) {
-      return Status::OK();
-    }
-
-    tensorflow::grappler::GrapplerItem item;
-
-    // Add all available devices so that inlined function can be placed.
-    for (const Device* d : device_set.devices()) {
-      Status added_device = item.AddDevice(d->name());
-      if (!added_device.ok()) VLOG(3) << added_device.error_message();
-    }
-
-    // Add fetches so that the graph can be pruned.
-    for (Node* node : ret_nodes) {
-      item.fetch.push_back(node->name());
-    }
-
-    (*graph)->ToGraphDef(&item.graph);
-
-    if (flib) {
-      *item.graph.mutable_library() = flib->ToProto();
-    }
-
-    tensorflow::GraphDef out_graph;
-
-    tensorflow::grappler::VirtualCluster cluster(&device_set);
-
-    // TODO(nareshmodi): Consider adding and using the more generic GraphOptions
-    // proto (which also contain the OptimizerOptions).
-    TF_RETURN_IF_ERROR(tensorflow::grappler::RunMetaOptimizer(
-        item, config_proto_, cpu_device, &cluster, &out_graph));
-
-    std::unique_ptr<Graph> optimized_graph(new Graph(OpRegistry::Global()));
-    TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(
-        GraphConstructorOptions(), out_graph, optimized_graph.get()));
-
-    // Copy optimized functions back to the overlay lib.
-    if (flib) {
-      for (const FunctionDef& fdef : out_graph.library().function()) {
-        const string& func_name = fdef.signature().name();
-        if (flib->Contains(func_name)) {
-          TF_RETURN_IF_ERROR(flib->ReplaceFunction(func_name, fdef));
-        } else {
-          TF_RETURN_IF_ERROR(flib->AddFunctionDef(fdef));
-        }
-      }
-    }
-
-    *graph = std::move(optimized_graph);
-
-    // The graph conversion sets the requested device names but not the
-    // assigned device names. However, since at this point the graph is
-    // placed TF expects an assigned device name for every node. Therefore
-    // we copy the requested device into the assigned device field.
-    for (Node* node : graph->get()->nodes()) {
-      node->set_assigned_device_name(node->requested_device());
-    }
-
-    return Status::OK();
+    run_opts.rendezvous = rendez;
+
+    std::vector<Tensor>* rets = new std::vector<Tensor>;
+    const string& func_name = func_.name();
+    lib->Run(run_opts, handle, inputs, rets,
+             [rets, rendez, done, ctx, func_name](const Status& status) {
+               if (!status.ok()) {
+                 const string function_and_msg =
+                     strings::StrCat(errors::FormatFunctionForError(func_name),
+                                     " ", status.error_message());
+                 ctx->SetStatus(Status(status.code(), function_and_msg));
+               } else {
+                 for (int i = 0; i < rets->size(); ++i) {
+                   ctx->set_output(i, (*rets)[i]);
+                 }
+               }
+               delete rets;
+               rendez->Unref();
+               done();
+             });
   }
 
   NameAttrList func_;
   ConfigProto config_proto_;
   string executor_type_;
-  // Contains maps from device names to handles of function partitions, keyed by
-  // FunctionLibraryRuntime pointers. (Because this kernel may be instantiated
-  // for a stateful op, different invocations of it may use different
-  // FLRs. Different device placements of PartitionedCallOp also use different
-  // FLRs, and we use this to set the "default" device for the function to
-  // PartitionedCallOp's device.)
-  gtl::FlatMap<FunctionLibraryRuntime*,
-               std::unique_ptr<gtl::FlatMap<string, FHandle>>>
-      function_handles_ GUARDED_BY(mu_);
-  // Function partitions are added to overlay libraries.
-  gtl::FlatMap<FunctionLibraryRuntime*,
-               std::unique_ptr<FunctionLibraryDefinition>>
-      overlay_libs_ GUARDED_BY(mu_);
-  // Map from device name to the indices of the arguments and return values
-  // placed on that device. Read-only after the first invocation.
-  gtl::FlatMap<string, ArgAndRetIndices> arg_and_ret_indices_;
-  // Map from device name to alloc attrs for arguments and return values of the
-  // function placed on that device. Read-only after the first invocation.
-  gtl::FlatMap<string, ArgAndRetAllocAttrs> arg_and_ret_alloc_attrs_;
-
   mutex mu_;
-
-  // Used to uniquify function names in `overlay_libs_`.
-  uint32 suffix_ = 0;
+  // Cache the handle per FLR because this kernel may be instantiated for
+  // a stateful op, different invocations of it may use different FLRs.
+  // Different device placements of PartitionedCallOp also use
+  // different FLRs.
+  gtl::FlatMap<FunctionLibraryRuntime*, FunctionLibraryRuntime::Handle> handles_
+      GUARDED_BY(mu_);
 };
+
 REGISTER_KERNEL_BUILDER(Name("PartitionedCall").Device(DEVICE_CPU),
                         PartitionedCallOp);
 REGISTER_KERNEL_BUILDER(Name("StatefulPartitionedCall").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/pooling_ops_common.cc b/tensorflow/core/kernels/pooling_ops_common.cc
index e583f7feb4df9605115cd16aec54d1f3e9bb8b9c..903cf9313a22cdc6937cdae53afb7063101400f8 100644
--- a/tensorflow/core/kernels/pooling_ops_common.cc
+++ b/tensorflow/core/kernels/pooling_ops_common.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 
 #if GOOGLE_CUDA
+#include "cuda/include/cudnn.h"
 #include "tensorflow/core/kernels/conv_2d.h"
 #include "tensorflow/core/kernels/pooling_ops_common_gpu.h"
 #include "tensorflow/core/platform/stream_executor.h"
@@ -28,6 +29,20 @@ limitations under the License.
 
 namespace tensorflow {
 
+namespace {
+
+template <typename T>
+struct RawType {
+  using type = T;
+};
+
+template <>
+struct RawType<qint8> {
+  using type = int8;
+};
+
+}  // namespace
+
 PoolParameters::PoolParameters(OpKernelContext* context,
                                const std::vector<int32>& ksize,
                                const std::vector<int32>& stride,
@@ -156,7 +171,10 @@ void DnnPoolingOp<T>::Compute(OpKernelContext* context,
     return;
   }
 
-  /// For now, cudnn does not support NHWC format, so we need to convert it
+  int batch_size = params.tensor_in_batch;
+  int depth = params.depth;
+#if CUDNN_VERSION < 7300
+  /// Earlier versions do not support NHWC format, so we need to convert it
   /// to NCHW before calling cudnn. We need to get rid of this once it is done
   Tensor transformed_input;
   if (data_format == FORMAT_NHWC) {
@@ -181,7 +199,31 @@ void DnnPoolingOp<T>::Compute(OpKernelContext* context,
   } else {
     transformed_output = *tensor_out;
   }
-
+  se::dnn::DataLayout data_layout = se::dnn::DataLayout::kBatchDepthYX;
+#else
+  auto& transformed_input = tensor_in;
+  auto& transformed_output = *tensor_out;
+  se::dnn::DataLayout data_layout;
+  switch (data_format) {
+    case FORMAT_NHWC:
+      data_layout = se::dnn::DataLayout::kBatchYXDepth;
+      break;
+    case FORMAT_NCHW:
+      data_layout = se::dnn::DataLayout::kBatchDepthYX;
+      break;
+    case FORMAT_NCHW_VECT_C:
+      // NCHW_VECT_C is not supported by cudnnPoolingForward(), but can be
+      // emulated via NHWC.
+      data_layout = se::dnn::DataLayout::kBatchYXDepth;
+      batch_size *= depth / 4;
+      depth = 4;
+      break;
+    default:
+      OP_REQUIRES(context, false,
+                  errors::InvalidArgument("Unsupported format: ",
+                                          ToString(data_format)));
+  }
+#endif
   /// Get ready to call cudnn
   se::dnn::PoolingDescriptor pooling_desc;
   pooling_desc.set_pooling_mode(pooling_mode)
@@ -194,23 +236,27 @@ void DnnPoolingOp<T>::Compute(OpKernelContext* context,
       .set_propagate_nans(propagate_nans);
 
   se::dnn::BatchDescriptor input_desc;
-  input_desc.set_count(params.tensor_in_batch)
+  input_desc.set_count(batch_size)
       .set_height(params.tensor_in_rows)
       .set_width(params.tensor_in_cols)
-      .set_feature_map_count(params.depth)
-      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+      .set_feature_map_count(depth)
+      .set_layout(data_layout);
 
   se::dnn::BatchDescriptor output_desc;
-  output_desc.set_count(params.tensor_in_batch)
+  output_desc.set_count(batch_size)
       .set_height(params.out_height)
       .set_width(params.out_width)
-      .set_feature_map_count(params.depth)
-      .set_layout(se::dnn::DataLayout::kBatchDepthYX);
+      .set_feature_map_count(depth)
+      .set_layout(data_layout);
+
+  auto input_data =
+      AsDeviceMemory(reinterpret_cast<const typename RawType<T>::type*>(
+                         transformed_input.template flat<T>().data()),
+                     transformed_input.template flat<T>().size());
 
-  auto input_data = AsDeviceMemory(transformed_input.template flat<T>().data(),
-                                   transformed_input.template flat<T>().size());
   auto output_data =
-      AsDeviceMemory(transformed_output.template flat<T>().data(),
+      AsDeviceMemory(reinterpret_cast<const typename RawType<T>::type*>(
+                         transformed_output.template flat<T>().data()),
                      transformed_output.template flat<T>().size());
 
   auto* stream = context->op_device_context()->stream();
@@ -222,15 +268,17 @@ void DnnPoolingOp<T>::Compute(OpKernelContext* context,
                     .ok();
   OP_REQUIRES(context, status,
               errors::Internal("cudnn PoolForward launch failed"));
-
+#if CUDNN_VERSION < 7300
   if (data_format == FORMAT_NHWC) {
     /// Transform the output data from NCHW back to NHWC
     auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
-    functor::NCHWToNHWC<GPUDevice, T, 4>()(
+    using RT = typename RawType<T>::type;
+    functor::NCHWToNHWC<GPUDevice, RT, 4>()(
         context->eigen_device<Device>(),
-        toConstTensor(transformed_output).template tensor<T, 4>(),
-        tensor_out->tensor<T, 4>());
+        toConstTensor(transformed_output).template tensor<RT, 4>(),
+        tensor_out->tensor<RT, 4>());
   }
+#endif
 }
 
 template <typename T>
@@ -388,6 +436,11 @@ void DnnPoolingGradOp<T>::Compute(
   template class DnnPoolingOp<T>; \
   template class DnnPoolingGradOp<T>;
 TF_CALL_GPU_NUMBER_TYPES(DEFINE_DNN_OPS)
+
+#if CUDNN_VERSION >= 7300
+template class DnnPoolingOp<qint8>;
+#endif
+
 #undef DEFINE_DNN_OPS
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/priority_queue.h b/tensorflow/core/kernels/priority_queue.h
index 8e69b5b699065a8722f4e19acaf8b57a7e0b64ed..a719c518c3e9206020602e315d0b0e3be474bfd0 100644
--- a/tensorflow/core/kernels/priority_queue.h
+++ b/tensorflow/core/kernels/priority_queue.h
@@ -68,7 +68,7 @@ class PriorityQueue
   Status MatchesPriorityNodeDefTypes(const NodeDef& node_def) const;
   Status MatchesPriorityNodeDefShapes(const NodeDef& node_def) const;
 
-  int32 size() override {
+  int32 size() const override {
     mutex_lock lock(mu_);
     return queues_[0].size();
   }
diff --git a/tensorflow/core/kernels/quantized_concat_op.cc b/tensorflow/core/kernels/quantized_concat_op.cc
index b03ac8e87dac8fabe0d45d8685ec4fa5fd642519..ff4e7be1622af8bfd2e19aaff5e1ff3677875f3c 100644
--- a/tensorflow/core/kernels/quantized_concat_op.cc
+++ b/tensorflow/core/kernels/quantized_concat_op.cc
@@ -246,4 +246,16 @@ REGISTER_QUANTIZED_CONCAT(qint32);
 
 #undef REGISTER_QUANTIZED_CONCAT
 
+#ifdef INTEL_MKL
+#define REGISTER_QUANTIZED_CONCATV2(type)                \
+  REGISTER_KERNEL_BUILDER(Name("QuantizedConcatV2")      \
+                              .Device(DEVICE_CPU)        \
+                              .TypeConstraint<type>("T") \
+                              .HostMemory("axis"),       \
+                          QuantizedConcatOp<type>)
+
+REGISTER_QUANTIZED_CONCATV2(quint8);
+REGISTER_QUANTIZED_CONCATV2(qint32);
+#endif
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/random_op_gpu.cu.cc b/tensorflow/core/kernels/random_op_gpu.cu.cc
index edb2b10e3d69b6ac93c13b875d00fa9de7ed5362..a5603885236245de7d8cc3e49c6d836ba6c2af54 100644
--- a/tensorflow/core/kernels/random_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/random_op_gpu.cu.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #define EIGEN_USE_GPU
 
 #include "tensorflow/core/kernels/random_op.h"
+#include "tensorflow/core/kernels/random_op_gpu.h"
 
 #include <assert.h>
 #include <stdio.h>
@@ -36,170 +37,6 @@ namespace functor {
 
 typedef Eigen::GpuDevice GPUDevice;
 
-template <class Distribution, bool VariableSamplesPerOutput>
-struct FillPhiloxRandomKernel;
-
-template <typename T, int ElementCount>
-class SampleCopier {
- public:
-  inline __device__ void operator()(
-      T* buf, const tensorflow::random::Array<T, ElementCount>& array) const {
-#pragma unroll
-    for (int i = 0; i < ElementCount; i++) {
-      buf[i] = array[i];
-    }
-  }
-};
-
-template <>
-class SampleCopier<float, 4> {
- public:
-  // Copies the elements from the array to buf. buf must be 128-bit aligned,
-  // which is true for tensor data, and all offsets that are a multiple of the
-  // vector size (because the vectors are 128 bits long).
-  inline __device__ void operator()(
-      float* buf, const tensorflow::random::Array<float, 4>& array) const {
-    // NOTE(ringwalt): It's not safe to cast &array[0] to a float4, because they
-    // have 32-bit alignment vs 128-bit alignment. There seems to be no
-    // performance loss when assigning each element to a vector.
-    float4 vec;
-    vec.x = array[0];
-    vec.y = array[1];
-    vec.z = array[2];
-    vec.w = array[3];
-    float4* buf_vector = reinterpret_cast<float4*>(buf);
-    *buf_vector = vec;
-  }
-};
-
-template <>
-class SampleCopier<int32, 4> {
- public:
-  // Copies the elements from the array to buf. buf must be 128-bit aligned,
-  // which is true for tensor data, and all offsets that are a multiple of the
-  // vector size (because the vectors are 128 bits long).
-  inline __device__ void operator()(
-      int32* buf, const tensorflow::random::Array<int32, 4>& array) const {
-    int4 vec;
-    vec.x = array[0];
-    vec.y = array[1];
-    vec.z = array[2];
-    vec.w = array[3];
-    int4* buf_vector = reinterpret_cast<int4*>(buf);
-    *buf_vector = vec;
-  }
-};
-
-template <>
-class SampleCopier<double, 2> {
- public:
-  // Copies the elements from the array to buf. buf must be 128-bit aligned,
-  // which is true for tensor data, and all offsets that are a multiple of the
-  // vector size (because the vectors are 128 bits long).
-  inline __device__ void operator()(
-      double* buf, const tensorflow::random::Array<double, 2>& array) const {
-    double2 vec;
-    vec.x = array[0];
-    vec.y = array[1];
-    double2* buf_vector = reinterpret_cast<double2*>(buf);
-    *buf_vector = vec;
-  }
-};
-
-template <>
-class SampleCopier<int64, 2> {
- public:
-  // Copies the elements from the array to buf. buf must be 128-bit aligned,
-  // which is true for tensor data, and all offsets that are a multiple of the
-  // vector size (because the vectors are 128 bits long).
-  inline __device__ void operator()(
-      int64* buf, const tensorflow::random::Array<int64, 2>& array) const {
-    longlong2 vec;
-    vec.x = array[0];
-    vec.y = array[1];
-    longlong2* buf_vector = reinterpret_cast<longlong2*>(buf);
-    *buf_vector = vec;
-  }
-};
-
-// A cuda kernel to fill the data with random numbers from the specified
-// distribution. Each output takes a fixed number of samples.
-template <class Distribution>
-struct FillPhiloxRandomKernel<Distribution, false> {
-  typedef typename Distribution::ResultElementType T;
-  PHILOX_DEVICE_FUNC void Run(random::PhiloxRandom gen, T* data, int64 size,
-                              Distribution dist) {
-    const int kGroupSize = Distribution::kResultElementCount;
-
-    const int32 thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-    const int32 total_thread_count = gridDim.x * blockDim.x;
-    int32 offset = thread_id * kGroupSize;
-    gen.Skip(thread_id);
-
-    const SampleCopier<T, kGroupSize> copier;
-    while (offset + kGroupSize <= size) {
-      const typename Distribution::ResultType samples = dist(&gen);
-      copier(&data[offset], samples);
-
-      offset += total_thread_count * kGroupSize;
-      gen.Skip(total_thread_count - 1);
-    }
-
-    typename Distribution::ResultType samples = dist(&gen);
-    for (int i = 0; i < kGroupSize; ++i) {
-      if (offset >= size) {
-        return;
-      }
-      data[offset] = samples[i];
-      ++offset;
-    }
-  }
-};
-
-// A cuda kernel to fill the data with random numbers from the specified
-// distribution. Each output takes a variable number of samples.
-template <class Distribution>
-struct FillPhiloxRandomKernel<Distribution, true> {
-  typedef typename Distribution::ResultElementType T;
-  PHILOX_DEVICE_FUNC void Run(const random::PhiloxRandom& base_gen, T* data,
-                              int64 size, Distribution dist) {
-    using random::PhiloxRandom;
-    using random::SingleSampleAdapter;
-
-    const int kReservedSamplesPerOutput = 256;
-    const int kGroupSize = Distribution::kResultElementCount;
-    const int kGeneratorSkipPerOutputGroup = kGroupSize *
-                                             kReservedSamplesPerOutput /
-                                             PhiloxRandom::kResultElementCount;
-
-    const int32 thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-    const int32 total_thread_count = gridDim.x * blockDim.x;
-    int64 group_index = thread_id;
-    int64 offset = group_index * kGroupSize;
-
-    while (offset < size) {
-      // Since each output takes a variable number of samples, we need to
-      // realign the generator to the beginning for the current output group
-      PhiloxRandom gen = base_gen;
-      gen.Skip(group_index * kGeneratorSkipPerOutputGroup);
-      SingleSampleAdapter<PhiloxRandom> single_samples(&gen);
-
-      typename Distribution::ResultType samples = dist(&single_samples);
-
-      for (int i = 0; i < kGroupSize; ++i) {
-        if (offset >= size) {
-          return;
-        }
-        data[offset] = samples[i];
-        ++offset;
-      }
-
-      offset += (total_thread_count - 1) * kGroupSize;
-      group_index += total_thread_count;
-    }
-  }
-};
-
 // A simple launch pad to call the correct function templates to fill the data
 template <class Distribution>
 __global__ void __launch_bounds__(1024)
@@ -224,7 +61,7 @@ void FillPhiloxRandom<GPUDevice, Distribution>::operator()(
 
   FillPhiloxRandomKernelLaunch<Distribution>
       <<<num_blocks, block_size, 0, d.stream()>>>(gen, data, size, dist);
-};
+}
 
 // Explicit instantiation of the GPU distributions functors
 // clang-format off
diff --git a/tensorflow/core/kernels/random_op_gpu.h b/tensorflow/core/kernels/random_op_gpu.h
new file mode 100644
index 0000000000000000000000000000000000000000..e32c755d78259a76c0dbad16efb871e7dfc8216d
--- /dev/null
+++ b/tensorflow/core/kernels/random_op_gpu.h
@@ -0,0 +1,206 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_RANDOM_OP_GPU_H_
+#define TENSORFLOW_CORE_KERNELS_RANDOM_OP_GPU_H_
+
+#if defined(__CUDACC__)
+
+#include "tensorflow/core/lib/random/philox_random.h"
+#include "tensorflow/core/lib/random/random_distributions.h"
+
+namespace tensorflow {
+
+namespace functor {
+
+template <class Distribution, bool VariableSamplesPerOutput>
+struct FillPhiloxRandomKernel;
+
+template <class Distribution>
+struct FillPhiloxRandomKernel<Distribution, false> {
+  typedef typename Distribution::ResultElementType T;
+  PHILOX_DEVICE_FUNC void Run(random::PhiloxRandom gen, T* data, int64 size,
+                              Distribution dist);
+};
+
+template <class Distribution>
+struct FillPhiloxRandomKernel<Distribution, true> {
+  typedef typename Distribution::ResultElementType T;
+  PHILOX_DEVICE_FUNC void Run(const random::PhiloxRandom& base_gen, T* data,
+                              int64 size, Distribution dist);
+};
+
+template <typename T, int ElementCount>
+class SampleCopier {
+ public:
+  inline __device__ void operator()(
+      T* buf, const tensorflow::random::Array<T, ElementCount>& array) const {
+#pragma unroll
+    for (int i = 0; i < ElementCount; i++) {
+      buf[i] = array[i];
+    }
+  }
+};
+
+template <>
+class SampleCopier<float, 4> {
+ public:
+  // Copies the elements from the array to buf. buf must be 128-bit aligned,
+  // which is true for tensor data, and all offsets that are a multiple of the
+  // vector size (because the vectors are 128 bits long).
+  inline __device__ void operator()(
+      float* buf, const tensorflow::random::Array<float, 4>& array) const {
+    // NOTE(ringwalt): It's not safe to cast &array[0] to a float4, because they
+    // have 32-bit alignment vs 128-bit alignment. There seems to be no
+    // performance loss when assigning each element to a vector.
+    float4 vec;
+    vec.x = array[0];
+    vec.y = array[1];
+    vec.z = array[2];
+    vec.w = array[3];
+    float4* buf_vector = reinterpret_cast<float4*>(buf);
+    *buf_vector = vec;
+  }
+};
+
+template <>
+class SampleCopier<int32, 4> {
+ public:
+  // Copies the elements from the array to buf. buf must be 128-bit aligned,
+  // which is true for tensor data, and all offsets that are a multiple of the
+  // vector size (because the vectors are 128 bits long).
+  inline __device__ void operator()(
+      int32* buf, const tensorflow::random::Array<int32, 4>& array) const {
+    int4 vec;
+    vec.x = array[0];
+    vec.y = array[1];
+    vec.z = array[2];
+    vec.w = array[3];
+    int4* buf_vector = reinterpret_cast<int4*>(buf);
+    *buf_vector = vec;
+  }
+};
+
+template <>
+class SampleCopier<double, 2> {
+ public:
+  // Copies the elements from the array to buf. buf must be 128-bit aligned,
+  // which is true for tensor data, and all offsets that are a multiple of the
+  // vector size (because the vectors are 128 bits long).
+  inline __device__ void operator()(
+      double* buf, const tensorflow::random::Array<double, 2>& array) const {
+    double2 vec;
+    vec.x = array[0];
+    vec.y = array[1];
+    double2* buf_vector = reinterpret_cast<double2*>(buf);
+    *buf_vector = vec;
+  }
+};
+
+template <>
+class SampleCopier<int64, 2> {
+ public:
+  // Copies the elements from the array to buf. buf must be 128-bit aligned,
+  // which is true for tensor data, and all offsets that are a multiple of the
+  // vector size (because the vectors are 128 bits long).
+  inline __device__ void operator()(
+      int64* buf, const tensorflow::random::Array<int64, 2>& array) const {
+    longlong2 vec;
+    vec.x = array[0];
+    vec.y = array[1];
+    longlong2* buf_vector = reinterpret_cast<longlong2*>(buf);
+    *buf_vector = vec;
+  }
+};
+
+// A cuda kernel to fill the data with random numbers from the specified
+// distribution. Each output takes a fixed number of samples.
+template <class Distribution>
+PHILOX_DEVICE_FUNC void FillPhiloxRandomKernel<Distribution, false>::Run(
+    random::PhiloxRandom gen, T* data, int64 size, Distribution dist) {
+  const int kGroupSize = Distribution::kResultElementCount;
+
+  const int32 thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+  const int32 total_thread_count = gridDim.x * blockDim.x;
+  int32 offset = thread_id * kGroupSize;
+  gen.Skip(thread_id);
+
+  const SampleCopier<T, kGroupSize> copier;
+  while (offset + kGroupSize <= size) {
+    const typename Distribution::ResultType samples = dist(&gen);
+    copier(&data[offset], samples);
+
+    offset += total_thread_count * kGroupSize;
+    gen.Skip(total_thread_count - 1);
+  }
+
+  typename Distribution::ResultType samples = dist(&gen);
+  for (int i = 0; i < kGroupSize; ++i) {
+    if (offset >= size) {
+      return;
+    }
+    data[offset] = samples[i];
+    ++offset;
+  }
+}
+
+// A cuda kernel to fill the data with random numbers from the specified
+// distribution. Each output takes a variable number of samples.
+template <class Distribution>
+PHILOX_DEVICE_FUNC void FillPhiloxRandomKernel<Distribution, true>::Run(
+    const random::PhiloxRandom& base_gen, T* data, int64 size,
+    Distribution dist) {
+  using random::PhiloxRandom;
+  using random::SingleSampleAdapter;
+
+  const int kReservedSamplesPerOutput = 256;
+  const int kGroupSize = Distribution::kResultElementCount;
+  const int kGeneratorSkipPerOutputGroup = kGroupSize *
+                                           kReservedSamplesPerOutput /
+                                           PhiloxRandom::kResultElementCount;
+
+  const int32 thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+  const int32 total_thread_count = gridDim.x * blockDim.x;
+  int64 group_index = thread_id;
+  int64 offset = group_index * kGroupSize;
+
+  while (offset < size) {
+    // Since each output takes a variable number of samples, we need to
+    // realign the generator to the beginning for the current output group
+    PhiloxRandom gen = base_gen;
+    gen.Skip(group_index * kGeneratorSkipPerOutputGroup);
+    SingleSampleAdapter<PhiloxRandom> single_samples(&gen);
+
+    typename Distribution::ResultType samples = dist(&single_samples);
+
+    for (int i = 0; i < kGroupSize; ++i) {
+      if (offset >= size) {
+        return;
+      }
+      data[offset] = samples[i];
+      ++offset;
+    }
+
+    offset += (total_thread_count - 1) * kGroupSize;
+    group_index += total_thread_count;
+  }
+}
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // defined(__CUDACC__)
+
+#endif  // TENSORFLOW_CORE_KERNELS_RANDOM_OP_GPU_H_
diff --git a/tensorflow/core/kernels/random_shuffle_queue_op.cc b/tensorflow/core/kernels/random_shuffle_queue_op.cc
index 31e8ce944fef913fd241801f4931fcb4dfd2025c..02b9b022fdcb00b3d9f4f676be579abced5e720e 100644
--- a/tensorflow/core/kernels/random_shuffle_queue_op.cc
+++ b/tensorflow/core/kernels/random_shuffle_queue_op.cc
@@ -59,7 +59,7 @@ class RandomShuffleQueue : public TypedQueue<std::vector<PersistentTensor> > {
                       CallbackWithTuple callback) override;
   Status MatchesNodeDef(const NodeDef& node_def) override;
 
-  int32 size() override {
+  int32 size() const override {
     mutex_lock lock(mu_);
     return queues_[0].size();
   }
diff --git a/tensorflow/core/kernels/range_sampler.cc b/tensorflow/core/kernels/range_sampler.cc
index d682cd3b52db50575480c749b9b8e2633c4b8f07..c1457b34f1d552edc1fd821e3d7fff53b93df9f1 100644
--- a/tensorflow/core/kernels/range_sampler.cc
+++ b/tensorflow/core/kernels/range_sampler.cc
@@ -294,7 +294,7 @@ Status FixedUnigramSampler::LoadFromFile(Env* env, const string& vocab_file,
     // Skip entries that do not belong to this shard.
     if (word_id % num_shards_ == shard_) {
       float w = 0.0;
-      if (!strings::safe_strtof(cols.at(cols.size() - 1).c_str(), &w)) {
+      if (!strings::safe_strtof(cols.at(cols.size() - 1), &w)) {
         return errors::InvalidArgument("Wrong vocabulary format at line: ",
                                        line);
       }
diff --git a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
index e9cf36c62b966f5f91cf7764421f0c1ff6c131fc..e9900e58684b346aabf75349fdd44b93fbf1d497 100644
--- a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
+++ b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
@@ -40,6 +40,20 @@ namespace functor {
 
 typedef Eigen::GpuDevice GPUDevice;
 
+template <typename T>
+struct Square {
+  __host__ __device__ T operator()(const T& a) const {
+    return a * Eigen::numext::conj(a);
+  }
+};
+
+template <typename T>
+struct Sqrt {
+  __host__ __device__ T operator()(const T& a) const {
+    return Eigen::numext::sqrt(a);
+  }
+};
+
 template <typename T>
 struct Sum {
   __host__ __device__ T operator()(const T& a, const T& b) const {
@@ -500,7 +514,7 @@ void LaunchScalarReduction(OpKernelContext* ctx, OUT_T out, IN_T in,
     BlockReduceKernel<IN_T, OUT_T, num_threads>
         <<<num_blocks, num_threads, 0, cu_stream>>>(in, out, in_size, op, init);
     return;
-  } else if (in_size <= 1 << 19) {
+  } else if (in_size <= 1 << 18) {
     const int num_threads = 256;
     const int num_blocks = std::min(32, Eigen::divup(in_size, num_threads));
     // it seems like tailoring this to the GPU
@@ -539,7 +553,7 @@ void LaunchScalarReduction(OpKernelContext* ctx, OUT_T out, IN_T in,
 
     OP_REQUIRES(
         ctx, success == 0,
-        errors::Internal("CUB reduce error", cudaGetErrorString(success)));
+        errors::Internal("CUB reduce error ", cudaGetErrorString(success)));
   };
 
   reduce(nullptr);  // Get required amount of temp storage.
@@ -884,6 +898,31 @@ struct ReduceFunctor<GPUDevice, Eigen::internal::SumReducer<T>> {
   }
 };
 
+// TODO(rmlarsen): Specialize for float16.
+template <typename T>
+struct ReduceFunctor<GPUDevice, functor::EuclideanNormReducer<T>> {
+  template <typename OUT_T, typename IN_T, typename ReductionAxes>
+  static void Reduce(OpKernelContext* ctx, OUT_T out, IN_T in,
+                     const ReductionAxes& reduction_axes,
+                     const functor::EuclideanNormReducer<T>& reducer) {
+    typedef cub::TransformInputIterator<T, Square<T>, T*> inputIterType;
+    inputIterType input_itr((T*)in.data(), Square<T>());
+    typedef TransformOutputIterator<T, T, Sqrt<T>> outputIterType;
+    outputIterType output_itr((T*)out.data(), Sqrt<T>());
+    ReduceImpl<T, Sum<T>, outputIterType, inputIterType, ReductionAxes>(
+        ctx, output_itr, input_itr, in.rank(), in.dimension(0),
+        in.rank() >= 2 ? in.dimension(1) : 1,
+        in.rank() >= 3 ? in.dimension(2) : 1, out.rank(), reduction_axes,
+        Sum<T>());
+  }
+
+  template <typename OUT_T>
+  static void FillIdentity(const GPUDevice& d, OUT_T out,
+                           const functor::EuclideanNormReducer<T>& reducer) {
+    FillIdentityEigenImpl(d, To32Bit(out), reducer);
+  }
+};
+
 template <typename T>
 struct ReduceFunctor<GPUDevice, functor::MeanReducer<T>> {
   template <typename OUT_T, typename IN_T, typename ReductionAxes>
diff --git a/tensorflow/core/kernels/reduction_ops.h b/tensorflow/core/kernels/reduction_ops.h
index 2331599b72f46df7a34e9553d5bd41a7613409da..0a1568bdc2521addb954bdd472164922e4f7d0f5 100644
--- a/tensorflow/core/kernels/reduction_ops.h
+++ b/tensorflow/core/kernels/reduction_ops.h
@@ -33,6 +33,12 @@ struct MeanReducer {
   Scalar initialize() const { return Scalar(0); }
 };
 
+// Dummy class used for template specialization for l2-norm reduction.
+template <typename Scalar>
+struct EuclideanNormReducer {
+  Scalar initialize() const { return Scalar(0); }
+};
+
 template <typename Device, typename OUT_T, typename IN_T,
           typename ReductionAxes, typename Reducer>
 struct ReduceEigenImpl {
@@ -56,6 +62,39 @@ struct ReduceEigenImpl<Device, OUT_T, IN_T, ReductionAxes,
   }
 };
 
+// TODO(rmlarsen): Refactor this such that taking the sqrt can be optional
+// controlled by an attribute.
+template <typename Device, typename OUT_T, typename IN_T,
+          typename ReductionAxes, typename Scalar>
+struct ReduceEigenImpl<Device, OUT_T, IN_T, ReductionAxes,
+                       functor::EuclideanNormReducer<Scalar>> {
+  void operator()(const Device& d, OUT_T out, IN_T in,
+                  const ReductionAxes& reduction_axes,
+                  const functor::EuclideanNormReducer<Scalar>& reducer) {
+    static_assert(std::is_same<Scalar, typename OUT_T::Scalar>::value, "");
+    Eigen::internal::SumReducer<Scalar> sum_reducer;
+    out.device(d) =
+        (in * in.conjugate()).reduce(reduction_axes, sum_reducer).sqrt();
+  }
+};
+
+template <typename Device, typename OUT_T, typename IN_T,
+          typename ReductionAxes>
+struct ReduceEigenImpl<Device, OUT_T, IN_T, ReductionAxes,
+                       functor::EuclideanNormReducer<bfloat16>> {
+  void operator()(const Device& d, OUT_T out, IN_T in,
+                  const ReductionAxes& reduction_axes,
+                  const functor::EuclideanNormReducer<bfloat16>& reducer) {
+    static_assert(std::is_same<bfloat16, typename OUT_T::Scalar>::value, "");
+    Eigen::internal::SumReducer<float> sum_reducer;
+    auto in_as_float = in.template cast<float>();
+    out.device(d) = (in_as_float * in_as_float.conjugate())
+                        .reduce(reduction_axes, sum_reducer)
+                        .sqrt()
+                        .template cast<bfloat16>();
+  }
+};
+
 // For most reducers, the identity is Reducer::initialize()
 template <typename Reducer>
 struct Identity {
diff --git a/tensorflow/core/kernels/reduction_ops_euclidean.cc b/tensorflow/core/kernels/reduction_ops_euclidean.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9f4bf50e7ca0ecf8506b260829cae2127305cadb
--- /dev/null
+++ b/tensorflow/core/kernels/reduction_ops_euclidean.cc
@@ -0,0 +1,81 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/reduction_ops_common.h"
+
+namespace tensorflow {
+
+#define REGISTER_CPU_KERNELS(type)                                           \
+  REGISTER_KERNEL_BUILDER(Name("EuclideanNorm")                              \
+                              .Device(DEVICE_CPU)                            \
+                              .TypeConstraint<type>("T")                     \
+                              .TypeConstraint<int32>("Tidx"),                \
+                          ReductionOp<CPUDevice, type, int32,                \
+                                      functor::EuclideanNormReducer<type>>); \
+  REGISTER_KERNEL_BUILDER(Name("EuclideanNorm")                              \
+                              .Device(DEVICE_CPU)                            \
+                              .TypeConstraint<type>("T")                     \
+                              .TypeConstraint<int64>("Tidx"),                \
+                          ReductionOp<CPUDevice, type, int64,                \
+                                      functor::EuclideanNormReducer<type>>);
+TF_CALL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
+#undef REGISTER_CPU_KERNELS
+
+#if GOOGLE_CUDA
+
+#define REGISTER_GPU_KERNELS(type)                                           \
+  REGISTER_KERNEL_BUILDER(Name("EuclideanNorm")                              \
+                              .Device(DEVICE_GPU)                            \
+                              .TypeConstraint<type>("T")                     \
+                              .TypeConstraint<int32>("Tidx")                 \
+                              .HostMemory("reduction_indices"),              \
+                          ReductionOp<GPUDevice, type, int32,                \
+                                      functor::EuclideanNormReducer<type>>); \
+  REGISTER_KERNEL_BUILDER(Name("EuclideanNorm")                              \
+                              .Device(DEVICE_GPU)                            \
+                              .TypeConstraint<type>("T")                     \
+                              .TypeConstraint<int64>("Tidx")                 \
+                              .HostMemory("reduction_indices"),              \
+                          ReductionOp<GPUDevice, type, int64,                \
+                                      functor::EuclideanNormReducer<type>>);
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
+TF_CALL_complex64(REGISTER_GPU_KERNELS);
+TF_CALL_complex128(REGISTER_GPU_KERNELS);
+#undef REGISTER_GPU_KERNELS
+
+#endif
+
+#ifdef TENSORFLOW_USE_SYCL
+#define REGISTER_SYCL_KERNELS(type)                                          \
+  REGISTER_KERNEL_BUILDER(Name("EuclideanNorm")                              \
+                              .Device(DEVICE_SYCL)                           \
+                              .TypeConstraint<type>("T")                     \
+                              .TypeConstraint<int32>("Tidx")                 \
+                              .HostMemory("reduction_indices"),              \
+                          ReductionOp<SYCLDevice, type, int32,               \
+                                      functor::EuclideanNormReducer<type>>); \
+  REGISTER_KERNEL_BUILDER(Name("EuclideanNorm")                              \
+                              .Device(DEVICE_SYCL)                           \
+                              .TypeConstraint<type>("T")                     \
+                              .TypeConstraint<int64>("Tidx")                 \
+                              .HostMemory("reduction_indices"),              \
+                          ReductionOp<SYCLDevice, type, int64,               \
+                                      functor::EuclideanNormReducer<type>>);
+REGISTER_SYCL_KERNELS(float);
+REGISTER_SYCL_KERNELS(double);
+#undef REGISTER_SYCL_KERNELS
+#endif  // TENSORFLOW_USE_SYCL
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/reduction_ops_gpu_complex128.cu.cc b/tensorflow/core/kernels/reduction_ops_gpu_complex128.cu.cc
index c44a40b3b38f5a37574d0d81b7b67adcf27451e1..662f24d9054ab2cfd312ea933f2a7769c6e3983b 100644
--- a/tensorflow/core/kernels/reduction_ops_gpu_complex128.cu.cc
+++ b/tensorflow/core/kernels/reduction_ops_gpu_complex128.cu.cc
@@ -53,6 +53,7 @@ typedef TTypes<float>::Tensor::Index Index;
 
 DEFINE_FOR_TYPE_AND_R(complex128, Eigen::internal::SumReducer<complex128>);
 DEFINE_FOR_TYPE_AND_R(complex128, functor::MeanReducer<complex128>);
+DEFINE_FOR_TYPE_AND_R(complex128, functor::EuclideanNormReducer<complex128>);
 DEFINE_FOR_TYPE_AND_R(complex128, Eigen::internal::ProdReducer<complex128>);
 #undef DEFINE_FOR_TYPE_AND_R
 #undef DEFINE
diff --git a/tensorflow/core/kernels/reduction_ops_gpu_complex64.cu.cc b/tensorflow/core/kernels/reduction_ops_gpu_complex64.cu.cc
index 1921130ac043d9d1bfdea415c59aafcedcc31ef3..8ab2a6e13e52b0c92bfde2a2c6acf4423dc5976b 100644
--- a/tensorflow/core/kernels/reduction_ops_gpu_complex64.cu.cc
+++ b/tensorflow/core/kernels/reduction_ops_gpu_complex64.cu.cc
@@ -53,6 +53,7 @@ typedef TTypes<float>::Tensor::Index Index;
 
 DEFINE_FOR_TYPE_AND_R(complex64, Eigen::internal::SumReducer<complex64>);
 DEFINE_FOR_TYPE_AND_R(complex64, functor::MeanReducer<complex64>);
+DEFINE_FOR_TYPE_AND_R(complex64, functor::EuclideanNormReducer<complex64>);
 DEFINE_FOR_TYPE_AND_R(complex64, Eigen::internal::ProdReducer<complex64>);
 #undef DEFINE_FOR_TYPE_AND_R
 #undef DEFINE
diff --git a/tensorflow/core/kernels/reduction_ops_gpu_double.cu.cc b/tensorflow/core/kernels/reduction_ops_gpu_double.cu.cc
index 119f726b929bd9c599e26684fede9890efceb2f2..c492308a9162596235e8d07e9b376abbd89c2007 100644
--- a/tensorflow/core/kernels/reduction_ops_gpu_double.cu.cc
+++ b/tensorflow/core/kernels/reduction_ops_gpu_double.cu.cc
@@ -51,11 +51,12 @@ typedef TTypes<float>::Tensor::Index Index;
   DEFINE(T, R, 3, 2);               \
   DEFINE_IDENTITY(T, R)
 
-#define DEFINE_FOR_ALL_REDUCERS(T)                          \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::SumReducer<T>); \
-  DEFINE_FOR_TYPE_AND_R(T, functor::MeanReducer<T>);        \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MinReducer<T>); \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MaxReducer<T>); \
+#define DEFINE_FOR_ALL_REDUCERS(T)                            \
+  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::SumReducer<T>);   \
+  DEFINE_FOR_TYPE_AND_R(T, functor::MeanReducer<T>);          \
+  DEFINE_FOR_TYPE_AND_R(T, functor::EuclideanNormReducer<T>); \
+  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MinReducer<T>);   \
+  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MaxReducer<T>);   \
   DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::ProdReducer<T>)
 
 DEFINE_FOR_ALL_REDUCERS(double);
diff --git a/tensorflow/core/kernels/reduction_ops_gpu_float.cu.cc b/tensorflow/core/kernels/reduction_ops_gpu_float.cu.cc
index 70ba4abac48bcfe10d577a120cf08fdd8650f367..b006311c125c1e8e86d499ce125aa7cd817f9d5f 100644
--- a/tensorflow/core/kernels/reduction_ops_gpu_float.cu.cc
+++ b/tensorflow/core/kernels/reduction_ops_gpu_float.cu.cc
@@ -51,11 +51,12 @@ typedef TTypes<float>::Tensor::Index Index;
   DEFINE(T, R, 3, 2);               \
   DEFINE_IDENTITY(T, R)
 
-#define DEFINE_FOR_ALL_REDUCERS(T)                          \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::SumReducer<T>); \
-  DEFINE_FOR_TYPE_AND_R(T, functor::MeanReducer<T>);        \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MinReducer<T>); \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MaxReducer<T>); \
+#define DEFINE_FOR_ALL_REDUCERS(T)                            \
+  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::SumReducer<T>);   \
+  DEFINE_FOR_TYPE_AND_R(T, functor::MeanReducer<T>);          \
+  DEFINE_FOR_TYPE_AND_R(T, functor::EuclideanNormReducer<T>); \
+  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MinReducer<T>);   \
+  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MaxReducer<T>);   \
   DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::ProdReducer<T>)
 
 DEFINE_FOR_ALL_REDUCERS(float);
diff --git a/tensorflow/core/kernels/reduction_ops_gpu_int.cu.cc b/tensorflow/core/kernels/reduction_ops_gpu_int.cu.cc
index 82f6d7df952fcd8b0aaa3561efd4a4bca93e4dce..91a33b92cb6663310d6cfee9d20127b960e6a11b 100644
--- a/tensorflow/core/kernels/reduction_ops_gpu_int.cu.cc
+++ b/tensorflow/core/kernels/reduction_ops_gpu_int.cu.cc
@@ -51,11 +51,12 @@ typedef TTypes<float>::Tensor::Index Index;
   DEFINE(T, R, 3, 2);               \
   DEFINE_IDENTITY(T, R)
 
-#define DEFINE_FOR_ALL_REDUCERS(T)                          \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::SumReducer<T>); \
-  DEFINE_FOR_TYPE_AND_R(T, functor::MeanReducer<T>);        \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MinReducer<T>); \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MaxReducer<T>); \
+#define DEFINE_FOR_ALL_REDUCERS(T)                            \
+  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::SumReducer<T>);   \
+  DEFINE_FOR_TYPE_AND_R(T, functor::MeanReducer<T>);          \
+  DEFINE_FOR_TYPE_AND_R(T, functor::EuclideanNormReducer<T>); \
+  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MinReducer<T>);   \
+  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::MaxReducer<T>);   \
   DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::ProdReducer<T>)
 
 DEFINE_FOR_ALL_REDUCERS(int32);
diff --git a/tensorflow/core/kernels/reduction_ops_half_mean_sum.cu.cc b/tensorflow/core/kernels/reduction_ops_half_mean_sum.cu.cc
index db050fdea38bd6db58424da72ff75e79e9151a09..f33d504e25a202c5ce229276611c0958f97f8eee 100644
--- a/tensorflow/core/kernels/reduction_ops_half_mean_sum.cu.cc
+++ b/tensorflow/core/kernels/reduction_ops_half_mean_sum.cu.cc
@@ -51,8 +51,9 @@ typedef TTypes<float>::Tensor::Index Index;
   DEFINE(T, R, 3, 2);               \
   DEFINE_IDENTITY(T, R)
 
-#define DEFINE_FOR_ALL_REDUCERS(T)                          \
-  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::SumReducer<T>); \
+#define DEFINE_FOR_ALL_REDUCERS(T)                            \
+  DEFINE_FOR_TYPE_AND_R(T, Eigen::internal::SumReducer<T>);   \
+  DEFINE_FOR_TYPE_AND_R(T, functor::EuclideanNormReducer<T>); \
   DEFINE_FOR_TYPE_AND_R(T, functor::MeanReducer<T>);
 
 DEFINE_FOR_ALL_REDUCERS(Eigen::half);
diff --git a/tensorflow/core/kernels/reduction_ops_test.cc b/tensorflow/core/kernels/reduction_ops_test.cc
index fe8ea59f1be521166d0e42295e79d1bb5a242750..359d7dbeca58be8643e51a1ad2248ccd57f67e79 100644
--- a/tensorflow/core/kernels/reduction_ops_test.cc
+++ b/tensorflow/core/kernels/reduction_ops_test.cc
@@ -164,6 +164,11 @@ static void BM_Mean2DToScalarGPU(int iters, int num_x, int num_y) {
 }
 BENCHMARK(BM_Mean2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192);
 
+static void BM_EuclideanNorm2DToScalarGPU(int iters, int num_x, int num_y) {
+  ReduceToScalar<float>(iters, "gpu", "EuclideanNorm", num_x, num_y);
+}
+BENCHMARK(BM_EuclideanNorm2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192);
+
 static void BM_Max2DToScalarGPU(int iters, int num_x, int num_y) {
   ReduceToScalar<float>(iters, "gpu", "Max", num_x, num_y);
 }
diff --git a/tensorflow/core/kernels/resize_bilinear_op_test.cc b/tensorflow/core/kernels/resize_bilinear_op_test.cc
index 6d57892828593e30a0da5ea90b01b6742a71019f..f2062915b8470e8cc6f6e0897ae579639d6fee4c 100644
--- a/tensorflow/core/kernels/resize_bilinear_op_test.cc
+++ b/tensorflow/core/kernels/resize_bilinear_op_test.cc
@@ -122,7 +122,7 @@ class ResizeBilinearOpTest : public OpsTestBase {
         TensorShape({batch_size, output_width, output_height, channels})));
     ResizeBilinearBaseline(input->tensor<float, 4>(),
                            expected->tensor<float, 4>());
-    test::ExpectTensorEqual<float>(*expected, *GetOutput(0));
+    test::ExpectClose(*expected, *GetOutput(0));
   }
 
   void RunManyRandomTests(int channels) {
@@ -177,7 +177,7 @@ TEST_F(ResizeBilinearOpTest, TestBilinear2x2To1x1) {
   // original input. In this case, we choose the top/left most pixel.
   Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 1, 1}));
   test::FillValues<float>(&expected, {1.0});
-  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+  test::ExpectClose(expected, *GetOutput(0));
 }
 
 TEST_F(ResizeBilinearOpTest, TestBilinearRandom2x2To1x1) {
@@ -194,7 +194,7 @@ TEST_F(ResizeBilinearOpTest, TestBilinearRandom2x2To1x1) {
   ResizeBilinearBaseline(input->tensor<float, 4>(),
                          expected->tensor<float, 4>());
   EXPECT_EQ(input->flat<float>()(0), output->flat<float>()(0));
-  test::ExpectTensorEqual<float>(*expected, *output);
+  test::ExpectClose(*expected, *output);
 }
 
 TEST_F(ResizeBilinearOpAlignCornersTest, TestBilinearAlignCorners2x2To1x1) {
@@ -209,7 +209,7 @@ TEST_F(ResizeBilinearOpAlignCornersTest, TestBilinearAlignCorners2x2To1x1) {
   // original input. In this case, we choose the top/left most pixel.
   Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 1, 1}));
   test::FillValues<float>(&expected, {1.0});
-  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+  test::ExpectClose(expected, *GetOutput(0));
 }
 
 TEST_F(ResizeBilinearOpTest, TestBilinear2x2To3x3) {
@@ -229,7 +229,7 @@ TEST_F(ResizeBilinearOpTest, TestBilinear2x2To3x3) {
      3,        11.0f / 3, 4});
 
   // clang-format on
-  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+  test::ExpectClose(expected, *GetOutput(0));
 }
 
 TEST_F(ResizeBilinearOpAlignCornersTest, TestBilinearAlignCorners2x2To3x3) {
@@ -252,7 +252,7 @@ TEST_F(ResizeBilinearOpAlignCornersTest, TestBilinearAlignCorners2x2To3x3) {
      3,  3.5,  4});
 
   // clang-format on
-  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+  test::ExpectClose(expected, *GetOutput(0));
 }
 
 TEST_F(ResizeBilinearOpTest, TestBilinear3x3To2x2) {
@@ -273,7 +273,7 @@ TEST_F(ResizeBilinearOpTest, TestBilinear3x3To2x2) {
      5.5,   7});
 
   // clang-format on
-  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+  test::ExpectClose(expected, *GetOutput(0));
 }
 
 TEST_F(ResizeBilinearOpAlignCornersTest, TestBilinearAlignCorners3x3To2x2) {
@@ -294,7 +294,7 @@ TEST_F(ResizeBilinearOpAlignCornersTest, TestBilinearAlignCorners3x3To2x2) {
      7,  9});
 
   // clang-format on
-  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+  test::ExpectClose(expected, *GetOutput(0));
 }
 
 TEST_F(ResizeBilinearOpTest, TestBilinear3x3To4x4) {
@@ -316,7 +316,7 @@ TEST_F(ResizeBilinearOpTest, TestBilinear3x3To4x4) {
      7,  7.75, 8.5, 9});
 
   // clang-format on
-  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+  test::ExpectClose(expected, *GetOutput(0));
 }
 
 TEST_F(ResizeBilinearOpTest, TestBilinear4x4To3x3) {
@@ -340,7 +340,7 @@ TEST_F(ResizeBilinearOpTest, TestBilinear4x4To3x3) {
      35.0f/3, 39.0f/3, 43.0f/3});
 
   // clang-format on
-  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+  test::ExpectClose(expected, *GetOutput(0));
 }
 
 TEST_F(ResizeBilinearOpAlignCornersTest, TestBilinearAlignCorners4x4To3x3) {
@@ -364,7 +364,7 @@ TEST_F(ResizeBilinearOpAlignCornersTest, TestBilinearAlignCorners4x4To3x3) {
      13, 14.5, 16});
 
   // clang-format on
-  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+  test::ExpectClose(expected, *GetOutput(0));
 }
 
 TEST_F(ResizeBilinearOpTest, TestBilinear2x2To3x3Batch2) {
@@ -384,7 +384,7 @@ TEST_F(ResizeBilinearOpTest, TestBilinear2x2To3x3Batch2) {
      1, 5.0f/3, 2, 7.0f/3, 3, 10.0f/3, 3, 11.0f/3, 4
     });
   // clang-format on
-  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+  test::ExpectClose(expected, *GetOutput(0));
 }
 
 TEST_F(ResizeBilinearOpTest, TestBilinear2x2x2To3x3x2) {
@@ -408,7 +408,7 @@ TEST_F(ResizeBilinearOpTest, TestBilinear2x2x2To3x3x2) {
       4,       -4
     });
   // clang-format on
-  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+  test::ExpectClose(expected, *GetOutput(0));
 }
 
 TEST_F(ResizeBilinearOpTest, TestBilinear2x2To4x4) {
@@ -427,7 +427,7 @@ TEST_F(ResizeBilinearOpTest, TestBilinear2x2To4x4) {
      3,  3.5, 4, 4,
      3,  3.5, 4, 4});
   // clang-format on
-  test::ExpectTensorEqual<float>(expected, *GetOutput(0));
+  test::ExpectClose(expected, *GetOutput(0));
 }
 
 // similar_size case
diff --git a/tensorflow/core/kernels/resource_variable_ops.cc b/tensorflow/core/kernels/resource_variable_ops.cc
index 170b08b4b7f6c8a6842dd12ad7389900b2d83b86..8e3c52ba5b5c96846f013f9ef5e465872dc3adf8 100644
--- a/tensorflow/core/kernels/resource_variable_ops.cc
+++ b/tensorflow/core/kernels/resource_variable_ops.cc
@@ -55,12 +55,13 @@ limitations under the License.
 #include <vector>
 
 #include "absl/strings/str_join.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/variant_op_registry.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/dense_update_functor.h"
 #include "tensorflow/core/kernels/gather_functor.h"
 #include "tensorflow/core/kernels/resource_variable_ops.h"
@@ -84,6 +85,47 @@ ReadVariableOp::ReadVariableOp(OpKernelConstruction* c) : OpKernel(c) {
   OP_REQUIRES_OK(c, c->GetAttr("dtype", &dtype_));
 }
 
+namespace {
+Status CopyVariable(int output_idx, OpKernelContext* ctx, const Tensor* t) {
+  Tensor* output;
+  Notification n;
+  Status status;
+  AllocatorAttributes attr;
+  if (t->dtype() == DT_VARIANT) {
+    attr.set_on_host(true);
+  }
+  TF_RETURN_IF_ERROR(
+      ctx->allocate_output(output_idx, t->shape(), &output, attr));
+  if (t->dtype() == DT_VARIANT) {
+    output->flat<Variant>() = t->flat<Variant>();
+  } else if (ctx->op_device_context() != nullptr) {
+    // TODO(apassos): remove the down_cast by just returning Device* from
+    // OpKernelContext
+    Device* device = static_cast<Device*>(ctx->device());
+    ctx->op_device_context()->CopyTensorInSameDevice(
+        t, device, output, [&n, &status](const Status& s) {
+          status = s;
+          n.Notify();
+        });
+    n.WaitForNotification();
+    return status;
+  } else {
+    switch (t->dtype()) {
+#define HANDLER(type)                       \
+  case DataTypeToEnum<type>::value:         \
+    output->flat<type>() = t->flat<type>(); \
+    break;
+      TF_CALL_ALL_TYPES(HANDLER);
+#undef HANDLER
+      default:
+        return errors::Internal("Unsupported dtype", t->dtype());
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
 void ReadVariableOp::Compute(OpKernelContext* ctx) {
   Var* variable = nullptr;
   const ResourceHandle& handle = HandleFromInput(ctx, 0);
@@ -100,12 +142,16 @@ void ReadVariableOp::Compute(OpKernelContext* ctx) {
   // holding a shared lock to guarantee ordering of reads and
   // writes.
   tf_shared_lock ml(*variable->mu());
-  const Tensor& t = *variable->tensor();
-  OP_REQUIRES(ctx, dtype_ == t.dtype(),
+  const Tensor* t = variable->tensor();
+  OP_REQUIRES(ctx, dtype_ == t->dtype(),
               errors::InvalidArgument(
                   "Trying to read variable with wrong dtype. Expected ",
-                  DataTypeString(dtype_), " got ", DataTypeString(t.dtype())));
-  ctx->set_output(0, t);
+                  DataTypeString(dtype_), " got ", DataTypeString(t->dtype())));
+  if (variable->copy_on_read_mode.load()) {
+    OP_REQUIRES_OK(ctx, CopyVariable(0, ctx, t));
+  } else {
+    ctx->set_output(0, *t);
+  }
 }
 
 ReadVariablesOp::ReadVariablesOp(OpKernelConstruction* c) : OpKernel(c) {
@@ -146,14 +192,18 @@ void ReadVariablesOp::Compute(OpKernelContext* ctx) {
     // holding a shared lock to guarantee ordering of reads and
     // writes.
     tf_shared_lock ml(*variables[i]->mu());
-    const Tensor& t = *variables[i]->tensor();
-    OP_REQUIRES(ctx, dtypes_[i] == t.dtype(),
+    OP_REQUIRES(ctx, dtypes_[i] == variables[i]->tensor()->dtype(),
                 errors::InvalidArgument(
                     "Trying to read variable ", handles[i]->name(),
                     " from Container: ", handles[i]->container(),
                     " with wrong dtype. Expected ", DataTypeString(dtypes_[i]),
-                    " got ", DataTypeString(t.dtype())));
-    ctx->set_output(i, t);
+                    " got ", DataTypeString(variables[i]->tensor()->dtype())));
+    if (variables[i]->copy_on_read_mode.load()) {
+      OP_REQUIRES_OK(ctx, CopyVariable(i, ctx, variables[i]->tensor()));
+    } else {
+      const Tensor& t = *variables[i]->tensor();
+      ctx->set_output(i, t);
+    }
   }
 }
 
@@ -308,8 +358,23 @@ class AssignVariableOp : public OpKernel {
                     "Trying to assign variable with wrong dtype. Expected ",
                     DataTypeString(variable->tensor()->dtype()), " got ",
                     DataTypeString(dtype_)));
+    if (variable->copy_on_read_mode.load()) {
+      PersistentTensor unused;
+      Tensor* tmp;
+      AllocatorAttributes attr;
+      attr.set_gpu_compatible(true);
+      attr.set_nic_compatible(true);
+      OP_REQUIRES_OK(context,
+                     context->allocate_persistent(value.dtype(), value.shape(),
+                                                  &unused, &tmp, attr));
+      functor::DenseUpdate<Device, T, ASSIGN> copy_functor;
+      copy_functor(context->eigen_device<Device>(), tmp->flat<T>(),
+                   value.flat<T>());
+      *variable->tensor() = *tmp;
+    } else {
+      *variable->tensor() = value;
+    }
     variable->is_initialized = true;
-    *variable->tensor() = value;
   }
 
  private:
@@ -442,8 +507,9 @@ class AssignUpdateVariableOp : public OpKernel {
                                         " using a Tensor with shape ",
                                         value.shape().DebugString(),
                                         ", shapes must be equal."));
-    OP_REQUIRES_OK(context,
-                   PrepareToUpdateVariable<Device, T>(context, var_tensor));
+    OP_REQUIRES_OK(
+        context, PrepareToUpdateVariable<Device, T>(
+                     context, var_tensor, variable->copy_on_read_mode.load()));
     functor::DenseUpdate<Device, T, Op> update_functor;
     update_functor(context->eigen_device<Device>(), var_tensor->flat<T>(),
                    value.flat<T>());
@@ -524,6 +590,7 @@ class ResourceGatherOp : public OpKernel {
     Var* v = nullptr;
     OP_REQUIRES_OK(c, LookupResource(c, HandleFromInput(c, 0), &v));
     core::ScopedUnref su(v);
+    OP_REQUIRES_OK(c, EnsureSparseVariableAccess<Device, T>(c, v));
     // NOTE: We hold the lock for the whole gather operation instead
     // of increasing the reference count of v->tensor() to avoid a
     // situation where a write to the same variable will see a
@@ -639,9 +706,9 @@ class ResourceScatterUpdateOp : public OpKernel {
     Var* v = nullptr;
     OP_REQUIRES_OK(c, LookupResource(c, HandleFromInput(c, 0), &v));
     core::ScopedUnref unref_v(v);
-    mutex_lock ml(*v->mu());
+    OP_REQUIRES_OK(c, EnsureSparseVariableAccess<Device, T>(c, v));
+    tf_shared_lock ml(*v->mu());
     Tensor* params = v->tensor();
-    OP_REQUIRES_OK(c, PrepareToUpdateVariable<Device, T>(c, params));
     const Tensor& indices = c->input(1);
     const Tensor& updates = c->input(2);
 
diff --git a/tensorflow/core/kernels/reverse_op.cc b/tensorflow/core/kernels/reverse_op.cc
index 1c4d0bc1ae9934dbfb8718dfa05202b1d7b38edc..aa2434da03f5fd76ad409121382e6ce93a2e65df 100644
--- a/tensorflow/core/kernels/reverse_op.cc
+++ b/tensorflow/core/kernels/reverse_op.cc
@@ -19,13 +19,13 @@ limitations under the License.
 #include "tensorflow/core/kernels/reverse_op.h"
 #include <memory>
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/type_traits.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/util/work_sharder.h"
diff --git a/tensorflow/core/kernels/roll_op.cc b/tensorflow/core/kernels/roll_op.cc
index efa30438d922fa070747bb4269451cc54f574887..494a846ff562e505a569de19418d371ea8b4f80c 100644
--- a/tensorflow/core/kernels/roll_op.cc
+++ b/tensorflow/core/kernels/roll_op.cc
@@ -13,13 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/register_types_traits.h"
 #include "tensorflow/core/framework/shape_inference.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/work_sharder.h"
diff --git a/tensorflow/core/kernels/sample_distorted_bounding_box_op.cc b/tensorflow/core/kernels/sample_distorted_bounding_box_op.cc
index c0fde8042e816c325475a36129fb71630f0ca7c6..0e68af867bdf753ec70ff9ff2c978d0b95ea5c52 100644
--- a/tensorflow/core/kernels/sample_distorted_bounding_box_op.cc
+++ b/tensorflow/core/kernels/sample_distorted_bounding_box_op.cc
@@ -14,11 +14,11 @@ limitations under the License.
 ==============================================================================*/
 // See docs in ../ops/image_ops.cc.
 #include <math.h>
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
 #include "tensorflow/core/util/guarded_philox_random.h"
 
diff --git a/tensorflow/core/kernels/sampling_kernels.cc b/tensorflow/core/kernels/sampling_kernels.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a18379ddbbbafeff3730dab1b77a027c931373eb
--- /dev/null
+++ b/tensorflow/core/kernels/sampling_kernels.cc
@@ -0,0 +1,38 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/sampling_kernels.h"
+#include <string>
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+
+namespace tensorflow {
+namespace functor {
+
+SamplingKernelType SamplingKernelTypeFromString(const StringPiece str) {
+  const string lower_case = str_util::Lowercase(str);
+  if (lower_case == "lanczos1") return Lanczos1Kernel;
+  if (lower_case == "lanczos3") return Lanczos3Kernel;
+  if (lower_case == "lanczos5") return Lanczos5Kernel;
+  if (lower_case == "gaussian") return GaussianKernel;
+  if (lower_case == "box") return BoxKernel;
+  if (lower_case == "triangle") return TriangleKernel;
+  if (lower_case == "keyscubic") return KeysCubicKernel;
+  if (lower_case == "mitchellcubic") return MitchellCubicKernel;
+  return SamplingKernelTypeEnd;
+}
+
+}  // namespace functor
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/sampling_kernels.h b/tensorflow/core/kernels/sampling_kernels.h
new file mode 100644
index 0000000000000000000000000000000000000000..a03a2c88db44c350c2fc2bc71ed7cd7db29f5ac6
--- /dev/null
+++ b/tensorflow/core/kernels/sampling_kernels.h
@@ -0,0 +1,192 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_SAMPLING_KERNELS_H_
+#define TENSORFLOW_CORE_KERNELS_SAMPLING_KERNELS_H_
+
+#include <cmath>
+
+#include "tensorflow/core/lib/core/stringpiece.h"
+
+namespace tensorflow {
+namespace functor {
+// Defines functions for different types of sampling kernels.
+enum SamplingKernelType {
+  // Lanczos kernel with radius 1.  Aliases but does not ring.
+  Lanczos1Kernel,
+
+  // Lanczos kernel with radius 3.  High-quality practical filter but may have
+  // some ringing especially on synthetic images.
+  Lanczos3Kernel,
+
+  // Lanczos kernel with radius 5.  Very-high-quality filter but may have
+  // stronger ringing.
+  Lanczos5Kernel,
+
+  // Gaussian kernel with radius 3, sigma = 1.5 / 3.  Less commonly used.
+  GaussianKernel,
+
+  // Rectangle function.  Equivalent to "nearest" sampling when upscaling.
+  // Has value 1 in interval (-0.5, 0.5), value 0.5 on edge, and 0 elsewhere.
+  BoxKernel,
+
+  // Hat/tent function with radius 1.  Equivalent to "bilinear" reconstruction
+  // when upsampling.
+  // Has value zero at -1.0 and 1.0.
+  TriangleKernel,
+
+  // Cubic interpolant of Keys.  Equivalent to Catmull-Rom kernel.  Reasonably
+  // good quality and faster than Lanczos3Kernel.
+  KeysCubicKernel,
+
+  // Cubic non-interpolating scheme.  For synthetic images (especially those
+  // lacking proper prefiltering), less ringing than Keys cubic kernel but less
+  // sharp.
+  MitchellCubicKernel,
+
+  // Always insert new kernel types before this.
+  SamplingKernelTypeEnd
+};
+
+// Converts a string into the corresponding kernel type.
+// Returns SamplingKernelTypeEnd if the string couldn't be converted.
+SamplingKernelType SamplingKernelTypeFromString(const StringPiece str);
+
+// A function object for a Lanczos kernel.
+struct LanczosKernelFunc {
+  // Pass 1 for Lanczos1 kernel, 3 for Lanczos3 etc.
+  explicit LanczosKernelFunc(float _radius) : radius(_radius) {}
+  float operator()(float x) const {
+    constexpr float kPI = 3.14159265359;
+    x = std::abs(x);
+    if (x > radius) return 0.0;
+    // Need to special case the limit case of sin(x) / x when x is zero.
+    if (x <= 1e-3) {
+      return 1.0;
+    }
+    return radius * std::sin(kPI * x) * std::sin(kPI * x / radius) /
+           (kPI * kPI * x * x);
+  }
+  float Radius() const { return radius; }
+  const float radius;
+};
+
+struct GaussianKernelFunc {
+  static constexpr float kRadiusMultiplier = 3.0f;
+  // https://en.wikipedia.org/wiki/Gaussian_function
+  // We use sigma = 0.5, as suggested on p. 4 of Ken Turkowski's "Filters
+  // for Common Resampling Tasks" for kernels with a support of 3 pixels:
+  // www.realitypixels.com/turk/computergraphics/ResamplingFilters.pdf
+  // This implies a radius of 1.5,
+  explicit GaussianKernelFunc(float _radius = 1.5f)
+      : radius(_radius), sigma(_radius / kRadiusMultiplier) {}
+  float operator()(float x) const {
+    x = std::abs(x);
+    if (x >= radius) return 0.0;
+    return std::exp(-x * x / (2.0 * sigma * sigma));
+  }
+  float Radius() const { return radius; }
+  const float radius;
+  const float sigma;  // Gaussian standard deviation
+};
+
+struct BoxKernelFunc {
+  float operator()(float x) const {
+    x = std::abs(x);
+    return x < 0.5f ? 1. : x == 0.5f ? 0.5f : 0.0f;
+  }
+  float Radius() const { return 1.f; }
+};
+
+struct TriangleKernelFunc {
+  // https://en.wikipedia.org/wiki/Triangle_function
+  float operator()(float x) const {
+    x = std::abs(x);
+    return x < 1.0f ? 1.0f - x : 0.0f;
+  }
+  float Radius() const { return 1.f; }
+};
+
+struct KeysCubicKernelFunc {
+  // http://ieeexplore.ieee.org/document/1163711/
+  // R. G. Keys. Cubic convolution interpolation for digital image
+  // processing. IEEE Transactions on Acoustics, Speech, and Signal
+  // Processing, 29(6):1153–1160, 1981.
+  float operator()(float x) const {
+    x = std::abs(x);
+    if (x >= 2.0f) {
+      return 0.0f;
+    } else if (x >= 1.0f) {
+      return ((-0.5f * x + 2.5f) * x - 4.0f) * x + 2.0f;
+    } else {
+      return ((1.5f * x - 2.5f) * x) * x + 1.0f;
+    }
+  }
+  float Radius() const { return 2.f; }
+};
+
+struct MitchellCubicKernelFunc {
+  // https://doi.org/10.1145/378456.378514
+  // D. P. Mitchell and A. N. Netravali. Reconstruction filters in computer
+  // graphics.  Computer Graphics (Proceedings of ACM SIGGRAPH 1988),
+  // 22(4):221–228, 1988.
+  float operator()(float x) const {
+    x = std::abs(x);
+    if (x >= 2.0f) {
+      return 0.0f;
+    } else if (x >= 1.0f) {
+      return (((-7.0f / 18.0f) * x + 2.0f) * x - 10.0f / 3.0f) * x +
+             16.0f / 9.0f;
+    } else {
+      return (((7.0f / 6.0f) * x - 2.0f) * x) * x + 8.0f / 9.0f;
+    }
+  }
+  float Radius() const { return 2.f; }
+};
+
+inline LanczosKernelFunc CreateLanczos1Kernel() {
+  return LanczosKernelFunc(1.0);
+}
+
+inline LanczosKernelFunc CreateLanczos3Kernel() {
+  return LanczosKernelFunc(3.0);
+}
+
+inline LanczosKernelFunc CreateLanczos5Kernel() {
+  return LanczosKernelFunc(5.0);
+}
+
+inline GaussianKernelFunc CreateGaussianKernel() {
+  return GaussianKernelFunc(1.5);
+}
+
+inline BoxKernelFunc CreateBoxKernel() { return BoxKernelFunc(); }
+
+inline TriangleKernelFunc CreateTriangleKernel() {
+  return TriangleKernelFunc();
+}
+
+inline KeysCubicKernelFunc CreateKeysCubicKernel() {
+  return KeysCubicKernelFunc();
+}
+
+inline MitchellCubicKernelFunc CreateMitchellCubicKernel() {
+  return MitchellCubicKernelFunc();
+}
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_SAMPLING_KERNELS_H_
diff --git a/tensorflow/core/kernels/sampling_kernels_test.cc b/tensorflow/core/kernels/sampling_kernels_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..37c2edc14a3dad86e8eaeb91894a1aade7e5fc76
--- /dev/null
+++ b/tensorflow/core/kernels/sampling_kernels_test.cc
@@ -0,0 +1,76 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/sampling_kernels.h"
+
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace functor {
+namespace {
+
+class KernelsTest : public ::testing::Test {
+ protected:
+  template <typename KernelType>
+  void TestKernelValues(const KernelType& kernel, const std::vector<float>& x,
+                        const std::vector<float>& expected) const {
+    ASSERT_EQ(x.size(), expected.size());
+    for (int i = 0; i < x.size(); ++i) {
+      constexpr float kTolerance = 1e-3;
+      EXPECT_NEAR(kernel(x[i]), expected[i], kTolerance);
+      EXPECT_NEAR(kernel(-x[i]), expected[i], kTolerance);
+    }
+  }
+};
+
+TEST_F(KernelsTest, TestKernelValues) {
+  // Tests kernel values against a set of known golden values
+  TestKernelValues(CreateLanczos1Kernel(), {0.0f, 0.5f, 1.0f, 1.5},
+                   {1.0f, 0.4052f, 0.0f, 0.0f});
+  TestKernelValues(CreateLanczos3Kernel(), {0.0f, 0.5f, 1.0f, 1.5f, 2.5f, 3.5},
+                   {1.0f, 0.6079f, 0.0f, -0.1351f, 0.0243f, 0.0f});
+  TestKernelValues(
+      CreateLanczos5Kernel(), {0.0f, 0.5f, 1.0f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5},
+      {1.0f, 0.6262f, 0.0f, -0.1822f, 0.0810569f, -0.0334f, 0.0077f, 0.0f});
+  TestKernelValues(CreateGaussianKernel(), {0.0f, 0.5f, 1.0f, 1.5},
+                   {1.0f, 0.6065f, 0.1353f, 0.0f});
+
+  TestKernelValues(CreateBoxKernel(), {0.0f, 0.25f, 0.5f, 1.0f},
+                   {1.0f, 1.0f, 0.5f, 0.0f});
+  TestKernelValues(CreateTriangleKernel(), {0.0f, 0.5f, 1.0f},
+                   {1.0f, 0.5f, 0.0f});
+
+  TestKernelValues(CreateKeysCubicKernel(), {0.0f, 0.5f, 1.0f, 1.5f, 2.5},
+                   {1.0f, 0.5625f, 0.0f, -0.0625f, 0.0f});
+  TestKernelValues(CreateMitchellCubicKernel(), {0.0f, 0.5f, 1.0f, 1.5f, 2.5},
+                   {0.8889f, 0.5347f, 0.0556f, -0.0347f, 0.0f});
+}
+
+TEST(SamplingKernelTypeFromStringTest, Works) {
+  EXPECT_EQ(SamplingKernelTypeFromString("lanczos1"), Lanczos1Kernel);
+  EXPECT_EQ(SamplingKernelTypeFromString("lanczos3"), Lanczos3Kernel);
+  EXPECT_EQ(SamplingKernelTypeFromString("lanczos5"), Lanczos5Kernel);
+  EXPECT_EQ(SamplingKernelTypeFromString("gaussian"), GaussianKernel);
+  EXPECT_EQ(SamplingKernelTypeFromString("box"), BoxKernel);
+  EXPECT_EQ(SamplingKernelTypeFromString("triangle"), TriangleKernel);
+  EXPECT_EQ(SamplingKernelTypeFromString("mitchellcubic"), MitchellCubicKernel);
+  EXPECT_EQ(SamplingKernelTypeFromString("keyscubic"), KeysCubicKernel);
+  EXPECT_EQ(SamplingKernelTypeFromString("not a kernel"),
+            SamplingKernelTypeEnd);
+}
+
+}  // namespace
+}  // namespace functor
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/save_restore_tensor.cc b/tensorflow/core/kernels/save_restore_tensor.cc
index 82546d581a9ea55d7fe0a478c4de0c9afe2ff8ed..8580891fc066828abb1c2cef6d66f71c48090f05 100644
--- a/tensorflow/core/kernels/save_restore_tensor.cc
+++ b/tensorflow/core/kernels/save_restore_tensor.cc
@@ -19,10 +19,10 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/strings/str_util.h"
diff --git a/tensorflow/core/kernels/save_restore_v2_ops.cc b/tensorflow/core/kernels/save_restore_v2_ops.cc
index 180eb3ca34b4c1fe96bf7088319455185bd06a2c..ed1195c05353389e9c4c465d402d46220a01fad4 100644
--- a/tensorflow/core/kernels/save_restore_v2_ops.cc
+++ b/tensorflow/core/kernels/save_restore_v2_ops.cc
@@ -18,11 +18,11 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/types.pb.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/save_restore_tensor.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/io/path.h"
diff --git a/tensorflow/core/kernels/save_v2_op_test.cc b/tensorflow/core/kernels/save_v2_op_test.cc
index 82e566d35fefa98f96b00a285af618ff98f3da69..589d9639fb4d17e6f3423b92c7d692a7abc25364 100644
--- a/tensorflow/core/kernels/save_v2_op_test.cc
+++ b/tensorflow/core/kernels/save_v2_op_test.cc
@@ -67,9 +67,8 @@ TEST_F(SaveV2OpTest, Simple) {
                    [&tensornames](int x) -> string { return tensornames[x]; });
 
   // Add the slice specs
-  AddInput<string>(TensorShape({13}), [&tensornames](int x) -> string {
-    return "" /* saves in full */;
-  });
+  AddInput<string>(TensorShape({13}),
+                   [](int x) -> string { return "" /* saves in full */; });
 
   // Add a 1-d bool tensor
   AddInput<bool>(TensorShape({2}), [](int x) -> bool { return x != 0; });
diff --git a/tensorflow/core/kernels/scale_and_translate_op.cc b/tensorflow/core/kernels/scale_and_translate_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..34fef536df4d9bb9f80bb749b4071b8f5956c997
--- /dev/null
+++ b/tensorflow/core/kernels/scale_and_translate_op.cc
@@ -0,0 +1,610 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/image_ops.cc
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/scale_and_translate_op.h"
+
+#include <memory>
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/sampling_kernels.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+using strings::Printf;
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+namespace functor {
+namespace {
+template <typename T>
+inline const T& Clamp(const T& low, const T& high, const T& value) {
+  if (high < value) return high;
+  if (value < low) return low;
+  return value;
+}
+
+template <typename Kernel>
+Status ComputeSpansCore(OpKernelContext* context, const Kernel& kernel,
+                        const int64 output_size, const int64 input_size,
+                        const float scale, const float translate,
+                        Spans* spans) {
+  // When sampling, we need the inverse scale and translation, to map from an
+  // output to an input pixel.
+  const float inv_scale = 1.0 / scale;
+  const float inv_translate = -inv_scale * translate;
+  // When downsampling the kernel should be scaled since we want to low pass
+  // filter and interpolate, but when upsampling it should not be since we only
+  // want to interpolate.
+  const float kernel_scale = std::max(inv_scale, 1.0f);
+  spans->span_size = std::min(
+      2 * static_cast<int>(std::ceil(kernel.Radius() * kernel_scale)) + 1,
+      static_cast<int>(input_size));
+  AllocatorAttributes alloc_attr;
+  alloc_attr.set_on_host(true);
+  TF_RETURN_IF_ERROR(context->allocate_temp(
+      tensorflow::DT_INT32, tensorflow::TensorShape({output_size}),
+      &spans->starts, alloc_attr));
+  auto starts_vec = spans->starts.vec<int32>();
+  TF_RETURN_IF_ERROR(context->allocate_temp(
+      tensorflow::DT_FLOAT,
+      tensorflow::TensorShape({spans->span_size * output_size}),
+      &spans->weights, alloc_attr));
+  auto weights_vec = spans->weights.vec<float>();
+  weights_vec.setZero();
+
+  const float one_over_kernel_scale = 1.0f / kernel_scale;
+  int max_span_size = 0;
+  std::vector<float> temp_weights;
+  for (int x = 0; x < output_size; ++x) {
+    const float col_f = x + 0.5f;
+    const float sample_f = col_f * inv_scale + inv_translate;
+
+    // Don't sample when the sampling *kernel* is completely outside the
+    // source image.
+    if (sample_f < 0 - kernel.Radius() * kernel_scale ||
+        sample_f > input_size + kernel.Radius() * kernel_scale) {
+      // Add an empty span.
+      starts_vec(x) = 0;
+      continue;
+    }
+    int64 span_start =
+        std::ceil(sample_f - kernel.Radius() * kernel_scale - 0.5f);
+    int64 span_end =
+        std::floor(sample_f + kernel.Radius() * kernel_scale - 0.5f);
+    span_start = Clamp(static_cast<int64>(0), input_size - 1, span_start);
+    span_end = Clamp(static_cast<int64>(0), input_size - 1, span_end) + 1;
+    const int this_span_size = span_end - span_start;
+    if (this_span_size > spans->span_size) {
+      return errors::Internal(Printf("Span is too large: %d vs %d.",
+                                     this_span_size, spans->span_size));
+    }
+    float total_weight_sum = 0.0f;
+    temp_weights.clear();
+    for (int source = span_start; source < span_end; ++source) {
+      float kernel_pos = static_cast<float>(source) + 0.5f - sample_f;
+      float weight = kernel(std::abs(kernel_pos * one_over_kernel_scale));
+      total_weight_sum += weight;
+      temp_weights.push_back(weight);
+    }
+    max_span_size = std::max(max_span_size, this_span_size);
+    if (std::abs(total_weight_sum) >=
+        1000.0f * std::numeric_limits<float>::min()) {
+      float one_over_total_weight_sum = 1.0f / total_weight_sum;
+      int out_index = spans->span_size * x;
+      for (float weight : temp_weights) {
+        weights_vec(out_index) = weight * one_over_total_weight_sum;
+        ++out_index;
+      }
+    }
+    starts_vec(x) = span_start;
+  }
+  return Status::OK();
+}
+
+Status ComputeGradSpansCore(OpKernelContext* context, const Spans& spans,
+                            const int64 forward_output_size,
+                            const int64 forward_input_size, Spans* grad_spans) {
+  struct GradComponent {
+    int index;
+    float weight;
+  };
+  std::vector<std::vector<GradComponent>> grad_components(forward_input_size);
+  auto weights_vec = spans.weights.vec<float>();
+  auto starts_vec = spans.starts.vec<int32>();
+  for (int output_index = 0; output_index < forward_output_size;
+       ++output_index) {
+    int input_index = starts_vec(output_index);
+    for (int j = 0; j < spans.span_size; ++j, ++input_index) {
+      const float weight = weights_vec(output_index * spans.span_size + j);
+      if (weight != 0.0f && input_index < forward_input_size) {
+        grad_components[input_index].push_back(
+            GradComponent{output_index, weight});
+      }
+    }
+  }
+  int max_size = 0;
+  for (std::vector<GradComponent>& gc : grad_components) {
+    if (!gc.empty()) {
+      std::sort(gc.begin(), gc.end(),
+                [](const GradComponent& x1, const GradComponent& x2) {
+                  return x1.index < x2.index;
+                });
+      max_size = std::max(gc.back().index - gc.front().index + 1, max_size);
+    }
+  }
+  grad_spans->span_size = max_size;
+  AllocatorAttributes alloc_attr;
+  alloc_attr.set_on_host(true);
+  TF_RETURN_IF_ERROR(context->allocate_temp(
+      tensorflow::DT_INT32, tensorflow::TensorShape({forward_input_size}),
+      &grad_spans->starts, alloc_attr));
+  auto grad_starts_vec = grad_spans->starts.vec<int32>();
+  TF_RETURN_IF_ERROR(context->allocate_temp(
+      tensorflow::DT_FLOAT,
+      tensorflow::TensorShape({grad_spans->span_size * forward_input_size}),
+      &grad_spans->weights, alloc_attr));
+  auto grad_weights_vec = grad_spans->weights.vec<float>();
+  grad_weights_vec.setZero();
+  for (int input_index = 0; input_index < forward_input_size; ++input_index) {
+    const int start_span = grad_components[input_index].front().index;
+    grad_starts_vec(input_index) = start_span;
+    for (const GradComponent& gc : grad_components[input_index]) {
+      grad_weights_vec(input_index * grad_spans->span_size + gc.index -
+                       start_span) += gc.weight;
+    }
+  }
+  return Status::OK();
+}
+
+// Computes the spans for the passed kernel, for a input dimension of length
+// input_size transformed by scale and translate to an output dimension of
+// length output_size. Note that there's no requirement that;
+// output_size = input_size * scale.
+Status ComputeSpans(OpKernelContext* context,
+                    const functor::SamplingKernelType kernel_type,
+                    const int64 output_size, const int64 input_size,
+                    const float scale, const float translate, Spans* spans) {
+  switch (kernel_type) {
+    case functor::Lanczos1Kernel: {
+      return ComputeSpansCore(context, CreateLanczos1Kernel(), output_size,
+                              input_size, scale, translate, spans);
+    }
+    case functor::Lanczos3Kernel: {
+      return ComputeSpansCore(context, CreateLanczos3Kernel(), output_size,
+                              input_size, scale, translate, spans);
+    }
+    case functor::Lanczos5Kernel: {
+      return ComputeSpansCore(context, CreateLanczos5Kernel(), output_size,
+                              input_size, scale, translate, spans);
+    }
+    case functor::GaussianKernel: {
+      return ComputeSpansCore(context, CreateGaussianKernel(), output_size,
+                              input_size, scale, translate, spans);
+    }
+    case functor::BoxKernel: {
+      return ComputeSpansCore(context, CreateBoxKernel(), output_size,
+                              input_size, scale, translate, spans);
+    }
+    case functor::TriangleKernel: {
+      return ComputeSpansCore(context, CreateTriangleKernel(), output_size,
+                              input_size, scale, translate, spans);
+    }
+    case functor::KeysCubicKernel: {
+      return ComputeSpansCore(context, CreateKeysCubicKernel(), output_size,
+                              input_size, scale, translate, spans);
+    }
+    case functor::MitchellCubicKernel: {
+      return ComputeSpansCore(context, CreateMitchellCubicKernel(), output_size,
+                              input_size, scale, translate, spans);
+    }
+    default:
+      return errors::InvalidArgument(Printf("Unrecognized kernel type: %d",
+                                            static_cast<int>(kernel_type)));
+  }
+  return Status::OK();
+}
+
+// Computes the grad spans for the passed kernel.
+// forward_input_size and forward_output_size are the input and output size from
+// the forward operation.
+Status ComputeGradSpans(OpKernelContext* context,
+                        const functor::SamplingKernelType kernel_type,
+                        const int64 forward_output_size,
+                        const int64 forward_input_size, const float scale,
+                        const float translate, Spans* grad_spans) {
+  Spans spans;
+  TF_RETURN_IF_ERROR(ComputeSpans(context, kernel_type, forward_output_size,
+                                  forward_input_size, scale, translate,
+                                  &spans));
+  return ComputeGradSpansCore(context, spans, forward_output_size,
+                              forward_input_size, grad_spans);
+}
+
+void GetValues(OpKernelContext* context, int input_index, float* v_1,
+               float* v_2) {
+  // Tensor mutable_input(int index, False);
+  const Tensor& t = context->input(input_index);
+  OP_REQUIRES(context, t.dims() == 1,
+              errors::InvalidArgument("t must be 1-dimensional",
+                                      t.shape().DebugString()));
+  OP_REQUIRES(context, t.NumElements() == 2,
+              errors::InvalidArgument("t must have two elements",
+                                      t.shape().DebugString()));
+
+  auto data_vec = t.flat<float>().data();
+  *v_1 = data_vec[0];
+  *v_2 = data_vec[1];
+}
+
+template <typename Device, typename T>
+class ScaleAndTranslateOp : public OpKernel {
+ public:
+  explicit ScaleAndTranslateOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    string kernel_type_str;
+    OP_REQUIRES_OK(context, context->GetAttr("kernel_type", &kernel_type_str));
+    kernel_type_ = functor::SamplingKernelTypeFromString(kernel_type_str);
+    OP_REQUIRES(context, kernel_type_ != functor::SamplingKernelTypeEnd,
+                errors::InvalidArgument("Unrecognized kernel type: " +
+                                        kernel_type_str));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input = context->input(0);
+    OP_REQUIRES(context, input.dims() == 4,
+                errors::InvalidArgument("input must be 4-dimensional",
+                                        input.shape().DebugString()));
+    const Tensor& output_shape_t = context->input(1);
+    OP_REQUIRES(context, output_shape_t.dims() == 1,
+                errors::InvalidArgument("output_shape_t must be 1-dimensional",
+                                        output_shape_t.shape().DebugString()));
+    OP_REQUIRES(context, output_shape_t.NumElements() == 2,
+                errors::InvalidArgument("output_shape_t must have two elements",
+                                        output_shape_t.shape().DebugString()));
+    auto output_shape_vec = output_shape_t.vec<int32>();
+    const int64 output_height = internal::SubtleMustCopy(output_shape_vec(0));
+    const int64 output_width = internal::SubtleMustCopy(output_shape_vec(1));
+
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(input.dim_size(1), std::numeric_limits<int32>::max()) &&
+            FastBoundsCheck(input.dim_size(2),
+                            std::numeric_limits<int32>::max()),
+        errors::InvalidArgument("input sizes must be between 0 and max int32"));
+
+    const int64 batch_size = input.dim_size(0);
+    const int64 input_height = input.dim_size(1);
+    const int64 input_width = input.dim_size(2);
+    const int64 channels = input.dim_size(3);
+    OP_REQUIRES(context, output_height > 0 && output_width > 0,
+                errors::InvalidArgument("output dimensions must be positive"));
+    OP_REQUIRES(
+        context, channels > 0,
+        errors::InvalidArgument("image must have at least one channel"));
+    OP_REQUIRES(
+        context, input.dim_size(1) > 0 && input.dim_size(2) > 0,
+        errors::InvalidArgument("input image must be of non-zero size"));
+
+    float row_scale, col_scale;
+    GetValues(context, 2, &row_scale, &col_scale);
+    OP_REQUIRES(context, row_scale > 0 && col_scale > 0,
+                errors::InvalidArgument("Scale must be greater than zero."));
+    float row_translation, col_translation;
+    GetValues(context, 3, &row_translation, &col_translation);
+
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(
+                                0,
+                                TensorShape({input.dim_size(0), output_height,
+                                             output_width, input.dim_size(3)}),
+                                &output));
+    if (!context->status().ok()) return;
+
+    // Return if the output is empty.
+    if (output->NumElements() == 0) return;
+
+    typename TTypes<T, 4>::ConstTensor image_data(input.tensor<T, 4>());
+    TTypes<float, 4>::Tensor output_data = output->tensor<float, 4>();
+
+    functor::Spans col_spans;
+    OP_REQUIRES_OK(
+        context, ComputeSpans(context, kernel_type_, output_width, input_width,
+                              col_scale, col_translation, &col_spans));
+    functor::Spans row_spans;
+    OP_REQUIRES_OK(context, ComputeSpans(context, kernel_type_, output_height,
+                                         input_height, row_scale,
+                                         row_translation, &row_spans));
+    Tensor intermediate_t;
+    OP_REQUIRES_OK(
+        context, context->allocate_temp(DT_FLOAT,
+                                        TensorShape({batch_size, output_height,
+                                                     input_width, channels}),
+                                        &intermediate_t));
+    TTypes<float, 4>::Tensor intermediate_data =
+        intermediate_t.tensor<float, 4>();
+
+    const functor::Spans& const_row_spans = row_spans;
+    typename TTypes<int32, 1>::ConstTensor row_starts(
+        const_row_spans.starts.tensor<int32, 1>());
+    typename TTypes<float, 1>::ConstTensor row_weights(
+        const_row_spans.weights.tensor<float, 1>());
+    const functor::Spans& const_col_spans = col_spans;
+    typename TTypes<int32, 1>::ConstTensor col_starts(
+        const_col_spans.starts.tensor<int32, 1>());
+    typename TTypes<float, 1>::ConstTensor col_weights(
+        const_col_spans.weights.tensor<float, 1>());
+
+    functor::GatherSpans<Device, T>()(
+        context->eigen_device<Device>(), row_spans.span_size, row_starts,
+        row_weights, col_spans.span_size, col_starts, col_weights, image_data,
+        intermediate_data, output_data);
+  }
+  functor::SamplingKernelType kernel_type_;
+};
+
+template <typename Device, typename T>
+class ScaleAndTranslateGradOp : public OpKernel {
+ public:
+  explicit ScaleAndTranslateGradOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    string kernel_type_str;
+    OP_REQUIRES_OK(context, context->GetAttr("kernel_type", &kernel_type_str));
+    kernel_type_ = functor::SamplingKernelTypeFromString(kernel_type_str);
+    OP_REQUIRES(context, kernel_type_ != functor::SamplingKernelTypeEnd,
+                errors::InvalidArgument("Unrecognized kernel type: " +
+                                        kernel_type_str));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& input = context->input(0);
+    const Tensor& original_image = context->input(1);
+
+    OP_REQUIRES(context, input.dims() == 4,
+                errors::InvalidArgument("input_grad must be 4-dimensional",
+                                        input.shape().DebugString()));
+    // Resizers always produce float images, so input gradient must
+    // always be a float.
+    OP_REQUIRES(context, input.dtype() == DT_FLOAT,
+                errors::InvalidArgument("input_grad must be of type float",
+                                        DataTypeString(input.dtype())));
+
+    OP_REQUIRES(context, original_image.dims() == 4,
+                errors::InvalidArgument("original_image must be 4-dimensional",
+                                        original_image.shape().DebugString()));
+
+    // Allocate output and initialize to zeros.
+    const int64 batch_size = input.dim_size(0);
+    const int64 channels = input.dim_size(3);
+    const int64 forward_input_height = original_image.dim_size(1);
+    const int64 forward_input_width = original_image.dim_size(2);
+
+    OP_REQUIRES(context,
+                FastBoundsCheck(forward_input_height,
+                                std::numeric_limits<int32>::max()) &&
+                    FastBoundsCheck(forward_input_width,
+                                    std::numeric_limits<int32>::max()),
+                errors::InvalidArgument(
+                    "original sizes must be between 0 and max int32"));
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(
+                                0,
+                                TensorShape({batch_size, forward_input_height,
+                                             forward_input_width, channels}),
+                                &output));
+
+    float row_scale, col_scale;
+    GetValues(context, 2, &row_scale, &col_scale);
+    OP_REQUIRES(context, row_scale > 0 && col_scale > 0,
+                errors::InvalidArgument("Scale must be greater than zero."));
+    float row_translation, col_translation;
+    GetValues(context, 3, &row_translation, &col_translation);
+
+    if (!context->status().ok()) return;
+
+    TTypes<float, 4>::ConstTensor input_grad = input.tensor<float, 4>();
+    typename TTypes<T, 4>::Tensor output_grad(output->tensor<T, 4>());
+
+    const int64 forward_output_height = input_grad.dimension(1);
+    const int64 forward_output_width = input_grad.dimension(2);
+
+    functor::Spans col_spans;
+    OP_REQUIRES_OK(context,
+                   ComputeGradSpans(context, kernel_type_, forward_output_width,
+                                    forward_input_width, col_scale,
+                                    col_translation, &col_spans));
+    functor::Spans row_spans;
+    OP_REQUIRES_OK(
+        context, ComputeGradSpans(context, kernel_type_, forward_output_height,
+                                  forward_input_height, row_scale,
+                                  row_translation, &row_spans));
+    Tensor intermediate_t;
+    OP_REQUIRES_OK(context, context->allocate_temp(
+                                DT_FLOAT,
+                                TensorShape({batch_size, forward_input_height,
+                                             forward_output_width, channels}),
+                                &intermediate_t));
+    TTypes<float, 4>::Tensor intermediate_data =
+        intermediate_t.tensor<float, 4>();
+
+    const functor::Spans& const_row_spans = row_spans;
+    typename TTypes<int32, 1>::ConstTensor row_starts =
+        const_row_spans.starts.tensor<int32, 1>();
+    typename TTypes<float, 1>::ConstTensor row_weights(
+        const_row_spans.weights.tensor<float, 1>());
+    const functor::Spans& const_col_spans = col_spans;
+    typename TTypes<int32, 1>::ConstTensor col_starts(
+        const_col_spans.starts.tensor<int32, 1>());
+    typename TTypes<float, 1>::ConstTensor col_weights(
+        const_col_spans.weights.tensor<float, 1>());
+
+    functor::GatherSpans<Device, T>()(
+        context->eigen_device<Device>(), row_spans.span_size, row_starts,
+        row_weights, col_spans.span_size, col_starts, col_weights, input_grad,
+        intermediate_data, output_grad);
+  }
+
+  functor::SamplingKernelType kernel_type_;
+};
+
+template <typename T>
+void GatherColumns(int span_size, const int32* starts, const float* weights,
+                   const T* image, const int64 input_height,
+                   const int64 input_width, const int64 output_height,
+                   const int64 output_width, const int channels,
+                   float* output) {
+  const int64 in_row_size = input_width * channels;
+  const int64 out_row_size = output_width * channels;
+
+  for (int y = 0; y < output_height; ++y) {
+    const T* input_row_start = image + in_row_size * y;
+    float* out_pix = output + out_row_size * y;
+    for (int x = 0; x < output_width; ++x, out_pix += channels) {
+      const T* in_pix = input_row_start + starts[x] * channels;
+      const float* weights_start = weights + x * span_size;
+      const int real_span_size =
+          std::min(starts[x] + span_size, static_cast<int>(input_width)) -
+          starts[x];
+      const float* weights_end = weights_start + real_span_size;
+      for (int c = 0; c < channels; ++c) {
+        out_pix[c] = 0.0f;
+      }
+      for (const float* weight_ptr = weights_start; weight_ptr != weights_end;
+           ++weight_ptr) {
+        float w = *weight_ptr;
+        for (int c = 0; c < channels; ++c) {
+          out_pix[c] += w * static_cast<float>(in_pix[c]);
+        }
+        in_pix += channels;
+      }
+    }
+  }
+}
+
+template <typename T>
+inline void AddScaledVector(const T* in_vec, int vec_len, float weight,
+                            float* out_vec) {
+  float* out_vec_end = out_vec + vec_len;
+  for (; out_vec != out_vec_end; ++out_vec, ++in_vec) {
+    *out_vec += weight * static_cast<float>(*in_vec);
+  }
+}
+
+template <typename T>
+void GatherRows(int span_size, const int32* starts, const float* weights,
+                const T* image, const int64 input_height,
+                const int64 input_width, const int64 output_height,
+                const int64 output_width, const int channels, float* output) {
+  const int64 in_row_size = input_width * channels;
+  const int64 out_row_size = output_width * channels;
+
+  for (int y = 0; y < output_height; ++y) {
+    float* out_row_data = output + out_row_size * y;
+    std::fill(out_row_data, out_row_data + out_row_size, 0.0f);
+    int in_row = starts[y];
+    const T* in_row_data = image + in_row_size * in_row;
+    const float* weights_start = weights + y * span_size;
+    const int real_span_size =
+        std::min(starts[y] + span_size, static_cast<int>(input_height)) -
+        starts[y];
+    const float* const weights_end = weights_start + real_span_size;
+    for (const float* weight_it = weights_start; weight_it != weights_end;
+         ++weight_it) {
+      AddScaledVector(in_row_data, in_row_size, *weight_it, out_row_data);
+      in_row_data += in_row_size;
+    }
+  }
+}
+
+}  // namespace
+
+// Partial specialization of GatherSpans functor for a CPUDevice.
+template <typename T>
+struct GatherSpans<CPUDevice, T> {
+  void operator()(const CPUDevice& d, int row_span_size,
+                  typename TTypes<int32, 1>::ConstTensor row_starts,
+                  typename TTypes<float, 1>::ConstTensor row_weights,
+                  int col_span_size,
+                  typename TTypes<int32, 1>::ConstTensor col_starts,
+                  typename TTypes<float, 1>::ConstTensor col_weights,
+                  typename TTypes<T, 4>::ConstTensor images,
+                  typename TTypes<float, 4>::Tensor intermediate_buffer,
+                  typename TTypes<float, 4>::Tensor resized_images) {
+    const int batch_size = images.dimension(0);
+    const int64 input_height = images.dimension(1);
+    const int64 input_width = images.dimension(2);
+    const int channels = images.dimension(3);
+
+    const int64 output_height = resized_images.dimension(1);
+    const int64 output_width = resized_images.dimension(2);
+
+    const int64 input_pix_per_batch = input_width * input_height * channels;
+    const int64 intermediate_pix_per_batch =
+        input_width * output_height * channels;
+    const int64 output_pix_per_batch = output_width * output_height * channels;
+    float* intermediate_ptr = intermediate_buffer.data();
+
+    const T* image_ptr = images.data();
+    float* out_ptr = resized_images.data();
+    for (int b = 0; b < batch_size; ++b, image_ptr += input_pix_per_batch,
+             intermediate_ptr += intermediate_pix_per_batch,
+             out_ptr += output_pix_per_batch) {
+      GatherRows(row_span_size, row_starts.data(), row_weights.data(),
+                 image_ptr, input_height, input_width, output_height,
+                 input_width, channels, intermediate_ptr);
+      GatherColumns(col_span_size, col_starts.data(), col_weights.data(),
+                    intermediate_ptr, output_height, input_width, output_height,
+                    output_width, channels, out_ptr);
+    }
+  }
+};
+
+#define REGISTER_KERNEL(T)                                \
+  REGISTER_KERNEL_BUILDER(Name("ScaleAndTranslate")       \
+                              .Device(DEVICE_CPU)         \
+                              .TypeConstraint<T>("T")     \
+                              .HostMemory("size")         \
+                              .HostMemory("scale")        \
+                              .HostMemory("translation"), \
+                          ScaleAndTranslateOp<CPUDevice, T>);
+
+TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNEL);
+
+#undef REGISTER_KERNEL
+
+#define REGISTER_GRAD_KERNEL(T)                           \
+  REGISTER_KERNEL_BUILDER(Name("ScaleAndTranslateGrad")   \
+                              .Device(DEVICE_CPU)         \
+                              .TypeConstraint<T>("T")     \
+                              .HostMemory("scale")        \
+                              .HostMemory("translation"), \
+                          ScaleAndTranslateGradOp<CPUDevice, T>);
+
+TF_CALL_float(REGISTER_GRAD_KERNEL);
+
+#undef REGISTER_GRAD_KERNEL
+
+}  // namespace functor
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/scale_and_translate_op.h b/tensorflow/core/kernels/scale_and_translate_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..74bc87ecc7a450a297bf4e8d9d93baf67c3106a7
--- /dev/null
+++ b/tensorflow/core/kernels/scale_and_translate_op.h
@@ -0,0 +1,75 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_SCALE_AND_TRANSLATE_OP_H_
+#define TENSORFLOW_CORE_KERNELS_SCALE_AND_TRANSLATE_OP_H_
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/numeric_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/kernels/sampling_kernels.h"
+
+namespace tensorflow {
+namespace functor {
+
+// The scale and translate op works by scaling and translating the row and
+// column dimensions separately.
+// When scaling and translating the rows the set of input pixels and kernel
+// weights used to compute a given output pixel within a row is constant across
+// rows and can thus be precomputed and reused for every row. Similarly for the
+// columns. This precomputed data structure is called a 'span'.
+
+// To compute the gradient we use the spans computed on the forward pass and
+// essentially reverse them: we record for each input pixel which output
+// pixels it contributes to. This means that the forward and backward passes
+// use the same core algorithm, only the spans are computed differently.
+
+// A pre-computed span of pixels along a single dimension.
+// The output pixel will be the weighted sum of pixels starting from start.
+struct Spans {
+  // The maximum span size of any output pixel.
+  int span_size;
+  // int32 tensor of size [output_dim].
+  Tensor starts;
+  // float tensor of size [output_dim, span_size].
+  // The output pixel at x is computed as:
+  //   dot_product(input[starts[x]:starts[x]+span_size], weights[x]).
+  Tensor weights;
+};
+
+// Gather spans in both dimensions.
+// row_span_size, row_starts and row_weights correspond to the variables in
+// the row Spans data structure, similarly for col_span_size etc.
+// intermediate_buffer is a Tensor used to store the result of the
+// resize in the column dimension and is of size:
+//    [batch_size, input_height, output_width, channels]
+template <typename Device, typename T>
+struct GatherSpans {
+  void operator()(const Device& d, int row_span_size,
+                  typename TTypes<int32, 1>::ConstTensor row_starts,
+                  typename TTypes<float, 1>::ConstTensor row_weights,
+                  int col_span_size,
+                  typename TTypes<int32, 1>::ConstTensor col_starts,
+                  typename TTypes<float, 1>::ConstTensor col_weights,
+                  typename TTypes<T, 4>::ConstTensor input_images,
+                  typename TTypes<float, 4>::Tensor intermediate_buffer,
+                  typename TTypes<float, 4>::Tensor output_images);
+};
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_SCALE_AND_TRANSLATE_OP_H_
diff --git a/tensorflow/core/kernels/scale_and_translate_op_test.cc b/tensorflow/core/kernels/scale_and_translate_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..23176f9f2da9c597d3cf13db0ee2e9f23eb72b37
--- /dev/null
+++ b/tensorflow/core/kernels/scale_and_translate_op_test.cc
@@ -0,0 +1,377 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/sampling_kernels.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/png/png_io.h"
+#include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+using Eigen::Vector2f;
+
+class DynamicKernel {
+ public:
+  virtual ~DynamicKernel() {}
+  virtual float Value(const float x) const = 0;
+  virtual float Radius() const = 0;
+};
+
+// Wraps a sampling kernel in a common interface.
+template <typename KernelType>
+class TypedDynamicKernel : public DynamicKernel {
+ public:
+  explicit TypedDynamicKernel(const KernelType& kernel) : kernel_(kernel) {}
+  float Value(const float x) const override { return kernel_(x); }
+  float Radius() const override { return kernel_.Radius(); }
+  const KernelType kernel_;
+};
+
+template <typename KernelType>
+std::unique_ptr<const DynamicKernel> CreateKernel(const KernelType& kernel) {
+  return MakeUnique<TypedDynamicKernel<KernelType>>(kernel);
+}
+
+std::unique_ptr<const DynamicKernel> Create(
+    functor::SamplingKernelType kernel_type) {
+  switch (kernel_type) {
+    case functor::Lanczos1Kernel:
+      return CreateKernel(functor::CreateLanczos1Kernel());
+    case functor::Lanczos3Kernel:
+      return CreateKernel(functor::CreateLanczos3Kernel());
+    case functor::Lanczos5Kernel:
+      return CreateKernel(functor::CreateLanczos5Kernel());
+    case functor::GaussianKernel:
+      return CreateKernel(functor::CreateGaussianKernel());
+    case functor::BoxKernel:
+      return CreateKernel(functor::CreateBoxKernel());
+    case functor::TriangleKernel:
+      return CreateKernel(functor::CreateTriangleKernel());
+    case functor::KeysCubicKernel:
+      return CreateKernel(functor::CreateKeysCubicKernel());
+    case functor::MitchellCubicKernel:
+      return CreateKernel(functor::CreateMitchellCubicKernel());
+    default:
+      LOG(FATAL) << "Unknown kernel type.";
+      return nullptr;
+  }
+}
+
+template <typename T>
+inline const T& Clamp(const T& low, const T& high, const T& value) {
+  return std::min(high, std::max(low, value));
+}
+
+// Samples from the image at the passed batch at pixel location sample_f with a
+// kernel scaled by scale.
+void Sample(const DynamicKernel& kernel, TTypes<float, 4>::Tensor images,
+            int batch, const Vector2f& scale, const Vector2f& sample_f,
+            float* dest) {
+  const Vector2f kernel_scale(std::max(scale.x(), 1.0f),
+                              std::max(scale.y(), 1.0f));
+
+  const int64 in_height = images.dimension(1);
+  const int64 in_width = images.dimension(2);
+  const int channels = images.dimension(3);
+  const int64 y_span_start = Clamp(
+      static_cast<int64>(0), in_height - 1,
+      static_cast<int64>(
+          std::ceil(sample_f.y() - kernel.Radius() * kernel_scale.y() - 0.5f)));
+  const int64 y_span_end =
+      Clamp(static_cast<int64>(0), in_height - 1,
+            static_cast<int64>(std::floor(
+                sample_f.y() + kernel.Radius() * kernel_scale.y() - 0.5f))) +
+      1;
+  const int64 x_span_start = Clamp(
+      static_cast<int64>(0), in_width - 1,
+      static_cast<int64>(
+          std::ceil(sample_f.x() - kernel.Radius() * kernel_scale.x() - 0.5f)));
+
+  const int64 x_span_end =
+      Clamp(static_cast<int64>(0), in_width - 1,
+            static_cast<int64>(std::floor(
+                sample_f.x() + kernel.Radius() * kernel_scale.x() - 0.5f))) +
+      1;
+
+  std::fill(dest, dest + channels, 0.0f);
+  if (y_span_end <= y_span_start || x_span_end <= x_span_start) {
+    return;
+  }
+  const Vector2f one_over_kernel_scale(1.0f / kernel_scale.x(),
+                                       1.0f / kernel_scale.y());
+  float total_weight = 0.0f;
+  for (int64 y = y_span_start; y < y_span_end; ++y) {
+    float y_kernel_pos = static_cast<float>(y) + 0.5f - sample_f.y();
+    float y_weight = kernel.Value(y_kernel_pos * one_over_kernel_scale.y());
+    for (int64 x = x_span_start; x < x_span_end; ++x) {
+      float x_kernel_pos = static_cast<float>(x) + 0.5f - sample_f.x();
+      float x_weight = kernel.Value(x_kernel_pos * one_over_kernel_scale.x());
+      float kernel_weight = y_weight * x_weight;
+      total_weight += kernel_weight;
+      for (int c = 0; c < channels; ++c) {
+        dest[c] += static_cast<float>(images(batch, y, x, c)) * kernel_weight;
+      }
+    }
+  }
+  if (std::abs(total_weight) >= 1000.0f * std::numeric_limits<float>::min()) {
+    CHECK_NE(total_weight, 0.0f) << y_span_start << "," << y_span_end << " "
+                                 << x_span_start << "," << x_span_end;
+    for (int c = 0; c < channels; ++c) {
+      dest[c] /= total_weight;
+    }
+  }
+}
+
+// This is the straight forward unoptimized implementation of ScaleAndTranslate
+// We use this to confirm that the optimized version is almost identical. The
+// only difference will be small floating point differences, since this version
+// does not to separable passes in x and y dimensions.
+void ScaleAndTranslateBaseline(const DynamicKernel& kernel,
+                               TTypes<float, 4>::Tensor images,
+                               const Vector2f& orig_scale,
+                               const Vector2f& orig_translate,
+                               TTypes<float, 4>::Tensor output) {
+  const Vector2f scale(1.0f / orig_scale[0], 1.0f / orig_scale[1]);
+  const Vector2f translate(-orig_translate[0] / orig_scale[0],
+                           -orig_translate[1] / orig_scale[1]);
+
+  const int batch = images.dimension(0);
+  const int channels = images.dimension(3);
+
+  ASSERT_EQ(batch, output.dimension(0));
+  ASSERT_EQ(channels, output.dimension(3));
+
+  const int64 out_height = output.dimension(1);
+  const int64 out_width = output.dimension(2);
+
+  for (int b = 0; b < batch; ++b) {
+    for (int64 y = 0; y < out_height; ++y) {
+      const float out_y_f = static_cast<float>(y) + 0.5;
+      const float in_y_f = out_y_f * scale.y() + translate.y();
+      for (int64 x = 0; x < out_width; ++x) {
+        const float out_x_f = static_cast<float>(x) + 0.5;
+        const float in_x_f = out_x_f * scale.x() + translate.x();
+        Sample(kernel, images, b, scale, Vector2f(in_x_f, in_y_f),
+               &output(b, y, x, 0));
+      }
+    }
+  }
+}
+
+class ScaleAndTranslateOpTest : public OpsTestBase {
+ protected:
+  void CreateOp(const string& kernel_type_str = "lanczos3") {
+    TF_EXPECT_OK(NodeDefBuilder("scale_and_translate_op", "ScaleAndTranslate")
+                     .Input(FakeInput(DT_FLOAT))
+                     .Input(FakeInput(DT_INT32))
+                     .Input(FakeInput(DT_FLOAT))
+                     .Input(FakeInput(DT_FLOAT))
+                     .Attr("kernel_type", kernel_type_str)
+                     .Finalize(node_def()));
+    TF_EXPECT_OK(InitOp());
+    kernel_type_ = functor::SamplingKernelTypeFromString(kernel_type_str);
+  }
+
+  void SetCheckerboardImageInput(int batch_size, int num_row_squares,
+                                 int num_col_squares, int square_size,
+                                 int num_channels) {
+    inputs_.clear();
+    std::vector<float> data;
+    const int64 row_size = num_col_squares * square_size * num_channels;
+    const int64 image_size = num_row_squares * square_size * row_size;
+    data.resize(batch_size * image_size);
+    random::PhiloxRandom philox(42);
+    random::SimplePhilox rnd(&philox);
+    std::vector<float> col(num_channels);
+    for (int b = 0; b < batch_size; ++b) {
+      for (int y = 0; y < num_row_squares; ++y) {
+        for (int x = 0; x < num_col_squares; ++x) {
+          for (int n = 0; n < num_channels; ++n) {
+            col[n] = rnd.RandFloat();
+          }
+          for (int r = y * square_size; r < (y + 1) * square_size; ++r) {
+            auto it = data.begin() + b * image_size + r * row_size +
+                      x * square_size * num_channels;
+            for (int n = 0; n < square_size; ++n) {
+              for (int chan = 0; chan < num_channels; ++chan, ++it) {
+                *it = col[chan] * 255.0;
+              }
+            }
+          }
+        }
+      }
+    }
+    AddInputFromArray<float>(
+        TensorShape({batch_size, num_row_squares * square_size,
+                     num_col_squares * square_size, num_channels}),
+        data);
+  }
+
+  void RunTest(int output_image_height, int output_image_width,
+               const Vector2f& scale, const Vector2f& translate) {
+    AddInputFromArray<int32>(TensorShape({2}),
+                             {output_image_height, output_image_width});
+    AddInputFromArray<float>(TensorShape({2}), {scale[1], scale[0]});
+    AddInputFromArray<float>(TensorShape({2}), {translate[1], translate[0]});
+    Status s = RunOpKernel();
+    const int batch_size = GetOutput(0)->dim_size(0);
+    const int channels = GetOutput(0)->dim_size(3);
+    Tensor expected(allocator(), DT_FLOAT,
+                    TensorShape({batch_size, output_image_height,
+                                 output_image_width, channels}));
+
+    std::unique_ptr<const DynamicKernel> kernel = Create(kernel_type_);
+    ScaleAndTranslateBaseline(*kernel, mutable_input(0)->tensor<float, 4>(),
+                              scale, translate, expected.tensor<float, 4>());
+    constexpr double kAbs = 1e-2f;
+    test::ExpectTensorNear<float>(expected, *GetOutput(0), kAbs);
+  }
+
+  functor::SamplingKernelType kernel_type_;
+};
+
+TEST_F(ScaleAndTranslateOpTest, IdentityTest) {
+  CreateOp();
+  constexpr int64 kBatchSize = 2;
+  constexpr int64 kNumRowSquares = 16;
+  constexpr int64 kNumColSquares = 13;
+  constexpr int64 kSquareSize = 12;
+  constexpr int64 kNumChannels = 3;
+  SetCheckerboardImageInput(kBatchSize, kNumRowSquares, kNumColSquares,
+                            kSquareSize, kNumChannels);
+  constexpr int kOutputImageHeight = kNumRowSquares * kSquareSize;
+  constexpr int kOutputImageWidth = kNumColSquares * kSquareSize;
+  const Vector2f kScale(1.0f, 1.0f);
+  const Vector2f kTranslate(0.0f, 0.0f);
+  RunTest(kOutputImageHeight, kOutputImageWidth, kScale, kTranslate);
+}
+
+TEST_F(ScaleAndTranslateOpTest, UpsampleTest) {
+  CreateOp();
+  constexpr int64 kBatchSize = 2;
+  constexpr int64 kNumRowSquares = 16;
+  constexpr int64 kNumColSquares = 13;
+  constexpr int64 kSquareSize = 12;
+  constexpr int64 kNumChannels = 3;
+  SetCheckerboardImageInput(kBatchSize, kNumRowSquares, kNumColSquares,
+                            kSquareSize, kNumChannels);
+  constexpr int kOutputImageHeight = kNumRowSquares * kSquareSize * 2;
+  constexpr int kOutputImageWidth = kNumColSquares * kSquareSize * 2;
+  const Vector2f kScale(2.0f, 2.0f);
+  const Vector2f kTranslate(0.0f, 0.0f);
+  RunTest(kOutputImageHeight, kOutputImageWidth, kScale, kTranslate);
+}
+
+TEST_F(ScaleAndTranslateOpTest, DownsampleTest) {
+  CreateOp();
+  constexpr int64 kBatchSize = 2;
+  constexpr int64 kNumRowSquares = 16;
+  constexpr int64 kNumColSquares = 13;
+  constexpr int64 kSquareSize = 12;
+  constexpr int64 kNumChannels = 3;
+  SetCheckerboardImageInput(kBatchSize, kNumRowSquares, kNumColSquares,
+                            kSquareSize, kNumChannels);
+  constexpr int kOutputImageHeight = kNumRowSquares * kSquareSize / 2;
+  constexpr int kOutputImageWidth = kNumColSquares * kSquareSize / 2;
+  const Vector2f kScale(0.5f, 0.5f);
+  const Vector2f kTranslate(0.0f, 0.0f);
+  RunTest(kOutputImageHeight, kOutputImageWidth, kScale, kTranslate);
+}
+
+TEST_F(ScaleAndTranslateOpTest, DownsampleToASinglePixelTest) {
+  CreateOp();
+  constexpr int64 kBatchSize = 2;
+  constexpr int64 kNumRowSquares = 16;
+  constexpr int64 kNumColSquares = 13;
+  constexpr int64 kSquareSize = 12;
+  constexpr int64 kNumChannels = 3;
+  SetCheckerboardImageInput(kBatchSize, kNumRowSquares, kNumColSquares,
+                            kSquareSize, kNumChannels);
+  constexpr int kOutputImageHeight = 1;
+  constexpr int kOutputImageWidth = 1;
+  const Vector2f kScale(1.0f / (kNumRowSquares * kSquareSize),
+                        1.0f / (kNumColSquares * kSquareSize));
+  const Vector2f kTranslate(0.0f, 0.0f);
+  RunTest(kOutputImageHeight, kOutputImageWidth, kScale, kTranslate);
+}
+
+TEST_F(ScaleAndTranslateOpTest, UsampleFromASinglePixelTest) {
+  CreateOp();
+  constexpr int64 kBatchSize = 2;
+  constexpr int64 kNumRowSquares = 1;
+  constexpr int64 kNumColSquares = 1;
+  constexpr int64 kSquareSize = 1;
+  constexpr int64 kNumChannels = 3;
+  SetCheckerboardImageInput(kBatchSize, kNumRowSquares, kNumColSquares,
+                            kSquareSize, kNumChannels);
+  constexpr int kOutputImageHeight = 10;
+  constexpr int kOutputImageWidth = 17;
+  const Vector2f kScale(17.0f, 10.0f);
+  const Vector2f kTranslate(0.0f, 0.0f);
+  RunTest(kOutputImageHeight, kOutputImageWidth, kScale, kTranslate);
+}
+
+TEST_F(ScaleAndTranslateOpTest, ScaleAndTranslationTest) {
+  CreateOp();
+  constexpr int64 kBatchSize = 2;
+  constexpr int64 kNumRowSquares = 11;
+  constexpr int64 kNumColSquares = 7;
+  constexpr int64 kSquareSize = 5;
+  constexpr int64 kNumChannels = 3;
+  SetCheckerboardImageInput(kBatchSize, kNumRowSquares, kNumColSquares,
+                            kSquareSize, kNumChannels);
+  constexpr int kOutputImageHeight = 49;
+  constexpr int kOutputImageWidth = 51;
+  const Vector2f kScale(1.1f, 0.9f);
+  const Vector2f kTranslate(4.1f, -3.1f);
+  RunTest(kOutputImageHeight, kOutputImageWidth, kScale, kTranslate);
+}
+
+TEST_F(ScaleAndTranslateOpTest, TestKernelTypes) {
+  const std::vector<string> kKernelTypes = {
+      "lanczos1", "lanczos3",  "lanczos5",     "box",
+      "triangle", "keyscubic", "mitchellcubic"};
+  for (const string& kernel_type : kKernelTypes) {
+    CreateOp(kernel_type);
+    constexpr int64 kBatchSize = 2;
+    constexpr int64 kNumRowSquares = 10;
+    constexpr int64 kNumColSquares = 11;
+    constexpr int64 kSquareSize = 1;
+    constexpr int64 kNumChannels = 3;
+    SetCheckerboardImageInput(kBatchSize, kNumRowSquares, kNumColSquares,
+                              kSquareSize, kNumChannels);
+    constexpr int kOutputImageHeight = 9;
+    constexpr int kOutputImageWidth = 11;
+    const Vector2f kScale(1.9f, 1.9f);
+    const Vector2f kTranslate(0.3f, 2.1f);
+    RunTest(kOutputImageHeight, kOutputImageWidth, kScale, kTranslate);
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/scan_ops.cc b/tensorflow/core/kernels/scan_ops.cc
index 0a6848361a05559e8d1e23318ca66a9dd3ad9a95..ea42fdefb4124b0fb638adea1f91d77f95d456fd 100644
--- a/tensorflow/core/kernels/scan_ops.cc
+++ b/tensorflow/core/kernels/scan_ops.cc
@@ -18,12 +18,12 @@ limitations under the License.
 #define EIGEN_USE_GPU
 #endif  // GOOGLE_CUDA
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 
 #include "third_party/eigen3/Eigen/Core"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
diff --git a/tensorflow/core/kernels/scan_ops_gpu.cu.cc b/tensorflow/core/kernels/scan_ops_gpu.h
similarity index 97%
rename from tensorflow/core/kernels/scan_ops_gpu.cu.cc
rename to tensorflow/core/kernels/scan_ops_gpu.h
index ed66c02dc584541ce4d5eb644630b678c1b05916..976b2215405105ece0a5d25c2684aa558b01d8a0 100644
--- a/tensorflow/core/kernels/scan_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/scan_ops_gpu.h
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#ifndef TENSORFLOW_CORE_KERNELS_SCAN_OPS_GPU_H_
+#define TENSORFLOW_CORE_KERNELS_SCAN_OPS_GPU_H_
+
 #if GOOGLE_CUDA
 
 #define EIGEN_USE_GPU
@@ -290,17 +293,8 @@ struct Scan<GPUDevice, Eigen::internal::ProdReducer<T>, T> {
 };
 
 }  // namespace functor
-
-#define DEFINE(REDUCER, T) template struct functor::Scan<GPUDevice, REDUCER, T>;
-
-#define DEFINE_FOR_ALL_REDUCERS(T)           \
-  DEFINE(Eigen::internal::SumReducer<T>, T); \
-  DEFINE(Eigen::internal::ProdReducer<T>, T);
-
-TF_CALL_GPU_NUMBER_TYPES(DEFINE_FOR_ALL_REDUCERS);
-#undef DEFINE_FOR_ALL_REDUCERS
-#undef DEFINE
-
 }  // end namespace tensorflow
 
 #endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_CORE_KERNELS_SCAN_OPS_GPU_H_
diff --git a/tensorflow/core/kernels/scan_ops_gpu_double.cu.cc b/tensorflow/core/kernels/scan_ops_gpu_double.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..adce37e473c4f3f31b29db5b71c4d004da1b6b29
--- /dev/null
+++ b/tensorflow/core/kernels/scan_ops_gpu_double.cu.cc
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/scan_ops.h"
+#include "tensorflow/core/kernels/scan_ops_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+template struct functor::Scan<GpuDevice, Eigen::internal::SumReducer<double>,
+                              double>;
+template struct functor::Scan<GpuDevice, Eigen::internal::ProdReducer<double>,
+                              double>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/scan_ops_gpu_float.cu.cc b/tensorflow/core/kernels/scan_ops_gpu_float.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b72415822d0eebecf8426008266c5bd503b8830c
--- /dev/null
+++ b/tensorflow/core/kernels/scan_ops_gpu_float.cu.cc
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/scan_ops.h"
+#include "tensorflow/core/kernels/scan_ops_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+template struct functor::Scan<GpuDevice, Eigen::internal::SumReducer<float>,
+                              float>;
+template struct functor::Scan<GpuDevice, Eigen::internal::ProdReducer<float>,
+                              float>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/platform/default/string_coding.cc b/tensorflow/core/kernels/scan_ops_gpu_half.cu.cc
similarity index 64%
rename from tensorflow/core/platform/default/string_coding.cc
rename to tensorflow/core/kernels/scan_ops_gpu_half.cu.cc
index 7410ee67820a384e4843a57386b110e40a7e0680..f9fb528be98efc722df3f8b76adc65ae7fa29cdb 100644
--- a/tensorflow/core/platform/default/string_coding.cc
+++ b/tensorflow/core/kernels/scan_ops_gpu_half.cu.cc
@@ -13,18 +13,19 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/platform/default/string_coding.h"
+#if GOOGLE_CUDA
 
-namespace tensorflow {
-namespace port {
-
-std::unique_ptr<StringListEncoder> NewStringListEncoder(string* out) {
-  return std::unique_ptr<StringListEncoder>(new StringListEncoder(out));
-}
+#define EIGEN_USE_GPU
 
-std::unique_ptr<StringListDecoder> NewStringListDecoder(const string& in) {
-  return std::unique_ptr<StringListDecoder>(new StringListDecoder(in));
-}
+#include "tensorflow/core/kernels/scan_ops.h"
+#include "tensorflow/core/kernels/scan_ops_gpu.h"
 
-}  // namespace port
+namespace tensorflow {
+using Eigen::GpuDevice;
+template struct functor::Scan<
+    GpuDevice, Eigen::internal::SumReducer<Eigen::half>, Eigen::half>;
+template struct functor::Scan<
+    GpuDevice, Eigen::internal::ProdReducer<Eigen::half>, Eigen::half>;
 }  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/scatter_functor.h b/tensorflow/core/kernels/scatter_functor.h
index 2d43bde23feadc33c7081fccd8ad2e44dfe3c2d5..755f8f8dc55ec7dfdf6c56f1ca86e14ec3e3e352 100644
--- a/tensorflow/core/kernels/scatter_functor.h
+++ b/tensorflow/core/kernels/scatter_functor.h
@@ -20,10 +20,10 @@ limitations under the License.
 
 #include "third_party/eigen3/Eigen/Core"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/variant_op_registry.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/dense_update_functor.h"
 #include "tensorflow/core/platform/types.h"
 
diff --git a/tensorflow/core/kernels/scatter_nd_op.cc b/tensorflow/core/kernels/scatter_nd_op.cc
index 63bb793fdcb7eb20daeee1708cb4ba78274cb9f7..9c51d4e3a7d9e93f34a4c5957f9acec55ea14937 100644
--- a/tensorflow/core/kernels/scatter_nd_op.cc
+++ b/tensorflow/core/kernels/scatter_nd_op.cc
@@ -22,11 +22,11 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/scatter_nd_op.h"
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/dense_update_functor.h"
 #include "tensorflow/core/kernels/fill_functor.h"
 #include "tensorflow/core/kernels/inplace_ops_functor.h"
@@ -49,6 +49,19 @@ typedef Eigen::GpuDevice GPUDevice;
 typedef Eigen::SyclDevice SYCLDevice;
 #endif  // TENSORFLOW_USE_SYCL
 
+// Returns true if the three tensors have valid number of elements
+// If shape_input has 0 elements, then we need to have indices and updates with
+// exactly 0 elements too, otherwise we should error. If indices has 0 elements
+// then updates should also have 0 elements, otherwise we should error.
+bool ValidEmptyOutputShape(int64 num_inputs, int64 num_indices,
+                           int64 num_updates) {
+  if (num_indices == 0 && num_updates == 0) {
+    return true;  // regardless of num_inputs ?= 0, covers both cases
+  }
+  // now we want all 3 tensors to have values
+  return (num_inputs != 0 && num_indices != 0 && num_updates != 0);
+}
+
 template <typename Device, typename T, typename Index>
 class ScatterNdOp : public OpKernel {
  public:
@@ -77,12 +90,12 @@ class ScatterNdOp : public OpKernel {
     OP_REQUIRES_OK(c,
                    TensorShapeUtils::MakeShape(vec.data(), vec.size(), &shape));
 
-    OP_REQUIRES(
-        c,
-        (shape.num_elements() > 0 || (indices.shape().num_elements() == 0 &&
-                                      updates.shape().num_elements() == 0)),
-        errors::InvalidArgument(
-            "Indices and updates specified for empty output shape"));
+    OP_REQUIRES(c,
+                ValidEmptyOutputShape(shape_input.NumElements(),
+                                      indices.shape().num_elements(),
+                                      updates.shape().num_elements()),
+                errors::InvalidArgument(
+                    "Indices and updates specified for empty output shape"));
 
     const int64 outer_dims = indices.shape().dims() - 1;
 
@@ -148,12 +161,12 @@ class TensorScatterOp : public OpKernel {
 
     TensorShape shape = input.shape();
 
-    OP_REQUIRES(
-        c,
-        (shape.num_elements() > 0 || (indices.shape().num_elements() == 0 &&
-                                      updates.shape().num_elements() == 0)),
-        errors::InvalidArgument(
-            "Indices and updates specified for empty output shape"));
+    OP_REQUIRES(c,
+                ValidEmptyOutputShape(shape.num_elements(),
+                                      indices.shape().num_elements(),
+                                      updates.shape().num_elements()),
+                errors::InvalidArgument(
+                    "Indices and updates specified for empty output shape"));
 
     const int64 outer_dims = indices.shape().dims() - 1;
 
@@ -184,7 +197,7 @@ class TensorScatterOp : public OpKernel {
     }
 
     std::unique_ptr<Tensor> forwarded_input = c->forward_input(
-        2, 0, input.dtype(), shape, DEVICE_MEMORY, AllocatorAttributes());
+        0, 0, input.dtype(), shape, DEVICE_MEMORY, AllocatorAttributes());
 
     if (forwarded_input == nullptr) {
       // We were not able to forward the input, so we deep copy the tensor and
@@ -202,6 +215,8 @@ class TensorScatterOp : public OpKernel {
       OP_REQUIRES_OK(c, functor::DoScatterNd<Device, T, Index, op>(
                             c, indices, updates, shape, forwarded_input.get(),
                             false /*allocate*/));
+
+      c->set_output(0, *forwarded_input);
     }
   }
 };
@@ -231,6 +246,7 @@ class ScatterNdUpdateOp : public OpKernel {
       Var* v;
       OP_REQUIRES_OK(c, LookupResource(c, HandleFromInput(c, 0), &v));
       core::ScopedUnref scoped_unref(v);
+      OP_REQUIRES_OK(c, EnsureSparseVariableAccess<Device, T>(c, v));
       mutex_lock m(*v->mu());
       DoCompute(c);
     } else if (use_exclusive_lock_) {
@@ -258,7 +274,6 @@ class ScatterNdUpdateOp : public OpKernel {
       Var* v;
       OP_REQUIRES_OK(c, LookupResource(c, HandleFromInput(c, 0), &v));
       Tensor* t = v->tensor();
-      OP_REQUIRES_OK(c, PrepareToUpdateVariable<Device, T>(c, t));
       params = *t;
       params_shape = params.shape();
     } else if (IsRefType(c->input_dtype(0))) {
@@ -338,7 +353,9 @@ class ScatterNdUpdateOp : public OpKernel {
   REGISTER_SCATTER_ND_UPDATE_KERNEL(type, dev, "ScatterNdSub",            \
                                     scatter_nd_op::UpdateOp::SUB);        \
   REGISTER_RESOURCE_SCATTER_ND_UPDATE_KERNEL(                             \
-      type, dev, "ResourceScatterNdAdd", scatter_nd_op::UpdateOp::ADD);
+      type, dev, "ResourceScatterNdAdd", scatter_nd_op::UpdateOp::ADD);   \
+  REGISTER_RESOURCE_SCATTER_ND_UPDATE_KERNEL(                             \
+      type, dev, "ResourceScatterNdSub", scatter_nd_op::UpdateOp::SUB);
 
 #define REGISTER_SCATTER_ND(type, dev) \
   REGISTER_SCATTER_ND_KERNEL(type, dev, "ScatterNd");
@@ -546,8 +563,9 @@ Status PrepareAndValidateInputs(const TensorShape& params_shape,
                                    "got shape: ", params_shape.DebugString());
   }
 
-  if (!(params_shape.num_elements() > 0 ||
-        (indices.NumElements() == 0 && updates.NumElements() == 0))) {
+  if (!ValidEmptyOutputShape(params_shape.num_elements(),
+                             indices_shape.num_elements(),
+                             updates_shape.num_elements())) {
     return errors::InvalidArgument(
         "Indices and updates specified for empty output.  indices shape: ",
         indices.shape().DebugString());
diff --git a/tensorflow/core/kernels/scatter_nd_op.h b/tensorflow/core/kernels/scatter_nd_op.h
index 8d04731aae6329dbfd2539ec441a2d1b140f6cd3..eec70ba69e5101068dfdcfde5152ab9ea2088efe 100644
--- a/tensorflow/core/kernels/scatter_nd_op.h
+++ b/tensorflow/core/kernels/scatter_nd_op.h
@@ -18,11 +18,11 @@ limitations under the License.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/fill_functor.h"
 #include "tensorflow/core/kernels/scatter_nd_op.h"
 #include "tensorflow/core/platform/mutex.h"
diff --git a/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h b/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h
index 472f5a3547aaaf0237a6d3ce51a141519c4d11a4..01e4656eab8b2b067f870253ba9f3223835a461f 100644
--- a/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h
+++ b/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h
@@ -24,11 +24,11 @@ limitations under the License.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/fill_functor.h"
 #include "tensorflow/core/kernels/scatter_nd_op.h"
 #include "tensorflow/core/platform/mutex.h"
diff --git a/tensorflow/core/kernels/scatter_op.cc b/tensorflow/core/kernels/scatter_op.cc
index 0fbde764d57eb661314b699ef9902238ad38b2cf..ee3c5833470eca54121ab73209e484578b42149e 100644
--- a/tensorflow/core/kernels/scatter_op.cc
+++ b/tensorflow/core/kernels/scatter_op.cc
@@ -288,7 +288,7 @@ TF_CALL_ALL_TYPES(REGISTER_SCATTER_UPDATE_CPU);
 #define REGISTER_SCATTER_UPDATE_GPU(type) REGISTER_SCATTER_UPDATE(type, GPU);
 
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ARITHMETIC_GPU);
-TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_MINMAX_GPU);
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_SCATTER_MINMAX_GPU);
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_UPDATE_GPU);
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/scatter_op_gpu.cu.cc b/tensorflow/core/kernels/scatter_op_gpu.cu.cc
index 0df329310f0dc51bbe91b784a40fd7bf68b012f0..d4defb8503679f3b2b6d479719f1378bd53cff19 100644
--- a/tensorflow/core/kernels/scatter_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/scatter_op_gpu.cu.cc
@@ -41,6 +41,7 @@ typedef Eigen::GpuDevice GPUDevice;
   DEFINE_GPU_SPECS_INDEX(T, int32); \
   DEFINE_GPU_SPECS_INDEX(T, int64);
 
+DEFINE_GPU_SPECS(Eigen::half);
 DEFINE_GPU_SPECS(float);
 DEFINE_GPU_SPECS(double);
 // TODO: The following fails to compile.
diff --git a/tensorflow/core/kernels/sdca_internal.cc b/tensorflow/core/kernels/sdca_internal.cc
index a8e9b3261cd29191955509f34028660dff862bd7..cbc754af0e9bb1f3606e9de5e31bc415b2113f3d 100644
--- a/tensorflow/core/kernels/sdca_internal.cc
+++ b/tensorflow/core/kernels/sdca_internal.cc
@@ -26,6 +26,10 @@ limitations under the License.
 #include "tensorflow/core/lib/math/math_util.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
 
+#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
+#include "tensorflow/core/kernels/eigen_contraction_kernel.h"
+#endif
+
 namespace tensorflow {
 namespace sdca {
 
@@ -306,7 +310,10 @@ Status Examples::SampleAdaptiveProbabilities(
 
 void Examples::RandomShuffle() {
   std::iota(sampled_index_.begin(), sampled_index_.end(), 0);
-  std::random_shuffle(sampled_index_.begin(), sampled_index_.end());
+
+  std::random_device rd;
+  std::mt19937 rng(rd());
+  std::shuffle(sampled_index_.begin(), sampled_index_.end(), rng);
 }
 
 // TODO(sibyl-Aix6ihai): Refactor/shorten this function.
diff --git a/tensorflow/core/kernels/searchsorted_op.cc b/tensorflow/core/kernels/searchsorted_op.cc
index dc627ac77a51d6da994309687c5694d261908524..06b2d818374fd6a102ec3966e57e3619b4d18289 100644
--- a/tensorflow/core/kernels/searchsorted_op.cc
+++ b/tensorflow/core/kernels/searchsorted_op.cc
@@ -17,11 +17,11 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/searchsorted_op.h"
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 
diff --git a/tensorflow/core/kernels/segment_reduction_ops.cc b/tensorflow/core/kernels/segment_reduction_ops.cc
index 2328fc6afd8e7b7c24351e612ea6b760a2d522c3..6e1a0d57a169b51e184330c984a5c75d332490da 100644
--- a/tensorflow/core/kernels/segment_reduction_ops.cc
+++ b/tensorflow/core/kernels/segment_reduction_ops.cc
@@ -22,15 +22,17 @@ limitations under the License.
 
 #include "third_party/eigen3/Eigen/Core"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
 #include "tensorflow/core/kernels/segment_reduction_ops.h"
 #include <vector>
+
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/util/util.h"
diff --git a/tensorflow/core/kernels/sequence_ops.cc b/tensorflow/core/kernels/sequence_ops.cc
index 9db0bd4d98bdb9964cb561d96d91782ba3615a7f..21c3b89f548e30cff345a072ca2e11dfe15081b5 100644
--- a/tensorflow/core/kernels/sequence_ops.cc
+++ b/tensorflow/core/kernels/sequence_ops.cc
@@ -143,11 +143,12 @@ class LinSpaceOp : public OpKernel {
     OP_REQUIRES_OK(context,
                    context->allocate_output(0, TensorShape({num}), &out));
     auto flat = out->flat<T>();
-    if (num == 1) {
-      flat(0) = start;
-    } else {
+    flat(0) = start;
+    if (num > 1) {
       const T step = (stop - start) / (num - 1);
-      for (Tnum i = 0; i < num; ++i) flat(i) = start + step * i;
+      for (Tnum i = 1; i < num - 1; ++i) flat(i) = start + step * i;
+      // Ensure final value == stop; float arithmetic won't guarantee this.
+      flat(num - 1) = stop;
     }
   }
 };
diff --git a/tensorflow/core/kernels/sequence_ops_test.cc b/tensorflow/core/kernels/sequence_ops_test.cc
index 5f0e0a69a890aafa56b43cc55e99f490c100faa7..2247c447500693942ebaeda33eb5cd2baf7d226a 100644
--- a/tensorflow/core/kernels/sequence_ops_test.cc
+++ b/tensorflow/core/kernels/sequence_ops_test.cc
@@ -114,6 +114,27 @@ TEST_F(LinSpaceOpTest, Simple_D32) {
   test::ExpectTensorEqual<float>(expected, *GetOutput(0));
 }
 
+TEST_F(LinSpaceOpTest, Exact_Endpoints) {
+  MakeOp(DT_FLOAT, DT_INT32);
+
+  // Feed and run. The particular values 0., 1., and 42 are chosen to test that
+  // the last value is not calculated via an intermediate delta as (1./41)*41,
+  // because for IEEE 32-bit floats that returns 0.99999994 != 1.0.
+  AddInputFromArray<float>(TensorShape({}), {0.0});
+  AddInputFromArray<float>(TensorShape({}), {1.0});
+  AddInputFromArray<int32>(TensorShape({}), {42});
+  TF_ASSERT_OK(RunOpKernel());
+
+  // Check the output
+  Tensor output = *GetOutput(0);
+  float expected_start = 0.0;
+  float start = output.flat<float>()(0);
+  EXPECT_EQ(expected_start, start) << expected_start << " vs. " << start;
+  float expected_stop = 1.0;
+  float stop = output.flat<float>()(output.NumElements() - 1);
+  EXPECT_EQ(expected_stop, stop) << expected_stop << " vs. " << stop;
+}
+
 TEST_F(LinSpaceOpTest, Single_D64) {
   MakeOp(DT_FLOAT, DT_INT64);
 
diff --git a/tensorflow/core/kernels/shape_op_test.cc b/tensorflow/core/kernels/shape_op_test.cc
deleted file mode 100644
index 30cb1e0a7f80f084854073ee061500bbcf0ccade..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/shape_op_test.cc
+++ /dev/null
@@ -1,129 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <functional>
-#include <memory>
-
-#include "tensorflow/cc/client/client_session.h"
-#include "tensorflow/cc/ops/array_ops.h"
-#include "tensorflow/cc/ops/const_op.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/framework/variant.h"
-#include "tensorflow/core/framework/variant_encode_decode.h"
-#include "tensorflow/core/framework/variant_op_registry.h"
-#include "tensorflow/core/kernels/ops_testutil.h"
-#include "tensorflow/core/kernels/ops_util.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/platform/abi.h"
-#include "tensorflow/core/platform/test.h"
-#include "tensorflow/core/platform/types.h"
-
-namespace tensorflow {
-namespace {
-
-class ShapeOpTest : public OpsTestBase {};
-
-struct NoKnownShape {
-  string TypeName() const { return "NO KNOWN SHAPE"; }
-};
-
-REGISTER_UNARY_VARIANT_DECODE_FUNCTION(NoKnownShape, "NO KNOWN SHAPE");
-
-struct KnownVecSize {
-  KnownVecSize() : shape_value(0) {}
-  explicit KnownVecSize(int value) : shape_value(value) {}
-  string TypeName() const { return "KNOWN VECTOR SIZE TYPE"; }
-  bool Decode(const VariantTensorData& d) {
-    return d.get_metadata(&shape_value);
-  }
-  void Encode(VariantTensorData* d) const { d->set_metadata(shape_value); }
-  int shape_value;
-};
-
-Status GetShapeFromKnownVecSize(const KnownVecSize& ks, TensorShape* s) {
-  *s = TensorShape({ks.shape_value});
-  return Status::OK();
-}
-
-REGISTER_UNARY_VARIANT_DECODE_FUNCTION(KnownVecSize, "KNOWN VECTOR SIZE TYPE");
-
-REGISTER_UNARY_VARIANT_SHAPE_FUNCTION(KnownVecSize, GetShapeFromKnownVecSize);
-
-static void ExpectHasError(const Status& s, StringPiece substr) {
-  EXPECT_TRUE(str_util::StrContains(s.ToString(), substr))
-      << ">>" << s << "<<, expected substring >>" << substr << "<<";
-}
-
-TEST_F(ShapeOpTest, Simple) {
-  // Ensure the ops run on CPU, as we have no device copy registration
-  // for NoKnownShape and KnownVecSize objects.
-  Scope root = Scope::NewRootScope().WithDevice("/cpu:0");
-
-  // Use a placeholder so the graph optimizer doesn't optimize away
-  // the shape function.
-  auto input = ops::Placeholder(root, DT_VARIANT);
-  auto shape_output = ops::Shape(root, input);
-  auto rank_output = ops::Rank(root, input);
-  auto size_output = ops::Size(root, input);
-
-  TF_ASSERT_OK(root.status());
-
-  ClientSession session(root);
-
-  std::vector<Tensor> outputs;
-
-  {
-    // Test no shape registered.
-    Tensor variant_tensor(DT_VARIANT, TensorShape({}));
-    Variant& v = variant_tensor.scalar<Variant>()();
-    v = NoKnownShape();
-    Status s = session.Run({{input, variant_tensor}}, {shape_output}, &outputs);
-    EXPECT_FALSE(s.ok());
-    ExpectHasError(
-        s, strings::StrCat(
-               "No unary variant shape function found for Variant type_index: ",
-               port::MaybeAbiDemangle(MakeTypeIndex<NoKnownShape>().name())));
-  }
-
-  {
-    // Test non-scalar variant.
-    Tensor variant_tensor(DT_VARIANT, TensorShape({1}));
-    Status s = session.Run({{input, variant_tensor}}, {shape_output}, &outputs);
-    EXPECT_FALSE(s.ok());
-    ExpectHasError(s, "Shape of non-unary Variant not supported.");
-  }
-
-  {
-    // Test registered variant.
-    Tensor variant_tensor(DT_VARIANT, TensorShape({}));
-    const int vec_dim_value = -0xdeadbeef;  // must be non-negative.
-    Variant& v = variant_tensor.scalar<Variant>()();
-    v = KnownVecSize(vec_dim_value);
-    TF_EXPECT_OK(session.Run({{input, variant_tensor}},
-                             {shape_output, rank_output, size_output},
-                             &outputs));
-    EXPECT_EQ(outputs[0].dims(), 1);  // shape
-    EXPECT_EQ(vec_dim_value, outputs[0].vec<int32>()(0));
-    EXPECT_EQ(outputs[1].dims(), 0);  // rank
-    EXPECT_EQ(1, outputs[1].scalar<int32>()());
-    EXPECT_EQ(outputs[2].dims(), 0);  // size
-    EXPECT_EQ(vec_dim_value, outputs[0].scalar<int32>()());
-  }
-}
-
-}  // namespace
-}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/shape_ops.cc b/tensorflow/core/kernels/shape_ops.cc
index ab1ce0f9c83025e472c114225265ce9430be93a3..db7357ca70e8050ff5d0d858989f27673af5f49d 100644
--- a/tensorflow/core/kernels/shape_ops.cc
+++ b/tensorflow/core/kernels/shape_ops.cc
@@ -469,8 +469,7 @@ class EnsureShapeOp : public OpKernel {
 
   void Compute(OpKernelContext* ctx) override {
     TensorShape shape;
-    OP_REQUIRES_OK(ctx,
-                   shape_op_helpers::GetRegularOrVariantShape(ctx, 0, &shape));
+    OP_REQUIRES_OK(ctx, shape_op_helpers::GetShape(ctx, 0, &shape));
 
     if (!expected_shape_.IsCompatibleWith(shape)) {
       ctx->SetStatus(errors::InvalidArgument(
diff --git a/tensorflow/core/kernels/shape_ops.h b/tensorflow/core/kernels/shape_ops.h
index 7a50f158af02e698681ef513c2baa2be1e22267f..03b32b88d9b7f4441439fb382bc5f8c47643ae43 100644
--- a/tensorflow/core/kernels/shape_ops.h
+++ b/tensorflow/core/kernels/shape_ops.h
@@ -20,27 +20,18 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/variant_op_registry.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 
 namespace tensorflow {
 
 namespace shape_op_helpers {
-inline Status GetRegularOrVariantShape(OpKernelContext* ctx, int input_index,
-                                       TensorShape* shape) {
-  const Tensor& inp = ctx->input(input_index);
-  if (ctx->input_dtype(0) == DT_VARIANT) {
-    if (inp.dims() != 0) {
-      return errors::InvalidArgument(
-          "Shape of non-unary Variant not supported.");
-    }
-    TF_RETURN_IF_ERROR(GetUnaryVariantShape(inp, shape));
-  } else {
-    *shape = inp.shape();
-  }
+inline Status GetShape(OpKernelContext* ctx, int input_index,
+                       TensorShape* shape) {
+  *shape = ctx->input(input_index).shape();
   return Status::OK();
 }
 }  // namespace shape_op_helpers
@@ -52,8 +43,7 @@ class ShapeOp : public OpKernel {
 
   void Compute(OpKernelContext* ctx) override {
     TensorShape shape;
-    OP_REQUIRES_OK(ctx,
-                   shape_op_helpers::GetRegularOrVariantShape(ctx, 0, &shape));
+    OP_REQUIRES_OK(ctx, shape_op_helpers::GetShape(ctx, 0, &shape));
     const int rank = shape.dims();
     Tensor* out = nullptr;
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({rank}), &out));
@@ -81,8 +71,7 @@ class ShapeNOp : public OpKernel {
   void Compute(OpKernelContext* ctx) override {
     for (int i = 0; i < ctx->num_inputs(); ++i) {
       TensorShape shape;
-      OP_REQUIRES_OK(
-          ctx, shape_op_helpers::GetRegularOrVariantShape(ctx, i, &shape));
+      OP_REQUIRES_OK(ctx, shape_op_helpers::GetShape(ctx, i, &shape));
       const int dims = shape.dims();
       Tensor* out = nullptr;
       OP_REQUIRES_OK(ctx, ctx->allocate_output(i, {dims}, &out));
@@ -110,8 +99,7 @@ class RankOp : public OpKernel {
 
   void Compute(OpKernelContext* ctx) override {
     TensorShape shape;
-    OP_REQUIRES_OK(ctx,
-                   shape_op_helpers::GetRegularOrVariantShape(ctx, 0, &shape));
+    OP_REQUIRES_OK(ctx, shape_op_helpers::GetShape(ctx, 0, &shape));
     const int rank = shape.dims();
     Tensor* out = nullptr;
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &out));
@@ -128,8 +116,7 @@ class SizeOp : public OpKernel {
 
   void Compute(OpKernelContext* ctx) override {
     TensorShape shape;
-    OP_REQUIRES_OK(ctx,
-                   shape_op_helpers::GetRegularOrVariantShape(ctx, 0, &shape));
+    OP_REQUIRES_OK(ctx, shape_op_helpers::GetShape(ctx, 0, &shape));
     const int64 size = shape.num_elements();
     Tensor* out = nullptr;
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &out));
diff --git a/tensorflow/core/kernels/softmax_op_functor.h b/tensorflow/core/kernels/softmax_op_functor.h
index c8bc1ad3bbb60e147dbb1d8fdf3c988b395ea19d..218698f3fff89166c0440195de25295dfe0028ab 100644
--- a/tensorflow/core/kernels/softmax_op_functor.h
+++ b/tensorflow/core/kernels/softmax_op_functor.h
@@ -57,7 +57,6 @@ struct SoftmaxEigenImpl {
     Eigen::DSizes<int, 2> one_by_class(1, num_classes);
 #else
     Eigen::IndexList<Eigen::type2index<kClassDim> > along_class;
-    Eigen::IndexList<Eigen::type2index<1> > depth_dim;
     Eigen::IndexList<int, Eigen::type2index<1> > batch_by_one;
     batch_by_one.set(0, batch_size);
     Eigen::IndexList<Eigen::type2index<1>, int> one_by_class;
diff --git a/tensorflow/core/kernels/spacetobatch_functor.h b/tensorflow/core/kernels/spacetobatch_functor.h
index f46a84da1e951113382e4d44b44463c2a621ca10..459f20b0ae1cea1769277f4d367829d61e831ca1 100644
--- a/tensorflow/core/kernels/spacetobatch_functor.h
+++ b/tensorflow/core/kernels/spacetobatch_functor.h
@@ -19,9 +19,9 @@ limitations under the License.
 #include <type_traits>
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/kernels/sparse_matmul_op.cc b/tensorflow/core/kernels/sparse_matmul_op.cc
index 2ea7a1ed3b9c5c37e0c93edef9431ce0438d380d..9c9e7370ac44bfb704f5491e2c572e961f188e3a 100644
--- a/tensorflow/core/kernels/sparse_matmul_op.cc
+++ b/tensorflow/core/kernels/sparse_matmul_op.cc
@@ -48,11 +48,11 @@ limitations under the License.
 #include "tensorflow/core/kernels/eigen_contraction_kernel.h"
 #endif
 
+#define ALWAYS_INLINE EIGEN_ALWAYS_INLINE
+
 namespace tensorflow {
 namespace {
 
-using Eigen::operator==;
-
 template <typename T>
 using BasicMatrix = Eigen::Tensor<T, 2, Eigen::RowMajor>;
 
@@ -161,6 +161,19 @@ struct SparseSlice {
   const int block_size;
 };
 
+template <typename T>
+bool IsZero(T v);
+
+template <>
+ALWAYS_INLINE bool IsZero(bfloat16 v) {
+  return v.IsZero();
+}
+
+template <>
+ALWAYS_INLINE bool IsZero(float v) {
+  return v == 0.0f;
+}
+
 template <typename T>
 template <bool Transpose>
 void SparseSlice<T>::Initialize(
@@ -182,9 +195,8 @@ void SparseSlice<T>::Initialize(
   index.reserve(num_blocks * num_rows * 2);
 
   Index3 idx3;
-  Index idx;
-  int data3_size = 0;
-  static const T zero(0);
+  const int stride = Transpose ? mat.dimension(1) : 1;
+
   for (int i = 0; i < num_blocks; ++i) {
     int num_block_cols = std::min(block_size, num_cols - block_size * i);
     for (int row = 0; row < num_rows; ++row) {
@@ -196,54 +208,48 @@ void SparseSlice<T>::Initialize(
       const auto* start =
           Transpose ? &mat(col_offset, row) : &mat(row, col_offset);
       const auto* curr = start;
-      const int stride = Transpose ? mat.dimension(1) : 1;
       const auto* end = start + stride * num_block_cols;
       uint8 k = 0;
 #define NEXT_ELEM \
   curr += stride; \
   ++k;
+#define EAT_ZEROS                          \
+  while (curr < end && IsZero<T>(*curr)) { \
+    NEXT_ELEM;                             \
+  }
       while (true) {
-        while (curr < end && (*curr == zero)) {
-          NEXT_ELEM;
-        }
+        EAT_ZEROS
         if (curr >= end) break;
         idx3.k1 = k;
-        data3.push_back(*curr);
+        const T value1 = *curr;
         NEXT_ELEM;
 
-        while (curr < end && (*curr == zero)) {
-          NEXT_ELEM;
+        EAT_ZEROS
+        if (curr >= end) {
+          data.push_back(value1);
+          index.push_back({idx3.m, idx3.k1});
+          break;
         }
-        if (curr >= end) break;
         idx3.k2 = k;
-        data3.push_back(*curr);
+        const T value2 = *curr;
         NEXT_ELEM;
 
-        while (curr < end && (*curr == zero)) {
-          NEXT_ELEM;
+        EAT_ZEROS
+        if (curr >= end) {
+          data.push_back(value2);
+          index.push_back({idx3.m, idx3.k2});
+          data.push_back(value1);
+          index.push_back({idx3.m, idx3.k1});
+          break;
         }
-        if (curr >= end) break;
         idx3.k3 = k;
+        data3.push_back(value1);
+        data3.push_back(value2);
         data3.push_back(*curr);
         NEXT_ELEM;
         index3.push_back(idx3);
 #undef NEXT_ELEM
-      }
-      int num_inserted_mod = data3.size() % 3;
-      // Move some elements to index and data if needed.
-      data3_size = data3.size() - num_inserted_mod;
-      idx.m = idx3.m;
-      switch (num_inserted_mod) {
-        case 2:
-          idx.k = idx3.k2;
-          data.push_back(data3[data3_size + 1]);
-          index.push_back(idx);
-          TF_FALLTHROUGH_INTENDED;
-        case 1:
-          idx.k = idx3.k1;
-          data.push_back(data3[data3_size]);
-          index.push_back(idx);
-          data3.resize(data3_size);
+#undef EAT_ZEROS
       }
     }
     col_offset += block_size;
@@ -276,8 +282,6 @@ const int kNumOperands = (sizeof(Packet) / sizeof(float));
 #define STORE(x, y) Eigen::internal::pstore<float>(x, y);
 #define FMA(a, b, c, d) d = Eigen::internal::pmadd<Packet>(a, b, c);
 
-#define ALWAYS_INLINE EIGEN_ALWAYS_INLINE
-
 ALWAYS_INLINE float ConvertBfloat16ToFloat(const bfloat16* src) {
   float out = 0;
   auto tmp = reinterpret_cast<bfloat16*>(&out);
diff --git a/tensorflow/core/kernels/sparse_tensor_dense_matmul_op.cc b/tensorflow/core/kernels/sparse_tensor_dense_matmul_op.cc
index 30c57ef287f4c645b198da6ebf6b8554dde4fd12..0a97c6b6a5424c3c75c52add13bfa8021b665e17 100644
--- a/tensorflow/core/kernels/sparse_tensor_dense_matmul_op.cc
+++ b/tensorflow/core/kernels/sparse_tensor_dense_matmul_op.cc
@@ -19,9 +19,9 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/sparse_tensor_dense_matmul_op.h"
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/fill_functor.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/kernels/sparse_tensor_dense_matmul_op_gpu.cu.cc b/tensorflow/core/kernels/sparse_tensor_dense_matmul_op_gpu.cu.cc
index e261e42e0d3bf43efc3a1328f07b1362f0870dfd..ea95a882b1f0a7dec7581bd6d0335c4f454d87e1 100644
--- a/tensorflow/core/kernels/sparse_tensor_dense_matmul_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/sparse_tensor_dense_matmul_op_gpu.cu.cc
@@ -19,8 +19,8 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/sparse_tensor_dense_matmul_op.h"
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/util/cuda_kernel_helper.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/kernels/sparse_tensors_map_ops.cc b/tensorflow/core/kernels/sparse_tensors_map_ops.cc
index 74fa3a15f06fdb267dc9776ee8a0903f8f6626de..939638b37058bf8294ebc437c6c14dbb696a8aa8 100644
--- a/tensorflow/core/kernels/sparse_tensors_map_ops.cc
+++ b/tensorflow/core/kernels/sparse_tensors_map_ops.cc
@@ -43,7 +43,7 @@ class SparseTensorsMap : public ResourceBase {
  public:
   explicit SparseTensorsMap(const string& name) : name_(name), counter_(0) {}
 
-  string DebugString() override { return "A SparseTensorsMap"; }
+  string DebugString() const override { return "A SparseTensorsMap"; }
 
   typedef struct {
     PersistentTensor indices;
diff --git a/tensorflow/core/kernels/sparse_utils.cc b/tensorflow/core/kernels/sparse_utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..198862940d1841675f8d7a0b0ade7160d1dc0582
--- /dev/null
+++ b/tensorflow/core/kernels/sparse_utils.cc
@@ -0,0 +1,161 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/sparse_utils.h"
+
+#include <cstddef>
+
+#include "tensorflow/core/framework/tensor_shape.h"
+
+namespace tensorflow {
+namespace sparse_utils {
+
+template <typename Tindices>
+Tindices FindNextDenseRowStartIndex(
+    const Tindices sparse_index_begin,
+    const typename TTypes<Tindices>::ConstMatrix& indices_mat) {
+  // Search in the index range [begin, end) of indices_mat.
+  Tindices begin = sparse_index_begin;
+  Tindices end = indices_mat.dimension(0);
+  const Tindices orig_sparse_index_end = end;
+
+  // The first dense row we search.
+  const Tindices orig_dense_index_begin = indices_mat(begin, 0);
+  // Early exit if no next dense row index.
+  if (orig_dense_index_begin == static_cast<int64>(indices_mat(end - 1, 0))) {
+    return orig_sparse_index_end;
+  }
+
+  Tindices increment = 1;
+  while (begin + increment < end &&
+         indices_mat(begin + increment, 0) == orig_dense_index_begin) {
+    increment *= 2;
+  }
+  // Narrow the search space as an optimization.
+  if (begin + increment < end) {
+    end = begin + increment;
+  }
+  begin += increment / 2;
+
+  // Perform a binary search on the interval [begin, end) for
+  // dense_row_index_to_find.
+  const Tindices dense_row_index_to_find = orig_dense_index_begin;
+  while (begin < end) {
+    const Tindices m = begin + (end - begin) / 2;
+    const Tindices m_dense_row_index = static_cast<Tindices>(indices_mat(m, 0));
+    if (m_dense_row_index == dense_row_index_to_find &&
+        (m + 1 == orig_sparse_index_end ||
+         static_cast<Tindices>(indices_mat(m + 1, 0)) !=
+             dense_row_index_to_find)) {
+      return m + 1;
+    } else if (m_dense_row_index <= dense_row_index_to_find) {
+      begin = m + 1;
+    } else {
+      end = m;
+    }
+  }
+
+  // No next dense row index.
+  return orig_sparse_index_end;
+}
+
+template <typename Tindices>
+std::vector<Tindices> GetStartIndicesOfEachDenseRow(
+    const typename TTypes<Tindices>::ConstMatrix& indices_mat,
+    bool* contains_empty_rows) {
+  int64 start_sparse_index_of_cur_dense_row = 0;
+  std::vector<Tindices> segment_indices;
+  const Tindices num_entries_in_sparse_tensor = indices_mat.dimension(0);
+  const Tindices num_dense_rows_in_sparse_tensor =
+      1 + indices_mat(num_entries_in_sparse_tensor - 1, 0) - indices_mat(0, 0);
+  // Reserve an extra slot for the 0 we store in the first entry by convention.
+  segment_indices.reserve(1 + num_dense_rows_in_sparse_tensor);
+  segment_indices.push_back(0);
+  *contains_empty_rows = false;
+  while (true) {
+    const Tindices start_sparse_index_of_next_dense_row =
+        FindNextDenseRowStartIndex<Tindices>(
+            start_sparse_index_of_cur_dense_row, indices_mat);
+    if (start_sparse_index_of_next_dense_row == num_entries_in_sparse_tensor) {
+      segment_indices.push_back(start_sparse_index_of_next_dense_row);
+      break;
+    }
+    // Encode the length of the current dense row as well as the lengths of all
+    // the empty rows until the next dense row,
+    for (Tindices i = 0;
+         i < indices_mat(start_sparse_index_of_next_dense_row, 0) -
+                 indices_mat(start_sparse_index_of_cur_dense_row, 0);
+         ++i) {
+      segment_indices.push_back(start_sparse_index_of_next_dense_row);
+    }
+    // If there is more than one row between the current and next non-empty
+    // rows then those rows are empty.
+    *contains_empty_rows |=
+        indices_mat(start_sparse_index_of_next_dense_row, 0) -
+            indices_mat(start_sparse_index_of_cur_dense_row, 0) >
+        1;
+    start_sparse_index_of_cur_dense_row = start_sparse_index_of_next_dense_row;
+  }
+  return segment_indices;
+}
+
+template <typename Tindices>
+std::vector<Tindices> ParseRowStartIndices(
+    const tensorflow::Tensor& tensor,
+    const Tindices num_nonzero_entries_in_sparse_mat) {
+  std::vector<Tindices> out;
+  auto vec = tensor.vec<Tindices>();
+  out.reserve(vec.size() + 1);
+  for (size_t i = 0; i < vec.dimension(0); ++i) {
+    out.push_back(vec(i));
+  }
+  out.push_back(num_nonzero_entries_in_sparse_mat);
+  return out;
+}
+
+template <typename Tindices>
+bool ContainsEmptyRows(const std::vector<Tindices>& row_start_indices) {
+  // Skip checking the lengths of the first and last dense rows since those are
+  // always non-empty.
+  for (size_t i = 2; i < row_start_indices.size() - 1; ++i) {
+    if (row_start_indices.at(i) - row_start_indices.at(i - 1) == 0) {
+      return true;
+    }
+  }
+  return false;
+}
+
+#define REGISTER_SPARSE_UTIL_FUNCTIONS(TypeIndex)                           \
+  template TypeIndex FindNextDenseRowStartIndex<TypeIndex>(                 \
+      const TypeIndex sparse_index_begin,                                   \
+      const TTypes<TypeIndex>::ConstMatrix& indices_mat);                   \
+  template std::vector<TypeIndex> GetStartIndicesOfEachDenseRow<TypeIndex>( \
+      const TTypes<TypeIndex>::ConstMatrix& indices_mat,                    \
+      bool* contains_empty_rows);                                           \
+  template bool ContainsEmptyRows<TypeIndex>(                               \
+      const std::vector<TypeIndex>& row_start_indices);                     \
+  template std::vector<TypeIndex> ParseRowStartIndices<TypeIndex>(          \
+      const tensorflow::Tensor& tensor,                                     \
+      const TypeIndex num_nonzero_entries_in_sparse_mat);
+
+REGISTER_SPARSE_UTIL_FUNCTIONS(int32);
+REGISTER_SPARSE_UTIL_FUNCTIONS(int64);
+REGISTER_SPARSE_UTIL_FUNCTIONS(uint8);
+REGISTER_SPARSE_UTIL_FUNCTIONS(uint16);
+REGISTER_SPARSE_UTIL_FUNCTIONS(uint32);
+REGISTER_SPARSE_UTIL_FUNCTIONS(uint64);
+
+}  // namespace sparse_utils
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/sparse_utils.h b/tensorflow/core/kernels/sparse_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..9e3c41a49642ebe722b7aeb5adeb6f41cea858b3
--- /dev/null
+++ b/tensorflow/core/kernels/sparse_utils.h
@@ -0,0 +1,71 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Helpers for writing OpKernels for sparse tensors.
+#ifndef TENSORFLOW_CORE_KERNELS_SPARSE_UTILS_H_
+#define TENSORFLOW_CORE_KERNELS_SPARSE_UTILS_H_
+
+#include <vector>
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace sparse_utils {
+
+// Find the index i of the first element for which
+// indices_mat(sparse_index_begin, 0) < indices_mat(i, 0).
+// The search is conducted in the open interval
+// [sparse_index_begin, indices_mat.dimension(0)) and when no such i is found,
+// indices_mat.dimension(0) is returned.
+// indices_mat(k, 0) should be non-decreasing over the interval
+// [begin, indices_mat.dimension(0)).
+// Requires 0 <= sparse_index_begin < indices_mat.dimension(0).
+template <typename Tindices>
+Tindices FindNextDenseRowStartIndex(
+    const Tindices sparse_index_begin,
+    const typename TTypes<Tindices>::ConstMatrix& indices_mat);
+
+// Returns the vector v of indices in indices_mat at which new dense matrix
+// rows begin.
+// v.front() = 0, v.back() = indices_mat.dimension(0), and for i > 0,
+// v[i] - v[i-1] is the length of the ith dense row in indices_mat.
+// *contains_empty_rows = true if and only if indices_mat contains empty rows
+// (rows without values) between its first and last row.
+template <typename Tindices>
+std::vector<Tindices> GetStartIndicesOfEachDenseRow(
+    const typename TTypes<Tindices>::ConstMatrix& indices_mat,
+    bool* contains_empty_rows);
+
+// Converts tensor.vec<Tindices> to an std::vector<Tindices> object, appends
+// the value num_nonzero_entries_in_sparse_mat, and returns the result.
+template <typename Tindices>
+std::vector<Tindices> ParseRowStartIndices(
+    const tensorflow::Tensor& tensor,
+    const Tindices num_nonzero_entries_in_sparse_mat);
+
+// Returns true if and only if the sparse matrix indices_mat whose row start
+// indices are represented by row_start_indices has empty dense rows
+// (between its first and last dense rows).
+// This function satisfies the identity row_start_indices ==
+// GetStartIndicesOfEachDenseRow(indices_mat, &return_value).
+template <typename Tindices>
+bool ContainsEmptyRows(const std::vector<Tindices>& row_start_indices);
+
+}  // namespace sparse_utils
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_SPARSE_UTILS_H_
diff --git a/tensorflow/core/kernels/sparse_utils_test.cc b/tensorflow/core/kernels/sparse_utils_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5d0adff8860ded4c8b1f49b99ba6eb3a261782aa
--- /dev/null
+++ b/tensorflow/core/kernels/sparse_utils_test.cc
@@ -0,0 +1,263 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/sparse_utils.h"
+
+#include <vector>
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace {
+
+using tensorflow::DataType;
+using tensorflow::int32;
+using tensorflow::int64;
+using tensorflow::Tensor;
+using tensorflow::TTypes;
+using tensorflow::uint16;
+using tensorflow::uint32;
+using tensorflow::uint64;
+using tensorflow::sparse_utils::ContainsEmptyRows;
+using tensorflow::sparse_utils::FindNextDenseRowStartIndex;
+using tensorflow::sparse_utils::GetStartIndicesOfEachDenseRow;
+using tensorflow::sparse_utils::ParseRowStartIndices;
+
+TEST(SparseUtilsTest, GetStartIndicesOfEachDenseRow) {
+  {
+    int32 data[] = {0, 0, 1, 0, 4, 0, 6, 0, 7, 0, 8, 0, 10, 0, 12, 0};
+    TTypes<int32>::ConstMatrix indices_mat(data, 8, 2);
+    // indices_list = {0, 1, 4, 6, 7, 8, 10, 12};
+    bool contains_empty_rows;
+    EXPECT_TRUE(GetStartIndicesOfEachDenseRow<int32>(indices_mat,
+                                                     &contains_empty_rows) ==
+                std::vector<int32>({0, 1, 2, 2, 2, 3, 3, 4, 5, 6, 6, 7, 7, 8}));
+    EXPECT_TRUE(contains_empty_rows);
+  }
+  {
+    int32 data[] = {0, 0, 1, 0, 1, 0, 4, 0, 4, 0, 4, 0,  6, 0,  7,
+                    0, 7, 0, 7, 0, 7, 0, 8, 0, 8, 0, 10, 0, 12, 0};
+    TTypes<int32>::ConstMatrix indices_mat(data, 15, 2);
+    // indices_list = {0, 1, 1, 4, 4, 4,  6, 7, 7, 7, 7, 8, 8, 10, 12};
+    bool contains_empty_rows;
+    EXPECT_TRUE(
+        GetStartIndicesOfEachDenseRow<int32>(indices_mat,
+                                             &contains_empty_rows) ==
+        std::vector<int32>({0, 1, 3, 3, 3, 6, 6, 7, 11, 13, 13, 14, 14, 15}));
+    EXPECT_TRUE(contains_empty_rows);
+  }
+  {
+    int64 data[] = {3, 0};
+    TTypes<int64>::ConstMatrix indices_mat(data, 1, 2);
+    bool contains_empty_rows;
+    EXPECT_TRUE(GetStartIndicesOfEachDenseRow<int64>(indices_mat,
+                                                     &contains_empty_rows) ==
+                std::vector<int64>({0, 1}));
+    EXPECT_FALSE(contains_empty_rows);
+  }
+  {
+    uint32 data[] = {3, 0, 3, 0};
+    TTypes<uint32>::ConstMatrix indices_mat(data, 2, 2);
+    bool contains_empty_rows;
+    EXPECT_TRUE(GetStartIndicesOfEachDenseRow<uint32>(indices_mat,
+                                                      &contains_empty_rows) ==
+                std::vector<uint32>({0, 2}));
+    EXPECT_FALSE(contains_empty_rows);
+  }
+  {
+    uint16 data[] = {0, 0, 0, 0, 0, 0, 1, 0};
+    TTypes<uint16>::ConstMatrix indices_mat(data, 4, 2);
+    // indices_list = {0, 0, 0, 1};
+    bool contains_empty_rows;
+    EXPECT_TRUE(GetStartIndicesOfEachDenseRow<uint16>(indices_mat,
+                                                      &contains_empty_rows) ==
+                std::vector<uint16>({0, 3, 4}));
+    EXPECT_FALSE(contains_empty_rows);
+  }
+  {
+    uint64 data[] = {0, 0, 0, 0, 0, 0, 3, 0};
+    TTypes<uint64>::ConstMatrix indices_mat(data, 4, 2);
+    bool contains_empty_rows;
+    // indices_list = {0, 0, 0, 3};
+    EXPECT_TRUE(GetStartIndicesOfEachDenseRow<uint64>(indices_mat,
+                                                      &contains_empty_rows) ==
+                std::vector<uint64>({0, 3, 3, 3, 4}));
+    EXPECT_TRUE(contains_empty_rows);
+  }
+}
+
+TEST(SparseUtilsTest, ParseRowStartIndices) {
+  {
+    Tensor t(DataType::DT_INT32, {1});
+    int indx = 0;
+    for (const int32 v : {0}) {
+      t.flat<int32>()(indx++) = v;
+    }
+    EXPECT_TRUE(ParseRowStartIndices<int32>(t, 1) ==
+                std::vector<int32>({0, 1}));
+  }
+  {
+    Tensor t(DataType::DT_INT64, {1});
+    int indx = 0;
+    for (const int64 v : {0}) {
+      t.flat<int64>()(indx++) = v;
+    }
+    EXPECT_TRUE(ParseRowStartIndices<int64>(t, 2) ==
+                std::vector<int64>({0, 2}));
+  }
+  {
+    Tensor t(DataType::DT_UINT64, {2});
+    int indx = 0;
+    for (const uint64 v : {0, 3}) {
+      t.flat<uint64>()(indx++) = v;
+    }
+    EXPECT_TRUE(ParseRowStartIndices<uint64>(t, 4) ==
+                std::vector<uint64>({0, 3, 4}));
+  }
+  {
+    Tensor t(DataType::DT_UINT16, {2});
+    int indx = 0;
+    for (const uint16 v : {0, 3}) {
+      t.flat<uint16>()(indx++) = v;
+    }
+    EXPECT_TRUE(ParseRowStartIndices<uint16>(t, 4) ==
+                std::vector<uint16>({0, 3, 4}));
+  }
+}
+
+TEST(SparseUtilsTest, ContainsEmptyRows) {
+  {
+    int32 data[] = {0, 0, 1, 0, 4, 0, 6, 0, 7, 0, 8, 0, 10, 0, 12, 0};
+    TTypes<int32>::ConstMatrix indices_mat(data, 8, 2);
+    bool contains_empty_rows;
+    const auto segment_indices =
+        GetStartIndicesOfEachDenseRow<int32>(indices_mat, &contains_empty_rows);
+    // indices_list = {0, 1, 4, 6, 7, 8, 10, 12};
+    EXPECT_TRUE(ContainsEmptyRows(segment_indices));
+  }
+  {
+    int64 data[] = {0, 0, 1, 0, 4, 0, 6, 0, 7, 0, 8, 0, 10, 0, 12, 0};
+    TTypes<int64>::ConstMatrix indices_mat(data, 8, 2);
+    bool contains_empty_rows;
+    const auto segment_indices =
+        GetStartIndicesOfEachDenseRow<int64>(indices_mat, &contains_empty_rows);
+    // indices_list = {0, 1, 4, 6, 7, 8, 10, 12};
+    EXPECT_TRUE(ContainsEmptyRows(segment_indices));
+  }
+  {
+    int32 data[] = {1, 0, 1, 1, 2, 0, 2, 1, 2, 2, 3, 4};
+    TTypes<int32>::ConstMatrix indices_mat(data, 6, 2);
+    bool contains_empty_rows;
+    const auto segment_indices =
+        GetStartIndicesOfEachDenseRow<int32>(indices_mat, &contains_empty_rows);
+    // indices_list = {1, 1, 2, 2, 2, 3};
+    EXPECT_FALSE(ContainsEmptyRows(segment_indices));
+  }
+  {
+    uint16 data[] = {1, 0, 1, 1, 2, 0, 2, 1, 2, 2, 3, 4};
+    TTypes<uint16>::ConstMatrix indices_mat(data, 6, 2);
+    bool contains_empty_rows;
+    const auto segment_indices = GetStartIndicesOfEachDenseRow<uint16>(
+        indices_mat, &contains_empty_rows);
+    // indices_list = {1, 1, 2, 2, 2, 3};
+    EXPECT_FALSE(ContainsEmptyRows(segment_indices));
+  }
+  {
+    int32 data[] = {0, 0, 1, 0, 1, 1, 2, 0, 2, 1, 2, 2, 3, 4};
+    TTypes<int32>::ConstMatrix indices_mat(data, 7, 2);
+    bool contains_empty_rows;
+    const auto segment_indices =
+        GetStartIndicesOfEachDenseRow<int32>(indices_mat, &contains_empty_rows);
+    // indices_list = {0, 1, 1, 2, 2, 2, 3};
+    EXPECT_FALSE(ContainsEmptyRows(segment_indices));
+  }
+  {
+    int64 data[] = {0, 0, 1, 0, 1, 1, 2, 0, 2, 1, 2, 2, 3, 4};
+    TTypes<int64>::ConstMatrix indices_mat(data, 7, 2);
+    bool contains_empty_rows;
+    const auto segment_indices =
+        GetStartIndicesOfEachDenseRow<int64>(indices_mat, &contains_empty_rows);
+    // indices_list = {0, 1, 1, 2, 2, 2, 3};
+    EXPECT_FALSE(ContainsEmptyRows(segment_indices));
+  }
+  {
+    uint32 data[] = {0, 0, 0, 1, 0, 2, 2, 0, 2, 1, 2, 2, 3, 4};
+    TTypes<uint32>::ConstMatrix indices_mat(data, 7, 2);
+    bool contains_empty_rows;
+    const auto segment_indices = GetStartIndicesOfEachDenseRow<uint32>(
+        indices_mat, &contains_empty_rows);
+    // indices_list = {0, 0, 0, 2, 2, 2, 3};
+    EXPECT_TRUE(ContainsEmptyRows(segment_indices));
+  }
+  {
+    int64 data[] = {0, 0, 0, 1, 0, 2, 2, 0, 2, 1, 2, 2, 3, 4};
+    TTypes<int64>::ConstMatrix indices_mat(data, 7, 2);
+    bool contains_empty_rows;
+    const auto segment_indices =
+        GetStartIndicesOfEachDenseRow<int64>(indices_mat, &contains_empty_rows);
+    // indices_list = {0, 0, 0, 2, 2, 2, 3};
+    EXPECT_TRUE(ContainsEmptyRows(segment_indices));
+  }
+  {
+    uint64 data[] = {0, 0, 0, 1, 0, 2, 1, 0, 2, 1, 2, 2, 3, 4};
+    TTypes<uint64>::ConstMatrix indices_mat(data, 7, 2);
+    bool contains_empty_rows;
+    const auto segment_indices = GetStartIndicesOfEachDenseRow<uint64>(
+        indices_mat, &contains_empty_rows);
+    // indices_list = {0, 0, 0, 1, 2, 2, 3};
+    EXPECT_FALSE(ContainsEmptyRows(segment_indices));
+  }
+}
+
+TEST(SparseUtilsTest, FindNextDenseRowStartIndex) {
+  {
+    int32 data[] = {0, 0, 1, 0, 4, 0, 6, 0, 7, 0, 8, 0, 10, 0, 12, 0};
+    TTypes<int32>::ConstMatrix indices_mat(data, 8, 2);
+    // indices_list = {0, 1, 4, 6, 7, 8, 10, 12};
+    for (int32 i = 0; i < 8; ++i) {
+      EXPECT_EQ(i + 1, FindNextDenseRowStartIndex<int32>(i, indices_mat));
+    }
+  }
+  {
+    uint16 data[] = {0, 0, 1, 0, 4, 0, 6, 0, 7, 0, 8, 0, 10, 0, 12, 0};
+    TTypes<uint16>::ConstMatrix indices_mat(data, 8, 2);
+    // indices_list = {0, 1, 4, 6, 7, 8, 10, 12};
+    for (uint16 i = 0; i < 8; ++i) {
+      EXPECT_EQ(i + 1, FindNextDenseRowStartIndex<uint16>(i, indices_mat));
+    }
+  }
+  {
+    int64 data[] = {0, 0, 1, 0, 1, 0, 4, 0, 4, 0, 4, 0,  6, 0,  7,
+                    0, 7, 0, 7, 0, 7, 0, 8, 0, 8, 0, 10, 0, 12, 0};
+    TTypes<int64>::ConstMatrix indices_mat(data, 15, 2);
+    // indices_list = {0, 1, 1, 4, 4, 4,  6, 7, 7, 7, 7, 8, 8, 10, 12};
+    EXPECT_EQ(3, FindNextDenseRowStartIndex<int64>(static_cast<int64>(1),
+                                                   indices_mat));
+    EXPECT_EQ(3, FindNextDenseRowStartIndex<int64>(static_cast<int64>(2),
+                                                   indices_mat));
+    EXPECT_EQ(6, FindNextDenseRowStartIndex<int64>(static_cast<int64>(3),
+                                                   indices_mat));
+    EXPECT_EQ(6, FindNextDenseRowStartIndex<int64>(static_cast<int64>(4),
+                                                   indices_mat));
+    EXPECT_EQ(14, FindNextDenseRowStartIndex<int64>(static_cast<int64>(13),
+                                                    indices_mat));
+    EXPECT_EQ(15, FindNextDenseRowStartIndex<int64>(static_cast<int64>(14),
+                                                    indices_mat));
+  }
+}
+
+}  // namespace
diff --git a/tensorflow/core/kernels/sparse_xent_op.cc b/tensorflow/core/kernels/sparse_xent_op.cc
index f84ffd53238f7753c1b4562268be9058c6c03e6d..37d4d0661cadc1d86af10c8226e4aae52b4b8c7c 100644
--- a/tensorflow/core/kernels/sparse_xent_op.cc
+++ b/tensorflow/core/kernels/sparse_xent_op.cc
@@ -90,9 +90,8 @@ class SparseSoftmaxXentWithLogitsOp : public OpKernel {
             context, CheckInvalidLabelIndex<Index>(labels, logits.dim_size(1)));
       }
       functor::SparseXentFunctor<Device, T, Index> functor;
-      functor(context->eigen_device<Device>(), logits.matrix<T>(),
-              labels.vec<Index>(), scratch.vec<T>(), loss_out->vec<T>(),
-              back_out->matrix<T>());
+      functor(context, logits.matrix<T>(), labels.vec<Index>(),
+              scratch.vec<T>(), loss_out->vec<T>(), back_out->matrix<T>());
     }
   }
 };
@@ -102,11 +101,11 @@ class SparseSoftmaxXentWithLogitsOp : public OpKernel {
 namespace functor {
 template <typename T, typename Index>
 struct SparseXentFunctor<CPUDevice, T, Index> {
-  void operator()(const CPUDevice& d, typename TTypes<T>::ConstMatrix logits,
+  void operator()(OpKernelContext* ctx, typename TTypes<T>::ConstMatrix logits,
                   typename TTypes<Index>::ConstVec labels,
                   typename TTypes<T>::Vec scratch, typename TTypes<T>::Vec loss,
                   typename TTypes<T>::Matrix backprop) {
-    SparseXentEigenImpl<CPUDevice, T, Index>::Compute(d, logits, labels,
+    SparseXentEigenImpl<CPUDevice, T, Index>::Compute(ctx, logits, labels,
                                                       scratch, loss, backprop);
   }
 };
diff --git a/tensorflow/core/kernels/sparse_xent_op.h b/tensorflow/core/kernels/sparse_xent_op.h
index 6ba7931ab5f923cec2efa44fb44e2b3a91f73ebe..c94597f29709ae649fc5f0fd85b931b9555cdf60 100644
--- a/tensorflow/core/kernels/sparse_xent_op.h
+++ b/tensorflow/core/kernels/sparse_xent_op.h
@@ -18,8 +18,9 @@ limitations under the License.
 // Functor definition for SparseXentOp, must be compilable by nvcc.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -128,6 +129,26 @@ class SparseXentGradGenerator {
 
 namespace functor {
 
+template <typename Device, typename T>
+struct RowMaxReduction {
+  // Computes the maximum across the rows of logits
+  //
+  // logits: batch_size, num_classes.
+  // maximum: temporary tensor, dims: batch_size, 1
+  static inline void Compute(OpKernelContext* ctx,
+                             typename TTypes<T>::ConstMatrix logits,
+                             typename TTypes<T>::Vec maximum) {
+#if !defined(EIGEN_HAS_INDEX_LIST)
+    Eigen::array<int, 1> along_row;
+    along_row[0] = 1;
+#else
+    Eigen::IndexList<Eigen::type2index<1> > along_row;
+#endif
+    Device d = ctx->eigen_device<Device>();
+    To32Bit(maximum).device(d) = To32Bit(logits).maximum(along_row);
+  }
+};
+
 // Functor used by SparseXentOp to do the computations.
 template <typename Device, typename T, typename Index>
 struct SparseXentFunctor {
@@ -138,7 +159,7 @@ struct SparseXentFunctor {
   // scratch: temporary tensor, dims: batch_size, 1
   // loss: output tensor for the loss, dims: batch_size.
   // backprop: output tensor for the backprop, dims: batch_size, num_classes.
-  void operator()(const Device& d, typename TTypes<T>::ConstMatrix logits,
+  void operator()(OpKernelContext* ctx, typename TTypes<T>::ConstMatrix logits,
                   typename TTypes<Index>::ConstVec labels,
                   typename TTypes<T>::Vec scratch, typename TTypes<T>::Vec loss,
                   typename TTypes<T>::Matrix backprop);
@@ -149,7 +170,8 @@ struct SparseXentFunctor {
 // specializations for both device types.
 template <typename Device, typename T, typename Index>
 struct SparseXentEigenImpl {
-  static void Compute(const Device& d, typename TTypes<T>::ConstMatrix logits,
+  static void Compute(OpKernelContext* ctx,
+                      typename TTypes<T>::ConstMatrix logits,
                       typename TTypes<Index>::ConstVec labels,
                       typename TTypes<T>::Vec scratch,
                       typename TTypes<T>::Vec loss,
@@ -188,8 +210,9 @@ struct SparseXentEigenImpl {
 #endif
 
     // scratch = max_logits along classes.
-    To32Bit(scratch).device(d) = To32Bit(logits).maximum(along_class);
+    RowMaxReduction<Device, T>::Compute(ctx, logits, scratch);
 
+    Device d = ctx->eigen_device<Device>();
     // backprop = logits - max_logits.
     To32Bit(backprop).device(d) =
         To32Bit(logits) -
diff --git a/tensorflow/core/kernels/sparse_xent_op_gpu.cu.cc b/tensorflow/core/kernels/sparse_xent_op_gpu.cu.cc
index d0539660282240bd40495a5078771d1f7a1f3211..5fe15352c3e562eff0fee5dd43fb8625f4c27fa5 100644
--- a/tensorflow/core/kernels/sparse_xent_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/sparse_xent_op_gpu.cu.cc
@@ -20,22 +20,50 @@ limitations under the License.
 #include "tensorflow/core/kernels/sparse_xent_op.h"
 
 #include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/kernels/reduction_gpu_kernels.cu.h"
+#include "tensorflow/core/kernels/reduction_ops_common.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 
 typedef Eigen::GpuDevice GPUDevice;
 
+namespace functor {
+
+// Partial specialization for a GPUDevice, that uses the CUB implementation
+// from reduction_gpu_kernels.cu.h.
+template <typename T>
+struct RowMaxReduction<GPUDevice, T> {
+  // Computes the maximum across the rows of logits
+  //
+  // logits: batch_size, num_classes.
+  // maximum: temporary tensor, dims: batch_size, 1
+  static inline void Compute(OpKernelContext* ctx,
+                             typename TTypes<T>::ConstMatrix logits,
+                             typename TTypes<T>::Vec maximum) {
+    const int kBatchDim = 0;
+    const int kClassDim = 1;
+    const int rows = logits.dimension(kBatchDim);
+    const int cols = logits.dimension(kClassDim);
+
+    typedef const Eigen::array<TTypes<float>::Tensor::Index, 1>& ReductionAxes;
+    Constants<GPUDevice> constants;
+    cub::Max op;
+    functor::ReduceImpl<T, cub::Max, T*, const T*, ReductionAxes>(
+        ctx, maximum.data(), logits.data(), 2, rows, cols, 1, 1, constants.kOne,
+        op);
+  }
+};
+
 // Partial specialization for a GPUDevice, that uses the Eigen implementation
 // from XentEigenImpl.
-namespace functor {
 template <typename T, typename Index>
 struct SparseXentFunctor<GPUDevice, T, Index> {
-  void operator()(const GPUDevice& d, typename TTypes<T>::ConstMatrix logits,
+  void operator()(OpKernelContext* ctx, typename TTypes<T>::ConstMatrix logits,
                   typename TTypes<Index>::ConstVec labels,
                   typename TTypes<T>::Vec scratch, typename TTypes<T>::Vec loss,
                   typename TTypes<T>::Matrix backprop) {
-    SparseXentEigenImpl<GPUDevice, T, Index>::Compute(d, logits, labels,
+    SparseXentEigenImpl<GPUDevice, T, Index>::Compute(ctx, logits, labels,
                                                       scratch, loss, backprop);
   }
 };
diff --git a/tensorflow/core/kernels/spectrogram_test_utils.cc b/tensorflow/core/kernels/spectrogram_test_utils.cc
index 872a6e9d1bcce09765d1531c5f2898b2badc66a7..bb9d18e915a5297a3561be1f3f6f2de338855d1b 100644
--- a/tensorflow/core/kernels/spectrogram_test_utils.cc
+++ b/tensorflow/core/kernels/spectrogram_test_utils.cc
@@ -140,9 +140,9 @@ void ReadCSVFileToComplexVectorOrDie(
       for (std::vector<string>::const_iterator j = parts.begin();
            j != parts.end(); ++j) {
         if (j->find_first_of("ij") != string::npos) {
-          strings::safe_strtod((*j).c_str(), &imaginary_part);
+          strings::safe_strtod(*j, &imaginary_part);
         } else {
-          strings::safe_strtod((*j).c_str(), &real_part);
+          strings::safe_strtod(*j, &real_part);
         }
       }
       data_line.push_back(std::complex<double>(real_part, imaginary_part));
diff --git a/tensorflow/core/kernels/split_op.cc b/tensorflow/core/kernels/split_op.cc
index 11db72bfa3c66130783ad67f01c041a5d3d5085a..ed3429ff5cbfc02fd5196db154ce45a72849518c 100644
--- a/tensorflow/core/kernels/split_op.cc
+++ b/tensorflow/core/kernels/split_op.cc
@@ -18,10 +18,10 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/kernels/split_lib.h"
 #include "tensorflow/core/lib/core/status.h"
diff --git a/tensorflow/core/kernels/split_v_op.cc b/tensorflow/core/kernels/split_v_op.cc
index 5c19a45fb18abdacb5f89f623f9690b43bdfa1e5..0324ce9babc3fe73e613f1b5552c6e13d643b090 100644
--- a/tensorflow/core/kernels/split_v_op.cc
+++ b/tensorflow/core/kernels/split_v_op.cc
@@ -24,10 +24,10 @@ limitations under the License.
 #include <numeric>
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/kernels/split_lib.h"
 #include "tensorflow/core/lib/core/status.h"
diff --git a/tensorflow/core/kernels/stack.cc b/tensorflow/core/kernels/stack.cc
index 5c70a2d62d36b94362c6f10473644f2623b77d2a..033b9f34780a9fc8790d5aaa07501dd013f14750 100644
--- a/tensorflow/core/kernels/stack.cc
+++ b/tensorflow/core/kernels/stack.cc
@@ -96,7 +96,7 @@ class Stack : public ResourceBase {
 
   DataType ElemType() { return elem_type_; }
 
-  string DebugString() override {
+  string DebugString() const override {
     mutex_lock l(mu_);
     return strings::StrCat("Stack[", stack_name_, "]");
   }
@@ -244,9 +244,9 @@ void StackPushOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) {
     DeviceContext* device_ctxt = ctx->op_device_context();
     auto device = static_cast<tensorflow::Device*>(ctx->device());
     Allocator* allocator = device->GetAllocator(alloc_attrs);
-    AllocatorStats stats;
-    allocator->GetStats(&stats);
-    if (stats.bytes_in_use > (stats.bytes_limit * kOccupancy)) {
+    absl::optional<AllocatorStats> stats = allocator->GetStats();
+    if (stats && *stats->bytes_limit &&
+        stats->bytes_in_use > (*stats->bytes_limit * kOccupancy)) {
       // Asynchronously copy the tensor from GPU to CPU memory.
       // TODO(yuanbyu): Swap the oldest tensor first.
       AllocatorAttributes host_alloc_attrs;
diff --git a/tensorflow/core/kernels/stage_op.cc b/tensorflow/core/kernels/stage_op.cc
index c91bdc43cf4636481f141df70f30b1f2d74dc1a2..65174e163c1031d3e480159824f984e4bf83980b 100644
--- a/tensorflow/core/kernels/stage_op.cc
+++ b/tensorflow/core/kernels/stage_op.cc
@@ -132,7 +132,7 @@ class Buffer : public ResourceBase {
     notify_inserters_if_bounded(&lock);
   }
 
-  string DebugString() override {
+  string DebugString() const override {
     std::unique_lock<std::mutex> lock(mu_);
     return strings::StrCat("Staging size: ", buf_.size());
   }
@@ -170,7 +170,7 @@ class Buffer : public ResourceBase {
   std::size_t capacity_;
   std::size_t memory_limit_;
   std::size_t current_bytes_;
-  std::mutex mu_;
+  mutable std::mutex mu_;
   std::condition_variable non_empty_cond_var_;
   std::condition_variable full_cond_var_;
   std::deque<Tuple> buf_;
diff --git a/tensorflow/core/kernels/stateful_random_ops.cc b/tensorflow/core/kernels/stateful_random_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..62c4ee3bd69fd7e87dc501c1e8c397f73a31b92c
--- /dev/null
+++ b/tensorflow/core/kernels/stateful_random_ops.cc
@@ -0,0 +1,187 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/kernels/stateful_random_ops.h"
+#include "tensorflow/core/kernels/random_op.h"
+#include "tensorflow/core/kernels/training_op_helpers.h"
+
+namespace tensorflow {
+
+template <typename Distribution>
+struct UpdateVariableAndFill_Philox<CPUDevice, Distribution> {
+  void operator()(OpKernelContext* ctx, const CPUDevice& device,
+                  int64 output_size, int64 alg_tag_skip,
+                  ScopedUnlockUnref* state_var_guard, Tensor* state_tensor,
+                  typename Distribution::ResultElementType* output_data) {
+    auto state_tensor_flat = state_tensor->flat<StateElementType>();
+    auto state_data = state_tensor_flat.data();
+    // Delegates to PhiloxRandom to do the actual increasing.
+    auto philox = GetPhiloxRandomFromMem(state_data + alg_tag_skip);
+    UpdateMemWithPhiloxRandom(philox, output_size, state_data + alg_tag_skip);
+    // No longer needs the lock.
+    state_var_guard->Release();
+    functor::FillPhiloxRandom<CPUDevice, Distribution>()(
+        ctx, device, philox, output_data, output_size, Distribution());
+  }
+};
+
+template <typename Device, typename Distribution>
+Status UpdateVariableAndFill(
+    OpKernelContext* ctx, int state_input_idx, bool read_alg_from_state,
+    Algorithm alg, int64 output_size,
+    typename Distribution::ResultElementType* output_data) {
+  Var* var = nullptr;
+  TF_RETURN_IF_ERROR(
+      LookupResource(ctx, HandleFromInput(ctx, state_input_idx), &var));
+  // Use `ScopedUnlockUnref` here instead of `mutex_lock` and `ScopedUnref`
+  // because the former supports early releasing which is needed by
+  // `UpdateVariableAndFill_Philox<CPU>` to avoid holding the lock while
+  // filling.
+  ScopedUnlockUnref state_var_guard(var);
+  Tensor* var_tensor = var->tensor();
+  if (var_tensor->dtype() != STATE_ELEMENT_DTYPE) {
+    return errors::InvalidArgument("dtype of RNG state variable must be ",
+                                   DataTypeString(STATE_ELEMENT_DTYPE),
+                                   ", not ",
+                                   DataTypeString(var_tensor->dtype()));
+  }
+  if (var_tensor->dims() != 1) {
+    return errors::InvalidArgument(
+        "RNG state must have one and only one dimension, not ",
+        var_tensor->dims());
+  }
+  auto var_tensor_flat = var_tensor->flat<StateElementType>();
+  int64 alg_tag_skip = 0;
+  if (read_alg_from_state) {
+    alg_tag_skip = 1;
+    if (var_tensor_flat.size() < 1) {
+      return errors::InvalidArgument("Size of tensor must be at least 1");
+    }
+    alg = var_tensor_flat(0);
+  }
+  if (alg == RNG_ALG_PHILOX) {
+    static_assert(std::is_same<StateElementType, int64>::value,
+                  "StateElementType must be int64");
+    static_assert(std::is_same<PhiloxRandom::ResultElementType, uint32>::value,
+                  "PhiloxRandom::ResultElementType must be uint32");
+    if (var_tensor_flat.size() < alg_tag_skip + PHILOX_MIN_STATE_SIZE) {
+      return errors::InvalidArgument(
+          "For Philox algorithm, the size of state"
+          " must be at least ",
+          alg_tag_skip + PHILOX_MIN_STATE_SIZE, "; got ",
+          var_tensor_flat.size());
+    }
+    TF_RETURN_IF_ERROR(PrepareToUpdateVariable<Device, StateElementType>(
+        ctx, var_tensor, var->copy_on_read_mode.load()));
+    UpdateVariableAndFill_Philox<Device, Distribution>()(
+        ctx, ctx->eigen_device<Device>(), output_size, alg_tag_skip,
+        &state_var_guard, var_tensor, output_data);
+    return Status::OK();
+  } else {
+    return errors::InvalidArgument("Unsupported algorithm id: ", alg);
+  }
+}
+
+// Preconditon: input(0) is an existing resource.
+template <typename Device, class Distribution>
+void ComputeImpl(OpKernelContext* ctx, int state_input_idx, int shape_input_idx,
+                 bool read_alg_from_state, Algorithm alg) {
+  using T = typename Distribution::ResultElementType;
+  const Tensor& shape_t = ctx->input(shape_input_idx);
+  TensorShape shape;
+  OP_REQUIRES_OK(ctx, ctx->op_kernel().MakeShape(shape_t, &shape));
+  Tensor* output;
+  OP_REQUIRES_OK(ctx, ctx->allocate_output(0, shape, &output));
+  auto output_flat = output->flat<T>();
+  OP_REQUIRES_OK(ctx, UpdateVariableAndFill<Device, Distribution>(
+                          ctx, state_input_idx, read_alg_from_state, alg,
+                          output_flat.size(), output_flat.data()));
+}
+
+template <typename Device, class Distribution>
+class StatefulRandomOp : public OpKernel {
+ public:
+  explicit StatefulRandomOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    ComputeImpl<Device, Distribution>(ctx, 0, 1, true, 0);
+  }
+};
+
+template <typename Device, class Distribution>
+class StatefulRandomOpV2 : public OpKernel {
+ public:
+  explicit StatefulRandomOpV2(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& alg_tensor = ctx->input(1);
+    OP_REQUIRES(ctx, alg_tensor.dims() == 0,
+                errors::InvalidArgument("algorithm must be of shape [], not ",
+                                        alg_tensor.shape().DebugString()));
+    auto alg = alg_tensor.flat<Algorithm>()(0);
+    ComputeImpl<Device, Distribution>(ctx, 0, 2, false, alg);
+  }
+};
+
+// So far the 'Distribution' type parameter is only used when the algorithm is
+// philox, so 'NormalDistribution<PhiloxRandom, ...>' is fine for now.
+#define REGISTER(DEVICE, TYPE)            \
+  REGISTER_KERNEL_BUILDER(                \
+      Name("StatefulStandardNormalV2")    \
+          .Device(DEVICE_##DEVICE)        \
+          .HostMemory("resource")         \
+          .HostMemory("algorithm")        \
+          .HostMemory("shape")            \
+          .TypeConstraint<TYPE>("dtype"), \
+      StatefulRandomOpV2<DEVICE##Device,  \
+                         random::NormalDistribution<PhiloxRandom, TYPE> >);
+
+// CPU also has the old 'StatefulStandardNormal' op for backward compatibility.
+#define REGISTER_CPU(TYPE)                \
+  REGISTER(CPU, TYPE)                     \
+  REGISTER_KERNEL_BUILDER(                \
+      Name("StatefulStandardNormal")      \
+          .Device(DEVICE_CPU)             \
+          .HostMemory("resource")         \
+          .HostMemory("shape")            \
+          .TypeConstraint<TYPE>("dtype"), \
+      StatefulRandomOp<CPUDevice,         \
+                       random::NormalDistribution<PhiloxRandom, TYPE> >);
+
+#define REGISTER_GPU(TYPE) REGISTER(GPU, TYPE)
+
+TF_CALL_half(REGISTER_CPU);
+TF_CALL_bfloat16(REGISTER_CPU);
+TF_CALL_float(REGISTER_CPU);
+TF_CALL_double(REGISTER_CPU);
+
+#if GOOGLE_CUDA
+
+TF_CALL_half(REGISTER_GPU);
+TF_CALL_float(REGISTER_GPU);
+TF_CALL_double(REGISTER_GPU);
+
+#endif  // GOOGLE_CUDA
+
+#undef REGISTER_GPU
+#undef REGISTER_CPU
+#undef REGISTER
+
+// TODO(wangpeng): Add RNG ops for other distributions.
+// TODO(wangpeng): Add support for XLA.
+
+}  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/stateful_random_ops.h b/tensorflow/core/kernels/stateful_random_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..689c2be6647a575c7a3a25d8eae834c31019a695
--- /dev/null
+++ b/tensorflow/core/kernels/stateful_random_ops.h
@@ -0,0 +1,145 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_STATEFUL_RANDOM_OPS_H_
+#define TENSORFLOW_CORE_KERNELS_STATEFUL_RANDOM_OPS_H_
+
+#include "tensorflow/core/framework/resource_var.h"
+#include "tensorflow/core/lib/random/philox_random.h"
+
+namespace tensorflow {
+
+// 'Variable' doesn't support uint32 or uint64 yet (due to reasons explained
+// in b/111604096 and cl/171681867), so I use signed int here. I choose int64
+// instead of int32 because `VarHandleOp` doesn't support int32 on GPU.
+using StateElementType = int64;
+static constexpr DataType STATE_ELEMENT_DTYPE = DT_INT64;
+
+using Algorithm = StateElementType;
+static constexpr Algorithm RNG_ALG_PHILOX = 1;
+
+using random::PhiloxRandom;
+
+static constexpr int64 PHILOX_MIN_STATE_SIZE =
+    (PhiloxRandom::ResultType::kElementCount +
+     PhiloxRandom::Key::kElementCount) /
+    2;
+
+// The following 5 functions are made templates to avoid duplicate symbols when
+// linking.
+
+// The following two functions use the contract "lower 32 bits for the first
+// uint32, higher 32 bits for the second". Note that this is endian-neutral,
+// unlike a direct memory copy `memcpy(output, &input, 8)`.
+template <typename INT64>
+PHILOX_DEVICE_FUNC void Int64ToUint32s(INT64 input, uint32* output1,
+                                       uint32* output2) {
+  auto u64 = static_cast<uint64>(input);
+  *output1 = static_cast<uint32>(u64);
+  *output2 = static_cast<uint32>(u64 >> 32);
+}
+
+template <typename UINT32>
+PHILOX_DEVICE_FUNC int64 Uint32sToInt64(UINT32 input1, UINT32 input2) {
+  auto u64_1 = static_cast<uint64>(input1);
+  auto u64_2 = static_cast<uint64>(input2);
+  return static_cast<int64>(u64_1 | (u64_2 << 32));
+}
+
+template <typename STATE_ELEMENT_TYPE>
+PHILOX_DEVICE_FUNC PhiloxRandom
+GetPhiloxRandomFromMem(STATE_ELEMENT_TYPE const* ptr) {
+  PhiloxRandom::ResultType counter;
+  PhiloxRandom::Key key;
+  Int64ToUint32s(ptr[0], &counter[0], &counter[1]);
+  Int64ToUint32s(ptr[1], &counter[2], &counter[3]);
+  Int64ToUint32s(ptr[2], &key[0], &key[1]);
+  return PhiloxRandom(counter, key);
+}
+
+template <typename PHILOX_RANDOM>
+PHILOX_DEVICE_FUNC void WritePhiloxRandomToMem(PHILOX_RANDOM const& philox,
+                                               StateElementType* ptr) {
+  PhiloxRandom::ResultType const& counter = philox.counter();
+  PhiloxRandom::Key const& key = philox.key();
+  ptr[0] = Uint32sToInt64(counter[0], counter[1]);
+  ptr[1] = Uint32sToInt64(counter[2], counter[3]);
+  ptr[2] = Uint32sToInt64(key[0], key[1]);
+}
+
+template <typename PHILOX_RANDOM>
+PHILOX_DEVICE_FUNC void UpdateMemWithPhiloxRandom(PHILOX_RANDOM const& philox,
+                                                  int64 output_size,
+                                                  StateElementType* ptr) {
+  auto new_philox = philox;
+  // Multiplier 256 is the same as in `FillPhiloxRandomTask`; do not change
+  // it just here.
+  auto delta = output_size * 256;
+  new_philox.Skip(delta);  // do the actual increasing
+  WritePhiloxRandomToMem(new_philox, ptr);
+}
+
+// Does unlock and unref automatically when going out of scope, and also
+// supports early manual release.
+class ScopedUnlockUnref {
+ public:
+  explicit ScopedUnlockUnref(Var* var) : var_(var) {
+    if (var_) {
+      var_->mu()->lock();
+    }
+  }
+  void Release() {
+    if (var_) {
+      var_->mu()->unlock();
+      var_->Unref();
+      var_ = nullptr;
+    }
+  }
+  ~ScopedUnlockUnref() { Release(); }
+
+ private:
+  Var* var_;
+
+  ScopedUnlockUnref(const ScopedUnlockUnref&) = delete;
+  void operator=(const ScopedUnlockUnref&) = delete;
+};
+
+// A per-device helper function that does the actual work for
+// `UpdateVariableAndFill`.
+// Reason to use functor: C++ doesn't allow function-template partial
+// specialization.
+template <typename Device, typename Distribution>
+struct UpdateVariableAndFill_Philox;
+
+using CPUDevice = Eigen::ThreadPoolDevice;
+
+#if GOOGLE_CUDA
+
+using GPUDevice = Eigen::GpuDevice;
+
+// Declares the partially GPU-specialized functor struct.
+template <typename Distribution>
+struct UpdateVariableAndFill_Philox<GPUDevice, Distribution> {
+  void operator()(OpKernelContext* ctx, const GPUDevice& device,
+                  int64 output_size, int64 alg_tag_skip,
+                  ScopedUnlockUnref* not_used, Tensor* state_tensor,
+                  typename Distribution::ResultElementType* output_data);
+};
+
+#endif  // GOOGLE_CUDA
+
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_STATEFUL_RANDOM_OPS_H_
diff --git a/tensorflow/core/kernels/stateful_random_ops_gpu.cu.cc b/tensorflow/core/kernels/stateful_random_ops_gpu.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4ce0db7c56c896b1ebe7f384245e1d07d1754831
--- /dev/null
+++ b/tensorflow/core/kernels/stateful_random_ops_gpu.cu.cc
@@ -0,0 +1,96 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/random_op_gpu.h"
+#include "tensorflow/core/kernels/stateful_random_ops.h"
+#include "tensorflow/core/util/cuda_launch_config.h"
+
+namespace tensorflow {
+
+using random::PhiloxRandom;
+
+__device__ int thread_counter;
+
+template <typename Distribution>
+__global__ void FillKernel(
+    Distribution dist, int64 state_size, int64 output_size,
+    StateElementType* state_data,
+    typename Distribution::ResultElementType* output_data) {
+  // Threads in this block share `philox`. Thread 0 is responsible for
+  // initializing it.
+  __shared__ char philox_raw[sizeof(PhiloxRandom)];
+  auto philox = reinterpret_cast<PhiloxRandom*>(philox_raw);
+  if (threadIdx.x == 0) {
+    *philox = GetPhiloxRandomFromMem(state_data);
+  }
+  __syncthreads();
+  functor::FillPhiloxRandomKernel<Distribution,
+                                  Distribution::kVariableSamplesPerOutput>()
+      .Run(*philox, output_data, output_size, dist);
+  // The last thread updates the state.
+  auto total_thread_count = gridDim.x * blockDim.x;
+  auto old_counter_value = atomicAdd(&thread_counter, 1);
+  if (old_counter_value == total_thread_count - 1) {
+    UpdateMemWithPhiloxRandom(*philox, output_size, state_data);
+  }
+}
+
+template <typename Distribution>
+void UpdateVariableAndFill_Philox<GPUDevice, Distribution>::operator()(
+    OpKernelContext* ctx, const GPUDevice& d, int64 output_size,
+    int64 alg_tag_skip, ScopedUnlockUnref* not_used, Tensor* state_tensor,
+    typename Distribution::ResultElementType* output_data) {
+  OP_REQUIRES(
+      ctx, alg_tag_skip == 0,
+      errors::InvalidArgument(
+          "GPU kernel doesn't support reading algorithm from state variable, "
+          "so alg_tag_skip must be 0; got",
+          alg_tag_skip));
+  auto state_tensor_flat = state_tensor->flat<StateElementType>();
+  auto state_size = state_tensor_flat.size();
+  auto state_data = state_tensor_flat.data();
+
+  // maximize occupancy
+  const int kGroupSize = Distribution::kResultElementCount;
+  int work_element_count = (output_size + kGroupSize - 1) / kGroupSize;
+  CudaLaunchConfig cfg = GetCudaLaunchConfig(work_element_count, d,
+                                             FillKernel<Distribution>, 0, 0);
+
+  int zero = 0;
+  cudaMemcpyToSymbol(thread_counter, &zero, sizeof(int));
+  FillKernel<Distribution>
+      <<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(
+          Distribution(), state_size, output_size, state_data, output_data);
+}
+
+// Explicit instantiation of the GPU distributions functors.
+
+// clang-format off
+// NVCC cannot handle ">>" properly
+template struct UpdateVariableAndFill_Philox<
+    GPUDevice, random::NormalDistribution<random::PhiloxRandom, Eigen::half> >;
+template struct UpdateVariableAndFill_Philox<
+    GPUDevice, random::NormalDistribution<random::PhiloxRandom, float> >;
+template struct UpdateVariableAndFill_Philox<
+    GPUDevice, random::NormalDistribution<random::PhiloxRandom, double> >;
+// clang-format on
+
+}  // end namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/stateless_random_ops.cc b/tensorflow/core/kernels/stateless_random_ops.cc
index 925f5291a68327c9fd939fd06fc025b58ab436ee..959334abc81d70bc854d2026d9eba99a2a01850d 100644
--- a/tensorflow/core/kernels/stateless_random_ops.cc
+++ b/tensorflow/core/kernels/stateless_random_ops.cc
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/random_op.h"
 #include "tensorflow/core/lib/random/random_distributions.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/core/kernels/strided_slice_op.cc b/tensorflow/core/kernels/strided_slice_op.cc
index 70a7ddbd0643e88655e1c0e1ad197316078267de..20bf42ccaa2ec838779c78321d022d6722826bb0 100644
--- a/tensorflow/core/kernels/strided_slice_op.cc
+++ b/tensorflow/core/kernels/strided_slice_op.cc
@@ -27,10 +27,10 @@ limitations under the License.
 #include "tensorflow/core/kernels/strided_slice_op_impl.h"
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/kernels/training_op_helpers.h"
 #include "tensorflow/core/kernels/variable_ops.h"
@@ -307,9 +307,9 @@ class StridedSliceAssignOp : public OpKernel {
       OP_REQUIRES_OK(context,
                      LookupResource(context, HandleFromInput(context, 0), &v));
       core::ScopedUnref scoped_unref(v);
-      mutex_lock ml(*v->mu());
       OP_REQUIRES_OK(context,
-                     PrepareToUpdateVariable<Device, T>(context, v->tensor()));
+                     EnsureSparseVariableAccess<Device, T>(context, v));
+      mutex_lock ml(*v->mu());
       old_lhs = v->tensor();
       OP_REQUIRES(context, old_lhs->dtype() == DataTypeToEnum<T>::value,
                   errors::InvalidArgument(
diff --git a/tensorflow/core/kernels/bounds_check.h b/tensorflow/core/kernels/strided_slice_op_gpu_bool.cu.cc
similarity index 71%
rename from tensorflow/core/kernels/bounds_check.h
rename to tensorflow/core/kernels/strided_slice_op_gpu_bool.cu.cc
index ce6ec1012daacf915fee0ee7bb059306058361d5..8c3f8f2ad30a56fb4c03105a20d0a7ebc692ec25 100644
--- a/tensorflow/core/kernels/bounds_check.h
+++ b/tensorflow/core/kernels/strided_slice_op_gpu_bool.cu.cc
@@ -13,9 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CORE_KERNELS_BOUNDS_CHECK_H_
-#define TENSORFLOW_CORE_KERNELS_BOUNDS_CHECK_H_
+#if GOOGLE_CUDA
 
-#include "tensorflow/core/framework/bounds_check.h"
+#define EIGEN_USE_GPU
 
-#endif  // TENSORFLOW_CORE_KERNELS_BOUNDS_CHECK_H_
+#include "tensorflow/core/kernels/strided_slice_op.h"
+#include "tensorflow/core/kernels/strided_slice_op_gpu_impl.h"
+
+namespace tensorflow {
+TF_CALL_bool(DEFINE_GPU_KERNELS);
+}  // end namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/strided_slice_op_gpu_complex.cu.cc b/tensorflow/core/kernels/strided_slice_op_gpu_complex.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f6951924655a8fcd2b3c400b6e1b76f2d8e49270
--- /dev/null
+++ b/tensorflow/core/kernels/strided_slice_op_gpu_complex.cu.cc
@@ -0,0 +1,28 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/strided_slice_op.h"
+#include "tensorflow/core/kernels/strided_slice_op_gpu_impl.h"
+
+namespace tensorflow {
+TF_CALL_complex64(DEFINE_GPU_KERNELS);
+TF_CALL_complex128(DEFINE_GPU_KERNELS);
+}  // end namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/strided_slice_op_gpu.cu.cc b/tensorflow/core/kernels/strided_slice_op_gpu_impl.h
similarity index 90%
rename from tensorflow/core/kernels/strided_slice_op_gpu.cu.cc
rename to tensorflow/core/kernels/strided_slice_op_gpu_impl.h
index cce1d2fddde7edc0283c524269de9464c2602e25..d70f369ac07a3c605ca90c5ba1e6198525dc1206 100644
--- a/tensorflow/core/kernels/strided_slice_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/strided_slice_op_gpu_impl.h
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#ifndef TENSORFLOW_CORE_KERNELS_STRIDED_SLICE_OP_GPU_IMPL_H_
+#define TENSORFLOW_CORE_KERNELS_STRIDED_SLICE_OP_GPU_IMPL_H_
+
 #if GOOGLE_CUDA
 
 #define EIGEN_USE_GPU
@@ -50,16 +53,8 @@ typedef Eigen::GpuDevice GPUDevice;
   template struct functor::StridedSliceAssign<GPUDevice, T, 6>; \
   template struct functor::StridedSliceAssign<GPUDevice, T, 7>; \
   template struct functor::StridedSliceAssignScalar<GPUDevice, T>;
-TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
-TF_CALL_complex64(DEFINE_GPU_KERNELS);
-TF_CALL_complex128(DEFINE_GPU_KERNELS);
-TF_CALL_int64(DEFINE_GPU_KERNELS);
-TF_CALL_bool(DEFINE_GPU_KERNELS);
-TF_CALL_int8(DEFINE_GPU_KERNELS);
-DEFINE_GPU_KERNELS(int32);
-
-#undef DEFINE_GPU_KERNELS
 
 }  // end namespace tensorflow
 
 #endif  // GOOGLE_CUDA
+#endif  // TENSORFLOW_CORE_KERNELS_STRIDED_SLICE_OP_GPU_IMPL_H_
diff --git a/tensorflow/core/kernels/strided_slice_op_gpu_int.cu.cc b/tensorflow/core/kernels/strided_slice_op_gpu_int.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..07dd0130adc73512df10bf2e95ce580794262c68
--- /dev/null
+++ b/tensorflow/core/kernels/strided_slice_op_gpu_int.cu.cc
@@ -0,0 +1,29 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/strided_slice_op.h"
+#include "tensorflow/core/kernels/strided_slice_op_gpu_impl.h"
+
+namespace tensorflow {
+TF_CALL_int8(DEFINE_GPU_KERNELS);
+TF_CALL_int32(DEFINE_GPU_KERNELS);
+TF_CALL_int64(DEFINE_GPU_KERNELS);
+}  // end namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/strided_slice_op_gpu_number_types.cu.cc b/tensorflow/core/kernels/strided_slice_op_gpu_number_types.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..149886308cdf4ec8e9e9187db349e51c57e408b8
--- /dev/null
+++ b/tensorflow/core/kernels/strided_slice_op_gpu_number_types.cu.cc
@@ -0,0 +1,27 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/strided_slice_op.h"
+#include "tensorflow/core/kernels/strided_slice_op_gpu_impl.h"
+
+namespace tensorflow {
+TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);
+}  // end namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/strided_slice_op_impl.h b/tensorflow/core/kernels/strided_slice_op_impl.h
index c4205159c380cb0a78085f87deb760bd4a8c9791..d9b62d4c75486d61f28c0cd9bc3b44206a0689a4 100644
--- a/tensorflow/core/kernels/strided_slice_op_impl.h
+++ b/tensorflow/core/kernels/strided_slice_op_impl.h
@@ -22,13 +22,13 @@ limitations under the License.
 #include "tensorflow/core/kernels/strided_slice_op.h"
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/register_types_traits.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/variant.h"
 #include "tensorflow/core/framework/variant_encode_decode.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/dense_update_functor.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status.h"
diff --git a/tensorflow/core/kernels/string_to_number_op.cc b/tensorflow/core/kernels/string_to_number_op.cc
index 70dbd15c46cb341d8ad6ed6013b5b9ff8a5d61da..22742dd38e5d56bf3b9970bf6b01ff734f181169 100644
--- a/tensorflow/core/kernels/string_to_number_op.cc
+++ b/tensorflow/core/kernels/string_to_number_op.cc
@@ -51,7 +51,7 @@ class StringToNumberOp : public OpKernel {
     for (int i = 0; i < input_flat.size(); ++i) {
       OP_REQUIRES(
           context,
-          strings::SafeStringToNumeric<OutputType>(input_flat(i).c_str(),
+          strings::SafeStringToNumeric<OutputType>(input_flat(i),
                                                    &output_flat(i)),
           errors::InvalidArgument(kErrorMessage, input_flat(i).c_str()));
     }
diff --git a/tensorflow/core/platform/cuda_libdevice_path.cc b/tensorflow/core/kernels/string_view_variant_wrapper.cc
similarity index 71%
rename from tensorflow/core/platform/cuda_libdevice_path.cc
rename to tensorflow/core/kernels/string_view_variant_wrapper.cc
index 4d6532b983d52e7882ab540da31fb0b57183eb6f..b576eb4a3e63863d666bd325d0276039727e38c5 100644
--- a/tensorflow/core/platform/cuda_libdevice_path.cc
+++ b/tensorflow/core/kernels/string_view_variant_wrapper.cc
@@ -1,4 +1,4 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,14 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/platform/cuda_libdevice_path.h"
-
-#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/kernels/string_view_variant_wrapper.h"
 
 namespace tensorflow {
 
-string LibdeviceRoot() {
-  return tensorflow::io::JoinPath(tensorflow::CudaRoot(), "nvvm/libdevice");
-}
+constexpr const char StringViewVariantWrapper::kTypeName[];
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/string_view_variant_wrapper.h b/tensorflow/core/kernels/string_view_variant_wrapper.h
new file mode 100644
index 0000000000000000000000000000000000000000..dc4a8e953489500d1967a6899ae9a003edacf0f9
--- /dev/null
+++ b/tensorflow/core/kernels/string_view_variant_wrapper.h
@@ -0,0 +1,69 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_STRING_VIEW_VARIANT_WRAPPER_H_
+#define TENSORFLOW_CORE_KERNELS_STRING_VIEW_VARIANT_WRAPPER_H_
+
+#include <string>
+
+#include "absl/strings/string_view.h"
+#include "tensorflow/core/framework/variant_tensor_data.h"
+
+namespace tensorflow {
+
+// A wrapper class for storing an `absl::string_view` instance in a DT_VARIANT
+// tensor.
+class StringViewVariantWrapper {
+ public:
+  static constexpr const char kTypeName[] =
+      "tensorflow::StringViewVariantWrapper";
+
+  using value_type = absl::string_view;
+
+  StringViewVariantWrapper() = default;
+
+  explicit StringViewVariantWrapper(absl::string_view str_view)
+      : str_view_(str_view) {}
+
+  StringViewVariantWrapper(const StringViewVariantWrapper& other)
+      : str_view_(other.str_view_) {}
+
+  const absl::string_view* get() const { return &str_view_; }
+
+  static string TypeName() { return kTypeName; }
+
+  string DebugString() const { return string(str_view_); }
+
+  void Encode(VariantTensorData* data) const {
+    data->add_tensor(string(str_view_));
+  }
+
+  // Decode assumes that the source VariantTensorData will have a longer
+  // lifetime than this StringViewVariantWrapper.
+  bool Decode(const VariantTensorData& data) {
+    if (data.tensors_size() != 1 || data.tensors(0).dtype() != DT_STRING) {
+      return false;
+    }
+    str_view_ = data.tensors(0).scalar<string>()();
+    return true;
+  }
+
+ private:
+  absl::string_view str_view_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_STRING_VIEW_VARIANT_WRAPPER_H_
diff --git a/tensorflow/core/kernels/substr_op.cc b/tensorflow/core/kernels/substr_op.cc
index 93c427039dd6e0a7984ee58e51479fdff48937bb..77b16b9384de1bfe8956ff7aa89e2bd8fda35d86 100644
--- a/tensorflow/core/kernels/substr_op.cc
+++ b/tensorflow/core/kernels/substr_op.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <string>
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -25,7 +26,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/string_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
diff --git a/tensorflow/core/kernels/summary_image_op.cc b/tensorflow/core/kernels/summary_image_op.cc
index 29b21ee7353fe03ce87bc03dad72b05ca8fd4311..68f17c2e78d53ade46dead0bf040967cd2957bb1 100644
--- a/tensorflow/core/kernels/summary_image_op.cc
+++ b/tensorflow/core/kernels/summary_image_op.cc
@@ -78,6 +78,11 @@ class SummaryImageOp : public OpKernel {
     const int hw = h * w;  // Compact these two dims for simplicity
     const int depth = static_cast<int>(tensor.dim_size(3));
 
+    OP_REQUIRES(c, hw > 0 && depth > 0,
+                errors::InvalidArgument(
+                    "input tensor must have non-zero dims. Found: [",
+                    batch_size, ", ", h, ", ", w, ", ", depth, "]."));
+
     Summary s;
     if (tensor.dtype() == DT_UINT8) {
       // For uint8 input, no normalization is necessary
diff --git a/tensorflow/core/kernels/summary_kernels.cc b/tensorflow/core/kernels/summary_kernels.cc
index b287f0cc2f1337cff5705b5a40ba455b837307f9..d33c0cdb7f01a4d11204d20fd020941d544c45ee 100644
--- a/tensorflow/core/kernels/summary_kernels.cc
+++ b/tensorflow/core/kernels/summary_kernels.cc
@@ -13,14 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tensorboard/db/schema.h"
-#include "tensorflow/contrib/tensorboard/db/summary_db_writer.h"
-#include "tensorflow/contrib/tensorboard/db/summary_file_writer.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/lib/db/sqlite.h"
 #include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/summary/schema.h"
+#include "tensorflow/core/summary/summary_db_writer.h"
+#include "tensorflow/core/summary/summary_file_writer.h"
 #include "tensorflow/core/util/event.pb.h"
 
 namespace tensorflow {
@@ -53,6 +53,7 @@ class CreateSummaryFileWriterOp : public OpKernel {
                                   max_queue, flush_millis, logdir,
                                   filename_suffix, ctx->env(), s);
                             }));
+    core::ScopedUnref unref(s);
   }
 };
 REGISTER_KERNEL_BUILDER(Name("CreateSummaryFileWriter").Device(DEVICE_CPU),
@@ -89,6 +90,7 @@ class CreateSummaryDbWriterOp : public OpKernel {
                   db, experiment_name, run_name, user_name, ctx->env(), s));
               return Status::OK();
             }));
+    core::ScopedUnref unref(s);
   }
 };
 REGISTER_KERNEL_BUILDER(Name("CreateSummaryDbWriter").Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/summary_op.cc b/tensorflow/core/kernels/summary_op.cc
index 1f4e3418f4826dee789002d4aa688f8ce14e17d2..1053aa7d53ad5f831f8127036d8156cdde772b70 100644
--- a/tensorflow/core/kernels/summary_op.cc
+++ b/tensorflow/core/kernels/summary_op.cc
@@ -124,7 +124,9 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER)
 struct HistogramResource : public ResourceBase {
   histogram::ThreadSafeHistogram histogram;
 
-  string DebugString() override { return "A histogram summary. Stats ..."; }
+  string DebugString() const override {
+    return "A histogram summary. Stats ...";
+  }
 };
 
 class SummaryMergeOp : public OpKernel {
diff --git a/tensorflow/core/kernels/svd_op_gpu.cu.cc b/tensorflow/core/kernels/svd_op_gpu.cu.cc
index 8c3a58b108abe66f2b61b5153923bee192246cd1..9e308cfc0237aeb64754c81595e17ff6a06c16a5 100644
--- a/tensorflow/core/kernels/svd_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/svd_op_gpu.cu.cc
@@ -93,9 +93,48 @@ class SvdOpGpu : public AsyncOpKernel {
   }
 
   void RunSVD(OpKernelContext* context, DoneCallback done, int64 m, int64 n,
-              int64 p, int64 batch_size, Scalar* input_ptr,
-              RealScalar* outputS_ptr, Scalar* outputU_ptr,
-              Scalar* outputVT_ptr, int* dev_info_ptr, CudaSolver* solver) {
+              int64 p, Tensor& M_copy, Tensor* S, Tensor* U, Tensor* V,
+              std::unique_ptr<CudaSolver> solver) {
+    // Compute U S V* = M.
+    // 1. cuSolver works in column-major rather than row-major.
+    // 2. Gesvd returns V*.
+    // 3. Hence M should be transposed before input and U (rather than V) should
+    // be transposed on output.
+
+    Tensor u_copy;
+    if (compute_uv_) {
+      TensorShape u_shape;
+      if (full_matrices_) {
+        u_shape = U->shape();
+      } else {
+        TensorShape shapeRaw = M_copy.shape();
+        shapeRaw.RemoveLastDims(2);
+        u_shape = shapeRaw;
+        u_shape.AddDim(p);
+        u_shape.AddDim(m);
+      }
+      OP_REQUIRES_OK_ASYNC(
+          context, solver->allocate_scoped_tensor(U->dtype(), u_shape, &u_copy),
+          done);
+    }
+
+    // get the pointers to the data
+    Scalar* input_ptr;
+    RealScalar* outputS_ptr;
+    Scalar* outputU_ptr = NULL;
+    Scalar* outputV_ptr = NULL;
+    auto input_reshaped = M_copy.template flat_inner_dims<Scalar, 3>();
+    input_ptr = input_reshaped.data();
+    outputS_ptr = S->template flat_inner_dims<RealScalar, 2>().data();
+    if (compute_uv_) {
+      outputU_ptr = u_copy.template flat_inner_dims<Scalar, 3>().data();
+      outputV_ptr = V->template flat_inner_dims<Scalar, 3>().data();
+    }
+    const int64 batch_size = input_reshaped.dimension(0);
+    std::vector<DeviceLapackInfo> dev_info;
+    dev_info.push_back(solver->GetDeviceLapackInfo(batch_size, "gesvd"));
+    int* dev_info_ptr = dev_info.back().mutable_data();
+
     // Save the input matrix
     // Needed for the n=1 fix, see below, since SVD destroys the input
     Tensor input_copy;
@@ -121,12 +160,12 @@ class SvdOpGpu : public AsyncOpKernel {
       if (compute_uv_) {
         if (full_matrices_) {
           outputU = outputU_ptr + batch * m * m;
-          outputVT = outputVT_ptr + batch * n * n;
+          outputVT = outputV_ptr + batch * n * n;
           jobu = 'A';
           jobvt = 'A';
         } else {
           outputU = outputU_ptr + batch * m * p;
-          outputVT = outputVT_ptr + batch * n * p;
+          outputVT = outputV_ptr + batch * n * p;
           jobu = 'S';
           jobvt = 'S';
         }
@@ -155,17 +194,24 @@ class SvdOpGpu : public AsyncOpKernel {
     if (compute_uv_ && n == 1) {
       // 1. compute the (batched) sum
       const GPUDevice& d = context->eigen_device<GPUDevice>();
-      d.memset(outputVT_ptr, 0, batch_size * sizeof(Scalar));
+      d.memset(outputV_ptr, 0, batch_size * sizeof(Scalar));
       Cuda2DLaunchConfig cfg2D = GetCuda2DLaunchConfig(batch_size, m, d);
       ComputeValueOfVKernel<<<cfg2D.block_count, cfg2D.thread_per_block, 0,
                               d.stream()>>>(
           cfg2D, m, full_matrices_ ? m : p, input_copy.flat<Scalar>().data(),
-          outputU_ptr, outputS_ptr, outputVT_ptr);
+          outputU_ptr, outputS_ptr, outputV_ptr);
       // 2. clamp V to -1 or +1
       CudaLaunchConfig cfg1D = GetCudaLaunchConfig(batch_size, d);
       ExtractSignOfVKernel<<<cfg1D.block_count, cfg1D.thread_per_block, 0,
-                             d.stream()>>>(cfg1D, outputVT_ptr);
+                             d.stream()>>>(cfg1D, outputV_ptr);
     }
+
+    if (compute_uv_) {
+      auto device = context->eigen_device<GPUDevice>();
+      OP_REQUIRES_OK_ASYNC(context, DoMatrixTranspose(device, u_copy, U), done);
+    }
+
+    CheckResult(context, std::move(done), dev_info, std::move(solver));
   }
 
   void CheckResult(OpKernelContext* context, DoneCallback done,
@@ -192,10 +238,9 @@ class SvdOpGpu : public AsyncOpKernel {
   void PerformSVD_MgeqN(OpKernelContext* context, DoneCallback done, int64 m,
                         int64 n, int64 p, const Tensor& M, Tensor* S, Tensor* U,
                         Tensor* V) {
+    // Transpose M, because cuSolver expects it to be column-major
     TensorShape shapeRaw = M.shape();
     shapeRaw.RemoveLastDims(2);
-
-    // Transpose M, because cuSolver expects it to be column-major
     TensorShape input_shape = shapeRaw;
     input_shape.AddDim(n);
     input_shape.AddDim(m);
@@ -210,58 +255,16 @@ class SvdOpGpu : public AsyncOpKernel {
     OP_REQUIRES_OK_ASYNC(context, DoMatrixTranspose(device, M, &input_copy),
                          done);
 
-    // I need to transpose U at the end
-    // Not V, because cuSolver work column-major
-    Tensor u_copy;
-    if (compute_uv_) {
-      TensorShape u_shape;
-      if (full_matrices_) {
-        u_shape = U->shape();
-      } else {
-        u_shape = shapeRaw;
-        u_shape.AddDim(p);
-        u_shape.AddDim(m);
-      }
-      OP_REQUIRES_OK_ASYNC(
-          context, solver->allocate_scoped_tensor(U->dtype(), u_shape, &u_copy),
-          done);
-    }
-
-    // get the pointers to the data
-    Scalar* input_ptr;
-    RealScalar* outputS_ptr;
-    Scalar* outputU_ptr = NULL;
-    Scalar* outputV_ptr = NULL;
-    auto input_reshaped = input_copy.template flat_inner_dims<Scalar, 3>();
-    input_ptr = input_reshaped.data();
-    outputS_ptr = S->template flat_inner_dims<RealScalar, 2>().data();
-    if (compute_uv_) {
-      outputU_ptr = u_copy.template flat_inner_dims<Scalar, 3>().data();
-      outputV_ptr = V->template flat_inner_dims<Scalar, 3>().data();
-    }
-
-    // call the SVD
-    const int64 batch_size = input_reshaped.dimension(0);
-    std::vector<DeviceLapackInfo> dev_info;
-    dev_info.push_back(solver->GetDeviceLapackInfo(batch_size, "gesvd"));
-    RunSVD(context, done, m, n, p, batch_size, input_ptr, outputS_ptr,
-           outputU_ptr, outputV_ptr, dev_info.back().mutable_data(),
-           solver.get());
-
-    // Transpose U
-    if (compute_uv_) {
-      OP_REQUIRES_OK_ASYNC(context, DoMatrixTranspose(device, u_copy, U), done);
-    }
-
-    // now check if the SVD operation succeeded or not
-    CheckResult(context, std::move(done), dev_info, std::move(solver));
+    // Call the SVD: compute U S V* = M.
+    RunSVD(context, done, m, n, p, input_copy, S, U, V, std::move(solver));
   }
 
   // The SVD if m < n
   void PerformSVD_MlessN(OpKernelContext* context, DoneCallback done, int64 m,
                          int64 n, int64 p, const Tensor& M, Tensor* S,
                          Tensor* U, Tensor* V) {
-    // Perform the SVD on M'
+    // Perform the SVD on M'. cuSolver works column major so don't need to
+    // transpose M.
 
     // Reuse the input buffer or make a copy for the SVD depending on whether
     // this op owns the input buffer exclusively. This is needed because the
@@ -281,55 +284,8 @@ class SvdOpGpu : public AsyncOpKernel {
                M.NumElements() * sizeof(Scalar));
     }
 
-    // I need to transpose V at the end
-    Tensor v_copy;
-    if (compute_uv_) {
-      TensorShape v_shape;
-      if (full_matrices_) {
-        v_shape = V->shape();
-      } else {
-        TensorShape shapeRaw = M.shape();
-        shapeRaw.RemoveLastDims(2);
-        v_shape = shapeRaw;
-        v_shape.AddDim(p);
-        v_shape.AddDim(n);
-      }
-      OP_REQUIRES_OK_ASYNC(
-          context, solver->allocate_scoped_tensor(V->dtype(), v_shape, &v_copy),
-          done);
-    }
-
-    // get the pointers to the data
-    Scalar* input_ptr;
-    RealScalar* outputS_ptr;
-    Scalar* outputU_ptr = NULL;
-    Scalar* outputV_ptr = NULL;
-    auto input_reshaped = input_copy.template flat_inner_dims<Scalar, 3>();
-    input_ptr = input_reshaped.data();
-    outputS_ptr = S->template flat_inner_dims<RealScalar, 2>().data();
-    if (compute_uv_) {
-      // Note that U and V are flipped
-      outputU_ptr = v_copy.template flat_inner_dims<Scalar, 3>().data();
-      outputV_ptr = U->template flat_inner_dims<Scalar, 3>().data();
-    }
-
-    // call the SVD
-    const int64 batch_size = input_reshaped.dimension(0);
-    std::vector<DeviceLapackInfo> dev_info;
-    dev_info.push_back(solver->GetDeviceLapackInfo(batch_size, "gesvd"));
-    // Note that m and n are flipped
-    RunSVD(context, done, n, m, p, batch_size, input_ptr, outputS_ptr,
-           outputU_ptr, outputV_ptr, dev_info.back().mutable_data(),
-           solver.get());
-
-    // Transpose V
-    if (compute_uv_) {
-      auto device = context->eigen_device<GPUDevice>();
-      OP_REQUIRES_OK_ASYNC(context, DoMatrixTranspose(device, v_copy, V), done);
-    }
-
-    // now check if the SVD operation succeeded or not
-    CheckResult(context, std::move(done), dev_info, std::move(solver));
+    // Call the SVD: compute V S U* = M*.
+    RunSVD(context, done, n, m, p, input_copy, S, V, U, std::move(solver));
   }
 
   void ComputeAsync(OpKernelContext* context, DoneCallback done) final {
diff --git a/tensorflow/core/kernels/tensor_array.h b/tensorflow/core/kernels/tensor_array.h
index 384a63e945306637bcf074d1f3709eea055bffe9..507ab459ca5ee773e7fa3f3c77dc511a55957dd0 100644
--- a/tensorflow/core/kernels/tensor_array.h
+++ b/tensorflow/core/kernels/tensor_array.h
@@ -261,7 +261,7 @@ class TensorArray : public ResourceBase {
     return Status::OK();
   }
 
-  string DebugString() override {
+  string DebugString() const override {
     mutex_lock l(mu_);
     CHECK(!closed_);
     return strings::StrCat("TensorArray[", tensors_.size(), "]");
@@ -376,7 +376,7 @@ class TensorArray : public ResourceBase {
   const DataType dtype_;
   Tensor handle_;
 
-  mutex mu_;
+  mutable mutex mu_;
 
   // Marks that the tensor_array_ has been cleared.
   bool closed_ GUARDED_BY(mu_);
diff --git a/tensorflow/core/kernels/tensor_array_ops.cc b/tensorflow/core/kernels/tensor_array_ops.cc
index aa85f546a81d0e6b8cf41fc23532fd4a11fe42ec..129035638ab0e3d427a3fa55e1de0ded7e07a85c 100644
--- a/tensorflow/core/kernels/tensor_array_ops.cc
+++ b/tensorflow/core/kernels/tensor_array_ops.cc
@@ -23,13 +23,13 @@ limitations under the License.
 #include <numeric>  // clang-format off
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/kernels/split_lib.h"
 #include "tensorflow/core/kernels/tensor_array.h"
diff --git a/tensorflow/core/kernels/tensor_flag_utils.cc b/tensorflow/core/kernels/tensor_flag_utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b6f91927298078168a78144c361f50661c54c096
--- /dev/null
+++ b/tensorflow/core/kernels/tensor_flag_utils.cc
@@ -0,0 +1,187 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/tensor_flag_utils.h"
+
+#include "absl/strings/str_cat.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+
+namespace tensorflow {
+namespace tensor_flag_utils {
+
+Status ValidateSparseMatrixShardingConfig(const Tensor& config) {
+  if (TensorShapeUtils::IsScalar(config.shape())) {
+    const float scalar_config = config.template scalar<float>()();
+    if (0 < scalar_config && scalar_config <= 1.0) {
+      return Status::OK();
+    }
+    return Status(
+        error::INVALID_ARGUMENT,
+        absl::StrCat("Expected config to be in range (0, 1] but instead found ",
+                     scalar_config));
+  }
+  if (!TensorShapeUtils::IsMatrix(config.shape())) {
+    return Status(error::INVALID_ARGUMENT,
+                  absl::StrCat("Expected config to be either scalar or matrix "
+                               "but instead found tensor of rank ",
+                               config.dims()));
+  }
+  if (config.dim_size(1) != 3) {
+    return Status(
+        error::INVALID_ARGUMENT,
+        absl::StrCat(
+            "Expected config matrix to have dim(1) = 3 but instead found ",
+            config.dim_size(1)));
+  }
+
+  auto config_matrix = config.matrix<float>();
+  for (int i = 0; i < config.dim_size(0); ++i) {
+    if (0 > config_matrix(i, 0)) {
+      return errors::InvalidArgument(
+          "First column of fraction_rows_per_thread_config "
+          "should "
+          "have non-negative values but found ",
+          config_matrix(i, 0), " in row ", i);
+    }
+    if (0 > config_matrix(i, 1)) {
+      return errors::InvalidArgument(
+          "Second column of fraction_rows_per_thread_config "
+          "should "
+          "have non-negative values but found ",
+          config_matrix(i, 1), " in row ", i);
+    }
+    if (!(0 < config_matrix(i, 2) && config_matrix(i, 2) <= 1)) {
+      return errors::InvalidArgument(
+          "Last column of fraction_rows_per_thread_config should "
+          "have values in the range (0, 1] but found ",
+          config_matrix(i, 2), " in row ", i);
+    }
+  }
+  return Status::OK();
+}
+
+template <typename MatrixType, typename K>
+MatrixType FindConfigValueForKey(
+    const typename TTypes<MatrixType>::ConstMatrix& config_mat,
+    const std::pair<K, K>& key) {
+  const int last_row_index = config_mat.dimension(0) - 1;
+  for (int i = 0; i < last_row_index; ++i) {
+    if (key.first >= config_mat(i, 0) && key.second >= config_mat(i, 1)) {
+      return config_mat(i, 2);
+    }
+  }
+  return config_mat(last_row_index, 2);
+}
+
+Status ValidateScalarQuantityShardingConfig(const Tensor& config) {
+  if (TensorShapeUtils::IsScalar(config.shape())) {
+    const float scalar_config = config.template scalar<float>()();
+    if (0 < scalar_config && scalar_config <= 1.0) {
+      return Status::OK();
+    }
+    return Status(
+        error::INVALID_ARGUMENT,
+        absl::StrCat("Expected config to be in range (0, 1] but instead found ",
+                     scalar_config));
+  }
+  if (!TensorShapeUtils::IsMatrix(config.shape())) {
+    return Status(error::INVALID_ARGUMENT,
+                  absl::StrCat("Expected config to be either scalar or matrix "
+                               "but instead found tensor of rank ",
+                               config.dims()));
+  }
+  if (config.dim_size(1) != 2) {
+    return Status(
+        error::INVALID_ARGUMENT,
+        absl::StrCat(
+            "Expected config matrix to have dim(1) = 2 but instead found ",
+            config.dim_size(1)));
+  }
+
+  auto config_matrix = config.matrix<float>();
+  for (int i = 0; i < config.dim_size(0); ++i) {
+    if (0 > config_matrix(i, 0)) {
+      return errors::InvalidArgument(
+          "First column of fraction_rows_per_thread_config "
+          "should "
+          "have non-negative values but found ",
+          config_matrix(i, 0), " in row ", i);
+    }
+    if (!(0 < config_matrix(i, 1) && config_matrix(i, 1) <= 1)) {
+      return errors::InvalidArgument(
+          "Last column of fraction_rows_per_thread_config should "
+          "have values in the range (0, 1] but found ",
+          config_matrix(i, 1), " in row ", i);
+    }
+  }
+  return Status::OK();
+}
+
+template <typename MatrixType, typename K>
+MatrixType FindConfigValueForKey(
+    const typename TTypes<MatrixType>::ConstMatrix& config_mat, const K key) {
+  const int last_row_index = config_mat.dimension(0) - 1;
+  for (int i = 0; i < last_row_index; ++i) {
+    if (key >= config_mat(i, 0)) {
+      return config_mat(i, 1);
+    }
+  }
+  return config_mat(last_row_index, 1);
+}
+
+template <typename Tindices>
+Tindices GetLinearBucket(const Tindices value, const Tindices bucket_size) {
+  const Tindices next_multiple_of_bucket_size =
+      (value + bucket_size - 1) / bucket_size * bucket_size;
+  return next_multiple_of_bucket_size - (bucket_size - 1);
+}
+
+template <typename Tindices>
+Tindices GetPowerBucket(const Tindices value, const Tindices bucket_size) {
+  if (bucket_size == 1) {
+    return 1;
+  }
+  return std::pow(bucket_size, std::floor(std::log(bucket_size * (value - 1)) /
+                                          std::log(bucket_size)) -
+                                   1) +
+         1;
+}
+
+#define REGISTER_SPARSE_UTIL_FUNCTIONS(TypeIndex)                           \
+  template float FindConfigValueForKey<float, TypeIndex>(                   \
+      const TTypes<float>::ConstMatrix& config_mat,                         \
+      const std::pair<TypeIndex, TypeIndex>& key);                          \
+  template float FindConfigValueForKey<float, TypeIndex>(                   \
+      const TTypes<float>::ConstMatrix& config_mat, const TypeIndex key);   \
+  template int64 FindConfigValueForKey<int64, TypeIndex>(                   \
+      const TTypes<int64>::ConstMatrix& config_mat, const TypeIndex key);
+
+REGISTER_SPARSE_UTIL_FUNCTIONS(int32);
+REGISTER_SPARSE_UTIL_FUNCTIONS(int64);
+REGISTER_SPARSE_UTIL_FUNCTIONS(uint8);
+REGISTER_SPARSE_UTIL_FUNCTIONS(uint16);
+REGISTER_SPARSE_UTIL_FUNCTIONS(uint32);
+REGISTER_SPARSE_UTIL_FUNCTIONS(uint64);
+
+template int32 GetLinearBucket(const int32 value, const int32 bucket_size);
+
+template int64 GetLinearBucket(const int64 value, const int64 bucket_size);
+
+template int32 GetPowerBucket(const int32 value, const int32 bucket_size);
+
+template int64 GetPowerBucket(const int64 value, const int64 bucket_size);
+
+}  // namespace tensor_flag_utils
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/tensor_flag_utils.h b/tensorflow/core/kernels/tensor_flag_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..f406c73a29769db4fa13a1368bf1570277ded928
--- /dev/null
+++ b/tensorflow/core/kernels/tensor_flag_utils.h
@@ -0,0 +1,78 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Helpers for parsing tensors as runtime flags.
+#ifndef TENSORFLOW_CORE_KERNELS_TENSOR_FLAG_UTILS_H_
+#define TENSORFLOW_CORE_KERNELS_TENSOR_FLAG_UTILS_H_
+
+#include <utility>
+#include <vector>
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace tensor_flag_utils {
+
+// Converts tensor.vec<Tindices> to an std::vector<Tindices> object, appends
+// the value num_nonzero_entries_in_sparse_mat, and returns the result.
+template <typename Tindices>
+std::vector<Tindices> ParseRowStartIndices(
+    const tensorflow::Tensor& tensor,
+    const Tindices num_nonzero_entries_in_sparse_mat);
+
+// Returns Status::OK() if and only if config is a float scalar or a matrix with
+// dimensions M x 3. If config is a scalar then config must be in the range
+// [0, 1.0). If confix is a matrix then config must have shape M x 3, all of
+// its entries must be positive, and entries in the last column may not
+// exceed 1.0. If config is a matrix then it may not be empty.
+Status ValidateSparseMatrixShardingConfig(const Tensor& config);
+
+// Returns Status::OK() if and only if config is a float scalar or a non-empty
+// matrix with dimensions M x 2.
+Status ValidateScalarQuantityShardingConfig(const Tensor& config);
+
+// Returns the last entry of the first row in config_mat for which the first
+// two entries are no smaller than the respective entries in key. If no such
+// row exists then returns the last entry in the last row in config_mat.
+// config_mat may not be empty.
+template <typename MatrixType, typename K>
+MatrixType FindConfigValueForKey(
+    const typename TTypes<MatrixType>::ConstMatrix& config_mat,
+    const std::pair<K, K>& key);
+
+// Returns the last entry of the first row in config_mat for which the first
+// two entries are no smaller than the respective entries in key. If no such
+// row exists then returns the last entry in the last row in config_mat.
+// config_mat may not be empty.
+template <typename MatrixType, typename K>
+MatrixType FindConfigValueForKey(
+    const typename TTypes<MatrixType>::ConstMatrix& config_mat, const K key);
+
+// Returns largest multiple of bucket_size less than value.
+// Expects 1 <= bucket_size <= value.
+template <typename Tindices>
+Tindices GetLinearBucket(const Tindices value, const Tindices bucket_size);
+
+// Returns the largest power of bucket_size less than value.
+// Expects 1 <= bucket_size <= value. If bucket_size = 1, returns 1.
+template <typename Tindices>
+Tindices GetPowerBucket(const Tindices value, const Tindices bucket_size);
+
+}  // namespace tensor_flag_utils
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_TENSOR_FLAG_UTILS_H_
diff --git a/tensorflow/core/kernels/tensor_flag_utils_test.cc b/tensorflow/core/kernels/tensor_flag_utils_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..23ccc7ad7a16bb9a9cdac4c53f1a3252ae29ed6c
--- /dev/null
+++ b/tensorflow/core/kernels/tensor_flag_utils_test.cc
@@ -0,0 +1,322 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/tensor_flag_utils.h"
+
+#include <vector>
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace {
+
+using tensorflow::DataType;
+using tensorflow::int32;
+using tensorflow::int64;
+using tensorflow::Tensor;
+using tensorflow::TTypes;
+using tensorflow::error::INVALID_ARGUMENT;
+using tensorflow::tensor_flag_utils::FindConfigValueForKey;
+using tensorflow::tensor_flag_utils::GetLinearBucket;
+using tensorflow::tensor_flag_utils::GetPowerBucket;
+using tensorflow::tensor_flag_utils::ValidateScalarQuantityShardingConfig;
+using tensorflow::tensor_flag_utils::ValidateSparseMatrixShardingConfig;
+
+TEST(SparseUtilsTest, ValidateSparseMatrixShardingConfig) {
+  // Only a default is specified.
+  {
+    Tensor t(DataType::DT_FLOAT, {});
+    t.scalar<float>()() = 0.7;
+    EXPECT_TRUE(ValidateSparseMatrixShardingConfig(t).ok());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {});
+    t.scalar<float>()() = 1.0;
+    EXPECT_TRUE(ValidateSparseMatrixShardingConfig(t).ok());
+  }
+
+  // Misshapen.
+  {
+    Tensor t(DataType::DT_FLOAT, {1, 1});
+    int indx = 0;
+    for (const float v : {60.0}) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateSparseMatrixShardingConfig(t).code());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {1, 2});
+    int indx = 0;
+    for (const float v : {
+             60.0,
+             50.0,
+         }) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateSparseMatrixShardingConfig(t).code());
+  }
+
+  // Only one key is specified.
+  {
+    Tensor t(DataType::DT_FLOAT, {1, 3});
+    int indx = 0;
+    for (const float v : {30.0, 20.0, 1.0}) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_TRUE(ValidateSparseMatrixShardingConfig(t).ok());
+  }
+
+  // Two keys are specified.
+  {
+    Tensor t(DataType::DT_FLOAT, {2, 3});
+    int indx = 0;
+    for (const float v : {60.0, 50.0, 0.41, 30.0, 20.0, 0.7}) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_TRUE(ValidateSparseMatrixShardingConfig(t).ok());
+  }
+
+  // Out of range.
+  {
+    Tensor t(DataType::DT_FLOAT, {2, 3});
+    int indx = 0;
+    for (const float v : {60.0, 40.0, 0.41, 30.0, 20.0, 10.7}) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateSparseMatrixShardingConfig(t).code());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {2, 3});
+    int indx = 0;
+    for (const float v : {60.0, 40.0, 0.41, 30.0, 20.0, -0.7}) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateSparseMatrixShardingConfig(t).code());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {2, 3});
+    int indx = 0;
+    for (const float v : {60.0, -40.0, 0.41, 30.0, 20.0, 0.7}) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateSparseMatrixShardingConfig(t).code());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {});
+    t.scalar<float>()() = -0.5;
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateSparseMatrixShardingConfig(t).code());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {});
+    t.scalar<float>()() = 0;
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateSparseMatrixShardingConfig(t).code());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {});
+    t.scalar<float>()() = 1.2;
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateSparseMatrixShardingConfig(t).code());
+  }
+}
+
+TEST(SparseUtilsTest, ValidateScalarQuantityShardingConfig) {
+  // Only a default is specified.
+  {
+    Tensor t(DataType::DT_FLOAT, {});
+    t.scalar<float>()() = 0.7;
+    EXPECT_TRUE(ValidateScalarQuantityShardingConfig(t).ok());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {});
+    t.scalar<float>()() = 1.0;
+    EXPECT_TRUE(ValidateScalarQuantityShardingConfig(t).ok());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {});
+    t.scalar<float>()() = 1.2;
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateScalarQuantityShardingConfig(t).code());
+  }
+
+  // Misshapen.
+  {
+    Tensor t(DataType::DT_FLOAT, {1, 1});
+    int indx = 0;
+    for (const float v : {60.0}) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateScalarQuantityShardingConfig(t).code());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {1, 2});
+    int indx = 0;
+    for (const float v : {
+             60.0,
+             50.0,
+         }) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateScalarQuantityShardingConfig(t).code());
+  }
+
+  // Two keys are specified.
+  {
+    Tensor t(DataType::DT_FLOAT, {1, 3});
+    int indx = 0;
+    for (const float v : {30.0, 20.0, 1.0}) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateScalarQuantityShardingConfig(t).code());
+  }
+
+  // Only one key is specified.
+  {
+    Tensor t(DataType::DT_FLOAT, {2, 2});
+    int indx = 0;
+    for (const float v : {60.0, 0.41, 30.0, 0.7}) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_TRUE(ValidateScalarQuantityShardingConfig(t).ok());
+  }
+
+  // Out of range.
+  {
+    Tensor t(DataType::DT_FLOAT, {2, 2});
+    int indx = 0;
+    for (const float v : {60.0, 0.41, 30.0, 10.7}) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateScalarQuantityShardingConfig(t).code());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {2, 2});
+    int indx = 0;
+    for (const float v : {60.0, 0.41, 30.0, -0.7}) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateScalarQuantityShardingConfig(t).code());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {2, 2});
+    int indx = 0;
+    for (const float v : {-40.0, 0.41, 20.0, 0.7}) {
+      t.flat<float>()(indx++) = v;
+    }
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateScalarQuantityShardingConfig(t).code());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {});
+    t.scalar<float>()() = -0.5;
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateScalarQuantityShardingConfig(t).code());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {});
+    t.scalar<float>()() = 0;
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateScalarQuantityShardingConfig(t).code());
+  }
+  {
+    Tensor t(DataType::DT_FLOAT, {});
+    t.scalar<float>()() = 1.2;
+    EXPECT_EQ(INVALID_ARGUMENT, ValidateScalarQuantityShardingConfig(t).code());
+  }
+}
+
+TEST(SparseUtils, FindConfigValueForKey) {
+  {
+    float data[] = {60.0, 50.0, 0.41, 30.0, 20.0, 0.1, 0, 0, 0.7};
+    TTypes<float>::ConstMatrix config_mat(data, 3, 3);
+    auto val = FindConfigValueForKey<float, int32>(config_mat, {70, 40});
+    EXPECT_FLOAT_EQ(0.1, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, {60, 50});
+    EXPECT_FLOAT_EQ(0.41, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, {60, 60});
+    EXPECT_FLOAT_EQ(0.41, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, {60, 40});
+    EXPECT_FLOAT_EQ(0.1, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, {50, 60});
+    EXPECT_FLOAT_EQ(0.1, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, {20, 30});
+    EXPECT_FLOAT_EQ(0.7, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, {30, 10});
+    EXPECT_FLOAT_EQ(0.7, val);
+  }
+  {
+    float data[] = {0, 0, 0.7};
+    TTypes<float>::ConstMatrix config_mat(data, 1, 3);
+    auto val = FindConfigValueForKey<float, int64>(config_mat, {70, 40});
+    EXPECT_FLOAT_EQ(0.7, val);
+    val = FindConfigValueForKey<float, int64>(config_mat, {60, 50});
+    EXPECT_FLOAT_EQ(0.7, val);
+    val = FindConfigValueForKey<float, int64>(config_mat, {60, 60});
+    EXPECT_FLOAT_EQ(0.7, val);
+    val = FindConfigValueForKey<float, int64>(config_mat, {60, 40});
+    EXPECT_FLOAT_EQ(0.7, val);
+    val = FindConfigValueForKey<float, int64>(config_mat, {50, 60});
+    EXPECT_FLOAT_EQ(0.7, val);
+    val = FindConfigValueForKey<float, int64>(config_mat, {20, 30});
+    EXPECT_FLOAT_EQ(0.7, val);
+    val = FindConfigValueForKey<float, int64>(config_mat, {30, 10});
+    EXPECT_FLOAT_EQ(0.7, val);
+  }
+  {
+    float data[] = {60.0, 50.0, 0.41, 0, 0, 0.7};
+    TTypes<float>::ConstMatrix config_mat(data, 2, 3);
+    auto val = FindConfigValueForKey<float, int32>(config_mat, {70, 40});
+    EXPECT_FLOAT_EQ(0.7, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, {60, 50});
+    EXPECT_FLOAT_EQ(0.41, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, {60, 60});
+    EXPECT_FLOAT_EQ(0.41, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, {60, 40});
+    EXPECT_FLOAT_EQ(0.7, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, {50, 60});
+    EXPECT_FLOAT_EQ(0.7, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, {20, 30});
+    EXPECT_FLOAT_EQ(0.7, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, {30, 10});
+    EXPECT_FLOAT_EQ(0.7, val);
+  }
+  {
+    float data[] = {60.0, 0.41, 50.0, 0.14, 0, 0.7};
+    TTypes<float>::ConstMatrix config_mat(data, 3, 2);
+    auto val = FindConfigValueForKey<float, int32>(config_mat, 70);
+    EXPECT_FLOAT_EQ(0.41, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, 60);
+    EXPECT_FLOAT_EQ(0.41, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, 55);
+    EXPECT_FLOAT_EQ(0.14, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, 50);
+    EXPECT_FLOAT_EQ(0.14, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, 20);
+    EXPECT_FLOAT_EQ(0.7, val);
+    val = FindConfigValueForKey<float, int32>(config_mat, 30);
+    EXPECT_FLOAT_EQ(0.7, val);
+  }
+}
+
+TEST(SparseUtils, GetLinearBucket) {
+  EXPECT_EQ(11, GetLinearBucket(11, 5));
+  EXPECT_EQ(11, GetLinearBucket(12, 5));
+  EXPECT_EQ(1, GetLinearBucket(4ll, 5ll));
+}
+
+TEST(SparseUtils, GetPowerBucket) {
+  EXPECT_EQ(6, GetPowerBucket(11, 5));
+  EXPECT_EQ(6, GetPowerBucket(12, 5));
+  EXPECT_EQ(1332, GetPowerBucket(1335, 11));
+  EXPECT_EQ(5, GetPowerBucket(5ll, 4ll));
+  EXPECT_EQ(1, GetPowerBucket(4ll, 1ll));
+}
+
+}  // namespace
diff --git a/tensorflow/core/kernels/tensor_forest/BUILD b/tensorflow/core/kernels/tensor_forest/BUILD
index df035506f7698d1d213efad6088e9bfb53d97282..0060410c95787fb69d206b646afd66c31a821f05 100644
--- a/tensorflow/core/kernels/tensor_forest/BUILD
+++ b/tensorflow/core/kernels/tensor_forest/BUILD
@@ -27,7 +27,6 @@ tf_kernel_library(
         ":resources",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:tensor_forest_ops_op_lib",
         "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_cc",
     ],
 )
@@ -39,7 +38,6 @@ tf_kernel_library(
         ":resources",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core:tensor_forest_ops_op_lib",
         "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_cc",
     ],
 )
diff --git a/tensorflow/core/kernels/tensor_forest/resources.h b/tensorflow/core/kernels/tensor_forest/resources.h
index da258e5017ca8cc9b996d83bcd767e89d61322d7..f0a78f97264336acc9ba293d6547cc0fe10343ee 100644
--- a/tensorflow/core/kernels/tensor_forest/resources.h
+++ b/tensorflow/core/kernels/tensor_forest/resources.h
@@ -34,7 +34,7 @@ class TensorForestTreeResource : public ResourceBase {
  public:
   TensorForestTreeResource();
 
-  string DebugString() override {
+  string DebugString() const override {
     return strings::StrCat("TensorForestTree[size=", get_size(), "]");
   }
 
diff --git a/tensorflow/core/kernels/tile_functor_cpu.cc b/tensorflow/core/kernels/tile_functor_cpu.cc
index f8144867014eccf04c892d0ce90a2aa280dfd764..43fd0d20adbf45ff135e46959506d71018fb1858 100644
--- a/tensorflow/core/kernels/tile_functor_cpu.cc
+++ b/tensorflow/core/kernels/tile_functor_cpu.cc
@@ -57,6 +57,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 
 TF_CALL_bool(DEFINE_TYPE);
 TF_CALL_float(DEFINE_TYPE);
+TF_CALL_bfloat16(DEFINE_TYPE);
 TF_CALL_double(DEFINE_TYPE);
 TF_CALL_uint8(DEFINE_TYPE);
 TF_CALL_int32(DEFINE_TYPE);
@@ -78,6 +79,7 @@ typedef Eigen::SyclDevice SYCLDevice;
 
 TF_CALL_bool(DEFINE_TYPE);
 TF_CALL_float(DEFINE_TYPE);
+TF_CALL_bfloat16(DEFINE_TYPE);
 TF_CALL_double(DEFINE_TYPE);
 TF_CALL_uint8(DEFINE_TYPE);
 TF_CALL_int32(DEFINE_TYPE);
diff --git a/tensorflow/core/kernels/tile_functor_gpu.cu.cc b/tensorflow/core/kernels/tile_functor_gpu.h
similarity index 85%
rename from tensorflow/core/kernels/tile_functor_gpu.cu.cc
rename to tensorflow/core/kernels/tile_functor_gpu.h
index 84a5060fc3cd17c09b905d606dba62bbaa7f1373..0de32e730ed858ccc3dfcbacb65a7cf922aa5ce2 100644
--- a/tensorflow/core/kernels/tile_functor_gpu.cu.cc
+++ b/tensorflow/core/kernels/tile_functor_gpu.h
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#ifndef TENSORFLOW_CORE_KERNELS_TILE_FUNCTOR_GPU_H_
+#define TENSORFLOW_CORE_KERNELS_TILE_FUNCTOR_GPU_H_
+
 #if GOOGLE_CUDA
 
 #define EIGEN_USE_GPU
@@ -80,28 +83,7 @@ void TileSimple(const Device& d, Tensor* out, const Tensor& in) {
 }
 
 }  // end namespace internal
-
-namespace functor {
-
-typedef Eigen::GpuDevice GPUDevice;
-
-// Register functors used for Tile functor.
-#define DEFINE_TYPE(T)                       \
-  template struct Tile<GPUDevice, T, int32>; \
-  template struct Tile<GPUDevice, T, int64>;
-
-TF_CALL_bool(DEFINE_TYPE);
-TF_CALL_int16(DEFINE_TYPE);
-TF_CALL_int32(DEFINE_TYPE);
-TF_CALL_int64(DEFINE_TYPE);
-TF_CALL_float(DEFINE_TYPE);
-TF_CALL_double(DEFINE_TYPE);
-TF_CALL_half(DEFINE_TYPE);
-TF_CALL_complex64(DEFINE_TYPE);
-TF_CALL_complex128(DEFINE_TYPE);
-
-#undef DEFINE_TYPE
-
-}  // end namespace functor
 }  // namespace tensorflow
 #endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_CORE_KERNELS_TILE_FUNCTOR_GPU_H_
diff --git a/tensorflow/core/kernels/tile_functor_gpu_bool.cu.cc b/tensorflow/core/kernels/tile_functor_gpu_bool.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c7a814c7a2c4de5964deb2eff875235f293cd7b0
--- /dev/null
+++ b/tensorflow/core/kernels/tile_functor_gpu_bool.cu.cc
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/tile_functor.h"
+#include "tensorflow/core/kernels/tile_functor_gpu.h"
+
+namespace tensorflow {
+namespace functor {
+using Eigen::GpuDevice;
+
+template struct Tile<GpuDevice, bool, int32>;
+template struct Tile<GpuDevice, bool, int64>;
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/tile_functor_gpu_complex128.cu.cc b/tensorflow/core/kernels/tile_functor_gpu_complex128.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4dfa4bac1b6a08acc4c8eed18785785b3e4d6071
--- /dev/null
+++ b/tensorflow/core/kernels/tile_functor_gpu_complex128.cu.cc
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/tile_functor.h"
+#include "tensorflow/core/kernels/tile_functor_gpu.h"
+
+namespace tensorflow {
+namespace functor {
+using Eigen::GpuDevice;
+
+template struct Tile<GpuDevice, complex128, int32>;
+template struct Tile<GpuDevice, complex128, int64>;
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/tile_functor_gpu_complex64.cu.cc b/tensorflow/core/kernels/tile_functor_gpu_complex64.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..525ede938fd6d31df514ad9f6c049d62f8c25740
--- /dev/null
+++ b/tensorflow/core/kernels/tile_functor_gpu_complex64.cu.cc
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/tile_functor.h"
+#include "tensorflow/core/kernels/tile_functor_gpu.h"
+
+namespace tensorflow {
+namespace functor {
+using Eigen::GpuDevice;
+
+template struct Tile<GpuDevice, complex64, int32>;
+template struct Tile<GpuDevice, complex64, int64>;
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/tile_functor_gpu_double.cu.cc b/tensorflow/core/kernels/tile_functor_gpu_double.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..25e024083e3d3ed44af51f1ff1ae2fb1305be526
--- /dev/null
+++ b/tensorflow/core/kernels/tile_functor_gpu_double.cu.cc
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/tile_functor.h"
+#include "tensorflow/core/kernels/tile_functor_gpu.h"
+
+namespace tensorflow {
+namespace functor {
+using Eigen::GpuDevice;
+
+template struct Tile<GpuDevice, double, int32>;
+template struct Tile<GpuDevice, double, int64>;
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/tile_functor_gpu_float.cu.cc b/tensorflow/core/kernels/tile_functor_gpu_float.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f0f31370e43cdd3e06aadfe6daf0eb988cfd6ce4
--- /dev/null
+++ b/tensorflow/core/kernels/tile_functor_gpu_float.cu.cc
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/tile_functor.h"
+#include "tensorflow/core/kernels/tile_functor_gpu.h"
+
+namespace tensorflow {
+namespace functor {
+using Eigen::GpuDevice;
+
+template struct Tile<GpuDevice, float, int32>;
+template struct Tile<GpuDevice, float, int64>;
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/tile_functor_gpu_half.cu.cc b/tensorflow/core/kernels/tile_functor_gpu_half.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6c3810a0bc63de50360845e5c56a693ebff56c2e
--- /dev/null
+++ b/tensorflow/core/kernels/tile_functor_gpu_half.cu.cc
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/tile_functor.h"
+#include "tensorflow/core/kernels/tile_functor_gpu.h"
+
+namespace tensorflow {
+namespace functor {
+using Eigen::GpuDevice;
+
+template struct Tile<GpuDevice, Eigen::half, int32>;
+template struct Tile<GpuDevice, Eigen::half, int64>;
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/tile_functor_gpu_int16.cu.cc b/tensorflow/core/kernels/tile_functor_gpu_int16.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2280dcbc82d320586ca262c8c372970a70958f27
--- /dev/null
+++ b/tensorflow/core/kernels/tile_functor_gpu_int16.cu.cc
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/tile_functor.h"
+#include "tensorflow/core/kernels/tile_functor_gpu.h"
+
+namespace tensorflow {
+namespace functor {
+using Eigen::GpuDevice;
+
+template struct Tile<GpuDevice, int16, int32>;
+template struct Tile<GpuDevice, int16, int64>;
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/tile_functor_gpu_int32.cu.cc b/tensorflow/core/kernels/tile_functor_gpu_int32.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b05403badae96d24fde13c1532eb32ab67695d06
--- /dev/null
+++ b/tensorflow/core/kernels/tile_functor_gpu_int32.cu.cc
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/tile_functor.h"
+#include "tensorflow/core/kernels/tile_functor_gpu.h"
+
+namespace tensorflow {
+namespace functor {
+using Eigen::GpuDevice;
+
+template struct Tile<GpuDevice, int32, int32>;
+template struct Tile<GpuDevice, int32, int64>;
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/tile_functor_gpu_int64.cu.cc b/tensorflow/core/kernels/tile_functor_gpu_int64.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2d83c6b3a1c2257b47ab978767713e9d93d22323
--- /dev/null
+++ b/tensorflow/core/kernels/tile_functor_gpu_int64.cu.cc
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/tile_functor.h"
+#include "tensorflow/core/kernels/tile_functor_gpu.h"
+
+namespace tensorflow {
+namespace functor {
+using Eigen::GpuDevice;
+
+template struct Tile<GpuDevice, int64, int32>;
+template struct Tile<GpuDevice, int64, int64>;
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/tile_ops.cc b/tensorflow/core/kernels/tile_ops.cc
index d714876bdaa964a35c9f011e34b6ec1d7b962ce7..2e01fa17630e3b32845dd4828b0907a45e4e42d9 100644
--- a/tensorflow/core/kernels/tile_ops.cc
+++ b/tensorflow/core/kernels/tile_ops.cc
@@ -136,6 +136,7 @@ class TileOp : public OpKernel {
 
     // Invoke macro using TF_CALL_* so type-filtering for platform applies.
     TF_CALL_bool(HANDLE_TYPE_NAME);
+    TF_CALL_bfloat16(HANDLE_TYPE_NAME);
     TF_CALL_float(HANDLE_TYPE_NAME);
     TF_CALL_double(HANDLE_TYPE_NAME);
     TF_CALL_uint8(HANDLE_TYPE_NAME);
@@ -214,6 +215,7 @@ inline void TileOp<Device, Tmultiples>::HandleCase(
 
 TF_CALL_bool(HANDLE_TYPE_NAME_CPU);
 TF_CALL_float(HANDLE_TYPE_NAME_CPU);
+TF_CALL_bfloat16(HANDLE_TYPE_NAME_CPU);
 TF_CALL_double(HANDLE_TYPE_NAME_CPU);
 TF_CALL_uint8(HANDLE_TYPE_NAME_CPU);
 TF_CALL_int32(HANDLE_TYPE_NAME_CPU);
@@ -325,6 +327,7 @@ class TileGradientOp : public OpKernel {
     TF_CALL_int16(HANDLE_TYPE_NAME);
     TF_CALL_int64(HANDLE_TYPE_NAME);
     TF_CALL_half(HANDLE_TYPE_NAME);
+    TF_CALL_bfloat16(HANDLE_TYPE_NAME);
     TF_CALL_complex64(HANDLE_TYPE_NAME);
     TF_CALL_complex128(HANDLE_TYPE_NAME);
 
diff --git a/tensorflow/core/kernels/tile_ops_cpu_impl.h b/tensorflow/core/kernels/tile_ops_cpu_impl.h
index df6a666cd441d9c1306d950bbe0e79bf3dae28d9..8b0c80159a34cb7c61f2efcb9a001c6950be23c2 100644
--- a/tensorflow/core/kernels/tile_ops_cpu_impl.h
+++ b/tensorflow/core/kernels/tile_ops_cpu_impl.h
@@ -33,6 +33,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 #define DEFINE_TYPE(T) DEFINE_DIM(T, CPU_PROVIDED_IXDIM)
 
 TF_CALL_float(DEFINE_TYPE);
+TF_CALL_bfloat16(DEFINE_TYPE);
 TF_CALL_double(DEFINE_TYPE);
 TF_CALL_int16(DEFINE_TYPE);
 TF_CALL_int32(DEFINE_TYPE);
@@ -55,6 +56,7 @@ typedef Eigen::SyclDevice SYCLDevice;
 
 TF_CALL_bool(DEFINE_TYPE);
 TF_CALL_float(DEFINE_TYPE);
+TF_CALL_bfloat16(DEFINE_TYPE);
 TF_CALL_double(DEFINE_TYPE);
 TF_CALL_uint8(DEFINE_TYPE);
 TF_CALL_int16(DEFINE_TYPE);
diff --git a/tensorflow/core/kernels/topk_op_gpu.cu.cc b/tensorflow/core/kernels/topk_op_gpu.h
similarity index 98%
rename from tensorflow/core/kernels/topk_op_gpu.cu.cc
rename to tensorflow/core/kernels/topk_op_gpu.h
index 2fbe1fe7cbb5ad0d90dfcb651fdbb8359c7c1d69..70d6a606647207dd77793299617cef649b9a33fa 100644
--- a/tensorflow/core/kernels/topk_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/topk_op_gpu.h
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#ifndef TENSORFLOW_CORE_KERNELS_TOPK_OP_GPU_H_
+#define TENSORFLOW_CORE_KERNELS_TOPK_OP_GPU_H_
 
 #if GOOGLE_CUDA
 
@@ -410,7 +412,7 @@ struct SegmentOffsetCreator {
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(int idx) const {
     return idx * num_cols_;
-  };
+  }
 
   int num_cols_;
 };
@@ -561,14 +563,8 @@ struct TopKFunctor<GPUDevice, T> {
 };
 
 }  // end namespace functor
-
-#define INSTANTIATE_TEMPLATE(type) \
-  template struct functor::TopKFunctor<GPUDevice, type>;
-
-TF_CALL_GPU_NUMBER_TYPES(INSTANTIATE_TEMPLATE);
-TF_CALL_INTEGRAL_TYPES(INSTANTIATE_TEMPLATE);
-#undef INSTANTIATE_TEMPLATE
-
 }  // namespace tensorflow
 
 #endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_CORE_KERNELS_TOPK_OP_GPU_H_
diff --git a/tensorflow/core/kernels/topk_op_gpu_double.cu.cc b/tensorflow/core/kernels/topk_op_gpu_double.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8a5a7e71b1b3126335acd75d1061b816046a18b7
--- /dev/null
+++ b/tensorflow/core/kernels/topk_op_gpu_double.cu.cc
@@ -0,0 +1,28 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/topk_op.h"
+#include "tensorflow/core/kernels/topk_op_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+
+template struct functor::TopKFunctor<GPUDevice, double>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/topk_op_gpu_float.cu.cc b/tensorflow/core/kernels/topk_op_gpu_float.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0b69396bb13dc4414e07e742c7ed90b03fc3df51
--- /dev/null
+++ b/tensorflow/core/kernels/topk_op_gpu_float.cu.cc
@@ -0,0 +1,28 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/topk_op.h"
+#include "tensorflow/core/kernels/topk_op_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+
+template struct functor::TopKFunctor<GPUDevice, float>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/topk_op_gpu_half.cu.cc b/tensorflow/core/kernels/topk_op_gpu_half.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e53586aeca2d00c1d6e6e75fad9538abc8ba1d6a
--- /dev/null
+++ b/tensorflow/core/kernels/topk_op_gpu_half.cu.cc
@@ -0,0 +1,28 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/topk_op.h"
+#include "tensorflow/core/kernels/topk_op_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+
+template struct functor::TopKFunctor<GPUDevice, Eigen::half>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/topk_op_gpu_int16.cu.cc b/tensorflow/core/kernels/topk_op_gpu_int16.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5bd310523c98d33cadd6324296468629f0dbec4b
--- /dev/null
+++ b/tensorflow/core/kernels/topk_op_gpu_int16.cu.cc
@@ -0,0 +1,28 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/topk_op.h"
+#include "tensorflow/core/kernels/topk_op_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+
+template struct functor::TopKFunctor<GPUDevice, int16>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/topk_op_gpu_int32.cu.cc b/tensorflow/core/kernels/topk_op_gpu_int32.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..55b393a0c02b15c4bce08994e1d8a4e82684d97b
--- /dev/null
+++ b/tensorflow/core/kernels/topk_op_gpu_int32.cu.cc
@@ -0,0 +1,28 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/topk_op.h"
+#include "tensorflow/core/kernels/topk_op_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+
+template struct functor::TopKFunctor<GPUDevice, int32>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/topk_op_gpu_int64.cu.cc b/tensorflow/core/kernels/topk_op_gpu_int64.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3e4a775056310d2e58d8f339bcace213741ef699
--- /dev/null
+++ b/tensorflow/core/kernels/topk_op_gpu_int64.cu.cc
@@ -0,0 +1,28 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/topk_op.h"
+#include "tensorflow/core/kernels/topk_op_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+
+template struct functor::TopKFunctor<GPUDevice, int64>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/topk_op_gpu_int8.cu.cc b/tensorflow/core/kernels/topk_op_gpu_int8.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ac73cd170b8fbd956921120ac106b0b1813b1605
--- /dev/null
+++ b/tensorflow/core/kernels/topk_op_gpu_int8.cu.cc
@@ -0,0 +1,28 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/topk_op.h"
+#include "tensorflow/core/kernels/topk_op_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+
+template struct functor::TopKFunctor<GPUDevice, int8>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/topk_op_gpu_uint16.cu.cc b/tensorflow/core/kernels/topk_op_gpu_uint16.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8d5f8ceb06d171c43cf25e59fe47602f4410977f
--- /dev/null
+++ b/tensorflow/core/kernels/topk_op_gpu_uint16.cu.cc
@@ -0,0 +1,28 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/topk_op.h"
+#include "tensorflow/core/kernels/topk_op_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+
+template struct functor::TopKFunctor<GPUDevice, uint16>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/topk_op_gpu_uint8.cu.cc b/tensorflow/core/kernels/topk_op_gpu_uint8.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fc1a8a2c8cca11e52d2b9eb53c269cc78e44b3d1
--- /dev/null
+++ b/tensorflow/core/kernels/topk_op_gpu_uint8.cu.cc
@@ -0,0 +1,28 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/topk_op.h"
+#include "tensorflow/core/kernels/topk_op_gpu.h"
+
+namespace tensorflow {
+using Eigen::GpuDevice;
+
+template struct functor::TopKFunctor<GPUDevice, uint8>;
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/training_op_helpers.cc b/tensorflow/core/kernels/training_op_helpers.cc
index 4262a5404b6ac233d0fe7a8453e3e875eb9caf1f..20c08cf8fbb6b911c8b89b719237ac4677151e3c 100644
--- a/tensorflow/core/kernels/training_op_helpers.cc
+++ b/tensorflow/core/kernels/training_op_helpers.cc
@@ -19,70 +19,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-mutex* GetTrainingVariableMutex(OpKernelContext* ctx, int input,
-                                Var** maybe_resource) {
-  *maybe_resource = nullptr;
-  if (ctx->input_dtype(input) == DT_RESOURCE) {
-    if (LookupResource(ctx, HandleFromInput(ctx, input), maybe_resource).ok()) {
-      return (*maybe_resource)->mu();
-    } else {
-      ctx->CtxFailureWithWarning(
-          errors::Internal("Invalid variable reference."));
-      return nullptr;
-    }
-  }
-  return ctx->input_ref_mutex(input);
-}
-
-// MaybeLockVariableInputMutexesInOrder is a helper function to acquire mutexes
-// in address order to mitigate deadlock.  Returns a structure that, when
-// deleted, will release the acquired mutexes. Safe to pass duplicates - will
-// only lock each distinct mutex once.  If do_lock is false, returns
-// immediately.  Note that this silently doesn't lock mutexes for invalid
-// variable references; in all usages this is followed by GetInputTensor which
-// will signal a failure.
-VariableInputLockHolder MaybeLockVariableInputMutexesInOrder(
-    OpKernelContext* ctx, bool do_lock, const std::vector<int>& input_ids) {
-  bool any_resource = false;
-  for (auto i : input_ids) {
-    if (ctx->input_dtype(i) == DT_RESOURCE) {
-      any_resource = true;
-      break;
-    }
-  }
-  if (!do_lock && !any_resource) {
-    return VariableInputLockHolder({}, {});
-  }
-  std::vector<Var*> vars;
-  std::vector<mutex*> mutexes;
-  std::vector<int> acquire_order;
-  for (auto input : input_ids) {
-    Var* var;
-    mutex* mutex = GetTrainingVariableMutex(ctx, input, &var);
-    if (var) vars.push_back(var);
-    // Only lock each mutex once if duplicates exist (n^2 but n is 2 or 3).
-    if (std::find(mutexes.begin(), mutexes.end(), mutex) == mutexes.end()) {
-      acquire_order.push_back(mutexes.size());
-      mutexes.push_back(mutex);
-    }
-  }
-  std::sort(acquire_order.begin(), acquire_order.end(),
-            [&mutexes](int a, int b) { return mutexes[a] < mutexes[b]; });
-
-  std::unique_ptr<std::vector<mutex_lock>> locks =
-      MakeUnique<std::vector<mutex_lock>>();
-  locks->reserve(acquire_order.size());
-
-  for (auto input : acquire_order) {
-    Var* var;
-    mutex* mu = GetTrainingVariableMutex(ctx, input, &var);
-    core::ScopedUnref scoped_unref(var);
-    if (mu != nullptr) {
-      locks->emplace_back(*mu);
-    }
-  }
-  return VariableInputLockHolder(std::move(vars), std::move(locks));
-}
 
 void MaybeForwardRefInputToRefOutput(OpKernelContext* ctx, int input,
                                      int output) {
diff --git a/tensorflow/core/kernels/training_op_helpers.h b/tensorflow/core/kernels/training_op_helpers.h
index 9f173a80f74612beaa4da265658eafb5b9e92360..715dd8af7daa1d31587a0efe5965025461231ec4 100644
--- a/tensorflow/core/kernels/training_op_helpers.h
+++ b/tensorflow/core/kernels/training_op_helpers.h
@@ -17,30 +17,72 @@ limitations under the License.
 #define TENSORFLOW_CORE_KERNELS_TRAINING_OP_HELPERS_H_
 
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/variant_op_registry.h"
 #include "tensorflow/core/kernels/dense_update_functor.h"
 #include "tensorflow/core/kernels/variable_ops.h"
 
 namespace tensorflow {
 
-// Returns a borrowed pointer to the mutex for the variable `input` in `ctx`.
-//
-// If `input` corresponds to a `DT_RESOURCE`-type variable input,
-// `*maybe_resource` will be updated to contain the underlying resource, and the
-// caller will be responsible for calling `Unref()` on that resource.
-mutex* GetTrainingVariableMutex(OpKernelContext* ctx, int input,
-                                Var** maybe_resource);
+// Must be called before performing a sparse operation on a variable. Ensures
+// that no concurrent dense operations can happen while holding the variable's
+// lock.
+template <typename Device, typename T>
+Status EnsureSparseVariableAccess(OpKernelContext* ctx, Var* var) {
+  if (var->copy_on_read_mode.load()) {
+    return Status::OK();
+  }
+  mutex_lock ml(*var->mu());
+  // Once copy-on-read mode is True the refcount is guaranteed to be 1. This can
+  // also happen if there are no concurrent reads of the variable and
+  // copy-on-read mode is false.
+  if (var->tensor()->RefCountIsOne()) {
+    var->copy_on_read_mode.store(true);
+    return Status::OK();
+  }
+  PersistentTensor unused;
+  Tensor* tmp;
+  if (std::is_same<T, Variant>::value) {
+    AllocatorAttributes attr;
+    attr.set_on_host(true);
+    TF_RETURN_IF_ERROR(ctx->allocate_persistent(
+        var->tensor()->dtype(), var->tensor()->shape(), &unused, &tmp, attr));
+
+    const auto elements_in = var->tensor()->flat<Variant>();
+    auto elements_out = tmp->flat<Variant>();
+    for (int64 i = 0; i < elements_in.size(); ++i) {
+      elements_out(i) = elements_in(i);
+    }
+  } else {
+    AllocatorAttributes attr;
+    attr.set_gpu_compatible(true);
+    attr.set_nic_compatible(true);
+    TF_RETURN_IF_ERROR(ctx->allocate_persistent(
+        var->tensor()->dtype(), var->tensor()->shape(), &unused, &tmp, attr));
+    functor::DenseUpdate<Device, T, ASSIGN> copy_functor;
+    copy_functor(ctx->eigen_device<Device>(), tmp->flat<T>(),
+                 const_cast<const Tensor*>(var->tensor())->flat<T>());
+  }
+  *var->tensor() = *tmp;
+  var->copy_on_read_mode.store(true);
+  return Status::OK();
+}
 
 // Utility structure that releases a sequence of borrowed mutexes when it is
 // deleted.
 struct VariableInputLockHolder {
  public:
-  VariableInputLockHolder(std::vector<Var*> vars,
-                          std::unique_ptr<std::vector<mutex_lock>> locks)
-      : vars_(std::move(vars)), locks_(std::move(locks)) {}
+  VariableInputLockHolder(
+      std::vector<Var*> vars, std::unique_ptr<std::vector<mutex_lock>> locks,
+      std::unique_ptr<std::vector<tf_shared_lock>> shared_locks)
+      : vars_(std::move(vars)),
+        locks_(std::move(locks)),
+        shared_locks_(std::move(shared_locks)) {}
 
   VariableInputLockHolder(VariableInputLockHolder&& other)
-      : vars_(std::move(other.vars_)), locks_(std::move(other.locks_)) {}
+      : vars_(std::move(other.vars_)),
+        locks_(std::move(other.locks_)),
+        shared_locks_(std::move(other.shared_locks_)) {}
 
   ~VariableInputLockHolder() {
     // Release the locks before unreffing the Vars, because each lock
@@ -56,10 +98,96 @@ struct VariableInputLockHolder {
   // NOTE: Use a `std::unique_ptr` instead of moving in a vector directly,
   // because a `std::vector<mutex_lock>` is not movable on all platforms.
   std::unique_ptr<std::vector<mutex_lock>> locks_;
+  std::unique_ptr<std::vector<tf_shared_lock>> shared_locks_;
 };
 
+// Returns a borrowed pointer to the mutex for the variable `input` in `ctx`.
+//
+// If `input` corresponds to a `DT_RESOURCE`-type variable input,
+// `*maybe_resource` will be updated to contain the underlying resource, and the
+// caller will be responsible for calling `Unref()` on that resource.
+template <typename Device, typename T>
+mutex* GetTrainingVariableMutex(OpKernelContext* ctx, int input, bool sparse,
+                                Var** maybe_resource) {
+  *maybe_resource = nullptr;
+  if (ctx->input_dtype(input) == DT_RESOURCE) {
+    if (LookupResource(ctx, HandleFromInput(ctx, input), maybe_resource).ok()) {
+      if (sparse) {
+        EnsureSparseVariableAccess<Device, T>(ctx, *maybe_resource)
+            .IgnoreError();
+      }
+      return (*maybe_resource)->mu();
+    } else {
+      ctx->CtxFailureWithWarning(
+          errors::Internal("Invalid variable reference."));
+      return nullptr;
+    }
+  }
+  return ctx->input_ref_mutex(input);
+}
+
+// MaybeLockVariableInputMutexesInOrder is a helper function to acquire mutexes
+// in address order to mitigate deadlock.  Returns a structure that, when
+// deleted, will release the acquired mutexes. Safe to pass duplicates - will
+// only lock each distinct mutex once. If sparse is true will ensure the
+// variable gets switched to copy-on-read mode before trying to acquire the
+// locks. If do_lock is false, returns immediately for reference variables. For
+// resource variables in copy-on-read-mode it will grab a shared lock if do_lock
+// is false, exclusive lock otherwise.  Note that this silently doesn't lock
+// mutexes for invalid variable references; in all usages this is followed by
+// GetInputTensor which will signal a failure.
+template <typename Device, typename T>
 VariableInputLockHolder MaybeLockVariableInputMutexesInOrder(
-    OpKernelContext* ctx, bool do_lock, const std::vector<int>& input_ids);
+    OpKernelContext* ctx, bool do_lock, bool sparse,
+    const std::vector<int>& input_ids) {
+  bool any_resource = false;
+  for (auto i : input_ids) {
+    if (ctx->input_dtype(i) == DT_RESOURCE) {
+      any_resource = true;
+      break;
+    }
+  }
+  if (!do_lock && !any_resource) {
+    return VariableInputLockHolder({}, {}, {});
+  }
+  std::vector<Var*> vars;
+  std::vector<mutex*> mutexes;
+  std::vector<int> acquire_order;
+  for (auto input : input_ids) {
+    Var* var;
+    mutex* mutex =
+        GetTrainingVariableMutex<Device, T>(ctx, input, sparse, &var);
+    if (var) vars.push_back(var);
+    // Only lock each mutex once if duplicates exist (n^2 but n is 2 or 3).
+    if (std::find(mutexes.begin(), mutexes.end(), mutex) == mutexes.end()) {
+      acquire_order.push_back(mutexes.size());
+      mutexes.push_back(mutex);
+    }
+  }
+  std::sort(acquire_order.begin(), acquire_order.end(),
+            [&mutexes](int a, int b) { return mutexes[a] < mutexes[b]; });
+
+  std::unique_ptr<std::vector<mutex_lock>> locks =
+      absl::make_unique<std::vector<mutex_lock>>();
+  std::unique_ptr<std::vector<tf_shared_lock>> shared_locks =
+      absl::make_unique<std::vector<tf_shared_lock>>();
+  locks->reserve(acquire_order.size());
+
+  for (auto input : acquire_order) {
+    Var* var;
+    mutex* mu = GetTrainingVariableMutex<Device, T>(ctx, input, sparse, &var);
+    core::ScopedUnref scoped_unref(var);
+    if (mu != nullptr) {
+      if (!sparse || do_lock) {
+        locks->emplace_back(*mu);
+      } else {
+        shared_locks->emplace_back(*mu);
+      }
+    }
+  }
+  return VariableInputLockHolder(std::move(vars), std::move(locks),
+                                 std::move(shared_locks));
+}
 
 void MaybeForwardRefInputToRefOutput(OpKernelContext* ctx, int input,
                                      int output);
@@ -68,8 +196,9 @@ void MaybeForwardRefInputToRefOutput(OpKernelContext* ctx, int input,
 // reference count of 1 before you update it.
 // REQUIRES: If you pass in variable->tensor(), *variable->mu() must be held.
 template <typename Device, typename T>
-Status PrepareToUpdateVariable(OpKernelContext* ctx, Tensor* tensor) {
-  if (!tensor->RefCountIsOne()) {
+Status PrepareToUpdateVariable(OpKernelContext* ctx, Tensor* tensor,
+                               bool copy_on_read_mode) {
+  if (copy_on_read_mode || !tensor->RefCountIsOne()) {
     // Tensor's buffer is in use by some read, so we need to copy before
     // updating.
     PersistentTensor unused;
@@ -100,12 +229,14 @@ Status PrepareToUpdateVariable(OpKernelContext* ctx, Tensor* tensor) {
   return Status::OK();
 }
 
-// This gives you `*out`, a tensor you can update, corresponding to a
-// variable passed as input index `input`.  This handles the
-// differences between reference and resource variables.  For resource
-// variables, we ensure `*out` has a reference count of 1 (using
-// PrepareToUpdateVariable() to copy if necessary) unless
-// sparse && !lock_held, in which case it never copies.
+// This gives you `*out`, a tensor you can update, corresponding to a variable
+// passed as input index `input`.  This handles the differences between
+// reference and resource variables. For reference variables we can just grab
+// the tensor, grabbing the lock if lock_held is False.
+//
+// For resource variables we, if sparse is true, ensure it's in copy-on-read
+// mode, and then, regardless of the value of sparse, ensure its refcount is 1
+// (by potentially copying its contents). In this case lock_held is ignored.
 template <typename Device, typename T>
 Status GetInputTensorFromVariable(OpKernelContext* ctx, int input,
                                   bool lock_held, bool sparse, Tensor* out) {
@@ -113,7 +244,13 @@ Status GetInputTensorFromVariable(OpKernelContext* ctx, int input,
     Var* var;
     TF_RETURN_IF_ERROR(LookupResource(ctx, HandleFromInput(ctx, input), &var));
     core::ScopedUnref unref_var(var);
-    TF_RETURN_IF_ERROR(PrepareToUpdateVariable<Device, T>(ctx, var->tensor()));
+    if (sparse) {
+      TF_RETURN_IF_ERROR(EnsureSparseVariableAccess<Device, T>(ctx, var));
+      *out = *var->tensor();
+      return Status::OK();
+    }
+    TF_RETURN_IF_ERROR(PrepareToUpdateVariable<Device, T>(
+        ctx, var->tensor(), var->copy_on_read_mode.load()));
     *out = *var->tensor();
     return Status::OK();
   }
diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc
index 6504ad1b09c089cafec8c2b0ce0f2971aa506b52..5594c998dd1f69e597c31b800bde55a8b7f63e53 100644
--- a/tensorflow/core/kernels/training_ops.cc
+++ b/tensorflow/core/kernels/training_ops.cc
@@ -18,9 +18,9 @@ limitations under the License.
 
 #include <algorithm>
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/training_op_helpers.h"
 #include "tensorflow/core/kernels/training_ops.h"
 #include "tensorflow/core/kernels/variable_ops.h"
@@ -465,11 +465,12 @@ class ApplyGradientDescentOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks =
-        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0});
+    const bool sparse = false;
+    auto locks = MaybeLockVariableInputMutexesInOrder<Device, T>(
+        ctx, use_exclusive_lock_, sparse, {0});
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 0, use_exclusive_lock_, false, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
 
     OP_REQUIRES(
         ctx, var.IsInitialized(),
@@ -506,11 +507,12 @@ class ApplyGradientDescentOp<SYCLDevice, T> : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks =
-        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0});
+    const bool sparse = false;
+    auto locks = MaybeLockVariableInputMutexesInOrder<SYCLDevice, T>(
+        ctx, use_exclusive_lock_, sparse, {0});
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<SYCLDevice, T>(
-                            ctx, 0, use_exclusive_lock_, false, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
 
     OP_REQUIRES(
         ctx, var.IsInitialized(),
@@ -600,7 +602,8 @@ class ApplyAdadeltaOp : public OpKernel {
 
   void Compute(OpKernelContext* ctx) override {
     Var* resource;
-    mutex* mu = GetTrainingVariableMutex(ctx, 0, &resource);
+    const bool sparse = false;
+    mutex* mu = GetTrainingVariableMutex<Device, T>(ctx, 0, sparse, &resource);
     core::ScopedUnref scoped_unref(resource);
     if (use_exclusive_lock_ && mu != nullptr) {
       mutex_lock l1(*mu);
@@ -624,14 +627,16 @@ class ApplyAdadeltaOp : public OpKernel {
 
   void DoValidate(OpKernelContext* ctx) {
     Tensor var;
+    const bool sparse = false;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 0, use_exclusive_lock_, false, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor accum;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 1, use_exclusive_lock_, false, &accum));
+                            ctx, 1, use_exclusive_lock_, sparse, &accum));
     Tensor accum_update;
-    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 2, use_exclusive_lock_, false, &accum_update));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable<Device, T>(ctx, 2, use_exclusive_lock_,
+                                                   sparse, &accum_update));
 
     OP_REQUIRES(
         ctx, var.IsInitialized(),
@@ -678,14 +683,16 @@ class ApplyAdadeltaOp : public OpKernel {
   void DoCompute(OpKernelContext* ctx) {
     const Device& device = ctx->template eigen_device<Device>();
     Tensor var;
+    const bool sparse = false;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 0, use_exclusive_lock_, false, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor accum;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 1, use_exclusive_lock_, false, &accum));
+                            ctx, 1, use_exclusive_lock_, sparse, &accum));
     Tensor accum_update;
-    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 2, use_exclusive_lock_, false, &accum_update));
+    OP_REQUIRES_OK(
+        ctx, GetInputTensorFromVariable<Device, T>(ctx, 2, use_exclusive_lock_,
+                                                   sparse, &accum_update));
 
     const Tensor& lr = ctx->input(3);
     const Tensor& rho = ctx->input(4);
@@ -751,7 +758,8 @@ class SparseApplyAdadeltaOp : public OpKernel {
 
   void Compute(OpKernelContext* ctx) override {
     Var* var;
-    mutex* mu = GetTrainingVariableMutex(ctx, 0, &var);
+    const bool sparse = true;
+    mutex* mu = GetTrainingVariableMutex<CPUDevice, T>(ctx, 0, sparse, &var);
     core::ScopedUnref scoped_unref(var);
     // mu_accum is actually the same mutex as mu_var since currently we use a
     // global mutex.
@@ -767,14 +775,16 @@ class SparseApplyAdadeltaOp : public OpKernel {
 
   void DoCompute(OpKernelContext* ctx) {
     Tensor var;
+    const bool sparse = true;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
-                            ctx, 0, use_exclusive_lock_, true, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor accum_grad;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
-                            ctx, 1, use_exclusive_lock_, true, &accum_grad));
+                            ctx, 1, use_exclusive_lock_, sparse, &accum_grad));
     Tensor accum_update;
-    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
-                            ctx, 2, use_exclusive_lock_, true, &accum_update));
+    OP_REQUIRES_OK(ctx,
+                   GetInputTensorFromVariable<CPUDevice, T>(
+                       ctx, 2, use_exclusive_lock_, sparse, &accum_update));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -907,11 +917,12 @@ class ApplyProximalGradientDescentOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks =
-        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0});
+    const bool sparse = false;
+    auto locks = MaybeLockVariableInputMutexesInOrder<Device, T>(
+        ctx, use_exclusive_lock_, sparse, {0});
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 0, use_exclusive_lock_, false, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
 
     OP_REQUIRES(
         ctx, var.IsInitialized(),
@@ -976,11 +987,12 @@ class SparseApplyProximalGradientDescentOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
-    auto locks =
-        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0});
+    const bool sparse = true;
+    auto locks = MaybeLockVariableInputMutexesInOrder<CPUDevice, T>(
+        ctx, use_exclusive_lock_, sparse, {0});
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
-                            ctx, 0, use_exclusive_lock_, true, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     OP_REQUIRES(ctx, TensorShapeUtils::IsVectorOrHigher(var.shape()),
                 errors::InvalidArgument("var must be at least 1 dimensional"));
 
@@ -1121,14 +1133,15 @@ class ApplyAdagradOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks =
-        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+    const bool sparse = false;
+    auto locks = MaybeLockVariableInputMutexesInOrder<Device, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1});
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 0, use_exclusive_lock_, false, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor accum;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 1, use_exclusive_lock_, false, &accum));
+                            ctx, 1, use_exclusive_lock_, sparse, &accum));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -1214,14 +1227,15 @@ class ApplyProximalAdagradOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks =
-        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+    const bool sparse = false;
+    auto locks = MaybeLockVariableInputMutexesInOrder<Device, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1});
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 0, use_exclusive_lock_, false, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor accum;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 1, use_exclusive_lock_, false, &accum));
+                            ctx, 1, use_exclusive_lock_, sparse, &accum));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -1316,14 +1330,15 @@ class SparseApplyAdagradOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
-    auto locks =
-        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+    const bool sparse = true;
+    auto locks = MaybeLockVariableInputMutexesInOrder<CPUDevice, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1});
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
-                            ctx, 0, use_exclusive_lock_, true, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor accum;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
-                            ctx, 1, use_exclusive_lock_, true, &accum));
+                            ctx, 1, use_exclusive_lock_, sparse, &accum));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -1456,14 +1471,15 @@ class SparseApplyProximalAdagradOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
-    auto locks =
-        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+    const bool sparse = true;
+    auto locks = MaybeLockVariableInputMutexesInOrder<CPUDevice, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1});
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
-                            ctx, 0, use_exclusive_lock_, true, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor accum;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
-                            ctx, 1, use_exclusive_lock_, true, &accum));
+                            ctx, 1, use_exclusive_lock_, sparse, &accum));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -1628,19 +1644,20 @@ class ApplyAdagradDAOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
-                                                      {0, 1, 2});
+    const bool sparse = false;
+    auto locks = MaybeLockVariableInputMutexesInOrder<Device, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1, 2});
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 0, use_exclusive_lock_, false, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor gradient_accum;
     OP_REQUIRES_OK(
         ctx, GetInputTensorFromVariable<Device, T>(ctx, 1, use_exclusive_lock_,
-                                                   false, &gradient_accum));
+                                                   sparse, &gradient_accum));
     Tensor gradient_squared_accum;
     OP_REQUIRES_OK(
         ctx, GetInputTensorFromVariable<Device, T>(
-                 ctx, 2, use_exclusive_lock_, false, &gradient_squared_accum));
+                 ctx, 2, use_exclusive_lock_, sparse, &gradient_squared_accum));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -1729,19 +1746,20 @@ class SparseApplyAdagradDAOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
-    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
-                                                      {0, 1, 2});
+    const bool sparse = true;
+    auto locks = MaybeLockVariableInputMutexesInOrder<CPUDevice, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1, 2});
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
-                            ctx, 0, use_exclusive_lock_, true, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor gradient_accum;
     OP_REQUIRES_OK(ctx,
                    GetInputTensorFromVariable<CPUDevice, T>(
-                       ctx, 1, use_exclusive_lock_, true, &gradient_accum));
+                       ctx, 1, use_exclusive_lock_, sparse, &gradient_accum));
     Tensor gradient_squared_accum;
     OP_REQUIRES_OK(
         ctx, GetInputTensorFromVariable<CPUDevice, T>(
-                 ctx, 2, use_exclusive_lock_, true, &gradient_squared_accum));
+                 ctx, 2, use_exclusive_lock_, sparse, &gradient_squared_accum));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -1927,18 +1945,19 @@ class ApplyFtrlOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
-                                                      {0, 1, 2});
+    const bool sparse = false;
+    auto locks = MaybeLockVariableInputMutexesInOrder<Device, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1, 2});
 
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 0, use_exclusive_lock_, false, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor accum;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 1, use_exclusive_lock_, false, &accum));
+                            ctx, 1, use_exclusive_lock_, sparse, &accum));
     Tensor linear;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 2, use_exclusive_lock_, false, &linear));
+                            ctx, 2, use_exclusive_lock_, sparse, &linear));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -2079,17 +2098,18 @@ class SparseApplyFtrlOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
-    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
-                                                      {0, 1, 2});
+    const bool sparse = true;
+    auto locks = MaybeLockVariableInputMutexesInOrder<Device, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1, 2});
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 0, use_exclusive_lock_, true, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor accum;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 1, use_exclusive_lock_, true, &accum));
+                            ctx, 1, use_exclusive_lock_, sparse, &accum));
     Tensor linear;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 2, use_exclusive_lock_, true, &linear));
+                            ctx, 2, use_exclusive_lock_, sparse, &linear));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -2353,15 +2373,16 @@ class ApplyMomentumOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks =
-        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+    const bool sparse = false;
+    auto locks = MaybeLockVariableInputMutexesInOrder<Device, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1});
 
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 0, use_exclusive_lock_, false, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor accum;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 1, use_exclusive_lock_, false, &accum));
+                            ctx, 1, use_exclusive_lock_, sparse, &accum));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -2454,15 +2475,16 @@ class SparseApplyMomentumOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
-    auto locks =
-        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+    const bool sparse = true;
+    auto locks = MaybeLockVariableInputMutexesInOrder<CPUDevice, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1});
 
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
-                            ctx, 0, use_exclusive_lock_, true, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor accum;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
-                            ctx, 1, use_exclusive_lock_, true, &accum));
+                            ctx, 1, use_exclusive_lock_, sparse, &accum));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -2572,15 +2594,16 @@ class ApplyKerasMomentumOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks =
-        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+    const bool sparse = false;
+    auto locks = MaybeLockVariableInputMutexesInOrder<Device, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1});
 
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 0, use_exclusive_lock_, false, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor accum;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 1, use_exclusive_lock_, false, &accum));
+                            ctx, 1, use_exclusive_lock_, sparse, &accum));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -2671,15 +2694,16 @@ class SparseApplyKerasMomentumOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
-    auto locks =
-        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+    const bool sparse = true;
+    auto locks = MaybeLockVariableInputMutexesInOrder<CPUDevice, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1});
 
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
-                            ctx, 0, use_exclusive_lock_, true, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor accum;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
-                            ctx, 1, use_exclusive_lock_, true, &accum));
+                            ctx, 1, use_exclusive_lock_, sparse, &accum));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -2783,18 +2807,19 @@ class ApplyAdamOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
-                                                      {0, 1, 2});
+    const bool sparse = false;
+    auto locks = MaybeLockVariableInputMutexesInOrder<Device, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1, 2});
 
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 0, use_exclusive_lock_, false, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor m;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 1, use_exclusive_lock_, false, &m));
+                            ctx, 1, use_exclusive_lock_, sparse, &m));
     Tensor v;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 2, use_exclusive_lock_, false, &v));
+                            ctx, 2, use_exclusive_lock_, sparse, &v));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -2873,18 +2898,19 @@ class ApplyAdamOp<SYCLDevice, T> : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
-                                                      {0, 1, 2});
+    const bool sparse = false;
+    auto locks = MaybeLockVariableInputMutexesInOrder<SYCLDevice, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1, 2});
 
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<SYCLDevice, T>(
-                            ctx, 0, use_exclusive_lock_, false, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor m;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<SYCLDevice, T>(
-                            ctx, 1, use_exclusive_lock_, false, &m));
+                            ctx, 1, use_exclusive_lock_, sparse, &m));
     Tensor v;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<SYCLDevice, T>(
-                            ctx, 2, use_exclusive_lock_, false, &v));
+                            ctx, 2, use_exclusive_lock_, sparse, &v));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -3043,21 +3069,22 @@ class ApplyAdamWithAmsgradOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
-                                                      {0, 1, 2});
+    const bool sparse = false;
+    auto locks = MaybeLockVariableInputMutexesInOrder<Device, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1, 2});
 
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 0, use_exclusive_lock_, false, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor m;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 1, use_exclusive_lock_, false, &m));
+                            ctx, 1, use_exclusive_lock_, sparse, &m));
     Tensor v;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 2, use_exclusive_lock_, false, &v));
+                            ctx, 2, use_exclusive_lock_, sparse, &v));
     Tensor vhat;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 3, use_exclusive_lock_, false, &vhat));
+                            ctx, 3, use_exclusive_lock_, sparse, &vhat));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -3184,18 +3211,19 @@ class ApplyAdaMaxOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
-                                                      {0, 1, 2});
+    const bool sparse = false;
+    auto locks = MaybeLockVariableInputMutexesInOrder<Device, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1, 2});
 
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 0, use_exclusive_lock_, false, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor m;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 1, use_exclusive_lock_, false, &m));
+                            ctx, 1, use_exclusive_lock_, sparse, &m));
     Tensor v;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 2, use_exclusive_lock_, false, &v));
+                            ctx, 2, use_exclusive_lock_, sparse, &v));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -3312,18 +3340,19 @@ class ApplyRMSPropOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
-                                                      {0, 1, 2});
+    const bool sparse = false;
+    auto locks = MaybeLockVariableInputMutexesInOrder<Device, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1, 2});
 
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 0, use_exclusive_lock_, false, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor ms;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 1, use_exclusive_lock_, false, &ms));
+                            ctx, 1, use_exclusive_lock_, sparse, &ms));
     Tensor mom;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 2, use_exclusive_lock_, false, &mom));
+                            ctx, 2, use_exclusive_lock_, sparse, &mom));
 
     OP_REQUIRES(
         ctx, var.IsInitialized(),
@@ -3394,21 +3423,22 @@ class ApplyCenteredRMSPropOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
-                                                      {0, 1, 2, 3});
+    const bool sparse = false;
+    auto locks = MaybeLockVariableInputMutexesInOrder<Device, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1, 2, 3});
 
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 0, use_exclusive_lock_, false, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor mg;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 1, use_exclusive_lock_, false, &mg));
+                            ctx, 1, use_exclusive_lock_, sparse, &mg));
     Tensor ms;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 2, use_exclusive_lock_, false, &ms));
+                            ctx, 2, use_exclusive_lock_, sparse, &ms));
     Tensor mom;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 3, use_exclusive_lock_, false, &mom));
+                            ctx, 3, use_exclusive_lock_, sparse, &mom));
 
     OP_REQUIRES(
         ctx, var.IsInitialized(),
@@ -3553,18 +3583,19 @@ class SparseApplyRMSPropOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
-    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
-                                                      {0, 1, 2});
+    const bool sparse = true;
+    auto locks = MaybeLockVariableInputMutexesInOrder<CPUDevice, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1, 2});
 
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
-                            ctx, 0, use_exclusive_lock_, true, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor ms;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
-                            ctx, 1, use_exclusive_lock_, true, &ms));
+                            ctx, 1, use_exclusive_lock_, sparse, &ms));
     Tensor mom;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
-                            ctx, 2, use_exclusive_lock_, true, &mom));
+                            ctx, 2, use_exclusive_lock_, sparse, &mom));
 
     OP_REQUIRES(
         ctx, var.IsInitialized(),
@@ -3682,21 +3713,22 @@ class SparseApplyCenteredRMSPropOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
-    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
-                                                      {0, 1, 2, 3});
+    const bool sparse = true;
+    auto locks = MaybeLockVariableInputMutexesInOrder<CPUDevice, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1, 2, 3});
 
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
-                            ctx, 0, use_exclusive_lock_, true, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor mg;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
-                            ctx, 1, use_exclusive_lock_, true, &mg));
+                            ctx, 1, use_exclusive_lock_, sparse, &mg));
     Tensor ms;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
-                            ctx, 2, use_exclusive_lock_, true, &ms));
+                            ctx, 2, use_exclusive_lock_, sparse, &ms));
     Tensor mom;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
-                            ctx, 3, use_exclusive_lock_, true, &mom));
+                            ctx, 3, use_exclusive_lock_, sparse, &mom));
 
     OP_REQUIRES(
         ctx, var.IsInitialized(),
@@ -3852,15 +3884,16 @@ class ApplyAddSignOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks =
-        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+    const bool sparse = false;
+    auto locks = MaybeLockVariableInputMutexesInOrder<Device, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1});
 
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 0, use_exclusive_lock_, false, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor m;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 1, use_exclusive_lock_, false, &m));
+                            ctx, 1, use_exclusive_lock_, sparse, &m));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
@@ -3958,15 +3991,16 @@ class ApplyPowerSignOp : public OpKernel {
   }
 
   void Compute(OpKernelContext* ctx) override {
-    auto locks =
-        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+    const bool sparse = false;
+    auto locks = MaybeLockVariableInputMutexesInOrder<Device, T>(
+        ctx, use_exclusive_lock_, sparse, {0, 1});
 
     Tensor var;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 0, use_exclusive_lock_, false, &var));
+                            ctx, 0, use_exclusive_lock_, sparse, &var));
     Tensor m;
     OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
-                            ctx, 1, use_exclusive_lock_, false, &m));
+                            ctx, 1, use_exclusive_lock_, sparse, &m));
     OP_REQUIRES(
         ctx, var.IsInitialized(),
         errors::FailedPrecondition(
diff --git a/tensorflow/core/kernels/training_ops_test.cc b/tensorflow/core/kernels/training_ops_test.cc
index 1ec57b45221906bebe7366af45375cc93b08d3df..2dcc4a500e6c64753c6fde4f88582f914a50089e 100644
--- a/tensorflow/core/kernels/training_ops_test.cc
+++ b/tensorflow/core/kernels/training_ops_test.cc
@@ -151,40 +151,6 @@ static void BM_Momentum(int iters, int params) {
 }
 BENCHMARK(BM_Momentum)->Arg(128 << 10)->Arg(256 << 10);
 
-static void KerasMomentum(int32 n, Graph** init_g, Graph** train_g) {
-  TensorShape shape({n});
-  {
-    Graph* g = new Graph(OpRegistry::Global());
-    auto var = Var(g, n);
-    auto accum = Var(g, n);
-    auto zero = Zeros(g, n);
-    test::graph::Assign(g, var, zero);
-    test::graph::Assign(g, accum, zero);
-    *init_g = g;
-  }
-  {
-    Graph* g = new Graph(OpRegistry::Global());
-    auto var = Var(g, n);
-    auto accum = Var(g, n);
-    auto lr = Scalar(g, 0.01);
-    auto grad = Random(g, n);
-    auto mom = Scalar(g, 0.01);
-    test::graph::Multi(g, "ApplyKerasMomentum", {var, accum, lr, grad, mom});
-    *train_g = g;
-  }
-}
-
-static void BM_KerasMomentum(int iters, int params) {
-  const int64 tot = static_cast<int64>(iters) * params;
-  testing::ItemsProcessed(tot);
-  testing::BytesProcessed(tot * sizeof(float));
-  Graph* init;
-  Graph* train;
-  KerasMomentum(params, &init, &train);
-  test::Benchmark("cpu", train, GetOptions(), init).Run(iters);
-}
-BENCHMARK(BM_KerasMomentum)->Arg(128 << 10)->Arg(256 << 10);
-
 static void Adam(int32 n, Graph** init_g, Graph** train_g) {
   TensorShape shape({n});
   {
@@ -228,50 +194,6 @@ static void BM_Adam(int iters, int params) {
 }
 BENCHMARK(BM_Adam)->Arg(128 << 10)->Arg(256 << 10);
 
-static void AdamWithAmsgrad(int32 n, Graph** init_g, Graph** train_g) {
-  TensorShape shape({n});
-  {
-    Graph* g = new Graph(OpRegistry::Global());
-    auto var = Var(g, n);
-    auto m = Var(g, n);
-    auto v = Var(g, n);
-    auto zero = Zeros(g, n);
-    test::graph::Assign(g, var, zero);
-    test::graph::Assign(g, m, zero);
-    test::graph::Assign(g, v, zero);
-    *init_g = g;
-  }
-  {
-    Graph* g = new Graph(OpRegistry::Global());
-    auto var = Var(g, n);
-    auto m = Var(g, n);
-    auto v = Var(g, n);
-    auto vhat = Var(g, n);
-    auto beta1_power = Scalar(g, 0.9);
-    auto beta2_power = Scalar(g, 0.99);
-    auto lr = Scalar(g, 0.01);
-    auto beta1 = Scalar(g, 0.9);
-    auto beta2 = Scalar(g, 0.99);
-    auto epsilon = Scalar(g, 1e-8);
-    auto grad = Random(g, n);
-    test::graph::Multi(g, "ApplyAdamWithAmsgrad",
-                       {var, m, v, vhat, beta1_power, beta2_power, lr, beta1,
-                        beta2, epsilon, grad});
-    *train_g = g;
-  }
-}
-
-static void BM_AdamWithAmsgrad(int iters, int params) {
-  const int64 tot = static_cast<int64>(iters) * params;
-  testing::ItemsProcessed(tot);
-  testing::BytesProcessed(tot * sizeof(float));
-  Graph* init;
-  Graph* train;
-  AdamWithAmsgrad(params, &init, &train);
-  test::Benchmark("cpu", train, GetOptions(), init).Run(iters);
-}
-BENCHMARK(BM_AdamWithAmsgrad)->Arg(128 << 10)->Arg(256 << 10);
-
 static void RMSProp(int32 n, Graph** init_g, Graph** train_g) {
   TensorShape shape({n});
   {
diff --git a/tensorflow/core/kernels/transpose_op.cc b/tensorflow/core/kernels/transpose_op.cc
index 48e392c07073a9adf989fc2171222e966aede0f6..1c0d70c333f8bbef08e9a37e06694ec5ff19b20d 100644
--- a/tensorflow/core/kernels/transpose_op.cc
+++ b/tensorflow/core/kernels/transpose_op.cc
@@ -19,11 +19,11 @@ limitations under the License.
 
 #include "tensorflow/core/kernels/transpose_op.h"
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/transpose_functor.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/str_util.h"
diff --git a/tensorflow/core/kernels/tridiagonal_solve_op.cc b/tensorflow/core/kernels/tridiagonal_solve_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5884ffedfbc7b25f59e3c67da4af486ef6239c48
--- /dev/null
+++ b/tensorflow/core/kernels/tridiagonal_solve_op.cc
@@ -0,0 +1,163 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See docs in ../ops/linalg_ops.cc.
+
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/linalg_ops_common.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+static const char kErrMsg[] = "The matrix is not invertible.";
+
+template <class Scalar>
+class TridiagonalSolveOp : public LinearAlgebraOp<Scalar> {
+ public:
+  INHERIT_LINALG_TYPEDEFS(Scalar);
+
+  explicit TridiagonalSolveOp(OpKernelConstruction* context) : Base(context) {}
+
+  void ValidateInputMatrixShapes(
+      OpKernelContext* context,
+      const TensorShapes& input_matrix_shapes) const final {
+    auto num_inputs = input_matrix_shapes.size();
+    OP_REQUIRES(context, num_inputs == 2,
+                errors::InvalidArgument("Expected two input matrices, got ",
+                                        num_inputs, "."));
+
+    auto num_diags = input_matrix_shapes[0].dim_size(0);
+    OP_REQUIRES(
+        context, num_diags == 3,
+        errors::InvalidArgument("Expected diagonals to be provided as a "
+                                "matrix with 3 rows, got ",
+                                num_diags, " rows."));
+
+    auto num_eqs_left = input_matrix_shapes[0].dim_size(1);
+    auto num_eqs_right = input_matrix_shapes[1].dim_size(0);
+    OP_REQUIRES(
+        context, num_eqs_left == num_eqs_right,
+        errors::InvalidArgument("Expected the same number of left-hand sides "
+                                "and right-hand sides, got ",
+                                num_eqs_left, " and ", num_eqs_right, "."));
+  }
+
+  TensorShapes GetOutputMatrixShapes(
+      const TensorShapes& input_matrix_shapes) const final {
+    return TensorShapes({input_matrix_shapes[1]});
+  }
+
+  int64 GetCostPerUnit(const TensorShapes& input_matrix_shapes) const final {
+    const int num_eqs = static_cast<int>(input_matrix_shapes[0].dim_size(1));
+    const int num_rhss = static_cast<int>(input_matrix_shapes[1].dim_size(0));
+
+    const double add_cost = Eigen::TensorOpCost::AddCost<Scalar>();
+    const double mult_cost = Eigen::TensorOpCost::MulCost<Scalar>();
+    const double div_cost = Eigen::TensorOpCost::DivCost<Scalar>();
+
+    // Assuming cases with and without row interchange are equiprobable.
+    const double cost =
+        num_eqs * (div_cost * (num_rhss + 1) +
+                   (add_cost + mult_cost) * (2.5 * num_rhss + 1.5));
+    return cost >= static_cast<double>(kint64max) ? kint64max
+                                                  : static_cast<int64>(cost);
+  }
+
+  void ComputeMatrix(OpKernelContext* context, const ConstMatrixMaps& inputs,
+                     MatrixMaps* outputs) final {
+    const auto diagonals = inputs[0];
+
+    // Subdiagonal elements, first is ignored.
+    const auto& superdiag = diagonals.row(0);
+    // Diagonal elements.
+    const auto& diag = diagonals.row(1);
+    // Superdiagonal elements, n-th is ignored.
+    const auto& subdiag = diagonals.row(2);
+    // Right-hand sides (transposed - necessary for GPU impl).
+    const auto& rhs = inputs[1];
+
+    const int n = diag.size();
+    MatrixMap& x = outputs->at(0);
+    const Scalar zero(0);
+
+    if (n == 0) {
+      return;
+    }
+    if (n == 1) {
+      OP_REQUIRES(context, diag(0) != zero, errors::InvalidArgument(kErrMsg));
+      x.row(0) = rhs.row(0) / diag(0);
+      return;
+    }
+
+    // The three columns in u are the diagonal, superdiagonal, and second
+    // superdiagonal, respectively, of the U matrix in the LU decomposition of
+    // the input matrix (subject to row exchanges due to pivoting). For pivoted
+    // tridiagonal matrix, the U matrix has at most two non-zero superdiagonals.
+    Eigen::Array<Scalar, Eigen::Dynamic, 3> u(n, 3);
+
+    // The code below roughly follows LAPACK's dgtsv routine, with main
+    // difference being not overwriting the input.
+    u(0, 0) = diag(0);
+    u(0, 1) = superdiag(0);
+    x.row(0) = rhs.row(0);
+    for (int i = 0; i < n - 1; ++i) {
+      if (std::abs(u(i)) >= std::abs(subdiag(i + 1))) {
+        // No row interchange.
+        OP_REQUIRES(context, u(i) != zero, errors::InvalidArgument(kErrMsg));
+        const Scalar factor = subdiag(i + 1) / u(i, 0);
+        u(i + 1, 0) = diag(i + 1) - factor * u(i, 1);
+        x.row(i + 1) = rhs.row(i + 1) - factor * x.row(i);
+        if (i != n - 2) {
+          u(i + 1, 1) = superdiag(i + 1);
+          u(i, 2) = 0;
+        }
+      } else {
+        // Interchange rows i and i + 1.
+        const Scalar factor = u(i, 0) / subdiag(i + 1);
+        u(i, 0) = subdiag(i + 1);
+        u(i + 1, 0) = u(i, 1) - factor * diag(i + 1);
+        u(i, 1) = diag(i + 1);
+        x.row(i + 1) = x.row(i) - factor * rhs.row(i + 1);
+        x.row(i) = rhs.row(i + 1);
+        if (i != n - 2) {
+          u(i, 2) = superdiag(i + 1);
+          u(i + 1, 1) = -factor * superdiag(i + 1);
+        }
+      }
+    }
+    x.row(n - 1) /= u(n - 1, 0);
+    x.row(n - 2) = (x.row(n - 2) - u(n - 2, 1) * x.row(n - 1)) / u(n - 2, 0);
+    for (int i = n - 3; i >= 0; --i) {
+      x.row(i) = (x.row(i) - u(i, 1) * x.row(i + 1) - u(i, 2) * x.row(i + 2)) /
+                 u(i, 0);
+    }
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(TridiagonalSolveOp);
+};
+
+REGISTER_LINALG_OP_CPU("TridiagonalSolve", (TridiagonalSolveOp<float>), float);
+REGISTER_LINALG_OP_CPU("TridiagonalSolve", (TridiagonalSolveOp<double>),
+                       double);
+REGISTER_LINALG_OP_CPU("TridiagonalSolve", (TridiagonalSolveOp<complex64>),
+                       complex64);
+REGISTER_LINALG_OP_CPU("TridiagonalSolve", (TridiagonalSolveOp<complex128>),
+                       complex128);
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/unicode_ops.cc b/tensorflow/core/kernels/unicode_ops.cc
index 3ee0edb35a72d2e3de747fad32bb69bb2872ac80..c071db606485dbf5747c8695e299da69095c4de3 100644
--- a/tensorflow/core/kernels/unicode_ops.cc
+++ b/tensorflow/core/kernels/unicode_ops.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "unicode/unistr.h"  // TF:icu
 #include "unicode/uset.h"  // TF:icu
 #include "unicode/utypes.h"  // TF:icu
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -39,7 +40,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/string_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -350,10 +350,10 @@ class UnicodeTranscodeOp : public OpKernel {
 REGISTER_KERNEL_BUILDER(Name("UnicodeTranscode").Device(DEVICE_CPU),
                         UnicodeTranscodeOp);
 
-class UnicodeDecodeWithOffsetsOp : public OpKernel {
+class UnicodeDecodeBaseOp : public OpKernel {
  public:
-  explicit UnicodeDecodeWithOffsetsOp(OpKernelConstruction* ctx)
-      : OpKernel(ctx) {
+  explicit UnicodeDecodeBaseOp(OpKernelConstruction* ctx, bool generate_offsets)
+      : OpKernel(ctx), generate_offsets_(generate_offsets) {
     OP_REQUIRES_OK(ctx, GetErrorOptions(ctx, &error_options_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("input_encoding", &input_encoding_));
     // Make a temporary UConverter to ensure it will create without error
@@ -369,7 +369,7 @@ class UnicodeDecodeWithOffsetsOp : public OpKernel {
   }
 
   void Decode(OpKernelContext* ctx, std::vector<UChar32>* char_values,
-              std::vector<int64>* offset_values, int* string_length,
+              std::vector<int64>* offset_values, int* current_offset,
               int64* next_row_split, UChar32 char_value, int char_length,
               bool found_any_format_error) {
     if (error_options_.error_on_malformatting && found_any_format_error) {
@@ -379,7 +379,8 @@ class UnicodeDecodeWithOffsetsOp : public OpKernel {
     UChar32 decoded_value = char_value;
     if (ShouldHandleFormatError(error_options_, char_value,
                                 found_any_format_error)) {
-      if (error_options_.elide_replacement) {
+      if (error_options_.elide_replacement && (offset_values != nullptr)) {
+        *current_offset += char_length;
         return;
       } else {
         decoded_value = error_options_.subst;
@@ -390,8 +391,10 @@ class UnicodeDecodeWithOffsetsOp : public OpKernel {
     char_values->push_back(decoded_value);
 
     // Emit the byte offset
-    offset_values->push_back(*string_length);
-    *string_length += char_length;
+    if (offset_values != nullptr) {
+      offset_values->push_back(*current_offset);
+      *current_offset += char_length;
+    }
     *next_row_split += 1;
   }
 
@@ -428,42 +431,63 @@ class UnicodeDecodeWithOffsetsOp : public OpKernel {
       // the fields needed to construct a RaggedTensor.
       out_row_splits(row_split_index) = next_row_split;
       row_split_index++;
-      int string_length = 0;
+      int current_offset = 0;
       IterateUnicodeString(
           input, input_encoder->converter_,
-          std::bind(&UnicodeDecodeWithOffsetsOp::Decode, this, ctx,
-                    &char_values, &offset_values, &string_length,
-                    &next_row_split, std::placeholders::_1,
-                    std::placeholders::_2, std::placeholders::_3));
+          std::bind(&UnicodeDecodeBaseOp::Decode, this, ctx, &char_values,
+                    &offset_values, &current_offset, &next_row_split,
+                    std::placeholders::_1, std::placeholders::_2,
+                    std::placeholders::_3));
     }
     out_row_splits(row_split_index) = next_row_split;
 
-    DCHECK(offset_values.size() == char_values.size());
     Tensor* output_char_values;
     OP_REQUIRES_OK(
         ctx, ctx->allocate_output("char_values",
                                   {static_cast<int64>(char_values.size())},
                                   &output_char_values));
-    Tensor* output_offset_values;
-    OP_REQUIRES_OK(
-        ctx, ctx->allocate_output("char_to_byte_starts",
-                                  {static_cast<int64>(offset_values.size())},
-                                  &output_offset_values));
     auto out_char_values = output_char_values->vec<int32>();
-    auto out_offset_values = output_offset_values->vec<int64>();
-
-    // Load output tensors from intermediate value arrays.
-    for (int i = 0; i < char_values.size(); ++i) {
-      out_char_values(i) = static_cast<int32>(char_values[i]);
-      out_offset_values(i) = offset_values[i];
+    if (generate_offsets_) {
+      DCHECK(offset_values.size() == char_values.size());
+      Tensor* output_offset_values;
+      OP_REQUIRES_OK(
+          ctx, ctx->allocate_output("char_to_byte_starts",
+                                    {static_cast<int64>(offset_values.size())},
+                                    &output_offset_values));
+      auto out_offset_values = output_offset_values->vec<int64>();
+
+      // Load output tensors from intermediate value arrays.
+      for (int i = 0; i < char_values.size(); ++i) {
+        out_char_values(i) = static_cast<int32>(char_values[i]);
+        out_offset_values(i) = offset_values[i];
+      }
+    } else {
+      for (int i = 0; i < char_values.size(); ++i) {
+        out_char_values(i) = static_cast<int32>(char_values[i]);
+      }
     }
   }
 
  private:
   string input_encoding_;
   ErrorOptions error_options_;
+  bool generate_offsets_ = false;
+};
+
+class UnicodeDecodeOp : public UnicodeDecodeBaseOp {
+ public:
+  explicit UnicodeDecodeOp(OpKernelConstruction* ctx)
+      : UnicodeDecodeBaseOp(ctx, false) {}
+};
+
+class UnicodeDecodeWithOffsetsOp : public UnicodeDecodeBaseOp {
+ public:
+  explicit UnicodeDecodeWithOffsetsOp(OpKernelConstruction* ctx)
+      : UnicodeDecodeBaseOp(ctx, true) {}
 };
 
+REGISTER_KERNEL_BUILDER(Name("UnicodeDecode").Device(DEVICE_CPU),
+                        UnicodeDecodeOp);
 REGISTER_KERNEL_BUILDER(Name("UnicodeDecodeWithOffsets").Device(DEVICE_CPU),
                         UnicodeDecodeWithOffsetsOp);
 
diff --git a/tensorflow/core/kernels/unique_op.cc b/tensorflow/core/kernels/unique_op.cc
index 3bdcfc90b878479572ad144bc82e9dc6763a4abf..adf84bae49cf7f70577e8b22390527c6b276a170 100644
--- a/tensorflow/core/kernels/unique_op.cc
+++ b/tensorflow/core/kernels/unique_op.cc
@@ -17,11 +17,11 @@ limitations under the License.
 #include <unordered_map>
 #include <utility>
 
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/hash/hash.h"
 
diff --git a/tensorflow/core/kernels/unpack_op.cc b/tensorflow/core/kernels/unpack_op.cc
index 1e1647db5c1c41d6242cab87b0d8a8cf66d32a28..8577ce7bf792d1b724e9e0ea699accb7c2dded09 100644
--- a/tensorflow/core/kernels/unpack_op.cc
+++ b/tensorflow/core/kernels/unpack_op.cc
@@ -18,10 +18,10 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/kernels/split_lib.h"
 #include "tensorflow/core/lib/core/status.h"
diff --git a/tensorflow/core/kernels/variable_ops.cc b/tensorflow/core/kernels/variable_ops.cc
index eadea18f760b6109c6c10700285a2a2e54e4b083..00994bbe8e7142f0c8ca7a31aef7f0a540b48824 100644
--- a/tensorflow/core/kernels/variable_ops.cc
+++ b/tensorflow/core/kernels/variable_ops.cc
@@ -35,7 +35,7 @@ class LegacyVar : public ResourceBase {
   mutex* mu() { return &mu_; }
   Tensor* tensor() { return &tensor_; }
 
-  string DebugString() override {
+  string DebugString() const override {
     return strings::StrCat(DataTypeString(tensor_.dtype()), "/",
                            tensor_.shape().DebugString());
   }
@@ -116,7 +116,7 @@ class TemporaryVariableOp : public OpKernel {
     mutex mu;
     Tensor val;
     string name;
-    string DebugString() override { return name; }
+    string DebugString() const override { return name; }
     ~TmpVar() override { VLOG(3) << "TmpVar " << name << " deleted"; }
   };
 
diff --git a/tensorflow/core/kernels/where_op.cc b/tensorflow/core/kernels/where_op.cc
index 3330442ffd602c7293a4ddc3c675524698364c4e..374257d1766a04feb52fcdb07bae4cfccfc537ed 100644
--- a/tensorflow/core/kernels/where_op.cc
+++ b/tensorflow/core/kernels/where_op.cc
@@ -26,13 +26,13 @@ limitations under the License.
 #include <memory>
 #include <numeric>
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
@@ -137,8 +137,10 @@ class WhereCPUOp : public OpKernel {
     const int input_dims = input.dims();
 
     Tensor num_true;
-    OP_REQUIRES_OK(
-        context, context->allocate_temp(DT_INT64, TensorShape({}), &num_true));
+    AllocatorAttributes attr;
+    attr.set_on_host(true);
+    OP_REQUIRES_OK(context, context->allocate_temp(DT_INT64, TensorShape({}),
+                                                   &num_true, attr));
     auto num_true_t = num_true.scalar<int64>();
 
     Status s = functor::NumTrue<CPUDevice, T, int64>::Compute(
@@ -368,6 +370,12 @@ class WhereGPUOp : public AsyncOpKernel {
       Name("Where").Device(DEVICE_GPU).TypeConstraint<T>("T"), WhereGPUOp<T>);
 
 TF_CALL_WHERE_GPU_TYPES(REGISTER_GPU_WHERE_OP);
+REGISTER_KERNEL_BUILDER(Name("Where")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<int32>("T")
+                            .HostMemory("input")
+                            .HostMemory("index"),
+                        WhereCPUOp<int32>);
 
 #undef REGISTER_GPU_WHERE_OP
 
diff --git a/tensorflow/core/kernels/where_op.h b/tensorflow/core/kernels/where_op.h
index e63b3ba8cde5e284a8ef7664a4453fef343cdfa2..7297d37ffb8fc19dd924a4396b110b4e87bf795c 100644
--- a/tensorflow/core/kernels/where_op.h
+++ b/tensorflow/core/kernels/where_op.h
@@ -27,7 +27,6 @@ namespace tensorflow {
 #define TF_CALL_WHERE_GPU_TYPES(m) \
   TF_CALL_int8(m);                 \
   TF_CALL_uint8(m);                \
-  TF_CALL_int32(m);                \
   TF_CALL_int64(m);                \
   TF_CALL_float(m);                \
   TF_CALL_double(m);               \
diff --git a/tensorflow/core/kernels/where_op_gpu.cu.h b/tensorflow/core/kernels/where_op_gpu.cu.h
index 2255597651ffa17cb21650dfad28c24f15b36fc9..54b22d230ab02da46016e253cf7b75211df62e26 100644
--- a/tensorflow/core/kernels/where_op_gpu.cu.h
+++ b/tensorflow/core/kernels/where_op_gpu.cu.h
@@ -25,9 +25,9 @@ limitations under the License.
 #include "third_party/cub/device/device_select.cuh"
 #include "third_party/cub/iterator/counting_input_iterator.cuh"
 #include "third_party/cub/iterator/transform_input_iterator.cuh"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor_types.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/where_op.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/core/lib/bfloat16/bfloat16.cc b/tensorflow/core/lib/bfloat16/bfloat16.cc
index a591717fd1abfc3d959d219d9ce2bde1272fd8ea..e6e24bc078668b9290f41ce501cea8de2d423779 100644
--- a/tensorflow/core/lib/bfloat16/bfloat16.cc
+++ b/tensorflow/core/lib/bfloat16/bfloat16.cc
@@ -19,6 +19,9 @@ limitations under the License.
 
 namespace tensorflow {
 
+const uint16_t bfloat16::NAN_VALUE;
+const uint16_t bfloat16::ZERO_VALUE;
+
 B16_DEVICE_FUNC bfloat16::operator Eigen::half() const {
   return static_cast<Eigen::half>(float(*this));
 }
diff --git a/tensorflow/core/lib/bfloat16/bfloat16.h b/tensorflow/core/lib/bfloat16/bfloat16.h
index 440854658094c3be0ad113ef01d4814f9f45ca06..1294ccff2676e0cf33585ba4518002457c37e93f 100644
--- a/tensorflow/core/lib/bfloat16/bfloat16.h
+++ b/tensorflow/core/lib/bfloat16/bfloat16.h
@@ -372,6 +372,14 @@ struct bfloat16 {
     return x;
   }
 
+  static bfloat16 min_positive_normal() {
+    bfloat16 x;
+    x.value = 0x0080;  // 0x1p-126
+    return x;
+  }
+
+  bool IsZero() const { return (value & 0x7FFF) == ZERO_VALUE; }
+
   uint16_t value;
 
   // A value that represents "not a number".
diff --git a/tensorflow/core/lib/core/errors.h b/tensorflow/core/lib/core/errors.h
index d5cbe6c61674b80978ec16d5c00d3747b667e1f5..4815f7c2cc6c4197c4dbd6017213e275d38b105e 100644
--- a/tensorflow/core/lib/core/errors.h
+++ b/tensorflow/core/lib/core/errors.h
@@ -150,6 +150,10 @@ string FormatColocationNodeForError(const T& names) {
       });
 }
 
+inline string FormatFunctionForError(const string& name) {
+  return strings::StrCat("{{function_node ", name, "}}");
+}
+
 // The CanonicalCode() for non-errors.
 using ::tensorflow::error::OK;
 
diff --git a/tensorflow/core/lib/core/status.cc b/tensorflow/core/lib/core/status.cc
index cb2a06e620cab34f35d2b6398234ad8cb6d71dc9..0b63f66f6da0792b0cdba23ea3e5a4abba5e4bdc 100644
--- a/tensorflow/core/lib/core/status.cc
+++ b/tensorflow/core/lib/core/status.cc
@@ -15,6 +15,11 @@ limitations under the License.
 
 #include "tensorflow/core/lib/core/status.h"
 #include <stdio.h>
+#include <map>
+#include "tensorflow/core/lib/core/error_codes.pb.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
 
 namespace tensorflow {
 
@@ -44,68 +49,72 @@ const string& Status::empty_string() {
   return *empty;
 }
 
+string error_name(error::Code code) {
+  switch (code) {
+    case tensorflow::error::OK:
+      return "OK";
+      break;
+    case tensorflow::error::CANCELLED:
+      return "Cancelled";
+      break;
+    case tensorflow::error::UNKNOWN:
+      return "Unknown";
+      break;
+    case tensorflow::error::INVALID_ARGUMENT:
+      return "Invalid argument";
+      break;
+    case tensorflow::error::DEADLINE_EXCEEDED:
+      return "Deadline exceeded";
+      break;
+    case tensorflow::error::NOT_FOUND:
+      return "Not found";
+      break;
+    case tensorflow::error::ALREADY_EXISTS:
+      return "Already exists";
+      break;
+    case tensorflow::error::PERMISSION_DENIED:
+      return "Permission denied";
+      break;
+    case tensorflow::error::UNAUTHENTICATED:
+      return "Unauthenticated";
+      break;
+    case tensorflow::error::RESOURCE_EXHAUSTED:
+      return "Resource exhausted";
+      break;
+    case tensorflow::error::FAILED_PRECONDITION:
+      return "Failed precondition";
+      break;
+    case tensorflow::error::ABORTED:
+      return "Aborted";
+      break;
+    case tensorflow::error::OUT_OF_RANGE:
+      return "Out of range";
+      break;
+    case tensorflow::error::UNIMPLEMENTED:
+      return "Unimplemented";
+      break;
+    case tensorflow::error::INTERNAL:
+      return "Internal";
+      break;
+    case tensorflow::error::UNAVAILABLE:
+      return "Unavailable";
+      break;
+    case tensorflow::error::DATA_LOSS:
+      return "Data loss";
+      break;
+    default:
+      char tmp[30];
+      snprintf(tmp, sizeof(tmp), "Unknown code(%d)", static_cast<int>(code));
+      return tmp;
+      break;
+  }
+}
+
 string Status::ToString() const {
   if (state_ == nullptr) {
     return "OK";
   } else {
-    char tmp[30];
-    const char* type;
-    switch (code()) {
-      case tensorflow::error::CANCELLED:
-        type = "Cancelled";
-        break;
-      case tensorflow::error::UNKNOWN:
-        type = "Unknown";
-        break;
-      case tensorflow::error::INVALID_ARGUMENT:
-        type = "Invalid argument";
-        break;
-      case tensorflow::error::DEADLINE_EXCEEDED:
-        type = "Deadline exceeded";
-        break;
-      case tensorflow::error::NOT_FOUND:
-        type = "Not found";
-        break;
-      case tensorflow::error::ALREADY_EXISTS:
-        type = "Already exists";
-        break;
-      case tensorflow::error::PERMISSION_DENIED:
-        type = "Permission denied";
-        break;
-      case tensorflow::error::UNAUTHENTICATED:
-        type = "Unauthenticated";
-        break;
-      case tensorflow::error::RESOURCE_EXHAUSTED:
-        type = "Resource exhausted";
-        break;
-      case tensorflow::error::FAILED_PRECONDITION:
-        type = "Failed precondition";
-        break;
-      case tensorflow::error::ABORTED:
-        type = "Aborted";
-        break;
-      case tensorflow::error::OUT_OF_RANGE:
-        type = "Out of range";
-        break;
-      case tensorflow::error::UNIMPLEMENTED:
-        type = "Unimplemented";
-        break;
-      case tensorflow::error::INTERNAL:
-        type = "Internal";
-        break;
-      case tensorflow::error::UNAVAILABLE:
-        type = "Unavailable";
-        break;
-      case tensorflow::error::DATA_LOSS:
-        type = "Data loss";
-        break;
-      default:
-        snprintf(tmp, sizeof(tmp), "Unknown code(%d)",
-                 static_cast<int>(code()));
-        type = tmp;
-        break;
-    }
-    string result(type);
+    string result(error_name(code()));
     result += ": ";
     result += state_->msg;
     return result;
@@ -131,4 +140,100 @@ string* TfCheckOpHelperOutOfLine(const ::tensorflow::Status& v,
   return new string(r);
 }
 
+void StatusGroup::Update(const Status& s) {
+  if (s.ok()) {
+    ++num_ok_;
+  } else {
+    ok_ = false;
+    children_.push_back(s);
+  }
+}
+
+const int kMaxChildMessageSize = 2048;
+
+Status StatusGroup::as_status() const {
+  if (ok_) {
+    return Status::OK();
+  }
+
+  // Reduce verbosity when handling duplicate messages. If there is only a
+  // single message, or all messages have similar content, then return the
+  // longest status message.
+  std::vector<Status> sorted_children(children_);
+  std::sort(sorted_children.begin(), sorted_children.end(),
+            [](const Status& a, const Status& b) {
+              return a.error_message().length() > b.error_message().length();
+            });
+  bool single_status = true;
+  for (const auto& s : sorted_children) {
+    if (s.code() != sorted_children[0].code() ||
+        sorted_children[0].error_message().find(s.error_message()) ==
+            string::npos) {
+      single_status = false;
+      break;
+    }
+  }
+
+  if (single_status) {
+    return sorted_children[0];
+  }
+
+  std::vector<string> fmt;
+
+  // Compute a final output string with status codes sorted by frequency in
+  // increasing order.  This prefers more "interesting" messages over child
+  // messages that may come from cancellation.
+  std::map<error::Code, std::vector<Status>> code_to_status;
+  for (const Status& s : children_) {
+    code_to_status[s.code()].push_back(s);
+  }
+
+  std::vector<std::pair<error::Code, int>> count_vec;
+  count_vec.reserve(code_to_status.size());
+  for (auto& p : code_to_status) {
+    count_vec.push_back(std::make_pair(p.first, p.second.size()));
+  }
+
+  std::sort(
+      count_vec.begin(), count_vec.end(),
+      [](const std::pair<error::Code, int>& a,
+         const std::pair<error::Code, int>& b) { return a.second < b.second; });
+
+  fmt.push_back(
+      strings::Printf("Combined status information from %zu operations:\n",
+                      num_ok_ + children_.size()));
+
+  for (const auto& p : count_vec) {
+    // Deduplicate error messages
+    std::map<string, int> child_errors;
+    for (const Status& s : code_to_status[p.first]) {
+      ++child_errors[s.error_message()];
+    }
+
+    string child_fmt;
+    for (auto& m : child_errors) {
+      child_fmt.append(strings::Printf(
+          "  %s [%dx]",
+          str_util::StringReplace(m.first, "\n", "\n  ", true).c_str(),
+          m.second));
+      child_fmt.append("\n");
+    }
+    // Strip last newline.
+    child_fmt = child_fmt.substr(0, child_fmt.size() - 1);
+
+    if (child_fmt.size() > kMaxChildMessageSize) {
+      child_fmt =
+          strings::StrCat(child_fmt.substr(0, kMaxChildMessageSize), "...");
+    }
+    fmt.push_back(strings::Printf("Status code: %s [%dx]\n%s",
+                                  error_name(p.first).c_str(), p.second,
+                                  child_fmt.c_str()));
+  }
+
+  fmt.push_back(strings::Printf("(%zd successful operations.)", num_ok_));
+
+  // TODO(power): use the least-frequently occurring status for the return code
+  return Status(children_[0].code(), str_util::Join(fmt, "\n"));
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/core/status.h b/tensorflow/core/lib/core/status.h
index eb0ff555a5f2d8f6464067c51e6ac197fa1aab2c..fe3eec1be00ff7a48b5166b9b9f2d1eb18dd03cd 100644
--- a/tensorflow/core/lib/core/status.h
+++ b/tensorflow/core/lib/core/status.h
@@ -97,6 +97,26 @@ class Status {
   void SlowCopyFrom(const State* src);
 };
 
+// Helper class to manage multiple child status values.
+class StatusGroup {
+ public:
+  // Return a merged status with combined child status messages.
+  //
+  // The status code returned is OK if all children were successful, otherwise
+  // the first non-OK child status code is reported.
+  Status as_status() const;
+
+  bool ok() const { return ok_; }
+
+  // Augment this group with the child status `status`.
+  void Update(const Status& status);
+
+ private:
+  bool ok_ = true;
+  size_t num_ok_ = 0;
+  std::vector<Status> children_;
+};
+
 inline Status::Status(const Status& s)
     : state_((s.state_ == NULL) ? NULL : new State(*s.state_)) {}
 
diff --git a/tensorflow/core/lib/core/status_test.cc b/tensorflow/core/lib/core/status_test.cc
index d95d8f20aa354603f37358c7047f6171cca08f1c..7c28184080406ee97dbdad01143619323bfe2325 100644
--- a/tensorflow/core/lib/core/status_test.cc
+++ b/tensorflow/core/lib/core/status_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 
@@ -97,6 +98,74 @@ TEST(Status, EqualsDifferentMessage) {
   ASSERT_NE(a, b);
 }
 
+TEST(StatusGroup, AcceptsFirstCode) {
+  StatusGroup c;
+  const Status internal(errors::Internal("Original error."));
+  c.Update(internal);
+  c.Update(Status::OK());
+  c.Update(Status::OK());
+  c.Update(Status::OK());
+  ASSERT_EQ(c.as_status().code(), internal.code());
+  ASSERT_EQ(c.ok(), false);
+}
+
+TEST(StatusGroup, ContainsChildMessages) {
+  StatusGroup c;
+  const Status internal(errors::Internal("Original error."));
+  const Status cancelled(errors::Cancelled("Cancelled after 10 steps."));
+  const Status aborted(errors::Aborted("Aborted after 10 steps."));
+  c.Update(internal);
+  for (size_t i = 0; i < 5; ++i) {
+    c.Update(cancelled);
+  }
+  for (size_t i = 0; i < 10; ++i) {
+    c.Update(aborted);
+  }
+  for (size_t i = 0; i < 100; ++i) {
+    c.Update(Status::OK());
+  }
+
+  ASSERT_EQ(c.as_status().code(), internal.code());
+  EXPECT_TRUE(str_util::StrContains(c.as_status().error_message(),
+                                    internal.error_message()));
+  EXPECT_TRUE(str_util::StrContains(c.as_status().error_message(),
+                                    cancelled.error_message()));
+  EXPECT_TRUE(str_util::StrContains(c.as_status().error_message(),
+                                    aborted.error_message()));
+  StatusGroup d;
+  d.Update(c.as_status());
+  c.Update(errors::FailedPrecondition("Failed!"));
+  d.Update(c.as_status());
+  c.Update(errors::DataLoss("Data loss!"));
+  d.Update(c.as_status());
+  LOG(INFO) << d.as_status();
+}
+
+TEST(StatusGroup, ContainsIdenticalMessage) {
+  StatusGroup sg;
+  const Status internal(errors::Internal("Original error"));
+  for (size_t i = 0; i < 10; i++) {
+    sg.Update(internal);
+  }
+  EXPECT_EQ(sg.as_status(), internal);
+}
+
+TEST(StatusGroup, ContainsCommonPrefix) {
+  StatusGroup sg;
+  const Status a(errors::Internal("Original error"));
+  const Status b(errors::Internal("Original error is"));
+  const Status c(errors::Internal("Original error is invalid"));
+  sg.Update(a);
+  sg.Update(c);
+  sg.Update(c);
+  sg.Update(b);
+  sg.Update(c);
+  sg.Update(b);
+  sg.Update(a);
+  sg.Update(b);
+  EXPECT_EQ(sg.as_status(), c);
+}
+
 static void BM_TF_CHECK_OK(int iters) {
   tensorflow::Status s =
       (iters < 0) ? errors::InvalidArgument("Invalid") : Status::OK();
diff --git a/tensorflow/core/lib/gif/gif_io.cc b/tensorflow/core/lib/gif/gif_io.cc
index 9a5215320f58d10c22872c2837e882bed82f5b52..dc5406920a4b8e624fb104f53108cd456f467d76 100644
--- a/tensorflow/core/lib/gif/gif_io.cc
+++ b/tensorflow/core/lib/gif/gif_io.cc
@@ -82,9 +82,20 @@ uint8* Decode(const void* srcdata, int datasize,
     return nullptr;
   }
 
+  // Don't request more memory than needed for each frame, preventing OOM
+  int max_frame_width = 0;
+  int max_frame_height = 0;
+  for (int k = 0; k < gif_file->ImageCount; k++) {
+    SavedImage* si = &gif_file->SavedImages[k];
+    if (max_frame_height < si->ImageDesc.Height)
+      max_frame_height = si->ImageDesc.Height;
+    if (max_frame_width < si->ImageDesc.Width)
+      max_frame_width = si->ImageDesc.Width;
+  }
+
   const int num_frames = gif_file->ImageCount;
-  const int width = gif_file->SWidth;
-  const int height = gif_file->SHeight;
+  const int width = max_frame_width;
+  const int height = max_frame_height;
   const int channel = 3;
 
   uint8* const dstdata = allocate_output(num_frames, width, height, channel);
@@ -129,6 +140,10 @@ uint8* Decode(const void* srcdata, int datasize,
     ColorMapObject* color_map = this_image->ImageDesc.ColorMap
                                     ? this_image->ImageDesc.ColorMap
                                     : gif_file->SColorMap;
+    if (color_map == nullptr) {
+      *error_string = strings::StrCat("missing color map for frame ", k);
+      return nullptr;
+    }
 
     for (int i = imgTop; i < imgBottom; ++i) {
       uint8* p_dst = this_dst + i * width * channel;
@@ -136,6 +151,14 @@ uint8* Decode(const void* srcdata, int datasize,
         GifByteType color_index =
             this_image->RasterBits[(i - img_desc->Top) * (img_desc->Width) +
                                    (j - img_desc->Left)];
+
+        if (color_index >= color_map->ColorCount) {
+          *error_string = strings::StrCat("found color index ", color_index,
+                                          " outside of color map range ",
+                                          color_map->ColorCount);
+          return nullptr;
+        }
+
         const GifColorType& gif_color = color_map->Colors[color_index];
         p_dst[j * channel + 0] = gif_color.Red;
         p_dst[j * channel + 1] = gif_color.Green;
diff --git a/tensorflow/core/lib/gtl/int_type_test.cc b/tensorflow/core/lib/gtl/int_type_test.cc
index 61d364017cb90933e8e9af7e800db4a6988d8442..89d2d0e8fe8ac652d976477722ed850785a5ba9a 100644
--- a/tensorflow/core/lib/gtl/int_type_test.cc
+++ b/tensorflow/core/lib/gtl/int_type_test.cc
@@ -45,7 +45,7 @@ typedef ::testing::Types<Int8_IT, UInt8_IT, Int16_IT, UInt16_IT, Int32_IT,
                          Int64_IT, UInt64_IT, Long_IT>
     SupportedIntTypes;
 
-TYPED_TEST_CASE(IntTypeTest, SupportedIntTypes);
+TYPED_TEST_SUITE(IntTypeTest, SupportedIntTypes);
 
 TYPED_TEST(IntTypeTest, TestInitialization) {
   constexpr typename TestFixture::T a;
diff --git a/tensorflow/core/lib/gtl/stl_util.h b/tensorflow/core/lib/gtl/stl_util.h
index ffeca4e88a93936ee6a1711afec735d97d04172e..853a290bf6383c679ddc9c00dbce38d18d3d35b6 100644
--- a/tensorflow/core/lib/gtl/stl_util.h
+++ b/tensorflow/core/lib/gtl/stl_util.h
@@ -23,9 +23,12 @@ limitations under the License.
 #include <iterator>
 #include <memory>
 #include <string>
+#include <type_traits>
 #include <utility>
 #include <vector>
 
+#include "absl/meta/type_traits.h"
+
 namespace tensorflow {
 namespace gtl {
 
@@ -48,16 +51,38 @@ inline const T* vector_as_array(const std::vector<T, Allocator>* v) {
   return v->data();
 }
 
+namespace gtl_internal {
+
+// HasMember is true_type or false_type, depending on whether or not
+// T has a __resize_default_init member. Resize will call the
+// __resize_default_init member if it exists, and will call the resize
+// member otherwise.
+template <typename string_type, typename = void>
+struct ResizeUninitializedTraits {
+  using HasMember = std::false_type;
+  static void Resize(string_type* s, size_t new_size) { s->resize(new_size); }
+};
+
+// __resize_default_init is provided by libc++ >= 8.0 and by Google's internal
+// ::string implementation.
+template <typename string_type>
+struct ResizeUninitializedTraits<
+    string_type, absl::void_t<decltype(std::declval<string_type&>()
+                                           .__resize_default_init(237))> > {
+  using HasMember = std::true_type;
+  static void Resize(string_type* s, size_t new_size) {
+    s->__resize_default_init(new_size);
+  }
+};
+
+}  // namespace gtl_internal
+
 // Like str->resize(new_size), except any new characters added to "*str" as a
 // result of resizing may be left uninitialized, rather than being filled with
 // '0' bytes. Typically used when code is then going to overwrite the backing
-// store of the string with known data. Uses a Google extension to ::string.
+// store of the string with known data.
 inline void STLStringResizeUninitialized(string* s, size_t new_size) {
-#if __google_stl_resize_uninitialized_string
-  s->resize_uninitialized(new_size);
-#else
-  s->resize(new_size);
-#endif
+  gtl_internal::ResizeUninitializedTraits<string>::Resize(s, new_size);
 }
 
 // Calls delete (non-array version) on the SECOND item (pointer) in each pair in
diff --git a/tensorflow/core/lib/io/recordio_test.cc b/tensorflow/core/lib/io/recordio_test.cc
index 946d7188d3b8a66ae7059a050912868087c4fa50..e6a2e4a0662e80040c019599c9e50a706a48c393 100644
--- a/tensorflow/core/lib/io/recordio_test.cc
+++ b/tensorflow/core/lib/io/recordio_test.cc
@@ -62,6 +62,10 @@ class StringDest : public WritableFile {
     contents_->append(slice.data(), slice.size());
     return Status::OK();
   }
+  Status Tell(int64* pos) override {
+    *pos = contents_->size();
+    return Status::OK();
+  }
 
  private:
   string* contents_;
diff --git a/tensorflow/core/lib/io/table_test.cc b/tensorflow/core/lib/io/table_test.cc
index 9cebbf40c67e1e56f3a4df6bdb94911eea1334b9..d57135be720bb631f5277df71e3d230464de75ec 100644
--- a/tensorflow/core/lib/io/table_test.cc
+++ b/tensorflow/core/lib/io/table_test.cc
@@ -96,7 +96,14 @@ class StringSink : public WritableFile {
 
   Status Close() override { return Status::OK(); }
   Status Flush() override { return Status::OK(); }
+  Status Name(StringPiece* result) const override {
+    return errors::Unimplemented("StringSink does not support Name()");
+  }
   Status Sync() override { return Status::OK(); }
+  Status Tell(int64* pos) override {
+    *pos = contents_.size();
+    return Status::OK();
+  }
 
   Status Append(StringPiece data) override {
     contents_.append(data.data(), data.size());
@@ -116,6 +123,10 @@ class StringSource : public RandomAccessFile {
 
   uint64 Size() const { return contents_.size(); }
 
+  Status Name(StringPiece* result) const override {
+    return errors::Unimplemented("StringSource does not support Name()");
+  }
+
   Status Read(uint64 offset, size_t n, StringPiece* result,
               char* scratch) const override {
     if (offset > contents_.size()) {
diff --git a/tensorflow/core/lib/io/zlib_outputbuffer.cc b/tensorflow/core/lib/io/zlib_outputbuffer.cc
index cba139e6ad21e1cd8f75ffc55341ca8e8e2fe2f7..aa7bdab03a37013bb0d46f18c5641044cbc40883 100644
--- a/tensorflow/core/lib/io/zlib_outputbuffer.cc
+++ b/tensorflow/core/lib/io/zlib_outputbuffer.cc
@@ -197,6 +197,10 @@ Status ZlibOutputBuffer::Flush() {
   return Status::OK();
 }
 
+Status ZlibOutputBuffer::Name(StringPiece* result) const {
+  return file_->Name(result);
+}
+
 Status ZlibOutputBuffer::Sync() {
   TF_RETURN_IF_ERROR(Flush());
   return file_->Sync();
@@ -225,5 +229,7 @@ Status ZlibOutputBuffer::Deflate(int flush) {
   return errors::DataLoss(error_string);
 }
 
+Status ZlibOutputBuffer::Tell(int64* position) { return file_->Tell(position); }
+
 }  // namespace io
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/io/zlib_outputbuffer.h b/tensorflow/core/lib/io/zlib_outputbuffer.h
index ccad2fda44b9e1f3e1fd6c639fdc3ddbe0761642..e3d2aec37eac056a19bd425e6ea35a1f0b1f4b2c 100644
--- a/tensorflow/core/lib/io/zlib_outputbuffer.h
+++ b/tensorflow/core/lib/io/zlib_outputbuffer.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include <string>
 
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/io/zlib_compression_options.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/file_system.h"
@@ -77,9 +78,16 @@ class ZlibOutputBuffer : public WritableFile {
   // will fail.
   Status Close() override;
 
+  // Returns the name of the underlying file.
+  Status Name(StringPiece* result) const override;
+
   // Deflates any cached input, writes all output to file and syncs it.
   Status Sync() override;
 
+  // Returns the write position in the underlying file. The position does not
+  // reflect buffered, un-flushed data.
+  Status Tell(int64* position) override;
+
  private:
   WritableFile* file_;  // Not owned
   Status init_status_;
diff --git a/tensorflow/core/lib/jpeg/jpeg_mem.cc b/tensorflow/core/lib/jpeg/jpeg_mem.cc
index f7a359eb5b30804834ec7d5368d91c2074faf8a5..1b54caf28e59b0f95dced8eaa7b757516ed203fa 100644
--- a/tensorflow/core/lib/jpeg/jpeg_mem.cc
+++ b/tensorflow/core/lib/jpeg/jpeg_mem.cc
@@ -81,6 +81,12 @@ bool IsCropWindowValid(const UncompressFlags& flags, int input_image_width,
          flags.crop_x + flags.crop_width <= input_image_width;
 }
 
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+// If in fuzzing mode, don't print any error message as that slows down fuzzing.
+// See also http://llvm.org/docs/LibFuzzer.html#fuzzer-friendly-build-mode
+void no_print(j_common_ptr cinfo) {}
+#endif
+
 uint8* UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) {
   // unpack the argball
   const int datasize = argball->datasize_;
@@ -112,9 +118,14 @@ uint8* UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) {
   struct jpeg_decompress_struct cinfo;
   struct jpeg_error_mgr jerr;
   cinfo.err = jpeg_std_error(&jerr);
+  jerr.error_exit = CatchError;
+
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+  jerr.output_message = no_print;
+#endif
+
   jmp_buf jpeg_jmpbuf;
   cinfo.client_data = &jpeg_jmpbuf;
-  jerr.error_exit = CatchError;
   if (setjmp(jpeg_jmpbuf)) {
     delete[] tempdata;
     return nullptr;
@@ -157,7 +168,8 @@ uint8* UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) {
   jpeg_calc_output_dimensions(&cinfo);
 
   int64 total_size = static_cast<int64>(cinfo.output_height) *
-                     static_cast<int64>(cinfo.output_width);
+                     static_cast<int64>(cinfo.output_width) *
+                     static_cast<int64>(cinfo.num_components);
   // Some of the internal routines do not gracefully handle ridiculously
   // large images, so fail fast.
   if (cinfo.output_width <= 0 || cinfo.output_height <= 0) {
diff --git a/tensorflow/core/lib/monitoring/collection_registry_test.cc b/tensorflow/core/lib/monitoring/collection_registry_test.cc
index ca25f508da9635f02941c99c768947927fd97493..ce87e4dcae65e5a48074e00a6f49f79c1dc76c61 100644
--- a/tensorflow/core/lib/monitoring/collection_registry_test.cc
+++ b/tensorflow/core/lib/monitoring/collection_registry_test.cc
@@ -81,14 +81,6 @@ TEST(CollectionRegistryDeathTest, DuplicateRegistration) {
       "/tensorflow/metric");
 }
 
-TEST(CollectMetricsTest, NoMetrics) {
-  auto* collection_registry = CollectionRegistry::Default();
-  const std::unique_ptr<CollectedMetrics> collected_metrics =
-      collection_registry->CollectMetrics({});
-  EXPECT_EQ(0, collected_metrics->metric_descriptor_map.size());
-  EXPECT_EQ(0, collected_metrics->point_set_map.size());
-}
-
 TEST(CollectMetricsTest, Counter) {
   auto counter_with_labels = std::unique_ptr<Counter<2>>(
       Counter<2>::New("/tensorflow/test/counter_with_labels",
@@ -111,7 +103,7 @@ TEST(CollectMetricsTest, Counter) {
         collection_registry->CollectMetrics(options);
 
     if (collect_metric_descriptors) {
-      ASSERT_EQ(2, collected_metrics->metric_descriptor_map.size());
+      ASSERT_GE(collected_metrics->metric_descriptor_map.size(), 2);
 
       const MetricDescriptor& ld = *collected_metrics->metric_descriptor_map.at(
           "/tensorflow/test/counter_with_labels");
@@ -134,7 +126,7 @@ TEST(CollectMetricsTest, Counter) {
       EXPECT_EQ(0, collected_metrics->metric_descriptor_map.size());
     }
 
-    ASSERT_EQ(2, collected_metrics->point_set_map.size());
+    ASSERT_GE(collected_metrics->point_set_map.size(), 2);
 
     const PointSet& lps = *collected_metrics->point_set_map.at(
         "/tensorflow/test/counter_with_labels");
@@ -201,7 +193,7 @@ TEST(CollectMetricsTest, Gauge) {
         collection_registry->CollectMetrics(options);
 
     if (collect_metric_descriptors) {
-      ASSERT_EQ(2, collected_metrics->metric_descriptor_map.size());
+      ASSERT_GE(collected_metrics->metric_descriptor_map.size(), 2);
 
       const MetricDescriptor& ld = *collected_metrics->metric_descriptor_map.at(
           "/tensorflow/test/string_gauge_with_labels");
@@ -224,7 +216,7 @@ TEST(CollectMetricsTest, Gauge) {
       EXPECT_EQ(0, collected_metrics->metric_descriptor_map.size());
     }
 
-    ASSERT_EQ(2, collected_metrics->point_set_map.size());
+    ASSERT_GE(collected_metrics->point_set_map.size(), 2);
 
     const PointSet& lps = *collected_metrics->point_set_map.at(
         "/tensorflow/test/string_gauge_with_labels");
@@ -307,7 +299,7 @@ TEST(CollectMetricsTest, Sampler) {
         collection_registry->CollectMetrics(options);
 
     if (collect_metric_descriptors) {
-      ASSERT_EQ(2, collected_metrics->metric_descriptor_map.size());
+      ASSERT_GE(collected_metrics->metric_descriptor_map.size(), 2);
 
       const MetricDescriptor& ld = *collected_metrics->metric_descriptor_map.at(
           "/tensorflow/test/sampler_with_labels");
@@ -330,7 +322,7 @@ TEST(CollectMetricsTest, Sampler) {
       EXPECT_EQ(0, collected_metrics->metric_descriptor_map.size());
     }
 
-    ASSERT_EQ(2, collected_metrics->point_set_map.size());
+    ASSERT_GE(collected_metrics->point_set_map.size(), 2);
 
     const PointSet& lps = *collected_metrics->point_set_map.at(
         "/tensorflow/test/sampler_with_labels");
diff --git a/tensorflow/core/lib/random/philox_random.h b/tensorflow/core/lib/random/philox_random.h
index 058ed95ffb43586b78f8d82e03b5cf420cfb28f2..f4bbc689d477694e426ea6edc889dd22e2101831 100644
--- a/tensorflow/core/lib/random/philox_random.h
+++ b/tensorflow/core/lib/random/philox_random.h
@@ -49,6 +49,7 @@ namespace random {
 template <typename T, int ElementCount>
 class Array {
  public:
+  static const int kElementCount = ElementCount;
   PHILOX_DEVICE_INLINE Array() {
     for (int i = 0; i < ElementCount; ++i) {
       data_[i] = T(0);
@@ -131,6 +132,12 @@ class PhiloxRandom {
   PHILOX_DEVICE_INLINE
   PhiloxRandom(ResultType counter, Key key) : counter_(counter), key_(key) {}
 
+  PHILOX_DEVICE_INLINE
+  ResultType const& counter() const { return counter_; }
+
+  PHILOX_DEVICE_INLINE
+  Key const& key() const { return key_; }
+
   // Skip the specified number of samples of 128-bits in the current stream.
   PHILOX_DEVICE_INLINE
   void Skip(uint64 count) {
diff --git a/tensorflow/core/lib/strings/proto_serialization.cc b/tensorflow/core/lib/strings/proto_serialization.cc
index 5c1fbda2155492c00049f52ce12ae8da665cbda0..2341d3e341d72fe8c385f3abd441dc7c692d9759 100644
--- a/tensorflow/core/lib/strings/proto_serialization.cc
+++ b/tensorflow/core/lib/strings/proto_serialization.cc
@@ -14,20 +14,65 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/lib/strings/proto_serialization.h"
 
+#include <cstring>
+#include "absl/memory/memory.h"
+#include "absl/strings/string_view.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/macros.h"
 
 namespace tensorflow {
+namespace {
+static const int kInlinedBufferSize = 256;
+}  // namespace
 
 bool SerializeToStringDeterministic(const protobuf::MessageLite& msg,
                                     string* result) {
-  DCHECK_LE(msg.ByteSizeLong(), static_cast<size_t>(INT_MAX));
-  const int size = static_cast<int>(msg.ByteSizeLong());
+  const size_t size = msg.ByteSizeLong();
+  DCHECK_LE(size, static_cast<size_t>(INT_MAX));
   *result = string(size, '\0');
-  protobuf::io::ArrayOutputStream array_stream(&(*result)[0], size);
+  return SerializeToBufferDeterministic(msg, const_cast<char*>(result->data()),
+                                        result->size());
+}
+
+bool SerializeToBufferDeterministic(const protobuf::MessageLite& msg,
+                                    char* buffer, size_t size) {
+  DCHECK(msg.ByteSizeLong() == size && size <= static_cast<size_t>(INT_MAX));
+  protobuf::io::ArrayOutputStream array_stream(buffer, size);
   protobuf::io::CodedOutputStream output_stream(&array_stream);
   output_stream.SetSerializationDeterministic(true);
   msg.SerializeWithCachedSizes(&output_stream);
   return !output_stream.HadError() && size == output_stream.ByteCount();
 }
 
+bool AreSerializedProtosEqual(const protobuf::MessageLite& x,
+                              const protobuf::MessageLite& y) {
+  const size_t size = x.ByteSizeLong();
+  if (size != y.ByteSizeLong()) return false;
+  if (size == 0) return true;
+  gtl::InlinedVector<char, kInlinedBufferSize> x_serialized(size);
+  bool success_x = SerializeToBufferDeterministic(x, x_serialized.data(), size);
+  DCHECK(success_x);
+  gtl::InlinedVector<char, kInlinedBufferSize> y_serialized(size);
+  bool success_y = SerializeToBufferDeterministic(y, y_serialized.data(), size);
+  DCHECK(success_y);
+  return memcmp(x_serialized.data(), y_serialized.data(), size) == 0;
+}
+
+uint64 DeterministicProtoHash64(const protobuf::MessageLite& proto,
+                                uint64 seed) {
+  const size_t size = proto.ByteSizeLong();
+  gtl::InlinedVector<char, kInlinedBufferSize> serialized(size);
+  SerializeToBufferDeterministic(proto, serialized.data(), size);
+  return Hash64(serialized.data(), size, seed);
+}
+
+uint64 DeterministicProtoHash64(const protobuf::MessageLite& proto) {
+  const size_t size = proto.ByteSizeLong();
+  gtl::InlinedVector<char, kInlinedBufferSize> serialized(size);
+  SerializeToBufferDeterministic(proto, serialized.data(), size);
+  return Hash64(serialized.data(), size);
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/strings/proto_serialization.h b/tensorflow/core/lib/strings/proto_serialization.h
index 6664928e2818c747268ec1c361acce6bcf6c862e..763bd68c1bf8b8cc709d5a01a308550ffefeb743 100644
--- a/tensorflow/core/lib/strings/proto_serialization.h
+++ b/tensorflow/core/lib/strings/proto_serialization.h
@@ -28,6 +28,21 @@ namespace tensorflow {
 bool SerializeToStringDeterministic(const protobuf::MessageLite& msg,
                                     string* result);
 
+// As above, but takes a pre-allocated buffer wrapped by result.
+// PRECONDITION: size == msg.ByteSizeLong() && size <= INT_MAX.
+bool SerializeToBufferDeterministic(const protobuf::MessageLite& msg,
+                                    char* buffer, size_t size);
+
+// Returns true if serializing x and y using
+// SerializeToBufferDeterministic() yields identical strings.
+bool AreSerializedProtosEqual(const protobuf::MessageLite& x,
+                              const protobuf::MessageLite& y);
+
+// Computes Hash64 of the output of SerializeToBufferDeterministic().
+uint64 DeterministicProtoHash64(const protobuf::MessageLite& proto);
+uint64 DeterministicProtoHash64(const protobuf::MessageLite& proto,
+                                uint64 seed);
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_LIB_STRINGS_PROTO_SERIALIZATION_H_
diff --git a/tensorflow/core/lib/strings/proto_serialization_test.cc b/tensorflow/core/lib/strings/proto_serialization_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..81a6f08ae9bf668951103c3f45d5efac527a8a94
--- /dev/null
+++ b/tensorflow/core/lib/strings/proto_serialization_test.cc
@@ -0,0 +1,94 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/lib/strings/proto_serialization.h"
+
+#include <string>
+#include "absl/memory/memory.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+namespace {
+GraphDef MakeGraphDef(int num_nodes) {
+  GraphDef graph_def;
+  for (int i = 0; i < num_nodes; ++i) {
+    NodeDef* node = graph_def.add_node();
+    node->set_name(strings::StrCat("node", i));
+    node->set_op(strings::StrCat("op", i % 10));
+    (*node->mutable_attr())["foo"].set_f(3.14f);
+    (*node->mutable_attr())["bar"].set_s("baz");
+  }
+  return graph_def;
+}
+}  // namespace
+
+static void BM_ProtoSerializationToString(int iters, int num_nodes) {
+  testing::StopTiming();
+  GraphDef graph_def = MakeGraphDef(num_nodes);
+  testing::StartTiming();
+  for (int i = 0; i < iters; ++i) {
+    string serialized;
+    testing::DoNotOptimize(
+        SerializeToStringDeterministic(graph_def, &serialized));
+  }
+  testing::StopTiming();
+}
+BENCHMARK(BM_ProtoSerializationToString)->Range(1, 10000);
+
+static void BM_ProtoSerializationToBuffer(int iters, int num_nodes) {
+  testing::StopTiming();
+  GraphDef graph_def = MakeGraphDef(num_nodes);
+  testing::StartTiming();
+  const size_t size = graph_def.ByteSizeLong();
+  for (int i = 0; i < iters; ++i) {
+    gtl::InlinedVector<char, 1024> buf(size);
+    testing::DoNotOptimize(
+        SerializeToBufferDeterministic(graph_def, buf.data(), size));
+  }
+  testing::StopTiming();
+}
+BENCHMARK(BM_ProtoSerializationToBuffer)->Range(1, 10000);
+
+static void BM_DeterministicProtoHash64(int iters, int num_nodes) {
+  testing::StopTiming();
+  GraphDef graph_def = MakeGraphDef(num_nodes);
+  testing::StartTiming();
+  for (int i = 0; i < iters; ++i) {
+    testing::DoNotOptimize(DeterministicProtoHash64(graph_def));
+  }
+  testing::StopTiming();
+}
+BENCHMARK(BM_DeterministicProtoHash64)->Range(1, 10000);
+
+static void BM_AreSerializedProtosEqual(int iters, int num_nodes) {
+  testing::StopTiming();
+  GraphDef graph_def_a = MakeGraphDef(num_nodes);
+  GraphDef graph_def_b = MakeGraphDef(num_nodes);
+  graph_def_b.mutable_node(0)->mutable_name()[0] = 'l';
+  testing::StartTiming();
+  for (int i = 0; i < iters; ++i) {
+    testing::DoNotOptimize(AreSerializedProtosEqual(graph_def_a, graph_def_a));
+  }
+  testing::StopTiming();
+}
+BENCHMARK(BM_AreSerializedProtosEqual)->Range(1, 10000);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/nccl/BUILD b/tensorflow/core/nccl/BUILD
index 4be33b2a0cf10a2525f9a93b5d4942b381d92629..a19e1af888405aa1de9e9a4ca519b895c369cfdf 100644
--- a/tensorflow/core/nccl/BUILD
+++ b/tensorflow/core/nccl/BUILD
@@ -20,9 +20,11 @@ cc_library(
     name = "nccl_lib",
     srcs = if_cuda([
         "nccl_manager.cc",
-        "nccl_manager.h",
         "nccl_rewrite.cc",
     ]),
+    hdrs = if_cuda([
+        "nccl_manager.h",
+    ]),
     copts = tf_copts(),
     deps = if_cuda([
         "@local_config_nccl//:nccl",
diff --git a/tensorflow/core/nccl/nccl_manager.cc b/tensorflow/core/nccl/nccl_manager.cc
index df49bf1b976726b3c1cbc3917c881dbc380f2f9a..a0b602f301c976acca2c5887de0452210f15acd7 100644
--- a/tensorflow/core/nccl/nccl_manager.cc
+++ b/tensorflow/core/nccl/nccl_manager.cc
@@ -63,7 +63,7 @@ struct NcclManager::NcclStream {
   std::unique_ptr<Thread> thread;
   mutex mu;
   condition_variable cv;
-  // Has collective,rank pairs.
+  // Has collective,participant_idx pairs.
   std::deque<std::pair<Collective*, int>> pending_launches_ GUARDED_BY(mu);
   bool shutdown_requested GUARDED_BY(mu) = false;
 };
@@ -82,14 +82,17 @@ struct NcclManager::CommunicatorMember {
 
 struct NcclManager::Communicator {
  public:
-  explicit Communicator(std::vector<CommunicatorMember> members)
-      : num_devices(members.size()), members(std::move(members)) {}
+  explicit Communicator(std::vector<CommunicatorMember> members,
+                        const string& key)
+      : num_devices(members.size()), members(std::move(members)), key(key) {}
 
   const int num_devices;
-  const std::vector<CommunicatorMember> members;  // indexed by rank.
+  const std::vector<CommunicatorMember> members;
+  const string key;
 };
 
 namespace {
+
 ncclDataType_t ToNcclType(DataType t) {
   switch (t) {
     case DT_HALF:
@@ -106,64 +109,46 @@ ncclDataType_t ToNcclType(DataType t) {
       return ncclFloat;
   }
 }
-}  // namespace
 
-// A participant in a Collective.  See <Collective> below.
-struct NcclManager::Participant {
-  Participant(const Tensor* in_t, Tensor* out_t, EventMgr* event_mgr,
-              se::Stream* tensor_stream, se::StreamExecutor* executor,
-              int gpu_device_id, NcclManager::DoneCallback done_callback)
-      : in_t(in_t),
-        out_t(out_t),
-        event_mgr(event_mgr),
-        tensor_stream(tensor_stream),
-        executor(executor),
-        gpu_device_id(gpu_device_id),
-        done_callback(std::move(done_callback)) {
-    DCHECK(executor != nullptr);
-    DCHECK(event_mgr != nullptr);
-    DCHECK(tensor_stream != nullptr);
+void StringToNcclUniqueId(const string& str_id, ncclUniqueId* nccl_id) {
+  if (str_id.size() == NCCL_UNIQUE_ID_BYTES) {
+    memcpy(nccl_id->internal, str_id.data(), NCCL_UNIQUE_ID_BYTES);
   }
-  // Owned by the caller, who must keep it live until <done_callback> is called.
-  // Is NULL for participants that only receive data.
-  const Tensor* in_t;
-
-  // Owned by the caller, who must keep it live until <done_callback> is called.
-  // Is NULL for participants that only send data.
-  Tensor* out_t;
-
-  // Owned by the caller, who must keep it live until <done_callback> is called.
-  EventMgr* const event_mgr;
-
-  // Owned by the caller, who must keep it live until <done_callback> is called.
-  se::Stream* const tensor_stream;
-
-  // Matches the executor in CommunicatorMember::stream. Expected to be live for
-  // process lifetime.
-  se::StreamExecutor* const executor = nullptr;
-
-  const int gpu_device_id;
-
-  NcclManager::DoneCallback done_callback;
+}
 
-  bool root = false;
-};
+}  // namespace
 
-// A Collective tracks a single communicator operation (e.g., a single
-// AllReduce call).
+// A `Collective` encapsulates state for a collective instance at one node.
+// Typically, an instance in TensorFlow context would be defined by a collective
+// group and the (step, frame iteration) for that execution.
+//
+// For each collective instance there will be one `Collective` object per node.
+// For example,  a NCCL collective that runs on a single node with 4 GPUs would
+// have a single `Collective` per step.  However, a collective that executes on
+// 3 nodes with 4 GPUs each would have a `Collective` per node, each of which is
+// tracking the 4 GPUs local to that node.
 struct NcclManager::Collective {
   Collective(DataType data_type_in, CollectiveType type_in,
-             ncclRedOp_t reduction_op_in, int num_devices)
+             ncclRedOp_t reduction_op_in, int num_local_devices_in,
+             int num_global_devices_in, const string& communicator_key_in)
       : data_type(data_type_in),
         type(type_in),
         reduction_op(reduction_op_in),
-        remaining_participants(num_devices) {
-    participants.reserve(num_devices);
+        num_local_devices(num_local_devices_in),
+        num_global_devices(num_global_devices_in),
+        single_node(num_local_devices_in == num_global_devices_in),
+        communicator_key(communicator_key_in),
+        remaining_participants(num_local_devices_in) {
+    participants.reserve(num_local_devices_in);
   }
 
   const DataType data_type;
   const CollectiveType type;
   const ncclRedOp_t reduction_op;  // applies when <type> is a reduction.
+  const int num_local_devices;     // devices local to this node
+  const int num_global_devices;    // devices across all nodes
+  const bool single_node;          // true if all devices are at one node
+  const string communicator_key;
 
   Communicator* communicator = nullptr;
 
@@ -178,12 +163,20 @@ struct NcclManager::Collective {
   int root_rank = -1;
 
   // How many participants have been registered so far. The Collective is
-  // eligible for running with <available_participants> == participants.size().
+  // eligible for running with <available_participants> == num_local_devices.
+  //
+  // If this is a multi-node collective, we additionally have to synchronize
+  // across nodes.  The caller would need to signal multi node readiness by
+  // calling NcclManager::SignalMultiNodeReady, which sets `multi_node_ready` to
+  // true.
   //
   // Guarded by the mutex of the containing Communicator.
   int available_participants = 0;
+  bool multi_node_ready = false;
 
   mutable std::atomic_int_fast32_t remaining_participants;
+
+  Status status;
 };
 
 NcclManager::NcclManager() {}
@@ -193,6 +186,12 @@ NcclManager* NcclManager::instance() {
   return instance;
 }
 
+string NcclManager::GenerateCommunicatorKey() {
+  ncclUniqueId nccl_id;
+  ncclGetUniqueId(&nccl_id);
+  return string(nccl_id.internal, NCCL_UNIQUE_ID_BYTES);
+}
+
 Status NcclManager::GetCommunicator(NcclManager::Collective* collective,
                                     NcclManager::Communicator** communicator) {
   // Sort by executor to make ordering of executors deterministic.
@@ -201,39 +200,60 @@ Status NcclManager::GetCommunicator(NcclManager::Collective* collective,
                const std::unique_ptr<Participant>& b) {
               return a->executor < b->executor;
             });
-  const int num_devices = collective->participants.size();
 
   mutex_lock l(mu_);
 
-  // Scan to find an existing communicator that provides nccl communication
-  // between the executors used by the participants in the collective. For
-  // example, if a collective is for GPUs 0, 1, and 2 then this will scan
-  // to find the communicator for GPUs 0, 1, and 2.
-  //
-  // Note that each executor identifies a context on one device, so this is the
-  // same as getting the communicator connecting the devices in the collective.
-  // A device can be in different communicators as well - for example, a
-  // communicator for GPUs 0 and 1 is separate from one for GPUs 0, 1, and 2.
-  //
-  // Since it's expected that a small number of distinct communicators will
-  // be needed, communicators_ is not garbage collected currently.
-  //
-  // Launching of kernels must be serialized so that, given collectives A and B,
-  // and an order of them (e.g., A before B), then for each comm_stream
-  // involved, the kernel for A is launched before the kernel for B. This is
-  // guaranteed currently be a global mutex controlling additions of the kernels
-  // to per-stream launch queues.  The launch queues are processed by
-  // LoopKernelLaunches.
-  for (auto& comm : communicators_) {
-    if (comm->num_devices == num_devices) {
-      int i;
-      for (i = 0; i < num_devices; ++i) {
-        if (comm->members[i].nccl_stream->executor !=
-            collective->participants[i]->executor) {
-          break;
+  if (collective->single_node) {
+    // For single-node collectives, we identify a communicator uniquely by the
+    // set of devices participating in the collective.  For example, if a
+    // collective is for GPUs 0, 1, and 2 then this will scan to find the
+    // communicator for GPUs 0, 1, and 2.
+    //
+    // Note that each executor identifies a context on one device, so this is
+    // the same as getting the communicator connecting the devices in the
+    // collective. A device can be in different communicators as well - for
+    // example, a communicator for GPUs 0 and 1 is separate from one for GPUs 0,
+    // 1, and 2.
+    //
+    // Since it's expected that a small number of distinct communicators will
+    // be needed, communicators_ is not garbage collected currently.
+    //
+    // Launching of kernels must be serialized so that, given collectives A and
+    // B, and an order of them (e.g., A before B), then for each comm_stream
+    // involved, the kernel for A is launched before the kernel for B. This is
+    // guaranteed currently be a global mutex controlling additions of the
+    // kernels to per-stream launch queues.  The launch queues are processed by
+    // LoopKernelLaunches.
+    for (auto& comm : communicators_) {
+      if (comm->num_devices == collective->num_global_devices) {
+        int i;
+        for (i = 0; i < collective->num_local_devices; ++i) {
+          if (comm->members[i].nccl_stream->executor !=
+              collective->participants[i]->executor) {
+            break;
+          }
+        }
+        if (i == collective->num_local_devices) {
+          *communicator = comm.get();
+          return Status::OK();
         }
       }
-      if (i == num_devices) {
+    }
+  } else {
+#if NCCL_MAJOR < 2
+    return errors::Internal(
+        "Cannot use multi-node NCCL collectives with NCCL 1.x");
+#endif
+    if (collective->communicator_key.size() != NCCL_UNIQUE_ID_BYTES) {
+      return errors::Internal("Expected communicator_key of size ",
+                              NCCL_UNIQUE_ID_BYTES, " but found size ",
+                              collective->communicator_key.size());
+    }
+    // This is an instance of multi-node collective.  We have previously
+    // created a NCCL unique id and shared with all workers.  Now we find the
+    // `Communicator` corresponding to this id.
+    for (auto& comm : communicators_) {
+      if (comm->key == collective->communicator_key) {
         *communicator = comm.get();
         return Status::OK();
       }
@@ -246,9 +266,9 @@ Status NcclManager::GetCommunicator(NcclManager::Collective* collective,
   // Create and initialize a new communicator.
   // Note that this is done under the lock; performance is not expected to
   // matter as this happens a very small number of times.
-  std::vector<CommunicatorMember> members(num_devices);
-  std::vector<int> devices(num_devices);
-  for (int i = 0; i < num_devices; ++i) {
+  std::vector<CommunicatorMember> members(collective->num_local_devices);
+  std::vector<int> devices(collective->num_local_devices);
+  for (int i = 0; i < collective->num_local_devices; ++i) {
     auto* executor = collective->participants[i]->executor;
 
     // Find a communication stream to use for the device.
@@ -278,164 +298,215 @@ Status NcclManager::GetCommunicator(NcclManager::Collective* collective,
     devices[i] = collective->participants[i]->gpu_device_id;
   }
 
-  int device_count = num_devices;
+  std::vector<ncclComm_t> nccl_comms(collective->num_local_devices);
 #if NCCL_MAJOR >= 2
-  // NCCL2 prevents InitAll for more communicators than devices (but doesn't
-  // check that device ids are unique). Work around it by initializing each
-  // rank individually.
-  CUDA_RETURN_IF_ERROR(cudaGetDeviceCount(&device_count));
-#endif
-  std::vector<ncclComm_t> nccl_comms(num_devices);
-  if (num_devices <= device_count) {
-    NCCL_RETURN_IF_ERROR(
-        ncclCommInitAll(nccl_comms.data(), num_devices, devices.data()));
+  // For NCCL 2, we always initialize using ncclCommInitRank guarded by NCCL
+  // group primitives.
+  ncclUniqueId nccl_id;
+  if (collective->single_node) {
+    NCCL_RETURN_IF_ERROR(ncclGetUniqueId(&nccl_id));
   } else {
-    int savedDevice = 0;
-    CUDA_RETURN_IF_ERROR(cudaGetDevice(&savedDevice));
-    ncclUniqueId commId;
-    NCCL_RETURN_IF_ERROR(ncclGetUniqueId(&commId));
-#if NCCL_MAJOR >= 2
-    NCCL_RETURN_IF_ERROR(ncclGroupStart());
-#endif
-    for (int rank = 0; rank < num_devices; ++rank) {
-      CUDA_RETURN_IF_ERROR(cudaSetDevice(devices[rank]));
-      NCCL_RETURN_IF_ERROR(ncclCommInitRank(nccl_comms.data() + rank,
-                                            num_devices, commId, rank));
-    }
-#if NCCL_MAJOR >= 2
-    NCCL_RETURN_IF_ERROR(ncclGroupEnd());
-#endif
-    CUDA_RETURN_IF_ERROR(cudaSetDevice(savedDevice));
+    StringToNcclUniqueId(collective->communicator_key, &nccl_id);
+  }
+  int saved_device = 0;
+  CUDA_RETURN_IF_ERROR(cudaGetDevice(&saved_device));
+  NCCL_RETURN_IF_ERROR(ncclGroupStart());
+  for (int i = 0; i < collective->num_local_devices; ++i) {
+    // Set rank to `participant->global_rank` if provided, else `i`.
+    const int rank = collective->participants[i]->global_rank >= 0
+                         ? collective->participants[i]->global_rank
+                         : i;
+    CUDA_RETURN_IF_ERROR(cudaSetDevice(devices[i]));
+    NCCL_RETURN_IF_ERROR(ncclCommInitRank(
+        nccl_comms.data() + i, collective->num_global_devices, nccl_id, rank));
   }
-  for (int rank = 0; rank < num_devices; ++rank) {
-    members[rank].nccl_comm = nccl_comms[rank];
+  NCCL_RETURN_IF_ERROR(ncclGroupEnd());
+  CUDA_RETURN_IF_ERROR(cudaSetDevice(saved_device));
+#else
+  // Since NCCL 1 is single node only, we use ncclCommInitAll.  We could have
+  // used ncclCommInitRank with NCCL 1 as well, but then we would have to
+  // issue each init call from a different thread
+  // (https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/nccl1.html).
+  NCCL_RETURN_IF_ERROR(ncclCommInitAll(
+      nccl_comms.data(), collective->num_local_devices, devices.data()));
+#endif
+
+  for (int i = 0; i < collective->num_local_devices; ++i) {
+    members[i].nccl_comm = nccl_comms[i];
   }
-  communicators_.emplace_back(new Communicator(std::move(members)));
+  communicators_.emplace_back(
+      new Communicator(std::move(members), collective->communicator_key));
   *communicator = communicators_.back().get();
   return Status::OK();
 }
 
-void NcclManager::AddToAllReduce(int num_devices, const string& key,
-                                 ncclRedOp_t reduction_op,
-                                 se::StreamExecutor* executor,
-                                 int gpu_device_id, EventMgr* event_mgr,
-                                 se::Stream* tensor_stream, const Tensor* in_t,
-                                 Tensor* out_t,
-                                 const DoneCallback& done_callback) {
-  std::unique_ptr<Participant> participant(
-      new Participant(in_t, out_t, event_mgr, tensor_stream, executor,
-                      gpu_device_id, done_callback));
-  AddParticipant(num_devices, key, std::move(participant), in_t->dtype(),
-                 kAllReduce, reduction_op);
+void NcclManager::AddToAllReduce(std::unique_ptr<Participant> participant,
+                                 const Context& context,
+                                 ncclRedOp_t reduction_op) {
+  AddParticipant(std::move(participant), context, kAllReduce, reduction_op);
+}
+
+void NcclManager::AddToAllGather(std::unique_ptr<Participant> participant,
+                                 const Context& context) {
+  AddParticipant(std::move(participant), context, kAllGather,
+                 ncclSum /* unused */);
 }
 
-void NcclManager::AddBroadcastSend(int num_devices, const string& key,
-                                   se::StreamExecutor* executor,
-                                   int gpu_device_id, EventMgr* event_mgr,
-                                   se::Stream* tensor_stream,
-                                   const Tensor* in_t,
-                                   DoneCallback done_callback) {
-  std::unique_ptr<Participant> participant(
-      new Participant(in_t, nullptr /* out_t */, event_mgr, tensor_stream,
-                      executor, gpu_device_id, std::move(done_callback)));
+void NcclManager::AddBroadcastSend(std::unique_ptr<Participant> participant,
+                                   const Context& context) {
   participant->root = true;
-  AddParticipant(num_devices, key, std::move(participant), in_t->dtype(),
-                 kBroadcast, ncclSum /* unused */);
+  AddParticipant(std::move(participant), context, kBroadcast,
+                 ncclSum /* unused */);
 }
 
-void NcclManager::AddBroadcastRecv(int num_devices, const string& key,
-                                   se::StreamExecutor* executor,
-                                   int gpu_device_id, EventMgr* event_mgr,
-                                   se::Stream* tensor_stream, Tensor* out_t,
-                                   DoneCallback done_callback) {
-  std::unique_ptr<Participant> participant(
-      new Participant(nullptr /* in_t */, out_t, event_mgr, tensor_stream,
-                      executor, gpu_device_id, std::move(done_callback)));
-  AddParticipant(num_devices, key, std::move(participant), out_t->dtype(),
-                 kBroadcast, ncclSum /* unused */);
+void NcclManager::AddBroadcastRecv(std::unique_ptr<Participant> participant,
+                                   const Context& context) {
+  AddParticipant(std::move(participant), context, kBroadcast,
+                 ncclSum /* unused */);
 }
 
-void NcclManager::AddReduceSend(int num_devices, const string& key,
-                                ncclRedOp_t reduction_op,
-                                se::StreamExecutor* executor, int gpu_device_id,
-                                EventMgr* event_mgr, se::Stream* tensor_stream,
-                                const Tensor* in_t,
-                                DoneCallback done_callback) {
-  std::unique_ptr<Participant> participant(
-      new Participant(in_t, nullptr /* out_t */, event_mgr, tensor_stream,
-                      executor, gpu_device_id, std::move(done_callback)));
-  AddParticipant(num_devices, key, std::move(participant), in_t->dtype(),
-                 kReduce, reduction_op);
+void NcclManager::AddReduceSend(std::unique_ptr<Participant> participant,
+                                const Context& context,
+                                ncclRedOp_t reduction_op) {
+  AddParticipant(std::move(participant), context, kReduce, reduction_op);
 }
 
-void NcclManager::AddReduceRecv(int num_devices, const string& key,
-                                ncclRedOp_t reduction_op,
-                                se::StreamExecutor* executor, int gpu_device_id,
-                                EventMgr* event_mgr, se::Stream* tensor_stream,
-                                const Tensor* in_t, Tensor* out_t,
-                                DoneCallback done_callback) {
-  std::unique_ptr<Participant> participant(
-      new Participant(in_t, out_t, event_mgr, tensor_stream, executor,
-                      gpu_device_id, std::move(done_callback)));
-  participant->root = true;
-  AddParticipant(num_devices, key, std::move(participant), in_t->dtype(),
-                 kReduce, reduction_op);
+void NcclManager::AddReduceRecv(std::unique_ptr<Participant> participant,
+                                const Context& context,
+                                ncclRedOp_t reduction_op) {
+  AddParticipant(std::move(participant), context, kReduce, reduction_op);
 }
 
-void NcclManager::AddParticipant(int num_devices, const string& key,
-                                 std::unique_ptr<Participant> participant,
-                                 DataType data_type,
+void NcclManager::SignalMultiNodeReady(const string& collective_key) {
+  Collective* to_run = nullptr;
+  {
+    mutex_lock l(mu_);
+    auto collective_it = collectives_.find(collective_key);
+    if (collective_it != collectives_.end()) {
+      Collective* collective = collective_it->second.get();
+      collective->multi_node_ready = true;
+      to_run = CheckReady(collective_key, collective);
+    }
+  }
+
+  if (to_run != nullptr) RunCollective(to_run);
+}
+
+void NcclManager::AddParticipant(std::unique_ptr<Participant> participant,
+                                 const Context& context,
                                  CollectiveType collective_type,
                                  ncclRedOp_t reduction_op) {
   Collective* to_run = nullptr;
+  const DataType data_type = participant->input->dtype();
   {
     mutex_lock l(mu_);
-    auto& collective_ptr = collectives_[key];
-    if (collective_ptr == nullptr) {
-      collective_ptr.reset(new Collective(data_type, collective_type,
-                                          reduction_op, num_devices));
+    auto collective_it = collectives_.find(context.collective_key);
+    Collective* collective = nullptr;
+    if (collective_it == collectives_.end()) {
+      auto collective_unique_ptr = absl::make_unique<Collective>(
+          data_type, collective_type, reduction_op, context.num_local_devices,
+          context.num_global_devices, context.communicator_key);
+      collective = collective_unique_ptr.get();
+      collectives_.emplace(context.collective_key,
+                           std::move(collective_unique_ptr));
+    } else {
+      collective = collective_it->second.get();
+    }
+
+    // Check `collective` is correct and consistent.
+    if (collective->status.ok() && collective->single_node &&
+        !collective->communicator_key.empty()) {
+      collective->status =
+          errors::Internal("Collective ", reduction_op,
+                           " is single node but has communicator_key of size ",
+                           collective->communicator_key.size());
+    }
+    if (collective->status.ok() && collective->communicator_key.size() !=
+                                       context.communicator_key.size()) {
+      collective->status =
+          errors::Internal("Collective ", reduction_op,
+                           " mismatch in member communicator_key with size ",
+                           collective->communicator_key.size(),
+                           " and arg communicator_key with size ",
+                           context.communicator_key.size());
+    }
+    if (collective->status.ok() && collective->type != collective_type) {
+      collective->status = errors::Internal(
+          "Collective ", reduction_op, " previously initialized with type ",
+          collective->type, " but now got type ", collective_type);
+    }
+    if (collective->status.ok() &&
+        collective->num_global_devices != context.num_global_devices) {
+      collective->status =
+          errors::Internal("Collective ", reduction_op,
+                           " previously initialized with num_global_devices ",
+                           collective->num_global_devices, " but now got ",
+                           context.num_global_devices);
+    }
+    if (collective->status.ok() &&
+        collective->num_local_devices != context.num_local_devices) {
+      collective->status =
+          errors::Internal("Collective ", reduction_op,
+                           "previously initialized with num_local_devices ",
+                           collective->num_local_devices, " but now got ",
+                           context.num_local_devices);
+    }
+    if (collective->status.ok() &&
+        collective->participants.size() >= collective->num_local_devices) {
+      collective->status = errors::Internal(
+          "Collective ", reduction_op, " expected ",
+          collective->num_local_devices, " participants but now has ",
+          collective->participants.size(),
+          " with one more participant being added");
     }
-    Collective* collective = collective_ptr.get();
-    DCHECK_EQ(collective->type, collective_type);
-    DCHECK_LT(collective->participants.size(), num_devices);
+
     collective->participants.emplace_back(std::move(participant));
     ++collective->available_participants;
 
-    if (collective->available_participants == num_devices) {
-      to_run = collective;
-
-      // Ownership is going to be transferred to RunCollective.
-      collective_ptr.release();
-      collectives_.erase(key);
-    }
+    to_run = CheckReady(context.collective_key, collective);
   }
 
-  if (to_run != nullptr) {
-    RunCollective(key, to_run);
+  if (to_run != nullptr) RunCollective(to_run);
+}
+
+NcclManager::Collective* NcclManager::CheckReady(const string& collective_key,
+                                                 Collective* collective) {
+  Collective* to_run = nullptr;
+  if (collective->available_participants == collective->num_local_devices) {
+    if (collective->num_global_devices == collective->num_local_devices ||
+        collective->multi_node_ready) {
+      // Ownership transferred to callee.
+      to_run = collective;
+      auto collectives_it = collectives_.find(collective_key);
+      collectives_it->second.release();
+      collectives_.erase(collectives_it);
+    }
   }
+  return to_run;
 }
 
-void NcclManager::RunCollective(const string& key, Collective* collective) {
+void NcclManager::RunCollective(Collective* collective) {
   static mutex collective_mu(LINKER_INITIALIZED);
 
-  Communicator* communicator = nullptr;
-  const int size = static_cast<int>(collective->participants.size());
-  Status s = GetCommunicator(collective, &communicator);
+  Status s = collective->status;
+  if (s.ok()) {
+    s = GetCommunicator(collective, &collective->communicator);
+  }
   if (!s.ok()) {
-    for (int i = 0; i < size; ++i) {
+    for (int i = 0; i < collective->num_local_devices; ++i) {
       collective->participants[i]->done_callback(s);
     }
     delete collective;
     return;
   }
 
-  collective->communicator = communicator;
-  for (int rank = 0; rank < size; ++rank) {
-    Participant* p = collective->participants[rank].get();
-    NcclStream* nccl_stream = communicator->members[rank].nccl_stream;
+  for (int i = 0; i < collective->num_local_devices; ++i) {
+    Participant* p = collective->participants[i].get();
+    NcclStream* nccl_stream = collective->communicator->members[i].nccl_stream;
     CHECK(nccl_stream != nullptr);
+    const int rank = p->global_rank >= 0 ? p->global_rank : i;
 
-    if (p->in_t != nullptr) {
+    if (p->input != nullptr) {
       // Wait to ensure that the kernel that produces the data in the input
       // tensor has finished running before the nccl kernel runs on the
       // communication stream.
@@ -457,11 +528,11 @@ void NcclManager::RunCollective(const string& key, Collective* collective) {
     // Note that it would be possible to run multiple collectives at once, if
     // they have non-intersecting sets of devices.
     mutex_lock l(collective_mu);
-    for (int rank = 0; rank < size; ++rank) {
-      NcclStream* nccl_stream = communicator->members[rank].nccl_stream;
+    for (int i = 0; i < collective->num_local_devices; ++i) {
+      NcclStream* nccl_stream =
+          collective->communicator->members[i].nccl_stream;
       mutex_lock l(nccl_stream->mu);
-      nccl_stream->pending_launches_.push_front(
-          std::make_pair(collective, rank));
+      nccl_stream->pending_launches_.push_front(std::make_pair(collective, i));
       nccl_stream->cv.notify_all();
     }
   }
@@ -488,52 +559,69 @@ void NcclManager::LoopKernelLaunches(NcclStream* nccl_stream) {
       next_launch = nccl_stream->pending_launches_.back();
       nccl_stream->pending_launches_.pop_back();
     }
-    Collective* collective = next_launch.first;
-    int rank = next_launch.second;
 
     // Launch the nccl kernel.
+    Collective* collective = next_launch.first;
     ncclDataType_t data_type = ToNcclType(collective->data_type);
-    Participant* p = collective->participants[rank].get();
-
-    auto nccl_comm = collective->communicator->members[rank].nccl_comm;
+    int p_idx = next_launch.second;
+    Participant* p = collective->participants[p_idx].get();
+    auto nccl_comm = collective->communicator->members[p_idx].nccl_comm;
     ncclResult_t nccl_result = ncclSuccess;
     switch (collective->type) {
       case kAllReduce: {
-        const void* sendbuff = p->in_t->tensor_data().data();
-        void* recvbuff = const_cast<char*>(p->out_t->tensor_data().data());
-
-        nccl_result =
-            ncclAllReduce(sendbuff, recvbuff, p->in_t->NumElements(), data_type,
-                          collective->reduction_op, nccl_comm, *cu_stream);
+        const void* sendbuff = p->input->tensor_data().data();
+        void* recvbuff = const_cast<char*>(p->output->tensor_data().data());
+
+        VLOG(2) << "call NcclAllReduce participant " << p_idx << " sendbuff "
+                << sendbuff << " recvbuff " << recvbuff << " nccl_comm "
+                << nccl_comm << " comm_stream " << comm_stream
+                << " cuda_stream " << cu_stream;
+        nccl_result = ncclAllReduce(sendbuff, recvbuff, p->input->NumElements(),
+                                    data_type, collective->reduction_op,
+                                    nccl_comm, *cu_stream);
         break;
       }
       case kBroadcast: {
-        const Tensor* buf_t = p->in_t ? p->in_t : p->out_t;
+        const Tensor* buf_t = p->input ? p->input : p->output;
         void* buf = const_cast<char*>(buf_t->tensor_data().data());
         nccl_result = ncclBcast(buf, buf_t->NumElements(), data_type,
                                 collective->root_rank, nccl_comm, *cu_stream);
         break;
       }
       case kReduce: {
-        const void* sendbuff = p->in_t->tensor_data().data();
-        void* recvbuff = p->out_t
-                             ? const_cast<char*>(p->out_t->tensor_data().data())
-                             : nullptr;
-        nccl_result = ncclReduce(sendbuff, recvbuff, p->in_t->NumElements(),
+        const void* sendbuff = p->input->tensor_data().data();
+        void* recvbuff =
+            p->output ? const_cast<char*>(p->output->tensor_data().data())
+                      : nullptr;
+        nccl_result = ncclReduce(sendbuff, recvbuff, p->input->NumElements(),
                                  data_type, collective->reduction_op,
                                  collective->root_rank, nccl_comm, *cu_stream);
         break;
       }
+      case kAllGather: {
+        const void* sendbuff = p->input->tensor_data().data();
+        void* recvbuff = const_cast<char*>(p->output->tensor_data().data());
+
+        VLOG(2) << "call NcclAllGather participant " << p_idx << " sendbuff "
+                << sendbuff << " sendcount " << p->input->NumElements()
+                << " recvbuff " << recvbuff << " recvcount "
+                << p->output->NumElements() << " nccl_comm " << nccl_comm
+                << " comm_stream " << comm_stream << " cuda_stream "
+                << cu_stream;
+        nccl_result = ncclAllGather(sendbuff, recvbuff, p->input->NumElements(),
+                                    data_type, nccl_comm, *cu_stream);
+        break;
+      }
     }
 
     // Run the done_callback when the nccl kernel finishes running.
-    auto done_callback = [collective, rank, nccl_result]() {
+    auto done_callback = [collective, p_idx, nccl_result]() {
       if (nccl_result == ncclSuccess) {
-        collective->participants[rank]->done_callback(Status::OK());
+        collective->participants[p_idx]->done_callback(Status::OK());
       } else {
         // Propagate the error, but note that if other members of the collective
         // did launch their kernels, then they are hanging.
-        collective->participants[rank]->done_callback(errors::Unknown(
+        collective->participants[p_idx]->done_callback(errors::Unknown(
             "Error invoking NCCL: ", ncclGetErrorString(nccl_result)));
       }
 
diff --git a/tensorflow/core/nccl/nccl_manager.h b/tensorflow/core/nccl/nccl_manager.h
index 5da4fe5554d134f79c279542666c841a4e205485..7cf2c85f3e81ce951c408e797a17ab1634f17811 100644
--- a/tensorflow/core/nccl/nccl_manager.h
+++ b/tensorflow/core/nccl/nccl_manager.h
@@ -35,7 +35,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-// The communicator is used to make the asynchronous communicator calls and to
+// NCCL manager is used to make the asynchronous communicator calls and to
 // manage the per-device streams used for communication.
 //
 // See nccl_ops.cc for example usage, including description of memory
@@ -48,60 +48,138 @@ class NcclManager {
 
   static NcclManager* instance();
 
-  // Add one participant to an all-reduce, sending in data from <in_t> and
-  // receiving the result of the all-reduce in <out_t>.  The device for this
-  // participant is managed by <executor>, and its events are polled by
-  // <event_mgr>.
-  //
-  // This is an asynchronous call. When <done_callback> is called, <out_t> has
-  // been set to the all-reduce result (note: the stream may not yet have been
-  // synced).
-  //
-  // <tensor_stream> is the stream that should be waited on to ensure <in_t>'s
-  // data is available on the GPU for the communication stream to access. It
-  // is also the stream that will use the produced data; <done_callback> is
-  // not called until the next kernel launched on <stream> would see the data.
-  void AddToAllReduce(int num_devices, const string& key,
-                      ncclRedOp_t reduction_op, se::StreamExecutor* executor,
-                      int gpu_device_id, EventMgr* event_mgr,
-                      se::Stream* tensor_stream, const Tensor* in_t,
-                      Tensor* out_t, const DoneCallback& done_callback);
-
-  // AddBroadcastSend and AddBroadcastRecv combine to sent data from one sender
+  // Calls `ncclGetUniqueId` and returns the id as a string.  The returned value
+  // may be shared with other participants on different nodes and passed in to
+  // multi-node collective invocations.
+  string GenerateCommunicatorKey();
+
+  // A participant in a Collective.
+  struct Participant {
+    Participant(se::StreamExecutor* executor, se::Stream* tensor_stream,
+                EventMgr* event_mgr, int gpu_device_id, const Tensor* input,
+                Tensor* output, int global_rank, DoneCallback done_callback)
+        : executor(executor),
+          tensor_stream(tensor_stream),
+          event_mgr(event_mgr),
+          gpu_device_id(gpu_device_id),
+          input(input),
+          output(output),
+          global_rank(global_rank),
+          done_callback(std::move(done_callback)),
+          root(false) {
+      DCHECK(executor != nullptr);
+      DCHECK(event_mgr != nullptr);
+      DCHECK(tensor_stream != nullptr);
+    }
+
+    // StreamExecutor for the device. Expected to be live for process lifetime.
+    se::StreamExecutor* const executor = nullptr;
+
+    // `tensor_stream` is the stream that should be waited on to ensure
+    // `input`'s data is available on the GPU for the communication stream to
+    // access. It is also the stream that will use the produced data;
+    // `done_callback` is not called until the next kernel launched on `stream`
+    // would see the data. Owned by the caller, who must keep it live until
+    // `done_callback` is called.
+    se::Stream* const tensor_stream;
+
+    // EventMgr which polls on executor.
+    // Owned by the caller, who must keep it live until `done_callback` is
+    // called.
+    EventMgr* const event_mgr;
+
+    const int gpu_device_id;
+
+    // Owned by the caller, who must keep it live until `done_callback` is
+    // called. Is NULL for participants that only receive data.
+    const Tensor* input;
+
+    // Owned by the caller, who must keep it live until `done_callback` is
+    // called. Is NULL for participants that only send data.
+    Tensor* output;
+
+    // Rank across all devices and all nodes.
+    // `global_rank` is not required for single-node collectives.
+    const int global_rank;
+
+    // The callback which is called at the completion of the NCCL operation.
+    // When called, `output` has been set to the result of the operation. (note:
+    // the stream may not yet have been synced)
+    DoneCallback done_callback;
+
+    // True if this is the root of the collective, e.g. source of broadcast.
+    bool root;
+  };
+
+  // Data that provides context for the collective operation, including the
+  // operation key, number of participants, and communicator key.
+  struct Context {
+    Context(const string& collective_key, int num_local_devices,
+            int num_global_devices, const string& communicator_key)
+        : collective_key(collective_key),
+          num_local_devices(num_local_devices),
+          num_global_devices(num_global_devices),
+          communicator_key(communicator_key) {}
+
+    // Unique key for this collective instance
+    const string& collective_key;
+
+    // Devices local to this node
+    int num_local_devices;
+
+    // Devices across all nodes
+    int num_global_devices;
+
+    // In order to use NCCL across nodes, the callee first has to generate a
+    // `communicator_key` via `GenerateCommunicatorKey()` function and share
+    // this with all the other nodes.  Each node should pass in this
+    // `communicator_key` to the `NcclManager` functions.
+    // `communicator_key` is not required for single-node collectives and can be
+    // empty.
+    const string& communicator_key;
+  };
+
+  // Adds one participant to an all-reduce.
+  void AddToAllReduce(std::unique_ptr<Participant> participant,
+                      const Context& context, ncclRedOp_t reduction_op);
+
+  // Adds one participant to an all-gather.
+  void AddToAllGather(std::unique_ptr<Participant> participant,
+                      const Context& context);
+
+  // AddBroadcastSend and AddBroadcastRecv combine to send data from one sender
   // to all receivers.
-  void AddBroadcastSend(int num_devices, const string& key,
-                        se::StreamExecutor* executor, int gpu_device_id,
-                        EventMgr* event_mgr, se::Stream* tensor_stream,
-                        const Tensor* in_t, DoneCallback done_callback);
-  void AddBroadcastRecv(int num_devices, const string& key,
-                        se::StreamExecutor* executor, int gpu_device_id,
-                        EventMgr* event_mgr, se::Stream* tensor_stream,
-                        Tensor* out_t, DoneCallback done_callback);
-
-  // AddReduceSend and AddReduceRecv combine to sent data from all senders
+  void AddBroadcastSend(std::unique_ptr<Participant> participant,
+                        const Context& context);
+  void AddBroadcastRecv(std::unique_ptr<Participant> participant,
+                        const Context& context);
+
+  // AddReduceSend and AddReduceRecv combine to send data from all senders
   // to one receiver.
-  void AddReduceSend(int num_devices, const string& key,
-                     ncclRedOp_t reduction_op, se::StreamExecutor* executor,
-                     int gpu_device_id, EventMgr* event_mgr,
-                     se::Stream* tensor_stream, const Tensor* in_t,
-                     DoneCallback done_callback);
-  void AddReduceRecv(int num_devices, const string& key,
-                     ncclRedOp_t reduction_op, se::StreamExecutor* executor,
-                     int gpu_device_id, EventMgr* event_mgr,
-                     se::Stream* tensor_stream, const Tensor* in_t,
-                     Tensor* out_t, DoneCallback done_callback);
+  void AddReduceSend(std::unique_ptr<Participant> participant,
+                     const Context& context, ncclRedOp_t reduction_op);
+  void AddReduceRecv(std::unique_ptr<Participant> participant,
+                     const Context& context, ncclRedOp_t reduction_op);
+
+  // Signals that the `Collective` corresponding to `key` is ready to launch
+  // across all nodes participating in this multi-node collective operation.
+  //
+  // This should only be called for multi-node collectives; single-node
+  // collectives are implicitly ready when all participants have called Add*
+  // function.
+  void SignalMultiNodeReady(const string& collective_key);
 
  private:
   enum CollectiveType {
     kAllReduce = 1,
     kBroadcast = 2,
     kReduce = 3,
+    kAllGather = 4,
   };
   struct Collective;
   struct Communicator;
   struct CommunicatorMember;
   struct NcclStream;
-  struct Participant;
 
   // Gets the `Communicator` object that will be used to enqueue NCCL kernels
   // for `collective`, and returns it via `communicator`.
@@ -111,13 +189,26 @@ class NcclManager {
   // the corresponding NCCL/CUDA error string.
   Status GetCommunicator(Collective* collective, Communicator** communicator);
 
-  void AddParticipant(int num_devices, const string& key,
-                      std::unique_ptr<Participant> participant,
-                      DataType data_type, CollectiveType collective_type,
+  // Adds a participant device to the local `Collective` instance correponding
+  // to `collective_key`.  Launches the `Collective` if it is ready, which it
+  // checks by calling `CheckReady()`.  Also performs consistency and sanity
+  // checks before launching.
+  void AddParticipant(std::unique_ptr<Participant> participant,
+                      const Context& context, CollectiveType collective_type,
                       ncclRedOp_t reduction_op);
 
+  // If `collective` is ready to run, removes it from the `collectives_` map and
+  // returns the pointer.  Otherwise returns `nullptr`.
+  // Assumes `collective_key` corresponds to `collective`.
+  //
+  // A collective is ready to run when all local participants have called Add*
+  // function, and the collective is signalled globally ready via
+  // `SetMultiNodeReady`.
+  Collective* CheckReady(const string& collective_key, Collective* collective)
+      EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
   // Run <collective>.  This calls takes ownership of <collective>.
-  void RunCollective(const string& key, Collective* collective);
+  void RunCollective(Collective* collective);
   void LoopKernelLaunches(NcclStream* stream);
 
   mutex mu_;
diff --git a/tensorflow/core/nccl/nccl_manager_test.cc b/tensorflow/core/nccl/nccl_manager_test.cc
index f9ed4d0b9a26c390bc5974f206faea16c8b5b974..420e143c837f600016e66db6833ea8b58edde49d 100644
--- a/tensorflow/core/nccl/nccl_manager_test.cc
+++ b/tensorflow/core/nccl/nccl_manager_test.cc
@@ -53,7 +53,6 @@ class NcclManagerTest : public ::testing::Test {
  public:
   // A single all-reduce to apply.
   struct TestCase {
-    string key;
     std::vector<Tensor> ins;
     std::vector<Tensor> outs;
     Tensor expected;
@@ -64,18 +63,19 @@ class NcclManagerTest : public ::testing::Test {
   };
 
   static void SetUpTestCase() {
-    setenv("NCCL_DEBUG", "WARN", 1 /* replace */);
+    setenv("NCCL_DEBUG", "INFO", 1 /* replace */);
     setenv("NCCL_LAUNCH_MODE", "PARALLEL", 1 /* replace */);
     devices_ = new std::vector<std::unique_ptr<BaseGPUDevice>>(GetGPUDevices());
-    LOG(ERROR) << "Running test with " << devices_->size() << " gpus";
+    LOG(INFO) << "Running test with " << devices_->size() << " gpus";
   }
 
   static int32 NumGPUs() { return static_cast<int32>(devices_->size()); }
 
   static void TearDownTestCase() { delete devices_; }
 
-  TestCase* MakeTestCase(int num_ranks, ncclRedOp_t reduction_op,
-                         TensorShape shape, float value_offset) {
+  TestCase* MakeReductionTestCase(int num_nodes, int num_ranks_per_node,
+                                  ncclRedOp_t reduction_op, TensorShape shape,
+                                  float value_offset) {
     TestCase* test_case = new TestCase();
     test_case->expected = Tensor(data_type_, shape);
     if (reduction_op == ncclProd) {
@@ -93,55 +93,102 @@ class NcclManagerTest : public ::testing::Test {
     }
 
     float value_scale = 0.01;  // Small scale to avoid fp16 overflow.
-    for (int rank = 0; rank < num_ranks; ++rank) {
-      auto* device = GetDevice(rank);
-      auto* stream = device->tensorflow_gpu_device_info()->stream;
+    for (int node = 0; node < num_nodes; ++node) {
+      for (int local_rank = 0; local_rank < num_ranks_per_node; ++local_rank) {
+        auto* device = GetDevice(local_rank);
+        auto* stream = device->tensorflow_gpu_device_info()->stream;
 
-      Tensor in_cpu(data_type_, shape);
-      test::FillFn<Scalar>(&in_cpu, [&](int index) {
-        return static_cast<Scalar>((index + 1) * value_scale + value_offset);
-      });
-      for (int j = 0; j < shape.num_elements(); ++j) {
-        auto in_val = in_cpu.flat<Scalar>()(j);
-        auto out_expr = test_case->expected.template flat<Scalar>();
-        if (reduction_op == ncclProd) {
-          out_expr(j) = out_expr(j) * in_val;
-        } else if (reduction_op == ncclSum) {
-          out_expr(j) = out_expr(j) + in_val;
-        } else if (reduction_op == ncclMax) {
-          if (in_val > out_expr(j)) {
-            out_expr(j) = in_val;
-          }
-        } else if (reduction_op == ncclMin) {
-          if (in_val < out_expr(j)) {
-            out_expr(j) = in_val;
+        Tensor in_cpu(data_type_, shape);
+        test::FillFn<Scalar>(&in_cpu, [&](int index) {
+          return static_cast<Scalar>((index + 1) * value_scale + value_offset);
+        });
+        for (int j = 0; j < shape.num_elements(); ++j) {
+          auto in_val = in_cpu.flat<Scalar>()(j);
+          auto out_expr = test_case->expected.template flat<Scalar>();
+          if (reduction_op == ncclProd) {
+            out_expr(j) = out_expr(j) * in_val;
+          } else if (reduction_op == ncclSum) {
+            out_expr(j) = out_expr(j) + in_val;
+          } else if (reduction_op == ncclMax) {
+            if (in_val > out_expr(j)) {
+              out_expr(j) = in_val;
+            }
+          } else if (reduction_op == ncclMin) {
+            if (in_val < out_expr(j)) {
+              out_expr(j) = in_val;
+            }
           }
         }
-      }
 
-      value_scale *= 10;
-      test_case->ins.emplace_back(GpuAllocator(device), data_type_, shape);
-      test_case->outs.emplace_back(GpuAllocator(device), data_type_, shape);
+        value_scale *= 10;
+        test_case->ins.emplace_back(GpuAllocator(device), data_type_, shape);
+        test_case->outs.emplace_back(GpuAllocator(device), data_type_, shape);
 
-      const Tensor& in_gpu = test_case->ins.back();
-      auto in_gpu_mem = AsDeviceMemory(in_gpu.flat<Scalar>().data());
-      stream->ThenMemcpy(&in_gpu_mem, in_cpu.flat<Scalar>().data(),
-                         in_cpu.TotalBytes());
+        const Tensor& in_gpu = test_case->ins.back();
+        auto in_gpu_mem = AsDeviceMemory(in_gpu.flat<Scalar>().data());
+        stream->ThenMemcpy(&in_gpu_mem, in_cpu.flat<Scalar>().data(),
+                           in_cpu.TotalBytes());
+      }
     }
+
     return test_case;
   }
 
-  void VerifyResults(const string& case_label, TestCase* test_case) {
-    // Wait for the done callback to be called.
-    {
-      test_case->mu.lock();
-      while (test_case->num_completed != test_case->outs.size()) {
-        test_case->mu.unlock();
-        Env::Default()->SleepForMicroseconds(10);
-        test_case->mu.lock();
+  TestCase* MakeGatherTestCase(int num_nodes, int num_ranks_per_node,
+                               TensorShape in_shape, TensorShape out_shape) {
+    TestCase* test_case = new TestCase();
+    test_case->expected = Tensor(data_type_, out_shape);
+    test::FillFn<Scalar>(&test_case->expected,
+                         [](int) { return static_cast<Scalar>(0); });
+
+    float value_scale = 0.01;  // Small scale to avoid fp16 overflow.
+    for (int node = 0; node < num_nodes; ++node) {
+      for (int i = 0; i < num_ranks_per_node; ++i) {
+        auto* device = GetDevice(i);
+        auto* stream = device->tensorflow_gpu_device_info()->stream;
+
+        Tensor in_cpu(data_type_, in_shape);
+        test::FillFn<Scalar>(&in_cpu, [&](int index) {
+          return static_cast<Scalar>((index + 1) * value_scale);
+        });
+        // Starting index for this rank's tensor in the all-gathered output.
+        int32 gather_idx =
+            (node * num_ranks_per_node + i) * in_shape.num_elements();
+        for (int j = 0; j < in_shape.num_elements(); ++j) {
+          auto in_val = in_cpu.flat<Scalar>()(j);
+          auto out_expr = test_case->expected.template flat<Scalar>();
+          out_expr(gather_idx + j) = in_val;
+        }
+
+        value_scale *= 10;
+        test_case->ins.emplace_back(GpuAllocator(device), data_type_, in_shape);
+        test_case->outs.emplace_back(GpuAllocator(device), data_type_,
+                                     out_shape);
+
+        const Tensor& in_gpu = test_case->ins.back();
+        auto in_gpu_mem = AsDeviceMemory(in_gpu.flat<Scalar>().data());
+        stream->ThenMemcpy(&in_gpu_mem, in_cpu.flat<Scalar>().data(),
+                           in_cpu.TotalBytes());
       }
+    }
+
+    return test_case;
+  }
+
+  // Waits for the done callback to be called for each participant.
+  void WaitForTestCompletion(TestCase* test_case) {
+    test_case->mu.lock();
+    while (test_case->num_completed != test_case->outs.size()) {
       test_case->mu.unlock();
+      Env::Default()->SleepForMicroseconds(10);
+      test_case->mu.lock();
     }
+    test_case->mu.unlock();
+  }
+
+  void VerifyResults(TestCase* test_case) {
+    WaitForTestCompletion(test_case);
+    TF_ASSERT_OK(test_case->final_status);
     // Copy memory to host and verify.
     for (int rank = 0; rank < test_case->outs.size(); ++rank) {
       auto* device = GetDevice(rank);
@@ -152,10 +199,19 @@ class NcclManagerTest : public ::testing::Test {
       stream->ThenMemcpy(out_cpu.flat<Scalar>().data(), out_gpu_mem,
                          out_cpu.TotalBytes());
       SE_ASSERT_OK(stream->BlockHostUntilDone());
+      VLOG(1) << "Verifying rank " << rank << " expected shape "
+              << test_case->expected.shape() << " out shape "
+              << out_cpu.shape();
       test::ExpectClose(test_case->expected, out_cpu);
     }
   }
 
+  void VerifyError(TestCase* test_case) {
+    WaitForTestCompletion(test_case);
+    LOG(INFO) << test_case->final_status;
+    EXPECT_EQ(test_case->final_status.code(), error::INTERNAL);
+  }
+
   NcclManager::DoneCallback CreateDoneCallback(TestCase* test_case) {
     return [this, test_case](Status s) {
       mutex_lock l(test_case->mu);
@@ -197,7 +253,7 @@ const Scalar NcclManagerTest<Scalar>::max_ =
 
 // Instantiate tests for float and double.
 using TypeList = ::testing::Types<float, double>;
-TYPED_TEST_CASE(NcclManagerTest, TypeList);
+TYPED_TEST_SUITE(NcclManagerTest, TypeList);
 
 // Test basic sum reduction.
 TYPED_TEST(NcclManagerTest, BasicSumReduction) {
@@ -206,20 +262,26 @@ TYPED_TEST(NcclManagerTest, BasicSumReduction) {
   for (int op = 0; op < 4; ++op) {
     ncclRedOp_t reduction_op = static_cast<ncclRedOp_t>(op);
     std::unique_ptr<typename TestFixture::TestCase> test_case(
-        this->MakeTestCase(num_ranks, reduction_op, TensorShape({2, 3}), 0.0f));
+        this->MakeReductionTestCase(/*num_nodes=*/1, num_ranks, reduction_op,
+                                    TensorShape({2, 3}), 0.0f));
     for (int rank = 0; rank < num_ranks; ++rank) {
       auto* device = this->GetDevice(rank);
       VLOG(2) << "rank " << rank << " device " << device->name();
       auto* event_mgr = device->tensorflow_gpu_device_info()->event_mgr;
       auto* stream = device->tensorflow_gpu_device_info()->stream;
+      auto participant = absl::make_unique<NcclManager::Participant>(
+          device->executor(), stream, event_mgr, device->gpu_id(),
+          &test_case->ins[rank], &test_case->outs[rank], /*global_rank=*/-1,
+          this->CreateDoneCallback(test_case.get()));
       NcclManager::instance()->AddToAllReduce(
-          num_ranks, "allreduce", reduction_op, device->executor(),
-          device->gpu_id(), event_mgr, stream, &test_case->ins[rank],
-          &test_case->outs[rank], this->CreateDoneCallback(test_case.get()));
+          std::move(participant),
+          {"allreduce", /*num_local_devices=*/num_ranks,
+           /*num_global_devices=*/num_ranks, /*communicator_key=*/""},
+          reduction_op);
     }
 
-    LOG(ERROR) << "Verifying results";
-    this->VerifyResults("test_case", test_case.get());
+    LOG(INFO) << "Verifying results";
+    this->VerifyResults(test_case.get());
   }
 }
 
@@ -230,9 +292,9 @@ TYPED_TEST(NcclManagerTest, BasicSumReduction) {
 // time_limit_micros.
 TYPED_TEST(NcclManagerTest, MultipleCallers) {
   const int num_ranks = 4;
-  const int num_collectives_per_iteration = 10;  // 1000;
+  const int num_collectives_per_iteration = 10;
   const int num_threads = num_ranks * 2;
-  const int time_limit_micros = 100;  // 60 * 30 * 1000 * 1000;
+  const int time_limit_micros = 1 * 1000 * 1000;  // 1 second
 
   int64 start = Env::Default()->NowMicros();
   srand(Env::Default()->NowMicros());
@@ -241,9 +303,9 @@ TYPED_TEST(NcclManagerTest, MultipleCallers) {
     std::vector<std::pair<int, int>> case_and_rank;
     std::vector<std::unique_ptr<typename TestFixture::TestCase>> test_cases;
     for (int i = 0; i < num_collectives_per_iteration; ++i) {
-      test_cases.emplace_back(this->MakeTestCase(
-          num_ranks, ncclSum, TensorShape({100, i % 5 + 1, i % 3 + 1}),
-          1.1f * i));
+      test_cases.emplace_back(this->MakeReductionTestCase(
+          /*num_nodes=*/1, num_ranks, ncclSum,
+          TensorShape({100, i % 5 + 1, i % 3 + 1}), 1.1f * i));
       for (int j = 0; j < num_ranks; ++j) {
         case_and_rank.emplace_back(i, j);
       }
@@ -276,31 +338,219 @@ TYPED_TEST(NcclManagerTest, MultipleCallers) {
         auto* event_mgr = device->tensorflow_gpu_device_info()->event_mgr;
         auto* stream = device->tensorflow_gpu_device_info()->stream;
         typename TestFixture::TestCase* test_case = test_cases[test_num].get();
-        NcclManager::instance()->AddToAllReduce(
-            num_ranks, strings::StrCat("allreduce", test_num), ncclSum,
-            device->executor(), device->gpu_id(), event_mgr, stream,
-            &test_case->ins[rank], &test_case->outs[rank],
+        auto participant = absl::make_unique<NcclManager::Participant>(
+            device->executor(), stream, event_mgr, device->gpu_id(),
+            &test_case->ins[rank], &test_case->outs[rank], /*global_rank=*/-1,
             this->CreateDoneCallback(test_case));
+        NcclManager::instance()->AddToAllReduce(
+            std::move(participant),
+            {strings::StrCat("allreduce", test_num),
+             /*num_local_devices=*/num_ranks,
+             /*num_global_devices=*/num_ranks,
+             /*communicator_key=*/""},
+            ncclSum);
       };
       pool->Schedule(fn);
     }
     pool.reset();  // wait for all work to be scheduled.
 
-    LOG(ERROR) << "Verifying results for " << num_collectives_per_iteration
-               << " collectives";
+    VLOG(2) << "Verifying results for " << num_collectives_per_iteration
+            << " collectives";
     for (int i = 0; i < test_cases.size(); ++i) {
-      this->VerifyResults(strings::StrCat("collective", i),
-                          test_cases[i].get());
+      this->VerifyResults(test_cases[i].get());
     }
 
     int64 delta = Env::Default()->NowMicros() - start;
     if (delta > time_limit_micros) {
-      LOG(ERROR) << "Ran for " << delta << " quitting";
+      LOG(INFO) << "Ran for " << delta << " microsecs, now quitting";
       break;
     }
   }
 }
 
+// Test basic all-gather.
+TYPED_TEST(NcclManagerTest, BasicAllGather) {
+  const int num_ranks = 4;
+  for (int i = 0; i < num_ranks; ++i) {
+    std::unique_ptr<typename TestFixture::TestCase> test_case(
+        this->MakeGatherTestCase(/*num_nodes=*/1, num_ranks,
+                                 TensorShape({2, 3}),
+                                 TensorShape({2 * num_ranks, 3})));
+    for (int rank = 0; rank < num_ranks; ++rank) {
+      auto* device = this->GetDevice(rank);
+      VLOG(2) << "rank " << rank << " device " << device->name();
+      auto* event_mgr = device->tensorflow_gpu_device_info()->event_mgr;
+      auto* stream = device->tensorflow_gpu_device_info()->stream;
+      auto participant = absl::make_unique<NcclManager::Participant>(
+          device->executor(), stream, event_mgr, device->gpu_id(),
+          &test_case->ins[rank], &test_case->outs[rank], rank,
+          this->CreateDoneCallback(test_case.get()));
+      NcclManager::instance()->AddToAllGather(
+          std::move(participant),
+          {"allgather", /*num_local_devices=*/num_ranks,
+           /*num_global_devices=*/num_ranks, /*communicator_key=*/""});
+    }
+
+    LOG(INFO) << "Verifying results";
+    this->VerifyResults(test_case.get());
+  }
+}
+
+// Multi-node NCCL tests.
+
+TEST(NcclManagerTest, CommunicatorKey) {
+  const string communicator_key =
+      NcclManager::instance()->GenerateCommunicatorKey();
+  EXPECT_EQ(communicator_key.size(), NCCL_UNIQUE_ID_BYTES);
+}
+
+// This test creates `num_nodes` NcclManagers to simulate a multi-node
+// environment.  It works on a single node and reuse GPUs.  It enqueues NCCL ops
+// on separate stream per rank.
+TYPED_TEST(NcclManagerTest, MultiNode) {
+  const int num_nodes = 2;
+  const int num_ranks_per_node = 4;
+  const int num_global_ranks = num_nodes * num_ranks_per_node;
+  std::vector<NcclManager> nccl_managers(num_nodes);
+  const string collective_key = "allreduce";
+  // The NcclManagers in this test synchronize in real-time, so we need to run
+  // each node's code in a separate thread.
+  // Specifically, the call to ncclGroupEnd() after calling ncclCommInitRank
+  // waits for all communicators before returning.
+  thread::ThreadPool pool(Env::Default(), "test_multi_node_nccl", num_nodes);
+
+  // First, initialize the communicator_key used for this collective.
+  const string communicator_key = nccl_managers[0].GenerateCommunicatorKey();
+
+  for (int op = 0; op < 4; ++op) {
+    ncclRedOp_t reduction_op = static_cast<ncclRedOp_t>(op);
+    std::unique_ptr<typename TestFixture::TestCase> test_case(
+        this->MakeReductionTestCase(num_nodes, num_ranks_per_node, reduction_op,
+                                    TensorShape({2, 3}), 0.0f));
+    for (int node = 0; node < num_nodes; ++node) {
+      auto node_fn = [this, node, &nccl_managers, &communicator_key,
+                      &collective_key, reduction_op, &test_case] {
+        for (int local_rank = 0; local_rank < num_ranks_per_node;
+             ++local_rank) {
+          auto* device = this->GetDevice(local_rank);
+          auto* event_mgr = device->tensorflow_gpu_device_info()->event_mgr;
+          auto* stream = device->tensorflow_gpu_device_info()->stream;
+          const int global_rank = node * num_ranks_per_node + local_rank;
+          auto participant = absl::make_unique<NcclManager::Participant>(
+              device->executor(), stream, event_mgr, device->gpu_id(),
+              &test_case->ins[global_rank], &test_case->outs[global_rank],
+              global_rank, this->CreateDoneCallback(test_case.get()));
+          nccl_managers[node].AddToAllReduce(
+              std::move(participant),
+              {collective_key, num_ranks_per_node, num_global_ranks,
+               communicator_key},
+              reduction_op);
+          VLOG(1) << "AddToAllReduce node " << node << " global_rank "
+                  << global_rank;
+        }
+
+        // Signal collective ready to launch at this node.
+        nccl_managers[node].SignalMultiNodeReady(collective_key);
+      };
+      pool.Schedule(node_fn);
+    }
+
+    VLOG(2) << "Verifying results";
+    this->VerifyResults(test_case.get());
+  }
+}
+
+// Checks that we return error status if a collective_key is used for different
+// types of collectives, e.g. a reduction and a broadcast.
+TYPED_TEST(NcclManagerTest, ConsistentCollectiveType) {
+  const int num_ranks = 2;
+
+  std::unique_ptr<typename TestFixture::TestCase> test_case(
+      this->MakeReductionTestCase(1 /* num_nodes */, num_ranks, ncclSum,
+                                  TensorShape({2, 3}), 0.0f));
+  for (int rank = 0; rank < num_ranks; ++rank) {
+    auto* device = this->GetDevice(rank);
+    auto* event_mgr = device->tensorflow_gpu_device_info()->event_mgr;
+    auto* stream = device->tensorflow_gpu_device_info()->stream;
+    auto participant = absl::make_unique<NcclManager::Participant>(
+        device->executor(), stream, event_mgr, device->gpu_id(),
+        &test_case->ins[rank], &test_case->outs[rank], /*global_rank=*/-1,
+        this->CreateDoneCallback(test_case.get()));
+    if (rank == 0) {
+      NcclManager::instance()->AddToAllReduce(std::move(participant),
+                                              {"bad_coll_type",
+                                               /*num_local_devices=*/num_ranks,
+                                               /*num_global_devices=*/num_ranks,
+                                               /*communicator_key=*/""},
+                                              ncclSum);
+    } else {
+      NcclManager::instance()->AddBroadcastSend(
+          std::move(participant), {"bad_coll_type",
+                                   /*num_local_devices=*/num_ranks,
+                                   /*num_global_devices=*/num_ranks,
+                                   /*communicator_key=*/""});
+    }
+  }
+
+  this->VerifyError(test_case.get());
+}
+
+// Checks that we return error status if different communicator_key is passed to
+// same collective.
+TYPED_TEST(NcclManagerTest, ConsistentCommunicatorKey) {
+  const int num_ranks = 2;
+
+  std::unique_ptr<typename TestFixture::TestCase> test_case(
+      this->MakeReductionTestCase(1 /* num_nodes */, num_ranks, ncclSum,
+                                  TensorShape({2, 3}), 0.0f));
+  for (int rank = 0; rank < num_ranks; ++rank) {
+    auto* device = this->GetDevice(rank);
+    auto* event_mgr = device->tensorflow_gpu_device_info()->event_mgr;
+    auto* stream = device->tensorflow_gpu_device_info()->stream;
+    auto participant = absl::make_unique<NcclManager::Participant>(
+        device->executor(), stream, event_mgr, device->gpu_id(),
+        &test_case->ins[rank], &test_case->outs[rank], /*global_rank=*/-1,
+        this->CreateDoneCallback(test_case.get()));
+    NcclManager::instance()->AddToAllReduce(
+        std::move(participant),
+        {"bad_coll_type",
+         /*num_local_devices=*/num_ranks,
+         /*num_global_devices=*/num_ranks,
+         rank == 0 ? "" : NcclManager::instance()->GenerateCommunicatorKey()},
+        ncclSum);
+  }
+
+  this->VerifyError(test_case.get());
+}
+
+// Checks that we return error status if the number of devices is inconsistent
+// across multiple participants of a collective.
+TYPED_TEST(NcclManagerTest, ConsistentNumberOfDevices) {
+  const int num_ranks = 2;
+
+  std::unique_ptr<typename TestFixture::TestCase> test_case(
+      this->MakeReductionTestCase(1 /* num_nodes */, num_ranks, ncclSum,
+                                  TensorShape({2, 3}), 0.0f));
+  for (int rank = 0; rank < num_ranks; ++rank) {
+    auto* device = this->GetDevice(rank);
+    auto* event_mgr = device->tensorflow_gpu_device_info()->event_mgr;
+    auto* stream = device->tensorflow_gpu_device_info()->stream;
+    int num_devices = rank == 0 ? num_ranks : num_ranks + 1;
+    auto participant = absl::make_unique<NcclManager::Participant>(
+        device->executor(), stream, event_mgr, device->gpu_id(),
+        &test_case->ins[rank], &test_case->outs[rank], /*global_rank=*/-1,
+        this->CreateDoneCallback(test_case.get()));
+    NcclManager::instance()->AddToAllReduce(std::move(participant),
+                                            {"bad_coll_type",
+                                             /*num_local_devices=*/num_devices,
+                                             /*num_global_devices=*/num_devices,
+                                             /*communicator_key=*/""},
+                                            ncclSum);
+  }
+
+  this->VerifyError(test_case.get());
+}  // namespace tensorflow
+
 }  // namespace tensorflow
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index 281e2996ed7c2b07881d5ab564fc31463f8f8607..8b6ee870799f082378033e4535b48407b6ed4a0d 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -347,6 +347,16 @@ REGISTER_OP("Pack")
       while (index < rank) dims.push_back(c->Dim(cur, index++));
 
       c->set_output(0, c->MakeShape(dims));
+      for (int i = 0; i < c->num_inputs(); ++i) {
+        auto* shape_and_type = c->input_handle_shapes_and_types(i);
+        if (shape_and_type) {
+          if (!c->RelaxOutputHandleShapesAndMergeTypes(0, *shape_and_type)) {
+            c->set_output_handle_shapes_and_types(
+                0, std::vector<shape_inference::ShapeAndType>({}));
+            break;
+          }
+        }
+      }
       return Status::OK();
     });
 
@@ -456,47 +466,37 @@ REGISTER_OP("BroadcastTo")
     .Attr("T: type")
     .Attr("Tidx: {int32, int64} = DT_INT32")
     .SetShapeFn([](InferenceContext* c) {
-      ShapeHandle in = c->input(0);
+      ShapeHandle shape_in = c->input(1);
+      TF_RETURN_IF_ERROR(c->WithRank(shape_in, 1, &shape_in));
       ShapeHandle out;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &out));
-
       if (!c->RankKnown(out)) {
         // We have no information about the shape of the output.
         c->set_output(0, out);
         return Status::OK();
       }
 
+      ShapeHandle in = c->input(0);
       if (!c->RankKnown(in)) {
         // We have no information about the shape of the input,
         // nothing to do here.
         c->set_output(0, out);
         return Status::OK();
       }
-      if (c->Rank(out) < c->Rank(in)) {
-        return errors::InvalidArgument("Cannot broadcast a tensor with shape ",
-                                       c->DebugString(in), " shape ",
-                                       c->DebugString(out));
-      }
-
-      int32 in_offset = c->Rank(out) - c->Rank(in);
-      for (int32 i = 0; i < c->Rank(out); ++i) {
-        DimensionHandle dim = c->Dim(out, i);
-        if (c->ValueKnown(dim)) {
-          // The first in_offset dimensions for input will be expanded with 1,
-          // so no check needed.
-          if (i >= in_offset) {
-            DimensionHandle in_dim = c->Dim(in, i - in_offset);
-            if (c->ValueKnown(in_dim) && c->Value(in_dim) != 0) {
-              if (c->Value(dim) % c->Value(in_dim) != 0) {
-                return errors::InvalidArgument(
-                    "Cannot broadcast a tensor with shape ", c->DebugString(in),
-                    " shape ", c->DebugString(out));
-              }
-            }
-          }
+      int out_rank = c->Rank(out);
+      TF_RETURN_IF_ERROR(c->WithRankAtMost(in, out_rank, &in));
+      int in_rank = c->Rank(in);
+      for (int i = 0; i < in_rank; ++i) {
+        auto in_dim = c->Dim(in, in_rank - i - 1);
+        if (c->Value(in_dim) > 1) {
+          // If the input dimension is greater than 1 then the output dimension
+          // must be equal to it, since we only broadcast "from left to right".
+          auto out_dim = c->Dim(out, out_rank - i - 1);
+          TF_RETURN_IF_ERROR(c->Merge(in_dim, out_dim, &out_dim));
+          TF_RETURN_IF_ERROR(
+              c->ReplaceDim(out, out_rank - i - 1, out_dim, &out));
         }
       }
-
       c->set_output(0, out);
       return Status::OK();
     });
@@ -1034,6 +1034,12 @@ REGISTER_OP("Fill")
       ShapeHandle out;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &out));
       c->set_output(0, out);
+
+      auto* shape_and_type = c->input_handle_shapes_and_types(1);
+      if (shape_and_type) {
+        c->set_output_handle_shapes_and_types(0, *shape_and_type);
+      }
+
       return Status::OK();
     });
 
@@ -1206,27 +1212,13 @@ REGISTER_OP("Identity")
     .Input("input: T")
     .Output("output: T")
     .Attr("T: type")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      c->set_output(0, c->input(0));
-      auto* handle_data = c->input_handle_shapes_and_types(0);
-      if (handle_data != nullptr) {
-        c->set_output_handle_shapes_and_types(0, *handle_data);
-      }
-      return Status::OK();
-    });
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("Snapshot")
     .Input("input: T")
     .Output("output: T")
     .Attr("T: type")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      c->set_output(0, c->input(0));
-      auto* handle_data = c->input_handle_shapes_and_types(0);
-      if (handle_data != nullptr) {
-        c->set_output_handle_shapes_and_types(0, *handle_data);
-      }
-      return Status::OK();
-    });
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 #ifdef INTEL_MKL
 REGISTER_OP("_MklIdentity")
@@ -1235,14 +1227,7 @@ REGISTER_OP("_MklIdentity")
     .Output("output: T")
     .Output("mkl_output: uint8")
     .Attr("T: type")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      c->set_output(0, c->input(0));
-      auto* handle_data = c->input_handle_shapes_and_types(0);
-      if (handle_data != nullptr) {
-        c->set_output_handle_shapes_and_types(0, *handle_data);
-      }
-      return Status::OK();
-    })
+    .SetShapeFn(shape_inference::UnchangedShape)
     .Doc(R"Doc( Mkl implementation of IdentityOp
 )Doc");
 #endif
@@ -1626,6 +1611,11 @@ REGISTER_OP("StridedSlice")
       TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(final_shape, &out));
       c->set_output(0, out);
 
+      auto* shape_and_type = c->input_handle_shapes_and_types(0);
+      if (shape_and_type) {
+        c->set_output_handle_shapes_and_types(0, *shape_and_type);
+      }
+
       return Status::OK();
     });
 
diff --git a/tensorflow/core/ops/array_ops_test.cc b/tensorflow/core/ops/array_ops_test.cc
index 1c29cd2491fcd8d0e9d773e24e956df8212f2c7f..92648ce18876427b9c19b744f23ba787b4fff217 100644
--- a/tensorflow/core/ops/array_ops_test.cc
+++ b/tensorflow/core/ops/array_ops_test.cc
@@ -509,6 +509,33 @@ TEST(ArrayOpsTest, BroadcastArgs_ShapeFn) {
   INFER_ERROR("Shape must be rank 1 but is rank 0", op, "?;[]");
 }
 
+TEST(ArrayOpsTest, BroadcastTo_ShapeFn) {
+  ShapeInferenceTestOp op("BroadcastTo");
+  op.input_tensors.resize(2);
+
+  INFER_OK(op, "?;[?]", "?");
+  INFER_OK(op, "[];[1]", "[?]");
+  INFER_OK(op, "[1];[1]", "[?]");
+  INFER_OK(op, "[1];[2]", "[?,?]");
+  INFER_OK(op, "[2,2];[3]", "[?,d0_0,d0_1]");
+
+  // Rank checks
+  INFER_ERROR("Shape must be rank 1 but is rank 2", op, "?;[?,?]");
+  INFER_ERROR("Shape must be rank 1 but is rank 0", op, "[2];[]");
+  INFER_ERROR("Shape must be at most rank 1 but is rank 2", op, "[2,2];[1]");
+
+  Tensor shape_t(DT_INT64, TensorShape{3});
+  test::FillValues<int64>(&shape_t, {2, 10, 3});
+  op.input_tensors[1] = &shape_t;
+  INFER_OK(op, "[1,?,1];[3]", "[2,10,3]");
+  INFER_OK(op, "[1,1,1];[3]", "[2,10,3]");
+  INFER_OK(op, "[10,1];[3]", "[2,d0_0,3]");
+  INFER_ERROR("Dimensions must be equal, but are 3 and 2 for", op,
+              "[3,1,1];[3]");
+  INFER_ERROR("Dimensions must be equal, but are 2 and 10 for", op,
+              "[2,2,1];[3]");
+}
+
 TEST(ArrayOpsTest, BroadcastGradientArgs_ShapeFn) {
   ShapeInferenceTestOp op("BroadcastGradientArgs");
   // Output is always two unknown vectors.
diff --git a/tensorflow/core/ops/clustering_ops.cc b/tensorflow/core/ops/clustering_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..189f00730b4da4a548c8d738ae893ea1e346f3ef
--- /dev/null
+++ b/tensorflow/core/ops/clustering_ops.cc
@@ -0,0 +1,43 @@
+// Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License.  You may obtain a copy
+// of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+// License for the specific language governing permissions and limitations under
+// the License.
+// ==============================================================================
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+
+namespace tensorflow {
+
+REGISTER_OP("KmeansPlusPlusInitialization")
+    .Input("points: float32")
+    .Input("num_to_sample: int64")
+    .Input("seed: int64")
+    .Input("num_retries_per_sample: int64")
+    .Output("samples: float32")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+REGISTER_OP("KMC2ChainInitialization")
+    .Input("distances: float32")
+    .Input("seed: int64")
+    .Output("index: int64")
+    .SetShapeFn(shape_inference::ScalarShape);
+
+REGISTER_OP("NearestNeighbors")
+    .Input("points: float32")
+    .Input("centers: float32")
+    .Input("k: int64")
+    .Output("nearest_center_indices: int64")
+    .Output("nearest_center_distances: float32")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/collective_ops.cc b/tensorflow/core/ops/collective_ops.cc
index d6157a69df5cf535a0957df8b7ed6d4f597acd1d..06e5f14de76315eb54dfa3ad65f49d5393f8ada7 100644
--- a/tensorflow/core/ops/collective_ops.cc
+++ b/tensorflow/core/ops/collective_ops.cc
@@ -28,9 +28,21 @@ REGISTER_OP("CollectiveReduce")
     .Attr("merge_op: {'Min', 'Max', 'Mul', 'Add'}")
     .Attr("final_op: {'Id', 'Div'}")
     .Attr("subdiv_offsets: list(int)")
+    .Attr("wait_for: list(int) = []")
     .SetIsStateful()
     .SetShapeFn(shape_inference::UnchangedShape);
 
+REGISTER_OP("CollectiveGather")
+    .Input("input: T")
+    .Output("data: T")
+    .Attr("T: {float, float16, float64, int32, int64}")
+    .Attr("group_size: int")
+    .Attr("group_key: int")
+    .Attr("instance_key: int")
+    .Attr("shape: shape")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::ExplicitShape);
+
 REGISTER_OP("CollectiveBcastSend")
     .Input("input: T")
     .Output("data: T")
diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 1492741e8b3ef4aac19effb9656cf07ecffe7ff3..68bdf49118ea3af5ec72d6bbdacf68187501a6dd 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -1298,6 +1298,34 @@ op {
     type: DT_FLOAT
   }
 }
+op {
+  name: "AdjustContrastv2"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "contrast_factor"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+      }
+    }
+  }
+}
 op {
   name: "AdjustHue"
   input_arg {
@@ -1313,6 +1341,34 @@ op {
     type: DT_FLOAT
   }
 }
+op {
+  name: "AdjustHue"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "delta"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+      }
+    }
+  }
+}
 op {
   name: "AdjustSaturation"
   input_arg {
@@ -1328,6 +1384,34 @@ op {
     type: DT_FLOAT
   }
 }
+op {
+  name: "AdjustSaturation"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "scale"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+      }
+    }
+  }
+}
 op {
   name: "All"
   input_arg {
@@ -1462,6 +1546,96 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "AllToAll"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "group_assignment"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "concat_dimension"
+    type: "int"
+  }
+  attr {
+    name: "split_dimension"
+    type: "int"
+  }
+  attr {
+    name: "split_count"
+    type: "int"
+  }
+}
+op {
+  name: "AllToAll"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "group_assignment"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BOOL
+      }
+    }
+  }
+  attr {
+    name: "concat_dimension"
+    type: "int"
+  }
+  attr {
+    name: "split_dimension"
+    type: "int"
+  }
+  attr {
+    name: "split_count"
+    type: "int"
+  }
+}
 op {
   name: "Angle"
   input_arg {
@@ -12303,6 +12477,46 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "Case"
+  input_arg {
+    name: "branch_index"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "branches"
+    type: "list(func)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Cast"
   input_arg {
@@ -12540,6 +12754,55 @@ op {
     }
   }
 }
+op {
+  name: "Cholesky"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "CholeskyGrad"
+  input_arg {
+    name: "l"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "CholeskyGrad"
   input_arg {
@@ -12559,6 +12822,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -12695,6 +12959,87 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "CollectiveGather"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "group_size"
+    type: "int"
+  }
+  attr {
+    name: "group_key"
+    type: "int"
+  }
+  attr {
+    name: "instance_key"
+    type: "int"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  is_stateful: true
+}
+op {
+  name: "CollectivePermute"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "source_target_pairs"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
 op {
   name: "CollectiveReduce"
   input_arg {
@@ -12758,6 +13103,127 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "CollectiveReduce"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "group_size"
+    type: "int"
+  }
+  attr {
+    name: "group_key"
+    type: "int"
+  }
+  attr {
+    name: "instance_key"
+    type: "int"
+  }
+  attr {
+    name: "merge_op"
+    type: "string"
+    allowed_values {
+      list {
+        s: "Min"
+        s: "Max"
+        s: "Mul"
+        s: "Add"
+      }
+    }
+  }
+  attr {
+    name: "final_op"
+    type: "string"
+    allowed_values {
+      list {
+        s: "Id"
+        s: "Div"
+      }
+    }
+  }
+  attr {
+    name: "subdiv_offsets"
+    type: "list(int)"
+  }
+  attr {
+    name: "wait_for"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "CombinedNonMaxSuppression"
+  input_arg {
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "scores"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_output_size_per_class"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "max_total_size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "iou_threshold"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "score_threshold"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "nmsed_boxes"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "nmsed_scores"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "nmsed_classes"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "valid_detections"
+    type: DT_INT32
+  }
+  attr {
+    name: "pad_per_class"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "CompareAndBitpack"
   input_arg {
@@ -13320,6 +13786,35 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ConfigureDistributedTPU"
+  output_arg {
+    name: "topology"
+    type: DT_STRING
+  }
+  attr {
+    name: "embedding_config"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "tpu_embedding_config"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "is_global_init"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Conj"
   input_arg {
@@ -13632,17 +14127,13 @@ op {
   }
 }
 op {
-  name: "Conv2DBackpropFilter"
+  name: "Conv2D"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "filter_sizes"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "out_backprop"
+    name: "filter"
     type_attr: "T"
   }
   output_arg {
@@ -13655,7 +14146,9 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
@@ -13677,6 +14170,15 @@ op {
       list {
         s: "SAME"
         s: "VALID"
+        s: "EXPLICIT"
+      }
+    }
+  }
+  attr {
+    name: "explicit_paddings"
+    type: "list(int)"
+    default_value {
+      list {
       }
     }
   }
@@ -13693,6 +14195,18 @@ op {
       }
     }
   }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
 }
 op {
   name: "Conv2DBackpropFilter"
@@ -13718,7 +14232,6 @@ op {
     allowed_values {
       list {
         type: DT_HALF
-        type: DT_BFLOAT16
         type: DT_FLOAT
       }
     }
@@ -13757,18 +14270,6 @@ op {
       }
     }
   }
-  attr {
-    name: "dilations"
-    type: "list(int)"
-    default_value {
-      list {
-        i: 1
-        i: 1
-        i: 1
-        i: 1
-      }
-    }
-  }
 }
 op {
   name: "Conv2DBackpropFilter"
@@ -13796,7 +14297,6 @@ op {
         type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
@@ -13848,78 +14348,15 @@ op {
   }
 }
 op {
-  name: "Conv2DBackpropInput"
-  input_arg {
-    name: "input_sizes"
-    type: DT_INT32
-  }
+  name: "Conv2DBackpropFilter"
   input_arg {
-    name: "filter"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "out_backprop"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-      }
-    }
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-  }
-  attr {
-    name: "use_cudnn_on_gpu"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
-    }
-  }
-}
-op {
-  name: "Conv2DBackpropInput"
-  input_arg {
-    name: "input_sizes"
+    name: "filter_sizes"
     type: DT_INT32
   }
-  input_arg {
-    name: "filter"
-    type_attr: "T"
-  }
   input_arg {
     name: "out_backprop"
     type_attr: "T"
@@ -13936,6 +14373,7 @@ op {
         type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
@@ -13987,14 +14425,14 @@ op {
   }
 }
 op {
-  name: "Conv2DBackpropInput"
+  name: "Conv2DBackpropFilter"
   input_arg {
-    name: "input_sizes"
-    type: DT_INT32
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "filter"
-    type_attr: "T"
+    name: "filter_sizes"
+    type: DT_INT32
   }
   input_arg {
     name: "out_backprop"
@@ -14034,6 +14472,15 @@ op {
       list {
         s: "SAME"
         s: "VALID"
+        s: "EXPLICIT"
+      }
+    }
+  }
+  attr {
+    name: "explicit_paddings"
+    type: "list(int)"
+    default_value {
+      list {
       }
     }
   }
@@ -14064,54 +14511,17 @@ op {
   }
 }
 op {
-  name: "Conv3D"
+  name: "Conv2DBackpropInput"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "input_sizes"
+    type: DT_INT32
   }
   input_arg {
     name: "filter"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-}
-op {
-  name: "Conv3D"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
   input_arg {
-    name: "filter"
+    name: "out_backprop"
     type_attr: "T"
   }
   output_arg {
@@ -14123,16 +14533,21 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
   attr {
     name: "strides"
     type: "list(int)"
-    has_minimum: true
-    minimum: 5
+  }
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
   }
   attr {
     name: "padding"
@@ -14148,26 +14563,30 @@ op {
     name: "data_format"
     type: "string"
     default_value {
-      s: "NDHWC"
+      s: "NHWC"
     }
     allowed_values {
       list {
-        s: "NDHWC"
-        s: "NCDHW"
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
 }
 op {
-  name: "Conv3D"
+  name: "Conv2DBackpropInput"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "input_sizes"
+    type: DT_INT32
   }
   input_arg {
     name: "filter"
     type_attr: "T"
   }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
   output_arg {
     name: "output"
     type_attr: "T"
@@ -14180,15 +14599,19 @@ op {
         type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
   attr {
     name: "strides"
     type: "list(int)"
-    has_minimum: true
-    minimum: 5
+  }
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
   }
   attr {
     name: "padding"
@@ -14204,12 +14627,12 @@ op {
     name: "data_format"
     type: "string"
     default_value {
-      s: "NDHWC"
+      s: "NHWC"
     }
     allowed_values {
       list {
-        s: "NDHWC"
-        s: "NCDHW"
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
@@ -14222,16 +14645,15 @@ op {
         i: 1
         i: 1
         i: 1
-        i: 1
       }
     }
   }
 }
 op {
-  name: "Conv3DBackpropFilter"
+  name: "Conv2DBackpropInput"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "input_sizes"
+    type: DT_INT32
   }
   input_arg {
     name: "filter"
@@ -14250,6 +14672,8 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -14258,8 +14682,13 @@ op {
   attr {
     name: "strides"
     type: "list(int)"
-    has_minimum: true
-    minimum: 5
+  }
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
   }
   attr {
     name: "padding"
@@ -14271,15 +14700,37 @@ op {
       }
     }
   }
-  deprecation {
-    version: 10
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
   }
 }
 op {
-  name: "Conv3DBackpropFilter"
+  name: "Conv2DBackpropInput"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "input_sizes"
+    type: DT_INT32
   }
   input_arg {
     name: "filter"
@@ -14299,6 +14750,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -14307,8 +14759,13 @@ op {
   attr {
     name: "strides"
     type: "list(int)"
-    has_minimum: true
-    minimum: 5
+  }
+  attr {
+    name: "use_cudnn_on_gpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
   }
   attr {
     name: "padding"
@@ -14317,55 +14774,28 @@ op {
       list {
         s: "SAME"
         s: "VALID"
+        s: "EXPLICIT"
       }
     }
   }
-  deprecation {
-    version: 10
-  }
-}
-op {
-  name: "Conv3DBackpropFilter"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "filter"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "out_backprop"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
+    name: "explicit_paddings"
+    type: "list(int)"
+    default_value {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "padding"
+    name: "data_format"
     type: "string"
+    default_value {
+      s: "NHWC"
+    }
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
@@ -14378,26 +14808,18 @@ op {
         i: 1
         i: 1
         i: 1
-        i: 1
       }
     }
   }
-  deprecation {
-    version: 10
-  }
 }
 op {
-  name: "Conv3DBackpropFilterV2"
+  name: "Conv3D"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "filter_sizes"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "out_backprop"
+    name: "filter"
     type_attr: "T"
   }
   output_arg {
@@ -14432,17 +14854,13 @@ op {
   }
 }
 op {
-  name: "Conv3DBackpropFilterV2"
+  name: "Conv3D"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "filter_sizes"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "out_backprop"
+    name: "filter"
     type_attr: "T"
   }
   output_arg {
@@ -14490,17 +14908,13 @@ op {
   }
 }
 op {
-  name: "Conv3DBackpropFilterV2"
+  name: "Conv3D"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "filter_sizes"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "out_backprop"
+    name: "filter"
     type_attr: "T"
   }
   output_arg {
@@ -14563,7 +14977,7 @@ op {
   }
 }
 op {
-  name: "Conv3DBackpropInput"
+  name: "Conv3DBackpropFilter"
   input_arg {
     name: "input"
     type_attr: "T"
@@ -14611,7 +15025,7 @@ op {
   }
 }
 op {
-  name: "Conv3DBackpropInput"
+  name: "Conv3DBackpropFilter"
   input_arg {
     name: "input"
     type_attr: "T"
@@ -14660,7 +15074,7 @@ op {
   }
 }
 op {
-  name: "Conv3DBackpropInput"
+  name: "Conv3DBackpropFilter"
   input_arg {
     name: "input"
     type_attr: "T"
@@ -14722,14 +15136,14 @@ op {
   }
 }
 op {
-  name: "Conv3DBackpropInputV2"
+  name: "Conv3DBackpropFilterV2"
   input_arg {
-    name: "input_sizes"
-    type: DT_INT32
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "filter"
-    type_attr: "T"
+    name: "filter_sizes"
+    type: DT_INT32
   }
   input_arg {
     name: "out_backprop"
@@ -14767,14 +15181,14 @@ op {
   }
 }
 op {
-  name: "Conv3DBackpropInputV2"
+  name: "Conv3DBackpropFilterV2"
   input_arg {
-    name: "input_sizes"
-    type: DT_INT32
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "filter"
-    type_attr: "T"
+    name: "filter_sizes"
+    type: DT_INT32
   }
   input_arg {
     name: "out_backprop"
@@ -14825,14 +15239,349 @@ op {
   }
 }
 op {
-  name: "Conv3DBackpropInputV2"
+  name: "Conv3DBackpropFilterV2"
   input_arg {
-    name: "input_sizes"
-    type: DT_INT32
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "filter"
-    type_attr: "T"
+    name: "filter_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
+op {
+  name: "Conv3DBackpropInput"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  deprecation {
+    version: 10
+  }
+}
+op {
+  name: "Conv3DBackpropInput"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  deprecation {
+    version: 10
+  }
+}
+op {
+  name: "Conv3DBackpropInput"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+  deprecation {
+    version: 10
+  }
+}
+op {
+  name: "Conv3DBackpropInputV2"
+  input_arg {
+    name: "input_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+}
+op {
+  name: "Conv3DBackpropInputV2"
+  input_arg {
+    name: "input_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+}
+op {
+  name: "Conv3DBackpropInputV2"
+  input_arg {
+    name: "input_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
   }
   input_arg {
     name: "out_backprop"
@@ -15822,6 +16571,58 @@ op {
     }
   }
 }
+op {
+  name: "CrossReplicaSum"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "group_assignment"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "CrossReplicaSum"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "group_assignment"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_INT32
+        type: DT_UINT32
+      }
+    }
+  }
+}
 op {
   name: "CudnnRNN"
   input_arg {
@@ -16220,131 +17021,74 @@ op {
   is_stateful: true
 }
 op {
-  name: "CudnnRNNCanonicalToParams"
+  name: "CudnnRNNBackpropV3"
   input_arg {
-    name: "num_layers"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "num_units"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "input_size"
-    type: DT_INT32
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "weights"
+    name: "input_h"
     type_attr: "T"
-    number_attr: "num_params"
   }
   input_arg {
-    name: "biases"
+    name: "input_c"
     type_attr: "T"
-    number_attr: "num_params"
   }
-  output_arg {
+  input_arg {
     name: "params"
     type_attr: "T"
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
-  }
-  attr {
-    name: "num_params"
-    type: "int"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "rnn_mode"
-    type: "string"
-    default_value {
-      s: "lstm"
-    }
-    allowed_values {
-      list {
-        s: "rnn_relu"
-        s: "rnn_tanh"
-        s: "lstm"
-        s: "gru"
-      }
-    }
+  input_arg {
+    name: "sequence_lengths"
+    type: DT_INT32
   }
-  attr {
-    name: "input_mode"
-    type: "string"
-    default_value {
-      s: "linear_input"
-    }
-    allowed_values {
-      list {
-        s: "linear_input"
-        s: "skip_input"
-        s: "auto_select"
-      }
-    }
+  input_arg {
+    name: "output"
+    type_attr: "T"
   }
-  attr {
-    name: "direction"
-    type: "string"
-    default_value {
-      s: "unidirectional"
-    }
-    allowed_values {
-      list {
-        s: "unidirectional"
-        s: "bidirectional"
-      }
-    }
+  input_arg {
+    name: "output_h"
+    type_attr: "T"
   }
-  attr {
-    name: "dropout"
-    type: "float"
-    default_value {
-      f: 0
-    }
+  input_arg {
+    name: "output_c"
+    type_attr: "T"
   }
-  attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  input_arg {
+    name: "output_backprop"
+    type_attr: "T"
   }
-  attr {
-    name: "seed2"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  input_arg {
+    name: "output_h_backprop"
+    type_attr: "T"
   }
-}
-op {
-  name: "CudnnRNNParamsSize"
   input_arg {
-    name: "num_layers"
-    type: DT_INT32
+    name: "output_c_backprop"
+    type_attr: "T"
   }
   input_arg {
-    name: "num_units"
-    type: DT_INT32
+    name: "reserve_space"
+    type_attr: "T"
   }
   input_arg {
-    name: "input_size"
-    type: DT_INT32
+    name: "host_reserved"
+    type: DT_INT8
   }
   output_arg {
-    name: "params_size"
-    type_attr: "S"
+    name: "input_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "input_h_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "input_c_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "params_backprop"
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -16357,16 +17101,6 @@ op {
       }
     }
   }
-  attr {
-    name: "S"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
   attr {
     name: "rnn_mode"
     type: "string"
@@ -16430,9 +17164,10 @@ op {
       i: 0
     }
   }
+  is_stateful: true
 }
 op {
-  name: "CudnnRNNParamsToCanonical"
+  name: "CudnnRNNCanonicalToParams"
   input_arg {
     name: "num_layers"
     type: DT_INT32
@@ -16446,19 +17181,19 @@ op {
     type: DT_INT32
   }
   input_arg {
-    name: "params"
-    type_attr: "T"
-  }
-  output_arg {
     name: "weights"
     type_attr: "T"
     number_attr: "num_params"
   }
-  output_arg {
+  input_arg {
     name: "biases"
     type_attr: "T"
     number_attr: "num_params"
   }
+  output_arg {
+    name: "params"
+    type_attr: "T"
+  }
   attr {
     name: "T"
     type: "type"
@@ -16541,42 +17276,379 @@ op {
   }
 }
 op {
-  name: "CudnnRNNV2"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
+  name: "CudnnRNNParamsSize"
   input_arg {
-    name: "input_h"
-    type_attr: "T"
+    name: "num_layers"
+    type: DT_INT32
   }
   input_arg {
-    name: "input_c"
-    type_attr: "T"
+    name: "num_units"
+    type: DT_INT32
   }
   input_arg {
-    name: "params"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output_h"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output_c"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "reserve_space"
-    type_attr: "T"
+    name: "input_size"
+    type: DT_INT32
   }
   output_arg {
-    name: "host_reserved"
-    type: DT_INT8
+    name: "params_size"
+    type_attr: "S"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "S"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "rnn_mode"
+    type: "string"
+    default_value {
+      s: "lstm"
+    }
+    allowed_values {
+      list {
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
+      }
+    }
+  }
+  attr {
+    name: "input_mode"
+    type: "string"
+    default_value {
+      s: "linear_input"
+    }
+    allowed_values {
+      list {
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
+      }
+    }
+  }
+  attr {
+    name: "direction"
+    type: "string"
+    default_value {
+      s: "unidirectional"
+    }
+    allowed_values {
+      list {
+        s: "unidirectional"
+        s: "bidirectional"
+      }
+    }
+  }
+  attr {
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+}
+op {
+  name: "CudnnRNNParamsToCanonical"
+  input_arg {
+    name: "num_layers"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "num_units"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "input_size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "params"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "weights"
+    type_attr: "T"
+    number_attr: "num_params"
+  }
+  output_arg {
+    name: "biases"
+    type_attr: "T"
+    number_attr: "num_params"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "num_params"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "rnn_mode"
+    type: "string"
+    default_value {
+      s: "lstm"
+    }
+    allowed_values {
+      list {
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
+      }
+    }
+  }
+  attr {
+    name: "input_mode"
+    type: "string"
+    default_value {
+      s: "linear_input"
+    }
+    allowed_values {
+      list {
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
+      }
+    }
+  }
+  attr {
+    name: "direction"
+    type: "string"
+    default_value {
+      s: "unidirectional"
+    }
+    allowed_values {
+      list {
+        s: "unidirectional"
+        s: "bidirectional"
+      }
+    }
+  }
+  attr {
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+}
+op {
+  name: "CudnnRNNV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_h"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_c"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "params"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_h"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_c"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "reserve_space"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "host_reserved"
+    type: DT_INT8
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "rnn_mode"
+    type: "string"
+    default_value {
+      s: "lstm"
+    }
+    allowed_values {
+      list {
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
+      }
+    }
+  }
+  attr {
+    name: "input_mode"
+    type: "string"
+    default_value {
+      s: "linear_input"
+    }
+    allowed_values {
+      list {
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
+      }
+    }
+  }
+  attr {
+    name: "direction"
+    type: "string"
+    default_value {
+      s: "unidirectional"
+    }
+    allowed_values {
+      list {
+        s: "unidirectional"
+        s: "bidirectional"
+      }
+    }
+  }
+  attr {
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "is_training"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "CudnnRNNV3"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_h"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_c"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "params"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sequence_lengths"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_h"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_c"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "reserve_space"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "host_reserved"
+    type: DT_INT8
   }
   attr {
     name: "T"
@@ -17299,6 +18371,30 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "DatasetToSingleElement"
+  input_arg {
+    name: "dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "DebugGradientIdentity"
   input_arg {
@@ -18336,6 +19432,43 @@ op {
     }
   }
 }
+op {
+  name: "DecodeRaw"
+  input_arg {
+    name: "bytes"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT16
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  attr {
+    name: "little_endian"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
 op {
   name: "DecodeWav"
   input_arg {
@@ -21048,6 +22181,124 @@ op {
     type: DT_STRING
   }
 }
+op {
+  name: "EnqueueTPUEmbeddingIntegerBatch"
+  input_arg {
+    name: "batch"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  input_arg {
+    name: "mode_override"
+    type: DT_STRING
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "EnqueueTPUEmbeddingSparseBatch"
+  input_arg {
+    name: "sample_indices"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  input_arg {
+    name: "embedding_indices"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  input_arg {
+    name: "aggregation_weights"
+    type: DT_FLOAT
+    number_attr: "N"
+  }
+  input_arg {
+    name: "mode_override"
+    type: DT_STRING
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "combiners"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "EnqueueTPUEmbeddingSparseTensorBatch"
+  input_arg {
+    name: "sample_indices"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  input_arg {
+    name: "embedding_indices"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  input_arg {
+    name: "aggregation_weights"
+    type: DT_FLOAT
+    number_attr: "N"
+  }
+  input_arg {
+    name: "mode_override"
+    type: DT_STRING
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "combiners"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "table_ids"
+    type: "list(int)"
+  }
+  is_stateful: true
+}
 op {
   name: "EnsureShape"
   input_arg {
@@ -21355,6 +22606,66 @@ op {
     }
   }
 }
+op {
+  name: "EuclideanNorm"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "Exit"
   input_arg {
@@ -21595,6 +22906,83 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalChooseFastestDataset"
+  input_arg {
+    name: "input_datasets"
+    type: DT_VARIANT
+    number_attr: "N"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 2
+  }
+  attr {
+    name: "num_experiments"
+    type: "int"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalChooseFastestDataset"
+  input_arg {
+    name: "input_datasets"
+    type: DT_VARIANT
+    number_attr: "N"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 2
+  }
+  attr {
+    name: "num_experiments"
+    type: "int"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ExperimentalDatasetCardinality"
   input_arg {
@@ -21621,6 +23009,22 @@ op {
     type: DT_STRING
   }
 }
+op {
+  name: "ExperimentalDatasetToTFRecord"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "compression_type"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
 op {
   name: "ExperimentalDenseToSparseBatchDataset"
   input_arg {
@@ -22704,6 +24108,33 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalRebatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "num_workers"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ExperimentalScanDataset"
   input_arg {
@@ -22967,6 +24398,42 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalTakeWhileDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "predicate"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ExperimentalThreadPoolDataset"
   input_arg {
@@ -23192,6 +24659,53 @@ op {
     }
   }
 }
+op {
+  name: "ExtractGlimpse"
+  input_arg {
+    name: "input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "offsets"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "glimpse"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "centered"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "normalized"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "uniform_noise"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "noise"
+    type: "string"
+    default_value {
+      s: "uniform"
+    }
+  }
+}
 op {
   name: "ExtractImagePatches"
   input_arg {
@@ -28398,6 +29912,108 @@ op {
     }
   }
 }
+op {
+  name: "InfeedDequeue"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  is_stateful: true
+}
+op {
+  name: "InfeedDequeueTuple"
+  output_arg {
+    name: "outputs"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+  }
+  is_stateful: true
+}
+op {
+  name: "InfeedEnqueue"
+  input_arg {
+    name: "input"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+    default_value {
+      shape {
+      }
+    }
+  }
+  attr {
+    name: "layout"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "InfeedEnqueueTuple"
+  input_arg {
+    name: "inputs"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+  }
+  attr {
+    name: "layouts"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "InitializeTable"
   input_arg {
@@ -29600,6 +31216,44 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "KMC2ChainInitialization"
+  input_arg {
+    name: "distances"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "index"
+    type: DT_INT64
+  }
+}
+op {
+  name: "KmeansPlusPlusInitialization"
+  input_arg {
+    name: "points"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "num_to_sample"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_retries_per_sample"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "samples"
+    type: DT_FLOAT
+  }
+}
 op {
   name: "L2Loss"
   input_arg {
@@ -30705,1530 +32359,1365 @@ op {
   is_stateful: true
 }
 op {
-  name: "Log"
+  name: "LoadTPUEmbeddingADAMParameters"
   input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "y"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+    name: "parameters"
+    type: DT_FLOAT
   }
-}
-op {
-  name: "Log"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "momenta"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "y"
-    type_attr: "T"
+  input_arg {
+    name: "velocities"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
     }
+    has_minimum: true
+    minimum: -1
   }
-}
-op {
-  name: "Log"
-  input_arg {
-    name: "x"
-    type_attr: "T"
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
-  output_arg {
-    name: "y"
-    type_attr: "T"
+  attr {
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_BFLOAT16
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
 }
 op {
-  name: "Log1p"
+  name: "LoadTPUEmbeddingADAMParametersGradAccumDebug"
   input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "y"
-    type_attr: "T"
+    name: "parameters"
+    type: DT_FLOAT
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+  input_arg {
+    name: "momenta"
+    type: DT_FLOAT
   }
-}
-op {
-  name: "Log1p"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "velocities"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "y"
-    type_attr: "T"
+  input_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
     }
+    has_minimum: true
+    minimum: -1
   }
-}
-op {
-  name: "Log1p"
-  input_arg {
-    name: "x"
-    type_attr: "T"
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
-  output_arg {
-    name: "y"
-    type_attr: "T"
+  attr {
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_BFLOAT16
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
 }
 op {
-  name: "LogMatrixDeterminant"
+  name: "LoadTPUEmbeddingAdadeltaParameters"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "parameters"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "sign"
-    type_attr: "T"
+  input_arg {
+    name: "accumulators"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "log_abs_determinant"
-    type_attr: "T"
+  input_arg {
+    name: "updates"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
     }
-  }
-}
-op {
-  name: "LogSoftmax"
-  input_arg {
-    name: "logits"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "logsoftmax"
-    type_attr: "T"
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
-}
-op {
-  name: "LogSoftmax"
-  input_arg {
-    name: "logits"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "logsoftmax"
-    type_attr: "T"
+  attr {
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
 }
 op {
-  name: "LogUniformCandidateSampler"
+  name: "LoadTPUEmbeddingAdadeltaParametersGradAccumDebug"
   input_arg {
-    name: "true_classes"
-    type: DT_INT64
+    name: "parameters"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "sampled_candidates"
-    type: DT_INT64
+  input_arg {
+    name: "accumulators"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "true_expected_count"
+  input_arg {
+    name: "updates"
     type: DT_FLOAT
   }
-  output_arg {
-    name: "sampled_expected_count"
+  input_arg {
+    name: "gradient_accumulators"
     type: DT_FLOAT
   }
   attr {
-    name: "num_true"
+    name: "table_id"
     type: "int"
+    default_value {
+      i: -1
+    }
     has_minimum: true
-    minimum: 1
+    minimum: -1
   }
   attr {
-    name: "num_sampled"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "unique"
-    type: "bool"
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "range_max"
+    name: "shard_id"
     type: "int"
-    has_minimum: true
-    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "LoadTPUEmbeddingAdagradParameters"
+  input_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "seed"
+    name: "table_id"
     type: "int"
     default_value {
-      i: 0
+      i: -1
     }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "seed2"
-    type: "int"
+    name: "table_name"
+    type: "string"
     default_value {
-      i: 0
+      s: ""
     }
   }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
 }
 op {
-  name: "LogUniformCandidateSampler"
+  name: "LoadTPUEmbeddingAdagradParametersGradAccumDebug"
   input_arg {
-    name: "true_classes"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "sampled_candidates"
-    type: DT_INT64
+    name: "parameters"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "true_expected_count"
+  input_arg {
+    name: "accumulators"
     type: DT_FLOAT
   }
-  output_arg {
-    name: "sampled_expected_count"
+  input_arg {
+    name: "gradient_accumulators"
     type: DT_FLOAT
   }
   attr {
-    name: "num_true"
+    name: "table_id"
     type: "int"
+    default_value {
+      i: -1
+    }
     has_minimum: true
-    minimum: 1
+    minimum: -1
   }
   attr {
-    name: "num_sampled"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "unique"
-    type: "bool"
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "range_max"
+    name: "shard_id"
     type: "int"
-    has_minimum: true
-    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "LoadTPUEmbeddingCenteredRMSPropParameters"
+  input_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "ms"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "mom"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "mg"
+    type: DT_FLOAT
   }
   attr {
-    name: "seed"
+    name: "table_id"
     type: "int"
     default_value {
-      i: 0
+      i: -1
     }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "seed2"
-    type: "int"
+    name: "table_name"
+    type: "string"
     default_value {
-      i: 0
+      s: ""
     }
   }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
   is_stateful: true
 }
 op {
-  name: "LogicalAnd"
+  name: "LoadTPUEmbeddingFTRLParameters"
   input_arg {
-    name: "x"
-    type: DT_BOOL
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "y"
-    type: DT_BOOL
-  }
-  output_arg {
-    name: "z"
-    type: DT_BOOL
+    name: "accumulators"
+    type: DT_FLOAT
   }
-  is_commutative: true
-}
-op {
-  name: "LogicalNot"
   input_arg {
-    name: "x"
-    type: DT_BOOL
+    name: "linears"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "y"
-    type: DT_BOOL
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
 }
 op {
-  name: "LogicalOr"
+  name: "LoadTPUEmbeddingFTRLParametersGradAccumDebug"
   input_arg {
-    name: "x"
-    type: DT_BOOL
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "y"
-    type: DT_BOOL
+    name: "accumulators"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "z"
-    type: DT_BOOL
+  input_arg {
+    name: "linears"
+    type: DT_FLOAT
   }
-  is_commutative: true
-}
-op {
-  name: "LookupTableExport"
   input_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "keys"
-    type_attr: "Tkeys"
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
-  output_arg {
-    name: "values"
-    type_attr: "Tvalues"
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "Tkeys"
-    type: "type"
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "Tvalues"
-    type: "type"
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
 }
 op {
-  name: "LookupTableExportV2"
+  name: "LoadTPUEmbeddingMDLAdagradLightParameters"
   input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+    name: "parameters"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "keys"
-    type_attr: "Tkeys"
+  input_arg {
+    name: "accumulators"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "values"
-    type_attr: "Tvalues"
+  input_arg {
+    name: "weights"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "benefits"
+    type: DT_FLOAT
   }
   attr {
-    name: "Tkeys"
-    type: "type"
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "Tvalues"
-    type: "type"
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "LookupTableFind"
+  name: "LoadTPUEmbeddingMomentumParameters"
   input_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "keys"
-    type_attr: "Tin"
+    name: "momenta"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "default_value"
-    type_attr: "Tout"
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
-  output_arg {
-    name: "values"
-    type_attr: "Tout"
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "Tin"
-    type: "type"
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "Tout"
-    type: "type"
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
 }
 op {
-  name: "LookupTableFindV2"
+  name: "LoadTPUEmbeddingMomentumParametersGradAccumDebug"
   input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "keys"
-    type_attr: "Tin"
+    name: "momenta"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "default_value"
-    type_attr: "Tout"
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "values"
-    type_attr: "Tout"
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "Tin"
-    type: "type"
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "Tout"
-    type: "type"
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "LookupTableImport"
+  name: "LoadTPUEmbeddingProximalAdagradParameters"
   input_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "keys"
-    type_attr: "Tin"
+    name: "accumulators"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "values"
-    type_attr: "Tout"
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "Tin"
-    type: "type"
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "Tout"
-    type: "type"
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
 }
 op {
-  name: "LookupTableImportV2"
+  name: "LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug"
   input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "keys"
-    type_attr: "Tin"
+    name: "accumulators"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "values"
-    type_attr: "Tout"
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "Tin"
-    type: "type"
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "Tout"
-    type: "type"
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "LookupTableInsert"
+  name: "LoadTPUEmbeddingRMSPropParameters"
   input_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "keys"
-    type_attr: "Tin"
+    name: "ms"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "values"
-    type_attr: "Tout"
+    name: "mom"
+    type: DT_FLOAT
   }
   attr {
-    name: "Tin"
-    type: "type"
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "Tout"
-    type: "type"
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
 }
 op {
-  name: "LookupTableInsertV2"
+  name: "LoadTPUEmbeddingRMSPropParametersGradAccumDebug"
   input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "keys"
-    type_attr: "Tin"
+    name: "ms"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "values"
-    type_attr: "Tout"
+    name: "mom"
+    type: DT_FLOAT
   }
-  attr {
-    name: "Tin"
-    type: "type"
+  input_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "Tout"
-    type: "type"
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
-  is_stateful: true
-}
-op {
-  name: "LookupTableRemoveV2"
-  input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
-  input_arg {
-    name: "keys"
-    type_attr: "Tin"
+  attr {
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "Tin"
-    type: "type"
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "LookupTableSize"
+  name: "LoadTPUEmbeddingStochasticGradientDescentParameters"
   input_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "parameters"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "size"
-    type: DT_INT64
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
-}
-op {
-  name: "LookupTableSizeV2"
-  input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
-  output_arg {
-    name: "size"
-    type: DT_INT64
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "LoopCond"
+  name: "Log"
   input_arg {
-    name: "input"
-    type: DT_BOOL
+    name: "x"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type: DT_BOOL
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
   }
 }
 op {
-  name: "LowerBound"
-  input_arg {
-    name: "sorted_inputs"
-    type_attr: "T"
-  }
+  name: "Log"
   input_arg {
-    name: "values"
+    name: "x"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type_attr: "out_type"
+    name: "y"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "out_type"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "Lu"
+  name: "Log"
   input_arg {
-    name: "input"
+    name: "x"
     type_attr: "T"
   }
   output_arg {
-    name: "lu"
+    name: "y"
     type_attr: "T"
   }
-  output_arg {
-    name: "p"
-    type_attr: "output_idx_type"
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_DOUBLE
+        type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_COMPLEX64
         type: DT_COMPLEX128
       }
     }
   }
+}
+op {
+  name: "Log1p"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
   attr {
-    name: "output_idx_type"
+    name: "T"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "MakeIterator"
-  input_arg {
-    name: "dataset"
-    type: DT_VARIANT
-  }
+  name: "Log1p"
   input_arg {
-    name: "iterator"
-    type: DT_RESOURCE
+    name: "x"
+    type_attr: "T"
   }
-  is_stateful: true
-}
-op {
-  name: "MapClear"
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+  output_arg {
+    name: "y"
+    type_attr: "T"
   }
   attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
     }
-    has_minimum: true
   }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
+}
+op {
+  name: "Log1p"
+  input_arg {
+    name: "x"
+    type_attr: "T"
   }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  output_arg {
+    name: "y"
+    type_attr: "T"
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "MapDataset"
+  name: "LogMatrixDeterminant"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "input"
+    type_attr: "T"
   }
-  input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+  output_arg {
+    name: "sign"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "log_abs_determinant"
+    type_attr: "T"
   }
   attr {
-    name: "f"
-    type: "func"
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
   }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
+}
+op {
+  name: "LogMatrixDeterminant"
+  input_arg {
+    name: "input"
+    type_attr: "T"
   }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  output_arg {
+    name: "sign"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "log_abs_determinant"
+    type_attr: "T"
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
   }
-  is_stateful: true
 }
 op {
-  name: "MapDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
+  name: "LogSoftmax"
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+    name: "logits"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "logsoftmax"
+    type_attr: "T"
   }
   attr {
-    name: "f"
-    type: "func"
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
+}
+op {
+  name: "LogSoftmax"
+  input_arg {
+    name: "logits"
+    type_attr: "T"
   }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  output_arg {
+    name: "logsoftmax"
+    type_attr: "T"
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
 }
 op {
-  name: "MapDataset"
+  name: "LogUniformCandidateSampler"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "true_classes"
+    type: DT_INT64
   }
-  input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+  output_arg {
+    name: "sampled_candidates"
+    type: DT_INT64
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "true_expected_count"
+    type: DT_FLOAT
   }
-  attr {
-    name: "f"
-    type: "func"
+  output_arg {
+    name: "sampled_expected_count"
+    type: DT_FLOAT
   }
   attr {
-    name: "Targuments"
-    type: "list(type)"
+    name: "num_true"
+    type: "int"
     has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
+    name: "num_sampled"
+    type: "int"
     has_minimum: true
     minimum: 1
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
+    name: "unique"
+    type: "bool"
+  }
+  attr {
+    name: "range_max"
+    type: "int"
     has_minimum: true
     minimum: 1
   }
   attr {
-    name: "use_inter_op_parallelism"
-    type: "bool"
+    name: "seed"
+    type: "int"
     default_value {
-      b: true
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
     }
   }
 }
 op {
-  name: "MapDataset"
+  name: "LogUniformCandidateSampler"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "true_classes"
+    type: DT_INT64
   }
-  input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+  output_arg {
+    name: "sampled_candidates"
+    type: DT_INT64
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "true_expected_count"
+    type: DT_FLOAT
   }
-  attr {
-    name: "f"
-    type: "func"
+  output_arg {
+    name: "sampled_expected_count"
+    type: DT_FLOAT
   }
   attr {
-    name: "Targuments"
-    type: "list(type)"
+    name: "num_true"
+    type: "int"
     has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
+    name: "num_sampled"
+    type: "int"
     has_minimum: true
     minimum: 1
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
+    name: "unique"
+    type: "bool"
+  }
+  attr {
+    name: "range_max"
+    type: "int"
     has_minimum: true
     minimum: 1
   }
   attr {
-    name: "use_inter_op_parallelism"
-    type: "bool"
+    name: "seed"
+    type: "int"
     default_value {
-      b: true
+      i: 0
     }
   }
   attr {
-    name: "preserve_cardinality"
-    type: "bool"
+    name: "seed2"
+    type: "int"
     default_value {
-      b: false
+      i: 0
     }
   }
+  is_stateful: true
 }
 op {
-  name: "MapDefun"
+  name: "LogicalAnd"
   input_arg {
-    name: "arguments"
-    type_list_attr: "Targuments"
-  }
-  output_arg {
-    name: "output"
-    type_list_attr: "output_types"
+    name: "x"
+    type: DT_BOOL
   }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "y"
+    type: DT_BOOL
   }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  output_arg {
+    name: "z"
+    type: DT_BOOL
   }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+  is_commutative: true
+}
+op {
+  name: "LogicalNot"
+  input_arg {
+    name: "x"
+    type: DT_BOOL
   }
-  attr {
-    name: "f"
-    type: "func"
+  output_arg {
+    name: "y"
+    type: DT_BOOL
   }
 }
 op {
-  name: "MapDefun"
+  name: "LogicalOr"
   input_arg {
-    name: "arguments"
-    type_list_attr: "Targuments"
+    name: "x"
+    type: DT_BOOL
   }
   input_arg {
-    name: "captured_inputs"
-    type_list_attr: "Tcaptured"
+    name: "y"
+    type: DT_BOOL
   }
   output_arg {
-    name: "output"
-    type_list_attr: "output_types"
+    name: "z"
+    type: DT_BOOL
   }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  is_commutative: true
+}
+op {
+  name: "LookupTableExport"
+  input_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
   }
-  attr {
-    name: "Tcaptured"
-    type: "list(type)"
-    default_value {
-      list {
-      }
-    }
-    has_minimum: true
+  output_arg {
+    name: "keys"
+    type_attr: "Tkeys"
   }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  output_arg {
+    name: "values"
+    type_attr: "Tvalues"
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "Tkeys"
+    type: "type"
   }
   attr {
-    name: "f"
-    type: "func"
+    name: "Tvalues"
+    type: "type"
   }
 }
 op {
-  name: "MapIncompleteSize"
-  output_arg {
-    name: "size"
-    type: DT_INT32
-  }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+  name: "LookupTableExportV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+  output_arg {
+    name: "keys"
+    type_attr: "Tkeys"
   }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
+  output_arg {
+    name: "values"
+    type_attr: "Tvalues"
   }
   attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "Tkeys"
+    type: "type"
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "Tvalues"
+    type: "type"
   }
   is_stateful: true
 }
 op {
-  name: "MapPeek"
+  name: "LookupTableFind"
   input_arg {
-    name: "key"
-    type: DT_INT64
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
   }
   input_arg {
-    name: "indices"
-    type: DT_INT32
+    name: "keys"
+    type_attr: "Tin"
   }
-  output_arg {
-    name: "values"
-    type_list_attr: "dtypes"
-  }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  input_arg {
+    name: "default_value"
+    type_attr: "Tout"
   }
-  is_stateful: true
-}
-op {
-  name: "MapSize"
   output_arg {
-    name: "size"
-    type: DT_INT32
-  }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
+    name: "values"
+    type_attr: "Tout"
   }
   attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "Tin"
+    type: "type"
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "Tout"
+    type: "type"
   }
-  is_stateful: true
 }
 op {
-  name: "MapStage"
+  name: "LookupTableFindV2"
   input_arg {
-    name: "key"
-    type: DT_INT64
+    name: "table_handle"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "indices"
-    type: DT_INT32
+    name: "keys"
+    type_attr: "Tin"
   }
   input_arg {
-    name: "values"
-    type_list_attr: "fake_dtypes"
-  }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
+    name: "default_value"
+    type_attr: "Tout"
   }
-  attr {
-    name: "fake_dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  output_arg {
+    name: "values"
+    type_attr: "Tout"
   }
   attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "Tin"
+    type: "type"
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "Tout"
+    type: "type"
   }
   is_stateful: true
 }
 op {
-  name: "MapUnstage"
+  name: "LookupTableImport"
   input_arg {
-    name: "key"
-    type: DT_INT64
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
   }
   input_arg {
-    name: "indices"
-    type: DT_INT32
+    name: "keys"
+    type_attr: "Tin"
   }
-  output_arg {
+  input_arg {
     name: "values"
-    type_list_attr: "dtypes"
-  }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    type_attr: "Tout"
   }
   attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "Tin"
+    type: "type"
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "Tout"
+    type: "type"
   }
-  is_stateful: true
 }
 op {
-  name: "MapUnstageNoKey"
+  name: "LookupTableImportV2"
   input_arg {
-    name: "indices"
-    type: DT_INT32
+    name: "table_handle"
+    type: DT_RESOURCE
   }
-  output_arg {
-    name: "key"
-    type: DT_INT64
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
   }
-  output_arg {
+  input_arg {
     name: "values"
-    type_list_attr: "dtypes"
-  }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    type_attr: "Tout"
   }
   attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "Tin"
+    type: "type"
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "Tout"
+    type: "type"
   }
   is_stateful: true
 }
 op {
-  name: "MatMul"
+  name: "LookupTableInsert"
   input_arg {
-    name: "a"
-    type_attr: "T"
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
   }
   input_arg {
-    name: "b"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "product"
-    type_attr: "T"
+    name: "keys"
+    type_attr: "Tin"
   }
-  attr {
-    name: "transpose_a"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  input_arg {
+    name: "values"
+    type_attr: "Tout"
   }
   attr {
-    name: "transpose_b"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "Tin"
+    type: "type"
   }
   attr {
-    name: "T"
+    name: "Tout"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
   }
 }
 op {
-  name: "MatMul"
+  name: "LookupTableInsertV2"
   input_arg {
-    name: "a"
-    type_attr: "T"
+    name: "table_handle"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "b"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "product"
-    type_attr: "T"
+    name: "keys"
+    type_attr: "Tin"
   }
-  attr {
-    name: "transpose_a"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  input_arg {
+    name: "values"
+    type_attr: "Tout"
   }
   attr {
-    name: "transpose_b"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "Tin"
+    type: "type"
   }
   attr {
-    name: "T"
+    name: "Tout"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
   }
+  is_stateful: true
 }
 op {
-  name: "MatMul"
+  name: "LookupTableRemoveV2"
   input_arg {
-    name: "a"
-    type_attr: "T"
+    name: "table_handle"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "b"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "product"
-    type_attr: "T"
-  }
-  attr {
-    name: "transpose_a"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "transpose_b"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "keys"
+    type_attr: "Tin"
   }
   attr {
-    name: "T"
+    name: "Tin"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_BFLOAT16
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
   }
+  is_stateful: true
 }
 op {
-  name: "MatMul"
-  input_arg {
-    name: "a"
-    type_attr: "T"
-  }
+  name: "LookupTableSize"
   input_arg {
-    name: "b"
-    type_attr: "T"
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
   }
   output_arg {
-    name: "product"
-    type_attr: "T"
-  }
-  attr {
-    name: "transpose_a"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "transpose_b"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_BFLOAT16
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+    name: "size"
+    type: DT_INT64
   }
 }
 op {
-  name: "MatchingFiles"
+  name: "LookupTableSizeV2"
   input_arg {
-    name: "pattern"
-    type: DT_STRING
+    name: "table_handle"
+    type: DT_RESOURCE
   }
   output_arg {
-    name: "filenames"
-    type: DT_STRING
+    name: "size"
+    type: DT_INT64
   }
+  is_stateful: true
 }
 op {
-  name: "MatrixBandPart"
+  name: "LoopCond"
   input_arg {
     name: "input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "num_lower"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "num_upper"
-    type: DT_INT64
+    type: DT_BOOL
   }
   output_arg {
-    name: "band"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
+    name: "output"
+    type: DT_BOOL
   }
 }
 op {
-  name: "MatrixBandPart"
+  name: "LowerBound"
   input_arg {
-    name: "input"
+    name: "sorted_inputs"
     type_attr: "T"
   }
   input_arg {
-    name: "num_lower"
-    type_attr: "Tindex"
-  }
-  input_arg {
-    name: "num_upper"
-    type_attr: "Tindex"
+    name: "values"
+    type_attr: "T"
   }
   output_arg {
-    name: "band"
-    type_attr: "T"
+    name: "output"
+    type_attr: "out_type"
   }
   attr {
     name: "T"
     type: "type"
   }
   attr {
-    name: "Tindex"
+    name: "out_type"
     type: "type"
     default_value {
-      type: DT_INT64
+      type: DT_INT32
     }
     allowed_values {
       list {
@@ -32239,290 +33728,246 @@ op {
   }
 }
 op {
-  name: "MatrixDeterminant"
+  name: "Lu"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "lu"
     type_attr: "T"
   }
+  output_arg {
+    name: "p"
+    type_attr: "output_idx_type"
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  attr {
+    name: "output_idx_type"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "MatrixDeterminant"
+  name: "Lu"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "lu"
     type_attr: "T"
   }
+  output_arg {
+    name: "p"
+    type_attr: "output_idx_type"
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
         type: DT_COMPLEX64
         type: DT_COMPLEX128
       }
     }
   }
-}
-op {
-  name: "MatrixDiag"
-  input_arg {
-    name: "diagonal"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-  }
-}
-op {
-  name: "MatrixDiagPart"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "diagonal"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-  }
-}
-op {
-  name: "MatrixExponential"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
+    name: "output_idx_type"
     type: "type"
+    default_value {
+      type: DT_INT32
+    }
     allowed_values {
       list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "MatrixExponential"
+  name: "MakeIterator"
   input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+    name: "dataset"
+    type: DT_VARIANT
   }
-  deprecation {
-    version: 27
+  input_arg {
+    name: "iterator"
+    type: DT_RESOURCE
   }
+  is_stateful: true
 }
 op {
-  name: "MatrixInverse"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
+  name: "MapClear"
   attr {
-    name: "adjoint"
-    type: "bool"
+    name: "capacity"
+    type: "int"
     default_value {
-      b: false
+      i: 0
     }
+    has_minimum: true
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-      }
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
     }
+    has_minimum: true
   }
-}
-op {
-  name: "MatrixInverse"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+  attr {
+    name: "dtypes"
+    type: "list(type)"
   }
   attr {
-    name: "adjoint"
-    type: "bool"
+    name: "container"
+    type: "string"
     default_value {
-      b: false
+      s: ""
     }
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
+  is_stateful: true
 }
 op {
-  name: "MatrixLogarithm"
+  name: "MapDataset"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
-  }
-}
-op {
-  name: "MatrixSetDiag"
-  input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "f"
+    type: "func"
   }
-  input_arg {
-    name: "diagonal"
-    type_attr: "T"
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
   }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
+  is_stateful: true
 }
 op {
-  name: "MatrixSolve"
+  name: "MapDataset"
   input_arg {
-    name: "matrix"
-    type_attr: "T"
+    name: "input_dataset"
+    type: DT_VARIANT
   }
   input_arg {
-    name: "rhs"
-    type_attr: "T"
+    name: "other_arguments"
+    type_list_attr: "Targuments"
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "adjoint"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "f"
+    type: "func"
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
 }
 op {
-  name: "MatrixSolveLs"
-  input_arg {
-    name: "matrix"
-    type_attr: "T"
-  }
+  name: "MapDataset"
   input_arg {
-    name: "rhs"
-    type_attr: "T"
+    name: "input_dataset"
+    type: DT_VARIANT
   }
   input_arg {
-    name: "l2_regularizer"
-    type: DT_DOUBLE
+    name: "other_arguments"
+    type_list_attr: "Targuments"
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-      }
-    }
+    name: "f"
+    type: "func"
   }
   attr {
-    name: "fast"
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
     type: "bool"
     default_value {
       b: true
@@ -32530,646 +33975,678 @@ op {
   }
 }
 op {
-  name: "MatrixSolveLs"
-  input_arg {
-    name: "matrix"
-    type_attr: "T"
-  }
+  name: "MapDataset"
   input_arg {
-    name: "rhs"
-    type_attr: "T"
+    name: "input_dataset"
+    type: DT_VARIANT
   }
   input_arg {
-    name: "l2_regularizer"
-    type: DT_DOUBLE
+    name: "other_arguments"
+    type_list_attr: "Targuments"
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+    name: "f"
+    type: "func"
   }
   attr {
-    name: "fast"
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
     type: "bool"
     default_value {
       b: true
     }
   }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "MatrixSquareRoot"
+  name: "MapDefun"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "arguments"
+    type_list_attr: "Targuments"
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_list_attr: "output_types"
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "f"
+    type: "func"
   }
 }
 op {
-  name: "MatrixTriangularSolve"
+  name: "MapDefun"
   input_arg {
-    name: "matrix"
-    type_attr: "T"
+    name: "arguments"
+    type_list_attr: "Targuments"
   }
   input_arg {
-    name: "rhs"
-    type_attr: "T"
+    name: "captured_inputs"
+    type_list_attr: "Tcaptured"
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_list_attr: "output_types"
   }
   attr {
-    name: "lower"
-    type: "bool"
-    default_value {
-      b: true
-    }
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "adjoint"
-    type: "bool"
+    name: "Tcaptured"
+    type: "list(type)"
     default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
       list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
       }
     }
+    has_minimum: true
   }
-}
-op {
-  name: "MatrixTriangularSolve"
-  input_arg {
-    name: "matrix"
-    type_attr: "T"
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
-  input_arg {
-    name: "rhs"
-    type_attr: "T"
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "f"
+    type: "func"
   }
+}
+op {
+  name: "MapIncompleteSize"
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "size"
+    type: DT_INT32
   }
   attr {
-    name: "lower"
-    type: "bool"
+    name: "capacity"
+    type: "int"
     default_value {
-      b: true
+      i: 0
     }
+    has_minimum: true
   }
   attr {
-    name: "adjoint"
-    type: "bool"
+    name: "memory_limit"
+    type: "int"
     default_value {
-      b: false
+      i: 0
     }
+    has_minimum: true
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
+  is_stateful: true
 }
 op {
-  name: "Max"
+  name: "MapPeek"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "key"
+    type: DT_INT64
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "indices"
+    type: DT_INT32
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "values"
+    type_list_attr: "dtypes"
   }
   attr {
-    name: "keep_dims"
-    type: "bool"
+    name: "capacity"
+    type: "int"
     default_value {
-      b: false
+      i: 0
     }
+    has_minimum: true
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-      }
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
     }
+    has_minimum: true
   }
   attr {
-    name: "Tidx"
-    type: "type"
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
     default_value {
-      type: DT_INT32
+      s: ""
     }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
+  is_stateful: true
 }
 op {
-  name: "Max"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
-  }
+  name: "MapSize"
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "size"
+    type: DT_INT32
   }
   attr {
-    name: "keep_dims"
-    type: "bool"
+    name: "capacity"
+    type: "int"
     default_value {
-      b: false
+      i: 0
     }
+    has_minimum: true
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
     }
+    has_minimum: true
   }
   attr {
-    name: "Tidx"
-    type: "type"
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "container"
+    type: "string"
     default_value {
-      type: DT_INT32
+      s: ""
     }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
+  is_stateful: true
 }
 op {
-  name: "Max"
+  name: "MapStage"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "key"
+    type: DT_INT64
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "indices"
+    type: DT_INT32
   }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+  input_arg {
+    name: "values"
+    type_list_attr: "fake_dtypes"
   }
   attr {
-    name: "keep_dims"
-    type: "bool"
+    name: "capacity"
+    type: "int"
     default_value {
-      b: false
+      i: 0
     }
+    has_minimum: true
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-        type: DT_BFLOAT16
-      }
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
     }
+    has_minimum: true
   }
   attr {
-    name: "Tidx"
-    type: "type"
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "fake_dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
     default_value {
-      type: DT_INT32
+      s: ""
     }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
+  is_stateful: true
 }
 op {
-  name: "Max"
+  name: "MapUnstage"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "key"
+    type: DT_INT64
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "indices"
+    type: DT_INT32
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "values"
+    type_list_attr: "dtypes"
   }
   attr {
-    name: "keep_dims"
-    type: "bool"
+    name: "capacity"
+    type: "int"
     default_value {
-      b: false
+      i: 0
     }
+    has_minimum: true
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_INT64
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
     }
+    has_minimum: true
   }
   attr {
-    name: "Tidx"
-    type: "type"
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
     default_value {
-      type: DT_INT32
+      s: ""
     }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
+  is_stateful: true
 }
 op {
-  name: "MaxPool"
+  name: "MapUnstageNoKey"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "indices"
+    type: DT_INT32
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "key"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "dtypes"
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "capacity"
+    type: "int"
     default_value {
-      type: DT_FLOAT
-    }
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_HALF
-      }
+      i: 0
     }
+    has_minimum: true
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
     has_minimum: true
-    minimum: 4
   }
   attr {
-    name: "strides"
-    type: "list(int)"
+    name: "dtypes"
+    type: "list(type)"
     has_minimum: true
-    minimum: 4
+    minimum: 1
   }
   attr {
-    name: "padding"
+    name: "container"
     type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
+    default_value {
+      s: ""
     }
   }
   attr {
-    name: "data_format"
+    name: "shared_name"
     type: "string"
     default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
+      s: ""
     }
   }
+  is_stateful: true
 }
 op {
-  name: "MaxPool"
+  name: "MatMul"
   input_arg {
-    name: "input"
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "product"
     type_attr: "T"
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "transpose_a"
+    type: "bool"
     default_value {
-      type: DT_FLOAT
-    }
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-      }
+      b: false
     }
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
+    name: "transpose_b"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
   attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "MaxPool"
+  name: "MatMul"
   input_arg {
-    name: "input"
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "product"
     type_attr: "T"
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "transpose_a"
+    type: "bool"
     default_value {
-      type: DT_FLOAT
+      b: false
+    }
+  }
+  attr {
+    name: "transpose_b"
+    type: "bool"
+    default_value {
+      b: false
     }
+  }
+  attr {
+    name: "T"
+    type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_QINT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+}
+op {
+  name: "MatMul"
+  input_arg {
+    name: "a"
+    type_attr: "T"
   }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "product"
+    type_attr: "T"
   }
   attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
+    name: "transpose_a"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "transpose_b"
+    type: "bool"
     default_value {
-      s: "NHWC"
+      b: false
     }
+  }
+  attr {
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
-        s: "NCHW_VECT_C"
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "MaxPool"
+  name: "MatMul"
   input_arg {
-    name: "input"
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "product"
     type_attr: "T"
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "transpose_a"
+    type: "bool"
     default_value {
-      type: DT_FLOAT
+      b: false
+    }
+  }
+  attr {
+    name: "transpose_b"
+    type: "bool"
+    default_value {
+      b: false
     }
+  }
+  attr {
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_QINT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+}
+op {
+  name: "MatchingFiles"
+  input_arg {
+    name: "pattern"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "filenames"
+    type: DT_STRING
+  }
+}
+op {
+  name: "MatrixBandPart"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_lower"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_upper"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "band"
+    type_attr: "T"
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "MatrixBandPart"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_lower"
+    type_attr: "Tindex"
+  }
+  input_arg {
+    name: "num_upper"
+    type_attr: "Tindex"
+  }
+  output_arg {
+    name: "band"
+    type_attr: "T"
   }
   attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+    name: "T"
+    type: "type"
   }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "Tindex"
+    type: "type"
     default_value {
-      s: "NHWC"
+      type: DT_INT64
     }
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
-        s: "NCHW_VECT_C"
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "MaxPool3D"
+  name: "MatrixDeterminant"
   input_arg {
     name: "input"
     type_attr: "T"
@@ -33178,40 +34655,19 @@ op {
     name: "output"
     type_attr: "T"
   }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
         type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
 }
 op {
-  name: "MaxPool3D"
+  name: "MatrixDeterminant"
   input_arg {
     name: "input"
     type_attr: "T"
@@ -33221,54 +34677,46 @@ op {
     type_attr: "T"
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NDHWC"
-    }
-    allowed_values {
-      list {
-        s: "NDHWC"
-        s: "NCDHW"
-      }
-    }
+}
+op {
+  name: "MatrixDeterminant"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "MaxPool3D"
+  name: "MatrixDiag"
   input_arg {
-    name: "input"
+    name: "diagonal"
     type_attr: "T"
   }
   output_arg {
@@ -33276,53 +34724,103 @@ op {
     type_attr: "T"
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "MatrixDiagPart"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "diagonal"
+    type_attr: "T"
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "MatrixExponential"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
+}
+op {
+  name: "MatrixExponential"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NDHWC"
-    }
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "NDHWC"
-        s: "NCDHW"
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
+  deprecation {
+    version: 27
+  }
+}
+op {
+  name: "MatrixExponential"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_BFLOAT16
+        type: DT_DOUBLE
         type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
+  deprecation {
+    version: 27
+  }
 }
 op {
-  name: "MaxPool3D"
+  name: "MatrixInverse"
   input_arg {
     name: "input"
     type_attr: "T"
@@ -33332,38 +34830,38 @@ op {
     type_attr: "T"
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+    name: "adjoint"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_DOUBLE
+        type: DT_FLOAT
       }
     }
   }
+}
+op {
+  name: "MatrixInverse"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "adjoint"
+    type: "bool"
     default_value {
-      s: "NDHWC"
-    }
-    allowed_values {
-      list {
-        s: "NDHWC"
-        s: "NCDHW"
-      }
+      b: false
     }
   }
   attr {
@@ -33371,25 +34869,18 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_BFLOAT16
+        type: DT_DOUBLE
         type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "MaxPool3DGrad"
-  input_arg {
-    name: "orig_input"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "orig_output"
-    type: DT_FLOAT
-  }
+  name: "MatrixInverse"
   input_arg {
-    name: "grad"
+    name: "input"
     type_attr: "T"
   }
   output_arg {
@@ -33397,49 +34888,55 @@ op {
     type_attr: "T"
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+    name: "adjoint"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
+}
+op {
+  name: "MatrixLogarithm"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "MaxPool3DGrad"
-  input_arg {
-    name: "orig_input"
-    type: DT_FLOAT
-  }
+  name: "MatrixSetDiag"
   input_arg {
-    name: "orig_output"
-    type: DT_FLOAT
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "diagonal"
     type_attr: "T"
   }
   output_arg {
@@ -33447,38 +34944,29 @@ op {
     type_attr: "T"
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+    name: "T"
+    type: "type"
   }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+}
+op {
+  name: "MatrixSolve"
+  input_arg {
+    name: "matrix"
+    type_attr: "T"
   }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
+  input_arg {
+    name: "rhs"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "adjoint"
+    type: "bool"
     default_value {
-      s: "NDHWC"
-    }
-    allowed_values {
-      list {
-        s: "NDHWC"
-        s: "NCDHW"
-      }
+      b: false
     }
   }
   attr {
@@ -33486,23 +34974,22 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_DOUBLE
         type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "MaxPool3DGrad"
-  input_arg {
-    name: "orig_input"
-    type_attr: "TInput"
-  }
+  name: "MatrixSolve"
   input_arg {
-    name: "orig_output"
-    type_attr: "TInput"
+    name: "matrix"
+    type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "rhs"
     type_attr: "T"
   }
   output_arg {
@@ -33510,157 +34997,194 @@ op {
     type_attr: "T"
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
+    name: "adjoint"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
   attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NDHWC"
-    }
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "NDHWC"
-        s: "NCDHW"
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
+}
+op {
+  name: "MatrixSolveLs"
+  input_arg {
+    name: "matrix"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rhs"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_regularizer"
+    type: DT_DOUBLE
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
+        type: DT_DOUBLE
         type: DT_FLOAT
       }
     }
   }
   attr {
-    name: "TInput"
-    type: "type"
+    name: "fast"
+    type: "bool"
     default_value {
-      type: DT_FLOAT
-    }
-    allowed_values {
-      list {
-        type: DT_FLOAT
-      }
+      b: true
     }
   }
 }
 op {
-  name: "MaxPool3DGrad"
+  name: "MatrixSolveLs"
   input_arg {
-    name: "orig_input"
-    type_attr: "TInput"
+    name: "matrix"
+    type_attr: "T"
   }
   input_arg {
-    name: "orig_output"
-    type_attr: "TInput"
+    name: "rhs"
+    type_attr: "T"
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "l2_regularizer"
+    type: DT_DOUBLE
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "fast"
+    type: "bool"
     default_value {
-      s: "NDHWC"
+      b: true
     }
+  }
+}
+op {
+  name: "MatrixSolveLs"
+  input_arg {
+    name: "matrix"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rhs"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_regularizer"
+    type: DT_DOUBLE
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "NDHWC"
-        s: "NCDHW"
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "fast"
+    type: "bool"
     default_value {
-      type: DT_FLOAT
+      b: true
     }
+  }
+}
+op {
+  name: "MatrixSquareRoot"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        type: DT_BFLOAT16
+        type: DT_DOUBLE
         type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
+}
+op {
+  name: "MatrixSquareRoot"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
-    name: "TInput"
+    name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
-        type: DT_BFLOAT16
+        type: DT_DOUBLE
         type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "MaxPool3DGrad"
-  input_arg {
-    name: "orig_input"
-    type_attr: "TInput"
-  }
+  name: "MatrixTriangularSolve"
   input_arg {
-    name: "orig_output"
-    type_attr: "TInput"
+    name: "matrix"
+    type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "rhs"
     type_attr: "T"
   }
   output_arg {
@@ -33668,81 +35192,79 @@ op {
     type_attr: "T"
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+    name: "lower"
+    type: "bool"
+    default_value {
+      b: true
+    }
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+    name: "adjoint"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_DOUBLE
+        type: DT_FLOAT
       }
     }
   }
+}
+op {
+  name: "MatrixTriangularSolve"
+  input_arg {
+    name: "matrix"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rhs"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "lower"
+    type: "bool"
     default_value {
-      s: "NDHWC"
-    }
-    allowed_values {
-      list {
-        s: "NDHWC"
-        s: "NCDHW"
-      }
+      b: true
     }
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "adjoint"
+    type: "bool"
     default_value {
-      type: DT_FLOAT
-    }
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-      }
+      b: false
     }
   }
   attr {
-    name: "TInput"
+    name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_BFLOAT16
+        type: DT_DOUBLE
         type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "MaxPool3DGradGrad"
-  input_arg {
-    name: "orig_input"
-    type_attr: "T"
-  }
+  name: "MatrixTriangularSolve"
   input_arg {
-    name: "orig_output"
+    name: "matrix"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "rhs"
     type_attr: "T"
   }
   output_arg {
@@ -33750,103 +35272,170 @@ op {
     type_attr: "T"
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+    name: "lower"
+    type: "bool"
+    default_value {
+      b: true
+    }
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+    name: "adjoint"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
+}
+op {
+  name: "Max"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "keep_dims"
+    type: "bool"
     default_value {
-      s: "NDHWC"
+      b: false
     }
+  }
+  attr {
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "NDHWC"
-        s: "NCDHW"
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
       }
     }
   }
   attr {
-    name: "T"
+    name: "Tidx"
     type: "type"
+    default_value {
+      type: DT_INT32
+    }
     allowed_values {
       list {
-        type: DT_FLOAT
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "MaxPool3DGradGrad"
-  input_arg {
-    name: "orig_input"
-    type_attr: "T"
-  }
+  name: "Max"
   input_arg {
-    name: "orig_output"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "reduction_indices"
+    type_attr: "Tidx"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 5
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "Tidx"
+    type: "type"
     default_value {
-      s: "NDHWC"
+      type: DT_INT32
     }
     allowed_values {
       list {
-        s: "NDHWC"
-        s: "NCDHW"
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
+}
+op {
+  name: "Max"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   attr {
     name: "T"
     type: "type"
@@ -33854,73 +35443,108 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_INT64
-        type: DT_BFLOAT16
-        type: DT_UINT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "MaxPoolGrad"
-  input_arg {
-    name: "orig_input"
-    type_attr: "T"
-  }
+  name: "Max"
   input_arg {
-    name: "orig_output"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "reduction_indices"
+    type_attr: "Tidx"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
-  }
-  attr {
-    name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
   attr {
-    name: "data_format"
-    type: "string"
+    name: "Tidx"
+    type: "type"
     default_value {
-      s: "NHWC"
+      type: DT_INT32
     }
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
+}
+op {
+  name: "MaxPool"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
     name: "T"
     type: "type"
@@ -33934,25 +35558,6 @@ op {
       }
     }
   }
-}
-op {
-  name: "MaxPoolGrad"
-  input_arg {
-    name: "orig_input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "orig_output"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
   attr {
     name: "ksize"
     type: "list(int)"
@@ -33988,6 +35593,17 @@ op {
       }
     }
   }
+}
+op {
+  name: "MaxPool"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
     name: "T"
     type: "type"
@@ -34008,25 +35624,6 @@ op {
       }
     }
   }
-}
-op {
-  name: "MaxPoolGrad"
-  input_arg {
-    name: "orig_input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "orig_output"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
   attr {
     name: "ksize"
     type: "list(int)"
@@ -34062,6 +35659,17 @@ op {
       }
     }
   }
+}
+op {
+  name: "MaxPool"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
     name: "T"
     type: "type"
@@ -34079,30 +35687,10 @@ op {
         type: DT_INT8
         type: DT_UINT16
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_QINT8
       }
     }
   }
-}
-op {
-  name: "MaxPoolGrad"
-  input_arg {
-    name: "orig_input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "orig_output"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
   attr {
     name: "ksize"
     type: "list(int)"
@@ -34135,9 +35723,21 @@ op {
       list {
         s: "NHWC"
         s: "NCHW"
+        s: "NCHW_VECT_C"
       }
     }
   }
+}
+op {
+  name: "MaxPool"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
     name: "T"
     type: "type"
@@ -34146,6 +35746,8 @@ op {
     }
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -34154,32 +35756,10 @@ op {
         type: DT_INT16
         type: DT_INT8
         type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-        type: DT_BFLOAT16
+        type: DT_QINT8
       }
     }
   }
-}
-op {
-  name: "MaxPoolGrad"
-  input_arg {
-    name: "orig_input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "orig_output"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
   attr {
     name: "ksize"
     type: "list(int)"
@@ -34212,45 +35792,57 @@ op {
       list {
         s: "NHWC"
         s: "NCHW"
+        s: "NCHW_VECT_C"
+      }
+    }
+  }
+}
+op {
+  name: "MaxPool3D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_INT64
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "MaxPoolGradGrad"
-  input_arg {
-    name: "orig_input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "orig_output"
-    type_attr: "T"
-  }
+  name: "MaxPool3D"
   input_arg {
-    name: "grad"
+    name: "input"
     type_attr: "T"
   }
   output_arg {
@@ -34261,13 +35853,13 @@ op {
     name: "ksize"
     type: "list(int)"
     has_minimum: true
-    minimum: 4
+    minimum: 5
   }
   attr {
     name: "strides"
     type: "list(int)"
     has_minimum: true
-    minimum: 4
+    minimum: 5
   }
   attr {
     name: "padding"
@@ -34283,12 +35875,12 @@ op {
     name: "data_format"
     type: "string"
     default_value {
-      s: "NHWC"
+      s: "NDHWC"
     }
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
+        s: "NDHWC"
+        s: "NCDHW"
       }
     }
   }
@@ -34298,30 +35890,14 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "MaxPoolGradGrad"
-  input_arg {
-    name: "orig_input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "orig_output"
-    type_attr: "T"
-  }
+  name: "MaxPool3D"
   input_arg {
-    name: "grad"
+    name: "input"
     type_attr: "T"
   }
   output_arg {
@@ -34332,13 +35908,13 @@ op {
     name: "ksize"
     type: "list(int)"
     has_minimum: true
-    minimum: 4
+    minimum: 5
   }
   attr {
     name: "strides"
     type: "list(int)"
     has_minimum: true
-    minimum: 4
+    minimum: 5
   }
   attr {
     name: "padding"
@@ -34354,12 +35930,12 @@ op {
     name: "data_format"
     type: "string"
     default_value {
-      s: "NHWC"
+      s: "NDHWC"
     }
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
+        s: "NDHWC"
+        s: "NCDHW"
       }
     }
   }
@@ -34368,33 +35944,16 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "MaxPoolGradGrad"
-  input_arg {
-    name: "orig_input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "orig_output"
-    type_attr: "T"
-  }
+  name: "MaxPool3D"
   input_arg {
-    name: "grad"
+    name: "input"
     type_attr: "T"
   }
   output_arg {
@@ -34405,13 +35964,13 @@ op {
     name: "ksize"
     type: "list(int)"
     has_minimum: true
-    minimum: 4
+    minimum: 5
   }
   attr {
     name: "strides"
     type: "list(int)"
     has_minimum: true
-    minimum: 4
+    minimum: 5
   }
   attr {
     name: "padding"
@@ -34427,12 +35986,12 @@ op {
     name: "data_format"
     type: "string"
     default_value {
-      s: "NHWC"
+      s: "NDHWC"
     }
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
+        s: "NDHWC"
+        s: "NCDHW"
       }
     }
   }
@@ -34441,31 +36000,22 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
         type: DT_BFLOAT16
+        type: DT_FLOAT
       }
     }
   }
 }
 op {
-  name: "MaxPoolGradGrad"
+  name: "MaxPool3DGrad"
   input_arg {
     name: "orig_input"
-    type_attr: "T"
+    type: DT_FLOAT
   }
   input_arg {
     name: "orig_output"
-    type_attr: "T"
+    type: DT_FLOAT
   }
   input_arg {
     name: "grad"
@@ -34479,13 +36029,13 @@ op {
     name: "ksize"
     type: "list(int)"
     has_minimum: true
-    minimum: 4
+    minimum: 5
   }
   attr {
     name: "strides"
     type: "list(int)"
     has_minimum: true
-    minimum: 4
+    minimum: 5
   }
   attr {
     name: "padding"
@@ -34497,66 +36047,46 @@ op {
       }
     }
   }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-      }
-    }
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_INT64
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "MaxPoolGradGradV2"
+  name: "MaxPool3DGrad"
   input_arg {
     name: "orig_input"
-    type_attr: "T"
+    type: DT_FLOAT
   }
   input_arg {
     name: "orig_output"
-    type_attr: "T"
+    type: DT_FLOAT
   }
   input_arg {
     name: "grad"
     type_attr: "T"
   }
-  input_arg {
-    name: "ksize"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "strides"
-    type: DT_INT32
-  }
   output_arg {
     name: "output"
     type_attr: "T"
   }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
   attr {
     name: "padding"
     type: "string"
@@ -34571,12 +36101,12 @@ op {
     name: "data_format"
     type: "string"
     default_value {
-      s: "NHWC"
+      s: "NDHWC"
     }
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
+        s: "NDHWC"
+        s: "NCDHW"
       }
     }
   }
@@ -34586,44 +36116,40 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "MaxPoolGradGradV2"
+  name: "MaxPool3DGrad"
   input_arg {
     name: "orig_input"
-    type_attr: "T"
+    type_attr: "TInput"
   }
   input_arg {
     name: "orig_output"
-    type_attr: "T"
+    type_attr: "TInput"
   }
   input_arg {
     name: "grad"
     type_attr: "T"
   }
-  input_arg {
-    name: "ksize"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "strides"
-    type: DT_INT32
-  }
   output_arg {
     name: "output"
     type_attr: "T"
   }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
   attr {
     name: "padding"
     type: "string"
@@ -34638,61 +36164,70 @@ op {
     name: "data_format"
     type: "string"
     default_value {
-      s: "NHWC"
+      s: "NDHWC"
     }
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
+        s: "NDHWC"
+        s: "NCDHW"
       }
     }
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "TInput"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "MaxPoolGradGradV2"
+  name: "MaxPool3DGrad"
   input_arg {
     name: "orig_input"
-    type_attr: "T"
+    type_attr: "TInput"
   }
   input_arg {
     name: "orig_output"
-    type_attr: "T"
+    type_attr: "TInput"
   }
   input_arg {
     name: "grad"
     type_attr: "T"
   }
-  input_arg {
-    name: "ksize"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "strides"
-    type: DT_INT32
-  }
   output_arg {
     name: "output"
     type_attr: "T"
   }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
   attr {
     name: "padding"
     type: "string"
@@ -34707,62 +36242,72 @@ op {
     name: "data_format"
     type: "string"
     default_value {
-      s: "NHWC"
+      s: "NDHWC"
     }
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
+        s: "NDHWC"
+        s: "NCDHW"
       }
     }
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "TInput"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
         type: DT_BFLOAT16
+        type: DT_FLOAT
       }
     }
   }
 }
 op {
-  name: "MaxPoolGradGradV2"
+  name: "MaxPool3DGrad"
   input_arg {
     name: "orig_input"
-    type_attr: "T"
+    type_attr: "TInput"
   }
   input_arg {
     name: "orig_output"
-    type_attr: "T"
+    type_attr: "TInput"
   }
   input_arg {
     name: "grad"
     type_attr: "T"
   }
-  input_arg {
-    name: "ksize"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "strides"
-    type: DT_INT32
-  }
   output_arg {
     name: "output"
     type_attr: "T"
   }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
   attr {
     name: "padding"
     type: "string"
@@ -34777,49 +36322,57 @@ op {
     name: "data_format"
     type: "string"
     default_value {
-      s: "NHWC"
+      s: "NDHWC"
     }
     allowed_values {
       list {
-        s: "NHWC"
-        s: "NCHW"
+        s: "NDHWC"
+        s: "NCDHW"
       }
     }
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_INT64
+        type: DT_HALF
         type: DT_BFLOAT16
-        type: DT_UINT16
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "TInput"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_BFLOAT16
+        type: DT_FLOAT
       }
     }
   }
 }
 op {
-  name: "MaxPoolGradGradWithArgmax"
+  name: "MaxPool3DGradGrad"
   input_arg {
-    name: "input"
+    name: "orig_input"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "orig_output"
     type_attr: "T"
   }
   input_arg {
-    name: "argmax"
-    type_attr: "Targmax"
+    name: "grad"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
@@ -34829,13 +36382,13 @@ op {
     name: "ksize"
     type: "list(int)"
     has_minimum: true
-    minimum: 4
+    minimum: 5
   }
   attr {
     name: "strides"
     type: "list(int)"
     has_minimum: true
-    minimum: 4
+    minimum: 5
   }
   attr {
     name: "padding"
@@ -34848,12 +36401,15 @@ op {
     }
   }
   attr {
-    name: "Targmax"
-    type: "type"
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "NDHWC"
+        s: "NCDHW"
       }
     }
   }
@@ -34863,31 +36419,23 @@ op {
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "MaxPoolGradGradWithArgmax"
+  name: "MaxPool3DGradGrad"
   input_arg {
-    name: "input"
+    name: "orig_input"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "orig_output"
     type_attr: "T"
   }
   input_arg {
-    name: "argmax"
-    type_attr: "Targmax"
+    name: "grad"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
@@ -34897,13 +36445,13 @@ op {
     name: "ksize"
     type: "list(int)"
     has_minimum: true
-    minimum: 4
+    minimum: 5
   }
   attr {
     name: "strides"
     type: "list(int)"
     has_minimum: true
-    minimum: 4
+    minimum: 5
   }
   attr {
     name: "padding"
@@ -34916,12 +36464,15 @@ op {
     }
   }
   attr {
-    name: "Targmax"
-    type: "type"
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "NDHWC"
+        s: "NCDHW"
       }
     }
   }
@@ -34933,10 +36484,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -34946,18 +36498,18 @@ op {
   }
 }
 op {
-  name: "MaxPoolGradGradWithArgmax"
+  name: "MaxPoolGrad"
   input_arg {
-    name: "input"
+    name: "orig_input"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "orig_output"
     type_attr: "T"
   }
   input_arg {
-    name: "argmax"
-    type_attr: "Targmax"
+    name: "grad"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
@@ -34986,49 +36538,45 @@ op {
     }
   }
   attr {
-    name: "Targmax"
-    type: "type"
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "MaxPoolGradGradWithArgmax"
+  name: "MaxPoolGrad"
   input_arg {
-    name: "input"
+    name: "orig_input"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "orig_output"
     type_attr: "T"
   }
   input_arg {
-    name: "argmax"
-    type_attr: "Targmax"
+    name: "grad"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
@@ -35056,73 +36604,6 @@ op {
       }
     }
   }
-  attr {
-    name: "Targmax"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_INT64
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
-  }
-}
-op {
-  name: "MaxPoolGradV2"
-  input_arg {
-    name: "orig_input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "orig_output"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "ksize"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "strides"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
   attr {
     name: "data_format"
     type: "string"
@@ -35158,7 +36639,7 @@ op {
   }
 }
 op {
-  name: "MaxPoolGradV2"
+  name: "MaxPoolGrad"
   input_arg {
     name: "orig_input"
     type_attr: "T"
@@ -35171,18 +36652,22 @@ op {
     name: "grad"
     type_attr: "T"
   }
-  input_arg {
-    name: "ksize"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "strides"
-    type: DT_INT32
-  }
   output_arg {
     name: "output"
     type_attr: "T"
   }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
   attr {
     name: "padding"
     type: "string"
@@ -35230,7 +36715,7 @@ op {
   }
 }
 op {
-  name: "MaxPoolGradV2"
+  name: "MaxPoolGrad"
   input_arg {
     name: "orig_input"
     type_attr: "T"
@@ -35243,18 +36728,22 @@ op {
     name: "grad"
     type_attr: "T"
   }
-  input_arg {
-    name: "ksize"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "strides"
-    type: DT_INT32
-  }
   output_arg {
     name: "output"
     type_attr: "T"
   }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
   attr {
     name: "padding"
     type: "string"
@@ -35303,7 +36792,7 @@ op {
   }
 }
 op {
-  name: "MaxPoolGradV2"
+  name: "MaxPoolGrad"
   input_arg {
     name: "orig_input"
     type_attr: "T"
@@ -35316,18 +36805,22 @@ op {
     name: "grad"
     type_attr: "T"
   }
-  input_arg {
-    name: "ksize"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "strides"
-    type: DT_INT32
-  }
   output_arg {
     name: "output"
     type_attr: "T"
   }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
   attr {
     name: "padding"
     type: "string"
@@ -35376,18 +36869,18 @@ op {
   }
 }
 op {
-  name: "MaxPoolGradWithArgmax"
+  name: "MaxPoolGradGrad"
   input_arg {
-    name: "input"
+    name: "orig_input"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "orig_output"
     type_attr: "T"
   }
   input_arg {
-    name: "argmax"
-    type_attr: "Targmax"
+    name: "grad"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
@@ -35416,42 +36909,49 @@ op {
     }
   }
   attr {
-    name: "Targmax"
-    type: "type"
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
         type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
         type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "MaxPoolGradWithArgmax"
+  name: "MaxPoolGradGrad"
   input_arg {
-    name: "input"
+    name: "orig_input"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "orig_output"
     type_attr: "T"
   }
   input_arg {
-    name: "argmax"
-    type_attr: "Targmax"
+    name: "grad"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
@@ -35480,12 +36980,15 @@ op {
     }
   }
   attr {
-    name: "Targmax"
-    type: "type"
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
@@ -35503,23 +37006,25 @@ op {
         type: DT_INT8
         type: DT_UINT16
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "MaxPoolGradWithArgmax"
+  name: "MaxPoolGradGrad"
   input_arg {
-    name: "input"
+    name: "orig_input"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "orig_output"
     type_attr: "T"
   }
   input_arg {
-    name: "argmax"
-    type_attr: "Targmax"
+    name: "grad"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
@@ -35548,12 +37053,15 @@ op {
     }
   }
   attr {
-    name: "Targmax"
-    type: "type"
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
@@ -35573,23 +37081,24 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "MaxPoolGradWithArgmax"
+  name: "MaxPoolGradGrad"
   input_arg {
-    name: "input"
+    name: "orig_input"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "orig_output"
     type_attr: "T"
   }
   input_arg {
-    name: "argmax"
-    type_attr: "Targmax"
+    name: "grad"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
@@ -35618,12 +37127,15 @@ op {
     }
   }
   attr {
-    name: "Targmax"
-    type: "type"
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
@@ -35635,48 +37147,44 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "MaxPoolGradWithArgmax"
+  name: "MaxPoolGradGradV2"
   input_arg {
-    name: "input"
+    name: "orig_input"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "orig_output"
     type_attr: "T"
   }
   input_arg {
-    name: "argmax"
-    type_attr: "Targmax"
-  }
-  output_arg {
-    name: "output"
+    name: "grad"
     type_attr: "T"
   }
-  attr {
+  input_arg {
     name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+    type: DT_INT32
   }
-  attr {
+  input_arg {
     name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
     name: "padding"
@@ -35689,12 +37197,15 @@ op {
     }
   }
   attr {
-    name: "Targmax"
-    type: "type"
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
@@ -35706,23 +37217,28 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
-        type: DT_INT64
-        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "MaxPoolV2"
+  name: "MaxPoolGradGradV2"
   input_arg {
-    name: "input"
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
@@ -35737,26 +37253,6 @@ op {
     name: "output"
     type_attr: "T"
   }
-  attr {
-    name: "T"
-    type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-      }
-    }
-  }
   attr {
     name: "padding"
     type: "string"
@@ -35780,31 +37276,9 @@ op {
       }
     }
   }
-}
-op {
-  name: "MaxPoolV2"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "ksize"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "strides"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
         type: DT_FLOAT
@@ -35816,39 +37290,24 @@ op {
         type: DT_INT8
         type: DT_UINT16
         type: DT_HALF
-        type: DT_QINT8
-      }
-    }
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NHWC"
-    }
-    allowed_values {
-      list {
-        s: "NHWC"
-        s: "NCHW"
-        s: "NCHW_VECT_C"
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "MaxPoolV2"
+  name: "MaxPoolGradGradV2"
   input_arg {
-    name: "input"
+    name: "orig_input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
@@ -35863,28 +37322,6 @@ op {
     name: "output"
     type_attr: "T"
   }
-  attr {
-    name: "T"
-    type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_QINT8
-      }
-    }
-  }
   attr {
     name: "padding"
     type: "string"
@@ -35905,88 +37342,118 @@ op {
       list {
         s: "NHWC"
         s: "NCHW"
-        s: "NCHW_VECT_C"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "MaxPoolWithArgmax"
+  name: "MaxPoolGradGradV2"
   input_arg {
-    name: "input"
+    name: "orig_input"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "orig_output"
     type_attr: "T"
   }
-  output_arg {
-    name: "argmax"
-    type_attr: "Targmax"
+  input_arg {
+    name: "grad"
+    type_attr: "T"
   }
-  attr {
+  input_arg {
     name: "ksize"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+    type: DT_INT32
   }
-  attr {
+  input_arg {
     name: "strides"
-    type: "list(int)"
-    has_minimum: true
-    minimum: 4
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "Targmax"
-    type: "type"
-    default_value {
-      type: DT_INT64
-    }
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "padding"
+    name: "data_format"
     type: "string"
+    default_value {
+      s: "NHWC"
+    }
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
         type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "MaxPoolWithArgmax"
+  name: "MaxPoolGradGradWithArgmax"
   input_arg {
     name: "input"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
-  output_arg {
+  input_arg {
     name: "argmax"
     type_attr: "Targmax"
   }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
     name: "ksize"
     type: "list(int)"
@@ -36000,25 +37467,22 @@ op {
     minimum: 4
   }
   attr {
-    name: "Targmax"
-    type: "type"
-    default_value {
-      type: DT_INT64
-    }
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "Targmax"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
@@ -36041,19 +37505,23 @@ op {
   }
 }
 op {
-  name: "MaxPoolWithArgmax"
+  name: "MaxPoolGradGradWithArgmax"
   input_arg {
     name: "input"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
-  output_arg {
+  input_arg {
     name: "argmax"
     type_attr: "Targmax"
   }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
     name: "ksize"
     type: "list(int)"
@@ -36067,25 +37535,22 @@ op {
     minimum: 4
   }
   attr {
-    name: "Targmax"
-    type: "type"
-    default_value {
-      type: DT_INT64
-    }
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "Targmax"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
@@ -36110,19 +37575,23 @@ op {
   }
 }
 op {
-  name: "MaxPoolWithArgmax"
+  name: "MaxPoolGradGradWithArgmax"
   input_arg {
     name: "input"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
-  output_arg {
+  input_arg {
     name: "argmax"
     type_attr: "Targmax"
   }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
     name: "ksize"
     type: "list(int)"
@@ -36136,25 +37605,22 @@ op {
     minimum: 4
   }
   attr {
-    name: "Targmax"
-    type: "type"
-    default_value {
-      type: DT_INT64
-    }
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "Targmax"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
@@ -36180,19 +37646,23 @@ op {
   }
 }
 op {
-  name: "MaxPoolWithArgmax"
+  name: "MaxPoolGradGradWithArgmax"
   input_arg {
     name: "input"
     type_attr: "T"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
-  output_arg {
+  input_arg {
     name: "argmax"
     type_attr: "Targmax"
   }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
     name: "ksize"
     type: "list(int)"
@@ -36206,25 +37676,22 @@ op {
     minimum: 4
   }
   attr {
-    name: "Targmax"
-    type: "type"
-    default_value {
-      type: DT_INT64
-    }
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "padding"
-    type: "string"
+    name: "Targmax"
+    type: "type"
     allowed_values {
       list {
-        s: "SAME"
-        s: "VALID"
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
@@ -36250,249 +37717,212 @@ op {
   }
 }
 op {
-  name: "Maximum"
+  name: "MaxPoolGradV2"
   input_arg {
-    name: "x"
+    name: "orig_input"
     type_attr: "T"
   }
   input_arg {
-    name: "y"
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
+  input_arg {
+    name: "ksize"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "strides"
+    type: DT_INT32
+  }
   output_arg {
-    name: "z"
+    name: "output"
     type_attr: "T"
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
-  is_commutative: true
-}
-op {
-  name: "Maximum"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "y"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "z"
-    type_attr: "T"
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
       }
     }
   }
-  is_commutative: true
 }
 op {
-  name: "Maximum"
+  name: "MaxPoolGradV2"
   input_arg {
-    name: "x"
+    name: "orig_input"
     type_attr: "T"
   }
   input_arg {
-    name: "y"
+    name: "orig_output"
     type_attr: "T"
   }
-  output_arg {
-    name: "z"
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_BFLOAT16
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  is_commutative: true
-}
-op {
-  name: "Mean"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "ksize"
+    type: DT_INT32
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "strides"
+    type: DT_INT32
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "keep_dims"
-    type: "bool"
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
     default_value {
-      b: false
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
     }
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
+        type: DT_UINT16
         type: DT_HALF
-      }
-    }
-  }
-  attr {
-    name: "Tidx"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "Mean"
+  name: "MaxPoolGradV2"
   input_arg {
-    name: "input"
+    name: "orig_input"
     type_attr: "T"
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "ksize"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "strides"
+    type: DT_INT32
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "T"
-    type: "type"
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "Tidx"
-    type: "type"
+    name: "data_format"
+    type: "string"
     default_value {
-      type: DT_INT32
+      s: "NHWC"
     }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
-}
-op {
-  name: "Mean"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
+        type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -36500,44 +37930,62 @@ op {
       }
     }
   }
-  attr {
-    name: "Tidx"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
 }
 op {
-  name: "Mean"
+  name: "MaxPoolGradV2"
   input_arg {
-    name: "input"
+    name: "orig_input"
     type_attr: "T"
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "orig_output"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "ksize"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "strides"
+    type: DT_INT32
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "keep_dims"
-    type: "bool"
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
     default_value {
-      b: false
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
     }
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
         type: DT_FLOAT
@@ -36546,293 +37994,266 @@ op {
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
-        type: DT_COMPLEX64
         type: DT_INT64
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
         type: DT_BFLOAT16
         type: DT_UINT16
-        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
       }
     }
   }
-  attr {
-    name: "Tidx"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
 }
 op {
-  name: "Merge"
+  name: "MaxPoolGradWithArgmax"
   input_arg {
-    name: "inputs"
+    name: "input"
     type_attr: "T"
-    number_attr: "N"
   }
-  output_arg {
-    name: "output"
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
-  output_arg {
-    name: "value_index"
-    type: DT_INT32
-  }
-  attr {
-    name: "T"
-    type: "type"
-  }
-  attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
-  }
-}
-op {
-  name: "MergeSummary"
   input_arg {
-    name: "inputs"
-    type: DT_STRING
-    number_attr: "N"
+    name: "argmax"
+    type_attr: "Targmax"
   }
   output_arg {
-    name: "summary"
-    type: DT_STRING
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "N"
-    type: "int"
+    name: "ksize"
+    type: "list(int)"
     has_minimum: true
-    minimum: 1
-  }
-}
-op {
-  name: "MergeV2Checkpoints"
-  input_arg {
-    name: "checkpoint_prefixes"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "destination_prefix"
-    type: DT_STRING
-  }
-  attr {
-    name: "delete_old_dirs"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
-}
-op {
-  name: "MergeV2Checkpoints"
-  input_arg {
-    name: "checkpoint_prefixes"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "destination_prefix"
-    type: DT_STRING
+    minimum: 4
   }
   attr {
-    name: "delete_old_dirs"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
-  is_stateful: true
-}
-op {
-  name: "Mfcc"
-  input_arg {
-    name: "spectrogram"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "sample_rate"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "output"
-    type: DT_FLOAT
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "upper_frequency_limit"
-    type: "float"
-    default_value {
-      f: 4000
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "lower_frequency_limit"
-    type: "float"
-    default_value {
-      f: 20
+    name: "Targmax"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
   attr {
-    name: "filterbank_channel_count"
-    type: "int"
+    name: "T"
+    type: "type"
     default_value {
-      i: 40
+      type: DT_FLOAT
     }
-  }
-  attr {
-    name: "dct_coefficient_count"
-    type: "int"
-    default_value {
-      i: 13
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+      }
     }
   }
 }
 op {
-  name: "Min"
+  name: "MaxPoolGradWithArgmax"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "argmax"
+    type_attr: "Targmax"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "T"
+    name: "Targmax"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
+        type: DT_INT64
       }
     }
   }
   attr {
-    name: "Tidx"
+    name: "T"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "Min"
+  name: "MaxPoolGradWithArgmax"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "argmax"
+    type_attr: "Targmax"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "T"
+    name: "Targmax"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_INT64
       }
     }
   }
   attr {
-    name: "Tidx"
+    name: "T"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "Min"
+  name: "MaxPoolGradWithArgmax"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "argmax"
+    type_attr: "Targmax"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
   attr {
@@ -36842,17 +38263,12 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
+        type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -36860,39 +38276,55 @@ op {
       }
     }
   }
-  attr {
-    name: "Tidx"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
 }
 op {
-  name: "Min"
+  name: "MaxPoolGradWithArgmax"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "argmax"
+    type_attr: "Targmax"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
   attr {
@@ -36906,146 +38338,238 @@ op {
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
-        type: DT_COMPLEX64
         type: DT_INT64
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
         type: DT_BFLOAT16
         type: DT_UINT16
-        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
       }
     }
   }
-  attr {
-    name: "Tidx"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
 }
 op {
-  name: "Minimum"
+  name: "MaxPoolV2"
   input_arg {
-    name: "x"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "y"
-    type_attr: "T"
+    name: "ksize"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "strides"
+    type: DT_INT32
   }
   output_arg {
-    name: "z"
+    name: "output"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
       }
     }
   }
-  is_commutative: true
 }
 op {
-  name: "Minimum"
+  name: "MaxPoolV2"
   input_arg {
-    name: "x"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "y"
-    type_attr: "T"
+    name: "ksize"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "strides"
+    type: DT_INT32
   }
   output_arg {
-    name: "z"
+    name: "output"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_QINT8
+      }
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+        s: "NCHW_VECT_C"
       }
     }
   }
-  is_commutative: true
 }
 op {
-  name: "Minimum"
+  name: "MaxPoolV2"
   input_arg {
-    name: "x"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "y"
-    type_attr: "T"
+    name: "ksize"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "strides"
+    type: DT_INT32
   }
   output_arg {
-    name: "z"
+    name: "output"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
-        type: DT_BFLOAT16
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_QINT8
+      }
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+        s: "NCHW_VECT_C"
       }
     }
   }
-  is_commutative: true
 }
 op {
-  name: "MirrorPad"
+  name: "MaxPoolWithArgmax"
   input_arg {
     name: "input"
     type_attr: "T"
   }
-  input_arg {
-    name: "paddings"
-    type_attr: "Tpaddings"
-  }
   output_arg {
     name: "output"
     type_attr: "T"
   }
+  output_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
   attr {
-    name: "T"
-    type: "type"
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "Tpaddings"
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "Targmax"
     type: "type"
     default_value {
-      type: DT_INT32
+      type: DT_INT64
     }
     allowed_values {
       list {
@@ -37055,39 +38579,60 @@ op {
     }
   }
   attr {
-    name: "mode"
+    name: "padding"
     type: "string"
     allowed_values {
       list {
-        s: "REFLECT"
-        s: "SYMMETRIC"
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "MirrorPadGrad"
+  name: "MaxPoolWithArgmax"
   input_arg {
     name: "input"
     type_attr: "T"
   }
-  input_arg {
-    name: "paddings"
-    type_attr: "Tpaddings"
-  }
   output_arg {
     name: "output"
     type_attr: "T"
   }
+  output_arg {
+    name: "argmax"
+    type_attr: "Targmax"
+  }
   attr {
-    name: "T"
-    type: "type"
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
   }
   attr {
-    name: "Tpaddings"
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "Targmax"
     type: "type"
     default_value {
-      type: DT_INT32
+      type: DT_INT64
     }
     allowed_values {
       list {
@@ -37097,126 +38642,244 @@ op {
     }
   }
   attr {
-    name: "mode"
+    name: "padding"
     type: "string"
     allowed_values {
       list {
-        s: "REFLECT"
-        s: "SYMMETRIC"
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
-}
-op {
-  name: "Mod"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "y"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "z"
-    type_attr: "T"
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
       }
     }
   }
 }
 op {
-  name: "Mod"
+  name: "MaxPoolWithArgmax"
   input_arg {
-    name: "x"
+    name: "input"
     type_attr: "T"
   }
-  input_arg {
-    name: "y"
+  output_arg {
+    name: "output"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
-    type_attr: "T"
+    name: "argmax"
+    type_attr: "Targmax"
   }
   attr {
-    name: "T"
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "Targmax"
     type: "type"
+    default_value {
+      type: DT_INT64
+    }
     allowed_values {
       list {
         type: DT_INT32
         type: DT_INT64
-        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
 }
 op {
-  name: "Mod"
+  name: "MaxPoolWithArgmax"
   input_arg {
-    name: "x"
+    name: "input"
     type_attr: "T"
   }
-  input_arg {
-    name: "y"
+  output_arg {
+    name: "output"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
-    type_attr: "T"
+    name: "argmax"
+    type_attr: "Targmax"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 4
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
         type: DT_HALF
-        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
         type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
       }
     }
   }
 }
 op {
-  name: "ModelDataset"
+  name: "MaxPoolWithArgmax"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "input"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "argmax"
+    type_attr: "Targmax"
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
+    name: "ksize"
+    type: "list(int)"
     has_minimum: true
-    minimum: 1
+    minimum: 4
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
+    name: "strides"
+    type: "list(int)"
     has_minimum: true
-    minimum: 1
+    minimum: 4
+  }
+  attr {
+    name: "Targmax"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
 }
 op {
-  name: "Mul"
+  name: "Maximum"
   input_arg {
     name: "x"
     type_attr: "T"
@@ -37237,21 +38900,15 @@ op {
         type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_INT16
         type: DT_INT32
         type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
       }
     }
   }
   is_commutative: true
 }
 op {
-  name: "Mul"
+  name: "Maximum"
   input_arg {
     name: "x"
     type_attr: "T"
@@ -37273,21 +38930,15 @@ op {
         type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_INT16
         type: DT_INT32
         type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
       }
     }
   }
   is_commutative: true
 }
 op {
-  name: "Mul"
+  name: "Maximum"
   input_arg {
     name: "x"
     type_attr: "T"
@@ -37309,173 +38960,32 @@ op {
         type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_INT16
         type: DT_INT32
         type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
       }
     }
   }
   is_commutative: true
 }
 op {
-  name: "MultiDeviceIterator"
-  output_arg {
-    name: "handle"
-    type: DT_RESOURCE
-  }
-  attr {
-    name: "devices"
-    type: "list(string)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
-  }
-  attr {
-    name: "container"
-    type: "string"
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
-op {
-  name: "MultiDeviceIteratorFromStringHandle"
-  input_arg {
-    name: "string_handle"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "multi_device_iterator"
-    type: DT_RESOURCE
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    default_value {
-      list {
-      }
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    default_value {
-      list {
-      }
-    }
-    has_minimum: true
-  }
-  is_stateful: true
-}
-op {
-  name: "MultiDeviceIteratorGetNextFromShard"
-  input_arg {
-    name: "multi_device_iterator"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "shard_num"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "incarnation_id"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "components"
-    type_list_attr: "output_types"
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
-op {
-  name: "MultiDeviceIteratorInit"
-  input_arg {
-    name: "dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "multi_device_iterator"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "max_buffer_size"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "incarnation_id"
-    type: DT_INT64
-  }
-  is_stateful: true
-}
-op {
-  name: "MultiDeviceIteratorToStringHandle"
-  input_arg {
-    name: "multi_device_iterator"
-    type: DT_RESOURCE
-  }
-  output_arg {
-    name: "string_handle"
-    type: DT_STRING
-  }
-  is_stateful: true
-}
-op {
-  name: "Multinomial"
+  name: "Mean"
   input_arg {
-    name: "logits"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "num_samples"
-    type: DT_INT32
+    name: "reduction_indices"
+    type_attr: "Tidx"
   }
   output_arg {
     name: "output"
-    type: DT_INT64
-  }
-  attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
+    type_attr: "T"
   }
   attr {
-    name: "seed2"
-    type: "int"
+    name: "keep_dims"
+    type: "bool"
     default_value {
-      i: 0
+      b: false
     }
   }
   attr {
@@ -37485,44 +38995,54 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
       }
     }
   }
-  is_stateful: true
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
 }
 op {
-  name: "Multinomial"
+  name: "Mean"
   input_arg {
-    name: "logits"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "num_samples"
-    type: DT_INT32
+    name: "reduction_indices"
+    type_attr: "Tidx"
   }
   output_arg {
     name: "output"
-    type: DT_INT64
-  }
-  attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
+    type_attr: "T"
   }
   attr {
-    name: "seed2"
-    type: "int"
+    name: "keep_dims"
+    type: "bool"
     default_value {
-      i: 0
+      b: false
     }
   }
   attr {
@@ -37532,46 +39052,56 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
       }
     }
   }
-  is_stateful: true
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
 }
 op {
-  name: "Multinomial"
+  name: "Mean"
   input_arg {
-    name: "logits"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "num_samples"
-    type: DT_INT32
+    name: "reduction_indices"
+    type_attr: "Tidx"
   }
   output_arg {
     name: "output"
-    type_attr: "output_dtype"
-  }
-  attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
+    type_attr: "T"
   }
   attr {
-    name: "seed2"
-    type: "int"
+    name: "keep_dims"
+    type: "bool"
     default_value {
-      i: 0
+      b: false
     }
   }
   attr {
@@ -37581,12 +39111,17 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -37595,10 +39130,10 @@ op {
     }
   }
   attr {
-    name: "output_dtype"
+    name: "Tidx"
     type: "type"
     default_value {
-      type: DT_INT64
+      type: DT_INT32
     }
     allowed_values {
       list {
@@ -37607,34 +39142,26 @@ op {
       }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "Multinomial"
+  name: "Mean"
   input_arg {
-    name: "logits"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "num_samples"
-    type: DT_INT32
+    name: "reduction_indices"
+    type_attr: "Tidx"
   }
   output_arg {
     name: "output"
-    type_attr: "output_dtype"
-  }
-  attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
+    type_attr: "T"
   }
   attr {
-    name: "seed2"
-    type: "int"
+    name: "keep_dims"
+    type: "bool"
     default_value {
-      i: 0
+      b: false
     }
   }
   attr {
@@ -37648,9 +39175,14 @@ op {
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_COMPLEX64
         type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
         type: DT_BFLOAT16
         type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -37658,10 +39190,10 @@ op {
     }
   }
   attr {
-    name: "output_dtype"
+    name: "Tidx"
     type: "type"
     default_value {
-      type: DT_INT64
+      type: DT_INT32
     }
     allowed_values {
       list {
@@ -37670,366 +39202,446 @@ op {
       }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "MutableDenseHashTable"
+  name: "Merge"
   input_arg {
-    name: "empty_key"
-    type_attr: "key_dtype"
+    name: "inputs"
+    type_attr: "T"
+    number_attr: "N"
   }
   output_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "output"
+    type_attr: "T"
   }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  output_arg {
+    name: "value_index"
+    type: DT_INT32
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "T"
+    type: "type"
   }
   attr {
-    name: "use_node_name_sharing"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
   }
-  attr {
-    name: "key_dtype"
-    type: "type"
+}
+op {
+  name: "MergeSummary"
+  input_arg {
+    name: "inputs"
+    type: DT_STRING
+    number_attr: "N"
   }
-  attr {
-    name: "value_dtype"
-    type: "type"
+  output_arg {
+    name: "summary"
+    type: DT_STRING
   }
   attr {
-    name: "value_shape"
-    type: "shape"
-    default_value {
-      shape {
-      }
-    }
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "MergeV2Checkpoints"
+  input_arg {
+    name: "checkpoint_prefixes"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "destination_prefix"
+    type: DT_STRING
   }
   attr {
-    name: "initial_num_buckets"
-    type: "int"
+    name: "delete_old_dirs"
+    type: "bool"
     default_value {
-      i: 131072
+      b: true
     }
   }
+}
+op {
+  name: "MergeV2Checkpoints"
+  input_arg {
+    name: "checkpoint_prefixes"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "destination_prefix"
+    type: DT_STRING
+  }
   attr {
-    name: "max_load_factor"
-    type: "float"
+    name: "delete_old_dirs"
+    type: "bool"
     default_value {
-      f: 0.8
+      b: true
     }
   }
   is_stateful: true
 }
 op {
-  name: "MutableDenseHashTableV2"
+  name: "Mfcc"
   input_arg {
-    name: "empty_key"
-    type_attr: "key_dtype"
+    name: "spectrogram"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "deleted_key"
-    type_attr: "key_dtype"
+    name: "sample_rate"
+    type: DT_INT32
   }
   output_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "output"
+    type: DT_FLOAT
   }
   attr {
-    name: "use_node_name_sharing"
-    type: "bool"
+    name: "upper_frequency_limit"
+    type: "float"
     default_value {
-      b: false
+      f: 4000
     }
   }
   attr {
-    name: "key_dtype"
-    type: "type"
-  }
-  attr {
-    name: "value_dtype"
-    type: "type"
-  }
-  attr {
-    name: "value_shape"
-    type: "shape"
+    name: "lower_frequency_limit"
+    type: "float"
     default_value {
-      shape {
-      }
+      f: 20
     }
   }
   attr {
-    name: "initial_num_buckets"
+    name: "filterbank_channel_count"
     type: "int"
     default_value {
-      i: 131072
+      i: 40
     }
   }
   attr {
-    name: "max_load_factor"
-    type: "float"
+    name: "dct_coefficient_count"
+    type: "int"
     default_value {
-      f: 0.8
+      i: 13
     }
   }
-  is_stateful: true
 }
 op {
-  name: "MutableHashTable"
-  output_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
+  name: "Min"
+  input_arg {
+    name: "input"
+    type_attr: "T"
   }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
   }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "use_node_name_sharing"
+    name: "keep_dims"
     type: "bool"
     default_value {
       b: false
     }
   }
   attr {
-    name: "key_dtype"
+    name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
   }
   attr {
-    name: "value_dtype"
+    name: "Tidx"
     type: "type"
-  }
-  is_stateful: true
-}
-op {
-  name: "MutableHashTableOfTensors"
-  output_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
-  }
-  attr {
-    name: "container"
-    type: "string"
     default_value {
-      s: ""
+      type: DT_INT32
     }
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
+}
+op {
+  name: "Min"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
-    name: "use_node_name_sharing"
+    name: "keep_dims"
     type: "bool"
     default_value {
       b: false
     }
   }
   attr {
-    name: "key_dtype"
+    name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
   attr {
-    name: "value_dtype"
+    name: "Tidx"
     type: "type"
-  }
-  attr {
-    name: "value_shape"
-    type: "shape"
     default_value {
-      shape {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "MutableHashTableOfTensorsV2"
-  output_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+  name: "Min"
+  input_arg {
+    name: "input"
+    type_attr: "T"
   }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
   }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "use_node_name_sharing"
+    name: "keep_dims"
     type: "bool"
     default_value {
       b: false
     }
   }
   attr {
-    name: "key_dtype"
+    name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
   }
   attr {
-    name: "value_dtype"
+    name: "Tidx"
     type: "type"
-  }
-  attr {
-    name: "value_shape"
-    type: "shape"
     default_value {
-      shape {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "MutableHashTableV2"
-  output_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+  name: "Min"
+  input_arg {
+    name: "input"
+    type_attr: "T"
   }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
   }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "use_node_name_sharing"
+    name: "keep_dims"
     type: "bool"
     default_value {
       b: false
     }
   }
   attr {
-    name: "key_dtype"
+    name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
   attr {
-    name: "value_dtype"
+    name: "Tidx"
     type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
-  is_stateful: true
 }
 op {
-  name: "MutexLock"
+  name: "Minimum"
   input_arg {
-    name: "mutex"
-    type: DT_RESOURCE
+    name: "x"
+    type_attr: "T"
   }
-  output_arg {
-    name: "mutex_lock"
-    type: DT_VARIANT
+  input_arg {
+    name: "y"
+    type_attr: "T"
   }
-  is_stateful: true
-}
-op {
-  name: "MutexV2"
   output_arg {
-    name: "resource"
-    type: DT_RESOURCE
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "z"
+    type_attr: "T"
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
-  is_stateful: true
+  is_commutative: true
 }
 op {
-  name: "NcclAllReduce"
+  name: "Minimum"
   input_arg {
-    name: "input"
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
     type_attr: "T"
   }
   output_arg {
-    name: "data"
+    name: "z"
     type_attr: "T"
   }
   attr {
-    name: "reduction"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "min"
-        s: "max"
-        s: "prod"
-        s: "sum"
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
+  is_commutative: true
+}
+op {
+  name: "Minimum"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -38038,22 +39650,60 @@ op {
       }
     }
   }
+  is_commutative: true
+}
+op {
+  name: "MirrorPad"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "paddings"
+    type_attr: "Tpaddings"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
   attr {
-    name: "num_devices"
-    type: "int"
+    name: "T"
+    type: "type"
   }
   attr {
-    name: "shared_name"
+    name: "Tpaddings"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "mode"
     type: "string"
+    allowed_values {
+      list {
+        s: "REFLECT"
+        s: "SYMMETRIC"
+      }
+    }
   }
-  is_stateful: true
 }
 op {
-  name: "NcclBroadcast"
+  name: "MirrorPadGrad"
   input_arg {
     name: "input"
     type_attr: "T"
   }
+  input_arg {
+    name: "paddings"
+    type_attr: "Tpaddings"
+  }
   output_arg {
     name: "output"
     type_attr: "T"
@@ -38061,76 +39711,153 @@ op {
   attr {
     name: "T"
     type: "type"
+  }
+  attr {
+    name: "Tpaddings"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
       }
     }
   }
   attr {
-    name: "shape"
-    type: "shape"
+    name: "mode"
+    type: "string"
+    allowed_values {
+      list {
+        s: "REFLECT"
+        s: "SYMMETRIC"
+      }
+    }
   }
-  is_stateful: true
 }
 op {
-  name: "NcclReduce"
+  name: "Mod"
   input_arg {
-    name: "input"
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
     type_attr: "T"
-    number_attr: "num_devices"
   }
   output_arg {
-    name: "data"
+    name: "z"
     type_attr: "T"
   }
   attr {
-    name: "reduction"
-    type: "string"
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "min"
-        s: "max"
-        s: "prod"
-        s: "sum"
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
+}
+op {
+  name: "Mod"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "Mod"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
         type: DT_INT32
         type: DT_INT64
+        type: DT_HALF
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
+}
+op {
+  name: "ModelDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
   attr {
-    name: "num_devices"
-    type: "int"
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
     has_minimum: true
     minimum: 1
   }
-  is_stateful: true
 }
 op {
-  name: "Neg"
+  name: "Mul"
   input_arg {
     name: "x"
     type_attr: "T"
   }
-  output_arg {
+  input_arg {
     name: "y"
     type_attr: "T"
   }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
   attr {
     name: "T"
     type: "type"
@@ -38139,6 +39866,10 @@ op {
         type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
         type: DT_INT32
         type: DT_INT64
         type: DT_COMPLEX64
@@ -38146,17 +39877,22 @@ op {
       }
     }
   }
+  is_commutative: true
 }
 op {
-  name: "Neg"
+  name: "Mul"
   input_arg {
     name: "x"
     type_attr: "T"
   }
-  output_arg {
+  input_arg {
     name: "y"
     type_attr: "T"
   }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
   attr {
     name: "T"
     type: "type"
@@ -38166,6 +39902,10 @@ op {
         type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
         type: DT_INT32
         type: DT_INT64
         type: DT_COMPLEX64
@@ -38173,17 +39913,22 @@ op {
       }
     }
   }
+  is_commutative: true
 }
 op {
-  name: "Neg"
+  name: "Mul"
   input_arg {
     name: "x"
     type_attr: "T"
   }
-  output_arg {
+  input_arg {
     name: "y"
     type_attr: "T"
   }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
   attr {
     name: "T"
     type: "type"
@@ -38193,6 +39938,10 @@ op {
         type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
         type: DT_INT32
         type: DT_INT64
         type: DT_COMPLEX64
@@ -38200,468 +39949,235 @@ op {
       }
     }
   }
+  is_commutative: true
 }
 op {
-  name: "NegTrain"
-  input_arg {
-    name: "w_in"
-    type: DT_FLOAT
-    is_ref: true
-  }
-  input_arg {
-    name: "w_out"
-    type: DT_FLOAT
-    is_ref: true
-  }
+  name: "MulNoNan"
   input_arg {
-    name: "examples"
-    type: DT_INT32
+    name: "x"
+    type_attr: "T"
   }
   input_arg {
-    name: "labels"
-    type: DT_INT32
+    name: "y"
+    type_attr: "T"
   }
-  input_arg {
-    name: "lr"
-    type: DT_FLOAT
-  }
-  attr {
-    name: "vocab_count"
-    type: "list(int)"
-  }
-  attr {
-    name: "num_negative_samples"
-    type: "int"
-  }
-  deprecation {
-    version: 19
-  }
-  is_stateful: true
-}
-op {
-  name: "NextIteration"
-  input_arg {
-    name: "data"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-  }
-}
-op {
-  name: "NoOp"
-}
-op {
-  name: "NonMaxSuppression"
-  input_arg {
-    name: "boxes"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "scores"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "max_output_size"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "selected_indices"
-    type: DT_INT32
-  }
-  attr {
-    name: "iou_threshold"
-    type: "float"
-    default_value {
-      f: 0.5
-    }
-  }
-}
-op {
-  name: "NonMaxSuppressionV2"
-  input_arg {
-    name: "boxes"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "scores"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "max_output_size"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "iou_threshold"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "selected_indices"
-    type: DT_INT32
-  }
-}
-op {
-  name: "NonMaxSuppressionV2"
-  input_arg {
-    name: "boxes"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "scores"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "max_output_size"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "iou_threshold"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "selected_indices"
-    type: DT_INT32
+  output_arg {
+    name: "z"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
+  is_commutative: true
 }
 op {
-  name: "NonMaxSuppressionV3"
-  input_arg {
-    name: "boxes"
-    type: DT_FLOAT
+  name: "MultiDeviceIterator"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
   }
-  input_arg {
-    name: "scores"
-    type: DT_FLOAT
+  attr {
+    name: "devices"
+    type: "list(string)"
+    has_minimum: true
+    minimum: 1
   }
-  input_arg {
-    name: "max_output_size"
-    type: DT_INT32
+  attr {
+    name: "shared_name"
+    type: "string"
   }
-  input_arg {
-    name: "iou_threshold"
-    type: DT_FLOAT
+  attr {
+    name: "container"
+    type: "string"
   }
-  input_arg {
-    name: "score_threshold"
-    type: DT_FLOAT
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
-  output_arg {
-    name: "selected_indices"
-    type: DT_INT32
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
+  is_stateful: true
 }
 op {
-  name: "NonMaxSuppressionV3"
-  input_arg {
-    name: "boxes"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "scores"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "max_output_size"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "iou_threshold"
-    type: DT_FLOAT
-  }
+  name: "MultiDeviceIteratorFromStringHandle"
   input_arg {
-    name: "score_threshold"
-    type: DT_FLOAT
+    name: "string_handle"
+    type: DT_STRING
   }
   output_arg {
-    name: "selected_indices"
-    type: DT_INT32
+    name: "multi_device_iterator"
+    type: DT_RESOURCE
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "output_types"
+    type: "list(type)"
     default_value {
-      type: DT_FLOAT
-    }
-    allowed_values {
       list {
-        type: DT_HALF
-        type: DT_FLOAT
       }
     }
-  }
-}
-op {
-  name: "NonMaxSuppressionV4"
-  input_arg {
-    name: "boxes"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "scores"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "max_output_size"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "iou_threshold"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "score_threshold"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "selected_indices"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "valid_outputs"
-    type: DT_INT32
+    has_minimum: true
   }
   attr {
-    name: "pad_to_max_output_size"
-    type: "bool"
+    name: "output_shapes"
+    type: "list(shape)"
     default_value {
-      b: false
+      list {
+      }
     }
+    has_minimum: true
   }
+  is_stateful: true
 }
 op {
-  name: "NonMaxSuppressionV4"
-  input_arg {
-    name: "boxes"
-    type_attr: "T"
-  }
+  name: "MultiDeviceIteratorGetNextFromShard"
   input_arg {
-    name: "scores"
-    type_attr: "T"
+    name: "multi_device_iterator"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "max_output_size"
+    name: "shard_num"
     type: DT_INT32
   }
   input_arg {
-    name: "iou_threshold"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "score_threshold"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "selected_indices"
-    type: DT_INT32
+    name: "incarnation_id"
+    type: DT_INT64
   }
   output_arg {
-    name: "valid_outputs"
-    type: DT_INT32
+    name: "components"
+    type_list_attr: "output_types"
   }
   attr {
-    name: "T"
-    type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-      }
-    }
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "pad_to_max_output_size"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
+  is_stateful: true
 }
 op {
-  name: "NonMaxSuppressionWithOverlaps"
-  input_arg {
-    name: "overlaps"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "scores"
-    type: DT_FLOAT
-  }
+  name: "MultiDeviceIteratorInit"
   input_arg {
-    name: "max_output_size"
-    type: DT_INT32
+    name: "dataset"
+    type: DT_VARIANT
   }
   input_arg {
-    name: "overlap_threshold"
-    type: DT_FLOAT
+    name: "multi_device_iterator"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "score_threshold"
-    type: DT_FLOAT
+    name: "max_buffer_size"
+    type: DT_INT64
   }
   output_arg {
-    name: "selected_indices"
-    type: DT_INT32
+    name: "incarnation_id"
+    type: DT_INT64
   }
+  is_stateful: true
 }
 op {
-  name: "NotEqual"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
+  name: "MultiDeviceIteratorToStringHandle"
   input_arg {
-    name: "y"
-    type_attr: "T"
+    name: "multi_device_iterator"
+    type: DT_RESOURCE
   }
   output_arg {
-    name: "z"
-    type: DT_BOOL
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_QUINT8
-        type: DT_QINT8
-        type: DT_QINT32
-        type: DT_STRING
-        type: DT_BOOL
-        type: DT_COMPLEX128
-      }
-    }
+    name: "string_handle"
+    type: DT_STRING
   }
-  is_commutative: true
+  is_stateful: true
 }
 op {
-  name: "NotEqual"
+  name: "Multinomial"
   input_arg {
-    name: "x"
+    name: "logits"
     type_attr: "T"
   }
   input_arg {
-    name: "y"
-    type_attr: "T"
+    name: "num_samples"
+    type: DT_INT32
   }
   output_arg {
-    name: "z"
-    type: DT_BOOL
+    name: "output"
+    type: DT_INT64
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_QUINT8
-        type: DT_QINT8
-        type: DT_QINT32
-        type: DT_STRING
-        type: DT_BOOL
-        type: DT_COMPLEX128
-      }
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
     }
   }
-  is_commutative: true
-}
-op {
-  name: "NotEqual"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "y"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "z"
-    type: DT_BOOL
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_BFLOAT16
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_INT16
         type: DT_INT32
         type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_QUINT8
-        type: DT_QINT8
-        type: DT_QINT32
-        type: DT_STRING
-        type: DT_BOOL
-        type: DT_COMPLEX128
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
       }
     }
   }
-  is_commutative: true
+  is_stateful: true
 }
 op {
-  name: "NthElement"
+  name: "Multinomial"
   input_arg {
-    name: "input"
+    name: "logits"
     type_attr: "T"
   }
   input_arg {
-    name: "n"
+    name: "num_samples"
     type: DT_INT32
   }
   output_arg {
-    name: "values"
-    type_attr: "T"
+    name: "output"
+    type: DT_INT64
   }
   attr {
-    name: "reverse"
-    type: "bool"
+    name: "seed"
+    type: "int"
     default_value {
-      b: false
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
     }
   }
   attr {
@@ -38683,26 +40199,34 @@ op {
       }
     }
   }
+  is_stateful: true
 }
 op {
-  name: "NthElement"
+  name: "Multinomial"
   input_arg {
-    name: "input"
+    name: "logits"
     type_attr: "T"
   }
   input_arg {
-    name: "n"
+    name: "num_samples"
     type: DT_INT32
   }
   output_arg {
-    name: "values"
-    type_attr: "T"
+    name: "output"
+    type_attr: "output_dtype"
   }
   attr {
-    name: "reverse"
-    type: "bool"
+    name: "seed"
+    type: "int"
     default_value {
-      b: false
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
     }
   }
   attr {
@@ -38725,26 +40249,47 @@ op {
       }
     }
   }
+  attr {
+    name: "output_dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
 }
 op {
-  name: "NthElement"
+  name: "Multinomial"
   input_arg {
-    name: "input"
+    name: "logits"
     type_attr: "T"
   }
   input_arg {
-    name: "n"
+    name: "num_samples"
     type: DT_INT32
   }
   output_arg {
-    name: "values"
-    type_attr: "T"
+    name: "output"
+    type_attr: "output_dtype"
   }
   attr {
-    name: "reverse"
-    type: "bool"
+    name: "seed"
+    type: "int"
     default_value {
-      b: false
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
     }
   }
   attr {
@@ -38767,76 +40312,31 @@ op {
       }
     }
   }
-}
-op {
-  name: "OneHot"
-  input_arg {
-    name: "indices"
-    type_attr: "TI"
-  }
-  input_arg {
-    name: "depth"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "on_value"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "off_value"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "axis"
-    type: "int"
-    default_value {
-      i: -1
-    }
-  }
-  attr {
-    name: "T"
-    type: "type"
-  }
   attr {
-    name: "TI"
+    name: "output_dtype"
     type: "type"
     default_value {
       type: DT_INT64
     }
     allowed_values {
       list {
-        type: DT_UINT8
         type: DT_INT32
         type: DT_INT64
       }
     }
   }
+  is_stateful: true
 }
 op {
-  name: "OneShotIterator"
-  output_arg {
-    name: "handle"
-    type: DT_RESOURCE
-  }
-  attr {
-    name: "dataset_factory"
-    type: "func"
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  name: "MutableDenseHashTable"
+  input_arg {
+    name: "empty_key"
+    type_attr: "key_dtype"
   }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+  output_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
   }
   attr {
     name: "container"
@@ -38852,244 +40352,118 @@ op {
       s: ""
     }
   }
-  is_stateful: true
-}
-op {
-  name: "OnesLike"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "y"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
+    name: "use_node_name_sharing"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
-}
-op {
-  name: "OnesLike"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "y"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
+    name: "key_dtype"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT8
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_UINT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_BOOL
-      }
-    }
-  }
-}
-op {
-  name: "OnesLike"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "y"
-    type_attr: "T"
   }
   attr {
-    name: "T"
+    name: "value_dtype"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_BFLOAT16
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT8
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_UINT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_BOOL
+  }
+  attr {
+    name: "value_shape"
+    type: "shape"
+    default_value {
+      shape {
       }
     }
   }
-}
-op {
-  name: "OptimizeDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "optimizations"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "initial_num_buckets"
+    type: "int"
+    default_value {
+      i: 131072
+    }
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "max_load_factor"
+    type: "float"
+    default_value {
+      f: 0.8
+    }
   }
+  is_stateful: true
 }
 op {
-  name: "OptionalFromValue"
+  name: "MutableDenseHashTableV2"
   input_arg {
-    name: "components"
-    type_list_attr: "Toutput_types"
-  }
-  output_arg {
-    name: "optional"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "Toutput_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "empty_key"
+    type_attr: "key_dtype"
   }
-}
-op {
-  name: "OptionalGetValue"
   input_arg {
-    name: "optional"
-    type: DT_VARIANT
+    name: "deleted_key"
+    type_attr: "key_dtype"
   }
   output_arg {
-    name: "components"
-    type_list_attr: "output_types"
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "table_handle"
+    type: DT_RESOURCE
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
-op {
-  name: "OptionalHasValue"
-  input_arg {
-    name: "optional"
-    type: DT_VARIANT
-  }
-  output_arg {
-    name: "has_value"
-    type: DT_BOOL
-  }
-}
-op {
-  name: "OptionalNone"
-  output_arg {
-    name: "optional"
-    type: DT_VARIANT
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
-}
-op {
-  name: "OrderedMapClear"
   attr {
-    name: "capacity"
-    type: "int"
+    name: "shared_name"
+    type: "string"
     default_value {
-      i: 0
+      s: ""
     }
-    has_minimum: true
   }
   attr {
-    name: "memory_limit"
-    type: "int"
+    name: "use_node_name_sharing"
+    type: "bool"
     default_value {
-      i: 0
+      b: false
     }
-    has_minimum: true
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
+    name: "key_dtype"
+    type: "type"
   }
   attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "value_dtype"
+    type: "type"
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "value_shape"
+    type: "shape"
     default_value {
-      s: ""
+      shape {
+      }
     }
   }
-  is_stateful: true
-}
-op {
-  name: "OrderedMapIncompleteSize"
-  output_arg {
-    name: "size"
-    type: DT_INT32
-  }
   attr {
-    name: "capacity"
+    name: "initial_num_buckets"
     type: "int"
     default_value {
-      i: 0
+      i: 131072
     }
-    has_minimum: true
   }
   attr {
-    name: "memory_limit"
-    type: "int"
+    name: "max_load_factor"
+    type: "float"
     default_value {
-      i: 0
+      f: 0.8
     }
-    has_minimum: true
   }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
+  is_stateful: true
+}
+op {
+  name: "MutableHashTable"
+  output_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
   }
   attr {
     name: "container"
@@ -39105,43 +40479,29 @@ op {
       s: ""
     }
   }
-  is_stateful: true
-}
-op {
-  name: "OrderedMapPeek"
-  input_arg {
-    name: "key"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "indices"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "values"
-    type_list_attr: "dtypes"
-  }
   attr {
-    name: "capacity"
-    type: "int"
+    name: "use_node_name_sharing"
+    type: "bool"
     default_value {
-      i: 0
+      b: false
     }
-    has_minimum: true
   }
   attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+    name: "key_dtype"
+    type: "type"
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "value_dtype"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "MutableHashTableOfTensors"
+  output_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
   }
   attr {
     name: "container"
@@ -39157,33 +40517,36 @@ op {
       s: ""
     }
   }
-  is_stateful: true
-}
-op {
-  name: "OrderedMapSize"
-  output_arg {
-    name: "size"
-    type: DT_INT32
-  }
   attr {
-    name: "capacity"
-    type: "int"
+    name: "use_node_name_sharing"
+    type: "bool"
     default_value {
-      i: 0
+      b: false
     }
-    has_minimum: true
   }
   attr {
-    name: "memory_limit"
-    type: "int"
+    name: "key_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_dtype"
+    type: "type"
+  }
+  attr {
+    name: "value_shape"
+    type: "shape"
     default_value {
-      i: 0
+      shape {
+      }
     }
-    has_minimum: true
   }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
+  is_stateful: true
+}
+op {
+  name: "MutableHashTableOfTensorsV2"
+  output_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
   }
   attr {
     name: "container"
@@ -39199,47 +40562,36 @@ op {
       s: ""
     }
   }
-  is_stateful: true
-}
-op {
-  name: "OrderedMapStage"
-  input_arg {
-    name: "key"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "indices"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "values"
-    type_list_attr: "fake_dtypes"
-  }
   attr {
-    name: "capacity"
-    type: "int"
+    name: "use_node_name_sharing"
+    type: "bool"
     default_value {
-      i: 0
+      b: false
     }
-    has_minimum: true
   }
   attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+    name: "key_dtype"
+    type: "type"
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
+    name: "value_dtype"
+    type: "type"
   }
   attr {
-    name: "fake_dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "value_shape"
+    type: "shape"
+    default_value {
+      shape {
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MutableHashTableV2"
+  output_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
   }
   attr {
     name: "container"
@@ -39255,43 +40607,40 @@ op {
       s: ""
     }
   }
-  is_stateful: true
-}
-op {
-  name: "OrderedMapUnstage"
-  input_arg {
-    name: "key"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "indices"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "values"
-    type_list_attr: "dtypes"
-  }
   attr {
-    name: "capacity"
-    type: "int"
+    name: "use_node_name_sharing"
+    type: "bool"
     default_value {
-      i: 0
+      b: false
     }
-    has_minimum: true
   }
   attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+    name: "key_dtype"
+    type: "type"
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "value_dtype"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "MutexLock"
+  input_arg {
+    name: "mutex"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "mutex_lock"
+    type: DT_VARIANT
+  }
+  is_stateful: true
+}
+op {
+  name: "MutexV2"
+  output_arg {
+    name: "resource"
+    type: DT_RESOURCE
   }
   attr {
     name: "container"
@@ -39310,706 +40659,544 @@ op {
   is_stateful: true
 }
 op {
-  name: "OrderedMapUnstageNoKey"
+  name: "NcclAllReduce"
   input_arg {
-    name: "indices"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "key"
-    type: DT_INT64
+    name: "input"
+    type_attr: "T"
   }
   output_arg {
-    name: "values"
-    type_list_attr: "dtypes"
+    name: "data"
+    type_attr: "T"
   }
   attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
+    name: "reduction"
+    type: "string"
+    allowed_values {
+      list {
+        s: "min"
+        s: "max"
+        s: "prod"
+        s: "sum"
+      }
     }
-    has_minimum: true
   }
   attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
-    has_minimum: true
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "num_devices"
+    type: "int"
   }
   attr {
     name: "shared_name"
     type: "string"
-    default_value {
-      s: ""
-    }
   }
   is_stateful: true
 }
 op {
-  name: "Pack"
+  name: "NcclBroadcast"
   input_arg {
-    name: "values"
+    name: "input"
     type_attr: "T"
-    number_attr: "N"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
-  attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
-  }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
   attr {
-    name: "axis"
-    type: "int"
-    default_value {
-      i: 0
-    }
+    name: "shape"
+    type: "shape"
   }
+  is_stateful: true
 }
 op {
-  name: "Pad"
+  name: "NcclReduce"
   input_arg {
     name: "input"
     type_attr: "T"
-  }
-  input_arg {
-    name: "paddings"
-    type_attr: "Tpaddings"
+    number_attr: "num_devices"
   }
   output_arg {
-    name: "output"
+    name: "data"
     type_attr: "T"
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "reduction"
+    type: "string"
+    allowed_values {
+      list {
+        s: "min"
+        s: "max"
+        s: "prod"
+        s: "sum"
+      }
+    }
   }
   attr {
-    name: "Tpaddings"
+    name: "T"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
       }
     }
   }
+  attr {
+    name: "num_devices"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
 }
 op {
-  name: "PadV2"
+  name: "NearestNeighbors"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "points"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "paddings"
-    type_attr: "Tpaddings"
+    name: "centers"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "constant_values"
+    name: "k"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "nearest_center_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "nearest_center_distances"
+    type: DT_FLOAT
+  }
+}
+op {
+  name: "Neg"
+  input_arg {
+    name: "x"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "y"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "Tpaddings"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "PaddedBatchDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "batch_size"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "padded_shapes"
-    type: DT_INT64
-    number_attr: "N"
-  }
+  name: "Neg"
   input_arg {
-    name: "padding_values"
-    type_list_attr: "Toutput_types"
+    name: "x"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "Toutput_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "y"
+    type_attr: "T"
   }
   attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
   }
-  is_stateful: true
 }
 op {
-  name: "PaddedBatchDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "batch_size"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "padded_shapes"
-    type: DT_INT64
-    number_attr: "N"
-  }
+  name: "Neg"
   input_arg {
-    name: "padding_values"
-    type_list_attr: "Toutput_types"
+    name: "x"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "Toutput_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "y"
+    type_attr: "T"
   }
   attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
   }
 }
 op {
-  name: "PaddedBatchDatasetV2"
+  name: "NegTrain"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "w_in"
+    type: DT_FLOAT
+    is_ref: true
   }
   input_arg {
-    name: "batch_size"
-    type: DT_INT64
+    name: "w_out"
+    type: DT_FLOAT
+    is_ref: true
   }
   input_arg {
-    name: "padded_shapes"
-    type: DT_INT64
-    number_attr: "N"
+    name: "examples"
+    type: DT_INT32
   }
   input_arg {
-    name: "padding_values"
-    type_list_attr: "Toutput_types"
+    name: "labels"
+    type: DT_INT32
   }
   input_arg {
-    name: "drop_remainder"
-    type: DT_BOOL
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "Toutput_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "lr"
+    type: DT_FLOAT
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "vocab_count"
+    type: "list(int)"
   }
   attr {
-    name: "N"
+    name: "num_negative_samples"
     type: "int"
-    has_minimum: true
-    minimum: 1
   }
+  deprecation {
+    version: 19
+  }
+  is_stateful: true
 }
 op {
-  name: "PaddingFIFOQueue"
-  output_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
+  name: "NextAfter"
+  input_arg {
+    name: "x1"
+    type_attr: "T"
   }
-  attr {
-    name: "component_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "x2"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "shapes"
-    type: "list(shape)"
+    name: "T"
+    type: "type"
     default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
       list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
       }
     }
-    has_minimum: true
   }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: -1
-    }
+}
+op {
+  name: "NextIteration"
+  input_arg {
+    name: "data"
+    type_attr: "T"
   }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "T"
+    type: "type"
   }
-  is_stateful: true
 }
 op {
-  name: "PaddingFIFOQueueV2"
-  output_arg {
-    name: "handle"
-    type: DT_RESOURCE
-  }
-  attr {
-    name: "component_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  name: "NoOp"
+}
+op {
+  name: "NonMaxSuppression"
+  input_arg {
+    name: "boxes"
+    type: DT_FLOAT
   }
-  attr {
-    name: "shapes"
-    type: "list(shape)"
-    default_value {
-      list {
-      }
-    }
-    has_minimum: true
+  input_arg {
+    name: "scores"
+    type: DT_FLOAT
   }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: -1
-    }
+  input_arg {
+    name: "max_output_size"
+    type: DT_INT32
   }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+  output_arg {
+    name: "selected_indices"
+    type: DT_INT32
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "iou_threshold"
+    type: "float"
     default_value {
-      s: ""
+      f: 0.5
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ParallelConcat"
+  name: "NonMaxSuppressionV2"
   input_arg {
-    name: "values"
-    type_attr: "T"
-    number_attr: "N"
+    name: "boxes"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+  input_arg {
+    name: "scores"
+    type: DT_FLOAT
   }
-  attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "max_output_size"
+    type: DT_INT32
   }
-  attr {
-    name: "T"
-    type: "type"
+  input_arg {
+    name: "iou_threshold"
+    type: DT_FLOAT
   }
-  attr {
-    name: "shape"
-    type: "shape"
+  output_arg {
+    name: "selected_indices"
+    type: DT_INT32
   }
 }
 op {
-  name: "ParallelDynamicStitch"
+  name: "NonMaxSuppressionV2"
   input_arg {
-    name: "indices"
-    type: DT_INT32
-    number_attr: "N"
+    name: "boxes"
+    type_attr: "T"
   }
   input_arg {
-    name: "data"
+    name: "scores"
     type_attr: "T"
-    number_attr: "N"
   }
-  output_arg {
-    name: "merged"
-    type_attr: "T"
+  input_arg {
+    name: "max_output_size"
+    type: DT_INT32
   }
-  attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+  input_arg {
+    name: "iou_threshold"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "selected_indices"
+    type: DT_INT32
   }
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+      }
+    }
   }
 }
 op {
-  name: "ParallelInterleaveDatasetV2"
+  name: "NonMaxSuppressionV3"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "boxes"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+    name: "scores"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "cycle_length"
-    type: DT_INT64
+    name: "max_output_size"
+    type: DT_INT32
   }
   input_arg {
-    name: "block_length"
-    type: DT_INT64
+    name: "iou_threshold"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "num_parallel_calls"
-    type: DT_INT64
+    name: "score_threshold"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "selected_indices"
+    type: DT_INT32
   }
 }
 op {
-  name: "ParallelInterleaveDatasetV2"
+  name: "NonMaxSuppressionV3"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "boxes"
+    type_attr: "T"
   }
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+    name: "scores"
+    type_attr: "T"
   }
   input_arg {
-    name: "cycle_length"
-    type: DT_INT64
+    name: "max_output_size"
+    type: DT_INT32
   }
   input_arg {
-    name: "block_length"
-    type: DT_INT64
+    name: "iou_threshold"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "num_parallel_calls"
-    type: DT_INT64
+    name: "score_threshold"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "selected_indices"
+    type: DT_INT32
   }
   attr {
-    name: "sloppy"
-    type: "bool"
+    name: "T"
+    type: "type"
     default_value {
-      b: false
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+      }
     }
   }
 }
 op {
-  name: "ParallelMapDataset"
+  name: "NonMaxSuppressionV4"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "boxes"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+    name: "scores"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "num_parallel_calls"
+    name: "max_output_size"
     type: DT_INT32
   }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
-op {
-  name: "ParallelMapDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+    name: "iou_threshold"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "num_parallel_calls"
-    type: DT_INT32
+    name: "score_threshold"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
-op {
-  name: "ParallelMapDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
-  }
-  input_arg {
-    name: "num_parallel_calls"
+    name: "selected_indices"
     type: DT_INT32
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "valid_outputs"
+    type: DT_INT32
   }
   attr {
-    name: "use_inter_op_parallelism"
+    name: "pad_to_max_output_size"
     type: "bool"
     default_value {
-      b: true
+      b: false
     }
   }
 }
 op {
-  name: "ParallelMapDataset"
+  name: "NonMaxSuppressionV4"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "boxes"
+    type_attr: "T"
   }
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+    name: "scores"
+    type_attr: "T"
   }
   input_arg {
-    name: "num_parallel_calls"
+    name: "max_output_size"
     type: DT_INT32
   }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
+  input_arg {
+    name: "iou_threshold"
+    type: DT_FLOAT
   }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
+  input_arg {
+    name: "score_threshold"
+    type: DT_FLOAT
   }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  output_arg {
+    name: "selected_indices"
+    type: DT_INT32
   }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+  output_arg {
+    name: "valid_outputs"
+    type: DT_INT32
   }
   attr {
-    name: "use_inter_op_parallelism"
-    type: "bool"
+    name: "T"
+    type: "type"
     default_value {
-      b: true
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+      }
     }
   }
   attr {
-    name: "sloppy"
+    name: "pad_to_max_output_size"
     type: "bool"
     default_value {
       b: false
@@ -40017,212 +41204,2068 @@ op {
   }
 }
 op {
-  name: "ParallelMapDataset"
+  name: "NonMaxSuppressionWithOverlaps"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "overlaps"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
+    name: "scores"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "num_parallel_calls"
+    name: "max_output_size"
     type: DT_INT32
   }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
+  input_arg {
+    name: "overlap_threshold"
+    type: DT_FLOAT
   }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
+  input_arg {
+    name: "score_threshold"
+    type: DT_FLOAT
   }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+  output_arg {
+    name: "selected_indices"
+    type: DT_INT32
   }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+}
+op {
+  name: "NotEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
   }
-  attr {
-    name: "use_inter_op_parallelism"
-    type: "bool"
-    default_value {
-      b: true
-    }
+  input_arg {
+    name: "y"
+    type_attr: "T"
   }
-  attr {
-    name: "sloppy"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
   }
   attr {
-    name: "preserve_cardinality"
-    type: "bool"
-    default_value {
-      b: false
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_QUINT8
+        type: DT_QINT8
+        type: DT_QINT32
+        type: DT_STRING
+        type: DT_BOOL
+        type: DT_COMPLEX128
+      }
     }
   }
+  is_commutative: true
 }
 op {
-  name: "ParameterizedTruncatedNormal"
+  name: "NotEqual"
   input_arg {
-    name: "shape"
+    name: "x"
     type_attr: "T"
   }
   input_arg {
-    name: "means"
-    type_attr: "dtype"
-  }
-  input_arg {
-    name: "stdevs"
-    type_attr: "dtype"
-  }
-  input_arg {
-    name: "minvals"
-    type_attr: "dtype"
-  }
-  input_arg {
-    name: "maxvals"
-    type_attr: "dtype"
+    name: "y"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type_attr: "dtype"
-  }
-  attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
-  }
-  attr {
-    name: "seed2"
-    type: "int"
-    default_value {
-      i: 0
-    }
+    name: "z"
+    type: DT_BOOL
   }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_QUINT8
+        type: DT_QINT8
+        type: DT_QINT32
+        type: DT_STRING
+        type: DT_BOOL
+        type: DT_COMPLEX128
       }
     }
   }
+  is_commutative: true
+}
+op {
+  name: "NotEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
         type: DT_INT32
         type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_QUINT8
+        type: DT_QINT8
+        type: DT_QINT32
+        type: DT_STRING
+        type: DT_BOOL
+        type: DT_COMPLEX128
       }
     }
   }
-  is_stateful: true
+  is_commutative: true
 }
 op {
-  name: "ParameterizedTruncatedNormal"
+  name: "NthElement"
   input_arg {
-    name: "shape"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "means"
-    type_attr: "dtype"
-  }
-  input_arg {
-    name: "stdevs"
-    type_attr: "dtype"
-  }
-  input_arg {
-    name: "minvals"
-    type_attr: "dtype"
-  }
-  input_arg {
-    name: "maxvals"
-    type_attr: "dtype"
+    name: "n"
+    type: DT_INT32
   }
   output_arg {
-    name: "output"
-    type_attr: "dtype"
-  }
-  attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
+    name: "values"
+    type_attr: "T"
   }
   attr {
-    name: "seed2"
-    type: "int"
+    name: "reverse"
+    type: "bool"
     default_value {
-      i: 0
+      b: false
     }
   }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
-      }
-    }
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
         type: DT_INT32
         type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "ParseExample"
+  name: "NthElement"
   input_arg {
-    name: "serialized"
-    type: DT_STRING
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "names"
-    type: DT_STRING
+    name: "n"
+    type: DT_INT32
   }
-  input_arg {
-    name: "sparse_keys"
-    type: DT_STRING
-    number_attr: "Nsparse"
+  output_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  attr {
+    name: "reverse"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "NthElement"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "n"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  attr {
+    name: "reverse"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "OneHot"
+  input_arg {
+    name: "indices"
+    type_attr: "TI"
+  }
+  input_arg {
+    name: "depth"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "on_value"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "off_value"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "axis"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "TI"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "OneShotIterator"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "dataset_factory"
+    type: "func"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OnesLike"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "OnesLike"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT8
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_BOOL
+      }
+    }
+  }
+}
+op {
+  name: "OnesLike"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT8
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_BOOL
+      }
+    }
+  }
+}
+op {
+  name: "OptimizeDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "optimizations"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "OptionalFromValue"
+  input_arg {
+    name: "components"
+    type_list_attr: "Toutput_types"
+  }
+  output_arg {
+    name: "optional"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "Toutput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "OptionalGetValue"
+  input_arg {
+    name: "optional"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "OptionalHasValue"
+  input_arg {
+    name: "optional"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "has_value"
+    type: DT_BOOL
+  }
+}
+op {
+  name: "OptionalNone"
+  output_arg {
+    name: "optional"
+    type: DT_VARIANT
+  }
+}
+op {
+  name: "OrderedMapClear"
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OrderedMapIncompleteSize"
+  output_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OrderedMapPeek"
+  input_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OrderedMapSize"
+  output_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OrderedMapStage"
+  input_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "values"
+    type_list_attr: "fake_dtypes"
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "fake_dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OrderedMapUnstage"
+  input_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OrderedMapUnstageNoKey"
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OutfeedDequeue"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OutfeedDequeueTuple"
+  output_arg {
+    name: "outputs"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OutfeedEnqueue"
+  input_arg {
+    name: "input"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "OutfeedEnqueueTuple"
+  input_arg {
+    name: "inputs"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "Pack"
+  input_arg {
+    name: "values"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "axis"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+}
+op {
+  name: "Pad"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "paddings"
+    type_attr: "Tpaddings"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tpaddings"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "PadV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "paddings"
+    type_attr: "Tpaddings"
+  }
+  input_arg {
+    name: "constant_values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tpaddings"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "PaddedBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "padded_shapes"
+    type: DT_INT64
+    number_attr: "N"
+  }
+  input_arg {
+    name: "padding_values"
+    type_list_attr: "Toutput_types"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "Toutput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "PaddedBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "padded_shapes"
+    type: DT_INT64
+    number_attr: "N"
+  }
+  input_arg {
+    name: "padding_values"
+    type_list_attr: "Toutput_types"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "Toutput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "PaddedBatchDatasetV2"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "padded_shapes"
+    type: DT_INT64
+    number_attr: "N"
+  }
+  input_arg {
+    name: "padding_values"
+    type_list_attr: "Toutput_types"
+  }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "Toutput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "PaddingFIFOQueue"
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "PaddingFIFOQueueV2"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ParallelConcat"
+  input_arg {
+    name: "values"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+}
+op {
+  name: "ParallelDynamicStitch"
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  input_arg {
+    name: "data"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "merged"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "ParallelInterleaveDatasetV2"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "cycle_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "block_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ParallelInterleaveDatasetV2"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "cycle_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "block_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "sloppy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ParallelMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ParallelMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ParallelMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "ParallelMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "sloppy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ParallelMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "sloppy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ParameterizedTruncatedNormal"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "means"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "stdevs"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "minvals"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "maxvals"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ParameterizedTruncatedNormal"
+  input_arg {
+    name: "shape"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "means"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "stdevs"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "minvals"
+    type_attr: "dtype"
+  }
+  input_arg {
+    name: "maxvals"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ParseExample"
+  input_arg {
+    name: "serialized"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "names"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "sparse_keys"
+    type: DT_STRING
+    number_attr: "Nsparse"
+  }
+  input_arg {
+    name: "dense_keys"
+    type: DT_STRING
+    number_attr: "Ndense"
+  }
+  input_arg {
+    name: "dense_defaults"
+    type_list_attr: "Tdense"
+  }
+  output_arg {
+    name: "sparse_indices"
+    type: DT_INT64
+    number_attr: "Nsparse"
+  }
+  output_arg {
+    name: "sparse_values"
+    type_list_attr: "sparse_types"
+  }
+  output_arg {
+    name: "sparse_shapes"
+    type: DT_INT64
+    number_attr: "Nsparse"
+  }
+  output_arg {
+    name: "dense_values"
+    type_list_attr: "Tdense"
+  }
+  attr {
+    name: "Nsparse"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "Ndense"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "sparse_types"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "Tdense"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "dense_shapes"
+    type: "list(shape)"
+    has_minimum: true
+  }
+}
+op {
+  name: "ParseSequenceExample"
+  input_arg {
+    name: "serialized"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "debug_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "context_dense_defaults"
+    type_list_attr: "Tcontext_dense"
+  }
+  output_arg {
+    name: "context_sparse_indices"
+    type: DT_INT64
+    number_attr: "Ncontext_sparse"
+  }
+  output_arg {
+    name: "context_sparse_values"
+    type_list_attr: "context_sparse_types"
+  }
+  output_arg {
+    name: "context_sparse_shapes"
+    type: DT_INT64
+    number_attr: "Ncontext_sparse"
+  }
+  output_arg {
+    name: "context_dense_values"
+    type_list_attr: "Tcontext_dense"
+  }
+  output_arg {
+    name: "feature_list_sparse_indices"
+    type: DT_INT64
+    number_attr: "Nfeature_list_sparse"
+  }
+  output_arg {
+    name: "feature_list_sparse_values"
+    type_list_attr: "feature_list_sparse_types"
+  }
+  output_arg {
+    name: "feature_list_sparse_shapes"
+    type: DT_INT64
+    number_attr: "Nfeature_list_sparse"
+  }
+  output_arg {
+    name: "feature_list_dense_values"
+    type_list_attr: "feature_list_dense_types"
+  }
+  output_arg {
+    name: "feature_list_dense_lengths"
+    type: DT_INT64
+    number_attr: "Nfeature_list_dense"
+  }
+  attr {
+    name: "feature_list_dense_missing_assumed_empty"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "context_sparse_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "context_dense_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "feature_list_sparse_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "feature_list_dense_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "Ncontext_sparse"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "Ncontext_dense"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "Nfeature_list_sparse"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "Nfeature_list_dense"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "context_sparse_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "Tcontext_dense"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "feature_list_dense_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "context_dense_shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "feature_list_sparse_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "feature_list_dense_shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
   }
+}
+op {
+  name: "ParseSingleExample"
   input_arg {
-    name: "dense_keys"
+    name: "serialized"
     type: DT_STRING
-    number_attr: "Ndense"
   }
   input_arg {
     name: "dense_defaults"
@@ -40231,7 +43274,7 @@ op {
   output_arg {
     name: "sparse_indices"
     type: DT_INT64
-    number_attr: "Nsparse"
+    number_attr: "num_sparse"
   }
   output_arg {
     name: "sparse_values"
@@ -40240,20 +43283,25 @@ op {
   output_arg {
     name: "sparse_shapes"
     type: DT_INT64
-    number_attr: "Nsparse"
+    number_attr: "num_sparse"
   }
   output_arg {
     name: "dense_values"
     type_list_attr: "Tdense"
   }
   attr {
-    name: "Nsparse"
+    name: "num_sparse"
     type: "int"
     has_minimum: true
   }
   attr {
-    name: "Ndense"
-    type: "int"
+    name: "sparse_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "dense_keys"
+    type: "list(string)"
     has_minimum: true
   }
   attr {
@@ -40287,19 +43335,43 @@ op {
   }
 }
 op {
-  name: "ParseSequenceExample"
+  name: "ParseSingleSequenceExample"
   input_arg {
     name: "serialized"
     type: DT_STRING
   }
   input_arg {
-    name: "debug_name"
+    name: "feature_list_dense_missing_assumed_empty"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "context_sparse_keys"
     type: DT_STRING
+    number_attr: "Ncontext_sparse"
+  }
+  input_arg {
+    name: "context_dense_keys"
+    type: DT_STRING
+    number_attr: "Ncontext_dense"
+  }
+  input_arg {
+    name: "feature_list_sparse_keys"
+    type: DT_STRING
+    number_attr: "Nfeature_list_sparse"
+  }
+  input_arg {
+    name: "feature_list_dense_keys"
+    type: DT_STRING
+    number_attr: "Nfeature_list_dense"
   }
   input_arg {
     name: "context_dense_defaults"
     type_list_attr: "Tcontext_dense"
   }
+  input_arg {
+    name: "debug_name"
+    type: DT_STRING
+  }
   output_arg {
     name: "context_sparse_indices"
     type: DT_INT64
@@ -40336,675 +43408,912 @@ op {
     name: "feature_list_dense_values"
     type_list_attr: "feature_list_dense_types"
   }
-  output_arg {
-    name: "feature_list_dense_lengths"
-    type: DT_INT64
-    number_attr: "Nfeature_list_dense"
+  attr {
+    name: "Ncontext_sparse"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
   }
   attr {
-    name: "feature_list_dense_missing_assumed_empty"
-    type: "list(string)"
+    name: "Ncontext_dense"
+    type: "int"
+    default_value {
+      i: 0
+    }
     has_minimum: true
   }
   attr {
-    name: "context_sparse_keys"
-    type: "list(string)"
+    name: "Nfeature_list_sparse"
+    type: "int"
+    default_value {
+      i: 0
+    }
     has_minimum: true
   }
   attr {
-    name: "context_dense_keys"
-    type: "list(string)"
+    name: "Nfeature_list_dense"
+    type: "int"
+    default_value {
+      i: 0
+    }
     has_minimum: true
   }
   attr {
-    name: "feature_list_sparse_keys"
-    type: "list(string)"
+    name: "context_sparse_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
     has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
   }
   attr {
-    name: "feature_list_dense_keys"
-    type: "list(string)"
+    name: "Tcontext_dense"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
     has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
   }
   attr {
-    name: "Ncontext_sparse"
-    type: "int"
+    name: "feature_list_dense_types"
+    type: "list(type)"
     default_value {
-      i: 0
+      list {
+      }
     }
     has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
   }
   attr {
-    name: "Ncontext_dense"
-    type: "int"
+    name: "context_dense_shapes"
+    type: "list(shape)"
     default_value {
-      i: 0
+      list {
+      }
     }
     has_minimum: true
   }
   attr {
-    name: "Nfeature_list_sparse"
-    type: "int"
+    name: "feature_list_sparse_types"
+    type: "list(type)"
     default_value {
-      i: 0
+      list {
+      }
     }
     has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
   }
   attr {
-    name: "Nfeature_list_dense"
-    type: "int"
+    name: "feature_list_dense_shapes"
+    type: "list(shape)"
     default_value {
-      i: 0
+      list {
+      }
+    }
+    has_minimum: true
+  }
+}
+op {
+  name: "ParseTensor"
+  input_arg {
+    name: "serialized"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+  }
+}
+op {
+  name: "PartitionedCall"
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+}
+op {
+  name: "PartitionedCall"
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "config"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
+op {
+  name: "PartitionedCall"
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "config"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "executor_type"
+    type: "string"
+    default_value {
+      s: ""
     }
+  }
+}
+op {
+  name: "PartitionedCall"
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
     has_minimum: true
   }
   attr {
-    name: "context_sparse_types"
-    type: "list(type)"
-    default_value {
-      list {
-      }
-    }
-    has_minimum: true
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "config"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "config_proto"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "executor_type"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
+op {
+  name: "Placeholder"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+    default_value {
+      shape {
+      }
+    }
+  }
+}
+op {
+  name: "Placeholder"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+    default_value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+}
+op {
+  name: "PlaceholderV2"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+}
+op {
+  name: "PlaceholderV2"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  deprecation {
+    version: 23
+  }
+}
+op {
+  name: "PlaceholderWithDefault"
+  input_arg {
+    name: "input"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+}
+op {
+  name: "Polygamma"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
     allowed_values {
       list {
         type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
+        type: DT_DOUBLE
       }
     }
   }
+}
+op {
+  name: "PopulationCount"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type: DT_UINT8
+  }
   attr {
-    name: "Tcontext_dense"
-    type: "list(type)"
-    default_value {
-      list {
-      }
-    }
-    has_minimum: true
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
         type: DT_INT64
-        type: DT_STRING
+        type: DT_UINT8
+        type: DT_UINT16
       }
     }
   }
+}
+op {
+  name: "PopulationCount"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type: DT_UINT8
+  }
   attr {
-    name: "feature_list_dense_types"
-    type: "list(type)"
-    default_value {
-      list {
-      }
-    }
-    has_minimum: true
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
         type: DT_INT64
-        type: DT_STRING
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-  attr {
-    name: "context_dense_shapes"
-    type: "list(shape)"
-    default_value {
-      list {
-      }
-    }
-    has_minimum: true
+}
+op {
+  name: "Pow"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
   }
   attr {
-    name: "feature_list_sparse_types"
-    type: "list(type)"
-    default_value {
-      list {
-      }
-    }
-    has_minimum: true
+    name: "T"
+    type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
         type: DT_INT64
-        type: DT_STRING
-      }
-    }
-  }
-  attr {
-    name: "feature_list_dense_shapes"
-    type: "list(shape)"
-    default_value {
-      list {
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
-    has_minimum: true
   }
 }
 op {
-  name: "ParseSingleExample"
+  name: "Pow"
   input_arg {
-    name: "serialized"
-    type: DT_STRING
+    name: "x"
+    type_attr: "T"
   }
   input_arg {
-    name: "dense_defaults"
-    type_list_attr: "Tdense"
-  }
-  output_arg {
-    name: "sparse_indices"
-    type: DT_INT64
-    number_attr: "num_sparse"
-  }
-  output_arg {
-    name: "sparse_values"
-    type_list_attr: "sparse_types"
-  }
-  output_arg {
-    name: "sparse_shapes"
-    type: DT_INT64
-    number_attr: "num_sparse"
+    name: "y"
+    type_attr: "T"
   }
   output_arg {
-    name: "dense_values"
-    type_list_attr: "Tdense"
-  }
-  attr {
-    name: "num_sparse"
-    type: "int"
-    has_minimum: true
-  }
-  attr {
-    name: "sparse_keys"
-    type: "list(string)"
-    has_minimum: true
-  }
-  attr {
-    name: "dense_keys"
-    type: "list(string)"
-    has_minimum: true
+    name: "z"
+    type_attr: "T"
   }
   attr {
-    name: "sparse_types"
-    type: "list(type)"
-    has_minimum: true
+    name: "T"
+    type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
         type: DT_INT64
-        type: DT_STRING
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
+}
+op {
+  name: "Pow"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
   attr {
-    name: "Tdense"
-    type: "list(type)"
-    has_minimum: true
+    name: "T"
+    type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
         type: DT_INT64
-        type: DT_STRING
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
-  attr {
-    name: "dense_shapes"
-    type: "list(shape)"
-    has_minimum: true
-  }
 }
 op {
-  name: "ParseSingleSequenceExample"
-  input_arg {
-    name: "serialized"
-    type: DT_STRING
-  }
+  name: "PrefetchDataset"
   input_arg {
-    name: "feature_list_dense_missing_assumed_empty"
-    type: DT_STRING
+    name: "input_dataset"
+    type: DT_VARIANT
   }
   input_arg {
-    name: "context_sparse_keys"
-    type: DT_STRING
-    number_attr: "Ncontext_sparse"
+    name: "buffer_size"
+    type: DT_INT64
   }
-  input_arg {
-    name: "context_dense_keys"
-    type: DT_STRING
-    number_attr: "Ncontext_dense"
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
   }
-  input_arg {
-    name: "feature_list_sparse_keys"
-    type: DT_STRING
-    number_attr: "Nfeature_list_sparse"
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
-  input_arg {
-    name: "feature_list_dense_keys"
-    type: DT_STRING
-    number_attr: "Nfeature_list_dense"
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
+  is_stateful: true
+}
+op {
+  name: "PrefetchDataset"
   input_arg {
-    name: "context_dense_defaults"
-    type_list_attr: "Tcontext_dense"
+    name: "input_dataset"
+    type: DT_VARIANT
   }
   input_arg {
-    name: "debug_name"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "context_sparse_indices"
-    type: DT_INT64
-    number_attr: "Ncontext_sparse"
-  }
-  output_arg {
-    name: "context_sparse_values"
-    type_list_attr: "context_sparse_types"
-  }
-  output_arg {
-    name: "context_sparse_shapes"
+    name: "buffer_size"
     type: DT_INT64
-    number_attr: "Ncontext_sparse"
   }
   output_arg {
-    name: "context_dense_values"
-    type_list_attr: "Tcontext_dense"
+    name: "handle"
+    type: DT_VARIANT
   }
-  output_arg {
-    name: "feature_list_sparse_indices"
-    type: DT_INT64
-    number_attr: "Nfeature_list_sparse"
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
   }
-  output_arg {
-    name: "feature_list_sparse_values"
-    type_list_attr: "feature_list_sparse_types"
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
-  output_arg {
-    name: "feature_list_sparse_shapes"
-    type: DT_INT64
-    number_attr: "Nfeature_list_sparse"
+}
+op {
+  name: "PreventGradient"
+  input_arg {
+    name: "input"
+    type_attr: "T"
   }
   output_arg {
-    name: "feature_list_dense_values"
-    type_list_attr: "feature_list_dense_types"
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "Ncontext_sparse"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+    name: "T"
+    type: "type"
   }
   attr {
-    name: "Ncontext_dense"
-    type: "int"
+    name: "message"
+    type: "string"
     default_value {
-      i: 0
+      s: ""
     }
-    has_minimum: true
   }
-  attr {
-    name: "Nfeature_list_sparse"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+}
+op {
+  name: "Print"
+  input_arg {
+    name: "input"
+    type_attr: "T"
   }
-  attr {
-    name: "Nfeature_list_dense"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+  input_arg {
+    name: "data"
+    type_list_attr: "U"
   }
-  attr {
-    name: "context_sparse_types"
-    type: "list(type)"
-    default_value {
-      list {
-      }
-    }
-    has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
-    }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "Tcontext_dense"
-    type: "list(type)"
-    default_value {
-      list {
-      }
-    }
-    has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
-    }
+    name: "T"
+    type: "type"
   }
   attr {
-    name: "feature_list_dense_types"
+    name: "U"
     type: "list(type)"
-    default_value {
-      list {
-      }
-    }
     has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
-    }
+    minimum: 1
   }
   attr {
-    name: "context_dense_shapes"
-    type: "list(shape)"
+    name: "message"
+    type: "string"
     default_value {
-      list {
-      }
+      s: ""
     }
-    has_minimum: true
   }
   attr {
-    name: "feature_list_sparse_types"
-    type: "list(type)"
+    name: "first_n"
+    type: "int"
     default_value {
-      list {
-      }
-    }
-    has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
+      i: -1
     }
   }
   attr {
-    name: "feature_list_dense_shapes"
-    type: "list(shape)"
+    name: "summarize"
+    type: "int"
     default_value {
-      list {
-      }
+      i: 3
     }
-    has_minimum: true
   }
+  is_stateful: true
 }
 op {
-  name: "ParseTensor"
+  name: "Print"
   input_arg {
-    name: "serialized"
-    type: DT_STRING
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "data"
+    type_list_attr: "U"
   }
   output_arg {
     name: "output"
-    type_attr: "out_type"
+    type_attr: "T"
   }
   attr {
-    name: "out_type"
+    name: "T"
     type: "type"
   }
-}
-op {
-  name: "PartitionedCall"
-  input_arg {
-    name: "args"
-    type_list_attr: "Tin"
-  }
-  output_arg {
-    name: "output"
-    type_list_attr: "Tout"
-  }
   attr {
-    name: "Tin"
+    name: "U"
     type: "list(type)"
     has_minimum: true
   }
   attr {
-    name: "Tout"
-    type: "list(type)"
-    has_minimum: true
+    name: "message"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "f"
-    type: "func"
+    name: "first_n"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "summarize"
+    type: "int"
+    default_value {
+      i: 3
+    }
   }
+  is_stateful: true
 }
 op {
-  name: "PartitionedCall"
+  name: "PrintV2"
   input_arg {
-    name: "args"
-    type_list_attr: "Tin"
-  }
-  output_arg {
-    name: "output"
-    type_list_attr: "Tout"
-  }
-  attr {
-    name: "Tin"
-    type: "list(type)"
-    has_minimum: true
+    name: "input"
+    type: DT_STRING
   }
   attr {
-    name: "Tout"
-    type: "list(type)"
-    has_minimum: true
+    name: "output_stream"
+    type: "string"
+    default_value {
+      s: "stderr"
+    }
+    allowed_values {
+      list {
+        s: "stdout"
+        s: "stderr"
+        s: "log(info)"
+        s: "log(warning)"
+        s: "log(error)"
+      }
+    }
   }
-  attr {
-    name: "f"
-    type: "func"
+  is_stateful: true
+}
+op {
+  name: "PrintV2"
+  input_arg {
+    name: "input"
+    type: DT_STRING
   }
   attr {
-    name: "config"
+    name: "output_stream"
     type: "string"
     default_value {
-      s: ""
+      s: "stderr"
     }
   }
+  is_stateful: true
 }
 op {
-  name: "PartitionedCall"
-  input_arg {
-    name: "args"
-    type_list_attr: "Tin"
-  }
+  name: "PriorityQueue"
   output_arg {
-    name: "output"
-    type_list_attr: "Tout"
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
   }
   attr {
-    name: "Tin"
+    name: "component_types"
     type: "list(type)"
+    default_value {
+      list {
+      }
+    }
     has_minimum: true
   }
   attr {
-    name: "Tout"
-    type: "list(type)"
+    name: "shapes"
+    type: "list(shape)"
     has_minimum: true
   }
   attr {
-    name: "f"
-    type: "func"
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: -1
+    }
   }
   attr {
-    name: "config"
+    name: "container"
     type: "string"
     default_value {
       s: ""
     }
   }
   attr {
-    name: "executor_type"
+    name: "shared_name"
     type: "string"
     default_value {
       s: ""
     }
   }
+  is_stateful: true
 }
 op {
-  name: "PartitionedCall"
-  input_arg {
-    name: "args"
-    type_list_attr: "Tin"
-  }
+  name: "PriorityQueueV2"
   output_arg {
-    name: "output"
-    type_list_attr: "Tout"
+    name: "handle"
+    type: DT_RESOURCE
   }
   attr {
-    name: "Tin"
+    name: "component_types"
     type: "list(type)"
+    default_value {
+      list {
+      }
+    }
     has_minimum: true
   }
   attr {
-    name: "Tout"
-    type: "list(type)"
+    name: "shapes"
+    type: "list(shape)"
     has_minimum: true
   }
   attr {
-    name: "f"
-    type: "func"
-  }
-  attr {
-    name: "config"
-    type: "string"
+    name: "capacity"
+    type: "int"
     default_value {
-      s: ""
+      i: -1
     }
   }
   attr {
-    name: "config_proto"
+    name: "container"
     type: "string"
     default_value {
       s: ""
     }
   }
   attr {
-    name: "executor_type"
+    name: "shared_name"
     type: "string"
     default_value {
       s: ""
     }
   }
+  is_stateful: true
 }
 op {
-  name: "Placeholder"
-  output_arg {
-    name: "output"
-    type_attr: "dtype"
-  }
-  attr {
-    name: "dtype"
-    type: "type"
+  name: "Prod"
+  input_arg {
+    name: "input"
+    type_attr: "T"
   }
-  attr {
-    name: "shape"
-    type: "shape"
-    default_value {
-      shape {
-      }
-    }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
   }
-}
-op {
-  name: "Placeholder"
   output_arg {
     name: "output"
-    type_attr: "dtype"
-  }
-  attr {
-    name: "dtype"
-    type: "type"
+    type_attr: "T"
   }
   attr {
-    name: "shape"
-    type: "shape"
+    name: "keep_dims"
+    type: "bool"
     default_value {
-      shape {
-        unknown_rank: true
-      }
+      b: false
     }
   }
-}
-op {
-  name: "PlaceholderV2"
-  output_arg {
-    name: "output"
-    type_attr: "dtype"
-  }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
   }
   attr {
-    name: "shape"
-    type: "shape"
-  }
-}
-op {
-  name: "PlaceholderV2"
-  output_arg {
-    name: "output"
-    type_attr: "dtype"
-  }
-  attr {
-    name: "dtype"
+    name: "Tidx"
     type: "type"
-  }
-  attr {
-    name: "shape"
-    type: "shape"
-  }
-  deprecation {
-    version: 23
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
 }
 op {
-  name: "PlaceholderWithDefault"
+  name: "Prod"
   input_arg {
     name: "input"
-    type_attr: "dtype"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "dtype"
-  }
-  attr {
-    name: "dtype"
-    type: "type"
-  }
-  attr {
-    name: "shape"
-    type: "shape"
-  }
-}
-op {
-  name: "Polygamma"
-  input_arg {
-    name: "a"
     type_attr: "T"
   }
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "reduction_indices"
+    type_attr: "Tidx"
   }
   output_arg {
-    name: "z"
+    name: "output"
     type_attr: "T"
   }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   attr {
     name: "T"
     type: "type"
@@ -41012,117 +44321,236 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
-}
-op {
-  name: "PopulationCount"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "y"
-    type: DT_UINT8
-  }
   attr {
-    name: "T"
+    name: "Tidx"
     type: "type"
+    default_value {
+      type: DT_INT32
+    }
     allowed_values {
       list {
-        type: DT_INT8
-        type: DT_INT16
         type: DT_INT32
         type: DT_INT64
-        type: DT_UINT8
-        type: DT_UINT16
       }
     }
   }
 }
 op {
-  name: "PopulationCount"
+  name: "Prod"
   input_arg {
-    name: "x"
+    name: "input"
     type_attr: "T"
   }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
   output_arg {
-    name: "y"
-    type: DT_UINT8
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
         type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "Pow"
+  name: "Prod"
   input_arg {
-    name: "x"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "y"
-    type_attr: "T"
+    name: "reduction_indices"
+    type_attr: "Tidx"
   }
   output_arg {
-    name: "z"
+    name: "output"
     type_attr: "T"
   }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
         type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
         type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
       }
     }
   }
 }
 op {
-  name: "Pow"
+  name: "PyFunc"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "token"
+    type: "string"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
   }
+  is_stateful: true
+}
+op {
+  name: "PyFuncStateless"
   input_arg {
-    name: "y"
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "token"
+    type: "string"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+}
+op {
+  name: "Qr"
+  input_arg {
+    name: "input"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
+    name: "q"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "r"
     type_attr: "T"
   }
+  attr {
+    name: "full_matrices"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_FLOAT
         type: DT_COMPLEX64
         type: DT_COMPLEX128
       }
@@ -41130,30 +44558,34 @@ op {
   }
 }
 op {
-  name: "Pow"
+  name: "Qr"
   input_arg {
-    name: "x"
+    name: "input"
     type_attr: "T"
   }
-  input_arg {
-    name: "y"
+  output_arg {
+    name: "q"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
+    name: "r"
     type_attr: "T"
   }
+  attr {
+    name: "full_matrices"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_BFLOAT16
+        type: DT_DOUBLE
         type: DT_FLOAT
         type: DT_HALF
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
         type: DT_COMPLEX64
         type: DT_COMPLEX128
       }
@@ -41161,320 +44593,282 @@ op {
   }
 }
 op {
-  name: "PrefetchDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
+  name: "QuantizeAndDequantize"
   input_arg {
-    name: "buffer_size"
-    type: DT_INT64
+    name: "input"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
-op {
-  name: "PrefetchDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "buffer_size"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
-op {
-  name: "PreventGradient"
-  input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "input_min"
+    type: "float"
+    default_value {
+      f: 0
+    }
   }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+  attr {
+    name: "input_max"
+    type: "float"
+    default_value {
+      f: 0
+    }
   }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "message"
-    type: "string"
-    default_value {
-      s: ""
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
     }
   }
 }
 op {
-  name: "Print"
+  name: "QuantizeAndDequantize"
   input_arg {
     name: "input"
     type_attr: "T"
   }
-  input_arg {
-    name: "data"
-    type_list_attr: "U"
-  }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
   }
   attr {
-    name: "U"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
   }
   attr {
-    name: "message"
-    type: "string"
+    name: "range_given"
+    type: "bool"
     default_value {
-      s: ""
+      b: false
     }
   }
   attr {
-    name: "first_n"
-    type: "int"
+    name: "input_min"
+    type: "float"
     default_value {
-      i: -1
+      f: 0
     }
   }
   attr {
-    name: "summarize"
-    type: "int"
+    name: "input_max"
+    type: "float"
     default_value {
-      i: 3
+      f: 0
     }
   }
-  is_stateful: true
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  deprecation {
+    version: 21
+  }
 }
 op {
-  name: "Print"
+  name: "QuantizeAndDequantize"
   input_arg {
     name: "input"
     type_attr: "T"
   }
-  input_arg {
-    name: "data"
-    type_list_attr: "U"
-  }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "T"
-    type: "type"
-  }
-  attr {
-    name: "U"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "message"
-    type: "string"
+    name: "signed_input"
+    type: "bool"
     default_value {
-      s: ""
+      b: true
     }
   }
   attr {
-    name: "first_n"
+    name: "num_bits"
     type: "int"
     default_value {
-      i: -1
+      i: 8
     }
   }
   attr {
-    name: "summarize"
-    type: "int"
+    name: "range_given"
+    type: "bool"
     default_value {
-      i: 3
+      b: false
     }
   }
-  is_stateful: true
-}
-op {
-  name: "PrintV2"
-  input_arg {
-    name: "input"
-    type: DT_STRING
+  attr {
+    name: "input_min"
+    type: "float"
+    default_value {
+      f: 0
+    }
   }
   attr {
-    name: "output_stream"
-    type: "string"
+    name: "input_max"
+    type: "float"
     default_value {
-      s: "stderr"
+      f: 0
     }
+  }
+  attr {
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        s: "stdout"
-        s: "stderr"
-        s: "log(info)"
-        s: "log(warning)"
-        s: "log(error)"
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
-  is_stateful: true
+  deprecation {
+    version: 22
+  }
 }
 op {
-  name: "PrintV2"
+  name: "QuantizeAndDequantize"
   input_arg {
     name: "input"
-    type: DT_STRING
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "output_stream"
-    type: "string"
+    name: "signed_input"
+    type: "bool"
     default_value {
-      s: "stderr"
+      b: true
     }
   }
-  is_stateful: true
-}
-op {
-  name: "PriorityQueue"
-  output_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
-  }
   attr {
-    name: "component_types"
-    type: "list(type)"
+    name: "num_bits"
+    type: "int"
     default_value {
-      list {
-      }
+      i: 8
     }
-    has_minimum: true
   }
   attr {
-    name: "shapes"
-    type: "list(shape)"
-    has_minimum: true
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
   }
   attr {
-    name: "capacity"
-    type: "int"
+    name: "input_min"
+    type: "float"
     default_value {
-      i: -1
+      f: 0
     }
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "input_max"
+    type: "float"
     default_value {
-      s: ""
+      f: 0
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
     }
   }
-  is_stateful: true
+  deprecation {
+    version: 22
+  }
 }
 op {
-  name: "PriorityQueueV2"
+  name: "QuantizeAndDequantize"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
   output_arg {
-    name: "handle"
-    type: DT_RESOURCE
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "component_types"
-    type: "list(type)"
+    name: "signed_input"
+    type: "bool"
     default_value {
-      list {
-      }
+      b: true
     }
-    has_minimum: true
-  }
-  attr {
-    name: "shapes"
-    type: "list(shape)"
-    has_minimum: true
   }
   attr {
-    name: "capacity"
+    name: "num_bits"
     type: "int"
     default_value {
-      i: -1
+      i: 8
     }
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "range_given"
+    type: "bool"
     default_value {
-      s: ""
+      b: false
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "input_min"
+    type: "float"
     default_value {
-      s: ""
+      f: 0
     }
   }
-  is_stateful: true
-}
-op {
-  name: "Prod"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
   attr {
-    name: "keep_dims"
-    type: "bool"
+    name: "input_max"
+    type: "float"
     default_value {
-      b: false
+      f: 0
     }
   }
   attr {
@@ -41482,283 +44876,204 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
-  attr {
-    name: "Tidx"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+  deprecation {
+    version: 22
   }
 }
 op {
-  name: "Prod"
+  name: "QuantizeAndDequantizeV2"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "input_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_max"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "keep_dims"
+    name: "signed_input"
     type: "bool"
     default_value {
-      b: false
+      b: true
     }
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
     }
   }
   attr {
-    name: "Tidx"
-    type: "type"
+    name: "range_given"
+    type: "bool"
     default_value {
-      type: DT_INT32
+      b: false
     }
+  }
+  attr {
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
 }
 op {
-  name: "Prod"
+  name: "QuantizeAndDequantizeV2"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "input_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_max"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "keep_dims"
+    name: "signed_input"
     type: "bool"
     default_value {
-      b: false
+      b: true
     }
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-        type: DT_BFLOAT16
-      }
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
     }
   }
   attr {
-    name: "Tidx"
-    type: "type"
+    name: "range_given"
+    type: "bool"
     default_value {
-      type: DT_INT32
+      b: false
     }
+  }
+  attr {
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
 }
 op {
-  name: "Prod"
+  name: "QuantizeAndDequantizeV2"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "input_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_max"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "keep_dims"
+    name: "signed_input"
     type: "bool"
     default_value {
-      b: false
+      b: true
     }
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_INT64
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
     }
   }
   attr {
-    name: "Tidx"
-    type: "type"
+    name: "range_given"
+    type: "bool"
     default_value {
-      type: DT_INT32
+      b: false
     }
+  }
+  attr {
+    name: "T"
+    type: "type"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
 }
 op {
-  name: "PyFunc"
+  name: "QuantizeAndDequantizeV2"
   input_arg {
     name: "input"
-    type_list_attr: "Tin"
-  }
-  output_arg {
-    name: "output"
-    type_list_attr: "Tout"
-  }
-  attr {
-    name: "token"
-    type: "string"
-  }
-  attr {
-    name: "Tin"
-    type: "list(type)"
-    has_minimum: true
+    type_attr: "T"
   }
-  attr {
-    name: "Tout"
-    type: "list(type)"
-    has_minimum: true
+  input_arg {
+    name: "input_min"
+    type_attr: "T"
   }
-  is_stateful: true
-}
-op {
-  name: "PyFuncStateless"
   input_arg {
-    name: "input"
-    type_list_attr: "Tin"
+    name: "input_max"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
-    type_list_attr: "Tout"
-  }
-  attr {
-    name: "token"
-    type: "string"
+    type_attr: "T"
   }
   attr {
-    name: "Tin"
-    type: "list(type)"
-    has_minimum: true
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
   }
   attr {
-    name: "Tout"
-    type: "list(type)"
-    has_minimum: true
-  }
-}
-op {
-  name: "Qr"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "q"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "r"
-    type_attr: "T"
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
   }
   attr {
-    name: "full_matrices"
+    name: "range_given"
     type: "bool"
     default_value {
       b: false
@@ -41769,20 +45084,45 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_DOUBLE
+        type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "round_mode"
+    type: "string"
+    default_value {
+      s: "HALF_TO_EVEN"
+    }
+    allowed_values {
+      list {
+        s: "HALF_TO_EVEN"
+        s: "HALF_UP"
       }
     }
   }
 }
 op {
-  name: "QuantizeAndDequantize"
+  name: "QuantizeAndDequantizeV3"
   input_arg {
     name: "input"
     type_attr: "T"
   }
+  input_arg {
+    name: "input_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_max"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_bits"
+    type: DT_INT32
+  }
   output_arg {
     name: "output"
     type_attr: "T"
@@ -41794,32 +45134,11 @@ op {
       b: true
     }
   }
-  attr {
-    name: "num_bits"
-    type: "int"
-    default_value {
-      i: 8
-    }
-  }
   attr {
     name: "range_given"
     type: "bool"
     default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "input_min"
-    type: "float"
-    default_value {
-      f: 0
-    }
-  }
-  attr {
-    name: "input_max"
-    type: "float"
-    default_value {
-      f: 0
+      b: true
     }
   }
   attr {
@@ -41834,11 +45153,23 @@ op {
   }
 }
 op {
-  name: "QuantizeAndDequantize"
+  name: "QuantizeAndDequantizeV3"
   input_arg {
     name: "input"
     type_attr: "T"
   }
+  input_arg {
+    name: "input_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_max"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_bits"
+    type: DT_INT32
+  }
   output_arg {
     name: "output"
     type_attr: "T"
@@ -41850,32 +45181,11 @@ op {
       b: true
     }
   }
-  attr {
-    name: "num_bits"
-    type: "int"
-    default_value {
-      i: 8
-    }
-  }
   attr {
     name: "range_given"
     type: "bool"
     default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "input_min"
-    type: "float"
-    default_value {
-      f: 0
-    }
-  }
-  attr {
-    name: "input_max"
-    type: "float"
-    default_value {
-      f: 0
+      b: true
     }
   }
   attr {
@@ -41883,21 +45193,31 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-  deprecation {
-    version: 21
-  }
 }
 op {
-  name: "QuantizeAndDequantize"
+  name: "QuantizeAndDequantizeV3"
   input_arg {
     name: "input"
     type_attr: "T"
   }
+  input_arg {
+    name: "input_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_max"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_bits"
+    type: DT_INT32
+  }
   output_arg {
     name: "output"
     type_attr: "T"
@@ -41909,543 +45229,895 @@ op {
       b: true
     }
   }
-  attr {
-    name: "num_bits"
-    type: "int"
-    default_value {
-      i: 8
-    }
-  }
   attr {
     name: "range_given"
     type: "bool"
     default_value {
-      b: false
+      b: true
     }
   }
   attr {
-    name: "input_min"
-    type: "float"
-    default_value {
-      f: 0
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
     }
   }
-  attr {
+}
+op {
+  name: "QuantizeDownAndShrinkRange"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "input_min"
+    type: DT_FLOAT
+  }
+  input_arg {
     name: "input_max"
-    type: "float"
-    default_value {
-      f: 0
-    }
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
+    name: "Tinput"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
       }
     }
   }
-  deprecation {
-    version: 22
+  attr {
+    name: "out_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
   }
 }
 op {
-  name: "QuantizeAndDequantize"
+  name: "QuantizeDownAndShrinkRange"
   input_arg {
     name: "input"
-    type_attr: "T"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "input_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "input_max"
+    type: DT_FLOAT
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "out_type"
   }
-  attr {
-    name: "signed_input"
-    type: "bool"
-    default_value {
-      b: true
-    }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
   }
   attr {
-    name: "num_bits"
-    type: "int"
-    default_value {
-      i: 8
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "range_given"
-    type: "bool"
-    default_value {
-      b: false
+    name: "out_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
+}
+op {
+  name: "QuantizeV2"
+  input_arg {
+    name: "input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_range"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_range"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
   attr {
-    name: "input_min"
-    type: "float"
-    default_value {
-      f: 0
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
     }
   }
   attr {
-    name: "input_max"
-    type: "float"
+    name: "mode"
+    type: "string"
     default_value {
-      f: 0
+      s: "MIN_COMBINED"
+    }
+    allowed_values {
+      list {
+        s: "MIN_COMBINED"
+        s: "MIN_FIRST"
+      }
     }
   }
+}
+op {
+  name: "QuantizeV2"
+  input_arg {
+    name: "input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_range"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_range"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
       }
     }
   }
-  deprecation {
-    version: 22
+  attr {
+    name: "mode"
+    type: "string"
+    default_value {
+      s: "MIN_COMBINED"
+    }
+    allowed_values {
+      list {
+        s: "MIN_COMBINED"
+        s: "MIN_FIRST"
+        s: "SCALED"
+      }
+    }
   }
 }
 op {
-  name: "QuantizeAndDequantize"
+  name: "QuantizeV2"
   input_arg {
     name: "input"
-    type_attr: "T"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_range"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_range"
+    type: DT_FLOAT
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
   attr {
-    name: "signed_input"
-    type: "bool"
-    default_value {
-      b: true
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
     }
   }
   attr {
-    name: "num_bits"
-    type: "int"
+    name: "mode"
+    type: "string"
     default_value {
-      i: 8
+      s: "MIN_COMBINED"
+    }
+    allowed_values {
+      list {
+        s: "MIN_COMBINED"
+        s: "MIN_FIRST"
+        s: "SCALED"
+      }
     }
   }
   attr {
-    name: "range_given"
-    type: "bool"
+    name: "round_mode"
+    type: "string"
     default_value {
-      b: false
+      s: "HALF_AWAY_FROM_ZERO"
     }
+    allowed_values {
+      list {
+        s: "HALF_AWAY_FROM_ZERO"
+        s: "HALF_TO_EVEN"
+      }
+    }
+  }
+}
+op {
+  name: "QuantizeV2"
+  input_arg {
+    name: "input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_range"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_range"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
   }
   attr {
-    name: "input_min"
-    type: "float"
-    default_value {
-      f: 0
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "input_max"
-    type: "float"
+    name: "mode"
+    type: "string"
     default_value {
-      f: 0
+      s: "MIN_COMBINED"
+    }
+    allowed_values {
+      list {
+        s: "MIN_COMBINED"
+        s: "MIN_FIRST"
+        s: "SCALED"
+      }
     }
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "round_mode"
+    type: "string"
+    default_value {
+      s: "HALF_AWAY_FROM_ZERO"
+    }
     allowed_values {
       list {
-        type: DT_BFLOAT16
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        s: "HALF_AWAY_FROM_ZERO"
+        s: "HALF_TO_EVEN"
       }
     }
   }
-  deprecation {
-    version: 22
-  }
 }
 op {
-  name: "QuantizeAndDequantizeV2"
+  name: "QuantizedAdd"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "x"
+    type_attr: "T1"
   }
   input_arg {
-    name: "input_min"
-    type_attr: "T"
+    name: "y"
+    type_attr: "T2"
   }
   input_arg {
-    name: "input_max"
-    type_attr: "T"
+    name: "min_x"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_x"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_y"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_y"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "z"
+    type_attr: "Toutput"
+  }
+  output_arg {
+    name: "min_z"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_z"
+    type: DT_FLOAT
   }
   attr {
-    name: "signed_input"
-    type: "bool"
-    default_value {
-      b: true
+    name: "T1"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
     }
   }
   attr {
-    name: "num_bits"
-    type: "int"
-    default_value {
-      i: 8
+    name: "T2"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
     }
   }
   attr {
-    name: "range_given"
-    type: "bool"
+    name: "Toutput"
+    type: "type"
     default_value {
-      b: false
+      type: DT_QINT32
     }
-  }
-  attr {
-    name: "T"
-    type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
       }
     }
   }
+  is_commutative: true
 }
 op {
-  name: "QuantizeAndDequantizeV2"
+  name: "QuantizedAdd"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "x"
+    type_attr: "T1"
   }
   input_arg {
-    name: "input_min"
-    type_attr: "T"
+    name: "y"
+    type_attr: "T2"
   }
   input_arg {
-    name: "input_max"
-    type_attr: "T"
+    name: "min_x"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_x"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_y"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_y"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "z"
+    type_attr: "Toutput"
   }
-  attr {
-    name: "signed_input"
-    type: "bool"
-    default_value {
-      b: true
-    }
+  output_arg {
+    name: "min_z"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_z"
+    type: DT_FLOAT
   }
   attr {
-    name: "num_bits"
-    type: "int"
-    default_value {
-      i: 8
+    name: "T1"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "range_given"
-    type: "bool"
-    default_value {
-      b: false
+    name: "T2"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "T"
+    name: "Toutput"
     type: "type"
+    default_value {
+      type: DT_QINT32
+    }
     allowed_values {
       list {
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
+  is_commutative: true
 }
 op {
-  name: "QuantizeAndDequantizeV2"
+  name: "QuantizedAvgPool"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "input_min"
-    type_attr: "T"
+    name: "min_input"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "input_max"
-    type_attr: "T"
+    name: "max_input"
+    type: DT_FLOAT
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
   attr {
-    name: "signed_input"
-    type: "bool"
-    default_value {
-      b: true
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
     }
   }
   attr {
-    name: "num_bits"
-    type: "int"
-    default_value {
-      i: 8
-    }
+    name: "ksize"
+    type: "list(int)"
   }
   attr {
-    name: "range_given"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "strides"
+    type: "list(int)"
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_BFLOAT16
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
 }
 op {
-  name: "QuantizeAndDequantizeV2"
+  name: "QuantizedAvgPool"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "input_min"
-    type_attr: "T"
+    name: "min_input"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "input_max"
-    type_attr: "T"
+    name: "max_input"
+    type: DT_FLOAT
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
-  attr {
-    name: "signed_input"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
-  attr {
-    name: "num_bits"
-    type: "int"
-    default_value {
-      i: 8
-    }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
   }
-  attr {
-    name: "range_given"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_BFLOAT16
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
   attr {
-    name: "round_mode"
+    name: "ksize"
+    type: "list(int)"
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
     type: "string"
-    default_value {
-      s: "HALF_TO_EVEN"
-    }
     allowed_values {
       list {
-        s: "HALF_TO_EVEN"
-        s: "HALF_UP"
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
 }
 op {
-  name: "QuantizeAndDequantizeV3"
+  name: "QuantizedBatchNormWithGlobalNormalization"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "t"
+    type_attr: "Tinput"
   }
   input_arg {
-    name: "input_min"
-    type_attr: "T"
+    name: "t_min"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "input_max"
-    type_attr: "T"
+    name: "t_max"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "num_bits"
-    type: DT_INT32
+    name: "m"
+    type_attr: "Tinput"
   }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+  input_arg {
+    name: "m_min"
+    type: DT_FLOAT
   }
-  attr {
-    name: "signed_input"
-    type: "bool"
-    default_value {
-      b: true
-    }
+  input_arg {
+    name: "m_max"
+    type: DT_FLOAT
   }
-  attr {
-    name: "range_given"
-    type: "bool"
-    default_value {
-      b: true
-    }
+  input_arg {
+    name: "v"
+    type_attr: "Tinput"
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
+  input_arg {
+    name: "v_min"
+    type: DT_FLOAT
   }
-}
-op {
-  name: "QuantizeAndDequantizeV3"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "v_max"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "input_min"
-    type_attr: "T"
+    name: "beta"
+    type_attr: "Tinput"
   }
   input_arg {
-    name: "input_max"
-    type_attr: "T"
+    name: "beta_min"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "num_bits"
-    type: DT_INT32
+    name: "beta_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "gamma"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "gamma_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "gamma_max"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "result"
+    type_attr: "out_type"
   }
-  attr {
-    name: "signed_input"
-    type: "bool"
-    default_value {
-      b: true
-    }
+  output_arg {
+    name: "result_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "result_max"
+    type: DT_FLOAT
   }
   attr {
-    name: "range_given"
-    type: "bool"
-    default_value {
-      b: true
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
     }
   }
   attr {
-    name: "T"
+    name: "out_type"
     type: "type"
     allowed_values {
       list {
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
       }
     }
   }
+  attr {
+    name: "variance_epsilon"
+    type: "float"
+  }
+  attr {
+    name: "scale_after_normalization"
+    type: "bool"
+  }
 }
 op {
-  name: "QuantizeAndDequantizeV3"
+  name: "QuantizedBatchNormWithGlobalNormalization"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "t"
+    type_attr: "Tinput"
   }
   input_arg {
-    name: "input_min"
-    type_attr: "T"
+    name: "t_min"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "input_max"
-    type_attr: "T"
+    name: "t_max"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "num_bits"
-    type: DT_INT32
+    name: "m"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "m_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "m_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "v"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "v_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "v_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "beta"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "beta_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "beta_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "gamma"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "gamma_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "gamma_max"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "result"
+    type_attr: "out_type"
   }
-  attr {
-    name: "signed_input"
-    type: "bool"
-    default_value {
-      b: true
-    }
+  output_arg {
+    name: "result_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "result_max"
+    type: DT_FLOAT
   }
   attr {
-    name: "range_given"
-    type: "bool"
-    default_value {
-      b: true
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "T"
+    name: "out_type"
     type: "type"
     allowed_values {
       list {
-        type: DT_BFLOAT16
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
+  attr {
+    name: "variance_epsilon"
+    type: "float"
+  }
+  attr {
+    name: "scale_after_normalization"
+    type: "bool"
+  }
 }
 op {
-  name: "QuantizeDownAndShrinkRange"
+  name: "QuantizedBiasAdd"
   input_arg {
     name: "input"
-    type_attr: "Tinput"
+    type_attr: "T1"
+  }
+  input_arg {
+    name: "bias"
+    type_attr: "T2"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "input_min"
+    name: "min_bias"
     type: DT_FLOAT
   }
   input_arg {
-    name: "input_max"
+    name: "max_bias"
     type: DT_FLOAT
   }
   output_arg {
@@ -42453,15 +46125,28 @@ op {
     type_attr: "out_type"
   }
   output_arg {
-    name: "output_min"
+    name: "min_out"
     type: DT_FLOAT
   }
   output_arg {
-    name: "output_max"
+    name: "max_out"
     type: DT_FLOAT
   }
   attr {
-    name: "Tinput"
+    name: "T1"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "T2"
     type: "type"
     allowed_values {
       list {
@@ -42488,17 +46173,29 @@ op {
   }
 }
 op {
-  name: "QuantizeDownAndShrinkRange"
+  name: "QuantizedBiasAdd"
   input_arg {
     name: "input"
-    type_attr: "Tinput"
+    type_attr: "T1"
   }
   input_arg {
-    name: "input_min"
+    name: "bias"
+    type_attr: "T2"
+  }
+  input_arg {
+    name: "min_input"
     type: DT_FLOAT
   }
   input_arg {
-    name: "input_max"
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_bias"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_bias"
     type: DT_FLOAT
   }
   output_arg {
@@ -42506,15 +46203,28 @@ op {
     type_attr: "out_type"
   }
   output_arg {
-    name: "output_min"
+    name: "min_out"
     type: DT_FLOAT
   }
   output_arg {
-    name: "output_max"
+    name: "max_out"
     type: DT_FLOAT
   }
   attr {
-    name: "Tinput"
+    name: "T1"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "T2"
     type: "type"
     allowed_values {
       list {
@@ -42541,18 +46251,25 @@ op {
   }
 }
 op {
-  name: "QuantizeV2"
+  name: "QuantizedConcat"
   input_arg {
-    name: "input"
-    type: DT_FLOAT
+    name: "concat_dim"
+    type: DT_INT32
   }
   input_arg {
-    name: "min_range"
+    name: "values"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  input_arg {
+    name: "input_mins"
     type: DT_FLOAT
+    number_attr: "N"
   }
   input_arg {
-    name: "max_range"
+    name: "input_maxes"
     type: DT_FLOAT
+    number_attr: "N"
   }
   output_arg {
     name: "output"
@@ -42567,60 +46284,56 @@ op {
     type: DT_FLOAT
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
-        type: DT_QINT32
-      }
-    }
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 2
   }
   attr {
-    name: "mode"
-    type: "string"
-    default_value {
-      s: "MIN_COMBINED"
-    }
-    allowed_values {
-      list {
-        s: "MIN_COMBINED"
-        s: "MIN_FIRST"
-      }
-    }
+    name: "T"
+    type: "type"
   }
 }
 op {
-  name: "QuantizeV2"
+  name: "QuantizedConv2D"
   input_arg {
     name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "min_input"
     type: DT_FLOAT
   }
   input_arg {
-    name: "min_range"
+    name: "max_input"
     type: DT_FLOAT
   }
   input_arg {
-    name: "max_range"
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
     type: DT_FLOAT
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "out_type"
   }
   output_arg {
-    name: "output_min"
+    name: "min_output"
     type: DT_FLOAT
   }
   output_arg {
-    name: "output_max"
+    name: "max_output"
     type: DT_FLOAT
   }
   attr {
-    name: "T"
+    name: "Tinput"
     type: "type"
     allowed_values {
       list {
@@ -42633,48 +46346,89 @@ op {
     }
   }
   attr {
-    name: "mode"
-    type: "string"
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
     default_value {
-      s: "MIN_COMBINED"
+      type: DT_QINT32
     }
     allowed_values {
       list {
-        s: "MIN_COMBINED"
-        s: "MIN_FIRST"
-        s: "SCALED"
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
 }
 op {
-  name: "QuantizeV2"
+  name: "QuantizedConv2D"
   input_arg {
     name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "min_input"
     type: DT_FLOAT
   }
   input_arg {
-    name: "min_range"
+    name: "max_input"
     type: DT_FLOAT
   }
   input_arg {
-    name: "max_range"
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
     type: DT_FLOAT
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "out_type"
   }
   output_arg {
-    name: "output_min"
+    name: "min_output"
     type: DT_FLOAT
   }
   output_arg {
-    name: "output_max"
+    name: "max_output"
     type: DT_FLOAT
   }
   attr {
-    name: "T"
+    name: "Tinput"
     type: "type"
     allowed_values {
       list {
@@ -42687,61 +46441,101 @@ op {
     }
   }
   attr {
-    name: "mode"
-    type: "string"
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
     default_value {
-      s: "MIN_COMBINED"
+      type: DT_QINT32
     }
     allowed_values {
       list {
-        s: "MIN_COMBINED"
-        s: "MIN_FIRST"
-        s: "SCALED"
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
       }
     }
   }
   attr {
-    name: "round_mode"
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
     type: "string"
-    default_value {
-      s: "HALF_AWAY_FROM_ZERO"
-    }
     allowed_values {
       list {
-        s: "HALF_AWAY_FROM_ZERO"
-        s: "HALF_TO_EVEN"
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
       }
     }
   }
 }
 op {
-  name: "QuantizeV2"
+  name: "QuantizedConv2D"
   input_arg {
     name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "min_input"
     type: DT_FLOAT
   }
   input_arg {
-    name: "min_range"
+    name: "max_input"
     type: DT_FLOAT
   }
   input_arg {
-    name: "max_range"
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
     type: DT_FLOAT
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "out_type"
   }
   output_arg {
-    name: "output_min"
+    name: "min_output"
     type: DT_FLOAT
   }
   output_arg {
-    name: "output_max"
+    name: "max_output"
     type: DT_FLOAT
   }
   attr {
-    name: "T"
+    name: "Tinput"
     type: "type"
     allowed_values {
       list {
@@ -42754,99 +46548,127 @@ op {
     }
   }
   attr {
-    name: "mode"
-    type: "string"
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
     default_value {
-      s: "MIN_COMBINED"
+      type: DT_QINT32
     }
     allowed_values {
       list {
-        s: "MIN_COMBINED"
-        s: "MIN_FIRST"
-        s: "SCALED"
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
   attr {
-    name: "round_mode"
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
     type: "string"
-    default_value {
-      s: "HALF_AWAY_FROM_ZERO"
-    }
     allowed_values {
       list {
-        s: "HALF_AWAY_FROM_ZERO"
-        s: "HALF_TO_EVEN"
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
       }
     }
   }
 }
 op {
-  name: "QuantizedAdd"
+  name: "QuantizedConv2DAndRelu"
   input_arg {
-    name: "x"
-    type_attr: "T1"
+    name: "input"
+    type_attr: "Tinput"
   }
   input_arg {
-    name: "y"
-    type_attr: "T2"
+    name: "filter"
+    type_attr: "Tfilter"
   }
   input_arg {
-    name: "min_x"
+    name: "min_input"
     type: DT_FLOAT
   }
   input_arg {
-    name: "max_x"
+    name: "max_input"
     type: DT_FLOAT
   }
   input_arg {
-    name: "min_y"
+    name: "min_filter"
     type: DT_FLOAT
   }
   input_arg {
-    name: "max_y"
+    name: "max_filter"
     type: DT_FLOAT
   }
   output_arg {
-    name: "z"
-    type_attr: "Toutput"
+    name: "output"
+    type_attr: "out_type"
   }
   output_arg {
-    name: "min_z"
+    name: "min_output"
     type: DT_FLOAT
   }
   output_arg {
-    name: "max_z"
+    name: "max_output"
     type: DT_FLOAT
   }
   attr {
-    name: "T1"
+    name: "Tinput"
     type: "type"
     allowed_values {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
   attr {
-    name: "T2"
+    name: "Tfilter"
     type: "type"
     allowed_values {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
   attr {
-    name: "Toutput"
+    name: "out_type"
     type: "type"
     default_value {
       type: DT_QINT32
@@ -42855,54 +46677,87 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
-  is_commutative: true
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
 }
 op {
-  name: "QuantizedAdd"
+  name: "QuantizedConv2DAndReluAndRequantize"
   input_arg {
-    name: "x"
-    type_attr: "T1"
+    name: "input"
+    type_attr: "Tinput"
   }
   input_arg {
-    name: "y"
-    type_attr: "T2"
+    name: "filter"
+    type_attr: "Tfilter"
   }
   input_arg {
-    name: "min_x"
+    name: "min_input"
     type: DT_FLOAT
   }
   input_arg {
-    name: "max_x"
+    name: "max_input"
     type: DT_FLOAT
   }
   input_arg {
-    name: "min_y"
+    name: "min_filter"
     type: DT_FLOAT
   }
   input_arg {
-    name: "max_y"
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
     type: DT_FLOAT
   }
   output_arg {
-    name: "z"
-    type_attr: "Toutput"
+    name: "output"
+    type_attr: "out_type"
   }
   output_arg {
-    name: "min_z"
+    name: "min_output"
     type: DT_FLOAT
   }
   output_arg {
-    name: "max_z"
+    name: "max_output"
     type: DT_FLOAT
   }
   attr {
-    name: "T1"
+    name: "Tinput"
     type: "type"
     allowed_values {
       list {
@@ -42915,7 +46770,7 @@ op {
     }
   }
   attr {
-    name: "T2"
+    name: "Tfilter"
     type: "type"
     allowed_values {
       list {
@@ -42928,10 +46783,10 @@ op {
     }
   }
   attr {
-    name: "Toutput"
+    name: "out_type"
     type: "type"
     default_value {
-      type: DT_QINT32
+      type: DT_QUINT8
     }
     allowed_values {
       list {
@@ -42943,13 +46798,42 @@ op {
       }
     }
   }
-  is_commutative: true
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
 }
 op {
-  name: "QuantizedAvgPool"
+  name: "QuantizedConv2DAndRequantize"
   input_arg {
     name: "input"
-    type_attr: "T"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
   }
   input_arg {
     name: "min_input"
@@ -42959,9 +46843,25 @@ op {
     name: "max_input"
     type: DT_FLOAT
   }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
+  }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "out_type"
   }
   output_arg {
     name: "min_output"
@@ -42972,22 +46872,47 @@ op {
     type: DT_FLOAT
   }
   attr {
-    name: "T"
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
     type: "type"
+    default_value {
+      type: DT_QINT8
+    }
     allowed_values {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
-  attr {
-    name: "ksize"
-    type: "list(int)"
-  }
   attr {
     name: "strides"
     type: "list(int)"
@@ -43002,12 +46927,32 @@ op {
       }
     }
   }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
 }
 op {
-  name: "QuantizedAvgPool"
+  name: "QuantizedConv2DWithBias"
   input_arg {
     name: "input"
-    type_attr: "T"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "bias"
+    type: DT_FLOAT
   }
   input_arg {
     name: "min_input"
@@ -43017,9 +46962,17 @@ op {
     name: "max_input"
     type: DT_FLOAT
   }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "out_type"
   }
   output_arg {
     name: "min_output"
@@ -43030,7 +46983,7 @@ op {
     type: DT_FLOAT
   }
   attr {
-    name: "T"
+    name: "Tinput"
     type: "type"
     allowed_values {
       list {
@@ -43043,8 +46996,33 @@ op {
     }
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
   }
   attr {
     name: "strides"
@@ -43060,79 +47038,59 @@ op {
       }
     }
   }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
 }
 op {
-  name: "QuantizedBatchNormWithGlobalNormalization"
-  input_arg {
-    name: "t"
-    type_attr: "Tinput"
-  }
-  input_arg {
-    name: "t_min"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "t_max"
-    type: DT_FLOAT
-  }
+  name: "QuantizedConv2DWithBiasAndRelu"
   input_arg {
-    name: "m"
-    type_attr: "Tinput"
-  }
-  input_arg {
-    name: "m_min"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "m_max"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "v"
+    name: "input"
     type_attr: "Tinput"
   }
   input_arg {
-    name: "v_min"
-    type: DT_FLOAT
+    name: "filter"
+    type_attr: "Tfilter"
   }
   input_arg {
-    name: "v_max"
+    name: "bias"
     type: DT_FLOAT
   }
   input_arg {
-    name: "beta"
-    type_attr: "Tinput"
-  }
-  input_arg {
-    name: "beta_min"
+    name: "min_input"
     type: DT_FLOAT
   }
   input_arg {
-    name: "beta_max"
+    name: "max_input"
     type: DT_FLOAT
   }
   input_arg {
-    name: "gamma"
-    type_attr: "Tinput"
-  }
-  input_arg {
-    name: "gamma_min"
+    name: "min_filter"
     type: DT_FLOAT
   }
   input_arg {
-    name: "gamma_max"
+    name: "max_filter"
     type: DT_FLOAT
   }
   output_arg {
-    name: "result"
+    name: "output"
     type_attr: "out_type"
   }
   output_arg {
-    name: "result_min"
+    name: "min_output"
     type: DT_FLOAT
   }
   output_arg {
-    name: "result_max"
+    name: "max_output"
     type: DT_FLOAT
   }
   attr {
@@ -43142,106 +47100,116 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
         type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
   attr {
     name: "out_type"
     type: "type"
+    default_value {
+      type: DT_QINT32
+    }
     allowed_values {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
   attr {
-    name: "variance_epsilon"
-    type: "float"
+    name: "strides"
+    type: "list(int)"
   }
   attr {
-    name: "scale_after_normalization"
-    type: "bool"
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
   }
 }
 op {
-  name: "QuantizedBatchNormWithGlobalNormalization"
-  input_arg {
-    name: "t"
-    type_attr: "Tinput"
-  }
-  input_arg {
-    name: "t_min"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "t_max"
-    type: DT_FLOAT
-  }
+  name: "QuantizedConv2DWithBiasAndReluAndRequantize"
   input_arg {
-    name: "m"
+    name: "input"
     type_attr: "Tinput"
   }
   input_arg {
-    name: "m_min"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "m_max"
-    type: DT_FLOAT
+    name: "filter"
+    type_attr: "Tfilter"
   }
   input_arg {
-    name: "v"
-    type_attr: "Tinput"
+    name: "bias"
+    type_attr: "Tbias"
   }
   input_arg {
-    name: "v_min"
+    name: "min_input"
     type: DT_FLOAT
   }
   input_arg {
-    name: "v_max"
+    name: "max_input"
     type: DT_FLOAT
   }
   input_arg {
-    name: "beta"
-    type_attr: "Tinput"
-  }
-  input_arg {
-    name: "beta_min"
+    name: "min_filter"
     type: DT_FLOAT
   }
   input_arg {
-    name: "beta_max"
+    name: "max_filter"
     type: DT_FLOAT
   }
   input_arg {
-    name: "gamma"
-    type_attr: "Tinput"
-  }
-  input_arg {
-    name: "gamma_min"
+    name: "min_freezed_output"
     type: DT_FLOAT
   }
   input_arg {
-    name: "gamma_max"
+    name: "max_freezed_output"
     type: DT_FLOAT
   }
   output_arg {
-    name: "result"
+    name: "output"
     type_attr: "out_type"
   }
   output_arg {
-    name: "result_min"
+    name: "min_output"
     type: DT_FLOAT
   }
   output_arg {
-    name: "result_max"
+    name: "max_output"
     type: DT_FLOAT
   }
   attr {
@@ -43257,9 +47225,35 @@ op {
       }
     }
   }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tbias"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_QINT32
+      }
+    }
+  }
   attr {
     name: "out_type"
     type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
     allowed_values {
       list {
         type: DT_QINT8
@@ -43271,23 +47265,45 @@ op {
     }
   }
   attr {
-    name: "variance_epsilon"
-    type: "float"
+    name: "strides"
+    type: "list(int)"
   }
   attr {
-    name: "scale_after_normalization"
-    type: "bool"
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
   }
 }
 op {
-  name: "QuantizedBiasAdd"
+  name: "QuantizedConv2DWithBiasAndRequantize"
   input_arg {
     name: "input"
-    type_attr: "T1"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
   }
   input_arg {
     name: "bias"
-    type_attr: "T2"
+    type_attr: "Tbias"
   }
   input_arg {
     name: "min_input"
@@ -43298,11 +47314,19 @@ op {
     type: DT_FLOAT
   }
   input_arg {
-    name: "min_bias"
+    name: "min_filter"
     type: DT_FLOAT
   }
   input_arg {
-    name: "max_bias"
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
     type: DT_FLOAT
   }
   output_arg {
@@ -43310,35 +47334,45 @@ op {
     type_attr: "out_type"
   }
   output_arg {
-    name: "min_out"
+    name: "min_output"
     type: DT_FLOAT
   }
   output_arg {
-    name: "max_out"
+    name: "max_output"
     type: DT_FLOAT
   }
   attr {
-    name: "T1"
+    name: "Tinput"
     type: "type"
     allowed_values {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
   attr {
-    name: "T2"
+    name: "Tfilter"
     type: "type"
     allowed_values {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tbias"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
         type: DT_QINT32
       }
     }
@@ -43346,26 +47380,59 @@ op {
   attr {
     name: "out_type"
     type: "type"
+    default_value {
+      type: DT_QINT8
+    }
     allowed_values {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
       }
     }
   }
 }
 op {
-  name: "QuantizedBiasAdd"
+  name: "QuantizedConv2DWithBiasSignedSumAndReluAndRequantize"
   input_arg {
     name: "input"
-    type_attr: "T1"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
   }
   input_arg {
     name: "bias"
-    type_attr: "T2"
+    type_attr: "Tbias"
   }
   input_arg {
     name: "min_input"
@@ -43376,11 +47443,31 @@ op {
     type: DT_FLOAT
   }
   input_arg {
-    name: "min_bias"
+    name: "min_filter"
     type: DT_FLOAT
   }
   input_arg {
-    name: "max_bias"
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "summand"
+    type_attr: "Tsummand"
+  }
+  input_arg {
+    name: "min_summand"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_summand"
     type: DT_FLOAT
   }
   output_arg {
@@ -43388,15 +47475,15 @@ op {
     type_attr: "out_type"
   }
   output_arg {
-    name: "min_out"
+    name: "min_output"
     type: DT_FLOAT
   }
   output_arg {
-    name: "max_out"
+    name: "max_output"
     type: DT_FLOAT
   }
   attr {
-    name: "T1"
+    name: "Tinput"
     type: "type"
     allowed_values {
       list {
@@ -43409,7 +47496,7 @@ op {
     }
   }
   attr {
-    name: "T2"
+    name: "Tfilter"
     type: "type"
     allowed_values {
       list {
@@ -43422,7 +47509,17 @@ op {
     }
   }
   attr {
-    name: "out_type"
+    name: "Tbias"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "Tsummand"
     type: "type"
     allowed_values {
       list {
@@ -43434,53 +47531,51 @@ op {
       }
     }
   }
-}
-op {
-  name: "QuantizedConcat"
-  input_arg {
-    name: "concat_dim"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "values"
-    type_attr: "T"
-    number_attr: "N"
-  }
-  input_arg {
-    name: "input_mins"
-    type: DT_FLOAT
-    number_attr: "N"
-  }
-  input_arg {
-    name: "input_maxes"
-    type: DT_FLOAT
-    number_attr: "N"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output_min"
-    type: DT_FLOAT
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
   }
-  output_arg {
-    name: "output_max"
-    type: DT_FLOAT
+  attr {
+    name: "strides"
+    type: "list(int)"
   }
   attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 2
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
   }
 }
 op {
-  name: "QuantizedConv2D"
+  name: "QuantizedConv2DWithBiasSumAndRelu"
   input_arg {
     name: "input"
     type_attr: "Tinput"
@@ -43489,6 +47584,10 @@ op {
     name: "filter"
     type_attr: "Tfilter"
   }
+  input_arg {
+    name: "bias"
+    type: DT_FLOAT
+  }
   input_arg {
     name: "min_input"
     type: DT_FLOAT
@@ -43505,6 +47604,10 @@ op {
     name: "max_filter"
     type: DT_FLOAT
   }
+  input_arg {
+    name: "summand"
+    type: DT_FLOAT
+  }
   output_arg {
     name: "output"
     type_attr: "out_type"
@@ -43524,9 +47627,9 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
@@ -43537,9 +47640,9 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
@@ -43553,9 +47656,9 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
@@ -43573,9 +47676,21 @@ op {
       }
     }
   }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
 }
 op {
-  name: "QuantizedConv2D"
+  name: "QuantizedConv2DWithBiasSumAndReluAndRequantize"
   input_arg {
     name: "input"
     type_attr: "Tinput"
@@ -43584,6 +47699,10 @@ op {
     name: "filter"
     type_attr: "Tfilter"
   }
+  input_arg {
+    name: "bias"
+    type_attr: "Tbias"
+  }
   input_arg {
     name: "min_input"
     type: DT_FLOAT
@@ -43600,6 +47719,26 @@ op {
     name: "max_filter"
     type: DT_FLOAT
   }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "summand"
+    type_attr: "Tsummand"
+  }
+  input_arg {
+    name: "min_summand"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_summand"
+    type: DT_FLOAT
+  }
   output_arg {
     name: "output"
     type_attr: "out_type"
@@ -43619,9 +47758,9 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
+        type: DT_QINT32
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
@@ -43632,108 +47771,24 @@ op {
       list {
         type: DT_QINT8
         type: DT_QUINT8
-        type: DT_QINT16
-        type: DT_QUINT16
         type: DT_QINT32
-      }
-    }
-  }
-  attr {
-    name: "out_type"
-    type: "type"
-    default_value {
-      type: DT_QINT32
-    }
-    allowed_values {
-      list {
-        type: DT_QINT8
-        type: DT_QUINT8
         type: DT_QINT16
         type: DT_QUINT16
-        type: DT_QINT32
       }
     }
   }
   attr {
-    name: "strides"
-    type: "list(int)"
-  }
-  attr {
-    name: "padding"
-    type: "string"
-    allowed_values {
-      list {
-        s: "SAME"
-        s: "VALID"
-      }
-    }
-  }
-  attr {
-    name: "dilations"
-    type: "list(int)"
-    default_value {
-      list {
-        i: 1
-        i: 1
-        i: 1
-        i: 1
-      }
-    }
-  }
-}
-op {
-  name: "QuantizedConv2D"
-  input_arg {
-    name: "input"
-    type_attr: "Tinput"
-  }
-  input_arg {
-    name: "filter"
-    type_attr: "Tfilter"
-  }
-  input_arg {
-    name: "min_input"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "max_input"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "min_filter"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "max_filter"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "output"
-    type_attr: "out_type"
-  }
-  output_arg {
-    name: "min_output"
-    type: DT_FLOAT
-  }
-  output_arg {
-    name: "max_output"
-    type: DT_FLOAT
-  }
-  attr {
-    name: "Tinput"
+    name: "Tbias"
     type: "type"
     allowed_values {
       list {
-        type: DT_QINT8
-        type: DT_QUINT8
+        type: DT_FLOAT
         type: DT_QINT32
-        type: DT_QINT16
-        type: DT_QUINT16
       }
     }
   }
   attr {
-    name: "Tfilter"
+    name: "Tsummand"
     type: "type"
     allowed_values {
       list {
@@ -43749,7 +47804,7 @@ op {
     name: "out_type"
     type: "type"
     default_value {
-      type: DT_QINT32
+      type: DT_QUINT8
     }
     allowed_values {
       list {
@@ -46930,6 +50985,78 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "RecvTPUEmbeddingActivations"
+  output_arg {
+    name: "outputs"
+    type: DT_FLOAT
+    number_attr: "num_outputs"
+  }
+  attr {
+    name: "num_outputs"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "config"
+    type: "string"
+  }
+  is_stateful: true
+}
+op {
+  name: "ReduceDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "initial_state"
+    type_list_attr: "Tstate"
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Tstate"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
 op {
   name: "ReduceDataset"
   input_arg {
@@ -46982,6 +51109,7 @@ op {
       b: true
     }
   }
+  is_stateful: true
 }
 op {
   name: "ReduceJoin"
@@ -47982,6 +52110,49 @@ op {
     }
   }
 }
+op {
+  name: "RequantizationRangePerChannel"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "input_max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "clip_value_max"
+    type: "float"
+  }
+}
 op {
   name: "Requantize"
   input_arg {
@@ -48104,6 +52275,73 @@ op {
     }
   }
 }
+op {
+  name: "RequantizePerChannel"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "input_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "requested_output_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "requested_output_max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+}
 op {
   name: "Reshape"
   input_arg {
@@ -52711,7 +56949,322 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceScatterMax"
+  name: "ResourceScatterMax"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceScatterMin"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceScatterMul"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceScatterNdAdd"
+  input_arg {
+    name: "ref"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceScatterNdSub"
+  input_arg {
+    name: "ref"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceScatterNdUpdate"
+  input_arg {
+    name: "ref"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceScatterSub"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceScatterUpdate"
   input_arg {
     name: "resource"
     type: DT_RESOURCE
@@ -52731,18 +57284,17 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_INT64
+        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -52762,7 +57314,7 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceScatterMin"
+  name: "ResourceScatterUpdate"
   input_arg {
     name: "resource"
     type: DT_RESOURCE
@@ -52782,21 +57334,21 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_INT64
+        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -52813,7 +57365,7 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceScatterMul"
+  name: "ResourceScatterUpdate"
   input_arg {
     name: "resource"
     type: DT_RESOURCE
@@ -52864,9 +57416,9 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceScatterNdAdd"
+  name: "ResourceScatterUpdate"
   input_arg {
-    name: "ref"
+    name: "resource"
     type: DT_RESOURCE
   }
   input_arg {
@@ -52875,10 +57427,10 @@ op {
   }
   input_arg {
     name: "updates"
-    type_attr: "T"
+    type_attr: "dtype"
   }
   attr {
-    name: "T"
+    name: "dtype"
     type: "type"
   }
   attr {
@@ -52891,88 +57443,61 @@ op {
       }
     }
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
   is_stateful: true
 }
 op {
-  name: "ResourceScatterNdUpdate"
+  name: "ResourceSparseApplyAdadelta"
   input_arg {
-    name: "ref"
+    name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "accum"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "updates"
-    type_attr: "T"
+    name: "accum_update"
+    type: DT_RESOURCE
   }
-  attr {
-    name: "T"
-    type: "type"
+  input_arg {
+    name: "lr"
+    type_attr: "T"
   }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
   }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: true
-    }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
   }
-  is_stateful: true
-}
-op {
-  name: "ResourceScatterSub"
   input_arg {
-    name: "resource"
-    type: DT_RESOURCE
+    name: "grad"
+    type_attr: "T"
   }
   input_arg {
     name: "indices"
     type_attr: "Tindices"
   }
-  input_arg {
-    name: "updates"
-    type_attr: "dtype"
-  }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
     allowed_values {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_INT64
+        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -52986,24 +57511,51 @@ op {
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   is_stateful: true
 }
 op {
-  name: "ResourceScatterUpdate"
+  name: "ResourceSparseApplyAdadelta"
   input_arg {
-    name: "resource"
+    name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "accum"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "updates"
-    type_attr: "dtype"
+    name: "accum_update"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
   }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
     allowed_values {
       list {
@@ -53036,24 +57588,51 @@ op {
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   is_stateful: true
 }
 op {
-  name: "ResourceScatterUpdate"
+  name: "ResourceSparseApplyAdadelta"
   input_arg {
-    name: "resource"
+    name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "accum"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "updates"
-    type_attr: "dtype"
+    name: "accum_update"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
   }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
     allowed_values {
       list {
@@ -53087,24 +57666,51 @@ op {
       }
     }
   }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   is_stateful: true
 }
 op {
-  name: "ResourceScatterUpdate"
+  name: "ResourceSparseApplyAdadelta"
   input_arg {
-    name: "resource"
+    name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "accum"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "updates"
-    type_attr: "dtype"
+    name: "accum_update"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
   }
   attr {
-    name: "dtype"
+    name: "T"
     type: "type"
     allowed_values {
       list {
@@ -53138,40 +57744,17 @@ op {
       }
     }
   }
-  is_stateful: true
-}
-op {
-  name: "ResourceScatterUpdate"
-  input_arg {
-    name: "resource"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
-  }
-  input_arg {
-    name: "updates"
-    type_attr: "dtype"
-  }
   attr {
-    name: "dtype"
-    type: "type"
-  }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
     }
   }
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyAdadelta"
+  name: "ResourceSparseApplyAdagrad"
   input_arg {
     name: "var"
     type: DT_RESOURCE
@@ -53180,22 +57763,10 @@ op {
     name: "accum"
     type: DT_RESOURCE
   }
-  input_arg {
-    name: "accum_update"
-    type: DT_RESOURCE
-  }
   input_arg {
     name: "lr"
     type_attr: "T"
   }
-  input_arg {
-    name: "rho"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "epsilon"
-    type_attr: "T"
-  }
   input_arg {
     name: "grad"
     type_attr: "T"
@@ -53246,7 +57817,7 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyAdadelta"
+  name: "ResourceSparseApplyAdagrad"
   input_arg {
     name: "var"
     type: DT_RESOURCE
@@ -53255,22 +57826,10 @@ op {
     name: "accum"
     type: DT_RESOURCE
   }
-  input_arg {
-    name: "accum_update"
-    type: DT_RESOURCE
-  }
   input_arg {
     name: "lr"
     type_attr: "T"
   }
-  input_arg {
-    name: "rho"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "epsilon"
-    type_attr: "T"
-  }
   input_arg {
     name: "grad"
     type_attr: "T"
@@ -53323,7 +57882,7 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyAdadelta"
+  name: "ResourceSparseApplyAdagrad"
   input_arg {
     name: "var"
     type: DT_RESOURCE
@@ -53332,22 +57891,10 @@ op {
     name: "accum"
     type: DT_RESOURCE
   }
-  input_arg {
-    name: "accum_update"
-    type: DT_RESOURCE
-  }
   input_arg {
     name: "lr"
     type_attr: "T"
   }
-  input_arg {
-    name: "rho"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "epsilon"
-    type_attr: "T"
-  }
   input_arg {
     name: "grad"
     type_attr: "T"
@@ -53401,7 +57948,7 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyAdadelta"
+  name: "ResourceSparseApplyAdagrad"
   input_arg {
     name: "var"
     type: DT_RESOURCE
@@ -53410,22 +57957,10 @@ op {
     name: "accum"
     type: DT_RESOURCE
   }
-  input_arg {
-    name: "accum_update"
-    type: DT_RESOURCE
-  }
   input_arg {
     name: "lr"
     type_attr: "T"
   }
-  input_arg {
-    name: "rho"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "epsilon"
-    type_attr: "T"
-  }
   input_arg {
     name: "grad"
     type_attr: "T"
@@ -53507,80 +58042,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
-        type: DT_HALF
-      }
-    }
-  }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
-}
-op {
-  name: "ResourceSparseApplyAdagrad"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
+        type: DT_BFLOAT16
         type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
         type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -53604,87 +58077,28 @@ op {
       b: false
     }
   }
-  is_stateful: true
-}
-op {
-  name: "ResourceSparseApplyAdagrad"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-        type: DT_BFLOAT16
-      }
-    }
-  }
   attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "use_locking"
+    name: "update_slots"
     type: "bool"
     default_value {
-      b: false
+      b: true
     }
   }
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyAdagrad"
+  name: "ResourceSparseApplyAdagradDA"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "accum"
+    name: "gradient_accumulator"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "lr"
-    type_attr: "T"
+    name: "gradient_squared_accumulator"
+    type: DT_RESOURCE
   }
   input_arg {
     name: "grad"
@@ -53694,71 +58108,21 @@ op {
     name: "indices"
     type_attr: "Tindices"
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_INT64
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
-  }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
-}
-op {
-  name: "ResourceSparseApplyAdagrad"
   input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
+    name: "lr"
+    type_attr: "T"
   }
   input_arg {
-    name: "lr"
+    name: "l1"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "l2"
     type_attr: "T"
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "global_step"
+    type: DT_INT64
   }
   attr {
     name: "T"
@@ -53767,21 +58131,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_INT64
+        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -53802,13 +58163,6 @@ op {
       b: false
     }
   }
-  attr {
-    name: "update_slots"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
   is_stateful: true
 }
 op {
@@ -53868,6 +58222,8 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -53949,6 +58305,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -54016,21 +58373,21 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
@@ -54054,42 +58411,46 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyAdagradDA"
+  name: "ResourceSparseApplyCenteredRMSProp"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "gradient_accumulator"
+    name: "mg"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "gradient_squared_accumulator"
+    name: "ms"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "grad"
+    name: "mom"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "rho"
+    type_attr: "T"
   }
   input_arg {
-    name: "lr"
+    name: "momentum"
     type_attr: "T"
   }
   input_arg {
-    name: "l1"
+    name: "epsilon"
     type_attr: "T"
   }
   input_arg {
-    name: "l2"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "global_step"
-    type: DT_INT64
+    name: "indices"
+    type_attr: "Tindices"
   }
   attr {
     name: "T"
@@ -54098,21 +58459,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_INT64
+        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -54196,6 +58554,8 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -54281,6 +58641,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -54352,21 +58713,21 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
@@ -54390,46 +58751,42 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyCenteredRMSProp"
+  name: "ResourceSparseApplyFtrl"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "mg"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "ms"
+    name: "accum"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "mom"
+    name: "linear"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "lr"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "rho"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "momentum"
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
+    name: "l1"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "l2"
     type_attr: "T"
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "lr_power"
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -54438,21 +58795,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_INT64
+        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -54532,6 +58886,8 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -54613,6 +58969,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -54680,21 +59037,21 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
@@ -54718,7 +59075,7 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyFtrl"
+  name: "ResourceSparseApplyFtrlV2"
   input_arg {
     name: "var"
     type: DT_RESOURCE
@@ -54751,6 +59108,10 @@ op {
     name: "l2"
     type_attr: "T"
   }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
   input_arg {
     name: "lr_power"
     type_attr: "T"
@@ -54762,21 +59123,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_INT64
+        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -54860,6 +59218,8 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -54945,6 +59305,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -55009,6 +59370,153 @@ op {
     name: "lr_power"
     type_attr: "T"
   }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyKerasMomentum"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyMomentum"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
   attr {
     name: "T"
     type: "type"
@@ -55028,9 +59536,6 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
@@ -55051,87 +59556,8 @@ op {
       b: false
     }
   }
-  is_stateful: true
-}
-op {
-  name: "ResourceSparseApplyFtrlV2"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "linear"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l1"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l2"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l2_shrinkage"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "lr_power"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_INT64
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
-  }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "use_locking"
+    name: "use_nesterov"
     type: "bool"
     default_value {
       b: false
@@ -55140,7 +59566,7 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyKerasMomentum"
+  name: "ResourceSparseApplyMomentum"
   input_arg {
     name: "var"
     type: DT_RESOURCE
@@ -55172,18 +59598,17 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_INT64
+        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -55261,6 +59686,9 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -55323,17 +59751,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -55367,7 +59796,7 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyMomentum"
+  name: "ResourceSparseApplyProximalAdagrad"
   input_arg {
     name: "var"
     type: DT_RESOURCE
@@ -55381,80 +59810,11 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
-  }
-  input_arg {
-    name: "momentum"
+    name: "l1"
     type_attr: "T"
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-        type: DT_BFLOAT16
-      }
-    }
-  }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "use_nesterov"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
-}
-op {
-  name: "ResourceSparseApplyMomentum"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
   input_arg {
-    name: "lr"
+    name: "l2"
     type_attr: "T"
   }
   input_arg {
@@ -55465,10 +59825,6 @@ op {
     name: "indices"
     type_attr: "Tindices"
   }
-  input_arg {
-    name: "momentum"
-    type_attr: "T"
-  }
   attr {
     name: "T"
     type: "type"
@@ -55476,21 +59832,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_INT64
+        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -55511,13 +59864,6 @@ op {
       b: false
     }
   }
-  attr {
-    name: "use_nesterov"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
   is_stateful: true
 }
 op {
@@ -55569,6 +59915,8 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -55642,6 +59990,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -55701,21 +60050,21 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
@@ -55739,17 +60088,13 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyProximalAdagrad"
+  name: "ResourceSparseApplyProximalGradientDescent"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "lr"
+    name: "alpha"
     type_attr: "T"
   }
   input_arg {
@@ -55775,21 +60120,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_INT64
+        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -55857,6 +60199,8 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -55926,6 +60270,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -55981,21 +60326,21 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
@@ -56019,21 +60364,33 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyProximalGradientDescent"
+  name: "ResourceSparseApplyRMSProp"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "alpha"
+    name: "ms"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mom"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "l1"
+    name: "rho"
     type_attr: "T"
   }
   input_arg {
-    name: "l2"
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
     type_attr: "T"
   }
   input_arg {
@@ -56051,21 +60408,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_INT64
+        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -56145,6 +60499,8 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -56226,6 +60582,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -56293,21 +60650,21 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
@@ -56331,322 +60688,924 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyRMSProp"
+  name: "ResourceStridedSliceAssign"
   input_arg {
-    name: "var"
+    name: "ref"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "ms"
-    type: DT_RESOURCE
+    name: "begin"
+    type_attr: "Index"
   }
   input_arg {
-    name: "mom"
-    type: DT_RESOURCE
+    name: "end"
+    type_attr: "Index"
   }
   input_arg {
-    name: "lr"
+    name: "strides"
+    type_attr: "Index"
+  }
+  input_arg {
+    name: "value"
     type_attr: "T"
   }
-  input_arg {
-    name: "rho"
-    type_attr: "T"
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Index"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "begin_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "end_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "ellipsis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "new_axis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "shrink_axis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Restore"
+  input_arg {
+    name: "file_pattern"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_name"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "dt"
+  }
+  attr {
+    name: "dt"
+    type: "type"
+  }
+  attr {
+    name: "preferred_shard"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+}
+op {
+  name: "Restore"
+  input_arg {
+    name: "file_pattern"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_name"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "dt"
+  }
+  attr {
+    name: "dt"
+    type: "type"
+  }
+  attr {
+    name: "preferred_shard"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "RestoreSlice"
+  input_arg {
+    name: "file_pattern"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "shape_and_slice"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "dt"
+  }
+  attr {
+    name: "dt"
+    type: "type"
+  }
+  attr {
+    name: "preferred_shard"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+}
+op {
+  name: "RestoreSlice"
+  input_arg {
+    name: "file_pattern"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "shape_and_slice"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "dt"
+  }
+  attr {
+    name: "dt"
+    type: "type"
+  }
+  attr {
+    name: "preferred_shard"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "RestoreV2"
+  input_arg {
+    name: "prefix"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_names"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "shape_and_slices"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensors"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "RestoreV2"
+  input_arg {
+    name: "prefix"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_names"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "shape_and_slices"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensors"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingADAMParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "momenta"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "velocities"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingADAMParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "momenta"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "velocities"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingAdadeltaParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "updates"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "updates"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingAdagradParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingAdagradParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingCenteredRMSPropParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "ms"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "mom"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "mg"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingFTRLParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "linears"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
-  input_arg {
-    name: "momentum"
-    type_attr: "T"
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingFTRLParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "epsilon"
-    type_attr: "T"
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
+  output_arg {
+    name: "linears"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_INT64
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
     }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
   attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "ResourceStridedSliceAssign"
-  input_arg {
-    name: "ref"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "begin"
-    type_attr: "Index"
+  name: "RetrieveTPUEmbeddingMDLAdagradLightParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "end"
-    type_attr: "Index"
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "strides"
-    type_attr: "Index"
+  output_arg {
+    name: "weights"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "value"
-    type_attr: "T"
+  output_arg {
+    name: "benefits"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "Index"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
   attr {
-    name: "begin_mask"
+    name: "num_shards"
     type: "int"
-    default_value {
-      i: 0
-    }
   }
   attr {
-    name: "end_mask"
+    name: "shard_id"
     type: "int"
-    default_value {
-      i: 0
-    }
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingMomentumParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "momenta"
+    type: DT_FLOAT
   }
   attr {
-    name: "ellipsis_mask"
+    name: "table_id"
     type: "int"
     default_value {
-      i: 0
+      i: -1
     }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "new_axis_mask"
-    type: "int"
+    name: "table_name"
+    type: "string"
     default_value {
-      i: 0
+      s: ""
     }
   }
   attr {
-    name: "shrink_axis_mask"
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
     type: "int"
-    default_value {
-      i: 0
-    }
   }
   is_stateful: true
 }
 op {
-  name: "Restore"
-  input_arg {
-    name: "file_pattern"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "tensor_name"
-    type: DT_STRING
+  name: "RetrieveTPUEmbeddingMomentumParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "tensor"
-    type_attr: "dt"
+    name: "momenta"
+    type: DT_FLOAT
   }
-  attr {
-    name: "dt"
-    type: "type"
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "preferred_shard"
+    name: "table_id"
     type: "int"
     default_value {
       i: -1
     }
+    has_minimum: true
+    minimum: -1
   }
-}
-op {
-  name: "Restore"
-  input_arg {
-    name: "file_pattern"
-    type: DT_STRING
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
-  input_arg {
-    name: "tensor_name"
-    type: DT_STRING
+  attr {
+    name: "num_shards"
+    type: "int"
   }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingProximalAdagradParameters"
   output_arg {
-    name: "tensor"
-    type_attr: "dt"
+    name: "parameters"
+    type: DT_FLOAT
   }
-  attr {
-    name: "dt"
-    type: "type"
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "preferred_shard"
+    name: "table_id"
     type: "int"
     default_value {
       i: -1
     }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "RestoreSlice"
-  input_arg {
-    name: "file_pattern"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "tensor_name"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "shape_and_slice"
-    type: DT_STRING
+  name: "RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "tensor"
-    type_attr: "dt"
+    name: "accumulators"
+    type: DT_FLOAT
   }
-  attr {
-    name: "dt"
-    type: "type"
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "preferred_shard"
+    name: "table_id"
     type: "int"
     default_value {
       i: -1
     }
+    has_minimum: true
+    minimum: -1
   }
-}
-op {
-  name: "RestoreSlice"
-  input_arg {
-    name: "file_pattern"
-    type: DT_STRING
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
-  input_arg {
-    name: "tensor_name"
-    type: DT_STRING
+  attr {
+    name: "num_shards"
+    type: "int"
   }
-  input_arg {
-    name: "shape_and_slice"
-    type: DT_STRING
+  attr {
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingRMSPropParameters"
   output_arg {
-    name: "tensor"
-    type_attr: "dt"
+    name: "parameters"
+    type: DT_FLOAT
   }
-  attr {
-    name: "dt"
-    type: "type"
+  output_arg {
+    name: "ms"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "mom"
+    type: DT_FLOAT
   }
   attr {
-    name: "preferred_shard"
+    name: "table_id"
     type: "int"
     default_value {
       i: -1
     }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "RestoreV2"
-  input_arg {
-    name: "prefix"
-    type: DT_STRING
+  name: "RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "tensor_names"
-    type: DT_STRING
+  output_arg {
+    name: "ms"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "shape_and_slices"
-    type: DT_STRING
+  output_arg {
+    name: "mom"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "tensors"
-    type_list_attr: "dtypes"
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
     has_minimum: true
-    minimum: 1
+    minimum: -1
   }
-}
-op {
-  name: "RestoreV2"
-  input_arg {
-    name: "prefix"
-    type: DT_STRING
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
-  input_arg {
-    name: "tensor_names"
-    type: DT_STRING
+  attr {
+    name: "num_shards"
+    type: "int"
   }
-  input_arg {
-    name: "shape_and_slices"
-    type: DT_STRING
+  attr {
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingStochasticGradientDescentParameters"
   output_arg {
-    name: "tensors"
-    type_list_attr: "dtypes"
+    name: "parameters"
+    type: DT_FLOAT
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
     has_minimum: true
-    minimum: 1
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
@@ -57950,6 +62909,93 @@ op {
     }
   }
 }
+op {
+  name: "ScaleAndTranslate"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "scale"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "translation"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "resized_images"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "kernel_type"
+    type: "string"
+    default_value {
+      s: "lanczos3"
+    }
+  }
+}
+op {
+  name: "ScaleAndTranslateGrad"
+  input_arg {
+    name: "grads"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "original_image"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "scale"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "translation"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "kernel_type"
+    type: "string"
+    default_value {
+      s: "lanczos3"
+    }
+  }
+}
 op {
   name: "ScatterAdd"
   input_arg {
@@ -61206,21 +66252,71 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "SegmentSum"
+  input_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "segment_ids"
+    type_attr: "Tindices"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
@@ -61236,14 +66332,18 @@ op {
   }
 }
 op {
-  name: "SegmentSum"
+  name: "Select"
   input_arg {
-    name: "data"
+    name: "condition"
+    type: DT_BOOL
+  }
+  input_arg {
+    name: "t"
     type_attr: "T"
   }
   input_arg {
-    name: "segment_ids"
-    type_attr: "Tindices"
+    name: "e"
+    type_attr: "T"
   }
   output_arg {
     name: "output"
@@ -61252,51 +66352,36 @@ op {
   attr {
     name: "T"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_INT64
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
+  }
+}
+op {
+  name: "SelfAdjointEig"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "Tindices"
+    name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_DOUBLE
+        type: DT_FLOAT
       }
     }
   }
+  deprecation {
+    version: 11
+  }
 }
 op {
-  name: "Select"
-  input_arg {
-    name: "condition"
-    type: DT_BOOL
-  }
-  input_arg {
-    name: "t"
-    type_attr: "T"
-  }
+  name: "SelfAdjointEig"
   input_arg {
-    name: "e"
+    name: "input"
     type_attr: "T"
   }
   output_arg {
@@ -61306,18 +66391,39 @@ op {
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+      }
+    }
+  }
+  deprecation {
+    version: 11
   }
 }
 op {
-  name: "SelfAdjointEig"
+  name: "SelfAdjointEigV2"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "e"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "v"
     type_attr: "T"
   }
+  attr {
+    name: "compute_v"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
   attr {
     name: "T"
     type: "type"
@@ -61328,9 +66434,6 @@ op {
       }
     }
   }
-  deprecation {
-    version: 11
-  }
 }
 op {
   name: "SelfAdjointEigV2"
@@ -61360,6 +66463,8 @@ op {
       list {
         type: DT_DOUBLE
         type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
@@ -61392,6 +66497,7 @@ op {
       list {
         type: DT_DOUBLE
         type: DT_FLOAT
+        type: DT_HALF
         type: DT_COMPLEX64
         type: DT_COMPLEX128
       }
@@ -61496,6 +66602,38 @@ op {
     }
   }
 }
+op {
+  name: "SendTPUEmbeddingGradients"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+    number_attr: "N"
+  }
+  input_arg {
+    name: "learning_rates"
+    type: DT_FLOAT
+    number_attr: "NN"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "NN"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "config"
+    type: "string"
+  }
+  is_stateful: true
+}
 op {
   name: "SerializeIterator"
   input_arg {
@@ -61746,6 +66884,37 @@ op {
     }
   }
 }
+op {
+  name: "ShardDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "num_shards"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "index"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ShardedFilename"
   input_arg {
@@ -61932,6 +67101,10 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ShutdownDistributedTPU"
+  is_stateful: true
+}
 op {
   name: "Sigmoid"
   input_arg {
@@ -72564,6 +77737,114 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "StatefulStandardNormal"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "shape"
+    type_attr: "shape_dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "shape_dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "StatefulStandardNormal"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "shape"
+    type_attr: "shape_dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    name: "shape_dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "StatefulStandardNormalV2"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "algorithm"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "shape"
+    type_attr: "shape_dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    name: "shape_dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "StatelessIf"
   input_arg {
@@ -74291,6 +79572,52 @@ op {
     }
   }
 }
+op {
+  name: "Svd"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "s"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "u"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  attr {
+    name: "compute_uv"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "full_matrices"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Switch"
   input_arg {
@@ -74453,6 +79780,465 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "TPUCompilationResult"
+  output_arg {
+    name: "output"
+    type: DT_STRING
+  }
+}
+op {
+  name: "TPUEmbeddingActivations"
+  input_arg {
+    name: "embedding_variable"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "sliced_activations"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "lookup_id"
+    type: "int"
+    has_minimum: true
+  }
+}
+op {
+  name: "TPUOrdinalSelector"
+  output_arg {
+    name: "device_ordinals"
+    type: DT_INT32
+  }
+  is_stateful: true
+}
+op {
+  name: "TPUPartitionedCall"
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  input_arg {
+    name: "device_ordinal"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+}
+op {
+  name: "TPUReplicate"
+  input_arg {
+    name: "inputs"
+    type_list_attr: "Tinputs"
+  }
+  input_arg {
+    name: "broadcast_inputs"
+    type_list_attr: "Tbroadcast_inputs"
+  }
+  input_arg {
+    name: "variables"
+    type: DT_RESOURCE
+    number_attr: "NumVariables"
+  }
+  input_arg {
+    name: "guaranteed_constants"
+    type_list_attr: "Tguaranteed_constants"
+  }
+  output_arg {
+    name: "outputs"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "computation"
+    type: "func"
+  }
+  attr {
+    name: "num_replicas"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_cores_per_replica"
+    type: "int"
+    default_value {
+      i: 1
+    }
+  }
+  attr {
+    name: "topology"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "use_tpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "device_assignment"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "host_compute_core"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "Tinputs"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tbroadcast_inputs"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "NumVariables"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "Tguaranteed_constants"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "padding_map"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "TPUReplicate"
+  input_arg {
+    name: "inputs"
+    type_list_attr: "Tinputs"
+  }
+  input_arg {
+    name: "broadcast_inputs"
+    type_list_attr: "Tbroadcast_inputs"
+  }
+  input_arg {
+    name: "variables"
+    type: DT_RESOURCE
+    number_attr: "NumVariables"
+  }
+  input_arg {
+    name: "guaranteed_constants"
+    type_list_attr: "Tguaranteed_constants"
+  }
+  output_arg {
+    name: "outputs"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "computation"
+    type: "func"
+  }
+  attr {
+    name: "num_replicas"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_cores_per_replica"
+    type: "int"
+    default_value {
+      i: 1
+    }
+  }
+  attr {
+    name: "topology"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "use_tpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "device_assignment"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "host_compute_core"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "Tinputs"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tbroadcast_inputs"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "NumVariables"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "Tguaranteed_constants"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "padding_map"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "step_marker_location"
+    type: "string"
+    default_value {
+      s: "STEP_MARK_AT_ENTRY"
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "TPUReplicateMetadata"
+  attr {
+    name: "num_replicas"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "num_cores_per_replica"
+    type: "int"
+    default_value {
+      i: 1
+    }
+  }
+  attr {
+    name: "topology"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "use_tpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "device_assignment"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "computation_shape"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "host_compute_core"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "padding_map"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+}
+op {
+  name: "TPUReplicateMetadata"
+  attr {
+    name: "num_replicas"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "num_cores_per_replica"
+    type: "int"
+    default_value {
+      i: 1
+    }
+  }
+  attr {
+    name: "topology"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "use_tpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "device_assignment"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "computation_shape"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "host_compute_core"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "padding_map"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "step_marker_location"
+    type: "string"
+    default_value {
+      s: "STEP_MARK_AT_ENTRY"
+    }
+  }
+}
+op {
+  name: "TPUReplicatedInput"
+  input_arg {
+    name: "inputs"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "TPUReplicatedOutput"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "outputs"
+    type_attr: "T"
+    number_attr: "num_replicas"
+  }
+  attr {
+    name: "num_replicas"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
 op {
   name: "TakeDataset"
   input_arg {
@@ -76202,6 +81988,34 @@ op {
     type: "type"
   }
 }
+op {
+  name: "TensorListConcat"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  output_arg {
+    name: "lengths"
+    type: DT_INT64
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+  attr {
+    name: "element_shape"
+    type: "shape"
+    default_value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+}
 op {
   name: "TensorListConcatLists"
   input_arg {
@@ -76221,6 +82035,43 @@ op {
     type: "type"
   }
 }
+op {
+  name: "TensorListConcatV2"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "element_shape"
+    type_attr: "shape_type"
+  }
+  input_arg {
+    name: "leading_dims"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  output_arg {
+    name: "lengths"
+    type: DT_INT64
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "TensorListElementShape"
   input_arg {
@@ -76281,6 +82132,10 @@ op {
     name: "indices"
     type: DT_INT32
   }
+  input_arg {
+    name: "element_shape"
+    type: DT_INT32
+  }
   output_arg {
     name: "values"
     type_attr: "element_dtype"
@@ -76300,6 +82155,10 @@ op {
     name: "index"
     type: DT_INT32
   }
+  input_arg {
+    name: "element_shape"
+    type: DT_INT32
+  }
   output_arg {
     name: "item"
     type_attr: "element_dtype"
@@ -76326,6 +82185,10 @@ op {
     name: "input_handle"
     type: DT_VARIANT
   }
+  input_arg {
+    name: "element_shape"
+    type: DT_INT32
+  }
   output_arg {
     name: "output_handle"
     type: DT_VARIANT
@@ -76406,6 +82269,22 @@ op {
     }
   }
 }
+op {
+  name: "TensorListResize"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+  is_stateful: true
+}
 op {
   name: "TensorListScatter"
   input_arg {
@@ -76439,6 +82318,66 @@ op {
     }
   }
 }
+op {
+  name: "TensorListScatterIntoExistingList"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
+op {
+  name: "TensorListScatterV2"
+  input_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "element_shape"
+    type_attr: "shape_type"
+  }
+  input_arg {
+    name: "num_elements"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "TensorListSetItem"
   input_arg {
@@ -76501,6 +82440,10 @@ op {
     name: "input_handle"
     type: DT_VARIANT
   }
+  input_arg {
+    name: "element_shape"
+    type: DT_INT32
+  }
   output_arg {
     name: "tensor"
     type_attr: "element_dtype"
@@ -77393,6 +83336,33 @@ op {
     }
   }
 }
+op {
+  name: "TridiagonalSolve"
+  input_arg {
+    name: "diagonals"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rhs"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "TruncateDiv"
   input_arg {
@@ -77807,6 +83777,53 @@ op {
     type: "type"
   }
 }
+op {
+  name: "UnicodeDecode"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "row_splits"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "char_values"
+    type: DT_INT32
+  }
+  attr {
+    name: "input_encoding"
+    type: "string"
+  }
+  attr {
+    name: "errors"
+    type: "string"
+    default_value {
+      s: "replace"
+    }
+    allowed_values {
+      list {
+        s: "strict"
+        s: "replace"
+        s: "ignore"
+      }
+    }
+  }
+  attr {
+    name: "replacement_char"
+    type: "int"
+    default_value {
+      i: 65533
+    }
+  }
+  attr {
+    name: "replace_control_characters"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "UnicodeDecodeWithOffsets"
   input_arg {
@@ -79422,6 +85439,46 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "While"
+  input_arg {
+    name: "input"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "cond"
+    type: "func"
+  }
+  attr {
+    name: "body"
+    type: "func"
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "parallel_iterations"
+    type: "int"
+    default_value {
+      i: 10
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "WholeFileReader"
   output_arg {
@@ -79506,6 +85563,18 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "WorkerHeartbeat"
+  input_arg {
+    name: "request"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "response"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
 op {
   name: "WrapDatasetVariant"
   input_arg {
diff --git a/tensorflow/core/ops/cudnn_rnn_ops.cc b/tensorflow/core/ops/cudnn_rnn_ops.cc
index f84142c992d017ca7cda11f94499571259879d20..cd2e5c9d340d29c4836c89e7f4ab64d6a7595ec1 100644
--- a/tensorflow/core/ops/cudnn_rnn_ops.cc
+++ b/tensorflow/core/ops/cudnn_rnn_ops.cc
@@ -147,6 +147,52 @@ REGISTER_OP("CudnnRNNV2")
       return Status::OK();
     });
 
+REGISTER_OP("CudnnRNNV3")
+    .Input("input: T")
+    .Input("input_h: T")
+    .Input("input_c: T")
+    .Input("params: T")
+    .Input("sequence_lengths: int32")
+    .SetIsStateful()
+    .Output("output: T")
+    .Output("output_h: T")
+    .Output("output_c: T")
+    .Output("reserve_space: T")
+    .Output("host_reserved: int8")
+    .Attr("T: {float16, float32, float64}")
+    .Attr(kRNNModeAttrs)
+    .Attr(kRNNInputModeAttrs)
+    .Attr(kRNNDirectionAttrs)
+    .Attr("dropout: float = 0.0")
+    .Attr("seed: int = 0")
+    .Attr("seed2: int = 0")
+    .Attr("is_training: bool = true")
+    .SetShapeFn([](InferenceContext* c) {
+      auto input_shape = c->input(0);
+      auto input_h_shape = c->input(1);
+      auto max_seq_length = c->Dim(input_shape, 0);
+      auto batch_size = c->Dim(input_shape, 1);
+      auto num_units = c->Dim(input_h_shape, 2);
+      string direction;
+      TF_RETURN_IF_ERROR(c->GetAttr("direction", &direction));
+      string rnn_mode;
+      TF_RETURN_IF_ERROR(c->GetAttr("rnn_mode", &rnn_mode));
+      int dir_count = (direction == "bidirectional") ? 2 : 1;
+      DimensionHandle output_size;
+      TF_RETURN_IF_ERROR(c->Multiply(num_units, dir_count, &output_size));
+      auto output_shape =
+          c->MakeShape({max_seq_length, batch_size, output_size});
+      auto output_h_shape = input_h_shape;
+      auto output_c_shape TF_ATTRIBUTE_UNUSED =
+          (rnn_mode == "lstm") ? output_h_shape : c->MakeShape({});
+      c->set_output(0, output_shape);
+      c->set_output(1, output_h_shape);
+      c->set_output(2, output_c_shape);
+      c->set_output(3, c->UnknownShape());
+      c->set_output(4, c->UnknownShape());
+      return Status::OK();
+    });
+
 REGISTER_OP("CudnnRNNBackprop")
     .Input("input: T")
     .Input("input_h: T")
@@ -220,6 +266,44 @@ REGISTER_OP("CudnnRNNBackpropV2")
       return Status::OK();
     });
 
+REGISTER_OP("CudnnRNNBackpropV3")
+    .Input("input: T")
+    .Input("input_h: T")
+    .Input("input_c: T")
+    .Input("params: T")
+    .Input("sequence_lengths: int32")
+    .Input("output: T")
+    .Input("output_h: T")
+    .Input("output_c: T")
+    .Input("output_backprop: T")
+    .Input("output_h_backprop: T")
+    .Input("output_c_backprop: T")
+    .Input("reserve_space: T")
+    .Input("host_reserved: int8")
+    .SetIsStateful()
+    .Output("input_backprop: T")
+    .Output("input_h_backprop: T")
+    .Output("input_c_backprop: T")
+    .Output("params_backprop: T")
+    .Attr("T: {float16, float32, float64}")
+    .Attr(kRNNModeAttrs)
+    .Attr(kRNNInputModeAttrs)
+    .Attr(kRNNDirectionAttrs)
+    .Attr("dropout: float = 0.0")
+    .Attr("seed: int = 0")
+    .Attr("seed2: int = 0")
+    .SetShapeFn([](InferenceContext* c) {
+      auto input_shape = c->input(0);
+      auto input_h_shape = c->input(1);
+      auto input_c_shape = c->input(2);
+      auto params_shape = c->input(3);
+      c->set_output(0, input_shape);
+      c->set_output(1, input_h_shape);
+      c->set_output(2, input_c_shape);
+      c->set_output(3, params_shape);
+      return Status::OK();
+    });
+
 REGISTER_OP("CudnnRNNParamsToCanonical")
     .Input("num_layers: int32")
     .Input("num_units: int32")
diff --git a/tensorflow/core/ops/cudnn_rnn_ops_test.cc b/tensorflow/core/ops/cudnn_rnn_ops_test.cc
index 13c3b933f4da9b966d1c0396793fed61f3ff3107..25121c6484f2288cb55034ca0f7c67d742ea4226 100644
--- a/tensorflow/core/ops/cudnn_rnn_ops_test.cc
+++ b/tensorflow/core/ops/cudnn_rnn_ops_test.cc
@@ -102,4 +102,39 @@ TEST(CudnnRNNOpsTest, ForwardV2Lstm_ShapeFn) {
   INFER_OK(op, input_shapes_desc, output_shapes_desc);
 }
 
+TEST(CudnnRNNOpsTest, ForwardV3Lstm_ShapeFn) {
+  int max_seq_length = 2;
+  int batch_size = 3;
+  int num_units = 4;
+  int num_layers = 5;
+  int dir_count = 1;
+  std::vector<int> input_shape = {max_seq_length, batch_size, num_units};
+  std::vector<int> input_h_shape = {num_layers * dir_count, batch_size,
+                                    num_units};
+  std::vector<int> output_shape = {max_seq_length, batch_size,
+                                   num_units * dir_count};
+  std::vector<int> seq_lengths_shape = {batch_size};
+  auto shape_to_str = [](const std::vector<int>& v) {
+    return strings::StrCat("[", str_util::Join(v, ","), "]");
+  };
+  string input_shapes_desc = strings::StrCat(
+      shape_to_str(input_shape), ";", shape_to_str(input_h_shape), ";",
+      shape_to_str(input_h_shape), ";", "[?]", ";",
+      shape_to_str(seq_lengths_shape));
+  string output_shapes_desc = "[d0_0,d0_1,d1_2];in1;in1;?;?";
+
+  ShapeInferenceTestOp op("CudnnRNNV3");
+  TF_ASSERT_OK(NodeDefBuilder("test", "CudnnRNNV3")
+                   .Input({"input", 0, DT_FLOAT})
+                   .Input({"input_h", 0, DT_FLOAT})
+                   .Input({"input_c", 0, DT_FLOAT})
+                   .Input({"params", 0, DT_FLOAT})
+                   .Input({"sequence_lengths", 0, DT_INT32})
+                   .Attr("rnn_mode", "lstm")
+                   .Attr("input_mode", "auto_select")
+                   .Attr("direction", "unidirectional")
+                   .Finalize(&op.node_def));
+  INFER_OK(op, input_shapes_desc, output_shapes_desc);
+}
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index 1c117166de029d40b84bbd2335b9315cdc53bcba..cc7ce542579ef6973a6c7bd5266f8a8f44cf3d0c 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -27,7 +27,7 @@ namespace tensorflow {
 // to a stateful "iterator" by passing the "dataset" to the
 // "MakeIterator" op.
 //
-// TODO(b/65524810): DT_VARIANT tensors that represent "dataset" objects are
+// TODO(b/123753214): DT_VARIANT tensors that represent "dataset" objects are
 // not presently serializable. To avoid issues with constant folding, ensure
 // that any "source dataset" ops (i.e. ops that output a dataset and do not
 // take one as input) are marked "stateful".
@@ -37,7 +37,7 @@ REGISTER_OP("TensorDataset")
     .Output("handle: variant")
     .Attr("Toutput_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn(shape_inference::ScalarShape);  // TODO(mrry): Validate that
                                                 // `components` have shapes
@@ -49,7 +49,7 @@ REGISTER_OP("TensorSliceDataset")
     .Output("handle: variant")
     .Attr("Toutput_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn(shape_inference::ScalarShape);  // TODO(mrry): Validate that the
                                                 // dim-0 slices of `components`
@@ -62,7 +62,7 @@ REGISTER_OP("SparseTensorSliceDataset")
     .Input("dense_shape: int64")
     .Output("handle: variant")
     .Attr("Tvalues: type")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn(shape_inference::ScalarShape);
 
@@ -79,7 +79,7 @@ REGISTER_OP("GeneratorDataset")
     .Attr("Tfinalize_func_args: list(type) >= 0")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn(shape_inference::ScalarShape);
 
@@ -275,6 +275,22 @@ REGISTER_OP("BatchDatasetV2")
       return shape_inference::ScalarShape(c);
     });
 
+REGISTER_OP("ShardDataset")
+    .Input("input_dataset: variant")
+    .Input("num_shards: int64")
+    .Input("index: int64")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // num_shards should be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      // index should be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
+
 // TODO(mrry): Validate that `padded_shapes` are all vectors, the lengths of
 // `output_types` and `output_shapes` are `N` the `output_shapes` are (as far as
 // possible to tell statically) compatible with `padded_shapes`, and that
@@ -322,7 +338,7 @@ REGISTER_OP("RangeDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
@@ -388,7 +404,7 @@ REGISTER_OP("TextLineDataset")
     .Input("compression_type: string")
     .Input("buffer_size: int64")
     .Output("handle: variant")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
@@ -408,7 +424,7 @@ REGISTER_OP("FixedLengthRecordDataset")
     .Input("footer_bytes: int64")
     .Input("buffer_size: int64")
     .Output("handle: variant")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
@@ -431,7 +447,7 @@ REGISTER_OP("FixedLengthRecordDatasetV2")
     .Input("buffer_size: int64")
     .Input("compression_type: string")
     .Output("handle: variant")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
@@ -451,7 +467,7 @@ REGISTER_OP("TFRecordDataset")
     .Input("compression_type: string")
     .Input("buffer_size: int64")
     .Output("handle: variant")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
@@ -538,13 +554,22 @@ REGISTER_OP("IteratorGetNextSync")
     .Attr("output_shapes: list(shape) >= 1")
     .SetShapeFn(IteratorGetNextShapeFn);
 
+// TODO(b/124308596): Instead of conservatively marking this op as stateful,
+// implement a mechanism to determine whether `dataset` has a side-effect
+// and use it to decide whether to use a stateless or stateful version of this
+// op.
 REGISTER_OP("DatasetToSingleElement")
     .Input("dataset: variant")
     .Output("components: output_types")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
+    .SetIsStateful()
     .SetShapeFn(IteratorGetNextShapeFn);
 
+// TODO(b/124308596): Instead of conservatively marking this op as stateful,
+// implement a mechanism to determine whether `dataset` has a side-effect
+// and use it to decide whether to use a stateless or stateful version of this
+// op.
 REGISTER_OP("ReduceDataset")
     .Input("input_dataset: variant")
     .Input("initial_state: Tstate")
@@ -556,6 +581,7 @@ REGISTER_OP("ReduceDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("use_inter_op_parallelism: bool = true")
+    .SetIsStateful()
     .SetShapeFn(IteratorGetNextShapeFn);
 
 REGISTER_OP("IteratorToStringHandle")
@@ -636,6 +662,8 @@ REGISTER_OP("ModelDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .SetShapeFn(shape_inference::ScalarShape);
 
+// TODO(b/124308749): Add a stateful version of MapDefun and use it when `f`
+// is stateful.
 REGISTER_OP("MapDefun")
     .Input("arguments: Targuments")
     .Input("captured_inputs: Tcaptured")
diff --git a/tensorflow/core/ops/experimental_dataset_ops.cc b/tensorflow/core/ops/experimental_dataset_ops.cc
index f904e2536dfe67facc25335dc3f86b3d45fd116f..7b9d95a38d143c5173b9f77da7ac6a38c7061591 100644
--- a/tensorflow/core/ops/experimental_dataset_ops.cc
+++ b/tensorflow/core/ops/experimental_dataset_ops.cc
@@ -42,7 +42,7 @@ REGISTER_OP("ExperimentalCSVDataset")
     .Output("handle: variant")
     .Attr("output_types: list({float,double,int32,int64,string}) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
@@ -76,10 +76,15 @@ REGISTER_OP("ExperimentalDatasetCardinality")
     .Output("cardinality: int64")
     .SetShapeFn(shape_inference::ScalarShape);
 
+// TODO(b/124308596): Instead of conservatively marking this op as stateful,
+// implement a mechanism to determine whether `dataset` has a side-effect
+// and use it to decide whether to use a stateless or stateful version of this
+// op.
 REGISTER_OP("ExperimentalDatasetToTFRecord")
     .Input("input_dataset: variant")
     .Input("filename: string")
     .Input("compression_type: string")
+    .SetIsStateful()
     .SetShapeFn(shape_inference::NoOutputs);
 
 REGISTER_OP("ExperimentalDenseToSparseBatchDataset")
@@ -190,6 +195,14 @@ REGISTER_OP("ExperimentalMapAndBatchDataset")
       return shape_inference::ScalarShape(c);
     });
 
+REGISTER_OP("ExperimentalRebatchDataset")
+    .Input("input_dataset: variant")
+    .Input("num_workers: int64")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape);
+
 REGISTER_OP("ExperimentalMapDataset")
     .Input("input_dataset: variant")
     .Input("other_arguments: Targuments")
@@ -205,7 +218,7 @@ REGISTER_OP("ExperimentalMapDataset")
 REGISTER_OP("ExperimentalMatchingFilesDataset")
     .Input("patterns: string")
     .Output("handle: variant")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
@@ -259,7 +272,7 @@ REGISTER_OP("ExperimentalRandomDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
@@ -330,7 +343,7 @@ REGISTER_OP("ExperimentalSqlDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
@@ -352,6 +365,16 @@ REGISTER_OP("ExperimentalStatsAggregatorSummary")
     .Output("summary: string")
     .SetShapeFn(shape_inference::ScalarShape);
 
+REGISTER_OP("ExperimentalTakeWhileDataset")
+    .Input("input_dataset: variant")
+    .Input("other_arguments: Targuments")
+    .Output("handle: variant")
+    .Attr("predicate: func")
+    .Attr("Targuments: list(type) >= 0")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape);
+
 REGISTER_OP("ExperimentalUnbatchDataset")
     .Input("input_dataset: variant")
     .Output("handle: variant")
@@ -449,10 +472,19 @@ REGISTER_OP("ExperimentalLMDBDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+    .SetIsStateful()  // TODO(b/123753214): Source dataset ops must be marked
                       // stateful to inhibit constant folding.
     .SetShapeFn(shape_inference::ScalarShape);
 
+REGISTER_OP("ExperimentalChooseFastestDataset")
+    .Input("input_datasets: N * variant")
+    .Output("handle: variant")
+    .Attr("N: int >= 2")
+    .Attr("num_experiments: int")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape);
+
 REGISTER_OP("ExperimentalIdentityIndexedDataset")
     .Input("size: uint64")
     .Output("handle: variant")
diff --git a/tensorflow/core/ops/fingerprint64_map_ops.cc b/tensorflow/core/ops/fingerprint64_map_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..91b24b401787f154ce67e1c6c7aaaf2a9f65d475
--- /dev/null
+++ b/tensorflow/core/ops/fingerprint64_map_ops.cc
@@ -0,0 +1,37 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+using shape_inference::InferenceContext;
+
+REGISTER_OP("Fingerprint64Map")
+    .Output("table_handle: resource")
+    .Attr("heterogeneous_key_dtype: type")
+    .Attr("table_value_dtype: type = DT_INT64")
+    .Attr("num_oov_buckets: int >= 1")
+    .Attr("offset: int >= 0 = 0")
+    .Attr("use_node_name_sharing: bool = false")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .SetShapeFn([](InferenceContext* c) {
+      c->set_output(0, c->Scalar());
+      return Status::OK();
+    });
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/functional_ops.cc b/tensorflow/core/ops/functional_ops.cc
index 5e0bdd888cea1c508a38afe2f40c7c9f17d28269..4982ec6bd82e2bf221b56a9e75f00ce4f0763f15 100644
--- a/tensorflow/core/ops/functional_ops.cc
+++ b/tensorflow/core/ops/functional_ops.cc
@@ -132,6 +132,35 @@ REGISTER_OP("If")
       return Status::OK();
     });
 
+REGISTER_OP("Case")
+    .Input("branch_index: int32")
+    .Input("input: Tin")
+    .Output("output: Tout")
+    .Attr("Tin: list(type) >= 0")
+    .Attr("Tout: list(type) >= 0")
+    .Attr("branches: list(func) >= 1")
+    .Attr("output_shapes: list(shape) = []")
+    .SetIsStateful()
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      std::vector<PartialTensorShape> output_shapes;
+      TF_RETURN_IF_ERROR(c->GetAttr("output_shapes", &output_shapes));
+      // If `output_shapes` attr is set use that as the shapes of the outputs
+      // else return unknown shapes.
+      if (output_shapes.empty()) return shape_inference::UnknownShape(c);
+      if (output_shapes.size() != c->num_outputs()) {
+        return errors::InvalidArgument(
+            "`output_shapes` must be the same length as num outputs (",
+            output_shapes.size(), " vs. ", c->num_outputs());
+      }
+      for (size_t i = 0; i < output_shapes.size(); ++i) {
+        shape_inference::ShapeHandle output_shape_handle;
+        TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(
+            output_shapes[i], &output_shape_handle));
+        c->set_output(static_cast<int>(i), output_shape_handle);
+      }
+      return Status::OK();
+    });
+
 // TODO(drpng): remove this.
 REGISTER_OP("_While")
     .Input("input: T")
@@ -170,6 +199,7 @@ REGISTER_OP("While")
     .Attr("cond: func")
     .Attr("body: func")
     .Attr("output_shapes: list(shape) = []")
+    .Attr("parallel_iterations: int = 10")
     .SetIsStateful()
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       std::vector<PartialTensorShape> output_shapes;
diff --git a/tensorflow/core/ops/image_ops.cc b/tensorflow/core/ops/image_ops.cc
index 54272752840bb346cac5b97359a5fd8a014089e4..f451987a9b848af2e3c9f7fab53c15fb969640d0 100644
--- a/tensorflow/core/ops/image_ops.cc
+++ b/tensorflow/core/ops/image_ops.cc
@@ -131,6 +131,70 @@ Status NMSShapeFn(InferenceContext* c) {
   return Status::OK();
 }
 
+Status CombinedNMSShapeFn(InferenceContext* c) {
+  // Get inputs and validate ranks
+  ShapeHandle boxes;
+  // boxes is a tensor of Dimensions [batch_size, num_anchors, q, 4]
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &boxes));
+  ShapeHandle scores;
+  // scores is a tensor of Dimensions [batch_size, num_anchors, num_classes]
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 3, &scores));
+  ShapeHandle max_output_size_per_class;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &max_output_size_per_class));
+  ShapeHandle max_total_size;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &max_total_size));
+  ShapeHandle unused_shape;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused_shape));
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused_shape));
+
+  DimensionHandle unused;
+  // boxes[0] and scores[0] are both batch_size
+  TF_RETURN_IF_ERROR(c->Merge(c->Dim(boxes, 0), c->Dim(scores, 0), &unused));
+  // boxes[1] and scores[1] are both num_anchors
+  TF_RETURN_IF_ERROR(c->Merge(c->Dim(boxes, 1), c->Dim(scores, 1), &unused));
+  // The boxes[3] is 4.
+  TF_RETURN_IF_ERROR(c->WithValue(c->Dim(boxes, 3), 4, &unused));
+
+  DimensionHandle d = c->Dim(boxes, 2);
+  DimensionHandle class_dim = c->Dim(scores, 2);
+  if (c->ValueKnown(d) && c->ValueKnown(class_dim)) {
+    if (c->Value(d) != 1 && c->Value(d) != c->Value(class_dim)) {
+      return errors::InvalidArgument(
+          "third dimension of boxes must be either "
+          "1 or equal to the third dimension of scores");
+    }
+  }
+  DimensionHandle output_dim;
+  DimensionHandle batch_dim = c->Dim(boxes, 0);
+
+  TF_RETURN_IF_ERROR(c->MakeDimForScalarInput(3, &output_dim));
+  if (c->ValueKnown(output_dim) && c->Value(output_dim) <= 0) {
+    return errors::InvalidArgument("max_total_size should be > 0 ");
+  }
+  DimensionHandle size_per_class;
+  TF_RETURN_IF_ERROR(c->MakeDimForScalarInput(2, &size_per_class));
+
+  int64 output_size;
+  bool pad_per_class;
+  TF_RETURN_IF_ERROR(c->GetAttr("pad_per_class", &pad_per_class));
+  if (!pad_per_class) {
+    output_size = c->Value(output_dim);
+  } else {
+    if (c->ValueKnown(size_per_class) && c->Value(size_per_class) <= 0) {
+      return errors::InvalidArgument(
+          "max_output_size_per_class must be > 0 "
+          "if pad_per_class is set to true ");
+    }
+    output_size = std::min(c->Value(output_dim),
+                           c->Value(size_per_class) * c->Value(class_dim));
+  }
+  c->set_output(0, c->MakeShape({batch_dim, output_size, 4}));
+  c->set_output(1, c->MakeShape({batch_dim, output_size}));
+  c->set_output(2, c->MakeShape({batch_dim, output_size}));
+  c->set_output(3, c->Vector(batch_dim));
+  return Status::OK();
+}
+
 }  // namespace
 
 // --------------------------------------------------------------------------
@@ -174,6 +238,19 @@ REGISTER_OP("ResizeBilinear")
     .Attr("align_corners: bool = false")
     .SetShapeFn(ResizeShapeFn);
 
+// --------------------------------------------------------------------------
+REGISTER_OP("ScaleAndTranslate")
+    .Input("images: T")
+    .Input("size: int32")
+    .Input("scale: float")
+    .Input("translation: float")
+    .Output("resized_images: float")
+    .Attr(
+        "T: {int8, uint8, int16, uint16, int32, int64, bfloat16, half, "
+        "float, double}")
+    .Attr("kernel_type: string = 'lanczos3'")
+    .SetShapeFn(ResizeShapeFn);
+
 // --------------------------------------------------------------------------
 REGISTER_OP("QuantizedResizeBilinear")
     .Input("images: T")
@@ -208,6 +285,20 @@ REGISTER_OP("ResizeBilinearGrad")
       return Status::OK();
     });
 
+// --------------------------------------------------------------------------
+REGISTER_OP("ScaleAndTranslateGrad")
+    .Input("grads: T")
+    .Input("original_image: T")
+    .Input("scale: float")
+    .Input("translation: float")
+    .Output("output: T")
+    .Attr("T: {float}")
+    .Attr("kernel_type: string = 'lanczos3'")
+    .SetShapeFn([](InferenceContext* c) {
+      c->set_output(0, c->input(1));
+      return Status::OK();
+    });
+
 // --------------------------------------------------------------------------
 REGISTER_OP("ResizeNearestNeighbor")
     .Input("images: T")
@@ -381,9 +472,10 @@ REGISTER_OP("AdjustContrast")
 
 // --------------------------------------------------------------------------
 REGISTER_OP("AdjustContrastv2")
-    .Input("images: float")
+    .Input("images: T")
     .Input("contrast_factor: float")
-    .Output("output: float")
+    .Output("output: T")
+    .Attr("T: {half, float} = DT_FLOAT")
     .SetShapeFn([](InferenceContext* c) {
       // The contrast_factor should be scalar only.
       ShapeHandle unused;
@@ -393,18 +485,20 @@ REGISTER_OP("AdjustContrastv2")
 
 // --------------------------------------------------------------------------
 REGISTER_OP("AdjustHue")
-    .Input("images: float")
+    .Input("images: T")
     .Input("delta: float")
-    .Output("output: float")
+    .Output("output: T")
+    .Attr("T: {half, float} = DT_FLOAT")
     .SetShapeFn([](InferenceContext* c) {
       return shape_inference::UnchangedShapeWithRankAtLeast(c, 3);
     });
 
 // --------------------------------------------------------------------------
 REGISTER_OP("AdjustSaturation")
-    .Input("images: float")
+    .Input("images: T")
     .Input("scale: float")
-    .Output("output: float")
+    .Output("output: T")
+    .Attr("T: {half, float} = DT_FLOAT")
     .SetShapeFn([](InferenceContext* c) {
       return shape_inference::UnchangedShapeWithRankAtLeast(c, 3);
     });
@@ -576,6 +670,7 @@ REGISTER_OP("ExtractGlimpse")
     .Attr("centered: bool = true")
     .Attr("normalized: bool = true")
     .Attr("uniform_noise: bool = true")
+    .Attr("noise: string = 'uniform'")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input));
@@ -588,6 +683,16 @@ REGISTER_OP("ExtractGlimpse")
       DimensionHandle unused;
       TF_RETURN_IF_ERROR(c->WithValue(c->Dim(offsets, 1), 2, &unused));
 
+      bool uniform_noise = false;
+      TF_RETURN_IF_ERROR(c->GetAttr("uniform_noise", &uniform_noise));
+      string noise;
+      TF_RETURN_IF_ERROR(c->GetAttr("noise", &noise));
+      if (uniform_noise && (!noise.empty() && noise != "uniform")) {
+        return errors::InvalidArgument(
+            "The uniform_noise and noise should not be specified at the same "
+            "time");
+      }
+
       return SetOutputToSizedImage(c, batch_dim, 1 /* size_input_idx */,
                                    c->Dim(input, 3));
     });
@@ -778,4 +883,18 @@ REGISTER_OP("NonMaxSuppressionWithOverlaps")
       return Status::OK();
     });
 
+REGISTER_OP("CombinedNonMaxSuppression")
+    .Input("boxes: float")
+    .Input("scores: float")
+    .Input("max_output_size_per_class: int32")
+    .Input("max_total_size: int32")
+    .Input("iou_threshold: float")
+    .Input("score_threshold: float")
+    .Output("nmsed_boxes: float")
+    .Output("nmsed_scores: float")
+    .Output("nmsed_classes: float")
+    .Output("valid_detections: int32")
+    .Attr("pad_per_class: bool = false")
+    .SetShapeFn(CombinedNMSShapeFn);
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/image_ops_test.cc b/tensorflow/core/ops/image_ops_test.cc
index 517af26b44f53d85979e1194d1f0d6d8814cb1e8..e517e750955d7fb5335d2766a09e96b5f6382c10 100644
--- a/tensorflow/core/ops/image_ops_test.cc
+++ b/tensorflow/core/ops/image_ops_test.cc
@@ -183,6 +183,13 @@ TEST(ImageOpsTest, ExtractGlimpse_ShapeFn) {
   op.input_tensors.resize(2);
 
   // Inputs are input, size, offsets.
+  TF_ASSERT_OK(NodeDefBuilder("test", "ExtractGlimpse")
+                   .Input({"input", 0, DT_FLOAT})
+                   .Input({"size", 1, DT_INT32})
+                   .Input({"offsets", 2, DT_FLOAT})
+                   .Attr("uniform_noise", true)
+                   .Attr("noise", "")
+                   .Finalize(&op.node_def));
 
   // Rank and size checks.
   INFER_ERROR("Shape must be rank 4 but is rank 5", op, "[1,2,3,4,5];?;?");
diff --git a/tensorflow/core/ops/linalg_ops.cc b/tensorflow/core/ops/linalg_ops.cc
index 952ee4bee2e5a49edeea168f4184767dbebc2527..66594b3576e20a761e26e5b4835571332aaba4f7 100644
--- a/tensorflow/core/ops/linalg_ops.cc
+++ b/tensorflow/core/ops/linalg_ops.cc
@@ -208,12 +208,42 @@ Status SvdShapeFn(InferenceContext* c) {
   return Status::OK();
 }
 
+// The first input is [...,3,M] and second input is [...,M,K].
+// Output is [...,M,K].
+Status TridiagonalSolveShapeFn(InferenceContext* c) {
+  ShapeHandle lhs;
+  ShapeHandle rhs;
+  // Check that rank is at least 2.
+  TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 2, &lhs));
+  TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(1), 2, &rhs));
+
+  // Extract batch dimensions and check they are the same.
+  ShapeHandle lhs_batch_shape;
+  ShapeHandle rhs_batch_shape;
+  TF_RETURN_IF_ERROR(c->Subshape(lhs, 0, -2, &lhs_batch_shape));
+  TF_RETURN_IF_ERROR(c->Subshape(rhs, 0, -2, &rhs_batch_shape));
+  TF_RETURN_IF_ERROR(
+      c->Merge(lhs_batch_shape, rhs_batch_shape, &lhs_batch_shape));
+
+  // Check that "M" is the same in both inputs.
+  DimensionHandle m_lhs = c->Dim(lhs, -1);
+  DimensionHandle m_rhs = c->Dim(rhs, -2);
+  TF_RETURN_IF_ERROR(c->Merge(m_lhs, m_rhs, &m_lhs));
+
+  // Check that next-to-last dimension of the first input is 3.
+  TF_RETURN_IF_ERROR(c->WithValue(c->Dim(lhs, -2), 3, &m_lhs));
+
+  // The output shape is the same as rhs shape.
+  c->set_output(0, rhs);
+  return Status::OK();
+}
+
 }  // namespace
 
 REGISTER_OP("MatrixDeterminant")
     .Input("input: T")
     .Output("output: T")
-    .Attr("T: {float, double, complex64, complex128}")
+    .Attr("T: {half, float, double, complex64, complex128}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input;
       TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 2, &input));
@@ -232,7 +262,7 @@ REGISTER_OP("LogMatrixDeterminant")
     .Input("input: T")
     .Output("sign: T")
     .Output("log_abs_determinant: T")
-    .Attr("T: {float, double, complex64, complex128}")
+    .Attr("T: {half, float, double, complex64, complex128}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input;
       TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 2, &input));
@@ -255,7 +285,7 @@ REGISTER_OP("MatrixInverse")
     .Input("input: T")
     .Output("output: T")
     .Attr("adjoint: bool = False")
-    .Attr("T: {double, float, complex64, complex128}")
+    .Attr("T: {double, float, half, complex64, complex128}")
     .SetShapeFn(BatchUnchangedSquareShapeFn);
 
 REGISTER_OP("MatrixExponential")
@@ -263,7 +293,7 @@ REGISTER_OP("MatrixExponential")
         27, "Use Python implementation tf.linalg.matrix_exponential instead.")
     .Input("input: T")
     .Output("output: T")
-    .Attr("T: {double, float, complex64, complex128}")
+    .Attr("T: {double, float, half, complex64, complex128}")
     .SetShapeFn(BatchUnchangedSquareShapeFn);
 
 REGISTER_OP("MatrixLogarithm")
@@ -275,20 +305,20 @@ REGISTER_OP("MatrixLogarithm")
 REGISTER_OP("Cholesky")
     .Input("input: T")
     .Output("output: T")
-    .Attr("T: {double, float, complex64, complex128}")
+    .Attr("T: {double, float, half, complex64, complex128}")
     .SetShapeFn(BatchUnchangedSquareShapeFn);
 
 REGISTER_OP("CholeskyGrad")
     .Input("l: T")
     .Input("grad: T")
     .Output("output: T")
-    .Attr("T: {float, double}")
+    .Attr("T: {half, float, double}")
     .SetShapeFn(BatchUnchangedSquareShapeFn);
 
 REGISTER_OP("SelfAdjointEig")
     .Input("input: T")
     .Output("output: T")
-    .Attr("T: {double, float}")
+    .Attr("T: {double, float, half}")
     .Deprecated(11, "Use SelfAdjointEigV2 instead.")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input;
@@ -310,14 +340,14 @@ REGISTER_OP("SelfAdjointEigV2")
     .Output("e: T")
     .Output("v: T")
     .Attr("compute_v: bool = True")
-    .Attr("T: {double, float, complex64, complex128}")
+    .Attr("T: {double, float, half, complex64, complex128}")
     .SetShapeFn(SelfAdjointEigV2ShapeFn);
 
 REGISTER_OP("Lu")
     .Input("input: T")
     .Output("lu: T")
     .Output("p: output_idx_type")
-    .Attr("T: {double, float, complex64, complex128}")
+    .Attr("T: {double, float, half, complex64, complex128}")
     .Attr("output_idx_type: {int32, int64} = DT_INT32")
     .SetShapeFn(LuShapeFn);
 
@@ -326,7 +356,7 @@ REGISTER_OP("MatrixSolve")
     .Input("rhs: T")
     .Output("output: T")
     .Attr("adjoint: bool = False")
-    .Attr("T: {double, float, complex64, complex128}")
+    .Attr("T: {double, float, half, complex64, complex128}")
     .SetShapeFn([](InferenceContext* c) {
       return MatrixSolveShapeFn(c, true /* square (*/);
     });
@@ -337,7 +367,7 @@ REGISTER_OP("MatrixTriangularSolve")
     .Output("output: T")
     .Attr("lower: bool = True")
     .Attr("adjoint: bool = False")
-    .Attr("T: {double, float, complex64, complex128}")
+    .Attr("T: {double, float, half, complex64, complex128}")
     .SetShapeFn([](InferenceContext* c) {
       return MatrixSolveShapeFn(c, true /* square (*/);
     });
@@ -347,7 +377,7 @@ REGISTER_OP("MatrixSolveLs")
     .Input("rhs: T")
     .Input("l2_regularizer: double")
     .Output("output: T")
-    .Attr("T: {double, float, complex64, complex128}")
+    .Attr("T: {double, float, half, complex64, complex128}")
     .Attr("fast: bool = True")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle l2_regularizer;
@@ -358,7 +388,7 @@ REGISTER_OP("MatrixSolveLs")
 REGISTER_OP("MatrixSquareRoot")
     .Input("input: T")
     .Output("output: T")
-    .Attr("T: {double, float, complex64, complex128}")
+    .Attr("T: {double, float, half, complex64, complex128}")
     .SetShapeFn(BatchUnchangedSquareShapeFn);
 
 REGISTER_OP("Qr")
@@ -366,7 +396,7 @@ REGISTER_OP("Qr")
     .Output("q: T")
     .Output("r: T")
     .Attr("full_matrices: bool = False")
-    .Attr("T: {double, float, complex64, complex128}")
+    .Attr("T: {double, float, half, complex64, complex128}")
     .SetShapeFn(QrShapeFn);
 
 REGISTER_OP("Svd")
@@ -376,9 +406,16 @@ REGISTER_OP("Svd")
     .Output("v: T")
     .Attr("compute_uv: bool = True")
     .Attr("full_matrices: bool = False")
-    .Attr("T: {double, float, complex64, complex128}")
+    .Attr("T: {double, float, half, complex64, complex128}")
     .SetShapeFn(SvdShapeFn);
 
+REGISTER_OP("TridiagonalSolve")
+    .Input("diagonals: T")
+    .Input("rhs: T")
+    .Output("output: T")
+    .Attr("T: {double, float, complex64, complex128}")
+    .SetShapeFn(TridiagonalSolveShapeFn);
+
 // Deprecated op registrations:
 
 // Can be deleted after 3feb2017.
diff --git a/tensorflow/core/ops/linalg_ops_test.cc b/tensorflow/core/ops/linalg_ops_test.cc
index bfacee14efa41408865fecb103bc63b5f6de73ff..93732f938a9278f8da322dd8fd98a234695287d9 100644
--- a/tensorflow/core/ops/linalg_ops_test.cc
+++ b/tensorflow/core/ops/linalg_ops_test.cc
@@ -80,12 +80,18 @@ TEST(LinalgOpsTest, SelfAdjointEig_ShapeFn) {
 TEST(LinalgOpsTest, SelfAdjointEigV2_ShapeFn) {
   ShapeInferenceTestOp op("SelfAdjointEigV2");
   auto set_compute_v = [&op](bool compute_v) {
+    // Test for float32
     TF_ASSERT_OK(NodeDefBuilder("test", "Pack")
                      .Input({{"input", 0, DT_FLOAT}})
                      .Attr("compute_v", compute_v)
                      .Finalize(&op.node_def));
-  };
 
+    // Test for float16
+    TF_ASSERT_OK(NodeDefBuilder("test", "Pack")
+                     .Input({{"input", 0, DT_HALF}})
+                     .Attr("compute_v", compute_v)
+                     .Finalize(&op.node_def));
+  };
   set_compute_v(false);
   INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, "[1]");
   INFER_ERROR("Dimensions must be equal, but are 1 and 2", op, "[1,2]");
@@ -174,10 +180,17 @@ TEST(LinalgOpsTest, MatrixSolveLs_ShapeFn) {
 TEST(LinalgOpsTest, Qr_ShapeFn) {
   ShapeInferenceTestOp op("Qr");
   auto set_attrs = [&op](bool full_matrices) {
+    // Test float32
     TF_ASSERT_OK(NodeDefBuilder("test", "Qr")
                      .Input({"input", 0, DT_FLOAT})
                      .Attr("full_matrices", full_matrices)
                      .Finalize(&op.node_def));
+
+    // Test float16
+    TF_ASSERT_OK(NodeDefBuilder("test", "Qr")
+                     .Input({"input", 0, DT_HALF})
+                     .Attr("full_matrices", full_matrices)
+                     .Finalize(&op.node_def));
   };
 
   // Defining `P` = min(`M`, `N`), if full_matrices = False, then Q should be
@@ -218,11 +231,19 @@ TEST(LinalgOpsTest, Qr_ShapeFn) {
 TEST(LinalgOpsTest, Svd_ShapeFn) {
   ShapeInferenceTestOp op("Svd");
   auto set_attrs = [&op](bool compute_uv, bool full_matrices) {
+    // Test for float32
     TF_ASSERT_OK(NodeDefBuilder("test", "Svd")
                      .Input({"input", 0, DT_FLOAT})
                      .Attr("compute_uv", compute_uv)
                      .Attr("full_matrices", full_matrices)
                      .Finalize(&op.node_def));
+
+    // Test for float16
+    TF_ASSERT_OK(NodeDefBuilder("test", "Svd")
+                     .Input({"input", 0, DT_HALF})
+                     .Attr("compute_uv", compute_uv)
+                     .Attr("full_matrices", full_matrices)
+                     .Finalize(&op.node_def));
   };
 
   // Defining `P` = min(`M`, `N`), if full_matrices = False, then U should be
@@ -293,4 +314,40 @@ TEST(LinalgOpsTest, Lu_ShapeFn) {
            "[d0_0,d0_1,d0_2,d0_3,d0_5,d0_5];[d0_0,d0_1,d0_2,d0_3,d0_5]");
 }
 
+TEST(LinalgOpsTest, TridiagonalSolve_ShapeFn) {
+  ShapeInferenceTestOp op("TridiagonalSolve");
+  INFER_OK(op, "?;?", "in1");
+  INFER_OK(op, "[3,5];[?,1]", "in1");
+  INFER_OK(op, "[?,5];[5,1]", "in1");
+  INFER_OK(op, "[?,5];[?,?]", "in1");
+  INFER_OK(op, "[?,?];[?,?]", "in1");
+  INFER_OK(op, "[3,5];[5,1]", "in1");
+  INFER_OK(op, "[3,5];[5,2]", "in1");
+
+  INFER_OK(op, "[?,?,?];[?,?,?]", "in1");
+  INFER_OK(op, "[?,3,5];[7,5,2]", "in1");
+  INFER_OK(op, "[7,3,5];[?,5,2]", "in1");
+  INFER_OK(op, "[7,?,5];[?,5,?]", "in1");
+  INFER_OK(op, "[7,3,5];[7,5,2]", "in1");
+
+  INFER_OK(op, "[7,?,3,5];[7,8,5,2]", "in1");
+  INFER_OK(op, "[7,8,3,5];[7,8,5,2]", "in1");
+
+  INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, "[3];[5,1]");
+  INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, "[3,5];[5]");
+  INFER_ERROR(
+      "Dimension 1 in both shapes must be equal, but are 4 and 8. "
+      "Shapes are [6,4] and [6,8].",
+      op, "[6,4,3,5];[6,8,5,2]");
+  INFER_ERROR(
+      "Dimension 1 in both shapes must be equal, but are 4 and 8. "
+      "Shapes are [?,4] and [6,8].",
+      op, "[?,4,3,5];[6,8,5,2]");
+  INFER_ERROR("Dimension must be 3 but is 4", op, "[4,5];[5,2]");
+  INFER_ERROR("Dimension must be 3 but is 4", op, "[6,4,5];[6,5,2]");
+  INFER_ERROR("Dimensions must be equal, but are 9 and 5", op, "[3,9];[5,2]");
+  INFER_ERROR("Dimensions must be equal, but are 9 and 5", op,
+              "[6,3,9];[6,5,2]");
+}
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/ops/list_ops.cc b/tensorflow/core/ops/list_ops.cc
index 01ebcd15439d670274d7e2a784ce78c5c1ee44ef..123ffc493a929600f940fd41a5645cc39e575ee5 100644
--- a/tensorflow/core/ops/list_ops.cc
+++ b/tensorflow/core/ops/list_ops.cc
@@ -20,6 +20,34 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
+// Verifies that `shapes_and_types` is a valid list handle and has the right
+// dtype.
+Status VerifyHandleData(
+    shape_inference::InferenceContext* c,
+    const std::vector<shape_inference::ShapeAndType>& shapes_and_types,
+    DataType element_dtype) {
+  if (shapes_and_types.size() != 1) {
+    return errors::InvalidArgument(
+        "Invalid handle_data for input list. Expected length of "
+        "shape_and_types: ",
+        1, " Saw: ", shapes_and_types.size());
+  }
+  const shape_inference::ShapeAndType& list_shape_type = shapes_and_types[0];
+  if (list_shape_type.dtype != element_dtype) {
+    return errors::InvalidArgument("Expected list with element dtype ",
+                                   DataTypeString(element_dtype),
+                                   " but got list with element dtype ",
+                                   DataTypeString(list_shape_type.dtype));
+  }
+  return Status::OK();
+}
+
+// Assumes that the handle_data is valid.
+shape_inference::ShapeHandle GetElementShapeFromHandleData(
+    const std::vector<shape_inference::ShapeAndType>& shapes_and_types) {
+  return shapes_and_types[0].shape;
+}
+
 REGISTER_OP("EmptyTensorList")
     .Input("element_shape: shape_type")
     .Input("max_num_elements: int32")
@@ -51,11 +79,11 @@ REGISTER_OP("TensorListPushBack")
       shape_inference::ShapeHandle element_shape = c->UnknownShape();
 
       auto* handle_data = c->input_handle_shapes_and_types(0);
-      if (handle_data != nullptr && handle_data->size() != 1) {
+      if (handle_data != nullptr && handle_data->size() > 1) {
         return errors::InvalidArgument(
             "Trying to push to list with wrong variant data.");
       }
-      if (handle_data != nullptr) {
+      if (handle_data != nullptr && handle_data->size() == 1) {
         const shape_inference::ShapeAndType& list_shape_type =
             (*handle_data)[0];
         if (list_shape_type.dtype != element_dtype) {
@@ -98,11 +126,11 @@ REGISTER_OP("TensorListPushBackBatch")
       shape_inference::ShapeHandle element_shape = c->UnknownShape();
 
       auto* handle_data = c->input_handle_shapes_and_types(0);
-      if (handle_data != nullptr && handle_data->size() != 1) {
+      if (handle_data != nullptr && handle_data->size() > 1) {
         return errors::InvalidArgument(
             "Trying to push to list with wrong variant data.");
       }
-      if (handle_data != nullptr) {
+      if (handle_data != nullptr && handle_data->size() == 1) {
         const shape_inference::ShapeAndType& list_shape_type =
             (*handle_data)[0];
         if (list_shape_type.dtype != element_dtype) {
@@ -130,6 +158,7 @@ REGISTER_OP("TensorListLength")
 
 REGISTER_OP("TensorListPopBack")
     .Input("input_handle: variant")
+    .Input("element_shape: int32")
     .Output("output_handle: variant")
     .Output("tensor: element_dtype")
     .Attr("element_dtype: type")
@@ -138,11 +167,11 @@ REGISTER_OP("TensorListPopBack")
       TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
       shape_inference::ShapeHandle tensor_shape = c->UnknownShape();
       auto* handle_data = c->input_handle_shapes_and_types(0);
-      if (handle_data != nullptr && handle_data->size() != 1) {
+      if (handle_data != nullptr && handle_data->size() > 1) {
         return errors::InvalidArgument(
             "Trying to read from list with invalid variant data.");
       }
-      if (handle_data != nullptr) {
+      if (handle_data != nullptr && handle_data->size() == 1) {
         const shape_inference::ShapeAndType& list_shape_type =
             (*handle_data)[0];
         if (list_shape_type.dtype != element_dtype) {
@@ -166,6 +195,7 @@ REGISTER_OP("TensorListPopBack")
 
 REGISTER_OP("TensorListStack")
     .Input("input_handle: variant")
+    .Input("element_shape: int32")
     .Output("tensor: element_dtype")
     .Attr("element_dtype: type")
     .Attr("num_elements: int = -1")
@@ -174,11 +204,11 @@ REGISTER_OP("TensorListStack")
       TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
       shape_inference::ShapeHandle element_shape = c->UnknownShape();
       auto* handle_data = c->input_handle_shapes_and_types(0);
-      if (handle_data != nullptr && handle_data->size() != 1) {
+      if (handle_data != nullptr && handle_data->size() > 1) {
         return errors::InvalidArgument(
             "Trying to read from list with wrong variant data.");
       }
-      if (handle_data != nullptr) {
+      if (handle_data != nullptr && handle_data->size() == 1) {
         const shape_inference::ShapeAndType& list_shape_type =
             (*handle_data)[0];
         if (list_shape_type.dtype != element_dtype) {
@@ -207,46 +237,70 @@ REGISTER_OP("TensorListStack")
       return Status::OK();
     });
 
+Status TensorListConcatShapeInference(
+    shape_inference::InferenceContext* c,
+    shape_inference::ShapeHandle element_shape) {
+  DataType element_dtype;
+  TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
+  auto* handle_data = c->input_handle_shapes_and_types(0);
+  if (handle_data != nullptr && handle_data->size() > 1) {
+    return errors::InvalidArgument(
+        "Trying to read from list with wrong variant data.");
+  }
+  if (handle_data != nullptr && handle_data->size() == 1) {
+    const shape_inference::ShapeAndType& list_shape_type = (*handle_data)[0];
+    if (list_shape_type.dtype != element_dtype) {
+      return errors::InvalidArgument(
+          "Trying to read from list with wrong element dtype. List has "
+          "type ",
+          DataTypeString(list_shape_type.dtype), " but expected type ",
+          DataTypeString(element_dtype));
+    }
+    shape_inference::ShapeHandle merged;
+    TF_RETURN_IF_ERROR(c->Merge(element_shape, list_shape_type.shape, &merged));
+    element_shape = merged;
+  }
+  if (c->RankKnown(element_shape)) {
+    shape_inference::ShapeHandle result;
+    TF_RETURN_IF_ERROR(c->Subshape(element_shape, 1, &result));
+    TF_RETURN_IF_ERROR(
+        c->Concatenate(c->MakeShape({c->UnknownDim()}), result, &result));
+    c->set_output(0, result);
+  } else {
+    c->set_output(0, c->UnknownShape());
+  }
+  c->set_output(1, c->MakeShape({c->UnknownDim()}));
+  return Status::OK();
+}
+
 REGISTER_OP("TensorListConcat")
     .Input("input_handle: variant")
     .Output("tensor: element_dtype")
     .Output("lengths: int64")
     .Attr("element_dtype: type")
+    .Attr("element_shape: shape = { unknown_rank: true }")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
-      DataType element_dtype;
-      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
-      shape_inference::ShapeHandle element_shape = c->UnknownShape();
-      auto* handle_data = c->input_handle_shapes_and_types(0);
-      if (handle_data != nullptr && handle_data->size() != 1) {
-        return errors::InvalidArgument(
-            "Trying to read from list with wrong variant data.");
-      }
-      if (handle_data != nullptr) {
-        const shape_inference::ShapeAndType& list_shape_type =
-            (*handle_data)[0];
-        if (list_shape_type.dtype != element_dtype) {
-          return errors::InvalidArgument(
-              "Trying to read from list with wrong element dtype. List has "
-              "type ",
-              DataTypeString(list_shape_type.dtype), " but expected type ",
-              DataTypeString(element_dtype));
-        }
-        shape_inference::ShapeHandle ignored;
-        TF_RETURN_IF_ERROR(
-            c->Merge(element_shape, list_shape_type.shape, &ignored));
-        element_shape = list_shape_type.shape;
-      }
-      if (c->RankKnown(element_shape)) {
-        shape_inference::ShapeHandle result;
-        TF_RETURN_IF_ERROR(c->Subshape(element_shape, 1, &result));
-        TF_RETURN_IF_ERROR(
-            c->Concatenate(c->MakeShape({c->UnknownDim()}), result, &result));
-        c->set_output(0, result);
-      } else {
-        c->set_output(0, c->UnknownShape());
-      }
-      c->set_output(1, c->MakeShape({c->UnknownDim()}));
-      return Status::OK();
+      PartialTensorShape raw_element_shape;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_shape", &raw_element_shape));
+      shape_inference::ShapeHandle element_shape;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(raw_element_shape,
+                                                            &element_shape));
+      return TensorListConcatShapeInference(c, element_shape);
+    });
+
+REGISTER_OP("TensorListConcatV2")
+    .Input("input_handle: variant")
+    .Input("element_shape: shape_type")
+    .Input("leading_dims: int64")
+    .Output("tensor: element_dtype")
+    .Output("lengths: int64")
+    .Attr("element_dtype: type")
+    .Attr("shape_type: {int32, int64}")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle element_shape;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensorTreatScalarAsUnknownShape(
+          1, &element_shape));
+      return TensorListConcatShapeInference(c, element_shape);
     });
 
 REGISTER_OP("TensorListSplit")
@@ -345,6 +399,7 @@ REGISTER_OP("TensorListReserve")
 REGISTER_OP("TensorListGetItem")
     .Input("input_handle: variant")
     .Input("index: int32")
+    .Input("element_shape: int32")
     .Output("item: element_dtype")
     .Attr("element_dtype: type")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
@@ -367,6 +422,24 @@ REGISTER_OP("TensorListGetItem")
       return Status::OK();
     });
 
+REGISTER_OP("TensorListResize")
+    .Input("input_handle: variant")
+    .Input("size: int32")
+    .Output("output_handle: variant")
+    .SetIsStateful()
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      // Check that `size` has scalar shape.
+      shape_inference::ShapeHandle size_shape = c->input(1);
+      shape_inference::ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(size_shape, 0, &unused));
+      c->set_output(0, c->Scalar());
+      auto* handle_data = c->input_handle_shapes_and_types(0);
+      if (handle_data != nullptr) {
+        c->set_output_handle_shapes_and_types(0, *handle_data);
+      }
+      return Status::OK();
+    });
+
 REGISTER_OP("TensorListSetItem")
     .Input("input_handle: variant")
     .Input("index: int32")
@@ -394,6 +467,7 @@ REGISTER_OP("TensorListSetItem")
 REGISTER_OP("TensorListGather")
     .Input("input_handle: variant")
     .Input("indices: int32")
+    .Input("element_shape: int32")
     .Output("values: element_dtype")
     .Attr("element_dtype: type")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
@@ -437,6 +511,54 @@ REGISTER_OP("TensorListScatter")
       return Status::OK();
     });
 
+REGISTER_OP("TensorListScatterV2")
+    .Input("tensor: element_dtype")
+    .Input("indices: int32")
+    .Input("element_shape: shape_type")
+    .Input("num_elements: int32")
+    .Output("output_handle: variant")
+    .Attr("element_dtype: type")
+    .Attr("shape_type: {int32, int64}")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
+      shape_inference::ShapeHandle element_shape;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensorTreatScalarAsUnknownShape(
+          2, &element_shape));
+      c->set_output_handle_shapes_and_types(0,
+                                            {{element_shape, element_dtype}});
+      c->set_output(0, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("TensorListScatterIntoExistingList")
+    .Input("input_handle: variant")
+    .Input("tensor: element_dtype")
+    .Input("indices: int32")
+    .Output("output_handle: variant")
+    .Attr("element_dtype: type")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle ignored;
+      // Check that tensor is at least a vector.
+      TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(1), 1, &ignored));
+      // Check that indices is a vector.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &ignored));
+
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
+      shape_inference::ShapeHandle element_shape = c->UnknownShape();
+
+      auto* handle_data = c->input_handle_shapes_and_types(0);
+      if (handle_data != nullptr) {
+        TF_RETURN_IF_ERROR(VerifyHandleData(c, *handle_data, element_dtype));
+        element_shape = GetElementShapeFromHandleData(*handle_data);
+      }
+      c->set_output_handle_shapes_and_types(0,
+                                            {{element_shape, element_dtype}});
+      c->set_output(0, c->Scalar());
+      return Status::OK();
+    });
+
 REGISTER_OP("TensorListConcatLists")
     .Input("input_a: variant")
     .Input("input_b: variant")
@@ -453,15 +575,18 @@ REGISTER_OP("TensorListConcatLists")
 
       auto* handle_data_a = c->input_handle_shapes_and_types(0);
       auto* handle_data_b = c->input_handle_shapes_and_types(1);
-      if (handle_data_a == nullptr && handle_data_b == nullptr) {
+      if ((handle_data_a == nullptr || handle_data_a->empty()) &&
+          (handle_data_b == nullptr || handle_data_b->empty())) {
         c->set_output_handle_shapes_and_types(
             0, {{c->UnknownShape(), element_dtype}});
         return Status::OK();
       }
       shape_inference::ShapeAndType list_shape_type_a =
-          (handle_data_a) ? handle_data_a->at(0) : handle_data_b->at(0);
+          (handle_data_a && !handle_data_a->empty()) ? handle_data_a->at(0)
+                                                     : handle_data_b->at(0);
       const shape_inference::ShapeAndType& list_shape_type_b =
-          (handle_data_b) ? handle_data_b->at(0) : handle_data_a->at(0);
+          (handle_data_b && !handle_data_b->empty()) ? handle_data_b->at(0)
+                                                     : handle_data_a->at(0);
       if (list_shape_type_a.dtype != element_dtype) {
         return errors::InvalidArgument("input_a.type != element_dtype: ",
                                        DataTypeString(list_shape_type_a.dtype),
diff --git a/tensorflow/core/ops/lookup_table_ops.cc b/tensorflow/core/ops/lookup_table_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3ce08f6f2f9d7eec7cd2222de2456170e4976d6c
--- /dev/null
+++ b/tensorflow/core/ops/lookup_table_ops.cc
@@ -0,0 +1,61 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+using shape_inference::InferenceContext;
+
+REGISTER_OP("LookupTableInsertOrAssignOp")
+    .Input("table_int64_args: num_int64_table_args * int64")
+    .Input("table_handle: resource")
+    .Input("keys: insert_key_tensor_dtype")
+    .Input("values: table_value_dtype")
+    .Attr("insert_key_tensor_dtype: type")
+    .Attr("table_value_dtype: type")
+    .Attr("num_int64_table_args: int >= 0")
+    .SetShapeFn([](InferenceContext* c) {
+      // Note that, by design, shape checks are implementation dependent so they
+      // must be deferred until runtime.
+      return Status::OK();
+    });
+
+REGISTER_OP("LookupTableFindOp")
+    .Input("table_int64_args: num_int64_table_args * int64")
+    .Input("table_handle: resource")
+    .Input("keys: lookup_key_tensor_dtype")
+    .Input("num_threads: int64")
+    .Output("values: table_value_dtype")
+    .Attr("table_value_dtype: type")
+    .Attr("lookup_key_tensor_dtype: type")
+    .Attr("num_int64_table_args: int >= 0")
+    .SetShapeFn([](InferenceContext* c) {
+      // The output shape cannot be inferred here because the key size
+      // cannot be inferred from the key tensor in general.
+      c->set_output(0, c->UnknownShape());
+      return Status::OK();
+    });
+
+REGISTER_OP("ContainerSizeOp")
+    .Input("container_handle: resource")
+    .Output("size: int64")
+    .SetShapeFn([](InferenceContext* c) {
+      c->set_output(0, c->Scalar());
+      return Status::OK();
+    });
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/math_grad.cc b/tensorflow/core/ops/math_grad.cc
index 55dcc50325f600730376c492fa3a2cdde4293ace..a8d454038c9b9d413c9d2cde44fb675da21856e4 100644
--- a/tensorflow/core/ops/math_grad.cc
+++ b/tensorflow/core/ops/math_grad.cc
@@ -469,6 +469,16 @@ Status MulGrad(const AttrSlice& attrs, FunctionDef* g) {
 }
 REGISTER_OP_GRADIENT("Mul", MulGrad);
 
+Status MulNoNanGrad(const AttrSlice& attrs, FunctionDef* g) {
+  // clang-format off
+  return GradForBinaryCwise(g, {
+      {{"gx"}, "MulNoNan", {"y", "dz"}},  // y * dz
+      {{"gy"}, "MulNoNan", {"x", "dz"}},  // x * dz
+  });
+  // clang-format on
+}
+REGISTER_OP_GRADIENT("MulNoNan", MulGrad);
+
 Status DivGrad(const AttrSlice& attrs, FunctionDef* g) {
   // clang-format off
   return GradForBinaryCwise(g, {
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index 6f261dc1b1813ea1e78736725bdf8af66eab2c18..3cab5419abd4072f3655f46b128e45f400af241e 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -39,7 +39,61 @@ REGISTER_OP("AddN")
                                         " with other shapes.");
       }
       c->set_output(0, cur);
-      return Status::OK();
+
+      DataType dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("T", &dtype));
+
+      if (dtype != DT_VARIANT) {
+        // Exit early if not DT_VARIANT.
+        return Status::OK();
+      } else {
+        // DT_VARIANT shape handle shape inference.  All sizes and dtypes must
+        // be the same; all shapes must be compatible via Merge.
+        std::vector<shape_inference::ShapeAndType> cur_shapes_and_types;
+        auto* shapes_and_types =
+            c->input_handle_shapes_and_types(c->num_inputs() - 1);
+        if (shapes_and_types) {
+          cur_shapes_and_types = *shapes_and_types;
+        }
+
+        for (int i = c->num_inputs() - 2; i >= 0; --i) {
+          auto shapes_and_types_i = c->input_handle_shapes_and_types(i);
+          if (!shapes_and_types && shapes_and_types_i) {
+            // TODO(ebrevdo): Find cases where this happens and fix their shape
+            // inference.  If we are calling AddN on variant types, they should
+            // all have consistent shape_and_type info.
+            shapes_and_types = shapes_and_types_i;
+          } else if (shapes_and_types && shapes_and_types_i) {
+            if (shapes_and_types_i->size() != shapes_and_types->size()) {
+              return errors::InvalidArgument(
+                  "shapes_and_types[", i,
+                  "].size() == ", shapes_and_types_i->size(),
+                  " != shapes_and_types[0].size() == ",
+                  shapes_and_types->size());
+            }
+            for (int j = 0; j < shapes_and_types->size(); ++j) {
+              if (shapes_and_types->at(j).dtype !=
+                  shapes_and_types_i->at(j).dtype) {
+                return errors::InvalidArgument(
+                    "shapes_and_types[", i, "][", j, "].dtype() == ",
+                    DataTypeString(shapes_and_types_i->at(j).dtype),
+                    " != shapes_and_types[0][", j, "].dtype == ",
+                    DataTypeString(shapes_and_types->at(j).dtype));
+              }
+              TF_RETURN_WITH_CONTEXT_IF_ERROR(
+                  c->Merge(shapes_and_types_i->at(j).shape,
+                           cur_shapes_and_types.at(j).shape,
+                           &cur_shapes_and_types.at(j).shape),
+                  "From merging shapes_and_types[", i, "][", j, "].shape with ",
+                  "shapes_and_types[0][", j, "].shape");
+            }
+          }
+        }
+        if (shapes_and_types) {
+          c->set_output_handle_shapes_and_types(0, cur_shapes_and_types);
+        }
+        return Status::OK();
+      }
     });
 
 // --------------------------------------------------------------------------
@@ -377,6 +431,14 @@ Returns x - y element-wise.
 REGISTER_OP("Mul").BINARY_MORE().SetIsCommutative().SetShapeFn(
     shape_inference::BroadcastBinaryOpShapeFn);
 
+REGISTER_OP("MulNoNan")
+    .Input("x: T")
+    .Input("y: T")
+    .Output("z: T")
+    .Attr("T: {float, double}")
+    .SetIsCommutative()
+    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
+
 REGISTER_OP("_MklMul")
     .BINARY_MORE()
     .Input("mkl_x: uint8")
@@ -796,6 +858,15 @@ REGISTER_OP("Sum")
     .Attr("Tidx: {int32, int64} = DT_INT32")
     .SetShapeFn(shape_inference::ReductionShape);
 
+REGISTER_OP("EuclideanNorm")
+    .Input("input: T")
+    .Input("reduction_indices: Tidx")
+    .Output("output: T")
+    .Attr("keep_dims: bool = false")
+    .Attr("T: numbertype")
+    .Attr("Tidx: {int32, int64} = DT_INT32")
+    .SetShapeFn(shape_inference::ReductionShape);
+
 REGISTER_OP("Mean")
     .Input("input: T")
     .Input("reduction_indices: Tidx")
@@ -1368,7 +1439,14 @@ REGISTER_OP("Conj")
     .Input("input: T")
     .Output("output: T")
     .Attr("T: {complex64, complex128, variant} = DT_COMPLEX64")
-    .SetShapeFn(shape_inference::UnchangedShape);
+    .SetShapeFn([](InferenceContext* c) {
+      c->set_output(0, c->input(0));
+      auto* handle_data = c->input_handle_shapes_and_types(0);
+      if (handle_data != nullptr) {
+        c->set_output_handle_shapes_and_types(0, *handle_data);
+      }
+      return Status::OK();
+    });
 
 // --------------------------------------------------------------------------
 
@@ -1688,4 +1766,50 @@ inputs: Must all be the same size and shape.
 
 #endif  // INTEL_MKL
 
+REGISTER_OP("RequantizePerChannel")
+    .Input("input: T")
+    .Input("input_min: float")
+    .Input("input_max: float")
+    .Input("requested_output_min: float")
+    .Input("requested_output_max: float")
+    .Output("output: out_type")
+    .Output("output_min: float")
+    .Output("output_max: float")
+    .Attr("T: quantizedtype = DT_QINT32")
+    .Attr("out_type: quantizedtype = DT_QUINT8")
+    .SetShapeFn([](InferenceContext* c) {
+      TF_RETURN_IF_ERROR(shape_inference::UnchangedShape(c));
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    });
+REGISTER_OP("RequantizationRangePerChannel")
+    .Input("input: T")
+    .Input("input_min: float")
+    .Input("input_max: float")
+    .Output("output_min: float")
+    .Output("output_max: float")
+    .Attr("T: quantizedtype = DT_QINT32")
+    .Attr("clip_value_max: float")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
+      c->set_output(0, c->Scalar());
+      c->set_output(1, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("NextAfter")
+    .Attr("T: {float64, float32} = DT_FLOAT")
+    .Input("x1: T")
+    .Input("x2: T")
+    .Output("output: T")
+    .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/math_ops_test.cc b/tensorflow/core/ops/math_ops_test.cc
index 05379a7d699629d733cacd71343fc9d912eb0893..1e6dbbfb2f5cc33136ef114b0cf0105a64a53976 100644
--- a/tensorflow/core/ops/math_ops_test.cc
+++ b/tensorflow/core/ops/math_ops_test.cc
@@ -144,6 +144,7 @@ TEST(MathOpsTest, BroadcastBinaryOps_ShapeFn) {
     INFER_OK(op, "[1];[2]", "[d1_0]");
     INFER_OK(op, "[2];[1]", "[d0_0]");
     INFER_OK(op, "[2];[]", "[d0_0]");
+    INFER_OK(op, "[2];[?]", "[d0_0]");
 
     INFER_OK(op, "[0];[0]", "[d0_0|d1_0]");
     INFER_OK(op, "[];[0]", "[d1_0]");
@@ -151,6 +152,9 @@ TEST(MathOpsTest, BroadcastBinaryOps_ShapeFn) {
     INFER_OK(op, "[0];[1]", "[d0_0]");
     INFER_OK(op, "[0];[]", "[d0_0]");
 
+    INFER_OK(op, "[2];[?,?]", "[d1_0,d0_0]");
+    INFER_OK(op, "[2,2];[?,?,?]", "[d1_0,d0_0,d0_1]");
+
     // Multiple dimension cases (same test cases, switching x and y).
     INFER_OK(op, "[?,1,2,3,4,5];[3,1,?]",
              "[d0_0,d0_1,d0_2,d0_3|d1_0,d0_4,d0_5]");
diff --git a/tensorflow/core/ops/mkl_array_ops.cc b/tensorflow/core/ops/mkl_array_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e7ad3be611218734f769f9b108e0ff85052c2e72
--- /dev/null
+++ b/tensorflow/core/ops/mkl_array_ops.cc
@@ -0,0 +1,92 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef INTEL_MKL
+
+// This file contains the registration of MKL-DNN array ops.
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/tensor.pb.h"
+#include "tensorflow/core/util/mirror_pad_mode.h"
+#include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/util/strided_slice_op.h"
+#include "tensorflow/core/util/tensor_format.h"
+
+namespace tensorflow {
+
+using shape_inference::DimensionHandle;
+using shape_inference::InferenceContext;
+using shape_inference::ShapeHandle;
+using shape_inference::UnchangedShape;
+
+// Adding QuantizedConcatV2 op to be able to replace it by
+// _MklQuantizedConcatV2 in the graph rewrite.
+REGISTER_OP("QuantizedConcatV2")
+    .Input("values: N * T")
+    .Input("axis: Tidx")
+    .Input("input_mins: N * float32")
+    .Input("input_maxes: N * float32")
+    .Output("output: T")
+    .Output("output_min: float")
+    .Output("output_max: float")
+    .Attr("N: int >= 2")
+    .Attr("T: type")
+    .Attr("Tidx: {int32, int64} = DT_INT32")
+    .SetShapeFn([](InferenceContext* c) {
+      const int n = (c->num_inputs() - 1) / 3;
+      TF_RETURN_IF_ERROR(shape_inference::QuantizedConcatV2Shape(c, n));
+      ShapeHandle unused;
+      for (int i = n + 1; i < c->num_inputs(); ++i) {
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 0, &unused));
+      }
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("_MklQuantizedConcatV2")
+    .Input("values: N * T")
+    .Input("axis: Tidx")
+    .Input("input_mins:  N * float32")
+    .Input("input_maxes: N * float32")
+    .Input("mkl_values: N * uint8")
+    .Input("mkl_axis: uint8")
+    .Input("mkl_input_mins:  N * uint8")
+    .Input("mkl_input_maxes: N * uint8")
+    .Output("output: T")
+    .Output("output_min: float")
+    .Output("output_max: float")
+    .Output("mkl_output: uint8")
+    .Output("mkl_output_min: uint8")
+    .Output("mkl_output_max: uint8")
+    .Attr("N: int >= 2")
+    .Attr("T: type")
+    .Attr("Tidx: {int32, int64} = DT_INT32")
+    .SetShapeFn([](InferenceContext* c) {
+      const int n = (c->num_inputs() / 2 - 1) / 3;
+      TF_RETURN_IF_ERROR(shape_inference::QuantizedConcatV2Shape(c, n));
+      ShapeHandle unused;
+      for (int i = n + 1; i < c->num_inputs() / 2; ++i) {
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 0, &unused));
+      }
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      return Status::OK();
+    });
+}  // namespace tensorflow
+
+#endif
diff --git a/tensorflow/core/ops/mkl_nn_ops.cc b/tensorflow/core/ops/mkl_nn_ops.cc
index 9be3470820eb523e8d41f8bf63434cbb534034d8..b23c37356651be1bd973c8c1476d9bb6e7e1cd31 100644
--- a/tensorflow/core/ops/mkl_nn_ops.cc
+++ b/tensorflow/core/ops/mkl_nn_ops.cc
@@ -32,6 +32,91 @@ using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
 
+REGISTER_OP("_MklFusedConv2D")
+    .Input("input: T")
+    .Input("filter: T")
+    .Input("args: num_args * T")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter: uint8")
+    .Input("mkl_args: num_args * uint8")
+    .Output("output: T")
+    .Output("filter_output: T")
+    .Output("mkl_output: uint8")
+    .Output("mkl_filter_output: uint8")
+    .Attr("T: {float}")
+    .Attr("num_args: int >= 0")
+    .Attr("strides: list(int)")
+    .Attr("is_filter_const: bool = false")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .Attr("fused_ops: list(string) = []")
+    // Attributes for the FusedBatchNorm ------------------------------------ //
+    .Attr("epsilon: float = 0.0001")
+    // ---------------------------------------------------------------------- //
+    .SetShapeFn(shape_inference::Conv2DShape)
+    .Doc(R"doc(
+*NOTE*: Do not invoke this operator directly in Python. MKL DNN graph transformer
+ is expected to create these operators.
+)doc");
+
+REGISTER_OP("__MklDummyPadWithFusedConv2D")
+    .Input("input: T")
+    .Input("filter: T")
+    .Input("args: num_args * T")
+    .Input("paddings: Tpaddings")
+    .Output("output: T")
+    .Output("filter_output: T")
+    .Output("mkl_output: uint8")
+    .Output("mkl_filter_output: uint8")
+    .Attr("T: {float}")
+    .Attr("num_args: int >= 0")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .Attr("fused_ops: list(string) = []")
+    .Attr("Tpaddings: {int32, int64} = DT_INT32")
+    // Attributes for the FusedBatchNorm ------------------------------------ //
+    .Attr("epsilon: float = 0.0001")
+    // ---------------------------------------------------------------------- //
+    .SetShapeFn(shape_inference::Conv2DShape)
+    .Doc(R"doc(
+*NOTE*: Do not invoke this operator directly in Python. MKL DNN graph transformer
+ is expected to create these operators.
+)doc");
+
+REGISTER_OP("_MklPadWithFusedConv2D")
+    .Input("input: T")
+    .Input("filter: T")
+    .Input("args: num_args * T")
+    .Input("paddings: Tpaddings")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter: uint8")
+    .Input("mkl_args: num_args * uint8")
+    .Input("mkl_paddings: uint8")
+    .Output("output: T")
+    .Output("filter_output: T")
+    .Output("mkl_output: uint8")
+    .Output("mkl_filter_output: uint8")
+    .Attr("T: {float}")
+    .Attr("num_args: int >= 0")
+    .Attr("strides: list(int)")
+    .Attr("is_filter_const: bool = false")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .Attr("fused_ops: list(string) = []")
+    .Attr("Tpaddings: {int32, int64} = DT_INT32")
+    // Attributes for the FusedBatchNorm ------------------------------------ //
+    .Attr("epsilon: float = 0.0001")
+    // ---------------------------------------------------------------------- //
+    .SetShapeFn(shape_inference::Conv2DShape)
+    .Doc(R"doc(
+*NOTE*: Do not invoke this operator directly in Python. MKL DNN graph transformer
+ is expected to create these operators.
+)doc");
+
 REGISTER_OP("_MklQuantizedMaxPool")
     .Input("input:         T")
     .Input("min_input:     float")
@@ -118,6 +203,7 @@ REGISTER_OP("_MklQuantizedConv2D")
     .Attr("out_type: quantizedtype = DT_QINT32")
     .Attr("data_format: string = 'NHWC'")
     .Attr("strides: list(int)")
+    .Attr("is_filter_const: bool = true")
     .Attr(GetPaddingAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
@@ -162,6 +248,7 @@ REGISTER_OP("_MklQuantizedConv2DAndRequantize")
     .Attr("out_type: quantizedtype = DT_QINT8")
     .Attr("data_format: string = 'NHWC'")
     .Attr("strides: list(int)")
+    .Attr("is_filter_const: bool = true")
     .Attr(GetPaddingAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
@@ -206,6 +293,7 @@ REGISTER_OP("_MklQuantizedConv2DWithBias")
     .Attr("out_type: quantizedtype = DT_QINT32")
     .Attr("data_format: string = 'NHWC'")
     .Attr("strides: list(int)")
+    .Attr("is_filter_const: bool = true")
     .Attr(GetPaddingAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
@@ -254,6 +342,7 @@ REGISTER_OP("_MklQuantizedConv2DWithBiasAndRequantize")
     .Attr("out_type: quantizedtype = DT_QINT8")
     .Attr("data_format: string = 'NHWC'")
     .Attr("strides: list(int)")
+    .Attr("is_filter_const: bool = true")
     .Attr(GetPaddingAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
@@ -295,6 +384,7 @@ REGISTER_OP("_MklQuantizedConv2DAndRelu")
     .Attr("out_type: quantizedtype = DT_QINT32")
     .Attr("data_format: string = 'NHWC'")
     .Attr("strides: list(int)")
+    .Attr("is_filter_const: bool = true")
     .Attr(GetPaddingAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
@@ -339,6 +429,7 @@ REGISTER_OP("_MklQuantizedConv2DAndReluAndRequantize")
     .Attr("out_type: quantizedtype = DT_QUINT8")
     .Attr("data_format: string = 'NHWC'")
     .Attr("strides: list(int)")
+    .Attr("is_filter_const: bool = true")
     .Attr(GetPaddingAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
@@ -383,6 +474,7 @@ REGISTER_OP("_MklQuantizedConv2DWithBiasAndRelu")
     .Attr("out_type: quantizedtype = DT_QINT32")
     .Attr("data_format: string = 'NHWC'")
     .Attr("strides: list(int)")
+    .Attr("is_filter_const: bool = true")
     .Attr(GetPaddingAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
@@ -431,6 +523,7 @@ REGISTER_OP("_MklQuantizedConv2DWithBiasAndReluAndRequantize")
     .Attr("out_type: quantizedtype = DT_QUINT8")
     .Attr("data_format: string = 'NHWC'")
     .Attr("strides: list(int)")
+    .Attr("is_filter_const: bool = true")
     .Attr(GetPaddingAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
@@ -478,6 +571,7 @@ REGISTER_OP("_MklQuantizedConv2DWithBiasSumAndRelu")
     .Attr("out_type: quantizedtype = DT_QINT32")
     .Attr("data_format: string = 'NHWC'")
     .Attr("strides: list(int)")
+    .Attr("is_filter_const: bool = true")
     .Attr(GetPaddingAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
@@ -533,6 +627,7 @@ REGISTER_OP("_MklQuantizedConv2DWithBiasSumAndReluAndRequantize")
     .Attr("out_type: quantizedtype = DT_QUINT8")
     .Attr("data_format: string = 'NHWC'")
     .Attr("strides: list(int)")
+    .Attr("is_filter_const: bool = true")
     .Attr(GetPaddingAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
@@ -590,6 +685,7 @@ REGISTER_OP("_MklQuantizedConv2DWithBiasSignedSumAndReluAndRequantize")
     .Attr("out_type: quantizedtype = DT_QUINT8")
     .Attr("data_format: string = 'NHWC'")
     .Attr("strides: list(int)")
+    .Attr("is_filter_const: bool = true")
     .Attr(GetPaddingAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
@@ -607,6 +703,50 @@ REGISTER_OP("_MklQuantizedConv2DWithBiasSignedSumAndReluAndRequantize")
       return Status::OK();
     });
 
+REGISTER_OP("_MklDepthwiseConv2dNativeBackpropInput")
+    .Input("input_sizes: int32")
+    .Input("filter: T")
+    .Input("out_backprop: T")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter: uint8")
+    .Input("mkl_out_backprop: uint8")
+    .Output("output: T")
+    .Output("mkl_output: uint8")
+    .Attr("T: {half, bfloat16, float, double}")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle s;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s));
+      TF_RETURN_IF_ERROR(c->WithRank(s, 4, &s));
+      c->set_output(0, s);
+      return Status::OK();
+    });
+
+REGISTER_OP("_MklDepthwiseConv2dNativeBackpropFilter")
+    .Input("input: T")
+    .Input("filter_sizes: int32")
+    .Input("out_backprop: T")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter: uint8")
+    .Input("mkl_out_backprop: uint8")
+    .Output("output: T")
+    .Output("mkl_output: uint8")
+    .Attr("T: {half, bfloat16, float, double}")
+    .Attr("strides: list(int)")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle s;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &s));
+      TF_RETURN_IF_ERROR(c->WithRank(s, 4, &s));
+      c->set_output(0, s);
+      return Status::OK();
+    });
+
 }  // namespace tensorflow
 
 #endif  // INTEL_MKL
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index bc59abc54cc1b87af3c06ce5cfda6fe5dca86e36..ef7a65c01135c7eb4dfc0a50c1e505321a85efc2 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -269,10 +269,11 @@ REGISTER_OP("Conv2D")
     .Attr("T: {half, bfloat16, float, double}")
     .Attr("strides: list(int)")
     .Attr("use_cudnn_on_gpu: bool = true")
-    .Attr(GetPaddingAttrString())
+    .Attr(GetPaddingAttrStringWithExplicit())
+    .Attr(GetExplicitPaddingsAttrString())
     .Attr(GetConvnetDataFormatAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
-    .SetShapeFn(shape_inference::Conv2DShape);
+    .SetShapeFn(shape_inference::Conv2DShapeWithExplicitPadding);
 
 REGISTER_OP("Conv2DBackpropInput")
     .Input("input_sizes: int32")
@@ -282,7 +283,8 @@ REGISTER_OP("Conv2DBackpropInput")
     .Attr("T: {half, bfloat16, float, double}")
     .Attr("strides: list(int)")
     .Attr("use_cudnn_on_gpu: bool = true")
-    .Attr(GetPaddingAttrString())
+    .Attr(GetPaddingAttrStringWithExplicit())
+    .Attr(GetExplicitPaddingsAttrString())
     .Attr(GetConvnetDataFormatAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
@@ -304,7 +306,8 @@ REGISTER_OP("Conv2DBackpropFilter")
     .Attr("T: {half, bfloat16, float, double}")
     .Attr("strides: list(int)")
     .Attr("use_cudnn_on_gpu: bool = true")
-    .Attr(GetPaddingAttrString())
+    .Attr(GetPaddingAttrStringWithExplicit())
+    .Attr(GetExplicitPaddingsAttrString())
     .Attr(GetConvnetDataFormatAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .SetShapeFn([](InferenceContext* c) {
@@ -326,6 +329,7 @@ REGISTER_OP("_FusedConv2D")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .Attr("use_cudnn_on_gpu: bool = true")
     .Attr("fused_ops: list(string) = []")
     // Attributes for the FusedBatchNorm ------------------------------------ //
     .Attr("epsilon: float = 0.0001")
@@ -1539,6 +1543,23 @@ REGISTER_OP("QuantizedBatchNormWithGlobalNormalization")
     });
 
 #ifdef INTEL_MKL
+REGISTER_OP("_MklDepthwiseConv2dNative")
+    .Input("input: T")
+    .Input("filter: T")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter: uint8")
+    .Output("output: T")
+    .Output("filter_output: T")
+    .Output("mkl_output: uint8")
+    .Output("mkl_filter_output: uint8")
+    .Attr("T: {half, bfloat16, float, double}")
+    .Attr("strides: list(int)")
+    .Attr("is_filter_const: bool = false")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn(shape_inference::DepthwiseConv2DNativeShape);
+
 REGISTER_OP("_MklConv2D")
     .Input("input: T")
     .Input("filter: T")
@@ -1551,6 +1572,7 @@ REGISTER_OP("_MklConv2D")
     .Attr("T: {half, float, double}")
     .Attr("strides: list(int)")
     .Attr("use_cudnn_on_gpu: bool = true")
+    .Attr("is_filter_const: bool = false")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
@@ -1570,6 +1592,7 @@ REGISTER_OP("__MklDummyConv2DWithBias")
     .Attr("T: {half, float, double}")
     .Attr("strides: list(int)")
     .Attr("use_cudnn_on_gpu: bool = true")
+    .Attr("is_filter_const: bool = false")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
@@ -1597,6 +1620,7 @@ REGISTER_OP("_MklConv2DWithBias")
     .Attr("T: {half, float, double}")
     .Attr("strides: list(int)")
     .Attr("use_cudnn_on_gpu: bool = true")
+    .Attr("is_filter_const: bool = false")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
@@ -1617,6 +1641,7 @@ REGISTER_OP("__MklDummyPadWithConv2D")
     .Attr("T: {half, float, double}")
     .Attr("strides: list(int)")
     .Attr("use_cudnn_on_gpu: bool = true")
+    .Attr("is_filter_const: bool = false")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
@@ -1647,6 +1672,7 @@ REGISTER_OP("_MklPadWithConv2D")
     .Attr("use_cudnn_on_gpu: bool = true")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
+    .Attr("is_filter_const: bool = false")
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .Attr("Tpaddings: {int32, int64} = DT_INT32")
     .SetShapeFn(shape_inference::Conv2DShape)
@@ -1832,6 +1858,7 @@ REGISTER_OP("_MklConv3D")
     .Output("mkl_filter_output: uint8")
     .Attr("T: {half, float, double}")
     .Attr("strides: list(int) >= 5")
+    .Attr("is_filter_const: bool = false")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1, 1]")
@@ -1964,6 +1991,40 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
 
+REGISTER_OP("_MklLeakyRelu")
+    .Input("features: T")
+    .Input("mkl_features: uint8")
+    .Output("activations: T")
+    .Output("mkl_activations: uint8")
+    .Attr("T: {half, float, double} = DT_FLOAT")
+    .Attr("alpha: float = 0.2")
+    .SetShapeFn(shape_inference::UnchangedShape)
+    .Doc(R"doc(
+MKL version of LeakyRelu operator. Uses MKL DNN APIs to implement
+LeakyRelu operator.
+
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
+REGISTER_OP("_MklLeakyReluGrad")
+    .Input("gradients: T")
+    .Input("features: T")
+    .Input("mkl_gradients: uint8")
+    .Input("mkl_features: uint8")
+    .Output("backprops: T")
+    .Output("mkl_backprops: uint8")
+    .Attr("T: {half, float, double} = DT_FLOAT")
+    .Attr("alpha: float = 0.2")
+    .SetShapeFn(shape_inference::MergeBothInputsShapeFn)
+    .Doc(R"doc(
+MKL version of LeakyReluGrad operator. Uses MKL DNN APIs to compute rectified
+linear gradients for LeakyReluGrad operation.
+
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
 REGISTER_OP("_MklElu")
     .Input("features: T")
     .Input("mkl_features: uint8")
@@ -2480,6 +2541,7 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
 
+#endif  // INTEL_MKL
 REGISTER_OP("QuantizedConv2DAndRequantize")
     .Input("input: Tinput")
     .Input("filter: Tfilter")
@@ -2816,6 +2878,5 @@ REGISTER_OP("QuantizedConv2DWithBiasSignedSumAndReluAndRequantize")
       return Status::OK();
     });
 
-#endif  // INTEL_MKL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 89bdcc571efee6c0d193341936758670c1218aab..b2e7ff1b2e159f00d5b0a0af7cd6e4ba8791981a 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -486,7 +486,7 @@ op {
   name: "AdjustContrastv2"
   input_arg {
     name: "images"
-    type: DT_FLOAT
+    type_attr: "T"
   }
   input_arg {
     name: "contrast_factor"
@@ -494,14 +494,27 @@ op {
   }
   output_arg {
     name: "output"
-    type: DT_FLOAT
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+      }
+    }
   }
 }
 op {
   name: "AdjustHue"
   input_arg {
     name: "images"
-    type: DT_FLOAT
+    type_attr: "T"
   }
   input_arg {
     name: "delta"
@@ -509,14 +522,27 @@ op {
   }
   output_arg {
     name: "output"
-    type: DT_FLOAT
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+      }
+    }
   }
 }
 op {
   name: "AdjustSaturation"
   input_arg {
     name: "images"
-    type: DT_FLOAT
+    type_attr: "T"
   }
   input_arg {
     name: "scale"
@@ -524,7 +550,20 @@ op {
   }
   output_arg {
     name: "output"
-    type: DT_FLOAT
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+      }
+    }
   }
 }
 op {
@@ -612,6 +651,59 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "AllToAll"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "group_assignment"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BOOL
+      }
+    }
+  }
+  attr {
+    name: "concat_dimension"
+    type: "int"
+  }
+  attr {
+    name: "split_dimension"
+    type: "int"
+  }
+  attr {
+    name: "split_count"
+    type: "int"
+  }
+}
 op {
   name: "Angle"
   input_arg {
@@ -5102,6 +5194,46 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "Case"
+  input_arg {
+    name: "branch_index"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "branches"
+    type: "list(func)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    default_value {
+      list {
+      }
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Cast"
   input_arg {
@@ -5195,6 +5327,7 @@ op {
       list {
         type: DT_DOUBLE
         type: DT_FLOAT
+        type: DT_HALF
         type: DT_COMPLEX64
         type: DT_COMPLEX128
       }
@@ -5220,6 +5353,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -5356,6 +5490,87 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "CollectiveGather"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "group_size"
+    type: "int"
+  }
+  attr {
+    name: "group_key"
+    type: "int"
+  }
+  attr {
+    name: "instance_key"
+    type: "int"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  is_stateful: true
+}
+op {
+  name: "CollectivePermute"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "source_target_pairs"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
 op {
   name: "CollectiveReduce"
   input_arg {
@@ -5417,8 +5632,66 @@ op {
     name: "subdiv_offsets"
     type: "list(int)"
   }
+  attr {
+    name: "wait_for"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
   is_stateful: true
 }
+op {
+  name: "CombinedNonMaxSuppression"
+  input_arg {
+    name: "boxes"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "scores"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_output_size_per_class"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "max_total_size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "iou_threshold"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "score_threshold"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "nmsed_boxes"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "nmsed_scores"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "nmsed_classes"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "valid_detections"
+    type: DT_INT32
+  }
+  attr {
+    name: "pad_per_class"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "CompareAndBitpack"
   input_arg {
@@ -5749,6 +6022,35 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ConfigureDistributedTPU"
+  output_arg {
+    name: "topology"
+    type: DT_STRING
+  }
+  attr {
+    name: "embedding_config"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "tpu_embedding_config"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "is_global_init"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Conj"
   input_arg {
@@ -5876,6 +6178,15 @@ op {
       list {
         s: "SAME"
         s: "VALID"
+        s: "EXPLICIT"
+      }
+    }
+  }
+  attr {
+    name: "explicit_paddings"
+    type: "list(int)"
+    default_value {
+      list {
       }
     }
   }
@@ -5953,6 +6264,15 @@ op {
       list {
         s: "SAME"
         s: "VALID"
+        s: "EXPLICIT"
+      }
+    }
+  }
+  attr {
+    name: "explicit_paddings"
+    type: "list(int)"
+    default_value {
+      list {
       }
     }
   }
@@ -6030,6 +6350,15 @@ op {
       list {
         s: "SAME"
         s: "VALID"
+        s: "EXPLICIT"
+      }
+    }
+  }
+  attr {
+    name: "explicit_paddings"
+    type: "list(int)"
+    default_value {
+      list {
       }
     }
   }
@@ -6805,6 +7134,33 @@ op {
     }
   }
 }
+op {
+  name: "CrossReplicaSum"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "group_assignment"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_INT32
+        type: DT_UINT32
+      }
+    }
+  }
+}
 op {
   name: "CudnnRNN"
   input_arg {
@@ -7203,131 +7559,74 @@ op {
   is_stateful: true
 }
 op {
-  name: "CudnnRNNCanonicalToParams"
+  name: "CudnnRNNBackpropV3"
   input_arg {
-    name: "num_layers"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "num_units"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "input_size"
-    type: DT_INT32
+    name: "input"
+    type_attr: "T"
   }
   input_arg {
-    name: "weights"
+    name: "input_h"
     type_attr: "T"
-    number_attr: "num_params"
   }
   input_arg {
-    name: "biases"
+    name: "input_c"
     type_attr: "T"
-    number_attr: "num_params"
   }
-  output_arg {
+  input_arg {
     name: "params"
     type_attr: "T"
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
-  }
-  attr {
-    name: "num_params"
-    type: "int"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "rnn_mode"
-    type: "string"
-    default_value {
-      s: "lstm"
-    }
-    allowed_values {
-      list {
-        s: "rnn_relu"
-        s: "rnn_tanh"
-        s: "lstm"
-        s: "gru"
-      }
-    }
+  input_arg {
+    name: "sequence_lengths"
+    type: DT_INT32
   }
-  attr {
-    name: "input_mode"
-    type: "string"
-    default_value {
-      s: "linear_input"
-    }
-    allowed_values {
-      list {
-        s: "linear_input"
-        s: "skip_input"
-        s: "auto_select"
-      }
-    }
+  input_arg {
+    name: "output"
+    type_attr: "T"
   }
-  attr {
-    name: "direction"
-    type: "string"
-    default_value {
-      s: "unidirectional"
-    }
-    allowed_values {
-      list {
-        s: "unidirectional"
-        s: "bidirectional"
-      }
-    }
+  input_arg {
+    name: "output_h"
+    type_attr: "T"
   }
-  attr {
-    name: "dropout"
-    type: "float"
-    default_value {
-      f: 0
-    }
+  input_arg {
+    name: "output_c"
+    type_attr: "T"
   }
-  attr {
-    name: "seed"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  input_arg {
+    name: "output_backprop"
+    type_attr: "T"
   }
-  attr {
-    name: "seed2"
-    type: "int"
-    default_value {
-      i: 0
-    }
+  input_arg {
+    name: "output_h_backprop"
+    type_attr: "T"
   }
-}
-op {
-  name: "CudnnRNNParamsSize"
   input_arg {
-    name: "num_layers"
-    type: DT_INT32
+    name: "output_c_backprop"
+    type_attr: "T"
   }
   input_arg {
-    name: "num_units"
-    type: DT_INT32
+    name: "reserve_space"
+    type_attr: "T"
   }
   input_arg {
-    name: "input_size"
-    type: DT_INT32
+    name: "host_reserved"
+    type: DT_INT8
   }
   output_arg {
-    name: "params_size"
-    type_attr: "S"
+    name: "input_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "input_h_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "input_c_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "params_backprop"
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -7340,16 +7639,6 @@ op {
       }
     }
   }
-  attr {
-    name: "S"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
   attr {
     name: "rnn_mode"
     type: "string"
@@ -7413,9 +7702,10 @@ op {
       i: 0
     }
   }
+  is_stateful: true
 }
 op {
-  name: "CudnnRNNParamsToCanonical"
+  name: "CudnnRNNCanonicalToParams"
   input_arg {
     name: "num_layers"
     type: DT_INT32
@@ -7429,19 +7719,19 @@ op {
     type: DT_INT32
   }
   input_arg {
-    name: "params"
-    type_attr: "T"
-  }
-  output_arg {
     name: "weights"
     type_attr: "T"
     number_attr: "num_params"
   }
-  output_arg {
+  input_arg {
     name: "biases"
     type_attr: "T"
     number_attr: "num_params"
   }
+  output_arg {
+    name: "params"
+    type_attr: "T"
+  }
   attr {
     name: "T"
     type: "type"
@@ -7524,42 +7814,379 @@ op {
   }
 }
 op {
-  name: "CudnnRNNV2"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
+  name: "CudnnRNNParamsSize"
   input_arg {
-    name: "input_h"
-    type_attr: "T"
+    name: "num_layers"
+    type: DT_INT32
   }
   input_arg {
-    name: "input_c"
-    type_attr: "T"
+    name: "num_units"
+    type: DT_INT32
   }
   input_arg {
-    name: "params"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output_h"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output_c"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "reserve_space"
-    type_attr: "T"
+    name: "input_size"
+    type: DT_INT32
   }
   output_arg {
-    name: "host_reserved"
-    type: DT_INT8
+    name: "params_size"
+    type_attr: "S"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "S"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "rnn_mode"
+    type: "string"
+    default_value {
+      s: "lstm"
+    }
+    allowed_values {
+      list {
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
+      }
+    }
+  }
+  attr {
+    name: "input_mode"
+    type: "string"
+    default_value {
+      s: "linear_input"
+    }
+    allowed_values {
+      list {
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
+      }
+    }
+  }
+  attr {
+    name: "direction"
+    type: "string"
+    default_value {
+      s: "unidirectional"
+    }
+    allowed_values {
+      list {
+        s: "unidirectional"
+        s: "bidirectional"
+      }
+    }
+  }
+  attr {
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+}
+op {
+  name: "CudnnRNNParamsToCanonical"
+  input_arg {
+    name: "num_layers"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "num_units"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "input_size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "params"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "weights"
+    type_attr: "T"
+    number_attr: "num_params"
+  }
+  output_arg {
+    name: "biases"
+    type_attr: "T"
+    number_attr: "num_params"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "num_params"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "rnn_mode"
+    type: "string"
+    default_value {
+      s: "lstm"
+    }
+    allowed_values {
+      list {
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
+      }
+    }
+  }
+  attr {
+    name: "input_mode"
+    type: "string"
+    default_value {
+      s: "linear_input"
+    }
+    allowed_values {
+      list {
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
+      }
+    }
+  }
+  attr {
+    name: "direction"
+    type: "string"
+    default_value {
+      s: "unidirectional"
+    }
+    allowed_values {
+      list {
+        s: "unidirectional"
+        s: "bidirectional"
+      }
+    }
+  }
+  attr {
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+}
+op {
+  name: "CudnnRNNV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_h"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_c"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "params"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_h"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_c"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "reserve_space"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "host_reserved"
+    type: DT_INT8
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "rnn_mode"
+    type: "string"
+    default_value {
+      s: "lstm"
+    }
+    allowed_values {
+      list {
+        s: "rnn_relu"
+        s: "rnn_tanh"
+        s: "lstm"
+        s: "gru"
+      }
+    }
+  }
+  attr {
+    name: "input_mode"
+    type: "string"
+    default_value {
+      s: "linear_input"
+    }
+    allowed_values {
+      list {
+        s: "linear_input"
+        s: "skip_input"
+        s: "auto_select"
+      }
+    }
+  }
+  attr {
+    name: "direction"
+    type: "string"
+    default_value {
+      s: "unidirectional"
+    }
+    allowed_values {
+      list {
+        s: "unidirectional"
+        s: "bidirectional"
+      }
+    }
+  }
+  attr {
+    name: "dropout"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "is_training"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "CudnnRNNV3"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_h"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_c"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "params"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "sequence_lengths"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_h"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output_c"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "reserve_space"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "host_reserved"
+    type: DT_INT8
   }
   attr {
     name: "T"
@@ -7887,6 +8514,7 @@ op {
     has_minimum: true
     minimum: 1
   }
+  is_stateful: true
 }
 op {
   name: "DebugGradientIdentity"
@@ -8444,6 +9072,8 @@ op {
         type: DT_INT16
         type: DT_INT8
         type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
@@ -9761,6 +10391,124 @@ op {
     type: DT_STRING
   }
 }
+op {
+  name: "EnqueueTPUEmbeddingIntegerBatch"
+  input_arg {
+    name: "batch"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  input_arg {
+    name: "mode_override"
+    type: DT_STRING
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "EnqueueTPUEmbeddingSparseBatch"
+  input_arg {
+    name: "sample_indices"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  input_arg {
+    name: "embedding_indices"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  input_arg {
+    name: "aggregation_weights"
+    type: DT_FLOAT
+    number_attr: "N"
+  }
+  input_arg {
+    name: "mode_override"
+    type: DT_STRING
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "combiners"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "EnqueueTPUEmbeddingSparseTensorBatch"
+  input_arg {
+    name: "sample_indices"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  input_arg {
+    name: "embedding_indices"
+    type: DT_INT32
+    number_attr: "N"
+  }
+  input_arg {
+    name: "aggregation_weights"
+    type: DT_FLOAT
+    number_attr: "N"
+  }
+  input_arg {
+    name: "mode_override"
+    type: DT_STRING
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "combiners"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "table_ids"
+    type: "list(int)"
+  }
+  is_stateful: true
+}
 op {
   name: "EnsureShape"
   input_arg {
@@ -9899,6 +10647,66 @@ op {
     }
   }
 }
+op {
+  name: "EuclideanNorm"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "Exit"
   input_arg {
@@ -10090,6 +10898,40 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalChooseFastestDataset"
+  input_arg {
+    name: "input_datasets"
+    type: DT_VARIANT
+    number_attr: "N"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 2
+  }
+  attr {
+    name: "num_experiments"
+    type: "int"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ExperimentalDatasetCardinality"
   input_arg {
@@ -10115,6 +10957,7 @@ op {
     name: "compression_type"
     type: DT_STRING
   }
+  is_stateful: true
 }
 op {
   name: "ExperimentalDenseToSparseBatchDataset"
@@ -10895,6 +11738,33 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalRebatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "num_workers"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ExperimentalScanDataset"
   input_arg {
@@ -11112,6 +11982,42 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalTakeWhileDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "predicate"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ExperimentalThreadPoolDataset"
   input_arg {
@@ -11287,6 +12193,13 @@ op {
       b: true
     }
   }
+  attr {
+    name: "noise"
+    type: "string"
+    default_value {
+      s: "uniform"
+    }
+  }
 }
 op {
   name: "ExtractImagePatches"
@@ -14114,6 +15027,108 @@ op {
     }
   }
 }
+op {
+  name: "InfeedDequeue"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  is_stateful: true
+}
+op {
+  name: "InfeedDequeueTuple"
+  output_arg {
+    name: "outputs"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+  }
+  is_stateful: true
+}
+op {
+  name: "InfeedEnqueue"
+  input_arg {
+    name: "input"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+    default_value {
+      shape {
+      }
+    }
+  }
+  attr {
+    name: "layout"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "InfeedEnqueueTuple"
+  input_arg {
+    name: "inputs"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+  }
+  attr {
+    name: "layouts"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "InitializeTable"
   input_arg {
@@ -14772,6 +15787,44 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "KMC2ChainInitialization"
+  input_arg {
+    name: "distances"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "index"
+    type: DT_INT64
+  }
+}
+op {
+  name: "KmeansPlusPlusInitialization"
+  input_arg {
+    name: "points"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "num_to_sample"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_retries_per_sample"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "samples"
+    type: DT_FLOAT
+  }
+}
 op {
   name: "L2Loss"
   input_arg {
@@ -15309,997 +16362,699 @@ op {
   is_stateful: true
 }
 op {
-  name: "Log"
+  name: "LoadTPUEmbeddingADAMParameters"
   input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "y"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_BFLOAT16
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+    name: "parameters"
+    type: DT_FLOAT
   }
-}
-op {
-  name: "Log1p"
   input_arg {
-    name: "x"
-    type_attr: "T"
+    name: "momenta"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "y"
-    type_attr: "T"
+  input_arg {
+    name: "velocities"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_BFLOAT16
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
     }
-  }
-}
-op {
-  name: "LogMatrixDeterminant"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "sign"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "log_abs_determinant"
-    type_attr: "T"
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
     }
   }
-}
-op {
-  name: "LogSoftmax"
-  input_arg {
-    name: "logits"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "logsoftmax"
-    type_attr: "T"
+  attr {
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
-      }
-    }
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
 }
 op {
-  name: "LogUniformCandidateSampler"
+  name: "LoadTPUEmbeddingADAMParametersGradAccumDebug"
   input_arg {
-    name: "true_classes"
-    type: DT_INT64
+    name: "parameters"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "sampled_candidates"
-    type: DT_INT64
+  input_arg {
+    name: "momenta"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "true_expected_count"
+  input_arg {
+    name: "velocities"
     type: DT_FLOAT
   }
-  output_arg {
-    name: "sampled_expected_count"
+  input_arg {
+    name: "gradient_accumulators"
     type: DT_FLOAT
   }
   attr {
-    name: "num_true"
+    name: "table_id"
     type: "int"
+    default_value {
+      i: -1
+    }
     has_minimum: true
-    minimum: 1
+    minimum: -1
   }
   attr {
-    name: "num_sampled"
-    type: "int"
-    has_minimum: true
-    minimum: 1
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "unique"
-    type: "bool"
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "range_max"
+    name: "shard_id"
     type: "int"
-    has_minimum: true
-    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "LoadTPUEmbeddingAdadeltaParameters"
+  input_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "updates"
+    type: DT_FLOAT
   }
   attr {
-    name: "seed"
+    name: "table_id"
     type: "int"
     default_value {
-      i: 0
+      i: -1
     }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "seed2"
-    type: "int"
+    name: "table_name"
+    type: "string"
     default_value {
-      i: 0
+      s: ""
     }
   }
-  is_stateful: true
-}
-op {
-  name: "LogicalAnd"
-  input_arg {
-    name: "x"
-    type: DT_BOOL
-  }
-  input_arg {
-    name: "y"
-    type: DT_BOOL
+  attr {
+    name: "num_shards"
+    type: "int"
   }
-  output_arg {
-    name: "z"
-    type: DT_BOOL
+  attr {
+    name: "shard_id"
+    type: "int"
   }
-  is_commutative: true
+  is_stateful: true
 }
 op {
-  name: "LogicalNot"
+  name: "LoadTPUEmbeddingAdadeltaParametersGradAccumDebug"
   input_arg {
-    name: "x"
-    type: DT_BOOL
-  }
-  output_arg {
-    name: "y"
-    type: DT_BOOL
+    name: "parameters"
+    type: DT_FLOAT
   }
-}
-op {
-  name: "LogicalOr"
   input_arg {
-    name: "x"
-    type: DT_BOOL
+    name: "accumulators"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "y"
-    type: DT_BOOL
-  }
-  output_arg {
-    name: "z"
-    type: DT_BOOL
+    name: "updates"
+    type: DT_FLOAT
   }
-  is_commutative: true
-}
-op {
-  name: "LookupTableExport"
   input_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "keys"
-    type_attr: "Tkeys"
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
-  output_arg {
-    name: "values"
-    type_attr: "Tvalues"
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "Tkeys"
-    type: "type"
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "Tvalues"
-    type: "type"
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
 }
 op {
-  name: "LookupTableExportV2"
+  name: "LoadTPUEmbeddingAdagradParameters"
   input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+    name: "parameters"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "keys"
-    type_attr: "Tkeys"
+  input_arg {
+    name: "accumulators"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "values"
-    type_attr: "Tvalues"
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "Tkeys"
-    type: "type"
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "Tvalues"
-    type: "type"
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "LookupTableFind"
+  name: "LoadTPUEmbeddingAdagradParametersGradAccumDebug"
   input_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "keys"
-    type_attr: "Tin"
+    name: "accumulators"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "default_value"
-    type_attr: "Tout"
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "values"
-    type_attr: "Tout"
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "Tin"
-    type: "type"
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "Tout"
-    type: "type"
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
 }
 op {
-  name: "LookupTableFindV2"
+  name: "LoadTPUEmbeddingCenteredRMSPropParameters"
   input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "keys"
-    type_attr: "Tin"
+    name: "ms"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "default_value"
-    type_attr: "Tout"
+    name: "mom"
+    type: DT_FLOAT
   }
-  output_arg {
-    name: "values"
-    type_attr: "Tout"
+  input_arg {
+    name: "mg"
+    type: DT_FLOAT
   }
   attr {
-    name: "Tin"
-    type: "type"
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "Tout"
-    type: "type"
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "LookupTableImport"
+  name: "LoadTPUEmbeddingFTRLParameters"
   input_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "keys"
-    type_attr: "Tin"
+    name: "accumulators"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "values"
-    type_attr: "Tout"
+    name: "linears"
+    type: DT_FLOAT
   }
   attr {
-    name: "Tin"
-    type: "type"
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "Tout"
-    type: "type"
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
 }
 op {
-  name: "LookupTableImportV2"
+  name: "LoadTPUEmbeddingFTRLParametersGradAccumDebug"
   input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "keys"
-    type_attr: "Tin"
+    name: "accumulators"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "values"
-    type_attr: "Tout"
-  }
-  attr {
-    name: "Tin"
-    type: "type"
-  }
-  attr {
-    name: "Tout"
-    type: "type"
-  }
-  is_stateful: true
-}
-op {
-  name: "LookupTableInsert"
-  input_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
+    name: "linears"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "keys"
-    type_attr: "Tin"
-  }
-  input_arg {
-    name: "values"
-    type_attr: "Tout"
-  }
-  attr {
-    name: "Tin"
-    type: "type"
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "Tout"
-    type: "type"
-  }
-}
-op {
-  name: "LookupTableInsertV2"
-  input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "keys"
-    type_attr: "Tin"
-  }
-  input_arg {
-    name: "values"
-    type_attr: "Tout"
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "Tin"
-    type: "type"
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "Tout"
-    type: "type"
-  }
-  is_stateful: true
-}
-op {
-  name: "LookupTableRemoveV2"
-  input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "keys"
-    type_attr: "Tin"
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "Tin"
-    type: "type"
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "LookupTableSize"
-  input_arg {
-    name: "table_handle"
-    type: DT_STRING
-    is_ref: true
-  }
-  output_arg {
-    name: "size"
-    type: DT_INT64
-  }
-}
-op {
-  name: "LookupTableSizeV2"
+  name: "LoadTPUEmbeddingMDLAdagradLightParameters"
   input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
-  }
-  output_arg {
-    name: "size"
-    type: DT_INT64
+    name: "parameters"
+    type: DT_FLOAT
   }
-  is_stateful: true
-}
-op {
-  name: "LoopCond"
   input_arg {
-    name: "input"
-    type: DT_BOOL
-  }
-  output_arg {
-    name: "output"
-    type: DT_BOOL
+    name: "accumulators"
+    type: DT_FLOAT
   }
-}
-op {
-  name: "LowerBound"
   input_arg {
-    name: "sorted_inputs"
-    type_attr: "T"
+    name: "weights"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "values"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "out_type"
+    name: "benefits"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "out_type"
-    type: "type"
+    name: "table_name"
+    type: "string"
     default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+      s: ""
     }
   }
-}
-op {
-  name: "Lu"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "lu"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "p"
-    type_attr: "output_idx_type"
-  }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "output_idx_type"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
 }
 op {
-  name: "MakeIterator"
+  name: "LoadTPUEmbeddingMomentumParameters"
   input_arg {
-    name: "dataset"
-    type: DT_VARIANT
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "iterator"
-    type: DT_RESOURCE
-  }
-  is_stateful: true
-}
-op {
-  name: "MapClear"
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+    name: "momenta"
+    type: DT_FLOAT
   }
   attr {
-    name: "memory_limit"
+    name: "table_id"
     type: "int"
     default_value {
-      i: 0
+      i: -1
     }
     has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
-  }
-  attr {
-    name: "container"
+    name: "table_name"
     type: "string"
     default_value {
       s: ""
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "MapDataset"
+  name: "LoadTPUEmbeddingMomentumParametersGradAccumDebug"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "momenta"
+    type: DT_FLOAT
   }
-  attr {
-    name: "use_inter_op_parallelism"
-    type: "bool"
-    default_value {
-      b: true
-    }
+  input_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "preserve_cardinality"
-    type: "bool"
+    name: "table_id"
+    type: "int"
     default_value {
-      b: false
+      i: -1
     }
-  }
-}
-op {
-  name: "MapDefun"
-  input_arg {
-    name: "arguments"
-    type_list_attr: "Targuments"
-  }
-  input_arg {
-    name: "captured_inputs"
-    type_list_attr: "Tcaptured"
-  }
-  output_arg {
-    name: "output"
-    type_list_attr: "output_types"
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
     has_minimum: true
-    minimum: 1
+    minimum: -1
   }
   attr {
-    name: "Tcaptured"
-    type: "list(type)"
+    name: "table_name"
+    type: "string"
     default_value {
-      list {
-      }
+      s: ""
     }
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "f"
-    type: "func"
+    name: "shard_id"
+    type: "int"
   }
+  is_stateful: true
 }
 op {
-  name: "MapIncompleteSize"
-  output_arg {
-    name: "size"
-    type: DT_INT32
+  name: "LoadTPUEmbeddingProximalAdagradParameters"
+  input_arg {
+    name: "parameters"
+    type: DT_FLOAT
   }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+  input_arg {
+    name: "accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "memory_limit"
+    name: "table_id"
     type: "int"
     default_value {
-      i: 0
+      i: -1
     }
     has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
-  }
-  attr {
-    name: "container"
+    name: "table_name"
     type: "string"
     default_value {
       s: ""
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "MapPeek"
+  name: "LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug"
   input_arg {
-    name: "key"
-    type: DT_INT64
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "indices"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "values"
-    type_list_attr: "dtypes"
+    name: "accumulators"
+    type: DT_FLOAT
   }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+  input_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "memory_limit"
+    name: "table_id"
     type: "int"
     default_value {
-      i: 0
+      i: -1
     }
     has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "container"
+    name: "table_name"
     type: "string"
     default_value {
       s: ""
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "MapSize"
-  output_arg {
-    name: "size"
-    type: DT_INT32
+  name: "LoadTPUEmbeddingRMSPropParameters"
+  input_arg {
+    name: "parameters"
+    type: DT_FLOAT
   }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+  input_arg {
+    name: "ms"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "mom"
+    type: DT_FLOAT
   }
   attr {
-    name: "memory_limit"
+    name: "table_id"
     type: "int"
     default_value {
-      i: 0
+      i: -1
     }
     has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
-  }
-  attr {
-    name: "container"
+    name: "table_name"
     type: "string"
     default_value {
       s: ""
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "MapStage"
+  name: "LoadTPUEmbeddingRMSPropParametersGradAccumDebug"
   input_arg {
-    name: "key"
-    type: DT_INT64
+    name: "parameters"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "indices"
-    type: DT_INT32
+    name: "ms"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "values"
-    type_list_attr: "fake_dtypes"
+    name: "mom"
+    type: DT_FLOAT
   }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+  input_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "memory_limit"
+    name: "table_id"
     type: "int"
     default_value {
-      i: 0
+      i: -1
     }
     has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
-  }
-  attr {
-    name: "fake_dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "container"
+    name: "table_name"
     type: "string"
     default_value {
       s: ""
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "MapUnstage"
-  input_arg {
-    name: "key"
-    type: DT_INT64
-  }
+  name: "LoadTPUEmbeddingStochasticGradientDescentParameters"
   input_arg {
-    name: "indices"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "values"
-    type_list_attr: "dtypes"
-  }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
+    name: "parameters"
+    type: DT_FLOAT
   }
   attr {
-    name: "memory_limit"
+    name: "table_id"
     type: "int"
     default_value {
-      i: 0
+      i: -1
     }
     has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "container"
+    name: "table_name"
     type: "string"
     default_value {
       s: ""
     }
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "MapUnstageNoKey"
+  name: "Log"
   input_arg {
-    name: "indices"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "key"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "values"
-    type_list_attr: "dtypes"
-  }
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  is_stateful: true
-}
-op {
-  name: "MatMul"
-  input_arg {
-    name: "a"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "b"
+    name: "x"
     type_attr: "T"
   }
   output_arg {
-    name: "product"
+    name: "y"
     type_attr: "T"
   }
-  attr {
-    name: "transpose_a"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "transpose_b"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
   attr {
     name: "T"
     type: "type"
@@ -16309,8 +17064,6 @@ op {
         type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
         type: DT_COMPLEX64
         type: DT_COMPLEX128
       }
@@ -16318,60 +17071,42 @@ op {
   }
 }
 op {
-  name: "MatchingFiles"
-  input_arg {
-    name: "pattern"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "filenames"
-    type: DT_STRING
-  }
-}
-op {
-  name: "MatrixBandPart"
+  name: "Log1p"
   input_arg {
-    name: "input"
+    name: "x"
     type_attr: "T"
   }
-  input_arg {
-    name: "num_lower"
-    type_attr: "Tindex"
-  }
-  input_arg {
-    name: "num_upper"
-    type_attr: "Tindex"
-  }
   output_arg {
-    name: "band"
+    name: "y"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "Tindex"
-    type: "type"
-    default_value {
-      type: DT_INT64
-    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "MatrixDeterminant"
+  name: "LogMatrixDeterminant"
   input_arg {
     name: "input"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "sign"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "log_abs_determinant"
     type_attr: "T"
   }
   attr {
@@ -16379,6 +17114,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -16388,337 +17124,1347 @@ op {
   }
 }
 op {
-  name: "MatrixDiag"
+  name: "LogSoftmax"
   input_arg {
-    name: "diagonal"
+    name: "logits"
     type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "logsoftmax"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
 }
 op {
-  name: "MatrixDiagPart"
+  name: "LogUniformCandidateSampler"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "true_classes"
+    type: DT_INT64
   }
   output_arg {
-    name: "diagonal"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
+    name: "sampled_candidates"
+    type: DT_INT64
   }
-}
-op {
-  name: "MatrixExponential"
-  input_arg {
-    name: "input"
-    type_attr: "T"
+  output_arg {
+    name: "true_expected_count"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "sampled_expected_count"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+    name: "num_true"
+    type: "int"
+    has_minimum: true
+    minimum: 1
   }
-  deprecation {
-    version: 27
-    explanation: "Use Python implementation tf.linalg.matrix_exponential instead."
+  attr {
+    name: "num_sampled"
+    type: "int"
+    has_minimum: true
+    minimum: 1
   }
-}
-op {
-  name: "MatrixInverse"
-  input_arg {
-    name: "input"
-    type_attr: "T"
+  attr {
+    name: "unique"
+    type: "bool"
   }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+  attr {
+    name: "range_max"
+    type: "int"
+    has_minimum: true
+    minimum: 1
   }
   attr {
-    name: "adjoint"
-    type: "bool"
+    name: "seed"
+    type: "int"
     default_value {
-      b: false
+      i: 0
     }
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
     }
   }
+  is_stateful: true
 }
 op {
-  name: "MatrixLogarithm"
+  name: "LogicalAnd"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "x"
+    type: DT_BOOL
+  }
+  input_arg {
+    name: "y"
+    type: DT_BOOL
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "z"
+    type: DT_BOOL
   }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
+  is_commutative: true
+}
+op {
+  name: "LogicalNot"
+  input_arg {
+    name: "x"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "y"
+    type: DT_BOOL
   }
 }
 op {
-  name: "MatrixSetDiag"
+  name: "LogicalOr"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "x"
+    type: DT_BOOL
   }
   input_arg {
-    name: "diagonal"
-    type_attr: "T"
+    name: "y"
+    type: DT_BOOL
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "z"
+    type: DT_BOOL
+  }
+  is_commutative: true
+}
+op {
+  name: "LookupTableExport"
+  input_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  output_arg {
+    name: "keys"
+    type_attr: "Tkeys"
+  }
+  output_arg {
+    name: "values"
+    type_attr: "Tvalues"
   }
   attr {
-    name: "T"
+    name: "Tkeys"
+    type: "type"
+  }
+  attr {
+    name: "Tvalues"
     type: "type"
   }
 }
 op {
-  name: "MatrixSolve"
+  name: "LookupTableExportV2"
   input_arg {
-    name: "matrix"
-    type_attr: "T"
+    name: "table_handle"
+    type: DT_RESOURCE
   }
-  input_arg {
-    name: "rhs"
-    type_attr: "T"
+  output_arg {
+    name: "keys"
+    type_attr: "Tkeys"
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "values"
+    type_attr: "Tvalues"
   }
   attr {
-    name: "adjoint"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "Tkeys"
+    type: "type"
   }
   attr {
-    name: "T"
+    name: "Tvalues"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
   }
+  is_stateful: true
 }
 op {
-  name: "MatrixSolveLs"
+  name: "LookupTableFind"
   input_arg {
-    name: "matrix"
-    type_attr: "T"
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
   }
   input_arg {
-    name: "rhs"
-    type_attr: "T"
+    name: "keys"
+    type_attr: "Tin"
   }
   input_arg {
-    name: "l2_regularizer"
-    type: DT_DOUBLE
+    name: "default_value"
+    type_attr: "Tout"
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "values"
+    type_attr: "Tout"
   }
   attr {
-    name: "T"
+    name: "Tin"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
   }
   attr {
-    name: "fast"
-    type: "bool"
-    default_value {
-      b: true
-    }
+    name: "Tout"
+    type: "type"
   }
 }
 op {
-  name: "MatrixSquareRoot"
+  name: "LookupTableFindV2"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "default_value"
+    type_attr: "Tout"
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "values"
+    type_attr: "Tout"
   }
   attr {
-    name: "T"
+    name: "Tin"
+    type: "type"
+  }
+  attr {
+    name: "Tout"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
   }
+  is_stateful: true
 }
 op {
-  name: "MatrixTriangularSolve"
+  name: "LookupTableImport"
   input_arg {
-    name: "matrix"
-    type_attr: "T"
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
   }
   input_arg {
-    name: "rhs"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "keys"
+    type_attr: "Tin"
   }
-  attr {
-    name: "lower"
-    type: "bool"
-    default_value {
-      b: true
-    }
+  input_arg {
+    name: "values"
+    type_attr: "Tout"
   }
   attr {
-    name: "adjoint"
-    type: "bool"
-    default_value {
-      b: false
-    }
+    name: "Tin"
+    type: "type"
   }
   attr {
-    name: "T"
+    name: "Tout"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
   }
 }
 op {
-  name: "Max"
+  name: "LookupTableImportV2"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "table_handle"
+    type: DT_RESOURCE
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "keys"
+    type_attr: "Tin"
   }
-  attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  input_arg {
+    name: "values"
+    type_attr: "Tout"
   }
   attr {
-    name: "T"
+    name: "Tin"
     type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_INT64
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
   }
   attr {
-    name: "Tidx"
+    name: "Tout"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
   }
+  is_stateful: true
 }
 op {
-  name: "MaxPool"
+  name: "LookupTableInsert"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
   }
-  output_arg {
-    name: "output"
-    type_attr: "T"
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+  }
+}
+op {
+  name: "LookupTableInsertV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  attr {
+    name: "Tout"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "LookupTableRemoveV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "keys"
+    type_attr: "Tin"
+  }
+  attr {
+    name: "Tin"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "LookupTableSize"
+  input_arg {
+    name: "table_handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  output_arg {
+    name: "size"
+    type: DT_INT64
+  }
+}
+op {
+  name: "LookupTableSizeV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "size"
+    type: DT_INT64
+  }
+  is_stateful: true
+}
+op {
+  name: "LoopCond"
+  input_arg {
+    name: "input"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "output"
+    type: DT_BOOL
+  }
+}
+op {
+  name: "LowerBound"
+  input_arg {
+    name: "sorted_inputs"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Lu"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "lu"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "p"
+    type_attr: "output_idx_type"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  attr {
+    name: "output_idx_type"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "MakeIterator"
+  input_arg {
+    name: "dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "iterator"
+    type: DT_RESOURCE
+  }
+  is_stateful: true
+}
+op {
+  name: "MapClear"
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "MapDefun"
+  input_arg {
+    name: "arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "captured_inputs"
+    type_list_attr: "Tcaptured"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Tcaptured"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+}
+op {
+  name: "MapIncompleteSize"
+  output_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MapPeek"
+  input_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MapSize"
+  output_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MapStage"
+  input_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "values"
+    type_list_attr: "fake_dtypes"
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+  }
+  attr {
+    name: "fake_dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MapUnstage"
+  input_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MapUnstageNoKey"
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "key"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "MatMul"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "product"
+    type_attr: "T"
+  }
+  attr {
+    name: "transpose_a"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "transpose_b"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "MatchingFiles"
+  input_arg {
+    name: "pattern"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "filenames"
+    type: DT_STRING
+  }
+}
+op {
+  name: "MatrixBandPart"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_lower"
+    type_attr: "Tindex"
+  }
+  input_arg {
+    name: "num_upper"
+    type_attr: "Tindex"
+  }
+  output_arg {
+    name: "band"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tindex"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "MatrixDeterminant"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "MatrixDiag"
+  input_arg {
+    name: "diagonal"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "MatrixDiagPart"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "diagonal"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "MatrixExponential"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  deprecation {
+    version: 27
+    explanation: "Use Python implementation tf.linalg.matrix_exponential instead."
+  }
+}
+op {
+  name: "MatrixInverse"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "adjoint"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "MatrixLogarithm"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "MatrixSetDiag"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "diagonal"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "MatrixSolve"
+  input_arg {
+    name: "matrix"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rhs"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "adjoint"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "MatrixSolveLs"
+  input_arg {
+    name: "matrix"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rhs"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2_regularizer"
+    type: DT_DOUBLE
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  attr {
+    name: "fast"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "MatrixSquareRoot"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "MatrixTriangularSolve"
+  input_arg {
+    name: "matrix"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rhs"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "lower"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "adjoint"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "Max"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "MaxPool"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -17850,91 +19596,180 @@ op {
   is_commutative: true
 }
 op {
-  name: "MirrorPad"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "paddings"
-    type_attr: "Tpaddings"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-  }
-  attr {
-    name: "Tpaddings"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "mode"
-    type: "string"
-    allowed_values {
-      list {
-        s: "REFLECT"
-        s: "SYMMETRIC"
-      }
-    }
-  }
-}
-op {
-  name: "MirrorPadGrad"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "paddings"
-    type_attr: "Tpaddings"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-  }
-  attr {
-    name: "Tpaddings"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "mode"
-    type: "string"
-    allowed_values {
-      list {
-        s: "REFLECT"
-        s: "SYMMETRIC"
-      }
-    }
-  }
-}
-op {
-  name: "Mod"
+  name: "MirrorPad"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "paddings"
+    type_attr: "Tpaddings"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tpaddings"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "mode"
+    type: "string"
+    allowed_values {
+      list {
+        s: "REFLECT"
+        s: "SYMMETRIC"
+      }
+    }
+  }
+}
+op {
+  name: "MirrorPadGrad"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "paddings"
+    type_attr: "Tpaddings"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tpaddings"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "mode"
+    type: "string"
+    allowed_values {
+      list {
+        s: "REFLECT"
+        s: "SYMMETRIC"
+      }
+    }
+  }
+}
+op {
+  name: "Mod"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "ModelDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "Mul"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "MulNoNan"
   input_arg {
     name: "x"
     type_attr: "T"
@@ -17952,74 +19787,11 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_HALF
-        type: DT_HALF
-        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-}
-op {
-  name: "ModelDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
-op {
-  name: "Mul"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "y"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "z"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_BFLOAT16
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-      }
-    }
-  }
   is_commutative: true
 }
 op {
@@ -18662,6 +20434,29 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "NearestNeighbors"
+  input_arg {
+    name: "points"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "centers"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "k"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "nearest_center_indices"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "nearest_center_distances"
+    type: DT_FLOAT
+  }
+}
 op {
   name: "Neg"
   input_arg {
@@ -18727,6 +20522,34 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "NextAfter"
+  input_arg {
+    name: "x1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x2"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+      }
+    }
+  }
+}
 op {
   name: "NextIteration"
   input_arg {
@@ -19544,6 +21367,80 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "OutfeedDequeue"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OutfeedDequeueTuple"
+  output_arg {
+    name: "outputs"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+  }
+  attr {
+    name: "device_ordinal"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "OutfeedEnqueue"
+  input_arg {
+    name: "input"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "OutfeedEnqueueTuple"
+  input_arg {
+    name: "inputs"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "Pack"
   input_arg {
@@ -20670,658 +22567,1499 @@ op {
       }
     }
   }
-}
-op {
-  name: "PlaceholderV2"
-  output_arg {
-    name: "output"
-    type_attr: "dtype"
-  }
+}
+op {
+  name: "PlaceholderV2"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  deprecation {
+    version: 23
+    explanation: "Placeholder now behaves the same as PlaceholderV2."
+  }
+}
+op {
+  name: "PlaceholderWithDefault"
+  input_arg {
+    name: "input"
+    type_attr: "dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+}
+op {
+  name: "Polygamma"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "PopulationCount"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type: DT_UINT8
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "Pow"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "PrefetchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "buffer_size"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "PreventGradient"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "message"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
+op {
+  name: "Print"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "data"
+    type_list_attr: "U"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "U"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "message"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "first_n"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "summarize"
+    type: "int"
+    default_value {
+      i: 3
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "PrintV2"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  attr {
+    name: "output_stream"
+    type: "string"
+    default_value {
+      s: "stderr"
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "PriorityQueue"
+  output_arg {
+    name: "handle"
+    type: DT_STRING
+    is_ref: true
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+    has_minimum: true
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "PriorityQueueV2"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "component_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "shapes"
+    type: "list(shape)"
+    has_minimum: true
+  }
+  attr {
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Prod"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "PyFunc"
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "token"
+    type: "string"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  is_stateful: true
+}
+op {
+  name: "PyFuncStateless"
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "token"
+    type: "string"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+}
+op {
+  name: "Qr"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "q"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "r"
+    type_attr: "T"
+  }
+  attr {
+    name: "full_matrices"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "QuantizeAndDequantize"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "input_min"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
   attr {
-    name: "dtype"
-    type: "type"
+    name: "input_max"
+    type: "float"
+    default_value {
+      f: 0
+    }
   }
   attr {
-    name: "shape"
-    type: "shape"
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
   }
   deprecation {
-    version: 23
-    explanation: "Placeholder now behaves the same as PlaceholderV2."
+    version: 22
+    explanation: "Replaced by QuantizeAndDequantizeV2"
   }
 }
 op {
-  name: "PlaceholderWithDefault"
+  name: "QuantizeAndDequantizeV2"
   input_arg {
     name: "input"
-    type_attr: "dtype"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "dtype"
-  }
-  attr {
-    name: "dtype"
-    type: "type"
-  }
-  attr {
-    name: "shape"
-    type: "shape"
+    type_attr: "T"
   }
-}
-op {
-  name: "Polygamma"
   input_arg {
-    name: "a"
+    name: "input_min"
     type_attr: "T"
   }
   input_arg {
-    name: "x"
+    name: "input_max"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
+    name: "output"
     type_attr: "T"
   }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
     }
   }
-}
-op {
-  name: "PopulationCount"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "y"
-    type: DT_UINT8
-  }
   attr {
-    name: "T"
-    type: "type"
+    name: "round_mode"
+    type: "string"
+    default_value {
+      s: "HALF_TO_EVEN"
+    }
     allowed_values {
       list {
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_UINT32
-        type: DT_UINT64
+        s: "HALF_TO_EVEN"
+        s: "HALF_UP"
       }
     }
   }
 }
 op {
-  name: "Pow"
+  name: "QuantizeAndDequantizeV3"
   input_arg {
-    name: "x"
+    name: "input"
     type_attr: "T"
   }
   input_arg {
-    name: "y"
+    name: "input_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_max"
     type_attr: "T"
   }
+  input_arg {
+    name: "num_bits"
+    type: DT_INT32
+  }
   output_arg {
-    name: "z"
+    name: "output"
     type_attr: "T"
   }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
         type: DT_BFLOAT16
-        type: DT_FLOAT
         type: DT_HALF
+        type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "PrefetchDataset"
+  name: "QuantizeDownAndShrinkRange"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "input"
+    type_attr: "Tinput"
   }
   input_arg {
-    name: "buffer_size"
-    type: DT_INT64
+    name: "input_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "input_max"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "out_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
   }
 }
 op {
-  name: "PreventGradient"
+  name: "QuantizeV2"
   input_arg {
     name: "input"
-    type_attr: "T"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_range"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_range"
+    type: DT_FLOAT
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
   attr {
     name: "T"
     type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
   }
   attr {
-    name: "message"
+    name: "mode"
     type: "string"
     default_value {
-      s: ""
+      s: "MIN_COMBINED"
+    }
+    allowed_values {
+      list {
+        s: "MIN_COMBINED"
+        s: "MIN_FIRST"
+        s: "SCALED"
+      }
+    }
+  }
+  attr {
+    name: "round_mode"
+    type: "string"
+    default_value {
+      s: "HALF_AWAY_FROM_ZERO"
+    }
+    allowed_values {
+      list {
+        s: "HALF_AWAY_FROM_ZERO"
+        s: "HALF_TO_EVEN"
+      }
     }
   }
 }
 op {
-  name: "Print"
+  name: "QuantizedAdd"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "x"
+    type_attr: "T1"
   }
   input_arg {
-    name: "data"
-    type_list_attr: "U"
+    name: "y"
+    type_attr: "T2"
+  }
+  input_arg {
+    name: "min_x"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_x"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_y"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_y"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "z"
+    type_attr: "Toutput"
   }
-  attr {
-    name: "T"
-    type: "type"
+  output_arg {
+    name: "min_z"
+    type: DT_FLOAT
   }
-  attr {
-    name: "U"
-    type: "list(type)"
-    has_minimum: true
+  output_arg {
+    name: "max_z"
+    type: DT_FLOAT
   }
   attr {
-    name: "message"
-    type: "string"
-    default_value {
-      s: ""
+    name: "T1"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "first_n"
-    type: "int"
-    default_value {
-      i: -1
+    name: "T2"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "summarize"
-    type: "int"
+    name: "Toutput"
+    type: "type"
     default_value {
-      i: 3
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
-  is_stateful: true
+  is_commutative: true
 }
 op {
-  name: "PrintV2"
+  name: "QuantizedAvgPool"
   input_arg {
     name: "input"
-    type: DT_STRING
+    type_attr: "T"
   }
-  attr {
-    name: "output_stream"
-    type: "string"
-    default_value {
-      s: "stderr"
-    }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
   }
-  is_stateful: true
-}
-op {
-  name: "PriorityQueue"
   output_arg {
-    name: "handle"
-    type: DT_STRING
-    is_ref: true
+    name: "output"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
   }
   attr {
-    name: "component_types"
-    type: "list(type)"
-    default_value {
+    name: "T"
+    type: "type"
+    allowed_values {
       list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
-    has_minimum: true
-  }
-  attr {
-    name: "shapes"
-    type: "list(shape)"
-    has_minimum: true
   }
   attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: -1
-    }
+    name: "ksize"
+    type: "list(int)"
   }
   attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "strides"
+    type: "list(int)"
   }
   attr {
-    name: "shared_name"
+    name: "padding"
     type: "string"
-    default_value {
-      s: ""
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
-  is_stateful: true
 }
 op {
-  name: "PriorityQueueV2"
+  name: "QuantizedBatchNormWithGlobalNormalization"
+  input_arg {
+    name: "t"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "t_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "t_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "m"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "m_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "m_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "v"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "v_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "v_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "beta"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "beta_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "beta_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "gamma"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "gamma_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "gamma_max"
+    type: DT_FLOAT
+  }
   output_arg {
-    name: "handle"
-    type: DT_RESOURCE
+    name: "result"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "result_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "result_max"
+    type: DT_FLOAT
   }
   attr {
-    name: "component_types"
-    type: "list(type)"
-    default_value {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
       list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
-    has_minimum: true
-  }
-  attr {
-    name: "shapes"
-    type: "list(shape)"
-    has_minimum: true
   }
   attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: -1
+    name: "out_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "variance_epsilon"
+    type: "float"
   }
   attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
+    name: "scale_after_normalization"
+    type: "bool"
   }
-  is_stateful: true
 }
 op {
-  name: "Prod"
+  name: "QuantizedBiasAdd"
   input_arg {
     name: "input"
-    type_attr: "T"
+    type_attr: "T1"
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "bias"
+    type_attr: "T2"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_bias"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_bias"
+    type: DT_FLOAT
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_out"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_out"
+    type: DT_FLOAT
   }
   attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
+    name: "T1"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "T"
+    name: "T2"
     type: "type"
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
   attr {
-    name: "Tidx"
+    name: "out_type"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
 }
 op {
-  name: "PyFunc"
+  name: "QuantizedConcat"
   input_arg {
-    name: "input"
-    type_list_attr: "Tin"
+    name: "concat_dim"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "values"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  input_arg {
+    name: "input_mins"
+    type: DT_FLOAT
+    number_attr: "N"
+  }
+  input_arg {
+    name: "input_maxes"
+    type: DT_FLOAT
+    number_attr: "N"
   }
   output_arg {
     name: "output"
-    type_list_attr: "Tout"
+    type_attr: "T"
   }
-  attr {
-    name: "token"
-    type: "string"
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
   }
   attr {
-    name: "Tin"
-    type: "list(type)"
+    name: "N"
+    type: "int"
     has_minimum: true
+    minimum: 2
   }
   attr {
-    name: "Tout"
-    type: "list(type)"
-    has_minimum: true
+    name: "T"
+    type: "type"
   }
-  is_stateful: true
 }
 op {
-  name: "PyFuncStateless"
+  name: "QuantizedConv2D"
   input_arg {
     name: "input"
-    type_list_attr: "Tin"
+    type_attr: "Tinput"
   }
-  output_arg {
-    name: "output"
-    type_list_attr: "Tout"
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
   }
-  attr {
-    name: "token"
-    type: "string"
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
   }
-  attr {
-    name: "Tin"
-    type: "list(type)"
-    has_minimum: true
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
   }
-  attr {
-    name: "Tout"
-    type: "list(type)"
-    has_minimum: true
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
   }
-}
-op {
-  name: "Qr"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "max_filter"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "q"
-    type_attr: "T"
+    name: "output"
+    type_attr: "out_type"
   }
   output_arg {
-    name: "r"
-    type_attr: "T"
+    name: "min_output"
+    type: DT_FLOAT
   }
-  attr {
-    name: "full_matrices"
-    type: "bool"
-    default_value {
-      b: false
-    }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
   }
   attr {
-    name: "T"
+    name: "Tinput"
     type: "type"
     allowed_values {
       list {
-        type: DT_DOUBLE
-        type: DT_FLOAT
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
-}
-op {
-  name: "QuantizeAndDequantize"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
   attr {
-    name: "signed_input"
-    type: "bool"
-    default_value {
-      b: true
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "num_bits"
-    type: "int"
+    name: "out_type"
+    type: "type"
     default_value {
-      i: 8
+      type: DT_QINT32
     }
-  }
-  attr {
-    name: "range_given"
-    type: "bool"
-    default_value {
-      b: false
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "input_min"
-    type: "float"
-    default_value {
-      f: 0
-    }
+    name: "strides"
+    type: "list(int)"
   }
   attr {
-    name: "input_max"
-    type: "float"
-    default_value {
-      f: 0
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
     }
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
       list {
-        type: DT_BFLOAT16
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        i: 1
+        i: 1
+        i: 1
+        i: 1
       }
     }
   }
-  deprecation {
-    version: 22
-    explanation: "Replaced by QuantizeAndDequantizeV2"
-  }
 }
 op {
-  name: "QuantizeAndDequantizeV2"
+  name: "QuantizedConv2DAndRelu"
   input_arg {
     name: "input"
-    type_attr: "T"
+    type_attr: "Tinput"
   }
   input_arg {
-    name: "input_min"
-    type_attr: "T"
+    name: "filter"
+    type_attr: "Tfilter"
   }
   input_arg {
-    name: "input_max"
-    type_attr: "T"
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
   }
   attr {
-    name: "signed_input"
-    type: "bool"
-    default_value {
-      b: true
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "num_bits"
-    type: "int"
-    default_value {
-      i: 8
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "range_given"
-    type: "bool"
+    name: "out_type"
+    type: "type"
     default_value {
-      b: false
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
     allowed_values {
       list {
-        type: DT_BFLOAT16
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        s: "SAME"
+        s: "VALID"
       }
     }
   }
   attr {
-    name: "round_mode"
-    type: "string"
+    name: "dilations"
+    type: "list(int)"
     default_value {
-      s: "HALF_TO_EVEN"
-    }
-    allowed_values {
       list {
-        s: "HALF_TO_EVEN"
-        s: "HALF_UP"
+        i: 1
+        i: 1
+        i: 1
+        i: 1
       }
     }
   }
 }
 op {
-  name: "QuantizeAndDequantizeV3"
+  name: "QuantizedConv2DAndReluAndRequantize"
   input_arg {
     name: "input"
-    type_attr: "T"
+    type_attr: "Tinput"
   }
   input_arg {
-    name: "input_min"
-    type_attr: "T"
+    name: "filter"
+    type_attr: "Tfilter"
   }
   input_arg {
-    name: "input_max"
-    type_attr: "T"
+    name: "min_input"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "num_bits"
-    type: DT_INT32
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
   }
   attr {
-    name: "signed_input"
-    type: "bool"
-    default_value {
-      b: true
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "range_given"
-    type: "bool"
-    default_value {
-      b: true
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
     }
   }
   attr {
-    name: "T"
+    name: "out_type"
     type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
     allowed_values {
       list {
-        type: DT_BFLOAT16
-        type: DT_HALF
-        type: DT_FLOAT
-        type: DT_DOUBLE
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
       }
     }
   }
-}
-op {
-  name: "QuantizeDownAndShrinkRange"
+}
+op {
+  name: "QuantizedConv2DAndRequantize"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
   input_arg {
-    name: "input"
-    type_attr: "Tinput"
+    name: "max_input"
+    type: DT_FLOAT
   }
   input_arg {
-    name: "input_min"
+    name: "min_filter"
     type: DT_FLOAT
   }
   input_arg {
-    name: "input_max"
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
     type: DT_FLOAT
   }
   output_arg {
@@ -21329,11 +24067,11 @@ op {
     type_attr: "out_type"
   }
   output_arg {
-    name: "output_min"
+    name: "min_output"
     type: DT_FLOAT
   }
   output_arg {
-    name: "output_max"
+    name: "max_output"
     type: DT_FLOAT
   }
   attr {
@@ -21349,9 +24087,25 @@ op {
       }
     }
   }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
   attr {
     name: "out_type"
     type: "type"
+    default_value {
+      type: DT_QINT8
+    }
     allowed_values {
       list {
         type: DT_QINT8
@@ -21362,35 +24116,77 @@ op {
       }
     }
   }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
 }
 op {
-  name: "QuantizeV2"
+  name: "QuantizedConv2DWithBias"
   input_arg {
     name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "bias"
     type: DT_FLOAT
   }
   input_arg {
-    name: "min_range"
+    name: "min_input"
     type: DT_FLOAT
   }
   input_arg {
-    name: "max_range"
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
     type: DT_FLOAT
   }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "out_type"
   }
   output_arg {
-    name: "output_min"
+    name: "min_output"
     type: DT_FLOAT
   }
   output_arg {
-    name: "output_max"
+    name: "max_output"
     type: DT_FLOAT
   }
   attr {
-    name: "T"
+    name: "Tinput"
     type: "type"
     allowed_values {
       list {
@@ -21403,73 +24199,105 @@ op {
     }
   }
   attr {
-    name: "mode"
-    type: "string"
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
     default_value {
-      s: "MIN_COMBINED"
+      type: DT_QINT32
     }
     allowed_values {
       list {
-        s: "MIN_COMBINED"
-        s: "MIN_FIRST"
-        s: "SCALED"
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
       }
     }
   }
   attr {
-    name: "round_mode"
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
     type: "string"
-    default_value {
-      s: "HALF_AWAY_FROM_ZERO"
-    }
     allowed_values {
       list {
-        s: "HALF_AWAY_FROM_ZERO"
-        s: "HALF_TO_EVEN"
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
       }
     }
   }
 }
 op {
-  name: "QuantizedAdd"
+  name: "QuantizedConv2DWithBiasAndRelu"
   input_arg {
-    name: "x"
-    type_attr: "T1"
+    name: "input"
+    type_attr: "Tinput"
   }
   input_arg {
-    name: "y"
-    type_attr: "T2"
+    name: "filter"
+    type_attr: "Tfilter"
   }
   input_arg {
-    name: "min_x"
+    name: "bias"
     type: DT_FLOAT
   }
   input_arg {
-    name: "max_x"
+    name: "min_input"
     type: DT_FLOAT
   }
   input_arg {
-    name: "min_y"
+    name: "max_input"
     type: DT_FLOAT
   }
   input_arg {
-    name: "max_y"
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
     type: DT_FLOAT
   }
   output_arg {
-    name: "z"
-    type_attr: "Toutput"
+    name: "output"
+    type_attr: "out_type"
   }
   output_arg {
-    name: "min_z"
+    name: "min_output"
     type: DT_FLOAT
   }
   output_arg {
-    name: "max_z"
+    name: "max_output"
     type: DT_FLOAT
   }
   attr {
-    name: "T1"
+    name: "Tinput"
     type: "type"
     allowed_values {
       list {
@@ -21482,7 +24310,7 @@ op {
     }
   }
   attr {
-    name: "T2"
+    name: "Tfilter"
     type: "type"
     allowed_values {
       list {
@@ -21495,7 +24323,7 @@ op {
     }
   }
   attr {
-    name: "Toutput"
+    name: "out_type"
     type: "type"
     default_value {
       type: DT_QINT32
@@ -21510,13 +24338,46 @@ op {
       }
     }
   }
-  is_commutative: true
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
 }
 op {
-  name: "QuantizedAvgPool"
+  name: "QuantizedConv2DWithBiasAndReluAndRequantize"
   input_arg {
     name: "input"
-    type_attr: "T"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "bias"
+    type_attr: "Tbias"
   }
   input_arg {
     name: "min_input"
@@ -21526,9 +24387,25 @@ op {
     name: "max_input"
     type: DT_FLOAT
   }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
+  }
   output_arg {
     name: "output"
-    type_attr: "T"
+    type_attr: "out_type"
   }
   output_arg {
     name: "min_output"
@@ -21539,7 +24416,7 @@ op {
     type: DT_FLOAT
   }
   attr {
-    name: "T"
+    name: "Tinput"
     type: "type"
     allowed_values {
       list {
@@ -21552,8 +24429,43 @@ op {
     }
   }
   attr {
-    name: "ksize"
-    type: "list(int)"
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tbias"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
   }
   attr {
     name: "strides"
@@ -21569,79 +24481,67 @@ op {
       }
     }
   }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
 }
 op {
-  name: "QuantizedBatchNormWithGlobalNormalization"
-  input_arg {
-    name: "t"
-    type_attr: "Tinput"
-  }
-  input_arg {
-    name: "t_min"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "t_max"
-    type: DT_FLOAT
-  }
+  name: "QuantizedConv2DWithBiasAndRequantize"
   input_arg {
-    name: "m"
+    name: "input"
     type_attr: "Tinput"
   }
   input_arg {
-    name: "m_min"
-    type: DT_FLOAT
-  }
-  input_arg {
-    name: "m_max"
-    type: DT_FLOAT
+    name: "filter"
+    type_attr: "Tfilter"
   }
   input_arg {
-    name: "v"
-    type_attr: "Tinput"
+    name: "bias"
+    type_attr: "Tbias"
   }
   input_arg {
-    name: "v_min"
+    name: "min_input"
     type: DT_FLOAT
   }
   input_arg {
-    name: "v_max"
+    name: "max_input"
     type: DT_FLOAT
   }
   input_arg {
-    name: "beta"
-    type_attr: "Tinput"
-  }
-  input_arg {
-    name: "beta_min"
+    name: "min_filter"
     type: DT_FLOAT
   }
   input_arg {
-    name: "beta_max"
+    name: "max_filter"
     type: DT_FLOAT
   }
   input_arg {
-    name: "gamma"
-    type_attr: "Tinput"
-  }
-  input_arg {
-    name: "gamma_min"
+    name: "min_freezed_output"
     type: DT_FLOAT
   }
   input_arg {
-    name: "gamma_max"
+    name: "max_freezed_output"
     type: DT_FLOAT
   }
   output_arg {
-    name: "result"
+    name: "output"
     type_attr: "out_type"
   }
   output_arg {
-    name: "result_min"
+    name: "min_output"
     type: DT_FLOAT
   }
   output_arg {
-    name: "result_max"
+    name: "max_output"
     type: DT_FLOAT
   }
   attr {
@@ -21657,9 +24557,35 @@ op {
       }
     }
   }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tbias"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_QINT32
+      }
+    }
+  }
   attr {
     name: "out_type"
     type: "type"
+    default_value {
+      type: DT_QINT8
+    }
     allowed_values {
       list {
         type: DT_QINT8
@@ -21671,23 +24597,45 @@ op {
     }
   }
   attr {
-    name: "variance_epsilon"
-    type: "float"
+    name: "strides"
+    type: "list(int)"
   }
   attr {
-    name: "scale_after_normalization"
-    type: "bool"
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
   }
 }
 op {
-  name: "QuantizedBiasAdd"
+  name: "QuantizedConv2DWithBiasSignedSumAndReluAndRequantize"
   input_arg {
     name: "input"
-    type_attr: "T1"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
   }
   input_arg {
     name: "bias"
-    type_attr: "T2"
+    type_attr: "Tbias"
   }
   input_arg {
     name: "min_input"
@@ -21698,11 +24646,31 @@ op {
     type: DT_FLOAT
   }
   input_arg {
-    name: "min_bias"
+    name: "min_filter"
     type: DT_FLOAT
   }
   input_arg {
-    name: "max_bias"
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "summand"
+    type_attr: "Tsummand"
+  }
+  input_arg {
+    name: "min_summand"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_summand"
     type: DT_FLOAT
   }
   output_arg {
@@ -21710,15 +24678,15 @@ op {
     type_attr: "out_type"
   }
   output_arg {
-    name: "min_out"
+    name: "min_output"
     type: DT_FLOAT
   }
   output_arg {
-    name: "max_out"
+    name: "max_output"
     type: DT_FLOAT
   }
   attr {
-    name: "T1"
+    name: "Tinput"
     type: "type"
     allowed_values {
       list {
@@ -21731,7 +24699,7 @@ op {
     }
   }
   attr {
-    name: "T2"
+    name: "Tfilter"
     type: "type"
     allowed_values {
       list {
@@ -21744,7 +24712,17 @@ op {
     }
   }
   attr {
-    name: "out_type"
+    name: "Tbias"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "Tsummand"
     type: "type"
     allowed_values {
       list {
@@ -21756,53 +24734,51 @@ op {
       }
     }
   }
-}
-op {
-  name: "QuantizedConcat"
-  input_arg {
-    name: "concat_dim"
-    type: DT_INT32
-  }
-  input_arg {
-    name: "values"
-    type_attr: "T"
-    number_attr: "N"
-  }
-  input_arg {
-    name: "input_mins"
-    type: DT_FLOAT
-    number_attr: "N"
-  }
-  input_arg {
-    name: "input_maxes"
-    type: DT_FLOAT
-    number_attr: "N"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output_min"
-    type: DT_FLOAT
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
   }
-  output_arg {
-    name: "output_max"
-    type: DT_FLOAT
+  attr {
+    name: "strides"
+    type: "list(int)"
   }
   attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 2
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
   }
 }
 op {
-  name: "QuantizedConv2D"
+  name: "QuantizedConv2DWithBiasSumAndRelu"
   input_arg {
     name: "input"
     type_attr: "Tinput"
@@ -21811,6 +24787,10 @@ op {
     name: "filter"
     type_attr: "Tfilter"
   }
+  input_arg {
+    name: "bias"
+    type: DT_FLOAT
+  }
   input_arg {
     name: "min_input"
     type: DT_FLOAT
@@ -21827,6 +24807,10 @@ op {
     name: "max_filter"
     type: DT_FLOAT
   }
+  input_arg {
+    name: "summand"
+    type: DT_FLOAT
+  }
   output_arg {
     name: "output"
     type_attr: "out_type"
@@ -21908,6 +24892,160 @@ op {
     }
   }
 }
+op {
+  name: "QuantizedConv2DWithBiasSumAndReluAndRequantize"
+  input_arg {
+    name: "input"
+    type_attr: "Tinput"
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "Tfilter"
+  }
+  input_arg {
+    name: "bias"
+    type_attr: "Tbias"
+  }
+  input_arg {
+    name: "min_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_filter"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "min_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_freezed_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "summand"
+    type_attr: "Tsummand"
+  }
+  input_arg {
+    name: "min_summand"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "max_summand"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "min_output"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "max_output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "Tinput"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tfilter"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "Tbias"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_QINT32
+      }
+    }
+  }
+  attr {
+    name: "Tsummand"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
 op {
   name: "QuantizedInstanceNorm"
   input_arg {
@@ -24097,6 +27235,25 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "RecvTPUEmbeddingActivations"
+  output_arg {
+    name: "outputs"
+    type: DT_FLOAT
+    number_attr: "num_outputs"
+  }
+  attr {
+    name: "num_outputs"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "config"
+    type: "string"
+  }
+  is_stateful: true
+}
 op {
   name: "ReduceDataset"
   input_arg {
@@ -24149,6 +27306,7 @@ op {
       b: true
     }
   }
+  is_stateful: true
 }
 op {
   name: "ReduceJoin"
@@ -24643,6 +27801,49 @@ op {
     }
   }
 }
+op {
+  name: "RequantizationRangePerChannel"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "input_max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "clip_value_max"
+    type: "float"
+  }
+}
 op {
   name: "Requantize"
   input_arg {
@@ -24704,6 +27905,73 @@ op {
     }
   }
 }
+op {
+  name: "RequantizePerChannel"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "input_max"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "requested_output_min"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "requested_output_max"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  output_arg {
+    name: "output_min"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output_max"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_QINT32
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_QUINT8
+    }
+    allowed_values {
+      list {
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_QINT16
+        type: DT_QUINT16
+      }
+    }
+  }
+}
 op {
   name: "Reshape"
   input_arg {
@@ -26474,6 +29742,43 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ResourceScatterNdSub"
+  input_arg {
+    name: "ref"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "ResourceScatterNdUpdate"
   input_arg {
@@ -27308,21 +30613,103 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyProximalGradientDescent"
+  name: "ResourceSparseApplyProximalGradientDescent"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "alpha"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyRMSProp"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "alpha"
+    name: "ms"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "mom"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "l1"
+    name: "rho"
     type_attr: "T"
   }
   input_arg {
-    name: "l2"
+    name: "momentum"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
     type_attr: "T"
   }
   input_arg {
@@ -27378,241 +30765,843 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyRMSProp"
+  name: "ResourceStridedSliceAssign"
   input_arg {
-    name: "var"
+    name: "ref"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "ms"
-    type: DT_RESOURCE
+    name: "begin"
+    type_attr: "Index"
   }
   input_arg {
-    name: "mom"
-    type: DT_RESOURCE
+    name: "end"
+    type_attr: "Index"
   }
   input_arg {
-    name: "lr"
-    type_attr: "T"
+    name: "strides"
+    type_attr: "Index"
   }
   input_arg {
-    name: "rho"
+    name: "value"
     type_attr: "T"
   }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Index"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "begin_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "end_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "ellipsis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "new_axis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "shrink_axis_mask"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "Restore"
   input_arg {
-    name: "momentum"
-    type_attr: "T"
+    name: "file_pattern"
+    type: DT_STRING
   }
   input_arg {
-    name: "epsilon"
-    type_attr: "T"
+    name: "tensor_name"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "dt"
+  }
+  attr {
+    name: "dt"
+    type: "type"
+  }
+  attr {
+    name: "preferred_shard"
+    type: "int"
+    default_value {
+      i: -1
+    }
   }
+  is_stateful: true
+}
+op {
+  name: "RestoreSlice"
   input_arg {
-    name: "grad"
-    type_attr: "T"
+    name: "file_pattern"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "shape_and_slice"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "dt"
+  }
+  attr {
+    name: "dt"
+    type: "type"
+  }
+  attr {
+    name: "preferred_shard"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "RestoreV2"
+  input_arg {
+    name: "prefix"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "tensor_names"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "shape_and_slices"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "tensors"
+    type_list_attr: "dtypes"
+  }
+  attr {
+    name: "dtypes"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingADAMParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "momenta"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "velocities"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingADAMParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "momenta"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "velocities"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingAdadeltaParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "updates"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "updates"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingAdagradParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingAdagradParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingCenteredRMSPropParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "ms"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "mom"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "mg"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingFTRLParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "linears"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingFTRLParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "linears"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingMDLAdagradLightParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "weights"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "benefits"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+  attr {
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_INT64
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingMomentumParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "momenta"
+    type: DT_FLOAT
   }
   attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
     }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "use_locking"
-    type: "bool"
+    name: "table_name"
+    type: "string"
     default_value {
-      b: false
+      s: ""
     }
   }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
   is_stateful: true
 }
 op {
-  name: "ResourceStridedSliceAssign"
-  input_arg {
-    name: "ref"
-    type: DT_RESOURCE
+  name: "RetrieveTPUEmbeddingMomentumParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "begin"
-    type_attr: "Index"
+  output_arg {
+    name: "momenta"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "end"
-    type_attr: "Index"
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "strides"
-    type_attr: "Index"
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
-  input_arg {
-    name: "value"
-    type_attr: "T"
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "Index"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingProximalAdagradParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "begin_mask"
+    name: "table_id"
     type: "int"
     default_value {
-      i: 0
+      i: -1
     }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "end_mask"
-    type: "int"
+    name: "table_name"
+    type: "string"
     default_value {
-      i: 0
+      s: ""
     }
   }
   attr {
-    name: "ellipsis_mask"
+    name: "num_shards"
     type: "int"
-    default_value {
-      i: 0
-    }
   }
   attr {
-    name: "new_axis_mask"
+    name: "shard_id"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "accumulators"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
     type: "int"
     default_value {
-      i: 0
+      i: -1
     }
+    has_minimum: true
+    minimum: -1
   }
   attr {
-    name: "shrink_axis_mask"
-    type: "int"
+    name: "table_name"
+    type: "string"
     default_value {
-      i: 0
+      s: ""
     }
   }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
+  }
   is_stateful: true
 }
 op {
-  name: "Restore"
-  input_arg {
-    name: "file_pattern"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "tensor_name"
-    type: DT_STRING
+  name: "RetrieveTPUEmbeddingRMSPropParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "tensor"
-    type_attr: "dt"
+    name: "ms"
+    type: DT_FLOAT
   }
-  attr {
-    name: "dt"
-    type: "type"
+  output_arg {
+    name: "mom"
+    type: DT_FLOAT
   }
   attr {
-    name: "preferred_shard"
+    name: "table_id"
     type: "int"
     default_value {
       i: -1
     }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "RestoreSlice"
-  input_arg {
-    name: "file_pattern"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "tensor_name"
-    type: DT_STRING
+  name: "RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "shape_and_slice"
-    type: DT_STRING
+  output_arg {
+    name: "ms"
+    type: DT_FLOAT
   }
   output_arg {
-    name: "tensor"
-    type_attr: "dt"
+    name: "mom"
+    type: DT_FLOAT
   }
-  attr {
-    name: "dt"
-    type: "type"
+  output_arg {
+    name: "gradient_accumulators"
+    type: DT_FLOAT
   }
   attr {
-    name: "preferred_shard"
+    name: "table_id"
     type: "int"
     default_value {
       i: -1
     }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "num_shards"
+    type: "int"
+  }
+  attr {
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
 op {
-  name: "RestoreV2"
-  input_arg {
-    name: "prefix"
-    type: DT_STRING
+  name: "RetrieveTPUEmbeddingStochasticGradientDescentParameters"
+  output_arg {
+    name: "parameters"
+    type: DT_FLOAT
   }
-  input_arg {
-    name: "tensor_names"
-    type: DT_STRING
+  attr {
+    name: "table_id"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
   }
-  input_arg {
-    name: "shape_and_slices"
-    type: DT_STRING
+  attr {
+    name: "table_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
-  output_arg {
-    name: "tensors"
-    type_list_attr: "dtypes"
+  attr {
+    name: "num_shards"
+    type: "int"
   }
   attr {
-    name: "dtypes"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "shard_id"
+    type: "int"
   }
   is_stateful: true
 }
@@ -28255,6 +32244,93 @@ op {
     }
   }
 }
+op {
+  name: "ScaleAndTranslate"
+  input_arg {
+    name: "images"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "scale"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "translation"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "resized_images"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "kernel_type"
+    type: "string"
+    default_value {
+      s: "lanczos3"
+    }
+  }
+}
+op {
+  name: "ScaleAndTranslateGrad"
+  input_arg {
+    name: "grads"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "original_image"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "scale"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "translation"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "kernel_type"
+    type: "string"
+    default_value {
+      s: "lanczos3"
+    }
+  }
+}
 op {
   name: "ScatterAdd"
   input_arg {
@@ -29460,6 +33536,7 @@ op {
       list {
         type: DT_DOUBLE
         type: DT_FLOAT
+        type: DT_HALF
       }
     }
   }
@@ -29496,6 +33573,7 @@ op {
       list {
         type: DT_DOUBLE
         type: DT_FLOAT
+        type: DT_HALF
         type: DT_COMPLEX64
         type: DT_COMPLEX128
       }
@@ -29552,6 +33630,38 @@ op {
     }
   }
 }
+op {
+  name: "SendTPUEmbeddingGradients"
+  input_arg {
+    name: "inputs"
+    type: DT_FLOAT
+    number_attr: "N"
+  }
+  input_arg {
+    name: "learning_rates"
+    type: DT_FLOAT
+    number_attr: "NN"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "NN"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "config"
+    type: "string"
+  }
+  is_stateful: true
+}
 op {
   name: "SerializeIterator"
   input_arg {
@@ -29756,6 +33866,37 @@ op {
     }
   }
 }
+op {
+  name: "ShardDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "num_shards"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "index"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ShardedFilename"
   input_arg {
@@ -29871,6 +34012,10 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ShutdownDistributedTPU"
+  is_stateful: true
+}
 op {
   name: "Sigmoid"
   input_arg {
@@ -33846,6 +37991,70 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "StatefulStandardNormal"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "shape"
+    type_attr: "shape_dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    name: "shape_dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "StatefulStandardNormalV2"
+  input_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "algorithm"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "shape"
+    type_attr: "shape_dtype"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    name: "shape_dtype"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "StatelessIf"
   input_arg {
@@ -34914,6 +39123,7 @@ op {
       list {
         type: DT_DOUBLE
         type: DT_FLOAT
+        type: DT_HALF
         type: DT_COMPLEX64
         type: DT_COMPLEX128
       }
@@ -35018,40 +39228,333 @@ op {
       s: ""
     }
   }
-  deprecation {
-    version: 26
-    explanation: "Use TFRecordReaderV2"
+  deprecation {
+    version: 26
+    explanation: "Use TFRecordReaderV2"
+  }
+  is_stateful: true
+}
+op {
+  name: "TFRecordReaderV2"
+  output_arg {
+    name: "reader_handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "compression_type"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "TPUCompilationResult"
+  output_arg {
+    name: "output"
+    type: DT_STRING
+  }
+}
+op {
+  name: "TPUEmbeddingActivations"
+  input_arg {
+    name: "embedding_variable"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "sliced_activations"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "output"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "table_id"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "lookup_id"
+    type: "int"
+    has_minimum: true
+  }
+}
+op {
+  name: "TPUOrdinalSelector"
+  output_arg {
+    name: "device_ordinals"
+    type: DT_INT32
   }
   is_stateful: true
 }
 op {
-  name: "TFRecordReaderV2"
+  name: "TPUPartitionedCall"
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  input_arg {
+    name: "device_ordinal"
+    type: DT_INT32
+  }
   output_arg {
-    name: "reader_handle"
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+}
+op {
+  name: "TPUReplicate"
+  input_arg {
+    name: "inputs"
+    type_list_attr: "Tinputs"
+  }
+  input_arg {
+    name: "broadcast_inputs"
+    type_list_attr: "Tbroadcast_inputs"
+  }
+  input_arg {
+    name: "variables"
     type: DT_RESOURCE
+    number_attr: "NumVariables"
+  }
+  input_arg {
+    name: "guaranteed_constants"
+    type_list_attr: "Tguaranteed_constants"
+  }
+  output_arg {
+    name: "outputs"
+    type_list_attr: "output_types"
   }
   attr {
-    name: "container"
+    name: "computation"
+    type: "func"
+  }
+  attr {
+    name: "num_replicas"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_cores_per_replica"
+    type: "int"
+    default_value {
+      i: 1
+    }
+  }
+  attr {
+    name: "topology"
     type: "string"
     default_value {
       s: ""
     }
   }
   attr {
-    name: "shared_name"
+    name: "use_tpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "device_assignment"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "host_compute_core"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "Tinputs"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tbroadcast_inputs"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "NumVariables"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "Tguaranteed_constants"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "padding_map"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "step_marker_location"
     type: "string"
     default_value {
-      s: ""
+      s: "STEP_MARK_AT_ENTRY"
     }
   }
+  is_stateful: true
+}
+op {
+  name: "TPUReplicateMetadata"
   attr {
-    name: "compression_type"
+    name: "num_replicas"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "num_cores_per_replica"
+    type: "int"
+    default_value {
+      i: 1
+    }
+  }
+  attr {
+    name: "topology"
     type: "string"
     default_value {
       s: ""
     }
   }
-  is_stateful: true
+  attr {
+    name: "use_tpu"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "device_assignment"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "computation_shape"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "host_compute_core"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "padding_map"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "step_marker_location"
+    type: "string"
+    default_value {
+      s: "STEP_MARK_AT_ENTRY"
+    }
+  }
+}
+op {
+  name: "TPUReplicatedInput"
+  input_arg {
+    name: "inputs"
+    type_attr: "T"
+    number_attr: "N"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "TPUReplicatedOutput"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "outputs"
+    type_attr: "T"
+    number_attr: "num_replicas"
+  }
+  attr {
+    name: "num_replicas"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
 }
 op {
   name: "TakeDataset"
@@ -36333,6 +40836,15 @@ op {
     name: "element_dtype"
     type: "type"
   }
+  attr {
+    name: "element_shape"
+    type: "shape"
+    default_value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
 }
 op {
   name: "TensorListConcatLists"
@@ -36353,6 +40865,43 @@ op {
     type: "type"
   }
 }
+op {
+  name: "TensorListConcatV2"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "element_shape"
+    type_attr: "shape_type"
+  }
+  input_arg {
+    name: "leading_dims"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  output_arg {
+    name: "lengths"
+    type: DT_INT64
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "TensorListElementShape"
   input_arg {
@@ -36413,6 +40962,10 @@ op {
     name: "indices"
     type: DT_INT32
   }
+  input_arg {
+    name: "element_shape"
+    type: DT_INT32
+  }
   output_arg {
     name: "values"
     type_attr: "element_dtype"
@@ -36432,6 +40985,10 @@ op {
     name: "index"
     type: DT_INT32
   }
+  input_arg {
+    name: "element_shape"
+    type: DT_INT32
+  }
   output_arg {
     name: "item"
     type_attr: "element_dtype"
@@ -36458,6 +41015,10 @@ op {
     name: "input_handle"
     type: DT_VARIANT
   }
+  input_arg {
+    name: "element_shape"
+    type: DT_INT32
+  }
   output_arg {
     name: "output_handle"
     type: DT_VARIANT
@@ -36538,6 +41099,22 @@ op {
     }
   }
 }
+op {
+  name: "TensorListResize"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+  is_stateful: true
+}
 op {
   name: "TensorListScatter"
   input_arg {
@@ -36571,6 +41148,66 @@ op {
     }
   }
 }
+op {
+  name: "TensorListScatterIntoExistingList"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
+op {
+  name: "TensorListScatterV2"
+  input_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "element_shape"
+    type_attr: "shape_type"
+  }
+  input_arg {
+    name: "num_elements"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "TensorListSetItem"
   input_arg {
@@ -36633,6 +41270,10 @@ op {
     name: "input_handle"
     type: DT_VARIANT
   }
+  input_arg {
+    name: "element_shape"
+    type: DT_INT32
+  }
   output_arg {
     name: "tensor"
     type_attr: "element_dtype"
@@ -37163,6 +41804,33 @@ op {
     }
   }
 }
+op {
+  name: "TridiagonalSolve"
+  input_arg {
+    name: "diagonals"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rhs"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "TruncateDiv"
   input_arg {
@@ -37406,6 +42074,53 @@ op {
     type: "type"
   }
 }
+op {
+  name: "UnicodeDecode"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "row_splits"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "char_values"
+    type: DT_INT32
+  }
+  attr {
+    name: "input_encoding"
+    type: "string"
+  }
+  attr {
+    name: "errors"
+    type: "string"
+    default_value {
+      s: "replace"
+    }
+    allowed_values {
+      list {
+        s: "strict"
+        s: "replace"
+        s: "ignore"
+      }
+    }
+  }
+  attr {
+    name: "replacement_char"
+    type: "int"
+    default_value {
+      i: 65533
+    }
+  }
+  attr {
+    name: "replace_control_characters"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "UnicodeDecodeWithOffsets"
   input_arg {
@@ -38396,6 +43111,13 @@ op {
       }
     }
   }
+  attr {
+    name: "parallel_iterations"
+    type: "int"
+    default_value {
+      i: 10
+    }
+  }
   is_stateful: true
 }
 op {
@@ -38482,6 +43204,18 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "WorkerHeartbeat"
+  input_arg {
+    name: "request"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "response"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
 op {
   name: "WrapDatasetVariant"
   input_arg {
diff --git a/tensorflow/core/ops/parsing_ops.cc b/tensorflow/core/ops/parsing_ops.cc
index eff453241d47c55750b9662e13b8755e2d3a42b9..169076a6f673e4e23a874e6f369575f07fbd5168 100644
--- a/tensorflow/core/ops/parsing_ops.cc
+++ b/tensorflow/core/ops/parsing_ops.cc
@@ -26,7 +26,10 @@ using shape_inference::ShapeHandle;
 REGISTER_OP("DecodeRaw")
     .Input("bytes: string")
     .Output("output: out_type")
-    .Attr("out_type: {half,float,double,int32,uint16,uint8,int16,int8,int64}")
+    .Attr(
+        "out_type: "
+        "{half,float,double,int32,uint16,uint8,int16,int8,int64,complex64,"
+        "complex128}")
     .Attr("little_endian: bool = true")
     .SetShapeFn([](InferenceContext* c) {
       // Note: last dimension is data dependent.
diff --git a/tensorflow/core/ops/resource_variable_ops.cc b/tensorflow/core/ops/resource_variable_ops.cc
index 65bdde375bf07f8a43d682dd6ff58bc89ef80f68..f54ed52ea295c296e184bd333c79fc7d31d4029c 100644
--- a/tensorflow/core/ops/resource_variable_ops.cc
+++ b/tensorflow/core/ops/resource_variable_ops.cc
@@ -29,21 +29,20 @@ namespace tensorflow {
 
 namespace {
 
-Status ValidateVariableResourceHandle(InferenceContext* c,
-                                      ShapeAndType* shape_and_type) {
+Status ValidateVariableResourceHandle(
+    InferenceContext* c, std::vector<ShapeAndType>* shape_and_type) {
   auto* handle_data = c->input_handle_shapes_and_types(0);
   if (handle_data == nullptr || handle_data->empty()) {
-    shape_and_type->shape = c->UnknownShape();
-    shape_and_type->dtype = DT_INVALID;
+    shape_and_type->emplace_back(c->UnknownShape(), DT_INVALID);
   } else {
-    *shape_and_type = (*handle_data)[0];
+    *shape_and_type = *handle_data;
     DataType value_dtype;
     TF_RETURN_IF_ERROR(c->GetAttr("dtype", &value_dtype));
-    if (shape_and_type->dtype != value_dtype) {
+    if (shape_and_type->at(0).dtype != value_dtype) {
       return errors::InvalidArgument(
           "Trying to read variable with wrong dtype. "
           "Expected ",
-          DataTypeString(shape_and_type->dtype), " got ",
+          DataTypeString(shape_and_type->at(0).dtype), " got ",
           DataTypeString(value_dtype));
     }
   }
@@ -51,9 +50,15 @@ Status ValidateVariableResourceHandle(InferenceContext* c,
 }
 
 Status ReadVariableShapeFn(InferenceContext* c) {
-  ShapeAndType shape_and_type;
+  std::vector<ShapeAndType> shape_and_type;
   TF_RETURN_IF_ERROR(ValidateVariableResourceHandle(c, &shape_and_type));
-  c->set_output(0, shape_and_type.shape);
+  c->set_output(0, shape_and_type[0].shape);
+  if (shape_and_type[0].dtype == DT_VARIANT && shape_and_type.size() > 1) {
+    std::vector<ShapeAndType> variant_shape_and_type;
+    std::copy(shape_and_type.begin() + 1, shape_and_type.end(),
+              std::back_inserter(variant_shape_and_type));
+    c->set_output_handle_shapes_and_types(0, variant_shape_and_type);
+  }
   return Status::OK();
 }
 
@@ -180,13 +185,27 @@ REGISTER_OP("DestroyResourceOp")
     .SetShapeFn(shape_inference::NoOutputs);
 
 Status CreateAssignShapeFn(InferenceContext* c) {
-  ShapeAndType handle_shape_and_type;
+  std::vector<ShapeAndType> handle_shape_and_type;
   TF_RETURN_IF_ERROR(ValidateVariableResourceHandle(c, &handle_shape_and_type));
 
   ShapeHandle value_shape = c->input(1);
   ShapeHandle unused;
   TF_RETURN_IF_ERROR(
-      c->Merge(handle_shape_and_type.shape, value_shape, &unused));
+      c->Merge(handle_shape_and_type[0].shape, value_shape, &unused));
+
+  if (handle_shape_and_type[0].dtype == DT_VARIANT &&
+      handle_shape_and_type.size() > 1 &&
+      c->input_handle_shapes_and_types(1) != nullptr) {
+    auto* value_handle_shape_and_type = c->input_handle_shapes_and_types(1);
+    if (value_handle_shape_and_type->size() !=
+        handle_shape_and_type.size() - 1) {
+      return errors::InvalidArgument(
+          "Incompatible handle variant shape_and_type size and input "
+          "shape_and_type size: ",
+          handle_shape_and_type.size() - 1, " vs. ",
+          value_handle_shape_and_type->size());
+    }
+  }
   return Status::OK();
 }
 
@@ -240,29 +259,37 @@ REGISTER_OP("ResourceGather")
     .Attr("dtype: type")
     .Attr("Tindices: {int32,int64}")
     .SetShapeFn([](InferenceContext* c) {
-      ShapeAndType handle_shape_and_type;
+      std::vector<ShapeAndType> handle_shape_and_type;
       TF_RETURN_IF_ERROR(
           ValidateVariableResourceHandle(c, &handle_shape_and_type));
 
       ShapeHandle unused;
       TF_RETURN_IF_ERROR(
-          c->WithRankAtLeast(handle_shape_and_type.shape, 1, &unused));
+          c->WithRankAtLeast(handle_shape_and_type[0].shape, 1, &unused));
       ShapeHandle params_subshape;
       TF_RETURN_IF_ERROR(
-          c->Subshape(handle_shape_and_type.shape, 1, &params_subshape));
+          c->Subshape(handle_shape_and_type[0].shape, 1, &params_subshape));
       ShapeHandle indices_shape = c->input(1);
       ShapeHandle out;
       TF_RETURN_IF_ERROR(c->Concatenate(indices_shape, params_subshape, &out));
       c->set_output(0, out);
+      if (handle_shape_and_type[0].dtype == DT_VARIANT &&
+          !handle_shape_and_type.empty()) {
+        std::vector<ShapeAndType> variant_shape_and_type;
+        std::copy(handle_shape_and_type.begin() + 1,
+                  handle_shape_and_type.end(),
+                  std::back_inserter(variant_shape_and_type));
+        c->set_output_handle_shapes_and_types(0, variant_shape_and_type);
+      }
       return Status::OK();
     });
 
 namespace {
 
 Status ResourceScatterUpdateShape(InferenceContext* c) {
-  ShapeAndType handle_shape_and_type;
+  std::vector<ShapeAndType> handle_shape_and_type;
   TF_RETURN_IF_ERROR(ValidateVariableResourceHandle(c, &handle_shape_and_type));
-  ShapeHandle var_shape = handle_shape_and_type.shape;
+  ShapeHandle var_shape = handle_shape_and_type[0].shape;
   ShapeHandle indices_shape = c->input(1);
 
   ShapeHandle unused_updates_shape;
@@ -274,6 +301,19 @@ Status ResourceScatterUpdateShape(InferenceContext* c) {
       InferenceContext::Rank(c->input(2)) == 0
           ? Status::OK()
           : c->Merge(c->input(2), concat, &unused_updates_shape));
+  if (handle_shape_and_type[0].dtype == DT_VARIANT &&
+      handle_shape_and_type.size() > 1 &&
+      c->input_handle_shapes_and_types(2) != nullptr) {
+    auto* value_handle_shape_and_type = c->input_handle_shapes_and_types(2);
+    if (value_handle_shape_and_type->size() !=
+        handle_shape_and_type.size() - 1) {
+      return errors::InvalidArgument(
+          "Incompatible handle variant shape_and_type size and input "
+          "shape_and_type size: ",
+          handle_shape_and_type.size() - 1, " vs. ",
+          value_handle_shape_and_type->size());
+    }
+  }
   return Status::OK();
 }
 
diff --git a/tensorflow/core/ops/sendrecv_ops.cc b/tensorflow/core/ops/sendrecv_ops.cc
index 7d0fda2f87fc14804486a0edcd35f221c1045917..e84a4796c1f14d79e539b5c16f7e6e6b89421abd 100644
--- a/tensorflow/core/ops/sendrecv_ops.cc
+++ b/tensorflow/core/ops/sendrecv_ops.cc
@@ -106,8 +106,8 @@ REGISTER_OP("_HostRecv")
     .Doc(R"doc(
 Receives the named tensor from send_device on recv_device.
 
-_HostRecv requires its input on host memory whereas _Recv requires its
-input on device memory.
+_HostRecv produces its output on host memory whereas _Recv produces its
+output on device memory.
 
 tensor: The tensor to receive.
 tensor_name: The name of the tensor to receive.
diff --git a/tensorflow/core/ops/state_ops.cc b/tensorflow/core/ops/state_ops.cc
index aa975cb77bafb3b31f0d612d0f662cef0bde06f2..d2bf033461ebdc99889bae5357704205e6172501 100644
--- a/tensorflow/core/ops/state_ops.cc
+++ b/tensorflow/core/ops/state_ops.cc
@@ -231,6 +231,15 @@ REGISTER_OP("ResourceScatterNdAdd")
     .Attr("use_locking: bool = true")
     .SetShapeFn(shape_inference::ScatterNdUpdateShape);
 
+REGISTER_OP("ResourceScatterNdSub")
+    .Input("ref: resource")
+    .Input("indices: Tindices")
+    .Input("updates: T")
+    .Attr("T: type")
+    .Attr("Tindices: {int32, int64}")
+    .Attr("use_locking: bool = true")
+    .SetShapeFn(shape_inference::ScatterNdUpdateShape);
+
 REGISTER_OP("ScatterNdAdd")
     .Input("ref: Ref(T)")
     .Input("indices: Tindices")
diff --git a/tensorflow/core/ops/stateful_random_ops.cc b/tensorflow/core/ops/stateful_random_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2162107b7e5dba9a26400b494957afb038713240
--- /dev/null
+++ b/tensorflow/core/ops/stateful_random_ops.cc
@@ -0,0 +1,60 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+Status StatefulRandomShape(shape_inference::InferenceContext* c) {
+  using shape_inference::ShapeHandle;
+
+  // Check algorithm shape
+  ShapeHandle unused;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+
+  // Set output shape
+  ShapeHandle out;
+  TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(2, &out));
+  c->set_output(0, out);
+  return Status::OK();
+}
+
+REGISTER_OP("StatefulStandardNormalV2")
+    .Input("resource: resource")
+    .Input("algorithm: int64")
+    .Input("shape: shape_dtype")
+    .Output("output: dtype")
+    .Attr("dtype : type = DT_FLOAT")
+    .Attr("shape_dtype : type = DT_INT64")
+    .SetShapeFn(StatefulRandomShape);
+
+// Register the old 'StatefulStandardNormal' op
+REGISTER_OP("StatefulStandardNormal")
+    .Input("resource: resource")
+    .Input("shape: shape_dtype")
+    .Output("output: dtype")
+    .Attr("dtype : type = DT_FLOAT")
+    .Attr("shape_dtype : type = DT_INT64")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      using shape_inference::ShapeHandle;
+      // Set output shape
+      ShapeHandle out;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &out));
+      c->set_output(0, out);
+      return Status::OK();
+    });
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc
index 8ea74f1d43e5baa3f14398e6ea17c19466ea2973..d012ce67fd0c6e8ba0b29fee8da6407f3927ef70 100644
--- a/tensorflow/core/ops/string_ops.cc
+++ b/tensorflow/core/ops/string_ops.cc
@@ -296,6 +296,27 @@ REGISTER_OP("UnicodeTranscode")
     .Attr("replace_control_characters: bool = false")
     .SetShapeFn(shape_inference::UnchangedShape);
 
+REGISTER_OP("UnicodeDecode")
+    .Input("input: string")
+    .Output("row_splits: int64")
+    .Output("char_values: int32")
+    .Attr("input_encoding: string")
+    .Attr("errors: {'strict', 'replace', 'ignore'} = 'replace'")
+    .Attr("replacement_char: int = 65533")  // 0xFFFD unicode replacement char
+    .Attr("replace_control_characters: bool = false")
+    .SetShapeFn([](InferenceContext* c) {
+      // row_splits.shape == [input.size() + 1]
+      DimensionHandle num_row_splits;
+      DimensionHandle input_size = c->NumElements(c->input(0));
+      TF_RETURN_IF_ERROR(c->Add(input_size, 1, &num_row_splits));
+      c->set_output(0, c->Vector(num_row_splits));
+
+      // char_values.shape == [num_chars]
+      DimensionHandle num_chars = c->UnknownDim();
+      c->set_output(1, c->Vector(num_chars));
+      return Status::OK();
+    });
+
 REGISTER_OP("UnicodeDecodeWithOffsets")
     .Input("input: string")
     .Output("row_splits: int64")
diff --git a/tensorflow/contrib/tpu/ops/tpu_configuration_ops.cc b/tensorflow/core/ops/tpu_configuration_ops.cc
similarity index 92%
rename from tensorflow/contrib/tpu/ops/tpu_configuration_ops.cc
rename to tensorflow/core/ops/tpu_configuration_ops.cc
index d5600eef4a9dc69fcfd931a083f86d7941ba8fb4..febb25096fdbfa006a5353c9719c1e7ce1852504 100644
--- a/tensorflow/contrib/tpu/ops/tpu_configuration_ops.cc
+++ b/tensorflow/core/ops/tpu_configuration_ops.cc
@@ -193,25 +193,10 @@ REGISTER_OP("ConfigureDistributedTPU")
     .Attr("tpu_embedding_config: string = ''")
     .Attr("is_global_init: bool = false")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"doc(
-An op that sets up the centralized structures for a distributed TPU
-system.
-
-topology: A serialized tensorflow.tpu.TopologyProto that describes the TPU
-topology.
-tpu_embedding_config: Serialized tensorflow.tpu.TPUEmbeddingConfiguration that
-describes the embedding lookups of the program.
-embedding_config: Reserved. Do not use.
-is_global_init: Reserved. Do not use.
-)doc");
+    .SetShapeFn(shape_inference::UnknownShape);
 
 REGISTER_OP("ShutdownDistributedTPU")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"doc(
-An op that shuts down a running distributed TPU system. The Op returns
-an error if no system is running.
-)doc");
+    .SetShapeFn(shape_inference::UnknownShape);
 
 }  // end namespace tensorflow
diff --git a/tensorflow/contrib/tpu/ops/cross_replica_ops.cc b/tensorflow/core/ops/tpu_cross_replica_ops.cc
similarity index 53%
rename from tensorflow/contrib/tpu/ops/cross_replica_ops.cc
rename to tensorflow/core/ops/tpu_cross_replica_ops.cc
index 87e3a5946c20be8e2c7a24e198d1fb94335a6b86..c26b49eb34b116b5bab5aa1e0154724318c3dbb9 100644
--- a/tensorflow/contrib/tpu/ops/cross_replica_ops.cc
+++ b/tensorflow/core/ops/tpu_cross_replica_ops.cc
@@ -26,7 +26,7 @@ REGISTER_OP("AllToAll")
     .Input("input: T")
     .Input("group_assignment: int32")
     .Output("output: T")
-    .Attr("T: {bfloat16, float}")
+    .Attr("T: {numbertype, bool}")
     .Attr("concat_dimension: int")
     .Attr("split_dimension: int")
     .Attr("split_count: int")
@@ -70,79 +70,19 @@ REGISTER_OP("AllToAll")
 
       c->set_output(0, c->MakeShape(dims));
       return Status::OK();
-    })
-    .Doc(R"doc(
-An Op to exchange data across TPU replicas. On each replica, the input is
-split into `split_count` blocks along `split_dimension` and send to the other
-replicas given group_assignment. After receiving `split_count` - 1 blocks from
-other replicas, we concatenate the blocks along `concat_dimension` as the
-output.
-
-For example, suppose there are 2 TPU replicas:
-replica 0 receives input: `[[A, B]]`
-replica 1 receives input: `[[C, D]]`
-
-group_assignment=`[[0, 1]]`
-concat_dimension=0
-split_dimension=1
-split_count=2
-
-replica 0's output: `[[A], [C]]`
-replica 1's output: `[[B], [D]]`
-
-input: The local input to the sum.
-group_assignment: An int32 tensor with shape
-  [num_groups, num_replicas_per_group]. `group_assignment[i]` represents the
-  replica ids in the ith subgroup.
-concat_dimension: The dimension number to concatenate.
-split_dimension: The dimension number to split.
-split_count: The number of splits, this number must equal to the sub-group
-  size(group_assignment.get_shape()[1])
-output: The exchanged result.
-T: The type of elements to be exchanged.
-)doc");
+    });
 
 REGISTER_OP("CrossReplicaSum")
     .Input("input: T")
     .Input("group_assignment: int32")
     .Output("output: T")
-    .Attr("T: {bfloat16, float}")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-An Op to sum inputs across replicated TPU instances. Each instance supplies its
-own input.
-
-For example, suppose there are 8 TPU instances: `[A, B, C, D, E, F, G, H]`.
-Passing group_assignment=`[[0,2,4,6],[1,3,5,7]]` sets `A, C, E, G` as group 0,
-and `B, D, F, H` as group 1. Thus we get the outputs:
-`[A+C+E+G, B+D+F+H, A+C+E+G, B+D+F+H, A+C+E+G, B+D+F+H, A+C+E+G, B+D+F+H]`.
-
-input: The local input to the sum.
-group_assignment: An int32 tensor with shape
-  [num_groups, num_replicas_per_group]. `group_assignment[i]` represents the
-  replica ids in the ith subgroup.
-output: The sum of all the distributed inputs.
-T: The type of elements to be summed.
-)doc");
+    .Attr("T: {bfloat16, float, int32, uint32}")
+    .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("CollectivePermute")
     .Input("input: T")
     .Input("source_target_pairs: int32")
     .Output("output: T")
     .Attr("T: numbertype")
-    .SetShapeFn(shape_inference::UnchangedShape)
-    .Doc(R"doc(
-An Op to permute tensors across replicated TPU instances. Each instance
-supplies its own input.
-
-For example, suppose there are 4 TPU instances: `[A, B, C, D]`. Passing
-source_target_pairs=`[[0,1],[1,2],[2,3],[3,0]]` gets the outputs:
-`[D, A, B, C]`.
-
-input: The local input to be permuted. Currently only supports float and
-  bfloat16.
-source_target_pairs: A tensor with shape [num_pairs, 2].
-output: The permuted input.
-T: The type of elements to be exchanged.
-)doc");
+    .SetShapeFn(shape_inference::UnchangedShape);
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc b/tensorflow/core/ops/tpu_embedding_ops.cc
similarity index 62%
rename from tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc
rename to tensorflow/core/ops/tpu_embedding_ops.cc
index 0ef29bdf734467aa9dee5c157bc8d8a7e0a85f13..79ebc09adc2f8dc65336342d5b4843f35599cdaf 100644
--- a/tensorflow/contrib/tpu/ops/tpu_embedding_ops.cc
+++ b/tensorflow/core/ops/tpu_embedding_ops.cc
@@ -13,9 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tpu/proto/tpu_embedding_configuration.pb.h"
-#include "tensorflow/contrib/tpu/utils/tpu_embedding_optimization_parameters_utils.h"
-#include "tensorflow/contrib/tpu/utils/tpu_embedding_output_layout_utils.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/op.h"
@@ -23,6 +20,9 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/protobuf/tpu/tpu_embedding_configuration.pb.h"
+#include "tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.h"
+#include "tensorflow/core/tpu/tpu_embedding_output_layout_utils.h"
 
 namespace tensorflow {
 
@@ -37,18 +37,18 @@ namespace tensorflow {
 //    pieces of the TF Graph.
 // 1. Pass this TPUEmbeddingConfiguration to tpu.initialize_system() as the
 //    tpu_embedding_config parameter.
-// 2. Use the TPUEmbeddingLoad Op to initialize the embedding tables in TPU
+// 2. Use the LoadTPUEmbedding Ops to initialize the embedding tables in TPU
 //    memories, sharded across the memories attached to each Host.
-// 3. Use TPUEmbeddingEnqueueSparseBatch to provide the TPU with embedding
+// 3. Use EnqueueTPUEmbeddingSparseBatch to provide the TPU with embedding
 //    indices and aggregation weights.
-// 4. TPUEmbeddingReceiveActivations returns a list of Tensors, containing the
+// 4. RecvTPUEmbeddingActivations returns a list of Tensors, containing the
 //    activations from each table specified in the configuration.
 // 5. TPUEmbeddingActivations, when used with appropriate Python libraries,
 //    enables the automatic differentiation of models that use embeddings.
-// 6. TPUEmbeddingSendGradients takes a list of Tensors (of the same shapes
+// 6. SendTPUEmbeddingGradients takes a list of Tensors (of the same shapes
 //    as those returned by TPUEmbeddingReceiveActivations) containing gradients
 //    to use in updating the embedding tables.
-// 7. Before saving a checkpoint, use the TPUEmbeddingRetrieve Op to update
+// 7. Before saving a checkpoint, use the RetrieveTPUEmbedding Ops to update
 //    the Graph's embedding table Variables from the updated tables in the
 //    TPU memories.
 //
@@ -96,10 +96,6 @@ Status RegisterPerTableLoadOpsForAlgorithmBody(
     if (parameter.has_user_defined() || is_debug_op) {
       auto* arg = op_def->add_input_arg();
       arg->set_name(parameter.name());
-      arg->set_description(
-          strings::StrCat("Value of ", parameter.name(), " used in the ",
-                          GetOptimizationAlgorithmFriendlyName(alg),
-                          " optimization algorithm."));
       arg->set_type(DT_FLOAT);
     }
   }
@@ -127,7 +123,6 @@ Status RegisterPerTableLoadOpsForAlgorithmBody(
     shard_id_attr->set_name("shard_id");
     shard_id_attr->set_type("int");
   }
-  op_def->set_summary("Load embedding parameters for a single table.");
   string parameter_descriptions;
   for (const auto& parameter : state_variable_specs) {
     if (parameter.has_user_defined() || is_debug_op) {
@@ -139,21 +134,6 @@ lookups using the %s optimization algorithm.)",
                        GetOptimizationAlgorithmFriendlyName(alg).c_str());
     }
   }
-  op_def->set_description(strings::Printf(R"doc(
-An op that loads optimization parameters into HBM for embedding. Must be
-preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
-embedding table configuration. For example, this op is used to install
-parameters that are loaded from a checkpoint before a training loop is
-executed.
-%s
-table_name: Name of this table; must match a name in the
-  TPUEmbeddingConfiguration proto (overrides table_id).
-num_shards: Number of shards into which the embedding tables are divided.
-shard_id: Identifier of shard for this operation.
-table_id: Index of this table in the EmbeddingLayerConfiguration proto
-  (deprecated).
-)doc",
-                                          parameter_descriptions.c_str()));
   op_def->set_is_commutative(false);
   op_def->set_is_aggregate(false);
   op_def->set_is_stateful(true);
@@ -233,10 +213,6 @@ Status RegisterPerTableRetrieveOpsForAlgorithmBody(
     if (parameter.has_user_defined() || is_debug_op) {
       auto* arg = op_def->add_output_arg();
       arg->set_name(parameter.name());
-      arg->set_description(
-          strings::StrCat("Parameter ", parameter.name(), " updated by the ",
-                          tpu::GetOptimizationAlgorithmFriendlyName(alg),
-                          " optimization algorithm."));
       arg->set_type(DT_FLOAT);
     }
   }
@@ -264,7 +240,6 @@ Status RegisterPerTableRetrieveOpsForAlgorithmBody(
     shard_id_attr->set_name("shard_id");
     shard_id_attr->set_type("int");
   }
-  op_def->set_summary("Retrieve embedding parameters for a single table.");
   string parameter_descriptions;
   for (const auto& param : state_variable_specs) {
     if (param.has_user_defined() || is_debug_op) {
@@ -276,20 +251,6 @@ parameters from embedding updates using the %s optimization algorithm.)",
                        tpu::GetOptimizationAlgorithmFriendlyName(alg).c_str());
     }
   }
-  op_def->set_description(strings::Printf(R"doc(
-An op that retrieves optimization parameters from embedding to host
-memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
-the correct embedding table configuration. For example, this op is
-used to retrieve updated parameters before saving a checkpoint.
-%s
-table_name: Name of this table; must match a name in the
-  TPUEmbeddingConfiguration proto (overrides table_id).
-num_shards: Number of shards into which the embedding tables are divided.
-shard_id: Identifier of shard for this operation.
-table_id: Index of this table in the EmbeddingLayerConfiguration proto
-  (deprecated).
-)doc",
-                                          parameter_descriptions.c_str()));
   op_def->set_is_commutative(false);
   op_def->set_is_aggregate(false);
   op_def->set_is_stateful(true);
@@ -388,23 +349,7 @@ REGISTER_OP("RecvTPUEmbeddingActivations")
         c->set_output(i, output_shape);
       }
       return Status::OK();
-    })
-    .Doc(R"doc(
-An op that receives embedding activations on the TPU.
-
-The TPU system performs the embedding lookups and aggregations specified by
-the arguments to TPUEmbeddingEnqueue(Integer/Sparse/SparseTensor)Batch. The
-results of these aggregations are visible to the Tensorflow Graph as the
-outputs of a RecvTPUEmbeddingActivations op. This op returns a list containing
-one Tensor of activations per table specified in the model. There can be at
-most one RecvTPUEmbeddingActivations op in the TPU graph.
-
-outputs: A TensorList of embedding activations containing one Tensor per
-    embedding table in the model.
-num_outputs: The number of output activation tensors, equal to the number of
-    embedding tables in the model.
-config: Serialized TPUEmbeddingConfiguration proto.
-)doc");
+    });
 
 REGISTER_OP("TPUEmbeddingActivations")
     .Input("embedding_variable: float32")
@@ -415,23 +360,7 @@ REGISTER_OP("TPUEmbeddingActivations")
     .SetShapeFn([](shape_inference::InferenceContext *c) {
       c->set_output(0, c->input(1));
       return Status::OK();
-    })
-    .Doc(R"doc(
-An op enabling differentiation of TPU Embeddings.
-
-This op simply returns its first input, which is assumed to have been sliced
-from the Tensors returned by TPUEmbeddingDequeueActivations. The presence of this
-op, and its first argument being a trainable Variable, enables automatic
-differentiation of graphs containing embeddings via the TPU Embedding Python
-libraries.
-
-embedding_variable: A trainable variable, enabling optimizers to find this op.
-sliced_activations: The embedding activations Tensor to return.
-table_id: The id of the table in the embedding layer configuration from which
-    these activations were computed.
-lookup_id: Identifier of the set of embedding indices which produced these
-    activations.
-)doc");
+    });
 
 REGISTER_OP("SendTPUEmbeddingGradients")
     .Input("inputs: N * float32")
@@ -453,24 +382,7 @@ REGISTER_OP("SendTPUEmbeddingGradients")
       }
 
       return Status::OK();
-    })
-    .Doc(R"doc(
-An op that performs gradient updates of embedding tables.
-
-The TensorList argument has the same length and shapes as the return value of
-TPUEmbeddingReceiveActivations, but contains gradients of the model's loss
-with respect to the embedding activations. The embedding tables are updated
-from these gradients via the optimizer specified in the configuration given
-to tpu.initialize_system.
-
-inputs: A TensorList of gradients with which to update embedding tables.
-    It contains one tensor per embedding table in the model.
-learning_rates: A list of float32 scalars, one for each embedding table,
-    containing the learning rates for each table when dynamic learning rate is
-    enabled through the OptimizationParameters in TPUEmbeddingConfiguration.
-    When the learning rate is constant, the list should be empty.
-config: Serialized TPUEmbeddingConfiguration proto.
-)doc");
+    });
 
 REGISTER_OP("EnqueueTPUEmbeddingIntegerBatch")
     .Input("batch: N * int32")
@@ -478,19 +390,7 @@ REGISTER_OP("EnqueueTPUEmbeddingIntegerBatch")
     .Attr("N: int >= 1")
     .Attr("device_ordinal: int = -1")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"doc(
-An op that enqueues a list of input batch tensors to TPUEmbedding.
-
-batch: A list of 1D tensors, one for each embedding table, containing the
-    indices into the tables.
-mode_override: A string input that overrides the mode specified in the
-    TPUEmbeddingConfiguration. Supported values are {'unspecified', 'inference',
-    'training', 'backward_pass_only'}. When set to 'unspecified', the mode set
-    in TPUEmbeddingConfiguration is used, otherwise mode_override is used.
-device_ordinal: The TPU device to use. Should be >= 0 and less than the number
-    of TPU cores in the task on which the node is placed.
-)doc");
+    .SetShapeFn(shape_inference::UnknownShape);
 
 REGISTER_OP("EnqueueTPUEmbeddingSparseBatch")
     .Input("sample_indices: N * int32")
@@ -513,41 +413,7 @@ REGISTER_OP("EnqueueTPUEmbeddingSparseBatch")
       }
 
       return Status::OK();
-    })
-    .Doc(R"doc(
-An op that enqueues TPUEmbedding input indices from a SparseTensor.
-
-This Op eases the porting of code that uses embedding_lookup_sparse(),
-although some Python preprocessing of the SparseTensor arguments to
-embedding_lookup_sparse() is required to produce the arguments to this Op,
-since only a single EnqueueTPUEmbeddingSparseBatch Op is allowed per training
-step.
-
-The tensors at corresponding positions in the three input lists
-must have the same shape, i.e. rank 1 with dim_size() equal to the total
-number of lookups into the table described by the corresponding table_id.
-
-sample_indices: A list of rank 1 Tensors specifying the training example and
-    feature to which the corresponding embedding_indices and aggregation_weights
-    values belong. sample_indices[i] must equal b * nf + f, where nf is the
-    number of features from the corresponding table, f is in [0, nf), and
-    b is in [0, batch size).
-embedding_indices: A list of rank 1 Tensors, indices into the embedding tables.
-aggregation_weights: A list of rank 1 Tensors containing per sample -- i.e. per
-    (training example, feature) -- aggregation weights.
-mode_override: A string input that overrides the mode specified in the
-    TPUEmbeddingConfiguration. Supported values are {'unspecified', 'inference',
-    'training', 'backward_pass_only'}. When set to 'unspecified', the mode set
-    in TPUEmbeddingConfiguration is used, otherwise mode_override is used.
-device_ordinal: The TPU device to use. Should be >= 0 and less than the number
-    of TPU cores in the task on which the node is placed.
-combiners: A list of string scalars, one for each embedding table that specify
-    how to normalize the embedding activations after weighted summation.
-    Supported combiners are 'mean', 'sum', or 'sqrtn'. It is invalid to have
-    the sum of the weights be 0 for 'mean' or the sum of the squared weights be
-    0 for 'sqrtn'. If combiners isn't passed, the default is to use 'sum' for
-    all tables.
-)doc");
+    });
 
 REGISTER_OP("EnqueueTPUEmbeddingSparseTensorBatch")
     .Input("sample_indices: N * int32")
@@ -559,44 +425,6 @@ REGISTER_OP("EnqueueTPUEmbeddingSparseTensorBatch")
     .Attr("combiners: list(string) = []")
     .Attr("table_ids: list(int)")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"doc(
-This Op eases the porting of code that uses tf.nn.embedding_lookup_sparse().
-
-sample_indices[i], embedding_indices[i] and aggregation_weights[i] correspond
-to the ith feature. table_ids[i] indicates which embedding table to look up ith
-feature.
-
-The tensors at corresponding positions in the three input lists (sample_indices,
-embedding_indices and aggregation_weights) must have the same shape, i.e. rank 1
-with dim_size() equal to the total number of lookups into the table described by
-the corresponding feature.
-
-sample_indices: A list of rank 1 Tensors specifying the training example to
-    which the corresponding embedding_indices and aggregation_weights values
-    belong. It corresponds to sp_ids.indices[:,0] in  embedding_lookup_sparse().
-embedding_indices: A list of rank 1 Tensors, indices into the embedding tables.
-    It corresponds to sp_ids.values in embedding_lookup_sparse().
-aggregation_weights: A list of rank 1 Tensors containing per training example
-    aggregation weights. It corresponds to sp_weights.values in
-    embedding_lookup_sparse().
-mode_override: A string input that overrides the mode specified in the
-    TPUEmbeddingConfiguration. Supported values are {'unspecified', 'inference',
-    'training', 'backward_pass_only'}. When set to 'unspecified', the mode set
-    in TPUEmbeddingConfiguration is used, otherwise mode_override is used.
-device_ordinal: The TPU device to use. Should be >= 0 and less than the number
-    of TPU cores in the task on which the node is placed.
-combiners: A list of string scalars, one for each embedding table that specify
-    how to normalize the embedding activations after weighted summation.
-    Supported combiners are 'mean', 'sum', or 'sqrtn'. It is invalid to have
-    the sum of the weights be 0 for 'mean' or the sum of the squared weights be
-    0 for 'sqrtn'. If combiners isn't passed, the default is to use 'sum' for
-    all tables.
-table_ids: A list of integers specifying the identifier of the embedding table
-    (offset of TableDescriptor in the TPUEmbeddingConfiguration) to lookup the
-    corresponding input. The ith input is looked up using table_ids[i]. The size
-    of the table_ids list must be equal to that of sample_indices,
-    embedding_indices and aggregation_weights.
-)doc");
+    .SetShapeFn(shape_inference::UnknownShape);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/tpu_functional_ops.cc b/tensorflow/core/ops/tpu_functional_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..aa81e8b24b5e303f5de5d2938b9474fc6b7af6c9
--- /dev/null
+++ b/tensorflow/core/ops/tpu_functional_ops.cc
@@ -0,0 +1,31 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+REGISTER_OP("TPUPartitionedCall")
+    .Input("args: Tin")
+    .Input("device_ordinal: int32")
+    .Output("output: Tout")
+    .Attr("Tin: list(type) >= 0")
+    .Attr("Tout: list(type) >= 0")
+    .Attr("f: func")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tpu/ops/heartbeat_ops.cc b/tensorflow/core/ops/tpu_heartbeat_ops.cc
similarity index 72%
rename from tensorflow/contrib/tpu/ops/heartbeat_ops.cc
rename to tensorflow/core/ops/tpu_heartbeat_ops.cc
index ca0f5bc0e562cd9e27b4c456b53fb9f51f1cb1f8..660aa32c8278b27b307e229d427935f36e81e5f5 100644
--- a/tensorflow/contrib/tpu/ops/heartbeat_ops.cc
+++ b/tensorflow/core/ops/tpu_heartbeat_ops.cc
@@ -23,15 +23,6 @@ REGISTER_OP("WorkerHeartbeat")
     .Input("request: string")
     .Output("response: string")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-Worker heartbeat op.
-
-Heartbeats may be sent periodically to indicate the coordinator is still active,
-to retrieve the current worker status and to expedite shutdown when necessary.
-
-request: A string tensor containing a serialized WorkerHeartbeatRequest
-response: A string tensor containing a serialized WorkerHeartbeatResponse
-)doc");
+    .SetShapeFn(shape_inference::ScalarShape);
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tpu/ops/host_compute_ops.cc b/tensorflow/core/ops/tpu_host_compute_ops.cc
similarity index 100%
rename from tensorflow/contrib/tpu/ops/host_compute_ops.cc
rename to tensorflow/core/ops/tpu_host_compute_ops.cc
diff --git a/tensorflow/contrib/tpu/ops/infeed_ops.cc b/tensorflow/core/ops/tpu_infeed_ops.cc
similarity index 58%
rename from tensorflow/contrib/tpu/ops/infeed_ops.cc
rename to tensorflow/core/ops/tpu_infeed_ops.cc
index efc546f9a6077de9cac5a5acefa3fc7206547fc6..0090b761c48dc8f953a7ff9a211d99ca29a2f210 100644
--- a/tensorflow/contrib/tpu/ops/infeed_ops.cc
+++ b/tensorflow/core/ops/tpu_infeed_ops.cc
@@ -27,50 +27,25 @@ REGISTER_OP("InfeedDequeue")
     .Attr("dtype: type")
     .Attr("shape: shape")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::ExplicitShape)
-    .Doc(R"doc(
-A placeholder op for a value that will be fed into the computation.
-
-output: A tensor that will be provided using the infeed mechanism.
-dtype: The type of elements in the tensor.
-shape: The shape of the tensor.
-)doc");
+    .SetShapeFn(shape_inference::ExplicitShape);
 
 REGISTER_OP("InfeedEnqueue")
     .Input("input: dtype")
     .Attr("dtype: type")
     .Attr("shape: shape = {}")
+    .Attr("layout: list(int) = []")
     .Attr("device_ordinal: int = -1")
     .SetShapeFn(shape_inference::NoOutputs)
-    .SetIsStateful()
-    .Doc(R"doc(
-An op which feeds a single Tensor value into the computation.
-
-input: A tensor that will be provided using the infeed mechanism.
-dtype: The type of elements in the tensor.
-shape: The shape of the tensor.
-device_ordinal: The TPU device to use. This should be -1 when the Op
-is running on a TPU device, and >= 0 when the Op is running on the CPU
-device.
-)doc");
+    .SetIsStateful();
 
 REGISTER_OP("InfeedEnqueueTuple")
     .Input("inputs: dtypes")
     .Attr("dtypes: list(type)")
     .Attr("shapes: list(shape)")
+    .Attr("layouts: list(int) = []")
     .Attr("device_ordinal: int = -1")
     .SetShapeFn(shape_inference::NoOutputs)
-    .SetIsStateful()
-    .Doc(R"doc(
-An op which feeds multiple Tensor values into the computation as an XLA tuple.
-
-inputs: A list of tensors that will be provided using the infeed mechanism.
-dtypes: The element types of each element in `inputs`.
-shapes: The shapes of each tensor in `inputs`.
-device_ordinal: The TPU device to use. This should be -1 when the Op
-is running on a TPU device, and >= 0 when the Op is running on the CPU
-device.
-)doc");
+    .SetIsStateful();
 
 REGISTER_OP("InfeedDequeueTuple")
     .Output("outputs: dtypes")
@@ -86,14 +61,6 @@ REGISTER_OP("InfeedDequeueTuple")
         c->set_output(i, out);
       }
       return Status::OK();
-    })
-    .Doc(R"doc(
-A placeholder op for multiple values that will be fed into the computation
-simultaneously as an XLA tuple.
-
-outputs: A list of tensors that will be provided using the infeed mechanism.
-dtypes: The element types of each element in `outputs`.
-shapes: The shapes of each tensor in `outputs`.
-)doc");
+    });
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/tpu_ordinal_selector_ops.cc b/tensorflow/core/ops/tpu_ordinal_selector_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..72f24c57dbb4be388264da3c15a1e4fa0de9eb1c
--- /dev/null
+++ b/tensorflow/core/ops/tpu_ordinal_selector_ops.cc
@@ -0,0 +1,30 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+REGISTER_OP("TPUOrdinalSelector")
+    .Output("device_ordinals: int32")
+    .SetIsStateful()
+    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
+      c->set_output(0,
+                    c->Vector(shape_inference::InferenceContext::kUnknownDim));
+      return Status::OK();
+    });
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tpu/ops/outfeed_ops.cc b/tensorflow/core/ops/tpu_outfeed_ops.cc
similarity index 59%
rename from tensorflow/contrib/tpu/ops/outfeed_ops.cc
rename to tensorflow/core/ops/tpu_outfeed_ops.cc
index b05c76ca64fbaedc205ab06cc31616787ccc84b8..e170ed05a0cd283f086bd75ac28375f3afa15bae 100644
--- a/tensorflow/contrib/tpu/ops/outfeed_ops.cc
+++ b/tensorflow/core/ops/tpu_outfeed_ops.cc
@@ -26,24 +26,13 @@ REGISTER_OP("OutfeedEnqueue")
     .Input("input: dtype")
     .Attr("dtype: type")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::NoOutputs)
-    .Doc(R"doc(
-An op which emits a single Tensor value from an XLA computation.
-
-input: A tensor that will be inserted into the outfeed queue.
-)doc");
+    .SetShapeFn(shape_inference::NoOutputs);
 
 REGISTER_OP("OutfeedEnqueueTuple")
     .Input("inputs: dtypes")
     .Attr("dtypes: list(type)")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::NoOutputs)
-    .Doc(R"doc(
-An op which emits multiple Tensor values from an XLA computation.
-
-inputs: A list of tensors that will be inserted into the outfeed queue as an
-XLA tuple.
-)doc");
+    .SetShapeFn(shape_inference::NoOutputs);
 
 REGISTER_OP("OutfeedDequeue")
     .Output("output: dtype")
@@ -51,18 +40,7 @@ REGISTER_OP("OutfeedDequeue")
     .Attr("shape: shape")
     .Attr("device_ordinal: int = -1")
     .SetIsStateful()
-    .SetShapeFn(shape_inference::ExplicitShape)
-    .Doc(R"doc(
-Retrieves a single tensor from the computation outfeed.  This operation will
-block indefinitely until data is available.
-
-output: A tensor that will be read from the device outfeed.
-dtype: The type of elements in the tensor.
-shape: The shape of the tensor.
-device_ordinal: The TPU device to use. This should be -1 when the Op
-is running on a TPU device, and >= 0 when the Op is running on the CPU
-device.
-)doc");
+    .SetShapeFn(shape_inference::ExplicitShape);
 
 REGISTER_OP("OutfeedDequeueTuple")
     .Output("outputs: dtypes")
@@ -85,18 +63,6 @@ REGISTER_OP("OutfeedDequeueTuple")
         c->set_output(i, out);
       }
       return Status::OK();
-    })
-    .Doc(R"doc(
-Retrieve multiple values that will be emitted by the computation as an XLA
-tuple.  This operations will block indefinitely until data is available.
-Output `i` corresponds to XLA tuple element `i`.
-
-outputs: A list of tensors that will be read from the outfeed.
-dtypes: The element types of each element in `outputs`.
-shapes: The shapes of each tensor in `outputs`.
-device_ordinal: The TPU device to use. This should be -1 when the Op
-is running on a TPU device, and >= 0 when the Op is running on the CPU
-device.
-)doc");
+    });
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tpu/ops/replication_ops.cc b/tensorflow/core/ops/tpu_replication_ops.cc
similarity index 68%
rename from tensorflow/contrib/tpu/ops/replication_ops.cc
rename to tensorflow/core/ops/tpu_replication_ops.cc
index 285e11d92de7a684ed87974414ec73c274cc7aa5..7c8949260053a6ca7fd02449d9934a02d6d227ea 100644
--- a/tensorflow/contrib/tpu/ops/replication_ops.cc
+++ b/tensorflow/core/ops/tpu_replication_ops.cc
@@ -31,6 +31,8 @@ REGISTER_OP("TPUReplicateMetadata")
     // Deprecated. Use num_cores_per_replica instead.
     .Attr("computation_shape: list(int) = []")
     .Attr("host_compute_core: list(string) = []")
+    .Attr("padding_map: list(string) = []")
+    .Attr("step_marker_location: string = \"STEP_MARK_AT_ENTRY\"")
     .SetShapeFn(shape_inference::UnknownShape);
 
 REGISTER_OP("TPUReplicatedInput")
@@ -68,10 +70,7 @@ REGISTER_OP("TPUReplicatedInput")
         }
       }
       return Status::OK();
-    })
-    .Doc(
-        "Operator that connects N unreplicated inputs to an N-way "
-        "replicated TPU computation.");
+    });
 
 REGISTER_OP("TPUReplicatedOutput")
     .Input("input: T")
@@ -83,10 +82,7 @@ REGISTER_OP("TPUReplicatedOutput")
         c->set_output(i, c->input(0));
       }
       return Status::OK();
-    })
-    .Doc(
-        "Operator that connects the output of an N-way replicated TPU "
-        "computation to N separate outputs.");
+    });
 
 REGISTER_OP("TPUCompilationResult")
     .Output("output: string")
@@ -105,40 +101,13 @@ REGISTER_OP("TPUReplicate")
     .Attr("NumVariables: int >= 0")
     .Attr("Tguaranteed_constants: list(type) >= 0")
     .Attr("output_types: list(type) >= 0")
+    .Attr("padding_map: list(string) = []")
+    .Attr("step_marker_location: string = \"STEP_MARK_AT_ENTRY\"")
     .Input("inputs: Tinputs")
     .Input("broadcast_inputs: Tbroadcast_inputs")
     .Input("variables: NumVariables * resource")
     .Input("guaranteed_constants: Tguaranteed_constants")
     .Output("outputs: output_types")
-    .SetShapeFn(shape_inference::UnknownShape)
-    .Doc(R"doc(
-Runs replicated computations on a distributed TPU system.
-
-computation: a function containing the computation to run.
-num_replicas: the number of replicas of the computation to run.
-num_cores_per_replica: the number of logical cores in each replica.
-topology: A serialized tensorflow.tpu.TopologyProto that describes the TPU
-topology.
-use_tpu: a bool indicating if this computation will run on TPU or CPU/GPU.
-Currently, only supports a default placement (computation is placed on GPU
-if one is available, and on CPU if not).
-device_assignment: a flattened array with shape
-  [replica, num_cores_per_replica, mesh_dimension] that maps the coordinates
-  of logical cores in each replica of a computation to physical coordinates in
-  the TPU topology.
-Tinputs: the types of the arguments to 'computation'.
-inputs: the inputs to 'computation', flattened, in replica-major order.
-Tbroadcast_inputs: the types of the additional arguments to broadcast to all
-  replicas.
-Tguaranteed_constants: the types of the arguments to 'guaranteed_constants'.
-broadcast_inputs: additional arguments to broadcast to all replicas. The
-  broadcast inputs are appended to the per-replica inputs when calling
-  computation.
-guaranteed_constants: arguments which have been guaranteed to not
-change their values during the session lifetime. These contain tensors marked as
-constant using the GuaranteeConstOp.
-output_types: the types of the outputs of 'computation'.
-outputs: the outputs of 'computation'.
-)doc");
+    .SetShapeFn(shape_inference::UnknownShape);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/BUILD b/tensorflow/core/platform/cloud/BUILD
index 647a797b82cf30384f7f48611788a62a952d5627..40a808b661cbff48f1c4198bcfca5a2261292a25 100644
--- a/tensorflow/core/platform/cloud/BUILD
+++ b/tensorflow/core/platform/cloud/BUILD
@@ -151,6 +151,7 @@ cc_library(
         ":retrying_utils",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "@com_google_absl//absl/strings",
         "@jsoncpp_git//:jsoncpp",
     ],
 )
diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc
index c61b68aeebf4823ff70119a0349c318dd3887790..10b57df183d370966338a1c2e6a6ab42aed9b75c 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system.cc
@@ -64,7 +64,7 @@ constexpr int kGetChildrenDefaultPageSize = 1000;
 // The HTTP response code "308 Resume Incomplete".
 constexpr uint64 HTTP_CODE_RESUME_INCOMPLETE = 308;
 // The environment variable that overrides the size of the readahead buffer.
-ABSL_DEPRECATED("Use GCS_BLOCK_SIZE_MB instead.")
+ABSL_DEPRECATED("Use GCS_READ_CACHE_BLOCK_SIZE_MB instead.")
 constexpr char kReadaheadBufferSize[] = "GCS_READAHEAD_BUFFER_SIZE_BYTES";
 // The environment variable that disables the GCS block cache for reads.
 // This is the explicit alternative to setting BLOCK_SIZE or MAX_SIZE to 0, and
@@ -73,11 +73,11 @@ constexpr char kReadCacheDisabled[] = "GCS_READ_CACHE_DISABLED";
 // The environment variable that overrides the block size for aligned reads from
 // GCS. Specified in MB (e.g. "16" = 16 x 1024 x 1024 = 16777216 bytes).
 constexpr char kBlockSize[] = "GCS_READ_CACHE_BLOCK_SIZE_MB";
-constexpr size_t kDefaultBlockSize = 128 * 1024 * 1024;
+constexpr size_t kDefaultBlockSize = 16 * 1024 * 1024;
 // The environment variable that overrides the max size of the LRU cache of
 // blocks read from GCS. Specified in MB.
 constexpr char kMaxCacheSize[] = "GCS_READ_CACHE_MAX_SIZE_MB";
-constexpr size_t kDefaultMaxCacheSize = 2 * kDefaultBlockSize;
+constexpr size_t kDefaultMaxCacheSize = kDefaultBlockSize;
 // The environment variable that overrides the maximum staleness of cached file
 // contents. Once any block of a file reaches this staleness, all cached blocks
 // will be evicted on the next read.
@@ -310,6 +310,11 @@ class GcsRandomAccessFile : public RandomAccessFile {
   GcsRandomAccessFile(const string& filename, ReadFn read_fn)
       : filename_(filename), read_fn_(std::move(read_fn)) {}
 
+  Status Name(StringPiece* result) const override {
+    *result = filename_;
+    return Status::OK();
+  }
+
   /// The implementation of reads with an LRU block cache. Thread safe.
   Status Read(uint64 offset, size_t n, StringPiece* result,
               char* scratch) const override {
@@ -394,6 +399,10 @@ class GcsWritableFile : public WritableFile {
 
   Status Flush() override { return Sync(); }
 
+  Status Name(StringPiece* result) const override {
+    return errors::Unimplemented("GCSWritableFile does not support Name()");
+  }
+
   Status Sync() override {
     TF_RETURN_IF_ERROR(CheckWritable());
     if (!sync_needed_) {
@@ -406,6 +415,14 @@ class GcsWritableFile : public WritableFile {
     return status;
   }
 
+  Status Tell(int64* position) override {
+    *position = outfile_.tellp();
+    if (*position == -1) {
+      return errors::Internal("tellp on the internal temporary file failed");
+    }
+    return Status::OK();
+  }
+
  private:
   /// Copies the current version of the file to GCS.
   ///
@@ -1433,9 +1450,16 @@ Status GcsFileSystem::CreateDir(const string& dirname) {
                      : errors::NotFound("The specified bucket ", dirname,
                                         " was not found.");
   }
+
+  const string dirname_with_slash = MaybeAppendSlash(dirname);
+
+  if (FileExists(dirname_with_slash).ok()) {
+    return errors::AlreadyExists(dirname);
+  }
+
   // Create a zero-length directory marker object.
   std::unique_ptr<WritableFile> file;
-  TF_RETURN_IF_ERROR(NewWritableFile(MaybeAppendSlash(dirname), &file));
+  TF_RETURN_IF_ERROR(NewWritableFile(dirname_with_slash, &file));
   TF_RETURN_IF_ERROR(file->Close());
   return Status::OK();
 }
diff --git a/tensorflow/core/platform/cloud/gcs_file_system_test.cc b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
index 702802b185aa4ce3243e777694d5fd9e77ec7ee8..a998f8e3adf203ad3aad35ab0d3bf4e77fe75936 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system_test.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/platform/cloud/gcs_file_system.h"
 #include <fstream>
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/cloud/http_request_fake.h"
@@ -78,6 +79,10 @@ TEST(GcsFileSystemTest, NewRandomAccessFile_NoBlockCache) {
   std::unique_ptr<RandomAccessFile> file;
   TF_EXPECT_OK(fs.NewRandomAccessFile("gs://bucket/random_access.txt", &file));
 
+  StringPiece filename;
+  TF_EXPECT_OK(file->Name(&filename));
+  EXPECT_EQ(filename, "gs://bucket/random_access.txt");
+
   char scratch[6];
   StringPiece result;
 
@@ -638,6 +643,9 @@ TEST(GcsFileSystemTest, NewWritableFile) {
   std::unique_ptr<WritableFile> wfile;
   TF_EXPECT_OK(fs.NewWritableFile("gs://bucket/path/writeable", &wfile));
   TF_EXPECT_OK(wfile->Append("content1,"));
+  int64 pos;
+  TF_EXPECT_OK(wfile->Tell(&pos));
+  EXPECT_EQ(9, pos);
   TF_EXPECT_OK(wfile->Append("content2"));
   TF_EXPECT_OK(wfile->Flush());
   // Re-reading the file should trigger another HTTP request to GCS.
@@ -2789,6 +2797,12 @@ TEST(GcsFileSystemTest, IsDirectory_BucketNotFound) {
 TEST(GcsFileSystemTest, CreateDir_Folder) {
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest(
+           "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
+           "subpath%2F?fields=size%2Cgeneration%2Cupdated\n"
+           "Auth Token: fake_token\n"
+           "Timeouts: 5 1 10\n",
+           "{}"),
+       new FakeHttpRequest(
            "Uri: https://www.googleapis.com/upload/storage/v1/b/bucket/o?"
            "uploadType=resumable&name=subpath%2F\n"
            "Auth Token: fake_token\n"
@@ -2802,18 +2816,12 @@ TEST(GcsFileSystemTest, CreateDir_Folder) {
                            "Put body: \n",
                            ""),
        new FakeHttpRequest(
-           "Uri: https://www.googleapis.com/upload/storage/v1/b/bucket/o?"
-           "uploadType=resumable&name=subpath%2F\n"
+           "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
+           "subpath%2F?fields=size%2Cgeneration%2Cupdated\n"
            "Auth Token: fake_token\n"
-           "Header X-Upload-Content-Length: 0\n"
-           "Post: yes\n"
            "Timeouts: 5 1 10\n",
-           "", {{"Location", "https://custom/upload/location"}}),
-       new FakeHttpRequest("Uri: https://custom/upload/location\n"
-                           "Auth Token: fake_token\n"
-                           "Timeouts: 5 1 30\n"
-                           "Put body: \n",
-                           "")});
+           strings::StrCat("{\"size\": \"1010\",\"generation\": \"1\","
+                           "\"updated\": \"2016-04-29T23:15:24.896Z\"}"))});
   GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
                    std::unique_ptr<HttpRequest::Factory>(
                        new FakeHttpRequestFactory(&requests)),
@@ -2826,7 +2834,8 @@ TEST(GcsFileSystemTest, CreateDir_Folder) {
                    nullptr /* gcs additional header */);
 
   TF_EXPECT_OK(fs.CreateDir("gs://bucket/subpath"));
-  TF_EXPECT_OK(fs.CreateDir("gs://bucket/subpath/"));
+  EXPECT_EQ(errors::AlreadyExists("gs://bucket/subpath/"),
+            fs.CreateDir("gs://bucket/subpath/"));
 }
 
 TEST(GcsFileSystemTest, CreateDir_Bucket) {
@@ -3146,8 +3155,8 @@ TEST(GcsFileSystemTest, AdditionalRequestHeaderTest) {
 TEST(GcsFileSystemTest, OverrideCacheParameters) {
   // Verify defaults are propagated correctly.
   GcsFileSystem fs1;
-  EXPECT_EQ(128 * 1024 * 1024, fs1.block_size());
-  EXPECT_EQ(2 * fs1.block_size(), fs1.max_bytes());
+  EXPECT_EQ(16 * 1024 * 1024, fs1.block_size());
+  EXPECT_EQ(fs1.block_size(), fs1.max_bytes());
   EXPECT_EQ(0, fs1.max_staleness());
   EXPECT_EQ(120, fs1.timeouts().connect);
   EXPECT_EQ(60, fs1.timeouts().idle);
diff --git a/tensorflow/core/platform/cloud/google_auth_provider.cc b/tensorflow/core/platform/cloud/google_auth_provider.cc
index e15400780af0880caadd2f79b7322f39e406ca2b..e91a9f89757ae6f0009ea20120cd98ab25cd1437 100644
--- a/tensorflow/core/platform/cloud/google_auth_provider.cc
+++ b/tensorflow/core/platform/cloud/google_auth_provider.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #endif
 #include <fstream>
 #include <utility>
+#include "absl/strings/match.h"
 #include "include/json/json.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/io/path.h"
@@ -44,6 +45,11 @@ constexpr char kGoogleAuthTokenForTesting[] = "GOOGLE_AUTH_TOKEN_FOR_TESTING";
 // The environment variable which can override '~/.config/gcloud' if set.
 constexpr char kCloudSdkConfig[] = "CLOUDSDK_CONFIG";
 
+// The environment variable used to skip attempting to fetch GCE credentials:
+// setting this to 'true' (case insensitive) will skip attempting to contact
+// the GCE metadata service.
+constexpr char kNoGceCheck[] = "NO_GCE_CHECK";
+
 // The default path to the gcloud config folder, relative to the home folder.
 constexpr char kGCloudConfigFolder[] = ".config/gcloud/";
 
@@ -146,10 +152,25 @@ Status GoogleAuthProvider::GetToken(string* t) {
   }
 
   auto token_from_files_status = GetTokenFromFiles();
-  auto token_from_gce_status =
-      token_from_files_status.ok() ? Status::OK() : GetTokenFromGce();
+  if (token_from_files_status.ok()) {
+    *t = current_token_;
+    return Status::OK();
+  }
+
+  char* no_gce_check_var = std::getenv(kNoGceCheck);
+  bool skip_gce_check = no_gce_check_var != nullptr &&
+                        absl::EqualsIgnoreCase(no_gce_check_var, "true");
+  Status token_from_gce_status;
+  if (skip_gce_check) {
+    token_from_gce_status =
+        Status(error::CANCELLED,
+               strings::StrCat("GCE check skipped due to presence of $",
+                               kNoGceCheck, " environment variable."));
+  } else {
+    token_from_gce_status = GetTokenFromGce();
+  }
 
-  if (token_from_files_status.ok() || token_from_gce_status.ok()) {
+  if (token_from_gce_status.ok()) {
     *t = current_token_;
     return Status::OK();
   }
@@ -165,8 +186,13 @@ Status GoogleAuthProvider::GetToken(string* t) {
   // so return an empty token instead of failing.
   *t = "";
 
-  // From now on, always return the empty token.
-  expiration_timestamp_sec_ = UINT64_MAX;
+  // We only want to keep returning our empty token if we've tried and failed
+  // the (potentially slow) task of detecting GCE.
+  if (skip_gce_check) {
+    expiration_timestamp_sec_ = 0;
+  } else {
+    expiration_timestamp_sec_ = UINT64_MAX;
+  }
   current_token_ = "";
 
   return Status::OK();
diff --git a/tensorflow/core/platform/cloud/google_auth_provider.h b/tensorflow/core/platform/cloud/google_auth_provider.h
index 3755b124a87fd0003e5a6343b1a07130f5519dd6..4ab816d54c61e99dea1e2db59d4815f5012d5adc 100644
--- a/tensorflow/core/platform/cloud/google_auth_provider.h
+++ b/tensorflow/core/platform/cloud/google_auth_provider.h
@@ -51,7 +51,7 @@ class GoogleAuthProvider : public AuthProvider {
   /// Gets the bearer token from Google Compute Engine environment.
   Status GetTokenFromGce() EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
-  /// Gets the bearer token from the systen env variable, for testing purposes.
+  /// Gets the bearer token from the system env variable, for testing purposes.
   Status GetTokenForTesting() EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
   std::unique_ptr<OAuthClient> oauth_client_;
diff --git a/tensorflow/core/platform/cloud/google_auth_provider_test.cc b/tensorflow/core/platform/cloud/google_auth_provider_test.cc
index ec31c5ee8c11645cf9f8a5659538b46d56ce84ca..d2db59200abb4fd5db5a86d9a653729aa518ee63 100644
--- a/tensorflow/core/platform/cloud/google_auth_provider_test.cc
+++ b/tensorflow/core/platform/cloud/google_auth_provider_test.cc
@@ -69,9 +69,10 @@ class GoogleAuthProviderTest : public ::testing::Test {
   void TearDown() override { ClearEnvVars(); }
 
   void ClearEnvVars() {
-    unsetenv("GOOGLE_APPLICATION_CREDENTIALS");
     unsetenv("CLOUDSDK_CONFIG");
+    unsetenv("GOOGLE_APPLICATION_CREDENTIALS");
     unsetenv("GOOGLE_AUTH_TOKEN_FOR_TESTING");
+    unsetenv("NO_GCE_CHECK");
   }
 };
 
@@ -238,4 +239,31 @@ TEST_F(GoogleAuthProviderTest, NothingAvailable) {
   EXPECT_EQ("", token);
 }
 
+TEST_F(GoogleAuthProviderTest, NoGceCheckEnvironmentVariable) {
+  setenv("NO_GCE_CHECK", "True", 1);
+  auto oauth_client = new FakeOAuthClient;
+
+  FakeEnv env;
+  // If the env var above isn't respected, attempting to fetch a token
+  // from GCE will segfault (as the metadata client is null).
+  GoogleAuthProvider provider(std::unique_ptr<OAuthClient>(oauth_client),
+                              nullptr, &env);
+
+  string token;
+  TF_EXPECT_OK(provider.GetToken(&token));
+  EXPECT_EQ("", token);
+
+  // We confirm that our env var is case insensitive.
+  setenv("NO_GCE_CHECK", "true", 1);
+  TF_EXPECT_OK(provider.GetToken(&token));
+  EXPECT_EQ("", token);
+
+  // We also want to confirm that our empty token has a short expiration set: we
+  // now set a testing token, and confirm that it's returned instead of our
+  // empty token.
+  setenv("GOOGLE_AUTH_TOKEN_FOR_TESTING", "newToken", 1);
+  TF_EXPECT_OK(provider.GetToken(&token));
+  EXPECT_EQ("newToken", token);
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/cloud/oauth_client.cc b/tensorflow/core/platform/cloud/oauth_client.cc
index 9b85cae9b90eabfd303ee465ac90e9121c7285cf..a8657359a3561d84b37a47a2696641e869ed567a 100644
--- a/tensorflow/core/platform/cloud/oauth_client.cc
+++ b/tensorflow/core/platform/cloud/oauth_client.cc
@@ -95,6 +95,11 @@ Status CreateSignature(RSA* private_key, StringPiece to_sign,
   if (!md) {
     return errors::Internal("Could not get a sha256 encryptor.");
   }
+
+  // EVP_MD_CTX_destroy is renamed to EVP_MD_CTX_free in OpenSSL 1.1.0 but
+  // the old name is still retained as a compatibility macro.
+  // Keep this around until support is dropped for OpenSSL 1.0
+  // https://www.openssl.org/news/cl110.txt
   std::unique_ptr<EVP_MD_CTX, std::function<void(EVP_MD_CTX*)>> md_ctx(
       EVP_MD_CTX_create(), [](EVP_MD_CTX* ptr) { EVP_MD_CTX_destroy(ptr); });
   if (!md_ctx) {
@@ -119,7 +124,6 @@ Status CreateSignature(RSA* private_key, StringPiece to_sign,
   if (EVP_DigestSignFinal(md_ctx.get(), sig.get(), &sig_len) != 1) {
     return errors::Internal("DigestFinal (signature compute) failed.");
   }
-  EVP_MD_CTX_cleanup(md_ctx.get());
   return Base64Encode(StringPiece(reinterpret_cast<char*>(sig.get()), sig_len),
                       signature);
 }
diff --git a/tensorflow/core/platform/cloud/oauth_client_test.cc b/tensorflow/core/platform/cloud/oauth_client_test.cc
index 1cd0641cd3a7dd8376a365f243d63cbfc6b177c2..ce3b9d79c8b12c85a47b5ee6a773f9fadccb2127 100644
--- a/tensorflow/core/platform/cloud/oauth_client_test.cc
+++ b/tensorflow/core/platform/cloud/oauth_client_test.cc
@@ -166,7 +166,6 @@ TEST(OAuthClientTest, GetTokenFromServiceAccountJson) {
                 const_cast<unsigned char*>(
                     reinterpret_cast<const unsigned char*>(signature.data())),
                 signature.size()));
-  EVP_MD_CTX_cleanup(md_ctx);
 
   // Free all the crypto-related resources.
   EVP_PKEY_free(key);
diff --git a/tensorflow/core/platform/cloud/retrying_file_system.h b/tensorflow/core/platform/cloud/retrying_file_system.h
index 5ce6670dc7a561a0975f7444160efb86079e6867..9659edd890efe0a2c84da62b859162a1a3c2229c 100644
--- a/tensorflow/core/platform/cloud/retrying_file_system.h
+++ b/tensorflow/core/platform/cloud/retrying_file_system.h
@@ -150,6 +150,10 @@ class RetryingRandomAccessFile : public RandomAccessFile {
                            const RetryConfig& retry_config)
       : base_file_(std::move(base_file)), retry_config_(retry_config) {}
 
+  Status Name(StringPiece* result) const override {
+    return base_file_->Name(result);
+  }
+
   Status Read(uint64 offset, size_t n, StringPiece* result,
               char* scratch) const override {
     return RetryingUtils::CallWithRetries(
@@ -187,10 +191,18 @@ class RetryingWritableFile : public WritableFile {
     return RetryingUtils::CallWithRetries(
         [this]() { return base_file_->Flush(); }, retry_config_);
   }
+  Status Name(StringPiece* result) const override {
+    return base_file_->Name(result);
+  }
   Status Sync() override {
     return RetryingUtils::CallWithRetries(
         [this]() { return base_file_->Sync(); }, retry_config_);
   }
+  Status Tell(int64* position) override {
+    return RetryingUtils::CallWithRetries(
+        [this, &position]() { return base_file_->Tell(position); },
+        retry_config_);
+  }
 
  private:
   std::unique_ptr<WritableFile> base_file_;
diff --git a/tensorflow/core/platform/cloud/retrying_file_system_test.cc b/tensorflow/core/platform/cloud/retrying_file_system_test.cc
index 868eea096c2157654630d26eaa22b755167fe840..8a0b865499befb1d984babbe9f8e9176625d0321 100644
--- a/tensorflow/core/platform/cloud/retrying_file_system_test.cc
+++ b/tensorflow/core/platform/cloud/retrying_file_system_test.cc
@@ -60,6 +60,9 @@ class MockCallSequence {
 class MockRandomAccessFile : public RandomAccessFile {
  public:
   explicit MockRandomAccessFile(const ExpectedCalls& calls) : calls_(calls) {}
+  Status Name(StringPiece* result) const override {
+    return calls_.ConsumeNextCall("Name");
+  }
   Status Read(uint64 offset, size_t n, StringPiece* result,
               char* scratch) const override {
     return calls_.ConsumeNextCall("Read");
@@ -77,7 +80,13 @@ class MockWritableFile : public WritableFile {
   }
   Status Close() override { return calls_.ConsumeNextCall("Close"); }
   Status Flush() override { return calls_.ConsumeNextCall("Flush"); }
+  Status Name(StringPiece* result) const override {
+    return calls_.ConsumeNextCall("Name");
+  }
   Status Sync() override { return calls_.ConsumeNextCall("Sync"); }
+  Status Tell(int64* position) override {
+    return calls_.ConsumeNextCall("Tell");
+  }
 
  private:
   mutable MockCallSequence calls_;
@@ -174,7 +183,8 @@ class MockFileSystem : public FileSystem {
 
 TEST(RetryingFileSystemTest, NewRandomAccessFile_ImmediateSuccess) {
   // Configure the mock base random access file.
-  ExpectedCalls expected_file_calls({std::make_tuple("Read", Status::OK())});
+  ExpectedCalls expected_file_calls({std::make_tuple("Name", Status::OK()),
+                                     std::make_tuple("Read", Status::OK())});
   std::unique_ptr<RandomAccessFile> base_file(
       new MockRandomAccessFile(expected_file_calls));
 
@@ -193,6 +203,9 @@ TEST(RetryingFileSystemTest, NewRandomAccessFile_ImmediateSuccess) {
 
   // Use it and check the results.
   StringPiece result;
+  TF_EXPECT_OK(random_access_file->Name(&result));
+  EXPECT_EQ(result, "");
+
   char scratch[10];
   TF_EXPECT_OK(random_access_file->Read(0, 10, &result, scratch));
 }
@@ -284,7 +297,8 @@ TEST(RetryingFileSystemTest, NewRandomAccessFile_NoRetriesForSomeErrors) {
 
 TEST(RetryingFileSystemTest, NewWritableFile_ImmediateSuccess) {
   // Configure the mock base random access file.
-  ExpectedCalls expected_file_calls({std::make_tuple("Sync", Status::OK()),
+  ExpectedCalls expected_file_calls({std::make_tuple("Name", Status::OK()),
+                                     std::make_tuple("Sync", Status::OK()),
                                      std::make_tuple("Close", Status::OK())});
   std::unique_ptr<WritableFile> base_file(
       new MockWritableFile(expected_file_calls));
@@ -302,6 +316,10 @@ TEST(RetryingFileSystemTest, NewWritableFile_ImmediateSuccess) {
   std::unique_ptr<WritableFile> writable_file;
   TF_EXPECT_OK(fs.NewWritableFile("filename.txt", &writable_file));
 
+  StringPiece result;
+  TF_EXPECT_OK(writable_file->Name(&result));
+  EXPECT_EQ(result, "");
+
   // Use it and check the results.
   TF_EXPECT_OK(writable_file->Sync());
 }
diff --git a/tensorflow/core/platform/cpu_feature_guard.cc b/tensorflow/core/platform/cpu_feature_guard.cc
index 2efe0c0876e871f6752bb3e7724de4c505102130..38fc453008fcc9b4d59e44591c42ad83df061e70 100644
--- a/tensorflow/core/platform/cpu_feature_guard.cc
+++ b/tensorflow/core/platform/cpu_feature_guard.cc
@@ -138,8 +138,16 @@ void InfoAboutUnusedCPUFeatures() {
 #endif  // __FMA__
 #endif  // else of if defined(_MSC_VER) && !defined(__clang__)
     if (!missing_instructions.empty()) {
+#ifndef INTEL_MKL
       LOG(INFO) << "Your CPU supports instructions that this TensorFlow "
                 << "binary was not compiled to use:" << missing_instructions;
+#else
+      LOG(INFO) << "This TensorFlow binary is optimized with Intel(R) MKL-DNN "
+                << "to use the following CPU instructions in performance "
+                << "critical operations: " << missing_instructions << std::endl
+                << "To enable them in non-MKL-DNN operations, rebuild "
+                << "TensorFlow with the appropriate compiler flags.";
+#endif
     }
   });
 }
diff --git a/tensorflow/core/platform/cpu_info.h b/tensorflow/core/platform/cpu_info.h
index 6eba83224a4b861f7b4a469d82116ef63d4814d9..c9208cc75536732b9274440a4e5e48b51ffeb4e3 100644
--- a/tensorflow/core/platform/cpu_info.h
+++ b/tensorflow/core/platform/cpu_info.h
@@ -32,9 +32,22 @@ namespace port {
 // Returns an estimate of the number of schedulable CPUs for this
 // process.  Usually, it's constant throughout the lifetime of a
 // process, but it might change if the underlying cluster management
-// software can change it dynamically.
+// software can change it dynamically.  If the underlying call fails, a default
+// value (e.g. `4`) may be returned.
 int NumSchedulableCPUs();
 
+// Returns the total number of CPUs on the system.  This number should
+// not change even if the underlying cluster management software may
+// change the number of schedulable CPUs.  Unlike `NumSchedulableCPUs`, if the
+// underlying call fails, an invalid value of -1 will be returned;
+// the user must check for validity.
+static constexpr int kUnknownCPU = -1;
+int NumTotalCPUs();
+
+// Returns the id of the current CPU.  Returns -1 if the current CPU cannot be
+// identified.  If successful, the return value will be in [0, NumTotalCPUs()).
+int GetCurrentCPU();
+
 // Returns an estimate of the number of hyperthreads per physical core
 // on the CPU
 int NumHyperthreadsPerCore();
diff --git a/tensorflow/core/platform/cuda_libdevice_path.h b/tensorflow/core/platform/cuda_libdevice_path.h
index 6ef565ecd3c6460791b49a25fd4277e9393cfdd0..f2dbff9043a77dc8766092e89d29f642dd443966 100644
--- a/tensorflow/core/platform/cuda_libdevice_path.h
+++ b/tensorflow/core/platform/cuda_libdevice_path.h
@@ -16,16 +16,14 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_PLATFORM_CUDA_LIBDEVICE_PATH_H_
 #define TENSORFLOW_CORE_PLATFORM_CUDA_LIBDEVICE_PATH_H_
 
+#include <vector>
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 
-// Returns the root directory of the CUDA SDK, which contains sub-folders such
-// as bin, lib64, and nvvm.
-string CudaRoot();
-
-// Returns the directory that contains nvvm libdevice files in the CUDA SDK.
-string LibdeviceRoot();
+// Returns, in order of preference, potential locations of the root directory of
+// the CUDA SDK, which contains sub-folders such as bin, lib64, and nvvm.
+std::vector<string> CandidateCudaRoots();
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/platform/cuda_libdevice_path_test.cc b/tensorflow/core/platform/cuda_libdevice_path_test.cc
deleted file mode 100644
index 2d34239a9958d722a1cb84213657ca8229ebaf2c..0000000000000000000000000000000000000000
--- a/tensorflow/core/platform/cuda_libdevice_path_test.cc
+++ /dev/null
@@ -1,35 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/platform/cuda_libdevice_path.h"
-
-#include "tensorflow/core/lib/core/status_test_util.h"
-#include "tensorflow/core/lib/io/path.h"
-#include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/test.h"
-
-namespace tensorflow {
-
-#if GOOGLE_CUDA
-TEST(CudaLibdevicePathTest, LibdevicePath) {
-  VLOG(2) << "Libdevice root = " << LibdeviceRoot();
-  std::vector<string> libdevice_files;
-  TF_EXPECT_OK(Env::Default()->GetMatchingPaths(
-      io::JoinPath(LibdeviceRoot(), "libdevice.*.bc"), &libdevice_files));
-  EXPECT_LT(0, libdevice_files.size());
-}
-#endif
-
-}  // namespace tensorflow
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index 04287151301dd0c6eb25ec7bc8b12a207f44ab90..f9ac4ff0bca29acb36fe43dc3d3062eebfbee21d 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -6,6 +6,7 @@ load("//tensorflow:tensorflow.bzl", "if_windows")
 load("//tensorflow:tensorflow.bzl", "if_not_windows")
 load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
+load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm")
 load(
     "//third_party/mkl:build_defs.bzl",
     "if_mkl_ml",
@@ -34,6 +35,7 @@ def pyx_library(
         deps = [],
         py_deps = [],
         srcs = [],
+        testonly = None,
         **kwargs):
     """Compiles a group of .pyx / .pxd / .py files.
 
@@ -75,6 +77,7 @@ def pyx_library(
             # Optionally use PYTHON_BIN_PATH on Linux platforms so that python 3
             # works. Windows has issues with cython_binary so skip PYTHON_BIN_PATH.
             cmd = "PYTHONHASHSEED=0 $(location @cython//:cython_binary) --cplus $(SRCS) --output-file $(OUTS)",
+            testonly = testonly,
             tools = ["@cython//:cython_binary"] + pxd_srcs,
         )
 
@@ -85,8 +88,9 @@ def pyx_library(
         native.cc_binary(
             name = shared_object_name,
             srcs = [stem + ".cpp"],
-            deps = deps + ["//third_party/python_runtime:headers"],
+            deps = deps + ["@org_tensorflow//third_party/python_runtime:headers"],
             linkshared = 1,
+            testonly = testonly,
         )
         shared_objects.append(shared_object_name)
 
@@ -97,6 +101,7 @@ def pyx_library(
         deps = py_deps,
         srcs_version = "PY2AND3",
         data = shared_objects,
+        testonly = testonly,
         **kwargs
     )
 
@@ -525,27 +530,18 @@ def tf_additional_proto_hdrs():
     return [
         "platform/default/integral_types.h",
         "platform/default/logging.h",
-        "platform/default/protobuf.h",
     ] + if_windows([
         "platform/windows/integral_types.h",
     ])
 
-def tf_additional_proto_compiler_hdrs():
-    return [
-        "platform/default/protobuf_compiler.h",
-    ]
-
 def tf_additional_proto_srcs():
     return [
-        "platform/default/protobuf.cc",
+        "platform/protobuf.cc",
     ]
 
 def tf_additional_human_readable_json_deps():
     return []
 
-def tf_additional_logger_deps():
-    return []
-
 def tf_additional_all_protos():
     return ["//tensorflow/core:protos_all"]
 
@@ -558,6 +554,15 @@ def tf_protos_all():
         otherwise = ["//tensorflow/core:protos_all_cc"],
     )
 
+def tf_profiler_all_protos():
+    return ["//tensorflow/core/profiler:protos_all"]
+
+def tf_grpc_service_all():
+    return [
+        "//tensorflow/core/profiler:profiler_analysis_proto_cc",
+        "//tensorflow/core/profiler:profiler_service_proto_cc",
+    ]
+
 def tf_protos_grappler_impl():
     return ["//tensorflow/core/grappler/costs:op_performance_data_cc_impl"]
 
@@ -577,7 +582,10 @@ def tf_additional_device_tracer_cuda_deps():
     return []
 
 def tf_additional_device_tracer_deps():
-    return []
+    return [
+        "//tensorflow/core/profiler/lib:traceme",
+        "//tensorflow/core/profiler/internal/cpu:host_tracer",
+    ]
 
 def tf_additional_device_tracer_test_flags():
     return []
@@ -662,6 +670,8 @@ def tf_additional_cloud_op_deps():
         "//tensorflow:ios": [],
         "//tensorflow:linux_s390x": [],
         "//tensorflow:windows": [],
+        "//tensorflow:api_version_2": [],
+        "//tensorflow:windows_and_api_version_2": [],
         "//tensorflow:no_gcp_support": [],
         "//conditions:default": [
             "//tensorflow/contrib/cloud:bigquery_reader_ops_op_lib",
@@ -669,13 +679,15 @@ def tf_additional_cloud_op_deps():
         ],
     })
 
-# TODO(jart, jhseu): Delete when GCP is default on.
+# TODO(jhseu): Delete when GCP is default on.
 def tf_additional_cloud_kernel_deps():
     return select({
         "//tensorflow:android": [],
         "//tensorflow:ios": [],
         "//tensorflow:linux_s390x": [],
         "//tensorflow:windows": [],
+        "//tensorflow:api_version_2": [],
+        "//tensorflow:windows_and_api_version_2": [],
         "//tensorflow:no_gcp_support": [],
         "//conditions:default": [
             "//tensorflow/contrib/cloud/kernels:bigquery_reader_ops",
@@ -728,7 +740,11 @@ def tf_additional_binary_deps():
     return ["@nsync//:nsync_cpp"] + if_cuda(
         [
             "//tensorflow/stream_executor:cuda_platform",
-            "//tensorflow/core/platform/default/build_config:cuda",
+        ],
+    ) + if_rocm(
+        [
+            "//tensorflow/stream_executor:rocm_platform",
+            "//tensorflow/core/platform/default/build_config:rocm",
         ],
     ) + [
         # TODO(allenl): Split these out into their own shared objects (they are
diff --git a/tensorflow/core/platform/default/build_config/BUILD b/tensorflow/core/platform/default/build_config/BUILD
index da1f66dc6763121819fe443066acc40c1d5fa79d..845fe0ec047f64f38a6b36e41d28f8d714384b76 100644
--- a/tensorflow/core/platform/default/build_config/BUILD
+++ b/tensorflow/core/platform/default/build_config/BUILD
@@ -7,7 +7,9 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
+load("//tensorflow:tensorflow.bzl", "check_deps")
 load("//tensorflow:tensorflow.bzl", "if_cuda")
+load("//tensorflow:tensorflow.bzl", "if_rocm")
 load("//tensorflow:tensorflow.bzl", "tf_copts")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_library")
 load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
@@ -32,16 +34,26 @@ cc_library(
 
 tf_cuda_library(
     name = "stream_executor",
+    cuda_deps = ["//tensorflow/stream_executor/cuda:cuda_activation"],
     deps = [
         "//tensorflow/stream_executor",
+        "//tensorflow/stream_executor:dnn",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:multi_platform_manager",
+        "//tensorflow/stream_executor:scratch_allocator",
+        "//tensorflow/stream_executor/cuda:cuda_platform_id",
+        "//tensorflow/stream_executor/host:host_platform_id",
+        "//tensorflow/stream_executor/platform:dso_loader",
+        "//tensorflow/stream_executor/rocm:rocm_platform_id",
     ] + select({
-        "//tensorflow:using_cuda_clang": ["//tensorflow/stream_executor:cuda_platform"],
-        "//tensorflow:using_cuda_nvcc": ["//tensorflow/stream_executor:cuda_platform"],
-        "//tensorflow:using_cuda_clang_with_dynamic_build": [],
-        "//tensorflow:using_cuda_nvcc_with_dynamic_build": [],
+        "@local_config_cuda//cuda:darwin": ["IOKit"],
         "//conditions:default": [],
     }) + select({
-        "@local_config_cuda//cuda:darwin": ["IOKit"],
+        "//tensorflow:using_cuda_clang": ["//tensorflow/stream_executor/cuda:all_runtime"],
+        "//tensorflow:using_cuda_nvcc": ["//tensorflow/stream_executor/cuda:all_runtime"],
+        "//tensorflow:using_cuda_clang_with_dynamic_build": [],
+        "//tensorflow:using_cuda_nvcc_with_dynamic_build": [],
+        "//tensorflow:using_rocm_hipcc": ["//tensorflow/stream_executor/rocm:all_runtime"],
         "//conditions:default": [],
     }),
 )
@@ -49,19 +61,41 @@ tf_cuda_library(
 cc_library(
     name = "stream_executor_cuda",
     deps = [
-        "//tensorflow/stream_executor",
+        ":stream_executor_no_cuda",
+        ":cuda",
     ] + if_static(
-        ["//tensorflow/stream_executor:cuda_platform"],
+        ["//tensorflow/stream_executor/cuda:all_runtime"],
     ) + select({
         "@local_config_cuda//cuda:darwin": ["IOKit"],
         "//conditions:default": [],
     }),
 )
 
+cc_library(
+    name = "stream_executor_rocm",
+    deps = [
+        ":stream_executor_no_cuda",
+        ":rocm",
+    ] + if_static(
+        ["//tensorflow/stream_executor/rocm:all_runtime"],
+    ) + select({
+        "//conditions:default": [],
+    }),
+)
+
 cc_library(
     name = "stream_executor_no_cuda",
     deps = [
         "//tensorflow/stream_executor",
+        "//tensorflow/stream_executor:dnn",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:multi_platform_manager",
+        "//tensorflow/stream_executor:scratch_allocator",
+        "//tensorflow/stream_executor/cuda:cuda_platform_id",
+        "//tensorflow/stream_executor/host:host_platform",
+        "//tensorflow/stream_executor/host:host_platform_id",
+        "//tensorflow/stream_executor/platform:dso_loader",
+        "//tensorflow/stream_executor/rocm:rocm_platform_id",
     ],
 )
 
@@ -250,6 +284,31 @@ cc_library(
     ],
 )
 
+# Check that libtensorflow_framework.so does not depend on cuda shared libraries.
+check_deps(
+    name = "libtensorflow_cuda_check_deps",
+    disallowed_deps = [
+        ":cuda",
+        "@local_config_cuda//cuda:cublas",
+        "@local_config_cuda//cuda:cuda_driver",
+        "@local_config_cuda//cuda:cudnn",
+        "@local_config_cuda//cuda:curand",
+        "@local_config_cuda//cuda:cusolver",
+    ],
+    deps = ["//tensorflow:libtensorflow_framework.so"],
+)
+
+cc_library(
+    name = "rocm",
+    data = [],
+    linkopts = select({
+        "//conditions:default": [
+            "-Wl,-rpath,../local_config_rocm/rocm/rocm/lib",
+        ],
+    }),
+    deps = [],
+)
+
 cc_library(
     name = "sycl",
     data = if_ccpp([
diff --git a/tensorflow/core/platform/default/build_config_root.bzl b/tensorflow/core/platform/default/build_config_root.bzl
index 37475feebe2047f81ec60aea677cfcb0be73a08b..ab05b25d6822c12d82d14f6d5c4717d77c27f8e5 100644
--- a/tensorflow/core/platform/default/build_config_root.bzl
+++ b/tensorflow/core/platform/default/build_config_root.bzl
@@ -2,11 +2,19 @@
 # The functions in this file might be referred by tensorflow.bzl. They have to
 # be separate to avoid cyclic references.
 
+load("@local_config_remote_execution//:remote_execution.bzl", "gpu_test_tags")
+
 def tf_cuda_tests_tags():
-    return ["requires-gpu", "local", "gpu"]
+    return ["requires-gpu", "gpu"] + gpu_test_tags()
 
 def tf_sycl_tests_tags():
-    return ["requires-gpu", "local", "gpu"]
+    return ["requires-gpu", "gpu"] + gpu_test_tags()
+
+def tf_exec_compatible_with(kwargs):
+    if ("tags" in kwargs and kwargs["tags"] != None and
+        "remote-gpu" in kwargs["tags"]):
+        return ["@org_tensorflow//third_party/toolchains:gpu_test"]
+    return []
 
 def tf_additional_plugin_deps():
     return select({
diff --git a/tensorflow/core/platform/default/cuda_libdevice_path.cc b/tensorflow/core/platform/default/cuda_libdevice_path.cc
index 20ee3ad621a0688013802c37184aca1342dbe45e..a8b2e7202ac79d821d88b711d1476a1893a6e5fa 100644
--- a/tensorflow/core/platform/default/cuda_libdevice_path.cc
+++ b/tensorflow/core/platform/default/cuda_libdevice_path.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/platform/cuda_libdevice_path.h"
 
 #include <stdlib.h>
+#include <vector>
 
 #if !defined(PLATFORM_GOOGLE)
 #include "cuda/cuda_config.h"
@@ -24,9 +25,9 @@ limitations under the License.
 
 namespace tensorflow {
 
-string CudaRoot() {
+std::vector<string> CandidateCudaRoots() {
   VLOG(3) << "CUDA root = " << TF_CUDA_TOOLKIT_PATH;
-  return TF_CUDA_TOOLKIT_PATH;
+  return {TF_CUDA_TOOLKIT_PATH};
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/default/device_tracer.cc b/tensorflow/core/platform/default/device_tracer.cc
index 8351362e05699c591b5563f2270928f4408077e8..ffcb38fdcd227840372923325bcb9e3a0733cb55 100644
--- a/tensorflow/core/platform/default/device_tracer.cc
+++ b/tensorflow/core/platform/default/device_tracer.cc
@@ -31,6 +31,8 @@ limitations under the License.
 #include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/tracing.h"
+#include "tensorflow/core/profiler/internal/cpu/host_tracer.h"
+#include "tensorflow/core/profiler/lib/traceme.h"
 
 namespace {
 
@@ -299,6 +301,14 @@ TF_STATIC_THREAD_LOCAL_POD(const char *, tls_current_annotation);
 
 class TraceCollectorImpl : public tracing::TraceCollector {
  public:
+  class ActivityHandle : public Handle {
+   public:
+    ActivityHandle(string &&name, int level)
+        : trace_me_(std::move(name), level) {}
+
+   private:
+    profiler::TraceMe trace_me_;
+  };
   TraceCollectorImpl() { tracing::SetTraceCollector(this); }
 
   ~TraceCollectorImpl() override {
@@ -318,14 +328,16 @@ class TraceCollectorImpl : public tracing::TraceCollector {
       }
       ~Impl() override { tls_current_annotation.get() = nullptr; }
     };
-    return std::unique_ptr<Handle>(
-        new Impl{ConcatenateNames(name_part1, name_part2)});
+    return absl::make_unique<Impl>(ConcatenateNames(name_part1, name_part2));
   }
 
-  virtual std::unique_ptr<Handle> CreateActivityHandle(StringPiece, StringPiece,
-                                                       bool) const {
-    // We don't do anything with 'Activities' yet.
-    return nullptr;
+  virtual std::unique_ptr<Handle> CreateActivityHandle(
+      StringPiece name_part1, StringPiece name_part2, bool is_expensive) const {
+    if (!IsEnabledForActivities(is_expensive)) {
+      return nullptr;
+    }
+    return absl::make_unique<ActivityHandle>(
+        ConcatenateNames(name_part1, name_part2), GetLevel(is_expensive));
   }
 
   bool IsEnabledForAnnotations() const override {
@@ -333,8 +345,7 @@ class TraceCollectorImpl : public tracing::TraceCollector {
   }
 
   bool IsEnabledForActivities(bool is_expensive) const override {
-    // We don't do anything with 'Activities' so we are never 'enabled'.
-    return false;
+    return profiler::TraceMeRecorder::Active(GetLevel(is_expensive));
   }
 
   void Start() {
@@ -349,6 +360,10 @@ class TraceCollectorImpl : public tracing::TraceCollector {
   }
 
  private:
+  static int GetLevel(bool is_expensive) {
+    return profiler::GetTFTraceMeLevel(is_expensive);
+  }
+
   std::atomic<bool> active_trace_session_;
 };
 
@@ -421,6 +436,7 @@ class DeviceTracerImpl : public DeviceTracer, public CUPTIClient {
   int64 end_walltime_us_ GUARDED_BY(mu_);
   uint64_t start_timestamp_ GUARDED_BY(mu_);
   uint64_t end_timestamp_ GUARDED_BY(mu_);
+  std::unique_ptr<profiler::cpu::HostTracer> host_tracer_ GUARDED_BY(mu_);
 
   TF_DISALLOW_COPY_AND_ASSIGN(DeviceTracerImpl);
 };
@@ -429,6 +445,7 @@ DeviceTracerImpl::DeviceTracerImpl(CUPTIManager *cupti_manager)
     : cupti_manager_(cupti_manager) {
   VLOG(1) << "DeviceTracer created.";
   cupti_wrapper_.reset(new perftools::gputools::profiler::CuptiWrapper());
+  host_tracer_ = profiler::cpu::HostTracer::Create(2);
   enabled_ = false;
 }
 
@@ -493,6 +510,7 @@ Status DeviceTracerImpl::Start() {
 
   CUPTI_CALL(GetTimestamp(&start_timestamp_));
   start_walltime_us_ = NowInUsec();
+  host_tracer_->Start().IgnoreError();
   enabled_ = true;
   return Status::OK();
 }
@@ -510,6 +528,7 @@ Status DeviceTracerImpl::Stop() {
   end_walltime_us_ = NowInUsec();
   CUPTI_CALL(GetTimestamp(&end_timestamp_));
   enabled_ = false;
+  host_tracer_->Stop().IgnoreError();
   return Status::OK();
 }
 
@@ -676,6 +695,8 @@ Status DeviceTracerImpl::Collect(StepStatsCollector *collector) {
     collector->Save(memcpy_device, ns);
     collector->Save(strings::StrCat(stream_device, rec.stream_id), nscopy);
   }
+
+  host_tracer_->CollectDataToCollector(collector).IgnoreError();
   return Status::OK();
 }
 
diff --git a/tensorflow/core/platform/default/distribute.bzl b/tensorflow/core/platform/default/distribute.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..ea8fa8708e48efec42499400807d931a6cd11b10
--- /dev/null
+++ b/tensorflow/core/platform/default/distribute.bzl
@@ -0,0 +1,41 @@
+"""Build rules for tf.distritbute testing."""
+
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+
+def distribute_py_test(
+        name,
+        srcs = [],
+        deps = [],
+        tags = [],
+        data = [],
+        main = None,
+        args = [],
+        shard_count = 1,
+        full_precision = False,
+        **kwargs):
+    """Generates py_test targets for CPU and GPU.
+
+    Args:
+        name: test target name to generate suffixed with `test`.
+        srcs: source files for the tests.
+        deps: additional dependencies for the test targets.
+        tags: tags to be assigned to the different test targets.
+        data: data files that need to be associated with the target files.
+        main: optional main script.
+        args: arguments to the tests.
+        shard_count: number of shards to split the tests across.
+        **kwargs: extra keyword arguments to the test.
+    """
+
+    _ignore = (full_precision)
+    cuda_py_test(
+        name = name,
+        srcs = srcs,
+        data = data,
+        main = main,
+        additional_deps = deps,
+        shard_count = shard_count,
+        tags = tags,
+        args = args,
+        **kwargs
+    )
diff --git a/tensorflow/core/platform/default/gpu/cupti_wrapper.cc b/tensorflow/core/platform/default/gpu/cupti_wrapper.cc
index 7ac5e5c4450708a486be956a5806e31b8dd36fa3..481bbf9bae1b1821571b25f15a6e34d5c6914619 100644
--- a/tensorflow/core/platform/default/gpu/cupti_wrapper.cc
+++ b/tensorflow/core/platform/default/gpu/cupti_wrapper.cc
@@ -28,27 +28,26 @@ namespace profiler {
 
 namespace dynload {
 
-#define LIBCUPTI_WRAP(__name)                                                 \
-  struct DynLoadShim__##__name {                                              \
-    static const char* kName;                                                 \
-    using FuncPointerT = std::add_pointer<decltype(::__name)>::type;          \
-    static void* GetDsoHandle() {                                             \
-      static auto status =                                                    \
-          stream_executor::internal::CachedDsoLoader::GetLibcuptiDsoHandle(); \
-      return status.ValueOrDie();                                             \
-    }                                                                         \
-    static FuncPointerT DynLoad() {                                           \
-      static void* f;                                                         \
-      TF_CHECK_OK(::tensorflow::Env::Default()->GetSymbolFromLibrary(         \
-          GetDsoHandle(), kName, &f))                                         \
-          << "could not find " << kName << "in libcupti DSO";                 \
-      return reinterpret_cast<FuncPointerT>(f);                               \
-    }                                                                         \
-    template <typename... Args>                                               \
-    CUptiResult operator()(Args... args) {                                    \
-      return DynLoad()(args...);                                              \
-    }                                                                         \
-  } __name;                                                                   \
+#define LIBCUPTI_WRAP(__name)                                                \
+  struct DynLoadShim__##__name {                                             \
+    static const char* kName;                                                \
+    using FuncPointerT = std::add_pointer<decltype(::__name)>::type;         \
+    template <typename... Args>                                              \
+    CUptiResult operator()(Args... args) {                                   \
+      static auto fn = []() -> FuncPointerT {                                \
+        auto handle_or =                                                     \
+            stream_executor::internal::CachedDsoLoader::GetCuptiDsoHandle(); \
+        if (!handle_or.ok()) return nullptr;                                 \
+        void* symbol;                                                        \
+        stream_executor::port::Env::Default()                                \
+            ->GetSymbolFromLibrary(handle_or.ValueOrDie(), kName, &symbol)   \
+            .IgnoreError();                                                  \
+        return reinterpret_cast<FuncPointerT>(symbol);                       \
+      }();                                                                   \
+      if (fn == nullptr) return CUPTI_ERROR_UNKNOWN;                         \
+      return fn(args...);                                                    \
+    }                                                                        \
+  } __name;                                                                  \
   const char* DynLoadShim__##__name::kName = #__name;
 
 LIBCUPTI_WRAP(cuptiActivityDisable);
diff --git a/tensorflow/core/platform/default/human_readable_json.cc b/tensorflow/core/platform/default/human_readable_json.cc
index bf9c7b76206b79ad43969a1e3e2de6e6cbdacc46..977ff1272ea2a97e0b52b785b24560e02eb44207 100644
--- a/tensorflow/core/platform/default/human_readable_json.cc
+++ b/tensorflow/core/platform/default/human_readable_json.cc
@@ -46,7 +46,7 @@ Status HumanReadableJsonToProto(const string& str, protobuf::Message* proto) {
   return errors::Internal("Cannot parse JSON protos on Android");
 #else
   proto->Clear();
-  auto status = google::protobuf::util::JsonStringToMessage(str, proto);
+  auto status = protobuf::util::JsonStringToMessage(str, proto);
   if (!status.ok()) {
     // Convert error_msg google::protobuf::StringPiece to
     // tensorflow::StringPiece.
diff --git a/tensorflow/core/platform/default/logging.h b/tensorflow/core/platform/default/logging.h
index bb8735ed32505294eff75620006694a4eda80bcc..99dd6de14164e1f9abd915348ce288ed3238a650 100644
--- a/tensorflow/core/platform/default/logging.h
+++ b/tensorflow/core/platform/default/logging.h
@@ -240,8 +240,7 @@ string* MakeCheckOpString(const T1& v1, const T2& v2, const char* exprtext) {
     if (TF_PREDICT_FALSE(v2 < 0)) {                                       \
       return ::tensorflow::internal::MakeCheckOpString(v1, v2, exprtext); \
     }                                                                     \
-    const size_t uval = (size_t)((unsigned)v1);                           \
-    return name##Impl<size_t, size_t>(uval, v2, exprtext);                \
+    return name##Impl<size_t, size_t>(v1, v2, exprtext);                  \
   }                                                                       \
   inline string* name##Impl(const int v1, const size_t v2,                \
                             const char* exprtext) {                       \
diff --git a/tensorflow/core/platform/default/platform.bzl b/tensorflow/core/platform/default/platform.bzl
index 20ab441bf43e19277c697f98f289ba80d755af48..76bfaa896efa2f8d8f06814d6f69f7bf0b66ed33 100644
--- a/tensorflow/core/platform/default/platform.bzl
+++ b/tensorflow/core/platform/default/platform.bzl
@@ -5,55 +5,52 @@ CUDNN_VERSION = ""
 PLATFORM = ""
 
 def cuda_sdk_version():
-  return CUDA_VERSION
+    return CUDA_VERSION
 
 def cudnn_sdk_version():
-  return CUDNN_VERSION
+    return CUDNN_VERSION
 
 def cuda_library_path(name, version = cuda_sdk_version()):
-  if PLATFORM == "Darwin":
-    if not version:
-      return "lib/lib{}.dylib".format(name)
+    if PLATFORM == "Darwin":
+        if not version:
+            return "lib/lib{}.dylib".format(name)
+        else:
+            return "lib/lib{}.{}.dylib".format(name, version)
+    elif not version:
+        return "lib64/lib{}.so".format(name)
     else:
-      return "lib/lib{}.{}.dylib".format(name, version)
-  else:
-    if not version:
-      return "lib64/lib{}.so".format(name)
-    else:
-      return "lib64/lib{}.so.{}".format(name, version)
+        return "lib64/lib{}.so.{}".format(name, version)
 
 def cuda_static_library_path(name):
-  if PLATFORM == "Darwin":
-    return "lib/lib{}_static.a".format(name)
-  else:
-    return "lib64/lib{}_static.a".format(name)
+    if PLATFORM == "Darwin":
+        return "lib/lib{}_static.a".format(name)
+    else:
+        return "lib64/lib{}_static.a".format(name)
 
 def cudnn_library_path(version = cudnn_sdk_version()):
-  if PLATFORM == "Darwin":
-    if not version:
-      return "lib/libcudnn.dylib"
-    else:
-      return "lib/libcudnn.{}.dylib".format(version)
-  else:
-    if not version:
-      return "lib64/libcudnn.so"
+    if PLATFORM == "Darwin":
+        if not version:
+            return "lib/libcudnn.dylib"
+        else:
+            return "lib/libcudnn.{}.dylib".format(version)
+    elif not version:
+        return "lib64/libcudnn.so"
     else:
-      return "lib64/libcudnn.so.{}".format(version)
+        return "lib64/libcudnn.so.{}".format(version)
 
 def cupti_library_path(version = cuda_sdk_version()):
-  if PLATFORM == "Darwin":
-    if not version:
-      return "extras/CUPTI/lib/libcupti.dylib"
+    if PLATFORM == "Darwin":
+        if not version:
+            return "extras/CUPTI/lib/libcupti.dylib"
+        else:
+            return "extras/CUPTI/lib/libcupti.{}.dylib".format(version)
+    elif not version:
+        return "extras/CUPTI/lib64/libcupti.so"
     else:
-      return "extras/CUPTI/lib/libcupti.{}.dylib".format(version)
-  else:
-    if not version:
-      return "extras/CUPTI/lib64/libcupti.so"
-    else:
-      return "extras/CUPTI/lib64/libcupti.so.{}".format(version)
+        return "extras/CUPTI/lib64/libcupti.so.{}".format(version)
 
 def readlink_command():
-  if PLATFORM == "Darwin":
-    return "greadlink"
-  else:
-    return "readlink"
+    if PLATFORM == "Darwin":
+        return "greadlink"
+    else:
+        return "readlink"
diff --git a/tensorflow/core/platform/default/protobuf.h b/tensorflow/core/platform/default/protobuf.h
deleted file mode 100644
index 2708d6ebda41c01edd881e733b985e237aa3242a..0000000000000000000000000000000000000000
--- a/tensorflow/core/platform/default/protobuf.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CORE_PLATFORM_DEFAULT_PROTOBUF_H_
-#define TENSORFLOW_CORE_PLATFORM_DEFAULT_PROTOBUF_H_
-
-// IWYU pragma: private, include "third_party/tensorflow/core/platform/protobuf.h"
-// IWYU pragma: friend third_party/tensorflow/core/platform/protobuf.h
-
-#ifndef TENSORFLOW_LITE_PROTOS
-#include "google/protobuf/descriptor.h"
-#include "google/protobuf/descriptor.pb.h"
-#include "google/protobuf/dynamic_message.h"
-#include "google/protobuf/text_format.h"
-#include "google/protobuf/util/json_util.h"
-#include "google/protobuf/util/type_resolver_util.h"
-#endif
-
-#include "google/protobuf/arena.h"
-#include "google/protobuf/io/coded_stream.h"
-#include "google/protobuf/io/zero_copy_stream.h"
-#include "google/protobuf/io/zero_copy_stream_impl_lite.h"
-#include "google/protobuf/map.h"
-#include "google/protobuf/repeated_field.h"
-
-namespace tensorflow {
-namespace protobuf = ::google::protobuf;
-using protobuf_int64 = ::google::protobuf::int64;
-using protobuf_uint64 = ::google::protobuf::uint64;
-extern const char* kProtobufInt64Typename;
-extern const char* kProtobufUint64Typename;
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_CORE_PLATFORM_DEFAULT_PROTOBUF_H_
diff --git a/tensorflow/core/platform/default/protobuf_compiler.h b/tensorflow/core/platform/default/protobuf_compiler.h
deleted file mode 100644
index a93d7a184b21a1111764e0a7fc0765ebe877ce32..0000000000000000000000000000000000000000
--- a/tensorflow/core/platform/default/protobuf_compiler.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CORE_PLATFORM_DEFAULT_PROTOBUF_COMPILER_H_
-#define TENSORFLOW_CORE_PLATFORM_DEFAULT_PROTOBUF_COMPILER_H_
-
-// IWYU pragma: private, include "third_party/tensorflow/core/platform/protobuf_compiler.h"
-// IWYU pragma: friend third_party/tensorflow/core/platform/protobuf_compiler.h
-
-#include "google/protobuf/compiler/importer.h"
-#include "tensorflow/core/platform/default/protobuf.h"
-
-#endif  // TENSORFLOW_CORE_PLATFORM_DEFAULT_PROTOBUF_H_
diff --git a/tensorflow/core/platform/default/stacktrace.h b/tensorflow/core/platform/default/stacktrace.h
index c8e297fa8d8c1ee48b060e6e2c7ee89eb0d23b39..b64bc15971037f204a40513cbf74cc7c944e08f2 100644
--- a/tensorflow/core/platform/default/stacktrace.h
+++ b/tensorflow/core/platform/default/stacktrace.h
@@ -17,8 +17,8 @@ limitations under the License.
 #define TENSORFLOW_CORE_PLATFORM_DEFAULT_STACKTRACE_H_
 
 #include "tensorflow/core/platform/platform.h"
-#if !defined(IS_MOBILE_PLATFORM) && defined(PLATFORM_POSIX) && \
-    (defined(__clang__) || defined(__GNUC__))
+#if !defined(IS_MOBILE_PLATFORM) && !defined(PLATFORM_WINDOWS) && \
+    defined(PLATFORM_POSIX) && (defined(__clang__) || defined(__GNUC__))
 #define TF_GENERATE_BACKTRACE
 #endif
 
diff --git a/tensorflow/core/platform/default/string_coding.h b/tensorflow/core/platform/default/string_coding.h
deleted file mode 100644
index 70b8ab01444a6175f9c037e35fadc8196c781b19..0000000000000000000000000000000000000000
--- a/tensorflow/core/platform/default/string_coding.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CORE_PLATFORM_DEFAULT_STRING_CODING_H_
-#define TENSORFLOW_CORE_PLATFORM_DEFAULT_STRING_CODING_H_
-
-// IWYU pragma: private, include "third_party/tensorflow/core/platform/tensor_coding.h"
-// IWYU pragma: friend third_party/tensorflow/core/platform/tensor_coding.h
-
-#include "tensorflow/core/lib/core/coding.h"
-#include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/platform/protobuf.h"
-#include "tensorflow/core/platform/types.h"
-
-namespace tensorflow {
-namespace port {
-
-// Encodes sequences of strings and serialized protocol buffers into a string.
-// Normal usage consists of zero or more calls to Append() and a single call to
-// Finalize().
-class StringListEncoder {
- public:
-  explicit StringListEncoder(string* out) : out_(out) {}
-
-  // Encodes the given protocol buffer. This may not be called after Finalize().
-  void Append(const protobuf::MessageLite& m) {
-    core::PutVarint32(out_, m.ByteSize());
-    m.AppendToString(&rest_);
-  }
-
-  // Encodes the given string. This may not be called after Finalize().
-  void Append(const string& s) {
-    core::PutVarint32(out_, s.length());
-    strings::StrAppend(&rest_, s);
-  }
-
-  // Signals end of the encoding process. No other calls are allowed after this.
-  void Finalize() { strings::StrAppend(out_, rest_); }
-
- private:
-  string* out_;
-  string rest_;
-};
-
-// Decodes a string into sequences of strings (which may represent serialized
-// protocol buffers). Normal usage involves a single call to ReadSizes() in
-// order to retrieve the length of all the strings in the sequence. For each
-// size returned a call to Data() is expected and will return the actual
-// string.
-class StringListDecoder {
- public:
-  explicit StringListDecoder(const string& in) : reader_(in) {}
-
-  // Populates the given vector with the lengths of each string in the sequence
-  // being decoded. Upon returning the vector is guaranteed to contain as many
-  // elements as there are strings in the sequence.
-  bool ReadSizes(std::vector<uint32>* sizes) {
-    int64 total = 0;
-    for (auto& size : *sizes) {
-      if (!core::GetVarint32(&reader_, &size)) return false;
-      total += size;
-    }
-    if (total != static_cast<int64>(reader_.size())) {
-      return false;
-    }
-    return true;
-  }
-
-  // Returns a pointer to the next string in the sequence, then prepares for the
-  // next call by advancing 'size' characters in the sequence.
-  const char* Data(uint32 size) {
-    const char* data = reader_.data();
-    reader_.remove_prefix(size);
-    return data;
-  }
-
- private:
-  StringPiece reader_;
-};
-
-std::unique_ptr<StringListEncoder> NewStringListEncoder(string* out);
-std::unique_ptr<StringListDecoder> NewStringListDecoder(const string& in);
-
-}  // namespace port
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_CORE_PLATFORM_DEFAULT_STRING_CODING_H_
diff --git a/tensorflow/core/platform/env.cc b/tensorflow/core/platform/env.cc
index afc4201e5382194b02b8b0f5cdebfc90688c9f00..59768bf92ae9e854f684623ec15c83a70839312d 100644
--- a/tensorflow/core/platform/env.cc
+++ b/tensorflow/core/platform/env.cc
@@ -29,6 +29,9 @@ limitations under the License.
 #include "tensorflow/core/platform/windows/wide_char.h"
 #define PATH_MAX MAX_PATH
 #else
+#include <fcntl.h>
+#include <string.h>
+#include <sys/types.h>
 #include <unistd.h>
 #endif
 
@@ -314,7 +317,31 @@ string Env::GetExecutablePath() {
   string file_path = WideCharToUtf8(wc_file_path);
   std::copy(file_path.begin(), file_path.end(), exe_path);
 #else
-  CHECK_NE(-1, readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1));
+  char buf[PATH_MAX] = {0};
+  int path_length = readlink("/proc/self/exe", buf, sizeof(buf) - 1);
+  CHECK_NE(-1, path_length);
+
+  if (strstr(buf, "python") != nullptr) {
+    // Discard the path of the python binary, and any flags.
+    int fd = open("/proc/self/cmdline", O_RDONLY);
+    int cmd_length = read(fd, buf, PATH_MAX - 1);
+    CHECK_NE(-1, cmd_length);
+    int token_pos = 0;
+    for (bool token_is_first_or_flag = true; token_is_first_or_flag;) {
+      // Get token length, including null
+      int token_len = strlen(&buf[token_pos]) + 1;
+      token_is_first_or_flag = false;
+      // Check if we can skip without overshooting
+      if (token_pos + token_len < cmd_length) {
+        token_pos += token_len;
+        token_is_first_or_flag = (buf[token_pos] == '-');  // token is a flag
+      }
+    }
+    snprintf(exe_path, sizeof(exe_path), "%s", &buf[token_pos]);
+  } else {
+    snprintf(exe_path, sizeof(exe_path), "%s", buf);
+  }
+
 #endif
   // Make sure it's null-terminated:
   exe_path[sizeof(exe_path) - 1] = 0;
@@ -338,22 +365,10 @@ bool Env::LocalTempFilename(string* filename) {
 }
 
 bool Env::CreateUniqueFileName(string* prefix, const string& suffix) {
-#ifdef __APPLE__
-  uint64_t tid64;
-  pthread_threadid_np(nullptr, &tid64);
-  int32 tid = static_cast<int32>(tid64);
-  int32 pid = static_cast<int32>(getpid());
-#elif defined(__FreeBSD__)
-  // Has to be casted to long first, else this error appears:
-  // static_cast from 'pthread_t' (aka 'pthread *') to 'int32' (aka 'int')
-  // is not allowed
-  int32 tid = static_cast<int32>(static_cast<int64>(pthread_self()));
-  int32 pid = static_cast<int32>(getpid());
-#elif defined(PLATFORM_WINDOWS)
-  int32 tid = static_cast<int32>(GetCurrentThreadId());
+  int32 tid = GetCurrentThreadId();
+#ifdef PLATFORM_WINDOWS
   int32 pid = static_cast<int32>(GetCurrentProcessId());
 #else
-  int32 tid = static_cast<int32>(pthread_self());
   int32 pid = static_cast<int32>(getpid());
 #endif
   uint64 now_microsec = NowMicros();
diff --git a/tensorflow/core/platform/env.h b/tensorflow/core/platform/env.h
index 1b5382841574e6b8843079ae9cb359c5c9b475d0..280076e098d5fdd121bf095d79be5353c0e2b57f 100644
--- a/tensorflow/core/platform/env.h
+++ b/tensorflow/core/platform/env.h
@@ -271,6 +271,15 @@ class Env {
                               const string& name,
                               std::function<void()> fn) TF_MUST_USE_RESULT = 0;
 
+  // Returns the thread id of calling thread.
+  // Posix: Returns pthread id which is only guaranteed to be unique within a
+  //        process.
+  // Windows: Returns thread id which is unique.
+  virtual int32 GetCurrentThreadId() = 0;
+
+  // Copies current thread name to "name". Returns true if success.
+  virtual bool GetCurrentThreadName(string* name) = 0;
+
   // \brief Schedules the given closure on a thread-pool.
   //
   // NOTE(mrry): This closure may block.
@@ -360,6 +369,10 @@ class EnvWrapper : public Env {
                       std::function<void()> fn) override {
     return target_->StartThread(thread_options, name, fn);
   }
+  int32 GetCurrentThreadId() override { return target_->GetCurrentThreadId(); }
+  bool GetCurrentThreadName(string* name) override {
+    return target_->GetCurrentThreadName(name);
+  }
   void SchedClosure(std::function<void()> closure) override {
     target_->SchedClosure(closure);
   }
diff --git a/tensorflow/core/platform/env_test.cc b/tensorflow/core/platform/env_test.cc
index 2e32abdffb1d6e3f8f929fcec3d7eb70efb045aa..ea1f123424728ea4bec4855dbfc7300a96103eeb 100644
--- a/tensorflow/core/platform/env_test.cc
+++ b/tensorflow/core/platform/env_test.cc
@@ -356,6 +356,14 @@ TEST_F(DefaultEnvTest, LocalTempFilename) {
   TF_CHECK_OK(file_to_write->Close());
   TF_CHECK_OK(env->FileExists(filename));
 
+  // Open the file in append mode, check that Tell() reports the appropriate
+  // offset.
+  std::unique_ptr<WritableFile> file_to_append;
+  TF_CHECK_OK(env->NewAppendableFile(filename, &file_to_append));
+  int64 pos;
+  TF_CHECK_OK(file_to_append->Tell(&pos));
+  ASSERT_EQ(4, pos);
+
   // Read from the temporary file and check content.
   std::unique_ptr<RandomAccessFile> file_to_read;
   TF_CHECK_OK(env->NewRandomAccessFile(filename, &file_to_read));
@@ -384,4 +392,20 @@ TEST_F(DefaultEnvTest, CreateUniqueFileName) {
   EXPECT_TRUE(str_util::EndsWith(filename, suffix));
 }
 
+TEST_F(DefaultEnvTest, GetThreadInformation) {
+  Env* env = Env::Default();
+  // TODO(fishx): Turn on this test for Apple.
+#if !defined(__APPLE__)
+  EXPECT_NE(env->GetCurrentThreadId(), 0);
+#endif
+  string thread_name;
+  bool res = env->GetCurrentThreadName(&thread_name);
+#if defined(PLATFORM_WINDOWS) || defined(__ANDROID__)
+  EXPECT_FALSE(res);
+#elif !defined(__APPLE__)
+  EXPECT_TRUE(res);
+  EXPECT_GT(thread_name.size(), 0);
+#endif
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/env_time.h b/tensorflow/core/platform/env_time.h
index b4756ed209cf7f945a2cf4f1bea7271dded7518a..c12b6ba6fb86e7bda394b85fa449c8176c817054 100644
--- a/tensorflow/core/platform/env_time.h
+++ b/tensorflow/core/platform/env_time.h
@@ -25,6 +25,7 @@ namespace tensorflow {
 /// access timer related operations.
 class EnvTime {
  public:
+  static constexpr uint64 kMicrosToPicos = 1000ULL * 1000ULL;
   static constexpr uint64 kMicrosToNanos = 1000ULL;
   static constexpr uint64 kMillisToMicros = 1000ULL;
   static constexpr uint64 kMillisToNanos = 1000ULL * 1000ULL;
diff --git a/tensorflow/core/platform/fake_python_env_test.cc b/tensorflow/core/platform/fake_python_env_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b521db3c054bff0e324a3b0571e0af7f47c269c4
--- /dev/null
+++ b/tensorflow/core/platform/fake_python_env_test.cc
@@ -0,0 +1,65 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This file has "python" in its name. Thus, it should trigger the python
+// specific code paths.
+
+#include <sys/stat.h>
+#include <unistd.h>
+#include <iostream>
+#include <string>
+
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/test.h"
+
+int myargc;
+char** myargv;
+
+char kMagicBazelDirSubstring[] = ".runfiles/org_tensorflow";
+char kPythonFile[] =
+    "/some/path/to/pythontest.runfiles/org_tensorflow/stuff/to/run.py";
+
+namespace tensorflow {
+
+TEST(FakePythonEnvTest, GetExecutablePath) {
+  // See if argc is greater than 1 and first arg is kPythonFile
+  // If not, rerun the executable with proper args.
+  if (myargc <= 1 || strstr(myargv[1], kMagicBazelDirSubstring) == nullptr) {
+    const char* filename = myargv[0];
+    char* new_argv[] = {
+        myargv[0],
+        kPythonFile,
+        nullptr,
+    };
+
+    execv(filename, new_argv);
+  }
+
+  Env* env = Env::Default();
+  // We depend on the file/executable name to include python and fool the
+  // library to think this is running under the python interpreter.
+  string path = env->GetExecutablePath();
+  EXPECT_TRUE(strstr(path.c_str(), kMagicBazelDirSubstring) != nullptr);
+}
+
+}  // namespace tensorflow
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  myargc = argc;
+  myargv = argv;
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/core/platform/file_system.h b/tensorflow/core/platform/file_system.h
index c84a93b1bf59be7cb19352825cc4bb82b48e2246..a3a2b6c7f3c96f6ccfc8d3b4f81a4be123758bd4 100644
--- a/tensorflow/core/platform/file_system.h
+++ b/tensorflow/core/platform/file_system.h
@@ -234,6 +234,14 @@ class RandomAccessFile {
   RandomAccessFile() {}
   virtual ~RandomAccessFile();
 
+  /// \brief Returns the name of the file.
+  ///
+  /// This is an optional operation that may not be implemented by every
+  /// filesystem.
+  virtual Status Name(StringPiece* result) const {
+    return errors::Unimplemented("This filesystem does not support Name()");
+  }
+
   /// \brief Reads up to `n` bytes from the file starting at `offset`.
   ///
   /// `scratch[0..n-1]` may be written by this routine.  Sets `*result`
@@ -297,6 +305,14 @@ class WritableFile {
   /// persisted, depending on the implementation.
   virtual Status Flush() = 0;
 
+  // \brief Returns the name of the file.
+  ///
+  /// This is an optional operation that may not be implemented by every
+  /// filesystem.
+  virtual Status Name(StringPiece* result) const {
+    return errors::Unimplemented("This filesystem does not support Name()");
+  }
+
   /// \brief Syncs contents of file to filesystem.
   ///
   /// This waits for confirmation from the filesystem that the contents
@@ -305,6 +321,16 @@ class WritableFile {
   /// be properly saved.
   virtual Status Sync() = 0;
 
+  /// \brief Retrieves the current write position in the file, or -1 on
+  /// error.
+  ///
+  /// This is an optional operation, subclasses may choose to return
+  /// errors::Unimplemented.
+  virtual Status Tell(int64* position) {
+    *position = -1;
+    return errors::Unimplemented("This filesystem does not support Tell()");
+  }
+
  private:
   TF_DISALLOW_COPY_AND_ASSIGN(WritableFile);
 };
diff --git a/tensorflow/core/platform/grpc_services.h b/tensorflow/core/platform/grpc_services.h
new file mode 100644
index 0000000000000000000000000000000000000000..cd918193dc52881ea396142a7b0a8e3708cb427c
--- /dev/null
+++ b/tensorflow/core/platform/grpc_services.h
@@ -0,0 +1,33 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_PLATFORM_GRPC_SERVICES_H_
+#define TENSORFLOW_CORE_PLATFORM_GRPC_SERVICES_H_
+
+#include "tensorflow/core/profiler/profiler_analysis.grpc.pb.h"
+#include "tensorflow/core/profiler/profiler_service.grpc.pb.h"
+
+#if !defined(PLATFORM_GOOGLE)
+
+namespace tensorflow {
+namespace grpc {
+
+// Google internal GRPC generates services under namespace "tensorflow::grpc".
+// Creating aliases here to make sure we can access services under namespace
+// "tensorflow::grpc" both in google internal and open-source.
+using ::tensorflow::ProfileAnalysis;
+using ::tensorflow::ProfilerService;
+
+}  // namespace grpc
+}  // namespace tensorflow
+#endif
+
+#endif  // TENSORFLOW_CORE_PLATFORM_GRPC_SERVICES_H_
diff --git a/tensorflow/core/platform/hadoop/BUILD b/tensorflow/core/platform/hadoop/BUILD
index 7c38c399bd7a4645b3556e653110c19b8b9ab9ff..e04835f4f3e1dcf5ed7f4af26410f0616026f080 100644
--- a/tensorflow/core/platform/hadoop/BUILD
+++ b/tensorflow/core/platform/hadoop/BUILD
@@ -26,7 +26,7 @@ cc_library(
 
 # This test is set to manual because it requires downloading the Hadoop
 # distribution to run. To run this test:
-# 1. Ensure $JAVA_HOME is set.
+# 1. Ensure $JAVA_HOME is set to the location of a JDK 8 installation.
 # 2. Download the binary Hadoop distribution from:
 #    http://hadoop.apache.org/releases.html
 # 3. Extract the Hadoop distribution and run:
diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system.cc b/tensorflow/core/platform/hadoop/hadoop_file_system.cc
index eb35531e9f86dd1de61efdd65889134502ce4aff..65cb848ee2c7b5f09ec66b61abd1bde41cdc959e 100644
--- a/tensorflow/core/platform/hadoop/hadoop_file_system.cc
+++ b/tensorflow/core/platform/hadoop/hadoop_file_system.cc
@@ -59,13 +59,12 @@ class LibHDFS {
   std::function<hdfsBuilder*()> hdfsNewBuilder;
   std::function<void(hdfsBuilder*, const char*)> hdfsBuilderSetNameNode;
   std::function<int(const char*, char**)> hdfsConfGetStr;
-  std::function<void(hdfsBuilder*, const char* kerbTicketCachePath)>
-      hdfsBuilderSetKerbTicketCachePath;
   std::function<int(hdfsFS, hdfsFile)> hdfsCloseFile;
   std::function<tSize(hdfsFS, hdfsFile, tOffset, void*, tSize)> hdfsPread;
   std::function<tSize(hdfsFS, hdfsFile, const void*, tSize)> hdfsWrite;
   std::function<int(hdfsFS, hdfsFile)> hdfsHFlush;
   std::function<int(hdfsFS, hdfsFile)> hdfsHSync;
+  std::function<tOffset(hdfsFS, hdfsFile)> hdfsTell;
   std::function<hdfsFile(hdfsFS, const char*, int, int, short, tSize)>
       hdfsOpenFile;
   std::function<int(hdfsFS, const char*)> hdfsExists;
@@ -87,11 +86,11 @@ class LibHDFS {
       BIND_HDFS_FUNC(hdfsNewBuilder);
       BIND_HDFS_FUNC(hdfsBuilderSetNameNode);
       BIND_HDFS_FUNC(hdfsConfGetStr);
-      BIND_HDFS_FUNC(hdfsBuilderSetKerbTicketCachePath);
       BIND_HDFS_FUNC(hdfsCloseFile);
       BIND_HDFS_FUNC(hdfsPread);
       BIND_HDFS_FUNC(hdfsWrite);
       BIND_HDFS_FUNC(hdfsHFlush);
+      BIND_HDFS_FUNC(hdfsTell);
       BIND_HDFS_FUNC(hdfsHSync);
       BIND_HDFS_FUNC(hdfsOpenFile);
       BIND_HDFS_FUNC(hdfsExists);
@@ -166,13 +165,6 @@ Status HadoopFileSystem::Connect(StringPiece fname, hdfsFS* fs) {
   } else {
     hdfs_->hdfsBuilderSetNameNode(builder, nn.c_str());
   }
-  // KERB_TICKET_CACHE_PATH will be deleted in the future, Because KRB5CCNAME is
-  // the build in environment variable of Kerberos, so KERB_TICKET_CACHE_PATH
-  // and related code are unnecessary.
-  char* ticket_cache_path = getenv("KERB_TICKET_CACHE_PATH");
-  if (ticket_cache_path != nullptr) {
-    hdfs_->hdfsBuilderSetKerbTicketCachePath(builder, ticket_cache_path);
-  }
   *fs = hdfs_->hdfsBuilderConnect(builder);
   if (*fs == nullptr) {
     return errors::NotFound(strerror(errno));
@@ -203,6 +195,11 @@ class HDFSRandomAccessFile : public RandomAccessFile {
     }
   }
 
+  Status Name(StringPiece* result) const override {
+    *result = filename_;
+    return Status::OK();
+  }
+
   Status Read(uint64 offset, size_t n, StringPiece* result,
               char* scratch) const override {
     Status s;
@@ -212,8 +209,12 @@ class HDFSRandomAccessFile : public RandomAccessFile {
       // We lock inside the loop rather than outside so we don't block other
       // concurrent readers.
       mutex_lock lock(mu_);
+      // Max read length is INT_MAX-2, for hdfsPread function take a parameter
+      // of int32. -2 offset can avoid JVM OutOfMemoryError.
+      size_t read_n =
+          std::min(n, static_cast<size_t>(std::numeric_limits<int>::max() - 2));
       tSize r = hdfs_->hdfsPread(fs_, file_, static_cast<tOffset>(offset), dst,
-                                 static_cast<tSize>(n));
+                                 static_cast<tSize>(read_n));
       if (r > 0) {
         dst += r;
         n -= r;
@@ -308,6 +309,11 @@ class HDFSWritableFile : public WritableFile {
     return Status::OK();
   }
 
+  Status Name(StringPiece* result) const override {
+    *result = filename_;
+    return Status::OK();
+  }
+
   Status Sync() override {
     if (hdfs_->hdfsHSync(fs_, file_) != 0) {
       return IOError(filename_, errno);
@@ -315,6 +321,14 @@ class HDFSWritableFile : public WritableFile {
     return Status::OK();
   }
 
+  Status Tell(int64* position) override {
+    *position = hdfs_->hdfsTell(fs_, file_);
+    if (*position == -1) {
+      return IOError(filename_, errno);
+    }
+    return Status::OK();
+  }
+
  private:
   string filename_;
   LibHDFS* hdfs_;
diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system_test.cc b/tensorflow/core/platform/hadoop/hadoop_file_system_test.cc
index b207d3474977361777383299a2a603a9f21481d4..b9e8f28739891868f11aa21ec7c48e93afe2b1c5 100644
--- a/tensorflow/core/platform/hadoop/hadoop_file_system_test.cc
+++ b/tensorflow/core/platform/hadoop/hadoop_file_system_test.cc
@@ -75,9 +75,12 @@ TEST_F(HadoopFileSystemTest, RandomAccessFile) {
   std::unique_ptr<RandomAccessFile> reader;
   TF_EXPECT_OK(hdfs.NewRandomAccessFile(fname, &reader));
 
+  StringPiece result;
+  TF_EXPECT_OK(reader->Name(&result));
+  EXPECT_EQ(result, fname);
+
   string got;
   got.resize(content.size());
-  StringPiece result;
   TF_EXPECT_OK(
       reader->Read(0, content.size(), &result, gtl::string_as_array(&got)));
   EXPECT_EQ(content.size(), result.size());
@@ -94,7 +97,13 @@ TEST_F(HadoopFileSystemTest, WritableFile) {
   std::unique_ptr<WritableFile> writer;
   const string fname = TmpDir("WritableFile");
   TF_EXPECT_OK(hdfs.NewWritableFile(fname, &writer));
+  StringPiece result;
+  TF_EXPECT_OK(writer->Name(&result));
+  EXPECT_EQ(result, fname);
   TF_EXPECT_OK(writer->Append("content1,"));
+  int64 pos;
+  TF_EXPECT_OK(writer->Tell(&pos));
+  EXPECT_EQ(pos, 9);
   TF_EXPECT_OK(writer->Append("content2"));
   TF_EXPECT_OK(writer->Flush());
   TF_EXPECT_OK(writer->Sync());
diff --git a/tensorflow/core/platform/default/logger.cc b/tensorflow/core/platform/logger.cc
similarity index 72%
rename from tensorflow/core/platform/default/logger.cc
rename to tensorflow/core/platform/logger.cc
index 54b1a1a67ca7da65aa6897e6461ebe9b54fb4767..f5a961e4d318529ca00846d4e6647b20a5232568 100644
--- a/tensorflow/core/platform/default/logger.cc
+++ b/tensorflow/core/platform/logger.cc
@@ -18,17 +18,20 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
+namespace {
 
-Logger* Logger::Singleton() {
-  class DefaultLogger : public Logger {
-   private:
-    void DoLogProto(google::protobuf::Any* proto) override {
-      VLOG(2) << proto->ShortDebugString();
-    }
-    void DoFlush() override {}
-  };
-  static Logger* instance = new DefaultLogger();
-  return instance;
-}
+class DefaultLogger : public Logger {
+ private:
+  void DoLogProto(google::protobuf::Any* proto) override {
+    VLOG(2) << proto->ShortDebugString();
+  }
+  void DoFlush() override {}
+};
+
+}  // namespace
+
+Logger::FactoryFunc Logger::singleton_factory_ = []() -> Logger* {
+  return new DefaultLogger();
+};
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/logger.h b/tensorflow/core/platform/logger.h
index 5d304bea63a7c78e4a90d78ea2be4ce01caa802d..f0bfef4f2d9ebce091917cd85cf6e5e903ab52a4 100644
--- a/tensorflow/core/platform/logger.h
+++ b/tensorflow/core/platform/logger.h
@@ -26,7 +26,22 @@ namespace tensorflow {
 // log anything to a non-local place, e.g. a database.
 class Logger {
  public:
-  static Logger* Singleton();
+  // The singleton is supposed to be used in the following steps:
+  // * At program start time, REGISTER_MOUDLE_INITIALIZER calls
+  //   SetSingletonFactory.
+  // * At some point in the program execution, Singleton() is called for the
+  //   first time, initializing the logger.
+  // * Succeeding calls to Singleton() return the initiailized logger.
+  using FactoryFunc = Logger* (*)();
+
+  static void SetSingletonFactory(FactoryFunc factory) {
+    singleton_factory_ = factory;
+  }
+
+  static Logger* Singleton() {
+    static Logger* instance = singleton_factory_();
+    return instance;
+  }
 
   virtual ~Logger() = default;
 
@@ -44,6 +59,8 @@ class Logger {
  private:
   virtual void DoLogProto(google::protobuf::Any* proto) = 0;
   virtual void DoFlush() = 0;
+
+  static FactoryFunc singleton_factory_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/logging.h b/tensorflow/core/platform/logging.h
index 17a5d5fb5b7099ad01c68d64f5528fa07cc2fa6f..7417ec8aff66de1f393d9c381bbf2b657a85307d 100644
--- a/tensorflow/core/platform/logging.h
+++ b/tensorflow/core/platform/logging.h
@@ -19,7 +19,7 @@ limitations under the License.
 #include "tensorflow/core/platform/platform.h"  // To pick up PLATFORM_define
 
 #if defined(PLATFORM_GOOGLE) || defined(PLATFORM_GOOGLE_ANDROID) || \
-    defined(GOOGLE_LOGGING)
+    defined(GOOGLE_LOGGING) || defined(__EMSCRIPTEN__)
 #include "tensorflow/core/platform/google/build_config/logging.h"
 #else
 #include "tensorflow/core/platform/default/logging.h"
diff --git a/tensorflow/core/platform/platform.h b/tensorflow/core/platform/platform.h
index 0481b3687137c8b00fa84d33eb317a1a4f5be9df..671e5dd3c862febe5ef4be912525c7f2043857ed 100644
--- a/tensorflow/core/platform/platform.h
+++ b/tensorflow/core/platform/platform.h
@@ -40,7 +40,7 @@ limitations under the License.
 #elif defined(_WIN32)
 #define PLATFORM_WINDOWS
 
-#elif defined(__arm__)
+#elif defined(__arm__) || defined(__EMSCRIPTEN__)
 #define PLATFORM_POSIX
 
 // Require an outside macro to tell us if we're building for Raspberry Pi or
diff --git a/tensorflow/core/platform/port_test.cc b/tensorflow/core/platform/port_test.cc
index 15c3cb24f046b9111d66839ba03ffaf427ba70eb..0567130e8b923aff05a5acea50a1ba0efffa6410 100644
--- a/tensorflow/core/platform/port_test.cc
+++ b/tensorflow/core/platform/port_test.cc
@@ -33,6 +33,15 @@ TEST(Port, AlignedMalloc) {
   }
 }
 
+TEST(Port, GetCurrentCPU) {
+  const int cpu = GetCurrentCPU();
+#if !defined(__APPLE__)
+  // GetCurrentCPU does not currently work on MacOS.
+  EXPECT_GE(cpu, 0);
+  EXPECT_LT(cpu, NumTotalCPUs());
+#endif
+}
+
 TEST(ConditionVariable, WaitForMilliseconds_Timeout) {
   mutex m;
   mutex_lock l(m);
diff --git a/tensorflow/core/platform/posix/env.cc b/tensorflow/core/platform/posix/env.cc
index 0a939aef25236dc33e2be8ec1d76f9ea0075e350..f2dff5a9b6441c5c39f6251c3b8c46dcd8639c74 100644
--- a/tensorflow/core/platform/posix/env.cc
+++ b/tensorflow/core/platform/posix/env.cc
@@ -86,6 +86,35 @@ class PosixEnv : public Env {
     return new StdThread(thread_options, name, fn);
   }
 
+  int32 GetCurrentThreadId() override {
+#ifdef __APPLE__
+    uint64_t tid64;
+    pthread_threadid_np(nullptr, &tid64);
+    return static_cast<int32>(tid64);
+#elif defined(__FreeBSD__)
+    // Has to be casted to long first, else this error appears:
+    // static_cast from 'pthread_t' (aka 'pthread *') to 'int32' (aka 'int')
+    // is not allowed
+    return static_cast<int32>(static_cast<int64>(pthread_self()));
+#else
+    return static_cast<int32>(pthread_self());
+#endif
+  }
+
+  bool GetCurrentThreadName(string* name) override {
+#if defined(__ANDROID__) || defined(__EMSCRIPTEN__)
+    return false;
+#else
+    char buf[100];
+    int res = pthread_getname_np(pthread_self(), buf, static_cast<size_t>(100));
+    if (res != 0) {
+      return false;
+    }
+    *name = buf;
+    return true;
+#endif
+  }
+
   void SchedClosure(std::function<void()> closure) override {
     // TODO(b/27290852): Spawning a new thread here is wasteful, but
     // needed to deal with the fact that many `closure` functions are
@@ -121,13 +150,25 @@ class PosixEnv : public Env {
 
   string GetRunfilesDir() override {
     string bin_path = this->GetExecutablePath();
-    string runfiles_path = bin_path + ".runfiles/org_tensorflow";
+    string runfiles_suffix = ".runfiles/org_tensorflow";
+    std::size_t pos = bin_path.find(runfiles_suffix);
+
+    // Sometimes (when executing under python) bin_path returns the full path to
+    // the python scripts under runfiles. Get the substring.
+    if (pos != std::string::npos) {
+      return bin_path.substr(0, pos + runfiles_suffix.length());
+    }
+
+    // See if we have the executable path. if executable.runfiles exists, return
+    // that folder.
+    string runfiles_path = bin_path + runfiles_suffix;
     Status s = this->IsDirectory(runfiles_path);
     if (s.ok()) {
       return runfiles_path;
-    } else {
-      return bin_path.substr(0, bin_path.find_last_of("/\\"));
     }
+
+    // If nothing can be found, return something close.
+    return bin_path.substr(0, bin_path.find_last_of("/\\"));
   }
 
  private:
diff --git a/tensorflow/core/platform/posix/port.cc b/tensorflow/core/platform/posix/port.cc
index acdd7798ea961f2b5aed59b6eebb3f6dcafa40a5..807e0083229983722182fec8ee2a14e70cf233fb 100644
--- a/tensorflow/core/platform/posix/port.cc
+++ b/tensorflow/core/platform/posix/port.cc
@@ -25,7 +25,14 @@ limitations under the License.
 #if defined(__linux__) && !defined(__ANDROID__)
 #include <sched.h>
 #include <sys/sysinfo.h>
+#else
+#include <sys/syscall.h>
+#endif
+
+#if (__x86_64__ || __i386__)
+#include <cpuid.h>
 #endif
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -69,6 +76,40 @@ int NumSchedulableCPUs() {
   return kDefaultCores;
 }
 
+int NumTotalCPUs() {
+  int count = absl::base_internal::NumCPUs();
+  return (count <= 0) ? kUnknownCPU : count;
+}
+
+int GetCurrentCPU() {
+#if defined(__EMSCRIPTEN__)
+  return sched_getcpu();
+#elif defined(__linux__) && !defined(__ANDROID__)
+  return sched_getcpu();
+  // Attempt to use cpuid on all other platforms.  If that fails, perform a
+  // syscall.
+#elif defined(__cpuid) && !defined(__APPLE__)
+  // TODO(b/120919972): __cpuid returns invalid APIC ids on OS X.
+  uint32_t eax = 0;
+  uint32_t ebx = 0;
+  uint32_t ecx = 0;
+  uint32_t edx = 0;
+  __cpuid(/*level=*/1, eax, ebx, ecx, edx);
+  if ((edx & /*bit_APIC=*/(1 << 9)) != 0) {
+    // EBX bits 24-31 are APIC ID
+    return (ebx & 0xFF) >> 24;
+  }
+#elif defined(__NR_getcpu)
+  unsigned int cpu;
+  if (syscall(__NR_getcpu, &cpu, NULL, NULL) < 0) {
+    return kUnknownCPU;
+  } else {
+    return static_cast<int>(cpu);
+  }
+#endif
+  return kUnknownCPU;
+}
+
 int NumHyperthreadsPerCore() {
   static const int ht_per_core = tensorflow::port::CPUIDNumSMT();
   return (ht_per_core > 0) ? ht_per_core : 1;
@@ -83,9 +124,7 @@ int NUMANumNodes() { return 1; }
 
 void NUMASetThreadNodeAffinity(int node) {}
 
-int NUMAGetThreadNodeAffinity() {
-  return kNUMANoAffinity;
-}
+int NUMAGetThreadNodeAffinity() { return kNUMANoAffinity; }
 
 void* AlignedMalloc(size_t size, int minimum_alignment) {
 #if defined(__ANDROID__)
diff --git a/tensorflow/core/platform/posix/posix_file_system.cc b/tensorflow/core/platform/posix/posix_file_system.cc
index fc48cab56460d85d9997f57cb761481c77413d00..003ab170fe8db2980bb9c7ad79bf90b523e36b76 100644
--- a/tensorflow/core/platform/posix/posix_file_system.cc
+++ b/tensorflow/core/platform/posix/posix_file_system.cc
@@ -52,6 +52,11 @@ class PosixRandomAccessFile : public RandomAccessFile {
       : filename_(fname), fd_(fd) {}
   ~PosixRandomAccessFile() override { close(fd_); }
 
+  Status Name(StringPiece* result) const override {
+    *result = filename_;
+    return Status::OK();
+  }
+
   Status Read(uint64 offset, size_t n, StringPiece* result,
               char* scratch) const override {
     Status s;
@@ -115,6 +120,11 @@ class PosixWritableFile : public WritableFile {
     return Status::OK();
   }
 
+  Status Name(StringPiece* result) const override {
+    *result = filename_;
+    return Status::OK();
+  }
+
   Status Sync() override {
     Status s;
     if (fflush(file_) != 0) {
@@ -122,6 +132,17 @@ class PosixWritableFile : public WritableFile {
     }
     return s;
   }
+
+  Status Tell(int64* position) override {
+    Status s;
+    *position = ftell(file_);
+
+    if (*position == -1) {
+      s = IOError(filename_, errno);
+    }
+
+    return s;
+  }
 };
 
 class PosixReadOnlyMemoryRegion : public ReadOnlyMemoryRegion {
diff --git a/tensorflow/core/platform/default/protobuf.cc b/tensorflow/core/platform/protobuf.cc
similarity index 72%
rename from tensorflow/core/platform/default/protobuf.cc
rename to tensorflow/core/platform/protobuf.cc
index 548d5834e6f74b14a3ad16c00f5d3015f337f90a..c9e6f3bf5c6b498818001c9d6644d52af8b7f5d2 100644
--- a/tensorflow/core/platform/default/protobuf.cc
+++ b/tensorflow/core/platform/protobuf.cc
@@ -1,4 +1,4 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/platform/default/protobuf.h"
+#include "tensorflow/core/platform/protobuf.h"
 
 namespace tensorflow {
 
-const char* kProtobufInt64Typename = "::google::protobuf::int64";
-const char* kProtobufUint64Typename = "::google::protobuf::uint64";
+const char* kProtobufInt64Typename = "::tensorflow::protobuf_int64";
+const char* kProtobufUint64Typename = "::tensorflow::protobuf_uint64";
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/protobuf.h b/tensorflow/core/platform/protobuf.h
index fcbf1fc8c5054e110b9a0fe0217b97cecdd27088..59f4129adf40a5b6892a82ac705f73cdcf02886a 100644
--- a/tensorflow/core/platform/protobuf.h
+++ b/tensorflow/core/platform/protobuf.h
@@ -25,13 +25,31 @@ limitations under the License.
 // TensorFlow code should use the ::tensorflow::protobuf namespace to
 // refer to all protobuf APIs.
 
-#if defined(PLATFORM_GOOGLE) && !defined(USE_DEFAULT_PROTOBUF)
-#include "tensorflow/core/platform/google/protobuf.h"
-#else
-#include "tensorflow/core/platform/default/protobuf.h"
+#ifndef TENSORFLOW_LITE_PROTOS
+#include "google/protobuf/io/tokenizer.h"
+#include "google/protobuf/descriptor.pb.h"
+#include "google/protobuf/descriptor.h"
+#include "google/protobuf/dynamic_message.h"
+#include "google/protobuf/text_format.h"
+#include "google/protobuf/util/json_util.h"
+#include "google/protobuf/util/type_resolver_util.h"
 #endif
 
+#include "google/protobuf/io/coded_stream.h"
+#include "google/protobuf/io/zero_copy_stream.h"
+#include "google/protobuf/io/zero_copy_stream_impl_lite.h"
+#include "google/protobuf/arena.h"
+#include "google/protobuf/map.h"
+#include "google/protobuf/repeated_field.h"
+
 namespace tensorflow {
+
+namespace protobuf = ::google::protobuf;
+using protobuf_int64 = ::google::protobuf::int64;
+using protobuf_uint64 = ::google::protobuf::uint64;
+extern const char* kProtobufInt64Typename;
+extern const char* kProtobufUint64Typename;
+
 // Parses a protocol buffer contained in a string in the binary wire format.
 // Returns true on success. Note: Unlike protobuf's builtin ParseFromString,
 // this function has no size restrictions on the total size of the encoded
@@ -47,8 +65,19 @@ inline const string& ProtobufStringToString(const string& s) { return s; }
 // Set <dest> to <src>. Swapping is allowed, as <src> does not need to be
 // preserved.
 inline void SetProtobufStringSwapAllowed(string* src, string* dest) {
-  dest->swap(*src);
+  *dest = std::move(*src);
+}
+
+#if defined(TENSORFLOW_PROTOBUF_USES_CORD)
+// These versions of ProtobufStringToString and SetProtobufString get used by
+// tools/proto_text's generated code.  They have the same name as the versions
+// in core/platform/protobuf.h, so the generation code doesn't need to determine
+// if the type is Cord or string at generation time.
+inline string ProtobufStringToString(const Cord& s) { return s.ToString(); }
+inline void SetProtobufStringSwapAllowed(string* src, Cord* dest) {
+  dest->CopyFrom(*src);
 }
+#endif  // defined(TENSORFLOW_PROTOBUF_USES_CORD)
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/platform/protobuf_compiler.h b/tensorflow/core/platform/protobuf_compiler.h
index 29679e00892fbd11d1e5242f62650f42ecef5577..916637d13a55044873b5309c1ea0acc9ac4eef47 100644
--- a/tensorflow/core/platform/protobuf_compiler.h
+++ b/tensorflow/core/platform/protobuf_compiler.h
@@ -16,10 +16,6 @@ limitations under the License.
 #ifndef TENSORFLOW_PLATFORM_PROTOBUF_COMPILER_H_
 #define TENSORFLOW_PLATFORM_PROTOBUF_COMPILER_H_
 
-#if defined(PLATFORM_GOOGLE) && !defined(USE_DEFAULT_PROTOBUF)
-#include "tensorflow/core/platform/google/protobuf_compiler.h"
-#else
-#include "tensorflow/core/platform/default/protobuf_compiler.h"
-#endif
+#include "google/protobuf/compiler/importer.h"
 
 #endif  // TENSORFLOW_PLATFORM_PROTOBUF_COMPILER_H_
diff --git a/tensorflow/core/platform/s3/BUILD b/tensorflow/core/platform/s3/BUILD
index 41184b6fd9ed12c0164f06e2c92816b2c99a03f7..7bc4d80db5b0ab31540f5c95d91ad29239458bce 100644
--- a/tensorflow/core/platform/s3/BUILD
+++ b/tensorflow/core/platform/s3/BUILD
@@ -14,7 +14,7 @@ load(
 )
 
 tf_cc_binary(
-    name = "s3_file_system.so",
+    name = "libs3_file_system_shared.so",
     srcs = [
         "aws_crypto.cc",
         "aws_crypto.h",
diff --git a/tensorflow/core/platform/s3/aws_logging.cc b/tensorflow/core/platform/s3/aws_logging.cc
index 44317f1a3e41831b903bd0044d53d1eba80168df..dac569088936b984f2c3167962ff4872e03decc3 100644
--- a/tensorflow/core/platform/s3/aws_logging.cc
+++ b/tensorflow/core/platform/s3/aws_logging.cc
@@ -69,12 +69,32 @@ void AWSLogSystem::LogMessage(Aws::Utils::Logging::LogLevel log_level,
 }
 
 namespace {
+
+// Taken from tensorflow/core/platform/default/logging.cc
+int ParseInteger(const char* str, size_t size) {
+  string integer_str(str, size);
+  std::istringstream ss(integer_str);
+  int level = 0;
+  ss >> level;
+  return level;
+}
+
+// Taken from tensorflow/core/platform/default/logging.cc
+int64 LogLevelStrToInt(const char* tf_env_var_val) {
+  if (tf_env_var_val == nullptr) {
+    return 0;
+  }
+  return ParseInteger(tf_env_var_val, strlen(tf_env_var_val));
+}
+
 static const char* kAWSLoggingTag = "AWSLogging";
 
 Aws::Utils::Logging::LogLevel ParseLogLevelFromEnv() {
   Aws::Utils::Logging::LogLevel log_level = Aws::Utils::Logging::LogLevel::Info;
 
-  const int64_t level = tensorflow::internal::MinLogLevelFromEnv();
+  const int64_t level = getenv("AWS_LOG_LEVEL")
+                            ? LogLevelStrToInt(getenv("AWS_LOG_LEVEL"))
+                            : tensorflow::internal::MinLogLevelFromEnv();
 
   switch (level) {
     case INFO:
diff --git a/tensorflow/core/platform/s3/s3_file_system.cc b/tensorflow/core/platform/s3/s3_file_system.cc
index e0b8e377453393429a3e5853b1aa2ce871334bff..0ff65fb6b38a9e64cc9c0778c483922c0e7d6bfc 100644
--- a/tensorflow/core/platform/s3/s3_file_system.cc
+++ b/tensorflow/core/platform/s3/s3_file_system.cc
@@ -170,6 +170,10 @@ class S3RandomAccessFile : public RandomAccessFile {
                      std::shared_ptr<Aws::S3::S3Client> s3_client)
       : bucket_(bucket), object_(object), s3_client_(s3_client) {}
 
+  Status Name(StringPiece* result) const override {
+    return errors::Unimplemented("S3RandomAccessFile does not support Name()");
+  }
+
   Status Read(uint64 offset, size_t n, StringPiece* result,
               char* scratch) const override {
     Aws::S3::Model::GetObjectRequest getObjectRequest;
@@ -235,6 +239,10 @@ class S3WritableFile : public WritableFile {
 
   Status Flush() override { return Sync(); }
 
+  Status Name(StringPiece* result) const override {
+    return errors::Unimplemented("S3WritableFile does not support Name()");
+  }
+
   Status Sync() override {
     if (!outfile_) {
       return errors::FailedPrecondition(
diff --git a/tensorflow/core/platform/setround.cc b/tensorflow/core/platform/setround.cc
index 592626bfa17e691d1b10ddce5c7f0f31ed825861..5573b2fc93f8b28777e78ad50d423ecb57409821 100644
--- a/tensorflow/core/platform/setround.cc
+++ b/tensorflow/core/platform/setround.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/core/platform/setround.h"
 
+#include <cfenv>  // NOLINT
+
 namespace tensorflow {
 namespace port {
 
diff --git a/tensorflow/core/platform/stream_executor.h b/tensorflow/core/platform/stream_executor.h
index 0a590b3d40c0dbf007feee07fc93be4838924679..437e8a1c95632af71c3f2db2c4b35cfb48849b8a 100644
--- a/tensorflow/core/platform/stream_executor.h
+++ b/tensorflow/core/platform/stream_executor.h
@@ -18,11 +18,6 @@ limitations under the License.
 
 #include "tensorflow/core/platform/platform.h"
 
-#if defined(PLATFORM_GOOGLE)
-#include "tensorflow/stream_executor/platform/google/dso_loader.h"
-#else
-#include "tensorflow/stream_executor/dso_loader.h"
-#endif
 #include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
 #include "tensorflow/stream_executor/device_memory.h"
 #include "tensorflow/stream_executor/dnn.h"
@@ -31,6 +26,8 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/multi_platform_manager.h"
 #include "tensorflow/stream_executor/platform.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
+#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
 #include "tensorflow/stream_executor/scratch_allocator.h"
 #include "tensorflow/stream_executor/stream.h"
 #include "tensorflow/stream_executor/stream_executor.h"
diff --git a/tensorflow/core/platform/stream_executor_no_cuda.h b/tensorflow/core/platform/stream_executor_no_cuda.h
index 50a5e732c0ec222d3ee2329a57fc6ea9ac4b233c..129ee6c7a7503b680e90ccc68e39a3c838bb0e65 100644
--- a/tensorflow/core/platform/stream_executor_no_cuda.h
+++ b/tensorflow/core/platform/stream_executor_no_cuda.h
@@ -18,11 +18,6 @@ limitations under the License.
 
 #include "tensorflow/core/platform/platform.h"
 
-#if defined(PLATFORM_GOOGLE)
-#include "tensorflow/stream_executor/platform/google/dso_loader.h"
-#else
-#include "tensorflow/stream_executor/dso_loader.h"
-#endif
 #include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
 #include "tensorflow/stream_executor/device_memory.h"
 #include "tensorflow/stream_executor/dnn.h"
@@ -31,6 +26,8 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/multi_platform_manager.h"
 #include "tensorflow/stream_executor/platform.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
+#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
 #include "tensorflow/stream_executor/scratch_allocator.h"
 #include "tensorflow/stream_executor/stream.h"
 #include "tensorflow/stream_executor/stream_executor.h"
diff --git a/tensorflow/core/platform/tensor_coding.cc b/tensorflow/core/platform/tensor_coding.cc
index 84601de39a6547ee78d190764616058b4595dc33..3280802bac42725132ef9ad22cc0439d45fca5ac 100644
--- a/tensorflow/core/platform/tensor_coding.cc
+++ b/tensorflow/core/platform/tensor_coding.cc
@@ -19,6 +19,12 @@ limitations under the License.
 
 #include "tensorflow/core/lib/core/coding.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+#if defined(TENSORFLOW_PROTOBUF_USES_CORD)
+#include "strings/cord_varint.h"
+#endif  // defined(TENSORFLOW_PROTOBUF_USES_CORD)
 
 namespace tensorflow {
 namespace port {
@@ -66,5 +72,174 @@ void CopyFromArray(string* s, const char* base, size_t bytes) {
   s->assign(base, bytes);
 }
 
+class StringListEncoderImpl : public StringListEncoder {
+ public:
+  explicit StringListEncoderImpl(string* out) : out_(out) {}
+  ~StringListEncoderImpl() override = default;
+
+  void Append(const protobuf::MessageLite& m) override {
+    core::PutVarint32(out_, m.ByteSizeLong());
+    tensorflow::string serialized_message;
+    m.AppendToString(&serialized_message);
+    strings::StrAppend(&rest_, serialized_message);
+  }
+
+  void Append(const string& s) override {
+    core::PutVarint32(out_, s.length());
+    strings::StrAppend(&rest_, s);
+  }
+
+  void Finalize() override { strings::StrAppend(out_, rest_); }
+
+ private:
+  string* out_;
+  string rest_;
+};
+
+class StringListDecoderImpl : public StringListDecoder {
+ public:
+  explicit StringListDecoderImpl(const string& in) : reader_(in) {}
+  ~StringListDecoderImpl() override = default;
+
+  bool ReadSizes(std::vector<uint32>* sizes) override {
+    int64 total = 0;
+    for (auto& size : *sizes) {
+      if (!core::GetVarint32(&reader_, &size)) return false;
+      total += size;
+    }
+    if (total != static_cast<int64>(reader_.size())) {
+      return false;
+    }
+    return true;
+  }
+
+  const char* Data(uint32 size) override {
+    const char* data = reader_.data();
+    reader_.remove_prefix(size);
+    return data;
+  }
+
+ private:
+  StringPiece reader_;
+};
+
+std::unique_ptr<StringListEncoder> NewStringListEncoder(string* out) {
+  return std::unique_ptr<StringListEncoder>(new StringListEncoderImpl(out));
+}
+
+std::unique_ptr<StringListDecoder> NewStringListDecoder(const string& in) {
+  return std::unique_ptr<StringListDecoder>(new StringListDecoderImpl(in));
+}
+
+#if defined(TENSORFLOW_PROTOBUF_USES_CORD)
+void AssignRefCounted(StringPiece src, core::RefCounted* obj, Cord* out) {
+  obj->Ref();
+  out->Clear();
+  // Defines a lambda to unref "obj" when Cord deletes this piece of
+  // memory. +[] converts the lambda to a C style function pointer.
+  auto cleanup = +[](absl::string_view donotcare, void* obj) {
+    reinterpret_cast<core::RefCounted*>(obj)->Unref();
+  };
+  out->AppendExternalMemory(absl::string_view(src.data(), src.size()), obj,
+                            cleanup);
+}
+
+void EncodeStringList(const string* strings, int64 n, Cord* out) {
+  out->Clear();
+  for (int i = 0; i < n; ++i) {
+    ::strings::CordAppendVarint(strings[i].size(), out);
+  }
+  for (int i = 0; i < n; ++i) {
+    out->Append(strings[i]);
+  }
+}
+
+bool DecodeStringList(const Cord& src, string* strings, int64 n) {
+  std::vector<uint32> sizes(n);
+  CordReader reader(src);
+  int64 tot = 0;
+  for (auto& v : sizes) {
+    if (!::strings::CordReaderReadVarint(&reader, &v)) return false;
+    tot += v;
+  }
+  if (tot != reader.Available()) {
+    return false;
+  }
+  string* data = strings;
+  for (int i = 0; i < n; ++i, ++data) {
+    auto size = sizes[i];
+    if (size > reader.Available()) {
+      return false;
+    }
+    gtl::STLStringResizeUninitialized(data, size);
+    reader.ReadN(size, gtl::string_as_array(data));
+  }
+  return true;
+}
+
+void CopyFromArray(Cord* c, const char* base, size_t bytes) {
+  c->CopyFrom(base, bytes);
+}
+
+class CordStringListEncoderImpl : public StringListEncoder {
+ public:
+  explicit CordStringListEncoderImpl(Cord* out) : out_(out) {}
+  ~CordStringListEncoderImpl() override = default;
+
+  void Append(const protobuf::MessageLite& m) override {
+    ::strings::CordAppendVarint(m.ByteSizeLong(), out_);
+    m.AppendToString(&rest_);
+  }
+
+  void Append(const string& s) override {
+    ::strings::CordAppendVarint(s.length(), out_);
+    rest_.append(s.data(), s.size());
+  }
+
+  void Finalize() override { out_->Append(rest_); }
+
+ private:
+  Cord* out_;
+  string rest_;
+};
+
+class CordStringListDecoderImpl : public StringListDecoder {
+ public:
+  explicit CordStringListDecoderImpl(const Cord& in) : reader_(in) {}
+  ~CordStringListDecoderImpl() override = default;
+
+  bool ReadSizes(std::vector<uint32>* sizes) override {
+    int64 total = 0;
+    for (auto& size : *sizes) {
+      if (!::strings::CordReaderReadVarint(&reader_, &size)) return false;
+      total += size;
+    }
+    if (total != static_cast<int64>(reader_.Available())) {
+      return false;
+    }
+    return true;
+  }
+
+  const char* Data(uint32 size) override {
+    tmp_.resize(size);
+    reader_.ReadN(size, tmp_.data());
+    return tmp_.data();
+  }
+
+ private:
+  CordReader reader_;
+  std::vector<char> tmp_;
+};
+
+std::unique_ptr<StringListEncoder> NewStringListEncoder(Cord* out) {
+  return std::unique_ptr<StringListEncoder>(new CordStringListEncoderImpl(out));
+}
+
+std::unique_ptr<StringListDecoder> NewStringListDecoder(const Cord& in) {
+  return std::unique_ptr<StringListDecoder>(new CordStringListDecoderImpl(in));
+}
+
+#endif  // defined(TENSORFLOW_PROTOBUF_USES_CORD)
+
 }  // namespace port
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/tensor_coding.h b/tensorflow/core/platform/tensor_coding.h
index 6c6d75830de743b3e24676c1f57b6988aad11a0f..993ce537ffcd3884cfbb32d1edbdbfbe89f72658 100644
--- a/tensorflow/core/platform/tensor_coding.h
+++ b/tensorflow/core/platform/tensor_coding.h
@@ -21,14 +21,9 @@ limitations under the License.
 #include "tensorflow/core/lib/core/refcount.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/platform/platform.h"
+#include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/types.h"
 
-#ifdef PLATFORM_GOOGLE
-#include "tensorflow/core/platform/google/cord_coding.h"
-#else
-#include "tensorflow/core/platform/default/string_coding.h"
-#endif
-
 namespace tensorflow {
 namespace port {
 
@@ -42,6 +37,15 @@ inline void CopyToArray(const string& src, char* dst) {
   memcpy(dst, src.data(), src.size());
 }
 
+// Copy subrange [pos:(pos + n)) from src to dst. If pos >= src.size() the
+// result is empty. If pos + n > src.size() the subrange [pos, size()) is
+// copied.
+inline void CopySubrangeToArray(const string& src, size_t pos, size_t n,
+                                char* dst) {
+  if (pos >= src.size()) return;
+  memcpy(dst, src.data() + pos, std::min(n, src.size() - pos));
+}
+
 // Store encoding of strings[0..n-1] in *out.
 void EncodeStringList(const string* strings, int64 n, string* out);
 
@@ -52,6 +56,75 @@ bool DecodeStringList(const string& src, string* strings, int64 n);
 // Assigns base[0..bytes-1] to *s
 void CopyFromArray(string* s, const char* base, size_t bytes);
 
+// Encodes sequences of strings and serialized protocol buffers into a string.
+// Normal usage consists of zero or more calls to Append() and a single call to
+// Finalize().
+class StringListEncoder {
+ public:
+  virtual ~StringListEncoder() = default;
+
+  // Encodes the given protocol buffer. This may not be called after Finalize().
+  virtual void Append(const protobuf::MessageLite& m) = 0;
+
+  // Encodes the given string. This may not be called after Finalize().
+  virtual void Append(const string& s) = 0;
+
+  // Signals end of the encoding process. No other calls are allowed after this.
+  virtual void Finalize() = 0;
+};
+
+// Decodes a string into sequences of strings (which may represent serialized
+// protocol buffers). Normal usage involves a single call to ReadSizes() in
+// order to retrieve the length of all the strings in the sequence. For each
+// size returned a call to Data() is expected and will return the actual
+// string.
+class StringListDecoder {
+ public:
+  virtual ~StringListDecoder() = default;
+
+  // Populates the given vector with the lengths of each string in the sequence
+  // being decoded. Upon returning the vector is guaranteed to contain as many
+  // elements as there are strings in the sequence.
+  virtual bool ReadSizes(std::vector<uint32>* sizes) = 0;
+
+  // Returns a pointer to the next string in the sequence, then prepares for the
+  // next call by advancing 'size' characters in the sequence.
+  virtual const char* Data(uint32 size) = 0;
+};
+
+std::unique_ptr<StringListEncoder> NewStringListEncoder(string* out);
+std::unique_ptr<StringListDecoder> NewStringListDecoder(const string& in);
+
+#if defined(TENSORFLOW_PROTOBUF_USES_CORD)
+// Store src contents in *out.  If backing memory for src is shared with *out,
+// will ref obj during the call and will arrange to unref obj when no
+// longer needed.
+void AssignRefCounted(StringPiece src, core::RefCounted* obj, Cord* out);
+
+// TODO(kmensah): Macro guard this with a check for Cord support.
+inline void CopyToArray(const Cord& src, char* dst) { src.CopyToArray(dst); }
+
+// Copy n bytes of src to dst. If pos >= src.size() the result is empty.
+// If pos + n > src.size() the subrange [pos, size()) is copied.
+inline void CopySubrangeToArray(const Cord& src, int64 pos, int64 n,
+                                char* dst) {
+  src.Subcord(pos, n).CopyToArray(dst);
+}
+
+// Store encoding of strings[0..n-1] in *out.
+void EncodeStringList(const string* strings, int64 n, Cord* out);
+
+// Decode n strings from src and store in strings[0..n-1].
+// Returns true if successful, false on parse error.
+bool DecodeStringList(const Cord& src, string* strings, int64 n);
+
+// Assigns base[0..bytes-1] to *c
+void CopyFromArray(Cord* c, const char* base, size_t bytes);
+
+std::unique_ptr<StringListEncoder> NewStringListEncoder(Cord* out);
+std::unique_ptr<StringListDecoder> NewStringListDecoder(const Cord& in);
+#endif  // defined(TENSORFLOW_PROTOBUF_USES_CORD)
+
 }  // namespace port
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/platform/windows/env.cc b/tensorflow/core/platform/windows/env.cc
index 77ce2026d9d2cdda7ef1ea0ad6bb71050a6467af..e0e3dda7055b5cbe8f0e08be4a251232b8005fd2 100644
--- a/tensorflow/core/platform/windows/env.cc
+++ b/tensorflow/core/platform/windows/env.cc
@@ -84,6 +84,12 @@ class WindowsEnv : public Env {
     return new StdThread(thread_options, name, fn);
   }
 
+  int32 GetCurrentThreadId() override {
+    return static_cast<int32>(::GetCurrentThreadId());
+  }
+
+  bool GetCurrentThreadName(string* name) override { return false; }
+
   static VOID CALLBACK SchedClosureCallback(PTP_CALLBACK_INSTANCE Instance,
                                             PVOID Context, PTP_WORK Work) {
     CloseThreadpoolWork(Work);
diff --git a/tensorflow/core/platform/windows/error.h b/tensorflow/core/platform/windows/error.h
index ba643a0fa8f92f58fbd88ac00fba3f663bb7e0f2..22875ac2bc4a059a26ef2a9ba44e1e51154bee6c 100644
--- a/tensorflow/core/platform/windows/error.h
+++ b/tensorflow/core/platform/windows/error.h
@@ -19,6 +19,9 @@ limitations under the License.
 #include <string>
 
 #include <Windows.h>
+// Windows.h #defines ERROR, but it is also used in
+// tensorflow/core/util/event.proto
+#undef ERROR
 
 namespace tensorflow {
 namespace internal {
diff --git a/tensorflow/core/platform/windows/port.cc b/tensorflow/core/platform/windows/port.cc
index 911ea1902f800c795c60505b2d91a6a6b31b7b01..b902c85cdcfd567d0b77322bfe30d7ba26e25e5a 100644
--- a/tensorflow/core/platform/windows/port.cc
+++ b/tensorflow/core/platform/windows/port.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #endif
 
 #include <Windows.h>
+#include <processthreadsapi.h>
 #include <shlwapi.h>
 
 #include "tensorflow/core/platform/cpu_info.h"
@@ -54,6 +55,30 @@ int NumSchedulableCPUs() {
   return system_info.dwNumberOfProcessors;
 }
 
+int NumTotalCPUs() {
+  // TODO(ebrevdo): Make this more accurate.
+  //
+  // This only returns the number of processors in the current
+  // processor group; which may be undercounting if you have more than 64 cores.
+  // For that case, one needs to call
+  // GetLogicalProcessorInformationEx(RelationProcessorCore, ...) and accumulate
+  // the Size fields by iterating over the written-to buffer.  Since I can't
+  // easily test this on Windows, I'm deferring this to someone who can!
+  //
+  // If you fix this, also consider updatig GetCurrentCPU below.
+  return NumSchedulableCPUs();
+}
+
+int GetCurrentCPU() {
+  // NOTE(ebrevdo): This returns the processor number within the processor
+  // group on systems with >64 processors.  Therefore it doesn't necessarily map
+  // naturally to an index in NumSchedulableCPUs().
+  //
+  // On the plus side, this number is probably guaranteed to be within
+  // [0, NumTotalCPUs()) due to its incomplete implementation.
+  return GetCurrentProcessorNumber();
+}
+
 bool NUMAEnabled() {
   // Not yet implemented: coming soon.
   return false;
diff --git a/tensorflow/core/platform/windows/wide_char.h b/tensorflow/core/platform/windows/wide_char.h
index 1b86abc3fa120feb331ad46a5221444c7d08effb..5aca95454f335119907b71d73afce94d8f99aeff 100644
--- a/tensorflow/core/platform/windows/wide_char.h
+++ b/tensorflow/core/platform/windows/wide_char.h
@@ -17,6 +17,9 @@ limitations under the License.
 #define TENSORFLOW_CORE_PLATFORM_WINDOWS_WIDE_CHAR_H_
 
 #include <Windows.h>
+// Windows.h #defines ERROR, but it is also used in
+// tensorflow/core/util/event.proto
+#undef ERROR
 #include <string>
 #include "tensorflow/core/platform/types.h"
 
diff --git a/tensorflow/core/platform/windows/windows_file_system.cc b/tensorflow/core/platform/windows/windows_file_system.cc
index 993b9906b1c072cb48c816855fb2fc1498ae3f40..8580c3a3efb6807c3d96650f6809a8b9b54b0e89 100644
--- a/tensorflow/core/platform/windows/windows_file_system.cc
+++ b/tensorflow/core/platform/windows/windows_file_system.cc
@@ -112,6 +112,11 @@ class WindowsRandomAccessFile : public RandomAccessFile {
     }
   }
 
+  Status Name(StringPiece* result) const override {
+    *result = filename_;
+    return Status::OK();
+  }
+
   Status Read(uint64 offset, size_t n, StringPiece* result,
               char* scratch) const override {
     Status s;
@@ -189,6 +194,11 @@ class WindowsWritableFile : public WritableFile {
     return Status::OK();
   }
 
+  Status Name(StringPiece* result) const override {
+    *result = filename_;
+    return Status::OK();
+  }
+
   Status Sync() override { return Flush(); }
 };
 
diff --git a/tensorflow/core/profiler/BUILD b/tensorflow/core/profiler/BUILD
index 2bf371276ef6013ac9f8e3c44623f9c7720cffb3..4efc15b7e5ff65085137d348e57f7311dd01db14 100644
--- a/tensorflow/core/profiler/BUILD
+++ b/tensorflow/core/profiler/BUILD
@@ -5,6 +5,7 @@ licenses(["notice"])  # Apache 2.0
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_additional_all_protos")
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_profiler_all_protos")
 
 tf_cc_binary(
     name = "profiler",
@@ -36,9 +37,35 @@ cc_library(
     ],
 )
 
+tf_proto_library(
+    name = "profiler_service_proto",
+    srcs = ["profiler_service.proto"],
+    has_services = 1,
+    cc_api_version = 2,
+    cc_grpc_version = 1,
+    protodeps = tf_profiler_all_protos() + tf_additional_all_protos(),
+    visibility = ["//visibility:public"],
+)
+
+tf_proto_library(
+    name = "profiler_analysis_proto",
+    srcs = ["profiler_analysis.proto"],
+    has_services = 1,
+    cc_api_version = 2,
+    cc_grpc_version = 1,
+    protodeps = [":profiler_service_proto"] + tf_additional_all_protos(),
+    visibility = ["//visibility:public"],
+)
+
 tf_proto_library(
     name = "protos_all",
-    srcs = glob(["**/*.proto"]),
+    srcs = glob(
+        ["**/*.proto"],
+        exclude = [
+            "profiler_service.proto",
+            "profiler_analysis.proto",
+        ],
+    ),
     cc_api_version = 2,
     protodeps = tf_additional_all_protos(),
     visibility = ["//visibility:public"],
diff --git a/tensorflow/core/profiler/README.md b/tensorflow/core/profiler/README.md
index 57d76eb4cb9382790c80a0d55ee94b64e7b9dcdc..341421738e618e7406de05a126a49f4e1e336b93 100644
--- a/tensorflow/core/profiler/README.md
+++ b/tensorflow/core/profiler/README.md
@@ -35,10 +35,10 @@ bazel-bin/tensorflow/core/profiler/profiler \
     --profile_path=/tmp/train_dir/profile_xx
 tfprof> op -select micros,bytes,occurrence -order_by micros
 
-# To be open sourced...
-bazel-bin/tensorflow/python/profiler/profiler_ui \
-    --profile_path=/tmp/profiles/profile_1
+# Profiler ui available at: https://github.com/tensorflow/profiler-ui
+python ui.py --profile_context_path=/tmp/train_dir/profile_xx
 ```
+
 ![ProfilerUI](g3doc/profiler_ui.jpg)
 
 ```python
diff --git a/tensorflow/core/profiler/g3doc/options.md b/tensorflow/core/profiler/g3doc/options.md
index 7f2cd3f698c860f16cd7b027b5ff7c8e24338cf0..38a8e0285118aa664f22ba866edd59bf8ffdbcde 100644
--- a/tensorflow/core/profiler/g3doc/options.md
+++ b/tensorflow/core/profiler/g3doc/options.md
@@ -54,10 +54,10 @@ cpu_micros: This is the cpu times.
 
 ### Memory
 
-Tensor memory are usually ref-counted. The memory is released when there is
-no more reference to it. It will be difficult to track the release of memory.
+Tensor memory are usually ref-counted. The memory is released when there is no
+more reference to it. It will be difficult to track the release of memory.
 Currently, profiler only tracks the allocation of memory. As a result, the
-accumulated memory request is uaually larger than the peak memory of the overall
+accumulated memory request is usually larger than the peak memory of the overall
 model.
 
 It's recommended to generate timeline to see the allocator memory usage over
diff --git a/tensorflow/core/profiler/g3doc/profile_memory.md b/tensorflow/core/profiler/g3doc/profile_memory.md
index 6eda5abdd973ece435855b0952a5edd4a86b8217..03229e497f3bc150c6258c27d87c3be621ef7065 100644
--- a/tensorflow/core/profiler/g3doc/profile_memory.md
+++ b/tensorflow/core/profiler/g3doc/profile_memory.md
@@ -14,10 +14,7 @@ Open a Chrome browser, enter URL chrome://tracing and load the timeline file.
 ******************************************************
 ```
 
-<left>
 ![Timeline](graph_timeline.png)
-</left>
-
 
 ```python
 # You can also visualize the memory information through other methods.
@@ -77,4 +74,4 @@ _TFProfRoot (--/74148.60MB)
                   seq2seq_attention_model.py:320:_add_train_op:tf.summary.scalar... (0B/64B)
                 seq2seq_attention_model.py:360:build_graph:self._add_seq2seq() (0B/25216.74MB)
                   seq2seq_attention_model.py:192:_add_seq2seq:sequence_length=a... (0B/21542.55MB)
-```
\ No newline at end of file
+```
diff --git a/tensorflow/core/profiler/internal/BUILD b/tensorflow/core/profiler/internal/BUILD
index 8dcfde9a2adbd3a1774bce8506a84f80ca099c34..da3039ae3ceba103882d1315c6293af5560e1862 100644
--- a/tensorflow/core/profiler/internal/BUILD
+++ b/tensorflow/core/profiler/internal/BUILD
@@ -6,6 +6,8 @@ licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow:tensorflow.bzl", "if_not_windows")
+load("//tensorflow:tensorflow.bzl", "tf_cuda_library")
+load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
 
 cc_library(
     name = "tfprof_stats",
@@ -365,3 +367,43 @@ cc_library(
         "//tensorflow/core:regexp_internal",
     ],
 )
+
+tf_cuda_library(
+    name = "traceme_recorder",
+    srcs = ["traceme_recorder.cc"],
+    hdrs = ["traceme_recorder.h"],
+    visibility = [
+        "//learning/brain/runtime:__pkg__",  # xprof_bridge
+        "//perftools/accelerators/xprof/xprofilez:__pkg__",  # alias xprof::TraceMeRecorder
+        "//tensorflow/core/profiler/internal/cpu:__pkg__",  # host_tracer
+        "//tensorflow/core/profiler/lib:__pkg__",  # traceme
+    ],
+    deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/stream_executor/lib",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/container:flat_hash_map",
+    ],
+)
+
+tf_cuda_cc_test(
+    name = "traceme_recorder_test",
+    srcs = ["traceme_recorder_test.cc"],
+    deps = [
+        ":traceme_recorder",
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/synchronization",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+tf_cuda_library(
+    name = "profiler_interface",
+    hdrs = [
+        "profiler_interface.h",
+    ],
+    deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
diff --git a/tensorflow/core/profiler/internal/cpu/BUILD b/tensorflow/core/profiler/internal/cpu/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..b94453c0a4be5e3c886277356b23ef0c5df5b1c9
--- /dev/null
+++ b/tensorflow/core/profiler/internal/cpu/BUILD
@@ -0,0 +1,44 @@
+package(
+    default_visibility = ["//tensorflow:internal"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cuda_library",
+)
+load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
+
+tf_cuda_library(
+    name = "host_tracer",
+    srcs = [
+        "host_tracer.cc",
+    ],
+    hdrs = [
+        "host_tracer.h",
+    ],
+    deps = [
+        "//tensorflow/core:core_cpu_lib",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/profiler/internal:profiler_interface",
+        "//tensorflow/core/profiler/internal:traceme_recorder",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+tf_cuda_cc_test(
+    name = "host_tracer_test",
+    srcs = ["host_tracer_test.cc"],
+    deps = [
+        ":host_tracer",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core/profiler/lib:traceme",
+        "@com_google_absl//absl/types:optional",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
diff --git a/tensorflow/core/profiler/internal/cpu/host_tracer.cc b/tensorflow/core/profiler/internal/cpu/host_tracer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3fb296646883cf2215d8df8240219ddce04fb7d0
--- /dev/null
+++ b/tensorflow/core/profiler/internal/cpu/host_tracer.cc
@@ -0,0 +1,120 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/profiler/internal/cpu/host_tracer.h"
+
+#include <utility>
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/strings/str_split.h"
+#include "tensorflow/core/common_runtime/step_stats_collector.h"
+#include "tensorflow/core/platform/env_time.h"
+
+namespace tensorflow {
+namespace profiler {
+namespace cpu {
+
+/* static */ std::unique_ptr<HostTracer> HostTracer::Create(
+    int host_trace_level) {
+  return absl::WrapUnique(new HostTracer(host_trace_level));
+}
+HostTracer::HostTracer(int host_trace_level)
+    : host_trace_level_(host_trace_level) {}
+
+HostTracer::~HostTracer() { Stop().IgnoreError(); }
+
+Status HostTracer::Start() {
+  if (recording_) {
+    return Status(error::INTERNAL, "TraceMeRecorder already started");
+  }
+  recording_ = TraceMeRecorder::Start(host_trace_level_);
+  if (!recording_) {
+    return Status(error::INTERNAL, "Failed to start TraceMeRecorder");
+  }
+  return Status::OK();
+}
+
+Status HostTracer::Stop() {
+  if (!recording_) {
+    return Status(error::INTERNAL, "TraceMeRecorder not started");
+  }
+  events_ = TraceMeRecorder::Stop();
+  recording_ = false;
+  return Status::OK();
+}
+
+constexpr char kUserMetadataMarker = '#';
+
+Status HostTracer::CollectData(RunMetadata* run_metadata) {
+  auto step_stats_collector =
+      absl::make_unique<StepStatsCollector>(run_metadata->mutable_step_stats());
+  return CollectDataToCollector(step_stats_collector.get());
+}
+
+Status HostTracer::CollectDataToCollector(
+    StepStatsCollector* step_stats_collector) {
+  if (events_.empty() && recording_) {
+    events_ = TraceMeRecorder::Collect();
+  }
+  // Pair up start and end events, and add complete events to trace_entries.
+  absl::flat_hash_map<uint64, uint64> end_times;
+  for (const auto& thread : events_) {
+    for (const auto& event : thread.events) {
+      if (event.end_time && !event.start_time) {
+        end_times.emplace(event.activity_id, event.end_time);
+      }
+    }
+  }
+
+  const string cpu_name = "/host:CPU";
+  for (auto& thread : events_) {
+    step_stats_collector->SaveThreadName(cpu_name, thread.thread.tid,
+                                         thread.thread.name);
+    for (auto& event : thread.events) {
+      if (!event.end_time) {
+        auto it = end_times.find(event.activity_id);
+        if (it != end_times.end()) event.end_time = it->second;
+      }
+      if (event.start_time && event.end_time) {
+        NodeExecStats* ns = new NodeExecStats;
+        if (event.name.back() != kUserMetadataMarker) {
+          ns->set_node_name(std::move(event.name));
+        } else {
+          // Expect the format will be "<name>#<metadata>#"
+          std::vector<absl::string_view> parts =
+              absl::StrSplit(event.name, kUserMetadataMarker);
+          if (parts.size() >= 2) {
+            ns->set_node_name(string(parts[0]));
+            ns->set_timeline_label(string(parts[1]));
+          } else {
+            ns->set_node_name(std::move(event.name));
+          }
+        }
+        ns->set_all_start_micros(event.start_time / EnvTime::kMicrosToNanos);
+        ns->set_all_end_rel_micros((event.end_time - event.start_time) /
+                                   EnvTime::kMicrosToNanos);
+        ns->set_thread_id(thread.thread.tid);
+        // TODO(fishx): Add thread name to RunMetadata
+        step_stats_collector->Save(cpu_name, ns);
+      }
+    }
+  }
+  events_.clear();
+  step_stats_collector->Finalize();
+  return Status::OK();
+}
+
+}  // namespace cpu
+}  // namespace profiler
+}  // namespace tensorflow
diff --git a/tensorflow/core/profiler/internal/cpu/host_tracer.h b/tensorflow/core/profiler/internal/cpu/host_tracer.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6340c2eddc8ee66d4ffb2ad2829e15f34cc38ec
--- /dev/null
+++ b/tensorflow/core/profiler/internal/cpu/host_tracer.h
@@ -0,0 +1,67 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_CPU_HOST_TRACER_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_CPU_HOST_TRACER_H_
+
+#include "tensorflow/core/common_runtime/step_stats_collector.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/profiler/internal/profiler_interface.h"
+#include "tensorflow/core/profiler/internal/traceme_recorder.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+
+namespace tensorflow {
+namespace profiler {
+namespace cpu {
+
+// Controls TraceMeRecorder and converts TraceMeRecorder::Events into
+// RunMetadata messages.
+//
+// Thread-safety: This class is go/thread-compatible.
+class HostTracer : public ProfilerInterface {
+ public:
+  static std::unique_ptr<HostTracer> Create(int host_trace_level);
+
+  ~HostTracer();
+
+  // Starts recording TraceMes.
+  Status Start() override;
+
+  // Stops recording TraceMes.
+  Status Stop() override;
+
+  // Populates user traces and thread names in response.
+  // The user traces and thread names are in no particular order.
+  Status CollectData(RunMetadata* run_metadata) override;
+
+  Status CollectDataToCollector(StepStatsCollector* step_stats_collector);
+
+ private:
+  explicit HostTracer(int host_trace_level);
+
+  // Level of host tracing.
+  const int host_trace_level_;
+
+  // True if currently recording.
+  bool recording_ = false;
+
+  // Container of all traced events.
+  TraceMeRecorder::Events events_;
+};
+
+}  // namespace cpu
+}  // namespace profiler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_CPU_HOST_TRACER_H_
diff --git a/tensorflow/core/profiler/internal/cpu/host_tracer_test.cc b/tensorflow/core/profiler/internal/cpu/host_tracer_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..51f9c6a8ca6e52b21d0335d83f321cc4bbc331dc
--- /dev/null
+++ b/tensorflow/core/profiler/internal/cpu/host_tracer_test.cc
@@ -0,0 +1,133 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/profiler/internal/cpu/host_tracer.h"
+
+#include <string>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "absl/types/optional.h"
+#include "tensorflow/core/framework/step_stats.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/profiler/lib/traceme.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+
+namespace tensorflow {
+namespace profiler {
+namespace cpu {
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::Pair;
+using ::testing::UnorderedElementsAre;
+
+NodeExecStats MakeNodeStats(const string& name, uint64 thread_id,
+                            const string& label = "") {
+  NodeExecStats ns;
+  ns.set_node_name(name);
+  ns.set_thread_id(thread_id);
+  if (!label.empty()) {
+    ns.set_timeline_label(label);
+  }
+  return ns;
+}
+
+class NodeStatsMatcher {
+ public:
+  explicit NodeStatsMatcher(const NodeExecStats& expected)
+      : expected_(expected) {}
+
+  bool MatchAndExplain(const NodeExecStats& p,
+                       ::testing::MatchResultListener* /* listener */) const {
+    return p.node_name() == expected_.node_name() &&
+           p.thread_id() == expected_.thread_id() &&
+           p.timeline_label() == expected_.timeline_label();
+  }
+
+  void DescribeTo(::std::ostream* os) const { *os << expected_.DebugString(); }
+  void DescribeNegationTo(::std::ostream* os) const {
+    *os << "not equal to expected message: " << expected_.DebugString();
+  }
+
+ private:
+  const NodeExecStats expected_;
+};
+
+inline ::testing::PolymorphicMatcher<NodeStatsMatcher> EqualsNodeStats(
+    const NodeExecStats& expected) {
+  return ::testing::MakePolymorphicMatcher(NodeStatsMatcher(expected));
+}
+
+TEST(HostTracerTest, CollectsTraceMeEvents) {
+  uint32 thread_id = Env::Default()->GetCurrentThreadId();
+
+  auto tracer = HostTracer::Create(/*host_trace_level=*/1);
+
+  TF_ASSERT_OK(tracer->Start());
+  { TraceMe traceme("hello"); }
+  { TraceMe traceme("world"); }
+  { TraceMe traceme("contains#inside"); }
+  { TraceMe traceme("good#key1=value1#"); }
+  { TraceMe traceme("morning#key1=value1,key2=value2#"); }
+  { TraceMe traceme("incomplete#key1=value1,key2#"); }
+  TF_ASSERT_OK(tracer->Stop());
+
+  RunMetadata run_metadata;
+  TF_ASSERT_OK(tracer->CollectData(&run_metadata));
+
+  EXPECT_EQ(run_metadata.step_stats().dev_stats_size(), 1);
+  EXPECT_EQ(run_metadata.step_stats().dev_stats(0).node_stats_size(), 6);
+  EXPECT_THAT(
+      run_metadata.step_stats().dev_stats(0).node_stats(),
+      UnorderedElementsAre(
+          EqualsNodeStats(MakeNodeStats("hello", thread_id)),
+          EqualsNodeStats(MakeNodeStats("world", thread_id)),
+          EqualsNodeStats(MakeNodeStats("contains#inside", thread_id)),
+          EqualsNodeStats(MakeNodeStats("good", thread_id, "key1=value1")),
+          EqualsNodeStats(
+              MakeNodeStats("morning", thread_id, "key1=value1,key2=value2")),
+          EqualsNodeStats(
+              MakeNodeStats("incomplete", thread_id, "key1=value1,key2"))));
+}
+
+void ValidateResult(const RunMetadata& run_metadata, const string& trace_name) {
+  uint32 thread_id = Env::Default()->GetCurrentThreadId();
+
+  EXPECT_THAT(
+      run_metadata.step_stats().dev_stats(0).node_stats(),
+      ElementsAre(EqualsNodeStats(MakeNodeStats(trace_name, thread_id))));
+}
+
+TEST(HostTracerTest, CollectsTraceMeEventsBetweenTracing) {
+  auto tracer = HostTracer::Create(/*host_trace_level=*/1);
+  RunMetadata run_metadata;
+  RunMetadata run_metadata2;
+
+  TF_ASSERT_OK(tracer->Start());
+  { TraceMe traceme("hello"); }
+  TF_ASSERT_OK(tracer->CollectData(&run_metadata));
+  { TraceMe traceme("world"); }
+  TF_ASSERT_OK(tracer->CollectData(&run_metadata2));
+  TF_ASSERT_OK(tracer->Stop());
+
+  ValidateResult(run_metadata, "hello");
+  ValidateResult(run_metadata2, "world");
+}
+
+}  // namespace
+}  // namespace cpu
+}  // namespace profiler
+}  // namespace tensorflow
diff --git a/tensorflow/core/profiler/internal/gpu/BUILD b/tensorflow/core/profiler/internal/gpu/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..35f90e9bfc01f7bba0d0cc6d65cc23ea549469a1
--- /dev/null
+++ b/tensorflow/core/profiler/internal/gpu/BUILD
@@ -0,0 +1,25 @@
+package(
+    default_visibility = ["//tensorflow:internal"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cuda_library",
+)
+
+tf_cuda_library(
+    name = "tracer",
+    srcs = [
+        "tracer.cc",
+    ],
+    hdrs = [
+        "tracer.h",
+    ],
+    deps = [
+        "//tensorflow/core:core_cpu_lib",
+        "//tensorflow/core:device_tracer",
+        "//tensorflow/core/profiler/internal:profiler_interface",
+    ],
+)
diff --git a/tensorflow/core/profiler/internal/gpu/tracer.cc b/tensorflow/core/profiler/internal/gpu/tracer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f1cb54161c70dbab52a661065ec874497d57b61b
--- /dev/null
+++ b/tensorflow/core/profiler/internal/gpu/tracer.cc
@@ -0,0 +1,59 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/profiler/internal/gpu/tracer.h"
+#include "tensorflow/core/common_runtime/step_stats_collector.h"
+
+namespace tensorflow {
+namespace profiler {
+namespace gpu {
+
+/* static */ std::unique_ptr<ProfilerInterface> Tracer::Create() {
+  return absl::WrapUnique(new Tracer());
+}
+
+Status Tracer::Start() {
+  device_tracer_ = CreateDeviceTracer();
+  if (!device_tracer_) {
+    return Status(tensorflow::error::Code::FAILED_PRECONDITION,
+                  "Failed to create device tracer.");
+  }
+  return device_tracer_->Start();
+}
+
+Status Tracer::Stop() {
+  if (!device_tracer_) {
+    return Status(tensorflow::error::Code::FAILED_PRECONDITION,
+                  "No running device tracer.");
+  }
+  return device_tracer_->Stop();
+}
+
+Status Tracer::CollectData(RunMetadata* run_metadata) {
+  if (!device_tracer_) {
+    return Status(tensorflow::error::Code::FAILED_PRECONDITION,
+                  "No running device tracer.");
+  }
+  auto step_stats_collector =
+      absl::make_unique<StepStatsCollector>(run_metadata->mutable_step_stats());
+  Status s = device_tracer_->Collect(step_stats_collector.get());
+  step_stats_collector->Finalize();
+  return s;
+}
+
+Tracer::Tracer() {}
+
+}  // namespace gpu
+}  // namespace profiler
+}  // namespace tensorflow
diff --git a/tensorflow/core/profiler/internal/gpu/tracer.h b/tensorflow/core/profiler/internal/gpu/tracer.h
new file mode 100644
index 0000000000000000000000000000000000000000..d7765432de96b3eda20dbaef089126abec0d234f
--- /dev/null
+++ b/tensorflow/core/profiler/internal/gpu/tracer.h
@@ -0,0 +1,48 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_TRACER_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_TRACER_H_
+
+#include "tensorflow/core/platform/device_tracer.h"
+#include "tensorflow/core/profiler/internal/profiler_interface.h"
+
+namespace tensorflow {
+namespace profiler {
+namespace gpu {
+
+class Tracer : public ProfilerInterface {
+ public:
+  static std::unique_ptr<ProfilerInterface> Create();
+
+  Status Start() override;
+
+  Status Stop() override;
+
+  Status CollectData(RunMetadata* run_metadata) override;
+
+ private:
+  Tracer();
+
+  // Trace is neither copyable nor movable.
+  Tracer(const Tracer&) = delete;
+  Tracer& operator=(const Tracer&) = delete;
+
+  std::unique_ptr<DeviceTracer> device_tracer_;
+};
+
+}  // namespace gpu
+}  // namespace profiler
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_TRACER_H_
diff --git a/tensorflow/core/profiler/internal/profiler_interface.h b/tensorflow/core/profiler/internal/profiler_interface.h
new file mode 100644
index 0000000000000000000000000000000000000000..144c4bb44d7a0c4c0e565d466cb1fd3b1506dae2
--- /dev/null
+++ b/tensorflow/core/profiler/internal/profiler_interface.h
@@ -0,0 +1,49 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_PROFILER_INTERFACE_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_PROFILER_INTERFACE_H_
+
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+
+namespace tensorflow {
+namespace profiler {
+
+// Interface for tensorflow profiler plugins.
+//
+// ProfileSession calls each of these methods at most once per instance, and
+// implementations can rely on that guarantee for simplicity.
+//
+// Thread-safety: Implementations are only required to be go/thread-compatible.
+// ProfileSession is go/thread-safe and synchronizes access to ProfilerInterface
+// instances.
+class ProfilerInterface {
+ public:
+  virtual ~ProfilerInterface() = default;
+
+  // Starts profiling.
+  virtual Status Start() = 0;
+
+  // Stops profiling.
+  virtual Status Stop() = 0;
+
+  // Moves collected profile data into run_metadata.
+  virtual Status CollectData(RunMetadata* run_metadata) = 0;
+};
+
+}  // namespace profiler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_PROFILER_INTERFACE_H_
diff --git a/tensorflow/core/profiler/internal/runtime/BUILD b/tensorflow/core/profiler/internal/runtime/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..2e383f1716f304bf321b2e82ad85582d643d8d8c
--- /dev/null
+++ b/tensorflow/core/profiler/internal/runtime/BUILD
@@ -0,0 +1,24 @@
+package(
+    default_visibility = ["//tensorflow:internal"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cuda_library",
+)
+
+tf_cuda_library(
+    name = "eager_profiler",
+    srcs = [
+        "eager_profiler.cc",
+    ],
+    hdrs = [
+        "eager_profiler.h",
+    ],
+    deps = [
+        "//tensorflow/core/common_runtime/eager:context",
+        "//tensorflow/core/profiler/internal:profiler_interface",
+    ],
+)
diff --git a/tensorflow/core/profiler/internal/runtime/eager_profiler.cc b/tensorflow/core/profiler/internal/runtime/eager_profiler.cc
new file mode 100644
index 0000000000000000000000000000000000000000..aad692b01f6fa09595f0035bc2530bf210cb7e4e
--- /dev/null
+++ b/tensorflow/core/profiler/internal/runtime/eager_profiler.cc
@@ -0,0 +1,61 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/profiler/internal/runtime/eager_profiler.h"
+
+namespace tensorflow {
+namespace profiler {
+namespace runtime {
+
+TraceCollector::TraceCollector(EagerContext* const eager_context)
+    : context_(eager_context) {}
+
+void TraceCollector::BeforeClearRunMetadata() {
+  run_metadata_.MergeFrom(*context_->RunMetadataProto());
+}
+
+Status TraceCollector::CollectData(RunMetadata* run_metadata) {
+  run_metadata->MergeFrom(run_metadata_);
+  return Status::OK();
+}
+
+/* static */ std::unique_ptr<ProfilerInterface> EagerProfiler::Create(
+    EagerContext* const eager_context) {
+  return absl::WrapUnique(new EagerProfiler(eager_context));
+}
+
+Status EagerProfiler::Start() {
+  if (context_ == nullptr) {
+    return Status(tensorflow::error::Code::FAILED_PRECONDITION,
+                  "No eager context attached.");
+  }
+  return context_->RegisterRunMetadataListener(&collector_);
+}
+
+Status EagerProfiler::Stop() {
+  collector_.BeforeClearRunMetadata();
+  context_->ClearRunMetadataListener();
+  return Status::OK();
+}
+
+Status EagerProfiler::CollectData(RunMetadata* run_metadata) {
+  return collector_.CollectData(run_metadata);
+}
+
+EagerProfiler::EagerProfiler(EagerContext* const eager_context)
+    : context_(eager_context), collector_(eager_context) {}
+
+}  // namespace runtime
+}  // namespace profiler
+}  // namespace tensorflow
diff --git a/tensorflow/core/profiler/internal/runtime/eager_profiler.h b/tensorflow/core/profiler/internal/runtime/eager_profiler.h
new file mode 100644
index 0000000000000000000000000000000000000000..7135355e6ff16a240a434c5fae2b9d6140c4a3ef
--- /dev/null
+++ b/tensorflow/core/profiler/internal/runtime/eager_profiler.h
@@ -0,0 +1,64 @@
+/* Copyright 2016 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_RUNTIME_EAGER_PROFILER_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_RUNTIME_EAGER_PROFILER_H_
+
+#include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/profiler/internal/profiler_interface.h"
+
+namespace tensorflow {
+namespace profiler {
+namespace runtime {
+
+class TraceCollector : public RunMetadataListener {
+ public:
+  TraceCollector(EagerContext* const eager_context);
+
+  void BeforeClearRunMetadata() override;
+
+  Status CollectData(RunMetadata* run_metadata);
+
+ private:
+  RunMetadata run_metadata_;
+  EagerContext* const context_;
+};
+
+class EagerProfiler : public ProfilerInterface {
+ public:
+  static std::unique_ptr<ProfilerInterface> Create(
+      EagerContext* const eager_context);
+
+  Status Start() override;
+
+  Status Stop() override;
+
+  Status CollectData(RunMetadata* run_metadata) override;
+
+ private:
+  EagerProfiler(EagerContext* const eager_context);
+
+  // Trace is neither copyable nor movable.
+  EagerProfiler(const EagerProfiler&) = delete;
+  EagerProfiler& operator=(const EagerProfiler&) = delete;
+
+  EagerContext* const context_;
+  TraceCollector collector_;
+};
+
+}  // namespace runtime
+}  // namespace profiler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_RUNTIME_EAGER_PROFILER_H_
diff --git a/tensorflow/core/profiler/internal/traceme_recorder.cc b/tensorflow/core/profiler/internal/traceme_recorder.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0369e0b96de2bb3bea19d1e9b2b280e24ecb0112
--- /dev/null
+++ b/tensorflow/core/profiler/internal/traceme_recorder.cc
@@ -0,0 +1,248 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/profiler/internal/traceme_recorder.h"
+
+// To avoid unneccesary synchronization between threads, each thread has a
+// ThreadLocalRecorder that independently records its events.
+//
+// Events are stored in an EventQueue implemented as a linked-list of blocks,
+// with start and end pointers:
+//  [ events........ | next-]--> [ events......... | next ]
+//  ^start_block  ^start         ^end_block  ^end
+//
+// Record() writes at end, and then advances it, allocating a block if needed.
+// Clear() takes ownership of events in the range [start, end).
+// The end pointer is atomic so these can be concurrent.
+//
+// If a thread dies, the ThreadLocalRecorder's destructor hands its data off to
+// the orphaned_events list.
+
+#include <string>
+#include "absl/container/flat_hash_map.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/stream_executor/lib/initialize.h"
+
+namespace tensorflow {
+namespace profiler {
+
+// Default value for g_trace_level when tracing is disabled
+constexpr static int kTracingDisabled = -1;
+
+namespace internal {
+std::atomic<int> g_trace_level = ATOMIC_VAR_INIT(kTracingDisabled);
+}  // namespace internal
+
+namespace {
+
+class ThreadLocalRecorder;
+
+struct Data {
+  // Lock for only rare events - start/stop, thread death.
+  mutex global_lock;
+  // Map of the static container instances (thread_local storage) for each
+  // thread, that store the trace events.
+  absl::flat_hash_map<uint64, ThreadLocalRecorder*> threads
+      GUARDED_BY(global_lock);
+  // Events traced from threads that died during tracing.
+  TraceMeRecorder::Events orphaned_events GUARDED_BY(global_lock);
+}* g_data = nullptr;
+
+// A single-producer single-consumer queue of Events.
+// Only the owner thread can write events, writing is lock-free.
+// Consume is also lock-free in this class.
+//
+// Internally, we have a linked list of blocks containing numbered slots.
+// start is the first occupied slot, end is the first unoccupied slot.
+class EventQueue {
+ public:
+  EventQueue()
+      : start_block_(new Block{0, nullptr}), end_block_(start_block_) {}
+
+  // REQUIRES: Consume() was called since the last Push().
+  // Memory should be deallocated and trace events destroyed on destruction.
+  // This doesn't require global lock as this discards all the stored trace
+  // events and we assume of destruction of this class only after the last
+  // Push() has been called.
+  ~EventQueue() {
+    DCHECK_EQ(start_, end_.load()) << "EventQueue destroyed without Consume()";
+    delete end_block_;
+  }
+
+  // Add a new event to the back of the queue. Fast and lock-free.
+  void Push(TraceMeRecorder::Event&& event) {
+    uint64 end = end_.load(std::memory_order_relaxed);
+    new (&end_block_->events[end++ - end_block_->start].event)
+        TraceMeRecorder::Event(std::move(event));
+    if (ABSL_PREDICT_FALSE(end - end_block_->start == Block::kLength)) {
+      auto* new_block = new Block{end, nullptr};
+      end_block_->next = new_block;
+      end_block_ = new_block;
+    }
+    end_.store(end, std::memory_order_release);  // Write index after contents.
+  }
+
+  // Retrieve and remove all events in the queue.
+  std::vector<TraceMeRecorder::Event> Consume() {
+    // Read index before contents.
+    uint64 end = end_.load(std::memory_order_acquire);
+    std::vector<TraceMeRecorder::Event> result;
+    result.reserve(end - start_);
+    while (start_ != end) {
+      Shift(&result);
+    }
+    return result;
+  }
+
+ private:
+  // Shift one event off the front of the queue into *out.
+  void Shift(std::vector<TraceMeRecorder::Event>* out) {
+    // Move the next event into the output.
+    auto& event = start_block_->events[start_++ - start_block_->start].event;
+    out->push_back(std::move(event));
+    event.~Event();  // Events must be individually destroyed.
+    // If we reach the end of a block, we own it and should delete it.
+    // The next block is present: end always points to something.
+    if (start_ - start_block_->start == Block::kLength) {
+      auto* next_block = start_block_->next;
+      delete start_block_;
+      start_block_ = next_block;
+    }
+  }
+
+  // The number of slots in a block. Chosen so that the block fits in 64k.
+  struct Block {
+    static constexpr size_t kLength =
+        ((1 << 16) - (sizeof(uint64) + sizeof(std::atomic<Block*>))) /
+        sizeof(TraceMeRecorder::Event);
+
+    const uint64 start;  // The number of the first slot.
+    Block* next;
+    // Defer construction of Event until the data is available.
+    // Must also destroy manually, as the block may not fill entirely.
+    union MaybeEvent {
+      MaybeEvent() {}
+      ~MaybeEvent() {}
+      TraceMeRecorder::Event event;
+    } events[kLength];
+  };
+
+  // Head of list for reading. Only accessed by consumer thread.
+  Block* start_block_;
+  uint64 start_ = 0;
+  // Tail of list for writing. Accessed by producer thread.
+  Block* end_block_;
+  std::atomic<uint64> end_ = {0};  // Atomic: also read by consumer thread.
+};
+
+class ThreadLocalRecorder {
+ public:
+  // The recorder is created the first time Record() is called on a thread.
+  ThreadLocalRecorder() {
+    auto* env = Env::Default();
+    info_.tid = env->GetCurrentThreadId();
+    env->GetCurrentThreadName(&info_.name);
+    mutex_lock lock(g_data->global_lock);
+    g_data->threads.emplace(info_.tid, this);
+  }
+
+  // The destructor is called when the thread shuts down early.
+  // We unregister this thread, and move its events to orphaned_events.
+  ~ThreadLocalRecorder() {
+    mutex_lock lock(g_data->global_lock);
+    g_data->threads.erase(info_.tid);
+    g_data->orphaned_events.push_back(Clear());
+  }
+
+  // This is the performance-critical part!
+  void Record(TraceMeRecorder::Event&& event) { queue_.Push(std::move(event)); }
+
+  TraceMeRecorder::ThreadEvents Clear()
+      EXCLUSIVE_LOCKS_REQUIRED(g_data->global_lock) {
+    return {info_, queue_.Consume()};
+  }
+
+ private:
+  TraceMeRecorder::ThreadInfo info_;
+  EventQueue queue_;
+};
+
+// Gather events from all active threads, and clear their buffers. The global
+// lock is held, so no threads can be added/removed for the duration while we
+// consume the collected trace entries. This will block any new thread and also
+// the starting and stopping of TraceMeRecorder, hence, this is performance
+// critical and should be kept fast.
+TraceMeRecorder::Events Clear() EXCLUSIVE_LOCKS_REQUIRED(g_data->global_lock) {
+  TraceMeRecorder::Events result;
+  std::swap(g_data->orphaned_events, result);
+  for (const auto& entry : g_data->threads) {
+    auto* recorder = entry.second;
+    result.push_back(recorder->Clear());
+  }
+  return result;
+}
+
+}  // namespace
+
+bool TraceMeRecorder::Start(int level) {
+  level = std::max(0, level);
+  mutex_lock lock(g_data->global_lock);
+  int expected = kTracingDisabled;
+  if (!internal::g_trace_level.compare_exchange_strong(
+          expected, level, std::memory_order_acq_rel)) {
+    return false;
+  }
+  // We may have old events in buffers because Record() raced with Stop().
+  Clear();
+  return true;
+}
+
+
+void TraceMeRecorder::Record(Event event) {
+  static thread_local ThreadLocalRecorder thread_local_recorder;
+  thread_local_recorder.Record(std::move(event));
+}
+
+// Only one thread is expected to call Stop() as first instance of XprofSession
+// prevents another XprofSession from doing any profiling.
+TraceMeRecorder::Events TraceMeRecorder::Stop() {
+  mutex_lock lock(g_data->global_lock);
+  if (internal::g_trace_level.exchange(
+          kTracingDisabled, std::memory_order_acq_rel) == kTracingDisabled) {
+    return {};
+  }
+  return Clear();
+}
+
+TraceMeRecorder::Events TraceMeRecorder::Collect() {
+  mutex_lock lock(g_data->global_lock);
+  if (internal::g_trace_level.load(std::memory_order_acquire) ==
+      kTracingDisabled) {
+    return {};
+  }
+  return Clear();
+}
+
+}  // namespace profiler
+}  // namespace tensorflow
+
+REGISTER_MODULE_INITIALIZER(traceme_recorder, {
+  tensorflow::profiler::g_data = new tensorflow::profiler::Data();
+
+  // Workaround for b/35097229, the first block-scoped thread_local can
+  // trigger false positives in the heap checker. Currently triggered by
+  // //perftools/accelerators/xprof/xprofilez/integration_tests:xla_hlo_trace_test
+  static thread_local tensorflow::string fix_deadlock ABSL_ATTRIBUTE_UNUSED;
+});
diff --git a/tensorflow/core/profiler/internal/traceme_recorder.h b/tensorflow/core/profiler/internal/traceme_recorder.h
new file mode 100644
index 0000000000000000000000000000000000000000..1e66b1e5bb3f975ca20d43a67c3eec23cd8d16c1
--- /dev/null
+++ b/tensorflow/core/profiler/internal/traceme_recorder.h
@@ -0,0 +1,95 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TRACEME_RECORDER_H_
+#define TENSORFLOW_CORE_PROFILER_INTERNAL_TRACEME_RECORDER_H_
+
+#include <atomic>
+#include <vector>
+#include "absl/base/optimization.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace profiler {
+
+namespace internal {
+extern std::atomic<int> g_trace_level;
+}  // namespace internal
+
+// TraceMeRecorder is a singleton repository of TraceMe events.
+// It can be safely and cheaply appended to by multiple threads.
+//
+// Start() and Stop() must be called in pairs, Stop() returns the events added
+// since the previous Start().
+//
+// This is the backend for TraceMe instrumentation.
+// The profiler starts the recorder, the TraceMe constructor records begin
+// events, and the destructor records end events.
+// The profiler then stops the recorder and finds start/end pairs. (Unpaired
+// start/end events are discarded at that point).
+class TraceMeRecorder {
+ public:
+  // An Event is either the start of a TraceMe, the end of a TraceMe, or both.
+  // Times are in ns since the Unix epoch.
+  struct Event {
+    uint64 activity_id;
+    string name;
+    uint64 start_time;  // 0 = missing
+    uint64 end_time;    // 0 = missing
+  };
+  struct ThreadInfo {
+    int64 tid;
+    string name;
+  };
+  struct ThreadEvents {
+    const ThreadInfo thread;
+    std::vector<Event> events;
+  };
+  using Events = std::vector<ThreadEvents>;
+
+  // Starts recording of TraceMe().
+  // Only traces <= level will be recorded.
+  // Level must be >= 0.
+  // If level is 0, no traces will be recorded.
+  static bool Start(int level);
+
+  // Stops recording and returns events recorded since Start().
+  static Events Stop();
+
+  // Returns events recorded till now without stopping the recording. Empty
+  // container is returned if the recorder was already stopped.
+  static Events Collect();
+
+  // Returns whether we're currently recording. Racy, but cheap!
+  static inline bool Active(int level = 1) {
+    return ABSL_PREDICT_FALSE(
+        internal::g_trace_level.load(std::memory_order_acquire) >= level);
+  }
+
+  static void Record(Event);
+
+ private:
+  // No copy and assignment
+  TraceMeRecorder(const TraceMeRecorder&) = delete;
+  TraceMeRecorder& operator=(const TraceMeRecorder&) = delete;
+
+  // Implementation of g_trace_level must be lock-free for faster execution
+  // of the TraceMe() public API. This can be commented (if compilation is
+  // failing) but execution might be slow (even when host tracing is disabled).
+  static_assert(ATOMIC_INT_LOCK_FREE == 2, "Assumed atomic<int> was lock free");
+};
+
+}  // namespace profiler
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_TRACEME_RECORDER_H_
diff --git a/tensorflow/core/profiler/internal/traceme_recorder_test.cc b/tensorflow/core/profiler/internal/traceme_recorder_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ec588af1d6048fa709d85e86ea2e5e546f8300d1
--- /dev/null
+++ b/tensorflow/core/profiler/internal/traceme_recorder_test.cc
@@ -0,0 +1,211 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/profiler/internal/traceme_recorder.h"
+
+#include <atomic>
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "absl/synchronization/notification.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/env_time.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace profiler {
+namespace {
+
+MATCHER_P(Named, name, "") { return arg.name == name; }
+
+constexpr static uint64 kNanosInSec = 1000000000;
+
+TEST(RecorderTest, SingleThreaded) {
+  uint64 start_time = Env::Default()->NowNanos();
+  uint64 end_time = start_time + kNanosInSec;
+
+  TraceMeRecorder::Record({1, "before", start_time, end_time});
+  TraceMeRecorder::Start(/*level=*/1);
+  TraceMeRecorder::Record({2, "during1", start_time, end_time});
+  TraceMeRecorder::Record({3, "during2", start_time, end_time});
+  auto results = TraceMeRecorder::Stop();
+  TraceMeRecorder::Record({4, "after", start_time, end_time});
+
+  ASSERT_EQ(results.size(), 1);
+  EXPECT_THAT(results[0].events,
+              ::testing::ElementsAre(Named("during1"), Named("during2")));
+}
+
+TEST(RecorderTest, CollectionBeforeStop) {
+  uint64 start_time = Env::Default()->NowNanos();
+  uint64 end_time = start_time + kNanosInSec;
+
+  TraceMeRecorder::Record({1, "ignored", start_time, end_time});
+  TraceMeRecorder::Start(/*level=*/1);
+  TraceMeRecorder::Record({2, "during1", start_time, end_time});
+  TraceMeRecorder::Record({3, "during2", start_time, end_time});
+  auto collected_results = TraceMeRecorder::Collect();
+  TraceMeRecorder::Record({4, "after_collect", start_time, end_time});
+  auto stopped_results = TraceMeRecorder::Stop();
+  TraceMeRecorder::Record({5, "after_stop", start_time, end_time});
+  auto results_after_stop = TraceMeRecorder::Collect();
+
+  ASSERT_EQ(collected_results.size(), 1);
+  EXPECT_THAT(collected_results[0].events,
+              ::testing::ElementsAre(Named("during1"), Named("during2")));
+
+  ASSERT_EQ(stopped_results.size(), 1);
+  EXPECT_THAT(stopped_results[0].events,
+              ::testing::ElementsAre(Named("after_collect")));
+
+  ASSERT_EQ(results_after_stop.size(), 0);
+}
+
+void SpinNanos(int nanos) {
+  uint64 deadline = Env::Default()->NowNanos() + nanos;
+  while (Env::Default()->NowNanos() < deadline) {
+  }
+}
+
+// Checks the functional behavior of the recorder, when used from several
+// unsynchronized threads.
+//
+// Each thread records a stream of events.
+//   Thread 0: activity=0, activity=1, activity=2, ...
+//   Thread 1: activity=0, activity=1, activity=2, ...
+//   ...
+//
+// We turn the recorder on and off repeatedly in sessions, expecting to see:
+//   - data from every thread (eventually - maybe not every session)
+//   - unbroken sessions: a consecutive sequence of IDs from each thread
+//   - gaps between sessions: a thread's IDs should be non-consecutive overall
+TEST(RecorderTest, Multithreaded) {
+  constexpr static int kNumThreads = 4;
+
+  // Start several threads writing events.
+  absl::Notification start;
+  absl::Notification stop;
+  thread::ThreadPool pool(Env::Default(), "testpool", kNumThreads);
+  std::atomic<int> thread_count = {0};
+  for (int i = 0; i < kNumThreads; i++) {
+    pool.Schedule([&start, &stop, &thread_count, i] {
+      uint64 j = 0;
+      bool was_active = false;
+      auto record_event = [&j, i]() {
+        uint64 start_time = Env::Default()->NowNanos();
+        uint64 end_time = start_time + kNanosInSec;
+        TraceMeRecorder::Record({/*activity_id=*/j++,
+                                 /*name=*/strings::StrCat(i), start_time,
+                                 end_time});
+      };
+      thread_count.fetch_add(1, std::memory_order_relaxed);
+      start.WaitForNotification();
+      while (!stop.HasBeenNotified()) {
+        // Mimicking production usage, we guard with a racy check.
+        // In principle this isn't needed, but a feedback loop can form:
+        // 1) many events accumulate while the recorder is off
+        // 2) clearing/analyzing these events is slow
+        // 3) while clearing, more events are accumulating, causing 1
+        if (TraceMeRecorder::Active()) {
+          record_event();
+          was_active = true;
+        }
+        // Record some events after the recorder is no longer active to simulate
+        // point 1 and 3.
+        if (was_active && !TraceMeRecorder::Active()) {
+          record_event();
+          record_event();
+          was_active = false;
+        }
+        // This snowballs into OOM in some configurations, causing flakiness.
+        // Keep this big enough to prevent OOM and small enough such that
+        // each thread records at least one event.
+        SpinNanos(10);
+      }
+    });
+  }
+
+  // For each thread, keep track of which events we've seen.
+  struct {
+    bool split_session = false;
+    bool overlapping_sessions = false;
+    std::set<uint64> events;
+  } thread_state[kNumThreads];
+  // We expect each thread to eventually have multiple events, not all in a
+  // contiguous range.
+  auto done = [&thread_state] {
+    for (const auto& t : thread_state) {
+      if (t.events.size() < 2) return false;
+    }
+    return true;
+  };
+
+  // Wait while all the threads are spun up.
+  while (thread_count.load(std::memory_order_relaxed) < kNumThreads) {
+    LOG(INFO) << "Waiting for all threads to spin up...";
+    Env::Default()->SleepForMicroseconds(1 * EnvTime::kMillisToMicros);
+  }
+
+  // We will probably be done after two iterations (with each thread getting
+  // some events each iteration). No guarantees as all the threads might not get
+  // scheduled in a session, so try for a while.
+  start.Notify();
+  constexpr static int kMaxIters = 100;
+  for (int iters = 0; iters < kMaxIters && !done(); ++iters) {
+    LOG(INFO) << "Looping until convergence, iteration: " << iters;
+    TraceMeRecorder::Start(/*level=*/1);
+    Env::Default()->SleepForMicroseconds(100 * EnvTime::kMillisToMicros);
+    auto results = TraceMeRecorder::Stop();
+    for (const auto& thread : results) {
+      if (thread.events.empty()) continue;
+      std::istringstream ss(thread.events.front().name);
+      int thread_index = 0;
+      ss >> thread_index;
+      auto& state = thread_state[thread_index];
+
+      std::set<uint64> session_events;
+      uint64 current = 0;
+      for (const auto& event : thread.events) {
+        session_events.emplace(event.activity_id);
+        // Session events should be contiguous.
+        if (current != 0 && event.activity_id != current + 1) {
+          state.split_session = true;
+        }
+        current = event.activity_id;
+      }
+
+      for (const auto& event : session_events) {
+        auto result = state.events.emplace(event);
+        if (!result.second) {
+          // Session events should not overlap with those from previous
+          // sessions.
+          state.overlapping_sessions = true;
+        }
+      }
+    }
+    Env::Default()->SleepForMicroseconds(1 * EnvTime::kMillisToMicros);
+  }
+  stop.Notify();
+
+  for (const auto& thread : thread_state) {
+    EXPECT_FALSE(thread.split_session)
+        << "Expected contiguous events in a session";
+    EXPECT_FALSE(thread.overlapping_sessions) << "Expected disjoint sessions";
+    EXPECT_GT(thread.events.size(), 1)
+        << "Expected gaps in thread events between sessions";
+  }
+}
+
+}  // namespace
+}  // namespace profiler
+}  // namespace tensorflow
diff --git a/tensorflow/core/profiler/lib/BUILD b/tensorflow/core/profiler/lib/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..f078099321ec7a2f38e25e5dfe006f2ab49da2ac
--- /dev/null
+++ b/tensorflow/core/profiler/lib/BUILD
@@ -0,0 +1,58 @@
+package(
+    default_visibility = [
+        "//tensorflow:internal",
+        "//tensorflow_models:__subpackages__",
+    ],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cuda_library",
+)
+
+tf_cuda_library(
+    name = "profiler_session",
+    srcs = [
+        "profiler_session.cc",
+    ],
+    hdrs = [
+        "profiler_session.h",
+    ],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/core/common_runtime/eager:context",
+        "//tensorflow/core/profiler/internal/gpu:tracer",
+        "//tensorflow/core/profiler/internal/runtime:eager_profiler",
+        "//tensorflow/core/profiler/internal:profiler_interface",
+        "//tensorflow/core/profiler:protos_all_cc",
+    ] + select({
+        "//tensorflow:android": [
+            "//tensorflow/core:android_tensorflow_lib_lite",
+        ],
+        "//conditions:default": [
+            "//tensorflow/core:core_cpu_lib",
+            "//tensorflow/core:framework",
+            "//tensorflow/core:framework_internal",
+            "//tensorflow/core:lib",
+            "//tensorflow/core:lib_internal",
+            "//tensorflow/core:protos_all_cc",
+            "//tensorflow/core:session_options",
+            "//tensorflow/core:device_tracer",
+        ],
+    }),
+)
+
+tf_cuda_library(
+    name = "traceme",
+    srcs = ["traceme.cc"],
+    hdrs = ["traceme.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/core/profiler/internal:traceme_recorder",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
+    ],
+)
diff --git a/tensorflow/core/profiler/lib/profiler_session.cc b/tensorflow/core/profiler/lib/profiler_session.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1910ea89509f0c21d3271eefb8707dedda733f9c
--- /dev/null
+++ b/tensorflow/core/profiler/lib/profiler_session.cc
@@ -0,0 +1,164 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/profiler/lib/profiler_session.h"
+#include <string>
+#include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/lib/core/error_codes.pb.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/profiler/internal/gpu/tracer.h"
+#include "tensorflow/core/profiler/internal/runtime/eager_profiler.h"
+#include "tensorflow/core/profiler/trace_events.pb.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+
+namespace tensorflow {
+
+namespace {
+
+// Track whether there's an active ProfilerSession.
+// Prevents another ProfilerSession from creating ProfilerInterface(s), as they
+// use singletons that do not allow concurrent profiling request (e.g.,
+// DeviceTracer).
+std::atomic<bool> session_active = ATOMIC_VAR_INIT(false);
+
+void ConvertRunMetadataToTraceEvent(RunMetadata* run_metadata,
+                                    profiler::Trace* trace,
+                                    const uint64 profile_start_time_micros) {
+  auto trace_devices = trace->mutable_devices();
+  // TODO(fishx): use a lighter representation instead of GraphDef to insert
+  // python information into trace event.
+
+  for (size_t device_id = 0;
+       device_id < run_metadata->step_stats().dev_stats_size(); ++device_id) {
+    // Create device
+    auto* device_stats =
+        run_metadata->mutable_step_stats()->mutable_dev_stats(device_id);
+    profiler::Device device;
+    device.set_name(device_stats->device());
+    device.set_device_id(device_id);
+    profiler::Resource resource;
+    resource.set_name("0");
+    resource.set_resource_id(0);
+    (*device.mutable_resources())[0] = resource;
+    for (const auto& thread_name : device_stats->thread_names()) {
+      profiler::Resource resource;
+      resource.set_resource_id(thread_name.first);
+      resource.set_name(thread_name.second);
+      (*device.mutable_resources())[thread_name.first] = resource;
+    }
+    (*trace_devices)[device_id] = device;
+
+    // Emit events.
+    for (auto node :
+         run_metadata->step_stats().dev_stats(device_id).node_stats()) {
+      if (node.all_start_micros() < profile_start_time_micros) {
+        continue;
+      }
+      auto* event = trace->add_trace_events();
+      auto* args = event->mutable_args();
+      event->set_device_id(device_id);
+      if (device_stats->device().find("host:CPU") != string::npos) {
+        event->set_resource_id(node.thread_id());
+      } else {
+        event->set_resource_id(0);
+      }
+      event->set_name(node.node_name());
+      event->set_timestamp_ps(
+          (node.all_start_micros() - profile_start_time_micros) *
+          EnvTime::kMicrosToPicos);
+      event->set_duration_ps(node.all_end_rel_micros() *
+                             EnvTime::kMicrosToPicos);
+      (*args)["label"] = node.timeline_label();
+    }
+  }
+
+  // TODO(fishx): Convert allocation data as well.
+}
+
+}  // namespace
+
+/*static*/ std::unique_ptr<ProfilerSession> ProfilerSession::Create(
+    ProfilerContext* const context) {
+  return absl::WrapUnique(new ProfilerSession(context));
+}
+
+Status ProfilerSession::Status() {
+  mutex_lock l(mutex_);
+  return status_;
+}
+
+Status ProfilerSession::SerializeToString(string* content) {
+  mutex_lock l(mutex_);
+  if (!status_.ok()) return status_;
+  for (auto& profiler : profilers_) {
+    profiler->Stop().IgnoreError();
+  }
+  RunMetadata run_metadata;
+  for (auto& profiler : profilers_) {
+    profiler->CollectData(&run_metadata).IgnoreError();
+  }
+
+  if (active_) {
+    // Allow another session to start.
+    session_active.store(false);
+    active_ = false;
+  }
+
+  profiler::Trace trace;
+
+  ConvertRunMetadataToTraceEvent(&run_metadata, &trace, start_time_micros_);
+
+  trace.SerializeToString(content);
+  return Status::OK();
+}
+
+ProfilerSession::ProfilerSession(ProfilerContext* const context)
+    : active_(!session_active.exchange(true)),
+      start_time_micros_(Env::Default()->NowNanos() / EnvTime::kMicrosToNanos) {
+  if (!active_) {
+    status_ = tensorflow::Status(tensorflow::error::Code::UNAVAILABLE,
+                                 "Another profiling session is active.");
+    return;
+  }
+
+  LOG(INFO) << "Profile Session started.";
+
+  if (context->eager_context != nullptr) {
+    profilers_.push_back(tensorflow::profiler::runtime::EagerProfiler::Create(
+        context->eager_context));
+  }
+  profilers_.push_back(tensorflow::profiler::gpu::Tracer::Create());
+
+  status_ = Status::OK();
+
+  for (auto& profiler : profilers_) {
+    profiler->Start().IgnoreError();
+  }
+}
+
+ProfilerSession::~ProfilerSession() {
+  for (auto& profiler : profilers_) {
+    profiler->Stop().IgnoreError();
+  }
+
+  if (active_) {
+    // Allow another session to start.
+    session_active.store(false);
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/profiler/lib/profiler_session.h b/tensorflow/core/profiler/lib/profiler_session.h
new file mode 100644
index 0000000000000000000000000000000000000000..07276571244b876c8b6635a9d39347c3c1d89a55
--- /dev/null
+++ b/tensorflow/core/profiler/lib/profiler_session.h
@@ -0,0 +1,69 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_PROFILER_LIB_PROFILER_SESSION_H_
+#define TENSORFLOW_CORE_PROFILER_LIB_PROFILER_SESSION_H_
+
+#include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/profiler/internal/profiler_interface.h"
+
+namespace tensorflow {
+
+struct ProfilerContext {
+  EagerContext* eager_context = nullptr;
+};
+
+// A profiler which will start profiling when creating the object and will stop
+// when either the object is destroyed or SerializedToString is called. It will
+// profile all operations run under the given EagerContext.
+// Multiple instances of it can be created, but at most one of them will profile
+// for each EagerContext. Status() will return OK only for the instance that is
+// profiling.
+// Thread-safety: ProfilerSession is thread-safe.
+class ProfilerSession {
+ public:
+  // Creates and ProfilerSession and starts profiling.
+  static std::unique_ptr<ProfilerSession> Create(
+      ProfilerContext* const context);
+
+  // Deletes an exsiting Profiler and enables starting a new one.
+  ~ProfilerSession();
+
+  tensorflow::Status Status() LOCKS_EXCLUDED(mutex_);
+
+  tensorflow::Status SerializeToString(string* content) LOCKS_EXCLUDED(mutex_);
+
+ private:
+  // Constructs an instance of the class and starts profiling
+  explicit ProfilerSession(ProfilerContext* const context);
+
+  // Profiler is neither copyable or movable.
+  ProfilerSession(const ProfilerSession&) = delete;
+  ProfilerSession& operator=(const ProfilerSession&) = delete;
+
+  std::vector<std::unique_ptr<tensorflow::profiler::ProfilerInterface>>
+      profilers_ GUARDED_BY(mutex_);
+
+  // True if the session is active.
+  bool active_ GUARDED_BY(mutex_);
+
+  tensorflow::Status status_ GUARDED_BY(mutex_);
+  const uint64 start_time_micros_;
+  mutex mutex_;
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_PROFILER_LIB_PROFILER_SESSION_H_
diff --git a/tensorflow/core/profiler/lib/traceme.cc b/tensorflow/core/profiler/lib/traceme.cc
new file mode 100644
index 0000000000000000000000000000000000000000..90272b8bf584891075de050c7468376abbaed856
--- /dev/null
+++ b/tensorflow/core/profiler/lib/traceme.cc
@@ -0,0 +1,46 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/profiler/lib/traceme.h"
+
+namespace tensorflow {
+namespace profiler {
+
+// Activity IDs: To avoid contention over a counter, the top 32 bits identify
+// the originating thread, the bottom 32 bits name the event within a thread.
+// IDs may be reused after 4 billion events on one thread, or 4 billion threads.
+static std::atomic<uint32> thread_counter(1);  // avoid kUntracedActivity
+uint64 NewActivityId() {
+  const thread_local static uint32 thread_id = thread_counter.fetch_add(1);
+  thread_local static uint32 per_thread_activity_id = 0;
+  return static_cast<uint64>(thread_id) << 32 | per_thread_activity_id++;
+}
+
+/* static */ uint64 TraceMe::ActivityStartImpl(
+    absl::string_view activity_name) {
+  uint64 activity_id = NewActivityId();
+  TraceMeRecorder::Record({activity_id, string(activity_name),
+                           /*start_time=*/Env::Default()->NowNanos(),
+                           /*end_time=*/0});
+  return activity_id;
+}
+
+/* static */ void TraceMe::ActivityEndImpl(uint64 activity_id) {
+  TraceMeRecorder::Record({activity_id, /*name=*/"", /*start_time=*/0,
+                           /*end_time=*/Env::Default()->NowNanos()});
+}
+
+}  // namespace profiler
+}  // namespace tensorflow
diff --git a/tensorflow/core/profiler/lib/traceme.h b/tensorflow/core/profiler/lib/traceme.h
new file mode 100644
index 0000000000000000000000000000000000000000..b9fae3d37f0f2dd53a85f4d0d17aeaa626c849f1
--- /dev/null
+++ b/tensorflow/core/profiler/lib/traceme.h
@@ -0,0 +1,192 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_PROFILER_LIB_TRACEME_H_
+#define TENSORFLOW_CORE_PROFILER_LIB_TRACEME_H_
+
+#include <string>
+
+#include "absl/strings/string_view.h"
+#include "absl/types/optional.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/profiler/internal/traceme_recorder.h"
+
+namespace tensorflow {
+namespace profiler {
+
+// This is specifically used in xprof_bridge for instrumenting Tensorflow ops.
+// Takes input as whether a TF op is expensive or not and returns the TraceMe
+// level to be assigned to trace that particular op. Assigns level 2 for
+// expensive ops (these are high-level details and shown by default in xprof
+// UI). Assigns level 3 for cheap ops (low-level details not shown by default).
+inline int GetTFTraceMeLevel(bool is_expensive) { return is_expensive ? 2 : 3; }
+
+// This class permits user-specified (CPU) tracing activities. A trace activity
+// is started when an object of this class is created and stopped when the
+// object is destroyed.
+//
+// CPU tracing can be useful when trying to understand what parts of GPU
+// computation (e.g., kernels and memcpy) correspond to higher level activities
+// in the overall program. For instance, a collection of kernels maybe
+// performing one "step" of a program that is better visualized together than
+// interspersed with kernels from other "steps". Therefore, a TraceMe object
+// can be created at each "step".
+//
+// Two APIs are provided:
+//   (1) Scoped object: a TraceMe object starts tracing on construction, and
+//       stops tracing when it goes out of scope.
+//          {
+//            TraceMe trace("step");
+//            ... do some work ...
+//          }
+//       TraceMe objects can be members of a class, or allocated on the heap.
+//   (2) Static methods: ActivityStart and ActivityEnd may be called in pairs.
+//          auto id = ActivityStart("step");
+//          ... do some work ...
+//          ActivityEnd(id);
+class TraceMe {
+ public:
+  // Constructor that traces a user-defined activity labeled with activity_name
+  // in the UI. Level defines the trace priority, used for filtering TraceMe
+  // events. By default, traces with TraceMe level <= 2 are recorded. Levels:
+  // - Must be a positive integer.
+  // - Level 1 is the default and used only for user instrumentation.
+  // - Level 2 is used by xprof for instrumenting high level program execution
+  //   details (expensive TF ops, XLA ops, etc).
+  // - Level 3 is also used by xprof to instrument more verbose (low-level)
+  //   program execution details (cheap TF ops, etc).
+  // Users are welcome to use level >= 2 in their code, if they wish to filter
+  // out their host traces based on verbosity.
+  explicit TraceMe(absl::string_view activity_name, int level = 1) {
+    DCHECK_GE(level, 1);
+    if (TraceMeRecorder::Active(level)) {
+      new (&no_init_.name) string(activity_name);
+      start_time_ = Env::Default()->NowNanos();
+    } else {
+      start_time_ = kUntracedActivity;
+    }
+  }
+
+  // string&& constructor to prevent an unnecessary string copy, e.g. when a
+  // TraceMe is constructed based on the result of a StrCat operation.
+  // Note: We can't take the string by value because a) it would make the
+  // overloads ambiguous, and b) we want lvalue strings to use the string_view
+  // constructor so we avoid copying them when tracing is disabled.
+  explicit TraceMe(string &&activity_name, int level = 1) {
+    DCHECK_GE(level, 1);
+    if (TraceMeRecorder::Active(level)) {
+      new (&no_init_.name) string(std::move(activity_name));
+      start_time_ = Env::Default()->NowNanos();
+    } else {
+      start_time_ = kUntracedActivity;
+    }
+  }
+
+  // Do not allow passing strings by reference or value since the caller
+  // may unintentionally maintain ownership of the activity_name.
+  // Explicitly std::move the activity_name or wrap it in a string_view if
+  // you really wish to maintain ownership.
+  explicit TraceMe(const string &activity_name, int level = 1) = delete;
+
+  // This overload is necessary to make TraceMe's with string literals work.
+  // Otherwise, the string&& and the string_view constructor would be equally
+  // good overload candidates.
+  explicit TraceMe(const char *raw, int level = 1)
+      : TraceMe(absl::string_view(raw), level) {}
+
+  // This overload only generates the activity name if tracing is enabled.
+  // Useful for avoiding things like string concatenation when tracing is
+  // disabled. The |name_generator| may be a lambda or functor that returns a
+  // type that the string() constructor can take.
+  // name_generator is templated, rather than a std::function to avoid
+  // allocations std::function might make even if never called.
+  // Usage: xprof::TraceMe([&]{ return StrCat(prefix, ":", postfix); });
+  template <typename NameGeneratorT>
+  explicit TraceMe(NameGeneratorT name_generator, int level = 1) {
+    DCHECK_GE(level, 1);
+    if (TraceMeRecorder::Active(level)) {
+      new (&no_init_.name) string(name_generator());
+      start_time_ = Env::Default()->NowNanos();
+    } else {
+      start_time_ = kUntracedActivity;
+    }
+  }
+
+  ~TraceMe() {
+    // We do not need to check the trace level again here.
+    // - If tracing wasn't active to start with, we have kUntracedActivity.
+    // - If tracing was active and was stopped, we have
+    //   TraceMeRecorder::Active().
+    // - If tracing was active and was restarted at a lower level, we may
+    //   spuriously record the event. This is extremely rare, and acceptable as
+    //   event will be discarded when its start timestamp fall outside of the
+    //   start/stop session timestamp (recorded in XprofResponse).
+    if (start_time_ != kUntracedActivity) {
+      if (TraceMeRecorder::Active()) {
+        TraceMeRecorder::Record({kCompleteActivity, std::move(no_init_.name),
+                                 start_time_, Env::Default()->NowNanos()});
+      }
+      no_init_.name.~string();
+    }
+  }
+
+  // TraceMe is not movable or copyable.
+  TraceMe(const TraceMe &) = delete;
+  TraceMe &operator=(const TraceMe &) = delete;
+
+  // Static API, for use when scoped objects are inconvenient.
+
+  // Record the start time of an activity.
+  // Returns the activity ID, which is used to stop the activity.
+  static uint64 ActivityStart(absl::string_view name, int level = 1) {
+    return TraceMeRecorder::Active(level) ? ActivityStartImpl(name)
+                                          : kUntracedActivity;
+  }
+
+  // Record the end time of an activity started by ActivityStart().
+  static void ActivityEnd(uint64 activity_id) {
+    // We don't check the level again (see ~TraceMe()).
+    if (activity_id != kUntracedActivity) {
+      if (TraceMeRecorder::Active()) {
+        ActivityEndImpl(activity_id);
+      }
+    }
+  }
+
+ private:
+  // Activity ID or start time used when tracing is disabled.
+  constexpr static uint64 kUntracedActivity = 0;
+  // Activity ID used as a placeholder when both start and end are present.
+  constexpr static uint64 kCompleteActivity = 1;
+
+  static uint64 ActivityStartImpl(absl::string_view activity_name);
+  static void ActivityEndImpl(uint64 activity_id);
+
+  // Wrap the name into a union so that we can avoid the cost of string
+  // initialization when tracing is disabled.
+  union NoInit {
+    NoInit() {}
+    ~NoInit() {}
+    string name;
+  } no_init_;
+
+  uint64 start_time_;
+};
+
+}  // namespace profiler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PROFILER_LIB_TRACEME_H_
diff --git a/tensorflow/contrib/tpu/profiler/op_profile.proto b/tensorflow/core/profiler/op_profile.proto
similarity index 98%
rename from tensorflow/contrib/tpu/profiler/op_profile.proto
rename to tensorflow/core/profiler/op_profile.proto
index 292108f949d705762a826d0276a517f1f741fb39..0adca5544a6f579ef64bbf804ff8098e28b37da0 100644
--- a/tensorflow/contrib/tpu/profiler/op_profile.proto
+++ b/tensorflow/core/profiler/op_profile.proto
@@ -1,6 +1,6 @@
 syntax = "proto3";
 
-package tensorflow.tpu.op_profile;
+package tensorflow.profiler.op_profile;
 
 // Profile is the top-level data that summarizes a program.
 message Profile {
diff --git a/tensorflow/core/profiler/profiler.cc b/tensorflow/core/profiler/profiler.cc
index 808e3c853bec0efb9523ee413f3d5272a833358d..cdcb8dddf6d0b3b54fb29156559caed2ba216ca2 100644
--- a/tensorflow/core/profiler/profiler.cc
+++ b/tensorflow/core/profiler/profiler.cc
@@ -114,7 +114,7 @@ int Run(int argc, char** argv) {
       Flag("min_output_bytes", &FLAGS_min_output_bytes, "min_output_bytes"),
       Flag("min_micros", &FLAGS_min_micros, "min micros"),
       Flag("min_accelerator_micros", &FLAGS_min_accelerator_micros,
-           "min acclerator_micros"),
+           "min accelerator_micros"),
       Flag("min_cpu_micros", &FLAGS_min_cpu_micros, "min_cpu_micros"),
       Flag("min_params", &FLAGS_min_params, "min params"),
       Flag("min_float_ops", &FLAGS_min_float_ops, "min float ops"),
diff --git a/tensorflow/contrib/tpu/profiler/tpu_profiler_analysis.proto b/tensorflow/core/profiler/profiler_analysis.proto
similarity index 93%
rename from tensorflow/contrib/tpu/profiler/tpu_profiler_analysis.proto
rename to tensorflow/core/profiler/profiler_analysis.proto
index d3c34bfd490080b86cf3d8b893c550f3a87bbbed..4be75de8bb46a23d26b116f306bad6f107d786ef 100644
--- a/tensorflow/contrib/tpu/profiler/tpu_profiler_analysis.proto
+++ b/tensorflow/core/profiler/profiler_analysis.proto
@@ -1,7 +1,7 @@
 syntax = "proto3";
 package tensorflow;
 
-import "tensorflow/contrib/tpu/profiler/tpu_profiler.proto";
+import "tensorflow/core/profiler/profiler_service.proto";
 
 message NewProfileSessionRequest {
   ProfileRequest request = 1;
@@ -58,10 +58,10 @@ message ProfileSessionDataResponse {
   bytes output = 3;
 }
 ////////////////////////////////////////////////////////////////////////////////
-// TPUProfileAnalysis service provide entry point for profiling TPU and for
+// ProfileAnalysis service provide entry point for profiling TPU and for
 // serving profiled data to Tensorboard through GRPC
 ////////////////////////////////////////////////////////////////////////////////
-service TPUProfileAnalysis {
+service ProfileAnalysis {
   // Starts a profiling session, blocks until it completes.
   // TPUProfileAnalysis service delegate this to TPUProfiler service.
   // Populate the profiled data in repository, then return status to caller.
diff --git a/tensorflow/contrib/tpu/profiler/tpu_profiler.proto b/tensorflow/core/profiler/profiler_service.proto
similarity index 92%
rename from tensorflow/contrib/tpu/profiler/tpu_profiler.proto
rename to tensorflow/core/profiler/profiler_service.proto
index da4a95e0450a9d0c20593ca60b69f3ad467d455d..77702c3c900e5a7391ea09ad93383b4f9c9fb2b2 100644
--- a/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
+++ b/tensorflow/core/profiler/profiler_service.proto
@@ -3,11 +3,11 @@ package tensorflow;
 
 import "tensorflow/core/framework/graph.proto";
 import "tensorflow/core/protobuf/config.proto";
-import "tensorflow/contrib/tpu/profiler/op_profile.proto";
+import "tensorflow/core/profiler/op_profile.proto";
 
-// The TPUProfiler service retrieves performance information about
-// the programs running on connected TPUs over a period of time.
-service TPUProfiler {
+// The ProfilerService service retrieves performance information about
+// the programs running on connected devices over a period of time.
+service ProfilerService {
   // Starts a profiling session, blocks until it completes, and returns data.
   rpc Profile(ProfileRequest) returns (ProfileResponse) {
   }
@@ -81,7 +81,7 @@ message ProfileToolData {
 
 message ProfileResponse {
   reserved 1;  // was uint64 placeholder for returning something meaningful.
-  // Graphs of programs executed on TPUs during the profiling period.
+  // Graphs of programs executed on devices during the profiling period.
   repeated GraphDef computation_graph = 2;
 
   // Performance profile that can be used to annotate HLO operations in the
@@ -96,7 +96,7 @@ message ProfileResponse {
   // Assembles a hierarchical performance profile based on HLOs in trace events.
   // If the trace covers multiple programs, the longest-running one is analyzed.
   // See op_profile.proto for the detailed semantics of the returned profile.
-  tpu.op_profile.Profile op_profile = 4;
+  profiler.op_profile.Profile op_profile = 4;
 
   // Data payload for each required tools.
   repeated ProfileToolData tool_data = 6;
diff --git a/tensorflow/core/profiler/rpc/BUILD b/tensorflow/core/profiler/rpc/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..3e5cdaa4984d4ddfb4d4af8e23ab81c2645814d2
--- /dev/null
+++ b/tensorflow/core/profiler/rpc/BUILD
@@ -0,0 +1,38 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cuda_library")
+
+tf_cuda_library(
+    name = "profiler_service_impl",
+    srcs = ["profiler_service_impl.cc"],
+    hdrs = ["profiler_service_impl.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow:grpc++",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:grpc_services",
+        "//tensorflow/core/common_runtime/eager:context",
+        "//tensorflow/core/profiler:protos_all_cc",
+        "//tensorflow/core/profiler/lib:profiler_session",
+    ],
+    alwayslink = 1,
+)
+
+tf_cuda_library(
+    name = "profiler_server",
+    srcs = ["profiler_server.cc"],
+    hdrs = ["profiler_server.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":profiler_service_impl",
+        "//tensorflow:grpc++",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:grpc_services",
+        "//tensorflow/core/common_runtime/eager:context",
+        "//tensorflow/core/profiler:protos_all_cc",
+        "//tensorflow/core/profiler/lib:profiler_session",
+    ],
+    alwayslink = 1,
+)
diff --git a/tensorflow/core/profiler/rpc/client/BUILD b/tensorflow/core/profiler/rpc/client/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..6036ff97017959e8453060622fa858171f09dbb2
--- /dev/null
+++ b/tensorflow/core/profiler/rpc/client/BUILD
@@ -0,0 +1,62 @@
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cuda_library")
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+
+tf_cuda_library(
+    name = "capture_profile",
+    srcs = [
+        "capture_profile.cc",
+    ],
+    hdrs = [
+        "capture_profile.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":dump_tpu_profile",
+        "//tensorflow:grpc++",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:grpc_services",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_util",
+        "//tensorflow/core/profiler:protos_all_cc",
+    ],
+)
+
+cc_library(
+    name = "dump_tpu_profile",
+    srcs = ["dump_tpu_profile.cc"],
+    hdrs = ["dump_tpu_profile.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":trace_events_to_json",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:grpc_services",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/profiler:protos_all_cc",
+    ],
+)
+
+cc_library(
+    name = "trace_events_to_json",
+    srcs = ["trace_events_to_json.cc"],
+    hdrs = ["trace_events_to_json.h"],
+    deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/core/profiler:protos_all_cc",
+        "@jsoncpp_git//:jsoncpp",
+    ],
+)
+
+tf_cc_test(
+    name = "trace_events_to_json_test",
+    srcs = ["trace_events_to_json_test.cc"],
+    deps = [
+        ":trace_events_to_json",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core/profiler:protos_all_cc",
+        "@jsoncpp_git//:jsoncpp",
+    ],
+)
diff --git a/tensorflow/core/profiler/rpc/client/capture_profile.cc b/tensorflow/core/profiler/rpc/client/capture_profile.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a24e0faa41f4eface697495be741b1eb4757b84f
--- /dev/null
+++ b/tensorflow/core/profiler/rpc/client/capture_profile.cc
@@ -0,0 +1,251 @@
+/* Copyright 2017 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/profiler/rpc/client/capture_profile.h"
+
+#include "grpcpp/grpcpp.h"
+
+#include <cstdio>
+#include <ctime>
+#include <vector>
+
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/grpc_services.h"
+#include "tensorflow/core/profiler/rpc/client/dump_tpu_profile.h"
+
+namespace tensorflow {
+namespace profiler {
+namespace client {
+
+constexpr uint64 kMaxEvents = 1000000;
+
+string GetCurrentTimeStampAsString() {
+  char s[128];
+  std::time_t t = std::time(nullptr);
+  auto result = std::strftime(s, sizeof(s), "%F_%T", std::localtime(&t));
+  DCHECK_NE(result, 0);
+  return s;
+}
+
+Status ValidateHostPortPair(const string& host_port) {
+  uint32 port;
+  std::vector<string> parts = str_util::Split(host_port, ':');
+  // Must be host:port, port must be a number, host must not contain a '/',
+  // host also must not be empty.
+  if (parts.size() != 2 || !strings::safe_strtou32(parts[1], &port) ||
+      parts[0].find("/") != string::npos || parts[0].empty()) {
+    return errors::InvalidArgument("Could not interpret \"", host_port,
+                                   "\" as a host-port pair.");
+  }
+  return Status::OK();
+}
+
+ProfileRequest PopulateProfileRequest(int duration_ms,
+                                      const string& repository_root,
+                                      const string& session_id,
+                                      const ProfileOptions& opts) {
+  ProfileRequest request;
+  request.set_duration_ms(duration_ms);
+  request.set_max_events(kMaxEvents);
+  if (tensorflow::str_util::StartsWith(repository_root, "gs://")) {
+    // For backward compatibilities, only generate tracetable etc when the
+    // user provide a GCS path for model directory.
+    request.set_repository_root(repository_root);
+    request.set_session_id(session_id);
+  }
+  request.add_tools("op_profile");
+  request.add_tools("input_pipeline");
+  request.add_tools("memory_viewer");
+  request.add_tools("overview_page");
+  *request.mutable_opts() = opts;
+  return request;
+}
+
+// Returns whether the returned trace is empty.
+// Failure are handled by CHECK, i.e. abort()
+Status Profile(const string& service_addr, const string& logdir,
+               int duration_ms, const string& repository_root,
+               const string& session_id, const ProfileOptions& opts) {
+  ProfileRequest request =
+      PopulateProfileRequest(duration_ms, repository_root, session_id, opts);
+
+  ::grpc::ClientContext context;
+  ::grpc::ChannelArguments channel_args;
+  // TODO(qiuminxu): use `NewHostPortGrpcChannel` instead once their
+  // `ValidateHostPortPair` checks for empty host string case.
+  channel_args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH,
+                      std::numeric_limits<int32>::max());
+  std::unique_ptr<grpc::ProfilerService::Stub> stub =
+      grpc::ProfilerService::NewStub(::grpc::CreateCustomChannel(
+          "dns:///" + service_addr, ::grpc::InsecureChannelCredentials(),
+          channel_args));
+  ProfileResponse response;
+  TF_RETURN_IF_ERROR(
+      FromGrpcStatus(stub->Profile(&context, request, &response)));
+
+  if (!response.encoded_trace().empty()) {
+    TF_CHECK_OK(WriteTensorboardTPUProfile(logdir, session_id, "", response,
+                                           &std::cout));
+    // Print this at the end so that it's not buried in irrelevant LOG messages.
+    std::cout
+        << "NOTE: using the trace duration " << duration_ms << "ms."
+        << std::endl
+        << "Set an appropriate duration (with --duration_ms) if you "
+           "don't see a full step in your trace or the captured trace is too "
+           "large."
+        << std::endl;
+  }
+
+  if (response.encoded_trace().empty()) {
+    return Status(tensorflow::error::Code::UNAVAILABLE,
+                  "No trace event is collected");
+  }
+  return Status::OK();
+}
+
+// Start a new profiling session that include all the hosts included in
+// hostnames, for the time interval of duration_ms. Possibly save the profiling
+// result in the directory specified by repository_root and session_id.
+Status NewSession(const string& service_addr,
+                  const std::vector<tensorflow::string>& hostnames,
+                  int duration_ms, const string& repository_root,
+                  const string& session_id, const ProfileOptions& opts) {
+  NewProfileSessionRequest new_session_request;
+  *new_session_request.mutable_request() =
+      PopulateProfileRequest(duration_ms, repository_root, session_id, opts);
+  new_session_request.set_repository_root(repository_root);
+  new_session_request.set_session_id(session_id);
+  for (const auto& hostname : hostnames) {
+    new_session_request.add_hosts(hostname);
+  }
+
+  ::grpc::ClientContext context;
+  ::grpc::ChannelArguments channel_args;
+  // TODO(qiuminxu): use `NewHostPortGrpcChannel` instead once their
+  // `ValidateHostPortPair` checks for empty host string case.
+  channel_args.SetMaxReceiveMessageSize(std::numeric_limits<int32>::max());
+  // TODO(jiesun): GRPC support following relevant naming scheme:
+  // 1. dns:///host:port
+  // 2. ipv4:host:port or ipv6:[host]:port
+  // We might need to change the prefix which depends on what TPU name resolver
+  // will give us.
+  std::unique_ptr<grpc::ProfileAnalysis::Stub> stub =
+      grpc::ProfileAnalysis::NewStub(::grpc::CreateCustomChannel(
+          "dns:///" + service_addr, ::grpc::InsecureChannelCredentials(),
+          channel_args));
+  NewProfileSessionResponse new_session_response;
+  TF_RETURN_IF_ERROR(FromGrpcStatus(
+      stub->NewSession(&context, new_session_request, &new_session_response)));
+
+  std::cout << "Profile session succeed for host(s):"
+            << str_util::Join(hostnames, ",") << std::endl;
+  if (new_session_response.empty_trace()) {
+    return Status(tensorflow::error::Code::UNAVAILABLE,
+                  "No trace event is collected");
+  }
+  return Status::OK();
+}
+
+// Starts tracing on a single or multiple TPU hosts and saves the result in the
+// given logdir. If no trace was collected, retries tracing for
+// num_tracing_attempts.
+Status StartTracing(const tensorflow::string& service_addr,
+                    const tensorflow::string& logdir,
+                    const tensorflow::string& workers_list,
+                    bool include_dataset_ops, int duration_ms,
+                    int num_tracing_attempts) {
+  // Use the current timestamp as the run name.
+  tensorflow::string session_id = GetCurrentTimeStampAsString();
+  constexpr char kProfilePluginDirectory[] = "plugins/profile/";
+  tensorflow::string repository_root =
+      io::JoinPath(logdir, kProfilePluginDirectory);
+  std::vector<tensorflow::string> hostnames =
+      tensorflow::str_util::Split(workers_list, ",");
+
+  Status status = Status::OK();
+  int remaining_attempts = num_tracing_attempts;
+  tensorflow::ProfileOptions opts;
+  opts.set_include_dataset_ops(include_dataset_ops);
+  while (true) {
+    std::cout << "Starting to profile TPU traces for " << duration_ms << " ms. "
+              << "Remaining attempt(s): " << remaining_attempts-- << std::endl;
+    if (hostnames.empty()) {
+      status = Profile(service_addr, logdir, duration_ms, repository_root,
+                       session_id, opts);
+    } else {
+      tensorflow::string tpu_master = service_addr;
+      status = NewSession(tpu_master, hostnames, duration_ms, repository_root,
+                          session_id, opts);
+    }
+    if (remaining_attempts <= 0 || status.ok() ||
+        status.code() != tensorflow::error::Code::UNAVAILABLE)
+      break;
+    std::cout << "No trace event is collected. Automatically retrying."
+              << std::endl
+              << std::endl;
+  }
+
+  if (status.code() == tensorflow::error::Code::UNAVAILABLE) {
+    std::cout << "No trace event is collected after " << num_tracing_attempts
+              << " attempt(s). "
+              << "Perhaps, you want to try again (with more attempts?)."
+              << std::endl
+              << "Tip: increase number of attempts with --num_tracing_attempts."
+              << std::endl;
+    return status;
+  }
+  return Status::OK();
+}
+
+MonitorRequest PopulateMonitorRequest(int duration_ms, int monitoring_level) {
+  MonitorRequest request;
+  request.set_duration_ms(duration_ms);
+  request.set_monitoring_level(monitoring_level);
+  return request;
+}
+
+// Repeatedly collects profiles and shows user-friendly metrics for
+// 'num_queries' time(s).
+void StartMonitoring(const tensorflow::string& service_addr, int duration_ms,
+                     int monitoring_level, int num_queries) {
+  for (int query = 0; query < num_queries; ++query) {
+    MonitorRequest request =
+        PopulateMonitorRequest(duration_ms, monitoring_level);
+
+    ::grpc::ClientContext context;
+    ::grpc::ChannelArguments channel_args;
+    channel_args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH,
+                        std::numeric_limits<int32>::max());
+    std::unique_ptr<grpc::ProfilerService::Stub> stub =
+        grpc::ProfilerService::NewStub(::grpc::CreateCustomChannel(
+            "dns:///" + service_addr, ::grpc::InsecureChannelCredentials(),
+            channel_args));
+    MonitorResponse response;
+    TF_QCHECK_OK(FromGrpcStatus(stub->Monitor(&context, request, &response)));
+
+    std::cout << "Cloud TPU Monitoring Results (Sample " << query + 1
+              << "):\n\n"
+              << response.data() << std::flush;
+  }
+}
+
+}  // namespace client
+}  // namespace profiler
+}  // namespace tensorflow
+
diff --git a/tensorflow/core/profiler/rpc/client/capture_profile.h b/tensorflow/core/profiler/rpc/client/capture_profile.h
new file mode 100644
index 0000000000000000000000000000000000000000..988036724791cd171f11a7a3666aca4267286646
--- /dev/null
+++ b/tensorflow/core/profiler/rpc/client/capture_profile.h
@@ -0,0 +1,46 @@
+/* Copyright 2017 The TensorFlow Authors All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// GRPC client to perform on-demand profiling
+
+#ifndef TENSORFLOW_CORE_PROFILER_RPC_CLIENT_CAPTURE_PROFILE_H_
+#define TENSORFLOW_CORE_PROFILER_RPC_CLIENT_CAPTURE_PROFILE_H_
+
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+namespace profiler {
+namespace client {
+
+Status ValidateHostPortPair(const string& host_port);
+
+// Repeatedly collects profiles and shows user-friendly metrics for
+// 'num_queries' time(s).
+void StartMonitoring(const tensorflow::string& service_addr, int duration_ms,
+                     int monitoring_level, int num_queries);
+
+// Starts tracing on a single or multiple hosts and saves the result in the
+// given logdir. If no trace was collected, retries tracing for
+// num_tracing_attempts.
+Status StartTracing(const tensorflow::string& service_addr,
+                    const tensorflow::string& logdir,
+                    const tensorflow::string& workers_list,
+                    bool include_dataset_ops, int duration_ms,
+                    int num_tracing_attempts);
+
+}  // namespace client
+}  // namespace profiler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PROFILER_RPC_CLIENT_CAPTURE_PROFILE_H_
diff --git a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc b/tensorflow/core/profiler/rpc/client/dump_tpu_profile.cc
similarity index 90%
rename from tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
rename to tensorflow/core/profiler/rpc/client/dump_tpu_profile.cc
index ef35e84ba5205fb76e5afe77e670d87197ca8405..ed65c110c9dcc364ba24338822363425e852037d 100644
--- a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
+++ b/tensorflow/core/profiler/rpc/client/dump_tpu_profile.cc
@@ -13,15 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tpu/profiler/dump_tpu_profile.h"
+#include "tensorflow/core/profiler/rpc/client/dump_tpu_profile.h"
 
 #include <cstdio>
 #include <ctime>
 #include <vector>
 
-#include "tensorflow/contrib/tpu/profiler/op_profile.pb.h"
-#include "tensorflow/contrib/tpu/profiler/trace_events.pb.h"
-#include "tensorflow/contrib/tpu/profiler/trace_events_to_json.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/io/compression.h"
 #include "tensorflow/core/lib/io/path.h"
@@ -29,10 +26,18 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/protobuf.h"
+// Windows.h #defines ERROR, but it is also used in
+// tensorflow/core/util/event.proto
+#undef ERROR
+#include "tensorflow/core/profiler/op_profile.pb.h"
+#include "tensorflow/core/profiler/rpc/client/trace_events_to_json.h"
+#include "tensorflow/core/profiler/trace_events.pb.h"
 #include "tensorflow/core/util/events_writer.h"
 
 namespace tensorflow {
-namespace tpu {
+
+namespace profiler {
+namespace client {
 namespace {
 
 using ::tensorflow::io::JoinPath;
@@ -88,7 +93,7 @@ Status DumpTraceToLogDirectory(StringPiece run_dir, const string& host_prefix,
 
 Status DumpOpProfileToLogDirectory(StringPiece run_dir,
                                    const string& host_prefix,
-                                   const tpu::op_profile::Profile& profile,
+                                   const op_profile::Profile& profile,
                                    std::ostream* os) {
   string path = JoinPath(run_dir, StrCat(host_prefix, kJsonOpProfileFileName));
   string json;
@@ -98,7 +103,7 @@ Status DumpOpProfileToLogDirectory(StringPiece run_dir,
   if (!status.ok()) {
     return errors::Internal(
         "Failed to convert op profile to json. Skipping... ",
-        string(status.message()));
+        string(status.error_message()));
   }
   TF_RETURN_IF_ERROR(WriteStringToFile(Env::Default(), path, json));
   if (os) {
@@ -109,7 +114,7 @@ Status DumpOpProfileToLogDirectory(StringPiece run_dir,
 
 Status DumpToolDataToLogDirectory(StringPiece run_dir,
                                   const string& host_prefix,
-                                  const tensorflow::ProfileToolData& tool,
+                                  const ProfileToolData& tool,
                                   std::ostream* os) {
   // Don't save the intermediate results for combining the per host tool data.
   if (EndsWith(tool.name(), kFlatProfilerFileName) ||
@@ -155,5 +160,6 @@ Status WriteTensorboardTPUProfile(const string& logdir, const string& run,
   return Status::OK();
 }
 
-}  // namespace tpu
+}  // namespace client
+}  // namespace profiler
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h b/tensorflow/core/profiler/rpc/client/dump_tpu_profile.h
similarity index 80%
rename from tensorflow/contrib/tpu/profiler/dump_tpu_profile.h
rename to tensorflow/core/profiler/rpc/client/dump_tpu_profile.h
index ecf21b1de2219e8896d5e8b79325a193de0b0fa1..961f4e9498d91a6c0d75b82ad87860963360ddb3 100644
--- a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h
+++ b/tensorflow/core/profiler/rpc/client/dump_tpu_profile.h
@@ -13,14 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TPU_PROFILER_DUMP_TPU_PROFILE_H_
-#define TENSORFLOW_CONTRIB_TPU_PROFILER_DUMP_TPU_PROFILE_H_
+#ifndef TENSORFLOW_CORE_PROFILER_RPC_CLIENT_DUMP_TPU_PROFILE_H_
+#define TENSORFLOW_CORE_PROFILER_RPC_CLIENT_DUMP_TPU_PROFILE_H_
 
-#include "tensorflow/contrib/tpu/profiler/tpu_profiler.grpc.pb.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/grpc_services.h"
 
 namespace tensorflow {
-namespace tpu {
+
+namespace profiler {
+namespace client {
 
 // Dumps all profiling tool data in a TPU profile to a TensorBoard log directory
 // with the given run name. This writes user-facing log messages to `os`.
@@ -36,7 +38,8 @@ Status WriteTensorboardTPUProfile(const string& logdir, const string& run,
                                   const ProfileResponse& response,
                                   std::ostream* os);
 
-}  // namespace tpu
+}  // namespace client
+}  // namespace profiler
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_TPU_PROFILER_DUMP_TPU_PROFILE_H_
+#endif  // TENSORFLOW_CORE_PROFILER_RPC_CLIENT_DUMP_TPU_PROFILE_H_
diff --git a/tensorflow/contrib/tpu/profiler/trace_events_to_json.cc b/tensorflow/core/profiler/rpc/client/trace_events_to_json.cc
similarity index 91%
rename from tensorflow/contrib/tpu/profiler/trace_events_to_json.cc
rename to tensorflow/core/profiler/rpc/client/trace_events_to_json.cc
index 3f7e67dec88918009a2a9856d9c7a182338f748d..6adaec5546052a5d82a54e4ae1ca78eb10a4a103 100644
--- a/tensorflow/contrib/tpu/profiler/trace_events_to_json.cc
+++ b/tensorflow/core/profiler/rpc/client/trace_events_to_json.cc
@@ -13,13 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tpu/profiler/trace_events_to_json.h"
+#include "tensorflow/core/profiler/rpc/client/trace_events_to_json.h"
 #include "include/json/json.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/profiler/trace_events.pb.h"
 
 namespace tensorflow {
-namespace tpu {
+
+namespace profiler {
+namespace client {
 namespace {
 
 using ::tensorflow::strings::Appendf;
@@ -96,10 +99,9 @@ inline void AddTraceEvent(const TraceEvent &event, string *json) {
 
 string TraceEventsToJson(const Trace &trace) {
   string json;
-  Appendf(&json,
-          R"({"displayTimeUnit":"ns","metadata":{"highres-ticks":true},)");
-  Appendf(&json,
-          R"("traceEvents":[)");
+  Appendf(
+      &json, R"({"displayTimeUnit":"ns","metadata":{"highres-ticks":true},)");
+  Appendf(&json, R"("traceEvents":[)");
   // Convert to a std::map so that devices are sorted by the device id.
   std::map<uint32, const Device *> sorted_devices;
   for (const auto &pair : trace.devices()) {
@@ -114,5 +116,6 @@ string TraceEventsToJson(const Trace &trace) {
   return json;
 }
 
-}  // namespace tpu
+}  // namespace client
+}  // namespace profiler
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tpu/profiler/trace_events_to_json.h b/tensorflow/core/profiler/rpc/client/trace_events_to_json.h
similarity index 72%
rename from tensorflow/contrib/tpu/profiler/trace_events_to_json.h
rename to tensorflow/core/profiler/rpc/client/trace_events_to_json.h
index 3bd76dd01c7d0f35bad9386c11811743e1709fca..d54cc3c619e234b82452a54f613a57cabfa7d5d3 100644
--- a/tensorflow/contrib/tpu/profiler/trace_events_to_json.h
+++ b/tensorflow/core/profiler/rpc/client/trace_events_to_json.h
@@ -13,20 +13,23 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TPU_PROFILER_TRACE_EVENTS_TO_JSON_H_
-#define TENSORFLOW_CONTRIB_TPU_PROFILER_TRACE_EVENTS_TO_JSON_H_
+#ifndef TENSORFLOW_CORE_PROFILER_RPC_CLIENT_TRACE_EVENTS_TO_JSON_H_
+#define TENSORFLOW_CORE_PROFILER_RPC_CLIENT_TRACE_EVENTS_TO_JSON_H_
 
-#include "tensorflow/contrib/tpu/profiler/trace_events.pb.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/profiler/trace_events.pb.h"
 
 namespace tensorflow {
-namespace tpu {
+
+namespace profiler {
+namespace client {
 
 // Converts trace events in the trace proto to a JSON string that can be
 // consumed by catapult trace viewer.
 string TraceEventsToJson(const Trace &trace);
 
-}  // namespace tpu
+}  // namespace client
+}  // namespace profiler
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_TPU_PROFILER_TRACE_EVENTS_TO_JSON_H_
+#endif  // TENSORFLOW_CORE_PROFILER_RPC_CLIENT_TRACE_EVENTS_TO_JSON_H_
diff --git a/tensorflow/contrib/tpu/profiler/trace_events_to_json_test.cc b/tensorflow/core/profiler/rpc/client/trace_events_to_json_test.cc
similarity index 93%
rename from tensorflow/contrib/tpu/profiler/trace_events_to_json_test.cc
rename to tensorflow/core/profiler/rpc/client/trace_events_to_json_test.cc
index e97989cc7be961b2a812e46bb07b189bd6cda897..0f883b04dc869218329fd944d45918b0836d1a44 100644
--- a/tensorflow/contrib/tpu/profiler/trace_events_to_json_test.cc
+++ b/tensorflow/core/profiler/rpc/client/trace_events_to_json_test.cc
@@ -13,13 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tpu/profiler/trace_events_to_json.h"
+#include "tensorflow/core/profiler/rpc/client/trace_events_to_json.h"
 #include "include/json/json.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/profiler/trace_events.pb.h"
 
 namespace tensorflow {
-namespace tpu {
+
+namespace profiler {
+namespace client {
 namespace {
 
 string ConvertTextFormattedTraceToJson(const string& trace_str) {
@@ -109,5 +112,6 @@ TEST(TraceEventsToJson, JsonConversion) {
 }
 
 }  // namespace
-}  // namespace tpu
+}  // namespace client
+}  // namespace profiler
 }  // namespace tensorflow
diff --git a/tensorflow/core/profiler/rpc/profiler_server.cc b/tensorflow/core/profiler/rpc/profiler_server.cc
new file mode 100644
index 0000000000000000000000000000000000000000..257e4e0bf5fa320c499a40065021b0030564bc45
--- /dev/null
+++ b/tensorflow/core/profiler/rpc/profiler_server.cc
@@ -0,0 +1,48 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/profiler/rpc/profiler_server.h"
+#include <memory>
+#include <utility>
+#include "grpcpp/grpcpp.h"
+#include "tensorflow/core/platform/grpc_services.h"
+#include "tensorflow/core/profiler/rpc/profiler_service_impl.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+
+std::unique_ptr<Thread> StartProfilerServer(
+    ProfilerContext* const profiler_context, int32 port) {
+  Env* env = profiler_context->eager_context != nullptr
+                 ? profiler_context->eager_context->TFEnv()
+                 : Env::Default();
+  // Starting the server in the child thread may be delay and user may already
+  // delete the profiler context at that point. So we need to make a copy.
+  ProfilerContext ctx = *profiler_context;
+  return WrapUnique(env->StartThread({}, "profiler server", [ctx, port]() {
+    string server_address = strings::StrCat("0.0.0.0:", port);
+    std::unique_ptr<grpc::ProfilerService::Service> service =
+        CreateProfilerService(ctx);
+    ::grpc::ServerBuilder builder;
+    builder.AddListeningPort(server_address,
+                             ::grpc::InsecureServerCredentials());
+    builder.RegisterService(service.get());
+    std::unique_ptr<::grpc::Server> server(builder.BuildAndStart());
+    LOG(INFO) << "Profiling Server listening on " << server_address;
+    server->Wait();
+  }));
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/profiler/rpc/profiler_server.h b/tensorflow/core/profiler/rpc/profiler_server.h
new file mode 100644
index 0000000000000000000000000000000000000000..4e8c715ac753d57add26de28a2524d4f737567ec
--- /dev/null
+++ b/tensorflow/core/profiler/rpc/profiler_server.h
@@ -0,0 +1,25 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_PROFILER_RPC_PROFILER_SERVER_H_
+#define TENSORFLOW_CORE_PROFILER_RPC_PROFILER_SERVER_H_
+
+#include "tensorflow/core/profiler/lib/profiler_session.h"
+
+namespace tensorflow {
+
+std::unique_ptr<Thread> StartProfilerServer(
+    ProfilerContext* const profiler_context, int32 port);
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_PROFILER_RPC_PROFILER_SERVER_H_
diff --git a/tensorflow/core/profiler/rpc/profiler_service_impl.cc b/tensorflow/core/profiler/rpc/profiler_service_impl.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f25ee66833604882309679615e02bf4b6125d9ed
--- /dev/null
+++ b/tensorflow/core/profiler/rpc/profiler_service_impl.cc
@@ -0,0 +1,75 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/profiler/rpc/profiler_service_impl.h"
+#include "grpcpp/support/status.h"
+#include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/platform/grpc_services.h"
+#include "tensorflow/core/profiler/lib/profiler_session.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+namespace {
+
+class ProfilerServiceImpl : public grpc::ProfilerService::Service {
+ public:
+  explicit ProfilerServiceImpl(const ProfilerContext& profiler_context)
+      : profiler_context_(profiler_context) {}
+  ~ProfilerServiceImpl() override {}
+
+  ::grpc::Status Monitor(::grpc::ServerContext* ctx, const MonitorRequest* req,
+                         MonitorResponse* response) override {
+    return ::grpc::Status(::grpc::StatusCode::UNIMPLEMENTED, "unimplemented.");
+  }
+
+  ::grpc::Status Profile(::grpc::ServerContext* ctx, const ProfileRequest* req,
+                         ProfileResponse* response) override {
+    LOG(INFO) << "Received a profile request.";
+    std::unique_ptr<ProfilerSession> profiler =
+        ProfilerSession::Create(&profiler_context_);
+    if (!profiler->Status().ok()) {
+      return ::grpc::Status(::grpc::StatusCode::INTERNAL,
+                            profiler->Status().error_message());
+    }
+
+    Env* env = profiler_context_.eager_context != nullptr
+                   ? profiler_context_.eager_context->TFEnv()
+                   : Env::Default();
+    for (size_t i = 0; i < req->duration_ms(); ++i) {
+      env->SleepForMicroseconds(1000);
+      if (ctx->IsCancelled()) {
+        return ::grpc::Status::CANCELLED;
+      }
+    }
+
+    Status s = profiler->SerializeToString(response->mutable_encoded_trace());
+    if (!s.ok()) {
+      return ::grpc::Status(::grpc::StatusCode::INTERNAL, s.error_message());
+    }
+
+    return ::grpc::Status::OK;
+  }
+
+ private:
+  ProfilerContext profiler_context_;
+};
+}  // namespace
+
+std::unique_ptr<grpc::ProfilerService::Service> CreateProfilerService(
+    const ProfilerContext& profiler_context) {
+  return MakeUnique<ProfilerServiceImpl>(profiler_context);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/profiler/rpc/profiler_service_impl.h b/tensorflow/core/profiler/rpc/profiler_service_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..64ae01d58377c751945e05417528118026b1614e
--- /dev/null
+++ b/tensorflow/core/profiler/rpc/profiler_service_impl.h
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_PROFILER_RPC_PROFILER_SERVICE_IMPL_H_
+#define TENSORFLOW_CORE_PROFILER_RPC_PROFILER_SERVICE_IMPL_H_
+
+#include "grpcpp/grpcpp.h"
+#include "grpcpp/server_context.h"
+#include "grpcpp/support/status.h"
+#include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/platform/grpc_services.h"
+#include "tensorflow/core/profiler/lib/profiler_session.h"
+
+namespace tensorflow {
+
+std::unique_ptr<grpc::ProfilerService::Service> CreateProfilerService(
+    const ProfilerContext& profiler_context);
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PROFILER_RPC_PROFILER_SERVICE_IMPL_H_
diff --git a/tensorflow/core/profiler/tfprof_options.cc b/tensorflow/core/profiler/tfprof_options.cc
index 9e5ef0a0a31600e12e76cb8f5f3e5a1c6f62a3d5..faca22c425b91553f67e7ffdfda14a044295b17f 100644
--- a/tensorflow/core/profiler/tfprof_options.cc
+++ b/tensorflow/core/profiler/tfprof_options.cc
@@ -96,7 +96,7 @@ tensorflow::Status ParseOutput(const string& output_opt, string* output_type,
   for (const string& kv_str : kv_split) {
     const std::vector<string> kv =
         str_util::Split(kv_str, "=", str_util::SkipEmpty());
-    if (kv.size() != 2) {
+    if (kv.size() < 2) {
       return tensorflow::Status(
           tensorflow::error::INVALID_ARGUMENT,
           "Visualize format: -output timeline:key=value,key=value,...");
@@ -107,7 +107,8 @@ tensorflow::Status ParseOutput(const string& output_opt, string* output_type,
           strings::Printf("Unrecognized options %s for output_type: %s\n",
                           kv[0].c_str(), output_type->c_str()));
     }
-    (*output_options)[kv[0]] = kv[1];
+    const std::vector<string> kv_without_key(kv.begin() + 1, kv.end());
+    (*output_options)[kv[0]] = str_util::Join(kv_without_key, "=");
   }
 
   for (const string& opt : required_options) {
diff --git a/tensorflow/contrib/tpu/profiler/trace_events.proto b/tensorflow/core/profiler/trace_events.proto
similarity index 93%
rename from tensorflow/contrib/tpu/profiler/trace_events.proto
rename to tensorflow/core/profiler/trace_events.proto
index cb2b9162677a0ebe8240a98671b1cabc1cee0c9f..69ec88ca9a798a0faf1864ce9cf5c3f8bb7df0ca 100644
--- a/tensorflow/contrib/tpu/profiler/trace_events.proto
+++ b/tensorflow/core/profiler/trace_events.proto
@@ -1,6 +1,6 @@
 syntax = "proto3";
 
-package tensorflow.tpu;
+package tensorflow.profiler;
 
 // A 'Trace' contains metadata for the individual traces of a system.
 message Trace {
@@ -56,4 +56,7 @@ message TraceEvent {
   // The duration of the event in picoseconds if applicable.
   // Events without duration are called instant events.
   uint64 duration_ps = 10;
+
+  // Extra arguments that will be displayed in trace view.
+  map<string, string> args = 11;
 }
diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto
index b3dc5dccc02737202f9f5ced78471f332efd2eba..3e24235369a6bd06d3c8cf0df66e1ee3ead2b9b2 100644
--- a/tensorflow/core/protobuf/config.proto
+++ b/tensorflow/core/protobuf/config.proto
@@ -156,6 +156,16 @@ message GPUOptions {
     // CollectiveReduce, and serves as an override to automatic ring order
     // generation in OrderTaskDeviceMap() during CollectiveParam resolution.
     string collective_ring_order = 4;
+
+    // If true then extra work is done by GPUDevice and GPUBFCAllocator to
+    // keep track of when GPU memory is freed and when kernels actually
+    // complete so that we can know when a nominally free memory chunk
+    // is really not subject to pending use.
+    bool timestamped_allocator = 5;
+
+    // If > 0 limit the number of pending kernels on any compute
+    // stream to this number.
+    int32 pending_cap = 6;
   }
 
   // Everything inside experimental is subject to change and is not subject
@@ -425,6 +435,14 @@ message ConfigProto {
     // use NUMA affinity where applicable.  One consequence will be the
     // existence of as many CPU devices as there are available NUMA nodes.
     bool use_numa_affinity = 5;
+
+    // If true, make collective op execution order sequential and deterministic
+    // for potentially concurrent collective instances.
+    bool collective_deterministic_sequential_execution = 6;
+
+    // If true, use NCCL for CollectiveOps.  This feature is highly
+    // experimental.
+    bool collective_nccl = 7;
   };
 
   Experimental experimental = 16;
@@ -502,6 +520,25 @@ message RunMetadata {
 
   // Graphs of the partitions executed by executors.
   repeated GraphDef partition_graphs = 3;
+
+  message FunctionGraphs {
+    // TODO(nareshmodi): Include some sort of function/cache-key identifier?
+    repeated GraphDef partition_graphs = 1;
+
+    GraphDef pre_optimization_graph = 2;
+    GraphDef post_optimization_graph = 3;
+  }
+  // This is only populated for graphs that are run as functions in TensorFlow
+  // V2. There will be an entry below for each function that is traced.
+  // The main use cases of the post_optimization_graph and the partition_graphs
+  // is to give the caller insight into the graphs that were actually run by the
+  // runtime. Additional information (such as those in step_stats) will match
+  // these graphs.
+  // We also include the pre_optimization_graph since it is usually easier to
+  // read, and is helpful in situations where the caller wants to get a high
+  // level idea of what the built graph looks like (since the various graph
+  // optimization passes might change the structure of the graph significantly).
+  repeated FunctionGraphs function_graphs = 4;
 }
 
 // Defines a connection between two tensors in a `GraphDef`.
diff --git a/tensorflow/core/protobuf/graph_debug_info.proto b/tensorflow/core/protobuf/graph_debug_info.proto
new file mode 100644
index 0000000000000000000000000000000000000000..a123d3cf496a8d653abe652468ce22dda47bc18f
--- /dev/null
+++ b/tensorflow/core/protobuf/graph_debug_info.proto
@@ -0,0 +1,41 @@
+syntax = "proto3";
+
+package tensorflow;
+option cc_enable_arenas = true;
+option java_outer_classname = "GraphDebugInfoProtos";
+option java_multiple_files = true;
+option java_package = "org.tensorflow.framework";
+
+message GraphDebugInfo {
+  // This represents a file/line location in the source code.
+  message FileLineCol {
+    // File name index, which can be used to retrive the file name string from
+    // `files`. The value should be between 0 and (len(files)-1)
+    int32 file_index = 1;
+
+    // Line number in the file.
+    int32 line = 2;
+
+    // Col number in the file line.
+    int32 col = 3;
+
+    // Name of function contains the file line.
+    string func = 4;
+
+    // Source code contained in this file line.
+    string code = 5;
+  }
+
+  // This represents a stack trace which is a ordered list of `FileLineCol`.
+  message StackTrace {
+    // Each line in the stack trace.
+    repeated FileLineCol file_line_cols = 1;
+  }
+
+  // This stores all the source code file names and can be indexed by the
+  // `file_index`.
+  repeated string files = 1;
+
+  // This maps a node name to a stack trace in the source code.
+  map<string, StackTrace> traces = 2;
+}
diff --git a/tensorflow/core/protobuf/master.proto b/tensorflow/core/protobuf/master.proto
index c104463c51c7e7be02430c7750ebacee60ed50e4..4a998c5bfcd29a23df01aca6feca827afebd3258 100644
--- a/tensorflow/core/protobuf/master.proto
+++ b/tensorflow/core/protobuf/master.proto
@@ -16,11 +16,13 @@ limitations under the License.
 syntax = "proto3";
 
 package tensorflow;
+
 option cc_enable_arenas = true;
 option java_outer_classname = "DistributedRuntimeProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.distruntime";
-option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
+
+// add go_package externally with copybara
 import "tensorflow/core/framework/device_attributes.proto";
 import "tensorflow/core/framework/graph.proto";
 import "tensorflow/core/framework/tensor.proto";
@@ -138,6 +140,11 @@ message RunStepRequest {
   // response body. This is a workaround since the RPC subsystem may
   // truncate long metadata messages.
   bool store_errors_in_response_body = 7;
+
+  // Unique identifier for this request. Every RunStepRequest must
+  // have a unique request_id, and retried RunStepRequest must have
+  // the same request_id. If request_id is zero, retry detection is disabled.
+  int64 request_id = 8;
 }
 
 message RunStepResponse {
@@ -183,6 +190,11 @@ message PartialRunSetupRequest {
   // Target Nodes. A list of node names. The named nodes will be run in future
   // steps, but their outputs will not be fetched.
   repeated string target = 4;
+
+  // Unique identifier for this request. Every PartialRunSetupRequest must
+  // have a unique request_id, and retried PartialRunSetupRequest must have
+  // the same request_id. If request_id is zero, retry detection is disabled.
+  int64 request_id = 5;
 }
 
 message PartialRunSetupResponse {
@@ -204,8 +216,7 @@ message CloseSessionRequest {
   string session_handle = 1;
 }
 
-message CloseSessionResponse {
-}
+message CloseSessionResponse {}
 
 // Reset() allows misbehaving or slow sessions to be aborted and closed, and
 // causes their resources eventually to be released.  Reset() does not wait
@@ -237,8 +248,7 @@ message ResetRequest {
   repeated string device_filters = 2;
 }
 
-message ResetResponse {
-}
+message ResetResponse {}
 
 ////////////////////////////////////////////////////////////////////////////////
 //
@@ -279,6 +289,11 @@ message MakeCallableRequest {
 
   // Options that define the behavior of the created callable.
   CallableOptions options = 2;
+
+  // Unique identifier for this request. Every MakeCallableRequest must
+  // have a unique request_id, and retried MakeCallableRequest must have
+  // the same request_id. If request_id is zero, retry detection is disabled.
+  int64 request_id = 3;
 }
 
 message MakeCallableResponse {
@@ -303,6 +318,11 @@ message RunCallableRequest {
   // Values of the tensors passed as arguments to the callable, in the order
   // defined in the CallableOptions.feed field passed to MakeCallable.
   repeated TensorProto feed = 3;
+
+  // Unique identifier for this request. Every RunCallableRequest must
+  // have a unique request_id, and retried RunCallableRequest must have
+  // the same request_id. If request_id is zero, retry detection is disabled.
+  int64 request_id = 4;
 }
 
 message RunCallableResponse {
@@ -330,5 +350,4 @@ message ReleaseCallableRequest {
   int64 handle = 2;
 }
 
-message ReleaseCallableResponse {
-}
+message ReleaseCallableResponse {}
diff --git a/tensorflow/core/protobuf/meta_graph.proto b/tensorflow/core/protobuf/meta_graph.proto
index 75a2a88ed72cd909f607286b574b0c343c6268f6..fa0192cf67c500994e5dd976c414c248b3a321a2 100644
--- a/tensorflow/core/protobuf/meta_graph.proto
+++ b/tensorflow/core/protobuf/meta_graph.proto
@@ -12,6 +12,7 @@ import "tensorflow/core/framework/graph.proto";
 import "tensorflow/core/framework/op_def.proto";
 import "tensorflow/core/framework/tensor_shape.proto";
 import "tensorflow/core/framework/types.proto";
+import "tensorflow/core/protobuf/saved_object_graph.proto";
 import "tensorflow/core/protobuf/saver.proto";
 
 // NOTE: This protocol buffer is evolving, and will go through revisions in the
@@ -84,6 +85,9 @@ message MetaGraphDef {
 
   // Asset file def to be used with the defined graph.
   repeated AssetFileDef asset_file_def = 6;
+
+  // Extra information about the structure of functions and stateful objects.
+  SavedObjectGraph object_graph_def = 7;
 }
 
 // CollectionDef should cover most collections.
diff --git a/tensorflow/core/protobuf/rewriter_config.proto b/tensorflow/core/protobuf/rewriter_config.proto
index 515d673828e3792ac6f4268fd55b58e43aab509b..7a62c6ee1f3373a6d300c2677a8cddc3e1ed01b3 100644
--- a/tensorflow/core/protobuf/rewriter_config.proto
+++ b/tensorflow/core/protobuf/rewriter_config.proto
@@ -5,9 +5,10 @@ option cc_enable_arenas = true;
 option java_outer_classname = "RewriterConfigProtos";
 option java_multiple_files = true;
 option java_package = "org.tensorflow.framework";
-option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
+// add go_package externally with copybara
 
 import "tensorflow/core/framework/attr_value.proto";
+import "tensorflow/core/protobuf/verifier_config.proto";
 
 message AutoParallelOptions {
   bool enable = 1;
@@ -75,8 +76,11 @@ message RewriterConfig {
   // Try to allocate some independent Op outputs contiguously in order to
   // merge or eliminate downstream Ops (off by default).
   Toggle scoped_allocator_optimization = 15;
-  // Force small ops onto the CPU (default is OFF).
+  // Force small ops onto the CPU (default is ON).
   Toggle pin_to_host_optimization = 18;
+  // Enable the swap of kernel implementations based on the device placement
+  // (default is ON).
+  Toggle implementation_selector = 22;
   // Disable the entire meta optimizer (off by default).
   bool disable_meta_optimizer = 19;
 
@@ -166,4 +170,11 @@ message RewriterConfig {
 
   // list of CustomGraphOptimizers to apply.
   repeated CustomGraphOptimizer custom_optimizers = 200;
+
+  // VerifierConfig specifying the verifiers to be run after every optimizer.
+  VerifierConfig inter_optimizer_verifier_config = 300;
+
+  // VerifierConfig specifying the verifiers to be run at the end, after all
+  // optimizers have run.
+  VerifierConfig post_optimization_verifier_config = 301;
 }
diff --git a/tensorflow/core/protobuf/saved_object_graph.proto b/tensorflow/core/protobuf/saved_object_graph.proto
new file mode 100644
index 0000000000000000000000000000000000000000..48060b33dc42ca74c0464dbce20b0e88ac4a30e6
--- /dev/null
+++ b/tensorflow/core/protobuf/saved_object_graph.proto
@@ -0,0 +1,158 @@
+syntax = "proto3";
+
+import "tensorflow/core/protobuf/trackable_object_graph.proto";
+import "tensorflow/core/protobuf/struct.proto";
+import "tensorflow/core/framework/tensor_shape.proto";
+import "tensorflow/core/framework/types.proto";
+import "tensorflow/core/framework/versions.proto";
+
+option cc_enable_arenas = true;
+
+package tensorflow;
+
+// A SavedObjectGraph is part of object-based SavedModels in TF 2.0. It
+// describes the directed graph of Python objects (or equivalent in other
+// languages) that make up a model, with nodes[0] at the root.
+
+// SavedObjectGraph shares some structure with TrackableObjectGraph, but
+// SavedObjectGraph belongs to the MetaGraph and contains pointers to functions
+// and type information, while TrackableObjectGraph lives in the checkpoint
+// and contains pointers only to variable values.
+
+message SavedObjectGraph {
+  // Flattened list of objects in the object graph.
+  //
+  // The position of the object in this list indicates its id.
+  // Nodes[0] is considered the root node.
+  repeated SavedObject nodes = 1;
+
+  // Information about captures and output structures in concrete functions.
+  // Referenced from SavedBareConcreteFunction and SavedFunction.
+  map<string, SavedConcreteFunction> concrete_functions = 2;
+}
+
+message SavedObject {
+  // Objects which this object depends on: named edges in the dependency
+  // graph.
+  //
+  // Note: currently only valid if kind == "user_object".
+  repeated TrackableObjectGraph.TrackableObject.ObjectReference
+      children = 1;
+
+  // Removed when forking SavedObject from TrackableObjectGraph.
+  reserved "attributes";
+  reserved 2;
+
+  // Slot variables owned by this object. This describes the three-way
+  // (optimizer, variable, slot variable) relationship; none of the three
+  // depend on the others directly.
+  //
+  // Note: currently only valid if kind == "user_object".
+  repeated TrackableObjectGraph.TrackableObject.SlotVariableReference
+      slot_variables = 3;
+
+  oneof kind {
+    SavedUserObject user_object = 4;
+    SavedAsset asset = 5;
+    SavedFunction function = 6;
+    SavedVariable variable = 7;
+    SavedBareConcreteFunction bare_concrete_function = 8;
+    SavedConstant constant = 9;
+    SavedResource resource = 10;
+  }
+}
+
+// A SavedUserObject is an object (in the object-oriented language of the
+// TensorFlow program) of some user- or framework-defined class other than
+// those handled specifically by the other kinds of SavedObjects.
+//
+// This object cannot be evaluated as a tensor, and therefore cannot be bound
+// to an input of a function.
+message SavedUserObject {
+  // Corresponds to a registration of the type to use in the loading program.
+  string identifier = 1;
+  // Version information from the producer of this SavedUserObject.
+  VersionDef version = 2;
+}
+
+// A SavedAsset points to an asset in the MetaGraph.
+//
+// When bound to a function this object evaluates to a tensor with the absolute
+// filename. Users should not depend on a particular part of the filename to
+// remain stable (e.g. basename could be changed).
+message SavedAsset {
+  // Index into `MetaGraphDef.asset_file_def[]` that describes the Asset.
+  //
+  // Only the field `AssetFileDef.filename` is used. Other fields, such as
+  // `AssetFileDef.tensor_info`, MUST be ignored.
+  int32 asset_file_def_index = 1;
+}
+
+// A function with multiple signatures, possibly with non-Tensor arguments.
+message SavedFunction {
+  repeated string concrete_functions = 1;
+  FunctionSpec function_spec = 2;
+}
+
+// Stores low-level information about a concrete function. Referenced in either
+// a SavedFunction or a SavedBareConcreteFunction.
+message SavedConcreteFunction {
+  // Bound inputs to the function. The SavedObjects identified by the node ids
+  // given here are appended as extra inputs to the caller-supplied inputs.
+  // The only types of SavedObjects valid here are SavedVariable, SavedResource
+  // and SavedAsset.
+  repeated int32 bound_inputs = 2;
+  // Input in canonicalized form that was received to create this concrete
+  // function.
+  StructuredValue canonicalized_input_signature = 3;
+  // Output that was the return value of this function after replacing all
+  // Tensors with TensorSpecs. This can be an arbitrary nested function and will
+  // be used to reconstruct the full structure from pure tensors.
+  StructuredValue output_signature = 4;
+}
+
+message SavedBareConcreteFunction {
+  // Identifies a SavedConcreteFunction.
+  string concrete_function_name = 1;
+
+  // A sequence of unique strings, one per Tensor argument.
+  repeated string argument_keywords = 2;
+  // The prefix of `argument_keywords` which may be identified by position.
+  int64 allowed_positional_arguments = 3;
+}
+
+message SavedConstant {
+  // An Operation name for a ConstantOp in this SavedObjectGraph's MetaGraph.
+  string operation = 1;
+}
+
+// Represents a Variable that is initialized by loading the contents from the
+// checkpoint.
+message SavedVariable {
+  DataType dtype = 1;
+  TensorShapeProto shape = 2;
+  bool trainable = 3;
+}
+
+// Represents `FunctionSpec` used in `Function`. This represents a
+// function that has been wrapped as a TensorFlow `Function`.
+message FunctionSpec {
+  // Full arg spec from inspect.getfullargspec().
+  StructuredValue fullargspec = 1;
+  // Whether this represents a class method.
+  bool is_method = 2;
+  // Which arguments to always prepend, in case the original function is based
+  // on a functools.partial.
+  StructuredValue args_to_prepend = 3;
+  // Which kwargs to always include, in case the original function is based on a
+  // functools.partial.
+  StructuredValue kwargs_to_include = 4;
+  // The input signature, if specified.
+  StructuredValue input_signature = 5;
+}
+
+// A SavedResource represents a TF object that holds state during its lifetime.
+message SavedResource {
+  // An object of this type can have a reference to a:
+  // create_resource() and an initialize() function.
+}
diff --git a/tensorflow/core/protobuf/struct.proto b/tensorflow/core/protobuf/struct.proto
new file mode 100644
index 0000000000000000000000000000000000000000..55b9b520a89a41b066fa2958a4aedf5914dc247a
--- /dev/null
+++ b/tensorflow/core/protobuf/struct.proto
@@ -0,0 +1,107 @@
+syntax = "proto3";
+
+import "tensorflow/core/framework/tensor_shape.proto";
+import "tensorflow/core/framework/types.proto";
+
+package tensorflow;
+
+// `StructuredValue` represents a dynamically typed value representing various
+// data structures that are inspired by Python data structures typically used in
+// TensorFlow functions as inputs and outputs.
+//
+// For example when saving a Layer there may be a `training` argument. If the
+// user passes a boolean True/False, that switches between two concrete
+// TensorFlow functions. In order to switch between them in the same way after
+// loading the SavedModel, we need to represent "True" and "False".
+//
+// A more advanced example might be a function which takes a list of
+// dictionaries mapping from strings to Tensors. In order to map from
+// user-specified arguments `[{"a": tf.constant(1.)}, {"q": tf.constant(3.)}]`
+// after load to the right saved TensorFlow function, we need to represent the
+// nested structure and the strings, recording that we have a trace for anything
+// matching `[{"a": tf.TensorSpec(None, tf.float32)}, {"q": tf.TensorSpec([],
+// tf.float64)}]` as an example.
+//
+// Likewise functions may return nested structures of Tensors, for example
+// returning a dictionary mapping from strings to Tensors. In order for the
+// loaded function to return the same structure we need to serialize it.
+//
+// This is an ergonomic aid for working with loaded SavedModels, not a promise
+// to serialize all possible function signatures. For example we do not expect
+// to pickle generic Python objects, and ideally we'd stay language-agnostic.
+message StructuredValue {
+  // The kind of value.
+  oneof kind {
+    // Represents None.
+    NoneValue none_value = 1;
+
+    // Represents a double-precision floating-point value (a Python `float`).
+    double float64_value = 11;
+    // Represents a signed integer value, limited to 64 bits.
+    // Larger values from Python's arbitrary-precision integers are unsupported.
+    sint64 int64_value = 12;
+    // Represents a string of Unicode characters stored in a Python `str`.
+    // In Python 3, this is exactly what type `str` is.
+    // In Python 2, this is the UTF-8 encoding of the characters.
+    // For strings with ASCII characters only (as often used in TensorFlow code)
+    // there is effectively no difference between the language versions.
+    // The obsolescent `unicode` type of Python 2 is not supported here.
+    string string_value = 13;
+    // Represents a boolean value.
+    bool bool_value = 14;
+
+    // Represents a TensorShape.
+    tensorflow.TensorShapeProto tensor_shape_value = 31;
+    // Represents an enum value for dtype.
+    tensorflow.DataType tensor_dtype_value = 32;
+    // Represents a value for tf.TensorSpec.
+    TensorSpecProto tensor_spec_value = 33;
+
+    // Represents a list of `Value`.
+    ListValue list_value = 51;
+    // Represents a tuple of `Value`.
+    TupleValue tuple_value = 52;
+    // Represents a dict `Value`.
+    DictValue dict_value = 53;
+    // Represents Python's namedtuple.
+    NamedTupleValue named_tuple_value = 54;
+  }
+}
+
+// Represents None.
+message NoneValue {}
+
+// Represents a Python list.
+message ListValue {
+  repeated StructuredValue values = 1;
+}
+
+// Represents a Python tuple.
+message TupleValue {
+  repeated StructuredValue values = 1;
+}
+
+// Represents a Python dict keyed by `str`.
+// The comment on Unicode from Value.string_value applies analogously.
+message DictValue {
+  map<string, StructuredValue> fields = 1;
+}
+
+// Represents a (key, value) pair.
+message PairValue {
+  string key = 1;
+  StructuredValue value = 2;
+}
+
+// Represents Python's namedtuple.
+message NamedTupleValue {
+  string name = 1;
+  repeated PairValue values = 2;
+}
+
+// A protobuf to tf.TensorSpec.
+message TensorSpecProto {
+  string name = 1;
+  tensorflow.TensorShapeProto shape = 2;
+  tensorflow.DataType dtype = 3;
+};
diff --git a/tensorflow/contrib/tpu/proto/BUILD b/tensorflow/core/protobuf/tpu/BUILD
similarity index 88%
rename from tensorflow/contrib/tpu/proto/BUILD
rename to tensorflow/core/protobuf/tpu/BUILD
index c20cab844cfaf083be2702a29ac2a152c7b72c2a..ea98ee25c89e1b7bef39276bae5c98bf382dbd7f 100644
--- a/tensorflow/contrib/tpu/proto/BUILD
+++ b/tensorflow/core/protobuf/tpu/BUILD
@@ -49,6 +49,15 @@ tf_proto_library(
     visibility = ["//visibility:public"],
 )
 
+tf_proto_library(
+    name = "dynamic_padding_proto",
+    srcs = [
+        "dynamic_padding.proto",
+    ],
+    cc_api_version = 2,
+    visibility = ["//visibility:public"],
+)
+
 tf_proto_library_py(
     name = "compilation_result_proto",
     srcs = [
diff --git a/tensorflow/contrib/tpu/proto/compilation_result.proto b/tensorflow/core/protobuf/tpu/compilation_result.proto
similarity index 100%
rename from tensorflow/contrib/tpu/proto/compilation_result.proto
rename to tensorflow/core/protobuf/tpu/compilation_result.proto
diff --git a/tensorflow/core/protobuf/tpu/dynamic_padding.proto b/tensorflow/core/protobuf/tpu/dynamic_padding.proto
new file mode 100644
index 0000000000000000000000000000000000000000..c9ebf181169a583d774ef77ca0b8c243ce733615
--- /dev/null
+++ b/tensorflow/core/protobuf/tpu/dynamic_padding.proto
@@ -0,0 +1,19 @@
+syntax = "proto3";
+
+option cc_enable_arenas = true;
+
+package tensorflow.tpu;
+
+// A mapping between the dynamic shape dimension of an input and the arg that
+// represents the real shape.
+message PaddingMap {
+  // Input arg index with dynamic shapes.
+  int32 arg_index = 1;
+
+  // The dynamic shape dimension index.
+  int32 shape_index = 2;
+
+  // The arg index that dynamic dimension maps to, which represents the value
+  // of the real shape.
+  int32 padding_arg_index = 3;
+}
diff --git a/tensorflow/contrib/tpu/proto/optimization_parameters.proto b/tensorflow/core/protobuf/tpu/optimization_parameters.proto
similarity index 64%
rename from tensorflow/contrib/tpu/proto/optimization_parameters.proto
rename to tensorflow/core/protobuf/tpu/optimization_parameters.proto
index aae1ab1d37a166303883e3a07a7a01efe2feab51..752f0dbc9cf1d026a11191efb2e9bf6a2686f424 100644
--- a/tensorflow/contrib/tpu/proto/optimization_parameters.proto
+++ b/tensorflow/core/protobuf/tpu/optimization_parameters.proto
@@ -9,9 +9,38 @@ message ClippingLimits {
   google.protobuf.FloatValue upper = 2;  // +inf if not set
 }
 
-// Get the learning rate from the parameters of the SendTPUEmbeddingGradients
-// op.
+// Dynamic learning rate specification in the TPUEmbeddingConfiguration. The
+// actual learning rates are provided as a scalar input list to the
+// SendTPUEmbeddingGradients Op indexed by their tag specified through the
+// following proto.
 message DynamicLearningRate {
+  // For tables where learning rates are dynamically computed and communicated
+  // to the TPU embedding program, a tag must be specified for the learning
+  // rate.
+  //
+  // The tag must be a non-negative  integer. The total number of unique tags
+  // must be less than or equal to the number of tables in the TPU embedding
+  // configuration (a table does not specify any tag if it uses a constant
+  // learning rate, and specifies exactly one tag if it uses dynamic learning
+  // rates).
+  //
+  // All tags in the range [0, number_of_unique_tags) must be present in the TPU
+  // embedding configuration, i.e. a tag cannot be skipped if a different tag
+  // numerically greater than it is used in the configuration.
+  //
+  // If multiple tables specify the same tag, they *MUST* have
+  // the same dynamic learning rate, for example, their dynamic learning rate
+  // could be computed by the same TensorFlow sub-graph. The partitioning of the
+  // embedding layer would be more optimal if the number_of_unique_tags is as
+  // *LOW* as possible, i.e., if many tables share the same tag.
+  //
+  // The learning_rate input of the SendTPUEmbeddingGradients op is used to
+  // communicate dynamic learning rates to the TPU embedding program.
+  // The learning_rate input is a list of scalars where the size of the list is
+  // equal to the number of unique tags. The learning rate associated with a
+  // particular tag is specified by populating its corresponding index in the
+  // list of learning_rate scalars.
+  int32 tag = 1;
 }
 
 // Source of learning rate to use.
@@ -60,11 +89,11 @@ message FtrlParameters {
 // the normal version of Adam that updates all parameters in the embedding
 // table, even for entries that are not used in the current minibatch
 // (https://www.tensorflow.org/api_docs/python/tf/contrib/opt/AdamOptimizer). If
-// use_non_lazy_adam is enabled, use_gradient_accumulation is also required in
-// order to get correct results; a warning will be printed otherwise (which may
-// change to an error in the future). If use_sum_inside_sqrt is set, the Adam
-// variable update formula will be changed from m / (sqrt(v) + epsilon) to
-// m / sqrt(v + epsilon**2); this option improves the performance of TPU
+// use_non_lazy_adam is enabled, gradient accumulation is also required to be
+// enabled in order to get correct results; a warning will be printed otherwise
+// (which may change to an error in the future). If use_sum_inside_sqrt is set,
+// the Adam variable update formula will be changed from m / (sqrt(v) + epsilon)
+// to m / sqrt(v + epsilon**2); this option improves the performance of TPU
 // training and is not expected to harm model quality.
 message AdamParameters {
   float beta1 = 3;
@@ -141,6 +170,19 @@ message ProximalAdagradParameters {
   float initial_accumulator = 3;
 }
 
+// Status of using gradient accumulation (doing two passes over the input
+// gradients: one to accumulate them into a temporary array and another to apply
+// them using the actual optimization algorithm). The extra message is to wrap
+// the enum for scoping.
+message GradientAccumulationStatus {
+  // Defaults to ENABLED.
+  enum Status {
+    UNSPECIFIED = 0;
+    ENABLED = 1;
+    DISABLED = 2;
+  }
+};
+
 message OptimizationParameters {
   // Learning rate used for updating the embedding layer parameters.
   LearningRate learning_rate = 13;
@@ -162,12 +204,10 @@ message OptimizationParameters {
   // once per minibatch.
   float weight_decay_factor = 16;
 
-  // Whether to use gradient accumulation (do two passes over the input
+  // Status of using gradient accumulation (doing two passes over the input
   // gradients: one to accumulate them into a temporary array and another to
-  // apply them using the actual optimization algorithm). This feature is
-  // experimental -- it has not been fully verified and may cause training
-  // crashes and/or failures.
-  bool use_gradient_accumulation = 15;
+  // apply them using the actual optimization algorithm).
+  GradientAccumulationStatus.Status gradient_accumulation_status = 17;
 
   // Optimization algorithm parameters; which field is selected determines which
   // algorithm to use.
@@ -183,10 +223,13 @@ message OptimizationParameters {
     AdadeltaParameters adadelta = 12;
     ProximalAdagradParameters proximal_adagrad = 14;
   }
+
+  reserved 15;  // Old use_gradient_accumulation.
 }
 
 // Specification of an optimization algorithm's state variables (both the main
-// value vector and any extra accumulators, etc.).
+// value vector and any extra accumulators, etc.). This proto is only used
+// internally by the TPU software and is not exposed directly to the TF model.
 message StateVariableSpecification {
   // Parameter name for the state variable.
   string name = 1;
@@ -194,6 +237,20 @@ message StateVariableSpecification {
   // A normal state variable that should be saved and restored in checkpoints
   // and used as an input or output to non-debug TensorFlow ops.
   message UserDefined {
+    // For padding embedding rows, this field specifies the initial value to be
+    // used. Separate initial values need to be specified for the embeddings and
+    // any extra accumulators. The initial values should be specified so as to
+    // maintain two invariants during model training:
+    // (1) The embedding vector multiplied by zero returns a vector containing
+    //     all zeros. To maintain this invariant, the embedding values should
+    //     never be NaNs or +-infinity.
+    // (2) Repeatedly applying the optimizer using a gradient vector of all
+    //     zeros does not cause the embeddings or slot variables to become NaNs
+    //     or +-infinity.
+    // The padding row is looked up when no embedding IDs are present for a
+    // feature. The semantics of embedding lookup dictate that the output must
+    // be zero under this scenario.
+    double padding_initial_value = 1;
   }
 
   // A state variable that should be filled with a constant and normally hidden
diff --git a/tensorflow/contrib/tpu/proto/topology.proto b/tensorflow/core/protobuf/tpu/topology.proto
similarity index 100%
rename from tensorflow/contrib/tpu/proto/topology.proto
rename to tensorflow/core/protobuf/tpu/topology.proto
diff --git a/tensorflow/contrib/tpu/proto/tpu_embedding_configuration.proto b/tensorflow/core/protobuf/tpu/tpu_embedding_configuration.proto
similarity index 97%
rename from tensorflow/contrib/tpu/proto/tpu_embedding_configuration.proto
rename to tensorflow/core/protobuf/tpu/tpu_embedding_configuration.proto
index da19b135d7497d1bd5d2e212cab97db78c756cad..53280edfa6df9fc24d1a38dd6458d79fc29600d8 100644
--- a/tensorflow/contrib/tpu/proto/tpu_embedding_configuration.proto
+++ b/tensorflow/core/protobuf/tpu/tpu_embedding_configuration.proto
@@ -2,8 +2,8 @@ syntax = "proto3";
 
 package tensorflow.tpu;
 
-import "tensorflow/contrib/tpu/proto/optimization_parameters.proto";
-import "tensorflow/contrib/tpu/proto/tpu_embedding_output_layout.proto";
+import "tensorflow/core/protobuf/tpu/optimization_parameters.proto";
+import "tensorflow/core/protobuf/tpu/tpu_embedding_output_layout.proto";
 
 message TPUEmbeddingConfiguration {
   // Description of the various embedding tables.
diff --git a/tensorflow/contrib/tpu/proto/tpu_embedding_output_layout.proto b/tensorflow/core/protobuf/tpu/tpu_embedding_output_layout.proto
similarity index 100%
rename from tensorflow/contrib/tpu/proto/tpu_embedding_output_layout.proto
rename to tensorflow/core/protobuf/tpu/tpu_embedding_output_layout.proto
diff --git a/tensorflow/core/protobuf/checkpointable_object_graph.proto b/tensorflow/core/protobuf/trackable_object_graph.proto
similarity index 74%
rename from tensorflow/core/protobuf/checkpointable_object_graph.proto
rename to tensorflow/core/protobuf/trackable_object_graph.proto
index 651f692f6d7b6d677b480a007f9ffe5c814beec3..02d852e6f3df024fa35bf9e4d05af5f2f8d568a5 100644
--- a/tensorflow/core/protobuf/checkpointable_object_graph.proto
+++ b/tensorflow/core/protobuf/trackable_object_graph.proto
@@ -8,10 +8,10 @@ package tensorflow;
 // own variables, allowing for more robust checkpoint loading into modified
 // programs.
 
-message CheckpointableObjectGraph {
-  message CheckpointableObject {
+message TrackableObjectGraph {
+  message TrackableObject {
     message ObjectReference {
-      // An index into `CheckpointableObjectGraph.nodes`, indicating the object
+      // An index into `TrackableObjectGraph.nodes`, indicating the object
       // being referenced.
       int32 node_id = 1;
       // A user-provided name for the edge.
@@ -30,15 +30,19 @@ message CheckpointableObjectGraph {
       string full_name = 2;
       // The generated name of the Tensor in the checkpoint.
       string checkpoint_key = 3;
+      // Whether checkpoints should be considered as matching even without this
+      // value restored. Used for non-critical values which don't affect the
+      // TensorFlow graph, such as layer configurations.
+      bool optional_restore = 4;
     }
 
     message SlotVariableReference {
-      // An index into `CheckpointableObjectGraph.nodes`, indicating the
+      // An index into `TrackableObjectGraph.nodes`, indicating the
       // variable object this slot was created for.
       int32 original_variable_node_id = 1;
       // The name of the slot (e.g. "m"/"v").
       string slot_name = 2;
-      // An index into `CheckpointableObjectGraph.nodes`, indicating the
+      // An index into `TrackableObjectGraph.nodes`, indicating the
       // `Object` with the value of the slot variable.
       int32 slot_variable_node_id = 3;
     }
@@ -51,5 +55,5 @@ message CheckpointableObjectGraph {
     repeated SlotVariableReference slot_variables = 3;
   }
 
-  repeated CheckpointableObject nodes = 1;
+  repeated TrackableObject nodes = 1;
 }
diff --git a/tensorflow/core/protobuf/verifier_config.proto b/tensorflow/core/protobuf/verifier_config.proto
new file mode 100644
index 0000000000000000000000000000000000000000..207f0f2a974cbc58413490380edf3795c7206aba
--- /dev/null
+++ b/tensorflow/core/protobuf/verifier_config.proto
@@ -0,0 +1,26 @@
+syntax = "proto3";
+
+package tensorflow;
+option cc_enable_arenas = true;
+option java_outer_classname = "VerifierConfigProtos";
+option java_multiple_files = true;
+option java_package = "org.tensorflow.framework";
+// add go_package externally with copybara
+
+// The config for graph verifiers.
+message VerifierConfig {
+  enum Toggle {
+    DEFAULT = 0;
+    ON = 1;
+    OFF = 2;
+  }
+
+  // Deadline for completion of all verification i.e. all the Toggle ON
+  // verifiers must complete execution within this time.
+  int64 verification_timeout_in_ms = 1;
+
+  // Perform structural validation on a tensorflow graph. Default is OFF.
+  Toggle structure_verifier = 2;
+
+  // Next tag: 3
+}
diff --git a/tensorflow/core/protobuf/worker.proto b/tensorflow/core/protobuf/worker.proto
index 74058c846530bc2b4577d18034d02ed002d8983f..4284dd119edf3167915942c6458827ebb7191ad5 100644
--- a/tensorflow/core/protobuf/worker.proto
+++ b/tensorflow/core/protobuf/worker.proto
@@ -535,6 +535,7 @@ message CompleteInstanceRequest {
 message CompleteInstanceResponse {
   int32 instance_key = 1;
   int32 source_rank = 2;
+  bytes communicator_key = 3;
 }
 
 // Request for next agreed-upon step_id for the specified graph_keys.
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 07eeeb4f032f199fe50b315c39b5e9835770d5c7..a55fe17dd5fa6f7ba7c0eaebb345c69f9dce2a5c 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -24,7 +24,7 @@ limitations under the License.
 
 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
 // "-beta", "-rc", "-rc.1")
-#define TF_VERSION_SUFFIX "-rc0"
+#define TF_VERSION_SUFFIX ""
 
 #define TF_STR_HELPER(x) #x
 #define TF_STR(x) TF_STR_HELPER(x)
diff --git a/tensorflow/contrib/tensorboard/db/BUILD b/tensorflow/core/summary/BUILD
similarity index 98%
rename from tensorflow/contrib/tensorboard/db/BUILD
rename to tensorflow/core/summary/BUILD
index 6507546ee9f81108add181a9c83064c9860005e2..a89175cdb1db2ff1184d8da26bc180d578faaf69 100644
--- a/tensorflow/contrib/tensorboard/db/BUILD
+++ b/tensorflow/core/summary/BUILD
@@ -1,5 +1,5 @@
 # Description:
-#   TensorBoard database code.
+#   C++ implementation code for the summary writing APIs.
 
 package(default_visibility = ["//tensorflow:internal"])
 
diff --git a/tensorflow/contrib/tensorboard/db/loader.cc b/tensorflow/core/summary/loader.cc
similarity index 97%
rename from tensorflow/contrib/tensorboard/db/loader.cc
rename to tensorflow/core/summary/loader.cc
index 6439328022329cbc56d767e787ec9d6797045768..68535feacfae6d8c9edf6b0725fe4d4c8d63bf60 100644
--- a/tensorflow/contrib/tensorboard/db/loader.cc
+++ b/tensorflow/core/summary/loader.cc
@@ -15,8 +15,8 @@ limitations under the License.
 #include <iostream>
 #include <vector>
 
-#include "tensorflow/contrib/tensorboard/db/schema.h"
-#include "tensorflow/contrib/tensorboard/db/summary_db_writer.h"
+#include "tensorflow/core/summary/schema.h"
+#include "tensorflow/core/summary/summary_db_writer.h"
 #include "tensorflow/core/lib/db/sqlite.h"
 #include "tensorflow/core/lib/io/record_reader.h"
 #include "tensorflow/core/platform/init_main.h"
diff --git a/tensorflow/contrib/tensorboard/db/schema.cc b/tensorflow/core/summary/schema.cc
similarity index 99%
rename from tensorflow/contrib/tensorboard/db/schema.cc
rename to tensorflow/core/summary/schema.cc
index 3c7bc87e4a2dbeadef2b9589d58c845204049123..822e2fa3bfdaf2be5f03704fc83d39f0e00369d3 100644
--- a/tensorflow/contrib/tensorboard/db/schema.cc
+++ b/tensorflow/core/summary/schema.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/tensorboard/db/schema.h"
+#include "tensorflow/core/summary/schema.h"
 
 #include "tensorflow/core/lib/core/errors.h"
 
diff --git a/tensorflow/contrib/tensorboard/db/schema.h b/tensorflow/core/summary/schema.h
similarity index 87%
rename from tensorflow/contrib/tensorboard/db/schema.h
rename to tensorflow/core/summary/schema.h
index 3da450422523dbe4304446869a38d43981d76eb5..6305f8eabd7cacb9dca8922b694e92ca4596d777 100644
--- a/tensorflow/contrib/tensorboard/db/schema.h
+++ b/tensorflow/core/summary/schema.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_TENSORBOARD_DB_SCHEMA_H_
-#define TENSORFLOW_CONTRIB_TENSORBOARD_DB_SCHEMA_H_
+#ifndef TENSORFLOW_CORE_SUMMARY_SCHEMA_H_
+#define TENSORFLOW_CORE_SUMMARY_SCHEMA_H_
 
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/db/sqlite.h"
@@ -30,4 +30,4 @@ Status SetupTensorboardSqliteDb(Sqlite* db);
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_TENSORBOARD_DB_SCHEMA_H_
+#endif  // TENSORFLOW_CORE_SUMMARY_SCHEMA_H_
diff --git a/tensorflow/contrib/tensorboard/db/schema_test.cc b/tensorflow/core/summary/schema_test.cc
similarity index 95%
rename from tensorflow/contrib/tensorboard/db/schema_test.cc
rename to tensorflow/core/summary/schema_test.cc
index 4d3f2880bd02682ad00a90760f2a4478f1e6b2a2..fa21b45b62cca2b116010de87a2dc2bae5cbe866 100644
--- a/tensorflow/contrib/tensorboard/db/schema_test.cc
+++ b/tensorflow/core/summary/schema_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/tensorboard/db/schema.h"
+#include "tensorflow/core/summary/schema.h"
 
 #include <memory>
 
diff --git a/tensorflow/contrib/tensorboard/db/summary_converter.cc b/tensorflow/core/summary/summary_converter.cc
similarity index 99%
rename from tensorflow/contrib/tensorboard/db/summary_converter.cc
rename to tensorflow/core/summary/summary_converter.cc
index 93c1183072b4d791843e740f970234ba52857463..e6e34e9602fa8cc3ed91d773d1d4cbec0d0c5232 100644
--- a/tensorflow/contrib/tensorboard/db/summary_converter.cc
+++ b/tensorflow/core/summary/summary_converter.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/tensorboard/db/summary_converter.h"
+#include "tensorflow/core/summary/summary_converter.h"
 
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/summary.pb.h"
diff --git a/tensorflow/contrib/tensorboard/db/summary_converter.h b/tensorflow/core/summary/summary_converter.h
similarity index 89%
rename from tensorflow/contrib/tensorboard/db/summary_converter.h
rename to tensorflow/core/summary/summary_converter.h
index 329c7f9f2f9fe25cdff8d5ac2e52c25362f624c2..dc005d2604ff1687e765341ebdb9e86c62c78f3a 100644
--- a/tensorflow/contrib/tensorboard/db/summary_converter.h
+++ b/tensorflow/core/summary/summary_converter.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_TENSORBOARD_DB_SUMMARY_CONVERTER_H_
-#define TENSORFLOW_CONTRIB_TENSORBOARD_DB_SUMMARY_CONVERTER_H_
+#ifndef TENSORFLOW_CORE_SUMMARY_SUMMARY_CONVERTER_H_
+#define TENSORFLOW_CORE_SUMMARY_SUMMARY_CONVERTER_H_
 
 #include "tensorflow/core/framework/summary.pb.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -35,4 +35,4 @@ Status AddTensorAsAudioToSummary(const Tensor& tensor, const string& tag,
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_TENSORBOARD_DB_SUMMARY_CONVERTER_H_
+#endif  // TENSORFLOW_CORE_SUMMARY_SUMMARY_CONVERTER_H_
diff --git a/tensorflow/contrib/tensorboard/db/summary_db_writer.cc b/tensorflow/core/summary/summary_db_writer.cc
similarity index 99%
rename from tensorflow/contrib/tensorboard/db/summary_db_writer.cc
rename to tensorflow/core/summary/summary_db_writer.cc
index cfdc884277a025aa11995d329389f3748b17490c..b203d439ccf82b36b3d0e1bdd958fdcfac87f4b0 100644
--- a/tensorflow/contrib/tensorboard/db/summary_db_writer.cc
+++ b/tensorflow/core/summary/summary_db_writer.cc
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/tensorboard/db/summary_db_writer.h"
+#include "tensorflow/core/summary/summary_db_writer.h"
 
 #include <deque>
 
-#include "tensorflow/contrib/tensorboard/db/summary_converter.h"
+#include "tensorflow/core/summary/summary_converter.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -972,7 +972,7 @@ class SummaryDbWriter : public SummaryWriterInterface {
     return MigrateEvent(std::move(e));
   }
 
-  string DebugString() override { return "SummaryDbWriter"; }
+  string DebugString() const override { return "SummaryDbWriter"; }
 
  private:
   Status Write(int64 step, const Tensor& t, const string& tag,
diff --git a/tensorflow/contrib/tensorboard/db/summary_db_writer.h b/tensorflow/core/summary/summary_db_writer.h
similarity index 89%
rename from tensorflow/contrib/tensorboard/db/summary_db_writer.h
rename to tensorflow/core/summary/summary_db_writer.h
index 746da1533b157bf7b2be5c85ada8b61ba224cc3e..5669afe7f67e1019d3d62d45ea99a64f1a31c82e 100644
--- a/tensorflow/contrib/tensorboard/db/summary_db_writer.h
+++ b/tensorflow/core/summary/summary_db_writer.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_TENSORBOARD_DB_SUMMARY_DB_WRITER_H_
-#define TENSORFLOW_CONTRIB_TENSORBOARD_DB_SUMMARY_DB_WRITER_H_
+#ifndef TENSORFLOW_CORE_SUMMARY_SUMMARY_DB_WRITER_H_
+#define TENSORFLOW_CORE_SUMMARY_SUMMARY_DB_WRITER_H_
 
 #include "tensorflow/core/kernels/summary_interface.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -39,4 +39,4 @@ Status CreateSummaryDbWriter(Sqlite* db, const string& experiment_name,
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_TENSORBOARD_DB_SUMMARY_DB_WRITER_H_
+#endif  // TENSORFLOW_CORE_SUMMARY_SUMMARY_DB_WRITER_H_
diff --git a/tensorflow/contrib/tensorboard/db/summary_db_writer_test.cc b/tensorflow/core/summary/summary_db_writer_test.cc
similarity index 99%
rename from tensorflow/contrib/tensorboard/db/summary_db_writer_test.cc
rename to tensorflow/core/summary/summary_db_writer_test.cc
index 2e8d4109dd624ab66d774668ad04def9a7d3cdf2..c4e9ddea2c51673c94273900b0407517b6533f3d 100644
--- a/tensorflow/contrib/tensorboard/db/summary_db_writer_test.cc
+++ b/tensorflow/core/summary/summary_db_writer_test.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/tensorboard/db/summary_db_writer.h"
+#include "tensorflow/core/summary/summary_db_writer.h"
 
-#include "tensorflow/contrib/tensorboard/db/schema.h"
+#include "tensorflow/core/summary/schema.h"
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
diff --git a/tensorflow/contrib/tensorboard/db/summary_file_writer.cc b/tensorflow/core/summary/summary_file_writer.cc
similarity index 97%
rename from tensorflow/contrib/tensorboard/db/summary_file_writer.cc
rename to tensorflow/core/summary/summary_file_writer.cc
index 22b6f09d0cd88068f7bedabe7687920420a3028f..711a7d3d1007090259f34652f10cf43a4d0c5f0a 100644
--- a/tensorflow/contrib/tensorboard/db/summary_file_writer.cc
+++ b/tensorflow/core/summary/summary_file_writer.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/tensorboard/db/summary_file_writer.h"
+#include "tensorflow/core/summary/summary_file_writer.h"
 
-#include "tensorflow/contrib/tensorboard/db/summary_converter.h"
+#include "tensorflow/core/summary/summary_converter.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_mgr.h"
@@ -148,7 +148,7 @@ class SummaryFileWriter : public SummaryWriterInterface {
     return Status::OK();
   }
 
-  string DebugString() override { return "SummaryFileWriter"; }
+  string DebugString() const override { return "SummaryFileWriter"; }
 
  private:
   double GetWallTime() {
diff --git a/tensorflow/contrib/tensorboard/db/summary_file_writer.h b/tensorflow/core/summary/summary_file_writer.h
similarity index 89%
rename from tensorflow/contrib/tensorboard/db/summary_file_writer.h
rename to tensorflow/core/summary/summary_file_writer.h
index 73b0a5542beabdc460c32156dd44aacc5f08610a..7d964516da3ceecdc4cdedae000ba873ec92e1e9 100644
--- a/tensorflow/contrib/tensorboard/db/summary_file_writer.h
+++ b/tensorflow/core/summary/summary_file_writer.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CONTRIB_TENSORBOARD_DB_SUMMARY_FILE_WRITER_H_
-#define TENSORFLOW_CONTRIB_TENSORBOARD_DB_SUMMARY_FILE_WRITER_H_
+#ifndef TENSORFLOW_CORE_SUMMARY_SUMMARY_FILE_WRITER_H_
+#define TENSORFLOW_CORE_SUMMARY_SUMMARY_FILE_WRITER_H_
 
 #include "tensorflow/core/kernels/summary_interface.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -40,4 +40,4 @@ Status CreateSummaryFileWriter(int max_queue, int flush_millis,
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_TENSORBOARD_DB_SUMMARY_FILE_WRITER_H_
+#endif  // TENSORFLOW_CORE_SUMMARY_SUMMARY_FILE_WRITER_H_
diff --git a/tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc b/tensorflow/core/summary/summary_file_writer_test.cc
similarity index 99%
rename from tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc
rename to tensorflow/core/summary/summary_file_writer_test.cc
index ffbfb9533e887e54b0f5bdfde11dadce21073a94..d3b19c3abdb8b773e22472c5987d91852fc6ac8e 100644
--- a/tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc
+++ b/tensorflow/core/summary/summary_file_writer_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/contrib/tensorboard/db/summary_file_writer.h"
+#include "tensorflow/core/summary/summary_file_writer.h"
 
 #include "tensorflow/core/framework/summary.pb.h"
 #include "tensorflow/core/framework/tensor.pb.h"
diff --git a/tensorflow/contrib/tensorboard/db/vacuum.cc b/tensorflow/core/summary/vacuum.cc
similarity index 100%
rename from tensorflow/contrib/tensorboard/db/vacuum.cc
rename to tensorflow/core/summary/vacuum.cc
diff --git a/tensorflow/contrib/tpu/utils/BUILD b/tensorflow/core/tpu/BUILD
similarity index 82%
rename from tensorflow/contrib/tpu/utils/BUILD
rename to tensorflow/core/tpu/BUILD
index c27b73728702dcb1c84a82d3a07d15978ed2710f..5cbed402f75bf7ecf67ea06a2ad8d89260d7c1d1 100644
--- a/tensorflow/contrib/tpu/utils/BUILD
+++ b/tensorflow/core/tpu/BUILD
@@ -8,9 +8,9 @@ cc_library(
     hdrs = ["tpu_embedding_optimization_parameters_utils.h"],
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/contrib/tpu/proto:optimization_parameters_proto_cc",
         "//tensorflow/core:framework_headers_lib",
         "//tensorflow/core:lib_proto_parsing",
+        "//tensorflow/core/protobuf/tpu:optimization_parameters_proto_cc",
         "@com_google_absl//absl/base",
     ],
 )
@@ -21,10 +21,10 @@ cc_library(
     hdrs = ["tpu_embedding_output_layout_utils.h"],
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/contrib/tpu/proto:tpu_embedding_configuration_proto_cc",
-        "//tensorflow/contrib/tpu/proto:tpu_embedding_output_layout_proto_cc",
         "//tensorflow/core:framework_headers_lib",
         "//tensorflow/core:lib_proto_parsing",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/protobuf/tpu:tpu_embedding_configuration_proto_cc",
+        "//tensorflow/core/protobuf/tpu:tpu_embedding_output_layout_proto_cc",
     ],
 )
diff --git a/tensorflow/contrib/tpu/utils/tpu_embedding_optimization_parameters_utils.cc b/tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.cc
similarity index 81%
rename from tensorflow/contrib/tpu/utils/tpu_embedding_optimization_parameters_utils.cc
rename to tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.cc
index 76cb5531cd0bc3a375d1434c31fa14a9d7f42476..2c179b5f83cbec888d1425e91ea7455e60acdb40 100644
--- a/tensorflow/contrib/tpu/utils/tpu_embedding_optimization_parameters_utils.cc
+++ b/tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tpu/utils/tpu_embedding_optimization_parameters_utils.h"
+#include "tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace tensorflow {
@@ -134,12 +134,16 @@ Status GetGradientAccumulationSupport(OptimizationAlgorithm alg,
   }
 }
 namespace {
-// Make a normal state variable specification.
+// Make a normal state variable specification. Please refer to
+// //tensorflow/core/protobuf/tpu/optimization_parameters.proto
+// (StateVariableSpecification message) for instructions on how to set the
+// padding_initial_value field.
 StateVariableSpecification MakeStandardStateVariableSpecification(
-    const string& name) {
+    const string& name, double padding_initial_value) {
   StateVariableSpecification result;
   result.set_name(name);
-  result.mutable_user_defined();
+  result.mutable_user_defined()->set_padding_initial_value(
+      padding_initial_value);
   return result;
 }
 }  // namespace
@@ -149,14 +153,14 @@ Status GetOptimizationAlgorithmStateVariables(
     std::vector<StateVariableSpecification>* state_variables) {
   // The first parameter set is always the weights themselves.
   state_variables->push_back(
-      MakeStandardStateVariableSpecification("parameters"));
+      MakeStandardStateVariableSpecification("parameters", 0.0));
   // The order of the returned parameters needs to match the offsets used by
   // the algorithm implementations in test_util.cc and
   // address_handler_program_creator.cc.
   switch (alg) {
     case OptimizationAlgorithm::kAdagrad: {
       state_variables->push_back(
-          MakeStandardStateVariableSpecification("accumulators"));
+          MakeStandardStateVariableSpecification("accumulators", 0.1));
       break;
     }
     case OptimizationAlgorithm::kStochasticGradientDescent: {
@@ -165,53 +169,58 @@ Status GetOptimizationAlgorithmStateVariables(
     }
     case OptimizationAlgorithm::kFtrl: {
       state_variables->push_back(
-          MakeStandardStateVariableSpecification("accumulators"));
+          MakeStandardStateVariableSpecification("accumulators", 0.1));
       state_variables->push_back(
-          MakeStandardStateVariableSpecification("linears"));
+          MakeStandardStateVariableSpecification("linears", 0.0));
       break;
     }
     case OptimizationAlgorithm::kAdam: {
       state_variables->push_back(
-          MakeStandardStateVariableSpecification("momenta"));
+          MakeStandardStateVariableSpecification("momenta", 0.0));
       state_variables->push_back(
-          MakeStandardStateVariableSpecification("velocities"));
+          MakeStandardStateVariableSpecification("velocities", 0.0));
       break;
     }
     case OptimizationAlgorithm::kMomentum: {
       state_variables->push_back(
-          MakeStandardStateVariableSpecification("momenta"));
+          MakeStandardStateVariableSpecification("momenta", 0.0));
       break;
     }
     case OptimizationAlgorithm::kRmsProp: {
-      state_variables->push_back(MakeStandardStateVariableSpecification("ms"));
-      state_variables->push_back(MakeStandardStateVariableSpecification("mom"));
+      state_variables->push_back(
+          MakeStandardStateVariableSpecification("ms", 1.0));
+      state_variables->push_back(
+          MakeStandardStateVariableSpecification("mom", 0.0));
       break;
     }
     case OptimizationAlgorithm::kCenteredRmsProp: {
-      state_variables->push_back(MakeStandardStateVariableSpecification("ms"));
-      state_variables->push_back(MakeStandardStateVariableSpecification("mom"));
-      state_variables->push_back(MakeStandardStateVariableSpecification("mg"));
+      state_variables->push_back(
+          MakeStandardStateVariableSpecification("ms", 1.0));
+      state_variables->push_back(
+          MakeStandardStateVariableSpecification("mom", 0.0));
+      state_variables->push_back(
+          MakeStandardStateVariableSpecification("mg", 0.0));
       break;
     }
     case OptimizationAlgorithm::kMdlAdagradLight: {
       state_variables->push_back(
-          MakeStandardStateVariableSpecification("accumulators"));
+          MakeStandardStateVariableSpecification("accumulators", 0.1));
       state_variables->push_back(
-          MakeStandardStateVariableSpecification("weights"));
+          MakeStandardStateVariableSpecification("weights", 0.0));
       state_variables->push_back(
-          MakeStandardStateVariableSpecification("benefits"));
+          MakeStandardStateVariableSpecification("benefits", 0.0));
       break;
     }
     case OptimizationAlgorithm::kAdadelta: {
       state_variables->push_back(
-          MakeStandardStateVariableSpecification("accumulators"));
+          MakeStandardStateVariableSpecification("accumulators", 0.0));
       state_variables->push_back(
-          MakeStandardStateVariableSpecification("updates"));
+          MakeStandardStateVariableSpecification("updates", 0.0));
       break;
     }
     case OptimizationAlgorithm::kProximalAdagrad: {
       state_variables->push_back(
-          MakeStandardStateVariableSpecification("accumulators"));
+          MakeStandardStateVariableSpecification("accumulators", 0.1));
       break;
     }
     case OptimizationAlgorithm::PARAMETERS_NOT_SET: {
diff --git a/tensorflow/contrib/tpu/utils/tpu_embedding_optimization_parameters_utils.h b/tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.h
similarity index 91%
rename from tensorflow/contrib/tpu/utils/tpu_embedding_optimization_parameters_utils.h
rename to tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.h
index 81d50264edb93e889d736c62a493b058e2f1bd56..ceb07ff35510ae3b034ad391456e5a8a21fa4240 100644
--- a/tensorflow/contrib/tpu/utils/tpu_embedding_optimization_parameters_utils.h
+++ b/tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.h
@@ -13,13 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TPU_UTILS_TPU_EMBEDDING_OPTIMIZATION_PARAMETERS_UTILS_H_
-#define TENSORFLOW_CONTRIB_TPU_UTILS_TPU_EMBEDDING_OPTIMIZATION_PARAMETERS_UTILS_H_
+#ifndef TENSORFLOW_CORE_TPU_TPU_EMBEDDING_OPTIMIZATION_PARAMETERS_UTILS_H_
+#define TENSORFLOW_CORE_TPU_TPU_EMBEDDING_OPTIMIZATION_PARAMETERS_UTILS_H_
 
 #include <string>
 #include "absl/base/casts.h"
-#include "tensorflow/contrib/tpu/proto/optimization_parameters.pb.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/protobuf/tpu/optimization_parameters.pb.h"
 
 namespace tensorflow {
 namespace tpu {
@@ -87,4 +87,4 @@ const float kGradientAccumulatorInitialValue = absl::bit_cast<float, uint32>(1);
 }  // namespace tpu
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_TPU_UTILS_TPU_EMBEDDING_OPTIMIZATION_PARAMETERS_UTILS_H_
+#endif  // TENSORFLOW_CORE_TPU_TPU_EMBEDDING_OPTIMIZATION_PARAMETERS_UTILS_H_
diff --git a/tensorflow/contrib/tpu/utils/tpu_embedding_output_layout_utils.cc b/tensorflow/core/tpu/tpu_embedding_output_layout_utils.cc
similarity index 96%
rename from tensorflow/contrib/tpu/utils/tpu_embedding_output_layout_utils.cc
rename to tensorflow/core/tpu/tpu_embedding_output_layout_utils.cc
index 8480ec4b8bb98e867db3e4e4ed14d4cc529efe49..3a027757af7cb90d465e230b9934a4214888c4f1 100644
--- a/tensorflow/contrib/tpu/utils/tpu_embedding_output_layout_utils.cc
+++ b/tensorflow/core/tpu/tpu_embedding_output_layout_utils.cc
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/tpu/utils/tpu_embedding_output_layout_utils.h"
-#include "tensorflow/contrib/tpu/proto/tpu_embedding_output_layout.pb.h"
+#include "tensorflow/core/tpu/tpu_embedding_output_layout_utils.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/protobuf/tpu/tpu_embedding_output_layout.pb.h"
 
 namespace tensorflow {
 namespace tpu {
diff --git a/tensorflow/contrib/tpu/utils/tpu_embedding_output_layout_utils.h b/tensorflow/core/tpu/tpu_embedding_output_layout_utils.h
similarity index 81%
rename from tensorflow/contrib/tpu/utils/tpu_embedding_output_layout_utils.h
rename to tensorflow/core/tpu/tpu_embedding_output_layout_utils.h
index c10fbeeff2b5af93a118902c0afb3b59cc1a9d60..5bff401b9d2d37f35086fb7c8a39c62d79d7daa9 100644
--- a/tensorflow/contrib/tpu/utils/tpu_embedding_output_layout_utils.h
+++ b/tensorflow/core/tpu/tpu_embedding_output_layout_utils.h
@@ -13,12 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CONTRIB_TPU_UTILS_TPU_EMBEDDING_OUTPUT_LAYOUT_UTILS_H_
-#define TENSORFLOW_CONTRIB_TPU_UTILS_TPU_EMBEDDING_OUTPUT_LAYOUT_UTILS_H_
+#ifndef TENSORFLOW_CORE_TPU_TPU_EMBEDDING_OUTPUT_LAYOUT_UTILS_H_
+#define TENSORFLOW_CORE_TPU_TPU_EMBEDDING_OUTPUT_LAYOUT_UTILS_H_
 
-#include "tensorflow/contrib/tpu/proto/tpu_embedding_configuration.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/protobuf/tpu/tpu_embedding_configuration.pb.h"
 
 namespace tensorflow {
 namespace tpu {
@@ -35,4 +35,4 @@ Status ComputeOutputTensorShapes(
 }  // namespace tpu
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CONTRIB_TPU_UTILS_TPU_EMBEDDING_OUTPUT_LAYOUT_UTILS_H_
+#endif  // TENSORFLOW_CORE_TPU_TPU_EMBEDDING_OUTPUT_LAYOUT_UTILS_H_
diff --git a/tensorflow/core/util/bcast.h b/tensorflow/core/util/bcast.h
index 6d73c38e3c904458e7438915d5fe35db9f4c8fc8..2d647fd8d86866c93f2a3890e3e40e7f70f670f5 100644
--- a/tensorflow/core/util/bcast.h
+++ b/tensorflow/core/util/bcast.h
@@ -105,15 +105,21 @@ class BCast {
   static Vec FromShape(const TensorShape& shape);
   static TensorShape ToShape(const BCast::Vec& vec);
 
-  template <int NDIMS>
-  static Eigen::array<Eigen::DenseIndex, NDIMS> ToIndexArray(
+  template <typename IndexType, int NDIMS>
+  static Eigen::array<IndexType, NDIMS> ToIndexArrayType(
       const BCast::Vec& vec) {
     CHECK_EQ(vec.size(), NDIMS);
-    Eigen::array<Eigen::DenseIndex, NDIMS> ret;
+    Eigen::array<IndexType, NDIMS> ret;
     for (int i = 0; i < NDIMS; ++i) ret[i] = vec[i];
     return ret;
   }
 
+  template <int NDIMS>
+  static Eigen::array<Eigen::DenseIndex, NDIMS> ToIndexArray(
+      const BCast::Vec& vec) {
+    return ToIndexArrayType<Eigen::DenseIndex, NDIMS>(vec);
+  }
+
  private:
   bool valid_ = true;
   Vec x_reshape_;
diff --git a/tensorflow/core/util/cuda_launch_config.h b/tensorflow/core/util/cuda_launch_config.h
index 080d4067cec69084b54ba1c096d01198a8e48d20..c0ae6349f755dcbd643493ccfe82374d12bc2baf 100644
--- a/tensorflow/core/util/cuda_launch_config.h
+++ b/tensorflow/core/util/cuda_launch_config.h
@@ -21,7 +21,6 @@ limitations under the License.
 #include <algorithm>
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "cuda/include/cuda.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor.h"
diff --git a/tensorflow/core/util/device_name_utils.cc b/tensorflow/core/util/device_name_utils.cc
index cb088faec1ece7cffde4499df900be9d8dd16bc5..56e618872a71e190cbec9c1cd33e1b246a1c9e08 100644
--- a/tensorflow/core/util/device_name_utils.cc
+++ b/tensorflow/core/util/device_name_utils.cc
@@ -289,6 +289,30 @@ bool DeviceNameUtils::IsSpecification(const ParsedName& less_specific,
   return true;
 }
 
+void DeviceNameUtils::EnsureSpecification(ParsedName* more_specific,
+                                          const ParsedName& less_specific) {
+  if (less_specific.has_job) {
+    more_specific->has_job = true;
+    more_specific->job = less_specific.job;
+  }
+  if (less_specific.has_replica) {
+    more_specific->has_replica = true;
+    more_specific->replica = less_specific.replica;
+  }
+  if (less_specific.has_task) {
+    more_specific->has_task = true;
+    more_specific->task = less_specific.task;
+  }
+  if (less_specific.has_type) {
+    more_specific->has_type = true;
+    more_specific->type = less_specific.type;
+  }
+  if (less_specific.has_id) {
+    more_specific->has_id = true;
+    more_specific->id = less_specific.id;
+  }
+}
+
 /* static */
 bool DeviceNameUtils::IsCompleteSpecification(const ParsedName& pattern,
                                               const ParsedName& name) {
diff --git a/tensorflow/core/util/device_name_utils.h b/tensorflow/core/util/device_name_utils.h
index bb5e2b3f0c42b321bc7ab45cdad2ec951671be96..b047e814bd694a775af0487365c85ce02ad573fd 100644
--- a/tensorflow/core/util/device_name_utils.h
+++ b/tensorflow/core/util/device_name_utils.h
@@ -110,6 +110,11 @@ class DeviceNameUtils {
   static bool IsSpecification(const ParsedName& less_specific,
                               const ParsedName& more_specific);
 
+  // Makes minimal changes to more_specific so that it becomes a
+  // specification of less_specific.
+  static void EnsureSpecification(ParsedName* more_specific,
+                                  const ParsedName& less_specific);
+
   // Like IsSpecification, but the second argument "name" must have a
   // non-wildcard value for all of its components.
   static bool IsCompleteSpecification(const ParsedName& pattern,
diff --git a/tensorflow/core/util/dump_graph.cc b/tensorflow/core/util/dump_graph.cc
index 523d37ecc244b3634545ea82385b377c871569c8..d275e076f865f809192e6f3aea652434d5654bb3 100644
--- a/tensorflow/core/util/dump_graph.cc
+++ b/tensorflow/core/util/dump_graph.cc
@@ -84,6 +84,10 @@ string WriteTextProtoToUniqueFile(Env* env, const string& name,
     dir = getenv("TF_DUMP_GRAPH_PREFIX");
   }
   if (!dir) {
+    LOG(WARNING)
+        << "Failed to dump " << name << " because dump location is not "
+        << " specified through either TF_DUMP_GRAPH_PREFIX environment "
+        << "variable or function argument.";
     return "(TF_DUMP_GRAPH_PREFIX not specified)";
   }
   Status status = env->RecursivelyCreateDir(dir);
diff --git a/tensorflow/core/util/event.proto b/tensorflow/core/util/event.proto
index 9ce85be551191dee754f34ec531e65f3eac056b7..2d3ae62777358ee371c60fe9b04d27d140c6f414 100644
--- a/tensorflow/core/util/event.proto
+++ b/tensorflow/core/util/event.proto
@@ -95,7 +95,7 @@ enum WorkerHealth {
 // signal is received.
 enum WorkerShutdownMode {
   DEFAULT = 0;
-  SHUTDOWN_IMMEDIATELY = 1;
+  NOT_CONFIGURED = 1;
   WAIT_FOR_COORDINATOR = 2;
 }
 
diff --git a/tensorflow/core/util/example_proto_fast_parsing.cc b/tensorflow/core/util/example_proto_fast_parsing.cc
index 3cc75bbd1f353183184462ec9495c0492cf1442b..8877cd0d3f1d9253d6aea482b9eed107eeccfe66 100644
--- a/tensorflow/core/util/example_proto_fast_parsing.cc
+++ b/tensorflow/core/util/example_proto_fast_parsing.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/monitoring/counter.h"
+#include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/util/presized_cuckoo_map.h"
@@ -161,10 +162,30 @@ class Feature {
         if (!stream.ReadVarint32(&packed_length)) return false;
         auto packed_limit = stream.PushLimit(packed_length);
 
-        while (!stream.ExpectAtEnd()) {
-          uint32 buffer32;
-          if (!stream.ReadLittleEndian32(&buffer32)) return false;
-          float_list->push_back(absl::bit_cast<float>(buffer32));
+        // If the result data type is float and we are on a little endian
+        // machine then we can simply memcpy the data from the proto into the
+        // result vector.
+        constexpr int32 kNumFloatBytes = 4;
+        if (port::kLittleEndian &&
+            sizeof(typename Result::value_type) == kNumFloatBytes) {
+          // Store the initial size to know the offset we have to start writing
+          // data from before resizing the output "vector".
+          const size_t initial_size = float_list->size();
+          float_list->resize(initial_size + packed_length / kNumFloatBytes);
+          // Calculate the length of the buffer available what can be less than
+          // what we requested in resize in case of a LimitedArraySlice.
+          const uint32 bytes_to_copy =
+              std::min(static_cast<uint32>((float_list->size() - initial_size) *
+                                           kNumFloatBytes),
+                       packed_length);
+          if (!stream.ReadRaw(float_list->data() + initial_size, bytes_to_copy))
+            return false;
+        } else {
+          while (!stream.ExpectAtEnd()) {
+            uint32 buffer32;
+            if (!stream.ReadLittleEndian32(&buffer32)) return false;
+            float_list->push_back(absl::bit_cast<float>(buffer32));
+          }
         }
 
         stream.PopLimit(packed_limit);
@@ -448,8 +469,10 @@ struct SeededHasher {
 template <typename T>
 class LimitedArraySlice {
  public:
+  using value_type = T;
+
   LimitedArraySlice(T* begin, size_t num_elements)
-      : current_(begin), end_(begin + num_elements) {}
+      : current_(begin), begin_(begin), end_(begin + num_elements) {}
 
   // May return negative if there were push_back calls after slice was filled.
   int64 EndDistance() const { return end_ - current_; }
@@ -462,8 +485,21 @@ class LimitedArraySlice {
     ++current_;
   }
 
+  // Returns the number of elements in the slice.
+  size_t size() const { return std::min(current_ - begin_, end_ - begin_); }
+
+  // Attempts to resize the vector to the given size. It does so by advancing
+  // the pointer to the current element, possibly beyond the end of the slice.
+  // As a consequence, calling `size()` after `resize(x)` was called might
+  // return a value less than `x`.
+  void resize(size_t size) { current_ = begin_ + size; }
+
+  // Returns the pointer to the underlying data buffer.
+  T* data() { return begin_; }
+
  private:
   T* current_;
+  T* begin_;
   T* end_;
 };
 
diff --git a/tensorflow/core/util/memmapped_file_system.cc b/tensorflow/core/util/memmapped_file_system.cc
index d3439cbc9385184da830f70e53acb27eff570ba1..b1773a25171916b6da0b3e0b86129ee25c32b1b6 100644
--- a/tensorflow/core/util/memmapped_file_system.cc
+++ b/tensorflow/core/util/memmapped_file_system.cc
@@ -56,6 +56,11 @@ class RandomAccessFileFromMemmapped : public RandomAccessFile {
 
   ~RandomAccessFileFromMemmapped() override = default;
 
+  Status Name(StringPiece* result) const override {
+    return errors::Unimplemented(
+        "RandomAccessFileFromMemmapped does not support Name()");
+  }
+
   Status Read(uint64 offset, size_t to_read, StringPiece* result,
               char* scratch) const override {
     if (offset >= length_) {
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index 928807458aca3c79d52e14509eb4238e134b5cdf..91f9bc0362594cfd285c13e2011798ac1718e960 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_CORE_UTIL_MKL_UTIL_H_
 #ifdef INTEL_MKL
 
+#include <list>
 #include <memory>
 #include <string>
 #include <unordered_map>
@@ -34,8 +35,7 @@ limitations under the License.
 #endif
 
 #ifdef INTEL_MKL_ML_ONLY
-#error \
-    "Compiling for INTEL MKL ML only is no longer supported.Please use MKL DNN (the default option for --config=mkl)"
+#error "Please use INTEL MKL DNN (the default option for --config=mkl)."
 #endif
 
 #ifdef INTEL_MKL_ML_ONLY
@@ -86,7 +86,7 @@ namespace tensorflow {
 // For use with MKL ML, has been deprecated
 typedef enum { W = 0, H = 1, C = 2, N = 3 } MklDims;
 
-// The dimensions order that MKL DNN internally uses for 2D activations
+// The dimensions order that MKL-DNN internally uses for 2D activations
 // [Batch, Channel, Height, Width] and
 // for 2D filters [Out_Channel, In_Channel, Height, Width].
 typedef enum {
@@ -98,7 +98,7 @@ typedef enum {
   Dim_I = 1
 } MklDnnDims;
 
-// The dimensions order that MKL DNN internally uses for 3D activations
+// The dimensions order that MKL-DNN internally uses for 3D activations
 // [Batch, Channel, Depth, Height, Width] and
 // for 3D filters [Out_Channel, In_Channel, Depth, Height, Width].
 typedef enum {
@@ -111,6 +111,35 @@ typedef enum {
   Dim3d_I = 1
 } MklDnnDims3D;
 
+// Enum for the order of dimensions of a TF 2D filter with shape [filter_height,
+// filter_width, in_channels, out_channels]
+typedef enum {
+  TF_2DFILTER_DIM_H = 0,
+  TF_2DFILTER_DIM_W = 1,
+  TF_2DFILTER_DIM_I = 2,
+  TF_2DFILTER_DIM_O = 3
+} TFFilterDims2d;
+
+// Enum for the order of dimensions of a TF 3D filter with shape [filter_depth,
+// filter_height, filter_width, in_channels, out_channels]
+typedef enum {
+  TF_3DFILTER_DIM_P = 0,
+  TF_3DFILTER_DIM_H = 1,
+  TF_3DFILTER_DIM_W = 2,
+  TF_3DFILTER_DIM_I = 3,
+  TF_3DFILTER_DIM_O = 4
+} TFFilterDims3d;
+
+// The dimensions order that MKL-DNN requires for the filter in a grouped
+// convolution (2D only)
+typedef enum {
+  MKL_GROUP_FILTER_DIM_G = 0,
+  MKL_GROUP_FILTER_DIM_O = 1,
+  MKL_GROUP_FILTER_DIM_I = 2,
+  MKL_GROUP_FILTER_DIM_H = 3,
+  MKL_GROUP_FILTER_DIM_W = 4
+} MklDnnFilterGroupDims;
+
 // Enum used to templatize MklOp kernel implementations
 // that support both fp32 and int8 versions.
 enum class MklQuantization {
@@ -808,7 +837,6 @@ inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor,
       return mkl_tensor;  // return input since it is already TF tensor
 
     TensorShape output_shape = mkl_shape.GetTfShape();
-    ;
 
     // Allocate output tensor.
     context->allocate_temp(DataTypeToEnum<T>::v(), output_shape,
@@ -1737,6 +1765,7 @@ class MklDnnData {
   inline void SetUsrMem(const memory::primitive_desc& pd,
                         void* data_buffer = nullptr) {
     CHECK_NOTNULL(cpu_engine_);
+    if (user_memory_) delete user_memory_;
     // TODO(nhasabni): can we remove dynamic memory allocation?
     if (data_buffer) {
       user_memory_ = new memory(pd, data_buffer);
@@ -2031,6 +2060,111 @@ class MklPrimitive {
 
 const mkldnn::memory::dims NONE_DIMS = {};
 
+//
+// LRUCache is a class which implements LRU (Least Recently Used) cache.
+// The implementation is similar to that of
+//    tensorflow/core/platform/cloud/expiring_lru_cache.h
+// without its thread-safe part because the cache is supposed to be
+// used as thread local (for instance, MklPrimitive caching).
+//
+// The LRU list maintains objects in chronological order based on
+// creation time, with the least recently accessed object at the
+// tail of LRU list, while the most recently accessed object
+// at the head of LRU list.
+//
+// This class is used to maintain an upper bound on the total number of
+// cached items. When the cache reaches its capacity, the LRU item will
+// be removed and replaced by a new one from SetOp call.
+//
+template <typename T>
+class LRUCache {
+ public:
+  explicit LRUCache(size_t capacity) {
+    capacity_ = capacity;
+    Clear();
+  }
+
+  T* GetOp(const string& key) {
+    auto it = cache_.find(key);
+    if (it == cache_.end()) {
+      return nullptr;
+    }
+
+    // Move to the front of LRU list as the most recently accessed.
+    lru_list_.erase(it->second.lru_iterator);
+    lru_list_.push_front(it->first);
+    it->second.lru_iterator = lru_list_.begin();
+    return it->second.op;
+  }
+
+  void SetOp(const string& key, T* op) {
+    if (lru_list_.size() >= capacity_) {
+      Delete();
+    }
+
+    // Insert an entry to the front of the LRU list
+    lru_list_.push_front(key);
+    Entry entry(op, lru_list_.begin());
+    cache_.emplace(std::make_pair(key, std::move(entry)));
+  }
+
+  void Clear() {
+    if (lru_list_.empty()) return;
+
+    // Clean up the cache
+    cache_.clear();
+    lru_list_.clear();
+  }
+
+ private:
+  struct Entry {
+    // The entry's value.
+    T* op;
+
+    // A list iterator pointing to the entry's position in the LRU list.
+    std::list<string>::iterator lru_iterator;
+
+    // Constructor
+    Entry(T* op, std::list<string>::iterator it) {
+      this->op = op;
+      this->lru_iterator = it;
+    }
+
+    // Move construcctor
+    Entry(Entry&& source) noexcept
+        : lru_iterator(std::move(source.lru_iterator)) {
+      op = std::move(source.op);
+      source.op = std::forward<T*>(nullptr);
+    }
+
+    // Destructor
+    ~Entry() {
+      if (op != nullptr) delete op;
+    }
+  };
+
+  // Remove the least recently accessed entry from LRU list, which
+  // is the tail of lru_list_. Update cache_ correspondingly.
+  bool Delete() {
+    if (lru_list_.empty()) return false;
+    string key = lru_list_.back();
+    lru_list_.pop_back();
+    cache_.erase(key);
+    return true;
+  }
+
+  // Cache capacity
+  size_t capacity_;
+
+  // The cache, a map from string key to a LRU entry.
+  std::unordered_map<string, Entry> cache_;
+
+  // The LRU list of entries.
+  // The front of the list contains the key of the most recently accessed
+  // entry, while the back of the list is the least recently accessed entry.
+  std::list<string> lru_list_;
+};
+
 template <typename T>
 class MklPrimitiveFactory {
  public:
@@ -2039,23 +2173,13 @@ class MklPrimitiveFactory {
   ~MklPrimitiveFactory() {}
 
   MklPrimitive* GetOp(const string& key) {
-    auto& map = MklPrimitiveFactory<T>::GetHashMap();
-    auto stream_iter = map.find(key);
-    if (stream_iter == map.end()) {
-      return nullptr;
-    } else {
-      CHECK(stream_iter->second != nullptr) << "nullptr present in map";
-      return stream_iter->second;
-    }
+    auto& lru_cache = MklPrimitiveFactory<T>::GetLRUCache();
+    return lru_cache.GetOp(key);
   }
 
   void SetOp(const string& key, MklPrimitive* op) {
-    auto& map = MklPrimitiveFactory<T>::GetHashMap();
-    auto stream_iter = map.find(key);
-
-    CHECK(stream_iter == map.end());
-
-    map[key] = op;
+    auto& lru_cache = MklPrimitiveFactory<T>::GetLRUCache();
+    lru_cache.SetOp(key, op);
   }
 
   /// Function to decide whether HW has AVX512 or AVX2
@@ -2075,9 +2199,10 @@ class MklPrimitiveFactory {
   }
 
  private:
-  static inline std::unordered_map<string, MklPrimitive*>& GetHashMap() {
-    static thread_local std::unordered_map<string, MklPrimitive*> map_;
-    return map_;
+  static inline LRUCache<MklPrimitive>& GetLRUCache() {
+    static const int kCapacity = 1024;  // cache capacity
+    static thread_local LRUCache<MklPrimitive> lru_cache_(kCapacity);
+    return lru_cache_;
   }
 };
 
diff --git a/tensorflow/core/util/mkl_util_test.cc b/tensorflow/core/util/mkl_util_test.cc
index 4f837f105d2c4fc12a366f52a1db72ce376b79f6..bed6febe377b6109328254c72446eb913be330a4 100644
--- a/tensorflow/core/util/mkl_util_test.cc
+++ b/tensorflow/core/util/mkl_util_test.cc
@@ -84,6 +84,40 @@ TEST(MklUtilTest, MklDnnBlockedFormatTest) {
   EXPECT_EQ(b_md2.data.format, mkldnn_blocked);
 }
 
+TEST(MklUtilTest, LRUCacheTest) {
+  // The cached objects are of type int*
+  size_t capacity = 100;
+  size_t num_objects = capacity + 10;
+  LRUCache<int> lru_cache(capacity);
+
+  // Test SetOp: be able to set more ops than the capacity
+  for (int k = 0; k < num_objects; k++) {
+    lru_cache.SetOp(std::to_string(k), new int(k));
+  }
+
+  // Test GetOp and capacity:
+  // Least recently accessed objects should not be in cache any more.
+  for (int k = 0; k < num_objects - capacity; ++k) {
+    EXPECT_EQ(nullptr, lru_cache.GetOp(std::to_string(k)));
+  }
+
+  // Test GetOp and capacity:
+  // Most recently accessed objects should still be in cache.
+  for (int k = num_objects - capacity; k < num_objects; ++k) {
+    int* int_ptr = lru_cache.GetOp(std::to_string(k));
+    EXPECT_NE(nullptr, int_ptr);
+    EXPECT_EQ(*int_ptr, k);
+  }
+
+  // Clean up the cache
+  lru_cache.Clear();
+
+  // After clean up, there should be no cached object.
+  for (int k = 0; k < num_objects; ++k) {
+    EXPECT_EQ(nullptr, lru_cache.GetOp(std::to_string(k)));
+  }
+}
+
 #endif  // INTEL_MKL_ML_ONLY
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/util/padding.cc b/tensorflow/core/util/padding.cc
index 117de5ee4bdd61af148ad7f1e620e940cb38216a..9e7fb8489e8e37b94ebecd53fde0568c68879c92 100644
--- a/tensorflow/core/util/padding.cc
+++ b/tensorflow/core/util/padding.cc
@@ -29,12 +29,55 @@ Status GetNodeAttr(const NodeDef& node_def, StringPiece attr_name,
     *value = SAME;
   } else if (str_value == "VALID") {
     *value = VALID;
+  } else if (str_value == "EXPLICIT") {
+    *value = EXPLICIT;
   } else {
     return errors::NotFound(str_value, " is not an allowed padding type");
   }
   return Status::OK();
 }
 
+Status CheckValidPadding(Padding padding_type,
+                         const std::vector<int64>& explicit_paddings,
+                         int num_dims, TensorFormat data_format) {
+  if (padding_type == Padding::EXPLICIT) {
+    if (explicit_paddings.size() != 2 * num_dims) {
+      return errors::InvalidArgument(
+          "explicit_paddings attribute must contain ", 2 * num_dims,
+          " values, but got: ", explicit_paddings.size());
+    }
+    for (int64 padding_value : explicit_paddings) {
+      if (padding_value < 0) {
+        return errors::InvalidArgument(
+            "All elements of explicit_paddings must be nonnegative");
+      }
+    }
+    const int32 batch_index = GetTensorBatchDimIndex(num_dims, data_format);
+    const int32 depth_index = GetTensorFeatureDimIndex(num_dims, data_format);
+    if (explicit_paddings[2 * batch_index] != 0 ||
+        explicit_paddings[2 * batch_index + 1] != 0 ||
+        explicit_paddings[2 * depth_index] != 0 ||
+        explicit_paddings[2 * depth_index + 1] != 0) {
+      return errors::InvalidArgument(
+          "Nonzero explicit padding in the batch or depth dimensions is not "
+          "supported");
+    }
+  } else if (!explicit_paddings.empty()) {
+    return errors::InvalidArgument(
+        "explicit_paddings attribute must be empty if the padding attribute is "
+        "not EXPLICIT");
+  }
+  return Status::OK();
+}
+
 string GetPaddingAttrString() { return "padding: {'SAME', 'VALID'}"; }
 
+string GetPaddingAttrStringWithExplicit() {
+  return "padding: {'SAME', 'VALID', 'EXPLICIT'}";
+}
+
+string GetExplicitPaddingsAttrString() {
+  return "explicit_paddings: list(int) = []";
+}
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/util/padding.h b/tensorflow/core/util/padding.h
index 76f9b4dd9a99e7b4e152ca0c06b9323acf84b13d..a1dd1c0bd9556935f233609683a79452f3692e06 100644
--- a/tensorflow/core/util/padding.h
+++ b/tensorflow/core/util/padding.h
@@ -20,8 +20,10 @@ limitations under the License.
 // kernels.
 
 #include <string>
+#include <vector>
 
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/util/tensor_format.h"
 
 namespace tensorflow {
 
@@ -34,16 +36,29 @@ class NodeDef;
 //   VALID: No padding is carried out.
 //   SAME: The pad value is computed so that the output will have the same
 //         dimensions as the input.
+//   EXPLICIT: The user specifies the pad values in the explicit_padding
+//             attribute.
 // The padded area is zero-filled.
 enum Padding {
-  VALID = 1,  // No padding.
-  SAME = 2,   // Input and output layers have the same size.
+  VALID = 1,     // No padding.
+  SAME = 2,      // Input and output layers have the same size.
+  EXPLICIT = 3,  // Padding is explicitly specified
 };
 
+// Returns an error if the padding attributes are invalid.
+Status CheckValidPadding(Padding padding_type,
+                         const std::vector<int64>& explicit_paddings,
+                         int num_dims, TensorFormat data_format);
+
 // Return the string containing the list of valid padding types, that can be
 // used as an Attr() in REGISTER_OP.
 string GetPaddingAttrString();
 
+// Like GetPaddingAttrString(), but also includes EXPLICIT.
+string GetPaddingAttrStringWithExplicit();
+
+string GetExplicitPaddingsAttrString();
+
 // Specialization to parse an attribute directly into a Padding enum.
 Status GetNodeAttr(const NodeDef& node_def, StringPiece attr_name,
                    Padding* value);
diff --git a/tensorflow/core/util/port.cc b/tensorflow/core/util/port.cc
index e01058dff6cd70eecece10285f485c2b36352bdd..7dc8ddda06ae77bd058e472ab375d2ed3f760437 100644
--- a/tensorflow/core/util/port.cc
+++ b/tensorflow/core/util/port.cc
@@ -15,9 +15,6 @@ limitations under the License.
 
 #include "tensorflow/core/util/port.h"
 
-#if GOOGLE_CUDA
-#include "cuda/include/cuda.h"
-#endif
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/util/presized_cuckoo_map.h b/tensorflow/core/util/presized_cuckoo_map.h
index f88ad2faaff344832d65b04357c3d8c2665ebad5..1cdde34562a7616827850fde830373350138687d 100644
--- a/tensorflow/core/util/presized_cuckoo_map.h
+++ b/tensorflow/core/util/presized_cuckoo_map.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <vector>
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/prefetch.h"
 
 namespace tensorflow {
 
@@ -132,6 +133,15 @@ class PresizedCuckooMap {
            FindInBucket(k, fast_map_to_buckets(h2(tk)), out);
   }
 
+  // Prefetch memory associated with the key k into cache levels specified by
+  // hint.
+  template <port::PrefetchHint hint = port::PREFETCH_HINT_T0>
+  void PrefetchKey(const key_type k) const {
+    const uint64 tk = key_transform(k);
+    port::prefetch<hint>(&buckets_[fast_map_to_buckets(tk)].keys);
+    port::prefetch<hint>(&buckets_[fast_map_to_buckets(h2(tk))].keys);
+  }
+
   int64 MemoryUsed() const {
     return sizeof(PresizedCuckooMap<value>) + sizeof(CuckooPathQueue);
   }
diff --git a/tensorflow/core/util/presized_cuckoo_map_test.cc b/tensorflow/core/util/presized_cuckoo_map_test.cc
index f2be1e8a2fffdd9b61839809667a858a512751d2..f2c7904b00452487ceef4a8f8a870af548e1af03 100644
--- a/tensorflow/core/util/presized_cuckoo_map_test.cc
+++ b/tensorflow/core/util/presized_cuckoo_map_test.cc
@@ -13,12 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/util/presized_cuckoo_map.h"
 #include <array>
+
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/fingerprint.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/util/presized_cuckoo_map.h"
 
 namespace tensorflow {
 namespace {
@@ -50,6 +51,14 @@ TEST(PresizedCuckooMapTest, Basic) {
   EXPECT_EQ(out, 2);
 }
 
+TEST(PresizedCuckooMapTest, Prefetch) {
+  PresizedCuckooMap<int64> pscm(2);
+  EXPECT_TRUE(pscm.InsertUnique(1, 2));
+  // Works for both present and absent keys.
+  pscm.PrefetchKey(1);
+  pscm.PrefetchKey(2);
+}
+
 TEST(PresizedCuckooMapTest, TooManyItems) {
   static constexpr int kTableSize = 1000;
   PresizedCuckooMap<int> pscm(kTableSize);
diff --git a/tensorflow/core/util/proto/BUILD b/tensorflow/core/util/proto/BUILD
index 7e549c77647529934bc6cebef1f2996af47428bb..b990f0a74918454fcdf8dff44006ef2e6a5602e1 100644
--- a/tensorflow/core/util/proto/BUILD
+++ b/tensorflow/core/util/proto/BUILD
@@ -68,5 +68,20 @@ cc_library(
     deps = [
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:platform_base",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+tf_cc_test(
+    name = "proto_utils_test",
+    srcs = ["proto_utils_test.cc"],
+    deps = [
+        ":proto_utils",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "@com_google_googletest//:gtest_main",
     ],
 )
diff --git a/tensorflow/core/util/proto/decode.h b/tensorflow/core/util/proto/decode.h
index 8dde14dffcdc5ffe4d64360f3af40521efe29bf8..188830cc1f4b58da975bf69baddb2b51d6b17e50 100644
--- a/tensorflow/core/util/proto/decode.h
+++ b/tensorflow/core/util/proto/decode.h
@@ -91,7 +91,7 @@ inline const uint8* ReadVarint64FromArray(const uint8* buffer, bool* ok,
 // the 64 bit version instead of copying the code.
 inline const uint8* ReadVarint32FromArray(const uint8* buffer, bool* ok,
                                           uint32* value) {
-  uint64 tmp;
+  uint64 tmp = 0;
   const uint8* buf = ReadVarint64FromArray(buffer, ok, &tmp);
   *value = tmp & 0xffffffff;
   return buf;
@@ -106,7 +106,7 @@ const uint8* ReadFromArray(const uint8* buf, TensorType* value);
 template <>
 inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_INT32>(
     const uint8* buf, int64* value) {
-  uint32 temp;
+  uint32 temp = 0;
   bool unused_ok;  // The Counting pass would have failed if this were corrupt.
   buf = ReadVarint32FromArray(buf, &unused_ok, &temp);
   *value = static_cast<int64>(temp);
@@ -116,7 +116,7 @@ inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_INT32>(
 template <>
 inline const uint8* ReadFromArray<int32, WireFormatLite::TYPE_INT32>(
     const uint8* buf, int32* value) {
-  uint32 temp;
+  uint32 temp = 0;
   bool unused_ok;  // The Counting pass would have failed if this were corrupt.
   buf = ReadVarint32FromArray(buf, &unused_ok, &temp);
   *value = static_cast<int32>(temp);
@@ -126,7 +126,7 @@ inline const uint8* ReadFromArray<int32, WireFormatLite::TYPE_INT32>(
 template <>
 inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_INT64>(
     const uint8* buf, int64* value) {
-  uint64 temp;
+  uint64 temp = 0;
   bool unused_ok;  // The Counting pass would have failed if this were corrupt.
   buf = ReadVarint64FromArray(buf, &unused_ok, &temp);
   *value = WrapUnsignedAsSigned64(temp);
@@ -136,7 +136,7 @@ inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_INT64>(
 template <>
 inline const uint8* ReadFromArray<uint64, WireFormatLite::TYPE_UINT32>(
     const uint8* buf, uint64* value) {
-  uint32 temp;
+  uint32 temp = 0;
   bool unused_ok;  // The Counting pass would have failed if this were corrupt.
   buf = ReadVarint32FromArray(buf, &unused_ok, &temp);
   *value = temp;
@@ -160,7 +160,7 @@ inline const uint8* ReadFromArray<uint64, WireFormatLite::TYPE_UINT64>(
 template <>
 inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_SINT32>(
     const uint8* buf, int64* value) {
-  uint64 temp;
+  uint64 temp = 0;
   bool unused_ok;  // The Counting pass would have failed if this were corrupt.
   buf = ReadVarint64FromArray(buf, &unused_ok, &temp);
   *value = WireFormatLite::ZigZagDecode32(temp);
@@ -170,7 +170,7 @@ inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_SINT32>(
 template <>
 inline const uint8* ReadFromArray<int32, WireFormatLite::TYPE_SINT32>(
     const uint8* buf, int32* value) {
-  uint32 temp;
+  uint32 temp = 0;
   bool unused_ok;  // The Counting pass would have failed if this were corrupt.
   buf = ReadVarint32FromArray(buf, &unused_ok, &temp);
   *value = WireFormatLite::ZigZagDecode32(temp);
@@ -180,7 +180,7 @@ inline const uint8* ReadFromArray<int32, WireFormatLite::TYPE_SINT32>(
 template <>
 inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_SINT64>(
     const uint8* buf, int64* value) {
-  uint64 temp;
+  uint64 temp = 0;
   bool unused_ok;  // The Counting pass would have failed if this were corrupt.
   buf = ReadVarint64FromArray(buf, &unused_ok, &temp);
   *value = WireFormatLite::ZigZagDecode64(temp);
@@ -280,7 +280,7 @@ inline const uint8* ReadFromArray<double, WireFormatLite::TYPE_DOUBLE>(
 template <>
 inline const uint8* ReadFromArray<bool, WireFormatLite::TYPE_BOOL>(
     const uint8* buf, bool* value) {
-  uint64 temp;
+  uint64 temp = 0;
   bool unused_ok;  // The Counting pass would have failed if this were corrupt.
   buf = ReadVarint64FromArray(buf, &unused_ok, &temp);
   *value = temp != 0;
@@ -290,7 +290,7 @@ inline const uint8* ReadFromArray<bool, WireFormatLite::TYPE_BOOL>(
 template <>
 inline const uint8* ReadFromArray<int, WireFormatLite::TYPE_ENUM>(
     const uint8* buf, int* value) {
-  uint32 temp;
+  uint32 temp = 0;
   bool unused_ok;  // The Counting pass would have failed if this were corrupt.
   buf = ReadVarint32FromArray(buf, &unused_ok, &temp);
   *value = static_cast<int>(temp);
diff --git a/tensorflow/core/util/proto/proto_utils.cc b/tensorflow/core/util/proto/proto_utils.cc
index 201f05a129b03bca8867a53a43886690de638579..f1064141390faba9f3d08a0a62c5459b3434e464 100644
--- a/tensorflow/core/util/proto/proto_utils.cc
+++ b/tensorflow/core/util/proto/proto_utils.cc
@@ -13,11 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/util/proto/proto_utils.h"
+
+#include "absl/strings/string_view.h"
+#include "absl/strings/substitute.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
 
-#include "tensorflow/core/util/proto/proto_utils.h"
-
 namespace tensorflow {
 namespace proto_utils {
 
@@ -66,5 +69,49 @@ bool IsCompatibleType(FieldDescriptor::Type field_type, DataType dtype) {
   }
 }
 
+Status ParseTextFormatFromString(absl::string_view input,
+                                 protobuf::Message* output) {
+  DCHECK(output != nullptr) << "output must be non NULL";
+  // When checks are disabled, instead log the error and return an error status.
+  if (output == nullptr) {
+    LOG(ERROR) << "output must be non NULL";
+    return Status(error::INVALID_ARGUMENT, "output must be non NULL");
+  }
+  string err;
+  StringErrorCollector err_collector(&err, /*one-indexing=*/true);
+  protobuf::TextFormat::Parser parser;
+  parser.RecordErrorsTo(&err_collector);
+  if (!parser.ParseFromString(string(input), output)) {
+    return Status(error::INVALID_ARGUMENT, err);
+  }
+  return Status::OK();
+}
+
+StringErrorCollector::StringErrorCollector(string* error_text)
+    : StringErrorCollector(error_text, false) {}
+
+StringErrorCollector::StringErrorCollector(string* error_text,
+                                           bool one_indexing)
+    : error_text_(error_text), index_offset_(one_indexing ? 1 : 0) {
+  DCHECK(error_text_ != nullptr) << "error_text must be non NULL";
+  // When checks are disabled, just log and then ignore added errors/warnings.
+  if (error_text_ == nullptr) {
+    LOG(ERROR) << "error_text must be non NULL";
+  }
+}
+
+void StringErrorCollector::AddError(int line, int column,
+                                    const string& message) {
+  if (error_text_ != nullptr) {
+    absl::SubstituteAndAppend(error_text_, "$0($1): $2\n", line + index_offset_,
+                              column + index_offset_, message);
+  }
+}
+
+void StringErrorCollector::AddWarning(int line, int column,
+                                      const string& message) {
+  AddError(line, column, message);
+}
+
 }  // namespace proto_utils
 }  // namespace tensorflow
diff --git a/tensorflow/core/util/proto/proto_utils.h b/tensorflow/core/util/proto/proto_utils.h
index d5e0b9006c08be349d5466c52944d5b056b9a49b..9451e317a13dec9b0c96096d9a7144263efc600f 100644
--- a/tensorflow/core/util/proto/proto_utils.h
+++ b/tensorflow/core/util/proto/proto_utils.h
@@ -16,7 +16,9 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_UTIL_PROTO_PROTO_UTILS_H_
 #define TENSORFLOW_CORE_UTIL_PROTO_PROTO_UTILS_H_
 
+#include "absl/strings/string_view.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/protobuf.h"
 
 namespace tensorflow {
@@ -27,6 +29,35 @@ using tensorflow::protobuf::FieldDescriptor;
 // Returns true if the proto field type can be converted to the tensor dtype.
 bool IsCompatibleType(FieldDescriptor::Type field_type, DataType dtype);
 
+// Parses a text-formatted protobuf from a string into the given Message* output
+// and returns status OK if valid, or INVALID_ARGUMENT with an accompanying
+// parser error message if the text format is invalid.
+Status ParseTextFormatFromString(absl::string_view input,
+                                 protobuf::Message* output);
+
+class StringErrorCollector : public protobuf::io::ErrorCollector {
+ public:
+  // String error_text is unowned and must remain valid during the use of
+  // StringErrorCollector.
+  explicit StringErrorCollector(string* error_text);
+  // If one_indexing is set to true, all line and column numbers will be
+  // increased by one for cases when provided indices are 0-indexed and
+  // 1-indexed error messages are desired
+  StringErrorCollector(string* error_text, bool one_indexing);
+  StringErrorCollector(const StringErrorCollector&) = delete;
+  StringErrorCollector& operator=(const StringErrorCollector&) = delete;
+
+  // Implementation of protobuf::io::ErrorCollector::AddError.
+  void AddError(int line, int column, const string& message) override;
+
+  // Implementation of protobuf::io::ErrorCollector::AddWarning.
+  void AddWarning(int line, int column, const string& message) override;
+
+ private:
+  string* const error_text_;
+  const int index_offset_;
+};
+
 }  // namespace proto_utils
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/util/proto/proto_utils_test.cc b/tensorflow/core/util/proto/proto_utils_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7f6312a718511d19c82d8f0f2f1f6dba495e0cb7
--- /dev/null
+++ b/tensorflow/core/util/proto/proto_utils_test.cc
@@ -0,0 +1,112 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/util/proto/proto_utils.h"
+
+#include <gmock/gmock.h>
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+
+using proto_utils::ParseTextFormatFromString;
+using proto_utils::StringErrorCollector;
+using ::testing::ContainsRegex;
+
+TEST(ParseTextFormatFromStringTest, Success) {
+  protobuf::DescriptorProto output;
+  TF_ASSERT_OK(ParseTextFormatFromString("name: \"foo\"", &output));
+  EXPECT_EQ(output.name(), "foo");
+}
+
+TEST(ParseTextFormatFromStringTest, ErrorOnInvalidSyntax) {
+  protobuf::DescriptorProto output;
+  Status status = ParseTextFormatFromString("name: foo", &output);
+  EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
+  EXPECT_THAT(status.error_message(), ContainsRegex("foo"));
+  EXPECT_FALSE(output.has_name());
+}
+
+TEST(ParseTextFormatFromStringTest, ErrorOnUnknownFieldName) {
+  protobuf::DescriptorProto output;
+  Status status = ParseTextFormatFromString("badname: \"foo\"", &output);
+  EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
+  EXPECT_THAT(status.error_message(), ContainsRegex("badname"));
+  EXPECT_FALSE(output.has_name());
+}
+
+TEST(ParseTextFormatFromStringTest, DiesOnNullOutputPointer) {
+#ifndef NDEBUG
+  ASSERT_DEATH(ParseTextFormatFromString("foo", nullptr).IgnoreError(),
+               "output.*non NULL");
+#else
+  // Under NDEBUG we don't die but should still return an error status.
+  Status status = ParseTextFormatFromString("foo", nullptr);
+  EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
+  EXPECT_THAT(status.error_message(), ContainsRegex("output.*non NULL"));
+#endif
+}
+
+TEST(StringErrorCollectorTest, AppendsError) {
+  string err;
+  StringErrorCollector collector(&err);
+  collector.AddError(1, 2, "foo");
+  EXPECT_EQ("1(2): foo\n", err);
+}
+
+TEST(StringErrorCollectorTest, AppendsWarning) {
+  string err;
+  StringErrorCollector collector(&err);
+  collector.AddWarning(1, 2, "foo");
+  EXPECT_EQ("1(2): foo\n", err);
+}
+
+TEST(StringErrorCollectorTest, AppendsMultipleError) {
+  string err;
+  StringErrorCollector collector(&err);
+  collector.AddError(1, 2, "foo");
+  collector.AddError(3, 4, "bar");
+  EXPECT_EQ("1(2): foo\n3(4): bar\n", err);
+}
+
+TEST(StringErrorCollectorTest, AppendsMultipleWarning) {
+  string err;
+  StringErrorCollector collector(&err);
+  collector.AddWarning(1, 2, "foo");
+  collector.AddWarning(3, 4, "bar");
+  EXPECT_EQ("1(2): foo\n3(4): bar\n", err);
+}
+
+TEST(StringErrorCollectorTest, OffsetWorks) {
+  string err;
+  StringErrorCollector collector(&err, true);
+  collector.AddError(1, 2, "foo");
+  collector.AddWarning(3, 4, "bar");
+  EXPECT_EQ("2(3): foo\n4(5): bar\n", err);
+}
+
+TEST(StringErrorCollectorTest, DiesOnNullErrorText) {
+#ifndef NDEBUG
+  ASSERT_DEATH(StringErrorCollector(nullptr), "error_text.*non NULL");
+#else
+  // Under NDEBUG we don't die and instead AddError/AddWarning just do nothing.
+  StringErrorCollector collector(nullptr);
+  collector.AddError(1, 2, "foo");
+  collector.AddWarning(3, 4, "bar");
+#endif
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/util/sparse/dim_comparator.h b/tensorflow/core/util/sparse/dim_comparator.h
index 0782e7e1a8af19a7936bde267c0905dc5f7d00e7..498df7a021df3e65557d96dc25577e9e24e911a6 100644
--- a/tensorflow/core/util/sparse/dim_comparator.h
+++ b/tensorflow/core/util/sparse/dim_comparator.h
@@ -17,7 +17,7 @@ limitations under the License.
 #define TENSORFLOW_CORE_UTIL_SPARSE_DIM_COMPARATOR_H_
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/core/util/sparse/sparse_tensor.h b/tensorflow/core/util/sparse/sparse_tensor.h
index 89c163aa5133fafc23b01c7153ac40d32efcaaf6..4e53c59ba364cc1daf7d8db7cd0529986a8e3094 100644
--- a/tensorflow/core/util/sparse/sparse_tensor.h
+++ b/tensorflow/core/util/sparse/sparse_tensor.h
@@ -22,11 +22,11 @@ limitations under the License.
 
 #include "absl/base/macros.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/types.pb.h"
-#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -63,7 +63,7 @@ class SparseTensor {
                                     ix.shape().dim_size(0), ", values = ",
                                     vals.shape().dim_size(0), ")"));
     }
-    int dims;
+    int dims = 0;
     TF_RETURN_IF_ERROR(GetDimsFromIx(ix, &dims));
     if (order.size() != dims) {
       return Status(error::INVALID_ARGUMENT,
diff --git a/tensorflow/core/util/stats_calculator.h b/tensorflow/core/util/stats_calculator.h
index e191737bb2c8eb85518e51b3a06884a7983a392e..5005ee08a4bf3292097820983ad85a8b56377a82 100644
--- a/tensorflow/core/util/stats_calculator.h
+++ b/tensorflow/core/util/stats_calculator.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <stdlib.h>
 
+#include <algorithm>
 #include <cmath>
 #include <limits>
 #include <map>
diff --git a/tensorflow/core/util/strided_slice_op.cc b/tensorflow/core/util/strided_slice_op.cc
index 55688e580848e42bdd453a270a530a5423fb3aec..0df810abd0058facd12e2e67625d80b824dc257b 100644
--- a/tensorflow/core/util/strided_slice_op.cc
+++ b/tensorflow/core/util/strided_slice_op.cc
@@ -16,7 +16,7 @@ limitations under the License.
 #include "tensorflow/core/util/strided_slice_op.h"
 
 #include <array>
-#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
@@ -272,7 +272,7 @@ Status ValidateStridedSliceOp(
     const std::array<int64, 2> valid_range = {
         {stride_i > 0 ? 0 : -1, stride_i > 0 ? dim_i : dim_i - 1}};
 
-    auto canonical = [stride_i, i, dim_i, masks, valid_range](int64 x, int c) {
+    auto canonical = [stride_i, dim_i, masks, valid_range](int64 x, int c) {
       if (masks[c]) {
         return stride_i > 0 ? valid_range[c] : valid_range[(c + 1) & 1];
       } else {
diff --git a/tensorflow/core/util/tensor_format.h b/tensorflow/core/util/tensor_format.h
index a296fb447e252e62809aeb17d9d00cf35ad15fc9..643e14e0b56bb152b5ca135cd4b813108b8eab16 100644
--- a/tensorflow/core/util/tensor_format.h
+++ b/tensorflow/core/util/tensor_format.h
@@ -408,18 +408,24 @@ inline int32 GetTensorDimIndex(TensorFormat format, char dimension) {
   return GetTensorDimIndex<2>(format, dimension);
 }
 
+inline int32 GetTensorDimIndex(TensorFormat format, char dimension,
+                               int num_total_dims) {
+  int32 index = (GetTensorSpatialDims(num_total_dims, format) == 3)
+                    ? GetTensorDimIndex<3>(format, dimension)
+                    : GetTensorDimIndex<2>(format, dimension);
+  CHECK(index >= 0 && index < num_total_dims)  // Crash OK.
+      << "Invalid index from the dimension: " << index << ", " << format << ", "
+      << dimension;
+  return index;
+}
+
 // Return the element from 'dimension_attributes' that corresponds to the
 // specified 'dimension' according to 'tensor_format'.
 template <typename T>
 T GetTensorDim(gtl::ArraySlice<T> dimension_attributes,
                TensorFormat tensor_format, char dimension) {
   int index =
-      (GetTensorSpatialDims(dimension_attributes.size(), tensor_format) == 3)
-          ? GetTensorDimIndex<3>(tensor_format, dimension)
-          : GetTensorDimIndex<2>(tensor_format, dimension);
-  CHECK(index >= 0 && index < dimension_attributes.size())
-      << "Invalid index from the dimension: " << index << ", " << tensor_format
-      << ", " << dimension;
+      GetTensorDimIndex(tensor_format, dimension, dimension_attributes.size());
   return dimension_attributes[index];
 }
 
@@ -476,6 +482,15 @@ inline int64 GetFilterDim(const Tensor& tensor,
   return GetFilterDim(tensor.shape(), filter_tensor_format, dimension);
 }
 
+inline void GetExplicitPaddingForDim(
+    const std::vector<int64>& explicit_paddings, TensorFormat tensor_format,
+    char dimension, int64* padding_before, int64* padding_after) {
+  int index =
+      GetTensorDimIndex(tensor_format, dimension, explicit_paddings.size() / 2);
+  *padding_before = explicit_paddings[2 * index];
+  *padding_after = explicit_paddings[2 * index + 1];
+}
+
 // Return the string that specifies the data format for convnet operations.
 string GetConvnetDataFormatAttrString();
 string GetConvnet3dDataFormatAttrString();
diff --git a/tensorflow/examples/adding_an_op/BUILD b/tensorflow/examples/adding_an_op/BUILD
index 2b39b3683f260b840b36e7f991b0d0c8e19aa18b..a4d6f204cd94f751c39ef71b23b512ccc35aa3b6 100644
--- a/tensorflow/examples/adding_an_op/BUILD
+++ b/tensorflow/examples/adding_an_op/BUILD
@@ -10,6 +10,7 @@ licenses(["notice"])  # Apache 2.0
 load("//tensorflow:tensorflow.bzl", "tf_custom_op_library")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_tests_tags")
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
+load("//tensorflow:tensorflow.bzl", "tf_exec_compatible_with")
 
 exports_files(["LICENSE"])
 
@@ -118,6 +119,7 @@ py_test(
     name = "cuda_op_test",
     size = "small",
     srcs = ["cuda_op_test.py"],
+    exec_compatible_with = tf_exec_compatible_with({"tags": tf_cuda_tests_tags()}),
     srcs_version = "PY2AND3",
     tags = tf_cuda_tests_tags() + ["notap"],
     deps = [
diff --git a/tensorflow/examples/android/BUILD b/tensorflow/examples/android/BUILD
index f5f0d7c3c852390ead414bf37260e531119e100b..5f99f0a9c90122ebe194b734af4950c0241156cd 100644
--- a/tensorflow/examples/android/BUILD
+++ b/tensorflow/examples/android/BUILD
@@ -37,8 +37,7 @@ cc_binary(
         "-lm",
         "-z defs",
         "-s",
-        "-Wl,--version-script",  # This line must be directly followed by LINKER_SCRIPT.
-        "$(location {})".format(LINKER_SCRIPT),
+        "-Wl,--version-script,$(location {})".format(LINKER_SCRIPT),
     ],
     linkshared = 1,
     linkstatic = 1,
diff --git a/tensorflow/examples/android/README.md b/tensorflow/examples/android/README.md
index 82bc3ffda9635a97af5acb8715d5b98fc10d440c..4e4e1685f6db128eb8cb09986e4924567f35ea75 100644
--- a/tensorflow/examples/android/README.md
+++ b/tensorflow/examples/android/README.md
@@ -180,7 +180,7 @@ After editing your WORKSPACE file to update the SDK/NDK configuration, you may
 build the APK. Run this from your workspace root:
 
 ```bash
-bazel build -c opt //tensorflow/examples/android:tensorflow_demo
+bazel build --cxxopt='--std=c++11' -c opt //tensorflow/examples/android:tensorflow_demo
 ```
 
 ##### Install
diff --git a/tensorflow/examples/autograph/integration_tests/keras_test.py b/tensorflow/examples/autograph/integration_tests/keras_test.py
index 3fe33df920d008845bfd1002075fd6b5dc25b31f..72b62f1ad4d7094f97b43841251c824a558da82f 100644
--- a/tensorflow/examples/autograph/integration_tests/keras_test.py
+++ b/tensorflow/examples/autograph/integration_tests/keras_test.py
@@ -87,18 +87,16 @@ class KerasTest(tf.test.TestCase):
 
   @test_util.run_deprecated_v1
   def test_recursive_true(self):
-    with self.assertRaisesRegexp(NotImplementedError,
-                                 'Object conversion is not yet supported.'):
-      with tf.Graph().as_default():
-        model = CompoundModel()
-        model.build(tf.TensorShape((None, 10, 10, 1)))
-        init = tf.global_variables_initializer()
-
-        with tf.Session() as sess:
-          self.evaluate(init)
-          sample_input = tf.random_uniform((1, 10, 10, 1))
-          output = model(sample_input)  # pylint: disable=not-callable
-          self.assertEqual(self.evaluate(output).shape, (1, 3))
+    with tf.Graph().as_default():
+      model = CompoundModel()
+      model.build(tf.TensorShape((None, 10, 10, 1)))
+      init = tf.global_variables_initializer()
+
+      with tf.Session() as sess:
+        self.evaluate(init)
+        sample_input = tf.random_uniform((1, 10, 10, 1))
+        output = model(sample_input)  # pylint: disable=not-callable
+        self.assertEqual(self.evaluate(output).shape, (1, 3))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/examples/ios/benchmark/ios_image_load.h b/tensorflow/examples/ios/benchmark/ios_image_load.h
index 3f94984692341b2d7ae975597ecdd1893486afb4..22ee785dc341bf117d458eef2f0e7dd5e170181b 100644
--- a/tensorflow/examples/ios/benchmark/ios_image_load.h
+++ b/tensorflow/examples/ios/benchmark/ios_image_load.h
@@ -17,7 +17,7 @@
 
 #include <vector>
 
-#include "tensorflow/core/framework/types.h"
+#include "third_party/tensorflow/core/framework/types.h"
 
 std::vector<tensorflow::uint8> LoadImageFromFile(const char* file_name,
                                                  int* out_width,
diff --git a/tensorflow/examples/ios/camera/CameraExampleViewController.h b/tensorflow/examples/ios/camera/CameraExampleViewController.h
index 0aefbc6eedb0f140f7c162512cf60027bbec7501..277b6e272dc34b429021abba4a3e2381a2459060 100644
--- a/tensorflow/examples/ios/camera/CameraExampleViewController.h
+++ b/tensorflow/examples/ios/camera/CameraExampleViewController.h
@@ -16,8 +16,8 @@
 #import <UIKit/UIKit.h>
 
 #include <memory>
-#include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/memmapped_file_system.h"
+#include "third_party/tensorflow/core/public/session.h"
+#include "third_party/tensorflow/core/util/memmapped_file_system.h"
 
 @interface CameraExampleViewController
     : UIViewController<UIGestureRecognizerDelegate,
diff --git a/tensorflow/examples/ios/camera/ios_image_load.h b/tensorflow/examples/ios/camera/ios_image_load.h
index f10b0b983a957bd52d5bd6dc0841d899a3196beb..991568751e9bb6acdaaf1da3f217438392575aa2 100644
--- a/tensorflow/examples/ios/camera/ios_image_load.h
+++ b/tensorflow/examples/ios/camera/ios_image_load.h
@@ -17,7 +17,7 @@
 
 #include <vector>
 
-#include "tensorflow/core/framework/types.h"
+#include "third_party/tensorflow/core/framework/types.h"
 
 std::vector<tensorflow::uint8> LoadImageFromFile(const char* file_name,
 						 int* out_width,
diff --git a/tensorflow/examples/ios/camera/tensorflow_utils.h b/tensorflow/examples/ios/camera/tensorflow_utils.h
index 78bdb82aae63d14835b99021ed6686b50777577b..33e95b185c74a7fb026ebf1495dca98a12d4e2ae 100644
--- a/tensorflow/examples/ios/camera/tensorflow_utils.h
+++ b/tensorflow/examples/ios/camera/tensorflow_utils.h
@@ -18,8 +18,8 @@
 #include <memory>
 #include <vector>
 
-#include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/memmapped_file_system.h"
+#include "third_party/tensorflow/core/public/session.h"
+#include "third_party/tensorflow/core/util/memmapped_file_system.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 
 // Reads a serialized GraphDef protobuf file from the bundle, typically
diff --git a/tensorflow/examples/ios/simple/ios_image_load.h b/tensorflow/examples/ios/simple/ios_image_load.h
index 0e0b771118b9eb5b33dcf7b9bea1a33b4873ac6d..2d2ee78e991e42fa1e21ae697c2c76606fc7639c 100644
--- a/tensorflow/examples/ios/simple/ios_image_load.h
+++ b/tensorflow/examples/ios/simple/ios_image_load.h
@@ -17,7 +17,7 @@
 
 #include <vector>
 
-#include "tensorflow/core/framework/types.h"
+#include "third_party/tensorflow/core/framework/types.h"
 
 std::vector<tensorflow::uint8> LoadImageFromFile(const char* file_name,
 						 int* out_width,
diff --git a/tensorflow/examples/learn/BUILD b/tensorflow/examples/learn/BUILD
index d6ec1f393bab82a45f0c1032670b5abed42bf6d3..a22d55e5af7630d5660a59970244357897aa1aa3 100644
--- a/tensorflow/examples/learn/BUILD
+++ b/tensorflow/examples/learn/BUILD
@@ -28,17 +28,8 @@ sh_test(
     size = "large",
     srcs = ["examples_test.sh"],
     data = [
-        ":boston",
-        ":iris",
         ":iris_custom_decay_dnn",
         ":iris_custom_model",
-        ":iris_run_config",
-        ":random_forest_mnist",
-        ":resnet",
-        ":text_classification",
-        ":text_classification_character_cnn",
-        ":text_classification_character_rnn",
-        ":text_classification_cnn",
     ],
     tags = [
         "manual",
diff --git a/tensorflow/examples/saved_model/BUILD b/tensorflow/examples/saved_model/BUILD
deleted file mode 100644
index ebefc6576d646467426a784d03f4be206aeaba38..0000000000000000000000000000000000000000
--- a/tensorflow/examples/saved_model/BUILD
+++ /dev/null
@@ -1,22 +0,0 @@
-# Description: SavedModel half plus two example.
-
-package(
-    default_visibility = ["//tensorflow:internal"],
-)
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-py_binary(
-    name = "saved_model_half_plus_two",
-    srcs = [
-        "saved_model_half_plus_two.py",
-    ],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/python:lib",
-        "//tensorflow/python/saved_model:main_op",
-    ],
-)
diff --git a/tensorflow/examples/saved_model/integration_tests/BUILD b/tensorflow/examples/saved_model/integration_tests/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..d7d8d95f66594212aaf4a9ad0655c2302643bbe4
--- /dev/null
+++ b/tensorflow/examples/saved_model/integration_tests/BUILD
@@ -0,0 +1,116 @@
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+py_binary(
+    name = "export_text_rnn_model",
+    srcs = ["export_text_rnn_model.py"],
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_binary(
+    name = "use_text_rnn_model",
+    srcs = ["use_text_rnn_model.py"],
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_binary(
+    name = "export_rnn_cell",
+    srcs = ["export_rnn_cell.py"],
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_binary(
+    name = "use_rnn_cell",
+    srcs = ["use_rnn_cell.py"],
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_binary(
+    name = "export_simple_text_embedding",
+    srcs = ["export_simple_text_embedding.py"],
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_binary(
+    name = "use_model_in_sequential_keras",
+    srcs = ["use_model_in_sequential_keras.py"],
+    deps = [
+        ":util",
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_library(
+    name = "util",
+    srcs = ["util.py"],
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_library(
+    name = "mnist_util",
+    srcs = ["mnist_util.py"],
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_binary(
+    name = "export_mnist_cnn",
+    srcs = ["export_mnist_cnn.py"],
+    deps = [
+        ":mnist_util",
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_binary(
+    name = "use_mnist_cnn",
+    srcs = ["use_mnist_cnn.py"],
+    deps = [
+        ":mnist_util",
+        ":util",
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_test(
+    name = "saved_model_test",
+    srcs = [
+        "saved_model_test.py",
+    ],
+    data = [
+        ":export_mnist_cnn",
+        ":export_rnn_cell",
+        ":export_simple_text_embedding",
+        ":export_text_rnn_model",
+        ":use_mnist_cnn",
+        ":use_model_in_sequential_keras",
+        ":use_rnn_cell",
+        ":use_text_rnn_model",
+    ],
+    shard_count = 4,
+    srcs_version = "PY2AND3",
+    tags = [
+        "noasan",  # forge input size exceeded
+        "nomsan",  # forge input size exceeded
+        "notsan",  # forge input size exceeded
+    ],
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
diff --git a/tensorflow/examples/saved_model/integration_tests/export_mnist_cnn.py b/tensorflow/examples/saved_model/integration_tests/export_mnist_cnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..68acfb481795331ba6ad87587ec55bfbfb9c2b00
--- /dev/null
+++ b/tensorflow/examples/saved_model/integration_tests/export_mnist_cnn.py
@@ -0,0 +1,189 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Exports a convolutional feature extractor for MNIST in SavedModel format.
+
+The feature extractor is a convolutional neural network plus a hidden layer
+that gets trained as part of an MNIST classifier and then written to a
+SavedModel (without the classification layer). From there, use_mnist_cnn.py
+picks it up for transfer learning.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl import app
+from absl import flags
+import tensorflow as tf
+
+from tensorflow.examples.saved_model.integration_tests import mnist_util
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_inspect
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string(
+    'export_dir', None,
+    'Directory of exported SavedModel.')
+flags.DEFINE_integer(
+    'epochs', 10,
+    'Number of epochs to train.')
+flags.DEFINE_bool(
+    'fast_test_mode', False,
+    'Shortcut training for running in unit tests.')
+flags.DEFINE_bool(
+    'export_print_hparams', False,
+    'If true, the exported function will print its effective hparams.')
+
+
+def make_feature_extractor(l2_strength, dropout_rate):
+  """Returns a Keras Model to compute a feature vector from MNIST images."""
+  regularizer = lambda: tf.keras.regularizers.l2(l2_strength)
+  net = inp = tf.keras.Input(mnist_util.INPUT_SHAPE)
+  net = tf.keras.layers.Conv2D(32, (3, 3), activation='relu', name='conv1',
+                               kernel_regularizer=regularizer())(net)
+  net = tf.keras.layers.Conv2D(64, (3, 3), activation='relu', name='conv2',
+                               kernel_regularizer=regularizer())(net)
+  net = tf.keras.layers.MaxPooling2D(pool_size=(2, 2), name='pool1')(net)
+  net = tf.keras.layers.Dropout(dropout_rate, name='dropout1')(net)
+  net = tf.keras.layers.Flatten(name='flatten')(net)
+  net = tf.keras.layers.Dense(128, activation='relu', name='dense1',
+                              kernel_regularizer=regularizer())(net)
+  return tf.keras.Model(inputs=inp, outputs=net)
+
+
+def set_feature_extractor_hparams(model, dropout_rate):
+  model.get_layer('dropout1').rate = dropout_rate
+
+
+def make_classifier(feature_extractor, l2_strength, dropout_rate=0.5):
+  """Returns a Keras Model to classify MNIST using feature_extractor."""
+  regularizer = lambda: tf.keras.regularizers.l2(l2_strength)
+  net = inp = tf.keras.Input(mnist_util.INPUT_SHAPE)
+  net = feature_extractor(net)
+  net = tf.keras.layers.Dropout(dropout_rate)(net)
+  net = tf.keras.layers.Dense(mnist_util.NUM_CLASSES, activation='softmax',
+                              kernel_regularizer=regularizer())(net)
+  return tf.keras.Model(inputs=inp, outputs=net)
+
+
+def wrap_keras_model_for_export(model, batch_input_shape,
+                                set_hparams, default_hparams):
+  """Wraps `model` for saving and loading as SavedModel."""
+  if default_hparams is None: default_hparams = {}
+  hparam_keys = list(default_hparams.keys())
+  hparam_defaults = tuple(default_hparams.values())
+  # The goal is to save a function with this argspec...
+  argspec = tf_inspect.FullArgSpec(
+      args=(['inputs', 'training'] + hparam_keys),
+      defaults=((False,) + hparam_defaults),
+      varargs=None, varkw=None,
+      kwonlyargs=[], kwonlydefaults=None,
+      annotations={})
+  # ...and this behavior:
+  def call_fn(inputs, training, *args):
+    if FLAGS.export_print_hparams:
+      args = [tf.keras.backend.print_tensor(args[i], 'training=%s and %s='
+                                            % (training, hparam_keys[i]))
+              for i in range(len(args))]
+    kwargs = dict(zip(hparam_keys, args))
+    if kwargs: set_hparams(model, **kwargs)
+    return model(inputs, training=training)
+  # We cannot spell out `args` in def statement for call_fn, but since
+  # tf.function uses tf_inspect, we can use tf_decorator to wrap it with
+  # the desired argspec.
+  def wrapped(*args, **kwargs):  # TODO(arnoegw): Can we use call_fn itself?
+    return call_fn(*args, **kwargs)
+  traced_call_fn = tf.function(autograph=False)(
+      tf_decorator.make_decorator(call_fn, wrapped, decorator_argspec=argspec))
+  # Now we need to trigger traces for
+  # - training set to Python values True or False (hence two traces),
+  # - tensor inputs of the expected nesting, shape and dtype,
+  # - tensor-valued kwargs for hparams, with caller-side defaults.
+  # Tracing with partially determined shapes requires an input signature,
+  # so we initiate tracing from a helper function with only tensor inputs.
+  @tf.function(autograph=False)
+  def trigger_traces(inputs, **kwargs):
+    return tuple(traced_call_fn(inputs, training=training, **kwargs)
+                 for training in (True, False))
+  inputs_spec = tf.TensorSpec(shape=batch_input_shape, dtype=tf.float32)
+  hparams_spec = {name: tf.TensorSpec.from_tensor(tf.constant(value))
+                  for name, value in default_hparams.items()}
+  _ = trigger_traces.get_concrete_function(inputs_spec, **hparams_spec)
+
+  # Assemble the output object.
+  obj = tf.train.Checkpoint()
+  obj.__call__ = traced_call_fn
+  obj.trainable_variables = model.trainable_variables
+  obj.variables = model.trainable_variables + model.non_trainable_variables
+  obj.regularization_losses = [_get_traced_loss(model, i)
+                               for i in range(len(model.losses))]
+  return obj
+
+
+def _get_traced_loss(model, i):
+  """Returns tf.function for model.losses[i] with a trace for zero args.
+
+  The intended usage is
+    [_get_traced_loss(model, i) for i in range(len(model.losses))]
+  This is better than
+    [tf.function(lambda: model.losses[i], input_signature=[]) for i ...]
+  because it avoids capturing a loop index in a lambda, and removes any
+  chance of deferring the trace.
+
+  Args:
+    model: a Keras Model.
+    i: an integer between from 0 up to but to len(model.losses).
+  """
+  f = tf.function(lambda: model.losses[i])
+  _ = f.get_concrete_function()
+  return f
+
+
+def main(argv):
+  del argv
+
+  # Build a complete classifier model using a feature extractor.
+  default_hparams = dict(dropout_rate=0.25)
+  l2_strength = 0.01  # Not a hparam for inputs -> outputs.
+  feature_extractor = make_feature_extractor(l2_strength=l2_strength,
+                                             **default_hparams)
+  classifier = make_classifier(feature_extractor, l2_strength=l2_strength)
+
+  # Train the complete model.
+  (x_train, y_train), (x_test, y_test) = mnist_util.load_reshaped_data(
+      fake_tiny_data=FLAGS.fast_test_mode)
+  classifier.compile(loss=tf.keras.losses.categorical_crossentropy,
+                     optimizer=tf.keras.optimizers.SGD(),
+                     metrics=['accuracy'])
+  classifier.fit(x_train, y_train,
+                 batch_size=128,
+                 epochs=FLAGS.epochs,
+                 verbose=1,
+                 validation_data=(x_test, y_test))
+
+  # Save the feature extractor to a framework-agnostic SavedModel for reuse.
+  # Note that the feature_extractor object has not been compiled or fitted,
+  # so it does not contain an optimizer and related state.
+  exportable = wrap_keras_model_for_export(feature_extractor,
+                                           (None,) + mnist_util.INPUT_SHAPE,
+                                           set_feature_extractor_hparams,
+                                           default_hparams)
+  tf.saved_model.save(exportable, FLAGS.export_dir)
+
+
+if __name__ == '__main__':
+  # tf.enable_v2_behavior()
+  app.run(main)
diff --git a/tensorflow/examples/saved_model/integration_tests/export_rnn_cell.py b/tensorflow/examples/saved_model/integration_tests/export_rnn_cell.py
new file mode 100644
index 0000000000000000000000000000000000000000..3660ed2a8a5c07355e9bd0702d800d8fe7a18510
--- /dev/null
+++ b/tensorflow/examples/saved_model/integration_tests/export_rnn_cell.py
@@ -0,0 +1,63 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Export an RNN cell in SavedModel format."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl import app
+from absl import flags
+import numpy as np
+
+import tensorflow as tf
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string("export_dir", None, "Directory to export SavedModel.")
+
+
+def main(argv):
+  del argv
+
+  root = tf.train.Checkpoint()
+  # Create a cell and attach to our trackable.
+  root.rnn_cell = tf.keras.layers.LSTMCell(units=10, recurrent_initializer=None)
+
+  # Wrap the rnn_cell.__call__ function and assign to next_state.
+  root.next_state = tf.function(root.rnn_cell.__call__, autograph=False)
+
+  # Wrap the rnn_cell.get_initial_function using a decorator and assign to an
+  # attribute with the same name.
+  @tf.function(input_signature=[tf.TensorSpec([None, None], tf.float32)])
+  def get_initial_state(tensor):
+    return root.rnn_cell.get_initial_state(tensor, None, None)
+
+  root.get_initial_state = get_initial_state
+
+  # Construct an initial_state, then call next_state explicitly to trigger a
+  # trace for serialization (we need an explicit call, because next_state has
+  # not been annotated with an input_signature).
+  initial_state = root.get_initial_state(
+      tf.constant(np.random.uniform(size=[3, 10]).astype(np.float32)))
+  root.next_state(
+      tf.constant(np.random.uniform(size=[3, 19]).astype(np.float32)),
+      initial_state)
+
+  tf.saved_model.save(root, FLAGS.export_dir)
+
+
+if __name__ == "__main__":
+  app.run(main)
diff --git a/tensorflow/examples/saved_model/integration_tests/export_simple_text_embedding.py b/tensorflow/examples/saved_model/integration_tests/export_simple_text_embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..af61fa37add3be5c6f6ee628313bf7d96ed74d43
--- /dev/null
+++ b/tensorflow/examples/saved_model/integration_tests/export_simple_text_embedding.py
@@ -0,0 +1,105 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Text embedding model stored as a SavedModel."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import tempfile
+from absl import app
+from absl import flags
+
+import tensorflow as tf
+
+# TODO(vbardiovsky): remove these when symbols are public.
+from tensorflow.python.ops import lookup_ops
+from tensorflow.python.training.tracking import tracking
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string("export_dir", None, "Directory to export SavedModel.")
+
+
+def write_vocabulary_file(vocabulary):
+  """Write temporary vocab file for module construction."""
+  tmpdir = tempfile.mkdtemp()
+  vocabulary_file = os.path.join(tmpdir, "tokens.txt")
+  with tf.io.gfile.GFile(vocabulary_file, "w") as f:
+    for entry in vocabulary:
+      f.write(entry + "\n")
+  return vocabulary_file
+
+
+class TextEmbeddingModel(tf.train.Checkpoint):
+  """Text embedding model.
+
+  A text embeddings model that takes a sentences on input and outputs the
+  sentence embedding.
+  """
+
+  def __init__(self, vocabulary, emb_dim, oov_buckets):
+    super(TextEmbeddingModel, self).__init__()
+    self._oov_buckets = oov_buckets
+    self._vocabulary_file = tracking.TrackableAsset(
+        write_vocabulary_file(vocabulary))
+    self._total_size = len(vocabulary) + oov_buckets
+    self._table = lookup_ops.index_table_from_file(
+        vocabulary_file=self._vocabulary_file,
+        num_oov_buckets=self._oov_buckets,
+        hasher_spec=lookup_ops.FastHashSpec)
+    self.embeddings = tf.Variable(
+        tf.random.uniform(shape=[self._total_size, emb_dim]))
+    self.variables = [self.embeddings]
+    self.trainable_variables = self.variables
+
+  def _tokenize(self, sentences):
+    # Perform a minimalistic text preprocessing by removing punctuation and
+    # splitting on spaces.
+    normalized_sentences = tf.strings.regex_replace(
+        input=sentences, pattern=r"\pP", rewrite="")
+    normalized_sentences = tf.reshape(normalized_sentences, [-1])
+    sparse_tokens = tf.string_split(normalized_sentences, " ")
+
+    # Deal with a corner case: there is one empty sentence.
+    sparse_tokens, _ = tf.sparse.fill_empty_rows(sparse_tokens, tf.constant(""))
+    # Deal with a corner case: all sentences are empty.
+    sparse_tokens = tf.sparse.reset_shape(sparse_tokens)
+    sparse_token_ids = self._table.lookup(sparse_tokens.values)
+
+    return (sparse_tokens.indices, sparse_token_ids, sparse_tokens.dense_shape)
+
+  @tf.function(input_signature=[tf.TensorSpec([None], tf.dtypes.string)])
+  def __call__(self, sentences):
+    token_ids, token_values, token_dense_shape = self._tokenize(sentences)
+
+    return tf.nn.safe_embedding_lookup_sparse(
+        embedding_weights=self.embeddings,
+        sparse_ids=tf.SparseTensor(token_ids, token_values, token_dense_shape),
+        sparse_weights=None,
+        combiner="sqrtn")
+
+
+def main(argv):
+  del argv
+
+  vocabulary = ["cat", "is", "on", "the", "mat"]
+  module = TextEmbeddingModel(vocabulary=vocabulary, emb_dim=10, oov_buckets=10)
+  tf.saved_model.save(module, FLAGS.export_dir)
+
+
+if __name__ == "__main__":
+  app.run(main)
diff --git a/tensorflow/examples/saved_model/integration_tests/export_text_rnn_model.py b/tensorflow/examples/saved_model/integration_tests/export_text_rnn_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..32bdb284f21ed235410aad7ad077f03c52d2ab93
--- /dev/null
+++ b/tensorflow/examples/saved_model/integration_tests/export_text_rnn_model.py
@@ -0,0 +1,193 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Text RNN model stored as a SavedModel."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl import app
+from absl import flags
+
+import tensorflow as tf
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string("export_dir", None, "Directory to export SavedModel.")
+
+
+class TextRnnModel(tf.train.Checkpoint):
+  """Text RNN model.
+
+  A full generative text RNN model that can train and decode sentences from a
+  starting word.
+  """
+
+  def __init__(self, vocab, emb_dim, buckets, state_size):
+    super(TextRnnModel, self).__init__()
+    self._buckets = buckets
+    self._lstm_cell = tf.keras.layers.LSTMCell(units=state_size)
+    self._rnn_layer = tf.keras.layers.RNN(
+        self._lstm_cell, return_sequences=True)
+    self._embeddings = tf.Variable(tf.random.uniform(shape=[buckets, emb_dim]))
+    self._logit_layer = tf.keras.layers.Dense(buckets)
+    self._set_up_vocab(vocab)
+
+  def _tokenize(self, sentences):
+    # Perform a minimalistic text preprocessing by removing punctuation and
+    # splitting on spaces.
+    normalized_sentences = tf.strings.regex_replace(
+        input=sentences, pattern=r"\pP", rewrite="")
+    sparse_tokens = tf.string_split(normalized_sentences, " ")
+
+    # Deal with a corner case: there is one empty sentence.
+    sparse_tokens, _ = tf.sparse.fill_empty_rows(sparse_tokens, tf.constant(""))
+    # Deal with a corner case: all sentences are empty.
+    sparse_tokens = tf.sparse.reset_shape(sparse_tokens)
+
+    return (sparse_tokens.indices, sparse_tokens.values,
+            sparse_tokens.dense_shape)
+
+  def _set_up_vocab(self, vocab_tokens):
+    # TODO(vbardiovsky): Currently there is no real vocabulary, because
+    # saved_model serialization does not support trackable resources. Add a real
+    # vocabulary when it does.
+    vocab_list = ["UNK"] * self._buckets
+    for vocab_token in vocab_tokens:
+      index = self._words_to_indices(vocab_token).numpy()
+      vocab_list[index] = vocab_token
+    # This is a variable representing an inverse index.
+    self._vocab_tensor = tf.Variable(vocab_list)
+
+  def _indices_to_words(self, indices):
+    return tf.gather(self._vocab_tensor, indices)
+
+  def _words_to_indices(self, words):
+    return tf.strings.to_hash_bucket(words, self._buckets)
+
+  @tf.function(input_signature=[tf.TensorSpec([None], tf.dtypes.string)])
+  def train(self, sentences):
+    token_ids, token_values, token_dense_shape = self._tokenize(sentences)
+    tokens_sparse = tf.sparse.SparseTensor(
+        indices=token_ids, values=token_values, dense_shape=token_dense_shape)
+    tokens = tf.sparse.to_dense(tokens_sparse, default_value="")
+
+    sparse_lookup_ids = tf.sparse.SparseTensor(
+        indices=tokens_sparse.indices,
+        values=self._words_to_indices(tokens_sparse.values),
+        dense_shape=tokens_sparse.dense_shape)
+    lookup_ids = tf.sparse.to_dense(sparse_lookup_ids, default_value=0)
+
+    # Targets are the next word for each word of the sentence.
+    tokens_ids_seq = lookup_ids[:, 0:-1]
+    tokens_ids_target = lookup_ids[:, 1:]
+
+    tokens_prefix = tokens[:, 0:-1]
+
+    # Mask determining which positions we care about for a loss: all positions
+    # that have a valid non-terminal token.
+    mask = tf.logical_and(
+        tf.logical_not(tf.equal(tokens_prefix, "")),
+        tf.logical_not(tf.equal(tokens_prefix, "<E>")))
+
+    input_mask = tf.cast(mask, tf.int32)
+
+    with tf.GradientTape() as t:
+      sentence_embeddings = tf.nn.embedding_lookup(self._embeddings,
+                                                   tokens_ids_seq)
+
+      lstm_initial_state = self._lstm_cell.get_initial_state(
+          sentence_embeddings)
+
+      lstm_output = self._rnn_layer(
+          inputs=sentence_embeddings, initial_state=lstm_initial_state)
+
+      # Stack LSTM outputs into a batch instead of a 2D array.
+      lstm_output = tf.reshape(lstm_output, [-1, self._lstm_cell.output_size])
+
+      logits = self._logit_layer(lstm_output)
+
+      targets = tf.reshape(tokens_ids_target, [-1])
+      weights = tf.cast(tf.reshape(input_mask, [-1]), tf.float32)
+
+      losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
+          labels=targets, logits=logits)
+
+      # Final loss is the mean loss for all token losses.
+      final_loss = tf.math.divide(
+          tf.reduce_sum(tf.multiply(losses, weights)),
+          tf.reduce_sum(weights),
+          name="final_loss")
+
+    watched = t.watched_variables()
+    gradients = t.gradient(final_loss, watched)
+
+    for w, g in zip(watched, gradients):
+      w.assign_sub(g)
+
+    return final_loss
+
+  @tf.function
+  def decode_greedy(self, sequence_length, first_word):
+    initial_state = self._lstm_cell.get_initial_state(
+        dtype=tf.float32, batch_size=1)
+
+    sequence = [first_word]
+    current_word = first_word
+    current_id = tf.expand_dims(self._words_to_indices(current_word), 0)
+    current_state = initial_state
+
+    for _ in range(sequence_length):
+      token_embeddings = tf.nn.embedding_lookup(self._embeddings, current_id)
+      lstm_outputs, current_state = self._lstm_cell(token_embeddings,
+                                                    current_state)
+      lstm_outputs = tf.reshape(lstm_outputs, [-1, self._lstm_cell.output_size])
+      logits = self._logit_layer(lstm_outputs)
+      softmax = tf.nn.softmax(logits)
+
+      next_ids = tf.math.argmax(softmax, axis=1)
+      next_words = self._indices_to_words(next_ids)[0]
+
+      current_id = next_ids
+      current_word = next_words
+      sequence.append(current_word)
+
+    return sequence
+
+
+def main(argv):
+  del argv
+
+  sentences = ["<S> hello there <E>", "<S> how are you doing today <E>"]
+  vocab = [
+      "<S>", "<E>", "hello", "there", "how", "are", "you", "doing", "today"
+  ]
+
+  module = TextRnnModel(vocab=vocab, emb_dim=10, buckets=100, state_size=128)
+
+  for _ in range(100):
+    _ = module.train(tf.constant(sentences))
+
+  # We have to call this function explicitly if we want it exported, because it
+  # has no input_signature in the @tf.function decorator.
+  decoded = module.decode_greedy(
+      sequence_length=10, first_word=tf.constant("<S>"))
+  _ = [d.numpy() for d in decoded]
+
+  tf.saved_model.save(module, FLAGS.export_dir)
+
+
+if __name__ == "__main__":
+  app.run(main)
diff --git a/tensorflow/examples/saved_model/integration_tests/mnist_util.py b/tensorflow/examples/saved_model/integration_tests/mnist_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e4ffda19e7bf5a3cf8c0418c8e23709d9dc69c7
--- /dev/null
+++ b/tensorflow/examples/saved_model/integration_tests/mnist_util.py
@@ -0,0 +1,49 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Convenience wrapper around Keras' MNIST and Fashion MNIST data."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import tensorflow as tf
+
+INPUT_SHAPE = (28, 28, 1)
+NUM_CLASSES = 10
+
+
+def load_reshaped_data(use_fashion_mnist=False, fake_tiny_data=False):
+  """Returns MNIST or Fashion MNIST train and test data."""
+  if fake_tiny_data:
+    num_fakes = 10
+    x_train = x_test = np.zeros((num_fakes, 28, 28), dtype=np.uint8)
+    y_train = y_test = np.zeros((num_fakes,), dtype=np.int64)
+  else:
+    mnist = (tf.keras.datasets.fashion_mnist if use_fashion_mnist else
+             tf.keras.datasets.mnist)
+    (x_train, y_train), (x_test, y_test) = mnist.load_data()
+  return ((_prepare_image(x_train), _prepare_label(y_train)),
+          (_prepare_image(x_test), _prepare_label(y_test)))
+
+
+def _prepare_image(x):
+  """Converts images to [n,h,w,c] format in range [0,1]."""
+  return x[..., None].astype('float32') / 255.
+
+
+def _prepare_label(y):
+  """Conerts labels to one-hot encoding."""
+  return tf.keras.utils.to_categorical(y, NUM_CLASSES)
diff --git a/tensorflow/examples/saved_model/integration_tests/saved_model_test.py b/tensorflow/examples/saved_model/integration_tests/saved_model_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..78d2d1e2ade580dff6dc3e39bc3f6e4d47b056bd
--- /dev/null
+++ b/tensorflow/examples/saved_model/integration_tests/saved_model_test.py
@@ -0,0 +1,86 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""SavedModel integration tests."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import subprocess
+
+import tensorflow as tf
+
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import resource_loader
+from tensorflow.python.platform import tf_logging as logging
+
+
+class SavedModelTest(tf.test.TestCase):
+
+  def assertCommandSucceeded(self, binary, **flags):
+    command_parts = [binary]
+    for flag_key, flag_value in flags.items():
+      command_parts.append("--%s=%s" % (flag_key, flag_value))
+
+    logging.info("Running: %s" % command_parts)
+    subprocess.check_call(
+        command_parts, env=dict(os.environ, TF2_BEHAVIOR="enabled"))
+
+  @test_util.run_v2_only
+  def test_text_rnn(self):
+    export_dir = self.get_temp_dir()
+    export_binary = resource_loader.get_path_to_datafile(
+        "export_text_rnn_model")
+    self.assertCommandSucceeded(export_binary, export_dir=export_dir)
+
+    use_binary = resource_loader.get_path_to_datafile("use_text_rnn_model")
+    self.assertCommandSucceeded(use_binary, model_dir=export_dir)
+
+  @test_util.run_v2_only
+  def test_rnn_cell(self):
+    export_dir = self.get_temp_dir()
+    export_binary = resource_loader.get_path_to_datafile(
+        "export_rnn_cell")
+    self.assertCommandSucceeded(export_binary, export_dir=export_dir)
+
+    use_binary = resource_loader.get_path_to_datafile("use_rnn_cell")
+    self.assertCommandSucceeded(use_binary, model_dir=export_dir)
+
+  @test_util.run_v2_only
+  def test_text_embedding_in_sequential_keras(self):
+    export_dir = self.get_temp_dir()
+    export_binary = resource_loader.get_path_to_datafile(
+        "export_simple_text_embedding")
+    self.assertCommandSucceeded(export_binary, export_dir=export_dir)
+
+    use_binary = resource_loader.get_path_to_datafile(
+        "use_model_in_sequential_keras")
+    self.assertCommandSucceeded(use_binary, model_dir=export_dir)
+
+  @test_util.run_v2_only
+  def test_mnist_cnn(self):
+    export_dir = self.get_temp_dir()
+    export_binary = resource_loader.get_path_to_datafile("export_mnist_cnn")
+    self.assertCommandSucceeded(export_binary, export_dir=export_dir,
+                                fast_test_mode="true")
+
+    use_binary = resource_loader.get_path_to_datafile("use_mnist_cnn")
+    self.assertCommandSucceeded(use_binary, export_dir=export_dir,
+                                fast_test_mode="true")
+
+if __name__ == "__main__":
+  # tf.enable_v2_behavior()
+  tf.test.main()
diff --git a/tensorflow/examples/saved_model/integration_tests/use_mnist_cnn.py b/tensorflow/examples/saved_model/integration_tests/use_mnist_cnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e1ca33029a928e8f6a8bad1f068d9e56e71be54
--- /dev/null
+++ b/tensorflow/examples/saved_model/integration_tests/use_mnist_cnn.py
@@ -0,0 +1,124 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Imports a convolutional feature extractor for MNIST in SavedModel format.
+
+This program picks up the SavedModel written by export_mnist_cnn.py and
+uses the feature extractor contained in it to classification on either
+classic MNIST (digits) or Fashion MNIST (thumbnails of apparel). Optionally,
+it trains the feature extractor further as part of the new classifier.
+As expected, that makes training slower but does not help much for the
+original training dataset but helps a lot for transfer to the other dataset.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl import app
+from absl import flags
+import tensorflow as tf
+
+from tensorflow.examples.saved_model.integration_tests import mnist_util
+from tensorflow.examples.saved_model.integration_tests import util
+from tensorflow.python.saved_model import load as svmd_load
+tf.saved_model.load = svmd_load.load
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string(
+    'export_dir', None,
+    'Directory of exported SavedModel.')
+flags.DEFINE_integer(
+    'epochs', 5,
+    'Number of epochs to train.')
+flags.DEFINE_bool(
+    'retrain', False,
+    'If set, the imported SavedModel is trained further.')
+flags.DEFINE_float(
+    'dropout_rate', None,
+    'If set, dropout rate passed to the SavedModel.')
+flags.DEFINE_float(
+    'regularization_loss_multiplier', None,
+    'If set, multiplier for the regularization losses in the SavedModel.')
+flags.DEFINE_bool(
+    'use_fashion_mnist', False,
+    'Use Fashion MNIST (products) instead of the real MNIST (digits). '
+    'With this, --retrain gains a lot.')
+flags.DEFINE_bool(
+    'fast_test_mode', False,
+    'Shortcut training for running in unit tests.')
+
+
+def make_classifier(feature_extractor, l2_strength=0.01, dropout_rate=0.5):
+  """Returns a Keras Model to classify MNIST using feature_extractor."""
+  regularizer = lambda: tf.keras.regularizers.l2(l2_strength)
+  net = inp = tf.keras.Input(mnist_util.INPUT_SHAPE)
+  net = feature_extractor(net)
+  net = tf.keras.layers.Dropout(dropout_rate)(net)
+  net = tf.keras.layers.Dense(mnist_util.NUM_CLASSES, activation='softmax',
+                              kernel_regularizer=regularizer())(net)
+  return tf.keras.Model(inputs=inp, outputs=net)
+
+
+def scale_regularization_losses(obj, multiplier):
+  """Scales obj.regularization_losses by multiplier if not None."""
+  if multiplier is None: return
+  def _scale_one_loss(l):  # Separate def avoids lambda capture of loop var.
+    f = tf.function(lambda: tf.multiply(multiplier, l()))
+    _ = f.get_concrete_function()
+    return f
+  obj.regularization_losses = [_scale_one_loss(l)
+                               for l in obj.regularization_losses]
+
+
+def main(argv):
+  del argv
+
+  # Load a pre-trained feature extractor and wrap it for use in Keras.
+  obj = tf.saved_model.load(FLAGS.export_dir)
+  scale_regularization_losses(obj, FLAGS.regularization_loss_multiplier)
+  arguments = {}
+  if FLAGS.dropout_rate is not None:
+    arguments['dropout_rate'] = FLAGS.dropout_rate
+  feature_extractor = util.CustomLayer(obj, output_shape=[128],
+                                       trainable=FLAGS.retrain,
+                                       arguments=arguments)
+
+  # Build a classifier with it.
+  model = make_classifier(feature_extractor)
+
+  # Train the classifier (possibly on a different dataset).
+  (x_train, y_train), (x_test, y_test) = mnist_util.load_reshaped_data(
+      use_fashion_mnist=FLAGS.use_fashion_mnist,
+      fake_tiny_data=FLAGS.fast_test_mode)
+  model.compile(loss=tf.keras.losses.categorical_crossentropy,
+                optimizer=tf.keras.optimizers.SGD(),
+                metrics=['accuracy'],
+                # TODO(arnoegw): Remove after investigating huge allocs.
+                run_eagerly=True)
+  print('Training on %s with %d trainable and %d untrainable variables.' %
+        ('Fashion MNIST' if FLAGS.use_fashion_mnist else 'MNIST',
+         len(model.trainable_variables), len(model.non_trainable_variables)))
+  model.fit(x_train, y_train,
+            batch_size=128,
+            epochs=FLAGS.epochs,
+            steps_per_epoch=3,
+            verbose=1,
+            validation_data=(x_test, y_test))
+
+
+if __name__ == '__main__':
+  # tf.enable_v2_behavior()
+  app.run(main)
diff --git a/tensorflow/examples/saved_model/integration_tests/use_model_in_sequential_keras.py b/tensorflow/examples/saved_model/integration_tests/use_model_in_sequential_keras.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c8aab8e2373fbd4ce13bb6ff22184dfefadbed7
--- /dev/null
+++ b/tensorflow/examples/saved_model/integration_tests/use_model_in_sequential_keras.py
@@ -0,0 +1,73 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Load and use text embedding module in sequential Keras."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl import app
+from absl import flags
+
+import numpy as np
+
+import tensorflow as tf
+# TODO(vbardiovsky): Remove when load symbol is public.
+from tensorflow.examples.saved_model.integration_tests import util
+from tensorflow.python.saved_model.load import load
+
+tf.saved_model.load = load
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string("model_dir", None, "Directory to load SavedModel from.")
+
+
+def train(fine_tuning):
+  """Build a Keras model and train with mock data."""
+  features = np.array(["my first sentence", "my second sentence"])
+  labels = np.array([1, 0])
+  dataset = tf.data.Dataset.from_tensor_slices((features, labels))
+
+  module = tf.saved_model.load(FLAGS.model_dir)
+
+  # Create the sequential keras model.
+  l = tf.keras.layers
+  model = tf.keras.Sequential()
+  model.add(l.Reshape((), batch_input_shape=[None, 1], dtype=tf.string))
+  model.add(util.CustomLayer(module, output_shape=[10], trainable=fine_tuning))
+  model.add(l.Dense(100, activation="relu"))
+  model.add(l.Dense(50, activation="relu"))
+  model.add(l.Dense(1, activation="sigmoid"))
+
+  model.compile(
+      optimizer="adam",
+      loss="binary_crossentropy",
+      metrics=["accuracy"],
+      # TODO(b/124446120): Remove after fixed.
+      run_eagerly=True)
+
+  model.fit_generator(generator=dataset.batch(1), epochs=5)
+
+
+def main(argv):
+  del argv
+
+  train(fine_tuning=False)
+  train(fine_tuning=True)
+
+
+if __name__ == "__main__":
+  app.run(main)
diff --git a/tensorflow/examples/saved_model/integration_tests/use_rnn_cell.py b/tensorflow/examples/saved_model/integration_tests/use_rnn_cell.py
new file mode 100644
index 0000000000000000000000000000000000000000..798033517c0d30030b3876943c726e729d43e53d
--- /dev/null
+++ b/tensorflow/examples/saved_model/integration_tests/use_rnn_cell.py
@@ -0,0 +1,49 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Load and use an RNN cell stored as a SavedModel."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl import app
+from absl import flags
+import numpy as np
+
+import tensorflow as tf
+# TODO(vbardiovsky): Remove when load is available.
+from tensorflow.python.saved_model.load import load
+
+tf.saved_model.load = load
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string("model_dir", None, "Directory to load SavedModel from.")
+
+
+def main(argv):
+  del argv
+  cell = tf.saved_model.load(FLAGS.model_dir)
+
+  initial_state = cell.get_initial_state(
+      tf.constant(np.random.uniform(size=[3, 10]).astype(np.float32)))
+
+  cell.next_state(
+      tf.constant(np.random.uniform(size=[3, 19]).astype(np.float32)),
+      initial_state)
+
+
+if __name__ == "__main__":
+  app.run(main)
diff --git a/tensorflow/examples/saved_model/integration_tests/use_text_rnn_model.py b/tensorflow/examples/saved_model/integration_tests/use_text_rnn_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f75c49a327ecf5fed2a4e5ca0957a64f58613b2
--- /dev/null
+++ b/tensorflow/examples/saved_model/integration_tests/use_text_rnn_model.py
@@ -0,0 +1,50 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Load and use RNN model stored as a SavedModel."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl import app
+from absl import flags
+
+import tensorflow as tf
+# TODO(vbardiovsky): Remove when load is available.
+from tensorflow.python.saved_model.load import load
+
+tf.saved_model.load = load
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string("model_dir", None, "Directory to load SavedModel from.")
+
+
+def main(argv):
+  del argv
+
+  sentences = [
+      "<S> sentence <E>", "<S> second sentence <E>", "<S> third sentence<E>"
+  ]
+
+  model = tf.saved_model.load(FLAGS.model_dir)
+  model.train(tf.constant(sentences))
+  decoded = model.decode_greedy(
+      sequence_length=10, first_word=tf.constant("<S>"))
+  _ = [d.numpy() for d in decoded]
+
+
+if __name__ == "__main__":
+  app.run(main)
diff --git a/tensorflow/examples/saved_model/integration_tests/util.py b/tensorflow/examples/saved_model/integration_tests/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4b5b62b7e0bf10fa61d32dd0f06903a2edf9e96
--- /dev/null
+++ b/tensorflow/examples/saved_model/integration_tests/util.py
@@ -0,0 +1,104 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for integration tests."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+
+import tensorflow as tf
+
+from tensorflow.python.framework import smart_cond
+from tensorflow.python.util import tf_inspect
+
+
+# TODO(vbardiovsky): We should just reuse Keras's Lambda layer, when that
+# enables to get trainable variables.
+class CustomLayer(tf.keras.layers.Layer):
+  """Wraps callable object as a `Layer` object.
+
+  Args:
+    func: The callable object to wrap. Layer inputs are passed as the first
+      positional argument. If `func` accepts a `training` argument, a Python
+      boolean is passed for it.
+      If present, the following attributes of `func` have a special meaning:
+        * variables: a list of all tf.Variable objects that `func` depends on.
+        * trainable_variables: those elements of `variables` that are reported
+          as trainable variables of this Keras Layer.
+        * regularization_losses: a list of callables to be added as losses
+          of this Keras layer. Each one must accept zero arguments and return
+          a scalare tensor.
+    output_shape: A tuple with the (possibly partial) output shape of `func`
+      *without* leading batch size (by analogy to Dense(..., input_shape=...)).
+    trainable: Boolean controlling whether the trainable variables of `func`
+      are reported as trainable variables of this layer.
+    arguments: optionally, a dict with additional keyword arguments passed
+      to `func`.
+  """
+
+  def __init__(self, func, output_shape, trainable=False, arguments=None,
+               **kwargs):
+    # Set self._{non,}_trainable_weights before calling Layer.__init__.
+    if hasattr(func, 'trainable_variables'):
+      self._trainable_weights = [v for v in func.trainable_variables]
+      trainable_variables_set = set(func.trainable_variables)
+    else:
+      self._trainable_weights = []
+      trainable_variables_set = set()
+    if hasattr(func, 'variables'):
+      self._non_trainable_weights = [v for v in func.variables
+                                     if v not in trainable_variables_set]
+    else:
+      self._non_trainable_weights = []  # TODO(arnoegw): Infer from `func`.
+    super(CustomLayer, self).__init__(trainable=trainable, **kwargs)
+    # Prepare to call `func`.
+    self._func = func
+    self._func_fullargspec = tf_inspect.getfullargspec(func.__call__)
+    self._func_wants_training = (
+        'training' in self._func_fullargspec.args or
+        'training' in self._func_fullargspec.kwonlyargs)
+    self._arguments = arguments or {}
+    # TODO(vbardiovsky): We should be able to get the embedding dimension from
+    # the restored model.
+    self._output_shape = tuple(output_shape)
+    # Forward the callable's regularization losses (if any).
+    if hasattr(func, 'regularization_losses'):
+      for l in func.regularization_losses:
+        if not callable(l):
+          raise ValueError(
+              'CustomLayer(func) expects func.regularization_losses to be an '
+              'iterable of callables, each returning a scalar loss term.')
+        self.add_loss(l)  # Supports callables.
+
+  def call(self, x, training=None):
+    # We basically want to call this...
+    f = functools.partial(self._func, x, **self._arguments)
+    # ...but we may also have to pass a Python boolean for `training`.
+    if not self._func_wants_training:
+      result = f()
+    else:
+      if training is None:
+        training = tf.keras.backend.learning_phase()  # Could be a tensor.
+      result = smart_cond.smart_cond(training,
+                                     lambda: f(training=True),
+                                     lambda: f(training=False))
+    # TODO(vbardiovsky): Polymorphic function should return shaped tensor.
+    result.set_shape(self.compute_output_shape(x.shape))
+    return result
+
+  def compute_output_shape(self, input_shape):
+    return (input_shape[0],) + self._output_shape
diff --git a/tensorflow/examples/saved_model/saved_model_half_plus_two.py b/tensorflow/examples/saved_model/saved_model_half_plus_two.py
deleted file mode 100644
index dfdde445404a5ec99f3d821dff6d9f217bfadefc..0000000000000000000000000000000000000000
--- a/tensorflow/examples/saved_model/saved_model_half_plus_two.py
+++ /dev/null
@@ -1,271 +0,0 @@
-## Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-r"""Exports an example linear regression inference graph.
-
-Exports a TensorFlow graph to `/tmp/saved_model/half_plus_two/` based on the
-`SavedModel` format.
-
-This graph calculates,
-
-\\(
-  y = a*x + b
-\\)
-
-and/or, independently,
-
-\\(
-  y2 = a*x2 + c
-\\)
-
-where `a`, `b` and `c` are variables with `a=0.5` and `b=2` and `c=3`.
-
-Output from this program is typically used to exercise SavedModel load and
-execution code.
-
-To create a CPU model:
-  bazel run -c opt saved_half_plus_two -- --device=cpu
-
-To create GPU model:
-  bazel run --config=cuda -c opt saved_half_plus_two -- \
-  --device=gpu
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import os
-import sys
-
-import tensorflow as tf
-
-from tensorflow.python.lib.io import file_io
-
-FLAGS = None
-
-
-def _write_assets(assets_directory, assets_filename):
-  """Writes asset files to be used with SavedModel for half plus two.
-
-  Args:
-    assets_directory: The directory to which the assets should be written.
-    assets_filename: Name of the file to which the asset contents should be
-        written.
-
-  Returns:
-    The path to which the assets file was written.
-  """
-  if not file_io.file_exists(assets_directory):
-    file_io.recursive_create_dir(assets_directory)
-
-  path = os.path.join(
-      tf.compat.as_bytes(assets_directory), tf.compat.as_bytes(assets_filename))
-  file_io.write_string_to_file(path, "asset-file-contents")
-  return path
-
-
-def _build_regression_signature(input_tensor, output_tensor):
-  """Helper function for building a regression SignatureDef."""
-  input_tensor_info = tf.saved_model.utils.build_tensor_info(input_tensor)
-  signature_inputs = {
-      tf.saved_model.signature_constants.REGRESS_INPUTS: input_tensor_info
-  }
-  output_tensor_info = tf.saved_model.utils.build_tensor_info(output_tensor)
-  signature_outputs = {
-      tf.saved_model.signature_constants.REGRESS_OUTPUTS: output_tensor_info
-  }
-  return tf.saved_model.signature_def_utils.build_signature_def(
-      signature_inputs, signature_outputs,
-      tf.saved_model.signature_constants.REGRESS_METHOD_NAME)
-
-
-# Possibly extend this to allow passing in 'classes', but for now this is
-# sufficient for testing purposes.
-def _build_classification_signature(input_tensor, scores_tensor):
-  """Helper function for building a classification SignatureDef."""
-  input_tensor_info = tf.saved_model.utils.build_tensor_info(input_tensor)
-  signature_inputs = {
-      tf.saved_model.signature_constants.CLASSIFY_INPUTS: input_tensor_info
-  }
-  output_tensor_info = tf.saved_model.utils.build_tensor_info(scores_tensor)
-  signature_outputs = {
-      tf.saved_model.signature_constants.CLASSIFY_OUTPUT_SCORES:
-          output_tensor_info
-  }
-  return tf.saved_model.signature_def_utils.build_signature_def(
-      signature_inputs, signature_outputs,
-      tf.saved_model.signature_constants.CLASSIFY_METHOD_NAME)
-
-
-def _generate_saved_model_for_half_plus_two(export_dir,
-                                            as_text=False,
-                                            use_main_op=False,
-                                            device_type="cpu"):
-  """Generates SavedModel for half plus two.
-
-  Args:
-    export_dir: The directory to which the SavedModel should be written.
-    as_text: Writes the SavedModel protocol buffer in text format to disk.
-    use_main_op: Whether to supply a main op during SavedModel build time.
-    device_name: Device to force ops to run on.
-  """
-  builder = tf.saved_model.builder.SavedModelBuilder(export_dir)
-
-  device_name = "/cpu:0"
-  if device_type == "gpu":
-    device_name = "/gpu:0"
-
-  with tf.Session(
-      graph=tf.Graph(),
-      config=tf.ConfigProto(log_device_placement=True)) as sess:
-    with tf.device(device_name):
-      # Set up the model parameters as variables to exercise variable loading
-      # functionality upon restore.
-      a = tf.Variable(0.5, name="a")
-      b = tf.Variable(2.0, name="b")
-      c = tf.Variable(3.0, name="c")
-
-      # Create a placeholder for serialized tensorflow.Example messages to be
-      # fed.
-      serialized_tf_example = tf.placeholder(tf.string, name="tf_example")
-
-      # Parse the tensorflow.Example looking for a feature named "x" with a
-      # single floating point value.
-      feature_configs = {
-          "x": tf.FixedLenFeature([1], dtype=tf.float32),
-          "x2": tf.FixedLenFeature([1], dtype=tf.float32, default_value=[0.0])
-      }
-      # parse_example only works on CPU
-      with tf.device("/cpu:0"):
-        tf_example = tf.parse_example(serialized_tf_example, feature_configs)
-      # Use tf.identity() to assign name
-      x = tf.identity(tf_example["x"], name="x")
-      y = tf.add(tf.multiply(a, x), b)
-      y = tf.identity(y, name="y")
-      y2 = tf.add(tf.multiply(a, x), c)
-      y2 = tf.identity(y2, name="y2")
-
-      x2 = tf.identity(tf_example["x2"], name="x2")
-      y3 = tf.add(tf.multiply(a, x2), c)
-      y3 = tf.identity(y3, name="y3")
-
-    # Create an assets file that can be saved and restored as part of the
-    # SavedModel.
-    original_assets_directory = "/tmp/original/export/assets"
-    original_assets_filename = "foo.txt"
-    original_assets_filepath = _write_assets(original_assets_directory,
-                                             original_assets_filename)
-
-    # Set up the assets collection.
-    assets_filepath = tf.constant(original_assets_filepath)
-    tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, assets_filepath)
-    filename_tensor = tf.Variable(
-        original_assets_filename,
-        name="filename_tensor",
-        trainable=False,
-        collections=[])
-    assign_filename_op = filename_tensor.assign(original_assets_filename)
-
-    # Set up the signature for Predict with input and output tensor
-    # specification.
-    predict_input_tensor = tf.saved_model.utils.build_tensor_info(x)
-    predict_signature_inputs = {"x": predict_input_tensor}
-
-    predict_output_tensor = tf.saved_model.utils.build_tensor_info(y)
-    predict_signature_outputs = {"y": predict_output_tensor}
-    predict_signature_def = (
-        tf.saved_model.signature_def_utils.build_signature_def(
-            predict_signature_inputs, predict_signature_outputs,
-            tf.saved_model.signature_constants.PREDICT_METHOD_NAME))
-
-    signature_def_map = {
-        "regress_x_to_y":
-            _build_regression_signature(serialized_tf_example, y),
-        "regress_x_to_y2":
-            _build_regression_signature(serialized_tf_example, y2),
-        "regress_x2_to_y3":
-            _build_regression_signature(x2, y3),
-        "classify_x_to_y":
-            _build_classification_signature(serialized_tf_example, y),
-        tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
-            predict_signature_def
-    }
-    # Initialize all variables and then save the SavedModel.
-    sess.run(tf.global_variables_initializer())
-
-    if use_main_op:
-      builder.add_meta_graph_and_variables(
-          sess, [tf.saved_model.tag_constants.SERVING],
-          signature_def_map=signature_def_map,
-          assets_collection=tf.get_collection(tf.GraphKeys.ASSET_FILEPATHS),
-          main_op=tf.group(tf.saved_model.main_op.main_op(),
-                           assign_filename_op))
-    else:
-      builder.add_meta_graph_and_variables(
-          sess, [tf.saved_model.tag_constants.SERVING],
-          signature_def_map=signature_def_map,
-          assets_collection=tf.get_collection(tf.GraphKeys.ASSET_FILEPATHS),
-          main_op=tf.group(assign_filename_op))
-  builder.save(as_text)
-
-
-def main(_):
-  _generate_saved_model_for_half_plus_two(
-      FLAGS.output_dir, device_type=FLAGS.device)
-  print("SavedModel generated for %(device)s at: %(dir)s" % {
-      "device": FLAGS.device,
-      "dir": FLAGS.output_dir
-  })
-
-  _generate_saved_model_for_half_plus_two(
-      FLAGS.output_dir_pbtxt, as_text=True, device_type=FLAGS.device)
-  print("SavedModel generated for %(device)s at: %(dir)s" % {
-      "device": FLAGS.device,
-      "dir": FLAGS.output_dir_pbtxt
-  })
-
-  _generate_saved_model_for_half_plus_two(
-      FLAGS.output_dir_main_op, use_main_op=True, device_type=FLAGS.device)
-  print("SavedModel generated for %(device)s at: %(dir)s " % {
-      "device": FLAGS.device,
-      "dir": FLAGS.output_dir_main_op
-  })
-
-
-if __name__ == "__main__":
-  parser = argparse.ArgumentParser()
-  parser.add_argument(
-      "--output_dir",
-      type=str,
-      default="/tmp/saved_model_half_plus_two",
-      help="Directory where to output SavedModel.")
-  parser.add_argument(
-      "--output_dir_pbtxt",
-      type=str,
-      default="/tmp/saved_model_half_plus_two_pbtxt",
-      help="Directory where to output the text format of SavedModel.")
-  parser.add_argument(
-      "--output_dir_main_op",
-      type=str,
-      default="/tmp/saved_model_half_plus_two_main_op",
-      help="Directory where to output the SavedModel with a main op.")
-  parser.add_argument(
-      "--device",
-      type=str,
-      default="cpu",
-      help="Force model to run on 'cpu' or 'gpu'")
-  FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/examples/speech_commands/BUILD b/tensorflow/examples/speech_commands/BUILD
index 7f3c764fac62ee11c6351e11229198fc726d3804..88f7fe7faa635339f2b0ef314a71236365902d7f 100644
--- a/tensorflow/examples/speech_commands/BUILD
+++ b/tensorflow/examples/speech_commands/BUILD
@@ -63,6 +63,13 @@ tf_py_test(
 
 py_binary(
     name = "train",
+    srcs = ["train.py"],
+    srcs_version = "PY2AND3",
+    deps = [":train_main_lib"],
+)
+
+py_library(
+    name = "train_main_lib",
     srcs = [
         "train.py",
     ],
@@ -76,8 +83,32 @@ py_binary(
     ],
 )
 
+tf_py_test(
+    name = "train_test",
+    size = "small",
+    srcs = ["train_test.py"],
+    additional_deps = [
+        ":train",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
 py_binary(
     name = "freeze",
+    srcs = ["freeze.py"],
+    srcs_version = "PY2AND3",
+    deps = [":freeze_main_lib"],
+)
+
+py_library(
+    name = "freeze_main_lib",
+    srcs = ["freeze.py"],
+    srcs_version = "PY2AND3",
+    deps = [":freeze_lib"],
+)
+
+py_library(
+    name = "freeze_lib",
     srcs = [
         "freeze.py",
     ],
@@ -103,6 +134,20 @@ tf_py_test(
 
 py_binary(
     name = "wav_to_features",
+    srcs = ["wav_to_features.py"],
+    srcs_version = "PY2AND3",
+    deps = [":wav_to_features_main_lib"],
+)
+
+py_library(
+    name = "wav_to_features_main_lib",
+    srcs = ["wav_to_features.py"],
+    srcs_version = "PY2AND3",
+    deps = [":wav_to_features_lib"],
+)
+
+py_library(
+    name = "wav_to_features_lib",
     srcs = [
         "wav_to_features.py",
     ],
@@ -128,6 +173,20 @@ tf_py_test(
 
 py_binary(
     name = "generate_streaming_test_wav",
+    srcs = ["generate_streaming_test_wav.py"],
+    srcs_version = "PY2AND3",
+    deps = [":generate_streaming_test_wav_main_lib"],
+)
+
+py_library(
+    name = "generate_streaming_test_wav_main_lib",
+    srcs = ["generate_streaming_test_wav.py"],
+    srcs_version = "PY2AND3",
+    deps = [":generate_streaming_test_wav_lib"],
+)
+
+py_library(
+    name = "generate_streaming_test_wav_lib",
     srcs = [
         "generate_streaming_test_wav.py",
     ],
@@ -168,6 +227,20 @@ tf_cc_binary(
 
 py_binary(
     name = "label_wav",
+    srcs = ["label_wav.py"],
+    srcs_version = "PY2AND3",
+    deps = [":label_wav_main_lib"],
+)
+
+py_library(
+    name = "label_wav_main_lib",
+    srcs = ["label_wav.py"],
+    srcs_version = "PY2AND3",
+    deps = [":label_wav_lib"],
+)
+
+py_library(
+    name = "label_wav_lib",
     srcs = [
         "label_wav.py",
     ],
diff --git a/tensorflow/examples/speech_commands/label_wav.py b/tensorflow/examples/speech_commands/label_wav.py
index 0017aec3a54bdcd2ddaec6a1012d629f83564827..eb8323454c23c07d5b536bbdfec30d690767a0fd 100644
--- a/tensorflow/examples/speech_commands/label_wav.py
+++ b/tensorflow/examples/speech_commands/label_wav.py
@@ -45,7 +45,7 @@ FLAGS = None
 
 def load_graph(filename):
   """Unpersists graph from file as default graph."""
-  with tf.gfile.FastGFile(filename, 'rb') as f:
+  with tf.gfile.GFile(filename, 'rb') as f:
     graph_def = tf.GraphDef()
     graph_def.ParseFromString(f.read())
     tf.import_graph_def(graph_def, name='')
diff --git a/tensorflow/examples/speech_commands/label_wav_dir.py b/tensorflow/examples/speech_commands/label_wav_dir.py
index a34db512dda86be138e07a4ffaa1963fe00a5cea..2e1890c3e864b153a4e01badf08b5b55b4377ab6 100644
--- a/tensorflow/examples/speech_commands/label_wav_dir.py
+++ b/tensorflow/examples/speech_commands/label_wav_dir.py
@@ -46,7 +46,7 @@ FLAGS = None
 
 def load_graph(filename):
   """Unpersists graph from file as default graph."""
-  with tf.gfile.FastGFile(filename, 'rb') as f:
+  with tf.gfile.GFile(filename, 'rb') as f:
     graph_def = tf.GraphDef()
     graph_def.ParseFromString(f.read())
     tf.import_graph_def(graph_def, name='')
diff --git a/tensorflow/examples/speech_commands/train_test.py b/tensorflow/examples/speech_commands/train_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..db195760e98812d224cac5b9dfe5c66d4d6a7088
--- /dev/null
+++ b/tensorflow/examples/speech_commands/train_test.py
@@ -0,0 +1,144 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for data input for speech commands."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+import tensorflow as tf
+
+from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio
+from tensorflow.examples.speech_commands import train
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import test
+
+
+# Used to convert a dictionary into an object, for mocking parsed flags.
+class DictStruct(object):
+
+  def __init__(self, **entries):
+    self.__dict__.update(entries)
+
+
+class TrainTest(test.TestCase):
+
+  def _getWavData(self):
+    with self.cached_session():
+      sample_data = tf.zeros([32000, 2])
+      wav_encoder = contrib_audio.encode_wav(sample_data, 16000)
+      wav_data = self.evaluate(wav_encoder)
+    return wav_data
+
+  def _saveTestWavFile(self, filename, wav_data):
+    with open(filename, 'wb') as f:
+      f.write(wav_data)
+
+  def _saveWavFolders(self, root_dir, labels, how_many):
+    wav_data = self._getWavData()
+    for label in labels:
+      dir_name = os.path.join(root_dir, label)
+      os.mkdir(dir_name)
+      for i in range(how_many):
+        file_path = os.path.join(dir_name, 'some_audio_%d.wav' % i)
+        self._saveTestWavFile(file_path, wav_data)
+
+  def _prepareDummyTrainingData(self):
+    tmp_dir = self.get_temp_dir()
+    wav_dir = os.path.join(tmp_dir, 'wavs')
+    os.mkdir(wav_dir)
+    self._saveWavFolders(wav_dir, ['a', 'b', 'c'], 100)
+    background_dir = os.path.join(wav_dir, '_background_noise_')
+    os.mkdir(background_dir)
+    wav_data = self._getWavData()
+    for i in range(10):
+      file_path = os.path.join(background_dir, 'background_audio_%d.wav' % i)
+      self._saveTestWavFile(file_path, wav_data)
+    return wav_dir
+
+  def _getDefaultFlags(self):
+    flags = {
+        'data_url': '',
+        'data_dir': self._prepareDummyTrainingData(),
+        'wanted_words': 'a,b,c',
+        'sample_rate': 16000,
+        'clip_duration_ms': 1000,
+        'window_size_ms': 30,
+        'window_stride_ms': 20,
+        'feature_bin_count': 40,
+        'preprocess': 'mfcc',
+        'silence_percentage': 25,
+        'unknown_percentage': 25,
+        'validation_percentage': 10,
+        'testing_percentage': 10,
+        'summaries_dir': os.path.join(self.get_temp_dir(), 'summaries'),
+        'train_dir': os.path.join(self.get_temp_dir(), 'train'),
+        'time_shift_ms': 100,
+        'how_many_training_steps': '2',
+        'learning_rate': '0.01',
+        'quantize': False,
+        'model_architecture': 'conv',
+        'check_nans': False,
+        'start_checkpoint': '',
+        'batch_size': 1,
+        'background_volume': 0.25,
+        'background_frequency': 0.8,
+        'eval_step_interval': 1,
+        'save_step_interval': 1,
+    }
+    return DictStruct(**flags)
+
+  @test_util.run_deprecated_v1
+  def testTrain(self):
+    train.FLAGS = self._getDefaultFlags()
+    train.main('')
+    self.assertTrue(
+        gfile.Exists(
+            os.path.join(train.FLAGS.train_dir,
+                         train.FLAGS.model_architecture + '.pbtxt')))
+    self.assertTrue(
+        gfile.Exists(
+            os.path.join(train.FLAGS.train_dir,
+                         train.FLAGS.model_architecture + '_labels.txt')))
+    self.assertTrue(
+        gfile.Exists(
+            os.path.join(train.FLAGS.train_dir,
+                         train.FLAGS.model_architecture + '.ckpt-1.meta')))
+
+  @test_util.run_deprecated_v1
+  def testQuantizedTrain(self):
+    train.FLAGS = self._getDefaultFlags()
+    train.FLAGS.quantize = True
+    train.FLAGS.model_architecture = 'tiny_conv'
+    train.main('')
+    self.assertTrue(
+        gfile.Exists(
+            os.path.join(train.FLAGS.train_dir,
+                         train.FLAGS.model_architecture + '.pbtxt')))
+    self.assertTrue(
+        gfile.Exists(
+            os.path.join(train.FLAGS.train_dir,
+                         train.FLAGS.model_architecture + '_labels.txt')))
+    self.assertTrue(
+        gfile.Exists(
+            os.path.join(train.FLAGS.train_dir,
+                         train.FLAGS.model_architecture + '.ckpt-1.meta')))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
index b09ee9976897fcab2e90fdc17e8030532080aca8..805ec203b489e51ef25149d9c8a2b1085461e543 100644
--- a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
+++ b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py
@@ -18,12 +18,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import argparse
 import collections
 import math
 import os
-import sys
-import argparse
 import random
+import sys
 from tempfile import gettempdir
 import zipfile
 
@@ -34,320 +34,324 @@ import tensorflow as tf
 
 from tensorflow.contrib.tensorboard.plugins import projector
 
-# Give a folder path as an argument with '--log_dir' to save
-# TensorBoard summaries. Default is a log folder in current directory.
-current_path = os.path.dirname(os.path.realpath(sys.argv[0]))
-
-parser = argparse.ArgumentParser()
-parser.add_argument(
-    '--log_dir',
-    type=str,
-    default=os.path.join(current_path, 'log'),
-    help='The log directory for TensorBoard summaries.')
-FLAGS, unparsed = parser.parse_known_args()
-
-# Create the directory for TensorBoard variables if there is not.
-if not os.path.exists(FLAGS.log_dir):
-  os.makedirs(FLAGS.log_dir)
-
-# Step 1: Download the data.
-url = 'http://mattmahoney.net/dc/'
-
-
-# pylint: disable=redefined-outer-name
-def maybe_download(filename, expected_bytes):
-  """Download a file if not present, and make sure it's the right size."""
-  local_filename = os.path.join(gettempdir(), filename)
-  if not os.path.exists(local_filename):
-    local_filename, _ = urllib.request.urlretrieve(url + filename,
-                                                   local_filename)
-  statinfo = os.stat(local_filename)
-  if statinfo.st_size == expected_bytes:
-    print('Found and verified', filename)
-  else:
-    print(statinfo.st_size)
-    raise Exception('Failed to verify ' + local_filename +
-                    '. Can you get to it with a browser?')
-  return local_filename
-
-
-filename = maybe_download('text8.zip', 31344016)
-
-
-# Read the data into a list of strings.
-def read_data(filename):
-  """Extract the first file enclosed in a zip file as a list of words."""
-  with zipfile.ZipFile(filename) as f:
-    data = tf.compat.as_str(f.read(f.namelist()[0])).split()
-  return data
-
-
-vocabulary = read_data(filename)
-print('Data size', len(vocabulary))
-
-# Step 2: Build the dictionary and replace rare words with UNK token.
-vocabulary_size = 50000
-
-
-def build_dataset(words, n_words):
-  """Process raw inputs into a dataset."""
-  count = [['UNK', -1]]
-  count.extend(collections.Counter(words).most_common(n_words - 1))
-  dictionary = dict()
-  for word, _ in count:
-    dictionary[word] = len(dictionary)
-  data = list()
-  unk_count = 0
-  for word in words:
-    index = dictionary.get(word, 0)
-    if index == 0:  # dictionary['UNK']
-      unk_count += 1
-    data.append(index)
-  count[0][1] = unk_count
-  reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
-  return data, count, dictionary, reversed_dictionary
-
-
-# Filling 4 global variables:
-# data - list of codes (integers from 0 to vocabulary_size-1).
-#   This is the original text but words are replaced by their codes
-# count - map of words(strings) to count of occurrences
-# dictionary - map of words(strings) to their codes(integers)
-# reverse_dictionary - maps codes(integers) to words(strings)
-data, count, dictionary, reverse_dictionary = build_dataset(
-    vocabulary, vocabulary_size)
-del vocabulary  # Hint to reduce memory.
-print('Most common words (+UNK)', count[:5])
-print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])
-
 data_index = 0
 
 
-# Step 3: Function to generate a training batch for the skip-gram model.
-def generate_batch(batch_size, num_skips, skip_window):
-  global data_index
-  assert batch_size % num_skips == 0
-  assert num_skips <= 2 * skip_window
-  batch = np.ndarray(shape=(batch_size), dtype=np.int32)
-  labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
-  span = 2 * skip_window + 1  # [ skip_window target skip_window ]
-  buffer = collections.deque(maxlen=span)  # pylint: disable=redefined-builtin
-  if data_index + span > len(data):
-    data_index = 0
-  buffer.extend(data[data_index:data_index + span])
-  data_index += span
-  for i in range(batch_size // num_skips):
-    context_words = [w for w in range(span) if w != skip_window]
-    words_to_use = random.sample(context_words, num_skips)
-    for j, context_word in enumerate(words_to_use):
-      batch[i * num_skips + j] = buffer[skip_window]
-      labels[i * num_skips + j, 0] = buffer[context_word]
-    if data_index == len(data):
-      buffer.extend(data[0:span])
-      data_index = span
+def word2vec_basic(log_dir):
+  """Example of building, training and visualizing a word2vec model."""
+  # Create the directory for TensorBoard variables if there is not.
+  if not os.path.exists(log_dir):
+    os.makedirs(log_dir)
+
+  # Step 1: Download the data.
+  url = 'http://mattmahoney.net/dc/'
+
+  # pylint: disable=redefined-outer-name
+  def maybe_download(filename, expected_bytes):
+    """Download a file if not present, and make sure it's the right size."""
+    local_filename = os.path.join(gettempdir(), filename)
+    if not os.path.exists(local_filename):
+      local_filename, _ = urllib.request.urlretrieve(url + filename,
+                                                     local_filename)
+    statinfo = os.stat(local_filename)
+    if statinfo.st_size == expected_bytes:
+      print('Found and verified', filename)
     else:
-      buffer.append(data[data_index])
-      data_index += 1
-  # Backtrack a little bit to avoid skipping words in the end of a batch
-  data_index = (data_index + len(data) - span) % len(data)
-  return batch, labels
-
-
-batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
-for i in range(8):
-  print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0],
-        reverse_dictionary[labels[i, 0]])
-
-# Step 4: Build and train a skip-gram model.
-
-batch_size = 128
-embedding_size = 128  # Dimension of the embedding vector.
-skip_window = 1  # How many words to consider left and right.
-num_skips = 2  # How many times to reuse an input to generate a label.
-num_sampled = 64  # Number of negative examples to sample.
-
-# We pick a random validation set to sample nearest neighbors. Here we limit the
-# validation samples to the words that have a low numeric ID, which by
-# construction are also the most frequent. These 3 variables are used only for
-# displaying model accuracy, they don't affect calculation.
-valid_size = 16  # Random set of words to evaluate similarity on.
-valid_window = 100  # Only pick dev samples in the head of the distribution.
-valid_examples = np.random.choice(valid_window, valid_size, replace=False)
-
-graph = tf.Graph()
-
-with graph.as_default():
-
-  # Input data.
-  with tf.name_scope('inputs'):
-    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
-    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
-    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
-
-  # Ops and variables pinned to the CPU because of missing GPU implementation
-  with tf.device('/cpu:0'):
-    # Look up embeddings for inputs.
-    with tf.name_scope('embeddings'):
-      embeddings = tf.Variable(
-          tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
-      embed = tf.nn.embedding_lookup(embeddings, train_inputs)
-
-    # Construct the variables for the NCE loss
-    with tf.name_scope('weights'):
-      nce_weights = tf.Variable(
-          tf.truncated_normal(
-              [vocabulary_size, embedding_size],
-              stddev=1.0 / math.sqrt(embedding_size)))
-    with tf.name_scope('biases'):
-      nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
-
-  # Compute the average NCE loss for the batch.
-  # tf.nce_loss automatically draws a new sample of the negative labels each
-  # time we evaluate the loss.
-  # Explanation of the meaning of NCE loss:
-  #   http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
-  with tf.name_scope('loss'):
-    loss = tf.reduce_mean(
-        tf.nn.nce_loss(
-            weights=nce_weights,
-            biases=nce_biases,
-            labels=train_labels,
-            inputs=embed,
-            num_sampled=num_sampled,
-            num_classes=vocabulary_size))
-
-  # Add the loss value as a scalar to summary.
-  tf.summary.scalar('loss', loss)
-
-  # Construct the SGD optimizer using a learning rate of 1.0.
-  with tf.name_scope('optimizer'):
-    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
-
-  # Compute the cosine similarity between minibatch examples and all embeddings.
-  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
-  normalized_embeddings = embeddings / norm
-  valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
-                                            valid_dataset)
-  similarity = tf.matmul(
-      valid_embeddings, normalized_embeddings, transpose_b=True)
-
-  # Merge all summaries.
-  merged = tf.summary.merge_all()
-
-  # Add variable initializer.
-  init = tf.global_variables_initializer()
-
-  # Create a saver.
-  saver = tf.train.Saver()
-
-# Step 5: Begin training.
-num_steps = 100001
-
-with tf.Session(graph=graph) as session:
-  # Open a writer to write summaries.
-  writer = tf.summary.FileWriter(FLAGS.log_dir, session.graph)
-
-  # We must initialize all variables before we use them.
-  init.run()
-  print('Initialized')
-
-  average_loss = 0
-  for step in xrange(num_steps):
-    batch_inputs, batch_labels = generate_batch(batch_size, num_skips,
-                                                skip_window)
-    feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
-
-    # Define metadata variable.
-    run_metadata = tf.RunMetadata()
-
-    # We perform one update step by evaluating the optimizer op (including it
-    # in the list of returned values for session.run()
-    # Also, evaluate the merged op to get all summaries from the returned "summary" variable.
-    # Feed metadata variable to session for visualizing the graph in TensorBoard.
-    _, summary, loss_val = session.run(
-        [optimizer, merged, loss],
-        feed_dict=feed_dict,
-        run_metadata=run_metadata)
-    average_loss += loss_val
-
-    # Add returned summaries to writer in each step.
-    writer.add_summary(summary, step)
-    # Add metadata to visualize the graph for the last run.
-    if step == (num_steps - 1):
-      writer.add_run_metadata(run_metadata, 'step%d' % step)
-
-    if step % 2000 == 0:
-      if step > 0:
-        average_loss /= 2000
-      # The average loss is an estimate of the loss over the last 2000 batches.
-      print('Average loss at step ', step, ': ', average_loss)
-      average_loss = 0
-
-    # Note that this is expensive (~20% slowdown if computed every 500 steps)
-    if step % 10000 == 0:
-      sim = similarity.eval()
-      for i in xrange(valid_size):
-        valid_word = reverse_dictionary[valid_examples[i]]
-        top_k = 8  # number of nearest neighbors
-        nearest = (-sim[i, :]).argsort()[1:top_k + 1]
-        log_str = 'Nearest to %s:' % valid_word
-        for k in xrange(top_k):
-          close_word = reverse_dictionary[nearest[k]]
-          log_str = '%s %s,' % (log_str, close_word)
-        print(log_str)
-  final_embeddings = normalized_embeddings.eval()
-
-  # Write corresponding labels for the embeddings.
-  with open(FLAGS.log_dir + '/metadata.tsv', 'w') as f:
-    for i in xrange(vocabulary_size):
-      f.write(reverse_dictionary[i] + '\n')
-
-  # Save the model for checkpoints.
-  saver.save(session, os.path.join(FLAGS.log_dir, 'model.ckpt'))
-
-  # Create a configuration for visualizing embeddings with the labels in TensorBoard.
-  config = projector.ProjectorConfig()
-  embedding_conf = config.embeddings.add()
-  embedding_conf.tensor_name = embeddings.name
-  embedding_conf.metadata_path = os.path.join(FLAGS.log_dir, 'metadata.tsv')
-  projector.visualize_embeddings(writer, config)
-
-writer.close()
-
-# Step 6: Visualize the embeddings.
-
-
-# pylint: disable=missing-docstring
-# Function to draw visualization of distance between embeddings.
-def plot_with_labels(low_dim_embs, labels, filename):
-  assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'
-  plt.figure(figsize=(18, 18))  # in inches
-  for i, label in enumerate(labels):
-    x, y = low_dim_embs[i, :]
-    plt.scatter(x, y)
-    plt.annotate(
-        label,
-        xy=(x, y),
-        xytext=(5, 2),
-        textcoords='offset points',
-        ha='right',
-        va='bottom')
-
-  plt.savefig(filename)
-
-
-try:
-  # pylint: disable=g-import-not-at-top
-  from sklearn.manifold import TSNE
-  import matplotlib.pyplot as plt
-
-  tsne = TSNE(
-      perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
-  plot_only = 500
-  low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
-  labels = [reverse_dictionary[i] for i in xrange(plot_only)]
-  plot_with_labels(low_dim_embs, labels, os.path.join(gettempdir(), 'tsne.png'))
-
-except ImportError as ex:
-  print('Please install sklearn, matplotlib, and scipy to show embeddings.')
-  print(ex)
+      print(statinfo.st_size)
+      raise Exception('Failed to verify ' + local_filename +
+                      '. Can you get to it with a browser?')
+    return local_filename
+
+  filename = maybe_download('text8.zip', 31344016)
+
+  # Read the data into a list of strings.
+  def read_data(filename):
+    """Extract the first file enclosed in a zip file as a list of words."""
+    with zipfile.ZipFile(filename) as f:
+      data = tf.compat.as_str(f.read(f.namelist()[0])).split()
+    return data
+
+  vocabulary = read_data(filename)
+  print('Data size', len(vocabulary))
+
+  # Step 2: Build the dictionary and replace rare words with UNK token.
+  vocabulary_size = 50000
+
+  def build_dataset(words, n_words):
+    """Process raw inputs into a dataset."""
+    count = [['UNK', -1]]
+    count.extend(collections.Counter(words).most_common(n_words - 1))
+    dictionary = dict()
+    for word, _ in count:
+      dictionary[word] = len(dictionary)
+    data = list()
+    unk_count = 0
+    for word in words:
+      index = dictionary.get(word, 0)
+      if index == 0:  # dictionary['UNK']
+        unk_count += 1
+      data.append(index)
+    count[0][1] = unk_count
+    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
+    return data, count, dictionary, reversed_dictionary
+
+  # Filling 4 global variables:
+  # data - list of codes (integers from 0 to vocabulary_size-1).
+  #   This is the original text but words are replaced by their codes
+  # count - map of words(strings) to count of occurrences
+  # dictionary - map of words(strings) to their codes(integers)
+  # reverse_dictionary - maps codes(integers) to words(strings)
+  data, count, unused_dictionary, reverse_dictionary = build_dataset(
+      vocabulary, vocabulary_size)
+  del vocabulary  # Hint to reduce memory.
+  print('Most common words (+UNK)', count[:5])
+  print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])
+
+  # Step 3: Function to generate a training batch for the skip-gram model.
+  def generate_batch(batch_size, num_skips, skip_window):
+    global data_index
+    assert batch_size % num_skips == 0
+    assert num_skips <= 2 * skip_window
+    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
+    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
+    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
+    buffer = collections.deque(maxlen=span)  # pylint: disable=redefined-builtin
+    if data_index + span > len(data):
+      data_index = 0
+    buffer.extend(data[data_index:data_index + span])
+    data_index += span
+    for i in range(batch_size // num_skips):
+      context_words = [w for w in range(span) if w != skip_window]
+      words_to_use = random.sample(context_words, num_skips)
+      for j, context_word in enumerate(words_to_use):
+        batch[i * num_skips + j] = buffer[skip_window]
+        labels[i * num_skips + j, 0] = buffer[context_word]
+      if data_index == len(data):
+        buffer.extend(data[0:span])
+        data_index = span
+      else:
+        buffer.append(data[data_index])
+        data_index += 1
+    # Backtrack a little bit to avoid skipping words in the end of a batch
+    data_index = (data_index + len(data) - span) % len(data)
+    return batch, labels
+
+  batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
+  for i in range(8):
+    print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0],
+          reverse_dictionary[labels[i, 0]])
+
+  # Step 4: Build and train a skip-gram model.
+
+  batch_size = 128
+  embedding_size = 128  # Dimension of the embedding vector.
+  skip_window = 1  # How many words to consider left and right.
+  num_skips = 2  # How many times to reuse an input to generate a label.
+  num_sampled = 64  # Number of negative examples to sample.
+
+  # We pick a random validation set to sample nearest neighbors. Here we limit
+  # the validation samples to the words that have a low numeric ID, which by
+  # construction are also the most frequent. These 3 variables are used only for
+  # displaying model accuracy, they don't affect calculation.
+  valid_size = 16  # Random set of words to evaluate similarity on.
+  valid_window = 100  # Only pick dev samples in the head of the distribution.
+  valid_examples = np.random.choice(valid_window, valid_size, replace=False)
+
+  graph = tf.Graph()
+
+  with graph.as_default():
+
+    # Input data.
+    with tf.name_scope('inputs'):
+      train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
+      train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
+      valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
+
+    # Ops and variables pinned to the CPU because of missing GPU implementation
+    with tf.device('/cpu:0'):
+      # Look up embeddings for inputs.
+      with tf.name_scope('embeddings'):
+        embeddings = tf.Variable(
+            tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
+        embed = tf.nn.embedding_lookup(embeddings, train_inputs)
+
+      # Construct the variables for the NCE loss
+      with tf.name_scope('weights'):
+        nce_weights = tf.Variable(
+            tf.truncated_normal([vocabulary_size, embedding_size],
+                                stddev=1.0 / math.sqrt(embedding_size)))
+      with tf.name_scope('biases'):
+        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
+
+    # Compute the average NCE loss for the batch.
+    # tf.nce_loss automatically draws a new sample of the negative labels each
+    # time we evaluate the loss.
+    # Explanation of the meaning of NCE loss:
+    #   http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
+    with tf.name_scope('loss'):
+      loss = tf.reduce_mean(
+          tf.nn.nce_loss(
+              weights=nce_weights,
+              biases=nce_biases,
+              labels=train_labels,
+              inputs=embed,
+              num_sampled=num_sampled,
+              num_classes=vocabulary_size))
+
+    # Add the loss value as a scalar to summary.
+    tf.summary.scalar('loss', loss)
+
+    # Construct the SGD optimizer using a learning rate of 1.0.
+    with tf.name_scope('optimizer'):
+      optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
+
+    # Compute the cosine similarity between minibatch examples and all
+    # embeddings.
+    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
+    normalized_embeddings = embeddings / norm
+    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
+                                              valid_dataset)
+    similarity = tf.matmul(
+        valid_embeddings, normalized_embeddings, transpose_b=True)
+
+    # Merge all summaries.
+    merged = tf.summary.merge_all()
+
+    # Add variable initializer.
+    init = tf.global_variables_initializer()
+
+    # Create a saver.
+    saver = tf.train.Saver()
+
+  # Step 5: Begin training.
+  num_steps = 100001
+
+  with tf.Session(graph=graph) as session:
+    # Open a writer to write summaries.
+    writer = tf.summary.FileWriter(log_dir, session.graph)
+
+    # We must initialize all variables before we use them.
+    init.run()
+    print('Initialized')
+
+    average_loss = 0
+    for step in xrange(num_steps):
+      batch_inputs, batch_labels = generate_batch(batch_size, num_skips,
+                                                  skip_window)
+      feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
+
+      # Define metadata variable.
+      run_metadata = tf.RunMetadata()
+
+      # We perform one update step by evaluating the optimizer op (including it
+      # in the list of returned values for session.run()
+      # Also, evaluate the merged op to get all summaries from the returned
+      # "summary" variable. Feed metadata variable to session for visualizing
+      # the graph in TensorBoard.
+      _, summary, loss_val = session.run([optimizer, merged, loss],
+                                         feed_dict=feed_dict,
+                                         run_metadata=run_metadata)
+      average_loss += loss_val
+
+      # Add returned summaries to writer in each step.
+      writer.add_summary(summary, step)
+      # Add metadata to visualize the graph for the last run.
+      if step == (num_steps - 1):
+        writer.add_run_metadata(run_metadata, 'step%d' % step)
+
+      if step % 2000 == 0:
+        if step > 0:
+          average_loss /= 2000
+        # The average loss is an estimate of the loss over the last 2000
+        # batches.
+        print('Average loss at step ', step, ': ', average_loss)
+        average_loss = 0
+
+      # Note that this is expensive (~20% slowdown if computed every 500 steps)
+      if step % 10000 == 0:
+        sim = similarity.eval()
+        for i in xrange(valid_size):
+          valid_word = reverse_dictionary[valid_examples[i]]
+          top_k = 8  # number of nearest neighbors
+          nearest = (-sim[i, :]).argsort()[1:top_k + 1]
+          log_str = 'Nearest to %s:' % valid_word
+          for k in xrange(top_k):
+            close_word = reverse_dictionary[nearest[k]]
+            log_str = '%s %s,' % (log_str, close_word)
+          print(log_str)
+    final_embeddings = normalized_embeddings.eval()
+
+    # Write corresponding labels for the embeddings.
+    with open(log_dir + '/metadata.tsv', 'w') as f:
+      for i in xrange(vocabulary_size):
+        f.write(reverse_dictionary[i] + '\n')
+
+    # Save the model for checkpoints.
+    saver.save(session, os.path.join(log_dir, 'model.ckpt'))
+
+    # Create a configuration for visualizing embeddings with the labels in
+    # TensorBoard.
+    config = projector.ProjectorConfig()
+    embedding_conf = config.embeddings.add()
+    embedding_conf.tensor_name = embeddings.name
+    embedding_conf.metadata_path = os.path.join(log_dir, 'metadata.tsv')
+    projector.visualize_embeddings(writer, config)
+
+  writer.close()
+
+  # Step 6: Visualize the embeddings.
+
+  # pylint: disable=missing-docstring
+  # Function to draw visualization of distance between embeddings.
+  def plot_with_labels(low_dim_embs, labels, filename):
+    assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'
+    plt.figure(figsize=(18, 18))  # in inches
+    for i, label in enumerate(labels):
+      x, y = low_dim_embs[i, :]
+      plt.scatter(x, y)
+      plt.annotate(
+          label,
+          xy=(x, y),
+          xytext=(5, 2),
+          textcoords='offset points',
+          ha='right',
+          va='bottom')
+
+    plt.savefig(filename)
+
+  try:
+    # pylint: disable=g-import-not-at-top
+    from sklearn.manifold import TSNE
+    import matplotlib.pyplot as plt
+
+    tsne = TSNE(
+        perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
+    plot_only = 500
+    low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
+    labels = [reverse_dictionary[i] for i in xrange(plot_only)]
+    plot_with_labels(low_dim_embs, labels, os.path.join(gettempdir(),
+                                                        'tsne.png'))
+
+  except ImportError as ex:
+    print('Please install sklearn, matplotlib, and scipy to show embeddings.')
+    print(ex)
+
+
+# All functionality is run after tf.app.run() (b/122547914). This could be split
+# up but the methods are laid sequentially with their usage for clarity.
+def main(unused_argv):
+  # Give a folder path as an argument with '--log_dir' to save
+  # TensorBoard summaries. Default is a log folder in current directory.
+  current_path = os.path.dirname(os.path.realpath(sys.argv[0]))
+
+  parser = argparse.ArgumentParser()
+  parser.add_argument(
+      '--log_dir',
+      type=str,
+      default=os.path.join(current_path, 'log'),
+      help='The log directory for TensorBoard summaries.')
+  flags, unused_flags = parser.parse_known_args()
+  word2vec_basic(flags.log_dir)
+
+if __name__ == '__main__':
+  tf.app.run()
diff --git a/tensorflow/examples/udacity/README.md b/tensorflow/examples/udacity/README.md
index c8ab24871c4168eb69363a2cc99492e542ca5bec..b3bd73a08b28c10dc66a3b0019411b82709a4264 100644
--- a/tensorflow/examples/udacity/README.md
+++ b/tensorflow/examples/udacity/README.md
@@ -121,7 +121,7 @@ History
 * 0.1.0: Initial release.
 * 0.2.0: Many fixes, including lower memory footprint and support for Python 3.
 * 0.3.0: Use 0.7.1 release.
-* 0.4.0: Move notMMNIST data for Google Cloud.
+* 0.4.0: Move notMNIST data for Google Cloud.
 * 0.5.0: Actually use 0.7.1 release.
 * 0.6.0: Update to TF 0.10.0, add libjpeg (for Pillow).
 * 1.0.0: Update to TF 1.0.0 release.
diff --git a/tensorflow/go/BUILD b/tensorflow/go/BUILD
index f16cffac99491bb45ec783ad85c0f3a6d6313fb4..62d6b4f57c244cfb17cf9f5eb3b7f68eb6bbae0c 100644
--- a/tensorflow/go/BUILD
+++ b/tensorflow/go/BUILD
@@ -17,6 +17,7 @@ sh_test(
         ":all_files",  # Go sources
         "//tensorflow:libtensorflow.so",  # C library
         "//tensorflow/c:headers",  # C library header
+        "//tensorflow/c/eager:headers",  # Eager C library header
         "//tensorflow/cc/saved_model:saved_model_half_plus_two",  # Testdata for LoadSavedModel
     ],
 )
diff --git a/tensorflow/go/attrs.go b/tensorflow/go/attrs.go
index f86c5737bc79f1e349e442669615598949ecd333..ed1a1f0b5419f7f76c6aa8ccb657e16480e85780 100644
--- a/tensorflow/go/attrs.go
+++ b/tensorflow/go/attrs.go
@@ -170,7 +170,8 @@ func listAttribute(op *Operation, cname *C.char, meta C.TF_AttrMetadata) (interf
 			}
 			// A []C.int64_t slice backed by C memory.
 			// See: https://github.com/golang/go/wiki/cgo#turning-c-arrays-into-go-slices
-			slice := (*[1 << 30]C.int64_t)(unsafe.Pointer(dim))[:numDim:numDim]
+			// Using [1<<27] instead of [1<<30] so it works on 32-bit architecture
+			slice := (*[1 << 27]C.int64_t)(unsafe.Pointer(dim))[:numDim:numDim]
 			list[i] = makeCShape(slice)
 		}
 		return list, nil
diff --git a/tensorflow/go/context.go b/tensorflow/go/context.go
new file mode 100644
index 0000000000000000000000000000000000000000..04f86282af3293482a1410242fec74945ad4d776
--- /dev/null
+++ b/tensorflow/go/context.go
@@ -0,0 +1,109 @@
+/*
+Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package tensorflow
+
+// #include <stdlib.h>
+// #include "tensorflow/c/c_api.h"
+// #include "tensorflow/c/eager/c_api.h"
+import "C"
+import (
+	"fmt"
+	"runtime"
+)
+
+// ContextOptions contains configuration information for a session
+type ContextOptions struct {
+	// Config is a binary-serialized representation of the
+	// tensorflow.ConfigProto protocol message
+	// (https://www.tensorflow.org/code/tensorflow/core/protobuf/config.proto).
+	Config []byte
+
+	// Sets the default execution mode
+	Async bool
+}
+
+// c converts the ContextOptions to the C API's TF_ContextOptions.
+// Caller takes ownership of returned object.
+func (o *ContextOptions) c() (*C.TFE_ContextOptions, error) {
+	opt := C.TFE_NewContextOptions()
+	if o == nil {
+		return opt, nil
+	}
+
+	if sz := len(o.Config); sz > 0 {
+		status := newStatus()
+		cConfig := C.CBytes(o.Config)
+		C.TFE_ContextOptionsSetConfig(opt, cConfig, C.size_t(sz), status.c)
+		C.free(cConfig)
+		if err := status.Err(); err != nil {
+			C.TFE_DeleteContextOptions(opt)
+			return nil, fmt.Errorf("invalid ContextOptions.Config: %v", err)
+		}
+	}
+
+	var async uint8
+	if o.Async {
+		async = 1
+	}
+	C.TFE_ContextOptionsSetAsync(opt, C.uchar(async))
+
+	return opt, nil
+}
+
+// Context for executing operations eagerly.
+//
+// A Context allows operations to be executed immediately. It encapsulates
+// information such as the available devices, resource manager etc. It also
+// allows the user to configure execution using a ConfigProto, as they can
+// configure a Session when executing a Graph.
+type Context struct {
+	c *C.TFE_Context
+}
+
+// NewContext creates a new context for eager execution.
+// options may be nil to use the default options.
+func NewContext(options *ContextOptions) (*Context, error) {
+	status := newStatus()
+	cOpt, err := options.c()
+	if err != nil {
+		return nil, err
+	}
+	defer C.TFE_DeleteContextOptions(cOpt)
+	cContext := C.TFE_NewContext(cOpt, status.c)
+	if err := status.Err(); err != nil {
+		return nil, err
+	}
+
+	c := &Context{c: cContext}
+	runtime.SetFinalizer(c, (*Context).finalizer)
+	return c, nil
+}
+
+func (c *Context) finalizer() {
+	C.TFE_DeleteContext(c.c)
+}
+
+// ListDevices returns the list of devices associated with a Context.
+func (c *Context) ListDevices() ([]Device, error) {
+	status := newStatus()
+	devicesList := C.TFE_ContextListDevices(c.c, status.c)
+	if err := status.Err(); err != nil {
+		return nil, fmt.Errorf("SessionListDevices() failed: %v", err)
+	}
+	defer C.TF_DeleteDeviceList(devicesList)
+	return deviceSliceFromDeviceList(devicesList)
+}
diff --git a/tensorflow/go/context_test.go b/tensorflow/go/context_test.go
new file mode 100644
index 0000000000000000000000000000000000000000..ce4005da24226c00ce6fec39b6397952d3c6ec24
--- /dev/null
+++ b/tensorflow/go/context_test.go
@@ -0,0 +1,57 @@
+/*
+Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package tensorflow
+
+import (
+	"fmt"
+	"testing"
+)
+
+func TestContextConfigSetAsync(t *testing.T) {
+	tests := []bool{false, true}
+	for _, test := range tests {
+		t.Run(fmt.Sprint(test), func(t *testing.T) {
+			opt := &ContextOptions{Async: test}
+			if _, err := NewContext(opt); err != nil {
+				t.Fatal(err)
+			}
+		})
+	}
+}
+
+func TestContextConfigListDevices(t *testing.T) {
+	c, err := NewContext(nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	devs, err := c.ListDevices()
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(devs) < 1 {
+		t.Fatalf("No devices found using ListDevices()")
+	}
+	foundCPUDevice := false
+	for _, d := range devs {
+		if d.Type == "CPU" {
+			foundCPUDevice = true
+		}
+	}
+	if !foundCPUDevice {
+		t.Error("Failed to find CPU device using ListDevices()")
+	}
+}
diff --git a/tensorflow/go/genop/internal/api_def_map.go b/tensorflow/go/genop/internal/api_def_map.go
index 8600452b476dee49292cbffe630026cf6077e22b..0bbd88b61c345906a13944aa3c7ad7b0582fffae 100644
--- a/tensorflow/go/genop/internal/api_def_map.go
+++ b/tensorflow/go/genop/internal/api_def_map.go
@@ -31,7 +31,7 @@ import (
 	"unsafe"
 
 	"github.com/golang/protobuf/proto"
-	pb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/tensorflow/core/framework_go_proto"
+	pb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/github.com/tensorflow/tensorflow/tensorflow/go/core/framework"
 )
 
 // Encapsulates a collection of API definitions.
diff --git a/tensorflow/go/genop/internal/genop.go b/tensorflow/go/genop/internal/genop.go
index fb8163121850cee36e1fcc652ca258b1fe2d42ff..1c05715a1a2f50b857c78e8c192d6c865b70e6c7 100644
--- a/tensorflow/go/genop/internal/genop.go
+++ b/tensorflow/go/genop/internal/genop.go
@@ -47,7 +47,7 @@ import (
 	"unsafe"
 
 	"github.com/golang/protobuf/proto"
-	pb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/tensorflow/core/framework_go_proto"
+	pb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/github.com/tensorflow/tensorflow/tensorflow/go/core/framework"
 )
 
 // GenerateFunctionsForRegisteredOps writes a Go source code file to w
diff --git a/tensorflow/go/genop/internal/genop_test.go b/tensorflow/go/genop/internal/genop_test.go
index d20d22e0c1502f92ade7ef5aa40985dce73b7552..acce6dea67c2e93309df70dd5009ad0dc086c523 100644
--- a/tensorflow/go/genop/internal/genop_test.go
+++ b/tensorflow/go/genop/internal/genop_test.go
@@ -22,7 +22,7 @@ import (
 	"testing"
 
 	"github.com/golang/protobuf/proto"
-	pb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/tensorflow/core/framework_go_proto"
+	pb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/github.com/tensorflow/tensorflow/tensorflow/go/core/framework"
 )
 
 // Creates an ApiDef based on opdef and applies overrides
diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 6e49fbb9eae047b4b45758165ad47a5c1923aaf6..5f07b534fa7c41b6c88627b0c20994154b64b594 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -327,12 +327,100 @@ func FakeQuantWithMinMaxArgs(scope *Scope, inputs tf.Output, optional ...FakeQua
 	return op.Output(0)
 }
 
-// Scatter `updates` into a new tensor according to `indices`.
+// Subtracts sparse `updates` from an existing tensor according to `indices`.
 //
-// Creates a new tensor by applying sparse `updates` to individual values or
-// slices within a tensor (initially zero for numeric, empty for string) of
-// the given `shape` according to indices.  This operator is the inverse of the
-// `tf.gather_nd` operator which extracts values or slices from a given tensor.
+// This operation creates a new tensor by subtracting sparse `updates` from the
+// passed in `tensor`.
+// This operation is very similar to `tf.scatter_nd_sub`, except that the updates
+// are subtracted from an existing tensor (as opposed to a variable). If the memory
+// for the existing tensor cannot be re-used, a copy is made and updated.
+//
+// `indices` is an integer tensor containing indices into a new tensor of shape
+// `shape`.  The last dimension of `indices` can be at most the rank of `shape`:
+//
+//     indices.shape[-1] <= shape.rank
+//
+// The last dimension of `indices` corresponds to indices into elements
+// (if `indices.shape[-1] = shape.rank`) or slices
+// (if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of
+// `shape`.  `updates` is a tensor with shape
+//
+//     indices.shape[:-1] + shape[indices.shape[-1]:]
+//
+// The simplest form of tensor_scatter_sub is to subtract individual elements
+// from a tensor by index. For example, say we want to insert 4 scattered elements
+// in a rank-1 tensor with 8 elements.
+//
+// In Python, this scatter subtract operation would look like this:
+//
+// ```python
+//     indices = tf.constant([[4], [3], [1], [7]])
+//     updates = tf.constant([9, 10, 11, 12])
+//     tensor = tf.ones([8], dtype=tf.int32)
+//     updated = tf.tensor_scatter_sub(tensor, indices, updates)
+//     with tf.Session() as sess:
+//       print(sess.run(scatter))
+// ```
+//
+// The resulting tensor would look like this:
+//
+//     [1, -10, 1, -9, -8, 1, 1, -11]
+//
+// We can also, insert entire slices of a higher rank tensor all at once. For
+// example, if we wanted to insert two slices in the first dimension of a
+// rank-3 tensor with two matrices of new values.
+//
+// In Python, this scatter add operation would look like this:
+//
+// ```python
+//     indices = tf.constant([[0], [2]])
+//     updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],
+//                             [7, 7, 7, 7], [8, 8, 8, 8]],
+//                            [[5, 5, 5, 5], [6, 6, 6, 6],
+//                             [7, 7, 7, 7], [8, 8, 8, 8]]])
+//     tensor = tf.ones([4, 4, 4])
+//     updated = tf.tensor_scatter_sub(tensor, indices, updates)
+//     with tf.Session() as sess:
+//       print(sess.run(scatter))
+// ```
+//
+// The resulting tensor would look like this:
+//
+//     [[[-4, -4, -4, -4], [-5, -5, -5, -5], [-6, -6, -6, -6], [-7, -7, -7, -7]],
+//      [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]],
+//      [[-4, -4, -4, -4], [-5, -5, -5, -5], [-6, -6, -6, -6], [-7, -7, -7, -7]],
+//      [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]]
+//
+// Note that on CPU, if an out of bound index is found, an error is returned.
+// On GPU, if an out of bound index is found, the index is ignored.
+//
+// Arguments:
+//	tensor: Tensor to copy/update.
+//	indices: Index tensor.
+//	updates: Updates to scatter into output.
+//
+// Returns A new tensor copied from tensor and updates subtracted according to the indices.
+func TensorScatterSub(scope *Scope, tensor tf.Output, indices tf.Output, updates tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorScatterSub",
+		Input: []tf.Input{
+			tensor, indices, updates,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Scatter `updates` into an existing tensor according to `indices`.
+//
+// This operation creates a new tensor by applying sparse `updates` to the passed
+// in `tensor`.
+// This operation is very similar to `tf.scatter_nd`, except that the updates are
+// scattered onto an existing tensor (as opposed to a zero-tensor). If the memory
+// for the existing tensor cannot be re-used, a copy is made and updated.
 //
 // If `indices` contains duplicates, then their updates are accumulated (summed).
 //
@@ -366,24 +454,20 @@ func FakeQuantWithMinMaxArgs(scope *Scope, inputs tf.Output, optional ...FakeQua
 // ```python
 //     indices = tf.constant([[4], [3], [1], [7]])
 //     updates = tf.constant([9, 10, 11, 12])
-//     shape = tf.constant([8])
-//     scatter = tf.scatter_nd(indices, updates, shape)
+//     tensor = tf.ones([8], dtype=tf.int32)
+//     updated = tf.tensor_scatter_update(tensor, indices, updates)
 //     with tf.Session() as sess:
 //       print(sess.run(scatter))
 // ```
 //
 // The resulting tensor would look like this:
 //
-//     [0, 11, 0, 10, 9, 0, 0, 12]
+//     [1, 11, 1, 10, 9, 1, 1, 12]
 //
 // We can also, insert entire slices of a higher rank tensor all at once. For
 // example, if we wanted to insert two slices in the first dimension of a
 // rank-3 tensor with two matrices of new values.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/ScatterNd2.png" alt>
-// </div>
-//
 // In Python, this scatter operation would look like this:
 //
 // ```python
@@ -392,8 +476,8 @@ func FakeQuantWithMinMaxArgs(scope *Scope, inputs tf.Output, optional ...FakeQua
 //                             [7, 7, 7, 7], [8, 8, 8, 8]],
 //                            [[5, 5, 5, 5], [6, 6, 6, 6],
 //                             [7, 7, 7, 7], [8, 8, 8, 8]]])
-//     shape = tf.constant([4, 4, 4])
-//     scatter = tf.scatter_nd(indices, updates, shape)
+//     tensor = tf.ones([4, 4, 4])
+//     updated = tf.tensor_scatter_update(tensor, indices, updates)
 //     with tf.Session() as sess:
 //       print(sess.run(scatter))
 // ```
@@ -401,153 +485,135 @@ func FakeQuantWithMinMaxArgs(scope *Scope, inputs tf.Output, optional ...FakeQua
 // The resulting tensor would look like this:
 //
 //     [[[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
-//      [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
+//      [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]],
 //      [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
-//      [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]
+//      [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]]
 //
 // Note that on CPU, if an out of bound index is found, an error is returned.
 // On GPU, if an out of bound index is found, the index is ignored.
 //
 // Arguments:
+//	tensor: Tensor to copy/update.
 //	indices: Index tensor.
 //	updates: Updates to scatter into output.
-//	shape: 1-D. The shape of the resulting tensor.
 //
 // Returns A new tensor with the given shape and updates applied according
 // to the indices.
-func ScatterNd(scope *Scope, indices tf.Output, updates tf.Output, shape tf.Output) (output tf.Output) {
+func TensorScatterUpdate(scope *Scope, tensor tf.Output, indices tf.Output, updates tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ScatterNd",
+		Type: "TensorScatterUpdate",
 		Input: []tf.Input{
-			indices, updates, shape,
+			tensor, indices, updates,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizeAndDequantizeV2Attr is an optional argument to QuantizeAndDequantizeV2.
-type QuantizeAndDequantizeV2Attr func(optionalAttr)
-
-// QuantizeAndDequantizeV2SignedInput sets the optional signed_input attribute to value.
+// Scatter `updates` into a new tensor according to `indices`.
 //
-// value: Whether the quantization is signed or unsigned. (actually this parameter should
-// have been called <b>`signed_output`</b>)
-// If not specified, defaults to true
-func QuantizeAndDequantizeV2SignedInput(value bool) QuantizeAndDequantizeV2Attr {
-	return func(m optionalAttr) {
-		m["signed_input"] = value
-	}
-}
-
-// QuantizeAndDequantizeV2NumBits sets the optional num_bits attribute to value.
+// Creates a new tensor by applying sparse `updates` to individual values or
+// slices within a tensor (initially zero for numeric, empty for string) of
+// the given `shape` according to indices.  This operator is the inverse of the
+// `tf.gather_nd` operator which extracts values or slices from a given tensor.
 //
-// value: The bitwidth of the quantization.
-// If not specified, defaults to 8
-func QuantizeAndDequantizeV2NumBits(value int64) QuantizeAndDequantizeV2Attr {
-	return func(m optionalAttr) {
-		m["num_bits"] = value
-	}
-}
-
-// QuantizeAndDequantizeV2RangeGiven sets the optional range_given attribute to value.
+// This operation is similar to tensor_scatter_add, except that the tensor is
+// zero-initialized. Calling `tf.scatter_nd(indices, values, shape)` is identical
+// to `tensor_scatter_add(tf.zeros(shape, values.dtype), indices, values)`
 //
-// value: Whether the range is given or should be determined from the `input` tensor.
-// If not specified, defaults to false
-func QuantizeAndDequantizeV2RangeGiven(value bool) QuantizeAndDequantizeV2Attr {
-	return func(m optionalAttr) {
-		m["range_given"] = value
-	}
-}
-
-// QuantizeAndDequantizeV2RoundMode sets the optional round_mode attribute to value.
-// If not specified, defaults to "HALF_TO_EVEN"
-func QuantizeAndDequantizeV2RoundMode(value string) QuantizeAndDequantizeV2Attr {
-	return func(m optionalAttr) {
-		m["round_mode"] = value
-	}
-}
-
-// Quantizes then dequantizes a tensor.
+// If `indices` contains duplicates, then their updates are accumulated (summed).
 //
-// This op simulates the precision loss from the quantized forward pass by:
+// **WARNING**: The order in which updates are applied is nondeterministic, so the
+// output will be nondeterministic if `indices` contains duplicates -- because
+// of some numerical approximation issues, numbers summed in different order
+// may yield different results.
 //
-// 1. Quantizing the tensor to fixed point numbers, which should match the target
-//    quantization method when it is used in inference.
-// 2. Dequantizing it back to floating point numbers for the following ops, most
-//    likely matmul.
+// `indices` is an integer tensor containing indices into a new tensor of shape
+// `shape`.  The last dimension of `indices` can be at most the rank of `shape`:
 //
-// There are different ways to quantize. This version uses only scaling, so 0.0
-// maps to 0.
+//     indices.shape[-1] <= shape.rank
 //
-// From the specified 'num_bits' in the quantized output type, it determines
-// minimum and maximum representable quantized values.
+// The last dimension of `indices` corresponds to indices into elements
+// (if `indices.shape[-1] = shape.rank`) or slices
+// (if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of
+// `shape`.  `updates` is a tensor with shape
 //
-// e.g.
+//     indices.shape[:-1] + shape[indices.shape[-1]:]
 //
-// *   [-128, 127] for signed, num_bits = 8, or
-// *   [0, 255] for unsigned, num_bits = 8.
+// The simplest form of scatter is to insert individual elements in a tensor by
+// index. For example, say we want to insert 4 scattered elements in a rank-1
+// tensor with 8 elements.
 //
-// If range_given == False, the initial input_min, input_max will be determined
-// automatically as the minimum and maximum values in the input tensor, otherwise
-// the specified values of input_min, input_max are used.
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/ScatterNd1.png" alt>
+// </div>
 //
-// Note: If the input_min, input_max are specified, they do not need to equal the
-// actual minimum and maximum values in the tensor. e.g. in some cases it may be
-// beneficial to specify these values such that the low probability extremes of the
-// input distribution are clipped.
+// In Python, this scatter operation would look like this:
 //
-// This op determines the maximum scale_factor that would map the initial
-// [input_min, input_max] range to a range that lies within the representable
-// quantized range.
+// ```python
+//     indices = tf.constant([[4], [3], [1], [7]])
+//     updates = tf.constant([9, 10, 11, 12])
+//     shape = tf.constant([8])
+//     scatter = tf.scatter_nd(indices, updates, shape)
+//     with tf.Session() as sess:
+//       print(sess.run(scatter))
+// ```
 //
-// It determines the scale from one of input_min and input_max, then updates the
-// other one to maximize the respresentable range.
+// The resulting tensor would look like this:
 //
-// e.g.
+//     [0, 11, 0, 10, 9, 0, 0, 12]
 //
-// *   if the output is signed, num_bits = 8, [input_min, input_max] = [-10.0,
-//     5.0]: it would use a scale_factor of -128 / -10.0 = 12.8 In this case, it
-//     would update input_max to be 127 / 12.8 = 9.921875
-// *   if the output is signed, num_bits = 8, [input_min, input_max] = [-10.0,
-//     10.0]: it would use a scale_factor of 127 / 10.0 = 12.7 In this case, it
-//     would update input_min to be 128.0 / 12.7 = -10.07874
-// *   if the output is unsigned, input_min is forced to be 0, and only the
-//     specified input_max is used.
+// We can also, insert entire slices of a higher rank tensor all at once. For
+// example, if we wanted to insert two slices in the first dimension of a
+// rank-3 tensor with two matrices of new values.
 //
-// After determining the scale_factor and updating the input range, it applies the
-// following to each value in the 'input' tensor.
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/ScatterNd2.png" alt>
+// </div>
 //
-// output = round(clamp(value, input_min, input_max) * scale_factor) / scale_factor.
+// In Python, this scatter operation would look like this:
 //
-// The above round function uses half to even rounding.
+// ```python
+//     indices = tf.constant([[0], [2]])
+//     updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],
+//                             [7, 7, 7, 7], [8, 8, 8, 8]],
+//                            [[5, 5, 5, 5], [6, 6, 6, 6],
+//                             [7, 7, 7, 7], [8, 8, 8, 8]]])
+//     shape = tf.constant([4, 4, 4])
+//     scatter = tf.scatter_nd(indices, updates, shape)
+//     with tf.Session() as sess:
+//       print(sess.run(scatter))
+// ```
 //
+// The resulting tensor would look like this:
+//
+//     [[[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
+//      [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]],
+//      [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
+//      [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]
+//
+// Note that on CPU, if an out of bound index is found, an error is returned.
+// On GPU, if an out of bound index is found, the index is ignored.
 //
 // Arguments:
-//	input: Tensor to quantize and then dequantize.
-//	input_min: If `range_given == True`, this specifies the minimum input value that needs to
-// be represented, otherwise it is determined from the min value of the `input`
-// tensor.
-//	input_max: If `range_given == True`, this specifies the maximum input value that needs to
-// be represented, otherwise it is determined from the max value of the `input`
-// tensor.
-func QuantizeAndDequantizeV2(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, optional ...QuantizeAndDequantizeV2Attr) (output tf.Output) {
+//	indices: Index tensor.
+//	updates: Updates to scatter into output.
+//	shape: 1-D. The shape of the resulting tensor.
+//
+// Returns A new tensor with the given shape and updates applied according
+// to the indices.
+func ScatterNd(scope *Scope, indices tf.Output, updates tf.Output, shape tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QuantizeAndDequantizeV2",
+		Type: "ScatterNd",
 		Input: []tf.Input{
-			input, input_min, input_max,
+			indices, updates, shape,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
@@ -583,49 +649,6 @@ func Bitcast(scope *Scope, input tf.Output, type_ tf.DataType) (output tf.Output
 	return op.Output(0)
 }
 
-// Extract `patches` from `images` and put them in the "depth" output dimension.
-//
-// Arguments:
-//	images: 4-D Tensor with shape `[batch, in_rows, in_cols, depth]`.
-//	ksizes: The size of the sliding window for each dimension of `images`.
-//	strides: 1-D of length 4. How far the centers of two consecutive patches are in
-// the images. Must be: `[1, stride_rows, stride_cols, 1]`.
-//	rates: 1-D of length 4. Must be: `[1, rate_rows, rate_cols, 1]`. This is the
-// input stride, specifying how far two consecutive patch samples are in the
-// input. Equivalent to extracting patches with
-// `patch_sizes_eff = patch_sizes + (patch_sizes - 1) * (rates - 1)`, followed by
-// subsampling them spatially by a factor of `rates`. This is equivalent to
-// `rate` in dilated (a.k.a. Atrous) convolutions.
-//	padding: The type of padding algorithm to use.
-//
-// We specify the size-related attributes as:
-//
-// ```python
-//       ksizes = [1, ksize_rows, ksize_cols, 1]
-//       strides = [1, strides_rows, strides_cols, 1]
-//       rates = [1, rates_rows, rates_cols, 1]
-// ```
-//
-// Returns 4-D Tensor with shape `[batch, out_rows, out_cols, ksize_rows *
-// ksize_cols * depth]` containing image patches with size
-// `ksize_rows x ksize_cols x depth` vectorized in the "depth" dimension. Note
-// `out_rows` and `out_cols` are the dimensions of the output patches.
-func ExtractImagePatches(scope *Scope, images tf.Output, ksizes []int64, strides []int64, rates []int64, padding string) (patches tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"ksizes": ksizes, "strides": strides, "rates": rates, "padding": padding}
-	opspec := tf.OpSpec{
-		Type: "ExtractImagePatches",
-		Input: []tf.Input{
-			images,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // SpaceToDepthAttr is an optional argument to SpaceToDepth.
 type SpaceToDepthAttr func(optionalAttr)
 
@@ -996,65 +1019,6 @@ func SpaceToBatchND(scope *Scope, input tf.Output, block_shape tf.Output, paddin
 	return op.Output(0)
 }
 
-// ListDiffAttr is an optional argument to ListDiff.
-type ListDiffAttr func(optionalAttr)
-
-// ListDiffOutIdx sets the optional out_idx attribute to value.
-// If not specified, defaults to DT_INT32
-func ListDiffOutIdx(value tf.DataType) ListDiffAttr {
-	return func(m optionalAttr) {
-		m["out_idx"] = value
-	}
-}
-
-// Computes the difference between two lists of numbers or strings.
-//
-// Given a list `x` and a list `y`, this operation returns a list `out` that
-// represents all values that are in `x` but not in `y`. The returned list `out`
-// is sorted in the same order that the numbers appear in `x` (duplicates are
-// preserved). This operation also returns a list `idx` that represents the
-// position of each `out` element in `x`. In other words:
-//
-// `out[i] = x[idx[i]] for i in [0, 1, ..., len(out) - 1]`
-//
-// For example, given this input:
-//
-// ```
-// x = [1, 2, 3, 4, 5, 6]
-// y = [1, 3, 5]
-// ```
-//
-// This operation would return:
-//
-// ```
-// out ==> [2, 4, 6]
-// idx ==> [1, 3, 5]
-// ```
-//
-// Arguments:
-//	x: 1-D. Values to keep.
-//	y: 1-D. Values to remove.
-//
-// Returns 1-D. Values present in `x` but not in `y`.1-D. Positions of `x` values preserved in `out`.
-func ListDiff(scope *Scope, x tf.Output, y tf.Output, optional ...ListDiffAttr) (out tf.Output, idx tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ListDiff",
-		Input: []tf.Input{
-			x, y,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
 // Inserts a dimension of 1 into a tensor's shape.
 //
 // Given a tensor `input`, this operation inserts a dimension of 1 at the
@@ -1404,78 +1368,6 @@ func Tile(scope *Scope, input tf.Output, multiples tf.Output) (output tf.Output)
 	return op.Output(0)
 }
 
-// StridedSliceGradAttr is an optional argument to StridedSliceGrad.
-type StridedSliceGradAttr func(optionalAttr)
-
-// StridedSliceGradBeginMask sets the optional begin_mask attribute to value.
-// If not specified, defaults to 0
-func StridedSliceGradBeginMask(value int64) StridedSliceGradAttr {
-	return func(m optionalAttr) {
-		m["begin_mask"] = value
-	}
-}
-
-// StridedSliceGradEndMask sets the optional end_mask attribute to value.
-// If not specified, defaults to 0
-func StridedSliceGradEndMask(value int64) StridedSliceGradAttr {
-	return func(m optionalAttr) {
-		m["end_mask"] = value
-	}
-}
-
-// StridedSliceGradEllipsisMask sets the optional ellipsis_mask attribute to value.
-// If not specified, defaults to 0
-func StridedSliceGradEllipsisMask(value int64) StridedSliceGradAttr {
-	return func(m optionalAttr) {
-		m["ellipsis_mask"] = value
-	}
-}
-
-// StridedSliceGradNewAxisMask sets the optional new_axis_mask attribute to value.
-// If not specified, defaults to 0
-func StridedSliceGradNewAxisMask(value int64) StridedSliceGradAttr {
-	return func(m optionalAttr) {
-		m["new_axis_mask"] = value
-	}
-}
-
-// StridedSliceGradShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
-// If not specified, defaults to 0
-func StridedSliceGradShrinkAxisMask(value int64) StridedSliceGradAttr {
-	return func(m optionalAttr) {
-		m["shrink_axis_mask"] = value
-	}
-}
-
-// Returns the gradient of `StridedSlice`.
-//
-// Since `StridedSlice` cuts out pieces of its `input` which is size
-// `shape`, its gradient will have the same shape (which is passed here
-// as `shape`). The gradient will be zero in any element that the slice
-// does not select.
-//
-// Arguments are the same as StridedSliceGrad with the exception that
-// `dy` is the input gradient to be propagated and `shape` is the
-// shape of `StridedSlice`'s `input`.
-func StridedSliceGrad(scope *Scope, shape tf.Output, begin tf.Output, end tf.Output, strides tf.Output, dy tf.Output, optional ...StridedSliceGradAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StridedSliceGrad",
-		Input: []tf.Input{
-			shape, begin, end, strides, dy,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // StridedSliceAttr is an optional argument to StridedSlice.
 type StridedSliceAttr func(optionalAttr)
 
@@ -1669,37 +1561,6 @@ func StridedSlice(scope *Scope, input tf.Output, begin tf.Output, end tf.Output,
 	return op.Output(0)
 }
 
-// Return a slice from 'input'.
-//
-// The output tensor is a tensor with dimensions described by 'size'
-// whose values are extracted from 'input' starting at the offsets in
-// 'begin'.
-//
-// *Requirements*:
-//   0 <= begin[i] <= begin[i] + size[i] <= Di  for i in [0, n)
-//
-// Arguments:
-//
-//	begin: begin[i] specifies the offset into the 'i'th dimension of
-// 'input' to slice from.
-//	size: size[i] specifies the number of elements of the 'i'th dimension
-// of 'input' to slice. If size[i] is -1, all remaining elements in dimension
-// i are included in the slice (i.e. this is equivalent to setting
-// size[i] = input.dim_size(i) - begin[i]).
-func Slice(scope *Scope, input tf.Output, begin tf.Output, size tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Slice",
-		Input: []tf.Input{
-			input, begin, size,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // SizeAttr is an optional argument to Size.
 type SizeAttr func(optionalAttr)
 
@@ -3091,30 +2952,6 @@ func InplaceSub(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Outpu
 	return op.Output(0)
 }
 
-//     Updates specified rows with values in `v`.
-//
-//     Computes `x[i, :] = v; return x`.
-//
-// Arguments:
-//	x: A tensor of type `T`.
-//	i: A vector. Indices into the left-most dimension of `x`.
-//	v: A `Tensor` of type T. Same dimension sizes as x except the first dimension, which must be the same as i's size.
-//
-// Returns A `Tensor` of type T. An alias of `x`. The content of `y` is undefined if there are duplicates in `i`.
-func InplaceUpdate(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "InplaceUpdate",
-		Input: []tf.Input{
-			x, i, v,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Makes a copy of `x`.
 //
 // Arguments:
@@ -3422,11 +3259,11 @@ func PopulationCount(scope *Scope, x tf.Output) (y tf.Output) {
 // bucketized values for a single feature.
 //
 // Arguments:
-//	float_values: float; List of Rank 2 Tensor each containing float values for a single feature.
+//	float_values: float; List of Rank 1 Tensor each containing float values for a single feature.
 //	bucket_boundaries: float; List of Rank 1 Tensors each containing the bucket boundaries for a single
 // feature.
 //
-// Returns int; List of Rank 2 Tensors each containing the bucketized values for a single feature.
+// Returns int; List of Rank 1 Tensors each containing the bucketized values for a single feature.
 func BoostedTreesBucketize(scope *Scope, float_values []tf.Output, bucket_boundaries []tf.Output) (buckets []tf.Output) {
 	if scope.Err() != nil {
 		return
@@ -3497,15 +3334,16 @@ func BoostedTreesQuantileStreamResourceFlush(scope *Scope, quantile_stream_resou
 
 // Makes the summary of quantiles for the batch.
 //
-// An op that takes a list of tensors and outputs the quantile summaries for each tensor.
+// An op that takes a list of tensors (one tensor per feature) and outputs the
+// quantile summaries for each tensor.
 //
 // Arguments:
-//	float_values: float; List of Rank 2 Tensors each containing values for a single feature.
+//	float_values: float; List of Rank 1 Tensors each containing values for a single feature.
 //	example_weights: float; Rank 1 Tensor with weights per instance.
 //	epsilon: float; The required maximum approximation error.
 //
-// Returns float; List of Rank 2 Tensors each containing the quantile summary (value, weight,
-// min_rank, max_rank) of a single feature.
+// Returns float; List of Rank 2 Tensors each containing the quantile summary
+// (value, weight, min_rank, max_rank) of a single feature.
 func BoostedTreesMakeQuantileSummaries(scope *Scope, float_values []tf.Output, example_weights tf.Output, epsilon tf.Output) (summaries []tf.Output) {
 	if scope.Err() != nil {
 		return
@@ -3806,6 +3644,70 @@ func BoostedTreesEnsembleResourceHandleOp(scope *Scope, optional ...BoostedTrees
 	return op.Output(0)
 }
 
+// Output the logits for the given input data
+//
+// Arguments:
+//	tree_handle: Handle to the tree resource.
+//	dense_features: Rank 2 dense features tensor.
+//	logits_dimension: Scalar, dimension of the logits.
+//
+// Returns The logits predictions from the tree for each instance in the batch.
+func TensorForestTreePredict(scope *Scope, tree_handle tf.Output, dense_features tf.Output, logits_dimension int64) (logits tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"logits_dimension": logits_dimension}
+	opspec := tf.OpSpec{
+		Type: "TensorForestTreePredict",
+		Input: []tf.Input{
+			tree_handle, dense_features,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Get the number of nodes in a tree
+//
+// Arguments:
+//	tree_handle: Handle to the tree resource.
+//
+// Returns The size of the tree.
+func TensorForestTreeSize(scope *Scope, tree_handle tf.Output) (tree_size tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorForestTreeSize",
+		Input: []tf.Input{
+			tree_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a tree resource and returns a handle to it.
+//
+// Arguments:
+//	tree_handle: Handle to the tree resource to be created.
+//	tree_config: Serialized proto string of the boosted_trees.Tree.
+//
+// Returns the created operation.
+func TensorForestCreateTreeVariable(scope *Scope, tree_handle tf.Output, tree_config tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorForestCreateTreeVariable",
+		Input: []tf.Input{
+			tree_handle, tree_config,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
 // ComputeAccidentalHitsAttr is an optional argument to ComputeAccidentalHits.
 type ComputeAccidentalHitsAttr func(optionalAttr)
 
@@ -4170,89 +4072,31 @@ func UniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true int6
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// GenerateVocabRemappingAttr is an optional argument to GenerateVocabRemapping.
-type GenerateVocabRemappingAttr func(optionalAttr)
-
-// GenerateVocabRemappingOldVocabSize sets the optional old_vocab_size attribute to value.
-//
-// value: Number of entries in the old vocab file to consider.  If -1,
-// use the entire old vocabulary.
-// If not specified, defaults to -1
-//
-// REQUIRES: value >= -1
-func GenerateVocabRemappingOldVocabSize(value int64) GenerateVocabRemappingAttr {
-	return func(m optionalAttr) {
-		m["old_vocab_size"] = value
-	}
-}
-
-// Given a path to new and old vocabulary files, returns a remapping Tensor of
-//
-// length `num_new_vocab`, where `remapping[i]` contains the row number in the old
-// vocabulary that corresponds to row `i` in the new vocabulary (starting at line
-// `new_vocab_offset` and up to `num_new_vocab` entities), or `-1` if entry `i`
-// in the new vocabulary is not in the old vocabulary.  The old vocabulary is
-// constrained to the first `old_vocab_size` entries if `old_vocab_size` is not the
-// default value of -1.
-//
-// `num_vocab_offset` enables
-// use in the partitioned variable case, and should generally be set through
-// examining partitioning info.  The format of the files should be a text file,
-// with each line containing a single entity within the vocabulary.
-//
-// For example, with `new_vocab_file` a text file containing each of the following
-// elements on a single line: `[f0, f1, f2, f3]`, old_vocab_file = [f1, f0, f3],
-// `num_new_vocab = 3, new_vocab_offset = 1`, the returned remapping would be
-// `[0, -1, 2]`.
-//
-// The op also returns a count of how many entries in the new vocabulary
-// were present in the old vocabulary, which is used to calculate the number of
-// values to initialize in a weight matrix remapping
-//
-// This functionality can be used to remap both row vocabularies (typically,
-// features) and column vocabularies (typically, classes) from TensorFlow
-// checkpoints.  Note that the partitioning logic relies on contiguous vocabularies
-// corresponding to div-partitioned variables.  Moreover, the underlying remapping
-// uses an IndexTable (as opposed to an inexact CuckooTable), so client code should
-// use the corresponding index_table_from_file() as the FeatureColumn framework
-// does (as opposed to tf.feature_to_id(), which uses a CuckooTable).
-//
-// Arguments:
-//	new_vocab_file: Path to the new vocab file.
-//	old_vocab_file: Path to the old vocab file.
-//	new_vocab_offset: How many entries into the new vocab file to start reading.
-//	num_new_vocab: Number of entries in the new vocab file to remap.
-//
-// Returns A Tensor of length num_new_vocab where the element at index i
-// is equal to the old ID that maps to the new ID i.  This element is -1 for any
-// new ID that is not found in the old vocabulary.Number of new vocab entries found in old vocab.
-func GenerateVocabRemapping(scope *Scope, new_vocab_file tf.Output, old_vocab_file tf.Output, new_vocab_offset int64, num_new_vocab int64, optional ...GenerateVocabRemappingAttr) (remapping tf.Output, num_present tf.Output) {
+// Broadcasts a tensor value to one or more other devices.
+func CollectiveBcastSend(scope *Scope, input tf.Output, group_size int64, group_key int64, instance_key int64, shape tf.Shape) (data tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"new_vocab_offset": new_vocab_offset, "num_new_vocab": num_new_vocab}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"group_size": group_size, "group_key": group_key, "instance_key": instance_key, "shape": shape}
 	opspec := tf.OpSpec{
-		Type: "GenerateVocabRemapping",
+		Type: "CollectiveBcastSend",
 		Input: []tf.Input{
-			new_vocab_file, old_vocab_file,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Broadcasts a tensor value to one or more other devices.
-func CollectiveBcastSend(scope *Scope, input tf.Output, group_size int64, group_key int64, instance_key int64, shape tf.Shape) (data tf.Output) {
+// Mutually accumulates multiple tensors of identical type and shape.
+func CollectiveGather(scope *Scope, input tf.Output, group_size int64, group_key int64, instance_key int64, shape tf.Shape) (data tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"group_size": group_size, "group_key": group_key, "instance_key": instance_key, "shape": shape}
 	opspec := tf.OpSpec{
-		Type: "CollectiveBcastSend",
+		Type: "CollectiveGather",
 		Input: []tf.Input{
 			input,
 		},
@@ -4262,12 +4106,26 @@ func CollectiveBcastSend(scope *Scope, input tf.Output, group_size int64, group_
 	return op.Output(0)
 }
 
+// CollectiveReduceAttr is an optional argument to CollectiveReduce.
+type CollectiveReduceAttr func(optionalAttr)
+
+// CollectiveReduceWaitFor sets the optional wait_for attribute to value.
+// If not specified, defaults to <>
+func CollectiveReduceWaitFor(value []int64) CollectiveReduceAttr {
+	return func(m optionalAttr) {
+		m["wait_for"] = value
+	}
+}
+
 // Mutually reduces multiple tensors of identical type and shape.
-func CollectiveReduce(scope *Scope, input tf.Output, group_size int64, group_key int64, instance_key int64, merge_op string, final_op string, subdiv_offsets []int64) (data tf.Output) {
+func CollectiveReduce(scope *Scope, input tf.Output, group_size int64, group_key int64, instance_key int64, merge_op string, final_op string, subdiv_offsets []int64, optional ...CollectiveReduceAttr) (data tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"group_size": group_size, "group_key": group_key, "instance_key": instance_key, "merge_op": merge_op, "final_op": final_op, "subdiv_offsets": subdiv_offsets}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
 		Type: "CollectiveReduce",
 		Input: []tf.Input{
@@ -4829,6 +4687,119 @@ func CudnnRNNParamsToCanonical(scope *Scope, num_layers tf.Output, num_units tf.
 	return weights, biases
 }
 
+// CudnnRNNBackpropV3Attr is an optional argument to CudnnRNNBackpropV3.
+type CudnnRNNBackpropV3Attr func(optionalAttr)
+
+// CudnnRNNBackpropV3RnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNBackpropV3RnnMode(value string) CudnnRNNBackpropV3Attr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
+	}
+}
+
+// CudnnRNNBackpropV3InputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNBackpropV3InputMode(value string) CudnnRNNBackpropV3Attr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
+	}
+}
+
+// CudnnRNNBackpropV3Direction sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNBackpropV3Direction(value string) CudnnRNNBackpropV3Attr {
+	return func(m optionalAttr) {
+		m["direction"] = value
+	}
+}
+
+// CudnnRNNBackpropV3Dropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNBackpropV3Dropout(value float32) CudnnRNNBackpropV3Attr {
+	return func(m optionalAttr) {
+		m["dropout"] = value
+	}
+}
+
+// CudnnRNNBackpropV3Seed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNBackpropV3Seed(value int64) CudnnRNNBackpropV3Attr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// CudnnRNNBackpropV3Seed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNBackpropV3Seed2(value int64) CudnnRNNBackpropV3Attr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Backprop step of CudnnRNNV3.
+//
+// Compute the backprop of both data and weights in a RNN. Takes an extra
+//     "sequence_lengths" input than CudnnRNNBackprop.
+//
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicates whether there is a linear projection between the input and
+//     the actual computation before the first layer. 'skip_input' is only allowed
+//     when input_size == num_units; 'auto_select' implies 'skip_input' when
+//     input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used. Should be
+//   "unidirectional" or "bidirectional".
+// dropout: Dropout probability. When set to 0., dropout is disabled.
+// seed: The 1st part of a seed to initialize dropout.
+// seed2: The 2nd part of a seed to initialize dropout.
+// input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
+// input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
+//     num_units].
+// input_c: For LSTM, a 3-D tensor with the shape of
+//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
+// params: A 1-D tensor that contains the weights and biases in an opaque layout.
+//     The size must be created through CudnnRNNParamsSize, and initialized
+//     separately. Note that they might not be compatible across different
+//     generations. So it is a good idea to save and restore
+// sequence_lengths: a vector of lengths of each input sequence.
+// output: A 3-D tensor with the shape of [seq_length, batch_size,
+//     dir * num_units].
+// output_h: The same shape has input_h.
+// output_c: The same shape as input_c for LSTM. An empty tensor for other models.
+// output_backprop: A 3-D tensor with the same shape as output in the forward pass.
+// output_h_backprop: A 3-D tensor with the same shape as output_h in the forward
+//     pass.
+// output_c_backprop: A 3-D tensor with the same shape as output_c in the forward
+//     pass.
+// reserve_space: The same reserve_space produced in the forward operation.
+// input_backprop: The backprop to input in the forward pass. Has the same shape
+//     as input.
+// input_h_backprop: The backprop to input_h in the forward pass. Has the same
+//     shape as input_h.
+// input_c_backprop: The backprop to input_c in the forward pass. Has the same
+//     shape as input_c.
+// params_backprop: The backprop to the params buffer in the forward pass. Has the
+//     same shape as params.
+func CudnnRNNBackpropV3(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, sequence_lengths tf.Output, output tf.Output, output_h tf.Output, output_c tf.Output, output_backprop tf.Output, output_h_backprop tf.Output, output_c_backprop tf.Output, reserve_space tf.Output, host_reserved tf.Output, optional ...CudnnRNNBackpropV3Attr) (input_backprop tf.Output, input_h_backprop tf.Output, input_c_backprop tf.Output, params_backprop tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "CudnnRNNBackpropV3",
+		Input: []tf.Input{
+			input, input_h, input_c, params, sequence_lengths, output, output_h, output_c, output_backprop, output_h_backprop, output_c_backprop, reserve_space, host_reserved,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+}
+
 // CudnnRNNBackpropV2Attr is an optional argument to CudnnRNNBackpropV2.
 type CudnnRNNBackpropV2Attr func(optionalAttr)
 
@@ -5645,77 +5616,6 @@ func MapSize(scope *Scope, dtypes []tf.DataType, optional ...MapSizeAttr) (size
 	return op.Output(0)
 }
 
-// MapUnstageAttr is an optional argument to MapUnstage.
-type MapUnstageAttr func(optionalAttr)
-
-// MapUnstageCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapUnstageCapacity(value int64) MapUnstageAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// MapUnstageMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapUnstageMemoryLimit(value int64) MapUnstageAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapUnstageContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapUnstageContainer(value string) MapUnstageAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MapUnstageSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapUnstageSharedName(value string) MapUnstageAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op removes and returns the values associated with the key
-//
-// from the underlying container.   If the underlying container
-// does not contain this key, the op will block until it does.
-func MapUnstage(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageAttr) (values []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MapUnstage",
-		Input: []tf.Input{
-			key, indices,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("MapUnstage", err)
-		return
-	}
-	return values
-}
-
 // MapPeekAttr is an optional argument to MapPeek.
 type MapPeekAttr func(optionalAttr)
 
@@ -5977,610 +5877,482 @@ func StageSize(scope *Scope, dtypes []tf.DataType, optional ...StageSizeAttr) (s
 	return op.Output(0)
 }
 
-// Compute the regularized incomplete beta integral \\(I_x(a, b)\\).
-//
-// The regularized incomplete beta integral is defined as:
-//
-//
-// \\(I_x(a, b) = \frac{B(x; a, b)}{B(a, b)}\\)
-//
-// where
-//
-//
-// \\(B(x; a, b) = \int_0^x t^{a-1} (1 - t)^{b-1} dt\\)
-//
+// StagePeekAttr is an optional argument to StagePeek.
+type StagePeekAttr func(optionalAttr)
+
+// StagePeekCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// is the incomplete beta function and \\(B(a, b)\\) is the *complete*
-// beta function.
-func Betainc(scope *Scope, a tf.Output, b tf.Output, x tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
+// REQUIRES: value >= 0
+func StagePeekCapacity(value int64) StagePeekAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "Betainc",
-		Input: []tf.Input{
-			a, b, x,
-		},
+}
+
+// StagePeekMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func StagePeekMemoryLimit(value int64) StagePeekAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Return a tensor with the same shape and contents as the input tensor or value.
-func Identity(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
+// StagePeekContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func StagePeekContainer(value string) StagePeekAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "Identity",
-		Input: []tf.Input{
-			input,
-		},
+}
+
+// StagePeekSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func StagePeekSharedName(value string) StagePeekAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes arctangent of `y/x` element-wise, respecting signs of the arguments.
+// Op peeks at the values at the specified index.  If the
 //
-// This is the angle \( \theta \in [-\pi, \pi] \) such that
-// \[ x = r \cos(\theta) \]
-// and
-// \[ y = r \sin(\theta) \]
-// where \(r = \sqrt(x^2 + y^2) \).
-func Atan2(scope *Scope, y tf.Output, x tf.Output) (z tf.Output) {
+// underlying container does not contain sufficient elements
+// this op will block until it does.   This Op is optimized for
+// performance.
+func StagePeek(scope *Scope, index tf.Output, dtypes []tf.DataType, optional ...StagePeekAttr) (values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Atan2",
+		Type: "StagePeek",
 		Input: []tf.Input{
-			y, x,
+			index,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("StagePeek", err)
+		return
+	}
+	return values
 }
 
-// EditDistanceAttr is an optional argument to EditDistance.
-type EditDistanceAttr func(optionalAttr)
+// UnstageAttr is an optional argument to Unstage.
+type UnstageAttr func(optionalAttr)
 
-// EditDistanceNormalize sets the optional normalize attribute to value.
-//
-// value: boolean (if true, edit distances are normalized by length of truth).
+// UnstageCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// The output is:
-// If not specified, defaults to true
-func EditDistanceNormalize(value bool) EditDistanceAttr {
+// REQUIRES: value >= 0
+func UnstageCapacity(value int64) UnstageAttr {
 	return func(m optionalAttr) {
-		m["normalize"] = value
+		m["capacity"] = value
 	}
 }
 
-// Computes the (possibly normalized) Levenshtein Edit Distance.
-//
-// The inputs are variable-length sequences provided by SparseTensors
-//   (hypothesis_indices, hypothesis_values, hypothesis_shape)
-// and
-//   (truth_indices, truth_values, truth_shape).
-//
-// The inputs are:
-//
-// Arguments:
-//	hypothesis_indices: The indices of the hypothesis list SparseTensor.
-// This is an N x R int64 matrix.
-//	hypothesis_values: The values of the hypothesis list SparseTensor.
-// This is an N-length vector.
-//	hypothesis_shape: The shape of the hypothesis list SparseTensor.
-// This is an R-length vector.
-//	truth_indices: The indices of the truth list SparseTensor.
-// This is an M x R int64 matrix.
-//	truth_values: The values of the truth list SparseTensor.
-// This is an M-length vector.
-//	truth_shape: truth indices, vector.
-//
-// Returns A dense float tensor with rank R - 1.
-//
-// For the example input:
-//
-//     // hypothesis represents a 2x1 matrix with variable-length values:
-//     //   (0,0) = ["a"]
-//     //   (1,0) = ["b"]
-//     hypothesis_indices = [[0, 0, 0],
-//                           [1, 0, 0]]
-//     hypothesis_values = ["a", "b"]
-//     hypothesis_shape = [2, 1, 1]
-//
-//     // truth represents a 2x2 matrix with variable-length values:
-//     //   (0,0) = []
-//     //   (0,1) = ["a"]
-//     //   (1,0) = ["b", "c"]
-//     //   (1,1) = ["a"]
-//     truth_indices = [[0, 1, 0],
-//                      [1, 0, 0],
-//                      [1, 0, 1],
-//                      [1, 1, 0]]
-//     truth_values = ["a", "b", "c", "a"]
-//     truth_shape = [2, 2, 2]
-//     normalize = true
+// UnstageMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// The output will be:
+// REQUIRES: value >= 0
+func UnstageMemoryLimit(value int64) UnstageAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// UnstageContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func UnstageContainer(value string) UnstageAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// UnstageSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func UnstageSharedName(value string) UnstageAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op is similar to a lightweight Dequeue.
 //
-//     // output is a 2x2 matrix with edit distances normalized by truth lengths.
-//     output = [[inf, 1.0],  // (0,0): no truth, (0,1): no hypothesis
-//               [0.5, 1.0]]  // (1,0): addition, (1,1): no hypothesis
-func EditDistance(scope *Scope, hypothesis_indices tf.Output, hypothesis_values tf.Output, hypothesis_shape tf.Output, truth_indices tf.Output, truth_values tf.Output, truth_shape tf.Output, optional ...EditDistanceAttr) (output tf.Output) {
+// The basic functionality is similar to dequeue with many fewer
+// capabilities and options.  This Op is optimized for performance.
+func Unstage(scope *Scope, dtypes []tf.DataType, optional ...UnstageAttr) (values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "EditDistance",
-		Input: []tf.Input{
-			hypothesis_indices, hypothesis_values, hypothesis_shape, truth_indices, truth_values, truth_shape,
-		},
+		Type: "Unstage",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns 0 if x == 0, and x * log(y) otherwise, elementwise.
-func Xlogy(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "Xlogy",
-		Input: []tf.Input{
-			x, y,
-		},
+	var idx int
+	var err error
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("Unstage", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return values
 }
 
-// DepthwiseConv2dNativeBackpropInputAttr is an optional argument to DepthwiseConv2dNativeBackpropInput.
-type DepthwiseConv2dNativeBackpropInputAttr func(optionalAttr)
+// StageAttr is an optional argument to Stage.
+type StageAttr func(optionalAttr)
 
-// DepthwiseConv2dNativeBackpropInputDataFormat sets the optional data_format attribute to value.
+// StageCapacity sets the optional capacity attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dNativeBackpropInputAttr {
+// value: Maximum number of elements in the Staging Area. If > 0, inserts
+// on the container will block when the capacity is reached.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func StageCapacity(value int64) StageAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["capacity"] = value
 	}
 }
 
-// DepthwiseConv2dNativeBackpropInputDilations sets the optional dilations attribute to value.
+// StageMemoryLimit sets the optional memory_limit attribute to value.
 //
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
-// element on that dimension. The dimension order is determined by the value of
-// `data_format`, see above for details. Dilations in the batch and depth
-// dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr {
+// value: The maximum number of bytes allowed for Tensors in the Staging Area.
+// If > 0, inserts will block until sufficient space is available.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func StageMemoryLimit(value int64) StageAttr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["memory_limit"] = value
 	}
 }
 
-// Computes the gradients of depthwise convolution with respect to the input.
+// StageContainer sets the optional container attribute to value.
+//
+// value: If non-empty, this queue is placed in the given container. Otherwise,
+// a default container is used.
+// If not specified, defaults to ""
+func StageContainer(value string) StageAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// StageSharedName sets the optional shared_name attribute to value.
+//
+// value: It is necessary to match this name to the matching Unstage Op.
+// If not specified, defaults to ""
+func StageSharedName(value string) StageAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Stage values similar to a lightweight Enqueue.
+//
+// The basic functionality of this Op is similar to a queue with many
+// fewer capabilities and options.  This Op is optimized for performance.
 //
 // Arguments:
-//	input_sizes: An integer vector representing the shape of `input`, based
-// on `data_format`.  For example, if `data_format` is 'NHWC' then
-//  `input` is a 4-D `[batch, height, width, channels]` tensor.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, depthwise_multiplier]`.
-//	out_backprop: 4-D with shape  based on `data_format`.
-// For example, if `data_format` is 'NHWC' then
-// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution.
-//	padding: The type of padding algorithm to use.
+//	values: a list of tensors
+// dtypes A list of data types that inserted values should adhere to.
 //
-// Returns 4-D with shape according to `data_format`.  For example, if
-// `data_format` is 'NHWC', output shape is `[batch, in_height,
-// in_width, in_channels]`.  Gradient w.r.t. the input of the
-// convolution.
-func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropInputAttr) (output tf.Output) {
+// Returns the created operation.
+func Stage(scope *Scope, values []tf.Output, optional ...StageAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DepthwiseConv2dNativeBackpropInput",
+		Type: "Stage",
 		Input: []tf.Input{
-			input_sizes, filter, out_backprop,
+			tf.OutputList(values),
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Returns x / y element-wise.
+// Delete the tensor specified by its handle in the session.
 //
-// *NOTE*: `Div` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Div(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Arguments:
+//	handle: The handle for a tensor stored in the session state.
+//
+// Returns the created operation.
+func DeleteSessionTensor(scope *Scope, handle tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Div",
+		Type: "DeleteSessionTensor",
 		Input: []tf.Input{
-			x, y,
+			handle,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Returns x * y element-wise.
+// Store the input tensor in the state of the current session.
 //
-// *NOTE*: `Multiply` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Mul(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Arguments:
+//	value: The tensor to be stored.
+//
+// Returns The handle for the tensor stored in the session state, represented
+// as a string.
+func GetSessionHandle(scope *Scope, value tf.Output) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Mul",
+		Type: "GetSessionHandle",
 		Input: []tf.Input{
-			x, y,
+			value,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// BiasAddAttr is an optional argument to BiasAdd.
-type BiasAddAttr func(optionalAttr)
-
-// BiasAddDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the bias tensor will be added to the last dimension
-// of the value tensor.
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// The tensor will be added to "in_channels", the third-to-the-last
-//     dimension.
-// If not specified, defaults to "NHWC"
-func BiasAddDataFormat(value string) BiasAddAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Adds `bias` to `value`.
-//
-// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
-// Broadcasting is supported, so `value` may have any number of dimensions.
-//
-// Arguments:
-//	value: Any number of dimensions.
-//	bias: 1-D with size the last dimension of `value`.
+// Deprecated. Use TensorArraySizeV3
 //
-// Returns Broadcasted sum of `value` and `bias`.
-func BiasAdd(scope *Scope, value tf.Output, bias tf.Output, optional ...BiasAddAttr) (output tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArraySizeV3
+func TensorArraySizeV2(scope *Scope, handle tf.Output, flow_in tf.Output) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "BiasAdd",
+		Type: "TensorArraySizeV2",
 		Input: []tf.Input{
-			value, bias,
+			handle, flow_in,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SparseReduceSumSparseAttr is an optional argument to SparseReduceSumSparse.
-type SparseReduceSumSparseAttr func(optionalAttr)
-
-// SparseReduceSumSparseKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SparseReduceSumSparseKeepDims(value bool) SparseReduceSumSparseAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the sum of elements across dimensions of a SparseTensor.
-//
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_sum()`.  In contrast to SparseReduceSum, this Op returns a
-// SparseTensor.
-//
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
-//
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
-//
-// Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
-func SparseReduceSumSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseReduceSumSparse",
-		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// AllCandidateSamplerAttr is an optional argument to AllCandidateSampler.
-type AllCandidateSamplerAttr func(optionalAttr)
-
-// AllCandidateSamplerSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func AllCandidateSamplerSeed(value int64) AllCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
+// TensorArrayConcatV2Attr is an optional argument to TensorArrayConcatV2.
+type TensorArrayConcatV2Attr func(optionalAttr)
 
-// AllCandidateSamplerSeed2 sets the optional seed2 attribute to value.
-//
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func AllCandidateSamplerSeed2(value int64) AllCandidateSamplerAttr {
+// TensorArrayConcatV2ElementShapeExcept0 sets the optional element_shape_except0 attribute to value.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayConcatV2ElementShapeExcept0(value tf.Shape) TensorArrayConcatV2Attr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["element_shape_except0"] = value
 	}
 }
 
-// Generates labels for candidate sampling with a learned unigram distribution.
-//
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
-//
-// For each batch, this op picks a single set of sampled candidate labels.
-//
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
-//
-// Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to produce.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func AllCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, optional ...AllCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+// Deprecated. Use TensorArrayConcatV3
+func TensorArrayConcatV2(scope *Scope, handle tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayConcatV2Attr) (value tf.Output, lengths tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AllCandidateSampler",
+		Type: "TensorArrayConcatV2",
 		Input: []tf.Input{
-			true_classes,
+			handle, flow_in,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Returns x + y element-wise.
-//
-// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func AddV2(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "AddV2",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Returns an element-wise indication of the sign of a number.
-//
-// `y = sign(x) = -1` if `x < 0`; 0 if `x == 0`; 1 if `x > 0`.
+// Deprecated. Use TensorArrayGradV3
 //
-// For complex numbers, `y = sign(x) = x / |x|` if `x != 0`, otherwise `y = 0`.
-func Sign(scope *Scope, x tf.Output) (y tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArrayWriteV3
+func TensorArrayWriteV2(scope *Scope, handle tf.Output, index tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Sign",
+		Type: "TensorArrayWriteV2",
 		Input: []tf.Input{
-			x,
+			handle, index, value, flow_in,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns which elements of x are finite.
+// Deprecated. Use TensorArrayGradV3
 //
-// @compatibility(numpy)
-// Equivalent to np.isfinite
-// @end_compatibility
-func IsFinite(scope *Scope, x tf.Output) (y tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArrayGradV3
+func TensorArrayGradV2(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"source": source}
 	opspec := tf.OpSpec{
-		Type: "IsFinite",
+		Type: "TensorArrayGradV2",
 		Input: []tf.Input{
-			x,
+			handle, flow_in,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceStridedSliceAssignAttr is an optional argument to ResourceStridedSliceAssign.
-type ResourceStridedSliceAssignAttr func(optionalAttr)
+// TensorArrayV2Attr is an optional argument to TensorArrayV2.
+type TensorArrayV2Attr func(optionalAttr)
 
-// ResourceStridedSliceAssignBeginMask sets the optional begin_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignBeginMask(value int64) ResourceStridedSliceAssignAttr {
+// TensorArrayV2ElementShape sets the optional element_shape attribute to value.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayV2ElementShape(value tf.Shape) TensorArrayV2Attr {
 	return func(m optionalAttr) {
-		m["begin_mask"] = value
+		m["element_shape"] = value
 	}
 }
 
-// ResourceStridedSliceAssignEndMask sets the optional end_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignEndMask(value int64) ResourceStridedSliceAssignAttr {
+// TensorArrayV2DynamicSize sets the optional dynamic_size attribute to value.
+// If not specified, defaults to false
+func TensorArrayV2DynamicSize(value bool) TensorArrayV2Attr {
 	return func(m optionalAttr) {
-		m["end_mask"] = value
+		m["dynamic_size"] = value
 	}
 }
 
-// ResourceStridedSliceAssignEllipsisMask sets the optional ellipsis_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignEllipsisMask(value int64) ResourceStridedSliceAssignAttr {
+// TensorArrayV2ClearAfterRead sets the optional clear_after_read attribute to value.
+// If not specified, defaults to true
+func TensorArrayV2ClearAfterRead(value bool) TensorArrayV2Attr {
 	return func(m optionalAttr) {
-		m["ellipsis_mask"] = value
+		m["clear_after_read"] = value
 	}
 }
 
-// ResourceStridedSliceAssignNewAxisMask sets the optional new_axis_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignNewAxisMask(value int64) ResourceStridedSliceAssignAttr {
+// TensorArrayV2TensorArrayName sets the optional tensor_array_name attribute to value.
+// If not specified, defaults to ""
+func TensorArrayV2TensorArrayName(value string) TensorArrayV2Attr {
 	return func(m optionalAttr) {
-		m["new_axis_mask"] = value
+		m["tensor_array_name"] = value
 	}
 }
 
-// ResourceStridedSliceAssignShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignShrinkAxisMask(value int64) ResourceStridedSliceAssignAttr {
-	return func(m optionalAttr) {
-		m["shrink_axis_mask"] = value
+// Deprecated. Use TensorArrayV3
+//
+// DEPRECATED at GraphDef version 26: Use TensorArrayV3
+func TensorArrayV2(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV2Attr) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorArrayV2",
+		Input: []tf.Input{
+			size,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Assign `value` to the sliced l-value reference of `ref`.
+// Split the data from the input value into TensorArray elements.
 //
-// The values of `value` are assigned to the positions in the variable
-// `ref` that are selected by the slice parameters. The slice parameters
-// `begin, `end`, `strides`, etc. work exactly as in `StridedSlice`.
+// Assuming that `lengths` takes on values
 //
-// NOTE this op currently does not support broadcasting and so `value`'s
-// shape must be exactly the shape produced by the slice of `ref`.
+//   ```(n0, n1, ..., n(T-1))```
 //
-// Returns the created operation.
-func ResourceStridedSliceAssign(scope *Scope, ref tf.Output, begin tf.Output, end tf.Output, strides tf.Output, value tf.Output, optional ...ResourceStridedSliceAssignAttr) (o *tf.Operation) {
+// and that `value` has shape
+//
+//   ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```,
+//
+// this splits values into a TensorArray with T tensors.
+//
+// TensorArray index t will be the subtensor of values with starting position
+//
+//   ```(n0 + n1 + ... + n(t-1), 0, 0, ...)```
+//
+// and having size
+//
+//   ```nt x d0 x d1 x ...```
+//
+// Arguments:
+//	handle: The handle to a TensorArray.
+//	value: The concatenated tensor to write to the TensorArray.
+//	lengths: The vector of lengths, how to split the rows of value into the
+// TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//
+// Returns A float scalar that enforces proper chaining of operations.
+func TensorArraySplitV3(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceStridedSliceAssign",
+		Type: "TensorArraySplitV3",
 		Input: []tf.Input{
-			ref, begin, end, strides, value,
+			handle, value, lengths, flow_in,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// ArgMaxAttr is an optional argument to ArgMax.
-type ArgMaxAttr func(optionalAttr)
+// EmptyAttr is an optional argument to Empty.
+type EmptyAttr func(optionalAttr)
 
-// ArgMaxOutputType sets the optional output_type attribute to value.
-// If not specified, defaults to DT_INT64
-func ArgMaxOutputType(value tf.DataType) ArgMaxAttr {
+// EmptyInit sets the optional init attribute to value.
+//
+// value: If True, initialize the returned tensor with the default value of dtype.  Otherwise, the implementation is free not to initializethe tensor's content.
+// If not specified, defaults to false
+func EmptyInit(value bool) EmptyAttr {
 	return func(m optionalAttr) {
-		m["output_type"] = value
+		m["init"] = value
 	}
 }
 
-// Returns the index with the largest value across dimensions of a tensor.
+// Creates a tensor with the given shape.
 //
-// Note that in case of ties the identity of the return value is not guaranteed.
+// This operation creates a tensor of `shape` and `dtype`.
 //
 // Arguments:
+//	shape: 1-D. Represents the shape of the output tensor.
 //
-//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
-// Describes which dimension of the input Tensor to reduce across. For vectors,
-// use dimension = 0.
-func ArgMax(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMaxAttr) (output tf.Output) {
+//
+// Returns A `Tensor` of type `T`.
+func Empty(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...EmptyAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ArgMax",
+		Type: "Empty",
 		Input: []tf.Input{
-			input, dimension,
+			shape,
 		},
 		Attrs: attrs,
 	}
@@ -6588,126 +6360,128 @@ func ArgMax(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgM
 	return op.Output(0)
 }
 
-// PreventGradientAttr is an optional argument to PreventGradient.
-type PreventGradientAttr func(optionalAttr)
+// TensorArrayConcatV3Attr is an optional argument to TensorArrayConcatV3.
+type TensorArrayConcatV3Attr func(optionalAttr)
 
-// PreventGradientMessage sets the optional message attribute to value.
+// TensorArrayConcatV3ElementShapeExcept0 sets the optional element_shape_except0 attribute to value.
 //
-// value: Will be printed in the error when anyone tries to differentiate
-// this operation.
-// If not specified, defaults to ""
-func PreventGradientMessage(value string) PreventGradientAttr {
+// value: The expected shape of an element, if known,
+// excluding the first dimension. Used to validate the shapes of
+// TensorArray elements. If this shape is not fully specified, concatenating
+// zero-size TensorArrays is an error.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayConcatV3ElementShapeExcept0(value tf.Shape) TensorArrayConcatV3Attr {
 	return func(m optionalAttr) {
-		m["message"] = value
+		m["element_shape_except0"] = value
 	}
 }
 
-// An identity op that triggers an error if a gradient is requested.
+// Concat the elements from the TensorArray into value `value`.
 //
-// When executed in a graph, this op outputs its input tensor as-is.
+// Takes `T` elements of shapes
 //
-// When building ops to compute gradients, the TensorFlow gradient system
-// will return an error when trying to lookup the gradient of this op,
-// because no gradient must ever be registered for this function.  This
-// op exists to prevent subtle bugs from silently returning unimplemented
-// gradients in some corner cases.
+//   ```
+//   (n0 x d0 x d1 x ...), (n1 x d0 x d1 x ...), ..., (n(T-1) x d0 x d1 x ...)
+//   ```
+//
+// and concatenates them into a Tensor of shape:
+//
+//   ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```
+//
+// All elements must have the same shape (excepting the first dimension).
 //
 // Arguments:
-//	input: any tensor.
+//	handle: The handle to a TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	dtype: The type of the elem that is returned.
 //
-// Returns the same input tensor.
-func PreventGradient(scope *Scope, input tf.Output, optional ...PreventGradientAttr) (output tf.Output) {
+// Returns All of the elements in the TensorArray, concatenated along the first
+// axis.A vector of the row sizes of the original T elements in the
+// value output.  In the example above, this would be the values:
+// `(n1, n2, ..., n(T-1))`.
+func TensorArrayConcatV3(scope *Scope, handle tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayConcatV3Attr) (value tf.Output, lengths tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "PreventGradient",
+		Type: "TensorArrayConcatV3",
 		Input: []tf.Input{
-			input,
+			handle, flow_in,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Computes asin of x element-wise.
-func Asin(scope *Scope, x tf.Output) (y tf.Output) {
+// Scatter the data from the input value into specific TensorArray elements.
+//
+// `indices` must be a vector, its length must match the first dim of `value`.
+//
+// Arguments:
+//	handle: The handle to a TensorArray.
+//	indices: The locations at which to write the tensor elements.
+//	value: The concatenated tensor to write to the TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//
+// Returns A float scalar that enforces proper chaining of operations.
+func TensorArrayScatterV3(scope *Scope, handle tf.Output, indices tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Asin",
+		Type: "TensorArrayScatterV3",
 		Input: []tf.Input{
-			x,
+			handle, indices, value, flow_in,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SparseToDenseAttr is an optional argument to SparseToDense.
-type SparseToDenseAttr func(optionalAttr)
+// TensorArrayGatherV3Attr is an optional argument to TensorArrayGatherV3.
+type TensorArrayGatherV3Attr func(optionalAttr)
 
-// SparseToDenseValidateIndices sets the optional validate_indices attribute to value.
+// TensorArrayGatherV3ElementShape sets the optional element_shape attribute to value.
 //
-// value: If true, indices are checked to make sure they are sorted in
-// lexicographic order and that there are no repeats.
-// If not specified, defaults to true
-func SparseToDenseValidateIndices(value bool) SparseToDenseAttr {
+// value: The expected shape of an element, if known. Used to
+// validate the shapes of TensorArray elements. If this shape is not
+// fully specified, gathering zero-size TensorArrays is an error.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayGatherV3ElementShape(value tf.Shape) TensorArrayGatherV3Attr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["element_shape"] = value
 	}
 }
 
-// Converts a sparse representation into a dense tensor.
-//
-// Builds an array `dense` with shape `output_shape` such that
-//
-// ```
-// # If sparse_indices is scalar
-// dense[i] = (i == sparse_indices ? sparse_values : default_value)
-//
-// # If sparse_indices is a vector, then for each i
-// dense[sparse_indices[i]] = sparse_values[i]
-//
-// # If sparse_indices is an n by d matrix, then for each i in [0, n)
-// dense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i]
-// ```
-//
-// All other values in `dense` are set to `default_value`.  If `sparse_values` is a
-// scalar, all sparse indices are set to this single value.
+// Gather specific elements from the TensorArray into output `value`.
 //
-// Indices should be sorted in lexicographic order, and indices must not
-// contain any repeats. If `validate_indices` is true, these properties
-// are checked during execution.
+// All elements selected by `indices` must have the same shape.
 //
 // Arguments:
-//	sparse_indices: 0-D, 1-D, or 2-D.  `sparse_indices[i]` contains the complete
-// index where `sparse_values[i]` will be placed.
-//	output_shape: 1-D.  Shape of the dense output tensor.
-//	sparse_values: 1-D.  Values corresponding to each row of `sparse_indices`,
-// or a scalar value to be used for all sparse indices.
-//	default_value: Scalar value to set for indices not specified in
-// `sparse_indices`.
+//	handle: The handle to a TensorArray.
+//	indices: The locations in the TensorArray from which to read tensor elements.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	dtype: The type of the elem that is returned.
 //
-// Returns Dense output tensor of shape `output_shape`.
-func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Output, sparse_values tf.Output, default_value tf.Output, optional ...SparseToDenseAttr) (dense tf.Output) {
+// Returns All of the elements in the TensorArray, concatenated along a new
+// axis (the new dimension 0).
+func TensorArrayGatherV3(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV3Attr) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseToDense",
+		Type: "TensorArrayGatherV3",
 		Input: []tf.Input{
-			sparse_indices, output_shape, sparse_values, default_value,
+			handle, indices, flow_in,
 		},
 		Attrs: attrs,
 	}
@@ -6715,438 +6489,560 @@ func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Outpu
 	return op.Output(0)
 }
 
-// Computes the sum along sparse segments of a tensor.
+// Creates a TensorArray for storing multiple gradients of values in the given handle.
 //
-// Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is
-// misisng, the `output` tensor at that position will be zeroed.
+// Similar to TensorArrayGradV3. However it creates an accumulator with an
+// expanded shape compared to the input TensorArray whose gradient is being
+// computed. This enables multiple gradients for the same TensorArray to be
+// calculated using the same accumulator.
 //
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
+// Arguments:
+//	handle: The handle to the forward TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	shape_to_prepend: An int32 vector representing a shape. Elements in the gradient accumulator will
+// have shape which is this shape_to_prepend value concatenated with shape of the
+// elements in the TensorArray corresponding to the input handle.
+//	source: The gradient source string, used to decide which gradient TensorArray
+// to return.
+func TensorArrayGradWithShape(scope *Scope, handle tf.Output, flow_in tf.Output, shape_to_prepend tf.Output, source string) (grad_handle tf.Output, flow_out tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"source": source}
+	opspec := tf.OpSpec{
+		Type: "TensorArrayGradWithShape",
+		Input: []tf.Input{
+			handle, flow_in, shape_to_prepend,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Creates a TensorArray for storing the gradients of values in the given handle.
 //
-// For example:
+// If the given TensorArray gradient already exists, returns a reference to it.
 //
-// ```python
-// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
+// Locks the size of the original TensorArray by disabling its dynamic size flag.
 //
-// tf.sparse_segment_sum_with_num_segments(
-//     c, tf.constant([0, 1]), tf.constant([0, 0]), num_segments=3)
-// # => [[0 0 0 0]
-// #     [0 0 0 0]
-// #     [0 0 0 0]]
+// **A note about the input flow_in:**
 //
-// tf.sparse_segment_sum_with_num_segments(c,
-//                                         tf.constant([0, 1]),
-//                                         tf.constant([0, 2],
-//                                         num_segments=4))
-// # => [[ 1  2  3  4]
-// #     [ 0  0  0  0]
-// #     [-1 -2 -3 -4]
-// #     [ 0  0  0  0]]
-// ```
+// The handle flow_in forces the execution of the gradient lookup to occur
+// only after certain other operations have occurred.  For example, when
+// the forward TensorArray is dynamically sized, writes to this TensorArray
+// may resize the object.  The gradient TensorArray is statically sized based
+// on the size of the forward TensorArray when this operation executes.
+// Furthermore, the size of the forward TensorArray is frozen by this call.
+// As a result, the flow is used to ensure that the call to generate the gradient
+// TensorArray only happens after all writes are executed.
 //
-// Arguments:
+// In the case of dynamically sized TensorArrays, gradient computation should
+// only be performed on read operations that have themselves been chained via
+// flow to occur only after all writes have executed. That way the final size
+// of the forward TensorArray is known when this operation is called.
 //
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//	num_segments: Should equal the number of distinct segment IDs.
+// **A note about the source attribute:**
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `num_segments`.
-func SparseSegmentSumWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// TensorArray gradient calls use an accumulator TensorArray object.  If
+// multiple gradients are calculated and run in the same session, the multiple
+// gradient nodes may accidentally flow through the same accumulator TensorArray.
+// This double counts and generally breaks the TensorArray gradient flow.
+//
+// The solution is to identify which gradient call this particular
+// TensorArray gradient is being called in.  This is performed by identifying
+// a unique string (e.g. "gradients", "gradients_1", ...) from the input
+// gradient Tensor's name.  This string is used as a suffix when creating
+// the TensorArray gradient object here (the attribute `source`).
+//
+// The attribute `source` is added as a suffix to the forward TensorArray's
+// name when performing the creation / lookup, so that each separate gradient
+// calculation gets its own TensorArray accumulator.
+//
+// Arguments:
+//	handle: The handle to the forward TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	source: The gradient source string, used to decide which gradient TensorArray
+// to return.
+func TensorArrayGradV3(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output, flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"source": source}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSumWithNumSegments",
+		Type: "TensorArrayGradV3",
 		Input: []tf.Input{
-			data, indices, segment_ids, num_segments,
+			handle, flow_in,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Computes the determinant of one or more square matrices.
-//
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. The output is a tensor containing the determinants
-// for all input submatrices `[..., :, :]`.
+// Pop the element at the top of the stack.
 //
 // Arguments:
-//	input: Shape is `[..., M, M]`.
+//	handle: The handle to a stack.
+//	elem_type: The type of the elem that is popped.
 //
-// Returns Shape is `[...]`.
-func MatrixDeterminant(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns The tensor that is popped from the top of the stack.
+func StackPopV2(scope *Scope, handle tf.Output, elem_type tf.DataType) (elem tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"elem_type": elem_type}
 	opspec := tf.OpSpec{
-		Type: "MatrixDeterminant",
+		Type: "StackPopV2",
 		Input: []tf.Input{
-			input,
+			handle,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes sin of x element-wise.
-func Sin(scope *Scope, x tf.Output) (y tf.Output) {
+// OneHotAttr is an optional argument to OneHot.
+type OneHotAttr func(optionalAttr)
+
+// OneHotAxis sets the optional axis attribute to value.
+//
+// value: The axis to fill (default: -1, a new inner-most axis).
+// If not specified, defaults to -1
+func OneHotAxis(value int64) OneHotAttr {
+	return func(m optionalAttr) {
+		m["axis"] = value
+	}
+}
+
+// Returns a one-hot tensor.
+//
+// The locations represented by indices in `indices` take value `on_value`,
+// while all other locations take value `off_value`.
+//
+// If the input `indices` is rank `N`, the output will have rank `N+1`,
+// The new axis is created at dimension `axis` (default: the new axis is
+// appended at the end).
+//
+// If `indices` is a scalar the output shape will be a vector of length `depth`.
+//
+// If `indices` is a vector of length `features`, the output shape will be:
+// ```
+//   features x depth if axis == -1
+//   depth x features if axis == 0
+// ```
+//
+// If `indices` is a matrix (batch) with shape `[batch, features]`,
+// the output shape will be:
+// ```
+//   batch x features x depth if axis == -1
+//   batch x depth x features if axis == 1
+//   depth x batch x features if axis == 0
+// ```
+//
+//
+// Examples
+// =========
+//
+// Suppose that
+// ```
+//   indices = [0, 2, -1, 1]
+//   depth = 3
+//   on_value = 5.0
+//   off_value = 0.0
+//   axis = -1
+// ```
+//
+// Then output is `[4 x 3]`:
+// ```
+// output =
+//   [5.0 0.0 0.0]  // one_hot(0)
+//   [0.0 0.0 5.0]  // one_hot(2)
+//   [0.0 0.0 0.0]  // one_hot(-1)
+//   [0.0 5.0 0.0]  // one_hot(1)
+// ```
+//
+// Suppose that
+// ```
+//   indices = [0, 2, -1, 1]
+//   depth = 3
+//   on_value = 0.0
+//   off_value = 3.0
+//   axis = 0
+// ```
+//
+// Then output is `[3 x 4]`:
+// ```
+// output =
+//   [0.0 3.0 3.0 3.0]
+//   [3.0 3.0 3.0 0.0]
+//   [3.0 3.0 3.0 3.0]
+//   [3.0 0.0 3.0 3.0]
+// //  ^                one_hot(0)
+// //      ^            one_hot(2)
+// //          ^        one_hot(-1)
+// //              ^    one_hot(1)
+// ```
+//
+// Suppose that
+// ```
+//   indices = [[0, 2], [1, -1]]
+//   depth = 3
+//   on_value = 1.0
+//   off_value = 0.0
+//   axis = -1
+// ```
+//
+// Then output is `[2 x 2 x 3]`:
+// ```
+// output =
+//   [
+//     [1.0, 0.0, 0.0]  // one_hot(0)
+//     [0.0, 0.0, 1.0]  // one_hot(2)
+//   ][
+//     [0.0, 1.0, 0.0]  // one_hot(1)
+//     [0.0, 0.0, 0.0]  // one_hot(-1)
+//   ]
+// ```
+//
+// Arguments:
+//	indices: A tensor of indices.
+//	depth: A scalar defining the depth of the one hot dimension.
+//	on_value: A scalar defining the value to fill in output when `indices[j] = i`.
+//	off_value: A scalar defining the value to fill in output when `indices[j] != i`.
+//
+// Returns The one-hot tensor.
+func OneHot(scope *Scope, indices tf.Output, depth tf.Output, on_value tf.Output, off_value tf.Output, optional ...OneHotAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Sin",
+		Type: "OneHot",
 		Input: []tf.Input{
-			x,
+			indices, depth, on_value, off_value,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes Psi, the derivative of Lgamma (the log of the absolute value of
+// Computes the number of elements in the given queue.
 //
-// `Gamma(x)`), element-wise.
-func Digamma(scope *Scope, x tf.Output) (y tf.Output) {
+// Arguments:
+//	handle: The handle to a queue.
+//
+// Returns The number of elements in the given queue.
+func QueueSizeV2(scope *Scope, handle tf.Output) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Digamma",
+		Type: "QueueSizeV2",
 		Input: []tf.Input{
-			x,
+			handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Conv2DBackpropFilterAttr is an optional argument to Conv2DBackpropFilter.
-type Conv2DBackpropFilterAttr func(optionalAttr)
-
-// Conv2DBackpropFilterUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
-// If not specified, defaults to true
-func Conv2DBackpropFilterUseCudnnOnGpu(value bool) Conv2DBackpropFilterAttr {
-	return func(m optionalAttr) {
-		m["use_cudnn_on_gpu"] = value
-	}
-}
+// QueueDequeueManyV2Attr is an optional argument to QueueDequeueManyV2.
+type QueueDequeueManyV2Attr func(optionalAttr)
 
-// Conv2DBackpropFilterDataFormat sets the optional data_format attribute to value.
+// QueueDequeueManyV2TimeoutMs sets the optional timeout_ms attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr {
+// value: If the queue has fewer than n elements, this operation
+// will block for up to timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueDequeueManyV2TimeoutMs(value int64) QueueDequeueManyV2Attr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["timeout_ms"] = value
 	}
 }
 
-// Conv2DBackpropFilterDilations sets the optional dilations attribute to value.
+// Dequeues `n` tuples of one or more tensors from the given queue.
 //
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
-// element on that dimension. The dimension order is determined by the value of
-// `data_format`, see above for details. Dilations in the batch and depth
-// dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes the gradients of convolution with respect to the filter.
+// If the queue is closed and there are fewer than `n` elements, then an
+// OutOfRange error is returned.
+//
+// This operation concatenates queue-element component tensors along the
+// 0th dimension to make a single component tensor.  All of the components
+// in the dequeued tuple will have size `n` in the 0th dimension.
+//
+// This operation has `k` outputs, where `k` is the number of components in
+// the tuples stored in the given queue, and output `i` is the ith
+// component of the dequeued tuple.
+//
+// N.B. If the queue is empty, this operation will block until `n` elements
+// have been dequeued (or 'timeout_ms' elapses, if specified).
 //
 // Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
-//	filter_sizes: An integer vector representing the tensor shape of `filter`,
-// where `filter` is a 4-D
-// `[filter_height, filter_width, in_channels, out_channels]` tensor.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution. Must be in the same order as the dimension specified with
-// format.
-//	padding: The type of padding algorithm to use.
+//	handle: The handle to a queue.
+//	n: The number of tuples to dequeue.
+//	component_types: The type of each component in a tuple.
 //
-// Returns 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
-// the `filter` input of the convolution.
-func Conv2DBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropFilterAttr) (output tf.Output) {
+// Returns One or more tensors that were dequeued as a tuple.
+func QueueDequeueManyV2(scope *Scope, handle tf.Output, n tf.Output, component_types []tf.DataType, optional ...QueueDequeueManyV2Attr) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"component_types": component_types}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv2DBackpropFilter",
+		Type: "QueueDequeueManyV2",
 		Input: []tf.Input{
-			input, filter_sizes, out_backprop,
+			handle, n,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns the number of work units this Reader has finished processing.
-//
-// Arguments:
-//	reader_handle: Handle to a Reader.
-func ReaderNumWorkUnitsCompletedV2(scope *Scope, reader_handle tf.Output) (units_completed tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "ReaderNumWorkUnitsCompletedV2",
-		Input: []tf.Input{
-			reader_handle,
-		},
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("QueueDequeueManyV2", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return components
 }
 
-// Creates a dataset that contains the elements of `input_dataset` ignoring errors.
-func ExperimentalIgnoreErrorsDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "ExperimentalIgnoreErrorsDataset",
-		Input: []tf.Input{
-			input_dataset,
-		},
-		Attrs: attrs,
+// QuantizeAndDequantizeAttr is an optional argument to QuantizeAndDequantize.
+type QuantizeAndDequantizeAttr func(optionalAttr)
+
+// QuantizeAndDequantizeSignedInput sets the optional signed_input attribute to value.
+// If not specified, defaults to true
+func QuantizeAndDequantizeSignedInput(value bool) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["signed_input"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the log of the absolute value of `Gamma(x)` element-wise.
-func Lgamma(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Lgamma",
-		Input: []tf.Input{
-			x,
-		},
+// QuantizeAndDequantizeNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8
+func QuantizeAndDequantizeNumBits(value int64) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["num_bits"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the reverse mode backpropagated gradient of the Cholesky algorithm.
-//
-// For an explanation see "Differentiation of the Cholesky algorithm" by
-// Iain Murray http://arxiv.org/abs/1602.07527.
-//
-// Arguments:
-//	l: Output of batch Cholesky algorithm l = cholesky(A). Shape is `[..., M, M]`.
-// Algorithm depends only on lower triangular part of the innermost matrices of
-// this tensor.
-//	grad: df/dl where f is some scalar function. Shape is `[..., M, M]`.
-// Algorithm depends only on lower triangular part of the innermost matrices of
-// this tensor.
-//
-// Returns Symmetrized version of df/dA . Shape is `[..., M, M]`
-func CholeskyGrad(scope *Scope, l tf.Output, grad tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "CholeskyGrad",
-		Input: []tf.Input{
-			l, grad,
-		},
+// QuantizeAndDequantizeRangeGiven sets the optional range_given attribute to value.
+// If not specified, defaults to false
+func QuantizeAndDequantizeRangeGiven(value bool) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["range_given"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Creates a dataset that emits each dim-0 slice of `components` once.
-func TensorSliceDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
+// QuantizeAndDequantizeInputMin sets the optional input_min attribute to value.
+// If not specified, defaults to 0
+func QuantizeAndDequantizeInputMin(value float32) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["input_min"] = value
 	}
-	attrs := map[string]interface{}{"output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "TensorSliceDataset",
-		Input: []tf.Input{
-			tf.OutputList(components),
-		},
-		Attrs: attrs,
+}
+
+// QuantizeAndDequantizeInputMax sets the optional input_max attribute to value.
+// If not specified, defaults to 0
+func QuantizeAndDequantizeInputMax(value float32) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["input_max"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes hyperbolic sine of x element-wise.
-func Sinh(scope *Scope, x tf.Output) (y tf.Output) {
+// Use QuantizeAndDequantizeV2 instead.
+//
+// DEPRECATED at GraphDef version 22: Replaced by QuantizeAndDequantizeV2
+func QuantizeAndDequantize(scope *Scope, input tf.Output, optional ...QuantizeAndDequantizeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Sinh",
+		Type: "QuantizeAndDequantize",
 		Input: []tf.Input{
-			x,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the sum along sparse segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
+// Returns locations of nonzero / true values in a tensor.
 //
-// Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
-// dimension, selecting a subset of dimension 0, specified by `indices`.
+// This operation returns the coordinates of true elements in `condition`. The
+// coordinates are returned in a 2-D tensor where the first dimension (rows)
+// represents the number of true elements, and the second dimension (columns)
+// represents the coordinates of the true elements. Keep in mind, the shape of
+// the output tensor can vary depending on how many true values there are in
+// `condition`. Indices are output in row-major order.
 //
 // For example:
 //
-// ```python
-// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
-//
-// # Select two rows, one segment.
-// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))
-// # => [[0 0 0 0]]
-//
-// # Select two rows, two segment.
-// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))
-// # => [[ 1  2  3  4]
-// #     [-1 -2 -3 -4]]
-//
-// # Select all rows, two segments.
-// tf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))
-// # => [[0 0 0 0]
-// #     [5 6 7 8]]
-//
-// # Which is equivalent to:
-// tf.segment_sum(c, tf.constant([0, 0, 1]))
 // ```
+// # 'input' tensor is [[True, False]
+// #                    [True, False]]
+// # 'input' has two true values, so output has two coordinates.
+// # 'input' has rank of 2, so coordinates have two indices.
+// where(input) ==> [[0, 0],
+//                   [1, 0]]
 //
-// Arguments:
+// # `condition` tensor is [[[True, False]
+// #                     [True, False]]
+// #                    [[False, True]
+// #                     [False, True]]
+// #                    [[False, False]
+// #                     [False, True]]]
+// # 'input' has 5 true values, so output has 5 coordinates.
+// # 'input' has rank of 3, so coordinates have three indices.
+// where(input) ==> [[0, 0, 0],
+//                   [0, 1, 0],
+//                   [1, 0, 1],
+//                   [1, 1, 1],
+//                   [2, 1, 1]]
 //
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+// # `condition` tensor is [[[1.5,  0.0]
+// #                     [-0.5, 0.0]]
+// #                    [[0.0,  0.25]
+// #                     [0.0,  0.75]]
+// #                    [[0.0,  0.0]
+// #                     [0.0,  0.01]]]
+// # 'input' has 5 nonzero values, so output has 5 coordinates.
+// # 'input' has rank of 3, so coordinates have three indices.
+// where(input) ==> [[0, 0, 0],
+//                   [0, 1, 0],
+//                   [1, 0, 1],
+//                   [1, 1, 1],
+//                   [2, 1, 1]]
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentSum(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
+// # `condition` tensor is [[[1.5 + 0.0j, 0.0  + 0.0j]
+// #                     [0.0 + 0.5j, 0.0  + 0.0j]]
+// #                    [[0.0 + 0.0j, 0.25 + 1.5j]
+// #                     [0.0 + 0.0j, 0.75 + 0.0j]]
+// #                    [[0.0 + 0.0j, 0.0  + 0.0j]
+// #                     [0.0 + 0.0j, 0.01 + 0.0j]]]
+// # 'input' has 5 nonzero magnitude values, so output has 5 coordinates.
+// # 'input' has rank of 3, so coordinates have three indices.
+// where(input) ==> [[0, 0, 0],
+//                   [0, 1, 0],
+//                   [1, 0, 1],
+//                   [1, 1, 1],
+//                   [2, 1, 1]]
+// ```
+func Where(scope *Scope, condition tf.Output) (index tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSum",
+		Type: "Where",
 		Input: []tf.Input{
-			data, indices, segment_ids,
+			condition,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes natural logarithm of x element-wise.
+// QueueDequeueV2Attr is an optional argument to QueueDequeueV2.
+type QueueDequeueV2Attr func(optionalAttr)
+
+// QueueDequeueV2TimeoutMs sets the optional timeout_ms attribute to value.
 //
-// I.e., \\(y = \log_e x\\).
-func Log(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: If the queue is empty, this operation will block for up to
+// timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueDequeueV2TimeoutMs(value int64) QueueDequeueV2Attr {
+	return func(m optionalAttr) {
+		m["timeout_ms"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "Log",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Rounds the values of a tensor to the nearest integer, element-wise.
+// Dequeues a tuple of one or more tensors from the given queue.
 //
-// Rounds half to even.  Also known as bankers rounding. If you want to round
-// according to the current system rounding mode use std::cint.
-func Round(scope *Scope, x tf.Output) (y tf.Output) {
+// This operation has k outputs, where k is the number of components
+// in the tuples stored in the given queue, and output i is the ith
+// component of the dequeued tuple.
+//
+// N.B. If the queue is empty, this operation will block until an element
+// has been dequeued (or 'timeout_ms' elapses, if specified).
+//
+// Arguments:
+//	handle: The handle to a queue.
+//	component_types: The type of each component in a tuple.
+//
+// Returns One or more tensors that were dequeued as a tuple.
+func QueueDequeueV2(scope *Scope, handle tf.Output, component_types []tf.DataType, optional ...QueueDequeueV2Attr) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"component_types": component_types}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Round",
+		Type: "QueueDequeueV2",
 		Input: []tf.Input{
-			x,
+			handle,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes reciprocal of square root of x element-wise.
-//
-// I.e., \\(y = 1 / \sqrt{x}\\).
-func Rsqrt(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "Rsqrt",
-		Input: []tf.Input{
-			x,
-		},
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("QueueDequeueV2", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return components
 }
 
-// MatrixInverseAttr is an optional argument to MatrixInverse.
-type MatrixInverseAttr func(optionalAttr)
+// QueueEnqueueV2Attr is an optional argument to QueueEnqueueV2.
+type QueueEnqueueV2Attr func(optionalAttr)
 
-// MatrixInverseAdjoint sets the optional adjoint attribute to value.
-// If not specified, defaults to false
-func MatrixInverseAdjoint(value bool) MatrixInverseAttr {
+// QueueEnqueueV2TimeoutMs sets the optional timeout_ms attribute to value.
+//
+// value: If the queue is full, this operation will block for up to
+// timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueEnqueueV2TimeoutMs(value int64) QueueEnqueueV2Attr {
 	return func(m optionalAttr) {
-		m["adjoint"] = value
+		m["timeout_ms"] = value
 	}
 }
 
-// Computes the inverse of one or more square invertible matrices or their
-//
-// adjoints (conjugate transposes).
-//
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. The output is a tensor of the same shape as the input
-// containing the inverse for all input submatrices `[..., :, :]`.
+// Enqueues a tuple of one or more tensors in the given queue.
 //
-// The op uses LU decomposition with partial pivoting to compute the inverses.
+// The components input has k elements, which correspond to the components of
+// tuples stored in the given queue.
 //
-// If a matrix is not invertible there is no guarantee what the op does. It
-// may detect the condition and raise an exception or it may simply return a
-// garbage result.
+// N.B. If the queue is full, this operation will block until the given
+// element has been enqueued (or 'timeout_ms' elapses, if specified).
 //
 // Arguments:
-//	input: Shape is `[..., M, M]`.
-//
-// Returns Shape is `[..., M, M]`.
+//	handle: The handle to a queue.
+//	components: One or more tensors from which the enqueued tensors should be taken.
 //
-// @compatibility(numpy)
-// Equivalent to np.linalg.inv
-// @end_compatibility
-func MatrixInverse(scope *Scope, input tf.Output, optional ...MatrixInverseAttr) (output tf.Output) {
+// Returns the created operation.
+func QueueEnqueueV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -7155,258 +7051,492 @@ func MatrixInverse(scope *Scope, input tf.Output, optional ...MatrixInverseAttr)
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixInverse",
+		Type: "QueueEnqueueV2",
 		Input: []tf.Input{
-			input,
+			handle, tf.OutputList(components),
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Returns x + y element-wise.
+// MfccAttr is an optional argument to Mfcc.
+type MfccAttr func(optionalAttr)
+
+// MfccUpperFrequencyLimit sets the optional upper_frequency_limit attribute to value.
 //
-// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Add(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: The highest frequency to use when calculating the
+// ceptstrum.
+// If not specified, defaults to 4000
+func MfccUpperFrequencyLimit(value float32) MfccAttr {
+	return func(m optionalAttr) {
+		m["upper_frequency_limit"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "Add",
-		Input: []tf.Input{
-			x, y,
-		},
+}
+
+// MfccLowerFrequencyLimit sets the optional lower_frequency_limit attribute to value.
+//
+// value: The lowest frequency to use when calculating the
+// ceptstrum.
+// If not specified, defaults to 20
+func MfccLowerFrequencyLimit(value float32) MfccAttr {
+	return func(m optionalAttr) {
+		m["lower_frequency_limit"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the derivative of a Gamma random sample w.r.t. `alpha`.
-func RandomGammaGrad(scope *Scope, alpha tf.Output, sample tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
+// MfccFilterbankChannelCount sets the optional filterbank_channel_count attribute to value.
+//
+// value: Resolution of the Mel bank used internally.
+// If not specified, defaults to 40
+func MfccFilterbankChannelCount(value int64) MfccAttr {
+	return func(m optionalAttr) {
+		m["filterbank_channel_count"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "RandomGammaGrad",
-		Input: []tf.Input{
-			alpha, sample,
-		},
+}
+
+// MfccDctCoefficientCount sets the optional dct_coefficient_count attribute to value.
+//
+// value: How many output channels to produce per time slice.
+// If not specified, defaults to 13
+func MfccDctCoefficientCount(value int64) MfccAttr {
+	return func(m optionalAttr) {
+		m["dct_coefficient_count"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes square of x element-wise.
+// Transforms a spectrogram into a form that's useful for speech recognition.
 //
-// I.e., \\(y = x * x = x^2\\).
-func Square(scope *Scope, x tf.Output) (y tf.Output) {
+// Mel Frequency Cepstral Coefficients are a way of representing audio data that's
+// been effective as an input feature for machine learning. They are created by
+// taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of the
+// higher frequencies that are less significant to the human ear. They have a long
+// history in the speech recognition world, and https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
+// is a good resource to learn more.
+//
+// Arguments:
+//	spectrogram: Typically produced by the Spectrogram op, with magnitude_squared
+// set to true.
+//	sample_rate: How many samples per second the source audio used.
+func Mfcc(scope *Scope, spectrogram tf.Output, sample_rate tf.Output, optional ...MfccAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Square",
+		Type: "Mfcc",
 		Input: []tf.Input{
-			x,
+			spectrogram, sample_rate,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes exponential linear: `exp(features) - 1` if < 0, `features` otherwise.
+// PaddingFIFOQueueV2Attr is an optional argument to PaddingFIFOQueueV2.
+type PaddingFIFOQueueV2Attr func(optionalAttr)
+
+// PaddingFIFOQueueV2Shapes sets the optional shapes attribute to value.
 //
-// See [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)
-// ](http://arxiv.org/abs/1511.07289)
-func Elu(scope *Scope, features tf.Output) (activations tf.Output) {
+// value: The shape of each component in a value. The length of this attr must
+// be either 0 or the same as the length of component_types.
+// Shapes of fixed rank but variable size are allowed by setting
+// any shape dimension to -1.  In this case, the inputs' shape may vary along
+// the given dimension, and DequeueMany will pad the given dimension with
+// zeros up to the maximum shape of all elements in the given batch.
+// If the length of this attr is 0, different queue elements may have
+// different ranks and shapes, but only one element may be dequeued at a time.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func PaddingFIFOQueueV2Shapes(value []tf.Shape) PaddingFIFOQueueV2Attr {
+	return func(m optionalAttr) {
+		m["shapes"] = value
+	}
+}
+
+// PaddingFIFOQueueV2Capacity sets the optional capacity attribute to value.
+//
+// value: The upper bound on the number of elements in this queue.
+// Negative numbers mean no limit.
+// If not specified, defaults to -1
+func PaddingFIFOQueueV2Capacity(value int64) PaddingFIFOQueueV2Attr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// PaddingFIFOQueueV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this queue is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func PaddingFIFOQueueV2Container(value string) PaddingFIFOQueueV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// PaddingFIFOQueueV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this queue will be shared under the given name
+// across multiple sessions.
+// If not specified, defaults to ""
+func PaddingFIFOQueueV2SharedName(value string) PaddingFIFOQueueV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A queue that produces elements in first-in first-out order.
+//
+// Variable-size shapes are allowed by setting the corresponding shape dimensions
+// to 0 in the shape attr.  In this case DequeueMany will pad up to the maximum
+// size of any given element in the minibatch.  See below for details.
+//
+// Arguments:
+//	component_types: The type of each component in a value.
+//
+// Returns The handle to the queue.
+func PaddingFIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...PaddingFIFOQueueV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"component_types": component_types}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Elu",
-		Input: []tf.Input{
-			features,
-		},
+		Type: "PaddingFIFOQueueV2",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the reciprocal of x element-wise.
+// Interleave the values from the `data` tensors into a single tensor.
 //
-// I.e., \\(y = 1 / x\\).
-func Reciprocal(scope *Scope, x tf.Output) (y tf.Output) {
+// Builds a merged tensor such that
+//
+// ```python
+//     merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]
+// ```
+//
+// For example, if each `indices[m]` is scalar or vector, we have
+//
+// ```python
+//     # Scalar indices:
+//     merged[indices[m], ...] = data[m][...]
+//
+//     # Vector indices:
+//     merged[indices[m][i], ...] = data[m][i, ...]
+// ```
+//
+// Each `data[i].shape` must start with the corresponding `indices[i].shape`,
+// and the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we
+// must have `data[i].shape = indices[i].shape + constant`.  In terms of this
+// `constant`, the output shape is
+//
+//     merged.shape = [max(indices)] + constant
+//
+// Values may be merged in parallel, so if an index appears in both `indices[m][i]`
+// and `indices[n][j]`, the result may be invalid. This differs from the normal
+// DynamicStitch operator that defines the behavior in that case.
+//
+// For example:
+//
+// ```python
+//     indices[0] = 6
+//     indices[1] = [4, 1]
+//     indices[2] = [[5, 2], [0, 3]]
+//     data[0] = [61, 62]
+//     data[1] = [[41, 42], [11, 12]]
+//     data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]
+//     merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],
+//               [51, 52], [61, 62]]
+// ```
+//
+// This method can be used to merge partitions created by `dynamic_partition`
+// as illustrated on the following example:
+//
+// ```python
+//     # Apply function (increments x_i) on elements for which a certain condition
+//     # apply (x_i != -1 in this example).
+//     x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])
+//     condition_mask=tf.not_equal(x,tf.constant(-1.))
+//     partitioned_data = tf.dynamic_partition(
+//         x, tf.cast(condition_mask, tf.int32) , 2)
+//     partitioned_data[1] = partitioned_data[1] + 1.0
+//     condition_indices = tf.dynamic_partition(
+//         tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
+//     x = tf.dynamic_stitch(condition_indices, partitioned_data)
+//     # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
+//     # unchanged.
+// ```
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
+// </div>
+func ParallelDynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Reciprocal",
+		Type: "ParallelDynamicStitch",
 		Input: []tf.Input{
-			x,
+			tf.OutputList(indices), tf.OutputList(data),
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns a batched matrix tensor with new batched diagonal values.
+// Partitions `data` into `num_partitions` tensors using indices from `partitions`.
 //
-// Given `input` and `diagonal`, this operation returns a tensor with the
-// same shape and values as `input`, except for the main diagonal of the
-// innermost matrices.  These will be overwritten by the values in `diagonal`.
+// For each index tuple `js` of size `partitions.ndim`, the slice `data[js, ...]`
+// becomes part of `outputs[partitions[js]]`.  The slices with `partitions[js] = i`
+// are placed in `outputs[i]` in lexicographic order of `js`, and the first
+// dimension of `outputs[i]` is the number of entries in `partitions` equal to `i`.
+// In detail,
 //
-// The output is computed as follows:
+// ```python
+//     outputs[i].shape = [sum(partitions == i)] + data.shape[partitions.ndim:]
 //
-// Assume `input` has `k+1` dimensions `[I, J, K, ..., M, N]` and `diagonal` has
-// `k` dimensions `[I, J, K, ..., min(M, N)]`.  Then the output is a
-// tensor of rank `k+1` with dimensions `[I, J, K, ..., M, N]` where:
+//     outputs[i] = pack([data[js, ...] for js if partitions[js] == i])
+// ```
 //
-//   * `output[i, j, k, ..., m, n] = diagonal[i, j, k, ..., n]` for `m == n`.
-//   * `output[i, j, k, ..., m, n] = input[i, j, k, ..., m, n]` for `m != n`.
+// `data.shape` must start with `partitions.shape`.
+//
+// For example:
+//
+// ```python
+//     # Scalar partitions.
+//     partitions = 1
+//     num_partitions = 2
+//     data = [10, 20]
+//     outputs[0] = []  # Empty with shape [0, 2]
+//     outputs[1] = [[10, 20]]
+//
+//     # Vector partitions.
+//     partitions = [0, 0, 1, 1, 0]
+//     num_partitions = 2
+//     data = [10, 20, 30, 40, 50]
+//     outputs[0] = [10, 20, 50]
+//     outputs[1] = [30, 40]
+// ```
+//
+// See `dynamic_stitch` for an example on how to merge partitions back.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicPartition.png" alt>
+// </div>
 //
 // Arguments:
-//	input: Rank `k+1`, where `k >= 1`.
-//	diagonal: Rank `k`, where `k >= 1`.
 //
-// Returns Rank `k+1`, with `output.shape = input.shape`.
-func MatrixSetDiag(scope *Scope, input tf.Output, diagonal tf.Output) (output tf.Output) {
+//	partitions: Any shape.  Indices in the range `[0, num_partitions)`.
+//	num_partitions: The number of partitions to output.
+func DynamicPartition(scope *Scope, data tf.Output, partitions tf.Output, num_partitions int64) (outputs []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_partitions": num_partitions}
 	opspec := tf.OpSpec{
-		Type: "MatrixSetDiag",
+		Type: "DynamicPartition",
 		Input: []tf.Input{
-			input, diagonal,
+			data, partitions,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
+		scope.UpdateErr("DynamicPartition", err)
+		return
+	}
+	return outputs
 }
 
-// Returns the element-wise max of two SparseTensors.
-//
-// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
+// Produces a string handle for the given MultiDeviceIterator.
 //
 // Arguments:
-//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, in the canonical lexicographic ordering.
-//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
-//	a_shape: 1-D.  Shape of the input SparseTensor.
-//	b_indices: counterpart to `a_indices` for the other operand.
-//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
-//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
+//	multi_device_iterator: A MultiDeviceIterator resource.
 //
-// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
-func SparseSparseMaximum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
+// Returns A string representing the resource.
+func MultiDeviceIteratorToStringHandle(scope *Scope, multi_device_iterator tf.Output) (string_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSparseMaximum",
+		Type: "MultiDeviceIteratorToStringHandle",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
+			multi_device_iterator,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Computes the reciprocal of x element-wise.
+// Checks whether a tree has been initialized.
 //
-// I.e., \\(y = 1 / x\\).
-func Inv(scope *Scope, x tf.Output) (y tf.Output) {
+// Arguments:
+//	tree_handle: Handle to the tree.
+//
+// Returns Whether the tree is initialized.
+func TensorForestTreeIsInitializedOp(scope *Scope, tree_handle tf.Output) (is_initialized tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Inv",
+		Type: "TensorForestTreeIsInitializedOp",
 		Input: []tf.Input{
-			x,
+			tree_handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ComplexAbsAttr is an optional argument to ComplexAbs.
-type ComplexAbsAttr func(optionalAttr)
-
-// ComplexAbsTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func ComplexAbsTout(value tf.DataType) ComplexAbsAttr {
-	return func(m optionalAttr) {
-		m["Tout"] = value
-	}
-}
-
-// Computes the complex absolute value of a tensor.
+// Gets next element for the provided shard number.
 //
-// Given a tensor `x` of complex numbers, this operation returns a tensor of type
-// `float` or `double` that is the absolute value of each element in `x`. All
-// elements in `x` must be complex numbers of the form \\(a + bj\\). The absolute
-// value is computed as \\( \sqrt{a^2 + b^2}\\).
-func ComplexAbs(scope *Scope, x tf.Output, optional ...ComplexAbsAttr) (y tf.Output) {
+// Arguments:
+//	multi_device_iterator: A MultiDeviceIterator resource.
+//	shard_num: Integer representing which shard to fetch data for.
+//	incarnation_id: Which incarnation of the MultiDeviceIterator is running.
+//	output_types: The type list for the return values.
+//	output_shapes: The list of shapes being produced.
+//
+// Returns Result of the get_next on the dataset.
+func MultiDeviceIteratorGetNextFromShard(scope *Scope, multi_device_iterator tf.Output, shard_num tf.Output, incarnation_id tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ComplexAbs",
+		Type: "MultiDeviceIteratorGetNextFromShard",
 		Input: []tf.Input{
-			x,
+			multi_device_iterator, shard_num, incarnation_id,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("MultiDeviceIteratorGetNextFromShard", err)
+		return
+	}
+	return components
 }
 
-// Returns the truth value of x AND y element-wise.
+// Initializes the multi device iterator with the given dataset.
 //
-// *NOTE*: `LogicalAnd` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func LogicalAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Arguments:
+//	dataset: Dataset to be iterated upon.
+//	multi_device_iterator: A MultiDeviceIteratorResource.
+//	max_buffer_size: The maximum size of the host side per device buffer to keep.
+//
+// Returns An int64 indicating which incarnation of the MultiDeviceIterator
+// is running.
+func MultiDeviceIteratorInit(scope *Scope, dataset tf.Output, multi_device_iterator tf.Output, max_buffer_size tf.Output) (incarnation_id tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LogicalAnd",
+		Type: "MultiDeviceIteratorInit",
 		Input: []tf.Input{
-			x, y,
+			dataset, multi_device_iterator, max_buffer_size,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// CastAttr is an optional argument to Cast.
-type CastAttr func(optionalAttr)
-
-// CastTruncate sets the optional Truncate attribute to value.
-// If not specified, defaults to false
-func CastTruncate(value bool) CastAttr {
-	return func(m optionalAttr) {
-		m["Truncate"] = value
-	}
-}
-
-// Cast x of type SrcT to y of DstT.
-func Cast(scope *Scope, x tf.Output, DstT tf.DataType, optional ...CastAttr) (y tf.Output) {
+// Copy a tensor setting everything outside a central band in each innermost matrix
+//
+// to zero.
+//
+// The `band` part is computed as follows:
+// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
+// tensor with the same shape where
+//
+// `band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`.
+//
+// The indicator function
+//
+// `in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) &&
+//                  (num_upper < 0 || (n-m) <= num_upper)`.
+//
+// For example:
+//
+// ```
+// # if 'input' is [[ 0,  1,  2, 3]
+//                  [-1,  0,  1, 2]
+//                  [-2, -1,  0, 1]
+//                  [-3, -2, -1, 0]],
+//
+// tf.matrix_band_part(input, 1, -1) ==> [[ 0,  1,  2, 3]
+//                                        [-1,  0,  1, 2]
+//                                        [ 0, -1,  0, 1]
+//                                        [ 0,  0, -1, 0]],
+//
+// tf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]
+//                                       [-1,  0,  1, 0]
+//                                       [-2, -1,  0, 1]
+//                                       [ 0, -2, -1, 0]]
+// ```
+//
+// Useful special cases:
+//
+// ```
+//  tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.
+//  tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.
+//  tf.matrix_band_part(input, 0, 0) ==> Diagonal.
+// ```
+//
+// Arguments:
+//	input: Rank `k` tensor.
+//	num_lower: 0-D tensor. Number of subdiagonals to keep. If negative, keep entire
+// lower triangle.
+//	num_upper: 0-D tensor. Number of superdiagonals to keep. If negative, keep
+// entire upper triangle.
+//
+// Returns Rank `k` tensor of the same shape as input. The extracted banded tensor.
+func MatrixBandPart(scope *Scope, input tf.Output, num_lower tf.Output, num_upper tf.Output) (band tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"DstT": DstT}
-	for _, a := range optional {
-		a(attrs)
+	opspec := tf.OpSpec{
+		Type: "MatrixBandPart",
+		Input: []tf.Input{
+			input, num_lower, num_upper,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Gets the next output from the given iterator as an Optional variant.
+func IteratorGetNextAsOptional(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (optional tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Cast",
+		Type: "IteratorGetNextAsOptional",
 		Input: []tf.Input{
-			x,
+			iterator,
 		},
 		Attrs: attrs,
 	}
@@ -7414,6 +7544,32 @@ func Cast(scope *Scope, x tf.Output, DstT tf.DataType, optional ...CastAttr) (y
 	return op.Output(0)
 }
 
+// Returns the value stored in an Optional variant or raises an error if none exists.
+func OptionalGetValue(scope *Scope, optional tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "OptionalGetValue",
+		Input: []tf.Input{
+			optional,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("OptionalGetValue", err)
+		return
+	}
+	return components
+}
+
 // Outputs a tensor containing the reduction across all input tensors.
 //
 // Outputs a tensor containing the reduction across all input tensors passed to ops
@@ -7465,7 +7621,7 @@ func RegexReplaceReplaceGlobal(value bool) RegexReplaceAttr {
 // Arguments:
 //	input: The text to be processed.
 //	pattern: The regular expression to match the input.
-//	rewrite: The rewrite to be applied to the matched expresion.
+//	rewrite: The rewrite to be applied to the matched expression.
 //
 // Returns The text after applying pattern and rewrite.
 func RegexReplace(scope *Scope, input tf.Output, pattern tf.Output, rewrite tf.Output, optional ...RegexReplaceAttr) (output tf.Output) {
@@ -7925,24 +8081,6 @@ func LogSoftmax(scope *Scope, logits tf.Output) (logsoftmax tf.Output) {
 	return op.Output(0)
 }
 
-// Returns the truth value of (x <= y) element-wise.
-//
-// *NOTE*: `LessEqual` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func LessEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "LessEqual",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Computes softmax activations.
 //
 // For each batch `i` and class `j` we have
@@ -8012,85 +8150,33 @@ func DecodeBmp(scope *Scope, contents tf.Output, optional ...DecodeBmpAttr) (ima
 	return op.Output(0)
 }
 
-// BatchMatMulAttr is an optional argument to BatchMatMul.
-type BatchMatMulAttr func(optionalAttr)
-
-// BatchMatMulAdjX sets the optional adj_x attribute to value.
-//
-// value: If `True`, adjoint the slices of `x`. Defaults to `False`.
-// If not specified, defaults to false
-func BatchMatMulAdjX(value bool) BatchMatMulAttr {
-	return func(m optionalAttr) {
-		m["adj_x"] = value
-	}
-}
-
-// BatchMatMulAdjY sets the optional adj_y attribute to value.
-//
-// value: If `True`, adjoint the slices of `y`. Defaults to `False`.
-// If not specified, defaults to false
-func BatchMatMulAdjY(value bool) BatchMatMulAttr {
-	return func(m optionalAttr) {
-		m["adj_y"] = value
-	}
-}
-
-// Multiplies slices of two tensors in batches.
-//
-// Multiplies all slices of `Tensor` `x` and `y` (each slice can be
-// viewed as an element of a batch), and arranges the individual results
-// in a single output tensor of the same batch size. Each of the
-// individual slices can optionally be adjointed (to adjoint a matrix
-// means to transpose and conjugate it) before multiplication by setting
-// the `adj_x` or `adj_y` flag to `True`, which are by default `False`.
-//
-// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`
-// and `[..., r_y, c_y]`.
-//
-// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where:
-//
-//     r_o = c_x if adj_x else r_x
-//     c_o = r_y if adj_y else c_y
-//
-// It is computed as:
-//
-//     output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
-//
-// Arguments:
-//	x: 2-D or higher with shape `[..., r_x, c_x]`.
-//	y: 2-D or higher with shape `[..., r_y, c_y]`.
+// Computes exponential linear: `exp(features) - 1` if < 0, `features` otherwise.
 //
-// Returns 3-D or higher with shape `[..., r_o, c_o]`
-func BatchMatMul(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMulAttr) (output tf.Output) {
+// See [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)
+// ](http://arxiv.org/abs/1511.07289)
+func Elu(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "BatchMatMul",
+		Type: "Elu",
 		Input: []tf.Input{
-			x, y,
+			features,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns which elements of x are NaN.
+// Computes square of x element-wise.
 //
-// @compatibility(numpy)
-// Equivalent to np.isnan
-// @end_compatibility
-func IsNan(scope *Scope, x tf.Output) (y tf.Output) {
+// I.e., \\(y = x * x = x^2\\).
+func Square(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IsNan",
+		Type: "Square",
 		Input: []tf.Input{
 			x,
 		},
@@ -8099,105 +8185,6 @@ func IsNan(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Identity op for gradient debugging.
-//
-// This op is hidden from public in Python. It is used by TensorFlow Debugger to
-// register gradient tensors for gradient debugging.
-// This op operates on non-reference-type tensors.
-func DebugGradientIdentity(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "DebugGradientIdentity",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceSparseApplyAdadeltaAttr is an optional argument to ResourceSparseApplyAdadelta.
-type ResourceSparseApplyAdadeltaAttr func(optionalAttr)
-
-// ResourceSparseApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
-//
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceSparseApplyAdadeltaUseLocking(value bool) ResourceSparseApplyAdadeltaAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// var: Should be from a Variable().
-//
-// Arguments:
-//
-//	accum: Should be from a Variable().
-//	accum_update: : Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	rho: Decay factor. Must be a scalar.
-//	epsilon: Constant factor. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//
-// Returns the created operation.
-func ResourceSparseApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdadeltaAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyAdadelta",
-		Input: []tf.Input{
-			var_, accum, accum_update, lr, rho, epsilon, grad, indices,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Gets next element for the provided shard number.
-//
-// Arguments:
-//	multi_device_iterator: A MultiDeviceIterator resource.
-//	shard_num: Integer representing which shard to fetch data for.
-//	incarnation_id: Which incarnation of the MultiDeviceIterator is running.
-//	output_types: The type list for the return values.
-//	output_shapes: The list of shapes being produced.
-//
-// Returns Result of the get_next on the dataset.
-func MultiDeviceIteratorGetNextFromShard(scope *Scope, multi_device_iterator tf.Output, shard_num tf.Output, incarnation_id tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "MultiDeviceIteratorGetNextFromShard",
-		Input: []tf.Input{
-			multi_device_iterator, shard_num, incarnation_id,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("MultiDeviceIteratorGetNextFromShard", err)
-		return
-	}
-	return components
-}
-
 // LeakyReluGradAttr is an optional argument to LeakyReluGrad.
 type LeakyReluGradAttr func(optionalAttr)
 
@@ -8236,23 +8223,6 @@ func LeakyReluGrad(scope *Scope, gradients tf.Output, features tf.Output, option
 	return op.Output(0)
 }
 
-// Deprecated. Use TensorArrayGradV3
-//
-// DEPRECATED at GraphDef version 26: Use TensorArrayWriteV3
-func TensorArrayWriteV2(scope *Scope, handle tf.Output, index tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayWriteV2",
-		Input: []tf.Input{
-			handle, index, value, flow_in,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // LeakyReluAttr is an optional argument to LeakyRelu.
 type LeakyReluAttr func(optionalAttr)
 
@@ -8397,7 +8367,7 @@ func SdcaOptimizerV2(scope *Scope, sparse_example_indices []tf.Output, sparse_fe
 // Computes the minimum along segments of a tensor.
 //
 // Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#segmentation)
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 // for an explanation of segments.
 //
 // This operator is similar to the unsorted segment sum operator found
@@ -8411,6 +8381,15 @@ func SdcaOptimizerV2(scope *Scope, sparse_example_indices []tf.Output, sparse_fe
 // possible value for the specific numeric type,
 // `output[i] = numeric_limits<T>::max()`.
 //
+// For example:
+//
+// ``` python
+// c = tf.constant([[1,2,3,4], [5,6,7,8], [4,3,2,1]])
+// tf.unsorted_segment_min(c, tf.constant([0, 1, 0]), num_segments=2)
+// # ==> [[ 1,  2, 2, 1],
+// #       [5,  6, 7, 8]]
+// ```
+//
 // If the given segment ID `i` is negative, then the corresponding value is
 // dropped, and will not be included in the result.
 //
@@ -8458,28 +8437,32 @@ func ReluGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops
 	return op.Output(0)
 }
 
-// Computes the gradient of morphological 2-D dilation with respect to the input.
-//
-// Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
-//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
-//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
-// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
-//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
-// Must be: `[1, rate_height, rate_width, 1]`.
-//	padding: The type of padding algorithm to use.
+// TensorArrayGatherV2Attr is an optional argument to TensorArrayGatherV2.
+type TensorArrayGatherV2Attr func(optionalAttr)
+
+// TensorArrayGatherV2ElementShape sets the optional element_shape attribute to value.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayGatherV2ElementShape(value tf.Shape) TensorArrayGatherV2Attr {
+	return func(m optionalAttr) {
+		m["element_shape"] = value
+	}
+}
+
+// Deprecated. Use TensorArrayGatherV3
 //
-// Returns 4-D with shape `[batch, in_height, in_width, depth]`.
-func Dilation2DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (in_backprop tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArrayGatherV3
+func TensorArrayGatherV2(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV2Attr) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Dilation2DBackpropInput",
+		Type: "TensorArrayGatherV2",
 		Input: []tf.Input{
-			input, filter, out_backprop,
+			handle, indices, flow_in,
 		},
 		Attrs: attrs,
 	}
@@ -8487,52 +8470,41 @@ func Dilation2DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, ou
 	return op.Output(0)
 }
 
-// Compute the polygamma function \\(\psi^{(n)}(x)\\).
-//
-// The polygamma function is defined as:
-//
-//
-// \\(\psi^{(n)}(x) = \frac{d^n}{dx^n} \psi(x)\\)
+// Returns the truth value of (x == y) element-wise.
 //
-// where \\(\psi(x)\\) is the digamma function.
-func Polygamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+// *NOTE*: `Equal` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Equal(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Polygamma",
+		Type: "Equal",
 		Input: []tf.Input{
-			a, x,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes second-order gradients of the maxpooling function.
+// Compute the polygamma function \\(\psi^{(n)}(x)\\).
 //
-// Arguments:
-//	input: The original input.
-//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
-// input of `max_pool`.
-//	argmax: The indices of the maximum values chosen for each output of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+// The polygamma function is defined as:
 //
-// Returns Gradients of gradients w.r.t. the input of `max_pool`.
-func MaxPoolGradGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output) {
+//
+// \\(\psi^{(n)}(x) = \frac{d^n}{dx^n} \psi(x)\\)
+//
+// where \\(\psi(x)\\) is the digamma function.
+func Polygamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolGradGradWithArgmax",
+		Type: "Polygamma",
 		Input: []tf.Input{
-			input, grad, argmax,
+			a, x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
@@ -8660,6 +8632,21 @@ func MutexV2(scope *Scope, optional ...MutexV2Attr) (resource tf.Output) {
 	return op.Output(0)
 }
 
+// Connects N inputs to an N-way replicated TPU computation.
+func TPUReplicatedInput(scope *Scope, inputs []tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TPUReplicatedInput",
+		Input: []tf.Input{
+			tf.OutputList(inputs),
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // AvgPool3DAttr is an optional argument to AvgPool3D.
 type AvgPool3DAttr func(optionalAttr)
 
@@ -8707,98 +8694,6 @@ func AvgPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, pa
 	return op.Output(0)
 }
 
-// Returns element-wise remainder of division. This emulates C semantics in that
-//
-// the result here is consistent with a truncating divide. E.g.
-// `tf.truncatediv(x, y) * y + truncate_mod(x, y) = x`.
-//
-// *NOTE*: `Mod` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Mod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Mod",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes offsets of concat inputs within its output.
-//
-// For example:
-//
-// ```
-// # 'x' is [2, 2, 7]
-// # 'y' is [2, 3, 7]
-// # 'z' is [2, 5, 7]
-// concat_offset(2, [x, y, z]) => [0, 0, 0], [0, 2, 0], [0, 5, 0]
-// ```
-//
-// This is typically used by gradient computations for a concat operation.
-//
-// Arguments:
-//	concat_dim: The dimension along which to concatenate.
-//	shape: The `N` int32 vectors representing shape of tensors being concatenated.
-//
-// Returns The `N` int32 vectors representing the starting offset
-// of input tensors within the concatenated output.
-func ConcatOffset(scope *Scope, concat_dim tf.Output, shape []tf.Output) (offset []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ConcatOffset",
-		Input: []tf.Input{
-			concat_dim, tf.OutputList(shape),
-		},
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if offset, idx, err = makeOutputList(op, idx, "offset"); err != nil {
-		scope.UpdateErr("ConcatOffset", err)
-		return
-	}
-	return offset
-}
-
-// Compute the lower regularized incomplete Gamma function `P(a, x)`.
-//
-// The lower regularized incomplete Gamma function is defined as:
-//
-//
-// \\(P(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)\\)
-//
-// where
-//
-// \\(gamma(a, x) = \\int_{0}^{x} t^{a-1} exp(-t) dt\\)
-//
-// is the lower incomplete Gamma function.
-//
-// Note, above `Q(a, x)` (`Igammac`) is the upper regularized complete
-// Gamma function.
-func Igamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Igamma",
-		Input: []tf.Input{
-			a, x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // DepthToSpaceAttr is an optional argument to DepthToSpace.
 type DepthToSpaceAttr func(optionalAttr)
 
@@ -8987,37 +8882,20 @@ func Conv3DBackpropInputV2(scope *Scope, input_sizes tf.Output, filter tf.Output
 	return op.Output(0)
 }
 
-// Computes square root of x element-wise.
-//
-// I.e., \\(y = \sqrt{x} = x^{1/2}\\).
-func Sqrt(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Sqrt",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Conv3DBackpropFilterAttr is an optional argument to Conv3DBackpropFilter.
-type Conv3DBackpropFilterAttr func(optionalAttr)
+// Conv3DBackpropInputAttr is an optional argument to Conv3DBackpropInput.
+type Conv3DBackpropInputAttr func(optionalAttr)
 
-// Conv3DBackpropFilterDilations sets the optional dilations attribute to value.
+// Conv3DBackpropInputDilations sets the optional dilations attribute to value.
 // If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
-func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr {
+func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr {
 	return func(m optionalAttr) {
 		m["dilations"] = value
 	}
 }
 
-// Computes the gradients of 3-D convolution with respect to the filter.
+// Computes the gradients of 3-D convolution with respect to the input.
 //
-// DEPRECATED at GraphDef version 10: Use Conv3DBackpropFilterV2
+// DEPRECATED at GraphDef version 10: Use Conv3DBackpropInputV2
 //
 // Arguments:
 //	input: Shape `[batch, depth, rows, cols, in_channels]`.
@@ -9028,7 +8906,7 @@ func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr {
 //	strides: 1-D tensor of length 5. The stride of the sliding window for each
 // dimension of `input`. Must have `strides[0] = strides[4] = 1`.
 //	padding: The type of padding algorithm to use.
-func Conv3DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterAttr) (output tf.Output) {
+func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -9037,7 +8915,7 @@ func Conv3DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_b
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropFilter",
+		Type: "Conv3DBackpropInput",
 		Input: []tf.Input{
 			input, filter, out_backprop,
 		},
@@ -9047,24 +8925,6 @@ func Conv3DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_b
 	return op.Output(0)
 }
 
-// Computes the gradient for the rsqrt of `x` wrt its input.
-//
-// Specifically, `grad = dy * -0.5 * y^3`, where `y = rsqrt(x)`, and `dy`
-// is the corresponding input gradient.
-func RsqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "RsqrtGrad",
-		Input: []tf.Input{
-			y, dy,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // DepthwiseConv2dNativeAttr is an optional argument to DepthwiseConv2dNative.
 type DepthwiseConv2dNativeAttr func(optionalAttr)
 
@@ -9142,78 +9002,6 @@ func DepthwiseConv2dNative(scope *Scope, input tf.Output, filter tf.Output, stri
 	return op.Output(0)
 }
 
-// MaxPoolGradV2Attr is an optional argument to MaxPoolGradV2.
-type MaxPoolGradV2Attr func(optionalAttr)
-
-// MaxPoolGradV2DataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolGradV2DataFormat(value string) MaxPoolGradV2Attr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes gradients of the maxpooling function.
-//
-// Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
-//
-// Returns Gradients w.r.t. the input to `max_pool`.
-func MaxPoolGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolGradV2Attr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MaxPoolGradV2",
-		Input: []tf.Input{
-			orig_input, orig_output, grad, ksize, strides,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Restore a reader to a previously saved state.
-//
-// Not all Readers support being restored, so this can produce an
-// Unimplemented error.
-//
-// Arguments:
-//	reader_handle: Handle to a Reader.
-//	state: Result of a ReaderSerializeState of a Reader with type
-// matching reader_handle.
-//
-// Returns the created operation.
-func ReaderRestoreStateV2(scope *Scope, reader_handle tf.Output, state tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ReaderRestoreStateV2",
-		Input: []tf.Input{
-			reader_handle, state,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
 // MaxPoolGradAttr is an optional argument to MaxPoolGrad.
 type MaxPoolGradAttr func(optionalAttr)
 
@@ -9345,6 +9133,236 @@ func CropAndResize(scope *Scope, image tf.Output, boxes tf.Output, box_ind tf.Ou
 	return op.Output(0)
 }
 
+// Conv2DBackpropFilterAttr is an optional argument to Conv2DBackpropFilter.
+type Conv2DBackpropFilterAttr func(optionalAttr)
+
+// Conv2DBackpropFilterUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
+// If not specified, defaults to true
+func Conv2DBackpropFilterUseCudnnOnGpu(value bool) Conv2DBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["use_cudnn_on_gpu"] = value
+	}
+}
+
+// Conv2DBackpropFilterExplicitPaddings sets the optional explicit_paddings attribute to value.
+//
+// value: If `padding` is `"EXPLICIT"`, the list of explicit padding amounts. For the ith
+// dimension, the amount of padding inserted before and after the dimension is
+// `explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If
+// `padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty.
+// If not specified, defaults to <>
+func Conv2DBackpropFilterExplicitPaddings(value []int64) Conv2DBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["explicit_paddings"] = value
+	}
+}
+
+// Conv2DBackpropFilterDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Conv2DBackpropFilterDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes the gradients of convolution with respect to the filter.
+//
+// Arguments:
+//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
+//	filter_sizes: An integer vector representing the tensor shape of `filter`,
+// where `filter` is a 4-D
+// `[filter_height, filter_width, in_channels, out_channels]` tensor.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution. Must be in the same order as the dimension specified with
+// format.
+//	padding: The type of padding algorithm to use.
+//
+// Returns 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
+// the `filter` input of the convolution.
+func Conv2DBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropFilterAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Conv2DBackpropFilter",
+		Input: []tf.Input{
+			input, filter_sizes, out_backprop,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes Psi, the derivative of Lgamma (the log of the absolute value of
+//
+// `Gamma(x)`), element-wise.
+func Digamma(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Digamma",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the number of work units this Reader has finished processing.
+//
+// Arguments:
+//	reader_handle: Handle to a Reader.
+func ReaderNumWorkUnitsCompletedV2(scope *Scope, reader_handle tf.Output) (units_completed tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReaderNumWorkUnitsCompletedV2",
+		Input: []tf.Input{
+			reader_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Conv2DAttr is an optional argument to Conv2D.
+type Conv2DAttr func(optionalAttr)
+
+// Conv2DUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
+// If not specified, defaults to true
+func Conv2DUseCudnnOnGpu(value bool) Conv2DAttr {
+	return func(m optionalAttr) {
+		m["use_cudnn_on_gpu"] = value
+	}
+}
+
+// Conv2DExplicitPaddings sets the optional explicit_paddings attribute to value.
+//
+// value: If `padding` is `"EXPLICIT"`, the list of explicit padding amounts. For the ith
+// dimension, the amount of padding inserted before and after the dimension is
+// `explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If
+// `padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty.
+// If not specified, defaults to <>
+func Conv2DExplicitPaddings(value []int64) Conv2DAttr {
+	return func(m optionalAttr) {
+		m["explicit_paddings"] = value
+	}
+}
+
+// Conv2DDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
+// If not specified, defaults to "NHWC"
+func Conv2DDataFormat(value string) Conv2DAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Conv2DDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func Conv2DDilations(value []int64) Conv2DAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes a 2-D convolution given 4-D `input` and `filter` tensors.
+//
+// Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
+// and a filter / kernel tensor of shape
+// `[filter_height, filter_width, in_channels, out_channels]`, this op
+// performs the following:
+//
+// 1. Flattens the filter to a 2-D matrix with shape
+//    `[filter_height * filter_width * in_channels, output_channels]`.
+// 2. Extracts image patches from the input tensor to form a *virtual*
+//    tensor of shape `[batch, out_height, out_width,
+//    filter_height * filter_width * in_channels]`.
+// 3. For each patch, right-multiplies the filter matrix and the image patch
+//    vector.
+//
+// In detail, with the default NHWC format,
+//
+//     output[b, i, j, k] =
+//         sum_{di, dj, q} input[b, strides[1] * i + di, strides[2] * j + dj, q] *
+//                         filter[di, dj, q, k]
+//
+// Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
+// horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
+//
+// Arguments:
+//	input: A 4-D tensor. The dimension order is interpreted according to the value
+// of `data_format`, see below for details.
+//	filter: A 4-D tensor of shape
+// `[filter_height, filter_width, in_channels, out_channels]`
+//	strides: 1-D tensor of length 4.  The stride of the sliding window for each
+// dimension of `input`. The dimension order is determined by the value of
+// `data_format`, see below for details.
+//	padding: The type of padding algorithm to use.
+//
+// Returns A 4-D tensor. The dimension order is determined by the value of
+// `data_format`, see below for details.
+func Conv2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv2DAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Conv2D",
+		Input: []tf.Input{
+			input, filter,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Fills empty rows in the input 2-D `SparseTensor` with a default value.
 //
 // The input `SparseTensor` is represented via the tuple of inputs
@@ -9408,36 +9426,66 @@ func SparseFillEmptyRows(scope *Scope, indices tf.Output, values tf.Output, dens
 	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// Reduces `input` from `num_devices` using `reduction` to a single device.
+// LoadTPUEmbeddingADAMParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingADAMParametersGradAccumDebug.
+type LoadTPUEmbeddingADAMParametersGradAccumDebugAttr func(optionalAttr)
+
+// LoadTPUEmbeddingADAMParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// Reduces `input` from `num_devices` using `reduction` to a single device.
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingADAMParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingADAMParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// LoadTPUEmbeddingADAMParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingADAMParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingADAMParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load ADAM embedding parameters with debug support.
 //
-// The graph should be constructed so that all inputs have a valid device
-// assignment, and the op itself is assigned one of these devices.
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
 //
-// input: The input to the reduction.
-// data: the value of the reduction across all `num_devices` devices.
-// reduction: the reduction operation to perform.
-func NcclReduce(scope *Scope, input []tf.Output, reduction string) (data tf.Output) {
+// Arguments:
+//	parameters: Value of parameters used in the ADAM optimization algorithm.
+//	momenta: Value of momenta used in the ADAM optimization algorithm.
+//	velocities: Value of velocities used in the ADAM optimization algorithm.
+//	gradient_accumulators: Value of gradient_accumulators used in the ADAM optimization algorithm.
+//
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingADAMParametersGradAccumDebug(scope *Scope, parameters tf.Output, momenta tf.Output, velocities tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingADAMParametersGradAccumDebugAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"reduction": reduction}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "NcclReduce",
+		Type: "LoadTPUEmbeddingADAMParametersGradAccumDebug",
 		Input: []tf.Input{
-			tf.OutputList(input),
+			parameters, momenta, velocities, gradient_accumulators,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// BiasAddGradAttr is an optional argument to BiasAddGrad.
-type BiasAddGradAttr func(optionalAttr)
+// BiasAddAttr is an optional argument to BiasAdd.
+type BiasAddAttr func(optionalAttr)
 
-// BiasAddGradDataFormat sets the optional data_format attribute to value.
+// BiasAddDataFormat sets the optional data_format attribute to value.
 //
 // value: Specify the data format of the input and output data. With the
 // default format "NHWC", the bias tensor will be added to the last dimension
@@ -9447,23 +9495,23 @@ type BiasAddGradAttr func(optionalAttr)
 // The tensor will be added to "in_channels", the third-to-the-last
 //     dimension.
 // If not specified, defaults to "NHWC"
-func BiasAddGradDataFormat(value string) BiasAddGradAttr {
+func BiasAddDataFormat(value string) BiasAddAttr {
 	return func(m optionalAttr) {
 		m["data_format"] = value
 	}
 }
 
-// The backward operation for "BiasAdd" on the "bias" tensor.
+// Adds `bias` to `value`.
 //
-// It accumulates all the values from out_backprop into the feature dimension.
-// For NHWC data format, the feature dimension is the last. For NCHW data format,
-// the feature dimension is the third-to-last.
+// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
+// Broadcasting is supported, so `value` may have any number of dimensions.
 //
 // Arguments:
-//	out_backprop: Any number of dimensions.
+//	value: Any number of dimensions.
+//	bias: 1-D with size the last dimension of `value`.
 //
-// Returns 1-D with size the feature dimension of `out_backprop`.
-func BiasAddGrad(scope *Scope, out_backprop tf.Output, optional ...BiasAddGradAttr) (output tf.Output) {
+// Returns Broadcasted sum of `value` and `bias`.
+func BiasAdd(scope *Scope, value tf.Output, bias tf.Output, optional ...BiasAddAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -9472,9 +9520,9 @@ func BiasAddGrad(scope *Scope, out_backprop tf.Output, optional ...BiasAddGradAt
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "BiasAddGrad",
+		Type: "BiasAdd",
 		Input: []tf.Input{
-			out_backprop,
+			value, bias,
 		},
 		Attrs: attrs,
 	}
@@ -9482,13 +9530,151 @@ func BiasAddGrad(scope *Scope, out_backprop tf.Output, optional ...BiasAddGradAt
 	return op.Output(0)
 }
 
-// Returns 0 if x == 0, and x / y otherwise, elementwise.
-func Xdivy(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// SparseReduceSumSparseAttr is an optional argument to SparseReduceSumSparse.
+type SparseReduceSumSparseAttr func(optionalAttr)
+
+// SparseReduceSumSparseKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceSumSparseKeepDims(value bool) SparseReduceSumSparseAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the sum of elements across dimensions of a SparseTensor.
+//
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_sum()`.  In contrast to SparseReduceSum, this Op returns a
+// SparseTensor.
+//
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
+//
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
+//
+// Arguments:
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+func SparseReduceSumSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Xdivy",
+		Type: "SparseReduceSumSparse",
+		Input: []tf.Input{
+			input_indices, input_values, input_shape, reduction_axes,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// LoadTPUEmbeddingStochasticGradientDescentParametersAttr is an optional argument to LoadTPUEmbeddingStochasticGradientDescentParameters.
+type LoadTPUEmbeddingStochasticGradientDescentParametersAttr func(optionalAttr)
+
+// LoadTPUEmbeddingStochasticGradientDescentParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingStochasticGradientDescentParametersTableId(value int64) LoadTPUEmbeddingStochasticGradientDescentParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// LoadTPUEmbeddingStochasticGradientDescentParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingStochasticGradientDescentParametersTableName(value string) LoadTPUEmbeddingStochasticGradientDescentParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load SGD embedding parameters.
+//
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
+//
+// Arguments:
+//	parameters: Value of parameters used in the stochastic gradient descent optimization algorithm.
+//
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingStochasticGradientDescentParameters(scope *Scope, parameters tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingStochasticGradientDescentParametersAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "LoadTPUEmbeddingStochasticGradientDescentParameters",
+		Input: []tf.Input{
+			parameters,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Selects the k nearest centers for each point.
+//
+// Rows of points are assumed to be input points. Rows of centers are assumed to be
+// the list of candidate centers. For each point, the k centers that have least L2
+// distance to it are computed.
+//
+// Arguments:
+//	points: Matrix of shape (n, d). Rows are assumed to be input points.
+//	centers: Matrix of shape (m, d). Rows are assumed to be centers.
+//	k: Number of nearest centers to return for each point. If k is larger than m, then
+// only m centers are returned.
+//
+// Returns Matrix of shape (n, min(m, k)). Each row contains the indices of the centers
+// closest to the corresponding point, ordered by increasing distance.Matrix of shape (n, min(m, k)). Each row contains the squared L2 distance to the
+// corresponding center in nearest_center_indices.
+func NearestNeighbors(scope *Scope, points tf.Output, centers tf.Output, k tf.Output) (nearest_center_indices tf.Output, nearest_center_distances tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "NearestNeighbors",
+		Input: []tf.Input{
+			points, centers, k,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Returns x * y element-wise.
+//
+// *NOTE*: `Multiply` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Mul(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Mul",
 		Input: []tf.Input{
 			x, y,
 		},
@@ -9664,348 +9850,109 @@ func BiasAddV1(scope *Scope, value tf.Output, bias tf.Output) (output tf.Output)
 	return op.Output(0)
 }
 
-// Shuffle dimensions of x according to a permutation.
-//
-// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
-//   `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
-func Transpose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Transpose",
-		Input: []tf.Input{
-			x, perm,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// MinAttr is an optional argument to Min.
-type MinAttr func(optionalAttr)
-
-// MinKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func MinKeepDims(value bool) MinAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the minimum of elements across dimensions of a tensor.
+// Selects num_to_sample rows of input using the KMeans++ criterion.
 //
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// Rows of points are assumed to be input points. One row is selected at random.
+// Subsequent rows are sampled with probability proportional to the squared L2
+// distance from the nearest row selected thus far till num_to_sample rows have
+// been sampled.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
+//	points: Matrix of shape (n, d). Rows are assumed to be input points.
+//	num_to_sample: Scalar. The number of rows to sample. This value must not be larger than n.
+//	seed: Scalar. Seed for initializing the random number generator.
+//	num_retries_per_sample: Scalar. For each row that is sampled, this parameter
+// specifies the number of additional points to draw from the current
+// distribution before selecting the best. If a negative value is specified, a
+// heuristic is used to sample O(log(num_to_sample)) additional points.
 //
-// Returns The reduced tensor.
-func Min(scope *Scope, input tf.Output, axis tf.Output, optional ...MinAttr) (output tf.Output) {
+// Returns Matrix of shape (num_to_sample, d). The sampled rows.
+func KmeansPlusPlusInitialization(scope *Scope, points tf.Output, num_to_sample tf.Output, seed tf.Output, num_retries_per_sample tf.Output) (samples tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Min",
+		Type: "KmeansPlusPlusInitialization",
 		Input: []tf.Input{
-			input, axis,
+			points, num_to_sample, seed, num_retries_per_sample,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the Bessel i1e function of `x` element-wise.
+// Transforms a Tensor into a serialized TensorProto proto.
 //
-// Exponentially scaled modified Bessel function of order 0 defined as
-// `bessel_i1e(x) = exp(-abs(x)) bessel_i1(x)`.
+// Arguments:
+//	tensor: A Tensor of type `T`.
 //
-// This function is faster and numerically stabler than `bessel_i1(x)`.
-func BesselI1e(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns A serialized TensorProto proto of the input tensor.
+func SerializeTensor(scope *Scope, tensor tf.Output) (serialized tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BesselI1e",
+		Type: "SerializeTensor",
 		Input: []tf.Input{
-			x,
+			tensor,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MapClearAttr is an optional argument to MapClear.
-type MapClearAttr func(optionalAttr)
-
-// MapClearCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapClearCapacity(value int64) MapClearAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// MapClearMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapClearMemoryLimit(value int64) MapClearAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
+// UnbatchGradAttr is an optional argument to UnbatchGrad.
+type UnbatchGradAttr func(optionalAttr)
 
-// MapClearContainer sets the optional container attribute to value.
+// UnbatchGradContainer sets the optional container attribute to value.
 // If not specified, defaults to ""
-func MapClearContainer(value string) MapClearAttr {
+func UnbatchGradContainer(value string) UnbatchGradAttr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// MapClearSharedName sets the optional shared_name attribute to value.
+// UnbatchGradSharedName sets the optional shared_name attribute to value.
 // If not specified, defaults to ""
-func MapClearSharedName(value string) MapClearAttr {
+func UnbatchGradSharedName(value string) UnbatchGradAttr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// Op removes all elements in the underlying container.
+// Gradient of Unbatch.
 //
-// Returns the created operation.
-func MapClear(scope *Scope, dtypes []tf.DataType, optional ...MapClearAttr) (o *tf.Operation) {
+// Acts like Batch but using the given batch_index index of batching things as they
+// become available. This ensures that the gradients are propagated back in the
+// same session which did the forward pass.
+//
+// original_input: The input to the Unbatch operation this is the gradient of.
+// batch_index: The batch_index given to the Unbatch operation this is the gradient
+// of.
+// grad: The downstream gradient.
+// id: The id scalar emitted by Batch.
+// batched_grad: The return value, either an empty tensor or the batched gradient.
+// container: Container to control resource sharing.
+// shared_name: Instances of UnbatchGrad with the same container and shared_name
+//  are assumed to possibly belong to the same batch. If left empty, the op name
+//  will be used as the shared name.
+func UnbatchGrad(scope *Scope, original_input tf.Output, batch_index tf.Output, grad tf.Output, id tf.Output, optional ...UnbatchGradAttr) (batched_grad tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MapClear",
-
+		Type: "UnbatchGrad",
+		Input: []tf.Input{
+			original_input, batch_index, grad, id,
+		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// DecodeCSVAttr is an optional argument to DecodeCSV.
-type DecodeCSVAttr func(optionalAttr)
-
-// DecodeCSVFieldDelim sets the optional field_delim attribute to value.
-//
-// value: char delimiter to separate fields in a record.
-// If not specified, defaults to ","
-func DecodeCSVFieldDelim(value string) DecodeCSVAttr {
-	return func(m optionalAttr) {
-		m["field_delim"] = value
-	}
-}
-
-// DecodeCSVUseQuoteDelim sets the optional use_quote_delim attribute to value.
-//
-// value: If false, treats double quotation marks as regular
-// characters inside of the string fields (ignoring RFC 4180, Section 2,
-// Bullet 5).
-// If not specified, defaults to true
-func DecodeCSVUseQuoteDelim(value bool) DecodeCSVAttr {
-	return func(m optionalAttr) {
-		m["use_quote_delim"] = value
-	}
-}
-
-// DecodeCSVNaValue sets the optional na_value attribute to value.
-//
-// value: Additional string to recognize as NA/NaN.
-// If not specified, defaults to ""
-func DecodeCSVNaValue(value string) DecodeCSVAttr {
-	return func(m optionalAttr) {
-		m["na_value"] = value
-	}
-}
-
-// DecodeCSVSelectCols sets the optional select_cols attribute to value.
-// If not specified, defaults to <>
-func DecodeCSVSelectCols(value []int64) DecodeCSVAttr {
-	return func(m optionalAttr) {
-		m["select_cols"] = value
-	}
-}
-
-// Convert CSV records to tensors. Each column maps to one tensor.
-//
-// RFC 4180 format is expected for the CSV records.
-// (https://tools.ietf.org/html/rfc4180)
-// Note that we allow leading and trailing spaces with int or float field.
-//
-// Arguments:
-//	records: Each string is a record/row in the csv and all records should have
-// the same format.
-//	record_defaults: One tensor per column of the input record, with either a
-// scalar default value for that column or an empty vector if the column is
-// required.
-//
-// Returns Each tensor will have the same shape as records.
-func DecodeCSV(scope *Scope, records tf.Output, record_defaults []tf.Output, optional ...DecodeCSVAttr) (output []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "DecodeCSV",
-		Input: []tf.Input{
-			records, tf.OutputList(record_defaults),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("DecodeCSV", err)
-		return
-	}
-	return output
-}
-
-// Convert JSON-encoded Example records to binary protocol buffer strings.
-//
-// This op translates a tensor containing Example records, encoded using
-// the [standard JSON
-// mapping](https://developers.google.com/protocol-buffers/docs/proto3#json),
-// into a tensor containing the same records encoded as binary protocol
-// buffers. The resulting tensor can then be fed to any of the other
-// Example-parsing ops.
-//
-// Arguments:
-//	json_examples: Each string is a JSON object serialized according to the JSON
-// mapping of the Example proto.
-//
-// Returns Each string is a binary Example protocol buffer corresponding
-// to the respective element of `json_examples`.
-func DecodeJSONExample(scope *Scope, json_examples tf.Output) (binary_examples tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "DecodeJSONExample",
-		Input: []tf.Input{
-			json_examples,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Transforms a Tensor into a serialized TensorProto proto.
-//
-// Arguments:
-//	tensor: A Tensor of type `T`.
-//
-// Returns A serialized TensorProto proto of the input tensor.
-func SerializeTensor(scope *Scope, tensor tf.Output) (serialized tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SerializeTensor",
-		Input: []tf.Input{
-			tensor,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes acos of x element-wise.
-func Acos(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Acos",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// UnbatchGradAttr is an optional argument to UnbatchGrad.
-type UnbatchGradAttr func(optionalAttr)
-
-// UnbatchGradContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func UnbatchGradContainer(value string) UnbatchGradAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// UnbatchGradSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func UnbatchGradSharedName(value string) UnbatchGradAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Gradient of Unbatch.
-//
-// Acts like Batch but using the given batch_index index of batching things as they
-// become available. This ensures that the gradients are propagated back in the
-// same session which did the forward pass.
-//
-// original_input: The input to the Unbatch operation this is the gradient of.
-// batch_index: The batch_index given to the Unbatch operation this is the gradient
-// of.
-// grad: The downstream gradient.
-// id: The id scalar emitted by Batch.
-// batched_grad: The return value, either an empty tensor or the batched gradient.
-// container: Container to control resource sharing.
-// shared_name: Instances of UnbatchGrad with the same container and shared_name
-//  are assumed to possibly belong to the same batch. If left empty, the op name
-//  will be used as the shared name.
-func UnbatchGrad(scope *Scope, original_input tf.Output, batch_index tf.Output, grad tf.Output, id tf.Output, optional ...UnbatchGradAttr) (batched_grad tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "UnbatchGrad",
-		Input: []tf.Input{
-			original_input, batch_index, grad, id,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
 // AvgPool3DGradAttr is an optional argument to AvgPool3DGrad.
@@ -10220,53 +10167,100 @@ func ParseSingleSequenceExample(scope *Scope, serialized tf.Output, feature_list
 	return context_sparse_indices, context_sparse_values, context_sparse_shapes, context_dense_values, feature_list_sparse_indices, feature_list_sparse_values, feature_list_sparse_shapes, feature_list_dense_values
 }
 
-// QuantizeAndDequantizeAttr is an optional argument to QuantizeAndDequantize.
-type QuantizeAndDequantizeAttr func(optionalAttr)
+// SparseToDenseAttr is an optional argument to SparseToDense.
+type SparseToDenseAttr func(optionalAttr)
 
-// QuantizeAndDequantizeSignedInput sets the optional signed_input attribute to value.
+// SparseToDenseValidateIndices sets the optional validate_indices attribute to value.
+//
+// value: If true, indices are checked to make sure they are sorted in
+// lexicographic order and that there are no repeats.
 // If not specified, defaults to true
-func QuantizeAndDequantizeSignedInput(value bool) QuantizeAndDequantizeAttr {
+func SparseToDenseValidateIndices(value bool) SparseToDenseAttr {
 	return func(m optionalAttr) {
-		m["signed_input"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// QuantizeAndDequantizeNumBits sets the optional num_bits attribute to value.
-// If not specified, defaults to 8
-func QuantizeAndDequantizeNumBits(value int64) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["num_bits"] = value
+// Converts a sparse representation into a dense tensor.
+//
+// Builds an array `dense` with shape `output_shape` such that
+//
+// ```
+// # If sparse_indices is scalar
+// dense[i] = (i == sparse_indices ? sparse_values : default_value)
+//
+// # If sparse_indices is a vector, then for each i
+// dense[sparse_indices[i]] = sparse_values[i]
+//
+// # If sparse_indices is an n by d matrix, then for each i in [0, n)
+// dense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i]
+// ```
+//
+// All other values in `dense` are set to `default_value`.  If `sparse_values` is a
+// scalar, all sparse indices are set to this single value.
+//
+// Indices should be sorted in lexicographic order, and indices must not
+// contain any repeats. If `validate_indices` is true, these properties
+// are checked during execution.
+//
+// Arguments:
+//	sparse_indices: 0-D, 1-D, or 2-D.  `sparse_indices[i]` contains the complete
+// index where `sparse_values[i]` will be placed.
+//	output_shape: 1-D.  Shape of the dense output tensor.
+//	sparse_values: 1-D.  Values corresponding to each row of `sparse_indices`,
+// or a scalar value to be used for all sparse indices.
+//	default_value: Scalar value to set for indices not specified in
+// `sparse_indices`.
+//
+// Returns Dense output tensor of shape `output_shape`.
+func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Output, sparse_values tf.Output, default_value tf.Output, optional ...SparseToDenseAttr) (dense tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// QuantizeAndDequantizeRangeGiven sets the optional range_given attribute to value.
-// If not specified, defaults to false
-func QuantizeAndDequantizeRangeGiven(value bool) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["range_given"] = value
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
 	}
-}
-
-// QuantizeAndDequantizeInputMin sets the optional input_min attribute to value.
-// If not specified, defaults to 0
-func QuantizeAndDequantizeInputMin(value float32) QuantizeAndDequantizeAttr {
-	return func(m optionalAttr) {
-		m["input_min"] = value
+	opspec := tf.OpSpec{
+		Type: "SparseToDense",
+		Input: []tf.Input{
+			sparse_indices, output_shape, sparse_values, default_value,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// QuantizeAndDequantizeInputMax sets the optional input_max attribute to value.
-// If not specified, defaults to 0
-func QuantizeAndDequantizeInputMax(value float32) QuantizeAndDequantizeAttr {
+// PreventGradientAttr is an optional argument to PreventGradient.
+type PreventGradientAttr func(optionalAttr)
+
+// PreventGradientMessage sets the optional message attribute to value.
+//
+// value: Will be printed in the error when anyone tries to differentiate
+// this operation.
+// If not specified, defaults to ""
+func PreventGradientMessage(value string) PreventGradientAttr {
 	return func(m optionalAttr) {
-		m["input_max"] = value
+		m["message"] = value
 	}
 }
 
-// Use QuantizeAndDequantizeV2 instead.
+// An identity op that triggers an error if a gradient is requested.
 //
-// DEPRECATED at GraphDef version 22: Replaced by QuantizeAndDequantizeV2
-func QuantizeAndDequantize(scope *Scope, input tf.Output, optional ...QuantizeAndDequantizeAttr) (output tf.Output) {
+// When executed in a graph, this op outputs its input tensor as-is.
+//
+// When building ops to compute gradients, the TensorFlow gradient system
+// will return an error when trying to lookup the gradient of this op,
+// because no gradient must ever be registered for this function.  This
+// op exists to prevent subtle bugs from silently returning unimplemented
+// gradients in some corner cases.
+//
+// Arguments:
+//	input: any tensor.
+//
+// Returns the same input tensor.
+func PreventGradient(scope *Scope, input tf.Output, optional ...PreventGradientAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -10275,7 +10269,7 @@ func QuantizeAndDequantize(scope *Scope, input tf.Output, optional ...QuantizeAn
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizeAndDequantize",
+		Type: "PreventGradient",
 		Input: []tf.Input{
 			input,
 		},
@@ -10285,815 +10279,4388 @@ func QuantizeAndDequantize(scope *Scope, input tf.Output, optional ...QuantizeAn
 	return op.Output(0)
 }
 
-// Returns locations of nonzero / true values in a tensor.
+// Computes asin of x element-wise.
+func Asin(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Asin",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the sum along sparse segments of a tensor.
 //
-// This operation returns the coordinates of true elements in `condition`. The
-// coordinates are returned in a 2-D tensor where the first dimension (rows)
-// represents the number of true elements, and the second dimension (columns)
-// represents the coordinates of the true elements. Keep in mind, the shape of
-// the output tensor can vary depending on how many true values there are in
-// `condition`. Indices are output in row-major order.
+// Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is
+// misisng, the `output` tensor at that position will be zeroed.
+//
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/sparse#Segmentation)
+// for an explanation of segments.
 //
 // For example:
 //
+// ```python
+// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
+//
+// tf.sparse_segment_sum_with_num_segments(
+//     c, tf.constant([0, 1]), tf.constant([0, 0]), num_segments=3)
+// # => [[0 0 0 0]
+// #     [0 0 0 0]
+// #     [0 0 0 0]]
+//
+// tf.sparse_segment_sum_with_num_segments(c,
+//                                         tf.constant([0, 1]),
+//                                         tf.constant([0, 2],
+//                                         num_segments=4))
+// # => [[ 1  2  3  4]
+// #     [ 0  0  0  0]
+// #     [-1 -2 -3 -4]
+// #     [ 0  0  0  0]]
 // ```
-// # 'input' tensor is [[True, False]
-// #                    [True, False]]
-// # 'input' has two true values, so output has two coordinates.
-// # 'input' has rank of 2, so coordinates have two indices.
-// where(input) ==> [[0, 0],
-//                   [1, 0]]
 //
-// # `condition` tensor is [[[True, False]
-// #                     [True, False]]
-// #                    [[False, True]
-// #                     [False, True]]
-// #                    [[False, False]
-// #                     [False, True]]]
-// # 'input' has 5 true values, so output has 5 coordinates.
-// # 'input' has rank of 3, so coordinates have three indices.
-// where(input) ==> [[0, 0, 0],
-//                   [0, 1, 0],
-//                   [1, 0, 1],
-//                   [1, 1, 1],
-//                   [2, 1, 1]]
+// Arguments:
 //
-// # `condition` tensor is [[[1.5,  0.0]
-// #                     [-0.5, 0.0]]
-// #                    [[0.0,  0.25]
-// #                     [0.0,  0.75]]
-// #                    [[0.0,  0.0]
-// #                     [0.0,  0.01]]]
-// # 'input' has 5 nonzero values, so output has 5 coordinates.
-// # 'input' has rank of 3, so coordinates have three indices.
-// where(input) ==> [[0, 0, 0],
-//                   [0, 1, 0],
-//                   [1, 0, 1],
-//                   [1, 1, 1],
-//                   [2, 1, 1]]
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//	num_segments: Should equal the number of distinct segment IDs.
 //
-// # `condition` tensor is [[[1.5 + 0.0j, 0.0  + 0.0j]
-// #                     [0.0 + 0.5j, 0.0  + 0.0j]]
-// #                    [[0.0 + 0.0j, 0.25 + 1.5j]
-// #                     [0.0 + 0.0j, 0.75 + 0.0j]]
-// #                    [[0.0 + 0.0j, 0.0  + 0.0j]
-// #                     [0.0 + 0.0j, 0.01 + 0.0j]]]
-// # 'input' has 5 nonzero magnitude values, so output has 5 coordinates.
-// # 'input' has rank of 3, so coordinates have three indices.
-// where(input) ==> [[0, 0, 0],
-//                   [0, 1, 0],
-//                   [1, 0, 1],
-//                   [1, 1, 1],
-//                   [2, 1, 1]]
-// ```
-func Where(scope *Scope, condition tf.Output) (index tf.Output) {
+// Returns Has same shape as data, except for dimension 0 which
+// has size `num_segments`.
+func SparseSegmentSumWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Where",
+		Type: "SparseSegmentSumWithNumSegments",
 		Input: []tf.Input{
-			condition,
+			data, indices, segment_ids, num_segments,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QueueDequeueV2Attr is an optional argument to QueueDequeueV2.
-type QueueDequeueV2Attr func(optionalAttr)
+// SparseReduceMaxAttr is an optional argument to SparseReduceMax.
+type SparseReduceMaxAttr func(optionalAttr)
 
-// QueueDequeueV2TimeoutMs sets the optional timeout_ms attribute to value.
+// SparseReduceMaxKeepDims sets the optional keep_dims attribute to value.
 //
-// value: If the queue is empty, this operation will block for up to
-// timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueDequeueV2TimeoutMs(value int64) QueueDequeueV2Attr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceMaxKeepDims(value bool) SparseReduceMaxAttr {
 	return func(m optionalAttr) {
-		m["timeout_ms"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Dequeues a tuple of one or more tensors from the given queue.
+// Computes the max of elements across dimensions of a SparseTensor.
 //
-// This operation has k outputs, where k is the number of components
-// in the tuples stored in the given queue, and output i is the ith
-// component of the dequeued tuple.
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_max()`.  In particular, this Op also returns a dense `Tensor`
+// instead of a sparse one.
 //
-// N.B. If the queue is empty, this operation will block until an element
-// has been dequeued (or 'timeout_ms' elapses, if specified).
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
+//
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
 //
 // Arguments:
-//	handle: The handle to a queue.
-//	component_types: The type of each component in a tuple.
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
 //
-// Returns One or more tensors that were dequeued as a tuple.
-func QueueDequeueV2(scope *Scope, handle tf.Output, component_types []tf.DataType, optional ...QueueDequeueV2Attr) (components []tf.Output) {
+// Returns `R-K`-D.  The reduced Tensor.
+func SparseReduceMax(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueDequeueV2",
+		Type: "SparseReduceMax",
 		Input: []tf.Input{
-			handle,
+			input_indices, input_values, input_shape, reduction_axes,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("QueueDequeueV2", err)
-		return
-	}
-	return components
+	return op.Output(0)
 }
 
-// ParseSequenceExampleAttr is an optional argument to ParseSequenceExample.
-type ParseSequenceExampleAttr func(optionalAttr)
+// DecodeRawAttr is an optional argument to DecodeRaw.
+type DecodeRawAttr func(optionalAttr)
 
-// ParseSequenceExampleNcontextSparse sets the optional Ncontext_sparse attribute to value.
-// If not specified, defaults to 0
+// DecodeRawLittleEndian sets the optional little_endian attribute to value.
 //
-// REQUIRES: value >= 0
-func ParseSequenceExampleNcontextSparse(value int64) ParseSequenceExampleAttr {
+// value: Whether the input `bytes` are in little-endian order.
+// Ignored for `out_type` values that are stored in a single byte like
+// `uint8`.
+// If not specified, defaults to true
+func DecodeRawLittleEndian(value bool) DecodeRawAttr {
 	return func(m optionalAttr) {
-		m["Ncontext_sparse"] = value
+		m["little_endian"] = value
 	}
 }
 
-// ParseSequenceExampleNcontextDense sets the optional Ncontext_dense attribute to value.
-// If not specified, defaults to 0
+// Reinterpret the bytes of a string as a vector of numbers.
 //
-// REQUIRES: value >= 0
-func ParseSequenceExampleNcontextDense(value int64) ParseSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["Ncontext_dense"] = value
-	}
-}
-
-// ParseSequenceExampleNfeatureListSparse sets the optional Nfeature_list_sparse attribute to value.
-// If not specified, defaults to 0
+// Arguments:
+//	bytes: All the elements must have the same length.
 //
-// REQUIRES: value >= 0
-func ParseSequenceExampleNfeatureListSparse(value int64) ParseSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["Nfeature_list_sparse"] = value
+//
+// Returns A Tensor with one more dimension than the input `bytes`.  The
+// added dimension will have size equal to the length of the elements
+// of `bytes` divided by the number of bytes to represent `out_type`.
+func DecodeRaw(scope *Scope, bytes tf.Output, out_type tf.DataType, optional ...DecodeRawAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"out_type": out_type}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DecodeRaw",
+		Input: []tf.Input{
+			bytes,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// ParseSequenceExampleNfeatureListDense sets the optional Nfeature_list_dense attribute to value.
-// If not specified, defaults to 0
+// RetrieveTPUEmbeddingADAMParametersAttr is an optional argument to RetrieveTPUEmbeddingADAMParameters.
+type RetrieveTPUEmbeddingADAMParametersAttr func(optionalAttr)
+
+// RetrieveTPUEmbeddingADAMParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// REQUIRES: value >= 0
-func ParseSequenceExampleNfeatureListDense(value int64) ParseSequenceExampleAttr {
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingADAMParametersTableId(value int64) RetrieveTPUEmbeddingADAMParametersAttr {
 	return func(m optionalAttr) {
-		m["Nfeature_list_dense"] = value
+		m["table_id"] = value
 	}
 }
 
-// ParseSequenceExampleContextSparseTypes sets the optional context_sparse_types attribute to value.
+// RetrieveTPUEmbeddingADAMParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingADAMParametersTableName(value string) RetrieveTPUEmbeddingADAMParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Retrieve ADAM embedding parameters.
 //
-// value: A list of Ncontext_sparse types; the data types of data in
-// each context Feature given in context_sparse_keys.
-// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-// If not specified, defaults to <>
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
 //
-// REQUIRES: len(value) >= 0
-func ParseSequenceExampleContextSparseTypes(value []tf.DataType) ParseSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["context_sparse_types"] = value
+// Returns Parameter parameters updated by the ADAM optimization algorithm.Parameter momenta updated by the ADAM optimization algorithm.Parameter velocities updated by the ADAM optimization algorithm.
+func RetrieveTPUEmbeddingADAMParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingADAMParametersAttr) (parameters tf.Output, momenta tf.Output, velocities tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
 	}
+	opspec := tf.OpSpec{
+		Type: "RetrieveTPUEmbeddingADAMParameters",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// ParseSequenceExampleFeatureListDenseTypes sets the optional feature_list_dense_types attribute to value.
-// If not specified, defaults to <>
+// FusedBatchNormAttr is an optional argument to FusedBatchNorm.
+type FusedBatchNormAttr func(optionalAttr)
+
+// FusedBatchNormEpsilon sets the optional epsilon attribute to value.
 //
-// REQUIRES: len(value) >= 0
-func ParseSequenceExampleFeatureListDenseTypes(value []tf.DataType) ParseSequenceExampleAttr {
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormEpsilon(value float32) FusedBatchNormAttr {
 	return func(m optionalAttr) {
-		m["feature_list_dense_types"] = value
+		m["epsilon"] = value
 	}
 }
 
-// ParseSequenceExampleContextDenseShapes sets the optional context_dense_shapes attribute to value.
+// FusedBatchNormDataFormat sets the optional data_format attribute to value.
 //
-// value: A list of Ncontext_dense shapes; the shapes of data in
-// each context Feature given in context_dense_keys.
-// The number of elements in the Feature corresponding to context_dense_key[j]
-// must always equal context_dense_shapes[j].NumEntries().
-// The shape of context_dense_values[j] will match context_dense_shapes[j].
-// If not specified, defaults to <>
+// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormDataFormat(value string) FusedBatchNormAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// FusedBatchNormIsTraining sets the optional is_training attribute to value.
 //
-// REQUIRES: len(value) >= 0
-func ParseSequenceExampleContextDenseShapes(value []tf.Shape) ParseSequenceExampleAttr {
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
+// If not specified, defaults to true
+func FusedBatchNormIsTraining(value bool) FusedBatchNormAttr {
 	return func(m optionalAttr) {
-		m["context_dense_shapes"] = value
+		m["is_training"] = value
 	}
 }
 
-// ParseSequenceExampleFeatureListSparseTypes sets the optional feature_list_sparse_types attribute to value.
+// Batch normalization.
 //
-// value: A list of Nfeature_list_sparse types; the data types
-// of data in each FeatureList given in feature_list_sparse_keys.
-// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-// If not specified, defaults to <>
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
 //
-// REQUIRES: len(value) >= 0
-func ParseSequenceExampleFeatureListSparseTypes(value []tf.DataType) ParseSequenceExampleAttr {
-	return func(m optionalAttr) {
-		m["feature_list_sparse_types"] = value
+// Arguments:
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	offset: A 1D Tensor for offset, to shift to the normalized x.
+//	mean: A 1D Tensor for population mean. Used for inference only;
+// must be empty for training.
+//	variance: A 1D Tensor for population variance. Used for inference only;
+// must be empty for training.
+//
+// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
+// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
+// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
+// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
+// in the cuDNN case), to be reused in the gradient computation.
+func FusedBatchNorm(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormAttr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "FusedBatchNorm",
+		Input: []tf.Input{
+			x, scale, offset, mean, variance,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// ParseSequenceExampleFeatureListDenseShapes sets the optional feature_list_dense_shapes attribute to value.
+// RandomStandardNormalAttr is an optional argument to RandomStandardNormal.
+type RandomStandardNormalAttr func(optionalAttr)
+
+// RandomStandardNormalSeed sets the optional seed attribute to value.
 //
-// value: A list of Nfeature_list_dense shapes; the shapes of
-// data in each FeatureList given in feature_list_dense_keys.
-// The shape of each Feature in the FeatureList corresponding to
-// feature_list_dense_key[j] must always equal
-// feature_list_dense_shapes[j].NumEntries().
-// If not specified, defaults to <>
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomStandardNormalSeed(value int64) RandomStandardNormalAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// RandomStandardNormalSeed2 sets the optional seed2 attribute to value.
 //
-// REQUIRES: len(value) >= 0
-func ParseSequenceExampleFeatureListDenseShapes(value []tf.Shape) ParseSequenceExampleAttr {
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomStandardNormalSeed2(value int64) RandomStandardNormalAttr {
 	return func(m optionalAttr) {
-		m["feature_list_dense_shapes"] = value
+		m["seed2"] = value
 	}
 }
 
-// Transforms a vector of brain.SequenceExample protos (as strings) into typed tensors.
+// Outputs random values from a normal distribution.
+//
+// The generated values will have mean 0 and standard deviation 1.
 //
 // Arguments:
-//	serialized: A vector containing binary serialized SequenceExample protos.
-//	debug_name: A vector containing the names of the serialized protos.
-// May contain, for example, table key (descriptive) name for the
-// corresponding serialized proto.  This is purely useful for debugging
-// purposes, and the presence of values here has no effect on the output.
-// May also be an empty vector if no name is available.
-//	context_dense_defaults: A list of Ncontext_dense Tensors (some may be empty).
-// context_dense_defaults[j] provides default values
-// when the SequenceExample's context map lacks context_dense_key[j].
-// If an empty Tensor is provided for context_dense_defaults[j],
-// then the Feature context_dense_keys[j] is required.
-// The input type is inferred from context_dense_defaults[j], even when it's
-// empty.  If context_dense_defaults[j] is not empty, its shape must match
-// context_dense_shapes[j].
-//	feature_list_dense_missing_assumed_empty: A vector listing the
-// FeatureList keys which may be missing from the SequenceExamples.  If the
-// associated FeatureList is missing, it is treated as empty.  By default,
-// any FeatureList not listed in this vector must exist in the SequenceExamples.
-//	context_sparse_keys: A list of Ncontext_sparse string Tensors (scalars).
-// The keys expected in the Examples' features associated with context_sparse
-// values.
-//	context_dense_keys: A list of Ncontext_dense string Tensors (scalars).
-// The keys expected in the SequenceExamples' context features associated with
-// dense values.
-//	feature_list_sparse_keys: A list of Nfeature_list_sparse string Tensors
-// (scalars).  The keys expected in the FeatureLists associated with sparse
-// values.
-//	feature_list_dense_keys: A list of Nfeature_list_dense string Tensors (scalars).
-// The keys expected in the SequenceExamples' feature_lists associated
-// with lists of dense values.
-func ParseSequenceExample(scope *Scope, serialized tf.Output, debug_name tf.Output, context_dense_defaults []tf.Output, feature_list_dense_missing_assumed_empty []string, context_sparse_keys []string, context_dense_keys []string, feature_list_sparse_keys []string, feature_list_dense_keys []string, optional ...ParseSequenceExampleAttr) (context_sparse_indices []tf.Output, context_sparse_values []tf.Output, context_sparse_shapes []tf.Output, context_dense_values []tf.Output, feature_list_sparse_indices []tf.Output, feature_list_sparse_values []tf.Output, feature_list_sparse_shapes []tf.Output, feature_list_dense_values []tf.Output, feature_list_dense_lengths []tf.Output) {
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
+//
+// Returns A tensor of the specified shape filled with random normal values.
+func RandomStandardNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomStandardNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"feature_list_dense_missing_assumed_empty": feature_list_dense_missing_assumed_empty, "context_sparse_keys": context_sparse_keys, "context_dense_keys": context_dense_keys, "feature_list_sparse_keys": feature_list_sparse_keys, "feature_list_dense_keys": feature_list_dense_keys}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ParseSequenceExample",
+		Type: "RandomStandardNormal",
 		Input: []tf.Input{
-			serialized, debug_name, tf.OutputList(context_dense_defaults),
+			shape,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// FusedResizeAndPadConv2DAttr is an optional argument to FusedResizeAndPadConv2D.
+type FusedResizeAndPadConv2DAttr func(optionalAttr)
+
+// FusedResizeAndPadConv2DResizeAlignCorners sets the optional resize_align_corners attribute to value.
+//
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
+// If not specified, defaults to false
+func FusedResizeAndPadConv2DResizeAlignCorners(value bool) FusedResizeAndPadConv2DAttr {
+	return func(m optionalAttr) {
+		m["resize_align_corners"] = value
+	}
+}
+
+// Performs a resize and padding as a preprocess during a convolution.
+//
+// It's often possible to do spatial transformations more efficiently as part of
+// the packing stage of a convolution, so this op allows for an optimized
+// implementation where these stages are fused together. This prevents the need to
+// write out the intermediate results as whole tensors, reducing memory pressure,
+// and we can get some latency gains by merging the transformation calculations.
+// The data_format attribute for Conv2D isn't supported by this op, and defaults to
+// 'NHWC' order.
+// Internally this op uses a single per-graph scratch buffer, which means that it
+// will block if multiple versions are being run in parallel. This is because this
+// operator is primarily an optimization to minimize memory usage.
+//
+// Arguments:
+//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
+//	size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
+//	paddings: A two-column matrix specifying the padding sizes. The number of
+// rows must be the same as the rank of `input`.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.
+//
+//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
+// of `input`. Must be in the same order as the dimension specified with format.
+//	padding: The type of padding algorithm to use.
+func FusedResizeAndPadConv2D(scope *Scope, input tf.Output, size tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string, optional ...FusedResizeAndPadConv2DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if context_sparse_indices, idx, err = makeOutputList(op, idx, "context_sparse_indices"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
-		return
+	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
 	}
-	if context_sparse_values, idx, err = makeOutputList(op, idx, "context_sparse_values"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
-		return
+	opspec := tf.OpSpec{
+		Type: "FusedResizeAndPadConv2D",
+		Input: []tf.Input{
+			input, size, paddings, filter,
+		},
+		Attrs: attrs,
 	}
-	if context_sparse_shapes, idx, err = makeOutputList(op, idx, "context_sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
-		return
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// RandomUniformAttr is an optional argument to RandomUniform.
+type RandomUniformAttr func(optionalAttr)
+
+// RandomUniformSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomUniformSeed(value int64) RandomUniformAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
 	}
-	if context_dense_values, idx, err = makeOutputList(op, idx, "context_dense_values"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
-		return
+}
+
+// RandomUniformSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomUniformSeed2(value int64) RandomUniformAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
 	}
-	if feature_list_sparse_indices, idx, err = makeOutputList(op, idx, "feature_list_sparse_indices"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
+}
+
+// Outputs random values from a uniform distribution.
+//
+// The generated values follow a uniform distribution in the range `[0, 1)`. The
+// lower bound 0 is included in the range, while the upper bound 1 is excluded.
+//
+// Arguments:
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
+//
+// Returns A tensor of the specified shape filled with uniform random values.
+func RandomUniform(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomUniformAttr) (output tf.Output) {
+	if scope.Err() != nil {
 		return
 	}
-	if feature_list_sparse_values, idx, err = makeOutputList(op, idx, "feature_list_sparse_values"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
-		return
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
 	}
-	if feature_list_sparse_shapes, idx, err = makeOutputList(op, idx, "feature_list_sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
-		return
+	opspec := tf.OpSpec{
+		Type: "RandomUniform",
+		Input: []tf.Input{
+			shape,
+		},
+		Attrs: attrs,
 	}
-	if feature_list_dense_values, idx, err = makeOutputList(op, idx, "feature_list_dense_values"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
-		return
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceApplyFtrlAttr is an optional argument to ResourceApplyFtrl.
+type ResourceApplyFtrlAttr func(optionalAttr)
+
+// ResourceApplyFtrlUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyFtrlUseLocking(value bool) ResourceApplyFtrlAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
 	}
-	if feature_list_dense_lengths, idx, err = makeOutputList(op, idx, "feature_list_dense_lengths"); err != nil {
-		scope.UpdateErr("ParseSequenceExample", err)
+}
+
+// Update '*var' according to the Ftrl-proximal scheme.
+//
+// accum_new = accum + grad * grad
+// linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regulariation. Must be a scalar.
+//	l2: L2 regulariation. Must be a scalar.
+//	lr_power: Scaling factor. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
 		return
 	}
-	return context_sparse_indices, context_sparse_values, context_sparse_shapes, context_dense_values, feature_list_sparse_indices, feature_list_sparse_values, feature_list_sparse_shapes, feature_list_dense_values, feature_list_dense_lengths
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyFtrl",
+		Input: []tf.Input{
+			var_, accum, linear, grad, lr, l1, l2, lr_power,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
 }
 
-// Computes the Gauss error function of `x` element-wise.
-func Erf(scope *Scope, x tf.Output) (y tf.Output) {
+// Transforms a vector of brain.Example protos (as strings) into typed tensors.
+//
+// Arguments:
+//	serialized: A vector containing a batch of binary serialized Example protos.
+//	names: A vector containing the names of the serialized protos.
+// May contain, for example, table key (descriptive) names for the
+// corresponding serialized protos.  These are purely useful for debugging
+// purposes, and the presence of values here has no effect on the output.
+// May also be an empty vector if no names are available.
+// If non-empty, this vector must be the same length as "serialized".
+//	sparse_keys: A list of Nsparse string Tensors (scalars).
+// The keys expected in the Examples' features associated with sparse values.
+//	dense_keys: A list of Ndense string Tensors (scalars).
+// The keys expected in the Examples' features associated with dense values.
+//	dense_defaults: A list of Ndense Tensors (some may be empty).
+// dense_defaults[j] provides default values
+// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
+// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
+// The input type is inferred from dense_defaults[j], even when it's empty.
+// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
+// then the shape of dense_defaults[j] must match that of dense_shapes[j].
+// If dense_shapes[j] has an undefined major dimension (variable strides dense
+// feature), dense_defaults[j] must contain a single element:
+// the padding element.
+//	sparse_types: A list of Nsparse types; the data types of data in each Feature
+// given in sparse_keys.
+// Currently the ParseExample supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+//	dense_shapes: A list of Ndense shapes; the shapes of data in each Feature
+// given in dense_keys.
+// The number of elements in the Feature corresponding to dense_key[j]
+// must always equal dense_shapes[j].NumEntries().
+// If dense_shapes[j] == (D0, D1, ..., DN) then the shape of output
+// Tensor dense_values[j] will be (|serialized|, D0, D1, ..., DN):
+// The dense outputs are just the inputs row-stacked by batch.
+// This works for dense_shapes[j] = (-1, D1, ..., DN).  In this case
+// the shape of the output Tensor dense_values[j] will be
+// (|serialized|, M, D1, .., DN), where M is the maximum number of blocks
+// of elements of length D1 * .... * DN, across all minibatch entries
+// in the input.  Any minibatch entry with less than M blocks of elements of
+// length D1 * ... * DN will be padded with the corresponding default_value
+// scalar element along the second dimension.
+func ParseExample(scope *Scope, serialized tf.Output, names tf.Output, sparse_keys []tf.Output, dense_keys []tf.Output, dense_defaults []tf.Output, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"sparse_types": sparse_types, "dense_shapes": dense_shapes}
 	opspec := tf.OpSpec{
-		Type: "Erf",
+		Type: "ParseExample",
 		Input: []tf.Input{
-			x,
+			serialized, names, tf.OutputList(sparse_keys), tf.OutputList(dense_keys), tf.OutputList(dense_defaults),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	return sparse_indices, sparse_values, sparse_shapes, dense_values
 }
 
-// Returns element-wise largest integer not greater than x.
-func Floor(scope *Scope, x tf.Output) (y tf.Output) {
+// Compute the pairwise cross product.
+//
+// `a` and `b` must be the same shape; they can either be simple 3-element vectors,
+// or any shape where the innermost dimension is 3. In the latter case, each pair
+// of corresponding 3-element vectors is cross-multiplied independently.
+//
+// Arguments:
+//	a: A tensor containing 3-element vectors.
+//	b: Another tensor, of same type and shape as `a`.
+//
+// Returns Pairwise cross product of the vectors in `a` and `b`.
+func Cross(scope *Scope, a tf.Output, b tf.Output) (product tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Floor",
+		Type: "Cross",
 		Input: []tf.Input{
-			x,
+			a, b,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// OneHotAttr is an optional argument to OneHot.
-type OneHotAttr func(optionalAttr)
+// StatefulStandardNormalV2Attr is an optional argument to StatefulStandardNormalV2.
+type StatefulStandardNormalV2Attr func(optionalAttr)
 
-// OneHotAxis sets the optional axis attribute to value.
+// StatefulStandardNormalV2Dtype sets the optional dtype attribute to value.
 //
-// value: The axis to fill (default: -1, a new inner-most axis).
-// If not specified, defaults to -1
-func OneHotAxis(value int64) OneHotAttr {
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatefulStandardNormalV2Dtype(value tf.DataType) StatefulStandardNormalV2Attr {
 	return func(m optionalAttr) {
-		m["axis"] = value
+		m["dtype"] = value
 	}
 }
 
-// Returns a one-hot tensor.
-//
-// The locations represented by indices in `indices` take value `on_value`,
-// while all other locations take value `off_value`.
-//
-// If the input `indices` is rank `N`, the output will have rank `N+1`,
-// The new axis is created at dimension `axis` (default: the new axis is
-// appended at the end).
+// Outputs random values from a normal distribution.
 //
-// If `indices` is a scalar the output shape will be a vector of length `depth`.
+// The generated values will have mean 0 and standard deviation 1.
 //
-// If `indices` is a vector of length `features`, the output shape will be:
-// ```
-//   features x depth if axis == -1
-//   depth x features if axis == 0
-// ```
+// Arguments:
+//	resource: The handle of the resource variable that stores the state of the RNG.
+//	algorithm: The RNG algorithm.
+//	shape: The shape of the output tensor.
 //
-// If `indices` is a matrix (batch) with shape `[batch, features]`,
-// the output shape will be:
-// ```
-//   batch x features x depth if axis == -1
-//   batch x depth x features if axis == 1
-//   depth x batch x features if axis == 0
+// Returns A tensor of the specified shape filled with random normal values.
+func StatefulStandardNormalV2(scope *Scope, resource tf.Output, algorithm tf.Output, shape tf.Output, optional ...StatefulStandardNormalV2Attr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StatefulStandardNormalV2",
+		Input: []tf.Input{
+			resource, algorithm, shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Locks a mutex resource.  The output is the lock.  So long as the lock tensor
+//
+// is alive, any other request to use `MutexLock` with this mutex will wait.
+//
+// This is particularly useful for creating a critical section when used in
+// conjunction with `MutexLockIdentity`:
+//
+// ```python
+//
+// mutex = mutex_v2(
+//   shared_name=handle_name, container=container, name=name)
+//
+// def execute_in_critical_section(fn, *args, **kwargs):
+//   lock = gen_resource_variable_ops.mutex_lock(mutex)
+//
+//   with ops.control_dependencies([lock]):
+//     r = fn(*args, **kwargs)
+//
+//   with ops.control_dependencies(nest.flatten(r)):
+//     with ops.colocate_with(mutex):
+//       ensure_lock_exists = mutex_lock_identity(lock)
+//
+//     # Make sure that if any element of r is accessed, all of
+//     # them are executed together.
+//     r = nest.map_structure(tf.identity, r)
+//
+//   with ops.control_dependencies([ensure_lock_exists]):
+//     return nest.map_structure(tf.identity, r)
 // ```
 //
+// While `fn` is running in the critical section, no other functions which wish to
+// use this critical section may run.
+//
+// Often the use case is that two executions of the same graph, in parallel,
+// wish to run `fn`; and we wish to ensure that only one of them executes
+// at a time.  This is especially important if `fn` modifies one or more
+// variables at a time.
+//
+// It is also useful if two separate functions must share a resource, but we
+// wish to ensure the usage is exclusive.
+//
+// Arguments:
+//	mutex: The mutex resource to lock.
+//
+// Returns A tensor that keeps a shared pointer to a lock on the mutex;
+// when the Tensor is destroyed, the use count on the shared pointer is decreased
+// by 1.  When it reaches 0, the lock is released.
+func MutexLock(scope *Scope, mutex tf.Output) (mutex_lock tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "MutexLock",
+		Input: []tf.Input{
+			mutex,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Transforms a serialized tensorflow.TensorProto proto into a Tensor.
+//
+// Arguments:
+//	serialized: A scalar string containing a serialized TensorProto proto.
+//	out_type: The type of the serialized tensor.  The provided type must match the
+// type of the serialized tensor and no implicit conversion will take place.
+//
+// Returns A Tensor of type `out_type`.
+func ParseTensor(scope *Scope, serialized tf.Output, out_type tf.DataType) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"out_type": out_type}
+	opspec := tf.OpSpec{
+		Type: "ParseTensor",
+		Input: []tf.Input{
+			serialized,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MaxPoolWithArgmaxAttr is an optional argument to MaxPoolWithArgmax.
+type MaxPoolWithArgmaxAttr func(optionalAttr)
+
+// MaxPoolWithArgmaxTargmax sets the optional Targmax attribute to value.
+// If not specified, defaults to DT_INT64
+func MaxPoolWithArgmaxTargmax(value tf.DataType) MaxPoolWithArgmaxAttr {
+	return func(m optionalAttr) {
+		m["Targmax"] = value
+	}
+}
+
+// Performs max pooling on the input and outputs both max values and indices.
+//
+// The indices in `argmax` are flattened, so that a maximum value at position
+// `[b, y, x, c]` becomes flattened index
+// `((b * height + y) * width + x) * channels + c`.
+//
+// The indices returned are always in `[0, height) x [0, width)` before flattening,
+// even if padding is involved and the mathematically correct answer is outside
+// (either negative or too large).  This is a bug, but fixing it is difficult to do
+// in a safe backwards compatible way, especially due to flattening.
+//
+// Arguments:
+//	input: 4-D with shape `[batch, height, width, channels]`.  Input to pool over.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The max pooled output tensor.4-D.  The flattened indices of the max values chosen for each output.
+func MaxPoolWithArgmax(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolWithArgmaxAttr) (output tf.Output, argmax tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MaxPoolWithArgmax",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Identity transformation that models performance.
+//
+// Identity transformation that models performance.
+//
+// Arguments:
+//	input_dataset: A variant tensor representing the input dataset.
+//
+//
+func ModelDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ModelDataset",
+		Input: []tf.Input{
+			input_dataset,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Fast Fourier transform.
+//
+// Computes the 1-dimensional discrete Fourier transform over the inner-most
+// dimension of `input`.
+//
+// Arguments:
+//	input: A complex tensor.
+//
+// Returns A complex tensor of the same shape as `input`. The inner-most
+//   dimension of `input` is replaced with its 1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.fft
+// @end_compatibility
+func FFT(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "FFT",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MaxPoolAttr is an optional argument to MaxPool.
+type MaxPoolAttr func(optionalAttr)
+
+// MaxPoolDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolDataFormat(value string) MaxPoolAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Performs max pooling on the input.
+//
+// Arguments:
+//	input: 4-D input to pool over.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The max pooled output tensor.
+func MaxPool(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MaxPool",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Multiplies sparse updates into the variable referenced by `resource`.
+//
+// This operation computes
+//
+//     # Scalar indices
+//     ref[indices, ...] *= updates[...]
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] *= updates[i, ...]
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] *= updates[i, ..., j, ...]
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions multiply.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
+//
+// Returns the created operation.
+func ResourceScatterMul(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterMul",
+		Input: []tf.Input{
+			resource, indices, updates,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Subtracts sparse updates from the variable referenced by `resource`.
+//
+// This operation computes
+//
+//     # Scalar indices
+//     ref[indices, ...] -= updates[...]
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] -= updates[i, ...]
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] -= updates[i, ..., j, ...]
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions add.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
+//
+// Returns the created operation.
+func ResourceScatterSub(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterSub",
+		Input: []tf.Input{
+			resource, indices, updates,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Adds sparse updates to the variable referenced by `resource`.
+//
+// This operation computes
+//
+//     # Scalar indices
+//     ref[indices, ...] += updates[...]
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] += updates[i, ...]
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] += updates[i, ..., j, ...]
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions add.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
+//
+// Returns the created operation.
+func ResourceScatterAdd(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterAdd",
+		Input: []tf.Input{
+			resource, indices, updates,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Reads the value of a variable.
+//
+// The tensor returned by this operation is immutable.
+//
+// The value returned by this operation is guaranteed to be influenced by all the
+// writes on which this operation depends directly or indirectly, and to not be
+// influenced by any of the writes which depend directly or indirectly on this
+// operation.
+//
+// Arguments:
+//	resource: handle to the resource in which to store the variable.
+//	dtype: the dtype of the value.
+func ReadVariableOp(scope *Scope, resource tf.Output, dtype tf.DataType) (value tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	opspec := tf.OpSpec{
+		Type: "ReadVariableOp",
+		Input: []tf.Input{
+			resource,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceSparseApplyProximalAdagradAttr is an optional argument to ResourceSparseApplyProximalAdagrad.
+type ResourceSparseApplyProximalAdagradAttr func(optionalAttr)
+
+// ResourceSparseApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
+//
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyProximalAdagradUseLocking(value bool) ResourceSparseApplyProximalAdagradAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Sparse update entries in '*var' and '*accum' according to FOBOS algorithm.
+//
+// That is for rows we have grad for, we update var and accum as follows:
+// accum += grad * grad
+// prox_v = var
+// prox_v -= lr * grad * (1 / sqrt(accum))
+// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//
+// Returns the created operation.
+func ResourceSparseApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalAdagradAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceSparseApplyProximalAdagrad",
+		Input: []tf.Input{
+			var_, accum, lr, l1, l2, grad, indices,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// DecodeJpegAttr is an optional argument to DecodeJpeg.
+type DecodeJpegAttr func(optionalAttr)
+
+// DecodeJpegChannels sets the optional channels attribute to value.
+//
+// value: Number of color channels for the decoded image.
+// If not specified, defaults to 0
+func DecodeJpegChannels(value int64) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["channels"] = value
+	}
+}
+
+// DecodeJpegRatio sets the optional ratio attribute to value.
+//
+// value: Downscaling ratio.
+// If not specified, defaults to 1
+func DecodeJpegRatio(value int64) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["ratio"] = value
+	}
+}
+
+// DecodeJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
+//
+// value: If true use a slower but nicer upscaling of the
+// chroma planes (yuv420/422 only).
+// If not specified, defaults to true
+func DecodeJpegFancyUpscaling(value bool) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["fancy_upscaling"] = value
+	}
+}
+
+// DecodeJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
+//
+// value: If true try to recover an image from truncated input.
+// If not specified, defaults to false
+func DecodeJpegTryRecoverTruncated(value bool) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["try_recover_truncated"] = value
+	}
+}
+
+// DecodeJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
+//
+// value: The minimum required fraction of lines before a truncated
+// input is accepted.
+// If not specified, defaults to 1
+func DecodeJpegAcceptableFraction(value float32) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["acceptable_fraction"] = value
+	}
+}
+
+// DecodeJpegDctMethod sets the optional dct_method attribute to value.
+//
+// value: string specifying a hint about the algorithm used for
+// decompression.  Defaults to "" which maps to a system-specific
+// default.  Currently valid values are ["INTEGER_FAST",
+// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
+// jpeg library changes to a version that does not have that specific
+// option.)
+// If not specified, defaults to ""
+func DecodeJpegDctMethod(value string) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["dct_method"] = value
+	}
+}
+
+// Decode a JPEG-encoded image to a uint8 tensor.
+//
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
+//
+// Accepted values are:
+//
+// *   0: Use the number of channels in the JPEG-encoded image.
+// *   1: output a grayscale image.
+// *   3: output an RGB image.
+//
+// If needed, the JPEG-encoded image is transformed to match the requested number
+// of color channels.
+//
+// The attr `ratio` allows downscaling the image by an integer factor during
+// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
+// downscaling the image later.
+//
+//
+// This op also supports decoding PNGs and non-animated GIFs since the interface is
+// the same, though it is cleaner to use `tf.image.decode_image`.
+//
+// Arguments:
+//	contents: 0-D.  The JPEG-encoded image.
+//
+// Returns 3-D with shape `[height, width, channels]`..
+func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (image tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DecodeJpeg",
+		Input: []tf.Input{
+			contents,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// DepthwiseConv2dNativeBackpropInputAttr is an optional argument to DepthwiseConv2dNativeBackpropInput.
+type DepthwiseConv2dNativeBackpropInputAttr func(optionalAttr)
+
+// DepthwiseConv2dNativeBackpropInputDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
+// If not specified, defaults to "NHWC"
+func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dNativeBackpropInputAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// DepthwiseConv2dNativeBackpropInputDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes the gradients of depthwise convolution with respect to the input.
+//
+// Arguments:
+//	input_sizes: An integer vector representing the shape of `input`, based
+// on `data_format`.  For example, if `data_format` is 'NHWC' then
+//  `input` is a 4-D `[batch, height, width, channels]` tensor.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, depthwise_multiplier]`.
+//	out_backprop: 4-D with shape  based on `data_format`.
+// For example, if `data_format` is 'NHWC' then
+// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution.
+//	padding: The type of padding algorithm to use.
+//
+// Returns 4-D with shape according to `data_format`.  For example, if
+// `data_format` is 'NHWC', output shape is `[batch, in_height,
+// in_width, in_channels]`.  Gradient w.r.t. the input of the
+// convolution.
+func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropInputAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DepthwiseConv2dNativeBackpropInput",
+		Input: []tf.Input{
+			input_sizes, filter, out_backprop,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// EditDistanceAttr is an optional argument to EditDistance.
+type EditDistanceAttr func(optionalAttr)
+
+// EditDistanceNormalize sets the optional normalize attribute to value.
+//
+// value: boolean (if true, edit distances are normalized by length of truth).
+//
+// The output is:
+// If not specified, defaults to true
+func EditDistanceNormalize(value bool) EditDistanceAttr {
+	return func(m optionalAttr) {
+		m["normalize"] = value
+	}
+}
+
+// Computes the (possibly normalized) Levenshtein Edit Distance.
+//
+// The inputs are variable-length sequences provided by SparseTensors
+//   (hypothesis_indices, hypothesis_values, hypothesis_shape)
+// and
+//   (truth_indices, truth_values, truth_shape).
+//
+// The inputs are:
+//
+// Arguments:
+//	hypothesis_indices: The indices of the hypothesis list SparseTensor.
+// This is an N x R int64 matrix.
+//	hypothesis_values: The values of the hypothesis list SparseTensor.
+// This is an N-length vector.
+//	hypothesis_shape: The shape of the hypothesis list SparseTensor.
+// This is an R-length vector.
+//	truth_indices: The indices of the truth list SparseTensor.
+// This is an M x R int64 matrix.
+//	truth_values: The values of the truth list SparseTensor.
+// This is an M-length vector.
+//	truth_shape: truth indices, vector.
+//
+// Returns A dense float tensor with rank R - 1.
+//
+// For the example input:
+//
+//     // hypothesis represents a 2x1 matrix with variable-length values:
+//     //   (0,0) = ["a"]
+//     //   (1,0) = ["b"]
+//     hypothesis_indices = [[0, 0, 0],
+//                           [1, 0, 0]]
+//     hypothesis_values = ["a", "b"]
+//     hypothesis_shape = [2, 1, 1]
+//
+//     // truth represents a 2x2 matrix with variable-length values:
+//     //   (0,0) = []
+//     //   (0,1) = ["a"]
+//     //   (1,0) = ["b", "c"]
+//     //   (1,1) = ["a"]
+//     truth_indices = [[0, 1, 0],
+//                      [1, 0, 0],
+//                      [1, 0, 1],
+//                      [1, 1, 0]]
+//     truth_values = ["a", "b", "c", "a"]
+//     truth_shape = [2, 2, 2]
+//     normalize = true
+//
+// The output will be:
+//
+//     // output is a 2x2 matrix with edit distances normalized by truth lengths.
+//     output = [[inf, 1.0],  // (0,0): no truth, (0,1): no hypothesis
+//               [0.5, 1.0]]  // (1,0): addition, (1,1): no hypothesis
+func EditDistance(scope *Scope, hypothesis_indices tf.Output, hypothesis_values tf.Output, hypothesis_shape tf.Output, truth_indices tf.Output, truth_values tf.Output, truth_shape tf.Output, optional ...EditDistanceAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "EditDistance",
+		Input: []tf.Input{
+			hypothesis_indices, hypothesis_values, hypothesis_shape, truth_indices, truth_values, truth_shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns 0 if x == 0, and x * log(y) otherwise, elementwise.
+func Xlogy(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Xlogy",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Stops gradient computation.
+//
+// When executed in a graph, this op outputs its input tensor as-is.
+//
+// When building ops to compute gradients, this op prevents the contribution of
+// its inputs to be taken into account.  Normally, the gradient generator adds ops
+// to a graph to compute the derivatives of a specified 'loss' by recursively
+// finding out inputs that contributed to its computation.  If you insert this op
+// in the graph it inputs are masked from the gradient generator.  They are not
+// taken into account for computing gradients.
+//
+// This is useful any time you want to compute a value with TensorFlow but need
+// to pretend that the value was a constant. Some examples include:
+//
+// *  The *EM* algorithm where the *M-step* should not involve backpropagation
+//    through the output of the *E-step*.
+// *  Contrastive divergence training of Boltzmann machines where, when
+//    differentiating the energy function, the training must not backpropagate
+//    through the graph that generated the samples from the model.
+// *  Adversarial training, where no backprop should happen through the adversarial
+//    example generation process.
+func StopGradient(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "StopGradient",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Eagerly executes a python function to compute func(input)->output. The
+//
+// semantics of the input, output, and attributes are the same as those for
+// PyFunc.
+func EagerPyFunc(scope *Scope, input []tf.Output, token string, Tout []tf.DataType) (output []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"token": token, "Tout": Tout}
+	opspec := tf.OpSpec{
+		Type: "EagerPyFunc",
+		Input: []tf.Input{
+			tf.OutputList(input),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("EagerPyFunc", err)
+		return
+	}
+	return output
+}
+
+// Concats all tensors in the list along the 0th dimension.
+//
+// Requires that all tensors have the same shape except the first dimension.
+//
+// input_handle: The input list.
+// element_shape: The shape of the uninitialized elements in the list. If the first
+//   dimension is not -1, it is assumed that all list elements have the same
+//   leading dim.
+// leading_dims: The list of leading dims of uninitialized list elements. Used if
+//   the leading dim of input_handle.element_shape or the element_shape input arg
+//   is not already set.
+// tensor: The concated result.
+// lengths: Output tensor containing sizes of the 0th dimension of tensors in the list, used for computing the gradient.
+//
+func TensorListConcatV2(scope *Scope, input_handle tf.Output, element_shape tf.Output, leading_dims tf.Output, element_dtype tf.DataType) (tensor tf.Output, lengths tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	opspec := tf.OpSpec{
+		Type: "TensorListConcatV2",
+		Input: []tf.Input{
+			input_handle, element_shape, leading_dims,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// MatrixTriangularSolveAttr is an optional argument to MatrixTriangularSolve.
+type MatrixTriangularSolveAttr func(optionalAttr)
+
+// MatrixTriangularSolveLower sets the optional lower attribute to value.
+//
+// value: Boolean indicating whether the innermost matrices in `matrix` are
+// lower or upper triangular.
+// If not specified, defaults to true
+func MatrixTriangularSolveLower(value bool) MatrixTriangularSolveAttr {
+	return func(m optionalAttr) {
+		m["lower"] = value
+	}
+}
+
+// MatrixTriangularSolveAdjoint sets the optional adjoint attribute to value.
+//
+// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
+//          adjoint.
+//
+// @compatibility(numpy)
+// Equivalent to scipy.linalg.solve_triangular
+// @end_compatibility
+// If not specified, defaults to false
+func MatrixTriangularSolveAdjoint(value bool) MatrixTriangularSolveAttr {
+	return func(m optionalAttr) {
+		m["adjoint"] = value
+	}
+}
+
+// Solves systems of linear equations with upper or lower triangular matrices by
+//
+// backsubstitution.
+//
+// `matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form
+// square matrices. If `lower` is `True` then the strictly upper triangular part
+// of each inner-most matrix is assumed to be zero and not accessed.
+// If `lower` is False then the strictly lower triangular part of each inner-most
+// matrix is assumed to be zero and not accessed.
+// `rhs` is a tensor of shape `[..., M, K]`.
+//
+// The output is a tensor of shape `[..., M, K]`. If `adjoint` is
+// `True` then the innermost matrices in `output` satisfy matrix equations
+// `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
+// If `adjoint` is `False` then the strictly then the  innermost matrices in
+// `output` satisfy matrix equations
+// `adjoint(matrix[..., i, k]) * output[..., k, j] = rhs[..., i, j]`.
+//
+// Arguments:
+//	matrix: Shape is `[..., M, M]`.
+//	rhs: Shape is `[..., M, K]`.
+//
+// Returns Shape is `[..., M, K]`.
+func MatrixTriangularSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixTriangularSolveAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MatrixTriangularSolve",
+		Input: []tf.Input{
+			matrix, rhs,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Saves tensors in V2 checkpoint format.
+//
+// By default, saves the named tensors in full.  If the caller wishes to save
+// specific slices of full tensors, "shape_and_slices" should be non-empty strings
+// and correspondingly well-formed.
+//
+// Arguments:
+//	prefix: Must have a single element. The prefix of the V2 checkpoint to which we
+// write the tensors.
+//	tensor_names: shape {N}. The names of the tensors to be saved.
+//	shape_and_slices: shape {N}.  The slice specs of the tensors to be saved.
+// Empty strings indicate that they are non-partitioned tensors.
+//	tensors: `N` tensors to save.
+//
+// Returns the created operation.
+func SaveV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, tensors []tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SaveV2",
+		Input: []tf.Input{
+			prefix, tensor_names, shape_and_slices, tf.OutputList(tensors),
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Concatenates quantized tensors along one dimension.
+//
+// Arguments:
+//	concat_dim: 0-D.  The dimension along which to concatenate.  Must be in the
+// range [0, rank(values)).
+//	values: The `N` Tensors to concatenate. Their ranks and types must match,
+// and their sizes must match in all dimensions except `concat_dim`.
+//	input_mins: The minimum scalar values for each of the input tensors.
+//	input_maxes: The maximum scalar values for each of the input tensors.
+//
+// Returns A `Tensor` with the concatenation of values stacked along the
+// `concat_dim` dimension.  This tensor's shape matches that of `values` except
+// in `concat_dim` where it has the sum of the sizes.The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
+func QuantizedConcat(scope *Scope, concat_dim tf.Output, values []tf.Output, input_mins []tf.Output, input_maxes []tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "QuantizedConcat",
+		Input: []tf.Input{
+			concat_dim, tf.OutputList(values), tf.OutputList(input_mins), tf.OutputList(input_maxes),
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Slice a `SparseTensor` based on the `start` and `size`.
+//
+// For example, if the input is
+//
+//     input_tensor = shape = [2, 7]
+//     [    a   d e  ]
+//     [b c          ]
+//
+// Graphically the output tensors are:
+//
+//     sparse_slice([0, 0], [2, 4]) = shape = [2, 4]
+//     [    a  ]
+//     [b c    ]
+//
+//     sparse_slice([0, 4], [2, 3]) = shape = [2, 3]
+//     [ d e  ]
+//     [      ]
+//
+// Arguments:
+//	indices: 2-D tensor represents the indices of the sparse tensor.
+//	values: 1-D tensor represents the values of the sparse tensor.
+//	shape: 1-D. tensor represents the shape of the sparse tensor.
+//	start: 1-D. tensor represents the start of the slice.
+//	size: 1-D. tensor represents the size of the slice.
+// output indices: A list of 1-D tensors represents the indices of the output
+// sparse tensors.
+//
+// Returns A list of 1-D tensors represents the values of the output sparse
+// tensors.A list of 1-D tensors represents the shape of the output sparse
+// tensors.
+func SparseSlice(scope *Scope, indices tf.Output, values tf.Output, shape tf.Output, start tf.Output, size tf.Output) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseSlice",
+		Input: []tf.Input{
+			indices, values, shape, start, size,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Runs multiple additive regression ensemble predictors on input instances and
+//
+// computes the logits. It is designed to be used during prediction.
+// It traverses all the trees and calculates the final score for each instance.
+//
+// Arguments:
+//
+//	bucketized_features: A list of rank 1 Tensors containing bucket id for each
+// feature.
+//	logits_dimension: scalar, dimension of the logits, to be used for partial logits
+// shape.
+//
+// Returns Output rank 2 Tensor containing logits for each example.
+func BoostedTreesPredict(scope *Scope, tree_ensemble_handle tf.Output, bucketized_features []tf.Output, logits_dimension int64) (logits tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"logits_dimension": logits_dimension}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesPredict",
+		Input: []tf.Input{
+			tree_ensemble_handle, tf.OutputList(bucketized_features),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Pads a tensor with zeros.
+//
+// This operation pads a `input` with zeros according to the `paddings` you
+// specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the
+// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
+// how many zeros to add before the contents of `input` in that dimension, and
+// `paddings[D, 1]` indicates how many zeros to add after the contents of `input`
+// in that dimension.
+//
+// The padded size of each dimension D of the output is:
+//
+// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
+//
+// For example:
+//
+// ```
+// # 't' is [[1, 1], [2, 2]]
+// # 'paddings' is [[1, 1], [2, 2]]
+// # rank of 't' is 2
+// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
+//                       [0, 0, 1, 1, 0, 0]
+//                       [0, 0, 2, 2, 0, 0]
+//                       [0, 0, 0, 0, 0, 0]]
+// ```
+//
+func Pad(scope *Scope, input tf.Output, paddings tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Pad",
+		Input: []tf.Input{
+			input, paddings,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Checks whether a resource handle-based variable has been initialized.
+//
+// Arguments:
+//	resource: the input resource handle.
+//
+// Returns a scalar boolean which is true if the variable has been
+// initialized.
+func VarIsInitializedOp(scope *Scope, resource tf.Output) (is_initialized tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "VarIsInitializedOp",
+		Input: []tf.Input{
+			resource,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the min of x and y (i.e. x < y ? x : y) element-wise.
+//
+// *NOTE*: `Minimum` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Minimum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Minimum",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes scaled exponential linear: `scale * alpha * (exp(features) - 1)`
+//
+// if < 0, `scale * features` otherwise.
+//
+// To be used together with
+// `initializer = tf.variance_scaling_initializer(factor=1.0, mode='FAN_IN')`.
+// For correct dropout, use `tf.contrib.nn.alpha_dropout`.
+//
+// See [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
+func Selu(scope *Scope, features tf.Output) (activations tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Selu",
+		Input: []tf.Input{
+			features,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// SetSizeAttr is an optional argument to SetSize.
+type SetSizeAttr func(optionalAttr)
+
+// SetSizeValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func SetSizeValidateIndices(value bool) SetSizeAttr {
+	return func(m optionalAttr) {
+		m["validate_indices"] = value
+	}
+}
+
+// Number of unique elements along last dimension of input `set`.
+//
+// Input `set` is a `SparseTensor` represented by `set_indices`, `set_values`,
+// and `set_shape`. The last dimension contains values in a set, duplicates are
+// allowed but ignored.
+//
+// If `validate_indices` is `True`, this op validates the order and range of `set`
+// indices.
+//
+// Arguments:
+//	set_indices: 2D `Tensor`, indices of a `SparseTensor`.
+//	set_values: 1D `Tensor`, values of a `SparseTensor`.
+//	set_shape: 1D `Tensor`, shape of a `SparseTensor`.
+//
+// Returns For `set` ranked `n`, this is a `Tensor` with rank `n-1`, and the same 1st
+// `n-1` dimensions as `set`. Each value is the number of unique elements in
+// the corresponding `[0...n-1]` dimension of `set`.
+func SetSize(scope *Scope, set_indices tf.Output, set_values tf.Output, set_shape tf.Output, optional ...SetSizeAttr) (size tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SetSize",
+		Input: []tf.Input{
+			set_indices, set_values, set_shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Adds sparse `updates` to an existing tensor according to `indices`.
+//
+// This operation creates a new tensor by adding sparse `updates` to the passed
+// in `tensor`.
+// This operation is very similar to `tf.scatter_nd_add`, except that the updates
+// are added onto an existing tensor (as opposed to a variable). If the memory
+// for the existing tensor cannot be re-used, a copy is made and updated.
+//
+// `indices` is an integer tensor containing indices into a new tensor of shape
+// `shape`.  The last dimension of `indices` can be at most the rank of `shape`:
+//
+//     indices.shape[-1] <= shape.rank
+//
+// The last dimension of `indices` corresponds to indices into elements
+// (if `indices.shape[-1] = shape.rank`) or slices
+// (if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of
+// `shape`.  `updates` is a tensor with shape
+//
+//     indices.shape[:-1] + shape[indices.shape[-1]:]
+//
+// The simplest form of tensor_scatter_add is to add individual elements to a
+// tensor by index. For example, say we want to add 4 elements in a rank-1
+// tensor with 8 elements.
+//
+// In Python, this scatter add operation would look like this:
+//
+// ```python
+//     indices = tf.constant([[4], [3], [1], [7]])
+//     updates = tf.constant([9, 10, 11, 12])
+//     tensor = tf.ones([8], dtype=tf.int32)
+//     updated = tf.tensor_scatter_add(tensor, indices, updates)
+//     with tf.Session() as sess:
+//       print(sess.run(scatter))
+// ```
+//
+// The resulting tensor would look like this:
+//
+//     [1, 12, 1, 11, 10, 1, 1, 13]
+//
+// We can also, insert entire slices of a higher rank tensor all at once. For
+// example, if we wanted to insert two slices in the first dimension of a
+// rank-3 tensor with two matrices of new values.
+//
+// In Python, this scatter add operation would look like this:
+//
+// ```python
+//     indices = tf.constant([[0], [2]])
+//     updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],
+//                             [7, 7, 7, 7], [8, 8, 8, 8]],
+//                            [[5, 5, 5, 5], [6, 6, 6, 6],
+//                             [7, 7, 7, 7], [8, 8, 8, 8]]])
+//     tensor = tf.ones([4, 4, 4])
+//     updated = tf.tensor_scatter_add(tensor, indices, updates)
+//     with tf.Session() as sess:
+//       print(sess.run(scatter))
+// ```
+//
+// The resulting tensor would look like this:
+//
+//     [[[6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8], [9, 9, 9, 9]],
+//      [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]],
+//      [[6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8], [9, 9, 9, 9]],
+//      [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]]
+//
+// Note that on CPU, if an out of bound index is found, an error is returned.
+// On GPU, if an out of bound index is found, the index is ignored.
+//
+// Arguments:
+//	tensor: Tensor to copy/update.
+//	indices: Index tensor.
+//	updates: Updates to scatter into output.
+//
+// Returns A new tensor copied from tensor and updates added according to the indices.
+func TensorScatterAdd(scope *Scope, tensor tf.Output, indices tf.Output, updates tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorScatterAdd",
+		Input: []tf.Input{
+			tensor, indices, updates,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the sign and the log of the absolute value of the determinant of
+//
+// one or more square matrices.
+//
+// The input is a tensor of shape `[N, M, M]` whose inner-most 2 dimensions
+// form square matrices. The outputs are two tensors containing the signs and
+// absolute values of the log determinants for all N input submatrices
+// `[..., :, :]` such that the determinant = sign*exp(log_abs_determinant).
+// The log_abs_determinant is computed as det(P)*sum(log(diag(LU))) where LU
+// is the LU decomposition of the input and P is the corresponding
+// permutation matrix.
+//
+// Arguments:
+//	input: Shape is `[N, M, M]`.
+//
+// Returns The signs of the log determinants of the inputs. Shape is `[N]`.The logs of the absolute values of the determinants
+// of the N input matrices.  Shape is `[N]`.
+func LogMatrixDeterminant(scope *Scope, input tf.Output) (sign tf.Output, log_abs_determinant tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LogMatrixDeterminant",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Says whether the targets are in the top `K` predictions.
+//
+// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
+// prediction for the target class is among the top `k` predictions among
+// all predictions for example `i`. Note that the behavior of `InTopK` differs
+// from the `TopK` op in its handling of ties; if multiple classes have the
+// same prediction value and straddle the top-`k` boundary, all of those
+// classes are considered to be in the top `k`.
+//
+// More formally, let
+//
+//   \\(predictions_i\\) be the predictions for all classes for example `i`,
+//   \\(targets_i\\) be the target class for example `i`,
+//   \\(out_i\\) be the output for example `i`,
+//
+// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
+//
+// Arguments:
+//	predictions: A `batch_size` x `classes` tensor.
+//	targets: A `batch_size` vector of class ids.
+//	k: Number of top elements to look at for computing precision.
+//
+// Returns Computed precision at `k` as a `bool Tensor`.
+func InTopKV2(scope *Scope, predictions tf.Output, targets tf.Output, k tf.Output) (precision tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "InTopKV2",
+		Input: []tf.Input{
+			predictions, targets, k,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Check if the input matches the regex pattern.
+//
+// The input is a string tensor of any shape. The pattern is a scalar
+// string tensor which is applied to every element of the input tensor.
+// The boolean values (True or False) of the output tensor indicate
+// if the input matches the regex pattern provided.
+//
+// The pattern follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
+//
+// Arguments:
+//	input: A string tensor of the text to be processed.
+//	pattern: A scalar string tensor containing the regular expression to match the input.
+//
+// Returns A bool tensor with the same shape as `input`.
+func RegexFullMatch(scope *Scope, input tf.Output, pattern tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "RegexFullMatch",
+		Input: []tf.Input{
+			input, pattern,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Converts a `RaggedTensor` into a `SparseTensor` with the same values.
+//
+// input=ragged.from_nested_row_splits(rt_dense_values, rt_nested_splits)
+// output=SparseTensor(indices=sparse_indices, values=sparse_values,
+//                     dense_shape=sparse_dense_shape)
+//
+// Arguments:
+//	rt_nested_splits: The `row_splits` for the `RaggedTensor`.
+//	rt_dense_values: The `flat_values` for the `RaggedTensor`.
+//
+// Returns The indices for the `SparseTensor`.The values of the `SparseTensor`.`sparse_dense_shape` is a tight bounding box of the input `RaggedTensor`.
+func RaggedTensorToSparse(scope *Scope, rt_nested_splits []tf.Output, rt_dense_values tf.Output) (sparse_indices tf.Output, sparse_values tf.Output, sparse_dense_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "RaggedTensorToSparse",
+		Input: []tf.Input{
+			tf.OutputList(rt_nested_splits), rt_dense_values,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// FusedBatchNormGradV2Attr is an optional argument to FusedBatchNormGradV2.
+type FusedBatchNormGradV2Attr func(optionalAttr)
+
+// FusedBatchNormGradV2Epsilon sets the optional epsilon attribute to value.
+//
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormGradV2Epsilon(value float32) FusedBatchNormGradV2Attr {
+	return func(m optionalAttr) {
+		m["epsilon"] = value
+	}
+}
+
+// FusedBatchNormGradV2DataFormat sets the optional data_format attribute to value.
+//
+// value: The data format for y_backprop, x, x_backprop.
+// Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormGradV2DataFormat(value string) FusedBatchNormGradV2Attr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// FusedBatchNormGradV2IsTraining sets the optional is_training attribute to value.
+//
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
+// If not specified, defaults to true
+func FusedBatchNormGradV2IsTraining(value bool) FusedBatchNormGradV2Attr {
+	return func(m optionalAttr) {
+		m["is_training"] = value
+	}
+}
+
+// Gradient for batch normalization.
+//
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+//
+// Arguments:
+//	y_backprop: A 4D Tensor for the gradient with respect to y.
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
+// mean to be reused in gradient computation. When is_training is
+// False, a 1D Tensor for the population mean to be reused in both
+// 1st and 2nd order gradient computation.
+//	reserve_space_2: When is_training is True, a 1D Tensor for the computed batch
+// variance (inverted variance in the cuDNN case) to be reused in
+// gradient computation. When is_training is False, a 1D Tensor
+// for the population variance to be reused in both 1st and 2nd
+// order gradient computation.
+//
+// Returns A 4D Tensor for the gradient with respect to x.A 1D Tensor for the gradient with respect to scale.A 1D Tensor for the gradient with respect to offset.Unused placeholder to match the mean input in FusedBatchNorm.Unused placeholder to match the variance input
+// in FusedBatchNorm.
+func FusedBatchNormGradV2(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradV2Attr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "FusedBatchNormGradV2",
+		Input: []tf.Input{
+			y_backprop, x, scale, reserve_space_1, reserve_space_2,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+}
+
+// Component-wise multiplies a SparseTensor by a dense Tensor.
+//
+// The output locations corresponding to the implicitly zero elements in the sparse
+// tensor will be zero (i.e., will not take up storage space), regardless of the
+// contents of the dense tensor (even if it's +/-INF and that INF*0 == NaN).
+//
+// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
+// the other direction.
+//
+// Arguments:
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
+//
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseMul(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseDenseCwiseMul",
+		Input: []tf.Input{
+			sp_indices, sp_values, sp_shape, dense,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MaxPool3DGradAttr is an optional argument to MaxPool3DGrad.
+type MaxPool3DGradAttr func(optionalAttr)
+
+// MaxPool3DGradDataFormat sets the optional data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func MaxPool3DGradDataFormat(value string) MaxPool3DGradAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes gradients of max pooling function.
+//
+// Arguments:
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func MaxPool3DGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MaxPool3DGrad",
+		Input: []tf.Input{
+			orig_input, orig_output, grad,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the name of the device on which `resource` has been placed.
+func ExperimentalIteratorGetDevice(scope *Scope, resource tf.Output) (device tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalIteratorGetDevice",
+		Input: []tf.Input{
+			resource,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// SparseReduceSumAttr is an optional argument to SparseReduceSum.
+type SparseReduceSumAttr func(optionalAttr)
+
+// SparseReduceSumKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceSumKeepDims(value bool) SparseReduceSumAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the sum of elements across dimensions of a SparseTensor.
+//
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_sum()`.  In particular, this Op also returns a dense `Tensor`
+// instead of a sparse one.
+//
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
+//
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
+//
+// Arguments:
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+//
+// Returns `R-K`-D.  The reduced Tensor.
+func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseReduceSum",
+		Input: []tf.Input{
+			input_indices, input_values, input_shape, reduction_axes,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Records the latency of producing `input_dataset` elements in a StatsAggregator.
+func ExperimentalLatencyStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalLatencyStatsDataset",
+		Input: []tf.Input{
+			input_dataset, tag,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`.
+//
+// This Op does not require `a_indices` be sorted in standard lexicographic order.
+//
+// Arguments:
+//	a_indices: 2-D.  The `indices` of the `SparseTensor`, with shape `[nnz, ndims]`.
+//	a_values: 1-D.  The `values` of the `SparseTensor`, with shape `[nnz]`.
+//	a_shape: 1-D.  The `shape` of the `SparseTensor`, with shape `[ndims]`.
+//	b: `ndims`-D Tensor.  With shape `a_shape`.
+func SparseTensorDenseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseTensorDenseAdd",
+		Input: []tf.Input{
+			a_indices, a_values, a_shape, b,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// QuantizedReluAttr is an optional argument to QuantizedRelu.
+type QuantizedReluAttr func(optionalAttr)
+
+// QuantizedReluOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedReluOutType(value tf.DataType) QuantizedReluAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Computes Quantized Rectified Linear: `max(features, 0)`
+//
+// Arguments:
+//
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
+//
+// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
+func QuantizedRelu(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QuantizedRelu",
+		Input: []tf.Input{
+			features, min_features, max_features,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Reorders a SparseTensor into the canonical, row-major ordering.
+//
+// Note that by convention, all sparse ops preserve the canonical ordering along
+// increasing dimension number. The only time ordering can be violated is during
+// manual manipulation of the indices and values vectors to add entries.
+//
+// Reordering does not affect the shape of the SparseTensor.
+//
+// If the tensor has rank `R` and `N` non-empty values, `input_indices` has
+// shape `[N, R]`, input_values has length `N`, and input_shape has length `R`.
+//
+// Arguments:
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//
+// Returns 2-D.  `N x R` matrix with the same indices as input_indices, but
+// in canonical row-major ordering.1-D.  `N` non-empty values corresponding to `output_indices`.
+func SparseReorder(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseReorder",
+		Input: []tf.Input{
+			input_indices, input_values, input_shape,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Split a `SparseTensor` into `num_split` tensors along one dimension.
+//
+// If the `shape[split_dim]` is not an integer multiple of `num_split`. Slices
+// `[0 : shape[split_dim] % num_split]` gets one extra dimension.
+// For example, if `split_dim = 1` and `num_split = 2` and the input is
+//
+//     input_tensor = shape = [2, 7]
+//     [    a   d e  ]
+//     [b c          ]
+//
+// Graphically the output tensors are:
+//
+//     output_tensor[0] = shape = [2, 4]
+//     [    a  ]
+//     [b c    ]
+//
+//     output_tensor[1] = shape = [2, 3]
+//     [ d e  ]
+//     [      ]
+//
+// Arguments:
+//	split_dim: 0-D.  The dimension along which to split.  Must be in the range
+// `[0, rank(shape))`.
+//	indices: 2-D tensor represents the indices of the sparse tensor.
+//	values: 1-D tensor represents the values of the sparse tensor.
+//	shape: 1-D. tensor represents the shape of the sparse tensor.
+// output indices: A list of 1-D tensors represents the indices of the output
+// sparse tensors.
+//	num_split: The number of ways to split.
+//
+// Returns A list of 1-D tensors represents the values of the output sparse
+// tensors.A list of 1-D tensors represents the shape of the output sparse
+// tensors.
+func SparseSplit(scope *Scope, split_dim tf.Output, indices tf.Output, values tf.Output, shape tf.Output, num_split int64) (output_indices []tf.Output, output_values []tf.Output, output_shape []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_split": num_split}
+	opspec := tf.OpSpec{
+		Type: "SparseSplit",
+		Input: []tf.Input{
+			split_dim, indices, values, shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output_indices, idx, err = makeOutputList(op, idx, "output_indices"); err != nil {
+		scope.UpdateErr("SparseSplit", err)
+		return
+	}
+	if output_values, idx, err = makeOutputList(op, idx, "output_values"); err != nil {
+		scope.UpdateErr("SparseSplit", err)
+		return
+	}
+	if output_shape, idx, err = makeOutputList(op, idx, "output_shape"); err != nil {
+		scope.UpdateErr("SparseSplit", err)
+		return
+	}
+	return output_indices, output_values, output_shape
+}
+
+// Applies sparse addition to `input` using individual values or slices
+//
+// from `updates` according to indices `indices`.  The updates are non-aliasing:
+// `input` is only modified in-place if no other operations will use it.
+// Otherwise, a copy of `input` is made.  This operation has a gradient with
+// respect to both `input` and `updates`.
+//
+// `input` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+//
+// `indices` must be integer tensor, containing indices into `input`.
+// It must be shape \\([d_0, ..., d_{Q-2}, K]\\) where `0 < K <= P`.
+//
+// The innermost dimension of `indices` (with length `K`) corresponds to
+// indices into elements (if `K = P`) or `(P-K)`-dimensional slices
+// (if `K < P`) along the `K`th dimension of `input`.
+//
+// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
+//
+// $$[d_0, ..., d_{Q-2}, input.shape[K], ..., input.shape[P-1]].$$
+//
+// For example, say we want to add 4 scattered elements to a rank-1 tensor to 8
+// elements. In Python, that addition would look like this:
+//
+//     input = tf.constant([1, 2, 3, 4, 5, 6, 7, 8])
+//     indices = tf.constant([[4], [3], [1], [7]])
+//     updates = tf.constant([9, 10, 11, 12])
+//     output = tf.scatter_nd_non_aliasing_add(input, indices, updates)
+//     with tf.Session() as sess:
+//       print(sess.run(output))
+//
+// The resulting value `output` would look like this:
+//
+//     [1, 13, 3, 14, 14, 6, 7, 20]
+//
+// See `tf.scatter_nd` for more details about how to make updates to slices.
+//
+// Arguments:
+//	input: A Tensor.
+//	indices: A Tensor. Must be one of the following types: `int32`, `int64`.
+// A tensor of indices into `input`.
+//	updates: A Tensor. Must have the same type as ref. A tensor of updated values
+// to add to `input`.
+//
+// Returns A `Tensor` with the same shape as `input`, containing values of `input`
+// updated with `updates`.
+func ScatterNdNonAliasingAdd(scope *Scope, input tf.Output, indices tf.Output, updates tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ScatterNdNonAliasingAdd",
+		Input: []tf.Input{
+			input, indices, updates,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a MultiDeviceIterator resource.
+//
+// Arguments:
+//	devices: A list of devices the iterator works across.
+//	shared_name: If non-empty, this resource will be shared under the given name
+// across multiple sessions.
+//	container: If non-empty, this resource is placed in the given container.
+// Otherwise, a default container is used.
+//	output_types: The type list for the return values.
+//	output_shapes: The list of shapes being produced.
+//
+// Returns Handle to the resource created.
+func MultiDeviceIterator(scope *Scope, devices []string, shared_name string, container string, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"devices": devices, "shared_name": shared_name, "container": container, "output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "MultiDeviceIterator",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// FractionalMaxPoolAttr is an optional argument to FractionalMaxPool.
+type FractionalMaxPoolAttr func(optionalAttr)
+
+// FractionalMaxPoolPseudoRandom sets the optional pseudo_random attribute to value.
+//
+// value: When set to True, generates the pooling sequence in a
+// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
+// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
+// difference between pseudorandom and random.
+// If not specified, defaults to false
+func FractionalMaxPoolPseudoRandom(value bool) FractionalMaxPoolAttr {
+	return func(m optionalAttr) {
+		m["pseudo_random"] = value
+	}
+}
+
+// FractionalMaxPoolOverlapping sets the optional overlapping attribute to value.
+//
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
+//
+// `index  0  1  2  3  4`
+//
+// `value  20 5  16 3  7`
+//
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [20, 16] for fractional max pooling.
+// If not specified, defaults to false
+func FractionalMaxPoolOverlapping(value bool) FractionalMaxPoolAttr {
+	return func(m optionalAttr) {
+		m["overlapping"] = value
+	}
+}
+
+// FractionalMaxPoolDeterministic sets the optional deterministic attribute to value.
+//
+// value: When set to True, a fixed pooling region will be used when
+// iterating over a FractionalMaxPool node in the computation graph. Mainly used
+// in unit test to make FractionalMaxPool deterministic.
+// If not specified, defaults to false
+func FractionalMaxPoolDeterministic(value bool) FractionalMaxPoolAttr {
+	return func(m optionalAttr) {
+		m["deterministic"] = value
+	}
+}
+
+// FractionalMaxPoolSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func FractionalMaxPoolSeed(value int64) FractionalMaxPoolAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// FractionalMaxPoolSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func FractionalMaxPoolSeed2(value int64) FractionalMaxPoolAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Performs fractional max pooling on the input.
+//
+// Fractional max pooling is slightly different than regular max pooling.  In
+// regular max pooling, you downsize an input set by taking the maximum value of
+// smaller N x N subsections of the set (often 2x2), and try to reduce the set by
+// a factor of N, where N is an integer.  Fractional max pooling, as you might
+// expect from the word "fractional", means that the overall reduction ratio N
+// does not have to be an integer.
+//
+// The sizes of the pooling regions are generated randomly but are fairly uniform.
+// For example, let's look at the height dimension, and the constraints on the
+// list of rows that will be pool boundaries.
+//
+// First we define the following:
+//
+// 1.  input_row_length : the number of rows from the input set
+// 2.  output_row_length : which will be smaller than the input
+// 3.  alpha = input_row_length / output_row_length : our reduction ratio
+// 4.  K = floor(alpha)
+// 5.  row_pooling_sequence : this is the result list of pool boundary rows
+//
+// Then, row_pooling_sequence should satisfy:
+//
+// 1.  a[0] = 0 : the first value of the sequence is 0
+// 2.  a[end] = input_row_length : the last value of the sequence is the size
+// 3.  K <= (a[i+1] - a[i]) <= K+1 : all intervals are K or K+1 size
+// 4.  length(row_pooling_sequence) = output_row_length+1
+//
+// For more details on fractional max pooling, see this paper:
+// [Benjamin Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071)
+//
+// Arguments:
+//	value: 4-D with shape `[batch, height, width, channels]`.
+//	pooling_ratio: Pooling ratio for each dimension of `value`, currently only
+// supports row and col dimension and should be >= 1.0. For example, a valid
+// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
+// must be 1.0 because we don't allow pooling on batch and channels
+// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
+// respectively.
+//
+// Returns output tensor after fractional max pooling.row pooling sequence, needed to calculate gradient.column pooling sequence, needed to calculate gradient.
+func FractionalMaxPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalMaxPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"pooling_ratio": pooling_ratio}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "FractionalMaxPool",
+		Input: []tf.Input{
+			value,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Generates sparse cross from a list of sparse and dense tensors.
+//
+// The op takes two lists, one of 2D `SparseTensor` and one of 2D `Tensor`, each
+// representing features of one feature column. It outputs a 2D `SparseTensor` with
+// the batchwise crosses of these features.
+//
+// For example, if the inputs are
+//
+//     inputs[0]: SparseTensor with shape = [2, 2]
+//     [0, 0]: "a"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
+//
+//     inputs[1]: SparseTensor with shape = [2, 1]
+//     [0, 0]: "d"
+//     [1, 0]: "e"
+//
+//     inputs[2]: Tensor [["f"], ["g"]]
+//
+// then the output will be
+//
+//     shape = [2, 2]
+//     [0, 0]: "a_X_d_X_f"
+//     [1, 0]: "b_X_e_X_g"
+//     [1, 1]: "c_X_e_X_g"
+//
+// if hashed_output=true then the output will be
+//
+//     shape = [2, 2]
+//     [0, 0]: FingerprintCat64(
+//                 Fingerprint64("f"), FingerprintCat64(
+//                     Fingerprint64("d"), Fingerprint64("a")))
+//     [1, 0]: FingerprintCat64(
+//                 Fingerprint64("g"), FingerprintCat64(
+//                     Fingerprint64("e"), Fingerprint64("b")))
+//     [1, 1]: FingerprintCat64(
+//                 Fingerprint64("g"), FingerprintCat64(
+//                     Fingerprint64("e"), Fingerprint64("c")))
+//
+// Arguments:
+//	indices: 2-D.  Indices of each input `SparseTensor`.
+//	values: 1-D.   values of each `SparseTensor`.
+//	shapes: 1-D.   Shapes of each `SparseTensor`.
+//	dense_inputs: 2-D.    Columns represented by dense `Tensor`.
+//	hashed_output: If true, returns the hash of the cross instead of the string.
+// This will allow us avoiding string manipulations.
+//	num_buckets: It is used if hashed_output is true.
+// output = hashed_value%num_buckets if num_buckets > 0 else hashed_value.
+//	hash_key: Specify the hash_key that will be used by the `FingerprintCat64`
+// function to combine the crosses fingerprints.
+//
+//
+//
+// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated or hashed
+// `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
+func SparseCross(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, dense_inputs []tf.Output, hashed_output bool, num_buckets int64, hash_key int64, out_type tf.DataType, internal_type tf.DataType) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"hashed_output": hashed_output, "num_buckets": num_buckets, "hash_key": hash_key, "out_type": out_type, "internal_type": internal_type}
+	opspec := tf.OpSpec{
+		Type: "SparseCross",
+		Input: []tf.Input{
+			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes), tf.OutputList(dense_inputs),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Inverse real-valued fast Fourier transform.
+//
+// Computes the inverse 1-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most dimension of `input`.
+//
+// The inner-most dimension of `input` is assumed to be the result of `RFFT`: the
+// `fft_length / 2 + 1` unique components of the DFT of a real-valued signal. If
+// `fft_length` is not provided, it is computed from the size of the inner-most
+// dimension of `input` (`fft_length = 2 * (inner - 1)`). If the FFT length used to
+// compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
+//
+// Along the axis `IRFFT` is computed on, if `fft_length / 2 + 1` is smaller
+// than the corresponding dimension of `input`, the dimension is cropped. If it is
+// larger, the dimension is padded with zeros.
+//
+// Arguments:
+//	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [1]. The FFT length.
+//
+// Returns A float32 tensor of the same rank as `input`. The inner-most
+//   dimension of `input` is replaced with the `fft_length` samples of its inverse
+//   1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.irfft
+// @end_compatibility
+func IRFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IRFFT",
+		Input: []tf.Input{
+			input, fft_length,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Concatenates a list of `SparseTensor` along the specified dimension.
+//
+// Concatenation is with respect to the dense versions of these sparse tensors.
+// It is assumed that each input is a `SparseTensor` whose elements are ordered
+// along increasing dimension number.
+//
+// All inputs' shapes must match, except for the concat dimension.  The
+// `indices`, `values`, and `shapes` lists must have the same length.
+//
+// The output shape is identical to the inputs', except along the concat
+// dimension, where it is the sum of the inputs' sizes along that dimension.
+//
+// The output elements will be resorted to preserve the sort order along
+// increasing dimension number.
+//
+// This op runs in `O(M log M)` time, where `M` is the total number of non-empty
+// values across all inputs. This is due to the need for an internal sort in
+// order to concatenate efficiently across an arbitrary dimension.
+//
+// For example, if `concat_dim = 1` and the inputs are
+//
+//     sp_inputs[0]: shape = [2, 3]
+//     [0, 2]: "a"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
+//
+//     sp_inputs[1]: shape = [2, 4]
+//     [0, 1]: "d"
+//     [0, 2]: "e"
+//
+// then the output will be
+//
+//     shape = [2, 7]
+//     [0, 2]: "a"
+//     [0, 4]: "d"
+//     [0, 5]: "e"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
+//
+// Graphically this is equivalent to doing
+//
+//     [    a] concat [  d e  ] = [    a   d e  ]
+//     [b c  ]        [       ]   [b c          ]
+//
+// Arguments:
+//	indices: 2-D.  Indices of each input `SparseTensor`.
+//	values: 1-D.  Non-empty values of each `SparseTensor`.
+//	shapes: 1-D.  Shapes of each `SparseTensor`.
+//	concat_dim: Dimension to concatenate along. Must be in range [-rank, rank),
+// where rank is the number of dimensions in each input `SparseTensor`.
+//
+// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
+func SparseConcat(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, concat_dim int64) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"concat_dim": concat_dim}
+	opspec := tf.OpSpec{
+		Type: "SparseConcat",
+		Input: []tf.Input{
+			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Elementwise computes the bitwise AND of `x` and `y`.
+//
+// The result will have those bits set, that are set in both `x` and `y`. The
+// computation is performed on the underlying representations of `x` and `y`.
+func BitwiseAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BitwiseAnd",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Deserialize and concatenate `SparseTensors` from a serialized minibatch.
+//
+// The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where
+// `N` is the minibatch size and the rows correspond to packed outputs of
+// `SerializeSparse`.  The ranks of the original `SparseTensor` objects
+// must all match.  When the final `SparseTensor` is created, it has rank one
+// higher than the ranks of the incoming `SparseTensor` objects
+// (they have been concatenated along a new row dimension).
+//
+// The output `SparseTensor` object's shape values for all dimensions but the
+// first are the max across the input `SparseTensor` objects' shape values
+// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
+// size.
+//
+// The input `SparseTensor` objects' indices are assumed ordered in
+// standard lexicographic order.  If this is not the case, after this
+// step run `SparseReorder` to restore index ordering.
+//
+// For example, if the serialized input is a `[2 x 3]` matrix representing two
+// original `SparseTensor` objects:
+//
+//     index = [ 0]
+//             [10]
+//             [20]
+//     values = [1, 2, 3]
+//     shape = [50]
+//
+// and
+//
+//     index = [ 2]
+//             [10]
+//     values = [4, 5]
+//     shape = [30]
+//
+// then the final deserialized `SparseTensor` will be:
+//
+//     index = [0  0]
+//             [0 10]
+//             [0 20]
+//             [1  2]
+//             [1 10]
+//     values = [1, 2, 3, 4, 5]
+//     shape = [2 50]
+//
+// Arguments:
+//	serialized_sparse: 2-D, The `N` serialized `SparseTensor` objects.
+// Must have 3 columns.
+//	dtype: The `dtype` of the serialized `SparseTensor` objects.
+func DeserializeManySparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	opspec := tf.OpSpec{
+		Type: "DeserializeManySparse",
+		Input: []tf.Input{
+			serialized_sparse,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Deserialize `SparseTensor` objects.
+//
+// The input `serialized_sparse` must have the shape `[?, ?, ..., ?, 3]` where
+// the last dimension stores serialized `SparseTensor` objects and the other N
+// dimensions (N >= 0) correspond to a batch. The ranks of the original
+// `SparseTensor` objects must all match. When the final `SparseTensor` is
+// created, its rank is the rank of the incoming `SparseTensor` objects plus N;
+// the sparse tensors have been concatenated along new dimensions, one for each
+// batch.
+//
+// The output `SparseTensor` object's shape values for the original dimensions
+// are the max across the input `SparseTensor` objects' shape values for the
+// corresponding dimensions. The new dimensions match the size of the batch.
+//
+// The input `SparseTensor` objects' indices are assumed ordered in
+// standard lexicographic order.  If this is not the case, after this
+// step run `SparseReorder` to restore index ordering.
+//
+// For example, if the serialized input is a `[2 x 3]` matrix representing two
+// original `SparseTensor` objects:
+//
+//     index = [ 0]
+//             [10]
+//             [20]
+//     values = [1, 2, 3]
+//     shape = [50]
+//
+// and
+//
+//     index = [ 2]
+//             [10]
+//     values = [4, 5]
+//     shape = [30]
+//
+// then the final deserialized `SparseTensor` will be:
+//
+//     index = [0  0]
+//             [0 10]
+//             [0 20]
+//             [1  2]
+//             [1 10]
+//     values = [1, 2, 3, 4, 5]
+//     shape = [2 50]
+//
+// Arguments:
+//	serialized_sparse: The serialized `SparseTensor` objects. The last dimension
+// must have 3 columns.
+//	dtype: The `dtype` of the serialized `SparseTensor` objects.
+func DeserializeSparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	opspec := tf.OpSpec{
+		Type: "DeserializeSparse",
+		Input: []tf.Input{
+			serialized_sparse,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// MaxPool3DGradGradAttr is an optional argument to MaxPool3DGradGrad.
+type MaxPool3DGradGradAttr func(optionalAttr)
+
+// MaxPool3DGradGradDataFormat sets the optional data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func MaxPool3DGradGradDataFormat(value string) MaxPool3DGradGradAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes second-order gradients of the maxpooling function.
+//
+// Arguments:
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+//
+// Returns Gradients of gradients w.r.t. the input to `max_pool`.
+func MaxPool3DGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradGradAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MaxPool3DGradGrad",
+		Input: []tf.Input{
+			orig_input, orig_output, grad,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Conv3DBackpropFilterV2Attr is an optional argument to Conv3DBackpropFilterV2.
+type Conv3DBackpropFilterV2Attr func(optionalAttr)
+
+// Conv3DBackpropFilterV2DataFormat sets the optional data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Conv3DBackpropFilterV2Dilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 5.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes the gradients of 3-D convolution with respect to the filter.
+//
+// Arguments:
+//	input: Shape `[batch, depth, rows, cols, in_channels]`.
+//	filter_sizes: An integer vector representing the tensor shape of `filter`,
+// where `filter` is a 5-D
+// `[filter_depth, filter_height, filter_width, in_channels, out_channels]`
+// tensor.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3DBackpropFilterV2(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterV2Attr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Conv3DBackpropFilterV2",
+		Input: []tf.Input{
+			input, filter_sizes, out_backprop,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Execute a sub graph on a remote processor.
+//
+// The graph specifications(such as graph itself, input tensors and output names)
+// are stored as a serialized protocol buffer of RemoteFusedGraphExecuteInfo
+// as serialized_remote_fused_graph_execute_info.
+// The specifications will be passed to a dedicated registered
+// remote fused graph executor.  The executor will send the graph specifications
+// to a remote processor and execute that graph.  The execution results
+// will be passed to consumer nodes as outputs of this node.
+//
+// Arguments:
+//	inputs: Arbitrary number of tensors with arbitrary data types
+//
+//	serialized_remote_fused_graph_execute_info: Serialized protocol buffer
+// of RemoteFusedGraphExecuteInfo which contains graph specifications.
+//
+// Returns Arbitrary number of tensors with arbitrary data types
+func RemoteFusedGraphExecute(scope *Scope, inputs []tf.Output, Toutputs []tf.DataType, serialized_remote_fused_graph_execute_info string) (outputs []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"Toutputs": Toutputs, "serialized_remote_fused_graph_execute_info": serialized_remote_fused_graph_execute_info}
+	opspec := tf.OpSpec{
+		Type: "RemoteFusedGraphExecute",
+		Input: []tf.Input{
+			tf.OutputList(inputs),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
+		scope.UpdateErr("RemoteFusedGraphExecute", err)
+		return
+	}
+	return outputs
+}
+
+// SerializeManySparseAttr is an optional argument to SerializeManySparse.
+type SerializeManySparseAttr func(optionalAttr)
+
+// SerializeManySparseOutType sets the optional out_type attribute to value.
+//
+// value: The `dtype` to use for serialization; the supported types are `string`
+// (default) and `variant`.
+// If not specified, defaults to DT_STRING
+func SerializeManySparseOutType(value tf.DataType) SerializeManySparseAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor` object.
+//
+// The `SparseTensor` must have rank `R` greater than 1, and the first dimension
+// is treated as the minibatch dimension.  Elements of the `SparseTensor`
+// must be sorted in increasing order of this first dimension.  The serialized
+// `SparseTensor` objects going into each row of `serialized_sparse` will have
+// rank `R-1`.
+//
+// The minibatch size `N` is extracted from `sparse_shape[0]`.
+//
+// Arguments:
+//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
+//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
+func SerializeManySparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeManySparseAttr) (serialized_sparse tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SerializeManySparse",
+		Input: []tf.Input{
+			sparse_indices, sparse_values, sparse_shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes inverse hyperbolic cosine of x element-wise.
+func Acosh(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Acosh",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes rectified linear 6 gradients for a Relu6 operation.
+//
+// Arguments:
+//	gradients: The backpropagated gradients to the corresponding Relu6 operation.
+//	features: The features passed as input to the corresponding Relu6 operation, or
+// its output; using either one produces the same result.
+//
+// Returns The gradients:
+// `gradients * (features > 0) * (features < 6)`.
+func Relu6Grad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Relu6Grad",
+		Input: []tf.Input{
+			gradients, features,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes natural logarithm of (1 + x) element-wise.
+//
+// I.e., \\(y = \log_e (1 + x)\\).
+func Log1p(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Log1p",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResizeBicubicAttr is an optional argument to ResizeBicubic.
+type ResizeBicubicAttr func(optionalAttr)
+
+// ResizeBicubicAlignCorners sets the optional align_corners attribute to value.
+//
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
+// If not specified, defaults to false
+func ResizeBicubicAlignCorners(value bool) ResizeBicubicAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
+	}
+}
+
+// Resize `images` to `size` using bicubic interpolation.
+//
+// Input images can be of different types but output images are always float.
+//
+// Arguments:
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
+//
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBicubicAttr) (resized_images tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResizeBicubic",
+		Input: []tf.Input{
+			images, size,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// SparseTensorDenseMatMulAttr is an optional argument to SparseTensorDenseMatMul.
+type SparseTensorDenseMatMulAttr func(optionalAttr)
+
+// SparseTensorDenseMatMulAdjointA sets the optional adjoint_a attribute to value.
+//
+// value: Use the adjoint of A in the matrix multiply.  If A is complex, this
+// is transpose(conj(A)).  Otherwise it's transpose(A).
+// If not specified, defaults to false
+func SparseTensorDenseMatMulAdjointA(value bool) SparseTensorDenseMatMulAttr {
+	return func(m optionalAttr) {
+		m["adjoint_a"] = value
+	}
+}
+
+// SparseTensorDenseMatMulAdjointB sets the optional adjoint_b attribute to value.
+//
+// value: Use the adjoint of B in the matrix multiply.  If B is complex, this
+// is transpose(conj(B)).  Otherwise it's transpose(B).
+// If not specified, defaults to false
+func SparseTensorDenseMatMulAdjointB(value bool) SparseTensorDenseMatMulAttr {
+	return func(m optionalAttr) {
+		m["adjoint_b"] = value
+	}
+}
+
+// Multiply SparseTensor (of rank 2) "A" by dense matrix "B".
+//
+// No validity checking is performed on the indices of A.  However, the following
+// input format is recommended for optimal behavior:
+//
+// if adjoint_a == false:
+//   A should be sorted in lexicographically increasing order.  Use SparseReorder
+//   if you're not sure.
+// if adjoint_a == true:
+//   A should be sorted in order of increasing dimension 1 (i.e., "column major"
+//   order instead of "row major" order).
+//
+// Arguments:
+//	a_indices: 2-D.  The `indices` of the `SparseTensor`, size `[nnz, 2]` Matrix.
+//	a_values: 1-D.  The `values` of the `SparseTensor`, size `[nnz]` Vector.
+//	a_shape: 1-D.  The `shape` of the `SparseTensor`, size `[2]` Vector.
+//	b: 2-D.  A dense Matrix.
+func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output, optional ...SparseTensorDenseMatMulAttr) (product tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseTensorDenseMatMul",
+		Input: []tf.Input{
+			a_indices, a_values, a_shape, b,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// DecodeAndCropJpegAttr is an optional argument to DecodeAndCropJpeg.
+type DecodeAndCropJpegAttr func(optionalAttr)
+
+// DecodeAndCropJpegChannels sets the optional channels attribute to value.
+//
+// value: Number of color channels for the decoded image.
+// If not specified, defaults to 0
+func DecodeAndCropJpegChannels(value int64) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["channels"] = value
+	}
+}
+
+// DecodeAndCropJpegRatio sets the optional ratio attribute to value.
+//
+// value: Downscaling ratio.
+// If not specified, defaults to 1
+func DecodeAndCropJpegRatio(value int64) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["ratio"] = value
+	}
+}
+
+// DecodeAndCropJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
+//
+// value: If true use a slower but nicer upscaling of the
+// chroma planes (yuv420/422 only).
+// If not specified, defaults to true
+func DecodeAndCropJpegFancyUpscaling(value bool) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["fancy_upscaling"] = value
+	}
+}
+
+// DecodeAndCropJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
+//
+// value: If true try to recover an image from truncated input.
+// If not specified, defaults to false
+func DecodeAndCropJpegTryRecoverTruncated(value bool) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["try_recover_truncated"] = value
+	}
+}
+
+// DecodeAndCropJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
+//
+// value: The minimum required fraction of lines before a truncated
+// input is accepted.
+// If not specified, defaults to 1
+func DecodeAndCropJpegAcceptableFraction(value float32) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["acceptable_fraction"] = value
+	}
+}
+
+// DecodeAndCropJpegDctMethod sets the optional dct_method attribute to value.
+//
+// value: string specifying a hint about the algorithm used for
+// decompression.  Defaults to "" which maps to a system-specific
+// default.  Currently valid values are ["INTEGER_FAST",
+// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
+// jpeg library changes to a version that does not have that specific
+// option.)
+// If not specified, defaults to ""
+func DecodeAndCropJpegDctMethod(value string) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["dct_method"] = value
+	}
+}
+
+// Decode and Crop a JPEG-encoded image to a uint8 tensor.
+//
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
+//
+// Accepted values are:
+//
+// *   0: Use the number of channels in the JPEG-encoded image.
+// *   1: output a grayscale image.
+// *   3: output an RGB image.
+//
+// If needed, the JPEG-encoded image is transformed to match the requested number
+// of color channels.
+//
+// The attr `ratio` allows downscaling the image by an integer factor during
+// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
+// downscaling the image later.
+//
+//
+// It is equivalent to a combination of decode and crop, but much faster by only
+// decoding partial jpeg image.
+//
+// Arguments:
+//	contents: 0-D.  The JPEG-encoded image.
+//	crop_window: 1-D.  The crop window: [crop_y, crop_x, crop_height, crop_width].
+//
+// Returns 3-D with shape `[height, width, channels]`..
+func DecodeAndCropJpeg(scope *Scope, contents tf.Output, crop_window tf.Output, optional ...DecodeAndCropJpegAttr) (image tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DecodeAndCropJpeg",
+		Input: []tf.Input{
+			contents, crop_window,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Adds two `SparseTensor` objects to produce another `SparseTensor`.
+//
+// The input `SparseTensor` objects' indices are assumed ordered in standard
+// lexicographic order.  If this is not the case, before this step run
+// `SparseReorder` to restore index ordering.
+//
+// By default, if two values sum to zero at some index, the output `SparseTensor`
+// would still include that particular location in its index, storing a zero in the
+// corresponding value slot.  To override this, callers can specify `thresh`,
+// indicating that if the sum has a magnitude strictly smaller than `thresh`, its
+// corresponding value and index would then not be included.  In particular,
+// `thresh == 0` (default) means everything is kept and actual thresholding happens
+// only for a positive value.
+//
+// In the following shapes, `nnz` is the count after taking `thresh` into account.
+//
+// Arguments:
+//	a_indices: 2-D.  The `indices` of the first `SparseTensor`, size `[nnz, ndims]` Matrix.
+//	a_values: 1-D.  The `values` of the first `SparseTensor`, size `[nnz]` Vector.
+//	a_shape: 1-D.  The `shape` of the first `SparseTensor`, size `[ndims]` Vector.
+//	b_indices: 2-D.  The `indices` of the second `SparseTensor`, size `[nnz, ndims]` Matrix.
+//	b_values: 1-D.  The `values` of the second `SparseTensor`, size `[nnz]` Vector.
+//	b_shape: 1-D.  The `shape` of the second `SparseTensor`, size `[ndims]` Vector.
+//	thresh: 0-D.  The magnitude threshold that determines if an output value/index
+// pair takes space.
+func SparseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output, thresh tf.Output) (sum_indices tf.Output, sum_values tf.Output, sum_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseAdd",
+		Input: []tf.Input{
+			a_indices, a_values, a_shape, b_indices, b_values, b_shape, thresh,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// EnqueueTPUEmbeddingSparseTensorBatchAttr is an optional argument to EnqueueTPUEmbeddingSparseTensorBatch.
+type EnqueueTPUEmbeddingSparseTensorBatchAttr func(optionalAttr)
+
+// EnqueueTPUEmbeddingSparseTensorBatchDeviceOrdinal sets the optional device_ordinal attribute to value.
+//
+// value: The TPU device to use. Should be >= 0 and less than the number
+// of TPU cores in the task on which the node is placed.
+// If not specified, defaults to -1
+func EnqueueTPUEmbeddingSparseTensorBatchDeviceOrdinal(value int64) EnqueueTPUEmbeddingSparseTensorBatchAttr {
+	return func(m optionalAttr) {
+		m["device_ordinal"] = value
+	}
+}
+
+// EnqueueTPUEmbeddingSparseTensorBatchCombiners sets the optional combiners attribute to value.
+//
+// value: A list of string scalars, one for each embedding table that specify
+// how to normalize the embedding activations after weighted summation.
+// Supported combiners are 'mean', 'sum', or 'sqrtn'. It is invalid to have
+// the sum of the weights be 0 for 'mean' or the sum of the squared weights be
+// 0 for 'sqrtn'. If combiners isn't passed, the default is to use 'sum' for
+// all tables.
+// If not specified, defaults to <>
+func EnqueueTPUEmbeddingSparseTensorBatchCombiners(value []string) EnqueueTPUEmbeddingSparseTensorBatchAttr {
+	return func(m optionalAttr) {
+		m["combiners"] = value
+	}
+}
+
+// Eases the porting of code that uses tf.nn.embedding_lookup_sparse().
+//
+// sample_indices[i], embedding_indices[i] and aggregation_weights[i] correspond
+// to the ith feature. table_ids[i] indicates which embedding table to look up ith
+// feature.
+//
+// The tensors at corresponding positions in the three input lists (sample_indices,
+// embedding_indices and aggregation_weights) must have the same shape, i.e. rank 1
+// with dim_size() equal to the total number of lookups into the table described by
+// the corresponding feature.
+//
+// Arguments:
+//	sample_indices: A list of rank 1 Tensors specifying the training example to
+// which the corresponding embedding_indices and aggregation_weights values
+// belong. It corresponds to sp_ids.indices[:,0] in  embedding_lookup_sparse().
+//	embedding_indices: A list of rank 1 Tensors, indices into the embedding tables.
+// It corresponds to sp_ids.values in embedding_lookup_sparse().
+//	aggregation_weights: A list of rank 1 Tensors containing per training example
+// aggregation weights. It corresponds to sp_weights.values in
+// embedding_lookup_sparse().
+//	mode_override: A string input that overrides the mode specified in the
+// TPUEmbeddingConfiguration. Supported values are {'unspecified', 'inference',
+// 'training', 'backward_pass_only'}. When set to 'unspecified', the mode set
+// in TPUEmbeddingConfiguration is used, otherwise mode_override is used.
+//	table_ids: A list of integers specifying the identifier of the embedding table
+// (offset of TableDescriptor in the TPUEmbeddingConfiguration) to lookup the
+// corresponding input. The ith input is looked up using table_ids[i]. The size
+// of the table_ids list must be equal to that of sample_indices,
+// embedding_indices and aggregation_weights.
+//
+// Returns the created operation.
+func EnqueueTPUEmbeddingSparseTensorBatch(scope *Scope, sample_indices []tf.Output, embedding_indices []tf.Output, aggregation_weights []tf.Output, mode_override tf.Output, table_ids []int64, optional ...EnqueueTPUEmbeddingSparseTensorBatchAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"table_ids": table_ids}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "EnqueueTPUEmbeddingSparseTensorBatch",
+		Input: []tf.Input{
+			tf.OutputList(sample_indices), tf.OutputList(embedding_indices), tf.OutputList(aggregation_weights), mode_override,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// The gradient operator for the SparseAdd op.
+//
+// The SparseAdd op calculates A + B, where A, B, and the sum are all represented
+// as `SparseTensor` objects.  This op takes in the upstream gradient w.r.t.
+// non-empty values of the sum, and outputs the gradients w.r.t. the non-empty
+// values of A and B.
+//
+// Arguments:
+//	backprop_val_grad: 1-D with shape `[nnz(sum)]`.  The gradient with respect to
+// the non-empty values of the sum.
+//	a_indices: 2-D.  The `indices` of the `SparseTensor` A, size `[nnz(A), ndims]`.
+//	b_indices: 2-D.  The `indices` of the `SparseTensor` B, size `[nnz(B), ndims]`.
+//	sum_indices: 2-D.  The `indices` of the sum `SparseTensor`, size
+// `[nnz(sum), ndims]`.
+//
+// Returns 1-D with shape `[nnz(A)]`. The gradient with respect to the
+// non-empty values of A.1-D with shape `[nnz(B)]`. The gradient with respect to the
+// non-empty values of B.
+func SparseAddGrad(scope *Scope, backprop_val_grad tf.Output, a_indices tf.Output, b_indices tf.Output, sum_indices tf.Output) (a_val_grad tf.Output, b_val_grad tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseAddGrad",
+		Input: []tf.Input{
+			backprop_val_grad, a_indices, b_indices, sum_indices,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// This op consumes a lock created by `MutexLock`.
+//
+// This op exists to consume a tensor created by `MutexLock` (other than
+// direct control dependencies).  It should be the only that consumes the tensor,
+// and will raise an error if it is not.  Its only purpose is to keep the
+// mutex lock tensor alive until it is consumed by this op.
+//
+// **NOTE**: This operation must run on the same device as its input.  This may
+// be enforced via the `colocate_with` mechanism.
+//
+// Arguments:
+//	mutex_lock: A tensor returned by `MutexLock`.
+//
+// Returns the created operation.
+func ConsumeMutexLock(scope *Scope, mutex_lock tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ConsumeMutexLock",
+		Input: []tf.Input{
+			mutex_lock,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// ResourceScatterNdAddAttr is an optional argument to ResourceScatterNdAdd.
+type ResourceScatterNdAddAttr func(optionalAttr)
+
+// ResourceScatterNdAddUseLocking sets the optional use_locking attribute to value.
+//
+// value: An optional bool. Defaults to True. If True, the assignment will
+// be protected by a lock; otherwise the behavior is undefined,
+// but may exhibit less contention.
+// If not specified, defaults to true
+func ResourceScatterNdAddUseLocking(value bool) ResourceScatterNdAddAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Applies sparse addition to individual values or slices in a Variable.
+//
+// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
 //
-// Examples
-// =========
+// `indices` must be integer tensor, containing indices into `ref`.
+// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
 //
-// Suppose that
+// The innermost dimension of `indices` (with length `K`) corresponds to
+// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+// dimension of `ref`.
+//
+// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
 //
 // ```
-//   indices = [0, 2, -1, 1]
-//   depth = 3
-//   on_value = 5.0
-//   off_value = 0.0
-//   axis = -1
+// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]]
 // ```
 //
-// Then output is `[4 x 3]`:
+// For example, say we want to add 4 scattered elements to a rank-1 tensor to
+// 8 elements. In Python, that addition would look like this:
 //
-//     ```output =
-//       [5.0 0.0 0.0]  // one_hot(0)
-//       [0.0 0.0 5.0]  // one_hot(2)
-//       [0.0 0.0 0.0]  // one_hot(-1)
-//       [0.0 5.0 0.0]  // one_hot(1)
-//     ```
+// ```python
+// ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True)
+// indices = tf.constant([[4], [3], [1], [7]])
+// updates = tf.constant([9, 10, 11, 12])
+// add = tf.scatter_nd_add(ref, indices, updates)
+// with tf.Session() as sess:
+//   print sess.run(add)
+// ```
 //
-// Suppose that
+// The resulting update to ref would look like this:
 //
-// ```
-//   indices = [0, 2, -1, 1]
-//   depth = 3
-//   on_value = 0.0
-//   off_value = 3.0
-//   axis = 0
-// ```
+//     [1, 13, 3, 14, 14, 6, 7, 20]
 //
-// Then output is `[3 x 4]`:
+// See `tf.scatter_nd` for more details about how to make updates to
+// slices.
 //
-//     ```output =
-//       [0.0 3.0 3.0 3.0]
-//       [3.0 3.0 3.0 0.0]
-//       [3.0 3.0 3.0 3.0]
-//       [3.0 0.0 3.0 3.0]
-//     //  ^                one_hot(0)
-//     //      ^            one_hot(2)
-//     //          ^        one_hot(-1)
-//     //              ^    one_hot(1)
-//     ```
-// Suppose that
+// Arguments:
+//	ref: A resource handle. Must be from a VarHandleOp.
+//	indices: A Tensor. Must be one of the following types: int32, int64.
+// A tensor of indices into ref.
+//	updates: A Tensor. Must have the same type as ref. A tensor of
+// values to add to ref.
 //
-// ```
-//   indices = [[0, 2], [1, -1]]
-//   depth = 3
-//   on_value = 1.0
-//   off_value = 0.0
-//   axis = -1
+// Returns the created operation.
+func ResourceScatterNdAdd(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdAddAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterNdAdd",
+		Input: []tf.Input{
+			ref, indices, updates,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Replaces the contents of the table with the specified keys and values.
+//
+// The tensor `keys` must be of the same type as the keys of the table.
+// The tensor `values` must be of the type of the table values.
+//
+// Arguments:
+//	table_handle: Handle to the table.
+//	keys: Any shape.  Keys to look up.
+//	values: Values to associate with keys.
+//
+// Returns the created operation.
+func LookupTableImportV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LookupTableImportV2",
+		Input: []tf.Input{
+			table_handle, keys, values,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Extract `patches` from `images` and put them in the "depth" output dimension.
+//
+// Arguments:
+//	images: 4-D Tensor with shape `[batch, in_rows, in_cols, depth]`.
+//	ksizes: The size of the sliding window for each dimension of `images`.
+//	strides: 1-D of length 4. How far the centers of two consecutive patches are in
+// the images. Must be: `[1, stride_rows, stride_cols, 1]`.
+//	rates: 1-D of length 4. Must be: `[1, rate_rows, rate_cols, 1]`. This is the
+// input stride, specifying how far two consecutive patch samples are in the
+// input. Equivalent to extracting patches with
+// `patch_sizes_eff = patch_sizes + (patch_sizes - 1) * (rates - 1)`, followed by
+// subsampling them spatially by a factor of `rates`. This is equivalent to
+// `rate` in dilated (a.k.a. Atrous) convolutions.
+//	padding: The type of padding algorithm to use.
+//
+// We specify the size-related attributes as:
+//
+// ```python
+//       ksizes = [1, ksize_rows, ksize_cols, 1]
+//       strides = [1, strides_rows, strides_cols, 1]
+//       rates = [1, rates_rows, rates_cols, 1]
 // ```
 //
-// Then output is `[2 x 2 x 3]`:
+// Returns 4-D Tensor with shape `[batch, out_rows, out_cols, ksize_rows *
+// ksize_cols * depth]` containing image patches with size
+// `ksize_rows x ksize_cols x depth` vectorized in the "depth" dimension. Note
+// `out_rows` and `out_cols` are the dimensions of the output patches.
+func ExtractImagePatches(scope *Scope, images tf.Output, ksizes []int64, strides []int64, rates []int64, padding string) (patches tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"ksizes": ksizes, "strides": strides, "rates": rates, "padding": padding}
+	opspec := tf.OpSpec{
+		Type: "ExtractImagePatches",
+		Input: []tf.Input{
+			images,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the mean along sparse segments of a tensor.
+//
+// See `tf.sparse.segment_sum` for usage examples.
 //
-//     ```output =
-//       [
-//         [1.0, 0.0, 0.0]  // one_hot(0)
-//         [0.0, 0.0, 1.0]  // one_hot(2)
-//       ][
-//         [0.0, 1.0, 0.0]  // one_hot(1)
-//         [0.0, 0.0, 0.0]  // one_hot(-1)
-//       ]```
+// Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
+// dimension, selecting a subset of dimension 0, specified by `indices`.
 //
 // Arguments:
-//	indices: A tensor of indices.
-//	depth: A scalar defining the depth of the one hot dimension.
-//	on_value: A scalar defining the value to fill in output when `indices[j] = i`.
-//	off_value: A scalar defining the value to fill in output when `indices[j] != i`.
 //
-// Returns The one-hot tensor.
-func OneHot(scope *Scope, indices tf.Output, depth tf.Output, on_value tf.Output, off_value tf.Output, optional ...OneHotAttr) (output tf.Output) {
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentMean(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseSegmentMean",
+		Input: []tf.Input{
+			data, indices, segment_ids,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Deserializes a serialized tree ensemble config and replaces current tree
+//
+// ensemble.
+//
+// Arguments:
+//	tree_ensemble_handle: Handle to the tree ensemble.
+//	stamp_token: Token to use as the new value of the resource stamp.
+//	tree_ensemble_serialized: Serialized proto of the ensemble.
+//
+// Returns the created operation.
+func BoostedTreesDeserializeEnsemble(scope *Scope, tree_ensemble_handle tf.Output, stamp_token tf.Output, tree_ensemble_serialized tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesDeserializeEnsemble",
+		Input: []tf.Input{
+			tree_ensemble_handle, stamp_token, tree_ensemble_serialized,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Transforms a tf.Example proto (as a string) into typed tensors.
+//
+// Arguments:
+//	serialized: A vector containing a batch of binary serialized Example protos.
+//	dense_defaults: A list of Tensors (some may be empty), whose length matches
+// the length of `dense_keys`. dense_defaults[j] provides default values
+// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
+// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
+// The input type is inferred from dense_defaults[j], even when it's empty.
+// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
+// then the shape of dense_defaults[j] must match that of dense_shapes[j].
+// If dense_shapes[j] has an undefined major dimension (variable strides dense
+// feature), dense_defaults[j] must contain a single element:
+// the padding element.
+//	num_sparse: The number of sparse features to be parsed from the example. This
+// must match the lengths of `sparse_keys` and `sparse_types`.
+//	sparse_keys: A list of `num_sparse` strings.
+// The keys expected in the Examples' features associated with sparse values.
+//	dense_keys: The keys expected in the Examples' features associated with dense
+// values.
+//	sparse_types: A list of `num_sparse` types; the data types of data in each
+// Feature given in sparse_keys.
+// Currently the ParseSingleExample op supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+//	dense_shapes: The shapes of data in each Feature given in dense_keys.
+// The length of this list must match the length of `dense_keys`.  The
+// number of elements in the Feature corresponding to dense_key[j] must
+// always equal dense_shapes[j].NumEntries().  If dense_shapes[j] ==
+// (D0, D1, ..., DN) then the shape of output Tensor dense_values[j]
+// will be (D0, D1, ..., DN): In the case dense_shapes[j] = (-1, D1,
+// ..., DN), the shape of the output Tensor dense_values[j] will be (M,
+// D1, .., DN), where M is the number of blocks of elements of length
+// D1 * .... * DN, in the input.
+func ParseSingleExample(scope *Scope, serialized tf.Output, dense_defaults []tf.Output, num_sparse int64, sparse_keys []string, dense_keys []string, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_sparse": num_sparse, "sparse_keys": sparse_keys, "dense_keys": dense_keys, "sparse_types": sparse_types, "dense_shapes": dense_shapes}
+	opspec := tf.OpSpec{
+		Type: "ParseSingleExample",
+		Input: []tf.Input{
+			serialized, tf.OutputList(dense_defaults),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
+		scope.UpdateErr("ParseSingleExample", err)
+		return
+	}
+	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
+		scope.UpdateErr("ParseSingleExample", err)
+		return
+	}
+	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseSingleExample", err)
+		return
+	}
+	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
+		scope.UpdateErr("ParseSingleExample", err)
+		return
+	}
+	return sparse_indices, sparse_values, sparse_shapes, dense_values
+}
+
+// WholeFileReaderV2Attr is an optional argument to WholeFileReaderV2.
+type WholeFileReaderV2Attr func(optionalAttr)
+
+// WholeFileReaderV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func WholeFileReaderV2Container(value string) WholeFileReaderV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// WholeFileReaderV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func WholeFileReaderV2SharedName(value string) WholeFileReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A Reader that outputs the entire contents of a file as a value.
+//
+// To use, enqueue filenames in a Queue.  The output of ReaderRead will
+// be a filename (key) and the contents of that file (value).
+//
+// Returns The handle to reference the Reader.
+func WholeFileReaderV2(scope *Scope, optional ...WholeFileReaderV2Attr) (reader_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "WholeFileReaderV2",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Says whether the targets are in the top `K` predictions.
+//
+// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
+// prediction for the target class is among the top `k` predictions among
+// all predictions for example `i`. Note that the behavior of `InTopK` differs
+// from the `TopK` op in its handling of ties; if multiple classes have the
+// same prediction value and straddle the top-`k` boundary, all of those
+// classes are considered to be in the top `k`.
+//
+// More formally, let
+//
+//   \\(predictions_i\\) be the predictions for all classes for example `i`,
+//   \\(targets_i\\) be the target class for example `i`,
+//   \\(out_i\\) be the output for example `i`,
+//
+// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
+//
+// Arguments:
+//	predictions: A `batch_size` x `classes` tensor.
+//	targets: A `batch_size` vector of class ids.
+//	k: Number of top elements to look at for computing precision.
+//
+// Returns Computed Precision at `k` as a `bool Tensor`.
+func InTopK(scope *Scope, predictions tf.Output, targets tf.Output, k int64) (precision tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"k": k}
+	opspec := tf.OpSpec{
+		Type: "InTopK",
+		Input: []tf.Input{
+			predictions, targets,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// RetrieveTPUEmbeddingAdagradParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingAdagradParametersGradAccumDebug.
+type RetrieveTPUEmbeddingAdagradParametersGradAccumDebugAttr func(optionalAttr)
+
+// RetrieveTPUEmbeddingAdagradParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingAdagradParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingAdagradParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// RetrieveTPUEmbeddingAdagradParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingAdagradParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingAdagradParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Retrieve Adagrad embedding parameters with debug support.
+//
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
+//
+// Returns Parameter parameters updated by the Adagrad optimization algorithm.Parameter accumulators updated by the Adagrad optimization algorithm.Parameter gradient_accumulators updated by the Adagrad optimization algorithm.
+func RetrieveTPUEmbeddingAdagradParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingAdagradParametersGradAccumDebugAttr) (parameters tf.Output, accumulators tf.Output, gradient_accumulators tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RetrieveTPUEmbeddingAdagradParametersGradAccumDebug",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Serializes the tree handle to a proto
+//
+// Arguments:
+//	tree_handle: Handle to the tree resource to be serialized.
+//
+// Returns Serialied proto string of the tree resource.
+func TensorForestTreeSerialize(scope *Scope, tree_handle tf.Output) (tree_config tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorForestTreeSerialize",
+		Input: []tf.Input{
+			tree_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// SparseMatMulAttr is an optional argument to SparseMatMul.
+type SparseMatMulAttr func(optionalAttr)
+
+// SparseMatMulTransposeA sets the optional transpose_a attribute to value.
+// If not specified, defaults to false
+func SparseMatMulTransposeA(value bool) SparseMatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_a"] = value
+	}
+}
+
+// SparseMatMulTransposeB sets the optional transpose_b attribute to value.
+// If not specified, defaults to false
+func SparseMatMulTransposeB(value bool) SparseMatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_b"] = value
+	}
+}
+
+// SparseMatMulAIsSparse sets the optional a_is_sparse attribute to value.
+// If not specified, defaults to false
+func SparseMatMulAIsSparse(value bool) SparseMatMulAttr {
+	return func(m optionalAttr) {
+		m["a_is_sparse"] = value
+	}
+}
+
+// SparseMatMulBIsSparse sets the optional b_is_sparse attribute to value.
+// If not specified, defaults to false
+func SparseMatMulBIsSparse(value bool) SparseMatMulAttr {
+	return func(m optionalAttr) {
+		m["b_is_sparse"] = value
+	}
+}
+
+// Multiply matrix "a" by matrix "b".
+//
+// The inputs must be two-dimensional matrices and the inner dimension of "a" must
+// match the outer dimension of "b". Both "a" and "b" must be `Tensor`s not
+// `SparseTensor`s.  This op is optimized for the case where at least one of "a" or
+// "b" is sparse, in the sense that they have a large proportion of zero values.
+// The breakeven for using this versus a dense matrix multiply on one platform was
+// 30% zero values in the sparse matrix.
+//
+// The gradient computation of this operation will only take advantage of sparsity
+// in the input gradient when that gradient comes from a Relu.
+func SparseMatMul(scope *Scope, a tf.Output, b tf.Output, optional ...SparseMatMulAttr) (product tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseMatMul",
+		Input: []tf.Input{
+			a, b,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ExperimentalThreadPoolHandleAttr is an optional argument to ExperimentalThreadPoolHandle.
+type ExperimentalThreadPoolHandleAttr func(optionalAttr)
+
+// ExperimentalThreadPoolHandleMaxIntraOpParallelism sets the optional max_intra_op_parallelism attribute to value.
+//
+// value: The maximum degree of parallelism to use within operations that execute on this
+// threadpool.
+// If not specified, defaults to 1
+func ExperimentalThreadPoolHandleMaxIntraOpParallelism(value int64) ExperimentalThreadPoolHandleAttr {
+	return func(m optionalAttr) {
+		m["max_intra_op_parallelism"] = value
+	}
+}
+
+// ExperimentalThreadPoolHandleContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func ExperimentalThreadPoolHandleContainer(value string) ExperimentalThreadPoolHandleAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// ExperimentalThreadPoolHandleSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func ExperimentalThreadPoolHandleSharedName(value string) ExperimentalThreadPoolHandleAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Creates a dataset that uses a custom thread pool to compute `input_dataset`.
+//
+// Arguments:
+//	num_threads: The number of threads in the thread pool.
+//	display_name: A human-readable name for the threads that may be visible in some
+// visualizations.
+// threadpool.
+//
+// Returns A resource that can be consumed by one or more ExperimentalThreadPoolDataset
+// ops.
+func ExperimentalThreadPoolHandle(scope *Scope, num_threads int64, display_name string, optional ...ExperimentalThreadPoolHandleAttr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_threads": num_threads, "display_name": display_name}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OneHot",
-		Input: []tf.Input{
-			indices, depth, on_value, off_value,
-		},
+		Type: "ExperimentalThreadPoolHandle",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes exponential of x element-wise.  \\(y = e^x\\).
-func Exp(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Exp",
-		Input: []tf.Input{
-			x,
-		},
+// LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug.
+type LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr func(optionalAttr)
+
+// LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// NthElementAttr is an optional argument to NthElement.
-type NthElementAttr func(optionalAttr)
-
-// NthElementReverse sets the optional reverse attribute to value.
-//
-// value: When set to True, find the nth-largest value in the vector and vice
-// versa.
-// If not specified, defaults to false
-func NthElementReverse(value bool) NthElementAttr {
+// LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr {
 	return func(m optionalAttr) {
-		m["reverse"] = value
+		m["table_name"] = value
 	}
 }
 
-// Finds values of the `n`-th order statistic for the last dimension.
+// Load proximal Adagrad embedding parameters with debug support.
 //
-// If the input is a vector (rank-1), finds the entries which is the nth-smallest
-// value in the vector and outputs their values as scalar tensor.
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
 //
-// For matrices (resp. higher rank input), computes the entries which is the
-// nth-smallest value in each row (resp. vector along the last dimension). Thus,
+// Arguments:
+//	parameters: Value of parameters used in the proximal Adagrad optimization algorithm.
+//	accumulators: Value of accumulators used in the proximal Adagrad optimization algorithm.
+//	gradient_accumulators: Value of gradient_accumulators used in the proximal Adagrad optimization algorithm.
 //
-//     values.shape = input.shape[:-1]
 //
-// Arguments:
-//	input: 1-D or higher with last dimension at least `n+1`.
-//	n: 0-D. Position of sorted vector to select along the last dimension (along
-// each row for matrices). Valid range of n is `[0, input.shape[:-1])`
 //
-// Returns The `n`-th order statistic along each last dimensional slice.
-func NthElement(scope *Scope, input tf.Output, n tf.Output, optional ...NthElementAttr) (values tf.Output) {
+// Returns the created operation.
+func LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug(scope *Scope, parameters tf.Output, accumulators tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "NthElement",
+		Type: "LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug",
 		Input: []tf.Input{
-			input, n,
+			parameters, accumulators, gradient_accumulators,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes the maximum along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// This operator is similar to the unsorted segment sum operator found
-// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
-// Instead of computing the sum over segments, it computes the maximum such that:
-//
-// \\(output_i = \max_{j...} data[j...]\\) where max is over tuples `j...` such
-// that `segment_ids[j...] == i`.
-//
-// If the maximum is empty for a given segment ID `i`, it outputs the smallest
-// possible value for the specific numeric type,
-// `output[i] = numeric_limits<T>::lowest()`.
+// LoadTPUEmbeddingProximalAdagradParametersAttr is an optional argument to LoadTPUEmbeddingProximalAdagradParameters.
+type LoadTPUEmbeddingProximalAdagradParametersAttr func(optionalAttr)
+
+// LoadTPUEmbeddingProximalAdagradParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// If the given segment ID `i` is negative, then the corresponding value is
-// dropped, and will not be included in the result.
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingProximalAdagradParametersTableId(value int64) LoadTPUEmbeddingProximalAdagradParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// LoadTPUEmbeddingProximalAdagradParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingProximalAdagradParametersTableName(value string) LoadTPUEmbeddingProximalAdagradParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load proximal Adagrad embedding parameters.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentMax.png" alt>
-// </div>
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
 //
 // Arguments:
+//	parameters: Value of parameters used in the proximal Adagrad optimization algorithm.
+//	accumulators: Value of accumulators used in the proximal Adagrad optimization algorithm.
 //
-//	segment_ids: A tensor whose shape is a prefix of `data.shape`.END
-//   }
-//   out_arg {
-//     name: "output"
-//     description: <<END
-// Has same shape as data, except for the first `segment_ids.rank`
-// dimensions, which are replaced with a single dimension which has size
-// `num_segments`.
 //
-func UnsortedSegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+//
+// Returns the created operation.
+func LoadTPUEmbeddingProximalAdagradParameters(scope *Scope, parameters tf.Output, accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingProximalAdagradParametersAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "UnsortedSegmentMax",
+		Type: "LoadTPUEmbeddingProximalAdagradParameters",
 		Input: []tf.Input{
-			data, segment_ids, num_segments,
+			parameters, accumulators,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Transforms a vector of brain.Example protos (as strings) into typed tensors.
+// Get the current size of the TensorArray.
 //
 // Arguments:
-//	serialized: A vector containing a batch of binary serialized Example protos.
-//	names: A vector containing the names of the serialized protos.
-// May contain, for example, table key (descriptive) names for the
-// corresponding serialized protos.  These are purely useful for debugging
-// purposes, and the presence of values here has no effect on the output.
-// May also be an empty vector if no names are available.
-// If non-empty, this vector must be the same length as "serialized".
-//	sparse_keys: A list of Nsparse string Tensors (scalars).
-// The keys expected in the Examples' features associated with sparse values.
-//	dense_keys: A list of Ndense string Tensors (scalars).
-// The keys expected in the Examples' features associated with dense values.
-//	dense_defaults: A list of Ndense Tensors (some may be empty).
-// dense_defaults[j] provides default values
-// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
-// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
-// The input type is inferred from dense_defaults[j], even when it's empty.
-// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
-// then the shape of dense_defaults[j] must match that of dense_shapes[j].
-// If dense_shapes[j] has an undefined major dimension (variable strides dense
-// feature), dense_defaults[j] must contain a single element:
-// the padding element.
-//	sparse_types: A list of Nsparse types; the data types of data in each Feature
-// given in sparse_keys.
-// Currently the ParseExample supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-//	dense_shapes: A list of Ndense shapes; the shapes of data in each Feature
-// given in dense_keys.
-// The number of elements in the Feature corresponding to dense_key[j]
-// must always equal dense_shapes[j].NumEntries().
-// If dense_shapes[j] == (D0, D1, ..., DN) then the shape of output
-// Tensor dense_values[j] will be (|serialized|, D0, D1, ..., DN):
-// The dense outputs are just the inputs row-stacked by batch.
-// This works for dense_shapes[j] = (-1, D1, ..., DN).  In this case
-// the shape of the output Tensor dense_values[j] will be
-// (|serialized|, M, D1, .., DN), where M is the maximum number of blocks
-// of elements of length D1 * .... * DN, across all minibatch entries
-// in the input.  Any minibatch entry with less than M blocks of elements of
-// length D1 * ... * DN will be padded with the corresponding default_value
-// scalar element along the second dimension.
-func ParseExample(scope *Scope, serialized tf.Output, names tf.Output, sparse_keys []tf.Output, dense_keys []tf.Output, dense_defaults []tf.Output, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
+//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//
+// Returns The current size of the TensorArray.
+func TensorArraySizeV3(scope *Scope, handle tf.Output, flow_in tf.Output) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"sparse_types": sparse_types, "dense_shapes": dense_shapes}
 	opspec := tf.OpSpec{
-		Type: "ParseExample",
+		Type: "TensorArraySizeV3",
 		Input: []tf.Input{
-			serialized, names, tf.OutputList(sparse_keys), tf.OutputList(dense_keys), tf.OutputList(dense_defaults),
+			handle, flow_in,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	return sparse_indices, sparse_values, sparse_shapes, dense_values
+	return op.Output(0)
 }
 
-// Compute the pairwise cross product.
-//
-// `a` and `b` must be the same shape; they can either be simple 3-element vectors,
-// or any shape where the innermost dimension is 3. In the latter case, each pair
-// of corresponding 3-element vectors is cross-multiplied independently.
+// Computes gradients for the scaled exponential linear (Selu) operation.
 //
 // Arguments:
-//	a: A tensor containing 3-element vectors.
-//	b: Another tensor, of same type and shape as `a`.
+//	gradients: The backpropagated gradients to the corresponding Selu operation.
+//	outputs: The outputs of the corresponding Selu operation.
 //
-// Returns Pairwise cross product of the vectors in `a` and `b`.
-func Cross(scope *Scope, a tf.Output, b tf.Output) (product tf.Output) {
+// Returns The gradients: `gradients * (outputs + scale * alpha)`
+// if outputs < 0, `scale * gradients` otherwise.
+func SeluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Cross",
+		Type: "SeluGrad",
 		Input: []tf.Input{
-			a, b,
+			gradients, outputs,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// CudnnRNNAttr is an optional argument to CudnnRNN.
-type CudnnRNNAttr func(optionalAttr)
-
-// CudnnRNNRnnMode sets the optional rnn_mode attribute to value.
-// If not specified, defaults to "lstm"
-func CudnnRNNRnnMode(value string) CudnnRNNAttr {
-	return func(m optionalAttr) {
-		m["rnn_mode"] = value
-	}
-}
+// ResourceSparseApplyFtrlV2Attr is an optional argument to ResourceSparseApplyFtrlV2.
+type ResourceSparseApplyFtrlV2Attr func(optionalAttr)
 
-// CudnnRNNInputMode sets the optional input_mode attribute to value.
-// If not specified, defaults to "linear_input"
-func CudnnRNNInputMode(value string) CudnnRNNAttr {
+// ResourceSparseApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyFtrlV2UseLocking(value bool) ResourceSparseApplyFtrlV2Attr {
 	return func(m optionalAttr) {
-		m["input_mode"] = value
+		m["use_locking"] = value
 	}
 }
 
-// CudnnRNNDirection sets the optional direction attribute to value.
-// If not specified, defaults to "unidirectional"
-func CudnnRNNDirection(value string) CudnnRNNAttr {
-	return func(m optionalAttr) {
-		m["direction"] = value
+// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
+//
+// That is for rows we have grad for, we update var, accum and linear as follows:
+// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
+// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
+// linear += grad_with_shrinkage +
+//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 shrinkage regulariation. Must be a scalar.
+//
+//	lr_power: Scaling factor. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceSparseApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlV2Attr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// CudnnRNNDropout sets the optional dropout attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNDropout(value float32) CudnnRNNAttr {
-	return func(m optionalAttr) {
-		m["dropout"] = value
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
 	}
-}
-
-// CudnnRNNSeed sets the optional seed attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNSeed(value int64) CudnnRNNAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
+	opspec := tf.OpSpec{
+		Type: "ResourceSparseApplyFtrlV2",
+		Input: []tf.Input{
+			var_, accum, linear, grad, indices, lr, l1, l2, l2_shrinkage, lr_power,
+		},
+		Attrs: attrs,
 	}
+	return scope.AddOperation(opspec)
 }
 
-// CudnnRNNSeed2 sets the optional seed2 attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNSeed2(value int64) CudnnRNNAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
+// SumAttr is an optional argument to Sum.
+type SumAttr func(optionalAttr)
 
-// CudnnRNNIsTraining sets the optional is_training attribute to value.
-// If not specified, defaults to true
-func CudnnRNNIsTraining(value bool) CudnnRNNAttr {
+// SumKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SumKeepDims(value bool) SumAttr {
 	return func(m optionalAttr) {
-		m["is_training"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// A RNN backed by cuDNN.
+// Computes the sum of elements across dimensions of a tensor.
 //
-// Computes the RNN from the input and initial states, with respect to the params
-// buffer.
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
-// rnn_mode: Indicates the type of the RNN model.
-// input_mode: Indicate whether there is a linear projection between the input and
-//   the actual computation before the first layer. 'skip_input' is only allowed
-//   when input_size == num_units; 'auto_select' implies 'skip_input' when
-//   input_size == num_units; otherwise, it implies 'linear_input'.
-// direction: Indicates whether a bidirectional model will be used. Should be
-//   "unidirectional" or "bidirectional".
-// dropout: Dropout probability. When set to 0., dropout is disabled.
-// seed: The 1st part of a seed to initialize dropout.
-// seed2: The 2nd part of a seed to initialize dropout.
-// input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
-// input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
-//     num_units].
-// input_c: For LSTM, a 3-D tensor with the shape of
-//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
-// params: A 1-D tensor that contains the weights and biases in an opaque layout.
-//     The size must be created through CudnnRNNParamsSize, and initialized
-//     separately. Note that they might not be compatible across different
-//     generations. So it is a good idea to save and restore
-// output: A 3-D tensor with the shape of [seq_length, batch_size,
-//     dir * num_units].
-// output_h: The same shape has input_h.
-// output_c: The same shape as input_c for LSTM. An empty tensor for other models.
-// is_training: Indicates whether this operation is used for inferenece or
-//   training.
-// reserve_space: An opaque tensor that can be used in backprop calculation. It
-//   is only produced if is_training is false.
-func CudnnRNN(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, optional ...CudnnRNNAttr) (output tf.Output, output_h tf.Output, output_c tf.Output, reserve_space tf.Output) {
+// Arguments:
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
+//
+// Returns The reduced tensor.
+func Sum(scope *Scope, input tf.Output, axis tf.Output, optional ...SumAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11102,99 +14669,186 @@ func CudnnRNN(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Outpu
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CudnnRNN",
+		Type: "Sum",
 		Input: []tf.Input{
-			input, input_h, input_c, params,
+			input, axis,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+	return op.Output(0)
 }
 
-// DecodeCompressedAttr is an optional argument to DecodeCompressed.
-type DecodeCompressedAttr func(optionalAttr)
+// SparseToSparseSetOperationAttr is an optional argument to SparseToSparseSetOperation.
+type SparseToSparseSetOperationAttr func(optionalAttr)
 
-// DecodeCompressedCompressionType sets the optional compression_type attribute to value.
-//
-// value: A scalar containing either (i) the empty string (no
-// compression), (ii) "ZLIB", or (iii) "GZIP".
-// If not specified, defaults to ""
-func DecodeCompressedCompressionType(value string) DecodeCompressedAttr {
+// SparseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func SparseToSparseSetOperationValidateIndices(value bool) SparseToSparseSetOperationAttr {
 	return func(m optionalAttr) {
-		m["compression_type"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// Decompress strings.
+// Applies set operation along last dimension of 2 `SparseTensor` inputs.
 //
-// This op decompresses each element of the `bytes` input `Tensor`, which
-// is assumed to be compressed using the given `compression_type`.
+// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
 //
-// The `output` is a string `Tensor` of the same shape as `bytes`,
-// each element containing the decompressed data from the corresponding
-// element in `bytes`.
+// If `validate_indices` is `True`, `SparseToSparseSetOperation` validates the
+// order and range of `set1` and `set2` indices.
+//
+// Input `set1` is a `SparseTensor` represented by `set1_indices`, `set1_values`,
+// and `set1_shape`. For `set1` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set2`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
+//
+// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
+// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
+//
+// If `validate_indices` is `True`, this op validates the order and range of `set1`
+// and `set2` indices.
+//
+// Output `result` is a `SparseTensor` represented by `result_indices`,
+// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
+// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
+// dimension contains the result of `set_operation` applied to the corresponding
+// `[0...n-1]` dimension of `set`.
 //
 // Arguments:
-//	bytes: A Tensor of string which is compressed.
+//	set1_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set1_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set1_shape: 1D `Tensor`, shape of a `SparseTensor`. `set1_shape[0...n-1]` must
+// be the same as `set2_shape[0...n-1]`, `set1_shape[n]` is the
+// max set size across `0...n-1` dimensions.
+//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
+// be the same as `set1_shape[0...n-1]`, `set2_shape[n]` is the
+// max set size across `0...n-1` dimensions.
 //
-// Returns A Tensor with the same shape as input `bytes`, uncompressed
-// from bytes.
-func DecodeCompressed(scope *Scope, bytes tf.Output, optional ...DecodeCompressedAttr) (output tf.Output) {
+//
+// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
+// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
+// is the max result set size across all `0...n-1` dimensions.
+func SparseToSparseSetOperation(scope *Scope, set1_indices tf.Output, set1_values tf.Output, set1_shape tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...SparseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"set_operation": set_operation}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeCompressed",
+		Type: "SparseToSparseSetOperation",
 		Input: []tf.Input{
-			bytes,
+			set1_indices, set1_values, set1_shape, set2_indices, set2_values, set2_shape,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// DecodeRawAttr is an optional argument to DecodeRaw.
-type DecodeRawAttr func(optionalAttr)
-
-// DecodeRawLittleEndian sets the optional little_endian attribute to value.
+// Computes softmax cross entropy cost and gradients to backpropagate.
 //
-// value: Whether the input `bytes` are in little-endian order.
-// Ignored for `out_type` values that are stored in a single byte like
-// `uint8`.
-// If not specified, defaults to true
-func DecodeRawLittleEndian(value bool) DecodeRawAttr {
+// Unlike `SoftmaxCrossEntropyWithLogits`, this operation does not accept
+// a matrix of label probabilities, but rather a single label per row
+// of features.  This label is considered to have probability 1.0 for the
+// given row.
+//
+// Inputs are the logits, not probabilities.
+//
+// Arguments:
+//	features: batch_size x num_classes matrix
+//	labels: batch_size vector with values in [0, num_classes).
+// This is the label for the given minibatch entry.
+//
+// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
+func SparseSoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseSoftmaxCrossEntropyWithLogits",
+		Input: []tf.Input{
+			features, labels,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// StridedSliceGradAttr is an optional argument to StridedSliceGrad.
+type StridedSliceGradAttr func(optionalAttr)
+
+// StridedSliceGradBeginMask sets the optional begin_mask attribute to value.
+// If not specified, defaults to 0
+func StridedSliceGradBeginMask(value int64) StridedSliceGradAttr {
 	return func(m optionalAttr) {
-		m["little_endian"] = value
+		m["begin_mask"] = value
 	}
 }
 
-// Reinterpret the bytes of a string as a vector of numbers.
-//
-// Arguments:
-//	bytes: All the elements must have the same length.
-//
+// StridedSliceGradEndMask sets the optional end_mask attribute to value.
+// If not specified, defaults to 0
+func StridedSliceGradEndMask(value int64) StridedSliceGradAttr {
+	return func(m optionalAttr) {
+		m["end_mask"] = value
+	}
+}
+
+// StridedSliceGradEllipsisMask sets the optional ellipsis_mask attribute to value.
+// If not specified, defaults to 0
+func StridedSliceGradEllipsisMask(value int64) StridedSliceGradAttr {
+	return func(m optionalAttr) {
+		m["ellipsis_mask"] = value
+	}
+}
+
+// StridedSliceGradNewAxisMask sets the optional new_axis_mask attribute to value.
+// If not specified, defaults to 0
+func StridedSliceGradNewAxisMask(value int64) StridedSliceGradAttr {
+	return func(m optionalAttr) {
+		m["new_axis_mask"] = value
+	}
+}
+
+// StridedSliceGradShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
+// If not specified, defaults to 0
+func StridedSliceGradShrinkAxisMask(value int64) StridedSliceGradAttr {
+	return func(m optionalAttr) {
+		m["shrink_axis_mask"] = value
+	}
+}
+
+// Returns the gradient of `StridedSlice`.
 //
-// Returns A Tensor with one more dimension than the input `bytes`.  The
-// added dimension will have size equal to the length of the elements
-// of `bytes` divided by the number of bytes to represent `out_type`.
-func DecodeRaw(scope *Scope, bytes tf.Output, out_type tf.DataType, optional ...DecodeRawAttr) (output tf.Output) {
+// Since `StridedSlice` cuts out pieces of its `input` which is size
+// `shape`, its gradient will have the same shape (which is passed here
+// as `shape`). The gradient will be zero in any element that the slice
+// does not select.
+//
+// Arguments are the same as StridedSliceGrad with the exception that
+// `dy` is the input gradient to be propagated and `shape` is the
+// shape of `StridedSlice`'s `input`.
+func StridedSliceGrad(scope *Scope, shape tf.Output, begin tf.Output, end tf.Output, strides tf.Output, dy tf.Output, optional ...StridedSliceGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeRaw",
+		Type: "StridedSliceGrad",
 		Input: []tf.Input{
-			bytes,
+			shape, begin, end, strides, dy,
 		},
 		Attrs: attrs,
 	}
@@ -11202,72 +14856,106 @@ func DecodeRaw(scope *Scope, bytes tf.Output, out_type tf.DataType, optional ...
 	return op.Output(0)
 }
 
-// Computes natural logarithm of (1 + x) element-wise.
+// LoadTPUEmbeddingRMSPropParametersAttr is an optional argument to LoadTPUEmbeddingRMSPropParameters.
+type LoadTPUEmbeddingRMSPropParametersAttr func(optionalAttr)
+
+// LoadTPUEmbeddingRMSPropParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// I.e., \\(y = \log_e (1 + x)\\).
-func Log1p(scope *Scope, x tf.Output) (y tf.Output) {
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingRMSPropParametersTableId(value int64) LoadTPUEmbeddingRMSPropParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// LoadTPUEmbeddingRMSPropParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingRMSPropParametersTableName(value string) LoadTPUEmbeddingRMSPropParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load RMSProp embedding parameters.
+//
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
+//
+// Arguments:
+//	parameters: Value of parameters used in the RMSProp optimization algorithm.
+//	ms: Value of ms used in the RMSProp optimization algorithm.
+//	mom: Value of mom used in the RMSProp optimization algorithm.
+//
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingRMSPropParameters(scope *Scope, parameters tf.Output, ms tf.Output, mom tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingRMSPropParametersAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Log1p",
+		Type: "LoadTPUEmbeddingRMSPropParameters",
 		Input: []tf.Input{
-			x,
+			parameters, ms, mom,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes rectified linear 6 gradients for a Relu6 operation.
-//
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding Relu6 operation.
-//	features: The features passed as input to the corresponding Relu6 operation, or
-// its output; using either one produces the same result.
+// Computes the gradient for the inverse of `x` wrt its input.
 //
-// Returns The gradients:
-// `gradients * (features > 0) * (features < 6)`.
-func Relu6Grad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
+// is the corresponding input gradient.
+func ReciprocalGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Relu6Grad",
+		Type: "ReciprocalGrad",
 		Input: []tf.Input{
-			gradients, features,
+			y, dy,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResizeBicubicAttr is an optional argument to ResizeBicubic.
-type ResizeBicubicAttr func(optionalAttr)
+// EuclideanNormAttr is an optional argument to EuclideanNorm.
+type EuclideanNormAttr func(optionalAttr)
 
-// ResizeBicubicAlignCorners sets the optional align_corners attribute to value.
+// EuclideanNormKeepDims sets the optional keep_dims attribute to value.
 //
-// value: If true, the centers of the 4 corner pixels of the input and output tensors are
-// aligned, preserving the values at the corner pixels. Defaults to false.
+// value: If true, retain reduced dimensions with length 1.
 // If not specified, defaults to false
-func ResizeBicubicAlignCorners(value bool) ResizeBicubicAttr {
+func EuclideanNormKeepDims(value bool) EuclideanNormAttr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Resize `images` to `size` using bicubic interpolation.
+// Computes the euclidean norm of elements across dimensions of a tensor.
 //
-// Input images can be of different types but output images are always float.
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBicubicAttr) (resized_images tf.Output) {
+// Returns The reduced tensor.
+func EuclideanNorm(scope *Scope, input tf.Output, axis tf.Output, optional ...EuclideanNormAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11276,9 +14964,9 @@ func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...R
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeBicubic",
+		Type: "EuclideanNorm",
 		Input: []tf.Input{
-			images, size,
+			input, axis,
 		},
 		Attrs: attrs,
 	}
@@ -11286,251 +14974,342 @@ func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...R
 	return op.Output(0)
 }
 
-// Gather ragged slices from `params` axis `0` according to `indices`.
-//
-// Outputs a `RaggedTensor` output composed from `output_dense_values` and
-// `output_nested_splits`, such that:
-//
-// ```python
-// output.shape = indices.shape + params.shape[1:]
-// output.ragged_rank = indices.shape.ndims + params.ragged_rank
-// output[i...j, d0...dn] = params[indices[i...j], d0...dn]
-// ```
-//
-// where
-//
-// * `params =
-//    ragged.from_nested_row_splits(params_dense_values, params_nested_splits)`
-//    provides the values that should be gathered.
-// * `indices` ia a dense tensor with dtype `int32` or `int64`, indicating which
-//    values should be gathered.
-// * `output =
-//    ragged.from_nested_row_splits(output_dense_values, output_nested_splits)`
-//    is the output tensor.
-//
-// (Note: This c++ op is used to implement the higher-level python
-// `tf.ragged.gather` op, which also supports ragged indices.)
+// Returns the element-wise min of two SparseTensors.
 //
+// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
 //
 // Arguments:
-//	params_nested_splits: The `nested_row_splits` tensors that define the row-partitioning for the
-// `params` RaggedTensor input.
-//	params_dense_values: The `inner_values` for the `params` RaggedTensor. There was a terminology change
-// at the python level from dense_values to inner_values, so dense_values is the
-// deprecated name.
-//	indices: Indices in the outermost dimension of `params` of the values that should be
-// gathered.
-//	OUTPUT_RAGGED_RANK: The ragged rank of the output RaggedTensor. `output_nested_splits` will contain
-// this number of `row_splits` tensors. This value should equal
-// `indices.shape.ndims + params.ragged_rank - 1`.
+//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, in the canonical lexicographic ordering.
+//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
+//	a_shape: 1-D.  Shape of the input SparseTensor.
+//	b_indices: counterpart to `a_indices` for the other operand.
+//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
+//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
 //
-// Returns The `nested_row_splits` tensors that define the row-partitioning for the
-// returned RaggedTensor.The `inner_values` for the returned RaggedTensor.
-func RaggedGather(scope *Scope, params_nested_splits []tf.Output, params_dense_values tf.Output, indices tf.Output, OUTPUT_RAGGED_RANK int64) (output_nested_splits []tf.Output, output_dense_values tf.Output) {
+// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
+func SparseSparseMinimum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"OUTPUT_RAGGED_RANK": OUTPUT_RAGGED_RANK}
 	opspec := tf.OpSpec{
-		Type: "RaggedGather",
+		Type: "SparseSparseMinimum",
 		Input: []tf.Input{
-			tf.OutputList(params_nested_splits), params_dense_values, indices,
+			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output_nested_splits, idx, err = makeOutputList(op, idx, "output_nested_splits"); err != nil {
-		scope.UpdateErr("RaggedGather", err)
-		return
-	}
-	output_dense_values = op.Output(idx)
-	return output_nested_splits, output_dense_values
+	return op.Output(0), op.Output(1)
 }
 
-// Greedily selects a subset of bounding boxes in descending order of score,
-//
-// pruning away boxes that have high intersection-over-union (IOU) overlap
-// with previously selected boxes.  Bounding boxes are supplied as
-// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
-// diagonal pair of box corners and the coordinates can be provided as normalized
-// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
-// is agnostic to where the origin is in the coordinate system.  Note that this
-// algorithm is invariant to orthogonal transformations and translations
-// of the coordinate system; thus translating or reflections of the coordinate
-// system result in the same boxes being selected by the algorithm.
-//
-// The output of this operation is a set of integers indexing into the input
-// collection of bounding boxes representing the selected boxes.  The bounding
-// box coordinates corresponding to the selected indices can then be obtained
-// using the `tf.gather operation`.  For example:
+// ResourceSparseApplyAdagradDAAttr is an optional argument to ResourceSparseApplyAdagradDA.
+type ResourceSparseApplyAdagradDAAttr func(optionalAttr)
+
+// ResourceSparseApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
 //
-//   selected_indices = tf.image.non_max_suppression_v2(
-//       boxes, scores, max_output_size, iou_threshold)
-//   selected_boxes = tf.gather(boxes, selected_indices)
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyAdagradDAUseLocking(value bool) ResourceSparseApplyAdagradDAAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update entries in '*var' and '*accum' according to the proximal adagrad scheme.
 //
 // Arguments:
-//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
-//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-// score corresponding to each box (each row of boxes).
-//	max_output_size: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression.
-//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
-// boxes overlap too much with respect to IOU.
+//	var_: Should be from a Variable().
+//	gradient_accumulator: Should be from a Variable().
+//	gradient_squared_accumulator: Should be from a Variable().
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	lr: Learning rate. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	global_step: Training step number. Must be a scalar.
 //
-// Returns A 1-D integer tensor of shape `[M]` representing the selected
-// indices from the boxes tensor, where `M <= max_output_size`.
-func NonMaxSuppressionV2(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output) (selected_indices tf.Output) {
+// Returns the created operation.
+func ResourceSparseApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceSparseApplyAdagradDAAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "NonMaxSuppressionV2",
+		Type: "ResourceSparseApplyAdagradDA",
 		Input: []tf.Input{
-			boxes, scores, max_output_size, iou_threshold,
+			var_, gradient_accumulator, gradient_squared_accumulator, grad, indices, lr, l1, l2, global_step,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Converts a `RaggedTensor` into a `SparseTensor` with the same values.
+// EncodeJpegAttr is an optional argument to EncodeJpeg.
+type EncodeJpegAttr func(optionalAttr)
+
+// EncodeJpegFormat sets the optional format attribute to value.
 //
-// input=ragged.from_nested_row_splits(rt_dense_values, rt_nested_splits)
-// output=SparseTensor(indices=sparse_indices, values=sparse_values,
-//                     dense_shape=sparse_dense_shape)
+// value: Per pixel image format.
+// If not specified, defaults to ""
+func EncodeJpegFormat(value string) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["format"] = value
+	}
+}
+
+// EncodeJpegQuality sets the optional quality attribute to value.
 //
-// Arguments:
-//	rt_nested_splits: The `row_splits` for the `RaggedTensor`.
-//	rt_dense_values: The `inner_values` for the `RaggedTensor`.
+// value: Quality of the compression from 0 to 100 (higher is better and slower).
+// If not specified, defaults to 95
+func EncodeJpegQuality(value int64) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["quality"] = value
+	}
+}
+
+// EncodeJpegProgressive sets the optional progressive attribute to value.
 //
-// Returns The indices for the `SparseTensor`.The values of the `SparseTensor`.`sparse_dense_shape` is a tight bounding box of the input `RaggedTensor`.
-func RaggedTensorToSparse(scope *Scope, rt_nested_splits []tf.Output, rt_dense_values tf.Output) (sparse_indices tf.Output, sparse_values tf.Output, sparse_dense_shape tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: If True, create a JPEG that loads progressively (coarse to fine).
+// If not specified, defaults to false
+func EncodeJpegProgressive(value bool) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["progressive"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "RaggedTensorToSparse",
-		Input: []tf.Input{
-			tf.OutputList(rt_nested_splits), rt_dense_values,
-		},
+}
+
+// EncodeJpegOptimizeSize sets the optional optimize_size attribute to value.
+//
+// value: If True, spend CPU/RAM to reduce size with no quality change.
+// If not specified, defaults to false
+func EncodeJpegOptimizeSize(value bool) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["optimize_size"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Check if the input matches the regex pattern.
+// EncodeJpegChromaDownsampling sets the optional chroma_downsampling attribute to value.
 //
-// The input is a string tensor of any shape. The pattern is a scalar
-// string tensor which is applied to every element of the input tensor.
-// The boolean values (True or False) of the output tensor indicate
-// if the input matches the regex pattern provided.
+// value: See http://en.wikipedia.org/wiki/Chroma_subsampling.
+// If not specified, defaults to true
+func EncodeJpegChromaDownsampling(value bool) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["chroma_downsampling"] = value
+	}
+}
+
+// EncodeJpegDensityUnit sets the optional density_unit attribute to value.
 //
-// The pattern follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
+// value: Unit used to specify `x_density` and `y_density`:
+// pixels per inch (`'in'`) or centimeter (`'cm'`).
+// If not specified, defaults to "in"
+func EncodeJpegDensityUnit(value string) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["density_unit"] = value
+	}
+}
+
+// EncodeJpegXDensity sets the optional x_density attribute to value.
 //
-// Arguments:
-//	input: A string tensor of the text to be processed.
-//	pattern: A scalar string tensor containing the regular expression to match the input.
+// value: Horizontal pixels per density unit.
+// If not specified, defaults to 300
+func EncodeJpegXDensity(value int64) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["x_density"] = value
+	}
+}
+
+// EncodeJpegYDensity sets the optional y_density attribute to value.
 //
-// Returns A bool tensor with the same shape as `input`.
-func RegexFullMatch(scope *Scope, input tf.Output, pattern tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: Vertical pixels per density unit.
+// If not specified, defaults to 300
+func EncodeJpegYDensity(value int64) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["y_density"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "RegexFullMatch",
-		Input: []tf.Input{
-			input, pattern,
-		},
+}
+
+// EncodeJpegXmpMetadata sets the optional xmp_metadata attribute to value.
+//
+// value: If not empty, embed this XMP metadata in the image header.
+// If not specified, defaults to ""
+func EncodeJpegXmpMetadata(value string) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["xmp_metadata"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Says whether the targets are in the top `K` predictions.
+// JPEG-encode an image.
 //
-// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
-// prediction for the target class is among the top `k` predictions among
-// all predictions for example `i`. Note that the behavior of `InTopK` differs
-// from the `TopK` op in its handling of ties; if multiple classes have the
-// same prediction value and straddle the top-`k` boundary, all of those
-// classes are considered to be in the top `k`.
+// `image` is a 3-D uint8 Tensor of shape `[height, width, channels]`.
 //
-// More formally, let
+// The attr `format` can be used to override the color format of the encoded
+// output.  Values can be:
 //
-//   \\(predictions_i\\) be the predictions for all classes for example `i`,
-//   \\(targets_i\\) be the target class for example `i`,
-//   \\(out_i\\) be the output for example `i`,
+// *   `''`: Use a default format based on the number of channels in the image.
+// *   `grayscale`: Output a grayscale JPEG image.  The `channels` dimension
+//     of `image` must be 1.
+// *   `rgb`: Output an RGB JPEG image. The `channels` dimension
+//     of `image` must be 3.
 //
-// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
+// If `format` is not specified or is the empty string, a default format is picked
+// in function of the number of channels in `image`:
+//
+// *   1: Output a grayscale image.
+// *   3: Output an RGB image.
 //
 // Arguments:
-//	predictions: A `batch_size` x `classes` tensor.
-//	targets: A `batch_size` vector of class ids.
-//	k: Number of top elements to look at for computing precision.
+//	image: 3-D with shape `[height, width, channels]`.
 //
-// Returns Computed precision at `k` as a `bool Tensor`.
-func InTopKV2(scope *Scope, predictions tf.Output, targets tf.Output, k tf.Output) (precision tf.Output) {
+// Returns 0-D. JPEG-encoded image.
+func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "InTopKV2",
+		Type: "EncodeJpeg",
 		Input: []tf.Input{
-			predictions, targets, k,
+			image,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// RandomShuffleAttr is an optional argument to RandomShuffle.
-type RandomShuffleAttr func(optionalAttr)
+// MultinomialAttr is an optional argument to Multinomial.
+type MultinomialAttr func(optionalAttr)
 
-// RandomShuffleSeed sets the optional seed attribute to value.
+// MultinomialSeed sets the optional seed attribute to value.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
+// value: If either seed or seed2 is set to be non-zero, the internal random number
+// generator is seeded by the given seed.  Otherwise, a random seed is used.
 // If not specified, defaults to 0
-func RandomShuffleSeed(value int64) RandomShuffleAttr {
+func MultinomialSeed(value int64) MultinomialAttr {
 	return func(m optionalAttr) {
 		m["seed"] = value
 	}
 }
 
-// RandomShuffleSeed2 sets the optional seed2 attribute to value.
+// MultinomialSeed2 sets the optional seed2 attribute to value.
 //
 // value: A second seed to avoid seed collision.
 // If not specified, defaults to 0
-func RandomShuffleSeed2(value int64) RandomShuffleAttr {
+func MultinomialSeed2(value int64) MultinomialAttr {
 	return func(m optionalAttr) {
 		m["seed2"] = value
 	}
 }
 
-// Randomly shuffles a tensor along its first dimension.
+// MultinomialOutputDtype sets the optional output_dtype attribute to value.
+// If not specified, defaults to DT_INT64
+func MultinomialOutputDtype(value tf.DataType) MultinomialAttr {
+	return func(m optionalAttr) {
+		m["output_dtype"] = value
+	}
+}
+
+// Draws samples from a multinomial distribution.
 //
-//   The tensor is shuffled along dimension 0, such that each `value[j]` is mapped
-//   to one and only one `output[i]`. For example, a mapping that might occur for a
-//   3x2 tensor is:
+// Arguments:
+//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
+// represents the unnormalized log probabilities for all classes.
+//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
 //
-// ```
-// [[1, 2],       [[5, 6],
-//  [3, 4],  ==>   [1, 2],
-//  [5, 6]]        [3, 4]]
-// ```
+// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
+// contains the drawn class labels with range `[0, num_classes)`.
+func Multinomial(scope *Scope, logits tf.Output, num_samples tf.Output, optional ...MultinomialAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Multinomial",
+		Input: []tf.Input{
+			logits, num_samples,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// RetrieveTPUEmbeddingRMSPropParametersAttr is an optional argument to RetrieveTPUEmbeddingRMSPropParameters.
+type RetrieveTPUEmbeddingRMSPropParametersAttr func(optionalAttr)
+
+// RetrieveTPUEmbeddingRMSPropParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingRMSPropParametersTableId(value int64) RetrieveTPUEmbeddingRMSPropParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// RetrieveTPUEmbeddingRMSPropParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingRMSPropParametersTableName(value string) RetrieveTPUEmbeddingRMSPropParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Retrieve RMSProp embedding parameters.
+//
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
+//
+// Returns Parameter parameters updated by the RMSProp optimization algorithm.Parameter ms updated by the RMSProp optimization algorithm.Parameter mom updated by the RMSProp optimization algorithm.
+func RetrieveTPUEmbeddingRMSPropParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingRMSPropParametersAttr) (parameters tf.Output, ms tf.Output, mom tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RetrieveTPUEmbeddingRMSPropParameters",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// QuantizedRelu6Attr is an optional argument to QuantizedRelu6.
+type QuantizedRelu6Attr func(optionalAttr)
+
+// QuantizedRelu6OutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedRelu6OutType(value tf.DataType) QuantizedRelu6Attr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Computes Quantized Rectified Linear 6: `min(max(features, 0), 6)`
 //
 // Arguments:
-//	value: The tensor to be shuffled.
 //
-// Returns A tensor of same shape and type as `value`, shuffled along its first
-// dimension.
-func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr) (output tf.Output) {
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
+//
+// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
+func QuantizedRelu6(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedRelu6Attr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11539,57 +15318,66 @@ func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr)
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomShuffle",
+		Type: "QuantizedRelu6",
 		Input: []tf.Input{
-			value,
+			features, min_features, max_features,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// ResourceSparseApplyRMSPropAttr is an optional argument to ResourceSparseApplyRMSProp.
-type ResourceSparseApplyRMSPropAttr func(optionalAttr)
+// BatchMatMulAttr is an optional argument to BatchMatMul.
+type BatchMatMulAttr func(optionalAttr)
 
-// ResourceSparseApplyRMSPropUseLocking sets the optional use_locking attribute to value.
+// BatchMatMulAdjX sets the optional adj_x attribute to value.
 //
-// value: If `True`, updating of the var, ms, and mom tensors is protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
+// value: If `True`, adjoint the slices of `x`. Defaults to `False`.
+// If not specified, defaults to false
+func BatchMatMulAdjX(value bool) BatchMatMulAttr {
+	return func(m optionalAttr) {
+		m["adj_x"] = value
+	}
+}
+
+// BatchMatMulAdjY sets the optional adj_y attribute to value.
+//
+// value: If `True`, adjoint the slices of `y`. Defaults to `False`.
 // If not specified, defaults to false
-func ResourceSparseApplyRMSPropUseLocking(value bool) ResourceSparseApplyRMSPropAttr {
+func BatchMatMulAdjY(value bool) BatchMatMulAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["adj_y"] = value
 	}
 }
 
-// Update '*var' according to the RMSProp algorithm.
+// Multiplies slices of two tensors in batches.
 //
-// Note that in dense implementation of this algorithm, ms and mom will
-// update even if the grad is zero, but in this sparse implementation, ms
-// and mom will not update in iterations during which the grad is zero.
+// Multiplies all slices of `Tensor` `x` and `y` (each slice can be
+// viewed as an element of a batch), and arranges the individual results
+// in a single output tensor of the same batch size. Each of the
+// individual slices can optionally be adjointed (to adjoint a matrix
+// means to transpose and conjugate it) before multiplication by setting
+// the `adj_x` or `adj_y` flag to `True`, which are by default `False`.
 //
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`
+// and `[..., r_y, c_y]`.
 //
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
+// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where:
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
+//     r_o = c_x if adj_x else r_x
+//     c_o = r_y if adj_y else c_y
 //
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var, ms and mom.
+// It is computed as:
 //
-// Returns the created operation.
-func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyRMSPropAttr) (o *tf.Operation) {
+//     output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
+//
+// Arguments:
+//	x: 2-D or higher with shape `[..., r_x, c_x]`.
+//	y: 2-D or higher with shape `[..., r_y, c_y]`.
+//
+// Returns 3-D or higher with shape `[..., r_o, c_o]`
+func BatchMatMul(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMulAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11598,457 +15386,485 @@ func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyRMSProp",
+		Type: "BatchMatMul",
 		Input: []tf.Input{
-			var_, ms, mom, lr, rho, momentum, epsilon, grad, indices,
+			x, y,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// SampleDistortedBoundingBoxAttr is an optional argument to SampleDistortedBoundingBox.
-type SampleDistortedBoundingBoxAttr func(optionalAttr)
+// ParseSequenceExampleAttr is an optional argument to ParseSequenceExample.
+type ParseSequenceExampleAttr func(optionalAttr)
 
-// SampleDistortedBoundingBoxSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to non-zero, the random number
-// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
-// seed.
+// ParseSequenceExampleNcontextSparse sets the optional Ncontext_sparse attribute to value.
 // If not specified, defaults to 0
-func SampleDistortedBoundingBoxSeed(value int64) SampleDistortedBoundingBoxAttr {
+//
+// REQUIRES: value >= 0
+func ParseSequenceExampleNcontextSparse(value int64) ParseSequenceExampleAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["Ncontext_sparse"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
+// ParseSequenceExampleNcontextDense sets the optional Ncontext_dense attribute to value.
 // If not specified, defaults to 0
-func SampleDistortedBoundingBoxSeed2(value int64) SampleDistortedBoundingBoxAttr {
+//
+// REQUIRES: value >= 0
+func ParseSequenceExampleNcontextDense(value int64) ParseSequenceExampleAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["Ncontext_dense"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxMinObjectCovered sets the optional min_object_covered attribute to value.
+// ParseSequenceExampleNfeatureListSparse sets the optional Nfeature_list_sparse attribute to value.
+// If not specified, defaults to 0
 //
-// value: The cropped area of the image must contain at least this
-// fraction of any bounding box supplied. The value of this parameter should be
-// non-negative. In the case of 0, the cropped area does not need to overlap
-// any of the bounding boxes supplied.
-// If not specified, defaults to 0.1
-func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBoundingBoxAttr {
+// REQUIRES: value >= 0
+func ParseSequenceExampleNfeatureListSparse(value int64) ParseSequenceExampleAttr {
 	return func(m optionalAttr) {
-		m["min_object_covered"] = value
+		m["Nfeature_list_sparse"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxAspectRatioRange sets the optional aspect_ratio_range attribute to value.
+// ParseSequenceExampleNfeatureListDense sets the optional Nfeature_list_dense attribute to value.
+// If not specified, defaults to 0
 //
-// value: The cropped area of the image must have an aspect ratio =
-// width / height within this range.
-// If not specified, defaults to <f:0.75 f:1.33 >
-func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr {
+// REQUIRES: value >= 0
+func ParseSequenceExampleNfeatureListDense(value int64) ParseSequenceExampleAttr {
 	return func(m optionalAttr) {
-		m["aspect_ratio_range"] = value
+		m["Nfeature_list_dense"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxAreaRange sets the optional area_range attribute to value.
+// ParseSequenceExampleContextSparseTypes sets the optional context_sparse_types attribute to value.
 //
-// value: The cropped area of the image must contain a fraction of the
-// supplied image within this range.
-// If not specified, defaults to <f:0.05 f:1 >
-func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr {
+// value: A list of Ncontext_sparse types; the data types of data in
+// each context Feature given in context_sparse_keys.
+// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSequenceExampleContextSparseTypes(value []tf.DataType) ParseSequenceExampleAttr {
 	return func(m optionalAttr) {
-		m["area_range"] = value
+		m["context_sparse_types"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxMaxAttempts sets the optional max_attempts attribute to value.
+// ParseSequenceExampleFeatureListDenseTypes sets the optional feature_list_dense_types attribute to value.
+// If not specified, defaults to <>
 //
-// value: Number of attempts at generating a cropped region of the image
-// of the specified constraints. After `max_attempts` failures, return the entire
-// image.
-// If not specified, defaults to 100
-func SampleDistortedBoundingBoxMaxAttempts(value int64) SampleDistortedBoundingBoxAttr {
+// REQUIRES: len(value) >= 0
+func ParseSequenceExampleFeatureListDenseTypes(value []tf.DataType) ParseSequenceExampleAttr {
 	return func(m optionalAttr) {
-		m["max_attempts"] = value
+		m["feature_list_dense_types"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
+// ParseSequenceExampleContextDenseShapes sets the optional context_dense_shapes attribute to value.
 //
-// value: Controls behavior if no bounding boxes supplied.
-// If true, assume an implicit bounding box covering the whole input. If false,
-// raise an error.
-// If not specified, defaults to false
-func SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxAttr {
+// value: A list of Ncontext_dense shapes; the shapes of data in
+// each context Feature given in context_dense_keys.
+// The number of elements in the Feature corresponding to context_dense_key[j]
+// must always equal context_dense_shapes[j].NumEntries().
+// The shape of context_dense_values[j] will match context_dense_shapes[j].
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSequenceExampleContextDenseShapes(value []tf.Shape) ParseSequenceExampleAttr {
 	return func(m optionalAttr) {
-		m["use_image_if_no_bounding_boxes"] = value
+		m["context_dense_shapes"] = value
 	}
 }
 
-// Generate a single randomly distorted bounding box for an image.
-//
-// Bounding box annotations are often supplied in addition to ground-truth labels
-// in image recognition or object localization tasks. A common technique for
-// training such a system is to randomly distort an image while preserving
-// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
-// localization of an object, i.e. bounding box, given an `image_size`,
-// `bounding_boxes` and a series of constraints.
-//
-// The output of this Op is a single bounding box that may be used to crop the
-// original image. The output is returned as 3 tensors: `begin`, `size` and
-// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
-// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
-// what the bounding box looks like.
-//
-// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
-// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-// height of the underlying image.
-//
-// For example,
+// ParseSequenceExampleFeatureListSparseTypes sets the optional feature_list_sparse_types attribute to value.
 //
-// ```python
-//     # Generate a single distorted bounding box.
-//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
-//         tf.shape(image),
-//         bounding_boxes=bounding_boxes)
+// value: A list of Nfeature_list_sparse types; the data types
+// of data in each FeatureList given in feature_list_sparse_keys.
+// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+// If not specified, defaults to <>
 //
-//     # Draw the bounding box in an image summary.
-//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
-//                                                   bbox_for_draw)
-//     tf.summary.image('images_with_box', image_with_box)
+// REQUIRES: len(value) >= 0
+func ParseSequenceExampleFeatureListSparseTypes(value []tf.DataType) ParseSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["feature_list_sparse_types"] = value
+	}
+}
+
+// ParseSequenceExampleFeatureListDenseShapes sets the optional feature_list_dense_shapes attribute to value.
 //
-//     # Employ the bounding box to distort the image.
-//     distorted_image = tf.slice(image, begin, size)
-// ```
+// value: A list of Nfeature_list_dense shapes; the shapes of
+// data in each FeatureList given in feature_list_dense_keys.
+// The shape of each Feature in the FeatureList corresponding to
+// feature_list_dense_key[j] must always equal
+// feature_list_dense_shapes[j].NumEntries().
+// If not specified, defaults to <>
 //
-// Note that if no bounding box information is available, setting
-// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
-// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
-// false and no bounding boxes are supplied, an error is raised.
+// REQUIRES: len(value) >= 0
+func ParseSequenceExampleFeatureListDenseShapes(value []tf.Shape) ParseSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["feature_list_dense_shapes"] = value
+	}
+}
+
+// Transforms a vector of brain.SequenceExample protos (as strings) into typed tensors.
 //
 // Arguments:
-//	image_size: 1-D, containing `[height, width, channels]`.
-//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
-// associated with the image.
-//
-// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
-// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
-// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
-// Provide as input to `tf.image.draw_bounding_boxes`.
-func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, optional ...SampleDistortedBoundingBoxAttr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
+//	serialized: A vector containing binary serialized SequenceExample protos.
+//	debug_name: A vector containing the names of the serialized protos.
+// May contain, for example, table key (descriptive) name for the
+// corresponding serialized proto.  This is purely useful for debugging
+// purposes, and the presence of values here has no effect on the output.
+// May also be an empty vector if no name is available.
+//	context_dense_defaults: A list of Ncontext_dense Tensors (some may be empty).
+// context_dense_defaults[j] provides default values
+// when the SequenceExample's context map lacks context_dense_key[j].
+// If an empty Tensor is provided for context_dense_defaults[j],
+// then the Feature context_dense_keys[j] is required.
+// The input type is inferred from context_dense_defaults[j], even when it's
+// empty.  If context_dense_defaults[j] is not empty, its shape must match
+// context_dense_shapes[j].
+//	feature_list_dense_missing_assumed_empty: A vector listing the
+// FeatureList keys which may be missing from the SequenceExamples.  If the
+// associated FeatureList is missing, it is treated as empty.  By default,
+// any FeatureList not listed in this vector must exist in the SequenceExamples.
+//	context_sparse_keys: A list of Ncontext_sparse string Tensors (scalars).
+// The keys expected in the Examples' features associated with context_sparse
+// values.
+//	context_dense_keys: A list of Ncontext_dense string Tensors (scalars).
+// The keys expected in the SequenceExamples' context features associated with
+// dense values.
+//	feature_list_sparse_keys: A list of Nfeature_list_sparse string Tensors
+// (scalars).  The keys expected in the FeatureLists associated with sparse
+// values.
+//	feature_list_dense_keys: A list of Nfeature_list_dense string Tensors (scalars).
+// The keys expected in the SequenceExamples' feature_lists associated
+// with lists of dense values.
+func ParseSequenceExample(scope *Scope, serialized tf.Output, debug_name tf.Output, context_dense_defaults []tf.Output, feature_list_dense_missing_assumed_empty []string, context_sparse_keys []string, context_dense_keys []string, feature_list_sparse_keys []string, feature_list_dense_keys []string, optional ...ParseSequenceExampleAttr) (context_sparse_indices []tf.Output, context_sparse_values []tf.Output, context_sparse_shapes []tf.Output, context_dense_values []tf.Output, feature_list_sparse_indices []tf.Output, feature_list_sparse_values []tf.Output, feature_list_sparse_shapes []tf.Output, feature_list_dense_values []tf.Output, feature_list_dense_lengths []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"feature_list_dense_missing_assumed_empty": feature_list_dense_missing_assumed_empty, "context_sparse_keys": context_sparse_keys, "context_dense_keys": context_dense_keys, "feature_list_sparse_keys": feature_list_sparse_keys, "feature_list_dense_keys": feature_list_dense_keys}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SampleDistortedBoundingBox",
+		Type: "ParseSequenceExample",
 		Input: []tf.Input{
-			image_size, bounding_boxes,
+			serialized, debug_name, tf.OutputList(context_dense_defaults),
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Computes sigmoid of `x` element-wise.
-//
-// Specifically, `y = 1 / (1 + exp(-x))`.
-func Sigmoid(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "Sigmoid",
-		Input: []tf.Input{
-			x,
-		},
+	var idx int
+	var err error
+	if context_sparse_indices, idx, err = makeOutputList(op, idx, "context_sparse_indices"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// FusedBatchNormAttr is an optional argument to FusedBatchNorm.
-type FusedBatchNormAttr func(optionalAttr)
-
-// FusedBatchNormEpsilon sets the optional epsilon attribute to value.
-//
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormEpsilon(value float32) FusedBatchNormAttr {
-	return func(m optionalAttr) {
-		m["epsilon"] = value
+	if context_sparse_values, idx, err = makeOutputList(op, idx, "context_sparse_values"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
+	}
+	if context_sparse_shapes, idx, err = makeOutputList(op, idx, "context_sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
+	}
+	if context_dense_values, idx, err = makeOutputList(op, idx, "context_dense_values"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
+	}
+	if feature_list_sparse_indices, idx, err = makeOutputList(op, idx, "feature_list_sparse_indices"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
+	}
+	if feature_list_sparse_values, idx, err = makeOutputList(op, idx, "feature_list_sparse_values"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
+	}
+	if feature_list_sparse_shapes, idx, err = makeOutputList(op, idx, "feature_list_sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
+	}
+	if feature_list_dense_values, idx, err = makeOutputList(op, idx, "feature_list_dense_values"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
+	}
+	if feature_list_dense_lengths, idx, err = makeOutputList(op, idx, "feature_list_dense_lengths"); err != nil {
+		scope.UpdateErr("ParseSequenceExample", err)
+		return
 	}
+	return context_sparse_indices, context_sparse_values, context_sparse_shapes, context_dense_values, feature_list_sparse_indices, feature_list_sparse_values, feature_list_sparse_shapes, feature_list_dense_values, feature_list_dense_lengths
 }
 
-// FusedBatchNormDataFormat sets the optional data_format attribute to value.
+// LoadTPUEmbeddingADAMParametersAttr is an optional argument to LoadTPUEmbeddingADAMParameters.
+type LoadTPUEmbeddingADAMParametersAttr func(optionalAttr)
+
+// LoadTPUEmbeddingADAMParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormDataFormat(value string) FusedBatchNormAttr {
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingADAMParametersTableId(value int64) LoadTPUEmbeddingADAMParametersAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["table_id"] = value
 	}
 }
 
-// FusedBatchNormIsTraining sets the optional is_training attribute to value.
-//
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormIsTraining(value bool) FusedBatchNormAttr {
+// LoadTPUEmbeddingADAMParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingADAMParametersTableName(value string) LoadTPUEmbeddingADAMParametersAttr {
 	return func(m optionalAttr) {
-		m["is_training"] = value
+		m["table_name"] = value
 	}
 }
 
-// Batch normalization.
+// Load ADAM embedding parameters.
 //
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
 //
 // Arguments:
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	offset: A 1D Tensor for offset, to shift to the normalized x.
-//	mean: A 1D Tensor for population mean. Used for inference only;
-// must be empty for training.
-//	variance: A 1D Tensor for population variance. Used for inference only;
-// must be empty for training.
+//	parameters: Value of parameters used in the ADAM optimization algorithm.
+//	momenta: Value of momenta used in the ADAM optimization algorithm.
+//	velocities: Value of velocities used in the ADAM optimization algorithm.
 //
-// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
-// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
-// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
-// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
-// in the cuDNN case), to be reused in the gradient computation.
-func FusedBatchNorm(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormAttr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingADAMParameters(scope *Scope, parameters tf.Output, momenta tf.Output, velocities tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingADAMParametersAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FusedBatchNorm",
+		Type: "LoadTPUEmbeddingADAMParameters",
 		Input: []tf.Input{
-			x, scale, offset, mean, variance,
+			parameters, momenta, velocities,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return scope.AddOperation(opspec)
 }
 
-// RandomStandardNormalAttr is an optional argument to RandomStandardNormal.
-type RandomStandardNormalAttr func(optionalAttr)
-
-// RandomStandardNormalSeed sets the optional seed attribute to value.
+// Inverse 2D real-valued fast Fourier transform.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomStandardNormalSeed(value int64) RandomStandardNormalAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// RandomStandardNormalSeed2 sets the optional seed2 attribute to value.
+// Computes the inverse 2-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most 2 dimensions of `input`.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomStandardNormalSeed2(value int64) RandomStandardNormalAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Outputs random values from a normal distribution.
+// The inner-most 2 dimensions of `input` are assumed to be the result of `RFFT2D`:
+// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
+// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
+// from the size of the inner-most 2 dimensions of `input`. If the FFT length used
+// to compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
 //
-// The generated values will have mean 0 and standard deviation 1.
+// Along each axis `IRFFT2D` is computed on, if `fft_length` (or
+// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
+//	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
 //
-// Returns A tensor of the specified shape filled with random normal values.
-func RandomStandardNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomStandardNormalAttr) (output tf.Output) {
+// Returns A float32 tensor of the same rank as `input`. The inner-most 2
+//   dimensions of `input` are replaced with the `fft_length` samples of their
+//   inverse 2D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.irfft2
+// @end_compatibility
+func IRFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "RandomStandardNormal",
+		Type: "IRFFT2D",
 		Input: []tf.Input{
-			shape,
+			input, fft_length,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// FusedResizeAndPadConv2DAttr is an optional argument to FusedResizeAndPadConv2D.
-type FusedResizeAndPadConv2DAttr func(optionalAttr)
+// InfeedEnqueueTupleAttr is an optional argument to InfeedEnqueueTuple.
+type InfeedEnqueueTupleAttr func(optionalAttr)
 
-// FusedResizeAndPadConv2DResizeAlignCorners sets the optional resize_align_corners attribute to value.
+// InfeedEnqueueTupleLayouts sets the optional layouts attribute to value.
 //
-// value: If true, the centers of the 4 corner pixels of the input and output tensors are
-// aligned, preserving the values at the corner pixels. Defaults to false.
-// If not specified, defaults to false
-func FusedResizeAndPadConv2DResizeAlignCorners(value bool) FusedResizeAndPadConv2DAttr {
+// value: A vector holding the requested layout in minor-to-major sequence for
+// all the tuple shapes, in the order the shapes appear in the "shapes" input.
+// The layout elements for a sub-shape can be set to -1, in which case the
+// corresponding layout will be computed by the infeed operation.
+// If not specified, defaults to <>
+func InfeedEnqueueTupleLayouts(value []int64) InfeedEnqueueTupleAttr {
 	return func(m optionalAttr) {
-		m["resize_align_corners"] = value
+		m["layouts"] = value
 	}
 }
 
-// Performs a resize and padding as a preprocess during a convolution.
+// InfeedEnqueueTupleDeviceOrdinal sets the optional device_ordinal attribute to value.
 //
-// It's often possible to do spatial transformations more efficiently as part of
-// the packing stage of a convolution, so this op allows for an optimized
-// implementation where these stages are fused together. This prevents the need to
-// write out the intermediate results as whole tensors, reducing memory pressure,
-// and we can get some latency gains by merging the transformation calculations.
-// The data_format attribute for Conv2D isn't supported by this op, and defaults to
-// 'NHWC' order.
-// Internally this op uses a single per-graph scratch buffer, which means that it
-// will block if multiple versions are being run in parallel. This is because this
-// operator is primarily an optimization to minimize memory usage.
+// value: The TPU device to use. This should be -1 when the Op
+// is running on a TPU device, and >= 0 when the Op is running on the CPU
+// device.
+// If not specified, defaults to -1
+func InfeedEnqueueTupleDeviceOrdinal(value int64) InfeedEnqueueTupleAttr {
+	return func(m optionalAttr) {
+		m["device_ordinal"] = value
+	}
+}
+
+// Feeds multiple Tensor values into the computation as an XLA tuple.
 //
 // Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
-//	size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
-//	paddings: A two-column matrix specifying the padding sizes. The number of
-// rows must be the same as the rank of `input`.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.
+//	inputs: A list of tensors that will be provided using the infeed mechanism.
+//	shapes: The shapes of each tensor in `inputs`.
 //
-//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
-// of `input`. Must be in the same order as the dimension specified with format.
-//	padding: The type of padding algorithm to use.
-func FusedResizeAndPadConv2D(scope *Scope, input tf.Output, size tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string, optional ...FusedResizeAndPadConv2DAttr) (output tf.Output) {
+// Returns the created operation.
+func InfeedEnqueueTuple(scope *Scope, inputs []tf.Output, shapes []tf.Shape, optional ...InfeedEnqueueTupleAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"shapes": shapes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FusedResizeAndPadConv2D",
+		Type: "InfeedEnqueueTuple",
 		Input: []tf.Input{
-			input, size, paddings, filter,
+			tf.OutputList(inputs),
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
+}
+
+// Returns which elements of x are finite.
+//
+// @compatibility(numpy)
+// Equivalent to np.isfinite
+// @end_compatibility
+func IsFinite(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IsFinite",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceStridedSliceAssignAttr is an optional argument to ResourceStridedSliceAssign.
+type ResourceStridedSliceAssignAttr func(optionalAttr)
+
+// ResourceStridedSliceAssignBeginMask sets the optional begin_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignBeginMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["begin_mask"] = value
+	}
+}
+
+// ResourceStridedSliceAssignEndMask sets the optional end_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignEndMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["end_mask"] = value
+	}
+}
+
+// ResourceStridedSliceAssignEllipsisMask sets the optional ellipsis_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignEllipsisMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["ellipsis_mask"] = value
+	}
 }
 
-// RandomUniformAttr is an optional argument to RandomUniform.
-type RandomUniformAttr func(optionalAttr)
-
-// RandomUniformSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
+// ResourceStridedSliceAssignNewAxisMask sets the optional new_axis_mask attribute to value.
 // If not specified, defaults to 0
-func RandomUniformSeed(value int64) RandomUniformAttr {
+func ResourceStridedSliceAssignNewAxisMask(value int64) ResourceStridedSliceAssignAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["new_axis_mask"] = value
 	}
 }
 
-// RandomUniformSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
+// ResourceStridedSliceAssignShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
 // If not specified, defaults to 0
-func RandomUniformSeed2(value int64) RandomUniformAttr {
+func ResourceStridedSliceAssignShrinkAxisMask(value int64) ResourceStridedSliceAssignAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["shrink_axis_mask"] = value
 	}
 }
 
-// Outputs random values from a uniform distribution.
+// Assign `value` to the sliced l-value reference of `ref`.
 //
-// The generated values follow a uniform distribution in the range `[0, 1)`. The
-// lower bound 0 is included in the range, while the upper bound 1 is excluded.
+// The values of `value` are assigned to the positions in the variable
+// `ref` that are selected by the slice parameters. The slice parameters
+// `begin, `end`, `strides`, etc. work exactly as in `StridedSlice`.
 //
-// Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
+// NOTE this op currently does not support broadcasting and so `value`'s
+// shape must be exactly the shape produced by the slice of `ref`.
 //
-// Returns A tensor of the specified shape filled with uniform random values.
-func RandomUniform(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomUniformAttr) (output tf.Output) {
+// Returns the created operation.
+func ResourceStridedSliceAssign(scope *Scope, ref tf.Output, begin tf.Output, end tf.Output, strides tf.Output, value tf.Output, optional ...ResourceStridedSliceAssignAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomUniform",
+		Type: "ResourceStridedSliceAssign",
 		Input: []tf.Input{
-			shape,
+			ref, begin, end, strides, value,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// ResourceApplyFtrlAttr is an optional argument to ResourceApplyFtrl.
-type ResourceApplyFtrlAttr func(optionalAttr)
+// ArgMaxAttr is an optional argument to ArgMax.
+type ArgMaxAttr func(optionalAttr)
 
-// ResourceApplyFtrlUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyFtrlUseLocking(value bool) ResourceApplyFtrlAttr {
+// ArgMaxOutputType sets the optional output_type attribute to value.
+// If not specified, defaults to DT_INT64
+func ArgMaxOutputType(value tf.DataType) ArgMaxAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["output_type"] = value
 	}
 }
 
-// Update '*var' according to the Ftrl-proximal scheme.
+// Returns the index with the largest value across dimensions of a tensor.
 //
-// accum_new = accum + grad * grad
-// linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
+// Note that in case of ties the identity of the return value is not guaranteed.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regulariation. Must be a scalar.
-//	l2: L2 regulariation. Must be a scalar.
-//	lr_power: Scaling factor. Must be a scalar.
 //
-// Returns the created operation.
-func ResourceApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlAttr) (o *tf.Operation) {
+//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
+// Describes which dimension of the input Tensor to reduce across. For vectors,
+// use dimension = 0.
+func ArgMax(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMaxAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12057,240 +15873,286 @@ func ResourceApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyFtrl",
+		Type: "ArgMax",
 		Input: []tf.Input{
-			var_, accum, linear, grad, lr, l1, l2, lr_power,
+			input, dimension,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Locks a mutex resource.  The output is the lock.  So long as the lock tensor
-//
-// is alive, any other request to use `MutexLock` with this mutex will wait.
-//
-// This is particularly useful for creating a critical section when used in
-// conjunction with `MutexLockIdentity`:
-//
-// ```python
-//
-// mutex = mutex_v2(
-//   shared_name=handle_name, container=container, name=name)
-//
-// def execute_in_critical_section(fn, *args, **kwargs):
-//   lock = gen_resource_variable_ops.mutex_lock(mutex)
-//
-//   with ops.control_dependencies([lock]):
-//     r = fn(*args, **kwargs)
-//
-//   with ops.control_dependencies(nest.flatten(r)):
-//     with ops.colocate_with(mutex):
-//       ensure_lock_exists = mutex_lock_identity(lock)
-//
-//     # Make sure that if any element of r is accessed, all of
-//     # them are executed together.
-//     r = nest.map_structure(tf.identity, r)
-//
-//   with ops.control_dependencies([ensure_lock_exists]):
-//     return nest.map_structure(tf.identity, r)
-// ```
-//
-// While `fn` is running in the critical section, no other functions which wish to
-// use this critical section may run.
-//
-// Often the use case is that two executions of the same graph, in parallel,
-// wish to run `fn`; and we wish to ensure that only one of them executes
-// at a time.  This is especially important if `fn` modifies one or more
-// variables at a time.
-//
-// It is also useful if two separate functions must share a resource, but we
-// wish to ensure the usage is exclusive.
+// Fetches multiple values from infeed as an XLA tuple.
 //
 // Arguments:
-//	mutex: The mutex resource to lock.
+//	dtypes: The element types of each element in `outputs`.
+//	shapes: The shapes of each tensor in `outputs`.
 //
-// Returns A tensor that keeps a shared pointer to a lock on the mutex;
-// when the Tensor is destroyed, the use count on the shared pointer is decreased
-// by 1.  When it reaches 0, the lock is released.
-func MutexLock(scope *Scope, mutex tf.Output) (mutex_lock tf.Output) {
+// Returns A list of tensors that will be provided using the infeed mechanism.
+func InfeedDequeueTuple(scope *Scope, dtypes []tf.DataType, shapes []tf.Shape) (outputs []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtypes": dtypes, "shapes": shapes}
 	opspec := tf.OpSpec{
-		Type: "MutexLock",
-		Input: []tf.Input{
-			mutex,
-		},
+		Type: "InfeedDequeueTuple",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
+		scope.UpdateErr("InfeedDequeueTuple", err)
+		return
+	}
+	return outputs
 }
 
-// Transforms a serialized tensorflow.TensorProto proto into a Tensor.
+// Enqueue multiple Tensor values on the computation outfeed.
 //
 // Arguments:
-//	serialized: A scalar string containing a serialized TensorProto proto.
-//	out_type: The type of the serialized tensor.  The provided type must match the
-// type of the serialized tensor and no implicit conversion will take place.
+//	inputs: A list of tensors that will be inserted into the outfeed queue as an
+// XLA tuple.
 //
-// Returns A Tensor of type `out_type`.
-func ParseTensor(scope *Scope, serialized tf.Output, out_type tf.DataType) (output tf.Output) {
+// Returns the created operation.
+func OutfeedEnqueueTuple(scope *Scope, inputs []tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "ParseTensor",
+		Type: "OutfeedEnqueueTuple",
 		Input: []tf.Input{
-			serialized,
+			tf.OutputList(inputs),
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// MaxPoolWithArgmaxAttr is an optional argument to MaxPoolWithArgmax.
-type MaxPoolWithArgmaxAttr func(optionalAttr)
+// ResourceApplyAdagradAttr is an optional argument to ResourceApplyAdagrad.
+type ResourceApplyAdagradAttr func(optionalAttr)
 
-// MaxPoolWithArgmaxTargmax sets the optional Targmax attribute to value.
-// If not specified, defaults to DT_INT64
-func MaxPoolWithArgmaxTargmax(value tf.DataType) MaxPoolWithArgmaxAttr {
+// ResourceApplyAdagradUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyAdagradUseLocking(value bool) ResourceApplyAdagradAttr {
 	return func(m optionalAttr) {
-		m["Targmax"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Performs max pooling on the input and outputs both max values and indices.
-//
-// The indices in `argmax` are flattened, so that a maximum value at position
-// `[b, y, x, c]` becomes flattened index
-// `((b * height + y) * width + x) * channels + c`.
+// ResourceApplyAdagradUpdateSlots sets the optional update_slots attribute to value.
+// If not specified, defaults to true
+func ResourceApplyAdagradUpdateSlots(value bool) ResourceApplyAdagradAttr {
+	return func(m optionalAttr) {
+		m["update_slots"] = value
+	}
+}
+
+// Update '*var' according to the adagrad scheme.
 //
-// The indices returned are always in `[0, height) x [0, width)` before flattening,
-// even if padding is involved and the mathematically correct answer is outside
-// (either negative or too large).  This is a bug, but fixing it is difficult to do
-// in a safe backwards compatible way, especially due to flattening.
+// accum += grad * grad
+// var -= lr * grad * (1 / sqrt(accum))
 //
 // Arguments:
-//	input: 4-D with shape `[batch, height, width, channels]`.  Input to pool over.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	grad: The gradient.
 //
-// Returns The max pooled output tensor.4-D.  The flattened indices of the max values chosen for each output.
-func MaxPoolWithArgmax(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolWithArgmaxAttr) (output tf.Output, argmax tf.Output) {
+// Returns the created operation.
+func ResourceApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, optional ...ResourceApplyAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolWithArgmax",
+		Type: "ResourceApplyAdagrad",
 		Input: []tf.Input{
-			input,
+			var_, accum, lr, grad,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return scope.AddOperation(opspec)
 }
 
-// Creates a TensorList which, when stacked, has the value of `tensor`.
+// CudnnRNNV3Attr is an optional argument to CudnnRNNV3.
+type CudnnRNNV3Attr func(optionalAttr)
+
+// CudnnRNNV3RnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNV3RnnMode(value string) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
+	}
+}
+
+// CudnnRNNV3InputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNV3InputMode(value string) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
+	}
+}
+
+// CudnnRNNV3Direction sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNV3Direction(value string) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["direction"] = value
+	}
+}
+
+// CudnnRNNV3Dropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNV3Dropout(value float32) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["dropout"] = value
+	}
+}
+
+// CudnnRNNV3Seed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNV3Seed(value int64) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// CudnnRNNV3Seed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNV3Seed2(value int64) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// CudnnRNNV3IsTraining sets the optional is_training attribute to value.
+// If not specified, defaults to true
+func CudnnRNNV3IsTraining(value bool) CudnnRNNV3Attr {
+	return func(m optionalAttr) {
+		m["is_training"] = value
+	}
+}
+
+// A RNN backed by cuDNN.
 //
-// Each tensor in the result list corresponds to one row of the input tensor.
+// Computes the RNN from the input and initial states, with respect to the params
+// buffer. Accepts one extra input "sequence_lengths" than CudnnRNN.
 //
-// tensor: The input tensor.
-// output_handle: The list.
-func TensorListFromTensor(scope *Scope, tensor tf.Output, element_shape tf.Output) (output_handle tf.Output) {
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicates whether there is a linear projection between the input and
+//   the actual computation before the first layer. 'skip_input' is only allowed
+//   when input_size == num_units; 'auto_select' implies 'skip_input' when
+//   input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used. Should be
+//   "unidirectional" or "bidirectional".
+// dropout: Dropout probability. When set to 0., dropout is disabled.
+// seed: The 1st part of a seed to initialize dropout.
+// seed2: The 2nd part of a seed to initialize dropout.
+// input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
+// input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
+//     num_units].
+// input_c: For LSTM, a 3-D tensor with the shape of
+//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
+// params: A 1-D tensor that contains the weights and biases in an opaque layout.
+//     The size must be created through CudnnRNNParamsSize, and initialized
+//     separately. Note that they might not be compatible across different
+//     generations. So it is a good idea to save and restore
+// sequence_lengths: a vector of lengths of each input sequence.
+// output: A 3-D tensor with the shape of [seq_length, batch_size,
+//     dir * num_units].
+// output_h: The same shape has input_h.
+// output_c: The same shape as input_c for LSTM. An empty tensor for other models.
+// is_training: Indicates whether this operation is used for inferenece or
+//   training.
+// reserve_space: An opaque tensor that can be used in backprop calculation. It
+//   is only produced if is_training is true.
+func CudnnRNNV3(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, sequence_lengths tf.Output, optional ...CudnnRNNV3Attr) (output tf.Output, output_h tf.Output, output_c tf.Output, reserve_space tf.Output, host_reserved tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TensorListFromTensor",
+		Type: "CudnnRNNV3",
 		Input: []tf.Input{
-			tensor, element_shape,
+			input, input_h, input_c, params, sequence_lengths,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// Assigns sparse updates to the variable referenced by `resource`.
+// Applies softmax to a batched N-D `SparseTensor`.
 //
-// This operation computes
+// The inputs represent an N-D SparseTensor  with logical shape `[..., B, C]`
+// (where `N >= 2`), and with indices sorted in the canonical lexicographic order.
 //
-//     # Scalar indices
-//     ref[indices, ...] = updates[...]
+// This op is equivalent to applying the normal `tf.nn.softmax()` to each innermost
+// logical submatrix with shape `[B, C]`, but with the catch that *the implicitly
+// zero elements do not participate*.  Specifically, the algorithm is equivalent
+// to the following:
 //
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] = updates[i, ...]
+//   (1) Applies `tf.nn.softmax()` to a densified view of each innermost submatrix
+//       with shape `[B, C]`, along the size-C dimension;
+//   (2) Masks out the original implicitly-zero locations;
+//   (3) Renormalizes the remaining elements.
 //
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] = updates[i, ..., j, ...]
+// Hence, the `SparseTensor` result has exactly the same non-zero indices and
+// shape.
 //
 // Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
+//	sp_indices: 2-D.  `NNZ x R` matrix with the indices of non-empty values in a
+// SparseTensor, in canonical ordering.
+//	sp_values: 1-D.  `NNZ` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
 //
-// Returns the created operation.
-func ResourceScatterUpdate(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+// Returns 1-D.  The `NNZ` values for the result `SparseTensor`.
+func SparseSoftmax(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterUpdate",
+		Type: "SparseSoftmax",
 		Input: []tf.Input{
-			resource, indices, updates,
+			sp_indices, sp_values, sp_shape,
 		},
 	}
-	return scope.AddOperation(opspec)
-}
-
-// MaxPoolAttr is an optional argument to MaxPool.
-type MaxPoolAttr func(optionalAttr)
-
-// MaxPoolDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolDataFormat(value string) MaxPoolAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Performs max pooling on the input.
+// Creates a Tensor by indexing into the TensorList.
 //
-// Arguments:
-//	input: 4-D input to pool over.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+// Each row in the produced Tensor corresponds to the element in the TensorList
+// specified by the given index (see `tf.gather`).
 //
-// Returns The max pooled output tensor.
-func MaxPool(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolAttr) (output tf.Output) {
+// input_handle: The input tensor list.
+// indices: The indices used to index into the list.
+// values: The tensor.
+func TensorListGather(scope *Scope, input_handle tf.Output, indices tf.Output, element_shape tf.Output, element_dtype tf.DataType) (values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
 	opspec := tf.OpSpec{
-		Type: "MaxPool",
+		Type: "TensorListGather",
 		Input: []tf.Input{
-			input,
+			input_handle, indices, element_shape,
 		},
 		Attrs: attrs,
 	}
@@ -12298,128 +16160,161 @@ func MaxPool(scope *Scope, input tf.Output, ksize []int64, strides []int64, padd
 	return op.Output(0)
 }
 
-// Multiplies sparse updates into the variable referenced by `resource`.
-//
-// This operation computes
+// FixedLengthRecordReaderV2Attr is an optional argument to FixedLengthRecordReaderV2.
+type FixedLengthRecordReaderV2Attr func(optionalAttr)
+
+// FixedLengthRecordReaderV2HeaderBytes sets the optional header_bytes attribute to value.
 //
-//     # Scalar indices
-//     ref[indices, ...] *= updates[...]
+// value: Number of bytes in the header, defaults to 0.
+// If not specified, defaults to 0
+func FixedLengthRecordReaderV2HeaderBytes(value int64) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["header_bytes"] = value
+	}
+}
+
+// FixedLengthRecordReaderV2FooterBytes sets the optional footer_bytes attribute to value.
 //
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] *= updates[i, ...]
+// value: Number of bytes in the footer, defaults to 0.
+// If not specified, defaults to 0
+func FixedLengthRecordReaderV2FooterBytes(value int64) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["footer_bytes"] = value
+	}
+}
+
+// FixedLengthRecordReaderV2HopBytes sets the optional hop_bytes attribute to value.
 //
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] *= updates[i, ..., j, ...]
+// value: Number of bytes to hop before each read. Default of 0 means using
+// record_bytes.
+// If not specified, defaults to 0
+func FixedLengthRecordReaderV2HopBytes(value int64) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["hop_bytes"] = value
+	}
+}
+
+// FixedLengthRecordReaderV2Container sets the optional container attribute to value.
 //
-// Duplicate entries are handled correctly: if multiple `indices` reference
-// the same location, their contributions multiply.
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func FixedLengthRecordReaderV2Container(value string) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// FixedLengthRecordReaderV2SharedName sets the optional shared_name attribute to value.
 //
-// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func FixedLengthRecordReaderV2SharedName(value string) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// FixedLengthRecordReaderV2Encoding sets the optional encoding attribute to value.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
-// </div>
+// value: The type of encoding for the file. Currently ZLIB and GZIP
+// are supported. Defaults to none.
+// If not specified, defaults to ""
+func FixedLengthRecordReaderV2Encoding(value string) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["encoding"] = value
+	}
+}
+
+// A Reader that outputs fixed-length records from a file.
 //
 // Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
+//	record_bytes: Number of bytes in the record.
 //
-// Returns the created operation.
-func ResourceScatterMul(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+// Returns The handle to reference the Reader.
+func FixedLengthRecordReaderV2(scope *Scope, record_bytes int64, optional ...FixedLengthRecordReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "ResourceScatterMul",
-		Input: []tf.Input{
-			resource, indices, updates,
-		},
+	attrs := map[string]interface{}{"record_bytes": record_bytes}
+	for _, a := range optional {
+		a(attrs)
 	}
-	return scope.AddOperation(opspec)
-}
-
-// Subtracts sparse updates from the variable referenced by `resource`.
-//
-// This operation computes
-//
-//     # Scalar indices
-//     ref[indices, ...] -= updates[...]
-//
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] -= updates[i, ...]
-//
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] -= updates[i, ..., j, ...]
-//
-// Duplicate entries are handled correctly: if multiple `indices` reference
-// the same location, their contributions add.
-//
-// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
-// </div>
+	opspec := tf.OpSpec{
+		Type: "FixedLengthRecordReaderV2",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// CompilationResultProto indicating the status of the TPU compilation.
+func TPUCompilationResult(scope *Scope) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TPUCompilationResult",
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Retrieves the tree ensemble resource stamp token, number of trees and growing statistics.
 //
 // Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
+//	tree_ensemble_handle: Handle to the tree ensemble.
 //
-// Returns the created operation.
-func ResourceScatterSub(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+// Returns Stamp token of the tree ensemble resource.The number of trees in the tree ensemble resource.The number of trees that were finished successfully.The number of layers we attempted to build (but not necessarily succeeded).Rank size 2 tensor that contains start and end ids of the nodes in the latest
+// layer.
+func BoostedTreesGetEnsembleStates(scope *Scope, tree_ensemble_handle tf.Output) (stamp_token tf.Output, num_trees tf.Output, num_finalized_trees tf.Output, num_attempted_layers tf.Output, last_layer_nodes_range tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterSub",
+		Type: "BoostedTreesGetEnsembleStates",
 		Input: []tf.Input{
-			resource, indices, updates,
+			tree_ensemble_handle,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// ResourceSparseApplyFtrlV2Attr is an optional argument to ResourceSparseApplyFtrlV2.
-type ResourceSparseApplyFtrlV2Attr func(optionalAttr)
+// ResourceApplyPowerSignAttr is an optional argument to ResourceApplyPowerSign.
+type ResourceApplyPowerSignAttr func(optionalAttr)
 
-// ResourceSparseApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
+// ResourceApplyPowerSignUseLocking sets the optional use_locking attribute to value.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
+// value: If `True`, updating of the var and m tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
 // contention.
 // If not specified, defaults to false
-func ResourceSparseApplyFtrlV2UseLocking(value bool) ResourceSparseApplyFtrlV2Attr {
+func ResourceApplyPowerSignUseLocking(value bool) ResourceApplyPowerSignAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
+// Update '*var' according to the AddSign update.
 //
-// That is for rows we have grad for, we update var, accum and linear as follows:
-// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
-// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
-// linear += grad_with_shrinkage +
-//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
+// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+// update <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g
+// variable <- variable - lr_t * update
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
+//	m: Should be from a Variable().
 //	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 shrinkage regulariation. Must be a scalar.
-//
-//	lr_power: Scaling factor. Must be a scalar.
+//	logbase: Must be a scalar.
+//	sign_decay: Must be a scalar.
+//	beta: Must be a scalar.
+//	grad: The gradient.
 //
 // Returns the created operation.
-func ResourceSparseApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlV2Attr) (o *tf.Operation) {
+func ResourceApplyPowerSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, logbase tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyPowerSignAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12428,452 +16323,495 @@ func ResourceSparseApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, li
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyFtrlV2",
+		Type: "ResourceApplyPowerSign",
 		Input: []tf.Input{
-			var_, accum, linear, grad, indices, lr, l1, l2, l2_shrinkage, lr_power,
+			var_, m, lr, logbase, sign_decay, beta, grad,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Calculates gains for each feature and returns the best possible split information for the feature.
+// Deprecated. Use TensorArraySplitV3
 //
-// The split information is the best threshold (bucket id), gains and left/right node contributions per node for each feature.
+// DEPRECATED at GraphDef version 26: Use TensorArraySplitV3
+func TensorArraySplitV2(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorArraySplitV2",
+		Input: []tf.Input{
+			handle, value, lengths, flow_in,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Reshapes a SparseTensor to represent values in a new dense shape.
 //
-// It is possible that not all nodes can be split on each feature. Hence, the list of possible nodes can differ between the features. Therefore, we return `node_ids_list` for each feature, containing the list of nodes that this feature can be used to split.
+// This operation has the same semantics as reshape on the represented dense
+// tensor.  The `input_indices` are recomputed based on the requested `new_shape`.
 //
-// In this manner, the output is the best split per features and per node, so that it needs to be combined later to produce the best split for each node (among all possible features).
+// If one component of `new_shape` is the special value -1, the size of that
+// dimension is computed so that the total dense size remains constant.  At
+// most one component of `new_shape` can be -1.  The number of dense elements
+// implied by `new_shape` must be the same as the number of dense elements
+// originally implied by `input_shape`.
 //
-// The length of output lists are all of the same length, `num_features`.
-// The output shapes are compatible in a way that the first dimension of all tensors of all lists are the same and equal to the number of possible split nodes for each feature.
+// Reshaping does not affect the order of values in the SparseTensor.
+//
+// If the input tensor has rank `R_in` and `N` non-empty values, and `new_shape`
+// has length `R_out`, then `input_indices` has shape `[N, R_in]`,
+// `input_shape` has length `R_in`, `output_indices` has shape `[N, R_out]`, and
+// `output_shape` has length `R_out`.
 //
 // Arguments:
-//	node_id_range: A Rank 1 tensor (shape=[2]) to specify the range [first, last) of node ids to process within `stats_summary_list`. The nodes are iterated between the two nodes specified by the tensor, as like `for node_id in range(node_id_range[0], node_id_range[1])` (Note that the last index node_id_range[1] is exclusive).
-//	stats_summary_list: A list of Rank 3 tensor (#shape=[max_splits, bucket, 2]) for accumulated stats summary (gradient/hessian) per node per buckets for each feature. The first dimension of the tensor is the maximum number of splits, and thus not all elements of it will be used, but only the indexes specified by node_ids will be used.
-//	l1: l1 regularization factor on leaf weights, per instance based.
-//	l2: l2 regularization factor on leaf weights, per instance based.
-//	tree_complexity: adjustment to the gain, per leaf based.
-//	min_node_weight: mininum avg of hessians in a node before required for the node to be considered for splitting.
-//	max_splits: the number of nodes that can be split in the whole tree. Used as a dimension of output tensors.
+//	input_indices: 2-D.  `N x R_in` matrix with the indices of non-empty values in a
+// SparseTensor.
+//	input_shape: 1-D.  `R_in` vector with the input SparseTensor's dense shape.
+//	new_shape: 1-D.  `R_out` vector with the requested new dense shape.
 //
-// Returns An output list of Rank 1 tensors indicating possible split node ids for each feature. The length of the list is num_features, but each tensor has different size as each feature provides different possible nodes. See above for details like shapes and sizes.An output list of Rank 1 tensors indicating the best gains for each feature to split for certain nodes. See above for details like shapes and sizes.An output list of Rank 1 tensors indicating the bucket id to compare with (as a threshold) for split in each node. See above for details like shapes and sizes.A list of Rank 2 tensors indicating the contribution of the left nodes when branching from parent nodes (given by the tensor element in the output node_ids_list) to the left direction by the given threshold for each feature. This value will be used to make the left node value by adding to the parent node value. Second dimension size is 1 for 1-dimensional logits, but would be larger for multi-class problems. See above for details like shapes and sizes.A list of Rank 2 tensors, with the same shape/conditions as left_node_contribs_list, but just that the value is for the right node.
-func BoostedTreesCalculateBestGainsPerFeature(scope *Scope, node_id_range tf.Output, stats_summary_list []tf.Output, l1 tf.Output, l2 tf.Output, tree_complexity tf.Output, min_node_weight tf.Output, max_splits int64) (node_ids_list []tf.Output, gains_list []tf.Output, thresholds_list []tf.Output, left_node_contribs_list []tf.Output, right_node_contribs_list []tf.Output) {
+// Returns 2-D.  `N x R_out` matrix with the updated indices of non-empty
+// values in the output SparseTensor.1-D.  `R_out` vector with the full dense shape of the output
+// SparseTensor.  This is the same as `new_shape` but with any -1 dimensions
+// filled in.
+func SparseReshape(scope *Scope, input_indices tf.Output, input_shape tf.Output, new_shape tf.Output) (output_indices tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"max_splits": max_splits}
 	opspec := tf.OpSpec{
-		Type: "BoostedTreesCalculateBestGainsPerFeature",
+		Type: "SparseReshape",
 		Input: []tf.Input{
-			node_id_range, tf.OutputList(stats_summary_list), l1, l2, tree_complexity, min_node_weight,
+			input_indices, input_shape, new_shape,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if node_ids_list, idx, err = makeOutputList(op, idx, "node_ids_list"); err != nil {
-		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
-		return
-	}
-	if gains_list, idx, err = makeOutputList(op, idx, "gains_list"); err != nil {
-		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
-		return
-	}
-	if thresholds_list, idx, err = makeOutputList(op, idx, "thresholds_list"); err != nil {
-		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
-		return
-	}
-	if left_node_contribs_list, idx, err = makeOutputList(op, idx, "left_node_contribs_list"); err != nil {
-		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
-		return
-	}
-	if right_node_contribs_list, idx, err = makeOutputList(op, idx, "right_node_contribs_list"); err != nil {
-		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
-		return
-	}
-	return node_ids_list, gains_list, thresholds_list, left_node_contribs_list, right_node_contribs_list
+	return op.Output(0), op.Output(1)
 }
 
-// EncodePngAttr is an optional argument to EncodePng.
-type EncodePngAttr func(optionalAttr)
-
-// EncodePngCompression sets the optional compression attribute to value.
+// Computes the product along segments of a tensor.
 //
-// value: Compression level.
-// If not specified, defaults to -1
-func EncodePngCompression(value int64) EncodePngAttr {
-	return func(m optionalAttr) {
-		m["compression"] = value
-	}
-}
-
-// PNG-encode an image.
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
+// for an explanation of segments.
 //
-// `image` is a 3-D uint8 or uint16 Tensor of shape `[height, width, channels]`
-// where `channels` is:
+// Computes a tensor such that
+// \\(output_i = \prod_j data_j\\) where the product is over `j` such
+// that `segment_ids[j] == i`.
 //
-// *   1: for grayscale.
-// *   2: for grayscale + alpha.
-// *   3: for RGB.
-// *   4: for RGBA.
+// If the product is empty for a given segment ID `i`, `output[i] = 1`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentProd.png" alt>
+// </div>
+//
+// For example:
+//
+// ```
+// c = tf.constant([[1,2,3,4], [4, 3, 2, 1], [5,6,7,8]])
+// tf.segment_prod(c, tf.constant([0, 0, 1]))
+// # ==> [[4, 6, 6, 4],
+// #      [5, 6, 7, 8]]
+// ```
 //
-// The ZLIB compression level, `compression`, can be -1 for the PNG-encoder
-// default or a value from 0 to 9.  9 is the highest compression level, generating
-// the smallest output, but is slower.
 //
 // Arguments:
-//	image: 3-D with shape `[height, width, channels]`.
 //
-// Returns 0-D. PNG-encoded image.
-func EncodePng(scope *Scope, image tf.Output, optional ...EncodePngAttr) (contents tf.Output) {
+//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "EncodePng",
+		Type: "SegmentProd",
 		Input: []tf.Input{
-			image,
+			data, segment_ids,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DataFormatVecPermuteAttr is an optional argument to DataFormatVecPermute.
-type DataFormatVecPermuteAttr func(optionalAttr)
+// RetrieveTPUEmbeddingFTRLParametersAttr is an optional argument to RetrieveTPUEmbeddingFTRLParameters.
+type RetrieveTPUEmbeddingFTRLParametersAttr func(optionalAttr)
 
-// DataFormatVecPermuteSrcFormat sets the optional src_format attribute to value.
+// RetrieveTPUEmbeddingFTRLParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// value: source data format.
-// If not specified, defaults to "NHWC"
-func DataFormatVecPermuteSrcFormat(value string) DataFormatVecPermuteAttr {
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingFTRLParametersTableId(value int64) RetrieveTPUEmbeddingFTRLParametersAttr {
 	return func(m optionalAttr) {
-		m["src_format"] = value
+		m["table_id"] = value
 	}
 }
 
-// DataFormatVecPermuteDstFormat sets the optional dst_format attribute to value.
-//
-// value: destination data format.
-// If not specified, defaults to "NCHW"
-func DataFormatVecPermuteDstFormat(value string) DataFormatVecPermuteAttr {
+// RetrieveTPUEmbeddingFTRLParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingFTRLParametersTableName(value string) RetrieveTPUEmbeddingFTRLParametersAttr {
 	return func(m optionalAttr) {
-		m["dst_format"] = value
+		m["table_name"] = value
 	}
 }
 
-// Returns the permuted vector/tensor in the destination data format given the
-//
-// one in the source data format.
+// Retrieve FTRL embedding parameters.
 //
-// Arguments:
-//	x: Vector of size 4 or Tensor of shape (4, 2) in source data format.
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
 //
-// Returns Vector of size 4 or Tensor of shape (4, 2) in destination data format.
-func DataFormatVecPermute(scope *Scope, x tf.Output, optional ...DataFormatVecPermuteAttr) (y tf.Output) {
+// Returns Parameter parameters updated by the FTRL optimization algorithm.Parameter accumulators updated by the FTRL optimization algorithm.Parameter linears updated by the FTRL optimization algorithm.
+func RetrieveTPUEmbeddingFTRLParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingFTRLParametersAttr) (parameters tf.Output, accumulators tf.Output, linears tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DataFormatVecPermute",
-		Input: []tf.Input{
-			x,
-		},
+		Type: "RetrieveTPUEmbeddingFTRLParameters",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Initializes the multi device iterator with the given dataset.
-//
-// Arguments:
-//	dataset: Dataset to be iterated upon.
-//	multi_device_iterator: A MultiDeviceIteratorResource.
-//	max_buffer_size: The maximum size of the host side per device buffer to keep.
-//
-// Returns An int64 indicating which incarnation of the MultiDeviceIterator
-// is running.
-func MultiDeviceIteratorInit(scope *Scope, dataset tf.Output, multi_device_iterator tf.Output, max_buffer_size tf.Output) (incarnation_id tf.Output) {
+// Connects outputs of an N-way replicated computation to N outputs.
+func TPUReplicatedOutput(scope *Scope, input tf.Output, num_replicas int64) (outputs []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_replicas": num_replicas}
 	opspec := tf.OpSpec{
-		Type: "MultiDeviceIteratorInit",
+		Type: "TPUReplicatedOutput",
 		Input: []tf.Input{
-			dataset, multi_device_iterator, max_buffer_size,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the gradient of `igamma(a, x)` wrt `a`.
-func IgammaGradA(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "IgammaGradA",
-		Input: []tf.Input{
-			a, x,
-		},
+	var idx int
+	var err error
+	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
+		scope.UpdateErr("TPUReplicatedOutput", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return outputs
 }
 
-// Converts each string in the input Tensor to its hash mod by a number of buckets.
-//
-// The hash function is deterministic on the content of the string within the
-// process.
-//
-// Note that the hash function may change from time to time.
-// This functionality will be deprecated and it's recommended to use
-// `tf.string_to_hash_bucket_fast()` or `tf.string_to_hash_bucket_strong()`.
-//
-// Arguments:
-//
-//	num_buckets: The number of buckets.
+// LoadTPUEmbeddingFTRLParametersAttr is an optional argument to LoadTPUEmbeddingFTRLParameters.
+type LoadTPUEmbeddingFTRLParametersAttr func(optionalAttr)
+
+// LoadTPUEmbeddingFTRLParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToHashBucket(scope *Scope, string_tensor tf.Output, num_buckets int64) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_buckets": num_buckets}
-	opspec := tf.OpSpec{
-		Type: "StringToHashBucket",
-		Input: []tf.Input{
-			string_tensor,
-		},
-		Attrs: attrs,
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingFTRLParametersTableId(value int64) LoadTPUEmbeddingFTRLParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// StaticRegexReplaceAttr is an optional argument to StaticRegexReplace.
-type StaticRegexReplaceAttr func(optionalAttr)
-
-// StaticRegexReplaceReplaceGlobal sets the optional replace_global attribute to value.
-//
-// value: If True, the replacement is global, otherwise the replacement
-// is done only on the first match.
-// If not specified, defaults to true
-func StaticRegexReplaceReplaceGlobal(value bool) StaticRegexReplaceAttr {
+// LoadTPUEmbeddingFTRLParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingFTRLParametersTableName(value string) LoadTPUEmbeddingFTRLParametersAttr {
 	return func(m optionalAttr) {
-		m["replace_global"] = value
+		m["table_name"] = value
 	}
 }
 
-// Replaces the match of pattern in input with rewrite.
+// Load FTRL embedding parameters.
 //
-// It follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
 //
 // Arguments:
-//	input: The text to be processed.
-//	pattern: The regular expression to match the input.
-//	rewrite: The rewrite to be applied to the matched expresion.
+//	parameters: Value of parameters used in the FTRL optimization algorithm.
+//	accumulators: Value of accumulators used in the FTRL optimization algorithm.
+//	linears: Value of linears used in the FTRL optimization algorithm.
 //
-// Returns The text after applying pattern and rewrite.
-func StaticRegexReplace(scope *Scope, input tf.Output, pattern string, rewrite string, optional ...StaticRegexReplaceAttr) (output tf.Output) {
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumulators tf.Output, linears tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingFTRLParametersAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"pattern": pattern, "rewrite": rewrite}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StaticRegexReplace",
+		Type: "LoadTPUEmbeddingFTRLParameters",
 		Input: []tf.Input{
-			input,
+			parameters, accumulators, linears,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes gradients for the exponential linear (Elu) operation.
-//
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding Elu operation.
-//	outputs: The outputs of the corresponding Elu operation.
+// Returns (x - y)(x - y) element-wise.
 //
-// Returns The gradients: `gradients * (outputs + 1)` if outputs < 0,
-// `gradients` otherwise.
-func EluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
+// *NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func SquaredDifference(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "EluGrad",
+		Type: "SquaredDifference",
 		Input: []tf.Input{
-			gradients, outputs,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that contains `count` elements from the `input_dataset`.
+// Push an element onto the tensor_array.
 //
 // Arguments:
+//	handle: The handle to a TensorArray.
+//	index: The position to write to inside the TensorArray.
+//	value: The tensor to write to the TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
 //
-//	count: A scalar representing the number of elements from the `input_dataset`
-// that should be taken. A value of `-1` indicates that all of `input_dataset`
-// is taken.
-//
-//
-func TakeDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns A float scalar that enforces proper chaining of operations.
+func TensorArrayWriteV3(scope *Scope, handle tf.Output, index tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "TakeDataset",
+		Type: "TensorArrayWriteV3",
 		Input: []tf.Input{
-			input_dataset, count,
+			handle, index, value, flow_in,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Reads the value of a variable.
+// RetrieveTPUEmbeddingAdagradParametersAttr is an optional argument to RetrieveTPUEmbeddingAdagradParameters.
+type RetrieveTPUEmbeddingAdagradParametersAttr func(optionalAttr)
+
+// RetrieveTPUEmbeddingAdagradParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// The tensor returned by this operation is immutable.
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingAdagradParametersTableId(value int64) RetrieveTPUEmbeddingAdagradParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// RetrieveTPUEmbeddingAdagradParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingAdagradParametersTableName(value string) RetrieveTPUEmbeddingAdagradParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Retrieve Adagrad embedding parameters.
 //
-// The value returned by this operation is guaranteed to be influenced by all the
-// writes on which this operation depends directly or indirectly, and to not be
-// influenced by any of the writes which depend directly or indirectly on this
-// operation.
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
 //
-// Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	dtype: the dtype of the value.
-func ReadVariableOp(scope *Scope, resource tf.Output, dtype tf.DataType) (value tf.Output) {
+// Returns Parameter parameters updated by the Adagrad optimization algorithm.Parameter accumulators updated by the Adagrad optimization algorithm.
+func RetrieveTPUEmbeddingAdagradParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingAdagradParametersAttr) (parameters tf.Output, accumulators tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ReadVariableOp",
-		Input: []tf.Input{
-			resource,
-		},
+		Type: "RetrieveTPUEmbeddingAdagradParameters",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// This op consumes a lock created by `MutexLock`.
+// Compare values of `input` to `threshold` and pack resulting bits into a `uint8`.
 //
-// This op exists to consume a tensor created by `MutexLock` (other than
-// direct control dependencies).  It should be the only that consumes the tensor,
-// and will raise an error if it is not.  Its only purpose is to keep the
-// mutex lock tensor alive until it is consumed by this op.
+// Each comparison returns a boolean `true` (if `input_value > threshold`)
+// or and `false` otherwise.
 //
-// **NOTE**: This operation must run on the same device as its input.  This may
-// be enforced via the `colocate_with` mechanism.
+// This operation is useful for Locality-Sensitive-Hashing (LSH) and other
+// algorithms that use hashing approximations of cosine and `L2` distances;
+// codes can be generated from an input via:
+//
+// ```python
+// codebook_size = 50
+// codebook_bits = codebook_size * 32
+// codebook = tf.get_variable('codebook', [x.shape[-1].value, codebook_bits],
+//                            dtype=x.dtype,
+//                            initializer=tf.orthogonal_initializer())
+// codes = compare_and_threshold(tf.matmul(x, codebook), threshold=0.)
+// codes = tf.bitcast(codes, tf.int32)  # go from uint8 to int32
+// # now codes has shape x.shape[:-1] + [codebook_size]
+// ```
+//
+// **NOTE**: Currently, the innermost dimension of the tensor must be divisible
+// by 8.
+//
+// Given an `input` shaped `[s0, s1, ..., s_n]`, the output is
+// a `uint8` tensor shaped `[s0, s1, ..., s_n / 8]`.
 //
 // Arguments:
-//	mutex_lock: A tensor returned by `MutexLock`.
+//	input: Values to compare against `threshold` and bitpack.
+//	threshold: Threshold to compare against.
 //
-// Returns the created operation.
-func ConsumeMutexLock(scope *Scope, mutex_lock tf.Output) (o *tf.Operation) {
+// Returns The bitpacked comparisons.
+func CompareAndBitpack(scope *Scope, input tf.Output, threshold tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ConsumeMutexLock",
+		Type: "CompareAndBitpack",
 		Input: []tf.Input{
-			mutex_lock,
+			input, threshold,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// ResourceScatterNdAddAttr is an optional argument to ResourceScatterNdAdd.
-type ResourceScatterNdAddAttr func(optionalAttr)
+// QuantizeAndDequantizeV2Attr is an optional argument to QuantizeAndDequantizeV2.
+type QuantizeAndDequantizeV2Attr func(optionalAttr)
 
-// ResourceScatterNdAddUseLocking sets the optional use_locking attribute to value.
+// QuantizeAndDequantizeV2SignedInput sets the optional signed_input attribute to value.
 //
-// value: An optional bool. Defaults to True. If True, the assignment will
-// be protected by a lock; otherwise the behavior is undefined,
-// but may exhibit less contention.
+// value: Whether the quantization is signed or unsigned. (actually this parameter should
+// have been called <b>`signed_output`</b>)
 // If not specified, defaults to true
-func ResourceScatterNdAddUseLocking(value bool) ResourceScatterNdAddAttr {
+func QuantizeAndDequantizeV2SignedInput(value bool) QuantizeAndDequantizeV2Attr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["signed_input"] = value
+	}
+}
+
+// QuantizeAndDequantizeV2NumBits sets the optional num_bits attribute to value.
+//
+// value: The bitwidth of the quantization.
+// If not specified, defaults to 8
+func QuantizeAndDequantizeV2NumBits(value int64) QuantizeAndDequantizeV2Attr {
+	return func(m optionalAttr) {
+		m["num_bits"] = value
+	}
+}
+
+// QuantizeAndDequantizeV2RangeGiven sets the optional range_given attribute to value.
+//
+// value: Whether the range is given or should be determined from the `input` tensor.
+// If not specified, defaults to false
+func QuantizeAndDequantizeV2RangeGiven(value bool) QuantizeAndDequantizeV2Attr {
+	return func(m optionalAttr) {
+		m["range_given"] = value
+	}
+}
+
+// QuantizeAndDequantizeV2RoundMode sets the optional round_mode attribute to value.
+//
+// value: The 'round_mode' attribute controls which rounding tie-breaking algorithm is
+// used when rounding float values to their quantized equivalents. The following
+// rounding modes are currently supported:
+//
+// *   HALF_TO_EVEN: this is the default round_mode.
+// *   HALF_UP: round towards positive. In this mode 7.5 rounds up to 8 and -7.5
+//     rounds up to -7.
+//
+// If not specified, defaults to "HALF_TO_EVEN"
+func QuantizeAndDequantizeV2RoundMode(value string) QuantizeAndDequantizeV2Attr {
+	return func(m optionalAttr) {
+		m["round_mode"] = value
 	}
 }
 
-// Adds sparse `updates` to individual values or slices within a given
+// Quantizes then dequantizes a tensor.
+//
+// This op simulates the precision loss from the quantized forward pass by:
+//
+// 1. Quantizing the tensor to fixed point numbers, which should match the target
+//    quantization method when it is used in inference.
+// 2. Dequantizing it back to floating point numbers for the following ops, most
+//    likely matmul.
+//
+// There are different ways to quantize. This version uses only scaling, so 0.0
+// maps to 0.
+//
+// From the specified 'num_bits' in the quantized output type, it determines
+// minimum and maximum representable quantized values.
+//
+// e.g.
 //
-// variable according to `indices`.
+// *   [-128, 127] for signed, num_bits = 8, or
+// *   [0, 255] for unsigned, num_bits = 8.
 //
-// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+// If range_given == False, the initial input_min, input_max will be determined
+// automatically as the minimum and maximum values in the input tensor, otherwise
+// the specified values of input_min, input_max are used.
 //
-// `indices` must be integer tensor, containing indices into `ref`.
-// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+// Note: If the input_min, input_max are specified, they do not need to equal the
+// actual minimum and maximum values in the tensor. e.g. in some cases it may be
+// beneficial to specify these values such that the low probability extremes of the
+// input distribution are clipped.
 //
-// The innermost dimension of `indices` (with length `K`) corresponds to
-// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
-// dimension of `ref`.
+// This op determines the maximum scale_factor that would map the initial
+// [input_min, input_max] range to a range that lies within the representable
+// quantized range.
 //
-// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
+// It determines the scale from one of input_min and input_max, then updates the
+// other one to maximize the respresentable range.
 //
-// ```
-// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
-// ```
+// e.g.
 //
-// For example, say we want to update 4 scattered elements to a rank-1 tensor to
-// 8 elements. In Python, that update would look like this:
+// *   if the output is signed, num_bits = 8, [input_min, input_max] = [-10.0,
+//     5.0]: it would use a scale_factor of -128 / -10.0 = 12.8 In this case, it
+//     would update input_max to be 127 / 12.8 = 9.921875
+// *   if the output is signed, num_bits = 8, [input_min, input_max] = [-10.0,
+//     10.0]: it would use a scale_factor of 127 / 10.0 = 12.7 In this case, it
+//     would update input_min to be 128.0 / 12.7 = -10.07874
+// *   if the output is unsigned, input_min is forced to be 0, and only the
+//     specified input_max is used.
 //
-// ```python
-//     ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True)
-//     indices = tf.constant([[4], [3], [1] ,[7]])
-//     updates = tf.constant([9, 10, 11, 12])
-//     update = tf.scatter_nd_add(ref, indices, updates)
-//     with tf.Session() as sess:
-//       print sess.run(update)
-// ```
+// After determining the scale_factor and updating the input range, it applies the
+// following to each value in the 'input' tensor.
 //
-// The resulting update to ref would look like this:
+// output = round(clamp(value, input_min, input_max) * scale_factor) / scale_factor.
 //
-//     [1, 12, 3, 14, 14, 6, 7, 20]
+// The above round function rounds the value based on the given round_mode.
 //
-// See `tf.scatter_nd` for more details about how to make updates to
-// slices.
 //
 // Arguments:
-//	ref: A resource handle. Must be from a VarHandleOp.
-//	indices: A Tensor. Must be one of the following types: int32, int64.
-// A tensor of indices into ref.
-//	updates: A Tensor. Must have the same type as ref. A tensor of
-// values to add to ref.
-//
-// Returns the created operation.
-func ResourceScatterNdAdd(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdAddAttr) (o *tf.Operation) {
+//	input: Tensor to quantize and then dequantize.
+//	input_min: If `range_given == True`, this specifies the minimum input value that needs to
+// be represented, otherwise it is determined from the min value of the `input`
+// tensor.
+//	input_max: If `range_given == True`, this specifies the maximum input value that needs to
+// be represented, otherwise it is determined from the max value of the `input`
+// tensor.
+func QuantizeAndDequantizeV2(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, optional ...QuantizeAndDequantizeV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12882,186 +16820,104 @@ func ResourceScatterNdAdd(scope *Scope, ref tf.Output, indices tf.Output, update
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterNdAdd",
-		Input: []tf.Input{
-			ref, indices, updates,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Updates the tree ensemble by either adding a layer to the last tree being grown
-//
-// or by starting a new tree.
-//
-// Arguments:
-//	tree_ensemble_handle: Handle to the ensemble variable.
-//	feature_ids: Rank 1 tensor with ids for each feature. This is the real id of
-// the feature that will be used in the split.
-//	node_ids: List of rank 1 tensors representing the nodes for which this feature
-// has a split.
-//	gains: List of rank 1 tensors representing the gains for each of the feature's
-// split.
-//	thresholds: List of rank 1 tensors representing the thesholds for each of the
-// feature's split.
-//	left_node_contribs: List of rank 2 tensors with left leaf contribs for each of
-// the feature's splits. Will be added to the previous node values to constitute
-// the values of the left nodes.
-//	right_node_contribs: List of rank 2 tensors with right leaf contribs for each
-// of the feature's splits. Will be added to the previous node values to constitute
-// the values of the right nodes.
-//	max_depth: Max depth of the tree to build.
-//	learning_rate: shrinkage const for each new tree.
-//	pruning_mode: 0-No pruning, 1-Pre-pruning, 2-Post-pruning.
-//
-// Returns the created operation.
-func BoostedTreesUpdateEnsemble(scope *Scope, tree_ensemble_handle tf.Output, feature_ids tf.Output, node_ids []tf.Output, gains []tf.Output, thresholds []tf.Output, left_node_contribs []tf.Output, right_node_contribs []tf.Output, max_depth tf.Output, learning_rate tf.Output, pruning_mode int64) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"pruning_mode": pruning_mode}
-	opspec := tf.OpSpec{
-		Type: "BoostedTreesUpdateEnsemble",
+		Type: "QuantizeAndDequantizeV2",
 		Input: []tf.Input{
-			tree_ensemble_handle, feature_ids, tf.OutputList(node_ids), tf.OutputList(gains), tf.OutputList(thresholds), tf.OutputList(left_node_contribs), tf.OutputList(right_node_contribs), max_depth, learning_rate,
+			input, input_min, input_max,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// Computes tan of x element-wise.
-func Tan(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Tan",
-		Input: []tf.Input{
-			x,
-		},
-	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Bucketizes 'input' based on 'boundaries'.
-//
-// For example, if the inputs are
-//     boundaries = [0, 10, 100]
-//     input = [[-5, 10000]
-//              [150,   10]
-//              [5,    100]]
-//
-// then the output will be
-//     output = [[0, 3]
-//               [3, 2]
-//               [1, 3]]
-//
-// Arguments:
-//	input: Any shape of Tensor contains with int or float type.
-//	boundaries: A sorted list of floats gives the boundary of the buckets.
+// A TPU core selector Op.
 //
-// Returns Same shape with 'input', each value of input replaced with bucket index.
+// This Op produces a set of TPU cores (for warm-up) or a single TPU core
+// (for regular inference) to execute the TPU program on. The output is
+// consumed by TPUPartitionedCall.
 //
-// @compatibility(numpy)
-// Equivalent to np.digitize.
-// @end_compatibility
-func Bucketize(scope *Scope, input tf.Output, boundaries []float32) (output tf.Output) {
+// Returns A vector 1 or more TPU cores.
+func TPUOrdinalSelector(scope *Scope) (device_ordinals tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"boundaries": boundaries}
 	opspec := tf.OpSpec{
-		Type: "Bucketize",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
+		Type: "TPUOrdinalSelector",
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Encode audio data using the WAV file format.
+// Looks up keys in a table, outputs the corresponding values.
 //
-// This operation will generate a string suitable to be saved out to create a .wav
-// audio file. It will be encoded in the 16-bit PCM format. It takes in float
-// values in the range -1.0f to 1.0f, and any outside that value will be clamped to
-// that range.
+// The tensor `keys` must of the same type as the keys of the table.
+// The output `values` is of the type of the table values.
 //
-// `audio` is a 2-D float Tensor of shape `[length, channels]`.
-// `sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).
+// The scalar `default_value` is the value output for keys not present in the
+// table. It must also be of the same type as the table values.
 //
 // Arguments:
-//	audio: 2-D with shape `[length, channels]`.
-//	sample_rate: Scalar containing the sample frequency.
+//	table_handle: Handle to the table.
+//	keys: Any shape.  Keys to look up.
 //
-// Returns 0-D. WAV-encoded file contents.
-func EncodeWav(scope *Scope, audio tf.Output, sample_rate tf.Output) (contents tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "EncodeWav",
-		Input: []tf.Input{
-			audio, sample_rate,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes atan of x element-wise.
-func Atan(scope *Scope, x tf.Output) (y tf.Output) {
+//
+// Returns Same shape as `keys`.  Values found in the table, or `default_values`
+// for missing keys.
+func LookupTableFindV2(scope *Scope, table_handle tf.Output, keys tf.Output, default_value tf.Output) (values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Atan",
+		Type: "LookupTableFindV2",
 		Input: []tf.Input{
-			x,
+			table_handle, keys, default_value,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyAdaMaxAttr is an optional argument to ResourceApplyAdaMax.
-type ResourceApplyAdaMaxAttr func(optionalAttr)
+// ResourceSparseApplyRMSPropAttr is an optional argument to ResourceSparseApplyRMSProp.
+type ResourceSparseApplyRMSPropAttr func(optionalAttr)
 
-// ResourceApplyAdaMaxUseLocking sets the optional use_locking attribute to value.
+// ResourceSparseApplyRMSPropUseLocking sets the optional use_locking attribute to value.
 //
-// value: If `True`, updating of the var, m, and v tensors will be protected
+// value: If `True`, updating of the var, ms, and mom tensors is protected
 // by a lock; otherwise the behavior is undefined, but may exhibit less
 // contention.
 // If not specified, defaults to false
-func ResourceApplyAdaMaxUseLocking(value bool) ResourceApplyAdaMaxAttr {
+func ResourceSparseApplyRMSPropUseLocking(value bool) ResourceSparseApplyRMSPropAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update '*var' according to the AdaMax algorithm.
+// Update '*var' according to the RMSProp algorithm.
 //
-// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-// v_t <- max(beta2 * v_{t-1}, abs(g))
-// variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon)
+// Note that in dense implementation of this algorithm, ms and mom will
+// update even if the grad is zero, but in this sparse implementation, ms
+// and mom will not update in iterations during which the grad is zero.
+//
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+//
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	v: Should be from a Variable().
-//	beta1_power: Must be a scalar.
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
 //	lr: Scaling factor. Must be a scalar.
-//	beta1: Momentum factor. Must be a scalar.
-//	beta2: Momentum factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
+//
 //	epsilon: Ridge term. Must be a scalar.
 //	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var, ms and mom.
 //
 // Returns the created operation.
-func ResourceApplyAdaMax(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdaMaxAttr) (o *tf.Operation) {
+func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13070,149 +16926,275 @@ func ResourceApplyAdaMax(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdaMax",
+		Type: "ResourceSparseApplyRMSProp",
 		Input: []tf.Input{
-			var_, m, v, beta1_power, lr, beta1, beta2, epsilon, grad,
+			var_, ms, mom, lr, rho, momentum, epsilon, grad, indices,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// AssertAttr is an optional argument to Assert.
-type AssertAttr func(optionalAttr)
-
-// AssertSummarize sets the optional summarize attribute to value.
+// Returns the truth value of (x > y) element-wise.
 //
-// value: Print this many entries of each tensor.
-// If not specified, defaults to 3
-func AssertSummarize(value int64) AssertAttr {
-	return func(m optionalAttr) {
-		m["summarize"] = value
+// *NOTE*: `Greater` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Greater(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Greater",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Asserts that the given condition is true.
-//
-// If `condition` evaluates to false, print the list of tensors in `data`.
-// `summarize` determines how many entries of the tensors to print.
+// Creates a TensorList by indexing into a Tensor.
 //
-// Arguments:
-//	condition: The condition to evaluate.
-//	data: The tensors to print out when condition is false.
+// Each member of the TensorList corresponds to one row of the input tensor,
+// specified by the given index (see `tf.gather`).
 //
-// Returns the created operation.
-func Assert(scope *Scope, condition tf.Output, data []tf.Output, optional ...AssertAttr) (o *tf.Operation) {
+// tensor: The input tensor.
+// indices: The indices used to index into the list.
+// element_shape: The shape of the elements in the list (can be less specified than
+//   the shape of the tensor).
+// num_elements: The size of the output list. Must be large enough to accommodate
+//   the largest index in indices. If -1, the list is just large enough to include
+//   the largest index in indices.
+// output_handle: The TensorList.
+func TensorListScatterV2(scope *Scope, tensor tf.Output, indices tf.Output, element_shape tf.Output, num_elements tf.Output) (output_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Assert",
+		Type: "TensorListScatterV2",
 		Input: []tf.Input{
-			condition, tf.OutputList(data),
+			tensor, indices, element_shape, num_elements,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// CudnnRNNBackpropAttr is an optional argument to CudnnRNNBackprop.
-type CudnnRNNBackpropAttr func(optionalAttr)
+// SampleDistortedBoundingBoxAttr is an optional argument to SampleDistortedBoundingBox.
+type SampleDistortedBoundingBoxAttr func(optionalAttr)
 
-// CudnnRNNBackpropRnnMode sets the optional rnn_mode attribute to value.
-// If not specified, defaults to "lstm"
-func CudnnRNNBackpropRnnMode(value string) CudnnRNNBackpropAttr {
+// SampleDistortedBoundingBoxSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to non-zero, the random number
+// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
+// seed.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxSeed(value int64) SampleDistortedBoundingBoxAttr {
 	return func(m optionalAttr) {
-		m["rnn_mode"] = value
+		m["seed"] = value
 	}
 }
 
-// CudnnRNNBackpropInputMode sets the optional input_mode attribute to value.
-// If not specified, defaults to "linear_input"
-func CudnnRNNBackpropInputMode(value string) CudnnRNNBackpropAttr {
+// SampleDistortedBoundingBoxSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxSeed2(value int64) SampleDistortedBoundingBoxAttr {
 	return func(m optionalAttr) {
-		m["input_mode"] = value
+		m["seed2"] = value
 	}
 }
 
-// CudnnRNNBackpropDirection sets the optional direction attribute to value.
-// If not specified, defaults to "unidirectional"
-func CudnnRNNBackpropDirection(value string) CudnnRNNBackpropAttr {
+// SampleDistortedBoundingBoxMinObjectCovered sets the optional min_object_covered attribute to value.
+//
+// value: The cropped area of the image must contain at least this
+// fraction of any bounding box supplied. The value of this parameter should be
+// non-negative. In the case of 0, the cropped area does not need to overlap
+// any of the bounding boxes supplied.
+// If not specified, defaults to 0.1
+func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBoundingBoxAttr {
 	return func(m optionalAttr) {
-		m["direction"] = value
+		m["min_object_covered"] = value
 	}
 }
 
-// CudnnRNNBackpropDropout sets the optional dropout attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNBackpropDropout(value float32) CudnnRNNBackpropAttr {
+// SampleDistortedBoundingBoxAspectRatioRange sets the optional aspect_ratio_range attribute to value.
+//
+// value: The cropped area of the image must have an aspect ratio =
+// width / height within this range.
+// If not specified, defaults to <f:0.75 f:1.33 >
+func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr {
 	return func(m optionalAttr) {
-		m["dropout"] = value
+		m["aspect_ratio_range"] = value
 	}
 }
 
-// CudnnRNNBackpropSeed sets the optional seed attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNBackpropSeed(value int64) CudnnRNNBackpropAttr {
+// SampleDistortedBoundingBoxAreaRange sets the optional area_range attribute to value.
+//
+// value: The cropped area of the image must contain a fraction of the
+// supplied image within this range.
+// If not specified, defaults to <f:0.05 f:1 >
+func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["area_range"] = value
 	}
 }
 
-// CudnnRNNBackpropSeed2 sets the optional seed2 attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNBackpropSeed2(value int64) CudnnRNNBackpropAttr {
+// SampleDistortedBoundingBoxMaxAttempts sets the optional max_attempts attribute to value.
+//
+// value: Number of attempts at generating a cropped region of the image
+// of the specified constraints. After `max_attempts` failures, return the entire
+// image.
+// If not specified, defaults to 100
+func SampleDistortedBoundingBoxMaxAttempts(value int64) SampleDistortedBoundingBoxAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["max_attempts"] = value
 	}
 }
 
-// Backprop step of CudnnRNN.
+// SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
 //
-// Compute the backprop of both data and weights in a RNN.
+// value: Controls behavior if no bounding boxes supplied.
+// If true, assume an implicit bounding box covering the whole input. If false,
+// raise an error.
+// If not specified, defaults to false
+func SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["use_image_if_no_bounding_boxes"] = value
+	}
+}
+
+// Generate a single randomly distorted bounding box for an image.
 //
-// rnn_mode: Indicates the type of the RNN model.
-// input_mode: Indicate whether there is a linear projection between the input and
-//     the actual computation before the first layer. 'skip_input' is only allowed
-//     when input_size == num_units; 'auto_select' implies 'skip_input' when
-//     input_size == num_units; otherwise, it implies 'linear_input'.
-// direction: Indicates whether a bidirectional model will be used. Should be
-//   "unidirectional" or "bidirectional".
-// dropout: Dropout probability. When set to 0., dropout is disabled.
-// seed: The 1st part of a seed to initialize dropout.
-// seed2: The 2nd part of a seed to initialize dropout.
-// input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
-// input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
-//     num_units].
-// input_c: For LSTM, a 3-D tensor with the shape of
-//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
-// params: A 1-D tensor that contains the weights and biases in an opaque layout.
-//     The size must be created through CudnnRNNParamsSize, and initialized
-//     separately. Note that they might not be compatible across different
-//     generations. So it is a good idea to save and restore
-// output: A 3-D tensor with the shape of [seq_length, batch_size,
-//     dir * num_units].
-// output_h: The same shape has input_h.
-// output_c: The same shape as input_c for LSTM. An empty tensor for other models.
-// output_backprop: A 3-D tensor with the same shape as output in the forward pass.
-// output_h_backprop: A 3-D tensor with the same shape as output_h in the forward
-//     pass.
-// output_c_backprop: A 3-D tensor with the same shape as output_c in the forward
-//     pass.
-// reserve_space: The same reserve_space produced in for forward operation.
-// input_backprop: The backprop to input in the forward pass. Has the same shape
-//     as input.
-// input_h_backprop: The backprop to input_h in the forward pass. Has the same
-//     shape as input_h.
-// input_c_backprop: The backprop to input_c in the forward pass. Has the same
-//     shape as input_c.
-// params_backprop: The backprop to the params buffer in the forward pass. Has the
-//     same shape as params.
-func CudnnRNNBackprop(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, output tf.Output, output_h tf.Output, output_c tf.Output, output_backprop tf.Output, output_h_backprop tf.Output, output_c_backprop tf.Output, reserve_space tf.Output, optional ...CudnnRNNBackpropAttr) (input_backprop tf.Output, input_h_backprop tf.Output, input_c_backprop tf.Output, params_backprop tf.Output) {
+// Bounding box annotations are often supplied in addition to ground-truth labels
+// in image recognition or object localization tasks. A common technique for
+// training such a system is to randomly distort an image while preserving
+// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
+// localization of an object, i.e. bounding box, given an `image_size`,
+// `bounding_boxes` and a series of constraints.
+//
+// The output of this Op is a single bounding box that may be used to crop the
+// original image. The output is returned as 3 tensors: `begin`, `size` and
+// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
+// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
+// what the bounding box looks like.
+//
+// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
+// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+// height of the underlying image.
+//
+// For example,
+//
+// ```python
+//     # Generate a single distorted bounding box.
+//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
+//         tf.shape(image),
+//         bounding_boxes=bounding_boxes)
+//
+//     # Draw the bounding box in an image summary.
+//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
+//                                                   bbox_for_draw)
+//     tf.summary.image('images_with_box', image_with_box)
+//
+//     # Employ the bounding box to distort the image.
+//     distorted_image = tf.slice(image, begin, size)
+// ```
+//
+// Note that if no bounding box information is available, setting
+// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
+// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
+// false and no bounding boxes are supplied, an error is raised.
+//
+// Arguments:
+//	image_size: 1-D, containing `[height, width, channels]`.
+//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
+// associated with the image.
+//
+// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
+// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
+// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
+// Provide as input to `tf.image.draw_bounding_boxes`.
+func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, optional ...SampleDistortedBoundingBoxAttr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SampleDistortedBoundingBox",
+		Input: []tf.Input{
+			image_size, bounding_boxes,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// ResourceScatterNdUpdateAttr is an optional argument to ResourceScatterNdUpdate.
+type ResourceScatterNdUpdateAttr func(optionalAttr)
+
+// ResourceScatterNdUpdateUseLocking sets the optional use_locking attribute to value.
+//
+// value: An optional bool. Defaults to True. If True, the assignment will
+// be protected by a lock; otherwise the behavior is undefined,
+// but may exhibit less contention.
+// If not specified, defaults to true
+func ResourceScatterNdUpdateUseLocking(value bool) ResourceScatterNdUpdateAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Applies sparse `updates` to individual values or slices within a given
+//
+// variable according to `indices`.
+//
+// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+//
+// `indices` must be integer tensor, containing indices into `ref`.
+// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+//
+// The innermost dimension of `indices` (with length `K`) corresponds to
+// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+// dimension of `ref`.
+//
+// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
+//
+// ```
+// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+// ```
+//
+// For example, say we want to update 4 scattered elements to a rank-1 tensor to
+// 8 elements. In Python, that update would look like this:
+//
+// ```python
+//     ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+//     indices = tf.constant([[4], [3], [1] ,[7]])
+//     updates = tf.constant([9, 10, 11, 12])
+//     update = tf.scatter_nd_update(ref, indices, updates)
+//     with tf.Session() as sess:
+//       print sess.run(update)
+// ```
+//
+// The resulting update to ref would look like this:
+//
+//     [1, 11, 3, 10, 9, 6, 7, 12]
+//
+// See `tf.scatter_nd` for more details about how to make updates to
+// slices.
+//
+// Arguments:
+//	ref: A resource handle. Must be from a VarHandleOp.
+//	indices: A Tensor. Must be one of the following types: int32, int64.
+// A tensor of indices into ref.
+//	updates: A Tensor. Must have the same type as ref. A tensor of updated
+// values to add to ref.
+//
+// Returns the created operation.
+func ResourceScatterNdUpdate(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdUpdateAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13221,105 +17203,194 @@ func CudnnRNNBackprop(scope *Scope, input tf.Output, input_h tf.Output, input_c
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CudnnRNNBackprop",
+		Type: "ResourceScatterNdUpdate",
 		Input: []tf.Input{
-			input, input_h, input_c, params, output, output_h, output_c, output_backprop, output_h_backprop, output_c_backprop, reserve_space,
+			ref, indices, updates,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+	return scope.AddOperation(opspec)
 }
 
-// Split a `SparseTensor` into `num_split` tensors along one dimension.
+// UnicodeDecodeWithOffsetsAttr is an optional argument to UnicodeDecodeWithOffsets.
+type UnicodeDecodeWithOffsetsAttr func(optionalAttr)
+
+// UnicodeDecodeWithOffsetsErrors sets the optional errors attribute to value.
 //
-// If the `shape[split_dim]` is not an integer multiple of `num_split`. Slices
-// `[0 : shape[split_dim] % num_split]` gets one extra dimension.
-// For example, if `split_dim = 1` and `num_split = 2` and the input is
+// value: Error handling policy when there is invalid formatting found in the input.
+// The value of 'strict' will cause the operation to produce a InvalidArgument
+// error on any invalid input formatting. A value of 'replace' (the default) will
+// cause the operation to replace any invalid formatting in the input with the
+// `replacement_char` codepoint. A value of 'ignore' will cause the operation to
+// skip any invalid formatting in the input and produce no corresponding output
+// character.
+// If not specified, defaults to "replace"
+func UnicodeDecodeWithOffsetsErrors(value string) UnicodeDecodeWithOffsetsAttr {
+	return func(m optionalAttr) {
+		m["errors"] = value
+	}
+}
+
+// UnicodeDecodeWithOffsetsReplacementChar sets the optional replacement_char attribute to value.
 //
-//     input_tensor = shape = [2, 7]
-//     [    a   d e  ]
-//     [b c          ]
+// value: The replacement character codepoint to be used in place of any invalid
+// formatting in the input when `errors='replace'`. Any valid unicode codepoint may
+// be used. The default value is the default unicode replacement character is
+// 0xFFFD or U+65533.)
+// If not specified, defaults to 65533
+func UnicodeDecodeWithOffsetsReplacementChar(value int64) UnicodeDecodeWithOffsetsAttr {
+	return func(m optionalAttr) {
+		m["replacement_char"] = value
+	}
+}
+
+// UnicodeDecodeWithOffsetsReplaceControlCharacters sets the optional replace_control_characters attribute to value.
 //
-// Graphically the output tensors are:
+// value: Whether to replace the C0 control characters (00-1F) with the
+// `replacement_char`. Default is false.
+// If not specified, defaults to false
+func UnicodeDecodeWithOffsetsReplaceControlCharacters(value bool) UnicodeDecodeWithOffsetsAttr {
+	return func(m optionalAttr) {
+		m["replace_control_characters"] = value
+	}
+}
+
+// Decodes each string in `input` into a sequence of Unicode code points.
 //
-//     output_tensor[0] = shape = [2, 4]
-//     [    a  ]
-//     [b c    ]
+// The character codepoints for all strings are returned using a single vector
+// `char_values`, with strings expanded to characters in row-major order.
+// Similarly, the character start byte offsets are returned using a single vector
+// `char_to_byte_starts`, with strings expanded in row-major order.
 //
-//     output_tensor[1] = shape = [2, 3]
-//     [ d e  ]
-//     [      ]
+// The `row_splits` tensor indicates where the codepoints and start offsets for
+// each input string begin and end within the `char_values` and
+// `char_to_byte_starts` tensors.  In particular, the values for the `i`th
+// string (in row-major order) are stored in the slice
+// `[row_splits[i]:row_splits[i+1]]`. Thus:
+//
+// * `char_values[row_splits[i]+j]` is the Unicode codepoint for the `j`th
+//   character in the `i`th string (in row-major order).
+// * `char_to_bytes_starts[row_splits[i]+j]` is the start byte offset for the `j`th
+//   character in the `i`th string (in row-major order).
+// * `row_splits[i+1] - row_splits[i]` is the number of characters in the `i`th
+//   string (in row-major order).
 //
 // Arguments:
-//	split_dim: 0-D.  The dimension along which to split.  Must be in the range
-// `[0, rank(shape))`.
-//	indices: 2-D tensor represents the indices of the sparse tensor.
-//	values: 1-D tensor represents the values of the sparse tensor.
-//	shape: 1-D. tensor represents the shape of the sparse tensor.
-// output indices: A list of 1-D tensors represents the indices of the output
-// sparse tensors.
-//	num_split: The number of ways to split.
+//	input: The text to be decoded. Can have any shape. Note that the output is flattened
+// to a vector of char values.
+//	input_encoding: Text encoding of the input strings. This is any of the encodings supported
+// by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
 //
-// Returns A list of 1-D tensors represents the values of the output sparse
-// tensors.A list of 1-D tensors represents the shape of the output sparse
-// tensors.
-func SparseSplit(scope *Scope, split_dim tf.Output, indices tf.Output, values tf.Output, shape tf.Output, num_split int64) (output_indices []tf.Output, output_values []tf.Output, output_shape []tf.Output) {
+// Returns A 1D int32 tensor containing the row splits.A 1D int32 Tensor containing the decoded codepoints.A 1D int32 Tensor containing the byte index in the input string where each
+// character in `char_values` starts.
+func UnicodeDecodeWithOffsets(scope *Scope, input tf.Output, input_encoding string, optional ...UnicodeDecodeWithOffsetsAttr) (row_splits tf.Output, char_values tf.Output, char_to_byte_starts tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_split": num_split}
+	attrs := map[string]interface{}{"input_encoding": input_encoding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseSplit",
+		Type: "UnicodeDecodeWithOffsets",
 		Input: []tf.Input{
-			split_dim, indices, values, shape,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Returns x - y element-wise.
+//
+// *NOTE*: `Subtract` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Sub(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if output_indices, idx, err = makeOutputList(op, idx, "output_indices"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
-		return
+	opspec := tf.OpSpec{
+		Type: "Sub",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
-	if output_values, idx, err = makeOutputList(op, idx, "output_values"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
-		return
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// LRNAttr is an optional argument to LRN.
+type LRNAttr func(optionalAttr)
+
+// LRNDepthRadius sets the optional depth_radius attribute to value.
+//
+// value: 0-D.  Half-width of the 1-D normalization window.
+// If not specified, defaults to 5
+func LRNDepthRadius(value int64) LRNAttr {
+	return func(m optionalAttr) {
+		m["depth_radius"] = value
 	}
-	if output_shape, idx, err = makeOutputList(op, idx, "output_shape"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
-		return
+}
+
+// LRNBias sets the optional bias attribute to value.
+//
+// value: An offset (usually positive to avoid dividing by 0).
+// If not specified, defaults to 1
+func LRNBias(value float32) LRNAttr {
+	return func(m optionalAttr) {
+		m["bias"] = value
 	}
-	return output_indices, output_values, output_shape
 }
 
-// Returns the element-wise sum of a list of tensors.
+// LRNAlpha sets the optional alpha attribute to value.
 //
-// `tf.accumulate_n_v2` performs the same operation as `tf.add_n`, but does not
-// wait for all of its inputs to be ready before beginning to sum. This can
-// save memory if inputs are ready at different times, since minimum temporary
-// storage is proportional to the output size rather than the inputs size.
+// value: A scale factor, usually positive.
+// If not specified, defaults to 1
+func LRNAlpha(value float32) LRNAttr {
+	return func(m optionalAttr) {
+		m["alpha"] = value
+	}
+}
+
+// LRNBeta sets the optional beta attribute to value.
 //
-// Unlike the original `accumulate_n`, `accumulate_n_v2` is differentiable.
+// value: An exponent.
+// If not specified, defaults to 0.5
+func LRNBeta(value float32) LRNAttr {
+	return func(m optionalAttr) {
+		m["beta"] = value
+	}
+}
+
+// Local Response Normalization.
 //
-// Returns a `Tensor` of same shape and type as the elements of `inputs`.
+// The 4-D `input` tensor is treated as a 3-D array of 1-D vectors (along the last
+// dimension), and each vector is normalized independently.  Within a given vector,
+// each component is divided by the weighted, squared sum of inputs within
+// `depth_radius`.  In detail,
+//
+//     sqr_sum[a, b, c, d] =
+//         sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
+//     output = input / (bias + alpha * sqr_sum) ** beta
+//
+// For details, see [Krizhevsky et al., ImageNet classification with deep
+// convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
 //
 // Arguments:
-//	inputs: A list of `Tensor` objects, each with same shape and type.
-//	shape: Shape of elements of `inputs`.
-func AccumulateNV2(scope *Scope, inputs []tf.Output, shape tf.Shape) (sum tf.Output) {
+//	input: 4-D.
+func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"shape": shape}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "AccumulateNV2",
+		Type: "LRN",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -13327,60 +17398,90 @@ func AccumulateNV2(scope *Scope, inputs []tf.Output, shape tf.Shape) (sum tf.Out
 	return op.Output(0)
 }
 
-// Outputs deterministic pseudorandom random integers from a uniform distribution.
-//
-// The generated values follow a uniform distribution in the range `[minval, maxval)`.
+// RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug.
+type RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr func(optionalAttr)
+
+// RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// The outputs are a deterministic function of `shape`, `seed`, `minval`, and `maxval`.
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Retrieve proximal Adagrad embedding parameters with debug support.
 //
-// Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
-//	minval: Minimum value (inclusive, scalar).
-//	maxval: Maximum value (exclusive, scalar).
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
 //
-// Returns Random values with specified shape.
-func StatelessRandomUniformInt(scope *Scope, shape tf.Output, seed tf.Output, minval tf.Output, maxval tf.Output) (output tf.Output) {
+// Returns Parameter parameters updated by the proximal Adagrad optimization algorithm.Parameter accumulators updated by the proximal Adagrad optimization algorithm.Parameter gradient_accumulators updated by the proximal Adagrad optimization algorithm.
+func RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugAttr) (parameters tf.Output, accumulators tf.Output, gradient_accumulators tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "StatelessRandomUniformInt",
-		Input: []tf.Input{
-			shape, seed, minval, maxval,
-		},
+		Type: "RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// StatelessTruncatedNormalAttr is an optional argument to StatelessTruncatedNormal.
-type StatelessTruncatedNormalAttr func(optionalAttr)
+// ResourceSparseApplyAdagradAttr is an optional argument to ResourceSparseApplyAdagrad.
+type ResourceSparseApplyAdagradAttr func(optionalAttr)
 
-// StatelessTruncatedNormalDtype sets the optional dtype attribute to value.
+// ResourceSparseApplyAdagradUseLocking sets the optional use_locking attribute to value.
 //
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatelessTruncatedNormalDtype(value tf.DataType) StatelessTruncatedNormalAttr {
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyAdagradUseLocking(value bool) ResourceSparseApplyAdagradAttr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Outputs deterministic pseudorandom values from a truncated normal distribution.
-//
-// The generated values follow a normal distribution with mean 0 and standard
-// deviation 1, except that values whose magnitude is more than 2 standard
-// deviations from the mean are dropped and re-picked.
+// ResourceSparseApplyAdagradUpdateSlots sets the optional update_slots attribute to value.
+// If not specified, defaults to true
+func ResourceSparseApplyAdagradUpdateSlots(value bool) ResourceSparseApplyAdagradAttr {
+	return func(m optionalAttr) {
+		m["update_slots"] = value
+	}
+}
+
+// Update relevant entries in '*var' and '*accum' according to the adagrad scheme.
 //
-// The outputs are a deterministic function of `shape` and `seed`.
+// That is for rows we have grad for, we update var and accum as follows:
+// accum += grad * grad
+// var -= lr * grad * (1 / sqrt(accum))
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
 //
-// Returns Random values with specified shape.
-func StatelessTruncatedNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessTruncatedNormalAttr) (output tf.Output) {
+// Returns the created operation.
+func ResourceSparseApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13389,89 +17490,81 @@ func StatelessTruncatedNormal(scope *Scope, shape tf.Output, seed tf.Output, opt
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StatelessTruncatedNormal",
+		Type: "ResourceSparseApplyAdagrad",
 		Input: []tf.Input{
-			shape, seed,
+			var_, accum, lr, grad, indices,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// RestoreSliceAttr is an optional argument to RestoreSlice.
-type RestoreSliceAttr func(optionalAttr)
+// LoadTPUEmbeddingMomentumParametersAttr is an optional argument to LoadTPUEmbeddingMomentumParameters.
+type LoadTPUEmbeddingMomentumParametersAttr func(optionalAttr)
 
-// RestoreSlicePreferredShard sets the optional preferred_shard attribute to value.
-//
-// value: Index of file to open first if multiple files match
-// `file_pattern`. See the documentation for `Restore`.
+// LoadTPUEmbeddingMomentumParametersTableId sets the optional table_id attribute to value.
 // If not specified, defaults to -1
-func RestoreSlicePreferredShard(value int64) RestoreSliceAttr {
+//
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingMomentumParametersTableId(value int64) LoadTPUEmbeddingMomentumParametersAttr {
 	return func(m optionalAttr) {
-		m["preferred_shard"] = value
+		m["table_id"] = value
 	}
 }
 
-// Restores a tensor from checkpoint files.
-//
-// This is like `Restore` except that restored tensor can be listed as filling
-// only a slice of a larger tensor.  `shape_and_slice` specifies the shape of the
-// larger tensor and the slice that the restored tensor covers.
+// LoadTPUEmbeddingMomentumParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingMomentumParametersTableName(value string) LoadTPUEmbeddingMomentumParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load Momentum embedding parameters.
 //
-// The `shape_and_slice` input has the same format as the
-// elements of the `shapes_and_slices` input of the `SaveSlices` op.
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
 //
 // Arguments:
-//	file_pattern: Must have a single element. The pattern of the files from
-// which we read the tensor.
-//	tensor_name: Must have a single element. The name of the tensor to be
-// restored.
-//	shape_and_slice: Scalar. The shapes and slice specifications to use when
-// restoring a tensors.
-//	dt: The type of the tensor to be restored.
+//	parameters: Value of parameters used in the Momentum optimization algorithm.
+//	momenta: Value of momenta used in the Momentum optimization algorithm.
 //
-// Returns The restored tensor.
-func RestoreSlice(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, shape_and_slice tf.Output, dt tf.DataType, optional ...RestoreSliceAttr) (tensor tf.Output) {
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingMomentumParameters(scope *Scope, parameters tf.Output, momenta tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingMomentumParametersAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dt": dt}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RestoreSlice",
+		Type: "LoadTPUEmbeddingMomentumParameters",
 		Input: []tf.Input{
-			file_pattern, tensor_name, shape_and_slice,
+			parameters, momenta,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Divides sparse updates into the variable referenced by `resource`.
+// Assigns sparse updates to the variable referenced by `resource`.
 //
 // This operation computes
 //
 //     # Scalar indices
-//     ref[indices, ...] /= updates[...]
+//     ref[indices, ...] = updates[...]
 //
 //     # Vector indices (for each i)
-//     ref[indices[i], ...] /= updates[i, ...]
+//     ref[indices[i], ...] = updates[i, ...]
 //
 //     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] /= updates[i, ..., j, ...]
-//
-// Duplicate entries are handled correctly: if multiple `indices` reference
-// the same location, their contributions multiply.
-//
-// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
-// </div>
+//     ref[indices[i, ..., j], ...] = updates[i, ..., j, ...]
 //
 // Arguments:
 //	resource: Should be from a `Variable` node.
@@ -13479,12 +17572,12 @@ func RestoreSlice(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, s
 //	updates: A tensor of updated values to add to `ref`.
 //
 // Returns the created operation.
-func ResourceScatterDiv(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+func ResourceScatterUpdate(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterDiv",
+		Type: "ResourceScatterUpdate",
 		Input: []tf.Input{
 			resource, indices, updates,
 		},
@@ -13492,31 +17585,44 @@ func ResourceScatterDiv(scope *Scope, resource tf.Output, indices tf.Output, upd
 	return scope.AddOperation(opspec)
 }
 
-// StatelessRandomNormalAttr is an optional argument to StatelessRandomNormal.
-type StatelessRandomNormalAttr func(optionalAttr)
+// HistogramFixedWidthAttr is an optional argument to HistogramFixedWidth.
+type HistogramFixedWidthAttr func(optionalAttr)
 
-// StatelessRandomNormalDtype sets the optional dtype attribute to value.
-//
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatelessRandomNormalDtype(value tf.DataType) StatelessRandomNormalAttr {
+// HistogramFixedWidthDtype sets the optional dtype attribute to value.
+// If not specified, defaults to DT_INT32
+func HistogramFixedWidthDtype(value tf.DataType) HistogramFixedWidthAttr {
 	return func(m optionalAttr) {
 		m["dtype"] = value
 	}
 }
 
-// Outputs deterministic pseudorandom values from a normal distribution.
+// Return histogram of values.
 //
-// The generated values will have mean 0 and standard deviation 1.
+// Given the tensor `values`, this operation returns a rank 1 histogram counting
+// the number of entries in `values` that fall into every bin.  The bins are
+// equal width and determined by the arguments `value_range` and `nbins`.
 //
-// The outputs are a deterministic function of `shape` and `seed`.
+// ```python
+// # Bins will be:  (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
+// nbins = 5
+// value_range = [0.0, 5.0]
+// new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
+//
+// with tf.get_default_session() as sess:
+//   hist = tf.histogram_fixed_width(new_values, value_range, nbins=5)
+//   variables.global_variables_initializer().run()
+//   sess.run(hist) => [2, 1, 1, 0, 2]
+// ```
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
+//	values: Numeric `Tensor`.
+//	value_range: Shape [2] `Tensor` of same `dtype` as `values`.
+// values <= value_range[0] will be mapped to hist[0],
+// values >= value_range[1] will be mapped to hist[-1].
+//	nbins: Scalar `int32 Tensor`.  Number of histogram bins.
 //
-// Returns Random values with specified shape.
-func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomNormalAttr) (output tf.Output) {
+// Returns A 1-D `Tensor` holding histogram of values.
+func HistogramFixedWidth(scope *Scope, values tf.Output, value_range tf.Output, nbins tf.Output, optional ...HistogramFixedWidthAttr) (out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13525,9 +17631,9 @@ func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, option
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StatelessRandomNormal",
+		Type: "HistogramFixedWidth",
 		Input: []tf.Input{
-			shape, seed,
+			values, value_range, nbins,
 		},
 		Attrs: attrs,
 	}
@@ -13535,239 +17641,112 @@ func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, option
 	return op.Output(0)
 }
 
-// Computes the complementary error function of `x` element-wise.
-func Erfc(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Erfc",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns the number of tensors in the input tensor list.
-//
-// input_handle: the input list
-// length: the number of tensors in the list
-func TensorListLength(scope *Scope, input_handle tf.Output) (length tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorListLength",
-		Input: []tf.Input{
-			input_handle,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Determine the script codes of a given tensor of Unicode integer code points.
-//
-// This operation converts Unicode code points to script codes corresponding to
-// each code point. Script codes correspond to International Components for
-// Unicode (ICU) UScriptCode values. See http://icu-project.org/apiref/icu4c/uscript_8h.html.
-// Returns -1 (USCRIPT_INVALID_CODE) for invalid codepoints. Output shape will
-// match input shape.
-//
-// Arguments:
-//	input: A Tensor of int32 Unicode code points.
-//
-// Returns A Tensor of int32 script codes corresponding to each input code point.
-func UnicodeScript(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "UnicodeScript",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates a sequence of numbers.
-//
-// This operation creates a sequence of numbers that begins at `start` and
-// extends by increments of `delta` up to but not including `limit`.
-//
-// For example:
-//
-// ```
-// # 'start' is 3
-// # 'limit' is 18
-// # 'delta' is 3
-// tf.range(start, limit, delta) ==> [3, 6, 9, 12, 15]
-// ```
+// Elementwise computes the bitwise right-shift of `x` and `y`.
 //
-// Arguments:
-//	start: 0-D (scalar). First entry in the sequence.
-//	limit: 0-D (scalar). Upper limit of sequence, exclusive.
-//	delta: 0-D (scalar). Optional. Default is 1. Number that increments `start`.
+// Performs a logical shift for unsigned integer types, and an arithmetic shift
+// for signed integer types.
 //
-// Returns 1-D.
-func Range(scope *Scope, start tf.Output, limit tf.Output, delta tf.Output) (output tf.Output) {
+// If `y` is negative, or greater than or equal to than the width of `x` in bits
+// the result is implementation defined.
+func RightShift(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Range",
-		Input: []tf.Input{
-			start, limit, delta,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// OrderedMapUnstageNoKeyAttr is an optional argument to OrderedMapUnstageNoKey.
-type OrderedMapUnstageNoKeyAttr func(optionalAttr)
-
-// OrderedMapUnstageNoKeyCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapUnstageNoKeyCapacity(value int64) OrderedMapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// OrderedMapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapUnstageNoKeyMemoryLimit(value int64) OrderedMapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// OrderedMapUnstageNoKeyContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func OrderedMapUnstageNoKeyContainer(value string) OrderedMapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
+		Type: "RightShift",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// OrderedMapUnstageNoKeySharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func OrderedMapUnstageNoKeySharedName(value string) OrderedMapUnstageNoKeyAttr {
+// TensorListStackAttr is an optional argument to TensorListStack.
+type TensorListStackAttr func(optionalAttr)
+
+// TensorListStackNumElements sets the optional num_elements attribute to value.
+// If not specified, defaults to -1
+func TensorListStackNumElements(value int64) TensorListStackAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["num_elements"] = value
 	}
 }
 
-// Op removes and returns the (key, value) element with the smallest
+// Stacks all tensors in the list.
 //
-// key from the underlying container.   If the underlying container
-// does not contain elements, the op will block until it does.
-func OrderedMapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
+// Requires that all tensors have the same shape.
+//
+// input_handle: the input list
+// tensor: the gathered result
+// num_elements: optional. If not -1, the number of elements in the list.
+//
+func TensorListStack(scope *Scope, input_handle tf.Output, element_shape tf.Output, element_dtype tf.DataType, optional ...TensorListStackAttr) (tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapUnstageNoKey",
+		Type: "TensorListStack",
 		Input: []tf.Input{
-			indices,
+			input_handle, element_shape,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	key = op.Output(idx)
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("OrderedMapUnstageNoKey", err)
-		return
-	}
-	return key, values
+	return op.Output(0)
 }
 
-// Returns element-wise integer closest to x.
+// A placeholder op for a value that will be fed into the computation.
 //
-// If the result is midway between two representable values,
-// the even representable is chosen.
-// For example:
+// Arguments:
+//	dtype: The type of elements in the tensor.
+//	shape: The shape of the tensor.
 //
-// ```
-// rint(-1.5) ==> -2.0
-// rint(0.5000001) ==> 1.0
-// rint([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) ==> [-2., -2., -0., 0., 2., 2., 2.]
-// ```
-func Rint(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns A tensor that will be provided using the infeed mechanism.
+func InfeedDequeue(scope *Scope, dtype tf.DataType, shape tf.Shape) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
 	opspec := tf.OpSpec{
-		Type: "Rint",
-		Input: []tf.Input{
-			x,
-		},
+		Type: "InfeedDequeue",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyMomentumAttr is an optional argument to ResourceApplyMomentum.
-type ResourceApplyMomentumAttr func(optionalAttr)
-
-// ResourceApplyMomentumUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyMomentumUseLocking(value bool) ResourceApplyMomentumAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
+// StatelessRandomUniformAttr is an optional argument to StatelessRandomUniform.
+type StatelessRandomUniformAttr func(optionalAttr)
 
-// ResourceApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
+// StatelessRandomUniformDtype sets the optional dtype attribute to value.
 //
-// value: If `True`, the tensor passed to compute grad will be
-// var - lr * momentum * accum, so in the end, the var you get is actually
-// var - lr * momentum * accum.
-// If not specified, defaults to false
-func ResourceApplyMomentumUseNesterov(value bool) ResourceApplyMomentumAttr {
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessRandomUniformDtype(value tf.DataType) StatelessRandomUniformAttr {
 	return func(m optionalAttr) {
-		m["use_nesterov"] = value
+		m["dtype"] = value
 	}
 }
 
-// Update '*var' according to the momentum scheme. Set use_nesterov = True if you
+// Outputs deterministic pseudorandom random values from a uniform distribution.
 //
-// want to use Nesterov momentum.
+// The generated values follow a uniform distribution in the range `[0, 1)`. The
+// lower bound 0 is included in the range, while the upper bound 1 is excluded.
 //
-// accum = accum * momentum + grad
-// var -= lr * accum
+// The outputs are a deterministic function of `shape` and `seed`.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	grad: The gradient.
-//	momentum: Momentum. Must be a scalar.
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
 //
-// Returns the created operation.
-func ResourceApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, momentum tf.Output, optional ...ResourceApplyMomentumAttr) (o *tf.Operation) {
+// Returns Random values with specified shape.
+func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomUniformAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13776,359 +17755,377 @@ func ResourceApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyMomentum",
+		Type: "StatelessRandomUniform",
 		Input: []tf.Input{
-			var_, accum, lr, grad, momentum,
+			shape, seed,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// SubstrAttr is an optional argument to Substr.
-type SubstrAttr func(optionalAttr)
-
-// SubstrUnit sets the optional unit attribute to value.
+// Makes its input available to the next iteration.
 //
-// value: The unit that is used to create the substring.  One of: `"BYTE"` (for
-// defining position and length by bytes) or `"UTF8_CHAR"` (for the UTF-8
-// encoded Unicode code points).  The default is `"BYTE"`. Results are undefined if
-// `unit=UTF8_CHAR` and the `input` strings do not contain structurally valid
-// UTF-8.
-// If not specified, defaults to "BYTE"
-func SubstrUnit(value string) SubstrAttr {
-	return func(m optionalAttr) {
-		m["unit"] = value
+// Arguments:
+//	data: The tensor to be made available to the next iteration.
+//
+// Returns The same tensor as `data`.
+func NextIteration(scope *Scope, data tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "NextIteration",
+		Input: []tf.Input{
+			data,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Return substrings from `Tensor` of strings.
-//
-// For each string in the input `Tensor`, creates a substring starting at index
-// `pos` with a total length of `len`.
-//
-// If `len` defines a substring that would extend beyond the length of the input
-// string, then as many characters as possible are used.
-//
-// A negative `pos` indicates distance within the string backwards from the end.
-//
-// If `pos` specifies an index which is out of range for any of the input strings,
-// then an `InvalidArgumentError` is thrown.
-//
-// `pos` and `len` must have the same shape, otherwise a `ValueError` is thrown on
-// Op creation.
-//
-// *NOTE*: `Substr` supports broadcasting up to two dimensions. More about
-// broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-//
-// ---
-//
-// Examples
-//
-// Using scalar `pos` and `len`:
-//
-// ```python
-// input = [b'Hello', b'World']
-// position = 1
-// length = 3
-//
-// output = [b'ell', b'orl']
-// ```
-//
-// Using `pos` and `len` with same shape as `input`:
-//
-// ```python
-// input = [[b'ten', b'eleven', b'twelve'],
-//          [b'thirteen', b'fourteen', b'fifteen'],
-//          [b'sixteen', b'seventeen', b'eighteen']]
-// position = [[1, 2, 3],
-//             [1, 2, 3],
-//             [1, 2, 3]]
-// length =   [[2, 3, 4],
-//             [4, 3, 2],
-//             [5, 5, 5]]
+// Output a fact about factorials.
+func Fact(scope *Scope) (fact tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Fact",
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// GenerateVocabRemappingAttr is an optional argument to GenerateVocabRemapping.
+type GenerateVocabRemappingAttr func(optionalAttr)
+
+// GenerateVocabRemappingOldVocabSize sets the optional old_vocab_size attribute to value.
 //
-// output = [[b'en', b'eve', b'lve'],
-//           [b'hirt', b'urt', b'te'],
-//           [b'ixtee', b'vente', b'hteen']]
-// ```
+// value: Number of entries in the old vocab file to consider.  If -1,
+// use the entire old vocabulary.
+// If not specified, defaults to -1
 //
-// Broadcasting `pos` and `len` onto `input`:
+// REQUIRES: value >= -1
+func GenerateVocabRemappingOldVocabSize(value int64) GenerateVocabRemappingAttr {
+	return func(m optionalAttr) {
+		m["old_vocab_size"] = value
+	}
+}
+
+// Given a path to new and old vocabulary files, returns a remapping Tensor of
 //
-// ```
-// input = [[b'ten', b'eleven', b'twelve'],
-//          [b'thirteen', b'fourteen', b'fifteen'],
-//          [b'sixteen', b'seventeen', b'eighteen'],
-//          [b'nineteen', b'twenty', b'twentyone']]
-// position = [1, 2, 3]
-// length =   [1, 2, 3]
+// length `num_new_vocab`, where `remapping[i]` contains the row number in the old
+// vocabulary that corresponds to row `i` in the new vocabulary (starting at line
+// `new_vocab_offset` and up to `num_new_vocab` entities), or `-1` if entry `i`
+// in the new vocabulary is not in the old vocabulary.  The old vocabulary is
+// constrained to the first `old_vocab_size` entries if `old_vocab_size` is not the
+// default value of -1.
 //
-// output = [[b'e', b'ev', b'lve'],
-//           [b'h', b'ur', b'tee'],
-//           [b'i', b've', b'hte'],
-//           [b'i', b'en', b'nty']]
-// ```
+// `num_vocab_offset` enables
+// use in the partitioned variable case, and should generally be set through
+// examining partitioning info.  The format of the files should be a text file,
+// with each line containing a single entity within the vocabulary.
 //
-// Broadcasting `input` onto `pos` and `len`:
+// For example, with `new_vocab_file` a text file containing each of the following
+// elements on a single line: `[f0, f1, f2, f3]`, old_vocab_file = [f1, f0, f3],
+// `num_new_vocab = 3, new_vocab_offset = 1`, the returned remapping would be
+// `[0, -1, 2]`.
 //
-// ```
-// input = b'thirteen'
-// position = [1, 5, 7]
-// length =   [3, 2, 1]
+// The op also returns a count of how many entries in the new vocabulary
+// were present in the old vocabulary, which is used to calculate the number of
+// values to initialize in a weight matrix remapping
 //
-// output = [b'hir', b'ee', b'n']
-// ```
+// This functionality can be used to remap both row vocabularies (typically,
+// features) and column vocabularies (typically, classes) from TensorFlow
+// checkpoints.  Note that the partitioning logic relies on contiguous vocabularies
+// corresponding to div-partitioned variables.  Moreover, the underlying remapping
+// uses an IndexTable (as opposed to an inexact CuckooTable), so client code should
+// use the corresponding index_table_from_file() as the FeatureColumn framework
+// does (as opposed to tf.feature_to_id(), which uses a CuckooTable).
 //
 // Arguments:
-//	input: Tensor of strings
-//	pos: Scalar defining the position of first character in each substring
-//	len: Scalar defining the number of characters to include in each substring
+//	new_vocab_file: Path to the new vocab file.
+//	old_vocab_file: Path to the old vocab file.
+//	new_vocab_offset: How many entries into the new vocab file to start reading.
+//	num_new_vocab: Number of entries in the new vocab file to remap.
 //
-// Returns Tensor of substrings
-func Substr(scope *Scope, input tf.Output, pos tf.Output, len tf.Output, optional ...SubstrAttr) (output tf.Output) {
+// Returns A Tensor of length num_new_vocab where the element at index i
+// is equal to the old ID that maps to the new ID i.  This element is -1 for any
+// new ID that is not found in the old vocabulary.Number of new vocab entries found in old vocab.
+func GenerateVocabRemapping(scope *Scope, new_vocab_file tf.Output, old_vocab_file tf.Output, new_vocab_offset int64, num_new_vocab int64, optional ...GenerateVocabRemappingAttr) (remapping tf.Output, num_present tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"new_vocab_offset": new_vocab_offset, "num_new_vocab": num_new_vocab}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Substr",
+		Type: "GenerateVocabRemapping",
 		Input: []tf.Input{
-			input, pos, len,
+			new_vocab_file, old_vocab_file,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Exits the current frame to its parent frame.
+// Worker heartbeat op.
 //
-// Exit makes its input `data` available to the parent frame.
+// Heartbeats may be sent periodically to indicate the coordinator is still active,
+// to retrieve the current worker status and to expedite shutdown when necessary.
 //
 // Arguments:
-//	data: The tensor to be made available to the parent frame.
+//	request: A string tensor containing a serialized WorkerHeartbeatRequest
 //
-// Returns The same tensor as `data`.
-func Exit(scope *Scope, data tf.Output) (output tf.Output) {
+// Returns A string tensor containing a serialized WorkerHeartbeatResponse
+func WorkerHeartbeat(scope *Scope, request tf.Output) (response tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Exit",
+		Type: "WorkerHeartbeat",
 		Input: []tf.Input{
-			data,
+			request,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Produce a string tensor that encodes the state of a Reader.
-//
-// Not all Readers support being serialized, so this can produce an
-// Unimplemented error.
+// Returns the truth value of (x <= y) element-wise.
 //
-// Arguments:
-//	reader_handle: Handle to a Reader.
-func ReaderSerializeStateV2(scope *Scope, reader_handle tf.Output) (state tf.Output) {
+// *NOTE*: `LessEqual` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func LessEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReaderSerializeStateV2",
+		Type: "LessEqual",
 		Input: []tf.Input{
-			reader_handle,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Concatenates quantized tensors along one dimension.
+// EnqueueTPUEmbeddingIntegerBatchAttr is an optional argument to EnqueueTPUEmbeddingIntegerBatch.
+type EnqueueTPUEmbeddingIntegerBatchAttr func(optionalAttr)
+
+// EnqueueTPUEmbeddingIntegerBatchDeviceOrdinal sets the optional device_ordinal attribute to value.
+//
+// value: The TPU device to use. Should be >= 0 and less than the number
+// of TPU cores in the task on which the node is placed.
+// If not specified, defaults to -1
+func EnqueueTPUEmbeddingIntegerBatchDeviceOrdinal(value int64) EnqueueTPUEmbeddingIntegerBatchAttr {
+	return func(m optionalAttr) {
+		m["device_ordinal"] = value
+	}
+}
+
+// An op that enqueues a list of input batch tensors to TPUEmbedding.
 //
 // Arguments:
-//	concat_dim: 0-D.  The dimension along which to concatenate.  Must be in the
-// range [0, rank(values)).
-//	values: The `N` Tensors to concatenate. Their ranks and types must match,
-// and their sizes must match in all dimensions except `concat_dim`.
-//	input_mins: The minimum scalar values for each of the input tensors.
-//	input_maxes: The maximum scalar values for each of the input tensors.
+//	batch: A list of 1D tensors, one for each embedding table, containing the
+// indices into the tables.
+//	mode_override: A string input that overrides the mode specified in the
+// TPUEmbeddingConfiguration. Supported values are {'unspecified', 'inference',
+// 'training', 'backward_pass_only'}. When set to 'unspecified', the mode set
+// in TPUEmbeddingConfiguration is used, otherwise mode_override is used.
 //
-// Returns A `Tensor` with the concatenation of values stacked along the
-// `concat_dim` dimension.  This tensor's shape matches that of `values` except
-// in `concat_dim` where it has the sum of the sizes.The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
-func QuantizedConcat(scope *Scope, concat_dim tf.Output, values []tf.Output, input_mins []tf.Output, input_maxes []tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+// Returns the created operation.
+func EnqueueTPUEmbeddingIntegerBatch(scope *Scope, batch []tf.Output, mode_override tf.Output, optional ...EnqueueTPUEmbeddingIntegerBatchAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedConcat",
+		Type: "EnqueueTPUEmbeddingIntegerBatch",
 		Input: []tf.Input{
-			concat_dim, tf.OutputList(values), tf.OutputList(input_mins), tf.OutputList(input_maxes),
+			tf.OutputList(batch), mode_override,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return scope.AddOperation(opspec)
 }
 
-// Slice a `SparseTensor` based on the `start` and `size`.
-//
-// For example, if the input is
-//
-//     input_tensor = shape = [2, 7]
-//     [    a   d e  ]
-//     [b c          ]
-//
-// Graphically the output tensors are:
-//
-//     sparse_slice([0, 0], [2, 4]) = shape = [2, 4]
-//     [    a  ]
-//     [b c    ]
+// An op that receives embedding activations on the TPU.
 //
-//     sparse_slice([0, 4], [2, 3]) = shape = [2, 3]
-//     [ d e  ]
-//     [      ]
+// The TPU system performs the embedding lookups and aggregations specified by
+// the arguments to TPUEmbeddingEnqueue(Integer/Sparse/SparseTensor)Batch. The
+// results of these aggregations are visible to the Tensorflow Graph as the
+// outputs of a RecvTPUEmbeddingActivations op. This op returns a list containing
+// one Tensor of activations per table specified in the model. There can be at
+// most one RecvTPUEmbeddingActivations op in the TPU graph.
 //
 // Arguments:
-//	indices: 2-D tensor represents the indices of the sparse tensor.
-//	values: 1-D tensor represents the values of the sparse tensor.
-//	shape: 1-D. tensor represents the shape of the sparse tensor.
-//	start: 1-D. tensor represents the start of the slice.
-//	size: 1-D. tensor represents the size of the slice.
-// output indices: A list of 1-D tensors represents the indices of the output
-// sparse tensors.
+//	num_outputs: The number of output activation tensors, equal to the number of
+// embedding tables in the model.
+//	config: Serialized TPUEmbeddingConfiguration proto.
 //
-// Returns A list of 1-D tensors represents the values of the output sparse
-// tensors.A list of 1-D tensors represents the shape of the output sparse
-// tensors.
-func SparseSlice(scope *Scope, indices tf.Output, values tf.Output, shape tf.Output, start tf.Output, size tf.Output) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+// Returns A TensorList of embedding activations containing one Tensor per
+// embedding table in the model.
+func RecvTPUEmbeddingActivations(scope *Scope, num_outputs int64, config string) (outputs []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_outputs": num_outputs, "config": config}
 	opspec := tf.OpSpec{
-		Type: "SparseSlice",
-		Input: []tf.Input{
-			indices, values, shape, start, size,
-		},
+		Type: "RecvTPUEmbeddingActivations",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
+		scope.UpdateErr("RecvTPUEmbeddingActivations", err)
+		return
+	}
+	return outputs
 }
 
-// Reduces sparse updates into the variable referenced by `resource` using the `min` operation.
+// Selects elements from `x` or `y`, depending on `condition`.
 //
-// This operation computes
+// The `x`, and `y` tensors must all have the same shape, and the
+// output will also have that shape.
 //
-//     # Scalar indices
-//     ref[indices, ...] = min(ref[indices, ...], updates[...])
+// The `condition` tensor must be a scalar if `x` and `y` are scalars.
+// If `x` and `y` are vectors or higher rank, then `condition` must be either a
+// scalar, a vector with size matching the first dimension of `x`, or must have
+// the same shape as `x`.
 //
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] = min(ref[indices[i], ...], updates[i, ...])
+// The `condition` tensor acts as a mask that chooses, based on the value at each
+// element, whether the corresponding element / row in the output should be
+// taken from `x` (if true) or `y` (if false).
 //
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] = min(ref[indices[i, ..., j], ...], updates[i, ..., j, ...])
+// If `condition` is a vector and `x` and `y` are higher rank matrices, then
+// it chooses which row (outer dimension) to copy from `x` and `y`.
+// If `condition` has the same shape as `x` and `y`, then it chooses which
+// element to copy from `x` and `y`.
 //
-// Duplicate entries are handled correctly: if multiple `indices` reference
-// the same location, their contributions are combined.
+// For example:
 //
-// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+// ```python
+// # 'condition' tensor is [[True,  False]
+// #                        [False, True]]
+// # 't' is [[1, 2],
+// #         [3, 4]]
+// # 'e' is [[5, 6],
+// #         [7, 8]]
+// select(condition, t, e)  # => [[1, 6], [7, 4]]
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
-// </div>
+//
+// # 'condition' tensor is [True, False]
+// # 't' is [[1, 2],
+// #         [3, 4]]
+// # 'e' is [[5, 6],
+// #         [7, 8]]
+// select(condition, t, e) ==> [[1, 2],
+//                              [7, 8]]
+//
+// ```
 //
 // Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
 //
-// Returns the created operation.
-func ResourceScatterMin(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+//	x: = A `Tensor` which may have the same shape as `condition`.
+// If `condition` is rank 1, `x` may have higher rank,
+// but its first dimension must match the size of `condition`.
+//	y: = A `Tensor` with the same type and shape as `x`.
+//
+// Returns = A `Tensor` with the same type and shape as `x` and `y`.
+func Select(scope *Scope, condition tf.Output, x tf.Output, y tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterMin",
+		Type: "Select",
 		Input: []tf.Input{
-			resource, indices, updates,
+			condition, x, y,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Reshapes a quantized tensor as per the Reshape op.
+// Returns the set of files matching one or more glob patterns.
 //
-// ```
+// Note that this routine only supports wildcard characters in the
+// basename portion of the pattern, not in the directory portion.
+// Note also that the order of filenames returned can be non-deterministic.
 //
 // Arguments:
+//	pattern: Shell wildcard pattern(s). Scalar or vector of type string.
 //
-//	shape: Defines the shape of the output tensor.
-//	input_min: The minimum value of the input.
-//	input_max: The maximum value of the input.
-//
-// Returns This value is copied from input_min.This value is copied from input_max.
-func QuantizedReshape(scope *Scope, tensor tf.Output, shape tf.Output, input_min tf.Output, input_max tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+// Returns A vector of matching filenames.
+func MatchingFiles(scope *Scope, pattern tf.Output) (filenames tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedReshape",
+		Type: "MatchingFiles",
 		Input: []tf.Input{
-			tensor, shape, input_min, input_max,
+			pattern,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// StringSplitAttr is an optional argument to StringSplit.
-type StringSplitAttr func(optionalAttr)
+// SqueezeAttr is an optional argument to Squeeze.
+type SqueezeAttr func(optionalAttr)
 
-// StringSplitSkipEmpty sets the optional skip_empty attribute to value.
+// SqueezeAxis sets the optional axis attribute to value.
 //
-// value: A `bool`. If `True`, skip the empty strings from the result.
-// If not specified, defaults to true
-func StringSplitSkipEmpty(value bool) StringSplitAttr {
+// value: If specified, only squeezes the dimensions listed. The dimension
+// index starts at 0. It is an error to squeeze a dimension that is not 1. Must
+// be in the range `[-rank(input), rank(input))`.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func SqueezeAxis(value []int64) SqueezeAttr {
 	return func(m optionalAttr) {
-		m["skip_empty"] = value
+		m["squeeze_dims"] = value
 	}
 }
 
-// Split elements of `input` based on `delimiter` into a `SparseTensor`.
-//
-// Let N be the size of source (typically N will be the batch size). Split each
-// element of `input` based on `delimiter` and return a `SparseTensor`
-// containing the splitted tokens. Empty tokens are ignored.
+// Removes dimensions of size 1 from the shape of a tensor.
 //
-// `delimiter` can be empty, or a string of split characters. If `delimiter` is an
-//  empty string, each element of `input` is split into individual single-byte
-//  character strings, including splitting of UTF-8 multibyte sequences. Otherwise
-//  every character of `delimiter` is a potential split point.
+// Given a tensor `input`, this operation returns a tensor of the same type with
+// all dimensions of size 1 removed. If you don't want to remove all size 1
+// dimensions, you can remove specific size 1 dimensions by specifying
+// `axis`.
 //
 // For example:
-//   N = 2, input[0] is 'hello world' and input[1] is 'a b c', then the output
-//   will be
 //
-//   indices = [0, 0;
-//              0, 1;
-//              1, 0;
-//              1, 1;
-//              1, 2]
-//   shape = [2, 3]
-//   values = ['hello', 'world', 'a', 'b', 'c']
+// ```
+// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
+// shape(squeeze(t)) ==> [2, 3]
+// ```
+//
+// Or, to remove specific size 1 dimensions:
+//
+// ```
+// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
+// shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1]
+// ```
 //
 // Arguments:
-//	input: 1-D. Strings to split.
-//	delimiter: 0-D. Delimiter characters (bytes), or empty string.
+//	input: The `input` to squeeze.
 //
-// Returns A dense matrix of int64 representing the indices of the sparse tensor.A vector of strings corresponding to the splited values.a length-2 vector of int64 representing the shape of the sparse
-// tensor, where the first value is N and the second value is the maximum number
-// of tokens in a single input entry.
-func StringSplit(scope *Scope, input tf.Output, delimiter tf.Output, optional ...StringSplitAttr) (indices tf.Output, values tf.Output, shape tf.Output) {
+// Returns Contains the same data as `input`, but has one or more dimensions of
+// size 1 removed.
+func Squeeze(scope *Scope, input tf.Output, optional ...SqueezeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14137,62 +18134,48 @@ func StringSplit(scope *Scope, input tf.Output, delimiter tf.Output, optional ..
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StringSplit",
+		Type: "Squeeze",
 		Input: []tf.Input{
-			input, delimiter,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// ResourceSparseApplyMomentumAttr is an optional argument to ResourceSparseApplyMomentum.
-type ResourceSparseApplyMomentumAttr func(optionalAttr)
+// ResourceApplyAdadeltaAttr is an optional argument to ResourceApplyAdadelta.
+type ResourceApplyAdadeltaAttr func(optionalAttr)
 
-// ResourceSparseApplyMomentumUseLocking sets the optional use_locking attribute to value.
+// ResourceApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
+// value: If True, updating of the var, accum and update_accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
 // If not specified, defaults to false
-func ResourceSparseApplyMomentumUseLocking(value bool) ResourceSparseApplyMomentumAttr {
+func ResourceApplyAdadeltaUseLocking(value bool) ResourceApplyAdadeltaAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// ResourceSparseApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
-//
-// value: If `True`, the tensor passed to compute grad will be
-// var - lr * momentum * accum, so in the end, the var you get is actually
-// var - lr * momentum * accum.
-// If not specified, defaults to false
-func ResourceSparseApplyMomentumUseNesterov(value bool) ResourceSparseApplyMomentumAttr {
-	return func(m optionalAttr) {
-		m["use_nesterov"] = value
-	}
-}
-
-// Update relevant entries in '*var' and '*accum' according to the momentum scheme.
-//
-// Set use_nesterov = True if you want to use Nesterov momentum.
-//
-// That is for rows we have grad for, we update var and accum as follows:
+// Update '*var' according to the adadelta scheme.
 //
-// accum = accum * momentum + grad
-// var -= lr * accum
+// accum = rho() * accum + (1 - rho()) * grad.square();
+// update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad;
+// update_accum = rho() * update_accum + (1 - rho()) * update.square();
+// var -= update;
 //
 // Arguments:
 //	var_: Should be from a Variable().
 //	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
+//	accum_update: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay factor. Must be a scalar.
+//	epsilon: Constant factor. Must be a scalar.
 //	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	momentum: Momentum. Must be a scalar.
 //
 // Returns the created operation.
-func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, momentum tf.Output, optional ...ResourceSparseApplyMomentumAttr) (o *tf.Operation) {
+func ResourceApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdadeltaAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14201,258 +18184,360 @@ func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyMomentum",
+		Type: "ResourceApplyAdadelta",
 		Input: []tf.Input{
-			var_, accum, lr, grad, indices, momentum,
+			var_, accum, accum_update, lr, rho, epsilon, grad,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Returns the complex conjugate of a complex number.
+// NonMaxSuppressionAttr is an optional argument to NonMaxSuppression.
+type NonMaxSuppressionAttr func(optionalAttr)
+
+// NonMaxSuppressionIouThreshold sets the optional iou_threshold attribute to value.
 //
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// complex numbers that are the complex conjugate of each element in `input`. The
-// complex numbers in `input` must be of the form \\(a + bj\\), where *a* is the
-// real part and *b* is the imaginary part.
+// value: A float representing the threshold for deciding whether boxes
+// overlap too much with respect to IOU.
+// If not specified, defaults to 0.5
+func NonMaxSuppressionIouThreshold(value float32) NonMaxSuppressionAttr {
+	return func(m optionalAttr) {
+		m["iou_threshold"] = value
+	}
+}
+
+// Greedily selects a subset of bounding boxes in descending order of score,
 //
-// The complex conjugate returned by this operation is of the form \\(a - bj\\).
+// pruning away boxes that have high intersection-over-union (IOU) overlap
+// with previously selected boxes.  Bounding boxes are supplied as
+// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+// diagonal pair of box corners and the coordinates can be provided as normalized
+// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+// is agnostic to where the origin is in the coordinate system.  Note that this
+// algorithm is invariant to orthogonal transformations and translations
+// of the coordinate system; thus translating or reflections of the coordinate
+// system result in the same boxes being selected by the algorithm.
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather operation`.  For example:
+//   selected_indices = tf.image.non_max_suppression(
+//       boxes, scores, max_output_size, iou_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
 //
-// For example:
+// Arguments:
+//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
 //
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j]
-// ```
-func Conj(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns A 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`.
+func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, optional ...NonMaxSuppressionAttr) (selected_indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Conj",
+		Type: "NonMaxSuppression",
 		Input: []tf.Input{
-			input,
+			boxes, scores, max_output_size,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes numerical negative value element-wise.
-//
-// I.e., \\(y = -x\\).
-func Neg(scope *Scope, x tf.Output) (y tf.Output) {
+// Creates a dataset that emits `components` as a tuple of tensors once.
+func TensorDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Neg",
+		Type: "TensorDataset",
 		Input: []tf.Input{
-			x,
+			tf.OutputList(components),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Execute a sub graph on a remote processor.
-//
-// The graph specifications(such as graph itself, input tensors and output names)
-// are stored as a serialized protocol buffer of RemoteFusedGraphExecuteInfo
-// as serialized_remote_fused_graph_execute_info.
-// The specifications will be passed to a dedicated registered
-// remote fused graph executor.  The executor will send the graph specifications
-// to a remote processor and execute that graph.  The execution results
-// will be passed to consumer nodes as outputs of this node.
+// VariableShapeAttr is an optional argument to VariableShape.
+type VariableShapeAttr func(optionalAttr)
+
+// VariableShapeOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func VariableShapeOutType(value tf.DataType) VariableShapeAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Returns the shape of the variable pointed to by `resource`.
 //
-// Arguments:
-//	inputs: Arbitrary number of tensors with arbitrary data types
+// This operation returns a 1-D integer tensor representing the shape of `input`.
 //
-//	serialized_remote_fused_graph_execute_info: Serialized protocol buffer
-// of RemoteFusedGraphExecuteInfo which contains graph specifications.
+// For example:
 //
-// Returns Arbitrary number of tensors with arbitrary data types
-func RemoteFusedGraphExecute(scope *Scope, inputs []tf.Output, Toutputs []tf.DataType, serialized_remote_fused_graph_execute_info string) (outputs []tf.Output) {
+// ```
+// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
+// shape(t) ==> [2, 2, 3]
+// ```
+func VariableShape(scope *Scope, input tf.Output, optional ...VariableShapeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"Toutputs": Toutputs, "serialized_remote_fused_graph_execute_info": serialized_remote_fused_graph_execute_info}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "RemoteFusedGraphExecute",
+		Type: "VariableShape",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Store the input tensor in the state of the current session.
+//
+// Arguments:
+//	value: The tensor to be stored.
+//
+// Returns The handle for the tensor stored in the session state, represented
+// as a ResourceHandle object.
+func GetSessionHandleV2(scope *Scope, value tf.Output) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
-		scope.UpdateErr("RemoteFusedGraphExecute", err)
-		return
+	opspec := tf.OpSpec{
+		Type: "GetSessionHandleV2",
+		Input: []tf.Input{
+			value,
+		},
 	}
-	return outputs
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MaxPool3DGradGradAttr is an optional argument to MaxPool3DGradGrad.
-type MaxPool3DGradGradAttr func(optionalAttr)
+// ResourceApplyAdamAttr is an optional argument to ResourceApplyAdam.
+type ResourceApplyAdamAttr func(optionalAttr)
 
-// MaxPool3DGradGradDataFormat sets the optional data_format attribute to value.
+// ResourceApplyAdamUseLocking sets the optional use_locking attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func MaxPool3DGradGradDataFormat(value string) MaxPool3DGradGradAttr {
+// value: If `True`, updating of the var, m, and v tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyAdamUseLocking(value bool) ResourceApplyAdamAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Computes second-order gradients of the maxpooling function.
+// ResourceApplyAdamUseNesterov sets the optional use_nesterov attribute to value.
+//
+// value: If `True`, uses the nesterov update.
+// If not specified, defaults to false
+func ResourceApplyAdamUseNesterov(value bool) ResourceApplyAdamAttr {
+	return func(m optionalAttr) {
+		m["use_nesterov"] = value
+	}
+}
+
+// Update '*var' according to the Adam algorithm.
+//
+// $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
+// $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+// $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+// $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
 //
 // Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	v: Should be from a Variable().
+//	beta1_power: Must be a scalar.
+//	beta2_power: Must be a scalar.
+//	lr: Scaling factor. Must be a scalar.
+//	beta1: Momentum factor. Must be a scalar.
+//	beta2: Momentum factor. Must be a scalar.
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
 //
-// Returns Gradients of gradients w.r.t. the input to `max_pool`.
-func MaxPool3DGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradGradAttr) (output tf.Output) {
+// Returns the created operation.
+func ResourceApplyAdam(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, beta2_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdamAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPool3DGradGrad",
+		Type: "ResourceApplyAdam",
 		Input: []tf.Input{
-			orig_input, orig_output, grad,
+			var_, m, v, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Conv3DBackpropFilterV2Attr is an optional argument to Conv3DBackpropFilterV2.
-type Conv3DBackpropFilterV2Attr func(optionalAttr)
+// SdcaOptimizerAttr is an optional argument to SdcaOptimizer.
+type SdcaOptimizerAttr func(optionalAttr)
 
-// Conv3DBackpropFilterV2DataFormat sets the optional data_format attribute to value.
+// SdcaOptimizerAdaptative sets the optional adaptative attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr {
+// value: Whether to use Adaptive SDCA for the inner loop.
+// If not specified, defaults to true
+func SdcaOptimizerAdaptative(value bool) SdcaOptimizerAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["adaptative"] = value
 	}
 }
 
-// Conv3DBackpropFilterV2Dilations sets the optional dilations attribute to value.
+// Distributed version of Stochastic Dual Coordinate Ascent (SDCA) optimizer for
 //
-// value: 1-D tensor of length 5.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
-func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes the gradients of 3-D convolution with respect to the filter.
+// linear models with L1 + L2 regularization. As global optimization objective is
+// strongly-convex, the optimizer optimizes the dual objective at each step. The
+// optimizer applies each update one example at a time. Examples are sampled
+// uniformly, and the optimizer is learning rate free and enjoys linear convergence
+// rate.
+//
+// [Proximal Stochastic Dual Coordinate Ascent](http://arxiv.org/pdf/1211.2717v1.pdf).<br>
+// Shai Shalev-Shwartz, Tong Zhang. 2012
+//
+// $$Loss Objective = \sum f_{i} (wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|$$
+//
+// [Adding vs. Averaging in Distributed Primal-Dual Optimization](http://arxiv.org/abs/1502.03508).<br>
+// Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan,
+// Peter Richtarik, Martin Takac. 2015
+//
+// [Stochastic Dual Coordinate Ascent with Adaptive Probabilities](https://arxiv.org/abs/1502.08053).<br>
+// Dominik Csiba, Zheng Qu, Peter Richtarik. 2015
 //
 // Arguments:
-//	input: Shape `[batch, depth, rows, cols, in_channels]`.
-//	filter_sizes: An integer vector representing the tensor shape of `filter`,
-// where `filter` is a 5-D
-// `[filter_depth, filter_height, filter_width, in_channels, out_channels]`
-// tensor.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropFilterV2(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterV2Attr) (output tf.Output) {
+//	sparse_example_indices: a list of vectors which contain example indices.
+//	sparse_feature_indices: a list of vectors which contain feature indices.
+//	sparse_feature_values: a list of vectors which contains feature value
+// associated with each feature group.
+//	dense_features: a list of matrices which contains the dense feature values.
+//	example_weights: a vector which contains the weight associated with each
+// example.
+//	example_labels: a vector which contains the label/target associated with each
+// example.
+//	sparse_indices: a list of vectors where each value is the indices which has
+// corresponding weights in sparse_weights. This field maybe omitted for the
+// dense approach.
+//	sparse_weights: a list of vectors where each value is the weight associated with
+// a sparse feature group.
+//	dense_weights: a list of vectors where the values are the weights associated
+// with a dense feature group.
+//	example_state_data: a list of vectors containing the example state data.
+//	loss_type: Type of the primal loss. Currently SdcaSolver supports logistic,
+// squared and hinge losses.
+//	l1: Symmetric l1 regularization strength.
+//	l2: Symmetric l2 regularization strength.
+//	num_loss_partitions: Number of partitions of the global loss function.
+//	num_inner_iterations: Number of iterations per mini-batch.
+//
+// Returns a list of vectors containing the updated example state
+// data.a list of vectors where each value is the delta
+// weights associated with a sparse feature group.a list of vectors where the values are the delta
+// weights associated with a dense feature group.
+func SdcaOptimizer(scope *Scope, sparse_example_indices []tf.Output, sparse_feature_indices []tf.Output, sparse_feature_values []tf.Output, dense_features []tf.Output, example_weights tf.Output, example_labels tf.Output, sparse_indices []tf.Output, sparse_weights []tf.Output, dense_weights []tf.Output, example_state_data tf.Output, loss_type string, l1 float32, l2 float32, num_loss_partitions int64, num_inner_iterations int64, optional ...SdcaOptimizerAttr) (out_example_state_data tf.Output, out_delta_sparse_weights []tf.Output, out_delta_dense_weights []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"loss_type": loss_type, "l1": l1, "l2": l2, "num_loss_partitions": num_loss_partitions, "num_inner_iterations": num_inner_iterations}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropFilterV2",
+		Type: "SdcaOptimizer",
 		Input: []tf.Input{
-			input, filter_sizes, out_backprop,
+			tf.OutputList(sparse_example_indices), tf.OutputList(sparse_feature_indices), tf.OutputList(sparse_feature_values), tf.OutputList(dense_features), example_weights, example_labels, tf.OutputList(sparse_indices), tf.OutputList(sparse_weights), tf.OutputList(dense_weights), example_state_data,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// FakeQuantWithMinMaxVarsAttr is an optional argument to FakeQuantWithMinMaxVars.
-type FakeQuantWithMinMaxVarsAttr func(optionalAttr)
-
-// FakeQuantWithMinMaxVarsNumBits sets the optional num_bits attribute to value.
-// If not specified, defaults to 8
-func FakeQuantWithMinMaxVarsNumBits(value int64) FakeQuantWithMinMaxVarsAttr {
-	return func(m optionalAttr) {
-		m["num_bits"] = value
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	out_example_state_data = op.Output(idx)
+	if out_delta_sparse_weights, idx, err = makeOutputList(op, idx, "out_delta_sparse_weights"); err != nil {
+		scope.UpdateErr("SdcaOptimizer", err)
+		return
+	}
+	if out_delta_dense_weights, idx, err = makeOutputList(op, idx, "out_delta_dense_weights"); err != nil {
+		scope.UpdateErr("SdcaOptimizer", err)
+		return
 	}
+	return out_example_state_data, out_delta_sparse_weights, out_delta_dense_weights
 }
 
-// FakeQuantWithMinMaxVarsNarrowRange sets the optional narrow_range attribute to value.
+// ExperimentalParseExampleDatasetAttr is an optional argument to ExperimentalParseExampleDataset.
+type ExperimentalParseExampleDatasetAttr func(optionalAttr)
+
+// ExperimentalParseExampleDatasetSloppy sets the optional sloppy attribute to value.
 // If not specified, defaults to false
-func FakeQuantWithMinMaxVarsNarrowRange(value bool) FakeQuantWithMinMaxVarsAttr {
+func ExperimentalParseExampleDatasetSloppy(value bool) ExperimentalParseExampleDatasetAttr {
 	return func(m optionalAttr) {
-		m["narrow_range"] = value
+		m["sloppy"] = value
 	}
 }
 
-// Fake-quantize the 'inputs' tensor of type float via global float scalars `min`
+// Transforms `input_dataset` containing `Example` protos as vectors of DT_STRING into a dataset of `Tensor` or `SparseTensor` objects representing the parsed features.
 //
-// and `max` to 'outputs' tensor of same shape as `inputs`.
+// Arguments:
 //
-// `[min; max]` define the clamping range for the `inputs` data.
-// `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
-// when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
-// then de-quantized and output as floats in `[min; max]` interval.
-// `num_bits` is the bitwidth of the quantization; between 2 and 16, inclusive.
 //
-// This operation has a gradient and thus allows for training `min` and `max`
-// values.
-func FakeQuantWithMinMaxVars(scope *Scope, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsAttr) (outputs tf.Output) {
+//	dense_defaults: A dict mapping string keys to `Tensor`s.
+// The keys of the dict must match the dense_keys of the feature.
+//	sparse_keys: A list of string keys in the examples features.
+// The results for these keys will be returned as `SparseTensor` objects.
+//	dense_keys: A list of Ndense string Tensors (scalars).
+// The keys expected in the Examples features associated with dense values.
+//	sparse_types: A list of `DTypes` of the same length as `sparse_keys`.
+// Only `tf.float32` (`FloatList`), `tf.int64` (`Int64List`),
+// and `tf.string` (`BytesList`) are supported.
+//	dense_shapes: List of tuples with the same length as `dense_keys`.
+// The shape of the data for each dense feature referenced by `dense_keys`.
+// Required for any input tensors identified by `dense_keys`.  Must be
+// either fully defined, or may contain an unknown first dimension.
+// An unknown first dimension means the feature is treated as having
+// a variable number of blocks, and the output shape along this dimension
+// is considered unknown at graph build time.  Padding is applied for
+// minibatch elements smaller than the maximum number of blocks for the
+// given feature along this dimension.
+//	output_types: The type list for the return values.
+//	output_shapes: The list of shapes being produced.
+func ExperimentalParseExampleDataset(scope *Scope, input_dataset tf.Output, num_parallel_calls tf.Output, dense_defaults []tf.Output, sparse_keys []string, dense_keys []string, sparse_types []tf.DataType, dense_shapes []tf.Shape, output_types []tf.DataType, output_shapes []tf.Shape, optional ...ExperimentalParseExampleDatasetAttr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"sparse_keys": sparse_keys, "dense_keys": dense_keys, "sparse_types": sparse_types, "dense_shapes": dense_shapes, "output_types": output_types, "output_shapes": output_shapes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FakeQuantWithMinMaxVars",
+		Type: "ExperimentalParseExampleDataset",
 		Input: []tf.Input{
-			inputs, min, max,
+			input_dataset, num_parallel_calls, tf.OutputList(dense_defaults),
 		},
 		Attrs: attrs,
 	}
@@ -14460,68 +18545,83 @@ func FakeQuantWithMinMaxVars(scope *Scope, inputs tf.Output, min tf.Output, max
 	return op.Output(0)
 }
 
-// ResourceScatterNdUpdateAttr is an optional argument to ResourceScatterNdUpdate.
-type ResourceScatterNdUpdateAttr func(optionalAttr)
-
-// ResourceScatterNdUpdateUseLocking sets the optional use_locking attribute to value.
-//
-// value: An optional bool. Defaults to True. If True, the assignment will
-// be protected by a lock; otherwise the behavior is undefined,
-// but may exhibit less contention.
-// If not specified, defaults to true
-func ResourceScatterNdUpdateUseLocking(value bool) ResourceScatterNdUpdateAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Applies sparse `updates` to individual values or slices within a given
-//
-// variable according to `indices`.
-//
-// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
-//
-// `indices` must be integer tensor, containing indices into `ref`.
-// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+// 2D real-valued fast Fourier transform.
 //
-// The innermost dimension of `indices` (with length `K`) corresponds to
-// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
-// dimension of `ref`.
+// Computes the 2-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most 2 dimensions of `input`.
 //
-// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT2D` only returns the
+// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
+// of `output`: the zero-frequency term, followed by the `fft_length / 2`
+// positive-frequency terms.
 //
-// ```
-// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
-// ```
+// Along each axis `RFFT2D` is computed on, if `fft_length` is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
 //
-// For example, say we want to update 4 scattered elements to a rank-1 tensor to
-// 8 elements. In Python, that update would look like this:
+// Arguments:
+//	input: A float32 tensor.
+//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
 //
-// ```python
-//     ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
-//     indices = tf.constant([[4], [3], [1] ,[7]])
-//     updates = tf.constant([9, 10, 11, 12])
-//     update = tf.scatter_nd_update(ref, indices, updates)
-//     with tf.Session() as sess:
-//       print sess.run(update)
-// ```
+// Returns A complex64 tensor of the same rank as `input`. The inner-most 2
+//   dimensions of `input` are replaced with their 2D Fourier transform. The
+//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
+//   components.
 //
-// The resulting update to ref would look like this:
+// @compatibility(numpy)
+// Equivalent to np.fft.rfft2
+// @end_compatibility
+func RFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "RFFT2D",
+		Input: []tf.Input{
+			input, fft_length,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceSparseApplyFtrlAttr is an optional argument to ResourceSparseApplyFtrl.
+type ResourceSparseApplyFtrlAttr func(optionalAttr)
+
+// ResourceSparseApplyFtrlUseLocking sets the optional use_locking attribute to value.
 //
-//     [1, 11, 3, 10, 9, 6, 7, 12]
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyFtrlUseLocking(value bool) ResourceSparseApplyFtrlAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
 //
-// See `tf.scatter_nd` for more details about how to make updates to
-// slices.
+// That is for rows we have grad for, we update var, accum and linear as follows:
+// accum_new = accum + grad * grad
+// linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
 //
 // Arguments:
-//	ref: A resource handle. Must be from a VarHandleOp.
-//	indices: A Tensor. Must be one of the following types: int32, int64.
-// A tensor of indices into ref.
-//	updates: A Tensor. Must have the same type as ref. A tensor of updated
-// values to add to ref.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	lr_power: Scaling factor. Must be a scalar.
 //
 // Returns the created operation.
-func ResourceScatterNdUpdate(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdUpdateAttr) (o *tf.Operation) {
+func ResourceSparseApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14530,249 +18630,278 @@ func ResourceScatterNdUpdate(scope *Scope, ref tf.Output, indices tf.Output, upd
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterNdUpdate",
+		Type: "ResourceSparseApplyFtrl",
 		Input: []tf.Input{
-			ref, indices, updates,
+			var_, accum, linear, grad, indices, lr, l1, l2, lr_power,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Produces a string handle for the given MultiDeviceIterator.
-//
-// Arguments:
-//	multi_device_iterator: A MultiDeviceIterator resource.
+// Returns which elements of x are Inf.
 //
-// Returns A string representing the resource.
-func MultiDeviceIteratorToStringHandle(scope *Scope, multi_device_iterator tf.Output) (string_handle tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.isinf
+// @end_compatibility
+func IsInf(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MultiDeviceIteratorToStringHandle",
+		Type: "IsInf",
 		Input: []tf.Input{
-			multi_device_iterator,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Applies softmax to a batched N-D `SparseTensor`.
+// Gather ragged slices from `params` axis `0` according to `indices`.
 //
-// The inputs represent an N-D SparseTensor  with logical shape `[..., B, C]`
-// (where `N >= 2`), and with indices sorted in the canonical lexicographic order.
+// Outputs a `RaggedTensor` output composed from `output_dense_values` and
+// `output_nested_splits`, such that:
 //
-// This op is equivalent to applying the normal `tf.nn.softmax()` to each innermost
-// logical submatrix with shape `[B, C]`, but with the catch that *the implicitly
-// zero elements do not participate*.  Specifically, the algorithm is equivalent
-// to the following:
+// ```python
+// output.shape = indices.shape + params.shape[1:]
+// output.ragged_rank = indices.shape.ndims + params.ragged_rank
+// output[i...j, d0...dn] = params[indices[i...j], d0...dn]
+// ```
 //
-//   (1) Applies `tf.nn.softmax()` to a densified view of each innermost submatrix
-//       with shape `[B, C]`, along the size-C dimension;
-//   (2) Masks out the original implicitly-zero locations;
-//   (3) Renormalizes the remaining elements.
+// where
+//
+// * `params =
+//    ragged.from_nested_row_splits(params_dense_values, params_nested_splits)`
+//    provides the values that should be gathered.
+// * `indices` ia a dense tensor with dtype `int32` or `int64`, indicating which
+//    values should be gathered.
+// * `output =
+//    ragged.from_nested_row_splits(output_dense_values, output_nested_splits)`
+//    is the output tensor.
+//
+// (Note: This c++ op is used to implement the higher-level python
+// `tf.ragged.gather` op, which also supports ragged indices.)
 //
-// Hence, the `SparseTensor` result has exactly the same non-zero indices and
-// shape.
 //
 // Arguments:
-//	sp_indices: 2-D.  `NNZ x R` matrix with the indices of non-empty values in a
-// SparseTensor, in canonical ordering.
-//	sp_values: 1-D.  `NNZ` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	params_nested_splits: The `nested_row_splits` tensors that define the row-partitioning for the
+// `params` RaggedTensor input.
+//	params_dense_values: The `flat_values` for the `params` RaggedTensor. There was a terminology change
+// at the python level from dense_values to flat_values, so dense_values is the
+// deprecated name.
+//	indices: Indices in the outermost dimension of `params` of the values that should be
+// gathered.
+//	OUTPUT_RAGGED_RANK: The ragged rank of the output RaggedTensor. `output_nested_splits` will contain
+// this number of `row_splits` tensors. This value should equal
+// `indices.shape.ndims + params.ragged_rank - 1`.
 //
-// Returns 1-D.  The `NNZ` values for the result `SparseTensor`.
-func SparseSoftmax(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output) (output tf.Output) {
+// Returns The `nested_row_splits` tensors that define the row-partitioning for the
+// returned RaggedTensor.The `flat_values` for the returned RaggedTensor.
+func RaggedGather(scope *Scope, params_nested_splits []tf.Output, params_dense_values tf.Output, indices tf.Output, OUTPUT_RAGGED_RANK int64) (output_nested_splits []tf.Output, output_dense_values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"OUTPUT_RAGGED_RANK": OUTPUT_RAGGED_RANK}
 	opspec := tf.OpSpec{
-		Type: "SparseSoftmax",
+		Type: "RaggedGather",
 		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape,
+			tf.OutputList(params_nested_splits), params_dense_values, indices,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output_nested_splits, idx, err = makeOutputList(op, idx, "output_nested_splits"); err != nil {
+		scope.UpdateErr("RaggedGather", err)
+		return
+	}
+	output_dense_values = op.Output(idx)
+	return output_nested_splits, output_dense_values
 }
 
-// Partitions `data` into `num_partitions` tensors using indices from `partitions`.
-//
-// For each index tuple `js` of size `partitions.ndim`, the slice `data[js, ...]`
-// becomes part of `outputs[partitions[js]]`.  The slices with `partitions[js] = i`
-// are placed in `outputs[i]` in lexicographic order of `js`, and the first
-// dimension of `outputs[i]` is the number of entries in `partitions` equal to `i`.
-// In detail,
-//
-// ```python
-//     outputs[i].shape = [sum(partitions == i)] + data.shape[partitions.ndim:]
-//
-//     outputs[i] = pack([data[js, ...] for js if partitions[js] == i])
-// ```
-//
-// `data.shape` must start with `partitions.shape`.
-//
-// For example:
-//
-// ```python
-//     # Scalar partitions.
-//     partitions = 1
-//     num_partitions = 2
-//     data = [10, 20]
-//     outputs[0] = []  # Empty with shape [0, 2]
-//     outputs[1] = [[10, 20]]
+// Greedily selects a subset of bounding boxes in descending order of score,
 //
-//     # Vector partitions.
-//     partitions = [0, 0, 1, 1, 0]
-//     num_partitions = 2
-//     data = [10, 20, 30, 40, 50]
-//     outputs[0] = [10, 20, 50]
-//     outputs[1] = [30, 40]
-// ```
+// pruning away boxes that have high intersection-over-union (IOU) overlap
+// with previously selected boxes.  Bounding boxes are supplied as
+// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+// diagonal pair of box corners and the coordinates can be provided as normalized
+// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+// is agnostic to where the origin is in the coordinate system.  Note that this
+// algorithm is invariant to orthogonal transformations and translations
+// of the coordinate system; thus translating or reflections of the coordinate
+// system result in the same boxes being selected by the algorithm.
 //
-// See `dynamic_stitch` for an example on how to merge partitions back.
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather operation`.  For example:
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicPartition.png" alt>
-// </div>
+//   selected_indices = tf.image.non_max_suppression_v2(
+//       boxes, scores, max_output_size, iou_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
 //
 // Arguments:
+//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
+//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
+// boxes overlap too much with respect to IOU.
 //
-//	partitions: Any shape.  Indices in the range `[0, num_partitions)`.
-//	num_partitions: The number of partitions to output.
-func DynamicPartition(scope *Scope, data tf.Output, partitions tf.Output, num_partitions int64) (outputs []tf.Output) {
+// Returns A 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`.
+func NonMaxSuppressionV2(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output) (selected_indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_partitions": num_partitions}
 	opspec := tf.OpSpec{
-		Type: "DynamicPartition",
+		Type: "NonMaxSuppressionV2",
 		Input: []tf.Input{
-			data, partitions,
+			boxes, scores, max_output_size, iou_threshold,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
-		scope.UpdateErr("DynamicPartition", err)
-		return
-	}
-	return outputs
+	return op.Output(0)
 }
 
-// ResourceApplyAdagradAttr is an optional argument to ResourceApplyAdagrad.
-type ResourceApplyAdagradAttr func(optionalAttr)
+// TruncatedNormalAttr is an optional argument to TruncatedNormal.
+type TruncatedNormalAttr func(optionalAttr)
 
-// ResourceApplyAdagradUseLocking sets the optional use_locking attribute to value.
+// TruncatedNormalSeed sets the optional seed attribute to value.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyAdagradUseLocking(value bool) ResourceApplyAdagradAttr {
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func TruncatedNormalSeed(value int64) TruncatedNormalAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["seed"] = value
 	}
 }
 
-// ResourceApplyAdagradUpdateSlots sets the optional update_slots attribute to value.
-// If not specified, defaults to true
-func ResourceApplyAdagradUpdateSlots(value bool) ResourceApplyAdagradAttr {
+// TruncatedNormalSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func TruncatedNormalSeed2(value int64) TruncatedNormalAttr {
 	return func(m optionalAttr) {
-		m["update_slots"] = value
+		m["seed2"] = value
 	}
 }
 
-// Update '*var' according to the adagrad scheme.
+// Outputs random values from a truncated normal distribution.
 //
-// accum += grad * grad
-// var -= lr * grad * (1 / sqrt(accum))
+// The generated values follow a normal distribution with mean 0 and standard
+// deviation 1, except that values whose magnitude is more than 2 standard
+// deviations from the mean are dropped and re-picked.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	grad: The gradient.
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
 //
-// Returns the created operation.
-func ResourceApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, optional ...ResourceApplyAdagradAttr) (o *tf.Operation) {
+// Returns A tensor of the specified shape filled with random truncated normal
+// values.
+func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...TruncatedNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdagrad",
+		Type: "TruncatedNormal",
 		Input: []tf.Input{
-			var_, accum, lr, grad,
+			shape,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Retrieves the tree ensemble resource stamp token, number of trees and growing statistics.
+// StringToNumberAttr is an optional argument to StringToNumber.
+type StringToNumberAttr func(optionalAttr)
+
+// StringToNumberOutType sets the optional out_type attribute to value.
 //
-// Arguments:
-//	tree_ensemble_handle: Handle to the tree ensemble.
+// value: The numeric type to interpret each string in `string_tensor` as.
+// If not specified, defaults to DT_FLOAT
+func StringToNumberOutType(value tf.DataType) StringToNumberAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Converts each string in the input Tensor to the specified numeric type.
 //
-// Returns Stamp token of the tree ensemble resource.The number of trees in the tree ensemble resource.The number of trees that were finished successfully.The number of layers we attempted to build (but not necessarily succeeded).Rank size 2 tensor that contains start and end ids of the nodes in the latest
-// layer.
-func BoostedTreesGetEnsembleStates(scope *Scope, tree_ensemble_handle tf.Output) (stamp_token tf.Output, num_trees tf.Output, num_finalized_trees tf.Output, num_attempted_layers tf.Output, last_layer_nodes_range tf.Output) {
+// (Note that int32 overflow results in an error while float overflow
+// results in a rounded value.)
+//
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToNumberAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "BoostedTreesGetEnsembleStates",
+		Type: "StringToNumber",
 		Input: []tf.Input{
-			tree_ensemble_handle,
+			string_tensor,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return op.Output(0)
 }
 
-// ResourceApplyPowerSignAttr is an optional argument to ResourceApplyPowerSign.
-type ResourceApplyPowerSignAttr func(optionalAttr)
+// ResourceApplyFtrlV2Attr is an optional argument to ResourceApplyFtrlV2.
+type ResourceApplyFtrlV2Attr func(optionalAttr)
 
-// ResourceApplyPowerSignUseLocking sets the optional use_locking attribute to value.
+// ResourceApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
 //
-// value: If `True`, updating of the var and m tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
 // contention.
 // If not specified, defaults to false
-func ResourceApplyPowerSignUseLocking(value bool) ResourceApplyPowerSignAttr {
+func ResourceApplyFtrlV2UseLocking(value bool) ResourceApplyFtrlV2Attr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update '*var' according to the AddSign update.
+// Update '*var' according to the Ftrl-proximal scheme.
 //
-// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-// update <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g
-// variable <- variable - lr_t * update
+// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
+// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
+// linear += grad_with_shrinkage +
+//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	logbase: Must be a scalar.
-//	sign_decay: Must be a scalar.
-//	beta: Must be a scalar.
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
 //	grad: The gradient.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regulariation. Must be a scalar.
+//	l2: L2 shrinkage regulariation. Must be a scalar.
+//
+//	lr_power: Scaling factor. Must be a scalar.
 //
 // Returns the created operation.
-func ResourceApplyPowerSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, logbase tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyPowerSignAttr) (o *tf.Operation) {
+func ResourceApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14781,220 +18910,326 @@ func ResourceApplyPowerSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Out
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyPowerSign",
+		Type: "ResourceApplyFtrlV2",
 		Input: []tf.Input{
-			var_, m, lr, logbase, sign_decay, beta, grad,
+			var_, accum, linear, grad, lr, l1, l2, l2_shrinkage, lr_power,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// StringFormatAttr is an optional argument to StringFormat.
-type StringFormatAttr func(optionalAttr)
+// SkipgramAttr is an optional argument to Skipgram.
+type SkipgramAttr func(optionalAttr)
 
-// StringFormatTemplate sets the optional template attribute to value.
+// SkipgramWindowSize sets the optional window_size attribute to value.
 //
-// value: A string, the template to format tensor summaries into.
-// If not specified, defaults to "%s"
-func StringFormatTemplate(value string) StringFormatAttr {
+// value: The number of words to predict to the left and right of the target.
+// If not specified, defaults to 5
+func SkipgramWindowSize(value int64) SkipgramAttr {
 	return func(m optionalAttr) {
-		m["template"] = value
+		m["window_size"] = value
 	}
 }
 
-// StringFormatPlaceholder sets the optional placeholder attribute to value.
+// SkipgramMinCount sets the optional min_count attribute to value.
 //
-// value: A string, at each placeholder in the template a subsequent tensor summary will be inserted.
-// If not specified, defaults to "%s"
-func StringFormatPlaceholder(value string) StringFormatAttr {
+// value: The minimum number of word occurrences for it to be included in the
+// vocabulary.
+// If not specified, defaults to 5
+func SkipgramMinCount(value int64) SkipgramAttr {
 	return func(m optionalAttr) {
-		m["placeholder"] = value
+		m["min_count"] = value
 	}
 }
 
-// StringFormatSummarize sets the optional summarize attribute to value.
+// SkipgramSubsample sets the optional subsample attribute to value.
 //
-// value: When formatting the tensor summaries print the first and last summarize entries of each tensor dimension.
-// If not specified, defaults to 3
-func StringFormatSummarize(value int64) StringFormatAttr {
+// value: Threshold for word occurrence. Words that appear with higher
+// frequency will be randomly down-sampled. Set to 0 to disable.
+// If not specified, defaults to 0.001
+func SkipgramSubsample(value float32) SkipgramAttr {
 	return func(m optionalAttr) {
-		m["summarize"] = value
+		m["subsample"] = value
 	}
 }
 
-// Formats a string template using a list of tensors.
+// Parses a text file and creates a batch of examples.
 //
-// Formats a string template using a list of tensors, pretty-printing tensor summaries.
+// DEPRECATED at GraphDef version 19: Moving word2vec into tensorflow_models/tutorials and deprecating its ops here as a result
 //
 // Arguments:
-//	inputs: The list of tensors to format into the placeholder string.
+//	filename: The corpus's text file name.
+//	batch_size: The size of produced batch.
 //
-// Returns = The resulting string scalar.
-func StringFormat(scope *Scope, inputs []tf.Output, optional ...StringFormatAttr) (output tf.Output) {
+// Returns A vector of words in the corpus.Frequencies of words. Sorted in the non-ascending order.Number of words per epoch in the data file.The current epoch number.The total number of words processed so far.A vector of word ids.A vector of word ids.
+func Skipgram(scope *Scope, filename string, batch_size int64, optional ...SkipgramAttr) (vocab_word tf.Output, vocab_freq tf.Output, words_per_epoch tf.Output, current_epoch tf.Output, total_words_processed tf.Output, examples tf.Output, labels tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"filename": filename, "batch_size": batch_size}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StringFormat",
+		Type: "Skipgram",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4), op.Output(5), op.Output(6)
+}
+
+// ThreadUnsafeUnigramCandidateSamplerAttr is an optional argument to ThreadUnsafeUnigramCandidateSampler.
+type ThreadUnsafeUnigramCandidateSamplerAttr func(optionalAttr)
+
+// ThreadUnsafeUnigramCandidateSamplerSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func ThreadUnsafeUnigramCandidateSamplerSeed(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// ThreadUnsafeUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func ThreadUnsafeUnigramCandidateSamplerSeed2(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Generates labels for candidate sampling with a learned unigram distribution.
+//
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
+//
+// For each batch, this op picks a single set of sampled candidate labels.
+//
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
+//
+// Arguments:
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to randomly sample.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
+//
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func ThreadUnsafeUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...ThreadUnsafeUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ThreadUnsafeUnigramCandidateSampler",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			true_classes,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// ShapeAttr is an optional argument to Shape.
-type ShapeAttr func(optionalAttr)
+// MaxPoolV2Attr is an optional argument to MaxPoolV2.
+type MaxPoolV2Attr func(optionalAttr)
 
-// ShapeOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func ShapeOutType(value tf.DataType) ShapeAttr {
+// MaxPoolV2DataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolV2DataFormat(value string) MaxPoolV2Attr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["data_format"] = value
 	}
 }
 
-// Returns the shape of a tensor.
-//
-// This operation returns a 1-D integer tensor representing the shape of `input`.
+// Performs max pooling on the input.
 //
-// For example:
+// Arguments:
+//	input: 4-D input to pool over.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// ```
-// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
-// shape(t) ==> [2, 2, 3]
-// ```
-func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Output) {
+// Returns The max pooled output tensor.
+func MaxPoolV2(scope *Scope, input tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Shape",
+		Type: "MaxPoolV2",
 		Input: []tf.Input{
-			input,
+			input, ksize, strides,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Does nothing. Serves as a control trigger for scheduling.
+//
+// Only useful as a placeholder for control edges.
+//
+// Returns the created operation.
+func ControlTrigger(scope *Scope) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ControlTrigger",
+	}
+	return scope.AddOperation(opspec)
 }
 
-// Computes the power of one value to another.
-//
-// Given a tensor `x` and a tensor `y`, this operation computes \\(x^y\\) for
-// corresponding elements in `x` and `y`. For example:
+// Deprecated. Use TensorArrayReadV3
 //
-// ```
-// # tensor 'x' is [[2, 2]], [3, 3]]
-// # tensor 'y' is [[8, 16], [2, 3]]
-// tf.pow(x, y) ==> [[256, 65536], [9, 27]]
-// ```
-func Pow(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArrayReadV3
+func TensorArrayReadV2(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "Pow",
+		Type: "TensorArrayReadV2",
 		Input: []tf.Input{
-			x, y,
+			handle, index, flow_in,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes fingerprints of the input strings.
+// Batch normalization.
 //
-// Arguments:
-//	input: vector of strings to compute fingerprints on.
+// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
 //
-// Returns a (N,2) shaped matrix where N is the number of elements in the input
-// vector. Each row contains the low and high parts of the fingerprint.
-func SdcaFprint(scope *Scope, input tf.Output) (output tf.Output) {
+// This op is deprecated. Prefer `tf.nn.batch_normalization`.
+//
+// Arguments:
+//	t: A 4D input Tensor.
+//	m: A 1D mean Tensor with size matching the last dimension of t.
+// This is the first output from tf.nn.moments,
+// or a saved moving average thereof.
+//	v: A 1D variance Tensor with size matching the last dimension of t.
+// This is the second output from tf.nn.moments,
+// or a saved moving average thereof.
+//	beta: A 1D beta Tensor with size matching the last dimension of t.
+// An offset to be added to the normalized tensor.
+//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
+// If "scale_after_normalization" is true, this tensor will be multiplied
+// with the normalized tensor.
+//	variance_epsilon: A small float number to avoid dividing by 0.
+//	scale_after_normalization: A bool indicating whether the resulted tensor
+// needs to be multiplied with gamma.
+func BatchNormWithGlobalNormalization(scope *Scope, t tf.Output, m tf.Output, v tf.Output, beta tf.Output, gamma tf.Output, variance_epsilon float32, scale_after_normalization bool) (result tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "SdcaFprint",
+		Type: "BatchNormWithGlobalNormalization",
 		Input: []tf.Input{
-			input,
+			t, m, v, beta, gamma,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// LRNAttr is an optional argument to LRN.
-type LRNAttr func(optionalAttr)
+// AddManySparseToTensorsMapAttr is an optional argument to AddManySparseToTensorsMap.
+type AddManySparseToTensorsMapAttr func(optionalAttr)
 
-// LRNDepthRadius sets the optional depth_radius attribute to value.
+// AddManySparseToTensorsMapContainer sets the optional container attribute to value.
 //
-// value: 0-D.  Half-width of the 1-D normalization window.
-// If not specified, defaults to 5
-func LRNDepthRadius(value int64) LRNAttr {
+// value: The container name for the `SparseTensorsMap` created by this op.
+// If not specified, defaults to ""
+func AddManySparseToTensorsMapContainer(value string) AddManySparseToTensorsMapAttr {
 	return func(m optionalAttr) {
-		m["depth_radius"] = value
+		m["container"] = value
 	}
 }
 
-// LRNBias sets the optional bias attribute to value.
+// AddManySparseToTensorsMapSharedName sets the optional shared_name attribute to value.
 //
-// value: An offset (usually positive to avoid dividing by 0).
-// If not specified, defaults to 1
-func LRNBias(value float32) LRNAttr {
+// value: The shared name for the `SparseTensorsMap` created by this op.
+// If blank, the new Operation's unique name is used.
+// If not specified, defaults to ""
+func AddManySparseToTensorsMapSharedName(value string) AddManySparseToTensorsMapAttr {
 	return func(m optionalAttr) {
-		m["bias"] = value
+		m["shared_name"] = value
 	}
 }
 
-// LRNAlpha sets the optional alpha attribute to value.
+// Add an `N`-minibatch `SparseTensor` to a `SparseTensorsMap`, return `N` handles.
 //
-// value: A scale factor, usually positive.
-// If not specified, defaults to 1
-func LRNAlpha(value float32) LRNAttr {
-	return func(m optionalAttr) {
-		m["alpha"] = value
-	}
-}
-
-// LRNBeta sets the optional beta attribute to value.
+// A `SparseTensor` of rank `R` is represented by three tensors: `sparse_indices`,
+// `sparse_values`, and `sparse_shape`, where
 //
-// value: An exponent.
-// If not specified, defaults to 0.5
-func LRNBeta(value float32) LRNAttr {
-	return func(m optionalAttr) {
-		m["beta"] = value
-	}
-}
-
-// Local Response Normalization.
+// ```sparse_indices.shape[1] == sparse_shape.shape[0] == R```
 //
-// The 4-D `input` tensor is treated as a 3-D array of 1-D vectors (along the last
-// dimension), and each vector is normalized independently.  Within a given vector,
-// each component is divided by the weighted, squared sum of inputs within
-// `depth_radius`.  In detail,
+// An `N`-minibatch of `SparseTensor` objects is represented as a `SparseTensor`
+// having a first `sparse_indices` column taking values between `[0, N)`, where
+// the minibatch size `N == sparse_shape[0]`.
 //
-//     sqr_sum[a, b, c, d] =
-//         sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
-//     output = input / (bias + alpha * sqr_sum) ** beta
+// The input `SparseTensor` must have rank `R` greater than 1, and the first
+// dimension is treated as the minibatch dimension.  Elements of the `SparseTensor`
+// must be sorted in increasing order of this first dimension.  The stored
+// `SparseTensor` objects pointed to by each row of the output `sparse_handles`
+// will have rank `R-1`.
 //
-// For details, see [Krizhevsky et al., ImageNet classification with deep
-// convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
+// The `SparseTensor` values can then be read out as part of a minibatch by passing
+// the given keys as vector elements to `TakeManySparseFromTensorsMap`.  To ensure
+// the correct `SparseTensorsMap` is accessed, ensure that the same
+// `container` and `shared_name` are passed to that Op.  If no `shared_name`
+// is provided here, instead use the *name* of the Operation created by calling
+// `AddManySparseToTensorsMap` as the `shared_name` passed to
+// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
 //
 // Arguments:
-//	input: 4-D.
-func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output) {
+//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
+// `sparse_indices[:, 0]` must be ordered values in `[0, N)`.
+//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
+// The minibatch size `N == sparse_shape[0]`.
+//
+// Returns 1-D.  The handles of the `SparseTensor` now stored in the
+// `SparseTensorsMap`.  Shape: `[N]`.
+func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddManySparseToTensorsMapAttr) (sparse_handles tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15003,9 +19238,9 @@ func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output)
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LRN",
+		Type: "AddManySparseToTensorsMap",
 		Input: []tf.Input{
-			input,
+			sparse_indices, sparse_values, sparse_shape,
 		},
 		Attrs: attrs,
 	}
@@ -15013,332 +19248,241 @@ func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output)
 	return op.Output(0)
 }
 
-// Creates a dataset that zips together `input_datasets`.
-func ZipDataset(scope *Scope, input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "ZipDataset",
-		Input: []tf.Input{
-			tf.OutputList(input_datasets),
-		},
-		Attrs: attrs,
+// TPUReplicateMetadataAttr is an optional argument to TPUReplicateMetadata.
+type TPUReplicateMetadataAttr func(optionalAttr)
+
+// TPUReplicateMetadataNumCoresPerReplica sets the optional num_cores_per_replica attribute to value.
+//
+// value: Number of cores per replica. Used for model parallelism.
+// If not specified, defaults to 1
+func TPUReplicateMetadataNumCoresPerReplica(value int64) TPUReplicateMetadataAttr {
+	return func(m optionalAttr) {
+		m["num_cores_per_replica"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// ResourceSparseApplyAdagradAttr is an optional argument to ResourceSparseApplyAdagrad.
-type ResourceSparseApplyAdagradAttr func(optionalAttr)
-
-// ResourceSparseApplyAdagradUseLocking sets the optional use_locking attribute to value.
+// TPUReplicateMetadataTopology sets the optional topology attribute to value.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyAdagradUseLocking(value bool) ResourceSparseApplyAdagradAttr {
+// value: TopologyProto indicating the topology of the TPU pod slice.
+// If not specified, defaults to ""
+func TPUReplicateMetadataTopology(value string) TPUReplicateMetadataAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["topology"] = value
 	}
 }
 
-// ResourceSparseApplyAdagradUpdateSlots sets the optional update_slots attribute to value.
+// TPUReplicateMetadataUseTpu sets the optional use_tpu attribute to value.
+//
+// value: Whether to place the computation on the TPU.
 // If not specified, defaults to true
-func ResourceSparseApplyAdagradUpdateSlots(value bool) ResourceSparseApplyAdagradAttr {
+func TPUReplicateMetadataUseTpu(value bool) TPUReplicateMetadataAttr {
 	return func(m optionalAttr) {
-		m["update_slots"] = value
+		m["use_tpu"] = value
 	}
 }
 
-// Update relevant entries in '*var' and '*accum' according to the adagrad scheme.
-//
-// That is for rows we have grad for, we update var and accum as follows:
-// accum += grad * grad
-// var -= lr * grad * (1 / sqrt(accum))
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
+// TPUReplicateMetadataDeviceAssignment sets the optional device_assignment attribute to value.
 //
-// Returns the created operation.
-func ResourceSparseApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdagradAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyAdagrad",
-		Input: []tf.Input{
-			var_, accum, lr, grad, indices,
-		},
-		Attrs: attrs,
+// value: The assignment of devices for the computation.
+// If not specified, defaults to <>
+func TPUReplicateMetadataDeviceAssignment(value []int64) TPUReplicateMetadataAttr {
+	return func(m optionalAttr) {
+		m["device_assignment"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// Elementwise computes the bitwise right-shift of `x` and `y`.
+// TPUReplicateMetadataComputationShape sets the optional computation_shape attribute to value.
 //
-// Performs a logical shift for unsigned integer types, and an arithmetic shift
-// for signed integer types.
-//
-// If `y` is negative, or greater than or equal to than the width of `x` in bits
-// the result is implementation defined.
-func RightShift(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: DEPRECATED. Use num_cores_per_replica instead.
+// If not specified, defaults to <>
+func TPUReplicateMetadataComputationShape(value []int64) TPUReplicateMetadataAttr {
+	return func(m optionalAttr) {
+		m["computation_shape"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "RightShift",
-		Input: []tf.Input{
-			x, y,
-		},
+}
+
+// TPUReplicateMetadataHostComputeCore sets the optional host_compute_core attribute to value.
+// If not specified, defaults to <>
+func TPUReplicateMetadataHostComputeCore(value []string) TPUReplicateMetadataAttr {
+	return func(m optionalAttr) {
+		m["host_compute_core"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// TensorListStackAttr is an optional argument to TensorListStack.
-type TensorListStackAttr func(optionalAttr)
+// TPUReplicateMetadataPaddingMap sets the optional padding_map attribute to value.
+// If not specified, defaults to <>
+func TPUReplicateMetadataPaddingMap(value []string) TPUReplicateMetadataAttr {
+	return func(m optionalAttr) {
+		m["padding_map"] = value
+	}
+}
 
-// TensorListStackNumElements sets the optional num_elements attribute to value.
-// If not specified, defaults to -1
-func TensorListStackNumElements(value int64) TensorListStackAttr {
+// TPUReplicateMetadataStepMarkerLocation sets the optional step_marker_location attribute to value.
+// If not specified, defaults to "STEP_MARK_AT_ENTRY"
+func TPUReplicateMetadataStepMarkerLocation(value string) TPUReplicateMetadataAttr {
 	return func(m optionalAttr) {
-		m["num_elements"] = value
+		m["step_marker_location"] = value
 	}
 }
 
-// Stacks all tensors in the list.
+// Metadata indicaitng how the TPU computation should be replicated.
 //
-// Requires that all tensors have the same shape.
-//
-// input_handle: the input list
-// tensor: the gathered result
-// num_elements: optional. If not -1, the number of elements in the list.
+// Arguments:
+//	num_replicas: Number of replicas of the computation
 //
-func TensorListStack(scope *Scope, input_handle tf.Output, element_dtype tf.DataType, optional ...TensorListStackAttr) (tensor tf.Output) {
+// Returns the created operation.
+func TPUReplicateMetadata(scope *Scope, num_replicas int64, optional ...TPUReplicateMetadataAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	attrs := map[string]interface{}{"num_replicas": num_replicas}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorListStack",
-		Input: []tf.Input{
-			input_handle,
-		},
+		Type: "TPUReplicateMetadata",
+
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// StatelessRandomUniformAttr is an optional argument to StatelessRandomUniform.
-type StatelessRandomUniformAttr func(optionalAttr)
+// LoadTPUEmbeddingFTRLParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingFTRLParametersGradAccumDebug.
+type LoadTPUEmbeddingFTRLParametersGradAccumDebugAttr func(optionalAttr)
 
-// StatelessRandomUniformDtype sets the optional dtype attribute to value.
+// LoadTPUEmbeddingFTRLParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatelessRandomUniformDtype(value tf.DataType) StatelessRandomUniformAttr {
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingFTRLParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingFTRLParametersGradAccumDebugAttr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["table_id"] = value
 	}
 }
 
-// Outputs deterministic pseudorandom random values from a uniform distribution.
-//
-// The generated values follow a uniform distribution in the range `[0, 1)`. The
-// lower bound 0 is included in the range, while the upper bound 1 is excluded.
+// LoadTPUEmbeddingFTRLParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingFTRLParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingFTRLParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load FTRL embedding parameters with debug support.
 //
-// The outputs are a deterministic function of `shape` and `seed`.
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
+//	parameters: Value of parameters used in the FTRL optimization algorithm.
+//	accumulators: Value of accumulators used in the FTRL optimization algorithm.
+//	linears: Value of linears used in the FTRL optimization algorithm.
+//	gradient_accumulators: Value of gradient_accumulators used in the FTRL optimization algorithm.
 //
-// Returns Random values with specified shape.
-func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomUniformAttr) (output tf.Output) {
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingFTRLParametersGradAccumDebug(scope *Scope, parameters tf.Output, accumulators tf.Output, linears tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingFTRLParametersGradAccumDebugAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StatelessRandomUniform",
+		Type: "LoadTPUEmbeddingFTRLParametersGradAccumDebug",
 		Input: []tf.Input{
-			shape, seed,
+			parameters, accumulators, linears, gradient_accumulators,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Makes its input available to the next iteration.
+// Concatenates tensors along one dimension.
 //
 // Arguments:
-//	data: The tensor to be made available to the next iteration.
+//	values: List of `N` Tensors to concatenate. Their ranks and types must match,
+// and their sizes must match in all dimensions except `concat_dim`.
+//	axis: 0-D.  The dimension along which to concatenate.  Must be in the
+// range [-rank(values), rank(values)).
 //
-// Returns The same tensor as `data`.
-func NextIteration(scope *Scope, data tf.Output) (output tf.Output) {
+// Returns A `Tensor` with the concatenation of values stacked along the
+// `concat_dim` dimension.  This tensor's shape matches that of `values` except
+// in `concat_dim` where it has the sum of the sizes.
+func ConcatV2(scope *Scope, values []tf.Output, axis tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "NextIteration",
+		Type: "ConcatV2",
 		Input: []tf.Input{
-			data,
+			tf.OutputList(values), axis,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Output a fact about factorials.
-func Fact(scope *Scope) (fact tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Fact",
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Deserialize `SparseTensor` objects.
-//
-// The input `serialized_sparse` must have the shape `[?, ?, ..., ?, 3]` where
-// the last dimension stores serialized `SparseTensor` objects and the other N
-// dimensions (N >= 0) correspond to a batch. The ranks of the original
-// `SparseTensor` objects must all match. When the final `SparseTensor` is
-// created, its rank is the rank of the incoming `SparseTensor` objects plus N;
-// the sparse tensors have been concatenated along new dimensions, one for each
-// batch.
-//
-// The output `SparseTensor` object's shape values for the original dimensions
-// are the max across the input `SparseTensor` objects' shape values for the
-// corresponding dimensions. The new dimensions match the size of the batch.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in
-// standard lexicographic order.  If this is not the case, after this
-// step run `SparseReorder` to restore index ordering.
-//
-// For example, if the serialized input is a `[2 x 3]` matrix representing two
-// original `SparseTensor` objects:
-//
-//     index = [ 0]
-//             [10]
-//             [20]
-//     values = [1, 2, 3]
-//     shape = [50]
-//
-// and
-//
-//     index = [ 2]
-//             [10]
-//     values = [4, 5]
-//     shape = [30]
-//
-// then the final deserialized `SparseTensor` will be:
-//
-//     index = [0  0]
-//             [0 10]
-//             [0 20]
-//             [1  2]
-//             [1 10]
-//     values = [1, 2, 3, 4, 5]
-//     shape = [2 50]
-//
-// Arguments:
-//	serialized_sparse: The serialized `SparseTensor` objects. The last dimension
-// must have 3 columns.
-//	dtype: The `dtype` of the serialized `SparseTensor` objects.
-func DeserializeSparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+// Reads and outputs the entire contents of the input filename.
+func ReadFile(scope *Scope, filename tf.Output) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "DeserializeSparse",
+		Type: "ReadFile",
 		Input: []tf.Input{
-			serialized_sparse,
+			filename,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// SqueezeAttr is an optional argument to Squeeze.
-type SqueezeAttr func(optionalAttr)
+// AvgPoolGradAttr is an optional argument to AvgPoolGrad.
+type AvgPoolGradAttr func(optionalAttr)
 
-// SqueezeAxis sets the optional axis attribute to value.
-//
-// value: If specified, only squeezes the dimensions listed. The dimension
-// index starts at 0. It is an error to squeeze a dimension that is not 1. Must
-// be in the range `[-rank(input), rank(input))`.
-// If not specified, defaults to <>
+// AvgPoolGradDataFormat sets the optional data_format attribute to value.
 //
-// REQUIRES: len(value) >= 0
-func SqueezeAxis(value []int64) SqueezeAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func AvgPoolGradDataFormat(value string) AvgPoolGradAttr {
 	return func(m optionalAttr) {
-		m["squeeze_dims"] = value
+		m["data_format"] = value
 	}
 }
 
-// Removes dimensions of size 1 from the shape of a tensor.
-//
-// Given a tensor `input`, this operation returns a tensor of the same type with
-// all dimensions of size 1 removed. If you don't want to remove all size 1
-// dimensions, you can remove specific size 1 dimensions by specifying
-// `axis`.
-//
-// For example:
-//
-// ```
-// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
-// shape(squeeze(t)) ==> [2, 3]
-// ```
-//
-// Or, to remove specific size 1 dimensions:
-//
-// ```
-// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
-// shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1]
-// ```
+// Computes gradients of the average pooling function.
 //
 // Arguments:
-//	input: The `input` to squeeze.
+//	orig_input_shape: 1-D.  Shape of the original input to `avg_pool`.
+//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t.
+// the output of `avg_pool`.
+//	ksize: The size of the sliding window for each dimension of the input.
+//	strides: The stride of the sliding window for each dimension of the input.
+//	padding: The type of padding algorithm to use.
 //
-// Returns Contains the same data as `input`, but has one or more dimensions of
-// size 1 removed.
-func Squeeze(scope *Scope, input tf.Output, optional ...SqueezeAttr) (output tf.Output) {
+// Returns 4-D.  Gradients w.r.t. the input of `avg_pool`.
+func AvgPoolGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Squeeze",
+		Type: "AvgPoolGrad",
 		Input: []tf.Input{
-			input,
+			orig_input_shape, grad,
 		},
 		Attrs: attrs,
 	}
@@ -15346,386 +19490,262 @@ func Squeeze(scope *Scope, input tf.Output, optional ...SqueezeAttr) (output tf.
 	return op.Output(0)
 }
 
-// ResourceApplyAdadeltaAttr is an optional argument to ResourceApplyAdadelta.
-type ResourceApplyAdadeltaAttr func(optionalAttr)
-
-// ResourceApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
-//
-// value: If True, updating of the var, accum and update_accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyAdadeltaUseLocking(value bool) ResourceApplyAdadeltaAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the adadelta scheme.
-//
-// accum = rho() * accum + (1 - rho()) * grad.square();
-// update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad;
-// update_accum = rho() * update_accum + (1 - rho()) * update.square();
-// var -= update;
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	accum_update: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay factor. Must be a scalar.
-//	epsilon: Constant factor. Must be a scalar.
-//	grad: The gradient.
-//
-// Returns the created operation.
-func ResourceApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdadeltaAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdadelta",
-		Input: []tf.Input{
-			var_, accum, accum_update, lr, rho, epsilon, grad,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// NonMaxSuppressionAttr is an optional argument to NonMaxSuppression.
-type NonMaxSuppressionAttr func(optionalAttr)
-
-// NonMaxSuppressionIouThreshold sets the optional iou_threshold attribute to value.
-//
-// value: A float representing the threshold for deciding whether boxes
-// overlap too much with respect to IOU.
-// If not specified, defaults to 0.5
-func NonMaxSuppressionIouThreshold(value float32) NonMaxSuppressionAttr {
-	return func(m optionalAttr) {
-		m["iou_threshold"] = value
-	}
-}
-
 // Greedily selects a subset of bounding boxes in descending order of score,
 //
-// pruning away boxes that have high intersection-over-union (IOU) overlap
-// with previously selected boxes.  Bounding boxes are supplied as
-// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
-// diagonal pair of box corners and the coordinates can be provided as normalized
-// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
-// is agnostic to where the origin is in the coordinate system.  Note that this
-// algorithm is invariant to orthogonal transformations and translations
-// of the coordinate system; thus translating or reflections of the coordinate
-// system result in the same boxes being selected by the algorithm.
+// pruning away boxes that have high overlaps
+// with previously selected boxes.  Bounding boxes with score less than
+// `score_threshold` are removed. N-by-n overlap values are supplied as square matrix,
+// which allows for defining a custom overlap criterium (eg. intersection over union,
+// intersection over area, etc.).
+//
 // The output of this operation is a set of integers indexing into the input
 // collection of bounding boxes representing the selected boxes.  The bounding
 // box coordinates corresponding to the selected indices can then be obtained
 // using the `tf.gather operation`.  For example:
-//   selected_indices = tf.image.non_max_suppression(
-//       boxes, scores, max_output_size, iou_threshold)
+//
+//   selected_indices = tf.image.non_max_suppression_with_overlaps(
+//       overlaps, scores, max_output_size, overlap_threshold, score_threshold)
 //   selected_boxes = tf.gather(boxes, selected_indices)
 //
 // Arguments:
-//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
+//	overlaps: A 2-D float tensor of shape `[num_boxes, num_boxes]` representing
+// the n-by-n box overlap values.
 //	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
 // score corresponding to each box (each row of boxes).
 //	max_output_size: A scalar integer tensor representing the maximum number of
 // boxes to be selected by non max suppression.
+//	overlap_threshold: A 0-D float tensor representing the threshold for deciding whether
+// boxes overlap too.
+//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
+// boxes based on score.
 //
 // Returns A 1-D integer tensor of shape `[M]` representing the selected
 // indices from the boxes tensor, where `M <= max_output_size`.
-func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, optional ...NonMaxSuppressionAttr) (selected_indices tf.Output) {
+func NonMaxSuppressionWithOverlaps(scope *Scope, overlaps tf.Output, scores tf.Output, max_output_size tf.Output, overlap_threshold tf.Output, score_threshold tf.Output) (selected_indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "NonMaxSuppression",
+		Type: "NonMaxSuppressionWithOverlaps",
 		Input: []tf.Input{
-			boxes, scores, max_output_size,
+			overlaps, scores, max_output_size, overlap_threshold, score_threshold,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that emits `components` as a tuple of tensors once.
-func TensorDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "TensorDataset",
-		Input: []tf.Input{
-			tf.OutputList(components),
-		},
-		Attrs: attrs,
+// FractionalAvgPoolGradAttr is an optional argument to FractionalAvgPoolGrad.
+type FractionalAvgPoolGradAttr func(optionalAttr)
+
+// FractionalAvgPoolGradOverlapping sets the optional overlapping attribute to value.
+//
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
+//
+// `index  0  1  2  3  4`
+//
+// `value  20 5  16 3  7`
+//
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [41/3, 26/3] for fractional avg pooling.
+// If not specified, defaults to false
+func FractionalAvgPoolGradOverlapping(value bool) FractionalAvgPoolGradAttr {
+	return func(m optionalAttr) {
+		m["overlapping"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Component-wise multiplies a SparseTensor by a dense Tensor.
-//
-// The output locations corresponding to the implicitly zero elements in the sparse
-// tensor will be zero (i.e., will not take up storage space), regardless of the
-// contents of the dense tensor (even if it's +/-INF and that INF*0 == NaN).
+// Computes gradient of the FractionalAvgPool function.
 //
-// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
-// the other direction.
+// Unlike FractionalMaxPoolGrad, we don't need to find arg_max for
+// FractionalAvgPoolGrad, we just need to evenly back-propagate each element of
+// out_backprop to those indices that form the same pooling cell. Therefore, we
+// just need to know the shape of original input tensor, instead of the whole
+// tensor.
 //
 // Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
+//	orig_input_tensor_shape: Original input tensor shape for `fractional_avg_pool`
+//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
+// w.r.t. the output of `fractional_avg_pool`.
+//	row_pooling_sequence: row pooling sequence, form pooling region with
+// col_pooling_sequence.
+//	col_pooling_sequence: column pooling sequence, form pooling region with
+// row_pooling sequence.
 //
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseMul(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+// Returns 4-D.  Gradients w.r.t. the input of `fractional_avg_pool`.
+func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalAvgPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseMul",
+		Type: "FractionalAvgPoolGrad",
 		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
+			orig_input_tensor_shape, out_backprop, row_pooling_sequence, col_pooling_sequence,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// 2D real-valued fast Fourier transform.
-//
-// Computes the 2-dimensional discrete Fourier transform of a real-valued signal
-// over the inner-most 2 dimensions of `input`.
+// StaticRegexReplaceAttr is an optional argument to StaticRegexReplace.
+type StaticRegexReplaceAttr func(optionalAttr)
+
+// StaticRegexReplaceReplaceGlobal sets the optional replace_global attribute to value.
 //
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT2D` only returns the
-// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
-// of `output`: the zero-frequency term, followed by the `fft_length / 2`
-// positive-frequency terms.
+// value: If True, the replacement is global, otherwise the replacement
+// is done only on the first match.
+// If not specified, defaults to true
+func StaticRegexReplaceReplaceGlobal(value bool) StaticRegexReplaceAttr {
+	return func(m optionalAttr) {
+		m["replace_global"] = value
+	}
+}
+
+// Replaces the match of pattern in input with rewrite.
 //
-// Along each axis `RFFT2D` is computed on, if `fft_length` is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
+// It follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
 //
 // Arguments:
-//	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
-//
-// Returns A complex64 tensor of the same rank as `input`. The inner-most 2
-//   dimensions of `input` are replaced with their 2D Fourier transform. The
-//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
-//   components.
+//	input: The text to be processed.
+//	pattern: The regular expression to match the input.
+//	rewrite: The rewrite to be applied to the matched expression.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.rfft2
-// @end_compatibility
-func RFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Returns The text after applying pattern and rewrite.
+func StaticRegexReplace(scope *Scope, input tf.Output, pattern string, rewrite string, optional ...StaticRegexReplaceAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"pattern": pattern, "rewrite": rewrite}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "RFFT2D",
+		Type: "StaticRegexReplace",
 		Input: []tf.Input{
-			input, fft_length,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Pads a tensor with zeros.
-//
-// This operation pads a `input` with zeros according to the `paddings` you
-// specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the
-// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
-// how many zeros to add before the contents of `input` in that dimension, and
-// `paddings[D, 1]` indicates how many zeros to add after the contents of `input`
-// in that dimension.
-//
-// The padded size of each dimension D of the output is:
-//
-// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
-//
-// For example:
+// Computes gradients for the exponential linear (Elu) operation.
 //
-// ```
-// # 't' is [[1, 1], [2, 2]]
-// # 'paddings' is [[1, 1], [2, 2]]
-// # rank of 't' is 2
-// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
-//                       [0, 0, 1, 1, 0, 0]
-//                       [0, 0, 2, 2, 0, 0]
-//                       [0, 0, 0, 0, 0, 0]]
-// ```
+// Arguments:
+//	gradients: The backpropagated gradients to the corresponding Elu operation.
+//	outputs: The outputs of the corresponding Elu operation.
 //
-func Pad(scope *Scope, input tf.Output, paddings tf.Output) (output tf.Output) {
+// Returns The gradients: `gradients * (outputs + 1)` if outputs < 0,
+// `gradients` otherwise.
+func EluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Pad",
+		Type: "EluGrad",
 		Input: []tf.Input{
-			input, paddings,
+			gradients, outputs,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Checks whether a resource handle-based variable has been initialized.
+// Converts each string in the input Tensor to its hash mod by a number of buckets.
+//
+// The hash function is deterministic on the content of the string within the
+// process.
+//
+// Note that the hash function may change from time to time.
+// This functionality will be deprecated and it's recommended to use
+// `tf.string_to_hash_bucket_fast()` or `tf.string_to_hash_bucket_strong()`.
 //
 // Arguments:
-//	resource: the input resource handle.
 //
-// Returns a scalar boolean which is true if the variable has been
-// initialized.
-func VarIsInitializedOp(scope *Scope, resource tf.Output) (is_initialized tf.Output) {
+//	num_buckets: The number of buckets.
+//
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToHashBucket(scope *Scope, string_tensor tf.Output, num_buckets int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_buckets": num_buckets}
 	opspec := tf.OpSpec{
-		Type: "VarIsInitializedOp",
+		Type: "StringToHashBucket",
 		Input: []tf.Input{
-			resource,
+			string_tensor,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyFtrlAttr is an optional argument to ResourceSparseApplyFtrl.
-type ResourceSparseApplyFtrlAttr func(optionalAttr)
-
-// ResourceSparseApplyFtrlUseLocking sets the optional use_locking attribute to value.
+// Creates a dataset that batches `batch_size` elements from `input_dataset`.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyFtrlUseLocking(value bool) ResourceSparseApplyFtrlAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
+// Arguments:
 //
-// That is for rows we have grad for, we update var, accum and linear as follows:
-// accum_new = accum + grad * grad
-// linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
+//	batch_size: A scalar representing the number of elements to accumulate in a batch.
+//	drop_remainder: A scalar representing whether the last batch should be dropped in case its size
+// is smaller than desired.
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	lr_power: Scaling factor. Must be a scalar.
 //
-// Returns the created operation.
-func ResourceSparseApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlAttr) (o *tf.Operation) {
+func BatchDatasetV2(scope *Scope, input_dataset tf.Output, batch_size tf.Output, drop_remainder tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyFtrl",
+		Type: "BatchDatasetV2",
 		Input: []tf.Input{
-			var_, accum, linear, grad, indices, lr, l1, l2, lr_power,
+			input_dataset, batch_size, drop_remainder,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns which elements of x are Inf.
-//
-// @compatibility(numpy)
-// Equivalent to np.isinf
-// @end_compatibility
-func IsInf(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes the gradient of `igamma(a, x)` wrt `a`.
+func IgammaGradA(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IsInf",
+		Type: "IgammaGradA",
 		Input: []tf.Input{
-			x,
+			a, x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TruncatedNormalAttr is an optional argument to TruncatedNormal.
-type TruncatedNormalAttr func(optionalAttr)
-
-// TruncatedNormalSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func TruncatedNormalSeed(value int64) TruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// TruncatedNormalSeed2 sets the optional seed2 attribute to value.
+// Creates a dataset that contains `count` elements from the `input_dataset`.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func TruncatedNormalSeed2(value int64) TruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Outputs random values from a truncated normal distribution.
+// Arguments:
 //
-// The generated values follow a normal distribution with mean 0 and standard
-// deviation 1, except that values whose magnitude is more than 2 standard
-// deviations from the mean are dropped and re-picked.
+//	count: A scalar representing the number of elements from the `input_dataset`
+// that should be taken. A value of `-1` indicates that all of `input_dataset`
+// is taken.
 //
-// Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
 //
-// Returns A tensor of the specified shape filled with random truncated normal
-// values.
-func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...TruncatedNormalAttr) (output tf.Output) {
+func TakeDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "TruncatedNormal",
+		Type: "TakeDataset",
 		Input: []tf.Input{
-			shape,
+			input_dataset, count,
 		},
 		Attrs: attrs,
 	}
@@ -15733,278 +19753,224 @@ func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional
 	return op.Output(0)
 }
 
-// SkipgramAttr is an optional argument to Skipgram.
-type SkipgramAttr func(optionalAttr)
-
-// SkipgramWindowSize sets the optional window_size attribute to value.
-//
-// value: The number of words to predict to the left and right of the target.
-// If not specified, defaults to 5
-func SkipgramWindowSize(value int64) SkipgramAttr {
-	return func(m optionalAttr) {
-		m["window_size"] = value
-	}
-}
+// FakeQuantWithMinMaxVarsAttr is an optional argument to FakeQuantWithMinMaxVars.
+type FakeQuantWithMinMaxVarsAttr func(optionalAttr)
 
-// SkipgramMinCount sets the optional min_count attribute to value.
-//
-// value: The minimum number of word occurrences for it to be included in the
-// vocabulary.
-// If not specified, defaults to 5
-func SkipgramMinCount(value int64) SkipgramAttr {
+// FakeQuantWithMinMaxVarsNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8
+func FakeQuantWithMinMaxVarsNumBits(value int64) FakeQuantWithMinMaxVarsAttr {
 	return func(m optionalAttr) {
-		m["min_count"] = value
+		m["num_bits"] = value
 	}
 }
 
-// SkipgramSubsample sets the optional subsample attribute to value.
-//
-// value: Threshold for word occurrence. Words that appear with higher
-// frequency will be randomly down-sampled. Set to 0 to disable.
-// If not specified, defaults to 0.001
-func SkipgramSubsample(value float32) SkipgramAttr {
+// FakeQuantWithMinMaxVarsNarrowRange sets the optional narrow_range attribute to value.
+// If not specified, defaults to false
+func FakeQuantWithMinMaxVarsNarrowRange(value bool) FakeQuantWithMinMaxVarsAttr {
 	return func(m optionalAttr) {
-		m["subsample"] = value
+		m["narrow_range"] = value
 	}
 }
 
-// Parses a text file and creates a batch of examples.
+// Fake-quantize the 'inputs' tensor of type float via global float scalars `min`
 //
-// DEPRECATED at GraphDef version 19: Moving word2vec into tensorflow_models/tutorials and deprecating its ops here as a result
+// and `max` to 'outputs' tensor of same shape as `inputs`.
 //
-// Arguments:
-//	filename: The corpus's text file name.
-//	batch_size: The size of produced batch.
+// `[min; max]` define the clamping range for the `inputs` data.
+// `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
+// when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
+// then de-quantized and output as floats in `[min; max]` interval.
+// `num_bits` is the bitwidth of the quantization; between 2 and 16, inclusive.
 //
-// Returns A vector of words in the corpus.Frequencies of words. Sorted in the non-ascending order.Number of words per epoch in the data file.The current epoch number.The total number of words processed so far.A vector of word ids.A vector of word ids.
-func Skipgram(scope *Scope, filename string, batch_size int64, optional ...SkipgramAttr) (vocab_word tf.Output, vocab_freq tf.Output, words_per_epoch tf.Output, current_epoch tf.Output, total_words_processed tf.Output, examples tf.Output, labels tf.Output) {
+// This operation has a gradient and thus allows for training `min` and `max`
+// values.
+func FakeQuantWithMinMaxVars(scope *Scope, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsAttr) (outputs tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"filename": filename, "batch_size": batch_size}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Skipgram",
-
+		Type: "FakeQuantWithMinMaxVars",
+		Input: []tf.Input{
+			inputs, min, max,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4), op.Output(5), op.Output(6)
+	return op.Output(0)
 }
 
-// StringToNumberAttr is an optional argument to StringToNumber.
-type StringToNumberAttr func(optionalAttr)
+// RetrieveTPUEmbeddingMomentumParametersAttr is an optional argument to RetrieveTPUEmbeddingMomentumParameters.
+type RetrieveTPUEmbeddingMomentumParametersAttr func(optionalAttr)
 
-// StringToNumberOutType sets the optional out_type attribute to value.
+// RetrieveTPUEmbeddingMomentumParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// value: The numeric type to interpret each string in `string_tensor` as.
-// If not specified, defaults to DT_FLOAT
-func StringToNumberOutType(value tf.DataType) StringToNumberAttr {
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingMomentumParametersTableId(value int64) RetrieveTPUEmbeddingMomentumParametersAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["table_id"] = value
 	}
 }
 
-// Converts each string in the input Tensor to the specified numeric type.
+// RetrieveTPUEmbeddingMomentumParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingMomentumParametersTableName(value string) RetrieveTPUEmbeddingMomentumParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Retrieve Momentum embedding parameters.
 //
-// (Note that int32 overflow results in an error while float overflow
-// results in a rounded value.)
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
 //
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToNumberAttr) (output tf.Output) {
+// Returns Parameter parameters updated by the Momentum optimization algorithm.Parameter momenta updated by the Momentum optimization algorithm.
+func RetrieveTPUEmbeddingMomentumParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingMomentumParametersAttr) (parameters tf.Output, momenta tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StringToNumber",
-		Input: []tf.Input{
-			string_tensor,
-		},
+		Type: "RetrieveTPUEmbeddingMomentumParameters",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// ResourceApplyFtrlV2Attr is an optional argument to ResourceApplyFtrlV2.
-type ResourceApplyFtrlV2Attr func(optionalAttr)
-
-// ResourceApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
+// Forwards the value of an available tensor from `inputs` to `output`.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyFtrlV2UseLocking(value bool) ResourceApplyFtrlV2Attr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the Ftrl-proximal scheme.
+// `Merge` waits for at least one of the tensors in `inputs` to become available.
+// It is usually combined with `Switch` to implement branching.
 //
-// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
-// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
-// linear += grad_with_shrinkage +
-//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
+// `Merge` forwards the first tensor to become available to `output`, and sets
+// `value_index` to its index in `inputs`.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regulariation. Must be a scalar.
-//	l2: L2 shrinkage regulariation. Must be a scalar.
-//
-//	lr_power: Scaling factor. Must be a scalar.
+//	inputs: The input tensors, exactly one of which will become available.
 //
-// Returns the created operation.
-func ResourceApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlV2Attr) (o *tf.Operation) {
+// Returns Will be set to the available input tensor.The index of the chosen input tensor in `inputs`.
+func Merge(scope *Scope, inputs []tf.Output) (output tf.Output, value_index tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyFtrlV2",
+		Type: "Merge",
 		Input: []tf.Input{
-			var_, accum, linear, grad, lr, l1, l2, l2_shrinkage, lr_power,
+			tf.OutputList(inputs),
 		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// EncodeJpegAttr is an optional argument to EncodeJpeg.
-type EncodeJpegAttr func(optionalAttr)
-
-// EncodeJpegFormat sets the optional format attribute to value.
-//
-// value: Per pixel image format.
-// If not specified, defaults to ""
-func EncodeJpegFormat(value string) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["format"] = value
-	}
-}
-
-// EncodeJpegQuality sets the optional quality attribute to value.
-//
-// value: Quality of the compression from 0 to 100 (higher is better and slower).
-// If not specified, defaults to 95
-func EncodeJpegQuality(value int64) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["quality"] = value
-	}
-}
-
-// EncodeJpegProgressive sets the optional progressive attribute to value.
-//
-// value: If True, create a JPEG that loads progressively (coarse to fine).
-// If not specified, defaults to false
-func EncodeJpegProgressive(value bool) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["progressive"] = value
-	}
-}
-
-// EncodeJpegOptimizeSize sets the optional optimize_size attribute to value.
-//
-// value: If True, spend CPU/RAM to reduce size with no quality change.
-// If not specified, defaults to false
-func EncodeJpegOptimizeSize(value bool) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["optimize_size"] = value
-	}
-}
-
-// EncodeJpegChromaDownsampling sets the optional chroma_downsampling attribute to value.
-//
-// value: See http://en.wikipedia.org/wiki/Chroma_subsampling.
-// If not specified, defaults to true
-func EncodeJpegChromaDownsampling(value bool) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["chroma_downsampling"] = value
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
 }
 
-// EncodeJpegDensityUnit sets the optional density_unit attribute to value.
+// QueueCloseV2Attr is an optional argument to QueueCloseV2.
+type QueueCloseV2Attr func(optionalAttr)
+
+// QueueCloseV2CancelPendingEnqueues sets the optional cancel_pending_enqueues attribute to value.
 //
-// value: Unit used to specify `x_density` and `y_density`:
-// pixels per inch (`'in'`) or centimeter (`'cm'`).
-// If not specified, defaults to "in"
-func EncodeJpegDensityUnit(value string) EncodeJpegAttr {
+// value: If true, all pending enqueue requests that are
+// blocked on the given queue will be canceled.
+// If not specified, defaults to false
+func QueueCloseV2CancelPendingEnqueues(value bool) QueueCloseV2Attr {
 	return func(m optionalAttr) {
-		m["density_unit"] = value
+		m["cancel_pending_enqueues"] = value
 	}
 }
 
-// EncodeJpegXDensity sets the optional x_density attribute to value.
+// Closes the given queue.
 //
-// value: Horizontal pixels per density unit.
-// If not specified, defaults to 300
-func EncodeJpegXDensity(value int64) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["x_density"] = value
+// This operation signals that no more elements will be enqueued in the
+// given queue. Subsequent Enqueue(Many) operations will fail.
+// Subsequent Dequeue(Many) operations will continue to succeed if
+// sufficient elements remain in the queue. Subsequent Dequeue(Many)
+// operations that would block will fail immediately.
+//
+// Arguments:
+//	handle: The handle to a queue.
+//
+// Returns the created operation.
+func QueueCloseV2(scope *Scope, handle tf.Output, optional ...QueueCloseV2Attr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QueueCloseV2",
+		Input: []tf.Input{
+			handle,
+		},
+		Attrs: attrs,
 	}
+	return scope.AddOperation(opspec)
 }
 
-// EncodeJpegYDensity sets the optional y_density attribute to value.
+// Writes the given dataset to the given file using the TFRecord format.
 //
-// value: Vertical pixels per density unit.
-// If not specified, defaults to 300
-func EncodeJpegYDensity(value int64) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["y_density"] = value
+// Arguments:
+//	input_dataset: A variant tensor representing the dataset to write.
+//	filename: A scalar string tensor representing the filename to use.
+//	compression_type: A scalar string tensor containing either (i) the empty string (no
+// compression), (ii) "ZLIB", or (iii) "GZIP".
+//
+// Returns the created operation.
+func ExperimentalDatasetToTFRecord(scope *Scope, input_dataset tf.Output, filename tf.Output, compression_type tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalDatasetToTFRecord",
+		Input: []tf.Input{
+			input_dataset, filename, compression_type,
+		},
 	}
+	return scope.AddOperation(opspec)
 }
 
-// EncodeJpegXmpMetadata sets the optional xmp_metadata attribute to value.
+// BiasAddGradAttr is an optional argument to BiasAddGrad.
+type BiasAddGradAttr func(optionalAttr)
+
+// BiasAddGradDataFormat sets the optional data_format attribute to value.
 //
-// value: If not empty, embed this XMP metadata in the image header.
-// If not specified, defaults to ""
-func EncodeJpegXmpMetadata(value string) EncodeJpegAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the bias tensor will be added to the last dimension
+// of the value tensor.
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// The tensor will be added to "in_channels", the third-to-the-last
+//     dimension.
+// If not specified, defaults to "NHWC"
+func BiasAddGradDataFormat(value string) BiasAddGradAttr {
 	return func(m optionalAttr) {
-		m["xmp_metadata"] = value
+		m["data_format"] = value
 	}
 }
 
-// JPEG-encode an image.
-//
-// `image` is a 3-D uint8 Tensor of shape `[height, width, channels]`.
-//
-// The attr `format` can be used to override the color format of the encoded
-// output.  Values can be:
-//
-// *   `''`: Use a default format based on the number of channels in the image.
-// *   `grayscale`: Output a grayscale JPEG image.  The `channels` dimension
-//     of `image` must be 1.
-// *   `rgb`: Output an RGB JPEG image. The `channels` dimension
-//     of `image` must be 3.
-//
-// If `format` is not specified or is the empty string, a default format is picked
-// in function of the number of channels in `image`:
+// The backward operation for "BiasAdd" on the "bias" tensor.
 //
-// *   1: Output a grayscale image.
-// *   3: Output an RGB image.
+// It accumulates all the values from out_backprop into the feature dimension.
+// For NHWC data format, the feature dimension is the last. For NCHW data format,
+// the feature dimension is the third-to-last.
 //
 // Arguments:
-//	image: 3-D with shape `[height, width, channels]`.
+//	out_backprop: Any number of dimensions.
 //
-// Returns 0-D. JPEG-encoded image.
-func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (contents tf.Output) {
+// Returns 1-D with size the feature dimension of `out_backprop`.
+func BiasAddGrad(scope *Scope, out_backprop tf.Output, optional ...BiasAddGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16013,9 +19979,9 @@ func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (cont
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "EncodeJpeg",
+		Type: "BiasAddGrad",
 		Input: []tf.Input{
-			image,
+			out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -16023,95 +19989,137 @@ func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (cont
 	return op.Output(0)
 }
 
-// MultinomialAttr is an optional argument to Multinomial.
-type MultinomialAttr func(optionalAttr)
-
-// MultinomialSeed sets the optional seed attribute to value.
+// Reduces `input` from `num_devices` using `reduction` to a single device.
 //
-// value: If either seed or seed2 is set to be non-zero, the internal random number
-// generator is seeded by the given seed.  Otherwise, a random seed is used.
-// If not specified, defaults to 0
-func MultinomialSeed(value int64) MultinomialAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
+// Reduces `input` from `num_devices` using `reduction` to a single device.
+//
+// The graph should be constructed so that all inputs have a valid device
+// assignment, and the op itself is assigned one of these devices.
+//
+// input: The input to the reduction.
+// data: the value of the reduction across all `num_devices` devices.
+// reduction: the reduction operation to perform.
+func NcclReduce(scope *Scope, input []tf.Output, reduction string) (data tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"reduction": reduction}
+	opspec := tf.OpSpec{
+		Type: "NcclReduce",
+		Input: []tf.Input{
+			tf.OutputList(input),
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MultinomialSeed2 sets the optional seed2 attribute to value.
+// Computes the gradient of morphological 2-D dilation with respect to the input.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func MultinomialSeed2(value int64) MultinomialAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
+// Arguments:
+//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
+//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
+//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
+// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
+//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
+// Must be: `[1, rate_height, rate_width, 1]`.
+//	padding: The type of padding algorithm to use.
+//
+// Returns 4-D with shape `[batch, in_height, in_width, depth]`.
+func Dilation2DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (in_backprop tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// MultinomialOutputDtype sets the optional output_dtype attribute to value.
-// If not specified, defaults to DT_INT64
-func MultinomialOutputDtype(value tf.DataType) MultinomialAttr {
-	return func(m optionalAttr) {
-		m["output_dtype"] = value
+	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
+	opspec := tf.OpSpec{
+		Type: "Dilation2DBackpropInput",
+		Input: []tf.Input{
+			input, filter, out_backprop,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Draws samples from a multinomial distribution.
+// An Op to sum inputs across replicated TPU instances.
+//
+// Each instance supplies its own input.
+//
+// For example, suppose there are 8 TPU instances: `[A, B, C, D, E, F, G, H]`.
+// Passing group_assignment=`[[0,2,4,6],[1,3,5,7]]` sets `A, C, E, G` as group 0,
+// and `B, D, F, H` as group 1. Thus we get the outputs:
+// `[A+C+E+G, B+D+F+H, A+C+E+G, B+D+F+H, A+C+E+G, B+D+F+H, A+C+E+G, B+D+F+H]`.
 //
 // Arguments:
-//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
-// represents the unnormalized log probabilities for all classes.
-//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
+//	input: The local input to the sum.
+//	group_assignment: An int32 tensor with shape
+// [num_groups, num_replicas_per_group]. `group_assignment[i]` represents the
+// replica ids in the ith subgroup.
 //
-// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
-// contains the drawn class labels with range `[0, num_classes)`.
-func Multinomial(scope *Scope, logits tf.Output, num_samples tf.Output, optional ...MultinomialAttr) (output tf.Output) {
+// Returns The sum of all the distributed inputs.
+func CrossReplicaSum(scope *Scope, input tf.Output, group_assignment tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Multinomial",
+		Type: "CrossReplicaSum",
 		Input: []tf.Input{
-			logits, num_samples,
+			input, group_assignment,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyAdagradDAAttr is an optional argument to ResourceSparseApplyAdagradDA.
-type ResourceSparseApplyAdagradDAAttr func(optionalAttr)
+// ResourceSparseApplyMomentumAttr is an optional argument to ResourceSparseApplyMomentum.
+type ResourceSparseApplyMomentumAttr func(optionalAttr)
 
-// ResourceSparseApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
+// ResourceSparseApplyMomentumUseLocking sets the optional use_locking attribute to value.
 //
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
 // If not specified, defaults to false
-func ResourceSparseApplyAdagradDAUseLocking(value bool) ResourceSparseApplyAdagradDAAttr {
+func ResourceSparseApplyMomentumUseLocking(value bool) ResourceSparseApplyMomentumAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update entries in '*var' and '*accum' according to the proximal adagrad scheme.
+// ResourceSparseApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
+//
+// value: If `True`, the tensor passed to compute grad will be
+// var - lr * momentum * accum, so in the end, the var you get is actually
+// var - lr * momentum * accum.
+// If not specified, defaults to false
+func ResourceSparseApplyMomentumUseNesterov(value bool) ResourceSparseApplyMomentumAttr {
+	return func(m optionalAttr) {
+		m["use_nesterov"] = value
+	}
+}
+
+// Update relevant entries in '*var' and '*accum' according to the momentum scheme.
+//
+// Set use_nesterov = True if you want to use Nesterov momentum.
+//
+// That is for rows we have grad for, we update var and accum as follows:
+//
+// accum = accum * momentum + grad
+// var -= lr * accum
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	gradient_accumulator: Should be from a Variable().
-//	gradient_squared_accumulator: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
 //	grad: The gradient.
 //	indices: A vector of indices into the first dimension of var and accum.
-//	lr: Learning rate. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	global_step: Training step number. Must be a scalar.
+//	momentum: Momentum. Must be a scalar.
 //
 // Returns the created operation.
-func ResourceSparseApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceSparseApplyAdagradDAAttr) (o *tf.Operation) {
+func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, momentum tf.Output, optional ...ResourceSparseApplyMomentumAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16120,156 +20128,220 @@ func ResourceSparseApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumul
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyAdagradDA",
+		Type: "ResourceSparseApplyMomentum",
 		Input: []tf.Input{
-			var_, gradient_accumulator, gradient_squared_accumulator, grad, indices, lr, l1, l2, global_step,
+			var_, accum, lr, grad, indices, momentum,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Converts each string in the input Tensor to its hash mod by a number of buckets.
+// An Op to permute tensors across replicated TPU instances.
 //
-// The hash function is deterministic on the content of the string within the
-// process and will never change. However, it is not suitable for cryptography.
-// This function may be used when CPU time is scarce and inputs are trusted or
-// unimportant. There is a risk of adversaries constructing inputs that all hash
-// to the same bucket. To prevent this problem, use a strong hash function with
-// `tf.string_to_hash_bucket_strong`.
+// Each instance supplies its own input.
+//
+// For example, suppose there are 4 TPU instances: `[A, B, C, D]`. Passing
+// source_target_pairs=`[[0,1],[1,2],[2,3],[3,0]]` gets the outputs:
+// `[D, A, B, C]`.
 //
 // Arguments:
-//	input: The strings to assign a hash bucket.
-//	num_buckets: The number of buckets.
+//	input: The local input to be permuted. Currently only supports float and
+// bfloat16.
+//	source_target_pairs: A tensor with shape [num_pairs, 2].
 //
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToHashBucketFast(scope *Scope, input tf.Output, num_buckets int64) (output tf.Output) {
+// Returns The permuted input.
+func CollectivePermute(scope *Scope, input tf.Output, source_target_pairs tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_buckets": num_buckets}
 	opspec := tf.OpSpec{
-		Type: "StringToHashBucketFast",
+		Type: "CollectivePermute",
 		Input: []tf.Input{
-			input,
+			input, source_target_pairs,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the last element of the input list as well as a list with all but that element.
+// Returns the complex conjugate of a complex number.
 //
-// Fails if the list is empty.
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// complex numbers that are the complex conjugate of each element in `input`. The
+// complex numbers in `input` must be of the form \\(a + bj\\), where *a* is the
+// real part and *b* is the imaginary part.
 //
-// input_handle: the input list
-// tensor: the withdrawn last element of the list
-// element_dtype: the type of elements in the list
-// element_shape: the shape of the output tensor
-func TensorListPopBack(scope *Scope, input_handle tf.Output, element_dtype tf.DataType) (output_handle tf.Output, tensor tf.Output) {
+// The complex conjugate returned by this operation is of the form \\(a - bj\\).
+//
+// For example:
+//
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j]
+// ```
+func Conj(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"element_dtype": element_dtype}
 	opspec := tf.OpSpec{
-		Type: "TensorListPopBack",
+		Type: "Conj",
 		Input: []tf.Input{
-			input_handle,
+			input,
 		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// RetrieveTPUEmbeddingCenteredRMSPropParametersAttr is an optional argument to RetrieveTPUEmbeddingCenteredRMSPropParameters.
+type RetrieveTPUEmbeddingCenteredRMSPropParametersAttr func(optionalAttr)
+
+// RetrieveTPUEmbeddingCenteredRMSPropParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingCenteredRMSPropParametersTableId(value int64) RetrieveTPUEmbeddingCenteredRMSPropParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// RetrieveTPUEmbeddingCenteredRMSPropParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingCenteredRMSPropParametersTableName(value string) RetrieveTPUEmbeddingCenteredRMSPropParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Retrieve centered RMSProp embedding parameters.
+//
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
+//
+// Returns Parameter parameters updated by the centered RMSProp optimization algorithm.Parameter ms updated by the centered RMSProp optimization algorithm.Parameter mom updated by the centered RMSProp optimization algorithm.Parameter mg updated by the centered RMSProp optimization algorithm.
+func RetrieveTPUEmbeddingCenteredRMSPropParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingCenteredRMSPropParametersAttr) (parameters tf.Output, ms tf.Output, mom tf.Output, mg tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RetrieveTPUEmbeddingCenteredRMSPropParameters",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// MaxPoolGradGradAttr is an optional argument to MaxPoolGradGrad.
-type MaxPoolGradGradAttr func(optionalAttr)
+// StringSplitAttr is an optional argument to StringSplit.
+type StringSplitAttr func(optionalAttr)
 
-// MaxPoolGradGradDataFormat sets the optional data_format attribute to value.
+// StringSplitSkipEmpty sets the optional skip_empty attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolGradGradDataFormat(value string) MaxPoolGradGradAttr {
+// value: A `bool`. If `True`, skip the empty strings from the result.
+// If not specified, defaults to true
+func StringSplitSkipEmpty(value bool) StringSplitAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["skip_empty"] = value
 	}
 }
 
-// Computes second-order gradients of the maxpooling function.
+// Split elements of `input` based on `delimiter` into a `SparseTensor`.
+//
+// Let N be the size of source (typically N will be the batch size). Split each
+// element of `input` based on `delimiter` and return a `SparseTensor`
+// containing the splitted tokens. Empty tokens are ignored.
+//
+// `delimiter` can be empty, or a string of split characters. If `delimiter` is an
+//  empty string, each element of `input` is split into individual single-byte
+//  character strings, including splitting of UTF-8 multibyte sequences. Otherwise
+//  every character of `delimiter` is a potential split point.
+//
+// For example:
+//   N = 2, input[0] is 'hello world' and input[1] is 'a b c', then the output
+//   will be
+//
+//   indices = [0, 0;
+//              0, 1;
+//              1, 0;
+//              1, 1;
+//              1, 2]
+//   shape = [2, 3]
+//   values = ['hello', 'world', 'a', 'b', 'c']
 //
 // Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: 4-D.  Gradients of gradients w.r.t. the input of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+//	input: 1-D. Strings to split.
+//	delimiter: 0-D. Delimiter characters (bytes), or empty string.
 //
-// Returns Gradients of gradients w.r.t. the input to `max_pool`.
-func MaxPoolGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradGradAttr) (output tf.Output) {
+// Returns A dense matrix of int64 representing the indices of the sparse tensor.A vector of strings corresponding to the splited values.a length-2 vector of int64 representing the shape of the sparse
+// tensor, where the first value is N and the second value is the maximum number
+// of tokens in a single input entry.
+func StringSplit(scope *Scope, input tf.Output, delimiter tf.Output, optional ...StringSplitAttr) (indices tf.Output, values tf.Output, shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolGradGrad",
+		Type: "StringSplit",
 		Input: []tf.Input{
-			orig_input, orig_output, grad,
+			input, delimiter,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// TensorArrayGatherV3Attr is an optional argument to TensorArrayGatherV3.
-type TensorArrayGatherV3Attr func(optionalAttr)
+// MaxPool3DAttr is an optional argument to MaxPool3D.
+type MaxPool3DAttr func(optionalAttr)
 
-// TensorArrayGatherV3ElementShape sets the optional element_shape attribute to value.
+// MaxPool3DDataFormat sets the optional data_format attribute to value.
 //
-// value: The expected shape of an element, if known. Used to
-// validate the shapes of TensorArray elements. If this shape is not
-// fully specified, gathering zero-size TensorArrays is an error.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayGatherV3ElementShape(value tf.Shape) TensorArrayGatherV3Attr {
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func MaxPool3DDataFormat(value string) MaxPool3DAttr {
 	return func(m optionalAttr) {
-		m["element_shape"] = value
+		m["data_format"] = value
 	}
 }
 
-// Gather specific elements from the TensorArray into output `value`.
-//
-// All elements selected by `indices` must have the same shape.
+// Performs 3D max pooling on the input.
 //
 // Arguments:
-//	handle: The handle to a TensorArray.
-//	indices: The locations in the TensorArray from which to read tensor elements.
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	dtype: The type of the elem that is returned.
+//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
 //
-// Returns All of the elements in the TensorArray, concatenated along a new
-// axis (the new dimension 0).
-func TensorArrayGatherV3(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV3Attr) (value tf.Output) {
+// Returns The max pooled output tensor.
+func MaxPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayGatherV3",
+		Type: "MaxPool3D",
 		Input: []tf.Input{
-			handle, indices, flow_in,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -16277,172 +20349,208 @@ func TensorArrayGatherV3(scope *Scope, handle tf.Output, indices tf.Output, flow
 	return op.Output(0)
 }
 
-// Returns x / y element-wise for integer types.
+// Convert JSON-encoded Example records to binary protocol buffer strings.
 //
-// Truncation designates that negative numbers will round fractional quantities
-// toward zero. I.e. -7 / 5 = -1. This matches C semantics but it is different
-// than Python semantics. See `FloorDiv` for a division function that matches
-// Python Semantics.
+// This op translates a tensor containing Example records, encoded using
+// the [standard JSON
+// mapping](https://developers.google.com/protocol-buffers/docs/proto3#json),
+// into a tensor containing the same records encoded as binary protocol
+// buffers. The resulting tensor can then be fed to any of the other
+// Example-parsing ops.
 //
-// *NOTE*: `TruncateDiv` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func TruncateDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Arguments:
+//	json_examples: Each string is a JSON object serialized according to the JSON
+// mapping of the Example proto.
+//
+// Returns Each string is a binary Example protocol buffer corresponding
+// to the respective element of `json_examples`.
+func DecodeJSONExample(scope *Scope, json_examples tf.Output) (binary_examples tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TruncateDiv",
+		Type: "DecodeJSONExample",
 		Input: []tf.Input{
-			x, y,
+			json_examples,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Restores tensors from a V2 checkpoint.
+// QueueEnqueueManyV2Attr is an optional argument to QueueEnqueueManyV2.
+type QueueEnqueueManyV2Attr func(optionalAttr)
+
+// QueueEnqueueManyV2TimeoutMs sets the optional timeout_ms attribute to value.
 //
-// For backward compatibility with the V1 format, this Op currently allows
-// restoring from a V1 checkpoint as well:
-//   - This Op first attempts to find the V2 index file pointed to by "prefix", and
-//     if found proceed to read it as a V2 checkpoint;
-//   - Otherwise the V1 read path is invoked.
-// Relying on this behavior is not recommended, as the ability to fall back to read
-// V1 might be deprecated and eventually removed.
+// value: If the queue is too full, this operation will block for up
+// to timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueEnqueueManyV2TimeoutMs(value int64) QueueEnqueueManyV2Attr {
+	return func(m optionalAttr) {
+		m["timeout_ms"] = value
+	}
+}
+
+// Enqueues zero or more tuples of one or more tensors in the given queue.
 //
-// By default, restores the named tensors in full.  If the caller wishes to restore
-// specific slices of stored tensors, "shape_and_slices" should be non-empty
-// strings and correspondingly well-formed.
+// This operation slices each component tensor along the 0th dimension to
+// make multiple queue elements. All of the tuple components must have the
+// same size in the 0th dimension.
 //
-// Callers must ensure all the named tensors are indeed stored in the checkpoint.
+// The components input has k elements, which correspond to the components of
+// tuples stored in the given queue.
+//
+// N.B. If the queue is full, this operation will block until the given
+// elements have been enqueued (or 'timeout_ms' elapses, if specified).
 //
 // Arguments:
-//	prefix: Must have a single element.  The prefix of a V2 checkpoint.
-//	tensor_names: shape {N}.  The names of the tensors to be restored.
-//	shape_and_slices: shape {N}.  The slice specs of the tensors to be restored.
-// Empty strings indicate that they are non-partitioned tensors.
-//	dtypes: shape {N}.  The list of expected dtype for the tensors.  Must match
-// those stored in the checkpoint.
+//	handle: The handle to a queue.
+//	components: One or more tensors from which the enqueued tensors should
+// be taken.
 //
-// Returns shape {N}.  The restored tensors, whose shapes are read from the
-// checkpoint directly.
-func RestoreV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, dtypes []tf.DataType) (tensors []tf.Output) {
+// Returns the created operation.
+func QueueEnqueueManyV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueManyV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "RestoreV2",
+		Type: "QueueEnqueueManyV2",
 		Input: []tf.Input{
-			prefix, tensor_names, shape_and_slices,
+			handle, tf.OutputList(components),
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if tensors, idx, err = makeOutputList(op, idx, "tensors"); err != nil {
-		scope.UpdateErr("RestoreV2", err)
-		return
+	return scope.AddOperation(opspec)
+}
+
+// PrintV2Attr is an optional argument to PrintV2.
+type PrintV2Attr func(optionalAttr)
+
+// PrintV2OutputStream sets the optional output_stream attribute to value.
+//
+// value: A string specifying the output stream or logging level to print to.
+// If not specified, defaults to "stderr"
+func PrintV2OutputStream(value string) PrintV2Attr {
+	return func(m optionalAttr) {
+		m["output_stream"] = value
 	}
-	return tensors
 }
 
-// Receives a tensor value broadcast from another device.
-func CollectiveBcastRecv(scope *Scope, T tf.DataType, group_size int64, group_key int64, instance_key int64, shape tf.Shape) (data tf.Output) {
+// Prints a string scalar.
+//
+// Prints a string scalar to the desired output_stream.
+//
+// Arguments:
+//	input: The string scalar to print.
+//
+// Returns the created operation.
+func PrintV2(scope *Scope, input tf.Output, optional ...PrintV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"T": T, "group_size": group_size, "group_key": group_key, "instance_key": instance_key, "shape": shape}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "CollectiveBcastRecv",
-
+		Type: "PrintV2",
+		Input: []tf.Input{
+			input,
+		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Decode web-safe base64-encoded strings.
+// The gradient operator for the SparseSlice op.
 //
-// Input may or may not have padding at the end. See EncodeBase64 for padding.
-// Web-safe means that input must use - and _ instead of + and /.
+// This op takes in the upstream gradient w.r.t. non-empty values of
+// the sliced `SparseTensor`, and outputs the gradients w.r.t.
+// the non-empty values of input `SparseTensor`.
 //
 // Arguments:
-//	input: Base64 strings to decode.
+//	backprop_val_grad: 1-D. The gradient with respect to
+// the non-empty values of the sliced `SparseTensor`.
+//	input_indices: 2-D.  The `indices` of the input `SparseTensor`.
+//	input_start: 1-D. tensor represents the start of the slice.
+//	output_indices: 2-D.  The `indices` of the sliced `SparseTensor`.
 //
-// Returns Decoded strings.
-func DecodeBase64(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns 1-D. The gradient with respect to the non-empty values of input `SparseTensor`.
+func SparseSliceGrad(scope *Scope, backprop_val_grad tf.Output, input_indices tf.Output, input_start tf.Output, output_indices tf.Output) (val_grad tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeBase64",
+		Type: "SparseSliceGrad",
 		Input: []tf.Input{
-			input,
+			backprop_val_grad, input_indices, input_start, output_indices,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Store the input tensor in the state of the current session.
+// Creates a dataset by applying optimizations to `input_dataset`.
+//
+// Creates a dataset by applying optimizations to `input_dataset`.
 //
 // Arguments:
-//	value: The tensor to be stored.
+//	input_dataset: A variant tensor representing the input dataset.
+//	optimizations: A `tf.string` vector `tf.Tensor` identifying optimizations to use.
 //
-// Returns The handle for the tensor stored in the session state, represented
-// as a string.
-func GetSessionHandle(scope *Scope, value tf.Output) (handle tf.Output) {
+//
+func OptimizeDataset(scope *Scope, input_dataset tf.Output, optimizations tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "GetSessionHandle",
+		Type: "OptimizeDataset",
 		Input: []tf.Input{
-			value,
+			input_dataset, optimizations,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyProximalAdagradAttr is an optional argument to ResourceSparseApplyProximalAdagrad.
-type ResourceSparseApplyProximalAdagradAttr func(optionalAttr)
+// ResourceApplyProximalAdagradAttr is an optional argument to ResourceApplyProximalAdagrad.
+type ResourceApplyProximalAdagradAttr func(optionalAttr)
 
-// ResourceSparseApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
+// ResourceApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
 //
 // value: If True, updating of the var and accum tensors will be protected by
 // a lock; otherwise the behavior is undefined, but may exhibit less contention.
 // If not specified, defaults to false
-func ResourceSparseApplyProximalAdagradUseLocking(value bool) ResourceSparseApplyProximalAdagradAttr {
+func ResourceApplyProximalAdagradUseLocking(value bool) ResourceApplyProximalAdagradAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Sparse update entries in '*var' and '*accum' according to FOBOS algorithm.
+// Update '*var' and '*accum' according to FOBOS with Adagrad learning rate.
 //
-// That is for rows we have grad for, we update var and accum as follows:
 // accum += grad * grad
-// prox_v = var
-// prox_v -= lr * grad * (1 / sqrt(accum))
+// prox_v = var - lr * grad * (1 / sqrt(accum))
 // var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
 //
 // Arguments:
 //	var_: Should be from a Variable().
 //	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
+//	lr: Scaling factor. Must be a scalar.
 //	l1: L1 regularization. Must be a scalar.
 //	l2: L2 regularization. Must be a scalar.
 //	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
 //
 // Returns the created operation.
-func ResourceSparseApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalAdagradAttr) (o *tf.Operation) {
+func ResourceApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, optional ...ResourceApplyProximalAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16451,125 +20559,224 @@ func ResourceSparseApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.O
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyProximalAdagrad",
+		Type: "ResourceApplyProximalAdagrad",
 		Input: []tf.Input{
-			var_, accum, lr, l1, l2, grad, indices,
+			var_, accum, lr, l1, l2, grad,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// MaxPool3DGradAttr is an optional argument to MaxPool3DGrad.
-type MaxPool3DGradAttr func(optionalAttr)
+// MutableHashTableOfTensorsV2Attr is an optional argument to MutableHashTableOfTensorsV2.
+type MutableHashTableOfTensorsV2Attr func(optionalAttr)
 
-// MaxPool3DGradDataFormat sets the optional data_format attribute to value.
+// MutableHashTableOfTensorsV2Container sets the optional container attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func MaxPool3DGradDataFormat(value string) MaxPool3DGradAttr {
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func MutableHashTableOfTensorsV2Container(value string) MutableHashTableOfTensorsV2Attr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["container"] = value
 	}
 }
 
-// Computes gradients of max pooling function.
+// MutableHashTableOfTensorsV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func MutableHashTableOfTensorsV2SharedName(value string) MutableHashTableOfTensorsV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// MutableHashTableOfTensorsV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+// If not specified, defaults to false
+func MutableHashTableOfTensorsV2UseNodeNameSharing(value bool) MutableHashTableOfTensorsV2Attr {
+	return func(m optionalAttr) {
+		m["use_node_name_sharing"] = value
+	}
+}
+
+// MutableHashTableOfTensorsV2ValueShape sets the optional value_shape attribute to value.
+// If not specified, defaults to <>
+func MutableHashTableOfTensorsV2ValueShape(value tf.Shape) MutableHashTableOfTensorsV2Attr {
+	return func(m optionalAttr) {
+		m["value_shape"] = value
+	}
+}
+
+// Creates an empty hash table.
+//
+// This op creates a mutable hash table, specifying the type of its keys and
+// values. Each value must be a vector. Data can be inserted into the table using
+// the insert operations. It does not support the initialization operation.
 //
 // Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func MaxPool3DGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradAttr) (output tf.Output) {
+//	key_dtype: Type of the table keys.
+//	value_dtype: Type of the table values.
+//
+// Returns Handle to a table.
+func MutableHashTableOfTensorsV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableOfTensorsV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPool3DGrad",
-		Input: []tf.Input{
-			orig_input, orig_output, grad,
-		},
+		Type: "MutableHashTableOfTensorsV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the name of the device on which `resource` has been placed.
-func ExperimentalIteratorGetDevice(scope *Scope, resource tf.Output) (device tf.Output) {
+// ResourceApplyProximalGradientDescentAttr is an optional argument to ResourceApplyProximalGradientDescent.
+type ResourceApplyProximalGradientDescentAttr func(optionalAttr)
+
+// ResourceApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
+//
+// value: If True, the subtraction will be protected by a lock;
+// otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyProximalGradientDescentUseLocking(value bool) ResourceApplyProximalGradientDescentAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' as FOBOS algorithm with fixed learning rate.
+//
+// prox_v = var - alpha * delta
+// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	alpha: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	delta: The change.
+//
+// Returns the created operation.
+func ResourceApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, delta tf.Output, optional ...ResourceApplyProximalGradientDescentAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalIteratorGetDevice",
+		Type: "ResourceApplyProximalGradientDescent",
 		Input: []tf.Input{
-			resource,
+			var_, alpha, l1, l2, delta,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Returns 0 if the denominator is zero.
+//
+//
+// *NOTE*: `DivNoNan` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func DivNoNan(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "DivNoNan",
+		Input: []tf.Input{
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SparseReduceSumAttr is an optional argument to SparseReduceSum.
-type SparseReduceSumAttr func(optionalAttr)
+// Subtracts a value from the current value of a variable.
+//
+// Any ReadVariableOp with a control dependency on this op is guaranteed to
+// see the decremented value or a subsequent newer one.
+//
+// Arguments:
+//	resource: handle to the resource in which to store the variable.
+//	value: the value by which the variable will be incremented.
+//
+// Returns the created operation.
+func AssignSubVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "AssignSubVariableOp",
+		Input: []tf.Input{
+			resource, value,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
 
-// SparseReduceSumKeepDims sets the optional keep_dims attribute to value.
+// RestoreAttr is an optional argument to Restore.
+type RestoreAttr func(optionalAttr)
+
+// RestorePreferredShard sets the optional preferred_shard attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SparseReduceSumKeepDims(value bool) SparseReduceSumAttr {
+// value: Index of file to open first if multiple files match
+// `file_pattern`.
+// If not specified, defaults to -1
+func RestorePreferredShard(value int64) RestoreAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["preferred_shard"] = value
 	}
 }
 
-// Computes the sum of elements across dimensions of a SparseTensor.
+// Restores a tensor from checkpoint files.
 //
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_sum()`.  In particular, this Op also returns a dense `Tensor`
-// instead of a sparse one.
+// Reads a tensor stored in one or several files. If there are several files (for
+// instance because a tensor was saved as slices), `file_pattern` may contain
+// wildcard symbols (`*` and `?`) in the filename portion only, not in the
+// directory portion.
 //
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
+// If a `file_pattern` matches several files, `preferred_shard` can be used to hint
+// in which file the requested tensor is likely to be found. This op will first
+// open the file at index `preferred_shard` in the list of matching files and try
+// to restore tensors from that file.  Only if some tensors or tensor slices are
+// not found in that first file, then the Op opens all the files. Setting
+// `preferred_shard` to match the value passed as the `shard` input
+// of a matching `Save` Op may speed up Restore.  This attribute only affects
+// performance, not correctness.  The default value -1 means files are processed in
+// order.
 //
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
+// See also `RestoreSlice`.
 //
 // Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+//	file_pattern: Must have a single element. The pattern of the files from
+// which we read the tensor.
+//	tensor_name: Must have a single element. The name of the tensor to be
+// restored.
+//	dt: The type of the tensor to be restored.
 //
-// Returns `R-K`-D.  The reduced Tensor.
-func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumAttr) (output tf.Output) {
+// Returns The restored tensor.
+func Restore(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, dt tf.DataType, optional ...RestoreAttr) (tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dt": dt}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseReduceSum",
+		Type: "Restore",
 		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
+			file_pattern, tensor_name,
 		},
 		Attrs: attrs,
 	}
@@ -16577,49 +20784,34 @@ func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Outp
 	return op.Output(0)
 }
 
-// SparseTensorDenseMatMulAttr is an optional argument to SparseTensorDenseMatMul.
-type SparseTensorDenseMatMulAttr func(optionalAttr)
+// QuantizedResizeBilinearAttr is an optional argument to QuantizedResizeBilinear.
+type QuantizedResizeBilinearAttr func(optionalAttr)
 
-// SparseTensorDenseMatMulAdjointA sets the optional adjoint_a attribute to value.
+// QuantizedResizeBilinearAlignCorners sets the optional align_corners attribute to value.
 //
-// value: Use the adjoint of A in the matrix multiply.  If A is complex, this
-// is transpose(conj(A)).  Otherwise it's transpose(A).
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
 // If not specified, defaults to false
-func SparseTensorDenseMatMulAdjointA(value bool) SparseTensorDenseMatMulAttr {
+func QuantizedResizeBilinearAlignCorners(value bool) QuantizedResizeBilinearAttr {
 	return func(m optionalAttr) {
-		m["adjoint_a"] = value
+		m["align_corners"] = value
 	}
 }
 
-// SparseTensorDenseMatMulAdjointB sets the optional adjoint_b attribute to value.
+// Resize quantized `images` to `size` using quantized bilinear interpolation.
 //
-// value: Use the adjoint of B in the matrix multiply.  If B is complex, this
-// is transpose(conj(B)).  Otherwise it's transpose(B).
-// If not specified, defaults to false
-func SparseTensorDenseMatMulAdjointB(value bool) SparseTensorDenseMatMulAttr {
-	return func(m optionalAttr) {
-		m["adjoint_b"] = value
-	}
-}
-
-// Multiply SparseTensor (of rank 2) "A" by dense matrix "B".
+// Input images and output images must be quantized types.
 //
-// No validity checking is performed on the indices of A.  However, the following
-// input format is recommended for optimal behavior:
+// Arguments:
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-// if adjoint_a == false:
-//   A should be sorted in lexicographically increasing order.  Use SparseReorder
-//   if you're not sure.
-// if adjoint_a == true:
-//   A should be sorted in order of increasing dimension 1 (i.e., "column major"
-//   order instead of "row major" order).
 //
-// Arguments:
-//	a_indices: 2-D.  The `indices` of the `SparseTensor`, size `[nnz, 2]` Matrix.
-//	a_values: 1-D.  The `values` of the `SparseTensor`, size `[nnz]` Vector.
-//	a_shape: 1-D.  The `shape` of the `SparseTensor`, size `[2]` Vector.
-//	b: 2-D.  A dense Matrix.
-func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output, optional ...SparseTensorDenseMatMulAttr) (product tf.Output) {
+//
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func QuantizedResizeBilinear(scope *Scope, images tf.Output, size tf.Output, min tf.Output, max tf.Output, optional ...QuantizedResizeBilinearAttr) (resized_images tf.Output, out_min tf.Output, out_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16628,9 +20820,32 @@ func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Outp
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseTensorDenseMatMul",
+		Type: "QuantizedResizeBilinear",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b,
+			images, size, min, max,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Creates a dataset that uses a custom thread pool to compute `input_dataset`.
+//
+// Arguments:
+//
+//	num_threads: Identifies the number of threads to use for the private threadpool.
+//
+//
+func ExperimentalPrivateThreadPoolDataset(scope *Scope, input_dataset tf.Output, num_threads tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalPrivateThreadPoolDataset",
+		Input: []tf.Input{
+			input_dataset, num_threads,
 		},
 		Attrs: attrs,
 	}
@@ -16638,103 +20853,127 @@ func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Outp
 	return op.Output(0)
 }
 
-// ResourceApplyRMSPropAttr is an optional argument to ResourceApplyRMSProp.
-type ResourceApplyRMSPropAttr func(optionalAttr)
+// DenseToSparseSetOperationAttr is an optional argument to DenseToSparseSetOperation.
+type DenseToSparseSetOperationAttr func(optionalAttr)
 
-// ResourceApplyRMSPropUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var, ms, and mom tensors is protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyRMSPropUseLocking(value bool) ResourceApplyRMSPropAttr {
+// DenseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func DenseToSparseSetOperationValidateIndices(value bool) DenseToSparseSetOperationAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// Update '*var' according to the RMSProp algorithm.
+// Applies set operation along last dimension of `Tensor` and `SparseTensor`.
 //
-// Note that in dense implementation of this algorithm, ms and mom will
-// update even if the grad is zero, but in this sparse implementation, ms
-// and mom will not update in iterations during which the grad is zero.
+// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
 //
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
+// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
 //
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
+// If `validate_indices` is `True`, this op validates the order and range of `set2`
+// indices.
+//
+// Output `result` is a `SparseTensor` represented by `result_indices`,
+// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
+// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
+// dimension contains the result of `set_operation` applied to the corresponding
+// `[0...n-1]` dimension of `set`.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
+//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
+// Dimension `n` contains values in a set, duplicates are allowed but ignored.
+//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
+// be the same as the 1st `n-1` dimensions of `set1`, `result_shape[n]` is the
+// max set size across `n-1` dimensions.
 //
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
 //
-// Returns the created operation.
-func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyRMSPropAttr) (o *tf.Operation) {
+// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
+// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
+// is the max result set size across all `0...n-1` dimensions.
+func DenseToSparseSetOperation(scope *Scope, set1 tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...DenseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"set_operation": set_operation}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyRMSProp",
+		Type: "DenseToSparseSetOperation",
 		Input: []tf.Input{
-			var_, ms, mom, lr, rho, momentum, epsilon, grad,
+			set1, set2_indices, set2_values, set2_shape,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// SerializeManySparseAttr is an optional argument to SerializeManySparse.
-type SerializeManySparseAttr func(optionalAttr)
-
-// SerializeManySparseOutType sets the optional out_type attribute to value.
+// L2 Loss.
 //
-// value: The `dtype` to use for serialization; the supported types are `string`
-// (default) and `variant`.
-// If not specified, defaults to DT_STRING
-func SerializeManySparseOutType(value tf.DataType) SerializeManySparseAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
+// Computes half the L2 norm of a tensor without the `sqrt`:
+//
+//     output = sum(t ** 2) / 2
+//
+// Arguments:
+//	t: Typically 2-D, but may have any dimensions.
+//
+// Returns 0-D.
+func L2Loss(scope *Scope, t tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "L2Loss",
+		Input: []tf.Input{
+			t,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor` object.
-//
-// The `SparseTensor` must have rank `R` greater than 1, and the first dimension
-// is treated as the minibatch dimension.  Elements of the `SparseTensor`
-// must be sorted in increasing order of this first dimension.  The serialized
-// `SparseTensor` objects going into each row of `serialized_sparse` will have
-// rank `R-1`.
+// StackV2Attr is an optional argument to StackV2.
+type StackV2Attr func(optionalAttr)
+
+// StackV2StackName sets the optional stack_name attribute to value.
 //
-// The minibatch size `N` is extracted from `sparse_shape[0]`.
+// value: Overrides the name used for the temporary stack resource. Default
+// value is the name of the 'Stack' op (which is guaranteed unique).
+// If not specified, defaults to ""
+func StackV2StackName(value string) StackV2Attr {
+	return func(m optionalAttr) {
+		m["stack_name"] = value
+	}
+}
+
+// A stack that produces elements in first-in last-out order.
 //
 // Arguments:
-//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
-//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
-func SerializeManySparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeManySparseAttr) (serialized_sparse tf.Output) {
+//	max_size: The maximum size of the stack if non-negative. If negative, the stack
+// size is unlimited.
+//	elem_type: The type of the elements on the stack.
+//
+// Returns The handle to the stack.
+func StackV2(scope *Scope, max_size tf.Output, elem_type tf.DataType, optional ...StackV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"elem_type": elem_type}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SerializeManySparse",
+		Type: "StackV2",
 		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
+			max_size,
 		},
 		Attrs: attrs,
 	}
@@ -16742,256 +20981,261 @@ func SerializeManySparse(scope *Scope, sparse_indices tf.Output, sparse_values t
 	return op.Output(0)
 }
 
-// Computes inverse hyperbolic cosine of x element-wise.
-func Acosh(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Acosh",
-		Input: []tf.Input{
-			x,
-		},
+// CudnnRNNBackpropAttr is an optional argument to CudnnRNNBackprop.
+type CudnnRNNBackpropAttr func(optionalAttr)
+
+// CudnnRNNBackpropRnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNBackpropRnnMode(value string) CudnnRNNBackpropAttr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// TensorArrayV2Attr is an optional argument to TensorArrayV2.
-type TensorArrayV2Attr func(optionalAttr)
+// CudnnRNNBackpropInputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNBackpropInputMode(value string) CudnnRNNBackpropAttr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
+	}
+}
 
-// TensorArrayV2ElementShape sets the optional element_shape attribute to value.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayV2ElementShape(value tf.Shape) TensorArrayV2Attr {
+// CudnnRNNBackpropDirection sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNBackpropDirection(value string) CudnnRNNBackpropAttr {
 	return func(m optionalAttr) {
-		m["element_shape"] = value
+		m["direction"] = value
 	}
 }
 
-// TensorArrayV2DynamicSize sets the optional dynamic_size attribute to value.
-// If not specified, defaults to false
-func TensorArrayV2DynamicSize(value bool) TensorArrayV2Attr {
+// CudnnRNNBackpropDropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNBackpropDropout(value float32) CudnnRNNBackpropAttr {
 	return func(m optionalAttr) {
-		m["dynamic_size"] = value
+		m["dropout"] = value
 	}
 }
 
-// TensorArrayV2ClearAfterRead sets the optional clear_after_read attribute to value.
-// If not specified, defaults to true
-func TensorArrayV2ClearAfterRead(value bool) TensorArrayV2Attr {
+// CudnnRNNBackpropSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNBackpropSeed(value int64) CudnnRNNBackpropAttr {
 	return func(m optionalAttr) {
-		m["clear_after_read"] = value
+		m["seed"] = value
 	}
 }
 
-// TensorArrayV2TensorArrayName sets the optional tensor_array_name attribute to value.
-// If not specified, defaults to ""
-func TensorArrayV2TensorArrayName(value string) TensorArrayV2Attr {
+// CudnnRNNBackpropSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNBackpropSeed2(value int64) CudnnRNNBackpropAttr {
 	return func(m optionalAttr) {
-		m["tensor_array_name"] = value
+		m["seed2"] = value
 	}
 }
 
-// Deprecated. Use TensorArrayV3
+// Backprop step of CudnnRNN.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArrayV3
-func TensorArrayV2(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV2Attr) (handle tf.Output) {
+// Compute the backprop of both data and weights in a RNN.
+//
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicate whether there is a linear projection between the input and
+//     the actual computation before the first layer. 'skip_input' is only allowed
+//     when input_size == num_units; 'auto_select' implies 'skip_input' when
+//     input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used. Should be
+//   "unidirectional" or "bidirectional".
+// dropout: Dropout probability. When set to 0., dropout is disabled.
+// seed: The 1st part of a seed to initialize dropout.
+// seed2: The 2nd part of a seed to initialize dropout.
+// input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
+// input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
+//     num_units].
+// input_c: For LSTM, a 3-D tensor with the shape of
+//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
+// params: A 1-D tensor that contains the weights and biases in an opaque layout.
+//     The size must be created through CudnnRNNParamsSize, and initialized
+//     separately. Note that they might not be compatible across different
+//     generations. So it is a good idea to save and restore
+// output: A 3-D tensor with the shape of [seq_length, batch_size,
+//     dir * num_units].
+// output_h: The same shape has input_h.
+// output_c: The same shape as input_c for LSTM. An empty tensor for other models.
+// output_backprop: A 3-D tensor with the same shape as output in the forward pass.
+// output_h_backprop: A 3-D tensor with the same shape as output_h in the forward
+//     pass.
+// output_c_backprop: A 3-D tensor with the same shape as output_c in the forward
+//     pass.
+// reserve_space: The same reserve_space produced in for forward operation.
+// input_backprop: The backprop to input in the forward pass. Has the same shape
+//     as input.
+// input_h_backprop: The backprop to input_h in the forward pass. Has the same
+//     shape as input_h.
+// input_c_backprop: The backprop to input_c in the forward pass. Has the same
+//     shape as input_c.
+// params_backprop: The backprop to the params buffer in the forward pass. Has the
+//     same shape as params.
+func CudnnRNNBackprop(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, output tf.Output, output_h tf.Output, output_c tf.Output, output_backprop tf.Output, output_h_backprop tf.Output, output_c_backprop tf.Output, reserve_space tf.Output, optional ...CudnnRNNBackpropAttr) (input_backprop tf.Output, input_h_backprop tf.Output, input_c_backprop tf.Output, params_backprop tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayV2",
+		Type: "CudnnRNNBackprop",
 		Input: []tf.Input{
-			size,
+			input, input_h, input_c, params, output, output_h, output_c, output_backprop, output_h_backprop, output_c_backprop, reserve_space,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// ThreadUnsafeUnigramCandidateSamplerAttr is an optional argument to ThreadUnsafeUnigramCandidateSampler.
-type ThreadUnsafeUnigramCandidateSamplerAttr func(optionalAttr)
+// InfeedEnqueueAttr is an optional argument to InfeedEnqueue.
+type InfeedEnqueueAttr func(optionalAttr)
 
-// ThreadUnsafeUnigramCandidateSamplerSeed sets the optional seed attribute to value.
+// InfeedEnqueueShape sets the optional shape attribute to value.
 //
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func ThreadUnsafeUnigramCandidateSamplerSeed(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
+// value: The shape of the tensor.
+// If not specified, defaults to <>
+func InfeedEnqueueShape(value tf.Shape) InfeedEnqueueAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["shape"] = value
 	}
 }
 
-// ThreadUnsafeUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+// InfeedEnqueueLayout sets the optional layout attribute to value.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func ThreadUnsafeUnigramCandidateSamplerSeed2(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
+// value: A vector holding the requested layout in minor-to-major sequence.
+// If a layout attribute is passed, but its values are all -1, the layout will
+// be computed by the infeed operation.
+// If not specified, defaults to <>
+func InfeedEnqueueLayout(value []int64) InfeedEnqueueAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["layout"] = value
 	}
 }
 
-// Generates labels for candidate sampling with a learned unigram distribution.
-//
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
-//
-// For each batch, this op picks a single set of sampled candidate labels.
+// InfeedEnqueueDeviceOrdinal sets the optional device_ordinal attribute to value.
 //
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
+// value: The TPU device to use. This should be -1 when the Op
+// is running on a TPU device, and >= 0 when the Op is running on the CPU
+// device.
+// If not specified, defaults to -1
+func InfeedEnqueueDeviceOrdinal(value int64) InfeedEnqueueAttr {
+	return func(m optionalAttr) {
+		m["device_ordinal"] = value
+	}
+}
+
+// An op which feeds a single Tensor value into the computation.
 //
 // Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
+//	input: A tensor that will be provided using the infeed mechanism.
 //
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func ThreadUnsafeUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...ThreadUnsafeUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+// Returns the created operation.
+func InfeedEnqueue(scope *Scope, input tf.Output, optional ...InfeedEnqueueAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ThreadUnsafeUnigramCandidateSampler",
+		Type: "InfeedEnqueue",
 		Input: []tf.Input{
-			true_classes,
+			input,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return scope.AddOperation(opspec)
 }
 
-// MaxPoolV2Attr is an optional argument to MaxPoolV2.
-type MaxPoolV2Attr func(optionalAttr)
-
-// MaxPoolV2DataFormat sets the optional data_format attribute to value.
+// Computes softmax cross entropy cost and gradients to backpropagate.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolV2DataFormat(value string) MaxPoolV2Attr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Performs max pooling on the input.
+// Inputs are the logits, not probabilities.
 //
 // Arguments:
-//	input: 4-D input to pool over.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+//	features: batch_size x num_classes matrix
+//	labels: batch_size x num_classes matrix
+// The caller must ensure that each batch of labels represents a valid
+// probability distribution.
 //
-// Returns The max pooled output tensor.
-func MaxPoolV2(scope *Scope, input tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolV2Attr) (output tf.Output) {
+// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
+func SoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolV2",
+		Type: "SoftmaxCrossEntropyWithLogits",
 		Input: []tf.Input{
-			input, ksize, strides,
+			features, labels,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// AddManySparseToTensorsMapAttr is an optional argument to AddManySparseToTensorsMap.
-type AddManySparseToTensorsMapAttr func(optionalAttr)
+// ReduceJoinAttr is an optional argument to ReduceJoin.
+type ReduceJoinAttr func(optionalAttr)
 
-// AddManySparseToTensorsMapContainer sets the optional container attribute to value.
+// ReduceJoinKeepDims sets the optional keep_dims attribute to value.
 //
-// value: The container name for the `SparseTensorsMap` created by this op.
-// If not specified, defaults to ""
-func AddManySparseToTensorsMapContainer(value string) AddManySparseToTensorsMapAttr {
+// value: If `True`, retain reduced dimensions with length `1`.
+// If not specified, defaults to false
+func ReduceJoinKeepDims(value bool) ReduceJoinAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// AddManySparseToTensorsMapSharedName sets the optional shared_name attribute to value.
+// ReduceJoinSeparator sets the optional separator attribute to value.
 //
-// value: The shared name for the `SparseTensorsMap` created by this op.
-// If blank, the new Operation's unique name is used.
+// value: The separator to use when joining.
 // If not specified, defaults to ""
-func AddManySparseToTensorsMapSharedName(value string) AddManySparseToTensorsMapAttr {
+func ReduceJoinSeparator(value string) ReduceJoinAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["separator"] = value
 	}
 }
 
-// Add an `N`-minibatch `SparseTensor` to a `SparseTensorsMap`, return `N` handles.
-//
-// A `SparseTensor` of rank `R` is represented by three tensors: `sparse_indices`,
-// `sparse_values`, and `sparse_shape`, where
-//
-// ```sparse_indices.shape[1] == sparse_shape.shape[0] == R```
+// Joins a string Tensor across the given dimensions.
 //
-// An `N`-minibatch of `SparseTensor` objects is represented as a `SparseTensor`
-// having a first `sparse_indices` column taking values between `[0, N)`, where
-// the minibatch size `N == sparse_shape[0]`.
+// Computes the string join across dimensions in the given string Tensor of shape
+// `[\\(d_0, d_1, ..., d_{n-1}\\)]`.  Returns a new Tensor created by joining the input
+// strings with the given separator (default: empty string).  Negative indices are
+// counted backwards from the end, with `-1` being equivalent to `n - 1`.  If
+// indices are not specified, joins across all dimensions beginning from `n - 1`
+// through `0`.
 //
-// The input `SparseTensor` must have rank `R` greater than 1, and the first
-// dimension is treated as the minibatch dimension.  Elements of the `SparseTensor`
-// must be sorted in increasing order of this first dimension.  The stored
-// `SparseTensor` objects pointed to by each row of the output `sparse_handles`
-// will have rank `R-1`.
+// For example:
 //
-// The `SparseTensor` values can then be read out as part of a minibatch by passing
-// the given keys as vector elements to `TakeManySparseFromTensorsMap`.  To ensure
-// the correct `SparseTensorsMap` is accessed, ensure that the same
-// `container` and `shared_name` are passed to that Op.  If no `shared_name`
-// is provided here, instead use the *name* of the Operation created by calling
-// `AddManySparseToTensorsMap` as the `shared_name` passed to
-// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
+// ```python
+// # tensor `a` is [["a", "b"], ["c", "d"]]
+// tf.reduce_join(a, 0) ==> ["ac", "bd"]
+// tf.reduce_join(a, 1) ==> ["ab", "cd"]
+// tf.reduce_join(a, -2) = tf.reduce_join(a, 0) ==> ["ac", "bd"]
+// tf.reduce_join(a, -1) = tf.reduce_join(a, 1) ==> ["ab", "cd"]
+// tf.reduce_join(a, 0, keep_dims=True) ==> [["ac", "bd"]]
+// tf.reduce_join(a, 1, keep_dims=True) ==> [["ab"], ["cd"]]
+// tf.reduce_join(a, 0, separator=".") ==> ["a.c", "b.d"]
+// tf.reduce_join(a, [0, 1]) ==> "acbd"
+// tf.reduce_join(a, [1, 0]) ==> "abcd"
+// tf.reduce_join(a, []) ==> [["a", "b"], ["c", "d"]]
+// tf.reduce_join(a) = tf.reduce_join(a, [1, 0]) ==> "abcd"
+// ```
 //
 // Arguments:
-//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
-// `sparse_indices[:, 0]` must be ordered values in `[0, N)`.
-//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
-// The minibatch size `N == sparse_shape[0]`.
+//	inputs: The input to be joined.  All reduced indices must have non-zero size.
+//	reduction_indices: The dimensions to reduce over.  Dimensions are reduced in the
+// order specified.  Omitting `reduction_indices` is equivalent to passing
+// `[n-1, n-2, ..., 0]`.  Negative indices from `-n` to `-1` are supported.
 //
-// Returns 1-D.  The handles of the `SparseTensor` now stored in the
-// `SparseTensorsMap`.  Shape: `[N]`.
-func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddManySparseToTensorsMapAttr) (sparse_handles tf.Output) {
+// Returns Has shape equal to that of the input with reduced dimensions removed or
+// set to `1` depending on `keep_dims`.
+func ReduceJoin(scope *Scope, inputs tf.Output, reduction_indices tf.Output, optional ...ReduceJoinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17000,9 +21244,9 @@ func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_va
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AddManySparseToTensorsMap",
+		Type: "ReduceJoin",
 		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
+			inputs, reduction_indices,
 		},
 		Attrs: attrs,
 	}
@@ -17010,126 +21254,298 @@ func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_va
 	return op.Output(0)
 }
 
-// Concatenates tensors along one dimension.
-//
-// Arguments:
-//	values: List of `N` Tensors to concatenate. Their ranks and types must match,
-// and their sizes must match in all dimensions except `concat_dim`.
-//	axis: 0-D.  The dimension along which to concatenate.  Must be in the
-// range [-rank(values), rank(values)).
+// TopKAttr is an optional argument to TopK.
+type TopKAttr func(optionalAttr)
+
+// TopKSorted sets the optional sorted attribute to value.
 //
-// Returns A `Tensor` with the concatenation of values stacked along the
-// `concat_dim` dimension.  This tensor's shape matches that of `values` except
-// in `concat_dim` where it has the sum of the sizes.
-func ConcatV2(scope *Scope, values []tf.Output, axis tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ConcatV2",
-		Input: []tf.Input{
-			tf.OutputList(values), axis,
-		},
+// value: If true the resulting `k` elements will be sorted by the values in
+// descending order.
+// If not specified, defaults to true
+func TopKSorted(value bool) TopKAttr {
+	return func(m optionalAttr) {
+		m["sorted"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Reads and outputs the entire contents of the input filename.
-func ReadFile(scope *Scope, filename tf.Output) (contents tf.Output) {
+// Finds values and indices of the `k` largest elements for the last dimension.
+//
+// DEPRECATED at GraphDef version 7: Use TopKV2 instead
+//
+// If the input is a vector (rank-1), finds the `k` largest entries in the vector
+// and outputs their values and indices as vectors.  Thus `values[j]` is the
+// `j`-th largest entry in `input`, and its index is `indices[j]`.
+//
+// For matrices (resp. higher rank input), computes the top `k` entries in each
+// row (resp. vector along the last dimension).  Thus,
+//
+//     values.shape = indices.shape = input.shape[:-1] + [k]
+//
+// If two elements are equal, the lower-index element appears first.
+//
+// If `k` varies dynamically, use `TopKV2` below.
+//
+// Arguments:
+//	input: 1-D or higher with last dimension at least `k`.
+//	k: Number of top elements to look for along the last dimension (along each
+// row for matrices).
+//
+// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`.
+func TopK(scope *Scope, input tf.Output, k int64, optional ...TopKAttr) (values tf.Output, indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"k": k}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ReadFile",
+		Type: "TopK",
 		Input: []tf.Input{
-			filename,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Returns immutable tensor from memory region.
+// BatchToSpace for N-D tensors of type T.
 //
-// The current implementation memmaps the tensor from a file.
+// This operation reshapes the "batch" dimension 0 into `M + 1` dimensions of shape
+// `block_shape + [batch]`, interleaves these blocks back into the grid defined by
+// the spatial dimensions `[1, ..., M]`, to obtain a result with the same rank as
+// the input.  The spatial dimensions of this intermediate result are then
+// optionally cropped according to `crops` to produce the output.  This is the
+// reverse of SpaceToBatch.  See below for a precise description.
 //
 // Arguments:
-//	dtype: Type of the returned tensor.
-//	shape: Shape of the returned tensor.
-//	memory_region_name: Name of readonly memory region used by the tensor, see
-// NewReadOnlyMemoryRegionFromFile in tensorflow::Env.
-func ImmutableConst(scope *Scope, dtype tf.DataType, shape tf.Shape, memory_region_name string) (tensor tf.Output) {
+//	input: N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,
+// where spatial_shape has M dimensions.
+//	block_shape: 1-D with shape `[M]`, all values must be >= 1.
+//	crops: 2-D with shape `[M, 2]`, all values must be >= 0.
+//   `crops[i] = [crop_start, crop_end]` specifies the amount to crop from input
+//   dimension `i + 1`, which corresponds to spatial dimension `i`.  It is
+//   required that
+//   `crop_start[i] + crop_end[i] <= block_shape[i] * input_shape[i + 1]`.
+//
+// This operation is equivalent to the following steps:
+//
+// 1. Reshape `input` to `reshaped` of shape:
+//      [block_shape[0], ..., block_shape[M-1],
+//       batch / prod(block_shape),
+//       input_shape[1], ..., input_shape[N-1]]
+//
+// 2. Permute dimensions of `reshaped` to produce `permuted` of shape
+//      [batch / prod(block_shape),
+//
+//       input_shape[1], block_shape[0],
+//       ...,
+//       input_shape[M], block_shape[M-1],
+//
+//       input_shape[M+1], ..., input_shape[N-1]]
+//
+// 3. Reshape `permuted` to produce `reshaped_permuted` of shape
+//      [batch / prod(block_shape),
+//
+//       input_shape[1] * block_shape[0],
+//       ...,
+//       input_shape[M] * block_shape[M-1],
+//
+//       input_shape[M+1],
+//       ...,
+//       input_shape[N-1]]
+//
+// 4. Crop the start and end of dimensions `[1, ..., M]` of
+//    `reshaped_permuted` according to `crops` to produce the output of shape:
+//      [batch / prod(block_shape),
+//
+//       input_shape[1] * block_shape[0] - crops[0,0] - crops[0,1],
+//       ...,
+//       input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1],
+//
+//       input_shape[M+1], ..., input_shape[N-1]]
+//
+// Some examples:
+//
+// (1) For the following input of shape `[4, 1, 1, 1]`, `block_shape = [2, 2]`, and
+//     `crops = [[0, 0], [0, 0]]`:
+//
+// ```
+// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+// ```
+//
+// The output tensor has shape `[1, 2, 2, 1]` and value:
+//
+// ```
+// x = [[[[1], [2]], [[3], [4]]]]
+// ```
+//
+// (2) For the following input of shape `[4, 1, 1, 3]`, `block_shape = [2, 2]`, and
+//     `crops = [[0, 0], [0, 0]]`:
+//
+// ```
+// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
+// ```
+//
+// The output tensor has shape `[1, 2, 2, 3]` and value:
+//
+// ```
+// x = [[[[1, 2, 3], [4, 5, 6]],
+//       [[7, 8, 9], [10, 11, 12]]]]
+// ```
+//
+// (3) For the following input of shape `[4, 2, 2, 1]`, `block_shape = [2, 2]`, and
+//     `crops = [[0, 0], [0, 0]]`:
+//
+// ```
+// x = [[[[1], [3]], [[9], [11]]],
+//      [[[2], [4]], [[10], [12]]],
+//      [[[5], [7]], [[13], [15]]],
+//      [[[6], [8]], [[14], [16]]]]
+// ```
+//
+// The output tensor has shape `[1, 4, 4, 1]` and value:
+//
+// ```
+// x = [[[1],   [2],  [3],  [4]],
+//      [[5],   [6],  [7],  [8]],
+//      [[9],  [10], [11],  [12]],
+//      [[13], [14], [15],  [16]]]
+// ```
+//
+// (4) For the following input of shape `[8, 1, 3, 1]`, `block_shape = [2, 2]`, and
+//     `crops = [[0, 0], [2, 0]]`:
+//
+// ```
+// x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
+//      [[[0], [2], [4]]], [[[0], [10], [12]]],
+//      [[[0], [5], [7]]], [[[0], [13], [15]]],
+//      [[[0], [6], [8]]], [[[0], [14], [16]]]]
+// ```
+//
+// The output tensor has shape `[2, 2, 4, 1]` and value:
+//
+// ```
+// x = [[[[1],   [2],  [3],  [4]],
+//       [[5],   [6],  [7],  [8]]],
+//      [[[9],  [10], [11],  [12]],
+//       [[13], [14], [15],  [16]]]]
+// ```
+func BatchToSpaceND(scope *Scope, input tf.Output, block_shape tf.Output, crops tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype, "shape": shape, "memory_region_name": memory_region_name}
 	opspec := tf.OpSpec{
-		Type: "ImmutableConst",
-
-		Attrs: attrs,
+		Type: "BatchToSpaceND",
+		Input: []tf.Input{
+			input, block_shape, crops,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// StringJoinAttr is an optional argument to StringJoin.
-type StringJoinAttr func(optionalAttr)
+// UnpackAttr is an optional argument to Unpack.
+type UnpackAttr func(optionalAttr)
 
-// StringJoinSeparator sets the optional separator attribute to value.
+// UnpackAxis sets the optional axis attribute to value.
 //
-// value: string, an optional join separator.
-// If not specified, defaults to ""
-func StringJoinSeparator(value string) StringJoinAttr {
+// value: Dimension along which to unpack.  Negative values wrap around, so the
+// valid range is `[-R, R)`.
+// If not specified, defaults to 0
+func UnpackAxis(value int64) UnpackAttr {
 	return func(m optionalAttr) {
-		m["separator"] = value
+		m["axis"] = value
 	}
 }
 
-// Joins the strings in the given list of string tensors into one tensor;
+// Unpacks a given dimension of a rank-`R` tensor into `num` rank-`(R-1)` tensors.
 //
-// with the given separator (default is an empty separator).
+// Unpacks `num` tensors from `value` by chipping it along the `axis` dimension.
+// For example, given a tensor of shape `(A, B, C, D)`;
+//
+// If `axis == 0` then the i'th tensor in `output` is the slice `value[i, :, :, :]`
+//   and each tensor in `output` will have shape `(B, C, D)`. (Note that the
+//   dimension unpacked along is gone, unlike `split`).
+//
+// If `axis == 1` then the i'th tensor in `output` is the slice `value[:, i, :, :]`
+//   and each tensor in `output` will have shape `(A, C, D)`.
+// Etc.
+//
+// This is the opposite of `pack`.
 //
 // Arguments:
-//	inputs: A list of string tensors.  The tensors must all have the same shape,
-// or be scalars.  Scalars may be mixed in; these will be broadcast to the shape
-// of non-scalar inputs.
-func StringJoin(scope *Scope, inputs []tf.Output, optional ...StringJoinAttr) (output tf.Output) {
+//	value: 1-D or higher, with `axis` dimension size equal to `num`.
+//
+//
+// Returns The list of tensors unpacked from `value`.
+func Unpack(scope *Scope, value tf.Output, num int64, optional ...UnpackAttr) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num": num}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StringJoin",
+		Type: "Unpack",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			value,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("Unpack", err)
+		return
+	}
+	return output
 }
 
-// Creates and returns an empty tensor list.
+// Delete the stack from its resource container.
 //
-// All list elements must be tensors of dtype element_dtype and shape compatible
-// with element_shape.
+// Arguments:
+//	handle: The handle to a stack.
 //
-// handle: an empty tensor list.
-// element_dtype: the type of elements in the list.
-// element_shape: a shape compatible with that of elements in the list.
-func EmptyTensorList(scope *Scope, element_shape tf.Output, max_num_elements tf.Output, element_dtype tf.DataType) (handle tf.Output) {
+// Returns the created operation.
+func StackCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"element_dtype": element_dtype}
 	opspec := tf.OpSpec{
-		Type: "EmptyTensorList",
+		Type: "StackCloseV2",
 		Input: []tf.Input{
-			element_shape, max_num_elements,
+			handle,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Increments variable pointed to by 'resource' until it reaches 'limit'.
+//
+// Arguments:
+//	resource: Should be from a scalar `Variable` node.
+//	limit: If incrementing ref would bring it above limit, instead generates an
+// 'OutOfRange' error.
+//
+//
+// Returns A copy of the input before increment. If nothing else modifies the
+// input, the values produced will all be distinct.
+func ResourceCountUpTo(scope *Scope, resource tf.Output, limit int64, T tf.DataType) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"limit": limit, "T": T}
+	opspec := tf.OpSpec{
+		Type: "ResourceCountUpTo",
+		Input: []tf.Input{
+			resource,
 		},
 		Attrs: attrs,
 	}
@@ -17175,86 +21591,62 @@ func Timestamp(scope *Scope) (ts tf.Output) {
 	return op.Output(0)
 }
 
-// VariableShapeAttr is an optional argument to VariableShape.
-type VariableShapeAttr func(optionalAttr)
-
-// VariableShapeOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func VariableShapeOutType(value tf.DataType) VariableShapeAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Returns the shape of the variable pointed to by `resource`.
-//
-// This operation returns a 1-D integer tensor representing the shape of `input`.
+// Returns immutable tensor from memory region.
 //
-// For example:
+// The current implementation memmaps the tensor from a file.
 //
-// ```
-// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
-// shape(t) ==> [2, 2, 3]
-// ```
-func VariableShape(scope *Scope, input tf.Output, optional ...VariableShapeAttr) (output tf.Output) {
+// Arguments:
+//	dtype: Type of the returned tensor.
+//	shape: Shape of the returned tensor.
+//	memory_region_name: Name of readonly memory region used by the tensor, see
+// NewReadOnlyMemoryRegionFromFile in tensorflow::Env.
+func ImmutableConst(scope *Scope, dtype tf.DataType, shape tf.Shape, memory_region_name string) (tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape, "memory_region_name": memory_region_name}
 	opspec := tf.OpSpec{
-		Type: "VariableShape",
-		Input: []tf.Input{
-			input,
-		},
+		Type: "ImmutableConst",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AvgPoolGradAttr is an optional argument to AvgPoolGrad.
-type AvgPoolGradAttr func(optionalAttr)
+// StringJoinAttr is an optional argument to StringJoin.
+type StringJoinAttr func(optionalAttr)
 
-// AvgPoolGradDataFormat sets the optional data_format attribute to value.
+// StringJoinSeparator sets the optional separator attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func AvgPoolGradDataFormat(value string) AvgPoolGradAttr {
+// value: string, an optional join separator.
+// If not specified, defaults to ""
+func StringJoinSeparator(value string) StringJoinAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["separator"] = value
 	}
 }
 
-// Computes gradients of the average pooling function.
+// Joins the strings in the given list of string tensors into one tensor;
 //
-// Arguments:
-//	orig_input_shape: 1-D.  Shape of the original input to `avg_pool`.
-//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t.
-// the output of `avg_pool`.
-//	ksize: The size of the sliding window for each dimension of the input.
-//	strides: The stride of the sliding window for each dimension of the input.
-//	padding: The type of padding algorithm to use.
+// with the given separator (default is an empty separator).
 //
-// Returns 4-D.  Gradients w.r.t. the input of `avg_pool`.
-func AvgPoolGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolGradAttr) (output tf.Output) {
+// Arguments:
+//	inputs: A list of string tensors.  The tensors must all have the same shape,
+// or be scalars.  Scalars may be mixed in; these will be broadcast to the shape
+// of non-scalar inputs.
+func StringJoin(scope *Scope, inputs []tf.Output, optional ...StringJoinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AvgPoolGrad",
+		Type: "StringJoin",
 		Input: []tf.Input{
-			orig_input_shape, grad,
+			tf.OutputList(inputs),
 		},
 		Attrs: attrs,
 	}
@@ -17262,334 +21654,119 @@ func AvgPoolGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize
 	return op.Output(0)
 }
 
-// Greedily selects a subset of bounding boxes in descending order of score,
-//
-// pruning away boxes that have high overlaps
-// with previously selected boxes.  Bounding boxes with score less than
-// `score_threshold` are removed. N-by-n overlap values are supplied as square matrix,
-// which allows for defining a custom overlap criterium (eg. intersection over union,
-// intersection over area, etc.).
-//
-// The output of this operation is a set of integers indexing into the input
-// collection of bounding boxes representing the selected boxes.  The bounding
-// box coordinates corresponding to the selected indices can then be obtained
-// using the `tf.gather operation`.  For example:
-//
-//   selected_indices = tf.image.non_max_suppression_with_overlaps(
-//       overlaps, scores, max_output_size, overlap_threshold, score_threshold)
-//   selected_boxes = tf.gather(boxes, selected_indices)
+// Creates and returns an empty tensor list.
 //
-// Arguments:
-//	overlaps: A 2-D float tensor of shape `[num_boxes, num_boxes]` representing
-// the n-by-n box overlap values.
-//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-// score corresponding to each box (each row of boxes).
-//	max_output_size: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression.
-//	overlap_threshold: A 0-D float tensor representing the threshold for deciding whether
-// boxes overlap too.
-//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
-// boxes based on score.
+// All list elements must be tensors of dtype element_dtype and shape compatible
+// with element_shape.
 //
-// Returns A 1-D integer tensor of shape `[M]` representing the selected
-// indices from the boxes tensor, where `M <= max_output_size`.
-func NonMaxSuppressionWithOverlaps(scope *Scope, overlaps tf.Output, scores tf.Output, max_output_size tf.Output, overlap_threshold tf.Output, score_threshold tf.Output) (selected_indices tf.Output) {
+// handle: an empty tensor list.
+// element_dtype: the type of elements in the list.
+// element_shape: a shape compatible with that of elements in the list.
+func EmptyTensorList(scope *Scope, element_shape tf.Output, max_num_elements tf.Output, element_dtype tf.DataType) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
 	opspec := tf.OpSpec{
-		Type: "NonMaxSuppressionWithOverlaps",
+		Type: "EmptyTensorList",
 		Input: []tf.Input{
-			overlaps, scores, max_output_size, overlap_threshold, score_threshold,
+			element_shape, max_num_elements,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes softmax cross entropy cost and gradients to backpropagate.
+// Returns a list of tensors with the same shapes and contents as the input
 //
-// Unlike `SoftmaxCrossEntropyWithLogits`, this operation does not accept
-// a matrix of label probabilities, but rather a single label per row
-// of features.  This label is considered to have probability 1.0 for the
-// given row.
+// tensors.
 //
-// Inputs are the logits, not probabilities.
+// This op can be used to override the gradient for complicated functions. For
+// example, suppose y = f(x) and we wish to apply a custom function g for backprop
+// such that dx = g(dy). In Python,
 //
-// Arguments:
-//	features: batch_size x num_classes matrix
-//	labels: batch_size vector with values in [0, num_classes).
-// This is the label for the given minibatch entry.
+// ```python
+// with tf.get_default_graph().gradient_override_map(
+//     {'IdentityN': 'OverrideGradientWithG'}):
+//   y, _ = identity_n([f(x), x])
 //
-// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
-func SparseSoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
+// @tf.RegisterGradient('OverrideGradientWithG')
+// def ApplyG(op, dy, _):
+//   return [None, g(dy)]  # Do not backprop to f(x).
+// ```
+func IdentityN(scope *Scope, input []tf.Output) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSoftmaxCrossEntropyWithLogits",
+		Type: "IdentityN",
 		Input: []tf.Input{
-			features, labels,
+			tf.OutputList(input),
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Returns the truth value of NOT x element-wise.
-func LogicalNot(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "LogicalNot",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// 3D real-valued fast Fourier transform.
-//
-// Computes the 3-dimensional discrete Fourier transform of a real-valued signal
-// over the inner-most 3 dimensions of `input`.
-//
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT3D` only returns the
-// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
-// of `output`: the zero-frequency term, followed by the `fft_length / 2`
-// positive-frequency terms.
-//
-// Along each axis `RFFT3D` is computed on, if `fft_length` is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
-//
-// Arguments:
-//	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
-//
-// Returns A complex64 tensor of the same rank as `input`. The inner-most 3
-//   dimensions of `input` are replaced with the their 3D Fourier transform. The
-//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
-//   components.
-//
-// @compatibility(numpy)
-// Equivalent to np.fft.rfftn with 3 dimensions.
-// @end_compatibility
-func RFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("IdentityN", err)
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "RFFT3D",
-		Input: []tf.Input{
-			input, fft_length,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// TensorArrayV3Attr is an optional argument to TensorArrayV3.
-type TensorArrayV3Attr func(optionalAttr)
-
-// TensorArrayV3ElementShape sets the optional element_shape attribute to value.
-//
-// value: The expected shape of an element, if known. Used to
-// validate the shapes of TensorArray elements. If this shape is not
-// fully specified, gathering zero-size TensorArrays is an error.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayV3ElementShape(value tf.Shape) TensorArrayV3Attr {
-	return func(m optionalAttr) {
-		m["element_shape"] = value
-	}
-}
-
-// TensorArrayV3DynamicSize sets the optional dynamic_size attribute to value.
-//
-// value: A boolean that determines whether writes to the TensorArray
-// are allowed to grow the size.  By default, this is not allowed.
-// If not specified, defaults to false
-func TensorArrayV3DynamicSize(value bool) TensorArrayV3Attr {
-	return func(m optionalAttr) {
-		m["dynamic_size"] = value
-	}
+	return output
 }
 
-// TensorArrayV3ClearAfterRead sets the optional clear_after_read attribute to value.
-//
-// value: If true (default), Tensors in the TensorArray are cleared
-// after being read.  This disables multiple read semantics but allows early
-// release of memory.
-// If not specified, defaults to true
-func TensorArrayV3ClearAfterRead(value bool) TensorArrayV3Attr {
-	return func(m optionalAttr) {
-		m["clear_after_read"] = value
-	}
-}
+// ResourceApplyCenteredRMSPropAttr is an optional argument to ResourceApplyCenteredRMSProp.
+type ResourceApplyCenteredRMSPropAttr func(optionalAttr)
 
-// TensorArrayV3IdenticalElementShapes sets the optional identical_element_shapes attribute to value.
+// ResourceApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
 //
-// value: If true (default is false), then all
-// elements in the TensorArray will be expected to have have identical shapes.
-// This allows certain behaviors, like dynamically checking for
-// consistent shapes on write, and being able to fill in properly
-// shaped zero tensors on stack -- even if the element_shape attribute
-// is not fully defined.
+// value: If `True`, updating of the var, mg, ms, and mom tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
 // If not specified, defaults to false
-func TensorArrayV3IdenticalElementShapes(value bool) TensorArrayV3Attr {
-	return func(m optionalAttr) {
-		m["identical_element_shapes"] = value
-	}
-}
-
-// TensorArrayV3TensorArrayName sets the optional tensor_array_name attribute to value.
-//
-// value: Overrides the name used for the temporary tensor_array
-// resource. Default value is the name of the 'TensorArray' op (which
-// is guaranteed unique).
-// If not specified, defaults to ""
-func TensorArrayV3TensorArrayName(value string) TensorArrayV3Attr {
-	return func(m optionalAttr) {
-		m["tensor_array_name"] = value
-	}
-}
-
-// An array of Tensors of given size.
-//
-// Write data via Write and read via Read or Pack.
-//
-// Arguments:
-//	size: The size of the array.
-//	dtype: The type of the elements on the tensor_array.
-//
-// Returns The handle to the TensorArray.A scalar used to control gradient flow.
-func TensorArrayV3(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV3Attr) (handle tf.Output, flow tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayV3",
-		Input: []tf.Input{
-			size,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Runs multiple additive regression ensemble predictors on input instances and
-//
-// computes the logits. It is designed to be used during prediction.
-// It traverses all the trees and calculates the final score for each instance.
-//
-// Arguments:
-//
-//	bucketized_features: A list of rank 1 Tensors containing bucket id for each
-// feature.
-//	logits_dimension: scalar, dimension of the logits, to be used for partial logits
-// shape.
-//
-// Returns Output rank 2 Tensor containing logits for each example.
-func BoostedTreesPredict(scope *Scope, tree_ensemble_handle tf.Output, bucketized_features []tf.Output, logits_dimension int64) (logits tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"logits_dimension": logits_dimension}
-	opspec := tf.OpSpec{
-		Type: "BoostedTreesPredict",
-		Input: []tf.Input{
-			tree_ensemble_handle, tf.OutputList(bucketized_features),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Elementwise computes the bitwise OR of `x` and `y`.
-//
-// The result will have those bits set, that are set in `x`, `y` or both. The
-// computation is performed on the underlying representations of `x` and `y`.
-func BitwiseOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BitwiseOr",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// MatrixSolveLsAttr is an optional argument to MatrixSolveLs.
-type MatrixSolveLsAttr func(optionalAttr)
-
-// MatrixSolveLsFast sets the optional fast attribute to value.
-// If not specified, defaults to true
-func MatrixSolveLsFast(value bool) MatrixSolveLsAttr {
+func ResourceApplyCenteredRMSPropUseLocking(value bool) ResourceApplyCenteredRMSPropAttr {
 	return func(m optionalAttr) {
-		m["fast"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Solves one or more linear least-squares problems.
-//
-// `matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form real or complex matrices of size `[M, N]`. `Rhs` is a tensor of the same
-// type as `matrix` and shape `[..., M, K]`.
-// The output is a tensor shape `[..., N, K]` where each output matrix solves
-// each of the equations
-// `matrix[..., :, :]` * `output[..., :, :]` = `rhs[..., :, :]`
-// in the least squares sense.
+// Update '*var' according to the centered RMSProp algorithm.
 //
-// We use the following notation for (complex) matrix and right-hand sides
-// in the batch:
+// The centered RMSProp algorithm uses an estimate of the centered second moment
+// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
+// uses the (uncentered) second moment. This often helps with training, but is
+// slightly more expensive in terms of computation and memory.
 //
-// `matrix`=\\(A \in \mathbb{C}^{m \times n}\\),
-// `rhs`=\\(B  \in \mathbb{C}^{m \times k}\\),
-// `output`=\\(X  \in \mathbb{C}^{n \times k}\\),
-// `l2_regularizer`=\\(\lambda \in \mathbb{R}\\).
+// Note that in dense implementation of this algorithm, mg, ms, and mom will
+// update even if the grad is zero, but in this sparse implementation, mg, ms,
+// and mom will not update in iterations during which the grad is zero.
 //
-// If `fast` is `True`, then the solution is computed by solving the normal
-// equations using Cholesky decomposition. Specifically, if \\(m \ge n\\) then
-// \\(X = (A^H A + \lambda I)^{-1} A^H B\\), which solves the least-squares
-// problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 + \lambda ||Z||_F^2\\).
-// If \\(m \lt n\\) then `output` is computed as
-// \\(X = A^H (A A^H + \lambda I)^{-1} B\\), which (for \\(\lambda = 0\\)) is the
-// minimum-norm solution to the under-determined linear system, i.e.
-// \\(X = \mathrm{argmin}_{Z \in \mathbb{C}^{n \times k} } ||Z||_F^2 \\),
-// subject to \\(A Z = B\\). Notice that the fast path is only numerically stable
-// when \\(A\\) is numerically full rank and has a condition number
-// \\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or \\(\lambda\\) is
-// sufficiently large.
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// mean_grad = decay * mean_grad + (1-decay) * gradient
 //
-// If `fast` is `False` an algorithm based on the numerically robust complete
-// orthogonal decomposition is used. This computes the minimum-norm
-// least-squares solution, even when \\(A\\) is rank deficient. This path is
-// typically 6-7 times slower than the fast path. If `fast` is `False` then
-// `l2_regularizer` is ignored.
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
+//
+// mg <- rho * mg_{t-1} + (1-rho) * grad
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon)
+// var <- var - mom
 //
 // Arguments:
-//	matrix: Shape is `[..., M, N]`.
-//	rhs: Shape is `[..., M, K]`.
-//	l2_regularizer: Scalar tensor.
+//	var_: Should be from a Variable().
+//	mg: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
 //
-// @compatibility(numpy)
-// Equivalent to np.linalg.lstsq
-// @end_compatibility
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
 //
-// Returns Shape is `[..., N, K]`.
-func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer tf.Output, optional ...MatrixSolveLsAttr) (output tf.Output) {
+// Returns the created operation.
+func ResourceApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyCenteredRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17598,99 +21775,97 @@ func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixSolveLs",
+		Type: "ResourceApplyCenteredRMSProp",
 		Input: []tf.Input{
-			matrix, rhs, l2_regularizer,
+			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// MaxPool3DAttr is an optional argument to MaxPool3D.
-type MaxPool3DAttr func(optionalAttr)
+// ResourceSparseApplyCenteredRMSPropAttr is an optional argument to ResourceSparseApplyCenteredRMSProp.
+type ResourceSparseApplyCenteredRMSPropAttr func(optionalAttr)
 
-// MaxPool3DDataFormat sets the optional data_format attribute to value.
+// ResourceSparseApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func MaxPool3DDataFormat(value string) MaxPool3DAttr {
+// value: If `True`, updating of the var, mg, ms, and mom tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyCenteredRMSPropUseLocking(value bool) ResourceSparseApplyCenteredRMSPropAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Performs 3D max pooling on the input.
+// Update '*var' according to the centered RMSProp algorithm.
+//
+// The centered RMSProp algorithm uses an estimate of the centered second moment
+// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
+// uses the (uncentered) second moment. This often helps with training, but is
+// slightly more expensive in terms of computation and memory.
+//
+// Note that in dense implementation of this algorithm, mg, ms, and mom will
+// update even if the grad is zero, but in this sparse implementation, mg, ms,
+// and mom will not update in iterations during which the grad is zero.
+//
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// mean_grad = decay * mean_grad + (1-decay) * gradient
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
+//
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
 //
 // Arguments:
-//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
+//	var_: Should be from a Variable().
+//	mg: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
 //
-// Returns The max pooled output tensor.
-func MaxPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DAttr) (output tf.Output) {
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var, ms and mom.
+//
+// Returns the created operation.
+func ResourceSparseApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyCenteredRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPool3D",
+		Type: "ResourceSparseApplyCenteredRMSProp",
 		Input: []tf.Input{
-			input,
+			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad, indices,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Conv3DBackpropInputAttr is an optional argument to Conv3DBackpropInput.
-type Conv3DBackpropInputAttr func(optionalAttr)
-
-// Conv3DBackpropInputDilations sets the optional dilations attribute to value.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
-func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
+	return scope.AddOperation(opspec)
 }
 
-// Computes the gradients of 3-D convolution with respect to the input.
-//
-// DEPRECATED at GraphDef version 10: Use Conv3DBackpropInputV2
+// Creates a dataset that batches `batch_size` elements from `input_dataset`.
 //
 // Arguments:
-//	input: Shape `[batch, depth, rows, cols, in_channels]`.
-//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
-// `in_channels` must match between `input` and `filter`.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputAttr) (output tf.Output) {
+//
+//	batch_size: A scalar representing the number of elements to accumulate in a
+// batch.
+//
+//
+func BatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropInput",
+		Type: "BatchDataset",
 		Input: []tf.Input{
-			input, filter, out_backprop,
+			input_dataset, batch_size,
 		},
 		Attrs: attrs,
 	}
@@ -17698,103 +21873,82 @@ func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_ba
 	return op.Output(0)
 }
 
-// DepthwiseConv2dNativeBackpropFilterAttr is an optional argument to DepthwiseConv2dNativeBackpropFilter.
-type DepthwiseConv2dNativeBackpropFilterAttr func(optionalAttr)
+// LoadTPUEmbeddingAdadeltaParametersAttr is an optional argument to LoadTPUEmbeddingAdadeltaParameters.
+type LoadTPUEmbeddingAdadeltaParametersAttr func(optionalAttr)
 
-// DepthwiseConv2dNativeBackpropFilterDataFormat sets the optional data_format attribute to value.
+// LoadTPUEmbeddingAdadeltaParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2dNativeBackpropFilterAttr {
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingAdadeltaParametersTableId(value int64) LoadTPUEmbeddingAdadeltaParametersAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["table_id"] = value
 	}
 }
 
-// DepthwiseConv2dNativeBackpropFilterDilations sets the optional dilations attribute to value.
-//
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
-// element on that dimension. The dimension order is determined by the value of
-// `data_format`, see above for details. Dilations in the batch and depth
-// dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr {
+// LoadTPUEmbeddingAdadeltaParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingAdadeltaParametersTableName(value string) LoadTPUEmbeddingAdadeltaParametersAttr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["table_name"] = value
 	}
 }
 
-// Computes the gradients of depthwise convolution with respect to the filter.
+// Load Adadelta embedding parameters.
+//
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
 //
 // Arguments:
-//	input: 4-D with shape based on `data_format`.  For example, if
-// `data_format` is 'NHWC' then `input` is a 4-D `[batch, in_height,
-// in_width, in_channels]` tensor.
-//	filter_sizes: An integer vector representing the tensor shape of `filter`,
-// where `filter` is a 4-D
-// `[filter_height, filter_width, in_channels, depthwise_multiplier]` tensor.
-//	out_backprop: 4-D with shape  based on `data_format`.
-// For example, if `data_format` is 'NHWC' then
-// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution.
-//	padding: The type of padding algorithm to use.
+//	parameters: Value of parameters used in the Adadelta optimization algorithm.
+//	accumulators: Value of accumulators used in the Adadelta optimization algorithm.
+//	updates: Value of updates used in the Adadelta optimization algorithm.
 //
-// Returns 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
-// the `filter` input of the convolution.
-func DepthwiseConv2dNativeBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropFilterAttr) (output tf.Output) {
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingAdadeltaParameters(scope *Scope, parameters tf.Output, accumulators tf.Output, updates tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingAdadeltaParametersAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DepthwiseConv2dNativeBackpropFilter",
+		Type: "LoadTPUEmbeddingAdadeltaParameters",
 		Input: []tf.Input{
-			input, filter_sizes, out_backprop,
+			parameters, accumulators, updates,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
 // Converts each string in the input Tensor to its hash mod by a number of buckets.
 //
 // The hash function is deterministic on the content of the string within the
-// process. The hash function is a keyed hash function, where attribute `key`
-// defines the key of the hash function. `key` is an array of 2 elements.
-//
-// A strong hash is important when inputs may be malicious, e.g. URLs with
-// additional components. Adversaries could try to make their inputs hash to the
-// same bucket for a denial-of-service attack or to skew the results. A strong
-// hash prevents this by making it difficult, if not infeasible, to compute inputs
-// that hash to the same bucket. This comes at a cost of roughly 4x higher compute
-// time than `tf.string_to_hash_bucket_fast`.
+// process and will never change. However, it is not suitable for cryptography.
+// This function may be used when CPU time is scarce and inputs are trusted or
+// unimportant. There is a risk of adversaries constructing inputs that all hash
+// to the same bucket. To prevent this problem, use a strong hash function with
+// `tf.string_to_hash_bucket_strong`.
 //
 // Arguments:
 //	input: The strings to assign a hash bucket.
 //	num_buckets: The number of buckets.
-//	key: The key for the keyed hash function passed as a list of two uint64
-// elements.
 //
 // Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToHashBucketStrong(scope *Scope, input tf.Output, num_buckets int64, key []int64) (output tf.Output) {
+func StringToHashBucketFast(scope *Scope, input tf.Output, num_buckets int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_buckets": num_buckets, "key": key}
+	attrs := map[string]interface{}{"num_buckets": num_buckets}
 	opspec := tf.OpSpec{
-		Type: "StringToHashBucketStrong",
+		Type: "StringToHashBucketFast",
 		Input: []tf.Input{
 			input,
 		},
@@ -17804,33 +21958,31 @@ func StringToHashBucketStrong(scope *Scope, input tf.Output, num_buckets int64,
 	return op.Output(0)
 }
 
-// StringLengthAttr is an optional argument to StringLength.
-type StringLengthAttr func(optionalAttr)
+// RealAttr is an optional argument to Real.
+type RealAttr func(optionalAttr)
 
-// StringLengthUnit sets the optional unit attribute to value.
-//
-// value: The unit that is counted to compute string length.  One of: `"BYTE"` (for
-// the number of bytes in each string) or `"UTF8_CHAR"` (for the number of UTF-8
-// encoded Unicode code points in each string).  Results are undefined
-// if `unit=UTF8_CHAR` and the `input` strings do not contain structurally
-// valid UTF-8.
-// If not specified, defaults to "BYTE"
-func StringLengthUnit(value string) StringLengthAttr {
+// RealTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func RealTout(value tf.DataType) RealAttr {
 	return func(m optionalAttr) {
-		m["unit"] = value
+		m["Tout"] = value
 	}
 }
 
-// String lengths of `input`.
+// Returns the real part of a complex number.
 //
-// Computes the length of each string given in the input tensor.
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// type `float` that is the real part of each element in `input`. All elements in
+// `input` must be complex numbers of the form \\(a + bj\\), where *a* is the real
+//  part returned by this operation and *b* is the imaginary part.
 //
-// Arguments:
-//	input: The string for which to compute the length.
+// For example:
 //
-// Returns Integer tensor that has the same shape as `input`. The output contains the
-// element-wise string lengths of `input`.
-func StringLength(scope *Scope, input tf.Output, optional ...StringLengthAttr) (output tf.Output) {
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.real(input) ==> [-2.25, 3.25]
+// ```
+func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17839,7 +21991,7 @@ func StringLength(scope *Scope, input tf.Output, optional ...StringLengthAttr) (
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StringLength",
+		Type: "Real",
 		Input: []tf.Input{
 			input,
 		},
@@ -17849,271 +22001,311 @@ func StringLength(scope *Scope, input tf.Output, optional ...StringLengthAttr) (
 	return op.Output(0)
 }
 
-// ResourceApplyProximalGradientDescentAttr is an optional argument to ResourceApplyProximalGradientDescent.
-type ResourceApplyProximalGradientDescentAttr func(optionalAttr)
+// AudioSummaryAttr is an optional argument to AudioSummary.
+type AudioSummaryAttr func(optionalAttr)
 
-// ResourceApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
+// AudioSummaryMaxOutputs sets the optional max_outputs attribute to value.
 //
-// value: If True, the subtraction will be protected by a lock;
-// otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyProximalGradientDescentUseLocking(value bool) ResourceApplyProximalGradientDescentAttr {
+// value: Max number of batch elements to generate audio for.
+// If not specified, defaults to 3
+//
+// REQUIRES: value >= 1
+func AudioSummaryMaxOutputs(value int64) AudioSummaryAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["max_outputs"] = value
 	}
 }
 
-// Update '*var' as FOBOS algorithm with fixed learning rate.
+// Outputs a `Summary` protocol buffer with audio.
 //
-// prox_v = var - alpha * delta
-// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
+// DEPRECATED at GraphDef version 15: Use AudioSummaryV2.
+//
+// The summary has up to `max_outputs` summary values containing audio. The
+// audio is built from `tensor` which must be 3-D with shape `[batch_size,
+// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
+// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
+//
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
+//
+// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
+// *  If `max_outputs` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	alpha: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	delta: The change.
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 2-D of shape `[batch_size, frames]`.
+//	sample_rate: The sample rate of the signal in hertz.
 //
-// Returns the created operation.
-func ResourceApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, delta tf.Output, optional ...ResourceApplyProximalGradientDescentAttr) (o *tf.Operation) {
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func AudioSummary(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate float32, optional ...AudioSummaryAttr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"sample_rate": sample_rate}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyProximalGradientDescent",
+		Type: "AudioSummary",
 		Input: []tf.Input{
-			var_, alpha, l1, l2, delta,
+			tag, tensor,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// Returns 0 if the denominator is zero.
-//
-//
-// *NOTE*: `DivNoNan` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func DivNoNan(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "DivNoNan",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the gradient for the sqrt of `x` wrt its input.
+// QrAttr is an optional argument to Qr.
+type QrAttr func(optionalAttr)
+
+// QrFullMatrices sets the optional full_matrices attribute to value.
 //
-// Specifically, `grad = dy * 0.5 / y`, where `y = sqrt(x)`, and `dy`
-// is the corresponding input gradient.
-func SqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SqrtGrad",
-		Input: []tf.Input{
-			y, dy,
-		},
+// value: If true, compute full-sized `q` and `r`. If false
+// (the default), compute only the leading `P` columns of `q`.
+// If not specified, defaults to false
+func QrFullMatrices(value bool) QrAttr {
+	return func(m optionalAttr) {
+		m["full_matrices"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Get the value of the tensor specified by its handle.
+// Computes the QR decompositions of one or more matrices.
+//
+// Computes the QR decomposition of each inner matrix in `tensor` such that
+// `tensor[..., :, :] = q[..., :, :] * r[..., :,:])`
+//
+// ```python
+// # a is a tensor.
+// # q is a tensor of orthonormal matrices.
+// # r is a tensor of upper triangular matrices.
+// q, r = qr(a)
+// q_full, r_full = qr(a, full_matrices=True)
+// ```
 //
 // Arguments:
-//	handle: The handle for a tensor stored in the session state.
-//	dtype: The type of the output value.
+//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
 //
-// Returns The tensor for the given handle.
-func GetSessionTensor(scope *Scope, handle tf.Output, dtype tf.DataType) (value tf.Output) {
+// Returns Orthonormal basis for range of `a`. If `full_matrices` is `False` then
+// shape is `[..., M, P]`; if `full_matrices` is `True` then shape is
+// `[..., M, M]`.Triangular factor. If `full_matrices` is `False` then shape is
+// `[..., P, N]`. If `full_matrices` is `True` then shape is `[..., M, N]`.
+func Qr(scope *Scope, input tf.Output, optional ...QrAttr) (q tf.Output, r tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "GetSessionTensor",
+		Type: "Qr",
 		Input: []tf.Input{
-			handle,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Returns x - y element-wise.
+// TensorArrayV3Attr is an optional argument to TensorArrayV3.
+type TensorArrayV3Attr func(optionalAttr)
+
+// TensorArrayV3ElementShape sets the optional element_shape attribute to value.
 //
-// *NOTE*: `Subtract` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Sub(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: The expected shape of an element, if known. Used to
+// validate the shapes of TensorArray elements. If this shape is not
+// fully specified, gathering zero-size TensorArrays is an error.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayV3ElementShape(value tf.Shape) TensorArrayV3Attr {
+	return func(m optionalAttr) {
+		m["element_shape"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "Sub",
-		Input: []tf.Input{
-			x, y,
-		},
+}
+
+// TensorArrayV3DynamicSize sets the optional dynamic_size attribute to value.
+//
+// value: A boolean that determines whether writes to the TensorArray
+// are allowed to grow the size.  By default, this is not allowed.
+// If not specified, defaults to false
+func TensorArrayV3DynamicSize(value bool) TensorArrayV3Attr {
+	return func(m optionalAttr) {
+		m["dynamic_size"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// RandomPoissonAttr is an optional argument to RandomPoisson.
-type RandomPoissonAttr func(optionalAttr)
+// TensorArrayV3ClearAfterRead sets the optional clear_after_read attribute to value.
+//
+// value: If true (default), Tensors in the TensorArray are cleared
+// after being read.  This disables multiple read semantics but allows early
+// release of memory.
+// If not specified, defaults to true
+func TensorArrayV3ClearAfterRead(value bool) TensorArrayV3Attr {
+	return func(m optionalAttr) {
+		m["clear_after_read"] = value
+	}
+}
 
-// RandomPoissonSeed sets the optional seed attribute to value.
-// If not specified, defaults to 0
-func RandomPoissonSeed(value int64) RandomPoissonAttr {
+// TensorArrayV3IdenticalElementShapes sets the optional identical_element_shapes attribute to value.
+//
+// value: If true (default is false), then all
+// elements in the TensorArray will be expected to have have identical shapes.
+// This allows certain behaviors, like dynamically checking for
+// consistent shapes on write, and being able to fill in properly
+// shaped zero tensors on stack -- even if the element_shape attribute
+// is not fully defined.
+// If not specified, defaults to false
+func TensorArrayV3IdenticalElementShapes(value bool) TensorArrayV3Attr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["identical_element_shapes"] = value
 	}
 }
 
-// RandomPoissonSeed2 sets the optional seed2 attribute to value.
-// If not specified, defaults to 0
-func RandomPoissonSeed2(value int64) RandomPoissonAttr {
+// TensorArrayV3TensorArrayName sets the optional tensor_array_name attribute to value.
+//
+// value: Overrides the name used for the temporary tensor_array
+// resource. Default value is the name of the 'TensorArray' op (which
+// is guaranteed unique).
+// If not specified, defaults to ""
+func TensorArrayV3TensorArrayName(value string) TensorArrayV3Attr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["tensor_array_name"] = value
 	}
 }
 
-// Use RandomPoissonV2 instead.
+// An array of Tensors of given size.
 //
-// DEPRECATED at GraphDef version 25: Replaced by RandomPoissonV2
-func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonAttr) (output tf.Output) {
+// Write data via Write and read via Read or Pack.
+//
+// Arguments:
+//	size: The size of the array.
+//	dtype: The type of the elements on the tensor_array.
+//
+// Returns The handle to the TensorArray.A scalar used to control gradient flow.
+func TensorArrayV3(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV3Attr) (handle tf.Output, flow tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomPoisson",
+		Type: "TensorArrayV3",
 		Input: []tf.Input{
-			shape, rate,
+			size,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Returns the truth value of NOT x element-wise.
+func LogicalNot(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LogicalNot",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the max of x and y (i.e. x > y ? x : y) element-wise.
+// 3D real-valued fast Fourier transform.
 //
-// *NOTE*: `Maximum` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Maximum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Computes the 3-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most 3 dimensions of `input`.
+//
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT3D` only returns the
+// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
+// of `output`: the zero-frequency term, followed by the `fft_length / 2`
+// positive-frequency terms.
+//
+// Along each axis `RFFT3D` is computed on, if `fft_length` is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
+//
+// Arguments:
+//	input: A float32 tensor.
+//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
+//
+// Returns A complex64 tensor of the same rank as `input`. The inner-most 3
+//   dimensions of `input` are replaced with the their 3D Fourier transform. The
+//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
+//   components.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.rfftn with 3 dimensions.
+// @end_compatibility
+func RFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Maximum",
+		Type: "RFFT3D",
 		Input: []tf.Input{
-			x, y,
+			input, fft_length,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes softmax cross entropy cost and gradients to backpropagate.
-//
-// Inputs are the logits, not probabilities.
-//
-// Arguments:
-//	features: batch_size x num_classes matrix
-//	labels: batch_size x num_classes matrix
-// The caller must ensure that each batch of labels represents a valid
-// probability distribution.
-//
-// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
-func SoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
+// Computes rectified linear: `max(features, 0)`.
+func Relu(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SoftmaxCrossEntropyWithLogits",
+		Type: "Relu",
 		Input: []tf.Input{
-			features, labels,
+			features,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// ReduceJoinAttr is an optional argument to ReduceJoin.
-type ReduceJoinAttr func(optionalAttr)
+// ResourceApplyAddSignAttr is an optional argument to ResourceApplyAddSign.
+type ResourceApplyAddSignAttr func(optionalAttr)
 
-// ReduceJoinKeepDims sets the optional keep_dims attribute to value.
+// ResourceApplyAddSignUseLocking sets the optional use_locking attribute to value.
 //
-// value: If `True`, retain reduced dimensions with length `1`.
+// value: If `True`, updating of the var and m tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
 // If not specified, defaults to false
-func ReduceJoinKeepDims(value bool) ReduceJoinAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// ReduceJoinSeparator sets the optional separator attribute to value.
-//
-// value: The separator to use when joining.
-// If not specified, defaults to ""
-func ReduceJoinSeparator(value string) ReduceJoinAttr {
+func ResourceApplyAddSignUseLocking(value bool) ResourceApplyAddSignAttr {
 	return func(m optionalAttr) {
-		m["separator"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Joins a string Tensor across the given dimensions.
-//
-// Computes the string join across dimensions in the given string Tensor of shape
-// `[\\(d_0, d_1, ..., d_{n-1}\\)]`.  Returns a new Tensor created by joining the input
-// strings with the given separator (default: empty string).  Negative indices are
-// counted backwards from the end, with `-1` being equivalent to `n - 1`.  If
-// indices are not specified, joins across all dimensions beginning from `n - 1`
-// through `0`.
-//
-// For example:
+// Update '*var' according to the AddSign update.
 //
-// ```python
-// # tensor `a` is [["a", "b"], ["c", "d"]]
-// tf.reduce_join(a, 0) ==> ["ac", "bd"]
-// tf.reduce_join(a, 1) ==> ["ab", "cd"]
-// tf.reduce_join(a, -2) = tf.reduce_join(a, 0) ==> ["ac", "bd"]
-// tf.reduce_join(a, -1) = tf.reduce_join(a, 1) ==> ["ab", "cd"]
-// tf.reduce_join(a, 0, keep_dims=True) ==> [["ac", "bd"]]
-// tf.reduce_join(a, 1, keep_dims=True) ==> [["ab"], ["cd"]]
-// tf.reduce_join(a, 0, separator=".") ==> ["a.c", "b.d"]
-// tf.reduce_join(a, [0, 1]) ==> "acbd"
-// tf.reduce_join(a, [1, 0]) ==> "abcd"
-// tf.reduce_join(a, []) ==> [["a", "b"], ["c", "d"]]
-// tf.reduce_join(a) = tf.reduce_join(a, [1, 0]) ==> "abcd"
-// ```
+// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+// update <- (alpha + sign_decay * sign(g) *sign(m)) * g
+// variable <- variable - lr_t * update
 //
 // Arguments:
-//	inputs: The input to be joined.  All reduced indices must have non-zero size.
-//	reduction_indices: The dimensions to reduce over.  Dimensions are reduced in the
-// order specified.  Omitting `reduction_indices` is equivalent to passing
-// `[n-1, n-2, ..., 0]`.  Negative indices from `-n` to `-1` are supported.
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	alpha: Must be a scalar.
+//	sign_decay: Must be a scalar.
+//	beta: Must be a scalar.
+//	grad: The gradient.
 //
-// Returns Has shape equal to that of the input with reduced dimensions removed or
-// set to `1` depending on `keep_dims`.
-func ReduceJoin(scope *Scope, inputs tf.Output, reduction_indices tf.Output, optional ...ReduceJoinAttr) (output tf.Output) {
+// Returns the created operation.
+func ResourceApplyAddSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, alpha tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyAddSignAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -18122,88 +22314,97 @@ func ReduceJoin(scope *Scope, inputs tf.Output, reduction_indices tf.Output, opt
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ReduceJoin",
+		Type: "ResourceApplyAddSign",
 		Input: []tf.Input{
-			inputs, reduction_indices,
+			var_, m, lr, alpha, sign_decay, beta, grad,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes cos of x element-wise.
-func Cos(scope *Scope, x tf.Output) (y tf.Output) {
+// Divides sparse updates into the variable referenced by `resource`.
+//
+// This operation computes
+//
+//     # Scalar indices
+//     ref[indices, ...] /= updates[...]
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] /= updates[i, ...]
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] /= updates[i, ..., j, ...]
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions multiply.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
+//
+// Returns the created operation.
+func ResourceScatterDiv(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Cos",
+		Type: "ResourceScatterDiv",
 		Input: []tf.Input{
-			x,
+			resource, indices, updates,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// FusedBatchNormGradAttr is an optional argument to FusedBatchNormGrad.
-type FusedBatchNormGradAttr func(optionalAttr)
+// ListDiffAttr is an optional argument to ListDiff.
+type ListDiffAttr func(optionalAttr)
 
-// FusedBatchNormGradEpsilon sets the optional epsilon attribute to value.
-//
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormGradEpsilon(value float32) FusedBatchNormGradAttr {
+// ListDiffOutIdx sets the optional out_idx attribute to value.
+// If not specified, defaults to DT_INT32
+func ListDiffOutIdx(value tf.DataType) ListDiffAttr {
 	return func(m optionalAttr) {
-		m["epsilon"] = value
+		m["out_idx"] = value
 	}
 }
 
-// FusedBatchNormGradDataFormat sets the optional data_format attribute to value.
+// Computes the difference between two lists of numbers or strings.
 //
-// value: The data format for y_backprop, x, x_backprop.
-// Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormGradDataFormat(value string) FusedBatchNormGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// FusedBatchNormGradIsTraining sets the optional is_training attribute to value.
+// Given a list `x` and a list `y`, this operation returns a list `out` that
+// represents all values that are in `x` but not in `y`. The returned list `out`
+// is sorted in the same order that the numbers appear in `x` (duplicates are
+// preserved). This operation also returns a list `idx` that represents the
+// position of each `out` element in `x`. In other words:
 //
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormGradIsTraining(value bool) FusedBatchNormGradAttr {
-	return func(m optionalAttr) {
-		m["is_training"] = value
-	}
-}
-
-// Gradient for batch normalization.
+// `out[i] = x[idx[i]] for i in [0, 1, ..., len(out) - 1]`
 //
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+// For example, given this input:
+//
+// ```
+// x = [1, 2, 3, 4, 5, 6]
+// y = [1, 3, 5]
+// ```
+//
+// This operation would return:
+//
+// ```
+// out ==> [2, 4, 6]
+// idx ==> [1, 3, 5]
+// ```
 //
 // Arguments:
-//	y_backprop: A 4D Tensor for the gradient with respect to y.
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
-// mean to be reused in gradient computation. When is_training is
-// False, a 1D Tensor for the population mean to be reused in both
-// 1st and 2nd order gradient computation.
-//	reserve_space_2: When is_training is True, a 1D Tensor for the computed batch
-// variance (inverted variance in the cuDNN case) to be reused in
-// gradient computation. When is_training is False, a 1D Tensor
-// for the population variance to be reused in both 1st and 2nd
-// order gradient computation.
+//	x: 1-D. Values to keep.
+//	y: 1-D. Values to remove.
 //
-// Returns A 4D Tensor for the gradient with respect to x.A 1D Tensor for the gradient with respect to scale.A 1D Tensor for the gradient with respect to offset.Unused placeholder to match the mean input in FusedBatchNorm.Unused placeholder to match the variance input
-// in FusedBatchNorm.
-func FusedBatchNormGrad(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradAttr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) {
+// Returns 1-D. Values present in `x` but not in `y`.1-D. Positions of `x` values preserved in `out`.
+func ListDiff(scope *Scope, x tf.Output, y tf.Output, optional ...ListDiffAttr) (out tf.Output, idx tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -18212,258 +22413,230 @@ func FusedBatchNormGrad(scope *Scope, y_backprop tf.Output, x tf.Output, scale t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FusedBatchNormGrad",
+		Type: "ListDiff",
 		Input: []tf.Input{
-			y_backprop, x, scale, reserve_space_1, reserve_space_2,
+			x, y,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return op.Output(0), op.Output(1)
 }
 
-// TopKAttr is an optional argument to TopK.
-type TopKAttr func(optionalAttr)
+// LoadTPUEmbeddingAdadeltaParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingAdadeltaParametersGradAccumDebug.
+type LoadTPUEmbeddingAdadeltaParametersGradAccumDebugAttr func(optionalAttr)
 
-// TopKSorted sets the optional sorted attribute to value.
+// LoadTPUEmbeddingAdadeltaParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// value: If true the resulting `k` elements will be sorted by the values in
-// descending order.
-// If not specified, defaults to true
-func TopKSorted(value bool) TopKAttr {
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingAdadeltaParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingAdadeltaParametersGradAccumDebugAttr {
 	return func(m optionalAttr) {
-		m["sorted"] = value
+		m["table_id"] = value
 	}
 }
 
-// Finds values and indices of the `k` largest elements for the last dimension.
-//
-// DEPRECATED at GraphDef version 7: Use TopKV2 instead
-//
-// If the input is a vector (rank-1), finds the `k` largest entries in the vector
-// and outputs their values and indices as vectors.  Thus `values[j]` is the
-// `j`-th largest entry in `input`, and its index is `indices[j]`.
-//
-// For matrices (resp. higher rank input), computes the top `k` entries in each
-// row (resp. vector along the last dimension).  Thus,
+// LoadTPUEmbeddingAdadeltaParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingAdadeltaParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingAdadeltaParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load Adadelta parameters with debug support.
 //
-//     values.shape = indices.shape = input.shape[:-1] + [k]
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
 //
-// If two elements are equal, the lower-index element appears first.
+// Arguments:
+//	parameters: Value of parameters used in the Adadelta optimization algorithm.
+//	accumulators: Value of accumulators used in the Adadelta optimization algorithm.
+//	updates: Value of updates used in the Adadelta optimization algorithm.
+//	gradient_accumulators: Value of gradient_accumulators used in the Adadelta optimization algorithm.
 //
-// If `k` varies dynamically, use `TopKV2` below.
 //
-// Arguments:
-//	input: 1-D or higher with last dimension at least `k`.
-//	k: Number of top elements to look for along the last dimension (along each
-// row for matrices).
 //
-// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`.
-func TopK(scope *Scope, input tf.Output, k int64, optional ...TopKAttr) (values tf.Output, indices tf.Output) {
+// Returns the created operation.
+func LoadTPUEmbeddingAdadeltaParametersGradAccumDebug(scope *Scope, parameters tf.Output, accumulators tf.Output, updates tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingAdadeltaParametersGradAccumDebugAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"k": k}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TopK",
+		Type: "LoadTPUEmbeddingAdadeltaParametersGradAccumDebug",
 		Input: []tf.Input{
-			input,
+			parameters, accumulators, updates, gradient_accumulators,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return scope.AddOperation(opspec)
 }
 
-// The gradient operator for the SparseAdd op.
-//
-// The SparseAdd op calculates A + B, where A, B, and the sum are all represented
-// as `SparseTensor` objects.  This op takes in the upstream gradient w.r.t.
-// non-empty values of the sum, and outputs the gradients w.r.t. the non-empty
-// values of A and B.
-//
-// Arguments:
-//	backprop_val_grad: 1-D with shape `[nnz(sum)]`.  The gradient with respect to
-// the non-empty values of the sum.
-//	a_indices: 2-D.  The `indices` of the `SparseTensor` A, size `[nnz(A), ndims]`.
-//	b_indices: 2-D.  The `indices` of the `SparseTensor` B, size `[nnz(B), ndims]`.
-//	sum_indices: 2-D.  The `indices` of the sum `SparseTensor`, size
-// `[nnz(sum), ndims]`.
-//
-// Returns 1-D with shape `[nnz(A)]`. The gradient with respect to the
-// non-empty values of A.1-D with shape `[nnz(B)]`. The gradient with respect to the
-// non-empty values of B.
-func SparseAddGrad(scope *Scope, backprop_val_grad tf.Output, a_indices tf.Output, b_indices tf.Output, sum_indices tf.Output) (a_val_grad tf.Output, b_val_grad tf.Output) {
+// Return a tensor with the same shape and contents as the input tensor or value.
+func Identity(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseAddGrad",
+		Type: "Identity",
 		Input: []tf.Input{
-			backprop_val_grad, a_indices, b_indices, sum_indices,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Returns a list of tensors with the same shapes and contents as the input
-//
-// tensors.
-//
-// This op can be used to override the gradient for complicated functions. For
-// example, suppose y = f(x) and we wish to apply a custom function g for backprop
-// such that dx = g(dy). In Python,
-//
-// ```python
-// with tf.get_default_graph().gradient_override_map(
-//     {'IdentityN': 'OverrideGradientWithG'}):
-//   y, _ = identity_n([f(x), x])
+// Computes arctangent of `y/x` element-wise, respecting signs of the arguments.
 //
-// @tf.RegisterGradient('OverrideGradientWithG')
-// def ApplyG(op, dy, _):
-//   return [None, g(dy)]  # Do not backprop to f(x).
-// ```
-func IdentityN(scope *Scope, input []tf.Output) (output []tf.Output) {
+// This is the angle \( \theta \in [-\pi, \pi] \) such that
+// \[ x = r \cos(\theta) \]
+// and
+// \[ y = r \sin(\theta) \]
+// where \(r = \sqrt(x^2 + y^2) \).
+func Atan2(scope *Scope, y tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IdentityN",
+		Type: "Atan2",
 		Input: []tf.Input{
-			tf.OutputList(input),
+			y, x,
 		},
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+//     Updates specified rows with values in `v`.
+//
+//     Computes `x[i, :] = v; return x`.
+//
+// Arguments:
+//	x: A tensor of type `T`.
+//	i: A vector. Indices into the left-most dimension of `x`.
+//	v: A `Tensor` of type T. Same dimension sizes as x except the first dimension, which must be the same as i's size.
+//
+// Returns A `Tensor` of type T. An alias of `x`. The content of `y` is undefined if there are duplicates in `i`.
+func InplaceUpdate(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("IdentityN", err)
-		return
+	opspec := tf.OpSpec{
+		Type: "InplaceUpdate",
+		Input: []tf.Input{
+			x, i, v,
+		},
 	}
-	return output
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// ResourceApplyCenteredRMSPropAttr is an optional argument to ResourceApplyCenteredRMSProp.
-type ResourceApplyCenteredRMSPropAttr func(optionalAttr)
+// OutfeedDequeueTupleAttr is an optional argument to OutfeedDequeueTuple.
+type OutfeedDequeueTupleAttr func(optionalAttr)
 
-// ResourceApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
+// OutfeedDequeueTupleDeviceOrdinal sets the optional device_ordinal attribute to value.
 //
-// value: If `True`, updating of the var, mg, ms, and mom tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyCenteredRMSPropUseLocking(value bool) ResourceApplyCenteredRMSPropAttr {
+// value: The TPU device to use. This should be -1 when the Op
+// is running on a TPU device, and >= 0 when the Op is running on the CPU
+// device.
+// If not specified, defaults to -1
+func OutfeedDequeueTupleDeviceOrdinal(value int64) OutfeedDequeueTupleAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["device_ordinal"] = value
 	}
 }
 
-// Update '*var' according to the centered RMSProp algorithm.
-//
-// The centered RMSProp algorithm uses an estimate of the centered second moment
-// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
-// uses the (uncentered) second moment. This often helps with training, but is
-// slightly more expensive in terms of computation and memory.
-//
-// Note that in dense implementation of this algorithm, mg, ms, and mom will
-// update even if the grad is zero, but in this sparse implementation, mg, ms,
-// and mom will not update in iterations during which the grad is zero.
-//
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// mean_grad = decay * mean_grad + (1-decay) * gradient
-//
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
+// Retrieve multiple values from the computation outfeed.
 //
-// mg <- rho * mg_{t-1} + (1-rho) * grad
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon)
-// var <- var - mom
+// This operation will block indefinitely until data is available. Output `i`
+// corresponds to XLA tuple element `i`.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	mg: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
+//	dtypes: The element types of each element in `outputs`.
+//	shapes: The shapes of each tensor in `outputs`.
 //
-// Returns the created operation.
-func ResourceApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyCenteredRMSPropAttr) (o *tf.Operation) {
+// Returns A list of tensors that will be read from the outfeed.
+func OutfeedDequeueTuple(scope *Scope, dtypes []tf.DataType, shapes []tf.Shape, optional ...OutfeedDequeueTupleAttr) (outputs []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes, "shapes": shapes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyCenteredRMSProp",
+		Type: "OutfeedDequeueTuple",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
+		scope.UpdateErr("OutfeedDequeueTuple", err)
+		return
+	}
+	return outputs
+}
+
+// Identity op for gradient debugging.
+//
+// This op is hidden from public in Python. It is used by TensorFlow Debugger to
+// register gradient tensors for gradient debugging.
+// This op operates on non-reference-type tensors.
+func DebugGradientIdentity(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "DebugGradientIdentity",
 		Input: []tf.Input{
-			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad,
+			input,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// ResourceSparseApplyCenteredRMSPropAttr is an optional argument to ResourceSparseApplyCenteredRMSProp.
-type ResourceSparseApplyCenteredRMSPropAttr func(optionalAttr)
+// ResourceSparseApplyAdadeltaAttr is an optional argument to ResourceSparseApplyAdadelta.
+type ResourceSparseApplyAdadeltaAttr func(optionalAttr)
 
-// ResourceSparseApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
+// ResourceSparseApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
 //
-// value: If `True`, updating of the var, mg, ms, and mom tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
 // If not specified, defaults to false
-func ResourceSparseApplyCenteredRMSPropUseLocking(value bool) ResourceSparseApplyCenteredRMSPropAttr {
+func ResourceSparseApplyAdadeltaUseLocking(value bool) ResourceSparseApplyAdadeltaAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update '*var' according to the centered RMSProp algorithm.
-//
-// The centered RMSProp algorithm uses an estimate of the centered second moment
-// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
-// uses the (uncentered) second moment. This often helps with training, but is
-// slightly more expensive in terms of computation and memory.
-//
-// Note that in dense implementation of this algorithm, mg, ms, and mom will
-// update even if the grad is zero, but in this sparse implementation, mg, ms,
-// and mom will not update in iterations during which the grad is zero.
-//
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// mean_grad = decay * mean_grad + (1-decay) * gradient
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
-//
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
+// var: Should be from a Variable().
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	mg: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
 //
-//	epsilon: Ridge term. Must be a scalar.
+//	accum: Should be from a Variable().
+//	accum_update: : Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	rho: Decay factor. Must be a scalar.
+//	epsilon: Constant factor. Must be a scalar.
 //	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var, ms and mom.
+//	indices: A vector of indices into the first dimension of var and accum.
 //
 // Returns the created operation.
-func ResourceSparseApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyCenteredRMSPropAttr) (o *tf.Operation) {
+func ResourceSparseApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdadeltaAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -18472,105 +22645,97 @@ func ResourceSparseApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Outp
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyCenteredRMSProp",
+		Type: "ResourceSparseApplyAdadelta",
 		Input: []tf.Input{
-			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad, indices,
+			var_, accum, accum_update, lr, rho, epsilon, grad, indices,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Creates a dataset that batches `batch_size` elements from `input_dataset`.
-//
-// Arguments:
-//
-//	batch_size: A scalar representing the number of elements to accumulate in a
-// batch.
-//
+// Returns which elements of x are NaN.
 //
-func BatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.isnan
+// @end_compatibility
+func IsNan(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "BatchDataset",
-		Input: []tf.Input{
-			input_dataset, batch_size,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// RandomPoissonV2Attr is an optional argument to RandomPoissonV2.
-type RandomPoissonV2Attr func(optionalAttr)
-
-// RandomPoissonV2Seed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomPoissonV2Seed(value int64) RandomPoissonV2Attr {
-	return func(m optionalAttr) {
-		m["seed"] = value
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IsNan",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// RandomPoissonV2Seed2 sets the optional seed2 attribute to value.
+// DepthwiseConv2dNativeBackpropFilterAttr is an optional argument to DepthwiseConv2dNativeBackpropFilter.
+type DepthwiseConv2dNativeBackpropFilterAttr func(optionalAttr)
+
+// DepthwiseConv2dNativeBackpropFilterDataFormat sets the optional data_format attribute to value.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomPoissonV2Seed2(value int64) RandomPoissonV2Attr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
+// If not specified, defaults to "NHWC"
+func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2dNativeBackpropFilterAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["data_format"] = value
 	}
 }
 
-// RandomPoissonV2Dtype sets the optional dtype attribute to value.
-// If not specified, defaults to DT_INT64
-func RandomPoissonV2Dtype(value tf.DataType) RandomPoissonV2Attr {
+// DepthwiseConv2dNativeBackpropFilterDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["dilations"] = value
 	}
 }
 
-// Outputs random values from the Poisson distribution(s) described by rate.
-//
-// This op uses two algorithms, depending on rate. If rate >= 10, then
-// the algorithm by Hormann is used to acquire samples via
-// transformation-rejection.
-// See http://www.sciencedirect.com/science/article/pii/0167668793909974.
-//
-// Otherwise, Knuth's algorithm is used to acquire samples via multiplying uniform
-// random variables.
-// See Donald E. Knuth (1969). Seminumerical Algorithms. The Art of Computer
-// Programming, Volume 2. Addison Wesley
+// Computes the gradients of depthwise convolution with respect to the filter.
 //
 // Arguments:
-//	shape: 1-D integer tensor. Shape of independent samples to draw from each
-// distribution described by the shape parameters given in rate.
-//	rate: A tensor in which each scalar is a "rate" parameter describing the
-// associated poisson distribution.
+//	input: 4-D with shape based on `data_format`.  For example, if
+// `data_format` is 'NHWC' then `input` is a 4-D `[batch, in_height,
+// in_width, in_channels]` tensor.
+//	filter_sizes: An integer vector representing the tensor shape of `filter`,
+// where `filter` is a 4-D
+// `[filter_height, filter_width, in_channels, depthwise_multiplier]` tensor.
+//	out_backprop: 4-D with shape  based on `data_format`.
+// For example, if `data_format` is 'NHWC' then
+// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution.
+//	padding: The type of padding algorithm to use.
 //
-// Returns A tensor with shape `shape + shape(rate)`. Each slice
-// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
-// `rate[i0, i1, ...iN]`.
-func RandomPoissonV2(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonV2Attr) (output tf.Output) {
+// Returns 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
+// the `filter` input of the convolution.
+func DepthwiseConv2dNativeBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropFilterAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomPoissonV2",
+		Type: "DepthwiseConv2dNativeBackpropFilter",
 		Input: []tf.Input{
-			shape, rate,
+			input, filter_sizes, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -18578,115 +22743,206 @@ func RandomPoissonV2(scope *Scope, shape tf.Output, rate tf.Output, optional ...
 	return op.Output(0)
 }
 
-// DecodeAndCropJpegAttr is an optional argument to DecodeAndCropJpeg.
-type DecodeAndCropJpegAttr func(optionalAttr)
+// MapUnstageAttr is an optional argument to MapUnstage.
+type MapUnstageAttr func(optionalAttr)
 
-// DecodeAndCropJpegChannels sets the optional channels attribute to value.
-//
-// value: Number of color channels for the decoded image.
+// MapUnstageCapacity sets the optional capacity attribute to value.
 // If not specified, defaults to 0
-func DecodeAndCropJpegChannels(value int64) DecodeAndCropJpegAttr {
+//
+// REQUIRES: value >= 0
+func MapUnstageCapacity(value int64) MapUnstageAttr {
 	return func(m optionalAttr) {
-		m["channels"] = value
+		m["capacity"] = value
 	}
 }
 
-// DecodeAndCropJpegRatio sets the optional ratio attribute to value.
+// MapUnstageMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// value: Downscaling ratio.
-// If not specified, defaults to 1
-func DecodeAndCropJpegRatio(value int64) DecodeAndCropJpegAttr {
+// REQUIRES: value >= 0
+func MapUnstageMemoryLimit(value int64) MapUnstageAttr {
 	return func(m optionalAttr) {
-		m["ratio"] = value
+		m["memory_limit"] = value
 	}
 }
 
-// DecodeAndCropJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
-//
-// value: If true use a slower but nicer upscaling of the
-// chroma planes (yuv420/422 only).
-// If not specified, defaults to true
-func DecodeAndCropJpegFancyUpscaling(value bool) DecodeAndCropJpegAttr {
+// MapUnstageContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapUnstageContainer(value string) MapUnstageAttr {
 	return func(m optionalAttr) {
-		m["fancy_upscaling"] = value
+		m["container"] = value
 	}
 }
 
-// DecodeAndCropJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
-//
-// value: If true try to recover an image from truncated input.
-// If not specified, defaults to false
-func DecodeAndCropJpegTryRecoverTruncated(value bool) DecodeAndCropJpegAttr {
+// MapUnstageSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapUnstageSharedName(value string) MapUnstageAttr {
 	return func(m optionalAttr) {
-		m["try_recover_truncated"] = value
+		m["shared_name"] = value
 	}
 }
 
-// DecodeAndCropJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
+// Op removes and returns the values associated with the key
 //
-// value: The minimum required fraction of lines before a truncated
-// input is accepted.
-// If not specified, defaults to 1
-func DecodeAndCropJpegAcceptableFraction(value float32) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["acceptable_fraction"] = value
+// from the underlying container.   If the underlying container
+// does not contain this key, the op will block until it does.
+func MapUnstage(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageAttr) (values []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MapUnstage",
+		Input: []tf.Input{
+			key, indices,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("MapUnstage", err)
+		return
 	}
+	return values
 }
 
-// DecodeAndCropJpegDctMethod sets the optional dct_method attribute to value.
+// An op enabling differentiation of TPU Embeddings.
 //
-// value: string specifying a hint about the algorithm used for
-// decompression.  Defaults to "" which maps to a system-specific
-// default.  Currently valid values are ["INTEGER_FAST",
-// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
-// jpeg library changes to a version that does not have that specific
-// option.)
-// If not specified, defaults to ""
-func DecodeAndCropJpegDctMethod(value string) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["dct_method"] = value
+// This op simply returns its first input, which is assumed to have been sliced
+// from the Tensors returned by TPUEmbeddingDequeueActivations. The presence of
+// this op, and its first argument being a trainable Variable, enables automatic
+// differentiation of graphs containing embeddings via the TPU Embedding Python
+// libraries.
+//
+// Arguments:
+//	embedding_variable: A trainable variable, enabling optimizers to find this op.
+//	sliced_activations: The embedding activations Tensor to return.
+//	table_id: The id of the table in the embedding layer configuration from which
+// these activations were computed.
+//	lookup_id: Identifier of the set of embedding indices which produced these
+// activations.
+func TPUEmbeddingActivations(scope *Scope, embedding_variable tf.Output, sliced_activations tf.Output, table_id int64, lookup_id int64) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"table_id": table_id, "lookup_id": lookup_id}
+	opspec := tf.OpSpec{
+		Type: "TPUEmbeddingActivations",
+		Input: []tf.Input{
+			embedding_variable, sliced_activations,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Decode and Crop a JPEG-encoded image to a uint8 tensor.
+// BatchToSpace for 4-D tensors of type T.
 //
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
+// This is a legacy version of the more general BatchToSpaceND.
 //
-// Accepted values are:
+// Rearranges (permutes) data from batch into blocks of spatial data, followed by
+// cropping. This is the reverse transformation of SpaceToBatch. More specifically,
+// this op outputs a copy of the input tensor where values from the `batch`
+// dimension are moved in spatial blocks to the `height` and `width` dimensions,
+// followed by cropping along the `height` and `width` dimensions.
 //
-// *   0: Use the number of channels in the JPEG-encoded image.
-// *   1: output a grayscale image.
-// *   3: output an RGB image.
+// Arguments:
+//	input: 4-D tensor with shape
+// `[batch*block_size*block_size, height_pad/block_size, width_pad/block_size,
+//   depth]`. Note that the batch size of the input tensor must be divisible by
+// `block_size * block_size`.
+//	crops: 2-D tensor of non-negative integers with shape `[2, 2]`. It specifies
+// how many elements to crop from the intermediate result across the spatial
+// dimensions as follows:
 //
-// If needed, the JPEG-encoded image is transformed to match the requested number
-// of color channels.
+//     crops = [[crop_top, crop_bottom], [crop_left, crop_right]]
 //
-// The attr `ratio` allows downscaling the image by an integer factor during
-// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
-// downscaling the image later.
 //
+// Returns 4-D with shape `[batch, height, width, depth]`, where:
 //
-// It is equivalent to a combination of decode and crop, but much faster by only
-// decoding partial jpeg image.
+//       height = height_pad - crop_top - crop_bottom
+//       width = width_pad - crop_left - crop_right
 //
-// Arguments:
-//	contents: 0-D.  The JPEG-encoded image.
-//	crop_window: 1-D.  The crop window: [crop_y, crop_x, crop_height, crop_width].
+// The attr `block_size` must be greater than one. It indicates the block size.
 //
-// Returns 3-D with shape `[height, width, channels]`..
-func DecodeAndCropJpeg(scope *Scope, contents tf.Output, crop_window tf.Output, optional ...DecodeAndCropJpegAttr) (image tf.Output) {
+// Some examples:
+//
+// (1) For the following input of shape `[4, 1, 1, 1]` and block_size of 2:
+//
+// ```
+// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+// ```
+//
+// The output tensor has shape `[1, 2, 2, 1]` and value:
+//
+// ```
+// x = [[[[1], [2]], [[3], [4]]]]
+// ```
+//
+// (2) For the following input of shape `[4, 1, 1, 3]` and block_size of 2:
+//
+// ```
+// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
+// ```
+//
+// The output tensor has shape `[1, 2, 2, 3]` and value:
+//
+// ```
+// x = [[[[1, 2, 3], [4, 5, 6]],
+//       [[7, 8, 9], [10, 11, 12]]]]
+// ```
+//
+// (3) For the following input of shape `[4, 2, 2, 1]` and block_size of 2:
+//
+// ```
+// x = [[[[1], [3]], [[9], [11]]],
+//      [[[2], [4]], [[10], [12]]],
+//      [[[5], [7]], [[13], [15]]],
+//      [[[6], [8]], [[14], [16]]]]
+// ```
+//
+// The output tensor has shape `[1, 4, 4, 1]` and value:
+//
+// ```
+// x = [[[1],   [2],  [3],  [4]],
+//      [[5],   [6],  [7],  [8]],
+//      [[9],  [10], [11],  [12]],
+//      [[13], [14], [15],  [16]]]
+// ```
+//
+// (4) For the following input of shape `[8, 1, 2, 1]` and block_size of 2:
+//
+// ```
+// x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],
+//      [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]
+// ```
+//
+// The output tensor has shape `[2, 2, 4, 1]` and value:
+//
+// ```
+// x = [[[[1], [3]], [[5], [7]]],
+//      [[[2], [4]], [[10], [12]]],
+//      [[[5], [7]], [[13], [15]]],
+//      [[[6], [8]], [[14], [16]]]]
+// ```
+func BatchToSpace(scope *Scope, input tf.Output, crops tf.Output, block_size int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"block_size": block_size}
 	opspec := tf.OpSpec{
-		Type: "DecodeAndCropJpeg",
+		Type: "BatchToSpace",
 		Input: []tf.Input{
-			contents, crop_window,
+			input, crops,
 		},
 		Attrs: attrs,
 	}
@@ -18694,306 +22950,310 @@ func DecodeAndCropJpeg(scope *Scope, contents tf.Output, crop_window tf.Output,
 	return op.Output(0)
 }
 
-// Adds two `SparseTensor` objects to produce another `SparseTensor`.
+// Produces a summary of any statistics recorded by the given statistics manager.
+func ExperimentalStatsAggregatorSummary(scope *Scope, iterator tf.Output) (summary tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalStatsAggregatorSummary",
+		Input: []tf.Input{
+			iterator,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Makes a new iterator from the given `dataset` and stores it in `iterator`.
 //
-// The input `SparseTensor` objects' indices are assumed ordered in standard
-// lexicographic order.  If this is not the case, before this step run
-// `SparseReorder` to restore index ordering.
+// This operation may be executed multiple times. Each execution will reset the
+// iterator in `iterator` to the first element of `dataset`.
 //
-// By default, if two values sum to zero at some index, the output `SparseTensor`
-// would still include that particular location in its index, storing a zero in the
-// corresponding value slot.  To override this, callers can specify `thresh`,
-// indicating that if the sum has a magnitude strictly smaller than `thresh`, its
-// corresponding value and index would then not be included.  In particular,
-// `thresh == 0` (default) means everything is kept and actual thresholding happens
-// only for a positive value.
+// Returns the created operation.
+func MakeIterator(scope *Scope, dataset tf.Output, iterator tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "MakeIterator",
+		Input: []tf.Input{
+			dataset, iterator,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Component-wise divides a SparseTensor by a dense Tensor.
 //
-// In the following shapes, `nnz` is the count after taking `thresh` into account.
+// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
+// the other direction.
 //
 // Arguments:
-//	a_indices: 2-D.  The `indices` of the first `SparseTensor`, size `[nnz, ndims]` Matrix.
-//	a_values: 1-D.  The `values` of the first `SparseTensor`, size `[nnz]` Vector.
-//	a_shape: 1-D.  The `shape` of the first `SparseTensor`, size `[ndims]` Vector.
-//	b_indices: 2-D.  The `indices` of the second `SparseTensor`, size `[nnz, ndims]` Matrix.
-//	b_values: 1-D.  The `values` of the second `SparseTensor`, size `[nnz]` Vector.
-//	b_shape: 1-D.  The `shape` of the second `SparseTensor`, size `[ndims]` Vector.
-//	thresh: 0-D.  The magnitude threshold that determines if an output value/index
-// pair takes space.
-func SparseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output, thresh tf.Output) (sum_indices tf.Output, sum_values tf.Output, sum_shape tf.Output) {
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
+//
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseDiv(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseAdd",
+		Type: "SparseDenseCwiseDiv",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b_indices, b_values, b_shape, thresh,
+			sp_indices, sp_values, sp_shape, dense,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// QuantizedRelu6Attr is an optional argument to QuantizedRelu6.
-type QuantizedRelu6Attr func(optionalAttr)
-
-// QuantizedRelu6OutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedRelu6OutType(value tf.DataType) QuantizedRelu6Attr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
+	return op.Output(0)
 }
 
-// Computes Quantized Rectified Linear 6: `min(max(features, 0), 6)`
+// Creates a dataset that batches and pads `batch_size` elements from the input.
 //
 // Arguments:
 //
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
+//	batch_size: A scalar representing the number of elements to accumulate in a
+// batch.
+//	padded_shapes: A list of int64 tensors representing the desired padded shapes
+// of the corresponding output components. These shapes may be partially
+// specified, using `-1` to indicate that a particular dimension should be
+// padded to the maximum size of all batch elements.
+//	padding_values: A list of scalars containing the padding value to use for
+// each of the outputs.
 //
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedRelu6(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedRelu6Attr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+func PaddedBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, padded_shapes []tf.Output, padding_values []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "QuantizedRelu6",
+		Type: "PaddedBatchDataset",
 		Input: []tf.Input{
-			features, min_features, max_features,
+			input_dataset, batch_size, tf.OutputList(padded_shapes), tf.OutputList(padding_values),
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// FixedLengthRecordReaderV2Attr is an optional argument to FixedLengthRecordReaderV2.
-type FixedLengthRecordReaderV2Attr func(optionalAttr)
+// ResourceApplyMomentumAttr is an optional argument to ResourceApplyMomentum.
+type ResourceApplyMomentumAttr func(optionalAttr)
 
-// FixedLengthRecordReaderV2HeaderBytes sets the optional header_bytes attribute to value.
+// ResourceApplyMomentumUseLocking sets the optional use_locking attribute to value.
 //
-// value: Number of bytes in the header, defaults to 0.
-// If not specified, defaults to 0
-func FixedLengthRecordReaderV2HeaderBytes(value int64) FixedLengthRecordReaderV2Attr {
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyMomentumUseLocking(value bool) ResourceApplyMomentumAttr {
 	return func(m optionalAttr) {
-		m["header_bytes"] = value
+		m["use_locking"] = value
 	}
 }
 
-// FixedLengthRecordReaderV2FooterBytes sets the optional footer_bytes attribute to value.
+// ResourceApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
 //
-// value: Number of bytes in the footer, defaults to 0.
-// If not specified, defaults to 0
-func FixedLengthRecordReaderV2FooterBytes(value int64) FixedLengthRecordReaderV2Attr {
+// value: If `True`, the tensor passed to compute grad will be
+// var - lr * momentum * accum, so in the end, the var you get is actually
+// var - lr * momentum * accum.
+// If not specified, defaults to false
+func ResourceApplyMomentumUseNesterov(value bool) ResourceApplyMomentumAttr {
 	return func(m optionalAttr) {
-		m["footer_bytes"] = value
+		m["use_nesterov"] = value
 	}
 }
 
-// FixedLengthRecordReaderV2HopBytes sets the optional hop_bytes attribute to value.
+// Update '*var' according to the momentum scheme. Set use_nesterov = True if you
 //
-// value: Number of bytes to hop before each read. Default of 0 means using
-// record_bytes.
-// If not specified, defaults to 0
-func FixedLengthRecordReaderV2HopBytes(value int64) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["hop_bytes"] = value
-	}
-}
-
-// FixedLengthRecordReaderV2Container sets the optional container attribute to value.
+// want to use Nesterov momentum.
 //
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func FixedLengthRecordReaderV2Container(value string) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// FixedLengthRecordReaderV2SharedName sets the optional shared_name attribute to value.
+// accum = accum * momentum + grad
+// var -= lr * accum
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func FixedLengthRecordReaderV2SharedName(value string) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	grad: The gradient.
+//	momentum: Momentum. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, momentum tf.Output, optional ...ResourceApplyMomentumAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
 	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyMomentum",
+		Input: []tf.Input{
+			var_, accum, lr, grad, momentum,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
 }
 
-// FixedLengthRecordReaderV2Encoding sets the optional encoding attribute to value.
+// MaxPoolGradGradAttr is an optional argument to MaxPoolGradGrad.
+type MaxPoolGradGradAttr func(optionalAttr)
+
+// MaxPoolGradGradDataFormat sets the optional data_format attribute to value.
 //
-// value: The type of encoding for the file. Currently ZLIB and GZIP
-// are supported. Defaults to none.
-// If not specified, defaults to ""
-func FixedLengthRecordReaderV2Encoding(value string) FixedLengthRecordReaderV2Attr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolGradGradDataFormat(value string) MaxPoolGradGradAttr {
 	return func(m optionalAttr) {
-		m["encoding"] = value
+		m["data_format"] = value
 	}
 }
 
-// A Reader that outputs fixed-length records from a file.
+// Computes second-order gradients of the maxpooling function.
 //
 // Arguments:
-//	record_bytes: Number of bytes in the record.
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: 4-D.  Gradients of gradients w.r.t. the input of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns The handle to reference the Reader.
-func FixedLengthRecordReaderV2(scope *Scope, record_bytes int64, optional ...FixedLengthRecordReaderV2Attr) (reader_handle tf.Output) {
+// Returns Gradients of gradients w.r.t. the input to `max_pool`.
+func MaxPoolGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"record_bytes": record_bytes}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FixedLengthRecordReaderV2",
-
+		Type: "MaxPoolGradGrad",
+		Input: []tf.Input{
+			orig_input, orig_output, grad,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AudioSummaryAttr is an optional argument to AudioSummary.
-type AudioSummaryAttr func(optionalAttr)
-
-// AudioSummaryMaxOutputs sets the optional max_outputs attribute to value.
+// Returns the last element of the input list as well as a list with all but that element.
 //
-// value: Max number of batch elements to generate audio for.
-// If not specified, defaults to 3
+// Fails if the list is empty.
 //
-// REQUIRES: value >= 1
-func AudioSummaryMaxOutputs(value int64) AudioSummaryAttr {
-	return func(m optionalAttr) {
-		m["max_outputs"] = value
+// input_handle: the input list
+// tensor: the withdrawn last element of the list
+// element_dtype: the type of elements in the list
+// element_shape: the shape of the output tensor
+func TensorListPopBack(scope *Scope, input_handle tf.Output, element_shape tf.Output, element_dtype tf.DataType) (output_handle tf.Output, tensor tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	opspec := tf.OpSpec{
+		Type: "TensorListPopBack",
+		Input: []tf.Input{
+			input_handle, element_shape,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
 }
 
-// Outputs a `Summary` protocol buffer with audio.
-//
-// DEPRECATED at GraphDef version 15: Use AudioSummaryV2.
-//
-// The summary has up to `max_outputs` summary values containing audio. The
-// audio is built from `tensor` which must be 3-D with shape `[batch_size,
-// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
-// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
-//
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
+// Determine the script codes of a given tensor of Unicode integer code points.
 //
-// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
-// *  If `max_outputs` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
+// This operation converts Unicode code points to script codes corresponding to
+// each code point. Script codes correspond to International Components for
+// Unicode (ICU) UScriptCode values. See http://icu-project.org/apiref/icu4c/uscript_8h.html.
+// Returns -1 (USCRIPT_INVALID_CODE) for invalid codepoints. Output shape will
+// match input shape.
 //
 // Arguments:
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 2-D of shape `[batch_size, frames]`.
-//	sample_rate: The sample rate of the signal in hertz.
+//	input: A Tensor of int32 Unicode code points.
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func AudioSummary(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate float32, optional ...AudioSummaryAttr) (summary tf.Output) {
+// Returns A Tensor of int32 script codes corresponding to each input code point.
+func UnicodeScript(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"sample_rate": sample_rate}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "AudioSummary",
+		Type: "UnicodeScript",
 		Input: []tf.Input{
-			tag, tensor,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QrAttr is an optional argument to Qr.
-type QrAttr func(optionalAttr)
-
-// QrFullMatrices sets the optional full_matrices attribute to value.
+// Creates a sequence of numbers.
 //
-// value: If true, compute full-sized `q` and `r`. If false
-// (the default), compute only the leading `P` columns of `q`.
-// If not specified, defaults to false
-func QrFullMatrices(value bool) QrAttr {
-	return func(m optionalAttr) {
-		m["full_matrices"] = value
-	}
-}
-
-// Computes the QR decompositions of one or more matrices.
+// This operation creates a sequence of numbers that begins at `start` and
+// extends by increments of `delta` up to but not including `limit`.
 //
-// Computes the QR decomposition of each inner matrix in `tensor` such that
-// `tensor[..., :, :] = q[..., :, :] * r[..., :,:])`
+// For example:
 //
-// ```python
-// # a is a tensor.
-// # q is a tensor of orthonormal matrices.
-// # r is a tensor of upper triangular matrices.
-// q, r = qr(a)
-// q_full, r_full = qr(a, full_matrices=True)
+// ```
+// # 'start' is 3
+// # 'limit' is 18
+// # 'delta' is 3
+// tf.range(start, limit, delta) ==> [3, 6, 9, 12, 15]
 // ```
 //
 // Arguments:
-//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
+//	start: 0-D (scalar). First entry in the sequence.
+//	limit: 0-D (scalar). Upper limit of sequence, exclusive.
+//	delta: 0-D (scalar). Optional. Default is 1. Number that increments `start`.
 //
-// Returns Orthonormal basis for range of `a`. If `full_matrices` is `False` then
-// shape is `[..., M, P]`; if `full_matrices` is `True` then shape is
-// `[..., M, M]`.Triangular factor. If `full_matrices` is `False` then shape is
-// `[..., P, N]`. If `full_matrices` is `True` then shape is `[..., M, N]`.
-func Qr(scope *Scope, input tf.Output, optional ...QrAttr) (q tf.Output, r tf.Output) {
+// Returns 1-D.
+func Range(scope *Scope, start tf.Output, limit tf.Output, delta tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Qr",
+		Type: "Range",
 		Input: []tf.Input{
-			input,
+			start, limit, delta,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Check if the input matches the regex pattern.
-//
-// The input is a string tensor of any shape. The pattern is the
-// regular expression to be matched with every element of the input tensor.
-// The boolean values (True or False) of the output tensor indicate
-// if the input matches the regex pattern provided.
-//
-// The pattern follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
+// Computes second-order gradients of the maxpooling function.
 //
 // Arguments:
-//	input: A string tensor of the text to be processed.
-//	pattern: The regular expression to match the input.
+//	input: The original input.
+//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
+// input of `max_pool`.
+//	argmax: The indices of the maximum values chosen for each output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns A bool tensor with the same shape as `input`.
-func StaticRegexFullMatch(scope *Scope, input tf.Output, pattern string) (output tf.Output) {
+// Returns Gradients of gradients w.r.t. the input of `max_pool`.
+func MaxPoolGradGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"pattern": pattern}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "StaticRegexFullMatch",
+		Type: "MaxPoolGradGradWithArgmax",
 		Input: []tf.Input{
-			input,
+			input, grad, argmax,
 		},
 		Attrs: attrs,
 	}
@@ -19001,145 +23261,175 @@ func StaticRegexFullMatch(scope *Scope, input tf.Output, pattern string) (output
 	return op.Output(0)
 }
 
-// ResourceSparseApplyProximalGradientDescentAttr is an optional argument to ResourceSparseApplyProximalGradientDescent.
-type ResourceSparseApplyProximalGradientDescentAttr func(optionalAttr)
-
-// ResourceSparseApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
+// Return a slice from 'input'.
 //
-// value: If True, the subtraction will be protected by a lock;
-// otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceSparseApplyProximalGradientDescentUseLocking(value bool) ResourceSparseApplyProximalGradientDescentAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Sparse update '*var' as FOBOS algorithm with fixed learning rate.
+// The output tensor is a tensor with dimensions described by 'size'
+// whose values are extracted from 'input' starting at the offsets in
+// 'begin'.
 //
-// That is for rows we have grad for, we update var as follows:
-// prox_v = var - alpha * grad
-// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
+// *Requirements*:
+//   0 <= begin[i] <= begin[i] + size[i] <= Di  for i in [0, n)
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	alpha: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
 //
-// Returns the created operation.
-func ResourceSparseApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalGradientDescentAttr) (o *tf.Operation) {
+//	begin: begin[i] specifies the offset into the 'i'th dimension of
+// 'input' to slice from.
+//	size: size[i] specifies the number of elements of the 'i'th dimension
+// of 'input' to slice. If size[i] is -1, all remaining elements in dimension
+// i are included in the slice (i.e. this is equivalent to setting
+// size[i] = input.dim_size(i) - begin[i]).
+func Slice(scope *Scope, input tf.Output, begin tf.Output, size tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyProximalGradientDescent",
+		Type: "Slice",
 		Input: []tf.Input{
-			var_, alpha, l1, l2, grad, indices,
+			input, begin, size,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Real-valued fast Fourier transform.
-//
-// Computes the 1-dimensional discrete Fourier transform of a real-valued signal
-// over the inner-most dimension of `input`.
-//
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the
-// `fft_length / 2 + 1` unique components of the FFT: the zero-frequency term,
-// followed by the `fft_length / 2` positive-frequency terms.
-//
-// Along the axis `RFFT` is computed on, if `fft_length` is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
+// Compute the Hurwitz zeta function \\(\zeta(x, q)\\).
 //
-// Arguments:
-//	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [1]. The FFT length.
+// The Hurwitz zeta function is defined as:
 //
-// Returns A complex64 tensor of the same rank as `input`. The inner-most
-//   dimension of `input` is replaced with the `fft_length / 2 + 1` unique
-//   frequency components of its 1D Fourier transform.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.rfft
-// @end_compatibility
-func RFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// \\(\zeta(x, q) = \sum_{n=0}^{\infty} (q + n)^{-x}\\)
+func Zeta(scope *Scope, x tf.Output, q tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RFFT",
+		Type: "Zeta",
 		Input: []tf.Input{
-			input, fft_length,
+			x, q,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Adds a value to the current value of a variable.
+// Returns the cardinality of `input_dataset`.
 //
-// Any ReadVariableOp with a control dependency on this op is guaranteed to
-// see the incremented value or a subsequent newer one.
+// Returns the cardinality of `input_dataset`.
 //
 // Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	value: the value by which the variable will be incremented.
+//	input_dataset: A variant tensor representing the dataset to return cardinality for.
 //
-// Returns the created operation.
-func AssignAddVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
+// Returns The cardinality of `input_dataset`. Named constants are used to represent
+// infinite and unknown cardinality.
+func ExperimentalDatasetCardinality(scope *Scope, input_dataset tf.Output) (cardinality tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AssignAddVariableOp",
+		Type: "ExperimentalDatasetCardinality",
 		Input: []tf.Input{
-			resource, value,
+			input_dataset,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// QuantizedReluAttr is an optional argument to QuantizedRelu.
-type QuantizedReluAttr func(optionalAttr)
+// TakeManySparseFromTensorsMapAttr is an optional argument to TakeManySparseFromTensorsMap.
+type TakeManySparseFromTensorsMapAttr func(optionalAttr)
 
-// QuantizedReluOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedReluOutType(value tf.DataType) QuantizedReluAttr {
+// TakeManySparseFromTensorsMapContainer sets the optional container attribute to value.
+//
+// value: The container name for the `SparseTensorsMap` read by this op.
+// If not specified, defaults to ""
+func TakeManySparseFromTensorsMapContainer(value string) TakeManySparseFromTensorsMapAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["container"] = value
 	}
 }
 
-// Computes Quantized Rectified Linear: `max(features, 0)`
+// TakeManySparseFromTensorsMapSharedName sets the optional shared_name attribute to value.
 //
-// Arguments:
+// value: The shared name for the `SparseTensorsMap` read by this op.
+// It should not be blank; rather the `shared_name` or unique Operation name
+// of the Op that created the original `SparseTensorsMap` should be used.
+// If not specified, defaults to ""
+func TakeManySparseFromTensorsMapSharedName(value string) TakeManySparseFromTensorsMapAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Read `SparseTensors` from a `SparseTensorsMap` and concatenate them.
 //
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
+// The input `sparse_handles` must be an `int64` matrix of shape `[N, 1]` where
+// `N` is the minibatch size and the rows correspond to the output handles of
+// `AddSparseToTensorsMap` or `AddManySparseToTensorsMap`.  The ranks of the
+// original `SparseTensor` objects that went into the given input ops must all
+// match.  When the final `SparseTensor` is created, it has rank one
+// higher than the ranks of the incoming `SparseTensor` objects
+// (they have been concatenated along a new row dimension on the left).
 //
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedRelu(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+// The output `SparseTensor` object's shape values for all dimensions but the
+// first are the max across the input `SparseTensor` objects' shape values
+// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
+// size.
+//
+// The input `SparseTensor` objects' indices are assumed ordered in
+// standard lexicographic order.  If this is not the case, after this
+// step run `SparseReorder` to restore index ordering.
+//
+// For example, if the handles represent an input, which is a `[2, 3]` matrix
+// representing two original `SparseTensor` objects:
+//
+// ```
+//     index = [ 0]
+//             [10]
+//             [20]
+//     values = [1, 2, 3]
+//     shape = [50]
+// ```
+//
+// and
+//
+// ```
+//     index = [ 2]
+//             [10]
+//     values = [4, 5]
+//     shape = [30]
+// ```
+//
+// then the final `SparseTensor` will be:
+//
+// ```
+//     index = [0  0]
+//             [0 10]
+//             [0 20]
+//             [1  2]
+//             [1 10]
+//     values = [1, 2, 3, 4, 5]
+//     shape = [2 50]
+// ```
+//
+// Arguments:
+//	sparse_handles: 1-D, The `N` serialized `SparseTensor` objects.
+// Shape: `[N]`.
+//	dtype: The `dtype` of the `SparseTensor` objects stored in the
+// `SparseTensorsMap`.
+//
+// Returns 2-D.  The `indices` of the minibatch `SparseTensor`.1-D.  The `values` of the minibatch `SparseTensor`.1-D.  The `shape` of the minibatch `SparseTensor`.
+func TakeManySparseFromTensorsMap(scope *Scope, sparse_handles tf.Output, dtype tf.DataType, optional ...TakeManySparseFromTensorsMapAttr) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedRelu",
+		Type: "TakeManySparseFromTensorsMap",
 		Input: []tf.Input{
-			features, min_features, max_features,
+			sparse_handles,
 		},
 		Attrs: attrs,
 	}
@@ -19147,344 +23437,350 @@ func QuantizedRelu(scope *Scope, features tf.Output, min_features tf.Output, max
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Reshapes a SparseTensor to represent values in a new dense shape.
+// ResourceSparseApplyKerasMomentumAttr is an optional argument to ResourceSparseApplyKerasMomentum.
+type ResourceSparseApplyKerasMomentumAttr func(optionalAttr)
+
+// ResourceSparseApplyKerasMomentumUseLocking sets the optional use_locking attribute to value.
 //
-// This operation has the same semantics as reshape on the represented dense
-// tensor.  The `input_indices` are recomputed based on the requested `new_shape`.
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyKerasMomentumUseLocking(value bool) ResourceSparseApplyKerasMomentumAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// ResourceSparseApplyKerasMomentumUseNesterov sets the optional use_nesterov attribute to value.
 //
-// If one component of `new_shape` is the special value -1, the size of that
-// dimension is computed so that the total dense size remains constant.  At
-// most one component of `new_shape` can be -1.  The number of dense elements
-// implied by `new_shape` must be the same as the number of dense elements
-// originally implied by `input_shape`.
+// value: If `True`, the tensor passed to compute grad will be
+// var + momentum * accum, so in the end, the var you get is actually
+// var + momentum * accum.
+// If not specified, defaults to false
+func ResourceSparseApplyKerasMomentumUseNesterov(value bool) ResourceSparseApplyKerasMomentumAttr {
+	return func(m optionalAttr) {
+		m["use_nesterov"] = value
+	}
+}
+
+// Update relevant entries in '*var' and '*accum' according to the momentum scheme.
 //
-// Reshaping does not affect the order of values in the SparseTensor.
+// Set use_nesterov = True if you want to use Nesterov momentum.
 //
-// If the input tensor has rank `R_in` and `N` non-empty values, and `new_shape`
-// has length `R_out`, then `input_indices` has shape `[N, R_in]`,
-// `input_shape` has length `R_in`, `output_indices` has shape `[N, R_out]`, and
-// `output_shape` has length `R_out`.
+// That is for rows we have grad for, we update var and accum as follows:
+//
+// accum = accum * momentum - lr * grad
+// var += accum
 //
 // Arguments:
-//	input_indices: 2-D.  `N x R_in` matrix with the indices of non-empty values in a
-// SparseTensor.
-//	input_shape: 1-D.  `R_in` vector with the input SparseTensor's dense shape.
-//	new_shape: 1-D.  `R_out` vector with the requested new dense shape.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	momentum: Momentum. Must be a scalar.
 //
-// Returns 2-D.  `N x R_out` matrix with the updated indices of non-empty
-// values in the output SparseTensor.1-D.  `R_out` vector with the full dense shape of the output
-// SparseTensor.  This is the same as `new_shape` but with any -1 dimensions
-// filled in.
-func SparseReshape(scope *Scope, input_indices tf.Output, input_shape tf.Output, new_shape tf.Output) (output_indices tf.Output, output_shape tf.Output) {
+// Returns the created operation.
+func ResourceSparseApplyKerasMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, momentum tf.Output, optional ...ResourceSparseApplyKerasMomentumAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseReshape",
+		Type: "ResourceSparseApplyKerasMomentum",
 		Input: []tf.Input{
-			input_indices, input_shape, new_shape,
+			var_, accum, lr, grad, indices, momentum,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return scope.AddOperation(opspec)
 }
 
-// Deprecated. Use TensorArraySplitV3
+// ResourceApplyAdamWithAmsgradAttr is an optional argument to ResourceApplyAdamWithAmsgrad.
+type ResourceApplyAdamWithAmsgradAttr func(optionalAttr)
+
+// ResourceApplyAdamWithAmsgradUseLocking sets the optional use_locking attribute to value.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArraySplitV3
-func TensorArraySplitV2(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorArraySplitV2",
-		Input: []tf.Input{
-			handle, value, lengths, flow_in,
-		},
+// value: If `True`, updating of the var, m, and v tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyAdamWithAmsgradUseLocking(value bool) ResourceApplyAdamWithAmsgradAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Reorders a SparseTensor into the canonical, row-major ordering.
-//
-// Note that by convention, all sparse ops preserve the canonical ordering along
-// increasing dimension number. The only time ordering can be violated is during
-// manual manipulation of the indices and values vectors to add entries.
-//
-// Reordering does not affect the shape of the SparseTensor.
+// Update '*var' according to the Adam algorithm.
 //
-// If the tensor has rank `R` and `N` non-empty values, `input_indices` has
-// shape `[N, R]`, input_values has length `N`, and input_shape has length `R`.
+// $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
+// $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+// $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+// $$vhat_t := max{vhat_{t-1}, v_t}$$
+// $$variable := variable - lr_t * m_t / (\sqrt{vhat_t} + \epsilon)$$
 //
 // Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	v: Should be from a Variable().
+//	vhat: Should be from a Variable().
+//	beta1_power: Must be a scalar.
+//	beta2_power: Must be a scalar.
+//	lr: Scaling factor. Must be a scalar.
+//	beta1: Momentum factor. Must be a scalar.
+//	beta2: Momentum factor. Must be a scalar.
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
 //
-// Returns 2-D.  `N x R` matrix with the same indices as input_indices, but
-// in canonical row-major ordering.1-D.  `N` non-empty values corresponding to `output_indices`.
-func SparseReorder(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
+// Returns the created operation.
+func ResourceApplyAdamWithAmsgrad(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, vhat tf.Output, beta1_power tf.Output, beta2_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdamWithAmsgradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseReorder",
+		Type: "ResourceApplyAdamWithAmsgrad",
 		Input: []tf.Input{
-			input_indices, input_values, input_shape,
+			var_, m, v, vhat, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return scope.AddOperation(opspec)
 }
 
-// Computes rectified linear: `max(features, 0)`.
-func Relu(scope *Scope, features tf.Output) (activations tf.Output) {
+// MapUnstageNoKeyAttr is an optional argument to MapUnstageNoKey.
+type MapUnstageNoKeyAttr func(optionalAttr)
+
+// MapUnstageNoKeyCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapUnstageNoKeyCapacity(value int64) MapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// MapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapUnstageNoKeyMemoryLimit(value int64) MapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// MapUnstageNoKeyContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapUnstageNoKeyContainer(value string) MapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MapUnstageNoKeySharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapUnstageNoKeySharedName(value string) MapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op removes and returns a random (key, value)
+//
+// from the underlying container.   If the underlying container
+// does not contain elements, the op will block until it does.
+func MapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Relu",
+		Type: "MapUnstageNoKey",
 		Input: []tf.Input{
-			features,
+			indices,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	key = op.Output(idx)
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("MapUnstageNoKey", err)
+		return
+	}
+	return key, values
 }
 
-// ResourceApplyAddSignAttr is an optional argument to ResourceApplyAddSign.
-type ResourceApplyAddSignAttr func(optionalAttr)
+// HashTableV2Attr is an optional argument to HashTableV2.
+type HashTableV2Attr func(optionalAttr)
 
-// ResourceApplyAddSignUseLocking sets the optional use_locking attribute to value.
+// HashTableV2Container sets the optional container attribute to value.
 //
-// value: If `True`, updating of the var and m tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyAddSignUseLocking(value bool) ResourceApplyAddSignAttr {
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func HashTableV2Container(value string) HashTableV2Attr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["container"] = value
 	}
 }
 
-// Update '*var' according to the AddSign update.
-//
-// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-// update <- (alpha + sign_decay * sign(g) *sign(m)) * g
-// variable <- variable - lr_t * update
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	alpha: Must be a scalar.
-//	sign_decay: Must be a scalar.
-//	beta: Must be a scalar.
-//	grad: The gradient.
+// HashTableV2SharedName sets the optional shared_name attribute to value.
 //
-// Returns the created operation.
-func ResourceApplyAddSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, alpha tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyAddSignAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func HashTableV2SharedName(value string) HashTableV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyAddSign",
-		Input: []tf.Input{
-			var_, m, lr, alpha, sign_decay, beta, grad,
-		},
-		Attrs: attrs,
+}
+
+// HashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+//
+// value: If true and shared_name is empty, the table is shared
+// using the node name.
+// If not specified, defaults to false
+func HashTableV2UseNodeNameSharing(value bool) HashTableV2Attr {
+	return func(m optionalAttr) {
+		m["use_node_name_sharing"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// Component-wise divides a SparseTensor by a dense Tensor.
+// Creates a non-initialized hash table.
 //
-// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
-// the other direction.
+// This op creates a hash table, specifying the type of its keys and values.
+// Before using the table you will have to initialize it.  After initialization the
+// table will be immutable.
 //
 // Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
+//	key_dtype: Type of the table keys.
+//	value_dtype: Type of the table values.
 //
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseDiv(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+// Returns Handle to a table.
+func HashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...HashTableV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseDiv",
-		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
-		},
+		Type: "HashTableV2",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// FractionalAvgPoolGradAttr is an optional argument to FractionalAvgPoolGrad.
-type FractionalAvgPoolGradAttr func(optionalAttr)
+// RetrieveTPUEmbeddingMomentumParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingMomentumParametersGradAccumDebug.
+type RetrieveTPUEmbeddingMomentumParametersGradAccumDebugAttr func(optionalAttr)
 
-// FractionalAvgPoolGradOverlapping sets the optional overlapping attribute to value.
-//
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
-//
-// `index  0  1  2  3  4`
-//
-// `value  20 5  16 3  7`
+// RetrieveTPUEmbeddingMomentumParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [41/3, 26/3] for fractional avg pooling.
-// If not specified, defaults to false
-func FractionalAvgPoolGradOverlapping(value bool) FractionalAvgPoolGradAttr {
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingMomentumParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingMomentumParametersGradAccumDebugAttr {
 	return func(m optionalAttr) {
-		m["overlapping"] = value
+		m["table_id"] = value
 	}
 }
 
-// Computes gradient of the FractionalAvgPool function.
-//
-// Unlike FractionalMaxPoolGrad, we don't need to find arg_max for
-// FractionalAvgPoolGrad, we just need to evenly back-propagate each element of
-// out_backprop to those indices that form the same pooling cell. Therefore, we
-// just need to know the shape of original input tensor, instead of the whole
-// tensor.
+// RetrieveTPUEmbeddingMomentumParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingMomentumParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingMomentumParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Retrieve Momentum embedding parameters with debug support.
 //
-// Arguments:
-//	orig_input_tensor_shape: Original input tensor shape for `fractional_avg_pool`
-//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
-// w.r.t. the output of `fractional_avg_pool`.
-//	row_pooling_sequence: row pooling sequence, form pooling region with
-// col_pooling_sequence.
-//	col_pooling_sequence: column pooling sequence, form pooling region with
-// row_pooling sequence.
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
 //
-// Returns 4-D.  Gradients w.r.t. the input of `fractional_avg_pool`.
-func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalAvgPoolGradAttr) (output tf.Output) {
+// Returns Parameter parameters updated by the Momentum optimization algorithm.Parameter momenta updated by the Momentum optimization algorithm.Parameter gradient_accumulators updated by the Momentum optimization algorithm.
+func RetrieveTPUEmbeddingMomentumParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingMomentumParametersGradAccumDebugAttr) (parameters tf.Output, momenta tf.Output, gradient_accumulators tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FractionalAvgPoolGrad",
-		Input: []tf.Input{
-			orig_input_tensor_shape, out_backprop, row_pooling_sequence, col_pooling_sequence,
-		},
+		Type: "RetrieveTPUEmbeddingMomentumParametersGradAccumDebug",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`.
-//
-// This Op does not require `a_indices` be sorted in standard lexicographic order.
+// Enqueue a Tensor on the computation outfeed.
 //
 // Arguments:
-//	a_indices: 2-D.  The `indices` of the `SparseTensor`, with shape `[nnz, ndims]`.
-//	a_values: 1-D.  The `values` of the `SparseTensor`, with shape `[nnz]`.
-//	a_shape: 1-D.  The `shape` of the `SparseTensor`, with shape `[ndims]`.
-//	b: `ndims`-D Tensor.  With shape `a_shape`.
-func SparseTensorDenseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output) (output tf.Output) {
+//	input: A tensor that will be inserted into the outfeed queue.
+//
+// Returns the created operation.
+func OutfeedEnqueue(scope *Scope, input tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseTensorDenseAdd",
+		Type: "OutfeedEnqueue",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b,
+			input,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// SparseToSparseSetOperationAttr is an optional argument to SparseToSparseSetOperation.
-type SparseToSparseSetOperationAttr func(optionalAttr)
-
-// SparseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func SparseToSparseSetOperationValidateIndices(value bool) SparseToSparseSetOperationAttr {
-	return func(m optionalAttr) {
-		m["validate_indices"] = value
-	}
+	return scope.AddOperation(opspec)
 }
 
-// Applies set operation along last dimension of 2 `SparseTensor` inputs.
-//
-// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
-//
-// If `validate_indices` is `True`, `SparseToSparseSetOperation` validates the
-// order and range of `set1` and `set2` indices.
-//
-// Input `set1` is a `SparseTensor` represented by `set1_indices`, `set1_values`,
-// and `set1_shape`. For `set1` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set2`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
-//
-// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
-// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
+// Outputs a `Summary` protocol buffer with a histogram.
 //
-// If `validate_indices` is `True`, this op validates the order and range of `set1`
-// and `set2` indices.
+// The generated
+// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
+// has one summary value containing a histogram for `values`.
 //
-// Output `result` is a `SparseTensor` represented by `result_indices`,
-// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-// dimension contains the result of `set_operation` applied to the corresponding
-// `[0...n-1]` dimension of `set`.
+// This op reports an `InvalidArgument` error if any value is not finite.
 //
 // Arguments:
-//	set1_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set1_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set1_shape: 1D `Tensor`, shape of a `SparseTensor`. `set1_shape[0...n-1]` must
-// be the same as `set2_shape[0...n-1]`, `set1_shape[n]` is the
-// max set size across `0...n-1` dimensions.
-//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
-// be the same as `set1_shape[0...n-1]`, `set2_shape[n]` is the
-// max set size across `0...n-1` dimensions.
-//
+//	tag: Scalar.  Tag to use for the `Summary.Value`.
+//	values: Any shape. Values to use to build the histogram.
 //
-// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
-// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
-// is the max result set size across all `0...n-1` dimensions.
-func SparseToSparseSetOperation(scope *Scope, set1_indices tf.Output, set1_values tf.Output, set1_shape tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...SparseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func HistogramSummary(scope *Scope, tag tf.Output, values tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"set_operation": set_operation}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "SparseToSparseSetOperation",
+		Type: "HistogramSummary",
 		Input: []tf.Input{
-			set1_indices, set1_values, set1_shape, set2_indices, set2_values, set2_shape,
+			tag, values,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
 // MutableDenseHashTableV2Attr is an optional argument to MutableDenseHashTableV2.
@@ -19587,197 +23883,144 @@ func MutableDenseHashTableV2(scope *Scope, empty_key tf.Output, deleted_key tf.O
 	return op.Output(0)
 }
 
-// UpperBoundAttr is an optional argument to UpperBound.
-type UpperBoundAttr func(optionalAttr)
+// RetrieveTPUEmbeddingADAMParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingADAMParametersGradAccumDebug.
+type RetrieveTPUEmbeddingADAMParametersGradAccumDebugAttr func(optionalAttr)
 
-// UpperBoundOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func UpperBoundOutType(value tf.DataType) UpperBoundAttr {
+// RetrieveTPUEmbeddingADAMParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingADAMParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingADAMParametersGradAccumDebugAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["table_id"] = value
 	}
 }
 
-// Applies upper_bound(sorted_search_values, values) along each row.
-//
-// Each set of rows with the same index in (sorted_inputs, values) is treated
-// independently.  The resulting row is the equivalent of calling
-// `np.searchsorted(sorted_inputs, values, side='right')`.
-//
-// The result is not a global index to the entire
-// `Tensor`, but rather just the index in the last dimension.
-//
-// A 2-D example:
-//   sorted_sequence = [[0, 3, 9, 9, 10],
-//                      [1, 2, 3, 4, 5]]
-//   values = [[2, 4, 9],
-//             [0, 2, 6]]
-//
-//   result = UpperBound(sorted_sequence, values)
-//
-//   result == [[1, 2, 4],
-//              [0, 2, 5]]
+// RetrieveTPUEmbeddingADAMParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingADAMParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingADAMParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Retrieve ADAM embedding parameters with debug support.
 //
-// Arguments:
-//	sorted_inputs: 2-D Tensor where each row is ordered.
-//	values: 2-D Tensor with the same numbers of rows as `sorted_search_values`. Contains
-// the values that will be searched for in `sorted_search_values`.
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
 //
-// Returns A `Tensor` with the same shape as `values`.  It contains the last scalar index
-// into the last dimension where values can be inserted without changing the
-// ordered property.
-func UpperBound(scope *Scope, sorted_inputs tf.Output, values tf.Output, optional ...UpperBoundAttr) (output tf.Output) {
+// Returns Parameter parameters updated by the ADAM optimization algorithm.Parameter momenta updated by the ADAM optimization algorithm.Parameter velocities updated by the ADAM optimization algorithm.Parameter gradient_accumulators updated by the ADAM optimization algorithm.
+func RetrieveTPUEmbeddingADAMParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingADAMParametersGradAccumDebugAttr) (parameters tf.Output, momenta tf.Output, velocities tf.Output, gradient_accumulators tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "UpperBound",
-		Input: []tf.Input{
-			sorted_inputs, values,
-		},
+		Type: "RetrieveTPUEmbeddingADAMParametersGradAccumDebug",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// FractionalMaxPoolGradAttr is an optional argument to FractionalMaxPoolGrad.
-type FractionalMaxPoolGradAttr func(optionalAttr)
+// CudnnRNNAttr is an optional argument to CudnnRNN.
+type CudnnRNNAttr func(optionalAttr)
 
-// FractionalMaxPoolGradOverlapping sets the optional overlapping attribute to value.
-//
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
-//
-// `index  0  1  2  3  4`
-//
-// `value  20 5  16 3  7`
-//
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [20, 16] for fractional max pooling.
-// If not specified, defaults to false
-func FractionalMaxPoolGradOverlapping(value bool) FractionalMaxPoolGradAttr {
+// CudnnRNNRnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNRnnMode(value string) CudnnRNNAttr {
 	return func(m optionalAttr) {
-		m["overlapping"] = value
+		m["rnn_mode"] = value
 	}
 }
 
-// Computes gradient of the FractionalMaxPool function.
-//
-// Arguments:
-//	orig_input: Original input for `fractional_max_pool`
-//	orig_output: Original output for `fractional_max_pool`
-//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
-// w.r.t. the output of `fractional_max_pool`.
-//	row_pooling_sequence: row pooling sequence, form pooling region with
-// col_pooling_sequence.
-//	col_pooling_sequence: column pooling sequence, form pooling region with
-// row_pooling sequence.
-//
-// Returns 4-D.  Gradients w.r.t. the input of `fractional_max_pool`.
-func FractionalMaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalMaxPoolGradAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "FractionalMaxPoolGrad",
-		Input: []tf.Input{
-			orig_input, orig_output, out_backprop, row_pooling_sequence, col_pooling_sequence,
-		},
-		Attrs: attrs,
+// CudnnRNNInputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNInputMode(value string) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// ResourceApplyAdagradDAAttr is an optional argument to ResourceApplyAdagradDA.
-type ResourceApplyAdagradDAAttr func(optionalAttr)
-
-// ResourceApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
-//
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyAdagradDAUseLocking(value bool) ResourceApplyAdagradDAAttr {
+// CudnnRNNDirection sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNDirection(value string) CudnnRNNAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["direction"] = value
 	}
 }
 
-// Update '*var' according to the proximal adagrad scheme.
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	gradient_accumulator: Should be from a Variable().
-//	gradient_squared_accumulator: Should be from a Variable().
-//	grad: The gradient.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	global_step: Training step number. Must be a scalar.
-//
-// Returns the created operation.
-func ResourceApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceApplyAdagradDAAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
+// CudnnRNNDropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNDropout(value float32) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["dropout"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdagradDA",
-		Input: []tf.Input{
-			var_, gradient_accumulator, gradient_squared_accumulator, grad, lr, l1, l2, global_step,
-		},
-		Attrs: attrs,
+}
+
+// CudnnRNNSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNSeed(value int64) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// SparseReduceMaxSparseAttr is an optional argument to SparseReduceMaxSparse.
-type SparseReduceMaxSparseAttr func(optionalAttr)
+// CudnnRNNSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNSeed2(value int64) CudnnRNNAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
 
-// SparseReduceMaxSparseKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SparseReduceMaxSparseKeepDims(value bool) SparseReduceMaxSparseAttr {
+// CudnnRNNIsTraining sets the optional is_training attribute to value.
+// If not specified, defaults to true
+func CudnnRNNIsTraining(value bool) CudnnRNNAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["is_training"] = value
 	}
 }
 
-// Computes the max of elements across dimensions of a SparseTensor.
-//
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_max()`.  In contrast to SparseReduceMax, this Op returns a
-// SparseTensor.
-//
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
+// A RNN backed by cuDNN.
 //
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
+// Computes the RNN from the input and initial states, with respect to the params
+// buffer.
 //
-// Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
-func SparseReduceMaxSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicate whether there is a linear projection between the input and
+//   the actual computation before the first layer. 'skip_input' is only allowed
+//   when input_size == num_units; 'auto_select' implies 'skip_input' when
+//   input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used. Should be
+//   "unidirectional" or "bidirectional".
+// dropout: Dropout probability. When set to 0., dropout is disabled.
+// seed: The 1st part of a seed to initialize dropout.
+// seed2: The 2nd part of a seed to initialize dropout.
+// input: A 3-D tensor with the shape of [seq_length, batch_size, input_size].
+// input_h: A 3-D tensor with the shape of [num_layer * dir, batch_size,
+//     num_units].
+// input_c: For LSTM, a 3-D tensor with the shape of
+//     [num_layer * dir, batch, num_units]. For other models, it is ignored.
+// params: A 1-D tensor that contains the weights and biases in an opaque layout.
+//     The size must be created through CudnnRNNParamsSize, and initialized
+//     separately. Note that they might not be compatible across different
+//     generations. So it is a good idea to save and restore
+// output: A 3-D tensor with the shape of [seq_length, batch_size,
+//     dir * num_units].
+// output_h: The same shape has input_h.
+// output_c: The same shape as input_c for LSTM. An empty tensor for other models.
+// is_training: Indicates whether this operation is used for inferenece or
+//   training.
+// reserve_space: An opaque tensor that can be used in backprop calculation. It
+//   is only produced if is_training is false.
+func CudnnRNN(scope *Scope, input tf.Output, input_h tf.Output, input_c tf.Output, params tf.Output, optional ...CudnnRNNAttr) (output tf.Output, output_h tf.Output, output_c tf.Output, reserve_space tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -19786,33 +24029,56 @@ func SparseReduceMaxSparse(scope *Scope, input_indices tf.Output, input_values t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseReduceMaxSparse",
+		Type: "CudnnRNN",
 		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
+			input, input_h, input_c, params,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// Creates a dataset that emits the outputs of `input_dataset` `count` times.
+// DecodeCompressedAttr is an optional argument to DecodeCompressed.
+type DecodeCompressedAttr func(optionalAttr)
+
+// DecodeCompressedCompressionType sets the optional compression_type attribute to value.
 //
-// Arguments:
+// value: A scalar containing either (i) the empty string (no
+// compression), (ii) "ZLIB", or (iii) "GZIP".
+// If not specified, defaults to ""
+func DecodeCompressedCompressionType(value string) DecodeCompressedAttr {
+	return func(m optionalAttr) {
+		m["compression_type"] = value
+	}
+}
+
+// Decompress strings.
 //
-//	count: A scalar representing the number of times that `input_dataset` should
-// be repeated. A value of `-1` indicates that it should be repeated infinitely.
+// This op decompresses each element of the `bytes` input `Tensor`, which
+// is assumed to be compressed using the given `compression_type`.
+//
+// The `output` is a string `Tensor` of the same shape as `bytes`,
+// each element containing the decompressed data from the corresponding
+// element in `bytes`.
 //
+// Arguments:
+//	bytes: A Tensor of string which is compressed.
 //
-func RepeatDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns A Tensor with the same shape as input `bytes`, uncompressed
+// from bytes.
+func DecodeCompressed(scope *Scope, bytes tf.Output, optional ...DecodeCompressedAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "RepeatDataset",
+		Type: "DecodeCompressed",
 		Input: []tf.Input{
-			input_dataset, count,
+			bytes,
 		},
 		Attrs: attrs,
 	}
@@ -19820,101 +24086,165 @@ func RepeatDataset(scope *Scope, input_dataset tf.Output, count tf.Output, outpu
 	return op.Output(0)
 }
 
-// Computes the gradient for the inverse of `x` wrt its input.
+// EnterAttr is an optional argument to Enter.
+type EnterAttr func(optionalAttr)
+
+// EnterIsConstant sets the optional is_constant attribute to value.
 //
-// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
-// is the corresponding input gradient.
-func ReciprocalGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: If true, the output is constant within the child frame.
+// If not specified, defaults to false
+func EnterIsConstant(value bool) EnterAttr {
+	return func(m optionalAttr) {
+		m["is_constant"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "ReciprocalGrad",
-		Input: []tf.Input{
-			y, dy,
-		},
+}
+
+// EnterParallelIterations sets the optional parallel_iterations attribute to value.
+//
+// value: The number of iterations allowed to run in parallel.
+// If not specified, defaults to 10
+func EnterParallelIterations(value int64) EnterAttr {
+	return func(m optionalAttr) {
+		m["parallel_iterations"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Returns the min of x and y (i.e. x < y ? x : y) element-wise.
+// Creates or finds a child frame, and makes `data` available to the child frame.
 //
-// *NOTE*: `Minimum` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Minimum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// This op is used together with `Exit` to create loops in the graph.
+// The unique `frame_name` is used by the `Executor` to identify frames. If
+// `is_constant` is true, `output` is a constant in the child frame; otherwise
+// it may be changed in the child frame. At most `parallel_iterations` iterations
+// are run in parallel in the child frame.
+//
+// Arguments:
+//	data: The tensor to be made available to the child frame.
+//	frame_name: The name of the child frame.
+//
+// Returns The same tensor as `data`.
+func Enter(scope *Scope, data tf.Output, frame_name string, optional ...EnterAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"frame_name": frame_name}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Minimum",
+		Type: "Enter",
 		Input: []tf.Input{
-			x, y,
+			data,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MfccAttr is an optional argument to Mfcc.
-type MfccAttr func(optionalAttr)
+// TryRpcAttr is an optional argument to TryRpc.
+type TryRpcAttr func(optionalAttr)
 
-// MfccUpperFrequencyLimit sets the optional upper_frequency_limit attribute to value.
+// TryRpcProtocol sets the optional protocol attribute to value.
 //
-// value: The highest frequency to use when calculating the
-// ceptstrum.
-// If not specified, defaults to 4000
-func MfccUpperFrequencyLimit(value float32) MfccAttr {
+// value: RPC protocol to use.  Empty string means use the default protocol.
+// Options include 'grpc'.
+// If not specified, defaults to ""
+func TryRpcProtocol(value string) TryRpcAttr {
 	return func(m optionalAttr) {
-		m["upper_frequency_limit"] = value
+		m["protocol"] = value
 	}
 }
 
-// MfccLowerFrequencyLimit sets the optional lower_frequency_limit attribute to value.
+// TryRpcFailFast sets the optional fail_fast attribute to value.
 //
-// value: The lowest frequency to use when calculating the
-// ceptstrum.
-// If not specified, defaults to 20
-func MfccLowerFrequencyLimit(value float32) MfccAttr {
+// value: `boolean`. If `true` (default), then failures to connect
+// (i.e., the server does not immediately respond) cause an RPC failure.
+// If not specified, defaults to true
+func TryRpcFailFast(value bool) TryRpcAttr {
 	return func(m optionalAttr) {
-		m["lower_frequency_limit"] = value
+		m["fail_fast"] = value
 	}
 }
 
-// MfccFilterbankChannelCount sets the optional filterbank_channel_count attribute to value.
+// TryRpcTimeoutInMs sets the optional timeout_in_ms attribute to value.
 //
-// value: Resolution of the Mel bank used internally.
-// If not specified, defaults to 40
-func MfccFilterbankChannelCount(value int64) MfccAttr {
+// value: `int`. If `0` (default), then the kernel will run the RPC
+// request and only time out if the RPC deadline passes or the session times out.
+// If this value is greater than `0`, then the op will raise an exception if
+// the RPC takes longer than `timeout_in_ms`.
+// If not specified, defaults to 0
+func TryRpcTimeoutInMs(value int64) TryRpcAttr {
 	return func(m optionalAttr) {
-		m["filterbank_channel_count"] = value
+		m["timeout_in_ms"] = value
 	}
 }
 
-// MfccDctCoefficientCount sets the optional dct_coefficient_count attribute to value.
+// Perform batches of RPC requests.
 //
-// value: How many output channels to produce per time slice.
-// If not specified, defaults to 13
-func MfccDctCoefficientCount(value int64) MfccAttr {
-	return func(m optionalAttr) {
-		m["dct_coefficient_count"] = value
-	}
-}
-
-// Transforms a spectrogram into a form that's useful for speech recognition.
+// This op asynchronously performs either a single RPC request, or a batch
+// of requests.  RPC requests are defined by three main parameters:
 //
-// Mel Frequency Cepstral Coefficients are a way of representing audio data that's
-// been effective as an input feature for machine learning. They are created by
-// taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of the
-// higher frequencies that are less significant to the human ear. They have a long
-// history in the speech recognition world, and https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
-// is a good resource to learn more.
+//   - `address` (the host+port or BNS address of the request)
+//   - `method` (the method name for the request)
+//   - `request` (the serialized proto string, or vector of strings,
+//      of the RPC request argument).
+//
+// For example, if you have an RPC service running on port localhost:2345,
+// and its interface is configured with the following proto declaration:
+//
+// ```
+// service MyService {
+//   rpc MyMethod(MyRequestProto) returns (MyResponseProto) {
+//   }
+// };
+// ```
+//
+// then call this op with arguments:
+//
+// ```
+// address = "localhost:2345"
+// method = "MyService/MyMethod"
+// ```
+//
+// The `request` tensor is a string tensor representing serialized `MyRequestProto`
+// strings; and the output string tensor `response` will have the same shape
+// and contain (upon successful completion) corresponding serialized
+// `MyResponseProto` strings.
+//
+// For example, to send a single, empty, `MyRequestProto`, call
+// this op with `request = ""`.  To send 5 **parallel** empty requests,
+// call this op with `request = ["", "", "", "", ""]`.
+//
+// More generally, one can create a batch of `MyRequestProto` serialized protos
+// from regular batched tensors using the `encode_proto` op, and convert
+// the response `MyResponseProto` serialized protos to batched tensors
+// using the `decode_proto` op.
+//
+// **NOTE** Working with serialized proto strings is faster than instantiating
+// actual proto objects in memory, so no performance degradation is expected
+// compared to writing custom kernels for this workflow.
+//
+// Unlike the standard `Rpc` op, if the connection fails or the remote worker
+// returns an error status, this op does **not** reraise the exception.
+// Instead, the `status_code` and `status_message` entry for the corresponding RPC
+// call is set with the error returned from the RPC call.  The `response` tensor
+// will contain valid response values for those minibatch entries whose RPCs did
+// not fail; the rest of the entries will have empty strings.
 //
 // Arguments:
-//	spectrogram: Typically produced by the Spectrogram op, with magnitude_squared
-// set to true.
-//	sample_rate: How many samples per second the source audio used.
-func Mfcc(scope *Scope, spectrogram tf.Output, sample_rate tf.Output, optional ...MfccAttr) (output tf.Output) {
+//	address: `0-D` or `1-D`.  The address (i.e. host_name:port) of the RPC server.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `method` and `request`.
+//	method: `0-D` or `1-D`.  The method address on the RPC server.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `address` and `request`.
+//	request: `0-D` or `1-D`.  Serialized proto strings: the rpc request argument.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `address` and `method`.
+//
+// Returns Same shape as `request`. Serialized proto strings: the rpc responses.Same shape as `request`.  Values correspond to tensorflow Status enum codes.Same shape as `request`.  Values correspond to Status messages
+// returned from the RPC calls.
+func TryRpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, optional ...TryRpcAttr) (response tf.Output, status_code tf.Output, status_message tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -19923,356 +24253,267 @@ func Mfcc(scope *Scope, spectrogram tf.Output, sample_rate tf.Output, optional .
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Mfcc",
+		Type: "TryRpc",
 		Input: []tf.Input{
-			spectrogram, sample_rate,
+			address, method, request,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Compute the Hurwitz zeta function \\(\zeta(x, q)\\).
-//
-// The Hurwitz zeta function is defined as:
-//
+// Add all input tensors element wise.
 //
-// \\(\zeta(x, q) = \sum_{n=0}^{\infty} (q + n)^{-x}\\)
-func Zeta(scope *Scope, x tf.Output, q tf.Output) (z tf.Output) {
+// Arguments:
+//	inputs: Must all be the same size and shape.
+func AddN(scope *Scope, inputs []tf.Output) (sum tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Zeta",
+		Type: "AddN",
 		Input: []tf.Input{
-			x, q,
+			tf.OutputList(inputs),
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Inverse fast Fourier transform.
-//
-// Computes the inverse 1-dimensional discrete Fourier transform over the
-// inner-most dimension of `input`.
-//
-// Arguments:
-//	input: A complex64 tensor.
-//
-// Returns A complex64 tensor of the same shape as `input`. The inner-most
-//   dimension of `input` is replaced with its inverse 1D Fourier transform.
+// RetrieveTPUEmbeddingMDLAdagradLightParametersAttr is an optional argument to RetrieveTPUEmbeddingMDLAdagradLightParameters.
+type RetrieveTPUEmbeddingMDLAdagradLightParametersAttr func(optionalAttr)
+
+// RetrieveTPUEmbeddingMDLAdagradLightParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.ifft
-// @end_compatibility
-func IFFT(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "IFFT",
-		Input: []tf.Input{
-			input,
-		},
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingMDLAdagradLightParametersTableId(value int64) RetrieveTPUEmbeddingMDLAdagradLightParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// 2D fast Fourier transform.
-//
-// Computes the 2-dimensional discrete Fourier transform over the inner-most
-// 2 dimensions of `input`.
-//
-// Arguments:
-//	input: A complex64 tensor.
-//
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 2
-//   dimensions of `input` are replaced with their 2D Fourier transform.
-//
-// @compatibility(numpy)
-// Equivalent to np.fft.fft2
-// @end_compatibility
-func FFT2D(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "FFT2D",
-		Input: []tf.Input{
-			input,
-		},
+// RetrieveTPUEmbeddingMDLAdagradLightParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingMDLAdagradLightParametersTableName(value string) RetrieveTPUEmbeddingMDLAdagradLightParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Inverse 2D fast Fourier transform.
-//
-// Computes the inverse 2-dimensional discrete Fourier transform over the
-// inner-most 2 dimensions of `input`.
-//
-// Arguments:
-//	input: A complex64 tensor.
+// Retrieve MDL Adagrad Light embedding parameters.
 //
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 2
-//   dimensions of `input` are replaced with their inverse 2D Fourier transform.
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.ifft2
-// @end_compatibility
-func IFFT2D(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns Parameter parameters updated by the MDL Adagrad Light optimization algorithm.Parameter accumulators updated by the MDL Adagrad Light optimization algorithm.Parameter weights updated by the MDL Adagrad Light optimization algorithm.Parameter benefits updated by the MDL Adagrad Light optimization algorithm.
+func RetrieveTPUEmbeddingMDLAdagradLightParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingMDLAdagradLightParametersAttr) (parameters tf.Output, accumulators tf.Output, weights tf.Output, benefits tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "IFFT2D",
-		Input: []tf.Input{
-			input,
-		},
+		Type: "RetrieveTPUEmbeddingMDLAdagradLightParameters",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// Returns element-wise remainder of division. This emulates C semantics in that
-//
-// the result here is consistent with a truncating divide. E.g. `truncate(x / y) *
-// y + truncate_mod(x, y) = x`.
+// RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebug.
+type RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugAttr func(optionalAttr)
+
+// RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// *NOTE*: `TruncateMod` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func TruncateMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "TruncateMod",
-		Input: []tf.Input{
-			x, y,
-		},
+}
+
+// RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Inverse 2D real-valued fast Fourier transform.
-//
-// Computes the inverse 2-dimensional discrete Fourier transform of a real-valued
-// signal over the inner-most 2 dimensions of `input`.
-//
-// The inner-most 2 dimensions of `input` are assumed to be the result of `RFFT2D`:
-// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
-// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
-// from the size of the inner-most 2 dimensions of `input`. If the FFT length used
-// to compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
-//
-// Along each axis `IRFFT2D` is computed on, if `fft_length` (or
-// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
-//
-// Arguments:
-//	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
+// Retrieve Adadelta embedding parameters with debug support.
 //
-// Returns A float32 tensor of the same rank as `input`. The inner-most 2
-//   dimensions of `input` are replaced with the `fft_length` samples of their
-//   inverse 2D Fourier transform.
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.irfft2
-// @end_compatibility
-func IRFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Returns Parameter parameters updated by the Adadelta optimization algorithm.Parameter accumulators updated by the Adadelta optimization algorithm.Parameter updates updated by the Adadelta optimization algorithm.Parameter gradient_accumulators updated by the Adadelta optimization algorithm.
+func RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugAttr) (parameters tf.Output, accumulators tf.Output, updates tf.Output, gradient_accumulators tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "IRFFT2D",
-		Input: []tf.Input{
-			input, fft_length,
-		},
+		Type: "RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebug",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// DecodeJpegAttr is an optional argument to DecodeJpeg.
-type DecodeJpegAttr func(optionalAttr)
+// MapClearAttr is an optional argument to MapClear.
+type MapClearAttr func(optionalAttr)
 
-// DecodeJpegChannels sets the optional channels attribute to value.
-//
-// value: Number of color channels for the decoded image.
+// MapClearCapacity sets the optional capacity attribute to value.
 // If not specified, defaults to 0
-func DecodeJpegChannels(value int64) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["channels"] = value
-	}
-}
-
-// DecodeJpegRatio sets the optional ratio attribute to value.
-//
-// value: Downscaling ratio.
-// If not specified, defaults to 1
-func DecodeJpegRatio(value int64) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["ratio"] = value
-	}
-}
-
-// DecodeJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
 //
-// value: If true use a slower but nicer upscaling of the
-// chroma planes (yuv420/422 only).
-// If not specified, defaults to true
-func DecodeJpegFancyUpscaling(value bool) DecodeJpegAttr {
+// REQUIRES: value >= 0
+func MapClearCapacity(value int64) MapClearAttr {
 	return func(m optionalAttr) {
-		m["fancy_upscaling"] = value
+		m["capacity"] = value
 	}
 }
 
-// DecodeJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
+// MapClearMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// value: If true try to recover an image from truncated input.
-// If not specified, defaults to false
-func DecodeJpegTryRecoverTruncated(value bool) DecodeJpegAttr {
+// REQUIRES: value >= 0
+func MapClearMemoryLimit(value int64) MapClearAttr {
 	return func(m optionalAttr) {
-		m["try_recover_truncated"] = value
+		m["memory_limit"] = value
 	}
 }
 
-// DecodeJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
-//
-// value: The minimum required fraction of lines before a truncated
-// input is accepted.
-// If not specified, defaults to 1
-func DecodeJpegAcceptableFraction(value float32) DecodeJpegAttr {
+// MapClearContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapClearContainer(value string) MapClearAttr {
 	return func(m optionalAttr) {
-		m["acceptable_fraction"] = value
+		m["container"] = value
 	}
 }
 
-// DecodeJpegDctMethod sets the optional dct_method attribute to value.
-//
-// value: string specifying a hint about the algorithm used for
-// decompression.  Defaults to "" which maps to a system-specific
-// default.  Currently valid values are ["INTEGER_FAST",
-// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
-// jpeg library changes to a version that does not have that specific
-// option.)
+// MapClearSharedName sets the optional shared_name attribute to value.
 // If not specified, defaults to ""
-func DecodeJpegDctMethod(value string) DecodeJpegAttr {
+func MapClearSharedName(value string) MapClearAttr {
 	return func(m optionalAttr) {
-		m["dct_method"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Decode a JPEG-encoded image to a uint8 tensor.
-//
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
-//
-// Accepted values are:
-//
-// *   0: Use the number of channels in the JPEG-encoded image.
-// *   1: output a grayscale image.
-// *   3: output an RGB image.
-//
-// If needed, the JPEG-encoded image is transformed to match the requested number
-// of color channels.
-//
-// The attr `ratio` allows downscaling the image by an integer factor during
-// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
-// downscaling the image later.
-//
-//
-// This op also supports decoding PNGs and non-animated GIFs since the interface is
-// the same, though it is cleaner to use `tf.image.decode_image`.
-//
-// Arguments:
-//	contents: 0-D.  The JPEG-encoded image.
+// Op removes all elements in the underlying container.
 //
-// Returns 3-D with shape `[height, width, channels]`..
-func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (image tf.Output) {
+// Returns the created operation.
+func MapClear(scope *Scope, dtypes []tf.DataType, optional ...MapClearAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeJpeg",
-		Input: []tf.Input{
-			contents,
-		},
+		Type: "MapClear",
+
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Inverse 3D real-valued fast Fourier transform.
+// DecodeCSVAttr is an optional argument to DecodeCSV.
+type DecodeCSVAttr func(optionalAttr)
+
+// DecodeCSVFieldDelim sets the optional field_delim attribute to value.
 //
-// Computes the inverse 3-dimensional discrete Fourier transform of a real-valued
-// signal over the inner-most 3 dimensions of `input`.
+// value: char delimiter to separate fields in a record.
+// If not specified, defaults to ","
+func DecodeCSVFieldDelim(value string) DecodeCSVAttr {
+	return func(m optionalAttr) {
+		m["field_delim"] = value
+	}
+}
+
+// DecodeCSVUseQuoteDelim sets the optional use_quote_delim attribute to value.
 //
-// The inner-most 3 dimensions of `input` are assumed to be the result of `RFFT3D`:
-// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
-// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
-// from the size of the inner-most 3 dimensions of `input`. If the FFT length used
-// to compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
+// value: If false, treats double quotation marks as regular
+// characters inside of the string fields (ignoring RFC 4180, Section 2,
+// Bullet 5).
+// If not specified, defaults to true
+func DecodeCSVUseQuoteDelim(value bool) DecodeCSVAttr {
+	return func(m optionalAttr) {
+		m["use_quote_delim"] = value
+	}
+}
+
+// DecodeCSVNaValue sets the optional na_value attribute to value.
 //
-// Along each axis `IRFFT3D` is computed on, if `fft_length` (or
-// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
+// value: Additional string to recognize as NA/NaN.
+// If not specified, defaults to ""
+func DecodeCSVNaValue(value string) DecodeCSVAttr {
+	return func(m optionalAttr) {
+		m["na_value"] = value
+	}
+}
+
+// DecodeCSVSelectCols sets the optional select_cols attribute to value.
+// If not specified, defaults to <>
+func DecodeCSVSelectCols(value []int64) DecodeCSVAttr {
+	return func(m optionalAttr) {
+		m["select_cols"] = value
+	}
+}
+
+// Convert CSV records to tensors. Each column maps to one tensor.
 //
-// Arguments:
-//	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
+// RFC 4180 format is expected for the CSV records.
+// (https://tools.ietf.org/html/rfc4180)
+// Note that we allow leading and trailing spaces with int or float field.
 //
-// Returns A float32 tensor of the same rank as `input`. The inner-most 3
-//   dimensions of `input` are replaced with the `fft_length` samples of their
-//   inverse 3D real Fourier transform.
+// Arguments:
+//	records: Each string is a record/row in the csv and all records should have
+// the same format.
+//	record_defaults: One tensor per column of the input record, with either a
+// scalar default value for that column or an empty vector if the column is
+// required.
 //
-// @compatibility(numpy)
-// Equivalent to np.irfftn with 3 dimensions.
-// @end_compatibility
-func IRFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Returns Each tensor will have the same shape as records.
+func DecodeCSV(scope *Scope, records tf.Output, record_defaults []tf.Output, optional ...DecodeCSVAttr) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "IRFFT3D",
+		Type: "DecodeCSV",
 		Input: []tf.Input{
-			input, fft_length,
+			records, tf.OutputList(record_defaults),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns the truth value of (x != y) element-wise.
-//
-// *NOTE*: `NotEqual` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func NotEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "NotEqual",
-		Input: []tf.Input{
-			x, y,
-		},
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("DecodeCSV", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return output
 }
 
 // Produces the max pool of the input tensor for quantized types.
@@ -20304,121 +24545,166 @@ func QuantizedMaxPool(scope *Scope, input tf.Output, min_input tf.Output, max_in
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes softplus: `log(exp(features) + 1)`.
-func Softplus(scope *Scope, features tf.Output) (activations tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Softplus",
-		Input: []tf.Input{
-			features,
-		},
+// RandomShuffleAttr is an optional argument to RandomShuffle.
+type RandomShuffleAttr func(optionalAttr)
+
+// RandomShuffleSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomShuffleSeed(value int64) RandomShuffleAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes exponential of x - 1 element-wise.
+// RandomShuffleSeed2 sets the optional seed2 attribute to value.
 //
-// I.e., \\(y = (\exp x) - 1\\).
-func Expm1(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Expm1",
-		Input: []tf.Input{
-			x,
-		},
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomShuffleSeed2(value int64) RandomShuffleAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Returns the number of records this Reader has produced.
+// Randomly shuffles a tensor along its first dimension.
 //
-// This is the same as the number of ReaderRead executions that have
-// succeeded.
+//   The tensor is shuffled along dimension 0, such that each `value[j]` is mapped
+//   to one and only one `output[i]`. For example, a mapping that might occur for a
+//   3x2 tensor is:
+//
+// ```
+// [[1, 2],       [[5, 6],
+//  [3, 4],  ==>   [1, 2],
+//  [5, 6]]        [3, 4]]
+// ```
 //
 // Arguments:
-//	reader_handle: Handle to a Reader.
-func ReaderNumRecordsProducedV2(scope *Scope, reader_handle tf.Output) (records_produced tf.Output) {
+//	value: The tensor to be shuffled.
+//
+// Returns A tensor of same shape and type as `value`, shuffled along its first
+// dimension.
+func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ReaderNumRecordsProducedV2",
+		Type: "RandomShuffle",
 		Input: []tf.Input{
-			reader_handle,
+			value,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the set of files matching one or more glob patterns.
+// EnqueueTPUEmbeddingSparseBatchAttr is an optional argument to EnqueueTPUEmbeddingSparseBatch.
+type EnqueueTPUEmbeddingSparseBatchAttr func(optionalAttr)
+
+// EnqueueTPUEmbeddingSparseBatchDeviceOrdinal sets the optional device_ordinal attribute to value.
 //
-// Note that this routine only supports wildcard characters in the
-// basename portion of the pattern, not in the directory portion.
-// Note also that the order of filenames returned can be non-deterministic.
+// value: The TPU device to use. Should be >= 0 and less than the number
+// of TPU cores in the task on which the node is placed.
+// If not specified, defaults to -1
+func EnqueueTPUEmbeddingSparseBatchDeviceOrdinal(value int64) EnqueueTPUEmbeddingSparseBatchAttr {
+	return func(m optionalAttr) {
+		m["device_ordinal"] = value
+	}
+}
+
+// EnqueueTPUEmbeddingSparseBatchCombiners sets the optional combiners attribute to value.
+//
+// value: A list of string scalars, one for each embedding table that specify
+// how to normalize the embedding activations after weighted summation.
+// Supported combiners are 'mean', 'sum', or 'sqrtn'. It is invalid to have
+// the sum of the weights be 0 for 'mean' or the sum of the squared weights be
+// 0 for 'sqrtn'. If combiners isn't passed, the default is to use 'sum' for
+// all tables.
+// If not specified, defaults to <>
+func EnqueueTPUEmbeddingSparseBatchCombiners(value []string) EnqueueTPUEmbeddingSparseBatchAttr {
+	return func(m optionalAttr) {
+		m["combiners"] = value
+	}
+}
+
+// An op that enqueues TPUEmbedding input indices from a SparseTensor.
+//
+// This Op eases the porting of code that uses embedding_lookup_sparse(),
+// although some Python preprocessing of the SparseTensor arguments to
+// embedding_lookup_sparse() is required to produce the arguments to this Op,
+// since only a single EnqueueTPUEmbeddingSparseBatch Op is allowed per training
+// step.
+//
+// The tensors at corresponding positions in the three input lists
+// must have the same shape, i.e. rank 1 with dim_size() equal to the total
+// number of lookups into the table described by the corresponding table_id.
 //
 // Arguments:
-//	pattern: Shell wildcard pattern(s). Scalar or vector of type string.
+//	sample_indices: A list of rank 1 Tensors specifying the training example and
+// feature to which the corresponding embedding_indices and aggregation_weights
+// values belong. sample_indices[i] must equal b * nf + f, where nf is the
+// number of features from the corresponding table, f is in [0, nf), and
+// b is in [0, batch size).
+//	embedding_indices: A list of rank 1 Tensors, indices into the embedding tables.
+//	aggregation_weights: A list of rank 1 Tensors containing per sample -- i.e. per
+// (training example, feature) -- aggregation weights.
+//	mode_override: A string input that overrides the mode specified in the
+// TPUEmbeddingConfiguration. Supported values are {'unspecified', 'inference',
+// 'training', 'backward_pass_only'}. When set to 'unspecified', the mode set
+// in TPUEmbeddingConfiguration is used, otherwise mode_override is used.
 //
-// Returns A vector of matching filenames.
-func MatchingFiles(scope *Scope, pattern tf.Output) (filenames tf.Output) {
+// Returns the created operation.
+func EnqueueTPUEmbeddingSparseBatch(scope *Scope, sample_indices []tf.Output, embedding_indices []tf.Output, aggregation_weights []tf.Output, mode_override tf.Output, optional ...EnqueueTPUEmbeddingSparseBatchAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "MatchingFiles",
+		Type: "EnqueueTPUEmbeddingSparseBatch",
 		Input: []tf.Input{
-			pattern,
+			tf.OutputList(sample_indices), tf.OutputList(embedding_indices), tf.OutputList(aggregation_weights), mode_override,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// HistogramFixedWidthAttr is an optional argument to HistogramFixedWidth.
-type HistogramFixedWidthAttr func(optionalAttr)
+// StatelessRandomNormalAttr is an optional argument to StatelessRandomNormal.
+type StatelessRandomNormalAttr func(optionalAttr)
 
-// HistogramFixedWidthDtype sets the optional dtype attribute to value.
-// If not specified, defaults to DT_INT32
-func HistogramFixedWidthDtype(value tf.DataType) HistogramFixedWidthAttr {
+// StatelessRandomNormalDtype sets the optional dtype attribute to value.
+//
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessRandomNormalDtype(value tf.DataType) StatelessRandomNormalAttr {
 	return func(m optionalAttr) {
 		m["dtype"] = value
 	}
 }
 
-// Return histogram of values.
-//
-// Given the tensor `values`, this operation returns a rank 1 histogram counting
-// the number of entries in `values` that fall into every bin.  The bins are
-// equal width and determined by the arguments `value_range` and `nbins`.
+// Outputs deterministic pseudorandom values from a normal distribution.
 //
-// ```python
-// # Bins will be:  (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
-// nbins = 5
-// value_range = [0.0, 5.0]
-// new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
+// The generated values will have mean 0 and standard deviation 1.
 //
-// with tf.get_default_session() as sess:
-//   hist = tf.histogram_fixed_width(new_values, value_range, nbins=5)
-//   variables.global_variables_initializer().run()
-//   sess.run(hist) => [2, 1, 1, 0, 2]
-// ```
+// The outputs are a deterministic function of `shape` and `seed`.
 //
 // Arguments:
-//	values: Numeric `Tensor`.
-//	value_range: Shape [2] `Tensor` of same `dtype` as `values`.
-// values <= value_range[0] will be mapped to hist[0],
-// values >= value_range[1] will be mapped to hist[-1].
-//	nbins: Scalar `int32 Tensor`.  Number of histogram bins.
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
 //
-// Returns A 1-D `Tensor` holding histogram of values.
-func HistogramFixedWidth(scope *Scope, values tf.Output, value_range tf.Output, nbins tf.Output, optional ...HistogramFixedWidthAttr) (out tf.Output) {
+// Returns Random values with specified shape.
+func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -20427,9 +24713,9 @@ func HistogramFixedWidth(scope *Scope, values tf.Output, value_range tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "HistogramFixedWidth",
+		Type: "StatelessRandomNormal",
 		Input: []tf.Input{
-			values, value_range, nbins,
+			shape, seed,
 		},
 		Attrs: attrs,
 	}
@@ -20437,261 +24723,316 @@ func HistogramFixedWidth(scope *Scope, values tf.Output, value_range tf.Output,
 	return op.Output(0)
 }
 
-// Returns the truth value of (x >= y) element-wise.
+// An Op to exchange data across TPU replicas.
 //
-// *NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func GreaterEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// On each replica, the input is split into `split_count` blocks along
+// `split_dimension` and send to the other replicas given group_assignment. After
+// receiving `split_count` - 1 blocks from other replicas, we concatenate the
+// blocks along `concat_dimension` as the output.
+//
+// For example, suppose there are 2 TPU replicas:
+// replica 0 receives input: `[[A, B]]`
+// replica 1 receives input: `[[C, D]]`
+//
+// group_assignment=`[[0, 1]]`
+// concat_dimension=0
+// split_dimension=1
+// split_count=2
+//
+// replica 0's output: `[[A], [C]]`
+// replica 1's output: `[[B], [D]]`
+//
+// Arguments:
+//	input: The local input to the sum.
+//	group_assignment: An int32 tensor with shape
+// [num_groups, num_replicas_per_group]. `group_assignment[i]` represents the
+// replica ids in the ith subgroup.
+//	concat_dimension: The dimension number to concatenate.
+//	split_dimension: The dimension number to split.
+//	split_count: The number of splits, this number must equal to the sub-group
+// size(group_assignment.get_shape()[1])
+//
+// Returns The exchanged result.
+func AllToAll(scope *Scope, input tf.Output, group_assignment tf.Output, concat_dimension int64, split_dimension int64, split_count int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"concat_dimension": concat_dimension, "split_dimension": split_dimension, "split_count": split_count}
 	opspec := tf.OpSpec{
-		Type: "GreaterEqual",
+		Type: "AllToAll",
 		Input: []tf.Input{
-			x, y,
+			input, group_assignment,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Conv3DAttr is an optional argument to Conv3D.
-type Conv3DAttr func(optionalAttr)
-
-// Conv3DDataFormat sets the optional data_format attribute to value.
-//
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func Conv3DDataFormat(value string) Conv3DAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Conv3DDilations sets the optional dilations attribute to value.
-//
-// value: 1-D tensor of length 5.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
-func Conv3DDilations(value []int64) Conv3DAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes a 3-D convolution given 5-D `input` and `filter` tensors.
-//
-// In signal processing, cross-correlation is a measure of similarity of
-// two waveforms as a function of a time-lag applied to one of them. This
-// is also known as a sliding dot product or sliding inner-product.
+// Adds a value to the current value of a variable.
 //
-// Our Conv3D implements a form of cross-correlation.
+// Any ReadVariableOp with a control dependency on this op is guaranteed to
+// see the incremented value or a subsequent newer one.
 //
 // Arguments:
-//	input: Shape `[batch, in_depth, in_height, in_width, in_channels]`.
-//	filter: Shape `[filter_depth, filter_height, filter_width, in_channels,
-// out_channels]`. `in_channels` must match between `input` and `filter`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv3DAttr) (output tf.Output) {
+//	resource: handle to the resource in which to store the variable.
+//	value: the value by which the variable will be incremented.
+//
+// Returns the created operation.
+func AssignAddVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Conv3D",
+		Type: "AssignAddVariableOp",
 		Input: []tf.Input{
-			input, filter,
+			resource, value,
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Adds up a SparseTensor and a dense Tensor, using these special rules:
+// Real-valued fast Fourier transform.
 //
-// (1) Broadcasts the dense side to have the same shape as the sparse side, if
-//     eligible;
-// (2) Then, only the dense values pointed to by the indices of the SparseTensor
-//     participate in the cwise addition.
+// Computes the 1-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most dimension of `input`.
 //
-// By these rules, the result is a logical SparseTensor with exactly the same
-// indices and shape, but possibly with different non-zero values.  The output of
-// this Op is the resultant non-zero values.
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the
+// `fft_length / 2 + 1` unique components of the FFT: the zero-frequency term,
+// followed by the `fft_length / 2` positive-frequency terms.
+//
+// Along the axis `RFFT` is computed on, if `fft_length` is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
 //
 // Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
+//	input: A float32 tensor.
+//	fft_length: An int32 tensor of shape [1]. The FFT length.
 //
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseAdd(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+// Returns A complex64 tensor of the same rank as `input`. The inner-most
+//   dimension of `input` is replaced with the `fft_length / 2 + 1` unique
+//   frequency components of its 1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.rfft
+// @end_compatibility
+func RFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseAdd",
+		Type: "RFFT",
 		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
+			input, fft_length,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizeV2Attr is an optional argument to QuantizeV2.
-type QuantizeV2Attr func(optionalAttr)
+// RetrieveTPUEmbeddingAdadeltaParametersAttr is an optional argument to RetrieveTPUEmbeddingAdadeltaParameters.
+type RetrieveTPUEmbeddingAdadeltaParametersAttr func(optionalAttr)
 
-// QuantizeV2Mode sets the optional mode attribute to value.
-// If not specified, defaults to "MIN_COMBINED"
-func QuantizeV2Mode(value string) QuantizeV2Attr {
+// RetrieveTPUEmbeddingAdadeltaParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingAdadeltaParametersTableId(value int64) RetrieveTPUEmbeddingAdadeltaParametersAttr {
 	return func(m optionalAttr) {
-		m["mode"] = value
+		m["table_id"] = value
 	}
 }
 
-// QuantizeV2RoundMode sets the optional round_mode attribute to value.
-// If not specified, defaults to "HALF_AWAY_FROM_ZERO"
-func QuantizeV2RoundMode(value string) QuantizeV2Attr {
+// RetrieveTPUEmbeddingAdadeltaParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingAdadeltaParametersTableName(value string) RetrieveTPUEmbeddingAdadeltaParametersAttr {
 	return func(m optionalAttr) {
-		m["round_mode"] = value
+		m["table_name"] = value
 	}
 }
 
-// Quantize the 'input' tensor of type float to 'output' tensor of type 'T'.
-//
-// [min_range, max_range] are scalar floats that specify the range for
-// the 'input' data. The 'mode' attribute controls exactly which calculations are
-// used to convert the float values to their quantized equivalents.  The
-// 'round_mode' attribute controls which rounding tie-breaking algorithm is used
-// when rounding float values to their quantized equivalents.
-//
-// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
-//
-// ```
-// out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
-// if T == qint8: out[i] -= (range(T) + 1) / 2.0
-// ```
-//
-// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
-//
-// *MIN_COMBINED Mode Example*
-//
-// Assume the input is type float and has a possible range of [0.0, 6.0] and the
-// output type is quint8 ([0, 255]). The min_range and max_range values should be
-// specified as 0.0 and 6.0. Quantizing from float to quint8 will multiply each
-// value of the input by 255/6 and cast to quint8.
-//
-// If the output type was qint8 ([-128, 127]), the operation will additionally
-// subtract each value by 128 prior to casting, so that the range of values aligns
-// with the range of qint8.
+// Retrieve Adadelta embedding parameters.
 //
-// If the mode is 'MIN_FIRST', then this approach is used:
-//
-// ```
-// num_discrete_values = 1 << (# of bits in T)
-// range_adjust = num_discrete_values / (num_discrete_values - 1)
-// range = (range_max - range_min) * range_adjust
-// range_scale = num_discrete_values / range
-// quantized = round(input * range_scale) - round(range_min * range_scale) +
-//   numeric_limits<T>::min()
-// quantized = max(quantized, numeric_limits<T>::min())
-// quantized = min(quantized, numeric_limits<T>::max())
-// ```
-//
-// The biggest difference between this and MIN_COMBINED is that the minimum range
-// is rounded first, before it's subtracted from the rounded value. With
-// MIN_COMBINED, a small bias is introduced where repeated iterations of quantizing
-// and dequantizing will introduce a larger and larger error.
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
 //
-// *SCALED mode Example*
+// Returns Parameter parameters updated by the Adadelta optimization algorithm.Parameter accumulators updated by the Adadelta optimization algorithm.Parameter updates updated by the Adadelta optimization algorithm.
+func RetrieveTPUEmbeddingAdadeltaParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingAdadeltaParametersAttr) (parameters tf.Output, accumulators tf.Output, updates tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RetrieveTPUEmbeddingAdadeltaParameters",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// UpperBoundAttr is an optional argument to UpperBound.
+type UpperBoundAttr func(optionalAttr)
+
+// UpperBoundOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func UpperBoundOutType(value tf.DataType) UpperBoundAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Applies upper_bound(sorted_search_values, values) along each row.
 //
-// `SCALED` mode matches the quantization approach used in
-// `QuantizeAndDequantize{V2|V3}`.
+// Each set of rows with the same index in (sorted_inputs, values) is treated
+// independently.  The resulting row is the equivalent of calling
+// `np.searchsorted(sorted_inputs, values, side='right')`.
 //
-// If the mode is `SCALED`, we do not use the full range of the output type,
-// choosing to elide the lowest possible value for symmetry (e.g., output range is
-// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
-// 0.
+// The result is not a global index to the entire
+// `Tensor`, but rather just the index in the last dimension.
 //
-// We first find the range of values in our tensor. The
-// range we use is always centered on 0, so we find m such that
+// A 2-D example:
+//   sorted_sequence = [[0, 3, 9, 9, 10],
+//                      [1, 2, 3, 4, 5]]
+//   values = [[2, 4, 9],
+//             [0, 2, 6]]
 //
-// ```c++
-//   m = max(abs(input_min), abs(input_max))
-// ```
+//   result = UpperBound(sorted_sequence, values)
 //
-// Our input tensor range is then `[-m, m]`.
+//   result == [[1, 2, 4],
+//              [0, 2, 5]]
 //
-// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
-// If T is signed, this is
+// Arguments:
+//	sorted_inputs: 2-D Tensor where each row is ordered.
+//	values: 2-D Tensor with the same numbers of rows as `sorted_search_values`. Contains
+// the values that will be searched for in `sorted_search_values`.
 //
-// ```
-//   num_bits = sizeof(T) * 8
-//   [min_fixed, max_fixed] =
-//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
-// ```
+// Returns A `Tensor` with the same shape as `values`.  It contains the last scalar index
+// into the last dimension where values can be inserted without changing the
+// ordered property.
+func UpperBound(scope *Scope, sorted_inputs tf.Output, values tf.Output, optional ...UpperBoundAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "UpperBound",
+		Input: []tf.Input{
+			sorted_inputs, values,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// FractionalMaxPoolGradAttr is an optional argument to FractionalMaxPoolGrad.
+type FractionalMaxPoolGradAttr func(optionalAttr)
+
+// FractionalMaxPoolGradOverlapping sets the optional overlapping attribute to value.
 //
-// Otherwise, if T is unsigned, the fixed-point range is
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
 //
-// ```
-//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
-// ```
+// `index  0  1  2  3  4`
 //
-// From this we compute our scaling factor, s:
+// `value  20 5  16 3  7`
 //
-// ```c++
-//   s = (max_fixed - min_fixed) / (2 * m)
-// ```
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [20, 16] for fractional max pooling.
+// If not specified, defaults to false
+func FractionalMaxPoolGradOverlapping(value bool) FractionalMaxPoolGradAttr {
+	return func(m optionalAttr) {
+		m["overlapping"] = value
+	}
+}
+
+// Computes gradient of the FractionalMaxPool function.
 //
-// Now we can quantize the elements of our tensor:
+// Arguments:
+//	orig_input: Original input for `fractional_max_pool`
+//	orig_output: Original output for `fractional_max_pool`
+//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
+// w.r.t. the output of `fractional_max_pool`.
+//	row_pooling_sequence: row pooling sequence, form pooling region with
+// col_pooling_sequence.
+//	col_pooling_sequence: column pooling sequence, form pooling region with
+// row_pooling sequence.
 //
-// ```c++
-// result = round(input * s)
-// ```
+// Returns 4-D.  Gradients w.r.t. the input of `fractional_max_pool`.
+func FractionalMaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalMaxPoolGradAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "FractionalMaxPoolGrad",
+		Input: []tf.Input{
+			orig_input, orig_output, out_backprop, row_pooling_sequence, col_pooling_sequence,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// SparseReduceMaxSparseAttr is an optional argument to SparseReduceMaxSparse.
+type SparseReduceMaxSparseAttr func(optionalAttr)
+
+// SparseReduceMaxSparseKeepDims sets the optional keep_dims attribute to value.
 //
-// One thing to watch out for is that the operator may choose to adjust the
-// requested minimum and maximum values slightly during the quantization process,
-// so you should always use the output ports as the range for further calculations.
-// For example, if the requested minimum and maximum values are close to equal,
-// they will be separated by a small epsilon value to prevent ill-formed quantized
-// buffers from being created. Otherwise, you can end up with buffers where all the
-// quantized values map to the same float value, which causes problems for
-// operations that have to perform further calculations on them.
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceMaxSparseKeepDims(value bool) SparseReduceMaxSparseAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the max of elements across dimensions of a SparseTensor.
 //
-// Arguments:
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_max()`.  In contrast to SparseReduceMax, this Op returns a
+// SparseTensor.
 //
-//	min_range: The minimum scalar value possibly produced for the input.
-//	max_range: The maximum scalar value possibly produced for the input.
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
 //
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
 //
-// Returns The quantized data produced from the float input.The actual minimum scalar value used for the output.The actual maximum scalar value used for the output.
-func QuantizeV2(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, T tf.DataType, optional ...QuantizeV2Attr) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+// Arguments:
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+func SparseReduceMaxSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"T": T}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizeV2",
+		Type: "SparseReduceMaxSparse",
 		Input: []tf.Input{
-			input, min_range, max_range,
+			input_indices, input_values, input_shape, reduction_axes,
 		},
 		Attrs: attrs,
 	}
@@ -20699,81 +25040,95 @@ func QuantizeV2(scope *Scope, input tf.Output, min_range tf.Output, max_range tf
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Returns the truth value of (x < y) element-wise.
+// Convert one or more images from HSV to RGB.
 //
-// *NOTE*: `Less` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Less(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Outputs a tensor of the same shape as the `images` tensor, containing the RGB
+// value of the pixels. The output is only well defined if the value in `images`
+// are in `[0,1]`.
+//
+// See `rgb_to_hsv` for a description of the HSV encoding.
+//
+// Arguments:
+//	images: 1-D or higher rank. HSV data to convert. Last dimension must be size 3.
+//
+// Returns `images` converted to RGB.
+func HSVToRGB(scope *Scope, images tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Less",
+		Type: "HSVToRGB",
 		Input: []tf.Input{
-			x, y,
+			images,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizedReluXAttr is an optional argument to QuantizedReluX.
-type QuantizedReluXAttr func(optionalAttr)
-
-// QuantizedReluXOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedReluXOutType(value tf.DataType) QuantizedReluXAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
+// Computes the gradient of the sigmoid of `x` wrt its input.
+//
+// Specifically, `grad = dy * y * (1 - y)`, where `y = sigmoid(x)`, and
+// `dy` is the corresponding input gradient.
+func SigmoidGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SigmoidGrad",
+		Input: []tf.Input{
+			y, dy,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes Quantized Rectified Linear X: `min(max(features, 0), max_value)`
+// Creates a dataset that changes the batch size.
 //
-// Arguments:
+// Creates a dataset that changes the batch size of the dataset to current batch
+// size // num_workers.
 //
+// Arguments:
+//	input_dataset: A variant tensor representing the input dataset.
+//	num_workers: A scalar representing the number of workers to distribute this batch across. As
+// a result of this transformation the current batch size would end up being
+// divided  by this parameter.
 //
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
 //
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedReluX(scope *Scope, features tf.Output, max_value tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluXAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+func ExperimentalRebatchDataset(scope *Scope, input_dataset tf.Output, num_workers tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "QuantizedReluX",
+		Type: "ExperimentalRebatchDataset",
 		Input: []tf.Input{
-			features, max_value, min_features, max_features,
+			input_dataset, num_workers,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Creates a dataset that batches `batch_size` elements from `input_dataset`.
+// Creates a dataset that emits the outputs of `input_dataset` `count` times.
 //
 // Arguments:
 //
-//	batch_size: A scalar representing the number of elements to accumulate in a batch.
-//	drop_remainder: A scalar representing whether the last batch should be dropped in case its size
-// is smaller than desired.
+//	count: A scalar representing the number of times that `input_dataset` should
+// be repeated. A value of `-1` indicates that it should be repeated infinitely.
 //
 //
-func BatchDatasetV2(scope *Scope, input_dataset tf.Output, batch_size tf.Output, drop_remainder tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+func RepeatDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "BatchDatasetV2",
+		Type: "RepeatDataset",
 		Input: []tf.Input{
-			input_dataset, batch_size, drop_remainder,
+			input_dataset, count,
 		},
 		Attrs: attrs,
 	}
@@ -20781,91 +25136,110 @@ func BatchDatasetV2(scope *Scope, input_dataset tf.Output, batch_size tf.Output,
 	return op.Output(0)
 }
 
-// QuantizedConv2DAttr is an optional argument to QuantizedConv2D.
-type QuantizedConv2DAttr func(optionalAttr)
-
-// QuantizedConv2DOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
+// ResourceApplyAdagradDAAttr is an optional argument to ResourceApplyAdagradDA.
+type ResourceApplyAdagradDAAttr func(optionalAttr)
 
-// QuantizedConv2DDilations sets the optional dilations attribute to value.
+// ResourceApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
 //
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr {
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyAdagradDAUseLocking(value bool) ResourceApplyAdagradDAAttr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Computes a 2D convolution given quantized 4D input and filter tensors.
-//
-// The inputs are quantized tensors where the lowest value represents the real
-// number of the associated minimum, and the highest represents the maximum.
-// This means that you can only interpret the quantized output in the same way, by
-// taking the returned minimum and maximum values into account.
+// Update '*var' according to the proximal adagrad scheme.
 //
 // Arguments:
+//	var_: Should be from a Variable().
+//	gradient_accumulator: Should be from a Variable().
+//	gradient_squared_accumulator: Should be from a Variable().
+//	grad: The gradient.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	global_step: Training step number. Must be a scalar.
 //
-//	filter: filter's input_depth dimension must match input's depth dimensions.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	min_filter: The float value that the lowest quantized filter value represents.
-//	max_filter: The float value that the highest quantized filter value represents.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor.
-//	padding: The type of padding algorithm to use.
-//
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedConv2DAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+// Returns the created operation.
+func ResourceApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceApplyAdagradDAAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedConv2D",
+		Type: "ResourceApplyAdagradDA",
 		Input: []tf.Input{
-			input, filter, min_input, max_input, min_filter, max_filter,
+			var_, gradient_accumulator, gradient_squared_accumulator, grad, lr, l1, l2, global_step,
 		},
 		Attrs: attrs,
 	}
+	return scope.AddOperation(opspec)
+}
+
+// Creates a TensorList which, when stacked, has the value of `tensor`.
+//
+// Each tensor in the result list corresponds to one row of the input tensor.
+//
+// tensor: The input tensor.
+// output_handle: The list.
+func TensorListFromTensor(scope *Scope, tensor tf.Output, element_shape tf.Output) (output_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorListFromTensor",
+		Input: []tf.Input{
+			tensor, element_shape,
+		},
+	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// StatelessMultinomialAttr is an optional argument to StatelessMultinomial.
-type StatelessMultinomialAttr func(optionalAttr)
+// ConfigureDistributedTPUAttr is an optional argument to ConfigureDistributedTPU.
+type ConfigureDistributedTPUAttr func(optionalAttr)
 
-// StatelessMultinomialOutputDtype sets the optional output_dtype attribute to value.
-// If not specified, defaults to DT_INT64
-func StatelessMultinomialOutputDtype(value tf.DataType) StatelessMultinomialAttr {
+// ConfigureDistributedTPUEmbeddingConfig sets the optional embedding_config attribute to value.
+//
+// value: Reserved. Do not use.
+// If not specified, defaults to ""
+func ConfigureDistributedTPUEmbeddingConfig(value string) ConfigureDistributedTPUAttr {
 	return func(m optionalAttr) {
-		m["output_dtype"] = value
+		m["embedding_config"] = value
 	}
 }
 
-// Draws samples from a multinomial distribution.
+// ConfigureDistributedTPUTpuEmbeddingConfig sets the optional tpu_embedding_config attribute to value.
 //
-// Arguments:
-//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
-// represents the unnormalized log probabilities for all classes.
-//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
-//	seed: 2 seeds (shape [2]).
+// value: Serialized tensorflow.tpu.TPUEmbeddingConfiguration that
+// describes the embedding lookups of the program.
+// If not specified, defaults to ""
+func ConfigureDistributedTPUTpuEmbeddingConfig(value string) ConfigureDistributedTPUAttr {
+	return func(m optionalAttr) {
+		m["tpu_embedding_config"] = value
+	}
+}
+
+// ConfigureDistributedTPUIsGlobalInit sets the optional is_global_init attribute to value.
 //
-// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
-// contains the drawn class labels with range `[0, num_classes)`.
-func StatelessMultinomial(scope *Scope, logits tf.Output, num_samples tf.Output, seed tf.Output, optional ...StatelessMultinomialAttr) (output tf.Output) {
+// value: Reserved. Do not use.
+// If not specified, defaults to false
+func ConfigureDistributedTPUIsGlobalInit(value bool) ConfigureDistributedTPUAttr {
+	return func(m optionalAttr) {
+		m["is_global_init"] = value
+	}
+}
+
+// Sets up the centralized structures for a distributed TPU system.
+//
+// Returns A serialized tensorflow.tpu.TopologyProto that describes the TPU
+// topology.
+func ConfigureDistributedTPU(scope *Scope, optional ...ConfigureDistributedTPUAttr) (topology tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -20874,716 +25248,835 @@ func StatelessMultinomial(scope *Scope, logits tf.Output, num_samples tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StatelessMultinomial",
-		Input: []tf.Input{
-			logits, num_samples, seed,
-		},
+		Type: "ConfigureDistributedTPU",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceGatherAttr is an optional argument to ResourceGather.
-type ResourceGatherAttr func(optionalAttr)
+// Reshapes a quantized tensor as per the Reshape op.
+//
+// ```
+//
+// Arguments:
+//
+//	shape: Defines the shape of the output tensor.
+//	input_min: The minimum value of the input.
+//	input_max: The maximum value of the input.
+//
+// Returns This value is copied from input_min.This value is copied from input_max.
+func QuantizedReshape(scope *Scope, tensor tf.Output, shape tf.Output, input_min tf.Output, input_max tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "QuantizedReshape",
+		Input: []tf.Input{
+			tensor, shape, input_min, input_max,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
 
-// ResourceGatherValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func ResourceGatherValidateIndices(value bool) ResourceGatherAttr {
+// PriorityQueueV2Attr is an optional argument to PriorityQueueV2.
+type PriorityQueueV2Attr func(optionalAttr)
+
+// PriorityQueueV2ComponentTypes sets the optional component_types attribute to value.
+//
+// value: The type of each component in a value.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func PriorityQueueV2ComponentTypes(value []tf.DataType) PriorityQueueV2Attr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["component_types"] = value
 	}
 }
 
-// Gather slices from the variable pointed to by `resource` according to `indices`.
+// PriorityQueueV2Capacity sets the optional capacity attribute to value.
 //
-// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
-// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
+// value: The upper bound on the number of elements in this queue.
+// Negative numbers mean no limit.
+// If not specified, defaults to -1
+func PriorityQueueV2Capacity(value int64) PriorityQueueV2Attr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// PriorityQueueV2Container sets the optional container attribute to value.
 //
-// ```python
-//     # Scalar indices
-//     output[:, ..., :] = params[indices, :, ... :]
+// value: If non-empty, this queue is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func PriorityQueueV2Container(value string) PriorityQueueV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// PriorityQueueV2SharedName sets the optional shared_name attribute to value.
 //
-//     # Vector indices
-//     output[i, :, ..., :] = params[indices[i], :, ... :]
+// value: If non-empty, this queue will be shared under the given name
+// across multiple sessions.
+// If not specified, defaults to ""
+func PriorityQueueV2SharedName(value string) PriorityQueueV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A queue that produces elements sorted by the first component value.
 //
-//     # Higher rank indices
-//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
-// ```
-func ResourceGather(scope *Scope, resource tf.Output, indices tf.Output, dtype tf.DataType, optional ...ResourceGatherAttr) (output tf.Output) {
+// Note that the PriorityQueue requires the first component of any element
+// to be a scalar int64, in addition to the other elements declared by
+// component_types.  Therefore calls to Enqueue and EnqueueMany (resp. Dequeue
+// and DequeueMany) on a PriorityQueue will all require (resp. output) one extra
+// entry in their input (resp. output) lists.
+//
+// Arguments:
+//	shapes: The shape of each component in a value. The length of this attr must
+// be either 0 or the same as the length of component_types. If the length of
+// this attr is 0, the shapes of queue elements are not constrained, and
+// only one element may be dequeued at a time.
+//
+// Returns The handle to the queue.
+func PriorityQueueV2(scope *Scope, shapes []tf.Shape, optional ...PriorityQueueV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"shapes": shapes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceGather",
-		Input: []tf.Input{
-			resource, indices,
-		},
+		Type: "PriorityQueueV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Delete the TensorArray from its resource container.
+// ResourceSparseApplyProximalGradientDescentAttr is an optional argument to ResourceSparseApplyProximalGradientDescent.
+type ResourceSparseApplyProximalGradientDescentAttr func(optionalAttr)
+
+// ResourceSparseApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
 //
-// This enables the user to close and release the resource in the middle
-// of a step/run.
+// value: If True, the subtraction will be protected by a lock;
+// otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyProximalGradientDescentUseLocking(value bool) ResourceSparseApplyProximalGradientDescentAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Sparse update '*var' as FOBOS algorithm with fixed learning rate.
+//
+// That is for rows we have grad for, we update var as follows:
+// prox_v = var - alpha * grad
+// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
 //
 // Arguments:
-//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
+//	var_: Should be from a Variable().
+//	alpha: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
 //
 // Returns the created operation.
-func TensorArrayCloseV3(scope *Scope, handle tf.Output) (o *tf.Operation) {
+func ResourceSparseApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalGradientDescentAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayCloseV3",
+		Type: "ResourceSparseApplyProximalGradientDescent",
 		Input: []tf.Input{
-			handle,
+			var_, alpha, l1, l2, grad, indices,
 		},
+		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Saves the input tensors to disk.
+// Check if the input matches the regex pattern.
 //
-// The size of `tensor_names` must match the number of tensors in `data`. `data[i]`
-// is written to `filename` with name `tensor_names[i]`.
+// The input is a string tensor of any shape. The pattern is the
+// regular expression to be matched with every element of the input tensor.
+// The boolean values (True or False) of the output tensor indicate
+// if the input matches the regex pattern provided.
 //
-// See also `SaveSlices`.
+// The pattern follows the re2 syntax (https://github.com/google/re2/wiki/Syntax)
 //
 // Arguments:
-//	filename: Must have a single element. The name of the file to which we write
-// the tensor.
-//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
-//	data: `N` tensors to save.
+//	input: A string tensor of the text to be processed.
+//	pattern: The regular expression to match the input.
 //
-// Returns the created operation.
-func Save(scope *Scope, filename tf.Output, tensor_names tf.Output, data []tf.Output) (o *tf.Operation) {
+// Returns A bool tensor with the same shape as `input`.
+func StaticRegexFullMatch(scope *Scope, input tf.Output, pattern string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"pattern": pattern}
 	opspec := tf.OpSpec{
-		Type: "Save",
+		Type: "StaticRegexFullMatch",
 		Input: []tf.Input{
-			filename, tensor_names, tf.OutputList(data),
+			input,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns element-wise remainder of division. When `x < 0` xor `y < 0` is
-//
-// true, this follows Python semantics in that the result here is consistent
-// with a flooring divide. E.g. `floor(x / y) * y + mod(x, y) = x`.
+// OutfeedDequeueAttr is an optional argument to OutfeedDequeue.
+type OutfeedDequeueAttr func(optionalAttr)
+
+// OutfeedDequeueDeviceOrdinal sets the optional device_ordinal attribute to value.
 //
-// *NOTE*: `FloorMod` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func FloorMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "FloorMod",
-		Input: []tf.Input{
-			x, y,
-		},
+// value: The TPU device to use. This should be -1 when the Op
+// is running on a TPU device, and >= 0 when the Op is running on the CPU
+// device.
+// If not specified, defaults to -1
+func OutfeedDequeueDeviceOrdinal(value int64) OutfeedDequeueAttr {
+	return func(m optionalAttr) {
+		m["device_ordinal"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the gradient of morphological 2-D dilation with respect to the filter.
+// Retrieves a single tensor from the computation outfeed.
+//
+// This operation will block indefinitely until data is available.
 //
 // Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
-//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
-//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
-// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
-//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
-// Must be: `[1, rate_height, rate_width, 1]`.
-//	padding: The type of padding algorithm to use.
+//	dtype: The type of elements in the tensor.
+//	shape: The shape of the tensor.
 //
-// Returns 3-D with shape `[filter_height, filter_width, depth]`.
-func Dilation2DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (filter_backprop tf.Output) {
+// Returns A tensor that will be read from the device outfeed.
+func OutfeedDequeue(scope *Scope, dtype tf.DataType, shape tf.Shape, optional ...OutfeedDequeueAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Dilation2DBackpropFilter",
-		Input: []tf.Input{
-			input, filter, out_backprop,
-		},
+		Type: "OutfeedDequeue",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns a list list which has the passed-in `Tensor` as last element and the other elements of the given list in `input_handle`.
+// RandomPoissonV2Attr is an optional argument to RandomPoissonV2.
+type RandomPoissonV2Attr func(optionalAttr)
+
+// RandomPoissonV2Seed sets the optional seed attribute to value.
 //
-// tensor: The tensor to put on the list.
-// input_handle: The old list.
-// output_handle: A list with the elements of the old list followed by tensor.
-// element_dtype: the type of elements in the list.
-// element_shape: a shape compatible with that of elements in the list.
-func TensorListPushBack(scope *Scope, input_handle tf.Output, tensor tf.Output) (output_handle tf.Output) {
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomPoissonV2Seed(value int64) RandomPoissonV2Attr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// RandomPoissonV2Seed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomPoissonV2Seed2(value int64) RandomPoissonV2Attr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// RandomPoissonV2Dtype sets the optional dtype attribute to value.
+// If not specified, defaults to DT_INT64
+func RandomPoissonV2Dtype(value tf.DataType) RandomPoissonV2Attr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
+	}
+}
+
+// Outputs random values from the Poisson distribution(s) described by rate.
+//
+// This op uses two algorithms, depending on rate. If rate >= 10, then
+// the algorithm by Hormann is used to acquire samples via
+// transformation-rejection.
+// See http://www.sciencedirect.com/science/article/pii/0167668793909974.
+//
+// Otherwise, Knuth's algorithm is used to acquire samples via multiplying uniform
+// random variables.
+// See Donald E. Knuth (1969). Seminumerical Algorithms. The Art of Computer
+// Programming, Volume 2. Addison Wesley
+//
+// Arguments:
+//	shape: 1-D integer tensor. Shape of independent samples to draw from each
+// distribution described by the shape parameters given in rate.
+//	rate: A tensor in which each scalar is a "rate" parameter describing the
+// associated poisson distribution.
+//
+// Returns A tensor with shape `shape + shape(rate)`. Each slice
+// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
+// `rate[i0, i1, ...iN]`.
+func RandomPoissonV2(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TensorListPushBack",
+		Type: "RandomPoissonV2",
 		Input: []tf.Input{
-			input_handle, tensor,
+			shape, rate,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AddSparseToTensorsMapAttr is an optional argument to AddSparseToTensorsMap.
-type AddSparseToTensorsMapAttr func(optionalAttr)
+// RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug.
+type RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugAttr func(optionalAttr)
 
-// AddSparseToTensorsMapContainer sets the optional container attribute to value.
+// RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// value: The container name for the `SparseTensorsMap` created by this op.
-// If not specified, defaults to ""
-func AddSparseToTensorsMapContainer(value string) AddSparseToTensorsMapAttr {
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["table_id"] = value
 	}
 }
 
-// AddSparseToTensorsMapSharedName sets the optional shared_name attribute to value.
-//
-// value: The shared name for the `SparseTensorsMap` created by this op.
-// If blank, the new Operation's unique name is used.
+// RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugTableName sets the optional table_name attribute to value.
 // If not specified, defaults to ""
-func AddSparseToTensorsMapSharedName(value string) AddSparseToTensorsMapAttr {
+func RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["table_name"] = value
 	}
 }
 
-// Add a `SparseTensor` to a `SparseTensorsMap` return its handle.
-//
-// A `SparseTensor` is represented by three tensors: `sparse_indices`,
-// `sparse_values`, and `sparse_shape`.
+// Retrieve RMSProp embedding parameters with debug support.
 //
-// This operator takes the given `SparseTensor` and adds it to a container
-// object (a `SparseTensorsMap`).  A unique key within this container is generated
-// in the form of an `int64`, and this is the value that is returned.
-//
-// The `SparseTensor` can then be read out as part of a minibatch by passing
-// the key as a vector element to `TakeManySparseFromTensorsMap`.  To ensure
-// the correct `SparseTensorsMap` is accessed, ensure that the same
-// `container` and `shared_name` are passed to that Op.  If no `shared_name`
-// is provided here, instead use the *name* of the Operation created by calling
-// `AddSparseToTensorsMap` as the `shared_name` passed to
-// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
-//
-// Arguments:
-//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
-//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
 //
-// Returns 0-D.  The handle of the `SparseTensor` now stored in the
-// `SparseTensorsMap`.
-func AddSparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddSparseToTensorsMapAttr) (sparse_handle tf.Output) {
+// Returns Parameter parameters updated by the RMSProp optimization algorithm.Parameter ms updated by the RMSProp optimization algorithm.Parameter mom updated by the RMSProp optimization algorithm.Parameter gradient_accumulators updated by the RMSProp optimization algorithm.
+func RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugAttr) (parameters tf.Output, ms tf.Output, mom tf.Output, gradient_accumulators tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AddSparseToTensorsMap",
-		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
-		},
+		Type: "RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// Deserialize and concatenate `SparseTensors` from a serialized minibatch.
-//
-// The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where
-// `N` is the minibatch size and the rows correspond to packed outputs of
-// `SerializeSparse`.  The ranks of the original `SparseTensor` objects
-// must all match.  When the final `SparseTensor` is created, it has rank one
-// higher than the ranks of the incoming `SparseTensor` objects
-// (they have been concatenated along a new row dimension).
-//
-// The output `SparseTensor` object's shape values for all dimensions but the
-// first are the max across the input `SparseTensor` objects' shape values
-// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
-// size.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in
-// standard lexicographic order.  If this is not the case, after this
-// step run `SparseReorder` to restore index ordering.
-//
-// For example, if the serialized input is a `[2 x 3]` matrix representing two
-// original `SparseTensor` objects:
-//
-//     index = [ 0]
-//             [10]
-//             [20]
-//     values = [1, 2, 3]
-//     shape = [50]
-//
-// and
-//
-//     index = [ 2]
-//             [10]
-//     values = [4, 5]
-//     shape = [30]
-//
-// then the final deserialized `SparseTensor` will be:
-//
-//     index = [0  0]
-//             [0 10]
-//             [0 20]
-//             [1  2]
-//             [1 10]
-//     values = [1, 2, 3, 4, 5]
-//     shape = [2 50]
+// Computes the gradient for the rsqrt of `x` wrt its input.
 //
-// Arguments:
-//	serialized_sparse: 2-D, The `N` serialized `SparseTensor` objects.
-// Must have 3 columns.
-//	dtype: The `dtype` of the serialized `SparseTensor` objects.
-func DeserializeManySparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+// Specifically, `grad = dy * -0.5 * y^3`, where `y = rsqrt(x)`, and `dy`
+// is the corresponding input gradient.
+func RsqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "DeserializeManySparse",
+		Type: "RsqrtGrad",
 		Input: []tf.Input{
-			serialized_sparse,
+			y, dy,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Elementwise computes the bitwise AND of `x` and `y`.
+// Encode audio data using the WAV file format.
 //
-// The result will have those bits set, that are set in both `x` and `y`. The
-// computation is performed on the underlying representations of `x` and `y`.
-func BitwiseAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// This operation will generate a string suitable to be saved out to create a .wav
+// audio file. It will be encoded in the 16-bit PCM format. It takes in float
+// values in the range -1.0f to 1.0f, and any outside that value will be clamped to
+// that range.
+//
+// `audio` is a 2-D float Tensor of shape `[length, channels]`.
+// `sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).
+//
+// Arguments:
+//	audio: 2-D with shape `[length, channels]`.
+//	sample_rate: Scalar containing the sample frequency.
+//
+// Returns 0-D. WAV-encoded file contents.
+func EncodeWav(scope *Scope, audio tf.Output, sample_rate tf.Output) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BitwiseAnd",
+		Type: "EncodeWav",
 		Input: []tf.Input{
-			x, y,
+			audio, sample_rate,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Inverse real-valued fast Fourier transform.
-//
-// Computes the inverse 1-dimensional discrete Fourier transform of a real-valued
-// signal over the inner-most dimension of `input`.
+// ResourceApplyAdaMaxAttr is an optional argument to ResourceApplyAdaMax.
+type ResourceApplyAdaMaxAttr func(optionalAttr)
+
+// ResourceApplyAdaMaxUseLocking sets the optional use_locking attribute to value.
 //
-// The inner-most dimension of `input` is assumed to be the result of `RFFT`: the
-// `fft_length / 2 + 1` unique components of the DFT of a real-valued signal. If
-// `fft_length` is not provided, it is computed from the size of the inner-most
-// dimension of `input` (`fft_length = 2 * (inner - 1)`). If the FFT length used to
-// compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
+// value: If `True`, updating of the var, m, and v tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyAdaMaxUseLocking(value bool) ResourceApplyAdaMaxAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the AdaMax algorithm.
 //
-// Along the axis `IRFFT` is computed on, if `fft_length / 2 + 1` is smaller
-// than the corresponding dimension of `input`, the dimension is cropped. If it is
-// larger, the dimension is padded with zeros.
+// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+// v_t <- max(beta2 * v_{t-1}, abs(g))
+// variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon)
 //
 // Arguments:
-//	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [1]. The FFT length.
-//
-// Returns A float32 tensor of the same rank as `input`. The inner-most
-//   dimension of `input` is replaced with the `fft_length` samples of its inverse
-//   1D Fourier transform.
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	v: Should be from a Variable().
+//	beta1_power: Must be a scalar.
+//	lr: Scaling factor. Must be a scalar.
+//	beta1: Momentum factor. Must be a scalar.
+//	beta2: Momentum factor. Must be a scalar.
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.irfft
-// @end_compatibility
-func IRFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Returns the created operation.
+func ResourceApplyAdaMax(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdaMaxAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "IRFFT",
+		Type: "ResourceApplyAdaMax",
 		Input: []tf.Input{
-			input, fft_length,
+			var_, m, v, beta1_power, lr, beta1, beta2, epsilon, grad,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Computes atan of x element-wise.
+func Atan(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Atan",
+		Input: []tf.Input{
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Concatenates a list of `SparseTensor` along the specified dimension.
-//
-// Concatenation is with respect to the dense versions of these sparse tensors.
-// It is assumed that each input is a `SparseTensor` whose elements are ordered
-// along increasing dimension number.
-//
-// All inputs' shapes must match, except for the concat dimension.  The
-// `indices`, `values`, and `shapes` lists must have the same length.
-//
-// The output shape is identical to the inputs', except along the concat
-// dimension, where it is the sum of the inputs' sizes along that dimension.
-//
-// The output elements will be resorted to preserve the sort order along
-// increasing dimension number.
-//
-// This op runs in `O(M log M)` time, where `M` is the total number of non-empty
-// values across all inputs. This is due to the need for an internal sort in
-// order to concatenate efficiently across an arbitrary dimension.
-//
-// For example, if `concat_dim = 1` and the inputs are
-//
-//     sp_inputs[0]: shape = [2, 3]
-//     [0, 2]: "a"
-//     [1, 0]: "b"
-//     [1, 1]: "c"
-//
-//     sp_inputs[1]: shape = [2, 4]
-//     [0, 1]: "d"
-//     [0, 2]: "e"
-//
-// then the output will be
-//
-//     shape = [2, 7]
-//     [0, 2]: "a"
-//     [0, 4]: "d"
-//     [0, 5]: "e"
-//     [1, 0]: "b"
-//     [1, 1]: "c"
+// AssertAttr is an optional argument to Assert.
+type AssertAttr func(optionalAttr)
+
+// AssertSummarize sets the optional summarize attribute to value.
 //
-// Graphically this is equivalent to doing
+// value: Print this many entries of each tensor.
+// If not specified, defaults to 3
+func AssertSummarize(value int64) AssertAttr {
+	return func(m optionalAttr) {
+		m["summarize"] = value
+	}
+}
+
+// Asserts that the given condition is true.
 //
-//     [    a] concat [  d e  ] = [    a   d e  ]
-//     [b c  ]        [       ]   [b c          ]
+// If `condition` evaluates to false, print the list of tensors in `data`.
+// `summarize` determines how many entries of the tensors to print.
 //
 // Arguments:
-//	indices: 2-D.  Indices of each input `SparseTensor`.
-//	values: 1-D.  Non-empty values of each `SparseTensor`.
-//	shapes: 1-D.  Shapes of each `SparseTensor`.
-//	concat_dim: Dimension to concatenate along. Must be in range [-rank, rank),
-// where rank is the number of dimensions in each input `SparseTensor`.
+//	condition: The condition to evaluate.
+//	data: The tensors to print out when condition is false.
 //
-// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
-func SparseConcat(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, concat_dim int64) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+// Returns the created operation.
+func Assert(scope *Scope, condition tf.Output, data []tf.Output, optional ...AssertAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"concat_dim": concat_dim}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseConcat",
+		Type: "Assert",
 		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes),
+			condition, tf.OutputList(data),
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return scope.AddOperation(opspec)
 }
 
-// Generates sparse cross from a list of sparse and dense tensors.
-//
-// The op takes two lists, one of 2D `SparseTensor` and one of 2D `Tensor`, each
-// representing features of one feature column. It outputs a 2D `SparseTensor` with
-// the batchwise crosses of these features.
-//
-// For example, if the inputs are
-//
-//     inputs[0]: SparseTensor with shape = [2, 2]
-//     [0, 0]: "a"
-//     [1, 0]: "b"
-//     [1, 1]: "c"
-//
-//     inputs[1]: SparseTensor with shape = [2, 1]
-//     [0, 0]: "d"
-//     [1, 0]: "e"
-//
-//     inputs[2]: Tensor [["f"], ["g"]]
-//
-// then the output will be
-//
-//     shape = [2, 2]
-//     [0, 0]: "a_X_d_X_f"
-//     [1, 0]: "b_X_e_X_g"
-//     [1, 1]: "c_X_e_X_g"
+// LoadTPUEmbeddingAdagradParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingAdagradParametersGradAccumDebug.
+type LoadTPUEmbeddingAdagradParametersGradAccumDebugAttr func(optionalAttr)
+
+// LoadTPUEmbeddingAdagradParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// if hashed_output=true then the output will be
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingAdagradParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingAdagradParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// LoadTPUEmbeddingAdagradParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingAdagradParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingAdagradParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load Adagrad embedding parameters with debug support.
 //
-//     shape = [2, 2]
-//     [0, 0]: FingerprintCat64(
-//                 Fingerprint64("f"), FingerprintCat64(
-//                     Fingerprint64("d"), Fingerprint64("a")))
-//     [1, 0]: FingerprintCat64(
-//                 Fingerprint64("g"), FingerprintCat64(
-//                     Fingerprint64("e"), Fingerprint64("b")))
-//     [1, 1]: FingerprintCat64(
-//                 Fingerprint64("g"), FingerprintCat64(
-//                     Fingerprint64("e"), Fingerprint64("c")))
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
 //
 // Arguments:
-//	indices: 2-D.  Indices of each input `SparseTensor`.
-//	values: 1-D.   values of each `SparseTensor`.
-//	shapes: 1-D.   Shapes of each `SparseTensor`.
-//	dense_inputs: 2-D.    Columns represented by dense `Tensor`.
-//	hashed_output: If true, returns the hash of the cross instead of the string.
-// This will allow us avoiding string manipulations.
-//	num_buckets: It is used if hashed_output is true.
-// output = hashed_value%num_buckets if num_buckets > 0 else hashed_value.
-//	hash_key: Specify the hash_key that will be used by the `FingerprintCat64`
-// function to combine the crosses fingerprints.
+//	parameters: Value of parameters used in the Adagrad optimization algorithm.
+//	accumulators: Value of accumulators used in the Adagrad optimization algorithm.
+//	gradient_accumulators: Value of gradient_accumulators used in the Adagrad optimization algorithm.
 //
 //
 //
-// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated or hashed
-// `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
-func SparseCross(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, dense_inputs []tf.Output, hashed_output bool, num_buckets int64, hash_key int64, out_type tf.DataType, internal_type tf.DataType) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+// Returns the created operation.
+func LoadTPUEmbeddingAdagradParametersGradAccumDebug(scope *Scope, parameters tf.Output, accumulators tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingAdagradParametersGradAccumDebugAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"hashed_output": hashed_output, "num_buckets": num_buckets, "hash_key": hash_key, "out_type": out_type, "internal_type": internal_type}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseCross",
+		Type: "LoadTPUEmbeddingAdagradParametersGradAccumDebug",
 		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes), tf.OutputList(dense_inputs),
+			parameters, accumulators, gradient_accumulators,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return scope.AddOperation(opspec)
 }
 
-// ResourceApplyProximalAdagradAttr is an optional argument to ResourceApplyProximalAdagrad.
-type ResourceApplyProximalAdagradAttr func(optionalAttr)
+// RetrieveTPUEmbeddingFTRLParametersGradAccumDebugAttr is an optional argument to RetrieveTPUEmbeddingFTRLParametersGradAccumDebug.
+type RetrieveTPUEmbeddingFTRLParametersGradAccumDebugAttr func(optionalAttr)
 
-// ResourceApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
+// RetrieveTPUEmbeddingFTRLParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyProximalAdagradUseLocking(value bool) ResourceApplyProximalAdagradAttr {
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingFTRLParametersGradAccumDebugTableId(value int64) RetrieveTPUEmbeddingFTRLParametersGradAccumDebugAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["table_id"] = value
 	}
 }
 
-// Update '*var' and '*accum' according to FOBOS with Adagrad learning rate.
-//
-// accum += grad * grad
-// prox_v = var - lr * grad * (1 / sqrt(accum))
-// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
+// RetrieveTPUEmbeddingFTRLParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingFTRLParametersGradAccumDebugTableName(value string) RetrieveTPUEmbeddingFTRLParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Retrieve FTRL embedding parameters with debug support.
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	grad: The gradient.
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
 //
-// Returns the created operation.
-func ResourceApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, optional ...ResourceApplyProximalAdagradAttr) (o *tf.Operation) {
+// Returns Parameter parameters updated by the FTRL optimization algorithm.Parameter accumulators updated by the FTRL optimization algorithm.Parameter linears updated by the FTRL optimization algorithm.Parameter gradient_accumulators updated by the FTRL optimization algorithm.
+func RetrieveTPUEmbeddingFTRLParametersGradAccumDebug(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingFTRLParametersGradAccumDebugAttr) (parameters tf.Output, accumulators tf.Output, linears tf.Output, gradient_accumulators tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyProximalAdagrad",
+		Type: "RetrieveTPUEmbeddingFTRLParametersGradAccumDebug",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+}
+
+// A dataset that splits the elements of its input into multiple elements.
+func ExperimentalUnbatchDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalUnbatchDataset",
 		Input: []tf.Input{
-			var_, accum, lr, l1, l2, grad,
+			input_dataset,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MutableHashTableOfTensorsV2Attr is an optional argument to MutableHashTableOfTensorsV2.
-type MutableHashTableOfTensorsV2Attr func(optionalAttr)
+// StringFormatAttr is an optional argument to StringFormat.
+type StringFormatAttr func(optionalAttr)
 
-// MutableHashTableOfTensorsV2Container sets the optional container attribute to value.
+// StringFormatTemplate sets the optional template attribute to value.
 //
-// value: If non-empty, this table is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func MutableHashTableOfTensorsV2Container(value string) MutableHashTableOfTensorsV2Attr {
+// value: A string, the template to format tensor summaries into.
+// If not specified, defaults to "%s"
+func StringFormatTemplate(value string) StringFormatAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["template"] = value
 	}
 }
 
-// MutableHashTableOfTensorsV2SharedName sets the optional shared_name attribute to value.
+// StringFormatPlaceholder sets the optional placeholder attribute to value.
 //
-// value: If non-empty, this table is shared under the given name across
-// multiple sessions.
-// If not specified, defaults to ""
-func MutableHashTableOfTensorsV2SharedName(value string) MutableHashTableOfTensorsV2Attr {
+// value: A string, at each placeholder in the template a subsequent tensor summary will be inserted.
+// If not specified, defaults to "%s"
+func StringFormatPlaceholder(value string) StringFormatAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["placeholder"] = value
 	}
 }
 
-// MutableHashTableOfTensorsV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
-// If not specified, defaults to false
-func MutableHashTableOfTensorsV2UseNodeNameSharing(value bool) MutableHashTableOfTensorsV2Attr {
+// StringFormatSummarize sets the optional summarize attribute to value.
+//
+// value: When formatting the tensor summaries print the first and last summarize entries of each tensor dimension.
+// If not specified, defaults to 3
+func StringFormatSummarize(value int64) StringFormatAttr {
 	return func(m optionalAttr) {
-		m["use_node_name_sharing"] = value
+		m["summarize"] = value
 	}
 }
 
-// MutableHashTableOfTensorsV2ValueShape sets the optional value_shape attribute to value.
-// If not specified, defaults to <>
-func MutableHashTableOfTensorsV2ValueShape(value tf.Shape) MutableHashTableOfTensorsV2Attr {
-	return func(m optionalAttr) {
-		m["value_shape"] = value
+// Formats a string template using a list of tensors.
+//
+// Formats a string template using a list of tensors, pretty-printing tensor summaries.
+//
+// Arguments:
+//	inputs: The list of tensors to format into the placeholder string.
+//
+// Returns = The resulting string scalar.
+func StringFormat(scope *Scope, inputs []tf.Output, optional ...StringFormatAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StringFormat",
+		Input: []tf.Input{
+			tf.OutputList(inputs),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns true if queue is closed.
+//
+// This operation returns true if the queue is closed and false if the queue
+// is open.
+//
+// Arguments:
+//	handle: The handle to a queue.
+func QueueIsClosedV2(scope *Scope, handle tf.Output) (is_closed tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "QueueIsClosedV2",
+		Input: []tf.Input{
+			handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes inverse hyperbolic tangent of x element-wise.
+func Atanh(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Atanh",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Creates an empty hash table.
+// Computes the reverse mode backpropagated gradient of the Cholesky algorithm.
 //
-// This op creates a mutable hash table, specifying the type of its keys and
-// values. Each value must be a vector. Data can be inserted into the table using
-// the insert operations. It does not support the initialization operation.
+// For an explanation see "Differentiation of the Cholesky algorithm" by
+// Iain Murray http://arxiv.org/abs/1602.07527.
 //
 // Arguments:
-//	key_dtype: Type of the table keys.
-//	value_dtype: Type of the table values.
+//	l: Output of batch Cholesky algorithm l = cholesky(A). Shape is `[..., M, M]`.
+// Algorithm depends only on lower triangular part of the innermost matrices of
+// this tensor.
+//	grad: df/dl where f is some scalar function. Shape is `[..., M, M]`.
+// Algorithm depends only on lower triangular part of the innermost matrices of
+// this tensor.
 //
-// Returns Handle to a table.
-func MutableHashTableOfTensorsV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableOfTensorsV2Attr) (table_handle tf.Output) {
+// Returns Symmetrized version of df/dA . Shape is `[..., M, M]`
+func CholeskyGrad(scope *Scope, l tf.Output, grad tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "MutableHashTableOfTensorsV2",
-
-		Attrs: attrs,
+		Type: "CholeskyGrad",
+		Input: []tf.Input{
+			l, grad,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// The gradient operator for the SparseSlice op.
+// Assigns a new value to a variable.
 //
-// This op takes in the upstream gradient w.r.t. non-empty values of
-// the sliced `SparseTensor`, and outputs the gradients w.r.t.
-// the non-empty values of input `SparseTensor`.
+// Any ReadVariableOp with a control dependency on this op is guaranteed to return
+// this value or a subsequent newer value of the variable.
 //
 // Arguments:
-//	backprop_val_grad: 1-D. The gradient with respect to
-// the non-empty values of the sliced `SparseTensor`.
-//	input_indices: 2-D.  The `indices` of the input `SparseTensor`.
-//	input_start: 1-D. tensor represents the start of the slice.
-//	output_indices: 2-D.  The `indices` of the sliced `SparseTensor`.
+//	resource: handle to the resource in which to store the variable.
+//	value: the value to set the new tensor to use.
 //
-// Returns 1-D. The gradient with respect to the non-empty values of input `SparseTensor`.
-func SparseSliceGrad(scope *Scope, backprop_val_grad tf.Output, input_indices tf.Output, input_start tf.Output, output_indices tf.Output) (val_grad tf.Output) {
+// Returns the created operation.
+func AssignVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSliceGrad",
+		Type: "AssignVariableOp",
 		Input: []tf.Input{
-			backprop_val_grad, input_indices, input_start, output_indices,
+			resource, value,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes the gradient of the sigmoid of `x` wrt its input.
+// Returns a tensor of ones with the same shape and type as x.
 //
-// Specifically, `grad = dy * y * (1 - y)`, where `y = sigmoid(x)`, and
-// `dy` is the corresponding input gradient.
-func SigmoidGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+// Arguments:
+//	x: a tensor of type T.
+//
+// Returns a tensor of the same shape and type as x but filled with ones.
+func OnesLike(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SigmoidGrad",
+		Type: "OnesLike",
 		Input: []tf.Input{
-			y, dy,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Convert one or more images from HSV to RGB.
+// The gradient of SparseFillEmptyRows.
 //
-// Outputs a tensor of the same shape as the `images` tensor, containing the RGB
-// value of the pixels. The output is only well defined if the value in `images`
-// are in `[0,1]`.
+// Takes vectors reverse_index_map, shaped `[N]`, and grad_values,
+// shaped `[N_full]`, where `N_full >= N` and copies data into either
+// `d_values` or `d_default_value`.  Here `d_values` is shaped `[N]` and
+// `d_default_value` is a scalar.
 //
-// See `rgb_to_hsv` for a description of the HSV encoding.
+//   d_values[j] = grad_values[reverse_index_map[j]]
+//   d_default_value = sum_{k : 0 .. N_full - 1} (
+//      grad_values[k] * 1{k not in reverse_index_map})
 //
 // Arguments:
-//	images: 1-D or higher rank. HSV data to convert. Last dimension must be size 3.
+//	reverse_index_map: 1-D.  The reverse index map from SparseFillEmptyRows.
+//	grad_values: 1-D.  The gradients from backprop.
 //
-// Returns `images` converted to RGB.
-func HSVToRGB(scope *Scope, images tf.Output) (output tf.Output) {
+// Returns 1-D.  The backprop into values.0-D.  The backprop into default_value.
+func SparseFillEmptyRowsGrad(scope *Scope, reverse_index_map tf.Output, grad_values tf.Output) (d_values tf.Output, d_default_value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "HSVToRGB",
+		Type: "SparseFillEmptyRowsGrad",
 		Input: []tf.Input{
-			images,
+			reverse_index_map, grad_values,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Creates a dataset by applying optimizations to `input_dataset`.
-//
-// Creates a dataset by applying optimizations to `input_dataset`.
-//
-// Arguments:
-//	input_dataset: A variant tensor representing the input dataset.
-//	optimizations: A `tf.string` vector `tf.Tensor` identifying optimizations to use.
-//
-//
-func OptimizeDataset(scope *Scope, input_dataset tf.Output, optimizations tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Creates a dataset that zips together `input_datasets`.
+func ZipDataset(scope *Scope, input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "OptimizeDataset",
+		Type: "ZipDataset",
 		Input: []tf.Input{
-			input_dataset, optimizations,
+			tf.OutputList(input_datasets),
 		},
 		Attrs: attrs,
 	}
@@ -21591,428 +26084,610 @@ func OptimizeDataset(scope *Scope, input_dataset tf.Output, optimizations tf.Out
 	return op.Output(0)
 }
 
-// Returns the element-wise min of two SparseTensors.
+// LoadTPUEmbeddingAdagradParametersAttr is an optional argument to LoadTPUEmbeddingAdagradParameters.
+type LoadTPUEmbeddingAdagradParametersAttr func(optionalAttr)
+
+// LoadTPUEmbeddingAdagradParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingAdagradParametersTableId(value int64) LoadTPUEmbeddingAdagradParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// LoadTPUEmbeddingAdagradParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingAdagradParametersTableName(value string) LoadTPUEmbeddingAdagradParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load Adagrad embedding parameters.
+//
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
 //
 // Arguments:
-//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, in the canonical lexicographic ordering.
-//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
-//	a_shape: 1-D.  Shape of the input SparseTensor.
-//	b_indices: counterpart to `a_indices` for the other operand.
-//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
-//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
+//	parameters: Value of parameters used in the Adagrad optimization algorithm.
+//	accumulators: Value of accumulators used in the Adagrad optimization algorithm.
 //
-// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
-func SparseSparseMinimum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingAdagradParameters(scope *Scope, parameters tf.Output, accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingAdagradParametersAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseSparseMinimum",
+		Type: "LoadTPUEmbeddingAdagradParameters",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
+			parameters, accumulators,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return scope.AddOperation(opspec)
 }
 
-// MapUnstageNoKeyAttr is an optional argument to MapUnstageNoKey.
-type MapUnstageNoKeyAttr func(optionalAttr)
-
-// MapUnstageNoKeyCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// Strip leading and trailing whitespaces from the Tensor.
 //
-// REQUIRES: value >= 0
-func MapUnstageNoKeyCapacity(value int64) MapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
+// Arguments:
+//	input: A string `Tensor` of any shape.
+//
+// Returns A string `Tensor` of the same shape as the input.
+func StringStrip(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	opspec := tf.OpSpec{
+		Type: "StringStrip",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// Converts each string in the input Tensor to its hash mod by a number of buckets.
 //
-// REQUIRES: value >= 0
-func MapUnstageNoKeyMemoryLimit(value int64) MapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
+// The hash function is deterministic on the content of the string within the
+// process. The hash function is a keyed hash function, where attribute `key`
+// defines the key of the hash function. `key` is an array of 2 elements.
+//
+// A strong hash is important when inputs may be malicious, e.g. URLs with
+// additional components. Adversaries could try to make their inputs hash to the
+// same bucket for a denial-of-service attack or to skew the results. A strong
+// hash prevents this by making it difficult, if not infeasible, to compute inputs
+// that hash to the same bucket. This comes at a cost of roughly 4x higher compute
+// time than `tf.string_to_hash_bucket_fast`.
+//
+// Arguments:
+//	input: The strings to assign a hash bucket.
+//	num_buckets: The number of buckets.
+//	key: The key for the keyed hash function passed as a list of two uint64
+// elements.
+//
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToHashBucketStrong(scope *Scope, input tf.Output, num_buckets int64, key []int64) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// MapUnstageNoKeyContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapUnstageNoKeyContainer(value string) MapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
+	attrs := map[string]interface{}{"num_buckets": num_buckets, "key": key}
+	opspec := tf.OpSpec{
+		Type: "StringToHashBucketStrong",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MapUnstageNoKeySharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapUnstageNoKeySharedName(value string) MapUnstageNoKeyAttr {
+// StringLengthAttr is an optional argument to StringLength.
+type StringLengthAttr func(optionalAttr)
+
+// StringLengthUnit sets the optional unit attribute to value.
+//
+// value: The unit that is counted to compute string length.  One of: `"BYTE"` (for
+// the number of bytes in each string) or `"UTF8_CHAR"` (for the number of UTF-8
+// encoded Unicode code points in each string).  Results are undefined
+// if `unit=UTF8_CHAR` and the `input` strings do not contain structurally
+// valid UTF-8.
+// If not specified, defaults to "BYTE"
+func StringLengthUnit(value string) StringLengthAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["unit"] = value
 	}
 }
 
-// Op removes and returns a random (key, value)
+// String lengths of `input`.
 //
-// from the underlying container.   If the underlying container
-// does not contain elements, the op will block until it does.
-func MapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
+// Computes the length of each string given in the input tensor.
+//
+// Arguments:
+//	input: The string for which to compute the length.
+//
+// Returns Integer tensor that has the same shape as `input`. The output contains the
+// element-wise string lengths of `input`.
+func StringLength(scope *Scope, input tf.Output, optional ...StringLengthAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MapUnstageNoKey",
+		Type: "StringLength",
 		Input: []tf.Input{
-			indices,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Performs gradient updates of embedding tables.
+//
+// Arguments:
+//	inputs: A TensorList of gradients with which to update embedding tables.
+// This argument has the same length and shapes as the return value of
+// RecvTPUEmbeddingActivations, but contains gradients of the model's loss
+// with respect to the embedding activations. The embedding tables are updated
+// from these gradients via the optimizer specified in the TPU embedding
+// configuration given to tpu.initialize_system.
+//	learning_rates: A TensorList of float32 scalars, one for each dynamic learning
+// rate tag: see the comments in
+// //third_party/tensorflow/core/protobuf/tpu/optimization_parameters.proto.
+// Multiple tables can share the same dynamic learning rate tag as specified
+// in the configuration. If the learning rates for all tables are constant,
+// this list should be empty.
+//	config: Serialized TPUEmbeddingConfiguration proto.
+//
+// Returns the created operation.
+func SendTPUEmbeddingGradients(scope *Scope, inputs []tf.Output, learning_rates []tf.Output, config string) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	key = op.Output(idx)
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("MapUnstageNoKey", err)
-		return
+	attrs := map[string]interface{}{"config": config}
+	opspec := tf.OpSpec{
+		Type: "SendTPUEmbeddingGradients",
+		Input: []tf.Input{
+			tf.OutputList(inputs), tf.OutputList(learning_rates),
+		},
+		Attrs: attrs,
 	}
-	return key, values
+	return scope.AddOperation(opspec)
 }
 
-// HashTableV2Attr is an optional argument to HashTableV2.
-type HashTableV2Attr func(optionalAttr)
-
-// HashTableV2Container sets the optional container attribute to value.
+// Computes numerical negative value element-wise.
 //
-// value: If non-empty, this table is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func HashTableV2Container(value string) HashTableV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
+// I.e., \\(y = -x\\).
+func Neg(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Neg",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// HashTableV2SharedName sets the optional shared_name attribute to value.
-//
-// value: If non-empty, this table is shared under the given name across
-// multiple sessions.
-// If not specified, defaults to ""
-func HashTableV2SharedName(value string) HashTableV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+// Receives a tensor value broadcast from another device.
+func CollectiveBcastRecv(scope *Scope, T tf.DataType, group_size int64, group_key int64, instance_key int64, shape tf.Shape) (data tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
+	attrs := map[string]interface{}{"T": T, "group_size": group_size, "group_key": group_key, "instance_key": instance_key, "shape": shape}
+	opspec := tf.OpSpec{
+		Type: "CollectiveBcastRecv",
 
-// HashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
-//
-// value: If true and shared_name is empty, the table is shared
-// using the node name.
-// If not specified, defaults to false
-func HashTableV2UseNodeNameSharing(value bool) HashTableV2Attr {
-	return func(m optionalAttr) {
-		m["use_node_name_sharing"] = value
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Creates a non-initialized hash table.
+// Decode web-safe base64-encoded strings.
 //
-// This op creates a hash table, specifying the type of its keys and values.
-// Before using the table you will have to initialize it.  After initialization the
-// table will be immutable.
+// Input may or may not have padding at the end. See EncodeBase64 for padding.
+// Web-safe means that input must use - and _ instead of + and /.
 //
 // Arguments:
-//	key_dtype: Type of the table keys.
-//	value_dtype: Type of the table values.
+//	input: Base64 strings to decode.
 //
-// Returns Handle to a table.
-func HashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...HashTableV2Attr) (table_handle tf.Output) {
+// Returns Decoded strings.
+func DecodeBase64(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "HashTableV2",
-
-		Attrs: attrs,
+		Type: "DecodeBase64",
+		Input: []tf.Input{
+			input,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TakeManySparseFromTensorsMapAttr is an optional argument to TakeManySparseFromTensorsMap.
-type TakeManySparseFromTensorsMapAttr func(optionalAttr)
+// SubstrAttr is an optional argument to Substr.
+type SubstrAttr func(optionalAttr)
 
-// TakeManySparseFromTensorsMapContainer sets the optional container attribute to value.
+// SubstrUnit sets the optional unit attribute to value.
 //
-// value: The container name for the `SparseTensorsMap` read by this op.
-// If not specified, defaults to ""
-func TakeManySparseFromTensorsMapContainer(value string) TakeManySparseFromTensorsMapAttr {
+// value: The unit that is used to create the substring.  One of: `"BYTE"` (for
+// defining position and length by bytes) or `"UTF8_CHAR"` (for the UTF-8
+// encoded Unicode code points).  The default is `"BYTE"`. Results are undefined if
+// `unit=UTF8_CHAR` and the `input` strings do not contain structurally valid
+// UTF-8.
+// If not specified, defaults to "BYTE"
+func SubstrUnit(value string) SubstrAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["unit"] = value
 	}
 }
 
-// TakeManySparseFromTensorsMapSharedName sets the optional shared_name attribute to value.
+// Return substrings from `Tensor` of strings.
 //
-// value: The shared name for the `SparseTensorsMap` read by this op.
-// It should not be blank; rather the `shared_name` or unique Operation name
-// of the Op that created the original `SparseTensorsMap` should be used.
-// If not specified, defaults to ""
-func TakeManySparseFromTensorsMapSharedName(value string) TakeManySparseFromTensorsMapAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Read `SparseTensors` from a `SparseTensorsMap` and concatenate them.
+// For each string in the input `Tensor`, creates a substring starting at index
+// `pos` with a total length of `len`.
 //
-// The input `sparse_handles` must be an `int64` matrix of shape `[N, 1]` where
-// `N` is the minibatch size and the rows correspond to the output handles of
-// `AddSparseToTensorsMap` or `AddManySparseToTensorsMap`.  The ranks of the
-// original `SparseTensor` objects that went into the given input ops must all
-// match.  When the final `SparseTensor` is created, it has rank one
-// higher than the ranks of the incoming `SparseTensor` objects
-// (they have been concatenated along a new row dimension on the left).
+// If `len` defines a substring that would extend beyond the length of the input
+// string, then as many characters as possible are used.
 //
-// The output `SparseTensor` object's shape values for all dimensions but the
-// first are the max across the input `SparseTensor` objects' shape values
-// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
-// size.
+// A negative `pos` indicates distance within the string backwards from the end.
 //
-// The input `SparseTensor` objects' indices are assumed ordered in
-// standard lexicographic order.  If this is not the case, after this
-// step run `SparseReorder` to restore index ordering.
+// If `pos` specifies an index which is out of range for any of the input strings,
+// then an `InvalidArgumentError` is thrown.
 //
-// For example, if the handles represent an input, which is a `[2, 3]` matrix
-// representing two original `SparseTensor` objects:
+// `pos` and `len` must have the same shape, otherwise a `ValueError` is thrown on
+// Op creation.
+//
+// *NOTE*: `Substr` supports broadcasting up to two dimensions. More about
+// broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+//
+// ---
+//
+// Examples
+//
+// Using scalar `pos` and `len`:
 //
+// ```python
+// input = [b'Hello', b'World']
+// position = 1
+// length = 3
+//
+// output = [b'ell', b'orl']
 // ```
-//     index = [ 0]
-//             [10]
-//             [20]
-//     values = [1, 2, 3]
-//     shape = [50]
+//
+// Using `pos` and `len` with same shape as `input`:
+//
+// ```python
+// input = [[b'ten', b'eleven', b'twelve'],
+//          [b'thirteen', b'fourteen', b'fifteen'],
+//          [b'sixteen', b'seventeen', b'eighteen']]
+// position = [[1, 2, 3],
+//             [1, 2, 3],
+//             [1, 2, 3]]
+// length =   [[2, 3, 4],
+//             [4, 3, 2],
+//             [5, 5, 5]]
+//
+// output = [[b'en', b'eve', b'lve'],
+//           [b'hirt', b'urt', b'te'],
+//           [b'ixtee', b'vente', b'hteen']]
 // ```
 //
-// and
+// Broadcasting `pos` and `len` onto `input`:
 //
 // ```
-//     index = [ 2]
-//             [10]
-//     values = [4, 5]
-//     shape = [30]
+// input = [[b'ten', b'eleven', b'twelve'],
+//          [b'thirteen', b'fourteen', b'fifteen'],
+//          [b'sixteen', b'seventeen', b'eighteen'],
+//          [b'nineteen', b'twenty', b'twentyone']]
+// position = [1, 2, 3]
+// length =   [1, 2, 3]
+//
+// output = [[b'e', b'ev', b'lve'],
+//           [b'h', b'ur', b'tee'],
+//           [b'i', b've', b'hte'],
+//           [b'i', b'en', b'nty']]
 // ```
 //
-// then the final `SparseTensor` will be:
+// Broadcasting `input` onto `pos` and `len`:
 //
 // ```
-//     index = [0  0]
-//             [0 10]
-//             [0 20]
-//             [1  2]
-//             [1 10]
-//     values = [1, 2, 3, 4, 5]
-//     shape = [2 50]
+// input = b'thirteen'
+// position = [1, 5, 7]
+// length =   [3, 2, 1]
+//
+// output = [b'hir', b'ee', b'n']
 // ```
 //
 // Arguments:
-//	sparse_handles: 1-D, The `N` serialized `SparseTensor` objects.
-// Shape: `[N]`.
-//	dtype: The `dtype` of the `SparseTensor` objects stored in the
-// `SparseTensorsMap`.
+//	input: Tensor of strings
+//	pos: Scalar defining the position of first character in each substring
+//	len: Scalar defining the number of characters to include in each substring
 //
-// Returns 2-D.  The `indices` of the minibatch `SparseTensor`.1-D.  The `values` of the minibatch `SparseTensor`.1-D.  The `shape` of the minibatch `SparseTensor`.
-func TakeManySparseFromTensorsMap(scope *Scope, sparse_handles tf.Output, dtype tf.DataType, optional ...TakeManySparseFromTensorsMapAttr) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+// Returns Tensor of substrings
+func Substr(scope *Scope, input tf.Output, pos tf.Output, len tf.Output, optional ...SubstrAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Substr",
+		Input: []tf.Input{
+			input, pos, len,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Exits the current frame to its parent frame.
+//
+// Exit makes its input `data` available to the parent frame.
+//
+// Arguments:
+//	data: The tensor to be made available to the parent frame.
+//
+// Returns The same tensor as `data`.
+func Exit(scope *Scope, data tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "TakeManySparseFromTensorsMap",
+		Type: "Exit",
 		Input: []tf.Input{
-			sparse_handles,
+			data,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Assigns a new value to a variable.
+// RetrieveTPUEmbeddingProximalAdagradParametersAttr is an optional argument to RetrieveTPUEmbeddingProximalAdagradParameters.
+type RetrieveTPUEmbeddingProximalAdagradParametersAttr func(optionalAttr)
+
+// RetrieveTPUEmbeddingProximalAdagradParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// Any ReadVariableOp with a control dependency on this op is guaranteed to return
-// this value or a subsequent newer value of the variable.
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingProximalAdagradParametersTableId(value int64) RetrieveTPUEmbeddingProximalAdagradParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// RetrieveTPUEmbeddingProximalAdagradParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingProximalAdagradParametersTableName(value string) RetrieveTPUEmbeddingProximalAdagradParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Retrieve proximal Adagrad embedding parameters.
 //
-// Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	value: the value to set the new tensor to use.
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
 //
-// Returns the created operation.
-func AssignVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
+// Returns Parameter parameters updated by the proximal Adagrad optimization algorithm.Parameter accumulators updated by the proximal Adagrad optimization algorithm.
+func RetrieveTPUEmbeddingProximalAdagradParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingProximalAdagradParametersAttr) (parameters tf.Output, accumulators tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "AssignVariableOp",
-		Input: []tf.Input{
-			resource, value,
-		},
+		Type: "RetrieveTPUEmbeddingProximalAdagradParameters",
+
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
 }
 
-// Strip leading and trailing whitespaces from the Tensor.
+// Produce a string tensor that encodes the state of a Reader.
 //
-// Arguments:
-//	input: A string `Tensor` of any shape.
+// Not all Readers support being serialized, so this can produce an
+// Unimplemented error.
 //
-// Returns A string `Tensor` of the same shape as the input.
-func StringStrip(scope *Scope, input tf.Output) (output tf.Output) {
+// Arguments:
+//	reader_handle: Handle to a Reader.
+func ReaderSerializeStateV2(scope *Scope, reader_handle tf.Output) (state tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "StringStrip",
+		Type: "ReaderSerializeStateV2",
 		Input: []tf.Input{
-			input,
+			reader_handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns a tensor of ones with the same shape and type as x.
-//
-// Arguments:
-//	x: a tensor of type T.
+// Returns the number of tensors in the input tensor list.
 //
-// Returns a tensor of the same shape and type as x but filled with ones.
-func OnesLike(scope *Scope, x tf.Output) (y tf.Output) {
+// input_handle: the input list
+// length: the number of tensors in the list
+func TensorListLength(scope *Scope, input_handle tf.Output) (length tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "OnesLike",
+		Type: "TensorListLength",
 		Input: []tf.Input{
-			x,
+			input_handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// The gradient of SparseFillEmptyRows.
-//
-// Takes vectors reverse_index_map, shaped `[N]`, and grad_values,
-// shaped `[N_full]`, where `N_full >= N` and copies data into either
-// `d_values` or `d_default_value`.  Here `d_values` is shaped `[N]` and
-// `d_default_value` is a scalar.
-//
-//   d_values[j] = grad_values[reverse_index_map[j]]
-//   d_default_value = sum_{k : 0 .. N_full - 1} (
-//      grad_values[k] * 1{k not in reverse_index_map})
+// Creates a dataset with a range of values. Corresponds to python's xrange.
 //
 // Arguments:
-//	reverse_index_map: 1-D.  The reverse index map from SparseFillEmptyRows.
-//	grad_values: 1-D.  The gradients from backprop.
+//	start: corresponds to start in python's xrange().
+//	stop: corresponds to stop in python's xrange().
+//	step: corresponds to step in python's xrange().
 //
-// Returns 1-D.  The backprop into values.0-D.  The backprop into default_value.
-func SparseFillEmptyRowsGrad(scope *Scope, reverse_index_map tf.Output, grad_values tf.Output) (d_values tf.Output, d_default_value tf.Output) {
+//
+func RangeDataset(scope *Scope, start tf.Output, stop tf.Output, step tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "SparseFillEmptyRowsGrad",
+		Type: "RangeDataset",
 		Input: []tf.Input{
-			reverse_index_map, grad_values,
+			start, stop, step,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Computes scaled exponential linear: `scale * alpha * (exp(features) - 1)`
-//
-// if < 0, `scale * features` otherwise.
-//
-// To be used together with
-// `initializer = tf.variance_scaling_initializer(factor=1.0, mode='FAN_IN')`.
-// For correct dropout, use `tf.contrib.nn.alpha_dropout`.
-//
-// See [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
-func Selu(scope *Scope, features tf.Output) (activations tf.Output) {
+// Computes inverse hyperbolic sine of x element-wise.
+func Asinh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Selu",
+		Type: "Asinh",
 		Input: []tf.Input{
-			features,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SetSizeAttr is an optional argument to SetSize.
-type SetSizeAttr func(optionalAttr)
+// UnicodeTranscodeAttr is an optional argument to UnicodeTranscode.
+type UnicodeTranscodeAttr func(optionalAttr)
 
-// SetSizeValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func SetSizeValidateIndices(value bool) SetSizeAttr {
+// UnicodeTranscodeErrors sets the optional errors attribute to value.
+//
+// value: Error handling policy when there is invalid formatting found in the input.
+// The value of 'strict' will cause the operation to produce a InvalidArgument
+// error on any invalid input formatting. A value of 'replace' (the default) will
+// cause the operation to replace any invalid formatting in the input with the
+// `replacement_char` codepoint. A value of 'ignore' will cause the operation to
+// skip any invalid formatting in the input and produce no corresponding output
+// character.
+// If not specified, defaults to "replace"
+func UnicodeTranscodeErrors(value string) UnicodeTranscodeAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["errors"] = value
 	}
 }
 
-// Number of unique elements along last dimension of input `set`.
+// UnicodeTranscodeReplacementChar sets the optional replacement_char attribute to value.
 //
-// Input `set` is a `SparseTensor` represented by `set_indices`, `set_values`,
-// and `set_shape`. The last dimension contains values in a set, duplicates are
-// allowed but ignored.
+// value: The replacement character codepoint to be used in place of any invalid
+// formatting in the input when `errors='replace'`. Any valid unicode codepoint may
+// be used. The default value is the default unicode replacement character is
+// 0xFFFD or U+65533.)
 //
-// If `validate_indices` is `True`, this op validates the order and range of `set`
-// indices.
+// Note that for UTF-8, passing a replacement character expressible in 1 byte, such
+// as ' ', will preserve string alignment to the source since invalid bytes will be
+// replaced with a 1-byte replacement. For UTF-16-BE and UTF-16-LE, any 1 or 2 byte
+// replacement character will preserve byte alignment to the source.
+// If not specified, defaults to 65533
+func UnicodeTranscodeReplacementChar(value int64) UnicodeTranscodeAttr {
+	return func(m optionalAttr) {
+		m["replacement_char"] = value
+	}
+}
+
+// UnicodeTranscodeReplaceControlCharacters sets the optional replace_control_characters attribute to value.
+//
+// value: Whether to replace the C0 control characters (00-1F) with the
+// `replacement_char`. Default is false.
+// If not specified, defaults to false
+func UnicodeTranscodeReplaceControlCharacters(value bool) UnicodeTranscodeAttr {
+	return func(m optionalAttr) {
+		m["replace_control_characters"] = value
+	}
+}
+
+// Transcode the input text from a source encoding to a destination encoding.
+//
+// The input is a string tensor of any shape. The output is a string tensor of
+// the same shape containing the transcoded strings. Output strings are always
+// valid unicode. If the input contains invalid encoding positions, the
+// `errors` attribute sets the policy for how to deal with them. If the default
+// error-handling policy is used, invalid formatting will be substituted in the
+// output by the `replacement_char`. If the errors policy is to `ignore`, any
+// invalid encoding positions in the input are skipped and not included in the
+// output. If it set to `strict` then any invalid formatting will result in an
+// InvalidArgument error.
+//
+// This operation can be used with `output_encoding = input_encoding` to enforce
+// correct formatting for inputs even if they are already in the desired encoding.
+//
+// If the input is prefixed by a Byte Order Mark needed to determine encoding
+// (e.g. if the encoding is UTF-16 and the BOM indicates big-endian), then that
+// BOM will be consumed and not emitted into the output. If the input encoding
+// is marked with an explicit endianness (e.g. UTF-16-BE), then the BOM is
+// interpreted as a non-breaking-space and is preserved in the output (including
+// always for UTF-8).
+//
+// The end result is that if the input is marked as an explicit endianness the
+// transcoding is faithful to all codepoints in the source. If it is not marked
+// with an explicit endianness, the BOM is not considered part of the string itself
+// but as metadata, and so is not preserved in the output.
 //
 // Arguments:
-//	set_indices: 2D `Tensor`, indices of a `SparseTensor`.
-//	set_values: 1D `Tensor`, values of a `SparseTensor`.
-//	set_shape: 1D `Tensor`, shape of a `SparseTensor`.
+//	input: The text to be processed. Can have any shape.
+//	input_encoding: Text encoding of the input strings. This is any of the encodings supported
+// by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
+//	output_encoding: The unicode encoding to use in the output. Must be one of
+// `"UTF-8", "UTF-16-BE", "UTF-32-BE"`. Multi-byte encodings will be big-endian.
 //
-// Returns For `set` ranked `n`, this is a `Tensor` with rank `n-1`, and the same 1st
-// `n-1` dimensions as `set`. Each value is the number of unique elements in
-// the corresponding `[0...n-1]` dimension of `set`.
-func SetSize(scope *Scope, set_indices tf.Output, set_values tf.Output, set_shape tf.Output, optional ...SetSizeAttr) (size tf.Output) {
+// Returns A string tensor containing unicode text encoded using `output_encoding`.
+func UnicodeTranscode(scope *Scope, input tf.Output, input_encoding string, output_encoding string, optional ...UnicodeTranscodeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"input_encoding": input_encoding, "output_encoding": output_encoding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SetSize",
+		Type: "UnicodeTranscode",
 		Input: []tf.Input{
-			set_indices, set_values, set_shape,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -22020,283 +26695,408 @@ func SetSize(scope *Scope, set_indices tf.Output, set_values tf.Output, set_shap
 	return op.Output(0)
 }
 
-// Computes the sign and the log of the absolute value of the determinant of
+// UnicodeDecodeAttr is an optional argument to UnicodeDecode.
+type UnicodeDecodeAttr func(optionalAttr)
+
+// UnicodeDecodeErrors sets the optional errors attribute to value.
 //
-// one or more square matrices.
+// value: Error handling policy when there is invalid formatting found in the input.
+// The value of 'strict' will cause the operation to produce a InvalidArgument
+// error on any invalid input formatting. A value of 'replace' (the default) will
+// cause the operation to replace any invalid formatting in the input with the
+// `replacement_char` codepoint. A value of 'ignore' will cause the operation to
+// skip any invalid formatting in the input and produce no corresponding output
+// character.
+// If not specified, defaults to "replace"
+func UnicodeDecodeErrors(value string) UnicodeDecodeAttr {
+	return func(m optionalAttr) {
+		m["errors"] = value
+	}
+}
+
+// UnicodeDecodeReplacementChar sets the optional replacement_char attribute to value.
 //
-// The input is a tensor of shape `[N, M, M]` whose inner-most 2 dimensions
-// form square matrices. The outputs are two tensors containing the signs and
-// absolute values of the log determinants for all N input submatrices
-// `[..., :, :]` such that the determinant = sign*exp(log_abs_determinant).
-// The log_abs_determinant is computed as det(P)*sum(log(diag(LU))) where LU
-// is the LU decomposition of the input and P is the corresponding
-// permutation matrix.
+// value: The replacement character codepoint to be used in place of any invalid
+// formatting in the input when `errors='replace'`. Any valid unicode codepoint may
+// be used. The default value is the default unicode replacement character is
+// 0xFFFD or U+65533.)
+// If not specified, defaults to 65533
+func UnicodeDecodeReplacementChar(value int64) UnicodeDecodeAttr {
+	return func(m optionalAttr) {
+		m["replacement_char"] = value
+	}
+}
+
+// UnicodeDecodeReplaceControlCharacters sets the optional replace_control_characters attribute to value.
+//
+// value: Whether to replace the C0 control characters (00-1F) with the
+// `replacement_char`. Default is false.
+// If not specified, defaults to false
+func UnicodeDecodeReplaceControlCharacters(value bool) UnicodeDecodeAttr {
+	return func(m optionalAttr) {
+		m["replace_control_characters"] = value
+	}
+}
+
+// Decodes each string in `input` into a sequence of Unicode code points.
+//
+// The character codepoints for all strings are returned using a single vector
+// `char_values`, with strings expanded to characters in row-major order.
+//
+// The `row_splits` tensor indicates where the codepoints for
+// each input string begin and end within the `char_values` tensor.
+// In particular, the values for the `i`th
+// string (in row-major order) are stored in the slice
+// `[row_splits[i]:row_splits[i+1]]`. Thus:
+//
+// * `char_values[row_splits[i]+j]` is the Unicode codepoint for the `j`th
+//   character in the `i`th string (in row-major order).
+// * `row_splits[i+1] - row_splits[i]` is the number of characters in the `i`th
+//   string (in row-major order).
 //
 // Arguments:
-//	input: Shape is `[N, M, M]`.
+//	input: The text to be decoded. Can have any shape. Note that the output is flattened
+// to a vector of char values.
+//	input_encoding: Text encoding of the input strings. This is any of the encodings supported
+// by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
 //
-// Returns The signs of the log determinants of the inputs. Shape is `[N]`.The logs of the absolute values of the determinants
-// of the N input matrices.  Shape is `[N]`.
-func LogMatrixDeterminant(scope *Scope, input tf.Output) (sign tf.Output, log_abs_determinant tf.Output) {
+// Returns A 1D int32 tensor containing the row splits.A 1D int32 Tensor containing the decoded codepoints.
+func UnicodeDecode(scope *Scope, input tf.Output, input_encoding string, optional ...UnicodeDecodeAttr) (row_splits tf.Output, char_values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"input_encoding": input_encoding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "LogMatrixDeterminant",
+		Type: "UnicodeDecode",
 		Input: []tf.Input{
 			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0), op.Output(1)
 }
 
-// Copy a tensor setting everything outside a central band in each innermost matrix
-//
-// to zero.
-//
-// The `band` part is computed as follows:
-// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
-// tensor with the same shape where
-//
-// `band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`.
-//
-// The indicator function
-//
-// `in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) &&
-//                  (num_upper < 0 || (n-m) <= num_upper)`.
-//
-// For example:
-//
-// ```
-// # if 'input' is [[ 0,  1,  2, 3]
-//                  [-1,  0,  1, 2]
-//                  [-2, -1,  0, 1]
-//                  [-3, -2, -1, 0]],
-//
-// tf.matrix_band_part(input, 1, -1) ==> [[ 0,  1,  2, 3]
-//                                        [-1,  0,  1, 2]
-//                                        [ 0, -1,  0, 1]
-//                                        [ 0,  0, -1, 0]],
-//
-// tf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]
-//                                       [-1,  0,  1, 0]
-//                                       [-2, -1,  0, 1]
-//                                       [ 0, -2, -1, 0]]
-// ```
+// Adds up a SparseTensor and a dense Tensor, using these special rules:
 //
-// Useful special cases:
+// (1) Broadcasts the dense side to have the same shape as the sparse side, if
+//     eligible;
+// (2) Then, only the dense values pointed to by the indices of the SparseTensor
+//     participate in the cwise addition.
 //
-// ```
-//  tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.
-//  tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.
-//  tf.matrix_band_part(input, 0, 0) ==> Diagonal.
-// ```
+// By these rules, the result is a logical SparseTensor with exactly the same
+// indices and shape, but possibly with different non-zero values.  The output of
+// this Op is the resultant non-zero values.
 //
 // Arguments:
-//	input: Rank `k` tensor.
-//	num_lower: 0-D tensor. Number of subdiagonals to keep. If negative, keep entire
-// lower triangle.
-//	num_upper: 0-D tensor. Number of superdiagonals to keep. If negative, keep
-// entire upper triangle.
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
 //
-// Returns Rank `k` tensor of the same shape as input. The extracted banded tensor.
-func MatrixBandPart(scope *Scope, input tf.Output, num_lower tf.Output, num_upper tf.Output) (band tf.Output) {
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseAdd(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixBandPart",
+		Type: "SparseDenseCwiseAdd",
 		Input: []tf.Input{
-			input, num_lower, num_upper,
+			sp_indices, sp_values, sp_shape, dense,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Delete the tensor specified by its handle in the session.
+// ResourceApplyRMSPropAttr is an optional argument to ResourceApplyRMSProp.
+type ResourceApplyRMSPropAttr func(optionalAttr)
+
+// ResourceApplyRMSPropUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var, ms, and mom tensors is protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyRMSPropUseLocking(value bool) ResourceApplyRMSPropAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the RMSProp algorithm.
+//
+// Note that in dense implementation of this algorithm, ms and mom will
+// update even if the grad is zero, but in this sparse implementation, ms
+// and mom will not update in iterations during which the grad is zero.
+//
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+//
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
 //
 // Arguments:
-//	handle: The handle for a tensor stored in the session state.
+//	var_: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
+//
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
 //
 // Returns the created operation.
-func DeleteSessionTensor(scope *Scope, handle tf.Output) (o *tf.Operation) {
+func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "DeleteSessionTensor",
+		Type: "ResourceApplyRMSProp",
 		Input: []tf.Input{
-			handle,
+			var_, ms, mom, lr, rho, momentum, epsilon, grad,
 		},
+		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// L2 Loss.
+// StatelessTruncatedNormalAttr is an optional argument to StatelessTruncatedNormal.
+type StatelessTruncatedNormalAttr func(optionalAttr)
+
+// StatelessTruncatedNormalDtype sets the optional dtype attribute to value.
 //
-// Computes half the L2 norm of a tensor without the `sqrt`:
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessTruncatedNormalDtype(value tf.DataType) StatelessTruncatedNormalAttr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
+	}
+}
+
+// Outputs deterministic pseudorandom values from a truncated normal distribution.
 //
-//     output = sum(t ** 2) / 2
+// The generated values follow a normal distribution with mean 0 and standard
+// deviation 1, except that values whose magnitude is more than 2 standard
+// deviations from the mean are dropped and re-picked.
+//
+// The outputs are a deterministic function of `shape` and `seed`.
 //
 // Arguments:
-//	t: Typically 2-D, but may have any dimensions.
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
 //
-// Returns 0-D.
-func L2Loss(scope *Scope, t tf.Output) (output tf.Output) {
+// Returns Random values with specified shape.
+func StatelessTruncatedNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessTruncatedNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "L2Loss",
+		Type: "StatelessTruncatedNormal",
 		Input: []tf.Input{
-			t,
+			shape, seed,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DenseToSparseSetOperationAttr is an optional argument to DenseToSparseSetOperation.
-type DenseToSparseSetOperationAttr func(optionalAttr)
+// RestoreSliceAttr is an optional argument to RestoreSlice.
+type RestoreSliceAttr func(optionalAttr)
 
-// DenseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func DenseToSparseSetOperationValidateIndices(value bool) DenseToSparseSetOperationAttr {
+// RestoreSlicePreferredShard sets the optional preferred_shard attribute to value.
+//
+// value: Index of file to open first if multiple files match
+// `file_pattern`. See the documentation for `Restore`.
+// If not specified, defaults to -1
+func RestoreSlicePreferredShard(value int64) RestoreSliceAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["preferred_shard"] = value
 	}
 }
 
-// Applies set operation along last dimension of `Tensor` and `SparseTensor`.
-//
-// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
-//
-// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
-// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
+// Restores a tensor from checkpoint files.
 //
-// If `validate_indices` is `True`, this op validates the order and range of `set2`
-// indices.
+// This is like `Restore` except that restored tensor can be listed as filling
+// only a slice of a larger tensor.  `shape_and_slice` specifies the shape of the
+// larger tensor and the slice that the restored tensor covers.
 //
-// Output `result` is a `SparseTensor` represented by `result_indices`,
-// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-// dimension contains the result of `set_operation` applied to the corresponding
-// `[0...n-1]` dimension of `set`.
+// The `shape_and_slice` input has the same format as the
+// elements of the `shapes_and_slices` input of the `SaveSlices` op.
 //
 // Arguments:
-//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
-// Dimension `n` contains values in a set, duplicates are allowed but ignored.
-//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
-// be the same as the 1st `n-1` dimensions of `set1`, `result_shape[n]` is the
-// max set size across `n-1` dimensions.
-//
+//	file_pattern: Must have a single element. The pattern of the files from
+// which we read the tensor.
+//	tensor_name: Must have a single element. The name of the tensor to be
+// restored.
+//	shape_and_slice: Scalar. The shapes and slice specifications to use when
+// restoring a tensors.
+//	dt: The type of the tensor to be restored.
 //
-// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
-// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
-// is the max result set size across all `0...n-1` dimensions.
-func DenseToSparseSetOperation(scope *Scope, set1 tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...DenseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+// Returns The restored tensor.
+func RestoreSlice(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, shape_and_slice tf.Output, dt tf.DataType, optional ...RestoreSliceAttr) (tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"set_operation": set_operation}
+	attrs := map[string]interface{}{"dt": dt}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DenseToSparseSetOperation",
+		Type: "RestoreSlice",
+		Input: []tf.Input{
+			file_pattern, tensor_name, shape_and_slice,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the element-wise sum of a list of tensors.
+//
+// `tf.accumulate_n_v2` performs the same operation as `tf.add_n`, but does not
+// wait for all of its inputs to be ready before beginning to sum. This can
+// save memory if inputs are ready at different times, since minimum temporary
+// storage is proportional to the output size rather than the inputs size.
+//
+// Unlike the original `accumulate_n`, `accumulate_n_v2` is differentiable.
+//
+// Returns a `Tensor` of same shape and type as the elements of `inputs`.
+//
+// Arguments:
+//	inputs: A list of `Tensor` objects, each with same shape and type.
+//	shape: Shape of elements of `inputs`.
+func AccumulateNV2(scope *Scope, inputs []tf.Output, shape tf.Shape) (sum tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"shape": shape}
+	opspec := tf.OpSpec{
+		Type: "AccumulateNV2",
 		Input: []tf.Input{
-			set1, set2_indices, set2_values, set2_shape,
+			tf.OutputList(inputs),
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Subtracts a value from the current value of a variable.
+// Convert the quantized 'input' tensor into a lower-precision 'output', using the
 //
-// Any ReadVariableOp with a control dependency on this op is guaranteed to
-// see the decremented value or a subsequent newer one.
+// actual distribution of the values to maximize the usage of the lower bit depth
+// and adjusting the output min and max ranges accordingly.
+//
+// [input_min, input_max] are scalar floats that specify the range for the float
+// interpretation of the 'input' data. For example, if input_min is -1.0f and
+// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
+// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
+//
+// This operator tries to squeeze as much precision as possible into an output with
+// a lower bit depth by calculating the actual min and max values found in the
+// data. For example, maybe that quint16 input has no values lower than 16,384 and
+// none higher than 49,152. That means only half the range is actually needed, all
+// the float interpretations are between -0.5f and 0.5f, so if we want to compress
+// the data into a quint8 output, we can use that range rather than the theoretical
+// -1.0f to 1.0f that is suggested by the input min and max.
+//
+// In practice, this is most useful for taking output from operations like
+// QuantizedMatMul that can produce higher bit-depth outputs than their inputs and
+// may have large potential output ranges, but in practice have a distribution of
+// input values that only uses a small fraction of the possible range. By feeding
+// that output into this operator, we can reduce it from 32 bits down to 8 with
+// minimal loss of accuracy.
 //
 // Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	value: the value by which the variable will be incremented.
 //
-// Returns the created operation.
-func AssignSubVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
+//	input_min: The float value that the minimum quantized input value represents.
+//	input_max: The float value that the maximum quantized input value represents.
+//	out_type: The type of the output. Should be a lower bit depth than Tinput.
+//
+// Returns The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
+func QuantizeDownAndShrinkRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "AssignSubVariableOp",
+		Type: "QuantizeDownAndShrinkRange",
 		Input: []tf.Input{
-			resource, value,
+			input, input_min, input_max,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// RestoreAttr is an optional argument to Restore.
-type RestoreAttr func(optionalAttr)
+// RandomGammaAttr is an optional argument to RandomGamma.
+type RandomGammaAttr func(optionalAttr)
 
-// RestorePreferredShard sets the optional preferred_shard attribute to value.
+// RandomGammaSeed sets the optional seed attribute to value.
 //
-// value: Index of file to open first if multiple files match
-// `file_pattern`.
-// If not specified, defaults to -1
-func RestorePreferredShard(value int64) RestoreAttr {
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomGammaSeed(value int64) RandomGammaAttr {
 	return func(m optionalAttr) {
-		m["preferred_shard"] = value
+		m["seed"] = value
 	}
 }
 
-// Restores a tensor from checkpoint files.
-//
-// Reads a tensor stored in one or several files. If there are several files (for
-// instance because a tensor was saved as slices), `file_pattern` may contain
-// wildcard symbols (`*` and `?`) in the filename portion only, not in the
-// directory portion.
+// RandomGammaSeed2 sets the optional seed2 attribute to value.
 //
-// If a `file_pattern` matches several files, `preferred_shard` can be used to hint
-// in which file the requested tensor is likely to be found. This op will first
-// open the file at index `preferred_shard` in the list of matching files and try
-// to restore tensors from that file.  Only if some tensors or tensor slices are
-// not found in that first file, then the Op opens all the files. Setting
-// `preferred_shard` to match the value passed as the `shard` input
-// of a matching `Save` Op may speed up Restore.  This attribute only affects
-// performance, not correctness.  The default value -1 means files are processed in
-// order.
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomGammaSeed2(value int64) RandomGammaAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from the Gamma distribution(s) described by alpha.
 //
-// See also `RestoreSlice`.
+// This op uses the algorithm by Marsaglia et al. to acquire samples via
+// transformation-rejection from pairs of uniform and normal random variables.
+// See http://dl.acm.org/citation.cfm?id=358414
 //
 // Arguments:
-//	file_pattern: Must have a single element. The pattern of the files from
-// which we read the tensor.
-//	tensor_name: Must have a single element. The name of the tensor to be
-// restored.
-//	dt: The type of the tensor to be restored.
+//	shape: 1-D integer tensor. Shape of independent samples to draw from each
+// distribution described by the shape parameters given in alpha.
+//	alpha: A tensor in which each scalar is a "shape" parameter describing the
+// associated gamma distribution.
 //
-// Returns The restored tensor.
-func Restore(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, dt tf.DataType, optional ...RestoreAttr) (tensor tf.Output) {
+// Returns A tensor with shape `shape + shape(alpha)`. Each slice
+// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
+// `alpha[i0, i1, ...iN]`. The dtype of the output matches the dtype of alpha.
+func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...RandomGammaAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dt": dt}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Restore",
+		Type: "RandomGamma",
 		Input: []tf.Input{
-			file_pattern, tensor_name,
+			shape, alpha,
 		},
 		Attrs: attrs,
 	}
@@ -22304,34 +27104,66 @@ func Restore(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, dt tf.
 	return op.Output(0)
 }
 
-// QuantizedResizeBilinearAttr is an optional argument to QuantizedResizeBilinear.
-type QuantizedResizeBilinearAttr func(optionalAttr)
+// ResourceScatterNdSubAttr is an optional argument to ResourceScatterNdSub.
+type ResourceScatterNdSubAttr func(optionalAttr)
 
-// QuantizedResizeBilinearAlignCorners sets the optional align_corners attribute to value.
+// ResourceScatterNdSubUseLocking sets the optional use_locking attribute to value.
 //
-// value: If true, the centers of the 4 corner pixels of the input and output tensors are
-// aligned, preserving the values at the corner pixels. Defaults to false.
-// If not specified, defaults to false
-func QuantizedResizeBilinearAlignCorners(value bool) QuantizedResizeBilinearAttr {
+// value: An optional bool. Defaults to True. If True, the assignment will
+// be protected by a lock; otherwise the behavior is undefined,
+// but may exhibit less contention.
+// If not specified, defaults to true
+func ResourceScatterNdSubUseLocking(value bool) ResourceScatterNdSubAttr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Resize quantized `images` to `size` using quantized bilinear interpolation.
+// Applies sparse subtraction to individual values or slices in a Variable.
 //
-// Input images and output images must be quantized types.
+// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
 //
-// Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
+// `indices` must be integer tensor, containing indices into `ref`.
+// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+//
+// The innermost dimension of `indices` (with length `K`) corresponds to
+// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+// dimension of `ref`.
+//
+// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
 //
+// ```
+// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]]
+// ```
 //
+// For example, say we want to subtract 4 scattered elements from a rank-1 tensor
+// with 8 elements. In Python, that subtraction would look like this:
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func QuantizedResizeBilinear(scope *Scope, images tf.Output, size tf.Output, min tf.Output, max tf.Output, optional ...QuantizedResizeBilinearAttr) (resized_images tf.Output, out_min tf.Output, out_max tf.Output) {
+// ```python
+// ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True)
+// indices = tf.constant([[4], [3], [1], [7]])
+// updates = tf.constant([9, 10, 11, 12])
+// sub = tf.scatter_nd_sub(ref, indices, updates)
+// with tf.Session() as sess:
+//   print sess.run(sub)
+// ```
+//
+// The resulting update to ref would look like this:
+//
+//     [1, -9, 3, -6, -4, 6, 7, -4]
+//
+// See `tf.scatter_nd` for more details about how to make updates to
+// slices.
+//
+// Arguments:
+//	ref: A resource handle. Must be from a VarHandleOp.
+//	indices: A Tensor. Must be one of the following types: int32, int64.
+// A tensor of indices into ref.
+//	updates: A Tensor. Must have the same type as ref. A tensor of
+// values to add to ref.
+//
+// Returns the created operation.
+func ResourceScatterNdSub(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdSubAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -22340,344 +27172,267 @@ func QuantizedResizeBilinear(scope *Scope, images tf.Output, size tf.Output, min
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedResizeBilinear",
+		Type: "ResourceScatterNdSub",
 		Input: []tf.Input{
-			images, size, min, max,
+			ref, indices, updates,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// SdcaOptimizerAttr is an optional argument to SdcaOptimizer.
-type SdcaOptimizerAttr func(optionalAttr)
-
-// SdcaOptimizerAdaptative sets the optional adaptative attribute to value.
-//
-// value: Whether to use Adaptive SDCA for the inner loop.
-// If not specified, defaults to true
-func SdcaOptimizerAdaptative(value bool) SdcaOptimizerAttr {
-	return func(m optionalAttr) {
-		m["adaptative"] = value
-	}
+	return scope.AddOperation(opspec)
 }
 
-// Distributed version of Stochastic Dual Coordinate Ascent (SDCA) optimizer for
-//
-// linear models with L1 + L2 regularization. As global optimization objective is
-// strongly-convex, the optimizer optimizes the dual objective at each step. The
-// optimizer applies each update one example at a time. Examples are sampled
-// uniformly, and the optimizer is learning rate free and enjoys linear convergence
-// rate.
-//
-// [Proximal Stochastic Dual Coordinate Ascent](http://arxiv.org/pdf/1211.2717v1.pdf).<br>
-// Shai Shalev-Shwartz, Tong Zhang. 2012
-//
-// $$Loss Objective = \sum f_{i} (wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|$$
+// Outputs deterministic pseudorandom random integers from a uniform distribution.
 //
-// [Adding vs. Averaging in Distributed Primal-Dual Optimization](http://arxiv.org/abs/1502.03508).<br>
-// Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan,
-// Peter Richtarik, Martin Takac. 2015
+// The generated values follow a uniform distribution in the range `[minval, maxval)`.
 //
-// [Stochastic Dual Coordinate Ascent with Adaptive Probabilities](https://arxiv.org/abs/1502.08053).<br>
-// Dominik Csiba, Zheng Qu, Peter Richtarik. 2015
+// The outputs are a deterministic function of `shape`, `seed`, `minval`, and `maxval`.
 //
 // Arguments:
-//	sparse_example_indices: a list of vectors which contain example indices.
-//	sparse_feature_indices: a list of vectors which contain feature indices.
-//	sparse_feature_values: a list of vectors which contains feature value
-// associated with each feature group.
-//	dense_features: a list of matrices which contains the dense feature values.
-//	example_weights: a vector which contains the weight associated with each
-// example.
-//	example_labels: a vector which contains the label/target associated with each
-// example.
-//	sparse_indices: a list of vectors where each value is the indices which has
-// corresponding weights in sparse_weights. This field maybe omitted for the
-// dense approach.
-//	sparse_weights: a list of vectors where each value is the weight associated with
-// a sparse feature group.
-//	dense_weights: a list of vectors where the values are the weights associated
-// with a dense feature group.
-//	example_state_data: a list of vectors containing the example state data.
-//	loss_type: Type of the primal loss. Currently SdcaSolver supports logistic,
-// squared and hinge losses.
-//	l1: Symmetric l1 regularization strength.
-//	l2: Symmetric l2 regularization strength.
-//	num_loss_partitions: Number of partitions of the global loss function.
-//	num_inner_iterations: Number of iterations per mini-batch.
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
+//	minval: Minimum value (inclusive, scalar).
+//	maxval: Maximum value (exclusive, scalar).
 //
-// Returns a list of vectors containing the updated example state
-// data.a list of vectors where each value is the delta
-// weights associated with a sparse feature group.a list of vectors where the values are the delta
-// weights associated with a dense feature group.
-func SdcaOptimizer(scope *Scope, sparse_example_indices []tf.Output, sparse_feature_indices []tf.Output, sparse_feature_values []tf.Output, dense_features []tf.Output, example_weights tf.Output, example_labels tf.Output, sparse_indices []tf.Output, sparse_weights []tf.Output, dense_weights []tf.Output, example_state_data tf.Output, loss_type string, l1 float32, l2 float32, num_loss_partitions int64, num_inner_iterations int64, optional ...SdcaOptimizerAttr) (out_example_state_data tf.Output, out_delta_sparse_weights []tf.Output, out_delta_dense_weights []tf.Output) {
+// Returns Random values with specified shape.
+func StatelessRandomUniformInt(scope *Scope, shape tf.Output, seed tf.Output, minval tf.Output, maxval tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"loss_type": loss_type, "l1": l1, "l2": l2, "num_loss_partitions": num_loss_partitions, "num_inner_iterations": num_inner_iterations}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "SdcaOptimizer",
+		Type: "StatelessRandomUniformInt",
 		Input: []tf.Input{
-			tf.OutputList(sparse_example_indices), tf.OutputList(sparse_feature_indices), tf.OutputList(sparse_feature_values), tf.OutputList(dense_features), example_weights, example_labels, tf.OutputList(sparse_indices), tf.OutputList(sparse_weights), tf.OutputList(dense_weights), example_state_data,
+			shape, seed, minval, maxval,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	out_example_state_data = op.Output(idx)
-	if out_delta_sparse_weights, idx, err = makeOutputList(op, idx, "out_delta_sparse_weights"); err != nil {
-		scope.UpdateErr("SdcaOptimizer", err)
-		return
-	}
-	if out_delta_dense_weights, idx, err = makeOutputList(op, idx, "out_delta_dense_weights"); err != nil {
-		scope.UpdateErr("SdcaOptimizer", err)
-		return
-	}
-	return out_example_state_data, out_delta_sparse_weights, out_delta_dense_weights
+	return op.Output(0)
 }
 
-// MatrixTriangularSolveAttr is an optional argument to MatrixTriangularSolve.
-type MatrixTriangularSolveAttr func(optionalAttr)
+// QuantizedConv2DAttr is an optional argument to QuantizedConv2D.
+type QuantizedConv2DAttr func(optionalAttr)
 
-// MatrixTriangularSolveLower sets the optional lower attribute to value.
-//
-// value: Boolean indicating whether the innermost matrices in `matrix` are
-// lower or upper triangular.
-// If not specified, defaults to true
-func MatrixTriangularSolveLower(value bool) MatrixTriangularSolveAttr {
+// QuantizedConv2DOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr {
 	return func(m optionalAttr) {
-		m["lower"] = value
+		m["out_type"] = value
 	}
 }
 
-// MatrixTriangularSolveAdjoint sets the optional adjoint attribute to value.
-//
-// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
-//          adjoint.
+// QuantizedConv2DDilations sets the optional dilations attribute to value.
 //
-// @compatibility(numpy)
-// Equivalent to scipy.linalg.solve_triangular
-// @end_compatibility
-// If not specified, defaults to false
-func MatrixTriangularSolveAdjoint(value bool) MatrixTriangularSolveAttr {
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr {
 	return func(m optionalAttr) {
-		m["adjoint"] = value
+		m["dilations"] = value
 	}
 }
 
-// Solves systems of linear equations with upper or lower triangular matrices by
-//
-// backsubstitution.
-//
-// `matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form
-// square matrices. If `lower` is `True` then the strictly upper triangular part
-// of each inner-most matrix is assumed to be zero and not accessed.
-// If `lower` is False then the strictly lower triangular part of each inner-most
-// matrix is assumed to be zero and not accessed.
-// `rhs` is a tensor of shape `[..., M, K]`.
+// Computes a 2D convolution given quantized 4D input and filter tensors.
 //
-// The output is a tensor of shape `[..., M, K]`. If `adjoint` is
-// `True` then the innermost matrices in `output` satisfy matrix equations
-// `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
-// If `adjoint` is `False` then the strictly then the  innermost matrices in
-// `output` satisfy matrix equations
-// `adjoint(matrix[..., i, k]) * output[..., k, j] = rhs[..., i, j]`.
+// The inputs are quantized tensors where the lowest value represents the real
+// number of the associated minimum, and the highest represents the maximum.
+// This means that you can only interpret the quantized output in the same way, by
+// taking the returned minimum and maximum values into account.
 //
 // Arguments:
-//	matrix: Shape is `[..., M, M]`.
-//	rhs: Shape is `[..., M, K]`.
 //
-// Returns Shape is `[..., M, K]`.
-func MatrixTriangularSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixTriangularSolveAttr) (output tf.Output) {
+//	filter: filter's input_depth dimension must match input's depth dimensions.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	min_filter: The float value that the lowest quantized filter value represents.
+//	max_filter: The float value that the highest quantized filter value represents.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedConv2DAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixTriangularSolve",
+		Type: "QuantizedConv2D",
 		Input: []tf.Input{
-			matrix, rhs,
+			input, filter, min_input, max_input, min_filter, max_filter,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Saves tensors in V2 checkpoint format.
+// ResourceGatherAttr is an optional argument to ResourceGather.
+type ResourceGatherAttr func(optionalAttr)
+
+// ResourceGatherValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func ResourceGatherValidateIndices(value bool) ResourceGatherAttr {
+	return func(m optionalAttr) {
+		m["validate_indices"] = value
+	}
+}
+
+// Gather slices from the variable pointed to by `resource` according to `indices`.
 //
-// By default, saves the named tensors in full.  If the caller wishes to save
-// specific slices of full tensors, "shape_and_slices" should be non-empty strings
-// and correspondingly well-formed.
+// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
+// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
 //
-// Arguments:
-//	prefix: Must have a single element. The prefix of the V2 checkpoint to which we
-// write the tensors.
-//	tensor_names: shape {N}. The names of the tensors to be saved.
-//	shape_and_slices: shape {N}.  The slice specs of the tensors to be saved.
-// Empty strings indicate that they are non-partitioned tensors.
-//	tensors: `N` tensors to save.
+// ```python
+//     # Scalar indices
+//     output[:, ..., :] = params[indices, :, ... :]
 //
-// Returns the created operation.
-func SaveV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, tensors []tf.Output) (o *tf.Operation) {
+//     # Vector indices
+//     output[i, :, ..., :] = params[indices[i], :, ... :]
+//
+//     # Higher rank indices
+//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
+// ```
+func ResourceGather(scope *Scope, resource tf.Output, indices tf.Output, dtype tf.DataType, optional ...ResourceGatherAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SaveV2",
+		Type: "ResourceGather",
 		Input: []tf.Input{
-			prefix, tensor_names, shape_and_slices, tf.OutputList(tensors),
+			resource, indices,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// UnicodeTranscodeAttr is an optional argument to UnicodeTranscode.
-type UnicodeTranscodeAttr func(optionalAttr)
+// StatelessMultinomialAttr is an optional argument to StatelessMultinomial.
+type StatelessMultinomialAttr func(optionalAttr)
 
-// UnicodeTranscodeErrors sets the optional errors attribute to value.
-//
-// value: Error handling policy when there is invalid formatting found in the input.
-// The value of 'strict' will cause the operation to produce a InvalidArgument
-// error on any invalid input formatting. A value of 'replace' (the default) will
-// cause the operation to replace any invalid formatting in the input with the
-// `replacement_char` codepoint. A value of 'ignore' will cause the operation to
-// skip any invalid formatting in the input and produce no corresponding output
-// character.
-// If not specified, defaults to "replace"
-func UnicodeTranscodeErrors(value string) UnicodeTranscodeAttr {
+// StatelessMultinomialOutputDtype sets the optional output_dtype attribute to value.
+// If not specified, defaults to DT_INT64
+func StatelessMultinomialOutputDtype(value tf.DataType) StatelessMultinomialAttr {
 	return func(m optionalAttr) {
-		m["errors"] = value
+		m["output_dtype"] = value
 	}
 }
 
-// UnicodeTranscodeReplacementChar sets the optional replacement_char attribute to value.
+// Draws samples from a multinomial distribution.
 //
-// value: The replacement character codepoint to be used in place of any invalid
-// formatting in the input when `errors='replace'`. Any valid unicode codepoint may
-// be used. The default value is the default unicode replacement character is
-// 0xFFFD or U+65533.)
+// Arguments:
+//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
+// represents the unnormalized log probabilities for all classes.
+//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
+//	seed: 2 seeds (shape [2]).
 //
-// Note that for UTF-8, passing a replacement character expressible in 1 byte, such
-// as ' ', will preserve string alignment to the source since invalid bytes will be
-// replaced with a 1-byte replacement. For UTF-16-BE and UTF-16-LE, any 1 or 2 byte
-// replacement character will preserve byte alignment to the source.
-// If not specified, defaults to 65533
-func UnicodeTranscodeReplacementChar(value int64) UnicodeTranscodeAttr {
-	return func(m optionalAttr) {
-		m["replacement_char"] = value
+// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
+// contains the drawn class labels with range `[0, num_classes)`.
+func StatelessMultinomial(scope *Scope, logits tf.Output, num_samples tf.Output, seed tf.Output, optional ...StatelessMultinomialAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// UnicodeTranscodeReplaceControlCharacters sets the optional replace_control_characters attribute to value.
-//
-// value: Whether to replace the C0 control characters (00-1F) with the
-// `replacement_char`. Default is false.
-// If not specified, defaults to false
-func UnicodeTranscodeReplaceControlCharacters(value bool) UnicodeTranscodeAttr {
-	return func(m optionalAttr) {
-		m["replace_control_characters"] = value
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StatelessMultinomial",
+		Input: []tf.Input{
+			logits, num_samples, seed,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Transcode the input text from a source encoding to a destination encoding.
+// Returns a batched matrix tensor with new batched diagonal values.
 //
-// The input is a string tensor of any shape. The output is a string tensor of
-// the same shape containing the transcoded strings. Output strings are always
-// valid unicode. If the input contains invalid encoding positions, the
-// `errors` attribute sets the policy for how to deal with them. If the default
-// error-handling policy is used, invalid formatting will be substituted in the
-// output by the `replacement_char`. If the errors policy is to `ignore`, any
-// invalid encoding positions in the input are skipped and not included in the
-// output. If it set to `strict` then any invalid formatting will result in an
-// InvalidArgument error.
+// Given `input` and `diagonal`, this operation returns a tensor with the
+// same shape and values as `input`, except for the main diagonal of the
+// innermost matrices.  These will be overwritten by the values in `diagonal`.
 //
-// This operation can be used with `output_encoding = input_encoding` to enforce
-// correct formatting for inputs even if they are already in the desired encoding.
+// The output is computed as follows:
 //
-// If the input is prefixed by a Byte Order Mark needed to determine encoding
-// (e.g. if the encoding is UTF-16 and the BOM indicates big-endian), then that
-// BOM will be consumed and not emitted into the output. If the input encoding
-// is marked with an explicit endianness (e.g. UTF-16-BE), then the BOM is
-// interpreted as a non-breaking-space and is preserved in the output (including
-// always for UTF-8).
+// Assume `input` has `k+1` dimensions `[I, J, K, ..., M, N]` and `diagonal` has
+// `k` dimensions `[I, J, K, ..., min(M, N)]`.  Then the output is a
+// tensor of rank `k+1` with dimensions `[I, J, K, ..., M, N]` where:
 //
-// The end result is that if the input is marked as an explicit endianness the
-// transcoding is faithful to all codepoints in the source. If it is not marked
-// with an explicit endianness, the BOM is not considered part of the string itself
-// but as metadata, and so is not preserved in the output.
+//   * `output[i, j, k, ..., m, n] = diagonal[i, j, k, ..., n]` for `m == n`.
+//   * `output[i, j, k, ..., m, n] = input[i, j, k, ..., m, n]` for `m != n`.
 //
 // Arguments:
-//	input: The text to be processed. Can have any shape.
-//	input_encoding: Text encoding of the input strings. This is any of the encodings supported
-// by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
-//	output_encoding: The unicode encoding to use in the output. Must be one of
-// `"UTF-8", "UTF-16-BE", "UTF-32-BE"`. Multi-byte encodings will be big-endian.
+//	input: Rank `k+1`, where `k >= 1`.
+//	diagonal: Rank `k`, where `k >= 1`.
 //
-// Returns A string tensor containing unicode text encoded using `output_encoding`.
-func UnicodeTranscode(scope *Scope, input tf.Output, input_encoding string, output_encoding string, optional ...UnicodeTranscodeAttr) (output tf.Output) {
+// Returns Rank `k+1`, with `output.shape = input.shape`.
+func MatrixSetDiag(scope *Scope, input tf.Output, diagonal tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"input_encoding": input_encoding, "output_encoding": output_encoding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "UnicodeTranscode",
+		Type: "MatrixSetDiag",
 		Input: []tf.Input{
-			input,
+			input, diagonal,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes inverse hyperbolic sine of x element-wise.
-func Asinh(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns the element-wise max of two SparseTensors.
+//
+// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
+//
+// Arguments:
+//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, in the canonical lexicographic ordering.
+//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
+//	a_shape: 1-D.  Shape of the input SparseTensor.
+//	b_indices: counterpart to `a_indices` for the other operand.
+//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
+//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
+//
+// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
+func SparseSparseMaximum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Asinh",
+		Type: "SparseSparseMaximum",
 		Input: []tf.Input{
-			x,
+			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Creates a dataset with a range of values. Corresponds to python's xrange.
-//
-// Arguments:
-//	start: corresponds to start in python's xrange().
-//	stop: corresponds to stop in python's xrange().
-//	step: corresponds to step in python's xrange().
-//
+// List of the given size with empty elements.
 //
-func RangeDataset(scope *Scope, start tf.Output, stop tf.Output, step tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// element_shape: the shape of the future elements of the list
+// num_elements: the number of elements to reserve
+// handle: the output list
+// element_dtype: the desired type of elements in the list.
+func TensorListReserve(scope *Scope, element_shape tf.Output, num_elements tf.Output, element_dtype tf.DataType) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
 	opspec := tf.OpSpec{
-		Type: "RangeDataset",
+		Type: "TensorListReserve",
 		Input: []tf.Input{
-			start, stop, step,
+			element_shape, num_elements,
 		},
 		Attrs: attrs,
 	}
@@ -22685,85 +27440,95 @@ func RangeDataset(scope *Scope, start tf.Output, stop tf.Output, step tf.Output,
 	return op.Output(0)
 }
 
-// Stops gradient computation.
+// LoadTPUEmbeddingMDLAdagradLightParametersAttr is an optional argument to LoadTPUEmbeddingMDLAdagradLightParameters.
+type LoadTPUEmbeddingMDLAdagradLightParametersAttr func(optionalAttr)
+
+// LoadTPUEmbeddingMDLAdagradLightParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// When executed in a graph, this op outputs its input tensor as-is.
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingMDLAdagradLightParametersTableId(value int64) LoadTPUEmbeddingMDLAdagradLightParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// LoadTPUEmbeddingMDLAdagradLightParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingMDLAdagradLightParametersTableName(value string) LoadTPUEmbeddingMDLAdagradLightParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load MDL Adagrad Light embedding parameters.
 //
-// When building ops to compute gradients, this op prevents the contribution of
-// its inputs to be taken into account.  Normally, the gradient generator adds ops
-// to a graph to compute the derivatives of a specified 'loss' by recursively
-// finding out inputs that contributed to its computation.  If you insert this op
-// in the graph it inputs are masked from the gradient generator.  They are not
-// taken into account for computing gradients.
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
 //
-// This is useful any time you want to compute a value with TensorFlow but need
-// to pretend that the value was a constant. Some examples include:
+// Arguments:
+//	parameters: Value of parameters used in the MDL Adagrad Light optimization algorithm.
+//	accumulators: Value of accumulators used in the MDL Adagrad Light optimization algorithm.
+//	weights: Value of weights used in the MDL Adagrad Light optimization algorithm.
+//	benefits: Value of benefits used in the MDL Adagrad Light optimization algorithm.
 //
-// *  The *EM* algorithm where the *M-step* should not involve backpropagation
-//    through the output of the *E-step*.
-// *  Contrastive divergence training of Boltzmann machines where, when
-//    differentiating the energy function, the training must not backpropagate
-//    through the graph that generated the samples from the model.
-// *  Adversarial training, where no backprop should happen through the adversarial
-//    example generation process.
-func StopGradient(scope *Scope, input tf.Output) (output tf.Output) {
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingMDLAdagradLightParameters(scope *Scope, parameters tf.Output, accumulators tf.Output, weights tf.Output, benefits tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingMDLAdagradLightParametersAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "StopGradient",
+		Type: "LoadTPUEmbeddingMDLAdagradLightParameters",
 		Input: []tf.Input{
-			input,
+			parameters, accumulators, weights, benefits,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Eagerly executes a python function to compute func(input)->output. The
+// Computes the gradient for the inverse of `x` wrt its input.
 //
-// semantics of the input, output, and attributes are the same as those for
-// PyFunc.
-func EagerPyFunc(scope *Scope, input []tf.Output, token string, Tout []tf.DataType) (output []tf.Output) {
+// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
+// is the corresponding input gradient.
+func InvGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"token": token, "Tout": Tout}
 	opspec := tf.OpSpec{
-		Type: "EagerPyFunc",
+		Type: "InvGrad",
 		Input: []tf.Input{
-			tf.OutputList(input),
+			y, dy,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("EagerPyFunc", err)
-		return
-	}
-	return output
+	return op.Output(0)
 }
 
-// Adds sparse updates to the variable referenced by `resource`.
+// Reduces sparse updates into the variable referenced by `resource` using the `min` operation.
 //
 // This operation computes
 //
 //     # Scalar indices
-//     ref[indices, ...] += updates[...]
+//     ref[indices, ...] = min(ref[indices, ...], updates[...])
 //
 //     # Vector indices (for each i)
-//     ref[indices[i], ...] += updates[i, ...]
+//     ref[indices[i], ...] = min(ref[indices[i], ...], updates[i, ...])
 //
 //     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] += updates[i, ..., j, ...]
+//     ref[indices[i, ..., j], ...] = min(ref[indices[i, ..., j], ...], updates[i, ..., j, ...])
 //
 // Duplicate entries are handled correctly: if multiple `indices` reference
-// the same location, their contributions add.
+// the same location, their contributions are combined.
 //
 // Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
 //
@@ -22777,12 +27542,12 @@ func EagerPyFunc(scope *Scope, input []tf.Output, token string, Tout []tf.DataTy
 //	updates: A tensor of updated values to add to `ref`.
 //
 // Returns the created operation.
-func ResourceScatterAdd(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+func ResourceScatterMin(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterAdd",
+		Type: "ResourceScatterMin",
 		Input: []tf.Input{
 			resource, indices, updates,
 		},
@@ -22790,55 +27555,16 @@ func ResourceScatterAdd(scope *Scope, resource tf.Output, indices tf.Output, upd
 	return scope.AddOperation(opspec)
 }
 
-// Says whether the targets are in the top `K` predictions.
-//
-// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
-// prediction for the target class is among the top `k` predictions among
-// all predictions for example `i`. Note that the behavior of `InTopK` differs
-// from the `TopK` op in its handling of ties; if multiple classes have the
-// same prediction value and straddle the top-`k` boundary, all of those
-// classes are considered to be in the top `k`.
-//
-// More formally, let
-//
-//   \\(predictions_i\\) be the predictions for all classes for example `i`,
-//   \\(targets_i\\) be the target class for example `i`,
-//   \\(out_i\\) be the output for example `i`,
-//
-// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
-//
-// Arguments:
-//	predictions: A `batch_size` x `classes` tensor.
-//	targets: A `batch_size` vector of class ids.
-//	k: Number of top elements to look at for computing precision.
-//
-// Returns Computed Precision at `k` as a `bool Tensor`.
-func InTopK(scope *Scope, predictions tf.Output, targets tf.Output, k int64) (precision tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"k": k}
-	opspec := tf.OpSpec{
-		Type: "InTopK",
-		Input: []tf.Input{
-			predictions, targets,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns (x - y)(x - y) element-wise.
+// Elementwise computes the bitwise OR of `x` and `y`.
 //
-// *NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func SquaredDifference(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// The result will have those bits set, that are set in `x`, `y` or both. The
+// computation is performed on the underlying representations of `x` and `y`.
+func BitwiseOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SquaredDifference",
+		Type: "BitwiseOr",
 		Input: []tf.Input{
 			x, y,
 		},
@@ -22847,47 +27573,65 @@ func SquaredDifference(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// RandomGammaAttr is an optional argument to RandomGamma.
-type RandomGammaAttr func(optionalAttr)
+// MatrixSolveLsAttr is an optional argument to MatrixSolveLs.
+type MatrixSolveLsAttr func(optionalAttr)
 
-// RandomGammaSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomGammaSeed(value int64) RandomGammaAttr {
+// MatrixSolveLsFast sets the optional fast attribute to value.
+// If not specified, defaults to true
+func MatrixSolveLsFast(value bool) MatrixSolveLsAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["fast"] = value
 	}
 }
 
-// RandomGammaSeed2 sets the optional seed2 attribute to value.
+// Solves one or more linear least-squares problems.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomGammaSeed2(value int64) RandomGammaAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Outputs random values from the Gamma distribution(s) described by alpha.
+// `matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form real or complex matrices of size `[M, N]`. `Rhs` is a tensor of the same
+// type as `matrix` and shape `[..., M, K]`.
+// The output is a tensor shape `[..., N, K]` where each output matrix solves
+// each of the equations
+// `matrix[..., :, :]` * `output[..., :, :]` = `rhs[..., :, :]`
+// in the least squares sense.
 //
-// This op uses the algorithm by Marsaglia et al. to acquire samples via
-// transformation-rejection from pairs of uniform and normal random variables.
-// See http://dl.acm.org/citation.cfm?id=358414
+// We use the following notation for (complex) matrix and right-hand sides
+// in the batch:
+//
+// `matrix`=\\(A \in \mathbb{C}^{m \times n}\\),
+// `rhs`=\\(B  \in \mathbb{C}^{m \times k}\\),
+// `output`=\\(X  \in \mathbb{C}^{n \times k}\\),
+// `l2_regularizer`=\\(\lambda \in \mathbb{R}\\).
+//
+// If `fast` is `True`, then the solution is computed by solving the normal
+// equations using Cholesky decomposition. Specifically, if \\(m \ge n\\) then
+// \\(X = (A^H A + \lambda I)^{-1} A^H B\\), which solves the least-squares
+// problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 + \lambda ||Z||_F^2\\).
+// If \\(m \lt n\\) then `output` is computed as
+// \\(X = A^H (A A^H + \lambda I)^{-1} B\\), which (for \\(\lambda = 0\\)) is the
+// minimum-norm solution to the under-determined linear system, i.e.
+// \\(X = \mathrm{argmin}_{Z \in \mathbb{C}^{n \times k} } ||Z||_F^2 \\),
+// subject to \\(A Z = B\\). Notice that the fast path is only numerically stable
+// when \\(A\\) is numerically full rank and has a condition number
+// \\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or \\(\lambda\\) is
+// sufficiently large.
+//
+// If `fast` is `False` an algorithm based on the numerically robust complete
+// orthogonal decomposition is used. This computes the minimum-norm
+// least-squares solution, even when \\(A\\) is rank deficient. This path is
+// typically 6-7 times slower than the fast path. If `fast` is `False` then
+// `l2_regularizer` is ignored.
 //
 // Arguments:
-//	shape: 1-D integer tensor. Shape of independent samples to draw from each
-// distribution described by the shape parameters given in alpha.
-//	alpha: A tensor in which each scalar is a "shape" parameter describing the
-// associated gamma distribution.
+//	matrix: Shape is `[..., M, N]`.
+//	rhs: Shape is `[..., M, K]`.
+//	l2_regularizer: Scalar tensor.
 //
-// Returns A tensor with shape `shape + shape(alpha)`. Each slice
-// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
-// `alpha[i0, i1, ...iN]`. The dtype of the output matches the dtype of alpha.
-func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...RandomGammaAttr) (output tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.linalg.lstsq
+// @end_compatibility
+//
+// Returns Shape is `[..., N, K]`.
+func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer tf.Output, optional ...MatrixSolveLsAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -22896,9 +27640,9 @@ func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...Ran
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomGamma",
+		Type: "MatrixSolveLs",
 		Input: []tf.Input{
-			shape, alpha,
+			matrix, rhs, l2_regularizer,
 		},
 		Attrs: attrs,
 	}
@@ -22906,78 +27650,204 @@ func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...Ran
 	return op.Output(0)
 }
 
-// Convert the quantized 'input' tensor into a lower-precision 'output', using the
+// Interleave the values from the `data` tensors into a single tensor.
 //
-// actual distribution of the values to maximize the usage of the lower bit depth
-// and adjusting the output min and max ranges accordingly.
+// Builds a merged tensor such that
 //
-// [input_min, input_max] are scalar floats that specify the range for the float
-// interpretation of the 'input' data. For example, if input_min is -1.0f and
-// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
-// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
+// ```python
+//     merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]
+// ```
 //
-// This operator tries to squeeze as much precision as possible into an output with
-// a lower bit depth by calculating the actual min and max values found in the
-// data. For example, maybe that quint16 input has no values lower than 16,384 and
-// none higher than 49,152. That means only half the range is actually needed, all
-// the float interpretations are between -0.5f and 0.5f, so if we want to compress
-// the data into a quint8 output, we can use that range rather than the theoretical
-// -1.0f to 1.0f that is suggested by the input min and max.
+// For example, if each `indices[m]` is scalar or vector, we have
 //
-// In practice, this is most useful for taking output from operations like
-// QuantizedMatMul that can produce higher bit-depth outputs than their inputs and
-// may have large potential output ranges, but in practice have a distribution of
-// input values that only uses a small fraction of the possible range. By feeding
-// that output into this operator, we can reduce it from 32 bits down to 8 with
-// minimal loss of accuracy.
+// ```python
+//     # Scalar indices:
+//     merged[indices[m], ...] = data[m][...]
 //
-// Arguments:
+//     # Vector indices:
+//     merged[indices[m][i], ...] = data[m][i, ...]
+// ```
 //
-//	input_min: The float value that the minimum quantized input value represents.
-//	input_max: The float value that the maximum quantized input value represents.
-//	out_type: The type of the output. Should be a lower bit depth than Tinput.
+// Each `data[i].shape` must start with the corresponding `indices[i].shape`,
+// and the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we
+// must have `data[i].shape = indices[i].shape + constant`.  In terms of this
+// `constant`, the output shape is
 //
-// Returns The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
-func QuantizeDownAndShrinkRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+//     merged.shape = [max(indices)] + constant
+//
+// Values are merged in order, so if an index appears in both `indices[m][i]` and
+// `indices[n][j]` for `(m,i) < (n,j)` the slice `data[n][j]` will appear in the
+// merged result. If you do not need this guarantee, ParallelDynamicStitch might
+// perform better on some devices.
+//
+// For example:
+//
+// ```python
+//     indices[0] = 6
+//     indices[1] = [4, 1]
+//     indices[2] = [[5, 2], [0, 3]]
+//     data[0] = [61, 62]
+//     data[1] = [[41, 42], [11, 12]]
+//     data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]
+//     merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],
+//               [51, 52], [61, 62]]
+// ```
+//
+// This method can be used to merge partitions created by `dynamic_partition`
+// as illustrated on the following example:
+//
+// ```python
+//     # Apply function (increments x_i) on elements for which a certain condition
+//     # apply (x_i != -1 in this example).
+//     x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])
+//     condition_mask=tf.not_equal(x,tf.constant(-1.))
+//     partitioned_data = tf.dynamic_partition(
+//         x, tf.cast(condition_mask, tf.int32) , 2)
+//     partitioned_data[1] = partitioned_data[1] + 1.0
+//     condition_indices = tf.dynamic_partition(
+//         tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
+//     x = tf.dynamic_stitch(condition_indices, partitioned_data)
+//     # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
+//     # unchanged.
+// ```
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
+// </div>
+func DynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "QuantizeDownAndShrinkRange",
+		Type: "DynamicStitch",
 		Input: []tf.Input{
-			input, input_min, input_max,
+			tf.OutputList(indices), tf.OutputList(data),
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Performs a padding as a preprocess during a convolution.
+//
+// Similar to FusedResizeAndPadConv2d, this op allows for an optimized
+// implementation where the spatial padding transformation stage is fused with the
+// im2col lookup, but in this case without the bilinear filtering required for
+// resizing. Fusing the padding prevents the need to write out the intermediate
+// results as whole tensors, reducing memory pressure, and we can get some latency
+// gains by merging the transformation calculations.
+// The data_format attribute for Conv2D isn't supported by this op, and 'NHWC'
+// order is used instead.
+// Internally this op uses a single per-graph scratch buffer, which means that it
+// will block if multiple versions are being run in parallel. This is because this
+// operator is primarily an optimization to minimize memory usage.
+//
+// Arguments:
+//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
+//	paddings: A two-column matrix specifying the padding sizes. The number of
+// rows must be the same as the rank of `input`.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.
+//
+//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
+// of `input`. Must be in the same order as the dimension specified with format.
+//	padding: The type of padding algorithm to use.
+func FusedPadConv2D(scope *Scope, input tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
+	opspec := tf.OpSpec{
+		Type: "FusedPadConv2D",
+		Input: []tf.Input{
+			input, paddings, filter,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// ApproximateEqualAttr is an optional argument to ApproximateEqual.
-type ApproximateEqualAttr func(optionalAttr)
+// Conv2DBackpropInputAttr is an optional argument to Conv2DBackpropInput.
+type Conv2DBackpropInputAttr func(optionalAttr)
 
-// ApproximateEqualTolerance sets the optional tolerance attribute to value.
-// If not specified, defaults to 1e-05
-func ApproximateEqualTolerance(value float32) ApproximateEqualAttr {
+// Conv2DBackpropInputUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
+// If not specified, defaults to true
+func Conv2DBackpropInputUseCudnnOnGpu(value bool) Conv2DBackpropInputAttr {
 	return func(m optionalAttr) {
-		m["tolerance"] = value
+		m["use_cudnn_on_gpu"] = value
 	}
 }
 
-// Returns the truth value of abs(x-y) < tolerance element-wise.
-func ApproximateEqual(scope *Scope, x tf.Output, y tf.Output, optional ...ApproximateEqualAttr) (z tf.Output) {
+// Conv2DBackpropInputExplicitPaddings sets the optional explicit_paddings attribute to value.
+//
+// value: If `padding` is `"EXPLICIT"`, the list of explicit padding amounts. For the ith
+// dimension, the amount of padding inserted before and after the dimension is
+// `explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If
+// `padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty.
+// If not specified, defaults to <>
+func Conv2DBackpropInputExplicitPaddings(value []int64) Conv2DBackpropInputAttr {
+	return func(m optionalAttr) {
+		m["explicit_paddings"] = value
+	}
+}
+
+// Conv2DBackpropInputDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Conv2DBackpropInputDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes the gradients of convolution with respect to the input.
+//
+// Arguments:
+//	input_sizes: An integer vector representing the shape of `input`,
+// where `input` is a 4-D `[batch, height, width, channels]` tensor.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution. Must be in the same order as the dimension specified with
+// format.
+//	padding: The type of padding algorithm to use.
+//
+// Returns 4-D with shape `[batch, in_height, in_width, in_channels]`.  Gradient
+// w.r.t. the input of the convolution.
+func Conv2DBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropInputAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ApproximateEqual",
+		Type: "Conv2DBackpropInput",
 		Input: []tf.Input{
-			x, y,
+			input_sizes, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -22985,120 +27855,118 @@ func ApproximateEqual(scope *Scope, x tf.Output, y tf.Output, optional ...Approx
 	return op.Output(0)
 }
 
-// Returns the truth value of x OR y element-wise.
+// Creates a dataset that executes a SQL query and emits rows of the result set.
 //
-// *NOTE*: `LogicalOr` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func LogicalOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Arguments:
+//	driver_name: The database type. Currently, the only supported type is 'sqlite'.
+//	data_source_name: A connection string to connect to the database.
+//	query: A SQL query to execute.
+//
+//
+func ExperimentalSqlDataset(scope *Scope, driver_name tf.Output, data_source_name tf.Output, query tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "LogicalOr",
+		Type: "ExperimentalSqlDataset",
 		Input: []tf.Input{
-			x, y,
+			driver_name, data_source_name, query,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Selects elements from `x` or `y`, depending on `condition`.
-//
-// The `x`, and `y` tensors must all have the same shape, and the
-// output will also have that shape.
-//
-// The `condition` tensor must be a scalar if `x` and `y` are scalars.
-// If `x` and `y` are vectors or higher rank, then `condition` must be either a
-// scalar, a vector with size matching the first dimension of `x`, or must have
-// the same shape as `x`.
-//
-// The `condition` tensor acts as a mask that chooses, based on the value at each
-// element, whether the corresponding element / row in the output should be
-// taken from `x` (if true) or `y` (if false).
-//
-// If `condition` is a vector and `x` and `y` are higher rank matrices, then
-// it chooses which row (outer dimension) to copy from `x` and `y`.
-// If `condition` has the same shape as `x` and `y`, then it chooses which
-// element to copy from `x` and `y`.
-//
-// For example:
-//
-// ```python
-// # 'condition' tensor is [[True,  False]
-// #                        [False, True]]
-// # 't' is [[1, 2],
-// #         [3, 4]]
-// # 'e' is [[5, 6],
-// #         [7, 8]]
-// select(condition, t, e)  # => [[1, 6], [7, 4]]
-//
+// LoadTPUEmbeddingCenteredRMSPropParametersAttr is an optional argument to LoadTPUEmbeddingCenteredRMSPropParameters.
+type LoadTPUEmbeddingCenteredRMSPropParametersAttr func(optionalAttr)
+
+// LoadTPUEmbeddingCenteredRMSPropParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// # 'condition' tensor is [True, False]
-// # 't' is [[1, 2],
-// #         [3, 4]]
-// # 'e' is [[5, 6],
-// #         [7, 8]]
-// select(condition, t, e) ==> [[1, 2],
-//                              [7, 8]]
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingCenteredRMSPropParametersTableId(value int64) LoadTPUEmbeddingCenteredRMSPropParametersAttr {
+	return func(m optionalAttr) {
+		m["table_id"] = value
+	}
+}
+
+// LoadTPUEmbeddingCenteredRMSPropParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingCenteredRMSPropParametersTableName(value string) LoadTPUEmbeddingCenteredRMSPropParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load centered RMSProp embedding parameters.
 //
-// ```
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
 //
 // Arguments:
+//	parameters: Value of parameters used in the centered RMSProp optimization algorithm.
+//	ms: Value of ms used in the centered RMSProp optimization algorithm.
+//	mom: Value of mom used in the centered RMSProp optimization algorithm.
+//	mg: Value of mg used in the centered RMSProp optimization algorithm.
 //
-//	x: = A `Tensor` which may have the same shape as `condition`.
-// If `condition` is rank 1, `x` may have higher rank,
-// but its first dimension must match the size of `condition`.
-//	y: = A `Tensor` with the same type and shape as `x`.
 //
-// Returns = A `Tensor` with the same type and shape as `x` and `y`.
-func Select(scope *Scope, condition tf.Output, x tf.Output, y tf.Output) (output tf.Output) {
+//
+// Returns the created operation.
+func LoadTPUEmbeddingCenteredRMSPropParameters(scope *Scope, parameters tf.Output, ms tf.Output, mom tf.Output, mg tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingCenteredRMSPropParametersAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Select",
+		Type: "LoadTPUEmbeddingCenteredRMSPropParameters",
 		Input: []tf.Input{
-			condition, x, y,
+			parameters, ms, mom, mg,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// MatMulAttr is an optional argument to MatMul.
-type MatMulAttr func(optionalAttr)
+// DataFormatVecPermuteAttr is an optional argument to DataFormatVecPermute.
+type DataFormatVecPermuteAttr func(optionalAttr)
 
-// MatMulTransposeA sets the optional transpose_a attribute to value.
+// DataFormatVecPermuteSrcFormat sets the optional src_format attribute to value.
 //
-// value: If true, "a" is transposed before multiplication.
-// If not specified, defaults to false
-func MatMulTransposeA(value bool) MatMulAttr {
+// value: source data format.
+// If not specified, defaults to "NHWC"
+func DataFormatVecPermuteSrcFormat(value string) DataFormatVecPermuteAttr {
 	return func(m optionalAttr) {
-		m["transpose_a"] = value
+		m["src_format"] = value
 	}
 }
 
-// MatMulTransposeB sets the optional transpose_b attribute to value.
+// DataFormatVecPermuteDstFormat sets the optional dst_format attribute to value.
 //
-// value: If true, "b" is transposed before multiplication.
-// If not specified, defaults to false
-func MatMulTransposeB(value bool) MatMulAttr {
+// value: destination data format.
+// If not specified, defaults to "NCHW"
+func DataFormatVecPermuteDstFormat(value string) DataFormatVecPermuteAttr {
 	return func(m optionalAttr) {
-		m["transpose_b"] = value
+		m["dst_format"] = value
 	}
 }
 
-// Multiply the matrix "a" by the matrix "b".
+// Returns the permuted vector/tensor in the destination data format given the
 //
-// The inputs must be two-dimensional matrices and the inner dimension of
-// "a" (after being transposed if transpose_a is true) must match the
-// outer dimension of "b" (after being transposed if transposed_b is
-// true).
+// one in the source data format.
 //
-// *Note*: The default kernel implementation for MatMul on GPUs uses
-// cublas.
-func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (product tf.Output) {
+// Arguments:
+//	x: Vector of size 4 or Tensor of shape (4, 2) in source data format.
+//
+// Returns Vector of size 4 or Tensor of shape (4, 2) in destination data format.
+func DataFormatVecPermute(scope *Scope, x tf.Output, optional ...DataFormatVecPermuteAttr) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -23107,9 +27975,9 @@ func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (pro
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatMul",
+		Type: "DataFormatVecPermute",
 		Input: []tf.Input{
-			a, b,
+			x,
 		},
 		Attrs: attrs,
 	}
@@ -23117,53 +27985,60 @@ func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (pro
 	return op.Output(0)
 }
 
-// SparseMatMulAttr is an optional argument to SparseMatMul.
-type SparseMatMulAttr func(optionalAttr)
-
-// SparseMatMulTransposeA sets the optional transpose_a attribute to value.
-// If not specified, defaults to false
-func SparseMatMulTransposeA(value bool) SparseMatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_a"] = value
+// Returns x / y element-wise.
+//
+// *NOTE*: `Div` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Div(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// SparseMatMulTransposeB sets the optional transpose_b attribute to value.
-// If not specified, defaults to false
-func SparseMatMulTransposeB(value bool) SparseMatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_b"] = value
+	opspec := tf.OpSpec{
+		Type: "Div",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// SparseMatMulAIsSparse sets the optional a_is_sparse attribute to value.
-// If not specified, defaults to false
-func SparseMatMulAIsSparse(value bool) SparseMatMulAttr {
-	return func(m optionalAttr) {
-		m["a_is_sparse"] = value
-	}
-}
+// ResizeAreaAttr is an optional argument to ResizeArea.
+type ResizeAreaAttr func(optionalAttr)
 
-// SparseMatMulBIsSparse sets the optional b_is_sparse attribute to value.
+// ResizeAreaAlignCorners sets the optional align_corners attribute to value.
+//
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
 // If not specified, defaults to false
-func SparseMatMulBIsSparse(value bool) SparseMatMulAttr {
+func ResizeAreaAlignCorners(value bool) ResizeAreaAttr {
 	return func(m optionalAttr) {
-		m["b_is_sparse"] = value
+		m["align_corners"] = value
 	}
 }
 
-// Multiply matrix "a" by matrix "b".
+// Resize `images` to `size` using area interpolation.
 //
-// The inputs must be two-dimensional matrices and the inner dimension of "a" must
-// match the outer dimension of "b". Both "a" and "b" must be `Tensor`s not
-// `SparseTensor`s.  This op is optimized for the case where at least one of "a" or
-// "b" is sparse, in the sense that they have a large proportion of zero values.
-// The breakeven for using this versus a dense matrix multiply on one platform was
-// 30% zero values in the sparse matrix.
+// Input images can be of different types but output images are always float.
 //
-// The gradient computation of this operation will only take advantage of sparsity
-// in the input gradient when that gradient comes from a Relu.
-func SparseMatMul(scope *Scope, a tf.Output, b tf.Output, optional ...SparseMatMulAttr) (product tf.Output) {
+// The range of pixel values for the output image might be slightly different
+// from the range for the input image because of limited numerical precision.
+// To guarantee an output range, for example `[0.0, 1.0]`, apply
+// `tf.clip_by_value` to the output.
+//
+// Each output pixel is computed by first transforming the pixel's footprint into
+// the input tensor and then averaging the pixels that intersect the footprint. An
+// input pixel's contribution to the average is weighted by the fraction of its
+// area that intersects the footprint.  This is the same as OpenCV's INTER_AREA.
+//
+// Arguments:
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
+//
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeAreaAttr) (resized_images tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -23172,9 +28047,9 @@ func SparseMatMul(scope *Scope, a tf.Output, b tf.Output, optional ...SparseMatM
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseMatMul",
+		Type: "ResizeArea",
 		Input: []tf.Input{
-			a, b,
+			images, size,
 		},
 		Attrs: attrs,
 	}
@@ -23182,63 +28057,150 @@ func SparseMatMul(scope *Scope, a tf.Output, b tf.Output, optional ...SparseMatM
 	return op.Output(0)
 }
 
-// ExperimentalThreadPoolHandleAttr is an optional argument to ExperimentalThreadPoolHandle.
-type ExperimentalThreadPoolHandleAttr func(optionalAttr)
+// Sends `input` to all devices that are connected to the output.
+//
+// Sends `input` to all devices that are connected to the output.
+//
+// The graph should be constructed so that all ops connected to the output have a
+// valid device assignment, and the op itself is assigned one of these devices.
+//
+// input: The input to the broadcast.
+// output: The same as input.
+// shape: The shape of the input tensor.
+//
+func NcclBroadcast(scope *Scope, input tf.Output, shape tf.Shape) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"shape": shape}
+	opspec := tf.OpSpec{
+		Type: "NcclBroadcast",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// ExperimentalThreadPoolHandleMaxIntraOpParallelism sets the optional max_intra_op_parallelism attribute to value.
+// Computes the gradient of morphological 2-D dilation with respect to the filter.
 //
-// value: The maximum degree of parallelism to use within operations that execute on this
-// threadpool.
-// If not specified, defaults to 1
-func ExperimentalThreadPoolHandleMaxIntraOpParallelism(value int64) ExperimentalThreadPoolHandleAttr {
-	return func(m optionalAttr) {
-		m["max_intra_op_parallelism"] = value
+// Arguments:
+//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
+//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
+//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
+// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
+//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
+// Must be: `[1, rate_height, rate_width, 1]`.
+//	padding: The type of padding algorithm to use.
+//
+// Returns 3-D with shape `[filter_height, filter_width, depth]`.
+func Dilation2DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (filter_backprop tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
+	opspec := tf.OpSpec{
+		Type: "Dilation2DBackpropFilter",
+		Input: []tf.Input{
+			input, filter, out_backprop,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// ExperimentalThreadPoolHandleContainer sets the optional container attribute to value.
+// AddSparseToTensorsMapAttr is an optional argument to AddSparseToTensorsMap.
+type AddSparseToTensorsMapAttr func(optionalAttr)
+
+// AddSparseToTensorsMapContainer sets the optional container attribute to value.
+//
+// value: The container name for the `SparseTensorsMap` created by this op.
 // If not specified, defaults to ""
-func ExperimentalThreadPoolHandleContainer(value string) ExperimentalThreadPoolHandleAttr {
+func AddSparseToTensorsMapContainer(value string) AddSparseToTensorsMapAttr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// ExperimentalThreadPoolHandleSharedName sets the optional shared_name attribute to value.
+// AddSparseToTensorsMapSharedName sets the optional shared_name attribute to value.
+//
+// value: The shared name for the `SparseTensorsMap` created by this op.
+// If blank, the new Operation's unique name is used.
 // If not specified, defaults to ""
-func ExperimentalThreadPoolHandleSharedName(value string) ExperimentalThreadPoolHandleAttr {
+func AddSparseToTensorsMapSharedName(value string) AddSparseToTensorsMapAttr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// Creates a dataset that uses a custom thread pool to compute `input_dataset`.
+// Add a `SparseTensor` to a `SparseTensorsMap` return its handle.
+//
+// A `SparseTensor` is represented by three tensors: `sparse_indices`,
+// `sparse_values`, and `sparse_shape`.
+//
+// This operator takes the given `SparseTensor` and adds it to a container
+// object (a `SparseTensorsMap`).  A unique key within this container is generated
+// in the form of an `int64`, and this is the value that is returned.
+//
+// The `SparseTensor` can then be read out as part of a minibatch by passing
+// the key as a vector element to `TakeManySparseFromTensorsMap`.  To ensure
+// the correct `SparseTensorsMap` is accessed, ensure that the same
+// `container` and `shared_name` are passed to that Op.  If no `shared_name`
+// is provided here, instead use the *name* of the Operation created by calling
+// `AddSparseToTensorsMap` as the `shared_name` passed to
+// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
 //
 // Arguments:
-//	num_threads: The number of threads in the thread pool.
-//	display_name: A human-readable name for the threads that may be visible in some
-// visualizations.
-// threadpool.
+//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
+//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
 //
-// Returns A resource that can be consumed by one or more ExperimentalThreadPoolDataset
-// ops.
-func ExperimentalThreadPoolHandle(scope *Scope, num_threads int64, display_name string, optional ...ExperimentalThreadPoolHandleAttr) (handle tf.Output) {
+// Returns 0-D.  The handle of the `SparseTensor` now stored in the
+// `SparseTensorsMap`.
+func AddSparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddSparseToTensorsMapAttr) (sparse_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_threads": num_threads, "display_name": display_name}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalThreadPoolHandle",
-
+		Type: "AddSparseToTensorsMap",
+		Input: []tf.Input{
+			sparse_indices, sparse_values, sparse_shape,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
+// Returns a list list which has the passed-in `Tensor` as last element and the other elements of the given list in `input_handle`.
+//
+// tensor: The tensor to put on the list.
+// input_handle: The old list.
+// output_handle: A list with the elements of the old list followed by tensor.
+// element_dtype: the type of elements in the list.
+// element_shape: a shape compatible with that of elements in the list.
+func TensorListPushBack(scope *Scope, input_handle tf.Output, tensor tf.Output) (output_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorListPushBack",
+		Input: []tf.Input{
+			input_handle, tensor,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // CudnnRNNCanonicalToParamsAttr is an optional argument to CudnnRNNCanonicalToParams.
 type CudnnRNNCanonicalToParamsAttr func(optionalAttr)
 
@@ -23345,111 +28307,342 @@ func FilterByLastComponentDataset(scope *Scope, input_dataset tf.Output, output_
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "FilterByLastComponentDataset",
+		Input: []tf.Input{
+			input_dataset,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the absolute value of a tensor.
+//
+// Given a tensor `x`, this operation returns a tensor containing the absolute
+// value of each element in `x`. For example, if x is an input element and y is
+// an output element, this operation computes \\(y = |x|\\).
+func Abs(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Abs",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MaxPoolGradV2Attr is an optional argument to MaxPoolGradV2.
+type MaxPoolGradV2Attr func(optionalAttr)
+
+// MaxPoolGradV2DataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolGradV2DataFormat(value string) MaxPoolGradV2Attr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes gradients of the maxpooling function.
+//
+// Arguments:
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns Gradients w.r.t. the input to `max_pool`.
+func MaxPoolGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolGradV2Attr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MaxPoolGradV2",
+		Input: []tf.Input{
+			orig_input, orig_output, grad, ksize, strides,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Restore a reader to a previously saved state.
+//
+// Not all Readers support being restored, so this can produce an
+// Unimplemented error.
+//
+// Arguments:
+//	reader_handle: Handle to a Reader.
+//	state: Result of a ReaderSerializeState of a Reader with type
+// matching reader_handle.
+//
+// Returns the created operation.
+func ReaderRestoreStateV2(scope *Scope, reader_handle tf.Output, state tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReaderRestoreStateV2",
+		Input: []tf.Input{
+			reader_handle, state,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Inverse fast Fourier transform.
+//
+// Computes the inverse 1-dimensional discrete Fourier transform over the
+// inner-most dimension of `input`.
+//
+// Arguments:
+//	input: A complex tensor.
+//
+// Returns A complex tensor of the same shape as `input`. The inner-most
+//   dimension of `input` is replaced with its inverse 1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.ifft
+// @end_compatibility
+func IFFT(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IFFT",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// 2D fast Fourier transform.
+//
+// Computes the 2-dimensional discrete Fourier transform over the inner-most
+// 2 dimensions of `input`.
+//
+// Arguments:
+//	input: A complex tensor.
+//
+// Returns A complex tensor of the same shape as `input`. The inner-most 2
+//   dimensions of `input` are replaced with their 2D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.fft2
+// @end_compatibility
+func FFT2D(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "FFT2D",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Inverse 2D fast Fourier transform.
+//
+// Computes the inverse 2-dimensional discrete Fourier transform over the
+// inner-most 2 dimensions of `input`.
+//
+// Arguments:
+//	input: A complex tensor.
+//
+// Returns A complex tensor of the same shape as `input`. The inner-most 2
+//   dimensions of `input` are replaced with their inverse 2D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.ifft2
+// @end_compatibility
+func IFFT2D(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IFFT2D",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Inverse 3D real-valued fast Fourier transform.
+//
+// Computes the inverse 3-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most 3 dimensions of `input`.
+//
+// The inner-most 3 dimensions of `input` are assumed to be the result of `RFFT3D`:
+// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
+// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
+// from the size of the inner-most 3 dimensions of `input`. If the FFT length used
+// to compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
+//
+// Along each axis `IRFFT3D` is computed on, if `fft_length` (or
+// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
+//
+// Arguments:
+//	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
+//
+// Returns A float32 tensor of the same rank as `input`. The inner-most 3
+//   dimensions of `input` are replaced with the `fft_length` samples of their
+//   inverse 3D real Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.irfftn with 3 dimensions.
+// @end_compatibility
+func IRFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IRFFT3D",
+		Input: []tf.Input{
+			input, fft_length,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the truth value of (x != y) element-wise.
+//
+// *NOTE*: `NotEqual` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func NotEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
 	opspec := tf.OpSpec{
-		Type: "FilterByLastComponentDataset",
+		Type: "NotEqual",
 		Input: []tf.Input{
-			input_dataset,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SumAttr is an optional argument to Sum.
-type SumAttr func(optionalAttr)
+// LoadTPUEmbeddingMomentumParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingMomentumParametersGradAccumDebug.
+type LoadTPUEmbeddingMomentumParametersGradAccumDebugAttr func(optionalAttr)
 
-// SumKeepDims sets the optional keep_dims attribute to value.
+// LoadTPUEmbeddingMomentumParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SumKeepDims(value bool) SumAttr {
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingMomentumParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingMomentumParametersGradAccumDebugAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["table_id"] = value
 	}
 }
 
-// Computes the sum of elements across dimensions of a tensor.
+// LoadTPUEmbeddingMomentumParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingMomentumParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingMomentumParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load Momentum embedding parameters with debug support.
 //
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
+//	parameters: Value of parameters used in the Momentum optimization algorithm.
+//	momenta: Value of momenta used in the Momentum optimization algorithm.
+//	gradient_accumulators: Value of gradient_accumulators used in the Momentum optimization algorithm.
 //
-// Returns The reduced tensor.
-func Sum(scope *Scope, input tf.Output, axis tf.Output, optional ...SumAttr) (output tf.Output) {
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingMomentumParametersGradAccumDebug(scope *Scope, parameters tf.Output, momenta tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingMomentumParametersGradAccumDebugAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Sum",
+		Type: "LoadTPUEmbeddingMomentumParametersGradAccumDebug",
 		Input: []tf.Input{
-			input, axis,
+			parameters, momenta, gradient_accumulators,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// EnterAttr is an optional argument to Enter.
-type EnterAttr func(optionalAttr)
-
-// EnterIsConstant sets the optional is_constant attribute to value.
-//
-// value: If true, the output is constant within the child frame.
-// If not specified, defaults to false
-func EnterIsConstant(value bool) EnterAttr {
-	return func(m optionalAttr) {
-		m["is_constant"] = value
-	}
-}
+// StatefulStandardNormalAttr is an optional argument to StatefulStandardNormal.
+type StatefulStandardNormalAttr func(optionalAttr)
 
-// EnterParallelIterations sets the optional parallel_iterations attribute to value.
+// StatefulStandardNormalDtype sets the optional dtype attribute to value.
 //
-// value: The number of iterations allowed to run in parallel.
-// If not specified, defaults to 10
-func EnterParallelIterations(value int64) EnterAttr {
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatefulStandardNormalDtype(value tf.DataType) StatefulStandardNormalAttr {
 	return func(m optionalAttr) {
-		m["parallel_iterations"] = value
+		m["dtype"] = value
 	}
 }
 
-// Creates or finds a child frame, and makes `data` available to the child frame.
+// Outputs random values from a normal distribution.
 //
-// This op is used together with `Exit` to create loops in the graph.
-// The unique `frame_name` is used by the `Executor` to identify frames. If
-// `is_constant` is true, `output` is a constant in the child frame; otherwise
-// it may be changed in the child frame. At most `parallel_iterations` iterations
-// are run in parallel in the child frame.
+// The generated values will have mean 0 and standard deviation 1.
 //
 // Arguments:
-//	data: The tensor to be made available to the child frame.
-//	frame_name: The name of the child frame.
+//	resource: The handle of the resource variable that stores the state of the RNG.
+//	shape: The shape of the output tensor.
 //
-// Returns The same tensor as `data`.
-func Enter(scope *Scope, data tf.Output, frame_name string, optional ...EnterAttr) (output tf.Output) {
+// Returns A tensor of the specified shape filled with random normal values.
+func StatefulStandardNormal(scope *Scope, resource tf.Output, shape tf.Output, optional ...StatefulStandardNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"frame_name": frame_name}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Enter",
+		Type: "StatefulStandardNormal",
 		Input: []tf.Input{
-			data,
+			resource, shape,
 		},
 		Attrs: attrs,
 	}
@@ -23457,282 +28650,324 @@ func Enter(scope *Scope, data tf.Output, frame_name string, optional ...EnterAtt
 	return op.Output(0)
 }
 
-// Add all input tensors element wise.
-//
-// Arguments:
-//	inputs: Must all be the same size and shape.
-func AddN(scope *Scope, inputs []tf.Output) (sum tf.Output) {
+// Computes the Gauss error function of `x` element-wise.
+func Erf(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AddN",
+		Type: "Erf",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TryRpcAttr is an optional argument to TryRpc.
-type TryRpcAttr func(optionalAttr)
-
-// TryRpcProtocol sets the optional protocol attribute to value.
-//
-// value: RPC protocol to use.  Empty string means use the default protocol.
-// Options include 'grpc'.
-// If not specified, defaults to ""
-func TryRpcProtocol(value string) TryRpcAttr {
-	return func(m optionalAttr) {
-		m["protocol"] = value
+// Returns element-wise largest integer not greater than x.
+func Floor(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Floor",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// TryRpcFailFast sets the optional fail_fast attribute to value.
+// Returns the number of records this Reader has produced.
 //
-// value: `boolean`. If `true` (default), then failures to connect
-// (i.e., the server does not immediately respond) cause an RPC failure.
-// If not specified, defaults to true
-func TryRpcFailFast(value bool) TryRpcAttr {
-	return func(m optionalAttr) {
-		m["fail_fast"] = value
+// This is the same as the number of ReaderRead executions that have
+// succeeded.
+//
+// Arguments:
+//	reader_handle: Handle to a Reader.
+func ReaderNumRecordsProducedV2(scope *Scope, reader_handle tf.Output) (records_produced tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReaderNumRecordsProducedV2",
+		Input: []tf.Input{
+			reader_handle,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// TryRpcTimeoutInMs sets the optional timeout_in_ms attribute to value.
-//
-// value: `int`. If `0` (default), then the kernel will run the RPC
-// request and only time out if the RPC deadline passes or the session times out.
-// If this value is greater than `0`, then the op will raise an exception if
-// the RPC takes longer than `timeout_in_ms`.
-// If not specified, defaults to 0
-func TryRpcTimeoutInMs(value int64) TryRpcAttr {
+// TensorListConcatAttr is an optional argument to TensorListConcat.
+type TensorListConcatAttr func(optionalAttr)
+
+// TensorListConcatElementShape sets the optional element_shape attribute to value.
+// If not specified, defaults to <unknown_rank:true >
+func TensorListConcatElementShape(value tf.Shape) TensorListConcatAttr {
 	return func(m optionalAttr) {
-		m["timeout_in_ms"] = value
+		m["element_shape"] = value
 	}
 }
 
-// Perform batches of RPC requests.
-//
-// This op asynchronously performs either a single RPC request, or a batch
-// of requests.  RPC requests are defined by three main parameters:
-//
-//   - `address` (the host+port or BNS address of the request)
-//   - `method` (the method name for the request)
-//   - `request` (the serialized proto string, or vector of strings,
-//      of the RPC request argument).
-//
-// For example, if you have an RPC service running on port localhost:2345,
-// and its interface is configured with the following proto declaration:
-//
-// ```
-// service MyService {
-//   rpc MyMethod(MyRequestProto) returns (MyResponseProto) {
-//   }
-// };
-// ```
-//
-// then call this op with arguments:
-//
-// ```
-// address = "localhost:2345"
-// method = "MyService/MyMethod"
-// ```
-//
-// The `request` tensor is a string tensor representing serialized `MyRequestProto`
-// strings; and the output string tensor `response` will have the same shape
-// and contain (upon successful completion) corresponding serialized
-// `MyResponseProto` strings.
-//
-// For example, to send a single, empty, `MyRequestProto`, call
-// this op with `request = ""`.  To send 5 **parallel** empty requests,
-// call this op with `request = ["", "", "", "", ""]`.
-//
-// More generally, one can create a batch of `MyRequestProto` serialized protos
-// from regular batched tensors using the `encode_proto` op, and convert
-// the response `MyResponseProto` serialized protos to batched tensors
-// using the `decode_proto` op.
-//
-// **NOTE** Working with serialized proto strings is faster than instantiating
-// actual proto objects in memory, so no performance degradation is expected
-// compared to writing custom kernels for this workflow.
+// Concats all tensors in the list along the 0th dimension.
 //
-// Unlike the standard `Rpc` op, if the connection fails or the remote worker
-// returns an error status, this op does **not** reraise the exception.
-// Instead, the `status_code` and `status_message` entry for the corresponding RPC
-// call is set with the error returned from the RPC call.  The `response` tensor
-// will contain valid response values for those minibatch entries whose RPCs did
-// not fail; the rest of the entries will have empty strings.
+// Requires that all tensors have the same shape except the first dimension.
 //
-// Arguments:
-//	address: `0-D` or `1-D`.  The address (i.e. host_name:port) of the RPC server.
-// If this tensor has more than 1 element, then multiple parallel rpc requests
-// are sent.  This argument broadcasts with `method` and `request`.
-//	method: `0-D` or `1-D`.  The method address on the RPC server.
-// If this tensor has more than 1 element, then multiple parallel rpc requests
-// are sent.  This argument broadcasts with `address` and `request`.
-//	request: `0-D` or `1-D`.  Serialized proto strings: the rpc request argument.
-// If this tensor has more than 1 element, then multiple parallel rpc requests
-// are sent.  This argument broadcasts with `address` and `method`.
+// input_handle: The input list.
+// tensor: The concated result.
+// lengths: Output tensor containing sizes of the 0th dimension of tensors in the list, used for computing the gradient.
 //
-// Returns Same shape as `request`. Serialized proto strings: the rpc responses.Same shape as `request`.  Values correspond to tensorflow Status enum codes.Same shape as `request`.  Values correspond to Status messages
-// returned from the RPC calls.
-func TryRpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, optional ...TryRpcAttr) (response tf.Output, status_code tf.Output, status_message tf.Output) {
+func TensorListConcat(scope *Scope, input_handle tf.Output, element_dtype tf.DataType, optional ...TensorListConcatAttr) (tensor tf.Output, lengths tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TryRpc",
+		Type: "TensorListConcat",
 		Input: []tf.Input{
-			address, method, request,
+			input_handle,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0), op.Output(1)
 }
 
-// InitializeTableFromTextFileV2Attr is an optional argument to InitializeTableFromTextFileV2.
-type InitializeTableFromTextFileV2Attr func(optionalAttr)
+// Conv3DAttr is an optional argument to Conv3D.
+type Conv3DAttr func(optionalAttr)
 
-// InitializeTableFromTextFileV2VocabSize sets the optional vocab_size attribute to value.
-//
-// value: Number of elements of the file, use -1 if unknown.
-// If not specified, defaults to -1
+// Conv3DDataFormat sets the optional data_format attribute to value.
 //
-// REQUIRES: value >= -1
-func InitializeTableFromTextFileV2VocabSize(value int64) InitializeTableFromTextFileV2Attr {
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func Conv3DDataFormat(value string) Conv3DAttr {
 	return func(m optionalAttr) {
-		m["vocab_size"] = value
+		m["data_format"] = value
 	}
 }
 
-// InitializeTableFromTextFileV2Delimiter sets the optional delimiter attribute to value.
+// Conv3DDilations sets the optional dilations attribute to value.
 //
-// value: Delimiter to separate fields in a line.
-// If not specified, defaults to "\t"
-func InitializeTableFromTextFileV2Delimiter(value string) InitializeTableFromTextFileV2Attr {
+// value: 1-D tensor of length 5.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DDilations(value []int64) Conv3DAttr {
 	return func(m optionalAttr) {
-		m["delimiter"] = value
+		m["dilations"] = value
 	}
 }
 
-// Initializes a table from a text file.
+// Computes a 3-D convolution given 5-D `input` and `filter` tensors.
 //
-// It inserts one key-value pair into the table for each line of the file.
-// The key and value is extracted from the whole line content, elements from the
-// split line based on `delimiter` or the line number (starting from zero).
-// Where to extract the key and value from a line is specified by `key_index` and
-// `value_index`.
+// In signal processing, cross-correlation is a measure of similarity of
+// two waveforms as a function of a time-lag applied to one of them. This
+// is also known as a sliding dot product or sliding inner-product.
 //
-// - A value of -1 means use the line number(starting from zero), expects `int64`.
-// - A value of -2 means use the whole line content, expects `string`.
-// - A value >= 0 means use the index (starting at zero) of the split line based
-//   on `delimiter`.
+// Our Conv3D implements a form of cross-correlation.
 //
 // Arguments:
-//	table_handle: Handle to a table which will be initialized.
-//	filename: Filename of a vocabulary text file.
-//	key_index: Column index in a line to get the table `key` values from.
-//	value_index: Column index that represents information of a line to get the table
-// `value` values from.
-//
-// Returns the created operation.
-func InitializeTableFromTextFileV2(scope *Scope, table_handle tf.Output, filename tf.Output, key_index int64, value_index int64, optional ...InitializeTableFromTextFileV2Attr) (o *tf.Operation) {
+//	input: Shape `[batch, in_depth, in_height, in_width, in_channels]`.
+//	filter: Shape `[filter_depth, filter_height, filter_width, in_channels,
+// out_channels]`. `in_channels` must match between `input` and `filter`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv3DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"key_index": key_index, "value_index": value_index}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "InitializeTableFromTextFileV2",
+		Type: "Conv3D",
 		Input: []tf.Input{
-			table_handle, filename,
+			input, filter,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MeanAttr is an optional argument to Mean.
-type MeanAttr func(optionalAttr)
+// QuantizeV2Attr is an optional argument to QuantizeV2.
+type QuantizeV2Attr func(optionalAttr)
 
-// MeanKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func MeanKeepDims(value bool) MeanAttr {
+// QuantizeV2Mode sets the optional mode attribute to value.
+// If not specified, defaults to "MIN_COMBINED"
+func QuantizeV2Mode(value string) QuantizeV2Attr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["mode"] = value
 	}
 }
 
-// Computes the mean of elements across dimensions of a tensor.
+// QuantizeV2RoundMode sets the optional round_mode attribute to value.
+// If not specified, defaults to "HALF_AWAY_FROM_ZERO"
+func QuantizeV2RoundMode(value string) QuantizeV2Attr {
+	return func(m optionalAttr) {
+		m["round_mode"] = value
+	}
+}
+
+// Quantize the 'input' tensor of type float to 'output' tensor of type 'T'.
 //
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// [min_range, max_range] are scalar floats that specify the range for
+// the 'input' data. The 'mode' attribute controls exactly which calculations are
+// used to convert the float values to their quantized equivalents.  The
+// 'round_mode' attribute controls which rounding tie-breaking algorithm is used
+// when rounding float values to their quantized equivalents.
+//
+// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
+//
+// ```
+// out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
+// if T == qint8: out[i] -= (range(T) + 1) / 2.0
+// ```
+//
+// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
+//
+// *MIN_COMBINED Mode Example*
+//
+// Assume the input is type float and has a possible range of [0.0, 6.0] and the
+// output type is quint8 ([0, 255]). The min_range and max_range values should be
+// specified as 0.0 and 6.0. Quantizing from float to quint8 will multiply each
+// value of the input by 255/6 and cast to quint8.
+//
+// If the output type was qint8 ([-128, 127]), the operation will additionally
+// subtract each value by 128 prior to casting, so that the range of values aligns
+// with the range of qint8.
+//
+// If the mode is 'MIN_FIRST', then this approach is used:
+//
+// ```
+// num_discrete_values = 1 << (# of bits in T)
+// range_adjust = num_discrete_values / (num_discrete_values - 1)
+// range = (range_max - range_min) * range_adjust
+// range_scale = num_discrete_values / range
+// quantized = round(input * range_scale) - round(range_min * range_scale) +
+//   numeric_limits<T>::min()
+// quantized = max(quantized, numeric_limits<T>::min())
+// quantized = min(quantized, numeric_limits<T>::max())
+// ```
+//
+// The biggest difference between this and MIN_COMBINED is that the minimum range
+// is rounded first, before it's subtracted from the rounded value. With
+// MIN_COMBINED, a small bias is introduced where repeated iterations of quantizing
+// and dequantizing will introduce a larger and larger error.
+//
+// *SCALED mode Example*
+//
+// `SCALED` mode matches the quantization approach used in
+// `QuantizeAndDequantize{V2|V3}`.
+//
+// If the mode is `SCALED`, we do not use the full range of the output type,
+// choosing to elide the lowest possible value for symmetry (e.g., output range is
+// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
+// 0.
+//
+// We first find the range of values in our tensor. The
+// range we use is always centered on 0, so we find m such that
+//
+// ```c++
+//   m = max(abs(input_min), abs(input_max))
+// ```
+//
+// Our input tensor range is then `[-m, m]`.
+//
+// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
+// If T is signed, this is
+//
+// ```
+//   num_bits = sizeof(T) * 8
+//   [min_fixed, max_fixed] =
+//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
+// ```
+//
+// Otherwise, if T is unsigned, the fixed-point range is
+//
+// ```
+//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
+// ```
+//
+// From this we compute our scaling factor, s:
+//
+// ```c++
+//   s = (max_fixed - min_fixed) / (2 * m)
+// ```
+//
+// Now we can quantize the elements of our tensor:
+//
+// ```c++
+// result = round(input * s)
+// ```
+//
+// One thing to watch out for is that the operator may choose to adjust the
+// requested minimum and maximum values slightly during the quantization process,
+// so you should always use the output ports as the range for further calculations.
+// For example, if the requested minimum and maximum values are close to equal,
+// they will be separated by a small epsilon value to prevent ill-formed quantized
+// buffers from being created. Otherwise, you can end up with buffers where all the
+// quantized values map to the same float value, which causes problems for
+// operations that have to perform further calculations on them.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
 //
-// Returns The reduced tensor.
-func Mean(scope *Scope, input tf.Output, axis tf.Output, optional ...MeanAttr) (output tf.Output) {
+//	min_range: The minimum scalar value possibly produced for the input.
+//	max_range: The maximum scalar value possibly produced for the input.
+//
+//
+// Returns The quantized data produced from the float input.The actual minimum scalar value used for the output.The actual maximum scalar value used for the output.
+func QuantizeV2(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, T tf.DataType, optional ...QuantizeV2Attr) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"T": T}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Mean",
+		Type: "QuantizeV2",
 		Input: []tf.Input{
-			input, axis,
+			input, min_range, max_range,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// ProdAttr is an optional argument to Prod.
-type ProdAttr func(optionalAttr)
+// ComplexAbsAttr is an optional argument to ComplexAbs.
+type ComplexAbsAttr func(optionalAttr)
 
-// ProdKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func ProdKeepDims(value bool) ProdAttr {
+// ComplexAbsTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func ComplexAbsTout(value tf.DataType) ComplexAbsAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["Tout"] = value
 	}
 }
 
-// Computes the product of elements across dimensions of a tensor.
-//
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
-//
-// Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
+// Computes the complex absolute value of a tensor.
 //
-// Returns The reduced tensor.
-func Prod(scope *Scope, input tf.Output, axis tf.Output, optional ...ProdAttr) (output tf.Output) {
+// Given a tensor `x` of complex numbers, this operation returns a tensor of type
+// `float` or `double` that is the absolute value of each element in `x`. All
+// elements in `x` must be complex numbers of the form \\(a + bj\\). The absolute
+// value is computed as \\( \sqrt{a^2 + b^2}\\).
+func ComplexAbs(scope *Scope, x tf.Output, optional ...ComplexAbsAttr) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -23741,9 +28976,9 @@ func Prod(scope *Scope, input tf.Output, axis tf.Output, optional ...ProdAttr) (
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Prod",
+		Type: "ComplexAbs",
 		Input: []tf.Input{
-			input, axis,
+			x,
 		},
 		Attrs: attrs,
 	}
@@ -23751,88 +28986,61 @@ func Prod(scope *Scope, input tf.Output, axis tf.Output, optional ...ProdAttr) (
 	return op.Output(0)
 }
 
-// ResizeBilinearAttr is an optional argument to ResizeBilinear.
-type ResizeBilinearAttr func(optionalAttr)
-
-// ResizeBilinearAlignCorners sets the optional align_corners attribute to value.
+// Returns the truth value of x AND y element-wise.
 //
-// value: If true, the centers of the 4 corner pixels of the input and output tensors are
-// aligned, preserving the values at the corner pixels. Defaults to false.
-// If not specified, defaults to false
-func ResizeBilinearAlignCorners(value bool) ResizeBilinearAttr {
-	return func(m optionalAttr) {
-		m["align_corners"] = value
+// *NOTE*: `LogicalAnd` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func LogicalAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LogicalAnd",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Resize `images` to `size` using bilinear interpolation.
-//
-// Input images can be of different types but output images are always float.
-//
-// Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
+// Computes the reciprocal of x element-wise.
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBilinearAttr) (resized_images tf.Output) {
+// I.e., \\(y = 1 / x\\).
+func Inv(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResizeBilinear",
+		Type: "Inv",
 		Input: []tf.Input{
-			images, size,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MaxAttr is an optional argument to Max.
-type MaxAttr func(optionalAttr)
-
-// MaxKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func MaxKeepDims(value bool) MaxAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the maximum of elements across dimensions of a tensor.
-//
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// Creates a dataset that batches input elements into a SparseTensor.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
+//	input_dataset: A handle to an input dataset. Must have a single component.
+//	batch_size: A scalar representing the number of elements to accumulate in a
+// batch.
+//	row_shape: A vector representing the dense shape of each row in the produced
+// SparseTensor. The shape may be partially specified, using `-1` to indicate
+// that a particular dimension should use the maximum size of all batch elements.
 //
-// Returns The reduced tensor.
-func Max(scope *Scope, input tf.Output, axis tf.Output, optional ...MaxAttr) (output tf.Output) {
+//
+func ExperimentalDenseToSparseBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, row_shape tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Max",
+		Type: "ExperimentalDenseToSparseBatchDataset",
 		Input: []tf.Input{
-			input, axis,
+			input_dataset, batch_size, row_shape,
 		},
 		Attrs: attrs,
 	}
@@ -23840,55 +29048,59 @@ func Max(scope *Scope, input tf.Output, axis tf.Output, optional ...MaxAttr) (ou
 	return op.Output(0)
 }
 
-// Creates a dataset that contains the unique elements of `input_dataset`.
-func ExperimentalUniqueDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Computes the reciprocal of x element-wise.
+//
+// I.e., \\(y = 1 / x\\).
+func Reciprocal(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalUniqueDataset",
+		Type: "Reciprocal",
 		Input: []tf.Input{
-			input_dataset,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ArgMinAttr is an optional argument to ArgMin.
-type ArgMinAttr func(optionalAttr)
+// Conv3DBackpropFilterAttr is an optional argument to Conv3DBackpropFilter.
+type Conv3DBackpropFilterAttr func(optionalAttr)
 
-// ArgMinOutputType sets the optional output_type attribute to value.
-// If not specified, defaults to DT_INT64
-func ArgMinOutputType(value tf.DataType) ArgMinAttr {
+// Conv3DBackpropFilterDilations sets the optional dilations attribute to value.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr {
 	return func(m optionalAttr) {
-		m["output_type"] = value
+		m["dilations"] = value
 	}
 }
 
-// Returns the index with the smallest value across dimensions of a tensor.
+// Computes the gradients of 3-D convolution with respect to the filter.
 //
-// Note that in case of ties the identity of the return value is not guaranteed.
+// DEPRECATED at GraphDef version 10: Use Conv3DBackpropFilterV2
 //
 // Arguments:
-//
-//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
-// Describes which dimension of the input Tensor to reduce across. For vectors,
-// use dimension = 0.
-func ArgMin(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMinAttr) (output tf.Output) {
+//	input: Shape `[batch, depth, rows, cols, in_channels]`.
+//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
+// `in_channels` must match between `input` and `filter`.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ArgMin",
+		Type: "Conv3DBackpropFilter",
 		Input: []tf.Input{
-			input, dimension,
+			input, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -23896,260 +29108,271 @@ func ArgMin(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgM
 	return op.Output(0)
 }
 
-// Convert the quantized 'input' tensor into a lower-precision 'output', using the
-//
-// output range specified with 'requested_output_min' and 'requested_output_max'.
+// Computes square root of x element-wise.
 //
-// [input_min, input_max] are scalar floats that specify the range for the float
-// interpretation of the 'input' data. For example, if input_min is -1.0f and
-// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
-// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
+// I.e., \\(y = \sqrt{x} = x^{1/2}\\).
+func Sqrt(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Sqrt",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Get the value of the tensor specified by its handle.
 //
 // Arguments:
+//	handle: The handle for a tensor stored in the session state.
+//	dtype: The type of the output value.
 //
-//	input_min: The float value that the minimum quantized input value represents.
-//	input_max: The float value that the maximum quantized input value represents.
-//	requested_output_min: The float value that the minimum quantized output value represents.
-//	requested_output_max: The float value that the maximum quantized output value represents.
-//	out_type: The type of the output. Should be a lower bit depth than Tinput.
-//
-// Returns The requested_output_min value is copied into this output.The requested_output_max value is copied into this output.
-func Requantize(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, requested_output_min tf.Output, requested_output_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+// Returns The tensor for the given handle.
+func GetSessionTensor(scope *Scope, handle tf.Output, dtype tf.DataType) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "Requantize",
+		Type: "GetSessionTensor",
 		Input: []tf.Input{
-			input, input_min, input_max, requested_output_min, requested_output_max,
+			handle,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Creates a dataset that emits the lines of one or more text files.
+// Computes the gradient for the sqrt of `x` wrt its input.
 //
-// Arguments:
-//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
-// read.
-//	compression_type: A scalar containing either (i) the empty string (no
-// compression), (ii) "ZLIB", or (iii) "GZIP".
-//	buffer_size: A scalar containing the number of bytes to buffer.
-func TextLineDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) {
+// Specifically, `grad = dy * 0.5 / y`, where `y = sqrt(x)`, and `dy`
+// is the corresponding input gradient.
+func SqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TextLineDataset",
+		Type: "SqrtGrad",
 		Input: []tf.Input{
-			filenames, compression_type, buffer_size,
+			y, dy,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the sum along segments of a tensor.
+// MatrixInverseAttr is an optional argument to MatrixInverse.
+type MatrixInverseAttr func(optionalAttr)
+
+// MatrixInverseAdjoint sets the optional adjoint attribute to value.
+// If not specified, defaults to false
+func MatrixInverseAdjoint(value bool) MatrixInverseAttr {
+	return func(m optionalAttr) {
+		m["adjoint"] = value
+	}
+}
+
+// Computes the inverse of one or more square invertible matrices or their
 //
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
+// adjoints (conjugate transposes).
 //
-// Computes a tensor such that
-// \\(output_i = \sum_j data_j\\) where sum is over `j` such
-// that `segment_ids[j] == i`.
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor of the same shape as the input
+// containing the inverse for all input submatrices `[..., :, :]`.
 //
-// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
+// The op uses LU decomposition with partial pivoting to compute the inverses.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentSum.png" alt>
-// </div>
+// If a matrix is not invertible there is no guarantee what the op does. It
+// may detect the condition and raise an exception or it may simply return a
+// garbage result.
 //
 // Arguments:
+//	input: Shape is `[..., M, M]`.
 //
-//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
+// Returns Shape is `[..., M, M]`.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.linalg.inv
+// @end_compatibility
+func MatrixInverse(scope *Scope, input tf.Output, optional ...MatrixInverseAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SegmentSum",
+		Type: "MatrixInverse",
 		Input: []tf.Input{
-			data, segment_ids,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the mean along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// Computes a tensor such that
-// \\(output_i = \frac{\sum_j data_j}{N}\\) where `mean` is
-// over `j` such that `segment_ids[j] == i` and `N` is the total number of
-// values summed.
-//
-// If the mean is empty for a given segment ID `i`, `output[i] = 0`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMean.png" alt>
-// </div>
-//
-// Arguments:
-//
-//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
+// Computes reciprocal of square root of x element-wise.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMean(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// I.e., \\(y = 1 / \sqrt{x}\\).
+func Rsqrt(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SegmentMean",
+		Type: "Rsqrt",
 		Input: []tf.Input{
-			data, segment_ids,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the minimum along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// Computes a tensor such that
-// \\(output_i = \min_j(data_j)\\) where `min` is over `j` such
-// that `segment_ids[j] == i`.
-//
-// If the min is empty for a given segment ID `i`, `output[i] = 0`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMin.png" alt>
-// </div>
-//
-// Arguments:
-//
-//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
+// Rounds the values of a tensor to the nearest integer, element-wise.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMin(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Rounds half to even.  Also known as bankers rounding. If you want to round
+// according to the current system rounding mode use std::cint.
+func Round(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SegmentMin",
+		Type: "Round",
 		Input: []tf.Input{
-			data, segment_ids,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Removes keys and its associated values from a table.
+// Delete the TensorArray from its resource container.
 //
-// The tensor `keys` must of the same type as the keys of the table. Keys not
-// already in the table are silently ignored.
+// This enables the user to close and release the resource in the middle
+// of a step/run.
 //
 // Arguments:
-//	table_handle: Handle to the table.
-//	keys: Any shape.  Keys of the elements to remove.
+//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
 //
 // Returns the created operation.
-func LookupTableRemoveV2(scope *Scope, table_handle tf.Output, keys tf.Output) (o *tf.Operation) {
+func TensorArrayCloseV3(scope *Scope, handle tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LookupTableRemoveV2",
+		Type: "TensorArrayCloseV3",
 		Input: []tf.Input{
-			table_handle, keys,
+			handle,
 		},
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Computes the sum along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
+// Computes exponential of x element-wise.  \\(y = e^x\\).
+func Exp(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Exp",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// NthElementAttr is an optional argument to NthElement.
+type NthElementAttr func(optionalAttr)
+
+// NthElementReverse sets the optional reverse attribute to value.
 //
-// Computes a tensor such that
-// \\(output[i] = \sum_{j...} data[j...]\\) where the sum is over tuples `j...` such
-// that `segment_ids[j...] == i`.  Unlike `SegmentSum`, `segment_ids`
-// need not be sorted and need not cover all values in the full
-// range of valid values.
+// value: When set to True, find the nth-largest value in the vector and vice
+// versa.
+// If not specified, defaults to false
+func NthElementReverse(value bool) NthElementAttr {
+	return func(m optionalAttr) {
+		m["reverse"] = value
+	}
+}
+
+// Finds values of the `n`-th order statistic for the last dimension.
 //
-// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
-// If the given segment ID `i` is negative, the value is dropped and will not be
-// added to the sum of the segment.
+// If the input is a vector (rank-1), finds the entries which is the nth-smallest
+// value in the vector and outputs their values as scalar tensor.
 //
-// `num_segments` should equal the number of distinct segment IDs.
+// For matrices (resp. higher rank input), computes the entries which is the
+// nth-smallest value in each row (resp. vector along the last dimension). Thus,
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
-// </div>
+//     values.shape = input.shape[:-1]
 //
 // Arguments:
+//	input: 1-D or higher with last dimension at least `n+1`.
+//	n: 0-D. Position of sorted vector to select along the last dimension (along
+// each row for matrices). Valid range of n is `[0, input.shape[:-1])`
 //
-//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
-//
-//
-// Returns Has same shape as data, except for the first `segment_ids.rank`
-// dimensions, which are replaced with a single dimension which has size
-// `num_segments`.
-func UnsortedSegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// Returns The `n`-th order statistic along each last dimensional slice.
+func NthElement(scope *Scope, input tf.Output, n tf.Output, optional ...NthElementAttr) (values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "UnsortedSegmentSum",
+		Type: "NthElement",
 		Input: []tf.Input{
-			data, segment_ids, num_segments,
+			input, n,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the product along segments of a tensor.
+// Computes the maximum along segments of a tensor.
 //
 // Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#segmentation)
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 // for an explanation of segments.
 //
 // This operator is similar to the unsorted segment sum operator found
 // [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
-// Instead of computing the sum over segments, it computes the product of all
-// entries belonging to a segment such that:
+// Instead of computing the sum over segments, it computes the maximum such that:
 //
-// \\(output_i = \prod_{j...} data[j...]\\) where the product is over tuples
-// `j...` such that `segment_ids[j...] == i`.
+// \\(output_i = \max_{j...} data[j...]\\) where max is over tuples `j...` such
+// that `segment_ids[j...] == i`.
 //
-// If there is no entry for a given segment ID `i`, it outputs 1.
+// If the maximum is empty for a given segment ID `i`, it outputs the smallest
+// possible value for the specific numeric type,
+// `output[i] = numeric_limits<T>::lowest()`.
 //
 // If the given segment ID `i` is negative, then the corresponding value is
 // dropped, and will not be included in the result.
 //
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentMax.png" alt>
+// </div>
+//
+// For example:
+//
+// ``` python
+// c = tf.constant([[1,2,3,4], [5,6,7,8], [4,3,2,1]])
+// tf.unsorted_segment_max(c, tf.constant([0, 1, 0]), num_segments=2)
+// # ==> [[ 4,  3, 3, 4],
+// #       [5,  6, 7, 8]]
+// ```
+//
+//
 // Arguments:
 //
 //	segment_ids: A tensor whose shape is a prefix of `data.shape`.
@@ -24158,12 +29381,12 @@ func UnsortedSegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output, num
 // Returns Has same shape as data, except for the first `segment_ids.rank`
 // dimensions, which are replaced with a single dimension which has size
 // `num_segments`.
-func UnsortedSegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+func UnsortedSegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "UnsortedSegmentProd",
+		Type: "UnsortedSegmentMax",
 		Input: []tf.Input{
 			data, segment_ids, num_segments,
 		},
@@ -24172,207 +29395,89 @@ func UnsortedSegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output, nu
 	return op.Output(0)
 }
 
-// Computes the mean along sparse segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
-// dimension, selecting a subset of dimension 0, specified by `indices`.
-//
-// Arguments:
-//
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentMean(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Computes softplus: `log(exp(features) + 1)`.
+func Softplus(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentMean",
+		Type: "Softplus",
 		Input: []tf.Input{
-			data, indices, segment_ids,
+			features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Deserializes a serialized tree ensemble config and replaces current tree
-//
-// ensemble.
-//
-// Arguments:
-//	tree_ensemble_handle: Handle to the tree ensemble.
-//	stamp_token: Token to use as the new value of the resource stamp.
-//	tree_ensemble_serialized: Serialized proto of the ensemble.
-//
-// Returns the created operation.
-func BoostedTreesDeserializeEnsemble(scope *Scope, tree_ensemble_handle tf.Output, stamp_token tf.Output, tree_ensemble_serialized tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BoostedTreesDeserializeEnsemble",
-		Input: []tf.Input{
-			tree_ensemble_handle, stamp_token, tree_ensemble_serialized,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Transforms a tf.Example proto (as a string) into typed tensors.
+// Computes exponential of x - 1 element-wise.
 //
-// Arguments:
-//	serialized: A vector containing a batch of binary serialized Example protos.
-//	dense_defaults: A list of Tensors (some may be empty), whose length matches
-// the length of `dense_keys`. dense_defaults[j] provides default values
-// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
-// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
-// The input type is inferred from dense_defaults[j], even when it's empty.
-// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
-// then the shape of dense_defaults[j] must match that of dense_shapes[j].
-// If dense_shapes[j] has an undefined major dimension (variable strides dense
-// feature), dense_defaults[j] must contain a single element:
-// the padding element.
-//	num_sparse: The number of sparse features to be parsed from the example. This
-// must match the lengths of `sparse_keys` and `sparse_types`.
-//	sparse_keys: A list of `num_sparse` strings.
-// The keys expected in the Examples' features associated with sparse values.
-//	dense_keys: The keys expected in the Examples' features associated with dense
-// values.
-//	sparse_types: A list of `num_sparse` types; the data types of data in each
-// Feature given in sparse_keys.
-// Currently the ParseSingleExample op supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-//	dense_shapes: The shapes of data in each Feature given in dense_keys.
-// The length of this list must match the length of `dense_keys`.  The
-// number of elements in the Feature corresponding to dense_key[j] must
-// always equal dense_shapes[j].NumEntries().  If dense_shapes[j] ==
-// (D0, D1, ..., DN) then the shape of output Tensor dense_values[j]
-// will be (D0, D1, ..., DN): In the case dense_shapes[j] = (-1, D1,
-// ..., DN), the shape of the output Tensor dense_values[j] will be (M,
-// D1, .., DN), where M is the number of blocks of elements of length
-// D1 * .... * DN, in the input.
-func ParseSingleExample(scope *Scope, serialized tf.Output, dense_defaults []tf.Output, num_sparse int64, sparse_keys []string, dense_keys []string, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
+// I.e., \\(y = (\exp x) - 1\\).
+func Expm1(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_sparse": num_sparse, "sparse_keys": sparse_keys, "dense_keys": dense_keys, "sparse_types": sparse_types, "dense_shapes": dense_shapes}
 	opspec := tf.OpSpec{
-		Type: "ParseSingleExample",
+		Type: "Expm1",
 		Input: []tf.Input{
-			serialized, tf.OutputList(dense_defaults),
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
-		scope.UpdateErr("ParseSingleExample", err)
-		return
-	}
-	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
-		scope.UpdateErr("ParseSingleExample", err)
-		return
-	}
-	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseSingleExample", err)
-		return
-	}
-	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
-		scope.UpdateErr("ParseSingleExample", err)
-		return
-	}
-	return sparse_indices, sparse_values, sparse_shapes, dense_values
-}
-
-// WholeFileReaderV2Attr is an optional argument to WholeFileReaderV2.
-type WholeFileReaderV2Attr func(optionalAttr)
-
-// WholeFileReaderV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func WholeFileReaderV2Container(value string) WholeFileReaderV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// WholeFileReaderV2SharedName sets the optional shared_name attribute to value.
-//
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func WholeFileReaderV2SharedName(value string) WholeFileReaderV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
+	return op.Output(0)
 }
 
-// A Reader that outputs the entire contents of a file as a value.
-//
-// To use, enqueue filenames in a Queue.  The output of ReaderRead will
-// be a filename (key) and the contents of that file (value).
+// Computes natural logarithm of x element-wise.
 //
-// Returns The handle to reference the Reader.
-func WholeFileReaderV2(scope *Scope, optional ...WholeFileReaderV2Attr) (reader_handle tf.Output) {
+// I.e., \\(y = \log_e x\\).
+func Log(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "WholeFileReaderV2",
-
-		Attrs: attrs,
+		Type: "Log",
+		Input: []tf.Input{
+			x,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Pop the element at the top of the stack.
+// Returns the index of a data point that should be added to the seed set.
+//
+// Entries in distances are assumed to be squared distances of candidate points to
+// the already sampled centers in the seed set. The op constructs one Markov chain
+// of the k-MC^2 algorithm and returns the index of one candidate point to be added
+// as an additional cluster center.
 //
 // Arguments:
-//	handle: The handle to a stack.
-//	elem_type: The type of the elem that is popped.
+//	distances: Vector with squared distances to the closest previously sampled cluster center
+// for each candidate point.
+//	seed: Scalar. Seed for initializing the random number generator.
 //
-// Returns The tensor that is popped from the top of the stack.
-func StackPopV2(scope *Scope, handle tf.Output, elem_type tf.DataType) (elem tf.Output) {
+// Returns Scalar with the index of the sampled point.
+func KMC2ChainInitialization(scope *Scope, distances tf.Output, seed tf.Output) (index tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"elem_type": elem_type}
 	opspec := tf.OpSpec{
-		Type: "StackPopV2",
+		Type: "KMC2ChainInitialization",
 		Input: []tf.Input{
-			handle,
+			distances, seed,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes hyperbolic cosine of x element-wise.
-func Cosh(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes hyperbolic sine of x element-wise.
+func Sinh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Cosh",
+		Type: "Sinh",
 		Input: []tf.Input{
 			x,
 		},
@@ -24381,123 +29486,83 @@ func Cosh(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Computes the mean along sparse segments of a tensor.
-//
-// Like `SparseSegmentMean`, but allows missing ids in `segment_ids`. If an id is
-// misisng, the `output` tensor at that position will be zeroed.
+// Computes the sum along sparse segments of a tensor.
 //
 // Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
 // for an explanation of segments.
 //
+// Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
+// dimension, selecting a subset of dimension 0, specified by `indices`.
+//
+// For example:
+//
+// ```python
+// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
+//
+// # Select two rows, one segment.
+// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))
+// # => [[0 0 0 0]]
+//
+// # Select two rows, two segment.
+// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))
+// # => [[ 1  2  3  4]
+// #     [-1 -2 -3 -4]]
+//
+// # Select all rows, two segments.
+// tf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))
+// # => [[0 0 0 0]
+// #     [5 6 7 8]]
+//
+// # Which is equivalent to:
+// tf.segment_sum(c, tf.constant([0, 0, 1]))
+// ```
+//
 // Arguments:
 //
 //	indices: A 1-D tensor. Has same rank as `segment_ids`.
 //	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//	num_segments: Should equal the number of distinct segment IDs.
 //
-// Returns Has same shape as data, except for dimension 0 which has size
-// `num_segments`.
-func SparseSegmentMeanWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentSum(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentMeanWithNumSegments",
+		Type: "SparseSegmentSum",
 		Input: []tf.Input{
-			data, indices, segment_ids, num_segments,
+			data, indices, segment_ids,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// CudnnRNNParamsSizeAttr is an optional argument to CudnnRNNParamsSize.
-type CudnnRNNParamsSizeAttr func(optionalAttr)
-
-// CudnnRNNParamsSizeRnnMode sets the optional rnn_mode attribute to value.
-// If not specified, defaults to "lstm"
-func CudnnRNNParamsSizeRnnMode(value string) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["rnn_mode"] = value
-	}
-}
-
-// CudnnRNNParamsSizeInputMode sets the optional input_mode attribute to value.
-// If not specified, defaults to "linear_input"
-func CudnnRNNParamsSizeInputMode(value string) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["input_mode"] = value
-	}
-}
-
-// CudnnRNNParamsSizeDirection sets the optional direction attribute to value.
-// If not specified, defaults to "unidirectional"
-func CudnnRNNParamsSizeDirection(value string) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["direction"] = value
-	}
-}
-
-// CudnnRNNParamsSizeDropout sets the optional dropout attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNParamsSizeDropout(value float32) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["dropout"] = value
-	}
-}
-
-// CudnnRNNParamsSizeSeed sets the optional seed attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNParamsSizeSeed(value int64) CudnnRNNParamsSizeAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
+// CastAttr is an optional argument to Cast.
+type CastAttr func(optionalAttr)
 
-// CudnnRNNParamsSizeSeed2 sets the optional seed2 attribute to value.
-// If not specified, defaults to 0
-func CudnnRNNParamsSizeSeed2(value int64) CudnnRNNParamsSizeAttr {
+// CastTruncate sets the optional Truncate attribute to value.
+// If not specified, defaults to false
+func CastTruncate(value bool) CastAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["Truncate"] = value
 	}
 }
 
-// Computes size of weights that can be used by a Cudnn RNN model.
-//
-// Return the params size that can be used by the Cudnn RNN model. Subsequent
-// weight allocation and initialization should use this size.
-//
-// num_layers: Specifies the number of layers in the RNN model.
-// num_units: Specifies the size of the hidden state.
-// input_size: Specifies the size of the input state.
-// rnn_mode: Indicates the type of the RNN model.
-// input_mode: Indicate whether there is a linear projection between the input and
-//   The actual computation before the first layer. 'skip_input' is only allowed
-//   when input_size == num_units; 'auto_select' implies 'skip_input' when
-//   input_size == num_units; otherwise, it implies 'linear_input'.
-// direction: Indicates whether a bidirectional model will be used.
-//   dir = (direction == bidirectional) ? 2 : 1
-// dropout: dropout probability. When set to 0., dropout is disabled.
-// seed: the 1st part of a seed to initialize dropout.
-// seed2: the 2nd part of a seed to initialize dropout.
-// params_size: The size of the params buffer that should be allocated and
-//   initialized for this RNN model. Note that this params buffer may not be
-//   compatible across GPUs. Please use CudnnRNNParamsWeights and
-//   CudnnRNNParamsBiases to save and restore them in a way that is compatible
-//   across different runs.
-func CudnnRNNParamsSize(scope *Scope, num_layers tf.Output, num_units tf.Output, input_size tf.Output, T tf.DataType, S tf.DataType, optional ...CudnnRNNParamsSizeAttr) (params_size tf.Output) {
+// Cast x of type SrcT to y of DstT.
+func Cast(scope *Scope, x tf.Output, DstT tf.DataType, optional ...CastAttr) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"T": T, "S": S}
+	attrs := map[string]interface{}{"DstT": DstT}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CudnnRNNParamsSize",
+		Type: "Cast",
 		Input: []tf.Input{
-			num_layers, num_units, input_size,
+			x,
 		},
 		Attrs: attrs,
 	}
@@ -24505,196 +29570,203 @@ func CudnnRNNParamsSize(scope *Scope, num_layers tf.Output, num_units tf.Output,
 	return op.Output(0)
 }
 
-// Computes gradients for SparseSegmentMean.
-//
-// Returns tensor "output" with same shape as grad, except for dimension 0 whose
-// value is output_dim0.
-//
-// Arguments:
-//	grad: gradient propagated to the SparseSegmentMean op.
-//	indices: indices passed to the corresponding SparseSegmentMean op.
-//	segment_ids: segment_ids passed to the corresponding SparseSegmentMean op.
-//	output_dim0: dimension 0 of "data" passed to SparseSegmentMean op.
-func SparseSegmentMeanGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
+// Computes the log of the absolute value of `Gamma(x)` element-wise.
+func Lgamma(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentMeanGrad",
+		Type: "Lgamma",
 		Input: []tf.Input{
-			grad, indices, segment_ids, output_dim0,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
+// UnicodeEncodeAttr is an optional argument to UnicodeEncode.
+type UnicodeEncodeAttr func(optionalAttr)
+
+// UnicodeEncodeErrors sets the optional errors attribute to value.
 //
-// N is the size of the segment being reduced.
+// value: Error handling policy when there is invalid formatting found in the input.
+// The value of 'strict' will cause the operation to produce a InvalidArgument
+// error on any invalid input formatting. A value of 'replace' (the default) will
+// cause the operation to replace any invalid formatting in the input with the
+// `replacement_char` codepoint. A value of 'ignore' will cause the operation to
+// skip any invalid formatting in the input and produce no corresponding output
+// character.
+// If not specified, defaults to "replace"
+func UnicodeEncodeErrors(value string) UnicodeEncodeAttr {
+	return func(m optionalAttr) {
+		m["errors"] = value
+	}
+}
+
+// UnicodeEncodeReplacementChar sets the optional replacement_char attribute to value.
 //
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
+// value: The replacement character codepoint to be used in place of any invalid
+// formatting in the input when `errors='replace'`. Any valid unicode codepoint may
+// be used. The default value is the default unicode replacement character is
+// 0xFFFD (U+65533).
+// If not specified, defaults to 65533
+func UnicodeEncodeReplacementChar(value int64) UnicodeEncodeAttr {
+	return func(m optionalAttr) {
+		m["replacement_char"] = value
+	}
+}
+
+// Encode a tensor of ints into unicode strings.
 //
-// Arguments:
+// Returns a vector of strings, where `output[i]` is constructed by encoding the
+// Unicode codepoints in `input_values[input_splits[i]:input_splits[i+1]]`
+// using `output_encoding`.
 //
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+// ---
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentSqrtN(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Example:
+//
+// ```
+// input_values = [72, 101, 108, 108, 111, 87, 111, 114, 108, 100]
+// input_splits = [0, 5, 10]
+// output_encoding = 'UTF-8'
+//
+// output = ['Hello', 'World']
+// ```
+//
+// Arguments:
+//	input_values: A 1D tensor containing the unicode codepoints that should be encoded.
+//	input_splits: A 1D tensor specifying how the unicode codepoints should be split into strings.
+// In particular, `output[i]` is constructed by encoding the codepoints in the
+// slice `input_values[input_splits[i]:input_splits[i+1]]`.
+//	output_encoding: Unicode encoding of the output strings. Valid encodings are: `"UTF-8",
+// "UTF-16-BE", and "UTF-32-BE"`.
+//
+// Returns The 1-D Tensor of strings encoded from the provided unicode codepoints.
+func UnicodeEncode(scope *Scope, input_values tf.Output, input_splits tf.Output, output_encoding string, optional ...UnicodeEncodeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_encoding": output_encoding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSqrtN",
+		Type: "UnicodeEncode",
 		Input: []tf.Input{
-			data, indices, segment_ids,
+			input_values, input_splits,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Compute the upper regularized incomplete Gamma function `Q(a, x)`.
-//
-// The upper regularized incomplete Gamma function is defined as:
-//
-// \\(Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)\\)
-//
-// where
-//
-// \\(Gamma(a, x) = int_{x}^{\infty} t^{a-1} exp(-t) dt\\)
-//
-// is the upper incomplete Gama function.
-//
-// Note, above `P(a, x)` (`Igamma`) is the lower regularized complete
-// Gamma function.
-func Igammac(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+// Computes the complementary error function of `x` element-wise.
+func Erfc(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Igammac",
+		Type: "Erfc",
 		Input: []tf.Input{
-			a, x,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
-//
-// N is the size of the segment being reduced.
-//
-// Like `SparseSegmentSqrtN`, but allows missing ids in `segment_ids`. If an id is
-// misisng, the `output` tensor at that position will be zeroed.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// Arguments:
-//
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//	num_segments: Should equal the number of distinct segment IDs.
+// Computes sigmoid of `x` element-wise.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentSqrtNWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// Specifically, `y = 1 / (1 + exp(-x))`.
+func Sigmoid(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSqrtNWithNumSegments",
+		Type: "Sigmoid",
 		Input: []tf.Input{
-			data, indices, segment_ids, num_segments,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes gradients for SparseSegmentSqrtN.
-//
-// Returns tensor "output" with same shape as grad, except for dimension 0 whose
-// value is output_dim0.
-//
-// Arguments:
-//	grad: gradient propagated to the SparseSegmentSqrtN op.
-//	indices: indices passed to the corresponding SparseSegmentSqrtN op.
-//	segment_ids: segment_ids passed to the corresponding SparseSegmentSqrtN op.
-//	output_dim0: dimension 0 of "data" passed to SparseSegmentSqrtN op.
-func SparseSegmentSqrtNGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
+// Computes sin of x element-wise.
+func Sin(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSqrtNGrad",
+		Type: "Sin",
 		Input: []tf.Input{
-			grad, indices, segment_ids, output_dim0,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// LRNGradAttr is an optional argument to LRNGrad.
-type LRNGradAttr func(optionalAttr)
+// FusedBatchNormGradAttr is an optional argument to FusedBatchNormGrad.
+type FusedBatchNormGradAttr func(optionalAttr)
 
-// LRNGradDepthRadius sets the optional depth_radius attribute to value.
+// FusedBatchNormGradEpsilon sets the optional epsilon attribute to value.
 //
-// value: A depth radius.
-// If not specified, defaults to 5
-func LRNGradDepthRadius(value int64) LRNGradAttr {
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormGradEpsilon(value float32) FusedBatchNormGradAttr {
 	return func(m optionalAttr) {
-		m["depth_radius"] = value
+		m["epsilon"] = value
 	}
 }
 
-// LRNGradBias sets the optional bias attribute to value.
+// FusedBatchNormGradDataFormat sets the optional data_format attribute to value.
 //
-// value: An offset (usually > 0 to avoid dividing by 0).
-// If not specified, defaults to 1
-func LRNGradBias(value float32) LRNGradAttr {
+// value: The data format for y_backprop, x, x_backprop.
+// Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormGradDataFormat(value string) FusedBatchNormGradAttr {
 	return func(m optionalAttr) {
-		m["bias"] = value
+		m["data_format"] = value
 	}
 }
 
-// LRNGradAlpha sets the optional alpha attribute to value.
+// FusedBatchNormGradIsTraining sets the optional is_training attribute to value.
 //
-// value: A scale factor, usually positive.
-// If not specified, defaults to 1
-func LRNGradAlpha(value float32) LRNGradAttr {
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
+// If not specified, defaults to true
+func FusedBatchNormGradIsTraining(value bool) FusedBatchNormGradAttr {
 	return func(m optionalAttr) {
-		m["alpha"] = value
+		m["is_training"] = value
 	}
 }
 
-// LRNGradBeta sets the optional beta attribute to value.
+// Gradient for batch normalization.
 //
-// value: An exponent.
-// If not specified, defaults to 0.5
-func LRNGradBeta(value float32) LRNGradAttr {
-	return func(m optionalAttr) {
-		m["beta"] = value
-	}
-}
-
-// Gradients for Local Response Normalization.
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
 //
 // Arguments:
-//	input_grads: 4-D with shape `[batch, height, width, channels]`.
-//	input_image: 4-D with shape `[batch, height, width, channels]`.
-//	output_image: 4-D with shape `[batch, height, width, channels]`.
+//	y_backprop: A 4D Tensor for the gradient with respect to y.
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
+// mean to be reused in gradient computation. When is_training is
+// False, a 1D Tensor for the population mean to be reused in both
+// 1st and 2nd order gradient computation.
+//	reserve_space_2: When is_training is True, a 1D Tensor for the computed batch
+// variance (inverted variance in the cuDNN case) to be reused in
+// gradient computation. When is_training is False, a 1D Tensor
+// for the population variance to be reused in both 1st and 2nd
+// order gradient computation.
 //
-// Returns The gradients for LRN.
-func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_image tf.Output, optional ...LRNGradAttr) (output tf.Output) {
+// Returns A 4D Tensor for the gradient with respect to x.A 1D Tensor for the gradient with respect to scale.A 1D Tensor for the gradient with respect to offset.Unused placeholder to match the mean input in FusedBatchNorm.Unused placeholder to match the variance input
+// in FusedBatchNorm.
+func FusedBatchNormGrad(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradAttr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -24703,216 +29775,120 @@ func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LRNGrad",
+		Type: "FusedBatchNormGrad",
 		Input: []tf.Input{
-			input_grads, input_image, output_image,
+			y_backprop, x, scale, reserve_space_1, reserve_space_2,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// AnyAttr is an optional argument to Any.
-type AnyAttr func(optionalAttr)
-
-// AnyKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func AnyKeepDims(value bool) AnyAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// Computes the "logical or" of elements across dimensions of a tensor.
-//
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
-//
-// Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
-//
-// Returns The reduced tensor.
-func Any(scope *Scope, input tf.Output, axis tf.Output, optional ...AnyAttr) (output tf.Output) {
+// Computes cos of x element-wise.
+func Cos(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Any",
+		Type: "Cos",
 		Input: []tf.Input{
-			input, axis,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DestroyResourceOpAttr is an optional argument to DestroyResourceOp.
-type DestroyResourceOpAttr func(optionalAttr)
-
-// DestroyResourceOpIgnoreLookupError sets the optional ignore_lookup_error attribute to value.
-//
-// value: whether to ignore the error when the resource
-// doesn't exist.
-// If not specified, defaults to true
-func DestroyResourceOpIgnoreLookupError(value bool) DestroyResourceOpAttr {
-	return func(m optionalAttr) {
-		m["ignore_lookup_error"] = value
-	}
-}
-
-// Deletes the resource specified by the handle.
-//
-// All subsequent operations using the resource will result in a NotFound
-// error status.
-//
-// Arguments:
-//	resource: handle to the resource to delete.
-//
-// Returns the created operation.
-func DestroyResourceOp(scope *Scope, resource tf.Output, optional ...DestroyResourceOpAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "DestroyResourceOp",
-		Input: []tf.Input{
-			resource,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Generates values in an interval.
-//
-// A sequence of `num` evenly-spaced values are generated beginning at `start`.
-// If `num > 1`, the values in the sequence increase by `stop - start / num - 1`,
-// so that the last one is exactly `stop`.
-//
-// For example:
+// Computes the determinant of one or more square matrices.
 //
-// ```
-// tf.linspace(10.0, 12.0, 3, name="linspace") => [ 10.0  11.0  12.0]
-// ```
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor containing the determinants
+// for all input submatrices `[..., :, :]`.
 //
 // Arguments:
-//	start: 0-D tensor. First entry in the range.
-//	stop: 0-D tensor. Last entry in the range.
-//	num: 0-D tensor. Number of values to generate.
+//	input: Shape is `[..., M, M]`.
 //
-// Returns 1-D. The generated values.
-func LinSpace(scope *Scope, start tf.Output, stop tf.Output, num tf.Output) (output tf.Output) {
+// Returns Shape is `[...]`.
+func MatrixDeterminant(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LinSpace",
+		Type: "MatrixDeterminant",
 		Input: []tf.Input{
-			start, stop, num,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ComplexAttr is an optional argument to Complex.
-type ComplexAttr func(optionalAttr)
-
-// ComplexTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_COMPLEX64
-func ComplexTout(value tf.DataType) ComplexAttr {
-	return func(m optionalAttr) {
-		m["Tout"] = value
-	}
-}
-
-// Converts two real numbers to a complex number.
-//
-// Given a tensor `real` representing the real part of a complex number, and a
-// tensor `imag` representing the imaginary part of a complex number, this
-// operation returns complex numbers elementwise of the form \\(a + bj\\), where
-// *a* represents the `real` part and *b* represents the `imag` part.
-//
-// The input tensors `real` and `imag` must have the same shape.
+// Updates the tree ensemble by either adding a layer to the last tree being grown
 //
-// For example:
+// or by starting a new tree.
 //
-// ```
-// # tensor 'real' is [2.25, 3.25]
-// # tensor `imag` is [4.75, 5.75]
-// tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
-// ```
-func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAttr) (out tf.Output) {
+// Arguments:
+//	tree_ensemble_handle: Handle to the ensemble variable.
+//	feature_ids: Rank 1 tensor with ids for each feature. This is the real id of
+// the feature that will be used in the split.
+//	node_ids: List of rank 1 tensors representing the nodes for which this feature
+// has a split.
+//	gains: List of rank 1 tensors representing the gains for each of the feature's
+// split.
+//	thresholds: List of rank 1 tensors representing the thesholds for each of the
+// feature's split.
+//	left_node_contribs: List of rank 2 tensors with left leaf contribs for each of
+// the feature's splits. Will be added to the previous node values to constitute
+// the values of the left nodes.
+//	right_node_contribs: List of rank 2 tensors with right leaf contribs for each
+// of the feature's splits. Will be added to the previous node values to constitute
+// the values of the right nodes.
+//	max_depth: Max depth of the tree to build.
+//	learning_rate: shrinkage const for each new tree.
+//	pruning_mode: 0-No pruning, 1-Pre-pruning, 2-Post-pruning.
+//
+// Returns the created operation.
+func BoostedTreesUpdateEnsemble(scope *Scope, tree_ensemble_handle tf.Output, feature_ids tf.Output, node_ids []tf.Output, gains []tf.Output, thresholds []tf.Output, left_node_contribs []tf.Output, right_node_contribs []tf.Output, max_depth tf.Output, learning_rate tf.Output, pruning_mode int64) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"pruning_mode": pruning_mode}
 	opspec := tf.OpSpec{
-		Type: "Complex",
+		Type: "BoostedTreesUpdateEnsemble",
 		Input: []tf.Input{
-			real, imag,
+			tree_ensemble_handle, feature_ids, tf.OutputList(node_ids), tf.OutputList(gains), tf.OutputList(thresholds), tf.OutputList(left_node_contribs), tf.OutputList(right_node_contribs), max_depth, learning_rate,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// ImagAttr is an optional argument to Imag.
-type ImagAttr func(optionalAttr)
-
-// ImagTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func ImagTout(value tf.DataType) ImagAttr {
-	return func(m optionalAttr) {
-		m["Tout"] = value
+// Computes tan of x element-wise.
+func Tan(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Tan",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns the imaginary part of a complex number.
-//
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the imaginary part of each element in `input`. All
-// elements in `input` must be complex numbers of the form \\(a + bj\\), where *a*
-// is the real part and *b* is the imaginary part returned by this operation.
-//
-// For example:
-//
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.imag(input) ==> [4.75, 5.75]
-// ```
-func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output) {
+// Creates a dataset that emits each dim-0 slice of `components` once.
+func TensorSliceDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Imag",
+		Type: "TensorSliceDataset",
 		Input: []tf.Input{
-			input,
+			tf.OutputList(components),
 		},
 		Attrs: attrs,
 	}
@@ -24920,13 +29896,13 @@ func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output
 	return op.Output(0)
 }
 
-// Computes hyperbolic tangent of `x` element-wise.
-func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes acos of x element-wise.
+func Acos(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Tanh",
+		Type: "Acos",
 		Input: []tf.Input{
 			x,
 		},
@@ -24935,92 +29911,71 @@ func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Computes the maximum along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// Computes a tensor such that
-// \\(output_i = \max_j(data_j)\\) where `max` is over `j` such
-// that `segment_ids[j] == i`.
-//
-// If the max is empty for a given segment ID `i`, `output[i] = 0`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
-// </div>
-//
-// Arguments:
+// Computes the Bessel i0e function of `x` element-wise.
 //
-//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
+// Exponentially scaled modified Bessel function of order 0 defined as
+// `bessel_i0e(x) = exp(-abs(x)) bessel_i0(x)`.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// This function is faster and numerically stabler than `bessel_i0(x)`.
+func BesselI0e(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SegmentMax",
+		Type: "BesselI0e",
 		Input: []tf.Input{
-			data, segment_ids,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that skips `count` elements from the `input_dataset`.
-//
-// Arguments:
-//
-//	count: A scalar representing the number of elements from the `input_dataset`
-// that should be skipped.  If count is -1, skips everything.
-//
+// Shuffle dimensions of x according to a permutation.
 //
-func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
+//   `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
+func Transpose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "SkipDataset",
+		Type: "Transpose",
 		Input: []tf.Input{
-			input_dataset, count,
+			x, perm,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// RealAttr is an optional argument to Real.
-type RealAttr func(optionalAttr)
+// MinAttr is an optional argument to Min.
+type MinAttr func(optionalAttr)
 
-// RealTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func RealTout(value tf.DataType) RealAttr {
+// MinKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func MinKeepDims(value bool) MinAttr {
 	return func(m optionalAttr) {
-		m["Tout"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Returns the real part of a complex number.
+// Computes the minimum of elements across dimensions of a tensor.
 //
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the real part of each element in `input`. All elements in
-// `input` must be complex numbers of the form \\(a + bj\\), where *a* is the real
-//  part returned by this operation and *b* is the imaginary part.
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
-// For example:
+// Arguments:
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.real(input) ==> [-2.25, 3.25]
-// ```
-func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output) {
+// Returns The reduced tensor.
+func Min(scope *Scope, input tf.Output, axis tf.Output, optional ...MinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -25029,9 +29984,9 @@ func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Real",
+		Type: "Min",
 		Input: []tf.Input{
-			input,
+			input, axis,
 		},
 		Attrs: attrs,
 	}
@@ -25039,80 +29994,66 @@ func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output
 	return op.Output(0)
 }
 
-// Sends `input` to all devices that are connected to the output.
-//
-// Sends `input` to all devices that are connected to the output.
-//
-// The graph should be constructed so that all ops connected to the output have a
-// valid device assignment, and the op itself is assigned one of these devices.
+// Computes the Bessel i1e function of `x` element-wise.
 //
-// input: The input to the broadcast.
-// output: The same as input.
-// shape: The shape of the input tensor.
+// Exponentially scaled modified Bessel function of order 0 defined as
+// `bessel_i1e(x) = exp(-abs(x)) bessel_i1(x)`.
 //
-func NcclBroadcast(scope *Scope, input tf.Output, shape tf.Shape) (output tf.Output) {
+// This function is faster and numerically stabler than `bessel_i1(x)`.
+func BesselI1e(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"shape": shape}
 	opspec := tf.OpSpec{
-		Type: "NcclBroadcast",
+		Type: "BesselI1e",
 		Input: []tf.Input{
-			input,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResizeAreaAttr is an optional argument to ResizeArea.
-type ResizeAreaAttr func(optionalAttr)
-
-// ResizeAreaAlignCorners sets the optional align_corners attribute to value.
+// Returns an element-wise indication of the sign of a number.
 //
-// value: If true, the centers of the 4 corner pixels of the input and output tensors are
-// aligned, preserving the values at the corner pixels. Defaults to false.
-// If not specified, defaults to false
-func ResizeAreaAlignCorners(value bool) ResizeAreaAttr {
-	return func(m optionalAttr) {
-		m["align_corners"] = value
+// `y = sign(x) = -1` if `x < 0`; 0 if `x == 0`; 1 if `x > 0`.
+//
+// For complex numbers, `y = sign(x) = x / |x|` if `x != 0`, otherwise `y = 0`.
+func Sign(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	opspec := tf.OpSpec{
+		Type: "Sign",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Resize `images` to `size` using area interpolation.
-//
-// Input images can be of different types but output images are always float.
+// Creates a dataset that passes a sliding window over `input_dataset`.
 //
-// The range of pixel values for the output image might be slightly different
-// from the range for the input image because of limited numerical precision.
-// To guarantee an output range, for example `[0.0, 1.0]`, apply
-// `tf.clip_by_value` to the output.
+// Arguments:
 //
-// Each output pixel is computed by first transforming the pixel's footprint into
-// the input tensor and then averaging the pixels that intersect the footprint. An
-// input pixel's contribution to the average is weighted by the fraction of its
-// area that intersects the footprint.  This is the same as OpenCV's INTER_AREA.
+//	window_size: A scalar representing the number of elements in the
+// sliding window.
+//	window_shift: A scalar representing the steps moving the sliding window
+// forward in one iteration. It must be positive.
+//	window_stride: A scalar representing the stride of the input elements of the sliding window.
+// It must be positive.
 //
-// Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeAreaAttr) (resized_images tf.Output) {
+func ExperimentalSlidingWindowDataset(scope *Scope, input_dataset tf.Output, window_size tf.Output, window_shift tf.Output, window_stride tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ResizeArea",
+		Type: "ExperimentalSlidingWindowDataset",
 		Input: []tf.Input{
-			images, size,
+			input_dataset, window_size, window_shift, window_stride,
 		},
 		Attrs: attrs,
 	}
@@ -25120,293 +30061,339 @@ func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...Resi
 	return op.Output(0)
 }
 
-// VarHandleOpAttr is an optional argument to VarHandleOp.
-type VarHandleOpAttr func(optionalAttr)
+// OrderedMapUnstageNoKeyAttr is an optional argument to OrderedMapUnstageNoKey.
+type OrderedMapUnstageNoKeyAttr func(optionalAttr)
 
-// VarHandleOpContainer sets the optional container attribute to value.
+// OrderedMapUnstageNoKeyCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// value: the container this variable is placed in.
+// REQUIRES: value >= 0
+func OrderedMapUnstageNoKeyCapacity(value int64) OrderedMapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// OrderedMapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func OrderedMapUnstageNoKeyMemoryLimit(value int64) OrderedMapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// OrderedMapUnstageNoKeyContainer sets the optional container attribute to value.
 // If not specified, defaults to ""
-func VarHandleOpContainer(value string) VarHandleOpAttr {
+func OrderedMapUnstageNoKeyContainer(value string) OrderedMapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// VarHandleOpSharedName sets the optional shared_name attribute to value.
-//
-// value: the name by which this variable is referred to.
+// OrderedMapUnstageNoKeySharedName sets the optional shared_name attribute to value.
 // If not specified, defaults to ""
-func VarHandleOpSharedName(value string) VarHandleOpAttr {
+func OrderedMapUnstageNoKeySharedName(value string) OrderedMapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// Creates a handle to a Variable resource.
+// Op removes and returns the (key, value) element with the smallest
 //
-// Arguments:
-//	dtype: the type of this variable. Must agree with the dtypes
-// of all ops using this variable.
-//	shape: The (possibly partially specified) shape of this variable.
-func VarHandleOp(scope *Scope, dtype tf.DataType, shape tf.Shape, optional ...VarHandleOpAttr) (resource tf.Output) {
+// key from the underlying container.   If the underlying container
+// does not contain elements, the op will block until it does.
+func OrderedMapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "VarHandleOp",
-
+		Type: "OrderedMapUnstageNoKey",
+		Input: []tf.Input{
+			indices,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// AngleAttr is an optional argument to Angle.
-type AngleAttr func(optionalAttr)
-
-// AngleTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func AngleTout(value tf.DataType) AngleAttr {
-	return func(m optionalAttr) {
-		m["Tout"] = value
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	key = op.Output(idx)
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("OrderedMapUnstageNoKey", err)
+		return
 	}
+	return key, values
 }
 
-// Returns the argument of a complex number.
-//
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the argument of each element in `input`. All elements in
-// `input` must be complex numbers of the form \\(a + bj\\), where *a*
-// is the real part and *b* is the imaginary part.
-//
-// The argument returned by this operation is of the form \\(atan2(b, a)\\).
+// Returns element-wise integer closest to x.
 //
+// If the result is midway between two representable values,
+// the even representable is chosen.
 // For example:
 //
 // ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.angle(input) ==> [2.0132, 1.056]
+// rint(-1.5) ==> -2.0
+// rint(0.5000001) ==> 1.0
+// rint([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) ==> [-2., -2., -0., 0., 2., 2., 2.]
 // ```
-//
-// @compatibility(numpy)
-// Equivalent to np.angle.
-// @end_compatibility
-func Angle(scope *Scope, input tf.Output, optional ...AngleAttr) (output tf.Output) {
+func Rint(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
+	opspec := tf.OpSpec{
+		Type: "Rint",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the derivative of a Gamma random sample w.r.t. `alpha`.
+func RandomGammaGrad(scope *Scope, alpha tf.Output, sample tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Angle",
+		Type: "RandomGammaGrad",
 		Input: []tf.Input{
-			input,
+			alpha, sample,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Clips tensor values to a specified min and max.
-//
-// Given a tensor `t`, this operation returns a tensor of the same type and
-// shape as `t` with its values clipped to `clip_value_min` and `clip_value_max`.
-// Any values less than `clip_value_min` are set to `clip_value_min`. Any values
-// greater than `clip_value_max` are set to `clip_value_max`.
-//
-// Arguments:
-//	t: A `Tensor`.
-//	clip_value_min: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
-// as `t`. The minimum value to clip by.
-//	clip_value_max: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
-// as `t`. The maximum value to clip by.
+// Returns x + y element-wise.
 //
-// Returns A clipped `Tensor` with the same shape as input 't'.
-func ClipByValue(scope *Scope, t tf.Output, clip_value_min tf.Output, clip_value_max tf.Output) (output tf.Output) {
+// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Add(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ClipByValue",
+		Type: "Add",
 		Input: []tf.Input{
-			t, clip_value_min, clip_value_max,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Counts the number of occurrences of each value in an integer array.
-//
-// Outputs a vector with length `size` and the same dtype as `weights`. If
-// `weights` are empty, then index `i` stores the number of times the value `i` is
-// counted in `arr`. If `weights` are non-empty, then index `i` stores the sum of
-// the value in `weights` at each index where the corresponding value in `arr` is
-// `i`.
-//
-// Values in `arr` outside of the range [0, size) are ignored.
-//
-// Arguments:
-//	arr: int32 `Tensor`.
-//	size: non-negative int32 scalar `Tensor`.
-//	weights: is an int32, int64, float32, or float64 `Tensor` with the same
-// shape as `arr`, or a length-0 `Tensor`, in which case it acts as all weights
-// equal to 1.
+// Returns x + y element-wise.
 //
-// Returns 1D `Tensor` with length equal to `size`. The counts or summed weights for
-// each value in the range [0, size).
-func Bincount(scope *Scope, arr tf.Output, size tf.Output, weights tf.Output) (bins tf.Output) {
+// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func AddV2(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Bincount",
+		Type: "AddV2",
 		Input: []tf.Input{
-			arr, size, weights,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// CumsumAttr is an optional argument to Cumsum.
-type CumsumAttr func(optionalAttr)
+// AllCandidateSamplerAttr is an optional argument to AllCandidateSampler.
+type AllCandidateSamplerAttr func(optionalAttr)
 
-// CumsumExclusive sets the optional exclusive attribute to value.
+// AllCandidateSamplerSeed sets the optional seed attribute to value.
 //
-// value: If `True`, perform exclusive cumsum.
-// If not specified, defaults to false
-func CumsumExclusive(value bool) CumsumAttr {
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func AllCandidateSamplerSeed(value int64) AllCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["exclusive"] = value
+		m["seed"] = value
 	}
 }
 
-// CumsumReverse sets the optional reverse attribute to value.
+// AllCandidateSamplerSeed2 sets the optional seed2 attribute to value.
 //
-// value: A `bool` (default: False).
-// If not specified, defaults to false
-func CumsumReverse(value bool) CumsumAttr {
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func AllCandidateSamplerSeed2(value int64) AllCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["reverse"] = value
+		m["seed2"] = value
 	}
 }
 
-// Compute the cumulative sum of the tensor `x` along `axis`.
+// Generates labels for candidate sampling with a learned unigram distribution.
 //
-// By default, this op performs an inclusive cumsum, which means that the first
-// element of the input is identical to the first element of the output:
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
 //
-// ```python
-// tf.cumsum([a, b, c])  # => [a, a + b, a + b + c]
-// ```
+// For each batch, this op picks a single set of sampled candidate labels.
 //
-// By setting the `exclusive` kwarg to `True`, an exclusive cumsum is
-// performed instead:
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
 //
-// ```python
-// tf.cumsum([a, b, c], exclusive=True)  # => [0, a, a + b]
-// ```
+// Arguments:
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to produce.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
 //
-// By setting the `reverse` kwarg to `True`, the cumsum is performed in the
-// opposite direction:
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func AllCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, optional ...AllCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "AllCandidateSampler",
+		Input: []tf.Input{
+			true_classes,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Returns element-wise remainder of division. When `x < 0` xor `y < 0` is
 //
-// ```python
-// tf.cumsum([a, b, c], reverse=True)  # => [a + b + c, b + c, c]
-// ```
+// true, this follows Python semantics in that the result here is consistent
+// with a flooring divide. E.g. `floor(x / y) * y + mod(x, y) = x`.
 //
-// This is more efficient than using separate `tf.reverse` ops.
+// *NOTE*: `FloorMod` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func FloorMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "FloorMod",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Saves the input tensors to disk.
 //
-// The `reverse` and `exclusive` kwargs can also be combined:
+// The size of `tensor_names` must match the number of tensors in `data`. `data[i]`
+// is written to `filename` with name `tensor_names[i]`.
 //
-// ```python
-// tf.cumsum([a, b, c], exclusive=True, reverse=True)  # => [b + c, c, 0]
-// ```
+// See also `SaveSlices`.
+//
+// Arguments:
+//	filename: Must have a single element. The name of the file to which we write
+// the tensor.
+//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
+//	data: `N` tensors to save.
 //
-// Arguments:
-//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
-// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
-// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
-//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
-// `[-rank(x), rank(x))`.
-func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) (out tf.Output) {
+// Returns the created operation.
+func Save(scope *Scope, filename tf.Output, tensor_names tf.Output, data []tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Cumsum",
+		Type: "Save",
 		Input: []tf.Input{
-			x, axis,
+			filename, tensor_names, tf.OutputList(data),
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Return the shape of s0 op s1 with broadcast.
+// Returns x * y element-wise. Returns zero if y is zero, even if x if infinite or NaN.
 //
-// Given `s0` and `s1`, tensors that represent shapes, compute `r0`, the
-// broadcasted shape. `s0`, `s1` and `r0` are all integer vectors.
-func BroadcastArgs(scope *Scope, s0 tf.Output, s1 tf.Output) (r0 tf.Output) {
+// *NOTE*: `Mul` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func MulNoNan(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BroadcastArgs",
+		Type: "MulNoNan",
 		Input: []tf.Input{
-			s0, s1,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DataFormatDimMapAttr is an optional argument to DataFormatDimMap.
-type DataFormatDimMapAttr func(optionalAttr)
-
-// DataFormatDimMapSrcFormat sets the optional src_format attribute to value.
+// Returns x / y element-wise for integer types.
 //
-// value: source data format.
-// If not specified, defaults to "NHWC"
-func DataFormatDimMapSrcFormat(value string) DataFormatDimMapAttr {
-	return func(m optionalAttr) {
-		m["src_format"] = value
+// Truncation designates that negative numbers will round fractional quantities
+// toward zero. I.e. -7 / 5 = -1. This matches C semantics but it is different
+// than Python semantics. See `FloorDiv` for a division function that matches
+// Python Semantics.
+//
+// *NOTE*: `TruncateDiv` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func TruncateDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TruncateDiv",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// DataFormatDimMapDstFormat sets the optional dst_format attribute to value.
+// RequantizePerChannelAttr is an optional argument to RequantizePerChannel.
+type RequantizePerChannelAttr func(optionalAttr)
+
+// RequantizePerChannelOutType sets the optional out_type attribute to value.
 //
-// value: destination data format.
-// If not specified, defaults to "NCHW"
-func DataFormatDimMapDstFormat(value string) DataFormatDimMapAttr {
+// value: The quantized type of output tensor that needs to be converted.
+// If not specified, defaults to DT_QUINT8
+func RequantizePerChannelOutType(value tf.DataType) RequantizePerChannelAttr {
 	return func(m optionalAttr) {
-		m["dst_format"] = value
+		m["out_type"] = value
 	}
 }
 
-// Returns the dimension index in the destination data format given the one in
-//
-// the source data format.
+// Requantizes input with min and max values known per channel.
 //
 // Arguments:
-//	x: A Tensor with each element as a dimension index in source data format.
-// Must be in the range [-4, 4).
+//	input: The original input tensor.
+//	input_min: The minimum value of the input tensor
+//	input_max: The maximum value of the input tensor.
+//	requested_output_min: The minimum value of the output tensor requested.
+//	requested_output_max: The maximum value of the output tensor requested.
 //
-// Returns A Tensor with each element as a dimension index in destination data format.
-func DataFormatDimMap(scope *Scope, x tf.Output, optional ...DataFormatDimMapAttr) (y tf.Output) {
+// Returns Output tensor.The minimum value of the final output tensorThe maximum value of the final output tensor.
+func RequantizePerChannel(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, requested_output_min tf.Output, requested_output_max tf.Output, optional ...RequantizePerChannelAttr) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -25415,218 +30402,151 @@ func DataFormatDimMap(scope *Scope, x tf.Output, optional ...DataFormatDimMapAtt
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DataFormatDimMap",
+		Type: "RequantizePerChannel",
 		Input: []tf.Input{
-			x,
+			input, input_min, input_max, requested_output_min, requested_output_max,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// CumprodAttr is an optional argument to Cumprod.
-type CumprodAttr func(optionalAttr)
-
-// CumprodExclusive sets the optional exclusive attribute to value.
-//
-// value: If `True`, perform exclusive cumprod.
-// If not specified, defaults to false
-func CumprodExclusive(value bool) CumprodAttr {
-	return func(m optionalAttr) {
-		m["exclusive"] = value
-	}
-}
-
-// CumprodReverse sets the optional reverse attribute to value.
-//
-// value: A `bool` (default: False).
-// If not specified, defaults to false
-func CumprodReverse(value bool) CumprodAttr {
-	return func(m optionalAttr) {
-		m["reverse"] = value
-	}
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Compute the cumulative product of the tensor `x` along `axis`.
-//
-// By default, this op performs an inclusive cumprod, which means that the first
-// element of the input is identical to the first element of the output:
-//
-// ```python
-// tf.cumprod([a, b, c])  # => [a, a * b, a * b * c]
-// ```
-//
-// By setting the `exclusive` kwarg to `True`, an exclusive cumprod is
-// performed instead:
-//
-// ```python
-// tf.cumprod([a, b, c], exclusive=True)  # => [1, a, a * b]
-// ```
-//
-// By setting the `reverse` kwarg to `True`, the cumprod is performed in the
-// opposite direction:
-//
-// ```python
-// tf.cumprod([a, b, c], reverse=True)  # => [a * b * c, b * c, c]
-// ```
+// Restores tensors from a V2 checkpoint.
 //
-// This is more efficient than using separate `tf.reverse` ops.
+// For backward compatibility with the V1 format, this Op currently allows
+// restoring from a V1 checkpoint as well:
+//   - This Op first attempts to find the V2 index file pointed to by "prefix", and
+//     if found proceed to read it as a V2 checkpoint;
+//   - Otherwise the V1 read path is invoked.
+// Relying on this behavior is not recommended, as the ability to fall back to read
+// V1 might be deprecated and eventually removed.
 //
-// The `reverse` and `exclusive` kwargs can also be combined:
+// By default, restores the named tensors in full.  If the caller wishes to restore
+// specific slices of stored tensors, "shape_and_slices" should be non-empty
+// strings and correspondingly well-formed.
 //
-// ```python
-// tf.cumprod([a, b, c], exclusive=True, reverse=True)  # => [b * c, c, 1]
-// ```
+// Callers must ensure all the named tensors are indeed stored in the checkpoint.
 //
 // Arguments:
-//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
-// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
-// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
-//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
-// `[-rank(x), rank(x))`.
-func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr) (out tf.Output) {
+//	prefix: Must have a single element.  The prefix of a V2 checkpoint.
+//	tensor_names: shape {N}.  The names of the tensors to be restored.
+//	shape_and_slices: shape {N}.  The slice specs of the tensors to be restored.
+// Empty strings indicate that they are non-partitioned tensors.
+//	dtypes: shape {N}.  The list of expected dtype for the tensors.  Must match
+// those stored in the checkpoint.
+//
+// Returns shape {N}.  The restored tensors, whose shapes are read from the
+// checkpoint directly.
+func RestoreV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, dtypes []tf.DataType) (tensors []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	opspec := tf.OpSpec{
-		Type: "Cumprod",
+		Type: "RestoreV2",
 		Input: []tf.Input{
-			x, axis,
+			prefix, tensor_names, shape_and_slices,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if tensors, idx, err = makeOutputList(op, idx, "tensors"); err != nil {
+		scope.UpdateErr("RestoreV2", err)
+		return
+	}
+	return tensors
 }
 
-// QuantizedMatMulAttr is an optional argument to QuantizedMatMul.
-type QuantizedMatMulAttr func(optionalAttr)
+// FIFOQueueV2Attr is an optional argument to FIFOQueueV2.
+type FIFOQueueV2Attr func(optionalAttr)
 
-// QuantizedMatMulToutput sets the optional Toutput attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedMatMulToutput(value tf.DataType) QuantizedMatMulAttr {
+// FIFOQueueV2Shapes sets the optional shapes attribute to value.
+//
+// value: The shape of each component in a value. The length of this attr must
+// be either 0 or the same as the length of component_types. If the length of
+// this attr is 0, the shapes of queue elements are not constrained, and
+// only one element may be dequeued at a time.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func FIFOQueueV2Shapes(value []tf.Shape) FIFOQueueV2Attr {
 	return func(m optionalAttr) {
-		m["Toutput"] = value
+		m["shapes"] = value
 	}
 }
 
-// QuantizedMatMulTransposeA sets the optional transpose_a attribute to value.
+// FIFOQueueV2Capacity sets the optional capacity attribute to value.
 //
-// value: If true, `a` is transposed before multiplication.
-// If not specified, defaults to false
-func QuantizedMatMulTransposeA(value bool) QuantizedMatMulAttr {
+// value: The upper bound on the number of elements in this queue.
+// Negative numbers mean no limit.
+// If not specified, defaults to -1
+func FIFOQueueV2Capacity(value int64) FIFOQueueV2Attr {
 	return func(m optionalAttr) {
-		m["transpose_a"] = value
+		m["capacity"] = value
 	}
 }
 
-// QuantizedMatMulTransposeB sets the optional transpose_b attribute to value.
+// FIFOQueueV2Container sets the optional container attribute to value.
 //
-// value: If true, `b` is transposed before multiplication.
-// If not specified, defaults to false
-func QuantizedMatMulTransposeB(value bool) QuantizedMatMulAttr {
+// value: If non-empty, this queue is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func FIFOQueueV2Container(value string) FIFOQueueV2Attr {
 	return func(m optionalAttr) {
-		m["transpose_b"] = value
+		m["container"] = value
 	}
 }
 
-// QuantizedMatMulTactivation sets the optional Tactivation attribute to value.
+// FIFOQueueV2SharedName sets the optional shared_name attribute to value.
 //
-// value: The type of output produced by activation function
-// following this operation.
-// If not specified, defaults to DT_QUINT8
-func QuantizedMatMulTactivation(value tf.DataType) QuantizedMatMulAttr {
+// value: If non-empty, this queue will be shared under the given name
+// across multiple sessions.
+// If not specified, defaults to ""
+func FIFOQueueV2SharedName(value string) FIFOQueueV2Attr {
 	return func(m optionalAttr) {
-		m["Tactivation"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Perform a quantized matrix multiplication of  `a` by the matrix `b`.
-//
-// The inputs must be two-dimensional matrices and the inner dimension of
-// `a` (after being transposed if `transpose_a` is non-zero) must match the
-// outer dimension of `b` (after being transposed if `transposed_b` is
-// non-zero).
+// A queue that produces elements in first-in first-out order.
 //
 // Arguments:
-//	a: Must be a two-dimensional tensor.
-//	b: Must be a two-dimensional tensor.
-//	min_a: The float value that the lowest quantized `a` value represents.
-//	max_a: The float value that the highest quantized `a` value represents.
-//	min_b: The float value that the lowest quantized `b` value represents.
-//	max_b: The float value that the highest quantized `b` value represents.
+//	component_types: The type of each component in a value.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedMatMul(scope *Scope, a tf.Output, b tf.Output, min_a tf.Output, max_a tf.Output, min_b tf.Output, max_b tf.Output, optional ...QuantizedMatMulAttr) (out tf.Output, min_out tf.Output, max_out tf.Output) {
+// Returns The handle to the queue.
+func FIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...FIFOQueueV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"component_types": component_types}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedMatMul",
-		Input: []tf.Input{
-			a, b, min_a, max_a, min_b, max_b,
-		},
+		Type: "FIFOQueueV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Does nothing. Serves as a control trigger for scheduling.
-//
-// Only useful as a placeholder for control edges.
-//
-// Returns the created operation.
-func ControlTrigger(scope *Scope) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ControlTrigger",
-	}
-	return scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Batch normalization.
-//
-// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
-//
-// This op is deprecated. Prefer `tf.nn.batch_normalization`.
-//
-// Arguments:
-//	t: A 4D input Tensor.
-//	m: A 1D mean Tensor with size matching the last dimension of t.
-// This is the first output from tf.nn.moments,
-// or a saved moving average thereof.
-//	v: A 1D variance Tensor with size matching the last dimension of t.
-// This is the second output from tf.nn.moments,
-// or a saved moving average thereof.
-//	beta: A 1D beta Tensor with size matching the last dimension of t.
-// An offset to be added to the normalized tensor.
-//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
-// If "scale_after_normalization" is true, this tensor will be multiplied
-// with the normalized tensor.
-//	variance_epsilon: A small float number to avoid dividing by 0.
-//	scale_after_normalization: A bool indicating whether the resulted tensor
-// needs to be multiplied with gamma.
-func BatchNormWithGlobalNormalization(scope *Scope, t tf.Output, m tf.Output, v tf.Output, beta tf.Output, gamma tf.Output, variance_epsilon float32, scale_after_normalization bool) (result tf.Output) {
+// Creates a dataset that contains the elements of `input_dataset` ignoring errors.
+func ExperimentalIgnoreErrorsDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "BatchNormWithGlobalNormalization",
+		Type: "ExperimentalIgnoreErrorsDataset",
 		Input: []tf.Input{
-			t, m, v, beta, gamma,
+			input_dataset,
 		},
 		Attrs: attrs,
 	}
@@ -25634,434 +30554,478 @@ func BatchNormWithGlobalNormalization(scope *Scope, t tf.Output, m tf.Output, v
 	return op.Output(0)
 }
 
-// Deprecated. Use TensorArrayReadV3
-//
-// DEPRECATED at GraphDef version 26: Use TensorArrayReadV3
-func TensorArrayReadV2(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
+// Returns 0 if x == 0, and x / y otherwise, elementwise.
+func Xdivy(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayReadV2",
+		Type: "Xdivy",
 		Input: []tf.Input{
-			handle, index, flow_in,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizedMulAttr is an optional argument to QuantizedMul.
-type QuantizedMulAttr func(optionalAttr)
-
-// QuantizedMulToutput sets the optional Toutput attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedMulToutput(value tf.DataType) QuantizedMulAttr {
-	return func(m optionalAttr) {
-		m["Toutput"] = value
-	}
-}
-
-// Returns x * y element-wise, working on quantized buffers.
+// Bucketizes 'input' based on 'boundaries'.
 //
-// Arguments:
+// For example, if the inputs are
+//     boundaries = [0, 10, 100]
+//     input = [[-5, 10000]
+//              [150,   10]
+//              [5,    100]]
 //
+// then the output will be
+//     output = [[0, 3]
+//               [3, 2]
+//               [1, 3]]
 //
-//	min_x: The float value that the lowest quantized `x` value represents.
-//	max_x: The float value that the highest quantized `x` value represents.
-//	min_y: The float value that the lowest quantized `y` value represents.
-//	max_y: The float value that the highest quantized `y` value represents.
+// Arguments:
+//	input: Any shape of Tensor contains with int or float type.
+//	boundaries: A sorted list of floats gives the boundary of the buckets.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+// Returns Same shape with 'input', each value of input replaced with bucket index.
 //
-// *NOTE*: `QuantizedMul` supports limited forms of broadcasting. More about
-// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func QuantizedMul(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedMulAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.digitize.
+// @end_compatibility
+func Bucketize(scope *Scope, input tf.Output, boundaries []float32) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"boundaries": boundaries}
 	opspec := tf.OpSpec{
-		Type: "QuantizedMul",
+		Type: "Bucketize",
 		Input: []tf.Input{
-			x, y, min_x, max_x, min_y, max_y,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// QuantizedAddAttr is an optional argument to QuantizedAdd.
-type QuantizedAddAttr func(optionalAttr)
-
-// QuantizedAddToutput sets the optional Toutput attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedAddToutput(value tf.DataType) QuantizedAddAttr {
-	return func(m optionalAttr) {
-		m["Toutput"] = value
-	}
+	return op.Output(0)
 }
 
-// Returns x + y element-wise, working on quantized buffers.
+// Calculates gains for each feature and returns the best possible split information for the feature.
 //
-// Arguments:
+// The split information is the best threshold (bucket id), gains and left/right node contributions per node for each feature.
 //
+// It is possible that not all nodes can be split on each feature. Hence, the list of possible nodes can differ between the features. Therefore, we return `node_ids_list` for each feature, containing the list of nodes that this feature can be used to split.
 //
-//	min_x: The float value that the lowest quantized `x` value represents.
-//	max_x: The float value that the highest quantized `x` value represents.
-//	min_y: The float value that the lowest quantized `y` value represents.
-//	max_y: The float value that the highest quantized `y` value represents.
+// In this manner, the output is the best split per features and per node, so that it needs to be combined later to produce the best split for each node (among all possible features).
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+// The length of output lists are all of the same length, `num_features`.
+// The output shapes are compatible in a way that the first dimension of all tensors of all lists are the same and equal to the number of possible split nodes for each feature.
 //
-// *NOTE*: `QuantizedAdd` supports limited forms of broadcasting. More about
-// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func QuantizedAdd(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedAddAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
+// Arguments:
+//	node_id_range: A Rank 1 tensor (shape=[2]) to specify the range [first, last) of node ids to process within `stats_summary_list`. The nodes are iterated between the two nodes specified by the tensor, as like `for node_id in range(node_id_range[0], node_id_range[1])` (Note that the last index node_id_range[1] is exclusive).
+//	stats_summary_list: A list of Rank 3 tensor (#shape=[max_splits, bucket, 2]) for accumulated stats summary (gradient/hessian) per node per buckets for each feature. The first dimension of the tensor is the maximum number of splits, and thus not all elements of it will be used, but only the indexes specified by node_ids will be used.
+//	l1: l1 regularization factor on leaf weights, per instance based.
+//	l2: l2 regularization factor on leaf weights, per instance based.
+//	tree_complexity: adjustment to the gain, per leaf based.
+//	min_node_weight: mininum avg of hessians in a node before required for the node to be considered for splitting.
+//	max_splits: the number of nodes that can be split in the whole tree. Used as a dimension of output tensors.
+//
+// Returns An output list of Rank 1 tensors indicating possible split node ids for each feature. The length of the list is num_features, but each tensor has different size as each feature provides different possible nodes. See above for details like shapes and sizes.An output list of Rank 1 tensors indicating the best gains for each feature to split for certain nodes. See above for details like shapes and sizes.An output list of Rank 1 tensors indicating the bucket id to compare with (as a threshold) for split in each node. See above for details like shapes and sizes.A list of Rank 2 tensors indicating the contribution of the left nodes when branching from parent nodes (given by the tensor element in the output node_ids_list) to the left direction by the given threshold for each feature. This value will be used to make the left node value by adding to the parent node value. Second dimension size is 1 for 1-dimensional logits, but would be larger for multi-class problems. See above for details like shapes and sizes.A list of Rank 2 tensors, with the same shape/conditions as left_node_contribs_list, but just that the value is for the right node.
+func BoostedTreesCalculateBestGainsPerFeature(scope *Scope, node_id_range tf.Output, stats_summary_list []tf.Output, l1 tf.Output, l2 tf.Output, tree_complexity tf.Output, min_node_weight tf.Output, max_splits int64) (node_ids_list []tf.Output, gains_list []tf.Output, thresholds_list []tf.Output, left_node_contribs_list []tf.Output, right_node_contribs_list []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"max_splits": max_splits}
 	opspec := tf.OpSpec{
-		Type: "QuantizedAdd",
+		Type: "BoostedTreesCalculateBestGainsPerFeature",
 		Input: []tf.Input{
-			x, y, min_x, max_x, min_y, max_y,
+			node_id_range, tf.OutputList(stats_summary_list), l1, l2, tree_complexity, min_node_weight,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Given a quantized tensor described by (input, input_min, input_max), outputs a
-//
-// range that covers the actual values present in that tensor.  This op is
-// typically used to produce the requested_output_min and requested_output_max for
-// Requantize.
-//
-// Arguments:
-//
-//	input_min: The float value that the minimum quantized input value represents.
-//	input_max: The float value that the maximum quantized input value represents.
-//
-// Returns The computed min output.the computed max output.
-func RequantizationRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output) (output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "RequantizationRange",
-		Input: []tf.Input{
-			input, input_min, input_max,
-		},
+	var idx int
+	var err error
+	if node_ids_list, idx, err = makeOutputList(op, idx, "node_ids_list"); err != nil {
+		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	if gains_list, idx, err = makeOutputList(op, idx, "gains_list"); err != nil {
+		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
+		return
+	}
+	if thresholds_list, idx, err = makeOutputList(op, idx, "thresholds_list"); err != nil {
+		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
+		return
+	}
+	if left_node_contribs_list, idx, err = makeOutputList(op, idx, "left_node_contribs_list"); err != nil {
+		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
+		return
+	}
+	if right_node_contribs_list, idx, err = makeOutputList(op, idx, "right_node_contribs_list"); err != nil {
+		scope.UpdateErr("BoostedTreesCalculateBestGainsPerFeature", err)
+		return
+	}
+	return node_ids_list, gains_list, thresholds_list, left_node_contribs_list, right_node_contribs_list
 }
 
-// Rolls the elements of a tensor along an axis.
-//
-// The elements are shifted positively (towards larger indices) by the offset of
-// `shift` along the dimension of `axis`. Negative `shift` values will shift
-// elements in the opposite direction. Elements that roll passed the last position
-// will wrap around to the first and vice versa. Multiple shifts along multiple
-// axes may be specified.
+// EncodePngAttr is an optional argument to EncodePng.
+type EncodePngAttr func(optionalAttr)
+
+// EncodePngCompression sets the optional compression attribute to value.
 //
-// For example:
+// value: Compression level.
+// If not specified, defaults to -1
+func EncodePngCompression(value int64) EncodePngAttr {
+	return func(m optionalAttr) {
+		m["compression"] = value
+	}
+}
+
+// PNG-encode an image.
 //
-// ```
-// # 't' is [0, 1, 2, 3, 4]
-// roll(t, shift=2, axis=0) ==> [3, 4, 0, 1, 2]
+// `image` is a 3-D uint8 or uint16 Tensor of shape `[height, width, channels]`
+// where `channels` is:
 //
-// # shifting along multiple dimensions
-// # 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
-// roll(t, shift=[1, -2], axis=[0, 1]) ==> [[7, 8, 9, 5, 6], [2, 3, 4, 0, 1]]
+// *   1: for grayscale.
+// *   2: for grayscale + alpha.
+// *   3: for RGB.
+// *   4: for RGBA.
 //
-// # shifting along the same axis multiple times
-// # 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
-// roll(t, shift=[2, -3], axis=[1, 1]) ==> [[1, 2, 3, 4, 0], [6, 7, 8, 9, 5]]
-// ```
+// The ZLIB compression level, `compression`, can be -1 for the PNG-encoder
+// default or a value from 0 to 9.  9 is the highest compression level, generating
+// the smallest output, but is slower.
 //
 // Arguments:
+//	image: 3-D with shape `[height, width, channels]`.
 //
-//	shift: Dimension must be 0-D or 1-D. `shift[i]` specifies the number of places by which
-// elements are shifted positively (towards larger indices) along the dimension
-// specified by `axis[i]`. Negative shifts will roll the elements in the opposite
-// direction.
-//	axis: Dimension must be 0-D or 1-D. `axis[i]` specifies the dimension that the shift
-// `shift[i]` should occur. If the same axis is referenced more than once, the
-// total shift for that axis will be the sum of all the shifts that belong to that
-// axis.
-//
-// Returns Has the same shape and size as the input. The elements are shifted
-// positively (towards larger indices) by the offsets of `shift` along the
-// dimensions of `axis`.
-func Roll(scope *Scope, input tf.Output, shift tf.Output, axis tf.Output) (output tf.Output) {
+// Returns 0-D. PNG-encoded image.
+func EncodePng(scope *Scope, image tf.Output, optional ...EncodePngAttr) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Roll",
+		Type: "EncodePng",
 		Input: []tf.Input{
-			input, shift, axis,
+			image,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Looks up keys in a table, outputs the corresponding values.
+// QueueDequeueUpToV2Attr is an optional argument to QueueDequeueUpToV2.
+type QueueDequeueUpToV2Attr func(optionalAttr)
+
+// QueueDequeueUpToV2TimeoutMs sets the optional timeout_ms attribute to value.
 //
-// The tensor `keys` must of the same type as the keys of the table.
-// The output `values` is of the type of the table values.
+// value: If the queue has fewer than n elements, this operation
+// will block for up to timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueDequeueUpToV2TimeoutMs(value int64) QueueDequeueUpToV2Attr {
+	return func(m optionalAttr) {
+		m["timeout_ms"] = value
+	}
+}
+
+// Dequeues `n` tuples of one or more tensors from the given queue.
 //
-// The scalar `default_value` is the value output for keys not present in the
-// table. It must also be of the same type as the table values.
+// This operation is not supported by all queues.  If a queue does not support
+// DequeueUpTo, then an Unimplemented error is returned.
 //
-// Arguments:
-//	table_handle: Handle to the table.
-//	keys: Any shape.  Keys to look up.
+// If the queue is closed and there are more than 0 but less than `n`
+// elements remaining, then instead of returning an OutOfRange error like
+// QueueDequeueMany, less than `n` elements are returned immediately.  If
+// the queue is closed and there are 0 elements left in the queue, then
+// an OutOfRange error is returned just like in QueueDequeueMany.
+// Otherwise the behavior is identical to QueueDequeueMany:
 //
+// This operation concatenates queue-element component tensors along the
+// 0th dimension to make a single component tensor.  All of the components
+// in the dequeued tuple will have size n in the 0th dimension.
 //
-// Returns Same shape as `keys`.  Values found in the table, or `default_values`
-// for missing keys.
-func LookupTableFindV2(scope *Scope, table_handle tf.Output, keys tf.Output, default_value tf.Output) (values tf.Output) {
+// This operation has `k` outputs, where `k` is the number of components in
+// the tuples stored in the given queue, and output `i` is the ith
+// component of the dequeued tuple.
+//
+// Arguments:
+//	handle: The handle to a queue.
+//	n: The number of tuples to dequeue.
+//	component_types: The type of each component in a tuple.
+//
+// Returns One or more tensors that were dequeued as a tuple.
+func QueueDequeueUpToV2(scope *Scope, handle tf.Output, n tf.Output, component_types []tf.DataType, optional ...QueueDequeueUpToV2Attr) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"component_types": component_types}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "LookupTableFindV2",
+		Type: "QueueDequeueUpToV2",
 		Input: []tf.Input{
-			table_handle, keys, default_value,
+			handle, n,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("QueueDequeueUpToV2", err)
+		return
+	}
+	return components
 }
 
-// Updates the table to associates keys with values.
-//
-// The tensor `keys` must be of the same type as the keys of the table.
-// The tensor `values` must be of the type of the table values.
-//
-// Arguments:
-//	table_handle: Handle to the table.
-//	keys: Any shape.  Keys to look up.
-//	values: Values to associate with keys.
+// Returns the max of x and y (i.e. x > y ? x : y) element-wise.
 //
-// Returns the created operation.
-func LookupTableInsertV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
+// *NOTE*: `Maximum` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Maximum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LookupTableInsertV2",
+		Type: "Maximum",
 		Input: []tf.Input{
-			table_handle, keys, values,
+			x, y,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Creates a dataset that batches and pads `batch_size` elements from the input.
-//
-// Arguments:
+// Returns element-wise remainder of division. This emulates C semantics in that
 //
-//	batch_size: A scalar representing the number of elements to accumulate in a
-// batch.
-//	padded_shapes: A list of int64 tensors representing the desired padded shapes
-// of the corresponding output components. These shapes may be partially
-// specified, using `-1` to indicate that a particular dimension should be
-// padded to the maximum size of all batch elements.
-//	padding_values: A list of scalars containing the padding value to use for
-// each of the outputs.
-//	drop_remainder: A scalar representing whether the last batch should be dropped in case its size
-// is smaller than desired.
+// the result here is consistent with a truncating divide. E.g.
+// `tf.truncatediv(x, y) * y + truncate_mod(x, y) = x`.
 //
-func PaddedBatchDatasetV2(scope *Scope, input_dataset tf.Output, batch_size tf.Output, padded_shapes []tf.Output, padding_values []tf.Output, drop_remainder tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
+// *NOTE*: `Mod` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Mod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "PaddedBatchDatasetV2",
+		Type: "Mod",
 		Input: []tf.Input{
-			input_dataset, batch_size, tf.OutputList(padded_shapes), tf.OutputList(padding_values), drop_remainder,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns element-wise smallest integer not less than x.
-func Ceil(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns element-wise remainder of division. This emulates C semantics in that
+//
+// the result here is consistent with a truncating divide. E.g. `truncate(x / y) *
+// y + truncate_mod(x, y) = x`.
+//
+// *NOTE*: `TruncateMod` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func TruncateMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Ceil",
+		Type: "TruncateMod",
 		Input: []tf.Input{
-			x,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the number of elements in the given table.
+// Computes offsets of concat inputs within its output.
+//
+// For example:
+//
+// ```
+// # 'x' is [2, 2, 7]
+// # 'y' is [2, 3, 7]
+// # 'z' is [2, 5, 7]
+// concat_offset(2, [x, y, z]) => [0, 0, 0], [0, 2, 0], [0, 5, 0]
+// ```
+//
+// This is typically used by gradient computations for a concat operation.
 //
 // Arguments:
-//	table_handle: Handle to the table.
+//	concat_dim: The dimension along which to concatenate.
+//	shape: The `N` int32 vectors representing shape of tensors being concatenated.
 //
-// Returns Scalar that contains number of elements in the table.
-func LookupTableSizeV2(scope *Scope, table_handle tf.Output) (size tf.Output) {
+// Returns The `N` int32 vectors representing the starting offset
+// of input tensors within the concatenated output.
+func ConcatOffset(scope *Scope, concat_dim tf.Output, shape []tf.Output) (offset []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LookupTableSizeV2",
+		Type: "ConcatOffset",
 		Input: []tf.Input{
-			table_handle,
+			concat_dim, tf.OutputList(shape),
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if offset, idx, err = makeOutputList(op, idx, "offset"); err != nil {
+		scope.UpdateErr("ConcatOffset", err)
+		return
+	}
+	return offset
 }
 
-// ResizeBilinearGradAttr is an optional argument to ResizeBilinearGrad.
-type ResizeBilinearGradAttr func(optionalAttr)
+// LoadTPUEmbeddingRMSPropParametersGradAccumDebugAttr is an optional argument to LoadTPUEmbeddingRMSPropParametersGradAccumDebug.
+type LoadTPUEmbeddingRMSPropParametersGradAccumDebugAttr func(optionalAttr)
 
-// ResizeBilinearGradAlignCorners sets the optional align_corners attribute to value.
+// LoadTPUEmbeddingRMSPropParametersGradAccumDebugTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
-// aligned. Defaults to false.
-// If not specified, defaults to false
-func ResizeBilinearGradAlignCorners(value bool) ResizeBilinearGradAttr {
+// REQUIRES: value >= -1
+func LoadTPUEmbeddingRMSPropParametersGradAccumDebugTableId(value int64) LoadTPUEmbeddingRMSPropParametersGradAccumDebugAttr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["table_id"] = value
 	}
 }
 
-// Computes the gradient of bilinear interpolation.
+// LoadTPUEmbeddingRMSPropParametersGradAccumDebugTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func LoadTPUEmbeddingRMSPropParametersGradAccumDebugTableName(value string) LoadTPUEmbeddingRMSPropParametersGradAccumDebugAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Load RMSProp embedding parameters with debug support.
+//
+// An op that loads optimization parameters into HBM for embedding. Must be
+// preceded by a ConfigureTPUEmbeddingHost op that sets up the correct
+// embedding table configuration. For example, this op is used to install
+// parameters that are loaded from a checkpoint before a training loop is
+// executed.
 //
 // Arguments:
-//	grads: 4-D with shape `[batch, height, width, channels]`.
-//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
-// The image tensor that was resized.
+//	parameters: Value of parameters used in the RMSProp optimization algorithm.
+//	ms: Value of ms used in the RMSProp optimization algorithm.
+//	mom: Value of mom used in the RMSProp optimization algorithm.
+//	gradient_accumulators: Value of gradient_accumulators used in the RMSProp optimization algorithm.
 //
-// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
-// Gradients with respect to the input image. Input image must have been
-// float or double.
-func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBilinearGradAttr) (output tf.Output) {
+//
+//
+// Returns the created operation.
+func LoadTPUEmbeddingRMSPropParametersGradAccumDebug(scope *Scope, parameters tf.Output, ms tf.Output, mom tf.Output, gradient_accumulators tf.Output, num_shards int64, shard_id int64, optional ...LoadTPUEmbeddingRMSPropParametersGradAccumDebugAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeBilinearGrad",
+		Type: "LoadTPUEmbeddingRMSPropParametersGradAccumDebug",
 		Input: []tf.Input{
-			grads, original_image,
+			parameters, ms, mom, gradient_accumulators,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Outputs all keys and values in the table.
+// Compute the lower regularized incomplete Gamma function `P(a, x)`.
 //
-// Arguments:
-//	table_handle: Handle to the table.
+// The lower regularized incomplete Gamma function is defined as:
 //
 //
+// \\(P(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)\\)
 //
-// Returns Vector of all keys present in the table.Tensor of all values in the table. Indexed in parallel with `keys`.
-func LookupTableExportV2(scope *Scope, table_handle tf.Output, Tkeys tf.DataType, Tvalues tf.DataType) (keys tf.Output, values tf.Output) {
+// where
+//
+// \\(gamma(a, x) = \\int_{0}^{x} t^{a-1} exp(-t) dt\\)
+//
+// is the lower incomplete Gamma function.
+//
+// Note, above `Q(a, x)` (`Igammac`) is the upper regularized complete
+// Gamma function.
+func Igamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"Tkeys": Tkeys, "Tvalues": Tvalues}
 	opspec := tf.OpSpec{
-		Type: "LookupTableExportV2",
+		Type: "Igamma",
 		Input: []tf.Input{
-			table_handle,
+			a, x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Replaces the contents of the table with the specified keys and values.
+// Compute the regularized incomplete beta integral \\(I_x(a, b)\\).
 //
-// The tensor `keys` must be of the same type as the keys of the table.
-// The tensor `values` must be of the type of the table values.
+// The regularized incomplete beta integral is defined as:
 //
-// Arguments:
-//	table_handle: Handle to the table.
-//	keys: Any shape.  Keys to look up.
-//	values: Values to associate with keys.
 //
-// Returns the created operation.
-func LookupTableImportV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
+// \\(I_x(a, b) = \frac{B(x; a, b)}{B(a, b)}\\)
+//
+// where
+//
+//
+// \\(B(x; a, b) = \int_0^x t^{a-1} (1 - t)^{b-1} dt\\)
+//
+//
+// is the incomplete beta function and \\(B(a, b)\\) is the *complete*
+// beta function.
+func Betainc(scope *Scope, a tf.Output, b tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LookupTableImportV2",
+		Type: "Betainc",
 		Input: []tf.Input{
-			table_handle, keys, values,
+			a, b, x,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MultiDeviceIteratorFromStringHandleAttr is an optional argument to MultiDeviceIteratorFromStringHandle.
-type MultiDeviceIteratorFromStringHandleAttr func(optionalAttr)
+// ShapeAttr is an optional argument to Shape.
+type ShapeAttr func(optionalAttr)
 
-// MultiDeviceIteratorFromStringHandleOutputTypes sets the optional output_types attribute to value.
-//
-// value: The type list for the return values.
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func MultiDeviceIteratorFromStringHandleOutputTypes(value []tf.DataType) MultiDeviceIteratorFromStringHandleAttr {
+// ShapeOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func ShapeOutType(value tf.DataType) ShapeAttr {
 	return func(m optionalAttr) {
-		m["output_types"] = value
+		m["out_type"] = value
 	}
 }
 
-// MultiDeviceIteratorFromStringHandleOutputShapes sets the optional output_shapes attribute to value.
-//
-// value: The list of shapes being produced.
-// If not specified, defaults to <>
+// Returns the shape of a tensor.
 //
-// REQUIRES: len(value) >= 0
-func MultiDeviceIteratorFromStringHandleOutputShapes(value []tf.Shape) MultiDeviceIteratorFromStringHandleAttr {
-	return func(m optionalAttr) {
-		m["output_shapes"] = value
-	}
-}
-
-// Generates a MultiDeviceIterator resource from its provided string handle.
+// This operation returns a 1-D integer tensor representing the shape of `input`.
 //
-// Arguments:
-//	string_handle: String representing the resource.
+// For example:
 //
-// Returns A MultiDeviceIterator resource.
-func MultiDeviceIteratorFromStringHandle(scope *Scope, string_handle tf.Output, optional ...MultiDeviceIteratorFromStringHandleAttr) (multi_device_iterator tf.Output) {
+// ```
+// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
+// shape(t) ==> [2, 2, 3]
+// ```
+func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -26070,9 +31034,9 @@ func MultiDeviceIteratorFromStringHandle(scope *Scope, string_handle tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MultiDeviceIteratorFromStringHandle",
+		Type: "Shape",
 		Input: []tf.Input{
-			string_handle,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -26080,162 +31044,72 @@ func MultiDeviceIteratorFromStringHandle(scope *Scope, string_handle tf.Output,
 	return op.Output(0)
 }
 
-// MutableHashTableV2Attr is an optional argument to MutableHashTableV2.
-type MutableHashTableV2Attr func(optionalAttr)
-
-// MutableHashTableV2Container sets the optional container attribute to value.
+// Computes fingerprints of the input strings.
 //
-// value: If non-empty, this table is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func MutableHashTableV2Container(value string) MutableHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MutableHashTableV2SharedName sets the optional shared_name attribute to value.
+// Arguments:
+//	input: vector of strings to compute fingerprints on.
 //
-// value: If non-empty, this table is shared under the given name across
-// multiple sessions.
-// If not specified, defaults to ""
-func MutableHashTableV2SharedName(value string) MutableHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+// Returns a (N,2) shaped matrix where N is the number of elements in the input
+// vector. Each row contains the low and high parts of the fingerprint.
+func SdcaFprint(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// MutableHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
-//
-// value: If true and shared_name is empty, the table is shared
-// using the node name.
-// If not specified, defaults to false
-func MutableHashTableV2UseNodeNameSharing(value bool) MutableHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["use_node_name_sharing"] = value
+	opspec := tf.OpSpec{
+		Type: "SdcaFprint",
+		Input: []tf.Input{
+			input,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Creates an empty hash table.
-//
-// This op creates a mutable hash table, specifying the type of its keys and
-// values. Each value must be a scalar. Data can be inserted into the table using
-// the insert operations. It does not support the initialization operation.
+// Computes the power of one value to another.
 //
-// Arguments:
-//	key_dtype: Type of the table keys.
-//	value_dtype: Type of the table values.
+// Given a tensor `x` and a tensor `y`, this operation computes \\(x^y\\) for
+// corresponding elements in `x` and `y`. For example:
 //
-// Returns Handle to a table.
-func MutableHashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableV2Attr) (table_handle tf.Output) {
+// ```
+// # tensor 'x' is [[2, 2]], [3, 3]]
+// # tensor 'y' is [[8, 16], [2, 3]]
+// tf.pow(x, y) ==> [[256, 65536], [9, 27]]
+// ```
+func Pow(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "MutableHashTableV2",
-
-		Attrs: attrs,
+		Type: "Pow",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DequantizeAttr is an optional argument to Dequantize.
-type DequantizeAttr func(optionalAttr)
+// QuantizedReluXAttr is an optional argument to QuantizedReluX.
+type QuantizedReluXAttr func(optionalAttr)
 
-// DequantizeMode sets the optional mode attribute to value.
-// If not specified, defaults to "MIN_COMBINED"
-func DequantizeMode(value string) DequantizeAttr {
+// QuantizedReluXOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedReluXOutType(value tf.DataType) QuantizedReluXAttr {
 	return func(m optionalAttr) {
-		m["mode"] = value
+		m["out_type"] = value
 	}
 }
 
-// Dequantize the 'input' tensor into a float Tensor.
-//
-// [min_range, max_range] are scalar floats that specify the range for
-// the 'input' data. The 'mode' attribute controls exactly which calculations are
-// used to convert the float values to their quantized equivalents.
-//
-// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
-//
-// ```
-// if T == qint8: in[i] += (range(T) + 1)/ 2.0
-// out[i] = min_range + (in[i]* (max_range - min_range) / range(T))
-// ```
-// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
-//
-// *MIN_COMBINED Mode Example*
-//
-// If the input comes from a QuantizedRelu6, the output type is
-// quint8 (range of 0-255) but the possible range of QuantizedRelu6 is
-// 0-6.  The min_range and max_range values are therefore 0.0 and 6.0.
-// Dequantize on quint8 will take each value, cast to float, and multiply
-// by 6 / 255.
-// Note that if quantizedtype is qint8, the operation will additionally add
-// each value by 128 prior to casting.
-//
-// If the mode is 'MIN_FIRST', then this approach is used:
-//
-// ```c++
-// num_discrete_values = 1 << (# of bits in T)
-// range_adjust = num_discrete_values / (num_discrete_values - 1)
-// range = (range_max - range_min) * range_adjust
-// range_scale = range / num_discrete_values
-// const double offset_input = static_cast<double>(input) - lowest_quantized;
-// result = range_min + ((input - numeric_limits<T>::min()) * range_scale)
-// ```
-//
-// *SCALED mode Example*
-//
-// `SCALED` mode matches the quantization approach used in
-// `QuantizeAndDequantize{V2|V3}`.
-//
-// If the mode is `SCALED`, we do not use the full range of the output type,
-// choosing to elide the lowest possible value for symmetry (e.g., output range is
-// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
-// 0.
-//
-// We first find the range of values in our tensor. The
-// range we use is always centered on 0, so we find m such that
-// ```c++
-//   m = max(abs(input_min), abs(input_max))
-// ```
-//
-// Our input tensor range is then `[-m, m]`.
-//
-// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
-// If T is signed, this is
-// ```
-//   num_bits = sizeof(T) * 8
-//   [min_fixed, max_fixed] =
-//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
-// ```
-//
-// Otherwise, if T is unsigned, the fixed-point range is
-// ```
-//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
-// ```
+// Computes Quantized Rectified Linear X: `min(max(features, 0), max_value)`
 //
-// From this we compute our scaling factor, s:
-// ```c++
-//   s = (2 * m) / (max_fixed - min_fixed)
-// ```
+// Arguments:
 //
-// Now we can dequantize the elements of our tensor:
-// ```c++
-// result = input * s
-// ```
 //
-// Arguments:
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
 //
-//	min_range: The minimum scalar value possibly produced for the input.
-//	max_range: The maximum scalar value possibly produced for the input.
-func Dequantize(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, optional ...DequantizeAttr) (output tf.Output) {
+// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
+func QuantizedReluX(scope *Scope, features tf.Output, max_value tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluXAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -26244,143 +31118,137 @@ func Dequantize(scope *Scope, input tf.Output, min_range tf.Output, max_range tf
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Dequantize",
+		Type: "QuantizedReluX",
 		Input: []tf.Input{
-			input, min_range, max_range,
+			features, max_value, min_features, max_features,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Flips all bits elementwise.
+// Returns the truth value of (x < y) element-wise.
 //
-// The result will have exactly those bits set, that are not set in `x`. The
-// computation is performed on the underlying representation of x.
-func Invert(scope *Scope, x tf.Output) (y tf.Output) {
+// *NOTE*: `Less` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Less(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Invert",
+		Type: "Less",
 		Input: []tf.Input{
-			x,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Inverse 3D fast Fourier transform.
-//
-// Computes the inverse 3-dimensional discrete Fourier transform over the
-// inner-most 3 dimensions of `input`.
-//
-// Arguments:
-//	input: A complex64 tensor.
-//
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
-//   dimensions of `input` are replaced with their inverse 3D Fourier transform.
+// RandomPoissonAttr is an optional argument to RandomPoisson.
+type RandomPoissonAttr func(optionalAttr)
+
+// RandomPoissonSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func RandomPoissonSeed(value int64) RandomPoissonAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// RandomPoissonSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func RandomPoissonSeed2(value int64) RandomPoissonAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Use RandomPoissonV2 instead.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.ifftn with 3 dimensions.
-// @end_compatibility
-func IFFT3D(scope *Scope, input tf.Output) (output tf.Output) {
+// DEPRECATED at GraphDef version 25: Replaced by RandomPoissonV2
+func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "IFFT3D",
+		Type: "RandomPoisson",
 		Input: []tf.Input{
-			input,
+			shape, rate,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Deprecated. Disallowed in GraphDef version >= 2.
+// Gets the next output from the given iterator.
 //
-// DEPRECATED at GraphDef version 2: Use AdjustContrastv2 instead
-func AdjustContrast(scope *Scope, images tf.Output, contrast_factor tf.Output, min_value tf.Output, max_value tf.Output) (output tf.Output) {
+// This operation is a synchronous version IteratorGetNext. It should only be used
+// in situations where the iterator does not block the calling thread, or where
+// the calling thread is not a member of the thread pool used to execute parallel
+// operations (e.g. in eager mode).
+func IteratorGetNextSync(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "AdjustContrast",
+		Type: "IteratorGetNextSync",
 		Input: []tf.Input{
-			images, contrast_factor, min_value, max_value,
+			iterator,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("IteratorGetNextSync", err)
+		return
+	}
+	return components
 }
 
-// Table initializer that takes two tensors for keys and values respectively.
-//
-// Arguments:
-//	table_handle: Handle to a table which will be initialized.
-//	keys: Keys of type Tkey.
-//	values: Values of type Tval.
+// Returns the truth value of (x >= y) element-wise.
 //
-// Returns the created operation.
-func InitializeTableV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
+// *NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func GreaterEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "InitializeTableV2",
+		Type: "GreaterEqual",
 		Input: []tf.Input{
-			table_handle, keys, values,
+			x, y,
 		},
 	}
-	return scope.AddOperation(opspec)
-}
-
-// PrintAttr is an optional argument to Print.
-type PrintAttr func(optionalAttr)
-
-// PrintMessage sets the optional message attribute to value.
-//
-// value: A string, prefix of the error message.
-// If not specified, defaults to ""
-func PrintMessage(value string) PrintAttr {
-	return func(m optionalAttr) {
-		m["message"] = value
-	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// PrintFirstN sets the optional first_n attribute to value.
-//
-// value: Only log `first_n` number of times. -1 disables logging.
-// If not specified, defaults to -1
-func PrintFirstN(value int64) PrintAttr {
-	return func(m optionalAttr) {
-		m["first_n"] = value
-	}
-}
+// ApproximateEqualAttr is an optional argument to ApproximateEqual.
+type ApproximateEqualAttr func(optionalAttr)
 
-// PrintSummarize sets the optional summarize attribute to value.
-//
-// value: Only print this many entries of each tensor.
-// If not specified, defaults to 3
-func PrintSummarize(value int64) PrintAttr {
+// ApproximateEqualTolerance sets the optional tolerance attribute to value.
+// If not specified, defaults to 1e-05
+func ApproximateEqualTolerance(value float32) ApproximateEqualAttr {
 	return func(m optionalAttr) {
-		m["summarize"] = value
+		m["tolerance"] = value
 	}
 }
 
-// Prints a list of tensors.
-//
-// Passes `input` through to `output` and prints `data` when evaluating.
-//
-// Arguments:
-//	input: The tensor passed to `output`
-//	data: A list of tensors to print out when op is evaluated.
-//
-// Returns = The unmodified `input` tensor
-func Print(scope *Scope, input tf.Output, data []tf.Output, optional ...PrintAttr) (output tf.Output) {
+// Returns the truth value of abs(x-y) < tolerance element-wise.
+func ApproximateEqual(scope *Scope, x tf.Output, y tf.Output, optional ...ApproximateEqualAttr) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -26389,9 +31257,9 @@ func Print(scope *Scope, input tf.Output, data []tf.Output, optional ...PrintAtt
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Print",
+		Type: "ApproximateEqual",
 		Input: []tf.Input{
-			input, tf.OutputList(data),
+			x, y,
 		},
 		Attrs: attrs,
 	}
@@ -26399,93 +31267,57 @@ func Print(scope *Scope, input tf.Output, data []tf.Output, optional ...PrintAtt
 	return op.Output(0)
 }
 
-// Outputs a `Summary` protocol buffer with a tensor and per-plugin data.
-//
-// Arguments:
-//	tag: A string attached to this summary. Used for organization in TensorBoard.
-//	tensor: A tensor to serialize.
-//	serialized_summary_metadata: A serialized SummaryMetadata proto. Contains plugin
-// data.
-func TensorSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, serialized_summary_metadata tf.Output) (summary tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorSummaryV2",
-		Input: []tf.Input{
-			tag, tensor, serialized_summary_metadata,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates a dataset that asynchronously prefetches elements from `input_dataset`.
-//
-// Arguments:
-//
-//	buffer_size: The maximum number of elements to buffer in an iterator over
-// this dataset.
-//
+// Returns the truth value of x OR y element-wise.
 //
-func PrefetchDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// *NOTE*: `LogicalOr` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func LogicalOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "PrefetchDataset",
+		Type: "LogicalOr",
 		Input: []tf.Input{
-			input_dataset, buffer_size,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TensorSummaryAttr is an optional argument to TensorSummary.
-type TensorSummaryAttr func(optionalAttr)
-
-// TensorSummaryDescription sets the optional description attribute to value.
-//
-// value: A json-encoded SummaryDescription proto.
-// If not specified, defaults to ""
-func TensorSummaryDescription(value string) TensorSummaryAttr {
-	return func(m optionalAttr) {
-		m["description"] = value
-	}
-}
+// MatMulAttr is an optional argument to MatMul.
+type MatMulAttr func(optionalAttr)
 
-// TensorSummaryLabels sets the optional labels attribute to value.
+// MatMulTransposeA sets the optional transpose_a attribute to value.
 //
-// value: An unused list of strings.
-// If not specified, defaults to <>
-func TensorSummaryLabels(value []string) TensorSummaryAttr {
+// value: If true, "a" is transposed before multiplication.
+// If not specified, defaults to false
+func MatMulTransposeA(value bool) MatMulAttr {
 	return func(m optionalAttr) {
-		m["labels"] = value
+		m["transpose_a"] = value
 	}
 }
 
-// TensorSummaryDisplayName sets the optional display_name attribute to value.
+// MatMulTransposeB sets the optional transpose_b attribute to value.
 //
-// value: An unused string.
-// If not specified, defaults to ""
-func TensorSummaryDisplayName(value string) TensorSummaryAttr {
+// value: If true, "b" is transposed before multiplication.
+// If not specified, defaults to false
+func MatMulTransposeB(value bool) MatMulAttr {
 	return func(m optionalAttr) {
-		m["display_name"] = value
+		m["transpose_b"] = value
 	}
 }
 
-// Outputs a `Summary` protocol buffer with a tensor.
+// Multiply the matrix "a" by the matrix "b".
 //
-// This op is being phased out in favor of TensorSummaryV2, which lets callers pass
-// a tag as well as a serialized SummaryMetadata proto string that contains
-// plugin-specific data. We will keep this op to maintain backwards compatibility.
+// The inputs must be two-dimensional matrices and the inner dimension of
+// "a" (after being transposed if transpose_a is true) must match the
+// outer dimension of "b" (after being transposed if transposed_b is
+// true).
 //
-// Arguments:
-//	tensor: A tensor to serialize.
-func TensorSummary(scope *Scope, tensor tf.Output, optional ...TensorSummaryAttr) (summary tf.Output) {
+// *Note*: The default kernel implementation for MatMul on GPUs uses
+// cublas.
+func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (product tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -26494,9 +31326,9 @@ func TensorSummary(scope *Scope, tensor tf.Output, optional ...TensorSummaryAttr
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorSummary",
+		Type: "MatMul",
 		Input: []tf.Input{
-			tensor,
+			a, b,
 		},
 		Attrs: attrs,
 	}
@@ -26504,240 +31336,259 @@ func TensorSummary(scope *Scope, tensor tf.Output, optional ...TensorSummaryAttr
 	return op.Output(0)
 }
 
-// Read an element from the TensorArray into output `value`.
-//
-// Arguments:
-//	handle: The handle to a TensorArray.
+// InitializeTableFromTextFileV2Attr is an optional argument to InitializeTableFromTextFileV2.
+type InitializeTableFromTextFileV2Attr func(optionalAttr)
+
+// InitializeTableFromTextFileV2VocabSize sets the optional vocab_size attribute to value.
 //
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	dtype: The type of the elem that is returned.
+// value: Number of elements of the file, use -1 if unknown.
+// If not specified, defaults to -1
 //
-// Returns The tensor that is read from the TensorArray.
-func TensorArrayReadV3(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayReadV3",
-		Input: []tf.Input{
-			handle, index, flow_in,
-		},
-		Attrs: attrs,
+// REQUIRES: value >= -1
+func InitializeTableFromTextFileV2VocabSize(value int64) InitializeTableFromTextFileV2Attr {
+	return func(m optionalAttr) {
+		m["vocab_size"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Reduces sparse updates into the variable referenced by `resource` using the `max` operation.
-//
-// This operation computes
-//
-//     # Scalar indices
-//     ref[indices, ...] = max(ref[indices, ...], updates[...])
-//
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] = max(ref[indices[i], ...], updates[i, ...])
-//
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] = max(ref[indices[i, ..., j], ...], updates[i, ..., j, ...])
+// InitializeTableFromTextFileV2Delimiter sets the optional delimiter attribute to value.
 //
-// Duplicate entries are handled correctly: if multiple `indices` reference
-// the same location, their contributions are combined.
+// value: Delimiter to separate fields in a line.
+// If not specified, defaults to "\t"
+func InitializeTableFromTextFileV2Delimiter(value string) InitializeTableFromTextFileV2Attr {
+	return func(m optionalAttr) {
+		m["delimiter"] = value
+	}
+}
+
+// Initializes a table from a text file.
 //
-// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+// It inserts one key-value pair into the table for each line of the file.
+// The key and value is extracted from the whole line content, elements from the
+// split line based on `delimiter` or the line number (starting from zero).
+// Where to extract the key and value from a line is specified by `key_index` and
+// `value_index`.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
-// </div>
+// - A value of -1 means use the line number(starting from zero), expects `int64`.
+// - A value of -2 means use the whole line content, expects `string`.
+// - A value >= 0 means use the index (starting at zero) of the split line based
+//   on `delimiter`.
 //
 // Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
+//	table_handle: Handle to a table which will be initialized.
+//	filename: Filename of a vocabulary text file.
+//	key_index: Column index in a line to get the table `key` values from.
+//	value_index: Column index that represents information of a line to get the table
+// `value` values from.
 //
 // Returns the created operation.
-func ResourceScatterMax(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+func InitializeTableFromTextFileV2(scope *Scope, table_handle tf.Output, filename tf.Output, key_index int64, value_index int64, optional ...InitializeTableFromTextFileV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"key_index": key_index, "value_index": value_index}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterMax",
+		Type: "InitializeTableFromTextFileV2",
 		Input: []tf.Input{
-			resource, indices, updates,
+			table_handle, filename,
 		},
+		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Computes the gradient for the tanh of `x` wrt its input.
+// MeanAttr is an optional argument to Mean.
+type MeanAttr func(optionalAttr)
+
+// MeanKeepDims sets the optional keep_dims attribute to value.
 //
-// Specifically, `grad = dy * (1 - y*y)`, where `y = tanh(x)`, and `dy`
-// is the corresponding input gradient.
-func TanhGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TanhGrad",
-		Input: []tf.Input{
-			y, dy,
-		},
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func MeanKeepDims(value bool) MeanAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Outputs a `Summary` protocol buffer with scalar values.
+// Computes the mean of elements across dimensions of a tensor.
 //
-// The input `tags` and `values` must have the same shape.  The generated summary
-// has a summary value for each tag-value pair in `tags` and `values`.
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	tags: Tags for the summary.
-//	values: Same shape as `tags.  Values for the summary.
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-// Returns Scalar.  Serialized `Summary` protocol buffer.
-func ScalarSummary(scope *Scope, tags tf.Output, values tf.Output) (summary tf.Output) {
+// Returns The reduced tensor.
+func Mean(scope *Scope, input tf.Output, axis tf.Output, optional ...MeanAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ScalarSummary",
+		Type: "Mean",
 		Input: []tf.Input{
-			tags, values,
+			input, axis,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Outputs a `Summary` protocol buffer with a histogram.
+// ProdAttr is an optional argument to Prod.
+type ProdAttr func(optionalAttr)
+
+// ProdKeepDims sets the optional keep_dims attribute to value.
 //
-// The generated
-// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
-// has one summary value containing a histogram for `values`.
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func ProdKeepDims(value bool) ProdAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the product of elements across dimensions of a tensor.
 //
-// This op reports an `InvalidArgument` error if any value is not finite.
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	tag: Scalar.  Tag to use for the `Summary.Value`.
-//	values: Any shape. Values to use to build the histogram.
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func HistogramSummary(scope *Scope, tag tf.Output, values tf.Output) (summary tf.Output) {
+// Returns The reduced tensor.
+func Prod(scope *Scope, input tf.Output, axis tf.Output, optional ...ProdAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "HistogramSummary",
+		Type: "Prod",
 		Input: []tf.Input{
-			tag, values,
+			input, axis,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the number of elements in the given queue.
+// ResizeBilinearAttr is an optional argument to ResizeBilinear.
+type ResizeBilinearAttr func(optionalAttr)
+
+// ResizeBilinearAlignCorners sets the optional align_corners attribute to value.
+//
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
+// If not specified, defaults to false
+func ResizeBilinearAlignCorners(value bool) ResizeBilinearAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
+	}
+}
+
+// Resize `images` to `size` using bilinear interpolation.
+//
+// Input images can be of different types but output images are always float.
 //
 // Arguments:
-//	handle: The handle to a queue.
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-// Returns The number of elements in the given queue.
-func QueueSizeV2(scope *Scope, handle tf.Output) (size tf.Output) {
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBilinearAttr) (resized_images tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "QueueSizeV2",
+		Type: "ResizeBilinear",
 		Input: []tf.Input{
-			handle,
+			images, size,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ImageSummaryAttr is an optional argument to ImageSummary.
-type ImageSummaryAttr func(optionalAttr)
-
-// ImageSummaryMaxImages sets the optional max_images attribute to value.
-//
-// value: Max number of batch elements to generate images for.
-// If not specified, defaults to 3
-//
-// REQUIRES: value >= 1
-func ImageSummaryMaxImages(value int64) ImageSummaryAttr {
-	return func(m optionalAttr) {
-		m["max_images"] = value
-	}
-}
+// MaxAttr is an optional argument to Max.
+type MaxAttr func(optionalAttr)
 
-// ImageSummaryBadColor sets the optional bad_color attribute to value.
+// MaxKeepDims sets the optional keep_dims attribute to value.
 //
-// value: Color to use for pixels with non-finite values.
-// If not specified, defaults to <dtype:DT_UINT8 tensor_shape:<dim:<size:4 > > int_val:255 int_val:0 int_val:0 int_val:255 >
-func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func MaxKeepDims(value bool) MaxAttr {
 	return func(m optionalAttr) {
-		m["bad_color"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Outputs a `Summary` protocol buffer with images.
-//
-// The summary has up to `max_images` summary values containing images. The
-// images are built from `tensor` which must be 4-D with shape `[batch_size,
-// height, width, channels]` and where `channels` can be:
-//
-// *  1: `tensor` is interpreted as Grayscale.
-// *  3: `tensor` is interpreted as RGB.
-// *  4: `tensor` is interpreted as RGBA.
-//
-// The images have the same number of channels as the input tensor. For float
-// input, the values are normalized one image at a time to fit in the range
-// `[0, 255]`.  `uint8` values are unchanged.  The op uses two different
-// normalization algorithms:
-//
-// *  If the input values are all positive, they are rescaled so the largest one
-//    is 255.
-//
-// *  If any input value is negative, the values are shifted so input value 0.0
-//    is at 127.  They are then rescaled so that either the smallest value is 0,
-//    or the largest one is 255.
-//
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
-//
-// *  If `max_images` is 1, the summary value tag is '*tag*/image'.
-// *  If `max_images` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
+// Computes the maximum of elements across dimensions of a tensor.
 //
-// The `bad_color` argument is the color to use in the generated images for
-// non-finite input values.  It is a `uint8` 1-D tensor of length `channels`.
-// Each element must be in the range `[0, 255]` (It represents the value of a
-// pixel in the output image).  Non-finite values in the input tensor are
-// replaced by this tensor in the output image.  The default value is the color
-// red.
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 4-D of shape `[batch_size, height, width, channels]` where
-// `channels` is 1, 3, or 4.
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...ImageSummaryAttr) (summary tf.Output) {
+// Returns The reduced tensor.
+func Max(scope *Scope, input tf.Output, axis tf.Output, optional ...MaxAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Max",
+		Input: []tf.Input{
+			input, axis,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a dataset that contains the unique elements of `input_dataset`.
+func ExperimentalUniqueDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ImageSummary",
+		Type: "ExperimentalUniqueDataset",
 		Input: []tf.Input{
-			tag, tensor,
+			input_dataset,
 		},
 		Attrs: attrs,
 	}
@@ -26745,42 +31596,27 @@ func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...Ima
 	return op.Output(0)
 }
 
-// AudioSummaryV2Attr is an optional argument to AudioSummaryV2.
-type AudioSummaryV2Attr func(optionalAttr)
+// ArgMinAttr is an optional argument to ArgMin.
+type ArgMinAttr func(optionalAttr)
 
-// AudioSummaryV2MaxOutputs sets the optional max_outputs attribute to value.
-//
-// value: Max number of batch elements to generate audio for.
-// If not specified, defaults to 3
-//
-// REQUIRES: value >= 1
-func AudioSummaryV2MaxOutputs(value int64) AudioSummaryV2Attr {
+// ArgMinOutputType sets the optional output_type attribute to value.
+// If not specified, defaults to DT_INT64
+func ArgMinOutputType(value tf.DataType) ArgMinAttr {
 	return func(m optionalAttr) {
-		m["max_outputs"] = value
+		m["output_type"] = value
 	}
 }
 
-// Outputs a `Summary` protocol buffer with audio.
-//
-// The summary has up to `max_outputs` summary values containing audio. The
-// audio is built from `tensor` which must be 3-D with shape `[batch_size,
-// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
-// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
-//
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
+// Returns the index with the smallest value across dimensions of a tensor.
 //
-// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
-// *  If `max_outputs` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
+// Note that in case of ties the identity of the return value is not guaranteed.
 //
 // Arguments:
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 2-D of shape `[batch_size, frames]`.
-//	sample_rate: The sample rate of the signal in hertz.
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func AudioSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate tf.Output, optional ...AudioSummaryV2Attr) (summary tf.Output) {
+//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
+// Describes which dimension of the input Tensor to reduce across. For vectors,
+// use dimension = 0.
+func ArgMin(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -26789,9 +31625,9 @@ func AudioSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AudioSummaryV2",
+		Type: "ArgMin",
 		Input: []tf.Input{
-			tag, tensor, sample_rate,
+			input, dimension,
 		},
 		Attrs: attrs,
 	}
@@ -26799,259 +31635,438 @@ func AudioSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate t
 	return op.Output(0)
 }
 
-// AvgPoolAttr is an optional argument to AvgPool.
-type AvgPoolAttr func(optionalAttr)
-
-// AvgPoolDataFormat sets the optional data_format attribute to value.
+// Converts the quantized `input` tensor into a lower-precision `output`.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func AvgPoolDataFormat(value string) AvgPoolAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Performs average pooling on the input.
+// Converts the quantized `input` tensor into a lower-precision `output`, using the
+// output range specified with `requested_output_min` and `requested_output_max`.
 //
-// Each entry in `output` is the mean of the corresponding size `ksize`
-// window in `value`.
+// `[input_min, input_max]` are scalar floats that specify the range for the float
+// interpretation of the `input` data. For example, if `input_min` is -1.0f and
+// `input_max` is 1.0f, and we are dealing with `quint16` quantized data, then a 0
+// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
 //
 // Arguments:
-//	value: 4-D with shape `[batch, height, width, channels]`.
-//	ksize: The size of the sliding window for each dimension of `value`.
-//	strides: The stride of the sliding window for each dimension of `value`.
-//	padding: The type of padding algorithm to use.
 //
-// Returns The average pooled output tensor.
-func AvgPool(scope *Scope, value tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolAttr) (output tf.Output) {
+//	input_min: The float value that the minimum quantized input value represents.
+//	input_max: The float value that the maximum quantized input value represents.
+//	requested_output_min: The float value that the minimum quantized output value represents.
+//	requested_output_max: The float value that the maximum quantized output value represents.
+//	out_type: The type of the output. Should be a lower bit depth than Tinput.
+//
+// Returns The requested_output_min value is copied into this output.The requested_output_max value is copied into this output.
+func Requantize(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, requested_output_min tf.Output, requested_output_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "AvgPool",
+		Type: "Requantize",
 		Input: []tf.Input{
-			value,
+			input, input_min, input_max, requested_output_min, requested_output_max,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Merges summaries.
-//
-// This op creates a
-// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
-// protocol buffer that contains the union of all the values in the input
-// summaries.
-//
-// When the Op is run, it reports an `InvalidArgument` error if multiple values
-// in the summaries to merge use the same tag.
+// Creates a dataset that emits the lines of one or more text files.
 //
 // Arguments:
-//	inputs: Can be of any shape.  Each must contain serialized `Summary` protocol
-// buffers.
-//
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func MergeSummary(scope *Scope, inputs []tf.Output) (summary tf.Output) {
+//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
+// read.
+//	compression_type: A scalar containing either (i) the empty string (no
+// compression), (ii) "ZLIB", or (iii) "GZIP".
+//	buffer_size: A scalar containing the number of bytes to buffer.
+func TextLineDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MergeSummary",
+		Type: "TextLineDataset",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			filenames, compression_type, buffer_size,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// The shape of the elements of the given list, as a tensor.
+// Computes the sum along segments of a tensor.
 //
-//   input_handle: the list
-//   element_shape: the shape of elements of the list
-func TensorListElementShape(scope *Scope, input_handle tf.Output, shape_type tf.DataType) (element_shape tf.Output) {
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
+// for an explanation of segments.
+//
+// Computes a tensor such that
+// \\(output_i = \sum_j data_j\\) where sum is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentSum.png" alt>
+// </div>
+//
+// For example:
+//
+// ```
+// c = tf.constant([[1,2,3,4], [4, 3, 2, 1], [5,6,7,8]])
+// tf.segment_sum(c, tf.constant([0, 0, 1]))
+// # ==> [[5, 5, 5, 5],
+// #      [5, 6, 7, 8]]
+// ```
+//
+//
+// Arguments:
+//
+//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"shape_type": shape_type}
 	opspec := tf.OpSpec{
-		Type: "TensorListElementShape",
+		Type: "SegmentSum",
 		Input: []tf.Input{
-			input_handle,
+			data, segment_ids,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the item in the list with the given index.
+// Computes the mean along segments of a tensor.
 //
-// input_handle: the list
-// index: the position in the list from which an element will be retrieved
-// item: the element at that position
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
+// for an explanation of segments.
+//
+// Computes a tensor such that
+// \\(output_i = \frac{\sum_j data_j}{N}\\) where `mean` is
+// over `j` such that `segment_ids[j] == i` and `N` is the total number of
+// values summed.
+//
+// If the mean is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMean.png" alt>
+// </div>
+//
+// For example:
+//
+// ```
+// c = tf.constant([[1.0,2,3,4], [4, 3, 2, 1], [5,6,7,8]])
+// tf.segment_mean(c, tf.constant([0, 0, 1]))
+// # ==> [[2.5, 2.5, 2.5, 2.5],
+// #      [5, 6, 7, 8]]
+// ```
 //
 //
-func TensorListGetItem(scope *Scope, input_handle tf.Output, index tf.Output, element_dtype tf.DataType) (item tf.Output) {
+// Arguments:
+//
+//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMean(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"element_dtype": element_dtype}
 	opspec := tf.OpSpec{
-		Type: "TensorListGetItem",
+		Type: "SegmentMean",
 		Input: []tf.Input{
-			input_handle, index,
+			data, segment_ids,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns a diagonal tensor with a given diagonal values.
+// Computes the minimum along segments of a tensor.
 //
-// Given a `diagonal`, this operation returns a tensor with the `diagonal` and
-// everything else padded with zeros. The diagonal is computed as follows:
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
+// for an explanation of segments.
 //
-// Assume `diagonal` has dimensions [D1,..., Dk], then the output is a tensor of
-// rank 2k with dimensions [D1,..., Dk, D1,..., Dk] where:
+// Computes a tensor such that
+// \\(output_i = \min_j(data_j)\\) where `min` is over `j` such
+// that `segment_ids[j] == i`.
 //
-// `output[i1,..., ik, i1,..., ik] = diagonal[i1, ..., ik]` and 0 everywhere else.
+// If the min is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMin.png" alt>
+// </div>
 //
 // For example:
 //
 // ```
-// # 'diagonal' is [1, 2, 3, 4]
-// tf.diag(diagonal) ==> [[1, 0, 0, 0]
-//                        [0, 2, 0, 0]
-//                        [0, 0, 3, 0]
-//                        [0, 0, 0, 4]]
+// c = tf.constant([[1,2,3,4], [4, 3, 2, 1], [5,6,7,8]])
+// tf.segment_min(c, tf.constant([0, 0, 1]))
+// # ==> [[1, 2, 2, 1],
+// #      [5, 6, 7, 8]]
 // ```
 //
 // Arguments:
-//	diagonal: Rank k tensor where k is at most 1.
-func Diag(scope *Scope, diagonal tf.Output) (output tf.Output) {
+//
+//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMin(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Diag",
+		Type: "SegmentMin",
 		Input: []tf.Input{
-			diagonal,
+			data, segment_ids,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ParameterizedTruncatedNormalAttr is an optional argument to ParameterizedTruncatedNormal.
-type ParameterizedTruncatedNormalAttr func(optionalAttr)
-
-// ParameterizedTruncatedNormalSeed sets the optional seed attribute to value.
+// Computes the sum along segments of a tensor.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func ParameterizedTruncatedNormalSeed(value int64) ParameterizedTruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// ParameterizedTruncatedNormalSeed2 sets the optional seed2 attribute to value.
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
+// for an explanation of segments.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func ParameterizedTruncatedNormalSeed2(value int64) ParameterizedTruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
+// Computes a tensor such that
+// \\(output[i] = \sum_{j...} data[j...]\\) where the sum is over tuples `j...` such
+// that `segment_ids[j...] == i`.  Unlike `SegmentSum`, `segment_ids`
+// need not be sorted and need not cover all values in the full
+// range of valid values.
+//
+// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
+// If the given segment ID `i` is negative, the value is dropped and will not be
+// added to the sum of the segment.
+//
+// `num_segments` should equal the number of distinct segment IDs.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
+// </div>
+//
+// ``` python
+// c = tf.constant([[1,2,3,4], [5,6,7,8], [4,3,2,1]])
+// tf.unsorted_segment_sum(c, tf.constant([0, 1, 0]), num_segments=2)
+// # ==> [[ 5,  5, 5, 5],
+// #       [5,  6, 7, 8]]
+// ```
+//
+//
+// Arguments:
+//
+//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
+//
+//
+// Returns Has same shape as data, except for the first `segment_ids.rank`
+// dimensions, which are replaced with a single dimension which has size
+// `num_segments`.
+func UnsortedSegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "UnsortedSegmentSum",
+		Input: []tf.Input{
+			data, segment_ids, num_segments,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Outputs random values from a normal distribution. The parameters may each be a
+// Computes the product along segments of a tensor.
 //
-// scalar which applies to the entire output, or a vector of length shape[0] which
-// stores the parameters for each batch.
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
+// for an explanation of segments.
+//
+// This operator is similar to the unsorted segment sum operator found
+// [(here)](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+// Instead of computing the sum over segments, it computes the product of all
+// entries belonging to a segment such that:
+//
+// \\(output_i = \prod_{j...} data[j...]\\) where the product is over tuples
+// `j...` such that `segment_ids[j...] == i`.
+//
+// For example:
+//
+// ``` python
+// c = tf.constant([[1,2,3,4], [5,6,7,8], [4,3,2,1]])
+// tf.unsorted_segment_prod(c, tf.constant([0, 1, 0]), num_segments=2)
+// # ==> [[ 4,  6, 6, 4],
+// #       [5,  6, 7, 8]]
+// ```
+//
+// If there is no entry for a given segment ID `i`, it outputs 1.
+//
+// If the given segment ID `i` is negative, then the corresponding value is
+// dropped, and will not be included in the result.
 //
 // Arguments:
-//	shape: The shape of the output tensor. Batches are indexed by the 0th dimension.
-//	means: The mean parameter of each batch.
-//	stdevs: The standard deviation parameter of each batch. Must be greater than 0.
-//	minvals: The minimum cutoff. May be -infinity.
-//	maxvals: The maximum cutoff. May be +infinity, and must be more than the minval
-// for each batch.
 //
-// Returns A matrix of shape num_batches x samples_per_batch, filled with random
-// truncated normal values using the parameters for each row.
-func ParameterizedTruncatedNormal(scope *Scope, shape tf.Output, means tf.Output, stdevs tf.Output, minvals tf.Output, maxvals tf.Output, optional ...ParameterizedTruncatedNormalAttr) (output tf.Output) {
+//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
+//
+//
+// Returns Has same shape as data, except for the first `segment_ids.rank`
+// dimensions, which are replaced with a single dimension which has size
+// `num_segments`.
+func UnsortedSegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
+	opspec := tf.OpSpec{
+		Type: "UnsortedSegmentProd",
+		Input: []tf.Input{
+			data, segment_ids, num_segments,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes hyperbolic cosine of x element-wise.
+func Cosh(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ParameterizedTruncatedNormal",
+		Type: "Cosh",
 		Input: []tf.Input{
-			shape, means, stdevs, minvals, maxvals,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Sets the index-th position of the list to contain the given tensor.
+// Computes the mean along sparse segments of a tensor.
 //
-// input_handle: the list
-// index: the position in the list to which the tensor will be assigned
-// item: the element to be assigned to that position
-// output_handle: the new list, with the element in the proper position
+// Like `SparseSegmentMean`, but allows missing ids in `segment_ids`. If an id is
+// misisng, the `output` tensor at that position will be zeroed.
 //
-func TensorListSetItem(scope *Scope, input_handle tf.Output, index tf.Output, item tf.Output) (output_handle tf.Output) {
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
+// for an explanation of segments.
+//
+// Arguments:
+//
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//	num_segments: Should equal the number of distinct segment IDs.
+//
+// Returns Has same shape as data, except for dimension 0 which has size
+// `num_segments`.
+func SparseSegmentMeanWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorListSetItem",
+		Type: "SparseSegmentMeanWithNumSegments",
 		Input: []tf.Input{
-			input_handle, index, item,
+			data, indices, segment_ids, num_segments,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a Tensor by indexing into the TensorList.
+// CudnnRNNParamsSizeAttr is an optional argument to CudnnRNNParamsSize.
+type CudnnRNNParamsSizeAttr func(optionalAttr)
+
+// CudnnRNNParamsSizeRnnMode sets the optional rnn_mode attribute to value.
+// If not specified, defaults to "lstm"
+func CudnnRNNParamsSizeRnnMode(value string) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["rnn_mode"] = value
+	}
+}
+
+// CudnnRNNParamsSizeInputMode sets the optional input_mode attribute to value.
+// If not specified, defaults to "linear_input"
+func CudnnRNNParamsSizeInputMode(value string) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["input_mode"] = value
+	}
+}
+
+// CudnnRNNParamsSizeDirection sets the optional direction attribute to value.
+// If not specified, defaults to "unidirectional"
+func CudnnRNNParamsSizeDirection(value string) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["direction"] = value
+	}
+}
+
+// CudnnRNNParamsSizeDropout sets the optional dropout attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsSizeDropout(value float32) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["dropout"] = value
+	}
+}
+
+// CudnnRNNParamsSizeSeed sets the optional seed attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsSizeSeed(value int64) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// CudnnRNNParamsSizeSeed2 sets the optional seed2 attribute to value.
+// If not specified, defaults to 0
+func CudnnRNNParamsSizeSeed2(value int64) CudnnRNNParamsSizeAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Computes size of weights that can be used by a Cudnn RNN model.
 //
-// Each row in the produced Tensor corresponds to the element in the TensorList
-// specified by the given index (see `tf.gather`).
+// Return the params size that can be used by the Cudnn RNN model. Subsequent
+// weight allocation and initialization should use this size.
 //
-// input_handle: The input tensor list.
-// indices: The indices used to index into the list.
-// values: The tensor.
-func TensorListGather(scope *Scope, input_handle tf.Output, indices tf.Output, element_dtype tf.DataType) (values tf.Output) {
+// num_layers: Specifies the number of layers in the RNN model.
+// num_units: Specifies the size of the hidden state.
+// input_size: Specifies the size of the input state.
+// rnn_mode: Indicates the type of the RNN model.
+// input_mode: Indicate whether there is a linear projection between the input and
+//   The actual computation before the first layer. 'skip_input' is only allowed
+//   when input_size == num_units; 'auto_select' implies 'skip_input' when
+//   input_size == num_units; otherwise, it implies 'linear_input'.
+// direction: Indicates whether a bidirectional model will be used.
+//   dir = (direction == bidirectional) ? 2 : 1
+// dropout: dropout probability. When set to 0., dropout is disabled.
+// seed: the 1st part of a seed to initialize dropout.
+// seed2: the 2nd part of a seed to initialize dropout.
+// params_size: The size of the params buffer that should be allocated and
+//   initialized for this RNN model. Note that this params buffer may not be
+//   compatible across GPUs. Please use CudnnRNNParamsWeights and
+//   CudnnRNNParamsBiases to save and restore them in a way that is compatible
+//   across different runs.
+func CudnnRNNParamsSize(scope *Scope, num_layers tf.Output, num_units tf.Output, input_size tf.Output, T tf.DataType, S tf.DataType, optional ...CudnnRNNParamsSizeAttr) (params_size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	attrs := map[string]interface{}{"T": T, "S": S}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TensorListGather",
+		Type: "CudnnRNNParamsSize",
 		Input: []tf.Input{
-			input_handle, indices,
+			num_layers, num_units, input_size,
 		},
 		Attrs: attrs,
 	}
@@ -27059,227 +32074,227 @@ func TensorListGather(scope *Scope, input_handle tf.Output, indices tf.Output, e
 	return op.Output(0)
 }
 
-// Creates a TensorList by indexing into a Tensor.
+// Computes gradients for SparseSegmentMean.
 //
-// Each member of the TensorList corresponds to one row of the input tensor,
-// specified by the given index (see `tf.gather`).
+// Returns tensor "output" with same shape as grad, except for dimension 0 whose
+// value is output_dim0.
 //
-// tensor: The input tensor.
-// indices: The indices used to index into the list.
-// element_shape: The shape of the elements in the list (can be less specified than
-//   the shape of the tensor).
-// output_handle: The TensorList.
-func TensorListScatter(scope *Scope, tensor tf.Output, indices tf.Output, element_shape tf.Output) (output_handle tf.Output) {
+// Arguments:
+//	grad: gradient propagated to the SparseSegmentMean op.
+//	indices: indices passed to the corresponding SparseSegmentMean op.
+//	segment_ids: segment_ids passed to the corresponding SparseSegmentMean op.
+//	output_dim0: dimension 0 of "data" passed to SparseSegmentMean op.
+func SparseSegmentMeanGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorListScatter",
+		Type: "SparseSegmentMeanGrad",
 		Input: []tf.Input{
-			tensor, indices, element_shape,
+			grad, indices, segment_ids, output_dim0,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns a `RaggedTensor` containing the specified sequences of numbers.
-//
-//
-// Returns a `RaggedTensor` `result` composed from `rt_dense_values` and
-// `rt_nested_splits`, such that
-// `result[i] = range(starts[i], limits[i], deltas[i])`.
+// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
 //
-// ```python
-// >>> (rt_nested_splits, rt_dense_values) = gen_ragged_ops.ragged_range(
-// ...     starts=[2, 5, 8], limits=[3, 5, 12], deltas=1)
-// >>> result = ragged.from_nested_row_splits(rt_dense_values, rt_nested_splits)
-// >>> print result.eval().tolist()
-// [[2],               # result[0] = range(2, 3)
-//  [],                # result[1] = range(5, 5)
-//  [8, 9, 10, 11]]    # result[2] = range(8, 12)
-// ```
+// N is the size of the segment being reduced.
+//
+// See `tf.sparse.segment_sum` for usage examples.
 //
-// The input tensors `starts`, `limits`, and `deltas` may be scalars or vectors.
-// The vector inputs must all have the same size.  Scalar inputs are broadcast
-// to match the size of the vector inputs.
 //
 // Arguments:
-//	starts: The starts of each range.
-//	limits: The limits of each range.
-//	deltas: The deltas of each range.
 //
-// Returns The `row_splits` for the returned `RaggedTensor`.The `inner_values` for the returned `RaggedTensor`.
-func RaggedRange(scope *Scope, starts tf.Output, limits tf.Output, deltas tf.Output) (rt_nested_splits tf.Output, rt_dense_values tf.Output) {
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentSqrtN(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RaggedRange",
+		Type: "SparseSegmentSqrtN",
 		Input: []tf.Input{
-			starts, limits, deltas,
+			data, indices, segment_ids,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Deprecated, use python implementation tf.linalg.matrix_exponential.
+// Compute the upper regularized incomplete Gamma function `Q(a, x)`.
 //
-// DEPRECATED at GraphDef version 27: Use Python implementation tf.linalg.matrix_exponential instead.
-func MatrixExponential(scope *Scope, input tf.Output) (output tf.Output) {
+// The upper regularized incomplete Gamma function is defined as:
+//
+// \\(Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)\\)
+//
+// where
+//
+// \\(Gamma(a, x) = int_{x}^{\infty} t^{a-1} exp(-t) dt\\)
+//
+// is the upper incomplete Gama function.
+//
+// Note, above `P(a, x)` (`Igamma`) is the lower regularized complete
+// Gamma function.
+func Igammac(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixExponential",
+		Type: "Igammac",
 		Input: []tf.Input{
-			input,
+			a, x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QueueDequeueUpToV2Attr is an optional argument to QueueDequeueUpToV2.
-type QueueDequeueUpToV2Attr func(optionalAttr)
-
-// QueueDequeueUpToV2TimeoutMs sets the optional timeout_ms attribute to value.
-//
-// value: If the queue has fewer than n elements, this operation
-// will block for up to timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueDequeueUpToV2TimeoutMs(value int64) QueueDequeueUpToV2Attr {
-	return func(m optionalAttr) {
-		m["timeout_ms"] = value
-	}
-}
-
-// Dequeues `n` tuples of one or more tensors from the given queue.
-//
-// This operation is not supported by all queues.  If a queue does not support
-// DequeueUpTo, then an Unimplemented error is returned.
+// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
 //
-// If the queue is closed and there are more than 0 but less than `n`
-// elements remaining, then instead of returning an OutOfRange error like
-// QueueDequeueMany, less than `n` elements are returned immediately.  If
-// the queue is closed and there are 0 elements left in the queue, then
-// an OutOfRange error is returned just like in QueueDequeueMany.
-// Otherwise the behavior is identical to QueueDequeueMany:
+// N is the size of the segment being reduced.
 //
-// This operation concatenates queue-element component tensors along the
-// 0th dimension to make a single component tensor.  All of the components
-// in the dequeued tuple will have size n in the 0th dimension.
+// Like `SparseSegmentSqrtN`, but allows missing ids in `segment_ids`. If an id is
+// misisng, the `output` tensor at that position will be zeroed.
 //
-// This operation has `k` outputs, where `k` is the number of components in
-// the tuples stored in the given queue, and output `i` is the ith
-// component of the dequeued tuple.
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
+// for an explanation of segments.
 //
 // Arguments:
-//	handle: The handle to a queue.
-//	n: The number of tuples to dequeue.
-//	component_types: The type of each component in a tuple.
 //
-// Returns One or more tensors that were dequeued as a tuple.
-func QueueDequeueUpToV2(scope *Scope, handle tf.Output, n tf.Output, component_types []tf.DataType, optional ...QueueDequeueUpToV2Attr) (components []tf.Output) {
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//	num_segments: Should equal the number of distinct segment IDs.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentSqrtNWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QueueDequeueUpToV2",
+		Type: "SparseSegmentSqrtNWithNumSegments",
 		Input: []tf.Input{
-			handle, n,
+			data, indices, segment_ids, num_segments,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("QueueDequeueUpToV2", err)
-		return
-	}
-	return components
+	return op.Output(0)
 }
 
-// Computes the Cholesky decomposition of one or more square matrices.
-//
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices.
-//
-// The input has to be symmetric and positive definite. Only the lower-triangular
-// part of the input will be used for this operation. The upper-triangular part
-// will not be read.
-//
-// The output is a tensor of the same shape as the input
-// containing the Cholesky decompositions for all input submatrices `[..., :, :]`.
+// Computes gradients for SparseSegmentSqrtN.
 //
-// **Note**: The gradient computation on GPU is faster for large matrices but
-// not for large batch dimensions when the submatrices are small. In this
-// case it might be faster to use the CPU.
+// Returns tensor "output" with same shape as grad, except for dimension 0 whose
+// value is output_dim0.
 //
 // Arguments:
-//	input: Shape is `[..., M, M]`.
-//
-// Returns Shape is `[..., M, M]`.
-func Cholesky(scope *Scope, input tf.Output) (output tf.Output) {
+//	grad: gradient propagated to the SparseSegmentSqrtN op.
+//	indices: indices passed to the corresponding SparseSegmentSqrtN op.
+//	segment_ids: segment_ids passed to the corresponding SparseSegmentSqrtN op.
+//	output_dim0: dimension 0 of "data" passed to SparseSegmentSqrtN op.
+func SparseSegmentSqrtNGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Cholesky",
+		Type: "SparseSegmentSqrtNGrad",
 		Input: []tf.Input{
-			input,
+			grad, indices, segment_ids, output_dim0,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Writes contents to the file at input filename. Creates file and recursively
+// LRNGradAttr is an optional argument to LRNGrad.
+type LRNGradAttr func(optionalAttr)
+
+// LRNGradDepthRadius sets the optional depth_radius attribute to value.
 //
-// creates directory if not existing.
+// value: A depth radius.
+// If not specified, defaults to 5
+func LRNGradDepthRadius(value int64) LRNGradAttr {
+	return func(m optionalAttr) {
+		m["depth_radius"] = value
+	}
+}
+
+// LRNGradBias sets the optional bias attribute to value.
+//
+// value: An offset (usually > 0 to avoid dividing by 0).
+// If not specified, defaults to 1
+func LRNGradBias(value float32) LRNGradAttr {
+	return func(m optionalAttr) {
+		m["bias"] = value
+	}
+}
+
+// LRNGradAlpha sets the optional alpha attribute to value.
+//
+// value: A scale factor, usually positive.
+// If not specified, defaults to 1
+func LRNGradAlpha(value float32) LRNGradAttr {
+	return func(m optionalAttr) {
+		m["alpha"] = value
+	}
+}
+
+// LRNGradBeta sets the optional beta attribute to value.
+//
+// value: An exponent.
+// If not specified, defaults to 0.5
+func LRNGradBeta(value float32) LRNGradAttr {
+	return func(m optionalAttr) {
+		m["beta"] = value
+	}
+}
+
+// Gradients for Local Response Normalization.
 //
 // Arguments:
-//	filename: scalar. The name of the file to which we write the contents.
-//	contents: scalar. The content to be written to the output file.
+//	input_grads: 4-D with shape `[batch, height, width, channels]`.
+//	input_image: 4-D with shape `[batch, height, width, channels]`.
+//	output_image: 4-D with shape `[batch, height, width, channels]`.
 //
-// Returns the created operation.
-func WriteFile(scope *Scope, filename tf.Output, contents tf.Output) (o *tf.Operation) {
+// Returns The gradients for LRN.
+func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_image tf.Output, optional ...LRNGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "WriteFile",
+		Type: "LRNGrad",
 		Input: []tf.Input{
-			filename, contents,
+			input_grads, input_image, output_image,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// AllAttr is an optional argument to All.
-type AllAttr func(optionalAttr)
+// AnyAttr is an optional argument to Any.
+type AnyAttr func(optionalAttr)
 
-// AllKeepDims sets the optional keep_dims attribute to value.
+// AnyKeepDims sets the optional keep_dims attribute to value.
 //
 // value: If true, retain reduced dimensions with length 1.
 // If not specified, defaults to false
-func AllKeepDims(value bool) AllAttr {
+func AnyKeepDims(value bool) AnyAttr {
 	return func(m optionalAttr) {
 		m["keep_dims"] = value
 	}
 }
 
-// Computes the "logical and" of elements across dimensions of a tensor.
+// Computes the "logical or" of elements across dimensions of a tensor.
 //
 // Reduces `input` along the dimensions given in `axis`. Unless
 // `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
@@ -27292,7 +32307,7 @@ func AllKeepDims(value bool) AllAttr {
 // `[-rank(input), rank(input))`.
 //
 // Returns The reduced tensor.
-func All(scope *Scope, input tf.Output, axis tf.Output, optional ...AllAttr) (output tf.Output) {
+func Any(scope *Scope, input tf.Output, axis tf.Output, optional ...AnyAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -27301,7 +32316,7 @@ func All(scope *Scope, input tf.Output, axis tf.Output, optional ...AllAttr) (ou
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "All",
+		Type: "Any",
 		Input: []tf.Input{
 			input, axis,
 		},
@@ -27311,90 +32326,107 @@ func All(scope *Scope, input tf.Output, axis tf.Output, optional ...AllAttr) (ou
 	return op.Output(0)
 }
 
-// Computes the Eigen Decomposition of a batch of square self-adjoint matrices.
-//
-// DEPRECATED at GraphDef version 11: Use SelfAdjointEigV2 instead.
+// DestroyResourceOpAttr is an optional argument to DestroyResourceOp.
+type DestroyResourceOpAttr func(optionalAttr)
+
+// DestroyResourceOpIgnoreLookupError sets the optional ignore_lookup_error attribute to value.
 //
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices, with the same constraints as the single matrix
-// SelfAdjointEig.
+// value: whether to ignore the error when the resource
+// doesn't exist.
+// If not specified, defaults to true
+func DestroyResourceOpIgnoreLookupError(value bool) DestroyResourceOpAttr {
+	return func(m optionalAttr) {
+		m["ignore_lookup_error"] = value
+	}
+}
+
+// Deletes the resource specified by the handle.
 //
-// The result is a [..., M+1, M] matrix with [..., 0,:] containing the
-// eigenvalues, and subsequent [...,1:, :] containing the eigenvectors. The eigenvalues
-// are sorted in non-decreasing order.
+// All subsequent operations using the resource will result in a NotFound
+// error status.
 //
 // Arguments:
-//	input: Shape is `[..., M, M]`.
+//	resource: handle to the resource to delete.
 //
-// Returns Shape is `[..., M+1, M]`.
-func SelfAdjointEig(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns the created operation.
+func DestroyResourceOp(scope *Scope, resource tf.Output, optional ...DestroyResourceOpAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SelfAdjointEig",
+		Type: "DestroyResourceOp",
 		Input: []tf.Input{
-			input,
+			resource,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes softplus gradients for a softplus operation.
+// Generates values in an interval.
+//
+// A sequence of `num` evenly-spaced values are generated beginning at `start`.
+// If `num > 1`, the values in the sequence increase by `stop - start / num - 1`,
+// so that the last one is exactly `stop`.
+//
+// For example:
+//
+// ```
+// tf.linspace(10.0, 12.0, 3, name="linspace") => [ 10.0  11.0  12.0]
+// ```
 //
 // Arguments:
-//	gradients: The backpropagated gradients to the corresponding softplus operation.
-//	features: The features passed as input to the corresponding softplus operation.
+//	start: 0-D tensor. First entry in the range.
+//	stop: 0-D tensor. Last entry in the range.
+//	num: 0-D tensor. Number of values to generate.
 //
-// Returns The gradients: `gradients / (1 + exp(-features))`.
-func SoftplusGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+// Returns 1-D. The generated values.
+func LinSpace(scope *Scope, start tf.Output, stop tf.Output, num tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SoftplusGrad",
+		Type: "LinSpace",
 		Input: []tf.Input{
-			gradients, features,
+			start, stop, num,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SelfAdjointEigV2Attr is an optional argument to SelfAdjointEigV2.
-type SelfAdjointEigV2Attr func(optionalAttr)
+// ComplexAttr is an optional argument to Complex.
+type ComplexAttr func(optionalAttr)
 
-// SelfAdjointEigV2ComputeV sets the optional compute_v attribute to value.
-//
-// value: If `True` then eigenvectors will be computed and returned in `v`.
-// Otherwise, only the eigenvalues will be computed.
-// If not specified, defaults to true
-func SelfAdjointEigV2ComputeV(value bool) SelfAdjointEigV2Attr {
+// ComplexTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_COMPLEX64
+func ComplexTout(value tf.DataType) ComplexAttr {
 	return func(m optionalAttr) {
-		m["compute_v"] = value
+		m["Tout"] = value
 	}
 }
 
-// Computes the eigen decomposition of one or more square self-adjoint matrices.
+// Converts two real numbers to a complex number.
 //
-// Computes the eigenvalues and (optionally) eigenvectors of each inner matrix in
-// `input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`. The eigenvalues
-// are sorted in non-decreasing order.
+// Given a tensor `real` representing the real part of a complex number, and a
+// tensor `imag` representing the imaginary part of a complex number, this
+// operation returns complex numbers elementwise of the form \\(a + bj\\), where
+// *a* represents the `real` part and *b* represents the `imag` part.
 //
-// ```python
-// # a is a tensor.
-// # e is a tensor of eigenvalues.
-// # v is a tensor of eigenvectors.
-// e, v = self_adjoint_eig(a)
-// e = self_adjoint_eig(a, compute_v=False)
-// ```
+// The input tensors `real` and `imag` must have the same shape.
 //
-// Arguments:
-//	input: `Tensor` input of shape `[N, N]`.
+// For example:
 //
-// Returns Eigenvalues. Shape is `[N]`.Eigenvectors. Shape is `[N, N]`.
-func SelfAdjointEigV2(scope *Scope, input tf.Output, optional ...SelfAdjointEigV2Attr) (e tf.Output, v tf.Output) {
+// ```
+// # tensor 'real' is [2.25, 3.25]
+// # tensor `imag` is [4.75, 5.75]
+// tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
+// ```
+func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAttr) (out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -27403,73 +32435,41 @@ func SelfAdjointEigV2(scope *Scope, input tf.Output, optional ...SelfAdjointEigV
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SelfAdjointEigV2",
+		Type: "Complex",
 		Input: []tf.Input{
-			input,
+			real, imag,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Adjust the saturation of one or more images.
-//
-// `images` is a tensor of at least 3 dimensions.  The last dimension is
-// interpretted as channels, and must be three.
-//
-// The input image is considered in the RGB colorspace. Conceptually, the RGB
-// colors are first mapped into HSV. A scale is then applied all the saturation
-// values, and then remapped back to RGB colorspace.
-//
-// Arguments:
-//	images: Images to adjust.  At least 3-D.
-//	scale: A float scale to add to the saturation.
-//
-// Returns The hue-adjusted image or images.
-func AdjustSaturation(scope *Scope, images tf.Output, scale tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "AdjustSaturation",
-		Input: []tf.Input{
-			images, scale,
-		},
-	}
-	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MatrixSolveAttr is an optional argument to MatrixSolve.
-type MatrixSolveAttr func(optionalAttr)
+// ImagAttr is an optional argument to Imag.
+type ImagAttr func(optionalAttr)
 
-// MatrixSolveAdjoint sets the optional adjoint attribute to value.
-//
-// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
-// adjoint.
-// If not specified, defaults to false
-func MatrixSolveAdjoint(value bool) MatrixSolveAttr {
+// ImagTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func ImagTout(value tf.DataType) ImagAttr {
 	return func(m optionalAttr) {
-		m["adjoint"] = value
+		m["Tout"] = value
 	}
 }
 
-// Solves systems of linear equations.
+// Returns the imaginary part of a complex number.
 //
-// `Matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. `Rhs` is a tensor of shape `[..., M, K]`. The `output` is
-// a tensor shape `[..., M, K]`.  If `adjoint` is `False` then each output matrix
-// satisfies `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
-// If `adjoint` is `True` then each output matrix satisfies
-// `adjoint(matrix[..., :, :]) * output[..., :, :] = rhs[..., :, :]`.
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// type `float` that is the imaginary part of each element in `input`. All
+// elements in `input` must be complex numbers of the form \\(a + bj\\), where *a*
+// is the real part and *b* is the imaginary part returned by this operation.
 //
-// Arguments:
-//	matrix: Shape is `[..., M, M]`.
-//	rhs: Shape is `[..., M, K]`.
+// For example:
 //
-// Returns Shape is `[..., M, K]`.
-func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixSolveAttr) (output tf.Output) {
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.imag(input) ==> [4.75, 5.75]
+// ```
+func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -27478,9 +32478,9 @@ func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...Matr
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixSolve",
+		Type: "Imag",
 		Input: []tf.Input{
-			matrix, rhs,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -27488,210 +32488,169 @@ func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...Matr
 	return op.Output(0)
 }
 
-// Returns a serialized GraphDef representing `input_dataset`.
-//
-// Returns a graph representation for `input_dataset`.
-//
-// Arguments:
-//	input_dataset: A variant tensor representing the dataset to return the graph representation for.
-//
-// Returns The graph representation of the dataset (as serialized GraphDef).
-func DatasetToGraph(scope *Scope, input_dataset tf.Output) (graph tf.Output) {
+// Computes hyperbolic tangent of `x` element-wise.
+func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "DatasetToGraph",
+		Type: "Tanh",
 		Input: []tf.Input{
-			input_dataset,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the matrix square root of one or more square matrices:
+// Computes the maximum along segments of a tensor.
 //
-// matmul(sqrtm(A), sqrtm(A)) = A
+// Read
+// [the section on segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
+// for an explanation of segments.
 //
-// The input matrix should be invertible. If the input matrix is real, it should
-// have no eigenvalues which are real and negative (pairs of complex conjugate
-// eigenvalues are allowed).
+// Computes a tensor such that
+// \\(output_i = \max_j(data_j)\\) where `max` is over `j` such
+// that `segment_ids[j] == i`.
 //
-// The matrix square root is computed by first reducing the matrix to
-// quasi-triangular form with the real Schur decomposition. The square root
-// of the quasi-triangular matrix is then computed directly. Details of
-// the algorithm can be found in: Nicholas J. Higham, "Computing real
-// square roots of a real matrix", Linear Algebra Appl., 1987.
+// If the max is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
+// </div>
+//
+// For example:
+//
+// ```
+// c = tf.constant([[1,2,3,4], [4, 3, 2, 1], [5,6,7,8]])
+// tf.segment_max(c, tf.constant([0, 0, 1]))
+// # ==> [[4, 3, 3, 4],
+// #      [5, 6, 7, 8]]
+// ```
 //
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. The output is a tensor of the same shape as the input
-// containing the matrix square root for all input submatrices `[..., :, :]`.
 //
 // Arguments:
-//	input: Shape is `[..., M, M]`.
 //
-// Returns Shape is `[..., M, M]`.
+//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
 //
-// @compatibility(scipy)
-// Equivalent to scipy.linalg.sqrtm
-// @end_compatibility
-func MatrixSquareRoot(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixSquareRoot",
+		Type: "SegmentMax",
 		Input: []tf.Input{
-			input,
+			data, segment_ids,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SvdAttr is an optional argument to Svd.
-type SvdAttr func(optionalAttr)
-
-// SvdComputeUv sets the optional compute_uv attribute to value.
-//
-// value: If true, left and right singular vectors will be
-// computed and returned in `u` and `v`, respectively.
-// If false, `u` and `v` are not set and should never referenced.
-// If not specified, defaults to true
-func SvdComputeUv(value bool) SvdAttr {
-	return func(m optionalAttr) {
-		m["compute_uv"] = value
-	}
-}
-
-// SvdFullMatrices sets the optional full_matrices attribute to value.
-//
-// value: If true, compute full-sized `u` and `v`. If false
-// (the default), compute only the leading `P` singular vectors.
-// Ignored if `compute_uv` is `False`.
-// If not specified, defaults to false
-func SvdFullMatrices(value bool) SvdAttr {
-	return func(m optionalAttr) {
-		m["full_matrices"] = value
-	}
-}
-
-// Computes the singular value decompositions of one or more matrices.
-//
-// Computes the SVD of each inner matrix in `input` such that
-// `input[..., :, :] = u[..., :, :] * diag(s[..., :, :]) * transpose(v[..., :, :])`
-//
-// ```python
-// # a is a tensor containing a batch of matrices.
-// # s is a tensor of singular values for each matrix.
-// # u is the tensor containing of left singular vectors for each matrix.
-// # v is the tensor containing of right singular vectors for each matrix.
-// s, u, v = svd(a)
-// s, _, _ = svd(a, compute_uv=False)
-// ```
+// Creates a dataset that skips `count` elements from the `input_dataset`.
 //
 // Arguments:
-//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
 //
-// Returns Singular values. Shape is `[..., P]`.Left singular vectors. If `full_matrices` is `False` then shape is
-// `[..., M, P]`; if `full_matrices` is `True` then shape is
-// `[..., M, M]`. Undefined if `compute_uv` is `False`.Left singular vectors. If `full_matrices` is `False` then shape is
-// `[..., N, P]`. If `full_matrices` is `True` then shape is `[..., N, N]`.
-// Undefined if `compute_uv` is false.
-func Svd(scope *Scope, input tf.Output, optional ...SvdAttr) (s tf.Output, u tf.Output, v tf.Output) {
+//	count: A scalar representing the number of elements from the `input_dataset`
+// that should be skipped.  If count is -1, skips everything.
+//
+//
+func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Svd",
+		Type: "SkipDataset",
 		Input: []tf.Input{
-			input,
+			input_dataset, count,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// PrintV2Attr is an optional argument to PrintV2.
-type PrintV2Attr func(optionalAttr)
+// VarHandleOpAttr is an optional argument to VarHandleOp.
+type VarHandleOpAttr func(optionalAttr)
 
-// PrintV2OutputStream sets the optional output_stream attribute to value.
+// VarHandleOpContainer sets the optional container attribute to value.
 //
-// value: A string specifying the output stream or logging level to print to.
-// If not specified, defaults to "stderr"
-func PrintV2OutputStream(value string) PrintV2Attr {
+// value: the container this variable is placed in.
+// If not specified, defaults to ""
+func VarHandleOpContainer(value string) VarHandleOpAttr {
 	return func(m optionalAttr) {
-		m["output_stream"] = value
+		m["container"] = value
 	}
 }
 
-// Prints a string scalar.
+// VarHandleOpSharedName sets the optional shared_name attribute to value.
 //
-// Prints a string scalar to the desired output_stream.
+// value: the name by which this variable is referred to.
+// If not specified, defaults to ""
+func VarHandleOpSharedName(value string) VarHandleOpAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Creates a handle to a Variable resource.
 //
 // Arguments:
-//	input: The string scalar to print.
-//
-// Returns the created operation.
-func PrintV2(scope *Scope, input tf.Output, optional ...PrintV2Attr) (o *tf.Operation) {
+//	dtype: the type of this variable. Must agree with the dtypes
+// of all ops using this variable.
+//	shape: The (possibly partially specified) shape of this variable.
+func VarHandleOp(scope *Scope, dtype tf.DataType, shape tf.Shape, optional ...VarHandleOpAttr) (resource tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "PrintV2",
-		Input: []tf.Input{
-			input,
-		},
+		Type: "VarHandleOp",
+
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// QueueEnqueueManyV2Attr is an optional argument to QueueEnqueueManyV2.
-type QueueEnqueueManyV2Attr func(optionalAttr)
+// AngleAttr is an optional argument to Angle.
+type AngleAttr func(optionalAttr)
 
-// QueueEnqueueManyV2TimeoutMs sets the optional timeout_ms attribute to value.
-//
-// value: If the queue is too full, this operation will block for up
-// to timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueEnqueueManyV2TimeoutMs(value int64) QueueEnqueueManyV2Attr {
+// AngleTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func AngleTout(value tf.DataType) AngleAttr {
 	return func(m optionalAttr) {
-		m["timeout_ms"] = value
+		m["Tout"] = value
 	}
 }
 
-// Enqueues zero or more tuples of one or more tensors in the given queue.
+// Returns the argument of a complex number.
 //
-// This operation slices each component tensor along the 0th dimension to
-// make multiple queue elements. All of the tuple components must have the
-// same size in the 0th dimension.
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// type `float` that is the argument of each element in `input`. All elements in
+// `input` must be complex numbers of the form \\(a + bj\\), where *a*
+// is the real part and *b* is the imaginary part.
 //
-// The components input has k elements, which correspond to the components of
-// tuples stored in the given queue.
+// The argument returned by this operation is of the form \\(atan2(b, a)\\).
 //
-// N.B. If the queue is full, this operation will block until the given
-// elements have been enqueued (or 'timeout_ms' elapses, if specified).
+// For example:
 //
-// Arguments:
-//	handle: The handle to a queue.
-//	components: One or more tensors from which the enqueued tensors should
-// be taken.
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.angle(input) ==> [2.0132, 1.056]
+// ```
 //
-// Returns the created operation.
-func QueueEnqueueManyV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueManyV2Attr) (o *tf.Operation) {
+// @compatibility(numpy)
+// Equivalent to np.angle.
+// @end_compatibility
+func Angle(scope *Scope, input tf.Output, optional ...AngleAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -27700,124 +32659,139 @@ func QueueEnqueueManyV2(scope *Scope, handle tf.Output, components []tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueEnqueueManyV2",
+		Type: "Angle",
 		Input: []tf.Input{
-			handle, tf.OutputList(components),
+			input,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes the product along segments of a tensor.
-//
-// Read
-// [the section on segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
-// for an explanation of segments.
-//
-// Computes a tensor such that
-// \\(output_i = \prod_j data_j\\) where the product is over `j` such
-// that `segment_ids[j] == i`.
-//
-// If the product is empty for a given segment ID `i`, `output[i] = 1`.
+// Clips tensor values to a specified min and max.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentProd.png" alt>
-// </div>
+// Given a tensor `t`, this operation returns a tensor of the same type and
+// shape as `t` with its values clipped to `clip_value_min` and `clip_value_max`.
+// Any values less than `clip_value_min` are set to `clip_value_min`. Any values
+// greater than `clip_value_max` are set to `clip_value_max`.
 //
 // Arguments:
+//	t: A `Tensor`.
+//	clip_value_min: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
+// as `t`. The minimum value to clip by.
+//	clip_value_max: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
+// as `t`. The maximum value to clip by.
 //
-//	segment_ids: A 1-D tensor whose size is equal to the size of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Returns A clipped `Tensor` with the same shape as input 't'.
+func ClipByValue(scope *Scope, t tf.Output, clip_value_min tf.Output, clip_value_max tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SegmentProd",
+		Type: "ClipByValue",
 		Input: []tf.Input{
-			data, segment_ids,
+			t, clip_value_min, clip_value_max,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Converts one or more images from RGB to HSV.
+// Counts the number of occurrences of each value in an integer array.
 //
-// Outputs a tensor of the same shape as the `images` tensor, containing the HSV
-// value of the pixels. The output is only well defined if the value in `images`
-// are in `[0,1]`.
+// Outputs a vector with length `size` and the same dtype as `weights`. If
+// `weights` are empty, then index `i` stores the number of times the value `i` is
+// counted in `arr`. If `weights` are non-empty, then index `i` stores the sum of
+// the value in `weights` at each index where the corresponding value in `arr` is
+// `i`.
 //
-// `output[..., 0]` contains hue, `output[..., 1]` contains saturation, and
-// `output[..., 2]` contains value. All HSV values are in `[0,1]`. A hue of 0
-// corresponds to pure red, hue 1/3 is pure green, and 2/3 is pure blue.
+// Values in `arr` outside of the range [0, size) are ignored.
 //
 // Arguments:
-//	images: 1-D or higher rank. RGB data to convert. Last dimension must be size 3.
+//	arr: int32 `Tensor`.
+//	size: non-negative int32 scalar `Tensor`.
+//	weights: is an int32, int64, float32, or float64 `Tensor` with the same
+// shape as `arr`, or a length-0 `Tensor`, in which case it acts as all weights
+// equal to 1.
 //
-// Returns `images` converted to HSV.
-func RGBToHSV(scope *Scope, images tf.Output) (output tf.Output) {
+// Returns 1D `Tensor` with length equal to `size`. The counts or summed weights for
+// each value in the range [0, size).
+func Bincount(scope *Scope, arr tf.Output, size tf.Output, weights tf.Output) (bins tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RGBToHSV",
+		Type: "Bincount",
 		Input: []tf.Input{
-			images,
+			arr, size, weights,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Does nothing. Only useful as a placeholder for control edges.
+// CumsumAttr is an optional argument to Cumsum.
+type CumsumAttr func(optionalAttr)
+
+// CumsumExclusive sets the optional exclusive attribute to value.
 //
-// Returns the created operation.
-func NoOp(scope *Scope) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "NoOp",
+// value: If `True`, perform exclusive cumsum.
+// If not specified, defaults to false
+func CumsumExclusive(value bool) CumsumAttr {
+	return func(m optionalAttr) {
+		m["exclusive"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// MergeV2CheckpointsAttr is an optional argument to MergeV2Checkpoints.
-type MergeV2CheckpointsAttr func(optionalAttr)
-
-// MergeV2CheckpointsDeleteOldDirs sets the optional delete_old_dirs attribute to value.
+// CumsumReverse sets the optional reverse attribute to value.
 //
-// value: see above.
-// If not specified, defaults to true
-func MergeV2CheckpointsDeleteOldDirs(value bool) MergeV2CheckpointsAttr {
+// value: A `bool` (default: False).
+// If not specified, defaults to false
+func CumsumReverse(value bool) CumsumAttr {
 	return func(m optionalAttr) {
-		m["delete_old_dirs"] = value
+		m["reverse"] = value
 	}
 }
 
-// V2 format specific: merges the metadata files of sharded checkpoints.  The
+// Compute the cumulative sum of the tensor `x` along `axis`.
 //
-// result is one logical checkpoint, with one physical metadata file and renamed
-// data files.
+// By default, this op performs an inclusive cumsum, which means that the first
+// element of the input is identical to the first element of the output:
 //
-// Intended for "grouping" multiple checkpoints in a sharded checkpoint setup.
+// ```python
+// tf.cumsum([a, b, c])  # => [a, a + b, a + b + c]
+// ```
 //
-// If delete_old_dirs is true, attempts to delete recursively the dirname of each
-// path in the input checkpoint_prefixes.  This is useful when those paths are non
-// user-facing temporary locations.
+// By setting the `exclusive` kwarg to `True`, an exclusive cumsum is
+// performed instead:
 //
-// Arguments:
-//	checkpoint_prefixes: prefixes of V2 checkpoints to merge.
-//	destination_prefix: scalar.  The desired final prefix.  Allowed to be the same
-// as one of the checkpoint_prefixes.
+// ```python
+// tf.cumsum([a, b, c], exclusive=True)  # => [0, a, a + b]
+// ```
 //
-// Returns the created operation.
-func MergeV2Checkpoints(scope *Scope, checkpoint_prefixes tf.Output, destination_prefix tf.Output, optional ...MergeV2CheckpointsAttr) (o *tf.Operation) {
+// By setting the `reverse` kwarg to `True`, the cumsum is performed in the
+// opposite direction:
+//
+// ```python
+// tf.cumsum([a, b, c], reverse=True)  # => [a + b + c, b + c, c]
+// ```
+//
+// This is more efficient than using separate `tf.reverse` ops.
+//
+// The `reverse` and `exclusive` kwargs can also be combined:
+//
+// ```python
+// tf.cumsum([a, b, c], exclusive=True, reverse=True)  # => [b + c, c, 0]
+// ```
+//
+// Arguments:
+//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
+// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
+// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
+// `[-rank(x), rank(x))`.
+func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) (out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -27826,425 +32800,357 @@ func MergeV2Checkpoints(scope *Scope, checkpoint_prefixes tf.Output, destination
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MergeV2Checkpoints",
+		Type: "Cumsum",
 		Input: []tf.Input{
-			checkpoint_prefixes, destination_prefix,
+			x, axis,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Saves input tensors slices to disk.
-//
-// This is like `Save` except that tensors can be listed in the saved file as being
-// a slice of a larger tensor.  `shapes_and_slices` specifies the shape of the
-// larger tensor and the slice that this tensor covers. `shapes_and_slices` must
-// have as many elements as `tensor_names`.
-//
-// Elements of the `shapes_and_slices` input must either be:
-//
-// *  The empty string, in which case the corresponding tensor is
-//    saved normally.
-// *  A string of the form `dim0 dim1 ... dimN-1 slice-spec` where the
-//    `dimI` are the dimensions of the larger tensor and `slice-spec`
-//    specifies what part is covered by the tensor to save.
-//
-// `slice-spec` itself is a `:`-separated list: `slice0:slice1:...:sliceN-1`
-// where each `sliceI` is either:
-//
-// *  The string `-` meaning that the slice covers all indices of this dimension
-// *  `start,length` where `start` and `length` are integers.  In that
-//    case the slice covers `length` indices starting at `start`.
-//
-// See also `Save`.
-//
-// Arguments:
-//	filename: Must have a single element. The name of the file to which we write the
-// tensor.
-//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
-//	shapes_and_slices: Shape `[N]`.  The shapes and slice specifications to use when
-// saving the tensors.
-//	data: `N` tensors to save.
+// Return the shape of s0 op s1 with broadcast.
 //
-// Returns the created operation.
-func SaveSlices(scope *Scope, filename tf.Output, tensor_names tf.Output, shapes_and_slices tf.Output, data []tf.Output) (o *tf.Operation) {
+// Given `s0` and `s1`, tensors that represent shapes, compute `r0`, the
+// broadcasted shape. `s0`, `s1` and `r0` are all integer vectors.
+func BroadcastArgs(scope *Scope, s0 tf.Output, s1 tf.Output) (r0 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SaveSlices",
+		Type: "BroadcastArgs",
 		Input: []tf.Input{
-			filename, tensor_names, shapes_and_slices, tf.OutputList(data),
+			s0, s1,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// DenseToDenseSetOperationAttr is an optional argument to DenseToDenseSetOperation.
-type DenseToDenseSetOperationAttr func(optionalAttr)
+// DataFormatDimMapAttr is an optional argument to DataFormatDimMap.
+type DataFormatDimMapAttr func(optionalAttr)
 
-// DenseToDenseSetOperationValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func DenseToDenseSetOperationValidateIndices(value bool) DenseToDenseSetOperationAttr {
+// DataFormatDimMapSrcFormat sets the optional src_format attribute to value.
+//
+// value: source data format.
+// If not specified, defaults to "NHWC"
+func DataFormatDimMapSrcFormat(value string) DataFormatDimMapAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["src_format"] = value
 	}
 }
 
-// Applies set operation along last dimension of 2 `Tensor` inputs.
+// DataFormatDimMapDstFormat sets the optional dst_format attribute to value.
 //
-// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+// value: destination data format.
+// If not specified, defaults to "NCHW"
+func DataFormatDimMapDstFormat(value string) DataFormatDimMapAttr {
+	return func(m optionalAttr) {
+		m["dst_format"] = value
+	}
+}
+
+// Returns the dimension index in the destination data format given the one in
 //
-// Output `result` is a `SparseTensor` represented by `result_indices`,
-// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-// dimension contains the result of `set_operation` applied to the corresponding
-// `[0...n-1]` dimension of `set`.
+// the source data format.
 //
 // Arguments:
-//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
-// Dimension `n` contains values in a set, duplicates are allowed but ignored.
-//	set2: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set1`.
-// Dimension `n` contains values in a set, duplicates are allowed but ignored.
-//
+//	x: A Tensor with each element as a dimension index in source data format.
+// Must be in the range [-4, 4).
 //
-// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
-// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
-// is the max result set size across all `0...n-1` dimensions.
-func DenseToDenseSetOperation(scope *Scope, set1 tf.Output, set2 tf.Output, set_operation string, optional ...DenseToDenseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+// Returns A Tensor with each element as a dimension index in destination data format.
+func DataFormatDimMap(scope *Scope, x tf.Output, optional ...DataFormatDimMapAttr) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"set_operation": set_operation}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DenseToDenseSetOperation",
+		Type: "DataFormatDimMap",
 		Input: []tf.Input{
-			set1, set2,
+			x,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Generate a sharded filename. The filename is printf formatted as
+// CumprodAttr is an optional argument to Cumprod.
+type CumprodAttr func(optionalAttr)
+
+// CumprodExclusive sets the optional exclusive attribute to value.
 //
-//    %s-%05d-of-%05d, basename, shard, num_shards.
-func ShardedFilename(scope *Scope, basename tf.Output, shard tf.Output, num_shards tf.Output) (filename tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ShardedFilename",
-		Input: []tf.Input{
-			basename, shard, num_shards,
-		},
+// value: If `True`, perform exclusive cumprod.
+// If not specified, defaults to false
+func CumprodExclusive(value bool) CumprodAttr {
+	return func(m optionalAttr) {
+		m["exclusive"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// BatchToSpace for N-D tensors of type T.
-//
-// This operation reshapes the "batch" dimension 0 into `M + 1` dimensions of shape
-// `block_shape + [batch]`, interleaves these blocks back into the grid defined by
-// the spatial dimensions `[1, ..., M]`, to obtain a result with the same rank as
-// the input.  The spatial dimensions of this intermediate result are then
-// optionally cropped according to `crops` to produce the output.  This is the
-// reverse of SpaceToBatch.  See below for a precise description.
-//
-// Arguments:
-//	input: N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,
-// where spatial_shape has M dimensions.
-//	block_shape: 1-D with shape `[M]`, all values must be >= 1.
-//	crops: 2-D with shape `[M, 2]`, all values must be >= 0.
-//   `crops[i] = [crop_start, crop_end]` specifies the amount to crop from input
-//   dimension `i + 1`, which corresponds to spatial dimension `i`.  It is
-//   required that
-//   `crop_start[i] + crop_end[i] <= block_shape[i] * input_shape[i + 1]`.
-//
-// This operation is equivalent to the following steps:
-//
-// 1. Reshape `input` to `reshaped` of shape:
-//      [block_shape[0], ..., block_shape[M-1],
-//       batch / prod(block_shape),
-//       input_shape[1], ..., input_shape[N-1]]
-//
-// 2. Permute dimensions of `reshaped` to produce `permuted` of shape
-//      [batch / prod(block_shape),
-//
-//       input_shape[1], block_shape[0],
-//       ...,
-//       input_shape[M], block_shape[M-1],
-//
-//       input_shape[M+1], ..., input_shape[N-1]]
-//
-// 3. Reshape `permuted` to produce `reshaped_permuted` of shape
-//      [batch / prod(block_shape),
-//
-//       input_shape[1] * block_shape[0],
-//       ...,
-//       input_shape[M] * block_shape[M-1],
-//
-//       input_shape[M+1],
-//       ...,
-//       input_shape[N-1]]
-//
-// 4. Crop the start and end of dimensions `[1, ..., M]` of
-//    `reshaped_permuted` according to `crops` to produce the output of shape:
-//      [batch / prod(block_shape),
-//
-//       input_shape[1] * block_shape[0] - crops[0,0] - crops[0,1],
-//       ...,
-//       input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1],
-//
-//       input_shape[M+1], ..., input_shape[N-1]]
-//
-// Some examples:
-//
-// (1) For the following input of shape `[4, 1, 1, 1]`, `block_shape = [2, 2]`, and
-//     `crops = [[0, 0], [0, 0]]`:
-//
-// ```
-// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
-// ```
-//
-// The output tensor has shape `[1, 2, 2, 1]` and value:
-//
-// ```
-// x = [[[[1], [2]], [[3], [4]]]]
-// ```
-//
-// (2) For the following input of shape `[4, 1, 1, 3]`, `block_shape = [2, 2]`, and
-//     `crops = [[0, 0], [0, 0]]`:
+// CumprodReverse sets the optional reverse attribute to value.
 //
-// ```
-// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
-// ```
+// value: A `bool` (default: False).
+// If not specified, defaults to false
+func CumprodReverse(value bool) CumprodAttr {
+	return func(m optionalAttr) {
+		m["reverse"] = value
+	}
+}
+
+// Compute the cumulative product of the tensor `x` along `axis`.
 //
-// The output tensor has shape `[1, 2, 2, 3]` and value:
+// By default, this op performs an inclusive cumprod, which means that the first
+// element of the input is identical to the first element of the output:
 //
-// ```
-// x = [[[[1, 2, 3], [4, 5, 6]],
-//       [[7, 8, 9], [10, 11, 12]]]]
+// ```python
+// tf.cumprod([a, b, c])  # => [a, a * b, a * b * c]
 // ```
 //
-// (3) For the following input of shape `[4, 2, 2, 1]`, `block_shape = [2, 2]`, and
-//     `crops = [[0, 0], [0, 0]]`:
+// By setting the `exclusive` kwarg to `True`, an exclusive cumprod is
+// performed instead:
 //
-// ```
-// x = [[[[1], [3]], [[9], [11]]],
-//      [[[2], [4]], [[10], [12]]],
-//      [[[5], [7]], [[13], [15]]],
-//      [[[6], [8]], [[14], [16]]]]
+// ```python
+// tf.cumprod([a, b, c], exclusive=True)  # => [1, a, a * b]
 // ```
 //
-// The output tensor has shape `[1, 4, 4, 1]` and value:
+// By setting the `reverse` kwarg to `True`, the cumprod is performed in the
+// opposite direction:
 //
+// ```python
+// tf.cumprod([a, b, c], reverse=True)  # => [a * b * c, b * c, c]
 // ```
-// x = [[[1],   [2],  [3],  [4]],
-//      [[5],   [6],  [7],  [8]],
-//      [[9],  [10], [11],  [12]],
-//      [[13], [14], [15],  [16]]]
-// ```
-//
-// (4) For the following input of shape `[8, 1, 3, 1]`, `block_shape = [2, 2]`, and
-//     `crops = [[0, 0], [2, 0]]`:
 //
-// ```
-// x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
-//      [[[0], [2], [4]]], [[[0], [10], [12]]],
-//      [[[0], [5], [7]]], [[[0], [13], [15]]],
-//      [[[0], [6], [8]]], [[[0], [14], [16]]]]
-// ```
+// This is more efficient than using separate `tf.reverse` ops.
 //
-// The output tensor has shape `[2, 2, 4, 1]` and value:
+// The `reverse` and `exclusive` kwargs can also be combined:
 //
+// ```python
+// tf.cumprod([a, b, c], exclusive=True, reverse=True)  # => [b * c, c, 1]
 // ```
-// x = [[[[1],   [2],  [3],  [4]],
-//       [[5],   [6],  [7],  [8]]],
-//      [[[9],  [10], [11],  [12]],
-//       [[13], [14], [15],  [16]]]]
-// ```
-func BatchToSpaceND(scope *Scope, input tf.Output, block_shape tf.Output, crops tf.Output) (output tf.Output) {
+//
+// Arguments:
+//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
+// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
+// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
+// `[-rank(x), rank(x))`.
+func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr) (out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "BatchToSpaceND",
+		Type: "Cumprod",
 		Input: []tf.Input{
-			input, block_shape, crops,
+			x, axis,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// UnpackAttr is an optional argument to Unpack.
-type UnpackAttr func(optionalAttr)
+// RetrieveTPUEmbeddingStochasticGradientDescentParametersAttr is an optional argument to RetrieveTPUEmbeddingStochasticGradientDescentParameters.
+type RetrieveTPUEmbeddingStochasticGradientDescentParametersAttr func(optionalAttr)
 
-// UnpackAxis sets the optional axis attribute to value.
+// RetrieveTPUEmbeddingStochasticGradientDescentParametersTableId sets the optional table_id attribute to value.
+// If not specified, defaults to -1
 //
-// value: Dimension along which to unpack.  Negative values wrap around, so the
-// valid range is `[-R, R)`.
-// If not specified, defaults to 0
-func UnpackAxis(value int64) UnpackAttr {
+// REQUIRES: value >= -1
+func RetrieveTPUEmbeddingStochasticGradientDescentParametersTableId(value int64) RetrieveTPUEmbeddingStochasticGradientDescentParametersAttr {
 	return func(m optionalAttr) {
-		m["axis"] = value
+		m["table_id"] = value
 	}
 }
 
-// Unpacks a given dimension of a rank-`R` tensor into `num` rank-`(R-1)` tensors.
-//
-// Unpacks `num` tensors from `value` by chipping it along the `axis` dimension.
-// For example, given a tensor of shape `(A, B, C, D)`;
-//
-// If `axis == 0` then the i'th tensor in `output` is the slice `value[i, :, :, :]`
-//   and each tensor in `output` will have shape `(B, C, D)`. (Note that the
-//   dimension unpacked along is gone, unlike `split`).
-//
-// If `axis == 1` then the i'th tensor in `output` is the slice `value[:, i, :, :]`
-//   and each tensor in `output` will have shape `(A, C, D)`.
-// Etc.
-//
-// This is the opposite of `pack`.
-//
-// Arguments:
-//	value: 1-D or higher, with `axis` dimension size equal to `num`.
+// RetrieveTPUEmbeddingStochasticGradientDescentParametersTableName sets the optional table_name attribute to value.
+// If not specified, defaults to ""
+func RetrieveTPUEmbeddingStochasticGradientDescentParametersTableName(value string) RetrieveTPUEmbeddingStochasticGradientDescentParametersAttr {
+	return func(m optionalAttr) {
+		m["table_name"] = value
+	}
+}
+
+// Retrieve SGD embedding parameters.
 //
+// An op that retrieves optimization parameters from embedding to host
+// memory. Must be preceded by a ConfigureTPUEmbeddingHost op that sets up
+// the correct embedding table configuration. For example, this op is
+// used to retrieve updated parameters before saving a checkpoint.
 //
-// Returns The list of tensors unpacked from `value`.
-func Unpack(scope *Scope, value tf.Output, num int64, optional ...UnpackAttr) (output []tf.Output) {
+// Returns Parameter parameters updated by the stochastic gradient descent optimization algorithm.
+func RetrieveTPUEmbeddingStochasticGradientDescentParameters(scope *Scope, num_shards int64, shard_id int64, optional ...RetrieveTPUEmbeddingStochasticGradientDescentParametersAttr) (parameters tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num": num}
+	attrs := map[string]interface{}{"num_shards": num_shards, "shard_id": shard_id}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Unpack",
-		Input: []tf.Input{
-			value,
-		},
+		Type: "RetrieveTPUEmbeddingStochasticGradientDescentParameters",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
+	return op.Output(0)
+}
+
+// QuantizedMatMulAttr is an optional argument to QuantizedMatMul.
+type QuantizedMatMulAttr func(optionalAttr)
+
+// QuantizedMatMulToutput sets the optional Toutput attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedMatMulToutput(value tf.DataType) QuantizedMatMulAttr {
+	return func(m optionalAttr) {
+		m["Toutput"] = value
 	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("Unpack", err)
-		return
+}
+
+// QuantizedMatMulTransposeA sets the optional transpose_a attribute to value.
+//
+// value: If true, `a` is transposed before multiplication.
+// If not specified, defaults to false
+func QuantizedMatMulTransposeA(value bool) QuantizedMatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_a"] = value
 	}
-	return output
 }
 
-// Increments variable pointed to by 'resource' until it reaches 'limit'.
+// QuantizedMatMulTransposeB sets the optional transpose_b attribute to value.
 //
-// Arguments:
-//	resource: Should be from a scalar `Variable` node.
-//	limit: If incrementing ref would bring it above limit, instead generates an
-// 'OutOfRange' error.
+// value: If true, `b` is transposed before multiplication.
+// If not specified, defaults to false
+func QuantizedMatMulTransposeB(value bool) QuantizedMatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_b"] = value
+	}
+}
+
+// QuantizedMatMulTactivation sets the optional Tactivation attribute to value.
 //
+// value: The type of output produced by activation function
+// following this operation.
+// If not specified, defaults to DT_QUINT8
+func QuantizedMatMulTactivation(value tf.DataType) QuantizedMatMulAttr {
+	return func(m optionalAttr) {
+		m["Tactivation"] = value
+	}
+}
+
+// Perform a quantized matrix multiplication of  `a` by the matrix `b`.
 //
-// Returns A copy of the input before increment. If nothing else modifies the
-// input, the values produced will all be distinct.
-func ResourceCountUpTo(scope *Scope, resource tf.Output, limit int64, T tf.DataType) (output tf.Output) {
+// The inputs must be two-dimensional matrices and the inner dimension of
+// `a` (after being transposed if `transpose_a` is non-zero) must match the
+// outer dimension of `b` (after being transposed if `transposed_b` is
+// non-zero).
+//
+// Arguments:
+//	a: Must be a two-dimensional tensor.
+//	b: Must be a two-dimensional tensor.
+//	min_a: The float value that the lowest quantized `a` value represents.
+//	max_a: The float value that the highest quantized `a` value represents.
+//	min_b: The float value that the lowest quantized `b` value represents.
+//	max_b: The float value that the highest quantized `b` value represents.
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedMatMul(scope *Scope, a tf.Output, b tf.Output, min_a tf.Output, max_a tf.Output, min_b tf.Output, max_b tf.Output, optional ...QuantizedMatMulAttr) (out tf.Output, min_out tf.Output, max_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"limit": limit, "T": T}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ResourceCountUpTo",
+		Type: "QuantizedMatMul",
 		Input: []tf.Input{
-			resource,
+			a, b, min_a, max_a, min_b, max_b,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Delete the stack from its resource container.
+// QuantizedMulAttr is an optional argument to QuantizedMul.
+type QuantizedMulAttr func(optionalAttr)
+
+// QuantizedMulToutput sets the optional Toutput attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedMulToutput(value tf.DataType) QuantizedMulAttr {
+	return func(m optionalAttr) {
+		m["Toutput"] = value
+	}
+}
+
+// Returns x * y element-wise, working on quantized buffers.
 //
 // Arguments:
-//	handle: The handle to a stack.
 //
-// Returns the created operation.
-func StackCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
+//
+//	min_x: The float value that the lowest quantized `x` value represents.
+//	max_x: The float value that the highest quantized `x` value represents.
+//	min_y: The float value that the lowest quantized `y` value represents.
+//	max_y: The float value that the highest quantized `y` value represents.
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+//
+// *NOTE*: `QuantizedMul` supports limited forms of broadcasting. More about
+// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func QuantizedMul(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedMulAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "StackCloseV2",
-		Input: []tf.Input{
-			handle,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Generate a glob pattern matching all sharded file names.
-func ShardedFilespec(scope *Scope, basename tf.Output, num_shards tf.Output) (filename tf.Output) {
-	if scope.Err() != nil {
-		return
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ShardedFilespec",
+		Type: "QuantizedMul",
 		Input: []tf.Input{
-			basename, num_shards,
+			x, y, min_x, max_x, min_y, max_y,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// TextLineReaderV2Attr is an optional argument to TextLineReaderV2.
-type TextLineReaderV2Attr func(optionalAttr)
+// QuantizedAddAttr is an optional argument to QuantizedAdd.
+type QuantizedAddAttr func(optionalAttr)
 
-// TextLineReaderV2SkipHeaderLines sets the optional skip_header_lines attribute to value.
-//
-// value: Number of lines to skip from the beginning of every file.
-// If not specified, defaults to 0
-func TextLineReaderV2SkipHeaderLines(value int64) TextLineReaderV2Attr {
+// QuantizedAddToutput sets the optional Toutput attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedAddToutput(value tf.DataType) QuantizedAddAttr {
 	return func(m optionalAttr) {
-		m["skip_header_lines"] = value
+		m["Toutput"] = value
 	}
 }
 
-// TextLineReaderV2Container sets the optional container attribute to value.
+// Returns x + y element-wise, working on quantized buffers.
 //
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func TextLineReaderV2Container(value string) TextLineReaderV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// TextLineReaderV2SharedName sets the optional shared_name attribute to value.
+// Arguments:
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func TextLineReaderV2SharedName(value string) TextLineReaderV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// A Reader that outputs the lines of a file delimited by '\n'.
 //
-// Returns The handle to reference the Reader.
-func TextLineReaderV2(scope *Scope, optional ...TextLineReaderV2Attr) (reader_handle tf.Output) {
+//	min_x: The float value that the lowest quantized `x` value represents.
+//	max_x: The float value that the highest quantized `x` value represents.
+//	min_y: The float value that the lowest quantized `y` value represents.
+//	max_y: The float value that the highest quantized `y` value represents.
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+//
+// *NOTE*: `QuantizedAdd` supports limited forms of broadcasting. More about
+// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func QuantizedAdd(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedAddAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -28253,192 +33159,189 @@ func TextLineReaderV2(scope *Scope, optional ...TextLineReaderV2Attr) (reader_ha
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TextLineReaderV2",
-
+		Type: "QuantizedAdd",
+		Input: []tf.Input{
+			x, y, min_x, max_x, min_y, max_y,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// LoadAndRemapMatrixAttr is an optional argument to LoadAndRemapMatrix.
-type LoadAndRemapMatrixAttr func(optionalAttr)
-
-// LoadAndRemapMatrixMaxRowsInMemory sets the optional max_rows_in_memory attribute to value.
+// Scatters tensor at indices in an input list.
 //
-// value: The maximum number of rows to load from the checkpoint at
-// once. If less than or equal to 0, the entire matrix will be loaded into
-// memory. Setting this arg trades increased disk reads for lower memory usage.
-// If not specified, defaults to -1
-func LoadAndRemapMatrixMaxRowsInMemory(value int64) LoadAndRemapMatrixAttr {
-	return func(m optionalAttr) {
-		m["max_rows_in_memory"] = value
+// Each member of the TensorList corresponds to one row of the input tensor,
+// specified by the given index (see `tf.gather`).
+//
+// input_handle: The list to scatter into.
+// tensor: The input tensor.
+// indices: The indices used to index into the list.
+// output_handle: The TensorList.
+func TensorListScatterIntoExistingList(scope *Scope, input_handle tf.Output, tensor tf.Output, indices tf.Output) (output_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorListScatterIntoExistingList",
+		Input: []tf.Input{
+			input_handle, tensor, indices,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Loads a 2-D (matrix) `Tensor` with name `old_tensor_name` from the checkpoint
+// Computes a range that covers the actual values present in a quantized tensor.
 //
-// at `ckpt_path` and potentially reorders its rows and columns using the
-// specified remappings.
+// Given a quantized tensor described by `(input, input_min, input_max)`, outputs a
+// range that covers the actual values present in that tensor. This op is typically
+// used to produce the `requested_output_min` and `requested_output_max` for
+// `Requantize`.
 //
-// Most users should use one of the wrapper initializers (such as
-// `tf.contrib.framework.load_and_remap_matrix_initializer`) instead of this
-// function directly.
+// Arguments:
 //
-// The remappings are 1-D tensors with the following properties:
+//	input_min: The float value that the minimum quantized input value represents.
+//	input_max: The float value that the maximum quantized input value represents.
 //
-// * `row_remapping` must have exactly `num_rows` entries. Row `i` of the output
-//   matrix will be initialized from the row corresponding to index
-//   `row_remapping[i]` in the old `Tensor` from the checkpoint.
-// * `col_remapping` must have either 0 entries (indicating that no column
-//   reordering is needed) or `num_cols` entries. If specified, column `j` of the
-//   output matrix will be initialized from the column corresponding to index
-//   `col_remapping[j]` in the old `Tensor` from the checkpoint.
-// * A value of -1 in either of the remappings signifies a "missing" entry. In that
-//   case, values from the `initializing_values` tensor will be used to fill that
-//   missing row or column. If `row_remapping` has `r` missing entries and
-//   `col_remapping` has `c` missing entries, then the following condition must be
-//   true:
+// Returns The computed min output.the computed max output.
+func RequantizationRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output) (output_min tf.Output, output_max tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "RequantizationRange",
+		Input: []tf.Input{
+			input, input_min, input_max,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Rolls the elements of a tensor along an axis.
 //
-// `(r * num_cols) + (c * num_rows) - (r * c) == len(initializing_values)`
+// The elements are shifted positively (towards larger indices) by the offset of
+// `shift` along the dimension of `axis`. Negative `shift` values will shift
+// elements in the opposite direction. Elements that roll passed the last position
+// will wrap around to the first and vice versa. Multiple shifts along multiple
+// axes may be specified.
 //
-// The remapping tensors can be generated using the GenerateVocabRemapping op.
+// For example:
 //
-// As an example, with row_remapping = [1, 0, -1], col_remapping = [0, 2, -1],
-// initializing_values = [0.5, -0.5, 0.25, -0.25, 42], and w(i, j) representing
-// the value from row i, column j of the old tensor in the checkpoint, the output
-// matrix will look like the following:
+// ```
+// # 't' is [0, 1, 2, 3, 4]
+// roll(t, shift=2, axis=0) ==> [3, 4, 0, 1, 2]
 //
-// [[w(1, 0),  w(1, 2),  0.5],
-//  [w(0, 0),  w(0, 2), -0.5],
-//  [0.25,    -0.25,      42]]
+// # shifting along multiple dimensions
+// # 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
+// roll(t, shift=[1, -2], axis=[0, 1]) ==> [[7, 8, 9, 5, 6], [2, 3, 4, 0, 1]]
+//
+// # shifting along the same axis multiple times
+// # 't' is [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
+// roll(t, shift=[2, -3], axis=[1, 1]) ==> [[1, 2, 3, 4, 0], [6, 7, 8, 9, 5]]
+// ```
 //
 // Arguments:
-//	ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`) from
-// which the old matrix `Tensor` will be loaded.
-//	old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint.
-//	row_remapping: An int `Tensor` of row remappings (generally created by
-// `generate_vocab_remapping`).  Even if no row remapping is needed, this must
-// still be an index-valued Tensor (e.g. [0, 1, 2, ...]), or a shifted
-// index-valued `Tensor` (e.g. [8, 9, 10, ...], for partitioned `Variables`).
-//	col_remapping: An int `Tensor` of column remappings (generally created by
-// `generate_vocab_remapping`).  May be a size-0 `Tensor` if only row remapping
-// is to be done (e.g. column ordering is the same).
-//	initializing_values: A float `Tensor` containing  values to fill in for cells
-// in the output matrix that are not loaded from the checkpoint. Length must be
-// exactly the same as the number of missing / new cells.
-//	num_rows: Number of rows (length of the 1st dimension) in the output matrix.
-//	num_cols: Number of columns (length of the 2nd dimension) in the output matrix.
 //
-// Returns Output matrix containing existing values loaded from the
-// checkpoint, and with any missing values filled in from initializing_values.
-func LoadAndRemapMatrix(scope *Scope, ckpt_path tf.Output, old_tensor_name tf.Output, row_remapping tf.Output, col_remapping tf.Output, initializing_values tf.Output, num_rows int64, num_cols int64, optional ...LoadAndRemapMatrixAttr) (output_matrix tf.Output) {
+//	shift: Dimension must be 0-D or 1-D. `shift[i]` specifies the number of places by which
+// elements are shifted positively (towards larger indices) along the dimension
+// specified by `axis[i]`. Negative shifts will roll the elements in the opposite
+// direction.
+//	axis: Dimension must be 0-D or 1-D. `axis[i]` specifies the dimension that the shift
+// `shift[i]` should occur. If the same axis is referenced more than once, the
+// total shift for that axis will be the sum of all the shifts that belong to that
+// axis.
+//
+// Returns Has the same shape and size as the input. The elements are shifted
+// positively (towards larger indices) by the offsets of `shift` along the
+// dimensions of `axis`.
+func Roll(scope *Scope, input tf.Output, shift tf.Output, axis tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_rows": num_rows, "num_cols": num_cols}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "LoadAndRemapMatrix",
+		Type: "Roll",
 		Input: []tf.Input{
-			ckpt_path, old_tensor_name, row_remapping, col_remapping, initializing_values,
+			input, shift, axis,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TFRecordReaderV2Attr is an optional argument to TFRecordReaderV2.
-type TFRecordReaderV2Attr func(optionalAttr)
-
-// TFRecordReaderV2Container sets the optional container attribute to value.
+// Updates the table to associates keys with values.
 //
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func TFRecordReaderV2Container(value string) TFRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// TFRecordReaderV2SharedName sets the optional shared_name attribute to value.
+// The tensor `keys` must be of the same type as the keys of the table.
+// The tensor `values` must be of the type of the table values.
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func TFRecordReaderV2SharedName(value string) TFRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+// Arguments:
+//	table_handle: Handle to the table.
+//	keys: Any shape.  Keys to look up.
+//	values: Values to associate with keys.
+//
+// Returns the created operation.
+func LookupTableInsertV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// TFRecordReaderV2CompressionType sets the optional compression_type attribute to value.
-// If not specified, defaults to ""
-func TFRecordReaderV2CompressionType(value string) TFRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["compression_type"] = value
+	opspec := tf.OpSpec{
+		Type: "LookupTableInsertV2",
+		Input: []tf.Input{
+			table_handle, keys, values,
+		},
 	}
+	return scope.AddOperation(opspec)
 }
 
-// A Reader that outputs the records from a TensorFlow Records file.
+// Creates a `Dataset` that includes only 1/`num_shards` of this dataset.
 //
-// Returns The handle to reference the Reader.
-func TFRecordReaderV2(scope *Scope, optional ...TFRecordReaderV2Attr) (reader_handle tf.Output) {
+// Arguments:
+//
+//	num_shards: An integer representing the number of shards operating in parallel.
+//	index: An integer representing the current worker index.
+//
+//
+func ShardDataset(scope *Scope, input_dataset tf.Output, num_shards tf.Output, index tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "TFRecordReaderV2",
-
+		Type: "ShardDataset",
+		Input: []tf.Input{
+			input_dataset, num_shards, index,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizeAndDequantizeV3Attr is an optional argument to QuantizeAndDequantizeV3.
-type QuantizeAndDequantizeV3Attr func(optionalAttr)
-
-// QuantizeAndDequantizeV3SignedInput sets the optional signed_input attribute to value.
-// If not specified, defaults to true
-func QuantizeAndDequantizeV3SignedInput(value bool) QuantizeAndDequantizeV3Attr {
-	return func(m optionalAttr) {
-		m["signed_input"] = value
-	}
-}
-
-// QuantizeAndDequantizeV3RangeGiven sets the optional range_given attribute to value.
-// If not specified, defaults to true
-func QuantizeAndDequantizeV3RangeGiven(value bool) QuantizeAndDequantizeV3Attr {
-	return func(m optionalAttr) {
-		m["range_given"] = value
-	}
-}
-
-// Quantizes then dequantizes a tensor.
+// Creates a dataset that batches and pads `batch_size` elements from the input.
 //
-// This is almost identical to QuantizeAndDequantizeV2, except that num_bits is a
-// tensor, so its value can change during training.
-func QuantizeAndDequantizeV3(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, num_bits tf.Output, optional ...QuantizeAndDequantizeV3Attr) (output tf.Output) {
+// Arguments:
+//
+//	batch_size: A scalar representing the number of elements to accumulate in a
+// batch.
+//	padded_shapes: A list of int64 tensors representing the desired padded shapes
+// of the corresponding output components. These shapes may be partially
+// specified, using `-1` to indicate that a particular dimension should be
+// padded to the maximum size of all batch elements.
+//	padding_values: A list of scalars containing the padding value to use for
+// each of the outputs.
+//	drop_remainder: A scalar representing whether the last batch should be dropped in case its size
+// is smaller than desired.
+//
+func PaddedBatchDatasetV2(scope *Scope, input_dataset tf.Output, batch_size tf.Output, padded_shapes []tf.Output, padding_values []tf.Output, drop_remainder tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "QuantizeAndDequantizeV3",
+		Type: "PaddedBatchDatasetV2",
 		Input: []tf.Input{
-			input, input_min, input_max, num_bits,
+			input_dataset, batch_size, tf.OutputList(padded_shapes), tf.OutputList(padding_values), drop_remainder,
 		},
 		Attrs: attrs,
 	}
@@ -28446,77 +33349,66 @@ func QuantizeAndDequantizeV3(scope *Scope, input tf.Output, input_min tf.Output,
 	return op.Output(0)
 }
 
-// IdentityReaderV2Attr is an optional argument to IdentityReaderV2.
-type IdentityReaderV2Attr func(optionalAttr)
-
-// IdentityReaderV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func IdentityReaderV2Container(value string) IdentityReaderV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
+// Returns element-wise smallest integer not less than x.
+func Ceil(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// IdentityReaderV2SharedName sets the optional shared_name attribute to value.
-//
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func IdentityReaderV2SharedName(value string) IdentityReaderV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+	opspec := tf.OpSpec{
+		Type: "Ceil",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// A Reader that outputs the queued work as both the key and value.
+// Computes the number of elements in the given table.
 //
-// To use, enqueue strings in a Queue.  ReaderRead will take the front
-// work string and output (work, work).
+// Arguments:
+//	table_handle: Handle to the table.
 //
-// Returns The handle to reference the Reader.
-func IdentityReaderV2(scope *Scope, optional ...IdentityReaderV2Attr) (reader_handle tf.Output) {
+// Returns Scalar that contains number of elements in the table.
+func LookupTableSizeV2(scope *Scope, table_handle tf.Output) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "IdentityReaderV2",
-
-		Attrs: attrs,
+		Type: "LookupTableSizeV2",
+		Input: []tf.Input{
+			table_handle,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyGradientDescentAttr is an optional argument to ResourceApplyGradientDescent.
-type ResourceApplyGradientDescentAttr func(optionalAttr)
+// ResizeBilinearGradAttr is an optional argument to ResizeBilinearGrad.
+type ResizeBilinearGradAttr func(optionalAttr)
 
-// ResourceApplyGradientDescentUseLocking sets the optional use_locking attribute to value.
+// ResizeBilinearGradAlignCorners sets the optional align_corners attribute to value.
 //
-// value: If `True`, the subtraction will be protected by a lock;
-// otherwise the behavior is undefined, but may exhibit less contention.
+// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
+// aligned. Defaults to false.
 // If not specified, defaults to false
-func ResourceApplyGradientDescentUseLocking(value bool) ResourceApplyGradientDescentAttr {
+func ResizeBilinearGradAlignCorners(value bool) ResizeBilinearGradAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["align_corners"] = value
 	}
 }
 
-// Update '*var' by subtracting 'alpha' * 'delta' from it.
+// Computes the gradient of bilinear interpolation.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	alpha: Scaling factor. Must be a scalar.
-//	delta: The change.
+//	grads: 4-D with shape `[batch, height, width, channels]`.
+//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
+// The image tensor that was resized.
 //
-// Returns the created operation.
-func ResourceApplyGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, delta tf.Output, optional ...ResourceApplyGradientDescentAttr) (o *tf.Operation) {
+// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
+// Gradients with respect to the input image. Input image must have been
+// float or double.
+func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBilinearGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -28525,400 +33417,430 @@ func ResourceApplyGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyGradientDescent",
+		Type: "ResizeBilinearGrad",
 		Input: []tf.Input{
-			var_, alpha, delta,
+			grads, original_image,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns the next record (key, value pair) produced by a Reader.
-//
-// Will dequeue from the input queue if necessary (e.g. when the
-// Reader needs to start reading from a new file since it has finished
-// with the previous file).
+// Outputs all keys and values in the table.
 //
 // Arguments:
-//	reader_handle: Handle to a Reader.
-//	queue_handle: Handle to a Queue, with string work items.
+//	table_handle: Handle to the table.
 //
-// Returns A scalar.A scalar.
-func ReaderReadV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output) (key tf.Output, value tf.Output) {
+//
+//
+// Returns Vector of all keys present in the table.Tensor of all values in the table. Indexed in parallel with `keys`.
+func LookupTableExportV2(scope *Scope, table_handle tf.Output, Tkeys tf.DataType, Tvalues tf.DataType) (keys tf.Output, values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"Tkeys": Tkeys, "Tvalues": Tvalues}
 	opspec := tf.OpSpec{
-		Type: "ReaderReadV2",
+		Type: "LookupTableExportV2",
 		Input: []tf.Input{
-			reader_handle, queue_handle,
+			table_handle,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0), op.Output(1)
 }
 
-// Returns up to `num_records` (key, value) pairs produced by a Reader.
-//
-// Will dequeue from the input queue if necessary (e.g. when the
-// Reader needs to start reading from a new file since it has finished
-// with the previous file).
-// It may return less than `num_records` even before the last batch.
+// MultiDeviceIteratorFromStringHandleAttr is an optional argument to MultiDeviceIteratorFromStringHandle.
+type MultiDeviceIteratorFromStringHandleAttr func(optionalAttr)
+
+// MultiDeviceIteratorFromStringHandleOutputTypes sets the optional output_types attribute to value.
 //
-// Arguments:
-//	reader_handle: Handle to a `Reader`.
-//	queue_handle: Handle to a `Queue`, with string work items.
-//	num_records: number of records to read from `Reader`.
+// value: The type list for the return values.
+// If not specified, defaults to <>
 //
-// Returns A 1-D tensor.A 1-D tensor.
-func ReaderReadUpToV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output, num_records tf.Output) (keys tf.Output, values tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ReaderReadUpToV2",
-		Input: []tf.Input{
-			reader_handle, queue_handle, num_records,
-		},
+// REQUIRES: len(value) >= 0
+func MultiDeviceIteratorFromStringHandleOutputTypes(value []tf.DataType) MultiDeviceIteratorFromStringHandleAttr {
+	return func(m optionalAttr) {
+		m["output_types"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
 }
 
-//     Adds v into specified rows of x.
-//
-//     Computes y = x; y[i, :] += v; return y.
+// MultiDeviceIteratorFromStringHandleOutputShapes sets the optional output_shapes attribute to value.
 //
-// Arguments:
-//	x: A `Tensor` of type T.
-//	i: A vector. Indices into the left-most dimension of `x`.
-//	v: A `Tensor` of type T. Same dimension sizes as x except the first dimension, which must be the same as i's size.
+// value: The list of shapes being produced.
+// If not specified, defaults to <>
 //
-// Returns A `Tensor` of type T. An alias of `x`. The content of `y` is undefined if there are duplicates in `i`.
-func InplaceAdd(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "InplaceAdd",
-		Input: []tf.Input{
-			x, i, v,
-		},
+// REQUIRES: len(value) >= 0
+func MultiDeviceIteratorFromStringHandleOutputShapes(value []tf.Shape) MultiDeviceIteratorFromStringHandleAttr {
+	return func(m optionalAttr) {
+		m["output_shapes"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Restore a Reader to its initial clean state.
+// Generates a MultiDeviceIterator resource from its provided string handle.
 //
 // Arguments:
-//	reader_handle: Handle to a Reader.
+//	string_handle: String representing the resource.
 //
-// Returns the created operation.
-func ReaderResetV2(scope *Scope, reader_handle tf.Output) (o *tf.Operation) {
+// Returns A MultiDeviceIterator resource.
+func MultiDeviceIteratorFromStringHandle(scope *Scope, string_handle tf.Output, optional ...MultiDeviceIteratorFromStringHandleAttr) (multi_device_iterator tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ReaderResetV2",
+		Type: "MultiDeviceIteratorFromStringHandle",
 		Input: []tf.Input{
-			reader_handle,
+			string_handle,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// BatchAttr is an optional argument to Batch.
-type BatchAttr func(optionalAttr)
-
-// BatchMaxEnqueuedBatches sets the optional max_enqueued_batches attribute to value.
-// If not specified, defaults to 10
-func BatchMaxEnqueuedBatches(value int64) BatchAttr {
-	return func(m optionalAttr) {
-		m["max_enqueued_batches"] = value
-	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// BatchAllowedBatchSizes sets the optional allowed_batch_sizes attribute to value.
-// If not specified, defaults to <>
-func BatchAllowedBatchSizes(value []int64) BatchAttr {
-	return func(m optionalAttr) {
-		m["allowed_batch_sizes"] = value
-	}
-}
+// MutableHashTableV2Attr is an optional argument to MutableHashTableV2.
+type MutableHashTableV2Attr func(optionalAttr)
 
-// BatchContainer sets the optional container attribute to value.
+// MutableHashTableV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
 // If not specified, defaults to ""
-func BatchContainer(value string) BatchAttr {
+func MutableHashTableV2Container(value string) MutableHashTableV2Attr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// BatchSharedName sets the optional shared_name attribute to value.
+// MutableHashTableV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
 // If not specified, defaults to ""
-func BatchSharedName(value string) BatchAttr {
+func MutableHashTableV2SharedName(value string) MutableHashTableV2Attr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// BatchBatchingQueue sets the optional batching_queue attribute to value.
-// If not specified, defaults to ""
-func BatchBatchingQueue(value string) BatchAttr {
+// MutableHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+//
+// value: If true and shared_name is empty, the table is shared
+// using the node name.
+// If not specified, defaults to false
+func MutableHashTableV2UseNodeNameSharing(value bool) MutableHashTableV2Attr {
 	return func(m optionalAttr) {
-		m["batching_queue"] = value
+		m["use_node_name_sharing"] = value
 	}
 }
 
-// Batches all input tensors nondeterministically.
-//
-// When many instances of this Op are being run concurrently with the same
-// container/shared_name in the same device, some will output zero-shaped Tensors
-// and others will output Tensors of size up to max_batch_size.
-//
-// All Tensors in in_tensors are batched together (so, for example, labels and
-// features should be batched with a single instance of this operation.
-//
-// Each invocation of batch emits an `id` scalar which will be used to identify
-// this particular invocation when doing unbatch or its gradient.
+// Creates an empty hash table.
 //
-// Each op which emits a non-empty batch will also emit a non-empty batch_index
-// Tensor, which, is a [K, 3] matrix where each row contains the invocation's id,
-// start, and length of elements of each set of Tensors present in batched_tensors.
+// This op creates a mutable hash table, specifying the type of its keys and
+// values. Each value must be a scalar. Data can be inserted into the table using
+// the insert operations. It does not support the initialization operation.
 //
-// Batched tensors are concatenated along the first dimension, and all tensors in
-// in_tensors must have the first dimension of the same size.
+// Arguments:
+//	key_dtype: Type of the table keys.
+//	value_dtype: Type of the table values.
 //
-// in_tensors: The tensors to be batched.
-// num_batch_threads: Number of scheduling threads for processing batches of work.
-//  Determines the number of batches processed in parallel.
-// max_batch_size: Batch sizes will never be bigger than this.
-// batch_timeout_micros: Maximum number of microseconds to wait before outputting
-//  an incomplete batch.
-// allowed_batch_sizes: Optional list of allowed batch sizes. If left empty, does
-//  nothing. Otherwise, supplies a list of batch sizes, causing the op to pad
-//  batches up to one of those sizes. The entries must increase monotonically, and
-//  the final entry must equal max_batch_size.
-// grad_timeout_micros: The timeout to use for the gradient. See Unbatch.
-// batched_tensors: Either empty tensors or a batch of concatenated Tensors.
-// batch_index: If out_tensors is non-empty, has information to invert it.
-// container: Controls the scope of sharing of this batch.
-// id: always contains a scalar with a unique ID for this invocation of Batch.
-// shared_name: Concurrently running instances of batch in the same device with the
-//  same container and shared_name will batch their elements together. If left
-//  empty, the op name will be used as the shared name.
-// T: the types of tensors to be batched.
-func Batch(scope *Scope, in_tensors []tf.Output, num_batch_threads int64, max_batch_size int64, batch_timeout_micros int64, grad_timeout_micros int64, optional ...BatchAttr) (batched_tensors []tf.Output, batch_index tf.Output, id tf.Output) {
+// Returns Handle to a table.
+func MutableHashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_batch_threads": num_batch_threads, "max_batch_size": max_batch_size, "batch_timeout_micros": batch_timeout_micros, "grad_timeout_micros": grad_timeout_micros}
+	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Batch",
-		Input: []tf.Input{
-			tf.OutputList(in_tensors),
-		},
+		Type: "MutableHashTableV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if batched_tensors, idx, err = makeOutputList(op, idx, "batched_tensors"); err != nil {
-		scope.UpdateErr("Batch", err)
-		return
+	return op.Output(0)
+}
+
+// DequantizeAttr is an optional argument to Dequantize.
+type DequantizeAttr func(optionalAttr)
+
+// DequantizeMode sets the optional mode attribute to value.
+// If not specified, defaults to "MIN_COMBINED"
+func DequantizeMode(value string) DequantizeAttr {
+	return func(m optionalAttr) {
+		m["mode"] = value
 	}
-	batch_index = op.Output(idx)
-	id = op.Output(idx)
-	return batched_tensors, batch_index, id
 }
 
-// Adjust the hue of one or more images.
+// Dequantize the 'input' tensor into a float Tensor.
 //
-// `images` is a tensor of at least 3 dimensions.  The last dimension is
-// interpretted as channels, and must be three.
+// [min_range, max_range] are scalar floats that specify the range for
+// the 'input' data. The 'mode' attribute controls exactly which calculations are
+// used to convert the float values to their quantized equivalents.
 //
-// The input image is considered in the RGB colorspace. Conceptually, the RGB
-// colors are first mapped into HSV. A delta is then applied all the hue values,
-// and then remapped back to RGB colorspace.
+// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
+//
+// ```
+// if T == qint8: in[i] += (range(T) + 1)/ 2.0
+// out[i] = min_range + (in[i]* (max_range - min_range) / range(T))
+// ```
+// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
+//
+// *MIN_COMBINED Mode Example*
+//
+// If the input comes from a QuantizedRelu6, the output type is
+// quint8 (range of 0-255) but the possible range of QuantizedRelu6 is
+// 0-6.  The min_range and max_range values are therefore 0.0 and 6.0.
+// Dequantize on quint8 will take each value, cast to float, and multiply
+// by 6 / 255.
+// Note that if quantizedtype is qint8, the operation will additionally add
+// each value by 128 prior to casting.
+//
+// If the mode is 'MIN_FIRST', then this approach is used:
+//
+// ```c++
+// num_discrete_values = 1 << (# of bits in T)
+// range_adjust = num_discrete_values / (num_discrete_values - 1)
+// range = (range_max - range_min) * range_adjust
+// range_scale = range / num_discrete_values
+// const double offset_input = static_cast<double>(input) - lowest_quantized;
+// result = range_min + ((input - numeric_limits<T>::min()) * range_scale)
+// ```
+//
+// *SCALED mode Example*
+//
+// `SCALED` mode matches the quantization approach used in
+// `QuantizeAndDequantize{V2|V3}`.
+//
+// If the mode is `SCALED`, we do not use the full range of the output type,
+// choosing to elide the lowest possible value for symmetry (e.g., output range is
+// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
+// 0.
+//
+// We first find the range of values in our tensor. The
+// range we use is always centered on 0, so we find m such that
+// ```c++
+//   m = max(abs(input_min), abs(input_max))
+// ```
+//
+// Our input tensor range is then `[-m, m]`.
+//
+// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
+// If T is signed, this is
+// ```
+//   num_bits = sizeof(T) * 8
+//   [min_fixed, max_fixed] =
+//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
+// ```
+//
+// Otherwise, if T is unsigned, the fixed-point range is
+// ```
+//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
+// ```
+//
+// From this we compute our scaling factor, s:
+// ```c++
+//   s = (2 * m) / (max_fixed - min_fixed)
+// ```
+//
+// Now we can dequantize the elements of our tensor:
+// ```c++
+// result = input * s
+// ```
 //
 // Arguments:
-//	images: Images to adjust.  At least 3-D.
-//	delta: A float delta to add to the hue.
 //
-// Returns The hue-adjusted image or images.
-func AdjustHue(scope *Scope, images tf.Output, delta tf.Output) (output tf.Output) {
+//	min_range: The minimum scalar value possibly produced for the input.
+//	max_range: The maximum scalar value possibly produced for the input.
+func Dequantize(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, optional ...DequantizeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "AdjustHue",
+		Type: "Dequantize",
 		Input: []tf.Input{
-			images, delta,
+			input, min_range, max_range,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyAdamAttr is an optional argument to ResourceApplyAdam.
-type ResourceApplyAdamAttr func(optionalAttr)
-
-// ResourceApplyAdamUseLocking sets the optional use_locking attribute to value.
+// Flips all bits elementwise.
 //
-// value: If `True`, updating of the var, m, and v tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyAdamUseLocking(value bool) ResourceApplyAdamAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
+// The result will have exactly those bits set, that are not set in `x`. The
+// computation is performed on the underlying representation of x.
+func Invert(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// ResourceApplyAdamUseNesterov sets the optional use_nesterov attribute to value.
-//
-// value: If `True`, uses the nesterov update.
-// If not specified, defaults to false
-func ResourceApplyAdamUseNesterov(value bool) ResourceApplyAdamAttr {
-	return func(m optionalAttr) {
-		m["use_nesterov"] = value
+	opspec := tf.OpSpec{
+		Type: "Invert",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Update '*var' according to the Adam algorithm.
+// Deserialize bucket boundaries and ready flag into current QuantileAccumulator.
 //
-// $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
-// $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
-// $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
-// $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
+// An op that deserializes bucket boundaries and are boundaries ready flag into current QuantileAccumulator.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	v: Should be from a Variable().
-//	beta1_power: Must be a scalar.
-//	beta2_power: Must be a scalar.
-//	lr: Scaling factor. Must be a scalar.
-//	beta1: Momentum factor. Must be a scalar.
-//	beta2: Momentum factor. Must be a scalar.
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
+//	quantile_stream_resource_handle: resource handle referring to a QuantileStreamResource.
+//	bucket_boundaries: float; List of Rank 1 Tensors each containing the bucket boundaries for a feature.
 //
 // Returns the created operation.
-func ResourceApplyAdam(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, beta2_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdamAttr) (o *tf.Operation) {
+func BoostedTreesQuantileStreamResourceDeserialize(scope *Scope, quantile_stream_resource_handle tf.Output, bucket_boundaries []tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdam",
+		Type: "BoostedTreesQuantileStreamResourceDeserialize",
 		Input: []tf.Input{
-			var_, m, v, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad,
+			quantile_stream_resource_handle, tf.OutputList(bucket_boundaries),
 		},
-		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Store the input tensor in the state of the current session.
+// Inverse 3D fast Fourier transform.
+//
+// Computes the inverse 3-dimensional discrete Fourier transform over the
+// inner-most 3 dimensions of `input`.
 //
 // Arguments:
-//	value: The tensor to be stored.
+//	input: A complex64 tensor.
 //
-// Returns The handle for the tensor stored in the session state, represented
-// as a ResourceHandle object.
-func GetSessionHandleV2(scope *Scope, value tf.Output) (handle tf.Output) {
+// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
+//   dimensions of `input` are replaced with their inverse 3D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.ifftn with 3 dimensions.
+// @end_compatibility
+func IFFT3D(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "GetSessionHandleV2",
+		Type: "IFFT3D",
 		Input: []tf.Input{
-			value,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResizeBicubicGradAttr is an optional argument to ResizeBicubicGrad.
-type ResizeBicubicGradAttr func(optionalAttr)
+// Shuts down a running distributed TPU system.
+//
+// The op returns an error if no system is running.
+//
+// Returns the created operation.
+func ShutdownDistributedTPU(scope *Scope) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ShutdownDistributedTPU",
+	}
+	return scope.AddOperation(opspec)
+}
 
-// ResizeBicubicGradAlignCorners sets the optional align_corners attribute to value.
+// Deprecated. Disallowed in GraphDef version >= 2.
 //
-// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
-// aligned. Defaults to false.
-// If not specified, defaults to false
-func ResizeBicubicGradAlignCorners(value bool) ResizeBicubicGradAttr {
-	return func(m optionalAttr) {
-		m["align_corners"] = value
+// DEPRECATED at GraphDef version 2: Use AdjustContrastv2 instead
+func AdjustContrast(scope *Scope, images tf.Output, contrast_factor tf.Output, min_value tf.Output, max_value tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "AdjustContrast",
+		Input: []tf.Input{
+			images, contrast_factor, min_value, max_value,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes the gradient of bicubic interpolation.
+// Table initializer that takes two tensors for keys and values respectively.
 //
 // Arguments:
-//	grads: 4-D with shape `[batch, height, width, channels]`.
-//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
-// The image tensor that was resized.
+//	table_handle: Handle to a table which will be initialized.
+//	keys: Keys of type Tkey.
+//	values: Values of type Tval.
 //
-// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
-// Gradients with respect to the input image. Input image must have been
-// float or double.
-func ResizeBicubicGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBicubicGradAttr) (output tf.Output) {
+// Returns the created operation.
+func InitializeTableV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResizeBicubicGrad",
+		Type: "InitializeTableV2",
 		Input: []tf.Input{
-			grads, original_image,
+			table_handle, keys, values,
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
+}
+
+// PrintAttr is an optional argument to Print.
+type PrintAttr func(optionalAttr)
+
+// PrintMessage sets the optional message attribute to value.
+//
+// value: A string, prefix of the error message.
+// If not specified, defaults to ""
+func PrintMessage(value string) PrintAttr {
+	return func(m optionalAttr) {
+		m["message"] = value
+	}
 }
 
-// ResizeNearestNeighborAttr is an optional argument to ResizeNearestNeighbor.
-type ResizeNearestNeighborAttr func(optionalAttr)
+// PrintFirstN sets the optional first_n attribute to value.
+//
+// value: Only log `first_n` number of times. -1 disables logging.
+// If not specified, defaults to -1
+func PrintFirstN(value int64) PrintAttr {
+	return func(m optionalAttr) {
+		m["first_n"] = value
+	}
+}
 
-// ResizeNearestNeighborAlignCorners sets the optional align_corners attribute to value.
+// PrintSummarize sets the optional summarize attribute to value.
 //
-// value: If true, the centers of the 4 corner pixels of the input and output tensors are
-// aligned, preserving the values at the corner pixels. Defaults to false.
-// If not specified, defaults to false
-func ResizeNearestNeighborAlignCorners(value bool) ResizeNearestNeighborAttr {
+// value: Only print this many entries of each tensor.
+// If not specified, defaults to 3
+func PrintSummarize(value int64) PrintAttr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["summarize"] = value
 	}
 }
 
-// Resize `images` to `size` using nearest neighbor interpolation.
+// Prints a list of tensors.
+//
+// Passes `input` through to `output` and prints `data` when evaluating.
 //
 // Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
+//	input: The tensor passed to `output`
+//	data: A list of tensors to print out when op is evaluated.
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeNearestNeighbor(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeNearestNeighborAttr) (resized_images tf.Output) {
+// Returns = The unmodified `input` tensor
+func Print(scope *Scope, input tf.Output, data []tf.Output, optional ...PrintAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -28927,9 +33849,9 @@ func ResizeNearestNeighbor(scope *Scope, images tf.Output, size tf.Output, optio
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeNearestNeighbor",
+		Type: "Print",
 		Input: []tf.Input{
-			images, size,
+			input, tf.OutputList(data),
 		},
 		Attrs: attrs,
 	}
@@ -28937,82 +33859,44 @@ func ResizeNearestNeighbor(scope *Scope, images tf.Output, size tf.Output, optio
 	return op.Output(0)
 }
 
-// ResizeNearestNeighborGradAttr is an optional argument to ResizeNearestNeighborGrad.
-type ResizeNearestNeighborGradAttr func(optionalAttr)
-
-// ResizeNearestNeighborGradAlignCorners sets the optional align_corners attribute to value.
-//
-// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
-// aligned. Defaults to false.
-// If not specified, defaults to false
-func ResizeNearestNeighborGradAlignCorners(value bool) ResizeNearestNeighborGradAttr {
-	return func(m optionalAttr) {
-		m["align_corners"] = value
-	}
-}
-
-// Computes the gradient of nearest neighbor interpolation.
+// Outputs a `Summary` protocol buffer with a tensor and per-plugin data.
 //
 // Arguments:
-//	grads: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `orig_height, orig_width`. The
-// original input size.
-//
-// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`. Gradients
-// with respect to the input image.
-func ResizeNearestNeighborGrad(scope *Scope, grads tf.Output, size tf.Output, optional ...ResizeNearestNeighborGradAttr) (output tf.Output) {
+//	tag: A string attached to this summary. Used for organization in TensorBoard.
+//	tensor: A tensor to serialize.
+//	serialized_summary_metadata: A serialized SummaryMetadata proto. Contains plugin
+// data.
+func TensorSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, serialized_summary_metadata tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResizeNearestNeighborGrad",
+		Type: "TensorSummaryV2",
 		Input: []tf.Input{
-			grads, size,
+			tag, tensor, serialized_summary_metadata,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ExtractJpegShapeAttr is an optional argument to ExtractJpegShape.
-type ExtractJpegShapeAttr func(optionalAttr)
-
-// ExtractJpegShapeOutputType sets the optional output_type attribute to value.
+// Creates a dataset that asynchronously prefetches elements from `input_dataset`.
 //
-// value: (Optional) The output type of the operation (int32 or int64).
-// Defaults to int32.
-// If not specified, defaults to DT_INT32
-func ExtractJpegShapeOutputType(value tf.DataType) ExtractJpegShapeAttr {
-	return func(m optionalAttr) {
-		m["output_type"] = value
-	}
-}
-
-// Extract the shape information of a JPEG-encoded image.
+// Arguments:
 //
-// This op only parses the image header, so it is much faster than DecodeJpeg.
+//	buffer_size: The maximum number of elements to buffer in an iterator over
+// this dataset.
 //
-// Arguments:
-//	contents: 0-D. The JPEG-encoded image.
 //
-// Returns 1-D. The image shape with format [height, width, channels].
-func ExtractJpegShape(scope *Scope, contents tf.Output, optional ...ExtractJpegShapeAttr) (image_shape tf.Output) {
+func PrefetchDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ExtractJpegShape",
+		Type: "PrefetchDataset",
 		Input: []tf.Input{
-			contents,
+			input_dataset, buffer_size,
 		},
 		Attrs: attrs,
 	}
@@ -29020,272 +33904,296 @@ func ExtractJpegShape(scope *Scope, contents tf.Output, optional ...ExtractJpegS
 	return op.Output(0)
 }
 
-// PaddingFIFOQueueV2Attr is an optional argument to PaddingFIFOQueueV2.
-type PaddingFIFOQueueV2Attr func(optionalAttr)
-
-// PaddingFIFOQueueV2Shapes sets the optional shapes attribute to value.
-//
-// value: The shape of each component in a value. The length of this attr must
-// be either 0 or the same as the length of component_types.
-// Shapes of fixed rank but variable size are allowed by setting
-// any shape dimension to -1.  In this case, the inputs' shape may vary along
-// the given dimension, and DequeueMany will pad the given dimension with
-// zeros up to the maximum shape of all elements in the given batch.
-// If the length of this attr is 0, different queue elements may have
-// different ranks and shapes, but only one element may be dequeued at a time.
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func PaddingFIFOQueueV2Shapes(value []tf.Shape) PaddingFIFOQueueV2Attr {
-	return func(m optionalAttr) {
-		m["shapes"] = value
-	}
-}
+// TensorSummaryAttr is an optional argument to TensorSummary.
+type TensorSummaryAttr func(optionalAttr)
 
-// PaddingFIFOQueueV2Capacity sets the optional capacity attribute to value.
+// TensorSummaryDescription sets the optional description attribute to value.
 //
-// value: The upper bound on the number of elements in this queue.
-// Negative numbers mean no limit.
-// If not specified, defaults to -1
-func PaddingFIFOQueueV2Capacity(value int64) PaddingFIFOQueueV2Attr {
+// value: A json-encoded SummaryDescription proto.
+// If not specified, defaults to ""
+func TensorSummaryDescription(value string) TensorSummaryAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["description"] = value
 	}
 }
 
-// PaddingFIFOQueueV2Container sets the optional container attribute to value.
+// TensorSummaryLabels sets the optional labels attribute to value.
 //
-// value: If non-empty, this queue is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func PaddingFIFOQueueV2Container(value string) PaddingFIFOQueueV2Attr {
+// value: An unused list of strings.
+// If not specified, defaults to <>
+func TensorSummaryLabels(value []string) TensorSummaryAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["labels"] = value
 	}
 }
 
-// PaddingFIFOQueueV2SharedName sets the optional shared_name attribute to value.
+// TensorSummaryDisplayName sets the optional display_name attribute to value.
 //
-// value: If non-empty, this queue will be shared under the given name
-// across multiple sessions.
+// value: An unused string.
 // If not specified, defaults to ""
-func PaddingFIFOQueueV2SharedName(value string) PaddingFIFOQueueV2Attr {
+func TensorSummaryDisplayName(value string) TensorSummaryAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["display_name"] = value
 	}
 }
 
-// A queue that produces elements in first-in first-out order.
+// Outputs a `Summary` protocol buffer with a tensor.
 //
-// Variable-size shapes are allowed by setting the corresponding shape dimensions
-// to 0 in the shape attr.  In this case DequeueMany will pad up to the maximum
-// size of any given element in the minibatch.  See below for details.
+// This op is being phased out in favor of TensorSummaryV2, which lets callers pass
+// a tag as well as a serialized SummaryMetadata proto string that contains
+// plugin-specific data. We will keep this op to maintain backwards compatibility.
 //
 // Arguments:
-//	component_types: The type of each component in a value.
-//
-// Returns The handle to the queue.
-func PaddingFIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...PaddingFIFOQueueV2Attr) (handle tf.Output) {
+//	tensor: A tensor to serialize.
+func TensorSummary(scope *Scope, tensor tf.Output, optional ...TensorSummaryAttr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "PaddingFIFOQueueV2",
-
+		Type: "TensorSummary",
+		Input: []tf.Input{
+			tensor,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DecodePngAttr is an optional argument to DecodePng.
-type DecodePngAttr func(optionalAttr)
-
-// DecodePngChannels sets the optional channels attribute to value.
+// Read an element from the TensorArray into output `value`.
 //
-// value: Number of color channels for the decoded image.
-// If not specified, defaults to 0
-func DecodePngChannels(value int64) DecodePngAttr {
-	return func(m optionalAttr) {
-		m["channels"] = value
+// Arguments:
+//	handle: The handle to a TensorArray.
+//
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	dtype: The type of the elem that is returned.
+//
+// Returns The tensor that is read from the TensorArray.
+func TensorArrayReadV3(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// DecodePngDtype sets the optional dtype attribute to value.
-// If not specified, defaults to DT_UINT8
-func DecodePngDtype(value tf.DataType) DecodePngAttr {
-	return func(m optionalAttr) {
-		m["dtype"] = value
+	attrs := map[string]interface{}{"dtype": dtype}
+	opspec := tf.OpSpec{
+		Type: "TensorArrayReadV3",
+		Input: []tf.Input{
+			handle, index, flow_in,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Decode a PNG-encoded image to a uint8 or uint16 tensor.
+// Reduces sparse updates into the variable referenced by `resource` using the `max` operation.
 //
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
+// This operation computes
 //
-// Accepted values are:
+//     # Scalar indices
+//     ref[indices, ...] = max(ref[indices, ...], updates[...])
 //
-// *   0: Use the number of channels in the PNG-encoded image.
-// *   1: output a grayscale image.
-// *   3: output an RGB image.
-// *   4: output an RGBA image.
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] = max(ref[indices[i], ...], updates[i, ...])
 //
-// If needed, the PNG-encoded image is transformed to match the requested number
-// of color channels.
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] = max(ref[indices[i, ..., j], ...], updates[i, ..., j, ...])
 //
-// This op also supports decoding JPEGs and non-animated GIFs since the interface
-// is the same, though it is cleaner to use `tf.image.decode_image`.
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions are combined.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
 //
 // Arguments:
-//	contents: 0-D.  The PNG-encoded image.
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
 //
-// Returns 3-D with shape `[height, width, channels]`.
-func DecodePng(scope *Scope, contents tf.Output, optional ...DecodePngAttr) (image tf.Output) {
+// Returns the created operation.
+func ResourceScatterMax(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterMax",
+		Input: []tf.Input{
+			resource, indices, updates,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Computes the gradient for the tanh of `x` wrt its input.
+//
+// Specifically, `grad = dy * (1 - y*y)`, where `y = tanh(x)`, and `dy`
+// is the corresponding input gradient.
+func TanhGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodePng",
+		Type: "TanhGrad",
 		Input: []tf.Input{
-			contents,
+			y, dy,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Decode the first frame of a GIF-encoded image to a uint8 tensor.
-//
-// GIF with frame or transparency compression are not supported
-// convert animated GIF from compressed to uncompressed by:
-//
-//     convert $src.gif -coalesce $dst.gif
+// Outputs a `Summary` protocol buffer with scalar values.
 //
-// This op also supports decoding JPEGs and PNGs, though it is cleaner to use
-// `tf.image.decode_image`.
+// The input `tags` and `values` must have the same shape.  The generated summary
+// has a summary value for each tag-value pair in `tags` and `values`.
 //
 // Arguments:
-//	contents: 0-D.  The GIF-encoded image.
+//	tags: Tags for the summary.
+//	values: Same shape as `tags.  Values for the summary.
 //
-// Returns 4-D with shape `[num_frames, height, width, 3]`. RGB order
-func DecodeGif(scope *Scope, contents tf.Output) (image tf.Output) {
+// Returns Scalar.  Serialized `Summary` protocol buffer.
+func ScalarSummary(scope *Scope, tags tf.Output, values tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeGif",
+		Type: "ScalarSummary",
 		Input: []tf.Input{
-			contents,
+			tags, values,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// LearnedUnigramCandidateSamplerAttr is an optional argument to LearnedUnigramCandidateSampler.
-type LearnedUnigramCandidateSamplerAttr func(optionalAttr)
+// ImageSummaryAttr is an optional argument to ImageSummary.
+type ImageSummaryAttr func(optionalAttr)
 
-// LearnedUnigramCandidateSamplerSeed sets the optional seed attribute to value.
+// ImageSummaryMaxImages sets the optional max_images attribute to value.
 //
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func LearnedUnigramCandidateSamplerSeed(value int64) LearnedUnigramCandidateSamplerAttr {
+// value: Max number of batch elements to generate images for.
+// If not specified, defaults to 3
+//
+// REQUIRES: value >= 1
+func ImageSummaryMaxImages(value int64) ImageSummaryAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["max_images"] = value
 	}
 }
 
-// LearnedUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+// ImageSummaryBadColor sets the optional bad_color attribute to value.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func LearnedUnigramCandidateSamplerSeed2(value int64) LearnedUnigramCandidateSamplerAttr {
+// value: Color to use for pixels with non-finite values.
+// If not specified, defaults to <dtype:DT_UINT8 tensor_shape:<dim:<size:4 > > int_val:255 int_val:0 int_val:0 int_val:255 >
+func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["bad_color"] = value
 	}
 }
 
-// Generates labels for candidate sampling with a learned unigram distribution.
+// Outputs a `Summary` protocol buffer with images.
 //
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
+// The summary has up to `max_images` summary values containing images. The
+// images are built from `tensor` which must be 4-D with shape `[batch_size,
+// height, width, channels]` and where `channels` can be:
 //
-// For each batch, this op picks a single set of sampled candidate labels.
+// *  1: `tensor` is interpreted as Grayscale.
+// *  3: `tensor` is interpreted as RGB.
+// *  4: `tensor` is interpreted as RGBA.
 //
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
+// The images have the same number of channels as the input tensor. For float
+// input, the values are normalized one image at a time to fit in the range
+// `[0, 255]`.  `uint8` values are unchanged.  The op uses two different
+// normalization algorithms:
+//
+// *  If the input values are all positive, they are rescaled so the largest one
+//    is 255.
+//
+// *  If any input value is negative, the values are shifted so input value 0.0
+//    is at 127.  They are then rescaled so that either the smallest value is 0,
+//    or the largest one is 255.
+//
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
+//
+// *  If `max_images` is 1, the summary value tag is '*tag*/image'.
+// *  If `max_images` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
+//
+// The `bad_color` argument is the color to use in the generated images for
+// non-finite input values.  It is a `uint8` 1-D tensor of length `channels`.
+// Each element must be in the range `[0, 255]` (It represents the value of a
+// pixel in the output image).  Non-finite values in the input tensor are
+// replaced by this tensor in the output image.  The default value is the color
+// red.
 //
 // Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 4-D of shape `[batch_size, height, width, channels]` where
+// `channels` is 1, 3, or 4.
 //
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func LearnedUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LearnedUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...ImageSummaryAttr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LearnedUnigramCandidateSampler",
+		Type: "ImageSummary",
 		Input: []tf.Input{
-			true_classes,
+			tag, tensor,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// SerializeSparseAttr is an optional argument to SerializeSparse.
-type SerializeSparseAttr func(optionalAttr)
+// AudioSummaryV2Attr is an optional argument to AudioSummaryV2.
+type AudioSummaryV2Attr func(optionalAttr)
 
-// SerializeSparseOutType sets the optional out_type attribute to value.
+// AudioSummaryV2MaxOutputs sets the optional max_outputs attribute to value.
 //
-// value: The `dtype` to use for serialization; the supported types are `string`
-// (default) and `variant`.
-// If not specified, defaults to DT_STRING
-func SerializeSparseOutType(value tf.DataType) SerializeSparseAttr {
+// value: Max number of batch elements to generate audio for.
+// If not specified, defaults to 3
+//
+// REQUIRES: value >= 1
+func AudioSummaryV2MaxOutputs(value int64) AudioSummaryV2Attr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["max_outputs"] = value
 	}
 }
 
-// Serialize a `SparseTensor` into a `[3]` `Tensor` object.
+// Outputs a `Summary` protocol buffer with audio.
+//
+// The summary has up to `max_outputs` summary values containing audio. The
+// audio is built from `tensor` which must be 3-D with shape `[batch_size,
+// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
+// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
+//
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
+//
+// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
+// *  If `max_outputs` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
 //
 // Arguments:
-//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
-//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
-func SerializeSparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeSparseAttr) (serialized_sparse tf.Output) {
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 2-D of shape `[batch_size, frames]`.
+//	sample_rate: The sample rate of the signal in hertz.
+//
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func AudioSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate tf.Output, optional ...AudioSummaryV2Attr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -29294,405 +34202,256 @@ func SerializeSparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Ou
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SerializeSparse",
+		Type: "AudioSummaryV2",
 		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
+			tag, tensor, sample_rate,
 		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// RandomShuffleQueueV2Attr is an optional argument to RandomShuffleQueueV2.
-type RandomShuffleQueueV2Attr func(optionalAttr)
-
-// RandomShuffleQueueV2Shapes sets the optional shapes attribute to value.
-//
-// value: The shape of each component in a value. The length of this attr must
-// be either 0 or the same as the length of component_types. If the length of
-// this attr is 0, the shapes of queue elements are not constrained, and
-// only one element may be dequeued at a time.
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func RandomShuffleQueueV2Shapes(value []tf.Shape) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["shapes"] = value
-	}
-}
-
-// RandomShuffleQueueV2Capacity sets the optional capacity attribute to value.
-//
-// value: The upper bound on the number of elements in this queue.
-// Negative numbers mean no limit.
-// If not specified, defaults to -1
-func RandomShuffleQueueV2Capacity(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// RandomShuffleQueueV2MinAfterDequeue sets the optional min_after_dequeue attribute to value.
-//
-// value: Dequeue will block unless there would be this
-// many elements after the dequeue or the queue is closed. This
-// ensures a minimum level of mixing of elements.
-// If not specified, defaults to 0
-func RandomShuffleQueueV2MinAfterDequeue(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["min_after_dequeue"] = value
-	}
-}
-
-// RandomShuffleQueueV2Seed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 is set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, a random seed is used.
-// If not specified, defaults to 0
-func RandomShuffleQueueV2Seed(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["seed"] = value
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// RandomShuffleQueueV2Seed2 sets the optional seed2 attribute to value.
+// Splits a tensor into a list.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomShuffleQueueV2Seed2(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// RandomShuffleQueueV2Container sets the optional container attribute to value.
+// list[i] corresponds to lengths[i] tensors from the input tensor.
+// The tensor must have rank at least 1 and contain exactly sum(lengths) elements.
 //
-// value: If non-empty, this queue is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func RandomShuffleQueueV2Container(value string) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
+// tensor: The input tensor.
+// element_shape: A shape compatible with that of elements in the tensor.
+// lengths: Vector of sizes of the 0th dimension of tensors in the list.
+// output_handle: The list.
+func TensorListSplit(scope *Scope, tensor tf.Output, element_shape tf.Output, lengths tf.Output) (output_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorListSplit",
+		Input: []tf.Input{
+			tensor, element_shape, lengths,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// RandomShuffleQueueV2SharedName sets the optional shared_name attribute to value.
+// AvgPoolAttr is an optional argument to AvgPool.
+type AvgPoolAttr func(optionalAttr)
+
+// AvgPoolDataFormat sets the optional data_format attribute to value.
 //
-// value: If non-empty, this queue will be shared under the given name
-// across multiple sessions.
-// If not specified, defaults to ""
-func RandomShuffleQueueV2SharedName(value string) RandomShuffleQueueV2Attr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func AvgPoolDataFormat(value string) AvgPoolAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["data_format"] = value
 	}
 }
 
-// A queue that randomizes the order of elements.
+// Performs average pooling on the input.
+//
+// Each entry in `output` is the mean of the corresponding size `ksize`
+// window in `value`.
 //
 // Arguments:
-//	component_types: The type of each component in a value.
+//	value: 4-D with shape `[batch, height, width, channels]`.
+//	ksize: The size of the sliding window for each dimension of `value`.
+//	strides: The stride of the sliding window for each dimension of `value`.
+//	padding: The type of padding algorithm to use.
 //
-// Returns The handle to the queue.
-func RandomShuffleQueueV2(scope *Scope, component_types []tf.DataType, optional ...RandomShuffleQueueV2Attr) (handle tf.Output) {
+// Returns The average pooled output tensor.
+func AvgPool(scope *Scope, value tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomShuffleQueueV2",
-
+		Type: "AvgPool",
+		Input: []tf.Input{
+			value,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Draw bounding boxes on a batch of images.
-//
-// Outputs a copy of `images` but draws on top of the pixels zero or more bounding
-// boxes specified by the locations in `boxes`. The coordinates of the each
-// bounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The
-// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-// height of the underlying image.
+// Merges summaries.
 //
-// For example, if an image is 100 x 200 pixels (height x width) and the bounding
-// box is `[0.1, 0.2, 0.5, 0.9]`, the upper-left and bottom-right coordinates of
-// the bounding box will be `(40, 10)` to `(180, 50)` (in (x,y) coordinates).
+// This op creates a
+// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
+// protocol buffer that contains the union of all the values in the input
+// summaries.
 //
-// Parts of the bounding box may fall outside the image.
+// When the Op is run, it reports an `InvalidArgument` error if multiple values
+// in the summaries to merge use the same tag.
 //
 // Arguments:
-//	images: 4-D with shape `[batch, height, width, depth]`. A batch of images.
-//	boxes: 3-D with shape `[batch, num_bounding_boxes, 4]` containing bounding
-// boxes.
+//	inputs: Can be of any shape.  Each must contain serialized `Summary` protocol
+// buffers.
 //
-// Returns 4-D with the same shape as `images`. The batch of input images with
-// bounding boxes drawn on the images.
-func DrawBoundingBoxes(scope *Scope, images tf.Output, boxes tf.Output) (output tf.Output) {
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func MergeSummary(scope *Scope, inputs []tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "DrawBoundingBoxes",
+		Type: "MergeSummary",
 		Input: []tf.Input{
-			images, boxes,
+			tf.OutputList(inputs),
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Gets the next output from the given iterator.
+// The shape of the elements of the given list, as a tensor.
 //
-// This operation is a synchronous version IteratorGetNext. It should only be used
-// in situations where the iterator does not block the calling thread, or where
-// the calling thread is not a member of the thread pool used to execute parallel
-// operations (e.g. in eager mode).
-func IteratorGetNextSync(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
+//   input_handle: the list
+//   element_shape: the shape of elements of the list
+func TensorListElementShape(scope *Scope, input_handle tf.Output, shape_type tf.DataType) (element_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"shape_type": shape_type}
 	opspec := tf.OpSpec{
-		Type: "IteratorGetNextSync",
+		Type: "TensorListElementShape",
 		Input: []tf.Input{
-			iterator,
+			input_handle,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("IteratorGetNextSync", err)
-		return
-	}
-	return components
+	return op.Output(0)
 }
 
-// SampleDistortedBoundingBoxV2Attr is an optional argument to SampleDistortedBoundingBoxV2.
-type SampleDistortedBoundingBoxV2Attr func(optionalAttr)
-
-// SampleDistortedBoundingBoxV2Seed sets the optional seed attribute to value.
+// Returns the item in the list with the given index.
 //
-// value: If either `seed` or `seed2` are set to non-zero, the random number
-// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
-// seed.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxV2Seed(value int64) SampleDistortedBoundingBoxV2Attr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxV2Seed2 sets the optional seed2 attribute to value.
+// input_handle: the list
+// index: the position in the list from which an element will be retrieved
+// item: the element at that position
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2Attr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxV2AspectRatioRange sets the optional aspect_ratio_range attribute to value.
 //
-// value: The cropped area of the image must have an aspect ratio =
-// width / height within this range.
-// If not specified, defaults to <f:0.75 f:1.33 >
-func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr {
-	return func(m optionalAttr) {
-		m["aspect_ratio_range"] = value
+func TensorListGetItem(scope *Scope, input_handle tf.Output, index tf.Output, element_shape tf.Output, element_dtype tf.DataType) (item tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// SampleDistortedBoundingBoxV2AreaRange sets the optional area_range attribute to value.
-//
-// value: The cropped area of the image must contain a fraction of the
-// supplied image within this range.
-// If not specified, defaults to <f:0.05 f:1 >
-func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr {
-	return func(m optionalAttr) {
-		m["area_range"] = value
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
+	opspec := tf.OpSpec{
+		Type: "TensorListGetItem",
+		Input: []tf.Input{
+			input_handle, index, element_shape,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// SampleDistortedBoundingBoxV2MaxAttempts sets the optional max_attempts attribute to value.
+// Resizes the list.
 //
-// value: Number of attempts at generating a cropped region of the image
-// of the specified constraints. After `max_attempts` failures, return the entire
-// image.
-// If not specified, defaults to 100
-func SampleDistortedBoundingBoxV2MaxAttempts(value int64) SampleDistortedBoundingBoxV2Attr {
-	return func(m optionalAttr) {
-		m["max_attempts"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxV2UseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
 //
-// value: Controls behavior if no bounding boxes supplied.
-// If true, assume an implicit bounding box covering the whole input. If false,
-// raise an error.
-// If not specified, defaults to false
-func SampleDistortedBoundingBoxV2UseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxV2Attr {
-	return func(m optionalAttr) {
-		m["use_image_if_no_bounding_boxes"] = value
+// input_handle: the input list
+// size: size of the output list
+//
+func TensorListResize(scope *Scope, input_handle tf.Output, size tf.Output) (output_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorListResize",
+		Input: []tf.Input{
+			input_handle, size,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Generate a single randomly distorted bounding box for an image.
-//
-// Bounding box annotations are often supplied in addition to ground-truth labels
-// in image recognition or object localization tasks. A common technique for
-// training such a system is to randomly distort an image while preserving
-// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
-// localization of an object, i.e. bounding box, given an `image_size`,
-// `bounding_boxes` and a series of constraints.
-//
-// The output of this Op is a single bounding box that may be used to crop the
-// original image. The output is returned as 3 tensors: `begin`, `size` and
-// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
-// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
-// what the bounding box looks like.
+// Returns a diagonal tensor with a given diagonal values.
 //
-// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
-// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-// height of the underlying image.
+// Given a `diagonal`, this operation returns a tensor with the `diagonal` and
+// everything else padded with zeros. The diagonal is computed as follows:
 //
-// For example,
+// Assume `diagonal` has dimensions [D1,..., Dk], then the output is a tensor of
+// rank 2k with dimensions [D1,..., Dk, D1,..., Dk] where:
 //
-// ```python
-//     # Generate a single distorted bounding box.
-//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
-//         tf.shape(image),
-//         bounding_boxes=bounding_boxes)
+// `output[i1,..., ik, i1,..., ik] = diagonal[i1, ..., ik]` and 0 everywhere else.
 //
-//     # Draw the bounding box in an image summary.
-//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
-//                                                   bbox_for_draw)
-//     tf.summary.image('images_with_box', image_with_box)
+// For example:
 //
-//     # Employ the bounding box to distort the image.
-//     distorted_image = tf.slice(image, begin, size)
 // ```
-//
-// Note that if no bounding box information is available, setting
-// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
-// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
-// false and no bounding boxes are supplied, an error is raised.
+// # 'diagonal' is [1, 2, 3, 4]
+// tf.diag(diagonal) ==> [[1, 0, 0, 0]
+//                        [0, 2, 0, 0]
+//                        [0, 0, 3, 0]
+//                        [0, 0, 0, 4]]
+// ```
 //
 // Arguments:
-//	image_size: 1-D, containing `[height, width, channels]`.
-//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
-// associated with the image.
-//	min_object_covered: The cropped area of the image must contain at least this
-// fraction of any bounding box supplied. The value of this parameter should be
-// non-negative. In the case of 0, the cropped area does not need to overlap
-// any of the bounding boxes supplied.
-//
-// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
-// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
-// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
-// Provide as input to `tf.image.draw_bounding_boxes`.
-func SampleDistortedBoundingBoxV2(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, min_object_covered tf.Output, optional ...SampleDistortedBoundingBoxV2Attr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
+//	diagonal: Rank k tensor where k is at most 1.
+func Diag(scope *Scope, diagonal tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "SampleDistortedBoundingBoxV2",
+		Type: "Diag",
 		Input: []tf.Input{
-			image_size, bounding_boxes, min_object_covered,
+			diagonal,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// ExtractGlimpseAttr is an optional argument to ExtractGlimpse.
-type ExtractGlimpseAttr func(optionalAttr)
-
-// ExtractGlimpseCentered sets the optional centered attribute to value.
-//
-// value: indicates if the offset coordinates are centered relative to
-// the image, in which case the (0, 0) offset is relative to the center
-// of the input images. If false, the (0,0) offset corresponds to the
-// upper left corner of the input images.
-// If not specified, defaults to true
-func ExtractGlimpseCentered(value bool) ExtractGlimpseAttr {
-	return func(m optionalAttr) {
-		m["centered"] = value
-	}
-}
+// ParameterizedTruncatedNormalAttr is an optional argument to ParameterizedTruncatedNormal.
+type ParameterizedTruncatedNormalAttr func(optionalAttr)
 
-// ExtractGlimpseNormalized sets the optional normalized attribute to value.
+// ParameterizedTruncatedNormalSeed sets the optional seed attribute to value.
 //
-// value: indicates if the offset coordinates are normalized.
-// If not specified, defaults to true
-func ExtractGlimpseNormalized(value bool) ExtractGlimpseAttr {
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func ParameterizedTruncatedNormalSeed(value int64) ParameterizedTruncatedNormalAttr {
 	return func(m optionalAttr) {
-		m["normalized"] = value
+		m["seed"] = value
 	}
 }
 
-// ExtractGlimpseUniformNoise sets the optional uniform_noise attribute to value.
+// ParameterizedTruncatedNormalSeed2 sets the optional seed2 attribute to value.
 //
-// value: indicates if the noise should be generated using a
-// uniform distribution or a Gaussian distribution.
-// If not specified, defaults to true
-func ExtractGlimpseUniformNoise(value bool) ExtractGlimpseAttr {
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func ParameterizedTruncatedNormalSeed2(value int64) ParameterizedTruncatedNormalAttr {
 	return func(m optionalAttr) {
-		m["uniform_noise"] = value
+		m["seed2"] = value
 	}
 }
 
-// Extracts a glimpse from the input tensor.
-//
-// Returns a set of windows called glimpses extracted at location
-// `offsets` from the input tensor. If the windows only partially
-// overlaps the inputs, the non overlapping areas will be filled with
-// random noise.
-//
-// The result is a 4-D tensor of shape `[batch_size, glimpse_height,
-// glimpse_width, channels]`. The channels and batch dimensions are the
-// same as that of the input tensor. The height and width of the output
-// windows are specified in the `size` parameter.
-//
-// The argument `normalized` and `centered` controls how the windows are built:
+// Outputs random values from a normal distribution. The parameters may each be a
 //
-// * If the coordinates are normalized but not centered, 0.0 and 1.0
-//   correspond to the minimum and maximum of each height and width
-//   dimension.
-// * If the coordinates are both normalized and centered, they range from
-//   -1.0 to 1.0. The coordinates (-1.0, -1.0) correspond to the upper
-//   left corner, the lower right corner is located at (1.0, 1.0) and the
-//   center is at (0, 0).
-// * If the coordinates are not normalized they are interpreted as
-//   numbers of pixels.
+// scalar which applies to the entire output, or a vector of length shape[0] which
+// stores the parameters for each batch.
 //
 // Arguments:
-//	input: A 4-D float tensor of shape `[batch_size, height, width, channels]`.
-//	size: A 1-D tensor of 2 elements containing the size of the glimpses
-// to extract.  The glimpse height must be specified first, following
-// by the glimpse width.
-//	offsets: A 2-D integer tensor of shape `[batch_size, 2]` containing
-// the y, x locations of the center of each window.
+//	shape: The shape of the output tensor. Batches are indexed by the 0th dimension.
+//	means: The mean parameter of each batch.
+//	stdevs: The standard deviation parameter of each batch. Must be greater than 0.
+//	minvals: The minimum cutoff. May be -infinity.
+//	maxvals: The maximum cutoff. May be +infinity, and must be more than the minval
+// for each batch.
 //
-// Returns A tensor representing the glimpses `[batch_size,
-// glimpse_height, glimpse_width, channels]`.
-func ExtractGlimpse(scope *Scope, input tf.Output, size tf.Output, offsets tf.Output, optional ...ExtractGlimpseAttr) (glimpse tf.Output) {
+// Returns A matrix of shape num_batches x samples_per_batch, filled with random
+// truncated normal values using the parameters for each row.
+func ParameterizedTruncatedNormal(scope *Scope, shape tf.Output, means tf.Output, stdevs tf.Output, minvals tf.Output, maxvals tf.Output, optional ...ParameterizedTruncatedNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -29701,9 +34460,9 @@ func ExtractGlimpse(scope *Scope, input tf.Output, size tf.Output, offsets tf.Ou
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ExtractGlimpse",
+		Type: "ParameterizedTruncatedNormal",
 		Input: []tf.Input{
-			input, size, offsets,
+			shape, means, stdevs, minvals, maxvals,
 		},
 		Attrs: attrs,
 	}
@@ -29711,121 +34470,141 @@ func ExtractGlimpse(scope *Scope, input tf.Output, size tf.Output, offsets tf.Ou
 	return op.Output(0)
 }
 
-// A container for an iterator resource.
+// Sets the index-th position of the list to contain the given tensor.
 //
-// Returns A handle to the iterator that can be passed to a "MakeIterator"
-// or "IteratorGetNext" op.
-func Iterator(scope *Scope, shared_name string, container string, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// input_handle: the list
+// index: the position in the list to which the tensor will be assigned
+// item: the element to be assigned to that position
+// output_handle: the new list, with the element in the proper position
+//
+func TensorListSetItem(scope *Scope, input_handle tf.Output, index tf.Output, item tf.Output) (output_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"shared_name": shared_name, "container": container, "output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Iterator",
-
-		Attrs: attrs,
+		Type: "TensorListSetItem",
+		Input: []tf.Input{
+			input_handle, index, item,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// CropAndResizeGradImageAttr is an optional argument to CropAndResizeGradImage.
-type CropAndResizeGradImageAttr func(optionalAttr)
-
-// CropAndResizeGradImageMethod sets the optional method attribute to value.
+// Creates a TensorList by indexing into a Tensor.
 //
-// value: A string specifying the interpolation method. Only 'bilinear' is
-// supported for now.
-// If not specified, defaults to "bilinear"
-func CropAndResizeGradImageMethod(value string) CropAndResizeGradImageAttr {
-	return func(m optionalAttr) {
-		m["method"] = value
+// Each member of the TensorList corresponds to one row of the input tensor,
+// specified by the given index (see `tf.gather`).
+//
+// tensor: The input tensor.
+// indices: The indices used to index into the list.
+// element_shape: The shape of the elements in the list (can be less specified than
+//   the shape of the tensor).
+// output_handle: The TensorList.
+func TensorListScatter(scope *Scope, tensor tf.Output, indices tf.Output, element_shape tf.Output) (output_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorListScatter",
+		Input: []tf.Input{
+			tensor, indices, element_shape,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes the gradient of the crop_and_resize op wrt the input image tensor.
-//
-// Arguments:
-//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
-//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
-// specifies the coordinates of a box in the `box_ind[i]` image and is specified
-// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
-// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
-// `[0, 1]` interval of normalized image height is mapped to
-// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
-// which case the sampled crop is an up-down flipped version of the original
-// image. The width dimension is treated similarly. Normalized coordinates
-// outside the `[0, 1]` range are allowed, in which case we use
-// `extrapolation_value` to extrapolate the input image values.
-//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
-// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
-//	image_size: A 1-D tensor with value `[batch, image_height, image_width, depth]`
-// containing the original image size. Both `image_height` and `image_width` need
-// to be positive.
-//
+// Deprecated. Use TensorArrayScatterV3
 //
-// Returns A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
-func CropAndResizeGradImage(scope *Scope, grads tf.Output, boxes tf.Output, box_ind tf.Output, image_size tf.Output, T tf.DataType, optional ...CropAndResizeGradImageAttr) (output tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArrayScatterV3
+func TensorArrayScatterV2(scope *Scope, handle tf.Output, indices tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"T": T}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "CropAndResizeGradImage",
+		Type: "TensorArrayScatterV2",
 		Input: []tf.Input{
-			grads, boxes, box_ind, image_size,
+			handle, indices, value, flow_in,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ShuffleDatasetAttr is an optional argument to ShuffleDataset.
-type ShuffleDatasetAttr func(optionalAttr)
+// AsStringAttr is an optional argument to AsString.
+type AsStringAttr func(optionalAttr)
 
-// ShuffleDatasetReshuffleEachIteration sets the optional reshuffle_each_iteration attribute to value.
+// AsStringPrecision sets the optional precision attribute to value.
 //
-// value: If true, each iterator over this dataset will be given
-// a different pseudorandomly generated seed, based on a sequence seeded by the
-// `seed` and `seed2` inputs. If false, each iterator will be given the same
-// seed, and repeated iteration over this dataset will yield the exact same
-// sequence of results.
-// If not specified, defaults to true
-func ShuffleDatasetReshuffleEachIteration(value bool) ShuffleDatasetAttr {
+// value: The post-decimal precision to use for floating point numbers.
+// Only used if precision > -1.
+// If not specified, defaults to -1
+func AsStringPrecision(value int64) AsStringAttr {
+	return func(m optionalAttr) {
+		m["precision"] = value
+	}
+}
+
+// AsStringScientific sets the optional scientific attribute to value.
+//
+// value: Use scientific notation for floating point numbers.
+// If not specified, defaults to false
+func AsStringScientific(value bool) AsStringAttr {
+	return func(m optionalAttr) {
+		m["scientific"] = value
+	}
+}
+
+// AsStringShortest sets the optional shortest attribute to value.
+//
+// value: Use shortest representation (either scientific or standard) for
+// floating point numbers.
+// If not specified, defaults to false
+func AsStringShortest(value bool) AsStringAttr {
+	return func(m optionalAttr) {
+		m["shortest"] = value
+	}
+}
+
+// AsStringWidth sets the optional width attribute to value.
+//
+// value: Pad pre-decimal numbers to this width.
+// Applies to both floating point and integer numbers.
+// Only used if width > -1.
+// If not specified, defaults to -1
+func AsStringWidth(value int64) AsStringAttr {
+	return func(m optionalAttr) {
+		m["width"] = value
+	}
+}
+
+// AsStringFill sets the optional fill attribute to value.
+//
+// value: The value to pad if width > -1.  If empty, pads with spaces.
+// Another typical value is '0'.  String cannot be longer than 1 character.
+// If not specified, defaults to ""
+func AsStringFill(value string) AsStringAttr {
 	return func(m optionalAttr) {
-		m["reshuffle_each_iteration"] = value
+		m["fill"] = value
 	}
 }
 
-// Creates a dataset that shuffles elements from `input_dataset` pseudorandomly.
-//
-// Arguments:
-//
-//	buffer_size: The number of output elements to buffer in an iterator over
-// this dataset. Compare with the `min_after_dequeue` attr when creating a
-// `RandomShuffleQueue`.
-//	seed: A scalar seed for the random number generator. If either `seed` or
-// `seed2` is set to be non-zero, the random number generator is seeded
-// by the given seed.  Otherwise, a random seed is used.
-//	seed2: A second scalar seed to avoid seed collision.
-//
+// Converts each entry in the given tensor to strings.  Supports many numeric
 //
-func ShuffleDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, seed tf.Output, seed2 tf.Output, output_types []tf.DataType, output_shapes []tf.Shape, optional ...ShuffleDatasetAttr) (handle tf.Output) {
+// types and boolean.
+func AsString(scope *Scope, input tf.Output, optional ...AsStringAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ShuffleDataset",
+		Type: "AsString",
 		Input: []tf.Input{
-			input_dataset, buffer_size, seed, seed2,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -29833,183 +34612,147 @@ func ShuffleDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output
 	return op.Output(0)
 }
 
-// 3D fast Fourier transform.
+// Returns a `RaggedTensor` containing the specified sequences of numbers.
 //
-// Computes the 3-dimensional discrete Fourier transform over the inner-most 3
-// dimensions of `input`.
 //
-// Arguments:
-//	input: A complex64 tensor.
+// Returns a `RaggedTensor` `result` composed from `rt_dense_values` and
+// `rt_nested_splits`, such that
+// `result[i] = range(starts[i], limits[i], deltas[i])`.
 //
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
-//   dimensions of `input` are replaced with their 3D Fourier transform.
+// ```python
+// >>> (rt_nested_splits, rt_dense_values) = gen_ragged_ops.ragged_range(
+// ...     starts=[2, 5, 8], limits=[3, 5, 12], deltas=1)
+// >>> result = ragged.from_nested_row_splits(rt_dense_values, rt_nested_splits)
+// >>> print result.eval().tolist()
+// [[2],               # result[0] = range(2, 3)
+//  [],                # result[1] = range(5, 5)
+//  [8, 9, 10, 11]]    # result[2] = range(8, 12)
+// ```
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.fftn with 3 dimensions.
-// @end_compatibility
-func FFT3D(scope *Scope, input tf.Output) (output tf.Output) {
+// The input tensors `starts`, `limits`, and `deltas` may be scalars or vectors.
+// The vector inputs must all have the same size.  Scalar inputs are broadcast
+// to match the size of the vector inputs.
+//
+// Arguments:
+//	starts: The starts of each range.
+//	limits: The limits of each range.
+//	deltas: The deltas of each range.
+//
+// Returns The `row_splits` for the returned `RaggedTensor`.The `flat_values` for the returned `RaggedTensor`.
+func RaggedRange(scope *Scope, starts tf.Output, limits tf.Output, deltas tf.Output) (rt_nested_splits tf.Output, rt_dense_values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "FFT3D",
+		Type: "RaggedRange",
 		Input: []tf.Input{
-			input,
+			starts, limits, deltas,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// CropAndResizeGradBoxesAttr is an optional argument to CropAndResizeGradBoxes.
-type CropAndResizeGradBoxesAttr func(optionalAttr)
-
-// CropAndResizeGradBoxesMethod sets the optional method attribute to value.
+// Deprecated, use python implementation tf.linalg.matrix_exponential.
 //
-// value: A string specifying the interpolation method. Only 'bilinear' is
-// supported for now.
-// If not specified, defaults to "bilinear"
-func CropAndResizeGradBoxesMethod(value string) CropAndResizeGradBoxesAttr {
-	return func(m optionalAttr) {
-		m["method"] = value
+// DEPRECATED at GraphDef version 27: Use Python implementation tf.linalg.matrix_exponential instead.
+func MatrixExponential(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "MatrixExponential",
+		Input: []tf.Input{
+			input,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes the gradient of the crop_and_resize op wrt the input boxes tensor.
+// Computes the Cholesky decomposition of one or more square matrices.
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices.
+//
+// The input has to be symmetric and positive definite. Only the lower-triangular
+// part of the input will be used for this operation. The upper-triangular part
+// will not be read.
+//
+// The output is a tensor of the same shape as the input
+// containing the Cholesky decompositions for all input submatrices `[..., :, :]`.
+//
+// **Note**: The gradient computation on GPU is faster for large matrices but
+// not for large batch dimensions when the submatrices are small. In this
+// case it might be faster to use the CPU.
 //
 // Arguments:
-//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
-//	image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
-// Both `image_height` and `image_width` need to be positive.
-//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
-// specifies the coordinates of a box in the `box_ind[i]` image and is specified
-// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
-// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
-// `[0, 1]` interval of normalized image height is mapped to
-// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
-// which case the sampled crop is an up-down flipped version of the original
-// image. The width dimension is treated similarly. Normalized coordinates
-// outside the `[0, 1]` range are allowed, in which case we use
-// `extrapolation_value` to extrapolate the input image values.
-//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
-// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
+//	input: Shape is `[..., M, M]`.
 //
-// Returns A 2-D tensor of shape `[num_boxes, 4]`.
-func CropAndResizeGradBoxes(scope *Scope, grads tf.Output, image tf.Output, boxes tf.Output, box_ind tf.Output, optional ...CropAndResizeGradBoxesAttr) (output tf.Output) {
+// Returns Shape is `[..., M, M]`.
+func Cholesky(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "CropAndResizeGradBoxes",
+		Type: "Cholesky",
 		Input: []tf.Input{
-			grads, image, boxes, box_ind,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Greedily selects a subset of bounding boxes in descending order of score,
+// Writes contents to the file at input filename. Creates file and recursively
 //
-// pruning away boxes that have high intersection-over-union (IOU) overlap
-// with previously selected boxes.  Bounding boxes with score less than
-// `score_threshold` are removed.  Bounding boxes are supplied as
-// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
-// diagonal pair of box corners and the coordinates can be provided as normalized
-// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
-// is agnostic to where the origin is in the coordinate system and more
-// generally is invariant to orthogonal transformations and translations
-// of the coordinate system; thus translating or reflections of the coordinate
-// system result in the same boxes being selected by the algorithm.
-// The output of this operation is a set of integers indexing into the input
-// collection of bounding boxes representing the selected boxes.  The bounding
-// box coordinates corresponding to the selected indices can then be obtained
-// using the `tf.gather operation`.  For example:
-//   selected_indices = tf.image.non_max_suppression_v2(
-//       boxes, scores, max_output_size, iou_threshold, score_threshold)
-//   selected_boxes = tf.gather(boxes, selected_indices)
+// creates directory if not existing.
 //
 // Arguments:
-//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
-//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-// score corresponding to each box (each row of boxes).
-//	max_output_size: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression.
-//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
-// boxes overlap too much with respect to IOU.
-//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
-// boxes based on score.
+//	filename: scalar. The name of the file to which we write the contents.
+//	contents: scalar. The content to be written to the output file.
 //
-// Returns A 1-D integer tensor of shape `[M]` representing the selected
-// indices from the boxes tensor, where `M <= max_output_size`.
-func NonMaxSuppressionV3(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output, score_threshold tf.Output) (selected_indices tf.Output) {
+// Returns the created operation.
+func WriteFile(scope *Scope, filename tf.Output, contents tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "NonMaxSuppressionV3",
+		Type: "WriteFile",
 		Input: []tf.Input{
-			boxes, scores, max_output_size, iou_threshold, score_threshold,
+			filename, contents,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// NonMaxSuppressionV4Attr is an optional argument to NonMaxSuppressionV4.
-type NonMaxSuppressionV4Attr func(optionalAttr)
+// AllAttr is an optional argument to All.
+type AllAttr func(optionalAttr)
 
-// NonMaxSuppressionV4PadToMaxOutputSize sets the optional pad_to_max_output_size attribute to value.
+// AllKeepDims sets the optional keep_dims attribute to value.
 //
-// value: If true, the output `selected_indices` is padded to be of length
-// `max_output_size`. Defaults to false.
+// value: If true, retain reduced dimensions with length 1.
 // If not specified, defaults to false
-func NonMaxSuppressionV4PadToMaxOutputSize(value bool) NonMaxSuppressionV4Attr {
+func AllKeepDims(value bool) AllAttr {
 	return func(m optionalAttr) {
-		m["pad_to_max_output_size"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Greedily selects a subset of bounding boxes in descending order of score,
+// Computes the "logical and" of elements across dimensions of a tensor.
 //
-// pruning away boxes that have high intersection-over-union (IOU) overlap
-// with previously selected boxes.  Bounding boxes with score less than
-// `score_threshold` are removed.  Bounding boxes are supplied as
-// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
-// diagonal pair of box corners and the coordinates can be provided as normalized
-// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
-// is agnostic to where the origin is in the coordinate system and more
-// generally is invariant to orthogonal transformations and translations
-// of the coordinate system; thus translating or reflections of the coordinate
-// system result in the same boxes being selected by the algorithm.
-// The output of this operation is a set of integers indexing into the input
-// collection of bounding boxes representing the selected boxes.  The bounding
-// box coordinates corresponding to the selected indices can then be obtained
-// using the `tf.gather operation`.  For example:
-//   selected_indices = tf.image.non_max_suppression_v2(
-//       boxes, scores, max_output_size, iou_threshold, score_threshold)
-//   selected_boxes = tf.gather(boxes, selected_indices)
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
-//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-// score corresponding to each box (each row of boxes).
-//	max_output_size: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression.
-//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
-// boxes overlap too much with respect to IOU.
-//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
-// boxes based on score.
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-// Returns A 1-D integer tensor of shape `[M]` representing the selected
-// indices from the boxes tensor, where `M <= max_output_size`.A 0-D integer tensor representing the number of valid elements in
-// `selected_indices`, with the valid elements appearing first.
-func NonMaxSuppressionV4(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output, score_threshold tf.Output, optional ...NonMaxSuppressionV4Attr) (selected_indices tf.Output, valid_outputs tf.Output) {
+// Returns The reduced tensor.
+func All(scope *Scope, input tf.Output, axis tf.Output, optional ...AllAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -30018,48 +34761,38 @@ func NonMaxSuppressionV4(scope *Scope, boxes tf.Output, scores tf.Output, max_ou
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "NonMaxSuppressionV4",
+		Type: "All",
 		Input: []tf.Input{
-			boxes, scores, max_output_size, iou_threshold, score_threshold,
+			input, axis,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Computes the matrix logarithm of one or more square matrices:
-//
-//
-// \\(log(exp(A)) = A\\)
-//
-// This op is only defined for complex matrices. If A is positive-definite and
-// real, then casting to a complex matrix, taking the logarithm and casting back
-// to a real matrix will give the correct result.
+// Computes the Eigen Decomposition of a batch of square self-adjoint matrices.
 //
-// This function computes the matrix logarithm using the Schur-Parlett algorithm.
-// Details of the algorithm can be found in Section 11.6.2 of:
-// Nicholas J. Higham, Functions of Matrices: Theory and Computation, SIAM 2008.
-// ISBN 978-0-898716-46-7.
+// DEPRECATED at GraphDef version 11: Use SelfAdjointEigV2 instead.
 //
 // The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. The output is a tensor of the same shape as the input
-// containing the exponential for all input submatrices `[..., :, :]`.
+// form square matrices, with the same constraints as the single matrix
+// SelfAdjointEig.
+//
+// The result is a [..., M+1, M] matrix with [..., 0,:] containing the
+// eigenvalues, and subsequent [...,1:, :] containing the eigenvectors. The eigenvalues
+// are sorted in non-decreasing order.
 //
 // Arguments:
 //	input: Shape is `[..., M, M]`.
 //
-// Returns Shape is `[..., M, M]`.
-//
-// @compatibility(scipy)
-// Equivalent to scipy.linalg.logm
-// @end_compatibility
-func MatrixLogarithm(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns Shape is `[..., M+1, M]`.
+func SelfAdjointEig(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixLogarithm",
+		Type: "SelfAdjointEig",
 		Input: []tf.Input{
 			input,
 		},
@@ -30068,138 +34801,89 @@ func MatrixLogarithm(scope *Scope, input tf.Output) (output tf.Output) {
 	return op.Output(0)
 }
 
-//   This op is used as a placeholder in If branch functions. It doesn't provide a
-//   valid output when run, so must either be removed (e.g. replaced with a
-//   function input) or guaranteed not to be used (e.g. if mirroring an
-//   intermediate output needed for the gradient computation of the other branch).
+// Computes softplus gradients for a softplus operation.
 //
 // Arguments:
-//	dtype: The type of the output.
-//	shape:     The purported shape of the output. This is only used for shape inference;
-//     the output will not necessarily have this shape. Can be a partial shape.
-//
-// Returns     \"Fake\" output value. This should not be consumed by another op.
-func FakeParam(scope *Scope, dtype tf.DataType, shape tf.Shape) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
-	opspec := tf.OpSpec{
-		Type: "FakeParam",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the gradient for the inverse of `x` wrt its input.
+//	gradients: The backpropagated gradients to the corresponding softplus operation.
+//	features: The features passed as input to the corresponding softplus operation.
 //
-// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
-// is the corresponding input gradient.
-func InvGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+// Returns The gradients: `gradients / (1 + exp(-features))`.
+func SoftplusGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "InvGrad",
+		Type: "SoftplusGrad",
 		Input: []tf.Input{
-			y, dy,
+			gradients, features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// List of the given size with empty elements.
+// Solves tridiagonal systems of equations.
 //
-// element_shape: the shape of the future elements of the list
-// num_elements: the number of elements to reserve
-// handle: the output list
-// element_dtype: the desired type of elements in the list.
-func TensorListReserve(scope *Scope, element_shape tf.Output, num_elements tf.Output, element_dtype tf.DataType) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"element_dtype": element_dtype}
-	opspec := tf.OpSpec{
-		Type: "TensorListReserve",
-		Input: []tf.Input{
-			element_shape, num_elements,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// A substitute for `InterleaveDataset` on a fixed list of `N` datasets.
+// `diagonals` is a tensor of shape `[..., 3, M]` whose inner-most 2 dimensions
+// represent matrices with three rows being the superdiagonal, diagonals, and
+// subdiagonals, in order. The last element of the superdiagonal and the first
+// element of the subdiagonal is ignored.
+// `rhs` is a tensor of shape `[..., M, K]`, representing K right-hand sides per
+// each left-hand side.
+// The output is a tensor of shape `[..., M, K]` containing the solutions.
 //
 // Arguments:
-//	selector_input_dataset: A dataset of scalar `DT_INT64` elements that determines which of the
-// `N` data inputs should produce the next output element.
-//	data_input_datasets: `N` datasets with the same type that will be interleaved according to
-// the values of `selector_input_dataset`.
-//
+//	diagonals: Shape is `[..., 3, M]`.
+//	rhs: Shape is `[..., M, K]`.
 //
-func ExperimentalDirectedInterleaveDataset(scope *Scope, selector_input_dataset tf.Output, data_input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns Shape is `[..., M, K]`.
+func TridiagonalSolve(scope *Scope, diagonals tf.Output, rhs tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalDirectedInterleaveDataset",
+		Type: "TridiagonalSolve",
 		Input: []tf.Input{
-			selector_input_dataset, tf.OutputList(data_input_datasets),
+			diagonals, rhs,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// RandomUniformIntAttr is an optional argument to RandomUniformInt.
-type RandomUniformIntAttr func(optionalAttr)
-
-// RandomUniformIntSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomUniformIntSeed(value int64) RandomUniformIntAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
+// SelfAdjointEigV2Attr is an optional argument to SelfAdjointEigV2.
+type SelfAdjointEigV2Attr func(optionalAttr)
 
-// RandomUniformIntSeed2 sets the optional seed2 attribute to value.
+// SelfAdjointEigV2ComputeV sets the optional compute_v attribute to value.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomUniformIntSeed2(value int64) RandomUniformIntAttr {
+// value: If `True` then eigenvectors will be computed and returned in `v`.
+// Otherwise, only the eigenvalues will be computed.
+// If not specified, defaults to true
+func SelfAdjointEigV2ComputeV(value bool) SelfAdjointEigV2Attr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["compute_v"] = value
 	}
 }
 
-// Outputs random integers from a uniform distribution.
+// Computes the eigen decomposition of one or more square self-adjoint matrices.
 //
-// The generated values are uniform integers in the range `[minval, maxval)`.
-// The lower bound `minval` is included in the range, while the upper bound
-// `maxval` is excluded.
+// Computes the eigenvalues and (optionally) eigenvectors of each inner matrix in
+// `input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`. The eigenvalues
+// are sorted in non-decreasing order.
 //
-// The random integers are slightly biased unless `maxval - minval` is an exact
-// power of two.  The bias is small for values of `maxval - minval` significantly
-// smaller than the range of the output (either `2^32` or `2^64`).
+// ```python
+// # a is a tensor.
+// # e is a tensor of eigenvalues.
+// # v is a tensor of eigenvectors.
+// e, v = self_adjoint_eig(a)
+// e = self_adjoint_eig(a, compute_v=False)
+// ```
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	minval: 0-D.  Inclusive lower bound on the generated integers.
-//	maxval: 0-D.  Exclusive upper bound on the generated integers.
+//	input: `Tensor` input of shape `[N, N]`.
 //
-// Returns A tensor of the specified shape filled with uniform random integers.
-func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf.Output, optional ...RandomUniformIntAttr) (output tf.Output) {
+// Returns Eigenvalues. Shape is `[N]`.Eigenvectors. Shape is `[N, N]`.
+func SelfAdjointEigV2(scope *Scope, input tf.Output, optional ...SelfAdjointEigV2Attr) (e tf.Output, v tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -30208,84 +34892,73 @@ func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomUniformInt",
+		Type: "SelfAdjointEigV2",
 		Input: []tf.Input{
-			shape, minval, maxval,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Add the quantile summaries to each quantile stream resource.
+// Adjust the saturation of one or more images.
 //
-// An op that adds a list of quantile summaries to a quantile stream resource. Each
-// summary Tensor is rank 2, containing summaries (value, weight, min_rank, max_rank)
-// for a single feature.
+// `images` is a tensor of at least 3 dimensions.  The last dimension is
+// interpretted as channels, and must be three.
+//
+// The input image is considered in the RGB colorspace. Conceptually, the RGB
+// colors are first mapped into HSV. A scale is then applied all the saturation
+// values, and then remapped back to RGB colorspace.
 //
 // Arguments:
-//	quantile_stream_resource_handle: resource handle referring to a QuantileStreamResource.
-//	summaries: string; List of Rank 2 Tensor each containing the summaries for a single feature.
+//	images: Images to adjust.  At least 3-D.
+//	scale: A float scale to add to the saturation.
 //
-// Returns the created operation.
-func BoostedTreesQuantileStreamResourceAddSummaries(scope *Scope, quantile_stream_resource_handle tf.Output, summaries []tf.Output) (o *tf.Operation) {
+// Returns The hue-adjusted image or images.
+func AdjustSaturation(scope *Scope, images tf.Output, scale tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BoostedTreesQuantileStreamResourceAddSummaries",
+		Type: "AdjustSaturation",
 		Input: []tf.Input{
-			quantile_stream_resource_handle, tf.OutputList(summaries),
+			images, scale,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// StringSplitV2Attr is an optional argument to StringSplitV2.
-type StringSplitV2Attr func(optionalAttr)
+// MatrixSolveAttr is an optional argument to MatrixSolve.
+type MatrixSolveAttr func(optionalAttr)
 
-// StringSplitV2Maxsplit sets the optional maxsplit attribute to value.
+// MatrixSolveAdjoint sets the optional adjoint attribute to value.
 //
-// value: An `int`. If `maxsplit > 0`, limit of the split of the result.
-// If not specified, defaults to -1
-func StringSplitV2Maxsplit(value int64) StringSplitV2Attr {
+// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
+// adjoint.
+// If not specified, defaults to false
+func MatrixSolveAdjoint(value bool) MatrixSolveAttr {
 	return func(m optionalAttr) {
-		m["maxsplit"] = value
+		m["adjoint"] = value
 	}
 }
 
-// Split elements of `source` based on `sep` into a `SparseTensor`.
-//
-// Let N be the size of source (typically N will be the batch size). Split each
-// element of `source` based on `sep` and return a `SparseTensor`
-// containing the split tokens. Empty tokens are ignored.
-//
-// For example, N = 2, source[0] is 'hello world' and source[1] is 'a b c',
-// then the output will be
-// ```
-// st.indices = [0, 0;
-//               0, 1;
-//               1, 0;
-//               1, 1;
-//               1, 2]
-// st.shape = [2, 3]
-// st.values = ['hello', 'world', 'a', 'b', 'c']
-// ```
-//
-// If `sep` is given, consecutive delimiters are not grouped together and are
-// deemed to delimit empty strings. For example, source of `"1<>2<><>3"` and
-// sep of `"<>"` returns `["1", "2", "", "3"]`. If `sep` is None or an empty
-// string, consecutive whitespace are regarded as a single separator, and the
-// result will contain no empty strings at the startor end if the string has
-// leading or trailing whitespace.
+// Solves systems of linear equations.
 //
-// Note that the above mentioned behavior matches python's str.split.
+// `Matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. `Rhs` is a tensor of shape `[..., M, K]`. The `output` is
+// a tensor shape `[..., M, K]`.  If `adjoint` is `False` then each output matrix
+// satisfies `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
+// If `adjoint` is `True` then each output matrix satisfies
+// `adjoint(matrix[..., :, :]) * output[..., :, :] = rhs[..., :, :]`.
 //
 // Arguments:
-//	input: `1-D` string `Tensor`, the strings to split.
-//	sep: `0-D` string `Tensor`, the delimiter character.
-func StringSplitV2(scope *Scope, input tf.Output, sep tf.Output, optional ...StringSplitV2Attr) (indices tf.Output, values tf.Output, shape tf.Output) {
+//	matrix: Shape is `[..., M, M]`.
+//	rhs: Shape is `[..., M, K]`.
+//
+// Returns Shape is `[..., M, K]`.
+func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixSolveAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -30294,538 +34967,628 @@ func StringSplitV2(scope *Scope, input tf.Output, sep tf.Output, optional ...Str
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StringSplitV2",
+		Type: "MatrixSolve",
 		Input: []tf.Input{
-			input, sep,
+			matrix, rhs,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Creates a dataset that uses a custom thread pool to compute `input_dataset`.
+// ResourceApplyKerasMomentumAttr is an optional argument to ResourceApplyKerasMomentum.
+type ResourceApplyKerasMomentumAttr func(optionalAttr)
+
+// ResourceApplyKerasMomentumUseLocking sets the optional use_locking attribute to value.
 //
-// Arguments:
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyKerasMomentumUseLocking(value bool) ResourceApplyKerasMomentumAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// ResourceApplyKerasMomentumUseNesterov sets the optional use_nesterov attribute to value.
 //
-//	thread_pool: A resource produced by the ThreadPoolHandle op.
+// value: If `True`, the tensor passed to compute grad will be
+// var + momentum * accum, so in the end, the var you get is actually
+// var + momentum * accum.
+// If not specified, defaults to false
+func ResourceApplyKerasMomentumUseNesterov(value bool) ResourceApplyKerasMomentumAttr {
+	return func(m optionalAttr) {
+		m["use_nesterov"] = value
+	}
+}
+
+// Update '*var' according to the momentum scheme. Set use_nesterov = True if you
+//
+// want to use Nesterov momentum.
+//
+// accum = accum * momentum - lr * grad
+// var += accum
 //
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	grad: The gradient.
+//	momentum: Momentum. Must be a scalar.
 //
-func ExperimentalThreadPoolDataset(scope *Scope, input_dataset tf.Output, thread_pool tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns the created operation.
+func ResourceApplyKerasMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, momentum tf.Output, optional ...ResourceApplyKerasMomentumAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalThreadPoolDataset",
+		Type: "ResourceApplyKerasMomentum",
 		Input: []tf.Input{
-			input_dataset, thread_pool,
+			var_, accum, lr, grad, momentum,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes softsign: `features / (abs(features) + 1)`.
-func Softsign(scope *Scope, features tf.Output) (activations tf.Output) {
+// Returns a serialized GraphDef representing `input_dataset`.
+//
+// Returns a graph representation for `input_dataset`.
+//
+// Arguments:
+//	input_dataset: A variant tensor representing the dataset to return the graph representation for.
+//
+// Returns The graph representation of the dataset (as serialized GraphDef).
+func DatasetToGraph(scope *Scope, input_dataset tf.Output) (graph tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Softsign",
+		Type: "DatasetToGraph",
 		Input: []tf.Input{
-			features,
+			input_dataset,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// EncodeProtoAttr is an optional argument to EncodeProto.
-type EncodeProtoAttr func(optionalAttr)
+// LuAttr is an optional argument to Lu.
+type LuAttr func(optionalAttr)
 
-// EncodeProtoDescriptorSource sets the optional descriptor_source attribute to value.
-// If not specified, defaults to "local://"
-func EncodeProtoDescriptorSource(value string) EncodeProtoAttr {
+// LuOutputIdxType sets the optional output_idx_type attribute to value.
+// If not specified, defaults to DT_INT32
+func LuOutputIdxType(value tf.DataType) LuAttr {
 	return func(m optionalAttr) {
-		m["descriptor_source"] = value
+		m["output_idx_type"] = value
 	}
 }
 
-// The op serializes protobuf messages provided in the input tensors.
-//
-// The types of the tensors in `values` must match the schema for the
-// fields specified in `field_names`. All the tensors in `values` must
-// have a common shape prefix, *batch_shape*.
-//
-// The `sizes` tensor specifies repeat counts for each field.  The repeat
-// count (last dimension) of a each tensor in `values` must be greater
-// than or equal to corresponding repeat count in `sizes`.
+// Computes the LU decomposition of one or more square matrices.
 //
-// A `message_type` name must be provided to give context for the field
-// names. The actual message descriptor can be looked up either in the
-// linked-in descriptor pool or a filename provided by the caller using
-// the `descriptor_source` attribute.
-//
-// The `descriptor_source` attribute selects a source of protocol
-// descriptors to consult when looking up `message_type`. This may be a
-// filename containing a serialized `FileDescriptorSet` message,
-// or the special value `local://`, in which case only descriptors linked
-// into the code will be searched; the filename can be on any filesystem
-// accessible to TensorFlow.
-//
-// You can build a `descriptor_source` file using the `--descriptor_set_out`
-// and `--include_imports` options to the protocol compiler `protoc`.
-//
-// The `local://` database only covers descriptors linked into the
-// code via C++ libraries, not Python imports. You can link in a proto descriptor
-// by creating a cc_library target with alwayslink=1.
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices.
 //
-// There are a few special cases in the value mapping:
+// The input has to be invertible.
 //
-// Submessage and group fields must be pre-serialized as TensorFlow strings.
+// The output consists of two tensors LU and P containing the LU decomposition
+// of all input submatrices `[..., :, :]`. LU encodes the lower triangular and
+// upper triangular factors.
 //
-// TensorFlow lacks support for unsigned int64s, so they must be
-// represented as `tf.int64` with the same twos-complement bit pattern
-// (the obvious way).
+// For each input submatrix of shape `[M, M]`, L is a lower triangular matrix of
+// shape `[M, M]` with unit diagonal whose entries correspond to the strictly lower
+// triangular part of LU. U is a upper triangular matrix of shape `[M, M]` whose
+// entries correspond to the upper triangular part, including the diagonal, of LU.
 //
-// Unsigned int32 values can be represented exactly with `tf.int64`, or
-// with sign wrapping if the input is of type `tf.int32`.
+// P represents a permutation matrix encoded as a list of indices each between `0`
+// and `M-1`, inclusive. If P_mat denotes the permutation matrix corresponding to
+// P, then the L, U and P satisfies P_mat * input = L * U.
 //
 // Arguments:
-//	sizes: Tensor of int32 with shape `[batch_shape, len(field_names)]`.
-//	values: List of tensors containing values for the corresponding field.
-//	field_names: List of strings containing proto field names.
-//	message_type: Name of the proto message type to decode.
+//	input: A tensor of shape `[..., M, M]` whose inner-most 2 dimensions form matrices of
+// size `[M, M]`.
 //
-// Returns Tensor of serialized protos with shape `batch_shape`.
-func EncodeProto(scope *Scope, sizes tf.Output, values []tf.Output, field_names []string, message_type string, optional ...EncodeProtoAttr) (bytes tf.Output) {
+// Returns A tensor of shape `[..., M, M]` whose strictly lower triangular part denotes the
+// lower triangular factor `L` with unit diagonal, and whose upper triangular part
+// denotes the upper triangular factor `U`.Permutation of the rows encoded as a list of indices in `0..M-1`. Shape is
+// `[..., M]`.
+// @compatibility(scipy)
+// Similar to `scipy.linalg.lu`, except the triangular factors `L` and `U` are
+// packed into a single tensor, the permutation is applied to `input` instead of
+// the right hand side and the permutation `P` is returned as a list of indices
+// instead of a permutation matrix.
+// @end_compatibility
+func Lu(scope *Scope, input tf.Output, optional ...LuAttr) (lu tf.Output, p tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"field_names": field_names, "message_type": message_type}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "EncodeProto",
+		Type: "Lu",
 		Input: []tf.Input{
-			sizes, tf.OutputList(values),
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Creates a TensorArray for storing the gradients of values in the given handle.
-//
-// If the given TensorArray gradient already exists, returns a reference to it.
-//
-// Locks the size of the original TensorArray by disabling its dynamic size flag.
-//
-// **A note about the input flow_in:**
-//
-// The handle flow_in forces the execution of the gradient lookup to occur
-// only after certain other operations have occurred.  For example, when
-// the forward TensorArray is dynamically sized, writes to this TensorArray
-// may resize the object.  The gradient TensorArray is statically sized based
-// on the size of the forward TensorArray when this operation executes.
-// Furthermore, the size of the forward TensorArray is frozen by this call.
-// As a result, the flow is used to ensure that the call to generate the gradient
-// TensorArray only happens after all writes are executed.
+// Deprecated. Use TensorArrayCloseV3
 //
-// In the case of dynamically sized TensorArrays, gradient computation should
-// only be performed on read operations that have themselves been chained via
-// flow to occur only after all writes have executed. That way the final size
-// of the forward TensorArray is known when this operation is called.
+// DEPRECATED at GraphDef version 26: Use TensorArrayCloseV3
 //
-// **A note about the source attribute:**
+// Returns the created operation.
+func TensorArrayCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorArrayCloseV2",
+		Input: []tf.Input{
+			handle,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// EncodeBase64Attr is an optional argument to EncodeBase64.
+type EncodeBase64Attr func(optionalAttr)
+
+// EncodeBase64Pad sets the optional pad attribute to value.
 //
-// TensorArray gradient calls use an accumulator TensorArray object.  If
-// multiple gradients are calculated and run in the same session, the multiple
-// gradient nodes may accidentally flow through the same accumulator TensorArray.
-// This double counts and generally breaks the TensorArray gradient flow.
+// value: Bool whether padding is applied at the ends.
+// If not specified, defaults to false
+func EncodeBase64Pad(value bool) EncodeBase64Attr {
+	return func(m optionalAttr) {
+		m["pad"] = value
+	}
+}
+
+// Encode strings into web-safe base64 format.
 //
-// The solution is to identify which gradient call this particular
-// TensorArray gradient is being called in.  This is performed by identifying
-// a unique string (e.g. "gradients", "gradients_1", ...) from the input
-// gradient Tensor's name.  This string is used as a suffix when creating
-// the TensorArray gradient object here (the attribute `source`).
+// Refer to the following article for more information on base64 format:
+// en.wikipedia.org/wiki/Base64. Base64 strings may have padding with '=' at the
+// end so that the encoded has length multiple of 4. See Padding section of the
+// link above.
 //
-// The attribute `source` is added as a suffix to the forward TensorArray's
-// name when performing the creation / lookup, so that each separate gradient
-// calculation gets its own TensorArray accumulator.
+// Web-safe means that the encoder uses - and _ instead of + and /.
 //
 // Arguments:
-//	handle: The handle to the forward TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	source: The gradient source string, used to decide which gradient TensorArray
-// to return.
-func TensorArrayGradV3(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output, flow_out tf.Output) {
+//	input: Strings to be encoded.
+//
+// Returns Input strings encoded in base64.
+func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"source": source}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayGradV3",
+		Type: "EncodeBase64",
 		Input: []tf.Input{
-			handle, flow_in,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Creates a dataset that splits a SparseTensor into elements row-wise.
-func SparseTensorSliceDataset(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output) (handle tf.Output) {
+// A dataset that creates window datasets from the input dataset.
+//
+// Arguments:
+//
+//	size: A scalar representing the number of elements to accumulate in a window.
+//	shift: A scalar representing the steps moving the sliding window forward in one
+// iteration. It must be positive.
+//	stride: A scalar representing the stride of the input elements of the sliding window.
+// It must be positive.
+//	drop_remainder: A scalar representing whether a window should be dropped in case its size is
+// smaller than desired.
+//
+//
+func WindowDataset(scope *Scope, input_dataset tf.Output, size tf.Output, shift tf.Output, stride tf.Output, drop_remainder tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "SparseTensorSliceDataset",
+		Type: "WindowDataset",
 		Input: []tf.Input{
-			indices, values, dense_shape,
+			input_dataset, size, shift, stride, drop_remainder,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns x / y element-wise for real types.
+// Computes the matrix square root of one or more square matrices:
 //
-// If `x` and `y` are reals, this will return the floating-point division.
+// matmul(sqrtm(A), sqrtm(A)) = A
 //
-// *NOTE*: `Div` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func RealDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// The input matrix should be invertible. If the input matrix is real, it should
+// have no eigenvalues which are real and negative (pairs of complex conjugate
+// eigenvalues are allowed).
+//
+// The matrix square root is computed by first reducing the matrix to
+// quasi-triangular form with the real Schur decomposition. The square root
+// of the quasi-triangular matrix is then computed directly. Details of
+// the algorithm can be found in: Nicholas J. Higham, "Computing real
+// square roots of a real matrix", Linear Algebra Appl., 1987.
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor of the same shape as the input
+// containing the matrix square root for all input submatrices `[..., :, :]`.
+//
+// Arguments:
+//	input: Shape is `[..., M, M]`.
+//
+// Returns Shape is `[..., M, M]`.
+//
+// @compatibility(scipy)
+// Equivalent to scipy.linalg.sqrtm
+// @end_compatibility
+func MatrixSquareRoot(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RealDiv",
+		Type: "MatrixSquareRoot",
 		Input: []tf.Input{
-			x, y,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that concatenates `input_dataset` with `another_dataset`.
-func ConcatenateDataset(scope *Scope, input_dataset tf.Output, another_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// SvdAttr is an optional argument to Svd.
+type SvdAttr func(optionalAttr)
+
+// SvdComputeUv sets the optional compute_uv attribute to value.
+//
+// value: If true, left and right singular vectors will be
+// computed and returned in `u` and `v`, respectively.
+// If false, `u` and `v` are not set and should never referenced.
+// If not specified, defaults to true
+func SvdComputeUv(value bool) SvdAttr {
+	return func(m optionalAttr) {
+		m["compute_uv"] = value
+	}
+}
+
+// SvdFullMatrices sets the optional full_matrices attribute to value.
+//
+// value: If true, compute full-sized `u` and `v`. If false
+// (the default), compute only the leading `P` singular vectors.
+// Ignored if `compute_uv` is `False`.
+// If not specified, defaults to false
+func SvdFullMatrices(value bool) SvdAttr {
+	return func(m optionalAttr) {
+		m["full_matrices"] = value
+	}
+}
+
+// Computes the singular value decompositions of one or more matrices.
+//
+// Computes the SVD of each inner matrix in `input` such that
+// `input[..., :, :] = u[..., :, :] * diag(s[..., :, :]) * transpose(v[..., :, :])`
+//
+// ```python
+// # a is a tensor containing a batch of matrices.
+// # s is a tensor of singular values for each matrix.
+// # u is the tensor containing of left singular vectors for each matrix.
+// # v is the tensor containing of right singular vectors for each matrix.
+// s, u, v = svd(a)
+// s, _, _ = svd(a, compute_uv=False)
+// ```
+//
+// Arguments:
+//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
+//
+// Returns Singular values. Shape is `[..., P]`.Left singular vectors. If `full_matrices` is `False` then shape is
+// `[..., M, P]`; if `full_matrices` is `True` then shape is
+// `[..., M, M]`. Undefined if `compute_uv` is `False`.Left singular vectors. If `full_matrices` is `False` then shape is
+// `[..., N, P]`. If `full_matrices` is `True` then shape is `[..., N, N]`.
+// Undefined if `compute_uv` is false.
+func Svd(scope *Scope, input tf.Output, optional ...SvdAttr) (s tf.Output, u tf.Output, v tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ConcatenateDataset",
+		Type: "Svd",
 		Input: []tf.Input{
-			input_dataset, another_dataset,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes the grayscale dilation of 4-D `input` and 3-D `filter` tensors.
-//
-// The `input` tensor has shape `[batch, in_height, in_width, depth]` and the
-// `filter` tensor has shape `[filter_height, filter_width, depth]`, i.e., each
-// input channel is processed independently of the others with its own structuring
-// function. The `output` tensor has shape
-// `[batch, out_height, out_width, depth]`. The spatial dimensions of the output
-// tensor depend on the `padding` algorithm. We currently only support the default
-// "NHWC" `data_format`.
-//
-// In detail, the grayscale morphological 2-D dilation is the max-sum correlation
-// (for consistency with `conv2d`, we use unmirrored filters):
-//
-//     output[b, y, x, c] =
-//        max_{dy, dx} input[b,
-//                           strides[1] * y + rates[1] * dy,
-//                           strides[2] * x + rates[2] * dx,
-//                           c] +
-//                     filter[dy, dx, c]
+// Converts one or more images from RGB to HSV.
 //
-// Max-pooling is a special case when the filter has size equal to the pooling
-// kernel size and contains all zeros.
+// Outputs a tensor of the same shape as the `images` tensor, containing the HSV
+// value of the pixels. The output is only well defined if the value in `images`
+// are in `[0,1]`.
 //
-// Note on duality: The dilation of `input` by the `filter` is equal to the
-// negation of the erosion of `-input` by the reflected `filter`.
+// `output[..., 0]` contains hue, `output[..., 1]` contains saturation, and
+// `output[..., 2]` contains value. All HSV values are in `[0,1]`. A hue of 0
+// corresponds to pure red, hue 1/3 is pure green, and 2/3 is pure blue.
 //
 // Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
-//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor. Must be: `[1, stride_height, stride_width, 1]`.
-//	rates: The input stride for atrous morphological dilation. Must be:
-// `[1, rate_height, rate_width, 1]`.
-//	padding: The type of padding algorithm to use.
+//	images: 1-D or higher rank. RGB data to convert. Last dimension must be size 3.
 //
-// Returns 4-D with shape `[batch, out_height, out_width, depth]`.
-func Dilation2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, rates []int64, padding string) (output tf.Output) {
+// Returns `images` converted to HSV.
+func RGBToHSV(scope *Scope, images tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "Dilation2D",
+		Type: "RGBToHSV",
 		Input: []tf.Input{
-			input, filter,
+			images,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Converts the given variant tensor to an iterator and stores it in the given resource.
-//
-// Arguments:
-//	resource_handle: A handle to an iterator resource.
-//	serialized: A variant tensor storing the state of the iterator contained in the
-// resource.
+// Does nothing. Only useful as a placeholder for control edges.
 //
 // Returns the created operation.
-func DeserializeIterator(scope *Scope, resource_handle tf.Output, serialized tf.Output) (o *tf.Operation) {
+func NoOp(scope *Scope) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "DeserializeIterator",
-		Input: []tf.Input{
-			resource_handle, serialized,
-		},
+		Type: "NoOp",
 	}
 	return scope.AddOperation(opspec)
 }
 
-// TensorArrayConcatV2Attr is an optional argument to TensorArrayConcatV2.
-type TensorArrayConcatV2Attr func(optionalAttr)
+// MergeV2CheckpointsAttr is an optional argument to MergeV2Checkpoints.
+type MergeV2CheckpointsAttr func(optionalAttr)
 
-// TensorArrayConcatV2ElementShapeExcept0 sets the optional element_shape_except0 attribute to value.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayConcatV2ElementShapeExcept0(value tf.Shape) TensorArrayConcatV2Attr {
+// MergeV2CheckpointsDeleteOldDirs sets the optional delete_old_dirs attribute to value.
+//
+// value: see above.
+// If not specified, defaults to true
+func MergeV2CheckpointsDeleteOldDirs(value bool) MergeV2CheckpointsAttr {
 	return func(m optionalAttr) {
-		m["element_shape_except0"] = value
+		m["delete_old_dirs"] = value
 	}
 }
 
-// Deprecated. Use TensorArrayConcatV3
-func TensorArrayConcatV2(scope *Scope, handle tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayConcatV2Attr) (value tf.Output, lengths tf.Output) {
+// V2 format specific: merges the metadata files of sharded checkpoints.  The
+//
+// result is one logical checkpoint, with one physical metadata file and renamed
+// data files.
+//
+// Intended for "grouping" multiple checkpoints in a sharded checkpoint setup.
+//
+// If delete_old_dirs is true, attempts to delete recursively the dirname of each
+// path in the input checkpoint_prefixes.  This is useful when those paths are non
+// user-facing temporary locations.
+//
+// Arguments:
+//	checkpoint_prefixes: prefixes of V2 checkpoints to merge.
+//	destination_prefix: scalar.  The desired final prefix.  Allowed to be the same
+// as one of the checkpoint_prefixes.
+//
+// Returns the created operation.
+func MergeV2Checkpoints(scope *Scope, checkpoint_prefixes tf.Output, destination_prefix tf.Output, optional ...MergeV2CheckpointsAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayConcatV2",
+		Type: "MergeV2Checkpoints",
 		Input: []tf.Input{
-			handle, flow_in,
+			checkpoint_prefixes, destination_prefix,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return scope.AddOperation(opspec)
 }
 
-// Creates a dataset that batches and pads `batch_size` elements from the input.
+// Saves input tensors slices to disk.
 //
-// Arguments:
+// This is like `Save` except that tensors can be listed in the saved file as being
+// a slice of a larger tensor.  `shapes_and_slices` specifies the shape of the
+// larger tensor and the slice that this tensor covers. `shapes_and_slices` must
+// have as many elements as `tensor_names`.
 //
-//	batch_size: A scalar representing the number of elements to accumulate in a
-// batch.
-//	padded_shapes: A list of int64 tensors representing the desired padded shapes
-// of the corresponding output components. These shapes may be partially
-// specified, using `-1` to indicate that a particular dimension should be
-// padded to the maximum size of all batch elements.
-//	padding_values: A list of scalars containing the padding value to use for
-// each of the outputs.
+// Elements of the `shapes_and_slices` input must either be:
 //
-func PaddedBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, padded_shapes []tf.Output, padding_values []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "PaddedBatchDataset",
-		Input: []tf.Input{
-			input_dataset, batch_size, tf.OutputList(padded_shapes), tf.OutputList(padding_values),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates a dataset that shuffles and repeats elements from `input_dataset`
+// *  The empty string, in which case the corresponding tensor is
+//    saved normally.
+// *  A string of the form `dim0 dim1 ... dimN-1 slice-spec` where the
+//    `dimI` are the dimensions of the larger tensor and `slice-spec`
+//    specifies what part is covered by the tensor to save.
 //
-// pseudorandomly.
+// `slice-spec` itself is a `:`-separated list: `slice0:slice1:...:sliceN-1`
+// where each `sliceI` is either:
 //
-// Arguments:
+// *  The string `-` meaning that the slice covers all indices of this dimension
+// *  `start,length` where `start` and `length` are integers.  In that
+//    case the slice covers `length` indices starting at `start`.
 //
-//	buffer_size: The number of output elements to buffer in an iterator over
-// this dataset. Compare with the `min_after_dequeue` attr when creating a
-// `RandomShuffleQueue`.
-//	seed: A scalar seed for the random number generator. If either `seed` or
-// `seed2` is set to be non-zero, the random number generator is seeded
-// by the given seed.  Otherwise, a random seed is used.
-//	seed2: A second scalar seed to avoid seed collision.
-//	count: A scalar representing the number of times the underlying dataset
-// should be repeated. The default is `-1`, which results in infinite repetition.
+// See also `Save`.
 //
+// Arguments:
+//	filename: Must have a single element. The name of the file to which we write the
+// tensor.
+//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
+//	shapes_and_slices: Shape `[N]`.  The shapes and slice specifications to use when
+// saving the tensors.
+//	data: `N` tensors to save.
 //
-func ShuffleAndRepeatDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, seed tf.Output, seed2 tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns the created operation.
+func SaveSlices(scope *Scope, filename tf.Output, tensor_names tf.Output, shapes_and_slices tf.Output, data []tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ShuffleAndRepeatDataset",
+		Type: "SaveSlices",
 		Input: []tf.Input{
-			input_dataset, buffer_size, seed, seed2, count,
+			filename, tensor_names, shapes_and_slices, tf.OutputList(data),
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Creates a dataset that caches elements from `input_dataset`.
+// DenseToDenseSetOperationAttr is an optional argument to DenseToDenseSetOperation.
+type DenseToDenseSetOperationAttr func(optionalAttr)
+
+// DenseToDenseSetOperationValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func DenseToDenseSetOperationValidateIndices(value bool) DenseToDenseSetOperationAttr {
+	return func(m optionalAttr) {
+		m["validate_indices"] = value
+	}
+}
+
+// Applies set operation along last dimension of 2 `Tensor` inputs.
 //
-// A CacheDataset will iterate over the input_dataset, and store tensors. If the
-// cache already exists, the cache will be used. If the cache is inappropriate
-// (e.g. cannot be opened, contains tensors of the wrong shape / size), an error
-// will the returned when used.
+// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
 //
-// Arguments:
+// Output `result` is a `SparseTensor` represented by `result_indices`,
+// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
+// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
+// dimension contains the result of `set_operation` applied to the corresponding
+// `[0...n-1]` dimension of `set`.
 //
-//	filename: A path on the filesystem where we should cache the dataset. Note: this
-// will be a directory.
+// Arguments:
+//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
+// Dimension `n` contains values in a set, duplicates are allowed but ignored.
+//	set2: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set1`.
+// Dimension `n` contains values in a set, duplicates are allowed but ignored.
 //
 //
-func CacheDataset(scope *Scope, input_dataset tf.Output, filename tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
+// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
+// is the max result set size across all `0...n-1` dimensions.
+func DenseToDenseSetOperation(scope *Scope, set1 tf.Output, set2 tf.Output, set_operation string, optional ...DenseToDenseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"set_operation": set_operation}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "CacheDataset",
+		Type: "DenseToDenseSetOperation",
 		Input: []tf.Input{
-			input_dataset, filename,
+			set1, set2,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Creates a dataset that emits the records from one or more binary files.
+// Generate a sharded filename. The filename is printf formatted as
 //
-// Arguments:
-//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
-// read.
-//	header_bytes: A scalar representing the number of bytes to skip at the
-// beginning of a file.
-//	record_bytes: A scalar representing the number of bytes in each record.
-//	footer_bytes: A scalar representing the number of bytes to skip at the end
-// of a file.
-//	buffer_size: A scalar representing the number of bytes to buffer. Must be > 0.
-func FixedLengthRecordDataset(scope *Scope, filenames tf.Output, header_bytes tf.Output, record_bytes tf.Output, footer_bytes tf.Output, buffer_size tf.Output) (handle tf.Output) {
+//    %s-%05d-of-%05d, basename, shard, num_shards.
+func ShardedFilename(scope *Scope, basename tf.Output, shard tf.Output, num_shards tf.Output) (filename tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "FixedLengthRecordDataset",
+		Type: "ShardedFilename",
 		Input: []tf.Input{
-			filenames, header_bytes, record_bytes, footer_bytes, buffer_size,
+			basename, shard, num_shards,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Gradients for batch normalization.
-//
-// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
-//
-// This op is deprecated. See `tf.nn.batch_normalization`.
-//
-// Arguments:
-//	t: A 4D input Tensor.
-//	m: A 1D mean Tensor with size matching the last dimension of t.
-// This is the first output from tf.nn.moments,
-// or a saved moving average thereof.
-//	v: A 1D variance Tensor with size matching the last dimension of t.
-// This is the second output from tf.nn.moments,
-// or a saved moving average thereof.
-//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
-// If "scale_after_normalization" is true, this Tensor will be multiplied
-// with the normalized Tensor.
-//	backprop: 4D backprop Tensor.
-//	variance_epsilon: A small float number to avoid dividing by 0.
-//	scale_after_normalization: A bool indicating whether the resulted tensor
-// needs to be multiplied with gamma.
-//
-// Returns 4D backprop tensor for input.1D backprop tensor for mean.1D backprop tensor for variance.1D backprop tensor for beta.1D backprop tensor for gamma.
-func BatchNormWithGlobalNormalizationGrad(scope *Scope, t tf.Output, m tf.Output, v tf.Output, gamma tf.Output, backprop tf.Output, variance_epsilon float32, scale_after_normalization bool) (dx tf.Output, dm tf.Output, dv tf.Output, db tf.Output, dg tf.Output) {
+// Generate a glob pattern matching all sharded file names.
+func ShardedFilespec(scope *Scope, basename tf.Output, num_shards tf.Output) (filename tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "BatchNormWithGlobalNormalizationGrad",
+		Type: "ShardedFilespec",
 		Input: []tf.Input{
-			t, m, v, gamma, backprop,
+			basename, num_shards,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return op.Output(0)
 }
 
-// Creates a dataset that emits the records from one or more TFRecord files.
+// TextLineReaderV2Attr is an optional argument to TextLineReaderV2.
+type TextLineReaderV2Attr func(optionalAttr)
+
+// TextLineReaderV2SkipHeaderLines sets the optional skip_header_lines attribute to value.
 //
-// Arguments:
-//	filenames: A scalar or vector containing the name(s) of the file(s) to be
-// read.
-//	compression_type: A scalar containing either (i) the empty string (no
-// compression), (ii) "ZLIB", or (iii) "GZIP".
-//	buffer_size: A scalar representing the number of bytes to buffer. A value of
-// 0 means no buffering will be performed.
-func TFRecordDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: Number of lines to skip from the beginning of every file.
+// If not specified, defaults to 0
+func TextLineReaderV2SkipHeaderLines(value int64) TextLineReaderV2Attr {
+	return func(m optionalAttr) {
+		m["skip_header_lines"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "TFRecordDataset",
-		Input: []tf.Input{
-			filenames, compression_type, buffer_size,
-		},
+}
+
+// TextLineReaderV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func TextLineReaderV2Container(value string) TextLineReaderV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// A container for an iterator resource.
+// TextLineReaderV2SharedName sets the optional shared_name attribute to value.
 //
-// Returns A handle to the iterator that can be passed to a "MakeIterator" or
-// "IteratorGetNext" op. In contrast to Iterator, AnonymousIterator prevents
-// resource sharing by name, and does not keep a reference to the resource
-// container.
-func AnonymousIterator(scope *Scope, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func TextLineReaderV2SharedName(value string) TextLineReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A Reader that outputs the lines of a file delimited by '\n'.
+//
+// Returns The handle to reference the Reader.
+func TextLineReaderV2(scope *Scope, optional ...TextLineReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "AnonymousIterator",
+		Type: "TextLineReaderV2",
 
 		Attrs: attrs,
 	}
@@ -30833,104 +35596,89 @@ func AnonymousIterator(scope *Scope, output_types []tf.DataType, output_shapes [
 	return op.Output(0)
 }
 
-// BatchToSpace for 4-D tensors of type T.
-//
-// This is a legacy version of the more general BatchToSpaceND.
-//
-// Rearranges (permutes) data from batch into blocks of spatial data, followed by
-// cropping. This is the reverse transformation of SpaceToBatch. More specifically,
-// this op outputs a copy of the input tensor where values from the `batch`
-// dimension are moved in spatial blocks to the `height` and `width` dimensions,
-// followed by cropping along the `height` and `width` dimensions.
-//
-// Arguments:
-//	input: 4-D tensor with shape
-// `[batch*block_size*block_size, height_pad/block_size, width_pad/block_size,
-//   depth]`. Note that the batch size of the input tensor must be divisible by
-// `block_size * block_size`.
-//	crops: 2-D tensor of non-negative integers with shape `[2, 2]`. It specifies
-// how many elements to crop from the intermediate result across the spatial
-// dimensions as follows:
-//
-//     crops = [[crop_top, crop_bottom], [crop_left, crop_right]]
-//
-//
-// Returns 4-D with shape `[batch, height, width, depth]`, where:
-//
-//       height = height_pad - crop_top - crop_bottom
-//       width = width_pad - crop_left - crop_right
-//
-// The attr `block_size` must be greater than one. It indicates the block size.
-//
-// Some examples:
-//
-// (1) For the following input of shape `[4, 1, 1, 1]` and block_size of 2:
-//
-// ```
-// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
-// ```
-//
-// The output tensor has shape `[1, 2, 2, 1]` and value:
-//
-// ```
-// x = [[[[1], [2]], [[3], [4]]]]
-// ```
-//
-// (2) For the following input of shape `[4, 1, 1, 3]` and block_size of 2:
+// LoadAndRemapMatrixAttr is an optional argument to LoadAndRemapMatrix.
+type LoadAndRemapMatrixAttr func(optionalAttr)
+
+// LoadAndRemapMatrixMaxRowsInMemory sets the optional max_rows_in_memory attribute to value.
 //
-// ```
-// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
-// ```
+// value: The maximum number of rows to load from the checkpoint at
+// once. If less than or equal to 0, the entire matrix will be loaded into
+// memory. Setting this arg trades increased disk reads for lower memory usage.
+// If not specified, defaults to -1
+func LoadAndRemapMatrixMaxRowsInMemory(value int64) LoadAndRemapMatrixAttr {
+	return func(m optionalAttr) {
+		m["max_rows_in_memory"] = value
+	}
+}
+
+// Loads a 2-D (matrix) `Tensor` with name `old_tensor_name` from the checkpoint
 //
-// The output tensor has shape `[1, 2, 2, 3]` and value:
+// at `ckpt_path` and potentially reorders its rows and columns using the
+// specified remappings.
 //
-// ```
-// x = [[[[1, 2, 3], [4, 5, 6]],
-//       [[7, 8, 9], [10, 11, 12]]]]
-// ```
+// Most users should use one of the wrapper initializers (such as
+// `tf.contrib.framework.load_and_remap_matrix_initializer`) instead of this
+// function directly.
 //
-// (3) For the following input of shape `[4, 2, 2, 1]` and block_size of 2:
+// The remappings are 1-D tensors with the following properties:
 //
-// ```
-// x = [[[[1], [3]], [[9], [11]]],
-//      [[[2], [4]], [[10], [12]]],
-//      [[[5], [7]], [[13], [15]]],
-//      [[[6], [8]], [[14], [16]]]]
-// ```
+// * `row_remapping` must have exactly `num_rows` entries. Row `i` of the output
+//   matrix will be initialized from the row corresponding to index
+//   `row_remapping[i]` in the old `Tensor` from the checkpoint.
+// * `col_remapping` must have either 0 entries (indicating that no column
+//   reordering is needed) or `num_cols` entries. If specified, column `j` of the
+//   output matrix will be initialized from the column corresponding to index
+//   `col_remapping[j]` in the old `Tensor` from the checkpoint.
+// * A value of -1 in either of the remappings signifies a "missing" entry. In that
+//   case, values from the `initializing_values` tensor will be used to fill that
+//   missing row or column. If `row_remapping` has `r` missing entries and
+//   `col_remapping` has `c` missing entries, then the following condition must be
+//   true:
 //
-// The output tensor has shape `[1, 4, 4, 1]` and value:
+// `(r * num_cols) + (c * num_rows) - (r * c) == len(initializing_values)`
 //
-// ```
-// x = [[[1],   [2],  [3],  [4]],
-//      [[5],   [6],  [7],  [8]],
-//      [[9],  [10], [11],  [12]],
-//      [[13], [14], [15],  [16]]]
-// ```
+// The remapping tensors can be generated using the GenerateVocabRemapping op.
 //
-// (4) For the following input of shape `[8, 1, 2, 1]` and block_size of 2:
+// As an example, with row_remapping = [1, 0, -1], col_remapping = [0, 2, -1],
+// initializing_values = [0.5, -0.5, 0.25, -0.25, 42], and w(i, j) representing
+// the value from row i, column j of the old tensor in the checkpoint, the output
+// matrix will look like the following:
 //
-// ```
-// x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],
-//      [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]
-// ```
+// [[w(1, 0),  w(1, 2),  0.5],
+//  [w(0, 0),  w(0, 2), -0.5],
+//  [0.25,    -0.25,      42]]
 //
-// The output tensor has shape `[2, 2, 4, 1]` and value:
+// Arguments:
+//	ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`) from
+// which the old matrix `Tensor` will be loaded.
+//	old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint.
+//	row_remapping: An int `Tensor` of row remappings (generally created by
+// `generate_vocab_remapping`).  Even if no row remapping is needed, this must
+// still be an index-valued Tensor (e.g. [0, 1, 2, ...]), or a shifted
+// index-valued `Tensor` (e.g. [8, 9, 10, ...], for partitioned `Variables`).
+//	col_remapping: An int `Tensor` of column remappings (generally created by
+// `generate_vocab_remapping`).  May be a size-0 `Tensor` if only row remapping
+// is to be done (e.g. column ordering is the same).
+//	initializing_values: A float `Tensor` containing  values to fill in for cells
+// in the output matrix that are not loaded from the checkpoint. Length must be
+// exactly the same as the number of missing / new cells.
+//	num_rows: Number of rows (length of the 1st dimension) in the output matrix.
+//	num_cols: Number of columns (length of the 2nd dimension) in the output matrix.
 //
-// ```
-// x = [[[[1], [3]], [[5], [7]]],
-//      [[[2], [4]], [[10], [12]]],
-//      [[[5], [7]], [[13], [15]]],
-//      [[[6], [8]], [[14], [16]]]]
-// ```
-func BatchToSpace(scope *Scope, input tf.Output, crops tf.Output, block_size int64) (output tf.Output) {
+// Returns Output matrix containing existing values loaded from the
+// checkpoint, and with any missing values filled in from initializing_values.
+func LoadAndRemapMatrix(scope *Scope, ckpt_path tf.Output, old_tensor_name tf.Output, row_remapping tf.Output, col_remapping tf.Output, initializing_values tf.Output, num_rows int64, num_cols int64, optional ...LoadAndRemapMatrixAttr) (output_matrix tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"block_size": block_size}
+	attrs := map[string]interface{}{"num_rows": num_rows, "num_cols": num_cols}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "BatchToSpace",
+		Type: "LoadAndRemapMatrix",
 		Input: []tf.Input{
-			input, crops,
+			ckpt_path, old_tensor_name, row_remapping, col_remapping, initializing_values,
 		},
 		Attrs: attrs,
 	}
@@ -30938,541 +35686,584 @@ func BatchToSpace(scope *Scope, input tf.Output, crops tf.Output, block_size int
 	return op.Output(0)
 }
 
-// Makes a new iterator from the given `dataset` and stores it in `iterator`.
+// TFRecordReaderV2Attr is an optional argument to TFRecordReaderV2.
+type TFRecordReaderV2Attr func(optionalAttr)
+
+// TFRecordReaderV2Container sets the optional container attribute to value.
 //
-// This operation may be executed multiple times. Each execution will reset the
-// iterator in `iterator` to the first element of `dataset`.
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func TFRecordReaderV2Container(value string) TFRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// TFRecordReaderV2SharedName sets the optional shared_name attribute to value.
 //
-// Returns the created operation.
-func MakeIterator(scope *Scope, dataset tf.Output, iterator tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func TFRecordReaderV2SharedName(value string) TFRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "MakeIterator",
-		Input: []tf.Input{
-			dataset, iterator,
-		},
+}
+
+// TFRecordReaderV2CompressionType sets the optional compression_type attribute to value.
+// If not specified, defaults to ""
+func TFRecordReaderV2CompressionType(value string) TFRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["compression_type"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// Adjust the contrast of one or more images.
-//
-// `images` is a tensor of at least 3 dimensions.  The last 3 dimensions are
-// interpreted as `[height, width, channels]`.  The other dimensions only
-// represent a collection of images, such as `[batch, height, width, channels].`
-//
-// Contrast is adjusted independently for each channel of each image.
-//
-// For each channel, the Op first computes the mean of the image pixels in the
-// channel and then adjusts each component of each pixel to
-// `(x - mean) * contrast_factor + mean`.
-//
-// Arguments:
-//	images: Images to adjust.  At least 3-D.
-//	contrast_factor: A float multiplier for adjusting contrast.
+// A Reader that outputs the records from a TensorFlow Records file.
 //
-// Returns The contrast-adjusted image or images.
-func AdjustContrastv2(scope *Scope, images tf.Output, contrast_factor tf.Output) (output tf.Output) {
+// Returns The handle to reference the Reader.
+func TFRecordReaderV2(scope *Scope, optional ...TFRecordReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "AdjustContrastv2",
-		Input: []tf.Input{
-			images, contrast_factor,
-		},
+		Type: "TFRecordReaderV2",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Gets the next output from the given iterator .
-func IteratorGetNext(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
+// QuantizeAndDequantizeV3Attr is an optional argument to QuantizeAndDequantizeV3.
+type QuantizeAndDequantizeV3Attr func(optionalAttr)
+
+// QuantizeAndDequantizeV3SignedInput sets the optional signed_input attribute to value.
+// If not specified, defaults to true
+func QuantizeAndDequantizeV3SignedInput(value bool) QuantizeAndDequantizeV3Attr {
+	return func(m optionalAttr) {
+		m["signed_input"] = value
+	}
+}
+
+// QuantizeAndDequantizeV3RangeGiven sets the optional range_given attribute to value.
+// If not specified, defaults to true
+func QuantizeAndDequantizeV3RangeGiven(value bool) QuantizeAndDequantizeV3Attr {
+	return func(m optionalAttr) {
+		m["range_given"] = value
+	}
+}
+
+// Quantizes then dequantizes a tensor.
+//
+// This is almost identical to QuantizeAndDequantizeV2, except that num_bits is a
+// tensor, so its value can change during training.
+func QuantizeAndDequantizeV3(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, num_bits tf.Output, optional ...QuantizeAndDequantizeV3Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "IteratorGetNext",
+		Type: "QuantizeAndDequantizeV3",
 		Input: []tf.Input{
-			iterator,
+			input, input_min, input_max, num_bits,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("IteratorGetNext", err)
-		return
-	}
-	return components
+	return op.Output(0)
 }
 
-// Outputs the single element from the given dataset.
+// IdentityReaderV2Attr is an optional argument to IdentityReaderV2.
+type IdentityReaderV2Attr func(optionalAttr)
+
+// IdentityReaderV2Container sets the optional container attribute to value.
 //
-// Arguments:
-//	dataset: A handle to a dataset that contains a single element.
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func IdentityReaderV2Container(value string) IdentityReaderV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// IdentityReaderV2SharedName sets the optional shared_name attribute to value.
 //
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func IdentityReaderV2SharedName(value string) IdentityReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A Reader that outputs the queued work as both the key and value.
 //
+// To use, enqueue strings in a Queue.  ReaderRead will take the front
+// work string and output (work, work).
 //
-// Returns The components of the single element of `input`.
-func DatasetToSingleElement(scope *Scope, dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
+// Returns The handle to reference the Reader.
+func IdentityReaderV2(scope *Scope, optional ...IdentityReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "DatasetToSingleElement",
-		Input: []tf.Input{
-			dataset,
-		},
+		Type: "IdentityReaderV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("DatasetToSingleElement", err)
-		return
+	return op.Output(0)
+}
+
+// ResourceApplyGradientDescentAttr is an optional argument to ResourceApplyGradientDescent.
+type ResourceApplyGradientDescentAttr func(optionalAttr)
+
+// ResourceApplyGradientDescentUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, the subtraction will be protected by a lock;
+// otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyGradientDescentUseLocking(value bool) ResourceApplyGradientDescentAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
 	}
-	return components
 }
 
-// Converts the given `resource_handle` representing an iterator to a string.
+// Update '*var' by subtracting 'alpha' * 'delta' from it.
 //
 // Arguments:
-//	resource_handle: A handle to an iterator resource.
+//	var_: Should be from a Variable().
+//	alpha: Scaling factor. Must be a scalar.
+//	delta: The change.
 //
-// Returns A string representation of the given handle.
-func IteratorToStringHandle(scope *Scope, resource_handle tf.Output) (string_handle tf.Output) {
+// Returns the created operation.
+func ResourceApplyGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, delta tf.Output, optional ...ResourceApplyGradientDescentAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "IteratorToStringHandle",
+		Type: "ResourceApplyGradientDescent",
 		Input: []tf.Input{
-			resource_handle,
+			var_, alpha, delta,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// IteratorFromStringHandleAttr is an optional argument to IteratorFromStringHandle.
-type IteratorFromStringHandleAttr func(optionalAttr)
-
-// IteratorFromStringHandleOutputTypes sets the optional output_types attribute to value.
-//
-// value: If specified, defines the type of each tuple component in an
-// element produced by the resulting iterator.
-// If not specified, defaults to <>
+// Returns the next record (key, value pair) produced by a Reader.
 //
-// REQUIRES: len(value) >= 0
-func IteratorFromStringHandleOutputTypes(value []tf.DataType) IteratorFromStringHandleAttr {
-	return func(m optionalAttr) {
-		m["output_types"] = value
-	}
-}
-
-// IteratorFromStringHandleOutputShapes sets the optional output_shapes attribute to value.
+// Will dequeue from the input queue if necessary (e.g. when the
+// Reader needs to start reading from a new file since it has finished
+// with the previous file).
 //
-// value: If specified, defines the shape of each tuple component in an
-// element produced by the resulting iterator.
-// If not specified, defaults to <>
+// Arguments:
+//	reader_handle: Handle to a Reader.
+//	queue_handle: Handle to a Queue, with string work items.
 //
-// REQUIRES: len(value) >= 0
-func IteratorFromStringHandleOutputShapes(value []tf.Shape) IteratorFromStringHandleAttr {
-	return func(m optionalAttr) {
-		m["output_shapes"] = value
+// Returns A scalar.A scalar.
+func ReaderReadV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output) (key tf.Output, value tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReaderReadV2",
+		Input: []tf.Input{
+			reader_handle, queue_handle,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
 }
 
-// Converts the given string representing a handle to an iterator to a resource.
+// Returns up to `num_records` (key, value) pairs produced by a Reader.
+//
+// Will dequeue from the input queue if necessary (e.g. when the
+// Reader needs to start reading from a new file since it has finished
+// with the previous file).
+// It may return less than `num_records` even before the last batch.
 //
 // Arguments:
-//	string_handle: A string representation of the given handle.
+//	reader_handle: Handle to a `Reader`.
+//	queue_handle: Handle to a `Queue`, with string work items.
+//	num_records: number of records to read from `Reader`.
 //
-// Returns A handle to an iterator resource.
-func IteratorFromStringHandle(scope *Scope, string_handle tf.Output, optional ...IteratorFromStringHandleAttr) (resource_handle tf.Output) {
+// Returns A 1-D tensor.A 1-D tensor.
+func ReaderReadUpToV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output, num_records tf.Output) (keys tf.Output, values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "IteratorFromStringHandle",
+		Type: "ReaderReadUpToV2",
 		Input: []tf.Input{
-			string_handle,
+			reader_handle, queue_handle, num_records,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Gather slices from `params` axis `axis` according to `indices`.
-//
-// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
-// Produces an output tensor with shape `params.shape[:axis] + indices.shape +
-// params.shape[axis + 1:]` where:
-//
-// ```python
-//     # Scalar indices (output is rank(params) - 1).
-//     output[a_0, ..., a_n, b_0, ..., b_n] =
-//       params[a_0, ..., a_n, indices, b_0, ..., b_n]
-//
-//     # Vector indices (output is rank(params)).
-//     output[a_0, ..., a_n, i, b_0, ..., b_n] =
-//       params[a_0, ..., a_n, indices[i], b_0, ..., b_n]
-//
-//     # Higher rank indices (output is rank(params) + rank(indices) - 1).
-//     output[a_0, ..., a_n, i, ..., j, b_0, ... b_n] =
-//       params[a_0, ..., a_n, indices[i, ..., j], b_0, ..., b_n]
-// ```
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
-// </div>
-//
-// Note that on CPU, if an out of bound index is found, an error is returned.
-// On GPU, if an out of bound index is found, a 0 is stored in the
-// corresponding output value.
+//     Adds v into specified rows of x.
 //
-// See also `tf.batch_gather` and `tf.gather_nd`.
+//     Computes y = x; y[i, :] += v; return y.
 //
 // Arguments:
-//	params: The tensor from which to gather values. Must be at least rank
-// `axis + 1`.
-//	indices: Index tensor. Must be in range `[0, params.shape[axis])`.
-//	axis: The axis in `params` to gather `indices` from. Defaults to the first
-// dimension. Supports negative indexes.
+//	x: A `Tensor` of type T.
+//	i: A vector. Indices into the left-most dimension of `x`.
+//	v: A `Tensor` of type T. Same dimension sizes as x except the first dimension, which must be the same as i's size.
 //
-// Returns Values from `params` gathered from indices given by `indices`, with
-// shape `params.shape[:axis] + indices.shape + params.shape[axis + 1:]`.
-func GatherV2(scope *Scope, params tf.Output, indices tf.Output, axis tf.Output) (output tf.Output) {
+// Returns A `Tensor` of type T. An alias of `x`. The content of `y` is undefined if there are duplicates in `i`.
+func InplaceAdd(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "GatherV2",
+		Type: "InplaceAdd",
 		Input: []tf.Input{
-			params, indices, axis,
+			x, i, v,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Converts the given `resource_handle` representing an iterator to a variant tensor.
+// Restore a Reader to its initial clean state.
 //
 // Arguments:
-//	resource_handle: A handle to an iterator resource.
+//	reader_handle: Handle to a Reader.
 //
-// Returns A variant tensor storing the state of the iterator contained in the
-// resource.
-func SerializeIterator(scope *Scope, resource_handle tf.Output) (serialized tf.Output) {
+// Returns the created operation.
+func ReaderResetV2(scope *Scope, reader_handle tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SerializeIterator",
+		Type: "ReaderResetV2",
 		Input: []tf.Input{
-			resource_handle,
+			reader_handle,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// FIFOQueueV2Attr is an optional argument to FIFOQueueV2.
-type FIFOQueueV2Attr func(optionalAttr)
+// BatchAttr is an optional argument to Batch.
+type BatchAttr func(optionalAttr)
 
-// FIFOQueueV2Shapes sets the optional shapes attribute to value.
-//
-// value: The shape of each component in a value. The length of this attr must
-// be either 0 or the same as the length of component_types. If the length of
-// this attr is 0, the shapes of queue elements are not constrained, and
-// only one element may be dequeued at a time.
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func FIFOQueueV2Shapes(value []tf.Shape) FIFOQueueV2Attr {
+// BatchMaxEnqueuedBatches sets the optional max_enqueued_batches attribute to value.
+// If not specified, defaults to 10
+func BatchMaxEnqueuedBatches(value int64) BatchAttr {
 	return func(m optionalAttr) {
-		m["shapes"] = value
+		m["max_enqueued_batches"] = value
 	}
 }
 
-// FIFOQueueV2Capacity sets the optional capacity attribute to value.
-//
-// value: The upper bound on the number of elements in this queue.
-// Negative numbers mean no limit.
-// If not specified, defaults to -1
-func FIFOQueueV2Capacity(value int64) FIFOQueueV2Attr {
+// BatchAllowedBatchSizes sets the optional allowed_batch_sizes attribute to value.
+// If not specified, defaults to <>
+func BatchAllowedBatchSizes(value []int64) BatchAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["allowed_batch_sizes"] = value
 	}
 }
 
-// FIFOQueueV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this queue is placed in the given container.
-// Otherwise, a default container is used.
+// BatchContainer sets the optional container attribute to value.
 // If not specified, defaults to ""
-func FIFOQueueV2Container(value string) FIFOQueueV2Attr {
+func BatchContainer(value string) BatchAttr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// FIFOQueueV2SharedName sets the optional shared_name attribute to value.
-//
-// value: If non-empty, this queue will be shared under the given name
-// across multiple sessions.
+// BatchSharedName sets the optional shared_name attribute to value.
 // If not specified, defaults to ""
-func FIFOQueueV2SharedName(value string) FIFOQueueV2Attr {
+func BatchSharedName(value string) BatchAttr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// A queue that produces elements in first-in first-out order.
+// BatchBatchingQueue sets the optional batching_queue attribute to value.
+// If not specified, defaults to ""
+func BatchBatchingQueue(value string) BatchAttr {
+	return func(m optionalAttr) {
+		m["batching_queue"] = value
+	}
+}
+
+// Batches all input tensors nondeterministically.
 //
-// Arguments:
-//	component_types: The type of each component in a value.
+// When many instances of this Op are being run concurrently with the same
+// container/shared_name in the same device, some will output zero-shaped Tensors
+// and others will output Tensors of size up to max_batch_size.
 //
-// Returns The handle to the queue.
-func FIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...FIFOQueueV2Attr) (handle tf.Output) {
+// All Tensors in in_tensors are batched together (so, for example, labels and
+// features should be batched with a single instance of this operation.
+//
+// Each invocation of batch emits an `id` scalar which will be used to identify
+// this particular invocation when doing unbatch or its gradient.
+//
+// Each op which emits a non-empty batch will also emit a non-empty batch_index
+// Tensor, which, is a [K, 3] matrix where each row contains the invocation's id,
+// start, and length of elements of each set of Tensors present in batched_tensors.
+//
+// Batched tensors are concatenated along the first dimension, and all tensors in
+// in_tensors must have the first dimension of the same size.
+//
+// in_tensors: The tensors to be batched.
+// num_batch_threads: Number of scheduling threads for processing batches of work.
+//  Determines the number of batches processed in parallel.
+// max_batch_size: Batch sizes will never be bigger than this.
+// batch_timeout_micros: Maximum number of microseconds to wait before outputting
+//  an incomplete batch.
+// allowed_batch_sizes: Optional list of allowed batch sizes. If left empty, does
+//  nothing. Otherwise, supplies a list of batch sizes, causing the op to pad
+//  batches up to one of those sizes. The entries must increase monotonically, and
+//  the final entry must equal max_batch_size.
+// grad_timeout_micros: The timeout to use for the gradient. See Unbatch.
+// batched_tensors: Either empty tensors or a batch of concatenated Tensors.
+// batch_index: If out_tensors is non-empty, has information to invert it.
+// container: Controls the scope of sharing of this batch.
+// id: always contains a scalar with a unique ID for this invocation of Batch.
+// shared_name: Concurrently running instances of batch in the same device with the
+//  same container and shared_name will batch their elements together. If left
+//  empty, the op name will be used as the shared name.
+// T: the types of tensors to be batched.
+func Batch(scope *Scope, in_tensors []tf.Output, num_batch_threads int64, max_batch_size int64, batch_timeout_micros int64, grad_timeout_micros int64, optional ...BatchAttr) (batched_tensors []tf.Output, batch_index tf.Output, id tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
+	attrs := map[string]interface{}{"num_batch_threads": num_batch_threads, "max_batch_size": max_batch_size, "batch_timeout_micros": batch_timeout_micros, "grad_timeout_micros": grad_timeout_micros}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FIFOQueueV2",
-
+		Type: "Batch",
+		Input: []tf.Input{
+			tf.OutputList(in_tensors),
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if batched_tensors, idx, err = makeOutputList(op, idx, "batched_tensors"); err != nil {
+		scope.UpdateErr("Batch", err)
+		return
+	}
+	batch_index = op.Output(idx)
+	id = op.Output(idx)
+	return batched_tensors, batch_index, id
 }
 
-// Constructs an Optional variant from a tuple of tensors.
-func OptionalFromValue(scope *Scope, components []tf.Output) (optional tf.Output) {
+// Adjust the hue of one or more images.
+//
+// `images` is a tensor of at least 3 dimensions.  The last dimension is
+// interpretted as channels, and must be three.
+//
+// The input image is considered in the RGB colorspace. Conceptually, the RGB
+// colors are first mapped into HSV. A delta is then applied all the hue values,
+// and then remapped back to RGB colorspace.
+//
+// Arguments:
+//	images: Images to adjust.  At least 3-D.
+//	delta: A float delta to add to the hue.
+//
+// Returns The hue-adjusted image or images.
+func AdjustHue(scope *Scope, images tf.Output, delta tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "OptionalFromValue",
+		Type: "AdjustHue",
 		Input: []tf.Input{
-			tf.OutputList(components),
+			images, delta,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DecodeProtoV2Attr is an optional argument to DecodeProtoV2.
-type DecodeProtoV2Attr func(optionalAttr)
-
-// DecodeProtoV2DescriptorSource sets the optional descriptor_source attribute to value.
-//
-// value: Either the special value `local://` or a path to a file containing
-// a serialized `FileDescriptorSet`.
-// If not specified, defaults to "local://"
-func DecodeProtoV2DescriptorSource(value string) DecodeProtoV2Attr {
-	return func(m optionalAttr) {
-		m["descriptor_source"] = value
-	}
-}
-
-// DecodeProtoV2MessageFormat sets the optional message_format attribute to value.
-//
-// value: Either `binary` or `text`.
-// If not specified, defaults to "binary"
-func DecodeProtoV2MessageFormat(value string) DecodeProtoV2Attr {
-	return func(m optionalAttr) {
-		m["message_format"] = value
-	}
-}
+// ResizeBicubicGradAttr is an optional argument to ResizeBicubicGrad.
+type ResizeBicubicGradAttr func(optionalAttr)
 
-// DecodeProtoV2Sanitize sets the optional sanitize attribute to value.
+// ResizeBicubicGradAlignCorners sets the optional align_corners attribute to value.
 //
-// value: Whether to sanitize the result or not.
+// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
+// aligned. Defaults to false.
 // If not specified, defaults to false
-func DecodeProtoV2Sanitize(value bool) DecodeProtoV2Attr {
+func ResizeBicubicGradAlignCorners(value bool) ResizeBicubicGradAttr {
 	return func(m optionalAttr) {
-		m["sanitize"] = value
+		m["align_corners"] = value
 	}
 }
 
-// The op extracts fields from a serialized protocol buffers message into tensors.
-//
-// The `decode_proto` op extracts fields from a serialized protocol buffers
-// message into tensors.  The fields in `field_names` are decoded and converted
-// to the corresponding `output_types` if possible.
-//
-// A `message_type` name must be provided to give context for the field
-// names. The actual message descriptor can be looked up either in the
-// linked-in descriptor pool or a filename provided by the caller using
-// the `descriptor_source` attribute.
-//
-// Each output tensor is a dense tensor. This means that it is padded to
-// hold the largest number of repeated elements seen in the input
-// minibatch. (The shape is also padded by one to prevent zero-sized
-// dimensions). The actual repeat counts for each example in the
-// minibatch can be found in the `sizes` output. In many cases the output
-// of `decode_proto` is fed immediately into tf.squeeze if missing values
-// are not a concern. When using tf.squeeze, always pass the squeeze
-// dimension explicitly to avoid surprises.
-//
-// For the most part, the mapping between Proto field types and
-// TensorFlow dtypes is straightforward. However, there are a few
-// special cases:
-//
-// - A proto field that contains a submessage or group can only be converted
-// to `DT_STRING` (the serialized submessage). This is to reduce the
-// complexity of the API. The resulting string can be used as input
-// to another instance of the decode_proto op.
-//
-// - TensorFlow lacks support for unsigned integers. The ops represent uint64
-// types as a `DT_INT64` with the same twos-complement bit pattern
-// (the obvious way). Unsigned int32 values can be represented exactly by
-// specifying type `DT_INT64`, or using twos-complement if the caller
-// specifies `DT_INT32` in the `output_types` attribute.
-//
-// The `descriptor_source` attribute selects a source of protocol
-// descriptors to consult when looking up `message_type`. This may be a
-// filename containing a serialized `FileDescriptorSet` message,
-// or the special value `local://`, in which case only descriptors linked
-// into the code will be searched; the filename can be on any filesystem
-// accessible to TensorFlow.
-//
-// You can build a `descriptor_source` file using the `--descriptor_set_out`
-// and `--include_imports` options to the protocol compiler `protoc`.
-//
-// The `local://` database only covers descriptors linked into the
-// code via C++ libraries, not Python imports. You can link in a proto descriptor
-// by creating a cc_library target with alwayslink=1.
-//
-// Both binary and text proto serializations are supported, and can be
-// chosen using the `format` attribute.
+// Computes the gradient of bicubic interpolation.
 //
 // Arguments:
-//	bytes: Tensor of serialized protos with shape `batch_shape`.
-//	message_type: Name of the proto message type to decode.
-//	field_names: List of strings containing proto field names.
-//	output_types: List of TF types to use for the respective field in field_names.
+//	grads: 4-D with shape `[batch, height, width, channels]`.
+//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
+// The image tensor that was resized.
 //
-// Returns Tensor of int32 with shape `[batch_shape, len(field_names)]`.
-// Each entry is the number of values found for the corresponding field.
-// Optional fields may have 0 or 1 values.List of tensors containing values for the corresponding field.
-// `values[i]` has datatype `output_types[i]`
-// and shape `[batch_shape, max(sizes[...,i])]`.
-func DecodeProtoV2(scope *Scope, bytes tf.Output, message_type string, field_names []string, output_types []tf.DataType, optional ...DecodeProtoV2Attr) (sizes tf.Output, values []tf.Output) {
+// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
+// Gradients with respect to the input image. Input image must have been
+// float or double.
+func ResizeBicubicGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBicubicGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"message_type": message_type, "field_names": field_names, "output_types": output_types}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeProtoV2",
+		Type: "ResizeBicubicGrad",
 		Input: []tf.Input{
-			bytes,
+			grads, original_image,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	sizes = op.Output(idx)
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("DecodeProtoV2", err)
-		return
-	}
-	return sizes, values
+	return op.Output(0)
 }
 
-// Creates an Optional variant with no value.
-func OptionalNone(scope *Scope) (optional tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "OptionalNone",
+// ResizeNearestNeighborAttr is an optional argument to ResizeNearestNeighbor.
+type ResizeNearestNeighborAttr func(optionalAttr)
+
+// ResizeNearestNeighborAlignCorners sets the optional align_corners attribute to value.
+//
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
+// If not specified, defaults to false
+func ResizeNearestNeighborAlignCorners(value bool) ResizeNearestNeighborAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Returns true if and only if the given Optional variant has a value.
-func OptionalHasValue(scope *Scope, optional tf.Output) (has_value tf.Output) {
+// Resize `images` to `size` using nearest neighbor interpolation.
+//
+// Arguments:
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
+//
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeNearestNeighbor(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeNearestNeighborAttr) (resized_images tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "OptionalHasValue",
+		Type: "ResizeNearestNeighbor",
 		Input: []tf.Input{
-			optional,
+			images, size,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the value stored in an Optional variant or raises an error if none exists.
-func OptionalGetValue(scope *Scope, optional tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
+// ResizeNearestNeighborGradAttr is an optional argument to ResizeNearestNeighborGrad.
+type ResizeNearestNeighborGradAttr func(optionalAttr)
+
+// ResizeNearestNeighborGradAlignCorners sets the optional align_corners attribute to value.
+//
+// value: If true, the centers of the 4 corner pixels of the input and grad tensors are
+// aligned. Defaults to false.
+// If not specified, defaults to false
+func ResizeNearestNeighborGradAlignCorners(value bool) ResizeNearestNeighborGradAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
+	}
+}
+
+// Computes the gradient of nearest neighbor interpolation.
+//
+// Arguments:
+//	grads: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `orig_height, orig_width`. The
+// original input size.
+//
+// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`. Gradients
+// with respect to the input image.
+func ResizeNearestNeighborGrad(scope *Scope, grads tf.Output, size tf.Output, optional ...ResizeNearestNeighborGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "OptionalGetValue",
+		Type: "ResizeNearestNeighborGrad",
 		Input: []tf.Input{
-			optional,
+			grads, size,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("OptionalGetValue", err)
-		return
+	return op.Output(0)
+}
+
+// ExtractJpegShapeAttr is an optional argument to ExtractJpegShape.
+type ExtractJpegShapeAttr func(optionalAttr)
+
+// ExtractJpegShapeOutputType sets the optional output_type attribute to value.
+//
+// value: (Optional) The output type of the operation (int32 or int64).
+// Defaults to int32.
+// If not specified, defaults to DT_INT32
+func ExtractJpegShapeOutputType(value tf.DataType) ExtractJpegShapeAttr {
+	return func(m optionalAttr) {
+		m["output_type"] = value
 	}
-	return components
 }
 
-// Gets the next output from the given iterator as an Optional variant.
-func IteratorGetNextAsOptional(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (optional tf.Output) {
+// Extract the shape information of a JPEG-encoded image.
+//
+// This op only parses the image header, so it is much faster than DecodeJpeg.
+//
+// Arguments:
+//	contents: 0-D. The JPEG-encoded image.
+//
+// Returns 1-D. The image shape with format [height, width, channels].
+func ExtractJpegShape(scope *Scope, contents tf.Output, optional ...ExtractJpegShapeAttr) (image_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "IteratorGetNextAsOptional",
+		Type: "ExtractJpegShape",
 		Input: []tf.Input{
-			iterator,
+			contents,
 		},
 		Attrs: attrs,
 	}
@@ -31480,703 +36271,844 @@ func IteratorGetNextAsOptional(scope *Scope, iterator tf.Output, output_types []
 	return op.Output(0)
 }
 
-// Fast Fourier transform.
+// DecodePngAttr is an optional argument to DecodePng.
+type DecodePngAttr func(optionalAttr)
+
+// DecodePngChannels sets the optional channels attribute to value.
+//
+// value: Number of color channels for the decoded image.
+// If not specified, defaults to 0
+func DecodePngChannels(value int64) DecodePngAttr {
+	return func(m optionalAttr) {
+		m["channels"] = value
+	}
+}
+
+// DecodePngDtype sets the optional dtype attribute to value.
+// If not specified, defaults to DT_UINT8
+func DecodePngDtype(value tf.DataType) DecodePngAttr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
+	}
+}
+
+// Decode a PNG-encoded image to a uint8 or uint16 tensor.
+//
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
+//
+// Accepted values are:
+//
+// *   0: Use the number of channels in the PNG-encoded image.
+// *   1: output a grayscale image.
+// *   3: output an RGB image.
+// *   4: output an RGBA image.
+//
+// If needed, the PNG-encoded image is transformed to match the requested number
+// of color channels.
 //
-// Computes the 1-dimensional discrete Fourier transform over the inner-most
-// dimension of `input`.
+// This op also supports decoding JPEGs and non-animated GIFs since the interface
+// is the same, though it is cleaner to use `tf.image.decode_image`.
 //
 // Arguments:
-//	input: A complex64 tensor.
-//
-// Returns A complex64 tensor of the same shape as `input`. The inner-most
-//   dimension of `input` is replaced with its 1D Fourier transform.
+//	contents: 0-D.  The PNG-encoded image.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.fft
-// @end_compatibility
-func FFT(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns 3-D with shape `[height, width, channels]`.
+func DecodePng(scope *Scope, contents tf.Output, optional ...DecodePngAttr) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "FFT",
+		Type: "DecodePng",
 		Input: []tf.Input{
-			input,
+			contents,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Identity transformation that models performance.
+// Decode the first frame of a GIF-encoded image to a uint8 tensor.
 //
-// Identity transformation that models performance.
+// GIF with frame or transparency compression are not supported
+// convert animated GIF from compressed to uncompressed by:
 //
-// Arguments:
-//	input_dataset: A variant tensor representing the input dataset.
+//     convert $src.gif -coalesce $dst.gif
 //
+// This op also supports decoding JPEGs and PNGs, though it is cleaner to use
+// `tf.image.decode_image`.
 //
-func ModelDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Arguments:
+//	contents: 0-D.  The GIF-encoded image.
+//
+// Returns 4-D with shape `[num_frames, height, width, 3]`. RGB order
+func DecodeGif(scope *Scope, contents tf.Output) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ModelDataset",
+		Type: "DecodeGif",
 		Input: []tf.Input{
-			input_dataset,
+			contents,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the truth value of (x > y) element-wise.
+// LearnedUnigramCandidateSamplerAttr is an optional argument to LearnedUnigramCandidateSampler.
+type LearnedUnigramCandidateSamplerAttr func(optionalAttr)
+
+// LearnedUnigramCandidateSamplerSeed sets the optional seed attribute to value.
 //
-// *NOTE*: `Greater` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Greater(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func LearnedUnigramCandidateSamplerSeed(value int64) LearnedUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "Greater",
-		Input: []tf.Input{
-			x, y,
-		},
+}
+
+// LearnedUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func LearnedUnigramCandidateSamplerSeed2(value int64) LearnedUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Performs a padding as a preprocess during a convolution.
+// Generates labels for candidate sampling with a learned unigram distribution.
 //
-// Similar to FusedResizeAndPadConv2d, this op allows for an optimized
-// implementation where the spatial padding transformation stage is fused with the
-// im2col lookup, but in this case without the bilinear filtering required for
-// resizing. Fusing the padding prevents the need to write out the intermediate
-// results as whole tensors, reducing memory pressure, and we can get some latency
-// gains by merging the transformation calculations.
-// The data_format attribute for Conv2D isn't supported by this op, and 'NHWC'
-// order is used instead.
-// Internally this op uses a single per-graph scratch buffer, which means that it
-// will block if multiple versions are being run in parallel. This is because this
-// operator is primarily an optimization to minimize memory usage.
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
+//
+// For each batch, this op picks a single set of sampled candidate labels.
+//
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
 //
 // Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
-//	paddings: A two-column matrix specifying the padding sizes. The number of
-// rows must be the same as the rank of `input`.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to randomly sample.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
 //
-//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
-// of `input`. Must be in the same order as the dimension specified with format.
-//	padding: The type of padding algorithm to use.
-func FusedPadConv2D(scope *Scope, input tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string) (output tf.Output) {
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func LearnedUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LearnedUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "FusedPadConv2D",
+		Type: "LearnedUnigramCandidateSampler",
 		Input: []tf.Input{
-			input, paddings, filter,
+			true_classes,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Conv2DBackpropInputAttr is an optional argument to Conv2DBackpropInput.
-type Conv2DBackpropInputAttr func(optionalAttr)
+// RandomShuffleQueueV2Attr is an optional argument to RandomShuffleQueueV2.
+type RandomShuffleQueueV2Attr func(optionalAttr)
 
-// Conv2DBackpropInputUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
-// If not specified, defaults to true
-func Conv2DBackpropInputUseCudnnOnGpu(value bool) Conv2DBackpropInputAttr {
+// RandomShuffleQueueV2Shapes sets the optional shapes attribute to value.
+//
+// value: The shape of each component in a value. The length of this attr must
+// be either 0 or the same as the length of component_types. If the length of
+// this attr is 0, the shapes of queue elements are not constrained, and
+// only one element may be dequeued at a time.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func RandomShuffleQueueV2Shapes(value []tf.Shape) RandomShuffleQueueV2Attr {
 	return func(m optionalAttr) {
-		m["use_cudnn_on_gpu"] = value
+		m["shapes"] = value
 	}
 }
 
-// Conv2DBackpropInputDataFormat sets the optional data_format attribute to value.
+// RandomShuffleQueueV2Capacity sets the optional capacity attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr {
+// value: The upper bound on the number of elements in this queue.
+// Negative numbers mean no limit.
+// If not specified, defaults to -1
+func RandomShuffleQueueV2Capacity(value int64) RandomShuffleQueueV2Attr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["capacity"] = value
 	}
 }
 
-// Conv2DBackpropInputDilations sets the optional dilations attribute to value.
+// RandomShuffleQueueV2MinAfterDequeue sets the optional min_after_dequeue attribute to value.
 //
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
-// element on that dimension. The dimension order is determined by the value of
-// `data_format`, see above for details. Dilations in the batch and depth
-// dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr {
+// value: Dequeue will block unless there would be this
+// many elements after the dequeue or the queue is closed. This
+// ensures a minimum level of mixing of elements.
+// If not specified, defaults to 0
+func RandomShuffleQueueV2MinAfterDequeue(value int64) RandomShuffleQueueV2Attr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["min_after_dequeue"] = value
 	}
 }
 
-// Computes the gradients of convolution with respect to the input.
+// RandomShuffleQueueV2Seed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 is set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, a random seed is used.
+// If not specified, defaults to 0
+func RandomShuffleQueueV2Seed(value int64) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// RandomShuffleQueueV2Seed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomShuffleQueueV2Seed2(value int64) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// RandomShuffleQueueV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this queue is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func RandomShuffleQueueV2Container(value string) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// RandomShuffleQueueV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this queue will be shared under the given name
+// across multiple sessions.
+// If not specified, defaults to ""
+func RandomShuffleQueueV2SharedName(value string) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A queue that randomizes the order of elements.
 //
 // Arguments:
-//	input_sizes: An integer vector representing the shape of `input`,
-// where `input` is a 4-D `[batch, height, width, channels]` tensor.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution. Must be in the same order as the dimension specified with
-// format.
-//	padding: The type of padding algorithm to use.
+//	component_types: The type of each component in a value.
 //
-// Returns 4-D with shape `[batch, in_height, in_width, in_channels]`.  Gradient
-// w.r.t. the input of the convolution.
-func Conv2DBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropInputAttr) (output tf.Output) {
+// Returns The handle to the queue.
+func RandomShuffleQueueV2(scope *Scope, component_types []tf.DataType, optional ...RandomShuffleQueueV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"component_types": component_types}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv2DBackpropInput",
-		Input: []tf.Input{
-			input_sizes, filter, out_backprop,
-		},
+		Type: "RandomShuffleQueueV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Interleave the values from the `data` tensors into a single tensor.
-//
-// Builds a merged tensor such that
-//
-// ```python
-//     merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]
-// ```
-//
-// For example, if each `indices[m]` is scalar or vector, we have
-//
-// ```python
-//     # Scalar indices:
-//     merged[indices[m], ...] = data[m][...]
-//
-//     # Vector indices:
-//     merged[indices[m][i], ...] = data[m][i, ...]
-// ```
-//
-// Each `data[i].shape` must start with the corresponding `indices[i].shape`,
-// and the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we
-// must have `data[i].shape = indices[i].shape + constant`.  In terms of this
-// `constant`, the output shape is
-//
-//     merged.shape = [max(indices)] + constant
-//
-// Values are merged in order, so if an index appears in both `indices[m][i]` and
-// `indices[n][j]` for `(m,i) < (n,j)` the slice `data[n][j]` will appear in the
-// merged result. If you do not need this guarantee, ParallelDynamicStitch might
-// perform better on some devices.
-//
-// For example:
-//
-// ```python
-//     indices[0] = 6
-//     indices[1] = [4, 1]
-//     indices[2] = [[5, 2], [0, 3]]
-//     data[0] = [61, 62]
-//     data[1] = [[41, 42], [11, 12]]
-//     data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]
-//     merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],
-//               [51, 52], [61, 62]]
-// ```
-//
-// This method can be used to merge partitions created by `dynamic_partition`
-// as illustrated on the following example:
+// SerializeSparseAttr is an optional argument to SerializeSparse.
+type SerializeSparseAttr func(optionalAttr)
+
+// SerializeSparseOutType sets the optional out_type attribute to value.
 //
-// ```python
-//     # Apply function (increments x_i) on elements for which a certain condition
-//     # apply (x_i != -1 in this example).
-//     x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])
-//     condition_mask=tf.not_equal(x,tf.constant(-1.))
-//     partitioned_data = tf.dynamic_partition(
-//         x, tf.cast(condition_mask, tf.int32) , 2)
-//     partitioned_data[1] = partitioned_data[1] + 1.0
-//     condition_indices = tf.dynamic_partition(
-//         tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
-//     x = tf.dynamic_stitch(condition_indices, partitioned_data)
-//     # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
-//     # unchanged.
-// ```
+// value: The `dtype` to use for serialization; the supported types are `string`
+// (default) and `variant`.
+// If not specified, defaults to DT_STRING
+func SerializeSparseOutType(value tf.DataType) SerializeSparseAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Serialize a `SparseTensor` into a `[3]` `Tensor` object.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
-// </div>
-func DynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged tf.Output) {
+// Arguments:
+//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
+//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
+func SerializeSparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeSparseAttr) (serialized_sparse tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "DynamicStitch",
+		Type: "SerializeSparse",
 		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(data),
+			sparse_indices, sparse_values, sparse_shape,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the truth value of (x == y) element-wise.
+// Draw bounding boxes on a batch of images.
 //
-// *NOTE*: `Equal` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Equal(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Outputs a copy of `images` but draws on top of the pixels zero or more bounding
+// boxes specified by the locations in `boxes`. The coordinates of the each
+// bounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The
+// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+// height of the underlying image.
+//
+// For example, if an image is 100 x 200 pixels (height x width) and the bounding
+// box is `[0.1, 0.2, 0.5, 0.9]`, the upper-left and bottom-right coordinates of
+// the bounding box will be `(40, 10)` to `(180, 50)` (in (x,y) coordinates).
+//
+// Parts of the bounding box may fall outside the image.
+//
+// Arguments:
+//	images: 4-D with shape `[batch, height, width, depth]`. A batch of images.
+//	boxes: 3-D with shape `[batch, num_bounding_boxes, 4]` containing bounding
+// boxes.
+//
+// Returns 4-D with the same shape as `images`. The batch of input images with
+// bounding boxes drawn on the images.
+func DrawBoundingBoxes(scope *Scope, images tf.Output, boxes tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Equal",
+		Type: "DrawBoundingBoxes",
 		Input: []tf.Input{
-			x, y,
+			images, boxes,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TensorArrayGatherV2Attr is an optional argument to TensorArrayGatherV2.
-type TensorArrayGatherV2Attr func(optionalAttr)
+// SampleDistortedBoundingBoxV2Attr is an optional argument to SampleDistortedBoundingBoxV2.
+type SampleDistortedBoundingBoxV2Attr func(optionalAttr)
 
-// TensorArrayGatherV2ElementShape sets the optional element_shape attribute to value.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayGatherV2ElementShape(value tf.Shape) TensorArrayGatherV2Attr {
+// SampleDistortedBoundingBoxV2Seed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to non-zero, the random number
+// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
+// seed.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxV2Seed(value int64) SampleDistortedBoundingBoxV2Attr {
 	return func(m optionalAttr) {
-		m["element_shape"] = value
+		m["seed"] = value
 	}
 }
 
-// Deprecated. Use TensorArrayGatherV3
+// SampleDistortedBoundingBoxV2Seed2 sets the optional seed2 attribute to value.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArrayGatherV3
-func TensorArrayGatherV2(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV2Attr) (value tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2Attr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayGatherV2",
-		Input: []tf.Input{
-			handle, indices, flow_in,
-		},
-		Attrs: attrs,
+}
+
+// SampleDistortedBoundingBoxV2AspectRatioRange sets the optional aspect_ratio_range attribute to value.
+//
+// value: The cropped area of the image must have an aspect ratio =
+// width / height within this range.
+// If not specified, defaults to <f:0.75 f:1.33 >
+func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr {
+	return func(m optionalAttr) {
+		m["aspect_ratio_range"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Interleave the values from the `data` tensors into a single tensor.
+// SampleDistortedBoundingBoxV2AreaRange sets the optional area_range attribute to value.
 //
-// Builds a merged tensor such that
+// value: The cropped area of the image must contain a fraction of the
+// supplied image within this range.
+// If not specified, defaults to <f:0.05 f:1 >
+func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr {
+	return func(m optionalAttr) {
+		m["area_range"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxV2MaxAttempts sets the optional max_attempts attribute to value.
 //
-// ```python
-//     merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]
-// ```
+// value: Number of attempts at generating a cropped region of the image
+// of the specified constraints. After `max_attempts` failures, return the entire
+// image.
+// If not specified, defaults to 100
+func SampleDistortedBoundingBoxV2MaxAttempts(value int64) SampleDistortedBoundingBoxV2Attr {
+	return func(m optionalAttr) {
+		m["max_attempts"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxV2UseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
 //
-// For example, if each `indices[m]` is scalar or vector, we have
+// value: Controls behavior if no bounding boxes supplied.
+// If true, assume an implicit bounding box covering the whole input. If false,
+// raise an error.
+// If not specified, defaults to false
+func SampleDistortedBoundingBoxV2UseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxV2Attr {
+	return func(m optionalAttr) {
+		m["use_image_if_no_bounding_boxes"] = value
+	}
+}
+
+// Generate a single randomly distorted bounding box for an image.
 //
-// ```python
-//     # Scalar indices:
-//     merged[indices[m], ...] = data[m][...]
+// Bounding box annotations are often supplied in addition to ground-truth labels
+// in image recognition or object localization tasks. A common technique for
+// training such a system is to randomly distort an image while preserving
+// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
+// localization of an object, i.e. bounding box, given an `image_size`,
+// `bounding_boxes` and a series of constraints.
 //
-//     # Vector indices:
-//     merged[indices[m][i], ...] = data[m][i, ...]
-// ```
+// The output of this Op is a single bounding box that may be used to crop the
+// original image. The output is returned as 3 tensors: `begin`, `size` and
+// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
+// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
+// what the bounding box looks like.
 //
-// Each `data[i].shape` must start with the corresponding `indices[i].shape`,
-// and the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we
-// must have `data[i].shape = indices[i].shape + constant`.  In terms of this
-// `constant`, the output shape is
+// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
+// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+// height of the underlying image.
 //
-//     merged.shape = [max(indices)] + constant
+// For example,
 //
-// Values may be merged in parallel, so if an index appears in both `indices[m][i]`
-// and `indices[n][j]`, the result may be invalid. This differs from the normal
-// DynamicStitch operator that defines the behavior in that case.
+// ```python
+//     # Generate a single distorted bounding box.
+//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
+//         tf.shape(image),
+//         bounding_boxes=bounding_boxes)
 //
-// For example:
+//     # Draw the bounding box in an image summary.
+//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
+//                                                   bbox_for_draw)
+//     tf.summary.image('images_with_box', image_with_box)
 //
-// ```python
-//     indices[0] = 6
-//     indices[1] = [4, 1]
-//     indices[2] = [[5, 2], [0, 3]]
-//     data[0] = [61, 62]
-//     data[1] = [[41, 42], [11, 12]]
-//     data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]
-//     merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],
-//               [51, 52], [61, 62]]
+//     # Employ the bounding box to distort the image.
+//     distorted_image = tf.slice(image, begin, size)
 // ```
 //
-// This method can be used to merge partitions created by `dynamic_partition`
-// as illustrated on the following example:
+// Note that if no bounding box information is available, setting
+// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
+// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
+// false and no bounding boxes are supplied, an error is raised.
 //
-// ```python
-//     # Apply function (increments x_i) on elements for which a certain condition
-//     # apply (x_i != -1 in this example).
-//     x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])
-//     condition_mask=tf.not_equal(x,tf.constant(-1.))
-//     partitioned_data = tf.dynamic_partition(
-//         x, tf.cast(condition_mask, tf.int32) , 2)
-//     partitioned_data[1] = partitioned_data[1] + 1.0
-//     condition_indices = tf.dynamic_partition(
-//         tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
-//     x = tf.dynamic_stitch(condition_indices, partitioned_data)
-//     # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
-//     # unchanged.
-// ```
+// Arguments:
+//	image_size: 1-D, containing `[height, width, channels]`.
+//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
+// associated with the image.
+//	min_object_covered: The cropped area of the image must contain at least this
+// fraction of any bounding box supplied. The value of this parameter should be
+// non-negative. In the case of 0, the cropped area does not need to overlap
+// any of the bounding boxes supplied.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
-// </div>
-func ParallelDynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged tf.Output) {
+// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
+// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
+// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
+// Provide as input to `tf.image.draw_bounding_boxes`.
+func SampleDistortedBoundingBoxV2(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, min_object_covered tf.Output, optional ...SampleDistortedBoundingBoxV2Attr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ParallelDynamicStitch",
+		Type: "SampleDistortedBoundingBoxV2",
 		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(data),
+			image_size, bounding_boxes, min_object_covered,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// PriorityQueueV2Attr is an optional argument to PriorityQueueV2.
-type PriorityQueueV2Attr func(optionalAttr)
-
-// PriorityQueueV2ComponentTypes sets the optional component_types attribute to value.
+// Computes requantization range per channel.
 //
-// value: The type of each component in a value.
-// If not specified, defaults to <>
+// Arguments:
+//	input: The original input tensor.
+//	input_min: The minimum value of the input tensor
+//	input_max: The maximum value of the input tensor.
+//	clip_value_max: The maximum value of the output that needs to be clipped.
+// Example: set this to 6 for Relu6.
 //
-// REQUIRES: len(value) >= 0
-func PriorityQueueV2ComponentTypes(value []tf.DataType) PriorityQueueV2Attr {
+// Returns The minimum value of the final output tensorThe maximum value of the final output tensor.
+func RequantizationRangePerChannel(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, clip_value_max float32) (output_min tf.Output, output_max tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"clip_value_max": clip_value_max}
+	opspec := tf.OpSpec{
+		Type: "RequantizationRangePerChannel",
+		Input: []tf.Input{
+			input, input_min, input_max,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// ExtractGlimpseAttr is an optional argument to ExtractGlimpse.
+type ExtractGlimpseAttr func(optionalAttr)
+
+// ExtractGlimpseCentered sets the optional centered attribute to value.
+//
+// value: indicates if the offset coordinates are centered relative to
+// the image, in which case the (0, 0) offset is relative to the center
+// of the input images. If false, the (0,0) offset corresponds to the
+// upper left corner of the input images.
+// If not specified, defaults to true
+func ExtractGlimpseCentered(value bool) ExtractGlimpseAttr {
 	return func(m optionalAttr) {
-		m["component_types"] = value
+		m["centered"] = value
 	}
 }
 
-// PriorityQueueV2Capacity sets the optional capacity attribute to value.
+// ExtractGlimpseNormalized sets the optional normalized attribute to value.
 //
-// value: The upper bound on the number of elements in this queue.
-// Negative numbers mean no limit.
-// If not specified, defaults to -1
-func PriorityQueueV2Capacity(value int64) PriorityQueueV2Attr {
+// value: indicates if the offset coordinates are normalized.
+// If not specified, defaults to true
+func ExtractGlimpseNormalized(value bool) ExtractGlimpseAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["normalized"] = value
 	}
 }
 
-// PriorityQueueV2Container sets the optional container attribute to value.
+// ExtractGlimpseUniformNoise sets the optional uniform_noise attribute to value.
 //
-// value: If non-empty, this queue is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func PriorityQueueV2Container(value string) PriorityQueueV2Attr {
+// value: indicates if the noise should be generated using a
+// uniform distribution or a Gaussian distribution.
+// If not specified, defaults to true
+func ExtractGlimpseUniformNoise(value bool) ExtractGlimpseAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["uniform_noise"] = value
 	}
 }
 
-// PriorityQueueV2SharedName sets the optional shared_name attribute to value.
+// ExtractGlimpseNoise sets the optional noise attribute to value.
 //
-// value: If non-empty, this queue will be shared under the given name
-// across multiple sessions.
-// If not specified, defaults to ""
-func PriorityQueueV2SharedName(value string) PriorityQueueV2Attr {
+// value: indicates if the noise should `uniform`, `gaussian`, or
+// `zero`. The default is `uniform` which means the the noise type
+// will be decided by `uniform_noise`.
+// If not specified, defaults to "uniform"
+func ExtractGlimpseNoise(value string) ExtractGlimpseAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["noise"] = value
 	}
 }
 
-// A queue that produces elements sorted by the first component value.
+// Extracts a glimpse from the input tensor.
 //
-// Note that the PriorityQueue requires the first component of any element
-// to be a scalar int64, in addition to the other elements declared by
-// component_types.  Therefore calls to Enqueue and EnqueueMany (resp. Dequeue
-// and DequeueMany) on a PriorityQueue will all require (resp. output) one extra
-// entry in their input (resp. output) lists.
+// Returns a set of windows called glimpses extracted at location
+// `offsets` from the input tensor. If the windows only partially
+// overlaps the inputs, the non overlapping areas will be filled with
+// random noise.
+//
+// The result is a 4-D tensor of shape `[batch_size, glimpse_height,
+// glimpse_width, channels]`. The channels and batch dimensions are the
+// same as that of the input tensor. The height and width of the output
+// windows are specified in the `size` parameter.
+//
+// The argument `normalized` and `centered` controls how the windows are built:
+//
+// * If the coordinates are normalized but not centered, 0.0 and 1.0
+//   correspond to the minimum and maximum of each height and width
+//   dimension.
+// * If the coordinates are both normalized and centered, they range from
+//   -1.0 to 1.0. The coordinates (-1.0, -1.0) correspond to the upper
+//   left corner, the lower right corner is located at (1.0, 1.0) and the
+//   center is at (0, 0).
+// * If the coordinates are not normalized they are interpreted as
+//   numbers of pixels.
 //
 // Arguments:
-//	shapes: The shape of each component in a value. The length of this attr must
-// be either 0 or the same as the length of component_types. If the length of
-// this attr is 0, the shapes of queue elements are not constrained, and
-// only one element may be dequeued at a time.
+//	input: A 4-D float tensor of shape `[batch_size, height, width, channels]`.
+//	size: A 1-D tensor of 2 elements containing the size of the glimpses
+// to extract.  The glimpse height must be specified first, following
+// by the glimpse width.
+//	offsets: A 2-D integer tensor of shape `[batch_size, 2]` containing
+// the y, x locations of the center of each window.
 //
-// Returns The handle to the queue.
-func PriorityQueueV2(scope *Scope, shapes []tf.Shape, optional ...PriorityQueueV2Attr) (handle tf.Output) {
+// Returns A tensor representing the glimpses `[batch_size,
+// glimpse_height, glimpse_width, channels]`.
+func ExtractGlimpse(scope *Scope, input tf.Output, size tf.Output, offsets tf.Output, optional ...ExtractGlimpseAttr) (glimpse tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"shapes": shapes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "PriorityQueueV2",
-
+		Type: "ExtractGlimpse",
+		Input: []tf.Input{
+			input, size, offsets,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// UnstageAttr is an optional argument to Unstage.
-type UnstageAttr func(optionalAttr)
-
-// UnstageCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// A container for an iterator resource.
 //
-// REQUIRES: value >= 0
-func UnstageCapacity(value int64) UnstageAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
+// Returns A handle to the iterator that can be passed to a "MakeIterator"
+// or "IteratorGetNext" op.
+func Iterator(scope *Scope, shared_name string, container string, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
+	attrs := map[string]interface{}{"shared_name": shared_name, "container": container, "output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "Iterator",
 
-// UnstageMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func UnstageMemoryLimit(value int64) UnstageAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// UnstageContainer sets the optional container attribute to value.
+// TensorForestTreeResourceHandleOpAttr is an optional argument to TensorForestTreeResourceHandleOp.
+type TensorForestTreeResourceHandleOpAttr func(optionalAttr)
+
+// TensorForestTreeResourceHandleOpContainer sets the optional container attribute to value.
 // If not specified, defaults to ""
-func UnstageContainer(value string) UnstageAttr {
+func TensorForestTreeResourceHandleOpContainer(value string) TensorForestTreeResourceHandleOpAttr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// UnstageSharedName sets the optional shared_name attribute to value.
+// TensorForestTreeResourceHandleOpSharedName sets the optional shared_name attribute to value.
 // If not specified, defaults to ""
-func UnstageSharedName(value string) UnstageAttr {
+func TensorForestTreeResourceHandleOpSharedName(value string) TensorForestTreeResourceHandleOpAttr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// Op is similar to a lightweight Dequeue.
-//
-// The basic functionality is similar to dequeue with many fewer
-// capabilities and options.  This Op is optimized for performance.
-func Unstage(scope *Scope, dtypes []tf.DataType, optional ...UnstageAttr) (values []tf.Output) {
+// Creates a handle to a TensorForestTreeResource
+func TensorForestTreeResourceHandleOp(scope *Scope, optional ...TensorForestTreeResourceHandleOpAttr) (resource tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Unstage",
+		Type: "TensorForestTreeResourceHandleOp",
 
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("Unstage", err)
-		return
-	}
-	return values
+	return op.Output(0)
 }
 
-// QueueEnqueueV2Attr is an optional argument to QueueEnqueueV2.
-type QueueEnqueueV2Attr func(optionalAttr)
+// CropAndResizeGradImageAttr is an optional argument to CropAndResizeGradImage.
+type CropAndResizeGradImageAttr func(optionalAttr)
 
-// QueueEnqueueV2TimeoutMs sets the optional timeout_ms attribute to value.
+// CropAndResizeGradImageMethod sets the optional method attribute to value.
 //
-// value: If the queue is full, this operation will block for up to
-// timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueEnqueueV2TimeoutMs(value int64) QueueEnqueueV2Attr {
+// value: A string specifying the interpolation method. Only 'bilinear' is
+// supported for now.
+// If not specified, defaults to "bilinear"
+func CropAndResizeGradImageMethod(value string) CropAndResizeGradImageAttr {
 	return func(m optionalAttr) {
-		m["timeout_ms"] = value
+		m["method"] = value
 	}
 }
 
-// Enqueues a tuple of one or more tensors in the given queue.
-//
-// The components input has k elements, which correspond to the components of
-// tuples stored in the given queue.
-//
-// N.B. If the queue is full, this operation will block until the given
-// element has been enqueued (or 'timeout_ms' elapses, if specified).
+// Computes the gradient of the crop_and_resize op wrt the input image tensor.
 //
 // Arguments:
-//	handle: The handle to a queue.
-//	components: One or more tensors from which the enqueued tensors should be taken.
+//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
+//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
+// specifies the coordinates of a box in the `box_ind[i]` image and is specified
+// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
+// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
+// `[0, 1]` interval of normalized image height is mapped to
+// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
+// which case the sampled crop is an up-down flipped version of the original
+// image. The width dimension is treated similarly. Normalized coordinates
+// outside the `[0, 1]` range are allowed, in which case we use
+// `extrapolation_value` to extrapolate the input image values.
+//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
+// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
+//	image_size: A 1-D tensor with value `[batch, image_height, image_width, depth]`
+// containing the original image size. Both `image_height` and `image_width` need
+// to be positive.
 //
-// Returns the created operation.
-func QueueEnqueueV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueV2Attr) (o *tf.Operation) {
+//
+// Returns A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
+func CropAndResizeGradImage(scope *Scope, grads tf.Output, boxes tf.Output, box_ind tf.Output, image_size tf.Output, T tf.DataType, optional ...CropAndResizeGradImageAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"T": T}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueEnqueueV2",
+		Type: "CropAndResizeGradImage",
 		Input: []tf.Input{
-			handle, tf.OutputList(components),
+			grads, boxes, box_ind, image_size,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// Computes the Bessel i0e function of `x` element-wise.
-//
-// Exponentially scaled modified Bessel function of order 0 defined as
-// `bessel_i0e(x) = exp(-abs(x)) bessel_i0(x)`.
-//
-// This function is faster and numerically stabler than `bessel_i0(x)`.
-func BesselI0e(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BesselI0e",
-		Input: []tf.Input{
-			x,
-		},
-	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QueueDequeueManyV2Attr is an optional argument to QueueDequeueManyV2.
-type QueueDequeueManyV2Attr func(optionalAttr)
+// ShuffleDatasetAttr is an optional argument to ShuffleDataset.
+type ShuffleDatasetAttr func(optionalAttr)
 
-// QueueDequeueManyV2TimeoutMs sets the optional timeout_ms attribute to value.
+// ShuffleDatasetReshuffleEachIteration sets the optional reshuffle_each_iteration attribute to value.
 //
-// value: If the queue has fewer than n elements, this operation
-// will block for up to timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueDequeueManyV2TimeoutMs(value int64) QueueDequeueManyV2Attr {
+// value: If true, each iterator over this dataset will be given
+// a different pseudorandomly generated seed, based on a sequence seeded by the
+// `seed` and `seed2` inputs. If false, each iterator will be given the same
+// seed, and repeated iteration over this dataset will yield the exact same
+// sequence of results.
+// If not specified, defaults to true
+func ShuffleDatasetReshuffleEachIteration(value bool) ShuffleDatasetAttr {
 	return func(m optionalAttr) {
-		m["timeout_ms"] = value
+		m["reshuffle_each_iteration"] = value
 	}
 }
 
-// Dequeues `n` tuples of one or more tensors from the given queue.
-//
-// If the queue is closed and there are fewer than `n` elements, then an
-// OutOfRange error is returned.
-//
-// This operation concatenates queue-element component tensors along the
-// 0th dimension to make a single component tensor.  All of the components
-// in the dequeued tuple will have size `n` in the 0th dimension.
+// Creates a dataset that shuffles elements from `input_dataset` pseudorandomly.
 //
-// This operation has `k` outputs, where `k` is the number of components in
-// the tuples stored in the given queue, and output `i` is the ith
-// component of the dequeued tuple.
+// Arguments:
 //
-// N.B. If the queue is empty, this operation will block until `n` elements
-// have been dequeued (or 'timeout_ms' elapses, if specified).
+//	buffer_size: The number of output elements to buffer in an iterator over
+// this dataset. Compare with the `min_after_dequeue` attr when creating a
+// `RandomShuffleQueue`.
+//	seed: A scalar seed for the random number generator. If either `seed` or
+// `seed2` is set to be non-zero, the random number generator is seeded
+// by the given seed.  Otherwise, a random seed is used.
+//	seed2: A second scalar seed to avoid seed collision.
 //
-// Arguments:
-//	handle: The handle to a queue.
-//	n: The number of tuples to dequeue.
-//	component_types: The type of each component in a tuple.
 //
-// Returns One or more tensors that were dequeued as a tuple.
-func QueueDequeueManyV2(scope *Scope, handle tf.Output, n tf.Output, component_types []tf.DataType, optional ...QueueDequeueManyV2Attr) (components []tf.Output) {
+func ShuffleDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, seed tf.Output, seed2 tf.Output, output_types []tf.DataType, output_shapes []tf.Shape, optional ...ShuffleDatasetAttr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueDequeueManyV2",
+		Type: "ShuffleDataset",
 		Input: []tf.Input{
-			handle, n,
+			input_dataset, buffer_size, seed, seed2,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// 3D fast Fourier transform.
+//
+// Computes the 3-dimensional discrete Fourier transform over the inner-most 3
+// dimensions of `input`.
+//
+// Arguments:
+//	input: A complex64 tensor.
+//
+// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
+//   dimensions of `input` are replaced with their 3D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.fftn with 3 dimensions.
+// @end_compatibility
+func FFT3D(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("QueueDequeueManyV2", err)
-		return
+	opspec := tf.OpSpec{
+		Type: "FFT3D",
+		Input: []tf.Input{
+			input,
+		},
 	}
-	return components
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// EncodeBase64Attr is an optional argument to EncodeBase64.
-type EncodeBase64Attr func(optionalAttr)
+// CropAndResizeGradBoxesAttr is an optional argument to CropAndResizeGradBoxes.
+type CropAndResizeGradBoxesAttr func(optionalAttr)
 
-// EncodeBase64Pad sets the optional pad attribute to value.
+// CropAndResizeGradBoxesMethod sets the optional method attribute to value.
 //
-// value: Bool whether padding is applied at the ends.
-// If not specified, defaults to false
-func EncodeBase64Pad(value bool) EncodeBase64Attr {
+// value: A string specifying the interpolation method. Only 'bilinear' is
+// supported for now.
+// If not specified, defaults to "bilinear"
+func CropAndResizeGradBoxesMethod(value string) CropAndResizeGradBoxesAttr {
 	return func(m optionalAttr) {
-		m["pad"] = value
+		m["method"] = value
 	}
 }
 
-// Encode strings into web-safe base64 format.
-//
-// Refer to the following article for more information on base64 format:
-// en.wikipedia.org/wiki/Base64. Base64 strings may have padding with '=' at the
-// end so that the encoded has length multiple of 4. See Padding section of the
-// link above.
-//
-// Web-safe means that the encoder uses - and _ instead of + and /.
+// Computes the gradient of the crop_and_resize op wrt the input boxes tensor.
 //
 // Arguments:
-//	input: Strings to be encoded.
+//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
+//	image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
+// Both `image_height` and `image_width` need to be positive.
+//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
+// specifies the coordinates of a box in the `box_ind[i]` image and is specified
+// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
+// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
+// `[0, 1]` interval of normalized image height is mapped to
+// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
+// which case the sampled crop is an up-down flipped version of the original
+// image. The width dimension is treated similarly. Normalized coordinates
+// outside the `[0, 1]` range are allowed, in which case we use
+// `extrapolation_value` to extrapolate the input image values.
+//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
+// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
 //
-// Returns Input strings encoded in base64.
-func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (output tf.Output) {
+// Returns A 2-D tensor of shape `[num_boxes, 4]`.
+func CropAndResizeGradBoxes(scope *Scope, grads tf.Output, image tf.Output, boxes tf.Output, box_ind tf.Output, optional ...CropAndResizeGradBoxesAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -32185,9 +37117,9 @@ func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "EncodeBase64",
+		Type: "CropAndResizeGradBoxes",
 		Input: []tf.Input{
-			input,
+			grads, image, boxes, box_ind,
 		},
 		Attrs: attrs,
 	}
@@ -32195,106 +37127,198 @@ func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (
 	return op.Output(0)
 }
 
-// A dataset that creates window datasets from the input dataset.
-//
-// Arguments:
+// Greedily selects a subset of bounding boxes in descending order of score,
 //
-//	size: A scalar representing the number of elements to accumulate in a window.
-//	shift: A scalar representing the steps moving the sliding window forward in one
-// iteration. It must be positive.
-//	stride: A scalar representing the stride of the input elements of the sliding window.
-// It must be positive.
-//	drop_remainder: A scalar representing whether a window should be dropped in case its size is
-// smaller than desired.
+// pruning away boxes that have high intersection-over-union (IOU) overlap
+// with previously selected boxes.  Bounding boxes with score less than
+// `score_threshold` are removed.  Bounding boxes are supplied as
+// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+// diagonal pair of box corners and the coordinates can be provided as normalized
+// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+// is agnostic to where the origin is in the coordinate system and more
+// generally is invariant to orthogonal transformations and translations
+// of the coordinate system; thus translating or reflections of the coordinate
+// system result in the same boxes being selected by the algorithm.
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather operation`.  For example:
+//   selected_indices = tf.image.non_max_suppression_v2(
+//       boxes, scores, max_output_size, iou_threshold, score_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
 //
+// Arguments:
+//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
+//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
+// boxes overlap too much with respect to IOU.
+//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
+// boxes based on score.
 //
-func WindowDataset(scope *Scope, input_dataset tf.Output, size tf.Output, shift tf.Output, stride tf.Output, drop_remainder tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns A 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`.
+func NonMaxSuppressionV3(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output, score_threshold tf.Output) (selected_indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "WindowDataset",
+		Type: "NonMaxSuppressionV3",
 		Input: []tf.Input{
-			input_dataset, size, shift, stride, drop_remainder,
+			boxes, scores, max_output_size, iou_threshold, score_threshold,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Deprecated. Use TensorArrayCloseV3
+// NonMaxSuppressionV4Attr is an optional argument to NonMaxSuppressionV4.
+type NonMaxSuppressionV4Attr func(optionalAttr)
+
+// NonMaxSuppressionV4PadToMaxOutputSize sets the optional pad_to_max_output_size attribute to value.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArrayCloseV3
+// value: If true, the output `selected_indices` is padded to be of length
+// `max_output_size`. Defaults to false.
+// If not specified, defaults to false
+func NonMaxSuppressionV4PadToMaxOutputSize(value bool) NonMaxSuppressionV4Attr {
+	return func(m optionalAttr) {
+		m["pad_to_max_output_size"] = value
+	}
+}
+
+// Greedily selects a subset of bounding boxes in descending order of score,
+//
+// pruning away boxes that have high intersection-over-union (IOU) overlap
+// with previously selected boxes.  Bounding boxes with score less than
+// `score_threshold` are removed.  Bounding boxes are supplied as
+// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+// diagonal pair of box corners and the coordinates can be provided as normalized
+// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+// is agnostic to where the origin is in the coordinate system and more
+// generally is invariant to orthogonal transformations and translations
+// of the coordinate system; thus translating or reflections of the coordinate
+// system result in the same boxes being selected by the algorithm.
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather operation`.  For example:
+//   selected_indices = tf.image.non_max_suppression_v2(
+//       boxes, scores, max_output_size, iou_threshold, score_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
+//
+// Arguments:
+//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
+//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
+// boxes overlap too much with respect to IOU.
+//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
+// boxes based on score.
 //
-// Returns the created operation.
-func TensorArrayCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
+// Returns A 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`.A 0-D integer tensor representing the number of valid elements in
+// `selected_indices`, with the valid elements appearing first.
+func NonMaxSuppressionV4(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output, score_threshold tf.Output, optional ...NonMaxSuppressionV4Attr) (selected_indices tf.Output, valid_outputs tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayCloseV2",
+		Type: "NonMaxSuppressionV4",
 		Input: []tf.Input{
-			handle,
+			boxes, scores, max_output_size, iou_threshold, score_threshold,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
 }
 
-// Forwards the value of an available tensor from `inputs` to `output`.
-//
-// `Merge` waits for at least one of the tensors in `inputs` to become available.
-// It is usually combined with `Switch` to implement branching.
+// Removes keys and its associated values from a table.
 //
-// `Merge` forwards the first tensor to become available to `output`, and sets
-// `value_index` to its index in `inputs`.
+// The tensor `keys` must of the same type as the keys of the table. Keys not
+// already in the table are silently ignored.
 //
 // Arguments:
-//	inputs: The input tensors, exactly one of which will become available.
+//	table_handle: Handle to the table.
+//	keys: Any shape.  Keys of the elements to remove.
 //
-// Returns Will be set to the available input tensor.The index of the chosen input tensor in `inputs`.
-func Merge(scope *Scope, inputs []tf.Output) (output tf.Output, value_index tf.Output) {
+// Returns the created operation.
+func LookupTableRemoveV2(scope *Scope, table_handle tf.Output, keys tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Merge",
+		Type: "LookupTableRemoveV2",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			table_handle, keys,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return scope.AddOperation(opspec)
 }
 
-// QueueCloseV2Attr is an optional argument to QueueCloseV2.
-type QueueCloseV2Attr func(optionalAttr)
+// CombinedNonMaxSuppressionAttr is an optional argument to CombinedNonMaxSuppression.
+type CombinedNonMaxSuppressionAttr func(optionalAttr)
 
-// QueueCloseV2CancelPendingEnqueues sets the optional cancel_pending_enqueues attribute to value.
+// CombinedNonMaxSuppressionPadPerClass sets the optional pad_per_class attribute to value.
 //
-// value: If true, all pending enqueue requests that are
-// blocked on the given queue will be canceled.
+// value: If false, the output nmsed boxes, scores and classes
+// are padded/clipped to `max_total_size`. If true, the
+// output nmsed boxes, scores and classes are padded to be of length
+// `max_size_per_class`*`num_classes`, unless it exceeds `max_total_size` in
+// which case it is clipped to `max_total_size`. Defaults to false.
 // If not specified, defaults to false
-func QueueCloseV2CancelPendingEnqueues(value bool) QueueCloseV2Attr {
+func CombinedNonMaxSuppressionPadPerClass(value bool) CombinedNonMaxSuppressionAttr {
 	return func(m optionalAttr) {
-		m["cancel_pending_enqueues"] = value
+		m["pad_per_class"] = value
 	}
 }
 
-// Closes the given queue.
-//
-// This operation signals that no more elements will be enqueued in the
-// given queue. Subsequent Enqueue(Many) operations will fail.
-// Subsequent Dequeue(Many) operations will continue to succeed if
-// sufficient elements remain in the queue. Subsequent Dequeue(Many)
-// operations that would block will fail immediately.
+// Greedily selects a subset of bounding boxes in descending order of score,
 //
-// Arguments:
-//	handle: The handle to a queue.
+// This operation performs non_max_suppression on the inputs per batch, across
+// all classes.
+// Prunes away boxes that have high intersection-over-union (IOU) overlap
+// with previously selected boxes.  Bounding boxes are supplied as
+// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+// diagonal pair of box corners and the coordinates can be provided as normalized
+// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+// is agnostic to where the origin is in the coordinate system. Also note that
+// this algorithm is invariant to orthogonal transformations and translations
+// of the coordinate system; thus translating or reflections of the coordinate
+// system result in the same boxes being selected by the algorithm.
+// The output of this operation is the final boxes, scores and classes tensor
+// returned after performing non_max_suppression.
+//
+// Arguments:
+//	boxes: A 4-D float tensor of shape `[batch_size, num_boxes, q, 4]`. If `q` is 1 then
+// same boxes are used for all classes otherwise, if `q` is equal to number of
+// classes, class-specific boxes are used.
+//	scores: A 3-D float tensor of shape `[batch_size, num_boxes, num_classes]`
+// representing a single score corresponding to each box (each row of boxes).
+//	max_output_size_per_class: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression per class
+//	max_total_size: A scalar representing maximum number of boxes retained over all classes.
+//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
+// boxes overlap too much with respect to IOU.
+//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
+// boxes based on score.
 //
-// Returns the created operation.
-func QueueCloseV2(scope *Scope, handle tf.Output, optional ...QueueCloseV2Attr) (o *tf.Operation) {
+// Returns A [batch_size, max_detections, 4] float32 tensor
+// containing the non-max suppressed boxes.A [batch_size, max_detections] float32 tensor
+// containing the scores for the boxes.A [batch_size, max_detections] float32 tensor
+// containing the classes for the boxes.A [batch_size] int32 tensor indicating the number of
+// valid detections per batch item. Only the top num_detections[i] entries in
+// nms_boxes[i], nms_scores[i] and nms_class[i] are valid. The rest of the
+// entries are zero paddings.
+func CombinedNonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size_per_class tf.Output, max_total_size tf.Output, iou_threshold tf.Output, score_threshold tf.Output, optional ...CombinedNonMaxSuppressionAttr) (nmsed_boxes tf.Output, nmsed_scores tf.Output, nmsed_classes tf.Output, valid_detections tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -32303,106 +37327,99 @@ func QueueCloseV2(scope *Scope, handle tf.Output, optional ...QueueCloseV2Attr)
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueCloseV2",
+		Type: "CombinedNonMaxSuppression",
 		Input: []tf.Input{
-			handle,
+			boxes, scores, max_output_size_per_class, max_total_size, iou_threshold, score_threshold,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// Computes inverse hyperbolic tangent of x element-wise.
-func Atanh(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Atanh",
-		Input: []tf.Input{
-			x,
-		},
-	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// Returns true if queue is closed.
+// Computes the matrix logarithm of one or more square matrices:
 //
-// This operation returns true if the queue is closed and false if the queue
-// is open.
+//
+// \\(log(exp(A)) = A\\)
+//
+// This op is only defined for complex matrices. If A is positive-definite and
+// real, then casting to a complex matrix, taking the logarithm and casting back
+// to a real matrix will give the correct result.
+//
+// This function computes the matrix logarithm using the Schur-Parlett algorithm.
+// Details of the algorithm can be found in Section 11.6.2 of:
+// Nicholas J. Higham, Functions of Matrices: Theory and Computation, SIAM 2008.
+// ISBN 978-0-898716-46-7.
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor of the same shape as the input
+// containing the exponential for all input submatrices `[..., :, :]`.
 //
 // Arguments:
-//	handle: The handle to a queue.
-func QueueIsClosedV2(scope *Scope, handle tf.Output) (is_closed tf.Output) {
+//	input: Shape is `[..., M, M]`.
+//
+// Returns Shape is `[..., M, M]`.
+//
+// @compatibility(scipy)
+// Equivalent to scipy.linalg.logm
+// @end_compatibility
+func MatrixLogarithm(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueIsClosedV2",
+		Type: "MatrixLogarithm",
 		Input: []tf.Input{
-			handle,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the absolute value of a tensor.
+//   This op is used as a placeholder in If branch functions. It doesn't provide a
+//   valid output when run, so must either be removed (e.g. replaced with a
+//   function input) or guaranteed not to be used (e.g. if mirroring an
+//   intermediate output needed for the gradient computation of the other branch).
 //
-// Given a tensor `x`, this operation returns a tensor containing the absolute
-// value of each element in `x`. For example, if x is an input element and y is
-// an output element, this operation computes \\(y = |x|\\).
-func Abs(scope *Scope, x tf.Output) (y tf.Output) {
+// Arguments:
+//	dtype: The type of the output.
+//	shape:     The purported shape of the output. This is only used for shape inference;
+//     the output will not necessarily have this shape. Can be a partial shape.
+//
+// Returns     \"Fake\" output value. This should not be consumed by another op.
+func FakeParam(scope *Scope, dtype tf.DataType, shape tf.Shape) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
 	opspec := tf.OpSpec{
-		Type: "Abs",
-		Input: []tf.Input{
-			x,
-		},
+		Type: "FakeParam",
+
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// StackV2Attr is an optional argument to StackV2.
-type StackV2Attr func(optionalAttr)
-
-// StackV2StackName sets the optional stack_name attribute to value.
+// Returns the next representable value of `x1` in the direction of `x2`, element-wise.
 //
-// value: Overrides the name used for the temporary stack resource. Default
-// value is the name of the 'Stack' op (which is guaranteed unique).
-// If not specified, defaults to ""
-func StackV2StackName(value string) StackV2Attr {
-	return func(m optionalAttr) {
-		m["stack_name"] = value
-	}
-}
-
-// A stack that produces elements in first-in last-out order.
+// This operation returns the same result as the C++ std::nextafter function.
 //
-// Arguments:
-//	max_size: The maximum size of the stack if non-negative. If negative, the stack
-// size is unlimited.
-//	elem_type: The type of the elements on the stack.
+// It can also return a subnormal number.
 //
-// Returns The handle to the stack.
-func StackV2(scope *Scope, max_size tf.Output, elem_type tf.DataType, optional ...StackV2Attr) (handle tf.Output) {
+// @compatibility(cpp)
+// Equivalent to C++ std::nextafter function.
+// @end_compatibility
+func NextAfter(scope *Scope, x1 tf.Output, x2 tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"elem_type": elem_type}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "StackV2",
+		Type: "NextAfter",
 		Input: []tf.Input{
-			max_size,
+			x1, x2,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
@@ -32485,6 +37502,45 @@ func OrderedMapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf
 	return scope.AddOperation(opspec)
 }
 
+// StackPushV2Attr is an optional argument to StackPushV2.
+type StackPushV2Attr func(optionalAttr)
+
+// StackPushV2SwapMemory sets the optional swap_memory attribute to value.
+//
+// value: Swap `elem` to CPU. Default to false.
+// If not specified, defaults to false
+func StackPushV2SwapMemory(value bool) StackPushV2Attr {
+	return func(m optionalAttr) {
+		m["swap_memory"] = value
+	}
+}
+
+// Push an element onto the stack.
+//
+// Arguments:
+//	handle: The handle to a stack.
+//	elem: The tensor to be pushed onto the stack.
+//
+// Returns The same tensor as the input 'elem'.
+func StackPushV2(scope *Scope, handle tf.Output, elem tf.Output, optional ...StackPushV2Attr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StackPushV2",
+		Input: []tf.Input{
+			handle, elem,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // RpcAttr is an optional argument to Rpc.
 type RpcAttr func(optionalAttr)
 
@@ -32604,38 +37660,41 @@ func Rpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, o
 	return op.Output(0)
 }
 
-// StackPushV2Attr is an optional argument to StackPushV2.
-type StackPushV2Attr func(optionalAttr)
-
-// StackPushV2SwapMemory sets the optional swap_memory attribute to value.
-//
-// value: Swap `elem` to CPU. Default to false.
-// If not specified, defaults to false
-func StackPushV2SwapMemory(value bool) StackPushV2Attr {
-	return func(m optionalAttr) {
-		m["swap_memory"] = value
+// Records the bytes size of each element of `input_dataset` in a StatsAggregator.
+func ExperimentalBytesProducedStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalBytesProducedStatsDataset",
+		Input: []tf.Input{
+			input_dataset, tag,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Push an element onto the stack.
+// A substitute for `InterleaveDataset` on a fixed list of `N` datasets.
 //
 // Arguments:
-//	handle: The handle to a stack.
-//	elem: The tensor to be pushed onto the stack.
+//	selector_input_dataset: A dataset of scalar `DT_INT64` elements that determines which of the
+// `N` data inputs should produce the next output element.
+//	data_input_datasets: `N` datasets with the same type that will be interleaved according to
+// the values of `selector_input_dataset`.
 //
-// Returns The same tensor as the input 'elem'.
-func StackPushV2(scope *Scope, handle tf.Output, elem tf.Output, optional ...StackPushV2Attr) (output tf.Output) {
+//
+func ExperimentalDirectedInterleaveDataset(scope *Scope, selector_input_dataset tf.Output, data_input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "StackPushV2",
+		Type: "ExperimentalDirectedInterleaveDataset",
 		Input: []tf.Input{
-			handle, elem,
+			selector_input_dataset, tf.OutputList(data_input_datasets),
 		},
 		Attrs: attrs,
 	}
@@ -32643,63 +37702,48 @@ func StackPushV2(scope *Scope, handle tf.Output, elem tf.Output, optional ...Sta
 	return op.Output(0)
 }
 
-// FusedBatchNormGradV2Attr is an optional argument to FusedBatchNormGradV2.
-type FusedBatchNormGradV2Attr func(optionalAttr)
+// RandomUniformIntAttr is an optional argument to RandomUniformInt.
+type RandomUniformIntAttr func(optionalAttr)
 
-// FusedBatchNormGradV2Epsilon sets the optional epsilon attribute to value.
+// RandomUniformIntSeed sets the optional seed attribute to value.
 //
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormGradV2Epsilon(value float32) FusedBatchNormGradV2Attr {
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomUniformIntSeed(value int64) RandomUniformIntAttr {
 	return func(m optionalAttr) {
-		m["epsilon"] = value
+		m["seed"] = value
 	}
 }
 
-// FusedBatchNormGradV2DataFormat sets the optional data_format attribute to value.
+// RandomUniformIntSeed2 sets the optional seed2 attribute to value.
 //
-// value: The data format for y_backprop, x, x_backprop.
-// Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormGradV2DataFormat(value string) FusedBatchNormGradV2Attr {
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomUniformIntSeed2(value int64) RandomUniformIntAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["seed2"] = value
 	}
 }
 
-// FusedBatchNormGradV2IsTraining sets the optional is_training attribute to value.
+// Outputs random integers from a uniform distribution.
 //
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormGradV2IsTraining(value bool) FusedBatchNormGradV2Attr {
-	return func(m optionalAttr) {
-		m["is_training"] = value
-	}
-}
-
-// Gradient for batch normalization.
+// The generated values are uniform integers in the range `[minval, maxval)`.
+// The lower bound `minval` is included in the range, while the upper bound
+// `maxval` is excluded.
 //
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+// The random integers are slightly biased unless `maxval - minval` is an exact
+// power of two.  The bias is small for values of `maxval - minval` significantly
+// smaller than the range of the output (either `2^32` or `2^64`).
 //
 // Arguments:
-//	y_backprop: A 4D Tensor for the gradient with respect to y.
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
-// mean to be reused in gradient computation. When is_training is
-// False, a 1D Tensor for the population mean to be reused in both
-// 1st and 2nd order gradient computation.
-//	reserve_space_2: When is_training is True, a 1D Tensor for the computed batch
-// variance (inverted variance in the cuDNN case) to be reused in
-// gradient computation. When is_training is False, a 1D Tensor
-// for the population variance to be reused in both 1st and 2nd
-// order gradient computation.
+//	shape: The shape of the output tensor.
+//	minval: 0-D.  Inclusive lower bound on the generated integers.
+//	maxval: 0-D.  Exclusive upper bound on the generated integers.
 //
-// Returns A 4D Tensor for the gradient with respect to x.A 1D Tensor for the gradient with respect to scale.A 1D Tensor for the gradient with respect to offset.Unused placeholder to match the mean input in FusedBatchNorm.Unused placeholder to match the variance input
-// in FusedBatchNorm.
-func FusedBatchNormGradV2(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradV2Attr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) {
+// Returns A tensor of the specified shape filled with uniform random integers.
+func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf.Output, optional ...RandomUniformIntAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -32708,336 +37752,362 @@ func FusedBatchNormGradV2(scope *Scope, y_backprop tf.Output, x tf.Output, scale
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FusedBatchNormGradV2",
+		Type: "RandomUniformInt",
 		Input: []tf.Input{
-			y_backprop, x, scale, reserve_space_1, reserve_space_2,
+			shape, minval, maxval,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return op.Output(0)
 }
 
-// Creates a TensorArray for storing multiple gradients of values in the given handle.
+// Add the quantile summaries to each quantile stream resource.
 //
-// Similar to TensorArrayGradV3. However it creates an accumulator with an
-// expanded shape compared to the input TensorArray whose gradient is being
-// computed. This enables multiple gradients for the same TensorArray to be
-// calculated using the same accumulator.
+// An op that adds a list of quantile summaries to a quantile stream resource. Each
+// summary Tensor is rank 2, containing summaries (value, weight, min_rank, max_rank)
+// for a single feature.
 //
 // Arguments:
-//	handle: The handle to the forward TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	shape_to_prepend: An int32 vector representing a shape. Elements in the gradient accumulator will
-// have shape which is this shape_to_prepend value concatenated with shape of the
-// elements in the TensorArray corresponding to the input handle.
-//	source: The gradient source string, used to decide which gradient TensorArray
-// to return.
-func TensorArrayGradWithShape(scope *Scope, handle tf.Output, flow_in tf.Output, shape_to_prepend tf.Output, source string) (grad_handle tf.Output, flow_out tf.Output) {
+//	quantile_stream_resource_handle: resource handle referring to a QuantileStreamResource.
+//	summaries: string; List of Rank 2 Tensor each containing the summaries for a single feature.
+//
+// Returns the created operation.
+func BoostedTreesQuantileStreamResourceAddSummaries(scope *Scope, quantile_stream_resource_handle tf.Output, summaries []tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"source": source}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayGradWithShape",
+		Type: "BoostedTreesQuantileStreamResourceAddSummaries",
 		Input: []tf.Input{
-			handle, flow_in, shape_to_prepend,
+			quantile_stream_resource_handle, tf.OutputList(summaries),
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return scope.AddOperation(opspec)
 }
 
-// Compare values of `input` to `threshold` and pack resulting bits into a `uint8`.
-//
-// Each comparison returns a boolean `true` (if `input_value > threshold`)
-// or and `false` otherwise.
-//
-// This operation is useful for Locality-Sensitive-Hashing (LSH) and other
-// algorithms that use hashing approximations of cosine and `L2` distances;
-// codes can be generated from an input via:
-//
-// ```python
-// codebook_size = 50
-// codebook_bits = codebook_size * 32
-// codebook = tf.get_variable('codebook', [x.shape[-1].value, codebook_bits],
-//                            dtype=x.dtype,
-//                            initializer=tf.orthogonal_initializer())
-// codes = compare_and_threshold(tf.matmul(x, codebook), threshold=0.)
-// codes = tf.bitcast(codes, tf.int32)  # go from uint8 to int32
-// # now codes has shape x.shape[:-1] + [codebook_size]
-// ```
-//
-// **NOTE**: Currently, the innermost dimension of the tensor must be divisible
-// by 8.
-//
-// Given an `input` shaped `[s0, s1, ..., s_n]`, the output is
-// a `uint8` tensor shaped `[s0, s1, ..., s_n / 8]`.
+// Creates a Dataset that returns pseudorandom numbers.
 //
 // Arguments:
-//	input: Values to compare against `threshold` and bitpack.
-//	threshold: Threshold to compare against.
+//	seed: A scalar seed for the random number generator. If either seed or
+// seed2 is set to be non-zero, the random number generator is seeded
+// by the given seed.  Otherwise, a random seed is used.
+//	seed2: A second scalar seed to avoid seed collision.
 //
-// Returns The bitpacked comparisons.
-func CompareAndBitpack(scope *Scope, input tf.Output, threshold tf.Output) (output tf.Output) {
+//
+func ExperimentalRandomDataset(scope *Scope, seed tf.Output, seed2 tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "CompareAndBitpack",
+		Type: "ExperimentalRandomDataset",
 		Input: []tf.Input{
-			input, threshold,
+			seed, seed2,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Push an element onto the tensor_array.
+// Creates a dataset that overrides the maximum intra-op parallelism.
 //
 // Arguments:
-//	handle: The handle to a TensorArray.
-//	index: The position to write to inside the TensorArray.
-//	value: The tensor to write to the TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
 //
-// Returns A float scalar that enforces proper chaining of operations.
-func TensorArrayWriteV3(scope *Scope, handle tf.Output, index tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+//	max_intra_op_parallelism: Identifies the maximum intra-op parallelism to use.
+//
+//
+func ExperimentalMaxIntraOpParallelismDataset(scope *Scope, input_dataset tf.Output, max_intra_op_parallelism tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayWriteV3",
+		Type: "ExperimentalMaxIntraOpParallelismDataset",
 		Input: []tf.Input{
-			handle, index, value, flow_in,
+			input_dataset, max_intra_op_parallelism,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Scatter the data from the input value into specific TensorArray elements.
+// StringSplitV2Attr is an optional argument to StringSplitV2.
+type StringSplitV2Attr func(optionalAttr)
+
+// StringSplitV2Maxsplit sets the optional maxsplit attribute to value.
 //
-// `indices` must be a vector, its length must match the first dim of `value`.
+// value: An `int`. If `maxsplit > 0`, limit of the split of the result.
+// If not specified, defaults to -1
+func StringSplitV2Maxsplit(value int64) StringSplitV2Attr {
+	return func(m optionalAttr) {
+		m["maxsplit"] = value
+	}
+}
+
+// Split elements of `source` based on `sep` into a `SparseTensor`.
 //
-// Arguments:
-//	handle: The handle to a TensorArray.
-//	indices: The locations at which to write the tensor elements.
-//	value: The concatenated tensor to write to the TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
+// Let N be the size of source (typically N will be the batch size). Split each
+// element of `source` based on `sep` and return a `SparseTensor`
+// containing the split tokens. Empty tokens are ignored.
 //
-// Returns A float scalar that enforces proper chaining of operations.
-func TensorArrayScatterV3(scope *Scope, handle tf.Output, indices tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+// For example, N = 2, source[0] is 'hello world' and source[1] is 'a b c',
+// then the output will be
+// ```
+// st.indices = [0, 0;
+//               0, 1;
+//               1, 0;
+//               1, 1;
+//               1, 2]
+// st.shape = [2, 3]
+// st.values = ['hello', 'world', 'a', 'b', 'c']
+// ```
+//
+// If `sep` is given, consecutive delimiters are not grouped together and are
+// deemed to delimit empty strings. For example, source of `"1<>2<><>3"` and
+// sep of `"<>"` returns `["1", "2", "", "3"]`. If `sep` is None or an empty
+// string, consecutive whitespace are regarded as a single separator, and the
+// result will contain no empty strings at the startor end if the string has
+// leading or trailing whitespace.
+//
+// Note that the above mentioned behavior matches python's str.split.
+//
+// Arguments:
+//	input: `1-D` string `Tensor`, the strings to split.
+//	sep: `0-D` string `Tensor`, the delimiter character.
+func StringSplitV2(scope *Scope, input tf.Output, sep tf.Output, optional ...StringSplitV2Attr) (indices tf.Output, values tf.Output, shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayScatterV3",
+		Type: "StringSplitV2",
 		Input: []tf.Input{
-			handle, indices, value, flow_in,
+			input, sep,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// EmptyAttr is an optional argument to Empty.
-type EmptyAttr func(optionalAttr)
-
-// EmptyInit sets the optional init attribute to value.
-//
-// value: If True, initialize the returned tensor with the default value of dtype.  Otherwise, the implementation is free not to initializethe tensor's content.
-// If not specified, defaults to false
-func EmptyInit(value bool) EmptyAttr {
-	return func(m optionalAttr) {
-		m["init"] = value
-	}
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Creates a tensor with the given shape.
-//
-// This operation creates a tensor of `shape` and `dtype`.
+// Creates a dataset that uses a custom thread pool to compute `input_dataset`.
 //
 // Arguments:
-//	shape: 1-D. Represents the shape of the output tensor.
+//
+//	thread_pool: A resource produced by the ThreadPoolHandle op.
 //
 //
-// Returns A `Tensor` of type `T`.
-func Empty(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...EmptyAttr) (output tf.Output) {
+func ExperimentalThreadPoolDataset(scope *Scope, input_dataset tf.Output, thread_pool tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalThreadPoolDataset",
+		Input: []tf.Input{
+			input_dataset, thread_pool,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes softsign: `features / (abs(features) + 1)`.
+func Softsign(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Empty",
+		Type: "Softsign",
 		Input: []tf.Input{
-			shape,
+			features,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TensorArrayConcatV3Attr is an optional argument to TensorArrayConcatV3.
-type TensorArrayConcatV3Attr func(optionalAttr)
+// EncodeProtoAttr is an optional argument to EncodeProto.
+type EncodeProtoAttr func(optionalAttr)
 
-// TensorArrayConcatV3ElementShapeExcept0 sets the optional element_shape_except0 attribute to value.
-//
-// value: The expected shape of an element, if known,
-// excluding the first dimension. Used to validate the shapes of
-// TensorArray elements. If this shape is not fully specified, concatenating
-// zero-size TensorArrays is an error.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayConcatV3ElementShapeExcept0(value tf.Shape) TensorArrayConcatV3Attr {
+// EncodeProtoDescriptorSource sets the optional descriptor_source attribute to value.
+// If not specified, defaults to "local://"
+func EncodeProtoDescriptorSource(value string) EncodeProtoAttr {
 	return func(m optionalAttr) {
-		m["element_shape_except0"] = value
+		m["descriptor_source"] = value
 	}
 }
 
-// Concat the elements from the TensorArray into value `value`.
+// The op serializes protobuf messages provided in the input tensors.
 //
-// Takes `T` elements of shapes
+// The types of the tensors in `values` must match the schema for the
+// fields specified in `field_names`. All the tensors in `values` must
+// have a common shape prefix, *batch_shape*.
 //
-//   ```
-//   (n0 x d0 x d1 x ...), (n1 x d0 x d1 x ...), ..., (n(T-1) x d0 x d1 x ...)
-//   ```
+// The `sizes` tensor specifies repeat counts for each field.  The repeat
+// count (last dimension) of a each tensor in `values` must be greater
+// than or equal to corresponding repeat count in `sizes`.
 //
-// and concatenates them into a Tensor of shape:
+// A `message_type` name must be provided to give context for the field
+// names. The actual message descriptor can be looked up either in the
+// linked-in descriptor pool or a filename provided by the caller using
+// the `descriptor_source` attribute.
 //
-//   ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```
+// The `descriptor_source` attribute selects a source of protocol
+// descriptors to consult when looking up `message_type`. This may be a
+// filename containing a serialized `FileDescriptorSet` message,
+// or the special value `local://`, in which case only descriptors linked
+// into the code will be searched; the filename can be on any filesystem
+// accessible to TensorFlow.
 //
-// All elements must have the same shape (excepting the first dimension).
+// You can build a `descriptor_source` file using the `--descriptor_set_out`
+// and `--include_imports` options to the protocol compiler `protoc`.
+//
+// The `local://` database only covers descriptors linked into the
+// code via C++ libraries, not Python imports. You can link in a proto descriptor
+// by creating a cc_library target with alwayslink=1.
+//
+// There are a few special cases in the value mapping:
+//
+// Submessage and group fields must be pre-serialized as TensorFlow strings.
+//
+// TensorFlow lacks support for unsigned int64s, so they must be
+// represented as `tf.int64` with the same twos-complement bit pattern
+// (the obvious way).
+//
+// Unsigned int32 values can be represented exactly with `tf.int64`, or
+// with sign wrapping if the input is of type `tf.int32`.
 //
 // Arguments:
-//	handle: The handle to a TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	dtype: The type of the elem that is returned.
+//	sizes: Tensor of int32 with shape `[batch_shape, len(field_names)]`.
+//	values: List of tensors containing values for the corresponding field.
+//	field_names: List of strings containing proto field names.
+//	message_type: Name of the proto message type to decode.
 //
-// Returns All of the elements in the TensorArray, concatenated along the first
-// axis.A vector of the row sizes of the original T elements in the
-// value output.  In the example above, this would be the values:
-// `(n1, n2, ..., n(T-1))`.
-func TensorArrayConcatV3(scope *Scope, handle tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayConcatV3Attr) (value tf.Output, lengths tf.Output) {
+// Returns Tensor of serialized protos with shape `batch_shape`.
+func EncodeProto(scope *Scope, sizes tf.Output, values []tf.Output, field_names []string, message_type string, optional ...EncodeProtoAttr) (bytes tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"field_names": field_names, "message_type": message_type}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayConcatV3",
+		Type: "EncodeProto",
 		Input: []tf.Input{
-			handle, flow_in,
+			sizes, tf.OutputList(values),
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Split the data from the input value into TensorArray elements.
-//
-// Assuming that `lengths` takes on values
-//
-//   ```(n0, n1, ..., n(T-1))```
-//
-// and that `value` has shape
-//
-//   ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```,
-//
-// this splits values into a TensorArray with T tensors.
-//
-// TensorArray index t will be the subtensor of values with starting position
-//
-//   ```(n0 + n1 + ... + n(t-1), 0, 0, ...)```
-//
-// and having size
-//
-//   ```nt x d0 x d1 x ...```
-//
-// Arguments:
-//	handle: The handle to a TensorArray.
-//	value: The concatenated tensor to write to the TensorArray.
-//	lengths: The vector of lengths, how to split the rows of value into the
-// TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//
-// Returns A float scalar that enforces proper chaining of operations.
-func TensorArraySplitV3(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+// Creates a dataset that splits a SparseTensor into elements row-wise.
+func SparseTensorSliceDataset(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArraySplitV3",
+		Type: "SparseTensorSliceDataset",
 		Input: []tf.Input{
-			handle, value, lengths, flow_in,
+			indices, values, dense_shape,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes gradients for the scaled exponential linear (Selu) operation.
+// Returns x / y element-wise for real types.
 //
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding Selu operation.
-//	outputs: The outputs of the corresponding Selu operation.
+// If `x` and `y` are reals, this will return the floating-point division.
 //
-// Returns The gradients: `gradients * (outputs + scale * alpha)`
-// if outputs < 0, `scale * gradients` otherwise.
-func SeluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
+// *NOTE*: `Div` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func RealDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SeluGrad",
+		Type: "RealDiv",
 		Input: []tf.Input{
-			gradients, outputs,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Get the current size of the TensorArray.
-//
-// Arguments:
-//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//
-// Returns The current size of the TensorArray.
-func TensorArraySizeV3(scope *Scope, handle tf.Output, flow_in tf.Output) (size tf.Output) {
+// Creates a dataset that concatenates `input_dataset` with `another_dataset`.
+func ConcatenateDataset(scope *Scope, input_dataset tf.Output, another_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "TensorArraySizeV3",
+		Type: "ConcatenateDataset",
 		Input: []tf.Input{
-			handle, flow_in,
+			input_dataset, another_dataset,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Deprecated. Use TensorArrayGradV3
+// Computes the grayscale dilation of 4-D `input` and 3-D `filter` tensors.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArrayGradV3
-func TensorArrayGradV2(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output) {
+// The `input` tensor has shape `[batch, in_height, in_width, depth]` and the
+// `filter` tensor has shape `[filter_height, filter_width, depth]`, i.e., each
+// input channel is processed independently of the others with its own structuring
+// function. The `output` tensor has shape
+// `[batch, out_height, out_width, depth]`. The spatial dimensions of the output
+// tensor depend on the `padding` algorithm. We currently only support the default
+// "NHWC" `data_format`.
+//
+// In detail, the grayscale morphological 2-D dilation is the max-sum correlation
+// (for consistency with `conv2d`, we use unmirrored filters):
+//
+//     output[b, y, x, c] =
+//        max_{dy, dx} input[b,
+//                           strides[1] * y + rates[1] * dy,
+//                           strides[2] * x + rates[2] * dx,
+//                           c] +
+//                     filter[dy, dx, c]
+//
+// Max-pooling is a special case when the filter has size equal to the pooling
+// kernel size and contains all zeros.
+//
+// Note on duality: The dilation of `input` by the `filter` is equal to the
+// negation of the erosion of `-input` by the reflected `filter`.
+//
+// Arguments:
+//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
+//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor. Must be: `[1, stride_height, stride_width, 1]`.
+//	rates: The input stride for atrous morphological dilation. Must be:
+// `[1, rate_height, rate_width, 1]`.
+//	padding: The type of padding algorithm to use.
+//
+// Returns 4-D with shape `[batch, out_height, out_width, depth]`.
+func Dilation2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, rates []int64, padding string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"source": source}
+	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayGradV2",
+		Type: "Dilation2D",
 		Input: []tf.Input{
-			handle, flow_in,
+			input, filter,
 		},
 		Attrs: attrs,
 	}
@@ -33045,54 +38115,53 @@ func TensorArrayGradV2(scope *Scope, handle tf.Output, flow_in tf.Output, source
 	return op.Output(0)
 }
 
-// SparseReduceMaxAttr is an optional argument to SparseReduceMax.
-type SparseReduceMaxAttr func(optionalAttr)
-
-// SparseReduceMaxKeepDims sets the optional keep_dims attribute to value.
+// Converts the given variant tensor to an iterator and stores it in the given resource.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SparseReduceMaxKeepDims(value bool) SparseReduceMaxAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
+// Arguments:
+//	resource_handle: A handle to an iterator resource.
+//	serialized: A variant tensor storing the state of the iterator contained in the
+// resource.
+//
+// Returns the created operation.
+func DeserializeIterator(scope *Scope, resource_handle tf.Output, serialized tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "DeserializeIterator",
+		Input: []tf.Input{
+			resource_handle, serialized,
+		},
 	}
+	return scope.AddOperation(opspec)
 }
 
-// Computes the max of elements across dimensions of a SparseTensor.
+// Creates a dataset that shuffles and repeats elements from `input_dataset`
 //
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_max()`.  In particular, this Op also returns a dense `Tensor`
-// instead of a sparse one.
+// pseudorandomly.
 //
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
+// Arguments:
 //
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
+//	buffer_size: The number of output elements to buffer in an iterator over
+// this dataset. Compare with the `min_after_dequeue` attr when creating a
+// `RandomShuffleQueue`.
+//	seed: A scalar seed for the random number generator. If either `seed` or
+// `seed2` is set to be non-zero, the random number generator is seeded
+// by the given seed.  Otherwise, a random seed is used.
+//	seed2: A second scalar seed to avoid seed collision.
+//	count: A scalar representing the number of times the underlying dataset
+// should be repeated. The default is `-1`, which results in infinite repetition.
 //
-// Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
 //
-// Returns `R-K`-D.  The reduced Tensor.
-func SparseReduceMax(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxAttr) (output tf.Output) {
+func ShuffleAndRepeatDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, seed tf.Output, seed2 tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "SparseReduceMax",
+		Type: "ShuffleAndRepeatDataset",
 		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
+			input_dataset, buffer_size, seed, seed2, count,
 		},
 		Attrs: attrs,
 	}
@@ -33100,79 +38169,28 @@ func SparseReduceMax(scope *Scope, input_indices tf.Output, input_values tf.Outp
 	return op.Output(0)
 }
 
-// AsStringAttr is an optional argument to AsString.
-type AsStringAttr func(optionalAttr)
-
-// AsStringPrecision sets the optional precision attribute to value.
-//
-// value: The post-decimal precision to use for floating point numbers.
-// Only used if precision > -1.
-// If not specified, defaults to -1
-func AsStringPrecision(value int64) AsStringAttr {
-	return func(m optionalAttr) {
-		m["precision"] = value
-	}
-}
-
-// AsStringScientific sets the optional scientific attribute to value.
+// Creates a dataset that caches elements from `input_dataset`.
 //
-// value: Use scientific notation for floating point numbers.
-// If not specified, defaults to false
-func AsStringScientific(value bool) AsStringAttr {
-	return func(m optionalAttr) {
-		m["scientific"] = value
-	}
-}
-
-// AsStringShortest sets the optional shortest attribute to value.
+// A CacheDataset will iterate over the input_dataset, and store tensors. If the
+// cache already exists, the cache will be used. If the cache is inappropriate
+// (e.g. cannot be opened, contains tensors of the wrong shape / size), an error
+// will the returned when used.
 //
-// value: Use shortest representation (either scientific or standard) for
-// floating point numbers.
-// If not specified, defaults to false
-func AsStringShortest(value bool) AsStringAttr {
-	return func(m optionalAttr) {
-		m["shortest"] = value
-	}
-}
-
-// AsStringWidth sets the optional width attribute to value.
+// Arguments:
 //
-// value: Pad pre-decimal numbers to this width.
-// Applies to both floating point and integer numbers.
-// Only used if width > -1.
-// If not specified, defaults to -1
-func AsStringWidth(value int64) AsStringAttr {
-	return func(m optionalAttr) {
-		m["width"] = value
-	}
-}
-
-// AsStringFill sets the optional fill attribute to value.
+//	filename: A path on the filesystem where we should cache the dataset. Note: this
+// will be a directory.
 //
-// value: The value to pad if width > -1.  If empty, pads with spaces.
-// Another typical value is '0'.  String cannot be longer than 1 character.
-// If not specified, defaults to ""
-func AsStringFill(value string) AsStringAttr {
-	return func(m optionalAttr) {
-		m["fill"] = value
-	}
-}
-
-// Converts each entry in the given tensor to strings.  Supports many numeric
 //
-// types and boolean.
-func AsString(scope *Scope, input tf.Output, optional ...AsStringAttr) (output tf.Output) {
+func CacheDataset(scope *Scope, input_dataset tf.Output, filename tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "AsString",
+		Type: "CacheDataset",
 		Input: []tf.Input{
-			input,
+			input_dataset, filename,
 		},
 		Attrs: attrs,
 	}
@@ -33180,340 +38198,307 @@ func AsString(scope *Scope, input tf.Output, optional ...AsStringAttr) (output t
 	return op.Output(0)
 }
 
-// Deprecated. Use TensorArrayScatterV3
+// Creates a dataset that emits the records from one or more binary files.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArrayScatterV3
-func TensorArrayScatterV2(scope *Scope, handle tf.Output, indices tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+// Arguments:
+//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
+// read.
+//	header_bytes: A scalar representing the number of bytes to skip at the
+// beginning of a file.
+//	record_bytes: A scalar representing the number of bytes in each record.
+//	footer_bytes: A scalar representing the number of bytes to skip at the end
+// of a file.
+//	buffer_size: A scalar representing the number of bytes to buffer. Must be > 0.
+func FixedLengthRecordDataset(scope *Scope, filenames tf.Output, header_bytes tf.Output, record_bytes tf.Output, footer_bytes tf.Output, buffer_size tf.Output) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayScatterV2",
+		Type: "FixedLengthRecordDataset",
 		Input: []tf.Input{
-			handle, indices, value, flow_in,
+			filenames, header_bytes, record_bytes, footer_bytes, buffer_size,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Applies sparse addition to `input` using individual values or slices
-//
-// from `updates` according to indices `indices`.  The updates are non-aliasing:
-// `input` is only modified in-place if no other operations will use it.
-// Otherwise, a copy of `input` is made.  This operation has a gradient with
-// respect to both `input` and `updates`.
-//
-// `input` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
-//
-// `indices` must be integer tensor, containing indices into `input`.
-// It must be shape \\([d_0, ..., d_{Q-2}, K]\\) where `0 < K <= P`.
-//
-// The innermost dimension of `indices` (with length `K`) corresponds to
-// indices into elements (if `K = P`) or `(P-K)`-dimensional slices
-// (if `K < P`) along the `K`th dimension of `input`.
-//
-// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
-//
-// $$[d_0, ..., d_{Q-2}, input.shape[K], ..., input.shape[P-1]].$$
-//
-// For example, say we want to add 4 scattered elements to a rank-1 tensor to 8
-// elements. In Python, that addition would look like this:
+// Gradients for batch normalization.
 //
-//     input = tf.constant([1, 2, 3, 4, 5, 6, 7, 8])
-//     indices = tf.constant([[4], [3], [1], [7]])
-//     updates = tf.constant([9, 10, 11, 12])
-//     output = tf.scatter_nd_non_aliasing_add(input, indices, updates)
-//     with tf.Session() as sess:
-//       print(sess.run(output))
+// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
 //
-// The resulting value `output` would look like this:
+// This op is deprecated. See `tf.nn.batch_normalization`.
 //
-//     [1, 13, 3, 14, 14, 6, 7, 20]
+// Arguments:
+//	t: A 4D input Tensor.
+//	m: A 1D mean Tensor with size matching the last dimension of t.
+// This is the first output from tf.nn.moments,
+// or a saved moving average thereof.
+//	v: A 1D variance Tensor with size matching the last dimension of t.
+// This is the second output from tf.nn.moments,
+// or a saved moving average thereof.
+//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
+// If "scale_after_normalization" is true, this Tensor will be multiplied
+// with the normalized Tensor.
+//	backprop: 4D backprop Tensor.
+//	variance_epsilon: A small float number to avoid dividing by 0.
+//	scale_after_normalization: A bool indicating whether the resulted tensor
+// needs to be multiplied with gamma.
 //
-// See `tf.scatter_nd` for more details about how to make updates to slices.
+// Returns 4D backprop tensor for input.1D backprop tensor for mean.1D backprop tensor for variance.1D backprop tensor for beta.1D backprop tensor for gamma.
+func BatchNormWithGlobalNormalizationGrad(scope *Scope, t tf.Output, m tf.Output, v tf.Output, gamma tf.Output, backprop tf.Output, variance_epsilon float32, scale_after_normalization bool) (dx tf.Output, dm tf.Output, dv tf.Output, db tf.Output, dg tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
+	opspec := tf.OpSpec{
+		Type: "BatchNormWithGlobalNormalizationGrad",
+		Input: []tf.Input{
+			t, m, v, gamma, backprop,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+}
+
+// Creates a dataset that emits the records from one or more TFRecord files.
 //
 // Arguments:
-//	input: A Tensor.
-//	indices: A Tensor. Must be one of the following types: `int32`, `int64`.
-// A tensor of indices into `input`.
-//	updates: A Tensor. Must have the same type as ref. A tensor of updated values
-// to add to `input`.
-//
-// Returns A `Tensor` with the same shape as `input`, containing values of `input`
-// updated with `updates`.
-func ScatterNdNonAliasingAdd(scope *Scope, input tf.Output, indices tf.Output, updates tf.Output) (output tf.Output) {
+//	filenames: A scalar or vector containing the name(s) of the file(s) to be
+// read.
+//	compression_type: A scalar containing either (i) the empty string (no
+// compression), (ii) "ZLIB", or (iii) "GZIP".
+//	buffer_size: A scalar representing the number of bytes to buffer. A value of
+// 0 means no buffering will be performed.
+func TFRecordDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ScatterNdNonAliasingAdd",
+		Type: "TFRecordDataset",
 		Input: []tf.Input{
-			input, indices, updates,
+			filenames, compression_type, buffer_size,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// FractionalMaxPoolAttr is an optional argument to FractionalMaxPool.
-type FractionalMaxPoolAttr func(optionalAttr)
+// ExperimentalStatsAggregatorHandleAttr is an optional argument to ExperimentalStatsAggregatorHandle.
+type ExperimentalStatsAggregatorHandleAttr func(optionalAttr)
 
-// FractionalMaxPoolPseudoRandom sets the optional pseudo_random attribute to value.
-//
-// value: When set to True, generates the pooling sequence in a
-// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
-// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
-// difference between pseudorandom and random.
-// If not specified, defaults to false
-func FractionalMaxPoolPseudoRandom(value bool) FractionalMaxPoolAttr {
+// ExperimentalStatsAggregatorHandleContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func ExperimentalStatsAggregatorHandleContainer(value string) ExperimentalStatsAggregatorHandleAttr {
 	return func(m optionalAttr) {
-		m["pseudo_random"] = value
+		m["container"] = value
 	}
 }
 
-// FractionalMaxPoolOverlapping sets the optional overlapping attribute to value.
-//
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
-//
-// `index  0  1  2  3  4`
-//
-// `value  20 5  16 3  7`
-//
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [20, 16] for fractional max pooling.
-// If not specified, defaults to false
-func FractionalMaxPoolOverlapping(value bool) FractionalMaxPoolAttr {
+// ExperimentalStatsAggregatorHandleSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func ExperimentalStatsAggregatorHandleSharedName(value string) ExperimentalStatsAggregatorHandleAttr {
 	return func(m optionalAttr) {
-		m["overlapping"] = value
+		m["shared_name"] = value
 	}
 }
 
-// FractionalMaxPoolDeterministic sets the optional deterministic attribute to value.
-//
-// value: When set to True, a fixed pooling region will be used when
-// iterating over a FractionalMaxPool node in the computation graph. Mainly used
-// in unit test to make FractionalMaxPool deterministic.
-// If not specified, defaults to false
-func FractionalMaxPoolDeterministic(value bool) FractionalMaxPoolAttr {
-	return func(m optionalAttr) {
-		m["deterministic"] = value
+// Creates a statistics manager resource.
+func ExperimentalStatsAggregatorHandle(scope *Scope, optional ...ExperimentalStatsAggregatorHandleAttr) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalStatsAggregatorHandle",
+
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FractionalMaxPoolSeed sets the optional seed attribute to value.
+// A container for an iterator resource.
 //
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func FractionalMaxPoolSeed(value int64) FractionalMaxPoolAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
+// Returns A handle to the iterator that can be passed to a "MakeIterator" or
+// "IteratorGetNext" op. In contrast to Iterator, AnonymousIterator prevents
+// resource sharing by name, and does not keep a reference to the resource
+// container.
+func AnonymousIterator(scope *Scope, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "AnonymousIterator",
 
-// FractionalMaxPoolSeed2 sets the optional seed2 attribute to value.
-//
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func FractionalMaxPoolSeed2(value int64) FractionalMaxPoolAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Performs fractional max pooling on the input.
-//
-// Fractional max pooling is slightly different than regular max pooling.  In
-// regular max pooling, you downsize an input set by taking the maximum value of
-// smaller N x N subsections of the set (often 2x2), and try to reduce the set by
-// a factor of N, where N is an integer.  Fractional max pooling, as you might
-// expect from the word "fractional", means that the overall reduction ratio N
-// does not have to be an integer.
-//
-// The sizes of the pooling regions are generated randomly but are fairly uniform.
-// For example, let's look at the height dimension, and the constraints on the
-// list of rows that will be pool boundaries.
-//
-// First we define the following:
-//
-// 1.  input_row_length : the number of rows from the input set
-// 2.  output_row_length : which will be smaller than the input
-// 3.  alpha = input_row_length / output_row_length : our reduction ratio
-// 4.  K = floor(alpha)
-// 5.  row_pooling_sequence : this is the result list of pool boundary rows
+// Adjust the contrast of one or more images.
 //
-// Then, row_pooling_sequence should satisfy:
+// `images` is a tensor of at least 3 dimensions.  The last 3 dimensions are
+// interpreted as `[height, width, channels]`.  The other dimensions only
+// represent a collection of images, such as `[batch, height, width, channels].`
 //
-// 1.  a[0] = 0 : the first value of the sequence is 0
-// 2.  a[end] = input_row_length : the last value of the sequence is the size
-// 3.  K <= (a[i+1] - a[i]) <= K+1 : all intervals are K or K+1 size
-// 4.  length(row_pooling_sequence) = output_row_length+1
+// Contrast is adjusted independently for each channel of each image.
 //
-// For more details on fractional max pooling, see this paper:
-// [Benjamin Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071)
+// For each channel, the Op first computes the mean of the image pixels in the
+// channel and then adjusts each component of each pixel to
+// `(x - mean) * contrast_factor + mean`.
 //
 // Arguments:
-//	value: 4-D with shape `[batch, height, width, channels]`.
-//	pooling_ratio: Pooling ratio for each dimension of `value`, currently only
-// supports row and col dimension and should be >= 1.0. For example, a valid
-// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
-// must be 1.0 because we don't allow pooling on batch and channels
-// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
-// respectively.
+//	images: Images to adjust.  At least 3-D.
+//	contrast_factor: A float multiplier for adjusting contrast.
 //
-// Returns output tensor after fractional max pooling.row pooling sequence, needed to calculate gradient.column pooling sequence, needed to calculate gradient.
-func FractionalMaxPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalMaxPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) {
+// Returns The contrast-adjusted image or images.
+func AdjustContrastv2(scope *Scope, images tf.Output, contrast_factor tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"pooling_ratio": pooling_ratio}
-	for _, a := range optional {
-		a(attrs)
+	opspec := tf.OpSpec{
+		Type: "AdjustContrastv2",
+		Input: []tf.Input{
+			images, contrast_factor,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Gets the next output from the given iterator .
+func IteratorGetNext(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "FractionalMaxPool",
+		Type: "IteratorGetNext",
 		Input: []tf.Input{
-			value,
+			iterator,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("IteratorGetNext", err)
+		return
+	}
+	return components
 }
 
-// Creates a MultiDeviceIterator resource.
+// Outputs the single element from the given dataset.
 //
 // Arguments:
-//	devices: A list of devices the iterator works across.
-//	shared_name: If non-empty, this resource will be shared under the given name
-// across multiple sessions.
-//	container: If non-empty, this resource is placed in the given container.
-// Otherwise, a default container is used.
-//	output_types: The type list for the return values.
-//	output_shapes: The list of shapes being produced.
+//	dataset: A handle to a dataset that contains a single element.
 //
-// Returns Handle to the resource created.
-func MultiDeviceIterator(scope *Scope, devices []string, shared_name string, container string, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+//
+//
+// Returns The components of the single element of `input`.
+func DatasetToSingleElement(scope *Scope, dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"devices": devices, "shared_name": shared_name, "container": container, "output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "MultiDeviceIterator",
-
+		Type: "DatasetToSingleElement",
+		Input: []tf.Input{
+			dataset,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("DatasetToSingleElement", err)
+		return
+	}
+	return components
 }
 
-// Deprecated. Use TensorArraySizeV3
+// Converts the given `resource_handle` representing an iterator to a string.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArraySizeV3
-func TensorArraySizeV2(scope *Scope, handle tf.Output, flow_in tf.Output) (size tf.Output) {
+// Arguments:
+//	resource_handle: A handle to an iterator resource.
+//
+// Returns A string representation of the given handle.
+func IteratorToStringHandle(scope *Scope, resource_handle tf.Output) (string_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArraySizeV2",
+		Type: "IteratorToStringHandle",
 		Input: []tf.Input{
-			handle, flow_in,
+			resource_handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Conv2DAttr is an optional argument to Conv2D.
-type Conv2DAttr func(optionalAttr)
-
-// Conv2DUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
-// If not specified, defaults to true
-func Conv2DUseCudnnOnGpu(value bool) Conv2DAttr {
-	return func(m optionalAttr) {
-		m["use_cudnn_on_gpu"] = value
-	}
-}
+// IteratorFromStringHandleAttr is an optional argument to IteratorFromStringHandle.
+type IteratorFromStringHandleAttr func(optionalAttr)
 
-// Conv2DDataFormat sets the optional data_format attribute to value.
+// IteratorFromStringHandleOutputTypes sets the optional output_types attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func Conv2DDataFormat(value string) Conv2DAttr {
+// value: If specified, defines the type of each tuple component in an
+// element produced by the resulting iterator.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func IteratorFromStringHandleOutputTypes(value []tf.DataType) IteratorFromStringHandleAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["output_types"] = value
 	}
 }
 
-// Conv2DDilations sets the optional dilations attribute to value.
+// IteratorFromStringHandleOutputShapes sets the optional output_shapes attribute to value.
 //
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func Conv2DDilations(value []int64) Conv2DAttr {
+// value: If specified, defines the shape of each tuple component in an
+// element produced by the resulting iterator.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func IteratorFromStringHandleOutputShapes(value []tf.Shape) IteratorFromStringHandleAttr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["output_shapes"] = value
 	}
 }
 
-// Computes a 2-D convolution given 4-D `input` and `filter` tensors.
-//
-// Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
-// and a filter / kernel tensor of shape
-// `[filter_height, filter_width, in_channels, out_channels]`, this op
-// performs the following:
-//
-// 1. Flattens the filter to a 2-D matrix with shape
-//    `[filter_height * filter_width * in_channels, output_channels]`.
-// 2. Extracts image patches from the input tensor to form a *virtual*
-//    tensor of shape `[batch, out_height, out_width,
-//    filter_height * filter_width * in_channels]`.
-// 3. For each patch, right-multiplies the filter matrix and the image patch
-//    vector.
-//
-// In detail, with the default NHWC format,
-//
-//     output[b, i, j, k] =
-//         sum_{di, dj, q} input[b, strides[1] * i + di, strides[2] * j + dj, q] *
-//                         filter[di, dj, q, k]
-//
-// Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
-// horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
+// Converts the given string representing a handle to an iterator to a resource.
 //
 // Arguments:
-//	input: A 4-D tensor. The dimension order is interpreted according to the value
-// of `data_format`, see below for details.
-//	filter: A 4-D tensor of shape
-// `[filter_height, filter_width, in_channels, out_channels]`
-//	strides: 1-D tensor of length 4.  The stride of the sliding window for each
-// dimension of `input`. The dimension order is determined by the value of
-// `data_format`, see below for details.
-//	padding: The type of padding algorithm to use.
+//	string_handle: A string representation of the given handle.
 //
-// Returns A 4-D tensor. The dimension order is determined by the value of
-// `data_format`, see below for details.
-func Conv2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv2DAttr) (output tf.Output) {
+// Returns A handle to an iterator resource.
+func IteratorFromStringHandle(scope *Scope, string_handle tf.Output, optional ...IteratorFromStringHandleAttr) (resource_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv2D",
+		Type: "IteratorFromStringHandle",
 		Input: []tf.Input{
-			input, filter,
+			string_handle,
 		},
 		Attrs: attrs,
 	}
@@ -33521,140 +38506,225 @@ func Conv2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, pa
 	return op.Output(0)
 }
 
-// StageAttr is an optional argument to Stage.
-type StageAttr func(optionalAttr)
-
-// StageCapacity sets the optional capacity attribute to value.
+// Gather slices from `params` axis `axis` according to `indices`.
 //
-// value: Maximum number of elements in the Staging Area. If > 0, inserts
-// on the container will block when the capacity is reached.
-// If not specified, defaults to 0
+// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
+// Produces an output tensor with shape `params.shape[:axis] + indices.shape +
+// params.shape[axis + 1:]` where:
 //
-// REQUIRES: value >= 0
-func StageCapacity(value int64) StageAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// StageMemoryLimit sets the optional memory_limit attribute to value.
+// ```python
+//     # Scalar indices (output is rank(params) - 1).
+//     output[a_0, ..., a_n, b_0, ..., b_n] =
+//       params[a_0, ..., a_n, indices, b_0, ..., b_n]
 //
-// value: The maximum number of bytes allowed for Tensors in the Staging Area.
-// If > 0, inserts will block until sufficient space is available.
-// If not specified, defaults to 0
+//     # Vector indices (output is rank(params)).
+//     output[a_0, ..., a_n, i, b_0, ..., b_n] =
+//       params[a_0, ..., a_n, indices[i], b_0, ..., b_n]
 //
-// REQUIRES: value >= 0
-func StageMemoryLimit(value int64) StageAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// StageContainer sets the optional container attribute to value.
+//     # Higher rank indices (output is rank(params) + rank(indices) - 1).
+//     output[a_0, ..., a_n, i, ..., j, b_0, ... b_n] =
+//       params[a_0, ..., a_n, indices[i, ..., j], b_0, ..., b_n]
+// ```
 //
-// value: If non-empty, this queue is placed in the given container. Otherwise,
-// a default container is used.
-// If not specified, defaults to ""
-func StageContainer(value string) StageAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
+// </div>
+//
+// Note that on CPU, if an out of bound index is found, an error is returned.
+// On GPU, if an out of bound index is found, a 0 is stored in the
+// corresponding output value.
+//
+// See also `tf.batch_gather` and `tf.gather_nd`.
+//
+// Arguments:
+//	params: The tensor from which to gather values. Must be at least rank
+// `axis + 1`.
+//	indices: Index tensor. Must be in range `[0, params.shape[axis])`.
+//	axis: The axis in `params` to gather `indices` from. Defaults to the first
+// dimension. Supports negative indexes.
+//
+// Returns Values from `params` gathered from indices given by `indices`, with
+// shape `params.shape[:axis] + indices.shape + params.shape[axis + 1:]`.
+func GatherV2(scope *Scope, params tf.Output, indices tf.Output, axis tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "GatherV2",
+		Input: []tf.Input{
+			params, indices, axis,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// StageSharedName sets the optional shared_name attribute to value.
+// Converts the given `resource_handle` representing an iterator to a variant tensor.
 //
-// value: It is necessary to match this name to the matching Unstage Op.
-// If not specified, defaults to ""
-func StageSharedName(value string) StageAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+// Arguments:
+//	resource_handle: A handle to an iterator resource.
+//
+// Returns A variant tensor storing the state of the iterator contained in the
+// resource.
+func SerializeIterator(scope *Scope, resource_handle tf.Output) (serialized tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SerializeIterator",
+		Input: []tf.Input{
+			resource_handle,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Stage values similar to a lightweight Enqueue.
-//
-// The basic functionality of this Op is similar to a queue with many
-// fewer capabilities and options.  This Op is optimized for performance.
+// Deserializes a proto into the tree handle
 //
 // Arguments:
-//	values: a list of tensors
-// dtypes A list of data types that inserted values should adhere to.
+//	tree_handle: Handle to the tree resource to be restored.
+//	tree_config: Serialied proto string of the boosted_trees.Tree proto.
 //
 // Returns the created operation.
-func Stage(scope *Scope, values []tf.Output, optional ...StageAttr) (o *tf.Operation) {
+func TensorForestTreeDeserialize(scope *Scope, tree_handle tf.Output, tree_config tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Stage",
+		Type: "TensorForestTreeDeserialize",
 		Input: []tf.Input{
-			tf.OutputList(values),
+			tree_handle, tree_config,
 		},
-		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// StagePeekAttr is an optional argument to StagePeek.
-type StagePeekAttr func(optionalAttr)
-
-// StagePeekCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func StagePeekCapacity(value int64) StagePeekAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
+// Constructs an Optional variant from a tuple of tensors.
+func OptionalFromValue(scope *Scope, components []tf.Output) (optional tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "OptionalFromValue",
+		Input: []tf.Input{
+			tf.OutputList(components),
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// StagePeekMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// DecodeProtoV2Attr is an optional argument to DecodeProtoV2.
+type DecodeProtoV2Attr func(optionalAttr)
+
+// DecodeProtoV2DescriptorSource sets the optional descriptor_source attribute to value.
 //
-// REQUIRES: value >= 0
-func StagePeekMemoryLimit(value int64) StagePeekAttr {
+// value: Either the special value `local://` or a path to a file containing
+// a serialized `FileDescriptorSet`.
+// If not specified, defaults to "local://"
+func DecodeProtoV2DescriptorSource(value string) DecodeProtoV2Attr {
 	return func(m optionalAttr) {
-		m["memory_limit"] = value
+		m["descriptor_source"] = value
 	}
 }
 
-// StagePeekContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func StagePeekContainer(value string) StagePeekAttr {
+// DecodeProtoV2MessageFormat sets the optional message_format attribute to value.
+//
+// value: Either `binary` or `text`.
+// If not specified, defaults to "binary"
+func DecodeProtoV2MessageFormat(value string) DecodeProtoV2Attr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["message_format"] = value
 	}
 }
 
-// StagePeekSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func StagePeekSharedName(value string) StagePeekAttr {
+// DecodeProtoV2Sanitize sets the optional sanitize attribute to value.
+//
+// value: Whether to sanitize the result or not.
+// If not specified, defaults to false
+func DecodeProtoV2Sanitize(value bool) DecodeProtoV2Attr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["sanitize"] = value
 	}
 }
 
-// Op peeks at the values at the specified index.  If the
+// The op extracts fields from a serialized protocol buffers message into tensors.
 //
-// underlying container does not contain sufficient elements
-// this op will block until it does.   This Op is optimized for
-// performance.
-func StagePeek(scope *Scope, index tf.Output, dtypes []tf.DataType, optional ...StagePeekAttr) (values []tf.Output) {
+// The `decode_proto` op extracts fields from a serialized protocol buffers
+// message into tensors.  The fields in `field_names` are decoded and converted
+// to the corresponding `output_types` if possible.
+//
+// A `message_type` name must be provided to give context for the field
+// names. The actual message descriptor can be looked up either in the
+// linked-in descriptor pool or a filename provided by the caller using
+// the `descriptor_source` attribute.
+//
+// Each output tensor is a dense tensor. This means that it is padded to
+// hold the largest number of repeated elements seen in the input
+// minibatch. (The shape is also padded by one to prevent zero-sized
+// dimensions). The actual repeat counts for each example in the
+// minibatch can be found in the `sizes` output. In many cases the output
+// of `decode_proto` is fed immediately into tf.squeeze if missing values
+// are not a concern. When using tf.squeeze, always pass the squeeze
+// dimension explicitly to avoid surprises.
+//
+// For the most part, the mapping between Proto field types and
+// TensorFlow dtypes is straightforward. However, there are a few
+// special cases:
+//
+// - A proto field that contains a submessage or group can only be converted
+// to `DT_STRING` (the serialized submessage). This is to reduce the
+// complexity of the API. The resulting string can be used as input
+// to another instance of the decode_proto op.
+//
+// - TensorFlow lacks support for unsigned integers. The ops represent uint64
+// types as a `DT_INT64` with the same twos-complement bit pattern
+// (the obvious way). Unsigned int32 values can be represented exactly by
+// specifying type `DT_INT64`, or using twos-complement if the caller
+// specifies `DT_INT32` in the `output_types` attribute.
+//
+// The `descriptor_source` attribute selects a source of protocol
+// descriptors to consult when looking up `message_type`. This may be a
+// filename containing a serialized `FileDescriptorSet` message,
+// or the special value `local://`, in which case only descriptors linked
+// into the code will be searched; the filename can be on any filesystem
+// accessible to TensorFlow.
+//
+// You can build a `descriptor_source` file using the `--descriptor_set_out`
+// and `--include_imports` options to the protocol compiler `protoc`.
+//
+// The `local://` database only covers descriptors linked into the
+// code via C++ libraries, not Python imports. You can link in a proto descriptor
+// by creating a cc_library target with alwayslink=1.
+//
+// Both binary and text proto serializations are supported, and can be
+// chosen using the `format` attribute.
+//
+// Arguments:
+//	bytes: Tensor of serialized protos with shape `batch_shape`.
+//	message_type: Name of the proto message type to decode.
+//	field_names: List of strings containing proto field names. An extension field can be decoded
+// by using its full name, e.g. EXT_PACKAGE.EXT_FIELD_NAME.
+//	output_types: List of TF types to use for the respective field in field_names.
+//
+// Returns Tensor of int32 with shape `[batch_shape, len(field_names)]`.
+// Each entry is the number of values found for the corresponding field.
+// Optional fields may have 0 or 1 values.List of tensors containing values for the corresponding field.
+// `values[i]` has datatype `output_types[i]`
+// and shape `[batch_shape, max(sizes[...,i])]`.
+func DecodeProtoV2(scope *Scope, bytes tf.Output, message_type string, field_names []string, output_types []tf.DataType, optional ...DecodeProtoV2Attr) (sizes tf.Output, values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{"message_type": message_type, "field_names": field_names, "output_types": output_types}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StagePeek",
+		Type: "DecodeProtoV2",
 		Input: []tf.Input{
-			index,
+			bytes,
 		},
 		Attrs: attrs,
 	}
@@ -33664,9 +38734,37 @@ func StagePeek(scope *Scope, index tf.Output, dtypes []tf.DataType, optional ...
 	}
 	var idx int
 	var err error
+	sizes = op.Output(idx)
 	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("StagePeek", err)
+		scope.UpdateErr("DecodeProtoV2", err)
 		return
 	}
-	return values
+	return sizes, values
+}
+
+// Creates an Optional variant with no value.
+func OptionalNone(scope *Scope) (optional tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "OptionalNone",
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns true if and only if the given Optional variant has a value.
+func OptionalHasValue(scope *Scope, optional tf.Output) (has_value tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "OptionalHasValue",
+		Input: []tf.Input{
+			optional,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
diff --git a/tensorflow/go/session.go b/tensorflow/go/session.go
index db6ae4f26cd92dcf5e542052e4bae561bbefe999..48909ffe39e0260096d9ec4513976a2d49c34a98 100644
--- a/tensorflow/go/session.go
+++ b/tensorflow/go/session.go
@@ -71,37 +71,39 @@ type Device struct {
 	MemoryLimitBytes int64
 }
 
-// Return list of devices associated with a Session
-func (s *Session) ListDevices() ([]Device, error) {
-	var devices []Device
+// String describes d and implements fmt.Stringer.
+func (d Device) String() string {
+	memStr := "no memory limit"
+	if d.MemoryLimitBytes >= 0 {
+		memStr = fmt.Sprintf("memory limit %d bytes", d.MemoryLimitBytes)
+	}
+	return fmt.Sprintf("(Device: name \"%s\", type %s, %s)", d.Name, d.Type, memStr)
+}
 
+func deviceSliceFromDeviceList(list *C.TF_DeviceList) ([]Device, error) {
+	var devices []Device
 	status := newStatus()
-	devices_list := C.TF_SessionListDevices(s.c, status.c)
-	if err := status.Err(); err != nil {
-		return nil, fmt.Errorf("SessionListDevices() failed: %v", err)
-	}
-	defer C.TF_DeleteDeviceList(devices_list)
 
-	for i := 0; i < int(C.TF_DeviceListCount(devices_list)); i++ {
-		device_name := C.TF_DeviceListName(devices_list, C.int(i), status.c)
+	for i := 0; i < int(C.TF_DeviceListCount(list)); i++ {
+		name := C.TF_DeviceListName(list, C.int(i), status.c)
 		if err := status.Err(); err != nil {
 			return nil, fmt.Errorf("DeviceListName(index=%d) failed: %v", i, err)
 		}
 
-		device_type := C.TF_DeviceListType(devices_list, C.int(i), status.c)
+		deviceType := C.TF_DeviceListType(list, C.int(i), status.c)
 		if err := status.Err(); err != nil {
 			return nil, fmt.Errorf("DeviceListType(index=%d) failed: %v", i, err)
 		}
 
-		memory_limit_bytes := C.TF_DeviceListMemoryBytes(devices_list, C.int(i), status.c)
+		memoryLimitBytes := C.TF_DeviceListMemoryBytes(list, C.int(i), status.c)
 		if err := status.Err(); err != nil {
 			return nil, fmt.Errorf("DeviceListMemoryBytes(index=%d) failed: %v", i, err)
 		}
 
 		device := Device{
-			Name:             C.GoString(device_name),
-			Type:             C.GoString(device_type),
-			MemoryLimitBytes: int64(memory_limit_bytes),
+			Name:             C.GoString(name),
+			Type:             C.GoString(deviceType),
+			MemoryLimitBytes: int64(memoryLimitBytes),
 		}
 
 		devices = append(devices, device)
@@ -110,6 +112,17 @@ func (s *Session) ListDevices() ([]Device, error) {
 	return devices, nil
 }
 
+// ListDevices returns the list of devices associated with a Session.
+func (s *Session) ListDevices() ([]Device, error) {
+	status := newStatus()
+	devicesList := C.TF_SessionListDevices(s.c, status.c)
+	if err := status.Err(); err != nil {
+		return nil, fmt.Errorf("SessionListDevices() failed: %v", err)
+	}
+	defer C.TF_DeleteDeviceList(devicesList)
+	return deviceSliceFromDeviceList(devicesList)
+}
+
 // Run the graph with the associated session starting with the supplied feeds
 // to compute the value of the requested fetches. Runs, but does not return
 // Tensors for operations specified in targets.
diff --git a/tensorflow/go/session_test.go b/tensorflow/go/session_test.go
index 05ace99a2387c6884832427187525f2fb7d5aba2..c9bda00167171179dac7ced108d928c9e7bb5f86 100644
--- a/tensorflow/go/session_test.go
+++ b/tensorflow/go/session_test.go
@@ -299,3 +299,21 @@ func TestListDevices(t *testing.T) {
 		t.Fatalf("no devices detected")
 	}
 }
+
+func TestDeviceString(t *testing.T) {
+	d := Device{Name: "foo", Type: "bar", MemoryLimitBytes: 12345}
+	got := d.String()
+	want := "(Device: name \"foo\", type bar, memory limit 12345 bytes)"
+	if got != want {
+		t.Errorf("Got \"%s\", want \"%s\"", got, want)
+	}
+}
+
+func TestDeviceStringNoMemoryLimit(t *testing.T) {
+	d := Device{Name: "foo", Type: "bar", MemoryLimitBytes: -1}
+	got := d.String()
+	want := "(Device: name \"foo\", type bar, no memory limit)"
+	if got != want {
+		t.Errorf("Got \"%s\", want \"%s\"", got, want)
+	}
+}
diff --git a/tensorflow/go/tensor_handle.go b/tensorflow/go/tensor_handle.go
new file mode 100644
index 0000000000000000000000000000000000000000..09192eccefe13cc4573e69dfac85c8aa169dd6fc
--- /dev/null
+++ b/tensorflow/go/tensor_handle.go
@@ -0,0 +1,170 @@
+/*
+Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package tensorflow
+
+// #include <stdlib.h>
+// #include "tensorflow/c/c_api.h"
+// #include "tensorflow/c/eager/c_api.h"
+import "C"
+import (
+	"runtime"
+	"unsafe"
+)
+
+// TensorHandle is a handle to a tensor on a device.
+//
+// A Tensor referenced by a TensorHandle may be on any device, whereas a Tensor
+// always resides in the host CPU's memory.
+//
+// A Tensor referenced by a TensorHandle may not have been computed yet. For
+// example, a TensorHandle might reference the output of an operation that has
+// not finished executing. Because of this, various methods, such as Shape() may
+// block until the tensor has been instantiated.
+//
+// This allows multiple operations to be performed on tensors on a device
+// (e.g. a GPU) without sending these values back to the host CPU in between
+// every operation.
+type TensorHandle struct {
+	c *C.TFE_TensorHandle
+}
+
+// NewTensorHandle creates a new tensor handle from a tensor.
+func NewTensorHandle(t *Tensor) (*TensorHandle, error) {
+	status := newStatus()
+	cHandle := C.TFE_NewTensorHandle(t.c, status.c)
+	if err := status.Err(); err != nil {
+		return nil, err
+	}
+
+	th := &TensorHandle{c: cHandle}
+	runtime.SetFinalizer(th, (*TensorHandle).finalizer)
+	return th, nil
+}
+
+func (th *TensorHandle) finalizer() {
+	C.TFE_DeleteTensorHandle(th.c)
+}
+
+// newTensorHandleFromC takes ownership of c and returns the owning TensorHandle.
+func newTensorHandleFromC(c *C.TFE_TensorHandle) *TensorHandle {
+	th := &TensorHandle{c: c}
+	runtime.SetFinalizer(th, (*TensorHandle).finalizer)
+	return th
+}
+
+// DataType returns the TensorHandle's datatype.
+func (th *TensorHandle) DataType() DataType {
+	return DataType(C.TFE_TensorHandleDataType(th.c))
+}
+
+// Shape returns the shape of the Tensor referenced by th.
+func (th *TensorHandle) Shape() ([]int64, error) {
+	n, err := th.numDims()
+	if err != nil {
+		return nil, err
+	}
+	r := make([]int64, n)
+	for i := 0; i < n; i++ {
+		if r[i], err = th.dim(i); err != nil {
+			return nil, err
+		}
+	}
+	return r, nil
+}
+
+// numDims returns the number of dimensions of the TensorHandle. It blocks
+// until the operation that produces the handle has completed.
+func (th *TensorHandle) numDims() (int, error) {
+	status := newStatus()
+	n := int(C.TFE_TensorHandleNumDims(th.c, status.c))
+	return n, status.Err()
+}
+
+// dim returns the size of the index'th dimension of the TensorHandle. It
+// blocks until the operation that produces the handle has completed.
+func (th *TensorHandle) dim(index int) (int64, error) {
+	status := newStatus()
+	n := int64(C.TFE_TensorHandleDim(th.c, C.int(index), status.c))
+	if err := status.Err(); err != nil {
+		return 0, err
+	}
+	return n, nil
+}
+
+// DeviceName returns the name of the device of the operation that produced the
+// TensorHandle. If the handle was produced by a copy, it returns the
+// destination device of the copy. Note that returned device name is not always
+// the device holding the tensor handle's memory. If you want the latter, use
+// BackingDeviceName. This function will block till the operation that produces
+// th has completed.
+func (th *TensorHandle) DeviceName() (string, error) {
+	status := newStatus()
+	name := C.TFE_TensorHandleDeviceName(th.c, status.c)
+	if err := status.Err(); err != nil {
+		return "", err
+	}
+	return C.GoString(name), nil
+}
+
+// BackingDeviceName returns the name of the device in whose memory the tensor
+// handle resides. This function will block till the operation that produces
+// `h` has completed.
+//
+// WARNING: The implementation currently returns the same as DeviceName().
+// After TensoFlow 1.13's C library is released, this implementation will
+// be updated to return what the documentation says!
+func (th *TensorHandle) BackingDeviceName() (string, error) {
+	// TODO(ashankar): Restore after TensorFlow 1.13 is released.
+	// See https://github.com/tensorflow/tensorflow/issues/23257#issuecomment-433751410
+	return th.DeviceName()
+	/*
+	status := newStatus()
+	name := C.TFE_TensorHandleBackingDeviceName(th.c, status.c)
+	if err := status.Err(); err != nil {
+		return "", err
+	}
+	return C.GoString(name), nil
+	*/
+}
+
+// ToTensor returns the Tensor referenced by th. It may block if this tensor is
+// not yet computed.
+func (th *TensorHandle) ToTensor() (*Tensor, error) {
+	status := newStatus()
+	cTensor := C.TFE_TensorHandleResolve(th.c, status.c)
+	if err := status.Err(); err != nil {
+		return nil, err
+	}
+	return newTensorFromC(cTensor), nil
+}
+
+// CopyToDevice creates a new TensorHandle with the same contents as this
+// TensorHandle but placed in the memory of the device 'deviceName'. If source
+// and destination are the same device, then this creates a new handle that
+// shares the underlying buffer. Otherwise, it currently requires at least one
+// of the source or destination devices to be CPU (i.e., for the source or
+// destination tensor to be placed in host memory).
+func (th *TensorHandle) CopyToDevice(c *Context, deviceName string) (*TensorHandle, error) {
+	status := newStatus()
+	n := C.CString(deviceName)
+	newTh := C.TFE_TensorHandleCopyToDevice(th.c, c.c, n, status.c)
+	C.free(unsafe.Pointer(n))
+	if err := status.Err(); err != nil {
+		return nil, err
+	}
+	return newTensorHandleFromC(newTh), nil
+}
diff --git a/tensorflow/go/tensor_handle_test.go b/tensorflow/go/tensor_handle_test.go
new file mode 100644
index 0000000000000000000000000000000000000000..15dea64b08c1b428c6faea973643a41f91cb13a8
--- /dev/null
+++ b/tensorflow/go/tensor_handle_test.go
@@ -0,0 +1,127 @@
+/*
+Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package tensorflow
+
+import (
+	"reflect"
+	"strings"
+	"testing"
+)
+
+func TestNewTensorHandle(t *testing.T) {
+	vals := [][]float32{{1.0, 2.0}, {3.0, 4.0}}
+	tensor, err := NewTensor(vals)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if _, err = NewTensorHandle(tensor); err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestTensorHandleDataType(t *testing.T) {
+	vals := [][]float32{{1.0, 2.0}, {3.0, 4.0}}
+	tensor, err := NewTensor(vals)
+	if err != nil {
+		t.Fatal(err)
+	}
+	th, err := NewTensorHandle(tensor)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if got, want := th.DataType(), Float; got != want {
+		t.Errorf("Got %v, want %v", got, want)
+	}
+}
+
+func TestTensorHandleShape(t *testing.T) {
+	vals := [][]float32{{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}}
+	tensor, err := NewTensor(vals)
+	if err != nil {
+		t.Fatal(err)
+	}
+	th, err := NewTensorHandle(tensor)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	got, err := th.Shape()
+	if err != nil {
+		t.Fatal(err)
+	}
+	if want := []int64{2, 3}; !reflect.DeepEqual(got, want) {
+		t.Errorf("Got %#v, want %#v", got, want)
+	}
+}
+
+func TestTensorHandleDeviceName(t *testing.T) {
+	vals := [][]float32{{1.0, 2.0}, {3.0, 4.0}}
+	tensor, err := NewTensor(vals)
+	if err != nil {
+		t.Fatal(err)
+	}
+	th, err := NewTensorHandle(tensor)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	d, err := th.DeviceName()
+	if err != nil {
+		t.Fatal(err)
+	}
+	if !strings.Contains(d, "CPU") {
+		t.Errorf("DeviceName() did not return a CPU device; got: %s", d)
+	}
+}
+
+func TestTensorHandleBackingDeviceName(t *testing.T) {
+	vals := [][]float32{{1.0, 2.0}, {3.0, 4.0}}
+	tensor, err := NewTensor(vals)
+	if err != nil {
+		t.Fatal(err)
+	}
+	th, err := NewTensorHandle(tensor)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	d, err := th.BackingDeviceName()
+	if err != nil {
+		t.Fatal(err)
+	}
+	if !strings.Contains(d, "CPU") {
+		t.Errorf("BackingDeviceName() did not return a CPU device; got: %s", d)
+	}
+}
+
+func TestTensorHandleToTensor(t *testing.T) {
+	initialVals := [][]float32{{1.0, 2.0}, {3.0, 4.0}}
+	initialTensor, err := NewTensor(initialVals)
+	if err != nil {
+		t.Fatal(err)
+	}
+	th, err := NewTensorHandle(initialTensor)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	tensor, err := th.ToTensor()
+	if v := tensor.Value().([][]float32); !reflect.DeepEqual(v, initialVals) {
+		t.Errorf("Got %#v, want %#v", v, initialVals)
+	}
+}
diff --git a/tensorflow/java/BUILD b/tensorflow/java/BUILD
index 10808e162ee4cc679430c0573e5bff8322ad6fff..54b10cd184f1862329c361d4bcb4b0d736522360 100644
--- a/tensorflow/java/BUILD
+++ b/tensorflow/java/BUILD
@@ -295,6 +295,19 @@ tf_java_test(
     ],
 )
 
+tf_java_test(
+    name = "GeneratedOperationsTest",
+    size = "small",
+    srcs = ["src/test/java/org/tensorflow/op/core/GeneratedOperationsTest.java"],
+    javacopts = JAVACOPTS,
+    test_class = "org.tensorflow.op.core.GeneratedOperationsTest",
+    deps = [
+        ":tensorflow",
+        ":testutil",
+        "@junit",
+    ],
+)
+
 tf_java_test(
     name = "GradientsTest",
     size = "small",
@@ -367,15 +380,13 @@ tf_cc_binary(
     linkopts = select({
         "//tensorflow:debug": [],  # Disable all custom linker options in debug mode
         "//tensorflow:darwin": [
-            "-Wl,-exported_symbols_list",  # This line must be directly followed by LINKER_EXPORTED_SYMBOLS
-            "$(location {})".format(LINKER_EXPORTED_SYMBOLS),
+            "-Wl,-exported_symbols_list,$(location {})".format(LINKER_EXPORTED_SYMBOLS),
         ],
         "//tensorflow:windows": [],
         "//conditions:default": [
             "-z defs",
             "-s",
-            "-Wl,--version-script",  #  This line must be directly followed by LINKER_VERSION_SCRIPT
-            "$(location {})".format(LINKER_VERSION_SCRIPT),
+            "-Wl,--version-script,$(location {})".format(LINKER_VERSION_SCRIPT),
         ],
     }),
     linkshared = 1,
diff --git a/tensorflow/java/README.md b/tensorflow/java/README.md
index 951e8bdd0dd8aae46a361a8ffcff276579433641..4206f6f9fc8ed029d1a7d9b044dd079ec523de31 100644
--- a/tensorflow/java/README.md
+++ b/tensorflow/java/README.md
@@ -20,13 +20,13 @@
 Releases built from release branches are available on Maven Central.
 Additionally, every day binaries are built from the `master` branch on GitHub:
 
-- [JAR](https://storage.googleapis.com/tensorflow-nightly/github/tensorflow/lib_package/libtensorflow.jar)
-- [Sourc JAR](https://storage.googleapis.com/tensorflow-nightly/github/tensorflow/lib_package/libtensorflow-src.jar)
-- JNI:
-  - [Linux CPU-only](https://storage.googleapis.com/tensorflow-nightly/github/tensorflow/lib_package/libtensorflow_jni-cpu-linux-x86_64.tar.gz)
-  - [Linux GPU](https://storage.googleapis.com/tensorflow-nightly/github/tensorflow/lib_package/libtensorflow_jni-gpu-linux-x86_64.tar.gz)
-  - [MacOS](https://storage.googleapis.com/tensorflow-nightly/github/tensorflow/lib_package/libtensorflow_jni-cpu-darwin-x86_64.tar.gz)
-  - Windows: (No nightly builds available yet)
+-   [JAR](https://storage.googleapis.com/tensorflow-nightly/github/tensorflow/lib_package/libtensorflow.jar)
+-   [Source JAR](https://storage.googleapis.com/tensorflow-nightly/github/tensorflow/lib_package/libtensorflow-src.jar)
+-   JNI:
+    -   [Linux CPU-only](https://storage.googleapis.com/tensorflow-nightly/github/tensorflow/lib_package/libtensorflow_jni-cpu-linux-x86_64.tar.gz)
+    -   [Linux GPU](https://storage.googleapis.com/tensorflow-nightly/github/tensorflow/lib_package/libtensorflow_jni-gpu-linux-x86_64.tar.gz)
+    -   [MacOS](https://storage.googleapis.com/tensorflow-nightly/github/tensorflow/lib_package/libtensorflow_jni-cpu-darwin-x86_64.tar.gz)
+    -   Windows: (No nightly builds available yet)
 
 ## Building from source
 
diff --git a/tensorflow/java/build_defs.bzl b/tensorflow/java/build_defs.bzl
index e1916ca4d9d6aa179e1a69451a5e981783560026..f423cc4d8277509d45aa8344e322f71b7f1306a8 100644
--- a/tensorflow/java/build_defs.bzl
+++ b/tensorflow/java/build_defs.bzl
@@ -18,7 +18,7 @@ XLINT_OPTS = [
     "-Xlint:-processing",
     "-Xlint:-serial",
     "-Xlint:-try",
-    "-Xlint:-classfile", # see b/32750402, go/javac-warnings#classfile
+    "-Xlint:-classfile",  # see b/32750402, go/javac-warnings#classfile
 ]
 
 # The bazel errorprone plugin currently only enables default errorChecks
diff --git a/tensorflow/java/maven/libtensorflow/pom.xml b/tensorflow/java/maven/libtensorflow/pom.xml
index db3a3609f1ac4fda18ff5a1248e61c675a8bf9f9..4d9f629fa295ee933095b8074177f3ea4d435765 100644
--- a/tensorflow/java/maven/libtensorflow/pom.xml
+++ b/tensorflow/java/maven/libtensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.12.0</version>
+    <version>1.13.0-rc0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow</artifactId>
diff --git a/tensorflow/java/maven/libtensorflow_jni/pom.xml b/tensorflow/java/maven/libtensorflow_jni/pom.xml
index 53f7a2d63ef5bc8cfe4fbe372cf2fd3f58a0fe33..49a4359249b60f6ea69ac5b37d2fd4aeafe2a09d 100644
--- a/tensorflow/java/maven/libtensorflow_jni/pom.xml
+++ b/tensorflow/java/maven/libtensorflow_jni/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.12.0</version>
+    <version>1.13.0-rc0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow_jni</artifactId>
diff --git a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
index a17724c805e38239c61dd27a5cc9ec918bbb2e0f..b54da039e544132f1d796db5a086cbdee6a11111 100644
--- a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
+++ b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.12.0</version>
+    <version>1.13.0-rc0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow_jni_gpu</artifactId>
diff --git a/tensorflow/java/maven/pom.xml b/tensorflow/java/maven/pom.xml
index 30831f90b9f7b4beb5ae3f2ceebadcb6e1f8771e..8dfb5d7aa3c3faab7565947f8b3e10162f5a0deb 100644
--- a/tensorflow/java/maven/pom.xml
+++ b/tensorflow/java/maven/pom.xml
@@ -6,7 +6,7 @@
   <modelVersion>4.0.0</modelVersion>
   <groupId>org.tensorflow</groupId>
   <artifactId>parentpom</artifactId>
-  <version>1.12.0</version>
+  <version>1.13.0-rc0</version>
   <packaging>pom</packaging>
 
   <url>https://www.tensorflow.org</url>
diff --git a/tensorflow/java/maven/proto/pom.xml b/tensorflow/java/maven/proto/pom.xml
index dd6b52be62487ba6cb989b4917a15df7f473a848..65f545bdde22850f6f9b04750c3e2ab3b45fdb28 100644
--- a/tensorflow/java/maven/proto/pom.xml
+++ b/tensorflow/java/maven/proto/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.12.0</version>
+    <version>1.13.0-rc0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>proto</artifactId>
diff --git a/tensorflow/java/maven/spark-tensorflow-connector/pom.xml b/tensorflow/java/maven/spark-tensorflow-connector/pom.xml
index f47c11809d58464953028c388d491b91f67c3510..1ca1e91891e4d0fb7ae68593e17db92601eb7553 100644
--- a/tensorflow/java/maven/spark-tensorflow-connector/pom.xml
+++ b/tensorflow/java/maven/spark-tensorflow-connector/pom.xml
@@ -6,7 +6,7 @@
     <groupId>org.tensorflow</groupId>
     <artifactId>spark-tensorflow-connector_2.11</artifactId>
     <packaging>jar</packaging>
-    <version>1.12.0</version>
+    <version>1.13.0-rc0</version>
     <name>spark-tensorflow-connector</name>
     <url>https://www.tensorflow.org</url>
     <description>TensorFlow TFRecord connector for Apache Spark DataFrames</description>
diff --git a/tensorflow/java/maven/tensorflow-hadoop/pom.xml b/tensorflow/java/maven/tensorflow-hadoop/pom.xml
index 11aaba983f6ded9a6e757703fd9a2411db82ceb6..df86c50f86aba848f95d9b122e1c28f40123bb9f 100644
--- a/tensorflow/java/maven/tensorflow-hadoop/pom.xml
+++ b/tensorflow/java/maven/tensorflow-hadoop/pom.xml
@@ -5,7 +5,7 @@
     <groupId>org.tensorflow</groupId>
     <artifactId>tensorflow-hadoop</artifactId>
     <packaging>jar</packaging>
-    <version>1.12.0</version>
+    <version>1.13.0-rc0</version>
     <name>tensorflow-hadoop</name>
     <url>https://www.tensorflow.org</url>
     <description>TensorFlow TFRecord InputFormat/OutputFormat for Apache Hadoop</description>
diff --git a/tensorflow/java/maven/tensorflow/pom.xml b/tensorflow/java/maven/tensorflow/pom.xml
index 07fcfa5144600f7d9cbf6edbfbecbecc7c115631..a318e7fa084b1256b7cdd7c2f6f05600d36a2fae 100644
--- a/tensorflow/java/maven/tensorflow/pom.xml
+++ b/tensorflow/java/maven/tensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.12.0</version>
+    <version>1.13.0-rc0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>tensorflow</artifactId>
diff --git a/tensorflow/java/src/gen/cc/op_gen_main.cc b/tensorflow/java/src/gen/cc/op_gen_main.cc
index 0d9e0883af262ee1f262a5e1308cb9df8763488d..cf4bb03dadec421411300100880f9129d7da47be 100644
--- a/tensorflow/java/src/gen/cc/op_gen_main.cc
+++ b/tensorflow/java/src/gen/cc/op_gen_main.cc
@@ -35,7 +35,7 @@ const char kUsageHeader[] =
     "graph.\n\n"
     "Operation wrappers are generated under the path specified by the "
     "'--output_dir' argument. This path can be absolute or relative to the\n"
-    "current working directory and will be created if it does not exists.\n\n"
+    "current working directory and will be created if it does not exist.\n\n"
     "Note that the operations will not be available through the "
     "'org.tensorflow.op.Ops' API until the generated classes are compiled\n"
     "using an appropriate annotation processor.\n\n"
diff --git a/tensorflow/java/src/gen/cc/op_generator.cc b/tensorflow/java/src/gen/cc/op_generator.cc
index 5d6387e88e96802e9226774abd391ac2dd673143..db6116bd5c843c2846d6b9f67e253e87db6daffc 100644
--- a/tensorflow/java/src/gen/cc/op_generator.cc
+++ b/tensorflow/java/src/gen/cc/op_generator.cc
@@ -516,7 +516,7 @@ bool CanGenerateOp(const OpDef& op_def, const ApiDef& api_def) {
     return false;
   }
   for (const auto& attr : op_def.attr()) {
-    if (attr.type() == "func") {
+    if (attr.type() == "func" || attr.type() == "list(func)") {
       return false;  // TODO(karllessard) add support for function attributes
     }
   }
diff --git a/tensorflow/java/src/gen/cc/op_specs.cc b/tensorflow/java/src/gen/cc/op_specs.cc
index 4f5a491d259a1381976d21c777bc0871ada1b916..4024efedefd41fb90b215a9d5227d6028331cdaa 100644
--- a/tensorflow/java/src/gen/cc/op_specs.cc
+++ b/tensorflow/java/src/gen/cc/op_specs.cc
@@ -91,11 +91,6 @@ class TypeResolver {
 
 Type TypeResolver::TypeOf(const OpDef_ArgDef& arg_def, bool* iterable_out) {
   *iterable_out = false;
-  if (!arg_def.number_attr().empty()) {
-    // when number_attr is set, argument has to be a list of tensors
-    *iterable_out = true;
-    visited_attrs_.insert(std::make_pair(arg_def.number_attr(), Type::Int()));
-  }
   Type type = Type::Wildcard();
   if (arg_def.type() != DataType::DT_INVALID) {
     type = Type::ForDataType(arg_def.type());
@@ -122,6 +117,11 @@ Type TypeResolver::TypeOf(const OpDef_ArgDef& arg_def, bool* iterable_out) {
     LOG(FATAL) << "Cannot resolve data type of argument \"" << arg_def.name()
                << "\" in operation \"" << op_def_.name() << "\"";
   }
+  if (!arg_def.number_attr().empty()) {
+    // when number_attr is set, argument has to be a list of tensors
+    *iterable_out = true;
+    visited_attrs_.insert(std::make_pair(arg_def.number_attr(), Type::Int()));
+  }
   return type;
 }
 
diff --git a/tensorflow/java/src/gen/gen_ops.bzl b/tensorflow/java/src/gen/gen_ops.bzl
index f4ff34ea0361fba5528126b93f3f6e45289d8df2..b46721a93dcbd105dea7c52e8ea615cbd00af1c8 100644
--- a/tensorflow/java/src/gen/gen_ops.bzl
+++ b/tensorflow/java/src/gen/gen_ops.bzl
@@ -17,46 +17,48 @@ load(
 # and then archive those source files into
 #     ops/gen_sources.srcjar
 #
-def tf_java_op_gen_srcjar(name,
-                          gen_tool,
-                          base_package,
-                          api_def_srcs=[],
-                          out_dir="ops/",
-                          out_src_dir="src/main/java/",
-                          visibility=["//tensorflow/java:__pkg__"]):
+def tf_java_op_gen_srcjar(
+        name,
+        gen_tool,
+        base_package,
+        api_def_srcs = [],
+        out_dir = "ops/",
+        out_src_dir = "src/main/java/",
+        visibility = ["//tensorflow/java:__pkg__"]):
+    gen_cmds = ["rm -rf $(@D)"]  # Always start from fresh when generating source files
+    srcs = api_def_srcs[:]
 
-  gen_cmds = ["rm -rf $(@D)"]  # Always start from fresh when generating source files
-  srcs = api_def_srcs[:]
+    if not api_def_srcs:
+        api_def_args_str = ","
+    else:
+        api_def_args = []
+        for api_def_src in api_def_srcs:
+            # Add directory of the first ApiDef source to args.
+            # We are assuming all ApiDefs in a single api_def_src are in the
+            # same directory.
+            api_def_args.append(
+                "$$(dirname $$(echo $(locations " + api_def_src +
+                ") | cut -d\" \" -f1))",
+            )
+        api_def_args_str = ",".join(api_def_args)
 
-  if not api_def_srcs:
-    api_def_args_str = ","
-  else:
-    api_def_args = []
-    for api_def_src in api_def_srcs:
-      # Add directory of the first ApiDef source to args.
-      # We are assuming all ApiDefs in a single api_def_src are in the
-      # same directory.
-      api_def_args.append(
-          "$$(dirname $$(echo $(locations " + api_def_src +
-          ") | cut -d\" \" -f1))")
-    api_def_args_str = ",".join(api_def_args)
+    gen_cmds += ["$(location " + gen_tool + ")" +
+                 " --output_dir=$(@D)/" + out_src_dir +
+                 " --base_package=" + base_package +
+                 " --api_dirs=" + api_def_args_str]
 
-  gen_cmds += ["$(location " + gen_tool + ")" +
-               " --output_dir=$(@D)/" + out_src_dir +
-               " --base_package=" + base_package +
-               " --api_dirs=" + api_def_args_str]
+    # Generate a source archive containing generated code for these ops.
+    gen_srcjar = out_dir + name + ".srcjar"
+    gen_cmds += ["$(location @local_jdk//:jar) cMf $(location :" + gen_srcjar + ") -C $(@D) src"]
 
-  # Generate a source archive containing generated code for these ops.
-  gen_srcjar = out_dir + name + ".srcjar"
-  gen_cmds += ["$(location @local_jdk//:jar) cMf $(location :" + gen_srcjar + ") -C $(@D) src"]
-
-  native.genrule(
-      name=name,
-      srcs=srcs,
-      outs=[gen_srcjar],
-      tools=[
-          "@local_jdk//:jar",
-          "@local_jdk//:jdk",
-          gen_tool
-      ] + tf_binary_additional_srcs(),
-      cmd=" && ".join(gen_cmds))
+    native.genrule(
+        name = name,
+        srcs = srcs,
+        outs = [gen_srcjar],
+        tools = [
+            "@local_jdk//:jar",
+            "@local_jdk//:jdk",
+            gen_tool,
+        ] + tf_binary_additional_srcs(),
+        cmd = " && ".join(gen_cmds),
+    )
diff --git a/tensorflow/java/src/gen/java/org/tensorflow/processor/OperatorProcessor.java b/tensorflow/java/src/gen/java/org/tensorflow/processor/OperatorProcessor.java
index 1b7bcdab35f45142aefdc9e9635b398090e60b17..df1426ad75143d720f1d5bd3cf4ce44d30cb226e 100644
--- a/tensorflow/java/src/gen/java/org/tensorflow/processor/OperatorProcessor.java
+++ b/tensorflow/java/src/gen/java/org/tensorflow/processor/OperatorProcessor.java
@@ -340,7 +340,7 @@ public final class OperatorProcessor extends AbstractProcessor {
                     + "{@link $T @Operator} is exposed\n"
                     + "by this API or one of its subgroup.\n<p>Example usage:\n<pre>{@code\n"
                     + "try (Graph g = new Graph()) {\n"
-                    + "  Ops ops = new Ops(g);\n"
+                    + "  Ops ops = Ops.create(g);\n"
                     + "  // Operations are typed classes with convenience\n"
                     + "  // builders in Ops.\n"
                     + "  Constant three = ops.constant(3);\n"
diff --git a/tensorflow/java/src/main/java/org/tensorflow/SavedModelBundle.java b/tensorflow/java/src/main/java/org/tensorflow/SavedModelBundle.java
index 49594e6b47b9295d164a1823386b0981776e66f4..e653373f8569d9e84a8e524fd0f7439d7747104f 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/SavedModelBundle.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/SavedModelBundle.java
@@ -84,7 +84,7 @@ public class SavedModelBundle implements AutoCloseable {
    * <p>This method is a shorthand for:
    *
    * <pre>{@code
-   * SavedModelBundler.loader().withTags(tags).load();
+   * SavedModelBundle.loader().withTags(tags).load();
    * }</pre>
    *
    * @param exportDir the directory path containing a saved model.
diff --git a/tensorflow/java/src/main/java/org/tensorflow/Session.java b/tensorflow/java/src/main/java/org/tensorflow/Session.java
index a660d25f98ec961ac2ba1a48bced13803c00096b..8cc23e2991b301448b319313f111a48349e1b15f 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/Session.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/Session.java
@@ -149,10 +149,10 @@ public final class Session implements AutoCloseable {
 
     /**
      * Use {@code t} instead of the Tensor referred to by executing the operation referred to by
-     * {@code output}.
+     * {@code operand}.
      */
-    public Runner feed(Output<?> o, Tensor<?> t) {
-      inputs.add(o);
+    public Runner feed(Operand<?> operand, Tensor<?> t) {
+      inputs.add(operand.asOutput());
       inputTensors.add(t);
       return this;
     }
diff --git a/tensorflow/java/src/main/java/org/tensorflow/op/PrimitiveOp.java b/tensorflow/java/src/main/java/org/tensorflow/op/PrimitiveOp.java
index 8e56f970416ef35737d6763fcc6bb46bc7a157c5..006ae99dc46265aede6991e2cea99119113de165 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/op/PrimitiveOp.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/op/PrimitiveOp.java
@@ -25,6 +25,11 @@ import org.tensorflow.Operation;
  */
 public abstract class PrimitiveOp implements Op {
 
+  /** Returns the underlying {@link Operation} */
+  public Operation op() {
+    return operation;
+  }
+
   @Override
   public final int hashCode() {
     return operation.hashCode();
@@ -48,10 +53,6 @@ public abstract class PrimitiveOp implements Op {
     return String.format("<%s '%s'>", operation.type(), operation.name());
   }
 
-  /**
-   * Underlying operation. It is deliberately not exposed by a getter method to avoid any name
-   * conflict with generated methods of the subclasses.
-   */
   protected final Operation operation;
 
   /**
diff --git a/tensorflow/java/src/main/java/org/tensorflow/op/annotation/Operator.java b/tensorflow/java/src/main/java/org/tensorflow/op/annotation/Operator.java
index 3782240edb4008cc71c55cf48cba8f5873b71018..38f466c57416eac96a09cd1dfe8558fcb8e3606f 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/op/annotation/Operator.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/op/annotation/Operator.java
@@ -25,11 +25,11 @@ import java.lang.annotation.Target;
  * Annotation used by classes to make TensorFlow operations conveniently accessible via {@code
  * org.tensorflow.op.Ops}.
  *
- * <p>An annotation processor (TODO: not yet implemented) builds the {@code Ops} class by
- * aggregating all classes annotated as {@code @Operator}s. Each annotated class <b>must</b> have at
- * least one public static factory method named {@code create} that accepts a {@link
- * org.tensorflow.op.Scope} as its first argument. The processor then adds a convenience method in
- * the {@code Ops} class. For example:
+ * <p>An annotation processor ({@code org.tensorflow.processor.OperatorProcessor}) builds the
+ * {@code Ops} class by aggregating all classes annotated as {@code @Operator}s. Each annotated
+ * class <b>must</b> have at least one public static factory method named {@code create} that
+ * accepts a {@link org.tensorflow.op.Scope} as its first argument. The processor then adds a
+ * convenience method in the {@code Ops} class. For example:
  *
  * <pre>{@code
  * @Operator
@@ -45,7 +45,7 @@ import java.lang.annotation.Target;
  * <pre>{@code
  * import org.tensorflow.op.Ops;
  * ...
- * Ops ops = new Ops(graph);
+ * Ops ops = Ops.create(graph);
  * ...
  * ops.myOp(operand);
  * // and has exactly the same effect as calling
diff --git a/tensorflow/java/src/test/java/org/tensorflow/op/core/GeneratedOperationsTest.java b/tensorflow/java/src/test/java/org/tensorflow/op/core/GeneratedOperationsTest.java
new file mode 100644
index 0000000000000000000000000000000000000000..49c4ff639ecd36763c65e0143d60ab2590aa008b
--- /dev/null
+++ b/tensorflow/java/src/test/java/org/tensorflow/op/core/GeneratedOperationsTest.java
@@ -0,0 +1,60 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.op.core;
+
+import static org.junit.Assert.assertEquals;
+
+import java.util.ArrayList;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+import org.tensorflow.Graph;
+import org.tensorflow.Operand;
+import org.tensorflow.Session;
+import org.tensorflow.Tensor;
+import org.tensorflow.op.Ops;
+
+@RunWith(JUnit4.class)
+public final class GeneratedOperationsTest {
+
+  @Test
+  public void tensorInputTensorOutput() {
+    try (Graph g = new Graph();
+        Session sess = new Session(g)) {
+      Ops ops = Ops.create(g);
+      Operand<Integer> x = ops.math().add(ops.constant(1), ops.constant(2));
+      try (Tensor<Integer> result = sess.runner().fetch(x).run().get(0).expect(Integer.class)) {
+        assertEquals(3, result.intValue());
+      }
+    }
+  }
+
+  @Test
+  public void testListInputTensorOutput() {
+    try (Graph g = new Graph();
+        Session sess = new Session(g)) {
+      Ops ops = Ops.create(g);
+      ArrayList<Operand<Integer>> inputs = new ArrayList<Operand<Integer>>();
+      inputs.add(ops.constant(1));
+      inputs.add(ops.constant(2));
+      inputs.add(ops.constant(3));
+      Operand<Integer> x = ops.math().addN(inputs);
+      try (Tensor<Integer> result = sess.runner().fetch(x).run().get(0).expect(Integer.class)) {
+        assertEquals(6, result.intValue());
+      }
+    }
+  }
+}
diff --git a/tensorflow/js/ops/ts_op_gen_test.cc b/tensorflow/js/ops/ts_op_gen_test.cc
index 03241689b5fe2c18f1131e9400c51b88298f143a..1c51dd030f52bc2d248f9a98f17f9d656a34065d 100644
--- a/tensorflow/js/ops/ts_op_gen_test.cc
+++ b/tensorflow/js/ops/ts_op_gen_test.cc
@@ -112,22 +112,15 @@ import {createTensorsTypeOpAttr, nodeBackend} from './op_utils';
 }
 
 TEST(TsOpGenTest, InputSingleAndList) {
-  const string api_def = R"(
-op {
-  name: "Foo"
-  input_arg {
-    name: "images"
-    type_attr: "T"
-    number_attr: "N"
-  }
-}
-)";
+  const string api_def = R"pb(
+    op { graph_op_name: "Foo" arg_order: "dim" arg_order: "images" }
+  )pb";
 
   string ts_file_text;
   GenerateTsOpFileText("", api_def, &ts_file_text);
 
   const string expected = R"(
-export function Foo(images: tfc.Tensor[], dim: tfc.Tensor): tfc.Tensor {
+export function Foo(dim: tfc.Tensor, images: tfc.Tensor[]): tfc.Tensor {
 )";
   ExpectContainsStr(ts_file_text, expected);
 }
diff --git a/tensorflow/lite/BUILD b/tensorflow/lite/BUILD
index 8fca01624cfa2c21cd428e63ed1eadf7b853f107..46800a89bb773368549e739f59e41828626d1dfa 100644
--- a/tensorflow/lite/BUILD
+++ b/tensorflow/lite/BUILD
@@ -4,13 +4,15 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
-load("//tensorflow/lite:build_def.bzl", "tflite_copts")
-load("//tensorflow:tensorflow.bzl", "if_not_windows")
+load("//tensorflow:tensorflow.bzl", "if_not_windows", "tf_cc_test")
+load("//tensorflow/lite:build_def.bzl", "tflite_cc_shared_object", "tflite_copts")
+load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
 
 exports_files(glob([
     "testdata/*.bin",
     "testdata/*.pb",
+    "testdata/*.tflite",
+    "testdata/*.csv",
     "models/testdata/*",
 ]))
 
@@ -60,16 +62,13 @@ cc_library(
     ],
 )
 
-cc_test(
+tf_cc_test(
     name = "arena_planner_test",
     size = "small",
     srcs = ["arena_planner_test.cc"],
-    tags = [
-        "tflite_not_portable",
-    ],
     deps = [
         ":arena_planner",
-        "//tensorflow/core:lib",
+        "//tensorflow/core:tflite_portable_logging",
         "//tensorflow/lite/testing:util",
         "@com_google_googletest//:gtest",
     ],
@@ -176,27 +175,18 @@ cc_library(
         "stderr_reporter.h",
     ],
     copts = tflite_copts() + TFLITE_DEFAULT_COPTS,
-    linkopts = [
-    ] + select({
-        "//tensorflow:android": [
-            "-llog",
-        ],
-        "//conditions:default": [
-        ],
-    }),
     deps = [
         ":arena_planner",
         ":graph_info",
         ":memory_planner",
+        ":minimal_logging",
         ":schema_fbs_version",
         ":simple_memory_arena",
         ":string",
         ":util",
         "//tensorflow/lite/c:c_api_internal",
-        "//tensorflow/lite/core/api:api",
-        "//tensorflow/lite/kernels:eigen_support",
-        "//tensorflow/lite/kernels:gemm_support",
-        "//tensorflow/lite/nnapi:nnapi_lib",
+        "//tensorflow/lite/core/api",
+        "//tensorflow/lite/nnapi:nnapi_implementation",
         "//tensorflow/lite/profiling:profiler",
         "//tensorflow/lite/schema:schema_fbs",
     ] + select({
@@ -222,6 +212,9 @@ cc_test(
     name = "string_util_test",
     size = "small",
     srcs = ["string_util_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",  # TODO(b/117786830)
+    ],
     deps = [
         ":framework",
         ":string_util",
@@ -236,10 +229,12 @@ cc_test(
     name = "interpreter_test",
     size = "small",
     srcs = ["interpreter_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",  # TODO(b/117786830)
+    ],
     deps = [
         ":framework",
         ":string_util",
-        "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/core/api",
         "//tensorflow/lite/kernels:builtin_ops",
         "//tensorflow/lite/kernels:kernel_util",
@@ -255,6 +250,9 @@ cc_test(
     name = "graph_info_test",
     size = "small",
     srcs = ["graph_info_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",  # TODO(b/117786830)
+    ],
     deps = [
         ":framework",
         "//tensorflow/lite/testing:util",
@@ -267,6 +265,9 @@ cc_test(
     name = "simple_memory_arena_test",
     size = "small",
     srcs = ["simple_memory_arena_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",  # TODO(b/117786830)
+    ],
     deps = [
         ":simple_memory_arena",
         "//tensorflow/lite/testing:util",
@@ -287,9 +288,11 @@ cc_test(
         "testdata/test_model.bin",
         "testdata/test_model_broken.bin",
     ],
+    tags = [
+        "tflite_not_portable",
+    ],
     deps = [
         ":framework",
-        "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/core/api",
         "//tensorflow/lite/kernels:builtin_ops",
         "//tensorflow/lite/testing:util",
@@ -326,6 +329,9 @@ cc_test(
     name = "mutable_op_resolver_test",
     size = "small",
     srcs = ["mutable_op_resolver_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",  # TODO(b/117786830)
+    ],
     deps = [
         ":framework",
         "//tensorflow/lite/testing:util",
@@ -347,9 +353,75 @@ cc_test(
     name = "util_test",
     size = "small",
     srcs = ["util_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",  # TODO(b/117786830)
+    ],
     deps = [
         ":util",
         "//tensorflow/lite/c:c_api_internal",
         "@com_google_googletest//:gtest",
     ],
 )
+
+cc_library(
+    name = "minimal_logging",
+    srcs = [
+        "minimal_logging.cc",
+    ] + select({
+        "//tensorflow:android": [
+            "minimal_logging_android.cc",
+        ],
+        "//tensorflow:ios": [
+            "minimal_logging_ios.cc",
+        ],
+        "//conditions:default": [
+            "minimal_logging_default.cc",
+        ],
+    }),
+    hdrs = ["minimal_logging.h"],
+    copts = TFLITE_DEFAULT_COPTS + tflite_copts(),
+    linkopts = select({
+        "//tensorflow:android": ["-llog"],
+        "//conditions:default": [],
+    }),
+    visibility = ["//visibility:private"],
+)
+
+cc_test(
+    name = "minimal_logging_test",
+    size = "small",
+    srcs = ["minimal_logging_test.cc"],
+    tags = [
+        "tflite_not_portable_ios",  # TODO(b/117786830)
+    ],
+    deps = [
+        ":minimal_logging",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+# Shared lib target for convenience, pulls in the core runtime and builtin ops.
+# Note: This target is not yet finalized, and the exact set of exported (C/C++)
+# APIs is subject to change.
+tflite_cc_shared_object(
+    name = "libtensorflowlite.so",
+    linkopts = select({
+        "//tensorflow:darwin": [
+            "-Wl,-exported_symbols_list,$(location //tensorflow/lite:tflite_exported_symbols.lds)",
+            "-Wl,-install_name,@rpath/libtensorflowlite.so",
+        ],
+        "//tensorflow:windows": [],
+        "//conditions:default": [
+            "-z defs",
+            "-Wl,--version-script,$(location //tensorflow/lite:tflite_version_script.lds)",
+        ],
+    }),
+    deps = [
+        ":framework",
+        ":tflite_exported_symbols.lds",
+        ":tflite_version_script.lds",
+        "//tensorflow/lite/kernels:builtin_ops",
+    ],
+)
+
+tflite_portable_test_suite()
diff --git a/tensorflow/lite/arena_planner.cc b/tensorflow/lite/arena_planner.cc
index 8200b6adaa1c6eed64ca8963c7d0d422e573ffb8..8a5ef11312877893a9d8b75e1957c18287741e36 100644
--- a/tensorflow/lite/arena_planner.cc
+++ b/tensorflow/lite/arena_planner.cc
@@ -55,12 +55,17 @@ TfLiteStatus ArenaPlanner::ResetAllocations() {
   TF_LITE_ENSURE_STATUS(persistent_arena_.Clear());
   allocs_.clear();
   allocs_.resize(graph_info_->num_tensors());
+  // Note that we only clear the alloc_queue_ when re-planning allocations, as
+  // it should only change when the graph topology itself changes.
   return kTfLiteOk;
 }
 
 TfLiteStatus ArenaPlanner::PlanAllocations() {
   // Invalidate any existing data.
   TF_LITE_ENSURE_STATUS(ResetAllocations());
+  // The alloc_queue_ is specific to the graph topology, and will be
+  // completely reconstructed from graph data here.
+  alloc_queue_.clear();
 
   // Keeps track of references to each tensor.
   std::vector<int> refcounts(graph_info_->num_tensors(), 0);
diff --git a/tensorflow/lite/arena_planner_test.cc b/tensorflow/lite/arena_planner_test.cc
index 479f25cafef5c47eed3226717eae2af7918549c6..d02d8b34c0649e54abcede3e5035199c277b576d 100644
--- a/tensorflow/lite/arena_planner_test.cc
+++ b/tensorflow/lite/arena_planner_test.cc
@@ -108,6 +108,14 @@ class TestGraph {
     variables_ = variables;
   }
 
+  void Swap(TestGraph* other) {
+    std::swap(nodes_, other->nodes_);
+    std::swap(tensors_, other->tensors_);
+    std::swap(inputs_, other->inputs_);
+    std::swap(outputs_, other->outputs_);
+    std::swap(variables_, other->variables_);
+  }
+
  private:
   std::vector<TfLiteNode> nodes_;
   std::vector<TfLiteTensor> tensors_;
@@ -163,6 +171,11 @@ class ArenaPlannerTest : public ::testing::Test {
     CHECK(planner_->PlanAllocations() == kTfLiteOk);
   }
 
+  void SwapGraph(TestGraph* graph) {
+    graph_->Swap(graph);
+    CHECK(planner_->PlanAllocations() == kTfLiteOk);
+  }
+
   void Execute(int start, int end) {
     CHECK(planner_->ExecuteAllocations(start, end) == kTfLiteOk);
   }
@@ -493,6 +506,34 @@ TEST_F(ArenaPlannerTest, LargerGraphAndStepwiseAllocation) {
   EXPECT_EQ(GetOffset(10), 0);
 }
 
+TEST_F(ArenaPlannerTest, ModifiedGraph) {
+  TestGraph graph({0, 1},
+                  {
+                      /* in, out, tmp */
+                      {{0, 1}, {2}, {}},     // First op
+                      {{2, 0}, {4, 5}, {}},  // Second op
+                      {{4, 5}, {3}, {}}      // Third op
+                  },
+                  {3});
+  SetGraph(&graph, /*preserve_inputs=*/true);
+  Execute(0, 10);
+
+  // Now update the graph data used by the existing allocator. It should behave
+  // as if it had been recreated with the new graph.
+  TestGraph pruned_graph({0, 1},
+                         {
+                             /* in, out, tmp */
+                             {{0, 1}, {3}, {}},  // First op
+                         },
+                         {3});
+  SwapGraph(&pruned_graph);
+  Execute(0, 10);
+
+  EXPECT_EQ(GetOffset(0), 0);
+  EXPECT_EQ(GetOffset(1), GetOffsetAfter(0));
+  EXPECT_EQ(GetOffset(3), GetOffsetAfter(1));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/build_def.bzl b/tensorflow/lite/build_def.bzl
index c17eddf47bc86c9537364117c302df38e390c8da..4fc3d2e2380375e7714e7114bbf882c7e6f1e75d 100644
--- a/tensorflow/lite/build_def.bzl
+++ b/tensorflow/lite/build_def.bzl
@@ -2,6 +2,7 @@
 
 load(
     "//tensorflow:tensorflow.bzl",
+    "tf_binary_additional_srcs",
     "tf_cc_shared_object",
     "tf_cc_test",
 )
@@ -157,7 +158,7 @@ def tf_to_tflite(name, src, options, out):
     """
 
     toco_cmdline = " ".join([
-        "//tensorflow/lite/toco:toco",
+        "$(location //tensorflow/lite/toco:toco)",
         "--input_format=TENSORFLOW_GRAPHDEF",
         "--output_format=TFLITE",
         ("--input_file=$(location %s)" % src),
@@ -168,7 +169,7 @@ def tf_to_tflite(name, src, options, out):
         srcs = [src],
         outs = [out],
         cmd = toco_cmdline,
-        tools = ["//tensorflow/lite/toco:toco"],
+        tools = ["//tensorflow/lite/toco:toco"] + tf_binary_additional_srcs(),
     )
 
 def tflite_to_json(name, src, out):
@@ -225,17 +226,22 @@ def generated_test_models():
     return [
         "abs",
         "add",
+        "add_n",
         "arg_min_max",
         "avg_pool",
         "batch_to_space_nd",
+        "ceil",
         "concat",
         "constant",
         "control_dep",
         "conv",
+        "conv2d_transpose",
         "conv_with_shared_weights",
         "conv_to_depthwiseconv_with_shared_weights",
+        "cos",
         "depthwiseconv",
         "div",
+        "elu",
         "equal",
         "exp",
         "expand_dims",
@@ -246,12 +252,14 @@ def generated_test_models():
         "fully_connected",
         "fused_batch_norm",
         "gather",
+        "gather_nd",
         "gather_with_constant",
         "global_batch_norm",
         "greater",
         "greater_equal",
         "sum",
         "l2norm",
+        "l2norm_shared_epsilon",
         "l2_pool",
         "leaky_relu",
         "less",
@@ -279,6 +287,7 @@ def generated_test_models():
         "prelu",
         "pow",
         "range",
+        "rank",
         "reduce_any",
         "reduce_max",
         "reduce_min",
@@ -288,6 +297,8 @@ def generated_test_models():
         "relu6",
         "reshape",
         "resize_bilinear",
+        "resolve_constant_strided_slice",
+        "reverse_v2",
         "rsqrt",
         "shape",
         "sigmoid",
@@ -305,12 +316,12 @@ def generated_test_models():
         "squeeze",
         "strided_slice",
         "strided_slice_1d_exhaustive",
-        "strided_slice_buggy",
         "sub",
         "tile",
         "topk",
         "transpose",
         "transpose_conv",
+        "unique",
         "unpack",
         "unroll_batch_matmul",
         "where",
@@ -324,6 +335,7 @@ def generated_test_models_failing(conversion_mode):
     if conversion_mode == "toco-flex":
         return [
             "lstm",  # TODO(b/117510976): Restore when lstm flex conversion works.
+            "unroll_batch_matmul",  # TODO(b/123030774): Fails in 1.13 tests.
         ]
 
     return []
diff --git a/tensorflow/lite/builtin_ops.h b/tensorflow/lite/builtin_ops.h
index f97d3ac4bf0b27cdd9b1f5ab7258a12036c29179..e7fb59282ce470b27ebd5f58550d537753fda374 100644
--- a/tensorflow/lite/builtin_ops.h
+++ b/tensorflow/lite/builtin_ops.h
@@ -128,6 +128,15 @@ typedef enum {
   kTfLiteBuiltinMirrorPad = 100,
   kTfLiteBuiltinAbs = 101,
   kTfLiteBuiltinSplitV = 102,
+  kTfLiteBuiltinUnique = 103,
+  kTfLiteBuiltinCeil = 104,
+  kTfLiteBuiltinReverseV2 = 105,
+  kTfLiteBuiltinAddN = 106,
+  kTfLiteBuiltinGatherNd = 107,
+  kTfLiteBuiltinCos = 108,
+  kTfLiteBuiltinWhere = 109,
+  kTfLiteBuiltinRank = 110,
+  kTfLiteBuiltinElu = 111,
 } TfLiteBuiltinOperator;
 
 #ifdef __cplusplus
diff --git a/tensorflow/lite/c/BUILD b/tensorflow/lite/c/BUILD
index 91c04a5f1fb5bb1a15bd1da074a1276a3d8e7793..661b648550c9a3fc64b8bc2fb97a2f3b16e7aac1 100644
--- a/tensorflow/lite/c/BUILD
+++ b/tensorflow/lite/c/BUILD
@@ -12,7 +12,7 @@ cc_library(
         "c_api_internal.h",
     ],
     visibility = [
-        "//tensorflow/contrib/lite:__subpackages__",
+        "//learning/brain/mobile/kernel_test:__subpackages__",
         "//tensorflow/lite:__subpackages__",
     ],
 )
diff --git a/tensorflow/lite/c/builtin_op_data.h b/tensorflow/lite/c/builtin_op_data.h
index 6a5a027a9dc94bb2a11081276d269a7007c86cad..40fea17fefdbc06bb01d8ae950cb2233f5c16cc3 100644
--- a/tensorflow/lite/c/builtin_op_data.h
+++ b/tensorflow/lite/c/builtin_op_data.h
@@ -25,6 +25,11 @@ extern "C" {
 
 // TODO(aselle): Consider using "if this then that" for testing.
 
+// Useful placeholder to put in otherwise empty structs to avoid size warnings.
+typedef struct {
+  char dummy;
+} EmptyStructPlaceholder;
+
 // IMPORTANT: All new members of structs must be added at the end to ensure
 // backwards compatibility.
 
@@ -152,9 +157,11 @@ typedef struct {
 } TfLiteAddParams;
 
 typedef struct {
+  EmptyStructPlaceholder placeholder;
 } TfLiteSpaceToBatchNDParams;
 
 typedef struct {
+  EmptyStructPlaceholder placeholder;
 } TfLiteBatchToSpaceNDParams;
 
 typedef struct {
@@ -207,13 +214,18 @@ typedef struct {
 } TfLiteUnidirectionalSequenceLSTMParams;
 
 typedef struct {
-  // Parameters for the LSTM kernel.
+  // Parameters supported by version 1:
+  // Parameters inherited for the LSTM kernel.
   TfLiteFusedActivation activation;
   float cell_clip;
   float proj_clip;
 
   // If true, store the outputs of both directions in the first output.
   bool merge_outputs;
+
+  // Parameters supported by version 2:
+  // If set to true then the first dimension is time, otherwise batch.
+  bool time_major;
 } TfLiteBidirectionalSequenceLSTMParams;
 
 typedef struct {
@@ -225,9 +237,11 @@ typedef struct {
 } TfLiteResizeNearestNeighborParams;
 
 typedef struct {
+  EmptyStructPlaceholder placeholder;
 } TfLitePadParams;
 
 typedef struct {
+  EmptyStructPlaceholder placeholder;
 } TfLitePadV2Params;
 
 typedef struct {
@@ -267,6 +281,7 @@ typedef struct {
 } TfLiteGatherParams;
 
 typedef struct {
+  EmptyStructPlaceholder placeholder;
 } TfLiteTransposeParams;
 
 typedef struct {
@@ -318,6 +333,9 @@ typedef struct {
   TfLiteType out_type;
 } TfLiteShapeParams;
 
+typedef struct {
+} TfLiteRankParams;
+
 typedef struct {
   // Parameters supported by version 1:
   float min;
@@ -346,6 +364,10 @@ typedef struct {
   float alpha;
 } TfLiteLeakyReluParams;
 
+typedef struct {
+  TfLiteType index_out_type;
+} TfLiteUniqueParams;
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/tensorflow/lite/c/builtin_op_data_test.cc b/tensorflow/lite/c/builtin_op_data_test.cc
index 4ce7c481e1c26e6fcfdaa680e9ca666b82968d53..4967183dd56df64b75c719869d16d052ae976081 100644
--- a/tensorflow/lite/c/builtin_op_data_test.cc
+++ b/tensorflow/lite/c/builtin_op_data_test.cc
@@ -71,6 +71,7 @@ TEST(IntArray, CanCompileStructs) {
   TfLiteTransposeConvParams transpose_conv_params;
   TfLiteSparseToDenseParams sparse_to_dense_params;
   TfLiteShapeParams shape_params;
+  TfLiteRankParams rank_params;
   TfLiteFakeQuantParams fake_quant_params;
   TfLitePackParams pack_params;
   TfLiteOneHotParams one_hot_params;
diff --git a/tensorflow/lite/c/c_api_internal.c b/tensorflow/lite/c/c_api_internal.c
index 2923dbad4ef285c497ca2c84d86168954fe8ec99..f20ee23bd81eb87c25a1a7f61cce59df7ae6678e 100644
--- a/tensorflow/lite/c/c_api_internal.c
+++ b/tensorflow/lite/c/c_api_internal.c
@@ -70,6 +70,20 @@ TfLiteIntArray* TfLiteIntArrayCopy(const TfLiteIntArray* src) {
 
 void TfLiteIntArrayFree(TfLiteIntArray* a) { free(a); }
 
+int TfLiteFloatArrayGetSizeInBytes(int size) {
+  static TfLiteFloatArray dummy;
+  return sizeof(dummy) + sizeof(dummy.data[0]) * size;
+}
+
+TfLiteFloatArray* TfLiteFloatArrayCreate(int size) {
+  TfLiteFloatArray* ret =
+      (TfLiteFloatArray*)malloc(TfLiteFloatArrayGetSizeInBytes(size));
+  ret->size = size;
+  return ret;
+}
+
+void TfLiteFloatArrayFree(TfLiteFloatArray* a) { free(a); }
+
 void TfLiteTensorDataFree(TfLiteTensor* t) {
   if (t->allocation_type == kTfLiteDynamic && t->data.raw) {
     free(t->data.raw);
@@ -77,10 +91,30 @@ void TfLiteTensorDataFree(TfLiteTensor* t) {
   t->data.raw = NULL;
 }
 
+void TfLiteQuantizationFree(TfLiteQuantization* quantization) {
+  if (quantization->type == kTfLiteAffineQuantization) {
+    TfLiteAffineQuantization* q_params =
+        (TfLiteAffineQuantization*)(quantization->params);
+    if (q_params->scale) {
+      TfLiteFloatArrayFree(q_params->scale);
+      q_params->scale = NULL;
+    }
+    if (q_params->zero_point) {
+      TfLiteIntArrayFree(q_params->zero_point);
+      q_params->zero_point = NULL;
+    }
+    free(q_params);
+  }
+  quantization->params = NULL;
+  quantization->type = kTfLiteNoQuantization;
+}
+
 void TfLiteTensorFree(TfLiteTensor* t) {
   TfLiteTensorDataFree(t);
   if (t->dims) TfLiteIntArrayFree(t->dims);
   t->dims = NULL;
+
+  TfLiteQuantizationFree(&t->quantization);
 }
 
 void TfLiteTensorReset(TfLiteType type, const char* name, TfLiteIntArray* dims,
@@ -98,6 +132,9 @@ void TfLiteTensorReset(TfLiteType type, const char* name, TfLiteIntArray* dims,
   tensor->allocation_type = allocation_type;
   tensor->allocation = allocation;
   tensor->is_variable = is_variable;
+
+  tensor->quantization.type = kTfLiteNoQuantization;
+  tensor->quantization.params = NULL;
 }
 
 void TfLiteTensorRealloc(size_t num_bytes, TfLiteTensor* tensor) {
diff --git a/tensorflow/lite/c/c_api_internal.h b/tensorflow/lite/c/c_api_internal.h
index 1cd84eff5c436abb781c74d1ac287b709558133f..83e2be690762be3e2cacf02ea8311b76dc1731c4 100644
--- a/tensorflow/lite/c/c_api_internal.h
+++ b/tensorflow/lite/c/c_api_internal.h
@@ -98,8 +98,32 @@ int TfLiteIntArrayEqualsArray(TfLiteIntArray* a, int b_size, int b_data[]);
 // You are expected to free memory with TfLiteIntArrayFree
 TfLiteIntArray* TfLiteIntArrayCopy(const TfLiteIntArray* src);
 
-// Free memory of array `v`.
-void TfLiteIntArrayFree(TfLiteIntArray* v);
+// Free memory of array `a`.
+void TfLiteIntArrayFree(TfLiteIntArray* a);
+
+// Fixed size list of floats. Used for per-channel quantization.
+typedef struct {
+  int size;
+// gcc 6.1+ have a bug where flexible members aren't properly handled
+// https://github.com/google/re2/commit/b94b7cd42e9f02673cd748c1ac1d16db4052514c
+#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ == 6 && \
+    __GNUC_MINOR__ >= 1
+  float data[0];
+#else
+  float data[];
+#endif
+} TfLiteFloatArray;
+
+// Given the size (number of elements) in a TfLiteFloatArray, calculate its size
+// in bytes.
+int TfLiteFloatArrayGetSizeInBytes(int size);
+
+// Create a array of a given `size` (uninitialized entries).
+// This returns a pointer, that you must free using TfLiteFloatArrayFree().
+TfLiteFloatArray* TfLiteFloatArrayCreate(int size);
+
+// Free memory of array `a`.
+void TfLiteFloatArrayFree(TfLiteFloatArray* a);
 
 // Since we must not depend on any libraries, define a minimal subset of
 // error macros while avoiding names that have pre-conceived meanings like
@@ -185,14 +209,48 @@ typedef enum {
 // Return the name of a given type, for error reporting purposes.
 const char* TfLiteTypeGetName(TfLiteType type);
 
+// SupportedQuantizationTypes.
+typedef enum {
+  // No quantization.
+  kTfLiteNoQuantization = 0,
+  // Affine quantization (with support for per-channel quantization).
+  // Corresponds to TfLiteAffineQuantization.
+  kTfLiteAffineQuantization = 1,
+} TfLiteQuantizationType;
+
+// Structure specifying the quantization used by the tensor, if-any.
+typedef struct {
+  // The type of quantization held by params.
+  TfLiteQuantizationType type;
+  // Holds a reference to one of the quantization param structures specified
+  // below.
+  void* params;
+} TfLiteQuantization;
+
+// Legacy. Will be deprecated in favor of TfLiteAffineQuantization.
+// If per-layer quantization is specified this field will still be populated in
+// addition to TfLiteAffineQuantization.
 // Parameters for asymmetric quantization. Quantized values can be converted
 // back to float using:
-//    real_value = scale * (quantized_value - zero_point);
+//     real_value = scale * (quantized_value - zero_point)
 typedef struct {
   float scale;
   int32_t zero_point;
 } TfLiteQuantizationParams;
 
+// Parameters for asymmetric quantization across a dimension (i.e per output
+// channel quantization).
+// quantized_dimension specifies which dimension the scales and zero_points
+// correspond to.
+// For a particular value in quantized_dimension, quantized values can be
+// converted back to float using:
+//     real_value = scale * (quantized_value - zero_point)
+typedef struct {
+  TfLiteFloatArray* scale;
+  TfLiteIntArray* zero_point;
+  int32_t quantized_dimension;
+} TfLiteAffineQuantization;
+
 // A union of pointers that points to memory for a given tensor.
 typedef union {
   int* i32;
@@ -274,12 +332,18 @@ typedef struct {
 
   // True if the tensor is a variable.
   bool is_variable;
+
+  // Quantization information. Replaces params field above.
+  TfLiteQuantization quantization;
 } TfLiteTensor;
 
-// Free data memory of tensor `t`;
+// Free data memory of tensor `t`.
 void TfLiteTensorDataFree(TfLiteTensor* t);
 
-// Free memory of tensor `t`;
+// Free quantization data.
+void TfLiteQuantizationFree(TfLiteQuantization* quantization);
+
+// Free memory of tensor `t`.
 void TfLiteTensorFree(TfLiteTensor* t);
 
 // Set all of a tensor's fields (and free any previously allocated data).
@@ -397,6 +461,9 @@ typedef struct TfLiteContext {
   // default: false.
   // WARNING: This is an experimental API and subject to change.
   bool allow_fp32_relax_to_fp16;
+
+  // Pointer to the op-level profiler, if set; nullptr otherwise.
+  void* profiler;
 } TfLiteContext;
 
 typedef struct _TfLiteRegistration {
diff --git a/tensorflow/lite/c/c_api_internal_test.cc b/tensorflow/lite/c/c_api_internal_test.cc
index acf0dfc5be8e233b642ccea42f72cbf6af2d4c5d..d01cf63a3e059d05a300accc5a26dd4d411f326a 100644
--- a/tensorflow/lite/c/c_api_internal_test.cc
+++ b/tensorflow/lite/c/c_api_internal_test.cc
@@ -65,6 +65,13 @@ TEST(IntArray, TestIntArrayEqual) {
   TfLiteIntArrayFree(d);
 }
 
+TEST(FloatArray, TestFloatArrayCreate) {
+  TfLiteFloatArray* a = TfLiteFloatArrayCreate(0);
+  TfLiteFloatArray* b = TfLiteFloatArrayCreate(3);
+  TfLiteFloatArrayFree(a);
+  TfLiteFloatArrayFree(b);
+}
+
 TEST(Types, TestTypeNames) {
   auto type_name = [](TfLiteType t) {
     return std::string(TfLiteTypeGetName(t));
@@ -81,6 +88,20 @@ TEST(Types, TestTypeNames) {
   EXPECT_EQ(type_name(kTfLiteString), "STRING");
 }
 
+TEST(Quantization, TestQuantizationFree) {
+  TfLiteTensor t;
+  // Set these values, otherwise TfLiteTensorFree has uninitialized values.
+  t.allocation_type = kTfLiteArenaRw;
+  t.dims = nullptr;
+  t.quantization.type = kTfLiteAffineQuantization;
+  auto* params = reinterpret_cast<TfLiteAffineQuantization*>(
+      malloc(sizeof(TfLiteAffineQuantization)));
+  params->scale = TfLiteFloatArrayCreate(3);
+  params->zero_point = TfLiteIntArrayCreate(3);
+  t.quantization.params = reinterpret_cast<void*>(params);
+  TfLiteTensorFree(&t);
+}
+
 }  // namespace tflite
 
 int main(int argc, char** argv) {
diff --git a/tensorflow/lite/context_util.h b/tensorflow/lite/context_util.h
index 68b91ea0b93e602c20d1db3284a523e9f55dfd5b..2f846cc259e34b1f750ba0787dffa93db597cbe0 100644
--- a/tensorflow/lite/context_util.h
+++ b/tensorflow/lite/context_util.h
@@ -38,6 +38,7 @@ class TfLiteIntArrayView {
   const_iterator begin() const { return int_array_->data; }
   const_iterator end() const { return &int_array_->data[int_array_->size]; }
   size_t size() const { return end() - begin(); }
+  int operator[](size_t pos) const { return int_array_->data[pos]; }
 
  private:
   const TfLiteIntArray* int_array_;
diff --git a/tensorflow/lite/core/api/BUILD b/tensorflow/lite/core/api/BUILD
index 6a43b0322d17041a5ae4a0527376d1465a539b1d..db6b4a2d18ecd894fa3b8a0bf646ca9f8c6b6511 100644
--- a/tensorflow/lite/core/api/BUILD
+++ b/tensorflow/lite/core/api/BUILD
@@ -51,6 +51,7 @@ cc_test(
     srcs = ["flatbuffer_conversions_test.cc"],
     deps = [
         ":api",
+        "//tensorflow/lite:string",
         "//tensorflow/lite/c:c_api_internal",
         "@com_google_googletest//:gtest",
     ],
diff --git a/tensorflow/lite/core/api/flatbuffer_conversions.cc b/tensorflow/lite/core/api/flatbuffer_conversions.cc
index c00a0a3a546b1b2b0167663b5f00c5e25e261f15..7ca0fc2deb4d4fb713e1eb98c572dfb3542c26fa 100644
--- a/tensorflow/lite/core/api/flatbuffer_conversions.cc
+++ b/tensorflow/lite/core/api/flatbuffer_conversions.cc
@@ -18,6 +18,8 @@ limitations under the License.
 #include <cstdlib>
 
 #include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/schema/schema_generated.h"
 
 namespace tflite {
 
@@ -26,22 +28,27 @@ namespace {
 // Copies the contents from the flatbuffer int vector `flatbuffer` into the
 // int array `buffer`. `flat_vector` and `buffer` represent the same
 // configuration operation for a given operation.
-void FlatBufferIntVectorToArray(int max_size_of_buffer,
-                                const flatbuffers::Vector<int32_t>* flat_vector,
-                                int* buffer, ErrorReporter* error_reporter) {
+TfLiteStatus FlatBufferIntVectorToArray(
+    int max_size_of_buffer, const flatbuffers::Vector<int32_t>* flat_vector,
+    int* buffer, ErrorReporter* error_reporter, const char* op_name) {
   if (!flat_vector) {
-    error_reporter->Report("Input array not provided for operation.\n");
+    error_reporter->Report("Input array not provided for operation '%s'.\n",
+                           op_name);
+    return kTfLiteError;
   } else {
     int num_dimensions = flat_vector->Length();
     if (num_dimensions > max_size_of_buffer / sizeof(int)) {
       error_reporter->Report(
-          "Found too many dimensions in the operation's input array.\n");
+          "Found too many dimensions in the input array of operation '%s'.\n",
+          op_name);
+      return kTfLiteError;
     } else {
       for (int i = 0; i < num_dimensions; ++i) {
         buffer[i] = flat_vector->Get(i);
       }
     }
   }
+  return kTfLiteOk;
 }
 
 }  // namespace
@@ -417,6 +424,7 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->cell_clip = bidi_lstm_params->cell_clip();
         params->proj_clip = bidi_lstm_params->proj_clip();
         params->merge_outputs = bidi_lstm_params->merge_outputs();
+        params->time_major = bidi_lstm_params->time_major();
       }
       *builtin_data = reinterpret_cast<void*>(params);
       break;
@@ -449,8 +457,9 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
       auto* params = allocator->AllocatePOD<TfLiteReshapeParams>();
       if (auto* schema_params = op->builtin_options_as_ReshapeOptions()) {
         auto* new_shape = schema_params->new_shape();
-        FlatBufferIntVectorToArray(sizeof(params->shape), new_shape,
-                                   params->shape, error_reporter);
+        TF_LITE_ENSURE_STATUS(FlatBufferIntVectorToArray(
+            sizeof(params->shape), new_shape, params->shape, error_reporter,
+            "reshape"));
         params->num_dimensions = new_shape->Length();
       }
       *builtin_data = reinterpret_cast<void*>(params);
@@ -518,8 +527,9 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
       auto* params = allocator->AllocatePOD<TfLiteSqueezeParams>();
       if (auto* schema_params = op->builtin_options_as_SqueezeOptions()) {
         const auto& squeeze_dims = schema_params->squeeze_dims();
-        FlatBufferIntVectorToArray(sizeof(params->squeeze_dims), squeeze_dims,
-                                   params->squeeze_dims, error_reporter);
+        TF_LITE_ENSURE_STATUS(FlatBufferIntVectorToArray(
+            sizeof(params->squeeze_dims), squeeze_dims, params->squeeze_dims,
+            error_reporter, "squeeze"));
         params->num_squeeze_dims = squeeze_dims->Length();
       }
       *builtin_data = reinterpret_cast<void*>(params);
@@ -650,6 +660,18 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
       *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
+    case BuiltinOperator_UNIQUE: {
+      TfLiteUniqueParams* params = allocator->AllocatePOD<TfLiteUniqueParams>();
+      auto* unique_params = op->builtin_options_as_UniqueOptions();
+      if (unique_params != nullptr) {
+        params->index_out_type =
+            unique_params->idx_out_type() == tflite::TensorType_INT64
+                ? TfLiteType::kTfLiteInt64
+                : TfLiteType::kTfLiteInt32;
+      }
+      *builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
 
     // Below are the ops with no builtin_data strcture.
     case BuiltinOperator_ABS:
@@ -658,12 +680,15 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
     // ok for now, since there is no call implementation either.
     case BuiltinOperator_CALL:
     case BuiltinOperator_CONCAT_EMBEDDINGS:
+    case BuiltinOperator_COS:
     case BuiltinOperator_CUSTOM:
     case BuiltinOperator_DEQUANTIZE:
+    case BuiltinOperator_ELU:
     case BuiltinOperator_EMBEDDING_LOOKUP:
     case BuiltinOperator_EQUAL:
     case BuiltinOperator_EXP:
     case BuiltinOperator_EXPAND_DIMS:
+    case BuiltinOperator_CEIL:
     case BuiltinOperator_FLOOR:
     case BuiltinOperator_GREATER:
     case BuiltinOperator_GREATER_EQUAL:
@@ -703,6 +728,11 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
     case BuiltinOperator_FLOOR_MOD:
     case BuiltinOperator_RANGE:
     case BuiltinOperator_SQUARED_DIFFERENCE:
+    case BuiltinOperator_REVERSE_V2:
+    case BuiltinOperator_ADD_N:
+    case BuiltinOperator_GATHER_ND:
+    case BuiltinOperator_WHERE:
+    case BuiltinOperator_RANK:
       break;
   }
   return kTfLiteOk;
diff --git a/tensorflow/lite/core/api/flatbuffer_conversions_test.cc b/tensorflow/lite/core/api/flatbuffer_conversions_test.cc
index 4d1d1b21fda106b3196ff43421996f45ab83af4f..4a5de48302c1e840c524335ee549c74a162e107e 100644
--- a/tensorflow/lite/core/api/flatbuffer_conversions_test.cc
+++ b/tensorflow/lite/core/api/flatbuffer_conversions_test.cc
@@ -17,8 +17,10 @@ limitations under the License.
 
 #include <cstring>
 
+#include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/string.h"
 
 namespace tflite {
 namespace {
@@ -33,6 +35,8 @@ class MockErrorReporter : public ErrorReporter {
   char* GetBuffer() { return buffer_; }
   int GetBufferSize() { return buffer_size_; }
 
+  string GetAsString() const { return string(buffer_, buffer_size_); }
+
  private:
   static constexpr int kBufferSize = 256;
   char buffer_[kBufferSize];
@@ -60,25 +64,56 @@ class MockDataAllocator : public BuiltinDataAllocator {
 
 }  // namespace
 
-TEST(FlatbufferConversions, TestParseOpDataConv) {
-  MockErrorReporter mock_reporter;
-  ErrorReporter* reporter = &mock_reporter;
-  MockDataAllocator mock_allocator;
-
-  flatbuffers::FlatBufferBuilder builder;
-  flatbuffers::Offset<void> conv_options =
-      CreateConv2DOptions(builder, Padding_SAME, 1, 2,
-                          ActivationFunctionType_RELU, 3, 4)
-          .Union();
-  flatbuffers::Offset<Operator> conv_offset = CreateOperatorDirect(
-      builder, 0, nullptr, nullptr, BuiltinOptions_Conv2DOptions, conv_options,
-      nullptr, CustomOptionsFormat_FLEXBUFFERS, nullptr);
-  builder.Finish(conv_offset);
-  void* conv_pointer = builder.GetBufferPointer();
-  const Operator* conv_op = flatbuffers::GetRoot<Operator>(conv_pointer);
+class FlatbufferConversionsTest : public ::testing::Test {
+ public:
+  const Operator* BuildTestOperator(BuiltinOptions op_type,
+                                    flatbuffers::Offset<void> options) {
+    flatbuffers::Offset<Operator> offset =
+        CreateOperatorDirect(builder_, 0, nullptr, nullptr, op_type, options,
+                             nullptr, CustomOptionsFormat_FLEXBUFFERS, nullptr);
+    builder_.Finish(offset);
+    void* pointer = builder_.GetBufferPointer();
+    return flatbuffers::GetRoot<Operator>(pointer);
+  }
+
+ protected:
+  MockErrorReporter mock_reporter_;
+  MockDataAllocator mock_allocator_;
+  flatbuffers::FlatBufferBuilder builder_;
+};
+
+TEST_F(FlatbufferConversionsTest, ParseBadSqueeze) {
+  const Operator* op = BuildTestOperator(
+      BuiltinOptions_SqueezeOptions, CreateSqueezeOptions(builder_).Union());
+  void* output_data = nullptr;
+  EXPECT_NE(kTfLiteOk, ParseOpData(op, BuiltinOperator_SQUEEZE, &mock_reporter_,
+                                   &mock_allocator_, &output_data));
+  EXPECT_THAT(mock_reporter_.GetAsString(),
+              ::testing::ContainsRegex(
+                  "Input array not provided for operation 'squeeze'"));
+}
+
+TEST_F(FlatbufferConversionsTest, ParseBadReshape) {
+  const Operator* op = BuildTestOperator(
+      BuiltinOptions_ReshapeOptions, CreateSqueezeOptions(builder_).Union());
+  void* output_data = nullptr;
+  EXPECT_NE(kTfLiteOk, ParseOpData(op, BuiltinOperator_RESHAPE, &mock_reporter_,
+                                   &mock_allocator_, &output_data));
+  EXPECT_THAT(mock_reporter_.GetAsString(),
+              ::testing::ContainsRegex(
+                  "Input array not provided for operation 'reshape'"));
+}
+
+TEST_F(FlatbufferConversionsTest, TestParseOpDataConv) {
+  const Operator* conv_op =
+      BuildTestOperator(BuiltinOptions_Conv2DOptions,
+                        CreateConv2DOptions(builder_, Padding_SAME, 1, 2,
+                                            ActivationFunctionType_RELU, 3, 4)
+                            .Union());
   void* output_data = nullptr;
-  EXPECT_EQ(kTfLiteOk, ParseOpData(conv_op, BuiltinOperator_CONV_2D, reporter,
-                                   &mock_allocator, &output_data));
+  EXPECT_EQ(kTfLiteOk,
+            ParseOpData(conv_op, BuiltinOperator_CONV_2D, &mock_reporter_,
+                        &mock_allocator_, &output_data));
   EXPECT_NE(nullptr, output_data);
   TfLiteConvParams* params = reinterpret_cast<TfLiteConvParams*>(output_data);
   EXPECT_EQ(kTfLitePaddingSame, params->padding);
@@ -89,30 +124,20 @@ TEST(FlatbufferConversions, TestParseOpDataConv) {
   EXPECT_EQ(4, params->dilation_height_factor);
 }
 
-TEST(FlatbufferConversions, TestParseOpDataCustom) {
-  MockErrorReporter mock_reporter;
-  ErrorReporter* reporter = &mock_reporter;
-  MockDataAllocator mock_allocator;
-
-  flatbuffers::FlatBufferBuilder builder;
-  flatbuffers::Offset<void> null_options;
-  flatbuffers::Offset<Operator> custom_offset = CreateOperatorDirect(
-      builder, 0, nullptr, nullptr, BuiltinOptions_NONE, null_options, nullptr,
-      CustomOptionsFormat_FLEXBUFFERS, nullptr);
-  builder.Finish(custom_offset);
-  void* custom_pointer = builder.GetBufferPointer();
-  const Operator* custom_op = flatbuffers::GetRoot<Operator>(custom_pointer);
+TEST_F(FlatbufferConversionsTest, TestParseOpDataCustom) {
+  const Operator* custom_op =
+      BuildTestOperator(BuiltinOptions_NONE, flatbuffers::Offset<void>());
   void* output_data = nullptr;
-  EXPECT_EQ(kTfLiteOk, ParseOpData(custom_op, BuiltinOperator_CUSTOM, reporter,
-                                   &mock_allocator, &output_data));
+  EXPECT_EQ(kTfLiteOk,
+            ParseOpData(custom_op, BuiltinOperator_CUSTOM, &mock_reporter_,
+                        &mock_allocator_, &output_data));
   EXPECT_EQ(nullptr, output_data);
 }
 
-TEST(FlatbufferConversions, TestConvertTensorType) {
-  MockErrorReporter mock_reporter;
-  ErrorReporter* reporter = &mock_reporter;
+TEST_F(FlatbufferConversionsTest, TestConvertTensorType) {
   TfLiteType type;
-  EXPECT_EQ(kTfLiteOk, ConvertTensorType(TensorType_FLOAT32, &type, reporter));
+  EXPECT_EQ(kTfLiteOk,
+            ConvertTensorType(TensorType_FLOAT32, &type, &mock_reporter_));
   EXPECT_EQ(kTfLiteFloat32, type);
 }
 
diff --git a/tensorflow/lite/core/subgraph.cc b/tensorflow/lite/core/subgraph.cc
index 90361faeae3c085fd4bd73a22b64635ce4b2969e..ec6762b16c95e86fb65ec187d5e92f91eff1cbc5 100644
--- a/tensorflow/lite/core/subgraph.cc
+++ b/tensorflow/lite/core/subgraph.cc
@@ -72,6 +72,34 @@ bool HasDynamicTensor(const TfLiteContext& context,
   return HasDynamicTensorImpl(context, TfLiteIntArrayView{int_array});
 }
 
+// Gets the legacy TfLiteQuantizationParams from the current TfLiteQuantization.
+TfLiteQuantizationParams GetLegacyQuantization(
+    const TfLiteQuantization& quantization) {
+  TfLiteQuantizationParams legacy_quantization;
+  legacy_quantization.scale = 0;
+  legacy_quantization.zero_point = 0;
+
+  // If the quantization type isn't affine, return the empty
+  // legacy_quantization.
+  if (quantization.type != kTfLiteAffineQuantization) {
+    return legacy_quantization;
+  }
+
+  auto* affine_quantization =
+      reinterpret_cast<TfLiteAffineQuantization*>(quantization.params);
+  if (!affine_quantization || !affine_quantization->scale ||
+      !affine_quantization->zero_point ||
+      affine_quantization->scale->size != 1 ||
+      affine_quantization->zero_point->size != 1) {
+    return legacy_quantization;
+  }
+
+  // We know its per-layer quantization now.
+  legacy_quantization.scale = affine_quantization->scale->data[0];
+  legacy_quantization.zero_point = affine_quantization->zero_point->data[0];
+  return legacy_quantization;
+}
+
 }  // namespace
 
 // A trivial implementation of GraphInfo around the Interpreter.
@@ -126,6 +154,7 @@ Subgraph::Subgraph(ErrorReporter* error_reporter,
   context_->recommended_num_threads = -1;
   context_->GetExternalContext = GetExternalContext;
   context_->SetExternalContext = SetExternalContext;
+  context_->profiler = nullptr;
 
   // Reserve some space for the tensors to avoid excessive resizing.
   tensors_.reserve(kTensorsReservedCapacity);
@@ -362,6 +391,16 @@ TfLiteStatus Subgraph::SetVariables(std::vector<int> variables) {
   return kTfLiteOk;
 }
 
+void Subgraph::SetCancellationFunction(void* data,
+                                       bool (*check_cancelled_func)(void*)) {
+  cancellation_data_ = data;
+  check_cancelled_func_ = check_cancelled_func;
+}
+
+void Subgraph::ReserveNodes(int count) {
+  nodes_and_registration_.reserve(count);
+}
+
 TfLiteStatus Subgraph::CheckTensorIndices(const char* label, const int* indices,
                                           int length) {
   // Making sure kOptionalTensor is not re-defined to something other than -1.
@@ -375,7 +414,9 @@ TfLiteStatus Subgraph::CheckTensorIndices(const char* label, const int* indices,
       continue;
     }
     if (index < 0 || static_cast<size_t>(index) >= context_->tensors_size) {
-      ReportError("Invalid tensor index %d in %s\n", index, label);
+      ReportError(
+          "Invalid tensor index %d in %s. The subgraph has %d tensors\n", index,
+          label, context_->tensors_size);
       consistent_ = false;
       return kTfLiteError;
     }
@@ -551,7 +592,12 @@ TfLiteStatus Subgraph::ResizeInputTensor(int tensor_index,
 
   // Short-circuit the state change if the dimensions don't change, avoiding
   // unnecessary (re)allocations.
-  if (EqualArrayAndTfLiteIntArray(tensor->dims, dims.size(), dims.data())) {
+  //
+  // Note that it's required to check `tensor->data.raw != nullptr`. Otherwise
+  // the subgraph won't allocate memory for a dynamic tensor when its size
+  // is equal to the original tensor size.
+  if (tensor->data.raw != nullptr &&
+      EqualArrayAndTfLiteIntArray(tensor->dims, dims.size(), dims.data())) {
     return kTfLiteOk;
   }
 
@@ -663,15 +709,21 @@ TfLiteStatus Subgraph::Invoke() {
       TfLiteTensor* tensor = &tensors_[tensor_index];
       if (tensor->delegate && tensor->delegate != node.delegate &&
           tensor->data_is_stale) {
-        EnsureTensorDataIsReadable(tensor_index);
+        TF_LITE_ENSURE_STATUS(EnsureTensorDataIsReadable(tensor_index));
       }
     }
 
+    if (check_cancelled_func_ != nullptr &&
+        check_cancelled_func_(cancellation_data_)) {
+      ReportError("Client requested cancel during Invoke()");
+      return kTfLiteError;
+    }
+
     EnsureTensorsVectorCapacity();
     tensor_resized_since_op_invoke_ = false;
     if (OpInvoke(registration, &node) == kTfLiteError) {
-      status = ReportOpError(context_, node, registration, node_index,
-                             "failed to invoke");
+      return ReportOpError(context_, node, registration, node_index,
+                           "failed to invoke");
     }
 
     // Force execution prep for downstream ops if the latest op triggered the
@@ -766,7 +818,7 @@ TfLiteStatus Subgraph::GetNodeAndRegistration(
 
 TfLiteStatus Subgraph::SetTensorParametersReadOnly(
     int tensor_index, TfLiteType type, const char* name, const size_t rank,
-    const int* dims, TfLiteQuantizationParams quantization, const char* buffer,
+    const int* dims, TfLiteQuantization quantization, const char* buffer,
     size_t bytes, const Allocation* allocation) {
   if (state_ == kStateInvokableAndImmutable) {
     ReportError(
@@ -791,16 +843,22 @@ TfLiteStatus Subgraph::SetTensorParametersReadOnly(
       EqualArrayAndTfLiteIntArray(tensor.dims, rank, dims)) {
     // Fast path which does not invalidate the invokable property.
     TfLiteTensorDataFree(&tensor);
+    TfLiteQuantizationFree(&tensor.quantization);
     tensor.data.raw = const_cast<char*>(buffer);
     if (!tensor.dims) tensor.dims = ConvertArrayToTfLiteIntArray(rank, dims);
-    tensor.params = quantization;
+    tensor.params = GetLegacyQuantization(quantization);
+    tensor.quantization = quantization;
     tensor.allocation_type = kTfLiteMmapRo;
     tensor.allocation = allocation;
   } else {
     state_ = kStateUninvokable;
     TfLiteTensorReset(type, name, ConvertArrayToTfLiteIntArray(rank, dims),
-                      quantization, const_cast<char*>(buffer), bytes,
-                      kTfLiteMmapRo, allocation, false, &tensor);
+                      GetLegacyQuantization(quantization),
+                      const_cast<char*>(buffer), bytes, kTfLiteMmapRo,
+                      allocation, false, &tensor);
+    // TODO(suharshs): Update TfLiteTensorReset to include the new quantization
+    // if there are other required callers.
+    tensor.quantization = quantization;
   }
   return kTfLiteOk;
 }
@@ -811,7 +869,7 @@ TfLiteStatus Subgraph::SetTensorParametersReadOnly(
 // to Interpreter.
 TfLiteStatus Subgraph::SetTensorParametersReadWrite(
     int tensor_index, TfLiteType type, const char* name, const size_t rank,
-    const int* dims, TfLiteQuantizationParams quantization, bool is_variable) {
+    const int* dims, TfLiteQuantization quantization, bool is_variable) {
   if (state_ == kStateInvokableAndImmutable) {
     ReportError(
         "SetTensorParametersReadWrite is disallowed when graph is immutable.");
@@ -841,10 +899,14 @@ TfLiteStatus Subgraph::SetTensorParametersReadWrite(
     allocation_type = kTfLiteArenaRwPersistent;
   }
 
+  TfLiteTensor& tensor = context_->tensors[tensor_index];
   TfLiteTensorReset(type, name, ConvertArrayToTfLiteIntArray(rank, dims),
-                    quantization,
+                    GetLegacyQuantization(quantization),
                     /*buffer=*/nullptr, required_bytes, allocation_type,
-                    nullptr, is_variable, &context_->tensors[tensor_index]);
+                    nullptr, is_variable, &tensor);
+  // TODO(suharshs): Update TfLiteTensorReset to include the new quantization
+  // if there are other required callers.
+  tensor.quantization = quantization;
   return kTfLiteOk;
 }
 
@@ -931,6 +993,12 @@ void Subgraph::SwitchToKernelContext() {
 }
 
 TfLiteStatus Subgraph::ModifyGraphWithDelegate(TfLiteDelegate* delegate) {
+  if (state_ == kStateInvokableAndImmutable) {
+    ReportError(
+        "ModifyGraphWithDelegate is disallowed when graph is immutable.");
+    return kTfLiteError;
+  }
+
   if (!(delegate->flags & kTfLiteDelegateFlagsAllowDynamicTensors)) {
     int last_execution_plan_index_prepared;
     TF_LITE_ENSURE_OK(&context_, PrepareOpsStartingAt(
@@ -943,6 +1011,8 @@ TfLiteStatus Subgraph::ModifyGraphWithDelegate(TfLiteDelegate* delegate) {
     }
   }
 
+  const bool was_invokable_before_delegate = state_ == kStateInvokable;
+
   // TODO(aselle): Consider if it is worth storing pointers to delegates.
   // Setup additional context interface.
   SwitchToDelegateContext();
@@ -954,6 +1024,13 @@ TfLiteStatus Subgraph::ModifyGraphWithDelegate(TfLiteDelegate* delegate) {
 
   TF_LITE_ENSURE_OK(context_, status);
 
+  // If the memory planner has already been created, we need to execute
+  // planning again to account for the updated graph topology.
+  if (memory_planner_) {
+    state_ = kStateUninvokable;
+    TF_LITE_ENSURE_OK(context_, memory_planner_->PlanAllocations());
+  }
+
   if (!(delegate->flags & kTfLiteDelegateFlagsAllowDynamicTensors)) {
     // Reset the state to force tensor/op reallocation.
     state_ = kStateUninvokable;
@@ -962,6 +1039,11 @@ TfLiteStatus Subgraph::ModifyGraphWithDelegate(TfLiteDelegate* delegate) {
     // After using a delegate which doesn't support dynamic tensors, make the
     // entire graph immutable.
     state_ = kStateInvokableAndImmutable;
+  } else if (was_invokable_before_delegate) {
+    // If the graph was invokable prior to delegate application, flush
+    // allocation now to leave it in a consistent state.
+    TF_LITE_ENSURE_OK(context_, AllocateTensors());
+    TF_LITE_ENSURE_EQ(context_, state_, kStateInvokable);
   }
 
   return status;
diff --git a/tensorflow/lite/core/subgraph.h b/tensorflow/lite/core/subgraph.h
index 2a7c3a7c322e55500d9edb7d7c1b9763e9a76e88..5db15a177ef9fe8fcb54e0bf92f0193238440941 100644
--- a/tensorflow/lite/core/subgraph.h
+++ b/tensorflow/lite/core/subgraph.h
@@ -59,6 +59,11 @@ class Subgraph {
   // interpreter.
   TfLiteStatus SetVariables(std::vector<int> variables);
 
+  // Ensure the internal node storage memory allocates at least `count`
+  // spots for node. NOTE, this doesn't actually add operators. This is an
+  // efficiency optimization that is subject to change.
+  void ReserveNodes(int count);
+
   // Adds a node with the given parameters and returns the index of the new
   // node in `node_index` (optionally). Interpreter will take ownership of
   // `builtin_data` and destroy it with `free`. Ownership of 'init_data'
@@ -68,29 +73,48 @@ class Subgraph {
                                      const char* init_data,
                                      size_t init_data_size, void* builtin_data,
                                      const TfLiteRegistration* registration,
-                                     int* node_index);
+                                     int* node_index = nullptr);
 
   // Adds `tensors_to_add` tensors, preserving pre-existing Tensor entries.
   // The value pointed to by `first_new_tensor_index` will be set to the
   // index of the first new tensor if `first_new_tensor_index` is non-null.
-  TfLiteStatus AddTensors(int tensors_to_add, int* first_new_tensor_index);
+  TfLiteStatus AddTensors(int tensors_to_add,
+                          int* first_new_tensor_index = nullptr);
 
   // Set description of inputs/outputs/data/fptrs for node `node_index`.
   // This variant assumes an external buffer has been allocated of size
   // bytes. The lifetime of buffer must be ensured to be greater or equal
   // to Interpreter.
+  inline TfLiteStatus SetTensorParametersReadOnly(
+      int tensor_index, TfLiteType type, const char* name,
+      const std::vector<int>& dims, TfLiteQuantization quantization,
+      const char* buffer, size_t bytes,
+      const Allocation* allocation = nullptr) {
+    return SetTensorParametersReadOnly(tensor_index, type, name, dims.size(),
+                                       dims.data(), quantization, buffer, bytes,
+                                       allocation);
+  }
   TfLiteStatus SetTensorParametersReadOnly(
       int tensor_index, TfLiteType type, const char* name, const size_t rank,
-      const int* dims, TfLiteQuantizationParams quantization,
-      const char* buffer, size_t bytes, const Allocation* allocation);
+      const int* dims, TfLiteQuantization quantization, const char* buffer,
+      size_t bytes, const Allocation* allocation = nullptr);
 
   // Set description of inputs/outputs/data/fptrs for node `node_index`.
   // This variant assumes an external buffer has been allocated of size
   // bytes. The lifetime of buffer must be ensured to be greater or equal
   // to Interpreter.
-  TfLiteStatus SetTensorParametersReadWrite(
-      int tensor_index, TfLiteType type, const char* name, const size_t rank,
-      const int* dims, TfLiteQuantizationParams quantization, bool is_variable);
+  inline TfLiteStatus SetTensorParametersReadWrite(
+      int tensor_index, TfLiteType type, const char* name,
+      const std::vector<int>& dims, TfLiteQuantization quantization,
+      bool is_variable = false) {
+    return SetTensorParametersReadWrite(tensor_index, type, name, dims.size(),
+                                        dims.data(), quantization, is_variable);
+  }
+  TfLiteStatus SetTensorParametersReadWrite(int tensor_index, TfLiteType type,
+                                            const char* name, const size_t rank,
+                                            const int* dims,
+                                            TfLiteQuantization quantization,
+                                            bool is_variable = false);
 
   // WARNING: Experimental interface, subject to change
   // Overrides execution plan. This bounds checks indices sent in.
@@ -208,6 +232,15 @@ class Subgraph {
     return context_->allow_fp32_relax_to_fp16;
   }
 
+  // Sets the cancellation function pointer in order to cancel a request in the
+  // middle of a call to Invoke(). The interpreter queries this function during
+  // inference, between op invocations; when it returns true, the interpreter
+  // will abort execution and return `kTfLiteError`. The `data` parameter
+  // contains any data used by the cancellation function, and if non-null,
+  // remains owned by the caller.
+  // WARNING: This is an experimental API and subject to change.
+  void SetCancellationFunction(void* data, bool (*check_cancelled_func)(void*));
+
   // Ensure the data in `tensor.data` is readable. In case delegate is used,
   // it might require to copy the data from delegate buffer to raw memory.
   // WARNING: This is an experimental API and subject to change.
@@ -242,7 +275,10 @@ class Subgraph {
   // WARNING: This is an experimental API and subject to change.
   TfLiteStatus ResetVariableTensors();
 
-  void SetProfiler(profiling::Profiler* profiler) { profiler_ = profiler; }
+  void SetProfiler(profiling::Profiler* profiler) {
+    profiler_ = profiler;
+    context_->profiler = profiler;
+  }
 
   profiling::Profiler* GetProfiler() { return profiler_; }
 
@@ -390,6 +426,10 @@ class Subgraph {
   // Allow a delegate to look at the graph and modify the graph to handle
   // parts of the graph themselves. After this is called, the graph may
   // contain new nodes that replace 1 more nodes.
+  // NOTE: If tensors were allocated prior to delegate application, they will
+  // be reallocated if the graph was modified (i.e., the caller does *not* need
+  // to explicitly call |AllocateTensors()| again). If tensors were unallocated,
+  // they will remain unallocated after delegate application.
   // WARNING: This is an experimental API and subject to change.
   TfLiteStatus ModifyGraphWithDelegate(TfLiteDelegate* delegate);
 
@@ -495,6 +535,15 @@ class Subgraph {
   // public function).
   // The value is invalid before `PrepareOpStartingAt` is called.
   bool has_dynamic_tensors_ = true;
+
+  // Reference to cancellation function that can cancel a request in the middle
+  // of a call to Invoke(). When this function returns True, a kTfLiteError is
+  // thrown by Invoke().
+  bool (*check_cancelled_func_)(void*) = nullptr;
+
+  // Reference to data used by the cancellation function in
+  // `check_cancelled_func_`.
+  void* cancellation_data_ = nullptr;
 };
 
 }  // namespace tflite
diff --git a/tensorflow/lite/delegates/flex/BUILD b/tensorflow/lite/delegates/flex/BUILD
index 75083bf95a126fe7a8d1ca92af2cfa0c5a85f371..bca8e514fe4e454358ab3e41d30348ecf0c96797 100644
--- a/tensorflow/lite/delegates/flex/BUILD
+++ b/tensorflow/lite/delegates/flex/BUILD
@@ -157,6 +157,7 @@ cc_library(
         "//tensorflow/lite:kernel_api",
         "//tensorflow/lite:string",
         "//tensorflow/lite/kernels:kernel_util",
+        "//tensorflow/lite/profiling:profiler",
         "//tensorflow/core/common_runtime/eager:context",
         "//tensorflow/core/common_runtime/eager:execute",
         "//tensorflow/core/common_runtime/eager:tensor_handle",
diff --git a/tensorflow/lite/delegates/flex/buffer_map_test.cc b/tensorflow/lite/delegates/flex/buffer_map_test.cc
index 9e8472f1e7d2c3e0f5e73f3e5ce98bae7f15063f..accaf3045246b35705085bd5324e5b33ec8ea12a 100644
--- a/tensorflow/lite/delegates/flex/buffer_map_test.cc
+++ b/tensorflow/lite/delegates/flex/buffer_map_test.cc
@@ -44,6 +44,7 @@ UniqueTfLiteTensor MakeLiteTensor(const std::vector<int>& shape,
   tensor->dims = ConvertVectorToTfLiteIntArray(shape);
   tensor->data.raw = nullptr;
   tensor->is_variable = false;
+  memset(&tensor->quantization, 0, sizeof(TfLiteQuantization));
   TfLiteTensorRealloc(data.size() * sizeof(T), tensor.get());
   memcpy(tensor->data.raw, data.data(), data.size() * sizeof(T));
   return tensor;
@@ -62,6 +63,7 @@ UniqueTfLiteTensor MakeLiteTensor<string>(const std::vector<int>& shape,
   tensor->dims = ConvertVectorToTfLiteIntArray(shape);
   tensor->data.raw = nullptr;
   tensor->is_variable = false;
+  memset(&tensor->quantization, 0, sizeof(TfLiteQuantization));
   TfLiteTensorRealloc(data.size() * sizeof(string), tensor.get());
 
   DynamicBuffer b;
diff --git a/tensorflow/lite/delegates/flex/delegate.cc b/tensorflow/lite/delegates/flex/delegate.cc
index ca7314fbaee6644cf9385a1d7b0b2964d6a2762f..dcf5b795d82c877a916151686946c4aaad90d80b 100644
--- a/tensorflow/lite/delegates/flex/delegate.cc
+++ b/tensorflow/lite/delegates/flex/delegate.cc
@@ -30,6 +30,21 @@ namespace flex {
 namespace delegate {
 
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteDelegate* delegate) {
+  // If the TensorFlow Lite thread count is explicitly configured, use it,
+  // otherwise rely on the default TensorFlow threading behavior.
+  tensorflow::SessionOptions session_options;
+  if (context->recommended_num_threads > 0) {
+    session_options.config.set_intra_op_parallelism_threads(
+        context->recommended_num_threads);
+  }
+
+  if (!reinterpret_cast<DelegateData*>(delegate->data_)
+           ->Prepare(session_options)
+           .ok()) {
+    context->ReportError(context, "Failed to initialize TensorFlow context.");
+    return kTfLiteError;
+  }
+
   // Get the nodes in the current execution plan. Interpreter owns this array.
   TfLiteIntArray* plan;
   TF_LITE_ENSURE_STATUS(context->GetExecutionPlan(context, &plan));
@@ -118,20 +133,11 @@ AcquireFlexDelegate() {
 }
 
 std::unique_ptr<FlexDelegate> FlexDelegate::Create() {
-  std::unique_ptr<flex::DelegateData> delegate_data;
-  if (!flex::DelegateData::Create(&delegate_data).ok()) {
-    fprintf(stderr, "Unable to initialize TensorFlow context.\n");
-    return nullptr;
-  }
-
-  return std::unique_ptr<FlexDelegate>(
-      new FlexDelegate(std::move(delegate_data)));
+  return std::unique_ptr<FlexDelegate>(new FlexDelegate());
 }
 
-FlexDelegate::FlexDelegate(std::unique_ptr<flex::DelegateData> delegate_data)
-    : TfLiteDelegate(TfLiteDelegateCreate()),
-      delegate_data_(std::move(delegate_data)) {
-  data_ = delegate_data_.get();
+FlexDelegate::FlexDelegate() : TfLiteDelegate(TfLiteDelegateCreate()) {
+  data_ = &delegate_data_;
   Prepare = &flex::delegate::Prepare;
   CopyFromBufferHandle = &flex::delegate::CopyFromBufferHandle;
   flags = kTfLiteDelegateFlagsAllowDynamicTensors;
diff --git a/tensorflow/lite/delegates/flex/delegate.h b/tensorflow/lite/delegates/flex/delegate.h
index 018ff3e0b0e1fe7a842154581e2201b82412f885..767cbe13c4e86159c4f494cfec7cc899542b22d3 100644
--- a/tensorflow/lite/delegates/flex/delegate.h
+++ b/tensorflow/lite/delegates/flex/delegate.h
@@ -49,9 +49,9 @@ class FlexDelegate : public TfLiteDelegate {
   ~FlexDelegate();
 
  private:
-  explicit FlexDelegate(std::unique_ptr<flex::DelegateData> delegate_data);
+  FlexDelegate();
 
-  std::unique_ptr<flex::DelegateData> delegate_data_;
+  flex::DelegateData delegate_data_;
 };
 
 }  // namespace tflite
diff --git a/tensorflow/lite/delegates/flex/delegate_data.cc b/tensorflow/lite/delegates/flex/delegate_data.cc
index 1483a530388d1dd48ff6179de4ddc2084ddb3d87..87f37697468907c53e2ea8030c49577a3c8e0d83 100644
--- a/tensorflow/lite/delegates/flex/delegate_data.cc
+++ b/tensorflow/lite/delegates/flex/delegate_data.cc
@@ -20,29 +20,32 @@ limitations under the License.
 
 namespace tflite {
 namespace flex {
-tensorflow::Status DelegateData::Create(std::unique_ptr<DelegateData>* data) {
+DelegateData::DelegateData() {}
+
+DelegateData::~DelegateData() {}
+
+tensorflow::Status DelegateData::Prepare(
+    const tensorflow::SessionOptions& session_options) {
+  if (eager_context_) {
+    return tensorflow::Status();
+  }
+
   std::vector<std::unique_ptr<tensorflow::Device>> devices;
 
   TF_RETURN_IF_ERROR(tensorflow::DeviceFactory::AddDevices(
-      tensorflow::SessionOptions(), "/job:localhost/replica:0/task:0",
-      &devices));
+      session_options, "/job:localhost/replica:0/task:0", &devices));
 
   std::unique_ptr<tensorflow::DeviceMgr> device_mgr =
       absl::make_unique<tensorflow::DeviceMgr>(std::move(devices));
   // Note that Rendezvous is ref-counted so it will be automatically deleted.
   tensorflow::Rendezvous* rendezvous =
       new tensorflow::IntraProcessRendezvous(device_mgr.get());
-  data->reset(new DelegateData(new tensorflow::EagerContext(
-      tensorflow::SessionOptions(),
+  eager_context_.reset(new tensorflow::EagerContext(
+      session_options,
       tensorflow::ContextDevicePlacementPolicy::DEVICE_PLACEMENT_SILENT,
-      /*async=*/false, std::move(device_mgr), rendezvous)));
+      /*async=*/false, std::move(device_mgr), rendezvous));
   return tensorflow::Status();
 }
 
-DelegateData::DelegateData(tensorflow::EagerContext* eager_context)
-    : eager_context_(eager_context) {}
-
-DelegateData::~DelegateData() {}
-
 }  // namespace flex
 }  // namespace tflite
diff --git a/tensorflow/lite/delegates/flex/delegate_data.h b/tensorflow/lite/delegates/flex/delegate_data.h
index a88cc98d03cd40d33ab9f5eaf312086dc2b2a7cc..20d6b40a5d201c84fe9475c0420d9d85b5f6053d 100644
--- a/tensorflow/lite/delegates/flex/delegate_data.h
+++ b/tensorflow/lite/delegates/flex/delegate_data.h
@@ -15,21 +15,30 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_DELEGATES_FLEX_DELEGATE_DATA_H_
 #define TENSORFLOW_LITE_DELEGATES_FLEX_DELEGATE_DATA_H_
 
-#include "tensorflow/lite/delegates/flex/buffer_map.h"
 #include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/lite/delegates/flex/buffer_map.h"
 
 namespace tflite {
 namespace flex {
 
 // Data kept by the Flex delegate for the lifetime of an Interpreter.
+//
+// Note: This class is *not* thread-safe; any dependent delegates should not be
+// used concurrently.
 class DelegateData {
  public:
-  // Create a new DelegateData, initialized with a newly-created EagerContext.
-  static tensorflow::Status Create(std::unique_ptr<DelegateData>* data);
-
+  DelegateData();
   ~DelegateData();
 
+  // Prepare the necessary EagerContext and data for execution.
+  // This must be called at least once before execution. After preparation
+  // succeeds, redundant calls will be ignored (even if the session_options
+  // differ).
+  tensorflow::Status Prepare(const tensorflow::SessionOptions& session_options);
+
   // The EagerContext that is required for execution of Flex Ops.
+  // Note: The context is lazily created after the first call to |Prepare()|.
   tensorflow::EagerContext* GetEagerContext() { return eager_context_.get(); }
 
   // Map from TF Lite tensor index to TensorFlow tensor for a given context.
@@ -38,8 +47,7 @@ class DelegateData {
   }
 
  private:
-  explicit DelegateData(tensorflow::EagerContext* eager_context);
-
+  // Will be null until Prepare() is called and completes successfully.
   std::unique_ptr<tensorflow::EagerContext> eager_context_;
   // TODO(b/112439500): Clean up stale BufferMap instances after adding the
   // necessary cleanup hook from a TfLiteContext to a TfLiteDelegate.
diff --git a/tensorflow/lite/delegates/flex/delegate_data_test.cc b/tensorflow/lite/delegates/flex/delegate_data_test.cc
index cd274e7cb1ccb51d9b5e7ece845f2120e7c5a79e..22b8e436fb5f1337c15ad00609d7d08133684246 100644
--- a/tensorflow/lite/delegates/flex/delegate_data_test.cc
+++ b/tensorflow/lite/delegates/flex/delegate_data_test.cc
@@ -24,18 +24,20 @@ namespace flex {
 namespace {
 
 TEST(DelegateDataTest, Basic) {
-  std::unique_ptr<DelegateData> data;
+  DelegateData data;
   // We only check for success because it is hard to make initialization fail.
   // It only happens if we manage to not link the CPU device factory into the
   // binary.
-  EXPECT_TRUE(DelegateData::Create(&data).ok());
+  tensorflow::SessionOptions session_options;
+  session_options.config.set_intra_op_parallelism_threads(2);
+  EXPECT_TRUE(data.Prepare(session_options).ok());
 
   TfLiteContext dummy_context1 = {};
   TfLiteContext dummy_context2 = {};
-  EXPECT_NE(data->GetEagerContext(), nullptr);
-  EXPECT_NE(data->GetBufferMap(&dummy_context1), nullptr);
-  EXPECT_NE(data->GetBufferMap(&dummy_context1),
-            data->GetBufferMap(&dummy_context2));
+  EXPECT_NE(data.GetEagerContext(), nullptr);
+  EXPECT_NE(data.GetBufferMap(&dummy_context1), nullptr);
+  EXPECT_NE(data.GetBufferMap(&dummy_context1),
+            data.GetBufferMap(&dummy_context2));
 }
 
 }  // namespace
diff --git a/tensorflow/lite/delegates/flex/delegate_test.cc b/tensorflow/lite/delegates/flex/delegate_test.cc
index ee37090d94eaadca2a767a0ea9a2ad105618da97..b48fe181e1f3a5cf0be89fea42ad46ca9769fa02 100644
--- a/tensorflow/lite/delegates/flex/delegate_test.cc
+++ b/tensorflow/lite/delegates/flex/delegate_test.cc
@@ -252,6 +252,56 @@ TEST_F(DelegateTest, MultipleInterpretersSameDelegate) {
   }
 }
 
+TEST_F(DelegateTest, SingleThreaded) {
+  AddTensors(9, {0, 3}, {8}, kTfLiteFloat32, {3});
+  AddTfOp(testing::kUnpack, {0}, {1, 2});
+  AddTfOp(testing::kUnpack, {3}, {4, 5});
+  AddTfOp(testing::kAdd, {1, 4}, {6});
+  AddTfOp(testing::kAdd, {2, 5}, {7});
+  AddTfOp(testing::kMul, {6, 7}, {8});
+
+  // Explicitly disable multi-threading before installing the delegate.
+  interpreter_->SetNumThreads(1);
+  ConfigureDelegate();
+
+  SetShape(0, {2, 2, 1});
+  SetValues(0, {1.1f, 2.2f, 3.3f, 4.4f});
+  SetShape(3, {2, 2, 1});
+  SetValues(3, {1.1f, 2.2f, 3.3f, 4.4f});
+
+  // Invocation should behave as expected.
+  ASSERT_TRUE(Invoke());
+
+  ASSERT_THAT(GetShape(8), ElementsAre(2, 1));
+  ASSERT_THAT(GetValues(8), ElementsAre(14.52f, 38.72f));
+  ASSERT_EQ(GetType(8), kTfLiteFloat32);
+}
+
+TEST_F(DelegateTest, MultiThreaded) {
+  AddTensors(9, {0, 3}, {8}, kTfLiteFloat32, {3});
+  AddTfOp(testing::kUnpack, {0}, {1, 2});
+  AddTfOp(testing::kUnpack, {3}, {4, 5});
+  AddTfOp(testing::kAdd, {1, 4}, {6});
+  AddTfOp(testing::kAdd, {2, 5}, {7});
+  AddTfOp(testing::kMul, {6, 7}, {8});
+
+  // Explicitly enable multi-threading before installing the delegate.
+  interpreter_->SetNumThreads(4);
+  ConfigureDelegate();
+
+  SetShape(0, {2, 2, 1});
+  SetValues(0, {1.1f, 2.2f, 3.3f, 4.4f});
+  SetShape(3, {2, 2, 1});
+  SetValues(3, {1.1f, 2.2f, 3.3f, 4.4f});
+
+  // Invocation should behave as expected.
+  ASSERT_TRUE(Invoke());
+
+  ASSERT_THAT(GetShape(8), ElementsAre(2, 1));
+  ASSERT_THAT(GetValues(8), ElementsAre(14.52f, 38.72f));
+  ASSERT_EQ(GetType(8), kTfLiteFloat32);
+}
+
 }  // namespace
 }  // namespace flex
 }  // namespace tflite
diff --git a/tensorflow/lite/delegates/flex/kernel.cc b/tensorflow/lite/delegates/flex/kernel.cc
index 02da1d1a224ee87c34c2a019bff6430fd0e7d88a..ceb9918f6fa7ccfbb4d27a0bf921987faecc1c12 100644
--- a/tensorflow/lite/delegates/flex/kernel.cc
+++ b/tensorflow/lite/delegates/flex/kernel.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/lite/delegates/flex/delegate_data.h"
 #include "tensorflow/lite/delegates/flex/util.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/profiling/profiler.h"
 #include "tensorflow/lite/string.h"
 
 // Note: this is part of TF Lite's Flex delegation code which is to be
@@ -50,100 +51,317 @@ namespace tflite {
 namespace flex {
 namespace kernel {
 
-// Controls the lifetime of tensor handles in a vector.
-class VectorOfHandles {
+struct OpNode;
+
+// Represents the origin of a given tensor as a reference to the output
+// of an upstream node.
+struct TensorSource {
+  OpNode* node;
+  int node_output_index;
+};
+
+// A list of inputs of a given node of the TensorFlow/Eager graph.
+class OpInputs {
  public:
-  explicit VectorOfHandles(int num_elements) : vector_(num_elements, nullptr) {}
+  explicit OpInputs(const TfLiteIntArray* indexes) {
+    for (int index : TfLiteIntArrayView(indexes)) {
+      inputs_.push_back(index);
+    }
+    forwardable_.resize(inputs_.size());
+  }
+  ~OpInputs() {}
+
+  int Size() const { return inputs_.size(); }
+
+  int TfLiteIndex(int i) const { return inputs_[i]; }
+
+  // Given a map relating tensors to the node that originates them, populate a
+  // list of sources for the tensors in this class.
+  void InitializeTensorSources(
+      const std::map<int, TensorSource>& tflite_tensor_sources) {
+    sources_.clear();
+    for (int i : inputs_) {
+      auto it = tflite_tensor_sources.find(i);
+      if (it == tflite_tensor_sources.end()) {
+        sources_.push_back({nullptr, 0});
+      } else {
+        sources_.push_back(it->second);
+      }
+    }
+  }
 
-  ~VectorOfHandles() {
-    for (auto* handle : vector_) {
-      if (handle) handle->Unref();
+  void SetForwardable(int i, bool v) { forwardable_[i] = v; }
+
+  bool IsForwardable(int i) const { return forwardable_[i]; }
+
+  TensorSource GetTensorSource(int i) const { return sources_[i]; }
+
+ private:
+  std::vector<int> inputs_;
+  std::vector<TensorSource> sources_;
+
+  // List of tensors that can be used by TF in its forwarding optimization.
+  // Doing so allows an input tensor to be modified and used as the output
+  // tensor. The delegate takes care of not holding any references to tensors
+  // in this list while Eager is executing the corresponding op.
+  std::vector<int> forwardable_;
+};
+
+// A list of outputs of a given node of the TensorFlow/Eager graph, along with
+// the actual outputs of the EagerOperation.
+class OpOutputs {
+ public:
+  explicit OpOutputs(const TfLiteIntArray* indexes) {
+    for (int index : TfLiteIntArrayView(indexes)) {
+      outputs_.push_back(index);
+    }
+    vector_.resize(outputs_.size());
+  }
+  ~OpOutputs() { ResetTensorHandles(); }
+
+  // Stores information about which of the tensors in this class are also
+  // outputs of the sugbraph.
+  void InitializeGraphOutputs(const std::set<int>& subgraph_outputs) {
+    subgraph_outputs_.clear();
+    for (int i : outputs_) {
+      subgraph_outputs_.push_back(subgraph_outputs.count(i) > 0);
     }
   }
 
-  tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2>* GetVector() {
-    return &vector_;
+  // Returns true if the tensor given by index 'i' is an output of the entire
+  // subgraph.
+  bool IsSubgraphOutput(int i) const { return subgraph_outputs_[i]; }
+
+  // Returns a handle to a given tensor and, optionally, remove it from the
+  // internal vector.
+  tensorflow::TensorHandle* GetHandle(int i, bool remove) {
+    auto* handle = vector_[i];
+    if (!remove) {
+      handle->Ref();
+    } else {
+      // Don't increase the ref-count. Instead, simply take it out of the
+      // vector.
+      vector_[i] = nullptr;
+    }
+    return handle;
+  }
+
+  int Size() const { return outputs_.size(); }
+
+  int TfLiteIndex(int i) const { return outputs_[i]; }
+
+  // Carefully unreference all the handles in the eager output vector.
+  void ResetTensorHandles() {
+    for (int i = 0; i < vector_.size(); ++i) {
+      if (vector_[i]) {
+        vector_[i]->Unref();
+        vector_[i] = nullptr;
+      }
+    }
   }
 
-  tensorflow::TensorHandle* GetHandle(int index) { return vector_[index]; }
+  tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2>*
+  GetTensorHandles() {
+    return &vector_;
+  }
 
  private:
+  std::vector<int> outputs_;
+  std::vector<bool> subgraph_outputs_;
   tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2> vector_;
 };
 
-// Executes the TensorFlow op given by 'op_name', with the attributes specified
-// in 'nodedef'. Inputs and outputs are given as indices into the 'buffer_map'.
-tensorflow::Status ExecuteFlexOp(tensorflow::EagerContext* eager_context,
-                                 BufferMap* buffer_map, const string& op_name,
-                                 const tensorflow::NodeDef& nodedef,
-                                 const std::vector<int>& inputs,
-                                 const std::vector<int>& outputs) {
-  const tensorflow::AttrTypeMap* attr_types;
-  bool is_function = false;
-  TF_RETURN_WITH_CONTEXT_IF_ERROR(
-      tensorflow::AttrTypeMapForOp(op_name.c_str(), &attr_types, &is_function),
-      " (while processing attributes of '", op_name, "')");
-  if (is_function) {
-    return tensorflow::errors::NotFound(
-        "Operation '", op_name,
-        "' is not registered.  (while processing attributes of '", op_name,
-        "')");
-  }
-  tensorflow::EagerOperation op(eager_context, op_name.c_str(),
-                                /*is_function=*/false, attr_types);
-  for (const auto& attr : nodedef.attr()) {
-    op.MutableAttrs()->Set(attr.first, attr.second);
+// A single node within the larger 'op'. Note that this kernel executes many
+// TensorFlow ops within a single TF Lite op.
+class OpNode {
+ public:
+  OpNode(const TfLiteIntArray* inputs, const TfLiteIntArray* outputs)
+      : inputs_(inputs), outputs_(outputs) {}
+  ~OpNode() {
+    if (op_) ClearEagerInputs();
   }
 
-  for (int input_index : inputs) {
-    if (!buffer_map->HasTensor(input_index)) {
+  const string& name() const { return name_; }
+  void set_name(const string& name) { name_ = name; }
+
+  int index() const { return index_; }
+  void set_index(int index) { index_ = index; }
+
+  const tensorflow::NodeDef& nodedef() const { return nodedef_; }
+
+  const OpInputs& inputs() const { return inputs_; }
+  OpInputs* mutable_inputs() { return &inputs_; }
+
+  const OpOutputs& outputs() const { return outputs_; }
+  OpOutputs* mutable_outputs() { return &outputs_; }
+
+  int NumInputs() const { return inputs_.Size(); }
+  int NumOutputs() const { return outputs_.Size(); }
+
+  tensorflow::EagerOperation* op() { return op_.get(); }
+
+  tensorflow::Status InitializeNodeDef(const void* custom_initial_data,
+                                       int custom_initial_data_size) {
+    if (!custom_initial_data) {
       return tensorflow::errors::Internal(
-          "Cannot read from invalid tensor index ", input_index);
+          "Cannot convert empty data into a valid NodeDef");
     }
-    auto* handle = new tensorflow::TensorHandle(
-        buffer_map->GetTensor(input_index), nullptr, nullptr, nullptr);
-    op.AddInput(handle);
-    handle->Unref();
+    // The flexbuffer contains a vector where the first elements is the
+    // op name and the second is a serialized NodeDef.
+    const flexbuffers::Vector& v =
+        flexbuffers::GetRoot(
+            reinterpret_cast<const uint8_t*>(custom_initial_data),
+            custom_initial_data_size)
+            .AsVector();
+
+    name_ = v[0].AsString().str();
+    if (!nodedef_.ParseFromString(v[1].AsString().str())) {
+      nodedef_.Clear();
+      return tensorflow::errors::Internal(
+          "Failed to parse data into a valid NodeDef");
+    }
+
+    // Fill NodeDef with defaults if it's a valid op.
+    const tensorflow::OpRegistrationData* op_reg_data;
+    TF_RETURN_IF_ERROR(
+        tensorflow::OpRegistry::Global()->LookUp(nodedef_.op(), &op_reg_data));
+    AddDefaultsToNodeDef(op_reg_data->op_def, &nodedef_);
+
+    return tensorflow::Status::OK();
   }
 
-  int num_retvals = outputs.size();
-  VectorOfHandles retvals(num_retvals);
-  TF_RETURN_WITH_CONTEXT_IF_ERROR(
-      EagerExecute(&op, retvals.GetVector(), &num_retvals),
-      " (while executing '", op_name, "' via Eager)");
+  // Build thew new EagerOperation. In case of error, the returned 'op' is
+  // guaranteed to be 'nullptr'.
+  tensorflow::Status BuildEagerOp(tensorflow::EagerContext* eager_context) {
+    op_.reset();
+
+    const tensorflow::AttrTypeMap* attr_types;
+    bool is_function = false;
+    TF_RETURN_WITH_CONTEXT_IF_ERROR(
+        tensorflow::AttrTypeMapForOp(name_.c_str(), &attr_types, &is_function),
+        " (while processing attributes of '", name_, "')");
+    if (is_function) {
+      return tensorflow::errors::NotFound(
+          "Operation '", name_,
+          "' is not registered.  (while processing attributes of '", name_,
+          "')");
+    }
 
-  if (num_retvals != outputs.size()) {
-    return tensorflow::errors::Internal(
-        "Unexpected number of outputs from EagerExecute");
+    op_.reset(new tensorflow::EagerOperation(eager_context, name_.c_str(),
+                                             /*is_function=*/false,
+                                             attr_types));
+
+    op_->MutableAttrs()->NumInputs(inputs_.Size());
+    for (const auto& attr : nodedef_.attr()) {
+      op_->MutableAttrs()->Set(attr.first, attr.second);
+    }
+
+    // Precalculating a cache key saves about 10% of inference time for very
+    // small models.
+    tensorflow::Device* device = op_->Device();
+    op_->MutableAttrs()->CacheKey(device == nullptr ? "unspecified"
+                                                    : device->name());
+
+    return tensorflow::Status::OK();
   }
 
-  for (int i = 0; i < num_retvals; ++i) {
-    const tensorflow::Tensor* tensor = nullptr;
-    TF_RETURN_IF_ERROR(retvals.GetHandle(i)->Tensor(&tensor));
-    buffer_map->SetFromTensorFlow(outputs[i], *tensor);
+  void ClearEagerInputs() {
+    for (tensorflow::TensorHandle* h : *op_->MutableInputs()) {
+      if (h) h->Unref();
+    }
+    op_->MutableInputs()->clear();
   }
 
-  return tensorflow::Status::OK();
-}
+  tensorflow::Status BuildEagerInputs(const BufferMap* buffer_map) {
+    for (int i = 0; i < inputs_.Size(); ++i) {
+      int input_index = inputs_.TfLiteIndex(i);
+      TensorSource s = inputs_.GetTensorSource(i);
+      if (!s.node) {
+        // This input is not produced by this Eager subgraph (it could be a TF
+        // Lite native buffer, or could be produced by a separater subgraph). We
+        // need to fetch it from the delegate's buffer_map.
+        if (!buffer_map->HasTensor(input_index)) {
+          return tensorflow::errors::Internal(
+              "Cannot read from invalid tensor index ", input_index);
+        }
+        auto* handle = new tensorflow::TensorHandle(
+            buffer_map->GetTensor(input_index), nullptr, nullptr, nullptr);
+        op_->MutableInputs()->push_back(handle);
+      } else {
+        // If this is a forwardable tensor, we will remove it from the previous
+        // op's list, giving TF the opportunity to reuse its buffer.
+        bool unref_handle = inputs_.IsForwardable(i);
+        auto* handle =
+            s.node->outputs_.GetHandle(s.node_output_index, unref_handle);
+        op_->MutableInputs()->push_back(handle);
+      }
+    }
+    return tensorflow::Status::OK();
+  }
+
+  tensorflow::Status PersistEagerOutputs(BufferMap* buffer_map) {
+    auto* handles = outputs_.GetTensorHandles();
+    for (int i = 0; i < outputs_.Size(); ++i) {
+      if (outputs_.IsSubgraphOutput(i)) {
+        const tensorflow::Tensor* tensor = nullptr;
+        TF_RETURN_IF_ERROR(handles->at(i)->Tensor(&tensor));
+        buffer_map->SetFromTensorFlow(outputs_.TfLiteIndex(i), *tensor);
+      }
+    }
+    return tensorflow::Status::OK();
+  }
+
+ private:
+  OpNode(const OpNode&) = delete;
+  OpNode& operator=(const OpNode&) = delete;
 
-// A single node within the larger 'op'. Note that this kernel executes many
-// TensorFlow ops within a single TF Lite op.
-struct OpNode {
   // The name of the TensorFlow op to execute.
-  string name;
+  string name_;
+  // Index of this node into TF Lite's operator list.
+  int index_;
   // The corresponding NodeDef, containing the attributes for the op.
-  tensorflow::NodeDef nodedef;
+  tensorflow::NodeDef nodedef_;
   // List of inputs, as TF Lite tensor indices.
-  std::vector<int> inputs;
+  OpInputs inputs_;
   // List of outputs, as TF Lite tensor indices.
-  std::vector<int> outputs;
+  OpOutputs outputs_;
+
+  std::unique_ptr<tensorflow::EagerOperation> op_;
 };
 
-// The Larger 'op', which contains all the nodes in a supported subgraph.
+// Executes the TensorFlow op given by 'op_name', with the attributes specified
+// in 'nodedef'. Inputs and outputs are given as indices into the 'buffer_map'.
+tensorflow::Status ExecuteFlexOp(TfLiteContext* context, BufferMap* buffer_map,
+                                 OpNode* node_data) {
+  TF_RETURN_WITH_CONTEXT_IF_ERROR(node_data->BuildEagerInputs(buffer_map),
+                                  " (while executing '", node_data->name(),
+                                  "' via Eager)");
+
+  node_data->mutable_outputs()->ResetTensorHandles();
+  int num_retvals = node_data->NumOutputs();
+  TF_RETURN_WITH_CONTEXT_IF_ERROR(
+      EagerExecute(node_data->op(),
+                   node_data->mutable_outputs()->GetTensorHandles(),
+                   &num_retvals),
+      " (while executing '", node_data->name(), "' via Eager)");
+
+  if (num_retvals != node_data->NumOutputs()) {
+    return tensorflow::errors::Internal(
+        "Unexpected number of outputs from EagerExecute");
+  }
+
+  TF_RETURN_IF_ERROR(node_data->PersistEagerOutputs(buffer_map));
+
+  node_data->ClearEagerInputs();
+
+  return tensorflow::Status::OK();
+}
+
+// The larger 'op', which contains all the nodes in a supported subgraph.
 struct OpData {
   tensorflow::EagerContext* eager_context;
   BufferMap* buffer_map;
-  std::vector<OpNode> nodes;
+  std::vector<std::unique_ptr<OpNode>> nodes;
   std::vector<int> subgraph_inputs;
   std::vector<int> subgraph_outputs;
 };
@@ -163,8 +381,10 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) {
                             ->GetBufferMap(context);
 
   CHECK(params->output_tensors);
+  std::set<int> output_set;
   for (auto tensor_index : TfLiteIntArrayView(params->output_tensors)) {
     op_data->subgraph_outputs.push_back(tensor_index);
+    output_set.insert(tensor_index);
   }
 
   CHECK(params->input_tensors);
@@ -172,48 +392,55 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) {
     op_data->subgraph_inputs.push_back(tensor_index);
   }
 
+  op_data->nodes.reserve(params->nodes_to_replace->size);
+
   CHECK(params->nodes_to_replace);
+  tensorflow::Status status;
   for (auto node_index : TfLiteIntArrayView(params->nodes_to_replace)) {
     TfLiteNode* node;
     TfLiteRegistration* reg;
     context->GetNodeAndRegistration(context, node_index, &node, &reg);
 
-    op_data->nodes.push_back(OpNode());
-    OpNode& node_data = op_data->nodes.back();
-
-    node_data.name = "";
-    if (node->custom_initial_data) {
-      // The flexbuffer contains a vector where the first elements is the
-      // op name and the second is a serialized NodeDef.
-      const flexbuffers::Vector& v =
-          flexbuffers::GetRoot(
-              reinterpret_cast<const uint8_t*>(node->custom_initial_data),
-              node->custom_initial_data_size)
-              .AsVector();
-
-      node_data.name = v[0].AsString().str();
-      if (!node_data.nodedef.ParseFromString(v[1].AsString().str())) {
-        // We will just leave the nodedef empty and error out in Eval().
-        node_data.nodedef.Clear();
-      }
-    }
+    op_data->nodes.emplace_back(new OpNode(node->inputs, node->outputs));
+    OpNode& node_data = *op_data->nodes.back();
 
-    // Fill NodeDef with defaults if it's a valid op.
-    const tensorflow::OpRegistrationData* op_reg_data;
-    auto tf_status = tensorflow::OpRegistry::Global()->LookUp(
-        node_data.nodedef.op(), &op_reg_data);
-    if (tf_status.ok()) {
-      AddDefaultsToNodeDef(op_reg_data->op_def, &node_data.nodedef);
-    }
+    node_data.set_index(node_index);
+    node_data.set_name("");
 
-    for (auto input_index : TfLiteIntArrayView(node->inputs)) {
-      node_data.inputs.push_back(input_index);
-    }
-    for (auto output_index : TfLiteIntArrayView(node->outputs)) {
-      node_data.outputs.push_back(output_index);
+    status = node_data.InitializeNodeDef(node->custom_initial_data,
+                                         node->custom_initial_data_size);
+    if (!status.ok()) break;
+    status = node_data.BuildEagerOp(op_data->eager_context);
+    if (!status.ok()) break;
+  }
+
+  if (ConvertStatus(context, status) != kTfLiteOk) {
+    // We can't return an error from this function but ConvertStatus will
+    // report them and we will stop processing in Prepare() if anything went
+    // wrong.
+    return op_data;
+  }
+
+  // Given a TfLite tensor index, return the OpNode that produces it,
+  // along with it index into that OpNodes list of outputs.
+  std::map<int, TensorSource> tflite_tensor_sources;
+
+  // Find out how each tensor is produced. This does not account for
+  // tensors that are not produce by eager ops.
+  for (auto& node_data : op_data->nodes) {
+    node_data->mutable_outputs()->InitializeGraphOutputs(output_set);
+    for (int i = 0; i < node_data->outputs().Size(); ++i) {
+      int output_index = node_data->outputs().TfLiteIndex(i);
+      tflite_tensor_sources[output_index] = TensorSource{node_data.get(), i};
     }
   }
 
+  // For each node, resolve the inputs, so we can keep pointers to the nodes
+  // that produces them.
+  for (auto& node_data : op_data->nodes) {
+    node_data->mutable_inputs()->InitializeTensorSources(tflite_tensor_sources);
+  }
+
   return op_data;
 }
 
@@ -229,6 +456,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
       "device has not been registered, presumably because some symbols from "
       "tensorflow/core:core_cpu_impl were not linked into the binary.");
 
+  // We will keep track of the number of references to each tensor in the
+  // graph, so we can make them "forwardable" if there is only one reference.
+  std::map<int, int> tensor_ref_count;
+
   // Whenever we find a constant tensor, insert it in the buffer map.
   BufferMap* buffer_map = op_data->buffer_map;
   for (auto tensor_index : op_data->subgraph_inputs) {
@@ -238,21 +469,49 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
         buffer_map->SetFromTfLite(tensor_index, tensor);
       }
     }
+
+    // Input tensors should never be forwarded so we increment their ref counts
+    // twice: once for this graph and another for the possibility of them being
+    // used by another subgraph, or being an output of the full graph.
+    tensor_ref_count[tensor_index] += 2;
   }
 
   // All output tensors are allocated by TensorFlow/Eager, so we
   // mark them as kTfLiteDynamic.
   for (auto tensor_index : op_data->subgraph_outputs) {
     SetTensorToDynamic(&context->tensors[tensor_index]);
+    ++tensor_ref_count[tensor_index];
+  }
+
+  for (const auto& node_data : op_data->nodes) {
+    if (node_data->nodedef().op().empty()) {
+      context->ReportError(context, "Invalid NodeDef in Flex op '%s'",
+                           node_data->name().c_str());
+      return kTfLiteError;
+    }
+    TF_LITE_ENSURE(context, node_data->op());
+
+    for (int i = 0; i < node_data->inputs().Size(); ++i) {
+      ++tensor_ref_count[node_data->inputs().TfLiteIndex(i)];
+    }
+  }
+
+  // All tensors that are referenced exactly once are marked as "forwardable",
+  // meaning that we will allow TensorFlow to reuse its buffer as the output of
+  // an op.
+  for (auto& node_data : op_data->nodes) {
+    for (int i = 0; i < node_data->inputs().Size(); ++i) {
+      bool f = (tensor_ref_count[node_data->inputs().TfLiteIndex(i)] == 1);
+      node_data->mutable_inputs()->SetForwardable(i, f);
+    }
   }
 
   return kTfLiteOk;
 }
 
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  const auto* op_data = reinterpret_cast<OpData*>(node->user_data);
+  auto* op_data = reinterpret_cast<OpData*>(node->user_data);
   BufferMap* buffer_map = op_data->buffer_map;
-  tensorflow::EagerContext* eager_context = op_data->eager_context;
 
   // Insert a tensor in the buffer map for all inputs that are not constant.
   // Constants were handled in Prepare() already.
@@ -269,15 +528,12 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   }
 
   // Execute the TensorFlow Ops sequentially.
-  for (const auto& node_data : op_data->nodes) {
-    if (node_data.nodedef.op().empty()) {
-      context->ReportError(context, "Invalid NodeDef in Flex op '%s'",
-                           node_data.name.c_str());
-      return kTfLiteError;
-    }
-    auto status =
-        ExecuteFlexOp(eager_context, buffer_map, node_data.name,
-                      node_data.nodedef, node_data.inputs, node_data.outputs);
+  for (auto& node_data : op_data->nodes) {
+    SCOPED_TAGGED_OPERATOR_PROFILE(
+        reinterpret_cast<profiling::Profiler*>(context->profiler),
+        node_data->name().c_str(), node_data->index());
+
+    auto status = ExecuteFlexOp(context, buffer_map, node_data.get());
     TF_LITE_ENSURE_OK(context, ConvertStatus(context, status));
   }
 
diff --git a/tensorflow/lite/delegates/flex/kernel_test.cc b/tensorflow/lite/delegates/flex/kernel_test.cc
index efb7300b0bd9693f93fc4b7fb3078c384130cf65..5b3a6d164707a805f05765764b13d2d01eac967f 100644
--- a/tensorflow/lite/delegates/flex/kernel_test.cc
+++ b/tensorflow/lite/delegates/flex/kernel_test.cc
@@ -25,6 +25,7 @@ namespace {
 
 using ::testing::ContainsRegex;
 using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
 
 TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteDelegate* delegate,
                             const std::vector<int>& supported_nodes) {
@@ -36,23 +37,41 @@ TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteDelegate* delegate,
   return kTfLiteOk;
 }
 
+// There is no easy way to pass a parameter into the TfLiteDelegate's
+// 'prepare' function, so we keep a global map for testing purpused.
+// To avoid collisions use: GetPrepareFunction<__LINE__>().
+std::map<int, std::vector<int>>* GetGlobalOpLists() {
+  static auto* op_list = new std::map<int, std::vector<int>>;
+  return op_list;
+}
+
 class KernelTest : public testing::FlexModelTest {
  public:
+  static constexpr int kOnes = 1;  // This is the index of a tensor of 1's.
+  static constexpr int kTwos = 2;  // This is the index of a tensor of 2's.
+  static constexpr int kMaxTensors = 30;
+
+  static void SetUpTestSuite() { GetGlobalOpLists()->clear(); }
+
   KernelTest() {
-    CHECK(DelegateData::Create(&delegate_data_).ok());
+    CHECK(delegate_data_.Prepare(tensorflow::SessionOptions{}).ok());
     interpreter_.reset(new Interpreter(&error_reporter_));
   }
 
-  ~KernelTest() override {
-    // The data needs to be released before the interpreter because the
-    // interpreter references the data.
-    delegate_data_.reset();
-    interpreter_.reset();
+  typedef TfLiteStatus (*PrepareFunction)(TfLiteContext* context,
+                                          TfLiteDelegate* delegate);
+
+  template <int KEY>
+  PrepareFunction GetPrepareFunction() {
+    GetGlobalOpLists()->insert({KEY, tf_ops_});
+    return [](TfLiteContext* context, TfLiteDelegate* delegate) {
+      return GenericPrepare(context, delegate, GetGlobalOpLists()->at(KEY));
+    };
   }
 
   template <typename T>
   void ConfigureDelegate(T prepare_function) {
-    delegate_.data_ = delegate_data_.get();
+    delegate_.data_ = &delegate_data_;
     delegate_.flags = kTfLiteDelegateFlagsAllowDynamicTensors;
     delegate_.FreeBufferHandle = nullptr;
     delegate_.Prepare = prepare_function;
@@ -61,9 +80,13 @@ class KernelTest : public testing::FlexModelTest {
                                         TfLiteBufferHandle buffer_handle,
                                         TfLiteTensor* output) {
       auto* delegate_data = reinterpret_cast<DelegateData*>(delegate->data_);
-      tensorflow::StringPiece values = delegate_data->GetBufferMap(context)
-                                           ->GetTensor(buffer_handle)
-                                           .tensor_data();
+      auto* buffer_map = delegate_data->GetBufferMap(context);
+      if (!buffer_map->HasTensor(buffer_handle)) {
+        context->ReportError(context, "Tensor '%d' not found", buffer_handle);
+        return kTfLiteError;
+      }
+      tensorflow::StringPiece values =
+          buffer_map->GetTensor(buffer_handle).tensor_data();
       memcpy(output->data.raw, values.data(), values.size());
       return kTfLiteOk;
     };
@@ -71,7 +94,7 @@ class KernelTest : public testing::FlexModelTest {
   }
 
  private:
-  std::unique_ptr<DelegateData> delegate_data_;
+  DelegateData delegate_data_;
   TfLiteDelegate delegate_;
 };
 
@@ -121,12 +144,9 @@ TEST_F(KernelTest, BadTensorFlowOp) {
     return GenericPrepare(context, delegate, {0});
   });
 
-  SetShape(0, {2, 2, 1});
-  SetValues(0, {1.1f, 2.2f, 3.3f, 4.4f});
-
-  ASSERT_FALSE(Invoke());
+  ASSERT_NE(interpreter_->AllocateTensors(), kTfLiteOk);
   ASSERT_THAT(error_reporter().error_messages(),
-              ContainsRegex("while processing attributes of 'NonExistentOp'"));
+              ContainsRegex("Op type not registered 'NonExistentOp'"));
 }
 
 TEST_F(KernelTest, BadNumberOfOutputs) {
@@ -173,10 +193,7 @@ TEST_F(KernelTest, WrongSetOfNodes) {
     return GenericPrepare(context, delegate, {0, 1});
   });
 
-  SetShape(0, {2, 2, 1});
-  SetValues(0, {1.1f, 2.2f, 3.3f, 4.4f});
-
-  ASSERT_FALSE(Invoke());
+  ASSERT_NE(interpreter_->AllocateTensors(), kTfLiteOk);
   ASSERT_THAT(error_reporter().error_messages(),
               ContainsRegex("Invalid NodeDef in Flex op"));
 }
@@ -235,7 +252,7 @@ TEST_F(KernelTest, SplitGraph) {
   AddTfOp(testing::kAdd, {9, 16}, {17});  // => 16
 
   ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) {
-    // All ops by #3 are TF ops, handled by the delegate. However, because #4
+    // All ops but #3 are TF ops, handled by the delegate. However, because #4
     // depends on the non-TF op, two subgraphs are necessary:
     //    TF subgraph 1: 0, 1, 2, 6, 7, 8, 9
     //    TF Lite Op: 3
@@ -270,6 +287,132 @@ TEST_F(KernelTest, SplitGraph) {
   ASSERT_THAT(GetValues(17), ElementsAre(18.0f));
 }
 
+class MultipleSubgraphsTest : public KernelTest {
+ public:
+  static constexpr int kInput = 0;
+
+  void PrepareInterpreter(PrepareFunction prepare,
+                          const std::vector<float>& input) {
+    ConfigureDelegate(prepare);
+
+    SetShape(kOnes, {3});
+    SetValues(kOnes, {1.0f, 1.0f, 1.0f});
+    SetShape(kTwos, {3});
+    SetValues(kTwos, {2.0f, 2.0f, 2.0f});
+
+    SetValues(kInput, input);
+  }
+
+  std::vector<float> Apply(const std::vector<float>& input,
+                           std::function<float(float)> function) {
+    std::vector<float> result;
+    for (float f : input) {
+      result.push_back(function(f));
+    }
+    return result;
+  }
+};
+
+TEST_F(MultipleSubgraphsTest, ForwardabilityIsLocal) {
+  AddTensors(kMaxTensors, {kInput, kOnes, kTwos}, {12}, kTfLiteFloat32, {3});
+
+  // Only TF tensors can be forwarded, so we build a small first graph
+  // to produce tensor #10. Here #10 is forwardable, because it is only
+  // used once, as an output.
+  AddTfOp(testing::kAdd, {0, kOnes}, {3});
+  AddTfOp(testing::kAdd, {0, kOnes}, {10});
+
+  // The second TF graph, separated from the former by a TF Lite
+  // multiplication, will consume tensor #10, which is not forwardable here
+  // since it is used by more than one op. The existing code will forward the
+  // tensor anyway, because it was deemed to be forwardable by the previous
+  // subgraph.
+  AddTfLiteMulOp({3, kTwos}, {4});
+  AddTfOp(testing::kAdd, {10, 4}, {11});
+  AddTfOp(testing::kAdd, {11, 10}, {7});
+
+  // And a simple TF Lite op trying to access tensor #10, which was removed
+  // from the buffer map. It will cause Invoke() to fail.
+  AddTfLiteMulOp({10, 7}, {12});
+
+  auto input = {3.0f, 4.0f, 5.0f};
+  PrepareInterpreter(GetPrepareFunction<__LINE__>(), input);
+
+  ASSERT_TRUE(Invoke());
+  ASSERT_THAT(GetValues(12), ElementsAreArray(Apply(input, [](float in) {
+                return (4 * in + 4) * (in + 1);
+              })));
+}
+
+// Subgraphs should not remove input tensors from the buffer_map, since
+// they could be necessary for downstream graphs.
+TEST_F(MultipleSubgraphsTest, DoNotRemoveInputTensors) {
+  AddTensors(kMaxTensors, {kInput, kOnes, kTwos}, {12}, kTfLiteFloat32, {3});
+
+  // Only TF tensors can be removed, so we build a small first graph
+  // to produce tensor #10. We make sure it is used by more than one
+  // op, so it is not forwardable here.
+  AddTfOp(testing::kAdd, {0, kOnes}, {3});
+  AddTfOp(testing::kAdd, {0, kOnes}, {10});
+  AddTfOp(testing::kAdd, {10, kOnes}, {15});
+  AddTfOp(testing::kAdd, {10, kOnes}, {16});
+
+  // The second TF graph, separated from the former by a TF Lite
+  // multiplication, will consume tensor #10. The existing code will remove
+  // from the buffer_map all tensors that are not outputs, so #10 will
+  // disappear. Note that we are using #10 in two ops, so it is not forwardable
+  // either.
+  AddTfLiteMulOp({3, kTwos}, {4});
+  AddTfOp(testing::kAdd, {10, 4}, {11});
+  AddTfOp(testing::kAdd, {10, 11}, {7});
+
+  // And a simple TF Lite op trying to access tensor #10, which was removed
+  // from the buffer map. It will cause Invoke() to fail.
+  AddTfLiteMulOp({10, 7}, {12});
+
+  auto input = {3.0f, 4.0f, 5.0f};
+  PrepareInterpreter(GetPrepareFunction<__LINE__>(), input);
+
+  ASSERT_TRUE(Invoke());
+  ASSERT_THAT(GetValues(12), ElementsAreArray(Apply(input, [](float in) {
+                return (4 * in + 4) * (in + 1);
+              })));
+}
+
+// A tensor is deemed forwardable but it happens to be the input to
+// more than one subgraph. It should not be forwarded, otherwise its
+// contents will be overwritten.
+TEST_F(MultipleSubgraphsTest, DoNotForwardInputTensors) {
+  AddTensors(kMaxTensors, {kInput, kOnes, kTwos}, {12}, kTfLiteFloat32, {3});
+
+  // Only TF tensors can be forwarded, so we build a small first graph
+  // to produce tensor #10.
+  AddTfOp(testing::kAdd, {0, kOnes}, {3});
+  AddTfOp(testing::kAdd, {0, kOnes}, {10});
+
+  // The second TF graph, separated from the former by a TF Lite
+  // multiplication, will consume tensor #10 and will think it is forwardable
+  // because it is used by a single op. However, the subgraph doesn't have
+  // enough information to make that judgment, as the input tensor could be
+  // used by another graph further downstream. The existing code will forward
+  // the tensor and remove it from the buffer_map, causing a failure later.
+  AddTfLiteMulOp({3, kTwos}, {4});
+  AddTfOp(testing::kAdd, {10, 4}, {11});
+  AddTfOp(testing::kAdd, {11, 4}, {7});
+
+  // And a simple TF Lite op trying to access tensor #10, which was removed
+  // from the buffer map. It will cause Invoke() to fail.
+  AddTfLiteMulOp({10, 7}, {12});
+
+  auto input = {3.0f, 4.0f, 5.0f};
+  PrepareInterpreter(GetPrepareFunction<__LINE__>(), input);
+
+  ASSERT_TRUE(Invoke());
+  ASSERT_THAT(GetValues(12), ElementsAreArray(Apply(input, [](float in) {
+                return (5 * in + 5) * (in + 1);
+              })));
+}
+
 }  // namespace
 }  // namespace flex
 }  // namespace tflite
diff --git a/tensorflow/lite/delegates/flex/test_util.cc b/tensorflow/lite/delegates/flex/test_util.cc
index aa24675a7b1beab8632435debc8dd1fc04f347e7..a67aeef231b497de2b4749b2ce2fdd5edd5c6129 100644
--- a/tensorflow/lite/delegates/flex/test_util.cc
+++ b/tensorflow/lite/delegates/flex/test_util.cc
@@ -90,6 +90,8 @@ void FlexModelTest::AddTensors(int num_tensors, const std::vector<int>& inputs,
 
 void FlexModelTest::AddTfLiteMulOp(const std::vector<int>& inputs,
                                    const std::vector<int>& outputs) {
+  ++next_op_index_;
+
   static TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr};
   reg.builtin_code = BuiltinOperator_MUL;
   reg.prepare = [](TfLiteContext* context, TfLiteNode* node) {
@@ -114,6 +116,9 @@ void FlexModelTest::AddTfLiteMulOp(const std::vector<int>& inputs,
 
 void FlexModelTest::AddTfOp(TfOpType op, const std::vector<int>& inputs,
                             const std::vector<int>& outputs) {
+  tf_ops_.push_back(next_op_index_);
+  ++next_op_index_;
+
   auto attr = [](const string& key, const string& value) {
     return " attr{ key: '" + key + "' value {" + value + "}}";
   };
diff --git a/tensorflow/lite/delegates/flex/test_util.h b/tensorflow/lite/delegates/flex/test_util.h
index 2cc2dc30e92586535687187105057d41ab5c0350..1913a406e8388af30ff5ca88f18f03fb75d46c49 100644
--- a/tensorflow/lite/delegates/flex/test_util.h
+++ b/tensorflow/lite/delegates/flex/test_util.h
@@ -103,6 +103,7 @@ class FlexModelTest : public ::testing::Test {
  protected:
   std::unique_ptr<Interpreter> interpreter_;
   TestErrorReporter error_reporter_;
+  std::vector<int> tf_ops_;
 
  private:
   // Helper method to add a TensorFlow op. tflite_names needs to start with
@@ -112,6 +113,8 @@ class FlexModelTest : public ::testing::Test {
                const std::vector<int>& outputs);
 
   std::vector<std::vector<uint8_t>> flexbuffers_;
+
+  int next_op_index_ = 0;
 };
 
 }  // namespace testing
diff --git a/tensorflow/lite/delegates/nnapi/BUILD b/tensorflow/lite/delegates/nnapi/BUILD
index fd954ba222627ab0457711b87baf9c3f7573e129..99cd6d3f859e7645d57f455d5ee06689b4e6c094 100644
--- a/tensorflow/lite/delegates/nnapi/BUILD
+++ b/tensorflow/lite/delegates/nnapi/BUILD
@@ -3,6 +3,7 @@ package(default_visibility = [
 ])
 
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
 
 licenses(["notice"])  # Apache 2.0
 
@@ -15,7 +16,7 @@ cc_library(
         "//tensorflow/lite:kernel_api",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/kernels:kernel_util",
-        "//tensorflow/lite/nnapi:nnapi_lib",
+        "//tensorflow/lite/nnapi:nnapi_implementation",
     ],
 )
 
@@ -23,7 +24,11 @@ tf_cc_test(
     name = "nnapi_delegate_test",
     size = "small",
     srcs = ["nnapi_delegate_test.cc"],
-    tags = ["no_oss"],
+    tags = [
+        # TODO(b/122987564): Enable on Android after resolving API 27 failures.
+        "tflite_not_portable_android",
+        "tflite_not_portable_ios",
+    ],
     deps = [
         ":nnapi_delegate",
         "//tensorflow/lite:framework",
@@ -32,3 +37,5 @@ tf_cc_test(
         "@com_google_googletest//:gtest",
     ],
 )
+
+tflite_portable_test_suite()
diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
index 4fe07004a82ff30228d866bcc7a90067e5940aca..86fe7c5140ec7261ca2bcb136000231e2f2437f3 100644
--- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
+++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <cstdarg>
+#include <cstring>
 #include <iostream>
 #include <memory>
 #include <vector>
@@ -24,11 +25,13 @@ limitations under the License.
 #include "tensorflow/lite/context_util.h"
 #include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
-#include "tensorflow/lite/nnapi/NeuralNetworksShim.h"
+#include "tensorflow/lite/nnapi/nnapi_implementation.h"
 
 #ifdef __ANDROID__
-#include <sys/mman.h>
 #include <sys/system_properties.h>
+#endif
+#if defined __ANDROID__ || defined __unix__
+#include <sys/mman.h>
 #include <unistd.h>
 #endif
 
@@ -37,72 +40,97 @@ namespace {
 
 // TODO(b/80621585): Consider printing error string, but don't for now to
 // minimize binary size.
-#define CHECK_NN(context, code)                                           \
-  if (code != ANEURALNETWORKS_NO_ERROR) {                                 \
-    context->ReportError(context, "NN API returned error (%d).\n", code); \
-    return kTfLiteError;                                                  \
-  }
+#define RETURN_TFLITE_ERROR_IF_NN_ERROR(context, code)                        \
+  do {                                                                        \
+    const auto _code = (code);                                                \
+    if (_code != ANEURALNETWORKS_NO_ERROR) {                                  \
+      context->ReportError(context, "NN API returned error (%d, line %d).\n", \
+                           _code, __LINE__);                                  \
+      return kTfLiteError;                                                    \
+    }                                                                         \
+  } while (0)
 
 namespace {
-int32_t GetAndroidSdkVersion() {
-#ifdef __ANDROID__
-  const char* sdkProp = "ro.build.version.sdk";
-  char sdkVersion[PROP_VALUE_MAX];
-  int length = __system_property_get(sdkProp, sdkVersion);
-  if (length != 0) {
-    for (int i = 0; i < length; ++i) {
-      int digit = sdkVersion[i] - '0';
-      if (digit < 0 || digit > 9) {
-        // Non-numeric SDK version, assume it's higher then expected;
-        return std::numeric_limits<int32_t>::max();
-      }
+
+bool IsFloat(TfLiteType type) {
+  switch (type) {
+    case kTfLiteFloat32:
+      return true;
+    default:
+      return false;
+  }
+}
+
+bool IsQuantized(TfLiteType type) {
+  switch (type) {
+    case kTfLiteUInt8:
+    case kTfLiteInt8:
+    case kTfLiteInt16:
+      return true;
+    default:
+      return false;
+  }
+}
+
+bool IsHybridOperator(const TfLiteContext* context, int builtin_code,
+                      const TfLiteNode* node) {
+  switch (builtin_code) {
+    case kTfLiteBuiltinConv2d:
+    case kTfLiteBuiltinFullyConnected: {
+      const int input_id = node->inputs->data[0];
+      const int filter_id = node->inputs->data[1];
+      const TfLiteType input_type = context->tensors[input_id].type;
+      const TfLiteType filter_type = context->tensors[filter_id].type;
+      return IsFloat(input_type) && IsQuantized(filter_type);
     }
-    return atoi(sdkVersion);
+    default:
+      return false;
   }
-#endif  // __ANDROID__
-  return 0;
 }
 
 constexpr int32_t kMinSdkVersionForNNAPI = 27;
 constexpr int32_t kMinSdkVersionForNNAPI11 = 28;
-static const int32_t kAndroidSdkVersion = GetAndroidSdkVersion();
+constexpr int32_t kMinSdkVersionForNNAPI12 = 29;
 
 }  // namespace
 
 // RAII NN API Model Destructor for use with std::unique_ptr
 struct NNFreeModel {
   void operator()(ANeuralNetworksModel* model) {
-    ANeuralNetworksModel_free(model);
+    NnApiImplementation()->ANeuralNetworksModel_free(model);
   }
 };
 // RAII NN API Compilation Destructor for use with std::unique_ptr
 struct NNFreeCompilation {
   void operator()(ANeuralNetworksCompilation* model) {
-    ANeuralNetworksCompilation_free(model);
+    NnApiImplementation()->ANeuralNetworksCompilation_free(model);
   }
 };
 
 // Manage NNAPI shared memory handle
 class NNMemory {
  public:
-  NNMemory(const char* name, size_t size) {
-#ifdef __ANDROID__
+#if defined __ANDROID__ || defined __unix__
+  NNMemory(const NnApi* nnapi, const char* name, size_t size) {
+    nnapi_ = nnapi;
     byte_size_ = size;
-    fd_ = ASharedMemory_create(name, size);
+    fd_ = nnapi_->ASharedMemory_create(name, size);
     data_ptr_ = reinterpret_cast<uint8_t*>(
         mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd_, 0));
-    ANeuralNetworksMemory_createFromFd(size, PROT_READ | PROT_WRITE, fd_, 0,
-                                       &nn_memory_handle_);
-#endif
+    nnapi_->ANeuralNetworksMemory_createFromFd(size, PROT_READ | PROT_WRITE,
+                                               fd_, 0, &nn_memory_handle_);
   }
+#else
+  NNMemory(const NnApi* /*nnapi*/, const char* /*name*/, size_t /*size*/) {}
+#endif
 
   ~NNMemory() {
-#ifdef __ANDROID__
+#if defined __ANDROID__ || defined __unix__
     if (data_ptr_) {
       munmap(data_ptr_, byte_size_);
     }
     if (nn_memory_handle_) {
-      ANeuralNetworksMemory_free(nn_memory_handle_);
+      nnapi_->ANeuralNetworksMemory_free(nn_memory_handle_);
     }
     if (fd_ > 0) close(fd_);
 #endif
@@ -112,7 +140,8 @@ class NNMemory {
   uint8_t* get_data_ptr() { return data_ptr_; }
 
  private:
-#ifdef __ANDROID__
+#if defined __ANDROID__ || defined __unix__
+  const NnApi* nnapi_;
   int fd_ = 0;
   size_t byte_size_ = 0;
 #endif
@@ -157,15 +186,42 @@ class OperandMapping {
   std::vector<int> lite_tensor_to_ann_tensor_;
 };
 
+class DequantizeMapping {
+ public:
+  int DequantizedAnnIndex(int ann_index, TfLiteType type) const {
+    for (const auto& element : mapping_) {
+      if (ann_index == std::get<0>(element) && type == std::get<1>(element)) {
+        return std::get<2>(element);
+      }
+    }
+    return -1;
+  }
+
+  void Add(int ann_index, TfLiteType type, int dequantized_ann_index) {
+    // This assumes it is not already mapped.
+    mapping_.emplace_back(ann_index, type, dequantized_ann_index);
+  }
+
+ private:
+  // Each tuple specifies the ANN (quantized) tensor index, the desired
+  // floating-point type and the matching ANN (dequantized) tensor index. This
+  // could use a map but instead std::vector is used to keep code size lower.
+  std::vector<std::tuple<int, TfLiteType, int>> mapping_;
+};
+
 // Abstract builder for building an op in the NN API graph. This handles
 // the disparity between TFLite and NN API operand types. NN API has singular
 // operands for both tensors and parameters, and TFLite separates the two.
 class NNAPIOpBuilder {
  public:
-  NNAPIOpBuilder(TfLiteContext* context, OperandMapping* tensor_mapping,
+  NNAPIOpBuilder(const NnApi* nnapi, TfLiteContext* context,
+                 OperandMapping* tensor_mapping,
+                 DequantizeMapping* dequantize_mapping,
                  ANeuralNetworksModel* nn_model)
-      : context_(context),
+      : nnapi_(nnapi),
+        context_(context),
         operand_mapping_(tensor_mapping),
+        dequantize_mapping_(dequantize_mapping),
         nn_model_(nn_model) {}
 
   TfLiteStatus AddScalarInt32Operand(int32_t value) {
@@ -199,48 +255,129 @@ class NNAPIOpBuilder {
     return kTfLiteOk;
   }
 
-  TfLiteStatus AddTensorInput(int tensor_index) {
-    int ann_index;
-    TF_LITE_ENSURE_STATUS(AddTensor(tensor_index, &ann_index));
-    augmented_inputs_.push_back(ann_index);
-    return kTfLiteOk;
+  TfLiteStatus AddTensorInput(int tensor_index, bool hybrid_op) {
+    return AddTensor(tensor_index, hybrid_op, &augmented_inputs_);
   }
 
   TfLiteStatus AddTensorOutput(int tensor_index) {
-    int ann_index;
-    TF_LITE_ENSURE_STATUS(AddTensor(tensor_index, &ann_index));
-    augmented_outputs_.push_back(ann_index);
-    return kTfLiteOk;
+    return AddTensor(tensor_index, /*hybrid_op=*/false, &augmented_outputs_);
   }
 
   TfLiteStatus AddAdditionalFloat32OutputTensor(uint32_t dimension_count) {
     std::vector<uint32_t> dims(dimension_count, 0);
-    ANeuralNetworksOperandType operand_type{
-        .type = ANEURALNETWORKS_TENSOR_FLOAT32,
-        .dimensionCount = dimension_count,
-        .dimensions = dims.data()};
-    CHECK_NN(context_,
-             ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
-    int ann_operand = operand_mapping_->add_new_non_tensor_operand();
-    augmented_outputs_.push_back(ann_operand);
-    return kTfLiteOk;
+    return AddFloat32OutputTensor(dimension_count, dims.data(), nullptr);
   }
 
   TfLiteStatus AddStateFloat32Tensor(int tensor_index,
                                      int* ann_tensor_index_out) {
     TfLiteTensor* tensor = &context_->tensors[tensor_index];
-    int ann_index = operand_mapping_->add_new_non_tensor_operand();
+    return AddFloat32OutputTensor(
+        tensor->dims->size, reinterpret_cast<uint32_t*>(tensor->dims->data),
+        ann_tensor_index_out);
+  }
+
+  // Adds a Dequantize operator and replaces the input tensor index with the
+  // dequantized version. If the dequantized version of the operator already
+  // exists then it is not added again.
+  TfLiteStatus AddDequantize(int nn_input_index, int lite_index,
+                             TfLiteType dequantized_type) {
+    const int ann_index = operand_mapping_->lite_index_to_ann(lite_index);
+    int dequantized_ann_index =
+        dequantize_mapping_->DequantizedAnnIndex(ann_index, dequantized_type);
+
+    if (dequantized_ann_index == -1) {
+      // The dequantized version does not exist yet, it has to be added: a new
+      // Dequantize operation is added, yielding a new tensor.
+      const TfLiteTensor& tensor = context_->tensors[lite_index];
+      ANeuralNetworksOperandType operand_type{
+          dequantized_type, static_cast<uint32_t>(tensor.dims->size),
+          reinterpret_cast<uint32_t*>(tensor.dims->data), 0.f, 0};
+      RETURN_TFLITE_ERROR_IF_NN_ERROR(
+          context_,
+          nnapi_->ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
+      dequantized_ann_index = operand_mapping_->add_new_non_tensor_operand();
+
+      // Add Dequantize operation.
+      const uint32_t dequantize_input[1] = {static_cast<uint32_t>(ann_index)};
+      const uint32_t dequantize_output[1] = {
+          static_cast<uint32_t>(dequantized_ann_index)};
+      RETURN_TFLITE_ERROR_IF_NN_ERROR(
+          context_, nnapi_->ANeuralNetworksModel_addOperation(
+                        nn_model_, ANEURALNETWORKS_DEQUANTIZE, 1,
+                        dequantize_input, 1, dequantize_output));
+      dequantize_mapping_->Add(ann_index, dequantized_type,
+                               dequantized_ann_index);
+    }
+
+    // The input for the original operation is modified so that the operation
+    // now uses the dequantized tensor as input.
+    augmented_inputs_[nn_input_index] = dequantized_ann_index;
+
+    return kTfLiteOk;
+  }
+
+  // Finish emitting the op (of type `type`) into the NN API.
+  TfLiteStatus FinalizeAddOperation(ANeuralNetworksOperationType type) {
+    // Actually add a NN API operation
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context_,
+        nnapi_->ANeuralNetworksModel_addOperation(
+            nn_model_, type, static_cast<uint32_t>(augmented_inputs_.size()),
+            augmented_inputs_.data(),
+            static_cast<uint32_t>(augmented_outputs_.size()),
+            augmented_outputs_.data()));
+    augmented_inputs_.clear();
+    augmented_outputs_.clear();
+    return kTfLiteOk;
+  }
 
+ private:
+  template <typename T>
+  TfLiteStatus AddScalarOperand(T value, int32_t nn_type) {
+    ANeuralNetworksOperandType operand_type{.type = nn_type};
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context_,
+        nnapi_->ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
+    const int ann_index = operand_mapping_->add_new_non_tensor_operand();
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context_, nnapi_->ANeuralNetworksModel_setOperandValue(
+                      nn_model_, ann_index, &value, sizeof(T)));
+    augmented_inputs_.push_back(ann_index);
+    return kTfLiteOk;
+  }
+
+  template <typename T>
+  TfLiteStatus AddVectorOperand(const T* values, uint32_t num_values,
+                                int32_t nn_type) {
     ANeuralNetworksOperandType operand_type{
-        ANEURALNETWORKS_TENSOR_FLOAT32,
-        static_cast<uint32_t>(tensor->dims->size),
-        reinterpret_cast<uint32_t*>(tensor->dims->data), tensor->params.scale,
-        tensor->params.zero_point};
-    CHECK_NN(context_,
-             ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
-    augmented_outputs_.push_back(ann_index);
+        .type = nn_type, .dimensionCount = 1, .dimensions = &num_values};
+
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context_,
+        nnapi_->ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
+
+    const int ann_index = operand_mapping_->add_new_non_tensor_operand();
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context_, nnapi_->ANeuralNetworksModel_setOperandValue(
+                      nn_model_, ann_index, values, sizeof(T) * num_values));
+    augmented_inputs_.push_back(ann_index);
+    return kTfLiteOk;
+  }
 
-    *ann_tensor_index_out = ann_index;
+  TfLiteStatus AddFloat32OutputTensor(uint32_t dimension_count,
+                                      const uint32_t* dimension_data,
+                                      int* ann_index_out) {
+    ANeuralNetworksOperandType operand_type{
+        .type = ANEURALNETWORKS_TENSOR_FLOAT32,
+        .dimensionCount = dimension_count,
+        .dimensions = dimension_data,
+    };
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context_,
+        nnapi_->ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
+    const int ann_index = operand_mapping_->add_new_non_tensor_operand();
+    augmented_outputs_.push_back(ann_index);
+    if (ann_index_out) *ann_index_out = ann_index;
     return kTfLiteOk;
   }
 
@@ -248,10 +385,11 @@ class NNAPIOpBuilder {
   // This returns the NN API tensor index corresponding to the created tensor.
   // If another caller previously created a NN API tensor for `tensor_index`
   // then the existing one is returned.
-  TfLiteStatus AddTensor(int tensor_index, int* ann_tensor_index_out) {
+  TfLiteStatus AddTensor(int tensor_index, bool hybrid_op,
+                         std::vector<uint32_t>* indices) {
     int ann_tensor_index = operand_mapping_->lite_index_to_ann(tensor_index);
     if (ann_tensor_index != -1) {
-      *ann_tensor_index_out = ann_tensor_index;
+      indices->push_back(ann_tensor_index);
       return kTfLiteOk;
     }
     // Allocate a new tensor index
@@ -262,11 +400,17 @@ class NNAPIOpBuilder {
     float scale = 0.0f;
     int32_t zeroPoint = 0;
     TfLiteTensor* tensor = &context_->tensors[tensor_index];
-    switch (tensor->type) {
+    TfLiteType tensor_type = tensor->type;
+    if (hybrid_op && (tensor_type == kTfLiteUInt8)) {
+      // For legacy reason, UINT8 weights in hybrid operators are actually INT8
+      // values and should be interpreted as such.
+      tensor_type = kTfLiteInt8;
+    }
+    switch (tensor_type) {
       case kTfLiteNoType:
         // Tensors added during initialization of Ops don't have a type yet and
         // should not be registered with the NNAPI.
-        *ann_tensor_index_out = -1;
+        indices->push_back(-1);
         return kTfLiteOk;
       case kTfLiteFloat32:
         nn_type = ANEURALNETWORKS_TENSOR_FLOAT32;
@@ -280,6 +424,10 @@ class NNAPIOpBuilder {
           scale = 1;
         }
         break;
+      case kTfLiteInt8:
+        nn_type = ANEURALNETWORKS_TENSOR_QUANT8_SYMM;
+        scale = tensor->params.scale;
+        break;
       case kTfLiteInt32:
         nn_type = ANEURALNETWORKS_TENSOR_INT32;
         scale = tensor->params.scale;
@@ -293,75 +441,43 @@ class NNAPIOpBuilder {
     ANeuralNetworksOperandType operand_type{
         nn_type, static_cast<uint32_t>(tensor->dims->size),
         reinterpret_cast<uint32_t*>(tensor->dims->data), scale, zeroPoint};
-    CHECK_NN(context_,
-             ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context_,
+        nnapi_->ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
 
     if (tensor->allocation_type == kTfLiteMmapRo) {
       // TODO(b/80630405): Use NNAPIAllocation.
-      CHECK_NN(context_, ANeuralNetworksModel_setOperandValue(
-                             nn_model_, ann_tensor_index, tensor->data.raw,
-                             tensor->bytes));
+      RETURN_TFLITE_ERROR_IF_NN_ERROR(
+          context_,
+          nnapi_->ANeuralNetworksModel_setOperandValue(
+              nn_model_, ann_tensor_index, tensor->data.raw, tensor->bytes));
     }
 
-    *ann_tensor_index_out = ann_tensor_index;
+    indices->push_back(ann_tensor_index);
     return kTfLiteOk;
   }
 
-  // Finish emitting the op (of type `type`) into the NN API.
-  TfLiteStatus FinalizeAddOperation(ANeuralNetworksOperationType type) {
-    // Actually add a NN API operation
-    CHECK_NN(context_, ANeuralNetworksModel_addOperation(
-                           nn_model_, type,
-                           static_cast<uint32_t>(augmented_inputs_.size()),
-                           augmented_inputs_.data(),
-                           static_cast<uint32_t>(augmented_outputs_.size()),
-                           augmented_outputs_.data()));
-    augmented_inputs_.clear();
-    augmented_outputs_.clear();
-    return kTfLiteOk;
-  }
+  // Access to NNAPI.
+  const NnApi* const nnapi_;
 
- private:
-  template <typename T>
-  TfLiteStatus AddScalarOperand(T value, int32_t nn_type) {
-    ANeuralNetworksOperandType operand_type{.type = nn_type};
-    CHECK_NN(context_,
-             ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
-    int ann_operand = operand_mapping_->add_new_non_tensor_operand();
-    CHECK_NN(context_, ANeuralNetworksModel_setOperandValue(
-                           nn_model_, ann_operand, &value, sizeof(T)));
-    augmented_inputs_.push_back(ann_operand);
-    return kTfLiteOk;
-  }
-
-  template <typename T>
-  TfLiteStatus AddVectorOperand(const T* values, uint32_t num_values,
-                                int32_t nn_type) {
-    ANeuralNetworksOperandType operand_type{
-        .type = nn_type, .dimensionCount = 1, .dimensions = &num_values};
-    CHECK_NN(context_,
-             ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
-    int ann_operand = operand_mapping_->add_new_non_tensor_operand();
-    CHECK_NN(context_,
-             ANeuralNetworksModel_setOperandValue(
-                 nn_model_, ann_operand, values, sizeof(T) * num_values));
-    augmented_inputs_.push_back(ann_operand);
-    return kTfLiteOk;
-  }
+  // TfLiteContext for error handling.
+  TfLiteContext* const context_;
 
-  // TfLiteContext for error handling. Must be named context for macros to
-  // work.
-  TfLiteContext* context_;
+  // Tracks relationship between indices.
+  OperandMapping* const operand_mapping_;
 
-  // Tracks relationship between indices
-  OperandMapping* operand_mapping_;
+  // Keeps mapping of ANN quantized tensor and float data type to equivalent
+  // dequantized ANN tensor. For example, tensor #4 (UINT8) + FLOAT32 could map
+  // to tensor #10 (FLOAT32) because a DEQUANTIZE operator was added to convert
+  // tensor #4 to a FLOAT32 tensor.
+  DequantizeMapping* const dequantize_mapping_;
 
-  // The model
-  ANeuralNetworksModel* nn_model_;
+  // The NNAPI model.
+  ANeuralNetworksModel* const nn_model_;
 
   // Inputs and outputs for the current op. These are augmented in the sense
   // that NN API uses operands for all arguments, not just tensors, unlike
-  // TensorFlow lite.
+  // TensorFlow Lite.
   std::vector<uint32_t> augmented_inputs_;
   std::vector<uint32_t> augmented_outputs_;
 };
@@ -374,10 +490,18 @@ struct NNAPIOpMappingArgs {
   std::vector<int>* model_state_tfl_inputs;
 };
 
+// Mapping function simply returning the operation type without adding any
+// additional parameter.
+template <ANeuralNetworksOperationType OperationType>
+ANeuralNetworksOperationType BasicMappingFn(
+    const NNAPIOpMappingArgs& mapping_args) {
+  return OperationType;
+}
+
 // The kernel that represents the node sub set of TF Lite being run on NN API.
 class NNAPIDelegateKernel {
  public:
-  NNAPIDelegateKernel() = default;
+  NNAPIDelegateKernel() { nnapi_ = NnApiImplementation(); }
 
   typedef ANeuralNetworksOperationType (*MappingFn)(
       const NNAPIOpMappingArgs& mapping_args);
@@ -385,8 +509,9 @@ class NNAPIDelegateKernel {
   // Return a function that knows how to translate a node into its operands
   // when called. You can use this function to see if a node is supported
   // (i.e. that MappingFn is not nullptr).
-  MappingFn Map(TfLiteContext* context, int builtin_code, int version,
-                TfLiteNode* node) {
+  static MappingFn Map(const TfLiteContext* context, int builtin_code,
+                       int version, int android_sdk_version,
+                       const TfLiteNode* node) {
     switch (builtin_code) {
       case kTfLiteBuiltinAdd:
         if (version == 1) {
@@ -397,8 +522,6 @@ class NNAPIDelegateKernel {
             mapping_args.builder->AddScalarInt32Operand(builtin->activation);
             return ANEURALNETWORKS_ADD;
           };
-        } else {
-          return nullptr;
         }
         break;
       case kTfLiteBuiltinMul:
@@ -410,8 +533,6 @@ class NNAPIDelegateKernel {
             mapping_args.builder->AddScalarInt32Operand(builtin->activation);
             return ANEURALNETWORKS_MUL;
           };
-        } else {
-          return nullptr;
         }
         break;
       case kTfLiteBuiltinAveragePool2d:
@@ -422,8 +543,6 @@ class NNAPIDelegateKernel {
                 mapping_args.node->builtin_data);
             return ANEURALNETWORKS_AVERAGE_POOL_2D;
           };
-        } else {
-          return nullptr;
         }
         break;
       case kTfLiteBuiltinMaxPool2d:
@@ -434,8 +553,6 @@ class NNAPIDelegateKernel {
                 mapping_args.node->builtin_data);
             return ANEURALNETWORKS_MAX_POOL_2D;
           };
-        } else {
-          return nullptr;
         }
         break;
       case kTfLiteBuiltinL2Pool2d:
@@ -446,12 +563,15 @@ class NNAPIDelegateKernel {
                 mapping_args.node->builtin_data);
             return ANEURALNETWORKS_L2_POOL_2D;
           };
-        } else {
-          return nullptr;
         }
         break;
       case kTfLiteBuiltinConv2d:
         if (version == 1) {
+          if (android_sdk_version < kMinSdkVersionForNNAPI12 &&
+              IsHybridOperator(context, builtin_code, node)) {
+            // Hybrid operators not supported before NNAPI 1.2.
+            return nullptr;
+          }
           auto builtin =
               reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
           if (builtin->dilation_width_factor != 1 ||
@@ -469,8 +589,6 @@ class NNAPIDelegateKernel {
             mapping_args.builder->AddScalarInt32Operand(builtin->activation);
             return ANEURALNETWORKS_CONV_2D;
           };
-        } else {
-          return nullptr;
         }
         break;
       case kTfLiteBuiltinDepthwiseConv2d:
@@ -487,12 +605,15 @@ class NNAPIDelegateKernel {
             mapping_args.builder->AddScalarInt32Operand(builtin->activation);
             return ANEURALNETWORKS_DEPTHWISE_CONV_2D;
           };
-        } else {
-          return nullptr;
         }
         break;
       case kTfLiteBuiltinFullyConnected:
         if (version == 1) {
+          if (android_sdk_version < kMinSdkVersionForNNAPI12 &&
+              IsHybridOperator(context, builtin_code, node)) {
+            // Hybrid operators not supported before NNAPI 1.2.
+            return nullptr;
+          }
           return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
             auto builtin = reinterpret_cast<TfLiteFullyConnectedParams*>(
@@ -500,8 +621,6 @@ class NNAPIDelegateKernel {
             mapping_args.builder->AddScalarInt32Operand(builtin->activation);
             return ANEURALNETWORKS_FULLY_CONNECTED;
           };
-        } else {
-          return nullptr;
         }
         break;
       case kTfLiteBuiltinSoftmax:
@@ -513,22 +632,15 @@ class NNAPIDelegateKernel {
             mapping_args.builder->AddScalarFloat32Operand(builtin->beta);
             return ANEURALNETWORKS_SOFTMAX;
           };
-        } else {
-          return nullptr;
         }
         break;
       case kTfLiteBuiltinReshape:
         if (version == 1 && node->inputs->size == 2) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            return ANEURALNETWORKS_RESHAPE;
-          };
-        } else {
-          return nullptr;
+          return BasicMappingFn<ANEURALNETWORKS_RESHAPE>;
         }
         break;
       case kTfLiteBuiltinSqueeze:
-        if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11) {
+        if (version == 1 && android_sdk_version >= kMinSdkVersionForNNAPI11) {
           return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
             auto builtin = reinterpret_cast<TfLiteSqueezeParams*>(
@@ -540,20 +652,15 @@ class NNAPIDelegateKernel {
                 static_cast<uint32_t>(builtin->num_squeeze_dims));
             return ANEURALNETWORKS_SQUEEZE;
           };
-        } else {
-          return nullptr;
         }
+        break;
       case kTfLiteBuiltinL2Normalization: {
         auto builtin =
             reinterpret_cast<TfLiteL2NormParams*>(node->builtin_data);
-        if (builtin->activation != kTfLiteActNone) {
-          // NNAPI does not support activations
-          return nullptr;
+        if (builtin->activation == kTfLiteActNone) {
+          return BasicMappingFn<ANEURALNETWORKS_L2_NORMALIZATION>;
         }
-        return [](const NNAPIOpMappingArgs& mapping_args)
-                   -> ANeuralNetworksOperationType {
-          return ANEURALNETWORKS_L2_NORMALIZATION;
-        };
+        break;
       }
       case kTfLiteBuiltinLocalResponseNormalization:
         if (version == 1) {
@@ -567,10 +674,6 @@ class NNAPIDelegateKernel {
             mapping_args.builder->AddScalarFloat32Operand(builtin->beta);
             return ANEURALNETWORKS_LOCAL_RESPONSE_NORMALIZATION;
           };
-        } else {
-          // TODO(miaowang): clean-up code and return early in the unsupported
-          // case.
-          return nullptr;
         }
         break;
       case kTfLiteBuiltinLshProjection:
@@ -587,8 +690,6 @@ class NNAPIDelegateKernel {
             mapping_args.builder->AddScalarInt32Operand(builtin->type);
             return ANEURALNETWORKS_LSH_PROJECTION;
           };
-        } else {
-          return nullptr;
         }
         break;
       case kTfLiteBuiltinConcatenation:
@@ -599,7 +700,7 @@ class NNAPIDelegateKernel {
             // NNAPI only support concatenating quantized tensor of the same
             // scale and offset.
             auto first_param = context->tensors[node->inputs->data[0]].params;
-            for (int i = 0; i < node->inputs->size; i++) {
+            for (int i = 1; i < node->inputs->size; i++) {
               auto curr_param = context->tensors[node->inputs->data[i]].params;
               if (curr_param.scale != first_param.scale ||
                   curr_param.zero_point != first_param.zero_point) {
@@ -614,68 +715,36 @@ class NNAPIDelegateKernel {
             mapping_args.builder->AddScalarInt32Operand(builtin->axis);
             return ANEURALNETWORKS_CONCATENATION;
           };
-        } else {
-          return nullptr;
         }
         break;
       case kTfLiteBuiltinDequantize:
         if (version == 1) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            return ANEURALNETWORKS_DEQUANTIZE;
-          };
-        } else {
-          return nullptr;
+          return BasicMappingFn<ANEURALNETWORKS_DEQUANTIZE>;
         }
         break;
       case kTfLiteBuiltinFloor:
         if (version == 1) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            return ANEURALNETWORKS_FLOOR;
-          };
-        } else {
-          return nullptr;
+          return BasicMappingFn<ANEURALNETWORKS_FLOOR>;
         }
         break;
       case kTfLiteBuiltinRelu:
         if (version == 1) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            return ANEURALNETWORKS_RELU;
-          };
-        } else {
-          return nullptr;
+          return BasicMappingFn<ANEURALNETWORKS_RELU>;
         }
         break;
       case kTfLiteBuiltinReluN1To1:
         if (version == 1) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            return ANEURALNETWORKS_RELU1;
-          };
-        } else {
-          return nullptr;
+          return BasicMappingFn<ANEURALNETWORKS_RELU1>;
         }
         break;
       case kTfLiteBuiltinRelu6:
         if (version == 1) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            return ANEURALNETWORKS_RELU6;
-          };
-        } else {
-          return nullptr;
+          return BasicMappingFn<ANEURALNETWORKS_RELU6>;
         }
         break;
       case kTfLiteBuiltinLogistic:
         if (version == 1) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            return ANEURALNETWORKS_LOGISTIC;
-          };
-        } else {
-          return nullptr;
+          return BasicMappingFn<ANEURALNETWORKS_LOGISTIC>;
         }
         break;
       case kTfLiteBuiltinTanh:
@@ -683,16 +752,11 @@ class NNAPIDelegateKernel {
         if (version == 1 &&
             context->tensors[node->inputs->data[0]].type == kTfLiteFloat32) {
           // NNAPI only support float tanh.
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            return ANEURALNETWORKS_TANH;
-          };
-        } else {
-          return nullptr;
+          return BasicMappingFn<ANEURALNETWORKS_TANH>;
         }
         break;
       case kTfLiteBuiltinSub:
-        if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11 &&
+        if (version == 1 && android_sdk_version >= kMinSdkVersionForNNAPI11 &&
             context->tensors[node->inputs->data[0]].type == kTfLiteFloat32) {
           // NNAPI only support float sub.
           return [](const NNAPIOpMappingArgs& mapping_args)
@@ -702,12 +766,10 @@ class NNAPIDelegateKernel {
             mapping_args.builder->AddScalarInt32Operand(builtin->activation);
             return ANEURALNETWORKS_SUB;
           };
-        } else {
-          return nullptr;
         }
         break;
       case kTfLiteBuiltinDiv:
-        if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11 &&
+        if (version == 1 && android_sdk_version >= kMinSdkVersionForNNAPI11 &&
             context->tensors[node->inputs->data[0]].type == kTfLiteFloat32) {
           // NNAPI only support float div.
           return [](const NNAPIOpMappingArgs& mapping_args)
@@ -717,37 +779,25 @@ class NNAPIDelegateKernel {
             mapping_args.builder->AddScalarInt32Operand(builtin->activation);
             return ANEURALNETWORKS_DIV;
           };
-        } else {
-          return nullptr;
         }
         break;
       case kTfLiteBuiltinPad:
-        if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11 &&
+        if (version == 1 && android_sdk_version >= kMinSdkVersionForNNAPI11 &&
             node->inputs->size == 2 &&
             context->tensors[node->inputs->data[0]].type == kTfLiteFloat32) {
           // NNAPI does not support specifying the padding value.
           // NNAPI pads physical zero for quantized tensors, so only delegate
           // float pad to NNAPI.
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            return ANEURALNETWORKS_PAD;
-          };
-        } else {
-          return nullptr;
+          return BasicMappingFn<ANEURALNETWORKS_PAD>;
         }
         break;
       case kTfLiteBuiltinSpaceToBatchNd:
-        if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            return ANEURALNETWORKS_SPACE_TO_BATCH_ND;
-          };
-        } else {
-          return nullptr;
+        if (version == 1 && android_sdk_version >= kMinSdkVersionForNNAPI11) {
+          return BasicMappingFn<ANEURALNETWORKS_SPACE_TO_BATCH_ND>;
         }
         break;
       case kTfLiteBuiltinStridedSlice:
-        if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11) {
+        if (version == 1 && android_sdk_version >= kMinSdkVersionForNNAPI11) {
           return [](const NNAPIOpMappingArgs& mapping_args)
                      -> ANeuralNetworksOperationType {
             auto builtin = reinterpret_cast<TfLiteStridedSliceParams*>(
@@ -758,8 +808,6 @@ class NNAPIDelegateKernel {
                 builtin->shrink_axis_mask);
             return ANEURALNETWORKS_STRIDED_SLICE;
           };
-        } else {
-          return nullptr;
         }
         break;
       case kTfLiteBuiltinTranspose:
@@ -767,16 +815,11 @@ class NNAPIDelegateKernel {
         // dimensions.
         // TODO(b/110888333): Support dynamically-sized tensors in delegates.
         if ((version == 1) &&
-            (kAndroidSdkVersion >= kMinSdkVersionForNNAPI11) &&
+            (android_sdk_version >= kMinSdkVersionForNNAPI11) &&
             (node->inputs->size > 1) &&
             (context->tensors[node->inputs->data[1]].allocation_type ==
              kTfLiteMmapRo)) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            return ANEURALNETWORKS_TRANSPOSE;
-          };
-        } else {
-          return nullptr;
+          return BasicMappingFn<ANEURALNETWORKS_TRANSPOSE>;
         }
         break;
       case kTfLiteBuiltinRnn:
@@ -799,13 +842,13 @@ class NNAPIDelegateKernel {
             mapping_args.builder->AddScalarInt32Operand(builtin->activation);
             return ANEURALNETWORKS_RNN;
           };
-        } else {
-          return nullptr;
         }
         break;
       case kTfLiteBuiltinSvdf:
         // NNAPI only support float32 weights.
+        // Only delegate to NNAPI 1.1, as SVDF does not support rank > 1 on 1.0.
         if (version == 1 && node->inputs->size == 5 &&
+            android_sdk_version >= kMinSdkVersionForNNAPI11 &&
             context->tensors[node->inputs->data[/*kWeightsFeatureTensor*/ 1]]
                     .type == kTfLiteFloat32) {
           return [](const NNAPIOpMappingArgs& mapping_args)
@@ -827,14 +870,15 @@ class NNAPIDelegateKernel {
             mapping_args.builder->AddScalarInt32Operand(builtin->activation);
             return ANEURALNETWORKS_SVDF;
           };
-        } else {
-          return nullptr;
         }
         break;
       case kTfLiteBuiltinLstm:
         // NNAPI only support float32 weights.
+        // Only delegate to NNAPI 1.1,  as 1.0 has a bug for optional tensors
+        // which would affect LSTM.
         // TODO(miaowang): add loggings to indicate why the op is rejected.
         if (version == 1 && node->inputs->size == 20 &&
+            android_sdk_version >= kMinSdkVersionForNNAPI11 &&
             context->tensors[node->inputs
                                  ->data[/*kInputToOutputWeightsTensor*/ 4]]
                     .type == kTfLiteFloat32) {
@@ -870,13 +914,11 @@ class NNAPIDelegateKernel {
 
             return ANEURALNETWORKS_LSTM;
           };
-        } else {
-          return nullptr;
         }
         break;
       case kTfLiteBuiltinMean:
         // NNAPI does not support generating a scalar as output for MEAN.
-        if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11 &&
+        if (version == 1 && android_sdk_version >= kMinSdkVersionForNNAPI11 &&
             context->tensors[node->inputs->data[0]].type == kTfLiteFloat32 &&
             context->tensors[node->outputs->data[0]].dims->size > 0) {
           return [](const NNAPIOpMappingArgs& mapping_args)
@@ -888,36 +930,27 @@ class NNAPIDelegateKernel {
             mapping_args.builder->AddScalarInt32Operand(keep_dims);
             return ANEURALNETWORKS_MEAN;
           };
-        } else {
-          return nullptr;
         }
+        break;
       case kTfLiteBuiltinEmbeddingLookup:
         // NNAPI only support float32 values.
         if (version == 1 &&
             context->tensors[node->inputs->data[1]].type == kTfLiteFloat32) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            return ANEURALNETWORKS_EMBEDDING_LOOKUP;
-          };
-        } else {
-          return nullptr;
+          return BasicMappingFn<ANEURALNETWORKS_EMBEDDING_LOOKUP>;
         }
         break;
       case kTfLiteBuiltinHashtableLookup:
         // NNAPI only support float32 output.
         if (version == 1 &&
             context->tensors[node->outputs->data[0]].type == kTfLiteFloat32) {
-          return [](const NNAPIOpMappingArgs& mapping_args)
-                     -> ANeuralNetworksOperationType {
-            return ANEURALNETWORKS_HASHTABLE_LOOKUP;
-          };
-        } else {
-          return nullptr;
+          return BasicMappingFn<ANEURALNETWORKS_HASHTABLE_LOOKUP>;
         }
         break;
       default:
+        // All other operators are not mapped.
         return nullptr;
     }
+    return nullptr;
   }
 
   // Initialize the kernel (a NN model).
@@ -929,7 +962,8 @@ class NNAPIDelegateKernel {
 
     if (!nn_model_) {
       ANeuralNetworksModel* model;
-      CHECK_NN(context, ANeuralNetworksModel_create(&model));
+      RETURN_TFLITE_ERROR_IF_NN_ERROR(
+          context, nnapi_->ANeuralNetworksModel_create(&model));
       nn_model_.reset(model);
 
       TF_LITE_ENSURE_STATUS(
@@ -938,9 +972,11 @@ class NNAPIDelegateKernel {
 
     if (!nn_compilation_) {
       ANeuralNetworksCompilation* compilation;
-      CHECK_NN(context, ANeuralNetworksCompilation_create(nn_model_.get(),
-                                                          &compilation));
-      CHECK_NN(context, ANeuralNetworksCompilation_finish(compilation));
+      RETURN_TFLITE_ERROR_IF_NN_ERROR(
+          context, nnapi_->ANeuralNetworksCompilation_create(nn_model_.get(),
+                                                             &compilation));
+      RETURN_TFLITE_ERROR_IF_NN_ERROR(
+          context, nnapi_->ANeuralNetworksCompilation_finish(compilation));
       nn_compilation_.reset(compilation);
     }
     return kTfLiteOk;
@@ -948,8 +984,9 @@ class NNAPIDelegateKernel {
 
   TfLiteStatus Invoke(TfLiteContext* context, TfLiteNode* node) {
     ANeuralNetworksExecution* execution = nullptr;
-    CHECK_NN(context, ANeuralNetworksExecution_create(nn_compilation_.get(),
-                                                      &execution));
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context, nnapi_->ANeuralNetworksExecution_create(nn_compilation_.get(),
+                                                         &execution));
 
     // Set the input tensor buffers. Note: we access tflite tensors using
     // absolute indices but NN api indices inputs by relative indices.
@@ -967,10 +1004,11 @@ class NNAPIDelegateKernel {
         // copy data to pre-allocated shared memory.
         memcpy(nn_input_memory_->get_data_ptr() + input_offset,
                tensor->data.raw, tensor->bytes);
-        CHECK_NN(context, ANeuralNetworksExecution_setInputFromMemory(
-                              execution, relative_input_index, nullptr,
-                              nn_input_memory_->get_handle(), input_offset,
-                              tensor->bytes));
+        RETURN_TFLITE_ERROR_IF_NN_ERROR(
+            context,
+            nnapi_->ANeuralNetworksExecution_setInputFromMemory(
+                execution, relative_input_index, nullptr,
+                nn_input_memory_->get_handle(), input_offset, tensor->bytes));
         input_offset += tensor->bytes;
         relative_input_index++;
       }
@@ -981,10 +1019,11 @@ class NNAPIDelegateKernel {
     size_t output_offset = 0;
     for (auto output_index : TfLiteIntArrayView(node->outputs)) {
       TfLiteTensor* tensor = &context->tensors[output_index];
-      CHECK_NN(context, ANeuralNetworksExecution_setOutputFromMemory(
-                            execution, relative_output_index, nullptr,
-                            nn_output_memory_->get_handle(), output_offset,
-                            tensor->bytes));
+      RETURN_TFLITE_ERROR_IF_NN_ERROR(
+          context,
+          nnapi_->ANeuralNetworksExecution_setOutputFromMemory(
+              execution, relative_output_index, nullptr,
+              nn_output_memory_->get_handle(), output_offset, tensor->bytes));
       output_offset += tensor->bytes;
       relative_output_index++;
     }
@@ -997,17 +1036,21 @@ class NNAPIDelegateKernel {
       // Here we are using a deep copy for state_in tensors so that we are not
       // reading and writing into the same buffer during a invocation.
       // TODO(110369471): using double shared buffer to minimize the copies.
-      CHECK_NN(context, ANeuralNetworksExecution_setOutput(
-                            execution, relative_output_index, nullptr,
-                            tensor->data.raw, tensor->bytes));
+      RETURN_TFLITE_ERROR_IF_NN_ERROR(
+          context, nnapi_->ANeuralNetworksExecution_setOutput(
+                       execution, relative_output_index, nullptr,
+                       tensor->data.raw, tensor->bytes));
       relative_output_index++;
     }
     // Invoke ANN in blocking fashion.
     ANeuralNetworksEvent* event = nullptr;
-    CHECK_NN(context, ANeuralNetworksExecution_startCompute(execution, &event));
-    CHECK_NN(context, ANeuralNetworksEvent_wait(event));
-    ANeuralNetworksEvent_free(event);
-    ANeuralNetworksExecution_free(execution);
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context,
+        nnapi_->ANeuralNetworksExecution_startCompute(execution, &event));
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(context,
+                                    nnapi_->ANeuralNetworksEvent_wait(event));
+    nnapi_->ANeuralNetworksEvent_free(event);
+    nnapi_->ANeuralNetworksExecution_free(execution);
 
     // copy results from shared memory to the destination.
     output_offset = 0;
@@ -1022,6 +1065,8 @@ class NNAPIDelegateKernel {
   }
 
  private:
+  // Access to NNApi.
+  const NnApi* nnapi_;
   // ANN API state.
   std::unique_ptr<ANeuralNetworksModel, NNFreeModel> nn_model_;
   std::unique_ptr<ANeuralNetworksCompilation, NNFreeCompilation>
@@ -1038,18 +1083,69 @@ class NNAPIDelegateKernel {
   std::unique_ptr<NNMemory> nn_input_memory_;
   std::unique_ptr<NNMemory> nn_output_memory_;
 
+  void AddDequantizeOperatorsWhereNeeded(const TfLiteContext* context,
+                                         int builtin_code,
+                                         const TfLiteNode* node,
+                                         NNAPIOpBuilder* builder) {
+    // Depending on the operator and the input data format, Dequantize
+    // operators may need to be added. For example when the input is
+    // floating-point but weights are quantized then the weights will first be
+    // dequantized to the same format as the input before being passed to the
+    // operator.
+
+    // The tensor determining whether the inputs should be floating-point.
+    int input_tensor_index = -1;
+    std::vector<int> inputs_to_potentially_dequantize;
+
+    switch (builtin_code) {
+      case kTfLiteBuiltinConv2d:
+      case kTfLiteBuiltinFullyConnected: {
+        input_tensor_index = 0;
+        // Weights and bias are inputs #1 and #2 respectively and may require
+        // dequantization.
+        inputs_to_potentially_dequantize = {1, 2};
+        break;
+      }
+      default:
+        return;
+    }
+
+    int tensor_id = node->inputs->data[input_tensor_index];
+    if (tensor_id < 0) return;
+
+    // Nothing to do if the input is not floating-point.
+    if (!IsFloat(context->tensors[tensor_id].type)) return;
+
+    for (int i : inputs_to_potentially_dequantize) {
+      tensor_id = node->inputs->data[i];
+      if (tensor_id < 0) continue;  // Ignore optional input.
+
+      const TfLiteType type = context->tensors[tensor_id].type;
+      // Nothing to do for this tensor if it's not quantized.
+      if (type != kTfLiteUInt8) continue;
+
+      // Insert Dequantize operator if it hasn't been done already and change
+      // the node's input accordingly.
+      builder->AddDequantize(i, node->inputs->data[i], type);
+    }
+  }
+
   TfLiteStatus AddOpsAndTensors(TfLiteContext* context) {
-    // The operand builder allows creating a single op. We create it at this
-    // reduced power position rather than in the for loop to avoid reallocating
-    // the vectors.
-    NNAPIOpBuilder builder(context, &operand_mapping_, nn_model_.get());
-    // Add Tensors
-    // allocate outside to avoid realloc
+    DequantizeMapping dequantize_mapping;
+    // The operand builder allows creating a single op. It is created outside
+    // the for loop to avoid reallocating the vectors.
+    NNAPIOpBuilder builder(nnapi_, context, &operand_mapping_,
+                           &dequantize_mapping, nn_model_.get());
+    // Add Tensors.
     for (auto node_index : nodes_) {
       // Obtain the op and registration.
       TfLiteNode* node;
       TfLiteRegistration* reg;
-      context->GetNodeAndRegistration(context, node_index, &node, &reg);
+      TF_LITE_ENSURE_STATUS(
+          context->GetNodeAndRegistration(context, node_index, &node, &reg));
+
+      const bool hybrid_op = IsHybridOperator(context, reg->builtin_code, node);
+
       // Map inputs to NN API tensor indices.
       for (auto input_index : TfLiteIntArrayView(node->inputs)) {
         if (input_index == kOptionalTensor &&
@@ -1061,18 +1157,24 @@ class NNAPIDelegateKernel {
           // tensor when supported by NNAPI.
           TF_LITE_ENSURE_STATUS(builder.AddVectorFloat32Operand(nullptr, 0));
         } else {
-          TF_LITE_ENSURE_STATUS(builder.AddTensorInput(input_index));
+          TF_LITE_ENSURE_STATUS(builder.AddTensorInput(input_index, hybrid_op));
         }
       }
       // Get op type and operands
-      int nn_op_type = Map(context, reg->builtin_code, reg->version, node)(
-          {context, &builder, node, &model_state_outputs_,
-           &model_state_tfl_inputs_});
+      int nn_op_type = Map(
+          context, reg->builtin_code, reg->version, nnapi_->android_sdk_version,
+          node)({context, &builder, node, &model_state_outputs_,
+                 &model_state_tfl_inputs_});
       // Map outputs to NN API tensor indices.
       for (auto output_index : TfLiteIntArrayView(node->outputs)) {
         TF_LITE_ENSURE_STATUS(builder.AddTensorOutput(output_index));
       }
 
+      // Dequantize operators may have to be added in case inputs are to be
+      // floating-point.
+      AddDequantizeOperatorsWhereNeeded(context, reg->builtin_code, node,
+                                        &builder);
+
       builder.FinalizeAddOperation(nn_op_type);
     }
     return kTfLiteOk;
@@ -1090,7 +1192,7 @@ class NNAPIDelegateKernel {
     outputs.reserve(output_tensors->size);
 
     size_t total_input_byte_size = 0;
-    // Make the TensorFlow lite inputs and outputs to ann_indices.
+    // Make the TensorFlow Lite inputs and outputs to ann_indices.
     for (int i : TfLiteIntArrayView(input_tensors)) {
       // Constant tensors are not NNAPI inputs.
       if (i != kOptionalTensor &&
@@ -1106,30 +1208,34 @@ class NNAPIDelegateKernel {
       total_output_byte_size += context->tensors[i].bytes;
     }
 
-    // Add state output tensors as model inputs
+    // Add state output tensors as model outputs.
     for (int i : model_state_outputs_) {
       outputs.push_back(i);
     }
 
     // Tell ANN to declare inputs/outputs
-    CHECK_NN(context, ANeuralNetworksModel_identifyInputsAndOutputs(
-                          nn_model_.get(), inputs.size(), inputs.data(),
-                          outputs.size(), outputs.data()));
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context, nnapi_->ANeuralNetworksModel_identifyInputsAndOutputs(
+                     nn_model_.get(), inputs.size(), inputs.data(),
+                     outputs.size(), outputs.data()));
 
     // Set relaxed computation mode for fp32 if possible.
-    if (kAndroidSdkVersion >= kMinSdkVersionForNNAPI11) {
-      CHECK_NN(context,
-               ANeuralNetworksModel_relaxComputationFloat32toFloat16(
-                   nn_model_.get(), context->allow_fp32_relax_to_fp16));
+    if (nnapi_->android_sdk_version >= kMinSdkVersionForNNAPI11) {
+      RETURN_TFLITE_ERROR_IF_NN_ERROR(
+          context,
+          nnapi_->ANeuralNetworksModel_relaxComputationFloat32toFloat16(
+              nn_model_.get(), context->allow_fp32_relax_to_fp16));
     }
 
     // Finalize the model
-    CHECK_NN(context, ANeuralNetworksModel_finish(nn_model_.get()));
+    RETURN_TFLITE_ERROR_IF_NN_ERROR(
+        context, nnapi_->ANeuralNetworksModel_finish(nn_model_.get()));
 
     // Create shared memory pool for inputs and outputs.
-    nn_input_memory_.reset(new NNMemory("input_pool", total_input_byte_size));
+    nn_input_memory_.reset(
+        new NNMemory(nnapi_, "input_pool", total_input_byte_size));
     nn_output_memory_.reset(
-        new NNMemory("output_pool", total_output_byte_size));
+        new NNMemory(nnapi_, "output_pool", total_output_byte_size));
 
     return kTfLiteOk;
   }
@@ -1145,17 +1251,22 @@ TfLiteDelegate* NnApiDelegate() {
       .Prepare = [](TfLiteContext* context,
                     TfLiteDelegate* delegate) -> TfLiteStatus {
         // Do not check nodes_ if NN API is unavailable.
-        if (kAndroidSdkVersion < kMinSdkVersionForNNAPI || !NNAPIExists()) {
+        const NnApi* nnapi = NnApiImplementation();
+        if (nnapi->android_sdk_version < kMinSdkVersionForNNAPI ||
+            !nnapi->nnapi_exists) {
           return kTfLiteOk;
         }
 
+        // Allocate one element in vector already since TensorFlow Lite uses
+        // the first value as the number of nodes. The actual value will be set
+        // later, after the vector has been filled.
         std::vector<int> supported_nodes(1);
         // We don't care about all nodes_, we only care about ones in the
         // current plan.
         TfLiteIntArray* plan;
         TF_LITE_ENSURE_STATUS(context->GetExecutionPlan(context, &plan));
-        int total_supported_nodes = 0;
 
+        int android_sdk_version = NnApiImplementation()->android_sdk_version;
         // Check for every node if it is supported
         // TODO(b/80625235): Fix this to do more careful checking of versioning.
         for (int node_index : TfLiteIntArrayView(plan)) {
@@ -1163,14 +1274,13 @@ TfLiteDelegate* NnApiDelegate() {
           TfLiteRegistration* registration;
           TF_LITE_ENSURE_STATUS(context->GetNodeAndRegistration(
               context, node_index, &node, &registration));
-          NNAPIDelegateKernel dummy_kernel;
-          if (dummy_kernel.Map(context, registration->builtin_code,
-                               registration->version, node)) {
+          if (NNAPIDelegateKernel::Map(context, registration->builtin_code,
+                                       registration->version,
+                                       android_sdk_version, node)) {
             supported_nodes.push_back(node_index);
           }
-          total_supported_nodes += 1;
         }
-        // Put the size at the beginning of the array.
+        // First element in vector must be the number of actual nodes.
         supported_nodes[0] = supported_nodes.size() - 1;
 
         // NN API Delegate Registration (the pseudo kernel that will invoke NN
@@ -1208,11 +1318,10 @@ TfLiteDelegate* NnApiDelegate() {
 
         // Request TFLite to partition the graph and make kernels
         // for each independent node sub set a new nnapi_delegate_kernel.
-        context->ReplaceNodeSubsetsWithDelegateKernels(
+        return context->ReplaceNodeSubsetsWithDelegateKernels(
             context, nnapi_delegate_kernel,
             reinterpret_cast<TfLiteIntArray*>(supported_nodes.data()),
             delegate);
-        return kTfLiteOk;
       }};
 
   return &delegate;
diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc
index ca48af0c95211e644fc7e2a1a1472a2f1b46ad35..37512386389aef811bd1c9c3d03621e0a188a7e7 100644
--- a/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc
+++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc
@@ -27,6 +27,16 @@ using ::testing::ElementsAreArray;
 // TODO(b/110368244): figure out how to share the existing tests in kernels/ but
 // with the delegation on. Also, add more unit tests to improve code coverage.
 
+// This matcher uses 1 as maximum tolerance.
+MATCHER(QuantizedNear, "") {
+  const int diff = abs(std::get<0>(arg) - std::get<1>(arg));
+  if (diff > 1) {
+    *result_listener << "Quantized values can be at most off by one: " << diff;
+    return false;
+  }
+  return true;
+}
+
 class SingleOpModelWithNNAPI : public SingleOpModel {
  public:
   SingleOpModelWithNNAPI() {
@@ -39,6 +49,24 @@ class SingleOpModelWithNNAPI : public SingleOpModel {
                                  const std::vector<int>& dims) {
     return interpreter_->ResizeInputTensor(tensor_index, dims);
   }
+
+ protected:
+  void SetData(int index, TensorType type, std::initializer_list<float> data) {
+    switch (type) {
+      case TensorType_FLOAT32:
+        PopulateTensor(index, data);
+        break;
+      case TensorType_INT32:
+        QuantizeAndPopulate<int32_t>(index, data);
+        break;
+      case TensorType_UINT8:
+        QuantizeAndPopulate<uint8_t>(index, data);
+        break;
+      default:
+        FAIL() << "Type not supported: " << type;
+        break;
+    }
+  }
 };
 
 class FloatAddOpModel : public SingleOpModelWithNNAPI {
@@ -215,14 +243,15 @@ TEST(NNAPIDelegate, L2PoolWithNoActivation) {
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({3.5, 6.5}));
 }
 
-class BaseConvolutionOpModel : public SingleOpModelWithNNAPI {
+class ConvolutionOpModel : public SingleOpModelWithNNAPI {
  public:
-  BaseConvolutionOpModel(
+  ConvolutionOpModel(
       const TensorData& input, const TensorData& filter,
       const TensorData& output, int stride_width = 2, int stride_height = 2,
       enum Padding padding = Padding_VALID,
       enum ActivationFunctionType activation = ActivationFunctionType_NONE,
-      int dilation_width_factor = 1, int dilation_height_factor = 1) {
+      int dilation_width_factor = 1, int dilation_height_factor = 1)
+      : input_type_(input.type), filter_type_(filter.type) {
     input_ = AddInput(input);
     filter_ = AddInput(filter);
 
@@ -239,7 +268,8 @@ class BaseConvolutionOpModel : public SingleOpModelWithNNAPI {
     }
 
     output_ = AddOutput(output);
-    if (input.type != TensorType_FLOAT32) {
+
+    if (input_type_ != TensorType_FLOAT32) {
       // The following is required by quantized inference. It is the unittest's
       // responsibility to make sure the output scale falls into the correct
       // range.
@@ -255,56 +285,53 @@ class BaseConvolutionOpModel : public SingleOpModelWithNNAPI {
     BuildInterpreter({GetShape(input_), GetShape(filter_), GetShape(bias_)});
   }
 
- protected:
-  int input_;
-  int filter_;
-  int bias_;
-  int output_;
-};
-
-class ConvolutionOpModel : public BaseConvolutionOpModel {
- public:
-  using BaseConvolutionOpModel::BaseConvolutionOpModel;
-
-  void SetFilter(std::initializer_list<float> f) { PopulateTensor(filter_, f); }
-
-  void SetBias(std::initializer_list<float> f) { PopulateTensor(bias_, f); }
-
-  void SetInput(std::initializer_list<float> data) {
-    PopulateTensor(input_, data);
-  }
-  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
-};
-
-class QuantizedConvolutionOpModel : public BaseConvolutionOpModel {
- public:
-  using BaseConvolutionOpModel::BaseConvolutionOpModel;
-
   void SetInput(std::initializer_list<float> data) {
-    QuantizeAndPopulate<uint8_t>(input_, data);
+    SetData(input_, input_type_, data);
   }
 
   void SetFilter(std::initializer_list<float> data) {
-    QuantizeAndPopulate<uint8_t>(filter_, data);
+    SetData(filter_, filter_type_, data);
   }
 
   void SetBias(std::initializer_list<float> data) {
-    QuantizeAndPopulate<int32_t>(bias_, data);
+    const auto bias_type =
+        (input_type_ == TensorType_FLOAT32) ? input_type_ : TensorType_INT32;
+    SetData(bias_, bias_type, data);
   }
 
-  std::vector<uint8_t> GetOutput() { return ExtractVector<uint8_t>(output_); }
-  std::vector<float> GetDequantizedOutput() {
-    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
-                               GetScale(output_), GetZeroPoint(output_));
+  std::vector<float> GetOutput() {
+    if (input_type_ == TensorType_FLOAT32) {
+      return ExtractVector<float>(output_);
+    } else {
+      return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+                                 GetScale(output_), GetZeroPoint(output_));
+    }
+  }
+
+  std::vector<uint8_t> GetQuantizedOutput() {
+    if (input_type_ == TensorType_FLOAT32) {
+      return {};  // Not supported.
+    } else {
+      return ExtractVector<uint8_t>(output_);
+    }
   }
+
+ protected:
+  int input_;
+  int filter_;
+  int bias_;
+  int output_;
+
+  const TensorType input_type_;
+  const TensorType filter_type_;
 };
 
 // In this tests we set the input and output scales so that the results
 // match exactly the 'non-quantized' version.
-TEST(NNAPIDelegate, SimpleTestQuantized) {
-  QuantizedConvolutionOpModel m({TensorType_UINT8, {2, 2, 4, 1}, -63.5, 64},
-                                {TensorType_UINT8, {3, 2, 2, 1}, -63.5, 64},
-                                {TensorType_UINT8, {}, -127, 128});
+TEST(ConvolutionOpTest, SimpleTestQuantized) {
+  ConvolutionOpModel m({TensorType_UINT8, {2, 2, 4, 1}, -63.5, 64},
+                       {TensorType_UINT8, {3, 2, 2, 1}, -63.5, 64},
+                       {TensorType_UINT8, {}, -127, 128});
   m.SetInput({
       // First batch
       1, 1, 1, 1,  // row = 1
@@ -322,25 +349,55 @@ TEST(NNAPIDelegate, SimpleTestQuantized) {
 
   m.Invoke();
 
-  EXPECT_THAT(m.GetDequantizedOutput(),
-              ElementsAreArray(ArrayFloatNear(
-                  {
-                      18, 2, 5,  // first batch, left
-                      18, 2, 5,  // first batch, right
-                      17, 4, 3,  // second batch, left
-                      37, 4, 3,  // second batch, right
-                  },
-                  1e-5)));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(
+                                 {
+                                     18, 2, 5,  // first batch, left
+                                     18, 2, 5,  // first batch, right
+                                     17, 4, 3,  // second batch, left
+                                     37, 4, 3,  // second batch, right
+                                 },
+                                 1e-5)));
   // For good  measure, let's also verify the quantized values:
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
-                                 145, 129, 132,  //
-                                 145, 129, 132,  //
-                                 144, 131, 130,  //
-                                 164, 131, 130,  //
-                             }));
+  EXPECT_THAT(m.GetQuantizedOutput(), ElementsAreArray({
+                                          145, 129, 132,  //
+                                          145, 129, 132,  //
+                                          144, 131, 130,  //
+                                          164, 131, 130,  //
+                                      }));
+}
+
+TEST(ConvolutionOpTest, FloatInputQuantizedWeights) {
+  ConvolutionOpModel m({TensorType_FLOAT32, {2, 2, 4, 1}},
+                       {TensorType_UINT8, {3, 2, 2, 1}, 0, 64},
+                       {TensorType_FLOAT32, {}});
+  m.SetInput({
+      // First batch
+      1, 1, 1, 2,  // row = 1
+      2, 2, 2, 1,  // row = 2
+      // Second batch
+      1, 2, 3, 4,  // row = 1
+      1, 2, 3, 4,  // row = 2
+  });
+  m.SetFilter({
+      1, 2, 3, 4,  // first 2x2 filter
+      0, 1, 0, 1,  // second 2x2 filter
+      0, 0, 1, 1,  // third 2x2 filter
+  });
+  m.SetBias({1, 2, 3});
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(
+                                 {
+                                     18, 5, 7,    // first batch, left
+                                     16, 5, 6,    // first batch, right
+                                     17, 6, 6,    // second batch, left
+                                     37, 10, 10,  // second batch, right
+                                 },
+                                 0.2)));
 }
 
-TEST(NNAPIDelegate, Conv2DWithNoActivation) {
+TEST(ConvolutionOpTest, NoActivation) {
   ConvolutionOpModel m({TensorType_FLOAT32, {2, 2, 4, 1}},
                        {TensorType_FLOAT32, {3, 2, 2, 1}},
                        {TensorType_FLOAT32, {}});
@@ -448,56 +505,48 @@ TEST(NNAPIDelegate, DepthwiseConv2DWithNoActivation) {
                              }));
 }
 
-class FloatFullyConnectedOpModel : public SingleOpModelWithNNAPI {
+class FullyConnectedOpModel : public SingleOpModelWithNNAPI {
  public:
-  FloatFullyConnectedOpModel(int units, int batches, const TensorData& input,
-                             const TensorData& output = {TensorType_FLOAT32})
-      : batches_(batches), units_(units) {
-    int total_input_size = 1;
-    for (int i = 0; i < input.shape.size(); ++i) {
-      total_input_size *= input.shape[i];
-    }
-    input_size_ = total_input_size / batches_;
-
+  FullyConnectedOpModel(
+      const TensorData& input, const TensorData& weights,
+      const TensorData& output,
+      enum ActivationFunctionType activation = ActivationFunctionType_NONE)
+      : input_type_(input.type), weights_type_(weights.type) {
     input_ = AddInput(input);
-    weights_ =
-        AddInput({input.type, {units_, input_size_}, input.min, input.max});
+    weights_ = AddInput(weights);
 
+    const int units = weights.shape[0];
     if (input.type == TensorType_FLOAT32) {
-      bias_ = AddInput({TensorType_FLOAT32, {units_}});
+      bias_ = AddInput({TensorType_FLOAT32, {units}});
     } else {
       // This is a quantized version. The scale of 'bias' depends on the scales
       // of input and filter. Supposedly this is correctly set during quantized
       // training.
       auto bias_scale = GetScale(input_) * GetScale(weights_);
-      TensorData bias{TensorType_INT32, {units_}, 0, 0, bias_scale};
+      TensorData bias{TensorType_INT32, {units}, 0, 0, bias_scale};
       bias_ = AddInput(bias);
     }
 
     output_ = AddOutput(output);
 
-    SetBuiltinOp(
-        BuiltinOperator_FULLY_CONNECTED, BuiltinOptions_FullyConnectedOptions,
-        CreateFullyConnectedOptions(builder_, ActivationFunctionType_RELU)
-            .Union());
+    SetBuiltinOp(BuiltinOperator_FULLY_CONNECTED,
+                 BuiltinOptions_FullyConnectedOptions,
+                 CreateFullyConnectedOptions(builder_, activation).Union());
     BuildInterpreter({GetShape(input_), GetShape(weights_), GetShape(bias_)});
   }
 
-  int input_size() { return input_size_; }
-  int num_units() { return units_; }
-  int num_batches() { return batches_; }
-
-  void SetBias(std::initializer_list<float> f) { PopulateTensor(bias_, f); }
-
-  void SetWeights(std::initializer_list<float> f) {
-    PopulateTensor(weights_, f);
+  void SetInput(std::initializer_list<float> data) {
+    SetData(input_, input_type_, data);
   }
 
-  void SetInput(std::initializer_list<float> data) {
-    PopulateTensor(input_, data);
+  void SetWeights(std::initializer_list<float> data) {
+    SetData(weights_, weights_type_, data);
   }
-  void SetInput(int offset, float* begin, float* end) {
-    PopulateTensor(input_, offset, begin, end);
+
+  void SetBias(std::initializer_list<float> data) {
+    const auto bias_type =
+        (input_type_ == TensorType_FLOAT32) ? input_type_ : TensorType_INT32;
+    SetData(bias_, bias_type, data);
   }
 
   std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
@@ -508,14 +557,14 @@ class FloatFullyConnectedOpModel : public SingleOpModelWithNNAPI {
   int bias_;
   int output_;
 
-  int batches_;
-  int units_;
-  int input_size_;
+  const TensorType input_type_;
+  const TensorType weights_type_;
 };
 
-TEST(NNAPIDelegate, FullyConnectedSimpleTest) {
-  FloatFullyConnectedOpModel m(/*units=*/3, /*batches=*/2,
-                               /*input=*/{TensorType_FLOAT32, {2, 10}});
+TEST(FullyConnectedOpTest, SimpleTest) {
+  FullyConnectedOpModel m(/*input=*/{TensorType_FLOAT32, {2, 10}},
+                          /*weights=*/{TensorType_FLOAT32, {3, 10}},
+                          /*output=*/{TensorType_FLOAT32});
   m.SetWeights({
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
@@ -533,6 +582,28 @@ TEST(NNAPIDelegate, FullyConnectedSimpleTest) {
   EXPECT_THAT(m.GetOutput(), ElementsAre(24, 25, 26, 58, 59, 60));
 }
 
+TEST(FullyConnectedOpTest, FloatInputQuantizedWeights) {
+  FullyConnectedOpModel m(/*input=*/{TensorType_FLOAT32, {2, 10}},
+                          /*weights=*/{TensorType_UINT8, {3, 10}, 0, 64},
+                          /*output=*/{TensorType_FLOAT32});
+  m.SetWeights({
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
+  });
+  m.SetBias({1, 2, 3});
+
+  m.SetInput({
+      1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
+      1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
+  });
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear({24, 25, 26, 58, 59, 60}, 1.3)));
+}
+
 class SoftmaxOpModel : public SingleOpModelWithNNAPI {
  public:
   SoftmaxOpModel(int batches, int size, float beta)
@@ -585,14 +656,14 @@ class ReshapeOpModel : public SingleOpModelWithNNAPI {
   ReshapeOpModel(std::initializer_list<int> input_shape,
                  std::initializer_list<int> new_shape) {
     input_ = AddInput(TensorType_FLOAT32);
-    new_shape_ = AddInput(TensorType_INT32);
+    new_shape_ = AddConstInput<int>(TensorType_INT32, new_shape,
+                                    {static_cast<int>(new_shape.size())});
     output_ = AddOutput(TensorType_FLOAT32);
     SetBuiltinOp(
         BuiltinOperator_RESHAPE, BuiltinOptions_ReshapeOptions,
         CreateReshapeOptions(builder_, builder_.CreateVector<int>(new_shape))
             .Union());
     BuildInterpreter({input_shape, {static_cast<int>(new_shape.size())}});
-    PopulateTensor<int>(new_shape_, new_shape);
   }
 
   void SetInput(std::initializer_list<float> data) {
@@ -1326,7 +1397,8 @@ TEST(NNAPIDelegate, LogisticQuantized) {
                   },
                   kQuantizedTolerance)));
   EXPECT_THAT(m.GetOutput<uint8_t>(),
-              ElementsAreArray({128, 1, 227, 251, 244, 32, 255, 188}));
+              testing::Pointwise(QuantizedNear(),
+                                 {128, 1, 227, 251, 244, 32, 255, 188}));
 }
 
 #if 0
@@ -1576,14 +1648,17 @@ class StridedSliceOpModel : public SingleOpModelWithNNAPI {
  public:
   StridedSliceOpModel(std::initializer_list<int> input_shape,
                       std::initializer_list<int> begin_shape,
+                      std::initializer_list<int> begin_data,
                       std::initializer_list<int> end_shape,
-                      std::initializer_list<int> strides_shape, int begin_mask,
+                      std::initializer_list<int> end_data,
+                      std::initializer_list<int> strides_shape,
+                      std::initializer_list<int> strides_data, int begin_mask,
                       int end_mask, int ellipsis_mask, int new_axis_mask,
                       int shrink_axis_mask) {
     input_ = AddInput(tensor_input_type);
-    begin_ = AddInput(TensorType_INT32);
-    end_ = AddInput(TensorType_INT32);
-    strides_ = AddInput(TensorType_INT32);
+    begin_ = AddConstInput(TensorType_INT32, begin_data, begin_shape);
+    end_ = AddConstInput(TensorType_INT32, end_data, end_shape);
+    strides_ = AddConstInput(TensorType_INT32, strides_data, strides_shape);
     output_ = AddOutput(tensor_input_type);
     SetBuiltinOp(
         BuiltinOperator_STRIDED_SLICE, BuiltinOptions_StridedSliceOptions,
@@ -1596,15 +1671,6 @@ class StridedSliceOpModel : public SingleOpModelWithNNAPI {
   void SetInput(std::initializer_list<input_type> data) {
     PopulateTensor<input_type>(input_, data);
   }
-  void SetBegin(std::initializer_list<int32_t> data) {
-    PopulateTensor<int32_t>(begin_, data);
-  }
-  void SetEnd(std::initializer_list<int32_t> data) {
-    PopulateTensor<int32_t>(end_, data);
-  }
-  void SetStrides(std::initializer_list<int32_t> data) {
-    PopulateTensor<int32_t>(strides_, data);
-  }
 
   std::vector<input_type> GetOutput() {
     return ExtractVector<input_type>(output_);
@@ -1619,39 +1685,47 @@ class StridedSliceOpModel : public SingleOpModelWithNNAPI {
   int output_;
 };
 
-TEST(NNAPIDelegate, StridedSliceIn2D) {
-  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
-  m.SetInput({1, 2, 3, 4, 5, 6});
-  m.SetBegin({1, 0});
-  m.SetEnd({2, 2});
-  m.SetStrides({1, 1});
+TEST(StridedSliceOpTest, In1D) {
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, {3}, {1}, {1}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4});
   m.Invoke();
-  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({4, 5}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({2, 3}));
 }
 
-TEST(NNAPIDelegate, StridedSliceIn2D_ShrinkAxis_NegativeSlice) {
-  // This is equivalent to tf.range(4)[:, tf.newaxis][-2, -1].
-  StridedSliceOpModel<> m({4, 1}, {2}, {2}, {2}, 0, 0, 0, 0, 3);
-  m.SetInput({0, 1, 2, 3});
-  m.SetBegin({-2, -1});
-  m.SetEnd({-1, 0});
-  m.SetStrides({1, 1});
+TEST(StridedSliceOpTest, In1D_BeginMask) {
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, {3}, {1}, {1}, 1, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3}));
+}
 
+TEST(StridedSliceOpTest, In2D_Stride2) {
+  StridedSliceOpModel<> m({2, 3}, {2}, {0, 0}, {2}, {2, 3}, {2}, {2, 2}, 0, 0,
+                          0, 0, 0);
+  m.SetInput({1, 2, 3, 4, 5, 6});
   m.Invoke();
-  EXPECT_TRUE(m.GetOutputShape().empty());
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({2}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3}));
 }
 
-TEST(NNAPIDelegate, StridedSliceIn2D_ShrinkAxisMask) {
-  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 3);
+TEST(StridedSliceOpTest, In2D_EndMask) {
+  StridedSliceOpModel<> m({2, 3}, {2}, {1, 0}, {2}, {2, 2}, {2}, {1, 1}, 0, 2,
+                          0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6});
-  m.SetBegin({0, 0});
-  m.SetEnd({1, 1});
-  m.SetStrides({1, 1});
   m.Invoke();
-  EXPECT_TRUE(m.GetOutputShape().empty());
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({4, 5, 6}));
+}
+
+TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis4) {
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {0, 0, 0}, {3}, {2, 3, 1}, {3},
+                          {1, 1, 1}, 0, 0, 0, 0, 4);
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 3}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3, 5, 7, 9, 11}));
 }
 
 static float rnn_input[] = {
@@ -1990,7 +2064,9 @@ class BaseSVDFOpModel : public SingleOpModelWithNNAPI {
     input_ = AddInput(TensorType_FLOAT32);
     weights_feature_ = AddInput(weights_feature_type);
     weights_time_ = AddInput(weights_time_type);
-    bias_ = AddNullInput();
+    // TODO(b/121383394) : figure out why optional bias causes TFLite segfault
+    // when using NNAPI delegate.
+    bias_ = AddInput(TensorType_FLOAT32);
     const int num_filters = units * rank;
     activation_state_ = AddInput(
         TensorData{TensorType_FLOAT32, {batches, memory_size * num_filters}},
@@ -2006,6 +2082,8 @@ class BaseSVDFOpModel : public SingleOpModelWithNNAPI {
         {units_},                             // bias tensor
         {batches, memory_size * num_filters}  // activation_state tensor
     });
+    // TODO(b/121383394) : remove once the optional bias bug is fixed.
+    PopulateTensor(bias_, std::vector<float>(units_));
   }
 
   // Populates the weights_feature tensor.
@@ -2048,12 +2126,16 @@ class BaseSVDFOpModel : public SingleOpModelWithNNAPI {
 class SVDFOpModel : public BaseSVDFOpModel {
  public:
   using BaseSVDFOpModel::BaseSVDFOpModel;
+};
 
+class SVDFOpTest : public ::testing::Test {
+ protected:
   void VerifyGoldens(float golden_input[], float golden_output[],
-                     int golden_size, float tolerance = 1e-5) {
-    const int svdf_num_batches = num_batches();
-    const int svdf_input_size = input_size();
-    const int svdf_num_units = num_units();
+                     int golden_size, BaseSVDFOpModel* svdf,
+                     float tolerance = 1e-5) {
+    const int svdf_num_batches = svdf->num_batches();
+    const int svdf_input_size = svdf->input_size();
+    const int svdf_num_units = svdf->num_units();
     const int input_sequence_size =
         golden_size / sizeof(float) / (svdf_input_size * svdf_num_batches);
     // Going over each input batch, setting the input tensor, invoking the SVDF
@@ -2062,9 +2144,9 @@ class SVDFOpModel : public BaseSVDFOpModel {
       float* batch_start =
           golden_input + i * svdf_input_size * svdf_num_batches;
       float* batch_end = batch_start + svdf_input_size * svdf_num_batches;
-      SetInput(0, batch_start, batch_end);
+      svdf->SetInput(0, batch_start, batch_end);
 
-      Invoke();
+      svdf->Invoke();
 
       const float* golden_start =
           golden_output + i * svdf_num_units * svdf_num_batches;
@@ -2073,13 +2155,13 @@ class SVDFOpModel : public BaseSVDFOpModel {
       std::vector<float> expected;
       expected.insert(expected.end(), golden_start, golden_end);
 
-      EXPECT_THAT(GetOutput(),
+      EXPECT_THAT(svdf->GetOutput(),
                   ElementsAreArray(ArrayFloatNear(expected, tolerance)));
     }
   }
 };
 
-TEST(NNAPIDelegate, SVDFBlackBoxTestRank1) {
+TEST_F(SVDFOpTest, BlackBoxTestRank1) {
   SVDFOpModel svdf(/*batches=*/2, /*units=*/4, /*input_size=*/3,
                    /*memory_size=*/10, /*rank=*/1);
   svdf.SetWeightsFeature({-0.31930989, -0.36118156, 0.0079667, 0.37613347,
@@ -2099,10 +2181,11 @@ TEST(NNAPIDelegate, SVDFBlackBoxTestRank1) {
        -0.10781813, 0.27201805,  0.14324132,  -0.23681851, -0.27115166,
        -0.01580888, -0.14943552, 0.15465137,  0.09784451,  -0.0337657});
 
-  svdf.VerifyGoldens(svdf_input, svdf_golden_output_rank_1, sizeof(svdf_input));
+  VerifyGoldens(svdf_input, svdf_golden_output_rank_1, sizeof(svdf_input),
+                &svdf);
 }
 
-TEST(NNAPIDelegate, SVDFBlackBoxTestRank2) {
+TEST_F(SVDFOpTest, BlackBoxTestRank2) {
   SVDFOpModel svdf(/*batches=*/2, /*units=*/4, /*input_size=*/3,
                    /*memory_size=*/10, /*rank=*/2);
   svdf.SetWeightsFeature({-0.31930989, 0.0079667,   0.39296314,  0.37613347,
@@ -2137,7 +2220,8 @@ TEST(NNAPIDelegate, SVDFBlackBoxTestRank2) {
        0.27179423,  -0.04710215, 0.31069002,  0.22672787,  0.09580326,
        0.08682203,  0.1258215,   0.1851041,   0.29228821,  0.12366763});
 
-  svdf.VerifyGoldens(svdf_input, svdf_golden_output_rank_2, sizeof(svdf_input));
+  VerifyGoldens(svdf_input, svdf_golden_output_rank_2, sizeof(svdf_input),
+                &svdf);
 }
 
 class LSTMOpModel : public SingleOpModelWithNNAPI {
@@ -2223,71 +2307,69 @@ class LSTMOpModel : public SingleOpModelWithNNAPI {
     BuildInterpreter(input_shapes);
   }
 
-  void SetInputToInputWeights(std::initializer_list<float> f) {
+  void SetInputToInputWeights(std::vector<float> f) {
     PopulateTensor(input_to_input_weights_, f);
   }
 
-  void SetInputToForgetWeights(std::initializer_list<float> f) {
+  void SetInputToForgetWeights(std::vector<float> f) {
     PopulateTensor(input_to_forget_weights_, f);
   }
 
-  void SetInputToCellWeights(std::initializer_list<float> f) {
+  void SetInputToCellWeights(std::vector<float> f) {
     PopulateTensor(input_to_cell_weights_, f);
   }
 
-  void SetInputToOutputWeights(std::initializer_list<float> f) {
+  void SetInputToOutputWeights(std::vector<float> f) {
     PopulateTensor(input_to_output_weights_, f);
   }
 
-  void SetRecurrentToInputWeights(std::initializer_list<float> f) {
+  void SetRecurrentToInputWeights(std::vector<float> f) {
     PopulateTensor(recurrent_to_input_weights_, f);
   }
 
-  void SetRecurrentToForgetWeights(std::initializer_list<float> f) {
+  void SetRecurrentToForgetWeights(std::vector<float> f) {
     PopulateTensor(recurrent_to_forget_weights_, f);
   }
 
-  void SetRecurrentToCellWeights(std::initializer_list<float> f) {
+  void SetRecurrentToCellWeights(std::vector<float> f) {
     PopulateTensor(recurrent_to_cell_weights_, f);
   }
 
-  void SetRecurrentToOutputWeights(std::initializer_list<float> f) {
+  void SetRecurrentToOutputWeights(std::vector<float> f) {
     PopulateTensor(recurrent_to_output_weights_, f);
   }
 
-  void SetCellToInputWeights(std::initializer_list<float> f) {
+  void SetCellToInputWeights(std::vector<float> f) {
     PopulateTensor(cell_to_input_weights_, f);
   }
 
-  void SetCellToForgetWeights(std::initializer_list<float> f) {
+  void SetCellToForgetWeights(std::vector<float> f) {
     PopulateTensor(cell_to_forget_weights_, f);
   }
 
-  void SetCellToOutputWeights(std::initializer_list<float> f) {
+  void SetCellToOutputWeights(std::vector<float> f) {
     PopulateTensor(cell_to_output_weights_, f);
   }
 
-  void SetInputGateBias(std::initializer_list<float> f) {
+  void SetInputGateBias(std::vector<float> f) {
     PopulateTensor(input_gate_bias_, f);
   }
 
-  void SetForgetGateBias(std::initializer_list<float> f) {
+  void SetForgetGateBias(std::vector<float> f) {
     PopulateTensor(forget_gate_bias_, f);
   }
 
-  void SetCellBias(std::initializer_list<float> f) {
-    PopulateTensor(cell_bias_, f);
-  }
+  void SetCellBias(std::vector<float> f) { PopulateTensor(cell_bias_, f); }
 
-  void SetOutputGateBias(std::initializer_list<float> f) {
+  void SetOutputGateBias(std::vector<float> f) {
     PopulateTensor(output_gate_bias_, f);
   }
 
-  void SetProjectionWeights(std::initializer_list<float> f) {
+  void SetProjectionWeights(std::vector<float> f) {
     PopulateTensor(projection_weights_, f);
   }
 
-  void SetProjectionBias(std::initializer_list<float> f) {
+  void SetProjectionBias(std::vector<float> f) {
     PopulateTensor(projection_bias_, f);
   }
 
@@ -2342,22 +2424,22 @@ class LSTMOpModel : public SingleOpModelWithNNAPI {
 class BaseLstmTest : public ::testing::Test {
  protected:
   // Weights of the LSTM model. Some are optional.
-  std::initializer_list<float> input_to_input_weights_;
-  std::initializer_list<float> input_to_cell_weights_;
-  std::initializer_list<float> input_to_forget_weights_;
-  std::initializer_list<float> input_to_output_weights_;
-  std::initializer_list<float> input_gate_bias_;
-  std::initializer_list<float> cell_gate_bias_;
-  std::initializer_list<float> forget_gate_bias_;
-  std::initializer_list<float> output_gate_bias_;
-  std::initializer_list<float> recurrent_to_input_weights_;
-  std::initializer_list<float> recurrent_to_cell_weights_;
-  std::initializer_list<float> recurrent_to_forget_weights_;
-  std::initializer_list<float> recurrent_to_output_weights_;
-  std::initializer_list<float> cell_to_input_weights_;
-  std::initializer_list<float> cell_to_forget_weights_;
-  std::initializer_list<float> cell_to_output_weights_;
-  std::initializer_list<float> projection_weights_;
+  std::vector<float> input_to_input_weights_;
+  std::vector<float> input_to_cell_weights_;
+  std::vector<float> input_to_forget_weights_;
+  std::vector<float> input_to_output_weights_;
+  std::vector<float> input_gate_bias_;
+  std::vector<float> cell_gate_bias_;
+  std::vector<float> forget_gate_bias_;
+  std::vector<float> output_gate_bias_;
+  std::vector<float> recurrent_to_input_weights_;
+  std::vector<float> recurrent_to_cell_weights_;
+  std::vector<float> recurrent_to_forget_weights_;
+  std::vector<float> recurrent_to_output_weights_;
+  std::vector<float> cell_to_input_weights_;
+  std::vector<float> cell_to_forget_weights_;
+  std::vector<float> cell_to_output_weights_;
+  std::vector<float> projection_weights_;
 
   // LSTM input is stored as num_batch x num_inputs vector.
   std::vector<std::vector<float>> lstm_input_;
diff --git a/tensorflow/lite/examples/android/app/build.gradle b/tensorflow/lite/examples/android/app/build.gradle
index c7e620e212853efa3dda42df6412a48fc77548d7..d2bc9846af571af71d8d7cbdf1c985e3a24474f7 100644
--- a/tensorflow/lite/examples/android/app/build.gradle
+++ b/tensorflow/lite/examples/android/app/build.gradle
@@ -1,5 +1,13 @@
 apply plugin: 'com.android.application'
 
+// import DownloadModels task
+project.ext.ASSET_DIR = projectDir.toString() + '/src/main/assets'
+project.ext.TMP_DIR   = project.buildDir.toString() + '/downloads'
+
+// Download default models; if you wish to use your own models then
+// place them in the "assets" directory and comment out this line.
+apply from: "download-models.gradle"
+
 android {
     compileSdkVersion 26
     buildToolsVersion '28.0.3'
@@ -10,10 +18,6 @@ android {
         versionCode 1
         versionName "1.0"
 
-        // Remove this block.
-        jackOptions {
-            enabled true
-        }
     }
     lintOptions {
         abortOnError false
@@ -40,14 +44,6 @@ repositories {
     }
 }
 
-// import DownloadModels task
-project.ext.ASSET_DIR = projectDir.toString() + '/src/main/assets'
-project.ext.TMP_DIR   = project.buildDir.toString() + '/downloads'
-
-// Download default models; if you wish to use your own models then
-// place them in the "assets" directory and comment out this line.
-apply from: "download-models.gradle"
-
 dependencies {
     implementation fileTree(dir: 'libs', include: ['*.jar'])
     implementation 'org.tensorflow:tensorflow-lite:0.0.0-nightly'
diff --git a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/DetectorActivity.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/DetectorActivity.java
index 87160f6b3fb8c0d24e5df131d9becbb3eb6e2980..2feca79e888b4cd20b0416edd4a5c114b60c5369 100644
--- a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/DetectorActivity.java
+++ b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/DetectorActivity.java
@@ -52,8 +52,8 @@ public class DetectorActivity extends CameraActivity implements OnImageAvailable
   private static final int TF_OD_API_INPUT_SIZE = 300;
   private static final boolean TF_OD_API_IS_QUANTIZED = true;
   private static final String TF_OD_API_MODEL_FILE = "detect.tflite";
-  private static final String TF_OD_API_LABELS_FILE = "file:///android_asset/coco_labels_list.txt";
-  
+  private static final String TF_OD_API_LABELS_FILE = "coco_labels_list.txt";
+
   // Which detection model to use: by default uses Tensorflow Object Detection API frozen
   // checkpoints.
   private enum DetectorMode {
diff --git a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/TFLiteObjectDetectionAPIModel.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/TFLiteObjectDetectionAPIModel.java
index 9eb21de9d03e387d3c25b38171e154a358dc81ce..afbf3178314897a9c1b7681b0b1a0de27577f3e3 100644
--- a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/TFLiteObjectDetectionAPIModel.java
+++ b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/TFLiteObjectDetectionAPIModel.java
@@ -105,8 +105,7 @@ public class TFLiteObjectDetectionAPIModel implements Classifier {
     final TFLiteObjectDetectionAPIModel d = new TFLiteObjectDetectionAPIModel();
 
     InputStream labelsInput = null;
-    String actualFilename = labelFilename.split("file:///android_asset/")[1];
-    labelsInput = assetManager.open(actualFilename);
+    labelsInput = assetManager.open(labelFilename);
     BufferedReader br = null;
     br = new BufferedReader(new InputStreamReader(labelsInput));
     String line;
diff --git a/tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm b/tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm
index 48cd313c9d7a94328d990e45243e2b84c9dc7a62..4f6fcaa96c4b917b79dacc5180594c1458ef18ff 100644
--- a/tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm
+++ b/tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm
@@ -48,7 +48,7 @@ namespace {
 // GPU Delegate only supports float model now.
 NSString* model_file_name = @"mobilenet_v1_1.0_224";
 #else
-NSString* model_file_name = @"mobilenet_quant_v1_224.tflite";
+NSString* model_file_name = @"mobilenet_quant_v1_224";
 #endif
 NSString* model_file_type = @"tflite";
 // If you have your own model, point this to the labels file.
@@ -346,7 +346,15 @@ void ProcessInputWithQuantizedModel(
   NSLog(@"Time: %.4lf, avg: %.4lf, count: %d", end - start, total_latency / total_count,
         total_count);
 
-  const int output_size = 1000;
+  // read output size from the output sensor
+  const int output_tensor_index = interpreter->outputs()[0];
+  TfLiteTensor* output_tensor = interpreter->tensor(output_tensor_index);
+  TfLiteIntArray* output_dims = output_tensor->dims;
+  if (output_dims->size != 2 || output_dims->data[0] != 1) {
+    LOG(FATAL) << "Output of the model is in invalid format.";
+  }
+  const int output_size = output_dims->data[1];
+
   const int kNumResults = 5;
   const float kThreshold = 0.1f;
 
diff --git a/tensorflow/lite/examples/ios/camera/data/labels.txt b/tensorflow/lite/examples/ios/camera/data/labels.txt
new file mode 100644
index 0000000000000000000000000000000000000000..572eccf90087c1c19874e40b950c1610f59cc9c2
--- /dev/null
+++ b/tensorflow/lite/examples/ios/camera/data/labels.txt
@@ -0,0 +1,1001 @@
+dummy
+tench
+goldfish
+great white shark
+tiger shark
+hammerhead
+electric ray
+stingray
+cock
+hen
+ostrich
+brambling
+goldfinch
+house finch
+junco
+indigo bunting
+robin
+bulbul
+jay
+magpie
+chickadee
+water ouzel
+kite
+bald eagle
+vulture
+great grey owl
+European fire salamander
+common newt
+eft
+spotted salamander
+axolotl
+bullfrog
+tree frog
+tailed frog
+loggerhead
+leatherback turtle
+mud turtle
+terrapin
+box turtle
+banded gecko
+common iguana
+American chameleon
+whiptail
+agama
+frilled lizard
+alligator lizard
+Gila monster
+green lizard
+African chameleon
+Komodo dragon
+African crocodile
+American alligator
+triceratops
+thunder snake
+ringneck snake
+hognose snake
+green snake
+king snake
+garter snake
+water snake
+vine snake
+night snake
+boa constrictor
+rock python
+Indian cobra
+green mamba
+sea snake
+horned viper
+diamondback
+sidewinder
+trilobite
+harvestman
+scorpion
+black and gold garden spider
+barn spider
+garden spider
+black widow
+tarantula
+wolf spider
+tick
+centipede
+black grouse
+ptarmigan
+ruffed grouse
+prairie chicken
+peacock
+quail
+partridge
+African grey
+macaw
+sulphur-crested cockatoo
+lorikeet
+coucal
+bee eater
+hornbill
+hummingbird
+jacamar
+toucan
+drake
+red-breasted merganser
+goose
+black swan
+tusker
+echidna
+platypus
+wallaby
+koala
+wombat
+jellyfish
+sea anemone
+brain coral
+flatworm
+nematode
+conch
+snail
+slug
+sea slug
+chiton
+chambered nautilus
+Dungeness crab
+rock crab
+fiddler crab
+king crab
+American lobster
+spiny lobster
+crayfish
+hermit crab
+isopod
+white stork
+black stork
+spoonbill
+flamingo
+little blue heron
+American egret
+bittern
+crane
+limpkin
+European gallinule
+American coot
+bustard
+ruddy turnstone
+red-backed sandpiper
+redshank
+dowitcher
+oystercatcher
+pelican
+king penguin
+albatross
+grey whale
+killer whale
+dugong
+sea lion
+Chihuahua
+Japanese spaniel
+Maltese dog
+Pekinese
+Shih-Tzu
+Blenheim spaniel
+papillon
+toy terrier
+Rhodesian ridgeback
+Afghan hound
+basset
+beagle
+bloodhound
+bluetick
+black-and-tan coonhound
+Walker hound
+English foxhound
+redbone
+borzoi
+Irish wolfhound
+Italian greyhound
+whippet
+Ibizan hound
+Norwegian elkhound
+otterhound
+Saluki
+Scottish deerhound
+Weimaraner
+Staffordshire bullterrier
+American Staffordshire terrier
+Bedlington terrier
+Border terrier
+Kerry blue terrier
+Irish terrier
+Norfolk terrier
+Norwich terrier
+Yorkshire terrier
+wire-haired fox terrier
+Lakeland terrier
+Sealyham terrier
+Airedale
+cairn
+Australian terrier
+Dandie Dinmont
+Boston bull
+miniature schnauzer
+giant schnauzer
+standard schnauzer
+Scotch terrier
+Tibetan terrier
+silky terrier
+soft-coated wheaten terrier
+West Highland white terrier
+Lhasa
+flat-coated retriever
+curly-coated retriever
+golden retriever
+Labrador retriever
+Chesapeake Bay retriever
+German short-haired pointer
+vizsla
+English setter
+Irish setter
+Gordon setter
+Brittany spaniel
+clumber
+English springer
+Welsh springer spaniel
+cocker spaniel
+Sussex spaniel
+Irish water spaniel
+kuvasz
+schipperke
+groenendael
+malinois
+briard
+kelpie
+komondor
+Old English sheepdog
+Shetland sheepdog
+collie
+Border collie
+Bouvier des Flandres
+Rottweiler
+German shepherd
+Doberman
+miniature pinscher
+Greater Swiss Mountain dog
+Bernese mountain dog
+Appenzeller
+EntleBucher
+boxer
+bull mastiff
+Tibetan mastiff
+French bulldog
+Great Dane
+Saint Bernard
+Eskimo dog
+malamute
+Siberian husky
+dalmatian
+affenpinscher
+basenji
+pug
+Leonberg
+Newfoundland
+Great Pyrenees
+Samoyed
+Pomeranian
+chow
+keeshond
+Brabancon griffon
+Pembroke
+Cardigan
+toy poodle
+miniature poodle
+standard poodle
+Mexican hairless
+timber wolf
+white wolf
+red wolf
+coyote
+dingo
+dhole
+African hunting dog
+hyena
+red fox
+kit fox
+Arctic fox
+grey fox
+tabby
+tiger cat
+Persian cat
+Siamese cat
+Egyptian cat
+cougar
+lynx
+leopard
+snow leopard
+jaguar
+lion
+tiger
+cheetah
+brown bear
+American black bear
+ice bear
+sloth bear
+mongoose
+meerkat
+tiger beetle
+ladybug
+ground beetle
+long-horned beetle
+leaf beetle
+dung beetle
+rhinoceros beetle
+weevil
+fly
+bee
+ant
+grasshopper
+cricket
+walking stick
+cockroach
+mantis
+cicada
+leafhopper
+lacewing
+dragonfly
+damselfly
+admiral
+ringlet
+monarch
+cabbage butterfly
+sulphur butterfly
+lycaenid
+starfish
+sea urchin
+sea cucumber
+wood rabbit
+hare
+Angora
+hamster
+porcupine
+fox squirrel
+marmot
+beaver
+guinea pig
+sorrel
+zebra
+hog
+wild boar
+warthog
+hippopotamus
+ox
+water buffalo
+bison
+ram
+bighorn
+ibex
+hartebeest
+impala
+gazelle
+Arabian camel
+llama
+weasel
+mink
+polecat
+black-footed ferret
+otter
+skunk
+badger
+armadillo
+three-toed sloth
+orangutan
+gorilla
+chimpanzee
+gibbon
+siamang
+guenon
+patas
+baboon
+macaque
+langur
+colobus
+proboscis monkey
+marmoset
+capuchin
+howler monkey
+titi
+spider monkey
+squirrel monkey
+Madagascar cat
+indri
+Indian elephant
+African elephant
+lesser panda
+giant panda
+barracouta
+eel
+coho
+rock beauty
+anemone fish
+sturgeon
+gar
+lionfish
+puffer
+abacus
+abaya
+academic gown
+accordion
+acoustic guitar
+aircraft carrier
+airliner
+airship
+altar
+ambulance
+amphibian
+analog clock
+apiary
+apron
+ashcan
+assault rifle
+backpack
+bakery
+balance beam
+balloon
+ballpoint
+Band Aid
+banjo
+bannister
+barbell
+barber chair
+barbershop
+barn
+barometer
+barrel
+barrow
+baseball
+basketball
+bassinet
+bassoon
+bathing cap
+bath towel
+bathtub
+beach wagon
+beacon
+beaker
+bearskin
+beer bottle
+beer glass
+bell cote
+bib
+bicycle-built-for-two
+bikini
+binder
+binoculars
+birdhouse
+boathouse
+bobsled
+bolo tie
+bonnet
+bookcase
+bookshop
+bottlecap
+bow
+bow tie
+brass
+brassiere
+breakwater
+breastplate
+broom
+bucket
+buckle
+bulletproof vest
+bullet train
+butcher shop
+cab
+caldron
+candle
+cannon
+canoe
+can opener
+cardigan
+car mirror
+carousel
+carpenter's kit
+carton
+car wheel
+cash machine
+cassette
+cassette player
+castle
+catamaran
+CD player
+cello
+cellular telephone
+chain
+chainlink fence
+chain mail
+chain saw
+chest
+chiffonier
+chime
+china cabinet
+Christmas stocking
+church
+cinema
+cleaver
+cliff dwelling
+cloak
+clog
+cocktail shaker
+coffee mug
+coffeepot
+coil
+combination lock
+computer keyboard
+confectionery
+container ship
+convertible
+corkscrew
+cornet
+cowboy boot
+cowboy hat
+cradle
+crane
+crash helmet
+crate
+crib
+Crock Pot
+croquet ball
+crutch
+cuirass
+dam
+desk
+desktop computer
+dial telephone
+diaper
+digital clock
+digital watch
+dining table
+dishrag
+dishwasher
+disk brake
+dock
+dogsled
+dome
+doormat
+drilling platform
+drum
+drumstick
+dumbbell
+Dutch oven
+electric fan
+electric guitar
+electric locomotive
+entertainment center
+envelope
+espresso maker
+face powder
+feather boa
+file
+fireboat
+fire engine
+fire screen
+flagpole
+flute
+folding chair
+football helmet
+forklift
+fountain
+fountain pen
+four-poster
+freight car
+French horn
+frying pan
+fur coat
+garbage truck
+gasmask
+gas pump
+goblet
+go-kart
+golf ball
+golfcart
+gondola
+gong
+gown
+grand piano
+greenhouse
+grille
+grocery store
+guillotine
+hair slide
+hair spray
+half track
+hammer
+hamper
+hand blower
+hand-held computer
+handkerchief
+hard disc
+harmonica
+harp
+harvester
+hatchet
+holster
+home theater
+honeycomb
+hook
+hoopskirt
+horizontal bar
+horse cart
+hourglass
+iPod
+iron
+jack-o'-lantern
+jean
+jeep
+jersey
+jigsaw puzzle
+jinrikisha
+joystick
+kimono
+knee pad
+knot
+lab coat
+ladle
+lampshade
+laptop
+lawn mower
+lens cap
+letter opener
+library
+lifeboat
+lighter
+limousine
+liner
+lipstick
+Loafer
+lotion
+loudspeaker
+loupe
+lumbermill
+magnetic compass
+mailbag
+mailbox
+maillot
+maillot
+manhole cover
+maraca
+marimba
+mask
+matchstick
+maypole
+maze
+measuring cup
+medicine chest
+megalith
+microphone
+microwave
+military uniform
+milk can
+minibus
+miniskirt
+minivan
+missile
+mitten
+mixing bowl
+mobile home
+Model T
+modem
+monastery
+monitor
+moped
+mortar
+mortarboard
+mosque
+mosquito net
+motor scooter
+mountain bike
+mountain tent
+mouse
+mousetrap
+moving van
+muzzle
+nail
+neck brace
+necklace
+nipple
+notebook
+obelisk
+oboe
+ocarina
+odometer
+oil filter
+organ
+oscilloscope
+overskirt
+oxcart
+oxygen mask
+packet
+paddle
+paddlewheel
+padlock
+paintbrush
+pajama
+palace
+panpipe
+paper towel
+parachute
+parallel bars
+park bench
+parking meter
+passenger car
+patio
+pay-phone
+pedestal
+pencil box
+pencil sharpener
+perfume
+Petri dish
+photocopier
+pick
+pickelhaube
+picket fence
+pickup
+pier
+piggy bank
+pill bottle
+pillow
+ping-pong ball
+pinwheel
+pirate
+pitcher
+plane
+planetarium
+plastic bag
+plate rack
+plow
+plunger
+Polaroid camera
+pole
+police van
+poncho
+pool table
+pop bottle
+pot
+potter's wheel
+power drill
+prayer rug
+printer
+prison
+projectile
+projector
+puck
+punching bag
+purse
+quill
+quilt
+racer
+racket
+radiator
+radio
+radio telescope
+rain barrel
+recreational vehicle
+reel
+reflex camera
+refrigerator
+remote control
+restaurant
+revolver
+rifle
+rocking chair
+rotisserie
+rubber eraser
+rugby ball
+rule
+running shoe
+safe
+safety pin
+saltshaker
+sandal
+sarong
+sax
+scabbard
+scale
+school bus
+schooner
+scoreboard
+screen
+screw
+screwdriver
+seat belt
+sewing machine
+shield
+shoe shop
+shoji
+shopping basket
+shopping cart
+shovel
+shower cap
+shower curtain
+ski
+ski mask
+sleeping bag
+slide rule
+sliding door
+slot
+snorkel
+snowmobile
+snowplow
+soap dispenser
+soccer ball
+sock
+solar dish
+sombrero
+soup bowl
+space bar
+space heater
+space shuttle
+spatula
+speedboat
+spider web
+spindle
+sports car
+spotlight
+stage
+steam locomotive
+steel arch bridge
+steel drum
+stethoscope
+stole
+stone wall
+stopwatch
+stove
+strainer
+streetcar
+stretcher
+studio couch
+stupa
+submarine
+suit
+sundial
+sunglass
+sunglasses
+sunscreen
+suspension bridge
+swab
+sweatshirt
+swimming trunks
+swing
+switch
+syringe
+table lamp
+tank
+tape player
+teapot
+teddy
+television
+tennis ball
+thatch
+theater curtain
+thimble
+thresher
+throne
+tile roof
+toaster
+tobacco shop
+toilet seat
+torch
+totem pole
+tow truck
+toyshop
+tractor
+trailer truck
+tray
+trench coat
+tricycle
+trimaran
+tripod
+triumphal arch
+trolleybus
+trombone
+tub
+turnstile
+typewriter keyboard
+umbrella
+unicycle
+upright
+vacuum
+vase
+vault
+velvet
+vending machine
+vestment
+viaduct
+violin
+volleyball
+waffle iron
+wall clock
+wallet
+wardrobe
+warplane
+washbasin
+washer
+water bottle
+water jug
+water tower
+whiskey jug
+whistle
+wig
+window screen
+window shade
+Windsor tie
+wine bottle
+wing
+wok
+wooden spoon
+wool
+worm fence
+wreck
+yawl
+yurt
+web site
+comic book
+crossword puzzle
+street sign
+traffic light
+book jacket
+menu
+plate
+guacamole
+consomme
+hot pot
+trifle
+ice cream
+ice lolly
+French loaf
+bagel
+pretzel
+cheeseburger
+hotdog
+mashed potato
+head cabbage
+broccoli
+cauliflower
+zucchini
+spaghetti squash
+acorn squash
+butternut squash
+cucumber
+artichoke
+bell pepper
+cardoon
+mushroom
+Granny Smith
+strawberry
+orange
+lemon
+fig
+pineapple
+banana
+jackfruit
+custard apple
+pomegranate
+hay
+carbonara
+chocolate sauce
+dough
+meat loaf
+pizza
+potpie
+burrito
+red wine
+espresso
+cup
+eggnog
+alp
+bubble
+cliff
+coral reef
+geyser
+lakeside
+promontory
+sandbar
+seashore
+valley
+volcano
+ballplayer
+groom
+scuba diver
+rapeseed
+daisy
+yellow lady's slipper
+corn
+acorn
+hip
+buckeye
+coral fungus
+agaric
+gyromitra
+stinkhorn
+earthstar
+hen-of-the-woods
+bolete
+ear
+toilet tissue
diff --git a/tensorflow/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj b/tensorflow/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj
index 9b5c2b32a8f176e58a2d28d11ee3e41ef875e722..bbab17b400c8e7e8d45503c088aab402b78d7545 100644
--- a/tensorflow/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj
+++ b/tensorflow/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj
@@ -234,6 +234,7 @@
 				SWIFT_ACTIVE_COMPILATION_CONDITIONS = DEBUG;
 				SWIFT_OPTIMIZATION_LEVEL = "-Onone";
 				SWIFT_VERSION = 3.0;
+				VALID_ARCHS = arm64;
 			};
 			name = Debug;
 		};
@@ -253,6 +254,7 @@
 				PRODUCT_NAME = "$(TARGET_NAME)";
 				SWIFT_OPTIMIZATION_LEVEL = "-Owholemodule";
 				SWIFT_VERSION = 3.0;
+				VALID_ARCHS = arm64;
 			};
 			name = Release;
 		};
diff --git a/tensorflow/lite/examples/ios/download_models.sh b/tensorflow/lite/examples/ios/download_models.sh
index 4828617d95e94c1b6ad811e04d3b94b659bd8f74..a450aba042e9975e1282453160f841b4ff55e0b9 100755
--- a/tensorflow/lite/examples/ios/download_models.sh
+++ b/tensorflow/lite/examples/ios/download_models.sh
@@ -17,42 +17,31 @@
 set -ex
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-MODELS_URL="https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_1.0_224_ios_lite_float_2017_11_08.zip"
-QUANTIZED_MODELS_URL="https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip"
+FLOAT_MODEL_URL="http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224.tgz"
+QUANTIZED_MODEL_URL="http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz"
 DOWNLOADS_DIR=$(mktemp -d)
 
-cd $SCRIPT_DIR
+cd "$SCRIPT_DIR"
 
 download_and_extract() {
-  local usage="Usage: download_and_extract URL DIR"
-  local url="${1:?${usage}}"
-  local dir="${2:?${usage}}"
+  local url="$1"
+  local dir="$2"
   echo "downloading ${url}" >&2
   mkdir -p "${dir}"
   tempdir=$(mktemp -d)
-  tempdir2=$(mktemp -d)
 
-  curl -L ${url} > ${tempdir}/zipped.zip
-  unzip ${tempdir}/zipped.zip -d ${tempdir2}
-
-  # If the zip file contains nested directories, extract the files from the
-  # inner directory.
-  if ls ${tempdir2}/*/* 1> /dev/null 2>&1; then
-    # unzip has no strip components, so unzip to a temp dir, and move the
-    # files we want from the tempdir to destination.
-    cp -R ${tempdir2}/*/* ${dir}/
-  else
-    cp -R ${tempdir2}/* ${dir}/
-  fi
-  rm -rf ${tempdir2} ${tempdir}
+  curl -L ${url} > ${tempdir}/archive.tgz
+  cd ${dir}
+  tar zxvf ${tempdir}/archive.tgz
+  rm -rf ${tempdir}
 }
 
-download_and_extract "${MODELS_URL}" "${DOWNLOADS_DIR}/models"
-download_and_extract "${QUANTIZED_MODELS_URL}" "${DOWNLOADS_DIR}/quantized_models"
-
-file ${DOWNLOADS_DIR}/models
+download_and_extract "${FLOAT_MODEL_URL}" "${DOWNLOADS_DIR}/float_model"
+download_and_extract "${QUANTIZED_MODEL_URL}" "${DOWNLOADS_DIR}/quantized_model"
 
-cp ${DOWNLOADS_DIR}/models/models/* simple/data/
-cp ${DOWNLOADS_DIR}/models/models/* camera/data/
-cp "${DOWNLOADS_DIR}/quantized_models/mobilenet_quant_v1_224.tflite" \
+cd "$SCRIPT_DIR"
+cp "${DOWNLOADS_DIR}/float_model/mobilenet_v1_1.0_224.tflite" "simple/data/mobilenet_v1_1.0_224.tflite"
+cp "${DOWNLOADS_DIR}/float_model/mobilenet_v1_1.0_224.tflite" "camera/data/mobilenet_v1_1.0_224.tflite"
+cp "${DOWNLOADS_DIR}/quantized_model/mobilenet_v1_1.0_224_quant.tflite" \
    'camera/data/mobilenet_quant_v1_224.tflite'
+echo "Done"
diff --git a/tensorflow/lite/examples/ios/simple/data/labels.txt b/tensorflow/lite/examples/ios/simple/data/labels.txt
new file mode 100644
index 0000000000000000000000000000000000000000..572eccf90087c1c19874e40b950c1610f59cc9c2
--- /dev/null
+++ b/tensorflow/lite/examples/ios/simple/data/labels.txt
@@ -0,0 +1,1001 @@
+dummy
+tench
+goldfish
+great white shark
+tiger shark
+hammerhead
+electric ray
+stingray
+cock
+hen
+ostrich
+brambling
+goldfinch
+house finch
+junco
+indigo bunting
+robin
+bulbul
+jay
+magpie
+chickadee
+water ouzel
+kite
+bald eagle
+vulture
+great grey owl
+European fire salamander
+common newt
+eft
+spotted salamander
+axolotl
+bullfrog
+tree frog
+tailed frog
+loggerhead
+leatherback turtle
+mud turtle
+terrapin
+box turtle
+banded gecko
+common iguana
+American chameleon
+whiptail
+agama
+frilled lizard
+alligator lizard
+Gila monster
+green lizard
+African chameleon
+Komodo dragon
+African crocodile
+American alligator
+triceratops
+thunder snake
+ringneck snake
+hognose snake
+green snake
+king snake
+garter snake
+water snake
+vine snake
+night snake
+boa constrictor
+rock python
+Indian cobra
+green mamba
+sea snake
+horned viper
+diamondback
+sidewinder
+trilobite
+harvestman
+scorpion
+black and gold garden spider
+barn spider
+garden spider
+black widow
+tarantula
+wolf spider
+tick
+centipede
+black grouse
+ptarmigan
+ruffed grouse
+prairie chicken
+peacock
+quail
+partridge
+African grey
+macaw
+sulphur-crested cockatoo
+lorikeet
+coucal
+bee eater
+hornbill
+hummingbird
+jacamar
+toucan
+drake
+red-breasted merganser
+goose
+black swan
+tusker
+echidna
+platypus
+wallaby
+koala
+wombat
+jellyfish
+sea anemone
+brain coral
+flatworm
+nematode
+conch
+snail
+slug
+sea slug
+chiton
+chambered nautilus
+Dungeness crab
+rock crab
+fiddler crab
+king crab
+American lobster
+spiny lobster
+crayfish
+hermit crab
+isopod
+white stork
+black stork
+spoonbill
+flamingo
+little blue heron
+American egret
+bittern
+crane
+limpkin
+European gallinule
+American coot
+bustard
+ruddy turnstone
+red-backed sandpiper
+redshank
+dowitcher
+oystercatcher
+pelican
+king penguin
+albatross
+grey whale
+killer whale
+dugong
+sea lion
+Chihuahua
+Japanese spaniel
+Maltese dog
+Pekinese
+Shih-Tzu
+Blenheim spaniel
+papillon
+toy terrier
+Rhodesian ridgeback
+Afghan hound
+basset
+beagle
+bloodhound
+bluetick
+black-and-tan coonhound
+Walker hound
+English foxhound
+redbone
+borzoi
+Irish wolfhound
+Italian greyhound
+whippet
+Ibizan hound
+Norwegian elkhound
+otterhound
+Saluki
+Scottish deerhound
+Weimaraner
+Staffordshire bullterrier
+American Staffordshire terrier
+Bedlington terrier
+Border terrier
+Kerry blue terrier
+Irish terrier
+Norfolk terrier
+Norwich terrier
+Yorkshire terrier
+wire-haired fox terrier
+Lakeland terrier
+Sealyham terrier
+Airedale
+cairn
+Australian terrier
+Dandie Dinmont
+Boston bull
+miniature schnauzer
+giant schnauzer
+standard schnauzer
+Scotch terrier
+Tibetan terrier
+silky terrier
+soft-coated wheaten terrier
+West Highland white terrier
+Lhasa
+flat-coated retriever
+curly-coated retriever
+golden retriever
+Labrador retriever
+Chesapeake Bay retriever
+German short-haired pointer
+vizsla
+English setter
+Irish setter
+Gordon setter
+Brittany spaniel
+clumber
+English springer
+Welsh springer spaniel
+cocker spaniel
+Sussex spaniel
+Irish water spaniel
+kuvasz
+schipperke
+groenendael
+malinois
+briard
+kelpie
+komondor
+Old English sheepdog
+Shetland sheepdog
+collie
+Border collie
+Bouvier des Flandres
+Rottweiler
+German shepherd
+Doberman
+miniature pinscher
+Greater Swiss Mountain dog
+Bernese mountain dog
+Appenzeller
+EntleBucher
+boxer
+bull mastiff
+Tibetan mastiff
+French bulldog
+Great Dane
+Saint Bernard
+Eskimo dog
+malamute
+Siberian husky
+dalmatian
+affenpinscher
+basenji
+pug
+Leonberg
+Newfoundland
+Great Pyrenees
+Samoyed
+Pomeranian
+chow
+keeshond
+Brabancon griffon
+Pembroke
+Cardigan
+toy poodle
+miniature poodle
+standard poodle
+Mexican hairless
+timber wolf
+white wolf
+red wolf
+coyote
+dingo
+dhole
+African hunting dog
+hyena
+red fox
+kit fox
+Arctic fox
+grey fox
+tabby
+tiger cat
+Persian cat
+Siamese cat
+Egyptian cat
+cougar
+lynx
+leopard
+snow leopard
+jaguar
+lion
+tiger
+cheetah
+brown bear
+American black bear
+ice bear
+sloth bear
+mongoose
+meerkat
+tiger beetle
+ladybug
+ground beetle
+long-horned beetle
+leaf beetle
+dung beetle
+rhinoceros beetle
+weevil
+fly
+bee
+ant
+grasshopper
+cricket
+walking stick
+cockroach
+mantis
+cicada
+leafhopper
+lacewing
+dragonfly
+damselfly
+admiral
+ringlet
+monarch
+cabbage butterfly
+sulphur butterfly
+lycaenid
+starfish
+sea urchin
+sea cucumber
+wood rabbit
+hare
+Angora
+hamster
+porcupine
+fox squirrel
+marmot
+beaver
+guinea pig
+sorrel
+zebra
+hog
+wild boar
+warthog
+hippopotamus
+ox
+water buffalo
+bison
+ram
+bighorn
+ibex
+hartebeest
+impala
+gazelle
+Arabian camel
+llama
+weasel
+mink
+polecat
+black-footed ferret
+otter
+skunk
+badger
+armadillo
+three-toed sloth
+orangutan
+gorilla
+chimpanzee
+gibbon
+siamang
+guenon
+patas
+baboon
+macaque
+langur
+colobus
+proboscis monkey
+marmoset
+capuchin
+howler monkey
+titi
+spider monkey
+squirrel monkey
+Madagascar cat
+indri
+Indian elephant
+African elephant
+lesser panda
+giant panda
+barracouta
+eel
+coho
+rock beauty
+anemone fish
+sturgeon
+gar
+lionfish
+puffer
+abacus
+abaya
+academic gown
+accordion
+acoustic guitar
+aircraft carrier
+airliner
+airship
+altar
+ambulance
+amphibian
+analog clock
+apiary
+apron
+ashcan
+assault rifle
+backpack
+bakery
+balance beam
+balloon
+ballpoint
+Band Aid
+banjo
+bannister
+barbell
+barber chair
+barbershop
+barn
+barometer
+barrel
+barrow
+baseball
+basketball
+bassinet
+bassoon
+bathing cap
+bath towel
+bathtub
+beach wagon
+beacon
+beaker
+bearskin
+beer bottle
+beer glass
+bell cote
+bib
+bicycle-built-for-two
+bikini
+binder
+binoculars
+birdhouse
+boathouse
+bobsled
+bolo tie
+bonnet
+bookcase
+bookshop
+bottlecap
+bow
+bow tie
+brass
+brassiere
+breakwater
+breastplate
+broom
+bucket
+buckle
+bulletproof vest
+bullet train
+butcher shop
+cab
+caldron
+candle
+cannon
+canoe
+can opener
+cardigan
+car mirror
+carousel
+carpenter's kit
+carton
+car wheel
+cash machine
+cassette
+cassette player
+castle
+catamaran
+CD player
+cello
+cellular telephone
+chain
+chainlink fence
+chain mail
+chain saw
+chest
+chiffonier
+chime
+china cabinet
+Christmas stocking
+church
+cinema
+cleaver
+cliff dwelling
+cloak
+clog
+cocktail shaker
+coffee mug
+coffeepot
+coil
+combination lock
+computer keyboard
+confectionery
+container ship
+convertible
+corkscrew
+cornet
+cowboy boot
+cowboy hat
+cradle
+crane
+crash helmet
+crate
+crib
+Crock Pot
+croquet ball
+crutch
+cuirass
+dam
+desk
+desktop computer
+dial telephone
+diaper
+digital clock
+digital watch
+dining table
+dishrag
+dishwasher
+disk brake
+dock
+dogsled
+dome
+doormat
+drilling platform
+drum
+drumstick
+dumbbell
+Dutch oven
+electric fan
+electric guitar
+electric locomotive
+entertainment center
+envelope
+espresso maker
+face powder
+feather boa
+file
+fireboat
+fire engine
+fire screen
+flagpole
+flute
+folding chair
+football helmet
+forklift
+fountain
+fountain pen
+four-poster
+freight car
+French horn
+frying pan
+fur coat
+garbage truck
+gasmask
+gas pump
+goblet
+go-kart
+golf ball
+golfcart
+gondola
+gong
+gown
+grand piano
+greenhouse
+grille
+grocery store
+guillotine
+hair slide
+hair spray
+half track
+hammer
+hamper
+hand blower
+hand-held computer
+handkerchief
+hard disc
+harmonica
+harp
+harvester
+hatchet
+holster
+home theater
+honeycomb
+hook
+hoopskirt
+horizontal bar
+horse cart
+hourglass
+iPod
+iron
+jack-o'-lantern
+jean
+jeep
+jersey
+jigsaw puzzle
+jinrikisha
+joystick
+kimono
+knee pad
+knot
+lab coat
+ladle
+lampshade
+laptop
+lawn mower
+lens cap
+letter opener
+library
+lifeboat
+lighter
+limousine
+liner
+lipstick
+Loafer
+lotion
+loudspeaker
+loupe
+lumbermill
+magnetic compass
+mailbag
+mailbox
+maillot
+maillot
+manhole cover
+maraca
+marimba
+mask
+matchstick
+maypole
+maze
+measuring cup
+medicine chest
+megalith
+microphone
+microwave
+military uniform
+milk can
+minibus
+miniskirt
+minivan
+missile
+mitten
+mixing bowl
+mobile home
+Model T
+modem
+monastery
+monitor
+moped
+mortar
+mortarboard
+mosque
+mosquito net
+motor scooter
+mountain bike
+mountain tent
+mouse
+mousetrap
+moving van
+muzzle
+nail
+neck brace
+necklace
+nipple
+notebook
+obelisk
+oboe
+ocarina
+odometer
+oil filter
+organ
+oscilloscope
+overskirt
+oxcart
+oxygen mask
+packet
+paddle
+paddlewheel
+padlock
+paintbrush
+pajama
+palace
+panpipe
+paper towel
+parachute
+parallel bars
+park bench
+parking meter
+passenger car
+patio
+pay-phone
+pedestal
+pencil box
+pencil sharpener
+perfume
+Petri dish
+photocopier
+pick
+pickelhaube
+picket fence
+pickup
+pier
+piggy bank
+pill bottle
+pillow
+ping-pong ball
+pinwheel
+pirate
+pitcher
+plane
+planetarium
+plastic bag
+plate rack
+plow
+plunger
+Polaroid camera
+pole
+police van
+poncho
+pool table
+pop bottle
+pot
+potter's wheel
+power drill
+prayer rug
+printer
+prison
+projectile
+projector
+puck
+punching bag
+purse
+quill
+quilt
+racer
+racket
+radiator
+radio
+radio telescope
+rain barrel
+recreational vehicle
+reel
+reflex camera
+refrigerator
+remote control
+restaurant
+revolver
+rifle
+rocking chair
+rotisserie
+rubber eraser
+rugby ball
+rule
+running shoe
+safe
+safety pin
+saltshaker
+sandal
+sarong
+sax
+scabbard
+scale
+school bus
+schooner
+scoreboard
+screen
+screw
+screwdriver
+seat belt
+sewing machine
+shield
+shoe shop
+shoji
+shopping basket
+shopping cart
+shovel
+shower cap
+shower curtain
+ski
+ski mask
+sleeping bag
+slide rule
+sliding door
+slot
+snorkel
+snowmobile
+snowplow
+soap dispenser
+soccer ball
+sock
+solar dish
+sombrero
+soup bowl
+space bar
+space heater
+space shuttle
+spatula
+speedboat
+spider web
+spindle
+sports car
+spotlight
+stage
+steam locomotive
+steel arch bridge
+steel drum
+stethoscope
+stole
+stone wall
+stopwatch
+stove
+strainer
+streetcar
+stretcher
+studio couch
+stupa
+submarine
+suit
+sundial
+sunglass
+sunglasses
+sunscreen
+suspension bridge
+swab
+sweatshirt
+swimming trunks
+swing
+switch
+syringe
+table lamp
+tank
+tape player
+teapot
+teddy
+television
+tennis ball
+thatch
+theater curtain
+thimble
+thresher
+throne
+tile roof
+toaster
+tobacco shop
+toilet seat
+torch
+totem pole
+tow truck
+toyshop
+tractor
+trailer truck
+tray
+trench coat
+tricycle
+trimaran
+tripod
+triumphal arch
+trolleybus
+trombone
+tub
+turnstile
+typewriter keyboard
+umbrella
+unicycle
+upright
+vacuum
+vase
+vault
+velvet
+vending machine
+vestment
+viaduct
+violin
+volleyball
+waffle iron
+wall clock
+wallet
+wardrobe
+warplane
+washbasin
+washer
+water bottle
+water jug
+water tower
+whiskey jug
+whistle
+wig
+window screen
+window shade
+Windsor tie
+wine bottle
+wing
+wok
+wooden spoon
+wool
+worm fence
+wreck
+yawl
+yurt
+web site
+comic book
+crossword puzzle
+street sign
+traffic light
+book jacket
+menu
+plate
+guacamole
+consomme
+hot pot
+trifle
+ice cream
+ice lolly
+French loaf
+bagel
+pretzel
+cheeseburger
+hotdog
+mashed potato
+head cabbage
+broccoli
+cauliflower
+zucchini
+spaghetti squash
+acorn squash
+butternut squash
+cucumber
+artichoke
+bell pepper
+cardoon
+mushroom
+Granny Smith
+strawberry
+orange
+lemon
+fig
+pineapple
+banana
+jackfruit
+custard apple
+pomegranate
+hay
+carbonara
+chocolate sauce
+dough
+meat loaf
+pizza
+potpie
+burrito
+red wine
+espresso
+cup
+eggnog
+alp
+bubble
+cliff
+coral reef
+geyser
+lakeside
+promontory
+sandbar
+seashore
+valley
+volcano
+ballplayer
+groom
+scuba diver
+rapeseed
+daisy
+yellow lady's slipper
+corn
+acorn
+hip
+buckeye
+coral fungus
+agaric
+gyromitra
+stinkhorn
+earthstar
+hen-of-the-woods
+bolete
+ear
+toilet tissue
diff --git a/tensorflow/lite/examples/label_image/label_image.md b/tensorflow/lite/examples/label_image/label_image.md
index fd9f49918b4494eab845da7716a350ad6246f532..178f5b9d3012206571b6fcf8af1d2416df9a42e5 100644
--- a/tensorflow/lite/examples/label_image/label_image.md
+++ b/tensorflow/lite/examples/label_image/label_image.md
@@ -40,7 +40,7 @@ To run it. Prepare `./mobilenet_quant_v1_224.tflite`, `./grace_hopper.bmp`, and
 
 Run it:
 ```
-> ./label_image                                        
+> ./label_image
 Loaded model ./mobilenet_quant_v1_224.tflite
 resolved reporter
 invoked
@@ -51,9 +51,9 @@ average time: 100.986 ms
 0.0235294: 514 cornet
 0.0196078: 835 suit
 ```
-Run `interpreter->Invoker()` 100 times:
+Run `interpreter->Invoke()` 100 times:
 ```
-> ./label_image   -c 100                               
+> ./label_image   -c 100
 Loaded model ./mobilenet_quant_v1_224.tflite
 resolved reporter
 invoked
diff --git a/tensorflow/lite/experimental/c/BUILD b/tensorflow/lite/experimental/c/BUILD
index cde53e283830aca9c7990e3d8c4901f997621bc2..ac71c9bd34e524c41e70ec2e724ed30680b63932 100644
--- a/tensorflow/lite/experimental/c/BUILD
+++ b/tensorflow/lite/experimental/c/BUILD
@@ -20,15 +20,13 @@ tflite_cc_shared_object(
     name = "libtensorflowlite_c.so",
     linkopts = select({
         "//tensorflow:darwin": [
-            "-Wl,-exported_symbols_list",  # This line must be directly followed by the exported_symbols.lds file
-            "$(location //tensorflow/lite/experimental/c:exported_symbols.lds)",
+            "-Wl,-exported_symbols_list,$(location //tensorflow/lite/experimental/c:exported_symbols.lds)",
             "-Wl,-install_name,@rpath/libtensorflowlite_c.so",
         ],
         "//tensorflow:windows": [],
         "//conditions:default": [
             "-z defs",
-            "-Wl,--version-script",  #  This line must be directly followed by the version_script.lds file
-            "$(location //tensorflow/lite/experimental/c:version_script.lds)",
+            "-Wl,--version-script,$(location //tensorflow/lite/experimental/c:version_script.lds)",
         ],
     }),
     deps = [
@@ -66,7 +64,6 @@ cc_library(
         ":c_api_internal",
         "//tensorflow/lite:context",
         "//tensorflow/lite:framework",
-        "//tensorflow/lite:schema_fbs_version",
         "//tensorflow/lite/kernels:builtin_ops",
     ],
 )
@@ -94,7 +91,6 @@ cc_test(
     deps = [
         ":c_api",
         "//tensorflow/lite:context",
-        "//tensorflow/lite:kernel_api",
         "//tensorflow/lite/testing:util",
         "@com_google_googletest//:gtest",
     ],
diff --git a/tensorflow/lite/experimental/examples/lstm/BUILD b/tensorflow/lite/experimental/examples/lstm/BUILD
index 0c351ee4eccee515ed34ec5e8607914f7064ffbf..61587a0ba42ed742cfd4203a481dbb596fa0502e 100644
--- a/tensorflow/lite/experimental/examples/lstm/BUILD
+++ b/tensorflow/lite/experimental/examples/lstm/BUILD
@@ -5,14 +5,29 @@ package(default_visibility = ["//tensorflow:internal"])
 load("//tensorflow:tensorflow.bzl", "py_test")
 
 py_library(
-    name = "tflite_lstm",
-    srcs = ["tflite_lstm.py"],
+    name = "rnn",
+    srcs = ["rnn.py"],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/lite/python:lite",
+        "//tensorflow/lite/python:op_hint",
         "//tensorflow/python:framework",
+        "//tensorflow/python:layers_base",
+        "//tensorflow/python:rnn",
+        "//tensorflow/python:rnn_cell",
+        "@six_archive//:six",
+    ],
+)
+
+py_library(
+    name = "rnn_cell",
+    srcs = ["rnn_cell.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/lite/python:op_hint",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:rnn_cell",
         "@six_archive//:six",
     ],
 )
@@ -27,7 +42,54 @@ py_test(
         "no_pip",
     ],
     deps = [
-        ":tflite_lstm",
+        ":rnn",
+        ":rnn_cell",
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/examples/tutorials/mnist:input_data",
+        "//tensorflow/lite/python:lite",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:platform",
+        "//tensorflow/python/tools:optimize_for_inference_lib",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+    ],
+)
+
+py_test(
+    name = "unidirectional_sequence_rnn_test",
+    size = "large",
+    srcs = ["unidirectional_sequence_rnn_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+    ],
+    deps = [
+        ":rnn",
+        ":rnn_cell",
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/examples/tutorials/mnist:input_data",
+        "//tensorflow/lite/python:lite",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform",
+        "//tensorflow/python/tools:optimize_for_inference",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+    ],
+)
+
+py_test(
+    name = "bidirectional_sequence_lstm_test",
+    size = "large",
+    srcs = ["bidirectional_sequence_lstm_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+    ],
+    deps = [
+        ":rnn",
+        ":rnn_cell",
         "//tensorflow:tensorflow_py",
         "//tensorflow/examples/tutorials/mnist:input_data",
         "//tensorflow/lite/python:lite",
@@ -38,3 +100,36 @@ py_test(
         "@six_archive//:six",
     ],
 )
+
+py_test(
+    name = "bidirectional_sequence_rnn_test",
+    size = "large",
+    srcs = ["bidirectional_sequence_rnn_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+    ],
+    deps = [
+        ":rnn",
+        ":rnn_cell",
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/examples/tutorials/mnist:input_data",
+        "//tensorflow/lite/python:lite",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform",
+        "//tensorflow/python/tools:optimize_for_inference",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+    ],
+)
+
+py_library(
+    name = "tflite_lstm_ops",
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":rnn",
+        ":rnn_cell",
+    ],
+)
diff --git a/tensorflow/lite/experimental/examples/lstm/bidirectional_sequence_lstm_test.py b/tensorflow/lite/experimental/examples/lstm/bidirectional_sequence_lstm_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..867932e9153dc4b5eb751c49e8a5e350a2263183
--- /dev/null
+++ b/tensorflow/lite/experimental/examples/lstm/bidirectional_sequence_lstm_test.py
@@ -0,0 +1,231 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import tempfile
+import numpy as np
+import tensorflow as tf
+
+from tensorflow.examples.tutorials.mnist import input_data
+from tensorflow.lite.experimental.examples.lstm.rnn import bidirectional_dynamic_rnn
+from tensorflow.lite.python.op_hint import convert_op_hints_to_stubs
+from tensorflow.lite.python.op_hint import find_all_hinted_output_nodes
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+from tensorflow.python.tools import optimize_for_inference_lib
+
+# Number of steps to train model.
+TRAIN_STEPS = 1
+
+CONFIG = tf.ConfigProto(device_count={"GPU": 0})
+
+
+class BidirectionalSequenceLstmTest(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    tf.reset_default_graph()
+    # Import MNIST dataset
+    self.mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)
+
+    # Define constants
+    # Unrolled through 28 time steps
+    self.time_steps = 28
+    # Rows of 28 pixels
+    self.n_input = 28
+    # Learning rate for Adam optimizer
+    self.learning_rate = 0.001
+    # MNIST is meant to be classified in 10 classes(0-9).
+    self.n_classes = 10
+    # Batch size
+    self.batch_size = 16
+    # Lstm Units.
+    self.num_units = 16
+
+  def buildLstmLayer(self):
+    return tf.nn.rnn_cell.MultiRNNCell([
+        tf.lite.experimental.nn.TFLiteLSTMCell(
+            self.num_units, use_peepholes=True, forget_bias=0, name="rnn1"),
+        tf.lite.experimental.nn.TFLiteLSTMCell(
+            self.num_units, num_proj=8, forget_bias=0, name="rnn2"),
+        tf.lite.experimental.nn.TFLiteLSTMCell(
+            self.num_units // 2,
+            use_peepholes=True,
+            num_proj=8,
+            forget_bias=0,
+            name="rnn3"),
+        tf.lite.experimental.nn.TFLiteLSTMCell(
+            self.num_units, forget_bias=0, name="rnn4")
+    ])
+
+  def buildModel(self, fw_lstm_layer, bw_lstm_layer, is_dynamic_rnn):
+    # Weights and biases for output softmax layer.
+    out_weights = tf.Variable(
+        tf.random_normal([self.num_units * 2, self.n_classes]))
+    out_bias = tf.Variable(tf.random_normal([self.n_classes]))
+
+    # input image placeholder
+    x = tf.placeholder(
+        "float", [None, self.time_steps, self.n_input], name="INPUT_IMAGE")
+
+    if is_dynamic_rnn:
+      lstm_inputs = tf.transpose(x, [1, 0, 2])
+      outputs, _ = bidirectional_dynamic_rnn(
+          fw_lstm_layer,
+          bw_lstm_layer,
+          lstm_inputs,
+          dtype="float32",
+          time_major=True)
+      fw_outputs, bw_outputs = outputs
+      output = tf.concat([fw_outputs, bw_outputs], 2)
+      output = tf.unstack(output, axis=0)
+      output = output[-1]
+    else:
+      lstm_input = tf.unstack(x, self.time_steps, 1)
+      outputs, _, _ = tf.nn.static_bidirectional_rnn(
+          fw_lstm_layer, bw_lstm_layer, lstm_input, dtype="float32")
+      output = outputs[-1]
+
+    # Compute logits by multiplying output of shape [batch_size,num_units*2]
+    # by the softmax layer's out_weight of shape [num_units*2,n_classes]
+    # plus out_bias
+    prediction = tf.matmul(output, out_weights) + out_bias
+    output_class = tf.nn.softmax(prediction, name="OUTPUT_CLASS")
+
+    return x, prediction, output_class
+
+  def trainModel(self, x, prediction, output_class, sess):
+    # input label placeholder
+    y = tf.placeholder("float", [None, self.n_classes])
+    # Loss function
+    loss = tf.reduce_mean(
+        tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y))
+    # Optimization
+    opt = tf.train.AdamOptimizer(
+        learning_rate=self.learning_rate).minimize(loss)
+
+    # Initialize variables
+    init = tf.global_variables_initializer()
+    sess.run(init)
+    for _ in range(TRAIN_STEPS):
+      batch_x, batch_y = self.mnist.train.next_batch(
+          batch_size=self.batch_size, shuffle=False)
+
+      batch_x = batch_x.reshape((self.batch_size, self.time_steps,
+                                 self.n_input))
+      sess.run(opt, feed_dict={x: batch_x, y: batch_y})
+
+  def saveAndRestoreModel(self, fw_lstm_layer, bw_lstm_layer, sess, saver,
+                          is_dynamic_rnn):
+    model_dir = tempfile.mkdtemp()
+    saver.save(sess, model_dir)
+
+    # Reset the graph.
+    tf.reset_default_graph()
+    x, prediction, output_class = self.buildModel(fw_lstm_layer, bw_lstm_layer,
+                                                  is_dynamic_rnn)
+
+    new_sess = tf.Session(config=CONFIG)
+    saver = tf.train.Saver()
+    saver.restore(new_sess, model_dir)
+    return x, prediction, output_class, new_sess
+
+  def getInferenceResult(self, x, output_class, sess):
+    b1, _ = self.mnist.train.next_batch(batch_size=1)
+    sample_input = np.reshape(b1, (1, self.time_steps, self.n_input))
+
+    expected_output = sess.run(output_class, feed_dict={x: sample_input})
+    # It is important to keep all the ophint output nodes.
+    hinted_outputs_nodes = find_all_hinted_output_nodes(sess)
+    hinted_outputs_nodes.append(output_class.op.name)
+    frozen_graph = tf.graph_util.convert_variables_to_constants(
+        sess, sess.graph_def, hinted_outputs_nodes)
+    return sample_input, expected_output, frozen_graph
+
+  def tfliteInvoke(self, graph, test_inputs, outputs):
+    tf.reset_default_graph()
+    # Turn the input into placeholder of shape 1
+    tflite_input = tf.placeholder(
+        "float", [1, self.time_steps, self.n_input], name="INPUT_IMAGE_LITE")
+    tf.import_graph_def(graph, name="", input_map={"INPUT_IMAGE": tflite_input})
+    with tf.Session() as sess:
+      curr = sess.graph_def
+      curr = convert_op_hints_to_stubs(graph_def=curr)
+
+    curr = optimize_for_inference_lib.optimize_for_inference(
+        curr, ["INPUT_IMAGE_LITE"], ["OUTPUT_CLASS"],
+        [tf.float32.as_datatype_enum])
+
+    converter = tf.lite.TFLiteConverter(curr, [tflite_input], [outputs])
+    tflite = converter.convert()
+
+    interpreter = tf.lite.Interpreter(model_content=tflite)
+
+    try:
+      interpreter.allocate_tensors()
+    except ValueError:
+      assert False
+
+    input_index = (interpreter.get_input_details()[0]["index"])
+    interpreter.set_tensor(input_index, test_inputs)
+    interpreter.invoke()
+    output_index = (interpreter.get_output_details()[0]["index"])
+    result = interpreter.get_tensor(output_index)
+    # Reset all variables so it will not pollute other inferences.
+    interpreter.reset_all_variables()
+    return result
+
+  def testStaticRnnMultiRnnCell(self):
+    sess = tf.Session(config=CONFIG)
+
+    x, prediction, output_class = self.buildModel(self.buildLstmLayer(),
+                                                  self.buildLstmLayer(), False)
+    self.trainModel(x, prediction, output_class, sess)
+
+    saver = tf.train.Saver()
+    x, prediction, output_class, new_sess = self.saveAndRestoreModel(
+        self.buildLstmLayer(), self.buildLstmLayer(), sess, saver, False)
+
+    test_inputs, expected_output, frozen_graph = self.getInferenceResult(
+        x, output_class, new_sess)
+
+    result = self.tfliteInvoke(frozen_graph, test_inputs, output_class)
+    self.assertTrue(np.allclose(expected_output, result, rtol=1e-6, atol=1e-2))
+
+  @test_util.enable_control_flow_v2
+  def testDynamicRnnMultiRnnCell(self):
+    sess = tf.Session(config=CONFIG)
+
+    x, prediction, output_class = self.buildModel(self.buildLstmLayer(),
+                                                  self.buildLstmLayer(), True)
+    self.trainModel(x, prediction, output_class, sess)
+
+    saver = tf.train.Saver()
+    x, prediction, output_class, new_sess = self.saveAndRestoreModel(
+        self.buildLstmLayer(),
+        self.buildLstmLayer(),
+        sess,
+        saver,
+        is_dynamic_rnn=True)
+
+    test_inputs, expected_output, frozen_graph = self.getInferenceResult(
+        x, output_class, new_sess)
+
+    result = self.tfliteInvoke(frozen_graph, test_inputs, output_class)
+    self.assertTrue(np.allclose(expected_output, result, rtol=1e-6, atol=1e-2))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/lite/experimental/examples/lstm/bidirectional_sequence_rnn_test.py b/tensorflow/lite/experimental/examples/lstm/bidirectional_sequence_rnn_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a05af41be2e77904992b93ecb7395ad21918714
--- /dev/null
+++ b/tensorflow/lite/experimental/examples/lstm/bidirectional_sequence_rnn_test.py
@@ -0,0 +1,225 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import tempfile
+import numpy as np
+import tensorflow as tf
+
+from tensorflow import flags
+
+from tensorflow.examples.tutorials.mnist import input_data
+from tensorflow.lite.experimental.examples.lstm.rnn import bidirectional_dynamic_rnn
+from tensorflow.lite.python.op_hint import convert_op_hints_to_stubs
+from tensorflow.lite.python.op_hint import find_all_hinted_output_nodes
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+from tensorflow.python.tools import optimize_for_inference_lib
+
+FLAGS = flags.FLAGS
+
+# Number of steps to train model.
+TRAIN_STEPS = 1
+
+CONFIG = tf.ConfigProto(device_count={"GPU": 0})
+
+
+class BidirectionalSequenceRnnTest(test_util.TensorFlowTestCase):
+
+  def __init__(self, *args, **kwargs):
+    super(BidirectionalSequenceRnnTest, self).__init__(*args, **kwargs)
+    # Define constants
+    # Unrolled through 28 time steps
+    self.time_steps = 28
+    # Rows of 28 pixels
+    self.n_input = 28
+    # Learning rate for Adam optimizer
+    self.learning_rate = 0.001
+    # MNIST is meant to be classified in 10 classes(0-9).
+    self.n_classes = 10
+    # Batch size
+    self.batch_size = 16
+    # Rnn Units.
+    self.num_units = 16
+
+  def setUp(self):
+    super(BidirectionalSequenceRnnTest, self).setUp()
+    # Import MNIST dataset
+    data_dir = tempfile.mkdtemp(dir=FLAGS.test_tmpdir)
+    self.mnist = input_data.read_data_sets(data_dir, one_hot=True)
+
+  def buildRnnLayer(self):
+    return tf.nn.rnn_cell.MultiRNNCell([
+        tf.lite.experimental.nn.TfLiteRNNCell(self.num_units, name="rnn1"),
+        tf.lite.experimental.nn.TfLiteRNNCell(self.num_units, name="rnn2")
+    ])
+
+  def buildModel(self, fw_rnn_layer, bw_rnn_layer, is_dynamic_rnn):
+    # Weights and biases for output softmax layer.
+    out_weights = tf.Variable(
+        tf.random_normal([self.num_units * 2, self.n_classes]))
+    out_bias = tf.Variable(tf.random_normal([self.n_classes]))
+
+    # input image placeholder
+    x = tf.placeholder(
+        "float", [None, self.time_steps, self.n_input], name="INPUT_IMAGE")
+
+    if is_dynamic_rnn:
+      rnn_inputs = tf.transpose(x, [1, 0, 2])
+      outputs, _ = bidirectional_dynamic_rnn(
+          fw_rnn_layer,
+          bw_rnn_layer,
+          rnn_inputs,
+          dtype="float32",
+          time_major=True)
+      fw_outputs, bw_outputs = outputs
+      output = tf.concat([fw_outputs, bw_outputs], 2)
+      output = tf.unstack(output, axis=0)
+      output = output[-1]
+    else:
+      rnn_inputs = tf.unstack(x, self.time_steps, 1)
+      outputs, _, _ = tf.nn.static_bidirectional_rnn(
+          fw_rnn_layer, bw_rnn_layer, rnn_inputs, dtype="float32")
+      output = outputs[-1]
+
+    # Compute logits by multiplying output of shape [batch_size,num_units*2]
+    # by the softmax layer's out_weight of shape [num_units*2,n_classes]
+    # plus out_bias
+    prediction = tf.matmul(output, out_weights) + out_bias
+    output_class = tf.nn.softmax(prediction, name="OUTPUT_CLASS")
+
+    return x, prediction, output_class
+
+  def trainModel(self, x, prediction, output_class, sess):
+    # input label placeholder
+    y = tf.placeholder("float", [None, self.n_classes])
+    # Loss function
+    loss = tf.reduce_mean(
+        tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y))
+    # Optimization
+    opt = tf.train.AdamOptimizer(
+        learning_rate=self.learning_rate).minimize(loss)
+
+    # Initialize variables
+    init = tf.global_variables_initializer()
+    sess.run(init)
+    for _ in range(TRAIN_STEPS):
+      batch_x, batch_y = self.mnist.train.next_batch(
+          batch_size=self.batch_size, shuffle=False)
+
+      batch_x = batch_x.reshape((self.batch_size, self.time_steps,
+                                 self.n_input))
+      sess.run(opt, feed_dict={x: batch_x, y: batch_y})
+
+  def saveAndRestoreModel(self, fw_rnn_layer, bw_rnn_layer, sess, saver,
+                          is_dynamic_rnn):
+    model_dir = tempfile.mkdtemp(dir=FLAGS.test_tmpdir)
+    saver.save(sess, model_dir)
+
+    # Reset the graph.
+    tf.reset_default_graph()
+    x, prediction, output_class = self.buildModel(fw_rnn_layer, bw_rnn_layer,
+                                                  is_dynamic_rnn)
+
+    new_sess = tf.Session(config=CONFIG)
+    saver = tf.train.Saver()
+    saver.restore(new_sess, model_dir)
+    return x, prediction, output_class, new_sess
+
+  def getInferenceResult(self, x, output_class, sess):
+    b1, _ = self.mnist.train.next_batch(batch_size=1)
+    sample_input = np.reshape(b1, (1, self.time_steps, self.n_input))
+
+    expected_output = sess.run(output_class, feed_dict={x: sample_input})
+    # It is important to keep all the ophint output nodes.
+    hinted_outputs_nodes = find_all_hinted_output_nodes(sess)
+    hinted_outputs_nodes.append(output_class.op.name)
+    frozen_graph = tf.graph_util.convert_variables_to_constants(
+        sess, sess.graph_def, hinted_outputs_nodes)
+    return sample_input, expected_output, frozen_graph
+
+  def tfliteInvoke(self, graph, test_inputs, outputs):
+    tf.reset_default_graph()
+    # Turn the input into placeholder of shape 1
+    tflite_input = tf.placeholder(
+        "float", [1, self.time_steps, self.n_input], name="INPUT_IMAGE_LITE")
+    tf.import_graph_def(graph, name="", input_map={"INPUT_IMAGE": tflite_input})
+    with tf.Session() as sess:
+      curr = sess.graph_def
+      curr = convert_op_hints_to_stubs(graph_def=curr)
+
+    curr = optimize_for_inference_lib.optimize_for_inference(
+        curr, ["INPUT_IMAGE_LITE"], ["OUTPUT_CLASS"],
+        [tf.float32.as_datatype_enum])
+
+    converter = tf.lite.TFLiteConverter(curr, [tflite_input], [outputs])
+    tflite = converter.convert()
+
+    interpreter = tf.lite.Interpreter(model_content=tflite)
+
+    interpreter.allocate_tensors()
+
+    input_index = interpreter.get_input_details()[0]["index"]
+    interpreter.set_tensor(input_index, test_inputs)
+    interpreter.invoke()
+    output_index = interpreter.get_output_details()[0]["index"]
+    result = interpreter.get_tensor(output_index)
+    # Reset all variables so it will not pollute other inferences.
+    interpreter.reset_all_variables()
+    return result
+
+  def testStaticRnnMultiRnnCell(self):
+    sess = tf.Session(config=CONFIG)
+
+    x, prediction, output_class = self.buildModel(self.buildRnnLayer(),
+                                                  self.buildRnnLayer(), False)
+    self.trainModel(x, prediction, output_class, sess)
+
+    saver = tf.train.Saver()
+    x, prediction, output_class, new_sess = self.saveAndRestoreModel(
+        self.buildRnnLayer(), self.buildRnnLayer(), sess, saver, False)
+
+    test_inputs, expected_output, frozen_graph = self.getInferenceResult(
+        x, output_class, new_sess)
+
+    result = self.tfliteInvoke(frozen_graph, test_inputs, output_class)
+    self.assertTrue(np.allclose(expected_output, result, rtol=1e-6, atol=1e-2))
+
+  @test_util.enable_control_flow_v2
+  def testDynamicRnnMultiRnnCell(self):
+    sess = tf.Session(config=CONFIG)
+
+    x, prediction, output_class = self.buildModel(self.buildRnnLayer(),
+                                                  self.buildRnnLayer(), True)
+    self.trainModel(x, prediction, output_class, sess)
+
+    saver = tf.train.Saver()
+    x, prediction, output_class, new_sess = self.saveAndRestoreModel(
+        self.buildRnnLayer(),
+        self.buildRnnLayer(),
+        sess,
+        saver,
+        is_dynamic_rnn=True)
+
+    test_inputs, expected_output, frozen_graph = self.getInferenceResult(
+        x, output_class, new_sess)
+
+    result = self.tfliteInvoke(frozen_graph, test_inputs, output_class)
+    self.assertTrue(np.allclose(expected_output, result, rtol=1e-6, atol=1e-2))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/lite/experimental/examples/lstm/rnn.py b/tensorflow/lite/experimental/examples/lstm/rnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..b92a350fa3679cd70e90206ca495ec4d246ef834
--- /dev/null
+++ b/tensorflow/lite/experimental/examples/lstm/rnn.py
@@ -0,0 +1,428 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TfLite LSTMCell wrapper.
+
+TODO(renjieliu): Find a better home for this one.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow.lite.python.op_hint as op_hint
+from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import control_flow_util
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import rnn_cell_impl
+from tensorflow.python.ops import variable_scope as vs
+from tensorflow.python.ops.rnn import _best_effort_input_batch_size
+from tensorflow.python.ops.rnn import _dynamic_rnn_loop
+from tensorflow.python.ops.rnn import _should_cache
+from tensorflow.python.ops.rnn import _transpose_batch_time
+from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export("lite.experimental.nn.dynamic_rnn")
+def dynamic_rnn(cell,
+                inputs,
+                sequence_length=None,
+                initial_state=None,
+                dtype=None,
+                parallel_iterations=None,
+                swap_memory=False,
+                time_major=True,
+                scope=None):
+  """Creates a recurrent neural network specified by RNNCell `cell`.
+
+  Performs fully dynamic unrolling of `inputs`.
+
+  Example:
+
+  ```python
+  # create a BasicRNNCell
+  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
+
+  # 'outputs' is a tensor of shape [batch_size, max_time, cell_state_size]
+
+  # defining initial state
+  initial_state = rnn_cell.zero_state(batch_size, dtype=tf.float32)
+
+  # 'state' is a tensor of shape [batch_size, cell_state_size]
+  outputs, state = tf.nn.dynamic_rnn(rnn_cell, input_data,
+                                     initial_state=initial_state,
+                                     dtype=tf.float32)
+  ```
+
+  ```python
+  # create 2 LSTMCells
+  rnn_layers = [tf.nn.rnn_cell.LSTMCell(size) for size in [128, 256]]
+
+  # create a RNN cell composed sequentially of a number of RNNCells
+  multi_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(rnn_layers)
+
+  # 'outputs' is a tensor of shape [batch_size, max_time, 256]
+  # 'state' is a N-tuple where N is the number of LSTMCells containing a
+  # tf.contrib.rnn.LSTMStateTuple for each cell
+  outputs, state = tf.nn.dynamic_rnn(cell=multi_rnn_cell,
+                                     inputs=data,
+                                     dtype=tf.float32)
+  ```
+
+
+  Args:
+    cell: An instance of RNNCell.
+    inputs: The RNN inputs.
+      If `time_major == False` (default), this must be a `Tensor` of shape:
+        `[batch_size, max_time, ...]`, or a nested tuple of such elements.
+      If `time_major == True`, this must be a `Tensor` of shape: `[max_time,
+        batch_size, ...]`, or a nested tuple of such elements. This may also be
+        a (possibly nested) tuple of Tensors satisfying this property.  The
+        first two dimensions must match across all the inputs, but otherwise the
+        ranks and other shape components may differ. In this case, input to
+        `cell` at each time-step will replicate the structure of these tuples,
+        except for the time dimension (from which the time is taken). The input
+        to `cell` at each time step will be a `Tensor` or (possibly nested)
+        tuple of Tensors each with dimensions `[batch_size, ...]`.
+    sequence_length: (optional) An int32/int64 vector sized `[batch_size]`. Used
+      to copy-through state and zero-out outputs when past a batch element's
+      sequence length.  So it's more for performance than correctness.
+    initial_state: (optional) An initial state for the RNN. If `cell.state_size`
+      is an integer, this must be a `Tensor` of appropriate type and shape
+      `[batch_size, cell.state_size]`. If `cell.state_size` is a tuple, this
+      should be a tuple of tensors having shapes `[batch_size, s] for s in
+      cell.state_size`.
+    dtype: (optional) The data type for the initial state and expected output.
+      Required if initial_state is not provided or RNN state has a heterogeneous
+      dtype.
+    parallel_iterations: (Default: 32).  The number of iterations to run in
+      parallel.  Those operations which do not have any temporal dependency and
+      can be run in parallel, will be.  This parameter trades off time for
+      space.  Values >> 1 use more memory but take less time, while smaller
+      values use less memory but computations take longer.
+    swap_memory: Transparently swap the tensors produced in forward inference
+      but needed for back prop from GPU to CPU.  This allows training RNNs which
+      would typically not fit on a single GPU, with very minimal (or no)
+      performance penalty.
+    time_major: The shape format of the `inputs` and `outputs` Tensors. If true,
+      these `Tensors` must be shaped `[max_time, batch_size, depth]`. If false,
+      these `Tensors` must be shaped `[batch_size, max_time, depth]`. Using
+      `time_major = True` is a bit more efficient because it avoids transposes
+      at the beginning and end of the RNN calculation.  However, most TensorFlow
+      data is batch-major, so by default this function accepts input and emits
+      output in batch-major form.
+    scope: VariableScope for the created subgraph; defaults to "rnn".
+
+  Returns:
+    A pair (outputs, state) where:
+
+    outputs: The RNN output `Tensor`.
+
+      If time_major == False (default), this will be a `Tensor` shaped:
+        `[batch_size, max_time, cell.output_size]`.
+
+      If time_major == True, this will be a `Tensor` shaped:
+        `[max_time, batch_size, cell.output_size]`.
+
+      Note, if `cell.output_size` is a (possibly nested) tuple of integers
+      or `TensorShape` objects, then `outputs` will be a tuple having the
+      same structure as `cell.output_size`, containing Tensors having shapes
+      corresponding to the shape data in `cell.output_size`.
+
+    state: The final state.  If `cell.state_size` is an int, this
+      will be shaped `[batch_size, cell.state_size]`.  If it is a
+      `TensorShape`, this will be shaped `[batch_size] + cell.state_size`.
+      If it is a (possibly nested) tuple of ints or `TensorShape`, this will
+      be a tuple having the corresponding shapes. If cells are `LSTMCells`
+      `state` will be a tuple containing a `LSTMStateTuple` for each cell.
+
+  Raises:
+    TypeError: If `cell` is not an instance of RNNCell.
+    ValueError: If inputs is None or an empty list.
+    RuntimeError: If not using control flow v2.
+  """
+
+  # Currently only support time_major == True case.
+  assert time_major
+
+  # TODO(b/123051275): We need to check if the cells are TfLiteLSTMCells or
+  # TfLiteRNNCells.
+  rnn_cell_impl.assert_like_rnncell("cell", cell)
+
+  if not control_flow_util.ENABLE_CONTROL_FLOW_V2:
+    raise RuntimeError("OpHint dynamic rnn only supports control flow v2.")
+
+  parent_first_child_input = [{
+      "parent_ophint_input_index": 0,
+      "first_child_ophint_input_index": 0
+  }]
+  parent_last_child_output = [{
+      "parent_output_index": 0,
+      # For LstmCell, the index is 2.
+      # For RnnCell, the index is 1.
+      # So we use -1 meaning it's the last one.
+      "child_output_index": -1
+  }]
+  internal_children_input_output = [{
+      "child_input_index": 0,
+      # For LstmCell, the index is 2.
+      # For RnnCell, the index is 1.
+      # So we use -1 meaning it's the last one.
+      "child_output_index": -1
+  }]
+  inputs_outputs_mappings = {
+      "parent_first_child_input": parent_first_child_input,
+      "parent_last_child_output": parent_last_child_output,
+      "internal_children_input_output": internal_children_input_output
+  }
+  tflite_wrapper = op_hint.OpHint(
+      "TfLiteDynamicRnn",
+      level=2,
+      children_inputs_mappings=inputs_outputs_mappings)
+  with vs.variable_scope(scope or "rnn") as varscope:
+    # Create a new scope in which the caching device is either
+    # determined by the parent scope, or is set to place the cached
+    # Variable using the same placement as for the rest of the RNN.
+    if _should_cache():
+      if varscope.caching_device is None:
+        varscope.set_caching_device(lambda op: op.device)
+
+    inputs = tflite_wrapper.add_input(inputs, name="input", index_override=0)
+
+    # By default, time_major==False and inputs are batch-major: shaped
+    #   [batch, time, depth]
+    # For internal calculations, we transpose to [time, batch, depth]
+    flat_input = nest.flatten(inputs)
+
+    if not time_major:
+      # (batch, time, depth) => (time, batch, depth)
+      flat_input = [ops.convert_to_tensor(input_) for input_ in flat_input]
+      flat_input = tuple(_transpose_batch_time(input_) for input_ in flat_input)
+
+    parallel_iterations = parallel_iterations or 32
+    if sequence_length is not None:
+      sequence_length = math_ops.to_int32(sequence_length)
+      if sequence_length.get_shape().rank not in (None, 1):
+        raise ValueError(
+            "sequence_length must be a vector of length batch_size, "
+            "but saw shape: %s" % sequence_length.get_shape())
+      sequence_length = array_ops.identity(  # Just to find it in the graph.
+          sequence_length,
+          name="sequence_length")
+
+    batch_size = _best_effort_input_batch_size(flat_input)
+
+    if initial_state is not None:
+      state = initial_state
+    else:
+      if not dtype:
+        raise ValueError("If there is no initial_state, you must give a dtype.")
+      if getattr(cell, "get_initial_state", None) is not None:
+        state = cell.get_initial_state(
+            inputs=None, batch_size=batch_size, dtype=dtype)
+      else:
+        state = cell.zero_state(batch_size, dtype)
+
+    def _assert_has_shape(x, shape):
+      x_shape = array_ops.shape(x)
+      packed_shape = array_ops.stack(shape)
+      return control_flow_ops.Assert(
+          math_ops.reduce_all(math_ops.equal(x_shape, packed_shape)), [
+              "Expected shape for Tensor %s is " % x.name, packed_shape,
+              " but saw shape: ", x_shape
+          ])
+
+    if not context.executing_eagerly() and sequence_length is not None:
+      # Perform some shape validation
+      with ops.control_dependencies(
+          [_assert_has_shape(sequence_length, [batch_size])]):
+        sequence_length = array_ops.identity(
+            sequence_length, name="CheckSeqLen")
+
+    inputs = nest.pack_sequence_as(structure=inputs, flat_sequence=flat_input)
+
+    outputs, final_state = _dynamic_rnn_loop(
+        cell,
+        inputs,
+        state,
+        parallel_iterations=parallel_iterations,
+        swap_memory=swap_memory,
+        sequence_length=sequence_length,
+        dtype=dtype)
+
+    # Outputs of _dynamic_rnn_loop are always shaped [time, batch, depth].
+    # If we are performing batch-major calculations, transpose output back
+    # to shape [batch, time, depth]
+    if not time_major:
+      # (time, batch, depth) => (batch, time, depth)
+      outputs = nest.map_structure(_transpose_batch_time, outputs)
+    outputs = tflite_wrapper.add_output(outputs, name="outputs")
+
+    return outputs, final_state
+
+
+def bidirectional_dynamic_rnn(cell_fw,
+                              cell_bw,
+                              inputs,
+                              sequence_length=None,
+                              initial_state_fw=None,
+                              initial_state_bw=None,
+                              dtype=None,
+                              parallel_iterations=None,
+                              swap_memory=False,
+                              time_major=False,
+                              scope=None):
+  """Creates a dynamic version of bidirectional recurrent neural network.
+
+  Takes input and builds independent forward and backward RNNs. The input_size
+  of forward and backward cell must match. The initial state for both directions
+  is zero by default (but can be set optionally) and no intermediate states are
+  ever returned -- the network is fully unrolled for the given (passed in)
+  length(s) of the sequence(s) or completely unrolled if length(s) is not
+  given.
+
+  Args:
+    cell_fw: An instance of RNNCell, to be used for forward direction.
+    cell_bw: An instance of RNNCell, to be used for backward direction.
+    inputs: The RNN inputs.
+      If time_major == False (default), this must be a tensor of shape:
+        `[batch_size, max_time, ...]`, or a nested tuple of such elements.
+      If time_major == True, this must be a tensor of shape: `[max_time,
+        batch_size, ...]`, or a nested tuple of such elements.
+    sequence_length: (optional) An int32/int64 vector, size `[batch_size]`,
+      containing the actual lengths for each of the sequences in the batch. If
+      not provided, all batch entries are assumed to be full sequences; and time
+      reversal is applied from time `0` to `max_time` for each sequence.
+    initial_state_fw: (optional) An initial state for the forward RNN. This must
+      be a tensor of appropriate type and shape `[batch_size,
+      cell_fw.state_size]`. If `cell_fw.state_size` is a tuple, this should be a
+      tuple of tensors having shapes `[batch_size, s] for s in
+      cell_fw.state_size`.
+    initial_state_bw: (optional) Same as for `initial_state_fw`, but using the
+      corresponding properties of `cell_bw`.
+    dtype: (optional) The data type for the initial states and expected output.
+      Required if initial_states are not provided or RNN states have a
+      heterogeneous dtype.
+    parallel_iterations: (Default: 32).  The number of iterations to run in
+      parallel.  Those operations which do not have any temporal dependency and
+      can be run in parallel, will be.  This parameter trades off time for
+      space.  Values >> 1 use more memory but take less time, while smaller
+      values use less memory but computations take longer.
+    swap_memory: Transparently swap the tensors produced in forward inference
+      but needed for back prop from GPU to CPU.  This allows training RNNs which
+      would typically not fit on a single GPU, with very minimal (or no)
+      performance penalty.
+    time_major: The shape format of the `inputs` and `outputs` Tensors. If true,
+      these `Tensors` must be shaped `[max_time, batch_size, depth]`. If false,
+      these `Tensors` must be shaped `[batch_size, max_time, depth]`. Using
+      `time_major = True` is a bit more efficient because it avoids transposes
+      at the beginning and end of the RNN calculation.  However, most TensorFlow
+      data is batch-major, so by default this function accepts input and emits
+      output in batch-major form.
+    scope: VariableScope for the created subgraph; defaults to
+      "bidirectional_rnn"
+
+  Returns:
+    A tuple (outputs, output_states) where:
+      outputs: A tuple (output_fw, output_bw) containing the forward and
+        the backward rnn output `Tensor`.
+        If time_major == False (default),
+          output_fw will be a `Tensor` shaped:
+          `[batch_size, max_time, cell_fw.output_size]`
+          and output_bw will be a `Tensor` shaped:
+          `[batch_size, max_time, cell_bw.output_size]`.
+        If time_major == True,
+          output_fw will be a `Tensor` shaped:
+          `[max_time, batch_size, cell_fw.output_size]`
+          and output_bw will be a `Tensor` shaped:
+          `[max_time, batch_size, cell_bw.output_size]`.
+        It returns a tuple instead of a single concatenated `Tensor`, unlike
+        in the `bidirectional_rnn`. If the concatenated one is preferred,
+        the forward and backward outputs can be concatenated as
+        `tf.concat(outputs, 2)`.
+      output_states: A tuple (output_state_fw, output_state_bw) containing
+        the forward and the backward final states of bidirectional rnn.
+
+  Raises:
+    TypeError: If `cell_fw` or `cell_bw` is not an instance of `RNNCell`.
+  """
+  rnn_cell_impl.assert_like_rnncell("cell_fw", cell_fw)
+  rnn_cell_impl.assert_like_rnncell("cell_bw", cell_bw)
+
+  with vs.variable_scope(scope or "bidirectional_rnn"):
+    # Forward direction
+    with vs.variable_scope("fw") as fw_scope:
+      output_fw, output_state_fw = dynamic_rnn(
+          cell=cell_fw,
+          inputs=inputs,
+          sequence_length=sequence_length,
+          initial_state=initial_state_fw,
+          dtype=dtype,
+          parallel_iterations=parallel_iterations,
+          swap_memory=swap_memory,
+          time_major=time_major,
+          scope=fw_scope)
+
+    # Backward direction
+    if not time_major:
+      time_axis = 1
+      batch_axis = 0
+    else:
+      time_axis = 0
+      batch_axis = 1
+
+    def _reverse(input_, seq_lengths, seq_axis, batch_axis):
+      if seq_lengths is not None:
+        return array_ops.reverse_sequence(
+            input=input_,
+            seq_lengths=seq_lengths,
+            seq_axis=seq_axis,
+            batch_axis=batch_axis)
+      else:
+        return array_ops.reverse(input_, axis=[seq_axis])
+
+    with vs.variable_scope("bw") as bw_scope:
+
+      def _map_reverse(inp):
+        return _reverse(
+            inp,
+            seq_lengths=sequence_length,
+            seq_axis=time_axis,
+            batch_axis=batch_axis)
+
+      inputs_reverse = nest.map_structure(_map_reverse, inputs)
+      tmp, output_state_bw = dynamic_rnn(
+          cell=cell_bw,
+          inputs=inputs_reverse,
+          sequence_length=sequence_length,
+          initial_state=initial_state_bw,
+          dtype=dtype,
+          parallel_iterations=parallel_iterations,
+          swap_memory=swap_memory,
+          time_major=time_major,
+          scope=bw_scope)
+
+  output_bw = _reverse(
+      tmp,
+      seq_lengths=sequence_length,
+      seq_axis=time_axis,
+      batch_axis=batch_axis)
+
+  outputs = (output_fw, output_bw)
+  output_states = (output_state_fw, output_state_bw)
+
+  return (outputs, output_states)
diff --git a/tensorflow/lite/experimental/examples/lstm/tflite_lstm.py b/tensorflow/lite/experimental/examples/lstm/rnn_cell.py
similarity index 74%
rename from tensorflow/lite/experimental/examples/lstm/tflite_lstm.py
rename to tensorflow/lite/experimental/examples/lstm/rnn_cell.py
index 2fe8ebf9e99f8b0e592e83c2e473dd2f8395c6c0..eeec76444db248883d7d69fcc951d55d0c4aed33 100644
--- a/tensorflow/lite/experimental/examples/lstm/tflite_lstm.py
+++ b/tensorflow/lite/experimental/examples/lstm/rnn_cell.py
@@ -12,16 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TfLite LSTMCell wrapper.
+"""TfLite BasicRnnCell wrapper.
 
 TODO(renjieliu): Find a better home for this one.
 """
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-import tensorflow as tf
+import itertools
 
-from tensorflow.lite.python import lite
+import tensorflow.lite.python.op_hint as op_hint
 from tensorflow.python.keras import activations
 from tensorflow.python.keras import initializers
 from tensorflow.python.layers import base as base_layer
@@ -33,8 +33,130 @@ from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("lite.experimental.nn.TfLiteRNNCell")
+class TfLiteRNNCell(rnn_cell_impl.LayerRNNCell):
+  """The most basic RNN cell.
+
+  This is used only for TfLite, it provides hints and it also makes the
+  variables in the desired for the tflite ops.
+  """
+
+  def __init__(self,
+               num_units,
+               activation=None,
+               reuse=None,
+               name=None,
+               dtype=None,
+               **kwargs):
+    """Initializes the parameters for an RNN cell.
+
+    Args:
+      num_units: int, The number of units in the RNN cell.
+      activation: Nonlinearity to use.  Default: `tanh`. It could also be string
+        that is within Keras activation function names.
+      reuse: (optional) Python boolean describing whether to reuse variables in
+        an existing scope. Raises an error if not `True` and the existing scope
+        already has the given variables.
+      name: String, the name of the layer. Layers with the same name will share
+        weights, but to avoid mistakes we require reuse=True in such cases.
+      dtype: Default dtype of the layer (default of `None` means use the type of
+        the first input). Required when `build` is called before `call`.
+      **kwargs: Dict, keyword named properties for common layer attributes, like
+        `trainable` etc when constructing the cell from configs of get_config().
+
+    Raises:
+      ValueError: If the existing scope already has the given variables.
+    """
+    super(TfLiteRNNCell, self).__init__(
+        _reuse=reuse, name=name, dtype=dtype, **kwargs)
+
+    # Inputs must be Rank-2.
+    self.input_spec = base_layer.InputSpec(ndim=2)
+
+    self._tflite_wrapper = op_hint.OpHint("UnidirectionalSequenceRnn")
+    self._num_units = num_units
+    if activation:
+      self._activation = activations.get(activation)
+    else:
+      self._activation = math_ops.tanh
+
+  @property
+  def state_size(self):
+    return self._num_units
+
+  @property
+  def output_size(self):
+    return self._num_units
+
+  def build(self, inputs_shape):
+    """Builds the RNN cell.
+
+    Args:
+      inputs_shape: Rnn input tensor shape.
+
+    Raises:
+      ValueError: If last dimension of the input shape is not known.
+    """
+    if inputs_shape[-1] is None:
+      raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s" %
+                       (inputs_shape,))
+
+    input_depth = inputs_shape[-1]
+
+    def add_variable_wrapped(name, shape, initializer, index):
+      var = self.add_variable(name, shape=shape, initializer=initializer)
+      return self._tflite_wrapper.add_input(
+          var, name=name, index_override=index)
+
+    self._input_weights = add_variable_wrapped(
+        "input_weights", [self._num_units, input_depth], None, 1)
+    self._recurrent_weights = add_variable_wrapped(
+        "recurrent_weights", [self._num_units, self._num_units], None, 2)
+    self._bias = add_variable_wrapped(
+        "bias",
+        shape=[self._num_units],
+        initializer=init_ops.zeros_initializer(dtype=self.dtype),
+        index=3)
+
+    self.built = True
+
+  def call(self, inputs, state):
+    """Most basic RNN: output = new_state = act(W * input + U * state + B)."""
+    inputs = self._tflite_wrapper.add_input(
+        inputs, tag="input", name="input", aggregate="stack", index_override=0)
+    state = self._tflite_wrapper.add_input(
+        state,
+        tag="hidden_state",
+        name="hidden_state",
+        aggregate="first",
+        index_override=4)
+    weights = array_ops.transpose(
+        array_ops.concat([self._input_weights, self._recurrent_weights], 1))
+    gate_inputs = math_ops.matmul(array_ops.concat([inputs, state], 1), weights)
+    gate_inputs = nn_ops.bias_add(gate_inputs, self._bias)
+    output = self._activation(gate_inputs)
+    output = self._tflite_wrapper.add_output(
+        output,
+        tag="output",
+        name="output",
+        index_override=1,
+        aggregate="stack")
+    return output, output
+
+  def get_config(self):
+    config = {
+        "num_units": self._num_units,
+        "activation": activations.serialize(self._activation),
+        "reuse": self._reuse,
+    }
+    base_config = super(TfLiteRNNCell, self).get_config()
+    return dict(itertools.chain(base_config.items(), config.items()))
+
+
+@tf_export("lite.experimental.nn.TFLiteLSTMCell")
 class TFLiteLSTMCell(rnn_cell_impl.LayerRNNCell):
   """Long short-term memory unit (LSTM) recurrent network cell.
 
@@ -132,7 +254,7 @@ class TFLiteLSTMCell(rnn_cell_impl.LayerRNNCell):
     # TODO(raziel): layers stuff -- chop if un-layerizing Op.
     self.input_spec = base_layer.InputSpec(ndim=2)
 
-    self._tflite_wrapper = lite.OpHint("UnidirectionalSequenceLstm")
+    self._tflite_wrapper = op_hint.OpHint("UnidirectionalSequenceLstm")
 
     self._num_units = num_units
     self._use_peepholes = use_peepholes
@@ -148,7 +270,7 @@ class TFLiteLSTMCell(rnn_cell_impl.LayerRNNCell):
 
     self._output_size = num_proj if num_proj else num_units
     self._state_size = (
-        tf.nn.rnn_cell.LSTMStateTuple(num_units, self._output_size)
+        rnn_cell_impl.LSTMStateTuple(num_units, self._output_size)
         if state_is_tuple else num_units + self._output_size)
 
   @property
@@ -184,7 +306,7 @@ class TFLiteLSTMCell(rnn_cell_impl.LayerRNNCell):
       var = self.add_variable(
           name, shape=shape, initializer=initializer, partitioner=partitioner)
       return self._tflite_wrapper.add_input(
-          var, name="name", index_override=index)
+          var, name=name, index_override=index)
 
     weight_initializer = self._initializer
     if self.dtype is None:
@@ -230,10 +352,10 @@ class TFLiteLSTMCell(rnn_cell_impl.LayerRNNCell):
     # f stands for forget, i stands for input and o stands for output.
     if self._use_peepholes:
       self._w_f_diag = add_variable_wrapped("w_f_diag", [self._num_units],
-                                            self._initializer, 9,
+                                            self._initializer, 10,
                                             maybe_partitioner)
       self._w_i_diag = add_variable_wrapped("w_i_diag", [self._num_units],
-                                            self._initializer, 10,
+                                            self._initializer, 9,
                                             maybe_partitioner)
       self._w_o_diag = add_variable_wrapped("w_o_diag", [self._num_units],
                                             self._initializer, 11,
@@ -319,24 +441,28 @@ class TFLiteLSTMCell(rnn_cell_impl.LayerRNNCell):
     # c is the final state.
     # m is the output.
     i = nn_ops.bias_add(
-        tf.matmul(
+        math_ops.matmul(
             inputs_and_m_prev,
-            tf.concat([self.input_to_input_w, self.cell_to_input_w], axis=1),
+            array_ops.concat([self.input_to_input_w, self.cell_to_input_w],
+                             axis=1),
             transpose_b=True), self.input_bias)
     f = nn_ops.bias_add(
-        tf.matmul(
+        math_ops.matmul(
             inputs_and_m_prev,
-            tf.concat([self.input_to_forget_w, self.cell_to_forget_w], axis=1),
+            array_ops.concat([self.input_to_forget_w, self.cell_to_forget_w],
+                             axis=1),
             transpose_b=True), self.forget_bias)
     o = nn_ops.bias_add(
-        tf.matmul(
+        math_ops.matmul(
             inputs_and_m_prev,
-            tf.concat([self.input_to_output_w, self.cell_to_output_w], axis=1),
+            array_ops.concat([self.input_to_output_w, self.cell_to_output_w],
+                             axis=1),
             transpose_b=True), self.output_bias)
     j = nn_ops.bias_add(
-        tf.matmul(
+        math_ops.matmul(
             inputs_and_m_prev,
-            tf.concat([self.input_to_cell_w, self.cell_to_cell_w], axis=1),
+            array_ops.concat([self.input_to_cell_w, self.cell_to_cell_w],
+                             axis=1),
             transpose_b=True), self.cell_bias)
 
     # Diagonal connections
@@ -359,7 +485,7 @@ class TFLiteLSTMCell(rnn_cell_impl.LayerRNNCell):
       m = sigmoid(o) * self._activation(c)
 
     if self._num_proj is not None:
-      transposed_proj_kernel = tf.transpose(self._proj_kernel)
+      transposed_proj_kernel = array_ops.transpose(self._proj_kernel)
       m = math_ops.matmul(m, transposed_proj_kernel)
 
       if self._proj_clip is not None:
@@ -373,7 +499,7 @@ class TFLiteLSTMCell(rnn_cell_impl.LayerRNNCell):
         m, tag="m", name="m", index_override=2, aggregate="stack")
 
     new_state = (
-        tf.nn.rnn_cell.LSTMStateTuple(c, m)
+        rnn_cell_impl.LSTMStateTuple(c, m)
         if self._state_is_tuple else array_ops.concat([c, m], 1))
     return m, new_state
 
diff --git a/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_lstm_test.py b/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_lstm_test.py
index eeb48d123113c5924a74286ad1e0851eb484cdb8..5915978db8539bc2eed69ffd29a700ca9d90669d 100644
--- a/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_lstm_test.py
+++ b/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_lstm_test.py
@@ -20,12 +20,12 @@ import numpy as np
 import tensorflow as tf
 
 from tensorflow.examples.tutorials.mnist import input_data
-from tensorflow.lite.experimental.examples.lstm.tflite_lstm import TFLiteLSTMCell
 from tensorflow.lite.python.op_hint import convert_op_hints_to_stubs
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 from tensorflow.python.tools import optimize_for_inference_lib
 
+
 # Number of steps to train model.
 TRAIN_STEPS = 1
 
@@ -55,19 +55,21 @@ class UnidirectionalSequenceLstmTest(test_util.TensorFlowTestCase):
 
   def buildLstmLayer(self):
     return tf.nn.rnn_cell.MultiRNNCell([
-        TFLiteLSTMCell(
+        tf.lite.experimental.nn.TFLiteLSTMCell(
             self.num_units, use_peepholes=True, forget_bias=0, name="rnn1"),
-        TFLiteLSTMCell(self.num_units, num_proj=8, forget_bias=0, name="rnn2"),
-        TFLiteLSTMCell(
+        tf.lite.experimental.nn.TFLiteLSTMCell(
+            self.num_units, num_proj=8, forget_bias=0, name="rnn2"),
+        tf.lite.experimental.nn.TFLiteLSTMCell(
             self.num_units // 2,
             use_peepholes=True,
             num_proj=8,
             forget_bias=0,
             name="rnn3"),
-        TFLiteLSTMCell(self.num_units, forget_bias=0, name="rnn4")
+        tf.lite.experimental.nn.TFLiteLSTMCell(
+            self.num_units, forget_bias=0, name="rnn4")
     ])
 
-  def buildModel(self, lstm_layer, is_dynamic_rnn, is_train):
+  def buildModel(self, lstm_layer, is_dynamic_rnn):
     # Weights and biases for output softmax layer.
     out_weights = tf.Variable(
         tf.random_normal([self.num_units, self.n_classes]))
@@ -77,16 +79,12 @@ class UnidirectionalSequenceLstmTest(test_util.TensorFlowTestCase):
     x = tf.placeholder(
         "float", [None, self.time_steps, self.n_input], name="INPUT_IMAGE")
 
-    # For dynamic_rnn, train with dynamic_rnn and inference with static_rnn.
     # x is shaped [batch_size,time_steps,num_inputs]
     if is_dynamic_rnn:
-      if is_train:
-        lstm_input = x
-        outputs, _ = tf.nn.dynamic_rnn(lstm_layer, lstm_input, dtype="float32")
-        outputs = tf.unstack(outputs, axis=1)
-      else:
-        lstm_input = tf.unstack(x, self.time_steps, 1)
-        outputs, _ = tf.nn.static_rnn(lstm_layer, lstm_input, dtype="float32")
+      lstm_input = tf.transpose(x, perm=[1, 0, 2])
+      outputs, _ = tf.lite.experimental.nn.dynamic_rnn(
+          lstm_layer, lstm_input, dtype="float32")
+      outputs = tf.unstack(outputs, axis=0)
     else:
       lstm_input = tf.unstack(x, self.time_steps, 1)
       outputs, _ = tf.nn.static_rnn(lstm_layer, lstm_input, dtype="float32")
@@ -126,8 +124,7 @@ class UnidirectionalSequenceLstmTest(test_util.TensorFlowTestCase):
 
     # Reset the graph.
     tf.reset_default_graph()
-    x, prediction, output_class = self.buildModel(
-        lstm_layer, is_dynamic_rnn, is_train=False)
+    x, prediction, output_class = self.buildModel(lstm_layer, is_dynamic_rnn)
 
     new_sess = tf.Session(config=CONFIG)
     saver = tf.train.Saver()
@@ -157,8 +154,8 @@ class UnidirectionalSequenceLstmTest(test_util.TensorFlowTestCase):
         curr, ["INPUT_IMAGE_LITE"], ["OUTPUT_CLASS"],
         [tf.float32.as_datatype_enum])
 
-    tflite = tf.lite.toco_convert(
-        curr, [tflite_input], [outputs], allow_custom_ops=False)
+    converter = tf.lite.TFLiteConverter(curr, [tflite_input], [outputs])
+    tflite = converter.convert()
     interpreter = tf.lite.Interpreter(model_content=tflite)
 
     try:
@@ -179,7 +176,7 @@ class UnidirectionalSequenceLstmTest(test_util.TensorFlowTestCase):
     sess = tf.Session(config=CONFIG)
 
     x, prediction, output_class = self.buildModel(
-        self.buildLstmLayer(), is_dynamic_rnn=False, is_train=True)
+        self.buildLstmLayer(), is_dynamic_rnn=False)
     self.trainModel(x, prediction, output_class, sess)
 
     saver = tf.train.Saver()
@@ -192,26 +189,15 @@ class UnidirectionalSequenceLstmTest(test_util.TensorFlowTestCase):
     result = self.tfliteInvoke(frozen_graph, test_inputs, output_class)
     self.assertTrue(np.allclose(expected_output, result, rtol=1e-6, atol=1e-2))
 
+  @test_util.enable_control_flow_v2
   def testDynamicRnnMultiRnnCell(self):
     sess = tf.Session(config=CONFIG)
 
     x, prediction, output_class = self.buildModel(
-        self.buildLstmLayer(), is_dynamic_rnn=True, is_train=True)
+        self.buildLstmLayer(), is_dynamic_rnn=True)
     self.trainModel(x, prediction, output_class, sess)
 
-    # Since we don't yet support OpHints for dynamic, we will load the model
-    # back in as a static model. This requires the variables to have the same
-    # names as if they were trained as a static. Thus, we get rid of while/rnn
-    # names.
-    variables_to_save = {}
-    for i in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES):
-      op_name = i.name
-      if op_name.startswith("while/rnn/"):
-        op_name = op_name.split("while/rnn/")[1]
-      if op_name.endswith(":0"):
-        op_name = op_name.split(":0")[0]
-      variables_to_save[op_name] = i
-    saver = tf.train.Saver(variables_to_save)
+    saver = tf.train.Saver()
 
     x, prediction, output_class, new_sess = self.saveAndRestoreModel(
         self.buildLstmLayer(), sess, saver, is_dynamic_rnn=True)
diff --git a/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_rnn_test.py b/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_rnn_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b44c13877e5c2d9edc112c9e6d3f94e5755d5a0
--- /dev/null
+++ b/tensorflow/lite/experimental/examples/lstm/unidirectional_sequence_rnn_test.py
@@ -0,0 +1,221 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import tempfile
+import numpy as np
+import tensorflow as tf
+
+from tensorflow import flags
+
+from tensorflow.examples.tutorials.mnist import input_data
+from tensorflow.lite.python.op_hint import convert_op_hints_to_stubs
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+from tensorflow.python.tools import optimize_for_inference_lib
+
+FLAGS = flags.FLAGS
+
+# Number of steps to train model.
+TRAIN_STEPS = 1
+
+CONFIG = tf.ConfigProto(device_count={"GPU": 0})
+
+
+class UnidirectionalSequenceRnnTest(test_util.TensorFlowTestCase):
+
+  def __init__(self, *args, **kwargs):
+    super(UnidirectionalSequenceRnnTest, self).__init__(*args, **kwargs)
+    # Define constants
+    # Unrolled through 28 time steps
+    self.time_steps = 28
+    # Rows of 28 pixels
+    self.n_input = 28
+    # Learning rate for Adam optimizer
+    self.learning_rate = 0.001
+    # MNIST is meant to be classified in 10 classes(0-9).
+    self.n_classes = 10
+    # Batch size
+    self.batch_size = 16
+    # Rnn Units.
+    self.num_units = 16
+
+  def setUp(self):
+    super(UnidirectionalSequenceRnnTest, self).setUp()
+    # Import MNIST dataset
+    data_dir = tempfile.mkdtemp(dir=FLAGS.test_tmpdir)
+    self.mnist = input_data.read_data_sets(data_dir, one_hot=True)
+
+  def buildRnnLayer(self):
+    return tf.nn.rnn_cell.MultiRNNCell([
+        tf.lite.experimental.nn.TfLiteRNNCell(self.num_units, name="rnn1"),
+        tf.lite.experimental.nn.TfLiteRNNCell(self.num_units, name="rnn2")
+    ])
+
+  def buildModel(self, rnn_layer, is_dynamic_rnn):
+    # Weights and biases for output softmax layer.
+    out_weights = tf.Variable(
+        tf.random_normal([self.num_units, self.n_classes]))
+    out_bias = tf.Variable(tf.random_normal([self.n_classes]))
+
+    # input image placeholder
+    x = tf.placeholder(
+        "float", [None, self.time_steps, self.n_input], name="INPUT_IMAGE")
+
+    # x is shaped [batch_size,time_steps,num_inputs]
+    if is_dynamic_rnn:
+      rnn_input = tf.transpose(x, perm=[1, 0, 2])
+      outputs, _ = tf.lite.experimental.nn.dynamic_rnn(
+          rnn_layer, rnn_input, dtype="float32")
+      outputs = tf.unstack(outputs, axis=0)
+    else:
+      rnn_input = tf.unstack(x, self.time_steps, 1)
+      outputs, _ = tf.nn.static_rnn(rnn_layer, rnn_input, dtype="float32")
+
+    # Compute logits by multiplying outputs[-1] of shape [batch_size,num_units]
+    # by the softmax layer's out_weight of shape [num_units,n_classes]
+    # plus out_bias
+    prediction = tf.matmul(outputs[-1], out_weights) + out_bias
+    output_class = tf.nn.softmax(prediction, name="OUTPUT_CLASS")
+
+    return x, prediction, output_class
+
+  def trainModel(self, x, prediction, output_class, sess):
+    # input label placeholder
+    y = tf.placeholder("float", [None, self.n_classes])
+    # Loss function
+    loss = tf.reduce_mean(
+        tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y))
+    # Optimization
+    opt = tf.train.AdamOptimizer(
+        learning_rate=self.learning_rate).minimize(loss)
+
+    # Initialize variables
+    sess.run(tf.global_variables_initializer())
+    for _ in range(TRAIN_STEPS):
+      batch_x, batch_y = self.mnist.train.next_batch(
+          batch_size=self.batch_size, shuffle=False)
+
+      batch_x = batch_x.reshape((self.batch_size, self.time_steps,
+                                 self.n_input))
+      sess.run(opt, feed_dict={x: batch_x, y: batch_y})
+
+  def saveAndRestoreModel(self, rnn_layer, sess, saver, is_dynamic_rnn):
+    """Saves and restores the model to mimic the most common use case.
+
+    Args:
+      rnn_layer: The rnn layer either a single rnn cell or a multi rnn cell.
+      sess: Old session.
+      saver: saver created by tf.train.Saver()
+      is_dynamic_rnn: use dynamic_rnn or not.
+
+    Returns:
+      A tuple containing:
+
+      - Input tensor of the restored model.
+      - Prediction tensor of the restored model.
+      - Output tensor, which is the softwmax result of the prediction tensor.
+      - new session of the restored model.
+
+    """
+    model_dir = tempfile.mkdtemp(dir=FLAGS.test_tmpdir)
+    saver.save(sess, model_dir)
+
+    # Reset the graph.
+    tf.reset_default_graph()
+    x, prediction, output_class = self.buildModel(rnn_layer, is_dynamic_rnn)
+
+    new_sess = tf.Session(config=CONFIG)
+    saver = tf.train.Saver()
+    saver.restore(new_sess, model_dir)
+    return x, prediction, output_class, new_sess
+
+  def getInferenceResult(self, x, output_class, sess):
+    b1, _ = self.mnist.train.next_batch(batch_size=1)
+    sample_input = np.reshape(b1, (1, self.time_steps, self.n_input))
+
+    expected_output = sess.run(output_class, feed_dict={x: sample_input})
+    frozen_graph = tf.graph_util.convert_variables_to_constants(
+        sess, sess.graph_def, [output_class.op.name])
+    return sample_input, expected_output, frozen_graph
+
+  def tfliteInvoke(self, graph, test_inputs, outputs):
+    tf.reset_default_graph()
+    # Turn the input into placeholder of shape 1
+    tflite_input = tf.placeholder(
+        "float", [1, self.time_steps, self.n_input], name="INPUT_IMAGE_LITE")
+    tf.import_graph_def(graph, name="", input_map={"INPUT_IMAGE": tflite_input})
+    with tf.Session() as sess:
+      curr = sess.graph_def
+      curr = convert_op_hints_to_stubs(graph_def=curr)
+
+    curr = optimize_for_inference_lib.optimize_for_inference(
+        curr, ["INPUT_IMAGE_LITE"], ["OUTPUT_CLASS"],
+        [tf.float32.as_datatype_enum])
+
+    converter = tf.lite.TFLiteConverter(curr, [tflite_input], [outputs])
+    tflite = converter.convert()
+    interpreter = tf.lite.Interpreter(model_content=tflite)
+    interpreter.allocate_tensors()
+
+    input_index = interpreter.get_input_details()[0]["index"]
+    interpreter.set_tensor(input_index, test_inputs)
+    interpreter.invoke()
+    output_index = interpreter.get_output_details()[0]["index"]
+    result = interpreter.get_tensor(output_index)
+    # Reset all variables so it will not pollute other inferences.
+    interpreter.reset_all_variables()
+    return result
+
+  def testStaticRnnMultiRnnCell(self):
+    sess = tf.Session(config=CONFIG)
+
+    x, prediction, output_class = self.buildModel(
+        self.buildRnnLayer(), is_dynamic_rnn=False)
+    self.trainModel(x, prediction, output_class, sess)
+
+    saver = tf.train.Saver()
+    x, prediction, output_class, new_sess = self.saveAndRestoreModel(
+        self.buildRnnLayer(), sess, saver, is_dynamic_rnn=False)
+
+    test_inputs, expected_output, frozen_graph = self.getInferenceResult(
+        x, output_class, new_sess)
+
+    result = self.tfliteInvoke(frozen_graph, test_inputs, output_class)
+    self.assertTrue(np.allclose(expected_output, result, rtol=1e-6, atol=1e-2))
+
+  @test_util.enable_control_flow_v2
+  def testDynamicRnnMultiRnnCell(self):
+    sess = tf.Session(config=CONFIG)
+
+    x, prediction, output_class = self.buildModel(
+        self.buildRnnLayer(), is_dynamic_rnn=True)
+    self.trainModel(x, prediction, output_class, sess)
+
+    saver = tf.train.Saver()
+
+    x, prediction, output_class, new_sess = self.saveAndRestoreModel(
+        self.buildRnnLayer(), sess, saver, is_dynamic_rnn=True)
+
+    test_inputs, expected_output, frozen_graph = self.getInferenceResult(
+        x, output_class, new_sess)
+
+    result = self.tfliteInvoke(frozen_graph, test_inputs, output_class)
+    self.assertTrue(np.allclose(expected_output, result, rtol=1e-6, atol=1e-2))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/lite/experimental/kernels/BUILD b/tensorflow/lite/experimental/kernels/BUILD
index dd314545cb6488ea2a76494df39b4b69e92eca33..5d2337f2e225de71689d1fbe8b2d945c0f88a4a1 100644
--- a/tensorflow/lite/experimental/kernels/BUILD
+++ b/tensorflow/lite/experimental/kernels/BUILD
@@ -50,21 +50,13 @@ cc_library(
     }),
     deps = [
         ":ctc_utils",
-        "//tensorflow/lite:builtin_op_data",
         "//tensorflow/lite:framework",
-        "//tensorflow/lite:string_util",
         "//tensorflow/lite/c:c_api_internal",
-        "//tensorflow/lite/kernels:builtin_ops",
-        "//tensorflow/lite/kernels:gemm_support",
         "//tensorflow/lite/kernels:kernel_util",
         "//tensorflow/lite/kernels:op_macros",
-        "//tensorflow/lite/kernels/internal:kernel_utils",
         "//tensorflow/lite/kernels/internal:optimized",
         "//tensorflow/lite/kernels/internal:optimized_base",
-        "//tensorflow/lite/kernels/internal:quantization_util",
-        "//tensorflow/lite/kernels/internal:reference_base",
         "//tensorflow/lite/kernels/internal:tensor",
-        "//tensorflow/lite/kernels/internal:tensor_utils",
         "@flatbuffers",
     ],
 )
diff --git a/tensorflow/lite/experimental/micro/BUILD b/tensorflow/lite/experimental/micro/BUILD
index e11159868e11a09e1b10d59da274cd08ee472593..b16b8b49f8a46a43475cc08807570e84a160aed4 100644
--- a/tensorflow/lite/experimental/micro/BUILD
+++ b/tensorflow/lite/experimental/micro/BUILD
@@ -12,6 +12,8 @@ load(
 cc_library(
     name = "micro_framework",
     srcs = [
+        "debug_log.cc",
+        "debug_log_numbers.cc",
         "micro_error_reporter.cc",
         "micro_interpreter.cc",
         "micro_mutable_op_resolver.cc",
@@ -19,13 +21,14 @@ cc_library(
     ],
     hdrs = [
         "compatibility.h",
+        "debug_log.h",
+        "debug_log_numbers.h",
         "micro_error_reporter.h",
         "micro_interpreter.h",
         "micro_mutable_op_resolver.h",
         "simple_tensor_allocator.h",
     ],
     deps = [
-        "//tensorflow/lite:schema_fbs_version",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/core/api",
         "//tensorflow/lite/schema:schema_fbs",
diff --git a/tensorflow/lite/experimental/micro/README.md b/tensorflow/lite/experimental/micro/README.md
index 673daed74c41a1880e6f8803258033cce8d333ca..4f7cfae5c5b196c64152b8f37dc3d6a4306d4d7c 100644
--- a/tensorflow/lite/experimental/micro/README.md
+++ b/tensorflow/lite/experimental/micro/README.md
@@ -1,46 +1,97 @@
 # TensorFlow Lite for Microcontrollers
 
-This an experimental port of TensorFlow Lite aimed at micro controllers and other devices with only kilobytes of memory. It doesn't require any operating system support, any standard C or C++ libraries, or dynamic memory allocation, so it's designed to be portable even to 'bare metal' systems. The core runtime fits in 16KB on a Cortex M3, and with enough operators to run a speech keyword detection model, takes up a total of 22KB.
-
-The design goals are for the framework to be:
-
-- **Readable**: We want embedded software engineers to be able to understand what's required to run ML inference without having to study research papers. We've tried to keep the code base small, modular, and have reference implementations of all operations to help with this.
-
-- **Easy to modify**: We know that there are a lot of different platforms and requirements in the embedded world, and we don't expect to cover all of them in one framework. Instead, we're hoping that it can be a good starting point for developers to build on top of to meet their own needs. For example, we tried to make it easy to replace the implementations of key computational operators that are often crucial for performance, without having to touch the data flow and other runtime code. We want it to make more sense to use our workflow to handle things like model import and less-important operations, and customize the parts that matter, rather than having to reimplement everything in your own engine.
-
-- **Well-tested**: If you're modifying code, you need to know if your changes are correct. Having an easy way to test lets you develop much faster. To help there, we've written tests for all the components, and we've made sure that the tests can be run on almost any platform, with no dependencies apart from the ability to log text to a debug console somewhere. We also provide an easy way to run all the tests on-device as part of an automated test framework, and we use qemu/Renode emulation so that tests can be run even without physical devices present.
-
-- **Easy to integrate**: We want to be as open a system as possible, and use the best code available for each platform. To do that, we're going to rely on projects like [CMSIS-NN](https://www.keil.com/pack/doc/CMSIS/NN/html/index.html), [uTensor](https://github.com/uTensor/uTensor), and other vendor libraries to handle as much performance-critical code as possible. We know that there are an increasing number of options to accelerate neural networks on microcontrollers, so we're aiming to be a good host for deploying those hardware technologies too.
-
-- **Compatible**: We're using the same file schema, interpreter API, and kernel interface as regular TensorFlow Lite, so we leverage the large existing set of tools, documentation, and examples for the project. The biggest barrier to deploying ML models is getting them from a training environment into a form that's easy to run inference on, so we see reusing this rich ecosystem as being crucial to being easily usable. We also hope to integrate this experimental work back into the main codebase in the future.
-
-To meet those goals, we've made some tradeoffs:
-
-- **Simple C++**: To help with readability, our code is written in a modern version of C++, but we generally treat it as a "better C", rather relying on more complex features such as template meta-programming. As mentioned earlier, we avoid any use of dynamic memory allocation (new/delete) or the standard C/C++ libraries, so we believe this should still be fairly portable. It does mean that some older devices with C-only toolchains won't be supported, but we're hoping that the reference operator implementations (which are simple C-like functions) can still be useful in those cases. The interfaces are also designed to be C-only, so it should be possible to integrate the resulting library with pure C projects.
-
-- **Interpreted**: Code generation is a popular pattern for embedded code, because it gives standalone code that's easy to modify and step through, but we've chosen to go with an interpreted approach. In our internal microcontroller work we've found that using an extremely stripped-down interpreter with almost no dependencies gives us a lot of the same advantages, but is easier to maintain. For example, when new updates come out for the underlying library, you can just merge your local modifications in a single step, rather than having to regenerate new code and then patch in any changes you subsequently made. The coarse granularity of the interpreted primitives means that each operation call typically takes hundreds of thousands of instruction cycles at least, so we don't see noticeable performance gains from avoiding what's essentially a single switch statement at the interpreter level to call each operation. We're still working on improving the packaging though, for example we're considering having the ability to snapshot all the source files and headers used for a particular model, being able to compile the code and data together as a library, and then access it through a minimal set of C interface calls which hide the underlying complexity.
-
-- **Flatbuffers**: We represent our models using [the standard flatbuffer schema used by the rest of TensorFlow Lite](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/schema/schema.fbs), with the difference that we always keep it in read-only program memory (typically flash) rather than relying on having a file system to read it from. This is a good fit because flatbuffer's serialized format is designed to be mapped into memory without requiring any extra memory allocations or modifications to access it. All of the functions to read model values work directly on the serialized bytes, and large sections of data like weights are directly accessible as sequential C-style arrays of their data type, with no strides or unpacking needed. We do get a lot of value from using flatbuffers, but there is a cost in complexity. The flat buffer library code is all inline [inside the main headers](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/schema/schema_generated.h), but it isn't straightforward to inspect their implementations, and the model data structures aren't easy to comprehend from the debugger. The header for the schema itself also has to be periodically updated when new information is added to the file format, though we try to handle that transparently for most developers by checking in a pre-generated version.
-
-- **Code Duplication**: Some of the code in this prototype largely duplicates the logic in other parts of the TensorFlow Lite code base, for example the operator wrappers. We've tried to keep share as much as we can between the two interpreters, but there are some assumptions built into the original runtime that make this difficult. We'll be working on modularizing the main interpreter so that we can move to an entirely shared system.
-
-This initial preview release is designed to get early feedback, and is not intended to be a final product. It only includes enough operations to run a simple keyword recognition model, and the implementations are not optimized. We're hoping this will be a good way to get feedback and collaborate to improve the framework.
-
-## Getting Started
-
-Building requires a Linux or OS X machine.
-
- - Open a terminal
- - Download the TensorFlow source with `git clone https://github.com/tensorflow`
- - Enter the source root directory by running `cd tensorflow`
- - Download the dependencies by running `tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh`. This may take a few minutes
- - Build and test the library with `make -f tensorflow/lite/experimental/micro/tools/make/Makefile test`
+This an experimental port of TensorFlow Lite aimed at micro controllers and
+other devices with only kilobytes of memory. It doesn't require any operating
+system support, any standard C or C++ libraries, or dynamic memory allocation,
+so it's designed to be portable even to 'bare metal' systems. The core runtime
+fits in 16KB on a Cortex M3, and with enough operators to run a speech keyword
+detection model, takes up a total of 22KB.
+
+## Table of Contents
+
+-   [Getting Started](#getting-started)
+
+    *   [Getting Started with Portable Reference Code](#getting-started-with-portable-reference-code)
+    *   [Building Portable Reference Code using Make](#building-portable-reference-code-using-make)
+    *   [Building for the "Blue Pill" STM32F103 using Make](#building-for-the-blue-pill-stm32f103-using-make)
+    *   [Building for "Hifive1" SiFive FE310 development board using Make](#building-for-hifive1-sifive-fe310-development-board-using-make)
+    *   [Building for Ambiq Micro Apollo3Blue EVB using Make](#building-for-ambiq-micro-apollo3blue-evb-using-make)
+        *   [Additional Apollo3 Instructions](#additional-apollo3-instructions)
+    *   [Building for the Eta Compute ECM3531 EVB using Make](#Building-for-the-Eta-Compute-ECM3531-EVB-using-Make)
+
+-   [Goals](#goals)
+
+-   [Generating Project Files](#generating-project-#files)
+
+-   [How to Port TensorFlow Lite Micro to a New Platform](#how-to-port-tensorflow-lite-micro-to-a-new-platform)
+
+    *   [Requirements](#requirements)
+    *   [Getting Started](getting-started)
+    *   [Troubleshooting](#troubleshooting)
+    *   [Optimizing for your Platform](#optimizing-for-your-platform)
+    *   [Code Module Organization](#code-module-organization)
+    *   [Working with Generated Projects](#working-with-generated-projects)
+    *   [Supporting a Platform with Makefiles](#supporting-a-platform-with-makefiles)
+    *   [Supporting a Platform with Emulation Testing](#supporting-a-platform-with-emulation-testing)
+    *   [Implementing More Optimizations](#implementing-more-optimizations)
+
+# Getting Started
+
+One of the challenges of embedded software development is that there are a lot
+of different architectures, devices, operating systems, and build systems. We
+aim to support as many of the popular combinations as we can, and make it as
+easy as possible to add support for others.
+
+If you're a product developer, we have build instructions or pre-generated
+project files that you can download for the following platforms:
+
+Device                                                                                         | Mbed                                                                           | Keil                                                                           | Make/GCC
+---------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------ | ------------------------------------------------------------------------------ | --------
+[STM32F746G Discovery Board](https://www.st.com/en/evaluation-tools/32f746gdiscovery.html)     | [Download](https://drive.google.com/open?id=1OtgVkytQBrEYIpJPsE8F6GUKHPBS3Xeb) | -                                                                              | [Download](https://drive.google.com/open?id=1u46mTtAMZ7Y1aD-He1u3R8AE4ZyEpnOl)
+["Blue Pill" STM32F103-compatible development board](https://github.com/google/stm32_bare_lib) | -                                                                              | -                                                                              | [Instructions](#building-for-the-blue-pill-stm32f103-using-make)
+[Ambiq Micro Apollo3Blue EVB using Make](https://ambiqmicro.com/apollo-ultra-low-power-mcus/)  | -                                                                              | -                                                                              | [Instructions](#building-for-ambiq-micro-apollo3blue-evb-using-make)
+[Generic Keil uVision Projects](http://www2.keil.com/mdk5/uvision/)                            | -                                                                              | [Download](https://drive.google.com/open?id=1Lw9rsdquNKObozClLPoE5CTJLuhfh5mV) | -
+[Eta Compute ECM3531 EVB](https://etacompute.com/)                                             | -                                                                              | -                                                                              | [Instructions](#Building-for-the-Eta-Compute-ECM3531-EVB-using-Make)
+
+If your device is not yet supported, it may not be too hard to add support. You
+can learn about that process
+[here](#how-to-port-tensorflow-lite-micro-to-a-new-platform). We're looking
+forward to getting your help expanding this table!
+
+## Getting Started with Portable Reference Code
+
+If you don't have a particular microcontroller platform in mind yet, or just
+want to try out the code before beginning porting, the easiest way to begin is
+by
+[downloading the platform-agnostic reference code](https://drive.google.com/open?id=1cawEQAkqquK_SO4crReDYqf_v7yAwOY8).
+You'll see a series of folders inside the archive, with each one containing just
+the source files you need to build one binary. There is a simple Makefile for
+each folder, but you should be able to load the files into almost any IDE and
+build them. There's also a [Visual Studio Code](https://code.visualstudio.com/) project file already set up, so
+you can easily explore the code in a cross-platform IDE.
+
+## Building Portable Reference Code using Make
+
+It's easy to build portable reference code directly from GitHub using make if
+you're on a Linux or OS X machine.
+
+-   Open a terminal
+-   Download the TensorFlow source with `git clone
+    https://github.com/tensorflow/tensorflow.git`
+-   Enter the source root directory by running `cd tensorflow`
+-   Download the dependencies by running
+    `tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh`.
+    This may take a few minutes
+-   Build and test the library with `make -f
+    tensorflow/lite/experimental/micro/tools/make/Makefile test`
 
 You should see a series of compilation steps, followed by `~~~ALL TESTS
 PASSED~~~` for the various tests of the code that it will run. If there's an
 error, you should get an informative message from make about what went wrong.
 
-These tests are all built as simple binaries with few dependencies, so you can run them manually. For example, here's how to run the depthwise convolution test, and its output:
+These tests are all built as simple binaries with few dependencies, so you can
+run them manually. For example, here's how to run the depthwise convolution
+test, and its output:
 
 ```
 tensorflow/lite/experimental/micro/tools/make/gen/linux_x86_64/bin/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test
@@ -53,7 +104,9 @@ Testing SimpleTestReluQuantized
 ~ALL TESTS PASSED~~~
 ```
 
-Looking at the [depthwise_conv_test.cc](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test.cc) code, you'll see a sequence that looks like this:
+Looking at the
+[depthwise_conv_test.cc](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test.cc)
+code, you'll see a sequence that looks like this:
 
 ```
 ...
@@ -74,19 +127,41 @@ output, and the test harness that runs the binary during the make process knows
 that everything ran correctly. If there's an error, the lack of the expected
 string lets the harness know that the test failed.
 
-So, why are we running tests in this complicated way? So far, we've been building binaries that run locally on the Mac OS or Linux machine you're building on, but this approach becomes important when we're targeting simple micro controller devices.
-
-## Building for the "Blue Pill" STM32F103
-
-The goal of this library is to enable machine learning on resource-constrained micro controllers and DSPs, and as part of that we've targeted the ["Blue Pill" STM32F103-compatible development board](https://github.com/google/stm32_bare_lib) as a cheap and popular platform. It only has 20KB of RAM and 64KB of flash, so it's a good device to ensure we can run efficiently on small chips.
-
-It's fairly easy to [buy and wire up a physical board](https://github.com/google/stm32_bare_lib#wiring-up-your-blue-pill), but even if you don't have an actual device, the [Renode project](https://renode.io/) makes it easy to run a faithful emulation on your desktop machine. You'll need [Docker](https://www.docker.com/) installed, but once you have that set up, try running the following command:
-
-`make -f tensorflow/lite/experimental/micro/tools/make/Makefile TARGET=bluepill test`
-
-You should see a similar set of outputs as you did in the previous section, with the addition of some extra Docker logging messages. These are because we're using Docker to run the Renode micro controller emulation tool, and the tests themselves are being run on a simulated STM32F103 device. The communication channels between an embedded device and the host are quite limited, so the test harness looks at the output of the debug log to see if tests have passed, just as it did in the previous section. This makes it a very flexible way to run cross-platform tests, even when a platform has no operating system facilities, as long as it can output debugging text logs.
-
-To understand what's happening here, try running the same depthwise convolution test, but through the emulated device test harness, with the following command:
+So, why are we running tests in this complicated way? So far, we've been
+building binaries that run locally on the Mac OS or Linux machine you're
+building on, but this approach becomes important when we're targeting simple
+micro controller devices.
+
+## Building for the "Blue Pill" STM32F103 using Make
+
+The goal of this library is to enable machine learning on resource-constrained
+micro controllers and DSPs, and as part of that we've targeted the
+["Blue Pill" STM32F103-compatible development board](https://github.com/google/stm32_bare_lib)
+as a cheap and popular platform. It only has 20KB of RAM and 64KB of flash, so
+it's a good device to ensure we can run efficiently on small chips.
+
+It's fairly easy to
+[buy and wire up a physical board](https://github.com/google/stm32_bare_lib#wiring-up-your-blue-pill),
+but even if you don't have an actual device, the
+[Renode project](https://renode.io/) makes it easy to run a faithful emulation
+on your desktop machine. You'll need [Docker](https://www.docker.com/)
+installed, but once you have that set up, try running the following command:
+
+`make -f tensorflow/lite/experimental/micro/tools/make/Makefile TARGET=bluepill
+test`
+
+You should see a similar set of outputs as you did in the previous section, with
+the addition of some extra Docker logging messages. These are because we're
+using Docker to run the Renode micro controller emulation tool, and the tests
+themselves are being run on a simulated STM32F103 device. The communication
+channels between an embedded device and the host are quite limited, so the test
+harness looks at the output of the debug log to see if tests have passed, just
+as it did in the previous section. This makes it a very flexible way to run
+cross-platform tests, even when a platform has no operating system facilities,
+as long as it can output debugging text logs.
+
+To understand what's happening here, try running the same depthwise convolution
+test, but through the emulated device test harness, with the following command:
 
 ```
 tensorflow/lite/experimental/micro/testing/test_bluepill_binary.sh \
@@ -115,7 +190,7 @@ LOGS:
 03:27:32.4834 [DEBUG] cpu.uartSemihosting: [+0.18ms host +0s virt 0s virt from start]   Testing SimpleTestReluQuantized
 03:27:32.4838 [DEBUG] cpu.uartSemihosting: [+0.4ms host +0s virt 0s virt from start]   4/4 tests passed
 03:27:32.4839 [DEBUG] cpu.uartSemihosting: [+41µs host +0s virt 0s virt from start]   ~~~ALL TESTS PASSED~~~
-03:27:32.4839 [DEBUG] cpu.uartSemihosting: [+5µs host +0s virt 0s virt from start]   
+03:27:32.4839 [DEBUG] cpu.uartSemihosting: [+5µs host +0s virt 0s virt from start]
 ...
 tensorflow/lite/experimental/micro/tools/make/gen/bluepill_cortex-m3/bin/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test: PASS
 ```
@@ -126,3 +201,731 @@ debug logs here, along with the magic string `~~~ALL TESTS PASSED~~~`. This is
 the exact same code as before, just compiled and run on the STM32F103 rather
 than your desktop. We hope that the simplicity of this testing approach will
 help make adding support for new platforms as easy as possible.
+
+## Building for "Hifive1" SiFive FE310 development board
+
+We've targeted the
+["HiFive1" Arduino-compatible development board](https://www.sifive.com/boards/hifive1)
+as a test platform for RISC-V MCU.
+
+Similar to Blue Pill setup, you will need Docker installed. The binary can be
+executed on either HiFive1 board or emulated using
+[Renode project](https://renode.io/) on your desktop machine.
+
+The following instructions builds and transfers the source files to the Docker
+`docker build -t riscv_build \ -f
+{PATH_TO_TENSORFLOW_ROOT_DIR}/tensorflow/lite/experimental/micro/testing/Dockerfile.riscv
+\ {PATH_TO_TENSORFLOW_ROOT_DIR}/tensorflow/lite/experimental/micro/testing/`
+
+You should see output that looks something like this:
+
+```
+Sending build context to Docker daemon  28.16kB
+Step 1/4 : FROM antmicro/renode:latest
+ ---> 19c08590e817
+Step 2/4 : LABEL maintainer="Pete Warden <petewarden@google.com>"
+ ---> Using cache
+ ---> 5a7770d3d3f5
+Step 3/4 : RUN apt-get update
+ ---> Using cache
+ ---> b807ab77eeb1
+Step 4/4 : RUN apt-get install -y curl git unzip make g++
+ ---> Using cache
+ ---> 8da1b2aa2438
+Successfully built 8da1b2aa2438
+Successfully tagged riscv_build:latest
+```
+
+Building micro_speech_test binary
+
+-   Launch the Docker that we just created using: `docker run -it-v
+    /tmp/copybara_out:/workspace riscv_build:latest bash`
+-   Enter the source root directory by running `cd /workspace`
+-   Download the dependencies by running
+    `./tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh`.
+    This may take a few minutes.
+-   Set the path to RISC-V tools: `export
+    PATH=${PATH}:/workspace/tensorflow/lite/experimental/micro/tools/make/downloads/riscv_toolchain/bin/`
+-   Build the binary: `make -f
+    tensorflow/lite/experimental/micro/tools/make/Makefile TARGET=riscv32_mcu`
+
+Launching Renode to test the binary, currently this set up is not automated.
+
+-   Execute the binary on Renode: `renode -P 5000 --disable-xwt -e 's
+    @/workspace/tensorflow/lite/experimental/micro/testing/sifive_fe310.resc'`
+
+You should see the following log with the magic string `~~~ALL TEST PASSED~~~`:
+
+```
+02:25:22.2059 [DEBUG] uart0: [+17.25s host +80ms virt 80ms virt from start] core freq at 0 Hz
+02:25:22.2065 [DEBUG] uart0: [+0.61ms host +0s virt 80ms virt from start]   Testing TestInvoke
+02:25:22.4243 [DEBUG] uart0: [+0.22s host +0.2s virt 0.28s virt from start]   Ran successfully
+02:25:22.4244 [DEBUG] uart0: [+42µs host +0s virt 0.28s virt from start]
+02:25:22.4245 [DEBUG] uart0: [+0.15ms host +0s virt 0.28s virt from start]   1/1 tests passed
+02:25:22.4247 [DEBUG] uart0: [+62µs host +0s virt 0.28s virt from start]   ~~~ALL TESTS PASSED~~~
+02:25:22.4251 [DEBUG] uart0: [+8µs host +0s virt 0.28s virt from start]
+02:25:22.4252 [DEBUG] uart0: [+0.39ms host +0s virt 0.28s virt from start]
+02:25:22.4253 [DEBUG] uart0: [+0.16ms host +0s virt 0.28s virt from start]   Progam has exited with code:0x00000000
+```
+
+## Building for Ambiq Micro Apollo3Blue EVB using Make
+
+Follow these steps to get the pushbutton yes/no example working on Apollo 3:
+
+1.  Make sure to run the "Building Portable Reference Code using Make" section
+    before performing the following steps
+2.  The Ambiq Micro SDK is downloaded into
+    `tensorflow/lite/experimental/micro/tools/make/downloads` by
+    'download_dependencies.sh'.
+3.  Compile the project with the following command: make -f
+    tensorflow/lite/experimental/micro/tools/make/Makefile TARGET=apollo3evb
+    pushbutton_cmsis_speech_test_bin
+4.  Install [Segger JLink tools](https://www.segger.com/downloads/jlink/)
+5.  Connect the Apollo3 EVB (with mic shield in slot 3 of Microbus Shield board)
+    to the computer and power it on.
+6.  Start the GDB server in a new terminal with the following command:
+    JLinkGDBServer -select USB -device AMA3B1KK-KBR -endian little -if SWD
+    -speed 1000 -noir -noLocalhostOnly
+    1.  The command has run successfully if you see the message "Waiting for GDB
+        connection"
+7.  Back in the original terminal, run the program via the debugger
+    1.  Navigate to
+        tensorflow/lite/experimental/micro/examples/micro_speech/apollo3
+    2.  Start gdb by entering the following command: arm-none-eabi-gdb
+    3.  Run the command script by entering the following command: source
+        pushbutton_cmsis_scores.cmd. This script does the following:
+        1.  Load the binary created in step 6
+        2.  Set a breakpoint after inference scores have been computed
+        3.  Tell the debugger what variables should be printed out at this
+            breakpoint
+        4.  Begin program execution
+        5.  Press Ctrl+c to exit
+    4.  Press BTN2. An LED will flash for 1 second. Speak your utterance during
+        this one second
+    5.  The debugger will print out four numbers. They are the probabilites for
+        1.  no speech
+        2.  unkown speech
+        3.  yes
+        4.  no
+    6.  The EVB LEDs will indicate detection.
+        1.  LED0 (rightmost LED) - ON when capturing 1sec of audio
+        2.  LED1 - ON when detecting silence
+        3.  LED2 - ON when detecting UNKNOWN utterance
+        4.  LED3 - ON when detecting YES utterance
+        5.  LED4 (leftmost LED) - ON when detecting NO utterance
+
+### Additional Apollo3 Instructions
+
+To flash a part with JFlash Lite, do the following: 
+
+1. At the command line: JFlashLiteExe 
+2. Device = AMA3B1KK-KBR 
+3. Interface = SWD at 1000 kHz 
+4. Data file = `tensorflow/lite/experimental/micro/tools/make/gen/apollo3evb_cortex-m4/bin/pushbutton_cmsis_speech_test.bin`
+5. Prog Addr = 0x0000C000
+
+## Building for the Eta Compute ECM3531 EVB using Make
+
+1.  Follow the instructions at
+    [Tensorflow Micro Speech](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/micro/examples/micro_speech#getting-started)
+    to down load the Tensorflow source code and the support libraries \(but do
+    not run the make command shown there.\)
+2.  Download the Eta Compute SDK, version 0.0.17. Contact info@etacompute.com
+3.  You will need the the Arm compiler arm-none-eabi-gcc, version 7.3.1
+    20180622, release ARM/embedded-7-branch revision 261907, 7-2018-q2-update.
+    This compiler is downloaded when you run the
+    tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh
+    script.
+4.  Edit the file
+    tensorflow/lite/experimental/micro/tools/make/targets/ecm3531_makefile.inc
+    so that the variables ETA_SDK and GCC_ARM point to the correct directories.
+5.  Compile the code with the command \
+    &nbsp;&nbsp;&nbsp;&nbsp;make -f
+    tensorflow/lite/experimental/micro/tools/make/Makefile TARGET=ecm3531
+    TAGS="CMSIS" test \
+    This will produce a set of executables in the
+    tensorflow/lite/experimental/micro/tools/make/gen/ecm3531_cortex-m3/bin
+    directory.
+6.  To load an executable into SRAM \
+    &nbsp;&nbsp;&nbsp;&nbsp;Start ocd \
+    &nbsp;&nbsp;&nbsp;&nbsp;cd
+    tensorflow/lite/experimental/micro/tools/make/targets/ecm3531 \
+    &nbsp;&nbsp;&nbsp;&nbsp;./load_program name_of_executable, for e.g.,
+    ./load_program audio_provider_test \
+    &nbsp;&nbsp;&nbsp;&nbsp;Start PuTTY \(Connection type = Serial, Speed =
+    11520, Data bits = 8, Stop bits = 1, Parity = None\) \
+    The following output should appear: \
+    Testing TestAudioProvider \
+    Testing TestTimer \
+    2/2 tests passed \
+    \~\~\~ALL TESTS PASSED\~\~\~ \
+    Execution time \(msec\) = 7
+7.  To load into flash \
+    &nbsp;&nbsp;&nbsp;&nbsp;Edit the variable ETA_LDS_FILE in
+    tensorflow/lite/experimental/micro/tools/&nbsp;&nbsp;make/targets/ecm3531_makefile.inc
+    to point to the ecm3531_flash.lds file \
+    &nbsp;&nbsp;&nbsp;&nbsp;Recompile \( make -f
+    tensorflow/lite/experimental/micro/tools/make/Makefile TARGET=ecm3531
+    TAGS="CMSIS" test\) \
+    &nbsp;&nbsp;&nbsp;&nbsp;cd
+    tensorflow/lite/experimental/micro/tools/make/targets/ecm3531 \
+    &nbsp;&nbsp;&nbsp;&nbsp;./flash_program executable_name to load into flash.
+
+## Goals
+
+The design goals are for the framework to be:
+
+-   **Readable**: We want embedded software engineers to be able to understand
+    what's required to run ML inference without having to study research papers.
+    We've tried to keep the code base small, modular, and have reference
+    implementations of all operations to help with this.
+
+-   **Easy to modify**: We know that there are a lot of different platforms and
+    requirements in the embedded world, and we don't expect to cover all of them
+    in one framework. Instead, we're hoping that it can be a good starting point
+    for developers to build on top of to meet their own needs. For example, we
+    tried to make it easy to replace the implementations of key computational
+    operators that are often crucial for performance, without having to touch
+    the data flow and other runtime code. We want it to make more sense to use
+    our workflow to handle things like model import and less-important
+    operations, and customize the parts that matter, rather than having to
+    reimplement everything in your own engine.
+
+-   **Well-tested**: If you're modifying code, you need to know if your changes
+    are correct. Having an easy way to test lets you develop much faster. To
+    help there, we've written tests for all the components, and we've made sure
+    that the tests can be run on almost any platform, with no dependencies apart
+    from the ability to log text to a debug console somewhere. We also provide
+    an easy way to run all the tests on-device as part of an automated test
+    framework, and we use qemu/Renode emulation so that tests can be run even
+    without physical devices present.
+
+-   **Easy to integrate**: We want to be as open a system as possible, and use
+    the best code available for each platform. To do that, we're going to rely
+    on projects like
+    [CMSIS-NN](https://www.keil.com/pack/doc/CMSIS/NN/html/index.html),
+    [uTensor](https://github.com/uTensor/uTensor), and other vendor libraries to
+    handle as much performance-critical code as possible. We know that there are
+    an increasing number of options to accelerate neural networks on
+    microcontrollers, so we're aiming to be a good host for deploying those
+    hardware technologies too.
+
+-   **Compatible**: We're using the same file schema, interpreter API, and
+    kernel interface as regular TensorFlow Lite, so we leverage the large
+    existing set of tools, documentation, and examples for the project. The
+    biggest barrier to deploying ML models is getting them from a training
+    environment into a form that's easy to run inference on, so we see reusing
+    this rich ecosystem as being crucial to being easily usable. We also hope to
+    integrate this experimental work back into the main codebase in the future.
+
+To meet those goals, we've made some tradeoffs:
+
+-   **Simple C++**: To help with readability, our code is written in a modern
+    version of C++, but we generally treat it as a "better C", rather relying on
+    more complex features such as template meta-programming. As mentioned
+    earlier, we avoid any use of dynamic memory allocation (new/delete) or the
+    standard C/C++ libraries, so we believe this should still be fairly
+    portable. It does mean that some older devices with C-only toolchains won't
+    be supported, but we're hoping that the reference operator implementations
+    (which are simple C-like functions) can still be useful in those cases. The
+    interfaces are also designed to be C-only, so it should be possible to
+    integrate the resulting library with pure C projects.
+
+-   **Interpreted**: Code generation is a popular pattern for embedded code,
+    because it gives standalone code that's easy to modify and step through, but
+    we've chosen to go with an interpreted approach. In our internal
+    microcontroller work we've found that using an extremely stripped-down
+    interpreter with almost no dependencies gives us a lot of the same
+    advantages, but is easier to maintain. For example, when new updates come
+    out for the underlying library, you can just merge your local modifications
+    in a single step, rather than having to regenerate new code and then patch
+    in any changes you subsequently made. The coarse granularity of the
+    interpreted primitives means that each operation call typically takes
+    hundreds of thousands of instruction cycles at least, so we don't see
+    noticeable performance gains from avoiding what's essentially a single
+    switch statement at the interpreter level to call each operation. We're
+    still working on improving the packaging though, for example we're
+    considering having the ability to snapshot all the source files and headers
+    used for a particular model, being able to compile the code and data
+    together as a library, and then access it through a minimal set of C
+    interface calls which hide the underlying complexity.
+
+-   **Flatbuffers**: We represent our models using
+    [the standard flatbuffer schema used by the rest of TensorFlow Lite](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/schema/schema.fbs),
+    with the difference that we always keep it in read-only program memory
+    (typically flash) rather than relying on having a file system to read it
+    from. This is a good fit because flatbuffer's serialized format is designed
+    to be mapped into memory without requiring any extra memory allocations or
+    modifications to access it. All of the functions to read model values work
+    directly on the serialized bytes, and large sections of data like weights
+    are directly accessible as sequential C-style arrays of their data type,
+    with no strides or unpacking needed. We do get a lot of value from using
+    flatbuffers, but there is a cost in complexity. The flat buffer library code
+    is all inline
+    [inside the main headers](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/schema/schema_generated.h),
+    but it isn't straightforward to inspect their implementations, and the model
+    data structures aren't easy to comprehend from the debugger. The header for
+    the schema itself also has to be periodically updated when new information
+    is added to the file format, though we try to handle that transparently for
+    most developers by checking in a pre-generated version.
+
+-   **Code Duplication**: Some of the code in this prototype largely duplicates
+    the logic in other parts of the TensorFlow Lite code base, for example the
+    operator wrappers. We've tried to keep share as much as we can between the
+    two interpreters, but there are some assumptions built into the original
+    runtime that make this difficult. We'll be working on modularizing the main
+    interpreter so that we can move to an entirely shared system.
+
+This initial preview release is designed to get early feedback, and is not
+intended to be a final product. It only includes enough operations to run a
+simple keyword recognition model, and the implementations are not optimized.
+We're hoping this will be a good way to get feedback and collaborate to improve
+the framework.
+
+## Generating Project Files
+
+It's not always easy or convenient to use a makefile-based build process,
+especially if you're working on a product that uses a different IDE for the rest
+of its code. To address that, it's possible to generate standalone project
+folders for various popular build systems. These projects are self-contained,
+with only the headers and source files needed by a particular binary, and
+include project files to make loading them into an IDE easy. These can be
+auto-generated for any target you can compile using the main Make system, using
+a command like this (making sure you've run `download_dependencies.sh` first):
+
+```
+make -f tensorflow/lite/experimental/micro/tools/make/Makefile TARGET=mbed TAGS="CMSIS disco_f746ng" generate_micro_speech_mbed_project
+```
+
+This will create a folder in
+`tensorflow/lite/experimental/micro/tools/make/gen/mbed_cortex-m4/prj/micro_speech_main_test/mbed`
+that contains the source and header files, some Mbed configuration files, and a
+README. You should then be able to copy this directory to another machine, and
+use it just like any other Mbed project. There's more information about project
+files [below](#working-with-generated-projects).
+
+## How to Port TensorFlow Lite Micro to a New Platform
+
+Are you a hardware or operating system provider looking to run machine learning
+on your platform? We're keen to help, and we've had experience helping other
+teams do the same thing, so here are our recommendations.
+
+### Requirements
+
+Since the core neural network operations are pure arithmetic, and don't require
+any I/O or other system-specific functionality, the code doesn't have to have
+many dependencies. We've tried to enforce this, so that it's as easy as possible
+to get TensorFlow Lite Micro running even on 'bare metal' systems without an OS.
+Here are the core requirements that a platform needs to run the framework:
+
+-   C/C++ compiler capable of C++11 compatibility. This is probably the most
+    restrictive of the requirements, since C++11 is not as widely adopted in the
+    embedded world as it is elsewhere. We made the decision to require it since
+    one of the main goals of TFL Micro is to share as much code as possible with
+    the wider TensorFlow codebase, and since that relies on C++11 features, we
+    need compatibility to achieve it. We only use a small, sane, subset of C++
+    though, so don't worry about having to deal with template metaprogramming or
+    similar challenges!
+
+-   Debug logging. The core network operations don't need any I/O functions, but
+    to be able to run tests and tell if they've worked as expected, the
+    framework needs some way to write out a string to some kind of debug
+    console. This will vary from system to system, for example on Linux it could
+    just be `fprintf(stderr, debug_string)` whereas an embedded device might
+    write the string out to a specified UART. As long as there's some mechanism
+    for outputting debug strings, you should be able to use TFL Micro on that
+    platform.
+
+-   Math library. The C standard `libm.a` library is needed to handle some of
+    the mathematical operations used to calculate neural network results.
+
+-   Global variable initialization. We do use a pattern of relying on global
+    variables being set before `main()` is run in some places, so you'll need to
+    make sure your compiler toolchain
+
+And that's it! You may be wondering about some other common requirements that
+are needed by a lot of non-embedded software, so here's a brief list of things
+that aren't necessary to get started with TFL Micro on a new platform:
+
+-   Operating system. Since the only platform-specific function we need is
+    `DebugLog()`, there's no requirement for any kind of Posix or similar
+    functionality around files, processes, or threads.
+
+-   C or C++ standard libraries. The framework tries to avoid relying on any
+    standard library functions that require linker-time support. This includes
+    things like string functions, but still allows us to use headers like
+    `stdtypes.h` which typically just define constants and typedefs.
+    Unfortunately this distinction isn't officially defined by any standard, so
+    it's possible that different toolchains may decide to require linked code
+    even for the subset we use, but in practice we've found it's usually a
+    pretty obvious decision and stable over platforms and toolchains.
+
+-   Dynamic memory allocation. All the TFL Micro code avoids dynamic memory
+    allocation, instead relying on local variables on the stack in most cases,
+    or global variables for a few situations. These are all fixed-size, which
+    can mean some compile-time configuration to ensure there's enough space for
+    particular networks, but does avoid any need for a heap and the
+    implementation of `malloc\new` on a platform.
+
+-   Floating point. Eight-bit integer arithmetic is enough for inference on many
+    networks, so if a model sticks to these kind of quantized operations, no
+    floating point instructions should be required or executed by the framework.
+
+### Getting Started
+
+We recommend that you start trying to compile and run one of the simplest tests
+in the framework as your first step. The full TensorFlow codebase can seem
+overwhelming to work with at first, so instead you can begin with a collection
+of self-contained project folders that only include the source files needed for
+a particular test or executable. You can find a set of pre-generated projects
+[here](https://drive.google.com/open?id=1cawEQAkqquK_SO4crReDYqf_v7yAwOY8).
+
+As mentioned above, the one function you will need to implement for a completely
+new platform is debug logging. If your device is just a variation on an existing
+platform you may be able to reuse code that's already been written. To
+understand what's available, begin with the default reference implementation at
+[tensorflow/lite/experimental/micro/debug_log.cc](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/debug_log.cc]),
+which uses fprintf and stderr. If your platform has this level of support for
+the C standard library in its toolchain, then you can just reuse this.
+Otherwise, you'll need to do some research into how your platform and device can
+communicate logging statements to the outside world. As another example, take a
+look at
+[the Mbed version of `DebugLog()`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/mbed/debug_log.cc),
+which creates a UART object and uses it to output strings to the host's console
+if it's connected.
+
+Begin by navigating to the micro_error_reporter_test folder in the pregenerated
+projects you downloaded. Inside here, you'll see a set of folders containing all
+the source code you need. If you look through them, you should find a total of
+around 60 C or C++ files that compiled together will create the test executable.
+There's an example makefile in the directory that lists all of the source files
+and include paths for the headers. If you're building on a Linux or MacOS host
+system, you may just be able to reuse that same makefile to cross-compile for
+your system, as long as you swap out the `CC` and `CXX` variables from their
+defaults, to point to your cross compiler instead (for example
+`arm-none-eabi-gcc` or `riscv64-unknown-elf-gcc`). Otherwise, set up a project
+in the build system you are using. It should hopefully be fairly
+straightforward, since all of the source files in the folder need to be
+compiled, so on many IDEs you can just drag the whole lot in. Then you need to
+make sure that C++11 compatibility is turned on, and that the right include
+paths (as mentioned in the makefile) have been added.
+
+You'll see the default `DebugLog()` implementation in
+'tensorflow/lite/experimental/micro/debug_log.cc' inside the
+micro_error_reporter_test folder. Modify that file to add the right
+implementation for your platform, and then you should be able to build the set
+of files into an executable. Transfer that executable to your target device (for
+example by flashing it), and then try running it. You should see output that
+looks something like this:
+
+```
+Number: 42
+Badly-formed format string
+Another  badly-formed  format string
+~~ALL TESTS PASSED~~~
+```
+
+If not, you'll need to debug what went wrong, but hopefully with this small
+starting project it should be manageable.
+
+### Troubleshooting
+
+When we've been porting to new platforms, it's often been hard to figure out
+some of the fundamentals like linker settings and other toolchain setup flags.
+If you are having trouble, see if you can find a simple example program for your
+platform, like one that just blinks an LED. If you're able to build and run that
+successfully, then start to swap in parts of the TF Lite Micro codebase to that
+working project, taking it a step at a time and ensuring it's still working
+after every change. For example, a first step might be to paste in your
+`DebugLog()` implementation and call `DebugLog("Hello World!")` from the main
+function.
+
+Another common problem on embedded platforms is the stack size being too small.
+Mbed defaults to 4KB for the main thread's stack, which is too small for most
+models since TensorFlow Lite allocates buffers and other data structures that
+require more memory. The exact size will depend on which model you're running,
+but try increasing it if you are running into strange corruption issues that
+might be related to stack overwriting.
+
+### Optimizing for your Platform
+
+The default reference implementations in TensorFlow Lite Micro are written to be
+portable and easy to understand, not fast, so you'll want to replace performance
+critical parts of the code with versions specifically tailored to your
+architecture. The framework has been designed with this in mind, and we hope the
+combination of small modules and many tests makes it as straightforward as
+possible to swap in your own code a piece at a time, ensuring you have a working
+version at every step. To write specialized implementations for a platform, it's
+useful to understand how optional components are handled inside the build
+system.
+
+### Code Module Organization
+
+We have adopted a system of small modules with platform-specific implementations
+to help with portability. Every module is just a standard `.h` header file
+containing the interface (either functions or a class), with an accompanying
+reference implementation in a `.cc` with the same name. The source file
+implements all of the code that's declared in the header. If you have a
+specialized implementation, you can create a folder in the same directory as the
+header and reference source, name it after your platform, and put your
+implementation in a `.cc` file inside that folder. We've already seen one
+example of this, where the Mbed and Bluepill versions of `DebugLog()` are inside
+[mbed](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/micro/mbed)
+and
+[bluepill](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/micro/bluepill)
+folders, children of the
+[same directory](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/micro)
+where the stdio-based
+[`debug_log.cc`](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/micro/debug_log.cc)
+reference implementation is found.
+
+The advantage of this approach is that we can automatically pick specialized
+implementations based on the current build target, without having to manually
+edit build files for every new platform. It allows incremental optimizations
+from a always-working foundation, without cluttering the reference
+implementations with a lot of variants.
+
+To see why we're doing this, it's worth looking at the alternatives. TensorFlow
+Lite has traditionally used preprocessor macros to separate out some
+platform-specific code within particular files, for example:
+
+```
+#ifndef USE_NEON
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#define USE_NEON
+#include <arm_neon.h>
+#endif
+```
+
+There’s also a tradition in gemmlowp of using file suffixes to indicate
+platform-specific versions of particular headers, with kernel_neon.h being
+included by kernel.h if `USE_NEON` is defined. As a third variation, kernels are
+separated out using a directory structure, with
+tensorflow/lite/kernels/internal/reference containing portable implementations,
+and tensorflow/lite/kernels/internal/optimized holding versions optimized for
+NEON on Arm platforms.
+
+These approaches are hard to extend to multiple platforms. Using macros means
+that platform-specific code is scattered throughout files in a hard-to-find way,
+and can make following the control flow difficult since you need to understand
+the macro state to trace it. For example, I temporarily introduced a bug that
+disabled NEON optimizations for some kernels when I removed
+tensorflow/lite/kernels/internal/common.h from their includes, without realizing
+it was where USE_NEON was defined!
+
+It’s also tough to port to different build systems, since figuring out the right
+combination of macros to use can be hard, especially since some of them are
+automatically defined by the compiler, and others are only set by build scripts,
+often across multiple rules.
+
+The approach we are using extends the file system approach that we use for
+kernel implementations, but with some specific conventions:
+
+-   For each module in TensorFlow Lite, there will be a parent directory that
+    contains tests, interface headers used by other modules, and portable
+    implementations of each part.
+-   Portable means that the code doesn’t include code from any libraries except
+    flatbuffers, or other TF Lite modules. You can include a limited subset of
+    standard C or C++ headers, but you can’t use any functions that require
+    linking against those libraries, including fprintf, etc. You can link
+    against functions in the standard math library, in <math.h>.
+-   Specialized implementations are held inside subfolders of the parent
+    directory, named after the platform or library that they depend on. So, for
+    example if you had my_module/foo.cc, a version that used RISC-V extensions
+    would live in my_module/riscv/foo.cc. If you had a version that used the
+    CMSIS library, it should be in my_module/cmsis/foo.cc.
+-   These specialized implementations should completely replace the top-level
+    implementations. If this involves too much code duplication, the top-level
+    implementation should be split into smaller files, so only the
+    platform-specific code needs to be replaced.
+-   There is a convention about how build systems pick the right implementation
+    file. There will be an ordered list of 'tags' defining the preferred
+    implementations, and to generate the right list of source files, each module
+    will be examined in turn. If a subfolder with a tag’s name contains a .cc
+    file with the same base name as one in the parent folder, then it will
+    replace the parent folder’s version in the list of build files. If there are
+    multiple subfolders with matching tags and file names, then the tag that’s
+    latest in the ordered list will be chosen. This allows us to express “I’d
+    like generically-optimized fixed point if it’s available, but I’d prefer
+    something using the CMSIS library” using the list 'fixed_point cmsis'. These
+    tags are passed in as `TAGS="<foo>"` on the command line when you use the
+    main Makefile to build.
+-   There is an implicit “reference” tag at the start of every list, so that
+    it’s possible to support directory structures like the current
+    tensorflow/kernels/internal where portable implementations are held in a
+    “reference” folder that’s a sibling to the NEON-optimized folder.
+-   The headers for each unit in a module should remain platform-agnostic, and
+    be the same for all implementations. Private headers inside a sub-folder can
+    be used as needed, but shouldn’t be referred to by any portable code at the
+    top level.
+-   Tests should be at the parent level, with no platform-specific code.
+-   No platform-specific macros or #ifdef’s should be used in any portable code.
+
+The implementation of these rules is handled inside the Makefile, with a
+[`specialize` function](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/tools/make/helper_functions.inc#L42)
+that takes a list of reference source file paths as an input, and returns the
+equivalent list with specialized versions of those files swapped in if they
+exist.
+
+### Working with Generated Projects
+
+So far, I've recommended that you use the standalone generated projects for your
+system. You might be wondering why you're not just checking out the full
+[TensorFlow codebase from GitHub](https://github.com/tensorflow/tensorflow/)?
+The main reason is that there is a lot more diversity of architectures, IDEs,
+support libraries, and operating systems in the embedded world. Many of the
+toolchains require their own copy of source files, or a list of sources to be
+written to a project file. When a developer working on TensorFlow adds a new
+source file or changes its location, we can't expect her to update multiple
+different project files, many of which she may not have the right software to
+verify the change was correct. That means we have to rely on a central listing
+of source files (which in our case is held in the makefile), and then call a
+tool to generate other project files from those. We could ask embedded
+developers to do this process themselves after downloading the main source, but
+running the makefile requires a Linux system which may not be available, takes
+time, and involves downloading a lot of dependencies. That is why we've opted to
+make regular snapshots of the results of generating these projects for popular
+IDEs and platforms, so that embedded developers have a fast and friendly way to
+start using TensorFlow Lite for Microcontrollers.
+
+This does have the disadvantage that you're no longer working directly on the
+main repository, instead you have a copy that's outside of source control. We've
+tried to make the copy as similar to the main repo as possible, for example by
+keeping the paths of all source files the same, and ensuring that there are no
+changes between the copied files and the originals, but it still makes it
+tougher to sync as the main repository is updated. There are also multiple
+copies of the source tree, one for each target, so any change you make to one
+copy has to be manually propagated across all the other projects you care about.
+This doesn't matter so much if you're just using the projects as they are to
+build products, but if you want to support a new platform and have the changes
+reflected in the main code base, you'll have to do some extra work.
+
+As an example, think about the `DebugLog()` implementation we discussed adding
+for a new platform earlier. At this point, you have a new version of
+`debug_log.cc` that does what's required, but how can you share that with the
+wider community? The first step is to pick a tag name for your platform. This
+can either be the operating system (for example 'mbed'), the name of a device
+('bluepill'), or some other text that describes it. This should be a short
+string with no spaces or special characters. Log in or create an account on
+GitHub, fork the full
+[TensorFlow codebase](https://github.com/tensorflow/tensorflow/) using the
+'Fork' button on the top left, and then grab your fork by using a command like
+`git clone https://github.com/<your user name>/tensorflow`.
+
+You'll either need Linux, MacOS, or Windows with something like CygWin installed
+to run the next steps, since they involve building a makefile. Run the following
+commands from a terminal, inside the root of the source folder:
+
+```
+tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh
+make -f tensorflow/lite/experimental/micro/tools/make/Makefile generate_projects
+```
+
+This will take a few minutes, since it has to download some large toolchains for
+the dependencies. Once it has finished, you should see some folders created
+inside a path like
+`tensorflow/lite/experimental/micro/tools/make/gen/linux_x86_64/prj/`. The exact
+path depends on your host operating system, but you should be able to figure it
+out from all the copy commands. These folders contain the generated project and
+source files, with
+`tensorflow/lite/experimental/micro/tools/make/gen/linux_x86_64/prj/keil`
+containing the Keil uVision targets,
+`tensorflow/lite/experimental/micro/tools/make/gen/linux_x86_64/prj/mbed` with
+the Mbed versions, and so on.
+
+If you've got this far, you've successfully set up the project generation flow.
+Now you need to add your specialized implementation of `DebugLog()`. Start by
+creating a folder inside `tensorflow/lite/experimental/micro/` named after the
+tag you picked earlier. Put your `debug_log.cc` file inside this folder, and
+then run this command, with '<your tag>' replaced by the actual folder name:
+
+```
+make -f tensorflow/lite/experimental/micro/tools/make/Makefile TAGS="<your tag>" generate_projects
+```
+
+If your tag name actually refers to a whole target architecture, then you'll use
+TARGET or TARGET_ARCH instead. For example, here's how a simple RISC-V set of
+projects is generated:
+
+```
+make -f tensorflow/lite/experimental/micro/tools/make/Makefile TARGET="riscv32_mcu" generate_projects
+```
+
+The way it works is the same as TAGS though, it just looks for specialized
+implementations with the same containing folder name.
+
+If you look inside the projects that have been created, you should see that the
+default `DebugLog()` implementation is no longer present at
+`tensorflow/lite/experimental/micro/debug_log.cc`, and instead
+`tensorflow/lite/experimental/micro/<your tag>/debug_log.cc` is being used. Copy
+over the generated project files and try building them in your own IDE. If
+everything works, then you're ready to submit your change.
+
+To do this, run something like:
+
+```
+git add tensorflow/lite/experimental/micro/<your tag>/debug_log.cc
+git commit -a -m "Added DebugLog() support for <your platform>"
+git push origin master
+```
+
+Then go back to https://github.com/<your account>/tensorflow, and choose "New
+Pull Request" near the top. You should then be able to go through the standard
+TensorFlow PR process to get your change added to the main repository, and
+available to the rest of the community!
+
+### Supporting a Platform with Makefiles
+
+The changes you've made so far will enable other developers using the generated
+projects to use your platform, but TensorFlow's continuous integration process
+uses makefiles to build frequently and ensure changes haven't broken the build
+process for different systems. If you are able to convert your build procedure
+into something that can be expressed by a makefile, then we can integrate your
+platform into our CI builds and make sure it continues to work.
+
+Fully describing how to do this is beyond the scope of this documentation, but
+the biggest needs are:
+
+-   A command-line compiler that can be called for every source file.
+-   A list of the arguments to pass into the compiler to build and link all
+    files.
+-   The correct linker map files and startup assembler to ensure `main()` gets
+    called.
+
+### Supporting a Platform with Emulation Testing
+
+Integrating your platform into the makefile process should help us make sure
+that it continues to build, but it doesn't guarantee that the results of the
+build process will run correctly. Running tests is something we require to be
+able to say that TensorFlow officially supports a platform, since otherwise we
+can't guarantee that users will have a good experience when they try using it.
+Since physically maintaining a full set of all supported hardware devices isn't
+feasible, we rely on software emulation to run these tests. A good example is
+our
+[STM32F4 'Bluepill' support](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/testing/test_bluepill_binary.sh),
+which uses [Docker](https://www.docker.com/) and [Renode](https://renode.io/) to
+run built binaries in an emulator. You can use whatever technologies you want,
+the only requirements are that they capture the debug log output of the tests
+being run in the emulator, and parse them for the string that indicates the test
+was successful. These scripts need to run on Ubuntu 18.04, in a bash
+environment, though Docker is available if you need to install extra software or
+have other dependencies.
+
+### Implementing More Optimizations
+
+Clearly, getting debug logging support is only the beginning of the work you'll
+need to do on a particular platform. It's very likely that you'll want to
+optimize the core deep learning operations that take up the most time when
+running models you care about. The good news is that the process for providing
+optimized implementations is the same as the one you just went through to
+provide your own logging. You'll need to identify parts of the code that are
+bottlenecks, and then add specialized implementations in their own folders.
+These don't need to be platform specific, they can also be broken out by which
+library they rely on for example. [Here's where we do that for the CMSIS
+implementation of integer fast-fourier
+transforms](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/preprocessor.cc).
+This more complex case shows that you can also add helper source files alongside
+the main implementation, as long as you
+[mention them in the platform-specific makefile](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/Makefile.inc).
+You can also do things like update the list of libraries that need to be linked
+in, or add include paths to required headers.
diff --git a/tensorflow/lite/experimental/micro/bluepill/debug_log.cc b/tensorflow/lite/experimental/micro/bluepill/debug_log.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4812a918498ee2ab52e114bce9ca0cf3919b2254
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/bluepill/debug_log.cc
@@ -0,0 +1,27 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/debug_log.h"
+
+// For Arm Cortex-M devices, calling SYS_WRITE0 will output the zero-terminated
+// string pointed to by R1 to any debug console that's attached to the system.
+extern "C" void DebugLog(const char* s) {
+  asm("mov r0, #0x04\n"  // SYS_WRITE0
+      "mov r1, %[str]\n"
+      "bkpt #0xAB\n"
+      :
+      : [ str ] "r"(s)
+      : "r0", "r1");
+}
diff --git a/tensorflow/lite/experimental/micro/debug_log.cc b/tensorflow/lite/experimental/micro/debug_log.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3d4ca44d76b73020848e9757c230d7bf69ff5aaa
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/debug_log.cc
@@ -0,0 +1,41 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Reference implementation of the DebugLog() function that's required for a
+// platform to support the TensorFlow Lite for Microcontrollers library. This is
+// the only function that's absolutely required to be available on a target
+// device, since it's used for communicating test results back to the host so
+// that we can verify the implementation is working correctly.
+// It's designed to be as easy as possible to supply an implementation though.
+// On platforms that have a POSIX stack or C library, it can be written as a
+// single call to `fprintf(stderr, "%s", s)` to output a string to the error
+// stream of the console, but if there's no OS or C library available, there's
+// almost always an equivalent way to write out a string to some serial
+// interface that can be used instead. For example on Arm M-series MCUs, calling
+// the `bkpt #0xAB` assembler instruction will output the string in r1 to
+// whatever debug serial connection is available. If you're running mbed, you
+// can do the same by creating `Serial pc(USBTX, USBRX)` and then calling
+// `pc.printf("%s", s)`.
+// To add an equivalent function for your own platform, create your own
+// implementation file, and place it in a subfolder with named after the OS
+// you're targeting. For example, see the Cortex M bare metal version in
+// tensorflow/lite/experimental/micro/bluepill/debug_log.cc or the mbed one on
+// tensorflow/lite/experimental/micro/mbed/debug_log.cc.
+
+#include "tensorflow/lite/experimental/micro/debug_log.h"
+
+#include <cstdio>
+
+extern "C" void DebugLog(const char* s) { fprintf(stderr, "%s", s); }
diff --git a/tensorflow/lite/experimental/micro/debug_log.h b/tensorflow/lite/experimental/micro/debug_log.h
new file mode 100644
index 0000000000000000000000000000000000000000..c0e395c3760e2e0c57b50c38c05737dfecb7e680
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/debug_log.h
@@ -0,0 +1,23 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_DEBUG_LOG_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_DEBUG_LOG_H_
+
+// This function should be implemented by each target platform, and provide a
+// way for strings to be output to some text stream. For more information, see
+// tensorflow/lite/experimental/micro/debug_log.cc.
+extern "C" void DebugLog(const char* s);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_DEBUG_LOG_H_
diff --git a/tensorflow/lite/experimental/micro/debug_log_numbers.cc b/tensorflow/lite/experimental/micro/debug_log_numbers.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8e86730674859d5560e5ec6b243e40c95f88bf4f
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/debug_log_numbers.cc
@@ -0,0 +1,185 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Implements debug logging for numbers by converting them into strings and then
+// calling the main DebugLog(char*) function. These are separated into a
+// different file so that platforms can just implement the string output version
+// of DebugLog() and then get the numerical variations without requiring any
+// more code.
+
+#include "tensorflow/lite/experimental/micro/debug_log_numbers.h"
+
+#include "tensorflow/lite/experimental/micro/debug_log.h"
+
+namespace {
+
+// All input buffers to the number conversion functions must be this long.
+static const int kFastToBufferSize = 48;
+
+// Reverses a zero-terminated string in-place.
+char* ReverseStringInPlace(char* start, char* end) {
+  char* p1 = start;
+  char* p2 = end - 1;
+  while (p1 < p2) {
+    char tmp = *p1;
+    *p1++ = *p2;
+    *p2-- = tmp;
+  }
+  return start;
+}
+
+// Appends a string to a string, in-place. You need to pass in the maximum
+// string length as the second argument.
+char* StrCatStr(char* main, int main_max_length, const char* to_append) {
+  char* current = main;
+  while (*current != 0) {
+    ++current;
+  }
+  char* current_end = main + (main_max_length - 1);
+  while ((*to_append != 0) && (current < current_end)) {
+    *current = *to_append;
+    ++current;
+    ++to_append;
+  }
+  *current = 0;
+  return current;
+}
+
+// Populates the provided buffer with an ASCII representation of the number.
+char* FastUInt32ToBufferLeft(uint32_t i, char* buffer, int base) {
+  char* start = buffer;
+  do {
+    int32_t digit = i % base;
+    char character;
+    if (digit < 10) {
+      character = '0' + digit;
+    } else {
+      character = 'a' + (digit - 10);
+    }
+    *buffer++ = character;
+    i /= base;
+  } while (i > 0);
+  *buffer = 0;
+  ReverseStringInPlace(start, buffer);
+  return buffer;
+}
+
+// Populates the provided buffer with an ASCII representation of the number.
+char* FastInt32ToBufferLeft(int32_t i, char* buffer) {
+  uint32_t u = i;
+  if (i < 0) {
+    *buffer++ = '-';
+    u = -u;
+  }
+  return FastUInt32ToBufferLeft(u, buffer, 10);
+}
+
+// Converts a number to a string and appends it to another.
+char* StrCatInt32(char* main, int main_max_length, int32_t number) {
+  char number_string[kFastToBufferSize];
+  FastInt32ToBufferLeft(number, number_string);
+  return StrCatStr(main, main_max_length, number_string);
+}
+
+// Converts a number to a string and appends it to another.
+char* StrCatUInt32(char* main, int main_max_length, uint32_t number, int base) {
+  char number_string[kFastToBufferSize];
+  FastUInt32ToBufferLeft(number, number_string, base);
+  return StrCatStr(main, main_max_length, number_string);
+}
+
+// Populates the provided buffer with ASCII representation of the float number.
+// Avoids the use of any floating point instructions (since these aren't
+// supported on many microcontrollers) and as a consequence prints values with
+// power-of-two exponents.
+char* FastFloatToBufferLeft(float f, char* buffer) {
+  char* current = buffer;
+  char* current_end = buffer + (kFastToBufferSize - 1);
+  // Access the bit fields of the floating point value to avoid requiring any
+  // float instructions. These constants are derived from IEEE 754.
+  const uint32_t sign_mask = 0x80000000;
+  const uint32_t exponent_mask = 0x7f800000;
+  const int32_t exponent_shift = 23;
+  const int32_t exponent_bias = 127;
+  const uint32_t fraction_mask = 0x007fffff;
+  const uint32_t u = *reinterpret_cast<uint32_t*>(&f);
+  const int32_t exponent =
+      ((u & exponent_mask) >> exponent_shift) - exponent_bias;
+  const uint32_t fraction = (u & fraction_mask);
+  // Expect ~0x2B1B9D3 for fraction.
+  if (u & sign_mask) {
+    *current = '-';
+    current += 1;
+  }
+  *current = 0;
+  // These are special cases for infinities and not-a-numbers.
+  if (exponent == 128) {
+    if (fraction == 0) {
+      current = StrCatStr(current, (current_end - current), "Inf");
+      return current;
+    } else {
+      current = StrCatStr(current, (current_end - current), "NaN");
+      return current;
+    }
+  }
+  // 0x007fffff (8388607) represents 0.99... for the fraction, so to print the
+  // correct decimal digits we need to scale our value before passing it to the
+  // conversion function. This scale should be 10000000/8388608 = 1.1920928955.
+  // We can approximate this using multiply-adds and right-shifts using the
+  // values in this array. The 1. portion of the number string is printed out
+  // in a fixed way before the fraction, below.
+  const int32_t scale_shifts_size = 13;
+  const int8_t scale_shifts[13] = {3,  4,  8,  11, 13, 14, 17,
+                                   18, 19, 20, 21, 22, 23};
+  uint32_t scaled_fraction = fraction;
+  for (int i = 0; i < scale_shifts_size; ++i) {
+    scaled_fraction += (fraction >> scale_shifts[i]);
+  }
+  *current = '1';
+  current += 1;
+  *current = '.';
+  current += 1;
+  *current = 0;
+  current = StrCatUInt32(current, (current_end - current), scaled_fraction, 10);
+  current = StrCatStr(current, (current_end - current), "*2^");
+  current = StrCatInt32(current, (current_end - current), exponent);
+  return current;
+}
+
+}  // namespace
+
+extern "C" void DebugLogInt32(int32_t i) {
+  char number_string[kFastToBufferSize];
+  FastInt32ToBufferLeft(i, number_string);
+  DebugLog(number_string);
+}
+
+extern "C" void DebugLogUInt32(uint32_t i) {
+  char number_string[kFastToBufferSize];
+  FastUInt32ToBufferLeft(i, number_string, 10);
+  DebugLog(number_string);
+}
+
+extern "C" void DebugLogHex(uint32_t i) {
+  char number_string[kFastToBufferSize];
+  FastUInt32ToBufferLeft(i, number_string, 16);
+  DebugLog(number_string);
+}
+
+extern "C" void DebugLogFloat(float i) {
+  char number_string[kFastToBufferSize];
+  FastFloatToBufferLeft(i, number_string);
+  DebugLog(number_string);
+}
diff --git a/tensorflow/lite/experimental/micro/debug_log_numbers.h b/tensorflow/lite/experimental/micro/debug_log_numbers.h
new file mode 100644
index 0000000000000000000000000000000000000000..d889e751730495e2d1bf6232e7b9c2cbb76c9667
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/debug_log_numbers.h
@@ -0,0 +1,28 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_DEBUG_LOG_NUMBERS_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_DEBUG_LOG_NUMBERS_H_
+
+#include <cstdint>
+
+// Output numbers to the debug logging stream.
+extern "C" {
+void DebugLogInt32(int32_t i);
+void DebugLogUInt32(uint32_t i);
+void DebugLogHex(uint32_t i);
+void DebugLogFloat(float i);
+}
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_DEBUG_LOG_NUMBERS_H_
diff --git a/tensorflow/lite/experimental/micro/ecm3531/debug_log.cc b/tensorflow/lite/experimental/micro/ecm3531/debug_log.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4d961963969039c75232b91bba12b54870225605
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/ecm3531/debug_log.cc
@@ -0,0 +1,20 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/debug_log.h"
+
+#include "eta_csp_io.h"
+
+extern "C" void DebugLog(const char* s) { EtaCspIoPrintf("%s", s); }
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/.gitignore b/tensorflow/lite/experimental/micro/examples/micro_speech/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..d8dd7532abcc65af52e9db03c516274e3d674dc1
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/.gitignore
@@ -0,0 +1 @@
+*.wav
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/BUILD b/tensorflow/lite/experimental/micro/examples/micro_speech/BUILD
index 799b2e5a5dd097c6e017f574449d339992f7c41b..29d40e702d96b2eb6d4a85c2841f4d2829b8d764 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/BUILD
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/BUILD
@@ -11,34 +11,34 @@ load(
 )
 
 cc_library(
-    name = "model_settings",
+    name = "simple_model_settings",
     srcs = [
-        "model_settings.cc",
+        "simple_features/simple_model_settings.cc",
     ],
     hdrs = [
-        "model_settings.h",
+        "simple_features/simple_model_settings.h",
     ],
 )
 
 cc_library(
-    name = "tiny_conv_model_data",
+    name = "tiny_conv_simple_features_model_data",
     srcs = [
-        "tiny_conv_model_data.cc",
+        "simple_features/tiny_conv_simple_features_model_data.cc",
     ],
     hdrs = [
-        "tiny_conv_model_data.h",
+        "simple_features/tiny_conv_simple_features_model_data.h",
     ],
 )
 
 cc_library(
-    name = "features_test_data",
+    name = "simple_features_test_data",
     srcs = [
-        "no_features_data.cc",
-        "yes_features_data.cc",
+        "simple_features/no_simple_features_data.cc",
+        "simple_features/yes_simple_features_data.cc",
     ],
     hdrs = [
-        "no_features_data.h",
-        "yes_features_data.h",
+        "simple_features/no_simple_features_data.h",
+        "simple_features/yes_simple_features_data.h",
     ],
 )
 
@@ -48,10 +48,10 @@ tflite_micro_cc_test(
         "micro_speech_test.cc",
     ],
     deps = [
-        ":features_test_data",
-        ":tiny_conv_model_data",
         "//tensorflow/lite:schema_fbs_version",
         "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_features_test_data",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:tiny_conv_micro_features_model_data",
         "//tensorflow/lite/experimental/micro/kernels:all_ops_resolver",
         "//tensorflow/lite/experimental/micro/kernels:micro_ops",
         "//tensorflow/lite/experimental/micro/testing:micro_test",
@@ -60,45 +60,66 @@ tflite_micro_cc_test(
 )
 
 cc_library(
-    name = "preprocessor_test_data",
+    name = "audio_sample_test_data",
     srcs = [
         "no_30ms_sample_data.cc",
-        "no_power_spectrum_data.cc",
         "yes_30ms_sample_data.cc",
-        "yes_power_spectrum_data.cc",
     ],
     hdrs = [
         "no_30ms_sample_data.h",
-        "no_power_spectrum_data.h",
         "yes_30ms_sample_data.h",
-        "yes_power_spectrum_data.h",
     ],
 )
 
 cc_library(
-    name = "preprocessor_reference",
+    name = "audio_large_sample_test_data",
     srcs = [
-        "preprocessor.cc",
+        "no_1000ms_sample_data.cc",
+        "yes_1000ms_sample_data.cc",
     ],
     hdrs = [
-        "preprocessor.h",
+        "no_1000ms_sample_data.h",
+        "yes_1000ms_sample_data.h",
+    ],
+)
+
+cc_library(
+    name = "simple_features_generator_test_data",
+    srcs = [
+        "simple_features/no_power_spectrum_data.cc",
+        "simple_features/yes_power_spectrum_data.cc",
+    ],
+    hdrs = [
+        "simple_features/no_power_spectrum_data.h",
+        "simple_features/yes_power_spectrum_data.h",
+    ],
+)
+
+cc_library(
+    name = "simple_features_generator_reference",
+    srcs = [
+        "simple_features/simple_features_generator.cc",
+    ],
+    hdrs = [
+        "simple_features/simple_features_generator.h",
     ],
     deps = [
-        ":model_settings",
+        ":simple_model_settings",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
     ],
 )
 
 tflite_micro_cc_test(
-    name = "preprocessor_reference_test",
+    name = "simple_features_generator_reference_test",
     srcs = [
-        "preprocessor_test.cc",
+        "simple_features/simple_features_generator_test.cc",
     ],
     deps = [
-        ":model_settings",
-        ":preprocessor_reference",
-        ":preprocessor_test_data",
+        ":audio_sample_test_data",
+        ":simple_features_generator_reference",
+        ":simple_features_generator_test_data",
+        ":simple_model_settings",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
         "//tensorflow/lite/experimental/micro/testing:micro_test",
@@ -106,29 +127,30 @@ tflite_micro_cc_test(
 )
 
 cc_library(
-    name = "preprocessor_fixed",
+    name = "simple_features_generator_fixed",
     srcs = [
-        "fixed_point/preprocessor.cc",
+        "simple_features/fixed_point/simple_features_generator.cc",
     ],
     hdrs = [
-        "preprocessor.h",
+        "simple_features/simple_features_generator.h",
     ],
     deps = [
-        ":model_settings",
+        ":simple_model_settings",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
     ],
 )
 
 tflite_micro_cc_test(
-    name = "preprocessor_fixed_test",
+    name = "simple_features_generator_fixed_test",
     srcs = [
-        "preprocessor_test.cc",
+        "simple_features/simple_features_generator_test.cc",
     ],
     deps = [
-        ":model_settings",
-        ":preprocessor_fixed",
-        ":preprocessor_test_data",
+        ":audio_sample_test_data",
+        ":simple_features_generator_fixed",
+        ":simple_features_generator_test_data",
+        ":simple_model_settings",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
         "//tensorflow/lite/experimental/micro/testing:micro_test",
@@ -144,9 +166,25 @@ cc_library(
         "audio_provider.h",
     ],
     deps = [
-        ":model_settings",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_model_settings",
+    ],
+)
+
+cc_library(
+    name = "audio_provider_mock",
+    srcs = [
+        "audio_provider_mock.cc",
+    ],
+    hdrs = [
+        "audio_provider.h",
+    ],
+    deps = [
+        ":audio_large_sample_test_data",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_model_settings",
     ],
 )
 
@@ -157,9 +195,24 @@ tflite_micro_cc_test(
     ],
     deps = [
         ":audio_provider",
-        ":model_settings",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_model_settings",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "audio_provider_mock_test",
+    srcs = [
+        "audio_provider_mock_test.cc",
+    ],
+    deps = [
+        ":audio_large_sample_test_data",
+        ":audio_provider_mock",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_model_settings",
         "//tensorflow/lite/experimental/micro/testing:micro_test",
     ],
 )
@@ -174,11 +227,10 @@ cc_library(
     ],
     deps = [
         ":audio_provider",
-        ":model_settings",
-        ":preprocessor_reference",
-        ":timer",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_features_generator",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_model_settings",
     ],
 )
 
@@ -190,31 +242,70 @@ tflite_micro_cc_test(
     deps = [
         ":audio_provider",
         ":feature_provider",
-        ":model_settings",
-        ":timer",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_model_settings",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+cc_library(
+    name = "feature_provider_mock",
+    srcs = [
+        "feature_provider.cc",
+    ],
+    hdrs = [
+        "feature_provider.h",
+    ],
+    deps = [
+        ":audio_provider_mock",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_features_generator",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_model_settings",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "feature_provider_mock_test",
+    srcs = [
+        "feature_provider_mock_test.cc",
+    ],
+    deps = [
+        ":feature_provider_mock",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_features_test_data",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_model_settings",
         "//tensorflow/lite/experimental/micro/testing:micro_test",
     ],
 )
 
 cc_library(
-    name = "timer",
+    name = "recognize_commands",
     srcs = [
-        "timer.cc",
+        "recognize_commands.cc",
     ],
     hdrs = [
-        "timer.h",
+        "recognize_commands.h",
+    ],
+    deps = [
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_model_settings",
     ],
 )
 
 tflite_micro_cc_test(
-    name = "timer_test",
+    name = "recognize_commands_test",
     srcs = [
-        "timer_test.cc",
+        "recognize_commands_test.cc",
+    ],
+    tags = [
+        "no_oss",  # TODO(122853023): Resolve issues and re-enable.
     ],
     deps = [
-        ":timer",
+        ":recognize_commands",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
         "//tensorflow/lite/experimental/micro/testing:micro_test",
@@ -229,15 +320,30 @@ cc_binary(
     deps = [
         ":audio_provider",
         ":feature_provider",
-        ":features_test_data",
-        ":model_settings",
-        ":preprocessor_reference",
-        ":timer",
-        ":tiny_conv_model_data",
+        ":recognize_commands",
         "//tensorflow/lite:schema_fbs_version",
         "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_model_settings",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:tiny_conv_micro_features_model_data",
+        "//tensorflow/lite/experimental/micro/kernels:all_ops_resolver",
+        "//tensorflow/lite/schema:schema_fbs",
+    ],
+)
+
+cc_binary(
+    name = "micro_speech_mock",
+    srcs = [
+        "main.cc",
+    ],
+    deps = [
+        ":audio_provider_mock",
+        ":feature_provider",
+        ":recognize_commands",
+        "//tensorflow/lite:schema_fbs_version",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:micro_model_settings",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech/micro_features:tiny_conv_micro_features_model_data",
         "//tensorflow/lite/experimental/micro/kernels:all_ops_resolver",
-        "//tensorflow/lite/experimental/micro/kernels:micro_ops",
         "//tensorflow/lite/schema:schema_fbs",
     ],
 )
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/Makefile.inc b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/Makefile.inc
new file mode 100644
index 0000000000000000000000000000000000000000..73b884f29de30bbfbdc8487f620f6990a53ef92c
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/Makefile.inc
@@ -0,0 +1,56 @@
+# Settings for targets that use the CMSIS library.
+ifneq ($(filter CMSIS,$(ALL_TAGS)),)
+  INCLUDES += \
+    -isystem$(MAKEFILE_DIR)/downloads/cmsis/CMSIS/Core/Include/ \
+    -isystem$(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Include/ \
+    -I$(MAKEFILE_DIR)/downloads/CMSIS_ext/
+
+  GENERATED_PROJECT_INCLUDES += \
+    -isystemthird_party/cmsis/CMSIS/Core/Include/ \
+    -isystemthird_party/cmsis/CMSIS/DSP/Include/ \
+    -Ithird_party/CMSIS_ext/
+
+  CMSIS_PREPROCESSOR_SRCS := \
+    tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/hanning.cc \
+    tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/sin_1k.cc \
+
+  CMSIS_PREPROCESSOR_HDRS := \
+    tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/hanning.h \
+    tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/sin_1k.h \
+    third_party/CMSIS_ext/README.md \
+    third_party/CMSIS_ext/arm_cmplx_mag_squared_q10p6.h
+
+  PREPROCESSOR_TEST_SRCS += $(CMSIS_PREPROCESSOR_SRCS)
+  PREPROCESSOR_TEST_HDRS += $(CMSIS_PREPROCESSOR_HDRS)
+
+  FEATURE_PROVIDER_TEST_SRCS += $(CMSIS_PREPROCESSOR_SRCS)
+  FEATURE_PROVIDER_TEST_HDRS += $(CMSIS_PREPROCESSOR_HDRS)
+
+  MICRO_SPEECH_SRCS += $(CMSIS_PREPROCESSOR_SRCS)
+  MICRO_SPEECH_HDRS += $(CMSIS_PREPROCESSOR_HDRS)
+
+  THIRD_PARTY_CC_SRCS += \
+    $(MAKEFILE_DIR)/downloads/CMSIS_ext/arm_cmplx_mag_squared_q10p6.c \
+    $(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Source/BasicMathFunctions/arm_mult_q15.c \
+    $(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal.c \
+    $(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_q15.c \
+    $(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Source/TransformFunctions/arm_rfft_q15.c \
+    $(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Source/TransformFunctions/arm_cfft_q15.c \
+    $(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_q15.c \
+    $(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal2.S \
+    $(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Source/CommonTables/arm_const_structs.c \
+    $(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Source/CommonTables/arm_common_tables.c \
+    $(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Source/StatisticsFunctions/arm_mean_q15.c \
+    $(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Source/StatisticsFunctions/arm_max_q7.c
+
+  THIRD_PARTY_CC_HDRS += \
+    third_party/cmsis/LICENSE.txt \
+    third_party/cmsis/CMSIS/Core/Include/cmsis_compiler.h \
+    third_party/cmsis/CMSIS/Core/Include/cmsis_gcc.h \
+    third_party/cmsis/CMSIS/Core/Include/cmsis_version.h \
+    third_party/cmsis/CMSIS/Core/Include/core_cm3.h \
+    third_party/cmsis/CMSIS/DSP/Include/arm_common_tables.h \
+    third_party/cmsis/CMSIS/DSP/Include/arm_const_structs.h \
+    third_party/cmsis/CMSIS/DSP/Include/arm_math.h
+
+endif
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/README.md b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..65aec34a1f7991fad33a61a12eddd414577c666d
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/README.md
@@ -0,0 +1,23 @@
+# Description of files
+
+*   **create_constants.py**: Python file used to create hanning.cc, hanning.h,
+    sin_1k.cc, and sin_1k.h
+*   **hanning.cc**: Precomputed
+    [Hann window](https://en.wikipedia.org/wiki/Hann_function) for use in the
+    preprocessor. This file is created in ../create_constants.py
+*   **hanning.h**: Header file fro hanning.cc
+*   **preprocessor.cc**: CMSIS version of the preprocessor
+*   **sin_1k.cc**: A 1 kHZ sinusoid used for comparing the CMSIS preprocessor
+    with the Micro-Lite fixed_point preprocessor
+*   **sin_1k.h**: Header file for sin_1k.cc
+
+# Description of externally downloaded files in ../CMSIS_ext
+
+*   **arm_cmplx_mag_squared_q10p6.c**: Modified version of the ARM CMSIS
+    function
+    [arm_cmplx_mag_squared.c](http://arm-software.github.io/CMSIS_5/DSP/html/group__cmplx__mag__squared.html#ga45537f576102d960d467eb722b8431f2).
+    The modification is that we have changed the amount of right-shift to make
+    sure our data is in the correct range. We redistribute because the original
+    content was created with the Apache 2.0 license.
+*   **arm_cmplx_mag_squared_q10p6.h**: Header file for
+    arm_cmplx_mag_squared_q10p6.c
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/create_constants.py b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/create_constants.py
new file mode 100755
index 0000000000000000000000000000000000000000..daf7e3cde89a0380cbbcae6ddc88859c8e87ffb9
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/create_constants.py
@@ -0,0 +1,75 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Outputs tables used for fast calculations at runtime."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# import soundfile as sf
+import numpy as np
+
+
+def to_cc(x, varname, directory='', scale_factor=1):
+  """Writes table values to a C++ source file."""
+  x = (x / np.max(np.abs(x))) * 32768 * scale_factor
+  x[x > 32767] = 32767
+  x[x < -32768] = -32768
+  x = x.astype(int)
+  x = [str(v) if i % 10 != 0 else '\n    ' + str(v) for i, v in enumerate(x)]
+
+  cmsis_path = 'tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS'
+  xstr = '#include "{}/{}.h"\n\n'.format(cmsis_path, varname)
+  xstr += 'const int g_{}_size = {};\n'.format(varname, len(x))
+  xstr += 'const int16_t g_{}[{}] = {{{}}};\n'.format(varname, len(x),
+                                                      ', '.join(x))
+
+  with open(directory + varname + '.cc', 'w') as f:
+    f.write(xstr)
+
+
+def to_h(_, varname, directory=''):
+  """Writes a header file for the table values."""
+  tf_prepend = 'TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_'
+  xstr = '#ifndef {}{}_H_\n'.format(tf_prepend, varname.upper())
+  xstr += '#define {}{}_H_\n\n'.format(tf_prepend, varname.upper())
+  xstr += '#include <cstdint>\n\n'
+  xstr += 'extern const int g_{}_size;\n'.format(varname)
+  xstr += 'extern const int16_t g_{}[];\n\n'.format(varname)
+  xstr += '#endif'
+
+  with open(directory + varname + '.h', 'w') as f:
+    f.write(xstr)
+
+
+# x = sf.read('yes_f2e59fea_nohash_1.wav')[0]
+# to_cc(x, 'yes_waveform')
+# to_h(x, 'yes_waveform')
+#
+# x = sf.read('no_f9643d42_nohash_4.wav')[0]
+# to_cc(x, 'no_waveform')
+# to_h(x, 'no_waveform')
+
+# 30ms of data @ 16 kHz = 480 samples
+hann = np.hanning(int(16000 * 0.03))  # Window 30ms of data
+to_cc(hann, 'hanning', directory='./')
+to_h(hann, 'hanning', directory='./')
+
+t = np.arange(16000. * 0.03) / 16000.
+sin1k = np.sin(
+    2 * np.pi * 1000 *
+    t)  # Factor of 10 because micro preprocessing overflows otherwise
+to_cc(sin1k, 'sin_1k', directory='./', scale_factor=0.1)
+to_h(sin1k, 'sin_1k', directory='./')
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/hanning.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/hanning.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e6a11ce52c6b41a9f6fcbfc5a31bf7e0da8361cf
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/hanning.cc
@@ -0,0 +1,63 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/hanning.h"
+
+const int g_hanning_size = 480;
+const int16_t g_hanning[480] = {
+    0,     1,     5,     12,    22,    35,    50,    69,    90,    114,   140,
+    170,   202,   237,   275,   316,   359,   405,   454,   506,   560,   617,
+    677,   740,   805,   873,   943,   1016,  1092,  1171,  1252,  1336,  1422,
+    1511,  1602,  1696,  1793,  1892,  1993,  2097,  2204,  2312,  2424,  2537,
+    2653,  2772,  2893,  3016,  3141,  3269,  3399,  3531,  3665,  3802,  3941,
+    4082,  4225,  4370,  4517,  4666,  4817,  4971,  5126,  5283,  5442,  5603,
+    5765,  5930,  6096,  6265,  6435,  6606,  6779,  6954,  7131,  7309,  7489,
+    7670,  7853,  8037,  8223,  8410,  8598,  8788,  8979,  9171,  9365,  9560,
+    9756,  9953,  10151, 10350, 10551, 10752, 10954, 11157, 11362, 11567, 11772,
+    11979, 12186, 12395, 12603, 12813, 13023, 13233, 13445, 13656, 13868, 14081,
+    14294, 14507, 14721, 14935, 15149, 15363, 15578, 15793, 16008, 16222, 16437,
+    16652, 16867, 17082, 17297, 17511, 17725, 17939, 18153, 18367, 18580, 18793,
+    19005, 19217, 19428, 19639, 19850, 20059, 20269, 20477, 20685, 20892, 21098,
+    21303, 21508, 21712, 21914, 22116, 22317, 22517, 22716, 22913, 23110, 23305,
+    23499, 23692, 23884, 24075, 24264, 24451, 24638, 24823, 25006, 25188, 25369,
+    25548, 25725, 25901, 26075, 26247, 26418, 26587, 26754, 26920, 27083, 27245,
+    27405, 27563, 27719, 27874, 28026, 28176, 28324, 28470, 28614, 28756, 28896,
+    29034, 29169, 29303, 29434, 29563, 29689, 29813, 29935, 30055, 30172, 30287,
+    30400, 30510, 30617, 30723, 30825, 30926, 31023, 31119, 31211, 31301, 31389,
+    31474, 31556, 31636, 31713, 31788, 31860, 31929, 31996, 32059, 32121, 32179,
+    32235, 32288, 32338, 32386, 32430, 32472, 32512, 32548, 32582, 32613, 32641,
+    32666, 32689, 32708, 32725, 32739, 32751, 32759, 32765, 32767, 32767, 32765,
+    32759, 32751, 32739, 32725, 32708, 32689, 32666, 32641, 32613, 32582, 32548,
+    32512, 32472, 32430, 32386, 32338, 32288, 32235, 32179, 32121, 32059, 31996,
+    31929, 31860, 31788, 31713, 31636, 31556, 31474, 31389, 31301, 31211, 31119,
+    31023, 30926, 30825, 30723, 30617, 30510, 30400, 30287, 30172, 30055, 29935,
+    29813, 29689, 29563, 29434, 29303, 29169, 29034, 28896, 28756, 28614, 28470,
+    28324, 28176, 28026, 27874, 27719, 27563, 27405, 27245, 27083, 26920, 26754,
+    26587, 26418, 26247, 26075, 25901, 25725, 25548, 25369, 25188, 25006, 24823,
+    24638, 24451, 24264, 24075, 23884, 23692, 23499, 23305, 23110, 22913, 22716,
+    22517, 22317, 22116, 21914, 21712, 21508, 21303, 21098, 20892, 20685, 20477,
+    20269, 20059, 19850, 19639, 19428, 19217, 19005, 18793, 18580, 18367, 18153,
+    17939, 17725, 17511, 17297, 17082, 16867, 16652, 16437, 16222, 16008, 15793,
+    15578, 15363, 15149, 14935, 14721, 14507, 14294, 14081, 13868, 13656, 13445,
+    13233, 13023, 12813, 12603, 12395, 12186, 11979, 11772, 11567, 11362, 11157,
+    10954, 10752, 10551, 10350, 10151, 9953,  9756,  9560,  9365,  9171,  8979,
+    8788,  8598,  8410,  8223,  8037,  7853,  7670,  7489,  7309,  7131,  6954,
+    6779,  6606,  6435,  6265,  6096,  5930,  5765,  5603,  5442,  5283,  5126,
+    4971,  4817,  4666,  4517,  4370,  4225,  4082,  3941,  3802,  3665,  3531,
+    3399,  3269,  3141,  3016,  2893,  2772,  2653,  2537,  2424,  2312,  2204,
+    2097,  1993,  1892,  1793,  1696,  1602,  1511,  1422,  1336,  1252,  1171,
+    1092,  1016,  943,   873,   805,   740,   677,   617,   560,   506,   454,
+    405,   359,   316,   275,   237,   202,   170,   140,   114,   90,    69,
+    50,    35,    22,    12,    5,     1,     0};
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.h b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/hanning.h
similarity index 73%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.h
rename to tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/hanning.h
index e2ee0c46cf13b00b310bd22b7ca1cb5a9751c6e6..e7d9c5c85866988469f96a444c503863bc2bef4c 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.h
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/hanning.h
@@ -13,11 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_NO_FEATURES_DATA_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_NO_FEATURES_DATA_H_
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_HANNING_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_HANNING_H_
 
-extern const int g_no_f9643d42_nohash_4_width;
-extern const int g_no_f9643d42_nohash_4_height;
-extern const unsigned char g_no_f9643d42_nohash_4_data[];
+#include <cstdint>
 
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_NO_FEATURES_DATA_H_
+extern const int g_hanning_size;
+extern const int16_t g_hanning[];
+
+#endif
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/sin_1k.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/sin_1k.cc
new file mode 100644
index 0000000000000000000000000000000000000000..45e9f798ef04cf40268cf379f24ecbfa904be9b5
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/sin_1k.cc
@@ -0,0 +1,63 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/sin_1k.h"
+
+const int g_sin_1k_size = 480;
+const int16_t g_sin_1k[480] = {
+    0,     1253,  2317,  3027,  3276,  3027,  2317,  1253,  0,     -1253, -2317,
+    -3027, -3276, -3027, -2317, -1253, 0,     1253,  2317,  3027,  3276,  3027,
+    2317,  1253,  0,     -1253, -2317, -3027, -3276, -3027, -2317, -1253, 0,
+    1253,  2317,  3027,  3276,  3027,  2317,  1253,  0,     -1253, -2317, -3027,
+    -3276, -3027, -2317, -1253, 0,     1253,  2317,  3027,  3276,  3027,  2317,
+    1253,  0,     -1253, -2317, -3027, -3276, -3027, -2317, -1253, 0,     1253,
+    2317,  3027,  3276,  3027,  2317,  1253,  0,     -1253, -2317, -3027, -3276,
+    -3027, -2317, -1253, 0,     1253,  2317,  3027,  3276,  3027,  2317,  1253,
+    0,     -1253, -2317, -3027, -3276, -3027, -2317, -1253, 0,     1253,  2317,
+    3027,  3276,  3027,  2317,  1253,  0,     -1253, -2317, -3027, -3276, -3027,
+    -2317, -1253, 0,     1253,  2317,  3027,  3276,  3027,  2317,  1253,  0,
+    -1253, -2317, -3027, -3276, -3027, -2317, -1253, 0,     1253,  2317,  3027,
+    3276,  3027,  2317,  1253,  0,     -1253, -2317, -3027, -3276, -3027, -2317,
+    -1253, 0,     1253,  2317,  3027,  3276,  3027,  2317,  1253,  0,     -1253,
+    -2317, -3027, -3276, -3027, -2317, -1253, 0,     1253,  2317,  3027,  3276,
+    3027,  2317,  1253,  0,     -1253, -2317, -3027, -3276, -3027, -2317, -1253,
+    0,     1253,  2317,  3027,  3276,  3027,  2317,  1253,  0,     -1253, -2317,
+    -3027, -3276, -3027, -2317, -1253, 0,     1253,  2317,  3027,  3276,  3027,
+    2317,  1253,  0,     -1253, -2317, -3027, -3276, -3027, -2317, -1253, 0,
+    1253,  2317,  3027,  3276,  3027,  2317,  1253,  0,     -1253, -2317, -3027,
+    -3276, -3027, -2317, -1253, 0,     1253,  2317,  3027,  3276,  3027,  2317,
+    1253,  0,     -1253, -2317, -3027, -3276, -3027, -2317, -1253, 0,     1253,
+    2317,  3027,  3276,  3027,  2317,  1253,  0,     -1253, -2317, -3027, -3276,
+    -3027, -2317, -1253, 0,     1253,  2317,  3027,  3276,  3027,  2317,  1253,
+    0,     -1253, -2317, -3027, -3276, -3027, -2317, -1253, 0,     1253,  2317,
+    3027,  3276,  3027,  2317,  1253,  0,     -1253, -2317, -3027, -3276, -3027,
+    -2317, -1253, 0,     1253,  2317,  3027,  3276,  3027,  2317,  1253,  0,
+    -1253, -2317, -3027, -3276, -3027, -2317, -1253, 0,     1253,  2317,  3027,
+    3276,  3027,  2317,  1253,  0,     -1253, -2317, -3027, -3276, -3027, -2317,
+    -1253, 0,     1253,  2317,  3027,  3276,  3027,  2317,  1253,  0,     -1253,
+    -2317, -3027, -3276, -3027, -2317, -1253, 0,     1253,  2317,  3027,  3276,
+    3027,  2317,  1253,  0,     -1253, -2317, -3027, -3276, -3027, -2317, -1253,
+    0,     1253,  2317,  3027,  3276,  3027,  2317,  1253,  0,     -1253, -2317,
+    -3027, -3276, -3027, -2317, -1253, 0,     1253,  2317,  3027,  3276,  3027,
+    2317,  1253,  0,     -1253, -2317, -3027, -3276, -3027, -2317, -1253, 0,
+    1253,  2317,  3027,  3276,  3027,  2317,  1253,  0,     -1253, -2317, -3027,
+    -3276, -3027, -2317, -1253, 0,     1253,  2317,  3027,  3276,  3027,  2317,
+    1253,  0,     -1253, -2317, -3027, -3276, -3027, -2317, -1253, 0,     1253,
+    2317,  3027,  3276,  3027,  2317,  1253,  0,     -1253, -2317, -3027, -3276,
+    -3027, -2317, -1253, 0,     1253,  2317,  3027,  3276,  3027,  2317,  1253,
+    0,     -1253, -2317, -3027, -3276, -3027, -2317, -1253, 0,     1253,  2317,
+    3027,  3276,  3027,  2317,  1253,  0,     -1253, -2317, -3027, -3276, -3027,
+    -2317, -1253, 0,     1253,  2317,  3027,  3276,  3027,  2317,  1253,  0,
+    -1253, -2317, -3027, -3276, -3027, -2317, -1253};
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/sin_1k.h b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/sin_1k.h
new file mode 100644
index 0000000000000000000000000000000000000000..653a6f583013dc03d0601cfd97a85b15db2c6677
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/sin_1k.h
@@ -0,0 +1,24 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIN_1K_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIN_1K_H_
+
+#include <cstdint>
+
+extern const int g_sin_1k_size;
+extern const int16_t g_sin_1k[];
+
+#endif
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/Makefile.inc b/tensorflow/lite/experimental/micro/examples/micro_speech/Makefile.inc
new file mode 100644
index 0000000000000000000000000000000000000000..c4e0f0e6ca12feca0f6e9638c3f36b81b2dcbd77
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/Makefile.inc
@@ -0,0 +1,349 @@
+
+INCLUDES += \
+ -I$(MAKEFILE_DIR)/downloads/kissfft
+
+PROJECT_INCLUDES += \
+third_party/kissfft
+
+KISSFFT_LIB_SRCS := \
+$(MAKEFILE_DIR)/downloads/kissfft/kiss_fft.c \
+$(MAKEFILE_DIR)/downloads/kissfft/tools/kiss_fftr.c
+
+KISSFFT_LIB_HDRS := \
+$(MAKEFILE_DIR)/downloads/kissfft/COPYING \
+$(MAKEFILE_DIR)/downloads/kissfft/kiss_fft.h \
+$(MAKEFILE_DIR)/downloads/kissfft/_kiss_fft_guts.h \
+$(MAKEFILE_DIR)/downloads/kissfft/tools/kiss_fftr.h
+
+MICRO_SPEECH_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_speech_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.cc
+
+MICRO_SPEECH_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.h \
+
+SIMPLE_FEATURES_GENERATOR_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/no_30ms_sample_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/yes_30ms_sample_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_power_spectrum_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_power_spectrum_data.cc
+
+SIMPLE_FEATURES_GENERATOR_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_model_settings.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_30ms_sample_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_30ms_sample_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_power_spectrum_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_power_spectrum_data.h
+
+MICRO_FEATURES_LIB_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_util.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_util.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_util.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_lut.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_util.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_util.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_util.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_util.cc \
+$(KISSFFT_LIB_SRCS)
+
+MICRO_FEATURES_LIB_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/bits.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_util.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_util.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_util.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_lut.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_util.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_util.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_util.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_util.h \
+$(KISSFFT_LIB_HDRS)
+
+MICRO_FEATURES_FFT_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_util.cc \
+$(KISSFFT_LIB_SRCS)
+
+MICRO_FEATURES_FFT_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_util.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h \
+$(KISSFFT_LIB_HDRS)
+
+MICRO_FEATURES_FILTERBANK_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_util.cc
+
+MICRO_FEATURES_FILTERBANK_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_util.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h
+
+MICRO_FEATURES_FRONTEND_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_test.cc \
+$(MICRO_FEATURES_LIB_SRCS)
+
+MICRO_FEATURES_FRONTEND_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_util.h \
+$(MICRO_FEATURES_LIB_HDRS)
+
+MICRO_FEATURES_LOG_SCALE_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_lut.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_util.cc
+
+MICRO_FEATURES_LOG_SCALE_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_lut.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_util.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/bits.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h
+
+MICRO_FEATURES_NOISE_REDUCTION_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_util.cc
+
+MICRO_FEATURES_NOISE_REDUCTION_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_util.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h
+
+MICRO_FEATURES_PCAN_GAIN_CONTROL_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_util.cc
+
+MICRO_FEATURES_PCAN_GAIN_CONTROL_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_util.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h
+
+MICRO_FEATURES_WINDOW_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_util.cc
+
+MICRO_FEATURES_WINDOW_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_util.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h
+
+MICRO_FEATURES_GENERATOR_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.cc \
+$(MICRO_FEATURES_LIB_SRCS)
+
+MICRO_FEATURES_GENERATOR_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h \
+$(MICRO_FEATURES_LIB_HDRS)
+
+MICRO_FEATURES_GENERATOR_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_feature_data_slice.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_feature_data_slice.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/no_30ms_sample_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/yes_30ms_sample_data.cc \
+$(MICRO_FEATURES_GENERATOR_SRCS)
+
+MICRO_FEATURES_GENERATOR_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_feature_data_slice.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_feature_data_slice.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/no_30ms_sample_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/yes_30ms_sample_data.h \
+$(MICRO_FEATURES_GENERATOR_HDRS)
+
+AUDIO_PROVIDER_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc
+
+AUDIO_PROVIDER_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h \
+
+AUDIO_PROVIDER_MOCK_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_mock_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_mock.cc
+
+AUDIO_PROVIDER_MOCK_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h \
+
+FEATURE_PROVIDER_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc \
+$(MICRO_FEATURES_GENERATOR_SRCS)
+
+FEATURE_PROVIDER_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h \
+$(MICRO_FEATURES_GENERATOR_HDRS)
+
+FEATURE_PROVIDER_MOCK_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_mock.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.cc \
+$(MICRO_FEATURES_GENERATOR_SRCS)
+
+FEATURE_PROVIDER_MOCK_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.h \
+$(MICRO_FEATURES_GENERATOR_HDRS)
+
+RECOGNIZE_COMMANDS_TEST_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands_test.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.cc
+
+RECOGNIZE_COMMANDS_TEST_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h
+
+MICRO_SPEECH_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/main.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.cc \
+$(MICRO_FEATURES_GENERATOR_SRCS)
+
+MICRO_SPEECH_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h \
+$(MICRO_FEATURES_GENERATOR_HDRS)
+
+MICRO_SPEECH_MOCK_SRCS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/main.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_mock.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.cc \
+tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.cc \
+$(MICRO_FEATURES_GENERATOR_SRCS)
+
+MICRO_SPEECH_MOCK_HDRS := \
+tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.h \
+tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h \
+$(MICRO_FEATURES_GENERATOR_HDRS)
+
+# Find any platform-specific rules for this example.
+include $(wildcard tensorflow/lite/experimental/micro/examples/micro_speech/*/Makefile.inc)
+
+$(eval $(call microlite_test,micro_features_fft_test,\
+$(MICRO_FEATURES_FFT_TEST_SRCS),$(MICRO_FEATURES_FFT_TEST_HDRS)))
+
+$(eval $(call microlite_test,micro_features_filterbank_test,\
+$(MICRO_FEATURES_FILTERBANK_TEST_SRCS),$(MICRO_FEATURES_FILTERBANK_TEST_HDRS)))
+
+$(eval $(call microlite_test,micro_features_frontend_test,\
+$(MICRO_FEATURES_FRONTEND_TEST_SRCS),$(MICRO_FEATURES_FRONTEND_TEST_HDRS)))
+
+$(eval $(call microlite_test,micro_features_log_scale_test,\
+$(MICRO_FEATURES_LOG_SCALE_TEST_SRCS),$(MICRO_FEATURES_LOG_SCALE_TEST_HDRS)))
+
+$(eval $(call microlite_test,micro_features_noise_reduction_test,\
+$(MICRO_FEATURES_NOISE_REDUCTION_TEST_SRCS),$(MICRO_FEATURES_NOISE_REDUCTION_TEST_HDRS)))
+
+$(eval $(call microlite_test,micro_features_pcan_gain_control_test,\
+$(MICRO_FEATURES_PCAN_GAIN_CONTROL_TEST_SRCS),$(MICRO_FEATURES_PCAN_GAIN_CONTROL_TEST_HDRS)))
+
+$(eval $(call microlite_test,micro_features_window_test,\
+$(MICRO_FEATURES_WINDOW_TEST_SRCS),$(MICRO_FEATURES_WINDOW_TEST_HDRS)))
+
+# Test the code for feature generation.
+$(eval $(call microlite_test,micro_features_generator_test,\
+$(MICRO_FEATURES_GENERATOR_TEST_SRCS), $(MICRO_FEATURES_GENERATOR_TEST_HDRS)))
+
+# Tests loading and running a speech model.
+$(eval $(call microlite_test,micro_speech_test,\
+$(MICRO_SPEECH_TEST_SRCS),$(MICRO_SPEECH_TEST_HDRS)))
+
+# Test the code for feature generation.
+$(eval $(call microlite_test,simple_features_generator_test,\
+$(SIMPLE_FEATURES_GENERATOR_TEST_SRCS), $(SIMPLE_FEATURES_GENERATOR_TEST_HDRS)))
+
+# Tests the audio provider module.
+$(eval $(call microlite_test,audio_provider_test,\
+$(AUDIO_PROVIDER_TEST_SRCS),$(AUDIO_PROVIDER_TEST_HDRS)))
+
+# Tests the audio provider mock module.
+$(eval $(call microlite_test,audio_provider_mock_test,\
+$(AUDIO_PROVIDER_MOCK_TEST_SRCS),$(AUDIO_PROVIDER_MOCK_TEST_HDRS)))
+
+# Tests the feature provider module.
+$(eval $(call microlite_test,feature_provider_test,\
+$(FEATURE_PROVIDER_TEST_SRCS),$(FEATURE_PROVIDER_TEST_HDRS)))
+
+# Tests the feature provider module using the mock audio provider.
+$(eval $(call microlite_test,feature_provider_mock_test,\
+$(FEATURE_PROVIDER_MOCK_TEST_SRCS),$(FEATURE_PROVIDER_MOCK_TEST_HDRS)))
+
+# Tests the command recognizer module.
+$(eval $(call microlite_test,recognize_commands_test,\
+$(RECOGNIZE_COMMANDS_TEST_SRCS),$(RECOGNIZE_COMMANDS_TEST_HDRS)))
+
+# Builds a standalone speech command recognizer binary.
+$(eval $(call microlite_test,micro_speech,\
+$(MICRO_SPEECH_SRCS),$(MICRO_SPEECH_HDRS)))
+
+# Builds a standalone speech command recognizer binary using fake audio input.
+$(eval $(call microlite_test,micro_speech_mock,\
+$(MICRO_SPEECH_MOCK_SRCS),$(MICRO_SPEECH_MOCK_HDRS)))
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/README.md b/tensorflow/lite/experimental/micro/examples/micro_speech/README.md
index 500eed33bab0187f9b2cf9647c046f4a541b9e2c..3cc81c4b5b493f8b624a92960e80d36087f146da 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/README.md
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/README.md
@@ -83,7 +83,8 @@ If you see a compiling error on older machines, try leaving out the `--copt` arg
 ```
 bazel run tensorflow/examples/speech_commands:freeze -- \
 --model_architecture=tiny_conv --window_stride=20 --preprocess=average \
---wanted_words="yes,no" --quantize=1 --output_file=/tmp/tiny_conv.pb
+--wanted_words="yes,no" --quantize=1 --output_file=/tmp/tiny_conv.pb \
+--start_checkpoint=/tmp/speech_commands_train/tiny_conv.ckpt-18000
 ```
 
 The next step is to create a TensorFlow Lite file from the frozen graph:
@@ -99,5 +100,5 @@ bazel run tensorflow/lite/toco:toco -- \
 Finally, convert the file into a C source file that can be compiled into an embedded system:
 
 ```
-xxd -i /tmp/tiny_conv.tflite > /tmp/tiny_conv_model_data.cc
+xxd -i /tmp/tiny_conv.tflite > /tmp/tiny_conv_simple_features_model_data.cc
 ```
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/.gitignore b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..cb8d4d02c418e5d8c903c69729e8e1b3ee44a8bf
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/.gitignore
@@ -0,0 +1,4 @@
+captured_data.txt
+captured_data.wav
+cmsis_*.txt
+micro_*.txt
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/Makefile.inc b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/Makefile.inc
new file mode 100644
index 0000000000000000000000000000000000000000..c83090344ba0d82e9f774897577b1eb924e92329
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/Makefile.inc
@@ -0,0 +1,100 @@
+# Settings for apollo3 evb platforms.
+ifeq ($(TARGET), apollo3evb)
+
+  PUSHBUTTON_MICRO_SPEECH_TEST_SRCS := \
+    $(AP3_MICRO_DIR)/../preprocessor.cc \
+    $(AP3_MICRO_DIR)/pushbutton_main.c \
+    $(AP3_MICRO_DIR)/pushbutton_test.cc \
+    $(AP3_MICRO_DIR)/../simple_features/tiny_conv_simple_features_model_data.cc \
+    $(APOLLO3_SDK)/devices/am_devices_led.c
+  ALL_SRCS += $(PUSHBUTTON_MICRO_SPEECH_TEST_SRCS)
+  PUSHBUTTON_MICRO_SPEECH_TEST_OBJS := $(addprefix $(OBJDIR), \
+    $(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(PUSHBUTTON_MICRO_SPEECH_TEST_SRCS))))
+  PUSHBUTTON_MICRO_SPEECH_TEST_BINARY := $(BINDIR)pushbutton_micro_speech_test
+  $(PUSHBUTTON_MICRO_SPEECH_TEST_BINARY): $(PUSHBUTTON_MICRO_SPEECH_TEST_OBJS) $(MICROLITE_LIB_PATH)
+	@mkdir -p $(dir $@)
+	$(CXX) $(CXXFLAGS) $(INCLUDES) \
+	-o $(PUSHBUTTON_MICRO_SPEECH_TEST_BINARY) $(PUSHBUTTON_MICRO_SPEECH_TEST_OBJS) \
+	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
+  pushbutton_micro_speech_test: $(PUSHBUTTON_MICRO_SPEECH_TEST_BINARY)
+  pushbutton_micro_speech_test_bin: $(PUSHBUTTON_MICRO_SPEECH_TEST_BINARY).bin
+  test_pushbutton_micro_speech: $(PUSHBUTTON_MICRO_SPEECH_TEST_BINARY)
+	$(TEST_SCRIPT) $(PUSHBUTTON_MICRO_SPEECH_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
+
+  PUSHBUTTON_CMSIS_SPEECH_TEST_SRCS := \
+    $(AP3_MICRO_DIR)/pushbutton_main.c \
+    $(AP3_MICRO_DIR)/pushbutton_test.cc \
+    $(AP3_MICRO_DIR)/../simple_features/tiny_conv_simple_features_model_data.cc \
+    $(CMSIS_DIR)/simple_features_generator.cc \
+    $(CMSIS_EXT_DIR)/arm_cmplx_mag_squared_q10p6.c \
+    $(CMSIS_DIR)/hanning.c \
+    $(APOLLO3_SDK)/devices/am_devices_led.c \
+    $(CMSIS_SRCS)
+  ALL_SRCS += $(PUSHBUTTON_CMSIS_SPEECH_TEST_SRCS)
+  PUSHBUTTON_CMSIS_SPEECH_TEST_OBJS := $(addprefix $(OBJDIR), \
+    $(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(PUSHBUTTON_CMSIS_SPEECH_TEST_SRCS))) \
+    arm_bitreversal2.o)
+  PUSHBUTTON_CMSIS_SPEECH_TEST_BINARY := $(BINDIR)pushbutton_cmsis_speech_test
+  $(PUSHBUTTON_CMSIS_SPEECH_TEST_BINARY): $(PUSHBUTTON_CMSIS_SPEECH_TEST_OBJS) $(MICROLITE_LIB_PATH)
+	@mkdir -p $(dir $@)
+	$(CXX) $(CXXFLAGS) $(INCLUDES) \
+	-o $(PUSHBUTTON_CMSIS_SPEECH_TEST_BINARY) $(PUSHBUTTON_CMSIS_SPEECH_TEST_OBJS) \
+	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
+  pushbutton_cmsis_speech_test: $(PUSHBUTTON_CMSIS_SPEECH_TEST_BINARY)
+  pushbutton_cmsis_speech_test_bin: $(PUSHBUTTON_CMSIS_SPEECH_TEST_BINARY).bin
+  test_pushbutton_cmsis_speech: $(PUSHBUTTON_CMSIS_SPEECH_TEST_BINARY)
+	$(TEST_SCRIPT) $(PUSHBUTTON_CMSIS_SPEECH_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
+
+  PREPROCESSOR_1K_SRCS := \
+    tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/preprocessor_1k.cc \
+    tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/sin_1k.cc
+
+  PREPROCESSOR_1K_MICRO_TEST_SRCS := \
+    $(PREPROCESSOR_1K_SRCS) \
+    $(AP3_MICRO_DIR)/../fixed_point/preprocessor.cc \
+    $(AP3_EXT_MICRO_DIR)/system_apollo3.c \
+    $(AP3_MICRO_DIR)/_main.c
+  ALL_SRCS += $(PREPROCESSOR_1K_MICRO_TEST_SRCS)
+  PREPROCESSOR_1K_MICRO_TEST_OBJS := $(addprefix $(OBJDIR), \
+    $(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(PREPROCESSOR_1K_MICRO_TEST_SRCS))))
+  PREPROCESSOR_1K_MICRO_TEST_BINARY := $(BINDIR)preprocessor_1k_micro_test
+  $(PREPROCESSOR_1K_MICRO_TEST_BINARY): $(PREPROCESSOR_1K_MICRO_TEST_OBJS) $(MICROLITE_LIB_PATH)
+	@mkdir -p $(dir $@)
+	$(CXX) $(CXXFLAGS) $(INCLUDES) \
+	-o $(PREPROCESSOR_1K_MICRO_TEST_BINARY) $(PREPROCESSOR_1K_MICRO_TEST_OBJS) \
+	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
+  preprocessor_1k_micro_test: $(PREPROCESSOR_1K_MICRO_TEST_BINARY)
+  preprocessor_1k_micro_test_bin: $(PREPROCESSOR_1K_MICRO_TEST_BINARY).bin
+  test_preprocessor_1k_micro: $(PREPROCESSOR_1K_MICRO_TEST_BINARY)
+	$(TEST_SCRIPT) $(PREPROCESSOR_1K_MICRO_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
+
+  PREPROCESSOR_1K_CMSIS_TEST_SRCS := \
+    $(PREPROCESSOR_1K_SRCS) \
+    $(CMSIS_DIR)/preprocessor.cc \
+    $(CMSIS_EXT_DIR)/arm_cmplx_mag_squared_q10p6.c \
+    $(CMSIS_DIR)/hanning.c \
+    $(AP3_EXT_MICRO_DIR)/system_apollo3.c \
+    $(AP3_MICRO_DIR)/_main.c \
+    $(CMSIS_SRCS)
+  ALL_SRCS += $(PREPROCESSOR_1K_CMSIS_TEST_SRCS)
+  PREPROCESSOR_1K_CMSIS_TEST_BINARY := $(BINDIR)preprocessor_1k_cmsis_test
+  PREPROCESSOR_1K_CMSIS_TEST_OBJS := $(addprefix $(OBJDIR), \
+    $(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(PREPROCESSOR_1K_CMSIS_TEST_SRCS)))\
+    arm_bitreversal2.o)
+  $(PREPROCESSOR_1K_CMSIS_TEST_BINARY): $(PREPROCESSOR_1K_CMSIS_TEST_OBJS) $(MICROLITE_LIB_PATH)
+	@mkdir -p $(dir $@)
+	$(CXX) $(CXXFLAGS) $(INCLUDES) \
+	-o $(PREPROCESSOR_1K_CMSIS_TEST_BINARY) $(PREPROCESSOR_1K_CMSIS_TEST_OBJS) \
+	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
+  preprocessor_1k_cmsis_test: $(PREPROCESSOR_1K_CMSIS_TEST_BINARY)
+  preprocessor_1k_cmsis_test_bin: $(PREPROCESSOR_1K_CMSIS_TEST_BINARY).bin
+  test_preprocessor_1k_cmsis: $(PREPROCESSOR_1K_CMSIS_TEST_BINARY)
+	$(TEST_SCRIPT) $(PREPROCESSOR_1K_CMSIS_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
+
+  PREPROCESSOR_TEST_SRCS += \
+    $(AP3_MICRO_DIR)/_main.c 
+
+  $(OBJDIR)arm_bitreversal2.o:
+	$(CXX) $(CXXFLAGS) $(INCLUDES) -c $(CMSIS_SRC_DIR)/TransformFunctions/arm_bitreversal2.S -o $(OBJDIR)arm_bitreversal2.o
+
+endif
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/README.md b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..10be9f136a9088d1ad098d685791ae357e8a9c22
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/README.md
@@ -0,0 +1,129 @@
+# Description of Apollo3 Makefile targets
+
+*   **pushbutton_cmsis_speech_test_bin**:
+    *   When users press BTN2 on the Apollo3 EVK, 1 second of audio is captured.
+    *   Then the audio is sent to the CMSIS version of the preprocessor and into
+        the neural net
+    *   To print out the neural net's inference scores, run GDB and source
+        pushbutton\_cmsis\_scores.cmd
+    *   To save the captured audio to a text file (captured\_data.txt), run GDB
+        and source pushbutton\_cmsis\_voice.cmd
+    *   Setup python
+        *   sudo apt install python-pip
+        *   sudo apt install python-tk
+        *   pip install numpy
+        *   pip install matplotlib
+        *   pip install pysoundfile
+        *   python captured_data_to_wav.py
+    *   captured\_data.txt can be turned into a \*.wav file using
+        captured\_data\_to\_wav.py by executing "python
+        captured\_data\_to\_wav.py"
+*   **preprocessor_1k_cmsis_test_bin**:
+    *   Sends a 1 kHz sine wave to the CMSIS fixed\_point version of the
+        preprocessor
+    *   **This test should be compiled with the -O0 option.** Otherwise, the
+        breakpoints will not be reached
+        *   In
+            tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb_makefile.inc
+            change "-O3" to "-O0" on line 47
+        *   **DO NOT FORGET TO REVERT CHANGE AFTER EXPERIMENT**
+        *   In future, enhance scripts to handle automatically, NOT manually!
+    *   Clean project by running "make -f
+        tensorflow/lite/experimental/micro/tools/make/Makefile clean"
+    *   Compile BIN by running "make -f
+        tensorflow/lite/experimental/micro/tools/make/Makefile TARGET=apollo3evb
+        preprocessor_1k_cmsis_test_bin"
+    *   Run with the preprocessor\_1k\_cmsis\_test.cmd GDB command file
+    *   Produces four text files corresponding to outputs from the CMSIS
+        fixed\_point version of this algorithm:
+        *   cmsis_windowed_input.txt: the sinusoid after multiplying elementwise
+            with a Hann window
+        *   cmsis_dft.txt: the DFT of the windowed sinusoid
+        *   cmsis_power.txt: the magnitude squared of the DFT
+        *   cmsis_power_avg.txt: the 6-bin average of the magnitude squared of
+            the DFT
+    *   Run both verisons of the 1KHz pre-processor test and then compare.
+        *   These files can be plotted with "python compare\_1k.py"
+    *   Also prints out the number of cycles the code took to execute (using the
+        DWT->CYCCNT register)
+*   **preprocessor_1k_micro_test_bin**
+    *   Sends a 1 kHz sine wave to the Micro-Lite fixed\_point version of the
+        preprocessor
+    *   **This test should be compiled with the -O0 option.** Otherwise, the
+        breakpoints will not be reached
+    *   Run with the preprocessor\_1k\_micro\_test.cmd GDB command file
+    *   Produces four text files corresponding to outputs from the Micro-Lite
+        version of this algorithm:
+        *   micro_windowed_input.txt: the sinusoid after multiplying elementwise
+            with a Hann window
+        *   micro_dft.txt: the DFT of the windowed sinusoid
+        *   micro_power.txt: the magnitude squared of the DFT
+        *   micro_power_avg.txt: the 6-bin average of the magnitude squared of
+            the DFT
+    *   Run both verisons of the 1KHz pre-processor test and then compare.
+        *   These files can be plotted with "python compare\_1k.py"
+    *   Also prints out the number of cycles the code took to execute (using the
+        DWT->CYCCNT register)
+
+# Description of files
+
+*   **.gitignore**: Git should ignore \*.txt and \*.wav files that result from
+    experiments run in this directory
+*   **captured\_data\_to\_wav.py**: Python script that parses a text file
+    containing data dumped from GDB (specifically the verilog format) and
+    creates a \*.wav file using
+    [PySoundFile](https://pysoundfile.readthedocs.io/en/0.9.0/).
+*   **compare\_1k.py**: This script compares the intermediate variables and
+    final outputs of the micro-lite fixed-point preprocessor function and the
+    CMSIS version of this function. The stimulus provided to each preprocessor
+    is the same: a 1 kHz sinusoid.
+*   **get\_yesno\_data.cmd**: A GDB command file that runs preprocessor_test
+    (where TARGET=apollo3evb) and dumps the calculated data for the "yes" and
+    "no" input wavfeorms to text files
+*   **\_main.c**: Point of entry for the micro_speech test
+*   **preprocessor_1k.cc**: A version of preprocessor.cc where a 1 kHz sinusoid
+    is provided as input to the preprocessor
+*   **preprocessor_1k_cmsis_test.cmd**: GDB command file for the CMSIS
+    preprocessor 1 kHz test
+*   **preprocessor_1k_micro_test.cmd**: GDB command file for the Micro-Lite
+    preprocessor 1 kHz test
+*   **preprocessor_test.cmd**: GDB command file for the preprocessor test
+*   **pushbutton_cmsis_scores.cmd**: GDB command file that runs
+    pushbutton_cmsis_speech_test_bin. It adds a breakpoint immediately after the
+    scores are reported and prints out each score. Then it continues code
+    execution.
+*   **pushbutton_cmsis_voice.cmd**: GDB command file that runs
+    pushbutton_cmsis_speech_test_bin. Dumps the recorded 1 second of audio to
+    captured_data.txt, which can then be processed by the python file
+    captured_data_to_wav.py.
+*   **pushbutton_main.c**: Source file containing program point of entry
+    \_main() for the pushbutton\_\* tests. Contains Interrupt Service Routines
+    for PDM data capture and pushbuttons. Calls the main() function of
+    pushbutton_test.cc
+*   **pushbutton_test.cc**: Source file containing main() function for the
+    pushbutton\_\* tests. main() calls the preprocessor function and the neural
+    net inference function.
+
+# Description of externally downloaded files in ../apollo3_ext
+
+*   **apollo3.h**: Apollo 3 version of the
+    [CMSIS Device Header File (device.h)](https://www.keil.com/pack/doc/CMSIS/Core/html/device_h_pg.html).
+    Available in the
+    [Ambiq Keil Pack](http://s3.ambiqmicro.com/pack/AmbiqMicro.Apollo_DFP.1.1.0.pack).
+*   **system_apollo3.c**: Apollo 3 version of the
+    [CMSIS System Configuration File system\_\<device\>.c](https://www.keil.com/pack/doc/CMSIS/Core/html/system_c_pg.html).
+    Available in the
+    [Ambiq Keil Pack](http://s3.ambiqmicro.com/pack/AmbiqMicro.Apollo_DFP.1.1.0.pack).
+*   **system_apollo3.h**: Apollo 3 version of the
+    [CMSIS System Configuration File system\_\<device\>.h](https://www.keil.com/pack/doc/CMSIS/Core/html/system_c_pg.html).
+    Available in the
+    [Ambiq Keil Pack](http://s3.ambiqmicro.com/pack/AmbiqMicro.Apollo_DFP.1.1.0.pack).
+
+# FFT scaling
+
+See https://github.com/ARM-software/CMSIS_5/issues/220
+
+> And as @xizhizhang pointed, I think there may be an error on the internal
+> downscaling, or at least on the documentation. It looks like during the fft
+> computation, the downscaling factor reach 2**-9 for a 512 rfft operation,
+> being the output in Q10.22, instead the documented 2**-8 and Q9.23.
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/_main.c b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/_main.c
new file mode 100644
index 0000000000000000000000000000000000000000..b49d5c50ffc936fd34115cc9150829b47a1e3ab5
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/_main.c
@@ -0,0 +1,117 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <stdint.h>
+#include "am_bsp.h"
+#include "am_mcu_apollo.h"  // Defines AM_CMSIS_REGS
+#include "am_util.h"
+
+//*****************************************************************************
+//
+// The entry point for the application.
+//
+//*****************************************************************************
+extern int main(int argc, char** argv);
+
+void DebugLog(const char* s) { am_util_stdio_printf("%s", s); }
+void DebugLogInt32(int32_t i) { am_util_stdio_printf("%d", i); }
+void DebugLogUInt32(uint32_t i) { am_util_stdio_printf("%d", i); }
+void DebugLogHex(uint32_t i) { am_util_stdio_printf("0x%8x", i); }
+void DebugLogFloat(float i) { am_util_stdio_printf("%f", i); }
+
+int _main(void) {
+  am_util_id_t sIdDevice;
+  uint32_t ui32StrBuf;
+
+  //
+  // Set the clock frequency.
+  //
+  am_hal_clkgen_control(AM_HAL_CLKGEN_CONTROL_SYSCLK_MAX, 0);
+
+  //
+  // Set the default cache configuration
+  //
+  am_hal_cachectrl_config(&am_hal_cachectrl_defaults);
+  am_hal_cachectrl_enable();
+
+  //
+  // Configure the board for low power operation.
+  //
+  am_bsp_low_power_init();
+
+  //
+  // Initialize the printf interface for UART output
+  //
+  am_bsp_uart_printf_enable();
+
+  //
+  // Print the banner.
+  //
+  am_util_stdio_terminal_clear();
+  am_util_stdio_printf("Hello World!\n\n");
+
+  //
+  // Print the device info.
+  //
+  am_util_id_device(&sIdDevice);
+  am_util_stdio_printf("Vendor Name: %s\n", sIdDevice.pui8VendorName);
+  am_util_stdio_printf("Device type: %s\n", sIdDevice.pui8DeviceName);
+
+  am_util_stdio_printf("Qualified: %s\n",
+                       sIdDevice.sMcuCtrlDevice.ui32Qualified ? "Yes" : "No");
+
+  am_util_stdio_printf(
+      "Device Info:\n"
+      "\tPart number: 0x%08X\n"
+      "\tChip ID0:    0x%08X\n"
+      "\tChip ID1:    0x%08X\n"
+      "\tRevision:    0x%08X (Rev%c%c)\n",
+      sIdDevice.sMcuCtrlDevice.ui32ChipPN, sIdDevice.sMcuCtrlDevice.ui32ChipID0,
+      sIdDevice.sMcuCtrlDevice.ui32ChipID1,
+      sIdDevice.sMcuCtrlDevice.ui32ChipRev, sIdDevice.ui8ChipRevMaj,
+      sIdDevice.ui8ChipRevMin);
+
+  //
+  // If not a multiple of 1024 bytes, append a plus sign to the KB.
+  //
+  ui32StrBuf = (sIdDevice.sMcuCtrlDevice.ui32FlashSize % 1024) ? '+' : 0;
+  am_util_stdio_printf(
+      "\tFlash size:  %7d (%d KB%s)\n", sIdDevice.sMcuCtrlDevice.ui32FlashSize,
+      sIdDevice.sMcuCtrlDevice.ui32FlashSize / 1024, &ui32StrBuf);
+
+  ui32StrBuf = (sIdDevice.sMcuCtrlDevice.ui32SRAMSize % 1024) ? '+' : 0;
+  am_util_stdio_printf(
+      "\tSRAM size:   %7d (%d KB%s)\n\n", sIdDevice.sMcuCtrlDevice.ui32SRAMSize,
+      sIdDevice.sMcuCtrlDevice.ui32SRAMSize / 1024, &ui32StrBuf);
+
+  //
+  // Print the compiler version.
+  //
+  am_util_stdio_printf("App Compiler:    %s\n", COMPILER_VERSION);
+#ifdef AM_PART_APOLLO3
+  am_util_stdio_printf("HAL Compiler:    %s\n", g_ui8HALcompiler);
+  am_util_stdio_printf("HAL SDK version: %d.%d.%d\n", g_ui32HALversion.s.Major,
+                       g_ui32HALversion.s.Minor, g_ui32HALversion.s.Revision);
+  am_util_stdio_printf("HAL compiled with %s-style registers\n",
+                       g_ui32HALversion.s.bAMREGS ? "AM_REG" : "CMSIS");
+
+  am_util_stdio_printf("&sIdDevice: 0x%x, &ui32StrBuf: 0x%x\n", &sIdDevice,
+                       &ui32StrBuf);
+  am_hal_security_info_t secInfo;
+  char sINFO[32];
+  uint32_t ui32Status;
+#endif  // AM_PART_APOLLO3
+  main(0, NULL);
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/captured_data_to_wav.py b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/captured_data_to_wav.py
new file mode 100644
index 0000000000000000000000000000000000000000..10a05b6dcf1bbd5c779f7ee7bdf4d01ebde76017
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/captured_data_to_wav.py
@@ -0,0 +1,46 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Converts values pulled from the microcontroller into audio files."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import struct
+# import matplotlib.pyplot as plt
+import numpy as np
+import soundfile as sf
+
+
+def new_data_to_array(fn):
+  vals = []
+  with open(fn) as f:
+    for n, line in enumerate(f):
+      if n is not 0:
+        vals.extend([int(v, 16) for v in line.split()])
+  b = ''.join(map(chr, vals))
+  y = struct.unpack('<' + 'h' * int(len(b) / 2), b)
+
+  return y
+
+
+data = 'captured_data.txt'
+values = np.array(new_data_to_array(data)).astype(float)
+
+# plt.plot(values, 'o-')
+# plt.show(block=False)
+
+wav = values / np.max(np.abs(values))
+sf.write('captured_data.wav', wav, 16000)
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/compare_1k.py b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/compare_1k.py
new file mode 100644
index 0000000000000000000000000000000000000000..52352bad94a1e5627a9ca35d07a5082b6d79e6a6
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/compare_1k.py
@@ -0,0 +1,167 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Debugging script for checking calculation values."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import struct
+import matplotlib.pyplot as plt
+import numpy as np
+
+# import soundfile as sf
+
+
+def new_data_to_array(fn, datatype='int16'):
+  """Converts file information to an in-memory array."""
+  vals = []
+  with open(fn) as f:
+    for n, line in enumerate(f):
+      if n is not 0:
+        vals.extend([int(v, 16) for v in line.split()])
+  b = ''.join(map(chr, vals))
+
+  if datatype == 'int8':
+    typestr = 'b'
+    arraylen = int(len(b))
+  elif datatype == 'int16':
+    typestr = 'h'
+    arraylen = int(len(b) // 2)
+  elif datatype == 'int32':
+    typestr = 'i'
+    arraylen = int(len(b) // 4)
+  if datatype == 'uint8':
+    typestr = 'B'
+    arraylen = int(len(b))
+  elif datatype == 'uint16':
+    typestr = 'H'
+    arraylen = int(len(b) // 2)
+  elif datatype == 'uint32':
+    typestr = 'I'
+    arraylen = int(len(b) // 4)
+
+  y = np.array(struct.unpack('<' + typestr * arraylen, b))
+
+  return y
+
+
+# x is the fixed-point input in Qm.n format
+def to_float(x, n):
+  return x.astype(float) * 2**(-n)
+
+
+micro_windowed_input = new_data_to_array(
+    'micro_windowed_input.txt', datatype='int32')
+cmsis_windowed_input = new_data_to_array(
+    'cmsis_windowed_input.txt', datatype='int16')
+
+micro_dft = new_data_to_array('micro_dft.txt', datatype='int32')
+cmsis_dft = new_data_to_array('cmsis_dft.txt', datatype='int16')
+py_dft = np.fft.rfft(to_float(cmsis_windowed_input, 15), n=512)
+py_result = np.empty((2 * py_dft.size), dtype=np.float)
+py_result[0::2] = np.real(py_dft)
+py_result[1::2] = np.imag(py_dft)
+
+micro_power = new_data_to_array('micro_power.txt', datatype='int32')
+cmsis_power = new_data_to_array('cmsis_power.txt', datatype='int16')
+py_power = np.square(np.abs(py_dft))
+
+micro_power_avg = new_data_to_array('micro_power_avg.txt', datatype='uint8')
+cmsis_power_avg = new_data_to_array('cmsis_power_avg.txt', datatype='uint8')
+
+plt.figure(1)
+plt.subplot(311)
+plt.plot(micro_windowed_input, label='Micro fixed')
+plt.legend()
+plt.subplot(312)
+plt.plot(cmsis_windowed_input, label='CMSIS fixed')
+plt.legend()
+plt.subplot(313)
+plt.plot(to_float(micro_windowed_input, 30), label='Micro to float')
+plt.plot(to_float(cmsis_windowed_input, 15), label='CMSIS to float')
+plt.legend()
+
+plt.figure(2)
+plt.subplot(311)
+plt.plot(micro_dft, label='Micro fixed')
+plt.legend()
+plt.subplot(312)
+plt.plot(cmsis_dft, label='CMSIS fixed')
+plt.legend()
+plt.subplot(313)
+plt.plot(to_float(micro_dft, 22), label='Micro to float')
+# CMSIS result has 6 fractionanl bits (not 7) due to documentation error (see
+# README.md)
+plt.plot(to_float(cmsis_dft, 6), label='CMSIS to float')
+plt.plot(py_result, label='Python result')
+plt.legend()
+
+plt.figure(3)
+plt.subplot(311)
+plt.plot(micro_power, label='Micro fixed')
+plt.legend()
+plt.subplot(312)
+plt.plot(cmsis_power[0:256], label='CMSIS fixed')
+plt.legend()
+plt.subplot(313)
+plt.plot(to_float(micro_power, 22), label='Micro to float')
+plt.plot(to_float(cmsis_power[0:256], 6), label='CMSIS to float')
+plt.plot(py_power, label='Python result')
+plt.legend()
+
+plt.figure(4)
+plt.plot(micro_power_avg, label='Micro fixed')
+plt.plot(cmsis_power_avg, label='CMSIS fixed')
+plt.legend()
+plt.show()
+
+# t = np.arange(16000.*0.03)/16000.
+# # Factor of 10 because micro preprocessing overflows otherwise
+# sin1k = 0.1*np.sin(2*np.pi*1000*t)
+#
+# plt.figure(1)
+# plt.subplot(511)
+# plt.plot(sin1k)
+# plt.title('Input sine')
+#
+# plt.subplot(512)
+# plt.plot(to_float(micro_windowed_input, 30), label='Micro-Lite')
+# plt.plot(to_float(cmsis_windowed_input, 15), label='CMSIS')
+# plt.title('Windowed sine')
+# plt.legend(loc='center right')
+#
+# plt.subplot(513)
+# plt.plot(to_float(micro_dft, 22), label='Micro-Lite')
+# plt.plot(to_float(cmsis_dft, 6), label='CMSIS')
+# plt.title('FFT')
+# plt.legend(loc='center')
+#
+# plt.subplot(514)
+# plt.plot(to_float(micro_power, 22), label='Micro-Lite')
+# plt.plot(to_float(cmsis_power[0:256], 6), label='CMSIS')
+# plt.title('|FFT|^2')
+# plt.legend(loc='center right')
+#
+# plt.subplot(515)
+# plt.plot(micro_power_avg, label='Micro-Lite')
+# plt.plot(cmsis_power_avg, label='CMSIS')
+# plt.title('Averaged |FFT|^2')
+# plt.legend(loc='center right')
+#
+# plt.tight_layout(pad=0, w_pad=0.2, h_pad=0.2)
+#
+# plt.show()
+#
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/preprocessor_1k.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/preprocessor_1k.cc
new file mode 100644
index 0000000000000000000000000000000000000000..007772e77a53b43607be90e6b8b9243b00c79546
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/preprocessor_1k.cc
@@ -0,0 +1,53 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/* This file is a modification of the Tensorflow Micro Lite file preprocessor.cc
+ */
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/sin_1k.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+extern "C" {
+#include "apollo3.h"
+#include "system_apollo3.h"
+}
+
+#define output_data_size 43
+int count;
+
+extern TfLiteStatus Preprocess(tflite::ErrorReporter* error_reporter,
+                               const int16_t* input, int input_size,
+                               int output_size, uint8_t* output);
+
+TF_LITE_MICRO_TESTS_BEGIN
+CoreDebug->DEMCR |= CoreDebug_DEMCR_TRCENA_Msk;
+// DWT->LAR = 0xC5ACCE55;
+DWT->CYCCNT = 0;
+DWT->CTRL |= DWT_CTRL_CYCCNTENA_Msk;
+
+TF_LITE_MICRO_TEST(TestPreprocessor) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  uint8_t calculated_data[output_data_size];
+  TfLiteStatus yes_status = Preprocess(error_reporter, g_sin_1k, g_sin_1k_size,
+                                       output_data_size, calculated_data);
+  count = DWT->CYCCNT;
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, yes_status);
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/preprocessor_1k_cmsis_test.cmd b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/preprocessor_1k_cmsis_test.cmd
new file mode 100644
index 0000000000000000000000000000000000000000..6988057f37fc8ecfa89bf8e4d87b665be540cb2e
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/preprocessor_1k_cmsis_test.cmd
@@ -0,0 +1,37 @@
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Needs to be compiled with -O0
+file ../../../tools/make/gen/apollo3evb_cortex-m4/bin/preprocessor_1k_cmsis_test
+target remote localhost:2331
+load ../../../tools/make/gen/apollo3evb_cortex-m4/bin/preprocessor_1k_cmsis_test
+monitor reset
+break preprocessor.cc:68
+commands
+dump verilog value cmsis_windowed_input.txt bufB
+c
+end
+break preprocessor.cc:76
+commands
+dump verilog value cmsis_dft.txt bufA
+c
+end
+break preprocessor.cc:81
+commands
+dump verilog value cmsis_power.txt bufB
+c
+end
+break preprocessor.cc:83
+commands
+dump verilog memory cmsis_power_avg.txt output output+42
+c
+end
+break preprocessor_1k.cc:50
+commands
+print count
+end
+c
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/preprocessor_1k_micro_test.cmd b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/preprocessor_1k_micro_test.cmd
new file mode 100644
index 0000000000000000000000000000000000000000..dc9cd4f0a41b20a50d487da8c68fa93b35439e38
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/preprocessor_1k_micro_test.cmd
@@ -0,0 +1,25 @@
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Needs to be run when compiled with -O0
+file ../../../tools/make/gen/apollo3evb_cortex-m4/bin/preprocessor_1k_micro_test
+target remote localhost:2331
+load ../../../tools/make/gen/apollo3evb_cortex-m4/bin/preprocessor_1k_micro_test
+monitor reset
+break preprocessor.cc:211
+commands
+dump verilog value micro_windowed_input.txt fixed_input
+dump verilog value micro_dft.txt fourier_values
+dump verilog value micro_power.txt power_spectrum
+dump verilog memory micro_power_avg.txt output output+42
+c
+end
+break preprocessor_1k.cc:50
+commands
+print count
+end
+c
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/preprocessor_test.cmd b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/preprocessor_test.cmd
new file mode 100644
index 0000000000000000000000000000000000000000..bd2048e80ae3dffc5b6650d730c96b617a1379f9
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/preprocessor_test.cmd
@@ -0,0 +1,11 @@
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+file ../../gen/apollo3evb_cortex-m4/bin/preprocessor_test
+target remote localhost:2331
+load ../../gen/apollo3evb_cortex-m4/bin/preprocessor_test
+monitor reset
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_cmsis_scores.cmd b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_cmsis_scores.cmd
new file mode 100644
index 0000000000000000000000000000000000000000..ace278ff9a2e20f51590dd9fd5d66b84e65c023b
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_cmsis_scores.cmd
@@ -0,0 +1,26 @@
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+file ../../../tools/make/gen/apollo3evb_cortex-m4/bin/pushbutton_cmsis_speech_test
+target remote localhost:2331
+load ../../../tools/make/gen/apollo3evb_cortex-m4/bin/pushbutton_cmsis_speech_test
+monitor reset
+break pushbutton_main.c:307
+commands
+printf "Silence score: %d\n", g_silence_score
+printf "Unknown score: %d\n", g_unknown_score
+printf "Yes score: %d\n", g_yes_score
+printf "No score: %d\n", g_no_score
+printf "g_scores[0]: %d\n", g_scores[0]
+printf "g_scores[1]: %d\n", g_scores[1]
+printf "g_scores[2]: %d\n", g_scores[2]
+printf "g_scores[3]: %d\n", g_scores[3]
+printf "max_score: %d\n", max_score
+printf "max_score_index: %d\n", max_score_index
+c
+end
+c
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_cmsis_voice.cmd b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_cmsis_voice.cmd
new file mode 100644
index 0000000000000000000000000000000000000000..5dea48e62aba123b54a19c02847236cf28fc2a38
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_cmsis_voice.cmd
@@ -0,0 +1,25 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+file ../../../tools/make/gen/apollo3evb_cortex-m4/bin/pushbutton_cmsis_speech_test
+target remote localhost:2331
+load ../../../tools/make/gen/apollo3evb_cortex-m4/bin/pushbutton_cmsis_speech_test
+monitor reset
+break pushbutton_main.c:296
+commands
+dump verilog value captured_data.txt captured_data
+c
+end
+c
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_main.c b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_main.c
new file mode 100644
index 0000000000000000000000000000000000000000..4f70d47c3ea9b6f7df884ceabeca245a2a5e55ce
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_main.c
@@ -0,0 +1,322 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/* This file is a modification of the Tensorflow Micro Lite file _main.c */
+
+#include <stdint.h>
+#include "am_bsp.h"
+#include "am_mcu_apollo.h"  // Defines AM_CMSIS_REGS
+#include "am_util.h"
+
+#define ARM_MATH_CM4
+#include <arm_math.h>
+
+//*****************************************************************************
+// Parameters
+//
+// Total number of bytes transferred = 320*50*2 = 32000
+//*****************************************************************************
+
+#define FRAME_SIZE 320  // Capture one 320-sample (20-ms) frame at a time
+#define NUM_FRAMES 50   // Number of frames in 1 second
+
+//*****************************************************************************
+// GLOBALS
+//*****************************************************************************
+
+volatile int16_t g_numFramesCaptured = 0;
+volatile bool g_bPDMDataReady = false;
+int16_t
+    captured_data[FRAME_SIZE * NUM_FRAMES];  // Location of 1-second data buffer
+extern uint8_t g_silence_score;
+extern uint8_t g_unknown_score;
+extern uint8_t g_yes_score;
+extern uint8_t g_no_score;
+q7_t g_scores[4] = {0};
+
+//*****************************************************************************
+// The entry point for the application.
+//*****************************************************************************
+extern int main(int argc, char** argv);
+
+void DebugLog(const char* s) { am_util_stdio_printf("%s", s); }
+void DebugLogInt32(int32_t i) { am_util_stdio_printf("%d", i); }
+void DebugLogUInt32(uint32_t i) { am_util_stdio_printf("%d", i); }
+void DebugLogHex(uint32_t i) { am_util_stdio_printf("0x%8x", i); }
+void DebugLogFloat(float i) { am_util_stdio_printf("%f", i); }
+
+//*****************************************************************************
+// PDM configuration information.
+//*****************************************************************************
+void* PDMHandle;
+
+am_hal_pdm_config_t g_sPdmConfig = {
+    .eClkDivider = AM_HAL_PDM_MCLKDIV_1,
+    .eLeftGain = AM_HAL_PDM_GAIN_P225DB,
+    .eRightGain = AM_HAL_PDM_GAIN_P225DB,
+    .ui32DecimationRate =
+        48,  // OSR = 1500/16 = 96 = 2*SINCRATE --> SINC_RATE = 48
+    .bHighPassEnable = 0,
+    .ui32HighPassCutoff = 0xB,
+    .ePDMClkSpeed = AM_HAL_PDM_CLK_1_5MHZ,
+    .bInvertI2SBCLK = 0,
+    .ePDMClkSource = AM_HAL_PDM_INTERNAL_CLK,
+    .bPDMSampleDelay = 0,
+    .bDataPacking = 1,
+    .ePCMChannels = AM_HAL_PDM_CHANNEL_RIGHT,
+    .bLRSwap = 0,
+};
+
+//*****************************************************************************
+// BUTTON0 pin configuration settings.
+//*****************************************************************************
+const am_hal_gpio_pincfg_t g_deepsleep_button0 = {
+    .uFuncSel = 3,
+    .eIntDir = AM_HAL_GPIO_PIN_INTDIR_LO2HI,
+    .eGPInput = AM_HAL_GPIO_PIN_INPUT_ENABLE,
+};
+
+//*****************************************************************************
+// PDM initialization.
+//*****************************************************************************
+void pdm_init(void) {
+  //
+  // Initialize, power-up, and configure the PDM.
+  //
+  am_hal_pdm_initialize(0, &PDMHandle);
+  am_hal_pdm_power_control(PDMHandle, AM_HAL_PDM_POWER_ON, false);
+  am_hal_pdm_configure(PDMHandle, &g_sPdmConfig);
+  am_hal_pdm_enable(PDMHandle);
+
+  //
+  // Configure the necessary pins.
+  //
+  am_hal_gpio_pincfg_t sPinCfg = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+  // ARPIT 181019
+  // sPinCfg.uFuncSel = AM_HAL_PIN_10_PDMCLK;
+  // am_hal_gpio_pinconfig(10, sPinCfg);
+  sPinCfg.uFuncSel = AM_HAL_PIN_12_PDMCLK;
+  am_hal_gpio_pinconfig(12, sPinCfg);
+
+  sPinCfg.uFuncSel = AM_HAL_PIN_11_PDMDATA;
+  am_hal_gpio_pinconfig(11, sPinCfg);
+
+  // am_hal_gpio_state_write(14, AM_HAL_GPIO_OUTPUT_CLEAR);
+  // am_hal_gpio_pinconfig(14, g_AM_HAL_GPIO_OUTPUT);
+
+  //
+  // Configure and enable PDM interrupts (set up to trigger on DMA
+  // completion).
+  //
+  am_hal_pdm_interrupt_enable(PDMHandle,
+                              (AM_HAL_PDM_INT_DERR | AM_HAL_PDM_INT_DCMP |
+                               AM_HAL_PDM_INT_UNDFL | AM_HAL_PDM_INT_OVF));
+
+#if AM_CMSIS_REGS
+  NVIC_EnableIRQ(PDM_IRQn);
+#else
+  am_hal_interrupt_enable(AM_HAL_INTERRUPT_PDM);
+#endif
+}
+
+//*****************************************************************************
+//
+// Start a transaction to get some number of bytes from the PDM interface.
+//
+//*****************************************************************************
+void pdm_data_get(void) {
+  //
+  // Configure DMA and target address.
+  //
+  am_hal_pdm_transfer_t sTransfer;
+  sTransfer.ui32TargetAddr =
+      (uint32_t)(&captured_data[FRAME_SIZE * g_numFramesCaptured]);
+  sTransfer.ui32TotalCount = 2 * FRAME_SIZE;  // Each sample is 2 bytes
+
+  //
+  // Start the data transfer.
+  //
+  am_hal_pdm_dma_start(PDMHandle, &sTransfer);
+}
+
+//*****************************************************************************
+//
+// PDM interrupt handler.
+//
+//*****************************************************************************
+void am_pdm0_isr(void) {
+  uint32_t ui32Status;
+  //
+  // Read the interrupt status.
+  //
+  am_hal_pdm_interrupt_status_get(PDMHandle, &ui32Status, true);
+  am_hal_pdm_interrupt_clear(PDMHandle, ui32Status);
+
+  //
+  // Once our DMA transaction completes, send a flag to the main routine
+  //
+  if (ui32Status & AM_HAL_PDM_INT_DCMP) g_bPDMDataReady = true;
+}
+
+//*****************************************************************************
+// GPIO ISR
+// Will enable the PDM, set number of frames transferred to 0, and turn on LED
+//*****************************************************************************
+void am_gpio_isr(void) {
+  //
+  // Delay for debounce.
+  //
+  am_util_delay_ms(200);
+
+  //
+  // Clear the GPIO Interrupt (write to clear).
+  //
+  am_hal_gpio_interrupt_clear(AM_HAL_GPIO_BIT(AM_BSP_GPIO_BUTTON0));
+
+  // Start audio transfer
+  am_hal_pdm_fifo_flush(PDMHandle);
+  pdm_data_get();
+  am_hal_pdm_enable(PDMHandle);
+
+  //
+  // Turn on LED 0
+  //
+  am_devices_led_on(am_bsp_psLEDs, 0);
+}
+
+int _main(void) {
+  am_util_id_t sIdDevice;
+  uint32_t ui32StrBuf;
+
+  //
+  // Set the clock frequency.
+  //
+  am_hal_clkgen_control(AM_HAL_CLKGEN_CONTROL_SYSCLK_MAX, 0);
+
+  //
+  // Set the default cache configuration
+  //
+  am_hal_cachectrl_config(&am_hal_cachectrl_defaults);
+  am_hal_cachectrl_enable();
+
+  //
+  // Configure the board for low power operation.
+  //
+  am_bsp_low_power_init();
+
+#if defined(AM_BSP_NUM_BUTTONS) && defined(AM_BSP_NUM_LEDS)
+  //
+  // Configure the button pin.
+  //
+  am_hal_gpio_pinconfig(AM_BSP_GPIO_BUTTON0, g_deepsleep_button0);
+
+  //
+  // Clear the GPIO Interrupt (write to clear).
+  //
+  am_hal_gpio_interrupt_clear(AM_HAL_GPIO_BIT(AM_BSP_GPIO_BUTTON0));
+
+  //
+  // Enable the GPIO/button interrupt.
+  //
+  am_hal_gpio_interrupt_enable(AM_HAL_GPIO_BIT(AM_BSP_GPIO_BUTTON0));
+
+  //
+  // Configure the LEDs.
+  //
+  am_devices_led_array_init(am_bsp_psLEDs, AM_BSP_NUM_LEDS);
+
+  //
+  // Turn the LEDs off
+  //
+  for (int ix = 0; ix < AM_BSP_NUM_LEDS; ix++) {
+    am_devices_led_off(am_bsp_psLEDs, ix);
+  }
+
+//    am_devices_led_on(am_bsp_psLEDs, 1);
+#endif  // defined(AM_BSP_NUM_BUTTONS)  &&  defined(AM_BSP_NUM_LEDS)
+
+#if AM_CMSIS_REGS
+  NVIC_EnableIRQ(GPIO_IRQn);
+#else   // AM_CMSIS_REGS
+  am_hal_interrupt_enable(AM_HAL_INTERRUPT_GPIO);
+#endif  // AM_CMSIS_REGS
+
+  //
+  // Enable interrupts to the core.
+  //
+  am_hal_interrupt_master_enable();
+
+  // Turn on PDM
+  pdm_init();
+
+  //
+  // Initialize the printf interface for UART output
+  //
+  am_bsp_uart_printf_enable();
+
+  //
+  // Print the banner.
+  //
+  am_util_stdio_terminal_clear();
+  am_util_stdio_printf("Starting streaming test\n\n");
+
+  // Score variables
+  q7_t max_score = 0;
+  uint32_t max_score_index = 0;
+
+  while (1) {
+    am_hal_interrupt_master_disable();
+
+    if (g_bPDMDataReady) {
+      g_bPDMDataReady = false;
+      g_numFramesCaptured++;
+
+      if (g_numFramesCaptured < NUM_FRAMES) {
+        pdm_data_get();  // Start converting the next set of PCM samples.
+      }
+
+      else {
+        g_numFramesCaptured = 0;
+        // am_hal_pdm_disable(PDMHandle);
+        am_devices_led_off(am_bsp_psLEDs, 0);
+
+        main(0, NULL);
+
+        g_scores[0] = (q7_t)g_silence_score - 128;
+        g_scores[1] = (q7_t)g_unknown_score - 128;
+        g_scores[2] = (q7_t)g_yes_score - 128;
+        g_scores[3] = (q7_t)g_no_score - 128;
+
+        am_devices_led_off(
+            am_bsp_psLEDs,
+            max_score_index + 1);  // Turn off LED for previous max score
+        arm_max_q7(g_scores, 4, &max_score, &max_score_index);
+        am_devices_led_on(
+            am_bsp_psLEDs,
+            max_score_index + 1);  // Turn on LED for new max score
+      }
+    }
+
+    //
+    // Go to Deep Sleep.
+    //
+    am_hal_sysctrl_sleep(AM_HAL_SYSCTRL_SLEEP_DEEP);
+
+    am_hal_interrupt_master_enable();
+  }
+
+  // main(0, NULL);
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d4583dbf4a6dcb083e4d9cd2818e63a116debd7f
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/apollo3/pushbutton_test.cc
@@ -0,0 +1,132 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/* This file is a modification of the Tensorflow Micro Lite file
+ * micro_speech_test.cc */
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/tiny_conv_simple_features_model_data.h"
+#include "tensorflow/lite/experimental/micro/kernels/all_ops_resolver.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+#include "tensorflow/lite/experimental/micro/micro_interpreter.h"
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/version.h"
+
+extern int16_t captured_data[16000];
+uint8_t g_silence_score = 0;
+uint8_t g_unknown_score = 0;
+uint8_t g_yes_score = 0;
+uint8_t g_no_score = 0;
+
+namespace {
+
+TfLiteStatus GenerateSimpleFeatures_1sec(tflite::ErrorReporter* error_reporter,
+                                         const int16_t* input,
+                                         uint8_t* output) {
+  int i;
+  for (i = 0; i < 49; i++) {
+    GenerateSimpleFeatures(error_reporter, input + i * 320, 480, 43,
+                           output + i * 43);
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(TestSimpleFeaturesGenerator) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  uint8_t preprocessed_data[43 * 49];
+  TfLiteStatus generate_1sec_status = GenerateSimpleFeatures_1sec(
+      error_reporter, captured_data, preprocessed_data);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, generate_1sec_status);
+
+  // Map the model into a usable data structure. This doesn't involve any
+  // copying or parsing, it's a very lightweight operation.
+  const tflite::Model* model =
+      ::tflite::GetModel(g_tiny_conv_simple_features_model_data);
+  if (model->version() != TFLITE_SCHEMA_VERSION) {
+    error_reporter->Report(
+        "Model provided is schema version %d not equal "
+        "to supported version %d.\n",
+        model->version(), TFLITE_SCHEMA_VERSION);
+  }
+
+  // This pulls in all the operation implementations we need.
+  tflite::ops::micro::AllOpsResolver resolver;
+
+  // Create an area of memory to use for input, output, and intermediate arrays.
+  const int tensor_arena_size = 10 * 1024;
+  uint8_t tensor_arena[tensor_arena_size];
+  tflite::SimpleTensorAllocator tensor_allocator(tensor_arena,
+                                                 tensor_arena_size);
+
+  // Build an interpreter to run the model with.
+  tflite::MicroInterpreter interpreter(model, resolver, &tensor_allocator,
+                                       error_reporter);
+
+  // Get information about the memory area to use for the model's input.
+  TfLiteTensor* input = interpreter.input(0);
+
+  // Make sure the input has the properties we expect.
+  TF_LITE_MICRO_EXPECT_NE(nullptr, input);
+  TF_LITE_MICRO_EXPECT_EQ(4, input->dims->size);
+  TF_LITE_MICRO_EXPECT_EQ(1, input->dims->data[0]);
+  TF_LITE_MICRO_EXPECT_EQ(49, input->dims->data[1]);
+  TF_LITE_MICRO_EXPECT_EQ(43, input->dims->data[2]);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteUInt8, input->type);
+
+  // Copy a spectrogram created from a .wav audio file of someone saying "Yes",
+  // into the memory area used for the input.
+  for (int i = 0; i < input->bytes; ++i) {
+    input->data.uint8[i] = preprocessed_data[i];
+  }
+
+  // Run the model on this input and make sure it succeeds.
+  TfLiteStatus invoke_status = interpreter.Invoke();
+  if (invoke_status != kTfLiteOk) {
+    error_reporter->Report("Invoke failed\n");
+  }
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, invoke_status);
+
+  // Get the output from the model, and make sure it's the expected size and
+  // type.
+  TfLiteTensor* output = interpreter.output(0);
+  TF_LITE_MICRO_EXPECT_EQ(2, output->dims->size);
+  TF_LITE_MICRO_EXPECT_EQ(1, output->dims->data[0]);
+  TF_LITE_MICRO_EXPECT_EQ(4, output->dims->data[1]);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteUInt8, output->type);
+
+  // There are four possible classes in the output, each with a score.
+  const int kSilenceIndex = 0;
+  const int kUnknownIndex = 1;
+  const int kYesIndex = 2;
+  const int kNoIndex = 3;
+
+  // Make sure that the expected "Yes" score is higher than the other classes.
+  g_silence_score = output->data.uint8[kSilenceIndex];
+  g_unknown_score = output->data.uint8[kUnknownIndex];
+  g_yes_score = output->data.uint8[kYesIndex];
+  g_no_score = output->data.uint8[kNoIndex];
+
+  error_reporter->Report("Ran successfully\n");
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc
index c0365d56901b503628b323a2fe09a4fa0de9165e..08811c83b437e66bf1e77a1a1f32d1cb5be02c43 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc
@@ -15,10 +15,11 @@ limitations under the License.
 
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h"
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
 
 namespace {
 int16_t g_dummy_audio_data[kMaxAudioSampleSize];
+int32_t g_latest_audio_timestamp = 0;
 }  // namespace
 
 TfLiteStatus GetAudioSamples(tflite::ErrorReporter* error_reporter,
@@ -31,3 +32,8 @@ TfLiteStatus GetAudioSamples(tflite::ErrorReporter* error_reporter,
   *audio_samples = g_dummy_audio_data;
   return kTfLiteOk;
 }
+
+int32_t LatestAudioTimestamp() {
+  g_latest_audio_timestamp += 100;
+  return g_latest_audio_timestamp;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h
index 7e2442a5e83ee1f809f82587c816adb01dc09e5e..b69067364198d7285d3f2bfc34208168effacb35 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h
@@ -33,4 +33,14 @@ TfLiteStatus GetAudioSamples(tflite::ErrorReporter* error_reporter,
                              int start_ms, int duration_ms,
                              int* audio_samples_size, int16_t** audio_samples);
 
+// Returns the time that audio data was last captured in milliseconds. There's
+// no contract about what time zero represents, the accuracy, or the granularity
+// of the result. Subsequent calls will generally not return a lower value, but
+// even that's not guaranteed if there's an overflow  wraparound.
+// The reference implementation of this function just returns a constantly
+// incrementing value for each call, since it would need a non-portable platform
+// call to access time information. For real applications, you'll need to write
+// your own platform-specific implementation.
+int32_t LatestAudioTimestamp();
+
 #endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_AUDIO_PROVIDER_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_mock.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_mock.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9c9792510b055e243ab4f6e804717647afa0b418
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_mock.cc
@@ -0,0 +1,57 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h"
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.h"
+
+namespace {
+int16_t g_dummy_audio_data[kMaxAudioSampleSize];
+int32_t g_latest_audio_timestamp = 0;
+}  // namespace
+
+TfLiteStatus GetAudioSamples(tflite::ErrorReporter* error_reporter,
+                             int start_ms, int duration_ms,
+                             int* audio_samples_size, int16_t** audio_samples) {
+  const int yes_start = (0 * kAudioSampleFrequency) / 1000;
+  const int yes_end = (1000 * kAudioSampleFrequency) / 1000;
+  const int no_start = (4000 * kAudioSampleFrequency) / 1000;
+  const int no_end = (5000 * kAudioSampleFrequency) / 1000;
+  const int wraparound = (8000 * kAudioSampleFrequency) / 1000;
+  const int start_sample = (start_ms * kAudioSampleFrequency) / 1000;
+  for (int i = 0; i < kMaxAudioSampleSize; ++i) {
+    const int sample_index = (start_sample + i) % wraparound;
+    int16_t sample;
+    if ((sample_index >= yes_start) && (sample_index < yes_end)) {
+      sample = g_yes_1000ms_sample_data[sample_index - yes_start];
+    } else if ((sample_index >= no_start) && (sample_index < no_end)) {
+      sample = g_no_1000ms_sample_data[sample_index - no_start];
+    } else {
+      sample = 0;
+    }
+    g_dummy_audio_data[i] = sample;
+  }
+  *audio_samples_size = kMaxAudioSampleSize;
+  *audio_samples = g_dummy_audio_data;
+  return kTfLiteOk;
+}
+
+int32_t LatestAudioTimestamp() {
+  g_latest_audio_timestamp += 100;
+  return g_latest_audio_timestamp;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_mock_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_mock_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b73d436ad638277d3f052715c506668e7f163f17
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_mock_test.cc
@@ -0,0 +1,76 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h"
+
+#include <limits>
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.h"
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(TestAudioProviderMock) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  int audio_samples_size = 0;
+  int16_t* audio_samples = nullptr;
+  TfLiteStatus get_status =
+      GetAudioSamples(error_reporter, 0, kFeatureSliceDurationMs,
+                      &audio_samples_size, &audio_samples);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, get_status);
+  TF_LITE_MICRO_EXPECT_LE(audio_samples_size, kMaxAudioSampleSize);
+  TF_LITE_MICRO_EXPECT_NE(audio_samples, nullptr);
+  for (int i = 0; i < audio_samples_size; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(g_yes_1000ms_sample_data[i], audio_samples[i]);
+  }
+
+  get_status = GetAudioSamples(error_reporter, 500, kFeatureSliceDurationMs,
+                               &audio_samples_size, &audio_samples);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, get_status);
+  TF_LITE_MICRO_EXPECT_LE(audio_samples_size, kMaxAudioSampleSize);
+  TF_LITE_MICRO_EXPECT_NE(audio_samples, nullptr);
+  for (int i = 0; i < audio_samples_size; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(g_yes_1000ms_sample_data[i + 8000],
+                            audio_samples[i]);
+  }
+
+  get_status = GetAudioSamples(error_reporter, 1500, kFeatureSliceDurationMs,
+                               &audio_samples_size, &audio_samples);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, get_status);
+  TF_LITE_MICRO_EXPECT_LE(audio_samples_size, kMaxAudioSampleSize);
+  TF_LITE_MICRO_EXPECT_NE(audio_samples, nullptr);
+  for (int i = 0; i < audio_samples_size; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(0, audio_samples[i]);
+  }
+
+  get_status = GetAudioSamples(error_reporter, 12250, kFeatureSliceDurationMs,
+                               &audio_samples_size, &audio_samples);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, get_status);
+  TF_LITE_MICRO_EXPECT_LE(audio_samples_size, kMaxAudioSampleSize);
+  TF_LITE_MICRO_EXPECT_NE(audio_samples, nullptr);
+  for (int i = 0; i < audio_samples_size; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(g_no_1000ms_sample_data[i + 4000],
+                            audio_samples[i]);
+  }
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_test.cc
index 5f7c7605f0feb3fd3179a0edd5e51574b867ce68..f9212aa3491e99104c2a3f1f5e315e9e96481345 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_test.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_test.cc
@@ -14,8 +14,11 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h"
+
+#include <limits>
+
 #include "tensorflow/lite/c/c_api_internal.h"
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
 #include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
 #include "tensorflow/lite/experimental/micro/testing/micro_test.h"
 
@@ -41,4 +44,27 @@ TF_LITE_MICRO_TEST(TestAudioProvider) {
   }
 }
 
+TF_LITE_MICRO_TEST(TestTimer) {
+  // Make sure that the technically-undefined overflow behavior we rely on below
+  // works on this platform. It's still not guaranteed, but at least this is a
+  // sanity check.  Turn off when running with ASan, as it will complain about
+  // the following undefined behavior.
+#ifndef ADDRESS_SANITIZER
+  int32_t overflow_value = std::numeric_limits<int32_t>::max();
+  overflow_value += 1;
+  TF_LITE_MICRO_EXPECT_EQ(std::numeric_limits<int32_t>::min(), overflow_value);
+#endif
+
+  const int32_t first_time = LatestAudioTimestamp();
+  const int32_t second_time = LatestAudioTimestamp();
+
+  // It's possible that the timer may have wrapped around from +BIG_NUM to
+  // -BIG_NUM between the first and second calls, since we're storing
+  // milliseconds in a 32-bit integer. It's not reasonable that the call itself
+  // would have taken more than 2^31 milliseconds though, so look at the
+  // difference and rely on integer overflow to ensure it's accurate.
+  const int32_t time_delta = (second_time - first_time);
+  TF_LITE_MICRO_EXPECT_LE(0, time_delta);
+}
+
 TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/disco_f746ng/Makefile.inc b/tensorflow/lite/experimental/micro/examples/micro_speech/disco_f746ng/Makefile.inc
new file mode 100644
index 0000000000000000000000000000000000000000..5585ed7269b71d279f1dd22cb9dd04120e7dd37f
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/disco_f746ng/Makefile.inc
@@ -0,0 +1,7 @@
+# Settings for the Discovery STM32F746NG board.
+ifneq ($(filter disco_f746ng,$(ALL_TAGS)),)
+  MBED_PROJECT_FILES += \
+    AUDIO_DISCO_F746NG.lib \
+    BSP_DISCO_F746NG.lib \
+    SDRAM_DISCO_F746NG.lib
+endif
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/disco_f746ng/audio_provider.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/disco_f746ng/audio_provider.cc
new file mode 100644
index 0000000000000000000000000000000000000000..49fea826759956d479e9171e2ba7a41331e31023
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/disco_f746ng/audio_provider.cc
@@ -0,0 +1,182 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h"
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
+
+#include "AUDIO_DISCO_F746NG.h"
+#include "SDRAM_DISCO_F746NG.h"
+#include "mbed.h"  // NOLINT
+
+namespace {
+
+bool g_is_audio_initialized = false;
+constexpr int kAudioCaptureBufferSize = kAudioSampleFrequency * 0.5;
+int16_t g_audio_capture_buffer[kAudioCaptureBufferSize];
+int16_t g_audio_output_buffer[kMaxAudioSampleSize];
+int32_t g_latest_audio_timestamp = 0;
+
+// For a full example of how to access audio on the STM32F746NG board, see
+// https://os.mbed.com/teams/ST/code/DISCO-F746NG_AUDIO_demo/
+AUDIO_DISCO_F746NG g_audio_device;
+SDRAM_DISCO_F746NG g_sdram_device;
+
+typedef enum {
+  BUFFER_OFFSET_NONE = 0,
+  BUFFER_OFFSET_HALF = 1,
+  BUFFER_OFFSET_FULL = 2,
+} BUFFER_StateTypeDef;
+
+#define AUDIO_BLOCK_SIZE ((uint32_t)2048)
+#define AUDIO_BUFFER_IN SDRAM_DEVICE_ADDR /* In SDRAM */
+#define AUDIO_BUFFER_OUT \
+  (SDRAM_DEVICE_ADDR + (AUDIO_BLOCK_SIZE * 2)) /* In SDRAM */
+__IO uint32_t g_audio_rec_buffer_state = BUFFER_OFFSET_NONE;
+
+uint8_t SetSysClock_PLL_HSE_200MHz() {
+  RCC_ClkInitTypeDef RCC_ClkInitStruct;
+  RCC_OscInitTypeDef RCC_OscInitStruct;
+
+  // Enable power clock
+  __PWR_CLK_ENABLE();
+
+  // Enable HSE oscillator and activate PLL with HSE as source
+  RCC_OscInitStruct.OscillatorType = RCC_OSCILLATORTYPE_HSE;
+  RCC_OscInitStruct.HSEState = RCC_HSE_ON; /* External xtal on OSC_IN/OSC_OUT */
+
+  // Warning: this configuration is for a 25 MHz xtal clock only
+  RCC_OscInitStruct.PLL.PLLState = RCC_PLL_ON;
+  RCC_OscInitStruct.PLL.PLLSource = RCC_PLLSOURCE_HSE;
+  RCC_OscInitStruct.PLL.PLLM = 25;   // VCO input clock = 1 MHz (25 MHz / 25)
+  RCC_OscInitStruct.PLL.PLLN = 400;  // VCO output clock = 400 MHz (1 MHz * 400)
+  RCC_OscInitStruct.PLL.PLLP = RCC_PLLP_DIV2;  // PLLCLK = 200 MHz (400 MHz / 2)
+  RCC_OscInitStruct.PLL.PLLQ = 8;  // USB clock = 50 MHz (400 MHz / 8)
+
+  if (HAL_RCC_OscConfig(&RCC_OscInitStruct) != HAL_OK) {
+    return 0;  // FAIL
+  }
+
+  // Activate the OverDrive to reach the 216 MHz Frequency
+  if (HAL_PWREx_EnableOverDrive() != HAL_OK) {
+    return 0;  // FAIL
+  }
+
+  // Select PLL as system clock source and configure the HCLK, PCLK1 and PCLK2
+  // clocks dividers
+  RCC_ClkInitStruct.ClockType = (RCC_CLOCKTYPE_SYSCLK | RCC_CLOCKTYPE_HCLK |
+                                 RCC_CLOCKTYPE_PCLK1 | RCC_CLOCKTYPE_PCLK2);
+  RCC_ClkInitStruct.SYSCLKSource = RCC_SYSCLKSOURCE_PLLCLK;  // 200 MHz
+  RCC_ClkInitStruct.AHBCLKDivider = RCC_SYSCLK_DIV1;         // 200 MHz
+  RCC_ClkInitStruct.APB1CLKDivider = RCC_HCLK_DIV4;          //  50 MHz
+  RCC_ClkInitStruct.APB2CLKDivider = RCC_HCLK_DIV2;          // 100 MHz
+
+  if (HAL_RCC_ClockConfig(&RCC_ClkInitStruct, FLASH_LATENCY_7) != HAL_OK) {
+    return 0;  // FAIL
+  }
+  HAL_RCC_MCOConfig(RCC_MCO1, RCC_MCO1SOURCE_HSE, RCC_MCODIV_4);
+  return 1;  // OK
+}
+
+TfLiteStatus InitAudioRecording(tflite::ErrorReporter* error_reporter) {
+  SetSysClock_PLL_HSE_200MHz();
+
+  // Initialize SDRAM buffers.
+  memset((uint16_t*)AUDIO_BUFFER_IN, 0, AUDIO_BLOCK_SIZE * 2);
+  memset((uint16_t*)AUDIO_BUFFER_OUT, 0, AUDIO_BLOCK_SIZE * 2);
+  g_audio_rec_buffer_state = BUFFER_OFFSET_NONE;
+
+  // Start Recording.
+  g_audio_device.IN_Record((uint16_t*)AUDIO_BUFFER_IN, AUDIO_BLOCK_SIZE);
+
+  // Also play results out to headphone jack.
+  g_audio_device.OUT_SetAudioFrameSlot(CODEC_AUDIOFRAME_SLOT_02);
+  g_audio_device.OUT_Play((uint16_t*)AUDIO_BUFFER_OUT, AUDIO_BLOCK_SIZE * 2);
+
+  return kTfLiteOk;
+}
+
+void CaptureSamples(const int16_t* sample_data) {
+  const int sample_size = AUDIO_BLOCK_SIZE / (sizeof(int16_t) * 2);
+  const int32_t time_in_ms =
+      g_latest_audio_timestamp + (sample_size / (kAudioSampleFrequency / 1000));
+
+  const int32_t start_sample_offset =
+      g_latest_audio_timestamp * (kAudioSampleFrequency / 1000);
+  for (int i = 0; i < sample_size; ++i) {
+    const int capture_index =
+        (start_sample_offset + i) % kAudioCaptureBufferSize;
+    g_audio_capture_buffer[capture_index] =
+        (sample_data[(i * 2) + 0] / 2) + (sample_data[(i * 2) + 1] / 2);
+  }
+  // This is how we let the outside world know that new audio data has arrived.
+  g_latest_audio_timestamp = time_in_ms;
+}
+
+}  // namespace
+
+// These callbacks need to be linkable symbols, because they override weak
+// default versions.
+void BSP_AUDIO_IN_TransferComplete_CallBack(void) {
+  g_audio_rec_buffer_state = BUFFER_OFFSET_FULL;
+  /* Copy recorded 1st half block */
+  memcpy((uint16_t*)(AUDIO_BUFFER_OUT), (uint16_t*)(AUDIO_BUFFER_IN),
+         AUDIO_BLOCK_SIZE);
+  CaptureSamples(reinterpret_cast<int16_t*>(AUDIO_BUFFER_IN));
+  return;
+}
+
+// Another weak symbol override.
+void BSP_AUDIO_IN_HalfTransfer_CallBack(void) {
+  g_audio_rec_buffer_state = BUFFER_OFFSET_HALF;
+  /* Copy recorded 2nd half block */
+  memcpy((uint16_t*)(AUDIO_BUFFER_OUT + (AUDIO_BLOCK_SIZE)),
+         (uint16_t*)(AUDIO_BUFFER_IN + (AUDIO_BLOCK_SIZE)), AUDIO_BLOCK_SIZE);
+  CaptureSamples(
+      reinterpret_cast<int16_t*>(AUDIO_BUFFER_IN + AUDIO_BLOCK_SIZE));
+  return;
+}
+
+// Main entry point for getting audio data.
+TfLiteStatus GetAudioSamples(tflite::ErrorReporter* error_reporter,
+                             int start_ms, int duration_ms,
+                             int* audio_samples_size, int16_t** audio_samples) {
+  if (!g_is_audio_initialized) {
+    TfLiteStatus init_status = InitAudioRecording(error_reporter);
+    if (init_status != kTfLiteOk) {
+      return init_status;
+    }
+    g_is_audio_initialized = true;
+  }
+  // This should only be called when the main thread notices that the latest
+  // audio sample data timestamp has changed, so that there's new data in the
+  // capture ring buffer. The ring buffer will eventually wrap around and
+  // overwrite the data, but the assumption is that the main thread is checking
+  // often enough and the buffer is large enough that this call will be made
+  // before that happens.
+  const int start_offset = start_ms * (kAudioSampleFrequency / 1000);
+  const int duration_sample_count =
+      duration_ms * (kAudioSampleFrequency / 1000);
+  for (int i = 0; i < duration_sample_count; ++i) {
+    const int capture_index = (start_offset + i) % kAudioCaptureBufferSize;
+    g_audio_output_buffer[i] = g_audio_capture_buffer[capture_index];
+  }
+
+  *audio_samples_size = kMaxAudioSampleSize;
+  *audio_samples = g_audio_output_buffer;
+  return kTfLiteOk;
+}
+
+int32_t LatestAudioTimestamp() { return g_latest_audio_timestamp; }
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/timer.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/disco_f746ng/timer.cc
similarity index 81%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/timer.cc
rename to tensorflow/lite/experimental/micro/examples/micro_speech/disco_f746ng/timer.cc
index 6c96a61ab517487413e875dc7369bddb1c9a0d9a..a8f0fe4bd50c3b6d16a426adc461ea125cbc9859 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/timer.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/disco_f746ng/timer.cc
@@ -15,8 +15,10 @@ limitations under the License.
 
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/timer.h"
 
-int32_t TimeInMilliseconds() {
-  static int current_time = 0;
-  current_time += 100;
-  return current_time;
+namespace {
+int32_t g_current_time = 0;
 }
+
+void SetTimeInMilliseconds(int32_t time) { g_current_time = time; }
+
+int32_t TimeInMilliseconds() { return g_current_time; }
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc
index c4c52ac0ff3696a05192465f8ac911b5d6a83925..b5dfa3d944076a21cde2dfafc6ce1ed39f15164d 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc
@@ -16,22 +16,13 @@ limitations under the License.
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h"
 
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h"
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h"
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/timer.h"
-
-namespace {
-// Stores the timestamp for the previous fetch of audio data, so that we can
-// avoid recalculating all the features from scratch if some earlier timeslices
-// are still present.
-int32_t g_last_time_in_ms = 0;
-// Make sure we don't try to use cached information if this is the first call
-// into the provider.
-bool g_is_first_run = true;
-}  // namespace
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
 
 FeatureProvider::FeatureProvider(int feature_size, uint8_t* feature_data)
-    : feature_size_(feature_size), feature_data_(feature_data) {
+    : feature_size_(feature_size),
+      feature_data_(feature_data),
+      is_first_run_(true) {
   // Initialize the feature data to default values.
   for (int n = 0; n < feature_size_; ++n) {
     feature_data_[n] = 0;
@@ -41,24 +32,27 @@ FeatureProvider::FeatureProvider(int feature_size, uint8_t* feature_data)
 FeatureProvider::~FeatureProvider() {}
 
 TfLiteStatus FeatureProvider::PopulateFeatureData(
-    tflite::ErrorReporter* error_reporter, int* how_many_new_slices) {
+    tflite::ErrorReporter* error_reporter, int32_t last_time_in_ms,
+    int32_t time_in_ms, int* how_many_new_slices) {
   if (feature_size_ != kFeatureElementCount) {
     error_reporter->Report("Requested feature_data_ size %d doesn't match %d",
                            feature_size_, kFeatureElementCount);
     return kTfLiteError;
   }
 
-  const int32_t time_in_ms = TimeInMilliseconds();
   // Quantize the time into steps as long as each window stride, so we can
   // figure out which audio data we need to fetch.
-  const int last_step = (g_last_time_in_ms / kFeatureSliceStrideMs);
+  const int last_step = (last_time_in_ms / kFeatureSliceStrideMs);
   const int current_step = (time_in_ms / kFeatureSliceStrideMs);
-  g_last_time_in_ms = time_in_ms;
 
   int slices_needed = current_step - last_step;
   // If this is the first call, make sure we don't use any cached information.
-  if (g_is_first_run) {
-    g_is_first_run = false;
+  if (is_first_run_) {
+    TfLiteStatus init_status = InitializeMicroFeatures(error_reporter);
+    if (init_status != kTfLiteOk) {
+      return init_status;
+    }
+    is_first_run_ = false;
     slices_needed = kFeatureSliceCount;
   }
   if (slices_needed > kFeatureSliceCount) {
@@ -104,16 +98,17 @@ TfLiteStatus FeatureProvider::PopulateFeatureData(
       GetAudioSamples(error_reporter, slice_start_ms, kFeatureSliceDurationMs,
                       &audio_samples_size, &audio_samples);
       if (audio_samples_size < kMaxAudioSampleSize) {
-        error_reporter->Report("Audio data size %d  too small, want %d",
+        error_reporter->Report("Audio data size %d too small, want %d",
                                audio_samples_size, kMaxAudioSampleSize);
         return kTfLiteError;
       }
       uint8_t* new_slice_data = feature_data_ + (new_slice * kFeatureSliceSize);
-      TfLiteStatus preprocess_status =
-          Preprocess(error_reporter, audio_samples, audio_samples_size,
-                     kFeatureSliceSize, new_slice_data);
-      if (preprocess_status != kTfLiteOk) {
-        return preprocess_status;
+      size_t num_samples_read;
+      TfLiteStatus generate_status = GenerateMicroFeatures(
+          error_reporter, audio_samples, audio_samples_size, kFeatureSliceSize,
+          new_slice_data, &num_samples_read);
+      if (generate_status != kTfLiteOk) {
+        return generate_status;
       }
     }
   }
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h
index a86c56ebf053a8807e38c42c6a7088c146a31b9e..ee3a480e947eced06e30ac089433f44e18d6adc3 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h
@@ -38,11 +38,15 @@ class FeatureProvider {
   // Fills the feature data with information from audio inputs, and returns how
   // many feature slices were updated.
   TfLiteStatus PopulateFeatureData(tflite::ErrorReporter* error_reporter,
+                                   int32_t last_time_in_ms, int32_t time_in_ms,
                                    int* how_many_new_slices);
 
  private:
   int feature_size_;
   uint8_t* feature_data_;
+  // Make sure we don't try to use cached information if this is the first call
+  // into the provider.
+  bool is_first_run_;
 };
 
 #endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_FEATURE_PROVIDER_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_mock_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_mock_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b05912e26f8a7f5e89b9f45766adf4270c033ed5
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_mock_test.cc
@@ -0,0 +1,66 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(TestFeatureProviderMockYes) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  uint8_t feature_data[kFeatureElementCount];
+  FeatureProvider feature_provider(kFeatureElementCount, feature_data);
+
+  int how_many_new_slices = 0;
+  TfLiteStatus populate_status = feature_provider.PopulateFeatureData(
+      error_reporter, /* last_time_in_ms= */ 0, /* time_in_ms= */ 970,
+      &how_many_new_slices);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, populate_status);
+  TF_LITE_MICRO_EXPECT_EQ(kFeatureSliceCount, how_many_new_slices);
+
+  for (int i = 0; i < kFeatureElementCount; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(g_yes_micro_f2e59fea_nohash_1_data[i],
+                            feature_data[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(TestFeatureProviderMockNo) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  uint8_t feature_data[kFeatureElementCount];
+  FeatureProvider feature_provider(kFeatureElementCount, feature_data);
+
+  int how_many_new_slices = 0;
+  TfLiteStatus populate_status = feature_provider.PopulateFeatureData(
+      error_reporter, /* last_time_in_ms= */ 4000, /* time_in_ms= */ 4970,
+      &how_many_new_slices);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, populate_status);
+  TF_LITE_MICRO_EXPECT_EQ(kFeatureSliceCount, how_many_new_slices);
+
+  for (int i = 0; i < kFeatureElementCount; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(g_no_micro_f9643d42_nohash_4_data[i],
+                            feature_data[i]);
+  }
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_test.cc
index 1e52aec8d2741678a0f79f643bb7dcf42c848a58..e7655a3be53ae6a032195dd4ca991f740bb19537 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_test.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_test.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h"
 #include "tensorflow/lite/c/c_api_internal.h"
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
 #include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
 #include "tensorflow/lite/experimental/micro/testing/micro_test.h"
 
@@ -30,7 +30,8 @@ TF_LITE_MICRO_TEST(TestFeatureProvider) {
 
   int how_many_new_slices = 0;
   TfLiteStatus populate_status = feature_provider.PopulateFeatureData(
-      error_reporter, &how_many_new_slices);
+      error_reporter, /* last_time_in_ms= */ 0, /* time_in_ms= */ 10000,
+      &how_many_new_slices);
   TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, populate_status);
   TF_LITE_MICRO_EXPECT_EQ(kFeatureSliceCount, how_many_new_slices);
 }
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/main.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/main.cc
index 1890c25cf2b44c96c549757b31f88255d4a9ee09..e71e62170e442e4139acbadb97268b6f74db6459 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/main.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/main.cc
@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h"
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h"
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h"
 #include "tensorflow/lite/experimental/micro/kernels/all_ops_resolver.h"
 #include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
 #include "tensorflow/lite/experimental/micro/micro_interpreter.h"
@@ -29,7 +31,8 @@ int main(int argc, char* argv[]) {
 
   // Map the model into a usable data structure. This doesn't involve any
   // copying or parsing, it's a very lightweight operation.
-  const tflite::Model* model = ::tflite::GetModel(g_tiny_conv_model_data);
+  const tflite::Model* model =
+      ::tflite::GetModel(g_tiny_conv_micro_features_model_data);
   if (model->version() != TFLITE_SCHEMA_VERSION) {
     error_reporter->Report(
         "Model provided is schema version %d not equal "
@@ -68,16 +71,21 @@ int main(int argc, char* argv[]) {
   FeatureProvider feature_provider(kFeatureElementCount,
                                    model_input->data.uint8);
 
+  RecognizeCommands recognizer(error_reporter);
+
+  int32_t previous_time = 0;
   // Keep reading and analysing audio data in an infinite loop.
   while (true) {
     // Fetch the spectrogram for the current time.
+    const int32_t current_time = LatestAudioTimestamp();
     int how_many_new_slices = 0;
     TfLiteStatus feature_status = feature_provider.PopulateFeatureData(
-        error_reporter, &how_many_new_slices);
+        error_reporter, previous_time, current_time, &how_many_new_slices);
     if (feature_status != kTfLiteOk) {
       error_reporter->Report("Feature generation failed");
       return 1;
     }
+    previous_time = current_time;
     // If no new audio samples have been received since last time, don't bother
     // running the network model.
     if (how_many_new_slices == 0) {
@@ -105,7 +113,20 @@ int main(int argc, char* argv[]) {
       }
     }
 
-    error_reporter->Report("Heard %s", kCategoryLabels[top_category_index]);
+    const char* found_command = nullptr;
+    uint8_t score = 0;
+    bool is_new_command = false;
+    TfLiteStatus process_status = recognizer.ProcessLatestResults(
+        output, current_time, &found_command, &score, &is_new_command);
+    if (process_status != kTfLiteOk) {
+      error_reporter->Report(
+          "RecognizeCommands::ProcessLatestResults() failed");
+      return 1;
+    }
+    if (is_new_command) {
+      error_reporter->Report("Heard %s (%d) @%dms", found_command, score,
+                             current_time);
+    }
   }
 
   return 0;
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/BUILD b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..1e684e1efd0bfbc676635e8c3233ef6284e6954d
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/BUILD
@@ -0,0 +1,300 @@
+# Library for generating feature vectors from audio data
+
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow/lite/experimental/micro/testing:micro_test.bzl",
+    "tflite_micro_cc_test",
+)
+
+cc_library(
+    name = "micro_model_settings",
+    srcs = [
+        "micro_model_settings.cc",
+    ],
+    hdrs = [
+        "micro_model_settings.h",
+    ],
+)
+
+cc_library(
+    name = "tiny_conv_micro_features_model_data",
+    srcs = [
+        "tiny_conv_micro_features_model_data.cc",
+    ],
+    hdrs = [
+        "tiny_conv_micro_features_model_data.h",
+    ],
+)
+
+cc_library(
+    name = "micro_features_test_data",
+    srcs = [
+        "no_micro_features_data.cc",
+        "yes_micro_features_data.cc",
+    ],
+    hdrs = [
+        "no_micro_features_data.h",
+        "yes_micro_features_data.h",
+    ],
+)
+
+cc_library(
+    name = "bits",
+    hdrs = ["bits.h"],
+)
+
+cc_library(
+    name = "static_alloc",
+    hdrs = ["static_alloc.h"],
+)
+
+cc_library(
+    name = "fft",
+    srcs = [
+        "fft.cc",
+        "fft_util.cc",
+    ],
+    hdrs = [
+        "fft.h",
+        "fft_util.h",
+    ],
+    deps = [
+        ":micro_model_settings",
+        ":static_alloc",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "@kissfft//:kiss_fftr_16",
+    ],
+)
+
+cc_library(
+    name = "filterbank",
+    srcs = [
+        "filterbank.cc",
+        "filterbank_util.cc",
+    ],
+    hdrs = [
+        "filterbank.h",
+        "filterbank_util.h",
+    ],
+    deps = [
+        ":bits",
+        ":fft",
+        ":micro_model_settings",
+        ":static_alloc",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+    ],
+)
+
+cc_library(
+    name = "frontend",
+    srcs = [
+        "frontend.cc",
+        "frontend_util.cc",
+    ],
+    hdrs = [
+        "frontend.h",
+        "frontend_util.h",
+    ],
+    deps = [
+        ":bits",
+        ":fft",
+        ":filterbank",
+        ":log_scale",
+        ":micro_model_settings",
+        ":noise_reduction",
+        ":pcan_gain_control",
+        ":window",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+    ],
+)
+
+cc_library(
+    name = "log_scale",
+    srcs = [
+        "log_lut.cc",
+        "log_scale.cc",
+        "log_scale_util.cc",
+    ],
+    hdrs = [
+        "log_lut.h",
+        "log_scale.h",
+        "log_scale_util.h",
+    ],
+    deps = [
+        ":bits",
+        ":micro_model_settings",
+        ":static_alloc",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+    ],
+)
+
+cc_library(
+    name = "noise_reduction",
+    srcs = [
+        "noise_reduction.cc",
+        "noise_reduction_util.cc",
+    ],
+    hdrs = [
+        "noise_reduction.h",
+        "noise_reduction_util.h",
+    ],
+    deps = [
+        ":micro_model_settings",
+        ":static_alloc",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+    ],
+)
+
+cc_library(
+    name = "pcan_gain_control",
+    srcs = [
+        "pcan_gain_control.cc",
+        "pcan_gain_control_util.cc",
+    ],
+    hdrs = [
+        "pcan_gain_control.h",
+        "pcan_gain_control_util.h",
+    ],
+    deps = [
+        ":bits",
+        ":micro_model_settings",
+        ":static_alloc",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+    ],
+)
+
+cc_library(
+    name = "window",
+    srcs = [
+        "window.cc",
+        "window_util.cc",
+    ],
+    hdrs = [
+        "window.h",
+        "window_util.h",
+    ],
+    deps = [
+        ":micro_model_settings",
+        ":static_alloc",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+    ],
+)
+
+cc_library(
+    name = "micro_features_generator",
+    srcs = [
+        "micro_features_generator.cc",
+    ],
+    hdrs = [
+        "micro_features_generator.h",
+    ],
+    deps = [
+        ":frontend",
+        ":micro_model_settings",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+    ],
+)
+
+cc_library(
+    name = "micro_features_generator_test_data",
+    srcs = [
+        "no_feature_data_slice.cc",
+        "yes_feature_data_slice.cc",
+    ],
+    hdrs = [
+        "no_feature_data_slice.h",
+        "yes_feature_data_slice.h",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "fft_test",
+    srcs = ["fft_test.cc"],
+    deps = [
+        ":fft",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "filterbank_test",
+    srcs = ["filterbank_test.cc"],
+    deps = [
+        ":filterbank",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "frontend_test",
+    srcs = ["frontend_test.cc"],
+    deps = [
+        ":frontend",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "log_scale_test",
+    srcs = ["log_scale_test.cc"],
+    deps = [
+        ":log_scale",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "noise_reduction_test",
+    srcs = ["noise_reduction_test.cc"],
+    deps = [
+        ":noise_reduction",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "pcan_gain_control_test",
+    srcs = ["pcan_gain_control_test.cc"],
+    deps = [
+        ":pcan_gain_control",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "window_test",
+    srcs = ["window_test.cc"],
+    deps = [
+        ":window",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "micro_features_generator_test",
+    srcs = [
+        "micro_features_generator_test.cc",
+    ],
+    deps = [
+        ":micro_features_generator",
+        ":micro_features_generator_test_data",
+        ":micro_model_settings",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/examples/micro_speech:audio_sample_test_data",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/bits.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/bits.h
new file mode 100644
index 0000000000000000000000000000000000000000..3b19ee6f030ae9fa8a931c6693cfe490747e336a
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/bits.h
@@ -0,0 +1,94 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_BITS_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_BITS_H_
+
+#include <cstdint>
+
+static inline int CountLeadingZeros32Slow(uint64_t n) {
+  int zeroes = 28;
+  if (n >> 16) zeroes -= 16, n >>= 16;
+  if (n >> 8) zeroes -= 8, n >>= 8;
+  if (n >> 4) zeroes -= 4, n >>= 4;
+  return "\4\3\2\2\1\1\1\1\0\0\0\0\0\0\0"[n] + zeroes;
+}
+
+static inline int CountLeadingZeros32(uint32_t n) {
+#if defined(_MSC_VER)
+  unsigned long result = 0;  // NOLINT(runtime/int)
+  if (_BitScanReverse(&result, n)) {
+    return 31 - result;
+  }
+  return 32;
+#elif defined(__GNUC__)
+
+  // Handle 0 as a special case because __builtin_clz(0) is undefined.
+  if (n == 0) {
+    return 32;
+  }
+  return __builtin_clz(n);
+#else
+  return CountLeadingZeros32Slow(n);
+#endif
+}
+
+static inline int MostSignificantBit32(uint32_t n) {
+  return 32 - CountLeadingZeros32(n);
+}
+
+static inline int CountLeadingZeros64Slow(uint64_t n) {
+  int zeroes = 60;
+  if (n >> 32) zeroes -= 32, n >>= 32;
+  if (n >> 16) zeroes -= 16, n >>= 16;
+  if (n >> 8) zeroes -= 8, n >>= 8;
+  if (n >> 4) zeroes -= 4, n >>= 4;
+  return "\4\3\2\2\1\1\1\1\0\0\0\0\0\0\0"[n] + zeroes;
+}
+
+static inline int CountLeadingZeros64(uint64_t n) {
+#if defined(_MSC_VER) && defined(_M_X64)
+  // MSVC does not have __buitin_clzll. Use _BitScanReverse64.
+  unsigned long result = 0;  // NOLINT(runtime/int)
+  if (_BitScanReverse64(&result, n)) {
+    return 63 - result;
+  }
+  return 64;
+#elif defined(_MSC_VER)
+  // MSVC does not have __buitin_clzll. Compose two calls to _BitScanReverse
+  unsigned long result = 0;  // NOLINT(runtime/int)
+  if ((n >> 32) && _BitScanReverse(&result, n >> 32)) {
+    return 31 - result;
+  }
+  if (_BitScanReverse(&result, n)) {
+    return 63 - result;
+  }
+  return 64;
+#elif defined(__GNUC__)
+
+  // Handle 0 as a special case because __builtin_clzll(0) is undefined.
+  if (n == 0) {
+    return 64;
+  }
+  return __builtin_clzll(n);
+#else
+  return CountLeadingZeros64Slow(n);
+#endif
+}
+
+static inline int MostSignificantBit64(uint64_t n) {
+  return 64 - CountLeadingZeros64(n);
+}
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_BITS_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cde4e38740e65cf56cd179d577528263177a649e
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.cc
@@ -0,0 +1,54 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.h"
+
+#include <string.h>
+
+#define FIXED_POINT 16
+#include "kiss_fft.h"
+// Internal test dependency placeholder1
+// Internal test dependency placeholder2
+#include "tools/kiss_fftr.h"
+// Internal test dependency placeholder3
+
+void FftCompute(struct FftState* state, const int16_t* input,
+                int input_scale_shift) {
+  const size_t input_size = state->input_size;
+  const size_t fft_size = state->fft_size;
+
+  int16_t* fft_input = state->input;
+  // First, scale the input by the given shift.
+  int i;
+  for (i = 0; i < input_size; ++i) {
+    *fft_input++ = (*input++) << input_scale_shift;
+  }
+  // Zero out whatever else remains in the top part of the input.
+  for (; i < fft_size; ++i) {
+    *fft_input++ = 0;
+  }
+
+  // Apply the FFT.
+  kiss_fftr(reinterpret_cast<const kiss_fftr_cfg>(state->scratch), state->input,
+            reinterpret_cast<kiss_fft_cpx*>(state->output));
+}
+
+void FftInit(struct FftState* state) {
+  // All the initialization is done in FftPopulateState()
+}
+
+void FftReset(struct FftState* state) {
+  memset(state->input, 0, state->fft_size * sizeof(*state->input));
+  memset(state->output, 0, (state->fft_size / 2 + 1) * sizeof(*state->output));
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.h
new file mode 100644
index 0000000000000000000000000000000000000000..d5d29f68a2cc5688f9644a2b556abb4787e3bb93
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.h
@@ -0,0 +1,48 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FFT_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FFT_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
+
+struct complex_int16_t {
+  int16_t real;
+  int16_t imag;
+};
+
+struct FftState {
+  int16_t input[kMaxAudioSampleSize];
+  struct complex_int16_t output[kMaxAudioSampleSize + 2];
+  size_t fft_size;
+  size_t input_size;
+  // This magic number was derived from KissFFT's estimate of how much space it
+  // will need to process the particular lengths and datatypes we need to for
+  // these model settings. This size will need to be recalculated for different
+  // models, but you will see a runtime error if it's not large enough.
+  char scratch[2848];
+  size_t scratch_size;
+};
+
+void FftCompute(struct FftState* state, const int16_t* input,
+                int input_scale_shift);
+
+void FftInit(struct FftState* state);
+
+void FftReset(struct FftState* state);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FFT_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b89b01445a641c8152aaff8165495688ab6861b2
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_test.cc
@@ -0,0 +1,55 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_util.h"
+
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+namespace {
+
+const int16_t kFakeWindow[] = {
+    0, 1151,   0, -5944, 0, 13311,  0, -21448, 0, 28327, 0, -32256, 0, 32255,
+    0, -28328, 0, 21447, 0, -13312, 0, 5943,   0, -1152, 0};
+const int kScaleShift = 0;
+
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(FftTest_CheckOutputValues) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  struct FftState state;
+  TF_LITE_MICRO_EXPECT(FftPopulateState(
+      error_reporter, &state, sizeof(kFakeWindow) / sizeof(kFakeWindow[0])));
+
+  FftInit(&state);
+  FftCompute(&state, kFakeWindow, kScaleShift);
+
+  const struct complex_int16_t expected[] = {
+      {0, 0},    {-10, 9},     {-20, 0},   {-9, -10},     {0, 25},  {-119, 119},
+      {-887, 0}, {3000, 3000}, {0, -6401}, {-3000, 3000}, {886, 0}, {118, 119},
+      {0, 25},   {9, -10},     {19, 0},    {9, 9},        {0, 0}};
+  TF_LITE_MICRO_EXPECT_EQ(state.fft_size / 2 + 1,
+                          sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i <= state.fft_size / 2; ++i) {
+    TF_LITE_MICRO_EXPECT_NEAR(state.output[i].real, expected[i].real, 2);
+    TF_LITE_MICRO_EXPECT_NEAR(state.output[i].imag, expected[i].imag, 2);
+  }
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_util.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ab742893197e6fda9ec2266e85997b555e0a4fc0
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_util.cc
@@ -0,0 +1,54 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_util.h"
+
+#define FIXED_POINT 16
+#include "kiss_fft.h"
+#include "tools/kiss_fftr.h"
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h"
+
+int FftPopulateState(tflite::ErrorReporter* error_reporter,
+                     struct FftState* state, size_t input_size) {
+  state->input_size = input_size;
+  state->fft_size = 1;
+  while (state->fft_size < state->input_size) {
+    state->fft_size <<= 1;
+  }
+
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(state->input,
+                                 (state->fft_size * sizeof(*state->input)));
+
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(
+      state->output, ((state->fft_size / 2 + 1) * sizeof(*state->output) * 2));
+
+  // Ask kissfft how much memory it wants.
+  size_t scratch_size = 0;
+  kiss_fftr_cfg kfft_cfg =
+      kiss_fftr_alloc(state->fft_size, 0, nullptr, &scratch_size);
+  if (kfft_cfg != nullptr) {
+    error_reporter->Report("Kiss memory sizing failed.");
+    return 0;
+  }
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(state->scratch, scratch_size);
+  state->scratch_size = scratch_size;
+  // Let kissfft configure the scratch space we just allocated
+  kfft_cfg = kiss_fftr_alloc(state->fft_size, 0, state->scratch, &scratch_size);
+  if (reinterpret_cast<char*>(kfft_cfg) != state->scratch) {
+    error_reporter->Report("Kiss memory preallocation strategy failed.");
+    return 0;
+  }
+  return 1;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_util.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..1dea097bc996e194cef7987431c67be3c976ed2b
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_util.h
@@ -0,0 +1,26 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FFT_UTIL_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FFT_UTIL_H_
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+
+// Prepares and FFT for the given input size.
+int FftPopulateState(tflite::ErrorReporter* error_reporter,
+                     struct FftState* state, size_t input_size);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FFT_UTIL_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.cc
new file mode 100644
index 0000000000000000000000000000000000000000..67f69dd67581ff6c15063b2467810f6c212ed1e5
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.cc
@@ -0,0 +1,135 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.h"
+
+#include <string.h>
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/bits.h"
+
+void FilterbankConvertFftComplexToEnergy(struct FilterbankState* state,
+                                         struct complex_int16_t* fft_output,
+                                         int32_t* energy) {
+  const int end_index = state->end_index;
+  int i;
+  energy += state->start_index;
+  fft_output += state->start_index;
+  for (i = state->start_index; i < end_index; ++i) {
+    const int32_t real = fft_output->real;
+    const int32_t imag = fft_output->imag;
+    fft_output++;
+    const uint32_t mag_squared = (real * real) + (imag * imag);
+    *energy++ = mag_squared;
+  }
+}
+
+void FilterbankAccumulateChannels(struct FilterbankState* state,
+                                  const int32_t* energy) {
+  uint64_t* work = state->work;
+  uint64_t weight_accumulator = 0;
+  uint64_t unweight_accumulator = 0;
+
+  const int16_t* channel_frequency_starts = state->channel_frequency_starts;
+  const int16_t* channel_weight_starts = state->channel_weight_starts;
+  const int16_t* channel_widths = state->channel_widths;
+
+  int num_channels_plus_1 = state->num_channels + 1;
+  int i;
+  for (i = 0; i < num_channels_plus_1; ++i) {
+    const int32_t* magnitudes = energy + *channel_frequency_starts++;
+    const int16_t* weights = state->weights + *channel_weight_starts;
+    const int16_t* unweights = state->unweights + *channel_weight_starts++;
+    const int width = *channel_widths++;
+    int j;
+    for (j = 0; j < width; ++j) {
+      weight_accumulator += *weights++ * (static_cast<uint64_t>(*magnitudes));
+      unweight_accumulator +=
+          *unweights++ * (static_cast<uint64_t>(*magnitudes));
+      ++magnitudes;
+    }
+    *work++ = weight_accumulator;
+    weight_accumulator = unweight_accumulator;
+    unweight_accumulator = 0;
+  }
+}
+
+static uint16_t Sqrt32(uint32_t num) {
+  if (num == 0) {
+    return 0;
+  }
+  uint32_t res = 0;
+  int max_bit_number = 32 - MostSignificantBit32(num);
+  max_bit_number |= 1;
+  uint32_t bit = 1U << (31 - max_bit_number);
+  int iterations = (31 - max_bit_number) / 2 + 1;
+  while (iterations--) {
+    if (num >= res + bit) {
+      num -= res + bit;
+      res = (res >> 1U) + bit;
+    } else {
+      res >>= 1U;
+    }
+    bit >>= 2U;
+  }
+  // Do rounding - if we have the bits.
+  if (num > res && res != 0xFFFF) {
+    ++res;
+  }
+  return res;
+}
+
+static uint32_t Sqrt64(uint64_t num) {
+  // Take a shortcut and just use 32 bit operations if the upper word is all
+  // clear. This will cause a slight off by one issue for numbers close to 2^32,
+  // but it probably isn't going to matter (and gives us a big performance win).
+  if ((num >> 32) == 0) {
+    return Sqrt32(static_cast<uint32_t>(num));
+  }
+  uint64_t res = 0;
+  int max_bit_number = 64 - MostSignificantBit64(num);
+  max_bit_number |= 1;
+  uint64_t bit = 1ULL << (63 - max_bit_number);
+  int iterations = (63 - max_bit_number) / 2 + 1;
+  while (iterations--) {
+    if (num >= res + bit) {
+      num -= res + bit;
+      res = (res >> 1U) + bit;
+    } else {
+      res >>= 1U;
+    }
+    bit >>= 2U;
+  }
+  // Do rounding - if we have the bits.
+  if (num > res && res != 0xFFFFFFFFLL) {
+    ++res;
+  }
+  return res;
+}
+
+uint32_t* FilterbankSqrt(struct FilterbankState* state, int scale_down_shift) {
+  const int num_channels = state->num_channels;
+  const int64_t* work = reinterpret_cast<int64_t*>(state->work + 1);
+  // Reuse the work buffer since we're fine clobbering it at this point to hold
+  // the output.
+  uint32_t* output = reinterpret_cast<uint32_t*>(state->work);
+  int i;
+  for (i = 0; i < num_channels; ++i) {
+    *output++ = Sqrt64(*work++) >> scale_down_shift;
+  }
+  return reinterpret_cast<uint32_t*>(state->work);
+}
+
+void FilterbankReset(struct FilterbankState* state) {
+  memset(state->work, 0, (state->num_channels + 1) * sizeof(*state->work));
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.h
new file mode 100644
index 0000000000000000000000000000000000000000..f7b479d4899a985482710dfcb12b908f50ec1690
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.h
@@ -0,0 +1,56 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FILTERBANK_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FILTERBANK_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
+
+#define kFilterbankBits 12
+
+struct FilterbankState {
+  int num_channels;
+  int start_index;
+  int end_index;
+  int16_t channel_frequency_starts[kFeatureSliceSize + 1];
+  int16_t channel_weight_starts[kFeatureSliceSize + 1];
+  int16_t channel_widths[kFeatureSliceSize + 1];
+  int16_t weights[316];
+  int16_t unweights[316];
+  uint64_t work[kFeatureSliceSize + 1];
+};
+
+// Converts the relevant complex values of an FFT output into energy (the
+// square magnitude).
+void FilterbankConvertFftComplexToEnergy(struct FilterbankState* state,
+                                         struct complex_int16_t* fft_output,
+                                         int32_t* energy);
+
+// Computes the mel-scale filterbank on the given energy array. Output is cached
+// internally - to fetch it, you need to call FilterbankSqrt.
+void FilterbankAccumulateChannels(struct FilterbankState* state,
+                                  const int32_t* energy);
+
+// Applies an integer square root to the 64 bit intermediate values of the
+// filterbank, and returns a pointer to them. Memory will be invalidated the
+// next time FilterbankAccumulateChannels is called.
+uint32_t* FilterbankSqrt(struct FilterbankState* state, int scale_down_shift);
+
+void FilterbankReset(struct FilterbankState* state);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FILTERBANK_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..682b216ace37fa0a809db3c06386b5b4b3ca94c8
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_test.cc
@@ -0,0 +1,228 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_util.h"
+
+#include <cstring>
+
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+namespace {
+
+const int kSampleRate = 1000;
+const int kSpectrumSize = 17;
+const int kStartIndex = 1;
+const int kEndIndex = 15;
+const int32_t kEnergy[] = {-1,     181,      400,      181,      625,    28322,
+                           786769, 18000000, 40972801, 18000000, 784996, 28085,
+                           625,    181,      361,      -1,       -1};
+const uint64_t kWork[] = {1835887, 61162970173, 258694800000};
+const int kScaleShift = 0;
+
+// Test filterbank generation using scaled-down defaults.
+class FilterbankTestConfig {
+ public:
+  FilterbankTestConfig() {
+    config_.num_channels = 2;
+    config_.lower_band_limit = 8.0;
+    config_.upper_band_limit = 450.0;
+  }
+
+  struct FilterbankConfig config_;
+};
+
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(FilterbankTest_CheckStartIndex) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  FilterbankTestConfig config;
+  struct FilterbankState state;
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(
+      error_reporter, &config.config_, &state, kSampleRate, kSpectrumSize));
+
+  TF_LITE_MICRO_EXPECT_EQ(state.start_index, kStartIndex);
+}
+
+TF_LITE_MICRO_TEST(FilterbankTest_CheckEndIndex) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  FilterbankTestConfig config;
+  struct FilterbankState state;
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(
+      error_reporter, &config.config_, &state, kSampleRate, kSpectrumSize));
+
+  TF_LITE_MICRO_EXPECT_EQ(state.end_index, kEndIndex);
+}
+
+TF_LITE_MICRO_TEST(FilterbankTest_CheckChannelFrequencyStarts) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  FilterbankTestConfig config;
+  struct FilterbankState state;
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(
+      error_reporter, &config.config_, &state, kSampleRate, kSpectrumSize));
+
+  const int16_t expected[] = {0, 4, 8};
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels + 1,
+                          sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i <= state.num_channels; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.channel_frequency_starts[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(FilterbankTest_CheckChannelWeightStarts) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  FilterbankTestConfig config;
+  struct FilterbankState state;
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(
+      error_reporter, &config.config_, &state, kSampleRate, kSpectrumSize));
+
+  const int16_t expected[] = {0, 8, 16};
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels + 1,
+                          sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i <= state.num_channels; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.channel_weight_starts[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(FilterbankTest_CheckChannelWidths) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  FilterbankTestConfig config;
+  struct FilterbankState state;
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(
+      error_reporter, &config.config_, &state, kSampleRate, kSpectrumSize));
+
+  const int16_t expected[] = {8, 8, 8};
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels + 1,
+                          sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i <= state.num_channels; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.channel_widths[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(FilterbankTest_CheckWeights) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  FilterbankTestConfig config;
+  struct FilterbankState state;
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(
+      error_reporter, &config.config_, &state, kSampleRate, kSpectrumSize));
+
+  const int16_t expected[] = {0, 3277, 2217, 1200, 222,  0,   0,   0,
+                              0, 3376, 2468, 1591, 744,  0,   0,   0,
+                              0, 4020, 3226, 2456, 1708, 983, 277, 0};
+  TF_LITE_MICRO_EXPECT_EQ(state.channel_weight_starts[state.num_channels] +
+                              state.channel_widths[state.num_channels],
+                          sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.weights[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(FilterbankTest_CheckUnweights) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  FilterbankTestConfig config;
+  struct FilterbankState state;
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(
+      error_reporter, &config.config_, &state, kSampleRate, kSpectrumSize));
+
+  const int16_t expected[] = {0, 819, 1879, 2896, 3874, 0,    0,    0,
+                              0, 720, 1628, 2505, 3352, 0,    0,    0,
+                              0, 76,  870,  1640, 2388, 3113, 3819, 0};
+  TF_LITE_MICRO_EXPECT_EQ(state.channel_weight_starts[state.num_channels] +
+                              state.channel_widths[state.num_channels],
+                          sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.unweights[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(FilterbankTest_CheckConvertFftComplexToEnergy) {
+  struct FilterbankState state;
+  state.start_index = kStartIndex;
+  state.end_index = kEndIndex;
+
+  struct complex_int16_t fake_fft[] = {
+      {0, 0},    {-10, 9},     {-20, 0},   {-9, -10},     {0, 25},  {-119, 119},
+      {-887, 0}, {3000, 3000}, {0, -6401}, {-3000, 3000}, {886, 0}, {118, 119},
+      {0, 25},   {9, -10},     {19, 0},    {9, 9},        {0, 0}};
+  int32_t* energy = reinterpret_cast<int32_t*>(fake_fft);
+  FilterbankConvertFftComplexToEnergy(&state, fake_fft, energy);
+
+  int i;
+  for (i = state.start_index; i < state.end_index; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(energy[i], kEnergy[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(FilterbankTest_CheckAccumulateChannels) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  FilterbankTestConfig config;
+  struct FilterbankState state;
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(
+      error_reporter, &config.config_, &state, kSampleRate, kSpectrumSize));
+
+  FilterbankAccumulateChannels(&state, kEnergy);
+
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels + 1,
+                          sizeof(kWork) / sizeof(kWork[0]));
+  int i;
+  for (i = 0; i <= state.num_channels; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.work[i], kWork[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(FilterbankTest_CheckSqrt) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  FilterbankTestConfig config;
+  struct FilterbankState state;
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(
+      error_reporter, &config.config_, &state, kSampleRate, kSpectrumSize));
+  std::memcpy(state.work, kWork, sizeof(kWork));
+
+  uint32_t* scaled_filterbank = FilterbankSqrt(&state, kScaleShift);
+
+  const uint32_t expected[] = {247311, 508620};
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels,
+                          sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < state.num_channels; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(scaled_filterbank[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_util.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ea4aa5179bc6e1d875127152b596f220a076191e
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_util.cc
@@ -0,0 +1,212 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_util.h"
+
+#include <assert.h>
+#include <math.h>
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h"
+
+#define kFilterbankIndexAlignment 4
+#define kFilterbankChannelBlockSize 4
+
+void FilterbankFillConfigWithDefaults(struct FilterbankConfig* config) {
+  config->num_channels = 32;
+  config->lower_band_limit = 125.0f;
+  config->upper_band_limit = 7500.0f;
+  config->output_scale_shift = 7;
+}
+
+static float FreqToMel(float freq) {
+  return 1127.0 * log(1.0 + (freq / 700.0));
+}
+
+static void CalculateCenterFrequencies(const int num_channels,
+                                       const float lower_frequency_limit,
+                                       const float upper_frequency_limit,
+                                       float* center_frequencies) {
+  assert(lower_frequency_limit >= 0.0f);
+  assert(upper_frequency_limit > lower_frequency_limit);
+
+  const float mel_low = FreqToMel(lower_frequency_limit);
+  const float mel_hi = FreqToMel(upper_frequency_limit);
+  const float mel_span = mel_hi - mel_low;
+  const float mel_spacing = mel_span / (static_cast<float>(num_channels));
+  int i;
+  for (i = 0; i < num_channels; ++i) {
+    center_frequencies[i] = mel_low + (mel_spacing * (i + 1));
+  }
+}
+
+static void QuantizeFilterbankWeights(const float float_weight, int16_t* weight,
+                                      int16_t* unweight) {
+  *weight = floor(float_weight * (1 << kFilterbankBits) + 0.5);
+  *unweight = floor((1.0 - float_weight) * (1 << kFilterbankBits) + 0.5);
+}
+
+int FilterbankPopulateState(tflite::ErrorReporter* error_reporter,
+                            const struct FilterbankConfig* config,
+                            struct FilterbankState* state, int sample_rate,
+                            int spectrum_size) {
+  state->num_channels = config->num_channels;
+  const int num_channels_plus_1 = config->num_channels + 1;
+
+  // How should we align things to index counts given the byte alignment?
+  const int index_alignment =
+      (kFilterbankIndexAlignment < sizeof(int16_t)
+           ? 1
+           : kFilterbankIndexAlignment / sizeof(int16_t));
+
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(
+      state->channel_frequency_starts,
+      (num_channels_plus_1 * sizeof(*state->channel_frequency_starts)));
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(
+      state->channel_weight_starts,
+      (num_channels_plus_1 * sizeof(*state->channel_weight_starts)));
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(
+      state->channel_widths,
+      (num_channels_plus_1 * sizeof(*state->channel_widths)));
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(state->work,
+                                 (num_channels_plus_1 * sizeof(*state->work)));
+
+  float center_mel_freqs[kFeatureSliceSize + 1];
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(
+      center_mel_freqs, (num_channels_plus_1 * sizeof(*center_mel_freqs)));
+
+  int16_t actual_channel_starts[kFeatureSliceSize + 1];
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(
+      actual_channel_starts,
+      (num_channels_plus_1 * sizeof(*actual_channel_starts)));
+
+  int16_t actual_channel_widths[kFeatureSliceSize + 1];
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(
+      actual_channel_widths,
+      (num_channels_plus_1 * sizeof(*actual_channel_widths)));
+
+  CalculateCenterFrequencies(num_channels_plus_1, config->lower_band_limit,
+                             config->upper_band_limit, center_mel_freqs);
+
+  // Always exclude DC.
+  const float hz_per_sbin =
+      0.5 * sample_rate / (static_cast<float>(spectrum_size) - 1);
+  state->start_index = 1.5 + config->lower_band_limit / hz_per_sbin;
+  state->end_index = 0;  // Initialized to zero here, but actually set below.
+
+  // For each channel, we need to figure out what frequencies belong to it, and
+  // how much padding we need to add so that we can efficiently multiply the
+  // weights and unweights for accumulation. To simplify the multiplication
+  // logic, all channels will have some multiplication to do (even if there are
+  // no frequencies that accumulate to that channel) - they will be directed to
+  // a set of zero weights.
+  int chan_freq_index_start = state->start_index;
+  int weight_index_start = 0;
+  int needs_zeros = 0;
+
+  int chan;
+  for (chan = 0; chan < num_channels_plus_1; ++chan) {
+    // Keep jumping frequencies until we overshoot the bound on this channel.
+    int freq_index = chan_freq_index_start;
+    while (FreqToMel((freq_index)*hz_per_sbin) <= center_mel_freqs[chan]) {
+      ++freq_index;
+    }
+
+    const int width = freq_index - chan_freq_index_start;
+    actual_channel_starts[chan] = chan_freq_index_start;
+    actual_channel_widths[chan] = width;
+
+    if (width == 0) {
+      // This channel doesn't actually get anything from the frequencies, it's
+      // always zero. We need then to insert some 'zero' weights into the
+      // output, and just redirect this channel to do a single multiplication at
+      // this point. For simplicity, the zeros are placed at the beginning of
+      // the weights arrays, so we have to go and update all the other
+      // weight_starts to reflect this shift (but only once).
+      state->channel_frequency_starts[chan] = 0;
+      state->channel_weight_starts[chan] = 0;
+      state->channel_widths[chan] = kFilterbankChannelBlockSize;
+      if (!needs_zeros) {
+        needs_zeros = 1;
+        int j;
+        for (j = 0; j < chan; ++j) {
+          state->channel_weight_starts[j] += kFilterbankChannelBlockSize;
+        }
+        weight_index_start += kFilterbankChannelBlockSize;
+      }
+    } else {
+      // How far back do we need to go to ensure that we have the proper
+      // alignment?
+      const int aligned_start =
+          (chan_freq_index_start / index_alignment) * index_alignment;
+      const int aligned_width = (chan_freq_index_start - aligned_start + width);
+      const int padded_width =
+          (((aligned_width - 1) / kFilterbankChannelBlockSize) + 1) *
+          kFilterbankChannelBlockSize;
+
+      state->channel_frequency_starts[chan] = aligned_start;
+      state->channel_weight_starts[chan] = weight_index_start;
+      state->channel_widths[chan] = padded_width;
+      weight_index_start += padded_width;
+    }
+    chan_freq_index_start = freq_index;
+  }
+
+  // Allocate the two arrays to store the weights - weight_index_start contains
+  // the index of what would be the next set of weights that we would need to
+  // add, so that's how many weights we need to allocate.
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(
+      state->weights, (weight_index_start * sizeof(*state->weights)));
+  for (int i = 0; i < weight_index_start; ++i) {
+    state->weights[i] = 0;
+  }
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(
+      state->unweights, (weight_index_start * sizeof(*state->unweights)));
+  for (int i = 0; i < weight_index_start; ++i) {
+    state->unweights[i] = 0;
+  }
+
+  // Next pass, compute all the weights. Since everything has been memset to
+  // zero, we only need to fill in the weights that correspond to some frequency
+  // for a channel.
+  const float mel_low = FreqToMel(config->lower_band_limit);
+  for (chan = 0; chan < num_channels_plus_1; ++chan) {
+    int frequency = actual_channel_starts[chan];
+    const int num_frequencies = actual_channel_widths[chan];
+    const int frequency_offset =
+        frequency - state->channel_frequency_starts[chan];
+    const int weight_start = state->channel_weight_starts[chan];
+    const float denom_val = (chan == 0) ? mel_low : center_mel_freqs[chan - 1];
+
+    int j;
+    for (j = 0; j < num_frequencies; ++j, ++frequency) {
+      const float weight =
+          (center_mel_freqs[chan] - FreqToMel(frequency * hz_per_sbin)) /
+          (center_mel_freqs[chan] - denom_val);
+
+      // Make the float into an integer for the weights (and unweights).
+      const int weight_index = weight_start + frequency_offset + j;
+      QuantizeFilterbankWeights(weight, state->weights + weight_index,
+                                state->unweights + weight_index);
+    }
+    if (frequency > state->end_index) {
+      state->end_index = frequency;
+    }
+  }
+
+  if (state->end_index >= spectrum_size) {
+    error_reporter->Report("Filterbank end_index is above spectrum size.");
+    return 0;
+  }
+  return 1;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_util.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..0bf0c8494ce11753c2f2c2185e6c1141d0adbc74
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_util.h
@@ -0,0 +1,42 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FILTERBANK_UTIL_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FILTERBANK_UTIL_H_
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+
+struct FilterbankConfig {
+  // number of frequency channel buckets for filterbank
+  int num_channels;
+  // maximum frequency to include
+  float upper_band_limit;
+  // minimum frequency to include
+  float lower_band_limit;
+  // unused
+  int output_scale_shift;
+};
+
+// Fills the frontendConfig with "sane" defaults.
+void FilterbankFillConfigWithDefaults(struct FilterbankConfig* config);
+
+// Allocates any buffers.
+int FilterbankPopulateState(tflite::ErrorReporter* error_reporter,
+                            const struct FilterbankConfig* config,
+                            struct FilterbankState* state, int sample_rate,
+                            int spectrum_size);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FILTERBANK_UTIL_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c609190e4db90adb3ec79e2a794b61923cea0978
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend.cc
@@ -0,0 +1,70 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend.h"
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/bits.h"
+
+struct FrontendOutput FrontendProcessSamples(struct FrontendState* state,
+                                             const int16_t* samples,
+                                             size_t num_samples,
+                                             size_t* num_samples_read) {
+  struct FrontendOutput output;
+  output.values = nullptr;
+  output.size = 0;
+
+  // Try to apply the window - if it fails, return and wait for more data.
+  if (!WindowProcessSamples(&state->window, samples, num_samples,
+                            num_samples_read)) {
+    return output;
+  }
+
+  // Apply the FFT to the window's output (and scale it so that the fixed point
+  // FFT can have as much resolution as possible).
+  int input_shift =
+      15 - MostSignificantBit32(state->window.max_abs_output_value);
+  FftCompute(&state->fft, state->window.output, input_shift);
+
+  // We can re-ruse the fft's output buffer to hold the energy.
+  int32_t* energy = reinterpret_cast<int32_t*>(state->fft.output);
+  FilterbankConvertFftComplexToEnergy(&state->filterbank, state->fft.output,
+                                      energy);
+  FilterbankAccumulateChannels(&state->filterbank, energy);
+  uint32_t* scaled_filterbank = FilterbankSqrt(&state->filterbank, input_shift);
+
+  // Apply noise reduction.
+  NoiseReductionApply(&state->noise_reduction, scaled_filterbank);
+
+  if (state->pcan_gain_control.enable_pcan) {
+    PcanGainControlApply(&state->pcan_gain_control, scaled_filterbank);
+  }
+
+  // Apply the log and scale.
+  int correction_bits =
+      MostSignificantBit32(state->fft.fft_size) - 1 - (kFilterbankBits / 2);
+  uint16_t* logged_filterbank =
+      LogScaleApply(&state->log_scale, scaled_filterbank,
+                    state->filterbank.num_channels, correction_bits);
+
+  output.size = state->filterbank.num_channels;
+  output.values = logged_filterbank;
+  return output;
+}
+
+void FrontendReset(struct FrontendState* state) {
+  WindowReset(&state->window);
+  FftReset(&state->fft);
+  FilterbankReset(&state->filterbank);
+  NoiseReductionReset(&state->noise_reduction);
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend.h
new file mode 100644
index 0000000000000000000000000000000000000000..3221d283e8740cd95fa39ece4ad9533d059018d7
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend.h
@@ -0,0 +1,56 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FRONTEND_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FRONTEND_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.h"
+
+struct FrontendState {
+  struct WindowState window;
+  struct FftState fft;
+  struct FilterbankState filterbank;
+  struct NoiseReductionState noise_reduction;
+  struct PcanGainControlState pcan_gain_control;
+  struct LogScaleState log_scale;
+};
+
+struct FrontendOutput {
+  const uint16_t* values;
+  size_t size;
+};
+
+// Main entry point to processing frontend samples. Updates num_samples_read to
+// contain the number of samples that have been consumed from the input array.
+// Returns a struct containing the generated output. If not enough samples were
+// added to generate a feature vector, the returned size will be 0 and the
+// values pointer will be NULL. Note that the output pointer will be invalidated
+// as soon as FrontendProcessSamples is called again, so copy the contents
+// elsewhere if you need to use them later.
+struct FrontendOutput FrontendProcessSamples(struct FrontendState* state,
+                                             const int16_t* samples,
+                                             size_t num_samples,
+                                             size_t* num_samples_read);
+
+void FrontendReset(struct FrontendState* state);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FRONTEND_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4d9f86d48d86b1795fa45b9bf24db4dd75fb0a20
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_test.cc
@@ -0,0 +1,134 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_util.h"
+
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+namespace {
+
+const int kSampleRate = 1000;
+const int kWindowSamples = 25;
+const int kStepSamples = 10;
+const int16_t kFakeAudioData[] = {
+    0, 32767, 0, -32768, 0, 32767, 0, -32768, 0, 32767, 0, -32768,
+    0, 32767, 0, -32768, 0, 32767, 0, -32768, 0, 32767, 0, -32768,
+    0, 32767, 0, -32768, 0, 32767, 0, -32768, 0, 32767, 0, -32768};
+
+// Test end-to-end frontend behaviors.
+class FrontendTestConfig {
+ public:
+  FrontendTestConfig() {
+    config_.window.size_ms = 25;
+    config_.window.step_size_ms = 10;
+    config_.noise_reduction.smoothing_bits = 10;
+    config_.filterbank.num_channels = 2;
+    config_.filterbank.lower_band_limit = 8.0;
+    config_.filterbank.upper_band_limit = 450.0;
+    config_.noise_reduction.smoothing_bits = 10;
+    config_.noise_reduction.even_smoothing = 0.025;
+    config_.noise_reduction.odd_smoothing = 0.06;
+    config_.noise_reduction.min_signal_remaining = 0.05;
+    config_.pcan_gain_control.enable_pcan = true;
+    config_.pcan_gain_control.strength = 0.95;
+    config_.pcan_gain_control.offset = 80.0;
+    config_.pcan_gain_control.gain_bits = 21;
+    config_.log_scale.enable_log = true;
+    config_.log_scale.scale_shift = 6;
+  }
+
+  struct FrontendConfig config_;
+};
+
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(FrontendTest_CheckOutputValues) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  FrontendTestConfig config;
+  struct FrontendState state;
+  TF_LITE_MICRO_EXPECT(FrontendPopulateState(error_reporter, &config.config_,
+                                             &state, kSampleRate));
+  size_t num_samples_read;
+
+  struct FrontendOutput output = FrontendProcessSamples(
+      &state, kFakeAudioData,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read);
+
+  const uint16_t expected[] = {479, 425};
+  TF_LITE_MICRO_EXPECT_EQ(output.size, sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < output.size; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(output.values[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(FrontendTest_CheckConsecutiveWindow) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  FrontendTestConfig config;
+  struct FrontendState state;
+  TF_LITE_MICRO_EXPECT(FrontendPopulateState(error_reporter, &config.config_,
+                                             &state, kSampleRate));
+  size_t num_samples_read;
+
+  FrontendProcessSamples(&state, kFakeAudioData,
+                         sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]),
+                         &num_samples_read);
+  struct FrontendOutput output = FrontendProcessSamples(
+      &state, kFakeAudioData + kWindowSamples,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) - kWindowSamples,
+      &num_samples_read);
+
+  const int16_t expected[] = {436, 378};
+  TF_LITE_MICRO_EXPECT_EQ(output.size, sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < output.size; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(output.values[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(FrontendTest_CheckNotEnoughSamples) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  FrontendTestConfig config;
+  struct FrontendState state;
+  TF_LITE_MICRO_EXPECT(FrontendPopulateState(error_reporter, &config.config_,
+                                             &state, kSampleRate));
+  size_t num_samples_read;
+
+  FrontendProcessSamples(&state, kFakeAudioData,
+                         sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]),
+                         &num_samples_read);
+  FrontendProcessSamples(
+      &state, kFakeAudioData + kWindowSamples,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) - kWindowSamples,
+      &num_samples_read);
+  struct FrontendOutput output = FrontendProcessSamples(
+      &state, kFakeAudioData + kWindowSamples + kStepSamples,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) - kWindowSamples -
+          kStepSamples,
+      &num_samples_read);
+
+  TF_LITE_MICRO_EXPECT_EQ(output.size, 0);
+  TF_LITE_MICRO_EXPECT_EQ(output.values, nullptr);
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_util.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..220bc130fb9332e4afbe02a4432b61c8a4bcd544
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_util.cc
@@ -0,0 +1,80 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_util.h"
+
+#include <string.h>
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/bits.h"
+
+void FrontendFillConfigWithDefaults(struct FrontendConfig* config) {
+  WindowFillConfigWithDefaults(&config->window);
+  FilterbankFillConfigWithDefaults(&config->filterbank);
+  NoiseReductionFillConfigWithDefaults(&config->noise_reduction);
+  PcanGainControlFillConfigWithDefaults(&config->pcan_gain_control);
+  LogScaleFillConfigWithDefaults(&config->log_scale);
+}
+
+int FrontendPopulateState(tflite::ErrorReporter* error_reporter,
+                          const struct FrontendConfig* config,
+                          struct FrontendState* state, int sample_rate) {
+  memset(state, 0, sizeof(*state));
+
+  if (!WindowPopulateState(error_reporter, &config->window, &state->window,
+                           sample_rate)) {
+    error_reporter->Report("Failed to populate window state");
+    return 0;
+  }
+
+  if (!FftPopulateState(error_reporter, &state->fft, state->window.size)) {
+    error_reporter->Report("Failed to populate fft state");
+    return 0;
+  }
+  FftInit(&state->fft);
+
+  if (!FilterbankPopulateState(error_reporter, &config->filterbank,
+                               &state->filterbank, sample_rate,
+                               state->fft.fft_size / 2 + 1)) {
+    error_reporter->Report("Failed to populate filterbank state");
+    return 0;
+  }
+
+  if (!NoiseReductionPopulateState(error_reporter, &config->noise_reduction,
+                                   &state->noise_reduction,
+                                   state->filterbank.num_channels)) {
+    error_reporter->Report("Failed to populate noise reduction state");
+    return 0;
+  }
+
+  int input_correction_bits =
+      MostSignificantBit32(state->fft.fft_size) - 1 - (kFilterbankBits / 2);
+  if (!PcanGainControlPopulateState(
+          error_reporter, &config->pcan_gain_control, &state->pcan_gain_control,
+          state->noise_reduction.estimate, state->filterbank.num_channels,
+          state->noise_reduction.smoothing_bits, input_correction_bits)) {
+    error_reporter->Report("Failed to populate pcan gain control state");
+    return 0;
+  }
+
+  if (!LogScalePopulateState(error_reporter, &config->log_scale,
+                             &state->log_scale)) {
+    error_reporter->Report("Failed to populate log scale state");
+    return 0;
+  }
+
+  FrontendReset(state);
+
+  // All good, return a true value.
+  return 1;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_util.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..f7267644ae053e1f816cb22bb5e0ecd04e4de0ef
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_util.h
@@ -0,0 +1,44 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FRONTEND_UTIL_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FRONTEND_UTIL_H_
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/fft_util.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/filterbank_util.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_util.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_util.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_util.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_util.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+
+struct FrontendConfig {
+  struct WindowConfig window;
+  struct FilterbankConfig filterbank;
+  struct NoiseReductionConfig noise_reduction;
+  struct PcanGainControlConfig pcan_gain_control;
+  struct LogScaleConfig log_scale;
+};
+
+// Fills the frontendConfig with "sane" defaults.
+void FrontendFillConfigWithDefaults(struct FrontendConfig* config);
+
+// Prepares any buffers.
+int FrontendPopulateState(tflite::ErrorReporter* error_reporter,
+                          const struct FrontendConfig* config,
+                          struct FrontendState* state, int sample_rate);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_FRONTEND_UTIL_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_lut.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_lut.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c651caad8c67773f20a485eea5519f286b1b0253
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_lut.cc
@@ -0,0 +1,30 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_lut.h"
+const uint16_t kLogLut[]
+#ifndef _MSC_VER
+    __attribute__((aligned(4)))
+#endif  // _MSV_VER
+    = {0,    224,  442,  654,  861,  1063, 1259, 1450, 1636, 1817, 1992, 2163,
+       2329, 2490, 2646, 2797, 2944, 3087, 3224, 3358, 3487, 3611, 3732, 3848,
+       3960, 4068, 4172, 4272, 4368, 4460, 4549, 4633, 4714, 4791, 4864, 4934,
+       5001, 5063, 5123, 5178, 5231, 5280, 5326, 5368, 5408, 5444, 5477, 5507,
+       5533, 5557, 5578, 5595, 5610, 5622, 5631, 5637, 5640, 5641, 5638, 5633,
+       5626, 5615, 5602, 5586, 5568, 5547, 5524, 5498, 5470, 5439, 5406, 5370,
+       5332, 5291, 5249, 5203, 5156, 5106, 5054, 5000, 4944, 4885, 4825, 4762,
+       4697, 4630, 4561, 4490, 4416, 4341, 4264, 4184, 4103, 4020, 3935, 3848,
+       3759, 3668, 3575, 3481, 3384, 3286, 3186, 3084, 2981, 2875, 2768, 2659,
+       2549, 2437, 2323, 2207, 2090, 1971, 1851, 1729, 1605, 1480, 1353, 1224,
+       1094, 963,  830,  695,  559,  421,  282,  142,  0,    0};
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/timer.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_lut.h
similarity index 57%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/timer.h
rename to tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_lut.h
index 162952844a832ebd0b0273d13a929fec6fa22892..d5ed9339bd02e23cc134992badce5cdb72a74771 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/timer.h
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_lut.h
@@ -12,20 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_LOG_LUT_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_LOG_LUT_H_
 
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_TIMER_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_TIMER_H_
+#include <stdint.h>
 
-#include <cstdint>
+// Number of segments in the log lookup table. The table will be kLogSegments+1
+// in length (with some padding).
+#define kLogSegments 128
+#define kLogSegmentsLog2 7
 
-// Returns the time in milliseconds. There's no contract about what time zero
-// represents, the accuracy, or the granularity of the result. Subsequent calls
-// will generally not return a lower value, but even that's not guaranteed if
-// there's an overflow  wraparound.
-// The reference implementation of this function just returns a constantly
-// incrementing value for each call, since it would need a non-portable platform
-// call to access time information. For real applications, you'll need to write
-// your own platform-specific implementation.
-int32_t TimeInMilliseconds();
+// Scale used by lookup table.
+#define kLogScale 65536
+#define kLogScaleLog2 16
+#define kLogCoeff 45426
 
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_TIMER_H_
+extern const uint16_t kLogLut[];
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_LOG_LUT_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f85e9c1a2f5a0056deaffad7b99b774c772ce562
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.cc
@@ -0,0 +1,84 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.h"
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/bits.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_lut.h"
+
+#define kuint16max 0x0000FFFF
+
+// The following functions implement integer logarithms of various sizes. The
+// approximation is calculated according to method described in
+//       www.inti.gob.ar/electronicaeinformatica/instrumentacion/utic/
+//       publicaciones/SPL2007/Log10-spl07.pdf
+// It first calculates log2 of the input and then converts it to natural
+// logarithm.
+
+static uint32_t Log2FractionPart(const uint32_t x, const uint32_t log2x) {
+  // Part 1
+  int32_t frac = x - (1LL << log2x);
+  if (log2x < kLogScaleLog2) {
+    frac <<= kLogScaleLog2 - log2x;
+  } else {
+    frac >>= log2x - kLogScaleLog2;
+  }
+  // Part 2
+  const uint32_t base_seg = frac >> (kLogScaleLog2 - kLogSegmentsLog2);
+  const uint32_t seg_unit =
+      ((static_cast<uint32_t>(1)) << kLogScaleLog2) >> kLogSegmentsLog2;
+
+  const int32_t c0 = kLogLut[base_seg];
+  const int32_t c1 = kLogLut[base_seg + 1];
+  const int32_t seg_base = seg_unit * base_seg;
+  const int32_t rel_pos = ((c1 - c0) * (frac - seg_base)) >> kLogScaleLog2;
+  return frac + c0 + rel_pos;
+}
+
+static uint32_t Log(const uint32_t x, const uint32_t scale_shift) {
+  const uint32_t integer = MostSignificantBit32(x) - 1;
+  const uint32_t fraction = Log2FractionPart(x, integer);
+  const uint32_t log2 = (integer << kLogScaleLog2) + fraction;
+  const uint32_t round = kLogScale / 2;
+  const uint32_t loge =
+      ((static_cast<uint64_t>(kLogCoeff)) * log2 + round) >> kLogScaleLog2;
+  // Finally scale to our output scale
+  const uint32_t loge_scaled = ((loge << scale_shift) + round) >> kLogScaleLog2;
+  return loge_scaled;
+}
+
+uint16_t* LogScaleApply(struct LogScaleState* state, uint32_t* signal,
+                        int signal_size, int correction_bits) {
+  const int scale_shift = state->scale_shift;
+  uint16_t* output = reinterpret_cast<uint16_t*>(signal);
+  uint16_t* ret = output;
+  int i;
+  for (i = 0; i < signal_size; ++i) {
+    uint32_t value = *signal++;
+    if (state->enable_log) {
+      if (correction_bits < 0) {
+        value >>= -correction_bits;
+      } else {
+        value <<= correction_bits;
+      }
+      if (value > 1) {
+        value = Log(value, scale_shift);
+      } else {
+        value = 0;
+      }
+    }
+    *output++ = (value < kuint16max) ? value : kuint16max;
+  }
+  return ret;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.h
new file mode 100644
index 0000000000000000000000000000000000000000..d90b87fb6d6fe181158b209a87a42f6d075ba457
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.h
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_LOG_SCALE_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_LOG_SCALE_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+struct LogScaleState {
+  int enable_log;
+  int scale_shift;
+};
+
+// Applies a fixed point logarithm to the signal and converts it to 16 bit. Note
+// that the signal array will be modified.
+uint16_t* LogScaleApply(struct LogScaleState* state, uint32_t* signal,
+                        int signal_size, int correction_bits);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_LOG_SCALE_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d5b23323a273314a347f25e691d538781558980a
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_test.cc
@@ -0,0 +1,63 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_util.h"
+
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+namespace {
+
+const int kScaleShift = 6;
+const int kCorrectionBits = -1;
+
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(LogScaleTest_CheckOutputValues) {
+  struct LogScaleState state;
+  state.enable_log = true;
+  state.scale_shift = kScaleShift;
+
+  uint32_t fake_signal[] = {3578, 1533};
+  uint16_t* output = LogScaleApply(&state, fake_signal,
+                                   sizeof(fake_signal) / sizeof(fake_signal[0]),
+                                   kCorrectionBits);
+
+  const uint16_t expected[] = {479, 425};
+  int i;
+  for (i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(output[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(LogScaleTest_CheckOutputValuesNoLog) {
+  struct LogScaleState state;
+  state.enable_log = false;
+  state.scale_shift = kScaleShift;
+
+  uint32_t fake_signal[] = {85964, 45998};
+  uint16_t* output = LogScaleApply(&state, fake_signal,
+                                   sizeof(fake_signal) / sizeof(fake_signal[0]),
+                                   kCorrectionBits);
+
+  const uint16_t expected[] = {65535, 45998};
+  int i;
+  for (i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(output[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_util.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..09adc09c3511fdedeb7246b6717fa4bfb4c83ba1
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_util.cc
@@ -0,0 +1,28 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_util.h"
+
+void LogScaleFillConfigWithDefaults(struct LogScaleConfig* config) {
+  config->enable_log = 1;
+  config->scale_shift = 6;
+}
+
+int LogScalePopulateState(tflite::ErrorReporter* error_reporter,
+                          const struct LogScaleConfig* config,
+                          struct LogScaleState* state) {
+  state->enable_log = config->enable_log;
+  state->scale_shift = config->scale_shift;
+  return 1;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_util.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..3caf207f2693756783b6c1dc64246d2522388d3b
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale_util.h
@@ -0,0 +1,40 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_LOG_SCALE_UTIL_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_LOG_SCALE_UTIL_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/log_scale.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+
+struct LogScaleConfig {
+  // set to false (0) to disable this module
+  int enable_log;
+  // scale results by 2^(scale_shift)
+  int scale_shift;
+};
+
+// Populates the LogScaleConfig with "sane" default values.
+void LogScaleFillConfigWithDefaults(struct LogScaleConfig* config);
+
+// Allocates any buffers.
+int LogScalePopulateState(tflite::ErrorReporter* error_reporter,
+                          const struct LogScaleConfig* config,
+                          struct LogScaleState* state);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_LOG_SCALE_UTIL_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6657c6f3205903c178b2aa4314551f5b4fee1101
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator.cc
@@ -0,0 +1,99 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator.h"
+
+#include <cmath>
+#include <cstring>
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/frontend_util.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
+
+namespace {
+
+FrontendState g_micro_features_state;
+bool g_is_first_time = true;
+
+}  // namespace
+
+TfLiteStatus InitializeMicroFeatures(tflite::ErrorReporter* error_reporter) {
+  FrontendConfig config;
+  config.window.size_ms = kFeatureSliceDurationMs;
+  config.window.step_size_ms = kFeatureSliceStrideMs;
+  config.noise_reduction.smoothing_bits = 10;
+  config.filterbank.num_channels = kFeatureSliceSize;
+  config.filterbank.lower_band_limit = 125.0;
+  config.filterbank.upper_band_limit = 7500.0;
+  config.noise_reduction.smoothing_bits = 10;
+  config.noise_reduction.even_smoothing = 0.025;
+  config.noise_reduction.odd_smoothing = 0.06;
+  config.noise_reduction.min_signal_remaining = 0.05;
+  config.pcan_gain_control.enable_pcan = 1;
+  config.pcan_gain_control.strength = 0.95;
+  config.pcan_gain_control.offset = 80.0;
+  config.pcan_gain_control.gain_bits = 21;
+  config.log_scale.enable_log = 1;
+  config.log_scale.scale_shift = 6;
+  if (!FrontendPopulateState(error_reporter, &config, &g_micro_features_state,
+                             kAudioSampleFrequency)) {
+    error_reporter->Report("FrontendPopulateState() failed");
+    return kTfLiteError;
+  }
+  g_is_first_time = true;
+  return kTfLiteOk;
+}
+
+// This is not exposed in any header, and is only used for testing, to ensure
+// that the state is correctly set up before generating results.
+void SetMicroFeaturesNoiseEstimates(const uint32_t* estimate_presets) {
+  for (int i = 0; i < g_micro_features_state.filterbank.num_channels; ++i) {
+    g_micro_features_state.noise_reduction.estimate[i] = estimate_presets[i];
+  }
+}
+
+TfLiteStatus GenerateMicroFeatures(tflite::ErrorReporter* error_reporter,
+                                   const int16_t* input, int input_size,
+                                   int output_size, uint8_t* output,
+                                   size_t* num_samples_read) {
+  const int16_t* frontend_input;
+  if (g_is_first_time) {
+    frontend_input = input;
+    g_is_first_time = false;
+  } else {
+    frontend_input = input + 160;
+  }
+  FrontendOutput frontend_output = FrontendProcessSamples(
+      &g_micro_features_state, frontend_input, input_size, num_samples_read);
+
+  for (int i = 0; i < frontend_output.size; ++i) {
+    // These scaling values are derived from those used in input_data.py in the
+    // training pipeline.
+    constexpr int32_t value_scale = (10 * 255);
+    constexpr int32_t value_div = (256 * 26);
+    int32_t value =
+        ((frontend_output.values[i] * value_scale) + (value_div / 2)) /
+        value_div;
+    if (value < 0) {
+      value = 0;
+    }
+    if (value > 255) {
+      value = 255;
+    }
+    output[i] = value;
+  }
+
+  return kTfLiteOk;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator.h
new file mode 100644
index 0000000000000000000000000000000000000000..46fa55d62ff7a8032cb94e512d4e856fb5960276
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator.h
@@ -0,0 +1,32 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_MICRO_FEATURES_GENERATOR_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_MICRO_FEATURES_GENERATOR_H_
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+
+// Sets up any resources needed for the feature generation pipeline.
+TfLiteStatus InitializeMicroFeatures(tflite::ErrorReporter* error_reporter);
+
+// Converts audio sample data into a more compact form that's appropriate for
+// feeding into a neural network.
+TfLiteStatus GenerateMicroFeatures(tflite::ErrorReporter* error_reporter,
+                                   const int16_t* input, int input_size,
+                                   int output_size, uint8_t* output,
+                                   size_t* num_samples_read);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_MICRO_FEATURES_GENERATOR_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2c2da7a799ce398ba7faf31d577d79bb96b2072a
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator_test.cc
@@ -0,0 +1,100 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_features_generator.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_feature_data_slice.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_feature_data_slice.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/no_30ms_sample_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/yes_30ms_sample_data.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+// This is a test-only API, not exposed in any public headers, so declare it.
+void SetMicroFeaturesNoiseEstimates(const uint32_t* estimate_presets);
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(TestMicroFeaturesGeneratorYes) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, InitializeMicroFeatures(error_reporter));
+
+  // The micro features pipeline retains state from previous calls to help
+  // estimate the background noise. Unfortunately this makes it harder to
+  // exactly reproduce results in a test environment, so use a known snapshot
+  // of the parameters at the point that the golden feature values were
+  // created.
+  const uint32_t yes_estimate_presets[] = {
+      1062898, 2644477, 1257642, 1864718, 412722, 725703, 395721, 474082,
+      173046,  255856,  158966,  153736,  69181,  199100, 144493, 227740,
+      110573,  164330,  79666,   144650,  122947, 476799, 398553, 497493,
+      322152,  1140005, 566716,  690605,  308902, 347481, 109891, 170457,
+      73901,   100975,  42963,   72325,   34183,  20207,  6640,   9468,
+  };
+  SetMicroFeaturesNoiseEstimates(yes_estimate_presets);
+
+  uint8_t yes_calculated_data[g_yes_feature_data_slice_size];
+  size_t num_samples_read;
+  TfLiteStatus yes_status = GenerateMicroFeatures(
+      error_reporter, g_yes_30ms_sample_data, g_yes_30ms_sample_data_size,
+      g_yes_feature_data_slice_size, yes_calculated_data, &num_samples_read);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, yes_status);
+
+  for (int i = 0; i < g_yes_feature_data_slice_size; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(g_yes_feature_data_slice[i],
+                            yes_calculated_data[i]);
+    if (g_yes_feature_data_slice[i] != yes_calculated_data[i]) {
+      error_reporter->Report("Expected value %d but found %d",
+                             g_yes_feature_data_slice[i],
+                             yes_calculated_data[i]);
+    }
+  }
+}
+
+TF_LITE_MICRO_TEST(TestMicroFeaturesGeneratorNo) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, InitializeMicroFeatures(error_reporter));
+  // As we did for the previous features, set known good noise state
+  // parameters.
+  const uint32_t no_estimate_presets[] = {
+      2563964, 1909393, 559801, 538670, 203643, 175959, 75088, 139491,
+      59691,   95307,   43865,  129263, 52517,  80058,  51330, 100731,
+      76674,   76262,   15497,  22598,  13778,  21460,  8946,  17806,
+      10023,   18810,   8002,   10842,  7578,   9983,   6267,  10759,
+      8946,    18488,   9691,   39785,  9939,   17835,  9671,  18512,
+  };
+  SetMicroFeaturesNoiseEstimates(no_estimate_presets);
+
+  uint8_t no_calculated_data[g_no_feature_data_slice_size];
+  size_t num_samples_read;
+  TfLiteStatus no_status = GenerateMicroFeatures(
+      error_reporter, g_no_30ms_sample_data, g_no_30ms_sample_data_size,
+      g_no_feature_data_slice_size, no_calculated_data, &num_samples_read);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, no_status);
+
+  for (int i = 0; i < g_no_feature_data_slice_size; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(g_no_feature_data_slice[i], no_calculated_data[i]);
+    if (g_no_feature_data_slice[i] != no_calculated_data[i]) {
+      error_reporter->Report("Expected value %d but found %d",
+                             g_no_feature_data_slice[i], no_calculated_data[i]);
+    }
+  }
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.cc
similarity index 95%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.cc
rename to tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.cc
index b9b8fb37b19d384fe92edf8ce2292aee19b99b7f..09f65ca24b3cd03485a5a79599dc0143ca83329c 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
 
 const char* kCategoryLabels[kCategoryCount] = {
     "silence",
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h
new file mode 100644
index 0000000000000000000000000000000000000000..b74a4d01ca49d37d62daf3710c878cfc6d9940f0
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h
@@ -0,0 +1,41 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_MICRO_MODEL_SETTINGS_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_MICRO_MODEL_SETTINGS_H_
+
+// Keeping these as constant expressions allow us to allocate fixed-sized arrays
+// on the stack for our working memory.
+
+// The size of the input time series data we pass to the FFT to produce the
+// frequency information. This has to be a power of two, and since we're dealing
+// with 30ms of 16KHz inputs, which means 480 samples, this is the next value.
+constexpr int kMaxAudioSampleSize = 512;
+constexpr int kAudioSampleFrequency = 16000;
+
+// All of these values are derived from the values used during model training,
+// if you change your model you'll need to update these constants.
+constexpr int kFeatureSliceSize = 40;
+constexpr int kFeatureSliceCount = 49;
+constexpr int kFeatureElementCount = (kFeatureSliceSize * kFeatureSliceCount);
+constexpr int kFeatureSliceStrideMs = 20;
+constexpr int kFeatureSliceDurationMs = 30;
+
+constexpr int kCategoryCount = 4;
+constexpr int kSilenceIndex = 0;
+constexpr int kUnknownIndex = 1;
+extern const char* kCategoryLabels[kCategoryCount];
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_MICRO_MODEL_SETTINGS_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_feature_data_slice.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_feature_data_slice.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1dbb606e184e70e0fa97d417bcbab6010b8a88a5
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_feature_data_slice.cc
@@ -0,0 +1,24 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See the header for documentation on the meaning of this data.
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_feature_data_slice.h"
+
+const uint8_t g_no_feature_data_slice[g_no_feature_data_slice_size] = {
+    216, 195, 223, 211, 238, 223, 243, 215, 226, 204, 232, 211, 232, 213,
+    240, 218, 235, 214, 238, 205, 207, 173, 149, 201, 215, 200, 230, 213,
+    208, 195, 175, 151, 195, 175, 182, 163, 235, 217, 218, 190,
+};
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_feature_data_slice.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_feature_data_slice.h
new file mode 100644
index 0000000000000000000000000000000000000000..72ea2bf6a23e83bff5dea771931e585d74c757ec
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_feature_data_slice.h
@@ -0,0 +1,29 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This data was extracted from the larger feature data held in
+// no_features_data.cc and consists of the 29th spectrogram slice of 43 values.
+// This is the expected result of running the sample data in
+// no_30ms_sample_data.cc through through the preprocessing pipeline.
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_NO_FEATURE_DATA_SLICE_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_NO_FEATURE_DATA_SLICE_H_
+
+#include <cstdint>
+
+constexpr int g_no_feature_data_slice_size = 40;
+extern const uint8_t g_no_feature_data_slice[];
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_NO_FEATURE_DATA_SLICE_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.cc
new file mode 100644
index 0000000000000000000000000000000000000000..865209b01df7e8f77139bcd5b6a37537a6f674f4
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.cc
@@ -0,0 +1,165 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.h"
+
+/* File automatically created by
+ * tensorflow/examples/speech_commands/wav_to_features.py \
+ * --sample_rate=16000 \
+ * --clip_duration_ms=1000 \
+ * --window_size_ms=30 \
+ * --window_stride_ms=20 \
+ * --feature_bin_count=40 \
+ * --quantize=1 \
+ * --preprocess="micro" \
+ * --input_wav="speech_commands_test_set_v0.02/no/f9643d42_nohash_4.wav" \
+ * --output_c_file="/tmp/no_micro_features_data.cc" \
+ */
+
+const int g_no_micro_f9643d42_nohash_4_width = 40;
+const int g_no_micro_f9643d42_nohash_4_height = 49;
+const unsigned char g_no_micro_f9643d42_nohash_4_data[] = {
+    230, 205, 191, 203, 202, 181, 180, 194, 205, 187, 183, 197, 203, 198, 196,
+    186, 202, 159, 151, 126, 110, 138, 141, 142, 137, 148, 133, 120, 110, 126,
+    117, 110, 117, 116, 137, 134, 95,  116, 123, 110, 184, 144, 183, 189, 197,
+    172, 188, 164, 194, 179, 175, 174, 182, 173, 184, 174, 200, 145, 154, 148,
+    147, 135, 143, 122, 127, 138, 116, 99,  122, 105, 110, 125, 127, 133, 131,
+    123, 116, 119, 127, 114, 193, 176, 185, 170, 175, 146, 166, 167, 185, 185,
+    185, 183, 195, 185, 176, 178, 197, 155, 137, 144, 164, 132, 153, 132, 138,
+    137, 134, 95,  120, 116, 131, 122, 99,  120, 120, 110, 116, 110, 126, 127,
+    128, 159, 187, 119, 178, 187, 197, 167, 199, 184, 180, 165, 194, 176, 144,
+    134, 187, 136, 142, 134, 145, 132, 145, 105, 119, 123, 125, 116, 125, 102,
+    129, 138, 130, 99,  99,  90,  120, 123, 134, 95,  194, 172, 187, 123, 191,
+    179, 195, 182, 201, 137, 167, 142, 185, 161, 187, 146, 167, 152, 154, 107,
+    152, 112, 134, 144, 117, 116, 105, 85,  105, 105, 99,  90,  123, 112, 112,
+    68,  107, 105, 117, 99,  116, 143, 139, 90,  154, 142, 188, 172, 178, 135,
+    175, 149, 177, 110, 173, 160, 169, 162, 173, 119, 132, 110, 85,  85,  117,
+    129, 117, 112, 117, 51,  112, 95,  139, 102, 105, 90,  128, 119, 112, 99,
+    170, 168, 195, 152, 174, 173, 180, 0,   157, 130, 169, 149, 149, 123, 170,
+    130, 170, 133, 159, 102, 134, 90,  85,  105, 126, 119, 130, 90,  78,  68,
+    127, 120, 95,  51,  122, 110, 112, 78,  116, 95,  180, 135, 179, 146, 179,
+    162, 197, 153, 172, 135, 154, 0,   149, 95,  145, 114, 166, 0,   114, 110,
+    145, 107, 114, 90,  136, 68,  95,  95,  95,  85,  116, 99,  116, 0,   95,
+    68,  102, 51,  102, 78,  185, 157, 138, 158, 180, 117, 173, 142, 145, 117,
+    169, 130, 159, 99,  138, 123, 169, 90,  78,  0,   123, 85,  107, 51,  114,
+    102, 95,  0,   116, 85,  119, 95,  95,  68,  85,  51,  116, 68,  102, 78,
+    167, 105, 164, 163, 178, 126, 164, 154, 154, 51,  177, 120, 156, 85,  134,
+    139, 168, 90,  161, 102, 114, 116, 122, 95,  112, 102, 107, 51,  114, 85,
+    119, 78,  114, 90,  102, 51,  102, 51,  114, 99,  177, 68,  152, 102, 184,
+    166, 179, 129, 177, 129, 180, 110, 158, 105, 139, 0,   145, 85,  148, 102,
+    117, 102, 116, 0,   78,  68,  90,  51,  107, 85,  78,  0,   51,  0,   51,
+    0,   95,  51,  107, 68,  180, 117, 90,  0,   138, 0,   187, 146, 119, 140,
+    164, 90,  136, 0,   131, 51,  159, 99,  141, 138, 116, 51,  90,  51,  90,
+    68,  105, 0,   85,  78,  112, 51,  122, 95,  128, 68,  85,  0,   112, 68,
+    147, 126, 178, 146, 171, 130, 190, 147, 188, 123, 170, 78,  132, 0,   130,
+    125, 159, 95,  102, 0,   110, 0,   95,  85,  120, 68,  78,  51,  99,  51,
+    105, 0,   112, 102, 105, 68,  90,  51,  90,  0,   127, 95,  166, 175, 187,
+    133, 135, 0,   171, 139, 132, 128, 140, 51,  126, 107, 161, 0,   95,  51,
+    119, 0,   114, 0,   95,  110, 116, 51,  112, 0,   90,  0,   116, 51,  68,
+    0,   105, 68,  105, 0,   164, 78,  173, 0,   194, 166, 145, 114, 116, 51,
+    107, 122, 151, 0,   156, 102, 148, 51,  122, 95,  129, 0,   85,  0,   127,
+    78,  90,  0,   78,  0,   95,  0,   110, 0,   68,  119, 120, 68,  68,  0,
+    122, 99,  147, 127, 200, 167, 85,  114, 161, 85,  161, 125, 143, 99,  156,
+    85,  147, 68,  99,  0,   107, 102, 132, 51,  112, 68,  95,  78,  99,  0,
+    68,  0,   51,  0,   90,  78,  128, 51,  95,  0,   166, 136, 174, 138, 189,
+    144, 130, 129, 138, 134, 132, 120, 134, 0,   51,  78,  147, 51,  51,  0,
+    51,  0,   78,  0,   68,  68,  95,  78,  90,  0,   0,   0,   68,  0,   90,
+    68,  110, 0,   95,  51,  165, 151, 157, 0,   0,   0,   112, 0,   112, 95,
+    149, 107, 119, 68,  126, 68,  138, 0,   78,  0,   78,  0,   99,  51,  112,
+    0,   102, 0,   78,  51,  85,  0,   0,   0,   78,  0,   95,  0,   95,  78,
+    105, 0,   152, 0,   0,   51,  132, 105, 159, 0,   129, 102, 114, 0,   138,
+    51,  123, 0,   129, 78,  119, 51,  51,  51,  105, 0,   78,  85,  95,  0,
+    85,  0,   0,   0,   85,  0,   78,  0,   0,   0,   172, 142, 141, 0,   137,
+    0,   148, 128, 157, 120, 146, 120, 120, 0,   95,  78,  141, 68,  68,  0,
+    68,  0,   90,  0,   85,  0,   107, 0,   78,  0,   85,  51,  102, 0,   68,
+    78,  68,  0,   51,  0,   125, 0,   141, 51,  102, 138, 175, 51,  120, 51,
+    173, 85,  116, 141, 164, 68,  150, 123, 133, 51,  114, 0,   117, 68,  150,
+    51,  116, 68,  78,  0,   68,  0,   68,  0,   85,  0,   78,  0,   51,  78,
+    155, 90,  161, 0,   132, 99,  123, 78,  107, 0,   134, 90,  95,  0,   78,
+    0,   162, 143, 85,  0,   107, 78,  125, 90,  90,  51,  51,  0,   85,  0,
+    0,   0,   132, 102, 102, 154, 128, 0,   99,  68,  162, 102, 151, 0,   99,
+    51,  147, 141, 156, 0,   112, 120, 158, 127, 145, 139, 187, 171, 135, 138,
+    146, 0,   95,  68,  127, 0,   85,  0,   105, 0,   0,   0,   187, 170, 162,
+    188, 165, 51,  51,  78,  243, 215, 225, 196, 205, 181, 205, 168, 176, 134,
+    157, 110, 126, 114, 133, 139, 193, 163, 159, 116, 160, 126, 122, 127, 171,
+    99,  114, 68,  123, 85,  90,  0,   157, 146, 166, 179, 136, 0,   116, 90,
+    242, 219, 240, 204, 216, 164, 188, 171, 176, 164, 154, 158, 190, 157, 190,
+    141, 182, 177, 169, 128, 172, 145, 105, 129, 157, 90,  78,  51,  119, 68,
+    137, 68,  116, 78,  141, 132, 151, 122, 156, 140, 234, 206, 229, 201, 216,
+    174, 191, 144, 162, 85,  122, 157, 194, 167, 204, 149, 180, 166, 166, 139,
+    122, 133, 156, 126, 145, 85,  128, 0,   99,  51,  145, 0,   126, 51,  166,
+    162, 166, 162, 177, 157, 228, 198, 221, 197, 214, 177, 173, 166, 173, 139,
+    185, 191, 202, 163, 205, 172, 206, 189, 135, 68,  166, 134, 149, 134, 135,
+    90,  127, 107, 175, 90,  136, 117, 135, 140, 172, 167, 166, 149, 177, 152,
+    221, 191, 215, 194, 211, 0,   156, 147, 182, 178, 208, 163, 190, 157, 208,
+    200, 195, 164, 179, 154, 181, 150, 143, 99,  132, 137, 185, 143, 163, 85,
+    51,  107, 132, 134, 164, 127, 167, 159, 175, 141, 216, 195, 223, 211, 238,
+    223, 243, 215, 226, 204, 232, 211, 232, 213, 240, 218, 235, 214, 238, 205,
+    207, 173, 149, 201, 215, 200, 230, 213, 208, 195, 175, 151, 195, 175, 182,
+    163, 235, 217, 218, 190, 211, 191, 215, 191, 217, 220, 241, 215, 229, 206,
+    236, 210, 227, 216, 236, 188, 183, 149, 202, 189, 208, 172, 191, 201, 220,
+    193, 221, 207, 216, 208, 201, 131, 170, 187, 229, 197, 211, 194, 226, 201,
+    205, 184, 206, 177, 221, 210, 226, 184, 204, 197, 218, 198, 212, 209, 213,
+    141, 172, 110, 175, 167, 180, 156, 213, 188, 192, 179, 213, 205, 204, 174,
+    200, 147, 162, 181, 203, 167, 198, 187, 210, 164, 196, 169, 189, 168, 224,
+    198, 213, 204, 198, 195, 230, 211, 221, 197, 208, 0,   0,   0,   85,  90,
+    167, 130, 175, 173, 203, 164, 193, 144, 170, 145, 185, 148, 154, 139, 198,
+    159, 180, 171, 216, 174, 178, 161, 166, 136, 216, 184, 215, 197, 199, 190,
+    228, 195, 208, 51,  117, 0,   0,   0,   0,   0,   140, 51,  135, 154, 188,
+    155, 168, 0,   90,  0,   156, 85,  110, 0,   174, 90,  172, 154, 179, 99,
+    142, 166, 179, 157, 177, 95,  192, 142, 204, 198, 217, 147, 173, 0,   112,
+    0,   0,   0,   0,   0,   0,   0,   110, 0,   107, 0,   160, 0,   148, 95,
+    172, 0,   0,   0,   116, 0,   122, 114, 170, 0,   0,   0,   0,   0,   179,
+    110, 196, 85,  205, 183, 169, 0,   99,  0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   141, 0,   112, 0,   0,   0,   134, 0,   0,   0,   0,
+    0,   0,   0,   139, 0,   0,   0,   0,   112, 186, 78,  163, 0,   169, 128,
+    174, 0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   95,
+    0,   105, 0,   0,   0,   105, 0,   0,   0,   0,   0,   0,   0,   95,  0,
+    0,   0,   0,   0,   0,   0,   119, 0,   164, 78,  0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   90,  0,   0,   68,
+    117, 0,   0,   0,   0,   0,   0,   0,   148, 0,   0,   0,   0,   0,   0,
+    0,   0,   0,   116, 0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   51,
+    0,   0,   0,   99,  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   99,  0,   0,   0,   0,   0,   0,   0,   0,   0,   78,  0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+};
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.h
new file mode 100644
index 0000000000000000000000000000000000000000..178323eeba6669d247edfe9cb675b37fe5c7d526
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.h
@@ -0,0 +1,23 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_NO_MICRO_FEATURES_DATA_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_NO_MICRO_FEATURES_DATA_H_
+
+extern const int g_no_micro_f9643d42_nohash_4_width;
+extern const int g_no_micro_f9643d42_nohash_4_height;
+extern const unsigned char g_no_micro_f9643d42_nohash_4_data[];
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_NO_MICRO_FEATURES_DATA_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3b3aa19cda5d1c8151de5e8bf5aad45df09259a0
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.cc
@@ -0,0 +1,51 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.h"
+
+#include <string.h>
+
+void NoiseReductionApply(struct NoiseReductionState* state, uint32_t* signal) {
+  int i;
+  for (i = 0; i < state->num_channels; ++i) {
+    const uint32_t smoothing =
+        ((i & 1) == 0) ? state->even_smoothing : state->odd_smoothing;
+    const uint32_t one_minus_smoothing = (1 << kNoiseReductionBits) - smoothing;
+
+    // Update the estimate of the noise.
+    const uint32_t signal_scaled_up = signal[i] << state->smoothing_bits;
+    uint32_t estimate =
+        ((static_cast<uint64_t>(signal_scaled_up) * smoothing) +
+         (static_cast<uint64_t>(state->estimate[i]) * one_minus_smoothing)) >>
+        kNoiseReductionBits;
+    state->estimate[i] = estimate;
+
+    // Make sure that we can't get a negative value for the signal - estimate.
+    if (estimate > signal_scaled_up) {
+      estimate = signal_scaled_up;
+    }
+
+    const uint32_t floor =
+        (static_cast<uint64_t>(signal[i]) * state->min_signal_remaining) >>
+        kNoiseReductionBits;
+    const uint32_t subtracted =
+        (signal_scaled_up - estimate) >> state->smoothing_bits;
+    const uint32_t output = subtracted > floor ? subtracted : floor;
+    signal[i] = output;
+  }
+}
+
+void NoiseReductionReset(struct NoiseReductionState* state) {
+  memset(state->estimate, 0, sizeof(*state->estimate) * state->num_channels);
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.h
new file mode 100644
index 0000000000000000000000000000000000000000..699144345d5751f27f7adcafec551180f82725d1
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.h
@@ -0,0 +1,40 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_NOISE_REDUCTION_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_NOISE_REDUCTION_H_
+
+#define kNoiseReductionBits 14
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
+
+struct NoiseReductionState {
+  int smoothing_bits;
+  uint16_t even_smoothing;
+  uint16_t odd_smoothing;
+  uint16_t min_signal_remaining;
+  int num_channels;
+  uint32_t estimate[kFeatureSliceSize];
+};
+
+// Removes stationary noise from each channel of the signal using a low pass
+// filter.
+void NoiseReductionApply(struct NoiseReductionState* state, uint32_t* signal);
+
+void NoiseReductionReset(struct NoiseReductionState* state);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_NOISE_REDUCTION_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..de7181d710bc00938e411869bf071b91e22f2044
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_test.cc
@@ -0,0 +1,83 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_util.h"
+
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+namespace {
+
+const int kNumChannels = 2;
+
+// Test noise reduction using default config values.
+class NoiseReductionTestConfig {
+ public:
+  NoiseReductionTestConfig() {
+    config_.smoothing_bits = 10;
+    config_.even_smoothing = 0.025;
+    config_.odd_smoothing = 0.06;
+    config_.min_signal_remaining = 0.05;
+  }
+
+  struct NoiseReductionConfig config_;
+};
+
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(NoiseReductionTest_TestNoiseReductionEstimate) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  NoiseReductionTestConfig config;
+  struct NoiseReductionState state;
+  TF_LITE_MICRO_EXPECT(NoiseReductionPopulateState(
+      error_reporter, &config.config_, &state, kNumChannels));
+
+  uint32_t signal[] = {247311, 508620};
+  NoiseReductionApply(&state, signal);
+
+  const uint32_t expected[] = {6321887, 31248341};
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels,
+                          sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < state.num_channels; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.estimate[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(NoiseReductionTest_TestNoiseReduction) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  NoiseReductionTestConfig config;
+  struct NoiseReductionState state;
+  TF_LITE_MICRO_EXPECT(NoiseReductionPopulateState(
+      error_reporter, &config.config_, &state, kNumChannels));
+
+  uint32_t signal[] = {247311, 508620};
+  NoiseReductionApply(&state, signal);
+
+  const uint32_t expected[] = {241137, 478104};
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels,
+                          sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < state.num_channels; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(signal[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_util.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..42a5c2136f2a85b0ddd7e3a620bb879d13eeb258
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_util.cc
@@ -0,0 +1,42 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_util.h"
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h"
+
+void NoiseReductionFillConfigWithDefaults(struct NoiseReductionConfig* config) {
+  config->smoothing_bits = 10;
+  config->even_smoothing = 0.025;
+  config->odd_smoothing = 0.06;
+  config->min_signal_remaining = 0.05;
+}
+
+int NoiseReductionPopulateState(tflite::ErrorReporter* error_reporter,
+                                const struct NoiseReductionConfig* config,
+                                struct NoiseReductionState* state,
+                                int num_channels) {
+  state->smoothing_bits = config->smoothing_bits;
+  state->odd_smoothing = config->odd_smoothing * (1 << kNoiseReductionBits);
+  state->even_smoothing = config->even_smoothing * (1 << kNoiseReductionBits);
+  state->min_signal_remaining =
+      config->min_signal_remaining * (1 << kNoiseReductionBits);
+  state->num_channels = num_channels;
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(
+      state->estimate, (state->num_channels * sizeof(*state->estimate)));
+  for (int i = 0; i < state->num_channels; ++i) {
+    state->estimate[i] = 0;
+  }
+  return 1;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_util.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..60f9de5067c606158bc0e29771d1e83a495cd4c1
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction_util.h
@@ -0,0 +1,42 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_NOISE_REDUCTION_UTIL_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_NOISE_REDUCTION_UTIL_H_
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/noise_reduction.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+
+struct NoiseReductionConfig {
+  // scale the signal up by 2^(smoothing_bits) before reduction
+  int smoothing_bits;
+  // smoothing coefficient for even-numbered channels
+  float even_smoothing;
+  // smoothing coefficient for odd-numbered channels
+  float odd_smoothing;
+  // fraction of signal to preserve (1.0 disables this module)
+  float min_signal_remaining;
+};
+
+// Populates the NoiseReductionConfig with "sane" default values.
+void NoiseReductionFillConfigWithDefaults(struct NoiseReductionConfig* config);
+
+// Prepares any buffers.
+int NoiseReductionPopulateState(tflite::ErrorReporter* error_reporter,
+                                const struct NoiseReductionConfig* config,
+                                struct NoiseReductionState* state,
+                                int num_channels);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_NOISE_REDUCTION_UTIL_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.cc
new file mode 100644
index 0000000000000000000000000000000000000000..50656758d722844b8aeb6a32c04d3df36f0e5242
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.cc
@@ -0,0 +1,57 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.h"
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/bits.h"
+
+int16_t WideDynamicFunction(const uint32_t x, const int16_t* lut) {
+  if (x <= 2) {
+    return lut[x];
+  }
+
+  const int16_t interval = MostSignificantBit32(x);
+  lut += 4 * interval - 6;
+
+  const int16_t frac =
+      ((interval < 11) ? (x << (11 - interval)) : (x >> (interval - 11))) &
+      0x3FF;
+
+  int32_t result = (static_cast<int32_t>(lut[2]) * frac) >> 5;
+  result += (static_cast<int32_t>(lut[1])) << 5;
+  result *= frac;
+  result = (result + (1 << 14)) >> 15;
+  result += lut[0];
+  return static_cast<int16_t>(result);
+}
+
+uint32_t PcanShrink(const uint32_t x) {
+  if (x < (2 << kPcanSnrBits)) {
+    return (x * x) >> (2 + 2 * kPcanSnrBits - kPcanOutputBits);
+  } else {
+    return (x >> (kPcanSnrBits - kPcanOutputBits)) - (1 << kPcanOutputBits);
+  }
+}
+
+void PcanGainControlApply(struct PcanGainControlState* state,
+                          uint32_t* signal) {
+  int i;
+  for (i = 0; i < state->num_channels; ++i) {
+    const uint32_t gain =
+        WideDynamicFunction(state->noise_estimate[i], state->gain_lut);
+    const uint32_t snr =
+        (static_cast<uint64_t>(signal[i]) * gain) >> state->snr_shift;
+    signal[i] = PcanShrink(snr);
+  }
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.h
new file mode 100644
index 0000000000000000000000000000000000000000..06d6fc990385cd74ccba8510765eb7ad8da4eeca
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.h
@@ -0,0 +1,41 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_PCAN_GAIN_CONTROL_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_PCAN_GAIN_CONTROL_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#define kPcanSnrBits 12
+#define kPcanOutputBits 6
+
+#define kWideDynamicFunctionBits 32
+#define kWideDynamicFunctionLUTSize (4 * kWideDynamicFunctionBits - 3)
+
+struct PcanGainControlState {
+  int enable_pcan;
+  uint32_t* noise_estimate;
+  int num_channels;
+  int16_t gain_lut[kWideDynamicFunctionLUTSize];
+  int32_t snr_shift;
+};
+
+int16_t WideDynamicFunction(const uint32_t x, const int16_t* lut);
+
+uint32_t PcanShrink(const uint32_t x);
+
+void PcanGainControlApply(struct PcanGainControlState* state, uint32_t* signal);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_PCAN_GAIN_CONTROL_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7dee66746f381ea50127e416fe90f063353eca89
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_test.cc
@@ -0,0 +1,66 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_util.h"
+
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+namespace {
+
+const int kNumChannels = 2;
+const int kSmoothingBits = 10;
+const int kCorrectionBits = -1;
+
+// Test pcan auto gain control using default config values.
+class PcanGainControlTestConfig {
+ public:
+  PcanGainControlTestConfig() {
+    config_.enable_pcan = 1;
+    config_.strength = 0.95;
+    config_.offset = 80.0;
+    config_.gain_bits = 21;
+  }
+
+  struct PcanGainControlConfig config_;
+};
+
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(PcanGainControlTest_TestPcanGainControl) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  uint32_t estimate[] = {6321887, 31248341};
+  PcanGainControlTestConfig config;
+  struct PcanGainControlState state;
+  TF_LITE_MICRO_EXPECT(PcanGainControlPopulateState(
+      error_reporter, &config.config_, &state, estimate, kNumChannels,
+      kSmoothingBits, kCorrectionBits));
+
+  uint32_t signal[] = {241137, 478104};
+  PcanGainControlApply(&state, signal);
+
+  const uint32_t expected[] = {3578, 1533};
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels,
+                          sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < state.num_channels; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(signal[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_util.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e7867ac6284d519ba6dd35f601bc3cb40e2f95fe
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_util.cc
@@ -0,0 +1,87 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_util.h"
+
+#include <math.h>
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h"
+
+#define kint16max 0x00007FFF
+
+void PcanGainControlFillConfigWithDefaults(
+    struct PcanGainControlConfig* config) {
+  config->enable_pcan = 0;
+  config->strength = 0.95;
+  config->offset = 80.0;
+  config->gain_bits = 21;
+}
+
+int16_t PcanGainLookupFunction(const struct PcanGainControlConfig* config,
+                               int32_t input_bits, uint32_t x) {
+  const float x_as_float =
+      (static_cast<float>(x)) / (static_cast<uint32_t>(1) << input_bits);
+  const float gain_as_float =
+      (static_cast<uint32_t>(1) << config->gain_bits) *
+      powf(x_as_float + config->offset, -config->strength);
+
+  if (gain_as_float > kint16max) {
+    return kint16max;
+  }
+  return static_cast<int16_t>(gain_as_float + 0.5f);
+}
+
+int PcanGainControlPopulateState(tflite::ErrorReporter* error_reporter,
+                                 const struct PcanGainControlConfig* config,
+                                 struct PcanGainControlState* state,
+                                 uint32_t* noise_estimate,
+                                 const int num_channels,
+                                 const uint16_t smoothing_bits,
+                                 const int32_t input_correction_bits) {
+  state->enable_pcan = config->enable_pcan;
+  if (!state->enable_pcan) {
+    return 1;
+  }
+  state->noise_estimate = noise_estimate;
+  state->num_channels = num_channels;
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(
+      state->gain_lut, (kWideDynamicFunctionLUTSize * sizeof(int16_t)));
+  state->snr_shift = config->gain_bits - input_correction_bits - kPcanSnrBits;
+
+  const int32_t input_bits = smoothing_bits - input_correction_bits;
+  state->gain_lut[0] = PcanGainLookupFunction(config, input_bits, 0);
+  state->gain_lut[1] = PcanGainLookupFunction(config, input_bits, 1);
+  int16_t* temp_gain_lut = state->gain_lut - 6;
+  int interval;
+  for (interval = 2; interval <= kWideDynamicFunctionBits; ++interval) {
+    const uint32_t x0 = static_cast<uint32_t>(1) << (interval - 1);
+    const uint32_t x1 = x0 + (x0 >> 1);
+    const uint32_t x2 =
+        (interval == kWideDynamicFunctionBits) ? x0 + (x0 - 1) : 2 * x0;
+
+    const int16_t y0 = PcanGainLookupFunction(config, input_bits, x0);
+    const int16_t y1 = PcanGainLookupFunction(config, input_bits, x1);
+    const int16_t y2 = PcanGainLookupFunction(config, input_bits, x2);
+
+    const int32_t diff1 = static_cast<int32_t>(y1) - y0;
+    const int32_t diff2 = static_cast<int32_t>(y2) - y0;
+    const int32_t a1 = 4 * diff1 - diff2;
+    const int32_t a2 = diff2 - a1;
+
+    temp_gain_lut[4 * interval] = y0;
+    temp_gain_lut[4 * interval + 1] = static_cast<int16_t>(a1);
+    temp_gain_lut[4 * interval + 2] = static_cast<int16_t>(a2);
+  }
+  return 1;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_util.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..4cc1de7bb25db509f8271d12f053e61554d07680
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control_util.h
@@ -0,0 +1,47 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_PCAN_GAIN_CONTROL_UTIL_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_PCAN_GAIN_CONTROL_UTIL_H_
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/pcan_gain_control.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+
+struct PcanGainControlConfig {
+  // set to false (0) to disable this module
+  int enable_pcan;
+  // gain normalization exponent (0.0 disables, 1.0 full strength)
+  float strength;
+  // positive value added in the normalization denominator
+  float offset;
+  // number of fractional bits in the gain
+  int gain_bits;
+};
+
+void PcanGainControlFillConfigWithDefaults(
+    struct PcanGainControlConfig* config);
+
+int16_t PcanGainLookupFunction(const struct PcanGainControlConfig* config,
+                               int32_t input_bits, uint32_t x);
+
+int PcanGainControlPopulateState(tflite::ErrorReporter* error_reporter,
+                                 const struct PcanGainControlConfig* config,
+                                 struct PcanGainControlState* state,
+                                 uint32_t* noise_estimate,
+                                 const int num_channels,
+                                 const uint16_t smoothing_bits,
+                                 const int32_t input_correction_bits);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_PCAN_GAIN_CONTROL_UTIL_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h
new file mode 100644
index 0000000000000000000000000000000000000000..e2af862de7590323819c99de3a6702d1bd046681
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h
@@ -0,0 +1,32 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_STATIC_ALLOC_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_STATIC_ALLOC_H_
+
+// Checks to ensure that the C-style array passed in has a compile-time size of
+// at least the number of bytes requested. This doesn't work with raw pointers
+// since sizeof() doesn't know their actual length, so only use this to check
+// statically-allocated arrays with known sizes.
+#define STATIC_ALLOC_ENSURE_ARRAY_SIZE(A, N)                                 \
+  do {                                                                       \
+    if (sizeof(A) < (N)) {                                                   \
+      error_reporter->Report(#A " too small (%d bytes, wanted %d) at %s:%d", \
+                             sizeof(A), (N), __FILE__, __LINE__);            \
+      return 0;                                                              \
+    }                                                                        \
+  } while (0)
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_STATIC_ALLOC_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.cc
new file mode 100644
index 0000000000000000000000000000000000000000..57a32c3595da7ae17c2328bb4c98fb005fd253ef
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.cc
@@ -0,0 +1,1541 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Automatically created from a TensorFlow Lite flatbuffer using the command:
+// xxd -i tiny_conv.tflite > tiny_conv_simple_features_model_data.cc
+// See the README for a full description of the creation process.
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.h"
+
+const unsigned char g_tiny_conv_micro_features_model_data[] = {
+    0x18, 0x00, 0x00, 0x00, 0x54, 0x46, 0x4c, 0x33, 0x00, 0x00, 0x0e, 0x00,
+    0x18, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x14, 0x00,
+    0x0e, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xd0, 0x46, 0x00, 0x00,
+    0x0c, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0xb4, 0x41, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00,
+    0x54, 0x4f, 0x43, 0x4f, 0x20, 0x43, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74,
+    0x65, 0x64, 0x2e, 0x00, 0x09, 0x00, 0x00, 0x00, 0x94, 0x41, 0x00, 0x00,
+    0x74, 0x41, 0x00, 0x00, 0x44, 0x41, 0x00, 0x00, 0xb4, 0x3e, 0x00, 0x00,
+    0xac, 0x3e, 0x00, 0x00, 0xa4, 0x3e, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
+    0x0c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xf0, 0xb9, 0xff, 0xff,
+    0xf4, 0xb9, 0xff, 0xff, 0x52, 0xba, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00,
+    0x80, 0x3e, 0x00, 0x00, 0x68, 0x95, 0x91, 0x7d, 0x9b, 0x85, 0x85, 0x81,
+    0x77, 0x85, 0x99, 0x89, 0x7e, 0x8a, 0x85, 0x92, 0xa5, 0x7e, 0x93, 0x97,
+    0x97, 0x91, 0xa3, 0x97, 0x88, 0x8b, 0xa6, 0x71, 0x77, 0x85, 0x95, 0x86,
+    0x6b, 0x93, 0xcb, 0x96, 0x7a, 0x9a, 0x7f, 0x85, 0x7a, 0x8e, 0xac, 0x98,
+    0x6d, 0x9d, 0x9b, 0x70, 0x9a, 0x90, 0xba, 0x99, 0x7b, 0x93, 0x6e, 0x68,
+    0x75, 0x86, 0xc4, 0x8b, 0x66, 0x5d, 0x96, 0x7f, 0x92, 0x91, 0xb6, 0x7b,
+    0x96, 0x95, 0x9a, 0x77, 0x9a, 0x96, 0xce, 0x80, 0x88, 0x65, 0x8e, 0x80,
+    0x88, 0x85, 0xb7, 0x9c, 0x7b, 0x93, 0x9d, 0x95, 0x83, 0x92, 0xd0, 0x7e,
+    0x68, 0x88, 0x6c, 0x78, 0x98, 0x81, 0xac, 0x95, 0x9e, 0x98, 0xa2, 0x99,
+    0x8d, 0x7d, 0xb8, 0x81, 0x6e, 0x68, 0xa1, 0x81, 0x9d, 0x99, 0xb4, 0x7d,
+    0x92, 0x86, 0x9d, 0x93, 0xa3, 0xb0, 0xd6, 0x79, 0x93, 0x76, 0x8d, 0x84,
+    0x91, 0x9d, 0xbe, 0x94, 0xb0, 0x70, 0x84, 0x80, 0x85, 0x99, 0x9e, 0xa2,
+    0x86, 0x8a, 0x7a, 0x76, 0x91, 0x8d, 0xa6, 0x76, 0x8d, 0x82, 0x98, 0x8c,
+    0x92, 0x8f, 0x8c, 0xb3, 0x78, 0x75, 0xa5, 0x88, 0x73, 0x8c, 0x91, 0x7c,
+    0x82, 0x7d, 0x93, 0x9e, 0x8b, 0x97, 0x7c, 0x90, 0x84, 0x95, 0x7e, 0x9e,
+    0xa4, 0x52, 0x8a, 0xb4, 0x97, 0x65, 0x7d, 0xb6, 0x83, 0x7d, 0x99, 0x80,
+    0x97, 0x85, 0x96, 0x5f, 0x8e, 0x87, 0x95, 0x6d, 0x76, 0x84, 0x97, 0x8c,
+    0x66, 0x97, 0xae, 0x6b, 0x93, 0xb3, 0xa8, 0x8b, 0xa1, 0x79, 0xa3, 0x94,
+    0x7e, 0xa8, 0x8d, 0xad, 0x78, 0x82, 0xa2, 0x7b, 0x90, 0xa4, 0x7d, 0xb3,
+    0xa0, 0x7b, 0x94, 0x85, 0x9a, 0x8d, 0x76, 0x82, 0x65, 0x73, 0xab, 0xa4,
+    0xaa, 0x74, 0x93, 0x9c, 0x83, 0x66, 0xbf, 0x7a, 0xaa, 0x81, 0x92, 0x89,
+    0x7e, 0x88, 0xa6, 0x66, 0xaf, 0x92, 0x9f, 0x97, 0x6c, 0x89, 0x9c, 0x74,
+    0x7e, 0x82, 0x8e, 0x88, 0xb2, 0x85, 0xba, 0x96, 0x90, 0x78, 0x8d, 0xa7,
+    0x9e, 0x87, 0xbc, 0x7f, 0xb2, 0x8b, 0x77, 0x9b, 0xab, 0x8f, 0xa4, 0x7d,
+    0x6f, 0x77, 0x8c, 0x98, 0x6f, 0x89, 0xb1, 0x9f, 0xa7, 0x94, 0x7d, 0xae,
+    0x88, 0x8a, 0xa9, 0x75, 0x7d, 0x7c, 0x88, 0x99, 0x90, 0x9d, 0x97, 0xa7,
+    0x8d, 0x7f, 0x73, 0xa1, 0xa3, 0x87, 0xa9, 0x92, 0x98, 0x7e, 0x9c, 0x88,
+    0x73, 0x6b, 0x78, 0x8e, 0x7d, 0x86, 0x6c, 0x7c, 0x92, 0x40, 0x86, 0xa7,
+    0x65, 0x93, 0x67, 0x91, 0x67, 0x71, 0x6c, 0xa8, 0x81, 0x70, 0x8e, 0xa8,
+    0x7b, 0x63, 0x89, 0x76, 0x69, 0x90, 0x73, 0x5e, 0x92, 0x78, 0x7e, 0x9d,
+    0x87, 0x86, 0x89, 0x64, 0x66, 0xa9, 0x92, 0x8d, 0x72, 0x7c, 0x63, 0x7f,
+    0x94, 0x5c, 0x92, 0x89, 0x87, 0x9d, 0x8b, 0x75, 0x93, 0x8c, 0x94, 0x68,
+    0x97, 0x87, 0x78, 0x7d, 0x7f, 0x84, 0x84, 0x77, 0x6b, 0x8e, 0x83, 0xab,
+    0x7e, 0x62, 0x90, 0x83, 0x8e, 0x71, 0x7e, 0x9b, 0x96, 0x6d, 0x83, 0x6a,
+    0x76, 0x68, 0x71, 0x90, 0x98, 0x90, 0x9b, 0x68, 0x89, 0x89, 0x95, 0x85,
+    0x6e, 0x75, 0x8e, 0x95, 0x83, 0x7a, 0x91, 0x7f, 0x8b, 0x71, 0x90, 0x7d,
+    0xad, 0x91, 0x6f, 0x74, 0x98, 0x8a, 0xb0, 0xa8, 0x80, 0xa3, 0x8e, 0x7c,
+    0xa5, 0x67, 0xa4, 0x66, 0xa9, 0x7b, 0x85, 0x9d, 0x88, 0xab, 0x7d, 0x81,
+    0x6e, 0x7f, 0x8f, 0x97, 0x97, 0x84, 0x89, 0x74, 0x9d, 0x5f, 0x9c, 0x88,
+    0x6f, 0x74, 0x96, 0x9e, 0x7e, 0x7e, 0xa4, 0x85, 0x94, 0x91, 0xaf, 0x99,
+    0x7a, 0xaa, 0x8c, 0x92, 0x85, 0x9d, 0x6c, 0x79, 0x57, 0x7a, 0x80, 0x84,
+    0x79, 0x79, 0x74, 0xa5, 0x55, 0xab, 0x73, 0x8c, 0x72, 0x9d, 0x72, 0xa9,
+    0x90, 0x73, 0x8f, 0xa0, 0x89, 0x6d, 0x68, 0x66, 0x61, 0x6f, 0x57, 0x7d,
+    0x66, 0x8c, 0x65, 0x87, 0x62, 0x76, 0x83, 0x77, 0x89, 0xa4, 0x73, 0x89,
+    0x7f, 0x70, 0x79, 0x6b, 0x86, 0x6f, 0x8d, 0x96, 0x65, 0x89, 0x66, 0x53,
+    0x73, 0xae, 0x6a, 0x72, 0x88, 0x97, 0x7a, 0x7f, 0x5d, 0xa1, 0x86, 0x88,
+    0x5f, 0x9f, 0x9b, 0x8a, 0x74, 0x9a, 0x7a, 0x7e, 0x8b, 0x71, 0x58, 0x74,
+    0x8f, 0x9b, 0x9b, 0x8d, 0x6b, 0x83, 0x60, 0x7f, 0x75, 0x91, 0x79, 0x93,
+    0x7a, 0x92, 0x8c, 0x7e, 0x7a, 0x95, 0x84, 0x69, 0x8f, 0x8c, 0x7c, 0x6e,
+    0x8b, 0x87, 0x82, 0x62, 0xa6, 0x97, 0x91, 0x65, 0xa2, 0xa4, 0x9b, 0x8b,
+    0x85, 0xa4, 0x84, 0x7b, 0x67, 0x93, 0x96, 0x84, 0x85, 0x75, 0x6d, 0x9e,
+    0x80, 0x80, 0x73, 0x8c, 0x81, 0x70, 0x8a, 0x68, 0x9c, 0x8e, 0x63, 0x91,
+    0x89, 0x79, 0x8d, 0x79, 0xa4, 0x9a, 0x96, 0xa0, 0x83, 0x63, 0x88, 0x8f,
+    0x76, 0xb4, 0xa8, 0x8e, 0x68, 0x8d, 0x8e, 0x95, 0x78, 0xae, 0x5d, 0x89,
+    0x66, 0x7e, 0x7b, 0x8a, 0x75, 0x86, 0x71, 0x97, 0x6d, 0xb3, 0x67, 0x76,
+    0x82, 0x7d, 0x70, 0x79, 0x8a, 0x9c, 0x82, 0xa7, 0x82, 0xab, 0x58, 0x86,
+    0x5c, 0x70, 0x8c, 0x71, 0x61, 0xa6, 0x74, 0xa8, 0x65, 0x78, 0x72, 0x9d,
+    0x6c, 0x92, 0x70, 0x88, 0x88, 0x79, 0x96, 0x6f, 0x68, 0xa4, 0x7a, 0x7b,
+    0x96, 0xac, 0x6d, 0x76, 0x6a, 0xab, 0x82, 0x7d, 0x71, 0x8d, 0x6b, 0x81,
+    0x6c, 0x9d, 0x71, 0x59, 0x5c, 0x71, 0x77, 0x6d, 0x6a, 0x96, 0x76, 0x69,
+    0x80, 0x83, 0x88, 0x70, 0x97, 0xb4, 0x8a, 0x6c, 0xa5, 0x6e, 0x64, 0x75,
+    0x73, 0xa2, 0x7f, 0x97, 0x9e, 0x75, 0x8f, 0x86, 0x68, 0xbb, 0x6b, 0x86,
+    0x8d, 0x80, 0x8e, 0x58, 0x6d, 0xb2, 0x76, 0x99, 0x8f, 0x70, 0x6c, 0x86,
+    0x78, 0x9e, 0x91, 0x90, 0xa2, 0x7c, 0x8c, 0x81, 0x80, 0xb4, 0x77, 0x7a,
+    0x8c, 0x5f, 0x85, 0x56, 0x7a, 0x93, 0x6b, 0x5c, 0x74, 0x59, 0x7e, 0x86,
+    0x8c, 0xae, 0x76, 0x7d, 0x76, 0x7e, 0x81, 0x5f, 0x81, 0x8e, 0x7b, 0x90,
+    0xaa, 0x99, 0x79, 0x89, 0x93, 0xbc, 0x86, 0x91, 0xa2, 0x88, 0x79, 0x82,
+    0x80, 0xb6, 0x4a, 0x93, 0x7b, 0x89, 0x75, 0x8d, 0x7a, 0x8d, 0x66, 0x7c,
+    0x81, 0x9f, 0x6e, 0x86, 0x4d, 0x82, 0x66, 0x88, 0x73, 0x89, 0x7d, 0xac,
+    0x89, 0x9f, 0x58, 0x7f, 0x6b, 0x8c, 0x6a, 0x82, 0x59, 0xb8, 0x83, 0x67,
+    0x8b, 0x8a, 0x84, 0x7b, 0x7f, 0xb5, 0x44, 0x57, 0x5a, 0x73, 0x8b, 0x6d,
+    0x7c, 0x9e, 0x71, 0x72, 0x8d, 0x93, 0x80, 0x60, 0x7f, 0xc5, 0x69, 0x5c,
+    0x67, 0x92, 0x6c, 0x75, 0x66, 0x8f, 0x91, 0x5a, 0x6c, 0x70, 0x90, 0x84,
+    0x88, 0xab, 0x90, 0x66, 0x9c, 0x64, 0x6e, 0x68, 0x92, 0x9e, 0x89, 0x8d,
+    0x82, 0x97, 0x77, 0x75, 0x7f, 0xa7, 0x91, 0x75, 0x8c, 0x89, 0xa4, 0x6b,
+    0x98, 0x99, 0x80, 0x7d, 0x6b, 0x7f, 0x7d, 0x88, 0x79, 0xa1, 0x87, 0x90,
+    0x81, 0x8e, 0x94, 0x96, 0x7d, 0xa8, 0x86, 0x84, 0x86, 0x79, 0x97, 0x6e,
+    0xaa, 0x95, 0x8a, 0x9f, 0x8c, 0x72, 0x99, 0x77, 0x81, 0x94, 0x91, 0x9f,
+    0x6e, 0x67, 0x87, 0x70, 0x7d, 0xad, 0x58, 0x7f, 0x6d, 0x96, 0x8e, 0x82,
+    0x7d, 0xa6, 0x77, 0x99, 0x87, 0x95, 0x89, 0x7e, 0xa6, 0x9e, 0x86, 0xac,
+    0x78, 0x9f, 0x9b, 0x85, 0x76, 0x99, 0x6a, 0x92, 0x66, 0x7b, 0x9a, 0x99,
+    0x83, 0x8b, 0x57, 0x65, 0x75, 0x9f, 0xa6, 0x8a, 0x8d, 0x96, 0x6f, 0x80,
+    0x65, 0x8f, 0x80, 0x9f, 0x82, 0x85, 0x55, 0x75, 0x5c, 0x84, 0x91, 0x86,
+    0x76, 0x96, 0x5a, 0x6c, 0x62, 0x7b, 0x92, 0x88, 0x61, 0xca, 0x75, 0x66,
+    0x70, 0x70, 0x8e, 0x7a, 0x75, 0xb2, 0x66, 0x81, 0x5b, 0x79, 0x92, 0x97,
+    0x94, 0xaf, 0x72, 0x8a, 0x9b, 0x5f, 0x65, 0x96, 0x81, 0xb6, 0x8a, 0x6f,
+    0x94, 0x7a, 0x96, 0x92, 0x79, 0x94, 0x8e, 0x53, 0x9a, 0x73, 0x6a, 0x9d,
+    0xa1, 0xa3, 0xa4, 0x8f, 0x6b, 0xa4, 0x8b, 0x82, 0x96, 0xb1, 0x8c, 0x92,
+    0x7f, 0x91, 0x5f, 0x98, 0x8a, 0xa4, 0x7e, 0x80, 0x97, 0x86, 0x86, 0x86,
+    0x8f, 0xa6, 0x77, 0x9a, 0x82, 0x80, 0x6e, 0x73, 0x83, 0xaf, 0x87, 0x6d,
+    0x77, 0x9a, 0x83, 0x9f, 0x7c, 0xa4, 0x71, 0x6f, 0x7d, 0x75, 0x9d, 0x82,
+    0x83, 0xaf, 0x85, 0x80, 0x8d, 0x7f, 0xa4, 0xa2, 0x88, 0xba, 0x76, 0x76,
+    0x94, 0x6b, 0x76, 0x83, 0x77, 0x96, 0x78, 0x8c, 0xb0, 0x8e, 0x83, 0x87,
+    0xa0, 0xcc, 0x7f, 0xa4, 0x8c, 0x77, 0x84, 0x8c, 0x80, 0xa0, 0x57, 0x76,
+    0x76, 0x71, 0x86, 0x9c, 0x7f, 0x88, 0x57, 0x95, 0x4d, 0x8c, 0x7f, 0x80,
+    0x66, 0x9e, 0x42, 0x8d, 0x6a, 0x8e, 0x8c, 0x80, 0x89, 0x9d, 0x4f, 0x83,
+    0x54, 0x8a, 0x5e, 0x64, 0x70, 0x94, 0x78, 0x90, 0x7d, 0x78, 0x8d, 0x71,
+    0x56, 0x9a, 0x8c, 0x65, 0x8b, 0x62, 0x88, 0x9a, 0x6c, 0x8e, 0x7b, 0x78,
+    0x68, 0x86, 0x64, 0x6b, 0x67, 0xaa, 0x8c, 0x7b, 0x67, 0x75, 0x58, 0x7e,
+    0x6b, 0x97, 0x92, 0x87, 0x9c, 0x79, 0x71, 0x76, 0x7d, 0xbb, 0x89, 0x75,
+    0x83, 0x57, 0x74, 0x98, 0xa1, 0x8f, 0xb0, 0x89, 0x76, 0x88, 0x69, 0x9c,
+    0x74, 0xb0, 0x86, 0x9c, 0x79, 0x6f, 0x84, 0x70, 0x94, 0xa1, 0x6e, 0x7a,
+    0xa3, 0x88, 0xa0, 0x7a, 0x94, 0xa1, 0x82, 0x93, 0x99, 0x95, 0x7f, 0xab,
+    0x97, 0x9d, 0x6e, 0x68, 0x79, 0x73, 0x76, 0x83, 0x76, 0xbd, 0x87, 0x87,
+    0x86, 0x74, 0x8f, 0x6e, 0x65, 0xba, 0x6a, 0x78, 0x91, 0x62, 0x72, 0x67,
+    0x75, 0xbd, 0x8c, 0x5e, 0x85, 0x6d, 0x72, 0x85, 0x7d, 0x96, 0x8f, 0xb9,
+    0x9f, 0x97, 0xa2, 0x8a, 0xa1, 0xc1, 0x8d, 0xbc, 0x85, 0x78, 0x93, 0x97,
+    0x99, 0x9f, 0x3a, 0x98, 0x65, 0x8d, 0x6a, 0x6c, 0x92, 0x85, 0x49, 0x7e,
+    0x6a, 0xaa, 0x8a, 0x94, 0x6b, 0x93, 0x40, 0x8a, 0x8c, 0x9c, 0x6f, 0xad,
+    0x72, 0xb0, 0x58, 0x88, 0x60, 0x8c, 0x86, 0x84, 0x74, 0x96, 0x8f, 0x97,
+    0x5e, 0x6c, 0x79, 0x92, 0x51, 0xa8, 0x92, 0x58, 0x62, 0x6f, 0x6c, 0x76,
+    0x5f, 0x9e, 0x86, 0x71, 0x9c, 0x69, 0x7e, 0x80, 0x8a, 0x97, 0x6f, 0x79,
+    0x8b, 0x6f, 0x6c, 0x88, 0x73, 0x9c, 0x6d, 0x91, 0x77, 0x73, 0x7f, 0x97,
+    0x86, 0xa9, 0xac, 0x71, 0x82, 0x90, 0x83, 0x8a, 0x80, 0x9d, 0xa8, 0x85,
+    0x78, 0x7f, 0x94, 0x99, 0x8e, 0xa3, 0x89, 0x70, 0x87, 0x62, 0x82, 0x87,
+    0x8c, 0x98, 0x7a, 0x88, 0x72, 0x7e, 0x78, 0xa0, 0x78, 0x95, 0x97, 0x8f,
+    0x7b, 0x7c, 0x83, 0x94, 0x93, 0xa7, 0x77, 0x97, 0x90, 0x5e, 0x76, 0x7c,
+    0x68, 0xaa, 0x69, 0x67, 0x76, 0x84, 0x7e, 0x64, 0xa3, 0xbe, 0x7e, 0x8b,
+    0x82, 0x50, 0x8a, 0x82, 0x89, 0xc0, 0x79, 0x78, 0x68, 0x7c, 0x6b, 0x77,
+    0x82, 0x99, 0x7b, 0x83, 0x80, 0x90, 0x96, 0x96, 0x87, 0xb7, 0xa5, 0x94,
+    0x82, 0x99, 0x95, 0x91, 0x7e, 0xa2, 0x49, 0x95, 0x6d, 0x8e, 0xa9, 0x89,
+    0x8e, 0x8f, 0x3d, 0x95, 0x6a, 0x8c, 0x8b, 0x8c, 0x7e, 0x88, 0x63, 0x94,
+    0x69, 0x94, 0x88, 0x92, 0x79, 0xa7, 0x68, 0x60, 0x76, 0x85, 0xa1, 0x6f,
+    0x54, 0x96, 0x63, 0x7a, 0x5c, 0x73, 0x74, 0x6e, 0x53, 0x99, 0x69, 0x76,
+    0x69, 0x57, 0x6a, 0x82, 0x55, 0x93, 0x82, 0x80, 0x65, 0x7f, 0x7b, 0x76,
+    0x72, 0x87, 0x8d, 0x97, 0x98, 0x78, 0x7e, 0x6d, 0x7a, 0x95, 0x78, 0x70,
+    0x90, 0x83, 0x89, 0x80, 0x7f, 0x9d, 0x73, 0x73, 0x84, 0x77, 0x8e, 0x77,
+    0x8e, 0x75, 0x9e, 0xa5, 0x86, 0x68, 0x89, 0x7d, 0x8d, 0x99, 0x79, 0x8f,
+    0x8e, 0x87, 0x87, 0x97, 0x8c, 0x91, 0xa1, 0x96, 0x83, 0x73, 0x87, 0xa9,
+    0x8c, 0xa6, 0x85, 0x8c, 0x96, 0x7d, 0x7f, 0x8e, 0x7e, 0xb0, 0x85, 0x8f,
+    0x7f, 0x7d, 0x95, 0x7d, 0x9c, 0xb3, 0x71, 0x86, 0x81, 0x69, 0x7b, 0x69,
+    0x76, 0xb6, 0x5d, 0x67, 0x8a, 0x68, 0x9c, 0xa6, 0x70, 0xbf, 0x79, 0x60,
+    0x8b, 0x7f, 0x7a, 0x7b, 0x8b, 0xaf, 0x8c, 0xa1, 0x86, 0x92, 0x76, 0x8d,
+    0x89, 0xa2, 0xa8, 0xa3, 0xa0, 0xa2, 0x96, 0x9d, 0x7c, 0x92, 0x3f, 0x9b,
+    0x6d, 0x8a, 0x80, 0x81, 0xa0, 0x92, 0x50, 0x7c, 0x82, 0x99, 0x80, 0xa6,
+    0x8e, 0x8d, 0x4f, 0x8d, 0x65, 0x71, 0x77, 0x81, 0x51, 0xa6, 0x3f, 0x5c,
+    0x63, 0x6f, 0x61, 0x93, 0x5c, 0xaa, 0x77, 0x8f, 0x5d, 0x53, 0x79, 0x74,
+    0x6b, 0x94, 0x86, 0x81, 0x85, 0x48, 0x81, 0x80, 0x6b, 0x85, 0x6c, 0x91,
+    0x92, 0x6a, 0x74, 0x78, 0x72, 0x87, 0x6c, 0x82, 0x88, 0x7b, 0x93, 0x71,
+    0x91, 0x8d, 0x67, 0x83, 0x86, 0x5b, 0x86, 0x79, 0x81, 0x9f, 0x95, 0x8a,
+    0x70, 0x66, 0x9e, 0x6b, 0x72, 0x98, 0x97, 0x95, 0x72, 0x93, 0x84, 0x92,
+    0x8c, 0x96, 0xa2, 0x65, 0x80, 0x75, 0xa2, 0xa7, 0x7d, 0x97, 0x71, 0x8f,
+    0x69, 0x65, 0x8f, 0xae, 0x9c, 0x97, 0x5d, 0xb3, 0x98, 0x83, 0x98, 0xa0,
+    0x5f, 0x7e, 0x7a, 0x7a, 0x87, 0x7c, 0x92, 0xa0, 0x81, 0xa6, 0x71, 0x8e,
+    0x88, 0x52, 0xa3, 0x88, 0x6a, 0x9d, 0x84, 0x82, 0x7c, 0x78, 0x9f, 0x92,
+    0x66, 0xa4, 0x53, 0x6a, 0x7e, 0x84, 0x60, 0x84, 0x92, 0xb0, 0x93, 0x9d,
+    0xa0, 0x5f, 0x95, 0x8c, 0x77, 0xa1, 0x8c, 0x90, 0xa0, 0x9c, 0x9a, 0x95,
+    0x85, 0xa1, 0x22, 0x8f, 0x57, 0x80, 0x96, 0x7d, 0x92, 0x8b, 0x41, 0xa6,
+    0x61, 0xa2, 0x6f, 0x80, 0x5d, 0x91, 0x66, 0xab, 0x6d, 0x7e, 0x88, 0x93,
+    0x5c, 0xa5, 0x75, 0x6e, 0x6c, 0x86, 0x69, 0x73, 0x4e, 0x8e, 0x77, 0x6b,
+    0x6c, 0x60, 0x67, 0x91, 0x75, 0x91, 0x6c, 0x7c, 0x53, 0x6e, 0x75, 0x8e,
+    0x79, 0x8c, 0x8b, 0x74, 0x6b, 0x57, 0x71, 0xa1, 0x7f, 0x83, 0x6c, 0x6b,
+    0x93, 0x99, 0x7a, 0x78, 0x71, 0x8c, 0x78, 0x88, 0x9f, 0x85, 0x77, 0x7b,
+    0x86, 0x85, 0xa1, 0x61, 0x78, 0x65, 0x61, 0x75, 0x82, 0x7d, 0xa9, 0xa2,
+    0x84, 0x82, 0x94, 0x95, 0x90, 0x9f, 0x83, 0x97, 0x76, 0x95, 0x8a, 0x83,
+    0x9b, 0x87, 0x8b, 0x7a, 0x6c, 0x6e, 0x75, 0x95, 0x85, 0x95, 0x84, 0x9e,
+    0x96, 0x74, 0x7d, 0xa5, 0x85, 0x8e, 0x7e, 0x73, 0x85, 0x8d, 0x87, 0x80,
+    0x8a, 0x96, 0x65, 0x87, 0x7c, 0x73, 0x80, 0x96, 0x73, 0x8d, 0x5e, 0x79,
+    0x7e, 0x8d, 0x79, 0x85, 0x63, 0xa0, 0x62, 0x89, 0x9d, 0x8c, 0x74, 0x7b,
+    0x9c, 0xa5, 0x71, 0x8c, 0x83, 0x91, 0x8e, 0x8d, 0x89, 0x8b, 0x8b, 0xa4,
+    0x78, 0x88, 0x9e, 0x85, 0x8b, 0x94, 0x38, 0x84, 0x7b, 0x86, 0x7d, 0xa2,
+    0x73, 0x8f, 0x47, 0x7b, 0x69, 0xb4, 0x85, 0x71, 0x61, 0x9d, 0x59, 0x95,
+    0x74, 0x93, 0x6a, 0x88, 0x62, 0xa2, 0x56, 0x93, 0x8d, 0x68, 0x7e, 0x80,
+    0x6b, 0xb7, 0x63, 0x90, 0x5d, 0x54, 0x6c, 0x90, 0x5a, 0x8e, 0x7e, 0x7d,
+    0x82, 0x73, 0x7f, 0x89, 0x94, 0x8e, 0x7a, 0x70, 0x6c, 0x79, 0x88, 0x88,
+    0x9b, 0x8b, 0x70, 0x81, 0x83, 0x83, 0x8b, 0x86, 0x64, 0x93, 0x82, 0x66,
+    0x66, 0x79, 0x74, 0x91, 0x92, 0x94, 0x7c, 0x87, 0x72, 0x79, 0x8d, 0xaa,
+    0xa2, 0x9e, 0xaf, 0x95, 0xb1, 0x8a, 0x95, 0x8b, 0x94, 0x7e, 0x79, 0x8e,
+    0x99, 0x98, 0x97, 0x9e, 0x94, 0x87, 0x74, 0x72, 0x63, 0x92, 0x92, 0x95,
+    0xb0, 0x94, 0x86, 0x91, 0x77, 0x8f, 0x91, 0x7e, 0x83, 0x88, 0x90, 0xa5,
+    0x79, 0x70, 0x85, 0x8f, 0x67, 0x90, 0x98, 0x8d, 0x8a, 0x5d, 0x8c, 0x9c,
+    0x94, 0x91, 0x80, 0x95, 0x6e, 0x95, 0x73, 0x8d, 0x63, 0x8e, 0x53, 0x8a,
+    0x77, 0x88, 0x8f, 0x6f, 0x87, 0x9e, 0x8b, 0xb7, 0x99, 0xb2, 0x85, 0x82,
+    0xa1, 0x89, 0x9b, 0xa7, 0x80, 0x81, 0xa0, 0x8e, 0x84, 0xa9, 0x27, 0x73,
+    0x5e, 0x85, 0x5f, 0x92, 0x8c, 0xa2, 0x34, 0x8e, 0x6e, 0xb2, 0x7b, 0x8c,
+    0x69, 0x93, 0x47, 0x9e, 0x58, 0x7e, 0x94, 0x86, 0x47, 0xa3, 0x53, 0x6b,
+    0x6e, 0x6a, 0x7f, 0x73, 0x5b, 0x8c, 0x7a, 0x99, 0x6c, 0x5d, 0x82, 0x82,
+    0x62, 0x8a, 0x7a, 0x8e, 0x88, 0x62, 0xa0, 0x8e, 0x5c, 0x9a, 0x72, 0x79,
+    0x66, 0x6b, 0x75, 0x78, 0x82, 0x8a, 0x59, 0x91, 0x93, 0x68, 0x78, 0xb4,
+    0x86, 0x7e, 0x8c, 0x6e, 0x88, 0x7f, 0x96, 0x8e, 0x6e, 0x8b, 0x8c, 0x73,
+    0xab, 0x79, 0x88, 0xa6, 0x86, 0x81, 0x9a, 0x80, 0x9a, 0x9e, 0x8b, 0x6d,
+    0x9a, 0x70, 0x8e, 0x8a, 0x84, 0x7a, 0xaf, 0xb8, 0x9e, 0x90, 0x89, 0xb3,
+    0x9b, 0x85, 0x94, 0xb6, 0x87, 0x8c, 0x6e, 0xa3, 0xac, 0x9e, 0x8c, 0x7c,
+    0x81, 0x83, 0x70, 0x8d, 0x7c, 0x81, 0x77, 0x82, 0x69, 0x8e, 0x5e, 0x80,
+    0x8a, 0x8e, 0x7c, 0x8a, 0x89, 0x90, 0x58, 0x59, 0x85, 0x88, 0x7a, 0x86,
+    0x73, 0x9c, 0x4a, 0x81, 0x8d, 0x89, 0x91, 0x95, 0x72, 0x83, 0x9d, 0x99,
+    0x8d, 0x6b, 0x95, 0x7e, 0x70, 0x94, 0x8c, 0x9f, 0x8a, 0x8f, 0xa7, 0x84,
+    0x87, 0xb6, 0x42, 0x81, 0x63, 0x8a, 0x79, 0x77, 0x74, 0x90, 0x23, 0x85,
+    0x74, 0x8f, 0x87, 0x80, 0x50, 0xa1, 0x4d, 0x9b, 0x55, 0x82, 0x74, 0x8e,
+    0x4a, 0xa7, 0x52, 0x4d, 0x77, 0x67, 0x77, 0x9e, 0x62, 0xa5, 0x7d, 0x96,
+    0x6f, 0x45, 0x80, 0x8c, 0x6c, 0x92, 0x99, 0x6f, 0x5d, 0x56, 0x93, 0xac,
+    0x94, 0x9c, 0x95, 0x92, 0x6e, 0x71, 0x87, 0x8c, 0x7b, 0xa9, 0x7f, 0x7a,
+    0x69, 0x6b, 0x7d, 0x90, 0x6f, 0x81, 0x9f, 0x80, 0x83, 0x67, 0x78, 0x85,
+    0x85, 0x91, 0x8a, 0x80, 0xaa, 0x86, 0x8c, 0x88, 0x8c, 0x8f, 0x9b, 0x85,
+    0x8b, 0x7e, 0x83, 0x82, 0x95, 0x75, 0x6b, 0x8f, 0x85, 0x8b, 0xb0, 0x9f,
+    0xa7, 0x8e, 0x61, 0x9d, 0x72, 0xac, 0x92, 0x87, 0x94, 0x96, 0x68, 0x8f,
+    0x63, 0x85, 0x9c, 0xa8, 0x82, 0x9b, 0x85, 0x9b, 0x6b, 0x72, 0x83, 0x85,
+    0x90, 0x87, 0x74, 0xa4, 0x88, 0x57, 0x63, 0x90, 0x8e, 0x7b, 0x80, 0x81,
+    0x94, 0x74, 0x68, 0x8a, 0x7f, 0x86, 0x78, 0x72, 0x75, 0x67, 0x7a, 0x8a,
+    0x7a, 0x74, 0x8c, 0xad, 0x75, 0xa2, 0x7d, 0x9a, 0x9e, 0x83, 0x92, 0xa2,
+    0xa3, 0x98, 0xa5, 0x91, 0x84, 0xb0, 0x21, 0x9a, 0x5f, 0x8c, 0x7e, 0x86,
+    0x80, 0xa0, 0x16, 0x9b, 0x5b, 0x9c, 0x76, 0x8d, 0x77, 0x9f, 0x62, 0x86,
+    0x6a, 0x6c, 0x6e, 0x8f, 0x4e, 0xc1, 0x61, 0x6f, 0x74, 0x79, 0x80, 0x5f,
+    0x59, 0x9e, 0x7c, 0x87, 0x7f, 0x4b, 0x6c, 0x8b, 0x5a, 0x8f, 0x65, 0x8a,
+    0x62, 0x58, 0x66, 0x8d, 0x83, 0x97, 0x8a, 0x7a, 0x77, 0x79, 0x6c, 0x83,
+    0x8c, 0x93, 0x82, 0x5e, 0x61, 0x8c, 0x82, 0x80, 0x88, 0x88, 0x85, 0x87,
+    0x77, 0x70, 0x8d, 0x7f, 0x7a, 0x89, 0x72, 0x7e, 0xa3, 0x99, 0x6b, 0xaa,
+    0x81, 0x87, 0x90, 0x6f, 0x7f, 0x77, 0x96, 0x83, 0x89, 0x89, 0x6a, 0x77,
+    0xa4, 0x6c, 0x97, 0x7e, 0x95, 0xa4, 0x63, 0x8d, 0x71, 0x96, 0x8a, 0xa4,
+    0x9f, 0x7c, 0x54, 0x94, 0x7a, 0x89, 0x8a, 0x90, 0x7e, 0x9d, 0x53, 0x7c,
+    0x9d, 0x83, 0x90, 0x84, 0xa1, 0x8e, 0x80, 0x74, 0x69, 0x7a, 0x69, 0x93,
+    0x8a, 0x90, 0x83, 0x76, 0x8b, 0x6f, 0x8e, 0x93, 0x82, 0x84, 0x7d, 0x94,
+    0xa1, 0x78, 0x7d, 0x68, 0x79, 0x83, 0x85, 0x9d, 0x89, 0xa0, 0x8a, 0x93,
+    0x90, 0x8c, 0x82, 0x86, 0x80, 0x71, 0xb3, 0xa1, 0x90, 0xb2, 0x27, 0xa3,
+    0x5e, 0xa3, 0xa6, 0x64, 0x75, 0xa0, 0x23, 0x8c, 0x7c, 0xc4, 0x7a, 0x8c,
+    0x4d, 0xa3, 0x4c, 0x93, 0x71, 0x7b, 0x71, 0x8b, 0x34, 0xa5, 0x47, 0x7f,
+    0x4e, 0x73, 0x51, 0x8a, 0x67, 0xa0, 0x9d, 0x7f, 0x65, 0x38, 0x61, 0x70,
+    0x71, 0x8d, 0x6a, 0x7e, 0x7e, 0x4c, 0x7d, 0x8d, 0x81, 0x80, 0xa5, 0x84,
+    0x6f, 0x57, 0x70, 0x91, 0x8b, 0x99, 0x9d, 0x84, 0x77, 0x7f, 0x6b, 0x7f,
+    0x76, 0x8f, 0x90, 0x72, 0x6c, 0x58, 0x6b, 0x85, 0xa6, 0x8a, 0xa2, 0x6d,
+    0x8a, 0x71, 0x71, 0x95, 0x92, 0x7c, 0x88, 0x67, 0x86, 0x6d, 0x8d, 0x95,
+    0x79, 0x8e, 0x65, 0x71, 0x71, 0x91, 0x85, 0x99, 0xa9, 0x87, 0x80, 0x88,
+    0x74, 0x86, 0x75, 0x83, 0x8b, 0x7f, 0x78, 0xb1, 0x90, 0xa8, 0x7b, 0x98,
+    0x8a, 0x7b, 0x5b, 0x99, 0x6f, 0x7f, 0xa0, 0x79, 0xa5, 0x93, 0x8b, 0x7b,
+    0x7e, 0x7a, 0x61, 0x9d, 0x98, 0x8b, 0x82, 0x7c, 0x76, 0x73, 0x81, 0x8a,
+    0x7e, 0x8d, 0x6e, 0x71, 0xa0, 0x65, 0x80, 0x62, 0x7d, 0x8d, 0x5e, 0x9b,
+    0x8f, 0x85, 0x89, 0xad, 0x71, 0x73, 0x7f, 0x89, 0x8d, 0x89, 0xb3, 0xa1,
+    0x7c, 0xaf, 0x43, 0x82, 0x49, 0x92, 0x62, 0x7f, 0x79, 0xa6, 0x23, 0x99,
+    0x6c, 0x9a, 0x8a, 0x90, 0x6c, 0xb9, 0x6f, 0x8a, 0x61, 0x7f, 0x8f, 0x8a,
+    0x57, 0xb9, 0x55, 0x65, 0x4b, 0x51, 0x66, 0x6e, 0x4a, 0xa1, 0x83, 0x8a,
+    0x73, 0x23, 0x8a, 0x6d, 0x46, 0xa7, 0x87, 0x64, 0x84, 0x5f, 0x6f, 0x6f,
+    0x9b, 0x9d, 0x76, 0x83, 0x60, 0x6e, 0x76, 0x8a, 0x9a, 0xa6, 0x75, 0x73,
+    0x86, 0x5b, 0x97, 0x88, 0x7b, 0x8e, 0x82, 0x5c, 0x97, 0x71, 0x74, 0x85,
+    0x83, 0x91, 0x89, 0x6f, 0x93, 0x94, 0x8b, 0xa9, 0x7d, 0x84, 0x80, 0x89,
+    0x97, 0x80, 0x65, 0x92, 0x9a, 0x85, 0x5a, 0x6a, 0x6b, 0x58, 0x6f, 0x8c,
+    0x9a, 0x8b, 0x6e, 0x81, 0x9d, 0xae, 0x8c, 0x86, 0x8d, 0x90, 0x6c, 0xb8,
+    0x91, 0x89, 0x98, 0xbd, 0x8b, 0x78, 0x7d, 0x87, 0x9c, 0x72, 0x73, 0x80,
+    0x9e, 0x92, 0x5d, 0x77, 0x78, 0x4f, 0x87, 0x7b, 0x7a, 0x9e, 0x74, 0x67,
+    0x6a, 0x58, 0x95, 0x80, 0x75, 0x97, 0x81, 0x75, 0x94, 0x75, 0x73, 0x92,
+    0x83, 0x7b, 0x6b, 0x8e, 0x82, 0x6e, 0x7d, 0x9b, 0x91, 0x7f, 0x9e, 0xaa,
+    0x8c, 0xa3, 0xa8, 0x8c, 0x9a, 0xc1, 0x28, 0xac, 0x49, 0x9b, 0x59, 0x8a,
+    0x60, 0xa7, 0x39, 0xa7, 0x75, 0x9b, 0x95, 0x94, 0x76, 0xb3, 0x4a, 0x6b,
+    0x60, 0x6c, 0xa5, 0x71, 0x40, 0xc4, 0x4c, 0x7c, 0x76, 0x7b, 0x67, 0x76,
+    0x76, 0xa4, 0x7b, 0x83, 0x67, 0x4d, 0x87, 0x87, 0x6e, 0x93, 0x84, 0x70,
+    0x78, 0x41, 0x87, 0x9f, 0x7a, 0x8c, 0x87, 0x69, 0x73, 0x6c, 0x93, 0x73,
+    0x77, 0xa2, 0x52, 0x72, 0x5c, 0x75, 0x6c, 0x8f, 0x65, 0x92, 0x87, 0x52,
+    0x67, 0x54, 0x54, 0x75, 0x90, 0x9c, 0x91, 0x6f, 0xa3, 0x86, 0x87, 0x9c,
+    0x99, 0x86, 0x9f, 0x71, 0x8a, 0x7a, 0x7a, 0x97, 0x7a, 0x86, 0x6c, 0x99,
+    0x89, 0x7e, 0x9c, 0x83, 0x98, 0x78, 0x73, 0x7f, 0x91, 0x96, 0x9a, 0x8d,
+    0xb0, 0x9e, 0x6a, 0x80, 0x92, 0x86, 0x95, 0x83, 0x94, 0x92, 0x6f, 0x86,
+    0x8a, 0x52, 0x6e, 0x82, 0x84, 0x8b, 0x77, 0x88, 0x70, 0x54, 0x8f, 0x7f,
+    0x7d, 0x7e, 0x57, 0x89, 0x6d, 0x6f, 0x9c, 0x93, 0x90, 0x93, 0x52, 0x70,
+    0x75, 0x92, 0x73, 0x88, 0x93, 0x77, 0x77, 0x91, 0x89, 0xa2, 0x9d, 0xa6,
+    0xae, 0x84, 0x7d, 0xab, 0x92, 0x7e, 0x9c, 0x98, 0x7b, 0xc3, 0x38, 0x98,
+    0x4f, 0x97, 0x8f, 0x93, 0x62, 0xb8, 0x23, 0xa4, 0x6d, 0x9c, 0x81, 0x8e,
+    0x6f, 0x9d, 0x56, 0x89, 0x50, 0x94, 0x70, 0x77, 0x5d, 0xb7, 0x60, 0x5b,
+    0x72, 0x45, 0x81, 0x8c, 0x66, 0xbc, 0x8f, 0x7f, 0x57, 0x43, 0x85, 0x96,
+    0x5a, 0xb2, 0x91, 0x7d, 0x6c, 0x3a, 0x73, 0x92, 0x63, 0x93, 0x89, 0x90,
+    0x7f, 0x52, 0x7f, 0x7b, 0xa1, 0xa6, 0x8f, 0x60, 0x78, 0x51, 0x5f, 0xac,
+    0x7b, 0x89, 0x88, 0x97, 0x7e, 0x64, 0x57, 0x72, 0x6c, 0x96, 0x74, 0x78,
+    0xab, 0x66, 0x62, 0x8d, 0x6f, 0x86, 0x91, 0x93, 0x7d, 0x74, 0x82, 0x80,
+    0x73, 0x84, 0x9c, 0x8e, 0x68, 0x69, 0x9e, 0xa1, 0x8a, 0x83, 0x7a, 0x87,
+    0x94, 0x8c, 0x83, 0x7e, 0x91, 0x92, 0x82, 0x7b, 0xa0, 0x8e, 0x73, 0x86,
+    0xa9, 0x95, 0x7c, 0xa5, 0x6c, 0x6f, 0x8c, 0x87, 0xa6, 0x8a, 0x77, 0x86,
+    0x7d, 0x79, 0x89, 0x75, 0x8f, 0x82, 0x54, 0x61, 0x82, 0x8e, 0x80, 0x84,
+    0x7b, 0x8e, 0x61, 0x82, 0x86, 0x77, 0x7d, 0x7c, 0x7e, 0x6c, 0x7b, 0xad,
+    0x7b, 0x90, 0x88, 0x80, 0x64, 0x83, 0x7e, 0xa7, 0x83, 0x7e, 0xb5, 0xbb,
+    0x88, 0xd9, 0x21, 0x9a, 0x4d, 0x9f, 0x91, 0x97, 0x64, 0xb5, 0x1c, 0x8a,
+    0x5f, 0xaf, 0x7e, 0x7b, 0x67, 0xad, 0x48, 0x7f, 0x4e, 0x87, 0x8f, 0x7c,
+    0x46, 0xab, 0x70, 0x7f, 0x4b, 0x4e, 0x48, 0x8c, 0x63, 0xc5, 0xa2, 0x7f,
+    0x68, 0x3b, 0x59, 0x7f, 0x53, 0xa1, 0x8e, 0x6e, 0x7a, 0x4a, 0x5f, 0x62,
+    0x5b, 0xa1, 0x62, 0x78, 0x74, 0x57, 0x78, 0x91, 0x7b, 0x9b, 0x75, 0x73,
+    0x73, 0x72, 0x94, 0x92, 0x79, 0xaa, 0x94, 0x75, 0x86, 0x58, 0x8c, 0x71,
+    0x77, 0x91, 0xa5, 0x74, 0x8f, 0x73, 0x89, 0x77, 0x68, 0x8e, 0x90, 0x96,
+    0x9f, 0x79, 0x77, 0x7d, 0x89, 0x9b, 0x8c, 0x94, 0x81, 0x88, 0x91, 0x8f,
+    0x9b, 0x91, 0x78, 0x87, 0x82, 0x72, 0xa7, 0xa2, 0x85, 0x98, 0xa3, 0x91,
+    0x83, 0x75, 0x72, 0x93, 0x80, 0x8f, 0x85, 0x70, 0x97, 0x58, 0x9f, 0x72,
+    0x91, 0x8e, 0x93, 0x74, 0x97, 0x73, 0x74, 0x91, 0x80, 0x84, 0x96, 0x94,
+    0x76, 0x69, 0x66, 0x9e, 0x81, 0x8a, 0x8b, 0x63, 0x65, 0x7c, 0xa1, 0x9a,
+    0x72, 0x84, 0x9e, 0x89, 0x9a, 0x86, 0x98, 0x7f, 0x77, 0x85, 0x82, 0xaa,
+    0xa3, 0x88, 0xac, 0x9e, 0x76, 0xca, 0x2b, 0xa0, 0x40, 0xad, 0x6f, 0x6c,
+    0x66, 0xc8, 0x07, 0x9e, 0x3e, 0x9f, 0x85, 0x9f, 0x5e, 0xb7, 0x53, 0x91,
+    0x56, 0x6d, 0x62, 0x95, 0x4c, 0xc7, 0x46, 0x56, 0x4b, 0x5d, 0x6f, 0x52,
+    0x4d, 0xa3, 0x8c, 0x90, 0x78, 0x4d, 0x58, 0x8d, 0x53, 0x93, 0x8e, 0x68,
+    0x6f, 0x3b, 0x49, 0x86, 0x6e, 0x9d, 0x76, 0x74, 0x5b, 0x44, 0x7b, 0x8c,
+    0x89, 0xb0, 0x64, 0x62, 0x6a, 0x6d, 0x7a, 0xae, 0x84, 0x95, 0x8c, 0x71,
+    0x8b, 0x60, 0x82, 0x9e, 0x8c, 0xa8, 0x90, 0x66, 0xa1, 0x7b, 0x65, 0x82,
+    0x8f, 0x7d, 0x8d, 0x78, 0x8e, 0x5f, 0x75, 0x88, 0x5d, 0x93, 0xa1, 0x93,
+    0x6b, 0x67, 0x7a, 0xa7, 0x92, 0x8c, 0x65, 0x88, 0x95, 0x93, 0x87, 0x81,
+    0x9c, 0x97, 0x62, 0x9d, 0x90, 0x62, 0xa1, 0x9f, 0x87, 0x94, 0x94, 0x99,
+    0x92, 0x8f, 0x71, 0x80, 0x77, 0x82, 0x92, 0x78, 0x67, 0x69, 0x7e, 0x81,
+    0x93, 0x89, 0x80, 0x9b, 0x71, 0x57, 0x63, 0x83, 0x7b, 0x9f, 0x5d, 0x92,
+    0x85, 0x96, 0x7e, 0x92, 0x84, 0x7f, 0x81, 0xa3, 0xa8, 0x96, 0x91, 0x8e,
+    0x8c, 0x8e, 0x7d, 0xb0, 0x86, 0x72, 0x9d, 0x8e, 0x8e, 0xd0, 0x05, 0x77,
+    0x45, 0xad, 0x91, 0x95, 0x71, 0xb8, 0x01, 0x9a, 0x41, 0xb8, 0x94, 0x6e,
+    0x63, 0xd3, 0x58, 0x8c, 0x5a, 0x89, 0x85, 0x83, 0x52, 0xc1, 0x7b, 0x6a,
+    0x65, 0x6e, 0x73, 0x63, 0x68, 0xba, 0x67, 0x78, 0x79, 0x4a, 0x73, 0x8f,
+    0x51, 0xc9, 0x85, 0x8a, 0x6b, 0x45, 0x6a, 0x8f, 0x6c, 0xad, 0x8a, 0x8d,
+    0x6a, 0x6e, 0x6b, 0x7f, 0x86, 0xb4, 0x88, 0x7d, 0xaa, 0x71, 0x5c, 0x69,
+    0x5d, 0xa8, 0x62, 0x7d, 0x6c, 0x6e, 0x6f, 0x6a, 0x7c, 0x9d, 0x7a, 0x83,
+    0x7d, 0x79, 0x7b, 0x9c, 0x73, 0x93, 0x7f, 0x9d, 0x8c, 0x75, 0x78, 0x83,
+    0x85, 0x88, 0x81, 0x81, 0x98, 0x79, 0xa3, 0xae, 0x5b, 0x90, 0x89, 0x9d,
+    0x6d, 0x90, 0xa3, 0x8e, 0x87, 0x96, 0x60, 0xa7, 0x76, 0x82, 0x81, 0x84,
+    0x84, 0x9c, 0x73, 0x8a, 0x6c, 0x58, 0x64, 0x96, 0x89, 0x8b, 0x76, 0x60,
+    0x91, 0x72, 0x7f, 0x86, 0x9a, 0x89, 0x67, 0x7d, 0x77, 0x84, 0x73, 0x5c,
+    0x67, 0x8a, 0x82, 0x8c, 0x8c, 0x94, 0x8a, 0xa2, 0xaa, 0x7e, 0x5f, 0x7f,
+    0x86, 0x90, 0x96, 0xab, 0x8d, 0x91, 0x7c, 0xb6, 0x82, 0x8d, 0xb8, 0xa9,
+    0x92, 0xea, 0x1b, 0x74, 0x25, 0xab, 0x8d, 0x61, 0x81, 0xd8, 0x2c, 0x86,
+    0x2f, 0xcf, 0xa2, 0x84, 0x7f, 0xa4, 0x36, 0x86, 0x47, 0x8d, 0x60, 0x8a,
+    0x62, 0xb1, 0x4a, 0x54, 0x48, 0x73, 0x64, 0x9d, 0x72, 0xb2, 0x76, 0x4c,
+    0x8e, 0x4e, 0x76, 0x94, 0x7c, 0xad, 0x74, 0x6c, 0x6c, 0x54, 0x7f, 0x63,
+    0x97, 0xb3, 0x74, 0x6c, 0x99, 0x5f, 0x86, 0x6a, 0xa3, 0x94, 0x7c, 0x83,
+    0x8d, 0x81, 0x79, 0xac, 0x61, 0x9b, 0x65, 0x7b, 0x66, 0x89, 0x60, 0x76,
+    0x8d, 0x93, 0x8d, 0x84, 0x71, 0x65, 0x82, 0x8c, 0x94, 0xa7, 0x59, 0xa1,
+    0x8b, 0x72, 0x84, 0x65, 0x75, 0x95, 0x62, 0x71, 0x71, 0x7e, 0x7b, 0x97,
+    0x9b, 0x9a, 0x80, 0xb1, 0x77, 0x7a, 0x73, 0x8e, 0x9c, 0x8c, 0x7d, 0x96,
+    0x89, 0x7d, 0x7e, 0x80, 0x8e, 0x93, 0x63, 0x72, 0x6b, 0x57, 0x78, 0x8f,
+    0x90, 0x86, 0x62, 0x75, 0x7e, 0x54, 0x7d, 0x95, 0x85, 0x84, 0x73, 0x7b,
+    0x8f, 0x9e, 0x72, 0x8c, 0x90, 0x96, 0x8e, 0x6c, 0x80, 0x8b, 0x9e, 0x8c,
+    0x87, 0x8e, 0x9b, 0x97, 0x8f, 0x94, 0xa3, 0x6b, 0xad, 0x93, 0x8a, 0x96,
+    0x8d, 0x91, 0xa6, 0x8a, 0x9e, 0xce, 0x6b, 0x98, 0x6d, 0xa9, 0x92, 0x92,
+    0x7c, 0xe2, 0x63, 0x97, 0x42, 0xc8, 0xa3, 0xa0, 0x88, 0xdc, 0x75, 0x9b,
+    0x51, 0x7d, 0x5c, 0x80, 0x89, 0xc0, 0x83, 0x5e, 0x5e, 0xa4, 0x3e, 0x74,
+    0x9b, 0xb6, 0x7f, 0x63, 0x78, 0x7d, 0x74, 0x57, 0x93, 0xa2, 0x83, 0x70,
+    0x5e, 0x7d, 0x60, 0x69, 0x93, 0x9e, 0x79, 0x86, 0x91, 0x67, 0x86, 0x95,
+    0xa2, 0xad, 0x62, 0x74, 0x68, 0x7e, 0x7e, 0x82, 0x8c, 0xb0, 0xa0, 0x63,
+    0x8b, 0x82, 0x8f, 0x8c, 0xa4, 0xa3, 0x76, 0x6c, 0x8e, 0x87, 0x72, 0x85,
+    0xaa, 0xa4, 0x7f, 0x7b, 0x8e, 0x9a, 0x69, 0x91, 0x9d, 0xa0, 0x81, 0x92,
+    0x90, 0x85, 0x66, 0x82, 0xa3, 0xa9, 0x7f, 0x8f, 0x83, 0x9d, 0x8b, 0x8d,
+    0x96, 0xa3, 0x8f, 0x7a, 0x6d, 0x89, 0x74, 0x8a, 0xa9, 0xa9, 0x7b, 0x77,
+    0x93, 0x8b, 0x63, 0x92, 0x99, 0x8b, 0x88, 0x4f, 0x87, 0x7c, 0x67, 0x78,
+    0x83, 0xa5, 0xa5, 0x58, 0x8d, 0x70, 0x86, 0x82, 0x9e, 0xa7, 0xa5, 0x96,
+    0x8d, 0x7b, 0x96, 0x8c, 0x95, 0xa3, 0x8d, 0x9c, 0x92, 0x95, 0x98, 0x94,
+    0x87, 0x90, 0x92, 0x92, 0x95, 0x96, 0xad, 0x6e, 0x97, 0x8c, 0x92, 0x7f,
+    0x95, 0x8b, 0x8a, 0x90, 0x9b, 0x87, 0x9e, 0x86, 0x91, 0xa0, 0x68, 0x82,
+    0x85, 0x8e, 0x82, 0xa8, 0x9f, 0x68, 0x87, 0x75, 0x9b, 0x70, 0x95, 0x91,
+    0x6c, 0x77, 0x8b, 0x7b, 0x95, 0x80, 0x99, 0x65, 0x95, 0x82, 0x92, 0x9a,
+    0x8a, 0x65, 0x70, 0x8c, 0x98, 0x9e, 0x80, 0x7b, 0xa5, 0x9b, 0x93, 0x94,
+    0x84, 0x6a, 0x69, 0x82, 0x80, 0x7a, 0x75, 0x72, 0x94, 0x79, 0xad, 0xb2,
+    0x81, 0x8b, 0x85, 0x6c, 0x86, 0x88, 0x9e, 0x79, 0x86, 0x9e, 0x7e, 0x91,
+    0x7b, 0x6d, 0x93, 0x91, 0x82, 0x97, 0x6b, 0xa6, 0xaa, 0x9f, 0xa8, 0x74,
+    0x94, 0x7f, 0x63, 0x98, 0x90, 0xa1, 0x8c, 0x7f, 0x71, 0x86, 0x89, 0x95,
+    0x88, 0x80, 0x77, 0x67, 0x85, 0x7d, 0x89, 0x6d, 0x9c, 0x76, 0x72, 0x8d,
+    0x96, 0x94, 0x88, 0x98, 0x9f, 0x94, 0x8e, 0x84, 0x7a, 0x88, 0x79, 0x9f,
+    0x81, 0xa1, 0x7c, 0x8b, 0x71, 0x79, 0x7d, 0x9d, 0x7b, 0x6a, 0x8c, 0x66,
+    0x9e, 0x7b, 0x77, 0x7a, 0xb0, 0x74, 0x7f, 0x8d, 0x8d, 0x71, 0x72, 0x84,
+    0x90, 0x98, 0x7b, 0x89, 0x9b, 0x8e, 0x85, 0x7a, 0x67, 0x8a, 0x72, 0x84,
+    0x82, 0x91, 0x91, 0x7a, 0x85, 0x8a, 0xae, 0x8a, 0x9a, 0x9a, 0x7f, 0x85,
+    0x8a, 0x90, 0x69, 0x7b, 0x76, 0x78, 0x98, 0x54, 0x94, 0x7e, 0x6c, 0x72,
+    0x89, 0x88, 0x82, 0x96, 0x59, 0x95, 0x76, 0x91, 0x94, 0x96, 0x83, 0x84,
+    0x72, 0x8d, 0x97, 0x71, 0x68, 0x8e, 0x88, 0x8b, 0x7c, 0xa9, 0x73, 0x8a,
+    0x95, 0x86, 0x87, 0x96, 0x91, 0x77, 0xb1, 0x88, 0x6e, 0x7d, 0x7c, 0x9f,
+    0x8f, 0x82, 0x79, 0x83, 0xa6, 0x81, 0x89, 0x83, 0x85, 0x9b, 0x7c, 0x68,
+    0x6f, 0x84, 0x7c, 0xa1, 0x8e, 0x80, 0x78, 0x8f, 0x96, 0x77, 0x7e, 0x7b,
+    0x8f, 0x81, 0xa5, 0x84, 0x86, 0x91, 0x7b, 0x73, 0x92, 0x85, 0xa3, 0x7e,
+    0x80, 0x95, 0x7d, 0x5f, 0x8c, 0x94, 0x95, 0x73, 0x95, 0x78, 0x87, 0xa1,
+    0x94, 0x6c, 0xac, 0x6c, 0x77, 0x89, 0x86, 0x9c, 0x82, 0x76, 0x99, 0x93,
+    0x92, 0x88, 0x80, 0x80, 0x85, 0x8a, 0xa8, 0x8f, 0x7a, 0x89, 0x9a, 0x7a,
+    0x8f, 0x91, 0x86, 0x82, 0x7f, 0x82, 0x91, 0x95, 0x85, 0x71, 0x7d, 0x8f,
+    0x83, 0x8c, 0x79, 0x97, 0x7a, 0x9b, 0x91, 0x88, 0xa2, 0x86, 0x8a, 0x80,
+    0xa0, 0x96, 0x8b, 0x7d, 0x76, 0x96, 0x9f, 0x8d, 0x95, 0x8a, 0x94, 0xa0,
+    0x80, 0x95, 0x9b, 0x96, 0x81, 0xa8, 0x59, 0x89, 0x92, 0xb2, 0x83, 0x89,
+    0x85, 0x81, 0x7e, 0x64, 0x77, 0x82, 0x90, 0x96, 0x7e, 0x9f, 0xab, 0x8a,
+    0x6e, 0x9b, 0x90, 0x89, 0x6e, 0x7d, 0x81, 0x65, 0x81, 0x86, 0xa1, 0x93,
+    0x8b, 0x83, 0x81, 0x89, 0x8b, 0x90, 0x7e, 0x97, 0x8e, 0x75, 0x7e, 0x7e,
+    0x7b, 0x81, 0x9a, 0x64, 0x90, 0xab, 0x90, 0x82, 0x8a, 0x82, 0x8d, 0xad,
+    0x90, 0x74, 0x7f, 0x9a, 0x88, 0x92, 0x83, 0x97, 0xa6, 0x6e, 0x9d, 0x81,
+    0xa2, 0x98, 0x74, 0x84, 0x93, 0x85, 0x84, 0x7d, 0xa2, 0x92, 0x92, 0x87,
+    0x73, 0x8b, 0x92, 0x74, 0x96, 0x70, 0x83, 0x86, 0x8a, 0x89, 0x86, 0x88,
+    0x87, 0x7c, 0x7d, 0x81, 0x8d, 0x71, 0x8c, 0x89, 0x70, 0x94, 0x8f, 0x9a,
+    0x83, 0x9d, 0x99, 0x78, 0x74, 0x88, 0x84, 0x9a, 0x95, 0x8b, 0x8e, 0x7f,
+    0xa2, 0xa0, 0x76, 0x93, 0x9b, 0x7c, 0x97, 0x81, 0x83, 0x8c, 0xa1, 0x99,
+    0x9d, 0x7f, 0x87, 0x75, 0xa7, 0x75, 0x89, 0x7e, 0x88, 0x80, 0x8f, 0x84,
+    0x9a, 0x77, 0x8d, 0x90, 0x9d, 0x6c, 0x88, 0x8d, 0x8e, 0x81, 0x97, 0x6d,
+    0x81, 0x88, 0x64, 0x8c, 0x77, 0x8e, 0x91, 0x8a, 0x7f, 0x8a, 0x94, 0x7a,
+    0x89, 0x93, 0x8c, 0x69, 0x85, 0x8c, 0x93, 0x61, 0x7e, 0x89, 0x7e, 0x8a,
+    0x65, 0x8a, 0xa9, 0x7f, 0x80, 0x86, 0x82, 0x90, 0x66, 0x7a, 0x99, 0x71,
+    0x7f, 0x73, 0x8d, 0x94, 0x7d, 0x73, 0x7a, 0x7d, 0x87, 0x7a, 0x97, 0x70,
+    0x81, 0x60, 0x61, 0x7a, 0x91, 0x88, 0x93, 0x7a, 0x9e, 0xa6, 0x92, 0x9d,
+    0x92, 0x67, 0x99, 0x9a, 0xae, 0x71, 0x89, 0xa5, 0x9f, 0xa6, 0x98, 0x89,
+    0x97, 0x90, 0x9b, 0x9a, 0xc0, 0x95, 0x8f, 0x9c, 0x95, 0x93, 0x88, 0x95,
+    0x95, 0xa0, 0x8e, 0x8c, 0xa8, 0x94, 0x6e, 0x9e, 0x6f, 0x7b, 0xa5, 0x96,
+    0x98, 0x90, 0x91, 0x89, 0x93, 0x8f, 0x84, 0xb2, 0x7f, 0x5e, 0xc2, 0x75,
+    0x8f, 0x90, 0x9c, 0xbf, 0x8a, 0x84, 0xa6, 0x85, 0x7d, 0x84, 0x8a, 0xad,
+    0x6f, 0x88, 0xac, 0x77, 0x91, 0x8d, 0x94, 0xac, 0x8f, 0x7f, 0xa1, 0xa5,
+    0x8e, 0x6d, 0x8a, 0x82, 0x85, 0x80, 0x9b, 0x7a, 0x9f, 0x60, 0x95, 0x97,
+    0x90, 0x67, 0x8f, 0x91, 0x86, 0x89, 0x88, 0x89, 0x96, 0x6c, 0x8b, 0x94,
+    0x8a, 0x75, 0x84, 0x96, 0x8a, 0x86, 0x7c, 0x91, 0x74, 0x8f, 0x97, 0x89,
+    0x8f, 0x8e, 0x6b, 0x97, 0x93, 0x89, 0x6b, 0x7e, 0x65, 0xa4, 0xa5, 0x63,
+    0x85, 0x88, 0x81, 0xa3, 0x70, 0x9b, 0x9e, 0x8c, 0x62, 0x73, 0x85, 0xb4,
+    0x88, 0x6e, 0x92, 0x6f, 0x91, 0x88, 0x79, 0x91, 0x7f, 0x7d, 0x9a, 0x6b,
+    0x78, 0x93, 0x7e, 0x79, 0x93, 0x7a, 0x74, 0x91, 0x8d, 0x92, 0xb3, 0x61,
+    0xa3, 0x76, 0x81, 0x99, 0x96, 0x8b, 0x93, 0x8f, 0xa7, 0x6f, 0x8f, 0xa6,
+    0xb2, 0x76, 0xa1, 0x83, 0xa8, 0x8b, 0xae, 0x99, 0x90, 0x6a, 0x97, 0x97,
+    0xaa, 0x95, 0x85, 0x7d, 0x97, 0x94, 0x86, 0x94, 0x89, 0xa4, 0xa9, 0x81,
+    0x89, 0x7c, 0x96, 0xb3, 0x92, 0x7d, 0xa4, 0x6f, 0x6d, 0x92, 0x83, 0xb4,
+    0x7b, 0x94, 0x8c, 0x79, 0x61, 0x6f, 0x8f, 0xb7, 0x88, 0x66, 0xaa, 0x7d,
+    0x89, 0x7f, 0x90, 0xbd, 0x99, 0xac, 0xb1, 0x96, 0x9c, 0x7c, 0x92, 0xb7,
+    0x73, 0x94, 0xad, 0x9d, 0x7c, 0x80, 0x87, 0x96, 0x73, 0x8d, 0xa8, 0x88,
+    0xa9, 0x83, 0x7b, 0x84, 0x9d, 0x99, 0x83, 0x89, 0x9d, 0x7f, 0x7e, 0x86,
+    0x75, 0x83, 0x77, 0x7d, 0x8b, 0x7d, 0x80, 0x9d, 0xa2, 0x94, 0x72, 0x92,
+    0x75, 0x95, 0x99, 0xa0, 0x7b, 0x83, 0x99, 0x89, 0x82, 0x92, 0x5b, 0x9e,
+    0x7c, 0x91, 0x95, 0x79, 0x61, 0x86, 0x60, 0xc7, 0x72, 0x91, 0xb5, 0x88,
+    0x71, 0x8d, 0x85, 0x91, 0x83, 0x74, 0xa8, 0x67, 0x79, 0x77, 0x7f, 0x79,
+    0x68, 0x84, 0x95, 0x69, 0x98, 0x88, 0x74, 0x72, 0x9c, 0x86, 0x87, 0x95,
+    0x90, 0x95, 0x9b, 0x8b, 0xc5, 0x7d, 0x81, 0x8f, 0x88, 0x8c, 0xb0, 0x95,
+    0xa8, 0x8c, 0x84, 0xa0, 0xb0, 0x89, 0x9a, 0x90, 0xaa, 0x88, 0x96, 0x9b,
+    0x88, 0xa9, 0x89, 0x99, 0xb7, 0x82, 0x99, 0xa0, 0x85, 0x70, 0x9c, 0x9a,
+    0x94, 0x74, 0x91, 0x81, 0x76, 0x70, 0x8f, 0xc2, 0x8c, 0x91, 0x8f, 0x69,
+    0x74, 0x7e, 0x6d, 0x9a, 0x80, 0x77, 0xa5, 0x94, 0x8b, 0x6d, 0x82, 0xcf,
+    0x8e, 0x74, 0xc4, 0x86, 0x7f, 0x78, 0x72, 0xb3, 0x78, 0x7a, 0xac, 0x9c,
+    0x7d, 0x77, 0x8d, 0xca, 0x67, 0x8c, 0xd5, 0x8f, 0x7f, 0x71, 0x70, 0x82,
+    0x7e, 0x9f, 0xb0, 0x7f, 0x75, 0x90, 0x79, 0x7b, 0x8d, 0x7b, 0xa6, 0x87,
+    0x98, 0x76, 0x84, 0x96, 0x81, 0x6a, 0x96, 0x86, 0x8e, 0x77, 0xa3, 0x83,
+    0x91, 0x83, 0x8a, 0x6c, 0x74, 0x83, 0x99, 0x7d, 0x7c, 0x8a, 0x88, 0x9a,
+    0x6b, 0x86, 0x59, 0xa3, 0x8a, 0x8e, 0xbb, 0x8a, 0x75, 0x78, 0x68, 0xb5,
+    0x9b, 0x7b, 0xa7, 0x93, 0x5b, 0x6c, 0x6b, 0xa0, 0x74, 0x99, 0xc0, 0x73,
+    0x8b, 0x7e, 0x8e, 0x83, 0x64, 0x7c, 0x7d, 0x7a, 0x98, 0x7d, 0x82, 0x7c,
+    0x8f, 0x7e, 0x74, 0x86, 0xa9, 0x84, 0xba, 0x8f, 0xc7, 0x6f, 0x87, 0xae,
+    0x97, 0x91, 0xad, 0x82, 0xb2, 0x70, 0x8a, 0xa0, 0xb0, 0x7d, 0x95, 0x8d,
+    0xc2, 0x85, 0x80, 0xad, 0x9f, 0x85, 0x8b, 0x76, 0xaa, 0xab, 0x8f, 0xa0,
+    0x89, 0x9b, 0x8a, 0xb3, 0xa0, 0x72, 0xbe, 0x8c, 0x93, 0x7a, 0xa0, 0xad,
+    0x99, 0x6f, 0xa2, 0x79, 0x78, 0x8b, 0x6d, 0xae, 0x75, 0x6f, 0xa1, 0x8d,
+    0x68, 0x81, 0x74, 0xb3, 0x8f, 0x81, 0xc6, 0x96, 0x77, 0x68, 0x85, 0xaf,
+    0x86, 0x9f, 0xbb, 0x8a, 0x7e, 0x8a, 0x86, 0xab, 0x8b, 0x87, 0x94, 0x96,
+    0x99, 0x82, 0x6a, 0xaa, 0x7b, 0x81, 0xa6, 0x9b, 0xb6, 0x73, 0x78, 0x9a,
+    0x8f, 0xaa, 0x93, 0x81, 0x97, 0x7a, 0x72, 0x82, 0x79, 0x81, 0x7c, 0x88,
+    0x8e, 0x79, 0x9d, 0x81, 0x9a, 0x75, 0x9b, 0x89, 0x73, 0x6a, 0xa6, 0x84,
+    0x5c, 0x6f, 0xa0, 0x9d, 0x81, 0x84, 0x3e, 0xaf, 0x94, 0xa1, 0xb8, 0x93,
+    0x81, 0x89, 0x68, 0xd4, 0x87, 0x99, 0x99, 0x95, 0x79, 0x72, 0x81, 0xa1,
+    0x78, 0x7d, 0x8f, 0x7e, 0x87, 0x78, 0x8e, 0x97, 0x7e, 0x96, 0x86, 0x86,
+    0x97, 0x74, 0x6f, 0x7d, 0xa5, 0x81, 0x6f, 0x8e, 0x9e, 0x8b, 0xad, 0xac,
+    0xbd, 0x75, 0x84, 0xa2, 0x93, 0x76, 0xc7, 0x9e, 0xb0, 0x75, 0x89, 0xa4,
+    0x95, 0x92, 0xb5, 0xaa, 0xb9, 0x7d, 0x79, 0xa5, 0x88, 0x70, 0x84, 0x70,
+    0xa3, 0x81, 0xa1, 0xa6, 0x8f, 0x96, 0x96, 0x8d, 0xa5, 0x83, 0xb2, 0x8f,
+    0x88, 0x74, 0x96, 0xbc, 0x8b, 0x81, 0xa4, 0x85, 0x7c, 0x87, 0x64, 0xb4,
+    0x80, 0x88, 0x92, 0x90, 0x78, 0x79, 0x77, 0xa5, 0x79, 0x8b, 0xbd, 0x7d,
+    0x84, 0x8c, 0x96, 0xd4, 0x78, 0x81, 0xa4, 0x8c, 0x97, 0x89, 0x78, 0xc4,
+    0x9f, 0x94, 0xb9, 0x83, 0x76, 0x78, 0x89, 0x86, 0x81, 0x8f, 0xbd, 0xa7,
+    0x88, 0x79, 0x8e, 0x92, 0x86, 0x88, 0xad, 0x8a, 0x7b, 0x7f, 0x80, 0xad,
+    0x7a, 0xaf, 0x8a, 0x93, 0xa6, 0x84, 0x92, 0x8e, 0x84, 0x99, 0x80, 0xae,
+    0x74, 0x7c, 0x95, 0x9c, 0x7b, 0x84, 0x84, 0x84, 0xa4, 0x82, 0x57, 0xb5,
+    0x95, 0xc1, 0xb7, 0xa0, 0x85, 0x7b, 0x69, 0xc3, 0xb1, 0x8e, 0xa0, 0x8e,
+    0x81, 0x88, 0x78, 0x9e, 0x81, 0x97, 0xb2, 0x74, 0x81, 0x84, 0x91, 0x87,
+    0x6f, 0x6f, 0x75, 0x78, 0x92, 0x7a, 0x6d, 0x80, 0x9a, 0x7e, 0x81, 0xa1,
+    0xa8, 0x6d, 0xb5, 0x98, 0xb4, 0x7f, 0x9a, 0xa4, 0x9d, 0x7b, 0xba, 0xaa,
+    0xce, 0x93, 0x79, 0xa5, 0x81, 0x95, 0xa6, 0x7f, 0x8c, 0x8b, 0x96, 0xa4,
+    0xa1, 0x8d, 0x91, 0x97, 0xce, 0x8e, 0x8e, 0x9d, 0x86, 0x7f, 0x97, 0xa3,
+    0x99, 0x75, 0xa3, 0xa0, 0x69, 0x6a, 0x87, 0xa0, 0x9a, 0x80, 0xa2, 0x72,
+    0x6d, 0x85, 0x6b, 0x94, 0x8d, 0x77, 0x9f, 0x84, 0x7f, 0x92, 0x64, 0xaa,
+    0x78, 0x82, 0xa7, 0x8f, 0x84, 0x79, 0x84, 0xb9, 0x92, 0x7c, 0xb6, 0x96,
+    0x9c, 0x99, 0x8f, 0xab, 0xab, 0x8a, 0xa2, 0xab, 0x6d, 0x97, 0x7b, 0xb1,
+    0x9e, 0x6c, 0x9a, 0x99, 0xaa, 0xa3, 0x70, 0x80, 0x81, 0x6f, 0xb6, 0x95,
+    0x93, 0x93, 0x8e, 0x80, 0x86, 0xb0, 0x87, 0x91, 0x8f, 0x8c, 0xa4, 0x86,
+    0x89, 0x8f, 0x93, 0x83, 0x75, 0x7d, 0x9b, 0x86, 0x7d, 0x5a, 0x9d, 0x67,
+    0x9f, 0x78, 0x5c, 0xa5, 0x8e, 0xa2, 0xc1, 0x95, 0x89, 0x84, 0x53, 0xd1,
+    0x7d, 0x9b, 0xc0, 0x8f, 0x73, 0x7f, 0x85, 0x9e, 0x8a, 0x7b, 0xa6, 0x84,
+    0x6c, 0x74, 0x95, 0x93, 0x7a, 0x7a, 0x81, 0x7d, 0x89, 0x86, 0x76, 0x8a,
+    0xad, 0x66, 0x90, 0x90, 0x9d, 0x77, 0xb4, 0xad, 0xac, 0x8e, 0xb3, 0xa5,
+    0x9d, 0x91, 0xd7, 0x94, 0xba, 0x8b, 0x72, 0xa4, 0x93, 0x7e, 0xa7, 0x86,
+    0xae, 0x83, 0x63, 0xa6, 0xa0, 0x78, 0x81, 0x8b, 0xc4, 0x82, 0x8f, 0x98,
+    0xa1, 0x8f, 0x79, 0x9a, 0x92, 0x85, 0x9d, 0x91, 0x92, 0x84, 0x8f, 0x84,
+    0x91, 0x6d, 0x7b, 0x69, 0x75, 0x87, 0x5d, 0x99, 0x92, 0x83, 0xab, 0x8f,
+    0x53, 0x90, 0x7b, 0xa0, 0x71, 0x89, 0xc2, 0x7f, 0x6a, 0x7c, 0x86, 0xb2,
+    0x8d, 0x89, 0xaf, 0x9c, 0x81, 0x8c, 0x84, 0xbe, 0x93, 0x9c, 0xa8, 0x97,
+    0x68, 0x9b, 0x84, 0xa3, 0x8a, 0x77, 0xa5, 0x79, 0x7b, 0x87, 0x86, 0xa5,
+    0x80, 0x83, 0x9e, 0x8d, 0xb1, 0x94, 0x7a, 0x8b, 0xa6, 0xa8, 0x80, 0x98,
+    0x8c, 0x73, 0xa9, 0x7b, 0x91, 0x8f, 0x71, 0x82, 0x68, 0x84, 0xa5, 0x96,
+    0x67, 0x63, 0xa6, 0x71, 0xa7, 0x85, 0x57, 0x9f, 0x91, 0xb2, 0xa6, 0x87,
+    0x80, 0x8f, 0x6a, 0xba, 0x9d, 0xb7, 0xb9, 0x8b, 0x75, 0x7c, 0x6f, 0x9f,
+    0x74, 0x8d, 0xaf, 0x6e, 0x7c, 0x65, 0x6c, 0x8a, 0x7c, 0x81, 0x89, 0x77,
+    0x8b, 0x74, 0x65, 0x9b, 0xa5, 0x6b, 0x92, 0x71, 0xbb, 0x70, 0x99, 0xbf,
+    0xb0, 0x7b, 0x92, 0xb4, 0xa4, 0x84, 0xc4, 0x92, 0xa8, 0x94, 0x7e, 0xcd,
+    0x83, 0x87, 0xaf, 0xa0, 0xa5, 0x94, 0x72, 0xb9, 0x90, 0xa6, 0x9e, 0x9e,
+    0x9b, 0x7a, 0x68, 0xc0, 0x8f, 0x89, 0x72, 0x94, 0x9b, 0x81, 0x81, 0x91,
+    0x88, 0x90, 0xa8, 0x8d, 0x90, 0x78, 0x7c, 0x67, 0x64, 0x8e, 0x55, 0xa1,
+    0x6d, 0x86, 0xa3, 0x6f, 0x5c, 0x7d, 0x79, 0xa3, 0x64, 0x71, 0xd4, 0x87,
+    0x73, 0x85, 0x76, 0xc7, 0x72, 0x86, 0xb2, 0x8c, 0x7b, 0x8d, 0x96, 0xc3,
+    0xad, 0x87, 0xac, 0xa8, 0x84, 0x94, 0x7b, 0xbf, 0x83, 0x74, 0x8e, 0x8c,
+    0x9c, 0x99, 0x88, 0x8e, 0x86, 0x88, 0xae, 0x7f, 0x70, 0x96, 0x6f, 0x74,
+    0x8f, 0x85, 0x7c, 0x86, 0x97, 0x83, 0xa0, 0x6a, 0x8b, 0x82, 0x88, 0x90,
+    0x72, 0x84, 0x9b, 0xa1, 0x6f, 0x72, 0xa4, 0x95, 0xa6, 0x7d, 0x65, 0xbd,
+    0x90, 0xb6, 0x9e, 0x98, 0xa1, 0x94, 0x66, 0xb3, 0x9c, 0xb3, 0xa7, 0x7f,
+    0x91, 0x69, 0x6e, 0xb1, 0x68, 0x7a, 0xaa, 0x91, 0x7c, 0x71, 0x9f, 0x95,
+    0x83, 0x86, 0x76, 0x69, 0x9b, 0x7f, 0x8c, 0x94, 0x9c, 0x89, 0x86, 0x93,
+    0xc1, 0x79, 0x98, 0x9e, 0xb1, 0x90, 0x9b, 0xb7, 0xab, 0x86, 0xc6, 0xa1,
+    0xa9, 0xaa, 0x86, 0xb0, 0x8b, 0x79, 0xb9, 0x85, 0xbe, 0x92, 0x60, 0xc0,
+    0x9f, 0x9a, 0x90, 0x8d, 0xb5, 0x77, 0x95, 0xad, 0x8b, 0x93, 0x8a, 0x93,
+    0x93, 0x7e, 0x86, 0xa6, 0x7d, 0x89, 0x6b, 0x81, 0x93, 0x75, 0x7f, 0x86,
+    0x66, 0x8f, 0x56, 0x8f, 0x84, 0x75, 0x9e, 0x77, 0x78, 0x89, 0x62, 0xb3,
+    0x78, 0x76, 0xb5, 0x92, 0x7f, 0x80, 0x7a, 0xb9, 0x7d, 0x80, 0xc2, 0xb9,
+    0x7d, 0x8f, 0x8f, 0x8c, 0xa0, 0x78, 0xa2, 0xaf, 0x68, 0x98, 0x77, 0xac,
+    0x96, 0x77, 0x96, 0x99, 0x84, 0xb1, 0x72, 0x8e, 0x96, 0xa4, 0xa9, 0x8e,
+    0x84, 0x7b, 0x85, 0x8d, 0x8f, 0x83, 0x83, 0x7f, 0x85, 0x6e, 0xa4, 0x98,
+    0xab, 0x83, 0x90, 0x8e, 0x77, 0x8e, 0xab, 0x9c, 0x73, 0x79, 0x8d, 0x6e,
+    0xa0, 0x97, 0x68, 0xa7, 0x8a, 0xbd, 0x95, 0x96, 0x96, 0x8b, 0x72, 0xc7,
+    0x8d, 0x8c, 0xa5, 0x83, 0x9b, 0x8b, 0x6c, 0xac, 0x62, 0x78, 0xae, 0x78,
+    0x71, 0x7a, 0x8d, 0xae, 0x91, 0x87, 0x90, 0x82, 0x9b, 0x83, 0x90, 0x97,
+    0xb0, 0x96, 0x82, 0xa5, 0xa9, 0x76, 0xa5, 0xa0, 0xac, 0xa1, 0x93, 0x94,
+    0xb7, 0x91, 0xbb, 0x9b, 0xa4, 0xa5, 0x8c, 0xb5, 0x95, 0x7b, 0x92, 0x91,
+    0xb0, 0x97, 0x73, 0xb9, 0x86, 0xa7, 0x92, 0x98, 0x9e, 0x70, 0x77, 0xba,
+    0x96, 0x7b, 0xa6, 0x86, 0x97, 0x85, 0x8e, 0xaa, 0x93, 0x97, 0x8f, 0x8b,
+    0x8d, 0x79, 0x84, 0x7e, 0x70, 0x95, 0x52, 0x8f, 0x62, 0x75, 0x8b, 0x8b,
+    0x7b, 0x8b, 0x79, 0xaf, 0x90, 0x6d, 0xc8, 0x8d, 0x84, 0x8c, 0x72, 0xaf,
+    0x70, 0x8d, 0xa5, 0x8a, 0x76, 0x97, 0x87, 0x8e, 0xa9, 0x83, 0xb2, 0x8d,
+    0x7e, 0x9b, 0x76, 0xc2, 0xa2, 0x72, 0xc5, 0x87, 0x75, 0xb7, 0x92, 0x95,
+    0x9e, 0xa0, 0xc3, 0x82, 0x8d, 0x8f, 0x7d, 0x85, 0x90, 0x99, 0x7b, 0x82,
+    0x87, 0x87, 0xa0, 0x87, 0x9a, 0x8b, 0xa2, 0xa4, 0x67, 0x93, 0xa5, 0xbb,
+    0x73, 0x5f, 0x8c, 0x60, 0xa5, 0x7d, 0x6c, 0xb3, 0xb2, 0xb3, 0xa9, 0xa9,
+    0x8d, 0x8d, 0x67, 0xd7, 0x63, 0x99, 0xaa, 0x83, 0x88, 0x6a, 0x6f, 0x9e,
+    0x5e, 0x9e, 0x9d, 0x81, 0x84, 0x6e, 0x98, 0x90, 0x89, 0x7c, 0x95, 0x7d,
+    0x81, 0x8a, 0xa2, 0x8c, 0x92, 0x85, 0x80, 0x92, 0xac, 0x80, 0x9b, 0x9b,
+    0xc3, 0x8c, 0x95, 0xbc, 0xaa, 0x7c, 0xb5, 0x8d, 0xa1, 0xb8, 0x70, 0xb6,
+    0x8c, 0x92, 0xa8, 0x8e, 0xa3, 0x76, 0x6c, 0xbe, 0xa0, 0x8c, 0x92, 0x8e,
+    0xa1, 0x83, 0x76, 0xb2, 0x91, 0x7b, 0x8e, 0x87, 0x7f, 0x89, 0x8a, 0xa1,
+    0x91, 0xa0, 0x7a, 0x95, 0x7b, 0x86, 0x99, 0x92, 0x78, 0x8a, 0x62, 0x9e,
+    0x7b, 0x7b, 0x89, 0x79, 0x78, 0x87, 0x82, 0x94, 0x7d, 0x91, 0x96, 0x79,
+    0x7b, 0x8d, 0x80, 0xa7, 0x88, 0x95, 0xa6, 0x8f, 0x7d, 0x95, 0x79, 0xa2,
+    0x91, 0x9b, 0x9d, 0x90, 0x79, 0xa4, 0x88, 0x98, 0x9b, 0x7a, 0xa5, 0x7f,
+    0x71, 0x9c, 0x87, 0x96, 0x8c, 0x8f, 0xbc, 0x74, 0x95, 0x99, 0x7f, 0x78,
+    0x8c, 0x63, 0x7c, 0x7a, 0x92, 0x8c, 0xa8, 0x78, 0xa8, 0x89, 0x9a, 0x86,
+    0x69, 0x7e, 0xa1, 0xc3, 0x57, 0x68, 0x84, 0x89, 0xa9, 0x8d, 0x6f, 0xa9,
+    0x8a, 0xab, 0xa5, 0xad, 0x94, 0x83, 0x6b, 0xa7, 0x7e, 0x95, 0x9b, 0x7f,
+    0x8b, 0x78, 0x73, 0x90, 0x65, 0x8d, 0xb1, 0x91, 0x84, 0x65, 0x90, 0xb4,
+    0x8c, 0x89, 0x94, 0x7c, 0x99, 0x8b, 0x98, 0xb7, 0xb0, 0x91, 0x9e, 0x88,
+    0xbd, 0xa0, 0xa4, 0xb9, 0xad, 0x96, 0x97, 0xa3, 0xb6, 0x81, 0xba, 0x9b,
+    0xbc, 0xa9, 0x94, 0xb9, 0xa0, 0x85, 0x8e, 0xa1, 0xac, 0x87, 0x65, 0xa6,
+    0x98, 0x8e, 0xaa, 0xa3, 0xa3, 0x7f, 0x79, 0xb4, 0x93, 0x76, 0x90, 0x99,
+    0x8b, 0x90, 0x84, 0xa6, 0x90, 0x8f, 0x88, 0xa6, 0x89, 0x83, 0x86, 0x7a,
+    0x5d, 0x96, 0x71, 0xa5, 0x64, 0x94, 0x9a, 0x85, 0x7c, 0xa1, 0x96, 0x9d,
+    0x76, 0x8f, 0x95, 0xa0, 0x7f, 0x8c, 0x80, 0xc7, 0x6c, 0x7d, 0xb7, 0xb2,
+    0x82, 0x8e, 0x82, 0xbd, 0xb3, 0x82, 0x99, 0x9b, 0x80, 0x94, 0x8c, 0x94,
+    0x94, 0x6b, 0xc6, 0xa9, 0x81, 0x9f, 0x8c, 0x7e, 0x87, 0x88, 0xb3, 0x7d,
+    0x88, 0x8c, 0x81, 0x81, 0x7e, 0x7e, 0x86, 0x87, 0x96, 0x85, 0xb4, 0x87,
+    0xab, 0x91, 0x8f, 0xa1, 0x72, 0x83, 0xa4, 0x89, 0x6b, 0x75, 0x85, 0x7c,
+    0x94, 0x85, 0x6f, 0xad, 0x91, 0xae, 0xa4, 0xa5, 0xa7, 0x8e, 0x6c, 0xb2,
+    0x73, 0x99, 0x96, 0x92, 0x89, 0x81, 0x7d, 0x88, 0x60, 0x8d, 0x94, 0x83,
+    0x99, 0x68, 0x86, 0xa2, 0x94, 0x8e, 0x82, 0x76, 0x89, 0x8d, 0x98, 0x86,
+    0x94, 0x90, 0x83, 0x7d, 0xad, 0x94, 0xa6, 0x90, 0xcb, 0x96, 0xa2, 0xb2,
+    0xb6, 0x89, 0xc4, 0x9d, 0xc7, 0xa5, 0x75, 0xc3, 0x92, 0x8c, 0x8e, 0xad,
+    0x96, 0x94, 0x8e, 0xab, 0x94, 0x90, 0xa8, 0x84, 0xb5, 0x84, 0x66, 0xce,
+    0x74, 0x8c, 0x93, 0x8d, 0x8f, 0x95, 0x8b, 0xa1, 0x7b, 0xa1, 0x79, 0x9e,
+    0x81, 0xa4, 0xa0, 0x98, 0x5f, 0x78, 0x8e, 0x97, 0x6f, 0x81, 0x96, 0x8d,
+    0x70, 0x93, 0x72, 0x9c, 0x7b, 0x98, 0x8b, 0x8a, 0x8f, 0x8b, 0x6c, 0xa9,
+    0x81, 0x99, 0xb3, 0xa3, 0x71, 0x9c, 0x8b, 0x94, 0xa6, 0x8a, 0xb8, 0xa0,
+    0x7b, 0x98, 0x74, 0x9f, 0x92, 0x92, 0xb2, 0x89, 0x81, 0xa8, 0x87, 0x97,
+    0x96, 0x86, 0xa4, 0x7b, 0x63, 0x8e, 0x86, 0x7d, 0x76, 0x81, 0x93, 0x94,
+    0x98, 0x8b, 0xaf, 0x6d, 0xab, 0x9b, 0x85, 0x9b, 0x91, 0x86, 0x95, 0x95,
+    0x65, 0x89, 0x9e, 0x6b, 0xa4, 0x82, 0x68, 0xb5, 0x8b, 0xd1, 0x9d, 0x93,
+    0x7d, 0x67, 0x5e, 0xba, 0x9b, 0x94, 0x93, 0x8d, 0x88, 0x73, 0x7c, 0x8e,
+    0x7d, 0x83, 0x9a, 0x82, 0xa4, 0x62, 0x9a, 0x8d, 0x86, 0xa0, 0x7b, 0x72,
+    0xa9, 0x84, 0xa7, 0x94, 0xb2, 0x98, 0x8f, 0x81, 0xbe, 0x84, 0x9d, 0x94,
+    0x9c, 0x9a, 0x94, 0x8f, 0xb1, 0x82, 0xb1, 0x82, 0xb1, 0xb2, 0x78, 0xa7,
+    0x95, 0x99, 0x8b, 0x8c, 0xb1, 0x81, 0x5b, 0xbb, 0x88, 0x7a, 0x90, 0xa3,
+    0x8d, 0x78, 0x6f, 0xbf, 0x8c, 0x93, 0xa1, 0x8e, 0x9f, 0x98, 0x88, 0xb3,
+    0x7e, 0x82, 0x8a, 0x8e, 0x7d, 0x8a, 0x96, 0x6a, 0x6c, 0x7b, 0x91, 0x94,
+    0x6f, 0x89, 0x9a, 0x84, 0x73, 0x8b, 0x8c, 0x91, 0x7d, 0x8e, 0x9e, 0x80,
+    0x88, 0x81, 0x78, 0xaf, 0x86, 0xa5, 0xa2, 0x8d, 0x6a, 0x8a, 0x75, 0xa1,
+    0x83, 0x87, 0xaf, 0x7d, 0x6c, 0xa3, 0x65, 0x77, 0x89, 0x91, 0x9a, 0xa1,
+    0xa1, 0xaf, 0x78, 0x94, 0x93, 0xb2, 0xaf, 0x92, 0x74, 0x7a, 0xa7, 0x7b,
+    0x8f, 0x9c, 0x86, 0x8d, 0x8f, 0x79, 0xb0, 0xb3, 0x97, 0x82, 0x8e, 0x92,
+    0x92, 0x81, 0xa7, 0xbc, 0x6e, 0x6e, 0x89, 0xa5, 0x9a, 0x8d, 0x84, 0xb6,
+    0x83, 0xae, 0xa5, 0xa7, 0xae, 0x86, 0x6b, 0xb9, 0x89, 0xb0, 0x8f, 0x82,
+    0x8f, 0x6f, 0x83, 0x98, 0x6a, 0x98, 0x9a, 0x85, 0x9f, 0x78, 0x93, 0x8d,
+    0x83, 0x88, 0x88, 0x7e, 0x97, 0x99, 0x8a, 0x9b, 0xb0, 0x90, 0x86, 0x88,
+    0xb5, 0x90, 0xb3, 0xaa, 0xad, 0x96, 0x93, 0xa3, 0x9d, 0x81, 0xa3, 0x9a,
+    0x9f, 0x99, 0x90, 0x9c, 0x9e, 0x8e, 0x88, 0x93, 0xa8, 0x94, 0x62, 0xa6,
+    0x94, 0x92, 0xa1, 0x86, 0xb7, 0x8a, 0x6a, 0xa6, 0x81, 0x7e, 0x7b, 0x80,
+    0x89, 0x8f, 0x74, 0xa6, 0x72, 0x91, 0xa6, 0x9b, 0x73, 0x97, 0x7e, 0x6f,
+    0x70, 0x8d, 0x73, 0x98, 0x80, 0x90, 0x8f, 0x7e, 0x83, 0x77, 0x84, 0x92,
+    0x7f, 0x8c, 0x91, 0xa6, 0x99, 0x90, 0x9d, 0xb1, 0x88, 0x85, 0x89, 0x85,
+    0x7c, 0x9f, 0x7e, 0xb0, 0xaa, 0x84, 0xa0, 0x8e, 0x74, 0x93, 0x78, 0x90,
+    0x9a, 0x8b, 0x8e, 0x97, 0x8f, 0x9f, 0x7c, 0x83, 0x8a, 0x88, 0xa5, 0x8f,
+    0x8b, 0x74, 0x84, 0x9a, 0x7f, 0x91, 0x88, 0x77, 0x9c, 0x91, 0xbc, 0x93,
+    0x9c, 0x82, 0x89, 0x9b, 0x8a, 0x7d, 0xb7, 0xb8, 0x6f, 0x68, 0xb5, 0x8e,
+    0xb4, 0x86, 0x8c, 0xb3, 0x94, 0xb6, 0xa4, 0x93, 0x98, 0x8b, 0x70, 0xb3,
+    0x96, 0xaa, 0x87, 0x89, 0x99, 0x68, 0x74, 0xa4, 0x69, 0x9e, 0x8e, 0x6b,
+    0x9f, 0x6b, 0x95, 0x9c, 0x88, 0x89, 0x8a, 0x86, 0x8d, 0x75, 0x94, 0x88,
+    0xa0, 0x94, 0x77, 0x8c, 0x9c, 0x8d, 0x8e, 0xa4, 0xac, 0xa7, 0x8a, 0x9b,
+    0xa9, 0x81, 0xab, 0xac, 0xaf, 0xaf, 0x87, 0xbb, 0x9b, 0x95, 0x8e, 0x9e,
+    0x9f, 0xa1, 0x6c, 0xb4, 0x98, 0x8f, 0x81, 0x8d, 0x98, 0x8f, 0x78, 0x96,
+    0x89, 0x86, 0x6c, 0x91, 0x8d, 0x9f, 0x95, 0x9f, 0x6b, 0x7f, 0x93, 0x7c,
+    0x96, 0x8e, 0x8a, 0x58, 0x80, 0x8e, 0x7a, 0x93, 0x8b, 0x78, 0x99, 0x92,
+    0x62, 0x8e, 0x83, 0x8e, 0x87, 0x83, 0x86, 0x99, 0x93, 0x92, 0x80, 0x95,
+    0xa2, 0x72, 0xa2, 0x97, 0x78, 0x87, 0x7b, 0xa3, 0x99, 0x78, 0x98, 0x9c,
+    0x80, 0x9b, 0x5e, 0x8a, 0x9c, 0x99, 0xa6, 0x7a, 0x8e, 0x99, 0x7a, 0x8e,
+    0x8b, 0x76, 0x9b, 0x89, 0x80, 0x8e, 0x83, 0x8a, 0x80, 0x7c, 0x80, 0x74,
+    0x95, 0x8c, 0xbf, 0x7e, 0xa8, 0x7a, 0x99, 0x7d, 0x7d, 0x73, 0xb4, 0xae,
+    0x88, 0x76, 0xae, 0x78, 0xaa, 0x65, 0x94, 0xbe, 0x97, 0xaf, 0xa4, 0x91,
+    0x9c, 0x95, 0x6c, 0xbe, 0x82, 0xb1, 0x9b, 0x91, 0x85, 0x7d, 0x66, 0x9c,
+    0x99, 0xbd, 0xa3, 0x88, 0xa8, 0x73, 0x81, 0x94, 0x92, 0x8e, 0x90, 0x8d,
+    0xaf, 0x75, 0x86, 0x9b, 0x8b, 0x8b, 0x8d, 0x74, 0xbd, 0x85, 0x97, 0x8b,
+    0x9d, 0xba, 0x90, 0xa8, 0x9d, 0x72, 0xa5, 0xa8, 0xbf, 0xbb, 0x7b, 0xb6,
+    0xad, 0x94, 0x6f, 0x9a, 0xa7, 0x97, 0x78, 0x9c, 0x98, 0x8d, 0x8c, 0x93,
+    0xb8, 0xa8, 0x7f, 0x9d, 0x98, 0x7f, 0x8f, 0x8a, 0x8d, 0xa8, 0x86, 0x7b,
+    0x5d, 0x89, 0x8a, 0x83, 0x8c, 0x8b, 0x81, 0x56, 0x7c, 0x87, 0x89, 0xa6,
+    0x75, 0x7c, 0x92, 0x74, 0x96, 0x92, 0x78, 0x8d, 0x8d, 0x98, 0xae, 0x7a,
+    0x95, 0x8f, 0x8b, 0x9c, 0x95, 0x9f, 0xae, 0x93, 0x7b, 0x93, 0x8c, 0x9a,
+    0x79, 0x74, 0x94, 0x6e, 0x7e, 0x8f, 0x64, 0x9f, 0x9c, 0x88, 0x8f, 0x8e,
+    0x84, 0x8d, 0x89, 0x95, 0x96, 0x8f, 0x9d, 0x60, 0x85, 0x86, 0x7c, 0x93,
+    0x8d, 0x68, 0x83, 0x7c, 0x94, 0x87, 0xb8, 0xa2, 0x9d, 0x82, 0x8e, 0x84,
+    0x6c, 0x73, 0xa8, 0xbc, 0x84, 0x85, 0xa2, 0x79, 0x92, 0x64, 0x69, 0xa9,
+    0x82, 0xa7, 0x9d, 0x95, 0x8e, 0x6f, 0x9f, 0xa7, 0x97, 0xb1, 0x9d, 0x8e,
+    0xa1, 0x70, 0x80, 0x9e, 0x8e, 0x91, 0xa0, 0xaa, 0x81, 0x5b, 0x98, 0x8f,
+    0xa0, 0xaa, 0x83, 0x7a, 0x91, 0x7a, 0x73, 0x80, 0xa6, 0x9a, 0x80, 0x7d,
+    0x9e, 0x75, 0x7b, 0xa3, 0xad, 0x92, 0x98, 0xc0, 0xa1, 0x80, 0x88, 0xa2,
+    0xa5, 0xa4, 0x7e, 0x9b, 0xa0, 0x80, 0x6e, 0xa0, 0x9f, 0xa3, 0x8a, 0x8f,
+    0xa2, 0x93, 0x86, 0x8d, 0x8f, 0x93, 0x7e, 0x90, 0x98, 0x83, 0x7d, 0x9b,
+    0x9f, 0x9a, 0x97, 0x83, 0x6e, 0x8d, 0x94, 0x6c, 0x7b, 0x7f, 0x73, 0x65,
+    0x6a, 0x93, 0x8a, 0x94, 0x83, 0x89, 0x7d, 0x7b, 0x77, 0x8a, 0x7a, 0x9b,
+    0x8e, 0x8d, 0x94, 0x89, 0x86, 0x83, 0x7c, 0x8e, 0x8b, 0x90, 0xab, 0x99,
+    0x81, 0x8e, 0x77, 0x9c, 0x8c, 0x82, 0x97, 0x8f, 0x78, 0x91, 0x5f, 0xa1,
+    0x8b, 0x83, 0xa9, 0x8d, 0x7b, 0x97, 0x77, 0x80, 0x84, 0x7e, 0x9e, 0x75,
+    0xa3, 0x86, 0x67, 0x7c, 0x80, 0x6d, 0x77, 0x75, 0x88, 0x75, 0xad, 0x7a,
+    0x93, 0x89, 0x8c, 0x87, 0x7a, 0x79, 0xb2, 0xa1, 0x69, 0x80, 0xb5, 0x7a,
+    0xa6, 0x7b, 0x95, 0xac, 0x95, 0xa9, 0x98, 0xa4, 0xad, 0x83, 0x8d, 0xbe,
+    0xa4, 0x98, 0xad, 0x7d, 0x8b, 0x65, 0x65, 0xad, 0x6a, 0xae, 0xa3, 0xa8,
+    0x9c, 0x63, 0x90, 0x91, 0x6d, 0x9a, 0x81, 0x98, 0x86, 0x6a, 0x83, 0x84,
+    0x94, 0x9c, 0x77, 0x86, 0xc2, 0x7f, 0x9b, 0xa9, 0xad, 0xae, 0xa7, 0xa6,
+    0xd4, 0x70, 0x9d, 0xb5, 0xaa, 0xdb, 0x8f, 0xa3, 0xa5, 0x87, 0x88, 0x9e,
+    0xa9, 0x9f, 0x62, 0xa7, 0xa2, 0x8e, 0x7d, 0x8a, 0x9d, 0xa2, 0x6b, 0xa7,
+    0x96, 0x6d, 0x76, 0x8c, 0x9b, 0x8c, 0x86, 0x86, 0x93, 0x7c, 0x9d, 0x7c,
+    0x7e, 0x93, 0x5c, 0x79, 0x76, 0x8c, 0x8a, 0x87, 0x79, 0x97, 0x9a, 0x7a,
+    0x85, 0x8c, 0x7f, 0x85, 0x7a, 0xa1, 0xa7, 0x72, 0x87, 0x7f, 0x96, 0x9e,
+    0x92, 0x92, 0x9e, 0xa0, 0x72, 0x99, 0x7a, 0xb0, 0x8c, 0x8d, 0xa3, 0x9b,
+    0x91, 0xa6, 0x63, 0x94, 0x8b, 0x81, 0xbb, 0x94, 0x79, 0x95, 0x99, 0x9a,
+    0xa0, 0x7a, 0x96, 0x72, 0x82, 0x9a, 0x83, 0x7f, 0x72, 0x7f, 0x6d, 0x75,
+    0x91, 0x7f, 0xbc, 0x84, 0x9a, 0x81, 0x95, 0x69, 0x7d, 0x6d, 0xa2, 0xa8,
+    0x7e, 0x64, 0xac, 0x86, 0x85, 0x6d, 0x99, 0xaa, 0x7e, 0x79, 0x9c, 0xa0,
+    0xa4, 0x77, 0x99, 0xac, 0xa8, 0x8d, 0xb7, 0xa2, 0xa3, 0x61, 0x82, 0x98,
+    0x84, 0x8e, 0xa1, 0x8c, 0x88, 0x82, 0x6f, 0x7d, 0x88, 0x80, 0x7a, 0x8a,
+    0x8c, 0x6d, 0x87, 0x6f, 0xab, 0x8f, 0x8b, 0x76, 0xa0, 0x7d, 0x9f, 0xab,
+    0xb0, 0xb8, 0x9c, 0x8d, 0xb8, 0x81, 0x89, 0x94, 0xa8, 0xc8, 0x92, 0x9b,
+    0x8d, 0x83, 0x7b, 0xaf, 0x97, 0x94, 0x6e, 0xa5, 0x9b, 0x97, 0x89, 0x8d,
+    0xaa, 0x8a, 0x66, 0x88, 0x93, 0x84, 0xa1, 0x88, 0xa0, 0x99, 0x85, 0x89,
+    0x7d, 0x84, 0x8b, 0x6a, 0x92, 0xa1, 0x74, 0x76, 0x73, 0x87, 0x7a, 0x9a,
+    0x77, 0x86, 0x89, 0x5f, 0x7f, 0x8b, 0x7f, 0x8d, 0x7e, 0x81, 0x95, 0x8a,
+    0x7d, 0x85, 0x74, 0x9a, 0x87, 0x8c, 0x9e, 0xae, 0x80, 0x88, 0x7d, 0x8b,
+    0xaa, 0x79, 0x7c, 0x97, 0x79, 0x90, 0x7b, 0x97, 0x97, 0x9f, 0xa1, 0xa2,
+    0xab, 0x97, 0x69, 0x7a, 0x8d, 0x9f, 0x9f, 0x89, 0x90, 0x8c, 0x66, 0x98,
+    0x6e, 0x86, 0x7b, 0x6e, 0x86, 0x8a, 0xb2, 0xa6, 0x93, 0x7d, 0x8c, 0x81,
+    0x7e, 0x84, 0xa6, 0xb6, 0x83, 0x92, 0xa0, 0x88, 0x90, 0x5f, 0x7c, 0x92,
+    0x98, 0x94, 0x92, 0x98, 0xa7, 0x65, 0x90, 0xa2, 0xa2, 0x9b, 0xa6, 0x7d,
+    0x8b, 0x5a, 0x94, 0x95, 0x9b, 0xa5, 0x99, 0xa5, 0x7e, 0x61, 0x9a, 0x7a,
+    0x8b, 0x77, 0x87, 0x76, 0x9d, 0x72, 0x9a, 0x84, 0x98, 0x94, 0x92, 0x73,
+    0xae, 0x78, 0x8e, 0xaa, 0xa0, 0xc3, 0x7a, 0xa4, 0xa0, 0x75, 0xa9, 0xae,
+    0x8c, 0xd6, 0x87, 0x8f, 0x9f, 0x8c, 0x9b, 0x90, 0x99, 0x97, 0x73, 0x8f,
+    0x9b, 0x9c, 0x8c, 0x89, 0xa5, 0x84, 0x8f, 0x7b, 0x8b, 0x7f, 0x97, 0x98,
+    0x8d, 0x7b, 0x94, 0x9d, 0x9c, 0x8e, 0x92, 0x89, 0x88, 0x8d, 0x6c, 0x63,
+    0x73, 0x81, 0x72, 0x8a, 0x88, 0x8a, 0x9f, 0x79, 0x81, 0x82, 0x9a, 0xa9,
+    0x7a, 0x92, 0x7d, 0x76, 0x7b, 0x7a, 0x6a, 0xbe, 0x91, 0x7d, 0x86, 0xad,
+    0x84, 0x86, 0x6c, 0x91, 0x91, 0x9f, 0x92, 0x6b, 0x95, 0x98, 0x84, 0xa0,
+    0x8f, 0x8b, 0x9e, 0x7f, 0x9f, 0x97, 0x7e, 0x87, 0x80, 0x9e, 0x79, 0x8d,
+    0x68, 0x87, 0x88, 0x7d, 0x89, 0x81, 0x6d, 0x85, 0x80, 0x82, 0xa0, 0x97,
+    0xa3, 0x72, 0x94, 0x74, 0x8e, 0x56, 0x96, 0x98, 0x91, 0x6f, 0xa0, 0xae,
+    0x7c, 0x6e, 0x8e, 0xa9, 0x7c, 0x80, 0x87, 0xa3, 0x9e, 0x57, 0x8e, 0xb5,
+    0x87, 0xa6, 0x87, 0x79, 0x8f, 0x55, 0x8a, 0x81, 0x97, 0x6c, 0x9b, 0x99,
+    0x78, 0x5c, 0x82, 0x80, 0x91, 0x76, 0x80, 0x91, 0x8b, 0x65, 0x89, 0x7d,
+    0xa9, 0x95, 0x89, 0x97, 0x96, 0x6a, 0x89, 0xad, 0x92, 0x9f, 0xb6, 0x82,
+    0x88, 0x79, 0x9d, 0xa5, 0x9c, 0xae, 0x9a, 0x93, 0x77, 0x8e, 0x8a, 0xb5,
+    0x84, 0xb0, 0x76, 0xa2, 0x89, 0xa0, 0x96, 0x7a, 0xa5, 0x8e, 0x7e, 0x74,
+    0x8d, 0x89, 0x89, 0x9e, 0x93, 0x95, 0x90, 0x78, 0x93, 0x8f, 0xa5, 0x7c,
+    0x9d, 0x7c, 0x77, 0x85, 0x81, 0x92, 0x7c, 0x87, 0x92, 0x82, 0x98, 0xa3,
+    0x63, 0x76, 0x9b, 0x91, 0x7b, 0x8e, 0x97, 0x7e, 0x66, 0x90, 0x63, 0xb4,
+    0x71, 0x88, 0x86, 0x8e, 0x6f, 0x89, 0x7a, 0x88, 0x93, 0x7f, 0x96, 0xa8,
+    0x7d, 0x88, 0x88, 0x86, 0x7b, 0x91, 0x88, 0x6b, 0xa6, 0x8b, 0x69, 0x78,
+    0x82, 0x80, 0x83, 0x6b, 0xaf, 0x81, 0x7b, 0x64, 0x8f, 0x78, 0x6e, 0x7f,
+    0x86, 0x91, 0x92, 0xa3, 0xa0, 0x97, 0x82, 0x88, 0x92, 0x90, 0x9e, 0x89,
+    0x9d, 0x7b, 0x96, 0x82, 0xa3, 0x8c, 0x7f, 0x84, 0x7a, 0x6c, 0x60, 0x85,
+    0xa9, 0x74, 0x83, 0xa2, 0x89, 0x87, 0x9b, 0x77, 0x9b, 0x9a, 0x99, 0x84,
+    0x7c, 0x9c, 0x8d, 0x90, 0x8d, 0x7b, 0x74, 0x77, 0x93, 0x8c, 0x6c, 0x8b,
+    0x85, 0x78, 0x7f, 0x7d, 0x75, 0x7f, 0x7e, 0x85, 0x8f, 0x7d, 0x62, 0x8c,
+    0x7c, 0xad, 0x7f, 0x83, 0xa1, 0xa1, 0x97, 0x7b, 0x72, 0x82, 0x9d, 0x81,
+    0x94, 0x81, 0x8d, 0x9f, 0x6f, 0x8f, 0x9d, 0x89, 0x6a, 0x7e, 0x7f, 0x7f,
+    0x8d, 0x7e, 0x91, 0x86, 0x7d, 0x8a, 0x7e, 0x70, 0x7b, 0x9b, 0x6e, 0x5f,
+    0xa8, 0x7a, 0x73, 0x8a, 0x7a, 0x71, 0x90, 0x95, 0x8d, 0x78, 0x7b, 0x72,
+    0x5e, 0x89, 0x62, 0xa1, 0x87, 0x7f, 0x83, 0x75, 0x98, 0x7f, 0x76, 0x72,
+    0x8f, 0x9b, 0x7a, 0x8b, 0xa1, 0x7f, 0x60, 0x99, 0x96, 0x6e, 0x67, 0x76,
+    0x88, 0x98, 0x6c, 0x7b, 0x9b, 0x8d, 0x5f, 0x89, 0x7c, 0x81, 0x79, 0x86,
+    0x69, 0x9e, 0x83, 0x65, 0x8e, 0x82, 0x83, 0x89, 0x85, 0x7f, 0x90, 0x80,
+    0xa2, 0x81, 0x85, 0x83, 0x8e, 0x94, 0x94, 0x75, 0x86, 0x87, 0x9a, 0xb2,
+    0x82, 0x99, 0x85, 0x7f, 0x8c, 0x7e, 0x81, 0x9a, 0x81, 0x7d, 0x87, 0x81,
+    0xa3, 0x8c, 0x8d, 0x85, 0x8d, 0x96, 0x86, 0x7c, 0xa7, 0x87, 0x7e, 0x9d,
+    0x63, 0xa8, 0x7c, 0x97, 0xa2, 0xa4, 0x7e, 0x87, 0x93, 0x9e, 0x89, 0x8d,
+    0x6b, 0x6d, 0x9d, 0x9b, 0x78, 0x8a, 0x8e, 0x7f, 0x7b, 0xa5, 0x6e, 0x8c,
+    0x89, 0x88, 0x73, 0x7e, 0x77, 0x9d, 0xa6, 0xa7, 0x77, 0x87, 0x7e, 0x7e,
+    0x97, 0x84, 0x6b, 0x59, 0x60, 0x90, 0x85, 0x76, 0x8f, 0x61, 0x7f, 0x94,
+    0x8f, 0x84, 0x8b, 0x7f, 0x73, 0x77, 0x73, 0x71, 0x8a, 0x9b, 0x7b, 0x89,
+    0x97, 0x8f, 0x76, 0x63, 0xa3, 0xa1, 0x6b, 0x7c, 0x62, 0x95, 0x8e, 0xa3,
+    0x9f, 0x89, 0x8f, 0x7f, 0x92, 0x7c, 0xa2, 0xa4, 0xa6, 0x92, 0x89, 0x93,
+    0x74, 0x73, 0x73, 0x96, 0xad, 0x9b, 0x87, 0xac, 0x91, 0x8a, 0xa0, 0x70,
+    0x70, 0x7e, 0x8f, 0x74, 0x75, 0xaf, 0x8d, 0x82, 0x8e, 0x82, 0x96, 0x7d,
+    0x69, 0x9c, 0x64, 0xa2, 0x82, 0x89, 0x83, 0x9d, 0x83, 0x88, 0x62, 0x92,
+    0x72, 0x89, 0x6d, 0x7f, 0x92, 0x70, 0x8e, 0x80, 0x7e, 0x8d, 0x91, 0x85,
+    0x8d, 0x89, 0x83, 0x96, 0x90, 0x96, 0x9c, 0xa6, 0x8a, 0x73, 0x89, 0x79,
+    0xa9, 0x70, 0x80, 0x78, 0x96, 0x80, 0x7b, 0x85, 0xa5, 0x80, 0x93, 0x95,
+    0xc5, 0x74, 0x81, 0x88, 0xa2, 0x93, 0x86, 0x9c, 0xa3, 0x6d, 0x92, 0x8a,
+    0x92, 0x99, 0x98, 0x65, 0xad, 0x63, 0x9d, 0x95, 0x99, 0x89, 0x7f, 0x7a,
+    0x99, 0x91, 0x7f, 0x78, 0x90, 0x8f, 0x80, 0x85, 0xa1, 0x68, 0x9d, 0x6c,
+    0x83, 0x8f, 0x7c, 0x5e, 0x99, 0x7b, 0x80, 0x91, 0x66, 0x8a, 0x92, 0xb3,
+    0x7a, 0x99, 0x91, 0x7e, 0x7d, 0x96, 0x69, 0x9e, 0x7c, 0x89, 0xad, 0x8f,
+    0x9d, 0x90, 0x85, 0x8e, 0x72, 0xa9, 0x89, 0x83, 0x7c, 0x82, 0x70, 0x82,
+    0x6b, 0x79, 0x75, 0x8d, 0x77, 0x9b, 0x7c, 0x8f, 0x8a, 0x95, 0x87, 0x9f,
+    0x7c, 0x90, 0x87, 0x70, 0x83, 0x83, 0x98, 0x9f, 0x85, 0x86, 0x8d, 0x81,
+    0x87, 0x87, 0x87, 0x9d, 0x8f, 0x9d, 0x7c, 0x98, 0xa2, 0xac, 0x88, 0x93,
+    0x88, 0x7d, 0x9b, 0x76, 0x82, 0x67, 0x69, 0x7f, 0x8c, 0x8d, 0x94, 0x7d,
+    0x7b, 0xae, 0x8c, 0x85, 0x8b, 0xa7, 0x8c, 0x87, 0x96, 0x7d, 0x8b, 0x90,
+    0x90, 0x7c, 0x92, 0xa8, 0x81, 0x87, 0xa4, 0xa4, 0x82, 0x8b, 0x8d, 0x89,
+    0x8f, 0x70, 0x9d, 0x7f, 0xa0, 0x84, 0x99, 0x65, 0x99, 0x78, 0x94, 0x8b,
+    0xc5, 0x8d, 0x8d, 0x55, 0xb3, 0x8d, 0x78, 0x93, 0xb4, 0x6d, 0x84, 0x90,
+    0xd5, 0x76, 0x7a, 0x9e, 0xc8, 0x8f, 0x86, 0x8a, 0xaa, 0x8b, 0x7f, 0x90,
+    0xaa, 0x95, 0x9c, 0x81, 0xb4, 0x6b, 0x64, 0x8a, 0x99, 0x84, 0x74, 0x6e,
+    0x95, 0x75, 0x98, 0x92, 0x9a, 0x91, 0x8c, 0x7d, 0x88, 0x6e, 0x89, 0x7d,
+    0x87, 0x80, 0x8e, 0x86, 0x78, 0x9f, 0x96, 0x75, 0x76, 0x82, 0x84, 0xaf,
+    0x8a, 0xb3, 0x93, 0x97, 0x86, 0x7c, 0x7e, 0x96, 0x7c, 0x6d, 0x90, 0x8e,
+    0x85, 0x88, 0x8a, 0x9f, 0x70, 0x89, 0x9f, 0x99, 0x95, 0x87, 0x91, 0x9d,
+    0x80, 0x74, 0x88, 0x7c, 0x7f, 0xa8, 0x93, 0x77, 0x66, 0xa6, 0x80, 0xa2,
+    0x88, 0xa0, 0xaf, 0x6f, 0x76, 0x70, 0x82, 0x9a, 0x73, 0x89, 0x9a, 0x75,
+    0x75, 0x8e, 0x5f, 0x85, 0x6a, 0x76, 0x98, 0x66, 0x87, 0xa3, 0x7a, 0x73,
+    0x9d, 0xa1, 0x98, 0x8e, 0x78, 0x91, 0x83, 0x8c, 0x82, 0x9e, 0x90, 0x87,
+    0x8f, 0x9b, 0x8b, 0x8f, 0x89, 0x62, 0x74, 0x82, 0x7b, 0x7f, 0x8a, 0x9d,
+    0x89, 0x93, 0x8c, 0x7a, 0x99, 0x77, 0xac, 0x75, 0x9b, 0x7f, 0x7f, 0x56,
+    0x8c, 0x96, 0x70, 0x79, 0xc2, 0x7d, 0x90, 0x64, 0xe9, 0x79, 0x68, 0xb2,
+    0xc2, 0xa6, 0xa7, 0x7e, 0xd9, 0x98, 0x79, 0x87, 0xc0, 0x97, 0x87, 0x66,
+    0xd0, 0x9f, 0x92, 0x82, 0xa4, 0xa8, 0x8d, 0x78, 0xa6, 0xa1, 0x76, 0x7d,
+    0xa4, 0x87, 0x89, 0x51, 0xae, 0x88, 0x5b, 0x76, 0x7d, 0x70, 0x74, 0x93,
+    0x89, 0x74, 0x9e, 0x7a, 0x79, 0x64, 0x9a, 0x94, 0x65, 0x93, 0xb0, 0x8d,
+    0x88, 0x7e, 0x8e, 0xa5, 0x63, 0x94, 0x94, 0x7d, 0x91, 0x87, 0x84, 0x95,
+    0x75, 0x9e, 0x81, 0x99, 0x65, 0x76, 0x82, 0x9c, 0x6a, 0xab, 0x84, 0x85,
+    0x88, 0x72, 0x92, 0x83, 0x82, 0xaf, 0x6d, 0x9d, 0x9e, 0x73, 0x98, 0x7f,
+    0x91, 0xb4, 0x62, 0x8d, 0x74, 0x6e, 0xb4, 0x94, 0x97, 0x9e, 0x6f, 0x9a,
+    0x83, 0x7b, 0xa9, 0x7d, 0x87, 0x97, 0x60, 0xa9, 0x7a, 0x75, 0xad, 0x6c,
+    0x77, 0xa4, 0x88, 0x82, 0x6f, 0x8a, 0x83, 0x74, 0x9a, 0xa7, 0x83, 0x91,
+    0x7c, 0x7c, 0x78, 0x77, 0x83, 0x92, 0x7a, 0x83, 0x90, 0x6f, 0x79, 0x6b,
+    0x9b, 0x8d, 0x99, 0x95, 0x7b, 0x89, 0x8e, 0x6c, 0x8e, 0x6c, 0x9b, 0x91,
+    0x97, 0x80, 0x83, 0x6f, 0xaa, 0x91, 0x66, 0x76, 0xc9, 0x77, 0x82, 0x4d,
+    0xd7, 0x5f, 0x58, 0x9a, 0xb1, 0x7a, 0xb1, 0x6b, 0xe5, 0x9d, 0x76, 0x89,
+    0xb6, 0x94, 0x90, 0x5b, 0xb8, 0x92, 0x7d, 0x90, 0xbd, 0x9a, 0x85, 0x4e,
+    0xb4, 0x84, 0x61, 0x82, 0x94, 0x8e, 0x70, 0x57, 0x90, 0x89, 0x6f, 0x60,
+    0x78, 0x90, 0x78, 0x85, 0x8e, 0x7c, 0x76, 0x74, 0x71, 0x5d, 0x94, 0x93,
+    0x71, 0x8f, 0xc2, 0x80, 0x75, 0x7d, 0x77, 0xa8, 0x70, 0x8f, 0xa6, 0x83,
+    0x74, 0x6b, 0x79, 0x97, 0x76, 0xa2, 0xad, 0x93, 0x5b, 0x8c, 0x7c, 0x7e,
+    0x82, 0x9b, 0xa0, 0x76, 0x71, 0x7a, 0xa3, 0x80, 0x87, 0x90, 0x92, 0xa6,
+    0x85, 0x71, 0x99, 0x91, 0x91, 0x8c, 0x99, 0x9b, 0x92, 0x74, 0xb2, 0x79,
+    0x9c, 0x7c, 0x7b, 0xa8, 0x8c, 0x6f, 0xb5, 0x69, 0x7a, 0x8a, 0x68, 0x9f,
+    0x82, 0x7d, 0xbd, 0x5f, 0xa1, 0x92, 0x83, 0x9f, 0x6f, 0xa1, 0x88, 0x61,
+    0x7b, 0x94, 0x89, 0x83, 0x6f, 0x6e, 0x92, 0x9d, 0x65, 0x7f, 0x97, 0x83,
+    0x87, 0x75, 0x92, 0x8a, 0x82, 0x82, 0x79, 0x92, 0x78, 0x89, 0x92, 0x7a,
+    0x91, 0x64, 0x8a, 0x93, 0x9d, 0x74, 0x78, 0x64, 0xab, 0x57, 0x7a, 0x84,
+    0xcf, 0x7d, 0x95, 0x4f, 0xde, 0x63, 0x78, 0x9a, 0xb7, 0x7a, 0x8b, 0x5b,
+    0xda, 0xa3, 0x94, 0x99, 0xbd, 0x88, 0xa4, 0x53, 0xad, 0x8b, 0x81, 0x96,
+    0xca, 0x8f, 0x76, 0x5e, 0xbd, 0x9d, 0x70, 0x81, 0x9b, 0x7d, 0x8a, 0x44,
+    0xa0, 0x77, 0x52, 0x6e, 0x82, 0x62, 0x6a, 0x6b, 0x9d, 0xaa, 0x81, 0x85,
+    0x7d, 0x5f, 0x7f, 0x9c, 0x65, 0x99, 0x97, 0x81, 0x7f, 0x65, 0x65, 0xa4,
+    0x84, 0x8c, 0xa1, 0x6d, 0x7a, 0x70, 0x79, 0x90, 0x98, 0xaa, 0x76, 0x95,
+    0x7f, 0x91, 0x95, 0x96, 0x6e, 0xa5, 0x95, 0xa2, 0x7d, 0x7e, 0x93, 0x87,
+    0x7d, 0x9b, 0x85, 0x9b, 0x85, 0x79, 0x96, 0x6b, 0x9d, 0x9d, 0x61, 0x99,
+    0x9c, 0x74, 0xcc, 0x7e, 0x9a, 0x83, 0x83, 0x98, 0x6f, 0x6d, 0xc5, 0x69,
+    0xb0, 0xa5, 0x5c, 0x91, 0x6c, 0x7b, 0xcc, 0x72, 0x9a, 0x9d, 0x7e, 0xa3,
+    0x8a, 0x96, 0x8e, 0x74, 0x7b, 0x80, 0x6b, 0x85, 0x84, 0x56, 0x92, 0x83,
+    0x64, 0x90, 0x86, 0x86, 0x88, 0x79, 0x8b, 0xa0, 0x86, 0x72, 0xab, 0x95,
+    0x80, 0x81, 0x96, 0x8f, 0x75, 0x7f, 0x71, 0x92, 0x9e, 0x75, 0x62, 0x5e,
+    0xc3, 0x7a, 0x6c, 0x84, 0xba, 0x81, 0x8f, 0x49, 0xc9, 0x76, 0x54, 0x89,
+    0xc2, 0x8c, 0xa2, 0x54, 0xd8, 0xa4, 0x72, 0x90, 0xb1, 0x91, 0xa0, 0x7a,
+    0xbf, 0x9a, 0x6f, 0x82, 0xbb, 0x81, 0x6a, 0x52, 0xc2, 0x82, 0x52, 0x65,
+    0x8d, 0x8a, 0x84, 0x46, 0xa2, 0x90, 0x45, 0x52, 0x82, 0x61, 0x8c, 0x77,
+    0x92, 0x6d, 0x87, 0x5b, 0x5e, 0x72, 0x76, 0x97, 0x73, 0x8d, 0x8d, 0x70,
+    0x7a, 0x66, 0x76, 0x89, 0x72, 0xbf, 0xb0, 0x84, 0x7d, 0x80, 0x71, 0x8f,
+    0x85, 0xa9, 0xa3, 0x7d, 0x7b, 0x84, 0x83, 0xa1, 0x97, 0xa7, 0xaf, 0x84,
+    0x86, 0x7d, 0x94, 0x78, 0x80, 0x98, 0x71, 0x84, 0x94, 0x73, 0xb0, 0x74,
+    0x99, 0xa2, 0x68, 0xa7, 0x8b, 0x86, 0xe0, 0x75, 0x9e, 0x93, 0x5c, 0xb2,
+    0xa2, 0x68, 0xb8, 0x61, 0x92, 0xa3, 0x68, 0xa4, 0x89, 0x59, 0xd0, 0x77,
+    0x97, 0xa9, 0x6a, 0x9b, 0x7d, 0x69, 0x9b, 0x79, 0x8c, 0x7c, 0x68, 0x8b,
+    0x7a, 0x53, 0x99, 0x9c, 0x7e, 0x8d, 0x89, 0x96, 0x9e, 0x83, 0x89, 0x74,
+    0x7f, 0x94, 0x92, 0x8f, 0x85, 0x8a, 0x8a, 0x80, 0x99, 0x87, 0x7a, 0x7d,
+    0xac, 0x93, 0x74, 0x68, 0xba, 0x87, 0x6a, 0x98, 0xc7, 0x79, 0x91, 0x54,
+    0xeb, 0x80, 0x45, 0x80, 0xc4, 0xb4, 0x94, 0x61, 0xd2, 0xa6, 0x7b, 0x95,
+    0xa4, 0xaa, 0x93, 0x7b, 0xb1, 0x74, 0x53, 0x7c, 0xaa, 0x91, 0x64, 0x51,
+    0xa9, 0x6e, 0x5e, 0x7c, 0x79, 0x82, 0x8b, 0x2e, 0x9d, 0x66, 0x61, 0x5e,
+    0x72, 0x7f, 0x6e, 0x6d, 0x8c, 0x79, 0x7d, 0x60, 0x76, 0x79, 0x68, 0x84,
+    0x4d, 0x8e, 0xa8, 0x8f, 0x78, 0x74, 0x69, 0xa4, 0x6e, 0xa9, 0xb9, 0x59,
+    0x83, 0x7f, 0x7a, 0x93, 0x90, 0x9b, 0x8d, 0x93, 0x78, 0x80, 0x77, 0x8b,
+    0x72, 0xa3, 0x97, 0x73, 0x91, 0x6c, 0x9a, 0x97, 0xa3, 0xad, 0x89, 0x96,
+    0x9e, 0x6d, 0xb5, 0x7c, 0xa4, 0x98, 0x61, 0x8a, 0x93, 0x5f, 0xdc, 0x63,
+    0xba, 0x92, 0x84, 0x94, 0xab, 0x6f, 0xbf, 0x66, 0x98, 0x93, 0x74, 0x85,
+    0x96, 0x63, 0xb8, 0x60, 0x94, 0xbb, 0x79, 0x94, 0x7b, 0x67, 0x8a, 0x64,
+    0x99, 0xac, 0x60, 0x98, 0xb0, 0x65, 0xa2, 0x73, 0x8f, 0x94, 0x8c, 0x92,
+    0x84, 0x84, 0x9b, 0x8f, 0x84, 0x8d, 0x9f, 0x90, 0x91, 0x85, 0x93, 0x74,
+    0x97, 0x66, 0x7f, 0x78, 0xa2, 0x95, 0x73, 0x6b, 0xc5, 0x6f, 0x62, 0x79,
+    0xbd, 0x81, 0x89, 0x4a, 0xbd, 0x93, 0x57, 0x81, 0xba, 0xb0, 0x9b, 0x4c,
+    0xe8, 0xa2, 0x85, 0xa2, 0x96, 0x92, 0x93, 0x62, 0xbe, 0x7a, 0x71, 0x8b,
+    0x8d, 0x97, 0x53, 0x56, 0xb1, 0x5f, 0x67, 0x60, 0x7a, 0x8e, 0x8a, 0x3a,
+    0x86, 0x67, 0x6d, 0x53, 0x6e, 0x91, 0x7b, 0x60, 0x99, 0x6d, 0x71, 0x5d,
+    0x67, 0x65, 0x63, 0x87, 0x71, 0x8a, 0x92, 0x6d, 0x8f, 0x6f, 0x6f, 0xae,
+    0x6c, 0xa2, 0x87, 0x6f, 0x99, 0x88, 0x78, 0x94, 0x8a, 0xb2, 0x93, 0x89,
+    0x90, 0x8d, 0x8c, 0x98, 0x81, 0x86, 0x90, 0x6d, 0xa2, 0x82, 0xa2, 0xa3,
+    0x9d, 0x8f, 0x7a, 0x9f, 0x87, 0x70, 0xbd, 0x8e, 0xa5, 0x99, 0x5d, 0x70,
+    0x8c, 0x60, 0xc7, 0x78, 0x97, 0xb0, 0x6f, 0x94, 0x92, 0x5a, 0xc3, 0x6e,
+    0x8b, 0x9f, 0x79, 0xa3, 0x8c, 0x5e, 0xbf, 0x79, 0x8e, 0x98, 0x76, 0x8e,
+    0x67, 0x31, 0x9b, 0x85, 0x8e, 0x85, 0x71, 0x99, 0x72, 0x77, 0x84, 0x81,
+    0x91, 0x95, 0x80, 0x98, 0x82, 0x6f, 0x90, 0xa0, 0x91, 0x91, 0x8e, 0x75,
+    0x8a, 0x89, 0x93, 0x69, 0x95, 0x7f, 0x9a, 0xa0, 0x9e, 0x9b, 0x88, 0x4e,
+    0xc3, 0x8d, 0x65, 0x74, 0xba, 0x8d, 0x97, 0x4d, 0xd6, 0x94, 0x73, 0xa0,
+    0xb1, 0xb3, 0x8c, 0x67, 0xdd, 0x9f, 0x7f, 0xaa, 0xaf, 0x9a, 0x88, 0x67,
+    0xc2, 0x8f, 0x71, 0x7b, 0x8f, 0x9f, 0x47, 0x52, 0x93, 0x72, 0x5a, 0x52,
+    0x97, 0x9d, 0x67, 0x3c, 0xa9, 0x59, 0x59, 0x5b, 0x88, 0x92, 0x82, 0x57,
+    0x83, 0x67, 0x94, 0x77, 0x52, 0x74, 0x60, 0x9e, 0x52, 0x84, 0xa2, 0x69,
+    0x71, 0x96, 0x73, 0xb0, 0x5e, 0xb0, 0x89, 0x71, 0x94, 0x8a, 0x66, 0xa0,
+    0x75, 0xc1, 0x99, 0x8e, 0x83, 0x8a, 0x91, 0x89, 0x6b, 0xa5, 0x79, 0x82,
+    0x8b, 0x73, 0x95, 0xb0, 0x77, 0x9b, 0x82, 0x7d, 0x8f, 0x60, 0xb9, 0x78,
+    0x8b, 0x8f, 0x7b, 0x74, 0x84, 0x6d, 0xbf, 0x76, 0x8f, 0xa3, 0x91, 0xa1,
+    0x81, 0x59, 0xcb, 0x69, 0xac, 0x90, 0x98, 0x92, 0xa7, 0x5d, 0xb4, 0x8b,
+    0xaa, 0xb1, 0x98, 0x8c, 0xa2, 0x4d, 0xa1, 0x69, 0x7f, 0xa0, 0x7d, 0x8a,
+    0x9b, 0x77, 0x8e, 0x71, 0x82, 0x8a, 0x78, 0x8d, 0x98, 0x78, 0x90, 0x91,
+    0x7e, 0x7f, 0x78, 0x85, 0x97, 0x8a, 0x97, 0x6d, 0xb3, 0x94, 0x89, 0xa3,
+    0xa5, 0x9a, 0x76, 0x6b, 0xbd, 0x79, 0x71, 0x95, 0xce, 0xab, 0x93, 0x1f,
+    0xe9, 0x97, 0x4c, 0x84, 0xd5, 0x9f, 0x98, 0x6e, 0xdd, 0x8d, 0x80, 0x9c,
+    0xa8, 0x9e, 0x8d, 0x75, 0xbc, 0x8c, 0x80, 0x89, 0xa1, 0x89, 0x74, 0x58,
+    0x92, 0x86, 0x55, 0x87, 0x91, 0x8d, 0x70, 0x33, 0xb8, 0x50, 0x63, 0x6b,
+    0x79, 0x99, 0x76, 0x71, 0x75, 0x59, 0x73, 0x6b, 0x62, 0x62, 0x74, 0x85,
+    0x73, 0xa3, 0xac, 0x78, 0x77, 0x88, 0x64, 0xa0, 0x73, 0xa1, 0xa8, 0x73,
+    0x91, 0x8e, 0x5f, 0x9a, 0x68, 0xc9, 0xa1, 0x92, 0x7a, 0x7c, 0x69, 0x77,
+    0x7d, 0x9e, 0x8f, 0x76, 0x88, 0x80, 0x92, 0x93, 0x91, 0x99, 0x8c, 0x85,
+    0x9f, 0x69, 0xa8, 0x9b, 0x9f, 0x9a, 0x64, 0x7a, 0x99, 0x70, 0xc4, 0x6d,
+    0x9a, 0x99, 0x82, 0xa0, 0x8b, 0x59, 0xc8, 0x61, 0x8f, 0x95, 0x72, 0x8c,
+    0x90, 0x63, 0xa9, 0x7e, 0x88, 0x8c, 0x85, 0x78, 0x76, 0x58, 0x8e, 0x72,
+    0xa3, 0x9a, 0x7c, 0xa0, 0x7f, 0x6d, 0xa6, 0x83, 0x7e, 0x8d, 0x83, 0x88,
+    0x86, 0x68, 0x8d, 0x96, 0xaa, 0x78, 0x90, 0xa5, 0x9c, 0x9d, 0x99, 0x88,
+    0xb0, 0x82, 0x6f, 0x7e, 0xad, 0xa9, 0x7b, 0x6a, 0xba, 0x6c, 0x6d, 0x89,
+    0xc1, 0x9e, 0x8e, 0x2f, 0xf2, 0x77, 0x50, 0x73, 0xdb, 0xc4, 0x9c, 0x6c,
+    0xd0, 0x90, 0x88, 0xbe, 0x97, 0xb9, 0x9e, 0x6e, 0xbe, 0x8e, 0x83, 0x8e,
+    0x96, 0x98, 0x4c, 0x4e, 0xa7, 0x8d, 0x43, 0x92, 0x8f, 0x92, 0x6d, 0x27,
+    0x94, 0x73, 0x5f, 0x42, 0x7c, 0xa7, 0x8a, 0x5a, 0x81, 0x60, 0x85, 0x66,
+    0x73, 0x72, 0x74, 0x9d, 0x5a, 0x9e, 0xa3, 0x71, 0x75, 0x91, 0x4f, 0xa2,
+    0x67, 0xa6, 0x91, 0x64, 0x92, 0x7e, 0x95, 0x8d, 0x6e, 0xbe, 0x9b, 0x57,
+    0x9b, 0x82, 0x89, 0x70, 0x6f, 0x9e, 0x7e, 0x86, 0x97, 0x81, 0x85, 0x8e,
+    0x70, 0x96, 0x6c, 0x72, 0xab, 0x6d, 0x9c, 0x91, 0xa0, 0x8a, 0x8d, 0x88,
+    0x9e, 0x75, 0xc6, 0x76, 0x7c, 0xa7, 0x6b, 0xa8, 0x94, 0x72, 0xb6, 0x78,
+    0x8d, 0x90, 0x7b, 0x8c, 0xa6, 0x65, 0xad, 0x9b, 0xaa, 0x94, 0x89, 0x7d,
+    0x90, 0x69, 0xaa, 0x7e, 0x9e, 0xad, 0x7f, 0x94, 0x81, 0x7d, 0xa1, 0x7b,
+    0x6c, 0x65, 0x83, 0x95, 0x89, 0x75, 0x93, 0x87, 0x94, 0x87, 0xa8, 0x92,
+    0x8d, 0xa6, 0x9f, 0x78, 0xaa, 0x72, 0x95, 0x94, 0xac, 0xa6, 0x91, 0x5a,
+    0xdb, 0x82, 0x55, 0xb6, 0xc1, 0xa3, 0x84, 0x4f, 0xc9, 0x88, 0x53, 0x8f,
+    0xbb, 0xae, 0x9b, 0x8a, 0xd8, 0xa9, 0x68, 0xc2, 0xa0, 0xa9, 0x87, 0x6b,
+    0xbd, 0x99, 0x7e, 0x86, 0x88, 0xa7, 0x5e, 0x53, 0xa4, 0x84, 0x6b, 0x6e,
+    0x89, 0x95, 0x84, 0x2d, 0xb5, 0x43, 0x3e, 0x50, 0x71, 0x96, 0x9a, 0x5b,
+    0xa1, 0x60, 0x80, 0x70, 0x6a, 0x73, 0x8f, 0x95, 0x52, 0x9b, 0xae, 0x71,
+    0x76, 0x7d, 0x61, 0x99, 0x5b, 0xc3, 0xa8, 0x76, 0x98, 0x72, 0x7f, 0x8a,
+    0x66, 0xc7, 0xa3, 0x7b, 0x8e, 0x8f, 0x70, 0x74, 0x6a, 0xae, 0x85, 0x83,
+    0x96, 0x7d, 0x98, 0xa7, 0x8f, 0x94, 0x7e, 0x84, 0x96, 0x7a, 0xab, 0x7d,
+    0x83, 0xb1, 0x6f, 0x7d, 0x9f, 0x80, 0xca, 0x8f, 0x9b, 0xa9, 0x69, 0x7a,
+    0x92, 0x73, 0xaa, 0x74, 0x88, 0x98, 0x87, 0x8f, 0xa7, 0x68, 0xa0, 0x74,
+    0x97, 0x95, 0x6e, 0x6f, 0x83, 0x53, 0x9b, 0x79, 0x71, 0x87, 0x7d, 0x8b,
+    0x79, 0x87, 0xa3, 0x75, 0x68, 0x73, 0x7e, 0x89, 0x8f, 0x81, 0x98, 0x7a,
+    0x9a, 0x83, 0x9d, 0x95, 0x90, 0x98, 0x97, 0x57, 0x93, 0x7e, 0xa2, 0x9a,
+    0xa8, 0x8a, 0x85, 0x53, 0xbd, 0x7a, 0x61, 0x8b, 0xca, 0xac, 0x9b, 0x2e,
+    0xe8, 0xa5, 0x66, 0x86, 0xca, 0xa7, 0xa0, 0x85, 0xcf, 0xa4, 0x6a, 0xc2,
+    0xb0, 0xaa, 0x76, 0x76, 0xb6, 0xa2, 0x72, 0xa9, 0xa1, 0xa1, 0x67, 0x67,
+    0xac, 0x90, 0x70, 0x6d, 0x8f, 0xb5, 0x6d, 0x3b, 0x85, 0x64, 0x4a, 0x6e,
+    0x72, 0x9f, 0x98, 0x5b, 0x97, 0x3e, 0x8a, 0x6a, 0x6c, 0x7d, 0x77, 0x98,
+    0x5a, 0x92, 0xa3, 0x81, 0x6f, 0x91, 0x7b, 0xa6, 0x6e, 0x9c, 0x9b, 0x5f,
+    0x9e, 0x7e, 0x77, 0x9d, 0x88, 0xc6, 0x81, 0x5a, 0x93, 0x8b, 0x6c, 0x71,
+    0x63, 0x9e, 0x78, 0x79, 0x70, 0x90, 0x95, 0x9f, 0x71, 0xa9, 0x90, 0x73,
+    0x98, 0x8a, 0xa5, 0x8e, 0x87, 0xb0, 0x79, 0x79, 0x92, 0x7d, 0xcc, 0xa8,
+    0x7a, 0x92, 0x82, 0x91, 0x90, 0x69, 0xa4, 0x9b, 0x97, 0x8f, 0x75, 0x7c,
+    0xa3, 0x69, 0xb5, 0x87, 0x8d, 0x88, 0x7b, 0x94, 0x8b, 0x55, 0xa2, 0x6d,
+    0x89, 0x8e, 0x81, 0x8a, 0x9e, 0x87, 0x86, 0x83, 0x8b, 0x84, 0x87, 0xa7,
+    0x8e, 0x79, 0xa4, 0x9c, 0x99, 0x82, 0xa3, 0x8f, 0x91, 0x9a, 0x95, 0x5b,
+    0x9f, 0x6e, 0x85, 0x93, 0xa6, 0x9a, 0x91, 0x4c, 0xd8, 0x6b, 0x6d, 0x85,
+    0xde, 0xaa, 0x97, 0x51, 0xcf, 0x8c, 0x5f, 0x9a, 0xc2, 0x9d, 0x9a, 0x7c,
+    0xc6, 0xb1, 0x84, 0xac, 0xba, 0xa5, 0x7c, 0x76, 0xbd, 0x93, 0x7f, 0xa0,
+    0x86, 0xae, 0x47, 0x41, 0x88, 0x82, 0x62, 0x62, 0x73, 0xad, 0x6b, 0x23,
+    0xa0, 0x48, 0x5a, 0x5a, 0x8f, 0x98, 0xbd, 0x5c, 0x9c, 0x72, 0x7c, 0x68,
+    0x50, 0x78, 0x91, 0xab, 0x5c, 0xc1, 0xc6, 0x66, 0x87, 0x86, 0x60, 0x99,
+    0x65, 0xac, 0x94, 0x91, 0x7e, 0x8c, 0x7d, 0x9b, 0x70, 0xb2, 0x9a, 0x7d,
+    0x82, 0x91, 0x6b, 0x86, 0x6f, 0xbb, 0x7f, 0x66, 0x7a, 0x79, 0x94, 0x96,
+    0x71, 0xa5, 0x75, 0x73, 0x95, 0x81, 0xa4, 0x8b, 0x87, 0xaa, 0x8e, 0x92,
+    0xa9, 0x82, 0xb0, 0x92, 0x89, 0xa7, 0x83, 0x81, 0x8c, 0x6d, 0xc4, 0x7a,
+    0x89, 0xa5, 0xa1, 0xa2, 0xa4, 0x6b, 0xa4, 0x82, 0x90, 0xb2, 0x8d, 0x72,
+    0x83, 0x60, 0xa7, 0x7a, 0x80, 0x97, 0x65, 0x90, 0x87, 0x85, 0xae, 0x71,
+    0x7d, 0x71, 0x98, 0xa8, 0x90, 0x75, 0xa9, 0x96, 0xa2, 0x91, 0x7b, 0x6b,
+    0xa0, 0x9d, 0x8d, 0x5d, 0xa4, 0x79, 0x8c, 0xa4, 0xad, 0x94, 0x7e, 0x77,
+    0xb6, 0x92, 0x74, 0xaf, 0xb5, 0x9b, 0x99, 0x67, 0xe7, 0x8e, 0x6a, 0x87,
+    0xc1, 0x98, 0x9b, 0x7e, 0xd7, 0x9b, 0x5b, 0xae, 0xc9, 0x94, 0x7a, 0x6d,
+    0x9e, 0xb4, 0x86, 0x8e, 0xa3, 0xa1, 0x5e, 0x5d, 0x8e, 0x8f, 0x6b, 0x59,
+    0xa5, 0xa9, 0x69, 0x20, 0xa4, 0x64, 0x35, 0x61, 0x83, 0x9d, 0x8a, 0x4e,
+    0x8b, 0x6c, 0x5e, 0x5b, 0x68, 0x76, 0x89, 0x94, 0x5f, 0x87, 0x98, 0x7a,
+    0x5d, 0x81, 0x89, 0xa6, 0x54, 0xa3, 0xb4, 0x7b, 0x83, 0x8a, 0x90, 0x8b,
+    0x86, 0xbc, 0x86, 0x59, 0x91, 0x79, 0x71, 0x6b, 0x7c, 0x94, 0x98, 0x7f,
+    0x81, 0x76, 0x85, 0xad, 0x69, 0xa8, 0x83, 0x8c, 0x8f, 0x70, 0x9a, 0x91,
+    0x78, 0xb3, 0x8f, 0x6d, 0x90, 0x86, 0xbd, 0x97, 0x7f, 0xaf, 0x7e, 0x90,
+    0x8f, 0x63, 0xa2, 0x93, 0x6e, 0xab, 0x75, 0x72, 0x8d, 0x74, 0xa1, 0x72,
+    0x82, 0xaa, 0x70, 0x82, 0x8d, 0x67, 0x94, 0x91, 0x92, 0xa5, 0x7f, 0xa5,
+    0x6f, 0x6d, 0xaf, 0x80, 0x89, 0x7d, 0x92, 0x99, 0x92, 0x72, 0x9d, 0x7d,
+    0x92, 0x78, 0xa9, 0x89, 0xa9, 0x9b, 0xa3, 0x73, 0x98, 0x71, 0x98, 0x86,
+    0x9e, 0x97, 0x9e, 0x6a, 0xb9, 0x6a, 0x6e, 0x90, 0xde, 0x94, 0x9a, 0x52,
+    0xdd, 0xa9, 0x6a, 0x79, 0xb9, 0xa3, 0xaa, 0x95, 0xba, 0xa2, 0x75, 0xc2,
+    0xbf, 0xb5, 0x6d, 0x8d, 0xae, 0x9b, 0x8d, 0x9a, 0x92, 0xb4, 0x5e, 0x4b,
+    0x8b, 0x99, 0x4f, 0x65, 0x94, 0xb6, 0x5d, 0x3a, 0xa3, 0x77, 0x51, 0x4e,
+    0x6d, 0xa3, 0x94, 0x59, 0x80, 0x56, 0x8c, 0x67, 0x67, 0x74, 0x99, 0x85,
+    0x57, 0x7b, 0x9e, 0x7e, 0x84, 0x85, 0x94, 0x96, 0x71, 0xbf, 0x97, 0x5f,
+    0x7d, 0x80, 0x93, 0x87, 0x6b, 0xb9, 0x7d, 0x8b, 0x84, 0x84, 0x6b, 0x8c,
+    0x6c, 0xc4, 0x85, 0x82, 0x87, 0x8d, 0x64, 0x90, 0x80, 0xb6, 0x9a, 0x70,
+    0x9c, 0x68, 0xa0, 0x88, 0x81, 0x9d, 0x83, 0x75, 0x9d, 0x84, 0xbf, 0x8f,
+    0x83, 0x9b, 0x75, 0x82, 0x9c, 0x76, 0xa4, 0x9d, 0x8a, 0xa7, 0x8e, 0x96,
+    0x9c, 0x64, 0xc0, 0x95, 0x88, 0xa5, 0x6f, 0x74, 0x7e, 0x5d, 0x9f, 0x7d,
+    0x89, 0x81, 0x71, 0xa8, 0x82, 0x6e, 0x9b, 0x9a, 0x6f, 0xa5, 0x88, 0x89,
+    0xa4, 0x7e, 0xa4, 0x90, 0xa1, 0x83, 0x8b, 0x9c, 0x9a, 0x89, 0xa2, 0x89,
+    0x9d, 0x5d, 0x86, 0xa5, 0xc4, 0x96, 0x9c, 0x85, 0xd6, 0x7c, 0x69, 0x88,
+    0xc9, 0xa5, 0x9b, 0x60, 0xea, 0xab, 0x62, 0x9f, 0xd1, 0xa5, 0x86, 0x7e,
+    0xb3, 0xbd, 0x7a, 0xa1, 0xbd, 0xa0, 0x7c, 0x92, 0xa6, 0xa3, 0x7d, 0xa9,
+    0x98, 0xa6, 0x71, 0x5c, 0x9b, 0x9b, 0x58, 0x6f, 0x8f, 0xaa, 0x5e, 0x3b,
+    0xa6, 0x5f, 0x3a, 0x79, 0x94, 0xa5, 0x84, 0x6f, 0x83, 0x5d, 0x75, 0x65,
+    0x6c, 0x77, 0x86, 0xad, 0x4a, 0x92, 0x8e, 0x8a, 0x8f, 0x7b, 0x72, 0x96,
+    0x79, 0xa6, 0xa8, 0x6d, 0x7b, 0x7b, 0x98, 0xa9, 0x79, 0xb9, 0x9e, 0x8f,
+    0x90, 0x6d, 0x76, 0x82, 0x81, 0xc1, 0x95, 0x7c, 0x97, 0x8d, 0x95, 0xa2,
+    0x7c, 0xa4, 0x7b, 0x9b, 0x7f, 0x6f, 0xac, 0x83, 0x7e, 0xa1, 0x7c, 0x7c,
+    0xa1, 0x7a, 0xa1, 0x6d, 0x95, 0x86, 0x77, 0x98, 0x8e, 0x58, 0xa2, 0x76,
+    0x8e, 0xa8, 0x94, 0x90, 0xa7, 0x62, 0xb8, 0x8a, 0x9f, 0xac, 0x87, 0x91,
+    0x88, 0x50, 0xa7, 0x83, 0x88, 0x65, 0x7a, 0x92, 0x9d, 0x70, 0xa9, 0x99,
+    0x7c, 0x87, 0x8c, 0x96, 0x8e, 0x73, 0xa4, 0xa7, 0x9b, 0x70, 0x99, 0x96,
+    0x8f, 0x88, 0xb4, 0x85, 0xa8, 0x6a, 0x9e, 0x78, 0xb0, 0x82, 0x9f, 0x89,
+    0xc9, 0x8d, 0x71, 0x7f, 0xc0, 0x98, 0xa0, 0x6d, 0xd2, 0x8e, 0x64, 0x9e,
+    0xb2, 0xa9, 0x93, 0x6e, 0xcc, 0xbb, 0x89, 0xb1, 0xc1, 0x9b, 0x86, 0x94,
+    0xb5, 0xb5, 0x95, 0xa0, 0x9c, 0x9b, 0x62, 0x5f, 0x7b, 0x91, 0x69, 0x74,
+    0x9e, 0xa3, 0x81, 0x30, 0x85, 0x59, 0x49, 0x5e, 0x83, 0x85, 0x7d, 0x6a,
+    0x90, 0x51, 0x80, 0x5e, 0x64, 0x6f, 0x99, 0x93, 0x75, 0x9a, 0xa7, 0x72,
+    0x6c, 0x5d, 0xa3, 0x93, 0x87, 0xa7, 0xbd, 0x6f, 0x92, 0x6d, 0x85, 0x98,
+    0x6f, 0xc7, 0xb6, 0x7c, 0x80, 0x71, 0x8a, 0x9f, 0x71, 0xb5, 0x8c, 0x6d,
+    0xac, 0x7b, 0x72, 0xb7, 0x69, 0xa6, 0x9d, 0x66, 0xab, 0x7a, 0x8b, 0x70,
+    0x8c, 0x9e, 0x86, 0x75, 0x96, 0x7b, 0xa3, 0x93, 0x8f, 0xb7, 0x84, 0x8c,
+    0x87, 0x56, 0xae, 0x82, 0x71, 0xa3, 0x8d, 0x93, 0xaf, 0x59, 0xb3, 0x8a,
+    0x97, 0x99, 0x75, 0x73, 0x8e, 0x51, 0xae, 0x84, 0x8b, 0x7a, 0x76, 0x77,
+    0x6e, 0x75, 0xa4, 0x8a, 0x75, 0x8e, 0x8f, 0xa2, 0x96, 0x76, 0x9a, 0x80,
+    0x96, 0x7d, 0x94, 0x71, 0x8a, 0x90, 0xac, 0x82, 0xa5, 0x61, 0xa3, 0x84,
+    0xac, 0x8f, 0x74, 0x5c, 0xb6, 0x77, 0x8b, 0x9b, 0xb5, 0x8b, 0xb6, 0x52,
+    0xd7, 0xaa, 0x4b, 0x8c, 0xbf, 0xb8, 0x9f, 0x6d, 0xcb, 0xa3, 0x6e, 0x97,
+    0xaa, 0x8d, 0x7c, 0x99, 0xc0, 0xd0, 0x9e, 0xb7, 0x93, 0xaa, 0x5a, 0x6a,
+    0x7d, 0x9a, 0x63, 0x71, 0x78, 0x8c, 0x67, 0x43, 0x87, 0x52, 0x64, 0x68,
+    0x68, 0x9c, 0x65, 0x60, 0x7a, 0x35, 0x68, 0x66, 0x63, 0x69, 0x8d, 0x8f,
+    0x72, 0x9b, 0x99, 0x5b, 0x80, 0x67, 0x93, 0xa2, 0x97, 0x9d, 0x8c, 0x68,
+    0x80, 0x86, 0x96, 0x91, 0x64, 0xbf, 0x98, 0x63, 0x83, 0x85, 0x61, 0x97,
+    0x6a, 0xac, 0xb4, 0x99, 0x8d, 0x7b, 0x7b, 0xad, 0x8b, 0xb2, 0x9e, 0x7f,
+    0x9a, 0x73, 0x91, 0x84, 0x89, 0x9f, 0x8a, 0x87, 0x8b, 0x72, 0x8e, 0x79,
+    0x86, 0xa7, 0x77, 0x84, 0x90, 0x58, 0xb2, 0x90, 0x93, 0xa0, 0x7f, 0x8a,
+    0x91, 0x5a, 0xb1, 0x80, 0x99, 0xc1, 0x80, 0x7d, 0x97, 0x5c, 0x9a, 0x8c,
+    0x71, 0x96, 0x7e, 0x7f, 0xad, 0x7b, 0xb9, 0x8a, 0x84, 0x84, 0x81, 0x97,
+    0x94, 0x64, 0x9f, 0x7e, 0x9b, 0x8d, 0x7d, 0x8d, 0x9a, 0x9e, 0xac, 0x72,
+    0xb2, 0x73, 0x81, 0x84, 0xc8, 0x81, 0x88, 0x72, 0xbe, 0x85, 0x86, 0x97,
+    0xd3, 0x8a, 0xc7, 0x75, 0xce, 0x9c, 0x69, 0xa6, 0xb0, 0xa1, 0x8e, 0x64,
+    0xb1, 0xa6, 0x67, 0xaa, 0xcd, 0x95, 0x97, 0xa2, 0xb2, 0xb2, 0x85, 0x9a,
+    0x9d, 0xa3, 0x5e, 0x73, 0x6e, 0xae, 0x50, 0x83, 0x8c, 0xab, 0x92, 0x43,
+    0x6b, 0x66, 0x43, 0x5c, 0x8f, 0x8a, 0x9a, 0x6c, 0x84, 0x48, 0x80, 0x6b,
+    0x8d, 0x82, 0xaf, 0x89, 0x71, 0x9f, 0xa4, 0x9a, 0x7b, 0x68, 0x91, 0xaa,
+    0x6b, 0xa3, 0x9c, 0x62, 0x8d, 0x6d, 0x87, 0x87, 0x81, 0x9a, 0x97, 0x6c,
+    0x9c, 0x76, 0x63, 0xbc, 0x62, 0xbc, 0xb0, 0x97, 0xa7, 0x81, 0x70, 0x8f,
+    0x7d, 0xb2, 0xa6, 0x98, 0xa1, 0x7b, 0x8e, 0x83, 0x8c, 0xa2, 0x7e, 0x73,
+    0x99, 0x65, 0xc1, 0x77, 0x8e, 0xbc, 0x72, 0xa6, 0x8c, 0x55, 0xab, 0x8e,
+    0x7d, 0xa3, 0x79, 0x80, 0x9e, 0x6b, 0xa9, 0x6c, 0x80, 0xb6, 0x81, 0xa6,
+    0x92, 0x5b, 0xb7, 0x99, 0x81, 0x7e, 0x8e, 0x89, 0x97, 0x86, 0x93, 0x86,
+    0x7b, 0x9a, 0x7f, 0x9a, 0x8e, 0x69, 0xa3, 0xa4, 0x9f, 0x8b, 0x96, 0x6f,
+    0x8b, 0x97, 0xb4, 0x74, 0x96, 0x53, 0x99, 0x91, 0xa7, 0xa8, 0x69, 0x72,
+    0xc9, 0x85, 0x99, 0x93, 0xc0, 0x90, 0xaa, 0x7f, 0xc7, 0x71, 0x74, 0x8d,
+    0xb7, 0xab, 0x91, 0x69, 0xb4, 0x9b, 0x7d, 0x95, 0xc3, 0xb0, 0x9b, 0xa9,
+    0xb3, 0x9f, 0x79, 0xa5, 0x9f, 0xad, 0x6b, 0x85, 0x90, 0xad, 0x69, 0x62,
+    0x7e, 0xa6, 0x69, 0x4e, 0x80, 0x7e, 0x52, 0x57, 0x5f, 0x95, 0x72, 0x4c,
+    0x87, 0x4e, 0x5a, 0x62, 0x7d, 0x70, 0x92, 0x98, 0x76, 0x8e, 0x99, 0x7d,
+    0x73, 0x6d, 0x86, 0x8e, 0x6b, 0x80, 0xa7, 0x9d, 0x91, 0x73, 0x95, 0x70,
+    0x80, 0xc3, 0x9f, 0x8b, 0x72, 0x86, 0x6b, 0xad, 0x76, 0xbe, 0xad, 0x8e,
+    0x9c, 0x78, 0x6a, 0xbf, 0x7d, 0xa8, 0x88, 0x8a, 0x8b, 0x8c, 0x9c, 0x8c,
+    0x8a, 0x85, 0x73, 0x92, 0xa2, 0x7b, 0xa5, 0x96, 0x9b, 0xa3, 0x6c, 0x80,
+    0xa6, 0x63, 0xac, 0x98, 0xa3, 0x9a, 0x83, 0x8a, 0x8c, 0x63, 0xb9, 0x8c,
+    0x99, 0xa1, 0x7a, 0x6c, 0x9e, 0x59, 0x90, 0x84, 0x8a, 0x93, 0x8f, 0x87,
+    0x98, 0x84, 0x99, 0xa4, 0x72, 0x6d, 0x95, 0xa2, 0x95, 0x72, 0xc3, 0x88,
+    0x8f, 0x6a, 0x77, 0x7d, 0x8b, 0xae, 0xa3, 0x7c, 0xa8, 0x5d, 0x7c, 0xa8,
+    0xa1, 0x85, 0x7e, 0x8c, 0xac, 0x8d, 0x73, 0x88, 0xc1, 0x89, 0xaa, 0x89,
+    0xb2, 0x92, 0x75, 0x9a, 0x9c, 0x8e, 0xb9, 0xaa, 0xaa, 0xac, 0x78, 0x85,
+    0xbc, 0x9f, 0x6d, 0xb7, 0x89, 0xa6, 0xb3, 0x8e, 0xa5, 0xbb, 0x6b, 0x9d,
+    0x8f, 0x8b, 0x69, 0x7a, 0x82, 0x99, 0x8c, 0x49, 0x87, 0x74, 0x37, 0x63,
+    0x5d, 0x92, 0x77, 0x66, 0x63, 0x56, 0x77, 0x5d, 0x7f, 0x68, 0x97, 0x74,
+    0x84, 0x94, 0x7d, 0x7d, 0x91, 0x78, 0x87, 0x96, 0x7f, 0x97, 0x94, 0x6f,
+    0x89, 0x6c, 0x96, 0x71, 0x83, 0x8f, 0x8a, 0x89, 0x7d, 0x84, 0x8a, 0xa6,
+    0x7b, 0x95, 0x89, 0x77, 0x94, 0x80, 0x7f, 0x93, 0x5e, 0xbb, 0x9c, 0xa8,
+    0xa2, 0x7e, 0xa6, 0x86, 0x7d, 0x8b, 0x92, 0x73, 0xac, 0x78, 0xaa, 0x98,
+    0xb1, 0x94, 0x79, 0x8b, 0x8f, 0x70, 0xa7, 0xae, 0x92, 0xad, 0xb1, 0x8b,
+    0xb0, 0x78, 0xbc, 0xa9, 0xa4, 0xa3, 0x9e, 0x76, 0x89, 0x67, 0xab, 0x98,
+    0x75, 0x8c, 0x86, 0x95, 0x9e, 0x77, 0x96, 0x85, 0x8c, 0x8e, 0x8b, 0x8a,
+    0x8a, 0x4b, 0x71, 0x8a, 0x9b, 0x6d, 0x6e, 0x89, 0x81, 0x82, 0xa7, 0x98,
+    0xa5, 0x66, 0x72, 0x8b, 0x99, 0x9a, 0x8b, 0x8b, 0x9f, 0x87, 0x79, 0x84,
+    0x99, 0x6d, 0x90, 0x7d, 0x9d, 0xa7, 0x81, 0xa3, 0x9d, 0x96, 0x82, 0x86,
+    0xa2, 0x8e, 0x8d, 0x7f, 0x84, 0x8c, 0x98, 0xbc, 0x83, 0xb4, 0xb5, 0x78,
+    0x7d, 0xab, 0x8d, 0x87, 0x71, 0x8d, 0x6e, 0x8f, 0x89, 0xaa, 0x7c, 0x6f,
+    0x71, 0x69, 0x65, 0x60, 0x81, 0x91, 0x94, 0x6d, 0x76, 0x66, 0x74, 0x5e,
+    0x77, 0x7c, 0xa2, 0xa6, 0x70, 0x90, 0xa3, 0x68, 0x83, 0x69, 0x71, 0x72,
+    0x6c, 0xa9, 0x85, 0x71, 0x88, 0x60, 0x90, 0x84, 0x8a, 0xba, 0x8b, 0x8c,
+    0x72, 0x8f, 0x98, 0x84, 0x8b, 0x8a, 0xb1, 0xa2, 0x93, 0x8d, 0x86, 0x99,
+    0xa2, 0x99, 0xb0, 0xa6, 0x92, 0x78, 0x86, 0x87, 0x9c, 0x9d, 0x6f, 0x92,
+    0x9a, 0x8a, 0xbf, 0xaa, 0xa3, 0xa2, 0x71, 0x8d, 0x93, 0x70, 0xb5, 0x9c,
+    0xa8, 0x97, 0xb4, 0x93, 0xa6, 0x75, 0xbb, 0xa3, 0x92, 0x95, 0x95, 0x94,
+    0x90, 0x5b, 0xbf, 0x92, 0x8a, 0x95, 0xa0, 0xa1, 0x68, 0x7e, 0x9a, 0x7f,
+    0x88, 0xa7, 0x93, 0xa1, 0x7a, 0x93, 0x95, 0x8b, 0x96, 0x94, 0x70, 0xa0,
+    0x70, 0x8f, 0x9d, 0x96, 0x8e, 0x9c, 0x90, 0x9f, 0x7e, 0x83, 0x84, 0x9e,
+    0x7f, 0x65, 0x72, 0x84, 0x64, 0x94, 0x75, 0xa7, 0x62, 0xa3, 0x8a, 0x9b,
+    0x82, 0x99, 0x87, 0x70, 0x81, 0x6d, 0xac, 0x7b, 0x74, 0x68, 0x5d, 0x95,
+    0xa0, 0x6e, 0x84, 0xab, 0x79, 0x8e, 0x8b, 0x79, 0x7b, 0x83, 0xa0, 0x7b,
+    0x96, 0x71, 0x5d, 0xad, 0xa4, 0x82, 0x79, 0x96, 0x73, 0x84, 0x7d, 0x98,
+    0x87, 0x93, 0x86, 0xa6, 0x7f, 0x7c, 0x71, 0x9d, 0xa4, 0x9b, 0x8a, 0x7c,
+    0x87, 0x6a, 0x7f, 0x8d, 0x97, 0x92, 0xa0, 0x88, 0x77, 0x7d, 0x70, 0x9c,
+    0x9f, 0xa0, 0x71, 0xa3, 0x73, 0x95, 0x76, 0x79, 0x94, 0x95, 0x83, 0x8b,
+    0x8d, 0x82, 0x7a, 0x77, 0xa6, 0x88, 0x72, 0x7a, 0x90, 0x76, 0x7f, 0x95,
+    0x83, 0x90, 0x9e, 0x7c, 0x8e, 0x9a, 0x6b, 0xa4, 0x98, 0x9f, 0x86, 0x8c,
+    0x76, 0x70, 0x74, 0x97, 0x7e, 0xa4, 0x5f, 0xa3, 0xa7, 0x7f, 0x67, 0x8d,
+    0x82, 0x95, 0x93, 0x99, 0x82, 0x70, 0x75, 0xa8, 0xa1, 0xaf, 0x8a, 0x8a,
+    0xb0, 0x89, 0x88, 0x6b, 0x98, 0xaf, 0x75, 0x7f, 0x86, 0x90, 0x8f, 0x8c,
+    0x84, 0x8d, 0x7f, 0x8b, 0x94, 0x9f, 0x80, 0x8b, 0x93, 0xa2, 0x98, 0xa5,
+    0x83, 0x81, 0x8a, 0xaa, 0x86, 0xa3, 0xb0, 0xac, 0x64, 0x9c, 0x7c, 0x93,
+    0xac, 0x85, 0x7f, 0x88, 0x7a, 0xa5, 0x75, 0x69, 0x94, 0xa8, 0x95, 0xa9,
+    0x6f, 0x9f, 0x85, 0x8a, 0xa5, 0x97, 0x98, 0xa9, 0x76, 0x80, 0x7e, 0x95,
+    0x89, 0xaf, 0x68, 0x7b, 0xb4, 0x8a, 0x6b, 0xa4, 0x7b, 0x90, 0x79, 0xba,
+    0x9f, 0x82, 0x7d, 0x89, 0x85, 0x82, 0x94, 0xa5, 0x78, 0x8f, 0x6f, 0x71,
+    0x62, 0x66, 0x73, 0x98, 0x8c, 0x7d, 0x81, 0xa2, 0x69, 0x7c, 0x76, 0xa4,
+    0x94, 0x8f, 0x6f, 0x8a, 0x94, 0x8e, 0x8a, 0x88, 0x8c, 0xa3, 0x6f, 0xa2,
+    0x7d, 0x90, 0x8f, 0x96, 0x6c, 0x76, 0x6e, 0x8e, 0x82, 0x85, 0x7f, 0x93,
+    0x81, 0x83, 0x7b, 0x9f, 0x91, 0x89, 0x75, 0x9c, 0x9f, 0x86, 0x7a, 0x8c,
+    0x7a, 0x7b, 0x82, 0xae, 0x6a, 0x7d, 0x82, 0x82, 0xa0, 0x85, 0x99, 0x9f,
+    0x88, 0x8b, 0x8c, 0x8f, 0x90, 0x96, 0x8e, 0x98, 0xa3, 0x87, 0x7f, 0x9b,
+    0x94, 0x73, 0x96, 0x86, 0x72, 0x7c, 0x75, 0x7c, 0x90, 0x79, 0x83, 0x80,
+    0x79, 0x9e, 0x9c, 0x8e, 0x99, 0x8c, 0x7a, 0x9c, 0x8d, 0x99, 0x9d, 0x84,
+    0xa5, 0x93, 0x85, 0x96, 0x88, 0x94, 0x80, 0x90, 0x73, 0xa3, 0x7c, 0xa1,
+    0x88, 0xa4, 0x98, 0x9f, 0x9e, 0x92, 0x6c, 0xa0, 0x84, 0x87, 0x8a, 0x83,
+    0x7b, 0x91, 0x8c, 0x9e, 0x73, 0xa6, 0x93, 0xa0, 0x8d, 0x98, 0x74, 0xa1,
+    0x83, 0x9a, 0x80, 0xbc, 0x62, 0x70, 0x9e, 0xad, 0x9e, 0x8f, 0x8f, 0x9e,
+    0x7e, 0xac, 0xb0, 0xa9, 0x79, 0x6f, 0x79, 0x8f, 0x7e, 0x71, 0x8d, 0xab,
+    0x97, 0x76, 0x86, 0xa2, 0x98, 0x95, 0x8b, 0x9b, 0x75, 0x7a, 0x71, 0x85,
+    0x7f, 0x61, 0x76, 0x8e, 0x99, 0x91, 0x88, 0x73, 0x71, 0x65, 0x82, 0xa0,
+    0x9b, 0x8f, 0x79, 0x70, 0x78, 0x66, 0x85, 0x94, 0x8b, 0x91, 0x75, 0x80,
+    0x9c, 0x94, 0x7f, 0xa5, 0x82, 0x91, 0x7d, 0x76, 0x80, 0x78, 0x83, 0x82,
+    0x79, 0x98, 0x83, 0x87, 0x94, 0x71, 0x73, 0x77, 0x71, 0x94, 0x6a, 0xa8,
+    0x9e, 0x8d, 0x90, 0x78, 0x7a, 0x81, 0x9c, 0x91, 0x96, 0x80, 0x79, 0x83,
+    0x92, 0x9f, 0x8a, 0x84, 0x8e, 0x97, 0x8c, 0x81, 0x87, 0x74, 0x8b, 0x8e,
+    0xa7, 0x86, 0x8b, 0x8a, 0x8e, 0x8f, 0x9b, 0x6b, 0x82, 0x8a, 0x9f, 0x7a,
+    0x96, 0x80, 0x91, 0x94, 0xa6, 0x8e, 0x7a, 0x97, 0x8a, 0x6c, 0xad, 0xa1,
+    0x78, 0x95, 0x9d, 0x9d, 0x88, 0x94, 0x99, 0x86, 0x80, 0x9b, 0x7c, 0x9c,
+    0x87, 0x7a, 0xa0, 0xa8, 0x83, 0x74, 0x8e, 0x9b, 0x65, 0x95, 0x83, 0xc2,
+    0x69, 0x88, 0x87, 0xa7, 0x86, 0x98, 0x9f, 0xc6, 0x5c, 0x7f, 0xb9, 0x9c,
+    0x8b, 0x6e, 0x95, 0xbd, 0x72, 0x83, 0xbf, 0xb1, 0x89, 0x6d, 0x89, 0x8e,
+    0x9d, 0x87, 0x95, 0x92, 0x76, 0x8d, 0x7f, 0x7f, 0x6d, 0x9d, 0x7b, 0x95,
+    0x86, 0x69, 0x90, 0xa0, 0x62, 0x7c, 0x56, 0xa0, 0x9c, 0x8b, 0x81, 0x79,
+    0xa6, 0x73, 0x69, 0xaa, 0x7b, 0x87, 0x8b, 0x7e, 0xa1, 0x9f, 0x6d, 0xa6,
+    0x7e, 0x7e, 0x87, 0x7c, 0xa5, 0x84, 0x7b, 0xa2, 0xae, 0x92, 0x8e, 0x67,
+    0x93, 0x88, 0x8b, 0xa2, 0x8d, 0x96, 0x92, 0x8e, 0x71, 0x7a, 0x82, 0x80,
+    0x9e, 0x8b, 0x7b, 0x87, 0x96, 0xa0, 0xa4, 0x92, 0x88, 0x7e, 0x77, 0x8e,
+    0x91, 0x7e, 0x81, 0x77, 0x79, 0x93, 0x8d, 0x9d, 0x8a, 0x71, 0x8d, 0x88,
+    0x9d, 0x89, 0x85, 0x94, 0x99, 0x80, 0x89, 0x8f, 0x87, 0x81, 0x83, 0x74,
+    0x8a, 0x89, 0x68, 0x7e, 0x99, 0x82, 0x8c, 0x76, 0xc6, 0x8f, 0x90, 0x7d,
+    0x6c, 0x68, 0xbd, 0x90, 0x78, 0x9d, 0x7b, 0xa3, 0x99, 0x76, 0xaf, 0x8d,
+    0x7d, 0x84, 0x7f, 0x9f, 0x8b, 0x7a, 0xaa, 0xa8, 0x79, 0x89, 0x8f, 0x8f,
+    0x71, 0x80, 0x7f, 0xaa, 0x85, 0x70, 0xa8, 0x96, 0x6c, 0x8c, 0xaf, 0xeb,
+    0x57, 0x7e, 0xcf, 0x8d, 0x93, 0x72, 0xa6, 0xd2, 0x52, 0xab, 0xbb, 0xa8,
+    0x8d, 0x82, 0x7a, 0xbc, 0x72, 0x95, 0xa3, 0xa7, 0x8b, 0x74, 0x84, 0x85,
+    0x6a, 0x85, 0x92, 0x9f, 0x91, 0x6b, 0x9b, 0x73, 0x77, 0xa2, 0x7f, 0x81,
+    0x8e, 0x8b, 0x71, 0x8c, 0x7f, 0x60, 0x86, 0x81, 0x9c, 0x86, 0x93, 0x65,
+    0x84, 0x84, 0x89, 0xa2, 0x98, 0x67, 0x88, 0x71, 0x92, 0x80, 0x65, 0xa2,
+    0xa5, 0x99, 0x85, 0x95, 0x8f, 0x85, 0x8f, 0x82, 0x7e, 0x9a, 0x8a, 0x74,
+    0x9d, 0x75, 0x88, 0x7e, 0xa2, 0x77, 0x82, 0x9e, 0x78, 0xa1, 0x74, 0x79,
+    0x7f, 0x87, 0x91, 0x8d, 0x7a, 0x73, 0x96, 0xa2, 0xa3, 0x81, 0x7d, 0x8a,
+    0x85, 0x75, 0x84, 0x81, 0x8b, 0x7f, 0x6c, 0x86, 0x8d, 0x7b, 0x79, 0x78,
+    0x89, 0x85, 0x8c, 0x9a, 0xa6, 0x96, 0x7a, 0x78, 0xa2, 0x85, 0x9b, 0x89,
+    0xc8, 0x97, 0xa3, 0x82, 0x8b, 0x7f, 0xe7, 0x8f, 0x8f, 0x74, 0x75, 0x83,
+    0x87, 0x79, 0xb3, 0xab, 0x70, 0x9a, 0x9a, 0xa6, 0x81, 0x7e, 0xb8, 0x91,
+    0x8b, 0x8d, 0x93, 0xa1, 0x79, 0x7d, 0x81, 0xb4, 0x79, 0x94, 0xa5, 0x89,
+    0x8e, 0x7c, 0x9b, 0xe2, 0x50, 0x94, 0xdf, 0xa0, 0x53, 0x5d, 0x90, 0xde,
+    0x67, 0x90, 0xaf, 0x8a, 0x8f, 0x73, 0x7b, 0xcb, 0x64, 0x9f, 0x91, 0x86,
+    0x95, 0x84, 0x83, 0x88, 0x76, 0x8b, 0x8a, 0x8f, 0x9c, 0x9a, 0x92, 0x96,
+    0x7f, 0x8e, 0x79, 0x80, 0x91, 0x6d, 0x86, 0x59, 0x74, 0x8a, 0x53, 0x88,
+    0xae, 0x7b, 0x80, 0x70, 0x87, 0x74, 0x75, 0x91, 0xa4, 0x74, 0x8d, 0x5a,
+    0x83, 0x95, 0x65, 0xa1, 0xb3, 0x74, 0x87, 0x7d, 0xaa, 0x82, 0x79, 0x78,
+    0x9b, 0x7c, 0x78, 0x74, 0x9e, 0x74, 0x92, 0x92, 0xa3, 0x6e, 0x75, 0x92,
+    0x6a, 0x6f, 0xa3, 0x7c, 0x9e, 0x7f, 0x92, 0x6b, 0x96, 0x79, 0x9a, 0x87,
+    0x83, 0x8c, 0x72, 0x79, 0x6a, 0xa3, 0x79, 0x7d, 0x6d, 0x6c, 0x81, 0x96,
+    0x98, 0x7f, 0x94, 0x81, 0x8a, 0x8a, 0xa7, 0x8c, 0x9a, 0x84, 0xa7, 0x89,
+    0x9d, 0x85, 0xa6, 0xa8, 0xd0, 0x92, 0x97, 0x9f, 0x76, 0x86, 0xe6, 0x6f,
+    0x7c, 0x84, 0x98, 0x8d, 0x80, 0x75, 0xc5, 0x86, 0x6b, 0x8d, 0x9e, 0x9e,
+    0x7f, 0x71, 0x97, 0xa1, 0x75, 0x92, 0xa9, 0x9e, 0x91, 0x5e, 0xa2, 0xa2,
+    0x68, 0xad, 0xa5, 0xa0, 0x7e, 0x68, 0xac, 0xdc, 0x50, 0xa2, 0xc1, 0x8a,
+    0x63, 0x74, 0x7e, 0xd9, 0x3f, 0xbb, 0xba, 0x9d, 0x7f, 0x76, 0x5f, 0xb0,
+    0x74, 0x8e, 0xb1, 0x95, 0x9a, 0x81, 0x63, 0x9f, 0x98, 0x74, 0x80, 0x89,
+    0x95, 0x8e, 0x9e, 0x78, 0x87, 0x82, 0x57, 0x87, 0x8d, 0x90, 0x79, 0x80,
+    0x76, 0x7c, 0x7d, 0x8a, 0xa6, 0x82, 0x98, 0x7a, 0x96, 0x97, 0x84, 0x87,
+    0xab, 0x7f, 0x87, 0x57, 0x83, 0x6a, 0x6a, 0x84, 0x9c, 0x8d, 0x74, 0x68,
+    0xa2, 0x92, 0x90, 0x98, 0x98, 0x8b, 0x6d, 0x72, 0x90, 0x8c, 0x7c, 0x7d,
+    0x9b, 0x6e, 0x71, 0x76, 0x6b, 0x7b, 0x63, 0x81, 0xad, 0x71, 0x78, 0x8e,
+    0x74, 0x87, 0x8e, 0x8a, 0xab, 0x8e, 0x83, 0x85, 0x7d, 0xa0, 0x67, 0x7f,
+    0x9c, 0x74, 0x6b, 0x88, 0x66, 0x92, 0x7f, 0x83, 0x94, 0x92, 0xa5, 0x82,
+    0xa1, 0x7b, 0x6f, 0x70, 0xab, 0x72, 0xb5, 0x91, 0xb7, 0x89, 0x91, 0x77,
+    0x77, 0x8a, 0xdb, 0x88, 0x8a, 0x8d, 0x89, 0x6c, 0x7b, 0x83, 0xc8, 0xb5,
+    0x4b, 0x96, 0x8b, 0x92, 0x91, 0x76, 0xa9, 0xae, 0x70, 0xa8, 0x74, 0x9d,
+    0x96, 0x6d, 0xa1, 0xba, 0x86, 0xbc, 0xbc, 0xa2, 0x8d, 0x6c, 0x96, 0xd8,
+    0x71, 0xb1, 0xae, 0xb0, 0x79, 0x7b, 0x71, 0xd8, 0x32, 0xaa, 0xae, 0xa7,
+    0x7c, 0x6b, 0x77, 0xc0, 0x7c, 0x9e, 0x9f, 0x89, 0x92, 0x8a, 0x76, 0xae,
+    0x97, 0x75, 0x87, 0x8c, 0x7f, 0x86, 0x8b, 0x73, 0x6b, 0x64, 0x87, 0x6d,
+    0x99, 0x8f, 0x8d, 0x66, 0x76, 0x87, 0x6d, 0x6e, 0x98, 0x7a, 0x91, 0x92,
+    0x8c, 0x7c, 0x89, 0x9b, 0x9e, 0x83, 0x86, 0x62, 0x90, 0x6e, 0x62, 0x82,
+    0xa3, 0x7e, 0x86, 0x6a, 0x93, 0x9b, 0x73, 0x6c, 0xa8, 0x99, 0x73, 0x99,
+    0x8c, 0x89, 0x85, 0x67, 0x98, 0x78, 0x63, 0x98, 0x77, 0xa6, 0x6e, 0x81,
+    0xa4, 0x64, 0x8f, 0x8a, 0x7f, 0x9b, 0x91, 0x91, 0x94, 0x82, 0x8b, 0x8b,
+    0x76, 0x66, 0x83, 0x81, 0x94, 0x71, 0x82, 0x9e, 0x93, 0x85, 0x80, 0x8c,
+    0xae, 0x94, 0x96, 0x74, 0x91, 0x9a, 0x6f, 0x9e, 0xa9, 0x76, 0xab, 0x8e,
+    0xd6, 0x9c, 0x7d, 0x98, 0x83, 0x6e, 0xfe, 0x83, 0x71, 0x82, 0x9f, 0x93,
+    0x7b, 0x67, 0xcb, 0xb9, 0x66, 0x89, 0x99, 0x8a, 0xac, 0x8c, 0xa0, 0x9c,
+    0x70, 0xaf, 0x81, 0x88, 0x9c, 0x7e, 0xa8, 0xa5, 0x65, 0x8c, 0xa1, 0x8c,
+    0x83, 0x85, 0x9d, 0xcb, 0x4b, 0xc1, 0xb5, 0xa2, 0x75, 0x63, 0x75, 0xbd,
+    0x34, 0xae, 0xca, 0xa2, 0x89, 0x7a, 0x69, 0xb0, 0x70, 0xae, 0x94, 0x76,
+    0x85, 0x93, 0x6a, 0x90, 0x6a, 0x8a, 0xac, 0x71, 0x7e, 0x81, 0xa2, 0x71,
+    0x98, 0x86, 0x99, 0x76, 0x8f, 0x6f, 0x90, 0x93, 0x7c, 0x72, 0x81, 0x8c,
+    0x78, 0x77, 0x97, 0x84, 0x98, 0x70, 0x96, 0x9a, 0x9b, 0x93, 0x92, 0x5f,
+    0xaa, 0x88, 0x5b, 0x74, 0xaa, 0x96, 0x6a, 0x73, 0x87, 0x83, 0x72, 0x89,
+    0xab, 0x8a, 0x5f, 0x71, 0xa4, 0x94, 0x92, 0x60, 0x96, 0x7b, 0x53, 0x88,
+    0x69, 0x8b, 0x5e, 0x7b, 0xa0, 0x83, 0x70, 0x95, 0x6d, 0x9b, 0x6d, 0x98,
+    0x99, 0x86, 0x6e, 0x7a, 0x87, 0x86, 0x68, 0x8a, 0x7e, 0x87, 0x90, 0x7d,
+    0x76, 0x93, 0x80, 0x8a, 0x8f, 0x97, 0xac, 0x71, 0xa2, 0x96, 0x7f, 0x8e,
+    0xc2, 0x71, 0xab, 0xa9, 0xd1, 0x85, 0x8c, 0x74, 0x70, 0x72, 0xff, 0x77,
+    0x6d, 0x77, 0x91, 0x5d, 0x71, 0x5d, 0xb2, 0xb1, 0x38, 0x76, 0xa6, 0x80,
+    0x91, 0x86, 0xa3, 0x9c, 0x85, 0x95, 0x99, 0xab, 0x8a, 0x6e, 0x9f, 0xa6,
+    0x75, 0xa9, 0xb3, 0x97, 0x69, 0x85, 0xa4, 0xc9, 0x59, 0xb4, 0xca, 0x8d,
+    0x5c, 0x67, 0x7d, 0xcd, 0x29, 0xca, 0xdb, 0x8c, 0x86, 0x8c, 0x70, 0xaa,
+    0x5c, 0x9e, 0x98, 0x86, 0x92, 0x7e, 0x6b, 0x8e, 0x8f, 0x6a, 0x84, 0x71,
+    0x9a, 0x76, 0x87, 0x84, 0x8b, 0x7f, 0x7f, 0x6e, 0xa3, 0x83, 0x85, 0x78,
+    0x6f, 0x7c, 0x6f, 0x96, 0x95, 0x8c, 0xa3, 0x72, 0x92, 0x66, 0x7b, 0x99,
+    0x9c, 0x9c, 0x9a, 0x63, 0xaa, 0x81, 0x7f, 0x90, 0x8c, 0xa0, 0x7e, 0x67,
+    0x94, 0x96, 0x7f, 0x8a, 0x95, 0x91, 0x5c, 0x73, 0x88, 0x9b, 0x85, 0x70,
+    0x87, 0x79, 0x56, 0x92, 0x69, 0x95, 0x62, 0x78, 0x93, 0x83, 0x63, 0x98,
+    0x7a, 0xa4, 0x95, 0x7c, 0x8e, 0x69, 0x86, 0x92, 0x7d, 0x6b, 0x69, 0x85,
+    0xa8, 0x90, 0x7c, 0x7b, 0x9e, 0x87, 0x7b, 0x90, 0x98, 0x7a, 0xa4, 0x92,
+    0xad, 0x97, 0xa0, 0x6d, 0xa6, 0x74, 0xb7, 0x7f, 0xb9, 0x94, 0x6c, 0x77,
+    0x65, 0x6f, 0xfc, 0x7d, 0x68, 0x74, 0xa1, 0x6c, 0x71, 0x61, 0xc3, 0xb5,
+    0x60, 0x86, 0x8b, 0x7d, 0x89, 0x8b, 0x93, 0xa4, 0x68, 0xa0, 0x8f, 0x73,
+    0x96, 0x6e, 0x81, 0x99, 0x81, 0x9d, 0xae, 0x93, 0x6a, 0x8b, 0x9a, 0xcb,
+    0x68, 0xaf, 0xca, 0x81, 0x73, 0x6e, 0x70, 0xd7, 0x49, 0xb9, 0xc5, 0x9d,
+    0x87, 0x8d, 0x61, 0xa8, 0x5e, 0xa4, 0xb7, 0xab, 0x96, 0x84, 0x76, 0x98,
+    0x84, 0x99, 0x8f, 0x70, 0x79, 0x94, 0xa5, 0x87, 0x6e, 0x73, 0x63, 0x7e,
+    0x83, 0x8c, 0x88, 0x71, 0x7a, 0x81, 0x7d, 0x94, 0x92, 0x89, 0xab, 0x7a,
+    0x96, 0x66, 0x7b, 0x8b, 0x8f, 0x8e, 0x94, 0x5b, 0xa0, 0x7f, 0x82, 0x84,
+    0x84, 0x80, 0x7d, 0x81, 0x89, 0x7b, 0x97, 0x78, 0x83, 0x93, 0x4c, 0x95,
+    0x7f, 0x93, 0x8e, 0x70, 0x89, 0x81, 0x69, 0x87, 0x76, 0x73, 0x9a, 0x74,
+    0xa2, 0x88, 0x5e, 0xac, 0x74, 0x8e, 0x74, 0x8e, 0x94, 0x85, 0x7b, 0x7a,
+    0x72, 0x82, 0x68, 0x77, 0x96, 0x8a, 0x7b, 0x6c, 0x88, 0x8b, 0x6b, 0x86,
+    0xa4, 0x88, 0xac, 0xa1, 0x90, 0x8e, 0x85, 0x6d, 0xb1, 0x69, 0xb1, 0xa2,
+    0xbe, 0x9a, 0x7c, 0xb4, 0x63, 0x56, 0xf2, 0x90, 0x5e, 0x71, 0xa3, 0x6a,
+    0x8b, 0x67, 0xbe, 0xa8, 0x6e, 0x8b, 0x90, 0x83, 0xa0, 0x78, 0x9f, 0xa5,
+    0x65, 0xa3, 0x8b, 0x94, 0x84, 0x6c, 0xa5, 0x97, 0x7d, 0xa7, 0x9f, 0x9c,
+    0x62, 0x7d, 0xb5, 0xb1, 0x58, 0x98, 0xba, 0x8d, 0x7f, 0x57, 0x86, 0xc5,
+    0x39, 0xb3, 0xc9, 0xa9, 0x89, 0x8e, 0x55, 0xaf, 0x54, 0xb4, 0xb0, 0x8f,
+    0x8b, 0x7c, 0x6e, 0x8e, 0x96, 0x90, 0x8a, 0x83, 0x84, 0x8c, 0x96, 0x7f,
+    0x89, 0x67, 0x99, 0x60, 0x74, 0x8d, 0x9b, 0x82, 0x6f, 0x61, 0x84, 0x9a,
+    0x7c, 0x85, 0x86, 0x7c, 0x9b, 0x5f, 0x81, 0x96, 0x90, 0x9b, 0xa0, 0x58,
+    0xaf, 0x78, 0x81, 0x8f, 0x96, 0x81, 0x77, 0x7d, 0xa2, 0x85, 0x74, 0x84,
+    0x99, 0x8d, 0x5f, 0x77, 0x8a, 0x8c, 0x85, 0x78, 0x8f, 0x80, 0x5c, 0x6f,
+    0x77, 0x73, 0x80, 0x99, 0x83, 0x89, 0x6f, 0x8e, 0x85, 0x7e, 0x6c, 0x81,
+    0x99, 0x89, 0x69, 0x70, 0x8c, 0x8f, 0x6b, 0x89, 0x80, 0x7a, 0x83, 0x7a,
+    0x96, 0x99, 0x73, 0x76, 0x9c, 0x67, 0xab, 0xab, 0xbd, 0x8b, 0x85, 0x90,
+    0xb0, 0x6b, 0xbd, 0x9c, 0xb9, 0xa0, 0x7c, 0x7d, 0x66, 0x78, 0xdb, 0x97,
+    0x55, 0x67, 0x96, 0x69, 0x80, 0x49, 0xc1, 0xbb, 0x6c, 0x91, 0x8a, 0x92,
+    0x9a, 0x98, 0xa5, 0x98, 0x51, 0xa6, 0x99, 0x8e, 0x73, 0x73, 0x9d, 0x9f,
+    0x77, 0xa6, 0xa4, 0x92, 0x64, 0x75, 0xac, 0xb2, 0x5d, 0xa1, 0xab, 0xa4,
+    0x5a, 0x5b, 0xb3, 0xb7, 0x2d, 0xca, 0xc8, 0x76, 0x94, 0x8e, 0x59, 0xb0,
+    0x52, 0x9d, 0xbd, 0x89, 0x97, 0x84, 0x5d, 0x9a, 0x87, 0x9b, 0x94, 0x6c,
+    0x7b, 0xaa, 0x8a, 0x8b, 0x79, 0x5d, 0x90, 0x5c, 0x8b, 0x7b, 0xbe, 0x68,
+    0x84, 0x6f, 0x75, 0x72, 0x98, 0x82, 0x92, 0x7a, 0xa2, 0x6e, 0x7b, 0x7d,
+    0x9c, 0x99, 0x97, 0x5d, 0x9b, 0x69, 0x80, 0xa3, 0x96, 0x8d, 0x7c, 0x82,
+    0xa3, 0x76, 0x95, 0x67, 0x93, 0x8e, 0x62, 0x7b, 0x78, 0x96, 0x69, 0x67,
+    0x84, 0x8f, 0x62, 0x80, 0x88, 0x7e, 0x6c, 0x94, 0xab, 0x8b, 0x82, 0x9e,
+    0x7e, 0x8c, 0x70, 0x83, 0x9c, 0x9c, 0x80, 0x87, 0x8f, 0xa1, 0x7f, 0x81,
+    0x95, 0x83, 0x6d, 0x7a, 0xa0, 0x77, 0x6d, 0x76, 0x91, 0x7e, 0xa3, 0x62,
+    0xa0, 0x93, 0x7e, 0x97, 0xb6, 0x6c, 0xad, 0x72, 0xb2, 0x95, 0x73, 0x83,
+    0x62, 0x56, 0xe2, 0x99, 0x6e, 0x66, 0xb0, 0x6c, 0x75, 0x4e, 0xb2, 0xc7,
+    0x51, 0x98, 0x90, 0x8c, 0x82, 0x63, 0xa8, 0x99, 0x54, 0xc1, 0x87, 0x80,
+    0x79, 0x62, 0xad, 0x81, 0x76, 0x99, 0xa9, 0x9b, 0x4e, 0x8c, 0xaf, 0xb6,
+    0x5d, 0x9b, 0xb4, 0x9f, 0x6d, 0x60, 0xa5, 0xb5, 0x3e, 0xb2, 0xc4, 0x96,
+    0x86, 0x6d, 0x48, 0x99, 0x50, 0xc1, 0xa8, 0x93, 0x8a, 0x92, 0x7d, 0x8f,
+    0x74, 0x87, 0x91, 0x71, 0x8c, 0x87, 0x90, 0x80, 0x80, 0x82, 0x7b, 0x85,
+    0x81, 0x7f, 0xa7, 0x6a, 0x78, 0x4e, 0x90, 0x85, 0x9f, 0x93, 0x91, 0x91,
+    0xa5, 0x6e, 0x9d, 0xa7, 0x9e, 0x7f, 0x9a, 0x66, 0xbe, 0x6f, 0x82, 0x81,
+    0x85, 0x86, 0x89, 0x6c, 0x88, 0x92, 0x6d, 0x6a, 0x8c, 0x95, 0x68, 0x70,
+    0x91, 0x9b, 0x76, 0x59, 0x87, 0x93, 0x6f, 0x79, 0x7a, 0x99, 0x7d, 0x76,
+    0xa3, 0x9c, 0x69, 0x75, 0x8f, 0x8e, 0x7e, 0x7a, 0x80, 0x8b, 0x76, 0x82,
+    0x70, 0x71, 0x77, 0x7a, 0x88, 0xa1, 0x79, 0x75, 0x9e, 0x7e, 0x6d, 0x6f,
+    0xa5, 0x84, 0xb1, 0x77, 0xad, 0x94, 0x98, 0x90, 0xa7, 0x5c, 0xb6, 0x84,
+    0x99, 0x91, 0x71, 0x7b, 0x6d, 0x54, 0xd2, 0x84, 0x5d, 0x75, 0xb4, 0x7e,
+    0x7d, 0x53, 0xc5, 0x98, 0x70, 0xaa, 0x9e, 0x81, 0x7d, 0x68, 0xa7, 0x8d,
+    0x63, 0xab, 0x9b, 0x96, 0x7e, 0x6b, 0xa3, 0x9e, 0x6d, 0x98, 0xaf, 0x9b,
+    0x78, 0x74, 0xae, 0xc7, 0x70, 0x98, 0xd4, 0x9a, 0x6e, 0x75, 0xa2, 0xcd,
+    0x42, 0xb0, 0xc9, 0x89, 0x88, 0x77, 0x6a, 0xa4, 0x66, 0xb5, 0xbc, 0x8a,
+    0x96, 0x87, 0x5e, 0xa5, 0x87, 0x95, 0x91, 0x5d, 0x85, 0x91, 0xaa, 0x8f,
+    0x99, 0x78, 0x79, 0x74, 0x7f, 0x81, 0xa1, 0x74, 0x77, 0x64, 0x6c, 0x94,
+    0xa0, 0x8b, 0x9b, 0x8e, 0xac, 0x6a, 0x98, 0x9c, 0x7a, 0x9f, 0xab, 0x7e,
+    0xa3, 0x8b, 0x68, 0x7f, 0x84, 0x9f, 0x93, 0x77, 0x90, 0x98, 0x8f, 0x87,
+    0x81, 0x8e, 0x76, 0x95, 0x66, 0x78, 0x85, 0x79, 0x95, 0x89, 0x64, 0x8e,
+    0x8a, 0x87, 0x6f, 0x65, 0xa4, 0x98, 0x7a, 0x83, 0x85, 0x7e, 0x6b, 0xaa,
+    0x81, 0x94, 0x7c, 0x6e, 0x78, 0x85, 0x87, 0x6d, 0x7a, 0x92, 0x67, 0x7a,
+    0x8d, 0x95, 0x77, 0x7f, 0x9f, 0x71, 0xb1, 0xa1, 0xb2, 0x91, 0x7f, 0xb0,
+    0xac, 0x5c, 0xaf, 0x6a, 0xae, 0x98, 0x63, 0x7e, 0x67, 0x6f, 0xc4, 0x8a,
+    0x75, 0x61, 0xac, 0x73, 0x86, 0x54, 0xc3, 0xa8, 0x5d, 0xa9, 0xb4, 0x9b,
+    0x80, 0x6d, 0xa1, 0x8d, 0x64, 0xaa, 0x86, 0x96, 0x86, 0x6c, 0x9b, 0x8b,
+    0x73, 0x9f, 0x9a, 0x87, 0x64, 0x6c, 0xad, 0xa6, 0x64, 0x8a, 0xbe, 0x88,
+    0x67, 0x67, 0xaf, 0xb0, 0x71, 0xae, 0xde, 0x95, 0x9f, 0x7c, 0x7d, 0xa1,
+    0x79, 0xb8, 0xaa, 0x9c, 0x84, 0x91, 0x6b, 0xac, 0x74, 0xa1, 0xad, 0x74,
+    0x88, 0x93, 0x94, 0x72, 0x97, 0x7a, 0x78, 0x86, 0x76, 0x93, 0xb1, 0x6f,
+    0x91, 0x44, 0x96, 0x8e, 0x8e, 0xa5, 0x9a, 0x70, 0x99, 0x79, 0x84, 0x82,
+    0x7f, 0x78, 0xac, 0x6f, 0x9c, 0x80, 0x7d, 0x87, 0x7f, 0x9d, 0x6a, 0x71,
+    0x7c, 0x92, 0x78, 0x7a, 0x93, 0x90, 0x55, 0x83, 0x7a, 0x8a, 0x9a, 0x65,
+    0x86, 0x9b, 0x7c, 0x6b, 0xa3, 0x85, 0x86, 0x71, 0xab, 0x9a, 0x86, 0x90,
+    0x86, 0x88, 0x88, 0x88, 0x99, 0x98, 0x77, 0x86, 0x88, 0x90, 0x79, 0x7c,
+    0x6e, 0x9f, 0x76, 0x70, 0x84, 0x67, 0x7e, 0x8b, 0xa5, 0x68, 0xa7, 0x9d,
+    0xb5, 0x9b, 0x8b, 0x8a, 0xc0, 0x60, 0x9e, 0x83, 0xb0, 0xb7, 0x65, 0x7f,
+    0x7a, 0x7e, 0xc3, 0x7b, 0x74, 0x8f, 0xa4, 0x68, 0x5f, 0x47, 0xbb, 0xa4,
+    0x74, 0x95, 0xab, 0x80, 0x70, 0x5c, 0x9a, 0x8a, 0x7d, 0xa5, 0x90, 0x7d,
+    0x86, 0x68, 0xb1, 0x73, 0x6d, 0xad, 0x93, 0x8d, 0x7b, 0x64, 0xbd, 0xae,
+    0x7a, 0x98, 0xcb, 0x97, 0x83, 0x67, 0xab, 0xb0, 0x61, 0xa7, 0xcd, 0x7e,
+    0x87, 0x78, 0x76, 0x95, 0x6a, 0xba, 0xa9, 0x84, 0x8f, 0x95, 0x7c, 0x8b,
+    0x90, 0x89, 0x8b, 0x81, 0x87, 0x8b, 0x76, 0x73, 0x6f, 0x61, 0x94, 0x73,
+    0x83, 0x97, 0xb3, 0x6b, 0x9c, 0x55, 0x7f, 0x96, 0x9a, 0x92, 0x85, 0x52,
+    0xc6, 0x73, 0x88, 0x9c, 0x7c, 0x86, 0x98, 0x6d, 0x99, 0x87, 0x80, 0x7c,
+    0x7d, 0x98, 0x74, 0x7c, 0x89, 0x8a, 0x7d, 0x7b, 0x83, 0x90, 0x7d, 0x81,
+    0x7a, 0xa0, 0x86, 0x5f, 0x74, 0x8e, 0x68, 0x7b, 0x6c, 0x86, 0x90, 0x84,
+    0x7e, 0xae, 0x73, 0x6f, 0x8d, 0x81, 0x7c, 0x93, 0xa0, 0xb3, 0x6b, 0x9a,
+    0x88, 0xab, 0x8a, 0x94, 0x9c, 0x87, 0x9c, 0x75, 0x7d, 0x8f, 0x7c, 0x7f,
+    0x9b, 0x69, 0xa8, 0x99, 0x9d, 0x89, 0x8f, 0x72, 0xba, 0x61, 0xac, 0x91,
+    0xb5, 0xa7, 0x84, 0x99, 0x71, 0x7e, 0xd0, 0x7c, 0x6d, 0x66, 0xb6, 0x72,
+    0x79, 0x61, 0xb6, 0xab, 0x69, 0xa0, 0xaa, 0x7d, 0x74, 0x61, 0x95, 0xa5,
+    0x71, 0xb0, 0x93, 0x95, 0x86, 0x7d, 0x9f, 0x7e, 0x6c, 0x97, 0x85, 0x87,
+    0x72, 0x7b, 0xb4, 0xad, 0x84, 0x7b, 0xcd, 0xa9, 0x7e, 0x6d, 0xc8, 0xc7,
+    0x7e, 0xb7, 0xcf, 0x98, 0x7b, 0x7c, 0x69, 0xaf, 0x64, 0xa6, 0xc1, 0x8e,
+    0x8f, 0x9c, 0x7d, 0x93, 0x7a, 0x96, 0x8a, 0x65, 0x92, 0x95, 0x8d, 0x6f,
+    0x9f, 0x7f, 0x65, 0x69, 0x7a, 0x92, 0x9f, 0x5c, 0x90, 0x4e, 0x69, 0x89,
+    0x8f, 0x9c, 0xa8, 0x7a, 0xb6, 0x7d, 0x84, 0x97, 0x7f, 0x91, 0x8d, 0x71,
+    0xae, 0x86, 0x80, 0x78, 0x81, 0x87, 0x6e, 0x88, 0x87, 0x7f, 0x8f, 0x9d,
+    0x78, 0x91, 0x74, 0x91, 0x7f, 0x7a, 0x80, 0x63, 0x93, 0xa0, 0x7f, 0x6f,
+    0xa3, 0x88, 0x76, 0x5c, 0x6e, 0xa1, 0x6e, 0x7f, 0x84, 0x8b, 0x87, 0x6d,
+    0x87, 0x9f, 0x79, 0x7c, 0x83, 0x89, 0x7e, 0x86, 0xa0, 0x82, 0x80, 0x8e,
+    0x8b, 0x6c, 0x6e, 0x69, 0x9f, 0x79, 0xaa, 0x6e, 0xa2, 0x8f, 0x9d, 0x87,
+    0xb4, 0x5d, 0xba, 0x6c, 0xaf, 0xa0, 0x84, 0x87, 0x8c, 0x89, 0xcb, 0x6f,
+    0x8e, 0x71, 0xae, 0x5d, 0x6c, 0x61, 0xb3, 0xaf, 0x7a, 0x94, 0xb1, 0x8a,
+    0x80, 0x65, 0x8a, 0x9d, 0x61, 0xb6, 0x8b, 0x97, 0x8a, 0x73, 0xa8, 0x82,
+    0x74, 0x8a, 0x9c, 0x73, 0x61, 0x69, 0xb8, 0x9f, 0x76, 0x90, 0xc5, 0xaa,
+    0x6b, 0x5f, 0xb7, 0xce, 0x6d, 0xb7, 0xcc, 0x97, 0x7a, 0x81, 0x95, 0xbe,
+    0x78, 0xb1, 0xb4, 0x97, 0x8e, 0x99, 0x70, 0xa2, 0x72, 0x8d, 0x8e, 0x7d,
+    0x90, 0x9f, 0x7b, 0x63, 0x87, 0x89, 0x7a, 0x5f, 0x81, 0x97, 0x8d, 0x78,
+    0x94, 0x64, 0x95, 0x9d, 0x90, 0x87, 0xb3, 0x6e, 0xc2, 0x80, 0x94, 0x86,
+    0x87, 0x93, 0xb3, 0x57, 0xb8, 0x73, 0x8a, 0x81, 0x6f, 0x95, 0x89, 0x82,
+    0x94, 0x7a, 0x8e, 0x97, 0x8a, 0x91, 0x7f, 0x77, 0x98, 0x72, 0x67, 0x5f,
+    0x7b, 0x8d, 0x78, 0x74, 0x91, 0x82, 0x86, 0x5c, 0x88, 0xa3, 0x73, 0x6f,
+    0x92, 0x78, 0x9c, 0x95, 0x99, 0x9d, 0x70, 0x89, 0x8f, 0xa7, 0x74, 0x89,
+    0x77, 0x90, 0x72, 0x8d, 0x9c, 0x6f, 0x7a, 0x6c, 0x9f, 0x72, 0xad, 0x6c,
+    0xa5, 0x7a, 0x9d, 0x78, 0xa4, 0x52, 0xbd, 0x94, 0xb5, 0x97, 0x75, 0x78,
+    0x86, 0x72, 0xdf, 0x6f, 0x98, 0x81, 0xab, 0x5d, 0x62, 0x65, 0x9d, 0xbc,
+    0x68, 0x8a, 0xc1, 0x7e, 0x67, 0x7f, 0x88, 0x95, 0x7f, 0xbd, 0x9c, 0x77,
+    0x7d, 0x7e, 0x96, 0x7c, 0x7f, 0xa1, 0xa4, 0x90, 0x7c, 0x74, 0xc0, 0xac,
+    0x7d, 0xa1, 0xdb, 0x85, 0x85, 0x51, 0xbc, 0xb1, 0x6c, 0xcb, 0xd1, 0xa7,
+    0x76, 0x70, 0x7d, 0xba, 0x88, 0xb6, 0xaf, 0xa2, 0x9d, 0x9b, 0x71, 0x96,
+    0x80, 0x89, 0xa3, 0x86, 0x89, 0x8f, 0x76, 0x77, 0xa9, 0x82, 0x8f, 0x69,
+    0x7f, 0x9d, 0xac, 0x80, 0x98, 0x6c, 0x70, 0x72, 0x81, 0x8b, 0xaf, 0x80,
+    0xb1, 0x6f, 0x7c, 0x90, 0x91, 0x82, 0xa5, 0x67, 0x9c, 0x76, 0x8c, 0x6b,
+    0x9c, 0x9b, 0x87, 0x8c, 0x8e, 0x8b, 0xb0, 0x9d, 0x89, 0x8f, 0x76, 0x87,
+    0x9b, 0x90, 0x8e, 0x74, 0x73, 0x91, 0x85, 0x80, 0x81, 0x72, 0x99, 0x84,
+    0x87, 0x95, 0x84, 0x8c, 0x8a, 0x6e, 0x8c, 0x82, 0xad, 0x9d, 0x80, 0x7f,
+    0x96, 0x9c, 0x7f, 0x67, 0xb0, 0x98, 0x69, 0x84, 0x94, 0xa9, 0x7e, 0x83,
+    0x9d, 0x62, 0x92, 0x6e, 0x95, 0x88, 0xa4, 0x90, 0x97, 0x4d, 0xae, 0x89,
+    0xb6, 0xa1, 0x88, 0x9f, 0x7a, 0x70, 0xc2, 0x71, 0x7f, 0x83, 0x90, 0x83,
+    0x5e, 0x50, 0xa9, 0x9f, 0x73, 0x8c, 0xb2, 0x80, 0x79, 0x65, 0x7c, 0x90,
+    0x6d, 0x9a, 0x91, 0x8d, 0x6f, 0x65, 0x97, 0x87, 0x82, 0xa0, 0xa4, 0x8c,
+    0x68, 0x76, 0xa8, 0xa2, 0x7f, 0xa4, 0xcd, 0x91, 0x70, 0x54, 0x95, 0xc6,
+    0x6e, 0x9c, 0xe2, 0xa1, 0x86, 0x82, 0x73, 0xbc, 0x89, 0xaa, 0xb2, 0x7d,
+    0x82, 0x84, 0x8b, 0x9e, 0x84, 0x94, 0xa0, 0x7a, 0x98, 0x9d, 0x99, 0x7b,
+    0x7b, 0x89, 0x8f, 0x66, 0x89, 0x9b, 0xa7, 0x8b, 0x9b, 0x62, 0x9b, 0x78,
+    0x8b, 0x95, 0xbd, 0x7a, 0x9e, 0x61, 0x80, 0x84, 0x89, 0x8e, 0xb4, 0x7b,
+    0xb8, 0x70, 0x75, 0x8e, 0x7b, 0x9c, 0x9e, 0x9f, 0x89, 0x86, 0x9b, 0x7a,
+    0x7b, 0x95, 0x83, 0x95, 0x80, 0x94, 0x85, 0x65, 0x8c, 0x81, 0x67, 0x77,
+    0x94, 0x8a, 0x92, 0x74, 0x72, 0x90, 0x6b, 0x74, 0x7e, 0x75, 0x71, 0x84,
+    0x9e, 0xa6, 0x64, 0x80, 0x8d, 0x7a, 0x8c, 0x82, 0x98, 0x96, 0x64, 0x7d,
+    0x8b, 0x82, 0x6a, 0x7f, 0x97, 0x4e, 0x91, 0x74, 0x94, 0x99, 0x6d, 0x6a,
+    0xb3, 0x5a, 0xb8, 0x64, 0xa3, 0x95, 0x5d, 0x95, 0x90, 0x87, 0xcc, 0x72,
+    0x85, 0x85, 0x8f, 0x55, 0x6f, 0x65, 0x84, 0xb6, 0x7b, 0x77, 0xce, 0x79,
+    0x82, 0x59, 0x8a, 0xa2, 0x68, 0x9b, 0xa3, 0x81, 0x9c, 0x7a, 0x97, 0x87,
+    0x6b, 0x8c, 0x9c, 0xaa, 0x5c, 0x69, 0xb8, 0xb7, 0x7c, 0xa0, 0xb5, 0x92,
+    0x8d, 0x67, 0x96, 0xd2, 0x77, 0xa6, 0xd9, 0xad, 0xaa, 0x79, 0x90, 0xc9,
+    0x81, 0xbf, 0xd0, 0x8d, 0x9d, 0x88, 0x9c, 0x91, 0x90, 0x94, 0x89, 0x8a,
+    0x91, 0x9b, 0x89, 0x79, 0x92, 0x80, 0x8f, 0x7b, 0x7e, 0x8b, 0xb1, 0x85,
+    0xa4, 0x5a, 0xb4, 0x7a, 0xa7, 0x8c, 0xa4, 0x75, 0xb9, 0x66, 0x93, 0x86,
+    0x8a, 0x87, 0xad, 0x64, 0xa2, 0x7e, 0x99, 0x9f, 0x81, 0xa2, 0x9b, 0x88,
+    0x9e, 0xa2, 0xb9, 0x8a, 0x78, 0x84, 0x91, 0x8e, 0x8b, 0x90, 0x83, 0x80,
+    0x64, 0x93, 0x77, 0x89, 0x81, 0x86, 0x96, 0x7a, 0x81, 0xab, 0x6d, 0x73,
+    0x7d, 0x7e, 0xaa, 0x85, 0x95, 0xac, 0x8b, 0x89, 0x8b, 0x77, 0xa3, 0x8b,
+    0xa3, 0xa0, 0x87, 0x86, 0x7a, 0x74, 0x6f, 0x7c, 0x90, 0x58, 0xa2, 0x64,
+    0x94, 0x8b, 0xa0, 0x88, 0xab, 0x53, 0xce, 0x67, 0xb7, 0x7f, 0x8d, 0x69,
+    0x84, 0x74, 0xaf, 0x72, 0xab, 0x70, 0x8f, 0x6e, 0x5d, 0x61, 0x96, 0xa1,
+    0x7b, 0x6f, 0xa2, 0x75, 0x8f, 0x5d, 0x93, 0x72, 0x82, 0x97, 0x76, 0x65,
+    0x7e, 0x96, 0xb3, 0x8b, 0x8d, 0x89, 0x8f, 0x7b, 0x6f, 0x71, 0xa1, 0x9e,
+    0x91, 0x7c, 0xc9, 0x9f, 0x7c, 0x71, 0xa1, 0xba, 0x77, 0xa5, 0xd4, 0xa6,
+    0xa0, 0x82, 0x7b, 0x95, 0x9d, 0xb7, 0xaa, 0x8d, 0x71, 0x87, 0x94, 0x7e,
+    0x88, 0x7f, 0x8b, 0x6e, 0x93, 0x9f, 0x82, 0x88, 0x94, 0x8a, 0x97, 0x7f,
+    0x7d, 0x8c, 0xa0, 0x84, 0xb4, 0x7c, 0x8c, 0x7f, 0x71, 0x8c, 0x8e, 0x7f,
+    0xc6, 0x64, 0x81, 0x8d, 0x89, 0x8d, 0xc4, 0x77, 0xaf, 0x75, 0x92, 0x7f,
+    0x84, 0xa1, 0x99, 0x94, 0x9e, 0x82, 0x7a, 0x98, 0x7e, 0x8e, 0x93, 0x8c,
+    0x6b, 0x93, 0x84, 0xaa, 0x7f, 0x8f, 0x6b, 0x94, 0xa3, 0x8a, 0x78, 0x82,
+    0x60, 0x92, 0x8b, 0x8d, 0x75, 0x8c, 0x8e, 0x6e, 0x7e, 0x9d, 0x6d, 0x8e,
+    0x79, 0x8d, 0x80, 0x89, 0xaa, 0x99, 0x7e, 0xa3, 0x83, 0x95, 0x83, 0x85,
+    0x9c, 0x60, 0x99, 0x78, 0x93, 0x8b, 0x80, 0x82, 0x9d, 0x6b, 0xc2, 0x54,
+    0xb9, 0x7a, 0x83, 0x98, 0x88, 0x65, 0xcb, 0x52, 0xa7, 0x8d, 0x7f, 0x81,
+    0x6b, 0x6d, 0x9e, 0x92, 0x85, 0x82, 0x9f, 0x67, 0x6f, 0x74, 0xaa, 0x75,
+    0x99, 0x9f, 0x8a, 0x8b, 0x88, 0x82, 0xb8, 0x6b, 0x85, 0x99, 0x93, 0x90,
+    0x8d, 0x7a, 0xaa, 0x9d, 0x86, 0x7f, 0xbd, 0x91, 0x67, 0x65, 0x8c, 0xb3,
+    0x87, 0x94, 0xa3, 0x9a, 0x7e, 0x73, 0x83, 0xaa, 0x7a, 0xba, 0xaa, 0x9e,
+    0x9e, 0x86, 0x9a, 0x63, 0x9c, 0x98, 0x5e, 0xa0, 0x9c, 0x9e, 0x8b, 0x85,
+    0xa2, 0x74, 0x80, 0x8d, 0x7e, 0x89, 0xc0, 0x75, 0xa5, 0x3f, 0x97, 0xa2,
+    0x8c, 0x8c, 0x9d, 0x88, 0xa4, 0x5e, 0x75, 0x5f, 0x87, 0x82, 0xbc, 0x72,
+    0xa3, 0x77, 0x83, 0x79, 0x82, 0x95, 0x8d, 0x77, 0x73, 0x81, 0x9d, 0x9b,
+    0x6c, 0x87, 0x93, 0x96, 0x83, 0x86, 0x8b, 0x89, 0x72, 0x7d, 0x96, 0x78,
+    0x67, 0xa2, 0x8d, 0x81, 0x6a, 0x98, 0x75, 0x80, 0x8a, 0x80, 0x9e, 0x82,
+    0x76, 0x9b, 0x6c, 0x94, 0x7a, 0x96, 0x74, 0x92, 0x78, 0x91, 0x7a, 0x7c,
+    0x9a, 0x98, 0x70, 0x5d, 0x9c, 0x4b, 0x70, 0x7d, 0xa9, 0x9b, 0x70, 0x96,
+    0xad, 0x59, 0xc4, 0x63, 0xbc, 0x8f, 0x5c, 0x86, 0x8e, 0x97, 0xa0, 0x7c,
+    0xa6, 0x77, 0xaa, 0x93, 0x68, 0x66, 0x93, 0x91, 0x7b, 0x7e, 0xa2, 0x7a,
+    0x98, 0x77, 0x97, 0x59, 0x84, 0x76, 0x9c, 0x7b, 0x8b, 0x76, 0x88, 0x7a,
+    0x8c, 0x7b, 0xa4, 0xae, 0x6e, 0x7d, 0xb3, 0x99, 0x8d, 0x68, 0x9e, 0x7e,
+    0x77, 0x59, 0x80, 0xbe, 0x80, 0x83, 0xd9, 0x9f, 0x7d, 0x60, 0x8b, 0x98,
+    0x7f, 0x9e, 0xa3, 0x8d, 0x7d, 0x81, 0x9e, 0x78, 0x99, 0x94, 0x70, 0x80,
+    0x9b, 0x89, 0x8c, 0x6d, 0x9c, 0x95, 0x76, 0x7c, 0x83, 0x87, 0x97, 0x93,
+    0x89, 0x6d, 0x77, 0x7e, 0x7e, 0x87, 0x8e, 0x7e, 0x94, 0x61, 0x94, 0xa2,
+    0x94, 0x91, 0xa1, 0x64, 0xc1, 0x78, 0x79, 0xaf, 0x67, 0x7a, 0x9b, 0xa1,
+    0x95, 0x8e, 0x97, 0x84, 0x7b, 0x85, 0x80, 0xa1, 0x6f, 0x87, 0x79, 0x83,
+    0x73, 0x9d, 0x81, 0x64, 0x7a, 0x7f, 0x8f, 0x91, 0x73, 0x97, 0x74, 0x8b,
+    0x7e, 0x88, 0x7f, 0x7e, 0x6e, 0xa1, 0x85, 0x8f, 0x77, 0x93, 0x7a, 0x6f,
+    0x7b, 0x91, 0x67, 0x73, 0x8b, 0x97, 0x6d, 0x87, 0x84, 0xf8, 0xff, 0xff,
+    0x88, 0xf8, 0xff, 0xff, 0xe6, 0xf8, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00,
+    0x80, 0x02, 0x00, 0x00, 0x73, 0x84, 0xbb, 0xa4, 0xa5, 0x44, 0x5c, 0xb1,
+    0x8e, 0x50, 0x82, 0x8b, 0x81, 0x86, 0x48, 0x80, 0xa9, 0x61, 0xa3, 0xa8,
+    0xca, 0x5a, 0x9d, 0x8a, 0x89, 0x7c, 0x65, 0x91, 0x5e, 0x70, 0x84, 0x71,
+    0xbc, 0x36, 0x8e, 0x8b, 0xa6, 0x63, 0xb7, 0x75, 0x92, 0x59, 0x60, 0x7e,
+    0x33, 0x8f, 0x90, 0x7a, 0xa9, 0x27, 0x72, 0x80, 0x62, 0x95, 0x93, 0x7b,
+    0x60, 0x46, 0x40, 0x55, 0x01, 0x9e, 0x8a, 0x6b, 0x58, 0x8a, 0xa6, 0xb7,
+    0x91, 0x39, 0x72, 0xb4, 0x6e, 0x67, 0x83, 0x91, 0x82, 0x7b, 0x64, 0x7a,
+    0x87, 0x6e, 0xb0, 0xa0, 0xd3, 0x53, 0xb7, 0x93, 0x76, 0xa6, 0x68, 0x8a,
+    0x74, 0x6a, 0x96, 0x6e, 0xb3, 0x53, 0xaa, 0x89, 0xf1, 0x76, 0xb8, 0x75,
+    0x8b, 0x66, 0x5f, 0x6e, 0x52, 0x92, 0x6f, 0x82, 0xbe, 0x45, 0x8d, 0x69,
+    0x98, 0x98, 0x80, 0x87, 0x73, 0x7d, 0x4d, 0x42, 0x1f, 0xa5, 0x6a, 0x73,
+    0x47, 0x87, 0x8a, 0xd1, 0x75, 0x30, 0x91, 0xae, 0x60, 0x82, 0x7a, 0x94,
+    0x75, 0x71, 0x6a, 0x7c, 0x74, 0x7a, 0xac, 0xa2, 0xb6, 0x51, 0xc6, 0x97,
+    0x63, 0xa0, 0x67, 0x7f, 0x80, 0x69, 0x88, 0x6b, 0xa5, 0x5e, 0xc2, 0x72,
+    0xf4, 0x6e, 0xaf, 0x76, 0x7f, 0x7c, 0x55, 0x68, 0x67, 0x97, 0x61, 0x7b,
+    0xbe, 0x5e, 0xab, 0x58, 0xca, 0xa2, 0x77, 0x7a, 0x8f, 0x6e, 0x54, 0x33,
+    0x4d, 0xa7, 0x5d, 0x66, 0x47, 0x92, 0x6f, 0xd6, 0x5c, 0x25, 0xa9, 0xbc,
+    0x5c, 0xb8, 0x64, 0x9b, 0x58, 0x6e, 0x77, 0x76, 0x6a, 0x94, 0xb2, 0xac,
+    0x9a, 0x51, 0xd0, 0x94, 0x62, 0xcc, 0x5a, 0x7f, 0x74, 0x6e, 0x7d, 0x71,
+    0x9b, 0x69, 0xd3, 0x64, 0xef, 0x76, 0xaa, 0x75, 0x89, 0x84, 0x50, 0x76,
+    0x72, 0x97, 0x5f, 0x77, 0xc5, 0x66, 0xce, 0x3a, 0xe5, 0xad, 0x5a, 0x81,
+    0x9e, 0x8e, 0x60, 0x3d, 0x6d, 0xa9, 0x46, 0x6b, 0x44, 0x89, 0x4d, 0xd8,
+    0x4c, 0x28, 0xb1, 0xb7, 0x60, 0xc7, 0x57, 0xb5, 0x50, 0x68, 0x88, 0x7c,
+    0x60, 0x98, 0xac, 0x9a, 0x7f, 0x51, 0xce, 0x8a, 0x5e, 0xd8, 0x51, 0x7d,
+    0x68, 0x6e, 0x7f, 0x6e, 0x90, 0x7b, 0xdf, 0x60, 0xda, 0x77, 0x91, 0x6f,
+    0x85, 0xa0, 0x58, 0x73, 0x70, 0x93, 0x51, 0x7d, 0xb9, 0x70, 0xf5, 0x31,
+    0xe9, 0xa3, 0x47, 0x76, 0xa7, 0x9b, 0x72, 0x3d, 0x90, 0xb2, 0x57, 0x64,
+    0x5b, 0x6f, 0x2b, 0xcf, 0x52, 0x28, 0xc1, 0xa7, 0x6a, 0x78, 0x51, 0xad,
+    0x49, 0x70, 0x90, 0x81, 0x5c, 0x7e, 0x9e, 0x99, 0x77, 0x50, 0xc0, 0x94,
+    0x63, 0xb7, 0x4d, 0x71, 0x58, 0x66, 0x76, 0x6d, 0x78, 0x6a, 0xe1, 0x40,
+    0xc7, 0x73, 0x7f, 0x65, 0x7c, 0x7f, 0x4d, 0x80, 0x64, 0x95, 0x57, 0x81,
+    0xb1, 0x5e, 0xff, 0x26, 0xd6, 0xa2, 0x3a, 0x73, 0xa7, 0x81, 0x76, 0x5d,
+    0x92, 0xb1, 0x58, 0x48, 0x4e, 0x5e, 0x1a, 0xc8, 0x58, 0x2c, 0xb6, 0xa7,
+    0x67, 0x89, 0x5e, 0xa0, 0x4f, 0x78, 0x93, 0x8b, 0x57, 0x7b, 0x95, 0x78,
+    0x6e, 0x46, 0xb2, 0x98, 0x55, 0xd3, 0x5e, 0x66, 0x56, 0x68, 0x74, 0x7e,
+    0x72, 0x74, 0xdd, 0x36, 0xa6, 0x64, 0x65, 0x6b, 0x81, 0x98, 0x56, 0x76,
+    0x65, 0x93, 0x58, 0x7d, 0x9b, 0x82, 0xef, 0x44, 0xbf, 0xa4, 0x3d, 0x57,
+    0xa0, 0xa7, 0x7a, 0x74, 0x9f, 0xa8, 0x70, 0x52, 0x55, 0x5f, 0x1a, 0x94,
+    0x64, 0x37, 0xa7, 0xa6, 0x80, 0x7d, 0x6e, 0x99, 0x5d, 0x81, 0x8a, 0x99,
+    0x5c, 0x76, 0x8f, 0x44, 0x68, 0x50, 0x94, 0x97, 0x63, 0xb6, 0x73, 0x56,
+    0x5b, 0x70, 0x66, 0x8b, 0x72, 0x78, 0xcc, 0x31, 0x8b, 0x68, 0x4a, 0x74,
+    0x7d, 0x99, 0x54, 0x91, 0x6a, 0x90, 0x5d, 0x80, 0x8c, 0x82, 0xcd, 0x4f,
+    0xb0, 0x96, 0x63, 0x56, 0x97, 0xb3, 0x7e, 0x97, 0xa4, 0x9d, 0x7a, 0x5d,
+    0x49, 0x36, 0x18, 0x64, 0x60, 0x43, 0x89, 0xa2, 0x6a, 0x49, 0x7f, 0x58,
+    0x6a, 0x83, 0x77, 0x9d, 0x70, 0x3b, 0x83, 0x21, 0x59, 0x52, 0x6d, 0x95,
+    0x48, 0xa8, 0x8a, 0x42, 0x50, 0x6d, 0x44, 0x95, 0x69, 0x50, 0xc1, 0x4b,
+    0x7c, 0x59, 0x42, 0x78, 0x77, 0x7f, 0x5b, 0x98, 0x67, 0x89, 0x55, 0x8b,
+    0x82, 0x47, 0xb7, 0x64, 0x9d, 0x83, 0x5c, 0x53, 0x89, 0x90, 0x79, 0xb2,
+    0x90, 0x98, 0x85, 0x5a, 0x4d, 0x2b, 0x19, 0x1e, 0x52, 0x50, 0x57, 0x8b,
+    0x73, 0x3a, 0x88, 0x1e, 0x65, 0x80, 0x4d, 0x9b, 0x6c, 0x3c, 0x86, 0x26,
+    0x5b, 0x56, 0x36, 0x98, 0x49, 0x87, 0x9f, 0x2a, 0x40, 0x61, 0x27, 0x9d,
+    0x63, 0x40, 0xa8, 0x46, 0x6b, 0x52, 0x52, 0x7f, 0x67, 0x6a, 0x58, 0xa1,
+    0x5d, 0x6d, 0x5f, 0x9a, 0x72, 0x3a, 0x99, 0x63, 0x8c, 0x80, 0x68, 0x58,
+    0x72, 0x6a, 0x7c, 0xbb, 0x7e, 0x78, 0x94, 0x60, 0x72, 0xfb, 0xff, 0xff,
+    0x04, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x96, 0xfe, 0xff, 0xff,
+    0x8f, 0x00, 0x00, 0x00, 0x8f, 0xfc, 0xff, 0xff, 0xb4, 0xfe, 0xff, 0xff,
+    0xc1, 0xfd, 0xff, 0xff, 0x59, 0xff, 0xff, 0xff, 0xbc, 0xfe, 0xff, 0xff,
+    0x09, 0xff, 0xff, 0xff, 0x9e, 0xfb, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00,
+    0x10, 0x00, 0x00, 0x00, 0xe9, 0x03, 0x00, 0x00, 0x2b, 0xfd, 0xff, 0xff,
+    0x3b, 0xfd, 0xff, 0xff, 0x91, 0x01, 0x00, 0x00, 0x60, 0xfb, 0xff, 0xff,
+    0x04, 0xfd, 0xff, 0xff, 0x20, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
+    0x08, 0x00, 0x00, 0x00, 0xf0, 0x03, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x05, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
+    0x08, 0x00, 0x00, 0x00, 0x18, 0x03, 0x00, 0x00, 0x78, 0x03, 0x00, 0x00,
+    0x88, 0x01, 0x00, 0x00, 0xf8, 0x01, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
+    0x0c, 0x00, 0x00, 0x00, 0x64, 0x02, 0x00, 0x00, 0xe0, 0x00, 0x00, 0x00,
+    0xb2, 0xfc, 0xff, 0xff, 0x00, 0x00, 0x00, 0x03, 0x24, 0x00, 0x00, 0x00,
+    0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00,
+    0x0e, 0x00, 0x00, 0x00, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x73, 0x5f, 0x73,
+    0x6f, 0x66, 0x74, 0x6d, 0x61, 0x78, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x84, 0xfd, 0xff, 0xff,
+    0x2c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
+    0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x3b,
+    0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7f, 0x3f, 0x01, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x22, 0xfd, 0xff, 0xff, 0x00, 0x00, 0x00, 0x03,
+    0x1c, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
+    0x1c, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x61, 0x64, 0x64, 0x5f,
+    0x31, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x04, 0x00, 0x00, 0x00, 0xec, 0xfd, 0xff, 0xff, 0x30, 0x00, 0x00, 0x00,
+    0x24, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x7d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x28, 0x17, 0xb1, 0x3d,
+    0x01, 0x00, 0x00, 0x00, 0x84, 0xdb, 0x33, 0x41, 0x01, 0x00, 0x00, 0x00,
+    0x9d, 0xf0, 0x2c, 0xc1, 0x8e, 0xfd, 0xff, 0xff, 0x00, 0x00, 0x00, 0x03,
+    0x48, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
+    0x48, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, 0x77, 0x65, 0x69, 0x67,
+    0x68, 0x74, 0x73, 0x5f, 0x71, 0x75, 0x61, 0x6e, 0x74, 0x5f, 0x31, 0x2f,
+    0x46, 0x61, 0x6b, 0x65, 0x51, 0x75, 0x61, 0x6e, 0x74, 0x57, 0x69, 0x74,
+    0x68, 0x4d, 0x69, 0x6e, 0x4d, 0x61, 0x78, 0x56, 0x61, 0x72, 0x73, 0x2f,
+    0x74, 0x72, 0x61, 0x6e, 0x73, 0x70, 0x6f, 0x73, 0x65, 0x00, 0x00, 0x00,
+    0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xa0, 0x0f, 0x00, 0x00,
+    0x84, 0xfe, 0xff, 0xff, 0x2c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00,
+    0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0xac, 0x5f, 0xf6, 0x39, 0x01, 0x00, 0x00, 0x00, 0x1d, 0xaf, 0x62, 0x3d,
+    0x01, 0x00, 0x00, 0x00, 0x5e, 0x1b, 0x83, 0xbd, 0x22, 0xfe, 0xff, 0xff,
+    0x00, 0x00, 0x00, 0x03, 0x1c, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00,
+    0x08, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+    0x52, 0x65, 0x6c, 0x75, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
+    0x08, 0x00, 0x00, 0x00, 0xf4, 0xfe, 0xff, 0xff, 0x30, 0x00, 0x00, 0x00,
+    0x24, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x0f, 0x72, 0x3d,
+    0x01, 0x00, 0x00, 0x00, 0x38, 0x1d, 0x71, 0x41, 0x01, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x96, 0xfe, 0xff, 0xff, 0x00, 0x00, 0x00, 0x03,
+    0x20, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
+    0x28, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x52, 0x65, 0x73, 0x68,
+    0x61, 0x70, 0x65, 0x5f, 0x31, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x6c, 0xff, 0xff, 0xff, 0x30, 0x00, 0x00, 0x00,
+    0x24, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0xc6, 0xd0, 0xd0, 0x3d,
+    0x01, 0x00, 0x00, 0x00, 0xf5, 0xff, 0xcf, 0x41, 0x01, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x0e, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x03,
+    0x3c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
+    0x50, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x77, 0x65, 0x69, 0x67,
+    0x68, 0x74, 0x73, 0x5f, 0x71, 0x75, 0x61, 0x6e, 0x74, 0x2f, 0x46, 0x61,
+    0x6b, 0x65, 0x51, 0x75, 0x61, 0x6e, 0x74, 0x57, 0x69, 0x74, 0x68, 0x4d,
+    0x69, 0x6e, 0x4d, 0x61, 0x78, 0x56, 0x61, 0x72, 0x73, 0x00, 0x00, 0x00,
+    0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00,
+    0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x14, 0x00,
+    0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x0c, 0x00, 0x00, 0x00,
+    0x2c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
+    0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x7f, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0xf7, 0x5e, 0x6c, 0x3a,
+    0x01, 0x00, 0x00, 0x00, 0x30, 0x42, 0xec, 0x3d, 0x01, 0x00, 0x00, 0x00,
+    0x42, 0xca, 0xe8, 0xbd, 0xaa, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x02,
+    0x20, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
+    0x1c, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x43, 0x6f, 0x6e, 0x76,
+    0x32, 0x44, 0x5f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x08, 0x00, 0x00, 0x00, 0x9c, 0xff, 0xff, 0xff, 0x18, 0x00, 0x00, 0x00,
+    0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0xec, 0xcd, 0xc0, 0x38, 0x00, 0x00, 0x0e, 0x00, 0x18, 0x00, 0x08, 0x00,
+    0x07, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x14, 0x00, 0x0e, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x02, 0x20, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x08, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00,
+    0x4d, 0x61, 0x74, 0x4d, 0x75, 0x6c, 0x5f, 0x62, 0x69, 0x61, 0x73, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x0c, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x00, 0x00,
+    0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+    0x25, 0xf5, 0xe8, 0x37, 0x03, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00,
+    0x5c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xc0, 0xff, 0xff, 0xff,
+    0x00, 0x00, 0x00, 0x09, 0x02, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00,
+    0x10, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00,
+    0x08, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x3f,
+    0x14, 0x00, 0x1c, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x07, 0x00,
+    0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x14, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00,
+    0x10, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+    0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00,
+    0x14, 0x00, 0x18, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x07, 0x00,
+    0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x14, 0x00, 0x14, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x02, 0x1c, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
+    0x30, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
+    0x03, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x0c, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00,
+    0x0c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+    0x08, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00,
+    0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xfa, 0xff, 0xff, 0xff,
+    0x00, 0x19, 0x06, 0x00, 0x06, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00,
+    0x00, 0x09, 0x06, 0x00, 0x08, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x04};
+const int g_tiny_conv_micro_features_model_data_len = 18208;
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.h
new file mode 100644
index 0000000000000000000000000000000000000000..22c0a970b774299aea629ce034b9dd2e4c04e1ca
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.h
@@ -0,0 +1,27 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This is a standard TensorFlow Lite model file that has been converted into a
+// C data array, so it can be easily compiled into a binary for devices that
+// don't have a file system. It was created using the command:
+// xxd -i tiny_conv.tflite > tiny_conv_simple_features_model_data.cc
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_TINY_CONV_MICRO_FEATURES_MODEL_DATA_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_TINY_CONV_MICRO_FEATURES_MODEL_DATA_H_
+
+extern const unsigned char g_tiny_conv_micro_features_model_data[];
+extern const int g_tiny_conv_micro_features_model_data_len;
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_TINY_CONV_MICRO_FEATURES_MODEL_DATA_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.cc
new file mode 100644
index 0000000000000000000000000000000000000000..51b7d8b35bcbce892fbd891415e3f09725c62faa
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.cc
@@ -0,0 +1,70 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.h"
+
+#include <string.h>
+
+int WindowProcessSamples(struct WindowState* state, const int16_t* samples,
+                         size_t num_samples, size_t* num_samples_read) {
+  const int size = state->size;
+
+  // Copy samples from the samples buffer over to our local input.
+  size_t max_samples_to_copy = state->size - state->input_used;
+  if (max_samples_to_copy > num_samples) {
+    max_samples_to_copy = num_samples;
+  }
+  memcpy(state->input + state->input_used, samples,
+         max_samples_to_copy * sizeof(*samples));
+  *num_samples_read = max_samples_to_copy;
+  state->input_used += max_samples_to_copy;
+
+  if (state->input_used < state->size) {
+    // We don't have enough samples to compute a window.
+    return 0;
+  }
+
+  // Apply the window to the input.
+  const int16_t* coefficients = state->coefficients;
+  const int16_t* input = state->input;
+  int16_t* output = state->output;
+  int i;
+  int16_t max_abs_output_value = 0;
+  for (i = 0; i < size; ++i) {
+    int16_t new_value = ((static_cast<int32_t>(*input++)) * *coefficients++) >>
+                        kFrontendWindowBits;
+    *output++ = new_value;
+    if (new_value < 0) {
+      new_value = -new_value;
+    }
+    if (new_value > max_abs_output_value) {
+      max_abs_output_value = new_value;
+    }
+  }
+  // Shuffle the input down by the step size, and update how much we have used.
+  memmove(state->input, state->input + state->step,
+          sizeof(*state->input) * (state->size - state->step));
+  state->input_used -= state->step;
+  state->max_abs_output_value = max_abs_output_value;
+
+  // Indicate that the output buffer is valid for the next stage.
+  return 1;
+}
+
+void WindowReset(struct WindowState* state) {
+  memset(state->input, 0, state->size * sizeof(*state->input));
+  memset(state->output, 0, state->size * sizeof(*state->output));
+  state->input_used = 0;
+  state->max_abs_output_value = 0;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.h
new file mode 100644
index 0000000000000000000000000000000000000000..b32c059d81a8efe68c8a87a250fd733e1849479b
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.h
@@ -0,0 +1,43 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_WINDOW_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_WINDOW_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
+
+#define kFrontendWindowBits 12
+
+struct WindowState {
+  size_t size;
+  int16_t coefficients[kMaxAudioSampleSize];
+  size_t step;
+
+  int16_t input[kMaxAudioSampleSize];
+  size_t input_used;
+  int16_t output[kMaxAudioSampleSize];
+  int16_t max_abs_output_value;
+};
+
+// Applies a window to the samples coming in, stepping forward at the given
+// rate.
+int WindowProcessSamples(struct WindowState* state, const int16_t* samples,
+                         size_t num_samples, size_t* num_samples_read);
+
+void WindowReset(struct WindowState* state);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_WINDOW_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..310f84fc60b32e37f7e7d9d79bc2425ce7cddf8a
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_test.cc
@@ -0,0 +1,183 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_util.h"
+
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+namespace {
+
+const int kSampleRate = 1000;
+const int kWindowSamples = 25;
+const int kStepSamples = 10;
+const int16_t kFakeAudioData[] = {
+    0, 32767, 0, -32768, 0, 32767, 0, -32768, 0, 32767, 0, -32768,
+    0, 32767, 0, -32768, 0, 32767, 0, -32768, 0, 32767, 0, -32768,
+    0, 32767, 0, -32768, 0, 32767, 0, -32768, 0, 32767, 0, -32768};
+
+// Test window function behaviors using default config values.
+class WindowTestConfig {
+ public:
+  WindowTestConfig() {
+    config_.size_ms = 25;
+    config_.step_size_ms = 10;
+  }
+
+  struct WindowConfig config_;
+};
+
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(WindowState_CheckCoefficients) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  WindowTestConfig config;
+  struct WindowState state;
+  TF_LITE_MICRO_EXPECT(WindowPopulateState(error_reporter, &config.config_,
+                                           &state, kSampleRate));
+
+  const int16_t expected[] = {16,   144,  391,  743,  1176, 1664, 2177,
+                              2681, 3145, 3541, 3843, 4032, 4096, 4032,
+                              3843, 3541, 3145, 2681, 2177, 1664, 1176,
+                              743,  391,  144,  16};
+  TF_LITE_MICRO_EXPECT_EQ(state.size, sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < state.size; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.coefficients[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(WindowState_CheckResidualInput) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  WindowTestConfig config;
+  struct WindowState state;
+  TF_LITE_MICRO_EXPECT(WindowPopulateState(error_reporter, &config.config_,
+                                           &state, kSampleRate));
+  size_t num_samples_read;
+
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
+      &state, kFakeAudioData,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read));
+
+  int i;
+  for (i = kStepSamples; i < kWindowSamples; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.input[i - kStepSamples], kFakeAudioData[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(WindowState_CheckOutputValues) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  WindowTestConfig config;
+  struct WindowState state;
+  TF_LITE_MICRO_EXPECT(WindowPopulateState(error_reporter, &config.config_,
+                                           &state, kSampleRate));
+  size_t num_samples_read;
+
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
+      &state, kFakeAudioData,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read));
+
+  const int16_t expected[] = {
+      0, 1151,   0, -5944, 0, 13311,  0, -21448, 0, 28327, 0, -32256, 0, 32255,
+      0, -28328, 0, 21447, 0, -13312, 0, 5943,   0, -1152, 0};
+  TF_LITE_MICRO_EXPECT_EQ(state.size, sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < state.size; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.output[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(WindowState_CheckMaxAbsValue) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  WindowTestConfig config;
+  struct WindowState state;
+  TF_LITE_MICRO_EXPECT(WindowPopulateState(error_reporter, &config.config_,
+                                           &state, kSampleRate));
+  size_t num_samples_read;
+
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
+      &state, kFakeAudioData,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read));
+
+  TF_LITE_MICRO_EXPECT_EQ(state.max_abs_output_value, 32256);
+}
+
+TF_LITE_MICRO_TEST(WindowState_CheckConsecutiveWindow) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  WindowTestConfig config;
+  struct WindowState state;
+  TF_LITE_MICRO_EXPECT(WindowPopulateState(error_reporter, &config.config_,
+                                           &state, kSampleRate));
+  size_t num_samples_read;
+
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
+      &state, kFakeAudioData,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read));
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
+      &state, kFakeAudioData + kWindowSamples,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) - kWindowSamples,
+      &num_samples_read));
+
+  const int16_t expected[] = {
+      0, -1152, 0, 5943,   0, -13312, 0, 21447, 0, -28328, 0, 32255, 0, -32256,
+      0, 28327, 0, -21448, 0, 13311,  0, -5944, 0, 1151,   0};
+  TF_LITE_MICRO_EXPECT_EQ(state.size, sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < state.size; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.output[i], expected[i]);
+  }
+}
+
+TF_LITE_MICRO_TEST(WindowState_CheckNotEnoughSamples) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  WindowTestConfig config;
+  struct WindowState state;
+  TF_LITE_MICRO_EXPECT(WindowPopulateState(error_reporter, &config.config_,
+                                           &state, kSampleRate));
+  size_t num_samples_read;
+
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
+      &state, kFakeAudioData,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read));
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
+      &state, kFakeAudioData + kWindowSamples,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) - kWindowSamples,
+      &num_samples_read));
+  TF_LITE_MICRO_EXPECT_EQ(
+      false, WindowProcessSamples(
+                 &state, kFakeAudioData + kWindowSamples + kStepSamples,
+                 sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) -
+                     kWindowSamples - kStepSamples,
+                 &num_samples_read));
+
+  TF_LITE_MICRO_EXPECT_EQ(
+      state.input_used,
+      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) - 2 * kStepSamples);
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_util.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..618973b39b2ebd2088b4c3756ea6ca1c1f7e8181
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_util.cc
@@ -0,0 +1,57 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_util.h"
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/static_alloc.h"
+
+// Needed because some platforms don't have M_PI defined.
+#define WINDOW_PI (3.14159265358979323846f)
+
+void WindowFillConfigWithDefaults(struct WindowConfig* config) {
+  config->size_ms = 25;
+  config->step_size_ms = 10;
+}
+
+int WindowPopulateState(tflite::ErrorReporter* error_reporter,
+                        const struct WindowConfig* config,
+                        struct WindowState* state, int sample_rate) {
+  state->size = config->size_ms * sample_rate / 1000;
+  state->step = config->step_size_ms * sample_rate / 1000;
+
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(state->coefficients,
+                                 (state->size * sizeof(*state->coefficients)));
+
+  // Populate the window values.
+  const float arg = WINDOW_PI * 2.0 / (static_cast<float>(state->size));
+  int i;
+  for (i = 0; i < state->size; ++i) {
+    float float_value = 0.5 - (0.5 * cos(arg * (i + 0.5)));
+    // Scale it to fixed point and round it.
+    state->coefficients[i] =
+        floor(float_value * (1 << kFrontendWindowBits) + 0.5);
+  }
+
+  state->input_used = 0;
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(state->input,
+                                 (state->size * sizeof(*state->input)));
+
+  STATIC_ALLOC_ENSURE_ARRAY_SIZE(state->output,
+                                 (state->size * sizeof(*state->output)));
+  return 1;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_util.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..d0c61c29dc9cd2a91f37ea89ace5e031235dd337
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window_util.h
@@ -0,0 +1,40 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_WINDOW_UTIL_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_WINDOW_UTIL_H_
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/window.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+
+struct WindowConfig {
+  // length of window frame in milliseconds
+  size_t size_ms;
+  // length of step for next frame in milliseconds
+  size_t step_size_ms;
+};
+
+// Populates the WindowConfig with "sane" default values.
+void WindowFillConfigWithDefaults(struct WindowConfig* config);
+
+// Allocates any buffers.
+int WindowPopulateState(tflite::ErrorReporter* error_reporter,
+                        const struct WindowConfig* config,
+                        struct WindowState* state, int sample_rate);
+
+// Frees any allocated buffers.
+void WindowFreeStateContents(struct WindowState* state);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_WINDOW_UTIL_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_feature_data_slice.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_feature_data_slice.cc
new file mode 100644
index 0000000000000000000000000000000000000000..48535d12d5db850cf0a497645f9e77d98fbcb8a1
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_feature_data_slice.cc
@@ -0,0 +1,24 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See the header for documentation on the meaning of this data.
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_feature_data_slice.h"
+
+const uint8_t g_yes_feature_data_slice[g_yes_feature_data_slice_size] = {
+    214, 215, 236, 202, 235, 203, 225, 191, 203, 188, 199, 194, 212, 127,
+    51,  0,   174, 188, 219, 196, 228, 221, 240, 207, 235, 220, 241, 219,
+    237, 207, 212, 142, 95,  0,   139, 78,  162, 177, 197, 183,
+};
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_feature_data_slice.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_feature_data_slice.h
new file mode 100644
index 0000000000000000000000000000000000000000..e73a13153b65be78a2a57edce0d09f48a8cb444f
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_feature_data_slice.h
@@ -0,0 +1,29 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This data was extracted from the larger feature data held in
+// no_micro_features_data.cc and consists of the 26th spectrogram slice of 40
+// values. This is the expected result of running the sample data in
+// yes_30ms_sample_data.cc through through the preprocessing pipeline.
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_YES_FEATURE_DATA_SLICE_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_YES_FEATURE_DATA_SLICE_H_
+
+#include <cstdint>
+
+constexpr int g_yes_feature_data_slice_size = 40;
+extern const uint8_t g_yes_feature_data_slice[];
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_YES_FEATURE_DATA_SLICE_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2c2ee0995c00ee0da1337c86cf9aa18ba726bfe1
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.cc
@@ -0,0 +1,165 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.h"
+
+/* File automatically created by
+ * tensorflow/examples/speech_commands/wav_to_features.py \
+ * --sample_rate=16000 \
+ * --clip_duration_ms=1000 \
+ * --window_size_ms=30 \
+ * --window_stride_ms=20 \
+ * --feature_bin_count=40 \
+ * --quantize=1 \
+ * --preprocess="micro" \
+ * --input_wav="speech_commands_test_set_v0.02/yes/f2e59fea_nohash_1.wav" \
+ * --output_c_file="yes_micro_features_data.cc" \
+ */
+
+const int g_yes_micro_f2e59fea_nohash_1_width = 40;
+const int g_yes_micro_f2e59fea_nohash_1_height = 49;
+const unsigned char g_yes_micro_f2e59fea_nohash_1_data[] = {
+    244, 226, 245, 223, 234, 213, 228, 208, 194, 110, 95,  116, 102, 0,   137,
+    161, 183, 173, 137, 116, 133, 157, 151, 156, 128, 110, 128, 0,   68,  78,
+    78,  90,  68,  68,  78,  102, 95,  78,  95,  78,  210, 188, 209, 183, 204,
+    188, 201, 191, 166, 119, 90,  107, 110, 107, 175, 157, 179, 168, 182, 145,
+    152, 164, 171, 165, 136, 143, 122, 68,  0,   78,  90,  90,  110, 90,  102,
+    99,  90,  68,  78,  68,  223, 186, 179, 123, 182, 110, 196, 171, 159, 110,
+    102, 95,  90,  99,  160, 134, 125, 136, 153, 152, 164, 134, 164, 151, 141,
+    136, 99,  90,  90,  90,  78,  78,  102, 119, 102, 90,  110, 90,  68,  51,
+    177, 175, 211, 172, 183, 0,   95,  68,  129, 102, 68,  85,  114, 105, 110,
+    85,  102, 95,  140, 51,  85,  51,  95,  90,  143, 116, 90,  78,  78,  51,
+    107, 85,  68,  0,   68,  51,  90,  51,  68,  0,   164, 117, 193, 120, 156,
+    0,   138, 51,  90,  0,   51,  0,   51,  85,  0,   0,   51,  0,   0,   0,
+    0,   0,   114, 0,   85,  78,  90,  51,  0,   0,   51,  85,  99,  85,  107,
+    68,  90,  85,  78,  0,   51,  0,   110, 0,   68,  0,   0,   0,   51,  0,
+    51,  0,   0,   0,   68,  90,  107, 0,   68,  0,   0,   0,   68,  0,   51,
+    68,  0,   78,  68,  0,   51,  0,   78,  68,  90,  68,  78,  51,  51,  0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   90,  0,   0,   0,   0,
+    0,   51,  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   51,  68,
+    0,   0,   78,  0,   78,  0,   78,  0,   51,  0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   51,  0,   51,  0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   51,  0,   51,
+    0,   51,  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   51,
+    0,   0,   0,   0,   51,  78,  0,   0,   51,  51,  0,   0,   0,   78,  0,
+    213, 170, 192, 180, 196, 188, 173, 131, 173, 116, 137, 105, 159, 127, 0,
+    0,   0,   0,   127, 164, 165, 161, 170, 164, 185, 197, 195, 167, 134, 138,
+    159, 134, 136, 105, 51,  0,   99,  0,   51,  0,   228, 215, 229, 218, 237,
+    215, 228, 210, 237, 222, 239, 211, 208, 211, 234, 218, 220, 209, 225, 219,
+    235, 222, 245, 225, 245, 224, 243, 223, 241, 218, 237, 224, 234, 213, 221,
+    193, 197, 164, 157, 128, 227, 188, 232, 196, 220, 220, 240, 219, 234, 213,
+    234, 211, 231, 218, 233, 213, 239, 215, 228, 207, 229, 206, 224, 208, 226,
+    207, 232, 210, 225, 208, 230, 199, 227, 206, 210, 205, 218, 174, 178, 141,
+    235, 208, 220, 206, 225, 203, 233, 203, 225, 167, 205, 199, 208, 190, 221,
+    204, 223, 207, 225, 188, 225, 197, 215, 188, 199, 183, 225, 195, 224, 200,
+    216, 178, 208, 188, 215, 202, 214, 183, 176, 140, 198, 150, 211, 194, 203,
+    120, 175, 188, 204, 189, 219, 192, 223, 202, 216, 186, 203, 185, 210, 182,
+    214, 183, 204, 170, 204, 125, 184, 187, 206, 185, 198, 182, 210, 161, 202,
+    198, 218, 173, 145, 120, 188, 183, 205, 168, 200, 170, 210, 177, 187, 190,
+    209, 193, 193, 166, 210, 162, 175, 119, 174, 147, 182, 161, 181, 134, 176,
+    143, 187, 165, 186, 149, 185, 141, 192, 181, 202, 123, 170, 143, 144, 78,
+    149, 0,   208, 182, 170, 78,  170, 0,   117, 51,  156, 99,  195, 170, 200,
+    130, 152, 68,  175, 141, 173, 134, 194, 132, 189, 164, 198, 134, 173, 117,
+    171, 149, 183, 181, 185, 99,  153, 117, 125, 0,   166, 0,   173, 117, 144,
+    0,   117, 102, 188, 120, 193, 166, 197, 68,  163, 119, 169, 99,  134, 0,
+    162, 0,   164, 68,  171, 116, 126, 0,   120, 68,  68,  0,   105, 0,   159,
+    95,  150, 51,  90,  85,  0,   0,   131, 0,   105, 0,   145, 51,  170, 51,
+    120, 0,   107, 0,   145, 85,  160, 0,   85,  0,   0,   51,  149, 0,   78,
+    0,   0,   0,   0,   0,   0,   0,   90,  0,   112, 0,   78,  102, 122, 0,
+    0,   0,   0,   0,   105, 0,   0,   0,   0,   0,   0,   0,   0,   0,   112,
+    0,   164, 120, 143, 0,   0,   0,   0,   0,   51,  0,   90,  0,   78,  0,
+    0,   0,   0,   0,   110, 0,   139, 0,   112, 51,  0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   102, 0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   107,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   78,  0,   51,  0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   51,  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   127, 110, 133, 0,   167, 0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   132, 0,   190,
+    194, 202, 0,   197, 187, 161, 0,   0,   0,   0,   0,   0,   0,   0,   0,
+    214, 213, 223, 203, 218, 189, 200, 122, 78,  0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   191, 210, 231, 197, 226, 217, 238, 216, 236, 207,
+    199, 0,   0,   0,   0,   0,   107, 122, 155, 160, 214, 215, 236, 202, 235,
+    203, 225, 191, 203, 188, 199, 194, 212, 127, 51,  0,   174, 188, 219, 196,
+    228, 221, 240, 207, 235, 220, 241, 219, 237, 207, 212, 142, 95,  0,   139,
+    78,  162, 177, 197, 183, 211, 199, 235, 208, 238, 215, 227, 207, 211, 201,
+    224, 213, 226, 192, 213, 170, 223, 205, 234, 221, 245, 225, 242, 220, 245,
+    221, 239, 221, 238, 213, 226, 180, 159, 112, 176, 159, 208, 202, 213, 191,
+    205, 191, 225, 197, 238, 219, 224, 201, 227, 200, 221, 201, 225, 203, 212,
+    195, 229, 210, 228, 210, 239, 216, 226, 212, 233, 205, 225, 200, 229, 207,
+    222, 151, 147, 119, 179, 185, 230, 218, 223, 192, 202, 136, 205, 177, 223,
+    204, 228, 215, 232, 209, 221, 189, 221, 205, 209, 200, 226, 209, 229, 205,
+    235, 192, 209, 198, 228, 190, 206, 185, 207, 187, 214, 175, 177, 184, 220,
+    195, 214, 207, 230, 184, 205, 159, 208, 184, 189, 169, 224, 213, 219, 199,
+    229, 203, 216, 205, 222, 204, 224, 206, 231, 208, 231, 176, 197, 184, 216,
+    193, 211, 139, 212, 195, 231, 164, 166, 195, 217, 182, 208, 190, 217, 179,
+    205, 68,  182, 119, 195, 168, 182, 136, 204, 179, 193, 158, 182, 140, 188,
+    154, 197, 169, 190, 99,  184, 0,   125, 0,   131, 0,   99,  68,  179, 85,
+    190, 184, 213, 203, 223, 202, 212, 190, 209, 138, 178, 0,   159, 51,  128,
+    51,  105, 0,   139, 51,  179, 125, 185, 114, 171, 128, 175, 132, 181, 174,
+    155, 0,   0,   0,   90,  0,   125, 0,   176, 188, 227, 217, 244, 215, 234,
+    221, 239, 192, 224, 210, 0,   0,   134, 0,   51,  0,   105, 0,   105, 0,
+    143, 90,  192, 119, 175, 147, 141, 51,  184, 110, 85,  0,   0,   0,   0,
+    0,   0,   0,   151, 139, 201, 203, 232, 203, 226, 208, 236, 206, 230, 212,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   169, 0,   119,
+    0,   78,  0,   0,   0,   0,   0,   0,   0,   0,   0,   68,  0,   0,   133,
+    200, 180, 220, 197, 228, 201, 221, 184, 213, 193, 110, 0,   0,   0,   0,
+    0,   0,   0,   0,   0,   78,  0,   164, 0,   0,   0,   0,   0,   107, 0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   150, 164, 202, 182, 224,
+    197, 211, 179, 212, 193, 134, 0,   0,   0,   0,   0,   0,   0,   0,   0,
+    85,  0,   150, 0,   85,  0,   95,  0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   102, 90,  193, 160, 203, 164, 200, 178, 205, 174,
+    116, 0,   0,   0,   0,   0,   0,   0,   0,   0,   120, 114, 123, 0,   114,
+    0,   145, 68,  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    102, 68,  199, 170, 195, 180, 208, 176, 200, 164, 0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   110, 0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   142, 102, 172, 110, 186,
+    167, 185, 147, 189, 154, 0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   177, 0,   158, 136, 197, 155, 189, 166,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    85,  0,   155, 90,  175, 117, 175, 138, 202, 165, 0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   51,  0,   139,
+    0,   120, 68,  51,  123, 0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   119, 0,   78,  0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+};
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.h b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.h
new file mode 100644
index 0000000000000000000000000000000000000000..d19bf8f067d7329dcda0b866d0d323b92f175e61
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.h
@@ -0,0 +1,23 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_YES_MICRO_FEATURES_DATA_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_YES_MICRO_FEATURES_DATA_H_
+
+extern const int g_yes_micro_f2e59fea_nohash_1_width;
+extern const int g_yes_micro_f2e59fea_nohash_1_height;
+extern const unsigned char g_yes_micro_f2e59fea_nohash_1_data[];
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_YES_MICRO_FEATURES_DATA_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_speech_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_speech_test.cc
index 4e54ff670eb9badd648aee99cf154c0d3b988bff..6f0c2581771e87e69481726adaea4fab3108640b 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/micro_speech_test.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/micro_speech_test.cc
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.h"
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h"
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/no_micro_features_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/tiny_conv_micro_features_model_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/yes_micro_features_data.h"
 #include "tensorflow/lite/experimental/micro/kernels/all_ops_resolver.h"
 #include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
 #include "tensorflow/lite/experimental/micro/micro_interpreter.h"
@@ -32,7 +32,8 @@ TF_LITE_MICRO_TEST(TestInvoke) {
 
   // Map the model into a usable data structure. This doesn't involve any
   // copying or parsing, it's a very lightweight operation.
-  const tflite::Model* model = ::tflite::GetModel(g_tiny_conv_model_data);
+  const tflite::Model* model =
+      ::tflite::GetModel(g_tiny_conv_micro_features_model_data);
   if (model->version() != TFLITE_SCHEMA_VERSION) {
     error_reporter->Report(
         "Model provided is schema version %d not equal "
@@ -61,12 +62,12 @@ TF_LITE_MICRO_TEST(TestInvoke) {
   TF_LITE_MICRO_EXPECT_EQ(4, input->dims->size);
   TF_LITE_MICRO_EXPECT_EQ(1, input->dims->data[0]);
   TF_LITE_MICRO_EXPECT_EQ(49, input->dims->data[1]);
-  TF_LITE_MICRO_EXPECT_EQ(43, input->dims->data[2]);
+  TF_LITE_MICRO_EXPECT_EQ(40, input->dims->data[2]);
   TF_LITE_MICRO_EXPECT_EQ(kTfLiteUInt8, input->type);
 
   // Copy a spectrogram created from a .wav audio file of someone saying "Yes",
   // into the memory area used for the input.
-  const uint8_t* yes_features_data = g_yes_f2e59fea_nohash_1_data;
+  const uint8_t* yes_features_data = g_yes_micro_f2e59fea_nohash_1_data;
   for (int i = 0; i < input->bytes; ++i) {
     input->data.uint8[i] = yes_features_data[i];
   }
@@ -102,7 +103,7 @@ TF_LITE_MICRO_TEST(TestInvoke) {
   TF_LITE_MICRO_EXPECT_GT(yes_score, no_score);
 
   // Now test with a different input, from a recording of "No".
-  const uint8_t* no_features_data = g_no_f9643d42_nohash_4_data;
+  const uint8_t* no_features_data = g_no_micro_f9643d42_nohash_4_data;
   for (int i = 0; i < input->bytes; ++i) {
     input->data.uint8[i] = no_features_data[i];
   }
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.cc
new file mode 100644
index 0000000000000000000000000000000000000000..85113a90dcf610a38f21e17f0b303befd6c1e071
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.cc
@@ -0,0 +1,1477 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See the header for documentation on the meaning of this data.
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.h"
+
+const int g_no_1000ms_sample_data_size = 16000;
+const int16_t g_no_1000ms_sample_data[16000] = {
+    5,     1,     -10,   -16,   -14,   -10,   -4,    -5,    -10,   -15,   -13,
+    -17,   -22,   -21,   -23,   -25,   -22,   -26,   -28,   -31,   -28,   -25,
+    -20,   -24,   -21,   -13,   -7,    -1,    -1,    3,     3,     4,     -4,
+    -6,    -8,    -10,   -13,   -4,    -2,    5,     8,     11,    26,    28,
+    34,    32,    34,    30,    21,    18,    15,    13,    8,     5,     14,
+    13,    7,     8,     4,     -5,    -7,    -4,    -9,    -13,   -17,   -21,
+    -16,   -14,   -12,   -12,   -14,   -11,   -9,    -2,    5,     -1,    2,
+    0,     2,     1,     -3,    -13,   -14,   -16,   -11,   -10,   -9,    -13,
+    -17,   -19,   -25,   -21,   -21,   -20,   -13,   -5,    -3,    0,     3,
+    6,     5,     1,     0,     -1,    -7,    -10,   -11,   -9,    -6,    -7,
+    -11,   -10,   -5,    -14,   -20,   -23,   -22,   -22,   -19,   -15,   -12,
+    -6,    -5,    3,     13,    16,    17,    25,    26,    28,    34,    34,
+    33,    34,    30,    21,    22,    18,    13,    20,    22,    24,    27,
+    26,    23,    21,    18,    9,     5,     -2,    -7,    -8,    -10,   -8,
+    -8,    -4,    2,     2,     -1,    -7,    -10,   -8,    -12,   -13,   -15,
+    -9,    -5,    -4,    -3,    -6,    -11,   -11,   -18,   -16,   -13,   -10,
+    -12,   -6,    0,     -2,    0,     -3,    -4,    -8,    -12,   -19,   -16,
+    -17,   -19,   -23,   -30,   -33,   -36,   -38,   -39,   -40,   -36,   -37,
+    -32,   -27,   -25,   -31,   -38,   -41,   -47,   -52,   -50,   -42,   -32,
+    -16,   -7,    -3,    0,     -1,    -1,    -5,    -16,   -23,   -29,   -34,
+    -33,   -27,   -17,   -11,   1,     4,     10,    18,    21,    24,    24,
+    25,    30,    34,    30,    29,    26,    23,    20,    15,    14,    13,
+    14,    16,    23,    28,    21,    23,    21,    13,    12,    12,    14,
+    17,    21,    26,    27,    30,    30,    26,    20,    15,    15,    9,
+    8,     9,     10,    7,     8,     7,     1,     -2,    -6,    -10,   -10,
+    -12,   -15,   -10,   -7,    -6,    -5,    0,     -3,    -3,    -12,   -25,
+    -35,   -49,   -53,   -49,   -51,   -48,   -46,   -48,   -39,   -33,   -31,
+    -37,   -42,   -47,   -49,   -46,   -47,   -47,   -46,   -42,   -39,   -33,
+    -26,   -23,   -14,   -8,    -9,    -7,    -10,   -11,   -13,   -13,   -19,
+    -20,   -16,   -11,   -9,    7,     16,    21,    29,    27,    29,    28,
+    21,    14,    13,    17,    19,    20,    18,    13,    17,    16,    18,
+    20,    17,    13,    16,    23,    26,    26,    25,    27,    31,    30,
+    31,    34,    32,    35,    32,    36,    31,    26,    23,    27,    27,
+    29,    27,    26,    32,    31,    28,    26,    23,    14,    6,     0,
+    -4,    -7,    -9,    -10,   -8,    -3,    4,     12,    11,    15,    11,
+    8,     2,     -3,    -3,    -4,    -6,    -11,   -14,   -20,   -28,   -32,
+    -38,   -46,   -42,   -44,   -40,   -34,   -26,   -29,   -25,   -23,   -24,
+    -17,   -21,   -26,   -23,   -25,   -19,   -10,   -11,   -10,   -10,   -12,
+    -9,    -3,    0,     -3,    -7,    -10,   -13,   -10,   -14,   -13,   -17,
+    -22,   -22,   -30,   -28,   -29,   -26,   -18,   -6,    -1,    -3,    -4,
+    -6,    -10,   -13,   -10,   -14,   -16,   -11,   -15,   -9,    -3,    -6,
+    -1,    2,     3,     4,     6,     6,     3,     4,     12,    14,    17,
+    21,    19,    20,    16,    17,    15,    21,    21,    22,    20,    17,
+    16,    16,    20,    17,    15,    9,     5,     11,    18,    24,    28,
+    26,    23,    23,    26,    22,    18,    21,    23,    26,    27,    25,
+    27,    29,    26,    20,    10,    7,     11,    8,     16,    25,    33,
+    37,    38,    39,    35,    30,    20,    13,    9,     6,     5,     13,
+    13,    14,    15,    12,    8,     3,     3,     3,     2,     9,     11,
+    10,    5,     5,     0,     -7,    -11,   -12,   -15,   -17,   -12,   -13,
+    -18,   -19,   -21,   -24,   -22,   -27,   -34,   -36,   -36,   -32,   -20,
+    -16,   -15,   -5,    -5,    -9,    -10,   -9,    -17,   -19,   -20,   -14,
+    -13,   -10,   -4,    -7,    -7,    -14,   -19,   -28,   -31,   -30,   -31,
+    -23,   -19,   -20,   -12,   -11,   -14,   -16,   -20,   -18,   -20,   -21,
+    -24,   -29,   -30,   -30,   -34,   -31,   -25,   -21,   -18,   -11,   -4,
+    2,     2,     3,     3,     2,     4,     -1,    -4,    -8,    -3,    -1,
+    7,     15,    18,    22,    20,    20,    16,    16,    14,    13,    21,
+    25,    26,    35,    28,    28,    28,    25,    21,    19,    18,    21,
+    24,    20,    25,    28,    19,    16,    15,    8,     3,     -1,    3,
+    5,     13,    18,    25,    31,    33,    39,    36,    36,    32,    36,
+    37,    39,    42,    36,    32,    27,    30,    24,    18,    15,    10,
+    7,     5,     6,     -1,    -4,    -10,   -17,   -15,   -19,   -15,   -7,
+    -4,    3,     0,     3,     4,     -2,    -7,    -13,   -21,   -23,   -28,
+    -27,   -26,   -25,   -15,   -10,   -4,    -6,    -5,    -9,    -5,    -3,
+    1,     2,     -1,    1,     -4,    -7,    -8,    -17,   -17,   -15,   -14,
+    -9,    -5,    -7,    -6,    -9,    -16,   -15,   -15,   -16,   -16,   -11,
+    -15,   -15,   -6,    -6,    -5,    -2,    0,     -9,    -10,   -12,   -13,
+    -10,   -4,    0,     8,     5,     4,     2,     0,     -5,    -8,    -16,
+    -15,   -12,   -3,    9,     17,    24,    26,    30,    28,    22,    17,
+    14,    9,     8,     9,     8,     11,    12,    12,    15,    14,    18,
+    20,    17,    19,    22,    21,    12,    5,     0,     3,     -3,    -4,
+    -6,    -7,    1,     8,     8,     8,     10,    2,     -3,    -8,    -15,
+    -20,   -24,   -22,   -23,   -13,   -6,    -7,    -5,    -10,   -8,    -15,
+    -19,   -22,   -20,   -17,   -18,   -13,   -10,   -1,    6,     5,     3,
+    1,     -5,    -11,   -10,   -14,   -19,   -15,   -13,   -8,    -2,    -3,
+    -4,    -3,    -4,    -1,    1,     0,     -3,    -4,    -8,    -18,   -21,
+    -25,   -24,   -16,   -9,    -2,    1,     5,     1,     3,     -2,    -7,
+    -10,   -23,   -30,   -29,   -23,   -9,    -3,    4,     11,    11,    6,
+    2,     0,     -12,   -20,   -28,   -24,   -22,   -17,   -22,   -19,   -14,
+    -21,   -17,   -17,   -12,   -8,    -3,    2,     0,     -6,    -5,    -8,
+    -12,   -17,   -27,   -34,   -31,   -30,   -27,   -19,   -14,   -14,   -14,
+    -14,   -19,   -22,   -21,   -19,   -14,   -1,    5,     9,     8,     6,
+    5,     -4,    -2,    -3,    -3,    -1,    -2,    -3,    2,     7,     8,
+    7,     6,     6,     3,     2,     1,     -2,    0,     6,     11,    18,
+    18,    19,    17,    14,    9,     4,     3,     3,     0,     -1,    3,
+    -1,    -5,    0,     -2,    0,     1,     7,     7,     8,     20,    29,
+    33,    31,    24,    14,    5,     -6,    -11,   -8,    -11,   -2,    6,
+    10,    12,    16,    26,    26,    24,    18,    12,    10,    4,     7,
+    6,     -2,    -12,   -17,   -17,   -20,   -23,   -23,   -18,   -8,    1,
+    3,     5,     6,     3,     0,     -6,    -12,   -12,   -15,   -12,   -7,
+    3,     3,     8,     7,     7,     7,     1,     -1,    -1,    4,     11,
+    17,    25,    32,    35,    42,    50,    52,    56,    50,    55,    53,
+    52,    47,    40,    38,    30,    26,    27,    28,    29,    25,    23,
+    23,    28,    30,    25,    26,    21,    19,    14,    9,     16,    22,
+    25,    33,    39,    45,    49,    48,    55,    51,    43,    35,    20,
+    14,    13,    23,    25,    24,    20,    22,    28,    22,    22,    17,
+    16,    13,    10,    10,    10,    9,     9,     14,    11,    10,    10,
+    4,     0,     0,     -2,    -3,    -5,    -7,    -3,    1,     -8,    -8,
+    -9,    -4,    4,     9,     11,    14,    11,    6,     8,     3,     -6,
+    -10,   -19,   -22,   -24,   -27,   -22,   -16,   -21,   -25,   -33,   -33,
+    -32,   -30,   -21,   -13,   -6,    -5,    2,     1,     4,     9,     7,
+    5,     1,     1,     8,     6,     7,     6,     0,     -6,    -15,   -18,
+    -23,   -22,   -23,   -25,   -22,   -21,   -19,   -17,   -13,   -10,   -10,
+    -16,   -17,   -15,   -13,   -8,    -9,    -14,   -13,   -17,   -20,   -26,
+    -28,   -31,   -29,   -26,   -23,   -13,   -10,   -6,    -1,    5,     7,
+    2,     -3,    -7,    -20,   -18,   -16,   -21,   -27,   -33,   -25,   -27,
+    -22,   -22,   -21,   -16,   -11,   -7,    -2,    2,     11,    18,    11,
+    9,     4,     1,     -1,    -6,    -4,    -5,    -9,    -12,   -16,   -25,
+    -29,   -37,   -37,   -38,   -37,   -33,   -23,   -16,   -14,   -7,    -1,
+    -4,    -3,    -4,    -5,    -11,   -14,   -8,    -8,    -8,    -8,    -9,
+    -4,    -14,   -21,   -22,   -21,   -18,   -15,   -2,    3,     -3,    0,
+    -2,    0,     -4,    -7,    -1,    -2,    3,     3,     -3,    -10,   -13,
+    -10,   -16,   -19,   -17,   -17,   -14,   -7,    5,     5,     7,     8,
+    12,    7,     0,     -5,    -13,   -17,   -18,   -14,   -7,    -4,    3,
+    11,    11,    12,    11,    8,     4,     -5,    -5,    -11,   -15,   -17,
+    -23,   -22,   -18,   -14,   -14,   -12,   -6,    -4,    -1,    3,     1,
+    -4,    -10,   -22,   -29,   -30,   -26,   -15,   -2,    6,     16,    21,
+    28,    32,    25,    24,    20,    9,     5,     0,     3,     7,     10,
+    11,    13,    17,    15,    16,    13,    11,    11,    8,     7,     1,
+    1,     -5,    -2,    -2,    -1,    4,     8,     17,    22,    24,    24,
+    26,    23,    20,    17,    16,    9,     4,     6,     5,     8,     2,
+    -1,    -5,    -4,    -10,   -14,   -14,   -17,   -19,   -18,   -16,   -14,
+    -6,    -3,    1,     3,     0,     -4,    -6,    -4,    -1,    -1,    2,
+    5,     3,     8,     7,     7,     14,    13,    20,    24,    29,    24,
+    12,    7,     -1,    -6,    -15,   -22,   -20,   -27,   -22,   -14,   -6,
+    2,     7,     9,     9,     2,     -3,    -7,    -8,    -10,   -9,    -3,
+    -6,    -11,   -12,   -8,    -5,    -4,    -5,    -3,    0,     3,     6,
+    6,     7,     5,     -7,    -10,   -14,   -13,   -14,   -17,   -11,   -7,
+    -4,    1,     1,     4,     -4,    -8,    -18,   -23,   -23,   -25,   -19,
+    -16,   -15,   -9,    3,     10,    19,    25,    30,    31,    26,    27,
+    23,    19,    16,    8,     7,     2,     0,     -1,    -1,    1,     5,
+    6,     6,     1,     3,     -1,    -7,    -11,   -17,   -19,   -19,   -7,
+    0,     3,     11,    12,    18,    20,    16,    9,     -2,    -7,    -14,
+    -19,   -22,   -30,   -33,   -34,   -36,   -26,   -14,   -11,   -9,    -3,
+    0,     -2,    1,     -2,    -3,    -5,    -12,   -15,   -19,   -14,   -9,
+    -8,    -2,    -6,    -13,   -15,   -19,   -22,   -25,   -26,   -21,   -20,
+    -11,   -1,    1,     5,     9,     13,    15,    12,    11,    3,     1,
+    -1,    0,     8,     13,    16,    16,    15,    16,    15,    12,    9,
+    7,     8,     4,     6,     4,     3,     3,     7,     0,     -4,    -8,
+    -11,   -18,   -18,   -15,   -20,   -23,   -21,   -22,   -21,   -27,   -25,
+    -15,   -7,    -2,    8,     9,     8,     8,     3,     3,     7,     8,
+    8,     8,     12,    11,    12,    4,     -1,    -7,    -11,   -15,   -18,
+    -17,   -17,   -20,   -19,   -13,   -11,   -3,    -3,    -1,    1,     -3,
+    1,     1,     8,     10,    15,    24,    26,    29,    34,    36,    26,
+    20,    12,    -2,    -6,    -9,    -7,    -6,    1,     10,    13,    19,
+    22,    22,    18,    21,    24,    28,    35,    37,    34,    33,    34,
+    34,    30,    19,    15,    10,    19,    21,    23,    24,    21,    19,
+    18,    21,    22,    22,    27,    30,    31,    32,    33,    32,    32,
+    24,    18,    10,    8,     10,    10,    6,     2,     -7,    -14,   -22,
+    -29,   -27,   -29,   -32,   -30,   -28,   -23,   -22,   -11,   -11,   -13,
+    -3,    2,     -1,    1,     1,     -3,    -7,    -5,    -7,    -11,   -17,
+    -23,   -25,   -26,   -27,   -26,   -23,   -14,   -5,    -3,    -1,    -2,
+    -2,    -1,    1,     -2,    -7,    -4,    2,     4,     10,    13,    6,
+    3,     -2,    -6,    -7,    -11,   -17,   -21,   -15,   -7,    -2,    11,
+    16,    22,    25,    25,    23,    24,    23,    21,    22,    25,    23,
+    17,    17,    12,    8,     -2,    -4,    1,     0,     4,     9,     8,
+    10,    9,     9,     15,    13,    10,    8,     1,     1,     -3,    1,
+    4,     11,    10,    9,     5,     5,     4,     1,     -1,    -4,    0,
+    8,     7,     4,     3,     3,     0,     -9,    -16,   -19,   -20,   -21,
+    -18,   -16,   -11,   -10,   -9,    -13,   -12,   -19,   -25,   -21,   -15,
+    -5,    8,     14,    21,    24,    18,    20,    17,    6,     1,     -2,
+    -2,    1,     1,     4,     1,     -3,    2,     0,     -3,    -3,    -4,
+    1,     0,     -5,    -11,   -17,   -21,   -20,   -20,   -20,   -14,   -9,
+    -3,    3,     7,     5,     3,     1,     -1,    -3,    -4,    -1,    1,
+    -5,    -1,    -1,    -7,    -11,   -14,   -12,   -14,   -17,   -18,   -23,
+    -29,   -24,   -27,   -19,   -12,   -13,   -2,    -3,    4,     4,     0,
+    -3,    -5,    -2,    -1,    -5,    -6,    -7,    -7,    -7,    -9,    -13,
+    -9,    -4,    1,     1,     1,     -4,    -11,   -8,    -15,   -19,   -19,
+    -12,   -5,    1,     7,     12,    8,     10,    10,    10,    11,    11,
+    19,    12,    9,     9,     2,     -4,    -13,   -22,   -24,   -25,   -24,
+    -26,   -19,   -14,   -10,   -1,    5,     4,     -1,    -4,    -5,    -10,
+    -14,   -11,   -8,    -10,   -8,    -9,    -7,    -8,    -6,    -1,    -5,
+    -10,   -18,   -27,   -29,   -24,   -19,   -11,   -7,    1,     10,    8,
+    8,     5,     2,     -5,    -1,    -1,    0,     2,     2,     -2,    -8,
+    -8,    -14,   -26,   -25,   -23,   -18,   -9,    2,     2,     7,     13,
+    6,     7,     5,     4,     3,     2,     1,     7,     2,     -1,    1,
+    -2,    2,     0,     -2,    -6,    -3,    5,     7,     9,     6,     5,
+    4,     2,     0,     -1,    -3,    3,     7,     6,     14,    18,    22,
+    20,    22,    19,    13,    9,     2,     -8,    -11,   -6,    -2,    -3,
+    -3,    0,     0,     0,     1,     -1,    -2,    1,     7,     11,    10,
+    11,    17,    17,    11,    11,    4,     6,     6,     13,    19,    22,
+    23,    27,    25,    24,    22,    14,    11,    13,    7,     0,     -3,
+    -9,    -11,   -7,    -7,    -6,    -4,    1,     7,     9,     15,    18,
+    18,    10,    5,     3,     -3,    -6,    -5,    -8,    -5,    4,     8,
+    8,     11,    10,    9,     4,     4,     1,     -3,    -10,   -11,   -8,
+    -16,   -20,   -22,   -19,   -12,   -7,    -10,   -10,   -13,   -14,   -11,
+    -11,   -13,   -18,   -21,   -19,   -17,   -22,   -18,   -22,   -22,   -16,
+    -9,    -3,    0,     3,     6,     3,     3,     -3,    -6,    -9,    -14,
+    -1,    14,    21,    30,    37,    33,    27,    26,    19,    15,    14,
+    11,    20,    12,    9,     10,    19,    20,    19,    22,    20,    22,
+    17,    13,    14,    10,    8,     12,    15,    13,    12,    12,    12,
+    9,     10,    11,    11,    9,     6,     4,     5,     -2,    1,     1,
+    -1,    5,     1,     8,     6,     3,     -1,    -4,    -15,   -24,   -27,
+    -26,   -23,   -19,   -9,    -3,    -4,    -9,    -9,    -10,   -16,   -22,
+    -19,   -18,   -15,   -2,    3,     5,     6,     7,     8,     11,    3,
+    1,     2,     1,     1,     0,     -4,    -13,   -18,   -19,   -19,   -20,
+    -23,   -15,   -10,   -5,    -3,    -1,    -1,    -1,    3,     -1,    0,
+    -8,    -11,   -13,   -14,   -13,   -8,    -6,    -3,    1,     1,     0,
+    0,     5,     4,     5,     5,     5,     4,     0,     -1,    -4,    -13,
+    -22,   -21,   -28,   -26,   -22,   -28,   -23,   -23,   -14,   -11,   -10,
+    -7,    -8,    -5,    -4,    1,     9,     10,    15,    19,    21,    17,
+    18,    19,    16,    13,    16,    21,    27,    29,    22,    22,    13,
+    4,     1,     0,     -5,    -6,    -2,    3,     5,     8,     6,     9,
+    10,    2,     -3,    -9,    -8,    -4,    -2,    -7,    -6,    -4,    -8,
+    -6,    -8,    -11,   -8,    -8,    -6,    2,     -2,    -2,    -1,    2,
+    4,     8,     5,     -1,    -8,    -10,   -7,    -6,    -5,    -6,    -5,
+    6,     13,    22,    28,    33,    31,    38,    35,    28,    27,    22,
+    22,    23,    26,    23,    21,    28,    28,    23,    23,    22,    21,
+    20,    14,    6,     -1,    -5,    -8,    -5,    -1,    2,     5,     5,
+    7,     8,     5,     4,     0,     3,     6,     10,    13,    13,    6,
+    4,     4,     0,     -2,    -3,    0,     3,     5,     7,     9,     7,
+    6,     10,    8,     3,     4,     -1,    -4,    -2,    0,     -2,    -2,
+    -2,    -3,    5,     8,     6,     4,     -1,    -7,    -6,    -7,    -12,
+    -18,   -11,   -2,    -1,    -1,    -1,    -2,    -7,    -7,    -3,    -3,
+    -5,    -6,    -6,    -6,    -6,    -6,    -9,    -12,   -9,    -5,    1,
+    3,     5,     5,     8,     7,     3,     -5,    -3,    -2,    2,     3,
+    5,     5,     -1,    -2,    -4,    -8,    -9,    -9,    -7,    -12,   -13,
+    -17,   -19,   -16,   -19,   -21,   -21,   -19,   -11,   -6,    -3,    7,
+    8,     6,     2,     0,     1,     1,     -2,    -5,    0,     -2,    2,
+    1,     2,     0,     -2,    -1,    -10,   -21,   -25,   -24,   -21,   -19,
+    -14,   -8,    -3,    -5,    0,     0,     -5,    -6,    -3,    -6,    -9,
+    -13,   -19,   -20,   -21,   -21,   -24,   -25,   -27,   -27,   -29,   -26,
+    -19,   -14,   -14,   -13,   -8,    -5,    -10,   -10,   -6,    1,     4,
+    14,    22,    23,    24,    20,    20,    18,    14,    11,    9,     6,
+    8,     12,    15,    18,    18,    12,    8,     9,     9,     9,     7,
+    4,     9,     5,     6,     5,     3,     3,     -1,    -1,    -6,    -10,
+    -6,    -8,    -3,    0,     -2,    -3,    -2,    -6,    -6,    -7,    -3,
+    -3,    -3,    -2,    1,     -1,    -10,   -7,    -13,   -21,   -23,   -20,
+    -19,   -18,   -18,   -19,   -15,   -16,   -7,    -6,    -9,    -13,   -12,
+    -6,    -1,    3,     6,     7,     5,     3,     -3,    -11,   -18,   -20,
+    -26,   -29,   -27,   -27,   -24,   -30,   -29,   -28,   -23,   -18,   -21,
+    -18,   -15,   -9,    1,     9,     17,    21,    23,    18,    14,    5,
+    -1,    -2,    -1,    0,     3,     6,     5,     4,     4,     0,     -1,
+    1,     -4,    -9,    -13,   -11,   -20,   -21,   -19,   -14,   -9,    -4,
+    1,     6,     10,    16,    24,    30,    35,    31,    38,    37,    35,
+    39,    36,    36,    32,    30,    33,    31,    24,    19,    12,    4,
+    -1,    -7,    -11,   -7,    -5,    -3,    2,     6,     10,    16,    19,
+    21,    21,    16,    10,    14,    12,    14,    13,    12,    12,    5,
+    6,     2,     0,     1,     3,     4,     6,     9,     6,     2,     -1,
+    -3,    -10,   -15,   -13,   -17,   -19,   -15,   -16,   -15,   -13,   -8,
+    -8,    -7,    -10,   -5,    -2,    1,     5,     5,     11,    10,    12,
+    10,    9,     9,     15,    23,    33,    35,    33,    34,    34,    35,
+    34,    24,    30,    26,    23,    21,    20,    15,    10,    3,     4,
+    0,     -7,    -8,    -9,    -9,    -8,    -4,    0,     5,     5,     2,
+    3,     -2,    0,     0,     -1,    0,     -1,    1,     2,     6,     3,
+    1,     -9,    -5,    -6,    -2,    -8,    -12,   -9,    -10,   -7,    -8,
+    -8,    -6,    -2,    -2,    -1,    0,     -2,    -1,    -8,    -18,   -19,
+    -27,   -37,   -42,   -40,   -39,   -33,   -30,   -23,   -16,   -16,   -9,
+    -13,   -11,   -10,   -10,   -8,    -3,    -1,    2,     0,     -1,    2,
+    6,     4,     8,     10,    17,    21,    28,    31,    33,    28,    20,
+    12,    8,     -3,    -5,    -4,    -3,    2,     6,     9,     8,     2,
+    7,     4,     -6,    -9,    -15,   -13,   -15,   -17,   -14,   -11,   -12,
+    -5,    -6,    -4,    -6,    -11,   -11,   -7,    -4,    -6,    -8,    -13,
+    -10,   -7,    -12,   -11,   -12,   -13,   -12,   -9,    -9,    -10,   -10,
+    -6,    -8,    -8,    -7,    -9,    -9,    -7,    2,     5,     5,     6,
+    3,     4,     6,     3,     -1,    -2,    -2,    -2,    1,     5,     3,
+    4,     2,     -2,    -7,    -9,    -13,   -11,   -8,    2,     12,    23,
+    31,    37,    41,    40,    37,    36,    31,    31,    27,    28,    24,
+    13,    16,    14,    15,    9,     4,     4,     5,     4,     7,     12,
+    16,    14,    11,    13,    6,     -2,    -4,    -1,    -3,    3,     6,
+    6,     9,     7,     9,     7,     5,     0,     1,     -1,    -2,    -4,
+    -1,    0,     0,     -4,    0,     -4,    -9,    -15,   -16,   -18,   -15,
+    -10,   -6,    -8,    -5,    -2,    -2,    0,     4,     7,     0,     -2,
+    -3,    4,     3,     2,     -1,    -3,    -8,    -19,   -19,   -19,   -16,
+    -8,    -5,    0,     1,     2,     1,     -1,    -2,    -10,   -12,   -10,
+    -4,    3,     4,     2,     7,     8,     4,     1,     -5,    -5,    -4,
+    -1,    9,     10,    12,    15,    15,    14,    11,    20,    16,    19,
+    18,    26,    29,    21,    23,    16,    16,    3,     -3,    -4,    -10,
+    -12,   -10,   -6,    -7,    -12,   -17,   -14,   -16,   -19,   -13,   -10,
+    -13,   -13,   -2,    2,     3,     7,     13,    22,    21,    21,    21,
+    24,    27,    23,    22,    20,    17,    17,    16,    13,    11,    5,
+    1,     1,     5,     5,     3,     2,     -1,    2,     -5,    -6,    -3,
+    -11,   -9,    -6,    -5,    -10,   -4,    -1,    1,     2,     -1,    -4,
+    -4,    -9,    -9,    -7,    -3,    3,     -2,    1,     1,     4,     -4,
+    -8,    -8,    -17,   -17,   -13,   -13,   -18,   -18,   -25,   -27,   -21,
+    -22,   -18,   -7,    -1,    5,     9,     11,    11,    11,    15,    11,
+    4,     1,     6,     8,     17,    12,    10,    5,     -2,    -3,    -14,
+    -17,   -25,   -26,   -22,   -20,   -13,   -12,   -12,   -13,   -10,   -4,
+    -6,    -6,    -4,    -6,    -4,    0,     -3,    -7,    -7,    -10,   -17,
+    -14,   -9,    -3,    4,     4,     6,     1,     0,     0,     -6,    -3,
+    -4,    -3,    -6,    -9,    -9,    -5,    0,     1,     2,     -2,    3,
+    -1,    -4,    -5,    -11,   -14,   -17,   -14,   -12,   -14,   -19,   -21,
+    -25,   -35,   -40,   -39,   -31,   -24,   -13,   -4,    -1,    0,     0,
+    2,     -2,    -5,    -8,    -8,    -9,    -6,    -2,    0,     -5,    -6,
+    2,     5,     4,     1,     6,     8,     9,     14,    13,    19,    15,
+    19,    13,    14,    20,    16,    16,    14,    14,    17,    13,    12,
+    11,    6,     -1,    -7,    -9,    -10,   -11,   -2,    8,     12,    12,
+    12,    8,     4,     1,     -3,    -4,    -4,    -3,    1,     9,     14,
+    16,    10,    12,    9,     6,     4,     -1,    8,     6,     3,     6,
+    1,     -11,   -10,   -10,   -13,   -9,    -6,    -2,    -2,    9,     13,
+    17,    17,    19,    17,    16,    9,     -2,    -5,    -5,    -3,    -9,
+    -8,    -8,    -12,   -17,   -16,   -18,   -15,   -9,    -7,    1,     10,
+    17,    18,    23,    25,    23,    20,    15,    17,    18,    23,    33,
+    40,    43,    45,    51,    53,    47,    36,    27,    10,    5,     1,
+    4,     5,     4,     0,     0,     6,     7,     8,     9,     3,     2,
+    1,     0,     -1,    3,     5,     5,     13,    7,     4,     4,     3,
+    11,    17,    21,    31,    31,    31,    31,    28,    26,    23,    19,
+    16,    17,    16,    10,    10,    12,    9,     7,     -1,    -7,    -12,
+    -15,   -15,   -15,   -13,   -13,   -16,   -19,   -19,   -23,   -31,   -34,
+    -38,   -39,   -31,   -30,   -21,   -21,   -18,   -11,   -16,   -20,   -25,
+    -22,   -18,   -14,   -7,    -8,    -3,    2,     10,    13,    12,    10,
+    6,     2,     0,     0,     0,     -6,    -4,    -1,    0,     0,     -1,
+    -2,    1,     3,     8,     9,     3,     6,     2,     -4,    -2,    -3,
+    -7,    -4,    -3,    2,     6,     8,     10,    12,    15,    11,    15,
+    12,    13,    14,    15,    18,    14,    8,     4,     4,     3,     -4,
+    -5,    -4,    -2,    -3,    -2,    4,     9,     13,    18,    21,    20,
+    18,    15,    11,    6,     7,     10,    8,     6,     3,     -3,    -7,
+    -14,   -21,   -29,   -33,   -32,   -26,   -17,   -12,   -11,   -9,    -3,
+    -10,   -13,   -18,   -23,   -21,   -26,   -26,   -24,   -28,   -25,   -29,
+    -30,   -30,   -27,   -17,   -7,    2,     10,    13,    16,    16,    17,
+    18,    17,    19,    19,    20,    15,    14,    16,    14,    10,    5,
+    0,     -4,    -18,   -21,   -25,   -20,   -16,   -13,   -8,    -5,    2,
+    6,     11,    12,    18,    16,    18,    15,    13,    17,    18,    22,
+    21,    25,    26,    25,    26,    28,    31,    27,    20,    10,    3,
+    -6,    -10,   -16,   -19,   -18,   -15,   -13,   -10,   -2,    0,     2,
+    4,     3,     5,     -1,    0,     1,     2,     0,     -2,    -1,    -6,
+    -5,    -7,    -12,   -10,   -9,    -4,    -1,    3,     4,     2,     4,
+    4,     3,     -3,    -6,    -11,   -14,   -15,   -23,   -25,   -29,   -30,
+    -28,   -25,   -22,   -19,   -21,   -19,   -11,   -7,    -7,    -3,    -3,
+    -6,    -8,    -13,   -10,   -10,   -5,    1,     4,     9,     7,     6,
+    6,     4,     -5,    -11,   -8,    -6,    -3,    0,     3,     7,     11,
+    7,     3,     5,     6,     10,    12,    14,    16,    8,     5,     -1,
+    -1,    4,     0,     0,     -3,    -5,    -5,    -4,    -2,    -2,    1,
+    4,     7,     5,     10,    9,     6,     9,     12,    19,    28,    32,
+    32,    33,    31,    29,    20,    17,    16,    14,    15,    6,     -2,
+    -5,    -7,    -10,   -10,   -11,   -9,    -6,    -3,    8,     10,    10,
+    10,    12,    12,    7,     7,     5,     3,     2,     2,     -2,    -5,
+    -4,    -7,    -2,    -6,    -5,    -6,    -11,   -14,   -13,   -10,   -11,
+    -15,   -16,   -11,   -11,   -11,   -10,   -16,   -15,   -15,   -16,   -10,
+    -11,   -11,   -5,    -1,    2,     1,     2,     0,     1,     4,     8,
+    5,     -4,    -2,    -4,    -12,   -18,   -24,   -20,   -25,   -14,   -3,
+    4,     11,    13,    13,    7,     4,     -4,    -9,    -13,   -17,   -10,
+    -6,    -1,    0,     2,     2,     -1,    1,     -8,    -18,   -22,   -19,
+    -19,   -22,   -20,   -22,   -20,   -17,   -12,   -9,    -4,    3,     9,
+    9,     9,     7,     6,     13,    10,    11,    8,     4,     -1,    5,
+    7,     7,     8,     4,     2,     2,     -2,    -8,    -11,   -16,   -18,
+    -12,   -12,   -9,    -2,    3,     3,     5,     5,     6,     9,     11,
+    20,    22,    26,    30,    28,    22,    15,    15,    10,    11,    9,
+    6,     9,     9,     11,    10,    12,    10,    8,     8,     7,     9,
+    4,     3,     9,     5,     1,     2,     0,     -3,    -3,    0,     3,
+    0,     -2,    1,     4,     6,     4,     0,     1,     -4,    -13,   -13,
+    -11,   -20,   -21,   -15,   -17,   -23,   -22,   -24,   -29,   -24,   -29,
+    -32,   -21,   -13,   -11,   -9,    -9,    -8,    -13,   -11,   -11,   -11,
+    -11,   -17,   -17,   -21,   -23,   -27,   -32,   -33,   -32,   -31,   -35,
+    -31,   -26,   -24,   -18,   -10,   -1,    5,     13,    17,    15,    13,
+    8,     4,     6,     9,     10,    13,    11,    12,    13,    9,     5,
+    6,     8,     12,    21,    25,    24,    23,    16,    8,     7,     0,
+    -3,    -8,    -9,    -2,    1,     11,    18,    25,    30,    31,    27,
+    21,    19,    19,    18,    18,    22,    24,    16,    14,    8,     2,
+    -4,    -9,    -7,    -10,   -6,    -8,    -8,    -13,   -14,   -11,   -13,
+    -8,    -7,    6,     9,     10,    15,    17,    11,    11,    9,     2,
+    2,     -2,    2,     -6,    -6,    -7,    -14,   -11,   -12,   -13,   -17,
+    -22,   -25,   -30,   -24,   -16,   -4,    5,     2,     7,     5,     2,
+    -1,    1,     -4,    -4,    4,     8,     8,     5,     6,     6,     2,
+    1,     -2,    -9,    -14,   -17,   -16,   -15,   -14,   -12,   -11,   -6,
+    -6,    -2,    -3,    -3,    6,     13,    18,    27,    27,    26,    24,
+    22,    19,    18,    19,    12,    8,     7,     -2,    0,     -6,    -8,
+    -6,    -4,    -6,    -14,   -16,   -16,   -15,   -12,   -2,    6,     12,
+    16,    18,    14,    16,    13,    12,    17,    16,    17,    17,    12,
+    13,    10,    14,    14,    10,    2,     -1,    -3,    -5,    -10,   -15,
+    -13,   -20,   -21,   -21,   -21,   -19,   -20,   -18,   -8,    -4,    -1,
+    -1,    4,     2,     -3,    0,     -5,    -5,    -3,    -1,    0,     6,
+    5,     6,     7,     7,     3,     2,     1,     -5,    -3,    0,     3,
+    5,     7,     4,     10,    15,    15,    11,    6,     8,     9,     14,
+    19,    18,    14,    12,    16,    15,    11,    9,     9,     5,     4,
+    0,     -7,    -12,   -18,   -22,   -29,   -32,   -36,   -37,   -38,   -39,
+    -32,   -24,   -20,   -14,   -10,   -2,    0,     1,     9,     13,    21,
+    26,    31,    35,    40,    38,    32,    33,    25,    14,    11,    7,
+    1,     -1,    -6,    -5,    -11,   -20,   -22,   -19,   -16,   -9,    2,
+    9,     14,    14,    13,    13,    12,    10,    3,     2,     1,     0,
+    6,     5,     -1,    -4,    -13,   -17,   -21,   -25,   -29,   -30,   -23,
+    -14,   -4,    4,     11,    11,    12,    13,    13,    5,     6,     6,
+    7,     5,     5,     9,     -2,    3,     0,     -2,    -3,    -5,    -1,
+    3,     9,     16,    18,    17,    17,    11,    5,     1,     -4,    -13,
+    -12,   -7,    -7,    1,     6,     4,     2,     3,     1,     1,     0,
+    -1,    -5,    -5,    -3,    -5,    -1,    8,     9,     7,     12,    7,
+    6,     4,     3,     -1,    -1,    -4,    -14,   -16,   -18,   -24,   -34,
+    -44,   -37,   -37,   -36,   -28,   -19,   -15,   -6,    -2,    -3,    2,
+    5,     6,     3,     6,     6,     9,     7,     3,     -4,    -15,   -25,
+    -34,   -37,   -41,   -41,   -38,   -33,   -27,   -22,   -14,   -15,   -18,
+    -18,   -15,   -8,    -7,    -2,    2,     0,     4,     12,    13,    10,
+    17,    20,    16,    17,    23,    24,    22,    24,    22,    28,    26,
+    24,    22,    26,    28,    27,    23,    17,    10,    4,     4,     1,
+    -1,    0,     4,     9,     15,    14,    15,    14,    14,    13,    8,
+    0,     -1,    -11,   -13,   -4,    -3,    -5,    -3,    -1,    -6,    -5,
+    -7,    -4,    -2,    2,     7,     15,    20,    14,    13,    8,     2,
+    -6,    -15,   -23,   -25,   -20,   -22,   -20,   -14,   -10,   -4,    -2,
+    1,     -10,   -15,   -12,   -8,    -8,    -7,    -5,    -10,   -12,   -20,
+    -28,   -26,   -24,   -16,   -8,    -5,    3,     8,     9,     12,    12,
+    12,    14,    13,    12,    10,    13,    23,    29,    28,    33,    36,
+    32,    28,    23,    25,    26,    30,    34,    27,    22,    16,    12,
+    3,     -6,    -13,   -13,   -15,   -14,   -9,    -11,   -13,   -13,   -16,
+    -15,   -20,   -22,   -20,   -32,   -30,   -29,   -24,   -18,   -18,   -18,
+    -13,   -15,   -15,   -16,   -17,   -10,   -11,   -12,   -15,   -17,   -17,
+    -19,   -21,   -22,   -26,   -28,   -21,   -18,   -14,   -5,    2,     6,
+    7,     5,     3,     -2,    0,     -4,    -2,    -3,    -6,    -9,    -12,
+    -11,   -11,   -19,   -23,   -20,   -21,   -16,   -19,   -23,   -22,   -24,
+    -21,   -22,   -17,   -15,   -8,    -1,    4,     14,    18,    23,    24,
+    25,    25,    18,    15,    7,     2,     14,    19,    22,    20,    23,
+    22,    20,    19,    20,    17,    16,    21,    22,    21,    18,    9,
+    3,     -6,    -14,   -19,   -30,   -36,   -40,   -32,   -22,   -21,   -16,
+    -7,    -1,    3,     2,     3,     6,     9,     16,    20,    22,    26,
+    27,    29,    32,    30,    23,    19,    20,    21,    18,    22,    24,
+    15,    14,    9,     9,     7,     6,     9,     9,     16,    22,    20,
+    18,    18,    9,     -1,    -10,   -16,   -19,   -22,   -22,   -20,   -16,
+    -11,   -5,    0,     1,     4,     2,     0,     3,     5,     10,    8,
+    12,    10,    11,    9,     8,     7,     -3,    -4,    -10,   -11,   -5,
+    2,     8,     12,    12,    13,    14,    15,    14,    12,    10,    14,
+    13,    8,     0,     -2,    -3,    -9,    -6,    -13,   -21,   -12,   -12,
+    -8,    -9,    -14,   -16,   -19,   -23,   -22,   -23,   -30,   -26,   -17,
+    -14,   -9,    -2,    3,     11,    16,    17,    17,    11,    12,    13,
+    12,    9,     8,     7,     10,    17,    14,    13,    9,     7,     6,
+    5,     10,    10,    6,     10,    9,     1,     -5,    -10,   -12,   -17,
+    -16,   -14,   -13,   -10,   -6,    -2,    0,     -1,    2,     2,     -1,
+    2,     6,     12,    18,    23,    22,    23,    24,    20,    16,    10,
+    6,     9,     16,    15,    15,    16,    14,    8,     4,     0,     -3,
+    -7,    -4,    -5,    -5,    0,     -4,    1,     1,     1,     -4,    -10,
+    -17,   -25,   -25,   -28,   -28,   -27,   -25,   -20,   -20,   -20,   -22,
+    -14,   -11,   -4,    4,     6,     11,    10,    12,    9,     6,     2,
+    -6,    -10,   -12,   -7,    -1,    -6,    0,     1,     2,     5,     1,
+    -1,    1,     -3,    -6,    -4,    -5,    -4,    -6,    -5,    -7,    -10,
+    -10,   -8,    -11,   -9,    -2,    9,     15,    14,    20,    19,    19,
+    16,    16,    11,    3,     2,     2,     5,     4,     5,     3,     -1,
+    -1,    -6,    -11,   -16,   -18,   -18,   -12,   -17,   -18,   -13,   -15,
+    -5,    -4,    -3,    -1,    2,     6,     7,     11,    14,    17,    17,
+    18,    21,    18,    19,    18,    23,    27,    36,    32,    35,    30,
+    24,    25,    18,    10,    3,     -1,    -4,    -11,   -16,   -21,   -33,
+    -37,   -35,   -36,   -35,   -30,   -26,   -26,   -21,   -10,   -7,    -3,
+    -4,    -3,    -3,    -9,    -12,   -16,   -25,   -22,   -11,   -6,    2,
+    5,     7,     4,     -2,    -8,    -16,   -23,   -30,   -28,   -23,   -20,
+    -11,   -11,   -8,    5,     2,     -3,    -1,    -11,   -15,   -10,   -13,
+    -8,    -8,    -12,   -9,    -10,   -15,   -8,    -4,    -3,    7,     6,
+    13,    20,    25,    24,    25,    27,    28,    25,    23,    22,    27,
+    28,    27,    30,    28,    26,    20,    16,    13,    7,     2,     1,
+    6,     3,     -4,    -6,    -13,   -18,   -19,   -21,   -15,   -3,    -1,
+    10,    16,    17,    20,    24,    28,    28,    26,    26,    28,    27,
+    24,    23,    20,    20,    24,    20,    17,    14,    6,     0,     2,
+    1,     0,     -3,    -7,    -12,   -18,   -29,   -28,   -30,   -32,   -23,
+    -27,   -25,   -20,   -17,   -13,   -11,   -14,   -17,   -21,   -22,   -18,
+    -11,   -12,   -6,    -8,    -9,    -5,    -6,    -10,   -18,   -19,   -16,
+    -13,   -9,    -6,    -7,    -13,   -10,   -14,   -22,   -30,   -37,   -35,
+    -37,   -35,   -34,   -36,   -30,   -23,   -17,   -16,   -16,   -11,   -6,
+    -2,    3,     7,     7,     6,     7,     7,     13,    21,    20,    22,
+    23,    22,    24,    17,    5,     -1,    -2,    -8,    -13,   -14,   -17,
+    -24,   -28,   -23,   -22,   -19,   -12,   -14,   -10,   -14,   -21,   -20,
+    -21,   -22,   -13,   -6,    -1,    6,     4,     10,    11,    8,     10,
+    10,    17,    20,    27,    34,    32,    26,    26,    24,    17,    13,
+    6,     9,     12,    15,    17,    12,    11,    9,     3,     -3,    -3,
+    -8,    -9,    -4,    -2,    -2,    2,     1,     -1,    -3,    -7,    -8,
+    -11,   -15,   -8,    -5,    1,     9,     7,     10,    13,    17,    14,
+    12,    8,     6,     3,     6,     9,     8,     5,     0,     -2,    1,
+    1,     -3,    -6,    -12,   -17,   -17,   -23,   -28,   -33,   -31,   -29,
+    -30,   -35,   -28,   -25,   -17,   -5,    0,     6,     10,    14,    27,
+    31,    26,    31,    30,    32,    41,    42,    42,    43,    34,    32,
+    21,    12,    2,     1,     -3,    -1,    8,     13,    20,    19,    18,
+    19,    13,    8,     5,     7,     6,     7,     6,     4,     3,     -2,
+    0,     2,     -4,    -1,    -3,    2,     12,    22,    33,    32,    31,
+    35,    35,    34,    32,    26,    27,    26,    21,    17,    10,    1,
+    -3,    -14,   -21,   -19,   -21,   -19,   -24,   -24,   -19,   -16,   -13,
+    -16,   -13,   -15,   -17,   -12,   -9,    -4,    7,     19,    27,    33,
+    37,    34,    35,    30,    24,    23,    25,    21,    20,    18,    15,
+    12,    13,    8,     2,     -4,    -12,   -18,   -17,   -14,   -10,   -14,
+    -8,    -14,   -14,   -12,   -14,   -19,   -23,   -31,   -32,   -28,   -30,
+    -22,   -20,   -13,   1,     0,     6,     14,    15,    20,    22,    20,
+    16,    9,     2,     1,     3,     6,     7,     9,     10,    14,    17,
+    16,    14,    4,     -7,    -16,   -31,   -40,   -41,   -40,   -38,   -34,
+    -40,   -37,   -33,   -28,   -22,   -17,   -11,   -10,   -12,   -5,    -5,
+    -8,    -4,    0,     -1,    1,     1,     6,     11,    14,    22,    25,
+    28,    31,    32,    32,    31,    31,    20,    13,    12,    5,     4,
+    4,     2,     0,     -3,    -6,    -8,    -4,    -4,    -4,    -1,    7,
+    9,     10,    13,    13,    16,    10,    7,     3,     6,     8,     8,
+    15,    20,    23,    18,    15,    12,    4,     1,     0,     -4,    -4,
+    -1,    8,     11,    13,    21,    24,    19,    12,    2,     -5,    -11,
+    -15,   -17,   -17,   -19,   -23,   -28,   -34,   -33,   -37,   -29,   -27,
+    -24,   -17,   -13,   -8,    -6,    -2,    5,     3,     4,     -2,    -5,
+    -4,    0,     2,     3,     1,     -5,    -5,    -6,    -11,   -11,   -15,
+    -15,   -19,   -17,   -17,   -21,   -23,   -21,   -22,   -24,   -28,   -27,
+    -25,   -15,   -8,    -1,    2,     2,     3,     3,     2,     -2,    0,
+    1,     -1,    2,     5,     7,     2,     0,     2,     -6,    -9,    -8,
+    -6,    -3,    -3,    3,     0,     5,     0,     0,     -5,    -12,   -13,
+    -20,   -14,   -14,   -6,    -5,    -2,    0,     6,     11,    9,     9,
+    11,    10,    13,    19,    26,    29,    36,    37,    40,    35,    27,
+    20,    13,    6,     3,     -1,    -1,    -1,    -3,    -6,    -8,    -14,
+    -16,   -25,   -28,   -23,   -21,   -24,   -22,   -22,   -22,   -24,   -28,
+    -35,   -43,   -42,   -37,   -29,   -20,   -5,    2,     10,    23,    28,
+    30,    31,    30,    39,    43,    40,    41,    43,    43,    38,    29,
+    18,    14,    12,    3,     6,     3,     3,     0,     -1,    -3,    -5,
+    -5,    -8,    -8,    -10,   -6,    -1,    1,     5,     1,     2,     6,
+    0,     -3,    -7,    -13,   -10,   -7,    -8,    -7,    -3,    -5,    -4,
+    -4,    -4,    -5,    -2,    2,     3,     6,     4,     3,     -1,    -2,
+    -5,    -16,   -22,   -31,   -39,   -38,   -42,   -47,   -42,   -42,   -35,
+    -27,   -30,   -28,   -25,   -26,   -24,   -20,   -19,   -19,   -19,   -19,
+    -14,   -16,   -13,   -9,    -10,   -1,    8,     17,    21,    28,    26,
+    28,    24,    14,    8,     2,     0,     -4,    -4,    -13,   -16,   -16,
+    -13,   -12,   -7,    -5,    0,     -4,    -1,    2,     4,     8,     8,
+    10,    10,    10,    14,    16,    17,    23,    20,    27,    27,    27,
+    21,    14,    11,    0,     -4,    -8,    -8,    -1,    -1,    1,     6,
+    8,     23,    22,    23,    23,    25,    26,    26,    22,    21,    20,
+    22,    17,    12,    8,     3,     -2,    -2,    -4,    -5,    -3,    1,
+    7,     6,     8,     9,     12,    6,     1,     -4,    -8,    -6,    -3,
+    -4,    -5,    -3,    -7,    -6,    -6,    -11,   -11,   -19,   -23,   -26,
+    -28,   -34,   -41,   -41,   -44,   -45,   -47,   -40,   -39,   -33,   -29,
+    -21,   -14,   -16,   -6,    -7,    -3,    1,     6,     8,     11,    14,
+    14,    15,    15,    18,    18,    16,    17,    12,    15,    20,    21,
+    19,    21,    23,    22,    21,    16,    12,    8,     7,     7,     10,
+    13,    13,    16,    16,    16,    16,    15,    15,    12,    14,    14,
+    15,    12,    11,    17,    19,    19,    14,    13,    15,    17,    18,
+    20,    24,    27,    24,    19,    11,    10,    1,     0,     0,     -1,
+    3,     8,     16,    18,    17,    22,    22,    21,    19,    7,     0,
+    1,     -1,    -2,    -1,    -6,    -8,    -12,   -14,   -20,   -21,   -24,
+    -19,   -9,    -4,    -3,    2,     2,     3,     0,     -10,   -19,   -23,
+    -29,   -31,   -35,   -29,   -33,   -28,   -25,   -25,   -19,   -22,   -23,
+    -24,   -21,   -17,   -15,   -17,   -13,   -15,   -12,   -15,   -14,   -14,
+    -12,   -9,    -5,    1,     9,     13,    13,    17,    17,    15,    11,
+    12,    8,     13,    20,    24,    30,    29,    33,    30,    26,    23,
+    13,    9,     4,     3,     3,     5,     3,     2,     5,     3,     2,
+    1,     3,     6,     10,    14,    19,    23,    21,    20,    21,    17,
+    11,    5,     -3,    -7,    -12,   -15,   -16,   -13,   -15,   -13,   -7,
+    -4,    -5,    -5,    -1,    5,     11,    8,     7,     -2,    -2,    -5,
+    -6,    -1,    -2,    0,     2,     8,     13,    15,    17,    15,    16,
+    10,    13,    3,     -1,    -4,    -4,    -4,    0,     8,     13,    15,
+    9,     11,    9,     12,    9,     10,    10,    5,     11,    16,    21,
+    20,    15,    13,    5,     3,     -3,    1,     1,     0,     -4,    -7,
+    -9,    -7,    -9,    -10,   -7,    -6,    -3,    -2,    -3,    -3,    -6,
+    -12,   -16,   -22,   -21,   -26,   -28,   -25,   -24,   -23,   -23,   -28,
+    -32,   -29,   -26,   -26,   -23,   -29,   -23,   -16,   -11,   -7,    -9,
+    -10,   -12,   -18,   -20,   -20,   -26,   -23,   -16,   -17,   -10,   -7,
+    0,     3,     -2,    0,     -4,    -7,    -8,    -6,    -3,    -7,    -5,
+    -5,    1,     0,     -3,    -2,    -3,    5,     7,     10,    19,    17,
+    22,    21,    20,    16,    8,     9,     10,    12,    20,    28,    31,
+    28,    28,    26,    21,    14,    8,     5,     4,     5,     8,     9,
+    9,     13,    17,    16,    14,    20,    17,    13,    16,    17,    18,
+    18,    15,    11,    5,     -2,    -8,    -15,   -17,   -17,   -24,   -24,
+    -23,   -18,   -13,   -13,   -9,    -7,    -4,    0,     3,     6,     2,
+    2,     -4,    -5,    -5,    -4,    -4,    -2,    2,     6,     10,    7,
+    4,     2,     -2,    -3,    -8,    -10,   -14,   -27,   -29,   -37,   -36,
+    -29,   -27,   -19,   -7,    -3,    0,     -2,    2,     8,     13,    18,
+    15,    10,    10,    6,     1,     -5,    -12,   -17,   -20,   -23,   -23,
+    -22,   -19,   -17,   -10,   -6,    -3,    2,     0,     4,     11,    14,
+    19,    16,    6,     7,     3,     3,     4,     1,     7,     8,     7,
+    3,     -2,    0,     0,     0,     -1,    -2,    0,     4,     3,     5,
+    9,     9,     12,    7,     5,     0,     0,     1,     0,     2,     -6,
+    -10,   -9,    -13,   -15,   -19,   -15,   -18,   -16,   -17,   -9,    -5,
+    -2,    2,     2,     3,     7,     2,     -3,    -8,    -13,   -8,    1,
+    8,     12,    15,    17,    17,    11,    7,     0,     -4,    -8,    -8,
+    -3,    -1,    -4,    -6,    -6,    -13,   -12,   -12,   -13,   -12,   -8,
+    -9,    -5,    -4,    -2,    0,     -1,    -6,    -7,    -6,    -10,   -10,
+    -8,    -6,    1,     5,     6,     15,    18,    16,    12,    12,    12,
+    10,    13,    7,     0,     -9,    -10,   -11,   -6,    -8,    -8,    -4,
+    0,     6,     10,    11,    15,    15,    15,    12,    10,    6,     6,
+    11,    12,    20,    25,    23,    25,    18,    12,    6,     -1,    -4,
+    -10,   -12,   -9,    -13,   -16,   -15,   -18,   -18,   -22,   -22,   -17,
+    -14,   -12,   -8,    -3,    1,     4,     11,    13,    7,     0,     -8,
+    -11,   -11,   -13,   -14,   -12,   -11,   -9,    -6,    -5,    -2,    1,
+    5,     6,     10,    18,    17,    15,    13,    11,    12,    13,    10,
+    9,     13,    16,    16,    13,    11,    6,     5,     0,     -5,    -4,
+    -3,    2,     6,     5,     6,     11,    14,    20,    23,    28,    27,
+    22,    24,    23,    22,    16,    17,    12,    7,     -1,    -9,    -10,
+    -9,    -9,    -13,   -11,   -9,    -2,    -2,    -7,    -8,    -6,    -7,
+    -12,   -12,   -10,   0,     5,     11,    13,    11,    10,    7,     3,
+    0,     0,     3,     10,    14,    16,    18,    19,    21,    14,    15,
+    12,    7,     6,     7,     9,     7,     11,    6,     4,     4,     -1,
+    -9,    -12,   -12,   -14,   -9,    -9,    -6,    -5,    -4,    -6,    -7,
+    -12,   -15,   -17,   -27,   -23,   -20,   -19,   -19,   -18,   -24,   -20,
+    -25,   -28,   -33,   -31,   -29,   -27,   -15,   -12,   -7,    -3,    1,
+    -3,    -3,    -5,    -8,    -6,    0,     13,    17,    24,    25,    23,
+    24,    18,    8,     -3,    -4,    -4,    -7,    -3,    1,     4,     7,
+    9,     10,    14,    14,    20,    28,    35,    38,    42,    43,    43,
+    39,    30,    27,    19,    15,    8,     10,    12,    19,    25,    26,
+    27,    23,    22,    15,    10,    6,     8,     4,     6,     6,     3,
+    7,     7,     15,    11,    7,     6,     5,     9,     6,     0,     -3,
+    -14,   -21,   -21,   -30,   -39,   -42,   -40,   -37,   -37,   -36,   -32,
+    -30,   -24,   -21,   -22,   -23,   -24,   -28,   -31,   -31,   -29,   -27,
+    -30,   -31,   -31,   -31,   -34,   -33,   -34,   -26,   -21,   -15,   -10,
+    -5,    -3,    -2,    -3,    -6,    -5,    -11,   -14,   -10,   -5,    0,
+    9,     10,    18,    21,    19,    21,    11,    7,     4,     6,     6,
+    7,     3,     -6,    -9,    -16,   -23,   -24,   -23,   -26,   -18,   -16,
+    -11,   -8,    0,     6,     5,     6,     10,    8,     8,     16,    24,
+    24,    23,    24,    24,    24,    18,    9,     4,     -3,    -11,   -16,
+    -15,   -18,   -14,   -12,   -9,    -3,    -4,    -1,    8,     11,    10,
+    19,    21,    21,    23,    20,    22,    15,    9,     7,     5,     3,
+    1,     12,    13,    10,    18,    23,    31,    37,    40,    36,    38,
+    40,    40,    38,    27,    24,    21,    14,    12,    12,    7,     7,
+    15,    18,    19,    18,    17,    18,    14,    12,    11,    7,     5,
+    7,     9,     9,     15,    14,    15,    18,    16,    7,     0,     -5,
+    -6,    -6,    -6,    -1,    7,     9,     12,    6,     4,     4,     2,
+    -1,    2,     3,     3,     5,     4,     -1,    -13,   -19,   -29,   -34,
+    -39,   -43,   -49,   -54,   -53,   -55,   -55,   -56,   -59,   -58,   -49,
+    -41,   -32,   -19,   -10,   -2,    -4,    -1,    -6,    -19,   -27,   -26,
+    -27,   -27,   -21,   -22,   -20,   -26,   -26,   -20,   -20,   -20,   -21,
+    -17,   -18,   -7,    -6,    -6,    -5,    -1,    7,     18,    10,    16,
+    25,    24,    31,    30,    32,    30,    26,    24,    22,    23,    21,
+    23,    21,    24,    19,    17,    13,    12,    15,    6,     2,     -5,
+    -9,    -13,   -10,   -5,    1,     10,    13,    17,    13,    8,     5,
+    5,     6,     5,     13,    19,    16,    14,    12,    7,     15,    18,
+    19,    16,    4,     -1,    0,     -1,    -2,    -9,    -15,   -19,   -21,
+    -13,   -13,   -10,   -7,    -7,    -7,    -6,    -11,   -22,   -18,   -19,
+    -22,   -22,   -19,   -18,   -10,   -7,    -9,    -7,    -12,   -16,   -20,
+    -27,   -35,   -37,   -37,   -33,   -24,   -14,   -4,    8,     14,    19,
+    19,    16,    12,    6,     2,     -5,    -6,    -11,   -17,   -16,   -14,
+    -13,   -12,   -17,   -21,   -22,   -24,   -18,   -14,   -12,   -1,    4,
+    9,     17,    14,    9,     13,    14,    13,    14,    14,    12,    11,
+    15,    11,    16,    21,    20,    20,    22,    31,    30,    26,    15,
+    13,    6,     8,     5,     1,     -5,    -3,    2,     9,     14,    13,
+    16,    17,    18,    13,    10,    8,     7,     9,     12,    21,    23,
+    23,    21,    19,    16,    14,    5,     -4,    -12,   -15,   -16,   -12,
+    -9,    -12,   -14,   -17,   -16,   -15,   -14,   -15,   -28,   -27,   -24,
+    -12,   -8,    -3,    3,     9,     15,    18,    25,    25,    31,    32,
+    35,    36,    33,    36,    24,    13,    2,     -11,   -19,   -18,   -18,
+    -10,   -6,    -4,    0,     -3,    -3,    -15,   -18,   -17,   -9,    -7,
+    2,     5,     7,     6,     2,     -2,    -12,   -16,   -16,   -9,    -3,
+    6,     8,     15,    17,    16,    18,    11,    5,     -4,    -8,    -17,
+    -16,   -22,   -24,   -25,   -28,   -23,   -19,   -11,   -3,    5,     11,
+    22,    26,    29,    24,    14,    12,    7,     6,     -2,    -1,    2,
+    10,    23,    33,    36,    32,    31,    16,    3,     -4,    -3,    -3,
+    1,     8,     11,    13,    12,    8,     3,     5,     3,     1,     -1,
+    4,     2,     3,     8,     5,     5,     1,     -2,    -1,    -3,    -1,
+    5,     8,     10,    17,    17,    15,    19,    27,    18,    21,    23,
+    19,    20,    15,    1,     -7,    -18,   -24,   -24,   -33,   -28,   -32,
+    -30,   -30,   -30,   -30,   -29,   -30,   -41,   -43,   -50,   -51,   -49,
+    -42,   -32,   -19,   -10,   0,     4,     -2,    5,     9,     8,     12,
+    19,    17,    10,    9,     3,     1,     -4,    -8,    -4,    0,     5,
+    7,     10,    9,     12,    0,     -6,    -7,    -13,   -16,   -10,   -10,
+    -9,    -1,    -1,    -2,    -6,    -11,   -14,   -17,   -18,   -10,   -3,
+    -3,    0,     6,     1,     6,     4,     3,     3,     9,     16,    22,
+    28,    27,    32,    18,    21,    25,    20,    21,    18,    18,    22,
+    23,    15,    8,     -3,    -9,    -10,   -13,   -8,    3,     7,     18,
+    26,    23,    26,    30,    17,    11,    9,     -1,    0,     2,     2,
+    12,    15,    6,     1,     0,     -5,    2,     1,     -3,    -1,    -6,
+    -2,    -4,    -11,   -18,   -30,   -38,   -36,   -33,   -32,   -27,   -19,
+    -18,   -14,   -13,   -16,   -11,   -12,   -12,   -4,    0,     7,     13,
+    13,    10,    11,    6,     3,     3,     3,     4,     10,    4,     -1,
+    -3,    -11,   -21,   -27,   -34,   -33,   -31,   -33,   -28,   -22,   -21,
+    -14,   -8,    -13,   -10,   -8,    -12,   -7,    -11,   -3,    3,     5,
+    7,     7,     -1,    -12,   -13,   -17,   -21,   -8,    -2,    4,     7,
+    13,    18,    18,    16,    15,    13,    11,    15,    13,    12,    17,
+    18,    15,    15,    11,    -3,    -1,    2,     11,    15,    10,    18,
+    13,    10,    12,    9,     2,     2,     4,     -1,    6,     9,     11,
+    5,     7,     13,    8,     9,     10,    11,    9,     7,     11,    5,
+    3,     1,     -9,    -19,   -31,   -40,   -42,   -33,   -27,   -24,   -22,
+    -20,   -25,   -20,   -12,   -17,   -23,   -23,   -25,   -25,   -20,   -18,
+    -17,   -19,   -15,   -22,   -20,   -19,   -13,   -8,    -12,   0,     2,
+    -6,    -1,    -5,    -15,   -10,   -12,   -19,   -8,    -6,    -3,    9,
+    5,     12,    22,    10,    9,     12,    5,     8,     28,    13,    20,
+    25,    11,    16,    19,    10,    15,    14,    6,     23,    19,    18,
+    32,    17,    12,    19,    -1,    -8,    11,    -4,    -8,    9,     -4,
+    -6,    0,     -10,   -7,    -3,    -8,    -11,   -11,   -23,   -7,    -4,
+    -4,    14,    6,     4,     9,     3,     -4,    4,     2,     9,     26,
+    19,    26,    33,    22,    22,    24,    13,    20,    18,    18,    28,
+    28,    19,    24,    16,    -1,    1,     -12,   -34,   -28,   -25,   -27,
+    -13,   6,     8,     21,    25,    22,    19,    3,     4,     0,     -5,
+    6,     8,     1,     6,     8,     -4,    -3,    -10,   -23,   -17,   -9,
+    -10,   3,     6,     -1,    3,     -10,   -22,   -28,   -49,   -49,   -36,
+    -29,   -10,   8,     -1,    4,     14,    -3,    -14,   -5,    -16,   -10,
+    8,     7,     21,    24,    17,    25,    15,    -4,    13,    -7,    -23,
+    0,     -7,    -14,   12,    1,     -18,   -10,   -27,   -43,   -31,   -34,
+    -19,   -3,    -10,   15,    20,    -7,    10,    9,     -20,   7,     28,
+    14,    42,    54,    32,    34,    24,    5,     10,    -11,   -13,   11,
+    -6,    -4,    31,    7,     0,     34,    3,     -9,    5,     -24,   -33,
+    -14,   -11,   -1,    8,     0,     10,    7,     -7,    11,    10,    -6,
+    17,    16,    0,     10,    3,     -26,   -23,   -33,   -39,   -26,   -29,
+    -18,   -6,    -9,    -1,    5,     -11,   -6,    7,     -6,    1,     13,
+    8,     1,     3,     -13,   -23,   -25,   -33,   -28,   -21,   -9,    2,
+    4,     1,     8,     4,     -13,   -5,    -12,   -14,   3,     14,    18,
+    26,    30,    21,    20,    15,    15,    10,    5,     13,    11,    20,
+    25,    29,    18,    19,    9,     -10,   -15,   -13,   -12,   1,     16,
+    20,    30,    39,    37,    21,    15,    3,     -7,    -9,    -1,    2,
+    -6,    -7,    -10,   -20,   -19,   -19,   -31,   -25,   -12,   -15,   -13,
+    -17,   -18,   -14,   -24,   -24,   -18,   -28,   -24,   -3,    1,     17,
+    46,    48,    43,    46,    34,    12,    6,     -14,   -19,   -10,   -14,
+    3,     15,    3,     7,     7,     -13,   4,     9,     -2,    3,     22,
+    19,    25,    41,    48,    46,    36,    42,    40,    24,    33,    50,
+    29,    30,    57,    35,    13,    29,    17,    -9,    5,     15,    7,
+    13,    38,    47,    40,    56,    72,    42,    29,    40,    18,    14,
+    36,    52,    50,    58,    55,    42,    22,    20,    13,    -8,    8,
+    32,    26,    41,    70,    48,    51,    65,    36,    27,    23,    4,
+    5,     1,     -3,    2,     -8,    -23,   -6,    -30,   -46,   -24,   -40,
+    -45,   -22,   -32,   -35,   -24,   -50,   -41,   -35,   -56,   -38,   -29,
+    -55,   -25,   -7,    -40,   -26,   -25,   -63,   -51,   -40,   -61,   -47,
+    -38,   -38,   -5,    2,     3,     26,    -1,    -7,    8,     -20,   -17,
+    10,    -14,   -6,    41,    24,    27,    52,    26,    13,    25,    5,
+    -6,    2,     -7,    -2,    10,    4,     29,    36,    30,    74,    93,
+    91,    131,   150,   132,   167,   177,   158,   189,   188,   178,   200,
+    199,   187,   212,   202,   188,   210,   188,   173,   187,   175,   183,
+    215,   218,   236,   264,   253,   279,   296,   275,   290,   288,   261,
+    261,   261,   230,   216,   199,   157,   160,   147,   115,   108,   84,
+    50,    32,    7,     -30,   -56,   -96,   -130,  -146,  -179,  -199,  -223,
+    -255,  -280,  -293,  -326,  -341,  -352,  -391,  -410,  -429,  -464,  -489,
+    -507,  -538,  -559,  -577,  -602,  -634,  -656,  -679,  -696,  -702,  -700,
+    -699,  -700,  -687,  -666,  -665,  -656,  -634,  -626,  -609,  -572,  -539,
+    -518,  -484,  -462,  -444,  -418,  -390,  -364,  -336,  -295,  -245,  -210,
+    -175,  -127,  -97,   -63,   -28,   10,    45,    83,    121,   167,   222,
+    272,   324,   369,   396,   439,   485,   502,   536,   571,   585,   618,
+    656,   676,   705,   729,   744,   767,   776,   786,   798,   796,   813,
+    849,   855,   865,   883,   862,   843,   834,   794,   781,   778,   767,
+    746,   744,   721,   702,   681,   638,   607,   562,   521,   490,   447,
+    398,   361,   313,   255,   204,   123,   20,    -59,   -143,  -217,  -270,
+    -328,  -400,  -462,  -529,  -607,  -666,  -737,  -797,  -854,  -906,  -936,
+    -944,  -955,  -965,  -976,  -993,  -1003, -1007, -1032, -1040, -1045, -1055,
+    -1039, -1016, -1003, -990,  -995,  -1026, -1046, -1070, -1079, -1058, -1060,
+    -1062, -1028, -1010, -1006, -991,  -1000, -1004, -987,  -981,  -958,  -921,
+    -890,  -852,  -798,  -754,  -713,  -681,  -682,  -658,  -617,  -585,  -524,
+    -452,  -404,  -332,  -258,  -224,  -183,  -144,  -132,  -94,   -64,   -31,
+    37,    99,    147,   219,   280,   329,   389,   439,   483,   563,   632,
+    702,   799,   884,   965,   1050,  1107,  1150,  1209,  1260,  1308,  1383,
+    1446,  1514,  1582,  1632,  1679,  1727,  1770,  1804,  1837,  1872,  1916,
+    1961,  1999,  2038,  2071,  2089,  2097,  2107,  2091,  2084,  2072,  2051,
+    2021,  1998,  1940,  1868,  1814,  1734,  1641,  1559,  1480,  1395,  1305,
+    1213,  1115,  1015,  901,   785,   667,   520,   381,   256,   110,   -26,
+    -141,  -284,  -417,  -528,  -670,  -805,  -935,  -1080, -1206, -1324, -1438,
+    -1527, -1622, -1725, -1798, -1879, -1956, -2006, -2063, -2128, -2166, -2201,
+    -2238, -2257, -2292, -2316, -2337, -2357, -2356, -2362, -2382, -2375, -2368,
+    -2367, -2358, -2337, -2329, -2318, -2296, -2273, -2240, -2195, -2140, -2095,
+    -2044, -1990, -1932, -1872, -1803, -1737, -1673, -1602, -1520, -1428, -1325,
+    -1219, -1112, -1006, -896,  -780,  -681,  -591,  -481,  -388,  -294,  -189,
+    -85,   30,    148,   252,   348,   466,   579,   692,   811,   918,   1041,
+    1162,  1271,  1389,  1507,  1611,  1735,  1864,  1965,  2085,  2203,  2312,
+    2436,  2536,  2614,  2697,  2760,  2812,  2886,  2956,  3010,  3066,  3088,
+    3098,  3120,  3110,  3101,  3106,  3108,  3130,  3149,  3139,  3122,  3085,
+    3016,  2951,  2874,  2770,  2671,  2559,  2435,  2315,  2198,  2059,  1915,
+    1761,  1570,  1387,  1185,  984,   787,   601,   413,   224,   40,    -158,
+    -348,  -560,  -760,  -960,  -1147, -1312, -1471, -1621, -1779, -1925, -2069,
+    -2206, -2333, -2463, -2570, -2664, -2743, -2811, -2860, -2886, -2934, -2976,
+    -3015, -3057, -3074, -3076, -3079, -3060, -3032, -2998, -2950, -2920, -2893,
+    -2863, -2837, -2806, -2761, -2715, -2662, -2607, -2554, -2486, -2402, -2325,
+    -2264, -2190, -2127, -2063, -1989, -1932, -1862, -1788, -1724, -1640, -1545,
+    -1455, -1346, -1234, -1112, -984,  -859,  -735,  -610,  -494,  -384,  -280,
+    -176,  -68,   40,    140,   244,   363,   478,   596,   739,   876,   1001,
+    1128,  1240,  1352,  1474,  1595,  1717,  1853,  1972,  2093,  2215,  2328,
+    2432,  2533,  2641,  2744,  2855,  2949,  3055,  3157,  3242,  3329,  3415,
+    3479,  3528,  3569,  3588,  3617,  3649,  3676,  3708,  3747,  3751,  3753,
+    3744,  3693,  3640,  3576,  3470,  3369,  3248,  3098,  2976,  2838,  2690,
+    2557,  2395,  2222,  2055,  1872,  1675,  1488,  1279,  1057,  851,   623,
+    393,   180,   -74,   -315,  -537,  -771,  -979,  -1161, -1373, -1558, -1729,
+    -1932, -2110, -2294, -2478, -2636, -2785, -2917, -3007, -3094, -3183, -3247,
+    -3319, -3402, -3450, -3510, -3564, -3595, -3622, -3635, -3627, -3635, -3639,
+    -3620, -3620, -3610, -3596, -3581, -3535, -3495, -3455, -3410, -3361, -3323,
+    -3265, -3202, -3141, -3078, -3001, -2919, -2830, -2739, -2640, -2540, -2430,
+    -2320, -2192, -2057, -1909, -1761, -1603, -1422, -1244, -1059, -887,  -726,
+    -570,  -425,  -256,  -92,   69,    238,   411,   557,   728,   910,   1066,
+    1229,  1403,  1561,  1727,  1895,  2050,  2208,  2352,  2492,  2638,  2765,
+    2893,  3025,  3145,  3263,  3387,  3496,  3595,  3707,  3804,  3884,  3975,
+    4046,  4105,  4167,  4204,  4220,  4237,  4243,  4247,  4260,  4255,  4251,
+    4246,  4201,  4143,  4092,  3996,  3885,  3772,  3604,  3435,  3283,  3086,
+    2923,  2742,  2535,  2341,  2130,  1887,  1649,  1411,  1137,  915,   659,
+    398,   163,   -81,   -351,  -580,  -814,  -1069, -1262, -1476, -1689, -1850,
+    -2043, -2237, -2395, -2591, -2763, -2918, -3095, -3224, -3319, -3435, -3508,
+    -3582, -3698, -3772, -3858, -3950, -4008, -4047, -4088, -4093, -4085, -4098,
+    -4064, -4052, -4057, -4033, -4028, -4018, -3991, -3971, -3933, -3865, -3802,
+    -3727, -3633, -3562, -3477, -3392, -3300, -3210, -3115, -3018, -2924, -2819,
+    -2721, -2606, -2490, -2381, -2246, -2111, -1963, -1810, -1638, -1460, -1293,
+    -1132, -980,  -828,  -666,  -496,  -322,  -125,  72,    264,   470,   676,
+    879,   1087,  1280,  1457,  1633,  1799,  1970,  2152,  2327,  2501,  2678,
+    2840,  3007,  3165,  3301,  3434,  3558,  3667,  3791,  3912,  4023,  4140,
+    4257,  4359,  4475,  4554,  4614,  4656,  4682,  4697,  4726,  4749,  4775,
+    4810,  4812,  4812,  4810,  4768,  4697,  4620,  4502,  4368,  4210,  4031,
+    3860,  3663,  3472,  3291,  3076,  2849,  2642,  2392,  2140,  1890,  1610,
+    1325,  1064,  782,   494,   231,   -50,   -329,  -593,  -861,  -1112, -1345,
+    -1588, -1812, -2022, -2257, -2467, -2682, -2924, -3126, -3317, -3495, -3630,
+    -3737, -3855, -3941, -4031, -4128, -4200, -4281, -4348, -4388, -4427, -4449,
+    -4444, -4450, -4458, -4452, -4464, -4460, -4451, -4444, -4425, -4384, -4344,
+    -4289, -4234, -4160, -4076, -4000, -3917, -3837, -3753, -3669, -3558, -3460,
+    -3354, -3230, -3111, -2966, -2824, -2665, -2495, -2333, -2151, -1951, -1752,
+    -1554, -1367, -1222, -1053, -882,  -716,  -520,  -331,  -141,  62,    270,
+    476,   707,   923,   1133,  1349,  1534,  1735,  1943,  2124,  2317,  2511,
+    2668,  2839,  3002,  3140,  3317,  3481,  3615,  3771,  3920,  4050,  4196,
+    4319,  4430,  4556,  4657,  4765,  4868,  4945,  4999,  5057,  5075,  5100,
+    5123,  5133,  5134,  5127,  5104,  5084,  5058,  4968,  4896,  4750,  4575,
+    4381,  4179,  3971,  3776,  3590,  3394,  3209,  2991,  2800,  2535,  2269,
+    1972,  1654,  1319,  998,   697,   384,   105,   -187,  -476,  -759,  -1047,
+    -1316, -1579, -1841, -2085, -2317, -2550, -2745, -2938, -3145, -3326, -3523,
+    -3706, -3859, -3998, -4124, -4218, -4288, -4346, -4386, -4437, -4495, -4550,
+    -4619, -4680, -4732, -4779, -4813, -4820, -4842, -4825, -4791, -4773, -4742,
+    -4715, -4709, -4683, -4652, -4605, -4527, -4428, -4315, -4194, -4086, -3978,
+    -3872, -3779, -3685, -3569, -3458, -3313, -3121, -2921, -2693, -2454, -2230,
+    -1998, -1783, -1588, -1414, -1240, -1069, -886,  -690,  -473,  -256,  -36,
+    170,   384,   594,   797,   1015,  1235,  1449,  1664,  1882,  2098,  2311,
+    2504,  2681,  2843,  3019,  3171,  3337,  3534,  3709,  3885,  4072,  4235,
+    4380,  4524,  4641,  4746,  4864,  4979,  5087,  5213,  5308,  5393,  5450,
+    5468,  5475,  5472,  5452,  5462,  5467,  5453,  5451,  5425,  5342,  5255,
+    5113,  4914,  4725,  4512,  4273,  4053,  3866,  3632,  3436,  3205,  2955,
+    2705,  2420,  2095,  1794,  1503,  1195,  941,   639,   342,   56,    -269,
+    -601,  -894,  -1208, -1499, -1736, -1994, -2239, -2426, -2652, -2891, -3099,
+    -3361, -3588, -3793, -4013, -4183, -4302, -4439, -4523, -4613, -4734, -4809,
+    -4891, -4999, -5056, -5090, -5131, -5092, -5061, -5044, -4987, -4954, -4955,
+    -4924, -4911, -4873, -4809, -4755, -4673, -4555, -4440, -4316, -4187, -4088,
+    -3986, -3881, -3802, -3717, -3605, -3495, -3359, -3207, -3063, -2889, -2698,
+    -2504, -2306, -2088, -1861, -1627, -1415, -1201, -1000, -799,  -593,  -410,
+    -220,  -7,    203,   412,   634,   865,   1126,  1367,  1602,  1838,  2052,
+    2257,  2474,  2659,  2863,  3076,  3255,  3429,  3617,  3773,  3939,  4102,
+    4222,  4358,  4501,  4611,  4733,  4846,  4939,  5056,  5147,  5217,  5301,
+    5357,  5388,  5428,  5417,  5400,  5430,  5422,  5406,  5442,  5446,  5431,
+    5437,  5381,  5304,  5212,  5057,  4874,  4683,  4465,  4249,  4026,  3767,
+    3545,  3304,  3021,  2741,  2450,  2113,  1807,  1490,  1151,  841,   544,
+    212,   -102,  -439,  -788,  -1091, -1413, -1730, -2033, -2336, -2627, -2854,
+    -3118, -3350, -3560, -3781, -4008, -4194, -4376, -4524, -4640, -4757, -4865,
+    -4945, -5016, -5083, -5131, -5170, -5184, -5198, -5208, -5211, -5210, -5209,
+    -5192, -5174, -5154, -5108, -5052, -5002, -4932, -4854, -4780, -4704, -4604,
+    -4514, -4421, -4309, -4208, -4111, -4004, -3880, -3751, -3622, -3496, -3367,
+    -3210, -3047, -2867, -2654, -2430, -2177, -1897, -1651, -1417, -1182, -983,
+    -793,  -593,  -406,  -211,  17,    232,   461,   716,   958,   1197,  1441,
+    1674,  1899,  2130,  2355,  2573,  2788,  3004,  3220,  3419,  3612,  3809,
+    3973,  4120,  4277,  4433,  4573,  4742,  4902,  5037,  5165,  5282,  5377,
+    5460,  5539,  5596,  5654,  5716,  5741,  5759,  5770,  5776,  5762,  5751,
+    5737,  5706,  5675,  5644,  5550,  5446,  5324,  5169,  4974,  4767,  4530,
+    4289,  4067,  3823,  3621,  3391,  3145,  2878,  2575,  2228,  1890,  1525,
+    1149,  807,   473,   145,   -152,  -454,  -769,  -1057, -1374, -1703, -2033,
+    -2372, -2701, -2977, -3258, -3495, -3694, -3897, -4089, -4270, -4483, -4668,
+    -4840, -5015, -5140, -5225, -5304, -5334, -5350, -5390, -5398, -5403, -5428,
+    -5438, -5449, -5472, -5463, -5441, -5401, -5333, -5252, -5151, -5051, -4974,
+    -4880, -4805, -4729, -4626, -4526, -4403, -4248, -4088, -3939, -3778, -3617,
+    -3464, -3308, -3173, -3027, -2852, -2669, -2461, -2233, -1979, -1713, -1455,
+    -1216, -996,  -796,  -610,  -397,  -198,  21,    272,   517,   775,   1037,
+    1295,  1544,  1790,  2007,  2211,  2423,  2634,  2848,  3081,  3319,  3551,
+    3792,  4000,  4171,  4303,  4418,  4518,  4596,  4679,  4807,  4913,  5044,
+    5172,  5288,  5405,  5518,  5609,  5664,  5713,  5735,  5735,  5737,  5701,
+    5691,  5656,  5633,  5611,  5552,  5475,  5394,  5293,  5177,  5064,  4924,
+    4737,  4599,  4420,  4237,  4048,  3828,  3623,  3413,  3183,  2915,  2622,
+    2308,  1980,  1657,  1261,  901,   549,   205,   -85,   -383,  -688,  -969,
+    -1246, -1530, -1850, -2206, -2561, -2915, -3224, -3482, -3713, -3921, -4107,
+    -4287, -4470, -4660, -4850, -5057, -5239, -5395, -5540, -5619, -5697, -5724,
+    -5697, -5675, -5633, -5590, -5579, -5530, -5486, -5442, -5426, -5391, -5348,
+    -5276, -5197, -5124, -5039, -4925, -4808, -4677, -4581, -4479, -4343, -4218,
+    -4087, -3970, -3858, -3729, -3570, -3384, -3206, -3020, -2839, -2636, -2453,
+    -2287, -2185, -2154, -1926, -1562, -1223, -758,  -473,  -64,   395,   599,
+    880,   814,   938,   1172,  1498,  1928,  2127,  2422,  2608,  2841,  2937,
+    2886,  2815,  2985,  3324,  3757,  4152,  4481,  4652,  4917,  4965,  4766,
+    4583,  4328,  4503,  4815,  5118,  5408,  5682,  5956,  6082,  6055,  5744,
+    5426,  5341,  5427,  5606,  5882,  6065,  6226,  6428,  6477,  6385,  6009,
+    5728,  5552,  5439,  5339,  5200,  5008,  4947,  4835,  4614,  4330,  3887,
+    3521,  3111,  2460,  1983,  1297,  650,   279,   -353,  -720,  -1044, -1518,
+    -1668, -2117, -2496, -2743, -3266, -3607, -3790, -4149, -4075, -4042, -4096,
+    -3981, -4138, -4226, -4214, -4503, -4455, -4577, -4642, -4346, -4351, -4270,
+    -4263, -4522, -4521, -4673, -4814, -4731, -4950, -5011, -5004, -5288, -5341,
+    -5566, -5833, -5783, -5929, -5847, -5765, -5828, -5644, -5613, -5615, -5428,
+    -5291, -5014, -4554, -4277, -3964, -3854, -3829, -3612, -3603, -3438, -3137,
+    -2831, -2164, -1438, -939,  -330,  -156,  46,    242,   73,    242,   220,
+    239,   542,   565,   739,   872,   801,   857,   676,   543,   586,   567,
+    828,   1142,  1490,  1985,  2508,  2982,  3438,  3699,  3939,  4069,  4178,
+    4420,  4622,  4917,  5338,  5801,  6285,  6658,  6963,  7213,  7233,  7328,
+    7176,  7038,  7031,  6860,  6957,  6767,  6599,  6523,  6212,  6147,  6063,
+    5860,  6020,  6015,  6033,  6184,  5722,  5607,  5016,  4337,  4063,  3229,
+    3080,  3006,  2804,  3035,  2541,  2136,  1879,  1012,  401,   -575,  -1584,
+    -1930, -2278, -2485, -2477, -2712, -2747, -2766, -3320, -3592, -4188, -4669,
+    -4672, -4939, -4789, -4426, -4203, -3674, -3563, -3656, -3759, -4067, -4257,
+    -4522, -4970, -5204, -5237, -5139, -4907, -4911, -4917, -4921, -5007, -5230,
+    -5654, -6122, -6464, -6733, -6948, -7067, -6972, -6800, -6520, -6132, -5830,
+    -5382, -5091, -4797, -4546, -4472, -4362, -4350, -4235, -3851, -3454, -3144,
+    -2735, -2341, -1845, -1262, -958,  -549,  -166,  66,    382,   366,   352,
+    341,   85,    -13,   -176,  -303,  -235,  -341,  -309,  -227,  -249,  -50,
+    143,   384,   874,   1149,  1552,  2155,  2767,  3499,  3994,  4460,  4920,
+    5288,  5569,  5704,  5881,  6094,  6461,  6653,  6803,  7115,  7311,  7521,
+    7612,  7443,  7380,  7124,  6742,  6495,  5964,  5656,  5415,  5167,  5656,
+    5813,  6027,  6401,  6351,  6787,  7019,  6581,  6512,  5965,  5308,  5140,
+    4336,  4147,  3899,  3398,  3360,  2830,  2624,  1968,  1026,  395,   -699,
+    -1424, -2327, -3006, -3192, -3435, -3337, -3686, -3513, -3350, -3502, -3261,
+    -3878, -4005, -4063, -4187, -3767, -3598, -3384, -3300, -3094, -2857, -3023,
+    -3274, -3851, -4352, -4523, -4943, -5477, -5612, -5682, -5733, -5714, -5965,
+    -6110, -5950, -6158, -6548, -6897, -7165, -7281, -7352, -7258, -7185, -6659,
+    -5946, -5470, -4738, -4046, -3707, -3210, -3108, -3270, -3227, -3222, -3218,
+    -3017, -2943, -2668, -2296, -1593, -1061, -811,  -403,  -513,  -361,  -128,
+    -595,  -633,  -991,  -1205, -1159, -1284, -1330, -1164, -999,  -729,  -538,
+    -336,  27,    350,   794,   1245,  1646,  2446,  3210,  4017,  4835,  5271,
+    5739,  6028,  6140,  6212,  6161,  6066,  5984,  6081,  5995,  6152,  6301,
+    6278,  6424,  6377,  6396,  6362,  6152,  5788,  5309,  5071,  4860,  4704,
+    4804,  4919,  5258,  5869,  6121,  6365,  6694,  6692,  6694,  6532,  6187,
+    5808,  5704,  5302,  4816,  4611,  4043,  3775,  3249,  2600,  1933,  982,
+    336,   -848,  -1538, -2242, -3103, -3374, -3756, -3975, -4017, -4061, -3972,
+    -3749, -3609, -3853, -3850, -3714, -3760, -3736, -3914, -3923, -3830, -3541,
+    -3649, -3757, -3661, -3913, -4038, -4231, -4594, -4769, -5009, -5273, -5588,
+    -5676, -5937, -5997, -6060, -6164, -6414, -6623, -6765, -6857, -6771, -6921,
+    -6914, -6535, -6187, -5626, -5206, -4742, -4189, -3618, -3120, -2823, -2606,
+    -2550, -2703, -2736, -2626, -2498, -2406, -2133, -1852, -1348, -753,  -318,
+    162,   330,   524,   375,   9,     -204,  -866,  -1249, -1532, -1669, -1455,
+    -1235, -723,  -283,  262,   535,   862,   1340,  1712,  2316,  2625,  3171,
+    4015,  4698,  5516,  6006,  6452,  6838,  6921,  7003,  6735,  6339,  6138,
+    5768,  5575,  5593,  5568,  5728,  6041,  6233,  6260,  6175,  6048,  5728,
+    5366,  4931,  4340,  4194,  4174,  4330,  4743,  5028,  5754,  6250,  6598,
+    7120,  7114,  6962,  6675,  6157,  5373,  4797,  4081,  3237,  3153,  2588,
+    2143,  1639,  1021,  681,   -149,  -816,  -1987, -3003, -3493, -4138, -4420,
+    -4607, -4841, -4725, -4254, -4033, -3845, -3842, -4063, -4035, -4099, -4582,
+    -4718, -4779, -4689, -4437, -4327, -4352, -4119, -3881, -4061, -4345, -4768,
+    -5248, -5610, -5920, -6383, -6779, -6731, -6673, -6677, -6597, -6659, -6619,
+    -6417, -6516, -6862, -7017, -7069, -6944, -6715, -6376, -6000, -5162, -4333,
+    -3577, -2884, -2355, -1807, -1366, -1380, -1590, -1869, -1962, -1945, -2006,
+    -2141, -1960, -1516, -1025, -471,  -135,  85,    348,   239,   -8,    -475,
+    -951,  -1245, -1520, -1569, -1448, -1188, -517,  134,   827,   1585,  2114,
+    2792,  3214,  3651,  4230,  4546,  4894,  5321,  5588,  6105,  6583,  6877,
+    7014,  7087,  7068,  6876,  6695,  6280,  5684,  5385,  5205,  5064,  5033,
+    5028,  5080,  5322,  5510,  5461,  5390,  5541,  5494,  5443,  5306,  5065,
+    5193,  5338,  5513,  5818,  5911,  6345,  6506,  6514,  6543,  5981,  5703,
+    5082,  4228,  3517,  2424,  1880,  1245,  562,   -130,  -864,  -1156, -1561,
+    -1970, -2597, -3357, -3707, -4189, -4521, -4975, -5477, -5478, -5585, -5445,
+    -5353, -5327, -4971, -4580, -4431, -4469, -4432, -4422, -4275, -4227, -4507,
+    -4745, -4758, -4752, -4845, -4933, -5118, -5117, -5124, -5324, -5673, -5971,
+    -6152, -6366, -6702, -6970, -7159, -7136, -6929, -6917, -6703, -6520, -6302,
+    -5794, -5484, -5123, -4694, -4254, -3722, -3334, -2917, -2410, -1721, -1010,
+    -584,  -312,  27,    321,   327,   214,   -17,   -363,  -402,  -550,  -638,
+    -469,  -315,  -86,   142,   242,   387,   448,   458,   423,   321,   194,
+    285,   417,   717,   1176,  1673,  2402,  3144,  3985,  4764,  5406,  6056,
+    6507,  6783,  6891,  6868,  6850,  6717,  6532,  6359,  6248,  6303,  6279,
+    6140,  6071,  5927,  5687,  5480,  5146,  4835,  4572,  4447,  4481,  4578,
+    4840,  4936,  5246,  5659,  5732,  5856,  5658,  5403,  5282,  5004,  4949,
+    4843,  4681,  4884,  4886,  4967,  5108,  4781,  4647,  4240,  3443,  2768,
+    1830,  983,   309,   -769,  -1382, -1987, -2553, -2750, -3346, -3555, -4052,
+    -4400, -4599, -5196, -5437, -5945, -6340, -6343, -6554, -6611, -6381, -6184,
+    -5681, -5398, -5098, -4751, -4529, -4138, -4100, -4088, -4044, -4186, -4189,
+    -4263, -4453, -4465, -4598, -4651, -4726, -4919, -4926, -5142, -5286, -5490,
+    -5831, -6002, -6341, -6492, -6562, -6710, -6553, -6506, -6219, -5766, -5521,
+    -5008, -4556, -4002, -3293, -2769, -2069, -1467, -824,  -34,   509,   1034,
+    1385,  1560,  1650,  1664,  1419,  1016,  834,   511,   353,   381,   299,
+    523,   833,   956,   1280,  1492,  1425,  1547,  1350,  1143,  1114,  931,
+    1054,  1217,  1583,  2217,  2917,  4017,  4965,  5827,  6816,  7393,  7875,
+    8197,  8175,  7924,  7578,  7040,  6566,  6242,  5746,  5530,  5334,  5222,
+    5237,  5074,  5146,  5011,  4902,  4753,  4442,  4482,  4254,  4247,  4319,
+    4187,  4516,  4690,  4935,  5193,  5229,  5350,  5332,  5486,  5386,  5143,
+    4999,  4494,  4304,  3961,  3421,  2781,  2032,  1404,  614,   -88,   -956,
+    -1714, -2155, -2684, -3038, -3237, -3368, -3423, -3569, -3809, -4213, -4533,
+    -4973, -5514, -6011, -6663, -7084, -7258, -7158, -6947, -6639, -6111, -5548,
+    -4887, -4362, -4043, -3895, -3940, -4107, -4452, -4836, -5143, -5500, -5532,
+    -5510, -5485, -5096, -4739, -4375, -4065, -4063, -4094, -4252, -4576, -4904,
+    -5431, -5837, -6190, -6402, -6310, -6292, -5992, -5516, -5025, -4342, -3899,
+    -3386, -2697, -2077, -1493, -994,  -392,  232,   931,   1608,  1988,  2360,
+    2589,  2639,  2623,  2471,  2121,  1708,  1478,  1181,  1167,  1296,  1279,
+    1648,  1859,  2107,  2368,  2359,  2390,  2122,  1904,  1629,  1418,  1502,
+    1524,  1859,  2357,  3041,  3909,  4810,  5751,  6449,  7128,  7534,  7767,
+    7908,  7699,  7460,  7032,  6647,  6301,  5876,  5556,  5190,  4948,  4762,
+    4576,  4464,  4370,  4338,  4275,  4287,  4265,  4320,  4221,  4066,  3947,
+    3514,  3379,  3003,  2635,  2534,  2078,  2040,  1950,  1958,  2152,  2085,
+    2390,  2321,  2319,  2359,  1851,  1643,  877,   168,   -527,  -1245, -1704,
+    -2519, -2739, -3251, -3382, -3236, -3527, -3294, -3523, -3732, -3916, -4434,
+    -4888, -5615, -6161, -6729, -7283, -7543, -7920, -7865, -7660, -7430, -7034,
+    -6758, -6224, -5866, -5441, -5076, -4998, -4760, -4673, -4539, -4410, -4308,
+    -4131, -3992, -3791, -3611, -3448, -3213, -3070, -3046, -3048, -3168, -3244,
+    -3354, -3607, -3834, -4170, -4439, -4648, -4864, -4892, -4928, -4821, -4524,
+    -4211, -3576, -2819, -1968, -929,  -19,   1029,  2064,  2949,  3716,  4159,
+    4450,  4536,  4503,  4301,  3968,  3655,  3242,  2979,  2856,  2744,  2750,
+    2771,  2749,  2859,  2850,  2793,  2702,  2402,  2179,  1877,  1672,  1581,
+    1543,  1769,  1967,  2485,  3089,  3783,  4662,  5406,  6246,  6950,  7542,
+    8016,  8200,  8245,  8027,  7584,  6958,  6241,  5494,  4710,  3974,  3255,
+    2653,  2274,  2038,  1986,  1964,  2141,  2321,  2513,  2772,  2756,  2743,
+    2636,  2406,  2125,  1836,  1456,  1247,  1145,  995,   1077,  1140,  1290,
+    1561,  1685,  1762,  1609,  1391,  1147,  544,   84,    -754,  -1546, -2107,
+    -2806, -3137, -3522, -3732, -3826, -3834, -3609, -3493, -3340, -3254, -3499,
+    -3621, -3981, -4455, -4859, -5513, -6080, -6626, -7061, -7372, -7556, -7573,
+    -7515, -7366, -7091, -6799, -6366, -5887, -5484, -5098, -4746, -4334, -3941,
+    -3558, -3269, -3053, -2844, -2663, -2497, -2314, -2227, -2185, -2141, -2139,
+    -2070, -2037, -2031, -2062, -2205, -2348, -2544, -2774, -2979, -3298, -3520,
+    -3647, -3622, -3395, -3054, -2513, -1829, -948,  64,    1090,  2169,  3127,
+    3987,  4712,  5229,  5560,  5754,  5741,  5619,  5401,  5005,  4666,  4287,
+    3967,  3734,  3476,  3322,  3203,  3147,  3144,  3116,  3080,  3011,  2871,
+    2735,  2544,  2363,  2245,  2075,  2032,  2118,  2263,  2688,  3066,  3605,
+    4244,  4746,  5384,  5819,  6151,  6319,  6194,  5938,  5495,  4929,  4305,
+    3581,  2924,  2279,  1713,  1372,  1086,  1006,  983,   1006,  1146,  1249,
+    1349,  1360,  1231,  1084,  794,   502,   264,   -85,   -238,  -411,  -504,
+    -394,  -322,  -51,   188,   420,   589,   624,   666,   573,   338,   -86,
+    -564,  -1056, -1560, -1925, -2434, -2806, -3017, -3341, -3320, -3375, -3480,
+    -3410, -3567, -3553, -3595, -3805, -3919, -4284, -4482, -4754, -5190, -5354,
+    -5806, -6050, -6136, -6387, -6343, -6330, -6206, -5851, -5468, -4960, -4549,
+    -4080, -3542, -3150, -2698, -2440, -2318, -2132, -2067, -2081, -2017, -2099,
+    -2151, -2060, -2067, -1916, -1823, -1718, -1523, -1386, -1221, -1189, -1141,
+    -1014, -1008, -966,  -996,  -1015, -916,  -809,  -648,  -467,  -128,  237,
+    735,   1358,  1969,  2697,  3399,  4060,  4732,  5295,  5720,  6077,  6169,
+    6139,  5928,  5614,  5292,  4766,  4247,  3705,  3262,  3030,  2827,  2702,
+    2684,  2728,  2887,  3092,  3216,  3310,  3313,  3214,  3098,  2873,  2620,
+    2343,  2031,  1799,  1589,  1491,  1537,  1645,  1913,  2210,  2548,  2922,
+    3295,  3650,  3951,  4100,  4099,  3972,  3740,  3421,  2948,  2427,  1762,
+    1136,  574,   44,    -330,  -642,  -846,  -852,  -751,  -520,  -229,  44,
+    272,   446,   502,   443,   329,   66,    -191,  -492,  -841,  -1002, -1240,
+    -1237, -1199, -1177, -936,  -867,  -660,  -456,  -508,  -464,  -706,  -997,
+    -1265, -1780, -2178, -2724, -3270, -3735, -4142, -4378, -4609, -4666, -4749,
+    -4575, -4355, -4137, -3767, -3563, -3218, -2970, -2834, -2630, -2716, -2776,
+    -2920, -3210, -3363, -3764, -4023, -4125, -4268, -4194, -4223, -4005, -3639,
+    -3258, -2891, -2644, -2297, -1987, -1751, -1587, -1570, -1485, -1415, -1342,
+    -1194, -1100, -889,  -613,  -267,  161,   482,   865,   1269,  1639,  2005,
+    2202,  2381,  2549,  2628,  2700,  2625,  2559,  2481,  2357,  2319,  2192,
+    2142,  2199,  2283,  2514,  2670,  2919,  3214,  3510,  3830,  3971,  4080,
+    4073,  3911,  3700,  3359,  2954,  2549,  2094,  1766,  1556,  1442,  1462,
+    1560,  1808,  2070,  2357,  2606,  2730,  2831,  2737,  2582,  2309,  1931,
+    1585,  1178,  834,   529,   288,   214,   218,   302,   470,   679,   944,
+    1211,  1420,  1562,  1674,  1631,  1548,  1355,  1072,  776,   375,   25,
+    -320,  -614,  -818,  -992,  -991,  -906,  -755,  -525,  -291,  -17,   225,
+    447,   528,   546,   466,   270,   96,    -205,  -536,  -861,  -1148, -1383,
+    -1586, -1688, -1814, -1783, -1772, -1745, -1630, -1611, -1505, -1488, -1462,
+    -1409, -1519, -1489, -1609, -1723, -1755, -1977, -2042, -2132, -2215, -2184,
+    -2268, -2205, -2170, -2107, -1978, -1990, -1909, -1886, -1943, -1997, -2152,
+    -2326, -2500, -2762, -2987, -3227, -3392, -3522, -3630, -3579, -3469, -3262,
+    -2916, -2555, -2103, -1581, -1090, -531,  -20,   457,   873,   1228,  1561,
+    1809,  1999,  2105,  2139,  2196,  2201,  2149,  2113,  2038,  1990,  1913,
+    1787,  1705,  1595,  1490,  1372,  1201,  1113,  998,   917,   917,   894,
+    961,   1007,  1098,  1321,  1470,  1681,  1882,  2067,  2317,  2465,  2626,
+    2750,  2777,  2783,  2694,  2569,  2431,  2142,  1843,  1597,  1306,  1069,
+    824,   622,   532,   430,   388,   357,   377,   438,   414,   481,   468,
+    431,   454,   383,   374,   305,   207,   187,   133,   157,   115,   113,
+    206,   244,   382,   475,   591,   753,   821,   916,   908,   855,   754,
+    577,   399,   123,   -159,  -399,  -647,  -784,  -923,  -1010, -965,  -918,
+    -806,  -647,  -504,  -355,  -253,  -179,  -130,  -138,  -156,  -262,  -339,
+    -401,  -552,  -600,  -671,  -697,  -662,  -673,  -616,  -597,  -522,  -495,
+    -513,  -490,  -624,  -701,  -804,  -961,  -1073, -1328, -1503, -1656, -1798,
+    -1801, -1913, -1863, -1785, -1720, -1453, -1309, -1051, -846,  -715,  -487,
+    -457,  -357,  -331,  -400,  -427,  -627,  -765,  -873,  -1021, -1105, -1255,
+    -1312, -1357, -1370, -1288, -1261, -1165, -1139, -1062, -917,  -808,  -680,
+    -597,  -452,  -277,  -104,  122,   312,   558,   771,   919,   1110,  1205,
+    1312,  1355,  1302,  1280,  1151,  1049,  946,   818,   733,   569,   451,
+    429,   388,   408,   387,   376,   426,   463,   542,   576,   632,   666,
+    673,   740,   766,   791,   845,   829,   857,   841,   822,   835,   796,
+    773,   671,   600,   560,   484,   460,   371,   311,   284,   242,   277,
+    261,   261,   277,   273,   358,   380,   410,   433,   435,   471,   432,
+    414,   386,   330,   294,   194,   149,   108,   69,    84,    69,    92,
+    83,    75,    88,    53,    12,    -96,   -194,  -269,  -369,  -438,  -523,
+    -553,  -528,  -500,  -392,  -277,  -136,  53,    240,   466,   678,   870,
+    1050,  1178,  1294,  1336,  1310,  1247,  1080,  916,   677,   387,   120,
+    -182,  -471,  -740,  -972,  -1148, -1273, -1343, -1402, -1363, -1263, -1129,
+    -922,  -724,  -518,  -288,  -79,   111,   250,   364,   405,   405,   395,
+    284,   199,   83,    -43,   -126,  -244,  -313,  -400,  -451,  -497,  -610,
+    -672,  -807,  -951,  -1087, -1325, -1517, -1736, -1929, -2086, -2260, -2318,
+    -2356, -2271, -2125, -1967, -1685, -1379, -1000, -598,  -238,  149,   481,
+    790,   1042,  1185,  1287,  1274,  1195,  1068,  868,   654,   386,   138,
+    -65,   -273,  -450,  -598,  -665,  -670,  -669,  -620,  -553,  -425,  -288,
+    -179,  -72,   15,    122,   205,   263,   324,   357,   435,   518,   603,
+    709,   779,   892,   1006,  1107,  1170,  1183,  1190,  1173,  1116,  1016,
+    890,   750,   628,   488,   331,   197,   95,    43,    25,    1,     22,
+    97,    209,   363,   495,   615,   724,   833,   937,   984,   990,   933,
+    884,   851,   747,   678,   573,   497,   469,   401,   391,   352,   339,
+    352,   337,   354,   361,   370,   402,   411,   418,   440,   468,   526,
+    576,   619,   683,   766,   857,   965,   1038,  1114,  1159,  1172,  1167,
+    1106,  1006,  840,   644,   426,   177,   -110,  -390,  -665,  -929,  -1160,
+    -1375, -1497, -1550, -1592, -1553, -1507, -1394, -1201, -1084, -863,  -685,
+    -540,  -322,  -234,  -68,   29,    59,    160,   141,   170,   140,   79,
+    77,    -11,   -53,   -179,  -274,  -327,  -480,  -564,  -736,  -884,  -995,
+    -1185, -1300, -1461, -1617, -1711, -1832, -1831, -1863, -1865, -1776, -1691,
+    -1516, -1353, -1168, -954,  -729,  -490,  -305,  -93,   81,    211,   322,
+    364,   392,   384,   332,   264,   146,   29,    -101,  -230,  -357,  -486,
+    -616,  -705,  -752,  -801,  -809,  -788,  -750,  -654,  -546,  -456,  -328,
+    -200,  -78,   45,    137,   232,   316,   388,   447,   485,   528,   578,
+    630,   697,   760,   835,   910,   988,   1068,  1124,  1154,  1157,  1166,
+    1163,  1116,  1070,  1024,  994,   986,   988,   1030,  1110,  1212,  1303,
+    1411,  1498,  1551,  1599,  1587,  1565,  1481,  1336,  1212,  1028,  847,
+    669,   466,   330,   187,   61,    -9,    -54,   -55,   -20,   11,    69,
+    133,   195,   244,   253,   225,   182,   133,   62,    -11,   -96,   -168,
+    -199,  -214,  -213,  -197,  -167,  -127,  -105,  -86,   -83,   -109,  -140,
+    -217,  -323,  -448,  -588,  -717,  -854,  -971,  -1086, -1185, -1211, -1227,
+    -1180, -1135, -1099, -992,  -918,  -788,  -704,  -651,  -562,  -542,  -470,
+    -421,  -431,  -391,  -429,  -386,  -344,  -336,  -260,  -257,  -162,  -61,
+    -6,    100,   120,   178,   215,   179,   132,   15,    -106,  -238,  -416,
+    -595,  -765,  -929,  -1066, -1170, -1252, -1278, -1290, -1258, -1173, -1114,
+    -1012, -945,  -868,  -741,  -695,  -612,  -547,  -494,  -388,  -332,  -225,
+    -110,  22,    182,   318,   496,   677,   835,   992,   1104,  1162,  1166,
+    1133,  1054,  916,   709,   430,   164,   -90,   -340,  -600,  -853,  -1033,
+    -1135, -1177, -1146, -1079, -946,  -746,  -500,  -208,  83,    377,   673,
+    950,   1183,  1356,  1503,  1627,  1707,  1735,  1708,  1678,  1668,  1645,
+    1588,  1494,  1419,  1354,  1291,  1194,  1052,  900,   718,   524,   325,
+    110,   -114,  -330,  -500,  -630,  -729,  -803,  -834,  -795,  -727,  -627,
+    -492,  -325,  -125,  54,    238,   393,   528,   642,   691,   706,   661,
+    585,   504,   380,   245,   87,    -61,   -195,  -320,  -435,  -556,  -663,
+    -742,  -814,  -883,  -952,  -1009, -1038, -1047, -1067, -1063, -1050, -1020,
+    -949,  -888,  -795,  -698,  -574,  -405,  -257,  -70,   68,    203,   381,
+    479,   580,   619,   623,   645,   565,   492,   364,   206,   106,   -71,
+    -191,  -331,  -460,  -469,  -527,  -471,  -441,  -386,  -222,  -123,  60,
+    168,   245,   404,   470,   596,   605,   581,   633,   548,   562,   468,
+    355,   334,   192,   161,   62,    -36,   -39,   -146,  -121,  -167,  -243,
+    -229,  -302,  -276,  -327,  -415,  -419,  -444,  -396,  -433,  -455,  -407,
+    -357,  -244,  -221,  -158,  -63,   36,    172,   210,   296,   326,   351,
+    424,   367,   369,   300,   224,   235,   124,   54,    -39,   -122,  -118,
+    -239,  -304,  -360,  -403,  -361,  -418,  -427,  -394,  -342,  -259,  -232,
+    -176,  -110,  -48,   27,    48,    78,    90,    86,    91,    76,    57,
+    -1,    -34,   -53,   -103,  -151,  -209,  -239,  -261,  -319,  -354,  -372,
+    -382,  -385,  -411,  -432,  -428,  -431,  -446,  -471,  -496,  -512,  -532,
+    -562,  -570,  -567,  -543,  -499,  -457,  -379,  -290,  -204,  -94,   -11,
+    78,    155,   196,   234,   222,   198,   160,   113,   64,    5,     -57,
+    -108,  -136,  -175,  -186,  -196,  -184,  -125,  -90,   -25,   58,    146,
+    271,   372,   472,   562,   636,   709,   741,   760,   752,   730,   710,
+    688,   655,   608,   595,   570,   556,   540,   517,   513,   511,   497,
+    481,   449,   417,   401,   347,   325,   295,   248,   261,   238,   250,
+    294,   295,   367,   380,   416,   454,   430,   479,   443,   431,   430,
+    386,   397,   333,   292,   238,   176,   153,   54,    24,    -37,   -84,
+    -109,  -172,  -155,  -199,  -220,  -219,  -261,  -227,  -255,  -280,  -266,
+    -293,  -277,  -273,  -243,  -214,  -221,  -179,  -153,  -130,  -109,  -154,
+    -149,  -151,  -155,  -186,  -243,  -253,  -311,  -326,  -358,  -434,  -427,
+    -491,  -533,  -554,  -598,  -596,  -655,  -668,  -679,  -714,  -671,  -694,
+    -643,  -607,  -602,  -532,  -496,  -409,  -408,  -377,  -309,  -289,  -211,
+    -223,  -196,  -145,  -147,  -104,  -157,  -123,  -125,  -177,  -152,  -229,
+    -192,  -204,  -243,  -213,  -259,  -194,  -190,  -172,  -98,   -123,  -43,
+    -12,   41,    103,   87,    148,   150,   166,   154,   113,   118,   80,
+    54,    8,     4,     25,    12,    59,    70,    162,   260,   305,   387,
+    427,   501,   549,   564,   571,   517,   488,   423,   355,   294,   206,
+    165,   113,   92,    77,    62,    115,   116,   154,   162,   171,   218,
+    210,   221,   208,   192,   215,   176,   169,   114,   89,    89,    52,
+    62,    29,    35,    73,    98,    167,   195,   261,   325,   349,   401,
+    382,   393,   368,   302,   254,   174,   104,   6,     -78,   -136,  -203,
+    -229,  -291,  -303,  -284,  -294,  -241,  -235,  -222,  -186,  -187,  -156,
+    -160,  -149,  -122,  -114,  -71,   -44,   -28,   6,     20,    47,    57,
+    54,    52,    55,    53,    23,    9,     -16,   -59,   -86,   -158,  -223,
+    -292,  -372,  -421,  -498,  -532,  -561,  -570,  -531,  -512,  -456,  -367,
+    -297,  -206,  -125,  -37,   26,    88,    147,   157,   188,   169,   152,
+    152,   131,   99,    62,    44,    46,    53,    61,    61,    79,    110,
+    159,   175,   185,   237,   220,   278,   276,   239,   264,   203,   190,
+    138,   70,    34,    -9,    18,    1,     10,    71,    115,   191,   220,
+    255,   265,   296,   319,   270,   266,   214,   189,   187,   155,   145,
+    123,   149,   166,   172,   186,   179,   195,   213,   201,   182,   161,
+    150,   116,   76,    41,    -29,   -58,   -101,  -183,  -209,  -269,  -314,
+    -342,  -385,  -379,  -380,  -348,  -304,  -273,  -197,  -144,  -88,   -28,
+    -5,    11,    20,    27,    -5,    -24,   -22,   -61,   -73,   -87,   -124,
+    -118,  -133,  -150,  -160,  -198,  -196,  -219,  -228,  -239,  -281,  -276,
+    -275,  -288,  -277,  -305,  -324,  -302,  -294,  -292,  -266,  -261,  -224,
+    -203,  -210,  -190,  -198,  -176,  -180,  -201,  -196,  -198,  -175,  -166,
+    -151,  -127,  -114,  -59,   -48,   -8,    39,    75,    126,   131,   168,
+    160,   152,   142,   82,    36,    -13,   -49,   -81,   -105,  -105,  -103,
+    -65,   -38,   -16,   19,    33,    67,    82,    95,    110,   98,    111,
+    98,    87,    67,    54,    66,    52,    49,    53,    71,    106,   139,
+    186,   224,   270,   320,   361,   413,   433,   462,   473,   478,   480,
+    459,   441,   391,   339,   298,   239,   206,   159,   149,   120,   114,
+    117,   95,    106,   81,    67,    61,    30,    11,    -29,   -42,   -76,
+    -97,   -98,   -124,  -107,  -107,  -103,  -69,   -71,   -36,   -12,   23,
+    69,    86,    129,   152,   158,   162,   152,   127,   81,    48,    -9,
+    -80,   -120,  -172,  -201,  -225,  -276,  -297,  -311,  -330,  -339,  -361,
+    -375,  -389,  -376,  -365,  -374,  -378,  -375,  -370,  -358,  -347,  -355,
+    -338,  -314,  -289,  -244,  -212,  -168,  -129,  -80,   -26,   -12,   47,
+    79,    92,    105,   105,   113,   99,    85,    29,    -18,   -53,   -110,
+    -133,  -167,  -186,  -196,  -199,  -176,  -177,  -150,  -122,  -106,  -73,
+    -61,   -30,   -34,   -29,   -40,   -68,   -63,   -85,   -84,   -71,   -65,
+    -40,   -16,   23,    56,    87,    144,   167,   196,   206,   221,   243,
+    226,   233,   210,   192,   190,   150,   140,   110,   91,    77,    43,
+    27,    -10,   -5,    -5,    -22,   -9,    -7,    27,    48,    59,    64,
+    70,    87,    104,   139,   151,   188,   239,   270,   317,   311,   336,
+    349,   341,   330,   274,   254,   223,   195,   163,   102,   81,    43,
+    20,    8,     -37,   -28,   -31,   -29,   -21,   -39,   -16,   -22,   -11,
+    -21,   -41,   -32,   -47,   -39,   -60,   -75,   -71,   -94,   -98,   -131,
+    -147,  -139,  -145,  -146,  -165,  -150,  -136,  -112,  -90,   -106,  -86,
+    -91,   -87,   -98,   -136,  -121,  -135,  -124,  -132,  -144,  -114,  -108,
+    -87,   -74,   -75,   -50,   -30,   -5,    -18,   -24,   -3,    -3,    -6,
+    -41,   -76,   -98,   -127,  -159,  -215,  -257,  -263,  -268,  -266,  -262,
+    -237,  -194,  -144,  -113,  -99,   -61,   -28,   12,    21,    46,    76,
+    92,    130,   115,   123,   132,   135,   149,   134,   133,   132,   135,
+    138,   94,    76,    51,    19,    -15,   -72,   -98,   -125,  -135,  -154,
+    -174,  -171,  -164,  -139,  -130,  -99,   -74,   -40,   9,     34,    86,
+    129,   176,   214,   226,   245,   250,   280,   271,   256,   250,   226,
+    234,   212,   187,   178,   148,   144,   104,   79,    64,    37,    36,
+    9,     -10,   -23,   -38,   -35,   -62,   -67,   -67,   -82,   -70,   -80,
+    -75,   -59,   -34,   -3,    9,     48,    76,    101,   120,   120,   123,
+    126,   131,   112,   92,    77,    61,    54,    32,    3,     -18,   -28,
+    -39,   -56,   -71,   -91,   -92,   -100,  -124,  -134,  -142,  -144,  -155,
+    -177,  -178,  -175,  -171,  -168,  -160,  -141,  -123,  -89,   -73,   -64,
+    -46,   -39,   -18,   -19,   -34,   -32,   -46,   -51,   -63,   -74,   -73,
+    -81,   -70,   -83,   -71,   -49,   -39,   -12,   -1,    30,    48,    65,
+    94,    100,   125,   136,   148,   156,   138,   140,   124,   115,   86,
+    58,    57,    32,    43,    40,    44,    63,    60,    83,    90,    99,
+    115,   113,   135,   140,   148,   164,   172,   187,   182,   190,   183,
+    171,   171,   146,   139,   121,   105,   94,    61,    46,    17,    -6,
+    -34,   -70,   -89,   -121,  -138,  -158,  -178,  -190,  -206,  -206,  -210,
+    -214,  -204,  -196,  -173,  -154,  -128,  -97,   -81,   -58,   -51,   -46,
+    -38,   -47,   -49,   -57,   -58,   -57,   -59,   -49,   -58,   -58,   -54,
+    -60,   -48,   -65,   -72,   -72,   -78,   -70,   -77,   -73,   -76,   -79,
+    -76,   -90,   -90,   -91,   -88,   -76,   -67,   -43,   -16,   6,     27,
+    39,    55,    69,    71,    74,    65,    56,    60,    47,    37,    27,
+    8,     -5,    -29,   -50,   -71,   -89,   -96,   -114,  -111,  -113,  -115,
+    -105,  -112,  -90,   -78,   -68,   -49,   -46,   -26,   -14,   5,     18,
+    10,    14,    3,     5,     -9,    -20,   -15,   -30,   -26,   -33,   -31,
+    -23,   -23,   -12,   -21,   -20,   -16,   -23,   -20,   -13,   -7,    6,
+    28,    47,    69,    96,    115,   134,   147,   154,   166,   174,   186,
+    196,   202,   204,   198,   193,   181,   164,   144,   125,   113,   102,
+    96,    90,    92,    91,    96,    99,    99,    100,   99,    99,    93,
+    94,    86,    68,    55,    44,    36,    22,    13,    15,    13,    15,
+    21,    16,    11,    3,     -15,   -31,   -50,   -75,   -105,  -125,  -145,
+    -154,  -155,  -164,  -178,  -189,  -186,  -177,  -174,  -169,  -152,  -134,
+    -114,  -93,   -65,   -42,   -23,   -4,    -1,    6,     6,     2,     -4,
+    -18,   -26,   -25,   -25,   -23,   -32,   -31,   -33,   -39,   -50,   -68,
+    -69,   -74,   -79,   -78,   -83,   -85,   -85,   -77,   -71,   -61,   -42,
+    -27,   -3,    28,    59,    95,    123,   146,   155,   160,   162,   144,
+    130,   112,   94,    82,    67,    60,    46,    35,    35,    22,    4,
+    -14,   -27,   -35,   -45,   -52,   -61,   -62,   -65,   -68,   -55,   -52,
+    -43,   -38,   -34,   -20,   -8,    8,     18,    24,    34,    36,    37,
+    42,    46,    51,    50,    58,    76,    75,    70,    67,    58,    53,
+    48,    36,    23,    18,    10,    3,     9,     14,    24,    39,    43,
+    53,    62,    63,    66,    62,    66,    64,    59,    51,    25,    19,
+    6,     -10,   -19,   -26,   -35,   -43,   -44,   -37,   -47,   -43,   -50,
+    -54,   -60,   -69,   -75,   -84,   -91,   -93,   -98,   -96,   -99,   -91,
+    -87,   -91,   -88,   -84,   -80,   -75,   -61,   -48,   -44,   -40,   -37,
+    -34,   -45,   -52,   -58,   -72,   -82,   -84,   -78,   -68,   -65,   -63,
+    -51,   -42,   -27,   -22,   -13,   -3,    8,     20,    26,    31,    31,
+    37,    33,    29,    33,    31,    32,    31,    34,    44,    55,    68,
+    74,    69,    75,    73,    72,    65,    63,    67,    70,    83,    81,
+    81,    85,    84,    80,    75,    69,    53,    44,    36,    27,    20,
+    11,    1,     -4,    -19,   -26,   -27,   -25,   -21,   -14,   -12,   -12,
+    -14,   -9,    -21,   -29,   -40,   -50,   -50,   -54,   -46,   -35,   -17,
+    -4,    -1,    7,     20,    28,    26,    22,    23,    21,    23,    18,
+    13,    12,    7,     6,     3,     2,     -1,    -1,    4,     6,     17,
+    29,    35,    34,    34,    32,    28,    33,    26,    22,    16,    16,
+    22,    20,    13,    -1,    -1,    -7,    -15,   -20,   -30,   -32,   -38,
+    -39,   -45,   -45,   -53,   -63,   -70,   -83,   -96,   -107,  -113,  -122,
+    -122,  -118,  -114,  -114,  -113,  -112,  -111,  -110,  -107,  -103,  -102,
+    -94,   -80,   -71,   -58,   -52,   -47,   -40,   -43,   -47,   -48,   -50,
+    -39,   -46,   -44,   -44,   -44,   -43,   -45,   -41,   -40,   -34,   -32,
+    -23,   -12,   -6,    -1,    -1,    6,     12,    18,    20,    22,    32,
+    48,    65,    80,    93,    109,   122,   128,   131,   135,   135,   129,
+    126,   130,   127,   124,   125,   121,   122,   115,   118,   122,   128,
+    137,   143,   143,   141,   142,   134,   131,   121,   109,   105,   97,
+    93,    99,    96,    96,    94,    83,    84,    80,    77,    66,    59,
+    46,    42,    44,    32,    28,    20,    12,    8,     4,     4,     5,
+    3,     -4,    -7,    -6,    -14,   -19,   -24,   -34,   -40,   -45,   -52,
+    -61,   -62,   -60,   -57,   -57,   -61,   -63,   -61,   -65,   -73,   -81,
+    -89,   -94,   -93,   -89,   -87,   -82,   -82,   -84,   -81,   -86,   -82,
+    -84,   -86,   -90,   -86,   -83,   -82,   -81,   -80,   -80,   -76,   -75,
+    -76,   -70,   -69,   -68,   -61,   -53,   -50,   -43,   -38,   -42,   -43,
+    -41,   -41,   -39,   -34,   -27,   -21,   -16,   -20,   -22,   -27,   -36,
+    -39,   -38,   -40,   -37,   -35,   -28,   -14,   -6,    -3,    -2,    2,
+    4,     5,     15,    18,    25,    35,    36,    41,    45,    48,    52,
+    54,    52,    50,    60,    67,    76,    85,    85,    90,    86,    83,
+    84,    77,    77,    72,    77,    81,    89,    91,    93,    99,    101,
+    102,   98,    94,    87,    77,    70,    69,    63,    62,    55,    59,
+    58,    54,    51,    53,    57,    62,    65,    60,    54,    48,    45,
+    40,    29,    17,    8,     -3,    -14,   -17,   -18,   -20,   -25,   -34,
+    -40,   -44,   -53,   -56,   -63,   -71,   -71,   -69,   -66,   -62,   -66,
+    -67,   -68,   -71,   -75,   -79,   -79,   -73,   -67,   -60,   -49,   -46,
+    -45,   -45,   -46,   -55,   -64,   -67,   -72,   -74,   -70,   -68,   -67,
+    -69,   -70,   -64,   -56,   -55,   -54,   -51,   -41,   -30,   -26,   -28,
+    -29,   -30,   -28,   -25,   -27,   -20,   -12,   -5,    -2,    2,     3,
+    -3,    0,     -7,    -8,    -14,   -15,   -9,    -7,    4,     12,    24,
+    36,    41,    52,    58,    59,    51,    45,    48,    44,    46,    43,
+    40,    42,    47,    53,    52,    52,    63,    69,    74,    75,    80,
+    78,    69,    68,    59,    60,    54,    54,    54,    58,    66,    71,
+    78,    78,    75,    78,    72,    71,    61,    55,    53,    42,    36,
+    31,    28,    29,    23,    19,    25,    27,    27,    23,    29,    29,
+    20,    11,    5,     -4,    -10,   -31,   -38,   -39,   -36,   -33,   -27,
+    -17,   -15,   -14,   -17,   -13,   -14,   -25,   -33,   -44,   -51,   -61,
+    -63,   -63,   -65,   -67,   -66,   -63,   -59,   -52,   -48,   -45,   -44,
+    -50,   -62,   -74,   -84,   -89,   -100,  -101,  -102,  -96,   -95,   -85,
+    -76,   -78,   -72,   -71,   -66,   -61,   -63,   -60,   -62,   -72,   -69,
+    -69,   -58,   -56,   -50,   -37,   -28,   -17,   -17,   -16,   -17,   -18,
+    -18,   -13,   -7,    -4,    6,     17,    23,    25,    28,    24,    21,
+    17,    21,    27,    30,    33,    35,    46,    49,    48,    54,    56,
+    57,    58,    60,    64,    62,    64,    66,    67,    64,    70,    77,
+    83,    82,    84,    88,    89,    95,    86,    75,    64,    51,    36,
+    29,    26,    21,    26,    31,    38,    40,    55,    63,    65,    65,
+    64,    60,    54,    54,    49,    41,    34,    26,    21,    9,     6,
+    6,     5,     -1,    3,     5,     3,     2,     -4,    -13,   -13,   -24,
+    -32,   -33,   -36,   -33,   -24,   -18,   -15,   -9,    -5,    -5,    -14,
+    -17,   -24,   -34,   -36,   -42,   -43,   -36,   -42,   -43,   -43,   -38,
+    -36,   -27,   -20,   -23,   -21,   -28,   -25,   -22,   -24,   -25,   -23,
+    -22,   -30,   -31,   -26,   -25,   -20,   -15,   -8,    -10,   -11,   -13,
+    -18,   -22,   -30,   -36,   -35,   -39,   -35,   -34,   -27,   -24,   -19,
+    -15,   -7,    -6,    -7,    -2,    0,     7,     12,    14,    19,    20,
+    26,    26,    24,    16,    10,    4,     1,     3,     2,     9,     11,
+    17,    19,    27,    31,    31,    32,    30,    27,    25,    28,    27,
+    25,    22,    23,    23,    20,    21,    25,    36,    38,    40,    43,
+    40,    32,    27,    20,    9,     4,     1,     12,    27,    37,    49,
+    63,    73,    72,    73,    70,    67,    53,    39,    33,    26,    23,
+    13,    9,     6,     0,     -2,    -3,    0,     -1,    0,     -1,    -4,
+    -9,    -16,   -22,   -21,   -24,   -21,   -19,   -12,   -3,    0,     12,
+    14,    13,    3,     -6,    -13,   -27,   -34,   -42,   -41,   -44,   -42,
+    -43,   -46,   -42,   -40,   -39,   -36,   -31,   -29,   -30,   -22,   -19,
+    -21,   -20,   -17,   -17,   -22,   -31,   -41,   -45,   -54,   -65,   -64,
+    -68,   -70,   -74,   -70,   -64,   -62,   -61,   -60,   -58,   -52,   -46,
+    -43,   -37,   -35,   -40,   -41,   -47,   -52,   -58,   -62,   -61,   -53,
+    -54,   -46,   -41,   -40,   -34,   -29,   -20,   -15,   -8,    2,     12,
+    28,    35,    41,    42,    42,    43,    41,    43,    39,    45,    44,
+    46,    55,    54,    55,    55,    51,    48,    42,    43,    39,    40,
+    46,    54,    65,    70,    76,    81,    86,    89,    79,    73,    70,
+    62,    56,    52,    39,    32,    28,    17,    18,    19,    18,    15,
+    19,    20,    15,    13,    13,    10,    6,     5,     12,    10,    15,
+    20,    24,    30,    31,    28,    22,    17,    2,     -15,   -24,   -39,
+    -52,   -53,   -55,   -46,   -40,   -34,   -26,   -21,   -22,   -31,   -32,
+    -38,   -36,   -35,   -32,   -33,   -34,   -30,   -28,   -27,   -35,   -40,
+    -42,   -45,   -44,   -45,   -44,   -52,   -54,   -57,   -57,   -53,   -60,
+    -63,   -63,   -65,   -51,   -45,   -40,   -40,   -39,   -39,   -43,   -44,
+    -46,   -52,   -46,   -51,   -49,   -45,   -45,   -47,   -47,   -45,   -50,
+    -47,   -40,   -35,   -32,   -24,   -17,   -19,   -14,   -13,   -9,    -7,
+    -7,    -7,    -9,    0,     3,     7,     13,    12,    14,    15,    13,
+    6,     -1,    -3,    -9,    -10,   -5,    -2,    6,     9,     11,    12,
+    15,    19,    24,    37,    47,    47,    56,    53,    51,    52,    52,
+    47,    39,    38,    40,    41,    43,    44,    42,    43,    42,    41,
+    43,    40,    41,    35,    37,    39,    40,    41,    38,    30,    21,
+    14,    5,     2,     -1,    -2,    1,     -2,    6,     2,     4,     2,
+    -1,    -11,   -16,   -23,   -25,   -20,   -18,   -25,   -27,   -32,   -27,
+    -24,   -16,   -15,   -11,   -9,    -3,    -4,    -2,    -9,    -10,   -18,
+    -28,   -33,   -38,   -37,   -41,   -41,   -33,   -24,   -22,   -25,   -25,
+    -25,   -24,   -33,   -38,   -42,   -52,   -57,   -55,   -50,   -51,   -53,
+    -52,   -48,   -49,   -49,   -53,   -55,   -58,   -51,   -34,   -19,   -12,
+    -12,   -5,    1,     1,     0,     -6,    -2,    -10,   -11,   -11,   -6,
+    0,     -6,    2,     -2,    -6,    2,     5,     16,    18,    18,    21,
+    16,    18,    18,    20,    20,    13,    18,    9,     7,     12,    7,
+    8,     10,    16,    17,    18,    23,    26,    36,    44,    51,    55,
+    60,    64,    69,    68,    71,    70,    62,    58,    52,    44,    35,
+    31,    34,    32,    33,    36,    37,    38,    41,    47,    55,    56,
+    58,    60,    60,    57,    48,    41,    29,    19,    7,     4,     8,
+    9,     10,    8,     13,    15,    13,    8,     8,     6,     4,     10,
+    8,     -4,    -6,    -9,    -20,   -28,   -39,   -38,   -27,   -24,   -22,
+    -19,   -23,   -32,   -35,   -36,   -41,   -48,   -51,   -50,   -52,   -55,
+    -60,   -67,   -72,   -76,   -84,   -82,   -80,   -81,   -75,   -64,   -50,
+    -36,   -28,   -18,   -14,   -12,   -15,   -12,   -18,   -24,   -21,   -22,
+    -19,   -21,   -19,   -22,   -20,   -18,   -16,   -17,   -19,   -15,   -7,
+    1,     0,     0,     9,     14,    20,    24,    20,    16,    17,    20,
+    20,    25,    27,    26,    32,    33,    35,    38,    42,    38,    37,
+    39,    46,    44,    43,    45,    45,    42,    37,    34,    25,    21,
+    22,    33,    44,    49,    54,    53,    58,    54,    51,    46,    40,
+    37,    37,    39,    34,    37,    39,    31,    39,    38,    36,    35,
+    32,    33,    33,    32,    28,    23,    18,    22,    28,    31,    27,
+    18,    3,     4,     0,     -4,    -7,    -15,   -18,   -24,   -32,   -34,
+    -39,   -42,   -36,   -31,   -24,   -12,   -10,   -10,   -13,   -20,   -28,
+    -34,   -44,   -49,   -50,   -53,   -56,   -54,   -52,   -53,   -47,   -43,
+    -41,   -45,   -41,   -38,   -38,   -33,   -32,   -34,   -35,   -33,   -40,
+    -45,   -53,   -62,   -61,   -67,   -72,   -70,   -67,   -68,   -59,   -51,
+    -47,   -38,   -31,   -20,   -13,   -13,   -13,   -14,   -17,   -21,   -22,
+    -29,   -31,   -27,   -23,   -13,   -6,    4,     12,    17,    25,    23,
+    23,    25,    30,    30,    32,    31,    28,    27,    18,    14,    13,
+    3,     5,     7,     19,    35,    47,    61,    70,    84,    90,    95,
+    92,    94,    89,    77,    71,    66,    59,    50,    51,    50,    51,
+    53,    56,    65,    67,    69,    75,    74,    69,    67,    56,    51,
+    44,    34,    25,    17,    10,    6,     7,     7,     4,     6,     -1,
+    -1,    -2,    -9,    -9,    -9,    -7,    -5,    1,     -2,    -5,    -11,
+    -19,   -27,   -39,   -38,   -44,   -45,   -48,   -48,   -54,   -59,   -53,
+    -51,   -49,   -52,   -50,   -50,   -47,   -42,   -32,   -28,   -28,   -26,
+    -27,   -34,   -40,   -40,   -36,   -37,   -37,   -34,   -37,   -36,   -41,
+    -36,   -40,   -46,   -48,   -52,   -47,   -44,   -40,   -40,   -38,   -43,
+    -43,   -47,   -59,   -62,   -59,   -59,   -51,   -41,   -29,   -19,   -8,
+    -2,    1,     1,     -4,    -9,    -19,   -23,   -29,   -29,   -25,   -23,
+    -15,   -7,    -2,    6,     8,     15,    27,    35,    43,    40,    36,
+    35,    32,    25,    22,    19,    17,    13,    13,    21,    25,    28,
+    36,    44,    50,    57,    56,    58,    59,    62,    66,    70,    73,
+    69,    66,    66,    66,    62,    53,    48,    44,    38,    39,    44,
+    52,    51,    55,    57,    52,    49,    44,    36,    26,    16,    13,
+    13,    14,    14,    17,    14,    10,    6,     -5,    -14,   -23,   -24,
+    -21,   -28,   -25,   -27,   -29,   -29,   -33,   -33,   -39,   -42,   -43,
+    -41,   -40,   -43,   -46,   -45,   -43,   -42,   -41,   -41,   -46,   -46,
+    -52,   -52,   -52,   -59,   -63,   -70,   -68,   -73,   -77,   -73,   -68,
+    -66,   -62,   -64,   -66,   -58,   -54,   -51,   -52,   -48,   -47,   -43,
+    -40,   -39,   -33,   -26,   -19,   -17,   -16,   -17,   -14,   -9,    -10,
+    -3,    5,     5,     9,     5,     9,     8,     4,     3,     0,     -5,
+    -10,   -3,    2,     8,     14,    16,    20,    27,    39,    40,    44,
+    48,    43,    39,    34,    29,    22,    12,    8,     5,     0,     -2,
+    -3,    5,     12,    16,    19,    22,    25,    28,    35,    28,    30,
+    31,    30,    39,    43,    47,    43,    42,    41,    41,    41,    37,
+    37,    39,    37,    38,    43,    44,    41,    43,    34,    28,    25,
+    23,    30,    34,    32,    33,    29,    21,    18,    13,    14,    11,
+    3,     2,     1,     3,     1,     -1,    0,     -3,    -1,    -3,    -8,
+    -9,    -7,    -9,    -2,    0,     -3,    0,     1,     5,     0,     -1,
+    -9,    -13,   -8,    -11,   -18,   -23,   -25,   -29,   -29,   -26,   -27,
+    -29,   -25,   -24,   -23,   -18,   -19,   -18,   -17,   -21,   -22,   -30,
+    -38,   -42,   -42,   -42,   -40,   -41,   -43,   -39,   -38,   -37,   -36,
+    -33,   -31,   -28,   -27,   -18,   -15,   -7,    -8,    -8,    -1,    1,
+    3,     -5,    0,     -4,    -5,    -4,    -8,    -10,   -14,   -21,   -24,
+    -25,   -20,   -11,   -4,    3,     6,     13,    15,    12,    17,    16,
+    17,    17,    15,    21,    28,    33,    36,    35,    35,    29,    31,
+    29,    28,    23,    21,    14,    15,    27,    36,    40,    40,    43,
+    51,    56,    62,    69,    77,    80,    88,    88,    88,    82,    76,
+    63,    52,    44,    36,    26,    23,    25,    24,    27,    26,    31,
+    21,    13,    8,     -8,    -8,    -11,   -14,   -18,   -28,   -28,   -30,
+    -32,   -29,   -26,   -26,   -27,   -24,   -20,   -14,   -8,    -6,    -8,
+    -5,    -10,   -14,   -18,   -26,   -34,   -36,   -38,   -44,   -51,   -57,
+    -66,   -64,   -68,   -72,   -75,   -75,   -70,   -68,   -65,   -64,   -62,
+    -68,   -63,   -60,   -65,   -65,   -69,   -68,   -67,   -57,   -46,   -41,
+    -38,   -34,   -31,   -39,   -40,   -45,   -45,   -48,   -47,   -40,   -39,
+    -32,   -26,   -24,   -14,   -9,    -7,    -3,    -2,    3,     4,     0,
+    -2,    -2,    -2,    1,     3,     2,     3,     8,     13,    20,    25,
+    29,    31,    26,    17,    11,    3,     -5,    2,     6,     9,     11,
+    19,    26,    40,    51,    61,    60,    58,    61,    55,    55,    57,
+    60,    54,    40,    42,    38,    34,    38,    37,    34,    32,    35,
+    36,    35,    41,    36,    32,    29,    23,    22,    23,    22,    14,
+    13,    19,    19,    20,    22,    22,    17,    13,    6,     9,     13,
+    15,    17,    19,    11,    15,    8,     4,     6,     -1,    -3,    3,
+    7,     11,    8,     10,    7,     6,     4,     -4,    -5,    -11,   -9,
+    -16,   -14,   -14,   -16,   -16,   -22,   -19,   -19,   -13,   -9,    -4,
+    1,     1,     2,     -6,    -14,   -25,   -32,   -41,   -46,   -50,   -49,
+    -42,   -39,   -34,   -24,   -14,   -18,   -15,   -17,   -21,   -23,   -21,
+    -19,   -21,   -20,   -19,   -20,   -19,   -16,   -17,   -19,   -20,   -20,
+    -20,   -20,   -22,   -22,   -23,   -22,   -22,   -14,   -5,    5,     8,
+    13,    16,    19,    23,    19,    21,    16,    16,    18,    13,    18,
+    13,    15,    18,    12,    12,    6,     11,    8,     5,     5,     9,
+    17,    14,    15,    14,    16,    14,    14,    12,    9,     7,     9,
+    11,    13,    15,    15,    19,    17,    14,    8,     7,     4,     0,
+    3,     8,     10,    7,     8,     19,    15,    19,    18,    19,    17,
+    9,     14,    10,    4,     -3,    -11,   -19,   -25,   -31,   -35,   -36,
+    -28,   -21,   -8,    5,     8,     11,    13,    7,     4,     1,     -7,
+    -15,   -17,   -17,   -21,   -28,   -33,   -37,   -40,   -39,   -41,   -45,
+    -46,   -44,   -40,   -41,   -36,   -31,   -41,   -40,   -42,   -44,   -47,
+    -50,   -49,   -55,   -52,   -52,   -52,   -45,   -50,   -52,   -56,   -58,
+    -60,   -69,   -75,   -82,   -86,   -91,   -87,   -80,   -80,   -72,   -58,
+    -52,   -45,   -33,   -21,   -13,   -12,   -10,   -6,    -1,    -2,    -7,
+    -7,    -5,    -6,    -3,    9,     15,    25,    36,    35,    39,    28,
+    16,    11,    8,     11,    17,    27,    34,    36,    47,    49,    52,
+    52,    42,    46,    49,    55,    65,    66,    67,    62,    56,    53,
+    49,    50,    55,    53,    62,    69,    72,    73,    68,    61,    54,
+    46,    43,    38,    34,    39,    43,    42,    39,    36,    31,    26,
+    24,    17,    13,    14,    14,    21,    26,    29,    28,    26,    24,
+    18,    19,    16,    11,    6,     2,     -2,    1,     3,     2,     -4,
+    -3,    -1,    -3,    -2,    -2,    -5,    -3,    0,     3,     -3,    -6,
+    -6,    -15,   -19,   -25,   -30,   -35,   -39,   -34,   -34,   -34,   -31,
+    -17,   -17,   -8,    -2,    -2,    8,     14,    25,    24,    26,    22,
+    16,    10,    2,     -3,    -5,    -12,   -15,   -11,   -14,   -16,   -17,
+    -17,   -16,   -21,   -18,   -18,   -21,   -23,   -21,   -15,   -11,   -4,
+    -2,    3,     8,     10,    17,    18,    25,    24,    24,    24,    21,
+    24,    23,    24,    22,    23,    31,    39,    49,    58,    64,    67,
+    63,    57,    53,    52,    44,    45,    43,    40,    45,    42,    49,
+    50,    49,    52,    51,    48,    46,    38,    37,    35,    36,    37,
+    37,    37,    44,    45,    47,    42,    42,    36,    35,    44,    40,
+    40,    28,    24,    23,    18,    12,    9,     8,     10,    17,    17,
+    18,    12,    5,     -2,    -12,   -16,   -20,   -27,   -29,   -29,   -26,
+    -22,   -17,   -16,   -15,   -14,   -15,   -11,   -11,   -15,   -19,   -15,
+    -20,   -22,   -24,   -37,   -52,   -62,   -63,   -68,   -64,   -59,   -51,
+    -43,   -42,   -36,   -32,   -33,   -33,   -33,   -41,   -48,   -51,   -49,
+    -48,   -47,   -42,   -45,   -42,   -41,   -40,   -39,   -33,   -29,   -25,
+    -14,   -1,    -4,    -6,    -11,   -16,   -19,   -26,   -29,   -28,   -25,
+    -17,   -10,   -1,    -1,    3,     7,     -1,    -3,    -8,    -18,   -20,
+    -20,   -16,   -13,   -11,   -8,    0,     6,     8,     11,    14,    15,
+    20,    26,    26,    26,    24,    23,    24,    30,    34,    41,    52,
+    61,    70,    80,    85,    86,    89,    84,    87,    79,    67,    60,
+    57,    59,    63,    68,    74,    78,    84,    89,    91,    87,    81,
+    74,    69,    63,    59,    59,    56,    58,    60,    60,    59,    54,
+    49,    41,    40,    34,    25,    19,    11,    1,     0,     -1,    -4,
+    -8,    -12,   -12,   -17,   -22,   -31,   -44,   -54,   -58,   -68,   -74,
+    -80,   -80,   -73,   -65,   -61,   -61,   -55,   -50,   -50,   -59,   -65,
+    -69,   -73,   -73,   -78,   -79,   -83,   -87,   -87,   -88,   -94,   -103,
+    -107,  -107,  -109,  -106,  -113,  -115,  -110,  -105,  -100,  -100,  -92,
+    -78,   -62,   -49,   -39,   -35,   -27,   -26,   -25,   -24,   -22,   -23,
+    -28,   -26,   -22,   -15,   -11,   -4,    4,     13,    21,    32,    31,
+    28,    30,    30,    28,    23,    25,    23,    21,    25,    21,    26,
+    27,    32,    40,    48,    53,    55,    54,    55,    55,    54,    48,
+    44,    47,    48,    54,    60,    71,    79,    79,    74,    72,    59,
+    48,    42,    32,    26,    22,    21,    23,    22,    31,    42,    44,
+    41,    36,    30,    30,    33,    38,    35,    30,    28,    20,    15,
+    8,     4,     6,     9,     16,    26,    27,    23,    19,    16,    10,
+    4,     -4,    -12,   -12,   -16,   -16,   -19,   -24,   -23,   -23,   -31,
+    -34,   -38,   -40,   -41,   -39,   -39,   -36,   -36,   -40,   -45,   -48,
+    -53,   -66,   -73,   -76,   -76,   -78,   -75,   -71,   -65,   -59,   -58,
+    -59,   -56,   -60,   -62,   -62,   -62,   -64,   -68,   -73,   -79,   -80,
+    -85,   -87,   -85,   -78,   -72,   -66,   -56,   -48,   -42,   -37,   -35,
+    -32,   -33,   -31,   -25,   -26,   -27,   -16,   -18,   -18,   -13,   -14,
+    -17,   -22,   -24,   -25,   -23,   -19,   -14,   -12,   -11,   -7,    -4,
+    -1,    2,     5,     8,     10,    10,    18,    28,    29,    25,    22,
+    29,    21,    20,    21,    22,    30,    32,    41,    41,    45,    46,
+    49,    52,    57,    59,    58,    52,    46,    47,    56,    58,    49,
+    49,    46,    40,    33,    23,    14,    11,    16,    29,    34,    37,
+    41,    42,    48,    54,    60,    61,    62,    62,    69,    79,    76,
+    71,    72,    71,    64,    59,    54,    49,    40,    42,    34,    23,
+    27,    18,    13,    9,     3,     -4,    -8,    -16,   -18,   -20,   -26,
+    -28,   -30,   -32,   -29,   -32,   -35,   -39,   -41,   -38,   -34,   -31,
+    -26,   -18,   -21,   -20,   -22,   -28,   -35,   -34,   -31,   -33,   -31,
+    -31,   -40,   -43,   -45,   -53,   -64,   -67,   -74,   -75,   -74,   -75,
+    -70,   -61,   -56,   -45,   -37,   -30,   -33,   -35,   -32,   -31,   -27,
+    -25,   -19,   -17,   -14,   -9,    -4,    -1,    -3,    -4,    1,     8,
+    14,    20,    24,    25,    18,    11,    7,     -3,    -9,    -3,    4,
+    15,    30,    29,    33,    33,    36,    35,    31,    33,    34,    42,
+    43,    42,    47,    49,    53,    61,    69,    73,    74,    79,    81,
+    84,    76,    69,    62,    47,    39,    31,    19,    8,     2,     -6,
+    -5,    -3,    -3,    -1,    1,     -2,    -3,    -3,    -6,    -12,   -13,
+    -15,   -11,   -5,    -4,    -8,    -14,   -9,    -3,    0,     -3,    -4,
+    0,     3,     0,     -6,    -14,   -23,   -33,   -38,   -41,   -38,   -38,
+    -34,   -30,   -29,   -29,   -26,   -31,   -33,   -41,   -49,   -50,   -56,
+    -57,   -58,   -54,   -46,   -39,   -39,   -34,   -31,   -28,   -30,   -30,
+    -31,   -29,   -27,   -16,   -18,   -17,   -15,   -13,   -15,   -12,   -7,
+    -11,   -9,    -9,    -4,    -11,   -7,    -7,    -8,    -9,    -10,   -7,
+    -9,    1,     9,     15,    12,    19,    19,    18,    17,    13,    11,
+    8,     6,     10,    17,    20,    26,    28,    33,    39,    30,    25,
+    25,    18,    16,    21,    26,    30,    33,    32,    36,    42,    49,
+    46,    39,    44,    44,    37,    35,    30,    24,    22,    23,    26,
+    23,    25,    21,    24,    24,    22,
+};
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.h b/tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.h
new file mode 100644
index 0000000000000000000000000000000000000000..4cc8030cdac7c4e8364e0fcd7dcc5fff63617908
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/no_1000ms_sample_data.h
@@ -0,0 +1,29 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This data was created from the PCM data in a WAV file held in v2 of the
+// Speech Commands test dataset, at the path:
+// speech_commands_test_set_v0.02/no/f9643d42_nohash_4.wav
+// This should contain all 16,000 samples from the one-second file.
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_NO_1000MS_SAMPLE_DATA_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_NO_1000MS_SAMPLE_DATA_H_
+
+#include <cstdint>
+
+extern const int g_no_1000ms_sample_data_size;
+extern const int16_t g_no_1000ms_sample_data[];
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_NO_1000MS_SAMPLE_DATA_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/osx/Makefile.inc b/tensorflow/lite/experimental/micro/examples/micro_speech/osx/Makefile.inc
new file mode 100644
index 0000000000000000000000000000000000000000..8f8b33a9fa2afca902ef5fbcfa7f641b5cc58028
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/osx/Makefile.inc
@@ -0,0 +1,8 @@
+# Settings for Mac OS platforms.
+ifeq ($(TARGET), osx)
+  LINKER_FLAGS := \
+    -framework Foundation \
+    -framework AudioToolbox
+
+  MICROLITE_LIBS += $(LINKER_FLAGS)
+endif
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/osx/audio_provider.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/osx/audio_provider.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6468c1a95a9cd3f844595bf2c6e88c1e2833823b
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/osx/audio_provider.cc
@@ -0,0 +1,139 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h"
+
+#include <AudioToolbox/AudioToolbox.h>
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_model_settings.h"
+
+namespace {
+
+constexpr int kNumberRecordBuffers = 3;
+bool g_is_audio_initialized = false;
+constexpr int kAudioCaptureBufferSize = kAudioSampleFrequency * 0.5;
+int16_t g_audio_capture_buffer[kAudioCaptureBufferSize];
+int16_t g_audio_output_buffer[kMaxAudioSampleSize];
+int32_t g_latest_audio_timestamp = 0;
+
+// Checks for MacOS errors, prints information and returns a TF Lite version.
+#define RETURN_IF_OS_ERROR(error, error_reporter)                       \
+  do {                                                                  \
+    if (error != noErr) {                                               \
+      error_reporter->Report("Error: %s:%d (%d)\n", __FILE__, __LINE__, \
+                             error);                                    \
+      return kTfLiteError;                                              \
+    }                                                                   \
+  } while (0);
+
+// Called when an audio input buffer has been filled.
+void OnAudioBufferFilledCallback(
+    void* user_data, AudioQueueRef queue, AudioQueueBufferRef buffer,
+    const AudioTimeStamp* start_time, UInt32 num_packets,
+    const AudioStreamPacketDescription* packet_description) {
+  const int sample_size = buffer->mAudioDataByteSize / sizeof(float);
+  const int64_t sample_offset = start_time->mSampleTime;
+  const int32_t time_in_ms =
+      (sample_offset + sample_size) / (kAudioSampleFrequency / 1000);
+  const float* float_samples = static_cast<const float*>(buffer->mAudioData);
+  for (int i = 0; i < sample_size; ++i) {
+    const int capture_index = (sample_offset + i) % kAudioCaptureBufferSize;
+    g_audio_capture_buffer[capture_index] = float_samples[i] * ((1 << 15) - 1);
+  }
+  // This is how we let the outside world know that new audio data has arrived.
+  g_latest_audio_timestamp = time_in_ms;
+  AudioQueueEnqueueBuffer(queue, buffer, 0, nullptr);
+}
+
+// Set up everything we need to capture audio samples from the default recording
+// device on MacOS.
+TfLiteStatus InitAudioRecording(tflite::ErrorReporter* error_reporter) {
+  // Set up the format of the audio - single channel, 32-bit float at 16KHz.
+  AudioStreamBasicDescription recordFormat = {0};
+  recordFormat.mSampleRate = kAudioSampleFrequency;
+  recordFormat.mFormatID = kAudioFormatLinearPCM;
+  recordFormat.mFormatFlags =
+      kAudioFormatFlagIsFloat | kAudioFormatFlagIsPacked;
+  recordFormat.mBitsPerChannel = 8 * sizeof(float);
+  recordFormat.mChannelsPerFrame = 1;
+  recordFormat.mBytesPerFrame = sizeof(float) * recordFormat.mChannelsPerFrame;
+  recordFormat.mFramesPerPacket = 1;
+  recordFormat.mBytesPerPacket =
+      recordFormat.mBytesPerFrame * recordFormat.mFramesPerPacket;
+  recordFormat.mReserved = 0;
+
+  UInt32 propSize = sizeof(recordFormat);
+  RETURN_IF_OS_ERROR(AudioFormatGetProperty(kAudioFormatProperty_FormatInfo, 0,
+                                            NULL, &propSize, &recordFormat),
+                     error_reporter);
+
+  // Create a recording queue.
+  AudioQueueRef queue;
+  RETURN_IF_OS_ERROR(
+      AudioQueueNewInput(&recordFormat, OnAudioBufferFilledCallback,
+                         error_reporter, nullptr, nullptr, 0, &queue),
+      error_reporter);
+
+  // Set up the buffers we'll need.
+  int buffer_bytes = 512 * sizeof(float);
+  for (int i = 0; i < kNumberRecordBuffers; ++i) {
+    AudioQueueBufferRef buffer;
+    RETURN_IF_OS_ERROR(AudioQueueAllocateBuffer(queue, buffer_bytes, &buffer),
+                       error_reporter);
+    RETURN_IF_OS_ERROR(AudioQueueEnqueueBuffer(queue, buffer, 0, nullptr),
+                       error_reporter);
+  }
+
+  // Start capturing audio.
+  RETURN_IF_OS_ERROR(AudioQueueStart(queue, nullptr), error_reporter);
+
+  return kTfLiteOk;
+}
+
+}  // namespace
+
+TfLiteStatus GetAudioSamples(tflite::ErrorReporter* error_reporter,
+                             int start_ms, int duration_ms,
+                             int* audio_samples_size, int16_t** audio_samples) {
+  if (!g_is_audio_initialized) {
+    TfLiteStatus init_status = InitAudioRecording(error_reporter);
+    if (init_status != kTfLiteOk) {
+      return init_status;
+    }
+    for (int i = 0; i < kMaxAudioSampleSize; ++i) {
+      g_audio_output_buffer[i] = 0;
+    }
+    g_is_audio_initialized = true;
+  }
+  // This should only be called when the main thread notices that the latest
+  // audio sample data timestamp has changed, so that there's new data in the
+  // capture ring buffer. The ring buffer will eventually wrap around and
+  // overwrite the data, but the assumption is that the main thread is checking
+  // often enough and the buffer is large enough that this call will be made
+  // before that happens.
+  const int start_offset = start_ms * (kAudioSampleFrequency / 1000);
+  const int duration_sample_count =
+      duration_ms * (kAudioSampleFrequency / 1000);
+  for (int i = 0; i < duration_sample_count; ++i) {
+    const int capture_index = (start_offset + i) % kAudioCaptureBufferSize;
+    g_audio_output_buffer[i] = g_audio_capture_buffer[capture_index];
+  }
+
+  *audio_samples_size = kMaxAudioSampleSize;
+  *audio_samples = g_audio_output_buffer;
+  return kTfLiteOk;
+}
+
+int32_t LatestAudioTimestamp() { return g_latest_audio_timestamp; }
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8187962c3e780a76413134771dc63ba30910f3b6
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.cc
@@ -0,0 +1,139 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h"
+
+#include <limits>
+
+RecognizeCommands::RecognizeCommands(tflite::ErrorReporter* error_reporter,
+                                     int32_t average_window_duration_ms,
+                                     uint8_t detection_threshold,
+                                     int32_t suppression_ms,
+                                     int32_t minimum_count)
+    : error_reporter_(error_reporter),
+      average_window_duration_ms_(average_window_duration_ms),
+      detection_threshold_(detection_threshold),
+      suppression_ms_(suppression_ms),
+      minimum_count_(minimum_count),
+      previous_results_(error_reporter) {
+  previous_top_label_ = "silence";
+  previous_top_label_time_ = std::numeric_limits<int32_t>::min();
+}
+
+TfLiteStatus RecognizeCommands::ProcessLatestResults(
+    const TfLiteTensor* latest_results, const int32_t current_time_ms,
+    const char** found_command, uint8_t* score, bool* is_new_command) {
+  if ((latest_results->dims->size != 2) ||
+      (latest_results->dims->data[0] != 1) ||
+      (latest_results->dims->data[1] != kCategoryCount)) {
+    error_reporter_->Report(
+        "The results for recognition should contain %d elements, but there are "
+        "%d in an %d-dimensional shape",
+        kCategoryCount, latest_results->dims->data[1],
+        latest_results->dims->size);
+    return kTfLiteError;
+  }
+
+  if (latest_results->type != kTfLiteUInt8) {
+    error_reporter_->Report(
+        "The results for recognition should be uint8 elements, but are %d",
+        latest_results->type);
+    return kTfLiteError;
+  }
+
+  if ((!previous_results_.empty()) &&
+      (current_time_ms < previous_results_.front().time_)) {
+    error_reporter_->Report(
+        "Results must be fed in increasing time order, but received a "
+        "timestamp of %d that was earlier than the previous one of %d",
+        current_time_ms, previous_results_.front().time_);
+    return kTfLiteError;
+  }
+
+  // Add the latest results to the head of the queue.
+  previous_results_.push_back({current_time_ms, latest_results->data.uint8});
+
+  // Prune any earlier results that are too old for the averaging window.
+  const int64_t time_limit = current_time_ms - average_window_duration_ms_;
+  while ((!previous_results_.empty()) &&
+         previous_results_.front().time_ < time_limit) {
+    previous_results_.pop_front();
+  }
+
+  // If there are too few results, assume the result will be unreliable and
+  // bail.
+  const int64_t how_many_results = previous_results_.size();
+  const int64_t earliest_time = previous_results_.front().time_;
+  const int64_t samples_duration = current_time_ms - earliest_time;
+  if ((how_many_results < minimum_count_) ||
+      (samples_duration < (average_window_duration_ms_ / 4))) {
+    *found_command = previous_top_label_;
+    *score = 0;
+    *is_new_command = false;
+    return kTfLiteOk;
+  }
+
+  // Calculate the average score across all the results in the window.
+  int32_t average_scores[kCategoryCount];
+  for (int offset = 0; offset < previous_results_.size(); ++offset) {
+    PreviousResultsQueue::Result previous_result =
+        previous_results_.from_front(offset);
+    const uint8_t* scores = previous_result.scores_;
+    for (int i = 0; i < kCategoryCount; ++i) {
+      if (offset == 0) {
+        average_scores[i] = scores[i];
+      } else {
+        average_scores[i] += scores[i];
+      }
+    }
+  }
+  for (int i = 0; i < kCategoryCount; ++i) {
+    average_scores[i] /= how_many_results;
+  }
+
+  // Find the current highest scoring category.
+  int current_top_index = 0;
+  int32_t current_top_score = 0;
+  for (int i = 0; i < kCategoryCount; ++i) {
+    if (average_scores[i] > current_top_score) {
+      current_top_score = average_scores[i];
+      current_top_index = i;
+    }
+  }
+  const char* current_top_label = kCategoryLabels[current_top_index];
+
+  // If we've recently had another label trigger, assume one that occurs too
+  // soon afterwards is a bad result.
+  int64_t time_since_last_top;
+  if ((previous_top_label_ == kCategoryLabels[0]) ||
+      (previous_top_label_time_ == std::numeric_limits<int32_t>::min())) {
+    time_since_last_top = std::numeric_limits<int32_t>::max();
+  } else {
+    time_since_last_top = current_time_ms - previous_top_label_time_;
+  }
+  if ((current_top_score > detection_threshold_) &&
+      (current_top_label != previous_top_label_) &&
+      (time_since_last_top > suppression_ms_)) {
+    previous_top_label_ = current_top_label;
+    previous_top_label_time_ = current_time_ms;
+    *is_new_command = true;
+  } else {
+    *is_new_command = false;
+  }
+  *found_command = current_top_label;
+  *score = current_top_score;
+
+  return kTfLiteOk;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h b/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h
new file mode 100644
index 0000000000000000000000000000000000000000..292cd3e88dcd63f925cb16995b5e8a16554a8547
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h
@@ -0,0 +1,156 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_RECOGNIZE_COMMANDS_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_RECOGNIZE_COMMANDS_H_
+
+#include <cstdint>
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/micro_features/micro_model_settings.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+
+// Partial implementation of std::dequeue, just providing the functionality
+// that's needed to keep a record of previous neural network results over a
+// short time period, so they can be averaged together to produce a more
+// accurate overall prediction. This doesn't use any dynamic memory allocation
+// so it's a better fit for microcontroller applications, but this does mean
+// there are hard limits on the number of results it can store.
+class PreviousResultsQueue {
+ public:
+  PreviousResultsQueue(tflite::ErrorReporter* error_reporter)
+      : error_reporter_(error_reporter), front_index_(0), size_(0) {}
+
+  // Data structure that holds an inference result, and the time when it
+  // was recorded.
+  struct Result {
+    Result() : time_(0), scores_() {}
+    Result(int32_t time, uint8_t* scores) : time_(time) {
+      for (int i = 0; i < kCategoryCount; ++i) {
+        scores_[i] = scores[i];
+      }
+    }
+    int32_t time_;
+    uint8_t scores_[kCategoryCount];
+  };
+
+  int size() { return size_; }
+  bool empty() { return size_ == 0; }
+  Result& front() { return results_[front_index_]; }
+  Result& back() {
+    int back_index = front_index_ + (size_ - 1);
+    if (back_index >= kMaxResults) {
+      back_index -= kMaxResults;
+    }
+    return results_[back_index];
+  }
+
+  void push_back(const Result& entry) {
+    if (size() >= kMaxResults) {
+      error_reporter_->Report(
+          "Couldn't push_back latest result, too many already!");
+      return;
+    }
+    size_ += 1;
+    back() = entry;
+  }
+
+  Result pop_front() {
+    if (size() <= 0) {
+      error_reporter_->Report("Couldn't pop_front result, none present!");
+      return Result();
+    }
+    Result result = front();
+    front_index_ += 1;
+    if (front_index_ >= kMaxResults) {
+      front_index_ = 0;
+    }
+    size_ -= 1;
+    return result;
+  }
+
+  // Most of the functions are duplicates of dequeue containers, but this
+  // is a helper that makes it easy to iterate through the contents of the
+  // queue.
+  Result& from_front(int offset) {
+    if ((offset < 0) || (offset >= size_)) {
+      error_reporter_->Report("Attempt to read beyond the end of the queue!");
+      offset = size_ - 1;
+    }
+    int index = front_index_ + offset;
+    if (index >= kMaxResults) {
+      index -= kMaxResults;
+    }
+    return results_[index];
+  }
+
+ private:
+  tflite::ErrorReporter* error_reporter_;
+  static constexpr int kMaxResults = 50;
+  Result results_[kMaxResults];
+
+  int front_index_;
+  int size_;
+};
+
+// This class is designed to apply a very primitive decoding model on top of the
+// instantaneous results from running an audio recognition model on a single
+// window of samples. It applies smoothing over time so that noisy individual
+// label scores are averaged, increasing the confidence that apparent matches
+// are real.
+// To use it, you should create a class object with the configuration you
+// want, and then feed results from running a TensorFlow model into the
+// processing method. The timestamp for each subsequent call should be
+// increasing from the previous, since the class is designed to process a stream
+// of data over time.
+class RecognizeCommands {
+ public:
+  // labels should be a list of the strings associated with each one-hot score.
+  // The window duration controls the smoothing. Longer durations will give a
+  // higher confidence that the results are correct, but may miss some commands.
+  // The detection threshold has a similar effect, with high values increasing
+  // the precision at the cost of recall. The minimum count controls how many
+  // results need to be in the averaging window before it's seen as a reliable
+  // average. This prevents erroneous results when the averaging window is
+  // initially being populated for example. The suppression argument disables
+  // further recognitions for a set time after one has been triggered, which can
+  // help reduce spurious recognitions.
+  explicit RecognizeCommands(tflite::ErrorReporter* error_reporter,
+                             int32_t average_window_duration_ms = 1000,
+                             uint8_t detection_threshold = 200,
+                             int32_t suppression_ms = 1500,
+                             int32_t minimum_count = 3);
+
+  // Call this with the results of running a model on sample data.
+  TfLiteStatus ProcessLatestResults(const TfLiteTensor* latest_results,
+                                    const int32_t current_time_ms,
+                                    const char** found_command, uint8_t* score,
+                                    bool* is_new_command);
+
+ private:
+  // Configuration
+  tflite::ErrorReporter* error_reporter_;
+  int32_t average_window_duration_ms_;
+  uint8_t detection_threshold_;
+  int32_t suppression_ms_;
+  int32_t minimum_count_;
+
+  // Working variables
+  PreviousResultsQueue previous_results_;
+  const char* previous_top_label_;
+  int32_t previous_top_label_time_;
+};
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_RECOGNIZE_COMMANDS_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6582c948d16f9493a4b1e5bdf43bdc1f30e6dc31
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands_test.cc
@@ -0,0 +1,211 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/recognize_commands.h"
+
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+#include "tensorflow/lite/experimental/micro/testing/test_utils.h"
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(PreviousResultsQueueBasic) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  PreviousResultsQueue queue(error_reporter);
+  TF_LITE_MICRO_EXPECT_EQ(0, queue.size());
+
+  uint8_t scores_a[4] = {0, 0, 0, 1};
+  queue.push_back({0, scores_a});
+  TF_LITE_MICRO_EXPECT_EQ(1, queue.size());
+  TF_LITE_MICRO_EXPECT_EQ(0, queue.front().time_);
+  TF_LITE_MICRO_EXPECT_EQ(0, queue.back().time_);
+
+  uint8_t scores_b[4] = {0, 0, 1, 0};
+  queue.push_back({1, scores_b});
+  TF_LITE_MICRO_EXPECT_EQ(2, queue.size());
+  TF_LITE_MICRO_EXPECT_EQ(0, queue.front().time_);
+  TF_LITE_MICRO_EXPECT_EQ(1, queue.back().time_);
+
+  PreviousResultsQueue::Result pop_result = queue.pop_front();
+  TF_LITE_MICRO_EXPECT_EQ(0, pop_result.time_);
+  TF_LITE_MICRO_EXPECT_EQ(1, queue.size());
+  TF_LITE_MICRO_EXPECT_EQ(1, queue.front().time_);
+  TF_LITE_MICRO_EXPECT_EQ(1, queue.back().time_);
+
+  uint8_t scores_c[4] = {0, 1, 0, 0};
+  queue.push_back({2, scores_c});
+  TF_LITE_MICRO_EXPECT_EQ(2, queue.size());
+  TF_LITE_MICRO_EXPECT_EQ(1, queue.front().time_);
+  TF_LITE_MICRO_EXPECT_EQ(2, queue.back().time_);
+}
+
+TF_LITE_MICRO_TEST(PreviousResultsQueuePushPop) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  PreviousResultsQueue queue(error_reporter);
+  TF_LITE_MICRO_EXPECT_EQ(0, queue.size());
+
+  for (int i = 0; i < 123; ++i) {
+    uint8_t scores[4] = {0, 0, 0, 1};
+    queue.push_back({i, scores});
+    TF_LITE_MICRO_EXPECT_EQ(1, queue.size());
+    TF_LITE_MICRO_EXPECT_EQ(i, queue.front().time_);
+    TF_LITE_MICRO_EXPECT_EQ(i, queue.back().time_);
+
+    PreviousResultsQueue::Result pop_result = queue.pop_front();
+    TF_LITE_MICRO_EXPECT_EQ(i, pop_result.time_);
+    TF_LITE_MICRO_EXPECT_EQ(0, queue.size());
+  }
+}
+
+TF_LITE_MICRO_TEST(RecognizeCommandsTestBasic) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  RecognizeCommands recognize_commands(error_reporter);
+
+  TfLiteTensor results = tflite::testing::CreateQuantizedTensor(
+      {255, 0, 0, 0}, tflite::testing::IntArrayFromInitializer({2, 1, 4}),
+      "input_tensor", 0.0f, 128.0f);
+
+  const char* found_command;
+  uint8_t score;
+  bool is_new_command;
+  TF_LITE_MICRO_EXPECT_EQ(
+      kTfLiteOk, recognize_commands.ProcessLatestResults(
+                     &results, 0, &found_command, &score, &is_new_command));
+}
+
+TF_LITE_MICRO_TEST(RecognizeCommandsTestFindCommands) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  RecognizeCommands recognize_commands(error_reporter, 1000, 51);
+
+  TfLiteTensor yes_results = tflite::testing::CreateQuantizedTensor(
+      {0, 0, 255, 0}, tflite::testing::IntArrayFromInitializer({2, 1, 4}),
+      "input_tensor", 0.0f, 128.0f);
+
+  bool has_found_new_command = false;
+  const char* new_command;
+  for (int i = 0; i < 10; ++i) {
+    const char* found_command;
+    uint8_t score;
+    bool is_new_command;
+    int32_t current_time_ms = 0 + (i * 100);
+    TF_LITE_MICRO_EXPECT_EQ(
+        kTfLiteOk, recognize_commands.ProcessLatestResults(
+                       &yes_results, current_time_ms, &found_command, &score,
+                       &is_new_command));
+    if (is_new_command) {
+      TF_LITE_MICRO_EXPECT(!has_found_new_command);
+      has_found_new_command = true;
+      new_command = found_command;
+    }
+  }
+  TF_LITE_MICRO_EXPECT(has_found_new_command);
+  if (has_found_new_command) {
+    TF_LITE_MICRO_EXPECT_EQ(0, tflite::testing::TestStrcmp("yes", new_command));
+  }
+
+  TfLiteTensor no_results = tflite::testing::CreateQuantizedTensor(
+      {0, 0, 0, 255}, tflite::testing::IntArrayFromInitializer({2, 1, 4}),
+      "input_tensor", 0.0f, 128.0f);
+  has_found_new_command = false;
+  new_command = "";
+  uint8_t score;
+  for (int i = 0; i < 10; ++i) {
+    const char* found_command;
+    bool is_new_command;
+    int32_t current_time_ms = 1000 + (i * 100);
+    TF_LITE_MICRO_EXPECT_EQ(
+        kTfLiteOk, recognize_commands.ProcessLatestResults(
+                       &no_results, current_time_ms, &found_command, &score,
+                       &is_new_command));
+    if (is_new_command) {
+      TF_LITE_MICRO_EXPECT(!has_found_new_command);
+      has_found_new_command = true;
+      new_command = found_command;
+    }
+  }
+  TF_LITE_MICRO_EXPECT(has_found_new_command);
+  if (has_found_new_command) {
+    TF_LITE_MICRO_EXPECT_EQ(231, score);
+    TF_LITE_MICRO_EXPECT_EQ(0, tflite::testing::TestStrcmp("no", new_command));
+  }
+}
+
+TF_LITE_MICRO_TEST(RecognizeCommandsTestBadInputLength) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  RecognizeCommands recognize_commands(error_reporter, 1000, 51);
+
+  TfLiteTensor bad_results = tflite::testing::CreateQuantizedTensor(
+      {0, 0, 255}, tflite::testing::IntArrayFromInitializer({2, 1, 3}),
+      "input_tensor", 0.0f, 128.0f);
+
+  const char* found_command;
+  uint8_t score;
+  bool is_new_command;
+  TF_LITE_MICRO_EXPECT_NE(
+      kTfLiteOk, recognize_commands.ProcessLatestResults(
+                     &bad_results, 0, &found_command, &score, &is_new_command));
+}
+
+TF_LITE_MICRO_TEST(RecognizeCommandsTestBadInputTimes) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  RecognizeCommands recognize_commands(error_reporter, 1000, 51);
+
+  TfLiteTensor results = tflite::testing::CreateQuantizedTensor(
+      {0, 0, 255, 0}, tflite::testing::IntArrayFromInitializer({2, 1, 4}),
+      "input_tensor", 0.0f, 128.0f);
+
+  const char* found_command;
+  uint8_t score;
+  bool is_new_command;
+  TF_LITE_MICRO_EXPECT_EQ(
+      kTfLiteOk, recognize_commands.ProcessLatestResults(
+                     &results, 100, &found_command, &score, &is_new_command));
+  TF_LITE_MICRO_EXPECT_NE(
+      kTfLiteOk, recognize_commands.ProcessLatestResults(
+                     &results, 0, &found_command, &score, &is_new_command));
+}
+
+TF_LITE_MICRO_TEST(RecognizeCommandsTestTooFewInputs) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  RecognizeCommands recognize_commands(error_reporter, 1000, 51);
+
+  TfLiteTensor results = tflite::testing::CreateQuantizedTensor(
+      {0, 0, 255, 0}, tflite::testing::IntArrayFromInitializer({2, 1, 4}),
+      "input_tensor", 0.0f, 128.0f);
+
+  const char* found_command;
+  uint8_t score;
+  bool is_new_command;
+  TF_LITE_MICRO_EXPECT_EQ(
+      kTfLiteOk, recognize_commands.ProcessLatestResults(
+                     &results, 100, &found_command, &score, &is_new_command));
+  TF_LITE_MICRO_EXPECT_EQ(0, score);
+  TF_LITE_MICRO_EXPECT_EQ(false, is_new_command);
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/CMSIS/simple_features_generator.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/CMSIS/simple_features_generator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..403976e222fe549f6f8c755bf7460d245d9370e8
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/CMSIS/simple_features_generator.cc
@@ -0,0 +1,95 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator.h"
+
+extern "C" {
+#define IFFT_FLAG_R 0
+#define BIT_REVERSE_FLAG 1
+#define FFT_SIZE 512
+#define FFT_SIZE_DIV2 256
+#include <arm_math.h>
+#include "arm_cmplx_mag_squared_q10p6.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS/hanning.h"
+}
+
+void quantize(q15_t* bufA, q15_t* bufB, uint8_t* output);
+
+q15_t bufA[FFT_SIZE];
+q15_t bufB[FFT_SIZE];
+arm_rfft_instance_q15 S_arm_fft;
+arm_status arm_math_status;
+
+namespace {
+// These constants allow us to allocate fixed-sized arrays on the stack for our
+// working memory.
+constexpr int kInputSize = 512;
+constexpr int kAverageWindowSize = 6;
+constexpr int kOutputSize =
+    ((kInputSize / 2) + (kAverageWindowSize - 1)) / kAverageWindowSize;
+}  // namespace
+
+TfLiteStatus GenerateSimpleFeatures(tflite::ErrorReporter* error_reporter,
+                                    const int16_t* input, int input_size,
+                                    int output_size, uint8_t* output) {
+  if (input_size > kInputSize) {
+    error_reporter->Report("Input size %d larger than %d", input_size,
+                           kInputSize);
+    return kTfLiteError;
+  }
+  if (output_size != kOutputSize) {
+    error_reporter->Report("Requested output size %d doesn't match %d",
+                           output_size, kOutputSize);
+    return kTfLiteError;
+  }
+
+  // 30ms at 16 kHz = 480 samples
+  // We want to pad the rest of the 512-sample buffer with zeros
+  arm_mult_q15((q15_t*)input, g_hanning, bufB, 480);
+  int i;
+  for (i = 480; i < 512; i++) {
+    bufB[i] = 0;
+  }
+
+  // Should move init code outside of Preprocess() function
+  arm_math_status =
+      arm_rfft_init_q15(&S_arm_fft, FFT_SIZE, IFFT_FLAG_R, BIT_REVERSE_FLAG);
+  arm_rfft_q15(&S_arm_fft, bufB, bufA);
+
+  // The rfft function packs data as follows:
+  // {real[0], real[N/2], real[1], imag[1], ..., real[N/2-1], imag[N/2-1]}
+  // Below we pack as follows:
+  // {real[0], 0, real[1], imag[1], ..., real[N/2-1], imag[N/2-1, real[N/2], 0}
+  bufA[FFT_SIZE_DIV2] = bufA[1];
+  bufA[FFT_SIZE_DIV2 + 1] = 0;
+  bufA[1] = 0;
+  arm_cmplx_mag_squared_q10p6(bufA, bufB, FFT_SIZE_DIV2 + 1);
+
+  quantize(bufA, bufB, output);
+
+  return kTfLiteOk;
+}
+
+void quantize(q15_t* bufA, q15_t* bufB, uint8_t* output) {
+  int i;
+  for (i = 0; i < 42; i++) {
+    arm_mean_q15(bufB + 6 * i, 6, bufA + i);
+  }
+  arm_mean_q15(bufB + 252, 5, bufA + 42);
+
+  for (i = 0; i < 43; i++) {
+    output[i] = (uint8_t)(bufA[i] >> 5);
+  }
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/fixed_point/preprocessor.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/fixed_point/simple_features_generator.cc
similarity index 96%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/fixed_point/preprocessor.cc
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/fixed_point/simple_features_generator.cc
index b623d8d11b75d59600cc6a029527d3957084a328..ad11684b0a94e630580aa9a95d4b1db92f914d6f 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/fixed_point/preprocessor.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/fixed_point/simple_features_generator.cc
@@ -27,11 +27,11 @@ limitations under the License.
 // instead of floating point, to help show how this can work on platforms that
 // don't have good float support.
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator.h"
 
 #include <cmath>
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_model_settings.h"
 
 namespace {
 
@@ -118,9 +118,9 @@ void CalculatePeriodicHann(int window_length, int16_t* window_function) {
 
 }  // namespace
 
-TfLiteStatus Preprocess(tflite::ErrorReporter* error_reporter,
-                        const int16_t* input, int input_size, int output_size,
-                        uint8_t* output) {
+TfLiteStatus GenerateSimpleFeatures(tflite::ErrorReporter* error_reporter,
+                                    const int16_t* input, int input_size,
+                                    int output_size, uint8_t* output) {
   // Ensure our input and output data arrays are valid.
   if (input_size > kMaxAudioSampleSize) {
     error_reporter->Report("Input size %d larger than %d", input_size,
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_power_spectrum_data.cc
similarity index 95%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.cc
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_power_spectrum_data.cc
index c4fc5c33bb329cba4e1abcf6d36b01f14e9e2b27..0b20f2f86fb6455d4251cb81d3e70c3c15de7c6b 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_power_spectrum_data.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 // See the header for documentation on the meaning of this data.
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_power_spectrum_data.h"
 
 const uint8_t g_no_power_spectrum_data[g_no_power_spectrum_data_size] = {
     255, 7, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.h b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_power_spectrum_data.h
similarity index 90%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.h
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_power_spectrum_data.h
index fa39d3c70d78ce261db81cf8ad7c416efd2c468c..9693950fb5ee1d56242b83c6265e9e2315ec8971 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.h
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_power_spectrum_data.h
@@ -18,12 +18,12 @@ limitations under the License.
 // This is the expected result of running the sample data in
 // no_30ms_sample_data.cc through through the preprocessing pipeline.
 
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_NO_POWER_SPECTRUM_DATA_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_NO_POWER_SPECTRUM_DATA_H_
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_NO_POWER_SPECTRUM_DATA_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_NO_POWER_SPECTRUM_DATA_H_
 
 #include <cstdint>
 
 constexpr int g_no_power_spectrum_data_size = 43;
 extern const uint8_t g_no_power_spectrum_data[];
 
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_NO_POWER_SPECTRUM_DATA_H_
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_NO_POWER_SPECTRUM_DATA_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_simple_features_data.cc
similarity index 97%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.cc
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_simple_features_data.cc
index e98c84f7ed2e678eb91580a2b6fb69514cee4740..3d3a9538fb527888e3bdf0e1aa9ca00d4d5f1544 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_simple_features_data.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_simple_features_data.h"
 
 /* File automatically created by
  * tensorflow/examples/speech_commands/wav_to_features.py \
@@ -22,15 +22,15 @@ limitations under the License.
  * --window_size_ms=30 \
  * --window_stride_ms=20 \
  * --feature_bin_count=40 \
- * --quantize \
+ * --quantize=1 \
  * --preprocess="average" \
  * --input_wav="speech_commands_test_set_v0.02/no/f9643d42_nohash_4.wav" \
- * --output_c_file="no_features_data.cc" \
+ * --output_c_file="no_simple_features_data.cc" \
  */
 
-const int g_no_f9643d42_nohash_4_width = 43;
-const int g_no_f9643d42_nohash_4_height = 49;
-const unsigned char g_no_f9643d42_nohash_4_data[] = {
+const int g_no_simple_f9643d42_nohash_4_width = 43;
+const int g_no_simple_f9643d42_nohash_4_height = 49;
+const unsigned char g_no_simple_f9643d42_nohash_4_data[] = {
     0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
     0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
     0, 0,   0,   0,  0, 0,   0,  0,  0,   0,   0,  0, 0,   0,   0,  0,  0,   0,
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.h b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_simple_features_data.h
similarity index 73%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.h
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_simple_features_data.h
index 39a3bb914cc1986aa851ace0e39ce63ed1a93282..30332b30c5c8325edb53713d572fcf987446844a 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.h
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_simple_features_data.h
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_YES_FEATURES_DATA_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_YES_FEATURES_DATA_H_
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_NO_SIMPLE_FEATURES_DATA_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_NO_SIMPLE_FEATURES_DATA_H_
 
-extern const int g_yes_f2e59fea_nohash_1_width;
-extern const int g_yes_f2e59fea_nohash_1_height;
-extern const unsigned char g_yes_f2e59fea_nohash_1_data[];
+extern const int g_no_simple_f9643d42_nohash_4_width;
+extern const int g_no_simple_f9643d42_nohash_4_height;
+extern const unsigned char g_no_simple_f9643d42_nohash_4_data[];
 
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_YES_FEATURES_DATA_H_
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_NO_SIMPLE_FEATURES_DATA_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator.cc
similarity index 90%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator.cc
index f4a7f801cc6251b82339509f691fd64012fbe390..3aa05b7bf1d5d1762c9c6744ac8a5fe99f922332 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator.cc
@@ -24,14 +24,17 @@ limitations under the License.
 // functions used here, for example replacing the DFT with an FFT, so this
 // version shouldn't be used where performance is critical.
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator.h"
 
 #include <cmath>
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_model_settings.h"
 
 namespace {
 
+// Needed because some platforms don't have M_PI defined.
+constexpr float kPi = 3.14159265358979323846f;
+
 // Performs a discrete Fourier transform on the real inputs. This corresponds to
 // rdft() in the FFT package at http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html,
 // and to kiss_fftr() in KISSFFT at https://github.com/mborgerding/kissfft.
@@ -48,11 +51,11 @@ void CalculateDiscreteFourierTransform(float* time_series, int time_series_size,
   for (int i = 0; i < time_series_size / 2; ++i) {
     float real = 0;
     for (int j = 0; j < time_series_size; ++j) {
-      real += time_series[j] * cos(j * i * M_PI * 2 / time_series_size);
+      real += time_series[j] * cos(j * i * kPi * 2 / time_series_size);
     }
     float imaginary = 0;
     for (int j = 0; j < time_series_size; ++j) {
-      imaginary -= time_series[j] * sin(j * i * M_PI * 2 / time_series_size);
+      imaginary -= time_series[j] * sin(j * i * kPi * 2 / time_series_size);
     }
     fourier_output[(i * 2) + 0] = real;
     fourier_output[(i * 2) + 1] = imaginary;
@@ -63,15 +66,15 @@ void CalculateDiscreteFourierTransform(float* time_series, int time_series_size,
 // of the current sample window are weighted more heavily than those at the end.
 void CalculatePeriodicHann(int window_length, float* window_function) {
   for (int i = 0; i < window_length; ++i) {
-    window_function[i] = 0.5 - 0.5 * cos((2 * M_PI * i) / window_length);
+    window_function[i] = 0.5 - 0.5 * cos((2 * kPi * i) / window_length);
   }
 }
 
 }  // namespace
 
-TfLiteStatus Preprocess(tflite::ErrorReporter* error_reporter,
-                        const int16_t* input, int input_size, int output_size,
-                        uint8_t* output) {
+TfLiteStatus GenerateSimpleFeatures(tflite::ErrorReporter* error_reporter,
+                                    const int16_t* input, int input_size,
+                                    int output_size, uint8_t* output) {
   // Ensure our input and output data arrays are valid.
   if (input_size > kMaxAudioSampleSize) {
     error_reporter->Report("Input size %d larger than %d", input_size,
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator.h
similarity index 78%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator.h
index adff790d6cc527578dbfb9dc481c99c1021b92db..f4e86b18a4c3d1c0a5beb32eb6806faaf1c11c14 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_PREPROCESSOR_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_PREPROCESSOR_H_
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_SIMPLE_FEATURES_GENERATOR_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_SIMPLE_FEATURES_GENERATOR_H_
 
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
@@ -24,8 +24,8 @@ limitations under the License.
 // both floating point and fixed point available, but because the calculations
 // involved can be time-consuming, it's recommended that you use or write
 // specialized versions for your platform.
-TfLiteStatus Preprocess(tflite::ErrorReporter* error_reporter,
-                        const int16_t* input, int input_size, int output_size,
-                        uint8_t* output);
+TfLiteStatus GenerateSimpleFeatures(tflite::ErrorReporter* error_reporter,
+                                    const int16_t* input, int input_size,
+                                    int output_size, uint8_t* output);
 
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_PREPROCESSOR_H_
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_SIMPLE_FEATURES_GENERATOR_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator_test.cc
similarity index 90%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor_test.cc
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator_test.cc
index e8b49f67e3d72faa4700c4bdec7f94a5b79cd72e..65e526327c77c727ec88cee421a466f0df34ee76 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor_test.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator_test.cc
@@ -13,23 +13,23 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_features_generator.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/no_30ms_sample_data.h"
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/no_power_spectrum_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_power_spectrum_data.h"
 #include "tensorflow/lite/experimental/micro/examples/micro_speech/yes_30ms_sample_data.h"
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.h"
 #include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
 #include "tensorflow/lite/experimental/micro/testing/micro_test.h"
 
 TF_LITE_MICRO_TESTS_BEGIN
 
-TF_LITE_MICRO_TEST(TestPreprocessor) {
+TF_LITE_MICRO_TEST(TestSimpleFeaturesGenerator) {
   tflite::MicroErrorReporter micro_error_reporter;
   tflite::ErrorReporter* error_reporter = &micro_error_reporter;
 
   uint8_t yes_calculated_data[g_yes_power_spectrum_data_size];
-  TfLiteStatus yes_status = Preprocess(
+  TfLiteStatus yes_status = GenerateSimpleFeatures(
       error_reporter, g_yes_30ms_sample_data, g_yes_30ms_sample_data_size,
       g_yes_power_spectrum_data_size, yes_calculated_data);
   TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, yes_status);
@@ -45,7 +45,7 @@ TF_LITE_MICRO_TEST(TestPreprocessor) {
   }
 
   uint8_t no_calculated_data[g_yes_power_spectrum_data_size];
-  TfLiteStatus no_status = Preprocess(
+  TfLiteStatus no_status = GenerateSimpleFeatures(
       error_reporter, g_no_30ms_sample_data, g_no_30ms_sample_data_size,
       g_no_power_spectrum_data_size, no_calculated_data);
   TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, no_status);
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_model_settings.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_model_settings.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4842f8dbd907dbbd73aab14c7767a8d64476b52d
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_model_settings.cc
@@ -0,0 +1,23 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_model_settings.h"
+
+const char* kCategoryLabels[kCategoryCount] = {
+    "silence",
+    "unknown",
+    "yes",
+    "no",
+};
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_model_settings.h
similarity index 91%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_model_settings.h
index 1d8f3123a57bc5b807d39151adaf64f29d2f5f95..d31d6b33622b3a15c90fab4c52d7452960a54930 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/simple_model_settings.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MODEL_SETTINGS_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MODEL_SETTINGS_H_
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_SIMPLE_MODEL_SETTINGS_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_SIMPLE_MODEL_SETTINGS_H_
 
 // Keeping these as constant expressions allow us to allocate fixed-sized arrays
 // on the stack for our working memory.
@@ -23,6 +23,7 @@ limitations under the License.
 // frequency information. This has to be a power of two, and since we're dealing
 // with 30ms of 16KHz inputs, which means 480 samples, this is the next value.
 constexpr int kMaxAudioSampleSize = 512;
+constexpr int kAudioSampleFrequency = 16000;
 
 // All of these values are derived from the values used during model training,
 // if you change your model you'll need to update these constants.
@@ -39,4 +40,4 @@ constexpr int kSilenceIndex = 0;
 constexpr int kUnknownIndex = 1;
 extern const char* kCategoryLabels[kCategoryCount];
 
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MODEL_SETTINGS_H_
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_SIMPLE_MODEL_SETTINGS_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/tiny_conv_simple_features_model_data.cc
similarity index 99%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.cc
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/tiny_conv_simple_features_model_data.cc
index 62e4359859a422c96ec368b6f91cba99e3c4a4eb..a14412edc941e8a7df0aef9dd66b79b1d9a1d7a6 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/tiny_conv_simple_features_model_data.cc
@@ -14,12 +14,12 @@ limitations under the License.
 ==============================================================================*/
 
 // Automatically created from a TensorFlow Lite flatbuffer using the command:
-// xxd -i tiny_conv.tflite > tiny_conv_model_data.cc
+// xxd -i tiny_conv.tflite > tiny_conv_simple_features_model_data.cc
 // See the README for a full description of the creation process.
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/tiny_conv_simple_features_model_data.h"
 
-const unsigned char g_tiny_conv_model_data[] = {
+const unsigned char g_tiny_conv_simple_features_model_data[] = {
     0x18, 0x00, 0x00, 0x00, 0x54, 0x46, 0x4c, 0x33, 0x00, 0x00, 0x0e, 0x00,
     0x18, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x14, 0x00,
     0x0e, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x08, 0x4d, 0x00, 0x00,
@@ -1670,4 +1670,4 @@ const unsigned char g_tiny_conv_model_data[] = {
     0x04, 0x00, 0x00, 0x00, 0xfa, 0xff, 0xff, 0xff, 0x00, 0x19, 0x06, 0x00,
     0x06, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x09, 0x06, 0x00,
     0x08, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04};
-const int g_tiny_conv_model_data_len = 19800;
+const int g_tiny_conv_simple_features_model_data_len = 19800;
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/tiny_conv_simple_features_model_data.h
similarity index 74%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/tiny_conv_simple_features_model_data.h
index a465dbfabf7cbba44473ae7e2ff94b1de2092b20..cadf7d0de754e032ae9ff77cdd8deec43bc03847 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/tiny_conv_simple_features_model_data.h
@@ -16,12 +16,12 @@ limitations under the License.
 // This is a standard TensorFlow Lite model file that has been converted into a
 // C data array, so it can be easily compiled into a binary for devices that
 // don't have a file system. It was created using the command:
-// xxd -i tiny_conv.tflite > tiny_conv_model_data.cc
+// xxd -i tiny_conv.tflite > tiny_conv_simple_features_model_data.cc
 
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_TINY_CONV_MODEL_DATA_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_TINY_CONV_MODEL_DATA_H_
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_TINY_CONV_SIMPLE_FEATURES_MODEL_DATA_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_TINY_CONV_SIMPLE_FEATURES_MODEL_DATA_H_
 
-extern const unsigned char g_tiny_conv_model_data[];
-extern const int g_tiny_conv_model_data_len;
+extern const unsigned char g_tiny_conv_simple_features_model_data[];
+extern const int g_tiny_conv_simple_features_model_data_len;
 
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_TINY_CONV_MODEL_DATA_H_
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_TINY_CONV_SIMPLE_FEATURES_MODEL_DATA_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_power_spectrum_data.cc
similarity index 95%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.cc
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_power_spectrum_data.cc
index 9a34a2045a221e2eee8c51f23000e819b1638499..cd46408c0fb5c2c5dad12ae67c5456c8cb178b2d 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_power_spectrum_data.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 // See the header for documentation on the meaning of this data.
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_power_spectrum_data.h"
 
 const uint8_t g_yes_power_spectrum_data[g_yes_power_spectrum_data_size] = {
     8, 89, 8, 0, 0, 0, 0, 0, 0, 0, 0, 4, 13, 1, 6, 23, 20, 6, 4, 0, 0, 0,
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.h b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_power_spectrum_data.h
similarity index 90%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.h
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_power_spectrum_data.h
index 5c8c00ac1116dcbd7ad4aeda1828603e962c2001..77e52d58b54763ec8df46729ab6f8dd84086d59b 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.h
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_power_spectrum_data.h
@@ -18,12 +18,12 @@ limitations under the License.
 // This is the expected result of running the sample data in
 // yes_30ms_sample_data.cc through through the preprocessing pipeline.
 
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_YES_POWER_SPECTRUM_DATA_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_YES_POWER_SPECTRUM_DATA_H_
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_YES_POWER_SPECTRUM_DATA_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_YES_POWER_SPECTRUM_DATA_H_
 
 #include <cstdint>
 
 constexpr int g_yes_power_spectrum_data_size = 43;
 extern const uint8_t g_yes_power_spectrum_data[];
 
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_YES_POWER_SPECTRUM_DATA_H_
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_YES_POWER_SPECTRUM_DATA_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_simple_features_data.cc
similarity index 97%
rename from tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.cc
rename to tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_simple_features_data.cc
index 2eb737fb8e1204a02f7ea4852016e85d03980bfd..2d660bb8b5c5b825eb48490699c89e5ba241369f 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_simple_features_data.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_simple_features_data.h"
 
 /* File automatically created by
  * tensorflow/examples/speech_commands/wav_to_features.py \
@@ -22,15 +22,15 @@ limitations under the License.
  * --window_size_ms=30 \
  * --window_stride_ms=20 \
  * --feature_bin_count=40 \
- * --quantize \
+ * --quantize=1 \
  * --preprocess="average" \
  * --input_wav="speech_commands_test_set_v0.02/yes/f2e59fea_nohash_1.wav" \
- * --output_c_file="yes_features_data.cc" \
+ * --output_c_file="yes_simple_features_data.cc" \
  */
 
-const int g_yes_f2e59fea_nohash_1_width = 43;
-const int g_yes_f2e59fea_nohash_1_height = 49;
-const unsigned char g_yes_f2e59fea_nohash_1_data[] = {
+const int g_yes_simple_f2e59fea_nohash_1_width = 43;
+const int g_yes_simple_f2e59fea_nohash_1_height = 49;
+const unsigned char g_yes_simple_f2e59fea_nohash_1_data[] = {
     0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
     0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
     0,  0,  0,   0,   0,   0,  0,   0,   0,  0,  0,   0,   0,  0,  0,   0,  0,
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_simple_features_data.h b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_simple_features_data.h
new file mode 100644
index 0000000000000000000000000000000000000000..87ea4a4aea89d02189bca9c37872e27b95672190
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/simple_features/yes_simple_features_data.h
@@ -0,0 +1,23 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_YES_SIMPLE_FEATURES_DATA_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_YES_SIMPLE_FEATURES_DATA_H_
+
+extern const int g_yes_simple_f2e59fea_nohash_1_width;
+extern const int g_yes_simple_f2e59fea_nohash_1_height;
+extern const unsigned char g_yes_simple_f2e59fea_nohash_1_data[];
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_SIMPLE_FEATURES_YES_SIMPLE_FEATURES_DATA_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/timer_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/timer_test.cc
deleted file mode 100644
index 0487a12b25fc17208f1d9ab2b51538102f7ec914..0000000000000000000000000000000000000000
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/timer_test.cc
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/lite/experimental/micro/examples/micro_speech/timer.h"
-
-#include <limits>
-
-#include "tensorflow/lite/c/c_api_internal.h"
-#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
-#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
-
-TF_LITE_MICRO_TESTS_BEGIN
-
-TF_LITE_MICRO_TEST(TestTimer) {
-  // Make sure that the technically-undefined overflow behavior we rely on below
-  // works on this platform. It's still not guaranteed, but at least this is a
-  // sanity check.  Turn off when running with ASan, as it will complain about
-  // the following undefined behavior.
-#ifndef ADDRESS_SANITIZER
-  int32_t overflow_value = std::numeric_limits<int32_t>::max();
-  overflow_value += 1;
-  TF_LITE_MICRO_EXPECT_EQ(std::numeric_limits<int32_t>::min(), overflow_value);
-#endif
-
-  const int32_t first_time = TimeInMilliseconds();
-  const int32_t second_time = TimeInMilliseconds();
-
-  // It's possible that the timer may have wrapped around from +BIG_NUM to
-  // -BIG_NUM between the first and second calls, since we're storing
-  // milliseconds in a 32-bit integer. It's not reasonable that the call itself
-  // would have taken more than 2^31 milliseconds though, so look at the
-  // difference and rely on integer overflow to ensure it's accurate.
-  const int32_t time_delta = (second_time - first_time);
-  TF_LITE_MICRO_EXPECT_LE(0, time_delta);
-}
-
-TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e5f6ceb3f0b3935d084fa9463c72e98d4e0cad83
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.cc
@@ -0,0 +1,1800 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// See the header for documentation on the meaning of this data.
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.h"
+
+const int g_yes_1000ms_sample_data_size = 16000;
+const int16_t g_yes_1000ms_sample_data[16000] = {
+    -7,     -12,    -18,    -20,    -20,    -21,    -21,    -25,    -29,
+    -31,    -31,    -30,    -30,    -29,    -30,    -30,    -29,    -28,
+    -24,    -22,    -17,    -12,    -8,     -7,     -6,     -1,     2,
+    5,      7,      8,      11,     15,     18,     19,     23,     24,
+    24,     27,     27,     26,     25,     28,     30,     32,     33,
+    31,     29,     27,     28,     30,     28,     26,     26,     24,
+    22,     17,     16,     15,     13,     10,     5,      0,      -4,
+    -4,     -7,     -9,     -12,    -14,    -14,    -13,    -11,    -10,
+    -8,     -6,     -3,     3,      7,      8,      12,     15,     18,
+    21,     19,     19,     21,     23,     24,     23,     22,     19,
+    17,     11,     5,      -3,     -12,    -22,    -28,    -35,    -45,
+    -54,    -62,    -69,    -76,    -84,    -92,    -100,   -109,   -116,
+    -117,   -120,   -120,   -120,   -122,   -124,   -126,   -123,   -121,
+    -116,   -113,   -107,   -97,    -88,    -75,    -61,    -50,    -41,
+    -27,    -12,    4,      21,     37,     58,     76,     93,     108,
+    121,    137,    156,    172,    184,    196,    205,    215,    224,
+    235,    242,    245,    242,    240,    238,    231,    223,    214,
+    205,    195,    178,    158,    135,    112,    90,     69,     46,
+    19,     -11,    -45,    -76,    -105,   -133,   -159,   -186,   -211,
+    -236,   -260,   -280,   -294,   -308,   -320,   -331,   -336,   -338,
+    -335,   -326,   -316,   -301,   -286,   -267,   -246,   -225,   -203,
+    -180,   -154,   -124,   -91,    -59,    -34,    -8,     19,     42,
+    64,     87,     103,    119,    134,    148,    162,    174,    182,
+    188,    190,    189,    187,    184,    180,    177,    171,    162,
+    154,    144,    137,    129,    118,    106,    95,     81,     69,
+    58,     48,     37,     26,     14,     3,      -7,     -22,    -31,
+    -42,    -52,    -62,    -69,    -75,    -79,    -82,    -87,    -88,
+    -92,    -94,    -91,    -87,    -85,    -81,    -74,    -70,    -64,
+    -55,    -47,    -40,    -33,    -25,    -19,    -12,    -6,     -4,
+    -1,     1,      1,      -2,     -9,     -15,    -17,    -18,    -20,
+    -22,    -22,    -26,    -31,    -33,    -35,    -31,    -26,    -17,
+    -4,     8,      19,     31,     44,     54,     64,     71,     79,
+    86,     92,     102,    109,    111,    109,    104,    96,     84,
+    70,     60,     51,     38,     27,     13,     4,      -3,     -9,
+    -13,    -18,    -26,    -33,    -32,    -27,    -20,    -10,    -4,
+    2,      6,      10,     14,     16,     21,     25,     29,     31,
+    33,     35,     37,     33,     22,     15,     13,     11,     12,
+    9,      5,      2,      1,      -3,     -9,     -17,    -27,    -32,
+    -35,    -36,    -36,    -42,    -50,    -56,    -66,    -77,    -85,
+    -96,    -100,   -106,   -113,   -118,   -121,   -119,   -117,   -119,
+    -122,   -124,   -123,   -112,   -94,    -77,    -64,    -51,    -37,
+    -22,    -3,     17,     37,     54,     68,     86,     100,    114,
+    134,    154,    167,    174,    178,    182,    189,    189,    187,
+    185,    179,    177,    174,    171,    157,    138,    123,    108,
+    94,     76,     50,     25,     6,      -8,     -20,    -37,    -59,
+    -86,    -110,   -132,   -147,   -159,   -169,   -178,   -191,   -203,
+    -213,   -217,   -215,   -208,   -199,   -194,   -195,   -190,   -178,
+    -165,   -155,   -144,   -134,   -123,   -103,   -80,    -56,    -35,
+    -18,    -4,     11,     23,     36,     50,     65,     78,     93,
+    111,    122,    129,    132,    131,    127,    125,    126,    126,
+    128,    127,    125,    122,    118,    111,    108,    104,    99,
+    93,     89,     90,     87,     82,     78,     75,     68,     65,
+    67,     69,     66,     61,     54,     39,     28,     15,     3,
+    -7,     -18,    -25,    -29,    -35,    -42,    -52,    -66,    -78,
+    -83,    -85,    -86,    -86,    -82,    -83,    -84,    -83,    -81,
+    -75,    -62,    -57,    -53,    -49,    -46,    -41,    -34,    -26,
+    -16,    -10,    -7,     -2,     2,      6,      12,     15,     19,
+    18,     15,     17,     21,     24,     30,     33,     27,     22,
+    21,     20,     23,     24,     21,     15,     13,     8,      3,
+    1,      -1,     -3,     -4,     -6,     -9,     -11,    -11,    -8,
+    -10,    -13,    -15,    -19,    -17,    -11,    -2,     1,      2,
+    6,      9,      10,     12,     13,     9,      8,      10,     13,
+    20,     18,     13,     10,     4,      1,      -2,     -6,     -11,
+    -13,    -16,    -18,    -15,    -18,    -21,    -21,    -22,    -23,
+    -25,    -23,    -22,    -20,    -19,    -16,    -12,    -10,    -9,
+    -11,    -15,    -19,    -22,    -19,    -14,    -11,    -9,     -11,
+    -17,    -20,    -18,    -19,    -15,    -11,    -8,     -2,     8,
+    19,     30,     36,     37,     36,     38,     45,     57,     69,
+    77,     81,     79,     75,     76,     74,     69,     66,     60,
+    53,     45,     36,     28,     22,     17,     10,     0,      -5,
+    -11,    -15,    -18,    -26,    -31,    -33,    -34,    -34,    -35,
+    -37,    -37,    -35,    -28,    -24,    -29,    -37,    -45,    -46,
+    -41,    -36,    -31,    -32,    -33,    -37,    -37,    -36,    -36,
+    -34,    -27,    -19,    -14,    -11,    -8,     -1,     6,      14,
+    19,     21,     25,     30,     34,     38,     38,     33,     26,
+    22,     19,     20,     18,     17,     15,     10,     2,      -3,
+    -5,     -10,    -13,    -13,    -13,    -16,    -16,    -16,    -15,
+    -13,    -14,    -13,    -16,    -19,    -20,    -18,    -17,    -18,
+    -16,    -16,    -24,    -28,    -28,    -28,    -23,    -21,    -21,
+    -20,    -24,    -27,    -23,    -18,    -14,    -7,     4,      11,
+    15,     19,     21,     25,     33,     39,     41,     45,     47,
+    50,     56,     58,     57,     59,     59,     55,     50,     47,
+    39,     34,     30,     24,     18,     11,     8,      3,      0,
+    -3,     -8,     -14,    -15,    -13,    -13,    -12,    -14,    -17,
+    -17,    -12,    -10,    -4,     -7,     -12,    -10,    -14,    -17,
+    -17,    -19,    -25,    -28,    -27,    -29,    -30,    -31,    -35,
+    -38,    -43,    -47,    -51,    -52,    -50,    -49,    -48,    -47,
+    -45,    -39,    -32,    -30,    -31,    -35,    -35,    -31,    -24,
+    -17,    -12,    -11,    -14,    -15,    -17,    -16,    -9,     -5,
+    -3,     -1,     0,      1,      0,      3,      12,     21,     26,
+    33,     35,     38,     45,     50,     53,     53,     54,     58,
+    61,     64,     69,     67,     66,     64,     58,     54,     51,
+    46,     44,     45,     41,     35,     31,     27,     25,     27,
+    25,     20,     13,     12,     16,     17,     17,     12,     7,
+    3,      2,      -2,     -4,     -8,     -14,    -19,    -25,    -29,
+    -38,    -49,    -60,    -69,    -73,    -71,    -74,    -82,    -89,
+    -98,    -103,   -104,   -103,   -99,    -98,    -98,    -98,    -99,
+    -97,    -94,    -91,    -85,    -82,    -78,    -74,    -74,    -71,
+    -68,    -61,    -54,    -52,    -47,    -41,    -36,    -32,    -21,
+    -12,    -3,     11,     26,     36,     44,     48,     55,     64,
+    77,     92,     100,    108,    117,    120,    122,    128,    130,
+    129,    130,    127,    124,    122,    121,    118,    114,    110,
+    102,    92,     85,     80,     77,     68,     55,     46,     39,
+    36,     34,     31,     27,     15,     5,      -1,     -5,     -11,
+    -20,    -29,    -37,    -43,    -46,    -47,    -54,    -61,    -65,
+    -74,    -82,    -84,    -91,    -94,    -96,    -104,   -109,   -111,
+    -111,   -112,   -113,   -111,   -112,   -110,   -104,   -99,    -96,
+    -93,    -89,    -87,    -81,    -71,    -63,    -54,    -45,    -43,
+    -37,    -30,    -24,    -17,    -12,    -8,     -2,     2,      15,
+    23,     28,     35,     41,     42,     44,     52,     58,     66,
+    74,     78,     80,     82,     85,     88,     90,     92,     92,
+    88,     87,     87,     79,     73,     69,     64,     62,     55,
+    50,     45,     41,     36,     29,     24,     20,     16,     12,
+    8,      5,      2,      1,      1,      0,      1,      -4,     -4,
+    -4,     -4,     -1,     1,      2,      1,      -3,     -6,     -1,
+    5,      6,      7,      8,      4,      2,      0,      -2,     -3,
+    0,      -3,     -4,     -3,     -4,     -5,     -8,     -15,    -20,
+    -25,    -28,    -32,    -37,    -38,    -39,    -43,    -48,    -55,
+    -62,    -69,    -75,    -75,    -78,    -81,    -83,    -89,    -89,
+    -92,    -91,    -91,    -89,    -83,    -81,    -74,    -66,    -63,
+    -54,    -45,    -39,    -31,    -23,    -15,    -4,     6,      14,
+    23,     29,     35,     41,     45,     49,     55,     61,     69,
+    75,     75,     76,     75,     74,     74,     73,     74,     72,
+    69,     69,     65,     62,     57,     52,     44,     35,     33,
+    29,     24,     14,     7,      3,      -4,     -12,    -17,    -20,
+    -22,    -27,    -32,    -34,    -39,    -42,    -43,    -42,    -43,
+    -40,    -38,    -36,    -36,    -37,    -36,    -33,    -31,    -27,
+    -24,    -23,    -22,    -17,    -11,    -7,     -7,     -7,     -3,
+    5,      13,     19,     25,     27,     25,     27,     35,     40,
+    40,     41,     45,     47,     50,     54,     52,     50,     45,
+    43,     44,     40,     34,     28,     24,     18,     11,     6,
+    -2,     -9,     -14,    -21,    -27,    -35,    -39,    -43,    -50,
+    -57,    -62,    -66,    -68,    -71,    -72,    -73,    -74,    -76,
+    -76,    -77,    -75,    -75,    -74,    -67,    -61,    -55,    -49,
+    -45,    -40,    -30,    -21,    -11,    -4,     4,      13,     23,
+    34,     44,     52,     59,     65,     70,     77,     84,     87,
+    88,     90,     91,     90,     89,     85,     80,     75,     72,
+    71,     64,     56,     48,     41,     34,     27,     21,     12,
+    1,      -11,    -19,    -28,    -33,    -39,    -46,    -50,    -53,
+    -58,    -63,    -66,    -71,    -73,    -76,    -76,    -74,    -73,
+    -71,    -67,    -65,    -62,    -60,    -55,    -51,    -45,    -39,
+    -35,    -31,    -27,    -20,    -13,    -6,     -3,     1,      8,
+    12,     18,     24,     26,     30,     35,     38,     44,     47,
+    47,     51,     53,     52,     53,     52,     50,     51,     49,
+    50,     51,     50,     48,     48,     45,     43,     42,     37,
+    34,     31,     31,     30,     26,     24,     21,     15,     12,
+    11,     7,      4,      1,      -3,     -5,     -7,     -9,     -15,
+    -21,    -26,    -28,    -31,    -35,    -39,    -46,    -48,    -49,
+    -53,    -58,    -63,    -67,    -69,    -71,    -72,    -74,    -75,
+    -77,    -77,    -73,    -72,    -69,    -65,    -60,    -55,    -50,
+    -47,    -43,    -38,    -30,    -25,    -20,    -12,    -4,     4,
+    9,      16,     20,     24,     28,     35,     43,     50,     58,
+    61,     65,     72,     74,     74,     76,     79,     78,     76,
+    78,     76,     76,     74,     70,     64,     59,     52,     46,
+    41,     33,     26,     19,     12,     5,      -2,     -8,     -15,
+    -20,    -26,    -31,    -37,    -39,    -41,    -44,    -44,    -47,
+    -51,    -52,    -52,    -48,    -45,    -46,    -48,    -45,    -42,
+    -40,    -36,    -32,    -27,    -24,    -22,    -18,    -16,    -11,
+    -10,    -5,     0,      3,      8,      11,     16,     18,     21,
+    23,     25,     26,     27,     28,     30,     31,     31,     30,
+    29,     27,     26,     23,     19,     17,     13,     10,     6,
+    0,      -2,     -5,     -10,    -12,    -15,    -19,    -23,    -26,
+    -29,    -30,    -30,    -32,    -33,    -34,    -35,    -34,    -31,
+    -29,    -29,    -28,    -28,    -23,    -19,    -17,    -12,    -12,
+    -10,    -5,     -2,     3,      7,      10,     13,     14,     19,
+    22,     26,     31,     34,     34,     35,     36,     39,     43,
+    45,     47,     47,     48,     49,     51,     48,     47,     50,
+    45,     41,     41,     38,     34,     34,     30,     23,     17,
+    11,     7,      4,      -4,     -9,     -15,    -23,    -28,    -32,
+    -35,    -39,    -45,    -46,    -49,    -53,    -52,    -53,    -55,
+    -56,    -56,    -55,    -54,    -53,    -53,    -51,    -47,    -44,
+    -42,    -40,    -37,    -33,    -28,    -25,    -23,    -18,    -15,
+    -8,     -6,     -2,     3,      8,      15,     18,     23,     26,
+    27,     32,     36,     36,     36,     39,     38,     38,     40,
+    39,     35,     31,     29,     25,     23,     19,     15,     11,
+    7,      5,      3,      1,      -1,     -6,     -8,     -7,     -10,
+    -9,     -10,    -11,    -10,    -7,     -6,     -8,     -6,     -5,
+    -4,     1,      2,      4,      7,      7,      9,      11,     11,
+    9,      9,      10,     11,     13,     17,     15,     15,     15,
+    17,     19,     17,     17,     17,     15,     15,     13,     11,
+    12,     8,      7,      5,      3,      0,      -4,     -4,     -6,
+    -9,     -12,    -14,    -15,    -15,    -16,    -20,    -19,    -20,
+    -20,    -20,    -18,    -18,    -21,    -22,    -21,    -21,    -23,
+    -20,    -20,    -23,    -24,    -23,    -25,    -25,    -25,    -25,
+    -26,    -24,    -23,    -23,    -23,    -23,    -22,    -19,    -18,
+    -15,    -14,    -10,    -8,     -4,     -1,     1,      3,      6,
+    8,      9,      14,     19,     22,     24,     26,     29,     32,
+    31,     34,     39,     42,     42,     46,     49,     50,     50,
+    52,     53,     52,     49,     49,     48,     48,     46,     45,
+    40,     34,     30,     25,     21,     17,     13,     10,     6,
+    2,      -4,     -9,     -12,    -15,    -18,    -21,    -26,    -28,
+    -31,    -32,    -33,    -35,    -35,    -38,    -37,    -36,    -34,
+    -35,    -35,    -33,    -33,    -34,    -30,    -26,    -27,    -25,
+    -23,    -22,    -18,    -15,    -16,    -12,    -9,     -9,     -6,
+    -1,     2,      3,      5,      8,      7,      9,      12,     15,
+    17,     18,     18,     19,     18,     20,     19,     18,     21,
+    20,     19,     18,     16,     15,     15,     15,     14,     12,
+    9,      9,      10,     8,      6,      4,      2,      1,      -1,
+    -3,     -1,     -3,     -2,     -4,     -5,     -5,     -8,     -8,
+    -10,    -10,    -8,     -8,     -8,     -7,     -8,     -8,     -8,
+    -9,     -11,    -12,    -11,    -9,     -7,     -8,     -8,     -8,
+    -10,    -8,     -7,     -8,     -7,     -6,     -7,     -5,     -3,
+    -3,     -3,     -3,     -2,     0,      3,      3,      5,      7,
+    10,     11,     10,     10,     12,     13,     16,     16,     16,
+    17,     15,     16,     17,     16,     14,     16,     13,     11,
+    11,     9,      9,      6,      4,      4,      3,      0,      -2,
+    -4,     -7,     -7,     -7,     -13,    -15,    -13,    -14,    -16,
+    -15,    -15,    -17,    -16,    -16,    -18,    -19,    -19,    -20,
+    -19,    -16,    -15,    -13,    -12,    -10,    -7,     -6,     -4,
+    -4,     -2,     0,      2,      6,      8,      10,     12,     14,
+    15,     14,     13,     13,     13,     15,     15,     17,     17,
+    17,     18,     17,     16,     15,     15,     14,     11,     9,
+    8,      8,      9,      8,      5,      5,      3,      -1,     -1,
+    -4,     -5,     -7,     -8,     -8,     -8,     -9,     -10,    -8,
+    -11,    -12,    -12,    -12,    -12,    -13,    -11,    -11,    -9,
+    -8,     -7,     -8,     -7,     -6,     -7,     -6,     -5,     -4,
+    -4,     -2,     -2,     -3,     -2,     -2,     -3,     0,      -1,
+    -3,     1,      1,      2,      4,      3,      5,      6,      3,
+    3,      4,      3,      3,      4,      5,      4,      6,      7,
+    7,      7,      6,      3,      3,      5,      3,      3,      6,
+    6,      7,      6,      4,      5,      2,      1,      1,      0,
+    0,      2,      1,      1,      1,      -1,     -2,     -3,     -5,
+    -4,     -5,     -4,     -4,     -6,     -4,     -4,     -4,     -5,
+    -6,     -5,     -6,     -5,     -4,     -5,     -4,     -3,     -4,
+    0,      2,      2,      2,      2,      2,      2,      3,      3,
+    5,      6,      6,      5,      6,      7,      6,      8,      6,
+    5,      5,      5,      6,      6,      6,      5,      5,      2,
+    2,      1,      2,      0,      -1,     -1,     -1,     -1,     0,
+    -1,     -4,     -6,     -8,     -8,     -9,     -8,     -7,     -6,
+    -5,     -5,     -6,     -3,     -4,     -5,     -4,     -7,     -6,
+    -4,     -2,     -1,     -1,     1,      1,      1,      1,      1,
+    2,      2,      1,      3,      4,      4,      6,      6,      6,
+    6,      4,      4,      4,      4,      3,      2,      2,      2,
+    2,      1,      1,      1,      0,      1,      1,      0,      -2,
+    -2,     -3,     -3,     -3,     -3,     -5,     -4,     -3,     -5,
+    -5,     -3,     -5,     -4,     -4,     -2,     -2,     -2,     -1,
+    -3,     -2,     -2,     -1,     -3,     -2,     -1,     -2,     -2,
+    -2,     0,      0,      0,      0,      0,      1,      0,      0,
+    1,      2,      3,      3,      3,      4,      5,      4,      3,
+    4,      5,      5,      7,      7,      6,      9,      8,      6,
+    7,      8,      6,      5,      7,      8,      8,      8,      7,
+    6,      5,      4,      4,      4,      5,      4,      2,      1,
+    2,      1,      0,      -2,     -3,     -2,     -4,     -6,     -6,
+    -7,     -7,     -8,     -9,     -9,     -9,     -9,     -9,     -9,
+    -9,     -10,    -10,    -10,    -8,     -7,     -8,     -6,     -5,
+    -4,     -3,     -5,     -2,     -2,     -2,     -1,     -1,     0,
+    1,      1,      2,      3,      2,      4,      3,      3,      5,
+    3,      3,      5,      4,      5,      6,      5,      4,      5,
+    3,      2,      2,      3,      4,      4,      4,      4,      4,
+    3,      4,      4,      4,      3,      2,      2,      2,      2,
+    2,      2,      2,      2,      1,      1,      1,      2,      1,
+    1,      2,      1,      1,      2,      1,      1,      1,      -1,
+    0,      1,      0,      -1,     1,      -1,     -1,     -1,     -2,
+    -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -2,
+    -1,     0,      -1,     -1,     1,      1,      2,      0,      -1,
+    0,      -1,     -1,     0,      0,      1,      2,      2,      2,
+    1,      1,      0,      0,      0,      0,      1,      1,      0,
+    0,      0,      0,      0,      -1,     -2,     -1,     -3,     -4,
+    -4,     -4,     -4,     -4,     -4,     -4,     -3,     -3,     -5,
+    -6,     -4,     -2,     -2,     -1,     -1,     -1,     -2,     1,
+    -1,     1,      0,      0,      1,      1,      1,      1,      2,
+    1,      2,      2,      3,      3,      3,      3,      4,      5,
+    5,      5,      5,      5,      5,      5,      5,      6,      6,
+    5,      5,      5,      6,      6,      5,      3,      6,      5,
+    4,      5,      3,      2,      2,      2,      2,      1,      1,
+    2,      0,      -1,     0,      -1,     -1,     -1,     -1,     -1,
+    -1,     -1,     -3,     -3,     -3,     -3,     -4,     -4,     -5,
+    -6,     -6,     -6,     -6,     -6,     -6,     -5,     -5,     -6,
+    -5,     -4,     -4,     -4,     -4,     -2,     -2,     -2,     -1,
+    -2,     0,      1,      0,      1,      3,      4,      4,      4,
+    4,      4,      4,      5,      4,      4,      4,      5,      7,
+    5,      4,      4,      4,      4,      3,      2,      2,      2,
+    2,      2,      0,      1,      1,      0,      1,      1,      -1,
+    0,      -1,     -2,     -1,     -3,     -4,     -4,     -3,     -5,
+    -5,     -5,     -5,     -5,     -5,     -4,     -3,     -3,     -2,
+    -3,     -2,     -2,     -5,     -3,     -3,     -3,     -2,     0,
+    1,      1,      1,      1,      1,      1,      1,      1,      3,
+    3,      4,      4,      4,      4,      5,      5,      2,      3,
+    4,      3,      5,      4,      3,      4,      3,      3,      5,
+    5,      3,      4,      2,      1,      1,      3,      4,      3,
+    1,      3,      2,      1,      2,      1,      0,      1,      0,
+    1,      0,      1,      1,      1,      1,      0,      -1,     0,
+    0,      -1,     -1,     -2,     -1,     -1,     -2,     0,      -1,
+    -2,     -1,     -1,     -2,     -2,     -1,     -3,     -3,     -3,
+    -3,     -3,     -4,     -3,     -5,     -6,     -4,     -4,     -5,
+    -4,     -3,     -5,     -6,     -4,     -5,     -6,     -4,     -3,
+    -5,     -4,     -3,     -4,     -3,     -2,     -2,     -2,     0,
+    0,      1,      1,      0,      0,      0,      1,      1,      3,
+    3,      3,      4,      3,      3,      3,      3,      3,      3,
+    3,      3,      3,      3,      3,      3,      3,      3,      3,
+    1,      1,      1,      1,      1,      1,      1,      0,      0,
+    0,      1,      -2,     -1,     1,      0,      -1,     -2,     -2,
+    0,      1,      0,      1,      1,      1,      1,      0,      0,
+    1,      0,      0,      2,      1,      0,      1,      1,      1,
+    1,      3,      3,      3,      4,      3,      3,      4,      2,
+    2,      2,      2,      2,      2,      2,      1,      2,      2,
+    2,      2,      -1,     -1,     -1,     -1,     -1,     -1,     -1,
+    -1,     -1,     -3,     -3,     -3,     -5,     -4,     -5,     -5,
+    -5,     -5,     -7,     -7,     -7,     -8,     -7,     -8,     -7,
+    -8,     -8,     -7,     -8,     -8,     -8,     -8,     -7,     -6,
+    -6,     -6,     -7,     -6,     -6,     -5,     -5,     -3,     -2,
+    -2,     -1,     0,      -1,     0,      1,      2,      2,      3,
+    3,      3,      6,      7,      7,      7,      8,      9,      8,
+    10,     10,     9,      10,     11,     9,      10,     12,     11,
+    10,     9,      9,      9,      9,      10,     9,      6,      6,
+    5,      5,      6,      3,      1,      1,      0,      1,      0,
+    0,      1,      -1,     -2,     -2,     -1,     -3,     -3,     -2,
+    -4,     -4,     -3,     -2,     -4,     -4,     -4,     -5,     -3,
+    -3,     -5,     -3,     -3,     -5,     -4,     -2,     -2,     -3,
+    -3,     -1,     0,      -1,     0,      0,      0,      -2,     -1,
+    0,      -1,     -2,     -2,     -2,     -2,     -1,     -3,     -2,
+    -3,     -4,     -3,     -3,     -3,     -3,     -3,     -3,     -3,
+    -2,     -4,     -6,     -5,     -3,     -2,     -4,     -3,     -2,
+    -4,     -4,     -4,     -3,     -4,     -5,     -4,     -5,     -3,
+    -2,     -5,     -2,     -4,     -4,     -3,     -2,     -1,     -1,
+    -1,     0,      2,      2,      1,      1,      3,      3,      3,
+    3,      4,      4,      5,      6,      5,      5,      6,      7,
+    7,      7,      8,      8,      7,      9,      9,      9,      9,
+    10,     9,      9,      9,      9,      9,      9,      8,      7,
+    9,      9,      6,      7,      5,      2,      3,      2,      1,
+    1,      0,      -2,     -2,     -2,     -3,     -3,     -2,     -2,
+    -4,     -5,     -4,     -4,     -4,     -4,     -5,     -4,     -4,
+    -5,     -4,     -5,     -4,     -5,     -6,     -4,     -4,     -5,
+    -5,     -5,     -5,     -6,     -4,     -4,     -4,     -3,     -2,
+    -3,     -3,     -2,     -2,     -1,     -2,     -3,     -1,     0,
+    -1,     0,      0,      0,      0,      1,      0,      0,      0,
+    0,      -1,     1,      1,      1,      0,      -2,     -2,     -3,
+    -3,     -4,     -4,     -6,     -7,     -5,     -4,     -5,     -5,
+    -4,     -6,     -8,     -7,     -6,     -5,     -5,     -5,     -4,
+    -4,     -5,     -4,     -3,     -3,     0,      0,      -2,     -1,
+    0,      0,      1,      1,      2,      2,      2,      2,      2,
+    4,      5,      5,      5,      6,      7,      7,      9,      10,
+    10,     10,     12,     12,     13,     14,     14,     14,     15,
+    15,     15,     15,     15,     15,     14,     15,     15,     12,
+    13,     13,     12,     10,     11,     11,     11,     10,     8,
+    6,      5,      7,      6,      6,      4,      3,      4,      5,
+    3,      2,      2,      1,      1,      2,      3,      1,      0,
+    0,      1,      0,      -2,     -1,     -2,     -3,     -3,     -3,
+    -3,     -4,     -6,     -8,     -9,     -9,     -10,    -12,    -14,
+    -15,    -18,    -21,    -21,    -21,    -21,    -22,    -24,    -26,
+    -26,    -27,    -27,    -28,    -26,    -25,    -26,    -28,    -27,
+    -24,    -23,    -23,    -24,    -21,    -17,    -17,    -15,    -12,
+    -12,    -12,    -12,    -9,     -7,     -6,     -5,     -3,     -3,
+    -2,     0,      0,      1,      3,      7,      6,      4,      6,
+    7,      8,      11,     10,     10,     13,     15,     14,     13,
+    18,     20,     18,     19,     21,     23,     24,     23,     22,
+    24,     26,     26,     26,     27,     25,     23,     25,     27,
+    28,     28,     28,     23,     19,     23,     24,     20,     20,
+    21,     15,     13,     15,     16,     14,     11,     8,      7,
+    8,      11,     11,     6,      4,      8,      7,      6,      7,
+    6,      4,      7,      13,     12,     7,      8,      8,      4,
+    1,      1,      1,      2,      -4,     -12,    -18,    -24,    -25,
+    -25,    -32,    -41,    -55,    -59,    -61,    -75,    -87,    -96,
+    -109,   -122,   -133,   -141,   -148,   -157,   -168,   -180,   -191,
+    -198,   -202,   -207,   -206,   -207,   -211,   -211,   -208,   -203,
+    -189,   -171,   -153,   -132,   -114,   -96,    -75,    -54,    -30,
+    -5,     19,     43,     61,     77,     93,     106,    123,    143,
+    161,    182,    198,    202,    201,    209,    229,    242,    240,
+    235,    239,    249,    258,    255,    242,    233,    245,    268,
+    278,    256,    223,    223,    253,    263,    235,    198,    178,
+    188,    215,    230,    200,    143,    113,    128,    158,    158,
+    128,    99,     90,     82,     70,     56,     32,     7,      14,
+    46,     36,     -23,    -71,    -76,    -54,    -36,    -39,    -74,
+    -118,   -134,   -122,   -101,   -104,   -129,   -164,   -174,   -129,
+    -86,    -109,   -184,   -219,   -191,   -147,   -141,   -183,   -249,
+    -290,   -269,   -236,   -266,   -346,   -394,   -366,   -325,   -353,
+    -431,   -472,   -406,   -313,   -316,   -398,   -449,   -401,   -287,
+    -194,   -164,   -193,   -245,   -212,   -55,    75,     67,     26,
+    67,     165,    237,    269,    293,    319,    333,    368,    414,
+    432,    463,    488,    448,    404,    391,    377,    361,    365,
+    376,    308,    197,    150,    129,    73,     53,     91,     43,
+    -107,   -165,   -54,    1,      -148,   -312,   -273,   -125,   -62,
+    -128,   -258,   -294,   -141,   70,     57,     -217,   -378,   -145,
+    198,    289,    169,    -47,    -219,   -101,   264,    458,    217,
+    -163,   -199,   13,     121,    101,    -51,    -293,   -319,   -62,
+    24,     -274,   -474,   -296,   -170,   -336,   -422,   -285,   -248,
+    -302,   -130,   98,     -11,    -257,   -146,   184,    278,    264,
+    331,    192,    -35,    235,    805,    830,    315,    82,     322,
+    503,    522,    619,    557,    242,    163,    399,    507,    489,
+    618,    602,    156,    -164,   112,    476,    406,    94,     -154,
+    -242,   -132,   56,     5,      -325,   -566,   -527,   -478,   -624,
+    -692,   -561,   -551,   -744,   -836,   -671,   -520,   -626,   -736,
+    -647,   -581,   -639,   -687,   -702,   -739,   -665,   -383,   -236,
+    -414,   -513,   -321,   -114,   -43,    32,     65,     -98,    -236,
+    34,     608,    924,    680,    218,    56,     329,    847,    1214,
+    1006,   341,    11,     340,    667,    553,    353,    355,    415,
+    416,    364,    257,    108,    6,      113,    293,    233,    46,
+    4,      25,     -10,    -12,    55,     40,     -65,    -56,    -26,
+    -101,   -61,    143,    229,    78,     -161,   -210,   103,    424,
+    377,    86,     -274,   -491,   -328,   -37,    60,     128,    188,
+    -105,   -625,   -823,   -464,   138,    389,    111,    -343,   -526,
+    -306,   13,     205,    250,    -35,    -554,   -764,   -498,   -42,
+    167,    -210,   -639,   -448,   -101,   -110,   -171,   -74,    -39,
+    47,     424,    616,    324,    98,     367,    853,    942,    416,
+    -184,   -130,   339,    472,    369,    239,    -165,   -418,   101,
+    742,    659,    325,    365,    476,    233,    -14,    270,    785,
+    719,    -29,    -533,   -220,   237,    305,    179,    -190,   -644,
+    -610,   -380,   -526,   -601,   -237,   48,     -36,    -124,   -49,
+    -6,     23,     117,    55,     -199,   -428,   -512,   -338,   -238,
+    -424,   -323,   -135,   -464,   -657,   -189,   100,    -379,   -964,
+    -893,   -346,   -64,    -322,   -650,   -480,   32,     238,    201,
+    386,    616,    611,    400,    195,    357,    842,    1051,   832,
+    712,    829,    1070,   1307,   1081,   551,    363,    544,    623,
+    239,    -374,   -609,   -230,   375,    486,    -52,    -446,   -270,
+    181,    645,    601,    -135,   -654,   -256,   567,    840,    380,
+    -54,    18,     334,    386,    21,     -214,   83,     243,    -316,
+    -937,   -1074,  -1006,  -896,   -674,   -424,   -331,   -354,   -380,
+    -481,   -392,   80,     358,    171,    -170,   -624,   -796,   -130,
+    706,    803,    381,    152,    367,    620,    685,    655,    347,
+    36,     180,    417,    412,    358,    288,    189,    150,    16,
+    -240,   -428,   -428,   -266,   -335,   -819,   -1150,  -946,   -587,
+    -437,   -580,   -961,   -1218,  -1065,  -704,   -431,   -350,   -315,
+    -214,   -162,   -81,    26,     -8,     -52,    -117,   -226,   -40,
+    285,    241,    -2,     -69,    57,     207,    81,     -144,   -69,
+    65,     84,     49,     -168,   -248,   126,    502,    472,    192,
+    120,    442,    667,    551,    512,    634,    814,    1014,   1098,
+    1156,   1112,   974,    1144,   1330,   1099,   825,    847,    877,
+    555,    2,      -243,   -102,   -196,   -471,   -377,   -235,   -439,
+    -622,   -547,   -470,   -495,   -431,   -197,   -21,    21,     -9,
+    -246,   -438,   -238,   -31,    0,      96,     137,    -25,    -211,
+    -181,   -149,   -350,   -368,   -33,    21,     -308,   -323,   32,
+    379,    605,    531,    85,     -374,   -367,   9,      277,    147,
+    -356,   -698,   -494,   -140,   -126,   -354,   -549,   -673,   -642,
+    -428,   -269,   -273,   -246,   -216,   -349,   -323,   -16,    32,
+    -387,   -742,   -662,   -434,   -223,   41,     140,    -58,    -227,
+    -80,    93,     20,     -166,   -360,   -536,   -555,   -305,   -33,
+    -23,    -86,    -75,    -9,     82,     -1,     -156,   24,     532,
+    916,    956,    835,    901,    1127,   1279,   1417,   1435,   1144,
+    822,    862,    1214,   1352,   1001,   611,    539,    532,    369,
+    189,    170,    308,    465,    430,    232,    64,     14,     51,
+    -37,    -244,   -321,   -276,   -144,   57,     77,     -215,   -467,
+    -335,   -186,   -245,   -133,   -81,    -588,   -1130,  -959,   -520,
+    -631,   -1122,  -1270,  -971,   -873,   -1118,  -1157,  -1078,  -1296,
+    -1365,  -1010,  -873,   -1138,  -1061,  -379,   89,     51,     177,
+    372,    185,    -14,    63,     197,    125,    -123,   -60,    243,
+    195,    88,     201,    115,    -63,    -12,    -79,    -492,   -751,
+    -489,   49,     163,    -293,   -424,   -52,    229,    302,    212,
+    217,    315,    70,     -207,   -210,   -173,   129,    619,    556,
+    213,    181,    170,    112,    167,    322,    451,    206,    -136,
+    58,     426,    526,    524,    394,    387,    568,    481,    297,
+    164,    8,      263,    664,    777,    943,    989,    934,    1283,
+    1495,   1153,   861,    738,    582,    614,    692,    655,    629,
+    432,    127,    -119,   -338,   -313,   -138,   -204,   -561,   -994,
+    -1168,  -948,   -700,   -658,   -788,   -1053,  -1027,  -684,   -566,
+    -528,   -355,   -335,   -323,   -28,    206,    87,     56,     387,
+    585,    296,    24,     261,    492,    248,    -132,   -469,   -674,
+    -502,   -235,   -255,   -517,   -847,   -1038,  -965,   -707,   -630,
+    -767,   -639,   -298,   -193,   -290,   -310,   -118,   74,     -77,
+    -337,   -324,   -120,   187,    323,    -72,    -552,   -454,   -14,
+    29,     -427,   -803,   -735,   -586,   -762,   -918,   -783,   -649,
+    -723,   -857,   -786,   -626,   -591,   -417,   -83,    167,    262,
+    49,     -161,   157,    842,    1298,   1356,   1206,   1041,   1194,
+    1461,   1323,   1070,   1221,   1687,   2051,   2002,   1673,   1464,
+    1550,   1851,   1907,   1531,   1327,   1399,   1342,   1287,   1264,
+    1152,   1030,   878,    716,    601,    454,    264,    264,    352,
+    151,    -193,   -296,   -161,   -93,    -215,   -423,   -617,   -668,
+    -547,   -416,   -464,   -807,   -1175,  -1174,  -1045,  -1076,  -1023,
+    -829,   -710,   -745,   -1069,  -1443,  -1417,  -1099,  -939,   -1165,
+    -1307,  -1056,  -843,   -638,   -304,   -190,   -334,   -578,   -770,
+    -705,   -675,   -947,   -957,   -565,   -437,   -617,   -843,   -1015,
+    -813,   -489,   -584,   -904,   -1054,  -797,   -229,   -26,    -208,
+    -66,    398,    710,    644,    390,    413,    726,    992,    1204,
+    1337,   1234,   1104,   1038,   1001,   1043,   982,    847,    885,
+    1024,   1098,   1138,   1108,   1038,   966,    885,    882,    878,
+    929,    1005,   944,    1008,   1284,   1415,   1289,   1007,   760,
+    812,    947,    806,    455,    111,    -72,    -290,   -611,   -626,
+    -559,   -765,   -1034,  -1375,  -1632,  -1565,  -1588,  -1728,  -1585,
+    -1477,  -1547,  -1533,  -1371,  -1103,  -995,   -1090,  -1102,  -947,
+    -686,   -403,   -295,   -250,   -107,   -86,    -171,   -150,   12,
+    234,    283,    185,    300,    461,    393,    382,    434,    378,
+    306,    202,    195,    253,    -8,     -307,   -105,   264,    342,
+    212,    34,     -57,    78,     435,    571,    180,    -165,   -51,
+    339,    705,    683,    464,    658,    958,    825,    579,    465,
+    390,    241,    61,     202,    429,    128,    -122,   241,    406,
+    39,     -167,   -60,    15,     -31,    -68,    146,    402,    344,
+    227,    208,    87,     -25,    -31,    -66,    -169,   -249,   -87,
+    75,     -181,   -438,   -249,   49,     87,     -40,    -16,    53,
+    -86,    -74,    98,     78,     110,    169,    -84,    -323,   -251,
+    -102,   -172,   -513,   -750,   -675,   -568,   -587,   -583,   -523,
+    -450,   -302,   -245,   -356,   -480,   -590,   -495,   -183,   -105,
+    -191,   -215,   -308,   -206,   39,     4,      -77,    -21,    74,
+    186,    218,    356,    611,    489,    83,     13,     246,    371,
+    348,    240,    61,     -66,    -107,   -170,   -205,   -74,    200,
+    277,    45,     -11,    180,    263,    100,    -74,    102,    246,
+    6,      -154,   -162,   -197,   -128,   -189,   -227,   -49,    -238,
+    -490,   -333,   -188,   1,      215,    150,    144,    128,    -33,
+    187,    532,    676,    911,    773,    283,    351,    673,    620,
+    349,    105,    205,    425,    325,    295,    372,    340,    511,
+    628,    394,    224,    187,    91,     -174,   -556,   -482,   -37,
+    -9,     -226,   -382,   -568,   -466,   -208,   -241,   -426,   -656,
+    -814,   -788,   -902,   -1065,  -946,   -860,   -896,   -831,   -744,
+    -672,   -685,   -743,   -723,   -783,   -813,   -570,   -341,   -239,
+    -57,    137,    348,    576,    593,    454,    429,    503,    449,
+    238,    173,    350,    423,    419,    530,    501,    272,    156,
+    207,    295,    404,    568,    676,    419,    30,     113,    463,
+    550,    473,    349,    126,    33,     144,    207,    193,    267,
+    304,    81,     -252,   -401,   -368,   -347,   -404,   -452,   -408,
+    -272,   -40,    234,    281,    48,     -72,    -18,    54,     208,
+    309,    285,    245,    164,    38,     -20,    148,    430,    563,
+    655,    679,    453,    300,    319,    219,    25,     -15,    54,
+    -117,   -444,   -431,   -135,   -147,   -468,   -667,   -722,   -593,
+    -301,   -217,   -428,   -642,   -598,   -400,   -422,   -602,   -628,
+    -554,   -509,   -501,   -541,   -488,   -250,   -129,   -284,   -441,
+    -358,   -161,   -82,    4,      134,    157,    290,    516,    582,
+    702,    859,    871,    858,    759,    615,    616,    754,    839,
+    725,    464,    259,    187,    127,    150,    280,    238,    92,
+    78,     5,      -86,    6,      67,     -14,    -92,    -143,   -211,
+    -89,    213,    300,    107,    -91,    -154,   -153,   -238,   -355,
+    -314,   -227,   -168,   -92,    -142,   -219,   -156,   -47,    53,
+    -15,    -195,   -161,   -186,   -382,   -395,   -297,   -238,   -240,
+    -390,   -502,   -336,   -97,    -29,    -116,   -290,   -289,   -67,
+    74,     112,    119,    182,    358,    382,    315,    341,    290,
+    218,    190,    101,    -51,    -168,   -132,   -41,    -39,    -15,
+    104,    186,    151,    68,     89,     154,    67,     10,     143,
+    120,    -185,   -382,   -365,   -263,   -145,   -111,   -159,   -190,
+    -53,    151,    177,    179,    384,    553,    502,    490,    572,
+    600,    573,    442,    119,    -212,   -260,   -166,   -318,   -506,
+    -413,   -279,   -285,   -354,   -390,   -278,   -142,   -85,    -18,
+    -19,    -121,   -143,   -32,    88,     118,    42,     -96,    -187,
+    -167,   -113,   -172,   -270,   -256,   -178,   -192,   -249,   -128,
+    103,    132,    -47,    -147,   -104,   -56,    -9,     45,     35,
+    109,    315,    381,    326,    336,    457,    667,    786,    675,
+    489,    460,    569,    595,    470,    303,    272,    448,    620,
+    545,    226,    -92,    -128,   91,     172,    -98,    -385,   -378,
+    -264,   -284,   -362,   -314,   -148,   -72,    -198,   -350,   -353,
+    -344,   -389,   -353,   -292,   -327,   -413,   -473,   -519,   -588,
+    -577,   -546,   -737,   -989,   -1030,  -997,   -1010,  -861,   -683,
+    -731,   -690,   -419,   -197,   -47,    112,    167,    74,     41,
+    176,    309,    438,    671,    781,    793,    868,    904,    991,
+    1099,   987,    812,    816,    869,    766,    605,    633,    728,
+    592,    424,    460,    405,    170,    75,     30,     -105,   -58,
+    63,     -58,    -242,   -359,   -415,   -255,   -44,    -127,   -266,
+    -191,   -187,   -296,   -273,   -260,   -341,   -345,   -324,   -384,
+    -467,   -421,   -233,   -125,   -227,   -341,   -256,   -168,   -217,
+    -249,   -302,   -447,   -425,   -274,   -289,   -299,   -229,   -275,
+    -272,   -103,   -57,    -117,   -106,   -162,   -256,   -184,   -31,
+    51,     69,     31,     -19,    72,     256,    318,    331,    254,
+    28,     -7,     121,    48,     -64,    58,     183,    152,    161,
+    201,    167,    190,    287,    278,    157,    56,     103,    332,
+    460,    299,    166,    238,    308,    374,    508,    509,    373,
+    275,    270,    298,    229,    185,    192,    23,     -160,   -80,
+    67,     31,     -170,   -378,   -384,   -330,   -500,   -648,   -615,
+    -686,   -716,   -510,   -510,   -771,   -752,   -475,   -434,   -556,
+    -480,   -403,   -515,   -464,   -255,   -177,   -105,   29,     95,
+    152,    210,    190,    180,    279,    408,    325,    225,    462,
+    607,    537,    759,    1022,   973,    945,    964,    846,    818,
+    952,    907,    584,    313,    302,    428,    533,    479,    260,
+    178,    262,    185,    18,     -77,    -263,   -370,   -208,   -240,
+    -589,   -739,   -572,   -444,   -405,   -357,   -475,   -738,   -771,
+    -542,   -441,   -529,   -651,   -803,   -823,   -556,   -285,   -227,
+    -233,   -202,   -168,   -110,   -78,    -220,   -302,   -56,    129,
+    -60,    -149,   54,     130,    169,    324,    231,    24,     89,
+    269,    320,    262,    231,    225,    138,    67,     153,    310,
+    399,    269,    -21,    -197,   -183,   -59,    144,    234,    -13,
+    -274,   -168,   32,     -37,    -277,   -417,   -441,   -416,   -324,
+    -312,   -467,   -540,   -373,   -166,   -161,   -297,   -365,   -341,
+    -246,   -69,    81,     99,     -3,     11,     305,    540,    449,
+    394,    586,    667,    606,    685,    665,    425,    410,    585,
+    509,    360,    424,    538,    583,    482,    250,    159,    310,
+    423,    217,    -131,   -280,   -204,   -51,    -12,    -204,   -338,
+    -232,   -143,   -201,   -306,   -374,   -336,   -229,   -257,   -453,
+    -576,   -497,   -379,   -326,   -302,   -372,   -504,   -453,   -229,
+    -133,   -226,   -328,   -326,   -261,   -151,   -6,     97,     143,
+    164,    143,    138,    267,    433,    500,    470,    297,    143,
+    279,    504,    556,    475,    333,    233,    225,    228,    198,
+    128,    24,     -17,    4,      -55,    -187,   -251,   -213,   -119,
+    -94,    -214,   -357,   -349,   -246,   -195,   -183,   -261,   -440,
+    -533,   -476,   -341,   -213,   -170,   -220,   -299,   -220,   -8,
+    51,     -11,    19,     172,    292,    189,    9,      -6,     102,
+    238,    384,    477,    448,    353,    304,    354,    473,    543,
+    400,    229,    275,    380,    425,    415,    371,    398,    460,
+    377,    202,    154,    199,    110,    -123,   -365,   -524,   -524,
+    -360,   -134,   -47,    -182,   -348,   -453,   -542,   -503,   -376,
+    -398,   -521,   -595,   -621,   -560,   -439,   -284,   -115,   -80,
+    -123,   -57,    28,     -15,    -60,    -9,     47,     119,    203,
+    288,    435,    571,    635,    706,    750,    627,    436,    345,
+    330,    398,    460,    368,    213,    127,    140,    215,    202,
+    58,     -99,    -244,   -387,   -470,   -527,   -637,   -754,   -791,
+    -768,   -742,   -739,   -735,   -704,   -649,   -552,   -479,   -491,
+    -494,   -454,   -433,   -422,   -398,   -315,   -115,   75,     175,
+    244,    307,    360,    398,    460,    532,    529,    446,    422,
+    497,    541,    504,    541,    702,    803,    744,    645,    621,
+    727,    877,    873,    734,    593,    513,    523,    516,    412,
+    336,    334,    274,    199,    163,    123,    125,    117,    107,
+    140,    72,     -73,    -114,   -68,    -15,    13,     -122,   -338,
+    -367,   -325,   -386,   -497,   -608,   -634,   -546,   -477,   -427,
+    -377,   -412,   -464,   -436,   -343,   -276,   -327,   -390,   -313,
+    -149,   -17,    2,      -93,    -146,   -104,   -76,    -87,    -131,
+    -224,   -280,   -194,   -46,    12,     -76,    -189,   -151,   18,
+    160,    200,    99,     -81,    -149,   -95,    -31,    -6,     -45,
+    -93,    -97,    -71,    0,      73,     34,     -82,    -129,   -102,
+    -84,    -96,    -107,   -69,    -5,     6,      18,     48,     35,
+    27,     32,     -4,     -71,    -30,    119,    205,    266,    352,
+    325,    237,    282,    352,    358,    342,    265,    203,    200,
+    159,    120,    159,    195,    185,    133,    37,     20,     152,
+    312,    363,    316,    255,    251,    259,    211,    160,    86,
+    -4,     -30,    -79,    -154,   -213,   -271,   -243,   -146,   -147,
+    -211,   -283,   -319,   -219,   -157,   -207,   -237,   -252,   -245,
+    -136,   0,      42,     -22,    -108,   -82,    34,     130,    179,
+    152,    98,     105,    110,    116,    180,    175,    66,     -9,
+    -9,     36,     82,     75,     12,     -39,    -14,    23,     1,
+    12,     31,     -61,    -155,   -184,   -158,   -86,    -60,    -67,
+    -63,    -84,    -100,   -81,    -115,   -171,   -157,   -150,   -179,
+    -191,   -209,   -245,   -217,   -128,   -54,    -42,    -73,    -100,
+    -88,    -10,    104,    199,    249,    227,    201,    204,    151,
+    83,     75,     87,     84,     67,     34,     18,     44,     110,
+    218,    275,    232,    190,    209,    263,    294,    256,    174,
+    108,    37,     -54,    -110,   -129,   -179,   -293,   -360,   -339,
+    -282,   -190,   -135,   -188,   -239,   -234,   -227,   -182,   -127,
+    -89,    -51,    -73,    -136,   -151,   -85,    0,      72,     129,
+    122,    65,     44,     103,    202,    272,    252,    170,    148,
+    167,    152,    130,    127,    79,     14,     70,     157,    142,
+    109,    70,     -25,    -57,    -6,     46,     98,     135,    135,
+    82,     16,     10,     68,     87,     -20,    -120,   -116,   -98,
+    -102,   -129,   -204,   -271,   -282,   -252,   -216,   -215,   -221,
+    -156,   -70,    -66,    -120,   -156,   -146,   -126,   -84,    -15,
+    -21,    -76,    -8,     131,    146,    86,     42,     12,     44,
+    110,    169,    171,    91,     68,     173,    262,    248,    160,
+    36,     -90,    -109,   -24,    -12,    -57,    -64,    -78,    -89,
+    -75,    -87,    -101,   -82,    -72,    -76,    -81,    -63,    -34,
+    -4,     61,     87,     46,     23,     -1,     -8,     40,     63,
+    46,     45,     39,     14,     -11,    -25,    -16,    36,     78,
+    85,     110,    120,    132,    189,    228,    217,    154,    89,
+    57,     14,     -14,    -6,     0,      13,     8,      -50,    -68,
+    -60,    -107,   -140,   -126,   -122,   -151,   -147,   -118,   -105,
+    -85,    -83,    -100,   -139,   -195,   -194,   -168,   -183,   -173,
+    -148,   -166,   -168,   -123,   -59,    -11,    20,     64,     98,
+    80,     58,     83,     111,    143,    176,    171,    152,    146,
+    165,    174,    143,    93,     30,     5,      21,     42,     35,
+    -37,    -94,    -61,    -12,    -5,     -27,    -58,    -85,    -81,
+    -11,    79,     65,     -14,    -17,    15,     -4,     -2,     39,
+    20,     -29,    -19,    3,      -11,    -39,    -62,    -43,    -34,
+    -60,    -77,    -119,   -163,   -128,   -5,     87,     73,     51,
+    116,    189,    217,    240,    234,    177,    192,    295,    344,
+    313,    263,    236,    240,    230,    179,    99,     19,     -25,
+    -16,    -9,     -35,    -66,    -53,    -16,    -40,    -70,    -81,
+    -102,   -86,    -87,    -156,   -225,   -228,   -145,   -52,    -22,
+    -57,    -171,   -255,   -247,   -208,   -165,   -187,   -242,   -275,
+    -261,   -168,   -75,    -13,    8,      -62,    -125,   -136,   -133,
+    -81,    -11,    -17,    -80,    -115,   -103,   -27,    71,     134,
+    137,    44,     -48,    -24,    69,     156,    194,    175,    112,
+    55,     54,     101,    148,    157,    142,    100,    44,     27,
+    63,     106,    107,    89,     67,     37,     17,     30,     63,
+    69,     61,     21,     -37,    -55,    -72,    -53,    -26,    -53,
+    -77,    -87,    -109,   -119,   -80,    -36,    -29,    -38,    -48,
+    -57,    -65,    -16,    52,     83,     83,     24,     -27,    -14,
+    9,      27,     52,     50,     45,     90,     132,    117,    75,
+    16,     -1,     60,     95,     55,     25,     26,     20,     61,
+    119,    89,     1,      -61,    -68,    -46,    -36,    -40,    -39,
+    -49,    -58,    -16,    30,     13,     -12,    18,     35,     6,
+    3,      30,     22,     25,     52,     32,     12,     9,      -5,
+    -16,    -25,    -33,    -38,    -44,    -76,    -118,   -118,   -96,
+    -54,    -3,     9,      -31,    -82,    -84,    -35,    18,     25,
+    -26,    -72,    -48,    8,      25,     8,      -20,    -66,    -105,
+    -102,   -80,    -73,    -79,    -80,    -70,    -59,    -55,    -82,
+    -113,   -85,    -51,    -59,    -57,    -38,    -13,    -7,     -18,
+    -6,     20,     51,     55,     18,     -8,     -7,     24,     78,
+    119,    137,    135,    139,    153,    144,    155,    179,    166,
+    128,    56,     8,      38,     85,     94,     72,     20,     -32,
+    -9,     25,     17,     -15,    -84,    -123,   -106,   -82,    -62,
+    -60,    -43,    -4,     -12,    -45,    -68,    -108,   -100,   -47,
+    -49,    -64,    -50,    -9,     37,     59,     68,     62,     53,
+    49,     25,     13,     32,     40,     60,     109,    82,     18,
+    10,     -1,     21,     102,    111,    40,     -10,    -9,     20,
+    31,     0,      -51,    -108,   -135,   -89,    -21,    1,      -54,
+    -125,   -129,   -113,   -144,   -205,   -227,   -167,   -118,   -114,
+    -100,   -71,    5,      34,     -51,    -119,   -120,   -72,    10,
+    56,     51,     58,     65,     98,     135,    84,     20,     -3,
+    -1,     57,     135,    137,    90,     88,     107,    102,    45,
+    -4,     9,      48,     95,     99,     65,     42,     44,     78,
+    80,     29,     11,     39,     27,     0,      7,      19,     10,
+    -45,    -99,    -86,    -77,    -74,    -57,    -74,    -84,    -92,
+    -134,   -114,   -65,    -73,    -76,    -96,    -105,   -50,    -31,
+    -17,    17,     9,      18,     62,     75,     55,     63,     76,
+    61,     61,     80,     103,    107,    110,    131,    134,    120,
+    94,     66,     70,     78,     59,     52,     57,     53,     72,
+    76,     31,     -18,    -53,    -57,    -35,    -17,    -9,     -27,
+    -34,    -7,     -17,    -26,    -13,    -60,    -86,    -53,    -42,
+    -36,    -36,    -46,    -13,    19,     -16,    -47,    -15,    11,
+    -9,     -18,    -26,    -24,    14,     8,      -53,    -54,    15,
+    43,     15,     -9,     -5,     5,      -12,    -40,    -57,    -74,
+    -94,    -105,   -91,    -20,    30,     -10,    -50,    -58,    -52,
+    -42,    -47,    -54,    -61,    -83,    -64,    -30,    -3,     31,
+    9,      -35,    -43,    -31,    6,      50,     54,     55,     67,
+    53,     43,     30,     27,     62,     37,     -26,    -52,    -54,
+    -29,    3,      -12,    -23,    11,     26,     23,     31,     57,
+    66,     46,     32,     35,     83,     124,    111,    124,    157,
+    143,    101,    80,     60,     27,     11,     21,     22,     9,
+    -4,     -26,    -41,    -35,    -50,    -103,   -138,   -116,   -90,
+    -89,    -90,    -79,    -74,    -58,    -18,    -12,    -29,    -36,
+    -17,    22,     30,     -1,     -8,     8,      10,     19,     31,
+    36,     38,     41,     28,     -7,     -14,    -6,     -20,    -30,
+    -11,    -2,     -9,     0,      25,     56,     78,     68,     40,
+    34,     47,     50,     40,     37,     26,     28,     53,     61,
+    57,     25,     -35,    -75,    -65,    -48,    -65,    -81,    -67,
+    -53,    -41,    3,      19,     -3,     -9,     -2,     -1,     -24,
+    -36,    -23,    -26,    -29,    -9,     0,      -15,    -17,    -9,
+    12,     50,     45,     14,     19,     37,     24,     9,      16,
+    13,     -16,    -19,    3,      -3,     -12,    -10,    -23,    -43,
+    -47,    -38,    -46,    -44,    -7,     3,      -19,    -13,    -26,
+    -52,    -29,    -19,    -32,    0,      11,     -26,    -24,    -20,
+    -41,    -30,    -24,    -53,    -67,    -26,    23,     20,     9,
+    6,      -8,     3,      16,     7,      3,      -5,     2,      33,
+    53,     72,     94,     86,     69,     96,     118,    95,     91,
+    78,     32,     26,     48,     48,     37,     21,     7,      -6,
+    -8,     8,      1,      -17,    -2,     18,     1,      -28,    -51,
+    -84,    -93,    -74,    -46,    -18,    -19,    -31,    -10,    10,
+    10,     7,      -5,     -30,    -39,    -28,    -9,     10,     17,
+    11,     14,     20,     -1,     2,      18,     7,      15,     40,
+    40,     32,     27,     23,     31,     43,     33,     7,      -3,
+    18,     51,     53,     31,     21,     14,     16,     14,     4,
+    11,     16,     1,      -24,    -38,    -33,    -27,    -50,    -74,
+    -70,    -60,    -54,    -44,    -22,    -22,    -43,    -33,    -16,
+    -35,    -36,    -18,    -27,    -42,    -46,    -36,    -17,    -15,
+    -22,    -21,    -20,    -2,     15,     12,     22,     27,     22,
+    41,     57,     60,     63,     54,     56,     65,     62,     68,
+    58,     34,     53,     70,     58,     60,     51,     33,     41,
+    39,     16,     -3,     -16,    -18,    -15,    -18,    -32,    -76,
+    -85,    -62,    -82,    -87,    -68,    -84,    -75,    -40,    -48,
+    -55,    -45,    -42,    -24,    -14,    -1,     27,     23,     -1,
+    -2,     12,     15,     32,     55,     52,     55,     82,     81,
+    58,     62,     59,     37,     24,     20,     17,     18,     19,
+    15,     14,     5,      -18,    -27,    -20,    -19,    -34,    -39,
+    -29,    -30,    -27,    -27,    -48,    -52,    -54,    -77,    -48,
+    -18,    -36,    -34,    -13,    -21,    -38,    -28,    -15,    -7,
+    -6,     -20,    -18,    2,      4,      -11,    -5,     7,      1,
+    1,      12,     -2,     -17,    7,      15,     2,      15,     34,
+    48,     78,     94,     82,     66,     66,     64,     47,     44,
+    57,     64,     74,     65,     34,     26,     31,     32,     33,
+    18,     5,      -1,     -18,    -22,    -31,    -54,    -37,    -32,
+    -74,    -89,    -77,    -73,    -65,    -72,    -75,    -39,    -21,
+    -31,    -31,    -24,    -19,    -8,     -4,     7,      26,     22,
+    15,     13,     11,     28,     47,     42,     35,     28,     5,
+    18,     55,     55,     45,     44,     18,     9,      18,     -2,
+    -5,     6,      -15,    -16,    -12,    -20,    -4,     4,      -15,
+    -18,    -10,    -5,     -2,     -16,    -24,    -14,    -7,     -14,
+    -33,    -33,    -20,    -17,    -17,    -18,    -30,    -37,    -35,
+    -34,    -13,    -3,     -28,    -28,    -10,    -21,    -17,    -4,
+    -12,    -16,    -20,    -27,    -16,    -8,     -4,     14,     24,
+    11,     17,     30,     27,     14,     7,      28,     30,     22,
+    45,     47,     23,     31,     23,     -5,     10,     17,     -5,
+    2,      15,     9,      20,     29,     11,     -9,     -8,     8,
+    10,     -1,     -14,    -30,    -30,    -8,     -9,     -20,    -17,
+    -17,    -12,    1,      6,      -7,     -18,    -6,     10,     -6,
+    -7,     29,     35,     21,     16,     9,      25,     44,     26,
+    21,     34,     28,     40,     41,     9,      -2,     1,      12,
+    34,     18,     -12,    -10,    -16,    -29,    -24,    -25,    -20,
+    -17,    -35,    -29,    -12,    -29,    -39,    -32,    -30,    -17,
+    -12,    -28,    -20,    -5,     -4,     7,      14,     10,     3,
+    -3,     0,      19,     27,     4,      -21,    -18,    -7,     -4,
+    0,      1,      -6,     -17,    -30,    -24,    -11,    -9,     0,
+    -1,     0,      -3,     -12,    1,      15,     -2,     3,      16,
+    -3,     -8,     7,      3,      13,     32,     23,     10,     -6,
+    -11,    8,      4,      -12,    -9,     3,      12,     -2,     -31,
+    -36,    -33,    -37,    -17,    -5,     -20,    -14,    4,      5,
+    4,      6,      17,     31,     27,     23,     16,     -1,     -4,
+    15,     24,     21,     18,     7,      -7,     -14,    18,     41,
+    25,     14,     13,     2,      5,      12,     8,      15,     10,
+    2,      13,     10,     3,      5,      -1,     0,      11,     10,
+    6,      2,      7,      10,     -4,     -3,     2,      -13,    -4,
+    14,     -4,     -17,    -11,    -4,     8,      3,      -8,     -1,
+    -7,     -20,    -4,     23,     23,     8,      5,      24,     21,
+    -5,     -2,     7,      -9,     -15,    -8,     -6,     6,      2,
+    -26,    -19,    1,      -19,    -31,    -27,    -34,    -41,    -47,
+    -39,    -12,    -12,    -29,    -32,    -41,    -36,    -26,    -36,
+    -35,    -33,    -29,    -1,     5,      -13,    -21,    -21,    -3,
+    12,     1,      -7,     -1,     2,      12,     9,      -1,     15,
+    21,     18,     25,     4,      -13,    5,      12,     16,     33,
+    33,     19,     21,     26,     30,     30,     24,     23,     19,
+    22,     34,     39,     28,     15,     14,     24,     24,     18,
+    12,     10,     4,      8,      28,     29,     2,      -7,     6,
+    8,      10,     2,      -13,    -8,     -2,     0,      12,     13,
+    -1,     3,      21,     26,     24,     17,     11,     15,     19,
+    19,     19,     11,     1,      3,      3,      0,      -5,     -11,
+    -16,    -26,    -18,    3,      -5,     -17,    2,      10,     6,
+    6,      -8,     -11,    4,      -3,     -17,    -10,    -17,    -37,
+    -31,    -17,    -26,    -37,    -42,    -53,    -49,    -34,    -40,
+    -39,    -21,    -17,    -23,    -23,    -25,    -30,    -24,    -13,
+    -10,    -10,    1,      1,      -7,     7,      19,     11,     4,
+    -3,     -8,     1,      6,      7,      25,     22,     -5,     3,
+    20,     7,      -1,     14,     17,     18,     20,     12,     25,
+    41,     23,     19,     37,     39,     21,     17,     23,     17,
+    6,      9,      15,     4,      -15,    -8,     8,      7,      1,
+    -12,    -18,    -14,    -15,    -10,    0,      -3,     3,      13,
+    -8,     -21,    -8,     -26,    -29,    -1,     -9,     -24,    -19,
+    -22,    -24,    -18,    -25,    -27,    -28,    -34,    -26,    -9,
+    -14,    -14,    -8,     -8,     -5,     4,      4,      -10,    -12,
+    -7,     -8,     -10,    -15,    -19,    -10,    -5,     -9,     -9,
+    -19,    -33,    -27,    -14,    -15,    -14,    -16,    -25,    -10,
+    5,      -7,     -11,    2,      3,      7,      17,     28,     33,
+    32,     33,     39,     49,     57,     63,     62,     64,     67,
+    59,     55,     67,     71,     58,     53,     53,     44,     38,
+    44,     51,     51,     45,     35,     34,     46,     55,     48,
+    36,     21,     3,      -5,     2,      7,      0,      -17,    -30,
+    -34,    -48,    -62,    -64,    -66,    -66,    -62,    -79,    -90,
+    -85,    -88,    -88,    -85,    -88,    -103,   -112,   -112,   -102,
+    -99,    -102,   -103,   -110,   -100,   -80,    -60,    -57,    -68,
+    -59,    -45,    -35,    -6,     9,      -3,     2,      32,     45,
+    48,     51,     40,     51,     78,     85,     83,     87,     94,
+    101,    104,    105,    100,    86,     82,     96,     102,    96,
+    85,     68,     63,     65,     55,     50,     46,     28,     32,
+    43,     33,     30,     27,     8,      18,     36,     27,     20,
+    13,     -14,    -19,    8,      12,     0,      -1,     -12,    -24,
+    -20,    -27,    -39,    -39,    -39,    -44,    -38,    -32,    -42,
+    -38,    -33,    -43,    -55,    -57,    -60,    -61,    -56,    -57,
+    -55,    -43,    -46,    -58,    -55,    -50,    -50,    -51,    -48,
+    -46,    -44,    -36,    -26,    -20,    -13,    -11,    -8,     1,
+    5,      0,      8,      21,     31,     42,     39,     43,     56,
+    48,     37,     45,     45,     47,     52,     46,     40,     26,
+    18,     28,     30,     22,     14,     0,      -3,     8,      0,
+    -7,     0,      -10,    -13,    -9,     -13,    -13,    -18,    -33,
+    -32,    -26,    -37,    -41,    -32,    -26,    -30,    -34,    -31,
+    -38,    -40,    -24,    -25,    -29,    -15,    -18,    -23,    -4,
+    2,      -7,     0,      5,      10,     22,     23,     25,     31,
+    33,     37,     38,     39,     43,     46,     41,     44,     46,
+    37,     35,     46,     63,     67,     52,     38,     30,     35,
+    41,     41,     41,     29,     15,     16,     4,      -4,     3,
+    -12,    -18,    -13,    -27,    -39,    -47,    -55,    -44,    -43,
+    -53,    -45,    -36,    -37,    -37,    -38,    -40,    -49,    -57,
+    -41,    -24,    -28,    -31,    -26,    -20,    -15,    -21,    -23,
+    -18,    -19,    -14,    -10,    -11,    1,      -6,     -26,    -14,
+    -1,     -7,     -10,    -11,    -9,     0,      -4,     -9,     3,
+    8,      0,      -2,     1,      16,     20,     7,      9,      10,
+    8,      18,     12,     11,     17,     -6,     -19,    0,      0,
+    -10,    -6,     -12,    -14,    -11,    -9,     -2,     -10,    -19,
+    -9,     -11,    -4,     18,     7,      -3,     9,      17,     23,
+    28,     25,     19,     19,     24,     33,     37,     30,     28,
+    35,     44,     43,     33,     31,     30,     26,     33,     39,
+    35,     31,     27,     19,     23,     24,     19,     13,     0,
+    0,      2,      -7,     -9,     -10,    -13,    -6,     -6,     -23,
+    -28,    -15,    -9,     -20,    -34,    -30,    -15,    -12,    -11,
+    -3,     -4,     -4,     6,      15,     9,      -11,    -20,    3,
+    26,     23,     1,      -16,    -3,     12,     2,      -22,    -36,
+    -35,    -28,    -20,    -13,    -19,    -38,    -43,    -29,    -11,
+    -5,     -15,    -37,    -40,    -9,     12,     -1,     -23,    -30,
+    -16,    12,     21,     -1,     -25,    -21,    4,      34,     55,
+    34,     -12,    -11,    47,     99,     107,    58,     0,      8,
+    78,     148,    151,    56,     -40,    -2,     142,    215,    99,
+    -67,    -64,    76,     153,    99,     -21,    -107,   -92,    -1,
+    106,    107,    -123,   -395,   -334,   60,     274,    -69,    -597,
+    -626,   -126,   238,    18,     -447,   -577,   -312,   -34,    20,
+    -89,    -242,   -332,   -222,   74,     262,    64,     -285,   -232,
+    259,    563,    294,    -138,   -130,   312,    642,    515,    189,
+    57,     187,    415,    538,    467,    277,    109,    134,    334,
+    441,    299,    59,     -7,     128,    228,    146,    -20,    -99,
+    -34,    60,     24,     -108,   -188,   -147,   -57,    -48,    -142,
+    -224,   -210,   -144,   -122,   -175,   -212,   -176,   -150,   -199,
+    -256,   -210,   -100,   -79,    -195,   -298,   -248,   -107,   -48,
+    -110,   -192,   -224,   -189,   -112,   -40,    -31,    -124,   -238,
+    -193,   -3,     87,     -53,    -221,   -165,   48,     132,    -2,
+    -150,   -109,   61,     147,    83,     -20,    -60,    -13,    85,
+    157,    130,    17,     -68,    -10,    147,    217,    116,    -20,
+    -21,    103,    200,    158,    52,     35,     105,    155,    132,
+    81,     74,     110,    114,    74,     48,     68,     100,    77,
+    27,     30,     48,     19,     -15,    7,      63,     53,     -56,
+    -123,   -41,    81,     75,     -61,    -154,   -84,    45,     68,
+    -24,    -105,   -76,    22,     53,     -13,    -63,    -21,    54,
+    59,     -1,     -34,    16,     80,     81,     48,     37,     61,
+    89,     88,     101,    134,    132,    100,    83,     125,    188,
+    173,    101,    95,     172,    214,    149,    68,     94,     181,
+    177,    103,    83,     132,    165,    122,    83,     140,    191,
+    153,    92,     106,    198,    226,    138,    85,     146,    215,
+    187,    110,    77,     115,    146,    115,    91,     96,     78,
+    27,     -3,     42,     102,    71,     -23,    -46,    30,     95,
+    63,     -18,    -25,    77,     174,    138,    13,     -25,    96,
+    218,    181,    34,     -70,    -45,    17,     2,      -67,    -174,
+    -346,   -516,   -553,   -446,   -455,   -789,   -1213,  -1308,  -1046,
+    -878,   -1179,  -1691,  -1839,  -1528,  -1219,  -1292,  -1623,  -1772,
+    -1538,  -1147,  -921,   -951,   -1038,  -929,   -549,   -95,    155,
+    127,    97,     387,    931,    1339,   1380,   1234,   1276,   1661,
+    2102,   2223,   2027,   1848,   1942,   2198,   2295,   2119,   1856,
+    1725,   1745,   1752,   1601,   1335,   1102,   993,    952,    830,
+    570,    286,    139,    133,    85,     -135,   -436,   -638,   -645,
+    -571,   -620,   -835,   -1064,  -1151,  -1069,  -951,   -964,   -1109,
+    -1209,  -1162,  -1044,  -961,   -944,   -977,   -1001,  -912,   -687,
+    -517,   -623,   -887,   -897,   -469,   10,     -35,    -590,   -934,
+    -545,   184,    427,    -53,    -619,   -563,   40,     489,    339,
+    -128,   -306,   -6,     403,    497,    232,    -55,    0,      388,
+    704,    584,    145,    -76,    260,    816,    942,    485,    2,
+    65,     575,    923,    744,    290,    76,     276,    596,    662,
+    419,    134,    92,     280,    434,    344,    88,     -66,    8,
+    151,    126,    -81,    -239,   -176,   -29,    -74,    -351,   -574,
+    -487,   -208,   -132,   -426,   -780,   -797,   -577,   -595,   -978,
+    -1169,  -667,   -36,    -548,   -2285,  -3281,  -1756,  927,    1236,
+    -1911,  -5006,  -4073,  -66,    2017,   -295,   -3701,  -3797,  -892,
+    975,    -165,   -1978,  -1636,  374,    1482,   679,    -567,   -591,
+    706,    2337,   3224,   2743,   1269,   287,    1221,   3597,   5083,
+    4106,   1858,   972,    2334,   4096,   4167,   2806,   1916,   2383,
+    3045,   2508,   1220,   820,    1784,   2669,   1981,   204,    -876,
+    -470,   510,    803,    170,    -787,   -1568,  -1893,  -1598,  -1027,
+    -992,   -1803,  -2610,  -2484,  -1905,  -2113,  -3113,  -3399,  -2267,
+    -1261,  -2007,  -3637,  -3909,  -2340,  -893,   -1158,  -2272,  -2486,
+    -1639,  -915,   -777,   -596,   -91,    196,    85,     210,    875,
+    1373,   1247,   1219,   1958,   2718,   2328,   1196,   1008,   2350,
+    3677,   3269,   1503,   366,    922,    2264,   2810,   1996,   608,
+    -168,   75,     680,    811,    395,    -56,    -318,   -607,   -966,
+    -1108,  -925,   -613,   -368,   -369,   -919,   -1926,  -2460,  -1685,
+    -300,   155,    -611,   -1524,  -2204,  -3227,  -3859,  -2037,  1622,
+    2382,   -2583,  -8448,  -7544,  -84,    4814,   915,    -6423,  -7558,
+    -1746,  2515,   -59,    -4587,  -3858,  1260,   3625,   187,    -4148,
+    -3500,  1542,   5467,   4780,   1256,   -1127,  -403,   2481,   5332,
+    6346,   5014,   2536,   1216,   2467,   5039,   6238,   5070,   3381,
+    3269,   4173,   3905,   2248,   1586,   3299,   5240,   4362,   1004,
+    -1382,  -489,   2113,   3168,   1620,   -742,   -1824,  -1435,  -897,
+    -1058,  -1500,  -1545,  -1398,  -1965,  -3266,  -4136,  -3756,  -2609,
+    -1804,  -1986,  -3087,  -4599,  -5296,  -4051,  -1731,  -781,   -2228,
+    -4092,  -3977,  -2325,  -1353,  -1568,  -1490,  -428,   178,    -672,
+    -1650,  -1058,  749,    2039,   2079,   1540,   897,    310,    572,
+    2266,   4265,   4265,   1869,   -231,   559,    3332,   4752,   3229,
+    768,    101,    1364,   2463,   1984,   819,    411,    723,    675,
+    -162,   -923,   -743,   -32,    185,    -516,   -1653,  -2359,  -2103,
+    -986,   42,     -205,   -1702,  -2870,  -2337,  -809,   -221,   -982,
+    -1544,  -946,   -598,   -2117,  -4291,  -4100,  -857,   1948,   338,
+    -4799,  -7972,  -5403,  173,    2371,   -1063,  -5533,  -5578,  -1777,
+    605,    -985,   -3249,  -2213,  1184,   2691,   560,    -2356,  -2288,
+    1233,   5244,   6441,   4004,   370,    -663,   2555,   7404,   9282,
+    6573,   2612,   1836,   4662,   7467,   7393,   5421,   4262,   4741,
+    5362,   4705,   3163,   2397,   3337,   4887,   4810,   2254,   -749,
+    -1316,  772,    2706,   2016,   -573,   -2552,  -2746,  -2012,  -1647,
+    -1978,  -2579,  -3105,  -3473,  -3911,  -4484,  -4891,  -4795,  -4163,
+    -3543,  -3538,  -4275,  -5356,  -5743,  -4637,  -2614,  -1301,  -1825,
+    -3341,  -4011,  -2937,  -751,   1007,   1245,   235,    -639,   -61,
+    1626,   2864,   2967,   2734,   3013,   3329,   2914,   2312,   2666,
+    3839,   4308,   3162,   1453,   768,    1255,   1887,   2006,   1715,
+    1031,   -297,   -1660,  -1690,  -277,   813,    -30,    -2137,  -3370,
+    -2854,  -1553,  -593,   -413,   -1146,  -2567,  -3440,  -2369,  -205,
+    379,    -1258,  -2315,  -812,   262,    -3205,  -8576,  -7894,  738,
+    7492,   1951,   -11595, -17098, -6934,  7139,   8065,   -4575,  -14199,
+    -8946,  3606,   7504,   -547,   -8242,  -5113,  4406,   8113,   2134,
+    -5040,  -4089,  4157,   10934,  10158,  4167,   -565,   -192,   4428,
+    9765,   12201,  9861,   4512,   1225,   3451,   8483,   10133,  6497,
+    2574,   3333,   6806,   6986,   2487,   -1214,  623,    5416,   6647,
+    2204,   -3289,  -4556,  -1565,  1544,   1525,   -1236,  -4293,  -5695,
+    -5174,  -3995,  -3403,  -3449,  -3750,  -4505,  -6014,  -7296,  -6523,
+    -3849,  -2096,  -3288,  -5722,  -6004,  -3581,  -1497,  -1960,  -3330,
+    -2800,  -434,   964,    -111,   -1739,  -1136,  1736,   4151,   3736,
+    1274,   -451,   469,    3386,   5833,   5898,   3646,   1085,   272,
+    1743,   4061,   5108,   3837,   1490,   246,    967,    1866,   859,
+    -1069,  -974,   1542,   2835,   47,     -4285,  -5068,  -1567,  1781,
+    1223,   -1997,  -4227,  -3747,  -1720,  41,     245,    -1228,  -2972,
+    -2673,  22,     1980,   -930,   -7721,  -11271, -5725,  4974,   8484,
+    -2007,  -16979, -19255, -4670,  11057,  9690,   -6417,  -17537, -10841,
+    4262,   9292,   206,    -9128,  -6224,  4828,   10018,  3699,   -5183,
+    -5121,  4702,   14279,  14466,  5778,   -2633,  -2185,  7036,   16118,
+    16305,  8081,   390,    499,    6580,   11150,  10036,  5704,   2902,
+    3378,   4664,   3786,   863,    -796,   1216,   4609,   4493,   -338,
+    -5670,  -6486,  -2751,  884,    571,    -3095,  -6446,  -6997,  -5770,
+    -5041,  -5016,  -4216,  -2579,  -2468,  -5088,  -8129,  -7964,  -4228,
+    -323,   497,    -1556,  -3653,  -3615,  -1718,  464,    1808,   2386,
+    2832,   3085,   2905,   2676,   3473,   5501,   7094,   6442,   3929,
+    1663,   1436,   3254,   5807,   7100,   5044,   -34,    -4091,  -2992,
+    2149,   5333,   2562,   -3067,  -5877,  -4480,  -2080,  -1793,  -3026,
+    -3838,  -3735,  -3663,  -4472,  -5756,  -5753,  -3576,  -640,   -274,
+    -3965,  -7787,  -6757,  -717,   4380,   3595,   -1553,  -5936,  -8603,
+    -10223, -8952,  -922,   9700,   9355,   -7788,  -25795, -22413, 2268,
+    20887,  12133,  -11291, -20129, -5899,  10236,  8585,   -3645,  -6300,
+    4667,   14216,  9346,   -3593,  -8558,  715,    15085,  21179,  14887,
+    3733,   -2703,  -675,   7170,   15131,  18360,  13959,  4205,   -2825,
+    -656,   7594,   11845,  7182,   319,    -439,   3255,   3213,   -3299,
+    -8972,  -6318,  2300,   7190,   2254,   -9247,  -17334, -15064, -4452,
+    5160,   5127,   -4268,  -14501, -17256, -11145, -1830,  3786,   2984,
+    -2498,  -8101,  -9587,  -5703,  622,    4570,   4035,   1442,   729,
+    2493,   3534,   2433,   2239,   5944,   11438,  12371,  6496,   -211,
+    -156,   7092,   13566,  11979,  3928,   -2545,  -2226,  2713,   6150,
+    5117,   1270,   -1851,  -2859,  -2376,  -1909,  -2364,  -3401,  -4183,
+    -3897,  -2875,  -3205,  -5503,  -7822,  -7501,  -3934,  -942,   -1572,
+    -4262,  -5939,  -4671,  -2353,  -1387,  -1159,  -1270,  -1328,  -606,
+    474,    1044,   -2647,  -11603, -17081, -10374, 5922,   14849,  2056,
+    -22033, -31238, -14612, 11094,  17910,  1778,   -15538, -15417, -2045,
+    6690,   2855,   -2559,  473,    8823,   11423,  3782,   -4649,  -2775,
+    9111,   20847,  21610,  11572,  962,    -1465,  5731,   15559,  20008,
+    16950,  9230,   2204,   114,    3088,   8130,   10523,  7643,   2045,
+    -2107,  -2945,  -2538,  -3593,  -5210,  -4403,  -857,   1328,   -2497,
+    -11667, -18881, -16866, -6286,  3400,   2835,   -7811,  -18322, -19279,
+    -10025, 1525,   6930,   3766,   -4647,  -11401, -9904,  -322,   10100,
+    12428,  5874,   -274,   926,    6762,   9360,   6778,   5904,   10509,
+    15077,  12681,  3846,   -1653,  2460,   11036,  14737,  8967,   -1021,
+    -6168,  -3899,  2328,   6041,   3404,   -2878,  -7672,  -6869,  -1918,
+    801,    -2188,  -7419,  -8083,  -2687,  1898,   -692,   -8121,  -11198,
+    -5642,  2830,   5915,   1120,   -5666,  -8314,  -5770,  118,    4614,
+    4713,   1482,   -2544,  -3331,  -3779,  -8931,  -13840, -10273, 3355,
+    13432,  2906,   -20058, -30890, -17080, 7759,   16047,  2886,   -12525,
+    -15117, -5998,  1614,   2294,   2684,   4610,   6236,   5486,   2514,
+    1346,   1962,   4564,   11022,  17438,  18182,  10179,  -796,   -3019,
+    5456,   15942,  18468,  11176,  2796,   -143,   1670,   3922,   3836,
+    3337,   3330,   1623,   -2609,  -7177,  -7654,  -4250,  -2210,  -3491,
+    -5312,  -4380,  -3103,  -6738,  -13209, -14278, -6529,  3346,   4931,
+    -2861,  -11176, -12097, -5552,  2679,   7102,   6050,   1301,   -3350,
+    -3378,  1785,   7413,   9059,   7013,   5043,   5331,   5197,   3143,
+    1862,   3790,   8037,   10159,  7236,   1450,   -3393,  -3980,  598,
+    6251,   7410,   1502,   -7144,  -10260, -5116,  2386,   4197,   -894,
+    -6255,  -6026,  -1493,  873,    -1639,  -4426,  -2720,  2252,   4206,
+    158,    -4631,  -4466,  537,    4709,   4528,   1691,   -828,   -1394,
+    -455,   756,    2662,   3101,   1730,   -3579,  -12987, -18531, -12998,
+    1944,   11963,  1503,   -19826, -29919, -18138, 2254,   7644,   -1829,
+    -9260,  -6516,  134,    -793,   -5234,  -2336,  6264,   12828,  11829,
+    6589,   3429,   2592,   4795,   11433,  19490,  21681,  13136,  379,
+    -4138,  3585,   14812,  17633,  10124,  623,    -2287,  696,    2273,
+    -926,   -5000,  -4391,  -386,   139,    -4657,  -11003, -13946, -11930,
+    -7460,  -1932,  1277,   -2311,  -10543, -16920, -14512, -4039,  4987,
+    7518,   3175,   -4213,  -7535,  -4747,  3590,   12231,  13419,  8429,
+    2377,   1080,   5563,   8497,   7304,   5331,   5656,   8235,   6997,
+    998,    -3131,  -1857,  3017,   5883,   3744,   -408,   -4503,  -6489,
+    -4796,  -374,   3254,   1651,   -2830,  -5206,  -3690,  -681,   -969,
+    -2819,  -2616,  19,     3379,   2359,   -2476,  -6413,  -6111,  -463,
+    4664,   4106,   -565,   -4801,  -4960,  -1242,  2479,   3706,   2168,
+    -1104,  -3048,  -1563,  1217,   2013,   -5714,  -17921, -21743, -10839,
+    7751,   13091,  -4648,  -26509, -29653, -9872,  10100,  9523,   -4335,
+    -12121, -5509,  4923,   6380,   1839,   -508,   3312,   10704,  14545,
+    12317,  5508,   -243,   2421,   11485,  19096,  18306,  8626,   -1357,
+    -5542,  -1695,  7815,   13549,  10229,  -23,    -8373,  -7496,  -2775,
+    -1016,  -2900,  -4868,  -4103,  -4535,  -6851,  -8099,  -8137,  -6414,
+    -4023,  -1790,  -45,    -1513,  -4791,  -6160,  -4105,  1060,   5970,
+    7099,   3934,   -996,   -2213,  1973,   6975,   7927,   4726,   2474,
+    3951,   5221,   2642,   -2359,  -3579,  1362,   6614,   6282,   116,
+    -5643,  -5733,  -1884,  2107,   3418,   2566,   684,    -2319,  -3803,
+    -2133,  1512,   2943,   475,    -1004,  753,    3095,   1652,   -3074,
+    -4562,  -932,   3815,   4486,   -22,    -4199,  -4666,  -2201,  284,
+    316,    -914,   -2297,  -2441,  -1538,  -435,   909,    626,    -1222,
+    -1534,  -429,   1711,   2386,   -1786,  -10676, -18200, -16272, -3805,
+    9505,   8238,   -9397,  -24577, -22256, -4907,  8659,   5940,   -3701,
+    -6764,  40,     6190,   4239,   208,    238,    7081,   14458,  15143,
+    10726,  3479,   -706,   1700,   9131,   17577,  17708,  7959,   -5009,
+    -11508, -5347,  5635,   10789,  6499,   -3121,  -9303,  -9814,  -6625,
+    -3333,  -3193,  -4349,  -5615,  -6188,  -5123,  -4441,  -4550,  -4074,
+    -2769,  -61,    2441,   2881,   1395,   -578,   -341,   2509,   6034,
+    8202,   6377,   2696,   1272,   2589,   4787,   4611,   2378,   2124,
+    3911,   4872,   2049,   -3374,  -5770,  -2705,  3179,   5905,   2589,
+    -2792,  -5419,  -3176,  1056,   2875,   2483,   1205,   605,    856,
+    1012,   892,    105,    -411,   707,    2924,   4184,   1755,   -2553,
+    -4857,  -3556,  401,    2466,   945,    -2315,  -5556,  -5549,  -2241,
+    534,    601,    -1774,  -3034,  -1962,  -886,   -448,   -720,   -467,
+    864,    760,    -22,    -2546,  -10211, -17121, -15877, -4803,  7993,
+    7254,   -6563,  -18374, -17755, -6143,  3291,   4322,   1822,   416,
+    2788,   5190,   4256,   2627,   2590,   6398,   12709,  15757,  12829,
+    5542,   -667,   167,    7241,   14346,  14826,  6392,   -3516,  -7434,
+    -4607,  1054,   2988,   847,    -1549,  -2641,  -3046,  -5363,  -8256,
+    -9130,  -6906,  -1460,  2260,   1568,   -2911,  -8580,  -9418,  -3675,
+    5021,   10127,  7909,   1478,   -4015,  -3331,  2450,   7291,   7632,
+    2567,   -2022,  -899,   3418,   5544,   1349,   -4117,  -3409,  1758,
+    6000,   3526,   -3975,  -7331,  -3931,  2747,   7037,   4962,   -21,
+    -2902,  -2008,  1306,   4461,   6364,   5956,   3623,   1734,   793,
+    44,     -893,   -1041,  1633,   5264,   4870,   -943,   -7404,  -8611,
+    -4974,  -1192,  185,    -1334,  -3672,  -4910,  -5132,  -4387,  -3532,
+    -3233,  -2430,  -469,   1245,   892,    -969,   -2441,  -2140,  320,
+    4999,   5954,   -4638,  -20056, -24424, -8954,  13558,  16089,  -3145,
+    -20665, -19447, -4802,  4488,   3733,   943,    683,    3109,   6219,
+    9247,   7736,   782,    -1410,  8024,   20877,  20174,  4723,   -7148,
+    -2758,  11240,  17896,  11462,  414,    -6134,  -4913,  113,    2818,
+    98,     -5900,  -8369,  -4446,  924,    1657,   -3389,  -10569, -13223,
+    -7690,  2339,   7741,   1634,   -9014,  -10982, -1172,  9642,   9098,
+    1310,   -2795,  -1040,  2790,   3808,   3559,   3064,   -527,   -3160,
+    -1391,  3120,   5224,   -144,   -6714,  -6416,  -719,   5630,   7253,
+    2735,   -2973,  -4325,  679,    7146,   8220,   4055,   -42,    814,
+    5288,   7658,   6592,   3051,   -746,   -541,   3401,   6030,   1953,
+    -6340,  -8619,  -2689,  4076,   3217,   -4875,  -9612,  -7826,  -4293,
+    -2441,  -4080,  -5740,  -5529,  -3656,  -506,   -1035,  -5787,  -9518,
+    -7034,  2323,   9287,   6495,   -1853,  -6110,  -3281,  -1708,  -8958,
+    -19544, -18870, -2771,  13029,  10762,  -7491,  -21837, -18923, -4183,
+    8733,   12580,  9779,   4597,   738,    1460,   6302,   9711,   8375,
+    8143,   12512,  15808,  11272,  389,    -5554,  161,    11080,  15851,
+    10426,  692,    -6372,  -6808,  -2525,  652,    827,    -219,   -349,
+    -622,   -3328,  -7883,  -11020, -8961,  -3240,  1884,   4155,   1995,
+    -3530,  -7816,  -6444,  -218,   6086,   9279,   7901,   3113,   -2352,
+    -5757,  -3836,  2022,   4572,   894,    -3519,  -3311,  -534,   -618,
+    -3716,  -5515,  -3290,  1495,   4374,   4455,   2961,   -645,   -3247,
+    -656,   5273,   9838,   9751,   5755,   1863,   158,    1457,   4585,
+    6390,   5379,   2894,   2284,   1867,   -2279,  -7051,  -6578,  70,
+    4745,   1660,   -4524,  -8007,  -7088,  -5690,  -5467,  -4178,  -2679,
+    -2218,  -3422,  -4167,  -4313,  -6105,  -6633,  -4202,  864,    5119,
+    4084,   -163,   -5331,  -8699,  -8710,  -7313,  -4649,  -2471,  -1419,
+    -1136,  -3199,  -6428,  -8048,  -4902,  1089,   4681,   5723,   5535,
+    5146,   4006,   2052,   2314,   5274,   8680,   9907,   8776,   6722,
+    2548,   -2403,  -3303,  1224,   7406,   9468,   5089,   -1197,  -4384,
+    -3570,  -298,   1776,   2005,   2041,   1326,   971,    -180,   -2334,
+    -1170,  1913,   4281,   4732,   2874,   1174,   -1341,  -3384,  -2503,
+    368,    4031,   3270,   -986,   -3519,  -5360,  -6004,  -5576,  -3603,
+    208,    708,    -2137,  -4940,  -5349,  -3588,  -2796,  -1399,  1017,
+    3144,   4196,   2483,   828,    338,    919,    3842,   6202,   7189,
+    7499,   6330,   4847,   3252,   2136,   3698,   5845,   5566,   3019,
+    267,    -55,    -1091,  -4220,  -5041,  -3430,  -280,   171,    -4649,
+    -8723,  -9280,  -5975,  -3192,  -3974,  -3912,  -4053,  -3748,  -3570,
+    -5871,  -5499,  -3552,  -1691,  320,    341,    748,    -313,   -3436,
+    -4687,  -3681,  21,     2550,   643,    -2123,  -3254,  -2226,  -1044,
+    -1617,  -1510,  183,    1250,   726,    -1662,  -3388,  -1759,  933,
+    3817,   5242,   3025,   248,    -1339,  -514,   2022,   3410,   3970,
+    3324,   2632,   2603,   2240,   2166,   1271,   487,    1076,   2039,
+    3296,   3836,   3610,   2913,   2718,   4213,   5555,   6023,   4769,
+    2442,   2067,   2173,   1623,   1201,   348,    52,     -124,   -1528,
+    -2834,  -3604,  -3463,  -2357,  -2564,  -3775,  -3801,  -1929,  -465,
+    -2109,  -3743,  -2657,  200,    2580,   954,    -1304,  -95,    1549,
+    2303,   1795,   1633,   3356,   3699,   2361,   792,    1148,   4045,
+    4820,   3851,   3197,   2449,   2704,   1722,   -652,   -1154,  -393,
+    113,    -1010,  -3328,  -4342,  -3939,  -3345,  -3697,  -5115,  -5610,
+    -4202,  -3639,  -5088,  -5351,  -3216,  -862,   -414,   -1839,  -3996,
+    -4831,  -2467,  147,    1055,   1288,   -247,   -2225,  -2233,  -1562,
+    -1278,  -936,   -961,   -935,   -367,   -323,   -459,   -1940,  -3974,
+    -2262,  -13,    2,      -401,   -1825,  -2308,  -1124,  448,    2154,
+    2434,   1300,   -812,   -1337,  1325,   3374,   3466,   2500,   2156,
+    3439,   3549,   2068,   1392,   1986,   3025,   3944,   3898,   3259,
+    4467,   6347,   5356,   2893,   1690,   2072,   4136,   5313,   2776,
+    -236,   -1063,  -794,   524,    802,    -1377,  -2879,  -2167,  -1439,
+    -1595,  -1539,  -1666,  -2495,  -2375,  -1253,  -515,   -187,   -1409,
+    -2847,  -511,   2411,   1761,   492,    -18,    607,    2350,   3288,
+    3505,   2741,   1099,   699,    2017,   3214,   3333,   1567,   33,
+    1260,   1925,   808,    -377,   -2558,  -3781,  -1677,  164,    -580,
+    -1727,  -2619,  -3421,  -3586,  -3957,  -4562,  -3646,  -2285,  -3437,
+    -5293,  -4792,  -4128,  -4012,  -2920,  -2249,  -2439,  -3737,  -5607,
+    -4427,  -1259,  71,     609,    555,    -1039,  -3354,  -5388,  -3760,
+    415,    2513,   2513,   819,    -1436,  -2780,  -2740,  -501,   2727,
+    3936,   1491,   -965,   -766,   -484,   -223,   361,    695,    1771,
+    1130,   -1839,  -1764,  797,    -31,    -2549,  -1790,  2108,   4043,
+    887,    -154,   2411,   2605,   2012,   1977,   3923,   6630,   4176,
+    107,    -311,   1731,   1910,   1011,   3119,   3219,   998,    -1282,
+    -2832,  -1645,  -685,   945,    2574,   2543,   -267,   -5015,  -3819,
+    -342,   1228,   2055,   -619,   -1233,  2069,   2896,   1095,   62,
+    1365,   3366,   4584,   4956,   3323,   -19,    -50,    4024,   5222,
+    3695,   3118,   1933,   1256,   1443,   128,    -119,   2043,   2477,
+    1823,   1324,   30,     -1363,  -3023,  -3074,  -188,   621,    -1775,
+    -2806,  -2961,  -2753,  -4359,  -5350,  -1220,  -116,   -4157,  -4811,
+    -2793,  -1040,  -1957,  -2862,  -1901,  -3192,  -3720,  -2357,  -1727,
+    -387,   -2131,  -5011,  -3650,  -454,   596,    -1298,  -3716,  -3122,
+    496,    136,    -2415,  -1675,  -811,   -837,   140,    -1243,  -187,
+    -1431,  -5320,  -2121,  100,    -467,   2465,   681,    -2093,  1224,
+    1632,   1428,   1776,   648,    2480,   3622,   876,    259,    1403,
+    2139,   3117,   497,    -763,   -170,   279,    1769,   342,    -871,
+    -25,    -1549,  -2290,  290,    1042,   -796,   -4291,  -3895,  159,
+    1264,   -540,   -2328,  -702,   1972,   852,    -2274,  -798,   1126,
+    -579,   -480,   3481,   3833,   1004,   901,    1536,   1809,   3103,
+    2521,   3183,   5220,   1800,   -266,   4663,   4230,   -790,   159,
+    2274,   5114,   4304,   -1998,  344,    4921,   -343,   -2048,  1180,
+    2112,   3109,   -10,    -1818,  552,    -1360,  -2889,  -1302,  -1918,
+    -37,    1406,   -1762,  -3054,  -1446,  -2073,  -4292,  -3214,  1163,
+    2333,   -712,   -2583,  -2058,  -1034,  -600,   -3796,  -2395,  2137,
+    -1122,  -1927,  702,    -2196,  -4374,  -3257,  -1558,  -256,   -728,
+    -395,   -176,   -1529,  -2772,  -1121,  -340,   -1147,  -250,   -4079,
+    -473,   4241,   -2818,  -3523,  3255,   2355,   -2550,  -1082,  1197,
+    2213,   -94,    -237,   3123,   1314,   -1075,  977,    1081,   2045,
+    2966,   -1328,  -1069,  -741,   -524,   -380,   -2766,  -986,   926,
+    -3281,  -1554,  2554,   -3620,  -6394,  -1680,  -321,   2889,   243,
+    -1567,  2276,   -1294,  -525,   2010,   -4883,  -1495,  6778,   2085,
+    -873,   2496,   418,    -1156,  -1179,  1604,   6173,   1190,   -2381,
+    5788,   2431,   -4941,  -242,   1248,   1023,   4426,   3399,   2726,
+    1388,   -922,   595,    392,    1414,   6260,   2673,   -973,   2237,
+    1776,   -2393,  -757,   4158,   2842,   -2327,  505,    1230,   -3623,
+    -917,   336,    -1400,  -1018,  1771,   2696,   -570,   -2435,  886,
+    2309,   -2865,  -1328,  2077,   -1967,  -3486,  -411,   961,    -1661,
+    -1979,  1179,   -493,   -2597,  1995,   284,    -3300,  -2213,  184,
+    312,    -1665,  -641,   -1325,  -1276,  90,     69,     476,    -778,
+    -1099,  853,    1515,   1630,   1188,   -877,   -1751,  702,    2983,
+    -201,   664,    4018,   -352,   -1864,  875,    2367,   813,    -2463,
+    -702,   886,    -2204,  -2216,  399,    -1729,  -2408,  1412,   -2757,
+    -3530,  449,    -2554,  -3910,  906,    697,    -1696,  566,    -1360,
+    -1991,  81,     -1756,  -159,   1180,   -667,   -584,   -359,   183,
+    1943,   -412,   -1747,  1659,   1961,   280,    294,    222,    2000,
+    2076,   829,    -43,    -880,   3353,   3615,   1279,   1746,   -1031,
+    1301,   3477,   -777,   2567,   1215,   -2344,  3556,   561,    -2166,
+    1119,   2377,   -391,   -1825,  -2359,  49,     1764,   391,    -291,
+    325,    1223,   1443,   -624,   -2828,  1381,   2438,   28,     -652,
+    -166,   581,    -2039,  -374,   -20,    -2459,  -1149,  1505,   2008,
+    -1798,  -3848,  -1796,  -2208,  -2224,  -878,   728,    -154,   -534,
+    1061,   538,    -1465,  73,     1147,   82,     -119,   3800,   4797,
+    -873,   784,    1458,   -148,   3180,   1319,   908,    4951,   584,
+    -57,    2394,   -967,   586,    405,    -1601,  3566,   -285,   -3949,
+    -1301,  -1953,  -1223,  -1831,  -3477,  -779,   -389,   -3169,  -1828,
+    -1496,  -1451,  -556,   -3327,  -209,   534,    -4908,  131,    -386,
+    -5232,  1373,   2129,   -1740,  -1957,  -1102,  76,     396,    -1426,
+    -179,   1357,   -3276,  -1420,  3819,   -44,    56,     2777,   -1202,
+    1908,   1410,   2031,   3495,   -2197,  -163,   1565,   239,    2803,
+    480,    -1636,  1180,   616,    1206,   1166,   -1579,  1572,   814,
+    -774,   2310,   740,    -2606,  1234,   -603,   -362,   1562,   -2134,
+    652,    -777,   -2353,  5464,   377,    -2490,  1012,   157,    680,
+    -1389,  -1898,  1135,   -1,     -1730,  1800,   -1466,  -1687,  -1469,
+    -3250,  -1081,  1381,   -81,    -204,   -26,    353,    1941,   174,
+    104,    2009,   1032,   -871,   3280,   3398,   -651,   -154,   3309,
+    1964,   448,    812,    -17,    887,    2405,   3295,   -54,    -2396,
+    1410,   1380,   -1156,  296,    -1706,  -1729,  401,    -970,   -878,
+    -723,   -2285,  1259,   1320,   -1960,  -1039,  -211,   -661,   -763,
+    -1599,  -43,    308,    -1841,  72,     -2075,  -3010,  -497,   506,
+    -377,   247,    1932,   -1788,  -2419,  257,    208,    -2176,  488,
+    2827,   -1720,  -1649,  -619,   520,    1103,   -1231,  -1327,  2162,
+    1535,   -383,   315,    -1488,  -235,   1761,   -27,    -232,   515,
+    127,    -2239,  654,    2871,   -379,   -1274,  2445,   874,    -2444,
+    514,    -206,   -1289,  1314,   1869,   1316,   1878,   -1454,  -982,
+    476,    359,    2084,   -708,   405,    -246,   -1071,  1757,   -866,
+    -2331,  783,    501,    -853,   896,    36,     -2468,  -1138,  1445,
+    -613,   -687,   1999,   -449,   -731,   1478,   384,    -45,    96,
+    1530,   1919,   186,    -94,    1347,   -329,   -348,   1631,   574,
+    1062,   735,    -1652,  675,    244,    1241,   1137,   -2469,  621,
+    45,     -612,   1308,   -2015,  -208,   2392,   -1646,  -67,    77,
+    -1558,  113,    1263,   -236,   -971,   -333,   -733,   -555,   2024,
+    -135,   -3817,  -398,   1696,   -1179,  -1473,  1175,   -166,   618,
+    1132,   -2504,  -575,   146,    -688,   1323,   150,    -2021,  15,
+    1673,   347,    -1535,  -106,   235,    -32,    1167,   -471,   -503,
+    -1260,  416,    -13,    -1082,  1036,   -790,   -1676,  487,    985,
+    77,     57,     -1175,  1146,   2023,   -1706,  -404,   3249,   -739,
+    -979,   3044,   -514,   -168,   2201,   -2863,  1009,   1833,   -2309,
+    1565,   476,    -1698,  1667,   -496,   -2193,  1686,   532,    336,
+    -1095,  -1655,  578,    -909,   -1263,  2569,   -2833,  -1808,  2860,
+    -822,   27,     1098,   -1371,  1585,   -284,   -1074,  2944,   -764,
+    -2871,  2484,   1179,   -1213,  -670,   -1226,  1112,   1837,   -299,
+    -388,   -51,    1,      992,    -723,   -361,   1723,   -1115,  -2012,
+    1261,   -9,     -127,   -510,   -1550,  1448,   957,    -1930,  171,
+    776,    -2104,  14,     764,    -599,   -745,   -438,   -371,   -659,
+    1075,   282,    -3116,  684,    3747,   22,     -2139,  816,    1413,
+    -333,   458,    906,    483,    -1084,  797,    1039,   -467,   -377,
+    1386,   -1182,  610,    1787,   -1354,  -2800,  2638,   424,    -2372,
+    1153,   -51,    -689,   290,    -2199,  818,    3755,   -2674,  -1689,
+    3497,   -507,   -1978,  1729,   1413,   215,    -76,    53,     759,
+    371,    -1529,  1005,   -770,   -685,   1754,   -908,   -653,   1047,
+    -1066,  -784,   -199,   -526,   86,     -1750,  -916,   1839,   580,
+    -1884,  319,    226,    -977,   212,    202,    -741,   -1013,  2057,
+    69,     -2961,  974,    1964,   -512,   -224,   1554,   -79,    -1142,
+    1853,   -71,    1009,   1174,   -718,   2040,   -158,   -1508,  1042,
+    0,      -1219,  1212,   448,    -208,   -47,    -779,   -867,   1924,
+    -254,   -1085,  -221,   -1283,  1543,   -584,   -951,   225,    -1089,
+    -464,   -853,   -615,   1576,   -2313,  -1214,  950,    -2548,  -314,
+    1201,   -1527,  952,    764,    -1915,  528,    169,    -1676,  1742,
+    425,    -2346,  932,    290,    109,    492,    -379,   932,    70,
+    582,    135,    769,    1665,   -1751,  576,    1013,   366,    2339,
+    71,     637,    1500,   576,    111,    494,    765,    1170,   1421,
+    -5,     -892,   2054,   -640,   160,    1426,   -651,   348,    -841,
+    -558,   1563,   277,    -408,   -1468,  482,    -1538,  -2255,  968,
+    -1307,  -454,   1306,   -3085,  -1680,  2624,   -2191,  -1719,  1891,
+    -3826,  -1441,  2736,   -3694,  -266,   1897,   -4468,  841,    2828,
+    -4060,  -318,   2305,   -1662,  528,    3056,   -2429,  -156,   2045,
+    -753,   475,    419,    -597,   1100,   1845,   504,    1067,   -402,
+    -824,   1807,   1192,   459,    200,    1728,   50,     -497,   678,
+    -355,   938,    1239,   -1223,  360,    1251,   -95,    981,    1029,
+    -1940,  260,    1627,   -2387,  3426,   519,    -3141,  1822,   -506,
+    -1471,  1101,   -2137,  1069,   885,    -2618,  1673,   -463,   -1558,
+    1439,   -386,   -1923,  1538,   -1313,  -1735,  540,    -1433,  -915,
+    494,    -839,   -1527,  -1143,  480,    -1081,  27,     1732,   -1285,
+    -1833,  1952,   -667,   -1626,  1819,   -1293,  -1323,  2139,   -376,
+    -1392,  1277,   -1172,  -240,   2907,   -1875,  -238,   2573,   -1068,
+    -471,   2065,   -686,   -1315,  2575,   233,    -1005,  1135,   706,
+    534,    278,    -182,   1091,   -21,    -222,   1413,   -371,   -54,
+    1108,   -103,   382,    -70,    787,    894,    -108,   1308,   1113,
+    -1412,  574,    1140,   -2032,  500,    569,    -1251,  951,    -50,
+    -1398,  772,    -474,   -1536,  1297,   251,    -2321,  109,    -703,
+    -425,   40,     -1354,  -773,   -225,   -1743,  -1839,  1244,   261,
+    -3082,  -424,   1162,   -937,   123,    -322,   -407,   -561,   -331,
+    1369,   -1142,  -1050,  1024,   1116,   -213,   -752,   1521,   -383,
+    -415,   1011,   947,    -713,   743,    1945,   -237,   881,    600,
+    -757,   885,    -835,   756,    2454,   -1985,  699,    1572,   -1652,
+    673,    232,    -42,    1975,   -736,   -270,   1660,   -704,   -96,
+    1264,   -428,   278,    774,    -954,   -1325,  756,    1275,   -594,
+    -353,   204,    -1130,  -782,   -432,   -979,   268,    378,    20,
+    -870,   405,    -357,   -1661,  637,    473,    293,    -314,   -895,
+    3,      -175,   -1016,  -643,   204,    -588,   -1007,  -131,   401,
+    -849,   -476,   271,    320,    -198,   533,    -25,    -1994,  1421,
+    525,    -1611,  1261,   507,    -488,   1093,   361,    -1814,  2230,
+    312,    -196,   3242,   -803,   -962,   1714,   -1479,  1426,   1612,
+    -1953,  1376,   -581,   -669,   1370,   -1251,  426,    1274,   -470,
+    1757,   807,    -589,   1275,   126,    -871,   1025,   -1331,  287,
+    1258,   -1813,  146,    -839,   -1471,  828,    -402,   -281,   1704,
+    -1341,  -231,   939,    -1035,  -472,   -197,   -764,   -380,   -816,
+    -266,   382,    -497,   -1708,  -591,   1119,   -1941,  178,    969,
+    -1656,  685,    1004,   -1114,  -127,   -1473,  -678,   1610,   -1253,
+    277,    1807,   -1642,  -461,   2033,   -1449,  392,    98,     -157,
+    1525,   -860,   2455,   413,    -2159,  2457,   475,    -374,   1532,
+    -981,   843,    973,    324,    1168,   225,    -407,   1487,   681,
+    -680,   1098,   117,    245,    1238,   -223,   1076,   -428,   -466,
+    2593,   -663,   -1225,  1303,   -933,   -561,   1190,   -1071,  -1229,
+    406,    -284,   -13,    198,    -1494,  -637,   352,    -1960,  420,
+    49,     -1472,  -761,   -234,   -2213,  -1750,  -521,   -1554,  -813,
+    662,    -633,   -1388,  -15,    -947,   -391,   -152,   -894,   631,
+    -461,   -885,   633,    -51,    -1063,  218,    1149,   -61,    -274,
+    988,    -140,   7,      1774,   1558,   -623,   755,    1352,   -511,
+    1106,   744,    17,     2640,   -91,    697,    1547,   -1757,  1832,
+    1859,   -206,   1505,   575,    -444,   556,    250,    1786,   792,
+    -125,   -266,   407,    501,    798,    -536,   -1214,  58,     6,
+    354,    -685,   613,    99,     -2022,  -116,   -236,   -182,   263,
+    -824,   -1187,  -142,   -138,   -1228,  -1008,  786,    -1421,  -1127,
+    -269,   -2278,  841,    222,    -2423,  678,    -1153,  -2082,  574,
+    -570,   -729,   180,    -777,   212,    270,    -274,   1077,   -493,
+    118,    804,    -1260,  349,    799,    545,    481,    971,    1099,
+    1146,   -273,   34,     1728,   1128,   411,    758,    308,    -808,
+    950,    1490,   209,    -265,   1154,   -11,    -460,   2644,   -122,
+    -728,   2033,   -1100,  -305,   1774,   -208,   -1567,  -57,    -140,
+    -670,   -454,   -1390,  -80,    978,    -438,   -731,   -684,   344,
+    -458,   -199,   -126,   -1663,  -883,   642,    -1517,  -1144,  -375,
+    -422,   -452,   -1815,  -791,   763,    -1502,  -205,   684,    -1641,
+    448,    1399,   -2160,  804,    1088,   -2214,  1030,   1585,   -1093,
+    -11,    1718,   -360,   -81,    1294,   398,    218,    1225,   644,
+    505,    2090,   -385,   526,    2111,   -303,   -316,   1550,   1323,
+    -459,   881,    1874,   -1256,  1429,   2485,   -1003,  -552,   14,
+    432,    952,    471,    -633,   408,    -358,   140,    554,    -1260,
+    -404,   245,    -2572,  954,    1005,   -1621,  -82,    -175,   -957,
+    112,    106,    -1117,  -819,   -62,    -785,   71,     93,     -1296,
+    -1680,  242,    -956,   -2696,  302,    -204,   -1404,  254,    -558,
+    -201,   -630,   16,     -436,   -1647,  1649,   -1096,  -1267,  2273,
+    -1270,  20,     1749,   -2509,  780,    942,    -1859,  2762,   304,
+    -300,   2617,   -947,   861,    2601,   -1153,  754,    1629,   -681,
+    686,    1443,   -235,   1900,   5,      -565,   1559,   285,    -170,
+    757,    480,    547,    752,    -427,   50,     839,    -95,    -791,
+    -1698,  -291,   -62,    -1730,  524,    1008,   -2176,  -369,   165,
+    -749,   -972,   -287,   889,    -1218,  -1712,  833,    -855,   -995,
+    -14,    -793,   -1815,  605,    -607,   -1890,  769,    -781,   230,
+    1155,   -2000,  876,    1835,   -1617,  9,      1058,   -1232,  859,
+    1486,   -1301,  1595,   501,    -951,   2935,   -921,   -634,   2826,
+    -793,   655,    2660,   -232,   235,    1879,   481,    -51,    804,
+    987,    -360,   -331,   2099,   -302,   -149,   1966,   -1233,  -12,
+    1330,   -2265,  1256,   -116,   -1394,  2937,   -995,   -1572,  2964,
+    -2257,  -2587,  1820,   -2132,  -1609,  778,    -1596,  -486,   560,
+    -1749,  274,    -706,   -1714,  1304,   -360,   -2657,  1833,   -750,
+    -1729,  433,    -1461,  -794,   -1545,  -892,   385,    -891,   -374,
+    1261,   -589,   235,    815,    -773,   -669,   636,    -471,   136,
+    871,    -392,   782,    677,    -472,   1130,   1029,   -1262,  1070,
+    2171,   575,    675,    600,    2104,   1077,   -182,   2621,   -604,
+    -30,    3302,   -1331,  599,    742,    291,    1329,   -551,   1043,
+    1729,   -1754,  1220,   1113,   -2174,  1281,   743,    -2027,  851,
+    -205,   -1576,  214,    -1629,  -605,   -394,   -1508,  -254,   -63,
+    -489,   -847,   -26,    -997,   -1065,  -120,   -376,   -1283,  -1393,
+    83,     -212,   -1610,  419,    -1120,  -590,   395,    -1210,  -21,
+    -273,   -622,   899,    -196,   -1059,  1130,   616,    -529,   -166,
+    794,    22,     -216,   862,    664,    -390,   980,    228,    789,
+    182,    402,    2149,   -1133,  799,    2637,   -799,   176,    1306,
+    905,    -93,    677,    338,    121,    483,    297,    339,    347,
+    249,    731,    40,     66,     112,    -889,   -128,   582,    -1191,
+    -67,    -1364,  -233,   488,    -1734,  -634,   1517,   -1657,  -1015,
+    594,    -1422,  1396,   -1357,  -1617,  1254,   -1596,  -941,   789,
+    -1860,  -77,    245,    -327,   569,    -723,   104,    905,    -543,
+    -918,   1387,   -42,    -440,   619,    68,     45,     1364,   -880,
+    19,     1491,   -561,   1174,   1403,   -1411,  1351,   1222,   -612,
+    864,    877,    -658,   382,    864,    -552,   1286,   309,    -105,
+    1083,   -170,   -289,   1049,   -248,   -537,   625,    -48,    337,
+    -385,   532,    -315,   -1398,  588,    -628,   -1192,  649,    -806,
+    -170,   541,    -2267,  1052,   274,    -1970,  833,    253,    -1345,
+    -290,   -120,   -959,   -94,    -189,   -1397,  -136,   -155,   -654,
+    207,    -706,   617,    415,    -1962,  1169,   670,    -1132,  319,
+    297,    -589,   100,    510,    -620,   610,    -153,   -15,    1327,
+    -99,    229,    281,    169,    1015,   -106,   1197,   577,    -698,
+    577,    931,    -964,   1605,   505,    -1713,  2369,   115,    -1585,
+    1839,   664,    -1411,  867,    620,    329,    491,    -1119,  420,
+    266,    -1708,  499,    -69,    -1037,  795,    -321,   -959,   32,
+    235,    -1748,  295,    -249,   -230,   485,    -1185,  -97,    489,
+    -2036,  711,    405,    -2800,  593,    434,    -1038,  536,    347,
+    -570,   705,    -806,   -290,   818,    -999,   53,     1585,   -756,
+    -657,   1180,   115,    -364,   217,    -226,   1033,   347,    -20,
+    611,    658,    590,    -128,   -451,   1676,   -660,   -21,    805,
+    -880,   1481,   412,    -1534,  1522,   221,    -132,   662,    -407,
+    613,    1132,   -551,   -187,   1184,   -577,   -444,   953,    -1034,
+    -472,   461,    -865,   -99,    637,    -572,   300,    450,    -591,
+    137,    404,    -972,   306,    -524,   -1167,  433,    124,    -1326,
+    -368,   -305,   -917,   452,    -626,   -695,   656,    258,    -1401,
+    270,    446,    -1045,  636,    -357,   -1072,  913,    512,    -1732,
+    489,    952,    -747,   58,     673,    -453,   1125,   -488,   46,
+    1723,   -1244,  417,    1803,   -1215,  623,    659,    -560,   676,
+    -9,     92,     701,    1100,   -623,   142,    283,    -512,   547,
+    576,    -525,   -155,   1143,   -1286,  -329,   1959,   -1302,  -459,
+    1188,   -1199,  1020,   -118,   -1303,  956,    -905,   -647,   595,
+    -356,   -1354,  -74,    750,    -791,   -335,   56,     -862,   -36,
+    276,    -279,   46,     -485,   -181,   196,    -584,   -238,   259,
+    -314,   -77,    383,    509,    -386,   -180,   859,    -542,   955,
+    372,    -362,   1458,   113,    -106,   1495,   -534,   63,     1295,
+    -505,   846,    983,    -1097,  1764,   320,    -185,   1061,   -525,
+    115,    217,    -328,   326,    312,    374,    179,    -683,   485,
+    -1286,  147,    -583,   -979,   888,    -504,   -1235,  715,    -1050,
+    -1111,  848,    -828,   -1043,  -115,   -327,   22,     -451,   -1008,
+    98,     -262,   -545,   -363,   -48,    -257,   -731,   878,    96,
+    -1186,  426,    359,    -1101,  1074,   -267,   521,    -375,   -166,
+    1398,   -994,   780,    550,    124,    -298,   581,    236,    305,
+    -111,   396,    741,    -10,    662,    155,    271,    563,    65,
+    -318,   812,    -483,   843,    75,     -714,   1152,   -26,    -190,
+    -97,    533,    -111,   -564,   724,    -24,    -820,   835,    -473,
+    -632,   154,    -104,   -932,   919,    -606,   -619,   496,    -310,
+    -271,   -360,   120,    -630,   126,    65,     -931,   548,    -207,
+    -455,   410,    -282,   -931,   944,    -354,   69,     412,    -661,
+    1068,   -969,   -443,   1894,   -1281,  -442,   2003,   -1640,  713,
+    852,    -1344,  1338,   -457,   243,    498,    -697,   -129,   993,
+    -388,   -76,    1039,   -768,   492,    -104,   -58,    951,    -854,
+    181,    1093,   -1111,  491,    544,    -1061,  118,    586,    -477,
+    -411,   392,    233,    91,     -908,   532,    218,    -1176,  670,
+    -74,    -674,   696,    -801,   194,    592,    -1790,  762,    -564,
+    -791,   595,    -145,   -727,   228,    434,    -246,   -232,   -169,
+    281,    -324,   289,    -120,   -270,   -49,    282,    250,    -56,
+    -405,   507,    27,     -1060,  1329,   -203,   -204,   1677,   -767,
+    -313,   1272,   -968,   717,    183,    -1652,  2157,   -75,    -1906,
+    2590,   -428,   -1614,  2564,   -1511,  -240,   1421,   -1911,  1420,
+    396,    -1397,  1691,   -694,   -1500,  1942,   -823,   -784,   841,
+    -635,   759,    -447,   351,    44,     -946,   227,    441,    -564,
+    155,    -719,   182,    509,    -320,   -300,   205,    -662,   726,
+    469,    -1240,  191,    664,    -269,   -152,   -18,    214,    -149,
+    -257,   347,    76,     -79,    -384,   874,    -387,   -269,   892,
+    -783,   537,    46,     27,     251,    -332,   133,    377,    -522,
+    232,    626,    -362,   -499,   1112,   -342,   -522,   362,    -187,
+    547,    -384,   -155,   517,    -551,   227,    651,    -825,   -88,
+    579,    -758,   -40,    456,    -774,   542,    -164,   -482,   968,
+    -1000,  -394,   1094,   -885,   431,    74,     -348,   403,    -959,
+    831,    -465,   -330,   762,    -717,   -645,   1342,   -499,   -416,
+    944,    -417,   -438,   737,    -368,   -42,    740,    -1234,  689,
+    29,     -106,   619,    -824,   -10,    1047,   -824,   146,    -59,
+    210,    163,    -43,    522,    -352,   213,    460,    -1049,  599,
+    308,    -843,   632,    223,    -504,   296,    530,    -931,   751,
+    -176,   -524,   379,    236,    -626,   66,     662,    -575,   191,
+    -175,   -619,   660,    -424,   -217,   704,    -498,   200,    62,
+    -543,   280,    91,     -378,   54,     168,    -554,   670,    -215,
+    -1097,  1805,   -1015,  -617,   1642,   -1560,  727,    61,     7,
+    -48,    -659,   1308,   -752,   -613,   914,    160,    -469,   164,
+    -167,   274,    326,    -667,   497,    333,    -757,   1252,   -481,
+    -1257,  2019,   -949,   -719,   1676,   -1078,  250,    323,    -1100,
+    1550,   145,    -1697,  972,    522,    -966,   374,    -365,   846,
+    -276,   -756,   629,    -278,   302,    -151,   -243,   -363,   841,
+    -7,     -1092,  476,    45,     201,    -378,   -456,   1113,   -926,
+    97,     178,    -240,   326,    -597,   472,    -10,    -190,   394,
+    -501,   -259,   307,    133,    240,    -433,   -192,   472,    -190,
+    12,     398,    -191,   -605,   1295,   -576,   -154,   474,    -661,
+    866,    -968,   172,    887,    -736,   36,     259,    -201,   265,
+    460,    -859,   622,    102,    -690,   776,    -80,    -745,   919,
+    140,    -750,   224,    134,    -236,   -196,   456,    409,    -1069,
+    600,    239,    -306,   -383,   541,    -213,   -323,   -121,   700,
+    -735,   179,    222,    -613,   653,    -711,   -81,    592,    -694,
+    117,    703,    -772,   -264,   644,    -117,   -422,   276,    64,
+    -355,   -430,   800,    -74,    -619,   1207,   -1057,  4,      960,
+    -1219,  977,    -78,    -1186,  1536,   267,    -1388,  1144,   -90,
+    -1052,  1889,   -1255,  -387,   1815,   -1763,  1037,   421,    -1003,
+    767,    -24,    -277,   -54,    759,    -285,   -1015,  1422,   -581,
+    -121,   547,    -687,   288,    440,    -626,   -623,   1261,   -248,
+    -1133,  1204,   -714,   382,    219,    -851,   240,    -161,   672,
+    -261,   -855,   1043,   -599,   111,    -362,   225,    641,    -913,
+    -122,   1075,   -1165,  432,    131,    -803,   978,    33,     -1291,
+    992,    224,    -1054,  789,    -121,   -215,   262,    -11,    89,
+    -174,   365,    -240,   114,    406,    -813,   291,    233,    158,
+    -377,   194,    216,    -477,   635,    -228,   -512,   599,    23,
+    -273,   71,     258,    10,     -155,   -198,   354,    61,     -749,
+    768,    -19,    -709,   596,    97,     -276,   164,    69,     -144,
+    -20,    529,    -897,   188,    480,    -703,   836,    -874,   259,
+    917,    -1044,  -7,     566,    -97,    -439,   256,    -466,   998,
+    -360,   -1134,  1619,   -762,   -752,   1446,   -707,   -177,   652,
+    -899,   579,    253,    -410,   146,    -262,   275,    353,    -610,
+    52,     671,    -862,   419,    -140,   273,    247,    -1062,  1005,
+    -175,   -497,   772,    -431,   -101,   450,    -598,   266,    428,
+    -842,   477,    -11,    -554,   642,    17,     -787,   544,    445,
+    -625,   -205,   796,    -222,   -733,   764,    -572,   423,    166,
+    -994,   931,    -228,   -303,   362,    -214,   104,    448,    -1091,
+    722,    570,    -1311,  773,    259,    -648,   477,    193,    -682,
+    302,    459,    -464,   -383,   1120,   -561,   -564,   1083,   -372,
+    -354,   864,    -586,   -200,   502,    -331,   27,     446,    -657,
+    281,    571,    -888,   502,    251,    -423,   116,    277,    -263,
+    118,    -170,   168,    367,    -723,   202,    438,    -793,   451,
+    -30,    -292,   202,    38,     -188,   -66,    221,    -90,    -105,
+    7,      346,    -578,   337,    247,    -371,   -14,    22,     36,
+    151,    -322,   -244,   692,    -556,   -5,     550,    -560,   200,
+    161,    -347,   191,    258,    -520,   441,    -212,   -215,   584,
+    -428,   -251,   213,    90,     -187,   109,    138,    -211,   -17,
+    191,    111,    -259,   161,    -141,   232,    -175,   0,      154,
+    -369,   539,    -171,   -438,   484,    43,     -375,   -37,    249,
+    196,    -328,   -106,   541,    -531,   103,    240,    -191,   186,
+    -363,   40,     585,    -573,   258,    170,    -593,   515,    -261,
+    -86,    407,    -339,   164,    -214,   -34,    464,    -377,   -206,
+    336,    -230,   239,    -85,    -69,    322,    -503,   322,    142,
+    -748,   867,    -160,   -753,   836,    -249,   -362,   750,    -374,
+    -222,   448,    -82,    -246,   399,    13,     -429,   441,    -47,
+    -127,   -29,    337,    -502,   318,    132,    -457,   498,    -145,
+    -91,    98,     208,    -179,   54,     62,     -260,   237,    96,
+    -161,   32,     -150,   93,     21,     -31,    74,     75,     -322,
+    164,    168,    -191,   119,    -121,   -66,    -195,   296,    -128,
+    -251,   381,    -56,    -338,   281,    -29,    -472,   664,    -301,
+    -275,   423,    -285,   -77,    258,    -82,    -139,   160,    -54,
+    -26,    27,     75,     -49,    -196,   305,    -131,   -187,   262,
+    -37,    -206,   65,     269,    -240,   -144,   261,    54,     -338,
+    355,    3,      -503,   535,    -253,   -210,   433,    -290,   -33,
+    381,    -546,   173,    252,    -364,   271,    -329,   166,    266,
+    -564,   507,    -32,    -648,   861,    -400,   -357,   819,    -519,
+    -74,    392,    -423,   426,    -306,   -93,    691,    -991,   537,
+    467,    -992,   614,    426,    -823,   491,    182,    -371,   174,
+    84,     -64,    98,     -96,    23,     182,    -69,    -211,   226,
+    18,     -134,   334,    -514,   352,    378,    -623,   363,    266,
+    -592,   493,    -46,    -369,   594,    -440,   -10,    295,    -368,
+    326,    -192,   -140,   306,    -305,   140,    198,    -396,   202,
+    154,    -341,   208,    -8,     -169,   -76,    106,    20,     -347,
+    233,    30,     -193,   117,    -9,     -165,   182,    -4,     -195,
+    96,     131,    -188,   -106,   166,    -71,    -99,    57,     4,
+    -31,    -131,   101,    63,     -199,   225,    -25,    -281,   342,
+    -247,   -170,   516,    -289,   -263,   422,    -158,   -148,   363,
+    -192,   -138,   122,    62,     -105,   7,      194,    -53,    -224,
+    83,     173,    -182,   20,     178,    -274,   182,    74,     -109,
+    -5,     319,    -303,   -72,    428,    -371,   50,     271,    -204,
+    17,     161,    -256,   169,    93,     -169,   94,     -89,    139,
+    80,     -199,   325,    -67,    -83,    202,    -154,   16,     202,
+    -325,   162,    61,     -93,    201,    -278,   236,    108,    -477,
+    594,    -145,   -370,   647,    -261,   -356,   669,    -369,   -181,
+    420,    -266,   -154,   159,    -25,    53,     -40,    -22,    68,
+    -203,   144,    -2,     -173,   88,     -3,     -62,    2,      75,
+    55,     -95,    -130,   219,    -142,   -191,   164,    -170,   44,
+    0,      -246,   249,    -27,    -413,   461,    27,     -490,   292,
+    19,     -145,   13,     99,     91,     -466,   209,    295,    -773,
+    465,    210,    -680,   410,    163,    -358,   399,    -201,   87,
+    23,     -212,   270,    -230,   86,     159,    -353,   381,    -73,
+    -456,   726,    -353,   -357,   754,    -367,   -344,   657,    -59,
+    -417,   432,    35,     -309,   153,    97,     -69,    89,     -101,
+    63,     107,    -127,   106,    112,    -26,    -236,   376,    43,
+    -479,   544,    -57,    -407,   447,    -148,   -103,   195,    -198,
+    80,     156,    -228,   35,     145,    -77,    -55,    130,    -33,
+    -190,   123,    41,     -170,   74,     114,    -241,   67,     192,
+    -195,   -76,    186,    -136,   -133,   213,    -105,   -110,   144,
+    -51,    -126,   154,    -59,    -124,   147,    -49,    -132,   82,
+    26,     -130,   63,     68,     -211,   97,     131,    -224,   59,
+    184,    -250,   59,     205,    -225,   -67,    163,    -135,   -24,
+    74,     -22,    -4,     -81,    21,     71,     -137,   71,     47,
+    -120,   71,     34,     -65,    138,    -6,     -116,   112,    -47,
+    -39,    20,     -75,    64,     -7,     2,      35,     52,     -61,
+    -29,    81,     -61,    -30,    195,    -91,    -136,   261,    -11,
+    -186,   162,    -86,    -35,    152,    -106,   -32,    126,    -4,
+    49,     33,     -9,     -11,    46,     111,    -132,   -3,     204,
+    -175,   -10,    281,    -146,   -94,    226,    -126,   -36,    58,
+    -14,    61,     -172,   48,     193,    -221,   83,     149,    -279,
+    195,    130,    -357,   226,    102,    -260,   191,    16,     -223,
+    124,    14,     -144,   90,     -31,    -81,    -66,    54,     103,
+    -181,   29,     174,    -281,   92,     81,     -226,   139,    -133,
+    -41,    167,    -147,   44,     27,     -132,   107,    -34,    -122,
+    105,    -54,    17,     52,     -131,   138,    33,     -206,   158,
+    43,     -80,    24,     10,     -27,    33,     43,     -71,    15,
+    71,     -42,    14,     18,     0,      -3,     -14,    -14,    58,
+    46,     -99,    122,    105,    -202,   125,    119,    -238,   112,
+    133,    -242,   113,    129,    -301,   52,     161,    -177,   82,
+    73,     -139,   46,     122,    -119,   22,     155,    -230,   23,
+    242,    -211,   -12,    182,    -184,   -57,    190,    -34,    -101,
+    58,     -20,    6,      103,    -61,    -78,    12,     18,     12,
+    86,     -71,    -27,    43,     -24,    8,      39,     -109,   21,
+    -4,     -44,    66,     13,     -59,    61,     -39,    35,     113,
+    -179,   19,     171,    -158,   14,     112,    -133,   26,     9,
+    -43,    -9,     6,      41,     -77,    22,     80,     -61,    -63,
+    65,     -32,    -32,    125,    -105,   -11,    114,    -120,   42,
+    42,     -92,    45,     -56,    -25,    131,    -83,    -24,    97,
+    -51,    -5,     67,     -69,    7,      41,     -27,    8,      3,
+    -10,    8,      -3,     -87,    -28,    122,    -33,    -58,    124,
+    -53,    -50,    67,     -115,   -17,    111,    -112,   -30,    101,
+    -24,    -13,    41,     3,      45,     -13,    -34,    23,     23,
+    -19,    13,     -49,    -49,    68,     -68,    -32,    91,     -58,
+    -18,    73,     -19,    -27,    17,     -33,    -35,    99,     -38,
+    -99,    78,     -31,    -62,    95,     -71,    -124,   184,    -15,
+    -146,   160,    -27,    -109,   140,    -25,    -63,    84,     -34,
+    -18,    58,     -68,    -16,    22,     -87,    86,     23,     -130,
+    61,     62,     -132,   51,     168,    -139,   35,     133,    -121,
+    50,     102,    -120,   40,     126,    -87,    -40,    119,    -14,
+    -59,    78,     11,     -68,    41,     24,     -25,    55,     -2,
+    15,     21,     -73,    56,     88,     -74,    -41,    4,      -10,
+    -4,     5,      7,      -39,    -3,     -4,     -39,    94,     52,
+    -135,   42,     90,     -86,    12,     21,     -55,    -70,    -37,
+    55,     -63,    -35,    50,     -100,   21,     84,     -151,   24,
+    87,     -94,    51,     2,      -58,    104,    -61,    -70,    60,
+    -25,    -42,    -31,    55,     35,     -129,   47,     69,     -65,
+    77,     2,      -60,    110,    -32,    -69,    84,     -54,    -26,
+    98,     -28,    -7,     49,     -49,    -19,    119,    -11,    -157,
+    20,     106,    29,     -8,     -38,    -30,    72,     30,     -3,
+    1,      -32,    -11,    -9,     52,     46,     -144,   -38,    86,
+    -31,    -9,     -42,    -75,    142,    34,     -64,    79,     -109,
+    -55,    195,    -69,    -80,    48,     -49,    62,     25,     -111,
+    -42,    52,     19,     -41,    1,      -16,    -33,    44,     30,
+    -21,    17,     -2,     -30,    111,    34,     -111,   83,     55,
+    -119,   66,     62,     -89,    63,     -39,    -143,   168,    21,
+    -158,   158,    32,     -132,   134,    -3,     -77,    88,     -45,
+    -18,    117,    -51,    -71,    10,     30,     35,     -27,    -63,
+    13,     34,     23,     -23,    19,     -4,     -92,    34,     74,
+    -69,    -15,    20,     -36,    56,     -36,    -96,    69,     -34,
+    -122,   32,     31,     -51,    -3,     -21,    4,      43,     -44,
+    6,      81,     -39,    -35,    26,     -38,    -24,    29,     -16,
+    -47,    -6,     19,     -7,     -9,     41,     32,     13,     -2,
+    -21,    3,      24,     49,     -3,     -66,    14,     95,     -7,
+    -52,    80,     68,     -72,    -14,    39,     2,      24,     -6,
+    -53,    86,     21,     -78,    67,     28,     -34,    16,     -23,
+    -1,     70,     -3,     -58,    45,     33,     -94,    -34,    62,
+    41,     -11,    -27,    27,     46,     14,     -33,    -12,    44,
+    -16,    -59,    6,      45,     -3,     -42,    2,      13,     19,
+    -1,     -71,    3,      42,     -36,    6,      17,     26,     5,
+    -46,    6,      -68,    -75,    86,     -20,    -90,    80,     4,
+    -86,    5,      2,      -33,    -15,    -2,     -8,     -18,    15,
+    -7,     -25,    27,     -28,    -88,    39,     -2,     -85,    58,
+    40,     -45,    3,      17,     0,      11,     -4,     -3,     84,
+    22,     -113,   8,      94,     10,     9,      28,     6,      -3,
+    5,      -2,     23,     23,     -1,     -40,    20,     48,     -40,
+    -21,    72,     7,      -40,    -1,     27,     16,     30,     31,
+    -16,    11,     9,      -71,    -7,     62,     21,     -61,    -19,
+    78,     -2,     -22,    67,     -42,    -12,    75,     -79,    47,
+    86,     -124,   -42,    21,     4,      23,     -32,    -7,     19,
+    1,      -13,    -46,    2,      32,     -43,    -7,     86,     -16,
+    -22,    46,     -61,    -35,    11,     -64,    -38,    17,     -12,
+    -27,    20,     41,     6,      -58,    -61,    58,     -51,    -77,
+    36,     -25,    19,     93,     -76,    1,      72,     -92,    15,
+    40,     -56,    65,     13,     -29,    82,     -9,     -21,    24,
+    -83,    -5,     4,      -63,    77,     80,     -58,    -6,     -19,
+    -43,    100,    5,      -36,    63,     33,     -26,    -48,    26,
+    -18,    -75,    34,     24,     -45,    -1,     6,      -35,    -24,
+    -23,    -22,    47,     -15,    -46,    31,     -40,    -41,    74,
+    -32,    -73,    59,     -51,    -26,    143,    -29,    -42,    93,
+    -44,    -21,    56,     -7,     55,     51,     -61,    74,     111,
+    -71,    35,     124,    -123,   -3,     62,     -79,    100,    49,
+    -122,   143,    79,     -137,   72,     30,     -82,    75,     -10,
+    -48,    35,     -23,    -25,    34,     0,      -54,    -6,     34,
+    -46,    -59,    -7,     -72,    -6,     70,     -41,    -39,    23,
+    -33,    11,     104,    -44,    -30,    54,     -69,    -20,    62,
+    -75,    1,      45,     -69,    1,      40,     -59,    -15,    18,
+    -16,    38,     -1,     -52,    8,      14,     -32,    11,     -15,
+    -58,    18,     -22,    -44,    69,     40,     -50,    -21,    1,
+    -35,    -3,     -5,     -20,    40,     36,     -41,    -36,    -43,
+    -11,    48,     -34,    -40,    51,     -10,    -9,     30,     10,
+    12,     51,     51,     -8,     -16,    32,     -6,     31,     24,
+    -38,    43,     18,     -15,    53,     -10,    -55,    9,      8,
+    -28,    21,     10,     -26,    21,     10,     -9,     5,      -29,
+    -13,    38,     -1,     -11,    49,     0,      -41,    10,     23,
+    -25,    -35,    -2,     -32,    -10,    58,     -6,     -18,    16,
+    -9,     4,      11,     17,     21,     21,     12,     -2,     49,
+    -16,    -128,   21,     75,     -32,    22,     34,     -59,    48,
+    75,     -69,    -11,    -2,     -65,    39,     57,     -54,    -79,
+    -11,    -20,    -13,    38,     4,      -9,     -22,    -22,    33,
+    -7,     -52,    10,     -10,    -19,    54,     47,     -21,    -35,
+    -6,     -4,     11,     8,      -28,    1,      8,      -4,     30,
+    1,      -22,    26,     -7,     -24,    56,     25,     -45,    13,
+    24,     -32,    13,     22,     -46,    -2,     15,     -39,    28,
+    32,     -69,    0,      27,     -69,    0,      39,     -40,    28,
+    55,     -27,    -13,    0,      -14,    37,     25,     -25,    34,
+    -3,     -69,    26,     39,     -41,    -6,     29,     -7,     5,
+    66,     41,     -27,    -17,    6,      -14,    -21,    0,      29,
+    -9,     -26,    32,     -5,     -34,    60,     15,     -60,    20,
+    13,     11,     43,     -48,    -15,    88,     -13,    -55,    26,
+    -32,    -46,    35,     14,     -37,    -11,    12,     -20,    11,
+    9,      -64,    -16,    17,     5,      38,     7,      -30,    -9,
+    -49,    -11,    52,     -15,    -38,    -27,    -12,    36,     53,
+    1,      -37,    -17,    -12,    0,      31,     1,      13,     40,
+    -15,    2,      47,     -15,    -17,    28,     -2,     -4,     25,
+    -6,     -12,    2,      -17,    -9,     5,      -15,    17,     21,
+    -28,    0,      15,     -43,    -63,    -6,     -14,    -8,     37,
+    -34,    -40,    30,     -12,    -14,    37,     -13,    -16,    26,
+    -15,    -2,     13,     -37,    -13,    32,     13,     -8,     -2,
+    -12,    -8,     9,      9,      -3,     4,      13,     34,     -2,
+    -22,    40,     19,     29,     25,     -48,    -17,    23,     17,
+    7,      3,      0,      12,     37,     -1,     -25,    30,     41,
+    -7,     7,      29,     -31,    -31,    -23,    -27,    5,      2,
+    -18,    -2,     22,     9,      -6,     5,      -7,     -24,    9,
+    0,      -28,    19,     61,     -11,    -45,    21,     -28,    -65,
+    28,     33,     -44,    -27,    -6,     -26,    -8,     4,      5,
+    9,      -10,    -46,    -20,    20,     -7,     -7,     -33,    -26,
+    50,     9,      -65,    -22,    -3,     -20,    15,     21,     20,
+    24,     -16,    -27,    -13,    14,     21,     -38,    -48,    9,
+    35,     28,     21,     3,      -31,    -8,     57,     32,     -35,
+    -22,    20,     14,     12,     28,     39,     0,      -18,    44,
+    -2,     -17,    53,     0,      -27,    33,     43,     5,      -10,
+    25,     47,     -3,     -4,     36,     15,     -12,    -3,     29,
+    41,     23,     23,     -8,     -32,    15,     37,     0,      3,
+    22,     31,     1,      -20,    27,     2,      -50,    0,      33,
+    16,     -16,    -17,    18,     -26,    -34,    31,     -27,    -84,
+    -33,    4,      -5,     -22,    -17,    -28,    -66,    -24,    8,
+    -16,    -25,    -51,    -13,    45,     -11,    -49,    -26,    -49,
+    -38,    21,     10,     -52,    -58,    -19,    -4,     9,      -31,
+    -29,    55,     2,      -45,    29,     10,     -22,    49,     33,
+    -27,    -19,    -5,     30,     47,     11,     -11,    -2,     8,
+    5,      17,     8,      3,      57,     63,     28,     24,     11,
+    2,      14,     22,     7,      7,      2,      23,     33,     -2,
+    -8,     14,     7,      20,     57,     32,     -5,     12,     23,
+    10,     17,     26,     -18,    -72,    -6,     74,     61,     13,
+    -17,    -21,    -7,     29,     45,     5,      -52,    -49,    1,
+    10,     35,     40,     -46,    -66,    7,      31,     -27,    -44,
+    -12,    -41,    -22,    32,     -12,    -32,    -3,     -17,    -22,
+    -22,    -31,    -30,    -23,    -13,    3,      0,      -21,    -19,
+    -7,     -17,    -9,     18,     -40,    -64,    1,      4,      -4,
+    8,      -17,    -28,    -1,     9,      -7,     -9,     27,     6,
+    -63,    -32,    52,     25,     -46,    -23,    -6,     -11,    35,
+    29,     -50,    -44,    17,     -6,     -12,    53,     28,     -17,
+    -9,     28,     34,     -20,    -18,    22,     43,     28,     -6,
+    8,      14,     19,     28,     14,     27,     26,     12,     76,
+    66,     -18,    -2,     18,     -12,    -1,     -2,     -1,     51,
+    30,     -18,    5,      14,     -12,    2,      13,     -25,    -9,
+    32,     7,      -5,     15,     -12,    -33,    -18,    -13,    6,
+    0,      -25,    -12,    1,      -17,    0,      13,     -24,    -27,
+    4,      35,     14,     -22,    5,      13,     -18,    -30,    -10,
+    -7,     -7,     31,     23,     -27,    -26,    9,      47,     6,
+    -50,    -11,    19,     1,      11,     12,     -19,    -43,    -18,
+    10,     -6,     -3,     12,     2,      -12,    -16,    10,     9,
+    -25,    -21,    -10,    -13,    0,      8,      -1,     -9,     10,
+    4,      -34,    14,     46,     5,      18,     24,     -15,    -7,
+    20,     -1,     -13,    7,      11,     14,     11,     -2,     8,
+    27,     10,     -1,     13,     -2,     -7,     48,     44,     -15,
+    -16,    -6,     3,      7,      -35,    -25,    8,      -31,    -16,
+    30,     36,     22,     -13,    -21,    -10,    8,      2,      -58,
+    -37,    32,     25,     -1,     -25,    -21,    3,      3,      -6,
+    -11,    -3,     2,      4,      34,     22,     -25,    -19,    0,
+    -6,     -10,    -8,     -35,    -32,    8,      -3,     -20,    -11,
+    -6,     3,      8,      -8,     3,      25,     23,     -7,     -35,
+    -15,    8,      -20,    -6,     15,     -44,    -29,    19,     -5,
+    -1,     18,     28,     6,      -21,    9,      11,     -20,    -10,
+    18,     22,     6,      -2,     12,     6,      23,     34,     -20,
+    -19,    1,      -10,    34,     41,     13,     6,      3,      22,
+    11,     -4,     4,      -12,    -8,     17,     18,     12,     -1,
+    5,      9,      -6,     -2,     4,      1,      3,      2,      -6,
+    -32,    -25,    9,      18,     27,     -4,     -54,    -29,    2,
+    -3,     -18,    -38,    -28,    -10,    9,      20,     5,      -9,
+    -15,    -3,     2,      -14,    -15,    -6,     5,      10,     6,
+    3,      -11,    -9,     -5,     -20,    -13,    8,      3,      -14,
+    6,      20,     -15,    -21,    9,      19,     21,     12,     -4,
+    -21,    -17,    16,     27,     -4,     -28,    -2,     26,     9,
+    -12,    -16,    -28,    -28,    -4,     4,      -15,    -9,     3,
+    -10,    -16,    2,      17,     -10,    -26,    3,      16,     26,
+    17,     -12,    -9,     2,      -2,     -5,     -11,    5,      28,
+    1,      -14,    13,     14,     5,      18,     6,      -17,    -5,
+    7,      2,      -3,     11,     10,     -1,     50,     36,     -28,
+    21,     39,     -9,     -6,     2,      10,     36,     20,     -2,
+    -3,     -11,    -10,    -6,     -5,     -4,     -8,     2,      17,
+    1,      -13,    11,     -13,    -36,    11,     14,     -19,    -6,
+    3,      0,      20,     -5,     -24,    12,     7,      -11,    2,
+    -15,    -28,    -1,     6,      -14,    -31,    -39,    -19,    19,
+    37,     3,      -32,    -27,    -6,     13,     31,     15,     -41,
+    -41,    25,     35,     -3,     -16,    -25,    -19,    -10,    -3,
+    19,     10,     -4,     7,      -4,     -19,    -12,    -13,    -9,
+    6,      2,      -12,    -6,     12,     6,      -1,     -5,     -19,
+    -7,     7,      40,     56,     -3,     -13,    21,     24,     7,
+    -11,    -9,     -3,     24,     28,     -10,    1,      12,     21,
+    24,     -16,    -15,    4,      -7,     -2,     19,     13,     -11,
+    -7,     -8,     15,     41,     5,      -16,    -18,    -11,    26,
+    26,     -5,     -12,    -14,    -6,     10,     8,      -8,     -16,
+    -16,    -3,     10,     1,      -3,     -3,     -2,     -15,    -18,
+    6,      -4,     -4,     21,     4,      -2,     15,     13,     0,
+    -2,     12,     7,      -15,    -9,     1,      -2,     2,      -1,
+    -9,     -15,    -17,    -14,    -10,    1,      -4,     -16,    -17,
+    -1,     18,     8,      1,      22,     11,     -19,    -10,    4,
+    -23,    -29,    0,      -2,     -14,    -6,     13,     7,      -23,
+    -13,    10,     9,      11,     10,     4,      -4,     -4,     1,
+    6,      14,     9,      2,      0,      2,      6,      4,      -9,
+    -18,    -8,     8,      18,     8,      13,     9,      -27,    -22,
+    -10,    -24,    -9,     17,     11,     2,      9,      3,      -13,
+    -10,    -1,     -7,     -1,     10,     -4,     1,      16,     12,
+    -6,     -14,    -2,     -5,     -1,     0,      -1,     6,      -9,
+    -3,     12,     4,      1,      -2,     2,      17,     24,     22,
+    9,      8,      21,     14,     -2,     -2,     4,      -1,     -7,
+    -7,     -6,     -1,     -6,     17,     30,     -7,     -10,    -3,
+    -19,    -18,    2,      21,     4,      -20,    -6,     -1,     -18,
+    -14,    -6,     -7,     -1,     6,      10,     8,      -5,     0,
+    10,     -22,    -40,    -22,    4,      34,     16,     -19,    -16,
+    -12,    -17,    -16,    -17,    -29,    -28,    -4,     10,     16,
+    22,     13,     4,      -1,     -5,     16,     15,     -11,    -6,
+    9,      3,      -14,    -22,    -19,    -12,    5,      -5,     -15,
+    3,      9,      27,     17,     -4,     8,      -2,     1,      16,
+    11,     9,      9,      8,      -14,    -16,    7,      -5,     -15,
+    -11,    -5,     19,     25,     25,     43,     21,     -9,     -9,
+    -19,    -10,    14,     -11,    -19,    8,      3,      1,      11,
+    -1,     -24,    -20,    -1,     2,      7,      24,     22,     11,
+    8,      6,      -2,     -11,    -3,     -2,     -4,     0,      -7,
+    0,      6,      -1,     -16,    -35,    -8,     8,      -11,    -6,
+    6,      18,     16,     7,      12,     5,      -2,     -3,     -10,
+    -21,    -27,    -10,    -3,     -3,     8,      0,      -9,     -10,
+    -3,     0,      -5,     6,      9,      19,     23,     8,      -5,
+    -19,    -16,    -5,     -6,     -27,    -22,    1,      6,      8,
+    2,      -9,     -13,    -15,    -18,    -13,    4,      25,     29,
+    26,     -2,     -22,    1,      8,      1,      -6,     -6,     -7,
+    -20,    0,      13,     -14,    -24,    -24,    -21,    2,      14,
+    16,     23,     15,     10,     10,     5,      0,      -26,    -32,
+    3,      19,     5,      -8,     -7,     -8,     -3,     17,     27,
+    -7,     -28,    10,     32,     10,     1,      10,     3,      -4,
+    22,     24,     -31,    -40,    0,      6,      5,      17,     17,
+    1,      10,     30,     8,      -12,    -6,     9,      6,      -12,
+    -5,     1,      -4,     6,      11,     0,      -9,     -4,     -3,
+    -4,     -3,     2,      0,      -2,     -9,     -27,    -23,    2,
+    13,     -6,     -9,     -3,     -12,    -2,     10,     6,      -7,
+    -19,    -31,    -13,    16,     11,     -3,     -13,    -15,    0,
+    7,      -3,     -7,     -1,     -4,     7,      15,     0,      -12,
+    -8,     -1,     -7,     -12,    -21,    -17,    5,      30,     25,
+    -6,     -6,     0,      -12,    -8,     2,      13,     11,     1,
+    5,      4,      4,      10,     -1,     -20,    -12,    -4,     3,
+    15,     11,     -7,     -24,    -4,     8,      -2,     -14,    -25,
+    -17,    7,      21,     14,     1,      0,      12,     17,     13,
+    6,      1,      6,      14,     11,     -10,    -21,    -12,    -4,
+    3,      -2,     -21,    -24,    -2,     12,     14,     17,     4,
+    -2,     11,     11,     11,     1,      -34,    -32,    -5,     10,
+    7,      -11,    -12,    6,      7,      -4,     -10,    -15,    -5,
+    17,     21,     0,      -15,    -15,    -1,     5,      -18,    -18,
+    -10,    -9,     24,     27,     -9,     -14,    0,      9,      25,
+    22,     1,      -7,     -2,     16,     13,     -14,    -10,    7,
+    0,      2,      15,     2,      -9,     5,      10,     -5,     -3,
+    10,     3,      0,      15,     15,     -1,     -3,     8,      6,
+    -7,     -7,     2,      0,      -4,     5,      -8,     -37,    -28,
+    -1,     8,      6,      10,     -1,     -12,    12,     28,     8,
+    -17,    -16,    -15,    -17,    1,      6,      -4,     -8,     -4,
+    -15,    -15,    6,      -9,     -15,    10,     9,      -13,    -8,
+    5,      -2,     -10,    5,      12,     -27,    -33,    9,      8,
+    -16,    -3,     16,     -3,     -7,     22,     22,     10,     5,
+    -11,    -16,    -4,     9,      12,     6,      -3,     2,      2,
+    -1,     4,      -7,     -8,     1,      8,      19,
+};
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.h b/tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.h
new file mode 100644
index 0000000000000000000000000000000000000000..33aeea516fb8c7fcb080b3b971bf5d69b81b9c4c
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/yes_1000ms_sample_data.h
@@ -0,0 +1,29 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This data was created from the PCM data in a WAV file held in v2 of the
+// Speech Commands test dataset, at the path:
+// speech_commands_test_set_v0.02/yes/f2e59fea_nohash_1.wav
+// This should contain all 16,000 samples from the one-second file.
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_YES_1000MS_SAMPLE_DATA_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_YES_1000MS_SAMPLE_DATA_H_
+
+#include <cstdint>
+
+extern const int g_yes_1000ms_sample_data_size;
+extern const int16_t g_yes_1000ms_sample_data[];
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_YES_1000MS_SAMPLE_DATA_H_
diff --git a/tensorflow/lite/experimental/micro/kernels/BUILD b/tensorflow/lite/experimental/micro/kernels/BUILD
index a54fd41760d58f2023e6b7b2aac72ac5f5e95ae3..e2d3164d4c3828bbd067e068fdbf0f6ba3babc7f 100644
--- a/tensorflow/lite/experimental/micro/kernels/BUILD
+++ b/tensorflow/lite/experimental/micro/kernels/BUILD
@@ -22,7 +22,6 @@ cc_library(
     copts = tflite_copts(),
     deps = [
         "//tensorflow/lite/c:c_api_internal",
-        "//tensorflow/lite/experimental/micro:micro_framework",
         "//tensorflow/lite/kernels:kernel_util",
         "//tensorflow/lite/kernels:op_macros",
         "//tensorflow/lite/kernels:padding",
@@ -43,27 +42,10 @@ cc_library(
     copts = tflite_copts(),
     deps = [
         ":micro_ops",
-        "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
     ],
 )
 
-cc_library(
-    name = "test_utils",
-    srcs = [
-    ],
-    hdrs = [
-        "test_utils.h",
-    ],
-    copts = tflite_copts(),
-    deps = [
-        "//tensorflow/lite/c:c_api_internal",
-        "//tensorflow/lite/core/api",
-        "//tensorflow/lite/experimental/micro:micro_framework",
-        "//tensorflow/lite/experimental/micro/testing:micro_test",
-    ],
-)
-
 tflite_micro_cc_test(
     name = "depthwise_conv_test",
     srcs = [
@@ -71,7 +53,6 @@ tflite_micro_cc_test(
     ],
     deps = [
         ":all_ops_resolver",
-        ":test_utils",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
         "//tensorflow/lite/experimental/micro/testing:micro_test",
@@ -85,7 +66,6 @@ tflite_micro_cc_test(
     ],
     deps = [
         ":all_ops_resolver",
-        ":test_utils",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
         "//tensorflow/lite/experimental/micro/testing:micro_test",
@@ -99,7 +79,6 @@ tflite_micro_cc_test(
     ],
     deps = [
         ":all_ops_resolver",
-        ":test_utils",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
         "//tensorflow/lite/experimental/micro/testing:micro_test",
diff --git a/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test.cc b/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test.cc
index f70437a4b943e6e71547e010a0fea9ab551194db..05ba8798c0dc34eab5c563489cf9fc928325d00f 100644
--- a/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test.cc
+++ b/tensorflow/lite/experimental/micro/kernels/depthwise_conv_test.cc
@@ -16,9 +16,9 @@ limitations under the License.
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/experimental/micro/kernels/all_ops_resolver.h"
-#include "tensorflow/lite/experimental/micro/kernels/test_utils.h"
 #include "tensorflow/lite/experimental/micro/simple_tensor_allocator.h"
 #include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+#include "tensorflow/lite/experimental/micro/testing/test_utils.h"
 
 namespace tflite {
 namespace testing {
diff --git a/tensorflow/lite/experimental/micro/kernels/fully_connected_test.cc b/tensorflow/lite/experimental/micro/kernels/fully_connected_test.cc
index 300f8aaf78ad38a2cd4a7c715cf63315a0b2e751..c2e1446848db68a4be42eab282da34e38999670f 100644
--- a/tensorflow/lite/experimental/micro/kernels/fully_connected_test.cc
+++ b/tensorflow/lite/experimental/micro/kernels/fully_connected_test.cc
@@ -16,9 +16,9 @@ limitations under the License.
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/experimental/micro/kernels/all_ops_resolver.h"
-#include "tensorflow/lite/experimental/micro/kernels/test_utils.h"
 #include "tensorflow/lite/experimental/micro/simple_tensor_allocator.h"
 #include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+#include "tensorflow/lite/experimental/micro/testing/test_utils.h"
 
 namespace tflite {
 namespace testing {
diff --git a/tensorflow/lite/experimental/micro/kernels/softmax_test.cc b/tensorflow/lite/experimental/micro/kernels/softmax_test.cc
index 7253b3be8ce20ff6d30ca725060da606c416c8e1..8933b6c0ed090b175c5d42282dc0ec6f22142206 100644
--- a/tensorflow/lite/experimental/micro/kernels/softmax_test.cc
+++ b/tensorflow/lite/experimental/micro/kernels/softmax_test.cc
@@ -16,9 +16,9 @@ limitations under the License.
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/experimental/micro/kernels/all_ops_resolver.h"
-#include "tensorflow/lite/experimental/micro/kernels/test_utils.h"
 #include "tensorflow/lite/experimental/micro/simple_tensor_allocator.h"
 #include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+#include "tensorflow/lite/experimental/micro/testing/test_utils.h"
 
 namespace tflite {
 namespace testing {
diff --git a/tensorflow/lite/experimental/micro/mbed/debug_log.cc b/tensorflow/lite/experimental/micro/mbed/debug_log.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d4a4a5a8429bb7867c225a97696c28eb5ad8d3b7
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/mbed/debug_log.cc
@@ -0,0 +1,24 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/debug_log.h"
+
+#include <mbed.h>
+
+// On mbed platforms, we set up a serial port and write to it for debug logging.
+extern "C" void DebugLog(const char* s) {
+  static Serial pc(USBTX, USBRX);
+  pc.printf("%s", s);
+}
diff --git a/tensorflow/lite/experimental/micro/micro_error_reporter.h b/tensorflow/lite/experimental/micro/micro_error_reporter.h
index 0ab853ec2ac915a8eb3da87eb8b86f2ecec697c7..6c18367c95fc9f07eb67b90a0e736b64271d9291 100644
--- a/tensorflow/lite/experimental/micro/micro_error_reporter.h
+++ b/tensorflow/lite/experimental/micro/micro_error_reporter.h
@@ -17,26 +17,8 @@ limitations under the License.
 
 #include "tensorflow/lite/core/api/error_reporter.h"
 #include "tensorflow/lite/experimental/micro/compatibility.h"
-
-#ifdef TF_LITE_MCU_DEBUG_LOG
-// These functions should be supplied by the micro target library
-extern "C" {
-#include <stdint.h>
-void DebugLog(const char* s);
-void DebugLogInt32(int32_t i);
-void DebugLogUInt32(uint32_t i);
-void DebugLogHex(uint32_t i);
-void DebugLogFloat(float i);
-}
-#else  // TF_LITE_MCU_DEBUG_LOG
-#include <cstdint>
-#include <cstdio>
-static void inline DebugLog(const char* s) { fprintf(stderr, "%s", s); }
-static void inline DebugLogInt32(int32_t i) { fprintf(stderr, "%d", i); }
-static void inline DebugLogUInt32(uint32_t i) { fprintf(stderr, "%d", i); }
-static void inline DebugLogHex(uint32_t i) { fprintf(stderr, "0x%8x", i); }
-static void inline DebugLogFloat(float i) { fprintf(stderr, "%f", i); }
-#endif  // TF_LITE_MCU_DEBUG_LOG
+#include "tensorflow/lite/experimental/micro/debug_log.h"
+#include "tensorflow/lite/experimental/micro/debug_log_numbers.h"
 
 namespace tflite {
 
diff --git a/tensorflow/lite/experimental/micro/riscv32_mcu/README.md b/tensorflow/lite/experimental/micro/riscv32_mcu/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..5477d7ae951cbd8c47312f51acdea16d87f5f910
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/riscv32_mcu/README.md
@@ -0,0 +1,7 @@
+# RISC-V MCU
+
+This folder contains TFLite kernel operations optimized for RISC-V micro
+controllers.
+
+It is designed to be portable even to 'bare metal', so it follows the same
+design goals as the micro experimental port.
diff --git a/tensorflow/lite/experimental/micro/riscv32_mcu/debug_log.cc b/tensorflow/lite/experimental/micro/riscv32_mcu/debug_log.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d1c2df866e9f8e4c99aabcc7fe73e4879b079b42
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/riscv32_mcu/debug_log.cc
@@ -0,0 +1,18 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// TODO(b/121324430): Add test for DebugLog fuctions
+// TODO(b/121275099): Remove dependency on debug_log once the platform supports
+// printf
+
+#include <stdio.h>
+
+extern "C" void DebugLog(const char* s) { puts(s); }
diff --git a/tensorflow/lite/experimental/micro/testing/BUILD b/tensorflow/lite/experimental/micro/testing/BUILD
index 5a31a709ca3f0205b8764528d6e8f2c0fe0f93d0..1623df5b8650a34aa900cb6d362e444bc640fc8e 100644
--- a/tensorflow/lite/experimental/micro/testing/BUILD
+++ b/tensorflow/lite/experimental/micro/testing/BUILD
@@ -10,8 +10,10 @@ cc_library(
     name = "micro_test",
     hdrs = [
         "micro_test.h",
+        "test_utils.h",
     ],
     deps = [
+        "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
     ],
 )
diff --git a/tensorflow/lite/experimental/micro/testing/Dockerfile.riscv b/tensorflow/lite/experimental/micro/testing/Dockerfile.riscv
new file mode 100644
index 0000000000000000000000000000000000000000..4f7ac555e6f89c1d209dc6a4d62786d357db91ed
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/testing/Dockerfile.riscv
@@ -0,0 +1,24 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# This docker configuration file lets you emulate a Hifive1 board
+# on an x86 desktop or laptop, which can be useful for debugging and
+# automated testing.
+FROM antmicro/renode:latest
+
+LABEL maintainer="Pete Warden <petewarden@google.com>"
+
+RUN apt-get update
+RUN apt-get install -y curl git unzip make g++
\ No newline at end of file
diff --git a/tensorflow/lite/experimental/micro/testing/micro_test.h b/tensorflow/lite/experimental/micro/testing/micro_test.h
index 2f20dd5ac77dfd3f304c7cc93be0b865a0c2f0cb..32e9a57f76ecc055c67c0ede8d1c83550c602aab 100644
--- a/tensorflow/lite/experimental/micro/testing/micro_test.h
+++ b/tensorflow/lite/experimental/micro/testing/micro_test.h
@@ -107,13 +107,13 @@ extern tflite::ErrorReporter* reporter;
     }                                                                          \
   } while (false)
 
-#define TF_LITE_MICRO_EXPECT_EQ(x, y)                                         \
-  do {                                                                        \
-    if ((x) != (y)) {                                                         \
-      micro_test::reporter->Report(#x " == " #y " failed at %s:%d", __FILE__, \
-                                   __LINE__);                                 \
-      micro_test::did_test_fail = true;                                       \
-    }                                                                         \
+#define TF_LITE_MICRO_EXPECT_EQ(x, y)                                          \
+  do {                                                                         \
+    if ((x) != (y)) {                                                          \
+      micro_test::reporter->Report(#x " == " #y " failed at %s:%d (%d vs %d)", \
+                                   __FILE__, __LINE__, (x), (y));              \
+      micro_test::did_test_fail = true;                                        \
+    }                                                                          \
   } while (false)
 
 #define TF_LITE_MICRO_EXPECT_NE(x, y)                                         \
diff --git a/tensorflow/lite/experimental/micro/testing/sifive_fe310.resc b/tensorflow/lite/experimental/micro/testing/sifive_fe310.resc
new file mode 100644
index 0000000000000000000000000000000000000000..c84ce5091c778fc3226ad4a7dbb0230d38037438
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/testing/sifive_fe310.resc
@@ -0,0 +1,20 @@
+:name: SiFive-FE310
+:description: This script runs Zephyr RTOS shell sample on SiFive-FE310 platform.
+
+$name?="SiFive-FE310"
+
+using sysbus
+mach create $name
+machine LoadPlatformDescription @platforms/cpus/sifive-fe310.repl
+
+$bin?=@/workspace/tensorflow/lite/experimental/micro/tools/make/gen/riscv32_mcu_riscv32_mcu/bin/micro_speech_test
+
+showAnalyzer uart0 Antmicro.Renode.Analyzers.LoggingUartAnalyzer
+logFile @/tmp/renode_riscv_log.txt
+
+sysbus LoadELF $bin
+
+sysbus Tag <0x10008000 4> "PRCI_HFROSCCFG" 0xFFFFFFFF
+sysbus Tag <0x10008008 4> "PRCI_PLLCFG" 0xFFFFFFFF
+
+cpu PerformanceInMips 320
diff --git a/tensorflow/lite/experimental/micro/testing/test_ecm3531_binary.sh b/tensorflow/lite/experimental/micro/testing/test_ecm3531_binary.sh
new file mode 100755
index 0000000000000000000000000000000000000000..1647cf82a276d7c1725c7c3334693e0e1b7e057c
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/testing/test_ecm3531_binary.sh
@@ -0,0 +1,16 @@
+#!/bin/bash -e
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
diff --git a/tensorflow/lite/experimental/micro/kernels/test_utils.h b/tensorflow/lite/experimental/micro/testing/test_utils.h
similarity index 91%
rename from tensorflow/lite/experimental/micro/kernels/test_utils.h
rename to tensorflow/lite/experimental/micro/testing/test_utils.h
index 95f2d8a9d217a1b1f23c0198ddce5156e1c6cb36..e37eaf46e0815087cdc48c6aa23353f6f1cf9d7f 100644
--- a/tensorflow/lite/experimental/micro/kernels/test_utils.h
+++ b/tensorflow/lite/experimental/micro/testing/test_utils.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_KERNELS_TEST_UTILS_H_
-#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_KERNELS_TEST_UTILS_H_
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_TESTING_TEST_UTILS_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_TESTING_TEST_UTILS_H_
 
 #include <cstdarg>
 #include <initializer_list>
@@ -21,8 +21,7 @@ limitations under the License.
 
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
-#include "tensorflow/lite/core/api/error_reporter.h"
-#include "tensorflow/lite/experimental/micro/kernels/test_utils.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
 #include "tensorflow/lite/experimental/micro/testing/micro_test.h"
 
 namespace tflite {
@@ -164,7 +163,20 @@ inline TfLiteTensor CreateQuantized32Tensor(std::initializer_list<int32_t> data,
   return CreateQuantized32Tensor(data.begin(), dims, name, min, max);
 }
 
+// Do a simple string comparison for testing purposes, without requiring the
+// standard C library.
+inline int TestStrcmp(const char* a, const char* b) {
+  if ((a == nullptr) || (b == nullptr)) {
+    return -1;
+  }
+  while ((*a != 0) && (*a == *b)) {
+    a++;
+    b++;
+  }
+  return *(const unsigned char*)a - *(const unsigned char*)b;
+}
+
 }  // namespace testing
 }  // namespace tflite
 
-#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_KERNELS_TEST_UTILS_H_
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_TESTING_TEST_UTILS_H_
diff --git a/tensorflow/lite/experimental/micro/tools/ci_build/ci_build_micro_projects.sh b/tensorflow/lite/experimental/micro/tools/ci_build/ci_build_micro_projects.sh
new file mode 100755
index 0000000000000000000000000000000000000000..dcec7269bdc95ab57204f3b4cbc17f9d3cacadc0
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/ci_build/ci_build_micro_projects.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Creates the project file distributions for the TensorFlow Lite Micro test and
+# example targets aimed at embedded platforms.
+#
+# Usage: ci_build_micro_projects.sh <TARGET OS> <TAGS>
+#
+# For example:
+# ci_build_micro_projects.sh mbed "CMSIS disco_f746ng"
+
+set -e
+set -x
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ROOT_DIR=${SCRIPT_DIR}/../../../../../..
+cd ${ROOT_DIR}
+pwd
+
+tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh
+
+make -f tensorflow/lite/experimental/micro/tools/make/Makefile \
+  TARGET=${1} \
+  TAGS="${2}" \
+  generate_projects
diff --git a/tensorflow/lite/experimental/micro/tools/make/.gitignore b/tensorflow/lite/experimental/micro/tools/make/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..752f078fb56ca734056d694d0528943a82a8ef3e
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/.gitignore
@@ -0,0 +1,2 @@
+downloads
+gen
diff --git a/tensorflow/lite/experimental/micro/tools/make/Makefile b/tensorflow/lite/experimental/micro/tools/make/Makefile
index 0caf0ca099e0520f90530b02f9a95efbe6e3d299..1179b2850675decb0b18dadbf4e55ab667defb3b 100644
--- a/tensorflow/lite/experimental/micro/tools/make/Makefile
+++ b/tensorflow/lite/experimental/micro/tools/make/Makefile
@@ -1,5 +1,9 @@
+
 MAKEFILE_DIR := tensorflow/lite/experimental/micro/tools/make
 
+# Pull in some convenience functions.
+include $(MAKEFILE_DIR)/helper_functions.inc
+
 # Try to figure out the host system
 HOST_OS :=
 ifeq ($(OS),Windows_NT)
@@ -21,17 +25,31 @@ HOST_ARCH := $(shell if [[ $(shell uname -m) =~ i[345678]86 ]]; then echo x86_32
 TARGET := $(HOST_OS)
 TARGET_ARCH := $(HOST_ARCH)
 
+# Specify TAGS on the command line to add a particular set of specialized
+# implementations, for example TAGS="CMSIS disco_f746ng" to target a Discovery
+# STM32F746NG board, using the CMSIS library's implementations where possible.
+ALL_TAGS := $(TAGS) $(TARGET)
+
+# This is obviously horrible.  We need to generate these 3 versions of the
+# include directories from one source.
 INCLUDES := \
 -I. \
--I$(MAKEFILE_DIR)/../../../../../ \
--I$(MAKEFILE_DIR)/../../../../../../ \
 -I$(MAKEFILE_DIR)/downloads/ \
 -I$(MAKEFILE_DIR)/downloads/gemmlowp \
--I$(MAKEFILE_DIR)/downloads/flatbuffers/include \
--I$(OBJDIR)
-# This is at the end so any globally-installed frameworks like protobuf don't
-# override local versions in the source tree.
-INCLUDES += -I/usr/local/include
+-I$(MAKEFILE_DIR)/downloads/flatbuffers/include
+
+# Same list of paths, but now relative to the generated project files.
+GENERATED_PROJECT_INCLUDES := \
+-I. \
+-I./third_party/gemmlowp \
+-I./third_party/flatbuffers/include
+
+# Same list of paths, but now in the format the generate_keil_project.py
+# script expects them.
+PROJECT_INCLUDES := \
+. \
+third_party/gemmlowp \
+third_party/flatbuffers/include
 
 TEST_SCRIPT := tensorflow/lite/experimental/micro/testing/test_linux_binary.sh
 
@@ -52,33 +70,13 @@ CC_PREFIX :=
 # runtime that can be linked in to other programs.
 MICROLITE_LIB_NAME := libtensorflow-microlite.a
 
-# Test binary for the microcontroller speech model.
-MICRO_SPEECH_TEST_SRCS := \
-tensorflow/lite/experimental/micro/examples/micro_speech/micro_speech_test.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/no_features_data.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/yes_features_data.cc
-
-# Test binary for the microcontroller speech model.
-PREPROCESSOR_TEST_SRCS := \
-tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor_test.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/no_30ms_sample_data.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/yes_30ms_sample_data.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/no_power_spectrum_data.cc \
-tensorflow/lite/experimental/micro/examples/micro_speech/yes_power_spectrum_data.cc
-
-PREPROCESSOR_REFERENCE_TEST_SRCS = \
-$(PREPROCESSOR_TEST_SRCS) \
-tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc
-
-PREPROCESSOR_FIXED_TEST_SRCS += \
-$(PREPROCESSOR_TEST_SRCS) \
-tensorflow/lite/experimental/micro/examples/micro_speech/fixed_point/preprocessor.cc
-
 MICROLITE_TEST_SRCS := \
 $(wildcard tensorflow/lite/experimental/micro/*test.cc) \
 $(wildcard tensorflow/lite/experimental/micro/kernels/*test.cc)
 
+MICROLITE_TEST_HDRS := \
+$(wildcard tensorflow/lite/experimental/micro/testing/*.h)
+
 MICROLITE_CC_BASE_SRCS := \
 $(wildcard tensorflow/lite/experimental/micro/*.cc) \
 $(wildcard tensorflow/lite/experimental/micro/kernels/*.cc) \
@@ -89,6 +87,63 @@ tensorflow/lite/core/api/op_resolver.cc \
 tensorflow/lite/kernels/kernel_util.cc \
 tensorflow/lite/kernels/internal/quantization_util.cc
 MICROLITE_CC_SRCS := $(filter-out $(MICROLITE_TEST_SRCS), $(MICROLITE_CC_BASE_SRCS))
+MICROLITE_CC_SRCS := $(call specialize,$(MICROLITE_CC_SRCS))
+
+MICROLITE_CC_HDRS := \
+$(wildcard tensorflow/lite/experimental/micro/*.h) \
+$(wildcard tensorflow/lite/experimental/micro/kernels/*.h) \
+LICENSE \
+tensorflow/lite/c/c_api_internal.h \
+tensorflow/lite/c/builtin_op_data.h \
+tensorflow/lite/core/api/error_reporter.h \
+tensorflow/lite/core/api/flatbuffer_conversions.h \
+tensorflow/lite/core/api/op_resolver.h \
+tensorflow/lite/kernels/kernel_util.h \
+tensorflow/lite/kernels/op_macros.h \
+tensorflow/lite/kernels/padding.h \
+tensorflow/lite/kernels/internal/common.h \
+tensorflow/lite/kernels/internal/compatibility.h \
+tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h \
+tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h \
+tensorflow/lite/kernels/internal/reference/fully_connected.h \
+tensorflow/lite/kernels/internal/reference/softmax.h \
+tensorflow/lite/kernels/internal/round.h \
+tensorflow/lite/kernels/internal/tensor_ctypes.h \
+tensorflow/lite/kernels/internal/types.h \
+tensorflow/lite/kernels/internal/quantization_util.h \
+tensorflow/lite/schema/schema_generated.h \
+tensorflow/lite/version.h
+
+THIRD_PARTY_CC_HDRS := \
+third_party/gemmlowp/fixedpoint/fixedpoint.h \
+third_party/gemmlowp/fixedpoint/fixedpoint_sse.h \
+third_party/gemmlowp/internal/detect_platform.h \
+third_party/gemmlowp/LICENSE \
+third_party/flatbuffers/include/flatbuffers/base.h \
+third_party/flatbuffers/include/flatbuffers/stl_emulation.h \
+third_party/flatbuffers/include/flatbuffers/flatbuffers.h \
+third_party/flatbuffers/LICENSE.txt
+
+MAKE_PROJECT_FILES := \
+  README_MAKE.md \
+  Makefile \
+  .vscode/tasks.json
+
+MBED_PROJECT_FILES := \
+  README_MBED.md \
+  mbed-os.lib \
+  mbed_app.json \
+  .vscode/tasks.json
+
+KEIL_PROJECT_FILES := \
+  README_KEIL.md \
+  keil_project.uvprojx
+
+ALL_PROJECT_TARGETS :=
+
+KEIL_PROJECT_FILES := \
+  README_KEIL.md \
+  keil_project.uvprojx
 
 # These target-specific makefiles should modify or replace options like
 # CXXFLAGS or LIBS to work for a specific targetted architecture. All logic
@@ -96,10 +151,9 @@ MICROLITE_CC_SRCS := $(filter-out $(MICROLITE_TEST_SRCS), $(MICROLITE_CC_BASE_SR
 # keep this main makefile focused on the sources and dependencies.
 include $(wildcard $(MAKEFILE_DIR)/targets/*_makefile.inc)
 
+ALL_TAGS += $(TARGET_ARCH)
+
 ALL_SRCS := \
-	$(MICRO_SPEECH_TEST_SRCS) \
-	$(PREPROCESSOR_REFERENCE_TEST_SRCS) \
-	$(PREPROCESSOR_FIXED_TEST_SRCS) \
 	$(MICROLITE_CC_SRCS) \
 	$(MICROLITE_TEST_SRCS)
 
@@ -108,31 +162,22 @@ GENDIR := $(MAKEFILE_DIR)/gen/$(TARGET)_$(TARGET_ARCH)/
 OBJDIR := $(GENDIR)obj/
 BINDIR := $(GENDIR)bin/
 LIBDIR := $(GENDIR)lib/
+PRJDIR := $(GENDIR)prj/
 
 MICROLITE_LIB_PATH := $(LIBDIR)$(MICROLITE_LIB_NAME)
 
-MICRO_SPEECH_TEST_BINARY := $(BINDIR)micro_speech_test
-PREPROCESSOR_REFERENCE_TEST_BINARY := $(BINDIR)preprocessor_reference_test
-PREPROCESSOR_FIXED_TEST_BINARY := $(BINDIR)preprocessor_fixed_test
-
 CXX := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}g++
 CC := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}gcc
 AR := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}ar
 
-MICRO_SPEECH_TEST_OBJS := $(addprefix $(OBJDIR), \
-$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(MICRO_SPEECH_TEST_SRCS))))
-
-PREPROCESSOR_REFERENCE_TEST_OBJS := $(addprefix $(OBJDIR), \
-$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(PREPROCESSOR_REFERENCE_TEST_SRCS))))
-
-PREPROCESSOR_FIXED_TEST_OBJS := $(addprefix $(OBJDIR), \
-$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(PREPROCESSOR_FIXED_TEST_SRCS))))
+# Load the examples.
+include $(wildcard tensorflow/lite/experimental/micro/examples/*/Makefile.inc)
 
 MICROLITE_LIB_OBJS := $(addprefix $(OBJDIR), \
 $(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(MICROLITE_CC_SRCS))))
 
-MICROLITE_TEST_TARGETS := $(addprefix $(BINDIR), \
-$(patsubst %_test.cc,%.test_target,$(MICROLITE_TEST_SRCS)))
+MICROLITE_LIB_OBJS += $(addprefix $(OBJDIR), \
+$(patsubst %.S,%.o,$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(THIRD_PARTY_CC_SRCS)))))
 
 # For normal manually-created TensorFlow C++ source files.
 $(OBJDIR)%.o: %.cc
@@ -144,8 +189,13 @@ $(OBJDIR)%.o: %.c
 	@mkdir -p $(dir $@)
 	$(CC) $(CCFLAGS) $(INCLUDES) -c $< -o $@
 
+# For normal manually-created TensorFlow ASM source files.
+$(OBJDIR)%.o: %.S
+	@mkdir -p $(dir $@)
+	$(CC) $(CCFLAGS) $(INCLUDES) -c $< -o $@
+
 # The target that's compiled if there's no command-line arguments.
-all: $(MICROLITE_LIB_PATH) $(MICRO_SPEECH_TEST_BINARY) $(PREPROCESSOR_TEST_BINARY)
+all: $(MICROLITE_LIB_PATH)
 
 microlite: $(MICROLITE_LIB_PATH)
 
@@ -158,42 +208,6 @@ $(MICROLITE_LIB_PATH): tensorflow/lite/schema/schema_generated.h $(MICROLITE_LIB
 	@mkdir -p $(dir $@)
 	$(AR) $(ARFLAGS) $(MICROLITE_LIB_PATH) $(MICROLITE_LIB_OBJS)
 
-$(MICRO_SPEECH_TEST_BINARY): $(MICRO_SPEECH_TEST_OBJS) $(MICROLITE_LIB_PATH)
-	@mkdir -p $(dir $@)
-	$(CXX) $(CXXFLAGS) $(INCLUDES) \
-	-o $(MICRO_SPEECH_TEST_BINARY) $(MICRO_SPEECH_TEST_OBJS) \
-	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
-
-micro_speech_test: $(MICRO_SPEECH_TEST_BINARY)
-micro_speech_test_bin: $(MICRO_SPEECH_TEST_BINARY).bin
-
-test_micro_speech: $(MICRO_SPEECH_TEST_BINARY)
-	$(TEST_SCRIPT) $(MICRO_SPEECH_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
-
-$(PREPROCESSOR_REFERENCE_TEST_BINARY): $(PREPROCESSOR_REFERENCE_TEST_OBJS) $(MICROLITE_LIB_PATH)
-	@mkdir -p $(dir $@)
-	$(CXX) $(CXXFLAGS) $(INCLUDES) \
-	-o $(PREPROCESSOR_REFERENCE_TEST_BINARY) $(PREPROCESSOR_REFERENCE_TEST_OBJS) \
-	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
-
-preprocessor_reference_test: $(PREPROCESSOR_REFERENCE_TEST_BINARY)
-preprocessor_reference_test_bin: $(PREPROCESSOR_REFERENCE_TEST_BINARY).bin
-
-test_preprocessor_reference: $(PREPROCESSOR_REFERENCE_TEST_BINARY)
-	$(TEST_SCRIPT) $(PREPROCESSOR_REFERENCE_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
-
-$(PREPROCESSOR_FIXED_TEST_BINARY): $(PREPROCESSOR_FIXED_TEST_OBJS) $(MICROLITE_LIB_PATH)
-	@mkdir -p $(dir $@)
-	$(CXX) $(CXXFLAGS) $(INCLUDES) \
-	-o $(PREPROCESSOR_FIXED_TEST_BINARY) $(PREPROCESSOR_FIXED_TEST_OBJS) \
-	$(LIBFLAGS) $(MICROLITE_LIB_PATH) $(LDFLAGS) $(MICROLITE_LIBS)
-
-preprocessor_fixed_test: $(PREPROCESSOR_FIXED_TEST_BINARY)
-preprocessor_fixed_test_bin: $(PREPROCESSOR_FIXED_TEST_BINARY).bin
-
-test_preprocessor_fixed: $(PREPROCESSOR_FIXED_TEST_BINARY)
-	$(TEST_SCRIPT) $(PREPROCESSOR_FIXED_TEST_BINARY) '~~~ALL TESTS PASSED~~~'
-
 $(BINDIR)%_test : $(OBJDIR)%_test.o $(MICROLITE_LIB_PATH)
 	@mkdir -p $(dir $@)
 	$(CXX) $(CXXFLAGS) $(INCLUDES) \
@@ -203,9 +217,21 @@ $(BINDIR)%_test : $(OBJDIR)%_test.o $(MICROLITE_LIB_PATH)
 $(BINDIR)%.test_target: $(BINDIR)%_test
 	$(TEST_SCRIPT) $< '~~~ALL TESTS PASSED~~~'
 
-$(info $(MICROLITE_TEST_TARGETS))
+# snease: Add %.bin rule here since BINDIR is now defined
+# These are microcontroller-specific rules for converting the ELF output
+# of the linker into a binary image that can be loaded directly.
+OBJCOPY := $(TARGET_TOOLCHAIN_PREFIX)objcopy
+$(BINDIR)%.bin: $(BINDIR)%
+	@mkdir -p $(dir $@)
+	$(OBJCOPY) $< $@ -O binary
+
+# Generate standalone makefile projects for all of the test targets.
+$(foreach TEST_TARGET,$(MICROLITE_TEST_SRCS),\
+$(eval $(call microlite_test,$(notdir $(basename $(TEST_TARGET))),$(TEST_TARGET))))
+
+test: $(MICROLITE_TEST_TARGETS)
 
-test: test_micro_speech $(MICROLITE_TEST_TARGETS)
+generate_projects: $(ALL_PROJECT_TARGETS)
 
 # Gets rid of all generated files.
 clean:
diff --git a/tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh b/tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh
index 6749858bdb9ffe7942efcc1dc22acb4c6aa6a533..639f002c3bff085052bd5611ed319e05cf50643d 100755
--- a/tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh
+++ b/tensorflow/lite/experimental/micro/tools/make/download_dependencies.sh
@@ -33,6 +33,13 @@ GEMMLOWP_URL="https://github.com/google/gemmlowp/archive/719139ce755a0f31cbf1c37
 FLATBUFFERS_URL="https://github.com/google/flatbuffers/archive/1f5eae5d6a135ff6811724f6c57f911d1f46bb15.tar.gz"
 CMSIS_URL="https://github.com/ARM-software/CMSIS_5/archive/5.4.0.zip"
 STM32_BARE_LIB_URL="https://github.com/google/stm32_bare_lib/archive/c07d611fb0af58450c5a3e0ab4d52b47f99bc82d.zip"
+SIFIVE_FE310_LIB_URL="https://github.com/sifive/freedom-e-sdk/archive/baeeb8fd497a99b3c141d7494309ec2e64f19bdf.zip"
+RISCV_TOOLCHAIN_URL="https://static.dev.sifive.com/dev-tools/riscv64-unknown-elf-gcc-20181030-x86_64-linux-ubuntu14.tar.gz"
+AM_SDK_URL="http://s3.asia.ambiqmicro.com/downloads/AmbiqSuite-Rel2.0.0.zip"
+AP3_URL="https://github.com/AmbiqMicro/TFLiteMicro_Apollo3/archive/dfbcef9a57276c087d95aab7cb234f1d4c9eaaba.zip"
+CUST_CMSIS_URL="https://github.com/AmbiqMicro/TFLiteMicro_CustCMSIS/archive/8f63966c5692e6a3a83956efd2e4aed77c4c9949.zip"
+GCC_EMBEDDED_URL="https://developer.arm.com/-/media/Files/downloads/gnu-rm/7-2018q2/gcc-arm-none-eabi-7-2018-q2-update-linux.tar.bz2"
+KISSFFT_URL="http://downloads.sourceforge.net/project/kissfft/kissfft/v1_3_0/kiss_fft130.zip"
 
 download_and_extract() {
   local usage="Usage: download_and_extract URL DIR"
@@ -42,6 +49,8 @@ download_and_extract() {
   mkdir -p "${dir}"
   if [[ "${url}" == *gz ]]; then
     curl -Ls "${url}" | tar -C "${dir}" --strip-components=1 -xz
+  elif [[ "${url}" == *bz2 ]]; then
+    curl -Ls "${url}" | tar -C "${dir}" --strip-components=1 -xj
   elif [[ "${url}" == *zip ]]; then
     tempdir=$(mktemp -d)
     tempdir2=$(mktemp -d)
@@ -65,9 +74,54 @@ download_and_extract() {
   find "${dir}" -type f -name '*BUILD' -delete
 }
 
+patch_am_sdk() {
+  local am_dir="${1}"
+  if [ ! -f ${am_dir}/VERSION.txt ]; then
+    echo "Could not find ${am_dir}, skipping AmbiqMicro SDK patch";
+    return;
+  fi
+
+  local src_dir=${am_dir}/boards/apollo3_evb/examples/hello_world/gcc
+  local dest_dir=${am_dir}/boards/apollo3_evb/examples/hello_world/gcc_patched
+
+  rm -rf ${dest_dir}
+  mkdir ${dest_dir}
+
+  cp "${src_dir}/startup_gcc.c" "${dest_dir}/startup_gcc.c"
+  cp "${src_dir}/hello_world.ld" "${dest_dir}/apollo3evb.ld"
+
+  sed -i -e '114s/1024/1024\*20/g' "${dest_dir}/startup_gcc.c"
+  sed -i -e 's/main/_main/g' "${dest_dir}/startup_gcc.c"
+
+  sed -i -e '3s/hello_world.ld/apollo3evb.ld/g' "${dest_dir}/apollo3evb.ld"
+  sed -i -e '3s/startup_gnu/startup_gcc/g' "${dest_dir}/apollo3evb.ld"
+  sed -i -e '22s/\*(.text\*)/\*(.text\*)\n\n\t\/\* These are the C++ global constructors.  Stick them all here and\n\t \* then walk through the array in main() calling them all.\n\t \*\/\n\t_init_array_start = .;\n\tKEEP (\*(SORT(.init_array\*)))\n\t_init_array_end = .;\n\n\t\/\* XXX Currently not doing anything for global destructors. \*\/\n/g' "${dest_dir}/apollo3evb.ld"
+  sed -i -e "70s/} > SRAM/} > SRAM\n    \/\* Add this to satisfy reference to symbol 'end' from libnosys.a(sbrk.o)\n     \* to denote the HEAP start.\n     \*\/\n   end = .;/g" "${dest_dir}/apollo3evb.ld"
+
+  echo "Finished preparing Apollo3 files"
+}
+
+patch_kissfft() {
+  sed -i -E "s@#ifdef FIXED_POINT@// Patched automatically by download_dependencies.sh so default is 16 bit.\n#ifndef FIXED_POINT\n#define FIXED_POINT (16)\n#endif\n// End patch.\n\n#ifdef FIXED_POINT@g" tensorflow/lite/experimental/micro/tools/make/downloads/kissfft/kiss_fft.h
+  sed -i -E "s@#define KISS_FFT_MALLOC malloc@#define KISS_FFT_MALLOC(X) (void*)(0) /* Patched. */@g" tensorflow/lite/experimental/micro/tools/make/downloads/kissfft/kiss_fft.h
+  sed -i -E "s@#define KISS_FFT_FREE free@#define KISS_FFT_FREE(X) /* Patched. */@g" tensorflow/lite/experimental/micro/tools/make/downloads/kissfft/kiss_fft.h
+  sed -i -E "s@(fprintf.*\);)@/* \1 */@g" tensorflow/lite/experimental/micro/tools/make/downloads/kissfft/tools/kiss_fftr.c
+  sed -i -E "s@(exit.*\);)@return; /* \1 */@g" tensorflow/lite/experimental/micro/tools/make/downloads/kissfft/tools/kiss_fftr.c
+  echo "Finished patching kissfft"
+}
+
 download_and_extract "${GEMMLOWP_URL}" "${DOWNLOADS_DIR}/gemmlowp"
 download_and_extract "${FLATBUFFERS_URL}" "${DOWNLOADS_DIR}/flatbuffers"
 download_and_extract "${CMSIS_URL}" "${DOWNLOADS_DIR}/cmsis"
 download_and_extract "${STM32_BARE_LIB_URL}" "${DOWNLOADS_DIR}/stm32_bare_lib"
+download_and_extract "${SIFIVE_FE310_LIB_URL}" "${DOWNLOADS_DIR}/sifive_fe310_lib"
+download_and_extract "${RISCV_TOOLCHAIN_URL}" "${DOWNLOADS_DIR}/riscv_toolchain"
+download_and_extract "${AM_SDK_URL}" "${DOWNLOADS_DIR}/AmbiqSuite-Rel2.0.0"
+patch_am_sdk "${DOWNLOADS_DIR}/AmbiqSuite-Rel2.0.0"
+download_and_extract "${AP3_URL}" "${DOWNLOADS_DIR}/apollo3_ext"
+download_and_extract "${CUST_CMSIS_URL}" "${DOWNLOADS_DIR}/CMSIS_ext"
+download_and_extract "${GCC_EMBEDDED_URL}" "${DOWNLOADS_DIR}/gcc_embedded"
+download_and_extract "${KISSFFT_URL}" "${DOWNLOADS_DIR}/kissfft"
+patch_kissfft "${DOWNLOADS_DIR}/kissfft"
 
 echo "download_dependencies.sh completed successfully." >&2
diff --git a/tensorflow/lite/experimental/micro/tools/make/generate_keil_project.py b/tensorflow/lite/experimental/micro/tools/make/generate_keil_project.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb0c632e24b139a5a3e27fadbfb850a53fff531d
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/generate_keil_project.py
@@ -0,0 +1,117 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Generates a Keil uVision project file from a template."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import os.path
+import re
+
+
+def sanitize_xml(unsanitized):
+  """Uses a whitelist to avoid generating bad XML."""
+  return re.sub(r'[^a-zA-Z0-9+_\-/\\.]', '', unsanitized)
+
+
+def main(unused_args, flags):
+  """Generates a Keil project file from a template source."""
+  with open(flags.input_template, 'r') as input_template_file:
+    template_file_text = input_template_file.read()
+
+  template_file_text = re.sub(r'%{EXECUTABLE}%', flags.executable,
+                              template_file_text)
+
+  srcs_list = flags.srcs.split(' ')
+  hdrs_list = flags.hdrs.split(' ')
+  all_srcs_list = srcs_list + hdrs_list
+  all_srcs_list.sort()
+
+  replace_srcs = ''
+  for src in all_srcs_list:
+    if not src:
+      continue
+    ext = os.path.splitext(src)[1]
+    # These extension indexes are used by uVision to keep track of the type
+    # of files. I determined them by experimentation, since the file format
+    # isn't documented.
+    if ext == '.h':
+      ext_index = '5'
+    elif ext == '.c':
+      ext_index = '1'
+    elif ext == '.cc' or ext == '.cpp':
+      ext_index = '8'
+    else:
+      ext_index = '5'
+    basename = sanitize_xml(os.path.basename(src))
+    clean_src = sanitize_xml(src)
+    replace_srcs += '            <File>\n'
+    replace_srcs += '              <FileName>' + basename + '</FileName>\n'
+    replace_srcs += '              <FileType>' + ext_index + '</FileType>\n'
+    replace_srcs += '              <FilePath>' + clean_src + '</FilePath>\n'
+    replace_srcs += '            </File>\n'
+  template_file_text = re.sub(r'%{SRCS}%', replace_srcs, template_file_text)
+
+  include_paths = re.sub(' ', ';', flags.include_paths)
+  template_file_text = re.sub(r'%{INCLUDE_PATHS}%', include_paths,
+                              template_file_text)
+
+  with open(flags.output_file, 'w') as output_file:
+    output_file.write(template_file_text)
+
+
+def parse_args():
+  """Converts the raw arguments into accessible flags."""
+  parser = argparse.ArgumentParser()
+  parser.register('type', 'bool', lambda v: v.lower() == 'true')
+  parser.add_argument(
+      '--input_template',
+      type=str,
+      default='',
+      help='Path to template project file to build from.')
+  parser.add_argument(
+      '--output_file',
+      type=str,
+      default='',
+      help='Path to write the completed project file to.')
+  parser.add_argument(
+      '--executable',
+      type=str,
+      default='',
+      help='Name of the executable the project will build.')
+  parser.add_argument(
+      '--hdrs',
+      type=str,
+      default='',
+      help='Space-separated list of C or C++ source files to compile.')
+  parser.add_argument(
+      '--srcs',
+      type=str,
+      default='',
+      help='Space-separated list of C or C++ header files to include.')
+  parser.add_argument(
+      '--include_paths',
+      type=str,
+      default='',
+      help='Space-separated list of paths to look for header files on.')
+  flags, unparsed = parser.parse_known_args()
+
+  main(unparsed, flags)
+
+
+if __name__ == '__main__':
+  parse_args()
diff --git a/tensorflow/lite/experimental/micro/tools/make/generate_keil_project_test.sh b/tensorflow/lite/experimental/micro/tools/make/generate_keil_project_test.sh
new file mode 100755
index 0000000000000000000000000000000000000000..22b68e4f68360cc28d6dd7e751381709a72892e7
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/generate_keil_project_test.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Bash unit tests for the TensorFlow Lite Micro project generator.
+
+set -e
+
+INPUT_TEMPLATE=${TEST_SRCDIR}/tensorflow/lite/experimental/micro/tools/make/templates/keil_project.uvprojx.tpl
+OUTPUT_FILE=${TEST_TMPDIR}/keil_project.uvprojx
+EXECUTABLE=test_executable
+
+${TEST_SRCDIR}/tensorflow/lite/experimental/micro/tools/make/generate_keil_project \
+  --input_template=${INPUT_TEMPLATE} \
+  --output_file=${OUTPUT_FILE} \
+  --executable=${EXECUTABLE} \
+  --hdrs="foo.h bar.h" \
+  --srcs="foo.c bar.cc some/bad<xml.cc" \
+  --include_paths=". include"
+
+if ! grep -q "${EXECUTABLE}" ${OUTPUT_FILE}; then
+  echo "ERROR: No executable name '${EXECUTABLE}' found in project file '${OUTPUT_FILE}'."
+  exit 1
+fi
+
+if ! grep -q "foo\.h" ${OUTPUT_FILE}; then
+  echo "ERROR: No header 'foo.h' found in project file '${OUTPUT_FILE}'."
+  exit 1
+fi
+
+if ! grep -q "bar\.h" ${OUTPUT_FILE}; then
+  echo "ERROR: No header 'bar.h' found in project file '${OUTPUT_FILE}'."
+  exit 1
+fi
+
+if ! grep -q "foo\.c" ${OUTPUT_FILE}; then
+  echo "ERROR: No source 'foo.c' found in project file '${OUTPUT_FILE}'."
+  exit 1
+fi
+
+if ! grep -q "bar\.cc" ${OUTPUT_FILE}; then
+  echo "ERROR: No source 'bar.cc' found in project file '${OUTPUT_FILE}'."
+  exit 1
+fi
+
+if ! grep -q "some/badxml\.cc" ${OUTPUT_FILE}; then
+  echo "ERROR: No source 'some/badxml.cc' found in project file '${OUTPUT_FILE}'."
+  exit 1
+fi
+
+if ! grep -q "\.;include" ${OUTPUT_FILE}; then
+  echo "ERROR: No include paths '.;include' found in project file '${OUTPUT_FILE}'."
+  exit 1
+fi
+
+echo
+echo "SUCCESS: generate_keil_project test PASSED"
diff --git a/tensorflow/lite/experimental/micro/tools/make/helper_functions.inc b/tensorflow/lite/experimental/micro/tools/make/helper_functions.inc
new file mode 100644
index 0000000000000000000000000000000000000000..89f473b3109816d0326c28b5965f00f167f64476
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/helper_functions.inc
@@ -0,0 +1,134 @@
+
+# Reverses a space-separated list of words.
+reverse = $(if $(1),$(call reverse,$(wordlist 2,$(words $(1)),$(1)))) $(firstword $(1))
+
+# Look for platform or target-specific implementation files to replace reference
+# implementations with, given a tag. These are expected to occur in subfolders
+# of a directory where a reference implementation exists, and have the same
+# interface and header file. For example,
+# tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc
+# defines a module for supplying audio data, but since no platform or OS can be
+# presumed, it just always returns zeroes for its samples. The MacOS-specific
+# tensorflow/lite/experimental/micro/examples/micro_speech/osx/audio_provider.cc
+# has an implementation that relies on CoreAudio, and there are equivalent
+# versions for other operating systems.
+# The specific implementation yielded by the first tag in the list that produces
+# a match is returned, else the reference version if none of the tags produce a
+# match.
+# All lists of source files are put through this substitution process with the
+# tags of their target OS and architecture, so that implementations can be added
+# by simply placing them in the file tree, with no changes to the build files
+# needed.
+# One confusing thing about this implementation is that we're using wildcard to
+# act as a 'does file exist?' function, rather than expanding an expression.
+# Wildcard will return an empty string if given a plain file path with no actual
+# wildcards, if the file doesn't exist, so taking the first word of the list
+# between that and the reference path will pick the specialized one if it's
+# available.
+substitute_specialized_implementation = \
+  $(firstword $(wildcard $(dir $(1))$(2)/$(notdir $(1))) $(wildcard $(1)))
+substitute_specialized_implementations = \
+  $(foreach source,$(1),$(call substitute_specialized_implementation,$(source),$(2)))
+# Here we're first looking for specialized implementations in ref_dir/$(TAG1)
+# and then ref_dir/$(TAG2), etc, before falling back to ref_dir's
+# implementation.
+# The argument to this function should be a list of space-separated file paths,
+# with any wildcards already expanded.
+define specialize_on_tags
+$(if $(2),$(call substitute_specialized_implementations,$(call specialize_on_tags,$(1),$(wordlist 2,$(words $(2)),$(2))),$(firstword $(2))),$(1))
+endef
+# The entry point that most targets should use to find implementation-specific
+# versions of their source files. The only argument is a list of file paths.
+specialize = $(call specialize_on_tags,$(1),$(strip $(call reverse,$(ALL_TAGS))))
+
+# Creates a set of rules to build a standalone makefile project for an
+# executable, including all of the source and header files required in a
+# separate folder and a simple makefile.
+# Arguments are:
+# 1 - Project type (make, mbed, etc).
+# 2 - Project file template name.
+# 3 - Name of executable.
+# 4 - List of C/C++ source files needed to build the target.
+# 5 - List of C/C++ header files needed to build the target.
+# 6 - Linker flags required.
+# 7 - C++ compilation flags needed.
+# 8 - C compilation flags needed.
+# Calling eval on the output will create a <Name>_makefile target that you
+# can invoke to create the standalone project.
+define generate_project
+$(PRJDIR)$(3)/$(1)/%: %
+	@mkdir -p $$(dir $$@)
+	cp $$< $$@
+
+$(PRJDIR)$(3)/$(1)/third_party/%: tensorflow/lite/experimental/micro/tools/make/downloads/%
+	@mkdir -p $$(dir $$@)
+	cp $$< $$@
+
+$(PRJDIR)$(3)/$(1)/%: tensorflow/lite/experimental/micro/tools/make/templates/%.tpl
+	@mkdir -p $$(dir $$@)
+	sed -E 's#\%\{SRCS\}\%#$(4)#g' $$< | \
+	sed -E 's#\%\{EXECUTABLE\}\%#$(3)#g' | \
+	sed -E 's#\%\{LINKER_FLAGS\}\%#$(6)#g' | \
+	sed -E 's#\%\{CXX_FLAGS\}\%#$(7)#g' | \
+	sed -E 's#\%\{CC_FLAGS\}\%#$(8)#g' > $$@
+
+$(PRJDIR)$(3)/$(1)/keil_project.uvprojx: tensorflow/lite/experimental/micro/tools/make/templates/keil_project.uvprojx.tpl
+	@mkdir -p $$(dir $$@)
+	python tensorflow/lite/experimental/micro/tools/make/generate_keil_project.py \
+        --input_template=$$< --output_file=$$@ --executable=$(3) \
+        --srcs="$(4)" --hdrs="$(5)" --include_paths="$$(PROJECT_INCLUDES)"
+
+$(PRJDIR)$(3)/$(1)/.vscode/tasks.json : tensorflow/lite/experimental/micro/tools/make/templates/tasks.json.$(1).tpl
+	@mkdir -p $$(dir $$@)
+	cp $$< $$@
+
+generate_$(3)_$(1)_project: $(addprefix $(PRJDIR)$(3)/$(1)/, $(4) $(5) $(2))
+
+ALL_PROJECT_TARGETS += generate_$(3)_$(1)_project
+endef
+
+# Specialized version of generate_project for TF Lite Micro test targets that
+# automatically includes standard library files, so you just need to pass the
+# test name and any extra source files required.
+# Arguments are:
+# 1 - Name of test.
+# 2 - C/C++ source files implementing the test.
+# 3 - C/C++ header files needed for the test.
+# Calling eval on the output will create targets that you can invoke to
+# generate the standalone project.
+define generate_microlite_projects
+$(call generate_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(LDFLAGS) $(MICROLITE_LIBS),$(CXXFLAGS) $(GENERATED_PROJECT_INCLUDES), $(CCFLAGS) $(GENERATED_PROJECT_INCLUDES))
+$(call generate_project,mbed,$(MBED_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(MICROLITE_LIBS),$(CXXFLAGS),$(CCFLAGS))
+$(call generate_project,keil,$(KEIL_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(MICROLITE_LIBS),$(CXXFLAGS),$(CCFLAGS))
+endef
+
+
+# Handles the details of generating a binary target, including specializing
+# for the current platform, and generating project file targets.
+# Arguments are:
+# 1 - Name of test.
+# 2 - C/C++ source files implementing the test.
+# 3 - C/C++ header files needed for the test.
+# Calling eval on the output will create the targets that you need.
+define microlite_test
+$(1)_LOCAL_SRCS := $(2)
+$(1)_LOCAL_SRCS := $$(call specialize,$$($(1)_LOCAL_SRCS))
+ALL_SRCS += $$($(1)_LOCAL_SRCS)
+$(1)_LOCAL_HDRS := $(3)
+$(1)_LOCAL_OBJS := $$(addprefix $$(OBJDIR), \
+$$(patsubst %.cc,%.o,$$(patsubst %.c,%.o,$$($(1)_LOCAL_SRCS))))
+$(1)_BINARY := $$(BINDIR)$(1)
+$$($(1)_BINARY): $$($(1)_LOCAL_OBJS) $$(MICROLITE_LIB_PATH)
+	@mkdir -p $$(dir $$@)
+	$$(CXX) $$(CXXFLAGS) $$(INCLUDES) \
+	-o $$($(1)_BINARY) $$($(1)_LOCAL_OBJS) \
+	$$(LIBFLAGS) $$(MICROLITE_LIB_PATH) $$(LDFLAGS) $$(MICROLITE_LIBS)
+$(1): $$($(1)_BINARY)
+$(1)_bin: $$($(1)_BINARY).bin
+test_$(1): $$($(1)_BINARY)
+	$$(TEST_SCRIPT) $$($(1)_BINARY) '~~~ALL TESTS PASSED~~~'
+ifneq (,$(findstring _test,$(1)))
+  MICROLITE_TEST_TARGETS += test_$(1)
+endif
+$(eval $(call generate_microlite_projects,$(1),$(call specialize,$(2)),$(3)))
+endef
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb/.gitignore b/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..cb646e29d9ab950e7697b284cc5a87a302397219
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb/.gitignore
@@ -0,0 +1,4 @@
+startup_gcc.c
+am_*.c
+libam*.a
+
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb/_main.c b/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb/_main.c
deleted file mode 100644
index bd238ac55f96dbe62aa16a92180a5995ce395945..0000000000000000000000000000000000000000
--- a/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb/_main.c
+++ /dev/null
@@ -1,123 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <stdint.h>
-#include "am_mcu_apollo.h"              // Defines AM_CMSIS_REGS
-#include "am_bsp.h"
-#include "am_util.h"
-
-//*****************************************************************************
-//
-// The entry point for the application.
-//
-//*****************************************************************************
-extern int main(int argc, char**argv);
-
-void DebugLog(const char* s) { am_util_stdio_printf( "%s", s); }
-void DebugLogInt32(int32_t i) { am_util_stdio_printf( "%d", i); }
-void DebugLogUInt32(uint32_t i) { am_util_stdio_printf( "%d", i); }
-void DebugLogHex(uint32_t i) { am_util_stdio_printf( "0x%8x", i); }
-void DebugLogFloat(float i) { am_util_stdio_printf( "%f", i); }
-
-int _main(void)
-{
-    am_util_id_t sIdDevice;
-    uint32_t ui32StrBuf;
-
-    //
-    // Set the clock frequency.
-    //
-    am_hal_clkgen_control(AM_HAL_CLKGEN_CONTROL_SYSCLK_MAX, 0);
-
-    //
-    // Set the default cache configuration
-    //
-    am_hal_cachectrl_config(&am_hal_cachectrl_defaults);
-    am_hal_cachectrl_enable();
-
-    //
-    // Configure the board for low power operation.
-    //
-    am_bsp_low_power_init();
-
-    //
-    // Initialize the printf interface for UART output
-    //
-    am_bsp_uart_printf_enable();
-
-    //
-    // Print the banner.
-    //
-    am_util_stdio_terminal_clear();
-    am_util_stdio_printf("Hello World!\n\n");
-
-    //
-    // Print the device info.
-    //
-    am_util_id_device(&sIdDevice);
-    am_util_stdio_printf("Vendor Name: %s\n", sIdDevice.pui8VendorName);
-    am_util_stdio_printf("Device type: %s\n", sIdDevice.pui8DeviceName);
-
-
-    am_util_stdio_printf("Qualified: %s\n",
-                         sIdDevice.sMcuCtrlDevice.ui32Qualified ?
-                         "Yes" : "No");
-
-    am_util_stdio_printf("Device Info:\n"
-                         "\tPart number: 0x%08X\n"
-                         "\tChip ID0:    0x%08X\n"
-                         "\tChip ID1:    0x%08X\n"
-                         "\tRevision:    0x%08X (Rev%c%c)\n",
-                         sIdDevice.sMcuCtrlDevice.ui32ChipPN,
-                         sIdDevice.sMcuCtrlDevice.ui32ChipID0,
-                         sIdDevice.sMcuCtrlDevice.ui32ChipID1,
-                         sIdDevice.sMcuCtrlDevice.ui32ChipRev,
-                         sIdDevice.ui8ChipRevMaj, sIdDevice.ui8ChipRevMin );
-
-    //
-    // If not a multiple of 1024 bytes, append a plus sign to the KB.
-    //
-    ui32StrBuf = ( sIdDevice.sMcuCtrlDevice.ui32FlashSize % 1024 ) ? '+' : 0;
-    am_util_stdio_printf("\tFlash size:  %7d (%d KB%s)\n",
-                         sIdDevice.sMcuCtrlDevice.ui32FlashSize,
-                         sIdDevice.sMcuCtrlDevice.ui32FlashSize / 1024,
-                         &ui32StrBuf);
-
-    ui32StrBuf = ( sIdDevice.sMcuCtrlDevice.ui32SRAMSize % 1024 ) ? '+' : 0;
-    am_util_stdio_printf("\tSRAM size:   %7d (%d KB%s)\n\n",
-                         sIdDevice.sMcuCtrlDevice.ui32SRAMSize,
-                         sIdDevice.sMcuCtrlDevice.ui32SRAMSize / 1024,
-                         &ui32StrBuf);
-
-    //
-    // Print the compiler version.
-    //
-    am_util_stdio_printf("App Compiler:    %s\n", COMPILER_VERSION);
-#ifdef AM_PART_APOLLO3
-    am_util_stdio_printf("HAL Compiler:    %s\n", g_ui8HALcompiler);
-    am_util_stdio_printf("HAL SDK version: %d.%d.%d\n",
-                         g_ui32HALversion.s.Major,
-                         g_ui32HALversion.s.Minor,
-                         g_ui32HALversion.s.Revision);
-    am_util_stdio_printf("HAL compiled with %s-style registers\n",
-                         g_ui32HALversion.s.bAMREGS ? "AM_REG" : "CMSIS");
-
-    am_util_stdio_printf("&sIdDevice: 0x%x, &ui32StrBuf: 0x%x\n", &sIdDevice, &ui32StrBuf);
-    am_hal_security_info_t secInfo;
-    char sINFO[32];
-    uint32_t ui32Status;
-#endif // AM_PART_APOLLO3
-    main(0, NULL);
-}
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb/prep_apollo3_files.sh b/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb/prep_apollo3_files.sh
new file mode 100755
index 0000000000000000000000000000000000000000..7ef23095022b24922b28580ce3e8d1c76b81086f
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb/prep_apollo3_files.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+AP3_DIR="tensorflow/lite/experimental/micro/tools/make/downloads/Apollo3-SDK-2018.08.13"
+if [ ! -d $AP3_DIR ]; then
+    echo "Apollo 3 SDK does not exist"
+    echo "Either the SDK has not been downloaded, or this script is not being run from the root of the repository"
+else
+    DEST_DIR="tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb"
+    cp "$AP3_DIR/boards/apollo3_evb/examples/hello_world/gcc/startup_gcc.c" "$DEST_DIR"
+    cp "$AP3_DIR/boards/apollo3_evb/examples/hello_world/gcc/hello_world.ld" "$DEST_DIR/apollo3evb.ld"
+    sed -i -e '131s/1024/1024\*20/g' "$DEST_DIR/startup_gcc.c"
+    sed -i -e 's/main/_main/g' "$DEST_DIR/startup_gcc.c"
+    sed -i -e '3s/hello_world.ld/apollo3evb.ld/g' "$DEST_DIR/apollo3evb.ld"
+    sed -i -e '3s/startup_gnu/startup_gcc/g' "$DEST_DIR/apollo3evb.ld"
+    sed -i -e '6s/am_reset_isr/Reset_Handler/g' "$DEST_DIR/apollo3evb.ld"
+    sed -i -e '22s/\*(.text\*)/\*(.text\*)\n\n\t\/\* These are the C++ global constructors.  Stick them all here and\n\t \* then walk through the array in main() calling them all.\n\t \*\/\n\t_init_array_start = .;\n\tKEEP (\*(SORT(.init_array\*)))\n\t_init_array_end = .;\n\n\t\/\* XXX Currently not doing anything for global destructors. \*\/\n/g' "$DEST_DIR/apollo3evb.ld"
+    sed -i -e "70s/} > SRAM/} > SRAM\n    \/\* Add this to satisfy reference to symbol 'end' from libnosys.a(sbrk.o)\n     \* to denote the HEAP start.\n     \*\/\n   end = .;/g" "$DEST_DIR/apollo3evb.ld"
+    echo "Finished preparing Apollo3 files"
+    
+
+fi
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb_makefile.inc b/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb_makefile.inc
index f722204feaded521945cd269b36576e560dac3e4..fa9dc9c99b82d04d7d28af85325789719da9397a 100644
--- a/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb_makefile.inc
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/apollo3evb_makefile.inc
@@ -1,13 +1,14 @@
 # Settings for apollo3 evb platforms.
 ifeq ($(TARGET), apollo3evb)
+  export PATH := $(MAKEFILE_DIR)/downloads/gcc_embedded/bin/:$(PATH)
   TARGET_ARCH := cortex-m4
   TARGET_TOOLCHAIN_PREFIX := arm-none-eabi-
   # Download the Ambiq Apollo3 SDK and set this variable to find the header
   # files:
-  APOLLO3_SDK := /ssd/ambiq/AmbiqSuite\ SDK\ for\ Apollo3/Apollo3-SDK-2018.08.13/
+  APOLLO3_SDK := $(MAKEFILE_DIR)/downloads/AmbiqSuite-Rel2.0.0
   # Need a pointer to the GNU ARM toolchain for crtbegin.o for the fp functions
-  # with the softfp interfaces.
-  GCC_ARM := /ssd/gnu_arm_toolchain/gcc-arm-none-eabi-7-2018-q2-update/
+  # with the hard interfaces.
+  GCC_ARM := $(MAKEFILE_DIR)/downloads/gcc_embedded/
 
   PLATFORM_FLAGS = \
     -DPART_apollo3 \
@@ -16,6 +17,8 @@ ifeq ($(TARGET), apollo3evb)
     -DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK \
     -DTF_LITE_STATIC_MEMORY \
     -DTF_LITE_MCU_DEBUG_LOG \
+    -D __FPU_PRESENT=1 \
+    -DARM_MATH_CM4 \
     -fno-rtti \
     -fmessage-length=0 \
     -fno-exceptions \
@@ -28,7 +31,7 @@ ifeq ($(TARGET), apollo3evb)
     -mcpu=cortex-m4 \
     -mthumb \
     -mfpu=fpv4-sp-d16 \
-    -mfloat-abi=softfp \
+    -mfloat-abi=hard \
     -std=gnu++11 \
     -Wvla \
     -Wall \
@@ -41,28 +44,29 @@ ifeq ($(TARGET), apollo3evb)
     -fomit-frame-pointer \
     -fpermissive \
     -nostdlib \
-    -g \
-    -Os
+    -ggdb \
+    -O3
   CXXFLAGS += $(PLATFORM_FLAGS)
   CCFLAGS += $(PLATFORM_FLAGS)
   LDFLAGS += \
-    -mthumb -mcpu=cortex-m4 -mfpu=fpv4-sp-d16 -mfloat-abi=softfp \
+    -mthumb -mcpu=cortex-m4 -mfpu=fpv4-sp-d16 -mfloat-abi=hard \
     -nostartfiles -static \
     -Wl,--gc-sections -Wl,--entry,Reset_Handler \
     -Wl,--start-group -lm -lc -lgcc -Wl,--end-group \
     -fno-exceptions \
     -nostdlib --specs=nano.specs -t -lstdc++ -lc -lnosys -lm \
-    -Wl,-T,$(MAKEFILE_DIR)/targets/apollo3evb/apollo3evb.ld \
+    -Wl,-T,$(APOLLO3_SDK)/boards/apollo3_evb/examples/hello_world/gcc_patched/apollo3evb.ld \
     -Wl,-Map=$(MAKEFILE_DIR)/gen/$(TARGET).map,--cref
   BUILD_TYPE := micro
-  # The apollo3evb libs should be copied from the SDK after building them.
   MICROLITE_LIBS := \
-    $(MAKEFILE_DIR)/targets/apollo3evb/libam_bsp.a \
-    $(MAKEFILE_DIR)/targets/apollo3evb/libam_hal.a \
-    $(GCC_ARM)/lib/gcc/arm-none-eabi/7.3.1/thumb/v7e-m/fpv4-sp/softfp/crtbegin.o \
+    $(APOLLO3_SDK)/boards/apollo3_evb/bsp/gcc/bin/libam_bsp.a \
+    $(APOLLO3_SDK)/mcu/apollo3/hal/gcc/bin/libam_hal.a \
+    $(GCC_ARM)/lib/gcc/arm-none-eabi/7.3.1/thumb/v7e-m/fpv4-sp/hard/crtbegin.o \
     -lm
   INCLUDES += \
     -isystem$(MAKEFILE_DIR)/downloads/cmsis/CMSIS/Core/Include/ \
+    -isystem$(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Include/ \
+    -I$(MAKEFILE_DIR)/downloads/CMSIS_ext/ \
     -I$(GCC_ARM)/arm-none-eabi/ \
     -I$(APOLLO3_SDK)/mcu/apollo3/ \
     -I$(APOLLO3_SDK)/CMSIS/AmbiqMicro/Include/ \
@@ -79,26 +83,37 @@ ifeq ($(TARGET), apollo3evb)
   # setting clock speed, default uart setups, etc. and an implementation
   # of the DebugLog interfaces.
   MICROLITE_CC_SRCS += \
-    $(MAKEFILE_DIR)/targets/apollo3evb/startup_gcc.c \
-    $(MAKEFILE_DIR)/targets/apollo3evb/_main.c \
-    $(MAKEFILE_DIR)/targets/apollo3evb/am_util_delay.c \
-    $(MAKEFILE_DIR)/targets/apollo3evb/am_util_faultisr.c \
-    $(MAKEFILE_DIR)/targets/apollo3evb/am_util_id.c \
-    $(MAKEFILE_DIR)/targets/apollo3evb/am_util_stdio.c
+    $(APOLLO3_SDK)/boards/apollo3_evb/examples/hello_world/gcc_patched/startup_gcc.c \
+    $(APOLLO3_SDK)/utils/am_util_delay.c \
+    $(APOLLO3_SDK)/utils/am_util_faultisr.c \
+    $(APOLLO3_SDK)/utils/am_util_id.c \
+    $(APOLLO3_SDK)/utils/am_util_stdio.c
+
+  CMSIS_SRC_DIR := tensorflow/lite/experimental/micro/tools/make/downloads/cmsis/CMSIS/DSP/Source
+  CMSIS_SRCS := \
+  $(CMSIS_SRC_DIR)/BasicMathFunctions/arm_mult_q15.c \
+  $(CMSIS_SRC_DIR)/TransformFunctions/arm_rfft_init_q15.c \
+  $(CMSIS_SRC_DIR)/TransformFunctions/arm_rfft_q15.c \
+  $(CMSIS_SRC_DIR)/TransformFunctions/arm_cfft_q15.c \
+  $(CMSIS_SRC_DIR)/TransformFunctions/arm_cfft_radix4_q15.c \
+  $(CMSIS_SRC_DIR)/CommonTables/arm_const_structs.c \
+  $(CMSIS_SRC_DIR)/CommonTables/arm_common_tables.c \
+  $(CMSIS_SRC_DIR)/StatisticsFunctions/arm_mean_q15.c \
+  $(CMSIS_SRC_DIR)/StatisticsFunctions/arm_max_q7.c
+
+  AP3_EXT_MICRO_DIR := $(MAKEFILE_DIR)/downloads/apollo3_ext
+  AP3_MICRO_DIR := tensorflow/lite/experimental/micro/examples/micro_speech/apollo3
+  CMSIS_DIR := tensorflow/lite/experimental/micro/examples/micro_speech/CMSIS
+  CMSIS_EXT_DIR := $(MAKEFILE_DIR)/downloads/CMSIS_ext
+
+  MICRO_SPEECH_TEST_SRCS += \
+    $(AP3_MICRO_DIR)/_main.c
 
   TEST_SCRIPT := tensorflow/lite/experimental/log_test/test_apollo3evb_binary.sh
-  # These are tests that don't currently work on the blue pill.
+  # These are tests that don't currently work on the Apollo3 board.
   EXCLUDED_TESTS := \
     tensorflow/lite/experimental/micro/micro_interpreter_test.cc \
     tensorflow/lite/experimental/micro/simple_tensor_allocator_test.cc
   MICROLITE_TEST_SRCS := $(filter-out $(EXCLUDED_TESTS), $(MICROLITE_TEST_SRCS))
 
-# These are microcontroller-specific rules for converting the ELF output
-# of the linker into a binary image that can be loaded directly.
-OBJCOPY := $(TARGET_TOOLCHAIN_PREFIX)objcopy
-
-$(BINDIR)/%.bin: $(BINDIR)/%
-	@mkdir -p $(dir $@)
-	$(OBJCOPY) $< $@ -O binary
-
 endif
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/bluepill_makefile.inc b/tensorflow/lite/experimental/micro/tools/make/targets/bluepill_makefile.inc
index 5e3105a109b99b061a35b9c6f6c7c5f3681e2b45..b344f844bca7e7045eafaba141dc5e6371c3f496 100644
--- a/tensorflow/lite/experimental/micro/tools/make/targets/bluepill_makefile.inc
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/bluepill_makefile.inc
@@ -47,7 +47,10 @@ ifeq ($(TARGET), bluepill)
   MICROLITE_CC_SRCS += \
     $(wildcard $(MAKEFILE_DIR)/downloads/stm32_bare_lib/source/*.c) \
     $(wildcard $(MAKEFILE_DIR)/downloads/stm32_bare_lib/source/*.cc)
-    TEST_SCRIPT := tensorflow/lite/experimental/micro/testing/test_bluepill_binary.sh
+  EXCLUDED_SRCS := \
+    $(MAKEFILE_DIR)/downloads/stm32_bare_lib/source/debug_log.c
+  MICROLITE_CC_SRCS := $(filter-out $(EXCLUDED_SRCS), $(MICROLITE_CC_SRCS))
+  TEST_SCRIPT := tensorflow/lite/experimental/micro/testing/test_bluepill_binary.sh
   # These are tests that don't currently work on the blue pill.
   EXCLUDED_TESTS := \
     tensorflow/lite/experimental/micro/micro_interpreter_test.cc \
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/README.md b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3e339fe635d4af2e9e884d0c3bdb56d9d210e9ad
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/README.md
@@ -0,0 +1,9 @@
+Compiling instructions here
+https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/micro
+
+CONTACT INFORMATION:
+
+Contact info@etacompute.com for more information on obtaining the Eta Compute
+SDK and evalution board.
+
+www.etacompute.com
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/_main.c b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/_main.c
new file mode 100644
index 0000000000000000000000000000000000000000..25d3e7c169d5f7419a892d35bd30aa2d9a128160
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/_main.c
@@ -0,0 +1,89 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/* This is file contains the entry point to the application and is called after
+   startup.
+   The GPIOs, Uart and timer are intialized and Tensorflow is invoked with the
+   call to main().
+   Tensorflow will print out if the tests have passed or failed and the
+   execution time is also
+   printed. */
+
+#include <inttypes.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include "eta_bsp.h"
+#include "eta_chip.h"
+#include "eta_csp.h"
+#include "eta_csp_buck.h"
+#include "eta_csp_gpio.h"
+#include "eta_csp_io.h"
+#include "eta_csp_pwr.h"
+#include "eta_csp_rtc.h"
+#include "eta_csp_socctrl.h"
+#include "eta_csp_sys_clock.h"
+#include "eta_csp_timer.h"
+#include "eta_csp_uart.h"
+
+tUart g_sUart0 = {eUartNum0, eUartBaud115200};
+tUart g_sUart1 = {eUartNum1, eUartBaud115200};
+
+int init_main(int);
+void EtaPrintExecutionTime(uint64_t);
+
+//*****************************************************************************
+//
+// The entry point for the application.
+//
+//*****************************************************************************
+extern int main(int argc, char** argv);
+
+int _main(void) {
+  uint64_t time_ms;
+
+  EtaCspInit();      // initialize csp registers
+  EtaCspGpioInit();  // initialize gpios
+  EtaCspUartInit(&g_sUart1, eUartNum0, eUartBaud115200,
+                 eUartFlowControlHardware);  // initialize Uart
+  EtaCspBuckInit(ETA_BSP_VDD_IO_SETTING, eBuckAo600Mv, eBuckM3Frequency60Mhz,
+                 eBuckMemVoltage900Mv);  // set M3 freq
+  EtaCspTimerInitMs();                   // start timer
+  main(0, NULL);  // Call to Tensorflow; this will print if test was successful.
+  time_ms = EtaCspTimerCountGetMs();  // read time
+  EtaPrintExecutionTime(time_ms);     // print execution time
+}
+
+void EtaPrintExecutionTime(uint64_t time_ms) {
+  uint8_t c;
+  int k1;
+  char time_string[] = "00000";
+
+  EtaCspIoPrintf("Execution time (msec) = ");
+  if (time_ms < 100000)  // Convert time to a string
+  {
+    for (k1 = 0; k1 < 5; k1++) {
+      c = time_ms % 10;
+      time_ms = time_ms / 10;
+      time_string[k1] = (char)(0x30 + c);
+    }
+    for (k1 = 4; k1 >= 0; k1--) {  // print out 1 char at a time
+      EtaCspUartPutc(&g_sUart1, time_string[k1]);
+    }
+  } else {
+    EtaCspIoPrintf("Execution time exceeds 100 sec\n");
+  }
+  EtaCspIoPrintf("\n\n");
+}
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/ecm3531.lds b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/ecm3531.lds
new file mode 100644
index 0000000000000000000000000000000000000000..383b7f924408b484c8ee2ada5c4d6ec66edb059a
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/ecm3531.lds
@@ -0,0 +1,85 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+
+/*
+ * linker script for use with ECM3531
+ * All sections must map to 128KBytes of SRAM beginning at 0x10000000
+ *
+ */
+
+ /*
+  * Indicate to the linker the entry point.
+  */
+ENTRY(ResetISR)
+
+/*
+ *   SRAM is at 0x10000000 of length 0x00020000
+ */
+MEMORY
+{
+    SRAM (RWX) : ORIGIN = 0x10000000, LENGTH = 0x00020000
+}
+
+SECTIONS
+{
+    .text :
+    {
+        _text = .;
+        KEEP(*(.vectors))
+        . = ALIGN(0x4);
+        *(.text*)
+        . = ALIGN(0x4);
+        *(.rodata*)
+        . = ALIGN(0x4);
+        _etext = .;
+    } > SRAM= 0
+    .dummy :
+    {
+        . = ALIGN(0x4);
+        _eftext = .;
+    } > SRAM
+    .datax :
+    {
+        _datax = .;
+        KEEP(*(.mainStack))
+        . += 16384;
+        _edatax = .;
+        _stack_top = .;
+        . += 4;
+    } > SRAM
+    .data :
+       AT (ADDR(.text) + SIZEOF(.text) ) 
+    {
+        _data = .;
+        *(.data*)
+        KEEP(*(.mainHeap))
+        _edata = .;
+    } > SRAM
+
+    .bss :
+    {
+        _bss = .;
+        *(.bss*)
+        *(COMMON)
+        _ebss = .;
+    } > SRAM
+    .ARM.exidx :
+    {
+       *(.ARM.exidx*)
+    }
+
+}
+
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/ecm3531_flash.lds b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/ecm3531_flash.lds
new file mode 100644
index 0000000000000000000000000000000000000000..9cbbea3569ba05b8fc9269ff6c5500fb386c03a3
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/ecm3531_flash.lds
@@ -0,0 +1,85 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+
+/*
+ * linker script for use with ECM3531 chip.
+ * .text  and .ro map to FLASH all else to SRAM.
+ *
+ */
+
+ /*
+  * Indicate to the linker the entry point.
+  */
+ENTRY(ResetISR)
+
+/*
+ *   FLASH is at 0x01000000 of length 0x00080000  512KB
+ *   SRAM  is at 0x10000000 of length 0x00020000  128KB
+ */
+MEMORY
+{
+    FLASH (RX) : ORIGIN = 0x01000000, LENGTH = 0x00080000
+    SRAM (RWX) : ORIGIN = 0x10000000, LENGTH = 0x00020000
+}
+
+SECTIONS
+{
+    .text :
+    {
+        _text = .;
+        KEEP(*(.vectors))
+        . = ALIGN(0x4);
+        *(.text*)
+        . = ALIGN(0x4);
+        *(.rodata*)
+        . = ALIGN(0x4);
+        _etext = .;
+    } > FLASH= 0
+    .dummy :
+    {
+        . = ALIGN(0x4);
+        _eftext = .;
+    } > FLASH
+/* put the stack at the bottom of SRAM*/
+    .datax (NOLOAD) :
+    {
+        _datax = .;
+        KEEP(*(.mainStack))
+        . = ALIGN(0x4);
+        . += 16384;
+        _edatax = .;
+        _stack_top = .;
+    } > SRAM
+    .data :
+    {
+        _data = .;
+        *(.data*)
+        KEEP(*(.mainHeap))
+        _edata = .;
+    } > SRAM AT > FLASH
+
+    .bss (NOLOAD) :
+    {
+        _bss = .;
+        *(.bss*)
+        *(COMMON)
+        _ebss = .;
+    } > SRAM
+
+
+
+}
+
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/flash_erase b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/flash_erase
new file mode 100755
index 0000000000000000000000000000000000000000..5395b3d9965e98572fb12d61d7b862f4ce926a0f
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/flash_erase
@@ -0,0 +1,47 @@
+#!/usr/bin/python3
+#Usage: cd to the directory  tensorflow/lite/experimental/micro/tools/make/targets/ecm3531 and type ./flash_erase to erase the flash.
+#
+#
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+#==============================================================================
+
+
+import os
+import telnetlib
+
+def send_ocd_cmd(line):
+    ocd_sock.write(bytes(line,encoding = 'utf-8'))
+    print(ocd_sock.read_until(b'> ').decode('utf-8'), end='')
+
+def get_ocd_response():
+    print(ocd_sock.read_until(b'> ').decode('utf-8'), end='')
+
+#get hooked up to openocd daemon
+ocd_sock = telnetlib.Telnet(host='localhost', port=4444)
+get_ocd_response() # clean it out
+
+#ocd comand
+ocd_commands = ["halt\n",
+                "flash erase_sector 0 0 127\n",
+                "mww 0x1001fff8 0\n",
+                "mdw 0x01000000 16\n",
+                "reset\n"]
+
+# OK now do what we came here for!!!
+for x in ocd_commands: 
+    print(x)
+    send_ocd_cmd(x)
+
+
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/flash_program b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/flash_program
new file mode 100755
index 0000000000000000000000000000000000000000..bc3fe5cb21aa2a89d8dda41a68185f03e43c674e
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/flash_program
@@ -0,0 +1,53 @@
+#!/usr/bin/python3
+#Usage: cd to the directory  tensorflow/lite/experimental/micro/tools/make/targets/ecm3531 and type ./flash_program executable_name to load an executable from the directory tensorflow/lite/experimental/micro/tools/make/gen/ecm3531_cortex-m3/bin/ into flash
+#
+#
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+#==============================================================================
+
+
+import sys, getopt
+import os
+import telnetlib
+
+def send_ocd_cmd(line):
+    ocd_sock.write(bytes(line,encoding = 'utf-8'))
+    print(ocd_sock.read_until(b'> ').decode('utf-8'), end='')
+
+def get_ocd_response():
+    print(ocd_sock.read_until(b'> ').decode('utf-8'), end='')
+
+#get hooked up to openocd daemon
+ocd_sock = telnetlib.Telnet(host='localhost', port=4444)
+get_ocd_response() # clean it out
+
+# git path to project elf file
+cur_dir = os.getcwd()
+#elf_file = cur_dir + '/../../gen/ecm3531_cortex-m3/bin/' + 'micro_speech'
+elf_file = cur_dir + '/../../gen/ecm3531_cortex-m3/bin/' + sys.argv[1]
+print("elf_file = ",elf_file)
+
+
+# use these to download and run the elf fle
+ocd_commands = ["halt\n",
+                "flash erase_sector 0 0 127\n",
+                "flash write_image {}\n".format(elf_file),
+                "mww 0x1001fff8 0\n",
+                "reset\n"]
+
+# OK now do what we came here for!!!
+for x in ocd_commands:
+    print(x)
+    send_ocd_cmd(x)
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/load_program b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/load_program
new file mode 100755
index 0000000000000000000000000000000000000000..781231480aa2f1dec18cc468e1ea0129604c71e7
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/load_program
@@ -0,0 +1,55 @@
+#!/usr/bin/python3
+#Usage: cd to the directory  tensorflow/lite/experimental/micro/tools/make/targets/ecm3531 and type ./load_prgram executable_name to load an executable from the directory tensorflow/lite/experimental/micro/tools/make/gen/ecm3531_cortex-m3/bin/
+#
+#
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+#==============================================================================
+
+
+import sys, getopt
+import os
+import telnetlib
+
+def send_ocd_cmd(line):
+    ocd_sock.write(bytes(line,encoding = 'utf-8'))
+    print(ocd_sock.read_until(b'> ').decode('utf-8'), end='')
+
+def get_ocd_response():
+    print(ocd_sock.read_until(b'> ').decode('utf-8'), end='')
+
+#get hooked up to openocd daemon
+ocd_sock = telnetlib.Telnet(host='localhost', port=4444)
+get_ocd_response() # clean it out
+
+# git path to project elf file
+cur_dir = os.getcwd()
+#elf_file = cur_dir + '/../../gen/ecm3531_cortex-m3/bin/' + 'preprocessor_test'
+elf_file = cur_dir + '/../../gen/ecm3531_cortex-m3/bin/' + sys.argv[1]
+print("elf_file = ",elf_file)
+
+
+# use these to download and run the elf fle
+ocd_commands = ["halt\n",
+                "load_image {}\n".format(elf_file),
+                "mww 0x1001FFF8 0xDEADBEEF\n",
+                "mww 0x1001FFFC 0xC369A517\n",
+                "reset\n"]
+
+# OK now do what we came here for!!!
+for x in ocd_commands: 
+    print(x)
+    send_ocd_cmd(x)
+
+
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/startup.c b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/startup.c
new file mode 100644
index 0000000000000000000000000000000000000000..32d817ba4882f9123a9ed6321f9339355d82db5c
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531/startup.c
@@ -0,0 +1,432 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/* This file is called at power up time to initialize the chip.  It in turn
+calls _main() which is the entry point into the application */
+
+#include <stdint.h>
+#include "eta_chip.h"
+#include "memio.h"
+
+#ifndef NULL
+#define NULL (0)
+#endif
+
+//*****************************************************************************
+//
+// Macro for hardware access, both direct and via the bit-band region.
+//
+//*****************************************************************************
+
+int _main(int argc, char *argv[]);
+void set_vtor(void);
+void *startup_get_my_pc(void);
+
+//*****************************************************************************
+// Forward DECLS for interrupt service routines (ISR)
+//*****************************************************************************
+extern void ResetISR(void) __attribute__((weak, alias("default_ResetISR")));
+extern void NmiSR(void) __attribute__((weak, alias("default_NmiSR")));
+extern void FaultISR(void) __attribute__((weak, alias("default_FaultISR")));
+
+extern void DebugMonitor_ISR(void)
+    __attribute__((weak, alias("default_DebugMonitor_ISR")));
+extern void SVCall_ISR(void) __attribute__((weak, alias("default_SVCall_ISR")));
+extern void PENDSV_ISR(void) __attribute__((weak, alias("default_PENDSV_ISR")));
+
+extern void SYSTICK_ISR(void)
+    __attribute__((weak, alias("default_SYSTICK_ISR")));
+
+extern void GPIO0_ISR(void) __attribute__((weak, alias("default_GPIO0_ISR")));
+extern void GPIO1_ISR(void) __attribute__((weak, alias("default_GPIO1_ISR")));
+extern void TIMER0_ISR(void) __attribute__((weak, alias("default_TIMER0_ISR")));
+extern void TIMER1_ISR(void) __attribute__((weak, alias("default_TIMER1_ISR")));
+extern void UART0_ISR(void) __attribute__((weak, alias("default_UART0_ISR")));
+extern void UART1_ISR(void) __attribute__((weak, alias("default_UART1_ISR")));
+extern void SPI0_ISR(void) __attribute__((weak, alias("default_SPI0_ISR")));
+extern void SPI1_ISR(void) __attribute__((weak, alias("default_SPI1_ISR")));
+extern void I2C0_ISR(void) __attribute__((weak, alias("default_I2C0_ISR")));
+extern void I2C1_ISR(void) __attribute__((weak, alias("default_I2C1_ISR")));
+extern void RTC0_ISR(void) __attribute__((weak, alias("default_RTC0_ISR")));
+extern void RTC1_ISR(void) __attribute__((weak, alias("default_RTC1_ISR")));
+extern void DSP_ISR(void) __attribute__((weak, alias("default_DSP_ISR")));
+extern void ADC_ISR(void) __attribute__((weak, alias("default_ADC_ISR")));
+extern void SW0_ISR(void) __attribute__((weak, alias("default_SW0_ISR")));
+extern void SW1_ISR(void) __attribute__((weak, alias("default_SW1_ISR")));
+extern void PWM_ISR(void) __attribute__((weak, alias("default_PWM_ISR")));
+extern void WDT_ISR(void) __attribute__((weak, alias("default_WDT_ISR")));
+extern void RTC_TMR_ISR(void)
+    __attribute__((weak, alias("default_RTC_TMR_ISR")));
+
+extern void SW2_ISR(void) __attribute__((weak, alias("default_SW1_ISR")));
+extern void SW3_ISR(void) __attribute__((weak, alias("default_SW1_ISR")));
+extern void SW4_ISR(void) __attribute__((weak, alias("default_SW1_ISR")));
+extern void SW5_ISR(void) __attribute__((weak, alias("default_SW1_ISR")));
+extern void SW6_ISR(void) __attribute__((weak, alias("default_SW1_ISR")));
+
+extern void IntDefaultHandler(void) __attribute__((weak));
+
+//*****************************************************************************
+//
+// Reserve space for the system stack.
+//
+//*****************************************************************************
+extern uint32_t _stack_top;
+//__attribute__ ((section(".mainStack"), used))
+// static uint32_t pui32Stack[2048];
+#define STARTUP_STACK_TOP (&_stack_top)
+
+//*****************************************************************************
+// VECTOR TABLE
+//*****************************************************************************
+__attribute__((section(".vectors"), used)) void (*const gVectors[])(void) = {
+    //(void (*)(void))((uint32_t)pui32Stack + sizeof(pui32Stack)), // Stack
+    // pointer
+    (void *)STARTUP_STACK_TOP,
+    ResetISR,           // Reset handler
+    NmiSR,              // The NMI handler
+    FaultISR,           // The hard fault handler
+    IntDefaultHandler,  // 4 The MPU fault handler
+    IntDefaultHandler,  // 5 The bus fault handler
+    IntDefaultHandler,  // 6 The usage fault handler
+    0,                  // 7 Reserved
+    0,                  // 8 Reserved
+    0,                  // 9 Reserved
+    0,                  // 10 Reserved
+    SVCall_ISR,         // 11 SVCall handler
+    DebugMonitor_ISR,   // 12 Debug monitor handler
+    0,                  // 13 Reserved
+    PENDSV_ISR,         // 14 The PendSV handler
+    SYSTICK_ISR,        // 15 The SysTick handler
+
+    // external interrupt service routines (ISR)
+    GPIO0_ISR,    // 16 GPIO Port A            [ 0]
+    GPIO1_ISR,    // 17 GPIO Port B            [ 1]
+    TIMER0_ISR,   // 18 Timer 0                [ 2]
+    TIMER1_ISR,   // 19 Timer 1                [ 3]
+    UART0_ISR,    // 20 UART 0                 [ 4]
+    UART1_ISR,    // 21 UART 1                 [ 5]
+    SPI0_ISR,     // 22 SPI0                   [ 6]
+    SPI1_ISR,     // 23 SPI1                   [ 7]
+    I2C0_ISR,     // 24 I2C 0                  [ 8]
+    I2C1_ISR,     // 25 I2C 1                  [ 9]
+    RTC0_ISR,     // 26 RTC 0                  [10]
+    RTC1_ISR,     // 27 RTC 1                  [11]
+    DSP_ISR,      // 28 DSP MAILBOX            [12]
+    ADC_ISR,      // 29 ADC                    [13]
+    PWM_ISR,      // 32 PWM                    [14]
+    WDT_ISR,      // 33 WDT                    [15]
+    RTC_TMR_ISR,  // 34 RTC                    [16]
+
+    SW0_ISR,  // 30 Software Interrupt 0   [17]
+    SW1_ISR,  // 31 Software Interrupt 1   [18]
+    SW2_ISR,  // 35 Software Interrupt 2   [19]
+    SW3_ISR,  // 36 Software Interrupt 3   [20]
+    SW4_ISR,  // 37 Software Interrupt 4   [21]
+    SW5_ISR,  // 38 Software Interrupt 5   [22]
+    SW6_ISR,  // 39 Software Interrupt 6   [23]
+
+};
+
+//*****************************************************************************
+//
+// The following are constructs created by the linker, indicating where the
+// the "data" and "bss" segments reside in memory.  The initializers for the
+// for the "data" segment resides immediately following the "text" segment.
+//
+//*****************************************************************************
+extern uint32_t _etext;
+extern uint32_t _eftext;
+extern uint32_t _data;
+extern uint32_t _edata;
+extern uint32_t _bss;
+extern uint32_t _ebss;
+
+//
+// And here are the weak interrupt handlers.
+//
+void default_NmiSR(void) {
+  __asm("    movs     r0, #2");
+  while (1) {
+  }
+}
+
+void default_FaultISR(void) {
+  __asm("    movs     r0, #3");
+  MEMIO32(0x1001FFF0) = 0xbad0beef;  // near the top of 128KB of SRAM
+  MEMIO32(0x1001FFF4) = 0xbad1beef;  // near the top of 128KB of SRAM
+  while (1) {
+    __asm("    BKPT      #1");
+  }
+}
+
+void IntDefaultHandler(void) {
+  __asm("    movs     r0, #20");
+  while (1) {
+    __asm("    BKPT      #1");
+  }
+}
+
+void default_SVCall_ISR(void) {
+  __asm("    movs     r0, #11");
+  while (1) {
+    __asm("    BKPT      #11");
+  }
+}
+
+void default_DebugMonitor_ISR(void) {
+  __asm("    movs     r0, #12");
+  while (1) {
+    __asm("    BKPT      #12");
+  }
+}
+
+void default_PENDSV_ISR(void) {
+  __asm("    movs     r0, #14");
+  while (1) {
+    __asm("    BKPT      #14");
+  }
+}
+
+void default_SYSTICK_ISR(void) {
+  __asm("    movs     r0, #15");
+  while (1) {
+    __asm("    BKPT      #15");
+  }
+}
+
+//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+void default_SPI0_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_SPI1_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_I2C0_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_I2C1_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_UART0_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_UART1_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_GPIO0_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_GPIO1_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_ADC_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_DSP_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_TIMER0_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_TIMER1_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_RTC0_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_RTC1_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_PWM_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_WDT_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_RTC_TMR_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_SW0_ISR(void) {
+  __asm("    movs     r0, #16");
+  while (1) {
+    __asm("    BKPT      #16");
+  }
+}
+
+void default_SW1_ISR(void) {
+  __asm("    movs     r0, #17");
+  while (1) {
+    __asm("    BKPT      #17");
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Reset ISR
+////////////////////////////////////////////////////////////////////////////////
+void default_ResetISR(void) {
+  int rc;
+  bool bRunningInFlash;
+
+  set_vtor();
+
+  bRunningInFlash =
+      ((((uint32_t)startup_get_my_pc()) & 0xFF000000) == 0x01000000);
+
+  if ((!REG_RTC_AO_CSR.BF.WARM_START_MODE) || bRunningInFlash) {
+    //
+    //  Copy any .ro bytes to .data so that initialized global variables
+    //  are actually properly initialized.
+    //
+    __asm(
+        "    ldr      r0, =_eftext\n"
+        "    ldr      r1, =_data\n"
+        "    ldr      r2, =_edata\n"
+        "ro_copy_loop:\n"
+        "    ldr      r3, [r0], #4\n"
+        "    str      r3, [r1], #4\n"
+        "    cmp      r1, r2\n"
+        "    ble      ro_copy_loop\n");
+
+    //
+    // Zero fill the .bss section.
+    //
+    __asm(
+        "    ldr      r0, =_bss\n"
+        "    ldr      r1, =_ebss\n"
+        "    mov      r2, #0\n"
+        "bss_zero_loop:\n"
+        "    cmp      r0, r1\n"
+        "    it       lt\n"
+        "    strlt    r2, [r0], #4\n"
+        "    blt      bss_zero_loop\n");
+  }
+
+  //
+  // call the main routine barefoot, i.e. without the normal CRTC0 entry
+  // point.
+  //
+  rc = _main(0, NULL);
+
+  //
+  //  If main ever returns, trap it here and wake up the debugger if it is
+  //  connected.
+  //
+  while (1)  // for FPGA/real chip use
+  {
+    __asm("    BKPT      #1");
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// get my PC
+////////////////////////////////////////////////////////////////////////////////
+void *startup_get_my_pc(void) {
+  void *pc;
+  asm("mov %0, pc" : "=r"(pc));
+  return pc;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// get my SP
+////////////////////////////////////////////////////////////////////////////////
+void *startup_get_my_sp(void) {
+  void *sp;
+  asm("mov %0, sp" : "=r"(sp));
+  return sp;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Set VTOR based on PC
+////////////////////////////////////////////////////////////////////////////////
+void set_vtor(void) {
+  __asm(
+      "    ldr      r0, =0xe000ed08\n"
+      "    ldr      r1, =0xFF000000\n"
+      "    mov      r2, lr\n"
+      "    and      r1, r2\n"
+      "    str      r1, [r0]\n");
+
+  return;
+}
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531_makefile.inc b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531_makefile.inc
new file mode 100644
index 0000000000000000000000000000000000000000..897a2b66d21668c4a28573e9d068b865c8f008a8
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/ecm3531_makefile.inc
@@ -0,0 +1,116 @@
+# Settings for eta ecm3531 platform
+ifeq ($(TARGET), ecm3531)
+  TARGET_ARCH := cortex-m3
+  TARGET_TOOLCHAIN_PREFIX := arm-none-eabi-
+  ETA_SDK := /home/hari/TensaiSDK-v0.0.17/soc/
+  GCC_ARM := /home/hari/Downloads/gcc-arm-none-eabi-7-2018-q2-update/
+
+#Pick the appropriate lds file depending whether you are running frof SRAM of flash
+  ETA_LDS_FILE := ecm3531.lds
+#  ETA_LDS_FILE := ecm3531_flash.lds
+
+  ifeq ($(wildcard $(ETA_SDK)),)
+    $(error Path to ETA SDK is not set (ETA_SDK))
+  endif
+
+  ifeq ($(wildcard $(GCC_ARM)),)
+    $(error Path to gcc arm compiler is not set (GCC_ARM))
+  endif
+
+  PLATFORM_FLAGS = \
+    -DARM_MATH_CM3 \
+    -DFIRMWARE_BUILD \
+    -DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK \
+    -DTF_LITE_STATIC_MEMORY \
+    -DTF_LITE_MCU_DEBUG_LOG \
+    -fno-rtti \
+    -fmessage-length=0 \
+    -fno-exceptions \
+    -fno-unwind-tables \
+    -fno-builtin \
+    -ffunction-sections \
+    -fdata-sections \
+    -funsigned-char \
+    -MMD \
+    -mcpu=cortex-m3 \
+    -mthumb \
+    -mlittle-endian \
+    -mno-unaligned-access \
+    -std=gnu++11 \
+    -Wvla \
+    -Wall \
+    -Wextra \
+    -Wno-unused-parameter \
+    -Wno-missing-field-initializers \
+    -Wno-write-strings \
+    -Wno-sign-compare \
+    -fno-delete-null-pointer-checks \
+    -fomit-frame-pointer \
+    -fpermissive \
+    -nostdlib \
+    -g \
+    -Os
+  CXXFLAGS += $(PLATFORM_FLAGS)
+  CCFLAGS += $(PLATFORM_FLAGS)
+# Adding the --specs=nano.specs flag causes the linker to use libc_nano.a
+# instead of libc.a.  This gets rid of lots of errors with various pieces
+# of the exception unwinding code not being found.  Not clear why it is
+# trying to link in this code to begin with, though.
+  LDFLAGS += \
+    -mthumb -mcpu=cortex-m3 \
+    -nostartfiles -static \
+    -Wl,--gc-sections -Wl,--entry,ResetISR \
+    -Wl,--start-group -lm -lc -lgcc -Wl,--end-group \
+    -fno-exceptions \
+    -nostdlib --specs=nano.specs -t -lstdc++ -lc -lnosys -lm \
+    -Wl,-T,$(MAKEFILE_DIR)/targets/ecm3531/$(ETA_LDS_FILE) \
+    -Wl,-Map=$(MAKEFILE_DIR)/targets/ecm3531/ecm3531.map,--cref
+  BUILD_TYPE := micro
+  MICROLITE_LIBS := \
+    $(GCC_ARM)/lib/gcc/arm-none-eabi/7.3.1/thumb/v7e-m/fpv4-sp/softfp/crtbegin.o \
+    -lm
+  ECM3531_INCLUDES := \
+    -I$(GCC_ARM)/arm-none-eabi/include/ \
+    -I$(ETA_SDK)/ecm3531/boards/eta_evb/projects/m3/common/inc/ \
+    -I$(ETA_SDK)/ecm3531/m3/reg/inc/ \
+    -I$(ETA_SDK)/ecm3531/m3/csp/inc/ \
+    -I$(ETA_SDK)/ecm3531/common/csp/inc/ \
+    -I$(ETA_SDK)/common/inc/  \
+    -I$(ETA_SDK)/../utils/inc/  \
+    -I$(ETA_SDK)/ecm3531/boards/eta_evb/eta_bsp/inc
+
+  INCLUDES += $(ECM3531_INCLUDES)
+  GENERATED_PROJECT_INCLUDES += $(ECM3531_INCLUDES)
+
+  # _main.c contains application and target specific initialization, like
+  # setting clock speed, default uart setups, etc. and an implementation
+  # of the DebugLog interfaces.
+  MICROLITE_CC_SRCS += \
+    $(MAKEFILE_DIR)/targets/ecm3531/startup.c \
+    $(MAKEFILE_DIR)/targets/ecm3531/_main.c \
+    $(wildcard $(ETA_SDK)/ecm3531/boards/eta_evb/projects/m3/common/src/*.c) \
+    $(wildcard $(ETA_SDK)/ecm3531/m3/csp/src/*.c) \
+    $(wildcard $(ETA_SDK)/ecm3531/m3/csp/src/*.s)
+
+  # The linker script isn't a header, but it needs to get copied to the gen/
+  # directory for generated projects.  This is similar to the behavior needed
+  # for headers.
+  MICROLITE_CC_HDRS += \
+    $(MAKEFILE_DIR)/targets/ecm3531/$(ETA_LDS_FILE)
+
+  TEST_SCRIPT := tensorflow/lite/experimental/micro/testing/test_ecm3531_binary.sh
+  # These are tests that don't currently work on the blue pill.
+  EXCLUDED_TESTS := \
+    tensorflow/lite/experimental/micro/micro_interpreter_test.cc \
+    tensorflow/lite/experimental/micro/simple_tensor_allocator_test.cc
+  MICROLITE_TEST_SRCS := $(filter-out $(EXCLUDED_TESTS), $(MICROLITE_TEST_SRCS))
+
+# These are microcontroller-specific rules for converting the ELF output
+# of the linker into a binary image that can be loaded directly.
+OBJCOPY := $(TARGET_TOOLCHAIN_PREFIX)objcopy
+
+$(BINDIR)/%.bin: $(BINDIR)/%
+	@mkdir -p $(dir $@)
+	$(OBJCOPY) $< $@ -O binary
+
+endif
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/mbed_makefile.inc b/tensorflow/lite/experimental/micro/tools/make/targets/mbed_makefile.inc
new file mode 100644
index 0000000000000000000000000000000000000000..161ff34cdbda07768d33b9af45ed9655665b9bfd
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/mbed_makefile.inc
@@ -0,0 +1,4 @@
+# Settings for mbed platforms.
+ifeq ($(TARGET), mbed)
+  TARGET_ARCH := cortex-m4
+endif
\ No newline at end of file
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/mcu_riscv_makefile.inc b/tensorflow/lite/experimental/micro/tools/make/targets/mcu_riscv_makefile.inc
new file mode 100644
index 0000000000000000000000000000000000000000..a259f68a3e0759baff04105cc6776212b49e2755
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/mcu_riscv_makefile.inc
@@ -0,0 +1,76 @@
+# Settings for RISCV 32-bit MCU toolchain.
+ifeq ($(TARGET), riscv32_mcu)
+  TARGET_ARCH := riscv32_mcu
+  TARGET_TOOLCHAIN_PREFIX := riscv64-unknown-elf-
+
+  PLATFORM_FLAGS = \
+    -march=rv32imac \
+    -mabi=ilp32 \
+    -mcmodel=medany \
+    -mexplicit-relocs \
+    -fno-builtin-printf \
+    -fno-exceptions \
+    -DTF_LITE_MCU_DEBUG_LOG \
+    -DTF_LITE_USE_GLOBAL_ROUND \
+    -fno-unwind-tables \
+    -fno-builtin \
+    -ffunction-sections \
+    -fdata-sections \
+    -funsigned-char \
+    -Wvla \
+    -Wall \
+    -Wextra \
+    -Wno-unused-parameter \
+    -Wno-missing-field-initializers \
+    -Wno-write-strings \
+    -Wno-sign-compare \
+    -fno-delete-null-pointer-checks \
+    -fomit-frame-pointer \
+    -Os
+
+  CXXFLAGS += $(PLATFORM_FLAGS) \
+    -fpermissive \
+    -fno-rtti \
+    --std=gnu++11
+
+  CCFLAGS += $(PLATFORM_FLAGS)
+
+  BUILD_TYPE := micro
+
+  INCLUDES += \
+    -I$(MAKEFILE_DIR)/downloads/sifive_fe310_lib/bsp/include \
+    -I$(MAKEFILE_DIR)/downloads/sifive_fe310_lib/bsp/drivers/ \
+    -I$(MAKEFILE_DIR)/downloads/sifive_fe310_lib/bsp/env \
+    -I$(MAKEFILE_DIR)/downloads/sifive_fe310_lib/bsp/env/freedom-e300-hifive1
+
+  MICROLITE_CC_SRCS += \
+    $(wildcard tensorflow/lite/experimental/micro/riscv32_mcu/*.cc)
+  MICRO_SPEECH_TEST_SRCS += \
+    $(wildcard $(MAKEFILE_DIR)/downloads/sifive_fe310_lib/bsp/libwrap/sys/*.c) \
+    $(wildcard $(MAKEFILE_DIR)/downloads/sifive_fe310_lib/bsp/libwrap/sys/*.cc) \
+    $(MAKEFILE_DIR)/downloads/sifive_fe310_lib/bsp/libwrap/misc/write_hex.c \
+    $(MAKEFILE_DIR)/downloads/sifive_fe310_lib/bsp/libwrap/stdlib/malloc.c \
+    $(MAKEFILE_DIR)/downloads/sifive_fe310_lib/bsp/env/start.S \
+    $(MAKEFILE_DIR)/downloads/sifive_fe310_lib/bsp/env/entry.S \
+    $(MAKEFILE_DIR)/downloads/sifive_fe310_lib/bsp/env/freedom-e300-hifive1/init.c
+  LIBWRAP_SYMS := malloc free \
+                  open lseek read write fstat stat close link unlink \
+                  execve fork getpid kill wait \
+                  isatty times sbrk _exit puts
+
+  LDFLAGS += $(foreach s,$(LIBWRAP_SYMS),-Wl,--wrap=$(s))
+  LDFLAGS += $(foreach s,$(LIBWRAP_SYMS),-Wl,--wrap=_$(s))
+  LDFLAGS += -L. -Wl,--start-group -lc -Wl,--end-group
+  LDFLAGS += \
+   -T$(MAKEFILE_DIR)/downloads/sifive_fe310_lib/bsp/env/freedom-e300-hifive1/flash.lds \
+   -nostartfiles \
+   -L$(MAKEFILE_DIR)/downloads/sifive_fe310_lib/bsp/env \
+   --specs=nano.specs
+
+# These are microcontroller-specific rules for converting the ELF output
+# of the linker into a binary image that can be loaded directly.
+  OBJCOPY := $(TARGET_TOOLCHAIN_PREFIX)objcopy
+  $(BINDIR)/%.bin: $(BINDIR)/%
+		@mkdir -p $(dir $@)
+		$(OBJCOPY) $< $@ -O binary
+endif
\ No newline at end of file
diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/osx_makefile.inc b/tensorflow/lite/experimental/micro/tools/make/targets/osx_makefile.inc
new file mode 100644
index 0000000000000000000000000000000000000000..3b91eeff9fd5f2df06caa9a5f73b221815f9bbdf
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/targets/osx_makefile.inc
@@ -0,0 +1,10 @@
+# Settings for Mac OS platforms.
+ifeq ($(TARGET), osx)
+
+  PLATFORM_FLAGS = \
+    -DTF_LITE_DISABLE_X86_NEON
+
+  CXXFLAGS += $(PLATFORM_FLAGS)
+  CCFLAGS += $(PLATFORM_FLAGS)
+
+endif
\ No newline at end of file
diff --git a/tensorflow/lite/experimental/micro/tools/make/templates/AUDIO_DISCO_F746NG.lib.tpl b/tensorflow/lite/experimental/micro/tools/make/templates/AUDIO_DISCO_F746NG.lib.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..11dae1ea16c4ac990af07aebd8b5e59ff748fc2d
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/templates/AUDIO_DISCO_F746NG.lib.tpl
@@ -0,0 +1 @@
+https://os.mbed.com/teams/ST/code/AUDIO_DISCO_F746NG/#7046ce26b7ed
diff --git a/tensorflow/lite/experimental/micro/tools/make/templates/BSP_DISCO_F746NG.lib.tpl b/tensorflow/lite/experimental/micro/tools/make/templates/BSP_DISCO_F746NG.lib.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..48dc1317072d537b3c61b0481b272855eb5941be
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/templates/BSP_DISCO_F746NG.lib.tpl
@@ -0,0 +1 @@
+https://os.mbed.com/teams/ST/code/BSP_DISCO_F746NG/#df2ea349c37a
diff --git a/tensorflow/lite/experimental/micro/tools/make/templates/Makefile.tpl b/tensorflow/lite/experimental/micro/tools/make/templates/Makefile.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..ca6519c1390b5b783e4b6f26cac40a6b7ef32f46
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/templates/Makefile.tpl
@@ -0,0 +1,21 @@
+SRCS := \
+%{SRCS}%
+
+OBJS := \
+$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(SRCS)))
+
+CXXFLAGS += %{CXX_FLAGS}%
+CCFLAGS += %{CC_FLAGS}%
+
+LDFLAGS += %{LINKER_FLAGS}%
+
+%.o: %.cc
+	$(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@
+
+%.o: %.c
+	$(CC) $(CCFLAGS) $(INCLUDES) -c $< -o $@
+
+%{EXECUTABLE}% : $(OBJS)
+	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) $(LDFLAGS)
+
+all: %{EXECUTABLE}%
diff --git a/tensorflow/lite/experimental/micro/tools/make/templates/README_KEIL.md.tpl b/tensorflow/lite/experimental/micro/tools/make/templates/README_KEIL.md.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..945b9f9c1ae4c5761afb80febe57803d1e7fcab2
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/templates/README_KEIL.md.tpl
@@ -0,0 +1,5 @@
+# TensorFlow Lite Micro Mbed Project
+
+This folder has been autogenerated by TensorFlow, and contains source, header,
+and project files needed to build a single TensorFlow Lite Micro target using
+the Keil uVision IDE.
diff --git a/tensorflow/lite/experimental/micro/tools/make/templates/README_MAKE.md.tpl b/tensorflow/lite/experimental/micro/tools/make/templates/README_MAKE.md.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..7906a3226ab5b475d3f0f93f39111e8e21d39a40
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/templates/README_MAKE.md.tpl
@@ -0,0 +1,29 @@
+# TensorFlow Lite Micro Make Project
+
+This folder has been autogenerated by TensorFlow, and contains source, header,
+and project files needed to build a single TensorFlow Lite Micro target using
+the make tool.
+
+## Usage
+
+To build this, run:
+
+```
+make
+```
+
+This should attempt to build the target locally on your platform, using the
+standard Makefile variables like CFLAGS, CC, CXX, and so on.
+
+## Project Generation
+
+See
+[tensorflow/lite/experimental/micro](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/micro)
+for details on how projects like this can be generated from the main source
+tree.
+
+## License
+
+TensorFlow's code is covered by the Apache2 License included in the repository,
+and third party dependencies are covered by their respective licenses, in the
+third_party folder of this package.
diff --git a/tensorflow/lite/experimental/micro/tools/make/templates/README_MBED.md.tpl b/tensorflow/lite/experimental/micro/tools/make/templates/README_MBED.md.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..2682236edf5b847a95aa07fa6d0e30c5a9a10c9a
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/templates/README_MBED.md.tpl
@@ -0,0 +1,48 @@
+# TensorFlow Lite Micro Mbed Project
+
+This folder has been autogenerated by TensorFlow, and contains source, header,
+and project files needed to build a single TensorFlow Lite Micro target using
+the Mbed command line interface.
+
+## Usage
+
+To load the dependencies this code requires, run:
+
+```
+mbed config root .
+mbed deploy
+```
+
+TensorFlow requires C++ 11, so you'll need to update your profiles to reflect
+this. Here's a short Python command that does that:
+
+```
+python -c 'import fileinput, glob;
+for filename in glob.glob("mbed-os/tools/profiles/*.json"):
+  for line in fileinput.input(filename, inplace=True):
+    print line.replace("\"-std=gnu++98\"","\"-std=c++11\", \"-fpermissive\"")'
+```
+
+With that setting updated, you should now be able to compile:
+
+```
+mbed compile -m auto -t GCC_ARM
+```
+
+If this works, it will give you a .bin file that you can flash onto the device
+you're targeting. For example, using a Discovery STM3246G board, you can deploy
+it by copying the bin to the volume mounted as a USB drive, just by dragging
+over the file.
+
+## Project Generation
+
+See
+[tensorflow/lite/experimental/micro](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/micro)
+for details on how projects like this can be generated from the main source
+tree.
+
+## License
+
+TensorFlow's code is covered by the Apache2 License included in the repository,
+and third party dependencies are covered by their respective licenses, in the
+third_party folder of this package.
diff --git a/tensorflow/lite/experimental/micro/tools/make/templates/SDRAM_DISCO_F746NG.lib.tpl b/tensorflow/lite/experimental/micro/tools/make/templates/SDRAM_DISCO_F746NG.lib.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..e2ccd7b81b28df938f19638f953b500c387594dc
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/templates/SDRAM_DISCO_F746NG.lib.tpl
@@ -0,0 +1 @@
+https://os.mbed.com/teams/ST/code/SDRAM_DISCO_F746NG/#370f402a2219
diff --git a/tensorflow/lite/experimental/micro/tools/make/templates/keil_project.uvprojx.tpl b/tensorflow/lite/experimental/micro/tools/make/templates/keil_project.uvprojx.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..440d4b6b3e8a7894bc2b0c6afbd5ff78b54f198b
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/templates/keil_project.uvprojx.tpl
@@ -0,0 +1,418 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
+<Project xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="project_projx.xsd">
+
+  <SchemaVersion>2.1</SchemaVersion>
+
+  <Targets>
+    <Target>
+      <TargetName>%{EXECUTABLE}%</TargetName>
+      <ToolsetNumber>0x4</ToolsetNumber>
+      <ToolsetName>ARM-ADS</ToolsetName>
+      <pCCUsed>6100001::V6.10.1::.\ARMCLANG</pCCUsed>
+      <uAC6>1</uAC6>
+      <TargetOption>
+        <TargetCommonOption>
+          <Device>STM32F746NGHx</Device>
+          <Vendor>STMicroelectronics</Vendor>
+          <PackID>Keil.STM32F7xx_DFP.2.11.0</PackID>
+          <PackURL>http://www.keil.com/pack</PackURL>
+          <Cpu>IRAM(0x20010000,0x40000) IRAM2(0x20000000,0x10000) IROM(0x08000000,0x100000) IROM2(0x00200000,0x100000) CPUTYPE("Cortex-M7") FPU3(SFPU) CLOCK(12000000) ELITTLE</Cpu>
+          <FlashUtilSpec></FlashUtilSpec>
+          <StartupFile></StartupFile>
+          <FlashDriverDll>UL2CM3(-S0 -C0 -P0 -FD20010000 -FC1000 -FN2 -FF0STM32F7x_1024 -FS08000000 -FL0100000 -FF1STM32F7xTCM_1024 -FS1200000 -FL1100000 -FP0($$Device:STM32F746NGHx$CMSIS\Flash\STM32F7x_1024.FLM) -FP1($$Device:STM32F746NGHx$CMSIS\Flash\STM32F7xTCM_1024.FLM))</FlashDriverDll>
+          <DeviceId>0</DeviceId>
+          <RegisterFile>$$Device:STM32F746NGHx$Drivers\CMSIS\Device\ST\STM32F7xx\Include\stm32f7xx.h</RegisterFile>
+          <MemoryEnv></MemoryEnv>
+          <Cmp></Cmp>
+          <Asm></Asm>
+          <Linker></Linker>
+          <OHString></OHString>
+          <InfinionOptionDll></InfinionOptionDll>
+          <SLE66CMisc></SLE66CMisc>
+          <SLE66AMisc></SLE66AMisc>
+          <SLE66LinkerMisc></SLE66LinkerMisc>
+          <SFDFile>$$Device:STM32F746NGHx$CMSIS\SVD\STM32F7x6_v1r1.svd</SFDFile>
+          <bCustSvd>0</bCustSvd>
+          <UseEnv>0</UseEnv>
+          <BinPath></BinPath>
+          <IncludePath></IncludePath>
+          <LibPath></LibPath>
+          <RegisterFilePath></RegisterFilePath>
+          <DBRegisterFilePath></DBRegisterFilePath>
+          <TargetStatus>
+            <Error>0</Error>
+            <ExitCodeStop>0</ExitCodeStop>
+            <ButtonStop>0</ButtonStop>
+            <NotGenerated>0</NotGenerated>
+            <InvalidFlash>1</InvalidFlash>
+          </TargetStatus>
+          <OutputDirectory>.\Objects\</OutputDirectory>
+          <OutputName>%{EXECUTABLE}%</OutputName>
+          <CreateExecutable>1</CreateExecutable>
+          <CreateLib>0</CreateLib>
+          <CreateHexFile>0</CreateHexFile>
+          <DebugInformation>1</DebugInformation>
+          <BrowseInformation>1</BrowseInformation>
+          <ListingPath>.\Listings\</ListingPath>
+          <HexFormatSelection>1</HexFormatSelection>
+          <Merge32K>0</Merge32K>
+          <CreateBatchFile>0</CreateBatchFile>
+          <BeforeCompile>
+            <RunUserProg1>0</RunUserProg1>
+            <RunUserProg2>0</RunUserProg2>
+            <UserProg1Name></UserProg1Name>
+            <UserProg2Name></UserProg2Name>
+            <UserProg1Dos16Mode>0</UserProg1Dos16Mode>
+            <UserProg2Dos16Mode>0</UserProg2Dos16Mode>
+            <nStopU1X>0</nStopU1X>
+            <nStopU2X>0</nStopU2X>
+          </BeforeCompile>
+          <BeforeMake>
+            <RunUserProg1>0</RunUserProg1>
+            <RunUserProg2>0</RunUserProg2>
+            <UserProg1Name></UserProg1Name>
+            <UserProg2Name></UserProg2Name>
+            <UserProg1Dos16Mode>0</UserProg1Dos16Mode>
+            <UserProg2Dos16Mode>0</UserProg2Dos16Mode>
+            <nStopB1X>0</nStopB1X>
+            <nStopB2X>0</nStopB2X>
+          </BeforeMake>
+          <AfterMake>
+            <RunUserProg1>0</RunUserProg1>
+            <RunUserProg2>0</RunUserProg2>
+            <UserProg1Name></UserProg1Name>
+            <UserProg2Name></UserProg2Name>
+            <UserProg1Dos16Mode>0</UserProg1Dos16Mode>
+            <UserProg2Dos16Mode>0</UserProg2Dos16Mode>
+            <nStopA1X>0</nStopA1X>
+            <nStopA2X>0</nStopA2X>
+          </AfterMake>
+          <SelectedForBatchBuild>0</SelectedForBatchBuild>
+          <SVCSIdString></SVCSIdString>
+        </TargetCommonOption>
+        <CommonProperty>
+          <UseCPPCompiler>0</UseCPPCompiler>
+          <RVCTCodeConst>0</RVCTCodeConst>
+          <RVCTZI>0</RVCTZI>
+          <RVCTOtherData>0</RVCTOtherData>
+          <ModuleSelection>0</ModuleSelection>
+          <IncludeInBuild>1</IncludeInBuild>
+          <AlwaysBuild>0</AlwaysBuild>
+          <GenerateAssemblyFile>0</GenerateAssemblyFile>
+          <AssembleAssemblyFile>0</AssembleAssemblyFile>
+          <PublicsOnly>0</PublicsOnly>
+          <StopOnExitCode>3</StopOnExitCode>
+          <CustomArgument></CustomArgument>
+          <IncludeLibraryModules></IncludeLibraryModules>
+          <ComprImg>1</ComprImg>
+        </CommonProperty>
+        <DllOption>
+          <SimDllName>SARMCM3.DLL</SimDllName>
+          <SimDllArguments> -REMAP -MPU</SimDllArguments>
+          <SimDlgDll>DCM.DLL</SimDlgDll>
+          <SimDlgDllArguments>-pCM7</SimDlgDllArguments>
+          <TargetDllName>SARMCM3.DLL</TargetDllName>
+          <TargetDllArguments> -MPU</TargetDllArguments>
+          <TargetDlgDll>TCM.DLL</TargetDlgDll>
+          <TargetDlgDllArguments>-pCM7</TargetDlgDllArguments>
+        </DllOption>
+        <DebugOption>
+          <OPTHX>
+            <HexSelection>1</HexSelection>
+            <HexRangeLowAddress>0</HexRangeLowAddress>
+            <HexRangeHighAddress>0</HexRangeHighAddress>
+            <HexOffset>0</HexOffset>
+            <Oh166RecLen>16</Oh166RecLen>
+          </OPTHX>
+        </DebugOption>
+        <Utilities>
+          <Flash1>
+            <UseTargetDll>1</UseTargetDll>
+            <UseExternalTool>0</UseExternalTool>
+            <RunIndependent>0</RunIndependent>
+            <UpdateFlashBeforeDebugging>1</UpdateFlashBeforeDebugging>
+            <Capability>1</Capability>
+            <DriverSelection>-1</DriverSelection>
+          </Flash1>
+          <bUseTDR>1</bUseTDR>
+          <Flash2>BIN\UL2CM3.DLL</Flash2>
+          <Flash3></Flash3>
+          <Flash4></Flash4>
+          <pFcarmOut></pFcarmOut>
+          <pFcarmGrp></pFcarmGrp>
+          <pFcArmRoot></pFcArmRoot>
+          <FcArmLst>0</FcArmLst>
+        </Utilities>
+        <TargetArmAds>
+          <ArmAdsMisc>
+            <GenerateListings>0</GenerateListings>
+            <asHll>1</asHll>
+            <asAsm>1</asAsm>
+            <asMacX>1</asMacX>
+            <asSyms>1</asSyms>
+            <asFals>1</asFals>
+            <asDbgD>1</asDbgD>
+            <asForm>1</asForm>
+            <ldLst>0</ldLst>
+            <ldmm>1</ldmm>
+            <ldXref>1</ldXref>
+            <BigEnd>0</BigEnd>
+            <AdsALst>1</AdsALst>
+            <AdsACrf>1</AdsACrf>
+            <AdsANop>0</AdsANop>
+            <AdsANot>0</AdsANot>
+            <AdsLLst>1</AdsLLst>
+            <AdsLmap>1</AdsLmap>
+            <AdsLcgr>1</AdsLcgr>
+            <AdsLsym>1</AdsLsym>
+            <AdsLszi>1</AdsLszi>
+            <AdsLtoi>1</AdsLtoi>
+            <AdsLsun>1</AdsLsun>
+            <AdsLven>1</AdsLven>
+            <AdsLsxf>1</AdsLsxf>
+            <RvctClst>0</RvctClst>
+            <GenPPlst>0</GenPPlst>
+            <AdsCpuType>"Cortex-M7"</AdsCpuType>
+            <RvctDeviceName></RvctDeviceName>
+            <mOS>0</mOS>
+            <uocRom>0</uocRom>
+            <uocRam>0</uocRam>
+            <hadIROM>1</hadIROM>
+            <hadIRAM>1</hadIRAM>
+            <hadXRAM>0</hadXRAM>
+            <uocXRam>0</uocXRam>
+            <RvdsVP>2</RvdsVP>
+            <RvdsMve>0</RvdsMve>
+            <hadIRAM2>1</hadIRAM2>
+            <hadIROM2>1</hadIROM2>
+            <StupSel>8</StupSel>
+            <useUlib>0</useUlib>
+            <EndSel>0</EndSel>
+            <uLtcg>0</uLtcg>
+            <nSecure>0</nSecure>
+            <RoSelD>4</RoSelD>
+            <RwSelD>4</RwSelD>
+            <CodeSel>0</CodeSel>
+            <OptFeed>0</OptFeed>
+            <NoZi1>0</NoZi1>
+            <NoZi2>0</NoZi2>
+            <NoZi3>0</NoZi3>
+            <NoZi4>0</NoZi4>
+            <NoZi5>0</NoZi5>
+            <Ro1Chk>0</Ro1Chk>
+            <Ro2Chk>0</Ro2Chk>
+            <Ro3Chk>0</Ro3Chk>
+            <Ir1Chk>1</Ir1Chk>
+            <Ir2Chk>0</Ir2Chk>
+            <Ra1Chk>0</Ra1Chk>
+            <Ra2Chk>0</Ra2Chk>
+            <Ra3Chk>0</Ra3Chk>
+            <Im1Chk>1</Im1Chk>
+            <Im2Chk>1</Im2Chk>
+            <OnChipMemories>
+              <Ocm1>
+                <Type>0</Type>
+                <StartAddress>0x0</StartAddress>
+                <Size>0x0</Size>
+              </Ocm1>
+              <Ocm2>
+                <Type>0</Type>
+                <StartAddress>0x0</StartAddress>
+                <Size>0x0</Size>
+              </Ocm2>
+              <Ocm3>
+                <Type>0</Type>
+                <StartAddress>0x0</StartAddress>
+                <Size>0x0</Size>
+              </Ocm3>
+              <Ocm4>
+                <Type>0</Type>
+                <StartAddress>0x0</StartAddress>
+                <Size>0x0</Size>
+              </Ocm4>
+              <Ocm5>
+                <Type>0</Type>
+                <StartAddress>0x0</StartAddress>
+                <Size>0x0</Size>
+              </Ocm5>
+              <Ocm6>
+                <Type>0</Type>
+                <StartAddress>0x0</StartAddress>
+                <Size>0x0</Size>
+              </Ocm6>
+              <IRAM>
+                <Type>0</Type>
+                <StartAddress>0x20010000</StartAddress>
+                <Size>0x40000</Size>
+              </IRAM>
+              <IROM>
+                <Type>1</Type>
+                <StartAddress>0x8000000</StartAddress>
+                <Size>0x100000</Size>
+              </IROM>
+              <XRAM>
+                <Type>0</Type>
+                <StartAddress>0x0</StartAddress>
+                <Size>0x0</Size>
+              </XRAM>
+              <OCR_RVCT1>
+                <Type>1</Type>
+                <StartAddress>0x0</StartAddress>
+                <Size>0x0</Size>
+              </OCR_RVCT1>
+              <OCR_RVCT2>
+                <Type>1</Type>
+                <StartAddress>0x0</StartAddress>
+                <Size>0x0</Size>
+              </OCR_RVCT2>
+              <OCR_RVCT3>
+                <Type>1</Type>
+                <StartAddress>0x0</StartAddress>
+                <Size>0x0</Size>
+              </OCR_RVCT3>
+              <OCR_RVCT4>
+                <Type>1</Type>
+                <StartAddress>0x8000000</StartAddress>
+                <Size>0x100000</Size>
+              </OCR_RVCT4>
+              <OCR_RVCT5>
+                <Type>1</Type>
+                <StartAddress>0x200000</StartAddress>
+                <Size>0x100000</Size>
+              </OCR_RVCT5>
+              <OCR_RVCT6>
+                <Type>0</Type>
+                <StartAddress>0x0</StartAddress>
+                <Size>0x0</Size>
+              </OCR_RVCT6>
+              <OCR_RVCT7>
+                <Type>0</Type>
+                <StartAddress>0x0</StartAddress>
+                <Size>0x0</Size>
+              </OCR_RVCT7>
+              <OCR_RVCT8>
+                <Type>0</Type>
+                <StartAddress>0x0</StartAddress>
+                <Size>0x0</Size>
+              </OCR_RVCT8>
+              <OCR_RVCT9>
+                <Type>0</Type>
+                <StartAddress>0x20010000</StartAddress>
+                <Size>0x40000</Size>
+              </OCR_RVCT9>
+              <OCR_RVCT10>
+                <Type>0</Type>
+                <StartAddress>0x20000000</StartAddress>
+                <Size>0x10000</Size>
+              </OCR_RVCT10>
+            </OnChipMemories>
+            <RvctStartVector></RvctStartVector>
+          </ArmAdsMisc>
+          <Cads>
+            <interw>1</interw>
+            <Optim>7</Optim>
+            <oTime>0</oTime>
+            <SplitLS>0</SplitLS>
+            <OneElfS>1</OneElfS>
+            <Strict>0</Strict>
+            <EnumInt>0</EnumInt>
+            <PlainCh>0</PlainCh>
+            <Ropi>0</Ropi>
+            <Rwpi>0</Rwpi>
+            <wLevel>3</wLevel>
+            <uThumb>0</uThumb>
+            <uSurpInc>0</uSurpInc>
+            <uC99>0</uC99>
+            <uGnu>1</uGnu>
+            <useXO>0</useXO>
+            <v6Lang>3</v6Lang>
+            <v6LangP>3</v6LangP>
+            <vShortEn>1</vShortEn>
+            <vShortWch>1</vShortWch>
+            <v6Lto>0</v6Lto>
+            <v6WtE>0</v6WtE>
+            <v6Rtti>0</v6Rtti>
+            <VariousControls>
+              <MiscControls></MiscControls>
+              <Define></Define>
+              <Undefine></Undefine>
+              <IncludePath>%{INCLUDE_PATHS}%</IncludePath>
+            </VariousControls>
+          </Cads>
+          <Aads>
+            <interw>1</interw>
+            <Ropi>0</Ropi>
+            <Rwpi>0</Rwpi>
+            <thumb>0</thumb>
+            <SplitLS>0</SplitLS>
+            <SwStkChk>0</SwStkChk>
+            <NoWarn>0</NoWarn>
+            <uSurpInc>0</uSurpInc>
+            <useXO>0</useXO>
+            <uClangAs>0</uClangAs>
+            <VariousControls>
+              <MiscControls></MiscControls>
+              <Define></Define>
+              <Undefine></Undefine>
+              <IncludePath></IncludePath>
+            </VariousControls>
+          </Aads>
+          <LDads>
+            <umfTarg>0</umfTarg>
+            <Ropi>0</Ropi>
+            <Rwpi>0</Rwpi>
+            <noStLib>0</noStLib>
+            <RepFail>1</RepFail>
+            <useFile>0</useFile>
+            <TextAddressRange>0x08000000</TextAddressRange>
+            <DataAddressRange>0x20010000</DataAddressRange>
+            <pXoBase></pXoBase>
+            <ScatterFile></ScatterFile>
+            <IncludeLibs></IncludeLibs>
+            <IncludeLibsPath></IncludeLibsPath>
+            <Misc></Misc>
+            <LinkerInputFile></LinkerInputFile>
+            <DisabledWarnings></DisabledWarnings>
+          </LDads>
+        </TargetArmAds>
+      </TargetOption>
+      <Groups>
+        <Group>
+          <GroupName>Source</GroupName>
+          <Files>
+%{SRCS}%
+          </Files>
+        </Group>
+        <Group>
+          <GroupName>::Compiler</GroupName>
+        </Group>
+      </Groups>
+    </Target>
+  </Targets>
+
+  <RTE>
+    <apis/>
+    <components>
+      <component Cbundle="ARM Compiler" Cclass="Compiler" Cgroup="I/O" Csub="STDERR" Cvariant="ITM" Cvendor="Keil" Cversion="1.2.0" condition="ARMCC Cortex-M with ITM">
+        <package name="ARM_Compiler" schemaVersion="1.4.9" url="http://www.keil.com/pack/" vendor="Keil" version="1.6.0"/>
+        <targetInfos>
+          <targetInfo name="%{EXECUTABLE}%"/>
+        </targetInfos>
+      </component>
+      <component Cbundle="ARM Compiler" Cclass="Compiler" Cgroup="I/O" Csub="STDIN" Cvariant="ITM" Cvendor="Keil" Cversion="1.2.0" condition="ARMCC Cortex-M with ITM">
+        <package name="ARM_Compiler" schemaVersion="1.4.9" url="http://www.keil.com/pack/" vendor="Keil" version="1.6.0"/>
+        <targetInfos>
+          <targetInfo name="%{EXECUTABLE}%"/>
+        </targetInfos>
+      </component>
+      <component Cbundle="ARM Compiler" Cclass="Compiler" Cgroup="I/O" Csub="STDOUT" Cvariant="ITM" Cvendor="Keil" Cversion="1.2.0" condition="ARMCC Cortex-M with ITM">
+        <package name="ARM_Compiler" schemaVersion="1.4.9" url="http://www.keil.com/pack/" vendor="Keil" version="1.6.0"/>
+        <targetInfos>
+          <targetInfo name="%{EXECUTABLE}%"/>
+        </targetInfos>
+      </component>
+    </components>
+    <files/>
+  </RTE>
+
+</Project>
diff --git a/tensorflow/lite/experimental/micro/tools/make/templates/mbed-os.lib.tpl b/tensorflow/lite/experimental/micro/tools/make/templates/mbed-os.lib.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..69fff22f335953f62576d3408fbf15e24be5280f
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/templates/mbed-os.lib.tpl
@@ -0,0 +1 @@
+https://github.com/ARMmbed/mbed-os/#6a0a86538c0b9b2bfcc4583b1e2b7fea8f4e71e9
diff --git a/tensorflow/lite/experimental/micro/tools/make/templates/mbed_app.json.tpl b/tensorflow/lite/experimental/micro/tools/make/templates/mbed_app.json.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..1c547369fb2784b27a9152ba4b7ade77c12211b0
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/templates/mbed_app.json.tpl
@@ -0,0 +1,7 @@
+{
+    "config": {
+	"main-stack-size": {
+            "value": 65536
+	}
+    }
+}
diff --git a/tensorflow/lite/experimental/micro/tools/make/templates/tasks.json.make.tpl b/tensorflow/lite/experimental/micro/tools/make/templates/tasks.json.make.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..141994d854565dc2ad2152e440c1d29526acb3dc
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/templates/tasks.json.make.tpl
@@ -0,0 +1,16 @@
+{
+    // See https://go.microsoft.com/fwlink/?LinkId=733558
+    // for the documentation about the tasks.json format
+    "version": "2.0.0",
+    "tasks": [
+        {
+            "label": "Make Build",
+            "type": "shell",
+            "command": "make",
+            "group": {
+                "kind": "build",
+                "isDefault": true
+                }
+        }
+    ]
+}
\ No newline at end of file
diff --git a/tensorflow/lite/experimental/micro/tools/make/templates/tasks.json.mbed.tpl b/tensorflow/lite/experimental/micro/tools/make/templates/tasks.json.mbed.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..616f3b23188df4af934433772c86c5c1a9452539
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/tools/make/templates/tasks.json.mbed.tpl
@@ -0,0 +1,39 @@
+{
+    // See https://go.microsoft.com/fwlink/?LinkId=733558
+    // for the documentation about the tasks.json format
+    "version": "2.0.0",
+    "tasks": [
+        {
+            "label": "Mbed Config Root",
+            "type": "shell",
+            "command": "mbed config root .",
+        },
+        {
+            "label": "Mbed Deploy",
+            "type": "shell",
+            "command": "mbed deploy",
+        },
+        {
+            "label": "Mbed Patch C++11",
+            "type": "shell",
+            "command": "python",
+            "args": [
+                "-c",
+                "import fileinput, glob;\nfor filename in glob.glob(\"mbed-os/tools/profiles/*.json\"):\n  for line in fileinput.input(filename, inplace=True):\n    print line.replace(\"\\\"-std=gnu++98\\\"\",\"\\\"-std=c++11\\\", \\\"-fpermissive\\\"\")"
+            ]
+        },
+        {
+            "label": "Mbed Init",
+            "dependsOn": ["Mbed Config Root", "Mbed Deploy", "Mbed Patch C++11"]
+        },
+        {
+            "label": "Mbed build",
+            "type": "shell",
+            "command": "mbed compile -m auto -t GCC_ARM",
+            "group": {
+                "kind": "build",
+                "isDefault": true
+                }
+        }
+    ]
+}
\ No newline at end of file
diff --git a/tensorflow/lite/experimental/microfrontend/audio_microfrontend.cc b/tensorflow/lite/experimental/microfrontend/audio_microfrontend.cc
index 4367fe74a484445289f15c83860ca08ca4e144db..84ab164d2c08623d41ed9468fe42e1e7d2fbf354 100644
--- a/tensorflow/lite/experimental/microfrontend/audio_microfrontend.cc
+++ b/tensorflow/lite/experimental/microfrontend/audio_microfrontend.cc
@@ -142,7 +142,8 @@ void GenerateFeatures(TfLiteAudioMicrofrontendParams* data,
 
     if (output.values != nullptr) {
       frame_buffer[frame_index].reserve(output.size);
-      for (int i = 0; i < output.size; ++i) {
+      int i;
+      for (i = 0; i < output.size; ++i) {
         frame_buffer[frame_index].push_back(static_cast<T>(output.values[i]) /
                                             data->out_scale);
       }
@@ -152,9 +153,10 @@ void GenerateFeatures(TfLiteAudioMicrofrontendParams* data,
 
   int index = 0;
   std::vector<T> pad(data->state->filterbank.num_channels, 0);
-  for (int anchor = 0; anchor < frame_buffer.size();
-       anchor += data->frame_stride) {
-    for (int frame = anchor - data->left_context;
+  int anchor;
+  for (anchor = 0; anchor < frame_buffer.size(); anchor += data->frame_stride) {
+    int frame;
+    for (frame = anchor - data->left_context;
          frame <= anchor + data->right_context; ++frame) {
       std::vector<T>* feature;
       if (data->zero_padding && (frame < 0 || frame >= frame_buffer.size())) {
diff --git a/tensorflow/lite/experimental/microfrontend/audio_microfrontend_test.cc b/tensorflow/lite/experimental/microfrontend/audio_microfrontend_test.cc
index a9119d01831f6892dbf887930f3626445fc8a8e3..e3a0e06f7b0faf07c9188b4b77957358c0e84d9c 100644
--- a/tensorflow/lite/experimental/microfrontend/audio_microfrontend_test.cc
+++ b/tensorflow/lite/experimental/microfrontend/audio_microfrontend_test.cc
@@ -140,13 +140,16 @@ class BaseMicroFrontendTest : public ::testing::Test {
 
     // Mimic padding behaviour with zero_padding = true.
     std::vector<int> output_flattened;
-    for (int anchor = 0; anchor < output.size();
+    int anchor;
+    for (anchor = 0; anchor < output.size();
          anchor += micro_frontend->num_frame_stride()) {
-      for (int frame = anchor - micro_frontend->num_left_context();
+      int frame;
+      for (frame = anchor - micro_frontend->num_left_context();
            frame <= anchor + micro_frontend->num_right_context(); ++frame) {
         if (frame < 0 || frame >= output.size()) {
           // Padding with zeros.
-          for (int j = 0; j < num_frequency_per_frame; ++j) {
+          int j;
+          for (j = 0; j < num_frequency_per_frame; ++j) {
             output_flattened.push_back(0.0);
           }
         } else {
diff --git a/tensorflow/lite/experimental/microfrontend/lib/BUILD b/tensorflow/lite/experimental/microfrontend/lib/BUILD
index a055e52f71001295cf95dfcbe790bc4118140fed..8dd42fc38290dbf5be8f9f1a850ad88cbf326ace 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/BUILD
+++ b/tensorflow/lite/experimental/microfrontend/lib/BUILD
@@ -6,6 +6,11 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
+load(
+    "//tensorflow/lite/experimental/micro/testing:micro_test.bzl",
+    "tflite_micro_cc_test",
+)
+
 cc_library(
     name = "bits",
     hdrs = ["bits.h"],
@@ -117,72 +122,65 @@ cc_library(
     ],
 )
 
-cc_test(
+tflite_micro_cc_test(
     name = "fft_test",
-    size = "small",
     srcs = ["fft_test.cc"],
     deps = [
         ":fft",
-        "@com_google_googletest//:gtest_main",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
     ],
 )
 
-cc_test(
+tflite_micro_cc_test(
     name = "filterbank_test",
-    size = "small",
     srcs = ["filterbank_test.cc"],
     deps = [
         ":filterbank",
-        "@com_google_googletest//:gtest_main",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
     ],
 )
 
-cc_test(
+tflite_micro_cc_test(
     name = "frontend_test",
-    size = "small",
     srcs = ["frontend_test.cc"],
     deps = [
         ":frontend",
-        "@com_google_googletest//:gtest_main",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
     ],
 )
 
-cc_test(
+tflite_micro_cc_test(
     name = "log_scale_test",
-    size = "small",
     srcs = ["log_scale_test.cc"],
     deps = [
         ":log_scale",
-        "@com_google_googletest//:gtest_main",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
     ],
 )
 
-cc_test(
+tflite_micro_cc_test(
     name = "noise_reduction_test",
-    size = "small",
     srcs = ["noise_reduction_test.cc"],
     deps = [
         ":noise_reduction",
-        "@com_google_googletest//:gtest_main",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
     ],
 )
 
-cc_test(
+tflite_micro_cc_test(
     name = "pcan_gain_control_test",
-    size = "small",
     srcs = ["pcan_gain_control_test.cc"],
     deps = [
         ":pcan_gain_control",
-        "@com_google_googletest//:gtest_main",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
     ],
 )
 
-cc_test(
+tflite_micro_cc_test(
     name = "window_test",
-    size = "small",
     srcs = ["window_test.cc"],
     deps = [
         ":window",
-        "@com_google_googletest//:gtest_main",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
     ],
 )
diff --git a/tensorflow/lite/experimental/microfrontend/lib/bits.h b/tensorflow/lite/experimental/microfrontend/lib/bits.h
index bf15466a3d6484c3059a1ded1bb51e4d4287b1bf..04b3ba6f055f956720b58720c78083b1529fb065 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/bits.h
+++ b/tensorflow/lite/experimental/microfrontend/lib/bits.h
@@ -63,14 +63,14 @@ static inline int CountLeadingZeros64Slow(uint64_t n) {
 
 static inline int CountLeadingZeros64(uint64_t n) {
 #if defined(_MSC_VER) && defined(_M_X64)
-  // MSVC does not have __buitin_clzll. Use _BitScanReverse64.
+  // MSVC does not have __builtin_clzll. Use _BitScanReverse64.
   unsigned long result = 0;  // NOLINT(runtime/int)
   if (_BitScanReverse64(&result, n)) {
     return 63 - result;
   }
   return 64;
 #elif defined(_MSC_VER)
-  // MSVC does not have __buitin_clzll. Compose two calls to _BitScanReverse
+  // MSVC does not have __builtin_clzll. Compose two calls to _BitScanReverse
   unsigned long result = 0;  // NOLINT(runtime/int)
   if ((n >> 32) && _BitScanReverse(&result, n >> 32)) {
     return 31 - result;
diff --git a/tensorflow/lite/experimental/microfrontend/lib/fft_test.cc b/tensorflow/lite/experimental/microfrontend/lib/fft_test.cc
index 7c1ee2d852201cc52a53ae07bf6e00ebf6f1ab47..ec1f247ba24ad27917330708d6f9c754515a686b 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/fft_test.cc
+++ b/tensorflow/lite/experimental/microfrontend/lib/fft_test.cc
@@ -15,8 +15,7 @@ limitations under the License.
 #include "tensorflow/lite/experimental/microfrontend/lib/fft.h"
 #include "tensorflow/lite/experimental/microfrontend/lib/fft_util.h"
 
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
 
 namespace {
 
@@ -25,9 +24,13 @@ const int16_t kFakeWindow[] = {
     0, -28328, 0, 21447, 0, -13312, 0, 5943,   0, -1152, 0};
 const int kScaleShift = 0;
 
-TEST(FftTest, CheckOutputValues) {
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(FftTest_CheckOutputValues) {
   struct FftState state;
-  ASSERT_TRUE(
+  TF_LITE_MICRO_EXPECT(
       FftPopulateState(&state, sizeof(kFakeWindow) / sizeof(kFakeWindow[0])));
 
   FftInit(&state);
@@ -37,13 +40,15 @@ TEST(FftTest, CheckOutputValues) {
       {0, 0},    {-10, 9},     {-20, 0},   {-9, -10},     {0, 25},  {-119, 119},
       {-887, 0}, {3000, 3000}, {0, -6401}, {-3000, 3000}, {886, 0}, {118, 119},
       {0, 25},   {9, -10},     {19, 0},    {9, 9},        {0, 0}};
-  ASSERT_EQ(state.fft_size / 2 + 1, sizeof(expected) / sizeof(expected[0]));
-  for (int i = 0; i <= state.fft_size / 2; ++i) {
-    EXPECT_EQ(state.output[i].real, expected[i].real);
-    EXPECT_EQ(state.output[i].imag, expected[i].imag);
+  TF_LITE_MICRO_EXPECT_EQ(state.fft_size / 2 + 1,
+                          sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i <= state.fft_size / 2; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.output[i].real, expected[i].real);
+    TF_LITE_MICRO_EXPECT_EQ(state.output[i].imag, expected[i].imag);
   }
 
   FftFreeStateContents(&state);
 }
 
-}  // namespace
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/microfrontend/lib/filterbank_io.c b/tensorflow/lite/experimental/microfrontend/lib/filterbank_io.c
index 2dbb4b3bf09654df3be0165f14c6f3da742268f1..6ce4c7c79646485477a3067f96c7fe8526836ee6 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/filterbank_io.c
+++ b/tensorflow/lite/experimental/microfrontend/lib/filterbank_io.c
@@ -17,7 +17,8 @@ limitations under the License.
 static void PrintArray(FILE* fp, const char* name, const int16_t* values,
                        size_t size) {
   fprintf(fp, "static int16_t filterbank_%s[] = {", name);
-  for (int i = 0; i < size; ++i) {
+  int i;
+  for (i = 0; i < size; ++i) {
     fprintf(fp, "%d", values[i]);
     if (i < size - 1) {
       fprintf(fp, ", ");
diff --git a/tensorflow/lite/experimental/microfrontend/lib/filterbank_test.cc b/tensorflow/lite/experimental/microfrontend/lib/filterbank_test.cc
index 808d527186eaa920a9eb5319b328b96de6047174..16257aa11a5ca3e82aeff60f8cc0176de3c519ab 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/filterbank_test.cc
+++ b/tensorflow/lite/experimental/microfrontend/lib/filterbank_test.cc
@@ -17,8 +17,7 @@ limitations under the License.
 
 #include <cstring>
 
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
 
 namespace {
 
@@ -33,9 +32,9 @@ const uint64_t kWork[] = {1835887, 61162970173, 258694800000};
 const int kScaleShift = 0;
 
 // Test filterbank generation using scaled-down defaults.
-class FilterbankTest : public ::testing::Test {
- protected:
-  FilterbankTest() {
+class FilterbankTestConfig {
+ public:
+  FilterbankTestConfig() {
     config_.num_channels = 2;
     config_.lower_band_limit = 8.0;
     config_.upper_band_limit = 450.0;
@@ -44,105 +43,124 @@ class FilterbankTest : public ::testing::Test {
   struct FilterbankConfig config_;
 };
 
-TEST_F(FilterbankTest, CheckStartIndex) {
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(FilterbankTest_CheckStartIndex) {
+  FilterbankTestConfig config;
   struct FilterbankState state;
-  ASSERT_TRUE(
-      FilterbankPopulateState(&config_, &state, kSampleRate, kSpectrumSize));
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(&config.config_, &state,
+                                               kSampleRate, kSpectrumSize));
 
-  EXPECT_EQ(state.start_index, kStartIndex);
+  TF_LITE_MICRO_EXPECT_EQ(state.start_index, kStartIndex);
 
   FilterbankFreeStateContents(&state);
 }
 
-TEST_F(FilterbankTest, CheckEndIndex) {
+TF_LITE_MICRO_TEST(FilterbankTest_CheckEndIndex) {
+  FilterbankTestConfig config;
   struct FilterbankState state;
-  ASSERT_TRUE(
-      FilterbankPopulateState(&config_, &state, kSampleRate, kSpectrumSize));
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(&config.config_, &state,
+                                               kSampleRate, kSpectrumSize));
 
-  EXPECT_EQ(state.end_index, kEndIndex);
+  TF_LITE_MICRO_EXPECT_EQ(state.end_index, kEndIndex);
 
   FilterbankFreeStateContents(&state);
 }
 
-TEST_F(FilterbankTest, CheckChannelFrequencyStarts) {
+TF_LITE_MICRO_TEST(FilterbankTest_CheckChannelFrequencyStarts) {
+  FilterbankTestConfig config;
   struct FilterbankState state;
-  ASSERT_TRUE(
-      FilterbankPopulateState(&config_, &state, kSampleRate, kSpectrumSize));
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(&config.config_, &state,
+                                               kSampleRate, kSpectrumSize));
 
   const int16_t expected[] = {0, 4, 8};
-  ASSERT_EQ(state.num_channels + 1, sizeof(expected) / sizeof(expected[0]));
-  for (int i = 0; i <= state.num_channels; ++i) {
-    EXPECT_EQ(state.channel_frequency_starts[i], expected[i]);
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels + 1,
+                          sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i <= state.num_channels; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.channel_frequency_starts[i], expected[i]);
   }
 
   FilterbankFreeStateContents(&state);
 }
 
-TEST_F(FilterbankTest, CheckChannelWeightStarts) {
+TF_LITE_MICRO_TEST(FilterbankTest_CheckChannelWeightStarts) {
+  FilterbankTestConfig config;
   struct FilterbankState state;
-  ASSERT_TRUE(
-      FilterbankPopulateState(&config_, &state, kSampleRate, kSpectrumSize));
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(&config.config_, &state,
+                                               kSampleRate, kSpectrumSize));
 
   const int16_t expected[] = {0, 8, 16};
-  ASSERT_EQ(state.num_channels + 1, sizeof(expected) / sizeof(expected[0]));
-  for (int i = 0; i <= state.num_channels; ++i) {
-    EXPECT_EQ(state.channel_weight_starts[i], expected[i]);
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels + 1,
+                          sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i <= state.num_channels; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.channel_weight_starts[i], expected[i]);
   }
 
   FilterbankFreeStateContents(&state);
 }
 
-TEST_F(FilterbankTest, CheckChannelWidths) {
+TF_LITE_MICRO_TEST(FilterbankTest_CheckChannelWidths) {
+  FilterbankTestConfig config;
   struct FilterbankState state;
-  ASSERT_TRUE(
-      FilterbankPopulateState(&config_, &state, kSampleRate, kSpectrumSize));
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(&config.config_, &state,
+                                               kSampleRate, kSpectrumSize));
 
   const int16_t expected[] = {8, 8, 8};
-  ASSERT_EQ(state.num_channels + 1, sizeof(expected) / sizeof(expected[0]));
-  for (int i = 0; i <= state.num_channels; ++i) {
-    EXPECT_EQ(state.channel_widths[i], expected[i]);
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels + 1,
+                          sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i <= state.num_channels; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.channel_widths[i], expected[i]);
   }
 
   FilterbankFreeStateContents(&state);
 }
 
-TEST_F(FilterbankTest, CheckWeights) {
+TF_LITE_MICRO_TEST(FilterbankTest_CheckWeights) {
+  FilterbankTestConfig config;
   struct FilterbankState state;
-  ASSERT_TRUE(
-      FilterbankPopulateState(&config_, &state, kSampleRate, kSpectrumSize));
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(&config.config_, &state,
+                                               kSampleRate, kSpectrumSize));
 
   const int16_t expected[] = {0, 3277, 2217, 1200, 222,  0,   0,   0,
                               0, 3376, 2468, 1591, 744,  0,   0,   0,
                               0, 4020, 3226, 2456, 1708, 983, 277, 0};
-  ASSERT_EQ(state.channel_weight_starts[state.num_channels] +
-                state.channel_widths[state.num_channels],
-            sizeof(expected) / sizeof(expected[0]));
-  for (int i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
-    EXPECT_EQ(state.weights[i], expected[i]);
+  TF_LITE_MICRO_EXPECT_EQ(state.channel_weight_starts[state.num_channels] +
+                              state.channel_widths[state.num_channels],
+                          sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.weights[i], expected[i]);
   }
 
   FilterbankFreeStateContents(&state);
 }
 
-TEST_F(FilterbankTest, CheckUnweights) {
+TF_LITE_MICRO_TEST(FilterbankTest_CheckUnweights) {
+  FilterbankTestConfig config;
   struct FilterbankState state;
-  ASSERT_TRUE(
-      FilterbankPopulateState(&config_, &state, kSampleRate, kSpectrumSize));
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(&config.config_, &state,
+                                               kSampleRate, kSpectrumSize));
 
   const int16_t expected[] = {0, 819, 1879, 2896, 3874, 0,    0,    0,
                               0, 720, 1628, 2505, 3352, 0,    0,    0,
                               0, 76,  870,  1640, 2388, 3113, 3819, 0};
-  ASSERT_EQ(state.channel_weight_starts[state.num_channels] +
-                state.channel_widths[state.num_channels],
-            sizeof(expected) / sizeof(expected[0]));
-  for (int i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
-    EXPECT_EQ(state.unweights[i], expected[i]);
+  TF_LITE_MICRO_EXPECT_EQ(state.channel_weight_starts[state.num_channels] +
+                              state.channel_widths[state.num_channels],
+                          sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.unweights[i], expected[i]);
   }
 
   FilterbankFreeStateContents(&state);
 }
 
-TEST_F(FilterbankTest, CheckConvertFftComplexToEnergy) {
+TF_LITE_MICRO_TEST(FilterbankTest_CheckConvertFftComplexToEnergy) {
   struct FilterbankState state;
   state.start_index = kStartIndex;
   state.end_index = kEndIndex;
@@ -154,41 +172,48 @@ TEST_F(FilterbankTest, CheckConvertFftComplexToEnergy) {
   int32_t* energy = reinterpret_cast<int32_t*>(fake_fft);
   FilterbankConvertFftComplexToEnergy(&state, fake_fft, energy);
 
-  for (int i = state.start_index; i < state.end_index; ++i) {
-    EXPECT_EQ(energy[i], kEnergy[i]);
+  int i;
+  for (i = state.start_index; i < state.end_index; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(energy[i], kEnergy[i]);
   }
 }
 
-TEST_F(FilterbankTest, CheckAccumulateChannels) {
+TF_LITE_MICRO_TEST(FilterbankTest_CheckAccumulateChannels) {
+  FilterbankTestConfig config;
   struct FilterbankState state;
-  ASSERT_TRUE(
-      FilterbankPopulateState(&config_, &state, kSampleRate, kSpectrumSize));
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(&config.config_, &state,
+                                               kSampleRate, kSpectrumSize));
 
   FilterbankAccumulateChannels(&state, kEnergy);
 
-  ASSERT_EQ(state.num_channels + 1, sizeof(kWork) / sizeof(kWork[0]));
-  for (int i = 0; i <= state.num_channels; ++i) {
-    EXPECT_EQ(state.work[i], kWork[i]);
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels + 1,
+                          sizeof(kWork) / sizeof(kWork[0]));
+  int i;
+  for (i = 0; i <= state.num_channels; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.work[i], kWork[i]);
   }
 
   FilterbankFreeStateContents(&state);
 }
 
-TEST_F(FilterbankTest, CheckSqrt) {
+TF_LITE_MICRO_TEST(FilterbankTest_CheckSqrt) {
+  FilterbankTestConfig config;
   struct FilterbankState state;
-  ASSERT_TRUE(
-      FilterbankPopulateState(&config_, &state, kSampleRate, kSpectrumSize));
+  TF_LITE_MICRO_EXPECT(FilterbankPopulateState(&config.config_, &state,
+                                               kSampleRate, kSpectrumSize));
   std::memcpy(state.work, kWork, sizeof(kWork));
 
   uint32_t* scaled_filterbank = FilterbankSqrt(&state, kScaleShift);
 
   const uint32_t expected[] = {247311, 508620};
-  ASSERT_EQ(state.num_channels, sizeof(expected) / sizeof(expected[0]));
-  for (int i = 0; i < state.num_channels; ++i) {
-    EXPECT_EQ(scaled_filterbank[i], expected[i]);
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels,
+                          sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < state.num_channels; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(scaled_filterbank[i], expected[i]);
   }
 
   FilterbankFreeStateContents(&state);
 }
 
-}  // namespace
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/microfrontend/lib/frontend_test.cc b/tensorflow/lite/experimental/microfrontend/lib/frontend_test.cc
index 993e866cc08850cdfea129278783420e827d67f2..568484f14dde6b958d5c9e144ab8dfd7a68a0fb0 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/frontend_test.cc
+++ b/tensorflow/lite/experimental/microfrontend/lib/frontend_test.cc
@@ -15,8 +15,7 @@ limitations under the License.
 #include "tensorflow/lite/experimental/microfrontend/lib/frontend.h"
 #include "tensorflow/lite/experimental/microfrontend/lib/frontend_util.h"
 
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
 
 namespace {
 
@@ -29,9 +28,9 @@ const int16_t kFakeAudioData[] = {
     0, 32767, 0, -32768, 0, 32767, 0, -32768, 0, 32767, 0, -32768};
 
 // Test end-to-end frontend behaviors.
-class FrontendTest : public ::testing::Test {
- protected:
-  FrontendTest() {
+class FrontendTestConfig {
+ public:
+  FrontendTestConfig() {
     config_.window.size_ms = 25;
     config_.window.step_size_ms = 10;
     config_.noise_reduction.smoothing_bits = 10;
@@ -53,9 +52,15 @@ class FrontendTest : public ::testing::Test {
   struct FrontendConfig config_;
 };
 
-TEST_F(FrontendTest, CheckOutputValues) {
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(FrontendTest_CheckOutputValues) {
+  FrontendTestConfig config;
   struct FrontendState state;
-  ASSERT_TRUE(FrontendPopulateState(&config_, &state, kSampleRate));
+  TF_LITE_MICRO_EXPECT(
+      FrontendPopulateState(&config.config_, &state, kSampleRate));
   size_t num_samples_read;
 
   struct FrontendOutput output = FrontendProcessSamples(
@@ -63,17 +68,20 @@ TEST_F(FrontendTest, CheckOutputValues) {
       sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read);
 
   const uint16_t expected[] = {479, 425};
-  ASSERT_EQ(output.size, sizeof(expected) / sizeof(expected[0]));
-  for (int i = 0; i < output.size; ++i) {
-    EXPECT_EQ(output.values[i], expected[i]);
+  TF_LITE_MICRO_EXPECT_EQ(output.size, sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < output.size; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(output.values[i], expected[i]);
   }
 
   FrontendFreeStateContents(&state);
 }
 
-TEST_F(FrontendTest, CheckConsecutiveWindow) {
+TF_LITE_MICRO_TEST(FrontendTest_CheckConsecutiveWindow) {
+  FrontendTestConfig config;
   struct FrontendState state;
-  ASSERT_TRUE(FrontendPopulateState(&config_, &state, kSampleRate));
+  TF_LITE_MICRO_EXPECT(
+      FrontendPopulateState(&config.config_, &state, kSampleRate));
   size_t num_samples_read;
 
   FrontendProcessSamples(&state, kFakeAudioData,
@@ -85,17 +93,20 @@ TEST_F(FrontendTest, CheckConsecutiveWindow) {
       &num_samples_read);
 
   const int16_t expected[] = {436, 378};
-  ASSERT_EQ(output.size, sizeof(expected) / sizeof(expected[0]));
-  for (int i = 0; i < output.size; ++i) {
-    EXPECT_EQ(output.values[i], expected[i]);
+  TF_LITE_MICRO_EXPECT_EQ(output.size, sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < output.size; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(output.values[i], expected[i]);
   }
 
   FrontendFreeStateContents(&state);
 }
 
-TEST_F(FrontendTest, CheckNotEnoughSamples) {
+TF_LITE_MICRO_TEST(FrontendTest_CheckNotEnoughSamples) {
+  FrontendTestConfig config;
   struct FrontendState state;
-  ASSERT_TRUE(FrontendPopulateState(&config_, &state, kSampleRate));
+  TF_LITE_MICRO_EXPECT(
+      FrontendPopulateState(&config.config_, &state, kSampleRate));
   size_t num_samples_read;
 
   FrontendProcessSamples(&state, kFakeAudioData,
@@ -111,10 +122,10 @@ TEST_F(FrontendTest, CheckNotEnoughSamples) {
           kStepSamples,
       &num_samples_read);
 
-  EXPECT_EQ(output.size, 0);
-  EXPECT_EQ(output.values, nullptr);
+  TF_LITE_MICRO_EXPECT_EQ(output.size, 0);
+  TF_LITE_MICRO_EXPECT_EQ(output.values, nullptr);
 
   FrontendFreeStateContents(&state);
 }
 
-}  // namespace
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/microfrontend/lib/log_scale.c b/tensorflow/lite/experimental/microfrontend/lib/log_scale.c
index 54f370e7d9f55250279cd6c9a81b9a17e0d6e071..149ec7cfba0a7891da320f92507fc06171363e70 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/log_scale.c
+++ b/tensorflow/lite/experimental/microfrontend/lib/log_scale.c
@@ -63,7 +63,8 @@ uint16_t* LogScaleApply(struct LogScaleState* state, uint32_t* signal,
   const int scale_shift = state->scale_shift;
   uint16_t* output = (uint16_t*) signal;
   uint16_t* ret = output;
-  for (int i = 0; i < signal_size; ++i) {
+  int i;
+  for (i = 0; i < signal_size; ++i) {
     uint32_t value = *signal++;
     if (state->enable_log) {
       if (correction_bits < 0) {
diff --git a/tensorflow/lite/experimental/microfrontend/lib/log_scale_test.cc b/tensorflow/lite/experimental/microfrontend/lib/log_scale_test.cc
index 91ca657e543d2a5f89a55483df8bdfbee1365951..be52fd426a23a389aac84e4b2dac832924716f83 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/log_scale_test.cc
+++ b/tensorflow/lite/experimental/microfrontend/lib/log_scale_test.cc
@@ -15,15 +15,18 @@ limitations under the License.
 #include "tensorflow/lite/experimental/microfrontend/lib/log_scale.h"
 #include "tensorflow/lite/experimental/microfrontend/lib/log_scale_util.h"
 
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
 
 namespace {
 
 const int kScaleShift = 6;
 const int kCorrectionBits = -1;
 
-TEST(LogScaleTest, CheckOutputValues) {
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(LogScaleTest_CheckOutputValues) {
   struct LogScaleState state;
   state.enable_log = true;
   state.scale_shift = kScaleShift;
@@ -34,12 +37,13 @@ TEST(LogScaleTest, CheckOutputValues) {
                                    kCorrectionBits);
 
   const uint16_t expected[] = {479, 425};
-  for (int i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
-    EXPECT_EQ(output[i], expected[i]);
+  int i;
+  for (i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(output[i], expected[i]);
   }
 }
 
-TEST(LogScaleTest, CheckOutputValuesNoLog) {
+TF_LITE_MICRO_TEST(LogScaleTest_CheckOutputValuesNoLog) {
   struct LogScaleState state;
   state.enable_log = false;
   state.scale_shift = kScaleShift;
@@ -50,9 +54,10 @@ TEST(LogScaleTest, CheckOutputValuesNoLog) {
                                    kCorrectionBits);
 
   const uint16_t expected[] = {65535, 45998};
-  for (int i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
-    EXPECT_EQ(output[i], expected[i]);
+  int i;
+  for (i = 0; i < sizeof(expected) / sizeof(expected[0]); ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(output[i], expected[i]);
   }
 }
 
-}  // namespace
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_test.cc b/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_test.cc
index 16140564879305de86947044f8b8efd055a4793c..ba864c427ced36748167c9412fe2966d72d3cb0e 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_test.cc
+++ b/tensorflow/lite/experimental/microfrontend/lib/noise_reduction_test.cc
@@ -15,17 +15,16 @@ limitations under the License.
 #include "tensorflow/lite/experimental/microfrontend/lib/noise_reduction.h"
 #include "tensorflow/lite/experimental/microfrontend/lib/noise_reduction_util.h"
 
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
 
 namespace {
 
 const int kNumChannels = 2;
 
 // Test noise reduction using default config values.
-class NoiseReductionTest : public ::testing::Test {
- protected:
-  NoiseReductionTest() {
+class NoiseReductionTestConfig {
+ public:
+  NoiseReductionTestConfig() {
     config_.smoothing_bits = 10;
     config_.even_smoothing = 0.025;
     config_.odd_smoothing = 0.06;
@@ -35,36 +34,48 @@ class NoiseReductionTest : public ::testing::Test {
   struct NoiseReductionConfig config_;
 };
 
-TEST_F(NoiseReductionTest, TestNoiseReductionEstimate) {
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(NoiseReductionTest_TestNoiseReductionEstimate) {
+  NoiseReductionTestConfig config;
   struct NoiseReductionState state;
-  ASSERT_TRUE(NoiseReductionPopulateState(&config_, &state, kNumChannels));
+  TF_LITE_MICRO_EXPECT(
+      NoiseReductionPopulateState(&config.config_, &state, kNumChannels));
 
   uint32_t signal[] = {247311, 508620};
   NoiseReductionApply(&state, signal);
 
   const uint32_t expected[] = {6321887, 31248341};
-  ASSERT_EQ(state.num_channels, sizeof(expected) / sizeof(expected[0]));
-  for (int i = 0; i < state.num_channels; ++i) {
-    EXPECT_EQ(state.estimate[i], expected[i]);
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels,
+                          sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < state.num_channels; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.estimate[i], expected[i]);
   }
 
   NoiseReductionFreeStateContents(&state);
 }
 
-TEST_F(NoiseReductionTest, TestNoiseReduction) {
+TF_LITE_MICRO_TEST(NoiseReductionTest_TestNoiseReduction) {
+  NoiseReductionTestConfig config;
   struct NoiseReductionState state;
-  ASSERT_TRUE(NoiseReductionPopulateState(&config_, &state, kNumChannels));
+  TF_LITE_MICRO_EXPECT(
+      NoiseReductionPopulateState(&config.config_, &state, kNumChannels));
 
   uint32_t signal[] = {247311, 508620};
   NoiseReductionApply(&state, signal);
 
   const uint32_t expected[] = {241137, 478104};
-  ASSERT_EQ(state.num_channels, sizeof(expected) / sizeof(expected[0]));
-  for (int i = 0; i < state.num_channels; ++i) {
-    EXPECT_EQ(signal[i], expected[i]);
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels,
+                          sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < state.num_channels; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(signal[i], expected[i]);
   }
 
   NoiseReductionFreeStateContents(&state);
 }
 
-}  // namespace
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control.c b/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control.c
index b49eb301370a7e95497478625a97333225a83341..8ccc2fde98c810bdf238edbf2f7a8d61b9e4f495 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control.c
+++ b/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control.c
@@ -47,7 +47,8 @@ uint32_t PcanShrink(const uint32_t x) {
 
 void PcanGainControlApply(struct PcanGainControlState* state,
                           uint32_t* signal) {
-  for (int i = 0; i < state->num_channels; ++i) {
+  int i;
+  for (i = 0; i < state->num_channels; ++i) {
     const uint32_t gain = WideDynamicFunction(state->noise_estimate[i],
                                               state->gain_lut);
     const uint32_t snr = ((uint64_t) signal[i] * gain) >> state->snr_shift;
diff --git a/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_test.cc b/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_test.cc
index 830db89edd8eb39fc68d24bfa4a61fe82ef3eace..93d7a8bcb94d5e0145b9ee701b413194f2946a7b 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_test.cc
+++ b/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_test.cc
@@ -15,8 +15,7 @@ limitations under the License.
 #include "tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control.h"
 #include "tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_util.h"
 
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
 
 namespace {
 
@@ -25,9 +24,9 @@ const int kSmoothingBits = 10;
 const int kCorrectionBits = -1;
 
 // Test pcan auto gain control using default config values.
-class PcanGainControlTest : public ::testing::Test {
- protected:
-  PcanGainControlTest() {
+class PcanGainControlTestConfig {
+ public:
+  PcanGainControlTestConfig() {
     config_.enable_pcan = 1;
     config_.strength = 0.95;
     config_.offset = 80.0;
@@ -37,23 +36,30 @@ class PcanGainControlTest : public ::testing::Test {
   struct PcanGainControlConfig config_;
 };
 
-TEST_F(PcanGainControlTest, TestPcanGainControl) {
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(PcanGainControlTest_TestPcanGainControl) {
   uint32_t estimate[] = {6321887, 31248341};
+  PcanGainControlTestConfig config;
   struct PcanGainControlState state;
-  ASSERT_TRUE(PcanGainControlPopulateState(&config_, &state, estimate,
-                                           kNumChannels, kSmoothingBits,
-                                           kCorrectionBits));
+  TF_LITE_MICRO_EXPECT(PcanGainControlPopulateState(
+      &config.config_, &state, estimate, kNumChannels, kSmoothingBits,
+      kCorrectionBits));
 
   uint32_t signal[] = {241137, 478104};
   PcanGainControlApply(&state, signal);
 
   const uint32_t expected[] = {3578, 1533};
-  ASSERT_EQ(state.num_channels, sizeof(expected) / sizeof(expected[0]));
-  for (int i = 0; i < state.num_channels; ++i) {
-    EXPECT_EQ(signal[i], expected[i]);
+  TF_LITE_MICRO_EXPECT_EQ(state.num_channels,
+                          sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < state.num_channels; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(signal[i], expected[i]);
   }
 
   PcanGainControlFreeStateContents(&state);
 }
 
-}  // namespace
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_util.c b/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_util.c
index dbe44c494ae07fb8c356723287cb32bf63381d27..5201cf045b4d43738968cc27d34ec4b5fc896d4e 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_util.c
+++ b/tensorflow/lite/experimental/microfrontend/lib/pcan_gain_control_util.c
@@ -62,7 +62,8 @@ int PcanGainControlPopulateState(const struct PcanGainControlConfig* config,
   state->gain_lut[0] = PcanGainLookupFunction(config, input_bits, 0);
   state->gain_lut[1] = PcanGainLookupFunction(config, input_bits, 1);
   state->gain_lut -= 6;
-  for (int interval = 2; interval <= kWideDynamicFunctionBits; ++interval) {
+  int interval;
+  for (interval = 2; interval <= kWideDynamicFunctionBits; ++interval) {
     const uint32_t x0 = (uint32_t) 1 << (interval - 1);
     const uint32_t x1 = x0 + (x0 >> 1);
     const uint32_t x2 = (interval == kWideDynamicFunctionBits)
diff --git a/tensorflow/lite/experimental/microfrontend/lib/window_io.c b/tensorflow/lite/experimental/microfrontend/lib/window_io.c
index ed4ac5eb110c0f1358656ca9e1b79d6b37052258..d12cac2c85374f3a2465d59211d7ef44958d26af 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/window_io.c
+++ b/tensorflow/lite/experimental/microfrontend/lib/window_io.c
@@ -16,7 +16,8 @@ limitations under the License.
 
 void WindowWriteMemmapPreamble(FILE* fp, const struct WindowState* state) {
   fprintf(fp, "static int16_t window_coefficients[] = {\n");
-  for (int i = 0; i < state->size; ++i) {
+  int i;
+  for (i = 0; i < state->size; ++i) {
     fprintf(fp, "%d", state->coefficients[i]);
     if (i < state->size - 1) {
       fprintf(fp, ", ");
diff --git a/tensorflow/lite/experimental/microfrontend/lib/window_test.cc b/tensorflow/lite/experimental/microfrontend/lib/window_test.cc
index 8c6c19188d3e128e7bb3b1d007fff10ec271da95..cf9df523b8f5e540d47c8e6d3d42e37540f62ce0 100644
--- a/tensorflow/lite/experimental/microfrontend/lib/window_test.cc
+++ b/tensorflow/lite/experimental/microfrontend/lib/window_test.cc
@@ -15,8 +15,7 @@ limitations under the License.
 #include "tensorflow/lite/experimental/microfrontend/lib/window.h"
 #include "tensorflow/lite/experimental/microfrontend/lib/window_util.h"
 
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
 
 namespace {
 
@@ -29,9 +28,9 @@ const int16_t kFakeAudioData[] = {
     0, 32767, 0, -32768, 0, 32767, 0, -32768, 0, 32767, 0, -32768};
 
 // Test window function behaviors using default config values.
-class WindowTest : public ::testing::Test {
- protected:
-  WindowTest() {
+class WindowTestConfig {
+ public:
+  WindowTestConfig() {
     config_.size_ms = 25;
     config_.step_size_ms = 10;
   }
@@ -39,81 +38,98 @@ class WindowTest : public ::testing::Test {
   struct WindowConfig config_;
 };
 
-TEST_F(WindowTest, CheckCoefficients) {
+}  // namespace
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(WindowState_CheckCoefficients) {
+  WindowTestConfig config;
   struct WindowState state;
-  ASSERT_TRUE(WindowPopulateState(&config_, &state, kSampleRate));
+  TF_LITE_MICRO_EXPECT(
+      WindowPopulateState(&config.config_, &state, kSampleRate));
 
   const int16_t expected[] = {16,   144,  391,  743,  1176, 1664, 2177,
                               2681, 3145, 3541, 3843, 4032, 4096, 4032,
                               3843, 3541, 3145, 2681, 2177, 1664, 1176,
                               743,  391,  144,  16};
-  ASSERT_EQ(state.size, sizeof(expected) / sizeof(expected[0]));
-  for (int i = 0; i < state.size; ++i) {
-    EXPECT_EQ(state.coefficients[i], expected[i]);
+  TF_LITE_MICRO_EXPECT_EQ(state.size, sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < state.size; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.coefficients[i], expected[i]);
   }
 
   WindowFreeStateContents(&state);
 }
 
-TEST_F(WindowTest, CheckResidualInput) {
+TF_LITE_MICRO_TEST(WindowState_CheckResidualInput) {
+  WindowTestConfig config;
   struct WindowState state;
-  ASSERT_TRUE(WindowPopulateState(&config_, &state, kSampleRate));
+  TF_LITE_MICRO_EXPECT(
+      WindowPopulateState(&config.config_, &state, kSampleRate));
   size_t num_samples_read;
 
-  ASSERT_TRUE(WindowProcessSamples(
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
       &state, kFakeAudioData,
       sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read));
 
-  for (int i = kStepSamples; i < kWindowSamples; ++i) {
-    EXPECT_EQ(state.input[i - kStepSamples], kFakeAudioData[i]);
+  int i;
+  for (i = kStepSamples; i < kWindowSamples; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.input[i - kStepSamples], kFakeAudioData[i]);
   }
 
   WindowFreeStateContents(&state);
 }
 
-TEST_F(WindowTest, CheckOutputValues) {
+TF_LITE_MICRO_TEST(WindowState_CheckOutputValues) {
+  WindowTestConfig config;
   struct WindowState state;
-  ASSERT_TRUE(WindowPopulateState(&config_, &state, kSampleRate));
+  TF_LITE_MICRO_EXPECT(
+      WindowPopulateState(&config.config_, &state, kSampleRate));
   size_t num_samples_read;
 
-  ASSERT_TRUE(WindowProcessSamples(
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
       &state, kFakeAudioData,
       sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read));
 
   const int16_t expected[] = {
       0, 1151,   0, -5944, 0, 13311,  0, -21448, 0, 28327, 0, -32256, 0, 32255,
       0, -28328, 0, 21447, 0, -13312, 0, 5943,   0, -1152, 0};
-  ASSERT_EQ(state.size, sizeof(expected) / sizeof(expected[0]));
-  for (int i = 0; i < state.size; ++i) {
-    EXPECT_EQ(state.output[i], expected[i]);
+  TF_LITE_MICRO_EXPECT_EQ(state.size, sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < state.size; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.output[i], expected[i]);
   }
 
   WindowFreeStateContents(&state);
 }
 
-TEST_F(WindowTest, CheckMaxAbsValue) {
+TF_LITE_MICRO_TEST(WindowState_CheckMaxAbsValue) {
+  WindowTestConfig config;
   struct WindowState state;
-  ASSERT_TRUE(WindowPopulateState(&config_, &state, kSampleRate));
+  TF_LITE_MICRO_EXPECT(
+      WindowPopulateState(&config.config_, &state, kSampleRate));
   size_t num_samples_read;
 
-  ASSERT_TRUE(WindowProcessSamples(
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
       &state, kFakeAudioData,
       sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read));
 
-  EXPECT_EQ(state.max_abs_output_value, 32256);
+  TF_LITE_MICRO_EXPECT_EQ(state.max_abs_output_value, 32256);
 
   WindowFreeStateContents(&state);
 }
 
-TEST_F(WindowTest, CheckConsecutiveWindow) {
+TF_LITE_MICRO_TEST(WindowState_CheckConsecutiveWindow) {
+  WindowTestConfig config;
   struct WindowState state;
-  ASSERT_TRUE(WindowPopulateState(&config_, &state, kSampleRate));
+  TF_LITE_MICRO_EXPECT(
+      WindowPopulateState(&config.config_, &state, kSampleRate));
   size_t num_samples_read;
 
-  ASSERT_TRUE(WindowProcessSamples(
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
       &state, kFakeAudioData,
       sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read));
-  ASSERT_TRUE(WindowProcessSamples(
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
       &state, kFakeAudioData + kWindowSamples,
       sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) - kWindowSamples,
       &num_samples_read));
@@ -121,37 +137,41 @@ TEST_F(WindowTest, CheckConsecutiveWindow) {
   const int16_t expected[] = {
       0, -1152, 0, 5943,   0, -13312, 0, 21447, 0, -28328, 0, 32255, 0, -32256,
       0, 28327, 0, -21448, 0, 13311,  0, -5944, 0, 1151,   0};
-  ASSERT_EQ(state.size, sizeof(expected) / sizeof(expected[0]));
-  for (int i = 0; i < state.size; ++i) {
-    EXPECT_EQ(state.output[i], expected[i]);
+  TF_LITE_MICRO_EXPECT_EQ(state.size, sizeof(expected) / sizeof(expected[0]));
+  int i;
+  for (i = 0; i < state.size; ++i) {
+    TF_LITE_MICRO_EXPECT_EQ(state.output[i], expected[i]);
   }
 
   WindowFreeStateContents(&state);
 }
 
-TEST_F(WindowTest, CheckNotEnoughSamples) {
+TF_LITE_MICRO_TEST(WindowState_CheckNotEnoughSamples) {
+  WindowTestConfig config;
   struct WindowState state;
-  ASSERT_TRUE(WindowPopulateState(&config_, &state, kSampleRate));
+  TF_LITE_MICRO_EXPECT(
+      WindowPopulateState(&config.config_, &state, kSampleRate));
   size_t num_samples_read;
 
-  ASSERT_TRUE(WindowProcessSamples(
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
       &state, kFakeAudioData,
       sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]), &num_samples_read));
-  ASSERT_TRUE(WindowProcessSamples(
+  TF_LITE_MICRO_EXPECT(WindowProcessSamples(
       &state, kFakeAudioData + kWindowSamples,
       sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) - kWindowSamples,
       &num_samples_read));
-  ASSERT_FALSE(WindowProcessSamples(
-      &state, kFakeAudioData + kWindowSamples + kStepSamples,
-      sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) - kWindowSamples -
-          kStepSamples,
-      &num_samples_read));
-
-  EXPECT_EQ(
+  TF_LITE_MICRO_EXPECT_EQ(
+      false, WindowProcessSamples(
+                 &state, kFakeAudioData + kWindowSamples + kStepSamples,
+                 sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) -
+                     kWindowSamples - kStepSamples,
+                 &num_samples_read));
+
+  TF_LITE_MICRO_EXPECT_EQ(
       state.input_used,
       sizeof(kFakeAudioData) / sizeof(kFakeAudioData[0]) - 2 * kStepSamples);
 
   WindowFreeStateContents(&state);
 }
 
-}  // namespace
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/microfrontend/ops/audio_microfrontend_op.cc b/tensorflow/lite/experimental/microfrontend/ops/audio_microfrontend_op.cc
index 51094a976d297af8e807ae4f828702ace9a9306a..9f2ea7eee638285edd7c70fb1f91d868a1811790 100644
--- a/tensorflow/lite/experimental/microfrontend/ops/audio_microfrontend_op.cc
+++ b/tensorflow/lite/experimental/microfrontend/ops/audio_microfrontend_op.cc
@@ -250,7 +250,8 @@ class AudioMicrofrontendOp : public OpKernel {
 
       if (output.values != nullptr) {
         frame_buffer[frame_index].reserve(output.size);
-        for (int i = 0; i < output.size; ++i) {
+        int i;
+        for (i = 0; i < output.size; ++i) {
           frame_buffer[frame_index].push_back(static_cast<T>(output.values[i]) /
                                               out_scale_);
         }
@@ -261,9 +262,10 @@ class AudioMicrofrontendOp : public OpKernel {
 
     int index = 0;
     std::vector<T> pad(config_.filterbank.num_channels, 0);
-    for (int anchor = 0; anchor < frame_buffer.size();
-         anchor += frame_stride_) {
-      for (int frame = anchor - left_context_; frame <= anchor + right_context_;
+    int anchor;
+    for (anchor = 0; anchor < frame_buffer.size(); anchor += frame_stride_) {
+      int frame;
+      for (frame = anchor - left_context_; frame <= anchor + right_context_;
            ++frame) {
         std::vector<T>* feature;
         if (zero_padding_ && (frame < 0 || frame >= frame_buffer.size())) {
diff --git a/tensorflow/lite/experimental/objc/BUILD.apple b/tensorflow/lite/experimental/objc/BUILD.apple
new file mode 100644
index 0000000000000000000000000000000000000000..e53de173b2b89c4f6869d27b6c966c16bbae72fe
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/BUILD.apple
@@ -0,0 +1,109 @@
+# TensorFlow Lite Objective-C API.
+
+package(default_visibility = ["//visibility:private"])
+
+licenses(["notice"])  # Apache 2.0
+
+load("@build_bazel_rules_apple//apple:ios.bzl", "ios_unit_test")
+
+SOURCES = glob([
+    "sources/*.h",
+    "sources/*.m",
+    "sources/*.mm",
+])
+
+API_HEADERS = glob([
+    "apis/*.h",
+])
+
+MINIMUM_OS_VERSION = "9.0"
+
+# Compiler flags for building regular non-test libraries.
+RELEASE_COPTS = [
+    # Enables language-specific warnings for Objective-C, Objective-C++, C, and C++.
+    "-Wall",
+    # Warns if functions, variables, and types marked with the deprecated attribute are being used.
+    "-Wdeprecated-declarations",
+    # Warns for errors in documentation.
+    "-Wdocumentation",
+    # Turns all warnings into errors.
+    "-Werror",
+    # Enables extra warning flags that are not enabled by -Wall.
+    "-Wextra",
+    # Warns if a global function is defined without a previous prototype declaration.
+    "-Wmissing-prototypes",
+    # From -Wextra. Disables warning when signed value is converted to unsigned value during comparison.
+    "-Wno-sign-compare",
+    # From -Wextra. Disables warning for unused parameters, which are common in delegate methods and block callbacks.
+    "-Wno-unused-parameter",
+    # Warns if a global or local variable or type declaration shadows another variable, parameter, type, class member, or instance variable.
+    "-Wshadow",
+    # Warns if a function is declared or defined without specifying the argument types. For a block with no args, use (void) instead of ().
+    "-Wstrict-prototypes",
+    # Warns if an @selector() expression is encountered with a method name that hasn't been defined yet.
+    "-Wundeclared-selector",
+    # Turn off warnings for headers not part of TensorFlow Lite Objective-C API.
+    "--system-header-prefix=tensorflow/lite/experimental/c/",
+]
+
+# Compiler flags for building test libraries.
+TEST_COPTS = RELEASE_COPTS + [
+    # From -Wall. Disables warning when passing nil to a callee that requires a non-null argument.
+    "-Wno-nonnull",
+    # Disables warning when a global or local variable or type declaration shadows another.
+    "-Wno-shadow",
+]
+
+# Default tags for filtering targets. Targets in this file are restricted to Apple platforms.
+DEFAULT_TAGS = [
+    "apple",
+    "manual",
+]
+
+objc_library(
+    name = "TensorFlowLite",
+    srcs = SOURCES,
+    hdrs = API_HEADERS,
+    copts = RELEASE_COPTS,
+    tags = DEFAULT_TAGS,
+    deps = [
+        "//tensorflow/lite/experimental/c:c_api",
+    ],
+    alwayslink = 1,
+)
+
+ios_unit_test(
+    name = "TensorFlowLiteTests",
+    size = "small",
+    minimum_os_version = MINIMUM_OS_VERSION,
+    tags = DEFAULT_TAGS + [
+        # These sanitizer tests are not supported by iOS build toolchain (b/74292221).
+        # Disabled these for iOS test targets.
+        "noasan",
+        "notsan",
+        "nomsan",
+    ],
+    deps = [":TensorFlowLiteTestsLib"],
+)
+
+objc_library(
+    name = "TensorFlowLiteTestsLib",
+    testonly = 1,
+    srcs = glob([
+        "tests/*.m",
+    ]),
+    hdrs = glob([
+        "apis/*.h",
+        "sources/*.h",
+        "tests/*.h",
+    ]),
+    copts = TEST_COPTS,
+    data = [
+        "//tensorflow/lite:testdata/add.bin",
+        "//tensorflow/lite:testdata/add_quantized.bin",
+    ],
+    tags = DEFAULT_TAGS,
+    deps = [
+        ":TensorFlowLite",
+    ],
+)
diff --git a/tensorflow/lite/experimental/objc/README.md b/tensorflow/lite/experimental/objc/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..2940e0524080934147c7ee9044df0098d29c496b
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/README.md
@@ -0,0 +1,52 @@
+# TensorFlow Lite Objective-C Library
+
+[TensorFlow Lite](https://www.tensorflow.org/lite/) is TensorFlow's lightweight
+solution for Objective-C developers. It enables low-latency inference of
+on-device machine learning models with a small binary size and fast performance
+supporting hardware acceleration.
+
+## Getting Started
+
+### Bazel
+
+In your `BUILD` file, add the `TensorFlowLite` dependency:
+
+```python
+objc_library(
+  deps = [
+      "//tensorflow/lite/experimental/objc:TensorFlowLite",
+  ],
+)
+```
+
+If you would like to build the Objective-C TensorFlow Lite library using Bazel on Apple
+platforms, clone or download the [TensorFlow GitHub repo](https://github.com/tensorflow/tensorflow),
+then navigate to the root `tensorflow` directory and execute the `configure.py` script:
+
+```shell
+python configure.py
+```
+
+Follow the prompts and when asked to configure the Bazel rules for Apple
+platforms, enter `y`.
+
+Build the `TensorFlowLite` Objective-C library target:
+
+```shell
+bazel build tensorflow/lite/experimental/objc:TensorFlowLite
+```
+
+Build the `TensorFlowLiteTests` target:
+
+```shell
+bazel test tensorflow/lite/experimental/objc:TensorFlowLiteTests
+```
+
+### Tulsi
+
+Open the `TensorFlowLiteObjc.tulsiproj` using the Tulsi application on Mac or by
+running the following command in Terminal from the root source directory:
+
+```shell
+generate_xcodeproj.sh --genconfig tensorflow/lite/experimental/objc/TensorFlowLiteObjc.tulsiproj:TensorFlowLiteObjC --outputfolder ~/path/to/xcodeproj
+```
diff --git a/tensorflow/lite/experimental/objc/TensorFlowLiteObjc.tulsiproj/Configs/TensorFlowLiteObjc.tulsigen b/tensorflow/lite/experimental/objc/TensorFlowLiteObjc.tulsiproj/Configs/TensorFlowLiteObjc.tulsigen
new file mode 100644
index 0000000000000000000000000000000000000000..091ef4e2ea8b6ea0476bac71a6cfea25104706d1
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/TensorFlowLiteObjc.tulsiproj/Configs/TensorFlowLiteObjc.tulsigen
@@ -0,0 +1,60 @@
+{
+  "sourceFilters" : [
+    "tensorflow/lite",
+    "tensorflow/lite/experimental/c",
+    "tensorflow/lite/experimental/objc",
+    "tensorflow/lite/experimental/objc/apis",
+    "tensorflow/lite/experimental/objc/sources",
+    "tensorflow/lite/experimental/objc/tests",
+    "tensorflow/lite/kernels",
+    "tensorflow/lite/kernels/internal",
+    "tensorflow/lite/nnapi",
+    "tensorflow/lite/schema",
+  ],
+  "buildTargets" : [
+    "//tensorflow/lite/experimental/objc:TensorFlowLite",
+    "//tensorflow/lite/experimental/objc:TensorFlowLiteTests",
+  ],
+  "projectName" : "TensorFlowLiteObjC",
+  "optionSet" : {
+    "LaunchActionPreActionScript" : {
+      "p" : "$(inherited)"
+    },
+    "BazelBuildStartupOptionsRelease" : {
+      "p" : "$(inherited)"
+    },
+    "BazelBuildOptionsRelease" : {
+      "p" : "$(inherited)"
+    },
+    "BazelBuildOptionsDebug" : {
+      "p" : "$(inherited)"
+    },
+    "EnvironmentVariables" : {
+      "p" : "$(inherited)"
+    },
+    "BuildActionPreActionScript" : {
+      "p" : "$(inherited)"
+    },
+    "CommandlineArguments" : {
+      "p" : "$(inherited)"
+    },
+    "TestActionPreActionScript" : {
+      "p" : "$(inherited)"
+    },
+    "BazelBuildStartupOptionsDebug" : {
+      "p" : "$(inherited)"
+    },
+    "BuildActionPostActionScript" : {
+      "p" : "$(inherited)"
+    },
+    "TestActionPostActionScript" : {
+      "p" : "$(inherited)"
+    },
+    "LaunchActionPostActionScript" : {
+      "p" : "$(inherited)"
+    }
+  },
+  "additionalFilePaths" : [
+    "tensorflow/lite/experimental/objc/BUILD",
+  ]
+}
diff --git a/tensorflow/lite/experimental/objc/TensorFlowLiteObjc.tulsiproj/project.tulsiconf b/tensorflow/lite/experimental/objc/TensorFlowLiteObjc.tulsiproj/project.tulsiconf
new file mode 100644
index 0000000000000000000000000000000000000000..0b6fedff3f66d1faf3894604f756c45f8dce9547
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/TensorFlowLiteObjc.tulsiproj/project.tulsiconf
@@ -0,0 +1,17 @@
+{
+  "configDefaults" : {
+    "optionSet" : {
+      "BazelBuildOptionsDebug" : {
+
+      },
+      "BazelBuildOptionsRelease" : {
+
+      },
+    }
+  },
+  "projectName" : "TensorFlowLiteObjC",
+  "packages" : [
+    "tensorflow/lite/experimental/objc"
+  ],
+  "workspaceRoot" : "../../../../.."
+}
diff --git a/tensorflow/lite/experimental/objc/apis/TFLInterpreter.h b/tensorflow/lite/experimental/objc/apis/TFLInterpreter.h
new file mode 100644
index 0000000000000000000000000000000000000000..3c06a4bc82f752baabdb6db100ee96e9ce29d29f
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/apis/TFLInterpreter.h
@@ -0,0 +1,179 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import <Foundation/Foundation.h>
+
+@class TFLInterpreterOptions;
+@class TFLTensor;
+
+NS_ASSUME_NONNULL_BEGIN
+
+/**
+ * @enum TFLInterpreterErrorCode
+ * This enum specifies various error codes related to `TFLInterpreter`.
+ */
+typedef NS_ENUM(NSUInteger, TFLInterpreterErrorCode) {
+  /** Provided tensor index is invalid. */
+  TFLInterpreterErrorCodeInvalidTensorIndex,
+
+  /** Input data has invalid byte size. */
+  TFLInterpreterErrorCodeInvalidInputByteSize,
+
+  /** Provided shape is invalid. It must be a non-empty array of positive unsigned integers. */
+  TFLInterpreterErrorCodeInvalidShape,
+
+  /** Provided model cannot be loaded. */
+  TFLInterpreterErrorCodeFailedToLoadModel,
+
+  /** Failed to create `TFLInterpreter`. */
+  TFLInterpreterErrorCodeFailedToCreateInterpreter,
+
+  /** Failed to invoke `TFLInterpreter`. */
+  TFLInterpreterErrorCodeFailedToInvoke,
+
+  /** Failed to retrieve a tensor. */
+  TFLInterpreterErrorCodeFailedToGetTensor,
+
+  /** Invalid tensor. */
+  TFLInterpreterErrorCodeInvalidTensor,
+
+  /** Failed to resize an input tensor. */
+  TFLInterpreterErrorCodeFailedToResizeInputTensor,
+
+  /** Failed to copy data into an input tensor. */
+  TFLInterpreterErrorCodeFailedToCopyDataToInputTensor,
+
+  /** Copying data into an output tensor not allowed. */
+  TFLInterpreterErrorCodeCopyDataToOutputTensorNotAllowed,
+
+  /** Failed to get data from a tensor. */
+  TFLInterpreterErrorCodeFailedToGetDataFromTensor,
+
+  /** Failed to allocate memory for tensors. */
+  TFLInterpreterErrorCodeFailedToAllocateTensors,
+
+  /** Operaton not allowed without allocating memory for tensors first. */
+  TFLInterpreterErrorCodeAllocateTensorsRequired,
+
+  /** Operaton not allowed without invoking the interpreter first. */
+  TFLInterpreterErrorCodeInvokeInterpreterRequired,
+};
+
+/**
+ * A TensorFlow Lite model interpreter.
+ */
+@interface TFLInterpreter : NSObject
+
+/** The total number of input tensors. 0 if the interpreter creation failed. */
+@property(nonatomic, readonly) NSUInteger inputTensorCount;
+
+/** The total number of output tensors. 0 if the interpreter creation failed. */
+@property(nonatomic, readonly) NSUInteger outputTensorCount;
+
+/** Unavailable. */
+- (instancetype)init NS_UNAVAILABLE;
+
+/**
+ * Initializes a new TensorFlow Lite interpreter instance with the given model file path and the
+ * default interpreter options.
+ *
+ * @param modelPath An absolute path to a TensorFlow Lite model file stored locally on the device.
+ * @param error An optional error parameter populated when there is an error in initializing the
+ *     interpreter.
+ *
+ * @return A new instance of `TFLInterpreter` with the given model and the default interpreter
+ *     options. `nil` if there is an error in initializing the interpreter.
+ */
+- (nullable instancetype)initWithModelPath:(NSString *)modelPath error:(NSError **)error;
+
+/**
+ * Initializes a new TensorFlow Lite interpreter instance with the given model file path and
+ * options.
+ *
+ * @param modelPath An absolute path to a TensorFlow Lite model file stored locally on the device.
+ * @param options Options to use for configuring the TensorFlow Lite interpreter.
+ * @param error An optional error parameter populated when there is an error in initializing the
+ *     interpreter.
+ *
+ * @return A new instance of `TFLInterpreter` with the given model and options. `nil` if there is an
+ *     error in initializing the interpreter.
+ */
+- (nullable instancetype)initWithModelPath:(NSString *)modelPath
+                                   options:(TFLInterpreterOptions *)options
+                                     error:(NSError **)error NS_DESIGNATED_INITIALIZER;
+
+/**
+ * Invokes the interpreter to run inference.
+ *
+ * @param error An optional error parameter populated when there is an error in invoking the
+ *     interpreter.
+ *
+ * @return Whether the invocation is successful. Returns NO if an error occurred.
+ */
+- (BOOL)invokeWithError:(NSError **)error;
+
+/**
+ * Returns the input tensor at the given index.
+ *
+ * @param index The index of an input tensor.
+ * @param error An optional error parameter populated when there is an error in looking up the input
+ *     tensor.
+ *
+ * @return The input tensor at the given index. `nil` if there is an error. See the `TFLTensor`
+ *     class documentation for more details on the life expectancy between the returned tensor and
+ *     this interpreter.
+ */
+- (nullable TFLTensor *)inputTensorAtIndex:(NSUInteger)index error:(NSError **)error;
+
+/**
+ * Returns the output tensor at the given index.
+ *
+ * @param index The index of an output tensor.
+ * @param error An optional error parameter populated when there is an error in looking up the
+ *     output tensor.
+ *
+ * @return The output tensor at the given index. `nil` if there is an error. See the `TFLTensor`
+ *     class documentation for more details on the life expectancy between the returned tensor and
+ *     this interpreter.
+ */
+- (nullable TFLTensor *)outputTensorAtIndex:(NSUInteger)index error:(NSError **)error;
+
+/**
+ * Resizes the input tensor at the given index to the specified shape (an array of positive unsigned
+ * integers).
+ *
+ * @param index The index of an input tensor.
+ * @param shape Shape that the given input tensor should be resized to. It should be an array of
+ *     positive unsigned integer(s) containing the size of each dimension.
+ * @param error An optional error parameter populated when there is an error in resizing the input
+ *     tensor.
+ *
+ * @return Whether the input tensor was resized successfully. Returns NO if an error occurred.
+ */
+- (BOOL)resizeInputTensorAtIndex:(NSUInteger)index
+                         toShape:(NSArray<NSNumber *> *)shape
+                           error:(NSError **)error;
+
+/**
+ * Allocates memory for tensors.
+ *
+ * @param error An optional error parameter populated when there is an error in allocating memory.
+ *
+ * @return Whether memory allocation is successful. Returns NO if an error occurred.
+ */
+- (BOOL)allocateTensorsWithError:(NSError **)error;
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/apis/TFLInterpreterOptions.h b/tensorflow/lite/experimental/objc/apis/TFLInterpreterOptions.h
new file mode 100644
index 0000000000000000000000000000000000000000..6461fbf0178b1e72afb81e91d58109a2d7b0226b
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/apis/TFLInterpreterOptions.h
@@ -0,0 +1,37 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import <Foundation/Foundation.h>
+
+NS_ASSUME_NONNULL_BEGIN
+
+/** Custom configuration options for a TensorFlow Lite interpreter. */
+@interface TFLInterpreterOptions : NSObject
+
+/**
+ * Maximum number of threads that the interpreter should run on. Defaults to 0 (unspecified, letting
+ * TensorFlow Lite to optimize the threading decision).
+ */
+@property(nonatomic) NSUInteger numberOfThreads;
+
+/**
+ * Initializes a new instance of `TFLInterpreterOptions`.
+ *
+ * @return A new instance of `TFLInterpreterOptions`.
+ */
+- (instancetype)init NS_DESIGNATED_INITIALIZER;
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/apis/TFLQuantizationParameters.h b/tensorflow/lite/experimental/objc/apis/TFLQuantizationParameters.h
new file mode 100644
index 0000000000000000000000000000000000000000..3d5cf793c5bed984debe3a36fdec4f0945cd7c64
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/apis/TFLQuantizationParameters.h
@@ -0,0 +1,36 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import <Foundation/Foundation.h>
+
+NS_ASSUME_NONNULL_BEGIN
+
+/**
+ * Parameters for asymmetric quantization. Quantized values can be converted to float values using:
+ * `realValue = scale * (quantizedValue - zeroPoint)`.
+ */
+@interface TFLQuantizationParameters : NSObject
+
+/** Scale of asymmetric quantization. */
+@property(nonatomic, readonly) float scale;
+
+/** Zero point of asymmetric quantization. */
+@property(nonatomic, readonly) int32_t zeroPoint;
+
+/** Unavailable. */
+- (instancetype)init NS_UNAVAILABLE;
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/apis/TFLTensor.h b/tensorflow/lite/experimental/objc/apis/TFLTensor.h
new file mode 100644
index 0000000000000000000000000000000000000000..dc710abf4e2ea99126be2fb359412287f3c37a33
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/apis/TFLTensor.h
@@ -0,0 +1,111 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import <Foundation/Foundation.h>
+
+@class TFLQuantizationParameters;
+
+NS_ASSUME_NONNULL_BEGIN
+
+/**
+ * @enum TFLTensorDataType
+ * This enum specifies supported TensorFlow Lite tensor data types.
+ */
+typedef NS_ENUM(NSUInteger, TFLTensorDataType) {
+  /** Tensor data type not available. This indicates an error with the model. */
+  TFLTensorDataTypeNoType,
+
+  /** 32-bit single precision floating point. */
+  TFLTensorDataTypeFloat32,
+
+  /** 32-bit signed integer. */
+  TFLTensorDataTypeInt32,
+
+  /** 8-bit unsigned integer. */
+  TFLTensorDataTypeUInt8,
+
+  /** 64-bit signed integer. */
+  TFLTensorDataTypeInt64,
+
+  /** Boolean. */
+  TFLTensorDataTypeBool,
+
+  /** 16-bit signed integer. */
+  TFLTensorDataTypeInt16,
+
+  /** 8-bit signed integer. */
+  TFLTensorDataTypeInt8,
+};
+
+/**
+ * An input or output tensor in a TensorFlow Lite model.
+ *
+ * @warning Each `TFLTensor` instance is associated with a `TFLInterpreter` instance. Multiple
+ *     `TFLTensor` instances of the same TensorFlow Lite model are associated with the same
+ *     `TFLInterpreter` instance. As long as a `TFLTensor` instance is still in use, its associated
+ *     `TFLInterpreter` instance will not be deallocated.
+ */
+@interface TFLTensor : NSObject
+
+/** Name of the tensor. */
+@property(nonatomic, readonly, copy) NSString *name;
+
+/** Data type of the tensor. */
+@property(nonatomic, readonly) TFLTensorDataType dataType;
+
+/** Parameters for asymmetric quantization. `nil` if the tensor does not use quantization. */
+@property(nonatomic, readonly, nullable) TFLQuantizationParameters *quantizationParameters;
+
+/** Unavailable. */
+- (instancetype)init NS_UNAVAILABLE;
+
+/**
+ * Copies the given data into an input tensor. This is allowed only for an input tensor and only
+ * before the interpreter is invoked; otherwise an error will be returned.
+ *
+ * @param data The data to set. The byte size of the data must match what's required by the input
+ *     tensor.
+ * @param error An optional error parameter populated when there is an error in copying the data.
+ *
+ * @return Whether the data was copied into the input tensor successfully. Returns NO if an error
+ *     occurred.
+ */
+- (BOOL)copyData:(NSData *)data error:(NSError **)error;
+
+/**
+ * Retrieves a copy of data in the tensor. For an output tensor, the data is only available after
+ * the interpreter invocation has successfully completed; otherwise an error will be returned.
+ *
+ * @param error An optional error parameter populated when there is an error in retrieving the data.
+ *
+ * @return A copy of data in the tensor. `nil` if there is an error in retrieving the data or the
+ *     data is not available.
+ */
+- (nullable NSData *)dataWithError:(NSError **)error;
+
+/**
+ * Retrieves the shape of the tensor, an array of positive unsigned integers containing the size
+ * of each dimension. For example: the shape of [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]] is
+ * [2, 2, 3] (i.e. an array of 2 arrays of 2 arrays of 3 numbers).
+ *
+ * @param error An optional error parameter populated when there is an error in retrieving the
+ *     shape.
+ *
+ * @return The shape of the tensor. `nil` if there is an error in retrieving the shape.
+ */
+- (nullable NSArray<NSNumber *> *)shapeWithError:(NSError **)error;
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/sources/TFLErrorUtil.h b/tensorflow/lite/experimental/objc/sources/TFLErrorUtil.h
new file mode 100644
index 0000000000000000000000000000000000000000..ce8d50c896e6d5716308c7dc8818258fa38dbd72
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/sources/TFLErrorUtil.h
@@ -0,0 +1,40 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import <Foundation/Foundation.h>
+
+#import "tensorflow/lite/experimental/objc/apis/TFLInterpreter.h"
+
+NS_ASSUME_NONNULL_BEGIN
+
+/** Helper utility for error reporting. */
+@interface TFLErrorUtil : NSObject
+
+/**
+ * Creates and saves an interpreter error with the given error code and description.
+ *
+ * @param code Error code.
+ * @param description Error description.
+ * @param error Pointer to where to save the created error. If `nil`, no error will be saved.
+ */
++ (void)saveInterpreterErrorWithCode:(TFLInterpreterErrorCode)code
+                         description:(NSString *)description
+                               error:(NSError **)error;
+
+/** Unavailable. */
+- (instancetype)init NS_UNAVAILABLE;
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/sources/TFLErrorUtil.m b/tensorflow/lite/experimental/objc/sources/TFLErrorUtil.m
new file mode 100644
index 0000000000000000000000000000000000000000..aa973c780060f4fa67573ff1e224ab0aed2bc92b
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/sources/TFLErrorUtil.m
@@ -0,0 +1,38 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import "TFLErrorUtil.h"
+
+NS_ASSUME_NONNULL_BEGIN
+
+/** Error domain of TensorFlow Lite interpreter related errors. */
+static NSString *const TFLInterpreterErrorDomain = @"org.tensorflow.lite.interpreter";
+
+@implementation TFLErrorUtil
+
+#pragma mark - Public
+
++ (void)saveInterpreterErrorWithCode:(TFLInterpreterErrorCode)code
+                         description:(NSString *)description
+                               error:(NSError **)error {
+  if (error) {
+    *error = [NSError errorWithDomain:TFLInterpreterErrorDomain
+                                 code:code
+                             userInfo:@{NSLocalizedDescriptionKey : description}];
+  }
+}
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/sources/TFLInterpreter+Internal.h b/tensorflow/lite/experimental/objc/sources/TFLInterpreter+Internal.h
new file mode 100644
index 0000000000000000000000000000000000000000..9b900c4f050451061a5d1a02b8be4dc51cade175
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/sources/TFLInterpreter+Internal.h
@@ -0,0 +1,63 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import "tensorflow/lite/experimental/objc/apis/TFLInterpreter.h"
+
+@class TFLTensor;
+
+NS_ASSUME_NONNULL_BEGIN
+
+@interface TFLInterpreter (Internal)
+
+/**
+ * Copies the given data into the input tensor at the given index. This is allowed only before the
+ * interpreter is invoked.
+ *
+ * @param data The data to set. The byte size of the data must match what's required by the input
+ *     tensor at the given index.
+ * @param index An input tensor index.
+ * @param error An optional error parameter populated when there is an error in setting the data.
+ *
+ * @return Whether the data was copied into the input tensor at the given index successfully.
+ *     Returns NO if an error occurred.
+ */
+- (BOOL)copyData:(NSData *)data toInputTensorAtIndex:(NSUInteger)index error:(NSError **)error;
+
+/**
+ * Retrieves a copy of the data from the given tensor. For an output tensor, the interpreter
+ * invocation has to complete before the data can be retrieved.
+ *
+ * @param tensor A tensor.
+ * @param error An optional error parameter populated when there is an error in getting the data.
+ *
+ * @return The data of the given tensor. `nil` if there is an error or data is not available.
+ */
+- (nullable NSData *)dataFromTensor:(TFLTensor *)tensor error:(NSError **)error;
+
+/**
+ * Retrieves the shape of the given tensor, an array of positive unsigned integer(s) containing the
+ * size of each dimension. For example: shape of [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]] is
+ * [2, 2, 3].
+ *
+ * @param tensor An input or output tensor.
+ * @param error An optional error parameter populated when there is an error in retrieving the
+ *     shape.
+ *
+ * @return The shape of the tensor. `nil` if there is an error in retrieving the shape.
+ */
+- (nullable NSArray<NSNumber *> *)shapeOfTensor:(TFLTensor *)tensor error:(NSError **)error;
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/sources/TFLInterpreter.mm b/tensorflow/lite/experimental/objc/sources/TFLInterpreter.mm
new file mode 100644
index 0000000000000000000000000000000000000000..a8ca982f6dd619f9a01bd67cc028ee6fb583a75d
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/sources/TFLInterpreter.mm
@@ -0,0 +1,407 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import "tensorflow/lite/experimental/objc/apis/TFLInterpreter.h"
+
+#import "TFLErrorUtil.h"
+#import "TFLQuantizationParameters+Internal.h"
+#import "TFLTensor+Internal.h"
+#import "tensorflow/lite/experimental/objc/apis/TFLInterpreterOptions.h"
+#import "tensorflow/lite/experimental/objc/apis/TFLTensor.h"
+
+#include "tensorflow/lite/experimental/c/c_api.h"
+
+NS_ASSUME_NONNULL_BEGIN
+
+/**
+ * Error reporter for TFLInterpreter.
+ *
+ * @param user_data User data. Not used.
+ * @param format Error message which may contain argument formatting specifiers.
+ * @param args Values of the arguments in the error message.
+ */
+static void TFLInterpreterErrorReporter(void *user_data, const char *format, va_list args) {
+  NSLog(@"%@", [[NSString alloc] initWithFormat:@(format) arguments:args]);
+}
+
+@interface TFLInterpreter ()
+
+/** TFL_Interpreter backed by C API. */
+@property(nonatomic, nullable) TFL_Interpreter *interpreter;
+
+@end
+
+@implementation TFLInterpreter
+
+#pragma mark - NSObject
+
+- (void)dealloc {
+  TFL_DeleteInterpreter(_interpreter);
+}
+
+#pragma mark - Public
+
+- (nullable instancetype)initWithModelPath:(NSString *)modelPath error:(NSError **)error {
+  return [self initWithModelPath:modelPath
+                         options:[[TFLInterpreterOptions alloc] init]
+                           error:error];
+}
+
+- (nullable instancetype)initWithModelPath:(NSString *)modelPath
+                                   options:(TFLInterpreterOptions *)options
+                                     error:(NSError **)error {
+  self = [super init];
+
+  if (self != nil) {
+    TFL_Model *model = nullptr;
+    TFL_InterpreterOptions *cOptions = nullptr;
+
+    @try {
+      const char *modelPathCString = modelPath.UTF8String;
+      NSString *pathErrorString =
+          [NSString stringWithFormat:@"Cannot load model from path (%@).", modelPath];
+      if (modelPathCString == nullptr) {
+        [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeFailedToLoadModel
+                                       description:pathErrorString
+                                             error:error];
+        return nil;
+      }
+
+      model = TFL_NewModelFromFile(modelPathCString);
+      if (model == nullptr) {
+        [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeFailedToLoadModel
+                                       description:pathErrorString
+                                             error:error];
+        return nil;
+      }
+
+      cOptions = TFL_NewInterpreterOptions();
+      if (cOptions == nullptr) {
+        [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeFailedToCreateInterpreter
+                                       description:@"Failed to create the interpreter."
+                                             error:error];
+        return nil;
+      }
+
+      if (options.numberOfThreads > 0) {
+        TFL_InterpreterOptionsSetNumThreads(cOptions, (int32_t)options.numberOfThreads);
+      }
+      TFL_InterpreterOptionsSetErrorReporter(cOptions, TFLInterpreterErrorReporter, nullptr);
+
+      _interpreter = TFL_NewInterpreter(model, cOptions);
+      if (_interpreter == nullptr) {
+        [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeFailedToCreateInterpreter
+                                       description:@"Failed to create the interpreter."
+                                             error:error];
+        return nil;
+      }
+
+      _inputTensorCount = (NSUInteger)TFL_InterpreterGetInputTensorCount(_interpreter);
+      _outputTensorCount = (NSUInteger)TFL_InterpreterGetOutputTensorCount(_interpreter);
+      if (_inputTensorCount <= 0 || _outputTensorCount <= 0) {
+        [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeFailedToCreateInterpreter
+                                       description:@"Failed to create the interpreter."
+                                             error:error];
+        return nil;
+      }
+    } @finally {
+      TFL_DeleteInterpreterOptions(cOptions);
+      TFL_DeleteModel(model);
+    }
+  }
+
+  return self;
+}
+
+- (BOOL)invokeWithError:(NSError **)error {
+  if (TFL_InterpreterInvoke(self.interpreter) != kTfLiteOk) {
+    [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeFailedToInvoke
+                                   description:@"Failed to invoke the interpreter."
+                                         error:error];
+    return NO;
+  }
+
+  return YES;
+}
+
+- (nullable TFLTensor *)inputTensorAtIndex:(NSUInteger)index error:(NSError **)error {
+  if (![self isValidTensorIndex:index belowLimit:self.inputTensorCount error:error]) {
+    return nil;
+  }
+
+  return [self tensorOfType:TFLTensorTypeInput atIndex:index error:error];
+}
+
+- (nullable TFLTensor *)outputTensorAtIndex:(NSUInteger)index error:(NSError **)error {
+  if (![self isValidTensorIndex:index belowLimit:self.outputTensorCount error:error]) {
+    return nil;
+  }
+
+  return [self tensorOfType:TFLTensorTypeOutput atIndex:index error:error];
+}
+
+- (BOOL)resizeInputTensorAtIndex:(NSUInteger)index
+                         toShape:(NSArray<NSNumber *> *)shape
+                           error:(NSError **)error {
+  if (![self isValidTensorIndex:index belowLimit:self.inputTensorCount error:error]) {
+    return NO;
+  }
+
+  if (shape.count == 0) {
+    [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeInvalidShape
+                                   description:@"Invalid shape. Must not be empty."
+                                         error:error];
+    return NO;
+  }
+
+  int cDimensions[self.inputTensorCount];
+  for (int dimIndex = 0; dimIndex < shape.count; ++dimIndex) {
+    int dimension = shape[dimIndex].intValue;
+    if (dimension <= 0) {
+      NSString *errorDescription = @"Invalid shape. Dimensions must be positive integers.";
+      [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeInvalidShape
+                                     description:errorDescription
+                                           error:error];
+      return NO;
+    }
+    cDimensions[dimIndex] = dimension;
+  }
+
+  if (TFL_InterpreterResizeInputTensor(self.interpreter, (int32_t)index, cDimensions,
+                                       (int32_t)shape.count) != kTfLiteOk) {
+    NSString *errorDescription = [NSString
+        stringWithFormat:@"Failed to resize input tensor at index (%lu).", (unsigned long)index];
+    [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeFailedToResizeInputTensor
+                                   description:errorDescription
+                                         error:error];
+    return NO;
+  }
+
+  return YES;
+}
+
+- (BOOL)allocateTensorsWithError:(NSError **)error {
+  if (TFL_InterpreterAllocateTensors(self.interpreter) != kTfLiteOk) {
+    [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeFailedToAllocateTensors
+                                   description:@"Failed to allocate memory for tensors."
+                                         error:error];
+    return NO;
+  }
+  return YES;
+}
+
+#pragma mark - TFLInterpreter (Internal)
+
+- (BOOL)copyData:(NSData *)data toInputTensorAtIndex:(NSUInteger)index error:(NSError **)error {
+  const TFL_Tensor *cTensor = [self cTensorOfType:TFLTensorTypeInput atIndex:index error:error];
+  if (cTensor == nullptr) {
+    return NO;
+  }
+
+  NSUInteger byteSize = (NSUInteger)TFL_TensorByteSize(cTensor);
+  if (data.length != byteSize) {
+    NSString *errorDescription = [NSString
+        stringWithFormat:@"Input tensor at index (%lu) expects data size (%lu), but got (%lu).",
+                         (unsigned long)index, byteSize, (unsigned long)data.length];
+    [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeInvalidInputByteSize
+                                   description:errorDescription
+                                         error:error];
+    return NO;
+  }
+
+  if (TFL_TensorCopyFromBuffer((TFL_Tensor *)cTensor, data.bytes, data.length) != kTfLiteOk) {
+    NSString *errorDescription =
+        [NSString stringWithFormat:@"Failed to copy data into input tensor at index (%lu).",
+                                   (unsigned long)index];
+    [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeFailedToCopyDataToInputTensor
+                                   description:errorDescription
+                                         error:error];
+    return NO;
+  }
+
+  return YES;
+}
+
+- (nullable NSData *)dataFromTensor:(TFLTensor *)tensor error:(NSError **)error {
+  const TFL_Tensor *cTensor = [self cTensorOfType:tensor.type atIndex:tensor.index error:error];
+  if (cTensor == nullptr) {
+    return nil;
+  }
+
+  void *bytes = TFL_TensorData(cTensor);
+  NSUInteger byteSize = (NSUInteger)TFL_TensorByteSize(cTensor);
+  if (bytes == nullptr || byteSize == 0) {
+    NSString *tensorType = [TFLTensor stringForTensorType:tensor.type];
+    NSString *errorDescription =
+        [NSString stringWithFormat:@"Failed to get data from %@ tensor at index (%lu).", tensorType,
+                                   (unsigned long)index];
+    [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeFailedToGetDataFromTensor
+                                   description:errorDescription
+                                         error:error];
+    return nil;
+  }
+
+  return [NSData dataWithBytes:bytes length:byteSize];
+}
+
+- (nullable NSArray<NSNumber *> *)shapeOfTensor:(TFLTensor *)tensor error:(NSError **)error {
+  const TFL_Tensor *cTensor = [self cTensorOfType:tensor.type atIndex:tensor.index error:error];
+  if (cTensor == nullptr) {
+    return nil;
+  }
+
+  NSString *tensorType = [TFLTensor stringForTensorType:tensor.type];
+  int32_t rank = TFL_TensorNumDims(cTensor);
+  if (rank <= 0) {
+    NSString *errorDescription =
+        [NSString stringWithFormat:@"%@ tensor at index (%lu) has invalid rank (%d).", tensorType,
+                                   (unsigned long)index, rank];
+    [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeInvalidTensor
+                                   description:errorDescription
+                                         error:error];
+    return nil;
+  }
+
+  NSMutableArray *shape = [NSMutableArray arrayWithCapacity:rank];
+  for (int32_t dimIndex = 0; dimIndex < rank; dimIndex++) {
+    int32_t dimension = TFL_TensorDim(cTensor, dimIndex);
+    if (dimension <= 0) {
+      NSString *errorDescription =
+          [NSString stringWithFormat:@"%@ tensor at index (%lu) has invalid %d-th dimension (%d).",
+                                     tensorType, (unsigned long)index, dimIndex, dimension];
+      [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeInvalidTensor
+                                     description:errorDescription
+                                           error:error];
+      return nil;
+    }
+    shape[dimIndex] = @((NSUInteger)dimension);
+  }
+
+  return shape;
+}
+
+#pragma mark - Private
+
+- (const TFL_Tensor *)cTensorOfType:(TFLTensorType)type
+                            atIndex:(NSUInteger)index
+                              error:(NSError **)error {
+  const TFL_Tensor *tensor = nullptr;
+
+  switch (type) {
+    case TFLTensorTypeInput:
+      tensor = TFL_InterpreterGetInputTensor(self.interpreter, (int32_t)index);
+      break;
+    case TFLTensorTypeOutput:
+      tensor = TFL_InterpreterGetOutputTensor(self.interpreter, (int32_t)index);
+      break;
+  }
+
+  if (tensor == nullptr) {
+    NSString *tensorType = [TFLTensor stringForTensorType:type];
+    NSString *errorDescription =
+        [NSString stringWithFormat:@"Failed to get %@ tensor at index (%lu).", tensorType,
+                                   (unsigned long)index];
+    [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeFailedToGetTensor
+                                   description:errorDescription
+                                         error:error];
+  }
+
+  return tensor;
+}
+
+- (nullable TFLTensor *)tensorOfType:(TFLTensorType)type
+                             atIndex:(NSUInteger)index
+                               error:(NSError **)error {
+  const TFL_Tensor *tensor = [self cTensorOfType:type atIndex:index error:error];
+
+  if (tensor == nullptr) {
+    return nil;
+  }
+
+  NSString *tensorType = [TFLTensor stringForTensorType:type];
+  const char *cName = TFL_TensorName(tensor);
+  if (cName == nullptr) {
+    NSString *errorDescription =
+        [NSString stringWithFormat:@"Failed to get name of %@ tensor at index (%lu).", tensorType,
+                                   (unsigned long)index];
+    [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeInvalidTensor
+                                   description:errorDescription
+                                         error:error];
+    return nil;
+  }
+  NSString *name = [NSString stringWithUTF8String:cName];
+
+  TFLTensorDataType dataType = [self tensorDataTypeFromCTensorType:TFL_TensorType(tensor)];
+
+  TFL_QuantizationParams cParams = TFL_TensorQuantizationParams(tensor);
+  TFLQuantizationParameters *quantizationParams;
+
+  // TODO(b/119735362): Update this check once the TFL_QuantizationParams struct has a mode.
+  if (cParams.scale != 0.0) {
+    quantizationParams = [[TFLQuantizationParameters alloc] initWithScale:cParams.scale
+                                                                zeroPoint:cParams.zero_point];
+  }
+
+  // TODO: Set quantization parameters when C API supports it.
+  return [[TFLTensor alloc] initWithInterpreter:self
+                                           type:type
+                                          index:index
+                                           name:name
+                                       dataType:dataType
+                         quantizationParameters:quantizationParams];
+}
+
+- (TFLTensorDataType)tensorDataTypeFromCTensorType:(TFL_Type)cTensorType {
+  switch (cTensorType) {
+    case kTfLiteFloat32:
+      return TFLTensorDataTypeFloat32;
+    case kTfLiteInt32:
+      return TFLTensorDataTypeInt32;
+    case kTfLiteUInt8:
+      return TFLTensorDataTypeUInt8;
+    case kTfLiteInt8:
+      return TFLTensorDataTypeInt8;
+    case kTfLiteInt64:
+      return TFLTensorDataTypeInt64;
+    case kTfLiteBool:
+      return TFLTensorDataTypeBool;
+    case kTfLiteInt16:
+      return TFLTensorDataTypeInt16;
+    case kTfLiteNoType:
+    case kTfLiteString:
+    case kTfLiteComplex64:
+      // kTfLiteString and kTfLiteComplex64 are not supported in TensorFlow Lite Objc API.
+      return TFLTensorDataTypeNoType;
+  }
+}
+
+- (BOOL)isValidTensorIndex:(NSUInteger)index
+                belowLimit:(NSUInteger)totalTensorCount
+                     error:(NSError **)error {
+  if (index >= totalTensorCount) {
+    NSString *errorDescription =
+        [NSString stringWithFormat:@"Invalid tensor index (%lu) exceeds max (%lu).",
+                                   (unsigned long)index, (unsigned long)(totalTensorCount - 1)];
+    [TFLErrorUtil saveInterpreterErrorWithCode:TFLInterpreterErrorCodeInvalidTensorIndex
+                                   description:errorDescription
+                                         error:error];
+    return NO;
+  }
+
+  return YES;
+}
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/sources/TFLInterpreterOptions.m b/tensorflow/lite/experimental/objc/sources/TFLInterpreterOptions.m
new file mode 100644
index 0000000000000000000000000000000000000000..d129befecabc5af752ccff70e84a4a66c7ee4bca
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/sources/TFLInterpreterOptions.m
@@ -0,0 +1,30 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import "tensorflow/lite/experimental/objc/apis/TFLInterpreterOptions.h"
+
+NS_ASSUME_NONNULL_BEGIN
+
+@implementation TFLInterpreterOptions
+
+#pragma mark - Public
+
+- (instancetype)init {
+  self = [super init];
+  return self;
+}
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/sources/TFLQuantizationParameters+Internal.h b/tensorflow/lite/experimental/objc/sources/TFLQuantizationParameters+Internal.h
new file mode 100644
index 0000000000000000000000000000000000000000..37d9ef0bb4761c9ff93111ba3158d4c4d68a9ec2
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/sources/TFLQuantizationParameters+Internal.h
@@ -0,0 +1,33 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import "tensorflow/lite/experimental/objc/apis/TFLQuantizationParameters.h"
+
+NS_ASSUME_NONNULL_BEGIN
+
+@interface TFLQuantizationParameters (Internal)
+
+/**
+ * Initializes a `TFLQuantizationParameters` instance with the given scale and zero point.
+ *
+ * @param scale Scale of asymmetric quantization.
+ * @param zeroPoint Zero point of asymmetric quantization.
+ *
+ * @return A new instance of `TFLQuantizationParameters` with the given scale and zero point.
+ */
+- (instancetype)initWithScale:(float)scale zeroPoint:(int32_t)zeroPoint;
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/sources/TFLQuantizationParameters.m b/tensorflow/lite/experimental/objc/sources/TFLQuantizationParameters.m
new file mode 100644
index 0000000000000000000000000000000000000000..44cb90d3323a73c1f79a27f319ac263c84e94408
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/sources/TFLQuantizationParameters.m
@@ -0,0 +1,36 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import "tensorflow/lite/experimental/objc/apis/TFLQuantizationParameters.h"
+
+#import "TFLQuantizationParameters+Internal.h"
+
+NS_ASSUME_NONNULL_BEGIN
+
+@implementation TFLQuantizationParameters
+
+#pragma mark - TFLTensor (Internal)
+
+- (instancetype)initWithScale:(float)scale zeroPoint:(int32_t)zeroPoint {
+  self = [super init];
+  if (self != nil) {
+    _scale = scale;
+    _zeroPoint = zeroPoint;
+  }
+  return self;
+}
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/sources/TFLTensor+Internal.h b/tensorflow/lite/experimental/objc/sources/TFLTensor+Internal.h
new file mode 100644
index 0000000000000000000000000000000000000000..3d5c51caabd8e44ab2b30a7b44259f6878865586
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/sources/TFLTensor+Internal.h
@@ -0,0 +1,74 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import "tensorflow/lite/experimental/objc/apis/TFLTensor.h"
+
+@class TFLInterpreter;
+
+NS_ASSUME_NONNULL_BEGIN
+
+/**
+ * @enum TFLTensorType
+ * This enum specifies input or output tensor types.
+ */
+typedef NS_ENUM(NSUInteger, TFLTensorType) {
+  /** Input tensor type. */
+  TFLTensorTypeInput,
+
+  /** Output tensor type. */
+  TFLTensorTypeOutput,
+};
+
+@interface TFLTensor (Internal)
+
+/** Input or output tensor type. */
+@property(nonatomic, readonly) TFLTensorType type;
+
+/** Index of the tensor. */
+@property(nonatomic, readonly) NSUInteger index;
+
+/**
+ * Initializes a `TFLTensor` with the given interpreter, name, data type, and quantization
+ * parameters.
+ *
+ * @param interpreter Interpreter backing the tensor.
+ * @param type Input or output tensor type.
+ * @param index Index of the tensor.
+ * @param name Name of the tensor.
+ * @param dataType Data type of the tensor.
+ * @param quantizationParameters Quantization parameters of the tensor. `nil` if the tensor does not
+ *     use quantization.
+ *
+ * @return A new instance of `TFLTensor` with the given name, data type, shape, and quantization
+ *     parameters.
+ */
+- (instancetype)initWithInterpreter:(TFLInterpreter *)interpreter
+                               type:(TFLTensorType)type
+                              index:(NSUInteger)index
+                               name:(NSString *)name
+                           dataType:(TFLTensorDataType)dataType
+             quantizationParameters:(nullable TFLQuantizationParameters *)quantizationParameters;
+
+/**
+ * Returns the string name of the given input or output tensor type.
+ *
+ * @param type Input or output tensor type.
+ *
+ * @return The string name of the given input or output tensor type.
+ */
++ (NSString *)stringForTensorType:(TFLTensorType)type;
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/sources/TFLTensor.m b/tensorflow/lite/experimental/objc/sources/TFLTensor.m
new file mode 100644
index 0000000000000000000000000000000000000000..2eaebfd6bec0483817bd4c1c3e540113cca75f5e
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/sources/TFLTensor.m
@@ -0,0 +1,103 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import "tensorflow/lite/experimental/objc/apis/TFLTensor.h"
+
+#import "TFLErrorUtil.h"
+#import "TFLInterpreter+Internal.h"
+#import "TFLTensor+Internal.h"
+
+#import "tensorflow/lite/experimental/objc/apis/TFLInterpreter.h"
+
+NS_ASSUME_NONNULL_BEGIN
+
+// String names of input or output tensor types.
+static NSString *const kTFLInputTensorTypeString = @"input";
+static NSString *const kTFLOutputTensorTypeString = @"output";
+
+@interface TFLTensor ()
+
+// Redefines readonly properties.
+@property(nonatomic) TFLTensorType type;
+@property(nonatomic) NSUInteger index;
+@property(nonatomic, copy) NSString *name;
+@property(nonatomic) TFLTensorDataType dataType;
+@property(nonatomic, nullable) TFLQuantizationParameters *quantizationParameters;
+
+/**
+ * The backing interpreter. It's a strong reference to ensure that the interpreter is never released
+ * before this tensor is released.
+ *
+ * @warning Never let the interpreter hold a strong reference to the tensor to avoid retain cycles.
+ */
+@property(nonatomic) TFLInterpreter *interpreter;
+
+@end
+
+@implementation TFLTensor
+
+#pragma mark - Public
+
+- (BOOL)copyData:(NSData *)data error:(NSError **)error {
+  if (self.type == TFLTensorTypeOutput) {
+    [TFLErrorUtil
+        saveInterpreterErrorWithCode:TFLInterpreterErrorCodeCopyDataToOutputTensorNotAllowed
+                         description:@"Cannot copy data into an output tensor."
+                               error:error];
+    return NO;
+  }
+
+  return [self.interpreter copyData:data toInputTensorAtIndex:self.index error:error];
+}
+
+- (nullable NSData *)dataWithError:(NSError **)error {
+  return [self.interpreter dataFromTensor:self error:error];
+}
+
+- (nullable NSArray<NSNumber *> *)shapeWithError:(NSError **)error {
+  return [self.interpreter shapeOfTensor:self error:error];
+}
+
+#pragma mark - TFLTensor (Internal)
+
+- (instancetype)initWithInterpreter:(TFLInterpreter *)interpreter
+                               type:(TFLTensorType)type
+                              index:(NSUInteger)index
+                               name:(NSString *)name
+                           dataType:(TFLTensorDataType)dataType
+             quantizationParameters:(nullable TFLQuantizationParameters *)quantizationParameters {
+  self = [super init];
+  if (self != nil) {
+    _interpreter = interpreter;
+    _type = type;
+    _index = index;
+    _name = [name copy];
+    _dataType = dataType;
+    _quantizationParameters = quantizationParameters;
+  }
+  return self;
+}
+
++ (NSString *)stringForTensorType:(TFLTensorType)type {
+  switch (type) {
+    case TFLTensorTypeInput:
+      return kTFLInputTensorTypeString;
+    case TFLTensorTypeOutput:
+      return kTFLOutputTensorTypeString;
+  }
+}
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/tests/TFLInterpreterOptionsTests.m b/tensorflow/lite/experimental/objc/tests/TFLInterpreterOptionsTests.m
new file mode 100644
index 0000000000000000000000000000000000000000..00b800d6af96636054f2a79f3d4c8d007dd89ea3
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/tests/TFLInterpreterOptionsTests.m
@@ -0,0 +1,49 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import "tensorflow/lite/experimental/objc/apis/TFLInterpreterOptions.h"
+
+#import <XCTest/XCTest.h>
+
+NS_ASSUME_NONNULL_BEGIN
+
+/**
+ * Unit tests for TFLInterpreterOptions.
+ */
+@interface TFLInterpreterOptionsTests : XCTestCase
+@end
+
+@implementation TFLInterpreterOptionsTests
+
+#pragma mark - Tests
+
+- (void)testInit {
+  TFLInterpreterOptions *options = [[TFLInterpreterOptions alloc] init];
+  XCTAssertNotNil(options);
+  XCTAssertEqual(options.numberOfThreads, 0);
+}
+
+- (void)testSetNumberOfThread {
+  TFLInterpreterOptions *options = [[TFLInterpreterOptions alloc] init];
+  options.numberOfThreads = 2;
+  XCTAssertEqual(options.numberOfThreads, 2);
+  options.numberOfThreads = 0;
+  XCTAssertEqual(options.numberOfThreads, 0);
+  options.numberOfThreads = 3;
+  XCTAssertEqual(options.numberOfThreads, 3);
+}
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/tests/TFLInterpreterTests.m b/tensorflow/lite/experimental/objc/tests/TFLInterpreterTests.m
new file mode 100644
index 0000000000000000000000000000000000000000..eefa9b9f05826a0782c0b236a2d7e145428b1ca1
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/tests/TFLInterpreterTests.m
@@ -0,0 +1,358 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import "tensorflow/lite/experimental/objc/apis/TFLInterpreter.h"
+
+#import <XCTest/XCTest.h>
+
+#import "tensorflow/lite/experimental/objc/apis/TFLInterpreterOptions.h"
+#import "tensorflow/lite/experimental/objc/apis/TFLQuantizationParameters.h"
+#import "tensorflow/lite/experimental/objc/apis/TFLTensor.h"
+
+NS_ASSUME_NONNULL_BEGIN
+
+/** Float model resource name. */
+static NSString *const kAddFloatModelResourceName = @"add";
+
+/** Quantized model resource name. */
+static NSString *const kAddQuantizedModelResourceName = @"add_quantized";
+
+/** Model resource type. */
+static NSString *const kAddModelResourceType = @"bin";
+
+/** Rank of the input and output tensor in the Add model. */
+static const NSUInteger kAddModelTensorRank = 1U;
+
+/** Size of the first (and only) dimension of the input and output tensor in the Add model. */
+static const NSUInteger kAddModelTensorFirstDimensionSize = 2U;
+
+/** Quantization scale of the quantized model. */
+static const float kAddQuantizedModelScale = 0.003922F;
+
+/** Quantization zero point of the quantized model. */
+static const int32_t kAddQuantizedModelZeroPoint = 0;
+
+/** Invalid input tensor index. */
+static const NSUInteger kInvalidInputTensorIndex = 1U;
+
+/** Invalid output tensor index. */
+static const NSUInteger kInvalidOutputTensorIndex = 1U;
+
+/** Accurary used in comparing floating numbers. */
+static const float kTestAccuracy = 1E-5F;
+
+/**
+ * Unit tests for TFLInterpreter.
+ */
+@interface TFLInterpreterTests : XCTestCase
+
+/** Absolute path of the Add float model resource. */
+@property(nonatomic, nullable) NSString *floatModelPath;
+
+/** Default interpreter using the Add model. */
+@property(nonatomic, nullable) TFLInterpreter *interpreter;
+
+@end
+
+@implementation TFLInterpreterTests
+
+#pragma mark - XCTestCase
+
+- (void)setUp {
+  [super setUp];
+
+  NSBundle *bundle = [NSBundle bundleForClass:[self class]];
+  self.floatModelPath = [bundle pathForResource:kAddFloatModelResourceName
+                                         ofType:kAddModelResourceType];
+  NSError *error;
+  self.interpreter = [[TFLInterpreter alloc] initWithModelPath:self.floatModelPath error:&error];
+  XCTAssertNil(error);
+  XCTAssertNotNil(self.interpreter);
+  XCTAssertTrue([self.interpreter allocateTensorsWithError:nil]);
+}
+
+- (void)tearDown {
+  self.floatModelPath = nil;
+  self.interpreter = nil;
+
+  [super tearDown];
+}
+
+#pragma mark - Tests
+
+- (void)testSuccessfulFullRunAddFloatModel {
+  // Shape for both input and output tensor.
+  NSMutableArray *shape = [NSMutableArray arrayWithCapacity:kAddModelTensorRank];
+  shape[0] = [NSNumber numberWithUnsignedInteger:kAddModelTensorFirstDimensionSize];
+
+  // Creates the interpreter options.
+  TFLInterpreterOptions *options = [[TFLInterpreterOptions alloc] init];
+  XCTAssertNotNil(options);
+  options.numberOfThreads = 2;
+
+  // Creates the interpreter.
+  NSError *error;
+  TFLInterpreter *customInterpreter = [[TFLInterpreter alloc] initWithModelPath:self.floatModelPath
+                                                                        options:options
+                                                                          error:&error];
+  XCTAssertNil(error);
+  XCTAssertNotNil(customInterpreter);
+
+  // Allocates memory for tensors.
+  XCTAssertTrue([customInterpreter allocateTensorsWithError:&error]);
+  XCTAssertNil(error);
+
+  // Verifies input and output tensor counts.
+  XCTAssertEqual(customInterpreter.inputTensorCount, 1);
+  XCTAssertEqual(customInterpreter.outputTensorCount, 1);
+
+  // Resizes the intput tensor.
+  XCTAssertTrue([customInterpreter resizeInputTensorAtIndex:0 toShape:shape error:&error]);
+  XCTAssertNil(error);
+
+  // Re-allocates memory for tensors.
+  XCTAssertTrue([customInterpreter allocateTensorsWithError:&error]);
+  XCTAssertNil(error);
+
+  // Verifies the input tensor.
+  TFLTensor *inputTensor = [customInterpreter inputTensorAtIndex:0 error:&error];
+  XCTAssertNotNil(inputTensor);
+  XCTAssertNil(error);
+  XCTAssertTrue([inputTensor.name isEqualToString:@"input"]);
+  XCTAssertEqual(inputTensor.dataType, TFLTensorDataTypeFloat32);
+  NSArray *inputTensorShape = [inputTensor shapeWithError:&error];
+  XCTAssertNil(error);
+  XCTAssertTrue([shape isEqualToArray:inputTensorShape]);
+
+  // Copies the input data.
+  NSMutableData *inputData = [NSMutableData dataWithCapacity:0];
+  float one = 1.f;
+  float three = 3.f;
+  [inputData appendBytes:&one length:sizeof(float)];
+  [inputData appendBytes:&three length:sizeof(float)];
+  XCTAssertTrue([inputTensor copyData:inputData error:&error]);
+  XCTAssertNil(error);
+
+  // Invokes the interpreter.
+  XCTAssertTrue([customInterpreter invokeWithError:&error]);
+  XCTAssertNil(error);
+
+  // Verifies the output tensor.
+  TFLTensor *outputTensor = [customInterpreter outputTensorAtIndex:0 error:&error];
+  XCTAssertNotNil(outputTensor);
+  XCTAssertNil(error);
+  XCTAssertTrue([outputTensor.name isEqualToString:@"output"]);
+  XCTAssertEqual(outputTensor.dataType, TFLTensorDataTypeFloat32);
+  NSArray *outputTensorShape = [outputTensor shapeWithError:&error];
+  XCTAssertNil(error);
+  XCTAssertTrue([shape isEqualToArray:outputTensorShape]);
+
+  // Tries to query an invalid output tensor index.
+  TFLTensor *invalidOutputTensor = [customInterpreter outputTensorAtIndex:kInvalidOutputTensorIndex
+                                                                    error:&error];
+  XCTAssertNil(invalidOutputTensor);
+  XCTAssertEqual(error.code, TFLInterpreterErrorCodeInvalidTensorIndex);
+
+  // Gets the output tensor data.
+  error = nil;
+  NSData *outputData = [outputTensor dataWithError:&error];
+  XCTAssertNotNil(outputData);
+  XCTAssertNil(error);
+  float output[kAddModelTensorFirstDimensionSize];
+  [outputData getBytes:output length:(sizeof(float) * kAddModelTensorFirstDimensionSize)];
+  XCTAssertEqualWithAccuracy(output[0], 3.f, kTestAccuracy);
+  XCTAssertEqualWithAccuracy(output[1], 9.f, kTestAccuracy);
+}
+
+- (void)testSuccessfulFullRunQuantizedModel {
+  // Shape for both input and output tensor.
+  NSMutableArray *shape = [NSMutableArray arrayWithCapacity:kAddModelTensorRank];
+  shape[0] = [NSNumber numberWithUnsignedInteger:kAddModelTensorFirstDimensionSize];
+
+  // Creates the interpreter options.
+  TFLInterpreterOptions *options = [[TFLInterpreterOptions alloc] init];
+  XCTAssertNotNil(options);
+  options.numberOfThreads = 2;
+
+  NSBundle *bundle = [NSBundle bundleForClass:[self class]];
+  NSString *quantizedModelPath = [bundle pathForResource:kAddQuantizedModelResourceName
+                                                  ofType:kAddModelResourceType];
+
+  // Creates the interpreter.
+  NSError *error;
+  TFLInterpreter *customInterpreter =
+      [[TFLInterpreter alloc] initWithModelPath:quantizedModelPath options:options error:&error];
+  XCTAssertNil(error);
+  XCTAssertNotNil(customInterpreter);
+
+  // Allocates memory for tensors.
+  XCTAssertTrue([customInterpreter allocateTensorsWithError:&error]);
+  XCTAssertNil(error);
+
+  // Verifies input and output tensor counts.
+  XCTAssertEqual(customInterpreter.inputTensorCount, 1);
+  XCTAssertEqual(customInterpreter.outputTensorCount, 1);
+
+  // Resizes the intput tensor.
+  XCTAssertTrue([customInterpreter resizeInputTensorAtIndex:0 toShape:shape error:&error]);
+  XCTAssertNil(error);
+
+  // Re-allocates memory for tensors.
+  XCTAssertTrue([customInterpreter allocateTensorsWithError:&error]);
+  XCTAssertNil(error);
+
+  // Verifies the input tensor.
+  TFLTensor *inputTensor = [customInterpreter inputTensorAtIndex:0 error:&error];
+  XCTAssertNotNil(inputTensor);
+  XCTAssertNil(error);
+  XCTAssertTrue([inputTensor.name isEqualToString:@"input"]);
+  XCTAssertEqual(inputTensor.dataType, TFLTensorDataTypeUInt8);
+  XCTAssertEqualWithAccuracy(inputTensor.quantizationParameters.scale, kAddQuantizedModelScale,
+                             kTestAccuracy);
+  XCTAssertEqual(inputTensor.quantizationParameters.zeroPoint, kAddQuantizedModelZeroPoint);
+  NSArray *inputTensorShape = [inputTensor shapeWithError:&error];
+  XCTAssertNil(error);
+  XCTAssertTrue([shape isEqualToArray:inputTensorShape]);
+
+  // Copies the input data.
+  NSMutableData *inputData = [NSMutableData dataWithCapacity:0];
+  uint8_t one = 1;
+  uint8_t three = 3;
+  [inputData appendBytes:&one length:sizeof(uint8_t)];
+  [inputData appendBytes:&three length:sizeof(uint8_t)];
+  XCTAssertTrue([inputTensor copyData:inputData error:&error]);
+  XCTAssertNil(error);
+
+  // Invokes the interpreter.
+  XCTAssertTrue([customInterpreter invokeWithError:&error]);
+  XCTAssertNil(error);
+
+  // Verifies the output tensor.
+  TFLTensor *outputTensor = [customInterpreter outputTensorAtIndex:0 error:&error];
+  XCTAssertNotNil(outputTensor);
+  XCTAssertNil(error);
+  XCTAssertTrue([outputTensor.name isEqualToString:@"output"]);
+  XCTAssertEqual(outputTensor.dataType, TFLTensorDataTypeUInt8);
+  XCTAssertEqualWithAccuracy(outputTensor.quantizationParameters.scale, kAddQuantizedModelScale,
+                             kTestAccuracy);
+  XCTAssertEqual(outputTensor.quantizationParameters.zeroPoint, kAddQuantizedModelZeroPoint);
+  NSArray *outputTensorShape = [outputTensor shapeWithError:&error];
+  XCTAssertNil(error);
+  XCTAssertTrue([shape isEqualToArray:outputTensorShape]);
+
+  // Tries to query an invalid output tensor index.
+  TFLTensor *invalidOutputTensor = [customInterpreter outputTensorAtIndex:kInvalidOutputTensorIndex
+                                                                    error:&error];
+  XCTAssertNil(invalidOutputTensor);
+  XCTAssertEqual(error.code, TFLInterpreterErrorCodeInvalidTensorIndex);
+
+  // Gets the output tensor data.
+  error = nil;
+  NSData *outputData = [outputTensor dataWithError:&error];
+  XCTAssertNotNil(outputData);
+  XCTAssertNil(error);
+  uint8_t output[kAddModelTensorFirstDimensionSize];
+  [outputData getBytes:output length:(sizeof(uint8_t) * kAddModelTensorFirstDimensionSize)];
+  XCTAssertEqual(output[0], 3);
+  XCTAssertEqual(output[1], 9);
+}
+
+- (void)testInitWithModelPath_invalidPath {
+  // Shape for both input and output tensor.
+  NSMutableArray *shape = [NSMutableArray arrayWithCapacity:kAddModelTensorRank];
+  shape[0] = [NSNumber numberWithUnsignedInteger:kAddModelTensorFirstDimensionSize];
+
+  // Creates the interpreter.
+  NSError *error;
+  TFLInterpreter *brokenInterpreter = [[TFLInterpreter alloc] initWithModelPath:@"InvalidPath"
+                                                                          error:&error];
+  XCTAssertNil(brokenInterpreter);
+  XCTAssertEqual(error.code, TFLInterpreterErrorCodeFailedToLoadModel);
+}
+
+- (void)testInvoke_beforeAllocation {
+  NSError *error;
+  TFLInterpreter *interpreterWithoutAllocation =
+      [[TFLInterpreter alloc] initWithModelPath:self.floatModelPath error:&error];
+  XCTAssertNotNil(interpreterWithoutAllocation);
+  XCTAssertNil(error);
+
+  XCTAssertFalse([interpreterWithoutAllocation invokeWithError:&error]);
+  XCTAssertEqual(error.code, TFLInterpreterErrorCodeFailedToInvoke);
+}
+
+- (void)testInputTensorAtIndex_invalidIndex {
+  NSError *error;
+  TFLTensor *inputTensor = [self.interpreter inputTensorAtIndex:kInvalidInputTensorIndex
+                                                          error:&error];
+  XCTAssertNil(inputTensor);
+  XCTAssertEqual(error.code, TFLInterpreterErrorCodeInvalidTensorIndex);
+}
+
+- (void)testResizeInputTensorAtIndex_invalidIndex {
+  NSMutableArray *shape = [NSMutableArray arrayWithCapacity:kAddModelTensorRank];
+  shape[0] = [NSNumber numberWithUnsignedInteger:kAddModelTensorFirstDimensionSize];
+  NSError *error;
+  XCTAssertFalse([self.interpreter resizeInputTensorAtIndex:kInvalidInputTensorIndex
+                                                    toShape:shape
+                                                      error:&error]);
+  XCTAssertEqual(error.code, TFLInterpreterErrorCodeInvalidTensorIndex);
+}
+
+- (void)testResizeInputTensorAtIndex_emptyShape {
+  NSMutableArray *emptyShape = [NSMutableArray arrayWithCapacity:0];
+  NSError *error;
+  XCTAssertFalse([self.interpreter resizeInputTensorAtIndex:0 toShape:emptyShape error:&error]);
+  XCTAssertEqual(error.code, TFLInterpreterErrorCodeInvalidShape);
+}
+
+- (void)testResizeInputTensorAtIndex_zeroDimensionSize {
+  NSMutableArray *shape = [NSMutableArray arrayWithCapacity:kAddModelTensorRank];
+  shape[0] = [NSNumber numberWithUnsignedInteger:0];
+  NSError *error;
+  XCTAssertFalse([self.interpreter resizeInputTensorAtIndex:0 toShape:shape error:&error]);
+  XCTAssertEqual(error.code, TFLInterpreterErrorCodeInvalidShape);
+}
+
+- (void)testCopyDataToInputTensorAtIndex_invalidInputDataByteSize {
+  NSMutableData *inputData = [NSMutableData dataWithCapacity:0];
+  float one = 1.f;
+  float three = 3.f;
+  [inputData appendBytes:&one length:sizeof(float)];
+  [inputData appendBytes:&three length:(sizeof(float) - 1)];
+  NSError *error;
+  TFLTensor *inputTensor = [self.interpreter inputTensorAtIndex:0 error:&error];
+  XCTAssertNotNil(inputTensor);
+  XCTAssertNil(error);
+  XCTAssertFalse([inputTensor copyData:inputData error:&error]);
+  XCTAssertEqual(error.code, TFLInterpreterErrorCodeInvalidInputByteSize);
+}
+
+- (void)testCopyDataToOutputTensorAtIndex_notAllowed {
+  NSMutableData *data = [NSMutableData dataWithCapacity:0];
+  float one = 1.f;
+  float three = 3.f;
+  [data appendBytes:&one length:sizeof(float)];
+  [data appendBytes:&three length:(sizeof(float) - 1)];
+  NSError *error;
+  TFLTensor *outputTensor = [self.interpreter outputTensorAtIndex:0 error:&error];
+  XCTAssertNotNil(outputTensor);
+  XCTAssertNil(error);
+  XCTAssertFalse([outputTensor copyData:data error:&error]);
+  XCTAssertEqual(error.code, TFLInterpreterErrorCodeCopyDataToOutputTensorNotAllowed);
+}
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/objc/tests/TFLQuantizationParametersTests.m b/tensorflow/lite/experimental/objc/tests/TFLQuantizationParametersTests.m
new file mode 100644
index 0000000000000000000000000000000000000000..239e0bcb0dee8b6d2258be6f7e1ae2591611f501
--- /dev/null
+++ b/tensorflow/lite/experimental/objc/tests/TFLQuantizationParametersTests.m
@@ -0,0 +1,48 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#import "tensorflow/lite/experimental/objc/apis/TFLQuantizationParameters.h"
+
+#import <XCTest/XCTest.h>
+
+#import "tensorflow/lite/experimental/objc/sources/TFLQuantizationParameters+Internal.h"
+
+NS_ASSUME_NONNULL_BEGIN
+
+/** Test scale of quantization parameters. */
+static const float kTestScale = 2.0;
+
+/** Test zero point of quantization parameters. */
+static const int32_t kTestZeroPoint = 128;
+
+/**
+ * Unit tests for TFLQuantizationParameters.
+ */
+@interface TFLQuantizationParametersTests : XCTestCase
+@end
+
+@implementation TFLQuantizationParametersTests
+
+#pragma mark - Tests
+
+- (void)testInitWithScaleAndZeroPoint {
+  TFLQuantizationParameters *params =
+      [[TFLQuantizationParameters alloc] initWithScale:kTestScale zeroPoint:kTestZeroPoint];
+  XCTAssertEqual(params.scale, kTestScale);
+  XCTAssertEqual(params.zeroPoint, kTestZeroPoint);
+}
+
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/tensorflow/lite/experimental/swift/BUILD.apple b/tensorflow/lite/experimental/swift/BUILD.apple
new file mode 100644
index 0000000000000000000000000000000000000000..7528b42751bd78ebbdc1ce6a7dfd1dc14e931ca3
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/BUILD.apple
@@ -0,0 +1,106 @@
+# TensorFlow Lite for Swift.
+
+package(default_visibility = ["//visibility:private"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("@build_bazel_rules_apple//apple:ios.bzl", "ios_application", "ios_unit_test")
+load("@build_bazel_rules_swift//swift:swift.bzl", "swift_library")
+
+MINIMUM_OS_VERSION = "9.0"
+
+SWIFT_COPTS = [
+    "-wmo",
+]
+
+# Default tags for filtering targets. Targets in this file are restricted to Apple platforms.
+DEFAULT_TAGS = [
+    "apple",
+    "manual",
+]
+
+swift_library(
+    name = "TensorFlowLite",
+    srcs = glob(["Sources/*.swift"]),
+    copts = SWIFT_COPTS,
+    module_name = "TensorFlowLite",
+    tags = DEFAULT_TAGS,
+    deps = [
+        "//tensorflow/lite/experimental/c:c_api",
+    ],
+)
+
+ios_unit_test(
+    name = "TensorFlowLiteTests",
+    size = "small",
+    minimum_os_version = MINIMUM_OS_VERSION,
+    tags = DEFAULT_TAGS + [
+        # DISABLED: Following sanitizer tests are not supported by iOS test targets.
+        "noasan",
+        "nomsan",
+        "notsan",
+    ],
+    deps = [":TensorFlowLiteTestsLib"],
+)
+
+swift_library(
+    name = "TensorFlowLiteTestsLib",
+    testonly = 1,
+    srcs = glob(["Tests/*.swift"]),
+    copts = SWIFT_COPTS,
+    tags = DEFAULT_TAGS,
+    deps = [
+        ":TensorFlowLite",
+        ":TestResources",
+    ],
+)
+
+objc_library(
+    name = "TestResources",
+    data = [
+        "//tensorflow/lite:testdata/add.bin",
+        "//tensorflow/lite:testdata/add_quantized.bin",
+        "//tensorflow/lite:testdata/multi_add.bin",
+    ],
+    tags = DEFAULT_TAGS,
+)
+
+ios_application(
+    name = "TensorFlowLiteApp",
+    app_icons = glob(["TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Assets.xcassets/AppIcon.appiconset/**"]),
+    bundle_id = "com.tensorflow.lite.swift.TensorFlowLite",
+    families = [
+        "ipad",
+        "iphone",
+    ],
+    infoplists = ["TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Info.plist"],
+    launch_storyboard = "TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Base.lproj/LaunchScreen.storyboard",
+    minimum_os_version = MINIMUM_OS_VERSION,
+    sdk_frameworks = [
+        "CoreGraphics",
+    ],
+    tags = DEFAULT_TAGS,
+    deps = [":TensorFlowLiteAppLib"],
+)
+
+swift_library(
+    name = "TensorFlowLiteAppLib",
+    srcs = glob(["TestApps/TensorFlowLiteApp/TensorFlowLiteApp/*.swift"]),
+    module_name = "TensorFlowLiteAppLib",
+    tags = DEFAULT_TAGS,
+    deps = [
+        ":TensorFlowLite",
+        ":TensorFlowLiteAppResources",
+    ],
+)
+
+objc_library(
+    name = "TensorFlowLiteAppResources",
+    data = glob([
+        "TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Base.lproj/*.storyboard",
+    ]),
+    tags = DEFAULT_TAGS,
+    deps = [":TestResources"],
+)
diff --git a/tensorflow/lite/experimental/swift/LICENSE b/tensorflow/lite/experimental/swift/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..d645695673349e3947e8e5ae42332d0ac3164cd7
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/LICENSE
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/tensorflow/lite/experimental/swift/README.md b/tensorflow/lite/experimental/swift/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..cf7eeac6b402698785ff0829bf29a8fb5e471065
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/README.md
@@ -0,0 +1,78 @@
+# TensorFlow Lite for Swift
+
+[TensorFlow Lite](https://www.tensorflow.org/lite/) is TensorFlow's lightweight
+solution for Swift developers. It enables low-latency inference of on-device
+machine learning models with a small binary size and fast performance supporting
+hardware acceleration.
+
+## Getting Started
+
+### Bazel
+
+In your `BUILD` file, add the `TensorFlowLite` dependency:
+
+```python
+swift_library(
+  deps = [
+      "//tensorflow/lite/experimental/swift:TensorFlowLite",
+  ],
+)
+```
+
+In your Swift files, import the module:
+
+```swift
+import TensorFlowLite
+```
+
+If you would like to build the Swift TensorFlow Lite library using Bazel on Apple
+platforms, clone or download the [TensorFlow GitHub repo](https://github.com/tensorflow/tensorflow),
+then navigate to the root `tensorflow` directory and execute the `configure.py` script:
+
+```shell
+python configure.py
+```
+
+Follow the prompts and when asked to configure the Bazel rules for Apple
+platforms, enter `y`.
+
+Build the `TensorFlowLite` Swift library target:
+
+```shell
+bazel build tensorflow/lite/experimental/swift:TensorFlowLite
+```
+
+Build the `TensorFlowLiteTests` target:
+
+```shell
+bazel test tensorflow/lite/experimental/swift:TensorFlowLiteTests --swiftcopt=-enable-testing
+```
+
+Note that `--swiftcopt=-enable-testing` is required for optimized builds (`-c opt`).
+
+### Tulsi
+
+Open the `TensorFlowLite.tulsiproj` using the [TulsiApp](https://github.com/bazelbuild/tulsi) or by
+running the [`generate_xcodeproj.sh`](https://github.com/bazelbuild/tulsi/blob/master/src/tools/generate_xcodeproj.sh)
+script:
+
+```shell
+generate_xcodeproj.sh --genconfig tensorflow/lite/swift/TensorFlowLite.tulsiproj:TensorFlowLite --outputfolder ~/path/to/generated/TensorFlowLite.xcodeproj
+```
+
+### CocoaPods
+
+Add the following to your `Podfile`:
+
+```ruby
+use_frameworks!
+pod 'TensorFlowLiteSwift'
+```
+
+Then, run `pod install`.
+
+In your Swift files, import the module:
+
+```swift
+import TensorFlowLite
+```
diff --git a/tensorflow/lite/experimental/swift/Sources/Interpreter.swift b/tensorflow/lite/experimental/swift/Sources/Interpreter.swift
new file mode 100644
index 0000000000000000000000000000000000000000..a14b5966b1a24946137fddae0ddea16ed43ba46c
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/Sources/Interpreter.swift
@@ -0,0 +1,265 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+import Foundation
+import TensorFlowLiteCAPI
+
+/// A TensorFlow Lite interpreter that performs inference from a given model.
+public final class Interpreter {
+
+  /// The `TFL_Interpreter` C pointer type represented as an `UnsafePointer<TFL_Interpreter>`.
+  private typealias CInterpreter = OpaquePointer
+
+  /// Total number of input tensors associated with the model.
+  public var inputTensorCount: Int {
+    return Int(TFL_InterpreterGetInputTensorCount(cInterpreter))
+  }
+
+  /// Total number of output tensors associated with the model.
+  public var outputTensorCount: Int {
+    return Int(TFL_InterpreterGetOutputTensorCount(cInterpreter))
+  }
+
+  /// The underlying `TFL_Interpreter` C pointer.
+  private var cInterpreter: CInterpreter?
+
+  /// Creates a new model interpreter instance.
+  ///
+  /// - Parameters:
+  ///   - modelPath: Local file path to a TensorFlow Lite model.
+  ///   - options: Custom configurations for the interpreter. The default is `nil` indicating that
+  ///       interpreter will determine the configuration options.
+  /// - Throws: An error if the model could not be loaded or the interpreter could not be created.
+  public init(modelPath: String, options: InterpreterOptions? = nil) throws {
+    guard let model = Model(filePath: modelPath) else { throw InterpreterError.failedToLoadModel }
+
+    let cInterpreterOptions: OpaquePointer? = try options.map { options in
+      guard let cOptions = TFL_NewInterpreterOptions() else {
+        throw InterpreterError.failedToCreateInterpreter
+      }
+      if let threadCount = options.threadCount, threadCount > 0 {
+        TFL_InterpreterOptionsSetNumThreads(cOptions, Int32(threadCount))
+      }
+      if options.isErrorLoggingEnabled {
+        TFL_InterpreterOptionsSetErrorReporter(
+          cOptions,
+          { (_, format, arguments) in
+            guard let cFormat = format,
+                  let message = String(cFormat: cFormat, arguments: arguments)
+            else {
+              return
+            }
+            print(String(describing: InterpreterError.tensorFlowLiteError(message)))
+          },
+          nil
+        )
+      }
+      return cOptions
+    }
+    defer { TFL_DeleteInterpreterOptions(cInterpreterOptions) }
+
+    guard let cInterpreter = TFL_NewInterpreter(model.cModel, cInterpreterOptions) else {
+      throw InterpreterError.failedToCreateInterpreter
+    }
+    self.cInterpreter = cInterpreter
+  }
+
+  deinit {
+    TFL_DeleteInterpreter(cInterpreter)
+  }
+
+  /// Invokes the interpreter to perform inference from the loaded graph.
+  ///
+  /// - Throws: An error if the model was not ready because tensors were not allocated.
+  public func invoke() throws {
+    guard TFL_InterpreterInvoke(cInterpreter) == kTfLiteOk else {
+      // TODO(b/117510052): Determine which error to throw.
+      throw InterpreterError.allocateTensorsRequired
+    }
+  }
+
+  /// Returns the input tensor at the given index.
+  ///
+  /// - Parameters:
+  ///   - index: The index for the input tensor.
+  /// - Throws: An error if the index is invalid or the tensors have not been allocated.
+  /// - Returns: The input tensor at the given index.
+  public func input(at index: Int) throws -> Tensor {
+    let maxIndex = inputTensorCount - 1
+    guard case 0...maxIndex = index else {
+      throw InterpreterError.invalidTensorIndex(index: index, maxIndex: maxIndex)
+    }
+    guard let cTensor = TFL_InterpreterGetInputTensor(cInterpreter, Int32(index)),
+          let bytes = TFL_TensorData(cTensor),
+          let nameCString = TFL_TensorName(cTensor)
+    else {
+      throw InterpreterError.allocateTensorsRequired
+    }
+    guard let dataType = TensorDataType(type: TFL_TensorType(cTensor)) else {
+      throw InterpreterError.invalidTensorDataType
+    }
+
+    let name = String(cString: nameCString)
+    let rank = TFL_TensorNumDims(cTensor)
+    let dimensions = (0..<rank).map { Int(TFL_TensorDim(cTensor, $0)) }
+    let shape = TensorShape(dimensions)
+    let byteCount = TFL_TensorByteSize(cTensor)
+    let data = Data(bytes: bytes, count: byteCount)
+    let cQuantizationParams = TFL_TensorQuantizationParams(cTensor)
+    let scale = cQuantizationParams.scale
+    let zeroPoint = Int(cQuantizationParams.zero_point)
+    var quantizationParameters: QuantizationParameters? = nil
+    if scale != 0.0 {
+      // TODO(b/117510052): Update this check once the TfLiteQuantizationParams struct has a mode.
+      quantizationParameters = QuantizationParameters(scale: scale, zeroPoint: zeroPoint)
+    }
+    let tensor = Tensor(
+      name: name,
+      dataType: dataType,
+      shape: shape,
+      data: data,
+      quantizationParameters: quantizationParameters
+    )
+    return tensor
+  }
+
+  /// Returns the output tensor at the given index.
+  ///
+  /// - Parameters:
+  ///   - index: The index for the output tensor.
+  /// - Throws: An error if the index is invalid, tensors haven't been allocated, or interpreter
+  ///     hasn't been invoked for models that dynamically compute output tensors based on the values
+  ///     of its input tensors.
+  /// - Returns: The output tensor at the given index.
+  public func output(at index: Int) throws -> Tensor {
+    let maxIndex = outputTensorCount - 1
+    guard case 0...maxIndex = index else {
+      throw InterpreterError.invalidTensorIndex(index: index, maxIndex: maxIndex)
+    }
+    guard let cTensor = TFL_InterpreterGetOutputTensor(cInterpreter, Int32(index)),
+          let bytes = TFL_TensorData(cTensor),
+          let nameCString = TFL_TensorName(cTensor)
+    else {
+      // TODO(b/117510052): Determine which error to throw.
+      throw InterpreterError.invokeInterpreterRequired
+    }
+    guard let dataType = TensorDataType(type: TFL_TensorType(cTensor)) else {
+      throw InterpreterError.invalidTensorDataType
+    }
+
+    let name = String(cString: nameCString)
+    let rank = TFL_TensorNumDims(cTensor)
+    let dimensions = (0..<rank).map { Int(TFL_TensorDim(cTensor, $0)) }
+    let shape = TensorShape(dimensions)
+    let byteCount = TFL_TensorByteSize(cTensor)
+    let data = Data(bytes: bytes, count: byteCount)
+    let cQuantizationParams = TFL_TensorQuantizationParams(cTensor)
+    let scale = cQuantizationParams.scale
+    let zeroPoint = Int(cQuantizationParams.zero_point)
+    var quantizationParameters: QuantizationParameters? = nil
+    if scale != 0.0 {
+      // TODO(b/117510052): Update this check once the TfLiteQuantizationParams struct has a mode.
+      quantizationParameters = QuantizationParameters(scale: scale, zeroPoint: zeroPoint)
+    }
+    let tensor = Tensor(
+      name: name,
+      dataType: dataType,
+      shape: shape,
+      data: data,
+      quantizationParameters: quantizationParameters
+    )
+    return tensor
+  }
+
+  /// Resizes the input tensor at the given index to the specified tensor shape.
+  ///
+  /// - Note: After resizing an input tensor, the client **must** explicitly call
+  ///     `allocateTensors()` before attempting to access the resized tensor data or invoking the
+  ///     interpreter to perform inference.
+  /// - Parameters:
+  ///   - index: The index for the input tensor.
+  ///   - shape: The shape that the input tensor should be resized to.
+  /// - Throws: An error if the input tensor at the given index could not be resized.
+  public func resizeInput(at index: Int, to shape: TensorShape) throws {
+    let maxIndex = inputTensorCount - 1
+    guard case 0...maxIndex = index else {
+      throw InterpreterError.invalidTensorIndex(index: index, maxIndex: maxIndex)
+    }
+    guard TFL_InterpreterResizeInputTensor(
+            cInterpreter,
+            Int32(index),
+            shape.int32Dimensions,
+            Int32(shape.rank)
+          ) == kTfLiteOk
+    else {
+      throw InterpreterError.failedToResizeInputTensor(index: index)
+    }
+  }
+
+  /// Copies the given data to the input tensor at the given index.
+  ///
+  /// - Parameters:
+  ///   - data: The data to be copied to the input tensor's data buffer.
+  ///   - index: The index for the input tensor.
+  /// - Throws: An error if the `data.count` does not match the input tensor's `data.count` or if
+  ///     the given index is invalid.
+  /// - Returns: The input tensor with the copied data.
+  @discardableResult
+  public func copy(_ data: Data, toInputAt index: Int) throws -> Tensor {
+    let maxIndex = inputTensorCount - 1
+    guard case 0...maxIndex = index else {
+      throw InterpreterError.invalidTensorIndex(index: index, maxIndex: maxIndex)
+    }
+    guard let cTensor = TFL_InterpreterGetInputTensor(cInterpreter, Int32(index)) else {
+      throw InterpreterError.allocateTensorsRequired
+    }
+
+    let byteCount = TFL_TensorByteSize(cTensor)
+    guard data.count == byteCount else {
+      throw InterpreterError.invalidTensorDataCount(provided: data.count, required: byteCount)
+    }
+
+    let status = data.withUnsafeBytes { TFL_TensorCopyFromBuffer(cTensor, $0, data.count) }
+    guard status == kTfLiteOk else { throw InterpreterError.failedToCopyDataToInputTensor }
+    return try input(at: index)
+  }
+
+  /// Allocates memory for all input tensors based on their `TensorShape`s.
+  ///
+  /// - Note: This is a relatively expensive operation and should only be called after creating the
+  ///     interpreter and/or resizing any input tensors.
+  /// - Throws: An error if memory could not be allocated for the input tensors.
+  public func allocateTensors() throws {
+    guard TFL_InterpreterAllocateTensors(cInterpreter) == kTfLiteOk else {
+      throw InterpreterError.failedToAllocateTensors
+    }
+  }
+}
+
+// MARK: - Extensions
+
+extension String {
+  /// Returns a new `String` initialized by using the given format C array as a template into which
+  /// the remaining argument values are substituted according to the user’s default locale.
+  ///
+  /// - Note: Returns `nil` if a new `String` could not be constructed from the given values.
+  /// - Parameters:
+  ///   - cFormat: The format C array as a template for substituting values.
+  ///   - arguments: A C pointer to a `va_list` of arguments to substitute into `cFormat`.
+  init?(cFormat: UnsafePointer<CChar>, arguments: CVaListPointer) {
+    var buffer: UnsafeMutablePointer<CChar>?
+    guard vasprintf(&buffer, cFormat, arguments) != 0, let cString = buffer else { return nil }
+    self.init(validatingUTF8: cString)
+  }
+}
diff --git a/tensorflow/lite/experimental/swift/Sources/InterpreterError.swift b/tensorflow/lite/experimental/swift/Sources/InterpreterError.swift
new file mode 100644
index 0000000000000000000000000000000000000000..5de58b997a76b6bf9493525694bc9f9e4e6b6c1c
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/Sources/InterpreterError.swift
@@ -0,0 +1,99 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+import Foundation
+
+/// TensorFlow Lite interpreter errors.
+public enum InterpreterError: Error {
+  case invalidTensorIndex(index: Int, maxIndex: Int)
+  case invalidTensorDataCount(provided: Int, required: Int)
+  case invalidTensorDataType
+  case failedToLoadModel
+  case failedToCreateInterpreter
+  case failedToResizeInputTensor(index: Int)
+  case failedToCopyDataToInputTensor
+  case failedToAllocateTensors
+  case allocateTensorsRequired
+  case invokeInterpreterRequired
+  case tensorFlowLiteError(String)
+}
+
+// MARK: - Extensions
+
+extension InterpreterError: LocalizedError {
+  /// Localized description of the interpreter error.
+  public var errorDescription: String? {
+    switch self {
+    case .invalidTensorIndex(let index, let maxIndex):
+      return "Invalid tensor index \(index), max index is \(maxIndex)."
+    case .invalidTensorDataCount(let providedCount, let requiredCount):
+      return "Provided data count \(providedCount) must match the required count \(requiredCount)."
+    case .invalidTensorDataType:
+      return "Tensor data type is unsupported or could not be determined because of a model error."
+    case .failedToLoadModel:
+      return "Failed to load the given model."
+    case .failedToCreateInterpreter:
+      return "Failed to create the interpreter."
+    case .failedToResizeInputTensor(let index):
+      return "Failed to resize input tesnor at index \(index)."
+    case .failedToCopyDataToInputTensor:
+      return "Failed to copy data to input tensor."
+    case .failedToAllocateTensors:
+      return "Failed to allocate memory for input tensors."
+    case .allocateTensorsRequired:
+      return "Must call allocateTensors()."
+    case .invokeInterpreterRequired:
+      return "Must call invoke()."
+    case .tensorFlowLiteError(let message):
+      return "TensorFlow Lite Error: \(message)"
+    }
+  }
+}
+
+extension InterpreterError: CustomStringConvertible {
+  /// Textual representation of the TensorFlow Lite interpreter error.
+  public var description: String {
+    return errorDescription ?? "Unknown error."
+  }
+}
+
+#if swift(>=4.2)
+extension InterpreterError: Equatable {}
+#else
+extension InterpreterError: Equatable {
+  public static func == (lhs: InterpreterError, rhs: InterpreterError) -> Bool {
+    switch (lhs, rhs) {
+    case (.invalidTensorDataType, .invalidTensorDataType),
+         (.failedToLoadModel, .failedToLoadModel),
+         (.failedToCreateInterpreter, .failedToCreateInterpreter),
+         (.failedToAllocateTensors, .failedToAllocateTensors),
+         (.allocateTensorsRequired, .allocateTensorsRequired),
+         (.invokeInterpreterRequired, .invokeInterpreterRequired):
+      return true
+    case (.invalidTensorIndex(let lhsIndex, let lhsMaxIndex),
+          .invalidTensorIndex(let rhsIndex, let rhsMaxIndex)):
+      return lhsIndex == rhsIndex && lhsMaxIndex == rhsMaxIndex
+    case (.invalidTensorDataCount(let lhsProvidedCount, let lhsRequiredCount),
+          .invalidTensorDataCount(let rhsProvidedCount, let rhsRequiredCount)):
+      return lhsProvidedCount == rhsProvidedCount && lhsRequiredCount == rhsRequiredCount
+    case (.failedToResizeInputTensor(let lhsIndex), .failedToResizeInputTensor(let rhsIndex)):
+      return lhsIndex == rhsIndex
+    case (.tensorFlowLiteError(let lhsMessage), .tensorFlowLiteError(let rhsMessage)):
+      return lhsMessage == rhsMessage
+    default:
+      return false
+    }
+  }
+}
+#endif  // swift(>=4.2)
diff --git a/tensorflow/lite/experimental/swift/Sources/InterpreterOptions.swift b/tensorflow/lite/experimental/swift/Sources/InterpreterOptions.swift
new file mode 100644
index 0000000000000000000000000000000000000000..2365fd7ade0f9562250b239308f6a13b16c35784
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/Sources/InterpreterOptions.swift
@@ -0,0 +1,29 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+import Foundation
+
+/// Custom configuration options for a TensorFlow Lite interpreter.
+public struct InterpreterOptions: Equatable {
+
+  /// Maximum number of CPU threads that the interpreter should run on. Default is `nil` which
+  /// indicates that the `Interpreter` will decide the number of threads to use.
+  public var threadCount: Int? = nil
+
+  /// Whether error logging to the console is enabled. The default is `false`.
+  public var isErrorLoggingEnabled = false
+
+  /// Creates a new instance of interpreter options.
+  public init() {}
+}
diff --git a/tensorflow/lite/experimental/swift/Sources/Model.swift b/tensorflow/lite/experimental/swift/Sources/Model.swift
new file mode 100644
index 0000000000000000000000000000000000000000..e8c49ff1ae10cc20d1c50b8e8340950cb1491722
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/Sources/Model.swift
@@ -0,0 +1,40 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+import Foundation
+import TensorFlowLiteCAPI
+
+/// A TensorFlow Lite model used by the 'Interpreter` to perform inference.
+final class Model {
+
+  /// The `TFL_Model` C pointer type represented as an `UnsafePointer<TFL_Model>`.
+  typealias CModel = OpaquePointer
+
+  /// The underlying `TFL_Model` C pointer.
+  let cModel: CModel?
+
+  /// Creates a new model instance.
+  ///
+  /// - Precondition: Initialization can fail if the given `filePath` is invalid.
+  /// - Parameters:
+  ///   - filePath: Local file path to a TensorFlow Lite model.
+  init?(filePath: String) {
+    guard !filePath.isEmpty, let cModel = TFL_NewModelFromFile(filePath) else { return nil }
+    self.cModel = cModel
+  }
+
+  deinit {
+    TFL_DeleteModel(cModel)
+  }
+}
diff --git a/tensorflow/lite/experimental/swift/Sources/QuantizationParameters.swift b/tensorflow/lite/experimental/swift/Sources/QuantizationParameters.swift
new file mode 100644
index 0000000000000000000000000000000000000000..f36787564478115e19584b933a10fb0458e06c71
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/Sources/QuantizationParameters.swift
@@ -0,0 +1,38 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+import Foundation
+
+/// Parameters that determine the mapping of quantized values to real values. Quantized values can
+/// be mapped to float values using the following conversion:
+/// `realValue = scale * (quantizedValue - zeroPoint)`.
+public struct QuantizationParameters {
+
+  /// Difference between real values corresponding to consecutive quantized values differing by 1.
+  /// For example, the range of quantized values for `UInt8` data type is [0, 255].
+  public let scale: Float
+
+  /// Quantized value that corresponds to the real 0 value.
+  public let zeroPoint: Int
+
+  /// Creates a new quantization parameters instance.
+  ///
+  /// - Parameters:
+  ///   - scale: Scale value for asymmetric quantization.
+  ///   - zeroPoint: Zero point for asymmetric quantization.
+  init(scale: Float, zeroPoint: Int) {
+    self.scale = scale
+    self.zeroPoint = zeroPoint
+  }
+}
diff --git a/tensorflow/lite/experimental/swift/Sources/Tensor.swift b/tensorflow/lite/experimental/swift/Sources/Tensor.swift
new file mode 100644
index 0000000000000000000000000000000000000000..b738d8754914e20ac4c1cb991c92b029828f66d2
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/Sources/Tensor.swift
@@ -0,0 +1,138 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+import Foundation
+import TensorFlowLiteCAPI
+
+/// An input or output tensor in a TensorFlow Lite graph.
+public struct Tensor {
+
+  /// Name of the tensor.
+  public let name: String
+
+  /// Data type of the tensor.
+  public let dataType: TensorDataType
+
+  /// Shape of the tensor.
+  public let shape: TensorShape
+
+  /// Data in the input or output tensor.
+  public let data: Data
+
+  /// Quantization parameters for the tensor if using a quantized model.
+  public let quantizationParameters: QuantizationParameters?
+
+  /// Creates a new input or output tensor instance.
+  ///
+  /// - Parameters:
+  ///   - name: Name of the tensor.
+  ///   - dataType: Data type of the tensor.
+  ///   - data: Data in the input tensor.
+  ///   - quantizationParameters Quantization parameters for the tensor if using a quantized model.
+  ///       The default is `nil`.
+  init(
+    name: String,
+    dataType: TensorDataType,
+    shape: TensorShape,
+    data: Data,
+    quantizationParameters: QuantizationParameters? = nil
+  ) {
+    self.name = name
+    self.dataType = dataType
+    self.shape = shape
+    self.data = data
+    self.quantizationParameters = quantizationParameters
+  }
+}
+
+/// Supported TensorFlow Lite tensor data types.
+public enum TensorDataType: Equatable {
+  /// 32-bit single precision floating point tensor data type.
+  case float32
+  /// 8-bit unsigned integer tensor data type.
+  case uInt8
+  /// 16-bit signed integer tensor data type.
+  case int16
+  /// 32-bit signed integer tensor data type.
+  case int32
+  /// 64-bit signed integer tensor data type.
+  case int64
+  /// Boolean tensor data type.
+  case bool
+
+  /// Creates a new tensor data type from the given `TFL_Type` or `nil` if the data type is
+  /// unsupported or could not be determined because there was an error.
+  ///
+  /// - Parameter type: A data type supported by a tensor.
+  init?(type: TFL_Type) {
+    switch type {
+    case kTfLiteFloat32:
+      self = .float32
+    case kTfLiteUInt8:
+      self = .uInt8
+    case kTfLiteInt16:
+      self = .int16
+    case kTfLiteInt32:
+      self = .int32
+    case kTfLiteInt64:
+      self = .int64
+    case kTfLiteBool:
+      self = .bool
+    case kTfLiteNoType:
+      fallthrough
+    default:
+      return nil
+    }
+  }
+}
+
+/// The shape of a TensorFlow Lite tensor.
+public struct TensorShape {
+
+  /// The number of dimensions of the tensor.
+  public let rank: Int
+
+  /// Array of dimensions for the tensor.
+  public let dimensions: [Int]
+
+  /// Array of `Int32` dimensions for the tensor.
+  var int32Dimensions: [Int32] { return dimensions.map(Int32.init) }
+
+  /// Creates a new tensor shape instance with the given array of dimensions.
+  ///
+  /// - Parameters:
+  ///   - dimensions: Dimensions for the tensor.
+  public init(_ dimensions: [Int]) {
+    self.rank = dimensions.count
+    self.dimensions = dimensions
+  }
+
+  /// Creates a new tensor shape instance with the given elements representing the dimensions.
+  ///
+  /// - Parameters:
+  ///   - elements: Dimensions for the tensor.
+  public init(_ elements: Int...) {
+    self.init(elements)
+  }
+}
+
+extension TensorShape: ExpressibleByArrayLiteral {
+  /// Creates a new tensor shape instance with the given array literal representing the dimensions.
+  ///
+  /// - Parameters:
+  ///   - arrayLiteral: Dimensions for the tensor.
+  public init(arrayLiteral: Int...) {
+    self.init(arrayLiteral)
+  }
+}
diff --git a/tensorflow/lite/experimental/swift/TensorFlowLite.tulsiproj/Configs/TensorFlowLite.tulsigen b/tensorflow/lite/experimental/swift/TensorFlowLite.tulsiproj/Configs/TensorFlowLite.tulsigen
new file mode 100644
index 0000000000000000000000000000000000000000..16bc6cbfe8f554caad2cba3cae11b364b34ed64d
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/TensorFlowLite.tulsiproj/Configs/TensorFlowLite.tulsigen
@@ -0,0 +1,57 @@
+{
+  "sourceFilters" : [
+    "tensorflow/lite/experimental/c",
+    "tensorflow/lite/experimental/swift",
+    "tensorflow/lite/experimental/swift/Sources",
+    "tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp",
+    "tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Base.lproj",
+    "tensorflow/lite/experimental/swift/Tests",
+  ],
+  "buildTargets" : [
+    "//tensorflow/lite/experimental/swift:TensorFlowLite",
+    "//tensorflow/lite/experimental/swift:TensorFlowLiteApp",
+    "//tensorflow/lite/experimental/swift:TensorFlowLiteTests",
+  ],
+  "projectName" : "TensorFlowLite",
+  "optionSet" : {
+    "LaunchActionPreActionScript" : {
+      "p" : "$(inherited)"
+    },
+    "BazelBuildStartupOptionsRelease" : {
+      "p" : "$(inherited)"
+    },
+    "BazelBuildOptionsRelease" : {
+      "p" : "$(inherited)"
+    },
+    "BazelBuildOptionsDebug" : {
+      "p" : "$(inherited)"
+    },
+    "EnvironmentVariables" : {
+      "p" : "$(inherited)"
+    },
+    "BuildActionPreActionScript" : {
+      "p" : "$(inherited)"
+    },
+    "CommandlineArguments" : {
+      "p" : "$(inherited)"
+    },
+    "TestActionPreActionScript" : {
+      "p" : "$(inherited)"
+    },
+    "BazelBuildStartupOptionsDebug" : {
+      "p" : "$(inherited)"
+    },
+    "BuildActionPostActionScript" : {
+      "p" : "$(inherited)"
+    },
+    "TestActionPostActionScript" : {
+      "p" : "$(inherited)"
+    },
+    "LaunchActionPostActionScript" : {
+      "p" : "$(inherited)"
+    }
+  },
+  "additionalFilePaths" : [
+    "tensorflow/lite/experimental/swift/BUILD"
+  ]
+}
diff --git a/tensorflow/lite/experimental/swift/TensorFlowLite.tulsiproj/project.tulsiconf b/tensorflow/lite/experimental/swift/TensorFlowLite.tulsiproj/project.tulsiconf
new file mode 100644
index 0000000000000000000000000000000000000000..82ac8aa38126021c176773e4093352bcbecd8603
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/TensorFlowLite.tulsiproj/project.tulsiconf
@@ -0,0 +1,14 @@
+{
+  "configDefaults" : {
+    "optionSet" : {
+      "ProjectPrioritizesSwift" : {
+        "p" : "YES"
+      }
+    }
+  },
+  "projectName" : "TensorFlowLite",
+  "packages" : [
+    "tensorflow/lite/experimental/swift"
+  ],
+  "workspaceRoot" : "../../../../.."
+}
diff --git a/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp.xcodeproj/project.pbxproj b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp.xcodeproj/project.pbxproj
new file mode 100644
index 0000000000000000000000000000000000000000..fbbf9a1de2c8e82ab486b99b9e9b8c6dfe80868e
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp.xcodeproj/project.pbxproj
@@ -0,0 +1,345 @@
+// !$*UTF8*$!
+{
+	archiveVersion = 1;
+	classes = {
+	};
+	objectVersion = 50;
+	objects = {
+
+/* Begin PBXBuildFile section */
+		4A7304B421500B8400C90B21 /* Data+TensorFlowLite.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4A7304B321500B8300C90B21 /* Data+TensorFlowLite.swift */; };
+		4AA72B732146ED64006C3AEF /* AppDelegate.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA72B722146ED64006C3AEF /* AppDelegate.swift */; };
+		4AA72B752146ED64006C3AEF /* ViewController.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA72B742146ED64006C3AEF /* ViewController.swift */; };
+		4AA72B782146ED64006C3AEF /* Main.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = 4AA72B762146ED64006C3AEF /* Main.storyboard */; };
+		4AA72B7A2146ED66006C3AEF /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 4AA72B792146ED66006C3AEF /* Assets.xcassets */; };
+		4AA72B7D2146ED66006C3AEF /* LaunchScreen.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = 4AA72B7B2146ED66006C3AEF /* LaunchScreen.storyboard */; };
+		4ADDE0CE2176600E00FF07A2 /* Array+TensorFlowLite.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4ADDE0CD2176600900FF07A2 /* Array+TensorFlowLite.swift */; };
+/* End PBXBuildFile section */
+
+/* Begin PBXFileReference section */
+		4A7304B321500B8300C90B21 /* Data+TensorFlowLite.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = "Data+TensorFlowLite.swift"; sourceTree = "<group>"; };
+		4AA72B6F2146ED64006C3AEF /* TensorFlowLiteApp.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = TensorFlowLiteApp.app; sourceTree = BUILT_PRODUCTS_DIR; };
+		4AA72B722146ED64006C3AEF /* AppDelegate.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AppDelegate.swift; sourceTree = "<group>"; };
+		4AA72B742146ED64006C3AEF /* ViewController.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ViewController.swift; sourceTree = "<group>"; };
+		4AA72B772146ED64006C3AEF /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/Main.storyboard; sourceTree = "<group>"; };
+		4AA72B792146ED66006C3AEF /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
+		4AA72B7C2146ED66006C3AEF /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/LaunchScreen.storyboard; sourceTree = "<group>"; };
+		4AA72B7E2146ED66006C3AEF /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
+		4ADDE0CD2176600900FF07A2 /* Array+TensorFlowLite.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = "Array+TensorFlowLite.swift"; sourceTree = "<group>"; };
+/* End PBXFileReference section */
+
+/* Begin PBXFrameworksBuildPhase section */
+		4AA72B6C2146ED64006C3AEF /* Frameworks */ = {
+			isa = PBXFrameworksBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXFrameworksBuildPhase section */
+
+/* Begin PBXGroup section */
+		4AA72B662146ED64006C3AEF = {
+			isa = PBXGroup;
+			children = (
+				4AA72B712146ED64006C3AEF /* TensorFlowLiteApp */,
+				4AA72B702146ED64006C3AEF /* Products */,
+			);
+			sourceTree = "<group>";
+		};
+		4AA72B702146ED64006C3AEF /* Products */ = {
+			isa = PBXGroup;
+			children = (
+				4AA72B6F2146ED64006C3AEF /* TensorFlowLiteApp.app */,
+			);
+			name = Products;
+			sourceTree = "<group>";
+		};
+		4AA72B712146ED64006C3AEF /* TensorFlowLiteApp */ = {
+			isa = PBXGroup;
+			children = (
+				4AA72B722146ED64006C3AEF /* AppDelegate.swift */,
+				4ADDE0CD2176600900FF07A2 /* Array+TensorFlowLite.swift */,
+				4A7304B321500B8300C90B21 /* Data+TensorFlowLite.swift */,
+				4AA72B742146ED64006C3AEF /* ViewController.swift */,
+				4AA72B762146ED64006C3AEF /* Main.storyboard */,
+				4AA72B792146ED66006C3AEF /* Assets.xcassets */,
+				4AA72B7B2146ED66006C3AEF /* LaunchScreen.storyboard */,
+				4AA72B7E2146ED66006C3AEF /* Info.plist */,
+			);
+			path = TensorFlowLiteApp;
+			sourceTree = "<group>";
+		};
+/* End PBXGroup section */
+
+/* Begin PBXNativeTarget section */
+		4AA72B6E2146ED64006C3AEF /* TensorFlowLiteApp */ = {
+			isa = PBXNativeTarget;
+			buildConfigurationList = 4AA72B812146ED66006C3AEF /* Build configuration list for PBXNativeTarget "TensorFlowLiteApp" */;
+			buildPhases = (
+				4AA72B6B2146ED64006C3AEF /* Sources */,
+				4AA72B6C2146ED64006C3AEF /* Frameworks */,
+				4AA72B6D2146ED64006C3AEF /* Resources */,
+			);
+			buildRules = (
+			);
+			dependencies = (
+			);
+			name = TensorFlowLiteApp;
+			productName = TensorFlowLiteApp;
+			productReference = 4AA72B6F2146ED64006C3AEF /* TensorFlowLiteApp.app */;
+			productType = "com.apple.product-type.application";
+		};
+/* End PBXNativeTarget section */
+
+/* Begin PBXProject section */
+		4AA72B672146ED64006C3AEF /* Project object */ = {
+			isa = PBXProject;
+			attributes = {
+				LastSwiftUpdateCheck = 0940;
+				LastUpgradeCheck = 0940;
+				ORGANIZATIONNAME = Google;
+				TargetAttributes = {
+					4AA72B6E2146ED64006C3AEF = {
+						CreatedOnToolsVersion = 9.4.1;
+					};
+				};
+			};
+			buildConfigurationList = 4AA72B6A2146ED64006C3AEF /* Build configuration list for PBXProject "TensorFlowLiteApp" */;
+			compatibilityVersion = "Xcode 9.3";
+			developmentRegion = en;
+			hasScannedForEncodings = 0;
+			knownRegions = (
+				en,
+				Base,
+			);
+			mainGroup = 4AA72B662146ED64006C3AEF;
+			productRefGroup = 4AA72B702146ED64006C3AEF /* Products */;
+			projectDirPath = "";
+			projectRoot = "";
+			targets = (
+				4AA72B6E2146ED64006C3AEF /* TensorFlowLiteApp */,
+			);
+		};
+/* End PBXProject section */
+
+/* Begin PBXResourcesBuildPhase section */
+		4AA72B6D2146ED64006C3AEF /* Resources */ = {
+			isa = PBXResourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				4AA72B7D2146ED66006C3AEF /* LaunchScreen.storyboard in Resources */,
+				4AA72B7A2146ED66006C3AEF /* Assets.xcassets in Resources */,
+				4AA72B782146ED64006C3AEF /* Main.storyboard in Resources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXResourcesBuildPhase section */
+
+/* Begin PBXSourcesBuildPhase section */
+		4AA72B6B2146ED64006C3AEF /* Sources */ = {
+			isa = PBXSourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				4AA72B732146ED64006C3AEF /* AppDelegate.swift in Sources */,
+				4ADDE0CE2176600E00FF07A2 /* Array+TensorFlowLite.swift in Sources */,
+				4A7304B421500B8400C90B21 /* Data+TensorFlowLite.swift in Sources */,
+				4AA72B752146ED64006C3AEF /* ViewController.swift in Sources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXSourcesBuildPhase section */
+
+/* Begin PBXVariantGroup section */
+		4AA72B762146ED64006C3AEF /* Main.storyboard */ = {
+			isa = PBXVariantGroup;
+			children = (
+				4AA72B772146ED64006C3AEF /* Base */,
+			);
+			name = Main.storyboard;
+			sourceTree = "<group>";
+		};
+		4AA72B7B2146ED66006C3AEF /* LaunchScreen.storyboard */ = {
+			isa = PBXVariantGroup;
+			children = (
+				4AA72B7C2146ED66006C3AEF /* Base */,
+			);
+			name = LaunchScreen.storyboard;
+			sourceTree = "<group>";
+		};
+/* End PBXVariantGroup section */
+
+/* Begin XCBuildConfiguration section */
+		4AA72B7F2146ED66006C3AEF /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ALWAYS_SEARCH_USER_PATHS = NO;
+				CLANG_ANALYZER_NONNULL = YES;
+				CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
+				CLANG_CXX_LANGUAGE_STANDARD = "gnu++14";
+				CLANG_CXX_LIBRARY = "libc++";
+				CLANG_ENABLE_MODULES = YES;
+				CLANG_ENABLE_OBJC_ARC = YES;
+				CLANG_ENABLE_OBJC_WEAK = YES;
+				CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
+				CLANG_WARN_BOOL_CONVERSION = YES;
+				CLANG_WARN_COMMA = YES;
+				CLANG_WARN_CONSTANT_CONVERSION = YES;
+				CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
+				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
+				CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
+				CLANG_WARN_EMPTY_BODY = YES;
+				CLANG_WARN_ENUM_CONVERSION = YES;
+				CLANG_WARN_INFINITE_RECURSION = YES;
+				CLANG_WARN_INT_CONVERSION = YES;
+				CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
+				CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
+				CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
+				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
+				CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
+				CLANG_WARN_STRICT_PROTOTYPES = YES;
+				CLANG_WARN_SUSPICIOUS_MOVE = YES;
+				CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
+				CLANG_WARN_UNREACHABLE_CODE = YES;
+				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
+				CODE_SIGN_IDENTITY = "iPhone Developer";
+				COPY_PHASE_STRIP = NO;
+				DEBUG_INFORMATION_FORMAT = dwarf;
+				ENABLE_STRICT_OBJC_MSGSEND = YES;
+				ENABLE_TESTABILITY = YES;
+				GCC_C_LANGUAGE_STANDARD = gnu11;
+				GCC_DYNAMIC_NO_PIC = NO;
+				GCC_NO_COMMON_BLOCKS = YES;
+				GCC_OPTIMIZATION_LEVEL = 0;
+				GCC_PREPROCESSOR_DEFINITIONS = (
+					"DEBUG=1",
+					"$(inherited)",
+				);
+				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
+				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
+				GCC_WARN_UNDECLARED_SELECTOR = YES;
+				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
+				GCC_WARN_UNUSED_FUNCTION = YES;
+				GCC_WARN_UNUSED_VARIABLE = YES;
+				IPHONEOS_DEPLOYMENT_TARGET = 11.4;
+				MTL_ENABLE_DEBUG_INFO = YES;
+				ONLY_ACTIVE_ARCH = YES;
+				SDKROOT = iphoneos;
+				SWIFT_ACTIVE_COMPILATION_CONDITIONS = DEBUG;
+				SWIFT_OPTIMIZATION_LEVEL = "-Onone";
+			};
+			name = Debug;
+		};
+		4AA72B802146ED66006C3AEF /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ALWAYS_SEARCH_USER_PATHS = NO;
+				CLANG_ANALYZER_NONNULL = YES;
+				CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
+				CLANG_CXX_LANGUAGE_STANDARD = "gnu++14";
+				CLANG_CXX_LIBRARY = "libc++";
+				CLANG_ENABLE_MODULES = YES;
+				CLANG_ENABLE_OBJC_ARC = YES;
+				CLANG_ENABLE_OBJC_WEAK = YES;
+				CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
+				CLANG_WARN_BOOL_CONVERSION = YES;
+				CLANG_WARN_COMMA = YES;
+				CLANG_WARN_CONSTANT_CONVERSION = YES;
+				CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
+				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
+				CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
+				CLANG_WARN_EMPTY_BODY = YES;
+				CLANG_WARN_ENUM_CONVERSION = YES;
+				CLANG_WARN_INFINITE_RECURSION = YES;
+				CLANG_WARN_INT_CONVERSION = YES;
+				CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
+				CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
+				CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
+				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
+				CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
+				CLANG_WARN_STRICT_PROTOTYPES = YES;
+				CLANG_WARN_SUSPICIOUS_MOVE = YES;
+				CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
+				CLANG_WARN_UNREACHABLE_CODE = YES;
+				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
+				CODE_SIGN_IDENTITY = "iPhone Developer";
+				COPY_PHASE_STRIP = NO;
+				DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
+				ENABLE_NS_ASSERTIONS = NO;
+				ENABLE_STRICT_OBJC_MSGSEND = YES;
+				GCC_C_LANGUAGE_STANDARD = gnu11;
+				GCC_NO_COMMON_BLOCKS = YES;
+				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
+				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
+				GCC_WARN_UNDECLARED_SELECTOR = YES;
+				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
+				GCC_WARN_UNUSED_FUNCTION = YES;
+				GCC_WARN_UNUSED_VARIABLE = YES;
+				IPHONEOS_DEPLOYMENT_TARGET = 11.4;
+				MTL_ENABLE_DEBUG_INFO = NO;
+				SDKROOT = iphoneos;
+				SWIFT_COMPILATION_MODE = wholemodule;
+				SWIFT_OPTIMIZATION_LEVEL = "-O";
+				VALIDATE_PRODUCT = YES;
+			};
+			name = Release;
+		};
+		4AA72B822146ED66006C3AEF /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
+				CODE_SIGN_STYLE = Automatic;
+				INFOPLIST_FILE = TensorFlowLiteApp/Info.plist;
+				LD_RUNPATH_SEARCH_PATHS = (
+					"$(inherited)",
+					"@executable_path/Frameworks",
+				);
+				PRODUCT_BUNDLE_IDENTIFIER = com.tensorflow.lite.swift.TensorFlowLite;
+				PRODUCT_NAME = "$(TARGET_NAME)";
+				SWIFT_VERSION = 4.0;
+				TARGETED_DEVICE_FAMILY = "1,2";
+			};
+			name = Debug;
+		};
+		4AA72B832146ED66006C3AEF /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
+				CODE_SIGN_STYLE = Automatic;
+				INFOPLIST_FILE = TensorFlowLiteApp/Info.plist;
+				LD_RUNPATH_SEARCH_PATHS = (
+					"$(inherited)",
+					"@executable_path/Frameworks",
+				);
+				PRODUCT_BUNDLE_IDENTIFIER = com.tensorflow.lite.swift.TensorFlowLite;
+				PRODUCT_NAME = "$(TARGET_NAME)";
+				SWIFT_VERSION = 4.0;
+				TARGETED_DEVICE_FAMILY = "1,2";
+			};
+			name = Release;
+		};
+/* End XCBuildConfiguration section */
+
+/* Begin XCConfigurationList section */
+		4AA72B6A2146ED64006C3AEF /* Build configuration list for PBXProject "TensorFlowLiteApp" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				4AA72B7F2146ED66006C3AEF /* Debug */,
+				4AA72B802146ED66006C3AEF /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+		4AA72B812146ED66006C3AEF /* Build configuration list for PBXNativeTarget "TensorFlowLiteApp" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				4AA72B822146ED66006C3AEF /* Debug */,
+				4AA72B832146ED66006C3AEF /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+/* End XCConfigurationList section */
+	};
+	rootObject = 4AA72B672146ED64006C3AEF /* Project object */;
+}
diff --git a/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/AppDelegate.swift b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/AppDelegate.swift
new file mode 100644
index 0000000000000000000000000000000000000000..ffa90a06adb0b9f93575c8390cd30bd589e43ac7
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/AppDelegate.swift
@@ -0,0 +1,24 @@
+import UIKit
+
+@UIApplicationMain
+
+final class AppDelegate: UIResponder, UIApplicationDelegate {
+
+  /// The main window of the app.
+  var window: UIWindow?
+
+  func application(
+    _ application: UIApplication,
+    didFinishLaunchingWithOptions launchOptions: [UIApplication.LaunchOptionsKey: Any]? = nil
+  ) -> Bool {
+    return true
+  }
+}
+
+// MARK: - Extensions
+
+#if !swift(>=4.2)
+extension UIApplication {
+  typealias LaunchOptionsKey = UIApplicationLaunchOptionsKey
+}
+#endif  // !swift(>=4.2)
diff --git a/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Array+TensorFlowLite.swift b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Array+TensorFlowLite.swift
new file mode 100644
index 0000000000000000000000000000000000000000..56df1ce6597aacf307f7a89a084527ea93c303c2
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Array+TensorFlowLite.swift
@@ -0,0 +1,22 @@
+import Foundation
+
+extension Array {
+  /// Creates a new array from the bytes of the given unsafe data.
+  ///
+  /// - Warning: The array's `Element` type must be trivial in that it can be copied bit for bit
+  ///     with no indirection or reference-counting operations; otherwise, copying the raw bytes in
+  ///     the `unsafeData`'s buffer to a new array returns an unsafe copy.
+  /// - Note: Returns `nil` if `unsafeData.count` is not a multiple of
+  ///     `MemoryLayout<Element>.stride`.
+  /// - Parameter unsafeData: The data containing the bytes to turn into an array.
+  init?(unsafeData: Data) {
+    guard unsafeData.count % MemoryLayout<Element>.stride == 0 else { return nil }
+    let elements = unsafeData.withUnsafeBytes {
+      UnsafeBufferPointer<Element>(
+        start: $0,
+        count: unsafeData.count / MemoryLayout<Element>.stride
+      )
+    }
+    self.init(elements)
+  }
+}
diff --git a/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Assets.xcassets/AppIcon.appiconset/Contents.json b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Assets.xcassets/AppIcon.appiconset/Contents.json
new file mode 100644
index 0000000000000000000000000000000000000000..d8db8d65fd79fd541b2b7eba75c7378af3448f9c
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Assets.xcassets/AppIcon.appiconset/Contents.json
@@ -0,0 +1,98 @@
+{
+  "images" : [
+    {
+      "idiom" : "iphone",
+      "size" : "20x20",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "iphone",
+      "size" : "20x20",
+      "scale" : "3x"
+    },
+    {
+      "idiom" : "iphone",
+      "size" : "29x29",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "iphone",
+      "size" : "29x29",
+      "scale" : "3x"
+    },
+    {
+      "idiom" : "iphone",
+      "size" : "40x40",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "iphone",
+      "size" : "40x40",
+      "scale" : "3x"
+    },
+    {
+      "idiom" : "iphone",
+      "size" : "60x60",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "iphone",
+      "size" : "60x60",
+      "scale" : "3x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "20x20",
+      "scale" : "1x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "20x20",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "29x29",
+      "scale" : "1x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "29x29",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "40x40",
+      "scale" : "1x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "40x40",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "76x76",
+      "scale" : "1x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "76x76",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "ipad",
+      "size" : "83.5x83.5",
+      "scale" : "2x"
+    },
+    {
+      "idiom" : "ios-marketing",
+      "size" : "1024x1024",
+      "scale" : "1x"
+    }
+  ],
+  "info" : {
+    "version" : 1,
+    "author" : "xcode"
+  }
+}
\ No newline at end of file
diff --git a/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Assets.xcassets/Contents.json b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Assets.xcassets/Contents.json
new file mode 100644
index 0000000000000000000000000000000000000000..da4a164c918651cdd1e11dca5cc62c333f097601
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Assets.xcassets/Contents.json
@@ -0,0 +1,6 @@
+{
+  "info" : {
+    "version" : 1,
+    "author" : "xcode"
+  }
+}
\ No newline at end of file
diff --git a/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Base.lproj/LaunchScreen.storyboard b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Base.lproj/LaunchScreen.storyboard
new file mode 100644
index 0000000000000000000000000000000000000000..a07a1321be2e65323fadeca51487671c88f462c8
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Base.lproj/LaunchScreen.storyboard
@@ -0,0 +1,44 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="14109" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" launchScreen="YES" useTraitCollections="YES" colorMatched="YES" initialViewController="01J-lp-oVM">
+    <device id="retina4_7" orientation="portrait">
+        <adaptation id="fullscreen"/>
+    </device>
+    <dependencies>
+        <deployment identifier="iOS"/>
+        <plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="14088"/>
+        <capability name="documents saved in the Xcode 8 format" minToolsVersion="8.0"/>
+    </dependencies>
+    <scenes>
+        <!--View Controller-->
+        <scene sceneID="EHf-IW-A2E">
+            <objects>
+                <viewController id="01J-lp-oVM" sceneMemberID="viewController">
+                    <layoutGuides>
+                        <viewControllerLayoutGuide type="top" id="Llm-lL-Icb"/>
+                        <viewControllerLayoutGuide type="bottom" id="xb3-aO-Qok"/>
+                    </layoutGuides>
+                    <view key="view" contentMode="scaleToFill" id="Ze5-6b-2t3">
+                        <rect key="frame" x="0.0" y="0.0" width="375" height="667"/>
+                        <autoresizingMask key="autoresizingMask" widthSizable="YES" heightSizable="YES"/>
+                        <subviews>
+                            <label opaque="NO" userInteractionEnabled="NO" contentMode="left" horizontalHuggingPriority="251" verticalHuggingPriority="251" text="TensorFlowLite" textAlignment="center" lineBreakMode="tailTruncation" baselineAdjustment="alignBaselines" adjustsFontSizeToFit="NO" translatesAutoresizingMaskIntoConstraints="NO" id="3Gq-PV-hia">
+                                <rect key="frame" x="16" y="315" width="343" height="38.5"/>
+                                <fontDescription key="fontDescription" type="boldSystem" pointSize="32"/>
+                                <nil key="textColor"/>
+                                <nil key="highlightedColor"/>
+                            </label>
+                        </subviews>
+                        <color key="backgroundColor" red="1" green="1" blue="1" alpha="1" colorSpace="custom" customColorSpace="sRGB"/>
+                        <constraints>
+                            <constraint firstItem="3Gq-PV-hia" firstAttribute="leading" secondItem="Ze5-6b-2t3" secondAttribute="leading" constant="16" id="aXL-9T-5Pf"/>
+                            <constraint firstItem="3Gq-PV-hia" firstAttribute="centerY" secondItem="Ze5-6b-2t3" secondAttribute="centerY" id="cDf-Go-1FR"/>
+                            <constraint firstAttribute="trailing" secondItem="3Gq-PV-hia" secondAttribute="trailing" constant="16" id="fB9-BX-A3B"/>
+                        </constraints>
+                    </view>
+                </viewController>
+                <placeholder placeholderIdentifier="IBFirstResponder" id="iYj-Kq-Ea1" userLabel="First Responder" sceneMemberID="firstResponder"/>
+            </objects>
+            <point key="canvasLocation" x="52" y="374.66266866566718"/>
+        </scene>
+    </scenes>
+</document>
diff --git a/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Base.lproj/Main.storyboard b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Base.lproj/Main.storyboard
new file mode 100644
index 0000000000000000000000000000000000000000..10cae6e855311791044d6a25a16d855873223c43
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Base.lproj/Main.storyboard
@@ -0,0 +1,95 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="14460.31" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" useTraitCollections="YES" colorMatched="YES" initialViewController="BYZ-38-t0r">
+    <device id="retina4_7" orientation="portrait">
+        <adaptation id="fullscreen"/>
+    </device>
+    <dependencies>
+        <plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="14460.20"/>
+        <capability name="documents saved in the Xcode 8 format" minToolsVersion="8.0"/>
+    </dependencies>
+    <scenes>
+        <!--View Controller-->
+        <scene sceneID="tne-QT-ifu">
+            <objects>
+                <viewController storyboardIdentifier="viewController" useStoryboardIdentifierAsRestorationIdentifier="YES" id="BYZ-38-t0r" customClass="ViewController" customModule="TensorFlowLiteAppLib" sceneMemberID="viewController">
+                    <layoutGuides>
+                        <viewControllerLayoutGuide type="top" id="y3c-jy-aDJ"/>
+                        <viewControllerLayoutGuide type="bottom" id="wfy-db-euE"/>
+                    </layoutGuides>
+                    <view key="view" contentMode="scaleToFill" id="8bC-Xf-vdC">
+                        <rect key="frame" x="0.0" y="0.0" width="375" height="667"/>
+                        <autoresizingMask key="autoresizingMask" widthSizable="YES" heightSizable="YES"/>
+                        <subviews>
+                            <textView clipsSubviews="YES" multipleTouchEnabled="YES" contentMode="scaleToFill" editable="NO" selectable="NO" translatesAutoresizingMaskIntoConstraints="NO" id="7Mj-sL-hrd">
+                                <rect key="frame" x="0.0" y="367" width="375" height="300"/>
+                                <color key="backgroundColor" red="0.0" green="0.47843137250000001" blue="1" alpha="1" colorSpace="custom" customColorSpace="sRGB"/>
+                                <constraints>
+                                    <constraint firstAttribute="height" constant="300" id="YUb-MC-D5w"/>
+                                </constraints>
+                                <color key="textColor" cocoaTouchSystemColor="tableCellGroupedBackgroundColor"/>
+                                <fontDescription key="fontDescription" type="system" pointSize="14"/>
+                                <textInputTraits key="textInputTraits" autocapitalizationType="sentences"/>
+                            </textView>
+                            <toolbar opaque="NO" clearsContextBeforeDrawing="NO" contentMode="scaleToFill" translatesAutoresizingMaskIntoConstraints="NO" id="Qwg-EP-bd6" userLabel="Bottom Toolbar">
+                                <rect key="frame" x="0.0" y="323" width="375" height="44"/>
+                                <constraints>
+                                    <constraint firstAttribute="height" constant="44" id="jhT-Q0-E9N"/>
+                                </constraints>
+                                <items>
+                                    <barButtonItem style="plain" systemItem="flexibleSpace" id="P3q-uA-YUa"/>
+                                    <barButtonItem title="Invoke Interpreter" id="A4J-Mg-nmd" userLabel="Invoke Button">
+                                        <connections>
+                                            <action selector="invokeInterpreter:" destination="BYZ-38-t0r" id="lZU-x7-PsJ"/>
+                                        </connections>
+                                    </barButtonItem>
+                                    <barButtonItem style="plain" systemItem="flexibleSpace" id="Qad-Pa-ySg"/>
+                                </items>
+                            </toolbar>
+                            <toolbar opaque="NO" clearsContextBeforeDrawing="NO" contentMode="scaleToFill" translatesAutoresizingMaskIntoConstraints="NO" id="Gkb-TR-PCB" userLabel="Top Toolbar">
+                                <rect key="frame" x="0.0" y="28" width="375" height="44"/>
+                                <constraints>
+                                    <constraint firstAttribute="height" constant="44" id="hSD-2q-fUE"/>
+                                </constraints>
+                                <items>
+                                    <barButtonItem style="plain" id="LKw-TX-bbH">
+                                        <segmentedControl key="customView" opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="left" contentVerticalAlignment="top" segmentControlStyle="bar" selectedSegmentIndex="0" id="rhA-nW-xzT">
+                                            <rect key="frame" x="16" y="7" width="343" height="30"/>
+                                            <autoresizingMask key="autoresizingMask" flexibleMaxX="YES" flexibleMaxY="YES"/>
+                                            <segments>
+                                                <segment title="Add"/>
+                                                <segment title="AddQuantized"/>
+                                                <segment title="MultiAdd"/>
+                                            </segments>
+                                            <connections>
+                                                <action selector="modelChanged:" destination="BYZ-38-t0r" eventType="valueChanged" id="YnG-Ov-B5D"/>
+                                            </connections>
+                                        </segmentedControl>
+                                    </barButtonItem>
+                                </items>
+                            </toolbar>
+                        </subviews>
+                        <color key="backgroundColor" red="1" green="1" blue="1" alpha="1" colorSpace="custom" customColorSpace="sRGB"/>
+                        <constraints>
+                            <constraint firstAttribute="trailing" secondItem="Gkb-TR-PCB" secondAttribute="trailing" id="4Cr-Sf-I7n"/>
+                            <constraint firstItem="7Mj-sL-hrd" firstAttribute="bottom" secondItem="wfy-db-euE" secondAttribute="top" id="6ot-zD-sze"/>
+                            <constraint firstItem="7Mj-sL-hrd" firstAttribute="top" secondItem="Qwg-EP-bd6" secondAttribute="bottom" id="ELA-C6-NiG"/>
+                            <constraint firstAttribute="trailing" secondItem="7Mj-sL-hrd" secondAttribute="trailing" id="HDO-xr-mBl"/>
+                            <constraint firstItem="Gkb-TR-PCB" firstAttribute="leading" secondItem="8bC-Xf-vdC" secondAttribute="leading" id="Kmo-6K-gS4"/>
+                            <constraint firstItem="Qwg-EP-bd6" firstAttribute="leading" secondItem="8bC-Xf-vdC" secondAttribute="leading" id="hGu-lm-fMG"/>
+                            <constraint firstAttribute="trailing" secondItem="Qwg-EP-bd6" secondAttribute="trailing" id="iXR-LK-nTO"/>
+                            <constraint firstItem="7Mj-sL-hrd" firstAttribute="leading" secondItem="8bC-Xf-vdC" secondAttribute="leading" id="nr7-jW-ZYf"/>
+                            <constraint firstItem="Gkb-TR-PCB" firstAttribute="top" secondItem="y3c-jy-aDJ" secondAttribute="bottom" constant="8" id="uCF-VW-rR0"/>
+                        </constraints>
+                    </view>
+                    <connections>
+                        <outlet property="invokeButton" destination="A4J-Mg-nmd" id="UxZ-Ft-E45"/>
+                        <outlet property="modelControl" destination="rhA-nW-xzT" id="KKf-TT-BQ2"/>
+                        <outlet property="resultsTextView" destination="7Mj-sL-hrd" id="T4I-z4-tYA"/>
+                    </connections>
+                </viewController>
+                <placeholder placeholderIdentifier="IBFirstResponder" id="dkx-z0-nzr" sceneMemberID="firstResponder"/>
+            </objects>
+            <point key="canvasLocation" x="125.59999999999999" y="133.5832083958021"/>
+        </scene>
+    </scenes>
+</document>
diff --git a/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Data+TensorFlowLite.swift b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Data+TensorFlowLite.swift
new file mode 100644
index 0000000000000000000000000000000000000000..bc8a70c848390ad7ba584629563d7d75a9e32341
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Data+TensorFlowLite.swift
@@ -0,0 +1,13 @@
+import Foundation
+
+extension Data {
+  /// Creates a new buffer by copying the buffer pointer of the given array.
+  ///
+  /// - Warning: The given array's element type `T` must be trivial in that it can be copied bit
+  ///     for bit with no indirection or reference-counting operations; otherwise, reinterpreting
+  ///     data from the resulting buffer has undefined behavior.
+  /// - Parameter array: An array with elements of type `T`.
+  init<T>(copyingBufferOf array: [T]) {
+    self = array.withUnsafeBufferPointer(Data.init)
+  }
+}
diff --git a/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Info.plist b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Info.plist
new file mode 100644
index 0000000000000000000000000000000000000000..3ca3875f04e5789da9cfb34a44151cd06226a8f3
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/Info.plist
@@ -0,0 +1,46 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>CFBundleDevelopmentRegion</key>
+	<string>en</string>
+	<key>CFBundleExecutable</key>
+	<string>$(EXECUTABLE_NAME)</string>
+	<key>CFBundleIdentifier</key>
+	<string>$(PRODUCT_BUNDLE_IDENTIFIER)</string>
+	<key>CFBundleInfoDictionaryVersion</key>
+	<string>6.0</string>
+	<key>CFBundleName</key>
+	<string>$(PRODUCT_NAME)</string>
+	<key>CFBundlePackageType</key>
+	<string>APPL</string>
+	<key>CFBundleShortVersionString</key>
+	<string>1.0</string>
+	<key>CFBundleVersion</key>
+	<string>0.0.1</string>
+	<key>LSRequiresIPhoneOS</key>
+	<true/>
+	<key>NSCameraUsageDescription</key>
+	<string>NSCameraUsageDescription</string>
+	<key>NSPhotoLibraryUsageDescription</key>
+	<string>Select a photo to detect objects in.</string>
+	<key>UILaunchStoryboardName</key>
+	<string>LaunchScreen</string>
+	<key>UIMainStoryboardFile</key>
+	<string>Main</string>
+	<key>UIRequiredDeviceCapabilities</key>
+	<array>
+		<string>armv7</string>
+	</array>
+	<key>UISupportedInterfaceOrientations</key>
+	<array>
+		<string>UIInterfaceOrientationPortrait</string>
+		<string>UIInterfaceOrientationPortraitUpsideDown</string>
+	</array>
+	<key>UISupportedInterfaceOrientations~ipad</key>
+	<array>
+		<string>UIInterfaceOrientationPortrait</string>
+		<string>UIInterfaceOrientationPortraitUpsideDown</string>
+	</array>
+</dict>
+</plist>
diff --git a/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/ViewController.swift b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/ViewController.swift
new file mode 100644
index 0000000000000000000000000000000000000000..73c74fd19c996653d988977d551fcef683f18697
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/TestApps/TensorFlowLiteApp/TensorFlowLiteApp/ViewController.swift
@@ -0,0 +1,299 @@
+import TensorFlowLite
+import UIKit
+
+class ViewController: UIViewController {
+
+  // MARK: - Properties
+
+  /// TensorFlowLite interpreter object for performing inference from a given model.
+  private var interpreter: Interpreter?
+
+  /// Serial dispatch queue for managing `Interpreter` calls.
+  private let interpreterQueue = DispatchQueue(
+    label: Constant.dispatchQueueLabel,
+    qos: .userInitiated
+  )
+
+  /// The currently selected model.
+  private var currentModel: Model {
+    guard let currentModel = Model(rawValue: modelControl.selectedSegmentIndex) else {
+      preconditionFailure("Invalid model for selected segment index.")
+    }
+    return currentModel
+  }
+
+  /// A description of the current model.
+  private var modelDescription: String {
+    guard let interpreter = interpreter else { return "" }
+    let inputCount = interpreter.inputTensorCount
+    let outputCount = interpreter.outputTensorCount
+    let inputTensors = (0..<inputCount).map { index in
+      var tensorInfo = "  Input \(index + 1): "
+      do {
+        let tensor = try interpreter.input(at: index)
+        tensorInfo += "\(tensor)"
+      } catch let error {
+        tensorInfo += "\(error.localizedDescription)"
+      }
+      return tensorInfo
+    }.joined(separator: "\n")
+    let outputTensors = (0..<outputCount).map { index in
+      var tensorInfo = "  Output \(index + 1): "
+      do {
+        let tensor = try interpreter.output(at: index)
+        tensorInfo += "\(tensor)"
+      } catch let error {
+        tensorInfo += "\(error.localizedDescription)"
+      }
+      return tensorInfo
+    }.joined(separator: "\n")
+    return "Model Description:\n" +
+             "  Input Tensor Count = \(inputCount)\n\(inputTensors)\n\n" +
+             "  Output Tensor Count = \(outputCount)\n\(outputTensors)"
+  }
+
+  // MARK: - IBOutlets
+
+  /// A segmented control for changing models. See the `Model` enum for available models.
+  @IBOutlet private var modelControl: UISegmentedControl!
+
+  @IBOutlet private var resultsTextView: UITextView!
+  @IBOutlet private var invokeButton: UIBarButtonItem!
+
+  // MARK: - UIViewController
+
+  override func viewDidLoad() {
+    super.viewDidLoad()
+
+    invokeButton.isEnabled = false
+    loadModel()
+  }
+
+  // MARK: - IBActions
+
+  @IBAction func modelChanged(_ sender: Any) {
+    invokeButton.isEnabled = false
+    updateResultsText("Switched to the \(currentModel.description) model.")
+    loadModel()
+  }
+
+  @IBAction func invokeInterpreter(_ sender: Any) {
+    switch currentModel {
+    case .add:
+      invokeAdd()
+    case .addQuantized:
+      invokeAddQuantized()
+    case .multiAdd:
+      invokeMultiAdd()
+    }
+  }
+
+  // MARK: - Private
+
+  private func loadModel() {
+    let fileInfo = currentModel.fileInfo
+    guard let modelPath = Bundle.main.path(forResource: fileInfo.name, ofType: fileInfo.extension)
+    else {
+      updateResultsText("Failed to load the \(currentModel.description) model.")
+      return
+    }
+    setUpInterpreter(withModelPath: modelPath)
+  }
+
+  private func setUpInterpreter(withModelPath modelPath: String) {
+    interpreterQueue.async {
+      do {
+        var options = InterpreterOptions()
+        options.isErrorLoggingEnabled = true
+        self.interpreter = try Interpreter(modelPath: modelPath, options: options)
+      } catch let error {
+        self.updateResultsText(
+          "Failed to create the interpreter with error: \(error.localizedDescription)"
+        )
+        return
+      }
+      safeDispatchOnMain { self.invokeButton.isEnabled = true }
+    }
+  }
+
+  private func invokeAdd() {
+    interpreterQueue.async {
+      guard let interpreter = self.interpreter else {
+        self.updateResultsText(Constant.nilInterpreterErrorMessage)
+        return
+      }
+      do {
+        try interpreter.resizeInput(at: 0, to: [2])
+        try interpreter.allocateTensors()
+        let input: [Float32] = [1, 3]
+        let resultsText = self.modelDescription + "\n\n" +
+          "Performing 2 add operations on input \(input.description) equals: "
+        self.updateResultsText(resultsText)
+        let data = Data(copyingBufferOf: input)
+        try interpreter.copy(data, toInputAt: 0)
+        try interpreter.invoke()
+        let outputTensor = try interpreter.output(at: 0)
+        let results: () -> String = {
+          guard let results = [Float32](unsafeData: outputTensor.data) else { return "No results." }
+          return resultsText + results.description
+        }
+        self.updateResultsText(results())
+      } catch let error {
+        self.updateResultsText(
+          "Failed to invoke the interpreter with error: \(error.localizedDescription)"
+        )
+        return
+      }
+    }
+  }
+
+  private func invokeAddQuantized() {
+    interpreterQueue.async {
+      guard let interpreter = self.interpreter else {
+        self.updateResultsText(Constant.nilInterpreterErrorMessage)
+        return
+      }
+      do {
+        try interpreter.resizeInput(at: 0, to: [2])
+        try interpreter.allocateTensors()
+        let input: [UInt8] = [1, 3]
+        let resultsText = self.modelDescription + "\n\n" +
+          "Performing 2 add operations on quantized input \(input.description) equals: "
+        self.updateResultsText(resultsText)
+        let data = Data(input)
+        try interpreter.copy(data, toInputAt: 0)
+        try interpreter.invoke()
+        let outputTensor = try interpreter.output(at: 0)
+        let results: () -> String = {
+          guard let quantizationParameters = outputTensor.quantizationParameters else {
+            return "No results."
+          }
+          let quantizedResults = [UInt8](outputTensor.data)
+          let dequantizedResults = quantizedResults.map {
+            quantizationParameters.scale * Float(Int($0) - quantizationParameters.zeroPoint)
+          }
+          return resultsText + quantizedResults.description +
+                   ", dequantized results: " + dequantizedResults.description
+        }
+        self.updateResultsText(results())
+      } catch let error {
+        self.updateResultsText(
+          "Failed to invoke the interpreter with error: \(error.localizedDescription)"
+        )
+        return
+      }
+    }
+  }
+
+  private func invokeMultiAdd() {
+    interpreterQueue.async {
+      guard let interpreter = self.interpreter else {
+        self.updateResultsText(Constant.nilInterpreterErrorMessage)
+        return
+      }
+      do {
+        let shape = TensorShape(2)
+        try (0..<interpreter.inputTensorCount).forEach { index in
+          try interpreter.resizeInput(at: index, to: shape)
+        }
+        try interpreter.allocateTensors()
+        let inputs = try (0..<interpreter.inputTensorCount).map { index -> [Float32] in
+          let input = [Float32(index + 1), Float32(index + 2)]
+          let data = Data(copyingBufferOf: input)
+          try interpreter.copy(data, toInputAt: index)
+          return input
+        }
+        let resultsText = self.modelDescription + "\n\n" +
+          "Performing 3 add operations on inputs \(inputs.description) equals: "
+        self.updateResultsText(resultsText)
+        try interpreter.invoke()
+        let results = try (0..<interpreter.outputTensorCount).map { index -> [Float32] in
+          let tensor = try interpreter.output(at: index)
+          return [Float32](unsafeData: tensor.data) ?? []
+        }
+        self.updateResultsText(resultsText + results.description)
+      } catch let error {
+        self.updateResultsText(
+          "Failed to invoke the interpreter with error: \(error.localizedDescription)"
+        )
+        return
+      }
+    }
+  }
+
+  private func updateResultsText(_ text: String? = nil) {
+    safeDispatchOnMain { self.resultsTextView.text = text }
+  }
+}
+
+// MARK: - Constants
+
+private enum Constant {
+  static let dispatchQueueLabel = "TensorFlowLiteInterpreterQueue"
+  static let nilInterpreterErrorMessage =
+    "Failed to invoke the interpreter because the interpreter was nil."
+}
+
+/// Models that can be loaded by the TensorFlow Lite `Interpreter`.
+private enum Model: Int, CustomStringConvertible {
+  /// A float model that performs two add operations on one input tensor and returns the result in
+  /// one output tensor.
+  case add = 0
+  /// A quantized model that performs two add operations on one input tensor and returns the result
+  /// in one output tensor.
+  case addQuantized = 1
+  /// A float model that performs three add operations on four input tensors and returns the results
+  /// in 2 output tensors.
+  case multiAdd = 2
+
+  var fileInfo: (name: String, extension: String) {
+    switch self {
+    case .add:
+      return Add.fileInfo
+    case .addQuantized:
+      return AddQuantized.fileInfo
+    case .multiAdd:
+      return MultiAdd.fileInfo
+    }
+  }
+
+  // MARK: - CustomStringConvertible
+
+  var description: String {
+    switch self {
+    case .add:
+      return Add.name
+    case .addQuantized:
+      return AddQuantized.name
+    case .multiAdd:
+      return MultiAdd.name
+    }
+  }
+}
+
+/// Values for the `Add` model.
+private enum Add {
+  static let name = "Add"
+  static let fileInfo = (name: "add", extension: "bin")
+}
+
+/// Values for the `AddQuantized` model.
+private enum AddQuantized {
+  static let name = "AddQuantized"
+  static let fileInfo = (name: "add_quantized", extension: "bin")
+}
+
+/// Values for the `MultiAdd` model.
+private enum MultiAdd {
+  static let name = "MultiAdd"
+  static let fileInfo = (name: "multi_add", extension: "bin")
+}
+
+// MARK: - Fileprivate
+
+/// Safely dispatches the given block on the main queue. If the current thread is `main`, the block
+/// is executed synchronously; otherwise, the block is executed asynchronously on the main thread.
+fileprivate func safeDispatchOnMain(_ block: @escaping () -> Void) {
+  if Thread.isMainThread { block(); return }
+  DispatchQueue.main.async { block() }
+}
diff --git a/tensorflow/lite/experimental/swift/Tests/InterpreterOptionsTests.swift b/tensorflow/lite/experimental/swift/Tests/InterpreterOptionsTests.swift
new file mode 100644
index 0000000000000000000000000000000000000000..54b4f59b28942fe2398aba1a19443857e9617458
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/Tests/InterpreterOptionsTests.swift
@@ -0,0 +1,54 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+@testable import TensorFlowLite
+import XCTest
+
+class InterpreterOptionsTests: XCTestCase {
+
+  func testInterpreterOptions_InitWithDefaultValues() {
+    let options = InterpreterOptions()
+    XCTAssertNil(options.threadCount)
+    XCTAssertFalse(options.isErrorLoggingEnabled)
+  }
+
+  func testInterpreterOptions_InitWithCustomValues() {
+    var options = InterpreterOptions()
+    options.threadCount = 2
+    XCTAssertEqual(options.threadCount, 2)
+    options.isErrorLoggingEnabled = true
+    XCTAssertTrue(options.isErrorLoggingEnabled)
+  }
+
+  func testInterpreterOptions_Equatable() {
+    var options1 = InterpreterOptions()
+    var options2 = InterpreterOptions()
+    XCTAssertEqual(options1, options2)
+
+    options1.threadCount = 2
+    options2.threadCount = 2
+    XCTAssertEqual(options1, options2)
+
+    options2.threadCount = 3
+    XCTAssertNotEqual(options1, options2)
+    options2.threadCount = 2
+
+    options1.isErrorLoggingEnabled = true
+    options2.isErrorLoggingEnabled = true
+    XCTAssertEqual(options1, options2)
+
+    options2.isErrorLoggingEnabled = false
+    XCTAssertNotEqual(options1, options2)
+  }
+}
diff --git a/tensorflow/lite/experimental/swift/Tests/InterpreterTests.swift b/tensorflow/lite/experimental/swift/Tests/InterpreterTests.swift
new file mode 100644
index 0000000000000000000000000000000000000000..e98da5f951e9bc6bfebaf6a1bd76b3c8c8bb9e83
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/Tests/InterpreterTests.swift
@@ -0,0 +1,315 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+@testable import TensorFlowLite
+import XCTest
+
+class InterpreterTests: XCTestCase {
+
+  var interpreter: Interpreter!
+
+  override func setUp() {
+    super.setUp()
+
+    interpreter = try! Interpreter(modelPath: AddModel.path)
+  }
+
+  override func tearDown() {
+    interpreter = nil
+
+    super.tearDown()
+  }
+
+  func testInterpreter_InitWithModelPath() {
+    XCTAssertNoThrow(try Interpreter(modelPath: AddModel.path))
+  }
+
+  func testInterpreter_Init_ThrowsFailedToLoadModel() {
+    XCTAssertThrowsError(try Interpreter(modelPath: "/invalid/path")) { error in
+      self.assertEqualErrors(actual: error, expected: .failedToLoadModel)
+    }
+  }
+
+  func testInterpreter_InitWithModelPathAndOptions() {
+    var options = InterpreterOptions()
+    options.threadCount = 2
+    XCTAssertNoThrow(try Interpreter(modelPath: AddModel.path, options: options))
+  }
+
+  func testInterpreter_InputTensorCount() {
+    XCTAssertEqual(interpreter.inputTensorCount, AddModel.inputTensorCount)
+  }
+
+  func testInterpreter_OutputTensorCount() {
+    XCTAssertEqual(interpreter.outputTensorCount, AddModel.outputTensorCount)
+  }
+
+  func testInterpreter_Invoke() throws {
+    try interpreter.allocateTensors()
+    XCTAssertNoThrow(try interpreter.invoke())
+  }
+
+  func testInterpreter_Invoke_ThrowsAllocateTensorsRequired_ModelNotReady() {
+    XCTAssertThrowsError(try interpreter.invoke()) { error in
+      self.assertEqualErrors(actual: error, expected: .allocateTensorsRequired)
+    }
+  }
+
+  func testInterpreter_InputTensorAtIndex() throws {
+    try setUpAddModelInputTensor()
+    let inputTensor = try interpreter.input(at: AddModel.validIndex)
+    XCTAssertEqual(inputTensor, AddModel.inputTensor)
+  }
+
+  func testInterpreter_InputTensorAtIndex_QuantizedModel() throws {
+    interpreter = try Interpreter(modelPath: AddQuantizedModel.path)
+    try setUpAddQuantizedModelInputTensor()
+    let inputTensor = try interpreter.input(at: AddQuantizedModel.inputOutputIndex)
+    XCTAssertEqual(inputTensor, AddQuantizedModel.inputTensor)
+  }
+
+  func testInterpreter_InputTensorAtIndex_ThrowsInvalidIndex() throws {
+    try interpreter.allocateTensors()
+    XCTAssertThrowsError(try interpreter.input(at: AddModel.invalidIndex)) { error in
+      let maxIndex = AddModel.inputTensorCount - 1
+      self.assertEqualErrors(
+        actual: error,
+        expected: .invalidTensorIndex(index: AddModel.invalidIndex, maxIndex: maxIndex)
+      )
+    }
+  }
+
+  func testInterpreter_InputTensorAtIndex_ThrowsAllocateTensorsRequired() {
+    XCTAssertThrowsError(try interpreter.input(at: AddModel.validIndex)) { error in
+      self.assertEqualErrors(actual: error, expected: .allocateTensorsRequired)
+    }
+  }
+
+  func testInterpreter_OutputTensorAtIndex() throws {
+    try setUpAddModelInputTensor()
+    try interpreter.invoke()
+    let outputTensor = try interpreter.output(at: AddModel.validIndex)
+    XCTAssertEqual(outputTensor, AddModel.outputTensor)
+    let expectedResults = [Float32](unsafeData: outputTensor.data)
+    XCTAssertEqual(expectedResults, AddModel.results)
+  }
+
+  func testInterpreter_OutputTensorAtIndex_QuantizedModel() throws {
+    interpreter = try Interpreter(modelPath: AddQuantizedModel.path)
+    try setUpAddQuantizedModelInputTensor()
+    try interpreter.invoke()
+    let outputTensor = try interpreter.output(at: AddQuantizedModel.inputOutputIndex)
+    XCTAssertEqual(outputTensor, AddQuantizedModel.outputTensor)
+    let expectedResults = [UInt8](outputTensor.data)
+    XCTAssertEqual(expectedResults, AddQuantizedModel.results)
+  }
+
+  func testInterpreter_OutputTensorAtIndex_ThrowsInvalidIndex() throws {
+    try interpreter.allocateTensors()
+    try interpreter.invoke()
+    XCTAssertThrowsError(try interpreter.output(at: AddModel.invalidIndex)) { error in
+      let maxIndex = AddModel.outputTensorCount - 1
+      self.assertEqualErrors(
+        actual: error,
+        expected: .invalidTensorIndex(index: AddModel.invalidIndex, maxIndex: maxIndex)
+      )
+    }
+  }
+
+  func testInterpreter_OutputTensorAtIndex_ThrowsInvokeInterpreterRequired() {
+    XCTAssertThrowsError(try interpreter.output(at: AddModel.validIndex)) { error in
+      self.assertEqualErrors(actual: error, expected: .invokeInterpreterRequired)
+    }
+  }
+
+  func testInterpreter_ResizeInputTensorAtIndexToShape() {
+    XCTAssertNoThrow(try interpreter.resizeInput(at: AddModel.validIndex, to: [2, 2, 3]))
+    XCTAssertNoThrow(try interpreter.allocateTensors())
+  }
+
+  func testInterpreter_ResizeInputTensorAtIndexToShape_ThrowsInvalidIndex() {
+    XCTAssertThrowsError(try interpreter.resizeInput(
+      at: AddModel.invalidIndex,
+      to: [2, 2, 3]
+    )) { error in
+      let maxIndex = AddModel.inputTensorCount - 1
+      self.assertEqualErrors(
+        actual: error,
+        expected: .invalidTensorIndex(index: AddModel.invalidIndex, maxIndex: maxIndex)
+      )
+    }
+  }
+
+  func testInterpreter_CopyDataToInputTensorAtIndex() throws {
+    try interpreter.resizeInput(at: AddModel.validIndex, to: AddModel.shape)
+    try interpreter.allocateTensors()
+    let inputTensor = try interpreter.copy(AddModel.inputData, toInputAt: AddModel.validIndex)
+    XCTAssertEqual(inputTensor.data, AddModel.inputData)
+  }
+
+  func testInterpreter_CopyDataToInputTensorAtIndex_ThrowsInvalidIndex() {
+    XCTAssertThrowsError(try interpreter.copy(
+      AddModel.inputData,
+      toInputAt: AddModel.invalidIndex
+    )) { error in
+      let maxIndex = AddModel.inputTensorCount - 1
+      self.assertEqualErrors(
+        actual: error,
+        expected: .invalidTensorIndex(index: AddModel.invalidIndex, maxIndex: maxIndex)
+      )
+    }
+  }
+
+  func testInterpreter_CopyDataToInputTensorAtIndex_ThrowsInvalidDataCount() throws {
+    try interpreter.resizeInput(at: AddModel.validIndex, to: AddModel.shape)
+    try interpreter.allocateTensors()
+    let invalidData = Data(count: AddModel.dataCount - 1)
+    XCTAssertThrowsError(try interpreter.copy(
+      invalidData,
+      toInputAt: AddModel.validIndex
+    )) { error in
+      self.assertEqualErrors(
+        actual: error,
+        expected: .invalidTensorDataCount(provided: invalidData.count, required: AddModel.dataCount)
+      )
+    }
+  }
+
+  func testInterpreter_AllocateTensors() {
+    XCTAssertNoThrow(try interpreter.allocateTensors())
+  }
+
+  // MARK: - Private
+
+  private func setUpAddModelInputTensor() throws {
+    precondition(interpreter != nil)
+    try interpreter.resizeInput(at: AddModel.validIndex, to: AddModel.shape)
+    try interpreter.allocateTensors()
+    try interpreter.copy(AddModel.inputData, toInputAt: AddModel.validIndex)
+  }
+
+  private func setUpAddQuantizedModelInputTensor() throws {
+    precondition(interpreter != nil)
+    try interpreter.resizeInput(at: AddQuantizedModel.inputOutputIndex, to: AddQuantizedModel.shape)
+    try interpreter.allocateTensors()
+    try interpreter.copy(AddQuantizedModel.inputData, toInputAt: AddQuantizedModel.inputOutputIndex)
+  }
+
+  private func assertEqualErrors(actual: Error, expected: InterpreterError) {
+    guard let actual = actual as? InterpreterError else {
+      XCTFail("Actual error should be of type InterpreterError.")
+      return
+    }
+    XCTAssertEqual(actual, expected)
+  }
+}
+
+// MARK: - Constants
+
+/// Values for the `add.bin` model.
+private enum AddModel {
+  static let info = (name: "add", extension: "bin")
+  static let inputTensorCount = 1
+  static let outputTensorCount = 1
+  static let invalidIndex = 1
+  static let validIndex = 0
+  static let shape: TensorShape = [2]
+  static let dataCount = inputData.count
+  static let inputData = Data(copyingBufferOf: [Float32(1.0), Float32(3.0)])
+  static let outputData = Data(copyingBufferOf: [Float32(3.0), Float32(9.0)])
+  static let results = [Float32(3.0), Float32(9.0)]
+
+  static let inputTensor = Tensor(
+    name: "input",
+    dataType: .float32,
+    shape: shape,
+    data: inputData
+  )
+  static let outputTensor = Tensor(
+    name: "output",
+    dataType: .float32,
+    shape: shape,
+    data: outputData
+  )
+
+  static var path: String = {
+    let bundle = Bundle(for: InterpreterTests.self)
+    guard let path = bundle.path(forResource: info.name, ofType: info.extension) else { return "" }
+    return path
+  }()
+}
+
+/// Values for the `add_quantized.bin` model.
+private enum AddQuantizedModel {
+  static let info = (name: "add_quantized", extension: "bin")
+  static let inputOutputIndex = 0
+  static let shape: TensorShape = [2]
+  static let inputData = Data([1, 3])
+  static let outputData = Data([3, 9])
+  static let quantizationParameters = QuantizationParameters(scale: 0.003922, zeroPoint: 0)
+  static let results: [UInt8] = [3, 9]
+
+  static let inputTensor = Tensor(
+    name: "input",
+    dataType: .uInt8,
+    shape: shape,
+    data: inputData,
+    quantizationParameters: quantizationParameters
+  )
+  static let outputTensor = Tensor(
+    name: "output",
+    dataType: .uInt8,
+    shape: shape,
+    data: outputData,
+    quantizationParameters: quantizationParameters
+  )
+
+  static var path: String = {
+    let bundle = Bundle(for: InterpreterTests.self)
+    guard let path = bundle.path(forResource: info.name, ofType: info.extension) else { return "" }
+    return path
+  }()
+}
+
+// MARK: - Extensions
+
+extension Array {
+  /// Creates a new array from the bytes of the given unsafe data.
+  ///
+  /// - Note: Returns `nil` if `unsafeData.count` is not a multiple of
+  ///     `MemoryLayout<Element>.stride`.
+  /// - Parameter unsafeData: The data containing the bytes to turn into an array.
+  init?(unsafeData: Data) {
+    guard unsafeData.count % MemoryLayout<Element>.stride == 0 else { return nil }
+    let elements = unsafeData.withUnsafeBytes {
+      UnsafeBufferPointer<Element>(
+        start: $0,
+        count: unsafeData.count / MemoryLayout<Element>.stride
+      )
+    }
+    self.init(elements)
+  }
+}
+
+extension Data {
+  /// Creates a new buffer by copying the buffer pointer of the given array.
+  ///
+  /// - Warning: The given array's element type `T` must be trivial in that it can be copied bit
+  ///     for bit with no indirection or reference-counting operations; otherwise, reinterpreting
+  ///     data from the resulting buffer has undefined behavior.
+  /// - Parameter array: An array with elements of type `T`.
+  init<T>(copyingBufferOf array: [T]) {
+    self = array.withUnsafeBufferPointer(Data.init)
+  }
+}
diff --git a/tensorflow/lite/experimental/swift/Tests/ModelTests.swift b/tensorflow/lite/experimental/swift/Tests/ModelTests.swift
new file mode 100644
index 0000000000000000000000000000000000000000..025db1890607641d49304ae22da1fc33fed084ef
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/Tests/ModelTests.swift
@@ -0,0 +1,59 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+@testable import TensorFlowLite
+import XCTest
+
+class ModelTests: XCTestCase {
+
+  var modelPath: String!
+
+  override func setUp() {
+    super.setUp()
+
+    let bundle = Bundle(for: type(of: self))
+    guard let modelPath = bundle.path(
+            forResource: Constant.modelInfo.name,
+            ofType: Constant.modelInfo.extension)
+    else {
+      XCTFail("Failed to get the model file path.")
+      return
+    }
+    self.modelPath = modelPath
+  }
+
+  override func tearDown() {
+    modelPath = nil
+
+    super.tearDown()
+  }
+
+  func testModel_InitWithFilePath() {
+    XCTAssertNotNil(Model(filePath: modelPath))
+  }
+
+  func testModel_InitWithEmptyFilePath_FailsInitialization() {
+    XCTAssertNil(Model(filePath: ""))
+  }
+
+  func testModel_InitWithInvalidFilePath_FailsInitialization() {
+    XCTAssertNil(Model(filePath: "invalid/path"))
+  }
+}
+
+// MARK: - Constants
+
+private enum Constant {
+  static let modelInfo = (name: "add", extension: "bin")
+}
diff --git a/tensorflow/lite/experimental/swift/Tests/QuantizationParametersTests.swift b/tensorflow/lite/experimental/swift/Tests/QuantizationParametersTests.swift
new file mode 100644
index 0000000000000000000000000000000000000000..65648c26982daa0cab2a40d111d72e10563373cf
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/Tests/QuantizationParametersTests.swift
@@ -0,0 +1,43 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+@testable import TensorFlowLite
+import XCTest
+
+class QuantizationParametersTests: XCTestCase {
+
+  func testQuantizationParameters_InitWithCustomValues() {
+    let parameters = QuantizationParameters(scale: 0.5, zeroPoint: 1)
+    XCTAssertEqual(parameters.scale, 0.5)
+    XCTAssertEqual(parameters.zeroPoint, 1)
+  }
+
+  func testQuantizationParameters_Equatable() {
+    let parameters1 = QuantizationParameters(scale: 0.5, zeroPoint: 1)
+    let parameters2 = QuantizationParameters(scale: 0.5, zeroPoint: 1)
+    XCTAssertEqual(parameters1, parameters2)
+
+    let parameters3 = QuantizationParameters(scale: 0.4, zeroPoint: 1)
+    XCTAssertNotEqual(parameters1, parameters3)
+    XCTAssertNotEqual(parameters2, parameters3)
+  }
+}
+
+// MARK: - Extensions
+
+extension QuantizationParameters: Equatable {
+  public static func == (lhs: QuantizationParameters, rhs: QuantizationParameters) -> Bool {
+    return lhs.scale == rhs.scale && lhs.zeroPoint == rhs.zeroPoint
+  }
+}
diff --git a/tensorflow/lite/experimental/swift/Tests/TensorTests.swift b/tensorflow/lite/experimental/swift/Tests/TensorTests.swift
new file mode 100644
index 0000000000000000000000000000000000000000..4540043a1636f43834ec496ffef1e78444ba312b
--- /dev/null
+++ b/tensorflow/lite/experimental/swift/Tests/TensorTests.swift
@@ -0,0 +1,83 @@
+// Copyright 2018 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+@testable import TensorFlowLite
+import XCTest
+
+class TensorTests: XCTestCase {
+
+  // MARK: - Tensor
+
+  func testTensor_Init() {
+    let name = "InputTensor"
+    let dataType: TensorDataType = .uInt8
+    let shape = TensorShape(Constant.dimensions)
+    guard let data = name.data(using: .utf8) else { XCTFail("Data should not be nil."); return }
+    let quantizationParameters = QuantizationParameters(scale: 0.5, zeroPoint: 1)
+    let inputTensor = Tensor(
+      name: name,
+      dataType: dataType,
+      shape: shape,
+      data: data,
+      quantizationParameters: quantizationParameters
+    )
+    XCTAssertEqual(inputTensor.name, name)
+    XCTAssertEqual(inputTensor.dataType, dataType)
+    XCTAssertEqual(inputTensor.shape, shape)
+    XCTAssertEqual(inputTensor.data, data)
+    XCTAssertEqual(inputTensor.quantizationParameters, quantizationParameters)
+  }
+
+  // MARK: - TensorShape
+
+  func testTensorShape_InitWithArray() {
+    let shape = TensorShape(Constant.dimensions)
+    XCTAssertEqual(shape.rank, Constant.dimensions.count)
+    XCTAssertEqual(shape.dimensions, Constant.dimensions)
+  }
+
+  func testTensorShape_InitWithElements() {
+    let shape = TensorShape(2, 2, 3)
+    XCTAssertEqual(shape.rank, Constant.dimensions.count)
+    XCTAssertEqual(shape.dimensions, Constant.dimensions)
+  }
+
+  func testTensorShape_InitWithArrayLiteral() {
+    let shape: TensorShape = [2, 2, 3]
+    XCTAssertEqual(shape.rank, Constant.dimensions.count)
+    XCTAssertEqual(shape.dimensions, Constant.dimensions)
+  }
+}
+
+// MARK: - Constants
+
+private enum Constant {
+  /// Array of 2 arrays of 2 arrays of 3 numbers: [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]].
+  static let dimensions = [2, 2, 3]
+}
+
+// MARK: - Extensions
+
+extension TensorShape: Equatable {
+  public static func == (lhs: TensorShape, rhs: TensorShape) -> Bool {
+    return lhs.rank == rhs.rank && lhs.dimensions == rhs.dimensions
+  }
+}
+
+extension Tensor: Equatable {
+  public static func == (lhs: Tensor, rhs: Tensor) -> Bool {
+    return lhs.name == rhs.name && lhs.dataType == rhs.dataType && lhs.shape == rhs.shape &&
+           lhs.data == rhs.data && lhs.quantizationParameters == rhs.quantizationParameters
+  }
+}
diff --git a/tensorflow/lite/experimental/writer/BUILD b/tensorflow/lite/experimental/writer/BUILD
index 57ce63636714aa616cb50e04fe2c15210cc2eb1c..6aa81ff173408d3378285e8b12a7acf2d347a0a5 100644
--- a/tensorflow/lite/experimental/writer/BUILD
+++ b/tensorflow/lite/experimental/writer/BUILD
@@ -33,7 +33,6 @@ cc_library(
         "//tensorflow/lite:builtin_op_data",
         "//tensorflow/lite:framework",
         "//tensorflow/lite:schema_fbs_version",
-        "//tensorflow/lite/kernels:builtin_ops",
         "//tensorflow/lite/schema:schema_fbs_with_reflection",
     ],
 )
diff --git a/tensorflow/lite/experimental/writer/enum_mapping.h b/tensorflow/lite/experimental/writer/enum_mapping.h
index cb6ec3e0d7e0f1b53cc8b84e10cb1be4b1f023c0..949a255abaf63cbc1cc0b3f718f6aaca4f38fd8d 100644
--- a/tensorflow/lite/experimental/writer/enum_mapping.h
+++ b/tensorflow/lite/experimental/writer/enum_mapping.h
@@ -112,5 +112,29 @@ inline LSHProjectionType LSHProjectionTypeToSchema(
   }
 }
 
+inline MirrorPadMode MirrorPaddingModeToSchema(TfLiteMirrorPaddingMode mode) {
+  switch (mode) {
+    case kTfLiteMirrorPaddingUnknown:
+      return MirrorPadMode_REFLECT;  // TODO(aselle): consider an error
+    case kTfLiteMirrorPaddingReflect:
+      return MirrorPadMode_REFLECT;
+    case kTfLiteMirrorPaddingSymmetric:
+      return MirrorPadMode_SYMMETRIC;
+  }
+}
+
+inline CombinerType CombinerTypeToSchema(TfLiteCombinerType type) {
+  switch (type) {
+    case kTfLiteCombinerTypeSum:
+      return CombinerType_SUM;
+    case kTfLiteCombinerTypeMean:
+      return CombinerType_MEAN;
+    case kTfLiteCombinerTypeSqrtn:
+      return CombinerType_SQRTN;
+  }
+}
+
+// int
+
 }  // namespace tflite
 #endif  // TENSORFLOW_LITE_EXPERIMENTAL_WRITER_ENUM_MAPPING_H_
diff --git a/tensorflow/lite/experimental/writer/option_writer_generator.cc b/tensorflow/lite/experimental/writer/option_writer_generator.cc
index fa360a2f47e3dba34e05d2e32616821294f0e678..b425a75325300c56e609b4fb925319b6ed14a9d9 100644
--- a/tensorflow/lite/experimental/writer/option_writer_generator.cc
+++ b/tensorflow/lite/experimental/writer/option_writer_generator.cc
@@ -22,53 +22,59 @@ limitations under the License.
 namespace tflite {
 namespace {
 // This is generated by grepping
-//  cat  third_party/tensorflow/lite/builtin_op_data.h
-//| grep "^} TfLite" | sed 's/^} TfLite\(.*\)Params;/\1Params/g' | grep -v "^}"
-static const char* param_structs[] = {"TfLiteConvParams",
-                                      "TfLitePoolParams",
-                                      "TfLiteDepthwiseConvParams",
-                                      "TfLiteSVDFParams",
-                                      "TfLiteRNNParams",
-                                      "TfLiteSequenceRNNParams",
-                                      "TfLiteFullyConnectedParams",
-                                      "TfLiteLSHProjectionParams",
-                                      "TfLiteSoftmaxParams",
-                                      "TfLiteConcatenationParams",
-                                      "TfLiteAddParams",
-                                      "TfLiteSpaceToBatchNDParams",
+//  cat  third_party/tensorflow/lite/c/builtin_op_data.h | grep "^} TfLite" |
+//  sed 's/^} \(TfLite.*\)Params;/\1Params/g' | grep -v "^}" | sed
+//  's/\(.*\)/"\1",/g' | sort
+static const char* param_structs[] = {"TfLiteAddParams",
+                                      "TfLiteArgMaxParams",
+                                      "TfLiteArgMinParams",
                                       "TfLiteBatchToSpaceNDParams",
-                                      "TfLiteMulParams",
-                                      "TfLiteSubParams",
+                                      "TfLiteBidirectionalSequenceLSTMParams",
+                                      "TfLiteBidirectionalSequenceRNNParams",
+                                      "TfLiteCastParams",
+                                      "TfLiteConcatenationParams",
+                                      "TfLiteConvParams",
+                                      "TfLiteDepthwiseConvParams",
                                       "TfLiteDivParams",
+                                      "TfLiteEmbeddingLookupSparseParams",
+                                      "TfLiteFakeQuantParams",
+                                      "TfLiteFullyConnectedParams",
+                                      "TfLiteGatherParams",
                                       "TfLiteL2NormParams",
+                                      "TfLiteLeakyReluParams",
                                       "TfLiteLocalResponseNormParams",
+                                      "TfLiteLSHProjectionParams",
                                       "TfLiteLSTMParams",
-                                      "TfLiteResizeBilinearParams",
-                                      "TfLiteResizeNearestNeighborParams",
+                                      "TfLiteMirrorPaddingParams",
+                                      "TfLiteMulParams",
+                                      "TfLiteOneHotParams",
+                                      "TfLitePackParams",
                                       "TfLitePadParams",
                                       "TfLitePadV2Params",
+                                      "TfLitePoolParams",
+                                      "TfLiteReducerParams",
                                       "TfLiteReshapeParams",
+                                      "TfLiteResizeBilinearParams",
+                                      "TfLiteResizeNearestNeighborParams",
+                                      "TfLiteRNNParams",
+                                      "TfLiteSequenceRNNParams",
+                                      "TfLiteShapeParams",
                                       "TfLiteSkipGramParams",
+                                      "TfLiteSoftmaxParams",
+                                      "TfLiteSpaceToBatchNDParams",
                                       "TfLiteSpaceToDepthParams",
-                                      "TfLiteCastParams",
-                                      "TfLiteEmbeddingLookupSparseParams",
-                                      "TfLiteGatherParams",
-                                      "TfLiteTransposeParams",
-                                      "TfLiteReducerParams",
+                                      "TfLiteSparseToDenseParams",
                                       "TfLiteSplitParams",
                                       "TfLiteSplitVParams",
                                       "TfLiteSqueezeParams",
                                       "TfLiteStridedSliceParams",
-                                      "TfLiteArgMaxParams",
-                                      "TfLiteArgMinParams",
+                                      "TfLiteSubParams",
+                                      "TfLiteSVDFParams",
                                       "TfLiteTransposeConvParams",
-                                      "TfLiteSparseToDenseParams",
-                                      "TfLiteShapeParams",
-                                      "TfLiteFakeQuantParams",
-                                      "TfLitePackParams",
-                                      "TfLiteOneHotParams",
-                                      "TfLiteLeakyReluParams",
-                                      "TfLiteMirrorPaddingParams",
+                                      "TfLiteTransposeParams",
+                                      "TfLiteUnidirectionalSequenceLSTMParams",
+                                      "TfLiteUniqueParams",
+                                      "TfLiteUnpackParams",
                                       nullptr};
 }  // namespace
 
@@ -141,7 +147,6 @@ class OpOptionData {
     op_to_option_["REDUCE_MAX"] = "ReducerOptions";
     op_to_option_["REDUCE_MIN"] = "ReducerOptions";
     op_to_option_["REDUCE_ANY"] = "ReducerOptions";
-    op_to_option_["UNPACK"] = "";
     op_to_option_["SUM"] = "ReducerOptions";
     op_to_option_["REDUCE_MAX"] = "ReducerOptions";
     op_to_option_["REDUCE_PROD"] = "ReducerOptions";
@@ -150,32 +155,31 @@ class OpOptionData {
     op_to_option_["AVERAGE_POOL_2D"] = "Pool2DOptions";
     op_to_option_["MAX_POOL_2D"] = "Pool2DOptions";
     op_to_option_["L2_NORMALIZATION"] = "L2NormOptions";
-    op_to_option_["BIDIRECTIONAL_SEQUENCE_LSTM"] = "LSTMOptions";
-    op_to_option_["UNIDIRECTIONAL_SEQUENCE_LSTM"] = "LSTMOptions";
-    op_to_option_["BIDIRECTIONAL_SEQUENCE_RNN"] = "SequenceRNNOptions";
-    op_to_option_["UNIDIRECTIONAL_SEQUENCE_RNN"] = "SequenceRNNOptions";
     op_to_option_["UNIDIRECTIONAL_SEQUENCE_RNN"] = "SequenceRNNOptions";
-    op_to_option_["MIRROR_PAD"] = "";  // TODO(karimnosseir): MirrorPadOptions.
-    // Manually specified mappings between ops and options (none)
-    op_to_option_["EMBEDDING_LOOKUP"] =
-        "";  // TODO(aselle): maybe something else.
+    op_to_option_["MAXIMUM"] = "MaximumMinimumOptions";
+    op_to_option_["MINIMUM"] = "MaximumMinimumOptions";
+    op_to_option_["CUSTOM"] = "";    // TODO(aselle): maybe something else.
+    op_to_option_["DELEGATE"] = "";  // TODO(aselle): maybe something else.
+
+    // Manually specified mappings between ops to "none" options -- these are
+    // ops without a corresponding Options message in schema as yet. If these
+    // options do get assigned an Options message in future, they need to be
+    // updated here as well.
+    op_to_option_["EMBEDDING_LOOKUP"] = "";
     op_to_option_["FLOOR"] = "";
-    op_to_option_["HASHTABLE_LOOKUP"] =
-        "";  // TODO(aselle): maybe something else.
+    op_to_option_["CEIL"] = "";
+    op_to_option_["HASHTABLE_LOOKUP"] = "";
     op_to_option_["LOGISTIC"] = "";
     op_to_option_["RELU"] = "";
     op_to_option_["RELU_N1_TO_1"] = "";
     op_to_option_["RELU6"] = "";
     op_to_option_["TANH"] = "";
-    op_to_option_["CUSTOM"] = "";    // TODO(aselle): maybe something else.
-    op_to_option_["DELEGATE"] = "";  // TODO(aselle): maybe something else.
     op_to_option_["PRELU"] = "";
-    op_to_option_["MAXIMUM"] = "";  // TODO(aselle): MaximumMinimumOptions
-    op_to_option_["MINIMUM"] = "";  // TODO(aselle): MaximumMinimumOptions
     op_to_option_["SIN"] = "";
     op_to_option_["LOG"] = "";
     op_to_option_["SQRT"] = "";
     op_to_option_["RSQRT"] = "";
+    op_to_option_["ELU"] = "";
 
     // TODO(aselle): These are undesirable hacks. Consider changing C structs
     option_to_struct_["Pool2DOptions"] = "TfLitePoolParams";
@@ -183,6 +187,7 @@ class OpOptionData {
     option_to_struct_["DepthwiseConv2DOptions"] = "TfLiteDepthwiseConvParams";
     option_to_struct_["LocalResponseNormalizationOptions"] =
         "TfLiteLocalResponseNormParams";
+    option_to_struct_["MirrorPadOptions"] = "TfLiteMirrorPaddingParams";
     // Now for every op, try to find an option.
     bool fatal = false;
     for (auto op_name : ops_) {
@@ -222,13 +227,15 @@ class OpOptionData {
           if (!param_struct_found) {
             std::cerr << "Failed to get param struct for option " << option_name
                       << std::endl;
-            fatal = true;
           } else {
             option_to_struct_.insert(std::make_pair(option_name, params_guess));
           }
         }
       }
     }
+    if (fatal) {
+      exit(1);
+    }
   }
 
  private:
@@ -239,16 +246,28 @@ class OpOptionData {
       option_to_type_function_;
 };
 
+void GenerateImportForResizeBilinearOp(FILE* fp) {
+  fprintf(fp,
+          "  case BuiltinOperator_RESIZE_BILINEAR:  {\n"
+          "    const auto* params = reinterpret_cast<const "
+          "TfLiteResizeBilinearParams*>(builtin_op_data);\n"
+          "    auto union_type = CreateResizeBilinearOptions(*fbb, "
+          "params->align_corners).Union();\n"
+          "    return std::make_pair(BuiltinOptions_ResizeBilinearOptions, "
+          "union_type);\n"
+          "  }\n  break;\n");
+}
+
 void GenerateImportForOp(FILE* fp, const std::string& op_name,
                          const std::string& option_name,
                          const std::string& option_type,
                          const flatbuffers::TypeTable* options,
                          const std::string& struct_name) {
-  // Skip tricky ones for now
-  if (struct_name == "TfLiteResizeBilinearParams") return;
-  if (struct_name == "TfLiteSqueezeParams") return;
-  if (struct_name == "TfLiteEmbeddingLookupSparseParams") return;
-  if (struct_name == "TfLiteReshapeParams") return;
+  // Special-case ResizeBilinear which has some deprecated fields.
+  if (struct_name == "TfLiteResizeBilinearParams") {
+    GenerateImportForResizeBilinearOp(fp);
+    return;
+  }
 
   fprintf(fp, "  case BuiltinOperator_%s:  {\n", op_name.c_str());
   fprintf(fp,
@@ -258,6 +277,9 @@ void GenerateImportForOp(FILE* fp, const std::string& op_name,
 
   for (size_t i = 0; i < options->num_elems; i++) {
     std::string elem_name = options->names[i];
+    bool is_int_vector = false;
+    std::string vector_name = elem_name;
+    std::string vector_size;
     // TODO(aselle): Irregular naming in builtins
     if (elem_name == "fused_activation_function")
       elem_name = "activation";
@@ -269,8 +291,26 @@ void GenerateImportForOp(FILE* fp, const std::string& op_name,
       elem_name = "dilation_height_factor";
     else if (elem_name == "dilation_w_factor")
       elem_name = "dilation_width_factor";
-    else if (elem_name == "new_shape")
-      elem_name = "shape";
+    else if (elem_name == "idx_out_type")
+      elem_name = "index_out_type";
+
+    // Vector fields treated specially.
+    if (elem_name == "new_shape") {
+      is_int_vector = true;
+      vector_name = "shape";
+      vector_size = "num_dimensions";
+    } else if (elem_name == "squeeze_dims") {
+      is_int_vector = true;
+      vector_size = "num_squeeze_dims";
+    }
+
+    if (is_int_vector) {
+      fprintf(fp,
+              "    auto val%zu = fbb->CreateVector("
+              "std::vector<int>(params->%s, params->%s + params->%s));\n",
+              i, vector_name.c_str(), vector_name.c_str(), vector_size.c_str());
+      continue;
+    }
 
     flatbuffers::TypeCode code = options->type_codes[i];
     auto contained_type = code.sequence_ref != -1
@@ -289,6 +329,10 @@ void GenerateImportForOp(FILE* fp, const std::string& op_name,
       mapper = "LSTMKernelTypeToSchema";
     } else if (contained_type == LSHProjectionTypeTypeTable) {
       mapper = "LSHProjectionTypeToSchema";
+    } else if (contained_type == MirrorPadModeTypeTable) {
+      mapper = "MirrorPaddingModeToSchema";
+    } else if (contained_type == CombinerTypeTypeTable) {
+      mapper = "CombinerTypeToSchema";
     }
 
     fprintf(fp,
diff --git a/tensorflow/lite/experimental/writer/writer_lib.cc b/tensorflow/lite/experimental/writer/writer_lib.cc
index a0ce4b716d62c5a24342f5a3863e58eb203f7441..2bdc41bae84341949631f77a1be8631b007f2985 100644
--- a/tensorflow/lite/experimental/writer/writer_lib.cc
+++ b/tensorflow/lite/experimental/writer/writer_lib.cc
@@ -219,6 +219,11 @@ std::vector<int> InterpreterWriter::RemapTensorIndicesToWritten(
   std::vector<int> output;
   output.reserve(input.size());
   for (int x : input) {
+    // Special value representing an optional tensor which is not present.
+    if (x == -1) {
+      output.push_back(x);
+      continue;
+    }
     if (tensor_to_written_tensor_[x] != -1) {
       output.push_back(tensor_to_written_tensor_[x]);
     }
diff --git a/tensorflow/lite/g3doc/_book.yaml b/tensorflow/lite/g3doc/_book.yaml
index 36bf4f4618c42f4e56ce79b73c50c0454644a26d..7eaf64c9a4a5d2e102d17ef4dcaf1ccb63ba7057 100644
--- a/tensorflow/lite/g3doc/_book.yaml
+++ b/tensorflow/lite/g3doc/_book.yaml
@@ -30,8 +30,6 @@ upper_tabs:
         path: /lite/ops_versioning
       - title: TensorFlow Lite compatibility guide
         path: /lite/tf_ops_compatibility
-      - title: List of hosted models
-        path: /lite/models
       - title: TensorFlow Lite for iOS
         path: /lite/ios
       - title: TensorFlow Lite for Raspberry Pi
@@ -59,6 +57,10 @@ upper_tabs:
       - title: Post-training quantization example
         path: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/tutorials/post_training_quant.ipynb
         status: external
+      - title: GPU delegate
+        path: /lite/performance/gpu
+      - title: Advanced GPU
+        path: /lite/performance/gpu_advanced
 
       - title: TF Mobile
         style: accordion
@@ -77,6 +79,41 @@ upper_tabs:
         - title: Optimizing for mobile
           path: /lite/tfmobile/optimizing
 
+    # - name: Models
+    #   contents:
+    #   - title: Overview
+    #     path: /lite/models/
+    #   - title: Hosted models
+    #     path: /lite/models/hosted
+    #   - title: Image classification
+    #     section:
+    #     - title: Overview
+    #       path: /lite/models/image_classification/overview
+    #     - title: Android
+    #       path: /lite/models/image_classification/android
+    #     - title: iOS
+    #       path: /lite/models/image_classification/ios
+    #   - title: Object detection
+    #     section:
+    #     - title: Overview
+    #       path: /lite/models/object_detection/overview
+    #   - title: Speech recognition
+    #     section:
+    #     - title: Overview
+    #       path: /lite/models/speech_recognition/overview
+    #   - title: Pose estimation
+    #     section:
+    #     - title: Overview
+    #       path: /lite/models/pose_estimation/overview
+    #   - title: Segmentation
+    #     section:
+    #     - title: Overview
+    #       path: /lite/models/segmentation/overview
+    #   - title: Smart reply
+    #     section:
+    #     - title: Overview
+    #       path: /lite/models/smart_reply/overview
+
     - name: API
       skip_translation: true
       contents:
diff --git a/tensorflow/lite/g3doc/_index.yaml b/tensorflow/lite/g3doc/_index.yaml
index 1b3f1d616ae953e3c6a659301d7a7dd6860dcbf2..7153b7c6f670375df8183a9269bb7eaf096ac0c2 100644
--- a/tensorflow/lite/g3doc/_index.yaml
+++ b/tensorflow/lite/g3doc/_index.yaml
@@ -4,7 +4,7 @@ description: <!--no description-->
 landing_page:
   custom_css_path: /site-assets/css/style.css
   rows:
-  - heading: TensorFlow Lite is for mobile and embedded devices.
+  - heading: TensorFlow Lite is for mobile and embedded devices
     description: >
       <p style="max-width: 75%;">
         TensorFlow Lite is the official solution for running machine learning
@@ -13,9 +13,6 @@ landing_page:
         iOS, and other operating systems.
       </p>
       <style>
-      .tfo-landing-row-heading {
-        padding-top: 0 !important;
-      }
       .tfo-landing-row-heading h2 {
         margin-top: 0 !important;
       }
@@ -71,58 +68,16 @@ landing_page:
           icon_name: lens
           foreground: theme
 
-  - classname: devsite-landing-row-logos tfo-landing-row-heading
-    heading: Companies using TensorFlow Lite
-    items:
-    - custom_image:
-        path: ./images/landing-page/photos_logo.png
-      path: https://www.photos.google.com
-    - custom_image:
-        path: ./images/landing-page/gboard_logo.png
-      path: https://play.google.com/store/apps/details?id=com.google.android.inputmethod.latin&hl=en_US
-    - custom_image:
-        path: ./images/landing-page/gmail_logo.png
-      path: https://www.google.com/gmail/
-    - custom_image:
-        path: ./images/landing-page/assistant_logo.png
-      path: https://assistant.google.com/
-
-  - classname: devsite-landing-row-logos
-    items:
-    - custom_image:
-        path: ./images/landing-page/vsco_logo.png
-      path: https://vsco.co
-    - custom_image:
-        path: ./images/landing-page/shazam_logo.png
-      path: https://www.shazam.com/
-    - custom_image:
-        path: ./images/landing-page/nest_logo.png
-      path: https://nest.com/
-    - custom_image:
-        path: ./images/landing-page/loseit_logo.png
-      path: https://www.loseit.com/
-
-  - classname: devsite-landing-row-no-image-background devsite-landing-row-67
-    background: grey
-    items:
-    - description: >
-        <em>“TensorFlow Lite helped us introduce machine learning and AI into our
-        app in an easy and streamlined way. We could reduce the size of our
-        models while keeping the accuracy high. This helped us create an amazing
-        fishing experience for our users by allowing them to identify any fish
-        species with just a photo.”</em>
-      image_path: ./images/landing-page/fishbrain_logo_big.png
-
   - heading: How it works
     items:
-    - heading: Build
+    - heading: Pick a model
       icon:
         icon_name: build
       description: >
-        Build a new model or retrain an existing one, such as using transfer learning.
+        Pick a new model or retrain an existing one.
       buttons:
-      - label: Read the developer guide
-        path: /lite/devguide
+      - label: Pick
+        path: /lite/devguide#1_choose_a_model
         classname: button button-primary tfo-button-primary
     - heading: Convert
       icon:
@@ -131,18 +86,29 @@ landing_page:
         Convert a TensorFlow model into a compressed flat buffer with the
         TensorFlow Lite Converter.
       buttons:
-      - label: Read the converter guide
-        path: /lite/convert/
+      - label: Convert
+        path: /lite/devguide#2_convert_the_model_format
         classname: button button-primary tfo-button-primary
     - heading: Deploy
+      icon:
+        icon_name: settings_cell
+      description: >
+        Take the compressed <code>.tflite</code> file and load it into a mobile or embedded device.
+      buttons:
+      - label: Deploy
+        path: /lite/devguide#3_use_the_tensorflow_lite_model_for_inference_in_a_mobile_app
+        classname: button button-primary tfo-button-primary
+    - heading: Optimize
       icon:
         icon_name: bolt
       description: >
-        Take the compressed <code>.tflite</code> file and load it into a mobile
-        or embedded device.<br/>
-        See the <a href="#build-your-first-tensorflow-lite-app">tutorials below</a> to build an app.
+        [optional] Quantize by converting 32-bit floats to more efficient 8-bit integers or run on GPU.
+      buttons:
+      - label: Optimize
+        path: /lite/devguide#4_optimize_your_model_optional
+        classname: button button-primary tfo-button-primary
 
-  - heading: Build your first TensorFlow Lite app
+  - heading: Build your first TensorFlow Lite app with Codelabs
     background: grey
     items:
     - classname: tfo-landing-row-item-inset-white
@@ -160,28 +126,40 @@ landing_page:
         We love to hear what you're working on—it may even get highlighted on
         our social media! <a href="https://groups.google.com/a/tensorflow.org/forum/#!forum/discuss" class="external">Tell us</a>.
 
-  - classname: devsite-landing-row-no-image-background devsite-landing-row-67
+  - classname: devsite-landing-row-logos tfo-landing-row-heading
+    heading: TensorFlow Lite users
     items:
-    - description: >
-        <p>
-          <em>“The release of TensorFlow Lite has allowed us to deploy an engaging
-          real-time experience to our users that eliminates the requirement
-          for a data connection. TensorFlow Lite’s ability to compress and
-          optimize the TensorFlow graph for mobile deployment has been
-          transformative in expanding the capabilities of Snap It.</em>
-        </p>
-        <p>
-          <em>Through TensorFlow Lite, our users can now enjoy a state of the
-          art, computer-vision-based food logging experience without worrying
-          about signal strength. We look forward to future collaborations
-          with the TensorFlow Lite team.”</em>
-        </p>
-      image_path: ./images/landing-page/loseit_logo_big.png
+    - custom_image:
+        path: ./images/landing-page/photos_logo.png
+    - custom_image:
+        path: ./images/landing-page/gboard_logo.png
+    - custom_image:
+        path: ./images/landing-page/gmail_logo.png
+    - custom_image:
+        path: ./images/landing-page/assistant_logo.png
+
+  - classname: devsite-landing-row-logos
+    items:
+    - custom_image:
+        path: ./images/landing-page/vsco_logo.png
+    - custom_image:
+        path: ./images/landing-page/shazam_logo.png
+    - custom_image:
+        path: ./images/landing-page/nest_logo.png
+    - custom_image:
+        path: ./images/landing-page/loseit_logo.png
+
 
   - classname: devsite-landing-row-cards
     background: grey
     heading: Updates
     items:
+    - heading: "TensorFlow Lite Now Faster with Mobile GPUs (Developer Preview)"
+      image_path: ./images/landing-page/facial_contour_detection.png
+      path: https://medium.com/tensorflow/tensorflow-lite-now-faster-with-mobile-gpus-developer-preview-e15797e6dee7
+      buttons:
+      - label: Read more
+        path: https://medium.com/tensorflow/tensorflow-lite-now-faster-with-mobile-gpus-developer-preview-e15797e6dee7
     - heading: "AI in motion: react in the real world"
       image_path: ./images/landing-page/ai_in_motion.png
       path: https://cloud.google.com/blog/products/ai-machine-learning/ai-motion-designing-simple-system-see-understand-and-react-real-world-part-ii
diff --git a/tensorflow/lite/g3doc/apis.md b/tensorflow/lite/g3doc/apis.md
index b15159ce4145727863c335126557e06402f8dbd3..28af7e25f851ad8881bca5b193cfdecc9dd7bcf3 100644
--- a/tensorflow/lite/g3doc/apis.md
+++ b/tensorflow/lite/g3doc/apis.md
@@ -1,4 +1,3 @@
-
 # TensorFlow Lite APIs
 
 TensorFlow Lite provides programming APIs in C++ and Java, and in both cases
@@ -8,8 +7,7 @@ no surprise that the APIs try to avoid unnecessary copies at the expense of
 convenience.  Similarly, consistency with TensorFlow APIs was not an explicit
 goal and some variance is to be expected.
 
-There is also a Python API for TensorFlow Lite described
-[here](../toco/g3doc/python_api.md#interpreter).
+There is also a [Python API for TensorFlow Lite](./convert/python_api.md).
 
 ## C++
 
diff --git a/tensorflow/lite/g3doc/convert/cmdline_examples.md b/tensorflow/lite/g3doc/convert/cmdline_examples.md
index de81e2cfdd41d6232ee1b76985a2e7dc9167e88f..169f2d91d8a72278ff61f170f0b450885e4c2c93 100644
--- a/tensorflow/lite/g3doc/convert/cmdline_examples.md
+++ b/tensorflow/lite/g3doc/convert/cmdline_examples.md
@@ -95,11 +95,11 @@ tflite_convert \
 
 The TensorFlow Lite Converter is compatible with fixed point quantization models
 described
-[here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/q
-uantize/README.md). These are float models with `FakeQuant*` ops inserted at the
-boundaries of fused layers to record min-max range information. This generates a
-quantized inference workload that reproduces the quantization behavior that was
-used during training.
+[here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/quantize/README.md).
+These are float models with `FakeQuant*` ops inserted at the boundaries of fused
+layers to record min-max range information. This generates a quantized inference
+workload that reproduces the quantization behavior that was used during
+training.
 
 The following command generates a quantized TensorFlow Lite FlatBuffer from a
 "quantized" TensorFlow GraphDef.
diff --git a/tensorflow/lite/g3doc/custom_operators.md b/tensorflow/lite/g3doc/custom_operators.md
index 4a22d6a67577cf5c06f2c0d32e30650fd4d4bb32..2d80668f37d645054596e1150f1eee6249122f75 100644
--- a/tensorflow/lite/g3doc/custom_operators.md
+++ b/tensorflow/lite/g3doc/custom_operators.md
@@ -137,9 +137,9 @@ operations instead of a single operator.
 
 ## Special TF Graph Attributes
 
-When Toco convertes a TF graph into TFLite format, it makes some assumption
-about custom operations that might be not correct. In this case, the generated
-graph can be not executable.
+When `tflite_convert` converts a TensorFlow graph into TFLite format, it makes
+some assumption about custom operations that might be not correct. In this case,
+the generated graph may not execute.
 
 It is possible to add aditional information about your custom op output to TF
 graph before it is converted. The following attributes are supported:
diff --git a/tensorflow/lite/g3doc/demo_ios.md b/tensorflow/lite/g3doc/demo_ios.md
index fbf1dd6392591183d0dc484018bba501de1851d8..33e74f1a402f2e3778ded1e815db72be5ff28c74 100644
--- a/tensorflow/lite/g3doc/demo_ios.md
+++ b/tensorflow/lite/g3doc/demo_ios.md
@@ -1,69 +1,229 @@
-
 # iOS Demo App
 
-The TensorFlow Lite demo is a camera app that continuously classifies whatever
-it sees from your device's back camera, using a quantized MobileNet model. These
-instructions walk you through building and running the demo on an iOS device.
+This tutorial provides a simple iOS mobile application to classify images using
+the iOS device camera. In this tutorial, you will download the demo application
+from the Tensorflow repository, build it on your computer, and install it on
+your iOS Device. You will also learn how to customize the application to suit
+your requirements.
 
 ## Prerequisites
 
-* You must have [Xcode](https://developer.apple.com/xcode/) installed and have a
-  valid Apple Developer ID, and have an iOS device set up and linked to your
-  developer account with all of the appropriate certificates. For these
-  instructions, we assume that you have already been able to build and deploy an
-  app to an iOS device with your current developer environment.
+*   You must have [Xcode](https://developer.apple.com/xcode/) installed and have
+    a valid Apple Developer ID, and have an iOS device set up and linked to your
+    developer account with all of the appropriate certificates. For these
+    instructions, we assume that you have already been able to build and deploy
+    an app to an iOS device with your current developer environment.
 
-* The demo app requires a camera and must be executed on a real iOS device. You
-  can build it and run with the iPhone Simulator but it won't have any camera
-  information to classify.
+*   The demo app requires a camera and must be executed on a real iOS device.
+    You can build it and run with the iPhone Simulator but it won't have any
+    camera information to classify.
 
-* You don't need to build the entire TensorFlow library to run the demo, but you
-  will need to clone the TensorFlow repository if you haven't already:
+*   You don't need to build the entire TensorFlow library to run the demo, but
+    you will need to clone the TensorFlow repository if you haven't already:
 
         git clone https://github.com/tensorflow/tensorflow
+        cd tensorflow
 
-* You'll also need the Xcode command-line tools:
+*   You'll also need the Xcode command-line tools:
 
         xcode-select --install
 
     If this is a new install, you will need to run the Xcode application once to
     agree to the license before continuing.
 
-## Building the iOS Demo App
-
-1. Install CocoaPods if you don't have it:
+*   Install CocoaPods if you don't have it:
 
         sudo gem install cocoapods
 
-2. Download the model files used by the demo app (this is done from inside the
-   cloned directory):
+### Step 1. Clone the TensorFlow source code
+
+First, we clone the GitHub repository on the computer in a folder to get the
+demo application.
+
+```
+git clone https://github.com/tensorflow/tensorflow
+```
+
+### Step 2. Download required dependencies
+
+Execute the shell script to download the model files used by the demo app (this
+is done from inside the cloned directory):
+
+```
+    tensorflow/lite/examples/ios/download_models.sh
+```
+
+Run the following command to install TensorFlow Lite pod:
 
-        sh tensorflow/lite/examples/ios/download_models.sh
+```
+    cd tensorflow/lite/examples/ios/camera
+    pod install
+```
 
-3. Install the pod to generate the workspace file:
+If you have installed this pod before and that command doesn't work, try
 
-        cd tensorflow/lite/examples/ios/camera
-        pod install
+```
+    pod repo update
+```
 
-    If you have installed this pod before and that command doesn't work, try
+### Step 3. Build the XCode project
 
-        pod update
+Open the `tflite_camera_example.xcworkspace` project file generated in the last
+step:
 
-    At the end of this step you should have a file called 
-    `tflite_camera_example.xcworkspace`.
+```
+    open tflite_camera_example.xcworkspace
+```
 
-4. Open the project in Xcode by typing this on the command line:
+Under `Project navigator -> tflite_camera_example -> Targets ->
+tflite_camera_example -> General` change the bundle identifier by pre-pending
+your name:
 
-        open tflite_camera_example.xcworkspace
+![pre-pend your name to the bundle identifier](images/ios/bundle_identifier.png)
 
-    This launches Xcode if it isn't open already and opens the
-    `tflite_camera_example` project.
+Plug in your iOS device. Note the app must be executed with a real device with
+camera. Select the iOS device from the drop-down menu.
 
-5. Build and run the app in Xcode.
+![Device selection](images/ios/device_selection.png)
 
-    Note that as mentioned earlier, you must already have a device set up and
-    linked to your Apple Developer account in order to deploy the app on a
-    device.
+Click the "Run" button to build and run the app
+
+![Build and execute](images/ios/build_and_execute.png)
+
+Note that as mentioned earlier, you must already have a device set up and linked
+to your Apple Developer account in order to deploy the app on a device.
 
 You'll have to grant permissions for the app to use the device's camera. Point
 the camera at various objects and enjoy seeing how the model classifies things!
+
+## Understanding iOS App Code
+
+### Get camera input
+
+The main logic of this app is in the Objective C++ source file
+`tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm`.
+
+The `setupAVCapture` method constructs a `AVCaptureSession` and set itself as a
+delegate. The `captureOutput:didOutputSampleBuffer:fromConnection:` method is
+called for every captured frame. It calls `runModelOnFrame` to run the model for
+every frame.
+
+### Create an interpreter
+
+To create the interpreter, we need to load the model file. The following code
+will load a model and create an interpreter.
+
+```
+model = tflite::FlatBufferModel::BuildFromFile([graph_path UTF8String]);
+```
+
+Behind the scenes, the model is loaded as a memory-mapped file. It offers faster
+load times and reduce the dirty pages in memory.
+
+Construct a `BuiltinOpResolver` to use the TensorFliw Lite buildin ops. Then,
+create the interpreter object using `InterpreterBuilder` that takes the model
+file as argument as shown below.
+
+```
+tflite::ops::builtin::BuiltinOpResolver resolver;
+tflite::InterpreterBuilder(*model, resolver)(&interpreter);
+```
+
+### Obtain the input buffer
+
+By default, the app uses quantized model since it's smaller and faster. The
+buffer is a raw pointer to an array of 8 bit unsigned integers (`uint8_t`). The
+following code obtains the input buffer from the interpreter:
+
+```
+// Get the index of first input tensor.
+int input_tensor_index = interpreter->inputs()[0];
+// Get the pointer to the input buffer.
+uint8_t* buffer = interpreter->typed_tensor<uint8_t>(input_tensor_index);
+```
+
+Throughout this document, it's assumed a quantized model is used.
+
+### Pre-process of bitmap image
+
+The MobileNet model we're using takes 224x224x3 inputs, where the dimensions are
+width, height, and colors (RGB). The images returned from `AVCaptureSession` is
+bigger, and has 4 color channels (RGBA).
+
+Many image classification models (like MobileNet) take fixe-sized inputs. It's
+required to scale or crop the image before feeding it into the model, and change
+the channels from RGBA to RGB.
+
+The code to pre-process the images is in `ProcessInputWithQuantizedModel`
+function in
+`tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm`. It's a
+simple implementation for nearest neighbor color sampling, and it only copies
+the first 3 bytes for each pixel.
+
+```
+void ProcessInputWithQuantizedModel(
+    uint8_t* input, uint8_t* output, int image_width, int image_height, int image_channels) {
+  for (int y = 0; y < wanted_input_height; ++y) {
+    uint8_t* out_row = output + (y * wanted_input_width * wanted_input_channels);
+    for (int x = 0; x < wanted_input_width; ++x) {
+      const int in_x = (y * image_width) / wanted_input_width;
+      const int in_y = (x * image_height) / wanted_input_height;
+      uint8_t* in_pixel = input + (in_y * image_width * image_channels) + (in_x * image_channels);
+      uint8_t* out_pixel = out_row + (x * wanted_input_channels);
+      for (int c = 0; c < wanted_input_channels; ++c) {
+        out_pixel[c] = in_pixel[c];
+      }
+    }
+  }
+}
+```
+
+Note the code is preprocessing and preparing the model input from the camera
+data. Therefore the first parameter `input` should be the camera buffer. The
+second parameter `output` should be the buffer of model input.
+
+### Run inference and obtain output buffer
+
+After preprocessing and filling the data into the input buffer of the
+interpreter, it's really easy to run the interpreter:
+
+```
+if (interpreter->Invoke() != kTfLiteOk) {
+  NSLog("Failed to invoke!");
+}
+```
+
+The result is stored in the output tensor buffer of the interpreter. The
+following code obtains the pointer to the buffer:
+
+```
+// Get the index of first output tensor.
+const int output_tensor_index = interpreter->outputs()[0];
+// Get the pointer to the output buffer.
+uint8_t* buffer = interpreter->typed_tensor<uint8_t>(output_tensor_index);
+```
+
+### Post-process values
+
+The output buffer contains an array of `uint8_t`, and the value range is 0-255.
+We need to convert the value to float to get the probabilities with value range
+0.0-1.0. The formula of the quantization value mapping is:
+
+    float_value = (quantized_value - zero_point) * scale
+
+The following code converts quantized values back to float values, using the
+quantizaiton parameters in tensors:
+
+```
+uint8_t* quantized_output = interpreter->typed_output_tensor<uint8_t>(0);
+int32_t zero_point = input_tensor->params.zero_point;
+float scale = input_tensor->params.scale;
+float output[output_size];
+for (int i = 0; i < output_size; ++i) {
+  output[i] = (quantized_output[i] - zero_point) * scale;
+}
+```
+
+Finally, we find the best set of classifications by storing them in a priority
+queue based on their confidence scores. See the `GetTopN` function in
+`tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm`.
diff --git a/tensorflow/lite/g3doc/devguide.md b/tensorflow/lite/g3doc/devguide.md
index fdd02638f9b78e05e77cfeb22644bfb37878a580..7e23f1d1b6ef3b5e85736099db96417e2dd72010 100644
--- a/tensorflow/lite/g3doc/devguide.md
+++ b/tensorflow/lite/g3doc/devguide.md
@@ -92,7 +92,7 @@ formats:
 
 TensorFlow models may be saved as a .pb or .pbtxt `tf.GraphDef` file. In order
 to convert the `tf.GraphDef` file to TensorFlow Lite, the model must first be
-frozen. This process invovles several file formats including the `frozen
+frozen. This process involves several file formats including the `frozen
 GraphDef`:
 
 *   `tf.GraphDef` (.pb or .pbtxt) — A protobuf that represents the TensorFlow
@@ -166,7 +166,7 @@ Refer to the [ops compatibility guide](tf_ops_compatibility.md) for
 troubleshooting help, and if that doesn't help, please
 [file an issue](https://github.com/tensorflow/tensorflow/issues).
 
-### Graph vizualization tool
+### Graph Visualization tool
 
 The [development repo](https://github.com/tensorflow/tensorflow) contains a tool
 to visualize TensorFlow Lite models after conversion. To build the
@@ -180,7 +180,6 @@ bazel run tensorflow/lite/tools:visualize -- model.tflite model_viz.html
 This generates an interactive HTML page listing subgraphs, operations, and a
 graph visualization.
 
-
 ## 3. Use the TensorFlow Lite model for inference in a mobile app
 
 After completing the prior steps, you should now have a `.tflite` model file.
@@ -221,3 +220,47 @@ devices. To use the converter, refer to the
 Compile Tensorflow Lite for a Raspberry Pi by following the
 [RPi build instructions](rpi.md) This compiles a static library file (`.a`) used
 to build your app. There are plans for Python bindings and a demo app.
+
+## 4. Optimize your model (optional)
+
+There are two options. If you plan to run on CPU, we recommend that you quantize
+your weights and activation tensors. If the hardware is available, another
+option is to run on GPU for massively parallelizable workloads.
+
+### Quantization
+Compress your model size by lowering the precision of the parameters (i.e.
+neural network weights) from their training-time 32-bit floating-point
+representations into much smaller and efficient 8-bit integer ones.
+
+This will execute the heaviest computations fast in lower precision, but the
+most sensitive ones with higher precision, thus typically resulting in little to
+no final accuracy losses for the task, yet a significant speed-up over pure
+floating-point execution.
+
+The post-training quantization technique is integrated into the TensorFlow Lite
+conversion tool. Getting started is easy: after building your TensorFlow model,
+simply enable the ‘post_training_quantize’ flag in the TensorFlow Lite
+conversion tool. Assuming that the saved model is stored in saved_model_dir, the
+quantized tflite flatbuffer can be generated in command line:
+
+```
+converter=tf.contrib.lite.TocoConverter.from_saved_model(saved_model_dir)
+converter.post_training_quantize=True
+tflite_quantized_model=converter.convert()
+open(“quantized_model.tflite”, “wb”).write(tflite_quantized_model)
+```
+
+Read the full documentation [here](performance/post_training_quantization) and see a tutorial [here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/tutorials/post_training_quant.ipynb).
+
+### GPU
+Run on GPU GPUs are designed to have high throughput for massively
+parallelizable workloads. Thus, they are well-suited for deep neural nets, which
+consist of a huge number of operators, each working on some input tensor(s) that
+can be easily divided into smaller workloads and carried out in parallel,
+typically resulting in lower latency.
+
+Another benefit with GPU inference is its power efficiency. GPUs carry out the
+computations in a very efficient and optimized manner, so that they consume less
+power and generate less heat than when the same task is run on CPUs.
+
+Read the tutorial [here](performance/gpu) and full documentation [here](performance/gpu_advanced).
diff --git a/tensorflow/lite/g3doc/images/convert/sample_after.png b/tensorflow/lite/g3doc/images/convert/sample_after.png
index 6c451f97903f7f70a9f28dee8abf6daeb7ec5693..db09d0a6ca70695205833acfd2bd8ac6682cb065 100644
Binary files a/tensorflow/lite/g3doc/images/convert/sample_after.png and b/tensorflow/lite/g3doc/images/convert/sample_after.png differ
diff --git a/tensorflow/lite/g3doc/images/convert/sample_before.png b/tensorflow/lite/g3doc/images/convert/sample_before.png
index e5317ef295062e79c66430512ef1c45925858ce0..55440d324977f0ff5b795bc80898857918066e96 100644
Binary files a/tensorflow/lite/g3doc/images/convert/sample_before.png and b/tensorflow/lite/g3doc/images/convert/sample_before.png differ
diff --git a/tensorflow/lite/g3doc/images/ios/build_and_execute.png b/tensorflow/lite/g3doc/images/ios/build_and_execute.png
new file mode 100644
index 0000000000000000000000000000000000000000..a305350cb7b413b70d2095cfd8fea6d396c63695
Binary files /dev/null and b/tensorflow/lite/g3doc/images/ios/build_and_execute.png differ
diff --git a/tensorflow/lite/g3doc/images/ios/bundle_identifier.png b/tensorflow/lite/g3doc/images/ios/bundle_identifier.png
new file mode 100644
index 0000000000000000000000000000000000000000..398763916b353e61f236392e2b8898aad2aafe8e
Binary files /dev/null and b/tensorflow/lite/g3doc/images/ios/bundle_identifier.png differ
diff --git a/tensorflow/lite/g3doc/images/ios/device_selection.png b/tensorflow/lite/g3doc/images/ios/device_selection.png
new file mode 100644
index 0000000000000000000000000000000000000000..1565fa0f2b65d48f4308ba50caacf745125c7431
Binary files /dev/null and b/tensorflow/lite/g3doc/images/ios/device_selection.png differ
diff --git a/tensorflow/lite/g3doc/images/landing-page/facial_contour_detection.png b/tensorflow/lite/g3doc/images/landing-page/facial_contour_detection.png
new file mode 100644
index 0000000000000000000000000000000000000000..27bb49826ff3246bd5a971e7ecd0926121dbf749
Binary files /dev/null and b/tensorflow/lite/g3doc/images/landing-page/facial_contour_detection.png differ
diff --git a/tensorflow/lite/g3doc/models/_index.yaml b/tensorflow/lite/g3doc/models/_index.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..5cd679cd614ac53fb11f172cdea25416455ab3ed
--- /dev/null
+++ b/tensorflow/lite/g3doc/models/_index.yaml
@@ -0,0 +1,93 @@
+book_path: /lite/_book.yaml
+project_path: /lite/_project.yaml
+description: <!--no description-->
+landing_page:
+  custom_css_path: /site-assets/css/style.css
+  nav: left
+  rows:
+  - classname: devsite-landing-row-100
+    heading: Optimized models for common mobile and edge use cases
+    items:
+    - description: >
+        Take state-of-the-art optimized research models and easily deploy them to mobile and edge devices.
+      buttons:
+        - label: Skip to full models repo
+          path: /lite/models/hosted
+
+  - background: grey
+    items:
+    - heading: Optimized for edge
+      description: >
+        Edge has many constraints, such as limited memory, speed, and power. These models are specifically designed to run on-device.
+    - heading: Built by TensorFlow
+      description: >
+        These models were optimized by the talented TensorFlow Lite team. Breathe easy.
+    - heading: Customizable
+      description: >
+        The models were trained with certain labels in mind. With transfer learning, you can customize them to fit your needs.
+
+  - classname: devsite-landing-row-cards
+    heading: Solutions to common problems
+    items:
+    - heading: "Image classification"
+      description: >
+        Identify hundreds of objects, including people, activities, animals, plants, and places.
+      image_path: /resources/images/tf-logo-card-16x9.png
+      path: /lite/models/image_classification/overview
+      buttons:
+      - label: Use the model
+        path: /lite/models/image_classification/overview
+    - heading: "Object detection"
+      description: >
+        Detect multiple objects with bounding boxes. Yes, dogs and cats, too.
+      image_path: /resources/images/tf-logo-card-16x9.png
+      path: /lite/models/object_detection/overview
+      buttons:
+      - label: Use the model
+        path: /lite/models/object_detection/overview
+    - heading: "Speech recognition"
+      description: >
+        “Okay, Google.” How about, “Okay, custom keyword spotting?”
+      image_path: /resources/images/tf-logo-card-16x9.png
+      path: /lite/models/speech_recognition/overview
+      buttons:
+      - label: Use the model
+        path: /lite/models/speech_recognition/overview
+
+  - classname: devsite-landing-row-cards
+    items:
+    - heading: "Pose estimation"
+      description: >
+        Estimate poses for single or multiple people. Imagine the possibilities, including stick figure dance parties.
+      image_path: /resources/images/tf-logo-card-16x9.png
+      path: /lite/models/pose_estimation/overview
+      buttons:
+      - label: Use the model
+        path: /lite/models/pose_estimation/overview
+    - heading: "Segmentation"
+      description: >
+        Pinpoint the shape of objects with strict localization accuracy and semantic labels. Trained with people, places, animals, and more.
+      image_path: /resources/images/tf-logo-card-16x9.png
+      path: /lite/models/segmentation/overview
+      buttons:
+      - label: Use the model
+        path: /lite/models/segmentation/overview
+    - heading: "Smart reply"
+      description: >
+        Generate reply suggestions to input conversational chat messages.
+      image_path: /resources/images/tf-logo-card-16x9.png
+      path: /lite/models/smart_reply/overview
+      buttons:
+      - label: Use the model
+        path: /lite/models/smart_reply/overview
+
+  - classname: devsite-landing-row-large-headings
+    foreground: theme
+    items:
+    - heading: Benefits of pre-trained models
+      description: >
+        The best way to get started is to used pre-trained models because they
+        allow you to save time and money, leverage learnings from research scientists,
+        and avoid expensive datasets. Instead of having to start from scratch, you
+        could use an existing model and just retrain the last few layers of the
+        neural network to suit your needs. This process is called transfer learning.
diff --git a/tensorflow/lite/g3doc/models.md b/tensorflow/lite/g3doc/models/hosted.md
similarity index 99%
rename from tensorflow/lite/g3doc/models.md
rename to tensorflow/lite/g3doc/models/hosted.md
index 62b3f17c79aa3688011a1452da18e098008f414e..84421e1fc4b5892e0f5c27888f81abcca1c05ba0 100644
--- a/tensorflow/lite/g3doc/models.md
+++ b/tensorflow/lite/g3doc/models/hosted.md
@@ -1,5 +1,4 @@
-
-# List of Hosted Models
+# Hosted models
 
 # AutoML mobile image classification models (Float Models)
 
diff --git a/tensorflow/lite/g3doc/models/image_classification/android.md b/tensorflow/lite/g3doc/models/image_classification/android.md
new file mode 100644
index 0000000000000000000000000000000000000000..61606096f77ce810c2b1a686cd05599b35200f57
--- /dev/null
+++ b/tensorflow/lite/g3doc/models/image_classification/android.md
@@ -0,0 +1,206 @@
+# TensorFlow Lite Android Image Classifier App Example
+
+This tutorial provides a simple Android mobile application to classify images
+using the Android device camera. In this tutorial, you will download the demo
+application from the Tensorflow repository, build it on your computer, and
+install it on your Android device. You will also learn how to customize the
+application to suit your requirements.
+
+### Prerequisites
+
+*   Android Studio 3.2 (installed on a Linux, Mac or Windows machine)
+
+*   Android device
+
+*   USB cable (to connect Android device to your computer)
+
+### Step 1. Clone the TensorFlow source code
+
+Clone the GitHub repository to your computer to get the demo application.
+
+```
+
+git clone https://github.com/tensorflow/tensorflow
+
+```
+
+Open the TensorFlow source code in Android Studio. To do this, open Android
+Studio and select `Open an existing project` setting the folder to
+`tensorflow/lite/examples/android`
+
+<img src="images/classifydemo_img1.png" />
+
+This folder contains the demo application for image classification, object
+detection, and speech hotword detection.
+
+### Step 2. Build the Android Studio project
+
+Select `Build -> Make Project` and check that the project builds
+successfully. You will need Android SDK configured in the settings. You'll need
+at least SDK version 23. The gradle file will prompt you to download any missing
+libraries.
+
+<img src="images/classifydemo_img4.png" style="width: 40%" />
+
+<img src="images/classifydemo_img2.png" style="width: 60%" />
+
+#### TensorFlow Lite AAR from JCenter:
+
+Note that the `build.gradle` is configured to use TensorFlow Lite's nightly
+build.
+
+If you see a build error related to compatibility with Tensorflow Lite's Java
+API (example: method X is undefined for type Interpreter), there has likely been
+a backwards compatible change to the API. You will need to pull new app code
+that's compatible with the nightly build by running `git pull`.
+
+### Step 3. Install and run the app
+
+Connect the Android device to the computer and be sure to approve any ADB
+permission prompts that appear on your phone. Select `Run -> Run app.` Select
+the deployment target in the connected devices to the device on which the app will
+be installed. This will install the app on the device.
+
+<img src="images/classifydemo_img5.png" style="width: 60%" />
+
+<img src="images/classifydemo_img6.png" style="width: 70%" />
+
+<img src="images/classifydemo_img7.png" style="width: 40%" />
+
+<img src="images/classifydemo_img8.png" style="width: 80%" />
+
+To test the app, open the app called `TFL Classify` on your device. When you run
+the app the first time, the app will request permission to access the camera.
+Re-installing the app may require you to uninstall the previous installations.
+
+## Understanding Android App Code
+
+### Get camera input
+
+This mobile application gets the camera input using the functions defined in the
+file CameraActivity.java in the folder
+`tensorflow/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/CameraActivity.java.`
+This file depends on `AndroidManifest.xml` in the folder
+`tensorflow/tensorflow/lite/examples/android/app/src/main` to set the camera
+orientation.
+
+### Pre-process bitmap image
+
+The mobile application code that pre-processes the images and runs inference is
+in
+`tensorflow/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/TFLiteImageClassifier.java.`
+Here, we take the input camera bitmap image and convert it to a Bytebuffer
+format for efficient processing. We pre-allocate the memory for ByteBuffer
+object based on the image dimensions because Bytebuffer objects can't infer the
+object shape.
+
+```
+c.imgData =
+ByteBuffer.allocateDirect( DIM_BATCH_SIZE * DIM_IMG_SIZE_X * DIM_IMG_SIZE_Y *
+DIM_PIXEL_SIZE);
+c.imgData.order(ByteOrder.nativeOrder());
+```
+
+While running the application, we pre-process the incoming bitmap images from the
+camera to a Bytebuffer. Since this model is quantized 8-bit, we will put a
+single byte for each channel. `imgData` will contain an encoded `Color` for each
+pixel in ARGB format, so we need to mask the least significant 8 bits to get
+blue, and next 8 bits to get green and next 8 bits to get blue, and we have an
+opaque image so alpha can be ignored.
+
+```
+ imgData.rewind();
+ bitmap.getPixels(intValues, 0, bitmap.getWidth(), 0, 0, bitmap.getWidth(), bitmap.getHeight());
+ // Convert the image to floating point.
+ int pixel = 0;
+ for (int i = 0; i < DIM_IMG_SIZE_X; ++i) {
+   for (int j = 0; j < DIM_IMG_SIZE_Y; ++j) {
+     final int val = intValues[pixel++];
+     imgData.put((byte) ((val >> 16) & 0xFF));
+     imgData.put((byte) ((val >> 8) & 0xFF));
+     imgData.put((byte) (val & 0xFF));
+     }
+  }
+```
+
+### Create interpreter
+
+To create the interpreter, we need to load the model file. In Android devices,
+we recommend pre-loading and memory mapping the model file as shown below to
+offer faster load times and reduce the dirty pages in memory. If your model file
+is compressed, then you will have to load the model as a `File`, as it cannot be
+directly mapped and used from memory.
+
+```
+// Memory-map the model file
+AssetFileDescriptor fileDescriptor = assets.openFd(modelFilename);
+FileInputStream inputStream = new
+FileInputStream(fileDescriptor.getFileDescriptor()); FileChannel fileChannel =
+inputStream.getChannel(); long startOffset = fileDescriptor.getStartOffset();
+long declaredLength = fileDescriptor.getDeclaredLength(); return
+fileChannel.map(FileChannel.MapMode.READ_ONLY, startOffset, declaredLength);
+```
+
+Then, create the interpreter object using `new Interpreter()` that takes the
+model file as argument as shown below.
+
+```
+// Create Interpreter
+c.tfLite = new Interpreter(loadModelFile(assetManager, modelFilename));
+```
+
+### Run inference
+
+The output of the inference is stored in a byte array `labelprob.` We
+pre-allocate the memory for the output buffer. Then, we run inference on the
+interpreter object using function `run()` that takes input and output buffers as
+arguments.
+
+```
+// Pre-allocate output buffers.
+c.labelProb = new byte[1][c.labels.size()];
+// Run Inference
+tfLite.run(imgData, labelProb);
+```
+
+### Post-process values
+
+Finally, we find the best set of classifications by storing them in a priority
+queue based on their confidence scores.
+
+```
+// Find the best classifications
+PriorityQueue<Recognition> pq = ...
+for (int i = 0; i < labels.size(); ++i)
+{
+  pq.add( new Recognition( ' '+ i,
+  labels.size() > i ? labels.get(i) : unknown,
+  (float) labelProb[0][i], null));
+}
+```
+
+And we display up to MAX_RESULTS number of classifications in the application,
+where Recognition is a generic class defined in `Classifier.java` that contains
+the following information of the classified object: id, title, label, and its
+location when the model is an object detection model.
+
+```
+// Display the best classifications
+final ArrayList<Recognition> recognitions =
+  new ArrayList<Recognition>();
+int recognitionsSize = Math.min(pq.size(), MAX_RESULTS);
+for (int i = 0; i < recognitionsSize; ++i) {
+  recognitions.add(pq.poll());
+}
+```
+
+### Load onto display
+
+We render the results on the Android device screen using the following lines in
+`processImage()` function in `ClassifierActivity.java` which uses the UI defined
+in `RecognitionScoreView.java.`
+
+```
+resultsView.setResults(results);
+requestRender();
+```
diff --git a/tensorflow/lite/g3doc/models/image_classification/images/build_and_execute.png b/tensorflow/lite/g3doc/models/image_classification/images/build_and_execute.png
new file mode 100644
index 0000000000000000000000000000000000000000..a305350cb7b413b70d2095cfd8fea6d396c63695
Binary files /dev/null and b/tensorflow/lite/g3doc/models/image_classification/images/build_and_execute.png differ
diff --git a/tensorflow/lite/g3doc/models/image_classification/images/bundle_identifier.png b/tensorflow/lite/g3doc/models/image_classification/images/bundle_identifier.png
new file mode 100644
index 0000000000000000000000000000000000000000..398763916b353e61f236392e2b8898aad2aafe8e
Binary files /dev/null and b/tensorflow/lite/g3doc/models/image_classification/images/bundle_identifier.png differ
diff --git a/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img1.png b/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img1.png
new file mode 100644
index 0000000000000000000000000000000000000000..916639c067081b5a193f479d6a9ce61239fc0c6e
Binary files /dev/null and b/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img1.png differ
diff --git a/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img2.png b/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img2.png
new file mode 100644
index 0000000000000000000000000000000000000000..366ec834a842fa8030369d35d21126cf22a93d5c
Binary files /dev/null and b/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img2.png differ
diff --git a/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img4.png b/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img4.png
new file mode 100644
index 0000000000000000000000000000000000000000..360b843c9430bb39191cf7e49adaaada5f372338
Binary files /dev/null and b/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img4.png differ
diff --git a/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img5.png b/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img5.png
new file mode 100644
index 0000000000000000000000000000000000000000..d6192ae9a76d78479fed168e48429c1f96d13593
Binary files /dev/null and b/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img5.png differ
diff --git a/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img6.png b/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img6.png
new file mode 100644
index 0000000000000000000000000000000000000000..4216153d3886ee814f9e13657795815fac280dce
Binary files /dev/null and b/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img6.png differ
diff --git a/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img7.png b/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img7.png
new file mode 100644
index 0000000000000000000000000000000000000000..034eedbc1e5370f597b5b6d95564efbf66074dcc
Binary files /dev/null and b/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img7.png differ
diff --git a/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img8.png b/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img8.png
new file mode 100644
index 0000000000000000000000000000000000000000..940395346510815a7d0454ddc5e47eca8c5be6cd
Binary files /dev/null and b/tensorflow/lite/g3doc/models/image_classification/images/classifydemo_img8.png differ
diff --git a/tensorflow/lite/g3doc/models/image_classification/images/device_selection.png b/tensorflow/lite/g3doc/models/image_classification/images/device_selection.png
new file mode 100644
index 0000000000000000000000000000000000000000..1565fa0f2b65d48f4308ba50caacf745125c7431
Binary files /dev/null and b/tensorflow/lite/g3doc/models/image_classification/images/device_selection.png differ
diff --git a/tensorflow/lite/g3doc/models/image_classification/images/dog.png b/tensorflow/lite/g3doc/models/image_classification/images/dog.png
new file mode 100644
index 0000000000000000000000000000000000000000..65c6eb551468be3b53dc805009688c7b5808c660
Binary files /dev/null and b/tensorflow/lite/g3doc/models/image_classification/images/dog.png differ
diff --git a/tensorflow/lite/g3doc/models/image_classification/ios.md b/tensorflow/lite/g3doc/models/image_classification/ios.md
new file mode 100644
index 0000000000000000000000000000000000000000..63e3abd779355b842964ae8836f24a1cd7a8832f
--- /dev/null
+++ b/tensorflow/lite/g3doc/models/image_classification/ios.md
@@ -0,0 +1,229 @@
+# TensorFlow Lite iOS Image Classifier App Example
+
+This tutorial provides a simple iOS mobile application to classify images using
+the iOS device camera. In this tutorial, you will download the demo application
+from the Tensorflow repository, build it on your computer, and install it on
+your iOS Device. You will also learn how to customize the application to suit
+your needs.
+
+## Prerequisites
+
+*   You must have [Xcode](https://developer.apple.com/xcode/) installed and have
+    a valid Apple Developer ID, and have an iOS device set up and linked to your
+    developer account with all of the appropriate certificates. For these
+    instructions, we assume that you have already been able to build and deploy
+    an app to an iOS device with your current developer environment.
+
+*   The demo app requires a camera and must be executed on a real iOS device.
+    You can build it and run with the iPhone Simulator but it won't have any
+    camera information to classify.
+
+*   You don't need to build the entire TensorFlow library to run the demo, but
+    you will need to clone the TensorFlow repository if you haven't already:
+
+        git clone https://github.com/tensorflow/tensorflow
+        cd tensorflow
+
+*   You'll also need the Xcode command-line tools:
+
+        xcode-select --install
+
+    If this is a new install, you will need to run the Xcode application once to
+    agree to the license before continuing.
+
+*   Install CocoaPods if you don't have it:
+
+        sudo gem install cocoapods
+
+### Step 1. Clone the TensorFlow source code
+
+lone the GitHub repository onto your computer to get the
+demo application.
+
+```
+git clone https://github.com/tensorflow/tensorflow
+```
+
+### Step 2. Download required dependencies
+
+Execute the shell script to download the model files used by the demo app (this
+is done from inside the cloned directory):
+
+```
+    tensorflow/lite/examples/ios/download_models.sh
+```
+
+Run the following command to install TensorFlow Lite pod:
+
+```
+    cd tensorflow/lite/examples/ios/camera
+    pod install
+```
+
+If you have installed this pod before and that command doesn't work, try
+
+```
+    pod repo update
+```
+
+### Step 3. Build the XCode project
+
+Open the `tflite_camera_example.xcworkspace` project file generated in the last
+step:
+
+```
+    open tflite_camera_example.xcworkspace
+```
+
+Under `Project navigator -> tflite_camera_example -> Targets ->
+tflite_camera_example -> General` change the bundle identifier by pre-pending
+your name:
+
+![pre-pend your name to the bundle identifier](images/bundle_identifier.png)
+
+Plug in your iOS device. Note that the app must be executed with a real device with
+a camera. Select the iOS device from the drop-down menu.
+
+![Device selection](images/device_selection.png)
+
+Click the "Run" button to build and run the app
+
+![Build and execute](images/build_and_execute.png)
+
+Note that, as mentioned earlier, you must already have a device set up and linked
+to your Apple Developer account in order to deploy the app onto a device.
+
+You'll have to grant permissions for the app to use the device's camera. Point
+the camera at various objects and enjoy seeing how the model classifies things!
+
+## Understanding iOS App Code
+
+### Get camera input
+
+The main logic of this app is in the Objective C++ source file
+`tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm`.
+
+The `setupAVCapture` method constructs a `AVCaptureSession` and set itself as a
+delegate. The `captureOutput:didOutputSampleBuffer:fromConnection:` method is
+called for every captured frame. It calls `runModelOnFrame` to run the model for
+every frame.
+
+### Create an interpreter
+
+To create the interpreter, we need to load the model file. The following code
+will load a model and create an interpreter.
+
+```
+model = tflite::FlatBufferModel::BuildFromFile([graph_path UTF8String]);
+```
+
+Behind the scenes, the model is loaded as a memory-mapped file. It offers faster
+load times and reduce the dirty pages in memory.
+
+Construct a `BuiltinOpResolver` to use the TensorFliw Lite buildin ops. Then,
+create the interpreter object using `InterpreterBuilder` that takes the model
+file as argument as shown below.
+
+```
+tflite::ops::builtin::BuiltinOpResolver resolver;
+tflite::InterpreterBuilder(*model, resolver)(&interpreter);
+```
+
+### Obtain the input buffer
+
+By default, the app uses a quantized model since it's smaller and faster. The
+buffer is a raw pointer to an array of 8 bit unsigned integers (`uint8_t`). The
+following code obtains the input buffer from the interpreter:
+
+```
+// Get the index of first input tensor.
+int input_tensor_index = interpreter->inputs()[0];
+// Get the pointer to the input buffer.
+uint8_t* buffer = interpreter->typed_tensor<uint8_t>(input_tensor_index);
+```
+
+Throughout this document, it's assumed that a quantized model is used.
+
+### Pre-process bitmap image
+
+The MobileNet model that we're using takes 224x224x3 inputs, where the dimensions are
+width, height, and colors (RGB). The images returned from `AVCaptureSession` is
+bigger and has 4 color channels (RGBA).
+
+Many image classification models (like MobileNet) take fixe-sized inputs. It's
+required to scale or crop the image before feeding it into the model and change
+the channels from RGBA to RGB.
+
+The code to pre-process the images is in `ProcessInputWithQuantizedModel`
+function in
+`tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm`. It's a
+simple implementation for nearest neighbor color sampling and it only copies
+the first 3 bytes for each pixel.
+
+```
+void ProcessInputWithQuantizedModel(
+    uint8_t* input, uint8_t* output, int image_width, int image_height, int image_channels) {
+  for (int y = 0; y < wanted_input_height; ++y) {
+    uint8_t* out_row = output + (y * wanted_input_width * wanted_input_channels);
+    for (int x = 0; x < wanted_input_width; ++x) {
+      const int in_x = (y * image_width) / wanted_input_width;
+      const int in_y = (x * image_height) / wanted_input_height;
+      uint8_t* in_pixel = input + (in_y * image_width * image_channels) + (in_x * image_channels);
+      uint8_t* out_pixel = out_row + (x * wanted_input_channels);
+      for (int c = 0; c < wanted_input_channels; ++c) {
+        out_pixel[c] = in_pixel[c];
+      }
+    }
+  }
+}
+```
+
+Note that the code pre-processes and prepares the model input from the camera
+data. Therefore, the first parameter `input` should be the camera buffer. The
+second parameter `output` should be the buffer of model input.
+
+### Run inference and obtain output buffer
+
+After pre-processing and filling the data into the input buffer of the
+interpreter, it's really easy to run the interpreter:
+
+```
+if (interpreter->Invoke() != kTfLiteOk) {
+  NSLog("Failed to invoke!");
+}
+```
+
+The result is stored in the output tensor buffer of the interpreter. The
+following code obtains the pointer to the buffer:
+
+```
+// Get the index of first output tensor.
+const int output_tensor_index = interpreter->outputs()[0];
+// Get the pointer to the output buffer.
+uint8_t* buffer = interpreter->typed_tensor<uint8_t>(output_tensor_index);
+```
+
+### Post-process values
+
+The output buffer contains an array of `uint8_t`, and the value range is from 0-255.
+We need to convert the value to float to get the probabilities with a value range from
+0.0-1.0. The formula of the quantization value mapping is:
+
+    float_value = (quantized_value - zero_point) * scale
+
+The following code converts quantized values back to float values, using the
+quantizaiton parameters in tensors:
+
+```
+uint8_t* quantized_output = interpreter->typed_output_tensor<uint8_t>(0);
+int32_t zero_point = input_tensor->params.zero_point;
+float scale = input_tensor->params.scale;
+float output[output_size];
+for (int i = 0; i < output_size; ++i) {
+  output[i] = (quantized_output[i] - zero_point) * scale;
+}
+```
+
+Finally, we find the best set of classifications by storing them in a priority
+queue based on their confidence scores. See the `GetTopN` function in
+`tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm`.
diff --git a/tensorflow/lite/g3doc/models/image_classification/overview.md b/tensorflow/lite/g3doc/models/image_classification/overview.md
new file mode 100644
index 0000000000000000000000000000000000000000..3ffaf33d35957ddac8809ab5b82c771f5cc032b2
--- /dev/null
+++ b/tensorflow/lite/g3doc/models/image_classification/overview.md
@@ -0,0 +1,211 @@
+# Image classification
+<img src="../images/image.png" class="attempt-right">
+
+Use a pre-trained and optimized model to identify hundreds of classes of objects, including people, activities, animals, plants, and places.
+
+## Get started
+
+If you are unfamiliar with the concept of image classification, you should start by reading <a href="#what_is_image_classification">What is image classification?</a>
+
+If you understand image classification, you’re new to TensorFlow Lite, and you’re working with Android or iOS, we recommend following the corresponding tutorial that will walk you through our sample code.
+
+<a class="button button-primary" href="android">Android</a>
+<a class="button button-primary" href="ios">iOS</a>
+
+If you are using a platform other than Android or iOS, or you are already familiar with the <a href="https://www.tensorflow.org/lite/apis">TensorFlow Lite APIs</a>, you can download our starter image classification model and the accompanying labels.
+
+Once you have the starter model running on your target device, you can experiment with different models to find the optimal balance between performance, accuracy, and model size. For guidance, see Choose a different model.
+
+
+If you are using a platform other than Android or iOS, or you are already familiar with the <a href="../apis">TensorFlow Lite APIs</a>, you can download our starter image classification model and the accompanying labels.
+
+<a class="button button-primary" href="">Download starter model and labels</a>
+
+## What is image classification?
+A common use of machine learning is to identify what an image represents. For example, we might want to know what type of animal appears in the following photograph.
+
+<img src="images/dog.png" alt="dog" width="50%">
+
+The task of predicting what an image represents is called image classification. An image classification model is trained to recognize various classes of images. For example, a model might be trained to recognize photos representing three different types of animals: rabbits, hamsters, and dogs.
+
+When we subsequently provide a new image as input to the model, it will output the probabilities of the image representing each of the types of animal it was trained on. An example output might be as follows:
+
+<table style="width: 40%;">
+  <thead>
+    <tr>
+      <th>Animal type</th>
+      <th>Probability</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>Rabbit</td>
+      <td>0.07</td>
+    </tr>
+    <tr>
+      <td>Hamster</td>
+      <td>0.02</td>
+    </tr>
+    <tr>
+      <td style="background-color: #fcb66d;">Dog</td>
+      <td style="background-color: #fcb66d;">0.91</td>
+    </tr>
+  </tbody>
+</table>
+
+Based on the output, we can see that the classification model has predicted that the image has a high probability of representing a dog.
+
+Note: Image classification can only tell you the probability that an image represents one or more of the classes that the model was trained on. It cannot tell you the position or identity of objects within the image. If you need to identify objects and their positions within images, you should use an <a href="object_detection">object detection</a> model.
+
+### Training, labels, and inference
+
+During training, an image classification model is fed images and their associated labels. Each label is the name of a distinct concept, or class, that the model will learn to recognize. Here are some examples of labels and training data for our hypothetical model that classifies animal photos:
+
+<table>
+  <thead>
+    <tr>
+      <th>Label</th>
+      <th>Training data</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>rabbit</td>
+      <td>[three different images of rabbits]</td>
+    </tr>
+    <tr>
+      <td>hamster</td>
+      <td>[three different images of hamsters]</td>
+    </tr>
+    <tr>
+      <td>dog</td>
+      <td>[three different images of dogs]</td>
+    </tr>
+  </tbody>
+</table>
+
+Given sufficient training data (often hundreds or thousands of images per label), an image classification model can learn to predict whether new images belong to any of the classes it has been trained on. This process of prediction is called inference.
+
+To perform inference, an image is passed as input to a model. The model will then output an array of probabilities between 0 and 1. With our example model, this process might look like the following:
+
+<table style="width: 60%">
+  <tr style="border-top: 0px;">
+    <td style="width: 40%"><img src="images/dog.png" alt="dog"></td>
+    <td style="width: 20%; font-size: 2em; vertical-align: middle;">→</td>
+    <td style="width: 40%; vertical-align: middle;">[0.07, 0.02, 0.91]</td>
+</table>
+
+Each number in the output corresponds to a label in our training data. Associating our output with the three labels the model was trained on, we can see the model has predicted a high probability that the image represents a dog.
+
+<table style="width: 40%;">
+  <thead>
+    <tr>
+      <th>Label</th>
+      <th>Probability</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>rabbit</td>
+      <td>0.07</td>
+    </tr>
+    <tr>
+      <td>hamster</td>
+      <td>0.02</td>
+    </tr>
+    <tr>
+      <td style="background-color: #fcb66d;">dog</td>
+      <td style="background-color: #fcb66d;">0.91</td>
+    </tr>
+  </tbody>
+</table>
+
+You might notice that the sum of all the probabilities (for rabbit, hamster, and dog) is equal to 1. This is a common type of output for models with multiple classes (see <a href="https://developers.google.com/machine-learning/crash-course/multi-class-neural-networks/softmax">Softmax</a> for more information).
+
+### Ambiguous results
+
+Since the probabilities will always sum to 1, if the image is not confidently recognized as belonging to any of the classes the model was trained on you may see the probability distributed throughout the labels without any one value being significantly larger.
+
+For example, the following might indicate an ambiguous result:
+
+<table style="width: 40%;">
+  <thead>
+    <tr>
+      <th>Label</th>
+      <th>Probability</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>rabbit</td>
+      <td>0.31</td>
+    </tr>
+    <tr>
+      <td>hamster</td>
+      <td>0.35</td>
+    </tr>
+    <tr>
+      <td>dog</td>
+      <td>0.34</td>
+    </tr>
+  </tbody>
+</table>
+
+### Uses and limitations
+
+The image classification models that we provide are useful for single-label classification, which means predicting which single label the image is most likely to represent. They are trained to recognize 1000 classes of image. For a full list of classes, see the labels file.
+
+If you want to train a model to recognize new classes, see <a href="#customize_model">Customize model</a>.
+
+For the following use cases, you should use a different type of model:
+
+<ul>
+  <li>Predicting the type and position of one or more objects within an image (see <a href="object_detection">object detection</a>)</li>
+  <li>Predicting the composition of an image, for example subject versus background (see <a href="segmentation">segmentation</a>)</li>
+</ul>
+
+Once you have the starter model running on your target device, you can experiment with different models to find the optimal balance between performance, accuracy, and model size. For guidance, see <a href="#choose_a_different_model">Choose a different model</a>.
+
+## Choose a different model
+
+There are a large number of image classification models available on our List of hosted models. You should aim to choose the optimal model for your application based on performance, accuracy and model size. There are trade-offs between each of them.
+
+### Performance
+
+We measure performance in terms of the amount of time it takes for a model to run inference on a given piece of hardware. The less time, the faster the model.
+
+The performance you require depends on your application. Performance can be important for applications like real-time video, where it may be important to analyze each frame in the time before the next frame is drawn (e.g. inference must be faster than 33ms to perform real-time inference on a 30fps video stream).
+
+Our quantized Mobilenet models’ performance ranges from 3.7ms to 80.3 ms.
+
+### Accuracy
+We measure accuracy in terms of how often the model correctly classifies an image. For example, a model with a stated accuracy of 60% can be expected to classify an image correctly an average of 60% of the time.
+
+Our List of hosted models provides Top-1 and Top-5 accuracy statistics. Top-1 refers to how often the correct label appears as the label with the highest probability in the model’s output. Top-5 refers to how often the correct label appears in the top 5 highest probabilities in the model’s output.
+
+Our quantized Mobilenet models’ Top-5 accuracy ranges from 64.4 to 89.9%.
+
+### Size
+The size of a model on-disk varies with its performance and accuracy. Size may be important for mobile development (where it might impact app download sizes) or when working with hardware (where available storage might be limited).
+
+Our quantized Mobilenet models’ size ranges from 0.5 to 3.4 Mb.
+
+### Architecture
+There are several different architectures of models available on List of hosted models, indicated by the model’s name. For example, you can choose between Mobilenet, Inception, and others.
+
+The architecture of a model impacts its performance, accuracy, and size. All of our hosted models are trained on the same data, meaning you can use the provided statistics to compare them and choose which is optimal for your application.
+
+Note: The image classification models we provide accept varying sizes of input. For some models, this is indicated in the filename. For example, the Mobilenet_V1_1.0_224 model accepts an input of 224x224 pixels. <br /><br />All of the models require three color channels per pixel (red, green, and blue). Quantized models require 1 byte per channel, and float models require 4 bytes per channel.<br /><br />Our Android and iOS code samples demonstrate how to process full-sized camera images into the required format for each model.
+
+## Customize model
+The pre-trained models we provide are trained to recognize 1000 classes of image. For a full list of classes, see the labels file.
+
+You can use a technique known as transfer learning to re-train a model to recognize classes not in the original set. For example, you could re-train the model to distinguish between different species of tree, despite there being no trees in the original training data. To do this, you will need a set of training images for each of the new labels you wish to train.
+
+Learn how to perform transfer learning in the TensorFlow for Poets codelab.
+
+## Read more about this
+<ul>
+  <li>Blog post:</li>
+  <li>Image classification GitHub:</li>
+</ul>
diff --git a/tensorflow/lite/g3doc/models/images/audio.png b/tensorflow/lite/g3doc/models/images/audio.png
new file mode 100644
index 0000000000000000000000000000000000000000..ce6b25c442016a21600eb8249eafa55bacbba4e9
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/audio.png differ
diff --git a/tensorflow/lite/g3doc/models/images/blank.png b/tensorflow/lite/g3doc/models/images/blank.png
new file mode 100644
index 0000000000000000000000000000000000000000..d099da5da07271410883554e07e37765ca048590
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/blank.png differ
diff --git a/tensorflow/lite/g3doc/models/images/camera.png b/tensorflow/lite/g3doc/models/images/camera.png
new file mode 100644
index 0000000000000000000000000000000000000000..95a9218d47864aba12255bd32b67bb74b0d6704e
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/camera.png differ
diff --git a/tensorflow/lite/g3doc/models/images/detection.png b/tensorflow/lite/g3doc/models/images/detection.png
new file mode 100644
index 0000000000000000000000000000000000000000..30e10f59cd53af21fe9b6a86aa5b45ca07131b1b
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/detection.png differ
diff --git a/tensorflow/lite/g3doc/models/images/image.png b/tensorflow/lite/g3doc/models/images/image.png
new file mode 100644
index 0000000000000000000000000000000000000000..e72aac9b25eec69e8c0252f441d125340b88cab5
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/image.png differ
diff --git a/tensorflow/lite/g3doc/models/images/object.png b/tensorflow/lite/g3doc/models/images/object.png
new file mode 100644
index 0000000000000000000000000000000000000000..aa8ed428ed15e7b166bdde560669563a224e6f6c
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/object.png differ
diff --git a/tensorflow/lite/g3doc/models/images/output_stride.png b/tensorflow/lite/g3doc/models/images/output_stride.png
new file mode 100644
index 0000000000000000000000000000000000000000..5d4663f8675eef733e18b2a5cb05670cd40d8293
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/output_stride.png differ
diff --git a/tensorflow/lite/g3doc/models/images/pose.png b/tensorflow/lite/g3doc/models/images/pose.png
new file mode 100644
index 0000000000000000000000000000000000000000..f071d789963d0f48efb5ba20633391403f75ddf8
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/pose.png differ
diff --git a/tensorflow/lite/g3doc/models/images/segmentation.png b/tensorflow/lite/g3doc/models/images/segmentation.png
new file mode 100644
index 0000000000000000000000000000000000000000..8c61330687cc9a388a443bc6b771027d15b66d98
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/segmentation.png differ
diff --git a/tensorflow/lite/g3doc/models/images/sentiment.png b/tensorflow/lite/g3doc/models/images/sentiment.png
new file mode 100644
index 0000000000000000000000000000000000000000..2ba494fcb6e62a90015d2aead4779fcacab70529
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/sentiment.png differ
diff --git a/tensorflow/lite/g3doc/models/images/smart_reply.png b/tensorflow/lite/g3doc/models/images/smart_reply.png
new file mode 100644
index 0000000000000000000000000000000000000000..802cc80feebe2a46b059b23d52ccf794701e4d99
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/smart_reply.png differ
diff --git a/tensorflow/lite/g3doc/models/images/tabular.png b/tensorflow/lite/g3doc/models/images/tabular.png
new file mode 100644
index 0000000000000000000000000000000000000000..2eac8f4c4ac74029c755a207b1f8a25592f468ac
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/tabular.png differ
diff --git a/tensorflow/lite/g3doc/models/images/text.png b/tensorflow/lite/g3doc/models/images/text.png
new file mode 100644
index 0000000000000000000000000000000000000000..227594f07e3d38fd4110249eb2c4c6541fb89baa
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/text.png differ
diff --git a/tensorflow/lite/g3doc/models/images/tflite_models.png b/tensorflow/lite/g3doc/models/images/tflite_models.png
new file mode 100644
index 0000000000000000000000000000000000000000..f60cd26a3177f95e40875ed92aa4a30c59a7623f
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/tflite_models.png differ
diff --git a/tensorflow/lite/g3doc/models/images/video.png b/tensorflow/lite/g3doc/models/images/video.png
new file mode 100644
index 0000000000000000000000000000000000000000..88b3b7d3c76840625abec821220413a03d384a45
Binary files /dev/null and b/tensorflow/lite/g3doc/models/images/video.png differ
diff --git a/tensorflow/lite/g3doc/models/object_detection/overview.md b/tensorflow/lite/g3doc/models/object_detection/overview.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f62d017bfd7dbfdfdeee9757b12775ede227a8c
--- /dev/null
+++ b/tensorflow/lite/g3doc/models/object_detection/overview.md
@@ -0,0 +1,220 @@
+# Object detection
+<img src="../images/detection.png" class="attempt-right">
+
+Detect multiple objects with bounding boxes. Yes, dogs and cats too.
+
+<a class="button button-primary" href="http://storage.googleapis.com/download.tensorflow.org/models/tflite/coco_ssd_mobilenet_v1_1.0_quant_2018_06_29.zip">Download starter model and labels</a>
+
+## Tutorials (coming soon)
+<a class="button button-primary" href="">iOS</a>
+<a class="button button-primary" href="">Android</a>
+
+## What is object detection?
+Given an image or a video stream, an object detection model can identify which of a known set of objects might be present and provide information about their positions within the image.
+
+<!-- TODO -->
+For example, this screenshot of our <a href="">object detection sample app</a> shows how several objects have been recognized and their positions annotated:
+
+
+<!-- TODO -->
+TODO: Insert image
+
+An object detection model is trained to detect the presence and location of multiple classes of objects. For example, a model might be trained with images that contain various pieces of computer hardware, along with a label that specifies the class of hardware they represent (e.g. a laptop, a keyboard, or a monitor), and data specifying where each object appears in the image.
+
+When we subsequently provide an image to the model, it will output a list of the objects it detects, the location of a bounding box that contains each object, and a score that indicates the confidence that detection was correct.
+
+### Model output
+
+<table style="width: 60%;">
+  <thead>
+    <tr>
+      <th>Class</th>
+      <th>Score</th>
+      <th>Location</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>Laptop</td>
+      <td>0.92</td>
+      <td>[18, 21, 57, 63]</td>
+    </tr>
+    <tr>
+      <td>Keyboard</td>
+      <td>0.88</td>
+      <td>[100, 30, 180, 150]</td>
+    </tr>
+    <tr>
+      <td>Monitor</td>
+      <td>0.87</td>
+      <td>[7, 82, 89, 163] </td>
+    </tr>
+    <tr>
+      <td>Keyboard</td>
+      <td>0.23</td>
+      <td>[42, 66, 57, 83]</td>
+    </tr>
+    <tr>
+      <td>Monitor</td>
+      <td>0.11</td>
+      <td>[6, 42, 31, 58]</td>
+    </tr>
+  </tbody>
+</table>
+
+### Confidence score
+
+To interpret these results, we can look at the score and the location for each detected object. The score is a number between 0 and 1 that indicates confidence that the object was genuinely detected. The closer the number is to 1, the more confident the model is.
+
+Depending on your application, you can decide a cut-off threshold below which you will discard detection results. For our example, we might decide a sensible cut-off is a score of 0.5 (meaning a 50% probability that the detection is valid). In that case, we would ignore the last two objects in the array, because those confidence scores are below 0.5:
+
+<table style="width: 60%;">
+  <thead>
+    <tr>
+      <th>Class</th>
+      <th>Score</th>
+      <th>Location</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>Laptop</td>
+      <td>0.92</td>
+      <td>[18, 21, 57, 63]</td>
+    </tr>
+    <tr>
+      <td>Keyboard</td>
+      <td>0.88</td>
+      <td>[100, 30, 180, 150]</td>
+    </tr>
+    <tr>
+      <td>Monitor</td>
+      <td>0.87</td>
+      <td>[7, 82, 89, 163] </td>
+    </tr>
+    <tr>
+      <td style="background-color: #e9cecc; text-decoration-line: line-through;">Keyboard</td>
+      <td style="background-color: #e9cecc; text-decoration-line: line-through;">0.23</td>
+      <td style="background-color: #e9cecc; text-decoration-line: line-through;">[42, 66, 57, 83]</td>
+    </tr>
+    <tr>
+      <td style="background-color: #e9cecc; text-decoration-line: line-through;">Monitor</td>
+      <td style="background-color: #e9cecc; text-decoration-line: line-through;">0.11</td>
+      <td style="background-color: #e9cecc; text-decoration-line: line-through;">[6, 42, 31, 58]</td>
+    </tr>
+  </tbody>
+</table>
+
+The cut-off you use should be based on whether you are more comfortable with false positives (objects that are wrongly identified, or areas of the image that are erroneously identified as objects when they are not), or false negatives (genuine objects that are missed because their confidence was low).
+
+<!-- TODO -->
+TODO: Insert screenshot showing both
+
+### Location
+
+For each detected object, the model will return an array of four numbers representing a bounding rectangle that surrounds its position. The numbers are ordered as follows:
+
+<table style="width: 50%; margin: 0 auto;">
+  <tbody>
+    <tr style="border-top: none;">
+      <td>[</td>
+      <td>top,</td>
+      <td>left,</td>
+      <td>bottom,</td>
+      <td>right</td>
+      <td>]</td>
+    </tr>
+  </tbody>
+</table>
+
+The top value represents the distance of the rectangle’s top edge from the top of the image, in pixels. The left value represents the left edge’s distance from the left of the input image. The other values represent the bottom and right edges in a similar manner.
+
+<!-- TODO -->
+Note: Object detection models accept input images of a specific size. This is likely to be different from the size of the raw image captured by your device’s camera, and you will have to write code to crop and scale your raw image to fit the model’s input size (there are examples of this in our <a href="">sample code</a>).<br /><br />The pixel values output by the model refer to the position in the cropped and scaled image, so you must scale them to fit the raw image in order to interpret them correctly.
+
+
+### Uses and limitations
+
+<!-- TODO -->
+The object detection model we provide can identify and locate up to 10 objects in an image. It is trained to recognize 80 classes of object. For a full list of classes, see the labels file in the <a href="">model zip</a>.
+
+If you want to train a model to recognize new classes, see <a href="#customize_model">Customize model</a>.
+
+For the following use cases, you should use a different type of model:
+
+<ul>
+  <li>Predicting which single label the image most likely represents (see <a href="image_classification">image classification</a>)</li>
+  <li>Predicting the composition of an image, for example subject versus background (see <a href="segmentation">segmentation</a>)</li>
+</ul>
+
+Get started
+If you are new to TensorFlow Lite and are working with Android or iOS, we recommend following the corresponding tutorial that will walk you through our sample code.
+
+<!-- TODO -->
+<a class="button button-primary" href="">iOS</a>
+<a class="button button-primary" href="">Android</a>
+
+If you are using a platform other than Android or iOS, or you are already familiar with the <a href="../apis">TensorFlow Lite APIs</a>, you can download our starter object detection model and the accompanying labels.
+
+<a href="http://storage.googleapis.com/download.tensorflow.org/models/tflite/coco_ssd_mobilenet_v1_1.0_quant_2018_06_29.zip">Download starter model and labels</a>
+
+The model will return 10 detection results...
+
+## Starter model
+We recommend starting to implement object detection using the quantized COCO SSD MobileNet v1 model, available with labels from this download link:
+
+<a href="http://storage.googleapis.com/download.tensorflow.org/models/tflite/coco_ssd_mobilenet_v1_1.0_quant_2018_06_29.zip">Download starter model and labels</a>
+
+### Input
+The model takes an image as input. The expected image is 300x300 pixels, with three channels (red, blue, and green) per pixel. This should be fed to the model as a flattened buffer of 270,000 byte values (300x300x3). Since the model is <a href="">quantized</a>, each value should be a single byte representing a value between 0 and 255.
+
+### Output
+The model outputs four arrays, mapped to the indices 0-4. Arrays 0, 1, and 2 describe 10 detected objects, with one element in each array corresponding to each object. There will always be 10 objects detected.
+
+<table>
+  <thead>
+    <tr>
+      <th>Index</th>
+      <th>Name</th>
+      <th>Description</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>0</td>
+      <td>Locations</td>
+      <td>Multidimensional array of [10][4] floating point values between 0 and 1, the inner arrays representing bounding boxes in the form [top, left, bottom, right]</td>
+    </tr>
+    <tr>
+      <td>1</td>
+      <td>Classes</td>
+      <td>Array of 10 integers (output as floating point values) each indicating the index of a class label from the labels file</td>
+    </tr>
+    <tr>
+      <td>2</td>
+      <td>Scores</td>
+      <td>Array of 10 floating point values between 0 and 1 representing probability that a class was detected</td>
+    </tr>
+    <tr>
+      <td>3</td>
+      <td>Number and detections</td>
+      <td>Array of length 1 containing a floating point value expressing the total number of detection results</td>
+    </tr>
+  </tbody>
+</table>
+
+## Customize model
+
+<!-- TODO -->
+The pre-trained models we provide are trained to detect 80 classes of object. For a full list of classes, see the labels file in the <a href="">model zip</a>.
+
+You can use a technique known as transfer learning to re-train a model to recognize classes not in the original set. For example, you could re-train the model to detect multiple types of vegetable, despite there only being one vegetable in the original training data. To do this, you will need a set of training images for each of the new labels you wish to train.
+
+Learn how to perform transfer learning in the <a href="https://medium.com/tensorflow/training-and-serving-a-realtime-mobile-object-detector-in-30-minutes-with-cloud-tpus-b78971cf1193">Training and serving a real-time mobile object detector in 30 minutes</a> blog post.
+
+<!-- TODO -->
+Read more about this
+<ul>
+  <li>Blog post:</li>
+  <li>Object detection GitHub:</li>
+</ul>
diff --git a/tensorflow/lite/g3doc/models/pose_estimation/overview.md b/tensorflow/lite/g3doc/models/pose_estimation/overview.md
new file mode 100644
index 0000000000000000000000000000000000000000..f19a5a10edb7d2ea89ebe5f1689cf95ba9a90bec
--- /dev/null
+++ b/tensorflow/lite/g3doc/models/pose_estimation/overview.md
@@ -0,0 +1,128 @@
+# Pose estimation
+<img src="../images/pose.png" class="attempt-right" />
+
+<i>PoseNet</i> is a vision model that can be used to estimate the pose of a person in an image/video by estimating where key body joints are.
+
+<a class="button button-primary" href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/gpu/multi_person_mobilenet_v1_075_float.tflite">Download starter model</a>
+
+## Tutorials (coming soon)
+<a class="button button-primary" href="">iOS</a>
+<a class="button button-primary" href="">Android</a>
+
+## How it works
+Pose estimation refers to computer vision techniques that detect human figures in images and videos, so that one could determine, for example, where someone’s elbow shows up in an image.
+
+To be clear, this technology is not recognizing who is in an image — there is no personal identifiable information associated to pose detection. The algorithm is simply estimating where key body joints are.
+
+The key points detected are indexed by part id with a confidence score between 0.0 and 1.0; 1.0 being the highest.
+
+<table style="width: 30%;">
+  <thead>
+    <tr>
+      <th>Id</th>
+      <th>Part</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>0</td>
+      <td>nose</td>
+    </tr>
+    <tr>
+      <td>1</td>
+      <td>leftEye</td>
+    </tr>
+    <tr>
+      <td>2</td>
+      <td>rightEye</td>
+    </tr>
+    <tr>
+      <td>3</td>
+      <td>leftEar</td>
+    </tr>
+    <tr>
+      <td>4</td>
+      <td>rightEar</td>
+    </tr>
+    <tr>
+      <td>5</td>
+      <td>leftShoulder</td>
+    </tr>
+    <tr>
+      <td>6</td>
+      <td>rightShoulder</td>
+    </tr>
+    <tr>
+      <td>7</td>
+      <td>leftElbow</td>
+    </tr>
+    <tr>
+      <td>8</td>
+      <td>rightElbow</td>
+    </tr>
+    <tr>
+      <td>9</td>
+      <td>leftWrist</td>
+    </tr>
+    <tr>
+      <td>10</td>
+      <td>rightWrist</td>
+    </tr>
+    <tr>
+      <td>11</td>
+      <td>leftHip</td>
+    </tr>
+    <tr>
+      <td>12</td>
+      <td>rightHip</td>
+    </tr>
+    <tr>
+      <td>13</td>
+      <td>leftKnee</td>
+    </tr>
+    <tr>
+      <td>14</td>
+      <td>rightKnee</td>
+    </tr>
+    <tr>
+      <td>15</td>
+      <td>leftAnkle</td>
+    </tr>
+    <tr>
+      <td>16</td>
+      <td>rightAnkle</td>
+    </tr>
+  </tbody>
+</table>
+
+## Example output
+<img src="https://www.tensorflow.org/images/models/pose_estimation.gif" />
+
+## Get started
+Android and iOS end-to-end tutorials are coming soon. In the meantime, if you want to experiment this on a web browser, check out the TensorFlow.js <a href="https://github.com/tensorflow/tfjs-models/tree/master/posenet">GitHub repository</a>.
+
+
+## How it performs
+Performance varies based on your device and output stride (heatmaps and offset vectors). The PoseNet model is image size invariant, which means it can predict pose positions in the same scale as the original image regardless of whether the image is downscaled. This means PoseNet can be configured to have a higher accuracy at the expense of performance.
+
+The output stride determines how much we’re scaling down the output relative to the input image size. It affects the size of the layers and the model outputs. The higher the output stride, the smaller the resolution of layers in the network and the outputs, and correspondingly their accuracy. In this implementation, the output stride can have values of 8, 16, or 32. In other words, an output stride of 32 will result in the fastest performance but lowest accuracy, while 8 will result in the highest accuracy but slowest performance. We recommend starting with 16.
+
+<img src="../images/models/output_stride.png" >
+<span style="font-size: 0.8em">The output stride determines how much we’re scaling down the output relative to the input image size. A higher output stride is faster but results in lower accuracy.</span>
+
+## Read more about this
+<ul>
+  <li><a href="">Blog post: Real-time Human Pose Estimation in the Browser with TensorFlow.js</a></li>
+  <li><a href="">TF.js GitHub: Pose Detection in the Browser: PoseNet Model</a></li>
+</ul>
+
+## Users
+<ul>
+  <li><a href="">‘PomPom Mirror’</a></li>
+  <li><a href="">Amazing Art Installation Turns You Into A Bird | Chris Milk "The Treachery of Sanctuary"</a></li>
+  <li><a href="">Puppet Parade - Interactive Kinect Puppets</a></li>
+  <li><a href="">Messa di Voce (Performance), Excerpts</a></li>
+  <li><a href="">Augmented reality</a></li>
+  <li><a href="">Interactive animation</a></li>
+  <li><a href="">Gait analysis</a></li>
+</ul>
diff --git a/tensorflow/lite/g3doc/models/segmentation/images/segmentation.gif b/tensorflow/lite/g3doc/models/segmentation/images/segmentation.gif
new file mode 100644
index 0000000000000000000000000000000000000000..e664adf700b396b9cd06d48378b782efbbca4282
Binary files /dev/null and b/tensorflow/lite/g3doc/models/segmentation/images/segmentation.gif differ
diff --git a/tensorflow/lite/g3doc/models/segmentation/overview.md b/tensorflow/lite/g3doc/models/segmentation/overview.md
new file mode 100644
index 0000000000000000000000000000000000000000..a1f1cf1aa69279dac9a17a810e931786c7d5273d
--- /dev/null
+++ b/tensorflow/lite/g3doc/models/segmentation/overview.md
@@ -0,0 +1,32 @@
+# Segmentation (GPU)
+<img src="../images/segmentation.png" class="attempt-right" />
+
+<i>DeepLab</i> is a state-of-art deep learning model for semantic image segmentation, where the goal is to assign semantic labels (e.g., person, dog, cat and so on) to every pixel in the input image.
+
+<a class="button button-primary" href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/gpu/deeplabv3_257_mv_gpu.tflite">Download starter model</a>
+
+## Tutorials (coming soon)
+<a class="button button-primary" href="">iOS</a>
+<a class="button button-primary" href="">Android</a>
+
+## How it works
+It all started with classification where the model predicts an entire input. With advances in data, hardware, and software, object detection can infer objects with spatial location. Semantic segmentation offers the highest level of granularity with labels at a pixel level.
+
+Current implementation includes the following features:
+<ol>
+  <li>DeepLabv1: We use atrous convolution to explicitly control the resolution at which feature responses are computed within Deep Convolutional Neural Networks.</li>
+  <li>DeepLabv2: We use atrous spatial pyramid pooling (ASPP) to robustly segment objects at multiple scales with filters at multiple sampling rates and effective fields-of-views.</li>
+  <li>DeepLabv3: We augment the ASPP module with image-level feature [5, 6] to capture longer range information. We also include batch normalization [7] parameters to facilitate the training. In particular, we applying atrous convolution to extract output features at different output strides during training and evaluation, which efficiently enables training BN at output stride = 16 and attains a high performance at output stride = 8 during evaluation.</li>
+  <li>DeepLabv3+: We extend DeepLabv3 to include a simple yet effective decoder module to refine the segmentation results especially along object boundaries. Furthermore, in this encoder-decoder structure one can arbitrarily control the resolution of extracted encoder features by atrous convolution to trade-off precision and runtime.</li>
+</ol>
+
+## Example output
+The model will create a mask over the target objects with high accuracy.
+<img src="images/segmentation.gif" />
+
+## Read more about this
+<ul>
+  <li>Blog post: <a href="https://ai.googleblog.com/2018/03/semantic-image-segmentation-with.html">Semantic Image Segmentation with DeepLab in TensorFlow</a></li>
+  <li><a href="https://medium.com/tensorflow/tensorflow-lite-now-faster-with-mobile-gpus-developer-preview-e15797e6dee7">Blog post: TensorFlow Lite Now Faster with Mobile GPUs (Developer Preview)</a></li>
+  <li><a href="https://github.com/tensorflow/models/tree/master/research/deeplab">DeepLab GitHub: DeepLab: Deep Labelling for Semantic Image Segmentation</a></li>
+</ul>
diff --git a/tensorflow/lite/g3doc/models/smart_reply/images/smart_reply.gif b/tensorflow/lite/g3doc/models/smart_reply/images/smart_reply.gif
new file mode 100644
index 0000000000000000000000000000000000000000..4a61691fd8714102409d290e7f6d6e361d9cbf13
Binary files /dev/null and b/tensorflow/lite/g3doc/models/smart_reply/images/smart_reply.gif differ
diff --git a/tensorflow/lite/g3doc/models/smart_reply/overview.md b/tensorflow/lite/g3doc/models/smart_reply/overview.md
new file mode 100644
index 0000000000000000000000000000000000000000..c35a5f2642519decc3a02e29c9da08241c6f915e
--- /dev/null
+++ b/tensorflow/lite/g3doc/models/smart_reply/overview.md
@@ -0,0 +1,40 @@
+# Smart reply
+<img src="../images/smart_reply.png" class="attempt-right" />
+
+Smart replies are contextually relevant, one-touch responses that help the user to reply to an incoming text message (or email) efficiently and effortlessly.
+
+<a class="button button-primary" href="http://download.tensorflow.org/models/tflite/smartreply_1.0_2017_11_01.zip">Download starter model and labels</a>
+
+## Tutorials (coming soon)
+<a class="button button-primary" href="">iOS</a>
+<a class="button button-primary" href="">Android</a>
+
+## How it works
+The model generates reply suggestions to input conversational chat messages with an efficient inference that can be easily be plugged in to your chat application to power on-device conversational intelligence.
+
+The on-device model comes with several benefits. It is:
+<ul>
+  <li>Faster: The model resides on the device and does not require internet connectivity. Thus, the inference is very fast and has an average latency of only a few milliseconds.</li>
+  <li>Resource efficient: The model has a small memory footprint on the device.</li>
+  <li>Privacy-friendly: The user data never leaves the device and this eliminates any privacy restrictions.</li>
+</ul>
+
+## Example output
+<img src="images/smart_reply.gif" />
+
+## How to use this model?
+We have provided a pre-built demo APK that you can download, install, and test on your phone. Go to the <a href="https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/models/smartreply/g3doc">GitHub page</a> for instructions and list of support ops and functionalities.
+
+## Read more about this
+<ul>
+  <li><a href="https://arxiv.org/pdf/1708.00630.pdf">Research paper</a></li>
+  <li><a href="https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/models/smartreply/">Source code</a></li>
+</ul>
+
+## Users
+<ul>
+  <li><a href="https://www.blog.google/products/gmail/save-time-with-smart-reply-in-gmail/">Gmail</a></li>
+  <li><a href="https://www.blog.google/products/gmail/computer-respond-to-this-email/">Inbox</a></li>
+  <li><a href="https://blog.google/products/allo/google-allo-smarter-messaging-app/">Allo</a></li>
+  <li><a href="https://research.googleblog.com/2017/02/on-device-machine-intelligence.html">Smart Replies on Android Wear</a></li>
+</ul>
diff --git a/tensorflow/lite/g3doc/models/speech_recognition/overview.md b/tensorflow/lite/g3doc/models/speech_recognition/overview.md
new file mode 100644
index 0000000000000000000000000000000000000000..5d6d855db36e8a5b433d038136399da0e0f64213
--- /dev/null
+++ b/tensorflow/lite/g3doc/models/speech_recognition/overview.md
@@ -0,0 +1,14 @@
+# Speech recognition
+
+<img src="../images/audio.png" class="attempt-right">
+
+Recognize audio keywords!
+
+<a class="button button-primary" href="">Download starter model</a>
+
+## Tutorials (coming soon)
+<a class="button button-primary" href="">iOS</a>
+<a class="button button-primary" href="">Android</a>
+
+## What is speech recognition?
+Coming soon.
diff --git a/tensorflow/lite/g3doc/performance/best_practices.md b/tensorflow/lite/g3doc/performance/best_practices.md
index b76414cebe0d7092086073a478eb6330cbea713e..5f41a7027538f571601c85a0a367208200155dd6 100644
--- a/tensorflow/lite/g3doc/performance/best_practices.md
+++ b/tensorflow/lite/g3doc/performance/best_practices.md
@@ -1,6 +1,9 @@
 # Performance best practices
 
-Mobile and embedded devices have limited computational resources and it is important to keep your application resource efficient. We have compiled a list of best practices and strategies you can use to optimize your model and application when using Tensorflow Lite.
+Mobile and embedded devices have limited computational resources and it is
+important to keep your application resource efficient. We have compiled a list
+of best practices and strategies that you can use to optimize your model and
+application when using TensorFlow Lite.
 
 ## Choose the best model for the task
 Depending on the task you will need to make a tradeoff between model complexity and size. If your task requires high accuracy then you may need a large and complex model. Some tasks may work with a less precise model, for these tasks it is better to use a smaller but less precise model. Smaller models not only use less disk space and memory but are generally faster and more energy efficient. For example, graphs below show accuracy and latency tradeoff for some common image classification models.
@@ -10,7 +13,7 @@ Depending on the task you will need to make a tradeoff between model complexity
 
 ![latency vs model size](../images/performance/model_size_vs_latency.png "Latency vs Model size")
 
-One example of models optimized for mobile devices are [MobileNets](https://arxiv.org/abs/1704.04861), which are optimized for mobile vision applications. Tensorflow Lite [models page](../models.md) lists several other models that have been optimized specifically for mobile and embedded devices.
+One example of models optimized for mobile devices are [MobileNets](https://arxiv.org/abs/1704.04861), which are optimized for mobile vision applications. TensorFlow Lite [models page](../models.md) lists several other models that have been optimized specifically for mobile and embedded devices.
 
 You can retrain the listed models on your own dataset by using transfer learning. Check out our transfer learning tutorial for
 [image classification](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/#0) and
@@ -18,27 +21,70 @@ You can retrain the listed models on your own dataset by using transfer learning
 
 
 ## Profile your model
-Once you have selected a candidate model that is right for your task, it is a good practice to profile and benchmark your model. Tensorflow Lite [benchmarking tool](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/tools/benchmark) has a built-in profiler that shows per operator profiling statistics. This can help in understanding performance bottlenecks and which operators dominate the computation time.
+Once you have selected a candidate model that is right for your task, it is a good practice to profile and benchmark your model. TensorFlow Lite [benchmarking tool](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/tools/benchmark) has a built-in profiler that shows per operator profiling statistics. This can help in understanding performance bottlenecks and which operators dominate the computation time.
 
 ## Profile and optimize operators in the graph
 If a particular operator appears frequently in the model and based on profiling you find the operator consuming the most amount of time, you can look into optimizing the operator.
- This scenario should be rare as Tensorflow Lite has optimized versions for most ops. However you may be able to write a faster version of a custom op, if you know the constraints in which the operator is executed. Check out our [custom operator documentation](../custom_operators.md).
+ This scenario should be rare as TensorFlow Lite has optimized versions for most ops. However you may be able to write a faster version of a custom op, if you know the constraints in which the operator is executed. Check out our [custom operator documentation](../custom_operators.md).
 
 ## Quantize your model
 If your model uses floating point weights or activations then it may be possible to reduce the size of model up to ~4x by using quantization and other model optimizations. Check out our [model optimization toolkit](model_optimization.md) for details about optimizing your model. 
 
 ## Tweak the number of threads
-Tensorflow Lite supports multi-threaded kernels for many operators. You can increase the number of threads and speed up execution of operators. Increasing the number of threads will however make your model use more resources and power. For some applications latency may be more important than energy efficiency. You can increase the number of threads by setting the number of [interpreter](https://github.com/tensorflow/tensorflow/blob/1084594657a5d139102ac794f84d1427a710e39a/tensorflow/lite/interpreter.h#L337) threads. Multi-threaded execution however comes at the cost of increased performance variability depending on what else is been executed concurrently. This is particularly the case for mobile apps. For example, isolated tests may show 2x speed up vs single-threaded but if another app is executing at the same time may result in worst performance than single-threaded.
+
+TensorFlow Lite supports multi-threaded kernels for many operators. You can
+increase the number of threads and speed up execution of operators. Increasing
+the number of threads will however make your model use more resources and power.
+For some applications latency may be more important than energy efficiency. You
+can increase the number of threads by setting the number of
+[interpreter](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/interpreter.h#L333)
+threads. Multi-threaded execution however comes at the cost of increased
+performance variability depending on what else is been executed concurrently.
+This is particularly the case for mobile apps. For example, isolated tests may
+show 2x speed up vs single-threaded but if another app is executing at the same
+time may result in worst performance than single-threaded.
 
 ## Eliminate redundant copies
-If your application is not careful, there can be redundant copies when feeding the input to the model and reading output from the model. Make sure to eliminate redundant copies. If you are using higher level APIs like Java API, make sure to carefully check the documentation for performance caveats. For example, the Java API is a lot faster if ByteBuffers are used as [inputs](https://github.com/tensorflow/tensorflow/blob/6305a6d83552ba6a472cd72398b60d9241467f1f/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java#L151).
+
+If your application is not careful, there can be redundant copies when feeding
+the input to the model and reading output from the model. Make sure to eliminate
+redundant copies. If you are using higher level APIs like Java API, make sure to
+carefully check the documentation for performance caveats. For example, the Java
+API is a lot faster if ByteBuffers are used as
+[inputs](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java#L175).
 
 ## Profile your application with platform specific tools
 Platform specific tools like [Android profiler](https://developer.android.com/studio/profile/android-profiler) and [Instruments](https://help.apple.com/instruments/mac/current/) provide a wealth of profiling information that can be used to debug your app. Sometimes the performance bug may be not in the model but in parts of application code that interact with the model. Make sure to familiarize yourself with platform specific profiling tools and best practices for your platform.
 
 ## Evaluate whether your model benefits from using hardware accelerators available on the device
-Tensorflow Lite is working on adding support for accelerators like GPU and provides acceleration through [Neural Networks API](https://developer.android.com/ndk/guides/neuralnetworks/) on Android.
-You can utilize these hardware accelerator backends to improve the speed and efficiency of your model. To enable Neural Networks API call [UseNNAPI](https://github.com/tensorflow/tensorflow/blob/6305a6d83552ba6a472cd72398b60d9241467f1f/tensorflow/lite/interpreter.h#L334) on the interpreter instance.
+
+TensorFlow Lite has added been new ways to accelerate models with faster
+hardware like GPUs, DSPs, and neural accelerators. Typically, these accelerators
+are exposed through *delegate* submodules that take over parts of the
+interpreter execution. TensorFlow Lite can use delegates by:
+
+*   Using Android's
+    [Neural Networks API](https://developer.android.com/ndk/guides/neuralnetworks/).
+    You can utilize these hardware accelerator backends to improve the speed and
+    efficiency of your model. To enable the Neural Networks API, call
+    [UseNNAPI](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/interpreter.h#L330)
+    on the interpreter instance.
+*   A binary-only GPU delegate has been released for Android and iOS—using
+    OpenGL and Metal, respectively. To try them out, see the
+    [GPU delegate tutorial](gpu.md) and [documentation](gpu_advanced.md).
+*   It is possible to create your own delegate if you have access to
+    non-standard hardware. View the NN API delegate in the source code as an
+    example.
+
+Be aware that some accelerators work better for different types of models. It is
+important to benchmark each delegate to see if it is a good choice for your
+application. For example, if you have a very small model, it may not be worth
+delegating the model to either the NN API or the GPU. Conversely, accelerators
+are a great choice for large models that have high arithmetic intensity.
 
 ## Need more help
-The Tensorflow team is happy to help diagnose and address specific performance issues you may be facing. Please file an issue on [GitHub](https://github.com/tensorflow/tensorflow/issues) with details of the issue.
+
+The TensorFlow team is happy to help diagnose and address specific performance
+issues you may be facing. Please file an issue on
+[GitHub](https://github.com/tensorflow/tensorflow/issues) with details of the
+issue.
diff --git a/tensorflow/lite/g3doc/performance/gpu.md b/tensorflow/lite/g3doc/performance/gpu.md
new file mode 100644
index 0000000000000000000000000000000000000000..c7389226123746180c8c5e6020431ffe579112a7
--- /dev/null
+++ b/tensorflow/lite/g3doc/performance/gpu.md
@@ -0,0 +1,236 @@
+# TensorFlow Lite GPU Delegate Tutorial
+
+[TensorFlow Lite](https://www.tensorflow.org/lite) supports several hardware
+accelerators. This document describes how to preview the experimental GPU backend using the
+TensorFlow Lite delegate APIs on Android and iOS.
+
+GPUs are designed to have high throughput for massively parallelizable
+workloads. Thus, they are well-suited for deep neural nets, which consist of a
+huge number of operators, each working on some input tensor(s) that can be
+easily divided into smaller workloads and carried out in parallel, typically
+resulting in lower latency. In the best scenario, inference on the GPU may now
+run fast enough for previously not available real-time applications.
+
+Unlike CPUs, GPUs compute with 16-bit or 32-bit floating point numbers and do
+not require quantization for optimal performance.
+
+Another benefit with GPU inference is its power efficiency. GPUs carry out the
+computations in a very efficient and optimized manner, so that they consume less
+power and generate less heat than when the same task is run on CPUs.
+
+## Demo App Tutorials
+
+The easiest way to try out the experimental GPU delegate is to follow the below tutorials, which go through building our classification demo applications with GPU support. The GPU code is only binary for now; it will be open-sourced soon. Once you understand how to get our demos working, you can try this out on your own custom models.
+
+### Android (with Android Studio)
+
+For a step-by-step tutorial, watch the
+[Experimental GPU Delegate for Android](https://youtu.be/Xkhgre8r5G0) video.
+
+Note: This requires OpenGL ES 3.1 or higher.
+
+#### Step 1. Clone the TensorFlow source code and open it in Android Studio
+
+```
+git clone https://github.com/tensorflow/tensorflow
+```
+
+#### Step 2. Edit `app/build.gradle` to use the experimental GPU AAR
+
+Replace the existing `tensorflow-lite` package in the existing `dependencies`
+block.
+
+```
+dependencies {
+    ...
+    // implementation 'org.tensorflow:tensorflow-lite:0.0.0-nightly'
+    implementation 'org.tensorflow:tensorflow-lite:0.0.0-gpu-experimental'
+}
+```
+
+#### Step 3. Build and run
+
+Run → Run ‘app’.  When you run the application you will see a button for
+enabling the GPU. Change from quantized to a float model and then click GPU to
+run on the GPU.
+
+![running android gpu demo and switch to gpu](images/android_gpu_demo.gif)
+
+### iOS (with XCode)
+
+For a step-by-step tutorial, watch the
+[Experimental GPU Delegate for iOS](https://youtu.be/a5H4Zwjp49c) video.
+
+Note: This requires XCode v10.1 or later.
+
+#### Step 1. Get the demo source code and make sure it compiles.
+
+Follow our iOS Demo App [tutorial](https://www.tensorflow.org/lite/demo_ios).
+This will get you to a point where the unmodified iOS camera demo is working
+on your phone.
+
+
+#### Step 2. Modify the Podfile to use the TensorFlow Lite GPU CocoaPod
+
+We have built a binary CocoaPod that includes the GPU delegate. To switch the
+project to use it, modify the
+`tensorflow/tensorflow/lite/examples/ios/camera/Podfile` file to use
+the `TensorFlowLiteGpuExperimental` pod instead of `TensorFlowLite`.
+
+```
+target 'YourProjectName'
+  # pod 'TensorFlowLite', '1.12.0'
+  pod 'TensorFlowLiteGpuExperimental'
+```
+
+#### Step 3. Enable the GPU Delegate
+
+You will need to change two `#define` flags in `CameraExampleViewController.h`
+to enable the GPU delegate. First, change `TFLITE_USE_CONTRIB_LITE` from 1 to 0
+since TensorFlow Lite has moved from TensorFlow contrib into core.
+
+```c
+#define TFLITE_USE_CONTRIB_LITE 0
+```
+
+Next, change `TFLITE_USE_GPU_DELEGATE` from 0 to 1, to enable the code that will
+use the GPU delegate.
+
+```c
+#define TFLITE_USE_GPU_DELEGATE 1
+```
+
+#### Step 4. Build and run the demo app
+
+After following the previous step, you should be able to run the app.
+
+
+#### Step 5. Release mode.
+
+While in Step 4 you ran in debug mode, to get better performance, you should
+change to a release build with the appropriate optimal Metal settings. In
+particular, To edit these settings go to the `Product > Scheme > Edit
+Scheme...`. Select `Run`. On the `Info` tab, change `Build Configuration`, from
+`Debug` to `Release`, uncheck `Debug executable`.
+
+![setting up release](images/iosdebug.png)
+
+Then
+click the `Options` tab and change `GPU Frame Capture` to `Disabled` and
+`Metal API Validation` to `Disabled`.
+
+![setting up metal options](images/iosmetal.png)
+
+Lastly make sure Release only builds on 64-bit architecture. Under `Project
+navigator -> tflite_camera_example -> PROJECT -> tflite_camera_example -> Build
+Settings` set `Build Active Architecture Only > Release` to Yes.
+
+![setting up release options](images/iosrelease.png)
+
+## Trying the GPU Delegate on your own model
+
+### Android
+
+Look at the demo to see how to add the
+delegate. In your application, add the AAR as above, import
+`org.tensorflow.lite.experimental.GpuDelegate` module, and use the`addDelegate`
+function to register the GPU delegate to the interpreter:
+
+```java
+import org.tensorflow.lite.Interpreter;
+import org.tensorflow.lite.experimental.GpuDelegate;
+
+// Initialize interpreter with GPU delegate
+GpuDelegate delegate = new GpuDelegate();
+Interpreter.Options options = (new Interpreter.Options()).addDelegate(delegate);
+Interpreter interpreter = new Interpreter(model, options);
+
+// Run inference
+while (true) {
+  writeToInput(input);
+  interpreter.run(input, output);
+  readFromOutput(output);
+}
+
+// Clean up
+delegate.close();
+```
+
+### iOS
+
+In your application code, include the GPU delegate header and call the
+`Interpreter::ModifyGraphWithDelegate` function to register the GPU delegate to
+the interpreter:
+
+```cpp
+#import "tensorflow/lite/delegates/gpu/metal_delegate.h"
+
+// Initialize interpreter with GPU delegate
+std::unique_ptr<Interpreter> interpreter;
+InterpreterBuilder(*model, resolver)(&interpreter);
+auto* delegate = NewGpuDelegate(nullptr);  // default config
+if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) return false;
+
+// Run inference
+while (true) {
+  WriteToInputTensor(interpreter->typed_input_tensor<float>(0));
+  if (interpreter->Invoke() != kTfLiteOk) return false;
+  ReadFromOutputTensor(interpreter->typed_output_tensor<float>(0));
+}
+
+// Clean up
+interpreter = nullptr;
+DeleteGpuDelegate(delegate);
+```
+
+## Supported Models and Ops
+
+With the release of the GPU delegate, we included a handful of models that can
+be run on the backend:
+
+* [MobileNet v1 (224x224) image classification](https://ai.googleblog.com/2017/06/mobilenets-open-source-models-for.html) [[download]](https://storage.googleapis.com/download.tensorflow.org/models/tflite/gpu/mobilenet_v1_1.0_224.tflite)
+<br /><i>(image classification model designed for mobile and embedded based vision applications)</i>
+* [DeepLab segmentation (257x257)](https://ai.googleblog.com/2018/03/semantic-image-segmentation-with.html) [[download]](https://storage.googleapis.com/download.tensorflow.org/models/tflite/gpu/deeplabv3_257_mv_gpu.tflite)
+<br /><i>(image segmentation model that assigns semantic labels (e.g., dog, cat, car) to every pixel in the input image)</i>
+* [MobileNet SSD object detection](https://ai.googleblog.com/2018/07/accelerated-training-and-inference-with.html) [[download]](https://storage.googleapis.com/download.tensorflow.org/models/tflite/gpu/mobile_ssd_v2_float_coco.tflite)
+<br /><i>(image classification model that detects multiple objects with bounding boxes)</i>
+* [PoseNet for pose estimation](https://github.com/tensorflow/tfjs-models/tree/master/posenet) [[download]](https://storage.googleapis.com/download.tensorflow.org/models/tflite/gpu/multi_person_mobilenet_v1_075_float.tflite)
+<br /><i>(vision model that estimates the poses of a person(s) in image or video)</i>
+
+To see a full list of supported ops, please see the [advanced documentation](gpu_advanced.md).
+
+## Non-supported models and ops
+
+If some of the ops are not supported by the GPU delegate, the framework will
+only run a part of the graph on the GPU and the remaining part on the CPU.  Due
+to the high cost of CPU/GPU synchronization, a split execution mode like this
+will often result in a performance slower than when the whole network is run on
+the CPU alone.  In this case, the user will get a warning like:
+
+```
+WARNING: op code #42 cannot be handled by this delegate.
+```
+
+We did not provide a callback for this failure, as this is not a true run-time
+failure, but something that the developer can observe while trying to get the
+network to run on the delegate.
+
+## Tips for optimization
+
+Some operations that are trivial on the CPU may have a high cost for the GPU.
+One class of such operation is various forms of reshape operations, including
+`BATCH_TO_SPACE`, `SPACE_TO_BATCH`, `SPACE_TO_DEPTH`, and so forth. If those ops
+are inserted into the network just for the network architect's logical thinking,
+it is worth removing them for performance.
+
+On GPU, tensor data is sliced into 4-channels. Thus, a computation on a tensor
+of shape `[B,H,W,5]` will perform about the same on a tensor of shape
+`[B,H,W,8]` but significantly worse than `[B,H,W,4]`.
+
+In that sense, if the camera hardware supports image frames in RGBA, feeding
+that 4-channel input is significantly faster as a memory copy (from 3-channel
+RGB to 4-channel RGBX) can be avoided.
+
+For best performance, do not hesitate to retrain your classifier with a mobile-
+optimized network architecture. That is a significant part of optimization for
+on-device inference.
diff --git a/tensorflow/lite/g3doc/performance/gpu_advanced.md b/tensorflow/lite/g3doc/performance/gpu_advanced.md
new file mode 100644
index 0000000000000000000000000000000000000000..627494804029a42d1fc0d89c6a7d5af888051d83
--- /dev/null
+++ b/tensorflow/lite/g3doc/performance/gpu_advanced.md
@@ -0,0 +1,303 @@
+# TensorFlow Lite on GPU
+
+[TensorFlow Lite](https://www.tensorflow.org/mobile/tflite/) supports several
+hardware accelerators.  This document describes how to use the GPU backend using
+the TensorFlow Lite delegate APIs on Android (requires OpenGL ES 3.1 or higher)
+and iOS (requires iOS 8 or later).
+
+## Benefits of GPU Acceleration
+
+### Speed
+
+GPUs are designed to have high throughput for massively parallelizable
+workloads. Thus, they are well-suited for deep neural nets, which consist of a
+huge number of operators, each working on some input tensor(s) that can be
+easily divided into smaller workloads and carried out in parallel. This
+parallelism typically results in lower latency. In the best scenario, inference
+on the GPU may run fast enough to become suitable for real-time applications
+that were not previously possible.
+
+### Accuracy
+
+GPUs do their computation with 16-bit or 32-bit floating point numbers and
+(unlike the CPUs) do not require quantization for optimal performance. If
+decreased accuracy made quantization untenable for your models, running your
+neural network on a GPU may eliminate this concern.
+
+### Energy Efficiency
+
+Another benefit that comes with GPU inference is its power efficiency. A GPU
+carries out computations in a very efficient and optimized way, consuming less
+power and generating less heat than the same task run on a CPU.
+
+## Supported Ops
+
+TensorFlow Lite on GPU supports the following ops in 16-bit and 32-bit float
+precision:
+
+* `ADD v1`
+* `AVERAGE_POOL_2D v1`
+* `CONCATENATION v1`
+* `CONV_2D v1`
+* `DEPTHWISE_CONV_2D v1-2`
+* `FULLY_CONNECTED v1`
+* `LOGISTIC v1`
+* `MAX_POOL_2D v1`
+* `MUL v1`
+* `PAD v1`
+* `PRELU v1`
+* `RELU v1`
+* `RELU6 v1`
+* `RESHAPE v1`
+* `RESIZE_BILINEAR v1`
+* `SOFTMAX v1`
+* `STRIDED_SLICE v1`
+* `SUB v1`
+* `TRANSPOSE_CONV v1`
+
+## Basic Usage
+
+### Android
+
+Run TensorFlow Lite on GPU with `TfLiteDelegate`. In Java, you can specify the
+GpuDelegate through `Interpreter.Options`.
+
+```java
+// NEW: Prepare GPU delegate.
+GpuDelegate delegate = new GpuDelegate();
+Interpreter.Options options = (new Interpreter.Options()).addDelegate(delegate);
+
+// Set up interpreter.
+Interpreter interpreter = new Interpreter(model, options);
+
+// Run inference.
+writeToInputTensor(inputTensor);
+interpreter.run(inputTensor, outputTensor);
+readFromOutputTensor(outputTensor);
+
+// Clean up.
+delegate.close();
+```
+
+### iOS
+
+To use TensorFlow Lite on GPU, get the GPU delegate via `NewGpuDelegate()` and
+then pass it to `Interpreter::ModifyGraphWithDelegate()` (instead of calling
+`Interpreter::AllocateTensors()`).
+
+```c++
+// Set up interpreter.
+auto model = FlatBufferModel::BuildFromFile(model_path);
+if (!model) return false;
+tflite::ops::builtin::BuiltinOpResolver op_resolver;
+std::unique_ptr<Interpreter> interpreter;
+InterpreterBuilder(*model, op_resolver)(&interpreter);
+
+// NEW: Prepare GPU delegate.
+
+const GpuDelegateOptions options = {
+  .allow_precision_loss = false,
+  .wait_type = kGpuDelegateOptions::WaitType::Passive,
+};
+
+auto* delegate = NewGpuDelegate(options);
+if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) return false;
+
+// Run inference.
+WriteToInputTensor(interpreter->typed_input_tensor<float>(0));
+if (interpreter->Invoke() != kTfLiteOk) return false;
+ReadFromOutputTensor(interpreter->typed_output_tensor<float>(0));
+
+// Clean up.
+DeleteGpuDelegate(delegate);
+```
+
+Note: When calling `Interpreter::ModifyGraphWithDelegate()` or
+`Interpreter::Invoke()`, the caller must have an `EGLContext` in the current
+thread and `Interpreter::Invoke()` must be called from the same `EGLContext`. If
+an `EGLContext` does not exist, the delegate will internally create one, but
+then the developer must ensure that `Interpreter::Invoke()` is always called
+from the same thread in which `Interpreter::ModifyGraphWithDelegate()` was
+called.
+
+## Advanced Usage
+
+### Delegate Options for iOS
+
+`NewGpuDelegate()` accepts a `struct` of options.
+
+```c++
+struct GpuDelegateOptions {
+  // Allows to quantify tensors, downcast values, process in float16 etc.
+  bool allow_precision_loss;
+
+  enum class WaitType {
+    // waitUntilCompleted
+    kPassive,
+    // Minimize latency. It uses active spinning instead of mutex and consumes
+    // additional CPU resources.
+    kActive,
+    // Useful when the output is used with GPU pipeline then or if external
+    // command encoder is set
+    kDoNotWait,
+  };
+  WaitType wait_type;
+};
+```
+
+Passing `nullptr` into `NewGpuDelegate()` sets the default options (which are
+explicated in the Basic Usage example above).
+
+```c++
+
+// THIS:
+const GpuDelegateOptions options = {
+  .allow_precision_loss = false,
+  .wait_type = kGpuDelegateOptions::WaitType::Passive,
+};
+
+auto* delegate = NewGpuDelegate(options);
+
+// IS THE SAME AS THIS:
+auto* delegate = NewGpuDelegate(nullptr);
+
+```
+
+While it is convenient to use `nullptr`, we recommend that you explicitly set
+the options, to avoid any unexpected behavior if default values are changed in
+the future.
+
+### Input/Output Buffers
+
+To do computation on the GPU, data must be made available to the GPU. This often
+requires performing a memory copy. It is desirable not to cross the CPU/GPU
+memory boundary if possible, as this can take up a significant amount of time.
+Usually, such crossing is inevitable, but in some special cases, one or the
+other can be omitted.
+
+If the network's input is an image already loaded in the GPU memory (for
+example, a GPU texture containing the camera feed) it can stay in the GPU memory
+without ever entering the CPU memory. Similarly, if the network's output is in
+the form of a renderable image (for example,
+[image style transfer](https://www.cv-foundation.org/openaccess/content_cvpr_2016/papers/Gatys_Image_Style_Transfer_CVPR_2016_paper.pdf)_)
+it can be directly displayed on the screen.
+
+To achieve best performance, TensorFlow Lite makes it possible for users to
+directly read from and write to the TensorFlow hardware buffer and bypass
+avoidable memory copies.
+
+#### Android
+
+Assuming the image input is in the GPU memory, it must first be converted to an
+OpenGL Shader Storage Buffer Object (SSBO). You can associate a TfLiteTensor to
+a user-prepared SSBO with `Interpreter.bindGlBufferToTensor()`. Note that
+`Interpreter.bindGlBufferToTensor()` must be called before
+`Interpreter.modifyGraphWithDelegate()`.
+
+```java
+// Ensure a valid EGL rendering context.
+EGLContext eglContext = eglGetCurrentContext();
+if (eglContext.equals(EGL_NO_CONTEXT)) return false;
+
+// Create an SSBO.
+int[] id = new int[1];
+glGenBuffers(id.length, id, 0);
+glBindBuffer(GL_SHADER_STORAGE_BUFFER, id[0]);
+glBufferData(GL_SHADER_STORAGE_BUFFER, inputSize, null, GL_STREAM_COPY);
+glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);  // unbind
+int inputSsboId = id[0];
+
+// Create interpreter.
+Interpreter interpreter = new Interpreter(tfliteModel);
+Tensor inputTensor = interpreter.getInputTensor(0);
+GpuDelegate gpuDelegate = new GpuDelegate();
+// The buffer must be bound before the delegate is installed.
+gpuDelegate.bindGlBufferToTensor(inputTensor, inputSsboId);
+interpreter.modifyGraphWithDelegate(gpuDelegate);
+
+// Run inference; the null input argument indicates use of the bound buffer for input.
+fillSsboWithCameraImageTexture(inputSsboId);
+float[] outputArray = new float[outputSize];
+interpreter.runInference(null, outputArray);
+```
+
+A similar approach can be applied to the output tensor. In that case,
+`Interpreter.Options.setAllowBufferHandleOutput(true)` should be passed on, to
+disable the default copying of the network's output from GPU memory to CPU
+memory.
+
+```java
+// Ensure a valid EGL rendering context.
+EGLContext eglContext = eglGetCurrentContext();
+if (eglContext.equals(EGL_NO_CONTEXT)) return false;
+
+// Create a SSBO.
+int[] id = new int[1];
+glGenBuffers(id.length, id, 0);
+glBindBuffer(GL_SHADER_STORAGE_BUFFER, id[0]);
+glBufferData(GL_SHADER_STORAGE_BUFFER, outputSize, null, GL_STREAM_COPY);
+glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);  // unbind
+int outputSsboId = id[0];
+
+// Create interpreter.
+Interpreter.Options options = (new Interpreter.Options()).setAllowBufferHandleOutput(true);
+Interpreter interpreter = new Interpreter(tfliteModel, options);
+Tensor outputTensor = interpreter.getOutputTensor(0);
+GpuDelegate gpuDelegate = new GpuDelegate();
+// The buffer must be bound before the delegate is installed.
+gpuDelegate.bindGlBufferToTensor(outputTensor, outputSsboId);
+interpreter.modifyGraphWithDelegate(gpuDelegate);
+
+// Run inference; the null output argument indicates use of the bound buffer for output.
+ByteBuffer input = getCameraImageByteBuffer();
+interpreter.runInference(input, null);
+renderOutputSsbo(outputSsboId);
+```
+
+#### iOS
+
+Assuming the image input is in GPU memory, it must first be converted to a
+`MTLBuffer` object for Metal. You can associate a TfLiteTensor to a
+user-prepared `MTLBuffer` with `BindMetalBufferToTensor()`. Note that
+`BindMetalBufferToTensor()` must be called before
+`Interpreter::ModifyGraphWithDelegate()`. Additionally, the inference output is,
+by default, copied from GPU memory to CPU memory. This behavior can be turned
+off by calling `Interpreter::SetAllowBufferHandleOutput(true)` during
+initialization.
+
+```c++
+// Prepare GPU delegate.
+auto* delegate = NewGpuDelegate(nullptr);
+interpreter->SetAllowBufferHandleOutput(true);  // disable default gpu->cpu copy
+if (!BindMetalBufferToTensor(delegate, interpreter->inputs()[0], user_provided_input_buffer)) return false;
+if (!BindMetalBufferToTensor(delegate, interpreter->outputs()[0], user_provided_output_buffer)) return false;
+if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) return false;
+
+// Run inference.
+if (interpreter->Invoke() != kTfLiteOk) return false;
+```
+
+Note: Once the default behavior is turned off, copying the inference output from
+GPU memory to CPU memory requires an explicit call to
+`Interpreter::EnsureTensorDataIsReadable()` for each output tensor.
+
+## Tips and Tricks
+
+*   Some operations that are trivial on the CPU may be high cost on a GPU. One
+    class of such operation includes various forms of reshape operations
+    (including `BATCH_TO_SPACE`, `SPACE_TO_BATCH`, `SPACE_TO_DEPTH`, and similar
+    operation). If these operations are not required (for example, they were
+    inserted to help the network architect reason about the system but do not
+    otherwise affect output), it is worth removing them for performance.
+
+*   On a GPU, tensor data is sliced into 4-channels. Thus, a computation on a
+    tensor of shape `[B, H, W, 5]` will perform about the same on a tensor of
+    shape `[B, H, W, 8]`, but significantly worse than `[B, H, W, 4]`.
+
+    *   For example, if the camera hardware supports image frames in RGBA,
+        feeding that 4-channel input is significantly faster, because a memory
+        copy (from 3-channel RGB to 4-channel RGBX) can be avoided.
+
+*   For best performance, do not hesitate to re-train your classifier with
+    mobile-optimized network architecture. That is a significant part of
+    optimization for on-device inference.
diff --git a/tensorflow/lite/g3doc/performance/images/android_gpu_demo.gif b/tensorflow/lite/g3doc/performance/images/android_gpu_demo.gif
new file mode 100644
index 0000000000000000000000000000000000000000..31aee24c34b6451727abffd95bb7f2f0d40f55af
Binary files /dev/null and b/tensorflow/lite/g3doc/performance/images/android_gpu_demo.gif differ
diff --git a/tensorflow/lite/g3doc/performance/images/iosdebug.png b/tensorflow/lite/g3doc/performance/images/iosdebug.png
new file mode 100644
index 0000000000000000000000000000000000000000..8cebbb84688b8129c149108ee4a47736a23dddff
Binary files /dev/null and b/tensorflow/lite/g3doc/performance/images/iosdebug.png differ
diff --git a/tensorflow/lite/g3doc/performance/images/iosmetal.png b/tensorflow/lite/g3doc/performance/images/iosmetal.png
new file mode 100644
index 0000000000000000000000000000000000000000..5e2b8bde8c1dac18ff66920f4f2a3f369f81bb3a
Binary files /dev/null and b/tensorflow/lite/g3doc/performance/images/iosmetal.png differ
diff --git a/tensorflow/lite/g3doc/performance/images/iosrelease.png b/tensorflow/lite/g3doc/performance/images/iosrelease.png
new file mode 100644
index 0000000000000000000000000000000000000000..a160c6700e60726d8d9775c4a1c28b3e34b1e930
Binary files /dev/null and b/tensorflow/lite/g3doc/performance/images/iosrelease.png differ
diff --git a/tensorflow/lite/g3doc/tf_ops_compatibility.md b/tensorflow/lite/g3doc/tf_ops_compatibility.md
index dcfda72137cafbc676dec2fb5dbf5da8ab8cb45a..ff8ddabdf43f8d6f01da9a3e4db1d083278687fe 100644
--- a/tensorflow/lite/g3doc/tf_ops_compatibility.md
+++ b/tensorflow/lite/g3doc/tf_ops_compatibility.md
@@ -165,6 +165,17 @@ Options {
 }
 ```
 
+**ADD_N**
+
+```
+Inputs {
+  0-N: any number of tensors (must have same size and shape)
+}
+Outputs {
+  0: elementwise sum of the input tensors
+}
+```
+
 **ARG_MAX**
 
 ```
@@ -185,7 +196,7 @@ Inputs {
   1: a tensor
 }
 Outputs {
-  0: A tensor of indices of minium values.
+  0: A tensor of indices of minimum values.
 }
 ```
 
@@ -291,6 +302,17 @@ Options {
 }
 ```
 
+**ELU**
+
+```
+Inputs {
+  0: a tensor
+}
+Outputs {
+  0: a tensor equivalent to exp(features) - 1 if < 0, features otherwise.
+}
+```
+
 **EQUAL**
 
 ```
@@ -362,6 +384,17 @@ Outputs {
 }
 ```
 
+**CEIL**
+
+```
+inputs {
+  0: tensor
+}
+outputs: {
+  0: result of computing element-wise ceil of the input tensor
+}
+```
+
 **FULLY_CONNECTED**
 
 ```
@@ -392,6 +425,18 @@ Outputs {
 }
 ```
 
+**GATHER_ND**
+
+```
+Inputs {
+  0: params tensor
+  1: indices tensor
+}
+Outputs {
+  0: a tensor with same type as the params tensor.
+}
+```
+
 **GREATER**
 
 ```
@@ -691,6 +736,17 @@ Options {
 }
 ```
 
+**RANK**
+
+```
+Inputs {
+  0: a tensor
+}
+Outputs {
+  0: a 0-D int32 Tensor representing the rank of input
+}
+```
+
 **RELU**
 
 ```
@@ -978,6 +1034,22 @@ Outputs {
 }
 ```
 
+**WHERE**
+
+```
+Inputs {
+  0: A tensor of type bool.
+  1: A tensor which may have the same shape as condition. If condition is rank
+     1, x may have higher rank, but its first dimension must match the size of
+     condition.
+  2: A tensor with the same shape and type as x.
+}
+Outputs {
+  0: A tensor with the same type and shape as x, y if they are non-None, or
+     a tensor with shape (num_true, dim_size(condition)).
+}
+```
+
 **ZEROS_LIKE**
 
 ```
diff --git a/tensorflow/lite/g3doc/tfmobile/android_build.md b/tensorflow/lite/g3doc/tfmobile/android_build.md
index 2eb776d10cf8ec68987d13b580eddf2f1bda8e78..f8c0243298e435382a7514e72ada89880fb00c1c 100644
--- a/tensorflow/lite/g3doc/tfmobile/android_build.md
+++ b/tensorflow/lite/g3doc/tfmobile/android_build.md
@@ -91,10 +91,10 @@ following lines to your Gradle build file:
         repositories {
             jcenter()
         }
-	}
+    }
 
     dependencies {
-        compile 'org.tensorflow:tensorflow-android:+'
+        implementation 'org.tensorflow:tensorflow-android:+'
     }
 
 This automatically downloads the latest stable version of TensorFlow as an AAR
diff --git a/tensorflow/lite/g3doc/using_select_tf_ops.md b/tensorflow/lite/g3doc/using_select_tf_ops.md
index aa51f58baa4ecf01fbe75d2ce9095bb1a5286ae8..7c1ad20e1b978f5a05a7e366ce836d0ac60b860c 100644
--- a/tensorflow/lite/g3doc/using_select_tf_ops.md
+++ b/tensorflow/lite/g3doc/using_select_tf_ops.md
@@ -49,8 +49,7 @@ partially supported by TensorFlow Lite, and one would like to avoid those
 limitations.
 
 The following example shows how to use `target_ops` in the
-[`TFLiteConverter`](https://www.tensorflow.org/lite/convert/python_api) Python
-API.
+[`TFLiteConverter`](./convert/python_api.md) Python API.
 
 ```
 import tensorflow as tf
@@ -130,7 +129,7 @@ allprojects {
 }
 
 dependencies {
-    compile 'org.tensorflow:tensorflow-lite-with-select-tf-ops:0.1.100'
+    implementation 'org.tensorflow:tensorflow-lite-with-select-tf-ops:0.1.100'
 }
 ```
 
@@ -151,8 +150,8 @@ TensorFlow Lite XCode project with support for select TensorFlow ops has been
 added to
 `tensorflow/lite/examples/ios/camera/tflite_camera_example_with_select_tf_ops.xcodeproj`.
 
-To use this feature in a your own project, either clone the example project or
-set the project settings for a new or existing project to the following:
+To use this feature in your own project, either clone the example project or set
+the project settings for a new or existing project to the following:
 
 *   In Build Phases -> Link Binary With Libraries, add the static libraries
     under `tensorflow/contrib/makefile/gen/lib/` directory:
diff --git a/tensorflow/lite/graph_info.cc b/tensorflow/lite/graph_info.cc
index 1cec0d0c290679c7755cbf84858317489c0ba159..a9091924c064341316d788704daa643bc8e247b5 100644
--- a/tensorflow/lite/graph_info.cc
+++ b/tensorflow/lite/graph_info.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/lite/graph_info.h"
 #include <algorithm>
+#include "tensorflow/lite/c/c_api_internal.h"
 
 namespace tflite {
 
@@ -94,6 +95,10 @@ class PartitionGraphIntoIndependentNodeSubsetsImpl {
     // been identified.
     for (int output_index : info_->outputs()) {
       int output_epoch = tensor_epochs_[output_index];
+      if (output_epoch == kEpochAlwaysReady) {
+        // This happens when an input of subgraph is also an output of subgraph.
+        continue;
+      }
       NodeSubset& output_subset = (*node_subsets_)[output_epoch];
       output_subset.output_tensors.push_back(output_index);
     }
@@ -138,7 +143,8 @@ class PartitionGraphIntoIndependentNodeSubsetsImpl {
     // See if all dependencies of this node are already assigned to a
     // node sub set.
     for (int input_tensor_index : TfLiteIntArrayView(node.inputs)) {
-      if (tensor_epochs_[input_tensor_index] == kEpochNotReady) {
+      if (input_tensor_index != kOptionalTensor &&
+          tensor_epochs_[input_tensor_index] == kEpochNotReady) {
         return false;
       }
     }
@@ -162,6 +168,9 @@ class PartitionGraphIntoIndependentNodeSubsetsImpl {
       // Look at our inputs one more time to update that tensor's
       // epochs' outputs
       for (int input_tensor_index : TfLiteIntArrayView(node.inputs)) {
+        if (input_tensor_index == kOptionalTensor) {
+          continue;
+        }
         int input_epoch = tensor_epochs_[input_tensor_index];
         int node_epoch = current_epoch;
         if (input_epoch != node_epoch) {
diff --git a/tensorflow/lite/graph_info_test.cc b/tensorflow/lite/graph_info_test.cc
index 4d8bbdc0eef49b3f79b3c74c1d07fd86467e1d65..b72728a9a9c94c4ee7312e5ff6f17e7b4d3b8a95 100644
--- a/tensorflow/lite/graph_info_test.cc
+++ b/tensorflow/lite/graph_info_test.cc
@@ -101,7 +101,7 @@ void CheckPartitionSubgraphs(
 }
 
 // Test an empty trivial graph with no partitions.
-TEST(PartitionTest, Nodes0_PartitionNodes0) {
+TEST(PartitionTest, Nodes0PartitionNodes0) {
   SimpleTestGraph graph;
   std::vector<int> nodes_to_partition = {};
   std::vector<NodeSubset> generated_subgraphs;
@@ -109,6 +109,20 @@ TEST(PartitionTest, Nodes0_PartitionNodes0) {
   CheckPartitionSubgraphs(generated_subgraphs, {});
 }
 
+// Test a trivial graph with no node and only 1 tensor.
+// The tensor is input & output of the graph at the same time.
+// Note: This is a regression test to ensure the partitioning logic
+// handles this case without crashing.
+TEST(PartitionTest, Nodes0PartitionNodes0Tensors1) {
+  SimpleTestGraph graph;
+  graph.AddTensors(1);
+  graph.SetInputsAndOutputs({0}, {0});
+  std::vector<int> nodes_to_partition = {};
+  std::vector<NodeSubset> generated_subgraphs;
+  PartitionGraph(graph, nodes_to_partition, &generated_subgraphs);
+  CheckPartitionSubgraphs(generated_subgraphs, {});
+}
+
 // Test a 1 node graph with no partitions.
 // Input: tensor(0) -> node(0) -> tensor(1), nodes_to_partition=[]
 // Output: [kTfNoPartition, tensor(0) -> node(0) -> tensor(1)]
diff --git a/tensorflow/lite/interpreter.cc b/tensorflow/lite/interpreter.cc
index e2129ed46d94061211e02445a437f7adca51363e..75a23deb1441e47ed5780209e994a921aefc4585 100644
--- a/tensorflow/lite/interpreter.cc
+++ b/tensorflow/lite/interpreter.cc
@@ -32,6 +32,26 @@ limitations under the License.
 
 namespace tflite {
 
+namespace {
+
+// Gets the current TfLiteQuantization from the legacy fLiteQuantizationParams.
+TfLiteQuantization GetQuantizationFromLegacy(
+    const TfLiteQuantizationParams& legacy_quantization) {
+  TfLiteQuantization quantization;
+  quantization.type = kTfLiteAffineQuantization;
+  auto* affine_quantization = reinterpret_cast<TfLiteAffineQuantization*>(
+      malloc(sizeof(TfLiteAffineQuantization)));
+  affine_quantization->scale = TfLiteFloatArrayCreate(1);
+  affine_quantization->zero_point = TfLiteIntArrayCreate(1);
+  affine_quantization->scale->data[0] = legacy_quantization.scale;
+  affine_quantization->zero_point->data[0] = legacy_quantization.zero_point;
+  quantization.params = affine_quantization;
+
+  return quantization;
+}
+
+}  // namespace
+
 Interpreter::Interpreter(ErrorReporter* error_reporter)
     : error_reporter_(error_reporter ? error_reporter
                                      : DefaultErrorReporter()) {
@@ -71,7 +91,7 @@ TfLiteStatus Interpreter::AllocateTensors() {
 }
 
 void Interpreter::ReserveNodes(int count) {
-  primary_subgraph().nodes_and_registration().reserve(count);
+  primary_subgraph().ReserveNodes(count);
 }
 
 void Interpreter::AddSubgraphs(int subgraphs_to_add,
@@ -102,15 +122,16 @@ TfLiteStatus Interpreter::ResizeInputTensor(int tensor_index,
 }
 
 TfLiteStatus Interpreter::Invoke() {
-  TfLiteStatus status = primary_subgraph().Invoke();
+  TF_LITE_ENSURE_STATUS(primary_subgraph().Invoke());
 
   if (!allow_buffer_handle_output_) {
     for (int tensor_index : outputs()) {
-      primary_subgraph().EnsureTensorDataIsReadable(tensor_index);
+      TF_LITE_ENSURE_STATUS(
+          primary_subgraph().EnsureTensorDataIsReadable(tensor_index));
     }
   }
 
-  return status;
+  return kTfLiteOk;
 }
 
 TfLiteStatus Interpreter::AddTensors(int tensors_to_add,
@@ -122,24 +143,49 @@ TfLiteStatus Interpreter::ResetVariableTensors() {
   return primary_subgraph().ResetVariableTensors();
 }
 
+TfLiteStatus Interpreter::SetTensorParametersReadOnly(
+    int tensor_index, TfLiteType type, const char* name,
+    const std::vector<int>& dims, TfLiteQuantization quantization,
+    const char* buffer, size_t bytes, const Allocation* allocation) {
+  return primary_subgraph().SetTensorParametersReadOnly(
+      tensor_index, type, name, dims.size(), dims.data(), quantization, buffer,
+      bytes, allocation);
+}
+
+TfLiteStatus Interpreter::SetTensorParametersReadWrite(
+    int tensor_index, TfLiteType type, const char* name,
+    const std::vector<int>& dims, TfLiteQuantization quantization,
+    bool is_variable) {
+  return primary_subgraph().SetTensorParametersReadWrite(
+      tensor_index, type, name, dims.size(), dims.data(), quantization,
+      is_variable);
+}
+
 TfLiteStatus Interpreter::SetTensorParametersReadOnly(
     int tensor_index, TfLiteType type, const char* name, const size_t rank,
     const int* dims, TfLiteQuantizationParams quantization, const char* buffer,
     size_t bytes, const Allocation* allocation) {
-  return primary_subgraph().SetTensorParametersReadOnly(
-      tensor_index, type, name, rank, dims, quantization, buffer, bytes,
-      allocation);
+  TfLiteQuantization new_quantization = GetQuantizationFromLegacy(quantization);
+  if (primary_subgraph().SetTensorParametersReadOnly(
+          tensor_index, type, name, rank, dims, new_quantization, buffer, bytes,
+          allocation) != kTfLiteOk) {
+    TfLiteQuantizationFree(&new_quantization);
+    return kTfLiteError;
+  }
+  return kTfLiteOk;
 }
 
-// Set description of inputs/outputs/data/fptrs for node `node_index`.
-// This variant assumes an external buffer has been allocated of size
-// bytes. The lifetime of buffer must be ensured to be greater or equal
-// to Interpreter.
 TfLiteStatus Interpreter::SetTensorParametersReadWrite(
     int tensor_index, TfLiteType type, const char* name, const size_t rank,
     const int* dims, TfLiteQuantizationParams quantization, bool is_variable) {
-  return primary_subgraph().SetTensorParametersReadWrite(
-      tensor_index, type, name, rank, dims, quantization, is_variable);
+  TfLiteQuantization new_quantization = GetQuantizationFromLegacy(quantization);
+  if (primary_subgraph().SetTensorParametersReadWrite(
+          tensor_index, type, name, rank, dims, new_quantization,
+          is_variable) != kTfLiteOk) {
+    TfLiteQuantizationFree(&new_quantization);
+    return kTfLiteError;
+  }
+  return kTfLiteOk;
 }
 
 TfLiteStatus Interpreter::SetExecutionPlan(const std::vector<int>& new_plan) {
@@ -167,8 +213,20 @@ void Interpreter::SetAllowFp16PrecisionForFp32(bool allow) {
   }
 }
 
+// TODO(b/121264966): Subgraphs added after cancellation is set will not get the
+// cancellation function added to their context.
+void Interpreter::SetCancellationFunction(void* data,
+                                          bool (*check_cancelled_func)(void*)) {
+  for (auto& subgraph : subgraphs_) {
+    subgraph->SetCancellationFunction(data, check_cancelled_func);
+  }
+}
+
 TfLiteStatus Interpreter::ModifyGraphWithDelegate(TfLiteDelegate* delegate) {
-  return primary_subgraph().ModifyGraphWithDelegate(delegate);
+  for (auto& subgraph : subgraphs_) {
+    TF_LITE_ENSURE_OK(context_, subgraph->ModifyGraphWithDelegate(delegate));
+  }
+  return kTfLiteOk;
 }
 
 TfLiteStatus Interpreter::SetBufferHandle(int tensor_index,
diff --git a/tensorflow/lite/interpreter.h b/tensorflow/lite/interpreter.h
index 6192d56ca2b5810d7ffaddbf4cc7ae3c1b27c268..806b66c12a0bf119985927e4e937c71fc6fed487 100644
--- a/tensorflow/lite/interpreter.h
+++ b/tensorflow/lite/interpreter.h
@@ -160,6 +160,12 @@ class Interpreter {
   // This variant assumes an external buffer has been allocated of size
   // bytes. The lifetime of buffer must be ensured to be greater or equal
   // to Interpreter.
+  TfLiteStatus SetTensorParametersReadOnly(
+      int tensor_index, TfLiteType type, const char* name,
+      const std::vector<int>& dims, TfLiteQuantization quantization,
+      const char* buffer, size_t bytes, const Allocation* allocation = nullptr);
+
+  // Legacy. Deprecated in favor of above.
   inline TfLiteStatus SetTensorParametersReadOnly(
       int tensor_index, TfLiteType type, const char* name,
       const std::vector<int>& dims, TfLiteQuantizationParams quantization,
@@ -179,6 +185,13 @@ class Interpreter {
   // This variant assumes an external buffer has been allocated of size
   // bytes. The lifetime of buffer must be ensured to be greater or equal
   // to Interpreter.
+  TfLiteStatus SetTensorParametersReadWrite(int tensor_index, TfLiteType type,
+                                            const char* name,
+                                            const std::vector<int>& dims,
+                                            TfLiteQuantization quantization,
+                                            bool is_variable = false);
+
+  // Legacy. Deprecated in favor of above.
   inline TfLiteStatus SetTensorParametersReadWrite(
       int tensor_index, TfLiteType type, const char* name,
       const std::vector<int>& dims, TfLiteQuantizationParams quantization,
@@ -343,6 +356,15 @@ class Interpreter {
     return context_->allow_fp32_relax_to_fp16;
   }
 
+  // Sets the cancellation function pointer in order to cancel a request in the
+  // middle of a call to Invoke(). The interpreter queries this function during
+  // inference, between op invocations; when it returns true, the interpreter
+  // will abort execution and return `kTfLiteError`. The `data` parameter
+  // contains any data used by the cancellation function, and if non-null,
+  // remains owned by the caller.
+  // WARNING: This is an experimental API and subject to change.
+  void SetCancellationFunction(void* data, bool (*check_cancelled_func)(void*));
+
   // Owning handle to a TfLiteDelegate instance.
   using TfLiteDelegatePtr =
       std::unique_ptr<TfLiteDelegate, void (*)(TfLiteDelegate*)>;
diff --git a/tensorflow/lite/interpreter_test.cc b/tensorflow/lite/interpreter_test.cc
index 78b5d1b8873b8b3558b098031ffa33c7857a31e5..e1aedfe65ccdfcb4c38d2b13cf53007f17f5f798 100644
--- a/tensorflow/lite/interpreter_test.cc
+++ b/tensorflow/lite/interpreter_test.cc
@@ -73,8 +73,9 @@ TEST(BasicInterpreter, TestAllocateTensorsResetVariableTensors) {
   int tensor_index;
   ASSERT_EQ(interpreter.AddTensors(1, &tensor_index), kTfLiteOk);
   constexpr int kTensorSize = 16;
+  TfLiteQuantizationParams quant;
   interpreter.SetTensorParametersReadWrite(tensor_index, kTfLiteFloat32, "",
-                                           {kTensorSize}, {}, true);
+                                           {kTensorSize}, quant, true);
   interpreter.SetVariables({tensor_index});
   ASSERT_EQ(interpreter.AllocateTensors(), kTfLiteOk);
   TfLiteTensor* tensor = interpreter.tensor(tensor_index);
@@ -170,6 +171,55 @@ TEST(BasicInterpreter, CheckAllocate) {
   }
 }
 
+TEST(BasicInterpreter, CheckQuantization) {
+  Interpreter interpreter;
+  ASSERT_EQ(interpreter.AddTensors(2), kTfLiteOk);
+  interpreter.SetInputs({0, 1});
+  interpreter.SetOutputs({});
+  TfLiteType tensor_type = kTfLiteInt8;
+  const uint8_t int8s[] = {3, 4};
+  float scale = 0.5f;
+  int32_t zero_point = 12;
+
+  TfLiteQuantization rw_quantization;
+  rw_quantization.type = kTfLiteAffineQuantization;
+  auto* rw_affine_quantization = reinterpret_cast<TfLiteAffineQuantization*>(
+      malloc(sizeof(TfLiteAffineQuantization)));
+  rw_affine_quantization->scale = TfLiteFloatArrayCreate(1);
+  rw_affine_quantization->zero_point = TfLiteIntArrayCreate(1);
+  rw_affine_quantization->scale->data[0] = scale;
+  rw_affine_quantization->zero_point->data[0] = zero_point;
+  rw_quantization.params = rw_affine_quantization;
+
+  TfLiteQuantization ro_quantization;
+  ro_quantization.type = kTfLiteAffineQuantization;
+  auto* ro_affine_quantization = reinterpret_cast<TfLiteAffineQuantization*>(
+      malloc(sizeof(TfLiteAffineQuantization)));
+  ro_affine_quantization->scale = TfLiteFloatArrayCreate(1);
+  ro_affine_quantization->zero_point = TfLiteIntArrayCreate(1);
+  ro_affine_quantization->scale->data[0] = scale;
+  ro_affine_quantization->zero_point->data[0] = zero_point;
+  ro_quantization.params = ro_affine_quantization;
+
+  ASSERT_EQ(interpreter.SetTensorParametersReadWrite(0, tensor_type, "", {3},
+                                                     rw_quantization),
+            kTfLiteOk);
+  ASSERT_EQ(interpreter.SetTensorParametersReadOnly(
+                1, tensor_type, "", {2}, ro_quantization,
+                reinterpret_cast<const char*>(int8s), 2),
+            kTfLiteOk);
+  ASSERT_EQ(interpreter.AllocateTensors(), kTfLiteOk);
+  // Check that the legacy scale and zero_point are set correctly.
+  ASSERT_EQ(interpreter.tensor(0)->params.scale, scale);
+  ASSERT_EQ(interpreter.tensor(0)->params.zero_point, zero_point);
+  ASSERT_EQ(interpreter.tensor(0)->quantization.type, rw_quantization.type);
+  ASSERT_EQ(interpreter.tensor(0)->quantization.type, rw_quantization.type);
+  ASSERT_EQ(interpreter.tensor(1)->params.scale, scale);
+  ASSERT_EQ(interpreter.tensor(1)->params.zero_point, zero_point);
+  ASSERT_EQ(interpreter.tensor(1)->quantization.type, ro_quantization.type);
+  ASSERT_EQ(interpreter.tensor(1)->quantization.type, ro_quantization.type);
+}
+
 TEST(BasicInterpreter, CheckResize) {
   const float floats[] = {-3., -4.};
   const int32_t int32s[] = {-3, -4};
@@ -1155,7 +1205,22 @@ TEST_F(TestDelegate, BasicDelegate) {
   EXPECT_EQ(params->output_tensors->data[1], 4);
 }
 
-TEST_F(TestDelegate, ComplexDeligate) {
+TEST_F(TestDelegate, StaticDelegateMakesGraphImmutable) {
+  delegate_ = std::unique_ptr<SimpleDelegate>(new SimpleDelegate({0, 1, 2}));
+  ASSERT_EQ(
+      interpreter_->ModifyGraphWithDelegate(delegate_->get_tf_lite_delegate()),
+      kTfLiteOk);
+  ASSERT_EQ(interpreter_->execution_plan().size(), 1);
+
+  // As the delegate doesn't support dynamic resizing, further graph mutation is
+  // prohibited.
+  ASSERT_NE(interpreter_->ResizeInputTensor(0, {0}), kTfLiteOk);
+  ASSERT_NE(
+      interpreter_->ModifyGraphWithDelegate(delegate_->get_tf_lite_delegate()),
+      kTfLiteOk);
+}
+
+TEST_F(TestDelegate, ComplexDelegate) {
   delegate_ = std::unique_ptr<SimpleDelegate>(new SimpleDelegate({1, 2}));
   interpreter_->ModifyGraphWithDelegate(delegate_->get_tf_lite_delegate());
 
@@ -1316,6 +1381,19 @@ TEST_F(TestDelegateWithDynamicTensors, AllowDynamicTensors) {
   ASSERT_EQ(interpreter_->execution_plan()[0], 1);
 }
 
+TEST_F(TestDelegateWithDynamicTensors, ModifyGraphAfterAllocate) {
+  // Trigger allocation *before* delegate application.
+  ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk);
+
+  delegate_.flags = kTfLiteDelegateFlagsAllowDynamicTensors;
+  ASSERT_EQ(interpreter_->ModifyGraphWithDelegate(&delegate_), kTfLiteOk);
+  ASSERT_EQ(interpreter_->execution_plan().size(), 1);
+  ASSERT_EQ(interpreter_->execution_plan()[0], 1);
+
+  // Allocation should still succeed.
+  ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk);
+}
+
 TEST(TestDelegateOwnership, ProperlyDisposed) {
   struct TfLiteInterpreterOwnedDelegate : public TfLiteDelegate {
     TfLiteInterpreterOwnedDelegate(bool* destroyed, bool* prepared)
@@ -1368,6 +1446,130 @@ TEST(TestDelegateOwnership, ProperlyDisposed) {
   EXPECT_TRUE(destroyed);
 }
 
+// CancellationData contains the data required to cancel a call to Invoke().
+struct CancellationData {
+  bool is_cancelled = false;
+};
+
+// Indicates whether Invoke() has been cancelled based on the value of the
+// CancellationData object passed in.
+bool CheckCancellation(void* data) {
+  CancellationData* cancellation_data =
+      static_cast<struct CancellationData*>(data);
+  return cancellation_data->is_cancelled;
+}
+
+static struct CancellationData cancellation_data_;
+
+// Test fixture to test cancellation within the Interpreter.
+class CancellationTest : public ::testing::Test {
+ public:
+  TfLiteStatus Invoke() { return interpreter_.Invoke(); }
+  void Cancel() { cancellation_data_.is_cancelled = true; }
+
+  // Adds an CancelOp with input tensor `input` and output tensor `output`.
+  void MakeCancelNode(int input, int output) {
+    TfLiteRegistration op = CancelOpRegistration();
+    ASSERT_EQ(interpreter_.AddNodeWithParameters({input}, {output}, nullptr, 0,
+                                                 nullptr, &op),
+              kTfLiteOk);
+    ASSERT_EQ(interpreter_.ResizeInputTensor(input, {3}), kTfLiteOk);
+  }
+
+  // Adds an OkOp with input tensor `input` and output tensor `output`.
+  void MakeOkNode(int input, int output) {
+    TfLiteRegistration op = OkOpRegistration();
+    ASSERT_EQ(interpreter_.AddNodeWithParameters({input}, {output}, nullptr, 0,
+                                                 nullptr, &op),
+              kTfLiteOk);
+    ASSERT_EQ(interpreter_.ResizeInputTensor(input, {3}), kTfLiteOk);
+  }
+
+  Interpreter interpreter_;
+
+ private:
+  // Build the kernel registration for an op that cancels the operation.
+  TfLiteRegistration CancelOpRegistration() {
+    TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr};
+
+    // Set output size to the input size in CancelOp::Prepare(). Code exists to
+    // have a framework in Prepare. The input and output tensors are not used.
+    reg.prepare = [](TfLiteContext* context, TfLiteNode* node) {
+      TfLiteTensor* in_tensor = &context->tensors[node->inputs->data[0]];
+      TfLiteTensor* out_tensor = &context->tensors[node->outputs->data[0]];
+      TfLiteIntArray* new_size = TfLiteIntArrayCopy(in_tensor->dims);
+      return context->ResizeTensor(context, out_tensor, new_size);
+    };
+
+    reg.invoke = [](TfLiteContext* context, TfLiteNode* node) {
+      cancellation_data_.is_cancelled = true;
+      return kTfLiteOk;
+    };
+    return reg;
+  }
+
+  // Build the kernel registration for an op that returns kTfLiteOk.
+  TfLiteRegistration OkOpRegistration() {
+    TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr};
+
+    // Set output size to the input size in OkOp::Prepare(). Code exists to have
+    // a framework in Prepare. The input and output tensors are not used.
+    reg.prepare = [](TfLiteContext* context, TfLiteNode* node) {
+      TfLiteTensor* in_tensor = &context->tensors[node->inputs->data[0]];
+      TfLiteTensor* out_tensor = &context->tensors[node->outputs->data[0]];
+      TfLiteIntArray* new_size = TfLiteIntArrayCopy(in_tensor->dims);
+      return context->ResizeTensor(context, out_tensor, new_size);
+    };
+
+    reg.invoke = [](TfLiteContext* context, TfLiteNode* node) {
+      return kTfLiteOk;
+    };
+    return reg;
+  }
+
+  void SetUp() final {
+    cancellation_data_.is_cancelled = false;
+
+    // Set up the interpreter. Create the input and output tensors.
+    int num_tensors = 3;
+    ASSERT_EQ(interpreter_.AddTensors(num_tensors), kTfLiteOk);
+    interpreter_.SetInputs({0});
+    interpreter_.SetOutputs({2});
+    TfLiteQuantizationParams quantized;
+    for (int tensor_index = 0; tensor_index < num_tensors; tensor_index++) {
+      ASSERT_EQ(interpreter_.SetTensorParametersReadWrite(
+                    tensor_index, kTfLiteFloat32, "", {3}, quantized),
+                kTfLiteOk);
+    }
+    interpreter_.SetCancellationFunction(&cancellation_data_,
+                                         &CheckCancellation);
+  }
+};
+
+TEST_F(CancellationTest, CancelBeforeInvoke) {
+  // Cancel prior to calling Invoke.
+  CancellationTest::MakeOkNode(1, 2);
+  ASSERT_EQ(interpreter_.AllocateTensors(), kTfLiteOk);
+
+  CancellationTest::Cancel();
+  TfLiteStatus invoke_error_code = CancellationTest::Invoke();
+  ASSERT_EQ(invoke_error_code, kTfLiteError);
+}
+
+TEST_F(CancellationTest, CancelDuringInvoke) {
+  // Tests a model which sets the cancel in order to test cancellation works
+  // between ops.
+  //
+  // The first op will set the cancellation bit to true. The second op returns
+  // `kTfLiteOk` if executed.
+  CancellationTest::MakeCancelNode(0, 1);
+  CancellationTest::MakeOkNode(1, 2);
+  ASSERT_EQ(interpreter_.AllocateTensors(), kTfLiteOk);
+
+  TfLiteStatus invoke_error_code = CancellationTest::Invoke();
+  ASSERT_EQ(invoke_error_code, kTfLiteError);
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/java/AndroidManifest.xml b/tensorflow/lite/java/AndroidManifest.xml
index b91c6d149a213926be90b9b131bd632d4f79a0fc..a76a727ec75d231a506b4ef693b3dcd681515b1a 100644
--- a/tensorflow/lite/java/AndroidManifest.xml
+++ b/tensorflow/lite/java/AndroidManifest.xml
@@ -3,7 +3,6 @@
     package="org.tensorflow.lite">
 
     <uses-sdk
-        android:minSdkVersion="4"
         android:targetSdkVersion="19" />
 
     <application />
diff --git a/tensorflow/lite/java/BUILD b/tensorflow/lite/java/BUILD
index adf7bc9087878ad84824844139058c140d7084f8..8983079a31d7d99dbd666387c0a2c0ded63747e8 100644
--- a/tensorflow/lite/java/BUILD
+++ b/tensorflow/lite/java/BUILD
@@ -90,6 +90,9 @@ java_test(
     size = "small",
     srcs = ["src/test/java/org/tensorflow/lite/TensorFlowLiteTest.java"],
     javacopts = JAVACOPTS,
+    tags = [
+        "no_mac",  # TODO(b/122888913): libtensorflowlite_test_jni broke on mac.
+    ],
     test_class = "org.tensorflow.lite.TensorFlowLiteTest",
     deps = [
         ":tensorflowlitelib",
@@ -103,6 +106,9 @@ java_test(
     size = "small",
     srcs = ["src/test/java/org/tensorflow/lite/DataTypeTest.java"],
     javacopts = JAVACOPTS,
+    tags = [
+        "no_mac",  # TODO(b/122888913): libtensorflowlite_test_jni broke on mac.
+    ],
     test_class = "org.tensorflow.lite.DataTypeTest",
     deps = [
         ":tensorflowlitelib",
@@ -121,10 +127,14 @@ java_test(
         "src/testdata/int64.bin",
         "src/testdata/invalid_model.bin",
         "src/testdata/quantized.bin",
+        "src/testdata/string.bin",
         "src/testdata/uint8.bin",
         "src/testdata/with_custom_op.lite",
     ],
     javacopts = JAVACOPTS,
+    tags = [
+        "no_mac",  # TODO(b/122888913): libtensorflowlite_test_jni broke on mac.
+    ],
     test_class = "org.tensorflow.lite.NativeInterpreterWrapperTest",
     deps = [
         ":tensorflowlitelib",
@@ -144,6 +154,9 @@ java_test(
         "//tensorflow/lite:testdata/multi_add_flex.bin",
     ],
     javacopts = JAVACOPTS,
+    tags = [
+        "no_mac",  # TODO(b/122888913): libtensorflowlite_test_jni broke on mac.
+    ],
     test_class = "org.tensorflow.lite.InterpreterTest",
     visibility = ["//visibility:private"],
     deps = [
@@ -162,6 +175,9 @@ java_test(
         "//tensorflow/lite:testdata/multi_add_flex.bin",
     ],
     javacopts = JAVACOPTS,
+    tags = [
+        "no_oss",  # Currently requires --config=monolithic, b/118895218.
+    ],
     test_class = "org.tensorflow.lite.InterpreterFlexTest",
     visibility = ["//visibility:private"],
     deps = [
@@ -179,6 +195,9 @@ java_test(
         "src/testdata/add.bin",
     ],
     javacopts = JAVACOPTS,
+    tags = [
+        "no_mac",  # TODO(b/122888913): libtensorflowlite_test_jni broke on mac.
+    ],
     test_class = "org.tensorflow.lite.TensorTest",
     deps = [
         ":tensorflowlitelib",
@@ -192,6 +211,9 @@ filegroup(
     srcs = select({
         "//conditions:default": [":libtensorflowlite_jni.so"],
     }),
+    tags = [
+        "no_mac",  # TODO(b/122888913): libtensorflowlite_test_jni broke on mac.
+    ],
     visibility = ["//visibility:public"],
 )
 
diff --git a/tensorflow/lite/java/demo/app/build.gradle b/tensorflow/lite/java/demo/app/build.gradle
index b8fc282cb1dfe8a9c80692759e985bf369fc163d..8ea16a3417ca9733f518776692114501c4162a0e 100644
--- a/tensorflow/lite/java/demo/app/build.gradle
+++ b/tensorflow/lite/java/demo/app/build.gradle
@@ -2,7 +2,7 @@ apply plugin: 'com.android.application'
 
 android {
     compileSdkVersion 26
-    buildToolsVersion "26.0.1"
+    buildToolsVersion "27.0.3"
     defaultConfig {
         applicationId "android.example.com.tflitecamerademo"
         // Required by Camera2 API.
@@ -10,11 +10,6 @@ android {
         targetSdkVersion 26
         versionCode 1
         versionName "1.0"
-
-        // Remove this block.
-        jackOptions {
-            enabled true
-        }
     }
     lintOptions {
         abortOnError false
@@ -40,6 +35,7 @@ repositories {
         url 'https://google.bintray.com/tensorflow'
     }
 }
+
 allprojects {
     repositories {
         // Uncomment if you want to use a local repo.
@@ -48,20 +44,18 @@ allprojects {
     }
 }
 
-
-
 dependencies {
-    compile fileTree(dir: 'libs', include: ['*.jar'])
-    compile 'com.android.support:appcompat-v7:25.2.0'
-    compile 'com.android.support.constraint:constraint-layout:1.0.2'
-    compile 'com.android.support:design:25.2.0'
-    compile 'com.android.support:support-annotations:25.3.1'
-    compile 'com.android.support:support-v13:25.2.0'
+    implementation fileTree(dir: 'libs', include: ['*.jar'])
+    implementation 'com.android.support:appcompat-v7:25.2.0'
+    implementation 'com.android.support.constraint:constraint-layout:1.0.2'
+    implementation 'com.android.support:design:25.2.0'
+    implementation 'com.android.support:support-annotations:25.3.1'
+    implementation 'com.android.support:support-v13:25.2.0'
 
     // Build off of nightly TensorFlow Lite
-    compile 'org.tensorflow:tensorflow-lite:0.0.0-nightly'
+    implementation 'org.tensorflow:tensorflow-lite:0.0.0-nightly'
     // Use local TensorFlow library
-    // compile 'org.tensorflow:tensorflow-lite-local:0.0.0'
+    // implementation 'org.tensorflow:tensorflow-lite-local:0.0.0'
 }
 
 def targetFolder = "src/main/assets"
diff --git a/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
index 165d33510131ac9c9fc08070f0a4d08653188fae..c6f315b545bbe8196999df07c6a4bcdfdaafa2d5 100644
--- a/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
+++ b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
@@ -45,7 +45,6 @@ import android.os.Bundle;
 import android.os.Handler;
 import android.os.HandlerThread;
 import android.support.annotation.NonNull;
-import android.support.v13.app.FragmentCompat;
 import android.support.v4.content.ContextCompat;
 import android.text.SpannableString;
 import android.text.SpannableStringBuilder;
@@ -62,6 +61,7 @@ import android.widget.ListView;
 import android.widget.NumberPicker;
 import android.widget.TextView;
 import android.widget.Toast;
+import android.support.v13.app.FragmentCompat;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -71,7 +71,6 @@ import java.util.List;
 import java.util.concurrent.Semaphore;
 import java.util.concurrent.TimeUnit;
 
-
 /** Basic fragments for the Camera. */
 public class Camera2BasicFragment extends Fragment
     implements FragmentCompat.OnRequestPermissionsResultCallback {
@@ -190,6 +189,8 @@ public class Camera2BasicFragment extends Fragment
 
   int currentModel = -1;
 
+  int currentNumThreads = -1;
+
   /** An additional thread for running tasks that shouldn't block the UI. */
   private HandlerThread backgroundThread;
 
@@ -323,13 +324,16 @@ public class Camera2BasicFragment extends Fragment
     // Get UI information before delegating to background
     final int modelIndex = modelView.getCheckedItemPosition();
     final int deviceIndex = deviceView.getCheckedItemPosition();
+    final int numThreads = np.getValue();
 
     backgroundHandler.post(() -> {
-      if (modelIndex == currentModel && deviceIndex == currentDevice) {
+      if (modelIndex == currentModel && deviceIndex == currentDevice
+              && numThreads == currentNumThreads) {
         return;
       }
       currentModel = modelIndex;
       currentDevice = deviceIndex;
+      currentNumThreads = numThreads;
 
       // Disable classifier while updating
       if (classifier != null) {
@@ -357,7 +361,11 @@ public class Camera2BasicFragment extends Fragment
         classifier = null;
       }
 
-      // Customzie the interpreter to the type of device we want to use.
+      // Customize the interpreter to the type of device we want to use.
+      if (classifier == null) {
+        return;
+      }
+      classifier.setNumThreads(numThreads);
       if (device.equals(cpu)) {
       } else if (device.equals(gpu)) {
         if (!GpuDelegateHelper.isGpuDelegateAvailable()) {
@@ -437,7 +445,7 @@ public class Camera2BasicFragment extends Fragment
         new NumberPicker.OnValueChangeListener() {
           @Override
           public void onValueChange(NumberPicker picker, int oldVal, int newVal) {
-            backgroundHandler.post(() -> classifier.setNumThreads(newVal));
+            updateActiveModel();
           }
         });
 
@@ -476,7 +484,9 @@ public class Camera2BasicFragment extends Fragment
 
   @Override
   public void onDestroy() {
-    classifier.close();
+    if (classifier != null) {
+      classifier.close();
+    }
     super.onDestroy();
   }
 
@@ -805,7 +815,9 @@ public class Camera2BasicFragment extends Fragment
   /** Classifies a frame from the preview stream. */
   private void classifyFrame() {
     if (classifier == null || getActivity() == null || cameraDevice == null) {
-      showToast("Uninitialized Classifier or invalid context.");
+      // It's important to not call showToast every frame, or else the app will starve and
+      // hang. updateActiveModel() already puts a error message up with showToast.
+      // showToast("Uninitialized Classifier or invalid context.");
       return;
     }
     SpannableStringBuilder textToShow = new SpannableStringBuilder();
diff --git a/tensorflow/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml b/tensorflow/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml
index ee71ab808f4810ac092b37b0d996331072f44652..323b21dbcea3bd45f5dbca44aaf4823e4e8009b9 100644
--- a/tensorflow/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml
+++ b/tensorflow/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml
@@ -16,67 +16,103 @@
 
 <LinearLayout
     xmlns:android="http://schemas.android.com/apk/res/android"
-    android:layout_width="match_parent"
+    xmlns:tools="http://schemas.android.com/tools" android:layout_width="match_parent"
     android:layout_height="match_parent"
     android:background="#bb7700"
     android:orientation="horizontal">
 
-  <com.example.android.tflitecamerademo.AutoFitTextureView
+    <com.example.android.tflitecamerademo.AutoFitTextureView
       android:id="@+id/texture"
       android:layout_width="0dp"
       android:layout_height="match_parent"
       android:layout_weight=".8"/>
 
-  <LinearLayout
+    <LinearLayout
       android:layout_width="0dp"
       android:layout_height="match_parent"
       android:layout_weight=".2"
       android:orientation="vertical">
 
-    <ImageView
-        android:id="@+id/logoview"
-        android:layout_width="wrap_content"
-        android:layout_height="wrap_content"
-        android:scaleType="centerInside"
-        android:src="@drawable/logo"/>
-
-    <RadioGroup
-        android:gravity="center"
-        android:layout_width="match_parent"
-        android:layout_height="match_parent"
-        android:orientation="horizontal">
-        <RadioButton
-            android:id="@+id/radio_cpu"
-            android:background="#0000000f"
-            android:layout_width="wrap_content"
-            android:layout_height="wrap_content"
-            android:text="@string/cpu"
-            android:textColor="@android:color/white" />
-        <RadioButton
-            android:id="@+id/radio_nnapi"
-            android:background="#0000000f"
+        <ImageView
+            android:id="@+id/logoview"
             android:layout_width="wrap_content"
+            android:layout_height="47dp"
+            android:scaleType="centerInside"
+            android:src="@drawable/logo"/>
+
+        <TextView
+            android:id="@+id/text"
+            android:layout_width="match_parent"
+            android:layout_height="160dp"
+            android:paddingTop="20dp"
+            android:textColor="#FFF"
+            android:textSize="20sp"
+            android:textStyle="bold"/>
+        <LinearLayout
+            android:id="@+id/modelLayout"
+            android:layout_width="match_parent"
+            android:layout_height="150dp"
+            android:orientation="vertical">
+
+            <TextView
+                android:id="@+id/textView"
+                android:layout_width="match_parent"
+                android:layout_height="20dp"
+                android:text="@string/modelLabel"
+                android:textAlignment="center"
+                android:textColor="@android:color/white"/>
+
+            <ListView
+                android:id="@+id/model"
+                android:layout_width="match_parent"
+                android:layout_height="wrap_content">
+
+            </ListView>
+        </LinearLayout>
+        <LinearLayout
+            android:id="@+id/deviceLayout"
+            android:layout_width="match_parent"
+            android:layout_height="150dp"
+            android:orientation="vertical">
+
+            <TextView
+                android:id="@+id/textView2"
+                android:layout_width="match_parent"
+                android:layout_height="20dp"
+                android:text="@string/deviceLabel"
+                android:textAlignment="center"
+                android:textColor="@android:color/white"/>
+
+            <ListView
+                android:id="@+id/device"
+                android:layout_width="match_parent"
+                android:layout_height="wrap_content"/>
+
+        </LinearLayout>
+
+        <LinearLayout
+            android:layout_width="match_parent"
             android:layout_height="wrap_content"
-            android:text="@string/nnapi"
-            android:textColor="@android:color/white" />
-        </RadioGroup>
-
-    <NumberPicker
-        android:id="@+id/np"
-        android:layout_width="wrap_content"
-        android:layout_height="47dp"
-        android:layout_gravity="center_horizontal"
-        android:visibility="visible"/>
-
-    <TextView
-        android:id="@+id/text"
-        android:textStyle="bold"
-        android:layout_width="match_parent"
-        android:layout_height="match_parent"
-        android:paddingTop="20dp"
-        android:textColor="#FFF"
-        android:textSize="20sp"/>
-
-  </LinearLayout>
+        >
+
+            <TextView
+                android:layout_width="wrap_content"
+                android:layout_height="wrap_content"
+                android:gravity="center"
+                android:text="Threads"
+                android:textAlignment="center"
+                android:textColor="@android:color/white"/>
+
+            <NumberPicker
+                android:id="@+id/np"
+                android:layout_width="match_parent"
+                android:layout_height="wrap_content"
+                android:layout_marginLeft="10dp"
+                android:theme="@style/AppTheme.Picker"
+                android:visibility="visible"/>
+
+        </LinearLayout>
+
+    </LinearLayout>
 </LinearLayout>
 
diff --git a/tensorflow/lite/java/demo/build.gradle b/tensorflow/lite/java/demo/build.gradle
index b78a0b86c939620b6f05483ce45c4d3ef0ef595e..a88b3fdc70d9bbd45fa15ad31b4d38a377621c16 100644
--- a/tensorflow/lite/java/demo/build.gradle
+++ b/tensorflow/lite/java/demo/build.gradle
@@ -2,10 +2,11 @@
 
 buildscript {
     repositories {
+        google()
         jcenter()
     }
     dependencies {
-        classpath 'com.android.tools.build:gradle:2.3.1'
+        classpath 'com.android.tools.build:gradle:3.1.4'
 
         // NOTE: Do not place your application dependencies here; they belong
         // in the individual module build.gradle files
@@ -14,6 +15,7 @@ buildscript {
 
 allprojects {
     repositories {
+        google()
         jcenter()
     }
 }
diff --git a/tensorflow/lite/java/demo/gradle/wrapper/gradle-wrapper.properties b/tensorflow/lite/java/demo/gradle/wrapper/gradle-wrapper.properties
index fa7a38a0e43eecd1e7292dd49efa79a5d0742e2a..9ff32fe2bb7afeaefdc8b3d6a1ecb0d32e1aed60 100644
--- a/tensorflow/lite/java/demo/gradle/wrapper/gradle-wrapper.properties
+++ b/tensorflow/lite/java/demo/gradle/wrapper/gradle-wrapper.properties
@@ -3,4 +3,4 @@ distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
 zipStoreBase=GRADLE_USER_HOME
 zipStorePath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-3.3-all.zip
+distributionUrl=https\://services.gradle.org/distributions/gradle-4.4-all.zip
diff --git a/tensorflow/lite/java/ovic/BUILD b/tensorflow/lite/java/ovic/BUILD
index 774320871eec9afb2fae31824dc021fb7d338e1e..b00c9cd05809c9a694f32a25ae4fde3c33d40a88 100644
--- a/tensorflow/lite/java/ovic/BUILD
+++ b/tensorflow/lite/java/ovic/BUILD
@@ -19,7 +19,10 @@ java_test(
         "//tensorflow/lite/java/ovic/src/testdata:ovic_testdata",
     ],
     javacopts = JAVACOPTS,
-    tags = ["no_oss"],
+    tags = [
+        "no_mac",
+        "no_oss",  # TODO(b/122888913): libtensorflowlite_test_jni broke on mac.
+    ],
     test_class = "org.tensorflow.ovic.OvicClassifierTest",
     visibility = ["//visibility:public"],
     deps = [
@@ -87,7 +90,10 @@ java_test(
         "//tensorflow/lite/java/ovic/src/testdata:ovic_testdata",
     ],
     javacopts = JAVACOPTS,
-    tags = ["no_oss"],
+    tags = [
+        "no_mac",
+        "no_oss",  # TODO(b/122888913): libtensorflowlite_test_jni broke on mac.
+    ],
     test_class = "org.tensorflow.ovic.OvicDetectorTest",
     visibility = ["//visibility:public"],
     deps = [
diff --git a/tensorflow/lite/java/ovic/README.md b/tensorflow/lite/java/ovic/README.md
index 368c486f4f1ddd021e0bcfcdf9d82034ba5db82b..b7bf658352206c7d1cd6fb28ef537c5696b1712a 100644
--- a/tensorflow/lite/java/ovic/README.md
+++ b/tensorflow/lite/java/ovic/README.md
@@ -137,7 +137,14 @@ If you are adding a detection model, simply modify `modelPath` and `testImagePat
 
 * Adjust the benchmark parameters when needed:
 
-You can chnage the length of each experiment, and the processor affinity below. `BIG_CORE_MASK` is an integer whose binary encoding represents the set of used cores. This number is phone-specific. For example, Pixel 2 has 8 cores: the 4 little cores are represented by the 4 less significant bits, and the 4 big cores by the 4 more significant bits. Therefore a mask value of 16, or in binary `00010000`, represents using only the first big core. The mask 32, or in binary `00100000` uses the second big core and should deliver identical results as the mask 16 because the big cores are interchangeable.
+You can change the length of each experiment, and the processor affinity below.
+`BIG_CORE_MASK` is an integer whose binary encoding represents the set of used
+cores. This number is phone-specific. For example, Pixel 2 has 8 cores: the 4
+little cores are represented by the 4 less significant bits, and the 4 big cores
+by the 4 more significant bits. Therefore a mask value of 16, or in binary
+`00010000`, represents using only the first big core. The mask 32, or in binary
+`00100000` uses the second big core and should deliver identical results as the
+mask 16 because the big cores are interchangeable.
 
 ```
   /** Wall time for each benchmarking experiment. */
diff --git a/tensorflow/lite/java/ovic/demo/app/build.gradle b/tensorflow/lite/java/ovic/demo/app/build.gradle
index 4f3a6cdb2f8fe58008c9315bf08f4d328e720073..77f568448a810c61ece9feef65fad422356be2f1 100644
--- a/tensorflow/lite/java/ovic/demo/app/build.gradle
+++ b/tensorflow/lite/java/ovic/demo/app/build.gradle
@@ -2,18 +2,13 @@ apply plugin: 'com.android.application'
 
 android {
     compileSdkVersion 26
-    buildToolsVersion "26.0.1"
+    buildToolsVersion "27.0.3"
     defaultConfig {
         applicationId "android.example.com.ovicbenchmarker"
         minSdkVersion 15
         targetSdkVersion 26
         versionCode 1
         versionName "1.0"
-
-        // Remove this block.
-        jackOptions {
-            enabled true
-        }
     }
     lintOptions {
         abortOnError false
@@ -41,12 +36,12 @@ repositories {
 }
 
 dependencies {
-    compile fileTree(dir: 'libs', include: ['*.jar'])
-    compile 'com.android.support:appcompat-v7:25.2.0'
-    compile 'com.android.support.constraint:constraint-layout:1.0.2'
-    compile 'com.android.support:design:25.2.0'
-    compile 'com.android.support:support-annotations:25.3.1'
-    compile 'com.android.support:support-v13:25.2.0'
+    implementation fileTree(dir: 'libs', include: ['*.jar'])
+    implementation 'com.android.support:appcompat-v7:25.2.0'
+    implementation 'com.android.support.constraint:constraint-layout:1.0.2'
+    implementation 'com.android.support:design:25.2.0'
+    implementation 'com.android.support:support-annotations:25.3.1'
+    implementation 'com.android.support:support-v13:25.2.0'
 
-    compile 'org.tensorflow:tensorflow-lite:+'
+    implementation 'org.tensorflow:tensorflow-lite:+'
 }
diff --git a/tensorflow/lite/java/ovic/demo/build.gradle b/tensorflow/lite/java/ovic/demo/build.gradle
index b78a0b86c939620b6f05483ce45c4d3ef0ef595e..a88b3fdc70d9bbd45fa15ad31b4d38a377621c16 100644
--- a/tensorflow/lite/java/ovic/demo/build.gradle
+++ b/tensorflow/lite/java/ovic/demo/build.gradle
@@ -2,10 +2,11 @@
 
 buildscript {
     repositories {
+        google()
         jcenter()
     }
     dependencies {
-        classpath 'com.android.tools.build:gradle:2.3.1'
+        classpath 'com.android.tools.build:gradle:3.1.4'
 
         // NOTE: Do not place your application dependencies here; they belong
         // in the individual module build.gradle files
@@ -14,6 +15,7 @@ buildscript {
 
 allprojects {
     repositories {
+        google()
         jcenter()
     }
 }
diff --git a/tensorflow/lite/java/ovic/demo/gradle/wrapper/gradle-wrapper.properties b/tensorflow/lite/java/ovic/demo/gradle/wrapper/gradle-wrapper.properties
index fa7a38a0e43eecd1e7292dd49efa79a5d0742e2a..9ff32fe2bb7afeaefdc8b3d6a1ecb0d32e1aed60 100644
--- a/tensorflow/lite/java/ovic/demo/gradle/wrapper/gradle-wrapper.properties
+++ b/tensorflow/lite/java/ovic/demo/gradle/wrapper/gradle-wrapper.properties
@@ -3,4 +3,4 @@ distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
 zipStoreBase=GRADLE_USER_HOME
 zipStorePath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-3.3-all.zip
+distributionUrl=https\://services.gradle.org/distributions/gradle-4.4-all.zip
diff --git a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
index 2203d5fbdb260aaf2bf826343343426a5015e889..5aef4fb05723d170e0c8b08ac18bce44bd11eb7b 100644
--- a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
+++ b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
@@ -116,9 +116,26 @@ public final class Interpreter implements AutoCloseable {
       return this;
     }
 
+    /**
+     * Advanced: Set if buffer handle output is allowed.
+     *
+     * <p>When a {@link Delegate} supports hardware acceleration, the interpreter will make the data
+     * of output tensors available in the CPU-allocated tensor buffers by default. If the client can
+     * consume the buffer handle directly (e.g. reading output from OpenGL texture), it can set this
+     * flag to false, avoiding the copy of data to the CPU buffer. The delegate documentation should
+     * indicate whether this is supported and how it can be used.
+     *
+     * <p>WARNING: This is an experimental interface that is subject to change.
+     */
+    public Options setAllowBufferHandleOutput(boolean allow) {
+      this.allowBufferHandleOutput = allow;
+      return this;
+    }
+
     int numThreads = -1;
-    boolean useNNAPI = false;
-    boolean allowFp16PrecisionForFp32 = false;
+    Boolean useNNAPI;
+    Boolean allowFp16PrecisionForFp32;
+    Boolean allowBufferHandleOutput;
     final List<Delegate> delegates = new ArrayList<>();
   }
 
@@ -217,11 +234,15 @@ public final class Interpreter implements AutoCloseable {
    *     including int, float, long, and byte. {@link ByteBuffer} is the preferred way to pass large
    *     input data for primitive types, whereas string types require using the (multi-dimensional)
    *     array input path. When {@link ByteBuffer} is used, its content should remain unchanged
-   *     until model inference is done.
+   *     until model inference is done. A {@code null} value is allowed only if the caller is using
+   *     a {@link Delegate} that allows buffer handle interop, and such a buffer has been bound to
+   *     the input {@link Tensor}.
    * @param output a multidimensional array of output data, or a {@link ByteBuffer} of primitive
-   *     types including int, float, long, and byte.
+   *     types including int, float, long, and byte. A null value is allowed only if the caller is
+   *     using a {@link Delegate} that allows buffer handle interop, and such a buffer has been
+   *     bound to the output {@link Tensor}. See also {@link Options#setAllowBufferHandleOutput()}.
    */
-  public void run(@NonNull Object input, @NonNull Object output) {
+  public void run(Object input, Object output) {
     Object[] inputs = {input};
     Map<Integer, Object> outputs = new HashMap<>();
     outputs.put(0, output);
@@ -234,6 +255,10 @@ public final class Interpreter implements AutoCloseable {
    * <p>Warning: The API runs much faster if {@link ByteBuffer} is used as input data type. Please
    * consider using {@link ByteBuffer} to feed primitive input data for better performance.
    *
+   * <p>Note: {@code null} values for invididual elements of {@code inputs} and {@code outputs} is
+   * allowed only if the caller is using a {@link Delegate} that allows buffer handle interop, and
+   * such a buffer has been bound to the corresponding input or output {@link Tensor}(s).
+   *
    * @param inputs an array of input data. The inputs should be in the same order as inputs of the
    *     model. Each input can be an array or multidimensional array, or a {@link ByteBuffer} of
    *     primitive types including int, float, long, and byte. {@link ByteBuffer} is the preferred
@@ -349,6 +374,20 @@ public final class Interpreter implements AutoCloseable {
     wrapper.setNumThreads(numThreads);
   }
 
+  /**
+   * Advanced: Modifies the graph with the provided {@link Delegate}.
+   *
+   * <p>Note: The typical path for providing delegates is via {@link Options#addDelegate}, at
+   * creation time. This path should only be used when a delegate might require coordinated
+   * interaction between Interpeter creation and delegate application.
+   *
+   * <p>WARNING: This is an experimental API and subject to change.
+   */
+  public void modifyGraphWithDelegate(Delegate delegate) {
+    checkNotClosed();
+    wrapper.modifyGraphWithDelegate(delegate);
+  }
+
   /** Release resources associated with the {@code Interpreter}. */
   @Override
   public void close() {
diff --git a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
index 1952db0267bb7b26f24d819a69f9f312caf776ac..580dbef084a439ac47596524d43f1dcc66333a3f 100644
--- a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
+++ b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
@@ -69,11 +69,15 @@ final class NativeInterpreterWrapper implements AutoCloseable {
     this.interpreterHandle = createInterpreter(modelHandle, errorHandle, options.numThreads);
     this.inputTensors = new Tensor[getInputCount(interpreterHandle)];
     this.outputTensors = new Tensor[getOutputCount(interpreterHandle)];
-    if (options.useNNAPI) {
-      setUseNNAPI(options.useNNAPI);
+    if (options.useNNAPI != null) {
+      setUseNNAPI(options.useNNAPI.booleanValue());
     }
-    if (options.allowFp16PrecisionForFp32) {
-      setAllowFp16PrecisionForFp32(options.allowFp16PrecisionForFp32);
+    if (options.allowFp16PrecisionForFp32 != null) {
+      allowFp16PrecisionForFp32(
+          interpreterHandle, options.allowFp16PrecisionForFp32.booleanValue());
+    }
+    if (options.allowBufferHandleOutput != null) {
+      allowBufferHandleOutput(interpreterHandle, options.allowBufferHandleOutput.booleanValue());
     }
     for (Delegate delegate : options.delegates) {
       applyDelegate(interpreterHandle, errorHandle, delegate.getNativeHandle());
@@ -180,14 +184,15 @@ final class NativeInterpreterWrapper implements AutoCloseable {
     useNNAPI(interpreterHandle, useNNAPI);
   }
 
-  void setAllowFp16PrecisionForFp32(boolean allow) {
-    allowFp16PrecisionForFp32(interpreterHandle, allow);
-  }
-
   void setNumThreads(int numThreads) {
     numThreads(interpreterHandle, numThreads);
   }
 
+  void modifyGraphWithDelegate(Delegate delegate) {
+    applyDelegate(interpreterHandle, errorHandle, delegate.getNativeHandle());
+    delegates.add(delegate);
+  }
+
   /** Gets index of an input given its name. */
   int getInputIndex(String name) {
     if (inputsIndexes == null) {
@@ -356,6 +361,8 @@ final class NativeInterpreterWrapper implements AutoCloseable {
 
   private static native void allowFp16PrecisionForFp32(long interpreterHandle, boolean allow);
 
+  private static native void allowBufferHandleOutput(long interpreterHandle, boolean allow);
+
   private static native long createErrorReporter(int size);
 
   private static native long createModel(String modelPathOrBuffer, long errorHandle);
diff --git a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Tensor.java b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Tensor.java
index 7aa24b4198a110f68680c0f8ec2a527b23c5e1bc..16cca45f388953e0616dd0b1b4c24114e5a6108b 100644
--- a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Tensor.java
+++ b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Tensor.java
@@ -80,17 +80,34 @@ public final class Tensor {
     return shapeCopy;
   }
 
+  /**
+   * Returns the (global) index of the tensor within the owning {@link Interpreter}.
+   *
+   * @hide
+   */
+  public int index() {
+    return index(nativeHandle);
+  }
+
   /**
    * Copies the contents of the provided {@code src} object to the Tensor.
    *
    * <p>The {@code src} should either be a (multi-dimensional) array with a shape matching that of
-   * this tensor, or a {@link ByteByffer} of compatible primitive type with a matching flat size.
+   * this tensor, a {@link ByteByffer} of compatible primitive type with a matching flat size, or
+   * {@code null} iff the tensor has an underlying delegate buffer handle.
    *
    * @throws IllegalArgumentException if the tensor is a scalar or if {@code src} is not compatible
    *     with the tensor (for example, mismatched data types or shapes).
    */
   void setTo(Object src) {
-    throwExceptionIfTypeIsIncompatible(src);
+    if (src == null) {
+      if (hasDelegateBufferHandle(nativeHandle)) {
+        return;
+      }
+      throw new IllegalArgumentException(
+          "Null inputs are allowed only if the Tensor is bound to a buffer handle.");
+    }
+    throwIfDataIsIncompatible(src);
     if (isByteBuffer(src)) {
       ByteBuffer srcBuffer = (ByteBuffer) src;
       // For direct ByteBuffer instances we support zero-copy. Note that this assumes the caller
@@ -108,12 +125,20 @@ public final class Tensor {
   /**
    * Copies the contents of the tensor to {@code dst} and returns {@code dst}.
    *
-   * @param dst the destination buffer, either an explicitly-typed array or a {@link ByteBuffer}.
+   * @param dst the destination buffer, either an explicitly-typed array, a {@link ByteBuffer} or
+   *     {@code null} iff the tensor has an underlying delegate buffer handle.
    * @throws IllegalArgumentException if {@code dst} is not compatible with the tensor (for example,
    *     mismatched data types or shapes).
    */
   Object copyTo(Object dst) {
-    throwExceptionIfTypeIsIncompatible(dst);
+    if (dst == null) {
+      if (hasDelegateBufferHandle(nativeHandle)) {
+        return dst;
+      }
+      throw new IllegalArgumentException(
+          "Null outputs are allowed only if the Tensor is bound to a buffer handle.");
+    }
+    throwIfDataIsIncompatible(dst);
     if (dst instanceof ByteBuffer) {
       ByteBuffer dstByteBuffer = (ByteBuffer) dst;
       dstByteBuffer.put(buffer());
@@ -126,11 +151,15 @@ public final class Tensor {
   /** Returns the provided buffer's shape if specified and different from this Tensor's shape. */
   // TODO(b/80431971): Remove this method after deprecating multi-dimensional array inputs.
   int[] getInputShapeIfDifferent(Object input) {
+    if (input == null) {
+      return null;
+    }
     // Implicit resizes based on ByteBuffer capacity isn't supported, so short-circuit that path.
     // The ByteBuffer's size will be validated against this Tensor's size in {@link #setTo(Object)}.
     if (isByteBuffer(input)) {
       return null;
     }
+    throwIfTypeIsIncompatible(input);
     int[] inputShape = computeShapeOf(input);
     if (Arrays.equals(shapeCopy, inputShape)) {
       return null;
@@ -215,16 +244,14 @@ public final class Tensor {
     }
   }
 
-  private void throwExceptionIfTypeIsIncompatible(Object o) {
+  private void throwIfDataIsIncompatible(Object o) {
+    throwIfTypeIsIncompatible(o);
+    throwIfShapeIsIncompatible(o);
+  }
+
+  private void throwIfTypeIsIncompatible(Object o) {
+    // ByteBuffer payloads can map to any type, so exempt it from the check.
     if (isByteBuffer(o)) {
-      ByteBuffer oBuffer = (ByteBuffer) o;
-      if (oBuffer.capacity() != numBytes()) {
-        throw new IllegalArgumentException(
-            String.format(
-                "Cannot convert between a TensorFlowLite buffer with %d bytes and a "
-                    + "ByteBuffer with %d bytes.",
-                numBytes(), oBuffer.capacity()));
-      }
       return;
     }
     DataType oType = dataTypeOf(o);
@@ -235,7 +262,20 @@ public final class Tensor {
                   + "object of type %s (which is compatible with the TensorFlowLite type %s).",
               dtype, o.getClass().getName(), oType));
     }
+  }
 
+  private void throwIfShapeIsIncompatible(Object o) {
+    if (isByteBuffer(o)) {
+      ByteBuffer oBuffer = (ByteBuffer) o;
+      if (oBuffer.capacity() != numBytes()) {
+        throw new IllegalArgumentException(
+            String.format(
+                "Cannot convert between a TensorFlowLite buffer with %d bytes and a "
+                    + "ByteBuffer with %d bytes.",
+                numBytes(), oBuffer.capacity()));
+      }
+      return;
+    }
     int[] oShape = computeShapeOf(o);
     if (!Arrays.equals(oShape, shapeCopy)) {
       throw new IllegalArgumentException(
@@ -278,10 +318,14 @@ public final class Tensor {
 
   private static native int numBytes(long handle);
 
+  private static native boolean hasDelegateBufferHandle(long handle);
+
   private static native void readMultiDimensionalArray(long handle, Object dst);
 
   private static native void writeMultiDimensionalArray(long handle, Object src);
 
+  private static native int index(long handle);
+
   static {
     TensorFlowLite.init();
   }
diff --git a/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc b/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
index 1e98f942504b7e4f238d8715de1dc75eedf046cf..d3759c97fb228d2b2d9b474c9b0a593a92aa647f 100644
--- a/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
+++ b/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
@@ -245,6 +245,14 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_allowFp16PrecisionForFp32(
   interpreter->SetAllowFp16PrecisionForFp32(static_cast<bool>(allow));
 }
 
+JNIEXPORT void JNICALL
+Java_org_tensorflow_lite_NativeInterpreterWrapper_allowBufferHandleOutput(
+    JNIEnv* env, jclass clazz, jlong handle, jboolean allow) {
+  tflite::Interpreter* interpreter = convertLongToInterpreter(env, handle);
+  if (interpreter == nullptr) return;
+  interpreter->SetAllowBufferHandleOutput(allow);
+}
+
 JNIEXPORT void JNICALL
 Java_org_tensorflow_lite_NativeInterpreterWrapper_numThreads(JNIEnv* env,
                                                              jclass clazz,
diff --git a/tensorflow/lite/java/src/main/native/tensor_jni.cc b/tensorflow/lite/java/src/main/native/tensor_jni.cc
index 82d2679de9c868694668bca23ce6c8a6fb55dbe8..f07437e7f318944d6d254c5820d58fccc5a74f87 100644
--- a/tensorflow/lite/java/src/main/native/tensor_jni.cc
+++ b/tensorflow/lite/java/src/main/native/tensor_jni.cc
@@ -35,6 +35,7 @@ class TensorHandle {
       : interpreter_(interpreter), tensor_index_(tensor_index) {}
 
   TfLiteTensor* tensor() const { return interpreter_->tensor(tensor_index_); }
+  int index() const { return tensor_index_; }
 
  private:
   tflite::Interpreter* const interpreter_;
@@ -50,6 +51,15 @@ TfLiteTensor* GetTensorFromHandle(JNIEnv* env, jlong handle) {
   return reinterpret_cast<TensorHandle*>(handle)->tensor();
 }
 
+int GetTensorIndexFromHandle(JNIEnv* env, jlong handle) {
+  if (handle == 0) {
+    throwException(env, kIllegalArgumentException,
+                   "Internal error: Invalid handle to TfLiteTensor.");
+    return -1;
+  }
+  return reinterpret_cast<TensorHandle*>(handle)->index();
+}
+
 size_t ElementByteSize(TfLiteType data_type) {
   // The code in this file makes the assumption that the
   // TensorFlow TF_DataTypes and the Java primitive types
@@ -399,3 +409,20 @@ JNIEXPORT jint JNICALL Java_org_tensorflow_lite_Tensor_numBytes(JNIEnv* env,
   if (tensor == nullptr) return 0;
   return static_cast<jint>(tensor->bytes);
 }
+
+JNIEXPORT jboolean JNICALL
+Java_org_tensorflow_lite_Tensor_hasDelegateBufferHandle(JNIEnv* env,
+                                                        jclass clazz,
+                                                        jlong handle) {
+  const TfLiteTensor* tensor = GetTensorFromHandle(env, handle);
+  if (tensor == nullptr) return false;
+  return tensor->delegate && (tensor->buffer_handle != kTfLiteNullBufferHandle)
+             ? JNI_TRUE
+             : JNI_FALSE;
+}
+
+JNIEXPORT jint JNICALL Java_org_tensorflow_lite_Tensor_index(JNIEnv* env,
+                                                             jclass clazz,
+                                                             jlong handle) {
+  return GetTensorIndexFromHandle(env, handle);
+}
diff --git a/tensorflow/lite/java/src/main/native/tensor_jni.h b/tensorflow/lite/java/src/main/native/tensor_jni.h
index ec0442e93f6f9d8b7e90eb1cf6b6556abac0097b..a14f24a47d0861881870558a4d7b0cd5082d713a 100644
--- a/tensorflow/lite/java/src/main/native/tensor_jni.h
+++ b/tensorflow/lite/java/src/main/native/tensor_jni.h
@@ -84,6 +84,16 @@ JNIEXPORT jint JNICALL Java_org_tensorflow_lite_Tensor_numBytes(JNIEnv* env,
                                                                 jclass clazz,
                                                                 jlong handle);
 
+/*
+ *  Class:     org_tensorflow_lite_Tensor
+ *  Method:    hasDelegateBufferHandle
+ *  Signature: (J)Z
+ */
+JNIEXPORT jboolean JNICALL
+Java_org_tensorflow_lite_Tensor_hasDelegateBufferHandle(JNIEnv* env,
+                                                        jclass clazz,
+                                                        jlong handle);
+
 /*
  *  Class:     org_tensorflow_lite_Tensor
  *  Method:    readMultiDimensionalArray
@@ -106,6 +116,15 @@ Java_org_tensorflow_lite_Tensor_writeMultiDimensionalArray(JNIEnv* env,
                                                            jlong handle,
                                                            jobject src);
 
+/*
+ *  Class:     org_tensorflow_lite_Tensor
+ *  Method:    index
+ *  Signature: (J)I
+ */
+JNIEXPORT jint JNICALL Java_org_tensorflow_lite_Tensor_index(JNIEnv* env,
+                                                             jclass clazz,
+                                                             jlong handle);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java
index e635515de8cfdc2b4ed283adc8fc64803816258e..ff3325633c1d71a950682764b6d1576e3b75ed6a 100644
--- a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java
+++ b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java
@@ -21,6 +21,7 @@ import static org.junit.Assert.fail;
 import java.io.File;
 import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
+import java.nio.FloatBuffer;
 import java.nio.MappedByteBuffer;
 import java.nio.channels.FileChannel;
 import java.nio.file.Files;
@@ -169,11 +170,13 @@ public final class InterpreterTest {
   public void testRunForMultipleInputsOutputs() {
     Interpreter interpreter = new Interpreter(MULTIPLE_INPUTS_MODEL_FILE);
     assertThat(interpreter.getInputTensorCount()).isEqualTo(4);
+    assertThat(interpreter.getInputTensor(0).index()).isGreaterThan(-1);
     assertThat(interpreter.getInputTensor(0).dataType()).isEqualTo(DataType.FLOAT32);
     assertThat(interpreter.getInputTensor(1).dataType()).isEqualTo(DataType.FLOAT32);
     assertThat(interpreter.getInputTensor(2).dataType()).isEqualTo(DataType.FLOAT32);
     assertThat(interpreter.getInputTensor(3).dataType()).isEqualTo(DataType.FLOAT32);
     assertThat(interpreter.getOutputTensorCount()).isEqualTo(2);
+    assertThat(interpreter.getOutputTensor(0).index()).isGreaterThan(-1);
     assertThat(interpreter.getOutputTensor(0).dataType()).isEqualTo(DataType.FLOAT32);
     assertThat(interpreter.getOutputTensor(1).dataType()).isEqualTo(DataType.FLOAT32);
 
@@ -245,6 +248,18 @@ public final class InterpreterTest {
     interpreter.close();
   }
 
+  @Test
+  public void testRunWithUnsupportedInputType() {
+    FloatBuffer floatBuffer = FloatBuffer.allocate(10);
+    float[][][][] parsedOutputs = new float[2][8][8][3];
+    try (Interpreter interpreter = new Interpreter(MODEL_FILE)) {
+      interpreter.run(floatBuffer, parsedOutputs);
+      fail();
+    } catch (IllegalArgumentException e) {
+      assertThat(e).hasMessageThat().contains("DataType error: cannot resolve DataType of");
+    }
+  }
+
   @Test
   public void testRunWithWrongOutputType() {
     Interpreter interpreter = new Interpreter(MODEL_FILE);
@@ -332,6 +347,30 @@ public final class InterpreterTest {
     interpreter.close();
   }
 
+  @Test
+  public void testNullInputs() throws Exception {
+    Interpreter interpreter = new Interpreter(MODEL_FILE);
+    try {
+      interpreter.run(null, new float[2][8][8][3]);
+      fail();
+    } catch (IllegalArgumentException e) {
+      // Expected failure.
+    }
+    interpreter.close();
+  }
+
+  @Test
+  public void testNullOutputs() throws Exception {
+    Interpreter interpreter = new Interpreter(MODEL_FILE);
+    try {
+      interpreter.run(new float[2][8][8][3], null);
+      fail();
+    } catch (IllegalArgumentException e) {
+      // Expected failure.
+    }
+    interpreter.close();
+  }
+
   /** Smoke test validating that flex model loading fails when the flex delegate is not linked. */
   @Test
   public void testFlexModel() throws Exception {
@@ -361,10 +400,57 @@ public final class InterpreterTest {
     float[][] twoD = {oneD, oneD, oneD, oneD, oneD, oneD, oneD, oneD};
     float[][][] threeD = {twoD, twoD, twoD, twoD, twoD, twoD, twoD, twoD};
     float[][][][] fourD = {threeD, threeD};
-    float[] output = new float[1];
-    interpreter.run(fourD, output);
-    float[] expected = {7.0f};
-    assertThat(output).usingTolerance(0.1f).containsExactly(expected).inOrder();
+    float[][][][] parsedOutputs = new float[2][8][8][3];
+    interpreter.run(fourD, parsedOutputs);
+    float[] outputOneD = parsedOutputs[0][0][0];
+    float[] expected = {7.0f, 7.0f, 7.0f};
+    assertThat(outputOneD).usingTolerance(0.1f).containsExactly(expected).inOrder();
+
+    interpreter.close();
+  }
+
+  @Test
+  public void testNullInputsAndOutputsWithDelegate() throws Exception {
+    System.loadLibrary("tensorflowlite_test_jni");
+    Delegate delegate =
+        new Delegate() {
+          @Override
+          public long getNativeHandle() {
+            return getNativeHandleForDelegate();
+          }
+        };
+    Interpreter interpreter =
+        new Interpreter(MODEL_FILE, new Interpreter.Options().addDelegate(delegate));
+    // The delegate installs a custom buffer handle for all tensors, in turn allowing null to be
+    // provided for the inputs/outputs (as the client can reference the buffer directly).
+    interpreter.run(new float[2][8][8][3], null);
+    interpreter.run(null, new float[2][8][8][3]);
+    interpreter.close();
+  }
+
+  @Test
+  public void testModifyGraphWithDelegate() throws Exception {
+    System.loadLibrary("tensorflowlite_test_jni");
+    Delegate delegate =
+        new Delegate() {
+          @Override
+          public long getNativeHandle() {
+            return getNativeHandleForDelegate();
+          }
+        };
+    Interpreter interpreter = new Interpreter(MODEL_FILE);
+    interpreter.modifyGraphWithDelegate(delegate);
+
+    // The native delegate stubs out the graph with a single op that produces the scalar value 7.
+    float[] oneD = {1.23f, 6.54f, 7.81f};
+    float[][] twoD = {oneD, oneD, oneD, oneD, oneD, oneD, oneD, oneD};
+    float[][][] threeD = {twoD, twoD, twoD, twoD, twoD, twoD, twoD, twoD};
+    float[][][][] fourD = {threeD, threeD};
+    float[][][][] parsedOutputs = new float[2][8][8][3];
+    interpreter.run(fourD, parsedOutputs);
+    float[] outputOneD = parsedOutputs[0][0][0];
+    float[] expected = {7.0f, 7.0f, 7.0f};
+    assertThat(outputOneD).usingTolerance(0.1f).containsExactly(expected).inOrder();
 
     interpreter.close();
   }
diff --git a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/TensorTest.java b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/TensorTest.java
index 35ff4328b83e3b6bfc83c2bedf3f20c4ebed9b89..d9b20510106909d53b9024986a4daa88fc355177 100644
--- a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/TensorTest.java
+++ b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/TensorTest.java
@@ -50,6 +50,7 @@ public final class TensorTest {
     outputs.put(0, new float[2][8][8][3]);
     wrapper.run(inputs, outputs);
     tensor = wrapper.getOutputTensor(0);
+    assertThat(tensor.index()).isGreaterThan(-1);
   }
 
   @After
@@ -77,6 +78,16 @@ public final class TensorTest {
     assertThat(outputOneD).usingTolerance(0.1f).containsExactly(expected).inOrder();
   }
 
+  @Test
+  public void testCopyToNull() {
+    try {
+      tensor.copyTo(null);
+      fail();
+    } catch (IllegalArgumentException e) {
+      // Success.
+    }
+  }
+
   @Test
   public void testCopyToByteBuffer() {
     ByteBuffer parsedOutput =
@@ -149,6 +160,16 @@ public final class TensorTest {
     assertThat(output[0][0][0][0]).isEqualTo(3.0f);
   }
 
+  @Test
+  public void testSetToNull() {
+    try {
+      tensor.setTo(null);
+      fail();
+    } catch (IllegalArgumentException e) {
+      // Success.
+    }
+  }
+
   @Test
   public void testSetToInvalidByteBuffer() {
     ByteBuffer input = ByteBuffer.allocateDirect(3 * 4).order(ByteOrder.nativeOrder());
diff --git a/tensorflow/lite/java/src/test/native/BUILD b/tensorflow/lite/java/src/test/native/BUILD
index 481aea7ecd5dd8f9c26307e3b00992e21e6c2501..994f2389b46c32c70c353afec1c7c2bb427f99cb 100644
--- a/tensorflow/lite/java/src/test/native/BUILD
+++ b/tensorflow/lite/java/src/test/native/BUILD
@@ -16,6 +16,7 @@ cc_library(
     deps = [
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/java/jni",
+        "//tensorflow/lite/kernels:kernel_util",
     ],
 )
 
diff --git a/tensorflow/lite/java/src/test/native/interpreter_test_jni.cc b/tensorflow/lite/java/src/test/native/interpreter_test_jni.cc
index 1a0072a7c67b418975625aefff3a4dd84b4e6bf9..f5bcc1249f0c2d6f624e7f9f4ae40ec912e3c401 100644
--- a/tensorflow/lite/java/src/test/native/interpreter_test_jni.cc
+++ b/tensorflow/lite/java/src/test/native/interpreter_test_jni.cc
@@ -14,7 +14,9 @@ limitations under the License.
 ==============================================================================*/
 
 #include <jni.h>
+#include <algorithm>
 #include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -23,22 +25,23 @@ extern "C" {
 JNIEXPORT jlong JNICALL
 Java_org_tensorflow_lite_InterpreterTest_getNativeHandleForDelegate(
     JNIEnv* env, jclass clazz) {
-  // A simple op which outputs a vector of length 1 with the value [7].
+  // A simple op which outputs a tensor with values of 7.
   static TfLiteRegistration registration = {
       .init = nullptr,
       .free = nullptr,
       .prepare =
           [](TfLiteContext* context, TfLiteNode* node) {
+            TfLiteTensor* input = &context->tensors[node->inputs->data[0]];
             TfLiteTensor* output = &context->tensors[node->outputs->data[0]];
-            TfLiteIntArray* scalar_size = TfLiteIntArrayCreate(1);
-            scalar_size->data[0] = 1;
+            TfLiteIntArray* output_dims = TfLiteIntArrayCopy(input->dims);
             output->type = kTfLiteFloat32;
-            return context->ResizeTensor(context, output, scalar_size);
+            return context->ResizeTensor(context, output, output_dims);
           },
       .invoke =
           [](TfLiteContext* context, TfLiteNode* node) {
             TfLiteTensor* output = &context->tensors[node->outputs->data[0]];
-            output->data.f[0] = 7.0f;
+            std::fill(output->data.f,
+                      output->data.f + tflite::NumElements(output), 7.0f);
             return kTfLiteOk;
           },
       .profiling_string = nullptr,
@@ -46,8 +49,6 @@ Java_org_tensorflow_lite_InterpreterTest_getNativeHandleForDelegate(
       .custom_name = "",
       .version = 1,
   };
-  // A simple delegate which replaces all ops with a single op that outputs a
-  // vector of length 1 with the value [7].
   static TfLiteDelegate delegate = {
       .data_ = nullptr,
       .Prepare = [](TfLiteContext* context,
@@ -57,6 +58,11 @@ Java_org_tensorflow_lite_InterpreterTest_getNativeHandleForDelegate(
             context->GetExecutionPlan(context, &execution_plan));
         context->ReplaceNodeSubsetsWithDelegateKernels(
             context, registration, execution_plan, delegate);
+        // Now bind delegate buffer handles for all tensors.
+        for (size_t i = 0; i < context->tensors_size; ++i) {
+          context->tensors[i].delegate = delegate;
+          context->tensors[i].buffer_handle = static_cast<int>(i);
+        }
         return kTfLiteOk;
       },
       .CopyFromBufferHandle = nullptr,
diff --git a/tensorflow/lite/kernels/BUILD b/tensorflow/lite/kernels/BUILD
index bad1c4aebf1e9d9c7c6d33f87a6e7ea9cab8d700..c24b6ede630a11bedbd471281997de6624e3d9de 100644
--- a/tensorflow/lite/kernels/BUILD
+++ b/tensorflow/lite/kernels/BUILD
@@ -25,9 +25,6 @@ tf_cc_test(
     name = "optional_tensor_test",
     size = "small",
     srcs = ["optional_tensor_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -49,6 +46,7 @@ cc_library(
         "//tensorflow/lite:string_util",
         "//tensorflow/lite/kernels/internal:tensor_utils",
         "//tensorflow/lite/testing:util",
+        "//tensorflow/lite/tools/optimize:quantization_utils",
         "@com_google_googletest//:gtest",
     ],
 )
@@ -113,8 +111,8 @@ cc_library(
     ],
     deps = [
         "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/kernels/internal:quantization_util",
         "//tensorflow/lite/kernels/internal:round",
-        "//tensorflow/lite/kernels/internal:types",
     ],
 )
 
@@ -122,9 +120,6 @@ tf_cc_test(
     name = "kernel_util_test",
     size = "small",
     srcs = ["kernel_util_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":kernel_util",
         "//tensorflow/lite/testing:util",
@@ -136,9 +131,6 @@ tf_cc_test(
     name = "test_util_test",
     size = "small",
     srcs = ["test_util_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",  # TODO(b/117786830)
-    ],
     deps = [
         ":test_util",
         "//tensorflow/lite/testing:util",
@@ -160,6 +152,7 @@ cc_library(
     srcs = [
         "activations.cc",
         "add.cc",
+        "add_n.cc",
         "arg_min_max.cc",
         "audio_spectrogram.cc",
         "basic_rnn.cc",
@@ -167,6 +160,7 @@ cc_library(
         "bidirectional_sequence_lstm.cc",
         "bidirectional_sequence_rnn.cc",
         "cast.cc",
+        "ceil.cc",
         "comparisons.cc",
         "concatenation.cc",
         "conv.cc",
@@ -186,9 +180,10 @@ cc_library(
         "floor_mod.cc",
         "fully_connected.cc",
         "gather.cc",
+        "gather_nd.cc",
         "hashtable_lookup.cc",
+        "if.cc",
         "l2norm.cc",
-        "layer_norm_lstm.cc",
         "local_response_norm.cc",
         "logical.cc",
         "lsh_projection.cc",
@@ -204,18 +199,18 @@ cc_library(
         "pooling.cc",
         "pow.cc",
         "range.cc",
+        "rank.cc",
         "reduce.cc",
-        "relu1.cc",
         "reshape.cc",
         "resize_bilinear.cc",
         "resize_nearest_neighbor.cc",
+        "reverse.cc",
         "select.cc",
         "shape.cc",
         "skip_gram.cc",
         "slice.cc",
         "space_to_batch_nd.cc",
         "space_to_depth.cc",
-        "sparse_output_fully_connected.cc",
         "sparse_to_dense.cc",
         "split.cc",
         "split_v.cc",
@@ -230,7 +225,10 @@ cc_library(
         "transpose_conv.cc",
         "unidirectional_sequence_lstm.cc",
         "unidirectional_sequence_rnn.cc",
+        "unique.cc",
         "unpack.cc",
+        "where.cc",
+        "while.cc",
         "zeros_like.cc",
     ],
     hdrs = [
@@ -285,13 +283,24 @@ cc_library(
     ],
 )
 
+# The builtin_ops target will resolve to optimized kernels when available. This
+# target uses reference kernels only, and is useful for validation and testing.
+# It should *not* generally be used in production.
+cc_library(
+    name = "reference_ops",
+    srcs = ["register_ref.cc"],
+    hdrs = ["register_ref.h"],
+    deps = [
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:util",
+        "//tensorflow/lite/c:c_api_internal",
+    ],
+)
+
 tf_cc_test(
     name = "audio_spectrogram_test",
     size = "small",
     srcs = ["audio_spectrogram_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -305,9 +314,6 @@ tf_cc_test(
     name = "mfcc_test",
     size = "small",
     srcs = ["mfcc_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -321,25 +327,6 @@ tf_cc_test(
     name = "detection_postprocess_test",
     size = "small",
     srcs = ["detection_postprocess_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/lite:framework",
-        "//tensorflow/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-        "@flatbuffers",
-    ],
-)
-
-tf_cc_test(
-    name = "relu1_test",
-    size = "small",
-    srcs = ["relu1_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -350,26 +337,21 @@ tf_cc_test(
 )
 
 tf_cc_test(
-    name = "sparse_output_fully_connected_test",
+    name = "activations_test",
     size = "small",
-    srcs = ["sparse_output_fully_connected_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
+    srcs = ["activations_test.cc"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
         "//tensorflow/lite/kernels:test_util",
         "@com_google_googletest//:gtest",
-        "@flatbuffers",
     ],
 )
 
 tf_cc_test(
-    name = "activations_test",
+    name = "add_test",
     size = "small",
-    srcs = ["activations_test.cc"],
-    tags = ["tflite_not_portable_ios"],
+    srcs = ["add_test.cc"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -379,14 +361,13 @@ tf_cc_test(
 )
 
 tf_cc_test(
-    name = "add_test",
+    name = "add_n_test",
     size = "small",
-    srcs = ["add_test.cc"],
-    tags = ["tflite_not_portable_ios"],
+    srcs = ["add_n_test.cc"],
     deps = [
         ":builtin_ops",
+        ":test_util",
         "//tensorflow/lite:framework",
-        "//tensorflow/lite/kernels:test_util",
         "@com_google_googletest//:gtest",
     ],
 )
@@ -395,9 +376,6 @@ tf_cc_test(
     name = "arg_min_max_test",
     size = "small",
     srcs = ["arg_min_max_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -410,9 +388,6 @@ tf_cc_test(
     name = "div_test",
     size = "small",
     srcs = ["div_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -425,9 +400,6 @@ tf_cc_test(
     name = "sub_test",
     size = "small",
     srcs = ["sub_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -440,9 +412,6 @@ tf_cc_test(
     name = "transpose_test",
     size = "small",
     srcs = ["transpose_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -457,9 +426,6 @@ tf_cc_test(
     name = "space_to_batch_nd_test",
     size = "small",
     srcs = ["space_to_batch_nd_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -472,9 +438,6 @@ tf_cc_test(
     name = "batch_to_space_nd_test",
     size = "small",
     srcs = ["batch_to_space_nd_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -487,9 +450,6 @@ tf_cc_test(
     name = "cast_test",
     size = "small",
     srcs = ["cast_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -502,7 +462,6 @@ tf_cc_test(
     name = "concatenation_test",
     size = "small",
     srcs = ["concatenation_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -515,7 +474,6 @@ tf_cc_test(
     name = "conv_test",
     size = "small",
     srcs = ["conv_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -529,7 +487,6 @@ tf_cc_test(
     name = "depthwise_conv_test",
     size = "small",
     srcs = ["depthwise_conv_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -543,13 +500,11 @@ tf_cc_test(
     name = "dequantize_test",
     size = "small",
     srcs = ["dequantize_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
         "//tensorflow/lite/kernels:test_util",
+        "//tensorflow/lite/kernels/internal:types",
         "@com_google_absl//absl/memory",
         "@com_google_googletest//:gtest",
     ],
@@ -559,7 +514,6 @@ tf_cc_test(
     name = "basic_rnn_test",
     size = "small",
     srcs = ["basic_rnn_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -572,9 +526,6 @@ tf_cc_test(
     name = "bidirectional_sequence_lstm_test",
     size = "small",
     srcs = ["bidirectional_sequence_lstm_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -588,6 +539,18 @@ tf_cc_test(
     name = "floor_test",
     size = "small",
     srcs = ["floor_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "ceil_test",
+    size = "small",
+    srcs = ["ceil_test.cc"],
     tags = [
         "tflite_not_portable_ios",
     ],
@@ -603,9 +566,6 @@ tf_cc_test(
     name = "elementwise_test",
     size = "small",
     srcs = ["elementwise_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -618,9 +578,6 @@ tf_cc_test(
     name = "unidirectional_sequence_lstm_test",
     size = "small",
     srcs = ["unidirectional_sequence_lstm_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -633,9 +590,6 @@ tf_cc_test(
     name = "bidirectional_sequence_rnn_test",
     size = "small",
     srcs = ["bidirectional_sequence_rnn_test.cc"],
-    tags = [
-        "tflite_not_portable",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -648,9 +602,6 @@ tf_cc_test(
     name = "unidirectional_sequence_rnn_test",
     size = "small",
     srcs = ["unidirectional_sequence_rnn_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -663,7 +614,6 @@ tf_cc_test(
     name = "l2norm_test",
     size = "small",
     srcs = ["l2norm_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -676,9 +626,6 @@ tf_cc_test(
     name = "exp_test",
     size = "small",
     srcs = ["exp_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -691,9 +638,6 @@ tf_cc_test(
     name = "fake_quant_test",
     size = "small",
     srcs = ["fake_quant_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -706,9 +650,6 @@ tf_cc_test(
     name = "maximum_minimum_test",
     size = "small",
     srcs = ["maximum_minimum_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -721,9 +662,6 @@ tf_cc_test(
     name = "reduce_test",
     size = "small",
     srcs = ["reduce_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -736,7 +674,6 @@ tf_cc_test(
     name = "mul_test",
     size = "small",
     srcs = ["mul_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -749,9 +686,6 @@ tf_cc_test(
     name = "pad_test",
     size = "small",
     srcs = ["pad_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -764,7 +698,6 @@ tf_cc_test(
     name = "reshape_test",
     size = "small",
     srcs = ["reshape_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -777,9 +710,19 @@ tf_cc_test(
     name = "gather_test",
     size = "small",
     srcs = ["gather_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
     ],
+)
+
+tf_cc_test(
+    name = "gather_nd_test",
+    size = "small",
+    srcs = ["gather_nd_test.cc"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -793,9 +736,6 @@ tf_cc_test(
     name = "topk_v2_test",
     size = "small",
     srcs = ["topk_v2_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -809,7 +749,6 @@ tf_cc_test(
     name = "resize_bilinear_test",
     size = "small",
     srcs = ["resize_bilinear_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -822,7 +761,6 @@ tf_cc_test(
     name = "resize_nearest_neighbor_test",
     size = "small",
     srcs = ["resize_nearest_neighbor_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -835,7 +773,6 @@ tf_cc_test(
     name = "svdf_test",
     size = "small",
     srcs = ["svdf_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -848,7 +785,6 @@ tf_cc_test(
     name = "embedding_lookup_test",
     size = "small",
     srcs = ["embedding_lookup_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -861,7 +797,6 @@ tf_cc_test(
     name = "embedding_lookup_sparse_test",
     size = "small",
     srcs = ["embedding_lookup_sparse_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -874,7 +809,6 @@ tf_cc_test(
     name = "fully_connected_test",
     size = "small",
     srcs = ["fully_connected_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -889,7 +823,6 @@ tf_cc_test(
     name = "local_response_norm_test",
     size = "small",
     srcs = ["local_response_norm_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -902,7 +835,6 @@ tf_cc_test(
     name = "pooling_test",
     size = "small",
     srcs = ["pooling_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -915,7 +847,6 @@ tf_cc_test(
     name = "softmax_test",
     size = "small",
     srcs = ["softmax_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -929,9 +860,6 @@ tf_cc_test(
     name = "log_softmax_test",
     size = "small",
     srcs = ["log_softmax_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -945,7 +873,6 @@ tf_cc_test(
     name = "lsh_projection_test",
     size = "small",
     srcs = ["lsh_projection_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -958,7 +885,6 @@ tf_cc_test(
     name = "hashtable_lookup_test",
     size = "small",
     srcs = ["hashtable_lookup_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -968,25 +894,10 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
-    name = "layer_norm_lstm_test",
-    size = "small",
-    srcs = ["layer_norm_lstm_test.cc"],
-    tags = ["tflite_not_portable_ios"],
-    deps = [
-        ":builtin_ops",
-        "//tensorflow/lite:framework",
-        "//tensorflow/lite/kernels:test_util",
-        "@com_google_googletest//:gtest",
-        "@flatbuffers",
-    ],
-)
-
 tf_cc_test(
     name = "lstm_test",
     size = "small",
     srcs = ["lstm_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -999,7 +910,6 @@ tf_cc_test(
     name = "skip_gram_test",
     size = "small",
     srcs = ["skip_gram_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1013,7 +923,6 @@ tf_cc_test(
     name = "space_to_depth_test",
     size = "small",
     srcs = ["space_to_depth_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1026,9 +935,6 @@ tf_cc_test(
     name = "split_test",
     size = "small",
     srcs = ["split_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1041,9 +947,6 @@ tf_cc_test(
     name = "split_v_test",
     size = "small",
     srcs = ["split_v_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1056,9 +959,6 @@ tf_cc_test(
     name = "squeeze_test",
     size = "small",
     srcs = ["squeeze_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1071,9 +971,6 @@ tf_cc_test(
     name = "strided_slice_test",
     size = "small",
     srcs = ["strided_slice_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1086,9 +983,6 @@ tf_cc_test(
     name = "tile_test",
     size = "small",
     srcs = ["tile_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1104,9 +998,6 @@ tf_cc_test(
     srcs = [
         "comparisons_test.cc",
     ],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1119,9 +1010,6 @@ tf_cc_test(
     name = "neg_test",
     size = "small",
     srcs = ["neg_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1136,9 +1024,6 @@ tf_cc_test(
     srcs = [
         "select_test.cc",
     ],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1153,9 +1038,6 @@ tf_cc_test(
     srcs = [
         "slice_test.cc",
     ],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1168,9 +1050,6 @@ tf_cc_test(
     name = "transpose_conv_test",
     size = "small",
     srcs = ["transpose_conv_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1184,9 +1063,6 @@ tf_cc_test(
     name = "expand_dims_test",
     size = "small",
     srcs = ["expand_dims_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1200,9 +1076,6 @@ tf_cc_test(
     name = "sparse_to_dense_test",
     size = "small",
     srcs = ["sparse_to_dense_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1216,9 +1089,19 @@ tf_cc_test(
     name = "shape_test",
     size = "small",
     srcs = ["shape_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
     ],
+)
+
+tf_cc_test(
+    name = "rank_test",
+    size = "small",
+    srcs = ["rank_test.cc"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1232,9 +1115,6 @@ tf_cc_test(
     name = "pow_test",
     size = "small",
     srcs = ["pow_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1248,7 +1128,6 @@ tf_cc_test(
     name = "pack_test",
     size = "small",
     srcs = ["pack_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1262,7 +1141,6 @@ tf_cc_test(
     name = "one_hot_test",
     size = "small",
     srcs = ["one_hot_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1275,7 +1153,6 @@ tf_cc_test(
     name = "logical_test",
     size = "small",
     srcs = ["logical_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1289,7 +1166,6 @@ tf_cc_test(
     name = "unpack_test",
     size = "small",
     srcs = ["unpack_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:builtin_op_data",
@@ -1303,7 +1179,19 @@ tf_cc_test(
     name = "floor_div_test",
     size = "small",
     srcs = ["floor_div_test.cc"],
-    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:builtin_op_data",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "where_test",
+    size = "small",
+    srcs = ["where_test.cc"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:builtin_op_data",
@@ -1317,7 +1205,6 @@ tf_cc_test(
     name = "zeros_like_test",
     size = "small",
     srcs = ["zeros_like_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:builtin_op_data",
@@ -1331,7 +1218,6 @@ tf_cc_test(
     name = "floor_mod_test",
     size = "small",
     srcs = ["floor_mod_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:builtin_op_data",
@@ -1345,7 +1231,6 @@ tf_cc_test(
     name = "range_test",
     size = "small",
     srcs = ["range_test.cc"],
-    tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:builtin_op_data",
@@ -1359,12 +1244,46 @@ tf_cc_test(
     name = "squared_difference_test",
     size = "small",
     srcs = ["squared_difference_test.cc"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:builtin_op_data",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "if_test",
+    size = "small",
+    srcs = ["if_test.cc"],
     tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
+        ":kernel_util",
+        ":subgraph_test_util",
+        ":test_util",
+        "//tensorflow/lite:builtin_op_data",
+        "//tensorflow/lite:framework",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
+    ],
+)
+
+tf_cc_test(
+    name = "while_test",
+    size = "small",
+    srcs = ["while_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        ":kernel_util",
+        ":subgraph_test_util",
+        "//tensorflow/lite:builtin_op_data",
         "//tensorflow/lite:framework",
         "//tensorflow/lite/kernels:test_util",
         "@com_google_googletest//:gtest",
+        "@flatbuffers",
     ],
 )
 
@@ -1372,7 +1291,29 @@ tf_cc_test(
     name = "fill_test",
     size = "small",
     srcs = ["fill_test.cc"],
-    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "unique_test",
+    srcs = ["unique_test.cc"],
+    deps = [
+        ":builtin_ops",
+        ":test_util",
+        "//tensorflow/lite:framework",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+tf_cc_test(
+    name = "reverse_test",
+    size = "small",
+    srcs = ["reverse_test.cc"],
     deps = [
         ":builtin_ops",
         "//tensorflow/lite:framework",
@@ -1405,3 +1346,31 @@ tf_cc_test(
         "@com_google_googletest//:gtest_main",
     ],
 )
+
+cc_library(
+    name = "subgraph_test_util",
+    testonly = 1,
+    srcs = ["subgraph_test_util.cc"],
+    hdrs = ["subgraph_test_util.h"],
+    deps = [
+        ":builtin_ops",
+        ":kernel_util",
+        ":test_util",
+        "//tensorflow/lite:builtin_op_data",
+        "//tensorflow/lite:framework",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
+    ],
+)
+
+tf_cc_test(
+    name = "subgraph_test_util_test",
+    size = "small",
+    srcs = ["subgraph_test_util_test.cc"],
+    deps = [
+        ":subgraph_test_util",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
diff --git a/tensorflow/lite/kernels/activations.cc b/tensorflow/lite/kernels/activations.cc
index a76654256044702736a2855d4bb12d445c90be55..930eabaeccfde5c9fce824a58d28d14783dde419 100644
--- a/tensorflow/lite/kernels/activations.cc
+++ b/tensorflow/lite/kernels/activations.cc
@@ -23,6 +23,10 @@ limitations under the License.
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/log_softmax.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/softmax.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h"
 #include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
 #include "tensorflow/lite/kernels/internal/tensor.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
@@ -33,6 +37,11 @@ namespace ops {
 namespace builtin {
 namespace activations {
 
+enum KernelType {
+  kReference,
+  kGenericOptimized,
+};
+
 struct OpData {
   int32_t input_multiplier = 0;
   int input_left_shift = 0;
@@ -50,6 +59,20 @@ struct PreluOpData : public OpData {
   int output_shift = 0;
 };
 
+namespace {
+TfLiteStatus CheckOutputQuantParams(TfLiteContext* context,
+                                    const TfLiteTensor* input,
+                                    const TfLiteTensor* output) {
+  TF_LITE_ENSURE(context, output->params.scale == 1. / 256);
+  if (input->type == kTfLiteUInt8) {
+    TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
+  } else {
+    TF_LITE_ENSURE_EQ(context, output->params.zero_point, -128);
+  }
+  return kTfLiteOk;
+}
+}  // namespace
+
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
   // This is a builtin op, so we don't use the contents in 'buffer', if any.
   // Instead, we allocate a new object to carry information from Prepare() to
@@ -98,7 +121,7 @@ TfLiteStatus TanhPrepare(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* output = GetOutput(context, node, 0);
   TF_LITE_ENSURE_EQ(context, input->type, output->type);
 
-  if (input->type == kTfLiteUInt8) {
+  if (input->type == kTfLiteUInt8 || input->type == kTfLiteInt8) {
     static constexpr int kInputIntegerBits = 4;
 
     const double input_real_multiplier =
@@ -157,8 +180,15 @@ TfLiteStatus SigmoidPrepare(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* output = GetOutput(context, node, 0);
   TF_LITE_ENSURE_EQ(context, input->type, output->type);
 
-  if (input->type == kTfLiteUInt8) {
-    TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
+  if (input->type == kTfLiteUInt8 || input->type == kTfLiteInt8) {
+    if (input->type == kTfLiteUInt8) {
+      TF_LITE_ENSURE_EQ(context, output->params.zero_point,
+                        std::numeric_limits<uint8_t>::min());
+    }
+    if (input->type == kTfLiteInt8) {
+      TF_LITE_ENSURE_EQ(context, output->params.zero_point,
+                        std::numeric_limits<int8_t>::min());
+    }
     TF_LITE_ENSURE(context, output->params.scale == 1. / 256);
 
     static constexpr int kInputIntegerBits = 4;
@@ -215,12 +245,12 @@ TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) {
   const int num_dims = NumDimensions(input);
   TF_LITE_ENSURE(context, num_dims >= 1 && num_dims <= 4);
 
-  if (input->type == kTfLiteUInt8) {
-    TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
-    TF_LITE_ENSURE(context, output->params.scale == 1. / 256);
+  if (input->type == kTfLiteUInt8 || input->type == kTfLiteInt8) {
+    if (CheckOutputQuantParams(context, input, output) == kTfLiteError) {
+      return kTfLiteError;
+    }
 
     static const int kScaledDiffIntegerBits = 5;
-
     tflite::PreprocessSoftmaxScaling(
         params->beta, input->params.scale, kScaledDiffIntegerBits,
         &data->input_multiplier, &data->input_left_shift);
@@ -241,8 +271,13 @@ TfLiteStatus LogSoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* output = GetOutput(context, node, 0);
   TF_LITE_ENSURE_EQ(context, input->type, output->type);
 
-  if (input->type == kTfLiteUInt8) {
-    TF_LITE_ENSURE_EQ(context, output->params.zero_point, 255);
+  if (input->type == kTfLiteUInt8 || input->type == kTfLiteInt8) {
+    if (input->type == kTfLiteUInt8) {
+      TF_LITE_ENSURE_EQ(context, output->params.zero_point, 255);
+    }
+    if (input->type == kTfLiteInt8) {
+      TF_LITE_ENSURE_EQ(context, output->params.zero_point, 127);
+    }
     TF_LITE_ENSURE_EQ(context, output->params.scale, 16.0 / 256);
 
     static const double kBeta = 1.0;
@@ -333,6 +368,24 @@ TfLiteStatus Relu1Eval(TfLiteContext* context, TfLiteNode* node) {
   }
 }
 
+namespace {
+template <typename T>
+void QuantizedRelu6(const TfLiteTensor* input, TfLiteTensor* output) {
+  ActivationParams params;
+  params.activation_type = FusedActivationFunctionType::kRelu6;
+  params.quantized_activation_min =
+      std::max(static_cast<int32_t>(std::numeric_limits<T>::min()),
+               output->params.zero_point +
+                   static_cast<int32>(roundf(0.f / output->params.scale)));
+  params.quantized_activation_max =
+      std::min(static_cast<int32_t>(std::numeric_limits<T>::max()),
+               output->params.zero_point +
+                   static_cast<int32>(roundf(6.f / output->params.scale)));
+  optimized_ops::ReluX(params, GetTensorShape(input), GetTensorData<T>(input),
+                       GetTensorShape(output), GetTensorData<T>(output));
+}
+}  // namespace
+
 TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* input = GetInput(context, node, 0);
   TfLiteTensor* output = GetOutput(context, node, 0);
@@ -345,47 +398,51 @@ TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) {
       for (; in < in_end; in++, out++) *out = std::min(std::max(0.f, *in), 6.f);
       return kTfLiteOk;
     } break;
-    case kTfLiteUInt8: {
-      ActivationParams params;
-      params.activation_type = FusedActivationFunctionType::kRelu6;
-      params.quantized_activation_min = std::max(
-          0, output->params.zero_point +
-                 static_cast<int32>(roundf(0.f / output->params.scale)));
-      params.quantized_activation_max = std::min(
-          255, output->params.zero_point +
-                   static_cast<int32>(roundf(6.f / output->params.scale)));
-      optimized_ops::ReluX(params, GetTensorShape(input),
-                           GetTensorData<uint8>(input), GetTensorShape(output),
-                           GetTensorData<uint8>(output));
+    case kTfLiteUInt8:
+      QuantizedRelu6<uint8_t>(input, output);
+      return kTfLiteOk;
+    case kTfLiteInt8: {
+      QuantizedRelu6<int8_t>(input, output);
       return kTfLiteOk;
     } break;
     default:
       context->ReportError(
-          context, "Only float32 and uint8 supported currently, got %s.",
+          context, "Only float32, uint8 and int8 supported currently, got %s.",
           TfLiteTypeGetName(input->type));
       return kTfLiteError;
   }
 }
 
+template <KernelType kernel_type>
 TfLiteStatus TanhEval(TfLiteContext* context, TfLiteNode* node) {
   OpData* data = reinterpret_cast<OpData*>(node->user_data);
   const TfLiteTensor* input = GetInput(context, node, 0);
   TfLiteTensor* output = GetOutput(context, node, 0);
   switch (input->type) {
     case kTfLiteFloat32: {
-      size_t elements = input->bytes / sizeof(float);
-      float* in = input->data.f;
-      float* in_end = in + elements;
-      float* out = output->data.f;
-      for (; in < in_end; in++, out++) *out = std::tanh(*in);
+      if (kernel_type == kGenericOptimized) {
+        optimized_ops::Tanh(GetTensorShape(input), GetTensorData<float>(input),
+                            GetTensorShape(output),
+                            GetTensorData<float>(output));
+      } else {
+        reference_ops::Tanh(GetTensorShape(input), GetTensorData<float>(input),
+                            GetTensorShape(output),
+                            GetTensorData<float>(output));
+      }
       return kTfLiteOk;
     } break;
     case kTfLiteInt16: {
       TanhParams params;
       params.input_left_shift = data->input_left_shift;
-      optimized_ops::Tanh(params, GetTensorShape(input),
-                          GetTensorData<int16_t>(input), GetTensorShape(output),
-                          GetTensorData<int16_t>(output));
+      if (kernel_type == kGenericOptimized) {
+        optimized_ops::Tanh(
+            params, GetTensorShape(input), GetTensorData<int16_t>(input),
+            GetTensorShape(output), GetTensorData<int16_t>(output));
+      } else {
+        reference_ops::Tanh(
+            params, GetTensorShape(input), GetTensorData<int16_t>(input),
+            GetTensorShape(output), GetTensorData<int16_t>(output));
+      }
       return kTfLiteOk;
     } break;
     case kTfLiteUInt8: {
@@ -394,9 +451,25 @@ TfLiteStatus TanhEval(TfLiteContext* context, TfLiteNode* node) {
       params.input_range_radius = data->input_range_radius;
       params.input_multiplier = data->input_multiplier;
       params.input_left_shift = data->input_left_shift;
-      optimized_ops::Tanh(params, GetTensorShape(input),
-                          GetTensorData<uint8_t>(input), GetTensorShape(output),
-                          GetTensorData<uint8_t>(output));
+      if (kernel_type == kGenericOptimized) {
+        optimized_ops::Tanh(
+            params, GetTensorShape(input), GetTensorData<uint8_t>(input),
+            GetTensorShape(output), GetTensorData<uint8_t>(output));
+      } else {
+        reference_ops::Tanh(
+            params, GetTensorShape(input), GetTensorData<uint8_t>(input),
+            GetTensorShape(output), GetTensorData<uint8_t>(output));
+      }
+      return kTfLiteOk;
+    } break;
+    case kTfLiteInt8: {
+      const auto input_shape = GetTensorShape(input);
+      const auto output_shape = GetTensorShape(output);
+      const int size = MatchingFlatSize(input_shape, output_shape);
+      reference_integer_ops::Tanh(
+          input->params.zero_point, data->input_range_radius,
+          data->input_multiplier, data->input_left_shift, size,
+          GetTensorData<int8_t>(input), GetTensorData<int8_t>(output));
       return kTfLiteOk;
     } break;
     default:
@@ -407,6 +480,7 @@ TfLiteStatus TanhEval(TfLiteContext* context, TfLiteNode* node) {
 }
 
 // Sigmoid is also know as "Logistic".
+template <KernelType kernel_type>
 TfLiteStatus SigmoidEval(TfLiteContext* context, TfLiteNode* node) {
   OpData* data = reinterpret_cast<OpData*>(node->user_data);
 
@@ -414,18 +488,28 @@ TfLiteStatus SigmoidEval(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* output = GetOutput(context, node, 0);
   switch (input->type) {
     case kTfLiteFloat32: {
-      size_t elements = input->bytes / sizeof(float);
-      float* in = input->data.f;
-      float* in_end = in + elements;
-      float* out = output->data.f;
-      for (; in < in_end; in++, out++) *out = 1.f / (1.f + std::exp(-*in));
+      if (kernel_type == kGenericOptimized) {
+        optimized_ops::Logistic(
+            GetTensorShape(input), GetTensorData<float>(input),
+            GetTensorShape(output), GetTensorData<float>(output));
+      } else {
+        reference_ops::Logistic(
+            GetTensorShape(input), GetTensorData<float>(input),
+            GetTensorShape(output), GetTensorData<float>(output));
+      }
       break;
     }
     case kTfLiteInt16: {
       LogisticParams params;
-      optimized_ops::Logistic(
-          params, GetTensorShape(input), GetTensorData<int16_t>(input),
-          GetTensorShape(output), GetTensorData<int16_t>(output));
+      if (kernel_type == kGenericOptimized) {
+        optimized_ops::Logistic(
+            params, GetTensorShape(input), GetTensorData<int16_t>(input),
+            GetTensorShape(output), GetTensorData<int16_t>(output));
+      } else {
+        reference_ops::Logistic(
+            params, GetTensorShape(input), GetTensorData<int16_t>(input),
+            GetTensorShape(output), GetTensorData<int16_t>(output));
+      }
       break;
     }
     case kTfLiteUInt8: {
@@ -434,9 +518,24 @@ TfLiteStatus SigmoidEval(TfLiteContext* context, TfLiteNode* node) {
       params.input_range_radius = data->input_range_radius;
       params.input_multiplier = data->input_multiplier;
       params.input_left_shift = data->input_left_shift;
-      optimized_ops::Logistic(
-          params, GetTensorShape(input), GetTensorData<uint8_t>(input),
-          GetTensorShape(output), GetTensorData<uint8_t>(output));
+      if (kernel_type == kGenericOptimized) {
+        optimized_ops::Logistic(
+            params, GetTensorShape(input), GetTensorData<uint8_t>(input),
+            GetTensorShape(output), GetTensorData<uint8_t>(output));
+      } else {
+        reference_ops::Logistic(
+            params, GetTensorShape(input), GetTensorData<uint8_t>(input),
+            GetTensorShape(output), GetTensorData<uint8_t>(output));
+      }
+      break;
+    }
+    case kTfLiteInt8: {
+      const int input_size =
+          MatchingFlatSize(GetTensorShape(input), GetTensorShape(output));
+      reference_integer_ops::Logistic(
+          input->params.zero_point, data->input_range_radius,
+          data->input_multiplier, data->input_left_shift, input_size,
+          GetTensorData<int8_t>(input), GetTensorData<int8_t>(output));
       break;
     }
     default:
@@ -508,8 +607,8 @@ void Softmax3DFloat(const TfLiteTensor* input, TfLiteTensor* output,
       GetTensorData<float>(output));
 }
 
-void Softmax1DQuantized(const TfLiteTensor* input, TfLiteTensor* output,
-                        TfLiteSoftmaxParams* params, OpData* data) {
+void Softmax1DQuantizedUint8(const TfLiteTensor* input, TfLiteTensor* output,
+                             TfLiteSoftmaxParams* params, OpData* data) {
   // TODO(ahentz): this is arguably a dirty trick. Since the implementation
   // always traverses the last dimension of a 4D tensor, we will pretend our 1D
   // tensor is 4D in a special way. We will convert a (Y) shape into a (1,
@@ -524,8 +623,8 @@ void Softmax1DQuantized(const TfLiteTensor* input, TfLiteTensor* output,
                          GetTensorShape({1, 1, 1, input_size}),
                          GetTensorData<uint8_t>(output));
 }
-void Softmax2DQuantized(const TfLiteTensor* input, TfLiteTensor* output,
-                        TfLiteSoftmaxParams* params, OpData* data) {
+void Softmax2DQuantizedUint8(const TfLiteTensor* input, TfLiteTensor* output,
+                             TfLiteSoftmaxParams* params, OpData* data) {
   // TODO(ahentz): this is arguably a dirty trick. Since the implementation
   // always traverses the last dimension of a 4D tensor, we will pretend our 2D
   // tensor is 4D in a special way. We will convert a (X, Y) shape into a (X,
@@ -543,8 +642,8 @@ void Softmax2DQuantized(const TfLiteTensor* input, TfLiteTensor* output,
                          GetTensorData<uint8_t>(output));
 }
 
-void Softmax3DQuantized(const TfLiteTensor* input, TfLiteTensor* output,
-                        TfLiteSoftmaxParams* params, OpData* data) {
+void Softmax3DQuantizedUint8(const TfLiteTensor* input, TfLiteTensor* output,
+                             TfLiteSoftmaxParams* params, OpData* data) {
   const int batch_size = input->dims->data[0];
   const int intermediate_size = input->dims->data[1];
   const int input_size = input->dims->data[2];
@@ -569,8 +668,8 @@ void Softmax4DFloat(const TfLiteTensor* input, TfLiteTensor* output,
                          GetTensorData<float>(output));
 }
 
-void Softmax4DQuantized(const TfLiteTensor* input, TfLiteTensor* output,
-                        TfLiteSoftmaxParams* params, OpData* data) {
+void Softmax4DQuantizedUint8(const TfLiteTensor* input, TfLiteTensor* output,
+                             TfLiteSoftmaxParams* params, OpData* data) {
   SoftmaxParams op_params;
   op_params.input_multiplier = data->input_multiplier;
   op_params.input_left_shift = data->input_left_shift;
@@ -580,6 +679,63 @@ void Softmax4DQuantized(const TfLiteTensor* input, TfLiteTensor* output,
                          GetTensorData<uint8_t>(output));
 }
 
+// TODO(jianlijianli): Try merging Softmax<n>DQuantizedInt8 with
+// Softmax<n>DQuantized, which needs a larger refactor.
+void Softmax1DQuantizedInt8(const TfLiteTensor* input, TfLiteTensor* output,
+                            TfLiteSoftmaxParams* params, OpData* data) {
+  const int input_size = input->dims->data[0];
+  SoftmaxParams op_params;
+  op_params.input_multiplier = data->input_multiplier;
+  op_params.input_left_shift = data->input_left_shift;
+  op_params.diff_min = data->diff_min;
+  reference_integer_ops::Softmax(
+      op_params, GetTensorShape({1, 1, 1, input_size}),
+      GetTensorData<int8_t>(input), GetTensorShape({1, 1, 1, input_size}),
+      GetTensorData<int8_t>(output));
+}
+
+void Softmax2DQuantizedInt8(const TfLiteTensor* input, TfLiteTensor* output,
+                            TfLiteSoftmaxParams* params, OpData* data) {
+  const int batch_size = input->dims->data[0];
+  const int input_size = input->dims->data[1];
+  SoftmaxParams op_params;
+  op_params.input_multiplier = data->input_multiplier;
+  op_params.input_left_shift = data->input_left_shift;
+  op_params.diff_min = data->diff_min;
+  reference_integer_ops::Softmax(op_params,
+                                 GetTensorShape({batch_size, 1, 1, input_size}),
+                                 GetTensorData<int8_t>(input),
+                                 GetTensorShape({batch_size, 1, 1, input_size}),
+                                 GetTensorData<int8_t>(output));
+}
+
+void Softmax3DQuantizedInt8(const TfLiteTensor* input, TfLiteTensor* output,
+                            TfLiteSoftmaxParams* params, OpData* data) {
+  const int batch_size = input->dims->data[0];
+  const int intermediate_size = input->dims->data[1];
+  const int input_size = input->dims->data[2];
+  SoftmaxParams op_params;
+  op_params.input_multiplier = data->input_multiplier;
+  op_params.input_left_shift = data->input_left_shift;
+  op_params.diff_min = data->diff_min;
+  reference_integer_ops::Softmax(
+      op_params, GetTensorShape({batch_size, intermediate_size, 1, input_size}),
+      GetTensorData<int8_t>(input),
+      GetTensorShape({batch_size, intermediate_size, 1, input_size}),
+      GetTensorData<int8_t>(output));
+}
+
+void Softmax4DQuantizedInt8(const TfLiteTensor* input, TfLiteTensor* output,
+                            TfLiteSoftmaxParams* params, OpData* data) {
+  SoftmaxParams op_params;
+  op_params.input_multiplier = data->input_multiplier;
+  op_params.input_left_shift = data->input_left_shift;
+  op_params.diff_min = data->diff_min;
+  reference_integer_ops::Softmax(
+      op_params, GetTensorShape(input), GetTensorData<int8_t>(input),
+      GetTensorShape(output), GetTensorData<int8_t>(output));
+}
+
 TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
   auto* params = reinterpret_cast<TfLiteSoftmaxParams*>(node->builtin_data);
   OpData* data = reinterpret_cast<OpData*>(node->user_data);
@@ -614,19 +770,19 @@ TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
     }
     case kTfLiteUInt8: {
       if (NumDimensions(input) == 1) {
-        Softmax1DQuantized(input, output, params, data);
+        Softmax1DQuantizedUint8(input, output, params, data);
         return kTfLiteOk;
       }
       if (NumDimensions(input) == 2) {
-        Softmax2DQuantized(input, output, params, data);
+        Softmax2DQuantizedUint8(input, output, params, data);
         return kTfLiteOk;
       }
       if (NumDimensions(input) == 3) {
-        Softmax3DQuantized(input, output, params, data);
+        Softmax3DQuantizedUint8(input, output, params, data);
         return kTfLiteOk;
       }
       if (NumDimensions(input) == 4) {
-        Softmax4DQuantized(input, output, params, data);
+        Softmax4DQuantizedUint8(input, output, params, data);
         return kTfLiteOk;
       }
       context->ReportError(
@@ -634,6 +790,30 @@ TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
           NumDimensions(input));
       return kTfLiteError;
     }
+    case kTfLiteInt8: {
+      if (NumDimensions(input) == 1) {
+        Softmax1DQuantizedInt8(input, output, params, data);
+        return kTfLiteOk;
+      }
+      if (NumDimensions(input) == 2) {
+        Softmax2DQuantizedInt8(input, output, params, data);
+        return kTfLiteOk;
+      }
+      if (NumDimensions(input) == 3) {
+        Softmax3DQuantizedInt8(input, output, params, data);
+        return kTfLiteOk;
+      }
+      if (NumDimensions(input) == 4) {
+        Softmax4DQuantizedInt8(input, output, params, data);
+        return kTfLiteOk;
+      }
+      context->ReportError(
+          context,
+          "Only 4D tensors supported currently for Int8 kernels, got %dD.",
+          NumDimensions(input));
+      return kTfLiteError;
+    }
+
     default:
       context->ReportError(
           context, "Only float32 and uint8_t supported currently, got %s.",
@@ -642,6 +822,7 @@ TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
   }
 }
 
+template <KernelType kernel_type>
 TfLiteStatus LogSoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
   const LogSoftmaxOpData* data =
       reinterpret_cast<LogSoftmaxOpData*>(node->user_data);
@@ -650,9 +831,15 @@ TfLiteStatus LogSoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
   switch (input->type) {
     case kTfLiteFloat32: {
       SoftmaxParams op_params;
-      optimized_ops::LogSoftmax(
-          op_params, GetTensorShape(input), GetTensorData<float>(input),
-          GetTensorShape(output), GetTensorData<float>(output));
+      if (kernel_type == kGenericOptimized) {
+        optimized_ops::LogSoftmax(
+            op_params, GetTensorShape(input), GetTensorData<float>(input),
+            GetTensorShape(output), GetTensorData<float>(output));
+      } else {
+        reference_ops::LogSoftmax(
+            op_params, GetTensorShape(input), GetTensorData<float>(input),
+            GetTensorShape(output), GetTensorData<float>(output));
+      }
       return kTfLiteOk;
     }
     case kTfLiteUInt8: {
@@ -662,9 +849,30 @@ TfLiteStatus LogSoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
       op_params.reverse_scaling_divisor = data->reverse_scaling_divisor;
       op_params.reverse_scaling_right_shift = data->reverse_scaling_right_shift;
       op_params.diff_min = data->diff_min;
-      optimized_ops::LogSoftmax(
-          op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
-          GetTensorShape(output), GetTensorData<uint8_t>(output));
+      if (kernel_type == kGenericOptimized) {
+        optimized_ops::LogSoftmax(
+            op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
+            GetTensorShape(output), GetTensorData<uint8_t>(output));
+      } else {
+        reference_ops::LogSoftmax(
+            op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
+            GetTensorShape(output), GetTensorData<uint8_t>(output));
+      }
+      return kTfLiteOk;
+    }
+    case kTfLiteInt8: {
+      const auto input_shape = GetTensorShape(input);
+      const auto output_shape = GetTensorShape(output);
+      const int trailing_dim = input_shape.DimensionsCount() - 1;
+      const int outer_size =
+          MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+      const int depth =
+          MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+      reference_integer_ops::LogSoftmax(
+          data->input_multiplier, data->input_left_shift,
+          data->reverse_scaling_divisor, data->reverse_scaling_right_shift,
+          data->diff_min, outer_size, depth, GetTensorData<int8_t>(input),
+          GetTensorData<int8_t>(output));
       return kTfLiteOk;
     }
     default:
@@ -736,8 +944,31 @@ TfLiteStatus LeakyReluEval(TfLiteContext* context, TfLiteNode* node) {
   }
 }
 
+TfLiteStatus EluEval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input = GetInput(context, node, 0);
+  TfLiteTensor* output = GetOutput(context, node, 0);
+  switch (input->type) {
+    case kTfLiteFloat32: {
+      optimized_ops::Elu(GetTensorShape(input), GetTensorData<float>(input),
+                         GetTensorShape(output), GetTensorData<float>(output));
+      return kTfLiteOk;
+    } break;
+    default:
+      context->ReportError(context, "Only float32 supported currently, got %s.",
+                           TfLiteTypeGetName(input->type));
+      return kTfLiteError;
+  }
+}
+
 }  // namespace activations
 
+TfLiteRegistration* Register_ELU() {
+  static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
+                                 activations::GenericPrepare,
+                                 activations::EluEval};
+  return &r;
+}
+
 TfLiteRegistration* Register_RELU() {
   static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
                                  activations::GenericPrepare,
@@ -759,17 +990,31 @@ TfLiteRegistration* Register_RELU6() {
   return &r;
 }
 
+TfLiteRegistration* Register_TANH_REF() {
+  static TfLiteRegistration r = {
+      activations::Init, activations::Free, activations::TanhPrepare,
+      activations::TanhEval<activations::kReference>};
+  return &r;
+}
+
 TfLiteRegistration* Register_TANH() {
-  static TfLiteRegistration r = {activations::Init, activations::Free,
-                                 activations::TanhPrepare,
-                                 activations::TanhEval};
+  static TfLiteRegistration r = {
+      activations::Init, activations::Free, activations::TanhPrepare,
+      activations::TanhEval<activations::kGenericOptimized>};
+  return &r;
+}
+
+TfLiteRegistration* Register_LOGISTIC_REF() {
+  static TfLiteRegistration r = {
+      activations::Init, activations::Free, activations::SigmoidPrepare,
+      activations::SigmoidEval<activations::kReference>};
   return &r;
 }
 
 TfLiteRegistration* Register_LOGISTIC() {
-  static TfLiteRegistration r = {activations::Init, activations::Free,
-                                 activations::SigmoidPrepare,
-                                 activations::SigmoidEval};
+  static TfLiteRegistration r = {
+      activations::Init, activations::Free, activations::SigmoidPrepare,
+      activations::SigmoidEval<activations::kGenericOptimized>};
   return &r;
 }
 
@@ -780,10 +1025,19 @@ TfLiteRegistration* Register_SOFTMAX() {
   return &r;
 }
 
+TfLiteRegistration* Register_LOG_SOFTMAX_REF() {
+  static TfLiteRegistration r = {
+      activations::LogSoftmaxInit, activations::LogSoftmaxFree,
+      activations::LogSoftmaxPrepare,
+      activations::LogSoftmaxEval<activations::kReference>};
+  return &r;
+}
+
 TfLiteRegistration* Register_LOG_SOFTMAX() {
   static TfLiteRegistration r = {
       activations::LogSoftmaxInit, activations::LogSoftmaxFree,
-      activations::LogSoftmaxPrepare, activations::LogSoftmaxEval};
+      activations::LogSoftmaxPrepare,
+      activations::LogSoftmaxEval<activations::kGenericOptimized>};
   return &r;
 }
 
diff --git a/tensorflow/lite/kernels/activations_test.cc b/tensorflow/lite/kernels/activations_test.cc
index 67f137baff29808d7a03571e1880901e44c34712..25b17a9678728f0ee82ccf22e2a5b63eee2c3537 100644
--- a/tensorflow/lite/kernels/activations_test.cc
+++ b/tensorflow/lite/kernels/activations_test.cc
@@ -32,6 +32,8 @@ class BaseActivationsOpModel : public SingleOpModel {
     input_ = AddInput(input);
     if (input.type == TensorType_UINT8) {
       output_ = AddOutput({input.type, {}, 0, 0, 1. / 256});
+    } else if (input.type == TensorType_INT8) {
+      output_ = AddOutput({input.type, {}, 0, 0, 1. / 256, -128});
     } else {
       output_ = AddOutput({input.type, {}});
     }
@@ -44,6 +46,8 @@ class BaseActivationsOpModel : public SingleOpModel {
     input_ = AddInput(input);
     if (input.type == TensorType_UINT8) {
       output_ = AddOutput({input.type, {}, 0, 0, 1. / 256});
+    } else if (input.type == TensorType_INT8) {
+      output_ = AddOutput({TensorType_INT8, {}, 0, 0, 1. / 256, -128});
     } else {
       output_ = AddOutput({input.type, {}});
     }
@@ -52,8 +56,8 @@ class BaseActivationsOpModel : public SingleOpModel {
     BuildInterpreter({GetShape(input_)});
   }
 
-  BaseActivationsOpModel(BuiltinOperator type, const TensorData &input,
-                         const TensorData &output) {
+  BaseActivationsOpModel(BuiltinOperator type, const TensorData& input,
+                         const TensorData& output) {
     input_ = AddInput(input);
     output_ = AddOutput(output);
     SetBuiltinOp(type, BuiltinOptions_NONE, 0);
@@ -114,6 +118,20 @@ class QuantizedActivationsOpModel : public BaseActivationsOpModel {
   }
 };
 
+TEST(FloatActivationsOpTest, Elu) {
+  FloatActivationsOpModel m(BuiltinOperator_ELU,
+                            /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}});
+  m.SetInput({
+      0, -6, 2, -4,     //
+      3, -2, 10, -0.1,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({
+                                 0.0, -0.997521, 2.0, -0.981684,    //
+                                 3.0, -0.864665, 10.0, -0.0951626,  //
+                             })));
+}
+
 TEST(FloatActivationsOpTest, Relu) {
   FloatActivationsOpModel m(BuiltinOperator_RELU,
                             /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}});
@@ -170,7 +188,7 @@ TEST(FloatActivationsOpTest, Tanh) {
                              })));
 }
 
-TEST(QuantizedActivationsOpTest, Relu6) {
+TEST(QuantizedActivationsOpTest, Relu6Uint8) {
   const float kMin = -1;
   const float kMax = 127.f / 128.f;
   QuantizedActivationsOpModel m(
@@ -193,7 +211,29 @@ TEST(QuantizedActivationsOpTest, Relu6) {
               ElementsAreArray({128, 128, 160, 192, 176, 128, 224, 144}));
 }
 
-TEST(QuantizedActivationsOpTest, Tanh) {
+TEST(QuantizedActivationsOpTest, Relu6Int8) {
+  const float kMin = -1;
+  const float kMax = 127.f / 128.f;
+  QuantizedActivationsOpModel m(
+      BuiltinOperator_RELU6,
+      /*input=*/{TensorType_INT8, {1, 2, 4, 1}, 8 * kMin, 8 * kMax},
+      /*output=*/{TensorType_INT8, {1, 2, 4, 1}, 8 * kMin, 8 * kMax});
+  m.SetInput<int8_t>({
+      0, -6, 2, 4,   //
+      3, -2, 10, 1,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(), ElementsAreArray(ArrayFloatNear(
+                                                    {
+                                                        0, 0, 2, 4,  //
+                                                        3, 0, 6, 1,  //
+                                                    },
+                                                    kQuantizedTolerance)));
+  EXPECT_THAT(m.GetOutput<int8_t>(),
+              ElementsAreArray({0, 0, 32, 64, 48, 0, 96, 16}));
+}
+
+TEST(QuantizedActivationsOpTest, TanhUint8) {
   const float kMin = -1;
   const float kMax = 127.f / 128.f;
   QuantizedActivationsOpModel m(
@@ -216,6 +256,29 @@ TEST(QuantizedActivationsOpTest, Tanh) {
               ElementsAreArray({128, 0, 251, 255, 0, 5, 255, 225}));
 }
 
+TEST(QuantizedActivationsOpTest, TanhInt8) {
+  const float kMin = -1;
+  const float kMax = 127.f / 128.f;
+  QuantizedActivationsOpModel m(
+      BuiltinOperator_TANH,
+      /*input=*/{TensorType_INT8, {1, 2, 4, 1}, 8 * kMin, 8 * kMax},
+      /*output=*/{TensorType_INT8, {1, 2, 4, 1}, kMin, kMax});
+  m.SetInput<int8_t>({
+      0, -6, 2, 4,   //
+      -4, -2, 8, 1,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      0.0, -0.999987, 0.964027, 0.999329,     //
+                      -0.999329, -0.96402, 0.99999, 0.76159,  //
+                  },
+                  kQuantizedTolerance)));
+  EXPECT_THAT(m.GetOutput<int8_t>(),
+              ElementsAreArray({0, -128, 123, 127, -128, -123, 127, 97}));
+}
+
 TEST(QuantizedActivationsOpTest, TanhInt16) {
   const float kMin = -1;
   const float kMax = 32767.f / 32768.f;
@@ -251,7 +314,7 @@ TEST(FloatActivationsOpTest, Sigmoid) {
                              })));
 }
 
-TEST(QuantizedActivationsOpTest, Sigmoid) {
+TEST(QuantizedActivationsOpTest, SigmoidUint8) {
   QuantizedActivationsOpModel m(
       BuiltinOperator_LOGISTIC,
       /*input=*/{TensorType_UINT8, {1, 2, 4, 1}, -10, 10});
@@ -271,6 +334,26 @@ TEST(QuantizedActivationsOpTest, Sigmoid) {
               ElementsAreArray({128, 1, 227, 251, 244, 32, 255, 188}));
 }
 
+TEST(QuantizedActivationsOpTest, SigmoidInt8) {
+  QuantizedActivationsOpModel m(
+      BuiltinOperator_LOGISTIC,
+      /*input=*/{TensorType_INT8, {1, 2, 4, 1}, -10, 10});
+  m.SetInput<int8_t>({
+      0, -6, 2, 4,   //
+      3, -2, 10, 1,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      0.5, 0.002473, 0.880797, 0.982014,       //
+                      0.952574, 0.119203, 0.999955, 0.731059,  //
+                  },
+                  kQuantizedTolerance)));
+  EXPECT_THAT(m.GetOutput<int8_t>(),
+              ElementsAreArray({0, -127, 99, 123, 116, -99, 127, 60}));
+}
+
 TEST(QuantizedActivationsOpTest, SigmoidInt16) {
   const float kMin = -1;
   const float kMax = 32767.f / 32768.f;
@@ -323,7 +406,7 @@ TEST(FloatActivationsOpTest, Softmax4D) {
                               })));
 }
 
-TEST(QuantizedActivationsOpTest, Softmax4D) {
+TEST(QuantizedActivationsOpTest, Softmax4DUint8) {
   QuantizedActivationsOpModel m(
       0.1,
       /*input=*/{TensorType_UINT8, {1, 2, 1, 4}, -10, 10});
@@ -362,6 +445,145 @@ TEST(QuantizedActivationsOpTest, Softmax4D) {
                   kQuantizedTolerance)));
 }
 
+// Test quantized softmax with int8 input and output. With the same input as in
+// QuantizedActivationsOpTest.Softmax1D, the dequantized output is identical.
+TEST(QuantizedActivationsOpTest, Softmax1DInt8) {
+  QuantizedActivationsOpModel m(0.1,
+                                /*input=*/{TensorType_INT8, {8}, -10, 10});
+  m.SetInput<int8_t>({0, -6, 2, 4, 3, -2, 10, 1});
+  m.Invoke();
+  EXPECT_THAT(
+      m.GetDequantizedOutput<int8_t>(),
+      ElementsAreArray(ArrayFloatNear({0.09766, 0.05469, 0.12109, 0.14453,
+                                       0.13281, 0.07813, 0.26563, 0.10938},
+                                      kQuantizedTolerance)));
+}
+
+// Test quantized softmax with int8 input and output. With the same input as in
+// QuantizedActivationsOpTest.Softmax2D, the dequantized output is identical.
+TEST(QuantizedActivationsOpTest, Softmax2DInt8) {
+  QuantizedActivationsOpModel m(0.1,
+                                /*input=*/{TensorType_INT8, {2, 4}, -10, 10});
+  m.SetInput<int8_t>({
+      0, -6, 2, 4,   //
+      3, -2, 10, 1,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      .23463, .12877, .28658, .35003,  //
+                      .22528, .13664, .45365, .18443,  //
+                  },
+                  kQuantizedTolerance)));
+
+  // Same input, but a different shape.
+  QuantizedActivationsOpModel m2(0.1,
+                                 /*input=*/{TensorType_INT8, {4, 2}, -10, 10});
+  m2.SetInput<int8_t>({
+      0, -6,  //
+      2, 4,   //
+      3, -2,  //
+      10, 1,  //
+  });
+  m2.Invoke();
+  EXPECT_THAT(m2.GetDequantizedOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      0.645656, 0.354344,  //
+                      0.450166, 0.549834,  //
+                      0.622459, 0.377541,  //
+                      0.710949, 0.28905,   //
+                  },
+                  kQuantizedTolerance)));
+}
+
+// Test quantized softmax with int8 input and output. With the same input as in
+// QuantizedActivationsOpTest.Softmax3D, the dequantized output is identical.
+TEST(QuantizedActivationsOpTest, Softmax3DInt8) {
+  QuantizedActivationsOpModel m(
+      0.1,
+      /*input=*/{TensorType_INT8, {1, 2, 4}, -10, 10});
+  m.SetInput<int8_t>({
+      0, -6, 2, 4,   // depth = 0
+      3, -2, 10, 1,  // depth = 1
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      .23463, .12877, .28658, .35003,  //
+                      .22528, .13664, .45365, .18443,  //
+                  },
+                  kQuantizedTolerance)));
+
+  // Same input, but a different shape.
+  QuantizedActivationsOpModel m2(
+      0.1,
+      /*input=*/{TensorType_INT8, {4, 1, 2}, -10, 10});
+  m2.SetInput<int8_t>({
+      0, -6,  //
+      2, 4,   //
+      3, -2,  //
+      10, 1,  //
+  });
+  m2.Invoke();
+  EXPECT_THAT(m2.GetDequantizedOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      0.645656, 0.354344,  //
+                      0.450166, 0.549834,  //
+                      0.622459, 0.377541,  //
+                      0.710949, 0.28905,   //
+                  },
+                  kQuantizedTolerance)));
+}
+
+// Test quantized softmax with int8 input and output. With the same input as in
+// QuantizedActivationsOpTest.Softmax4D, the dequantized output is identical.
+TEST(QuantizedActivationsOpTest, Softmax4DInt8) {
+  QuantizedActivationsOpModel m(
+      0.1,
+      /*input=*/{TensorType_INT8, {1, 2, 1, 4}, -10, 10});
+  m.SetInput<int8_t>({
+      0, -6, 2, 4,   // depth = 0
+      3, -2, 10, 1,  // depth = 1
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAreArray({
+                                         -68, -95, -54, -38,  //
+                                         -70, -93, -12, -81,  //
+                                     }));
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      .23463, .12877, .28658, .35003,  //
+                      .22528, .13664, .45365, .18443,  //
+                  },
+                  kQuantizedTolerance)));
+
+  // Same input, but a different shape.
+  QuantizedActivationsOpModel m2(
+      0.1,
+      /*input=*/{TensorType_INT8, {4, 1, 1, 2}, -10, 10});
+  m2.SetInput<int8_t>({
+      0, -6,  //
+      2, 4,   //
+      3, -2,  //
+      10, 1,  //
+  });
+  m2.Invoke();
+  EXPECT_THAT(m2.GetDequantizedOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      0.645656, 0.354344,  //
+                      0.450166, 0.549834,  //
+                      0.622459, 0.377541,  //
+                      0.710949, 0.28905,   //
+                  },
+                  kQuantizedTolerance)));
+}
+
 TEST(FloatActivationsOpTest, Softmax3D) {
   FloatActivationsOpModel m(0.1,
                             /*input=*/{TensorType_FLOAT32, {1, 2, 4}});
@@ -393,7 +615,7 @@ TEST(FloatActivationsOpTest, Softmax3D) {
                               })));
 }
 
-TEST(QuantizedActivationsOpTest, Softmax3D) {
+TEST(QuantizedActivationsOpTest, Softmax3DUint8) {
   QuantizedActivationsOpModel m(
       0.1,
       /*input=*/{TensorType_UINT8, {1, 2, 4}, -10, 10});
@@ -443,7 +665,7 @@ TEST(FloatActivationsOpTest, Softmax1D) {
           {.09752, .05352, .11911, .14548, .13164, .07984, .26509, .10778})));
 }
 
-TEST(QuantizedActivationsOpTest, Softmax1D) {
+TEST(QuantizedActivationsOpTest, Softmax1DUint8) {
   QuantizedActivationsOpModel m(0.1,
                                 /*input=*/{TensorType_UINT8, {8}, -10, 10});
   m.SetInput<uint8_t>({0, -6, 2, 4, 3, -2, 10, 1});
@@ -486,7 +708,7 @@ TEST(FloatActivationsOpTest, Softmax2D) {
                               })));
 }
 
-TEST(QuantizedActivationsOpTest, Softmax2D) {
+TEST(QuantizedActivationsOpTest, Softmax2DUint8) {
   QuantizedActivationsOpModel m(0.1,
                                 /*input=*/{TensorType_UINT8, {2, 4}, -10, 10});
   m.SetInput<uint8_t>({
@@ -564,7 +786,7 @@ TEST(FloatActivationsOpTest, LogSoftmax) {
                               })));
 }
 
-TEST(QuantizedActivationsOpTest, LogSoftmax) {
+TEST(QuantizedActivationsOpTest, LogSoftmaxUint8) {
   const float kLogSoftmaxQuantizedTolerance = 16 / 256.0;
   QuantizedActivationsOpModel m(
       BuiltinOperator_LOG_SOFTMAX,
@@ -586,6 +808,30 @@ TEST(QuantizedActivationsOpTest, LogSoftmax) {
               ElementsAreArray({189, 93, 221, 253, 142, 63, 255, 111}));
 }
 
+TEST(QuantizedActivationsOpTest, LogSoftmaxInt8) {
+  const float kLogSoftmaxQuantizedTolerance = 0.06355;
+  QuantizedActivationsOpModel m(
+      BuiltinOperator_LOG_SOFTMAX,
+      /*input=*/{TensorType_INT8, {2, 4}, -10, 10},
+      /*output=*/{TensorType_INT8, {}, 0, 0, 16. / 256, 127});
+  m.SetInput<int8_t>({
+      0, -6, 2, 4,   //
+      3, -2, 10, 1,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      -4.14297, -10.14297, -2.14297, -.142971,    //
+                      -7.00104, -12.00104, -.00104087, -9.00104,  //
+                  },
+                  kLogSoftmaxQuantizedTolerance)));
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAreArray({
+                                         61, -36, 93, 125,   //
+                                         15, -65, 127, -16,  //
+                                     }));
+}
+
 // A base class of PRelu op model. It provides the constructor for
 // FloatPReluOpModel and QuantizedPReluOpModel.
 class BasePReluOpModel : public SingleOpModel {
diff --git a/tensorflow/lite/kernels/add.cc b/tensorflow/lite/kernels/add.cc
index 32a7c100ce53101063d81345bcb052e680e64a28..4cfe435e9e2c4dab7253c4be4fffcb991cea4abd 100644
--- a/tensorflow/lite/kernels/add.cc
+++ b/tensorflow/lite/kernels/add.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/add.h"
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
@@ -92,7 +93,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     output_size = TfLiteIntArrayCopy(input1->dims);
   }
 
-  if (output->type == kTfLiteUInt8) {
+  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
     // 8bit -> 8bit general quantized path, with general rescalings
     data->input1_offset = -input1->params.zero_point;
     data->input2_offset = -input2->params.zero_point;
@@ -117,10 +118,15 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     QuantizeMultiplierSmallerThanOneExp(
         real_output_multiplier, &data->output_multiplier, &data->output_shift);
 
-    CalculateActivationRangeUint8(params->activation, output,
-                                  &data->output_activation_min,
-                                  &data->output_activation_max);
-
+    if (output->type == kTfLiteUInt8) {
+      CalculateActivationRangeUint8(params->activation, output,
+                                    &data->output_activation_min,
+                                    &data->output_activation_max);
+    } else {
+      CalculateActivationRangeInt8(params->activation, output,
+                                   &data->output_activation_min,
+                                   &data->output_activation_max);
+    }
   } else if (output->type == kTfLiteInt16) {
     // 16bit -> 16bit special quantized path, supporting only a rather
     // narrow case of quantization parameters: zero_points must all be 0
@@ -219,7 +225,7 @@ TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
                               const TfLiteTensor* input1,
                               const TfLiteTensor* input2,
                               TfLiteTensor* output) {
-  if (output->type == kTfLiteUInt8) {
+  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
     tflite::ArithmeticParams op_params;
     op_params.left_shift = data->left_shift;
     op_params.input1_offset = data->input1_offset;
@@ -235,22 +241,33 @@ TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
                         data->output_activation_max, &op_params);
     bool need_broadcast = optimized_ops::ProcessBroadcastShapes(
         GetTensorShape(input1), GetTensorShape(input2), &op_params);
-#define TF_LITE_ADD(type, opname)                                      \
-  type::opname(op_params, GetTensorShape(input1),                      \
-               GetTensorData<uint8_t>(input1), GetTensorShape(input2), \
-               GetTensorData<uint8_t>(input2), GetTensorShape(output), \
-               GetTensorData<uint8_t>(output));
-    if (kernel_type == kReference) {
+#define TF_LITE_ADD(type, opname, dtype)                             \
+  type::opname(op_params, GetTensorShape(input1),                    \
+               GetTensorData<dtype>(input1), GetTensorShape(input2), \
+               GetTensorData<dtype>(input2), GetTensorShape(output), \
+               GetTensorData<dtype>(output));
+    if (output->type == kTfLiteInt8) {
       if (need_broadcast) {
-        TF_LITE_ADD(reference_ops, BroadcastAdd4DSlow);
+        TF_LITE_ADD(reference_integer_ops, BroadcastAdd4DSlow, int8_t);
       } else {
-        TF_LITE_ADD(reference_ops, Add);
+        TF_LITE_ADD(reference_integer_ops, Add, int8_t);
       }
     } else {
-      if (need_broadcast) {
-        TF_LITE_ADD(optimized_ops, BroadcastAddFivefold);
+      if (kernel_type == kReference) {
+        if (need_broadcast) {
+          TF_LITE_ADD(reference_ops, BroadcastAdd4DSlow, uint8_t);
+        } else {
+          TF_LITE_ADD(reference_ops, Add, uint8_t);
+        }
       } else {
-        TF_LITE_ADD(optimized_ops, Add);
+        if (op_params.broadcast_category ==
+            BroadcastableOpCategory::kGenericBroadcast) {
+          TF_LITE_ADD(optimized_ops, BroadcastAdd4DSlow, uint8_t);
+        } else if (need_broadcast) {
+          TF_LITE_ADD(optimized_ops, BroadcastAddFivefold, uint8_t);
+        } else {
+          TF_LITE_ADD(optimized_ops, Add, uint8_t);
+        }
       }
     }
 #undef TF_LITE_ADD
@@ -289,7 +306,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 
   if (output->type == kTfLiteFloat32 || output->type == kTfLiteInt32) {
     EvalAdd<kernel_type>(context, node, params, data, input1, input2, output);
-  } else if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt16) {
+  } else if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
+             output->type == kTfLiteInt16) {
     TF_LITE_ENSURE_OK(context,
                       EvalAddQuantized<kernel_type>(context, node, params, data,
                                                     input1, input2, output));
diff --git a/tensorflow/lite/kernels/add_n.cc b/tensorflow/lite/kernels/add_n.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3e9b2ea24afcd3eff107b110e7d5bb6226d95d3a
--- /dev/null
+++ b/tensorflow/lite/kernels/add_n.cc
@@ -0,0 +1,88 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace add_n {
+
+constexpr int kInputTensor1 = 0;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  int num_inputs = NumInputs(node);
+  TF_LITE_ENSURE(context, num_inputs >= 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  output->type = input1->type;
+
+  // Check that all input tensors have the same shape and type.
+  for (int i = kInputTensor1 + 1; i < num_inputs; ++i) {
+    const TfLiteTensor* input = GetInput(context, node, i);
+    TF_LITE_ENSURE(context, HaveSameShapes(input1, input));
+    TF_LITE_ENSURE_EQ(context, input1->type, input->type);
+  }
+
+  // Use the first input node's dimension to be the dimension of the output
+  // node.
+  TfLiteIntArray* input1_dims = input1->dims;
+  TfLiteIntArray* output_dims = TfLiteIntArrayCopy(input1_dims);
+  return context->ResizeTensor(context, output, output_dims);
+}
+
+template <typename T>
+void EvalAddN(TfLiteContext* context, TfLiteNode* node) {
+  // TODO(haoliang): Initialize all_inputs only once during init.
+  VectorOfTensors<T> all_inputs(*context, *node->inputs);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  int num_inputs = NumInputs(node);
+  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  reference_ops::AddN<T>(GetTensorShape(input1), num_inputs, all_inputs.data(),
+                         GetTensorData<T>(output));
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  if (output->type == kTfLiteFloat32) {
+    EvalAddN<float>(context, node);
+  } else if (output->type == kTfLiteInt32) {
+    EvalAddN<int32_t>(context, node);
+  } else {
+    context->ReportError(context,
+                         "AddN only supports FLOAT32|INT32 now, got %s.",
+                         TfLiteTypeGetName(output->type));
+    return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace add_n
+
+TfLiteRegistration* Register_ADD_N() {
+  static TfLiteRegistration r = {/*init*/ nullptr, /*free*/ nullptr,
+                                 add_n::Prepare, add_n::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/add_n_test.cc b/tensorflow/lite/kernels/add_n_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ee9477d2ff13c4e4f4e2da815d8f5660ab5b6c4e
--- /dev/null
+++ b/tensorflow/lite/kernels/add_n_test.cc
@@ -0,0 +1,98 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class BaseAddNOpModel : public SingleOpModel {
+ public:
+  BaseAddNOpModel(const std::vector<TensorData>& inputs,
+                  const TensorData& output) {
+    int num_inputs = inputs.size();
+    std::vector<std::vector<int>> input_shapes;
+
+    for (int i = 0; i < num_inputs; ++i) {
+      inputs_.push_back(AddInput(inputs[i]));
+      input_shapes.push_back(GetShape(inputs_[i]));
+    }
+
+    output_ = AddOutput(output);
+    SetBuiltinOp(BuiltinOperator_ADD_N, BuiltinOptions_AddNOptions,
+                 CreateAddNOptions(builder_).Union());
+    BuildInterpreter(input_shapes);
+  }
+
+  int input(int i) { return inputs_[i]; }
+
+ protected:
+  std::vector<int> inputs_;
+  int output_;
+};
+
+class FloatAddNOpModel : public BaseAddNOpModel {
+ public:
+  using BaseAddNOpModel::BaseAddNOpModel;
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+};
+
+class IntegerAddNOpModel : public BaseAddNOpModel {
+ public:
+  using BaseAddNOpModel::BaseAddNOpModel;
+
+  std::vector<int32_t> GetOutput() { return ExtractVector<int32_t>(output_); }
+};
+
+TEST(FloatAddNOpModel, AddMultipleTensors) {
+  FloatAddNOpModel m({{TensorType_FLOAT32, {1, 2, 2, 1}},
+                      {TensorType_FLOAT32, {1, 2, 2, 1}},
+                      {TensorType_FLOAT32, {1, 2, 2, 1}}},
+                     {TensorType_FLOAT32, {}});
+  m.PopulateTensor<float>(m.input(0), {-2.0, 0.2, 0.7, 0.8});
+  m.PopulateTensor<float>(m.input(1), {0.1, 0.2, 0.3, 0.5});
+  m.PopulateTensor<float>(m.input(2), {0.5, 0.1, 0.1, 0.2});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({-1.4, 0.5, 1.1, 1.5}));
+}
+
+TEST(IntegerAddNOpModel, AddMultipleTensors) {
+  IntegerAddNOpModel m({{TensorType_INT32, {1, 2, 2, 1}},
+                        {TensorType_INT32, {1, 2, 2, 1}},
+                        {TensorType_INT32, {1, 2, 2, 1}}},
+                       {TensorType_INT32, {}});
+  m.PopulateTensor<int32_t>(m.input(0), {-20, 2, 7, 8});
+  m.PopulateTensor<int32_t>(m.input(1), {1, 2, 3, 5});
+  m.PopulateTensor<int32_t>(m.input(2), {10, -5, 1, -2});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({-9, -1, 11, 11}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/kernels/add_test.cc b/tensorflow/lite/kernels/add_test.cc
index 1d33adf1999ecde581badf041276ec15b4370689..2904f4a11a947264cb12fc2e8c0a7822df24c678 100644
--- a/tensorflow/lite/kernels/add_test.cc
+++ b/tensorflow/lite/kernels/add_test.cc
@@ -63,9 +63,10 @@ class QuantizedAddOpModel : public BaseAddOpModel {
  public:
   using BaseAddOpModel::BaseAddOpModel;
 
+  template <typename integer_dtype>
   std::vector<float> GetDequantizedOutput() {
-    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
-                               GetScale(output_), GetZeroPoint(output_));
+    return Dequantize<integer_dtype>(ExtractVector<integer_dtype>(output_),
+                                     GetScale(output_), GetZeroPoint(output_));
   }
 
   std::vector<float> GetDequantizedOutputInt16() {
@@ -74,17 +75,15 @@ class QuantizedAddOpModel : public BaseAddOpModel {
   }
 };
 
-// for quantized Add, the error shouldn't exceed 2*step
+// for quantized Add, the error shouldn't exceed step
 float GetTolerance(float min, float max) {
   float kQuantizedStep = (max - min) / 255.0;
-  float kQuantizedTolerance = 2.0 * kQuantizedStep;
-  return kQuantizedTolerance;
+  return kQuantizedStep;
 }
 
 float GetToleranceInt16(float min, float max) {
   float kQuantizedStep = (max - min) / 32767.f;
-  float kQuantizedTolerance = 2.0 * kQuantizedStep;
-  return kQuantizedTolerance;
+  return kQuantizedStep;
 }
 
 TEST(FloatAddOpModel, NoActivation) {
@@ -191,7 +190,8 @@ TEST(IntegerAddOpModel, WithBroadcast) {
   }
 }
 
-TEST(QuantizedAddOpModel, QuantizedTestsNoActivation) {
+template <TensorType tensor_type, typename integer_dtype>
+void QuantizedTestsNoActivation() {
   float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
   std::vector<std::vector<float>> inputs1 = {
       {0.1, 0.2, 0.3, 0.4}, {-0.8, 0.2, 0.4, 0.7}, {-0.8, 0.2, 0.7, 0.3}};
@@ -200,19 +200,28 @@ TEST(QuantizedAddOpModel, QuantizedTestsNoActivation) {
   std::vector<std::vector<float>> results = {
       {0.7, 0.6, 0.6, 0.5}, {-0.2, 0.6, 0.9, -0.1}, {-0.2, 0.6, -0.1, 0.8}};
   for (int i = 0; i < inputs1.size(); ++i) {
-    QuantizedAddOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                          {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                          {TensorType_UINT8, {}, -1.0, 1.0},
+    QuantizedAddOpModel m({tensor_type, {1, 2, 2, 1}, -1.0, 1.0},
+                          {tensor_type, {1, 2, 2, 1}, -1.0, 1.0},
+                          {tensor_type, {}, -1.0, 1.0},
                           ActivationFunctionType_NONE);
-    m.QuantizeAndPopulate<uint8_t>(m.input1(), inputs1[i]);
-    m.QuantizeAndPopulate<uint8_t>(m.input2(), inputs2[i]);
+    m.QuantizeAndPopulate<integer_dtype>(m.input1(), inputs1[i]);
+    m.QuantizeAndPopulate<integer_dtype>(m.input2(), inputs2[i]);
     m.Invoke();
-    EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear(
-                                              results[i], kQuantizedTolerance)))
+    EXPECT_THAT(
+        m.GetDequantizedOutput<integer_dtype>(),
+        ElementsAreArray(ArrayFloatNear(results[i], kQuantizedTolerance)))
         << "With test number " << i;
   }
 }
 
+TEST(QuantizedAddOpModel, QuantizedTestsNoActivationUInt8) {
+  QuantizedTestsNoActivation<TensorType_UINT8, uint8_t>();
+}
+
+TEST(QuantizedAddOpModel, QuantizedTestsNoActivationInt8) {
+  QuantizedTestsNoActivation<TensorType_INT8, int8_t>();
+}
+
 TEST(QuantizedAddOpModel, QuantizedTestsNoActivationInt16) {
   const float kMin = -1.f;
   const float kMax = 32767.f / 32768.f;
@@ -238,7 +247,8 @@ TEST(QuantizedAddOpModel, QuantizedTestsNoActivationInt16) {
   }
 }
 
-TEST(QuantizedAddOpModel, QuantizedTestsActivationRELU_N1_TO_1) {
+template <enum TensorType tensor_type, typename integer_dtype>
+void QuantizedTestsActivationRELU_N1_TO_1() {
   float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
   std::vector<std::vector<float>> inputs1 = {{-0.8, 0.2, 0.9, 0.7},
                                              {-0.8, 0.2, 0.7, 0.3}};
@@ -247,55 +257,164 @@ TEST(QuantizedAddOpModel, QuantizedTestsActivationRELU_N1_TO_1) {
   std::vector<std::vector<float>> results = {{-0.2, 0.6, 1.0, -0.1},
                                              {-0.2, 0.6, -0.1, 0.8}};
   for (int i = 0; i < inputs1.size(); ++i) {
-    QuantizedAddOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                          {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                          {TensorType_UINT8, {}, -1.0, 1.0},
+    QuantizedAddOpModel m({tensor_type, {1, 2, 2, 1}, -1.0, 1.0},
+                          {tensor_type, {1, 2, 2, 1}, -1.0, 1.0},
+                          {tensor_type, {}, -1.0, 1.0},
                           ActivationFunctionType_RELU_N1_TO_1);
-    m.QuantizeAndPopulate<uint8_t>(m.input1(), inputs1[i]);
-    m.QuantizeAndPopulate<uint8_t>(m.input2(), inputs2[i]);
+    m.QuantizeAndPopulate<integer_dtype>(m.input1(), inputs1[i]);
+    m.QuantizeAndPopulate<integer_dtype>(m.input2(), inputs2[i]);
     m.Invoke();
-    EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear(
-                                              results[i], kQuantizedTolerance)))
+    EXPECT_THAT(
+        m.GetDequantizedOutput<integer_dtype>(),
+        ElementsAreArray(ArrayFloatNear(results[i], kQuantizedTolerance)))
         << "With test number " << i;
   }
 }
 
-TEST(QuantizedAddOpModel, QuantizedVariousInputShapes) {
+TEST(QuantizedAddOpModel, QuantizedTestsActivationRELU_N1_TO_1UInt8) {
+  QuantizedTestsActivationRELU_N1_TO_1<TensorType_UINT8, uint8_t>();
+}
+
+TEST(QuantizedAddOpModel, QuantizedTestsActivationRELU_N1_TO_1Int8) {
+  QuantizedTestsActivationRELU_N1_TO_1<TensorType_INT8, int8_t>();
+}
+
+template <enum TensorType tensor_type, typename integer_dtype>
+void QuantizedVariousInputShapes() {
   float kQuantizedTolerance = GetTolerance(-3.0, 3.0);
   std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
-    QuantizedAddOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0},
-                          {TensorType_UINT8, test_shapes[i], -3.0, 3.0},
-                          {TensorType_UINT8, {}, -3.0, 3.0},
+    QuantizedAddOpModel m({tensor_type, test_shapes[i], -3.0, 3.0},
+                          {tensor_type, test_shapes[i], -3.0, 3.0},
+                          {tensor_type, {}, -3.0, 3.0},
                           ActivationFunctionType_NONE);
-    m.QuantizeAndPopulate<uint8_t>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
-    m.QuantizeAndPopulate<uint8_t>(m.input2(), {0.1, 0.3, 0.3, 0.5, 1.1, 0.1});
+    m.QuantizeAndPopulate<integer_dtype>(m.input1(),
+                                         {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
+    m.QuantizeAndPopulate<integer_dtype>(m.input2(),
+                                         {0.1, 0.3, 0.3, 0.5, 1.1, 0.1});
     m.Invoke();
-    EXPECT_THAT(m.GetDequantizedOutput(),
+    EXPECT_THAT(m.GetDequantizedOutput<integer_dtype>(),
                 ElementsAreArray(ArrayFloatNear({-1.9, 0.5, 1.0, 1.3, 2.2, 2.1},
                                                 kQuantizedTolerance)))
         << "With shape number " << i;
   }
 }
 
-TEST(QuantizedAddOpModel, QuantizedWithBroadcast) {
-  float kQuantizedTolerance = GetTolerance(-3.0, 3.0);
+TEST(QuantizedAddOpModel, QuantizedVariousInputShapesUInt8) {
+  QuantizedVariousInputShapes<TensorType_UINT8, uint8_t>();
+}
+
+TEST(QuantizedAddOpModel, QuantizedVariousInputShapesInt8) {
+  QuantizedVariousInputShapes<TensorType_INT8, int8_t>();
+}
+
+template <enum TensorType tensor_type, typename integer_dtype>
+void QuantizedWithScalarBroadcast() {
+  float kQuantizedTolerance = GetTolerance(-3.f, 3.f);
   std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
-    QuantizedAddOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0},
-                          {TensorType_UINT8, {}, -3.0, 3.0},
-                          {TensorType_UINT8, {}, -3.0, 3.0},
-                          ActivationFunctionType_NONE);
-    m.QuantizeAndPopulate<uint8_t>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
-    m.QuantizeAndPopulate<uint8_t>(m.input2(), {0.1});
-    m.Invoke();
-    EXPECT_THAT(m.GetDequantizedOutput(),
-                ElementsAreArray(ArrayFloatNear({-1.9, 0.3, 0.8, 0.9, 1.2, 2.1},
-                                                kQuantizedTolerance)))
+    QuantizedAddOpModel model_fixture(
+        {tensor_type, test_shapes[i], -3.f, 3.f}, {tensor_type, {}, -3.f, 3.f},
+        {tensor_type, {}, -3.f, 3.f}, ActivationFunctionType_NONE);
+    model_fixture.QuantizeAndPopulate<integer_dtype>(
+        model_fixture.input1(), {-2.0f, 0.2f, 0.7f, 0.8f, 1.1f, 2.0f});
+    model_fixture.QuantizeAndPopulate<integer_dtype>(model_fixture.input2(),
+                                                     {0.1f});
+    model_fixture.Invoke();
+    EXPECT_THAT(
+        model_fixture.GetDequantizedOutput<integer_dtype>(),
+        ElementsAreArray(ArrayFloatNear({-1.9f, 0.3f, 0.8f, 0.9f, 1.2f, 2.1f},
+                                        kQuantizedTolerance)))
         << "With shape number " << i;
   }
+  // Re-run with exchanged inputs.
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    QuantizedAddOpModel model_fixture(
+        {tensor_type, {}, -3.f, 3.f}, {tensor_type, test_shapes[i], -3.f, 3.f},
+        {tensor_type, {}, -3.f, 3.f}, ActivationFunctionType_NONE);
+    model_fixture.QuantizeAndPopulate<integer_dtype>(model_fixture.input1(),
+                                                     {0.1f});
+    model_fixture.QuantizeAndPopulate<integer_dtype>(
+        model_fixture.input2(), {-2.0f, 0.2f, 0.7f, 0.8f, 1.1f, 2.0f});
+    model_fixture.Invoke();
+    EXPECT_THAT(
+        model_fixture.GetDequantizedOutput<integer_dtype>(),
+        ElementsAreArray(ArrayFloatNear({-1.9f, 0.3f, 0.8f, 0.9f, 1.2f, 2.1f},
+                                        kQuantizedTolerance)))
+        << "With shape number " << i;
+  }
+}
+
+TEST(QuantizedAddOpModel, QuantizedWithScalarBroadcastUInt8) {
+  QuantizedWithScalarBroadcast<TensorType_UINT8, uint8_t>();
+}
+
+TEST(QuantizedAddOpModel, QuantizedWithScalarBroadcastInt8) {
+  QuantizedWithScalarBroadcast<TensorType_INT8, int8_t>();
+}
+
+template <enum TensorType tensor_type, typename integer_dtype>
+void QuantizedWithMixedBroadcast() {
+  float kQuantizedTolerance = GetTolerance(-3.f, 3.f);
+  const std::vector<int> base_shape = {2, 3, 1, 2};
+  std::vector<std::vector<int>> test_shapes = {
+      {1, 1, 3, 2}, {1, 3, 1, 2}, {2, 1, 3, 1}, {2, 3, 1, 1}};
+  std::vector<std::vector<float>> test_outputs = {
+      {-0.1f, 2.6f,  -0.7f, 2.8f, 0.7f,  3.0f, 1.1f,  0.8f, 0.5f,
+       1.0f,  1.9f,  1.4f,  1.0f, -0.8f, 0.4f, -0.6f, 1.8f, -0.2f,
+       1.4f,  3.0f,  0.8f,  3.0f, 2.2f,  3.0f, -1.4f, 0.3f, -2.0f,
+       0.5f,  -0.6f, 0.9f,  0.9f, -1.9f, 0.3f, -1.7f, 1.7f, -1.3f},
+      {-0.1f, 2.6f, 0.5f, 1.0f, 1.8f, -0.2f, 1.4f, 3.0f, -2.0f, 0.5f, 1.7f,
+       -1.3f},
+      {-0.1f, 2.5f,  0.0f, 2.6f, -0.7f, 1.9f, 1.1f,  0.7f, 1.2f,
+       0.8f,  0.5f,  0.1f, 1.0f, -0.9f, 1.1f, -0.8f, 0.4f, -1.5f,
+       1.7f,  3.0f,  2.2f, 3.0f, 2.1f,  3.0f, -1.1f, 0.5f, -0.6f,
+       1.0f,  -0.7f, 0.9f, 1.2f, -1.7f, 1.7f, -1.2f, 1.6f, -1.3f},
+      {-0.1f, 2.5f, 1.2f, 0.8f, 0.4f, -1.5f, 1.7f, 3.0f, -0.6f, 1.0f, 1.6f,
+       -1.3f}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    QuantizedAddOpModel model_fixture({tensor_type, base_shape, -3.f, 3.f},
+                                      {tensor_type, test_shapes[i], -3.f, 3.f},
+                                      {tensor_type, {}, -3.f, 3.f},
+                                      ActivationFunctionType_NONE);
+    model_fixture.QuantizeAndPopulate<integer_dtype>(
+        model_fixture.input1(), {-0.3f, 2.3f, 0.9f, 0.5f, 0.8f, -1.1f, 1.2f,
+                                 2.8f, -1.6f, 0.0f, 0.7f, -2.2f});
+    model_fixture.QuantizeAndPopulate<integer_dtype>(
+        model_fixture.input2(), {0.2f, 0.3f, -0.4f, 0.5f, 1.0f, 0.9f});
+    model_fixture.Invoke();
+    EXPECT_THAT(
+        model_fixture.GetDequantizedOutput<integer_dtype>(),
+        ElementsAreArray(ArrayFloatNear(test_outputs[i], kQuantizedTolerance)))
+        << "With shape number " << i;
+  }
+  // Re-run with exchanged inputs.
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    QuantizedAddOpModel model_fixture({tensor_type, test_shapes[i], -3.f, 3.f},
+                                      {tensor_type, base_shape, -3.f, 3.f},
+                                      {tensor_type, {}, -3.f, 3.f},
+                                      ActivationFunctionType_NONE);
+    model_fixture.QuantizeAndPopulate<integer_dtype>(
+        model_fixture.input1(), {0.2f, 0.3f, -0.4f, 0.5f, 1.0f, 0.9f});
+    model_fixture.QuantizeAndPopulate<integer_dtype>(
+        model_fixture.input2(), {-0.3f, 2.3f, 0.9f, 0.5f, 0.8f, -1.1f, 1.2f,
+                                 2.8f, -1.6f, 0.0f, 0.7f, -2.2f});
+    model_fixture.Invoke();
+    EXPECT_THAT(
+        model_fixture.GetDequantizedOutput<integer_dtype>(),
+        ElementsAreArray(ArrayFloatNear(test_outputs[i], kQuantizedTolerance)))
+        << "With shape number " << i;
+  }
+}
+
+TEST(QuantizedAddOpModel, QuantizedWithMixedBroadcastUInt8) {
+  QuantizedWithMixedBroadcast<TensorType_UINT8, uint8_t>();
+}
+
+TEST(QuantizedAddOpModel, QuantizedWithMixedBroadcastInt8) {
+  QuantizedWithMixedBroadcast<TensorType_INT8, int8_t>();
 }
 
 }  // namespace
diff --git a/tensorflow/lite/kernels/arg_min_max.cc b/tensorflow/lite/kernels/arg_min_max.cc
index eea2de27f74af8bf73df92c28ed6042e4d8fa4ff..e5223badc407059511f06cd538b6057c1e276966 100644
--- a/tensorflow/lite/kernels/arg_min_max.cc
+++ b/tensorflow/lite/kernels/arg_min_max.cc
@@ -36,9 +36,15 @@ TfLiteStatus ResizeOutput(TfLiteContext* context, const TfLiteTensor* input,
     axis_value += NumDimensions(input);
   }
 
-  // Copy the input dimensions to output except make the axis dimension 1.
-  TfLiteIntArray* output_dims = TfLiteIntArrayCopy(input->dims);
-  output_dims->data[axis_value] = 1;
+  // Copy the input dimensions to output except the axis dimension.
+  TfLiteIntArray* output_dims = TfLiteIntArrayCreate(NumDimensions(input) - 1);
+  int j = 0;
+  for (int i = 0; i < NumDimensions(input); ++i) {
+    if (i != axis_value) {
+      output_dims->data[j] = SizeOfDimension(input, i);
+      ++j;
+    }
+  }
   return context->ResizeTensor(context, output, output_dims);
 }
 
@@ -74,13 +80,14 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   switch (input->type) {
     case kTfLiteFloat32:
     case kTfLiteUInt8:
+    case kTfLiteInt8:
     case kTfLiteInt32:
       break;
 
     default:
       context->ReportError(
           context,
-          "Unkonwn input type: %d, only float32 and int types are supported",
+          "Unknown input type: %d, only float32 and int types are supported",
           input->type);
       return kTfLiteError;
   }
@@ -129,6 +136,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node, bool is_arg_max) {
           case kTfLiteUInt8:
             TF_LITE_ARG_MIN_MAX(uint8_t, int32_t, int32_t);
             break;
+          case kTfLiteInt8:
+            TF_LITE_ARG_MIN_MAX(int8_t, int32_t, int32_t);
+            break;
           case kTfLiteInt32:
             TF_LITE_ARG_MIN_MAX(int32_t, int32_t, int32_t);
             break;
@@ -144,6 +154,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node, bool is_arg_max) {
           case kTfLiteUInt8:
             TF_LITE_ARG_MIN_MAX(uint8_t, int32_t, int64_t);
             break;
+          case kTfLiteInt8:
+            TF_LITE_ARG_MIN_MAX(int8_t, int32_t, int64_t);
+            break;
           case kTfLiteInt32:
             TF_LITE_ARG_MIN_MAX(int32_t, int32_t, int64_t);
             break;
diff --git a/tensorflow/lite/kernels/arg_min_max_test.cc b/tensorflow/lite/kernels/arg_min_max_test.cc
index dcdff74cc6f376b3418b64c025e8eb4a36c429a0..01ea923f26d3ca32ec109a61d0484b0ecbd30c93 100644
--- a/tensorflow/lite/kernels/arg_min_max_test.cc
+++ b/tensorflow/lite/kernels/arg_min_max_test.cc
@@ -83,7 +83,29 @@ TEST(ArgMaxOpTest, GetMaxArgFloat) {
   model.Invoke();
 
   EXPECT_THAT(model.GetOutput(), ElementsAreArray({1}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1}));
+}
+
+TEST(ArgMaxOpTest, GetMaxArgUInt8) {
+  ArgMaxOpModel<int32_t> model({1, 1, 1, 4}, TensorType_UINT8, TensorType_INT32,
+                               TensorType_INT32);
+  model.PopulateTensor<uint8_t>(model.input(), {1, 9, 7, 3});
+  model.PopulateTensor<int>(model.axis(), {3});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1}));
+}
+
+TEST(ArgMaxOpTest, GetMaxArgInt8) {
+  ArgMaxOpModel<int32_t> model({1, 1, 1, 4}, TensorType_INT8, TensorType_INT32,
+                               TensorType_INT32);
+  model.PopulateTensor<int8_t>(model.input(), {-1, -9, 7, 3});
+  model.PopulateTensor<int>(model.axis(), {3});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({2}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1}));
 }
 
 TEST(ArgMaxOpTest, GetMaxArgInt) {
@@ -94,7 +116,7 @@ TEST(ArgMaxOpTest, GetMaxArgInt) {
   model.Invoke();
 
   EXPECT_THAT(model.GetOutput(), ElementsAreArray({1}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1}));
 }
 
 TEST(ArgMaxOpTest, GetMaxArgMulDimensions) {
@@ -105,7 +127,7 @@ TEST(ArgMaxOpTest, GetMaxArgMulDimensions) {
   model.Invoke();
 
   EXPECT_THAT(model.GetOutput(), ElementsAreArray({3, 1}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2, 1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2}));
 }
 
 TEST(ArgMaxOpTest, GetMaxArgNegativeAxis) {
@@ -116,7 +138,7 @@ TEST(ArgMaxOpTest, GetMaxArgNegativeAxis) {
   model.Invoke();
 
   EXPECT_THAT(model.GetOutput(), ElementsAreArray({0, 1, 0, 0}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 4}));
 }
 
 TEST(ArgMaxOpTest, GetMaxArgOutput64) {
@@ -127,7 +149,7 @@ TEST(ArgMaxOpTest, GetMaxArgOutput64) {
   model.Invoke();
 
   EXPECT_THAT(model.GetOutput(), ElementsAreArray({0, 1}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2, 1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2}));
 }
 
 TEST(ArgMinOpTest, GetMinArgFloat) {
@@ -138,7 +160,7 @@ TEST(ArgMinOpTest, GetMinArgFloat) {
   model.Invoke();
 
   EXPECT_THAT(model.GetOutput(), ElementsAreArray({0}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1}));
 }
 
 TEST(ArgMinOpTest, GetMinArgInt) {
@@ -149,7 +171,7 @@ TEST(ArgMinOpTest, GetMinArgInt) {
   model.Invoke();
 
   EXPECT_THAT(model.GetOutput(), ElementsAreArray({0}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1}));
 }
 
 TEST(ArgMinOpTest, GetMinArgMulDimensions) {
@@ -160,7 +182,7 @@ TEST(ArgMinOpTest, GetMinArgMulDimensions) {
   model.Invoke();
 
   EXPECT_THAT(model.GetOutput(), ElementsAreArray({0, 0}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2, 1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2}));
 }
 
 TEST(ArgMinOpTest, GetMinArgNegativeAxis) {
@@ -171,7 +193,7 @@ TEST(ArgMinOpTest, GetMinArgNegativeAxis) {
   model.Invoke();
 
   EXPECT_THAT(model.GetOutput(), ElementsAreArray({0, 0, 0, 1}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 4}));
 }
 
 TEST(ArgMinOpTest, GetMinArgOutput64) {
@@ -182,7 +204,7 @@ TEST(ArgMinOpTest, GetMinArgOutput64) {
   model.Invoke();
 
   EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 0}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2, 1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2}));
 }
 
 }  // namespace
diff --git a/tensorflow/lite/kernels/basic_rnn.cc b/tensorflow/lite/kernels/basic_rnn.cc
index 7c66ce1992f4c341d7518742cd209a53fa1de16b..a2c38b3b7d8c573244be803225398504a6c45f86 100644
--- a/tensorflow/lite/kernels/basic_rnn.cc
+++ b/tensorflow/lite/kernels/basic_rnn.cc
@@ -27,6 +27,16 @@ namespace ops {
 namespace builtin {
 namespace rnn {
 
+namespace {
+int8_t* GetInt8DataPtr(const TfLiteTensor* tensor, const bool is_uint8) {
+  if (is_uint8) {
+    return reinterpret_cast<int8_t*>(tensor->data.uint8);
+  } else {
+    return tensor->data.int8;
+  }
+}
+}  // namespace
+
 constexpr int kInputTensor = 0;
 constexpr int kWeightsTensor = 1;
 constexpr int kRecurrentWeightsTensor = 2;
@@ -85,15 +95,19 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_OK(context,
                     context->ResizeTensor(context, output, output_size_array));
 
+  bool is_hybrid =
+      input->type == kTfLiteFloat32 && (input_weights->type == kTfLiteUInt8 ||
+                                        input_weights->type == kTfLiteInt8);
+
   // Allocate temporary tensors to store quantized values of input and
   // hidden_state tensors.
-  if (input->type == kTfLiteFloat32 && input_weights->type == kTfLiteUInt8) {
+  if (is_hybrid) {
     int* scratch_tensor_index = reinterpret_cast<int*>(node->user_data);
     TfLiteIntArrayFree(node->temporaries);
     node->temporaries = TfLiteIntArrayCreate(3);
     node->temporaries->data[0] = *scratch_tensor_index;
     TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/0);
-    input_quantized->type = kTfLiteUInt8;
+    input_quantized->type = input_weights->type;
     input_quantized->allocation_type = kTfLiteArenaRw;
     if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) {
       TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims);
@@ -103,7 +117,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     node->temporaries->data[1] = *scratch_tensor_index + 1;
     TfLiteTensor* hidden_state_quantized =
         GetTemporary(context, node, /*index=*/1);
-    hidden_state_quantized->type = kTfLiteUInt8;
+    hidden_state_quantized->type = input_weights->type;
     hidden_state_quantized->allocation_type = kTfLiteArenaRw;
     if (!TfLiteIntArrayEqual(hidden_state_quantized->dims,
                              hidden_state->dims)) {
@@ -165,6 +179,7 @@ TfLiteStatus EvalHybrid(const TfLiteTensor* input,
                         TfLiteTensor* hidden_state_scratch,
                         TfLiteTensor* scaling_factors,
                         TfLiteTensor* hidden_state, TfLiteTensor* output) {
+  const bool is_uint8_hybrid = input_weights->type == kTfLiteUInt8;
   const int batch_size = input->dims->data[0];
   const int num_units = input_weights->dims->data[0];
   const int input_size = input->dims->data[1];
@@ -178,18 +193,17 @@ TfLiteStatus EvalHybrid(const TfLiteTensor* input,
   float* output_ptr_batch = output->data.f;
   // Initialize input_weights, recurrent_weights and bias.
   const int8_t* input_weights_ptr =
-      reinterpret_cast<const int8_t*>(input_weights->data.uint8);
+      GetInt8DataPtr(input_weights, is_uint8_hybrid);
   const int8_t* recurrent_weights_ptr =
-      reinterpret_cast<const int8_t*>(recurrent_weights->data.uint8);
+      GetInt8DataPtr(recurrent_weights, is_uint8_hybrid);
   const float* bias_ptr = bias->data.f;
   // Get the scale of the quantized weights.
   float input_weights_scale = input_weights->params.scale;
   float recurrent_weights_scale = recurrent_weights->params.scale;
   // Initialize temporary storage for quantized values.
-  int8_t* quantized_input_ptr =
-      reinterpret_cast<int8_t*>(input_scratch->data.uint8);
+  int8_t* quantized_input_ptr = GetInt8DataPtr(input_scratch, is_uint8_hybrid);
   int8_t* quantized_hidden_state_ptr =
-      reinterpret_cast<int8_t*>(hidden_state_scratch->data.uint8);
+      GetInt8DataPtr(hidden_state_scratch, is_uint8_hybrid);
   float* scaling_factors_ptr = scaling_factors->data.f;
 
   kernel_utils::RnnBatchStep(
@@ -218,7 +232,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     case kTfLiteFloat32:
       return EvalFloat(input, input_weights, recurrent_weights, bias, params,
                        hidden_state, output);
-    case kTfLiteUInt8: {
+    case kTfLiteUInt8:
+    case kTfLiteInt8: {
       // TODO(mirkov): implement eval with quantized inputs as well.
       TfLiteTensor* input_quantized = GetTemporary(context, node, 0);
       TfLiteTensor* hidden_state_quantized = GetTemporary(context, node, 1);
diff --git a/tensorflow/lite/kernels/basic_rnn_test.cc b/tensorflow/lite/kernels/basic_rnn_test.cc
index 240057d18a176dbb77e4962b48493c1a8d2dddab..9eb20444a6d119ec940a140a66e59961f1451c1c 100644
--- a/tensorflow/lite/kernels/basic_rnn_test.cc
+++ b/tensorflow/lite/kernels/basic_rnn_test.cc
@@ -233,15 +233,25 @@ class RNNOpModel : public SingleOpModel {
 // The hybrid model has quantized weights and recurrent_weights.
 class HybridRNNOpModel : public RNNOpModel {
  public:
-  HybridRNNOpModel(int batches, int units, int size)
-      : RNNOpModel(batches, units, size, TensorType_UINT8, TensorType_UINT8) {}
+  HybridRNNOpModel(int batches, int units, int size, TensorType tensor_type)
+      : RNNOpModel(batches, units, size, tensor_type, tensor_type) {
+    tensor_type_ = tensor_type;
+  }
 
-  void SetWeights(std::initializer_list<float> f) {
-    SymmetricQuantizeAndPopulate(weights_, f);
+  TensorType tensor_type_;
+
+  void SetWeights(int weights_idx, const std::vector<float>& f) {
+    if (tensor_type_ == TensorType_UINT8) {
+      SymmetricQuantizeAndPopulate(weights_idx, f);
+    } else {
+      SignedSymmetricQuantizeAndPopulate(weights_idx, f);
+    }
   }
 
+  void SetWeights(std::initializer_list<float> f) { SetWeights(weights_, f); }
+
   void SetRecurrentWeights(std::initializer_list<float> f) {
-    SymmetricQuantizeAndPopulate(recurrent_weights_, f);
+    SetWeights(recurrent_weights_, f);
   }
 };
 
@@ -272,8 +282,36 @@ TEST(RnnOpTest, BlackBoxTest) {
   }
 }
 
-TEST(HybridRnnOpTest, BlackBoxTest) {
-  HybridRNNOpModel rnn(2, 16, 8);
+TEST(HybridRnnOpTest, BlackBoxTestUint8) {
+  HybridRNNOpModel rnn(2, 16, 8, TensorType_UINT8);
+  rnn.SetWeights(rnn_weights);
+  rnn.SetBias(rnn_bias);
+  rnn.SetRecurrentWeights(rnn_recurrent_weights);
+
+  const int input_sequence_size = sizeof(rnn_input) / sizeof(float) /
+                                  (rnn.input_size() * rnn.num_batches());
+
+  for (int i = 0; i < input_sequence_size; i++) {
+    float* batch_start = rnn_input + i * rnn.input_size();
+    float* batch_end = batch_start + rnn.input_size();
+    rnn.SetInput(0, batch_start, batch_end);
+    rnn.SetInput(rnn.input_size(), batch_start, batch_end);
+
+    rnn.Invoke();
+
+    float* golden_start = rnn_golden_output + i * rnn.num_units();
+    float* golden_end = golden_start + rnn.num_units();
+    std::vector<float> expected;
+    expected.insert(expected.end(), golden_start, golden_end);
+    expected.insert(expected.end(), golden_start, golden_end);
+
+    EXPECT_THAT(rnn.GetOutput(), ElementsAreArray(ArrayFloatNear(
+                                     expected, /*max_abs_error=*/0.0104)));
+  }
+}
+
+TEST(HybridRnnOpTest, BlackBoxTestInt8) {
+  HybridRNNOpModel rnn(2, 16, 8, TensorType_INT8);
   rnn.SetWeights(rnn_weights);
   rnn.SetBias(rnn_bias);
   rnn.SetRecurrentWeights(rnn_recurrent_weights);
diff --git a/tensorflow/lite/kernels/batch_to_space_nd.cc b/tensorflow/lite/kernels/batch_to_space_nd.cc
index 34fdf34f70c9660266e23260bd5a6b645a3c5ccb..ce85aeddedcebdecf4d2944bade2ed5f823b0592 100644
--- a/tensorflow/lite/kernels/batch_to_space_nd.cc
+++ b/tensorflow/lite/kernels/batch_to_space_nd.cc
@@ -148,6 +148,13 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
         TF_LITE_BATCH_TO_SPACE_ND(optimized_ops, uint8_t);
       }
       break;
+    case kTfLiteInt8:
+      if (kernel_type == kReference) {
+        TF_LITE_BATCH_TO_SPACE_ND(reference_ops, int8_t);
+      } else {
+        TF_LITE_BATCH_TO_SPACE_ND(optimized_ops, int8_t);
+      }
+      break;
     case kTfLiteInt32:
       if (kernel_type == kReference) {
         TF_LITE_BATCH_TO_SPACE_ND(reference_ops, int32_t);
diff --git a/tensorflow/lite/kernels/batch_to_space_nd_test.cc b/tensorflow/lite/kernels/batch_to_space_nd_test.cc
index a3e06d4c89327050625ac514d41bc29c4f6493f3..bd806b55ca48424e143a77d1f95640365af5fe77 100644
--- a/tensorflow/lite/kernels/batch_to_space_nd_test.cc
+++ b/tensorflow/lite/kernels/batch_to_space_nd_test.cc
@@ -26,8 +26,9 @@ using ::testing::ElementsAreArray;
 
 class BatchToSpaceNDOpModel : public SingleOpModel {
  public:
-  void SetInput(std::initializer_list<float> data) {
-    PopulateTensor<float>(input_, data);
+  template <typename T>
+  void SetInput(std::initializer_list<T> data) {
+    PopulateTensor<T>(input_, data);
   }
 
   void SetBlockShape(std::initializer_list<int> data) {
@@ -38,7 +39,10 @@ class BatchToSpaceNDOpModel : public SingleOpModel {
     PopulateTensor<int>(crops_, data);
   }
 
-  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+  template <typename T>
+  std::vector<T> GetOutput() {
+    return ExtractVector<T>(output_);
+  }
   std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
 
  protected:
@@ -58,11 +62,12 @@ class BatchToSpaceNDOpConstModel : public BatchToSpaceNDOpModel {
  public:
   BatchToSpaceNDOpConstModel(std::initializer_list<int> input_shape,
                              std::initializer_list<int> block_shape,
-                             std::initializer_list<int> crops) {
-    input_ = AddInput(TensorType_FLOAT32);
+                             std::initializer_list<int> crops,
+                             const TensorType& type = TensorType_FLOAT32) {
+    input_ = AddInput(type);
     block_shape_ = AddConstInput(TensorType_INT32, block_shape, {2});
     crops_ = AddConstInput(TensorType_INT32, crops, {2, 2});
-    output_ = AddOutput(TensorType_FLOAT32);
+    output_ = AddOutput(type);
 
     SetBuiltinOp(BuiltinOperator_BATCH_TO_SPACE_ND,
                  BuiltinOptions_BatchToSpaceNDOptions,
@@ -81,11 +86,12 @@ class BatchToSpaceNDOpConstModel : public BatchToSpaceNDOpModel {
 //    m.Invoke();
 class BatchToSpaceNDOpDynamicModel : public BatchToSpaceNDOpModel {
  public:
-  BatchToSpaceNDOpDynamicModel(std::initializer_list<int> input_shape) {
-    input_ = AddInput(TensorType_FLOAT32);
+  BatchToSpaceNDOpDynamicModel(std::initializer_list<int> input_shape,
+                               const TensorType& type = TensorType_FLOAT32) {
+    input_ = AddInput(type);
     block_shape_ = AddInput(TensorType_INT32);
     crops_ = AddInput(TensorType_INT32);
-    output_ = AddOutput(TensorType_FLOAT32);
+    output_ = AddOutput(type);
 
     SetBuiltinOp(BuiltinOperator_BATCH_TO_SPACE_ND,
                  BuiltinOptions_BatchToSpaceNDOptions,
@@ -96,24 +102,50 @@ class BatchToSpaceNDOpDynamicModel : public BatchToSpaceNDOpModel {
 
 TEST(BatchToSpaceNDOpTest, SimpleConstTest) {
   BatchToSpaceNDOpConstModel m({4, 2, 2, 1}, {2, 2}, {0, 0, 0, 0});
-  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  m.SetInput<float>({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray(
+                  {1, 5, 2, 6, 9, 13, 10, 14, 3, 7, 4, 8, 11, 15, 12, 16}));
+}
+
+TEST(BatchToSpaceNDOpTest, SimpleConstTestInt8) {
+  BatchToSpaceNDOpConstModel m({4, 2, 2, 1}, {2, 2}, {0, 0, 0, 0},
+                               TensorType_INT8);
+  m.SetInput<int8_t>({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 5, 2, 6, 9, 13, 10, 14, 3, 7,
-                                               4, 8, 11, 15, 12, 16}));
+  EXPECT_THAT(m.GetOutput<int8_t>(),
+              ElementsAreArray(
+                  {1, 5, 2, 6, 9, 13, 10, 14, 3, 7, 4, 8, 11, 15, 12, 16}));
 }
 
 TEST(BatchToSpaceNDOpTest, SimpleDynamicTest) {
   BatchToSpaceNDOpDynamicModel m({4, 2, 2, 1});
-  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  m.SetInput<float>({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  m.SetBlockShape({2, 2});
+  m.SetCrops({0, 0, 0, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray(
+                  {1, 5, 2, 6, 9, 13, 10, 14, 3, 7, 4, 8, 11, 15, 12, 16}));
+}
+
+TEST(BatchToSpaceNDOpTest, SimpleDynamicTestInt8) {
+  BatchToSpaceNDOpDynamicModel m({4, 2, 2, 1}, TensorType_INT8);
+  m.SetInput<int8_t>({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
   m.SetBlockShape({2, 2});
   m.SetCrops({0, 0, 0, 0});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 5, 2, 6, 9, 13, 10, 14, 3, 7,
-                                               4, 8, 11, 15, 12, 16}));
+  EXPECT_THAT(m.GetOutput<int8_t>(),
+              ElementsAreArray(
+                  {1, 5, 2, 6, 9, 13, 10, 14, 3, 7, 4, 8, 11, 15, 12, 16}));
 }
 
+#ifdef GTEST_HAS_DEATH_TEST
 TEST(BatchToSpaceNDOpTest, InvalidShapeTest) {
   EXPECT_DEATH(BatchToSpaceNDOpConstModel({3, 2, 2, 1}, {2, 2}, {0, 0, 0, 0}),
                "Cannot allocate tensors");
@@ -126,11 +158,12 @@ TEST(BatchToSpaceNDOpTest, InvalidCropsConstTest) {
 
 TEST(BatchToSpaceNDOpTest, InvalidCropsDynamicTest) {
   BatchToSpaceNDOpDynamicModel m({4, 2, 2, 1});
-  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  m.SetInput<float>({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
   m.SetBlockShape({2, 2});
   m.SetCrops({0, 0, -1, 0});
   EXPECT_DEATH(m.Invoke(), "crops.2. >= 0 was not true.");
 }
+#endif
 
 }  // namespace
 }  // namespace tflite
diff --git a/tensorflow/lite/kernels/bidirectional_sequence_lstm.cc b/tensorflow/lite/kernels/bidirectional_sequence_lstm.cc
index 2c345bba69e4879586c6204dab21c1d28e404870..31c6e3f44c8323cee38d196b4cd24031586ad1b0 100644
--- a/tensorflow/lite/kernels/bidirectional_sequence_lstm.cc
+++ b/tensorflow/lite/kernels/bidirectional_sequence_lstm.cc
@@ -105,7 +105,10 @@ constexpr int kBwInputActivationStateTensor = 37;
 // Cell state tensors of size {n_batch, n_cell}
 constexpr int kBwInputCellStateTensor = 38;
 
-// Auxiliary input and weights when stacking.
+// Used as auxiliary input and weights when stacking for
+// tf.contrib.rnn.stack_bidirectional_rnn case (with cross links); Used as input
+// to the backward cell when stacking for tf.nn.static_bidirectional_rnn case
+// (without cross links).
 constexpr int kAuxInputTensor = 39;  // Optional
 // Forward weights.
 constexpr int kFwAuxInputToInputWeightsTensor = 40;   // Optional
@@ -182,7 +185,7 @@ TfLiteStatus CheckLstmTensorDimensionsAndTypes(
 
   const TfLiteTensor* input_to_input_weights =
       GetOptionalInputTensor(context, node, input_to_input_weights_tensor);
-  if (input_to_input_weights) {
+  if (input_to_input_weights != nullptr) {
     TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->size, 2);
     TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[0], n_cell);
     TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[1], n_input);
@@ -208,7 +211,7 @@ TfLiteStatus CheckLstmTensorDimensionsAndTypes(
 
   const TfLiteTensor* recurrent_to_input_weights =
       GetOptionalInputTensor(context, node, recurrent_to_input_weights_tensor);
-  if (recurrent_to_input_weights) {
+  if (recurrent_to_input_weights != nullptr) {
     TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->size, 2);
     TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->data[0],
                       n_cell);
@@ -248,7 +251,7 @@ TfLiteStatus CheckLstmTensorDimensionsAndTypes(
 
   const TfLiteTensor* cell_to_input_weights =
       GetOptionalInputTensor(context, node, cell_to_input_weights_tensor);
-  if (cell_to_input_weights) {
+  if (cell_to_input_weights != nullptr) {
     TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->size, 1);
     TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->data[0], n_cell);
     TF_LITE_ENSURE_EQ(context, cell_to_input_weights->type,
@@ -257,7 +260,7 @@ TfLiteStatus CheckLstmTensorDimensionsAndTypes(
 
   const TfLiteTensor* cell_to_forget_weights =
       GetOptionalInputTensor(context, node, cell_to_forget_weights_tensor);
-  if (cell_to_forget_weights) {
+  if (cell_to_forget_weights != nullptr) {
     TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->size, 1);
     TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->data[0], n_cell);
     TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->type,
@@ -266,7 +269,7 @@ TfLiteStatus CheckLstmTensorDimensionsAndTypes(
 
   const TfLiteTensor* cell_to_output_weights =
       GetOptionalInputTensor(context, node, cell_to_output_weights_tensor);
-  if (cell_to_output_weights) {
+  if (cell_to_output_weights != nullptr) {
     TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->size, 1);
     TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->data[0], n_cell);
     TF_LITE_ENSURE_EQ(context, cell_to_output_weights->type,
@@ -315,7 +318,7 @@ TfLiteStatus CheckLstmTensorDimensionsAndTypes(
 
   const TfLiteTensor* projection_weights =
       GetOptionalInputTensor(context, node, projection_weights_tensor);
-  if (projection_weights) {
+  if (projection_weights != nullptr) {
     TF_LITE_ENSURE_EQ(context, projection_weights->dims->size, 2);
     TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[0], n_output);
     TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[1], n_cell);
@@ -325,7 +328,7 @@ TfLiteStatus CheckLstmTensorDimensionsAndTypes(
 
   const TfLiteTensor* projection_bias =
       GetOptionalInputTensor(context, node, projection_bias_tensor);
-  if (projection_bias) {
+  if (projection_bias != nullptr) {
     TF_LITE_ENSURE_EQ(context, projection_bias->dims->size, 1);
     TF_LITE_ENSURE_EQ(context, projection_bias->dims->data[0], n_output);
     TF_LITE_ENSURE_EQ(context, projection_bias->type, kTfLiteFloat32);
@@ -395,8 +398,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
   TF_LITE_ENSURE_EQ(context, input->dims->size, 3);
-  const int max_time = input->dims->data[0];
-  const int n_batch = input->dims->data[1];
+  const bool time_major = params->time_major;
+  const int max_time = time_major ? input->dims->data[0] : input->dims->data[1];
+  const int n_batch = time_major ? input->dims->data[1] : input->dims->data[0];
   const int n_input = input->dims->data[2];
 
   const TfLiteTensor* fw_input_to_output_weights =
@@ -458,8 +462,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* bw_aux_input_to_output_weights =
       GetOptionalInputTensor(context, node, kBwAuxInputToOutputWeightsTensor);
 
-  const bool aux_inputs_all_or_none =
-      ((aux_input != nullptr) && (fw_aux_input_to_cell_weights != nullptr) &&
+  const bool aux_inputs_weights_all_or_none =
+      ((fw_aux_input_to_cell_weights != nullptr) &&
        (fw_aux_input_to_forget_weights != nullptr) &&
        (fw_aux_input_to_output_weights != nullptr) &&
        (bw_aux_input_to_cell_weights != nullptr) &&
@@ -471,8 +475,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
        (bw_aux_input_to_cell_weights == nullptr) &&
        (bw_aux_input_to_forget_weights == nullptr) &&
        (bw_aux_input_to_output_weights == nullptr));
-  TF_LITE_ENSURE(context, aux_inputs_all_or_none);
-  const bool has_aux_input = (aux_input != nullptr);
+  TF_LITE_ENSURE(context, aux_inputs_weights_all_or_none);
+
+  const bool has_aux_input = (fw_aux_input_to_forget_weights != nullptr);
 
   if (has_aux_input) {
     // Check that aux_input has the same dimensions (except last) as the input.
@@ -496,15 +501,16 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
   // Resize the output tensors.
   TfLiteIntArray* fw_output_size = TfLiteIntArrayCreate(3);
-  fw_output_size->data[0] = max_time;
-  fw_output_size->data[1] = n_batch;
+  fw_output_size->data[0] = time_major ? max_time : n_batch;
+  fw_output_size->data[1] = time_major ? n_batch : max_time;
   fw_output_size->data[2] =
       params->merge_outputs ? n_bw_output + n_fw_output : n_fw_output;
   TF_LITE_ENSURE_OK(context,
                     context->ResizeTensor(context, fw_output, fw_output_size));
 
   // The weights are of consistent type, so it suffices to check one.
-  const bool is_hybrid_op = (fw_input_to_output_weights->type == kTfLiteUInt8);
+  const bool is_hybrid_op = (fw_input_to_output_weights->type == kTfLiteUInt8 ||
+                             fw_input_to_output_weights->type == kTfLiteInt8);
 
   TfLiteIntArrayFree(node->temporaries);
   if (is_hybrid_op) {
@@ -555,8 +561,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   if (!params->merge_outputs) {
     TfLiteTensor* bw_output = GetOutput(context, node, kBwOutputTensor);
     TfLiteIntArray* bw_output_size = TfLiteIntArrayCreate(3);
-    bw_output_size->data[0] = max_time;
-    bw_output_size->data[1] = n_batch;
+    bw_output_size->data[0] = time_major ? max_time : n_batch;
+    bw_output_size->data[1] = time_major ? n_batch : max_time;
     bw_output_size->data[2] = n_bw_output;
     TF_LITE_ENSURE_OK(
         context, context->ResizeTensor(context, bw_output, bw_output_size));
@@ -602,7 +608,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
         *scratch_tensor_index + kInputQuantized;
     TfLiteTensor* input_quantized =
         GetTemporary(context, node, kInputQuantized);
-    input_quantized->type = kTfLiteUInt8;
+    input_quantized->type = fw_input_to_output_weights->type;
     input_quantized->allocation_type = kTfLiteArenaRw;
     if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) {
       TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims);
@@ -614,7 +620,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
         *scratch_tensor_index + kFwActivationStateQuantized;
     TfLiteTensor* fw_activation_state_quantized =
         GetTemporary(context, node, kFwActivationStateQuantized);
-    fw_activation_state_quantized->type = kTfLiteUInt8;
+    fw_activation_state_quantized->type = fw_input_to_output_weights->type;
     fw_activation_state_quantized->allocation_type = kTfLiteArenaRw;
     if (!TfLiteIntArrayEqual(fw_activation_state_quantized->dims,
                              fw_activation_state->dims)) {
@@ -628,7 +634,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
         *scratch_tensor_index + kBwActivationStateQuantized;
     TfLiteTensor* bw_activation_state_quantized =
         GetTemporary(context, node, kBwActivationStateQuantized);
-    bw_activation_state_quantized->type = kTfLiteUInt8;
+    bw_activation_state_quantized->type = fw_input_to_output_weights->type;
     bw_activation_state_quantized->allocation_type = kTfLiteArenaRw;
     if (!TfLiteIntArrayEqual(bw_activation_state_quantized->dims,
                              bw_activation_state->dims)) {
@@ -642,7 +648,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
         *scratch_tensor_index + kFwCellStateQuantized;
     TfLiteTensor* fw_cell_state_quantized =
         GetTemporary(context, node, kFwCellStateQuantized);
-    fw_cell_state_quantized->type = kTfLiteUInt8;
+    fw_cell_state_quantized->type = fw_input_to_output_weights->type;
     fw_cell_state_quantized->allocation_type = kTfLiteArenaRw;
     if (!TfLiteIntArrayEqual(fw_cell_state_quantized->dims,
                              fw_cell_state->dims)) {
@@ -656,7 +662,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
         *scratch_tensor_index + kBwCellStateQuantized;
     TfLiteTensor* bw_cell_state_quantized =
         GetTemporary(context, node, kBwCellStateQuantized);
-    bw_cell_state_quantized->type = kTfLiteUInt8;
+    bw_cell_state_quantized->type = fw_input_to_output_weights->type;
     bw_cell_state_quantized->allocation_type = kTfLiteArenaRw;
     if (!TfLiteIntArrayEqual(bw_cell_state_quantized->dims,
                              bw_cell_state->dims)) {
@@ -725,7 +731,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
           *scratch_tensor_index + kAuxInputQuantized;
       TfLiteTensor* aux_input_quantized =
           GetTemporary(context, node, kAuxInputQuantized);
-      aux_input_quantized->type = kTfLiteUInt8;
+      aux_input_quantized->type = fw_input_to_output_weights->type;
       aux_input_quantized->allocation_type = kTfLiteArenaRw;
       if (!TfLiteIntArrayEqual(aux_input_quantized->dims, aux_input->dims)) {
         TfLiteIntArray* aux_input_quantized_size =
@@ -868,6 +874,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* bw_aux_input_to_output_weights =
       GetOptionalInputTensor(context, node, kBwAuxInputToOutputWeightsTensor);
 
+  const bool has_previous_bw_output = (aux_input != nullptr);
+  const bool use_aux_input = (fw_aux_input_to_forget_weights != nullptr);
+
   // Populate a TfLiteLSTMParams struct for the evaluation functions.
   TfLiteLSTMParams lstm_params = {params->activation, params->cell_clip,
                                   params->proj_clip, kTfLiteLSTMFullKernel};
@@ -876,7 +885,27 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       params->merge_outputs ? fw_recurrent_to_output_weights->dims->data[1] : 0;
   const auto actual_bw_output = params->merge_outputs ? fw_output : bw_output;
 
-  // TODO(mirkov): add batch_major support (http://b/117326122).
+  const bool time_major = params->time_major;
+
+  // We want to cover the following cases:
+  //
+  // If not stacking (not connected after other bidi lstms):
+  //   both fw & bw will just use `input`; aux_input will be null.
+  //
+  // If stacking with cross_links, TensorFlow equivalent
+  // (tf.contrib.rnn.stack_bidirectional_rnn):
+  //   both fw & bw will use `input`, but aux_input will be none null.
+  //   Note, this time, whether connected after other bidi lstms both works.
+  //
+  // If stacking without cross_links, but connected after other bidi lstms,
+  // TensorFlow equivalent (tf.nn.static_bidirectional_rnn):
+  //   fw will use `input`, bw will use aux_input, and the `real aux_input`
+  //   will be null.
+
+  const bool non_stacking_mode = !use_aux_input && has_previous_bw_output;
+  const TfLiteTensor* bw_input = non_stacking_mode ? aux_input : input;
+  const TfLiteTensor* real_aux_input = non_stacking_mode ? nullptr : aux_input;
+
   switch (fw_input_to_output_weights->type) {
     case kTfLiteFloat32: {
       TfLiteStatus fw_pass_status = lstm_eval::EvalFloat(
@@ -885,33 +914,44 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
           fw_recurrent_to_input_weights, fw_recurrent_to_forget_weights,
           fw_recurrent_to_cell_weights, fw_recurrent_to_output_weights,
           fw_cell_to_input_weights, fw_cell_to_forget_weights,
-          fw_cell_to_output_weights, aux_input, fw_aux_input_to_input_weights,
-          fw_aux_input_to_forget_weights, fw_aux_input_to_cell_weights,
-          fw_aux_input_to_output_weights, fw_input_gate_bias,
-          fw_forget_gate_bias, fw_cell_bias, fw_output_gate_bias,
-          fw_projection_weights, fw_projection_bias, &lstm_params,
-          /*forward_sequence=*/true, /*time_major=*/true, /*output_offset=*/0,
+          fw_cell_to_output_weights,
+          /*input_layer_norm_coefficients=*/nullptr,
+          /*forget_layer_norm_coefficients=*/nullptr,
+          /*cell_layer_norm_coefficients=*/nullptr,
+          /*output_layer_norm_coefficients=*/nullptr, real_aux_input,
+          fw_aux_input_to_input_weights, fw_aux_input_to_forget_weights,
+          fw_aux_input_to_cell_weights, fw_aux_input_to_output_weights,
+          fw_input_gate_bias, fw_forget_gate_bias, fw_cell_bias,
+          fw_output_gate_bias, fw_projection_weights, fw_projection_bias,
+          &lstm_params,
+          /*forward_sequence=*/true, time_major, /*output_offset=*/0,
           fw_scratch_buffer, fw_activation_state, fw_cell_state, fw_output);
       TF_LITE_ENSURE_OK(context, fw_pass_status);
 
       TfLiteStatus bw_pass_status = lstm_eval::EvalFloat(
-          input, bw_input_to_input_weights, bw_input_to_forget_weights,
+          bw_input, bw_input_to_input_weights, bw_input_to_forget_weights,
           bw_input_to_cell_weights, bw_input_to_output_weights,
           bw_recurrent_to_input_weights, bw_recurrent_to_forget_weights,
           bw_recurrent_to_cell_weights, bw_recurrent_to_output_weights,
           bw_cell_to_input_weights, bw_cell_to_forget_weights,
-          bw_cell_to_output_weights, aux_input, bw_aux_input_to_input_weights,
-          bw_aux_input_to_forget_weights, bw_aux_input_to_cell_weights,
-          bw_aux_input_to_output_weights, bw_input_gate_bias,
-          bw_forget_gate_bias, bw_cell_bias, bw_output_gate_bias,
-          bw_projection_weights, bw_projection_bias, &lstm_params,
-          /*forward_sequence=*/false, /*time_major=*/true, bw_output_offset,
+          bw_cell_to_output_weights,
+          /*input_layer_norm_coefficients=*/nullptr,
+          /*forget_layer_norm_coefficients=*/nullptr,
+          /*cell_layer_norm_coefficients=*/nullptr,
+          /*output_layer_norm_coefficients=*/nullptr, real_aux_input,
+          bw_aux_input_to_input_weights, bw_aux_input_to_forget_weights,
+          bw_aux_input_to_cell_weights, bw_aux_input_to_output_weights,
+          bw_input_gate_bias, bw_forget_gate_bias, bw_cell_bias,
+          bw_output_gate_bias, bw_projection_weights, bw_projection_bias,
+          &lstm_params,
+          /*forward_sequence=*/false, time_major, bw_output_offset,
           bw_scratch_buffer, bw_activation_state, bw_cell_state,
           actual_bw_output);
       TF_LITE_ENSURE_OK(context, bw_pass_status);
       return kTfLiteOk;
     }
-    case kTfLiteUInt8: {
+    case kTfLiteUInt8:
+    case kTfLiteInt8: {
       TfLiteTensor* input_quantized =
           GetTemporary(context, node, kInputQuantized);
       TfLiteTensor* fw_activation_state_quantized =
@@ -929,9 +969,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       TfLiteTensor* recovered_cell_weights =
           GetTemporary(context, node, kRecoveredCellWeights);
       TfLiteTensor* aux_input_quantized =
-          (aux_input == nullptr)
-              ? nullptr
-              : GetTemporary(context, node, kAuxInputQuantized);
+          use_aux_input ? GetTemporary(context, node, kAuxInputQuantized)
+                        : nullptr;
 
       TfLiteStatus fw_pass_status = lstm_eval::EvalHybrid(
           input, fw_input_to_input_weights, fw_input_to_forget_weights,
@@ -939,11 +978,16 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
           fw_recurrent_to_input_weights, fw_recurrent_to_forget_weights,
           fw_recurrent_to_cell_weights, fw_recurrent_to_output_weights,
           fw_cell_to_input_weights, fw_cell_to_forget_weights,
-          fw_cell_to_output_weights, aux_input, fw_aux_input_to_input_weights,
-          fw_aux_input_to_forget_weights, fw_aux_input_to_cell_weights,
-          fw_aux_input_to_output_weights, fw_input_gate_bias,
-          fw_forget_gate_bias, fw_cell_bias, fw_output_gate_bias,
-          fw_projection_weights, fw_projection_bias, &lstm_params,
+          fw_cell_to_output_weights,
+          /*input_layer_norm_coefficients=*/nullptr,
+          /*forget_layer_norm_coefficients=*/nullptr,
+          /*cell_layer_norm_coefficients=*/nullptr,
+          /*output_layer_norm_coefficients=*/nullptr, real_aux_input,
+          fw_aux_input_to_input_weights, fw_aux_input_to_forget_weights,
+          fw_aux_input_to_cell_weights, fw_aux_input_to_output_weights,
+          fw_input_gate_bias, fw_forget_gate_bias, fw_cell_bias,
+          fw_output_gate_bias, fw_projection_weights, fw_projection_bias,
+          &lstm_params,
           /*forward_sequence=*/true, /*time_major=*/true, /*output_offset=*/0,
           fw_scratch_buffer, scaling_factors, prod_scaling_factors,
           recovered_cell_weights, input_quantized, aux_input_quantized,
@@ -952,16 +996,21 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       TF_LITE_ENSURE_OK(context, fw_pass_status);
 
       TfLiteStatus bw_pass_status = lstm_eval::EvalHybrid(
-          input, bw_input_to_input_weights, bw_input_to_forget_weights,
+          bw_input, bw_input_to_input_weights, bw_input_to_forget_weights,
           bw_input_to_cell_weights, bw_input_to_output_weights,
           bw_recurrent_to_input_weights, bw_recurrent_to_forget_weights,
           bw_recurrent_to_cell_weights, bw_recurrent_to_output_weights,
           bw_cell_to_input_weights, bw_cell_to_forget_weights,
-          bw_cell_to_output_weights, aux_input, bw_aux_input_to_input_weights,
-          bw_aux_input_to_forget_weights, bw_aux_input_to_cell_weights,
-          bw_aux_input_to_output_weights, bw_input_gate_bias,
-          bw_forget_gate_bias, bw_cell_bias, bw_output_gate_bias,
-          bw_projection_weights, bw_projection_bias, &lstm_params,
+          bw_cell_to_output_weights,
+          /*input_layer_norm_coefficients=*/nullptr,
+          /*forget_layer_norm_coefficients=*/nullptr,
+          /*cell_layer_norm_coefficients=*/nullptr,
+          /*output_layer_norm_coefficients=*/nullptr, real_aux_input,
+          bw_aux_input_to_input_weights, bw_aux_input_to_forget_weights,
+          bw_aux_input_to_cell_weights, bw_aux_input_to_output_weights,
+          bw_input_gate_bias, bw_forget_gate_bias, bw_cell_bias,
+          bw_output_gate_bias, bw_projection_weights, bw_projection_bias,
+          &lstm_params,
           /*forward_sequence=*/false, /*time_major=*/true, bw_output_offset,
           bw_scratch_buffer, scaling_factors, prod_scaling_factors,
           recovered_cell_weights, input_quantized, aux_input_quantized,
diff --git a/tensorflow/lite/kernels/bidirectional_sequence_lstm_test.cc b/tensorflow/lite/kernels/bidirectional_sequence_lstm_test.cc
index b865322682a6dbe2aa7337af0692830fe79efe23..707f06af8322234c3a09b12168445fe285573fee 100644
--- a/tensorflow/lite/kernels/bidirectional_sequence_lstm_test.cc
+++ b/tensorflow/lite/kernels/bidirectional_sequence_lstm_test.cc
@@ -38,8 +38,8 @@ class BidirectionalLSTMOpModel : public SingleOpModel {
                            int sequence_length, bool use_cifg,
                            bool use_peephole, bool use_projection_weights,
                            bool use_projection_bias, bool merge_outputs,
-                           float cell_clip, float proj_clip,
-                           bool quantize_weights,
+                           bool use_aux_input, float cell_clip, float proj_clip,
+                           bool quantize_weights, bool time_major,
                            const std::vector<std::vector<int>>& input_shapes)
       : n_batch_(n_batch),
         n_input_(n_input),
@@ -185,7 +185,11 @@ class BidirectionalLSTMOpModel : public SingleOpModel {
       bw_output_ = AddOutput(TensorType_FLOAT32);
     }
 
-    aux_input_ = AddNullInput();
+    if (use_aux_input) {
+      aux_input_ = AddInput(TensorType_FLOAT32);
+    } else {
+      aux_input_ = AddNullInput();
+    }
     fw_aux_input_to_input_weights_ = AddNullInput();
     fw_aux_input_to_forget_weights_ = AddNullInput();
     fw_aux_input_to_cell_weights_ = AddNullInput();
@@ -199,7 +203,7 @@ class BidirectionalLSTMOpModel : public SingleOpModel {
                  BuiltinOptions_BidirectionalSequenceLSTMOptions,
                  CreateBidirectionalSequenceLSTMOptions(
                      builder_, ActivationFunctionType_TANH, cell_clip,
-                     proj_clip, merge_outputs)
+                     proj_clip, merge_outputs, time_major)
                      .Union());
     BuildInterpreter(input_shapes);
   }
@@ -302,6 +306,10 @@ class BidirectionalLSTMOpModel : public SingleOpModel {
     PopulateTensor(input_, offset, begin, end);
   }
 
+  void SetAuxInput(int offset, float* begin, float* end) {
+    PopulateTensor(aux_input_, offset, begin, end);
+  }
+
   std::vector<float> GetFwOutput() { return ExtractVector<float>(fw_output_); }
   std::vector<float> GetBwOutput() { return ExtractVector<float>(bw_output_); }
 
@@ -392,7 +400,7 @@ class BidirectionalLSTMOpModel : public SingleOpModel {
 // indicating whether to use quantization or not.
 class LSTMOpTest : public ::testing::TestWithParam<bool> {};
 
-INSTANTIATE_TEST_CASE_P(QuantizationOrNot, LSTMOpTest, ::testing::Bool());
+INSTANTIATE_TEST_SUITE_P(QuantizationOrNot, LSTMOpTest, ::testing::Bool());
 
 TEST_P(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClipping) {
   const int n_batch = 1;
@@ -406,8 +414,9 @@ TEST_P(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClipping) {
   BidirectionalLSTMOpModel lstm(
       n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/false,
       /*use_peephole=*/false, /*use_projection_weights=*/false,
-      /*use_projection_bias=*/false, /*merge_outputs=*/false, /*cell_clip=*/0.0,
-      /*proj_clip=*/0.0, quantize_weights,
+      /*use_projection_bias=*/false, /*merge_outputs=*/false,
+      /*use_aux_input=*/false, /*cell_clip=*/0.0,
+      /*proj_clip=*/0.0, quantize_weights, /*time_major=*/true,
       {
           {sequence_length, n_batch, n_input},  // input tensor
 
@@ -463,7 +472,9 @@ TEST_P(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClipping) {
           {n_batch, n_output},  // activation_state tensor
           {n_batch, n_cell},    // cell_state tensor
 
-          {n_batch, sequence_length, 0},  // aux_input tensor
+          // TODO(b/121134029): Update tests so tensor shapes after state tensor
+          // are used. They are currently ignored by test_util.
+          {sequence_length, n_batch, 0},  // aux_input tensor
           {n_cell, 0},                    // aux_fw_input_to_input tensor
           {n_cell, 0},                    // aux_fw_input_to_forget tensor
           {n_cell, 0},                    // aux_fw_input_to_cell tensor
@@ -568,8 +579,9 @@ TEST_P(LSTMOpTest, BlackBoxTestMergedOutput) {
   BidirectionalLSTMOpModel lstm(
       n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/false,
       /*use_peephole=*/false, /*use_projection_weights=*/false,
-      /*use_projection_bias=*/false, /*merge_outputs=*/true, /*cell_clip=*/0.0,
-      /*proj_clip=*/0.0, quantize_weights,
+      /*use_projection_bias=*/false, /*merge_outputs=*/true,
+      /*use_aux_input=*/false, /*cell_clip=*/0.0,
+      /*proj_clip=*/0.0, quantize_weights, /*time_major=*/true,
       {
           {sequence_length, n_batch, n_input},  // input tensor
 
@@ -625,7 +637,9 @@ TEST_P(LSTMOpTest, BlackBoxTestMergedOutput) {
           {n_batch, n_output},  // activation_state tensor
           {n_batch, n_cell},    // cell_state tensor
 
-          {n_batch, sequence_length, 0},  // aux_input tensor
+          // TODO(b/121134029): Update tests so tensor shapes after state tensor
+          // are used. They are currently ignored by test_util.
+          {sequence_length, n_batch, 0},  // aux_input tensor
           {n_cell, 0},                    // aux_fw_input_to_input tensor
           {n_cell, 0},                    // aux_fw_input_to_forget tensor
           {n_cell, 0},                    // aux_fw_input_to_cell tensor
@@ -729,8 +743,9 @@ TEST(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClippingReverse) {
   BidirectionalLSTMOpModel lstm(
       n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/false,
       /*use_peephole=*/false, /*use_projection_weights=*/false,
-      /*use_projection_bias=*/false, /*merge_outputs=*/false, /*cell_clip=*/0.0,
-      /*proj_clip=*/0.0, /*quantize_weights=*/false,
+      /*use_projection_bias=*/false, /*merge_outputs=*/false,
+      /*use_aux_input=*/false, /*cell_clip=*/0.0,
+      /*proj_clip=*/0.0, /*quantize_weights=*/false, /*time_major=*/true,
       {
           {sequence_length, n_batch, n_input},  // input tensor
 
@@ -786,7 +801,9 @@ TEST(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClippingReverse) {
           {n_batch, n_output},  // activation_state tensor
           {n_batch, n_cell},    // cell_state tensor
 
-          {n_batch, sequence_length, 0},  // aux_input tensor
+          // TODO(b/121134029): Update tests so tensor shapes after state tensor
+          // are used. They are currently ignored by test_util.
+          {sequence_length, n_batch, 0},  // aux_input tensor
           {n_cell, 0},                    // aux_fw_input_to_input tensor
           {n_cell, 0},                    // aux_fw_input_to_forget tensor
           {n_cell, 0},                    // aux_fw_input_to_cell tensor
@@ -889,8 +906,9 @@ TEST(LSTMOpTest, BlackBoxTestWithCifgWithPeepholeNoProjectionNoClipping) {
   BidirectionalLSTMOpModel lstm(
       n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/true,
       /*use_peephole=*/true, /*use_projection_weights=*/false,
-      /*use_projection_bias=*/false, /*merge_outputs=*/false, /*cell_clip=*/0.0,
-      /*proj_clip=*/0.0, /*quantize_weights=*/false,
+      /*use_projection_bias=*/false, /*merge_outputs=*/false,
+      /*use_aux_input=*/false, /*cell_clip=*/0.0,
+      /*proj_clip=*/0.0, /*quantize_weights=*/false, /*time_major=*/true,
       {
           {sequence_length, n_batch, n_input},  // input tensor
 
@@ -944,7 +962,9 @@ TEST(LSTMOpTest, BlackBoxTestWithCifgWithPeepholeNoProjectionNoClipping) {
           {n_batch, n_output},  // activation_state tensor
           {n_batch, n_cell},    // cell_state tensor
 
-          {n_batch, sequence_length, 0},  // aux_input tensor
+          // TODO(b/121134029): Update tests so tensor shapes after state tensor
+          // are used. They are currently ignored by test_util.
+          {sequence_length, n_batch, 0},  // aux_input tensor
           {n_cell, 0},                    // aux_fw_input_to_input tensor
           {n_cell, 0},                    // aux_fw_input_to_forget tensor
           {n_cell, 0},                    // aux_fw_input_to_cell tensor
@@ -1039,8 +1059,9 @@ TEST(LSTMOpTest,
   BidirectionalLSTMOpModel lstm(
       n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/true,
       /*use_peephole=*/true, /*use_projection_weights=*/false,
-      /*use_projection_bias=*/false, /*merge_outputs=*/false, /*cell_clip=*/0.0,
-      /*proj_clip=*/0.0, /*quantize_weights=*/false,
+      /*use_projection_bias=*/false, /*merge_outputs=*/false,
+      /*use_aux_input=*/false, /*cell_clip=*/0.0,
+      /*proj_clip=*/0.0, /*quantize_weights=*/false, /*time_major=*/true,
       {
           {sequence_length, n_batch, n_input},  // input tensor
 
@@ -1094,7 +1115,9 @@ TEST(LSTMOpTest,
           {n_batch, n_output},  // activation_state tensor
           {n_batch, n_cell},    // cell_state tensor
 
-          {n_batch, sequence_length, 0},  // aux_input tensor
+          // TODO(b/121134029): Update tests so tensor shapes after state tensor
+          // are used. They are currently ignored by test_util.
+          {sequence_length, n_batch, 0},  // aux_input tensor
           {n_cell, 0},                    // aux_fw_input_to_input tensor
           {n_cell, 0},                    // aux_fw_input_to_forget tensor
           {n_cell, 0},                    // aux_fw_input_to_cell tensor
@@ -1189,8 +1212,9 @@ TEST(LSTMOpTest, BlackBoxTestWithPeepholeWithProjectionNoClipping) {
   BidirectionalLSTMOpModel lstm(
       n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/false,
       /*use_peephole=*/true, /*use_projection_weights=*/true,
-      /*use_projection_bias=*/false, /*merge_outputs=*/false, /*cell_clip=*/0.0,
-      /*proj_clip=*/0.0, /*quantize_weights=*/false,
+      /*use_projection_bias=*/false, /*merge_outputs=*/false,
+      /*use_aux_input=*/false, /*cell_clip=*/0.0,
+      /*proj_clip=*/0.0, /*quantize_weights=*/false, /*time_major=*/true,
       {
           {sequence_length, n_batch, n_input},  // input tensor
 
@@ -1244,7 +1268,9 @@ TEST(LSTMOpTest, BlackBoxTestWithPeepholeWithProjectionNoClipping) {
           {n_batch, n_output},  // activation_state tensor
           {n_batch, n_cell},    // cell_state tensor
 
-          {n_batch, sequence_length, 0},  // aux_input tensor
+          // TODO(b/121134029): Update tests so tensor shapes after state tensor
+          // are used. They are currently ignored by test_util.
+          {sequence_length, n_batch, 0},  // aux_input tensor
           {n_cell, 0},                    // aux_fw_input_to_input tensor
           {n_cell, 0},                    // aux_fw_input_to_forget tensor
           {n_cell, 0},                    // aux_fw_input_to_cell tensor
@@ -1880,6 +1906,874 @@ TEST(LSTMOpTest, BlackBoxTestWithPeepholeWithProjectionNoClipping) {
   EXPECT_THAT(combined, ElementsAreArray(ArrayFloatNear(expected)));
 }
 
+// Same as above but with batch_major input/output.
+TEST(LSTMOpTest, BlackBoxTestWithPeepholeWithProjectionNoClippingBatchMajor) {
+  const int n_batch = 2;
+  const int n_input = 5;
+  const int n_cell = 20;
+  const int n_output = 16;
+  const int sequence_length = 4;
+
+  BidirectionalLSTMOpModel lstm(
+      n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/false,
+      /*use_peephole=*/true, /*use_projection_weights=*/true,
+      /*use_projection_bias=*/false, /*merge_outputs=*/false,
+      /*use_aux_input=*/false, /*cell_clip=*/0.0,
+      /*proj_clip=*/0.0, /*quantize_weights=*/false, /*time_major=*/false,
+      {
+          {n_batch, sequence_length, n_input},  // input tensor
+
+          {n_cell, n_input},  // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {n_cell, n_output},  // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {n_cell},  // cell_to_input_weight tensor
+          {n_cell},  // cell_to_forget_weight tensor
+          {n_cell},  // cell_to_output_weight tensor
+
+          {n_cell},  // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {n_output, n_cell},  // projection_weight tensor
+          {0},                 // projection_bias tensor
+
+          {n_cell, n_input},  // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {n_cell, n_output},  // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {n_cell},  // cell_to_input_weight tensor
+          {n_cell},  // cell_to_forget_weight tensor
+          {n_cell},  // cell_to_output_weight tensor
+
+          {n_cell},  // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {n_output, n_cell},  // projection_weight tensor
+          {0},                 // projection_bias tensor
+
+          {n_batch, n_output},  // activation_state tensor
+          {n_batch, n_cell},    // cell_state tensor
+
+          {n_batch, n_output},  // activation_state tensor
+          {n_batch, n_cell},    // cell_state tensor
+
+          {n_batch, sequence_length, 0},  // aux_input tensor
+          {n_cell, 0},                    // aux_fw_input_to_input tensor
+          {n_cell, 0},                    // aux_fw_input_to_forget tensor
+          {n_cell, 0},                    // aux_fw_input_to_cell tensor
+          {n_cell, 0},                    // aux_fw_input_to_output tensor
+          {n_cell, 0},                    // aux_bw_input_to_input tensor
+          {n_cell, 0},                    // aux_bw_input_to_forget tensor
+          {n_cell, 0},                    // aux_bw_input_to_cell tensor
+          {n_cell, 0},                    // aux_bw_input_to_output tensor
+      });
+
+  lstm.SetInputToInputWeights(
+      {0.021393683,  0.06124551,    0.046905167,  -0.014657677,  -0.03149463,
+       0.09171803,   0.14647801,    0.10797193,   -0.0057968358, 0.0019193048,
+       -0.2726754,   0.10154029,    -0.018539885, 0.080349885,   -0.10262385,
+       -0.022599787, -0.09121155,   -0.008675967, -0.045206103,  -0.0821282,
+       -0.008045952, 0.015478081,   0.055217247,  0.038719587,   0.044153627,
+       -0.06453243,  0.05031825,    -0.046935108, -0.008164439,  0.014574226,
+       -0.1671009,   -0.15519552,   -0.16819797,  -0.13971269,   -0.11953059,
+       0.25005487,   -0.22790983,   0.009855087,  -0.028140958,  -0.11200698,
+       0.11295408,   -0.0035217577, 0.054485075,  0.05184695,    0.064711206,
+       0.10989193,   0.11674786,    0.03490607,   0.07727357,    0.11390585,
+       -0.1863375,   -0.1034451,    -0.13945189,  -0.049401227,  -0.18767063,
+       0.042483903,  0.14233552,    0.13832581,   0.18350165,    0.14545603,
+       -0.028545704, 0.024939531,   0.050929718,  0.0076203286,  -0.0029723682,
+       -0.042484224, -0.11827596,   -0.09171104,  -0.10808628,   -0.16327988,
+       -0.2273378,   -0.0993647,    -0.017155107, 0.0023917493,  0.049272764,
+       0.0038534778, 0.054764505,   0.089753784,  0.06947234,    0.08014476,
+       -0.04544234,  -0.0497073,    -0.07135631,  -0.048929106,  -0.004042012,
+       -0.009284026, 0.018042054,   0.0036860977, -0.07427302,   -0.11434604,
+       -0.018995456, 0.031487543,   0.012834908,  0.019977754,   0.044256654,
+       -0.39292613,  -0.18519334,   -0.11651281,  -0.06809892,   0.011373677});
+
+  lstm.SetInputToForgetWeights(
+      {-0.0018401089, -0.004852237,  0.03698424,   0.014181704,   0.028273236,
+       -0.016726194,  -0.05249759,   -0.10204261,  0.00861066,    -0.040979505,
+       -0.009899187,  0.01923892,    -0.028177269, -0.08535103,   -0.14585495,
+       0.10662567,    -0.01909731,   -0.017883534, -0.0047269356, -0.045103323,
+       0.0030784295,  0.076784775,   0.07463696,   0.094531395,   0.0814421,
+       -0.12257899,   -0.033945758,  -0.031303465, 0.045630626,   0.06843887,
+       -0.13492945,   -0.012480007,  -0.0811829,   -0.07224499,   -0.09628791,
+       0.045100946,   0.0012300825,  0.013964662,  0.099372394,   0.02543059,
+       0.06958324,    0.034257296,   0.0482646,    0.06267997,    0.052625068,
+       0.12784666,    0.07077897,    0.025725935,  0.04165009,    0.07241905,
+       0.018668644,   -0.037377294,  -0.06277783,  -0.08833636,   -0.040120605,
+       -0.011405586,  -0.007808335,  -0.010301386, -0.005102167,  0.027717464,
+       0.05483423,    0.11449111,    0.11289652,   0.10939839,    0.13396506,
+       -0.08402166,   -0.01901462,   -0.044678304, -0.07720565,   0.014350063,
+       -0.11757958,   -0.0652038,    -0.08185733,  -0.076754324,  -0.092614375,
+       0.10405491,    0.052960336,   0.035755895,  0.035839386,   -0.012540553,
+       0.036881298,   0.02913376,    0.03420159,   0.05448447,    -0.054523353,
+       0.02582715,    0.02327355,    -0.011857179, -0.0011980024, -0.034641717,
+       -0.026125094,  -0.17582615,   -0.15923657,  -0.27486774,   -0.0006143371,
+       0.0001771948,  -8.470171e-05, 0.02651807,   0.045790765,   0.06956496});
+
+  lstm.SetInputToCellWeights(
+      {-0.04580283,   -0.09549462,   -0.032418985,  -0.06454633,
+       -0.043528453,  0.043018587,   -0.049152344,  -0.12418144,
+       -0.078985475,  -0.07596889,   0.019484362,   -0.11434962,
+       -0.0074034138, -0.06314844,   -0.092981495,  0.0062155537,
+       -0.025034338,  -0.0028890965, 0.048929527,   0.06235075,
+       0.10665918,    -0.032036792,  -0.08505916,   -0.10843358,
+       -0.13002433,   -0.036816437,  -0.02130134,   -0.016518239,
+       0.0047691227,  -0.0025825808, 0.066017866,   0.029991534,
+       -0.10652836,   -0.1037554,    -0.13056071,   -0.03266643,
+       -0.033702414,  -0.006473424,  -0.04611692,   0.014419339,
+       -0.025174323,  0.0396852,     0.081777506,   0.06157468,
+       0.10210095,    -0.009658194,  0.046511717,   0.03603906,
+       0.0069369148,  0.015960095,   -0.06507666,   0.09551598,
+       0.053568836,   0.06408714,    0.12835667,    -0.008714329,
+       -0.20211966,   -0.12093674,   0.029450472,   0.2849013,
+       -0.029227901,  0.1164364,     -0.08560263,   0.09941786,
+       -0.036999565,  -0.028842626,  -0.0033637602, -0.017012902,
+       -0.09720865,   -0.11193351,   -0.029155117,  -0.017936034,
+       -0.009768936,  -0.04223324,   -0.036159635,  0.06505112,
+       -0.021742892,  -0.023377212,  -0.07221364,   -0.06430552,
+       0.05453865,    0.091149814,   0.06387331,    0.007518393,
+       0.055960953,   0.069779344,   0.046411168,   0.10509911,
+       0.07463894,    0.0075130584,  0.012850982,   0.04555431,
+       0.056955688,   0.06555285,    0.050801456,   -0.009862683,
+       0.00826772,    -0.026555609,  -0.0073611983, -0.0014897042});
+
+  lstm.SetInputToOutputWeights(
+      {-0.0998932,   -0.07201956,  -0.052803773,  -0.15629593,  -0.15001918,
+       -0.07650751,  0.02359855,   -0.075155355,  -0.08037709,  -0.15093534,
+       0.029517552,  -0.04751393,  0.010350531,   -0.02664851,  -0.016839722,
+       -0.023121163, 0.0077019283, 0.012851257,   -0.05040649,  -0.0129761,
+       -0.021737747, -0.038305793, -0.06870586,   -0.01481247,  -0.001285394,
+       0.10124236,   0.083122835,  0.053313006,   -0.062235646, -0.075637154,
+       -0.027833903, 0.029774971,  0.1130802,     0.09218906,   0.09506135,
+       -0.086665764, -0.037162706, -0.038880914,  -0.035832845, -0.014481564,
+       -0.09825003,  -0.12048569,  -0.097665586,  -0.05287633,  -0.0964047,
+       -0.11366429,  0.035777505,  0.13568819,    0.052451383,  0.050649304,
+       0.05798951,   -0.021852335, -0.099848844,  0.014740475,  -0.078897946,
+       0.04974699,   0.014160473,  0.06973932,    0.04964942,   0.033364646,
+       0.08190124,   0.025535367,  0.050893165,   0.048514254,  0.06945813,
+       -0.078907564, -0.06707616,  -0.11844508,   -0.09986688,  -0.07509403,
+       0.06263226,   0.14925587,   0.20188436,    0.12098451,   0.14639415,
+       0.0015017595, -0.014267382, -0.03417257,   0.012711468,  0.0028300495,
+       -0.024758482, -0.05098548,  -0.0821182,    0.014225672,  0.021544158,
+       0.08949725,   0.07505268,   -0.0020780868, 0.04908258,   0.06476295,
+       -0.022907063, 0.027562456,  0.040185735,   0.019567577,  -0.015598739,
+       -0.049097303, -0.017121866, -0.083368234,  -0.02332002,  -0.0840956});
+
+  lstm.SetInputGateBias(
+      {0.02234832,  0.14757581,   0.18176508,  0.10380666,  0.053110216,
+       -0.06928846, -0.13942584,  -0.11816189, 0.19483899,  0.03652339,
+       -0.10250295, 0.036714908,  -0.18426876, 0.036065217, 0.21810818,
+       0.02383196,  -0.043370757, 0.08690144,  -0.04444982, 0.00030581196});
+
+  lstm.SetForgetGateBias({0.035185695, -0.042891346, -0.03032477, 0.23027696,
+                          0.11098921,  0.15378423,   0.09263801,  0.09790885,
+                          0.09508917,  0.061199076,  0.07665568,  -0.015443159,
+                          -0.03499149, 0.046190713,  0.08895977,  0.10899629,
+                          0.40694186,  0.06030037,   0.012413437, -0.06108739});
+
+  lstm.SetCellBias({-0.024379363, 0.0055531194, 0.23377132,   0.033463873,
+                    -0.1483596,   -0.10639995,  -0.091433935, 0.058573797,
+                    -0.06809782,  -0.07889636,  -0.043246906, -0.09829136,
+                    -0.4279842,   0.034901652,  0.18797937,   0.0075234566,
+                    0.016178843,  0.1749513,    0.13975595,   0.92058027});
+
+  lstm.SetOutputGateBias(
+      {0.046159424,  -0.0012809046, 0.03563469,   0.12648113, 0.027195795,
+       0.35373217,   -0.018957434,  0.008907322,  -0.0762701, 0.12018895,
+       0.04216877,   0.0022856654,  0.040952638,  0.3147856,  0.08225149,
+       -0.057416286, -0.14995944,   -0.008040261, 0.13208859, 0.029760877});
+
+  lstm.SetRecurrentToInputWeights(
+      {-0.001374326,   -0.078856036,   0.10672688,    0.029162422,
+       -0.11585556,    0.02557986,     -0.13446963,   -0.035785314,
+       -0.01244275,    0.025961924,    -0.02337298,   -0.044228926,
+       -0.055839065,   -0.046598054,   -0.010546039,  -0.06900766,
+       0.027239809,    0.022582639,    -0.013296484,  -0.05459212,
+       0.08981,        -0.045407712,   0.08682226,    -0.06867011,
+       -0.14390695,    -0.02916037,    0.000996957,   0.091420636,
+       0.14283475,     -0.07390571,    -0.06402044,   0.062524505,
+       -0.093129106,   0.04860203,     -0.08364217,   -0.08119002,
+       0.009352075,    0.22920375,     0.0016303885,  0.11583097,
+       -0.13732095,    0.012405723,    -0.07551853,   0.06343048,
+       0.12162708,     -0.031923793,   -0.014335606,  0.01790974,
+       -0.10650317,    -0.0724401,     0.08554849,    -0.05727212,
+       0.06556731,     -0.042729504,   -0.043227166,  0.011683251,
+       -0.013082158,   -0.029302018,   -0.010899579,  -0.062036745,
+       -0.022509435,   -0.00964907,    -0.01567329,   0.04260106,
+       -0.07787477,    -0.11576462,    0.017356863,   0.048673786,
+       -0.017577527,   -0.05527947,    -0.082487635,  -0.040137455,
+       -0.10820036,    -0.04666372,    0.022746278,   -0.07851417,
+       0.01068115,     0.032956902,    0.022433773,   0.0026891115,
+       0.08944216,     -0.0685835,     0.010513544,   0.07228705,
+       0.02032331,     -0.059686817,   -0.0005566496, -0.086984694,
+       0.040414046,    -0.1380399,     0.094208956,   -0.05722982,
+       0.012092817,    -0.04989123,    -0.086576,     -0.003399834,
+       -0.04696032,    -0.045747425,   0.10091314,    0.048676282,
+       -0.029037097,   0.031399418,    -0.0040285117, 0.047237843,
+       0.09504992,     0.041799378,    -0.049185462,  -0.031518843,
+       -0.10516937,    0.026374253,    0.10058866,    -0.0033195973,
+       -0.041975245,   0.0073591834,   0.0033782164,  -0.004325073,
+       -0.10167381,    0.042500053,    -0.01447153,   0.06464186,
+       -0.017142897,   0.03312627,     0.009205989,   0.024138335,
+       -0.011337001,   0.035530265,    -0.010912711,  0.0706555,
+       -0.005894094,   0.051841937,    -0.1401738,    -0.02351249,
+       0.0365468,      0.07590991,     0.08838724,    0.021681072,
+       -0.10086113,    0.019608743,    -0.06195883,   0.077335775,
+       0.023646897,    -0.095322326,   0.02233014,    0.09756986,
+       -0.048691444,   -0.009579111,   0.07595467,    0.11480546,
+       -0.09801813,    0.019894179,    0.08502348,    0.004032281,
+       0.037211012,    0.068537936,    -0.048005626,  -0.091520436,
+       -0.028379958,   -0.01556313,    0.06554592,    -0.045599163,
+       -0.01672207,    -0.020169014,   -0.011877351,  -0.20212261,
+       0.010889619,    0.0047078193,   0.038385306,   0.08540671,
+       -0.017140968,   -0.0035865551,  0.016678626,   0.005633034,
+       0.015963363,    0.00871737,     0.060130805,   0.028611384,
+       0.10109069,     -0.015060172,   -0.07894427,   0.06401885,
+       0.011584063,    -0.024466386,   0.0047652307,  -0.09041358,
+       0.030737216,    -0.0046374933,  0.14215417,    -0.11823516,
+       0.019899689,    0.006106124,    -0.027092824,  0.0786356,
+       0.05052217,     -0.058925,      -0.011402121,  -0.024987547,
+       -0.0013661642,  -0.06832946,    -0.015667673,  -0.1083353,
+       -0.00096863037, -0.06988685,    -0.053350925,  -0.027275559,
+       -0.033664223,   -0.07978348,    -0.025200296,  -0.017207067,
+       -0.058403496,   -0.055697463,   0.005798788,   0.12965427,
+       -0.062582195,   0.0013350133,   -0.10482091,   0.0379771,
+       0.072521195,    -0.0029455067,  -0.13797039,   -0.03628521,
+       0.013806405,    -0.017858358,   -0.01008298,   -0.07700066,
+       -0.017081132,   0.019358726,    0.0027079724,  0.004635139,
+       0.062634714,    -0.02338735,    -0.039547626,  -0.02050681,
+       0.03385117,     -0.083611414,   0.002862572,   -0.09421313,
+       0.058618143,    -0.08598433,    0.00972939,    0.023867095,
+       -0.053934585,   -0.023203006,   0.07452513,    -0.048767887,
+       -0.07314807,    -0.056307215,   -0.10433547,   -0.06440842,
+       0.04328182,     0.04389765,     -0.020006588,  -0.09076438,
+       -0.11652589,    -0.021705797,   0.03345259,    -0.010329105,
+       -0.025767034,   0.013057034,    -0.07316461,   -0.10145612,
+       0.06358255,     0.18531723,     0.07759293,    0.12006465,
+       0.1305557,      0.058638252,    -0.03393652,   0.09622831,
+       -0.16253184,    -2.4580743e-06, 0.079869635,   -0.070196845,
+       -0.005644518,   0.06857898,     -0.12598175,   -0.035084512,
+       0.03156317,     -0.12794146,    -0.031963028,  0.04692781,
+       0.030070418,    0.0071660685,   -0.095516115,  -0.004643372,
+       0.040170413,    -0.062104587,   -0.0037324072, 0.0554317,
+       0.08184801,     -0.019164372,   0.06791302,    0.034257166,
+       -0.10307039,    0.021943003,    0.046745934,   0.0790918,
+       -0.0265588,     -0.007824208,   0.042546265,   -0.00977924,
+       -0.0002440307,  -0.017384544,   -0.017990116,  0.12252321,
+       -0.014512694,   -0.08251313,    0.08861942,    0.13589665,
+       0.026351685,    0.012641483,    0.07466548,    0.044301085,
+       -0.045414884,   -0.051112458,   0.03444247,    -0.08502782,
+       -0.04106223,    -0.028126027,   0.028473156,   0.10467447});
+
+  lstm.SetRecurrentToForgetWeights(
+      {-0.057784554,  -0.026057621,  -0.068447545,   -0.022581743,
+       0.14811787,    0.10826372,    0.09471067,     0.03987225,
+       -0.0039523416, 0.00030638507, 0.053185795,    0.10572994,
+       0.08414449,    -0.022036452,  -0.00066928595, -0.09203576,
+       0.032950465,   -0.10985798,   -0.023809856,   0.0021431844,
+       -0.02196096,   -0.00326074,   0.00058621005,  -0.074678116,
+       -0.06193199,   0.055729095,   0.03736828,     0.020123724,
+       0.061878487,   -0.04729229,   0.034919553,    -0.07585433,
+       -0.04421272,   -0.044019096,  0.085488975,    0.04058006,
+       -0.06890133,   -0.030951202,  -0.024628663,   -0.07672815,
+       0.034293607,   0.08556707,    -0.05293577,    -0.033561368,
+       -0.04899627,   0.0241671,     0.015736353,    -0.095442444,
+       -0.029564252,  0.016493602,   -0.035026584,   0.022337519,
+       -0.026871363,  0.004780428,   0.0077918363,   -0.03601621,
+       0.016435321,   -0.03263031,   -0.09543275,    -0.047392778,
+       0.013454138,   0.028934088,   0.01685226,     -0.086110644,
+       -0.046250615,  -0.01847454,   0.047608484,    0.07339695,
+       0.034546845,   -0.04881143,   0.009128804,    -0.08802852,
+       0.03761666,    0.008096139,   -0.014454086,   0.014361001,
+       -0.023502491,  -0.0011840804, -0.07607001,    0.001856849,
+       -0.06509276,   -0.006021153,  -0.08570962,    -0.1451793,
+       0.060212336,   0.055259194,   0.06974018,     0.049454916,
+       -0.027794661,  -0.08077226,   -0.016179763,   0.1169753,
+       0.17213494,    -0.0056326236, -0.053934924,   -0.0124349,
+       -0.11520337,   0.05409887,    0.088759385,    0.0019655675,
+       0.0042065294,  0.03881498,    0.019844765,    0.041858196,
+       -0.05695512,   0.047233116,   0.038937137,    -0.06542224,
+       0.014429736,   -0.09719407,   0.13908425,     -0.05379757,
+       0.012321099,   0.082840554,   -0.029899208,   0.044217527,
+       0.059855383,   0.07711018,    -0.045319796,   0.0948846,
+       -0.011724666,  -0.0033288454, -0.033542685,   -0.04764985,
+       -0.13873616,   0.040668588,   0.034832682,    -0.015319203,
+       -0.018715994,  0.046002675,   0.0599172,      -0.043107376,
+       0.0294216,     -0.002314414,  -0.022424703,   0.0030315618,
+       0.0014641669,  0.0029166266,  -0.11878115,    0.013738511,
+       0.12375372,    -0.0006038222, 0.029104086,    0.087442465,
+       0.052958444,   0.07558703,    0.04817258,     0.044462286,
+       -0.015213451,  -0.08783778,   -0.0561384,     -0.003008196,
+       0.047060397,   -0.002058388,  0.03429439,     -0.018839769,
+       0.024734668,   0.024614193,   -0.042046934,   0.09597743,
+       -0.0043254104, 0.04320769,    0.0064070094,   -0.0019131786,
+       -0.02558259,   -0.022822596,  -0.023273505,   -0.02464396,
+       -0.10991725,   -0.006240552,  0.0074488563,   0.024044557,
+       0.04383914,    -0.046476185,  0.028658995,    0.060410924,
+       0.050786525,   0.009452605,   -0.0073054377,  -0.024810238,
+       0.0052906186,  0.0066939713,  -0.0020913032,  0.014515517,
+       0.015898481,   0.021362653,   -0.030262267,   0.016587038,
+       -0.011442813,  0.041154444,   -0.007631438,   -0.03423484,
+       -0.010977775,  0.036152758,   0.0066366293,   0.11915515,
+       0.02318443,    -0.041350313,  0.021485701,    -0.10906167,
+       -0.028218046,  -0.00954771,   0.020531068,    -0.11995105,
+       -0.03672871,   0.024019798,   0.014255957,    -0.05221243,
+       -0.00661567,   -0.04630967,   0.033188973,    0.10107534,
+       -0.014027541,  0.030796422,   -0.10270911,    -0.035999842,
+       0.15443139,    0.07684145,    0.036571592,    -0.035900835,
+       -0.0034699554, 0.06209149,    0.015920248,    -0.031122351,
+       -0.03858649,   0.01849943,    0.13872518,     0.01503974,
+       0.069941424,   -0.06948533,   -0.0088794185,  0.061282158,
+       -0.047401894,  0.03100163,    -0.041533746,   -0.10430945,
+       0.044574402,   -0.01425562,   -0.024290353,   0.034563623,
+       0.05866852,    0.023947537,   -0.09445152,    0.035450947,
+       0.02247216,    -0.0042998926, 0.061146557,    -0.10250651,
+       0.020881841,   -0.06747029,   0.10062043,     -0.0023941975,
+       0.03532124,    -0.016341697,  0.09685456,     -0.016764693,
+       0.051808182,   0.05875331,    -0.04536488,    0.001626336,
+       -0.028892258,  -0.01048663,   -0.009793449,   -0.017093895,
+       0.010987891,   0.02357273,    -0.00010856845, 0.0099760275,
+       -0.001845119,  -0.03551521,   0.0018358806,   0.05763657,
+       -0.01769146,   0.040995963,   0.02235177,     -0.060430344,
+       0.11475477,    -0.023854522,  0.10071741,     0.0686208,
+       -0.014250481,  0.034261297,   0.047418304,    0.08562733,
+       -0.030519066,  0.0060542435,  0.014653856,    -0.038836084,
+       0.04096551,    0.032249358,   -0.08355519,    -0.026823482,
+       0.056386515,   -0.010401743,  -0.028396193,   0.08507674,
+       0.014410365,   0.020995233,   0.17040324,     0.11511526,
+       0.02459721,    0.0066619175,  0.025853224,    -0.023133837,
+       -0.081302024,  0.017264642,   -0.009585969,   0.09491168,
+       -0.051313367,  0.054532815,   -0.014298593,   0.10657464,
+       0.007076659,   0.10964551,    0.0409152,      0.008275321,
+       -0.07283536,   0.07937492,    0.04192024,     -0.1075027});
+
+  lstm.SetRecurrentToCellWeights(
+      {-0.037322544,   0.018592842,   0.0056175636,  -0.06253426,
+       0.055647098,    -0.05713207,   -0.05626563,   0.005559383,
+       0.03375411,     -0.025757805,  -0.088049285,  0.06017052,
+       -0.06570978,    0.007384076,   0.035123326,   -0.07920549,
+       0.053676967,    0.044480428,   -0.07663568,   0.0071805613,
+       0.08089997,     0.05143358,    0.038261272,   0.03339287,
+       -0.027673481,   0.044746667,   0.028349208,   0.020090483,
+       -0.019443132,   -0.030755889,  -0.0040000007, 0.04465846,
+       -0.021585021,   0.0031670958,  0.0053199246,  -0.056117613,
+       -0.10893326,    0.076739706,   -0.08509834,   -0.027997585,
+       0.037871376,    0.01449768,    -0.09002357,   -0.06111149,
+       -0.046195522,   0.0422062,     -0.005683705,  -0.1253618,
+       -0.012925729,   -0.04890792,   0.06985068,    0.037654128,
+       0.03398274,     -0.004781977,  0.007032333,   -0.031787455,
+       0.010868644,    -0.031489216,  0.09525667,    0.013939797,
+       0.0058680447,   0.0167067,     0.02668468,    -0.04797466,
+       -0.048885044,   -0.12722108,   0.035304096,   0.06554885,
+       0.00972396,     -0.039238118,  -0.05159735,   -0.11329045,
+       0.1613692,      -0.03750952,   0.06529313,    -0.071974665,
+       -0.11769596,    0.015524369,   -0.0013754242, -0.12446318,
+       0.02786344,     -0.014179351,  0.005264273,   0.14376344,
+       0.015983658,    0.03406988,    -0.06939408,   0.040699873,
+       0.02111075,     0.09669095,    0.041345075,   -0.08316494,
+       -0.07684199,    -0.045768797,  0.032298047,   -0.041805092,
+       0.0119405,      0.0061010392,  0.12652606,    0.0064572375,
+       -0.024950314,   0.11574242,    0.04508852,    -0.04335324,
+       0.06760663,     -0.027437469,  0.07216407,    0.06977076,
+       -0.05438599,    0.034033038,   -0.028602652,  0.05346137,
+       0.043184172,    -0.037189785,  0.10420091,    0.00882477,
+       -0.054019816,   -0.074273005,  -0.030617684,  -0.0028467078,
+       0.024302477,    -0.0038869337, 0.005332455,   0.0013399826,
+       0.04361412,     -0.007001822,  0.09631092,    -0.06702025,
+       -0.042049985,   -0.035070654,  -0.04103342,   -0.10273396,
+       0.0544271,      0.037184782,   -0.13150354,   -0.0058036847,
+       -0.008264958,   0.042035464,   0.05891794,    0.029673764,
+       0.0063542654,   0.044788733,   0.054816857,   0.062257513,
+       -0.00093483756, 0.048938446,   -0.004952862,  -0.007730018,
+       -0.04043371,    -0.017094059,  0.07229206,    -0.023670016,
+       -0.052195564,   -0.025616996,  -0.01520939,   0.045104615,
+       -0.007376126,   0.003533447,   0.006570588,   0.056037236,
+       0.12436656,     0.051817212,   0.028532185,   -0.08686856,
+       0.11868599,     0.07663395,    -0.07323171,   0.03463402,
+       -0.050708205,   -0.04458982,   -0.11590894,   0.021273347,
+       0.1251325,      -0.15313013,   -0.12224372,   0.17228661,
+       0.023029093,    0.086124025,   0.006445803,   -0.03496501,
+       0.028332196,    0.04449512,    -0.042436164,  -0.026587414,
+       -0.006041347,   -0.09292539,   -0.05678812,   0.03897832,
+       0.09465633,     0.008115513,   -0.02171956,   0.08304309,
+       0.071401566,    0.019622514,   0.032163795,   -0.004167056,
+       0.02295182,     0.030739572,   0.056506045,   0.004612461,
+       0.06524936,     0.059999723,   0.046395954,   -0.0045512207,
+       -0.1335546,     -0.030136576,  0.11584653,    -0.014678886,
+       0.0020118146,   -0.09688814,   -0.0790206,    0.039770417,
+       -0.0329582,     0.07922767,    0.029322514,   0.026405897,
+       0.04207835,     -0.07073373,   0.063781224,   0.0859677,
+       -0.10925287,    -0.07011058,   0.048005477,   0.03438226,
+       -0.09606514,    -0.006669445,  -0.043381985,  0.04240257,
+       -0.06955775,    -0.06769346,   0.043903265,   -0.026784198,
+       -0.017840602,   0.024307009,   -0.040079936,  -0.019946516,
+       0.045318738,    -0.12233574,   0.026170589,   0.0074471775,
+       0.15978073,     0.10185836,    0.10298046,    -0.015476589,
+       -0.039390966,   -0.072174534,  0.0739445,     -0.1211869,
+       -0.0347889,     -0.07943156,   0.014809798,   -0.12412325,
+       -0.0030663363,  0.039695457,   0.0647603,     -0.08291318,
+       -0.018529687,   -0.004423833,  0.0037507233,  0.084633216,
+       -0.01514876,    -0.056505352,  -0.012800942,  -0.06994386,
+       0.012962922,    -0.031234352,  0.07029052,    0.016418684,
+       0.03618972,     0.055686004,   -0.08663945,   -0.017404709,
+       -0.054761406,   0.029065743,   0.052404847,   0.020238016,
+       0.0048197987,   -0.0214882,    0.07078733,    0.013016777,
+       0.06262858,     0.009184685,   0.020785125,   -0.043904778,
+       -0.0270329,     -0.03299152,   -0.060088247,  -0.015162964,
+       -0.001828936,   0.12642565,    -0.056757294,  0.013586685,
+       0.09232601,     -0.035886683,  0.06000002,    0.05229691,
+       -0.052580316,   -0.082029596,  -0.010794592,  0.012947712,
+       -0.036429964,   -0.085508935,  -0.13127148,   -0.017744139,
+       0.031502828,    0.036232427,   -0.031581745,  0.023051167,
+       -0.05325106,    -0.03421577,   0.028793324,   -0.034633752,
+       -0.009881397,   -0.043551125,  -0.018609839,  0.0019097115,
+       -0.008799762,   0.056595087,   0.0022273948,  0.055752404});
+
+  lstm.SetRecurrentToOutputWeights({
+      0.025825322,   -0.05813119,  0.09495884,   -0.045984812,   -0.01255415,
+      -0.0026479573, -0.08196161,  -0.054914974, -0.0046604523,  -0.029587349,
+      -0.044576716,  -0.07480124,  -0.082868785, 0.023254942,    0.027502948,
+      -0.0039728214, -0.08683098,  -0.08116779,  -0.014675607,   -0.037924774,
+      -0.023314456,  -0.007401714, -0.09255757,  0.029460307,    -0.08829125,
+      -0.005139627,  -0.08989442,  -0.0555066,   0.13596267,     -0.025062224,
+      -0.048351806,  -0.03850004,  0.07266485,   -0.022414139,   0.05940088,
+      0.075114764,   0.09597592,   -0.010211725, -0.0049794707,  -0.011523867,
+      -0.025980417,  0.072999895,  0.11091378,   -0.081685916,   0.014416728,
+      0.043229222,   0.034178585,  -0.07530371,  0.035837382,    -0.085607,
+      -0.007721233,  -0.03287832,  -0.043848954, -0.06404588,    -0.06632928,
+      -0.073643476,  0.008214239,  -0.045984086, 0.039764922,    0.03474462,
+      0.060612556,   -0.080590084, 0.049127717,  0.04151091,     -0.030063879,
+      0.008801774,   -0.023021035, -0.019558564, 0.05158114,     -0.010947698,
+      -0.011825728,  0.0075720972, 0.0699727,    -0.0039981045,  0.069350146,
+      0.08799282,    0.016156472,  0.035502106,  0.11695009,     0.006217345,
+      0.13392477,    -0.037875112, 0.025745004,  0.08940699,     -0.00924166,
+      0.0046702605,  -0.036598757, -0.08811812,  0.10522024,     -0.032441203,
+      0.008176899,   -0.04454919,  0.07058152,   0.0067963637,   0.039206743,
+      0.03259838,    0.03725492,   -0.09515802,  0.013326398,    -0.052055415,
+      -0.025676316,  0.03198509,   -0.015951829, -0.058556724,   0.036879618,
+      0.043357447,   0.028362012,  -0.05908629,  0.0059240665,   -0.04995891,
+      -0.019187413,  0.0276265,    -0.01628143,  0.0025863599,   0.08800015,
+      0.035250366,   -0.022165963, -0.07328642,  -0.009415526,   -0.07455109,
+      0.11690406,    0.0363299,    0.07411125,   0.042103454,    -0.009660886,
+      0.019076364,   0.018299393,  -0.046004917, 0.08891175,     0.0431396,
+      -0.026327137,  -0.051502608, 0.08979574,   -0.051670972,   0.04940282,
+      -0.07491107,   -0.021240504, 0.022596184,  -0.034280192,   0.060163025,
+      -0.058211457,  -0.051837247, -0.01349775,  -0.04639988,    -0.035936575,
+      -0.011681591,  0.064818054,  0.0073146066, -0.021745546,   -0.043124277,
+      -0.06471268,   -0.07053354,  -0.029321948, -0.05330136,    0.016933719,
+      -0.053782392,  0.13747959,   -0.1361751,   -0.11569455,    0.0033329215,
+      0.05693899,    -0.053219706, 0.063698,     0.07977434,     -0.07924483,
+      0.06936997,    0.0034815092, -0.007305279, -0.037325785,   -0.07251102,
+      -0.033633437,  -0.08677009,  0.091591336,  -0.14165086,    0.021752775,
+      0.019683983,   0.0011612234, -0.058154266, 0.049996935,    0.0288841,
+      -0.0024567875, -0.14345716,  0.010955264,  -0.10234828,    0.1183656,
+      -0.0010731248, -0.023590032, -0.072285876, -0.0724771,     -0.026382286,
+      -0.0014920527, 0.042667855,  0.0018776858, 0.02986552,     0.009814309,
+      0.0733756,     0.12289186,   0.018043943,  -0.0458958,     0.049412545,
+      0.033632483,   0.05495232,   0.036686596,  -0.013781798,   -0.010036754,
+      0.02576849,    -0.08307328,  0.010112348,  0.042521734,    -0.05869831,
+      -0.071689695,  0.03876447,   -0.13275425,  -0.0352966,     -0.023077697,
+      0.10285965,    0.084736146,  0.15568255,   -0.00040734606, 0.027835453,
+      -0.10292561,   -0.032401145, 0.10053256,   -0.026142767,   -0.08271222,
+      -0.0030240538, -0.016368777, 0.1070414,    0.042672627,    0.013456989,
+      -0.0437609,    -0.022309763, 0.11576483,   0.04108048,     0.061026827,
+      -0.0190714,    -0.0869359,   0.037901703,  0.0610107,      0.07202949,
+      0.01675338,    0.086139716,  -0.08795751,  -0.014898893,   -0.023771819,
+      -0.01965048,   0.007955471,  -0.043740474, 0.03346837,     -0.10549954,
+      0.090567775,   0.042013682,  -0.03176985,  0.12569028,     -0.02421228,
+      -0.029526481,  0.023851605,  0.031539805,  0.05292009,     -0.02344001,
+      -0.07811758,   -0.08834428,  0.10094801,   0.16594367,     -0.06861939,
+      -0.021256343,  -0.041093912, -0.06669611,  0.035498552,    0.021757556,
+      -0.09302526,   -0.015403468, -0.06614931,  -0.051798206,   -0.013874718,
+      0.03630673,    0.010412845,  -0.08077351,  0.046185967,    0.0035662893,
+      0.03541868,    -0.094149634, -0.034814864, 0.003128424,    -0.020674974,
+      -0.03944324,   -0.008110165, -0.11113267,  0.08484226,     0.043586485,
+      0.040582247,   0.0968012,    -0.065249965, -0.028036479,   0.0050708856,
+      0.0017462453,  0.0326779,    0.041296225,  0.09164146,     -0.047743853,
+      -0.015952192,  -0.034451712, 0.084197424,  -0.05347844,    -0.11768019,
+      0.085926116,   -0.08251791,  -0.045081906, 0.0948852,      0.068401024,
+      0.024856757,   0.06978981,   -0.057309967, -0.012775832,   -0.0032452994,
+      0.01977615,    -0.041040014, -0.024264973, 0.063464895,    0.05431621,
+  });
+
+  lstm.SetCellToInputWeights(
+      {0.040369894, 0.030746894,  0.24704495,  0.018586371,  -0.037586458,
+       -0.15312155, -0.11812848,  -0.11465643, 0.20259799,   0.11418174,
+       -0.10116027, -0.011334949, 0.12411352,  -0.076769054, -0.052169047,
+       0.21198851,  -0.38871562,  -0.09061183, -0.09683246,  -0.21929175});
+
+  lstm.SetCellToForgetWeights(
+      {-0.01998659,  -0.15568835,  -0.24248174,   -0.012770197, 0.041331276,
+       -0.072311886, -0.052123554, -0.0066330447, -0.043891653, 0.036225766,
+       -0.047248036, 0.021479502,  0.033189066,   0.11952997,   -0.020432774,
+       0.64658105,   -0.06650122,  -0.03467612,   0.095340036,  0.23647355});
+
+  lstm.SetCellToOutputWeights(
+      {0.08286371,  -0.08261836, -0.51210177, 0.002913762, 0.17764764,
+       -0.5495371,  -0.08460716, -0.24552552, 0.030037103, 0.04123544,
+       -0.11940523, 0.007358328, 0.1890978,   0.4833202,   -0.34441817,
+       0.36312827,  -0.26375428, 0.1457655,   -0.19724406, 0.15548733});
+
+  lstm.SetProjectionWeights(
+      {-0.009802181,  0.09401916,    0.0717386,     -0.13895074,  0.09641832,
+       0.060420845,   0.08539281,    0.054285463,   0.061395317,  0.034448683,
+       -0.042991187,  0.019801661,   -0.16840284,   -0.015726732, -0.23041931,
+       -0.024478018,  -0.10959692,   -0.013875541,  0.18600968,   -0.061274476,
+       0.0138165,     -0.08160894,   -0.07661644,   0.032372914,  0.16169067,
+       0.22465782,    -0.03993472,   -0.004017731,  0.08633481,   -0.28869787,
+       0.08682067,    0.17240396,    0.014975425,   0.056431185,  0.031037588,
+       0.16702051,    0.0077946745,  0.15140012,    0.29405436,   0.120285,
+       -0.188994,     -0.027265169,  0.043389652,   -0.022061434, 0.014777949,
+       -0.20203483,   0.094781205,   0.19100232,    0.13987629,   -0.036132768,
+       -0.06426278,   -0.05108664,   0.13221376,    0.009441198,  -0.16715929,
+       0.15859416,    -0.040437475,  0.050779544,   -0.022187516, 0.012166504,
+       0.027685808,   -0.07675938,   -0.0055694645, -0.09444123,  0.0046453946,
+       0.050794356,   0.10770313,    -0.20790008,   -0.07149004,  -0.11425117,
+       0.008225835,   -0.035802525,  0.14374903,    0.15262283,   0.048710253,
+       0.1847461,     -0.007487823,  0.11000021,    -0.09542012,  0.22619456,
+       -0.029149994,  0.08527916,    0.009043713,   0.0042746216, 0.016261552,
+       0.022461696,   0.12689082,    -0.043589946,  -0.12035478,  -0.08361797,
+       -0.050666027,  -0.1248618,    -0.1275799,    -0.071875185, 0.07377272,
+       0.09944291,    -0.18897448,   -0.1593054,    -0.06526116,  -0.040107165,
+       -0.004618631,  -0.067624845,  -0.007576253,  0.10727444,   0.041546922,
+       -0.20424393,   0.06907816,    0.050412357,   0.00724631,   0.039827548,
+       0.12449835,    0.10747581,    0.13708383,    0.09134148,   -0.12617786,
+       -0.06428341,   0.09956831,    0.1208086,     -0.14676677,  -0.0727722,
+       0.1126304,     0.010139365,   0.015571211,   -0.038128063, 0.022913318,
+       -0.042050496,  0.16842307,    -0.060597885,  0.10531834,   -0.06411776,
+       -0.07451711,   -0.03410368,   -0.13393489,   0.06534304,   0.003620307,
+       0.04490757,    0.05970546,    0.05197996,    0.02839995,   0.10434969,
+       -0.013699693,  -0.028353551,  -0.07260381,   0.047201227,  -0.024575593,
+       -0.036445823,  0.07155557,    0.009672501,   -0.02328883,  0.009533515,
+       -0.03606021,   -0.07421458,   -0.028082801,  -0.2678904,   -0.13221288,
+       0.18419984,    -0.13012612,   -0.014588381,  -0.035059117, -0.04824723,
+       0.07830115,    -0.056184657,  0.03277091,    0.025466874,  0.14494097,
+       -0.12522776,   -0.098633975,  -0.10766018,   -0.08317623,  0.08594209,
+       0.07749552,    0.039474737,   0.1776665,     -0.07409566,  -0.0477268,
+       0.29323658,    0.10801441,    0.1154011,     0.013952499,  0.10739139,
+       0.10708251,    -0.051456142,  0.0074137426,  -0.10430189,  0.10034707,
+       0.045594677,   0.0635285,     -0.0715442,    -0.089667566, -0.10811871,
+       0.00026344223, 0.08298446,    -0.009525053,  0.006585689,  -0.24567553,
+       -0.09450807,   0.09648481,    0.026996298,   -0.06419476,  -0.04752702,
+       -0.11063944,   -0.23441927,   -0.17608605,   -0.052156363, 0.067035615,
+       0.19271925,    -0.0032889997, -0.043264326,  0.09663576,   -0.057112187,
+       -0.10100678,   0.0628376,     0.04447668,    0.017961001,  -0.10094388,
+       -0.10190601,   0.18335468,    0.10494553,    -0.052095775, -0.0026118709,
+       0.10539724,    -0.04383912,   -0.042349473,  0.08438151,   -0.1947263,
+       0.02251204,    0.11216432,    -0.10307853,   0.17351969,   -0.039091777,
+       0.08066188,    -0.00561982,   0.12633002,    0.11335965,   -0.0088127935,
+       -0.019777594,  0.06864014,    -0.059751723,  0.016233567,  -0.06894641,
+       -0.28651384,   -0.004228674,  0.019708522,   -0.16305895,  -0.07468996,
+       -0.0855457,    0.099339016,   -0.07580735,   -0.13775392,  0.08434318,
+       0.08330512,    -0.12131499,   0.031935584,   0.09180414,   -0.08876437,
+       -0.08049874,   0.008753825,   0.03498998,    0.030215185,  0.03907079,
+       0.089751154,   0.029194152,   -0.03337423,   -0.019092513, 0.04331237,
+       0.04299654,    -0.036394123,  -0.12915532,   0.09793732,   0.07512415,
+       -0.11319543,   -0.032502122,  0.15661901,    0.07671967,   -0.005491124,
+       -0.19379048,   -0.218606,     0.21448623,    0.017840758,  0.1416943,
+       -0.07051762,   0.19488361,    0.02664691,    -0.18104725,  -0.09334311,
+       0.15026465,    -0.15493552,   -0.057762887,  -0.11604192,  -0.262013,
+       -0.01391798,   0.012185008,   0.11156489,    -0.07483202,  0.06693364,
+       -0.26151478,   0.046425626,   0.036540434,   -0.16435726,  0.17338543,
+       -0.21401681,   -0.11385144,   -0.08283257,   -0.069031075, 0.030635102,
+       0.010969227,   0.11109743,    0.010919218,   0.027526086,  0.13519906,
+       0.01891392,    -0.046839405,  -0.040167913,  0.017953383,  -0.09700955,
+       0.0061885654,  -0.07000971,   0.026893595,   -0.038844477, 0.14543656});
+
+  static float lstm_input[][20] = {
+      {// Batch0: 4 (input_sequence_size) * 5 (n_input)
+       0.787926, 0.151646, 0.071352, 0.118426, 0.458058, 0.596268, 0.998386,
+       0.568695, 0.864524, 0.571277, 0.073204, 0.296072, 0.743333, 0.069199,
+       0.045348, 0.867394, 0.291279, 0.013714, 0.482521, 0.626339},
+
+      {// Batch1: 4 (input_sequence_size) * 5 (n_input)
+       0.295743, 0.544053, 0.690064, 0.858138, 0.497181, 0.642421, 0.524260,
+       0.134799, 0.003639, 0.162482, 0.640394, 0.930399, 0.050782, 0.432485,
+       0.988078, 0.082922, 0.563329, 0.865614, 0.333232, 0.259916}};
+
+  static float lstm_fw_golden_output[][64] = {
+      {// Batch0: 4 (input_sequence_size) * 16 (n_output)
+       -0.00396806, 0.029352,     -0.00279226, 0.0159977,   -0.00835576,
+       -0.0211779,  0.0283512,    -0.0114597,  0.00907307,  -0.0244004,
+       -0.0152191,  -0.0259063,   0.00914318,  0.00415118,  0.017147,
+       0.0134203,   -0.0166936,   0.0381209,   0.000889694, 0.0143363,
+       -0.0328911,  -0.0234288,   0.0333051,   -0.012229,   0.0110322,
+       -0.0457725,  -0.000832209, -0.0202817,  0.0327257,   0.0121308,
+       0.0155969,   0.0312091,    -0.0213783,  0.0350169,   0.000324794,
+       0.0276012,   -0.0263374,   -0.0371449,  0.0446149,   -0.0205474,
+       0.0103729,   -0.0576349,   -0.0150052,  -0.0292043,  0.0376827,
+       0.0136115,   0.0243435,    0.0354492,   -0.0189322,  0.0464512,
+       -0.00251373, 0.0225745,    -0.0308346,  -0.0317124,  0.0460407,
+       -0.0189395,  0.0149363,    -0.0530162,  -0.0150767,  -0.0340193,
+       0.0286833,   0.00824207,   0.0264887,   0.0305169},
+      {// Batch1: 4 (input_sequence_size) * 16 (n_output)
+       -0.013869,    0.0287268,   -0.00334693, 0.00733398,  -0.0287926,
+       -0.0186926,   0.0193662,   -0.0115437,  0.00422612,  -0.0345232,
+       0.00223253,   -0.00957321, 0.0210624,   0.013331,    0.0150954,
+       0.02168,      -0.0141913,  0.0322082,   0.00227024,  0.0260507,
+       -0.0188721,   -0.0296489,  0.0399134,   -0.0160509,  0.0116039,
+       -0.0447318,   -0.0150515,  -0.0277406,  0.0316596,   0.0118233,
+       0.0214762,    0.0293641,   -0.0204549,  0.0450315,   -0.00117378,
+       0.0167673,    -0.0375007,  -0.0238314,  0.038784,    -0.0174034,
+       0.0131743,    -0.0506589,  -0.0048447,  -0.0240239,  0.0325789,
+       0.00790065,   0.0220157,   0.0333314,   -0.0264787,  0.0387855,
+       -0.000764675, 0.0217599,   -0.037537,   -0.0335206,  0.0431679,
+       -0.0211424,   0.010203,    -0.062785,   -0.00832363, -0.025181,
+       0.0412031,    0.0118723,   0.0239643,   0.0394009}};
+
+  static float lstm_combined_golden_output[][64] = {
+      {-0.022014, 0.073544,  -0.002235, 0.040068,  -0.037136, -0.052788,
+       0.075325,  -0.029378, 0.024298,  -0.07733,  -0.030674, -0.060229,
+       0.040599,  0.011608,  0.042005,  0.045977,  -0.039225, 0.076294,
+       0.000735,  0.032852,  -0.069869, -0.053312, 0.073527,  -0.028136,
+       0.021585,  -0.102679, -0.004327, -0.043304, 0.072861,  0.027077,
+       0.034558,  0.068292,  -0.036292, 0.069832,  -0.003032, 0.053829,
+       -0.043821, -0.072713, 0.085029,  -0.040374, 0.020014,  -0.104521,
+       -0.034504, -0.059759, 0.062569,  0.025652,  0.049306,  0.061189,
+       -0.025146, 0.079643,  -0.005188, 0.033080,  -0.048079, -0.048082,
+       0.069369,  -0.028900, 0.024572,  -0.077547, -0.022517, -0.054477,
+       0.038857,  0.013336,  0.043234,  0.044788},
+      {-0.039186, 0.070792,  -0.005913, 0.02642,   -0.068274, -0.05022,
+       0.061444,  -0.031241, 0.014996,  -0.094544, -0.004146, -0.03464,
+       0.058981,  0.026097,  0.039781,  0.058408,  -0.031887, 0.069252,
+       0.00576,   0.054062,  -0.042801, -0.059974, 0.085272,  -0.034453,
+       0.026097,  -0.0959,   -0.031164, -0.058699, 0.06839,   0.020512,
+       0.044727,  0.063609,  -0.039863, 0.084819,  -0.003909, 0.028666,
+       -0.075677, -0.045125, 0.070379,  -0.033895, 0.022111,  -0.097184,
+       -0.004921, -0.040851, 0.062316,  0.017435,  0.041437,  0.064568,
+       -0.039656, 0.060726,  -0.003402, 0.036854,  -0.056503, -0.058554,
+       0.068588,  -0.034879, 0.01352,   -0.09962,  -0.01434,  -0.039505,
+       0.065133,  0.024321,  0.038473,  0.062438}};
+
+  const int input_sequence_size = lstm.sequence_length() * lstm.num_inputs();
+  EXPECT_EQ(input_sequence_size, 20);
+  float* batch0_start = lstm_input[0];
+  float* batch0_end = batch0_start + input_sequence_size;
+  lstm.SetInput(0, batch0_start, batch0_end);
+
+  float* batch1_start = lstm_input[1];
+  float* batch1_end = batch1_start + input_sequence_size;
+  lstm.SetInput(input_sequence_size, batch1_start, batch1_end);
+
+  lstm.Invoke();
+
+  const int output_sequence_size =
+      lstm.sequence_length() * lstm.num_fw_outputs();
+  EXPECT_EQ(output_sequence_size, 64);
+  std::vector<float> expected;
+  const float* golden_start_batch0 = lstm_fw_golden_output[0];
+  const float* golden_end_batch0 = golden_start_batch0 + output_sequence_size;
+  expected.insert(expected.end(), golden_start_batch0, golden_end_batch0);
+
+  const float* golden_start_batch1 = lstm_fw_golden_output[1];
+  const float* golden_end_batch1 = golden_start_batch1 + output_sequence_size;
+  expected.insert(expected.end(), golden_start_batch1, golden_end_batch1);
+  EXPECT_THAT(lstm.GetFwOutput(), ElementsAreArray(ArrayFloatNear(expected)));
+
+  // Check if the sum of forward backward matches the golden.
+  expected.clear();
+  golden_start_batch0 = lstm_combined_golden_output[0];
+  golden_end_batch0 = golden_start_batch0 + output_sequence_size;
+  expected.insert(expected.end(), golden_start_batch0, golden_end_batch0);
+
+  golden_start_batch1 = lstm_combined_golden_output[1];
+  golden_end_batch1 = golden_start_batch1 + output_sequence_size;
+  expected.insert(expected.end(), golden_start_batch1, golden_end_batch1);
+
+  std::vector<float> combined;
+  for (int i = 0; i < lstm.GetFwOutput().size(); ++i) {
+    combined.push_back(lstm.GetFwOutput()[i] + lstm.GetBwOutput()[i]);
+  }
+  EXPECT_THAT(combined, ElementsAreArray(ArrayFloatNear(expected)));
+}
+
+// Same as the no cifg no peephole no projection no clipping test, but have an
+// aux input (without aux input weights), this is the case when stacking but no
+// cross-links.
+TEST_P(LSTMOpTest, BlackBoxTestWithAuxInput) {
+  const int n_batch = 1;
+  const int n_input = 2;
+  // n_cell and n_output have the same size when there is no projection.
+  const int n_cell = 4;
+  const int n_output = 4;
+  const int sequence_length = 3;
+  const bool quantize_weights = GetParam();
+
+  BidirectionalLSTMOpModel lstm(
+      n_batch, n_input, n_cell, n_output, sequence_length, /*use_cifg=*/false,
+      /*use_peephole=*/false, /*use_projection_weights=*/false,
+      /*use_projection_bias=*/false, /*merge_outputs=*/false,
+      /*use_aux_input=*/true, /*cell_clip=*/0.0,
+      /*proj_clip=*/0.0, quantize_weights, /*time_major=*/true,
+      {
+          {sequence_length, n_batch, n_input},  // input tensor
+
+          // Forward cell
+          {n_cell, n_input},  // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {n_cell, n_output},  // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {0},  // cell_to_input_weight tensor
+          {0},  // cell_to_forget_weight tensor
+          {0},  // cell_to_output_weight tensor
+
+          {n_cell},  // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {0, 0},  // projection_weight tensor
+          {0},     // projection_bias tensor
+
+          // Backward cell
+          {n_cell, n_input},  // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {n_cell, n_output},  // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {0},  // cell_to_input_weight tensor
+          {0},  // cell_to_forget_weight tensor
+          {0},  // cell_to_output_weight tensor
+
+          {n_cell},  // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {0, 0},  // projection_weight tensor
+          {0},     // projection_bias tensor
+
+          {n_batch, n_output},  // activation_state tensor
+          {n_batch, n_cell},    // cell_state tensor
+
+          {n_batch, n_output},  // activation_state tensor
+          {n_batch, n_cell},    // cell_state tensor
+
+          // TODO(b/121134029): Update tests so tensor shapes after state tensor
+          // are used. They are currently ignored by test_util.
+          {sequence_length, n_batch, n_input},  // aux_input tensor
+          {n_cell, 0},                          // aux_fw_input_to_input tensor
+          {n_cell, 0},                          // aux_fw_input_to_forget tensor
+          {n_cell, 0},                          // aux_fw_input_to_cell tensor
+          {n_cell, 0},                          // aux_fw_input_to_output tensor
+          {n_cell, 0},                          // aux_bw_input_to_input tensor
+          {n_cell, 0},                          // aux_bw_input_to_forget tensor
+          {n_cell, 0},                          // aux_bw_input_to_cell tensor
+          {n_cell, 0},                          // aux_bw_input_to_output tensor
+      });
+
+  lstm.SetInputToInputWeights({-0.45018822, -0.02338299, -0.0870589,
+                               -0.34550029, 0.04266912, -0.15680569,
+                               -0.34856534, 0.43890524});
+
+  lstm.SetInputToCellWeights({-0.50013041, 0.1370284, 0.11810488, 0.2013163,
+                              -0.20583314, 0.44344562, 0.22077113,
+                              -0.29909778});
+
+  lstm.SetInputToForgetWeights({0.09701663, 0.20334584, -0.50592935,
+                                -0.31343272, -0.40032279, 0.44781327,
+                                0.01387155, -0.35593212});
+
+  lstm.SetInputToOutputWeights({-0.25065863, -0.28290087, 0.04613829,
+                                0.40525138, 0.44272184, 0.03897077, -0.1556896,
+                                0.19487578});
+
+  lstm.SetInputGateBias({0., 0., 0., 0.});
+
+  lstm.SetCellBias({0., 0., 0., 0.});
+
+  lstm.SetForgetGateBias({1., 1., 1., 1.});
+
+  lstm.SetOutputGateBias({0., 0., 0., 0.});
+
+  lstm.SetRecurrentToInputWeights(
+      {-0.0063535, -0.2042388, 0.31454784, -0.35746509, 0.28902304, 0.08183324,
+       -0.16555229, 0.02286911, -0.13566875, 0.03034258, 0.48091322,
+       -0.12528998, 0.24077177, -0.51332325, -0.33502164, 0.10629296});
+
+  lstm.SetRecurrentToCellWeights(
+      {-0.3407414, 0.24443203, -0.2078532, 0.26320225, 0.05695659, -0.00123841,
+       -0.4744786, -0.35869038, -0.06418842, -0.13502428, -0.501764, 0.22830659,
+       -0.46367589, 0.26016325, -0.03894562, -0.16368064});
+
+  lstm.SetRecurrentToForgetWeights(
+      {-0.48684245, -0.06655136, 0.42224967, 0.2112639, 0.27654213, 0.20864892,
+       -0.07646349, 0.45877004, 0.00141793, -0.14609534, 0.36447752, 0.09196436,
+       0.28053468, 0.01560611, -0.20127171, -0.01140004});
+
+  lstm.SetRecurrentToOutputWeights(
+      {0.43385774, -0.17194885, 0.2718237, 0.09215671, 0.24107647, -0.39835793,
+       0.18212086, 0.01301402, 0.48572797, -0.50656658, 0.20047462, -0.20607421,
+       -0.51818722, -0.15390486, 0.0468148, 0.39922136});
+
+  // Input should have n_input * sequence_length many values.
+  static float lstm_input[] = {2., 3., 3., 4., 1., 1.};
+  static float lstm_fw_golden_output[] = {
+      -0.02973187, 0.1229473,  0.20885126, -0.15358765,
+      -0.03716109, 0.12507336, 0.41193449, -0.20860538,
+      -0.15053082, 0.09120187, 0.24278517, -0.12222792};
+  static float lstm_bw_golden_output[] = {
+      -0.0806187, 0.139077, 0.400476,   -0.197842, -0.0332076, 0.123838,
+      0.309777,   -0.17621, -0.0490733, 0.0739237, 0.067706,   -0.0208124};
+
+  float* batch0_start = lstm_input;
+  float* batch0_end = batch0_start + lstm.num_inputs() * lstm.sequence_length();
+
+  lstm.SetInput(0, batch0_start, batch0_end);
+  // Aux input and input are the same, so we should observe the same outputs
+  // as there's no aux input.
+  lstm.SetAuxInput(0, batch0_start, batch0_end);
+
+  lstm.Invoke();
+
+  float* fw_golden_start = lstm_fw_golden_output;
+  float* fw_golden_end =
+      fw_golden_start + lstm.num_fw_outputs() * lstm.sequence_length();
+  std::vector<float> fw_expected;
+  fw_expected.insert(fw_expected.end(), fw_golden_start, fw_golden_end);
+  EXPECT_THAT(lstm.GetFwOutput(),
+              ElementsAreArray(
+                  ArrayFloatNear(fw_expected, quantize_weights ? 1e-2 : 1e-5)));
+
+  float* bw_golden_start = lstm_bw_golden_output;
+  float* bw_golden_end =
+      bw_golden_start + lstm.num_bw_outputs() * lstm.sequence_length();
+  std::vector<float> bw_expected;
+  bw_expected.insert(bw_expected.end(), bw_golden_start, bw_golden_end);
+  EXPECT_THAT(lstm.GetBwOutput(),
+              ElementsAreArray(
+                  ArrayFloatNear(bw_expected, quantize_weights ? 1e-2 : 1e-5)));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/bidirectional_sequence_rnn.cc b/tensorflow/lite/kernels/bidirectional_sequence_rnn.cc
index 5194c2463092eedd41f634dda8b8db201b03e699..0adf574bb0641b2ddd2774f1563a92a66023f7a2 100644
--- a/tensorflow/lite/kernels/bidirectional_sequence_rnn.cc
+++ b/tensorflow/lite/kernels/bidirectional_sequence_rnn.cc
@@ -31,6 +31,18 @@ namespace ops {
 namespace builtin {
 namespace bidirectional_sequence_rnn {
 
+namespace {
+
+int8_t* GetInt8DataPtr(const TfLiteTensor* tensor, const bool is_uint8) {
+  if (is_uint8) {
+    return reinterpret_cast<int8_t*>(tensor->data.uint8);
+  } else {
+    return tensor->data.int8;
+  }
+}
+
+}  // namespace
+
 constexpr int kInputTensor = 0;
 // Forward and backward cell tensors.
 constexpr int kFwWeightsTensor = 1;
@@ -41,7 +53,10 @@ constexpr int kBwWeightsTensor = 5;
 constexpr int kBwRecurrentWeightsTensor = 6;
 constexpr int kBwBiasTensor = 7;
 constexpr int kBwHiddenStateTensor = 8;
-// Auxiliary inputs.
+// Used as auxiliary input and weights when stacking for
+// tf.contrib.rnn.stack_bidirectional_rnn case (with cross links); Used as input
+// to the backward cell when stacking for tf.nn.static_bidirectional_rnn case
+// (without cross links).
 constexpr int kAuxInputTensor = 9;       // Optional.
 constexpr int kFwAuxWeightsTensor = 10;  // Optional.
 constexpr int kBwAuxWeightsTensor = 11;  // Optional.
@@ -101,13 +116,12 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* bw_aux_input_weights =
       GetOptionalInputTensor(context, node, kBwAuxWeightsTensor);
 
-  const bool aux_inputs_all_or_none =
-      ((aux_input != nullptr) && (fw_aux_input_weights != nullptr) &&
+  const bool aux_inputs_weights_or_none =
+      ((fw_aux_input_weights != nullptr) &&
        (bw_aux_input_weights != nullptr)) ||
-      ((aux_input == nullptr) && (fw_aux_input_weights == nullptr) &&
-       (bw_aux_input_weights == nullptr));
-  TF_LITE_ENSURE(context, aux_inputs_all_or_none);
-  const bool has_aux_input = (aux_input != nullptr);
+      ((fw_aux_input_weights == nullptr) && (bw_aux_input_weights == nullptr));
+  TF_LITE_ENSURE(context, aux_inputs_weights_or_none);
+  const bool has_aux_input = (fw_aux_input_weights != nullptr);
 
   // Check all the parameters of tensor match within themselves and match the
   // input configuration.
@@ -154,8 +168,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
                       bw_aux_input_weights->dims->data[1]);
   }
 
-  const bool is_hybrid_op =
-      (fw_input_weights->type == kTfLiteUInt8 && input->type == kTfLiteFloat32);
+  const bool is_hybrid_op = ((fw_input_weights->type == kTfLiteUInt8 ||
+                              fw_input_weights->type == kTfLiteInt8) &&
+                             input->type == kTfLiteFloat32);
 
   if (is_hybrid_op) {
     int* scratch_tensor_index = reinterpret_cast<int*>(node->user_data);
@@ -172,7 +187,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
         *scratch_tensor_index + kInputQuantized;
     TfLiteTensor* input_quantized =
         GetTemporary(context, node, kInputQuantized);
-    input_quantized->type = kTfLiteUInt8;
+    input_quantized->type = fw_input_weights->type;
     input_quantized->allocation_type = kTfLiteArenaRw;
     if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) {
       TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims);
@@ -184,7 +199,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
         *scratch_tensor_index + kFwHiddenStateQuantized;
     TfLiteTensor* fw_hidden_state_quantized =
         GetTemporary(context, node, kFwHiddenStateQuantized);
-    fw_hidden_state_quantized->type = kTfLiteUInt8;
+    fw_hidden_state_quantized->type = fw_input_weights->type;
     fw_hidden_state_quantized->allocation_type = kTfLiteArenaRw;
     if (!TfLiteIntArrayEqual(fw_hidden_state_quantized->dims,
                              fw_hidden_state->dims)) {
@@ -199,7 +214,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
         *scratch_tensor_index + kBwHiddenStateQuantized;
     TfLiteTensor* bw_hidden_state_quantized =
         GetTemporary(context, node, kBwHiddenStateQuantized);
-    bw_hidden_state_quantized->type = kTfLiteUInt8;
+    bw_hidden_state_quantized->type = fw_input_weights->type;
     bw_hidden_state_quantized->allocation_type = kTfLiteArenaRw;
     if (!TfLiteIntArrayEqual(bw_hidden_state_quantized->dims,
                              bw_hidden_state->dims)) {
@@ -230,7 +245,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
           *scratch_tensor_index + kAuxInputQuantized;
       TfLiteTensor* aux_input_quantized =
           GetTemporary(context, node, kAuxInputQuantized);
-      aux_input_quantized->type = kTfLiteUInt8;
+      aux_input_quantized->type = fw_input_weights->type;
       aux_input_quantized->allocation_type = kTfLiteArenaRw;
       if (!TfLiteIntArrayEqual(aux_input_quantized->dims, aux_input->dims)) {
         TfLiteIntArray* aux_input_quantized_size =
@@ -264,16 +279,19 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   return kTfLiteOk;
 }
 
-TfLiteStatus EvalFloat(
-    const TfLiteTensor* input, const TfLiteTensor* fw_input_weights,
-    const TfLiteTensor* fw_recurrent_weights, const TfLiteTensor* fw_bias,
-    const TfLiteTensor* bw_input_weights,
-    const TfLiteTensor* bw_recurrent_weights, const TfLiteTensor* bw_bias,
-    const TfLiteTensor* aux_input, const TfLiteTensor* fw_aux_input_weights,
-    const TfLiteTensor* bw_aux_input_weights,
-    const TfLiteBidirectionalSequenceRNNParams* params,
-    TfLiteTensor* fw_hidden_state, TfLiteTensor* fw_output,
-    TfLiteTensor* bw_hidden_state, TfLiteTensor* bw_output) {
+TfLiteStatus EvalFloat(const TfLiteTensor* input, const TfLiteTensor* bw_input,
+                       const TfLiteTensor* fw_input_weights,
+                       const TfLiteTensor* fw_recurrent_weights,
+                       const TfLiteTensor* fw_bias,
+                       const TfLiteTensor* bw_input_weights,
+                       const TfLiteTensor* bw_recurrent_weights,
+                       const TfLiteTensor* bw_bias,
+                       const TfLiteTensor* aux_input,
+                       const TfLiteTensor* fw_aux_input_weights,
+                       const TfLiteTensor* bw_aux_input_weights,
+                       const TfLiteBidirectionalSequenceRNNParams* params,
+                       TfLiteTensor* fw_hidden_state, TfLiteTensor* fw_output,
+                       TfLiteTensor* bw_hidden_state, TfLiteTensor* bw_output) {
   const bool time_major = params->time_major;
   const int batch_size =
       (time_major) ? input->dims->data[1] : input->dims->data[0];
@@ -326,7 +344,7 @@ TfLiteStatus EvalFloat(
     float* bw_hidden_state_ptr_batch = bw_hidden_state->data.f;
     for (int s = max_time - 1; s >= 0; s--) {
       const float* input_ptr_batch =
-          input->data.f + s * input_size * batch_size;
+          bw_input->data.f + s * input_size * batch_size;
       const float* aux_input_ptr_batch =
           (aux_input != nullptr)
               ? aux_input->data.f + s * input_size * batch_size
@@ -394,7 +412,8 @@ TfLiteStatus EvalFloat(
 }
 
 TfLiteStatus EvalHybrid(
-    const TfLiteTensor* input, const TfLiteTensor* fw_input_weights,
+    const TfLiteTensor* input, const TfLiteTensor* bw_input,
+    const TfLiteTensor* fw_input_weights,
     const TfLiteTensor* fw_recurrent_weights, const TfLiteTensor* fw_bias,
     const TfLiteTensor* bw_input_weights,
     const TfLiteTensor* bw_recurrent_weights, const TfLiteTensor* bw_bias,
@@ -406,6 +425,7 @@ TfLiteStatus EvalHybrid(
     TfLiteTensor* fw_hidden_state, TfLiteTensor* fw_output,
     TfLiteTensor* bw_hidden_state_quantized, TfLiteTensor* bw_hidden_state,
     TfLiteTensor* bw_output) {
+  const bool is_uint8_hybrid = fw_input_weights->type == kTfLiteUInt8;
   const bool time_major = params->time_major;
   const int batch_size =
       (time_major) ? input->dims->data[1] : input->dims->data[0];
@@ -417,19 +437,19 @@ TfLiteStatus EvalHybrid(
   const int fw_num_units = fw_input_weights->dims->data[0];
   const float* fw_bias_ptr = fw_bias->data.f;
   const int8_t* fw_input_weights_ptr =
-      reinterpret_cast<const int8_t*>(fw_input_weights->data.uint8);
+      GetInt8DataPtr(fw_input_weights, is_uint8_hybrid);
   float fw_input_weights_scale = fw_input_weights->params.scale;
   const int8_t* fw_recurrent_weights_ptr =
-      reinterpret_cast<const int8_t*>(fw_recurrent_weights->data.uint8);
+      GetInt8DataPtr(fw_recurrent_weights, is_uint8_hybrid);
   float fw_recurrent_weights_scale = fw_recurrent_weights->params.scale;
 
   const int bw_num_units = bw_input_weights->dims->data[0];
   const float* bw_bias_ptr = bw_bias->data.f;
   const int8_t* bw_input_weights_ptr =
-      reinterpret_cast<const int8_t*>(bw_input_weights->data.uint8);
+      GetInt8DataPtr(bw_input_weights, is_uint8_hybrid);
   float bw_input_weights_scale = bw_input_weights->params.scale;
   const int8_t* bw_recurrent_weights_ptr =
-      reinterpret_cast<const int8_t*>(bw_recurrent_weights->data.uint8);
+      GetInt8DataPtr(bw_recurrent_weights, is_uint8_hybrid);
   float bw_recurrent_weights_scale = bw_recurrent_weights->params.scale;
 
   // Set the auxiliary pointers and scales if needed.
@@ -440,21 +460,22 @@ TfLiteStatus EvalHybrid(
   int8_t* aux_quantized_input_ptr = nullptr;
   if (aux_input_size > 0) {
     aux_fw_input_weights_ptr =
-        reinterpret_cast<int8_t*>(aux_fw_input_weights->data.uint8);
+        GetInt8DataPtr(aux_fw_input_weights, is_uint8_hybrid);
     aux_fw_input_weights_scale = aux_fw_input_weights->params.scale;
     aux_bw_input_weights_ptr =
-        reinterpret_cast<int8_t*>(aux_bw_input_weights->data.uint8);
+        GetInt8DataPtr(aux_bw_input_weights, is_uint8_hybrid);
     aux_bw_input_weights_scale = aux_bw_input_weights->params.scale;
-    aux_quantized_input_ptr = reinterpret_cast<int8_t*>(aux_input_quantized);
+    aux_quantized_input_ptr =
+        GetInt8DataPtr(aux_input_quantized, is_uint8_hybrid);
   }
 
   // Initialize temporary storage for quantized values.
   int8_t* quantized_input_ptr =
-      reinterpret_cast<int8_t*>(input_quantized->data.uint8);
+      GetInt8DataPtr(input_quantized, is_uint8_hybrid);
   int8_t* fw_quantized_hidden_state_ptr =
-      reinterpret_cast<int8_t*>(fw_hidden_state_quantized->data.uint8);
+      GetInt8DataPtr(fw_hidden_state_quantized, is_uint8_hybrid);
   int8_t* bw_quantized_hidden_state_ptr =
-      reinterpret_cast<int8_t*>(bw_hidden_state_quantized->data.uint8);
+      GetInt8DataPtr(bw_hidden_state_quantized, is_uint8_hybrid);
   float* scaling_factors_ptr = scaling_factors->data.f;
 
   const int fw_output_step =
@@ -489,7 +510,7 @@ TfLiteStatus EvalHybrid(
       float* bw_hidden_state_ptr_batch = bw_hidden_state->data.f;
       for (int s = max_time - 1; s >= 0; s--) {
         const float* input_ptr_batch =
-            input->data.f + s * input_size * batch_size;
+            bw_input->data.f + s * input_size * batch_size;
         const float* aux_input_ptr_batch =
             (aux_input != nullptr)
                 ? aux_input->data.f + s * input_size * batch_size
@@ -601,14 +622,37 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
                                 ? nullptr
                                 : GetOutput(context, node, kBwOutputTensor);
 
+  const bool has_previous_bw_output = (aux_input != nullptr);
+  const bool use_aux_input = (fw_aux_input_weights != nullptr);
+
+  // We want to cover the following cases:
+  //
+  // If not stacking (not connected after other bidi lstms):
+  //   both fw & bw will just use `input`; aux_input will be null.
+  //
+  // If stacking with cross_links, TensorFlow equivalent
+  // (tf.contrib.rnn.stack_bidirectional_rnn):
+  //   both fw & bw will use `input`, but aux_input will be none null.
+  //   Note, this time, whether connected after other bidi lstms both works.
+  //
+  // If stacking without cross_links, but connected after other bidi lstms,
+  // TensorFlow equivalent (tf.nn.static_bidirectional_rnn):
+  //   fw will use `input`, bw will use aux_input, and the `real aux_input`
+  //   will be null.
+
+  const bool non_stacking_mode = !use_aux_input && has_previous_bw_output;
+  const TfLiteTensor* bw_input = non_stacking_mode ? aux_input : input;
+  const TfLiteTensor* real_aux_input = non_stacking_mode ? nullptr : aux_input;
+
   switch (fw_input_weights->type) {
     case kTfLiteFloat32:
-      return EvalFloat(input, fw_input_weights, fw_recurrent_weights, fw_bias,
-                       bw_input_weights, bw_recurrent_weights, bw_bias,
-                       aux_input, fw_aux_input_weights, bw_aux_input_weights,
-                       params, fw_hidden_state, fw_output, bw_hidden_state,
-                       bw_output);
-    case kTfLiteUInt8: {
+      return EvalFloat(input, bw_input, fw_input_weights, fw_recurrent_weights,
+                       fw_bias, bw_input_weights, bw_recurrent_weights, bw_bias,
+                       real_aux_input, fw_aux_input_weights,
+                       bw_aux_input_weights, params, fw_hidden_state, fw_output,
+                       bw_hidden_state, bw_output);
+    case kTfLiteUInt8:
+    case kTfLiteInt8: {
       TfLiteTensor* input_quantized =
           GetTemporary(context, node, kInputQuantized);
       TfLiteTensor* fw_hidden_state_quantized =
@@ -618,17 +662,16 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       TfLiteTensor* scaling_factors =
           GetTemporary(context, node, kScalingFactors);
       TfLiteTensor* aux_input_quantized =
-          (aux_input != nullptr)
-              ? GetTemporary(context, node, kAuxInputQuantized)
-              : nullptr;
-
-      return EvalHybrid(input, fw_input_weights, fw_recurrent_weights, fw_bias,
-                        bw_input_weights, bw_recurrent_weights, bw_bias,
-                        aux_input, fw_aux_input_weights, bw_aux_input_weights,
-                        params, scaling_factors, input_quantized,
-                        aux_input_quantized, fw_hidden_state_quantized,
-                        fw_hidden_state, fw_output, bw_hidden_state_quantized,
-                        bw_hidden_state, bw_output);
+          use_aux_input ? GetTemporary(context, node, kAuxInputQuantized)
+                        : nullptr;
+
+      return EvalHybrid(input, bw_input, fw_input_weights, fw_recurrent_weights,
+                        fw_bias, bw_input_weights, bw_recurrent_weights,
+                        bw_bias, real_aux_input, fw_aux_input_weights,
+                        bw_aux_input_weights, params, scaling_factors,
+                        input_quantized, aux_input_quantized,
+                        fw_hidden_state_quantized, fw_hidden_state, fw_output,
+                        bw_hidden_state_quantized, bw_hidden_state, bw_output);
     }
     default:
       context->ReportError(context, "Type not currently supported.");
diff --git a/tensorflow/lite/kernels/bidirectional_sequence_rnn_test.cc b/tensorflow/lite/kernels/bidirectional_sequence_rnn_test.cc
index 5bad8e02c29608fa058d0d1104acbf09626f1b66..9b61f8238b558042e7a957d09dac162d8ea6450b 100644
--- a/tensorflow/lite/kernels/bidirectional_sequence_rnn_test.cc
+++ b/tensorflow/lite/kernels/bidirectional_sequence_rnn_test.cc
@@ -654,8 +654,8 @@ const std::initializer_list<float> recurrent_weights = {
 class BidirectionalRNNOpModel : public SingleOpModel {
  public:
   BidirectionalRNNOpModel(int batches, int sequence_len, int fw_units,
-                          int bw_units, int input_size, bool time_major,
-                          bool merge_outputs)
+                          int bw_units, int input_size, bool use_aux_input,
+                          bool time_major, bool merge_outputs)
       : batches_(batches),
         sequence_len_(sequence_len),
         fw_units_(fw_units),
@@ -671,7 +671,13 @@ class BidirectionalRNNOpModel : public SingleOpModel {
     bw_bias_ = AddInput(TensorType_FLOAT32);
     bw_hidden_state_ = AddInput(TensorType_FLOAT32, true);
 
-    aux_input_ = AddNullInput();
+    int aux_input_size = 0;
+    if (use_aux_input) {
+      aux_input_ = AddInput(TensorType_FLOAT32);
+      aux_input_size = input_size_;
+    } else {
+      aux_input_ = AddNullInput();
+    }
     aux_fw_weights_ = AddNullInput();
     aux_bw_weights_ = AddNullInput();
 
@@ -691,18 +697,18 @@ class BidirectionalRNNOpModel : public SingleOpModel {
                      : std::vector<int>({batches_, sequence_len_, input_size_});
 
     BuildInterpreter({
-        input_shape,                   // input
-        {fw_units_, input_size_},      // fw_weights
-        {fw_units_, fw_units_},        // fw_recurrent_weights
-        {fw_units_},                   // fw_bias
-        {batches_, fw_units_},         // fw_hidden_state
-        {bw_units_, input_size_},      // bw_weights
-        {bw_units_, bw_units_},        // bw_recurrent_weights
-        {bw_units_},                   // bw_bias
-        {batches_, bw_units_},         // bw_hidden_state
-        {batches_, sequence_len_, 0},  // aux_input
-        {fw_units_, 0},                // aux_fw_weights
-        {bw_units_, 0},                // aux_bw_weights
+        input_shape,                                // input
+        {fw_units_, input_size_},                   // fw_weights
+        {fw_units_, fw_units_},                     // fw_recurrent_weights
+        {fw_units_},                                // fw_bias
+        {batches_, fw_units_},                      // fw_hidden_state
+        {bw_units_, input_size_},                   // bw_weights
+        {bw_units_, bw_units_},                     // bw_recurrent_weights
+        {bw_units_},                                // bw_bias
+        {batches_, bw_units_},                      // bw_hidden_state
+        {batches_, sequence_len_, aux_input_size},  // aux_input
+        {fw_units_, 0},                             // aux_fw_weights
+        {bw_units_, 0},                             // aux_bw_weights
     });
   }
 
@@ -738,6 +744,10 @@ class BidirectionalRNNOpModel : public SingleOpModel {
     PopulateTensor(input_, offset, begin, end);
   }
 
+  void SetAuxInput(int offset, float* begin, float* end) {
+    PopulateTensor(aux_input_, offset, begin, end);
+  }
+
   std::vector<float> GetFwOutput() { return ExtractVector<float>(fw_output_); }
   std::vector<float> GetBwOutput() { return ExtractVector<float>(bw_output_); }
 
@@ -775,7 +785,8 @@ class BidirectionalRNNOpModel : public SingleOpModel {
 TEST(BidirectionalRNNOpTest, BlackBoxTest) {
   BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
                               /*fw_units=*/16, /*bw_units=*/16,
-                              /*input_size=*/8, /*time_major=*/false,
+                              /*input_size=*/8, /*use_aux_input=*/false,
+                              /*time_major=*/false,
                               /*merge_outputs=*/false);
   rnn.SetFwWeights(weights);
   rnn.SetBwWeights(weights);
@@ -813,7 +824,8 @@ TEST(BidirectionalRNNOpTest, BlackBoxTest) {
 TEST(BidirectionalRNNOpTest, BlackBoxTestTimeMajor) {
   BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
                               /*fw_units=*/16, /*bw_units=*/16,
-                              /*input_size=*/8, /*time_major=*/true,
+                              /*input_size=*/8, /*use_aux_input=*/false,
+                              /*time_major=*/true,
                               /*merge_outputs=*/false);
   rnn.SetFwWeights(weights);
   rnn.SetBwWeights(weights);
@@ -822,7 +834,6 @@ TEST(BidirectionalRNNOpTest, BlackBoxTestTimeMajor) {
   rnn.SetFwRecurrentWeights(recurrent_weights);
   rnn.SetBwRecurrentWeights(recurrent_weights);
 
-  // const int input_sequence_size = rnn.input_size() * rnn.sequence_len();
   // Insert the inputs in time_major format. The batch_major format is:
   // [b0t0, b0t1, ..., b0t15, b1t0, b1t1, ..., b1t15]. This is reshuffled as:
   // [b0t0, b1t0, b0t1, b1t1, ..., b0t15, b1t15].
@@ -850,7 +861,8 @@ TEST(BidirectionalRNNOpTest, BlackBoxTestTimeMajor) {
 TEST(BidirectionalRNNOpTest, BlackBoxTestMergeOutputs) {
   BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
                               /*fw_units=*/16, /*bw_units=*/16,
-                              /*input_size=*/8, /*time_major=*/false,
+                              /*input_size=*/8, /*use_aux_input=*/false,
+                              /*time_major=*/false,
                               /*merge_outputs=*/true);
   rnn.SetFwWeights(weights);
   rnn.SetBwWeights(weights);
@@ -888,7 +900,8 @@ TEST(BidirectionalRNNOpTest, BlackBoxTestMergeOutputs) {
 TEST(BidirectionalRNNOpTest, BlackBoxTestTimeMajorMergeOutputs) {
   BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
                               /*fw_units=*/16, /*bw_units=*/16,
-                              /*input_size=*/8, /*time_major=*/true,
+                              /*input_size=*/8, /*use_aux_input=*/false,
+                              /*time_major=*/true,
                               /*merge_outputs=*/true);
   rnn.SetFwWeights(weights);
   rnn.SetBwWeights(weights);
@@ -932,7 +945,8 @@ TEST(BidirectionalRNNOpTest, BlackBoxTestTimeMajorMergeOutputs) {
 TEST(BidirectionalRNNOpTest, BlackBoxTestReverseInputs) {
   BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
                               /*fw_units=*/16, /*bw_units=*/16,
-                              /*input_size=*/8, /*time_major=*/false,
+                              /*input_size=*/8, /*use_aux_input=*/false,
+                              /*time_major=*/false,
                               /*merge_outputs=*/false);
   rnn.SetFwWeights(weights);
   rnn.SetBwWeights(weights);
@@ -979,7 +993,8 @@ TEST(BidirectionalRNNOpTest, BlackBoxTestReverseInputs) {
 TEST(BidirectionalRNNOpTest, EndToEndTest) {
   BidirectionalRNNOpModel rnn(/*batches=*/1, /*sequence_len=*/4,
                               /*fw_units=*/16, /*bw_units=*/16,
-                              /*input_size=*/8, /*time_major=*/false,
+                              /*input_size=*/8, /*use_aux_input=*/false,
+                              /*time_major=*/false,
                               /*merge_outputs=*/false);
   const int output_size = 4;
   float dnn_weights[] = {
@@ -1046,6 +1061,137 @@ TEST(BidirectionalRNNOpTest, EndToEndTest) {
   }
 }
 
+// Same as BlackBox test, but has aux input.
+TEST(BidirectionalRNNOpTest, BlackBoxTestAuxInput) {
+  BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
+                              /*fw_units=*/16, /*bw_units=*/16,
+                              /*input_size=*/8, /*use_aux_input=*/true,
+                              /*time_major=*/true,
+                              /*merge_outputs=*/false);
+  rnn.SetFwWeights(weights);
+  rnn.SetBwWeights(weights);
+  rnn.SetFwBias(biases);
+  rnn.SetBwBias(biases);
+  rnn.SetFwRecurrentWeights(recurrent_weights);
+  rnn.SetBwRecurrentWeights(recurrent_weights);
+
+  // Insert the inputs in time_major format. The batch_major format is:
+  // [b0t0, b0t1, ..., b0t15, b1t0, b1t1, ..., b1t15]. This is reshuffled as:
+  // [b0t0, b1t0, b0t1, b1t1, ..., b0t15, b1t15].
+  for (int i = 0; i < rnn.sequence_len(); i++) {
+    float* batch_start = rnn_input + i * rnn.input_size();
+    float* batch_end = batch_start + rnn.input_size();
+    // The two batches are identical.
+    // Also make aux input the same as input.
+    rnn.SetInput(2 * i * rnn.input_size(), batch_start, batch_end);
+    rnn.SetAuxInput(2 * i * rnn.input_size(), batch_start, batch_end);
+    rnn.SetInput((2 * i + 1) * rnn.input_size(), batch_start, batch_end);
+    rnn.SetAuxInput((2 * i + 1) * rnn.input_size(), batch_start, batch_end);
+  }
+
+  rnn.Invoke();
+
+  std::vector<float> fw_expected;
+  for (int i = 0; i < rnn.sequence_len(); i++) {
+    float* golden_fw_start = rnn_golden_fw_output + i * rnn.num_fw_units();
+    float* golden_fw_end = golden_fw_start + rnn.num_fw_units();
+    fw_expected.insert(fw_expected.end(), golden_fw_start, golden_fw_end);
+    fw_expected.insert(fw_expected.end(), golden_fw_start, golden_fw_end);
+  }
+  EXPECT_THAT(rnn.GetFwOutput(), ElementsAreArray(ArrayFloatNear(fw_expected)));
+}
+
+// Same as previous test, but has aux input is all zeros.
+TEST(BidirectionalRNNOpTest, BlackBoxTestAuxInputZeros) {
+  BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
+                              /*fw_units=*/16, /*bw_units=*/16,
+                              /*input_size=*/8, /*use_aux_input=*/true,
+                              /*time_major=*/true,
+                              /*merge_outputs=*/false);
+  rnn.SetFwWeights(weights);
+  rnn.SetBwWeights(weights);
+  rnn.SetFwBias(biases);
+  rnn.SetBwBias(biases);
+  rnn.SetFwRecurrentWeights(recurrent_weights);
+  rnn.SetBwRecurrentWeights(recurrent_weights);
+
+  // Initialize bw inputs with zeros.
+  std::vector<float> bw_inputs(rnn.sequence_len(), 0);
+
+  // Insert the inputs in time_major format. The batch_major format is:
+  // [b0t0, b0t1, ..., b0t15, b1t0, b1t1, ..., b1t15]. This is reshuffled as:
+  // [b0t0, b1t0, b0t1, b1t1, ..., b0t15, b1t15].
+  for (int i = 0; i < rnn.sequence_len(); i++) {
+    float* batch_start = rnn_input + i * rnn.input_size();
+    float* batch_end = batch_start + rnn.input_size();
+    // The two batches are identical.
+    // Also make aux input the same as input.
+    rnn.SetInput(2 * i * rnn.input_size(), batch_start, batch_end);
+    rnn.SetAuxInput(2 * i * rnn.input_size(), &bw_inputs[0],
+                    &bw_inputs[bw_inputs.size() - 1]);
+    rnn.SetInput((2 * i + 1) * rnn.input_size(), batch_start, batch_end);
+    rnn.SetAuxInput((2 * i + 1) * rnn.input_size(), &bw_inputs[0],
+                    &bw_inputs[bw_inputs.size() - 1]);
+  }
+
+  rnn.Invoke();
+
+  std::vector<float> fw_expected;
+  for (int i = 0; i < rnn.sequence_len(); i++) {
+    float* golden_fw_start = rnn_golden_fw_output + i * rnn.num_fw_units();
+    float* golden_fw_end = golden_fw_start + rnn.num_fw_units();
+    fw_expected.insert(fw_expected.end(), golden_fw_start, golden_fw_end);
+    fw_expected.insert(fw_expected.end(), golden_fw_start, golden_fw_end);
+  }
+  EXPECT_THAT(rnn.GetFwOutput(), ElementsAreArray(ArrayFloatNear(fw_expected)));
+}
+
+// Same as previous test, but has input is all zeros, and aux input is the real
+// input. This is testing the bw path is functional.
+TEST(BidirectionalRNNOpTest, BlackBoxTestAuxInputInputZeros) {
+  BidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
+                              /*fw_units=*/16, /*bw_units=*/16,
+                              /*input_size=*/8, /*use_aux_input=*/true,
+                              /*time_major=*/true,
+                              /*merge_outputs=*/false);
+  rnn.SetFwWeights(weights);
+  rnn.SetBwWeights(weights);
+  rnn.SetFwBias(biases);
+  rnn.SetBwBias(biases);
+  rnn.SetFwRecurrentWeights(recurrent_weights);
+  rnn.SetBwRecurrentWeights(recurrent_weights);
+
+  // Initialize bw inputs with zeros.
+  std::vector<float> fw_inputs(rnn.sequence_len(), 0);
+
+  // Insert the inputs in time_major format. The batch_major format is:
+  // [b0t0, b0t1, ..., b0t15, b1t0, b1t1, ..., b1t15]. This is reshuffled as:
+  // [b0t0, b1t0, b0t1, b1t1, ..., b0t15, b1t15].
+  for (int i = 0; i < rnn.sequence_len(); i++) {
+    float* batch_start = rnn_input + i * rnn.input_size();
+    float* batch_end = batch_start + rnn.input_size();
+    // The two batches are identical.
+    // Also make aux input the same as input.
+    rnn.SetAuxInput(2 * i * rnn.input_size(), batch_start, batch_end);
+    rnn.SetInput(2 * i * rnn.input_size(), &fw_inputs[0],
+                 &fw_inputs[fw_inputs.size() - 1]);
+    rnn.SetAuxInput((2 * i + 1) * rnn.input_size(), batch_start, batch_end);
+    rnn.SetInput((2 * i + 1) * rnn.input_size(), &fw_inputs[0],
+                 &fw_inputs[fw_inputs.size() - 1]);
+  }
+
+  rnn.Invoke();
+
+  std::vector<float> bw_expected;
+  for (int i = 0; i < rnn.sequence_len(); i++) {
+    float* golden_bw_start = rnn_golden_bw_output + i * rnn.num_fw_units();
+    float* golden_bw_end = golden_bw_start + rnn.num_fw_units();
+    bw_expected.insert(bw_expected.end(), golden_bw_start, golden_bw_end);
+    bw_expected.insert(bw_expected.end(), golden_bw_start, golden_bw_end);
+  }
+  EXPECT_THAT(rnn.GetBwOutput(), ElementsAreArray(ArrayFloatNear(bw_expected)));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/relu1.cc b/tensorflow/lite/kernels/ceil.cc
similarity index 55%
rename from tensorflow/lite/kernels/relu1.cc
rename to tensorflow/lite/kernels/ceil.cc
index 5a55631405b6b32a602cfe21ba863d0dc92213ea..6bb763255b136f1d5103dd2e72ce6aebf38f06d3 100644
--- a/tensorflow/lite/kernels/relu1.cc
+++ b/tensorflow/lite/kernels/ceil.cc
@@ -12,48 +12,48 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/lite/context.h"
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/lite/kernels/internal/tensor.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
 
 namespace tflite {
 namespace ops {
-namespace custom {
-namespace relu1 {
+namespace builtin {
+namespace ceil {
+
+constexpr int kInputTensor = 0;
+constexpr int kOutputTensor = 0;
 
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
-  const TfLiteTensor* input = GetInput(context, node, 0);
   TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
-  TfLiteTensor* output = GetOutput(context, node, 0);
   output->type = input->type;
-  return context->ResizeTensor(context, output,
-                               TfLiteIntArrayCopy(input->dims));
+  TfLiteIntArray* output_size = TfLiteIntArrayCopy(input->dims);
+  return context->ResizeTensor(context, output, output_size);
 }
 
-// This is derived from lite/kernels/activations.cc.
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  const TfLiteTensor* input = GetInput(context, node, 0);
-  TfLiteTensor* output = GetOutput(context, node, 0);
-  const int elements = NumElements(input);
-  const float* in = input->data.f;
-  const float* in_end = in + elements;
-  float* out = output->data.f;
-  for (; in < in_end; ++in, ++out) {
-    *out = std::min(std::max(0.f, *in), 1.f);
-  }
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  optimized_ops::Ceil(GetTensorShape(input), GetTensorData<float>(input),
+                      GetTensorShape(output), GetTensorData<float>(output));
+
   return kTfLiteOk;
 }
+}  // namespace ceil
 
-}  // namespace relu1
-
-TfLiteRegistration* Register_RELU_1() {
-  static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
-                                 relu1::Prepare, relu1::Eval};
+TfLiteRegistration* Register_CEIL() {
+  static TfLiteRegistration r = {/*init=*/nullptr,
+                                 /*free=*/nullptr, ceil::Prepare, ceil::Eval};
   return &r;
 }
 
-}  // namespace custom
+}  // namespace builtin
 }  // namespace ops
 }  // namespace tflite
diff --git a/tensorflow/lite/kernels/ceil_test.cc b/tensorflow/lite/kernels/ceil_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e120105082751a732bb8812944c318ad9e5ecff5
--- /dev/null
+++ b/tensorflow/lite/kernels/ceil_test.cc
@@ -0,0 +1,83 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class CeilOpModel : public SingleOpModel {
+ public:
+  CeilOpModel(std::initializer_list<int> input_shape, TensorType input_type) {
+    input_ = AddInput(TensorType_FLOAT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+    SetBuiltinOp(BuiltinOperator_CEIL, BuiltinOptions_NONE, 0);
+    BuildInterpreter({
+        input_shape,
+    });
+  }
+
+  int input() { return input_; }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  int input_;
+  int output_;
+};
+
+TEST(CeilOpTest, SingleDim) {
+  CeilOpModel model({2}, TensorType_FLOAT32);
+  model.PopulateTensor<float>(model.input(), {8.5, 0.0});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({9, 0}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2}));
+}
+
+TEST(CeilOpTest, MultiDims) {
+  CeilOpModel model({2, 1, 1, 5}, TensorType_FLOAT32);
+  model.PopulateTensor<float>(model.input(), {
+                                                 0.0001,
+                                                 8.0001,
+                                                 0.9999,
+                                                 9.9999,
+                                                 0.5,
+                                                 -0.0001,
+                                                 -8.0001,
+                                                 -0.9999,
+                                                 -9.9999,
+                                                 -0.5,
+                                             });
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({1, 9, 1, 10, 1, 0, -8, 0, -9, 0}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2, 1, 1, 5}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/kernels/comparisons.cc b/tensorflow/lite/kernels/comparisons.cc
index a914449ae552e37249f2cecb5c88f3b49e83f133..d4924192551ec0399863433bc021c21f81207bd1 100644
--- a/tensorflow/lite/kernels/comparisons.cc
+++ b/tensorflow/lite/kernels/comparisons.cc
@@ -59,11 +59,12 @@ TfLiteStatus ComparisonPrepare(TfLiteContext* context, TfLiteNode* node) {
 
 // TODO(ruic): optimize macros below to using template functions.
 #define TF_LITE_QUANTIZE_COMPARISON(opname)                                    \
+  template <typename input_dtype>                                              \
   void EvalQuantized##opname(TfLiteContext* context, TfLiteNode* node,         \
                              const TfLiteTensor* input1,                       \
                              const TfLiteTensor* input2, TfLiteTensor* output, \
                              bool requires_broadcast) {                        \
-    if (input1->type == kTfLiteUInt8) {                                        \
+    if (input1->type == kTfLiteUInt8 || input1->type == kTfLiteInt8) {         \
       auto input1_offset = -input1->params.zero_point;                         \
       auto input2_offset = -input2->params.zero_point;                         \
       const int left_shift = 8;                                                \
@@ -87,14 +88,16 @@ TfLiteStatus ComparisonPrepare(TfLiteContext* context, TfLiteNode* node) {
       op_params.input2_shift = input2_shift;                                   \
       if (requires_broadcast) {                                                \
         reference_ops::Broadcast4DSlow##opname##WithScaling(                   \
-            op_params, GetTensorShape(input1), GetTensorData<uint8_t>(input1), \
-            GetTensorShape(input2), GetTensorData<uint8_t>(input2),            \
-            GetTensorShape(output), GetTensorData<bool>(output));              \
+            op_params, GetTensorShape(input1),                                 \
+            GetTensorData<input_dtype>(input1), GetTensorShape(input2),        \
+            GetTensorData<input_dtype>(input2), GetTensorShape(output),        \
+            GetTensorData<bool>(output));                                      \
       } else {                                                                 \
         reference_ops::opname##WithScaling(                                    \
-            op_params, GetTensorShape(input1), GetTensorData<uint8_t>(input1), \
-            GetTensorShape(input2), GetTensorData<uint8_t>(input2),            \
-            GetTensorShape(output), GetTensorData<bool>(output));              \
+            op_params, GetTensorShape(input1),                                 \
+            GetTensorData<input_dtype>(input1), GetTensorShape(input2),        \
+            GetTensorData<input_dtype>(input2), GetTensorShape(output),        \
+            GetTensorData<bool>(output));                                      \
       }                                                                        \
     }                                                                          \
   }
@@ -136,8 +139,12 @@ TfLiteStatus EqualEval(TfLiteContext* context, TfLiteNode* node) {
       TF_LITE_COMPARISON(int64_t, Equal, requires_broadcast);
       break;
     case kTfLiteUInt8:
-      EvalQuantizedEqual(context, node, input1, input2, output,
-                         requires_broadcast);
+      EvalQuantizedEqual<uint8_t>(context, node, input1, input2, output,
+                                  requires_broadcast);
+      break;
+    case kTfLiteInt8:
+      EvalQuantizedEqual<int8_t>(context, node, input1, input2, output,
+                                 requires_broadcast);
       break;
     default:
       context->ReportError(context,
@@ -165,8 +172,12 @@ TfLiteStatus NotEqualEval(TfLiteContext* context, TfLiteNode* node) {
       TF_LITE_COMPARISON(int64_t, NotEqual, requires_broadcast);
       break;
     case kTfLiteUInt8:
-      EvalQuantizedNotEqual(context, node, input1, input2, output,
-                            requires_broadcast);
+      EvalQuantizedNotEqual<uint8_t>(context, node, input1, input2, output,
+                                     requires_broadcast);
+      break;
+    case kTfLiteInt8:
+      EvalQuantizedNotEqual<int8_t>(context, node, input1, input2, output,
+                                    requires_broadcast);
       break;
     default:
       context->ReportError(context,
@@ -193,8 +204,12 @@ TfLiteStatus GreaterEval(TfLiteContext* context, TfLiteNode* node) {
       TF_LITE_COMPARISON(int64_t, Greater, requires_broadcast);
       break;
     case kTfLiteUInt8:
-      EvalQuantizedGreater(context, node, input1, input2, output,
-                           requires_broadcast);
+      EvalQuantizedGreater<uint8_t>(context, node, input1, input2, output,
+                                    requires_broadcast);
+      break;
+    case kTfLiteInt8:
+      EvalQuantizedGreater<int8_t>(context, node, input1, input2, output,
+                                   requires_broadcast);
       break;
     default:
       context->ReportError(context,
@@ -221,8 +236,12 @@ TfLiteStatus GreaterEqualEval(TfLiteContext* context, TfLiteNode* node) {
       TF_LITE_COMPARISON(int64_t, GreaterEqual, requires_broadcast);
       break;
     case kTfLiteUInt8:
-      EvalQuantizedGreaterEqual(context, node, input1, input2, output,
-                                requires_broadcast);
+      EvalQuantizedGreaterEqual<uint8_t>(context, node, input1, input2, output,
+                                         requires_broadcast);
+      break;
+    case kTfLiteInt8:
+      EvalQuantizedGreaterEqual<int8_t>(context, node, input1, input2, output,
+                                        requires_broadcast);
       break;
     default:
       context->ReportError(context,
@@ -249,8 +268,12 @@ TfLiteStatus LessEval(TfLiteContext* context, TfLiteNode* node) {
       TF_LITE_COMPARISON(int64_t, Less, requires_broadcast);
       break;
     case kTfLiteUInt8:
-      EvalQuantizedLess(context, node, input1, input2, output,
-                        requires_broadcast);
+      EvalQuantizedLess<uint8_t>(context, node, input1, input2, output,
+                                 requires_broadcast);
+      break;
+    case kTfLiteInt8:
+      EvalQuantizedLess<int8_t>(context, node, input1, input2, output,
+                                requires_broadcast);
       break;
     default:
       context->ReportError(context,
@@ -277,8 +300,12 @@ TfLiteStatus LessEqualEval(TfLiteContext* context, TfLiteNode* node) {
       TF_LITE_COMPARISON(int64_t, LessEqual, requires_broadcast);
       break;
     case kTfLiteUInt8:
-      EvalQuantizedLessEqual(context, node, input1, input2, output,
-                             requires_broadcast);
+      EvalQuantizedLessEqual<uint8_t>(context, node, input1, input2, output,
+                                      requires_broadcast);
+      break;
+    case kTfLiteInt8:
+      EvalQuantizedLessEqual<int8_t>(context, node, input1, input2, output,
+                                     requires_broadcast);
       break;
     default:
       context->ReportError(context,
diff --git a/tensorflow/lite/kernels/comparisons_test.cc b/tensorflow/lite/kernels/comparisons_test.cc
index ab10c959a4d6b234cb6ae0810174e8f1c48898d1..6ec1f09a6c000e7e8fb7f27f4bbdfc174fc99ed5 100644
--- a/tensorflow/lite/kernels/comparisons_test.cc
+++ b/tensorflow/lite/kernels/comparisons_test.cc
@@ -363,7 +363,7 @@ TEST(ComparisonsTest, LessEqualBroadcastTwoD) {
   EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 2, 4));
 }
 
-TEST(QuantizedComparisonsTest, EqualQuantized) {
+TEST(QuantizedComparisonsTest, EqualUInt8Quantized) {
   const float kMin = -1.f;
   const float kMax = 128.f;
   ComparisonOpModel model({TensorType_UINT8, {1, 2, 2, 1}, kMin, kMax},
@@ -376,7 +376,20 @@ TEST(QuantizedComparisonsTest, EqualQuantized) {
   EXPECT_THAT(model.GetOutput(), ElementsAre(true, false, true, false));
 }
 
-TEST(QuantizedComparisonsTest, NotEqualQuantized) {
+TEST(QuantizedComparisonsTest, EqualInt8Quantized) {
+  const float kMin = -127.f;
+  const float kMax = 127.f;
+  ComparisonOpModel model({TensorType_INT8, {1, 2, 2, 1}, kMin, kMax},
+                          {TensorType_INT8, {1, 2, 2, 1}, kMin, kMax},
+                          TensorType_INT8, BuiltinOperator_EQUAL);
+  model.QuantizeAndPopulate<int8_t>(model.input1(), {1, -9, 7, 3});
+  model.QuantizeAndPopulate<int8_t>(model.input2(), {-1, 2, 7, 5});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAre(false, false, true, false));
+}
+
+TEST(QuantizedComparisonsTest, NotEqualUInt8Quantized) {
   const float kMin = -1.f;
   const float kMax = 128.f;
   ComparisonOpModel model({TensorType_UINT8, {1, 2, 2, 1}, kMin, kMax},
@@ -389,6 +402,19 @@ TEST(QuantizedComparisonsTest, NotEqualQuantized) {
   EXPECT_THAT(model.GetOutput(), ElementsAre(false, true, false, true));
 }
 
+TEST(QuantizedComparisonsTest, NotEqualInt8Quantized) {
+  const float kMin = -127.f;
+  const float kMax = 127.f;
+  ComparisonOpModel model({TensorType_INT8, {1, 2, 2, 1}, kMin, kMax},
+                          {TensorType_INT8, {1, 2, 2, 1}, kMin, kMax},
+                          TensorType_INT8, BuiltinOperator_NOT_EQUAL);
+  model.QuantizeAndPopulate<int8_t>(model.input1(), {1, -9, 7, 3});
+  model.QuantizeAndPopulate<int8_t>(model.input2(), {1, 2, 7, 5});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAre(false, true, false, true));
+}
+
 TEST(ComparisonsTest, GreaterQuantized) {
   const float kMin = -1.f;
   const float kMax = 128.f;
@@ -470,7 +496,7 @@ TEST(ComparisonsTest, QuantizedEqualWithBroadcast) {
   }
 }
 
-TEST(ComparisonsTest, QuantizedNotEqualWithBroadcast) {
+TEST(ComparisonsTest, QuantizedUInt8NotEqualWithBroadcast) {
   const float kMin = -1.f;
   const float kMax = 128.f;
   std::vector<std::vector<int>> test_shapes = {
@@ -488,7 +514,25 @@ TEST(ComparisonsTest, QuantizedNotEqualWithBroadcast) {
   }
 }
 
-TEST(ComparisonsTest, QuantizedGreaterWithBroadcast) {
+TEST(ComparisonsTest, QuantizedInt8NotEqualWithBroadcast) {
+  const float kMin = -127.f;
+  const float kMax = 127.f;
+  std::vector<std::vector<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    ComparisonOpModel model({TensorType_INT8, test_shapes[i], kMin, kMax},
+                            {TensorType_INT8, {}, kMin, kMax}, TensorType_INT8,
+                            BuiltinOperator_NOT_EQUAL);
+    model.QuantizeAndPopulate<int8_t>(model.input1(), {-20, 2, 7, -8, 11, 20});
+    model.QuantizeAndPopulate<int8_t>(model.input2(), {2});
+    model.Invoke();
+    EXPECT_THAT(model.GetOutput(),
+                ElementsAre(true, false, true, true, true, true))
+        << "With shape number " << i;
+  }
+}
+
+TEST(ComparisonsTest, QuantizedUInt8GreaterWithBroadcast) {
   const float kMin = -1.f;
   const float kMax = 128.f;
   std::vector<std::vector<int>> test_shapes = {
@@ -506,7 +550,25 @@ TEST(ComparisonsTest, QuantizedGreaterWithBroadcast) {
   }
 }
 
-TEST(ComparisonsTest, QuantizedGreaterEqualWithBroadcast) {
+TEST(ComparisonsTest, QuantizedInt8GreaterWithBroadcast) {
+  const float kMin = -127.f;
+  const float kMax = 127.f;
+  std::vector<std::vector<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    ComparisonOpModel model({TensorType_INT8, test_shapes[i], kMin, kMax},
+                            {TensorType_INT8, {}, kMin, kMax}, TensorType_INT8,
+                            BuiltinOperator_GREATER);
+    model.QuantizeAndPopulate<int8_t>(model.input1(), {20, -2, -71, 8, 11, 20});
+    model.QuantizeAndPopulate<int8_t>(model.input2(), {8});
+    model.Invoke();
+    EXPECT_THAT(model.GetOutput(),
+                ElementsAre(true, false, false, false, true, true))
+        << "With shape number " << i;
+  }
+}
+
+TEST(ComparisonsTest, QuantizedUInt8GreaterEqualWithBroadcast) {
   const float kMin = -1.f;
   const float kMax = 128.f;
   std::vector<std::vector<int>> test_shapes = {
@@ -524,7 +586,25 @@ TEST(ComparisonsTest, QuantizedGreaterEqualWithBroadcast) {
   }
 }
 
-TEST(ComparisonsTest, QuantizedLessWithBroadcast) {
+TEST(ComparisonsTest, QuantizedInt8GreaterEqualWithBroadcast) {
+  const float kMin = -127.f;
+  const float kMax = 127.f;
+  std::vector<std::vector<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    ComparisonOpModel model({TensorType_INT8, test_shapes[i], kMin, kMax},
+                            {TensorType_INT8, {}, kMin, kMax}, TensorType_INT8,
+                            BuiltinOperator_GREATER_EQUAL);
+    model.QuantizeAndPopulate<int8_t>(model.input1(), {20, -2, -71, 8, 11, 20});
+    model.QuantizeAndPopulate<int8_t>(model.input2(), {8});
+    model.Invoke();
+    EXPECT_THAT(model.GetOutput(),
+                ElementsAre(true, false, false, true, true, true))
+        << "With shape number " << i;
+  }
+}
+
+TEST(ComparisonsTest, QuantizedUInt8LessWithBroadcast) {
   const float kMin = -1.f;
   const float kMax = 128.f;
   std::vector<std::vector<int>> test_shapes = {
@@ -542,7 +622,25 @@ TEST(ComparisonsTest, QuantizedLessWithBroadcast) {
   }
 }
 
-TEST(ComparisonsTest, QuantizedLessEqualWithBroadcast) {
+TEST(ComparisonsTest, QuantizedInt8LessWithBroadcast) {
+  const float kMin = -127.f;
+  const float kMax = 127.f;
+  std::vector<std::vector<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    ComparisonOpModel model({TensorType_INT8, test_shapes[i], kMin, kMax},
+                            {TensorType_INT8, {}, kMin, kMax}, TensorType_INT8,
+                            BuiltinOperator_LESS);
+    model.QuantizeAndPopulate<int8_t>(model.input1(), {20, -2, -71, 8, 11, 20});
+    model.QuantizeAndPopulate<int8_t>(model.input2(), {8});
+    model.Invoke();
+    EXPECT_THAT(model.GetOutput(),
+                ElementsAre(false, true, true, false, false, false))
+        << "With shape number " << i;
+  }
+}
+
+TEST(ComparisonsTest, QuantizedUInt8LessEqualWithBroadcast) {
   const float kMin = -1.f;
   const float kMax = 128.f;
   std::vector<std::vector<int>> test_shapes = {
@@ -560,6 +658,24 @@ TEST(ComparisonsTest, QuantizedLessEqualWithBroadcast) {
   }
 }
 
+TEST(ComparisonsTest, QuantizedInt8LessEqualWithBroadcast) {
+  const float kMin = -127.f;
+  const float kMax = 127.f;
+  std::vector<std::vector<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    ComparisonOpModel model({TensorType_INT8, test_shapes[i], kMin, kMax},
+                            {TensorType_INT8, {}, kMin, kMax}, TensorType_INT8,
+                            BuiltinOperator_LESS_EQUAL);
+    model.QuantizeAndPopulate<int8_t>(model.input1(), {20, -2, -71, 8, 11, 20});
+    model.QuantizeAndPopulate<int8_t>(model.input2(), {8});
+    model.Invoke();
+    EXPECT_THAT(model.GetOutput(),
+                ElementsAre(false, true, true, true, false, false))
+        << "With shape number " << i;
+  }
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/conv.cc b/tensorflow/lite/kernels/conv.cc
index 1fd870be93eda12d1c057e29b017d80e2a96412b..75e75fae6b09e6f3653719e3cf69c8cc1b4956f6 100644
--- a/tensorflow/lite/kernels/conv.cc
+++ b/tensorflow/lite/kernels/conv.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/lite/kernels/internal/optimized/multithreaded_conv.h"
 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/conv.h"
 #include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
 #include "tensorflow/lite/kernels/internal/tensor.h"
 #include "tensorflow/lite/kernels/internal/tensor_utils.h"
@@ -68,6 +69,11 @@ struct OpData {
   // be represented as a fixed point multiplier plus a left shift.
   int32_t output_multiplier;
   int output_shift;
+
+  // Per channel output multiplier and shift.
+  std::vector<int32_t> per_channel_output_multiplier;
+  std::vector<int> per_channel_output_shift;
+
   // The range of the fused activation layer. For example for kNone and
   // uint8_t these would be 0 and 255.
   int32_t output_activation_min;
@@ -133,7 +139,8 @@ void TransposeFloatTensor(TfLiteTensor* input, TfLiteTensor* output) {
 // Note: `context->AddTensors` might invalidate pointers to existing tensors.
 // Therefore the logic to add tensors are isolated into this function.
 static TfLiteStatus AllocateTemporaryTensorsIfRequired(TfLiteContext* context,
-                                                       TfLiteNode* node) {
+                                                       TfLiteNode* node,
+                                                       bool is_hybrid) {
   auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
   OpData* data = reinterpret_cast<OpData*>(node->user_data);
 
@@ -141,9 +148,6 @@ static TfLiteStatus AllocateTemporaryTensorsIfRequired(TfLiteContext* context,
   TfLiteTensor* input = &context->tensors[node->inputs->data[0]];
   TfLiteTensor* filter = &context->tensors[node->inputs->data[1]];
 
-  const bool is_hybrid =
-      (input->type == kTfLiteFloat32 && filter->type == kTfLiteUInt8);
-
   int filter_width = filter->dims->data[2];
   int filter_height = filter->dims->data[1];
 
@@ -228,8 +232,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
   // Check types. (We assume that UINT8 refers to quantized tensors)
   TfLiteType input_type = input->type;
-  TF_LITE_ENSURE(context,
-                 input_type == kTfLiteFloat32 || input_type == kTfLiteUInt8);
+  TF_LITE_ENSURE(context, input_type == kTfLiteFloat32 ||
+                              input_type == kTfLiteUInt8 ||
+                              input_type == kTfLiteInt8);
   TF_LITE_ENSURE_EQ(context, output->type, input_type);
 
   TfLiteTensor* bias = nullptr;
@@ -240,7 +245,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
   if (has_bias) {
     bias = &context->tensors[node->inputs->data[2]];
-    if (input_type == kTfLiteUInt8) {
+    if (input_type == kTfLiteUInt8 || input_type == kTfLiteInt8) {
       TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteInt32);
       TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0);
     } else {
@@ -250,7 +255,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   }
 
   const bool is_hybrid =
-      (input->type == kTfLiteFloat32 && filter->type == kTfLiteUInt8);
+      (input->type == kTfLiteFloat32 &&
+       (filter->type == kTfLiteUInt8 || filter->type == kTfLiteInt8));
 
   data->run_multithreaded_kernel = context->recommended_num_threads != 1;
   // Hybrid kernels don't support multithreading yet.
@@ -258,7 +264,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     data->run_multithreaded_kernel = false;
   }
 
-  TF_LITE_ENSURE_STATUS(AllocateTemporaryTensorsIfRequired(context, node));
+  TF_LITE_ENSURE_STATUS(
+      AllocateTemporaryTensorsIfRequired(context, node, is_hybrid));
 
   int channels_in = filter->dims->data[3];
   int channels_out = filter->dims->data[0];
@@ -296,18 +303,25 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE(context, has_bias);
 
   // Note that full fixed-point inference requires that all tensors have their
-  // parameters set. This is usually done during quantized training.
+  // parameters set. This is usually done during quantized training or
+  // calibration.
   if (input_type != kTfLiteFloat32) {
-    double real_multiplier = 0.0;
-    TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
-        context, input, filter, bias, output, &real_multiplier));
-
-    int exponent;
-    QuantizeMultiplier(real_multiplier, &data->output_multiplier, &exponent);
-    data->output_shift = -exponent;
-    CalculateActivationRangeUint8(params->activation, output,
-                                  &data->output_activation_min,
-                                  &data->output_activation_max);
+    TF_LITE_ENSURE_EQ(context, filter->quantization.type,
+                      kTfLiteAffineQuantization);
+    const auto* affine_quantization =
+        reinterpret_cast<TfLiteAffineQuantization*>(
+            filter->quantization.params);
+    TF_LITE_ENSURE(context, affine_quantization);
+    TF_LITE_ENSURE(context, affine_quantization->scale);
+    const int number_channel = affine_quantization->scale->size;
+    data->per_channel_output_multiplier.resize(number_channel);
+    data->per_channel_output_shift.resize(number_channel);
+    TF_LITE_ENSURE_STATUS(tflite::PopulateConvolutionQuantizationParams(
+        context, input, filter, bias, output, params->activation,
+        &data->output_multiplier, &data->output_shift,
+        &data->output_activation_min, &data->output_activation_max,
+        data->per_channel_output_multiplier.data(),
+        data->per_channel_output_shift.data()));
   }
 
   TfLiteIntArray* output_size = TfLiteIntArrayCreate(4);
@@ -334,7 +348,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
         &context->tensors[node->temporaries->data[data->im2col_index]];
     im2col->type = input->type;
     if (is_hybrid) {
-      im2col->type = kTfLiteUInt8;
+      im2col->type = filter->type;
     }
     im2col->allocation_type = kTfLiteArenaRw;
     auto im2col_status = context->ResizeTensor(context, im2col, im2col_size);
@@ -372,7 +386,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
         data->input_quantized_id;
     TfLiteTensor* input_quantized =
         GetTemporary(context, node, data->input_quantized_index);
-    input_quantized->type = kTfLiteUInt8;
+    input_quantized->type = kTfLiteInt8;
     input_quantized->allocation_type = kTfLiteArenaRw;
     if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) {
       TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims);
@@ -481,6 +495,29 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
   }
 }
 
+void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
+                             TfLiteConvParams* params, OpData* data,
+                             TfLiteTensor* input, TfLiteTensor* filter,
+                             TfLiteTensor* bias, TfLiteTensor* output) {
+  ConvParams op_params;
+  op_params.input_offset = input->params.zero_point;
+  op_params.output_offset = output->params.zero_point;
+  op_params.stride_height = params->stride_height;
+  op_params.stride_width = params->stride_width;
+  op_params.dilation_height_factor = params->dilation_height_factor;
+  op_params.dilation_width_factor = params->dilation_width_factor;
+  op_params.padding_values.height = data->padding.height;
+  op_params.padding_values.width = data->padding.width;
+
+  reference_integer_ops::ConvPerChannel(
+      op_params, data->per_channel_output_multiplier.data(),
+      data->per_channel_output_shift.data(), GetTensorShape(input),
+      GetTensorData<int8>(input), GetTensorShape(filter),
+      GetTensorData<int8>(filter), GetTensorShape(bias),
+      GetTensorData<int32>(bias), GetTensorShape(output),
+      GetTensorData<int8>(output));
+}
+
 template <KernelType kernel_type>
 void EvalFloat(TfLiteContext* context, TfLiteNode* node,
                TfLiteConvParams* params, OpData* data, TfLiteTensor* input,
@@ -562,8 +599,7 @@ void EvalHybrid(TfLiteContext* context, TfLiteNode* node,
 
   const TfLiteTensor* input_quantized =
       GetTemporary(context, node, data->input_quantized_index);
-  int8_t* quantized_input_ptr_batch =
-      reinterpret_cast<int8_t*>(input_quantized->data.uint8);
+  int8_t* quantized_input_ptr_batch = input_quantized->data.int8;
   float* scaling_factors_ptr =
       GetTemporary(context, node, data->scaling_factors_index)->data.f;
 
@@ -578,10 +614,21 @@ void EvalHybrid(TfLiteContext* context, TfLiteNode* node,
   }
 
   int8_t* im2col_ptr = nullptr;
-  if (im2col != nullptr) {
-    im2col_ptr = reinterpret_cast<int8_t*>(im2col->data.uint8);
+  int8_t* filter_ptr = nullptr;
+  if (filter->type == kTfLiteUInt8) {
+    // For backward compatibility, we need to support the case where filters
+    // are quantized to int8 but stored as uint8.
+    if (im2col != nullptr) {
+      im2col_ptr = reinterpret_cast<int8_t*>(im2col->data.uint8);
+    }
+    filter_ptr = reinterpret_cast<int8_t*>(filter->data.uint8);
+  } else {
+    // Code at head uses the int8 type so we do not need to do the cast.
+    if (im2col != nullptr) {
+      im2col_ptr = im2col->data.int8;
+    }
+    filter_ptr = filter->data.int8;
   }
-  int8_t* filter_ptr = reinterpret_cast<int8_t*>(filter->data.uint8);
 
   switch (kernel_type) {
     case kReference:
@@ -640,7 +687,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   // separate ops to avoid dispatch overhead here.
   switch (input->type) {  // Already know in/outtypes are same.
     case kTfLiteFloat32:
-      if (filter->type == kTfLiteUInt8) {
+      if (filter->type == kTfLiteUInt8 || filter->type == kTfLiteInt8) {
         EvalHybrid<kernel_type>(context, node, params, data, input, filter,
                                 bias, im2col, hwcn_weights, output);
       } else if (data->run_multithreaded_kernel) {
@@ -655,6 +702,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       EvalQuantized<kernel_type>(context, node, params, data, input, filter,
                                  bias, im2col, hwcn_weights, output);
       break;
+    case kTfLiteInt8:
+      EvalQuantizedPerChannel(context, node, params, data, input, filter, bias,
+                              output);
+      break;
     default:
       context->ReportError(context, "Type %d not currently supported.",
                            input->type);
diff --git a/tensorflow/lite/kernels/conv_test.cc b/tensorflow/lite/kernels/conv_test.cc
index eebf9f9de4694352cf3bf959f0f639380a3054f7..91ad223154be64cc11a1ffcfe2c3666357a2eda6 100644
--- a/tensorflow/lite/kernels/conv_test.cc
+++ b/tensorflow/lite/kernels/conv_test.cc
@@ -58,9 +58,35 @@ class BaseConvolutionOpModel : public SingleOpModel {
       // This is a quantized version. The scale of 'bias' depends on the scales
       // of input and filter. Supposedly this is correctly set during quantized
       // training.
-      auto bias_scale = GetScale(input_) * GetScale(filter_);
-      TensorData bias{TensorType_INT32, {bias_size}, 0, 0, bias_scale};
-      bias_ = AddInput(bias);
+      if (filter.per_channel_quantization) {
+        // per channel quantization.
+        std::vector<float> bias_scale(
+            filter.per_channel_quantization_scales.size());
+        std::vector<int64_t> bias_zero_points(
+            filter.per_channel_quantization_scales.size());
+        for (int i = 0; i < filter.per_channel_quantization_scales.size();
+             ++i) {
+          bias_scale[i] =
+              input.scale * filter.per_channel_quantization_scales[i];
+          bias_zero_points[i] = 0;
+        }
+        TensorData bias{TensorType_INT32,
+                        {bias_size},
+                        /*min=*/0,
+                        /*max=*/0,
+                        /*scale=*/0,
+                        /*zero_point=*/0,
+                        true,
+                        /*per_channel_scale=*/bias_scale,
+                        /*per_channel_zero_point=*/bias_zero_points,
+                        /*channel_index==*/0};
+        bias_ = AddInput(bias);
+      } else {
+        // per tensor quantization.
+        auto bias_scale = GetScale(input_) * GetScale(filter_);
+        TensorData bias{TensorType_INT32, {bias_size}, 0, 0, bias_scale};
+        bias_ = AddInput(bias);
+      }
     }
 
     output_ = AddOutput(output);
@@ -758,6 +784,10 @@ class HybridConvolutionOpModel : public BaseConvolutionOpModel {
     SymmetricQuantizeAndPopulate(filter_, f);
   }
 
+  void SetSignedFilter(std::initializer_list<float> f) {
+    SignedSymmetricQuantizeAndPopulate(filter_, f);
+  }
+
   void SetBias(std::initializer_list<float> data) {
     PopulateTensor(bias_, data);
   }
@@ -765,7 +795,7 @@ class HybridConvolutionOpModel : public BaseConvolutionOpModel {
   std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
 };
 
-TEST_P(ConvolutionOpTest, SimpleTestHybrid) {
+TEST_P(ConvolutionOpTest, SimpleTestHybridUint8) {
   HybridConvolutionOpModel m(
       GetRegistration(), {TensorType_FLOAT32, {2, 2, 4, 1}},
       {TensorType_UINT8, {3, 2, 2, 1}}, {TensorType_FLOAT32, {}});
@@ -824,7 +854,7 @@ TEST_P(ConvolutionOpTest, SimpleTestHybrid) {
 // while keeping the filters for each channel equivalent.
 //
 // 2 * (A/2) * B = A * B, where the left side is this new test.
-TEST_P(ConvolutionOpTest, SimpleTestHybridWithChannels) {
+TEST_P(ConvolutionOpTest, SimpleTestHybridWithChannelsUint8) {
   HybridConvolutionOpModel m(
       GetRegistration(), {TensorType_FLOAT32, {2, 2, 4, 2}},
       {TensorType_UINT8, {3, 2, 2, 2}}, {TensorType_FLOAT32, {}});
@@ -856,7 +886,7 @@ TEST_P(ConvolutionOpTest, SimpleTestHybridWithChannels) {
                                  0.16)));
 }
 
-TEST_P(ConvolutionOpTest, PointwiseHybrid) {
+TEST_P(ConvolutionOpTest, PointwiseHybridUint8) {
   HybridConvolutionOpModel m(
       GetRegistration(), {TensorType_FLOAT32, {2, 2, 4, 2}},
       {TensorType_UINT8, {1, 1, 1, 2}}, {TensorType_FLOAT32, {}}, 1, 1);
@@ -898,6 +928,139 @@ TEST_P(ConvolutionOpTest, PointwiseHybrid) {
                   0.0316)));
 }
 
+TEST_P(ConvolutionOpTest, SimpleTestHybridInt8) {
+  HybridConvolutionOpModel m(
+      GetRegistration(), {TensorType_FLOAT32, {2, 2, 4, 1}},
+      {TensorType_INT8, {3, 2, 2, 1}}, {TensorType_FLOAT32, {}});
+
+  m.SetInput({
+      // First batch
+      1, 1, 1, 1,  // row = 1
+      2, 2, 2, 2,  // row = 2
+      // Second batch
+      1, 2, 3, 4,  // row = 1
+      1, 2, 3, 4,  // row = 2
+  });
+  m.SetSignedFilter({
+      1, 2, 3, 4,    // first 2x2 filter
+      -1, 1, -1, 1,  // second 2x2 filter
+      -1, -1, 1, 1,  // third 2x2 filter
+  });
+  m.SetBias({1, 2, 3});
+
+  m.Invoke();
+
+  // Example: we get 17.1577 instead of 17.
+  //
+  // Second batch:
+  // 1 2 3 4  -> 32 64 95 127 with scale factor 127/4.
+  // 1 2 3 4     32 64 95 127
+  //
+  // First filter:
+  // 1 2  -> 32 64  with scale factor of 127/4.
+  // 3 4     95 127
+  //
+  // The left half of the input gives us 16288. Multiply by (4/127)^2 for
+  // dequantization and adding 1 for the bias gives us the result. and adding
+  // the bias gives us the result.
+  //
+  // The optimized kernel converts the input into this matrix via Im2Col
+  //
+  // 1 1 2 2
+  // 1 1 2 2
+  // 1 2 1 2
+  // 3 4 3 4
+  //
+  // and multiplies it with the filter directly.
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(
+                                 {
+                                     18, 2, 5,  // first batch, left
+                                     18, 2, 5,  // first batch, right
+                                     17, 4, 3,  // second batch, left
+                                     37, 4, 3,  // second batch, right
+                                 },
+                                 0.16)));
+}
+
+// This test's output is equivalent to the SimpleTestHybrid
+// because we break each input into two channels, each with half of the value,
+// while keeping the filters for each channel equivalent.
+//
+// 2 * (A/2) * B = A * B, where the left side is this new test.
+TEST_P(ConvolutionOpTest, SimpleTestHybridWithChannelsInt8) {
+  HybridConvolutionOpModel m(
+      GetRegistration(), {TensorType_FLOAT32, {2, 2, 4, 2}},
+      {TensorType_INT8, {3, 2, 2, 2}}, {TensorType_FLOAT32, {}});
+
+  m.SetInput({
+      // First batch
+      0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5,  // row = 1
+      1, 1, 1, 1, 1, 1, 1, 1,                  // row = 2
+      // Second batch
+      0.5, 0.5, 1, 1, 1.5, 1.5, 2, 2,  // row = 1
+      0.5, 0.5, 1, 1, 1.5, 1.5, 2, 2   // row = 2
+  });
+  m.SetSignedFilter({
+      1,  1,  2,  2,  3,  3,  4, 4,  // first 2x2 filter
+      -1, -1, 1,  1,  -1, -1, 1, 1,  // second 2x2 filter
+      -1, -1, -1, -1, 1,  1,  1, 1   // third 2x2 filter
+  });
+  m.SetBias({1, 2, 3});
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(
+                                 {
+                                     18, 2, 5,  // first batch, left
+                                     18, 2, 5,  // first batch, right
+                                     17, 4, 3,  // second batch, left
+                                     37, 4, 3,  // second batch, right
+                                 },
+                                 0.16)));
+}
+
+TEST_P(ConvolutionOpTest, PointwiseHybridInt8) {
+  HybridConvolutionOpModel m(
+      GetRegistration(), {TensorType_FLOAT32, {2, 2, 4, 2}},
+      {TensorType_INT8, {1, 1, 1, 2}}, {TensorType_FLOAT32, {}}, 1, 1);
+
+  m.SetInput({
+      // First batch
+      0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5,  // row = 1
+      1, 1, 1, 1, 1, 1, 1, 1,                  // row = 2
+      // Second batch
+      0.5, 0.5, 1, 1, 1.5, 1.5, 2, 2,  // row = 1
+      0.5, 0.5, 1, 1, 1.5, 1.5, 2, 2   // row = 2
+  });
+
+  m.SetSignedFilter({
+      1, 2,  // first filter
+  });
+  m.SetBias({0});
+
+  m.Invoke();
+
+  // Example: we get 3.03156 instead of 3.
+  //
+  // Second batch:
+  // 0.5 0.5 1 1 1.5 1.5 2 2  -> 32 32 64 64 95 95 127 127 with scale factor
+  // 127/2. We care about the two 64's.
+  //
+  // Filter:
+  // 64 127 with scale factor of 127/2.
+  //
+  // (64 * 64 + 64 * 127) * (2/127)^2 gives us the expected result.
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      1.5, 1.5, 1.5, 1.5,  // first batch, row = 1
+                      3., 3., 3., 3.,      // first batch, row = 2
+                      1.5, 3., 4.5, 6.,    // second batch, row = 1
+                      1.5, 3., 4.5, 6.,    // second batch, row = 2
+                  },
+                  0.0316)));
+}
+
 // TODO(alanchiao): this passes locally, but fails on continuous build system.
 // Re-enable when root cause found.
 TEST_P(ConvolutionOpTest, DISABLED_PointwiseMultifilterHybrid) {
@@ -932,7 +1095,77 @@ TEST_P(ConvolutionOpTest, DISABLED_PointwiseMultifilterHybrid) {
                   0.0474)));
 }
 
-INSTANTIATE_TEST_CASE_P(
+class PerChannelQuantizedConvolutionOpModel : public BaseConvolutionOpModel {
+ public:
+  using BaseConvolutionOpModel::BaseConvolutionOpModel;
+
+  void SetInput(std::initializer_list<float> data) {
+    QuantizeAndPopulate<int8_t>(input_, data);
+  }
+
+  void SetFilter(std::initializer_list<float> data) {
+    PerChannelSymmetricQuantizeAndPopulate(filter_, data);
+  }
+
+  void SetBias(std::initializer_list<float> data) {
+    PerChannelQuantizeBias(bias_, data);
+  }
+
+  std::vector<int8_t> GetOutput() { return ExtractVector<int8_t>(output_); }
+  std::vector<float> GetDequantizedOutput() {
+    return Dequantize<int8_t>(ExtractVector<int8_t>(output_), GetScale(output_),
+                              GetZeroPoint(output_));
+  }
+};
+
+TEST_P(ConvolutionOpTest, SimpleTest) {
+  PerChannelQuantizedConvolutionOpModel m(
+      GetRegistration(), {TensorType_INT8, {1, 2, 3, 2}, -63.5, 64, 0.5, -1},
+      {TensorType_INT8,
+       // [2 * 2 * 2 * 2] as [output_channel, y, x, input_channel]
+       {2, 2, 2, 2},
+       0,
+       0,
+       0,
+       0,
+       /*per_channel=*/true,
+       /*per_channel_scales=*/{1, 2},
+       /*per_channel_zeros=*/{0, 0},
+       /*channel_index=*/0},
+      {TensorType_INT8, {}, -63.5, 64, 0.5, -1},
+      /*stride_width=*/1, /*stride_height=*/1);
+  m.SetInput({
+      // [1 * 2 * 3 * 2] as [batch, y, x, input_channel]
+      3, 2,    // batch = 0, y = 0, x = 0
+      1, -1,   // batch = 0, y = 0, x = 1
+      -2, -3,  // batch = 0, y = 0, x = 2
+      4, 3,    // batch = 0, y = 1, x = 0
+      2, -2,   // batch = 0, y = 1, x = 1
+      -3, -4,  // batch = 0, y = 1, x = 2
+  });
+  m.SetFilter(
+      // [2 * 2 * 2 * 2] as [output_channel, y, x, input_channel]
+      {
+          1, 2,  // out channel = 0, y = 0, x = 0
+          3, 4,  // out channel = 0, y = 0, x = 1
+          3, 4,  // out channel = 0, y = 1, x = 0
+          5, 6,  // out channel = 0, y = 1, x = 1
+          7, 8,  // out channel = 1, y = 0, x = 0
+          5, 6,  // out channel = 1, y = 0, x = 1
+          3, 4,  // out channel = 1, y = 1, x = 0
+          1, 2,  // out channel = 1, y = 1, x = 1
+      });
+  m.SetBias({3, -2});
+
+  // Invoke and verify output.
+  // output has dimension [1 * 1 * 2 * 2] as [batch, y, x, output_channel]
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear({28.5, 64, -59.5, -46})));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({56, 127, -120, -93}));
+}
+
+INSTANTIATE_TEST_SUITE_P(
     ConvolutionOpTest, ConvolutionOpTest,
     ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
 
diff --git a/tensorflow/lite/kernels/depthwise_conv.cc b/tensorflow/lite/kernels/depthwise_conv.cc
index 3f4ae5087b267a62d4d4237a8f5f534ff346a493..a349b2790531a674be1faa40d928677a9144e265 100644
--- a/tensorflow/lite/kernels/depthwise_conv.cc
+++ b/tensorflow/lite/kernels/depthwise_conv.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
 #include "tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h"
 #include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h"
 #include "tensorflow/lite/kernels/internal/tensor.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
 #include "tensorflow/lite/kernels/op_macros.h"
@@ -58,6 +59,10 @@ struct OpData {
   // uint8_t these would be 0 and 255.
   int32_t output_activation_min;
   int32_t output_activation_max;
+
+  // Per channel output multiplier and shift.
+  std::vector<int32_t> per_channel_output_multiplier;
+  std::vector<int> per_channel_output_shift;
 };
 
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
@@ -99,14 +104,15 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
                     SizeOfDimension(filter, 3));
 
   const TfLiteType data_type = input->type;
-  TF_LITE_ENSURE(context,
-                 data_type == kTfLiteFloat32 || data_type == kTfLiteUInt8);
+  TF_LITE_ENSURE(context, data_type == kTfLiteFloat32 ||
+                              data_type == kTfLiteUInt8 ||
+                              data_type == kTfLiteInt8);
   TF_LITE_ENSURE_EQ(context, output->type, data_type);
   TF_LITE_ENSURE_EQ(context, filter->type, data_type);
 
   if (hasBias) {
     bias = GetInput(context, node, kBiasTensor);
-    if (data_type == kTfLiteUInt8) {
+    if (data_type == kTfLiteUInt8 || data_type == kTfLiteInt8) {
       TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteInt32);
       TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0);
     } else {
@@ -150,17 +156,25 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
                      filter_width, out_width);
 
   // Note that quantized inference requires that all tensors have their
-  // parameters set. This is usually done during quantized training.
+  // parameters set. This is usually done during quantized training or
+  // calibration.
   if (data_type != kTfLiteFloat32) {
-    double real_multiplier = 0.0;
-    TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
-        context, input, filter, bias, output, &real_multiplier));
-    int exponent;
-    QuantizeMultiplier(real_multiplier, &data->output_multiplier, &exponent);
-    data->output_shift = -exponent;
-    CalculateActivationRangeUint8(params->activation, output,
-                                  &data->output_activation_min,
-                                  &data->output_activation_max);
+    TF_LITE_ENSURE_EQ(context, filter->quantization.type,
+                      kTfLiteAffineQuantization);
+    const auto* affine_quantization =
+        reinterpret_cast<TfLiteAffineQuantization*>(
+            filter->quantization.params);
+    TF_LITE_ENSURE(context, affine_quantization);
+    TF_LITE_ENSURE(context, affine_quantization->scale);
+    const int number_channel = affine_quantization->scale->size;
+    data->per_channel_output_multiplier.resize(number_channel);
+    data->per_channel_output_shift.resize(number_channel);
+    TF_LITE_ENSURE_STATUS(tflite::PopulateConvolutionQuantizationParams(
+        context, input, filter, bias, output, params->activation,
+        &data->output_multiplier, &data->output_shift,
+        &data->output_activation_min, &data->output_activation_max,
+        data->per_channel_output_multiplier.data(),
+        data->per_channel_output_shift.data()));
   }
 
   TfLiteIntArray* outputSize = TfLiteIntArrayCreate(4);
@@ -250,6 +264,33 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
                  GetTensorData<uint8_t>(output));
 }
 
+void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
+                             TfLiteDepthwiseConvParams* params, OpData* data,
+                             const TfLiteTensor* input,
+                             const TfLiteTensor* filter,
+                             const TfLiteTensor* bias, TfLiteTensor* output) {
+  DepthwiseParams op_params;
+  op_params.padding_type = PaddingType::kSame;
+  op_params.padding_values.width = data->padding.width;
+  op_params.padding_values.height = data->padding.height;
+  op_params.stride_width = params->stride_width;
+  op_params.stride_height = params->stride_height;
+  op_params.dilation_width_factor = params->dilation_width_factor;
+  op_params.dilation_height_factor = params->dilation_height_factor;
+  op_params.depth_multiplier = params->depth_multiplier;
+  op_params.input_offset = input->params.zero_point;
+  op_params.weights_offset = 0;
+  op_params.output_offset = output->params.zero_point;
+
+  reference_integer_ops::DepthwiseConvPerChannel(
+      op_params, data->per_channel_output_multiplier.data(),
+      data->per_channel_output_shift.data(), GetTensorShape(input),
+      GetTensorData<int8>(input), GetTensorShape(filter),
+      GetTensorData<int8>(filter), GetTensorShape(bias),
+      GetTensorData<int32>(bias), GetTensorShape(output),
+      GetTensorData<int8>(output));
+}
+
 template <KernelType kernel_type>
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   auto* params =
@@ -273,6 +314,11 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       EvalQuantized<kernel_type>(context, node, params, data, input, filter,
                                  bias, output);
       break;
+    case kTfLiteInt8: {
+      EvalQuantizedPerChannel(context, node, params, data, input, filter, bias,
+                              output);
+      break;
+    }
     default:
       context->ReportError(context, "Type %d not currently supported.",
                            input->type);
diff --git a/tensorflow/lite/kernels/depthwise_conv_test.cc b/tensorflow/lite/kernels/depthwise_conv_test.cc
index d924e6f700781e4aceef3d8554ed3d88d17ed774..5dc513262b2d71aaab458a815aacba473d788859 100644
--- a/tensorflow/lite/kernels/depthwise_conv_test.cc
+++ b/tensorflow/lite/kernels/depthwise_conv_test.cc
@@ -56,9 +56,35 @@ class BaseDepthwiseConvolutionOpModel : public SingleOpModel {
       // This is a quantized version. The scale of 'bias' depends on the scales
       // of input and filter. Supposedly this is correctly set during quantized
       // training.
-      auto bias_scale = GetScale(input_) * GetScale(filter_);
-      TensorData bias{TensorType_INT32, {bias_size}, 0, 0, bias_scale};
-      bias_ = AddInput(bias);
+      if (filter.per_channel_quantization) {
+        // per channel quantization.
+        std::vector<float> bias_scale(
+            filter.per_channel_quantization_scales.size());
+        std::vector<int64_t> bias_zero_points(
+            filter.per_channel_quantization_scales.size());
+        for (int i = 0; i < filter.per_channel_quantization_scales.size();
+             ++i) {
+          bias_scale[i] =
+              input.scale * filter.per_channel_quantization_scales[i];
+          bias_zero_points[i] = 0;
+        }
+        TensorData bias{TensorType_INT32,
+                        {bias_size},
+                        /*min=*/0,
+                        /*max=*/0,
+                        /*scale=*/0,
+                        /*zero_point=*/0,
+                        true,
+                        /*per_channel_scale=*/bias_scale,
+                        /*per_channel_zero_point=*/bias_zero_points,
+                        /*channel_index==*/0};
+        bias_ = AddInput(bias);
+      } else {
+        // per tensor quantization.
+        auto bias_scale = GetScale(input_) * GetScale(filter_);
+        TensorData bias{TensorType_INT32, {bias_size}, 0, 0, bias_scale};
+        bias_ = AddInput(bias);
+      }
     }
 
     output_ = AddOutput(output);
@@ -437,11 +463,81 @@ TEST_P(QuantizedDepthwiseConvolutionOpTest, SimpleDilatedTestPaddingSame) {
               ElementsAreArray({4, 7, 3, 6, 10, 4, 2, 3, 1}));
 }
 
-INSTANTIATE_TEST_CASE_P(
+class PerChannelQuantizedDepthwiseConvolutionOpModel
+    : public BaseDepthwiseConvolutionOpModel {
+ public:
+  using BaseDepthwiseConvolutionOpModel::BaseDepthwiseConvolutionOpModel;
+
+  void SetInput(std::initializer_list<float> data) {
+    QuantizeAndPopulate<int8_t>(input_, data);
+  }
+
+  void SetFilter(std::initializer_list<float> data) {
+    PerChannelSymmetricQuantizeAndPopulate(filter_, data);
+  }
+
+  void SetBias(std::initializer_list<float> data) {
+    PerChannelQuantizeBias(bias_, data);
+  }
+
+  std::vector<int8_t> GetOutput() { return ExtractVector<int8_t>(output_); }
+  std::vector<float> GetDequantizedOutput() {
+    return Dequantize<int8_t>(ExtractVector<int8_t>(output_), GetScale(output_),
+                              GetZeroPoint(output_));
+  }
+};
+
+TEST_P(QuantizedDepthwiseConvolutionOpTest, SimpleTest) {
+  PerChannelQuantizedDepthwiseConvolutionOpModel m(
+      GetRegistration(), {TensorType_INT8, {1, 2, 3, 2}, -63.5, 64, 0.5, -1},
+      {TensorType_INT8,
+       // [1 * 2 * 2 * 4] as [input_channel, y, x, output_channel]
+       {1, 2, 2, 4},
+       0,
+       0,
+       0,
+       0,
+       /*per_channel=*/true,
+       /*per_channel_scales=*/{1, 2, 3, 4},
+       /*per_channel_zeros=*/{0, 0, 0, 0},
+       /*channel_index=*/3},
+      {TensorType_INT8, {}, -63.5, 64, 0.5, -1}, Padding_VALID);
+  m.SetInput({
+      // [1 * 2 * 3 * 2] as [batch, y, x, input_channel]
+      3, 2,    // batch = 0, y = 0, x = 0
+      1, -1,   // batch = 0, y = 0, x = 1
+      -2, -3,  // batch = 0, y = 0, x = 2
+      4, 3,    // batch = 0, y = 1, x = 0
+      2, -2,   // batch = 0, y = 1, x = 1
+      -3, -4,  // batch = 0, y = 1, x = 2
+  });
+  m.SetFilter(
+      /*filter data*/
+      {
+          // [1 * 2 * 2 * 4] as [input_channel, y, x, output_channel]
+          // depth multiplier = 2
+          1, 2, 3, 4,  // y = 0, x = 0
+          3, 4, 5, 6,  // y = 0, x = 1
+          7, 8, 5, 6,  // y = 1, x = 0
+          3, 4, 1, 2,  // y = 1, x = 1
+      });
+  m.SetBias({3, -2, 4, 6});
+
+  // Invoke and verify output.
+  // output has dimension [1 * 1 * 2 * 4] as [batch, y, x, output_channel]
+  m.Invoke();
+  EXPECT_THAT(
+      m.GetDequantizedOutput(),
+      ElementsAreArray(ArrayFloatNear({40.5, 48, 27, 40, 0.5, -4, -24, -36})));
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray({80, 95, 53, 79, 0, -9, -49, -73}));
+}
+
+INSTANTIATE_TEST_SUITE_P(
     DepthwiseConvolutionOpTest, DepthwiseConvolutionOpTest,
     ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     QuantizedDepthwiseConvolutionOpTest, QuantizedDepthwiseConvolutionOpTest,
     ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
 
diff --git a/tensorflow/lite/kernels/dequantize_test.cc b/tensorflow/lite/kernels/dequantize_test.cc
index bb5f1e74a8b0174209043e14af9c35db32bf14b5..77254335fbde0ff4246af00291ccfba9ec8b0acf 100644
--- a/tensorflow/lite/kernels/dequantize_test.cc
+++ b/tensorflow/lite/kernels/dequantize_test.cc
@@ -12,8 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include <cstdint>
+
 #include <gtest/gtest.h>
 #include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/internal/types.h"
 #include "tensorflow/lite/kernels/register.h"
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/model.h"
@@ -27,13 +30,7 @@ class DequantizeOpModel : public SingleOpModel {
  public:
   DequantizeOpModel(TensorType type, std::initializer_list<int> shape,
                     float scale, int32_t zero_point) {
-    TensorData input_tensor_data;
-    input_tensor_data.type = type;
-    input_tensor_data.shape = shape;
-    input_tensor_data.min = 0;
-    input_tensor_data.max = 0;
-    input_tensor_data.scale = scale;
-    input_tensor_data.zero_point = zero_point;
+    const TensorData input_tensor_data = {type, shape, 0, 0, scale, zero_point};
     input_ = AddInput(input_tensor_data);
     output_ = AddOutput({TensorType_FLOAT32, shape});
     SetBuiltinOp(BuiltinOperator_DEQUANTIZE, BuiltinOptions_DequantizeOptions,
@@ -58,7 +55,7 @@ TEST(DequantizeOpTest, UINT8) {
   // [-63.5, 64] -> scale=0.5 zero_point=127 for UINT8
   DequantizeOpModel m(TensorType_UINT8, {2, 5}, 0.5, 127);
 
-  m.SetInput<uint8>({0, 1, 2, 3, 4, 251, 252, 253, 254, 255});
+  m.SetInput<uint8_t>({0, 1, 2, 3, 4, 251, 252, 253, 254, 255});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(),
               ElementsAreArray(ArrayFloatNear(
@@ -69,7 +66,7 @@ TEST(DequantizeOpTest, INT8) {
   // [-63.5, 64] -> scale=0.5, zero_point=1 for INT8
   DequantizeOpModel m(TensorType_INT8, {2, 5}, 0.5, -1);
 
-  m.SetInput<int8>({-128, -127, -126, -125, -124, 123, 124, 125, 126, 127});
+  m.SetInput<int8_t>({-128, -127, -126, -125, -124, 123, 124, 125, 126, 127});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(),
               ElementsAreArray(ArrayFloatNear(
diff --git a/tensorflow/lite/kernels/detection_postprocess.cc b/tensorflow/lite/kernels/detection_postprocess.cc
index 84e2a0efb27c5e2381d76dba89ddf3445077576c..a0df4a10fa1bb2f5441c9a6bdf1b36b1fe05ada1 100644
--- a/tensorflow/lite/kernels/detection_postprocess.cc
+++ b/tensorflow/lite/kernels/detection_postprocess.cc
@@ -498,8 +498,9 @@ TfLiteStatus NonMaxSuppressionMultiClassRegularHelper(TfLiteContext* context,
     }
     // Perform non-maximal suppression on single class
     std::vector<int> selected;
-    NonMaxSuppressionSingleClassHelper(context, node, op_data, class_scores,
-                                       &selected, num_detections_per_class);
+    TF_LITE_ENSURE_STATUS(NonMaxSuppressionSingleClassHelper(
+        context, node, op_data, class_scores, &selected,
+        num_detections_per_class));
     // Add selected indices from non-max suppression of boxes in this class
     int output_index = size_of_sorted_indices;
     for (int selected_index : selected) {
@@ -614,8 +615,8 @@ TfLiteStatus NonMaxSuppressionMultiClassFastHelper(TfLiteContext* context,
   }
   // Perform non-maximal suppression on max scores
   std::vector<int> selected;
-  NonMaxSuppressionSingleClassHelper(context, node, op_data, max_scores,
-                                     &selected, op_data->max_detections);
+  TF_LITE_ENSURE_STATUS(NonMaxSuppressionSingleClassHelper(
+      context, node, op_data, max_scores, &selected, op_data->max_detections));
   // Allocate output tensors
   int output_box_index = 0;
   for (const auto& selected_index : selected) {
@@ -688,11 +689,11 @@ TfLiteStatus NonMaxSuppressionMultiClass(TfLiteContext* context,
       return kTfLiteError;
   }
   if (op_data->use_regular_non_max_suppression)
-    NonMaxSuppressionMultiClassRegularHelper(context, node, op_data,
-                                             GetTensorData<float>(scores));
+    TF_LITE_ENSURE_STATUS(NonMaxSuppressionMultiClassRegularHelper(
+        context, node, op_data, GetTensorData<float>(scores)));
   else
-    NonMaxSuppressionMultiClassFastHelper(context, node, op_data,
-                                          GetTensorData<float>(scores));
+    TF_LITE_ENSURE_STATUS(NonMaxSuppressionMultiClassFastHelper(
+        context, node, op_data, GetTensorData<float>(scores)));
 
   return kTfLiteOk;
 }
@@ -710,12 +711,12 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   // This fills in temporary decoded_boxes
   // by transforming input_box_encodings and input_anchors from
   // CenterSizeEncodings to BoxCornerEncoding
-  DecodeCenterSizeBoxes(context, node, op_data);
+  TF_LITE_ENSURE_STATUS(DecodeCenterSizeBoxes(context, node, op_data));
   // This fills in the output tensors
   // by choosing effective set of decoded boxes
   // based on Non Maximal Suppression, i.e. selecting
   // highest scoring non-overlapping boxes.
-  NonMaxSuppressionMultiClass(context, node, op_data);
+  TF_LITE_ENSURE_STATUS(NonMaxSuppressionMultiClass(context, node, op_data));
 
   return kTfLiteOk;
 }
diff --git a/tensorflow/lite/kernels/eigen_support.cc b/tensorflow/lite/kernels/eigen_support.cc
index bad5975a7c187cc4bdcd65721d397897ff2cf09d..e2a2c4aac9456dfae2e26d75d903c300e382b1d0 100644
--- a/tensorflow/lite/kernels/eigen_support.cc
+++ b/tensorflow/lite/kernels/eigen_support.cc
@@ -39,7 +39,7 @@ void SetEigenNbThreads(int threads) {
 #if defined(EIGEN_HAS_OPENMP)
   // The global Eigen thread count is only used when OpenMP is enabled. As this
   // call causes problems with tsan, make it only when OpenMP is available.
-  Eigen::setNbThreads(context->recommended_num_threads);
+  Eigen::setNbThreads(threads);
 #endif  // defined(EIGEN_HAS_OPENMP)
 }
 
diff --git a/tensorflow/lite/kernels/elementwise.cc b/tensorflow/lite/kernels/elementwise.cc
index a79388b900eb89b56a4d18f887dbe52e84fb123f..1cc188ae5f7bfe91bee48c60b692d9dca2b7cf0e 100644
--- a/tensorflow/lite/kernels/elementwise.cc
+++ b/tensorflow/lite/kernels/elementwise.cc
@@ -83,6 +83,10 @@ TfLiteStatus SinEval(TfLiteContext* context, TfLiteNode* node) {
   return EvalNumeric(context, node, std::sin);
 }
 
+TfLiteStatus CosEval(TfLiteContext* context, TfLiteNode* node) {
+  return EvalNumeric(context, node, std::cos);
+}
+
 TfLiteStatus LogEval(TfLiteContext* context, TfLiteNode* node) {
   return EvalNumeric(context, node, std::log);
 }
@@ -122,6 +126,14 @@ TfLiteRegistration* Register_SIN() {
   return &r;
 }
 
+TfLiteRegistration* Register_COS() {
+  static TfLiteRegistration r = {
+      /*init=*/nullptr, /*free=*/nullptr,
+      elementwise::GenericPrepare<elementwise::IsNumericSupportedType>,
+      elementwise::CosEval};
+  return &r;
+}
+
 TfLiteRegistration* Register_LOG() {
   static TfLiteRegistration r = {
       /*init=*/nullptr, /*free=*/nullptr,
diff --git a/tensorflow/lite/kernels/elementwise_test.cc b/tensorflow/lite/kernels/elementwise_test.cc
index 7d24320081257925508b2aa53503c1cf71d0e913..89f2a506f0cc00df021d8b5113174833df7e33cb 100644
--- a/tensorflow/lite/kernels/elementwise_test.cc
+++ b/tensorflow/lite/kernels/elementwise_test.cc
@@ -65,6 +65,15 @@ TEST(ElementWise, Sin) {
   EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({1, 1, 4, 1}));
 }
 
+TEST(ElementWise, Cos) {
+  ElementWiseOpFloatModel m(BuiltinOperator_COS, {1, 1, 4, 1});
+  m.PopulateTensor<float>(m.input(), {0, 3.1415926, -3.1415926, 1});
+  m.Invoke();
+  EXPECT_THAT(m.ExtractVector<float>(m.output()),
+              ElementsAreArray(ArrayFloatNear({1, -1, -1, 0.54030})));
+  EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({1, 1, 4, 1}));
+}
+
 TEST(ElementWise, Log) {
   ElementWiseOpFloatModel m(BuiltinOperator_LOG, {1, 1, 4, 1});
   m.PopulateTensor<float>(m.input(), {1, 3.1415926, 1, 1});
diff --git a/tensorflow/lite/kernels/embedding_lookup.cc b/tensorflow/lite/kernels/embedding_lookup.cc
index fad32607b4980ce5d0e6b6a8540adf3b19529403..3f1d62389f470744d1628cf586d486059b0582fc 100644
--- a/tensorflow/lite/kernels/embedding_lookup.cc
+++ b/tensorflow/lite/kernels/embedding_lookup.cc
@@ -117,7 +117,12 @@ TfLiteStatus EvalHybrid(TfLiteContext* context, TfLiteNode* node,
       // TODO(alanchiao): refactor scalar multiply into separate function
       // for ease of adding a neon equivalent if ever necessary.
       for (int j = 0; j < col_size; j++) {
-        const int8_t* value_ptr = reinterpret_cast<int8_t*>(value->data.uint8);
+        const int8_t* value_ptr;
+        if (value->type == kTfLiteUInt8) {
+          value_ptr = reinterpret_cast<int8_t*>(value->data.uint8);
+        } else {
+          value_ptr = value->data.int8;
+        }
         output->data.f[j + i * col_size] =
             value_ptr[j + idx * col_size] * scaling_factor;
       }
@@ -135,6 +140,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     case kTfLiteFloat32:
       return EvalFloat(context, node, lookup, value, output);
     case kTfLiteUInt8:
+    case kTfLiteInt8:
       return EvalHybrid(context, node, lookup, value, output);
     default:
       context->ReportError(context, "Type not currently supported.");
diff --git a/tensorflow/lite/kernels/embedding_lookup_test.cc b/tensorflow/lite/kernels/embedding_lookup_test.cc
index 8ea98a5f0dcbfbcec826c0b9dee0d28cd0bd2885..2462ff26933ef645769e87ca6b6a1eb8a650b662 100644
--- a/tensorflow/lite/kernels/embedding_lookup_test.cc
+++ b/tensorflow/lite/kernels/embedding_lookup_test.cc
@@ -28,6 +28,8 @@ License.
 namespace tflite {
 namespace {
 
+float kTestTolerance = 7.41e-03;
+
 using ::testing::ElementsAreArray;
 
 class BaseEmbeddingLookupOpModel : public SingleOpModel {
@@ -76,13 +78,17 @@ class EmbeddingLookupOpModel : public BaseEmbeddingLookupOpModel {
 class HybridEmbeddingLookupOpModel : public BaseEmbeddingLookupOpModel {
  public:
   HybridEmbeddingLookupOpModel(std::initializer_list<int> index_shape,
-                               std::initializer_list<int> weight_shape)
-      : BaseEmbeddingLookupOpModel(index_shape, weight_shape,
-                                   TensorType_UINT8) {}
+                               std::initializer_list<int> weight_shape,
+                               TensorType type)
+      : BaseEmbeddingLookupOpModel(index_shape, weight_shape, type) {}
 
   void SetWeight(std::initializer_list<float> data) {
     SymmetricQuantizeAndPopulate(weight_, data);
   }
+
+  void SetSignedWeight(std::initializer_list<float> data) {
+    SignedSymmetricQuantizeAndPopulate(weight_, data);
+  }
 };
 
 // TODO(ahentz): write more tests that exercise the details of the op, such as
@@ -103,8 +109,8 @@ TEST(EmbeddingLookupOpTest, SimpleTest) {
               })));
 }
 
-TEST(HybridEmbeddingLookupHybridOpTest, Simple2DTest) {
-  HybridEmbeddingLookupOpModel m({3}, {3, 8});
+TEST(HybridEmbeddingLookupHybridOpTest, Simple2DTestUint8) {
+  HybridEmbeddingLookupOpModel m({3}, {3, 8}, TensorType_UINT8);
   m.SetInput({1, 0, 2});
   m.SetWeight({
       0.00, 0.01,  0.02, 0.03, 0.10, 0.11, 0.12, 0.13,  // Row 0
@@ -121,11 +127,11 @@ TEST(HybridEmbeddingLookupHybridOpTest, Simple2DTest) {
                       0.00, 0.01,  0.02, 0.03, 0.10, 0.11, 0.12, 0.13,  // Row 0
                       2.00, 2.01,  2.02, 2.03, 2.10, 2.11, 2.12, 2.13,  // Row 2
                   },
-                  7.41e-03)));
+                  kTestTolerance)));
 }
 
-TEST(HybridEmbeddingLookupHybridOpTest, Simple3DTest) {
-  HybridEmbeddingLookupOpModel m({3}, {3, 2, 4});
+TEST(HybridEmbeddingLookupHybridOpTest, Simple3DTestUint8) {
+  HybridEmbeddingLookupOpModel m({3}, {3, 2, 4}, TensorType_UINT8);
   m.SetInput({1, 0, 2});
   m.SetWeight({
       0.00, 0.01,  0.02, 0.03, 0.10, 0.11, 0.12, 0.13,  // Row 0
@@ -142,11 +148,11 @@ TEST(HybridEmbeddingLookupHybridOpTest, Simple3DTest) {
                       0.00, 0.01,  0.02, 0.03, 0.10, 0.11, 0.12, 0.13,  // Row 0
                       2.00, 2.01,  2.02, 2.03, 2.10, 2.11, 2.12, 2.13,  // Row 2
                   },
-                  7.41e-03)));
+                  kTestTolerance)));
 }
 
-TEST(HybridEmbeddingLookupHybridOpTest, Simple4DTest) {
-  HybridEmbeddingLookupOpModel m({3}, {3, 2, 2, 2});
+TEST(HybridEmbeddingLookupHybridOpTest, Simple4DTestUint8) {
+  HybridEmbeddingLookupOpModel m({3}, {3, 2, 2, 2}, TensorType_UINT8);
   m.SetInput({1, 0, 2});
   m.SetWeight({
       0.00, 0.01,  0.02, 0.03, 0.10, 0.11, 0.12, 0.13,  // Row 0
@@ -163,7 +169,70 @@ TEST(HybridEmbeddingLookupHybridOpTest, Simple4DTest) {
                       0.00, 0.01,  0.02, 0.03, 0.10, 0.11, 0.12, 0.13,  // Row 0
                       2.00, 2.01,  2.02, 2.03, 2.10, 2.11, 2.12, 2.13,  // Row 2
                   },
-                  7.41e-03)));
+                  kTestTolerance)));
+}
+
+TEST(HybridEmbeddingLookupHybridOpTest, Simple2DTestInt8) {
+  HybridEmbeddingLookupOpModel m({3}, {3, 8}, TensorType_INT8);
+  m.SetInput({1, 0, 2});
+  m.SetSignedWeight({
+      0.00, 0.01,  0.02, 0.03, 0.10, 0.11, 0.12, 0.13,  // Row 0
+      1.00, -1.01, 1.02, 1.03, 1.10, 1.11, 1.12, 1.13,  // Row 1
+      2.00, 2.01,  2.02, 2.03, 2.10, 2.11, 2.12, 2.13,  // Row 2
+  });
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      1.00, -1.01, 1.02, 1.03, 1.10, 1.11, 1.12, 1.13,  // Row 1
+                      0.00, 0.01,  0.02, 0.03, 0.10, 0.11, 0.12, 0.13,  // Row 0
+                      2.00, 2.01,  2.02, 2.03, 2.10, 2.11, 2.12, 2.13,  // Row 2
+                  },
+                  kTestTolerance)));
+}
+
+TEST(HybridEmbeddingLookupHybridOpTest, Simple3DTestInt8) {
+  HybridEmbeddingLookupOpModel m({3}, {3, 2, 4}, TensorType_INT8);
+  m.SetInput({1, 0, 2});
+  m.SetSignedWeight({
+      0.00, 0.01,  0.02, 0.03, 0.10, 0.11, 0.12, 0.13,  // Row 0
+      1.00, -1.01, 1.02, 1.03, 1.10, 1.11, 1.12, 1.13,  // Row 1
+      2.00, 2.01,  2.02, 2.03, 2.10, 2.11, 2.12, 2.13,  // Row 2
+  });
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      1.00, -1.01, 1.02, 1.03, 1.10, 1.11, 1.12, 1.13,  // Row 1
+                      0.00, 0.01,  0.02, 0.03, 0.10, 0.11, 0.12, 0.13,  // Row 0
+                      2.00, 2.01,  2.02, 2.03, 2.10, 2.11, 2.12, 2.13,  // Row 2
+                  },
+                  kTestTolerance)));
+}
+
+TEST(HybridEmbeddingLookupHybridOpTest, Simple4DTestInt8) {
+  HybridEmbeddingLookupOpModel m({3}, {3, 2, 2, 2}, TensorType_INT8);
+  m.SetInput({1, 0, 2});
+  m.SetSignedWeight({
+      0.00, 0.01,  0.02, 0.03, 0.10, 0.11, 0.12, 0.13,  // Row 0
+      1.00, -1.01, 1.02, 1.03, 1.10, 1.11, 1.12, 1.13,  // Row 1
+      2.00, 2.01,  2.02, 2.03, 2.10, 2.11, 2.12, 2.13,  // Row 2
+  });
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      1.00, -1.01, 1.02, 1.03, 1.10, 1.11, 1.12, 1.13,  // Row 1
+                      0.00, 0.01,  0.02, 0.03, 0.10, 0.11, 0.12, 0.13,  // Row 0
+                      2.00, 2.01,  2.02, 2.03, 2.10, 2.11, 2.12, 2.13,  // Row 2
+                  },
+                  kTestTolerance)));
 }
 
 }  // namespace
diff --git a/tensorflow/lite/kernels/floor.cc b/tensorflow/lite/kernels/floor.cc
index aa117e3cacfc4624d347ba812e23801c223bae7b..b6ccce3b938ed7b7a540b872daaea1459ca59e85 100644
--- a/tensorflow/lite/kernels/floor.cc
+++ b/tensorflow/lite/kernels/floor.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
 #include "tensorflow/lite/kernels/internal/tensor.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
 
@@ -26,6 +27,11 @@ namespace floor {
 constexpr int kInputTensor = 0;
 constexpr int kOutputTensor = 0;
 
+enum KernelType {
+  kReference,
+  kGenericOptimized,
+};
+
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
@@ -37,20 +43,34 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   return context->ResizeTensor(context, output, output_size);
 }
 
+template <KernelType type>
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
-  optimized_ops::Floor(GetTensorShape(input), GetTensorData<float>(input),
-                       GetTensorShape(output), GetTensorData<float>(output));
+  if (type == kGenericOptimized) {
+    optimized_ops::Floor(GetTensorShape(input), GetTensorData<float>(input),
+                         GetTensorShape(output), GetTensorData<float>(output));
+  } else {
+    reference_ops::Floor(GetTensorShape(input), GetTensorData<float>(input),
+                         GetTensorShape(output), GetTensorData<float>(output));
+  }
 
   return kTfLiteOk;
 }
 }  // namespace floor
 
+TfLiteRegistration* Register_FLOOR_REF() {
+  static TfLiteRegistration r = {/*init=*/nullptr,
+                                 /*free=*/nullptr, floor::Prepare,
+                                 floor::Eval<floor::kReference>};
+  return &r;
+}
+
 TfLiteRegistration* Register_FLOOR() {
   static TfLiteRegistration r = {/*init=*/nullptr,
-                                 /*free=*/nullptr, floor::Prepare, floor::Eval};
+                                 /*free=*/nullptr, floor::Prepare,
+                                 floor::Eval<floor::kGenericOptimized>};
   return &r;
 }
 
diff --git a/tensorflow/lite/kernels/fully_connected.cc b/tensorflow/lite/kernels/fully_connected.cc
index a1eecb284ab647e8b7fc7b18dfd8ad82aedeece3..7ed77c428dcb79dbbc3a36f3c2e55d2ae66ae4d5 100644
--- a/tensorflow/lite/kernels/fully_connected.cc
+++ b/tensorflow/lite/kernels/fully_connected.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/lite/kernels/gemm_support.h"
 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h"
 #include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
 #include "tensorflow/lite/kernels/internal/tensor.h"
 #include "tensorflow/lite/kernels/internal/tensor_utils.h"
@@ -132,13 +133,14 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   // If we have to perform on-the-fly quantization (with quantized weights and
   // float inputs) first we need to quantize the inputs. Allocate a temporary
   // buffer to store the intermediate quantized values.
-  if (input->type == kTfLiteFloat32 && filter->type == kTfLiteUInt8) {
+  if (input->type == kTfLiteFloat32 &&
+      (filter->type == kTfLiteUInt8 || filter->type == kTfLiteInt8)) {
     TfLiteIntArrayFree(node->temporaries);
     node->temporaries = TfLiteIntArrayCreate(2);
     node->temporaries->data[0] = data->scratch_tensor_index;
 
     TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/0);
-    input_quantized->type = kTfLiteUInt8;
+    input_quantized->type = filter->type;
     input_quantized->allocation_type = kTfLiteArenaRw;
 
     // TODO(raziel): add this logic to ResizeTensor.
@@ -209,8 +211,11 @@ TfLiteStatus EvalHybrid(TfLiteContext* context, TfLiteNode* node,
                         TfLiteTensor* scaling_factors, TfLiteTensor* output) {
   // Check the types for this hybrid Op.
   TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
-  TF_LITE_ENSURE_EQ(context, filter->type, kTfLiteUInt8);
-  TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteFloat32);
+  TF_LITE_ENSURE(context,
+                 filter->type == kTfLiteUInt8 || filter->type == kTfLiteInt8);
+  if (bias) {
+    TF_LITE_ENSURE_EQ(context, bias->type, kTfLiteFloat32);
+  }
   TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32);
 
   int total_input_size = 1;
@@ -241,7 +246,15 @@ TfLiteStatus EvalHybrid(TfLiteContext* context, TfLiteNode* node,
   // Quantize input from float to uint8 + quantization params (scaling factor).
   float unused_min, unused_max;
   float* scaling_factors_ptr = scaling_factors->data.f;
-  int8_t* quant_data = reinterpret_cast<int8_t*>(input_quantized->data.uint8);
+  int8_t* quant_data;
+  int8_t* filter_data;
+  if (filter->type == kTfLiteUInt8) {
+    quant_data = reinterpret_cast<int8_t*>(input_quantized->data.uint8);
+    filter_data = reinterpret_cast<int8_t*>(filter->data.uint8);
+  } else {
+    quant_data = input_quantized->data.int8;
+    filter_data = filter->data.int8;
+  }
 
   // Quantize each batch independently.
   for (int b = 0; b < batch_size; ++b) {
@@ -255,8 +268,8 @@ TfLiteStatus EvalHybrid(TfLiteContext* context, TfLiteNode* node,
 
   // Compute output += weight * quantized_input
   tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-      reinterpret_cast<int8_t*>(filter->data.uint8), num_units, input_size,
-      quant_data, scaling_factors_ptr, batch_size, output->data.f,
+      filter_data, num_units, input_size, quant_data, scaling_factors_ptr,
+      batch_size, output->data.f,
       /*result_stride=*/1);
 
   // Apply activation function to floats.
@@ -276,6 +289,27 @@ TfLiteStatus EvalHybrid(TfLiteContext* context, TfLiteNode* node,
     macro_name(target_namespace, kRelu6);                            \
   }
 
+namespace {
+void FullyConnectedInt8(const OpData* data, const TfLiteTensor* input,
+                        const TfLiteTensor* filter, const TfLiteTensor* bias,
+                        TfLiteTensor* output,
+                        gemmlowp::GemmContext* gemm_context) {
+  FullyConnectedParams op_params;
+  op_params.input_offset = -input->params.zero_point;
+  op_params.weights_offset = -filter->params.zero_point;
+  op_params.output_offset = output->params.zero_point;
+  op_params.output_multiplier = data->output_multiplier;
+  op_params.output_shift = -data->output_shift;
+  op_params.quantized_activation_min = data->output_activation_min;
+  op_params.quantized_activation_max = data->output_activation_max;
+  reference_integer_ops::FullyConnected(
+      op_params, GetTensorShape(input), GetTensorData<int8_t>(input),
+      GetTensorShape(filter), GetTensorData<int8_t>(filter),
+      GetTensorShape(bias), GetTensorData<int32_t>(bias),
+      GetTensorShape(output), GetTensorData<int8_t>(output), gemm_context);
+}
+}  // namespace
+
 template <KernelType kernel_type>
 TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
                            TfLiteFullyConnectedParams* params, OpData* data,
@@ -309,6 +343,9 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
       case kTfLiteUInt8:
         TF_LITE_FULLY_CONNECTED(reference_ops, uint8_t);
         break;
+      case kTfLiteInt8:
+        FullyConnectedInt8(data, input, filter, bias, output, gemm_context);
+        break;
       case kTfLiteInt16:
         TF_LITE_FULLY_CONNECTED(reference_ops, int16_t);
         break;
@@ -329,6 +366,9 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
       case kTfLiteUInt8:
         TF_LITE_FULLY_CONNECTED(optimized_ops, uint8_t);
         break;
+      case kTfLiteInt8:
+        FullyConnectedInt8(data, input, filter, bias, output, gemm_context);
+        break;
       case kTfLiteInt16:
         TF_LITE_FULLY_CONNECTED(optimized_ops, int16_t);
         break;
@@ -452,6 +492,15 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
                              "Unhandled fully-connected weights format");
         return kTfLiteError;
       }
+    case kTfLiteInt8:
+      if (params->weights_format == kTfLiteFullyConnectedWeightsFormatDefault) {
+        return EvalQuantized<kernel_type>(context, node, params, data, input,
+                                          filter, bias, output);
+      } else {
+        context->ReportError(context,
+                             "Unhandled fully-connected weights format");
+        return kTfLiteError;
+      }
     default:
       context->ReportError(context, "Type %d not currently supported.",
                            filter->type);
diff --git a/tensorflow/lite/kernels/fully_connected_test.cc b/tensorflow/lite/kernels/fully_connected_test.cc
index 3351a30b123b12751f2411f71037f2ecfb1d4b43..ae8e2ac35db4d9aedfda664902b1423ba3c2de7a 100644
--- a/tensorflow/lite/kernels/fully_connected_test.cc
+++ b/tensorflow/lite/kernels/fully_connected_test.cc
@@ -137,6 +137,7 @@ class BaseFullyConnectedOpModel : public SingleOpModel {
   BaseFullyConnectedOpModel(
       TfLiteRegistration* registration, int units, int batches,
       const TensorData& input, const TensorData& output = {TensorType_FLOAT32},
+      bool bias_tensor_optional = false,
       ActivationFunctionType activation_func = ActivationFunctionType_RELU,
       FullyConnectedOptionsWeightsFormat weights_format =
           FullyConnectedOptionsWeightsFormat_DEFAULT)
@@ -151,7 +152,9 @@ class BaseFullyConnectedOpModel : public SingleOpModel {
     weights_ =
         AddInput({input.type, {units_, input_size_}, input.min, input.max});
 
-    if (input.type == TensorType_FLOAT32) {
+    if (bias_tensor_optional) {
+      bias_ = AddNullInput();
+    } else if (input.type == TensorType_FLOAT32) {
       bias_ = AddInput({TensorType_FLOAT32, {units_}});
     } else {
       // This is a quantized version. The scale of 'bias' depends on the scales
@@ -173,7 +176,9 @@ class BaseFullyConnectedOpModel : public SingleOpModel {
             .Union());
     resolver_ = absl::make_unique<SingleOpResolver>(
         BuiltinOperator_FULLY_CONNECTED, registration);
-    BuildInterpreter({GetShape(input_), GetShape(weights_), GetShape(bias_)});
+    BuildInterpreter(
+        {GetShape(input_), GetShape(weights_),
+         (bias_ == kOptionalTensor) ? std::vector<int>() : GetShape(bias_)});
   }
 
   int input_size() { return input_size_; }
@@ -216,9 +221,12 @@ class QuantizedFullyConnectedOpModel : public BaseFullyConnectedOpModel {
   void SetBias(const std::vector<float>& data) {
     QuantizeAndPopulate<int32_t>(bias_, data);
   }
+  template <typename T>
   void SetWeights(const std::vector<float>& data) {
-    QuantizeAndPopulate<uint8_t>(weights_, data);
+    QuantizeAndPopulate<T>(weights_, data);
   }
+
+  template <typename T>
   void ShuffleAndSetWeights(const std::vector<float>& data, int input_depth,
                             int output_depth) {
     std::vector<float> shuffled_data(data.size());
@@ -237,15 +245,17 @@ class QuantizedFullyConnectedOpModel : public BaseFullyConnectedOpModel {
     }
     TfLiteTensor* t = interpreter_->tensor(weights_);
     auto quantized_data =
-        Quantize<uint8_t>(shuffled_data, t->params.scale, t->params.zero_point);
-    for (uint8_t& q : quantized_data) {
+        Quantize<T>(shuffled_data, t->params.scale, t->params.zero_point);
+    for (T& q : quantized_data) {
       q ^= 0x80;
     }
     PopulateTensor(weights_, 0, quantized_data.data(),
                    quantized_data.data() + quantized_data.size());
   }
+
+  template <typename T>
   void SetInput(const std::vector<float>& data) {
-    QuantizeAndPopulate<uint8_t>(input_, data);
+    QuantizeAndPopulate<T>(input_, data);
   }
 
   template <typename T>
@@ -296,6 +306,10 @@ class HybridFullyConnectedOpModel : public SingleOpModel {
     SymmetricQuantizeAndPopulate(weights_, data);
   }
 
+  void SetSignedWeights(std::initializer_list<float> f) {
+    SignedSymmetricQuantizeAndPopulate(weights_, f);
+  }
+
   void SetInput(const std::vector<float>& f) { PopulateTensor(input_, f); }
   std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
 
@@ -393,21 +407,42 @@ TEST_P(FloatFullyConnectedOpTest, SimpleTest2) {
   EXPECT_THAT(m.GetOutput(), ElementsAre(11, 9));
 }
 
-TEST_P(QuantizedFullyConnectedOpTest, SimpleTestQuantized) {
+TEST(FloatFullyConnectedOpTest, SimpleTestNoBias) {
+  // The optimized kernel assumes that the bias is specified.
+  FloatFullyConnectedOpModel m(ops::builtin::Register_FULLY_CONNECTED_PIE(),
+                               /*units=*/1, /*batches=*/2,
+                               /*input=*/{TensorType_FLOAT32, {2, 2}},
+                               /*output=*/{TensorType_FLOAT32},
+                               /*bias_tensor_optional=*/true);
+  m.SetWeights({
+      2, 4,  // u = 0
+  });
+
+  m.SetInput({
+      1, 2,  // b = 0
+      2, 1,  // b = 1
+  });
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(), ElementsAre(10, 8));
+}
+
+TEST_P(QuantizedFullyConnectedOpTest, SimpleTestQuantizedUint8) {
   QuantizedFullyConnectedOpModel m(
       GetRegistration(), /*units=*/3, /*batches*/ 2,
       /*input=*/{TensorType_UINT8, {2, 10}, -63.5, 64},
       /*output=*/{TensorType_UINT8, {}, -127, 128});
 
   // input_product_scale < output_scale was not true.
-  m.SetWeights({
+  m.SetWeights<uint8_t>({
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 2
   });
   m.SetBias({1, 2, 3});
 
-  m.SetInput({
+  m.SetInput<uint8_t>({
       1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
       1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
   });
@@ -423,22 +458,48 @@ TEST_P(QuantizedFullyConnectedOpTest, SimpleTestQuantized) {
               ElementsAre(151, 152, 153, 185, 186, 187));
 }
 
+TEST_P(QuantizedFullyConnectedOpTest, SimpleTestQuantizedInt8) {
+  QuantizedFullyConnectedOpModel m(
+      ops::builtin::Register_FULLY_CONNECTED_REF(), /*units=*/3, /*batches*/ 2,
+      /*input=*/{TensorType_INT8, {2, 10}, -63.5, 64},
+      /*output=*/{TensorType_INT8, {}, -127, 128});
+
+  // input_product_scale < output_scale was not true.
+  m.SetWeights<int8_t>({
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 2
+  });
+  m.SetBias({1, 2, 3});
+
+  m.SetInput<int8_t>({
+      1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
+      1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
+  });
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear({24, 25, 26, 58, 59, 60})));
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAre(23, 24, 25, 57, 58, 59));
+}
+
 TEST_P(QuantizedFullyConnectedOpTest,
-       SimpleTestQuantizedOutputMultiplierGreaterThan1) {
+       SimpleTestQuantizedOutputMultiplierGreaterThan1Uint8) {
   // real_multiplier = 2.
   QuantizedFullyConnectedOpModel m(
       GetRegistration(), /*units=*/3, /*batches*/ 2,
       /*input=*/{TensorType_UINT8, {2, 10}, -127, 128},
       /*output=*/{TensorType_UINT8, {}, -63.5, 64});
 
-  m.SetWeights({
+  m.SetWeights<uint8_t>({
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 2
   });
   m.SetBias({1, 2, 3});
 
-  m.SetInput({
+  m.SetInput<uint8_t>({
       1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
       1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
   });
@@ -454,6 +515,36 @@ TEST_P(QuantizedFullyConnectedOpTest,
               ElementsAre(175, 177, 179, 243, 245, 247));
 }
 
+TEST_P(QuantizedFullyConnectedOpTest,
+       SimpleTestQuantizedOutputMultiplierGreaterThan1Int8) {
+  // real_multiplier = 2.
+  QuantizedFullyConnectedOpModel m(
+      ops::builtin::Register_FULLY_CONNECTED_REF(), /*units=*/3, /*batches*/ 2,
+      /*input=*/{TensorType_INT8, {2, 10}, -127, 128},
+      /*output=*/{TensorType_INT8, {}, -63.5, 64});
+
+  m.SetWeights<int8_t>({
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 2
+  });
+  m.SetBias({1, 2, 3});
+
+  m.SetInput<int8_t>({
+      1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
+      1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
+  });
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear({
+                  24, 25, 26,  // first batch
+                  58, 59, 60,  // second batch
+              })));
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAre(47, 49, 51, 115, 117, 119));
+}
+
 void SimpleTestQuantizedInt16OutputCase(
     TfLiteRegistration* registration, int input_depth, int output_depth,
     int batches, FullyConnectedOptionsWeightsFormat weights_format) {
@@ -473,6 +564,7 @@ void SimpleTestQuantizedInt16OutputCase(
       /*input=*/
       {TensorType_UINT8, {batches, input_depth}, kInputMin, kInputMax},
       /*output=*/{TensorType_INT16, {}, kOutputMin, kOutputMax},
+      /*bias_tensor_optional=*/false,
       /*activation_func=*/ActivationFunctionType_NONE, weights_format);
 
   std::mt19937 random_engine;
@@ -488,7 +580,7 @@ void SimpleTestQuantizedInt16OutputCase(
   // and set the (possibly shuffled) weights.
   switch (weights_format) {
     case FullyConnectedOptionsWeightsFormat_DEFAULT:
-      m.SetWeights(weights_data);
+      m.SetWeights<uint8_t>(weights_data);
       break;
     case FullyConnectedOptionsWeightsFormat_SHUFFLED4x16INT8:
       // The shuffled path currently supports only a restrictive subset of
@@ -496,7 +588,7 @@ void SimpleTestQuantizedInt16OutputCase(
       CHECK_EQ(input_depth % 16, 0);
       CHECK_EQ(output_depth % 4, 0);
       CHECK(batches == 1 || batches == 4);
-      m.ShuffleAndSetWeights(weights_data, input_depth, output_depth);
+      m.ShuffleAndSetWeights<uint8_t>(weights_data, input_depth, output_depth);
       break;
     default:
       LOG(FATAL) << "Unhandled weights format";
@@ -518,7 +610,7 @@ void SimpleTestQuantizedInt16OutputCase(
   }
 
   m.SetBias(bias_data);
-  m.SetInput(input_data);
+  m.SetInput<uint8_t>(input_data);
 
   m.Invoke();
 
@@ -577,11 +669,11 @@ TEST_P(QuantizedFullyConnectedOpTest,
   }
 }
 
-TEST(HybridFullyConnectedOpTest, SimpleTestQuantized) {
+TEST(HybridFullyConnectedOpTest, SimpleTestQuantizedUint8) {
   HybridFullyConnectedOpModel m(
       /*units=*/3, /*batches=*/2,
       /*input=*/{TensorType_FLOAT32, {2, 10}},
-      /*weights=*/{TensorType_UINT8, {3, 10}, -63.5, 64});  // PIE
+      /*weights=*/{TensorType_UINT8, {3, 10}, -63.5, 64});  // Hybrid
 
   m.SetWeights({
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
@@ -605,6 +697,34 @@ TEST(HybridFullyConnectedOpTest, SimpleTestQuantized) {
                                  /*max_abs_error=*/1.3f)));
 }
 
+TEST(HybridFullyConnectedOpTest, SimpleTestQuantizedInt8) {
+  HybridFullyConnectedOpModel m(
+      /*units=*/3, /*batches=*/2,
+      /*input=*/{TensorType_FLOAT32, {2, 10}},
+      /*weights=*/{TensorType_INT8, {3, 10}, -63.5, 64});  // Hybrid
+
+  m.SetSignedWeights({
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
+  });
+  m.SetBias({1, 2, 3});
+
+  m.SetInput({
+      1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
+      1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
+  });
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(
+                                 {
+                                     24, 25, 26,  //
+                                     58, 59, 60,  //
+                                 },
+                                 /*max_abs_error=*/1.3f)));
+}
+
 TEST_P(FloatFullyConnectedOpTest, SimpleTest4DInput) {
   // Note that it is not required that the first dimension be the number of
   // batches. All we care is that the input can be evenly distributed in
@@ -632,21 +752,21 @@ TEST_P(FloatFullyConnectedOpTest, SimpleTest4DInput) {
                              }));
 }
 
-TEST_P(QuantizedFullyConnectedOpTest, SimpleTest4dInputQuantized) {
+TEST_P(QuantizedFullyConnectedOpTest, SimpleTest4dInputQuantizedUint8) {
   QuantizedFullyConnectedOpModel m(
       GetRegistration(), /*units=*/3, /*batches=*/2,
       /*input=*/{TensorType_UINT8, {4, 1, 5, 1}, -63.5, 64},
       /*output=*/{TensorType_UINT8, {}, -127, 128});
 
   // input_product_scale < output_scale was not true.
-  m.SetWeights({
+  m.SetWeights<uint8_t>({
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
   });
   m.SetBias({1, 2, 3});
 
-  m.SetInput({
+  m.SetInput<uint8_t>({
       1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
       1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
   });
@@ -663,21 +783,21 @@ TEST_P(QuantizedFullyConnectedOpTest, SimpleTest4dInputQuantized) {
 }
 
 TEST_P(QuantizedFullyConnectedOpTest,
-       SimpleTest4dInputQuantizedOutputMultiplierGreaterThan1) {
+       SimpleTest4dInputQuantizedOutputMultiplierGreaterThan1Uint8) {
   // real_multiplier = 2.
   QuantizedFullyConnectedOpModel m(
       GetRegistration(), /*units=*/3, /*batches=*/2,
       /*input=*/{TensorType_UINT8, {4, 1, 5, 1}, -127, 128},
       /*output=*/{TensorType_UINT8, {}, -63.5, 64});
 
-  m.SetWeights({
+  m.SetWeights<uint8_t>({
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
   });
   m.SetBias({1, 2, 3});
 
-  m.SetInput({
+  m.SetInput<uint8_t>({
       1, 2, 3, 4, 5, 6, 7, 8,  -9, -10,  // b = 0
       1, 2, 3, 4, 5, 6, 7, -8, 9,  -10,  // b = 1
   });
@@ -693,11 +813,11 @@ TEST_P(QuantizedFullyConnectedOpTest,
               ElementsAre(175, 177, 179, 243, 245, 247));
 }
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     FloatFullyConnectedOpTest, FloatFullyConnectedOpTest,
     ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     QuantizedFullyConnectedOpTest, QuantizedFullyConnectedOpTest,
     ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMapNoPie)));
 
diff --git a/tensorflow/lite/kernels/gather.cc b/tensorflow/lite/kernels/gather.cc
index f205daae1343cb0abecc95e7d1b280c10f55d897..54d05adbcf161a2af88bea4a0de1eec06e70c09a 100644
--- a/tensorflow/lite/kernels/gather.cc
+++ b/tensorflow/lite/kernels/gather.cc
@@ -57,6 +57,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   switch (input->type) {
     case kTfLiteFloat32:
     case kTfLiteUInt8:
+    case kTfLiteInt8:
     case kTfLiteInt64:
     case kTfLiteInt32:
       break;
@@ -135,6 +136,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
         return Gather<float, int32_t>(*params, input, positions, output);
       case kTfLiteUInt8:
         return Gather<uint8_t, int32_t>(*params, input, positions, output);
+      case kTfLiteInt8:
+        return Gather<int8_t, int32_t>(*params, input, positions, output);
       case kTfLiteInt32:
         return Gather<int32_t, int32_t>(*params, input, positions, output);
       case kTfLiteInt64:
@@ -153,6 +156,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
         return Gather<float, int64_t>(*params, input, positions, output);
       case kTfLiteUInt8:
         return Gather<uint8_t, int64_t>(*params, input, positions, output);
+      case kTfLiteInt8:
+        return Gather<int8_t, int64_t>(*params, input, positions, output);
       case kTfLiteInt32:
         return Gather<int32_t, int64_t>(*params, input, positions, output);
       case kTfLiteInt64:
diff --git a/tensorflow/lite/kernels/gather_nd.cc b/tensorflow/lite/kernels/gather_nd.cc
new file mode 100644
index 0000000000000000000000000000000000000000..20e98652ee57ec7b6b86a20cbc474b4b9c29b2aa
--- /dev/null
+++ b/tensorflow/lite/kernels/gather_nd.cc
@@ -0,0 +1,154 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace gather_nd {
+constexpr int kParams = 0;
+constexpr int kIndices = 1;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  const TfLiteTensor* params = GetInput(context, node, kParams);
+  const TfLiteTensor* indices = GetInput(context, node, kIndices);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  switch (params->type) {
+    case kTfLiteFloat32:
+    case kTfLiteUInt8:
+    case kTfLiteInt8:
+    case kTfLiteInt64:
+    case kTfLiteInt32:
+      break;
+    default:
+      context->ReportError(
+          context, "Params of type '%s' are not supported by gather_nd.",
+          TfLiteTypeGetName(params->type));
+      return kTfLiteError;
+  }
+  switch (indices->type) {
+    case kTfLiteInt64:
+    case kTfLiteInt32:
+      break;
+    default:
+      context->ReportError(
+          context, "Indices of type '%s' are not supported by gather_nd.",
+          TfLiteTypeGetName(indices->type));
+      return kTfLiteError;
+  }
+
+  const int params_rank = NumDimensions(params);
+  const int indices_rank = NumDimensions(indices);
+  const int indices_nd = SizeOfDimension(indices, indices_rank - 1);
+  if (params_rank < 1) {
+    context->ReportError(context, "Params must be at least a vector.");
+    return kTfLiteError;
+  }
+  if (indices_rank < 1) {
+    context->ReportError(context, "Indices must be at least a vector.");
+    return kTfLiteError;
+  }
+  if (indices_nd > params_rank) {
+    context->ReportError(
+        context, "Index innermost dimension length must be <= params rank.");
+    return kTfLiteError;
+  }
+
+  // Assign to output the input type.
+  output->type = params->type;
+
+  // The result shape is
+  // indices.shape[:-1] + params.shape[indices.shape[-1]:]
+  const int output_rank = indices_rank + params_rank - indices_nd - 1;
+  TfLiteIntArray* output_shape = TfLiteIntArrayCreate(output_rank);
+  int output_index = 0;
+  for (int i = 0; i < indices_rank - 1; ++i) {
+    output_shape->data[output_index++] = indices->dims->data[i];
+  }
+  for (int i = indices_nd; i < params_rank; ++i) {
+    output_shape->data[output_index++] = params->dims->data[i];
+  }
+  return context->ResizeTensor(context, output, output_shape);
+}
+
+template <typename ParamsT, typename IndicesT>
+TfLiteStatus GatherNd(const TfLiteTensor* params, const TfLiteTensor* indices,
+                      TfLiteTensor* output) {
+  reference_ops::GatherNd(
+      GetTensorShape(params), GetTensorData<ParamsT>(params),
+      GetTensorShape(indices), GetTensorData<IndicesT>(indices),
+      GetTensorShape(output), GetTensorData<ParamsT>(output));
+  return kTfLiteOk;
+}
+
+template <typename IndicesT>
+TfLiteStatus EvalGatherNd(TfLiteContext* context, const TfLiteTensor* params,
+                          const TfLiteTensor* indices, TfLiteTensor* output) {
+  switch (params->type) {
+    case kTfLiteFloat32:
+      return GatherNd<float, IndicesT>(params, indices, output);
+    case kTfLiteUInt8:
+      return GatherNd<uint8_t, IndicesT>(params, indices, output);
+    case kTfLiteInt8:
+      return GatherNd<int8_t, IndicesT>(params, indices, output);
+    case kTfLiteInt32:
+      return GatherNd<int32_t, IndicesT>(params, indices, output);
+    case kTfLiteInt64:
+      return GatherNd<int64_t, IndicesT>(params, indices, output);
+    default:
+      context->ReportError(context,
+                           "Params type '%s' are not supported by gather_nd.",
+                           TfLiteTypeGetName(params->type));
+      return kTfLiteError;
+  }
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* params = GetInput(context, node, kParams);
+  const TfLiteTensor* indices = GetInput(context, node, kIndices);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  switch (indices->type) {
+    case kTfLiteInt32:
+      return EvalGatherNd<int32_t>(context, params, indices, output);
+    case kTfLiteInt64:
+      return EvalGatherNd<int64_t>(context, params, indices, output);
+    default:
+      context->ReportError(
+          context, "Indices of type '%s' are not supported by gather_nd.",
+          TfLiteTypeGetName(indices->type));
+      return kTfLiteError;
+  }
+}
+}  // namespace gather_nd
+
+TfLiteRegistration* Register_GATHER_ND() {
+  static TfLiteRegistration r = {/*init*/ nullptr, /*free*/ nullptr,
+                                 gather_nd::Prepare, gather_nd::Eval};
+  return &r;
+}
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/gather_nd_test.cc b/tensorflow/lite/kernels/gather_nd_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a5e93efb8ff468f9e1cd6d2cd8c4343c0fe62e79
--- /dev/null
+++ b/tensorflow/lite/kernels/gather_nd_test.cc
@@ -0,0 +1,323 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class GatherNdOpModel : public SingleOpModel {
+ public:
+  GatherNdOpModel(const TensorData& params, const TensorData& indices) {
+    params_ = AddInput(params);
+    indices_ = AddInput(indices);
+    output_ = AddOutput(params.type);
+    SetBuiltinOp(BuiltinOperator_GATHER_ND, BuiltinOptions_GatherNdOptions,
+                 CreateGatherNdOptions(builder_).Union());
+    BuildInterpreter({GetShape(params_), GetShape(indices_)});
+  }
+
+  template <typename T>
+  void SetInput(std::initializer_list<T> data) {
+    PopulateTensor<T>(params_, data);
+  }
+
+  template <typename T>
+  void SetPositions(std::initializer_list<T> data) {
+    PopulateTensor<T>(indices_, data);
+  }
+
+  template <typename T>
+  std::vector<T> GetOutput() {
+    return ExtractVector<T>(output_);
+  }
+
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ protected:
+  int params_;
+  int indices_;
+  int output_;
+};
+
+TEST(GatherNdOpTest, ElementIndexingIntoMatrix) {
+  GatherNdOpModel m({TensorType_FLOAT32, {2, 2}}, {TensorType_INT32, {2, 2}});
+  m.SetInput<float>({1.1, 1.2, 2.1, 2.2});
+  m.SetPositions<int32_t>({0, 0, 1, 1});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray({1.1, 2.2}));
+}
+
+TEST(GatherNdOpTest, SliceIndexingIntoMatrix) {
+  GatherNdOpModel m({TensorType_FLOAT32, {2, 2}}, {TensorType_INT32, {2, 1}});
+  m.SetInput<float>({1.1, 1.2, 2.1, 2.2});
+  m.SetPositions<int32_t>({1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray({2.1, 2.2, 1.1, 1.2}));
+}
+
+TEST(GatherNdOpTest, BatchedIndexingIntoMatrix1) {
+  GatherNdOpModel m({TensorType_FLOAT32, {2, 2}},
+                    {TensorType_INT32, {2, 1, 1}});
+  m.SetInput<float>({1.1, 1.2, 2.1, 2.2});
+  m.SetPositions<int32_t>({1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray({2.1, 2.2, 1.1, 1.2}));
+}
+
+TEST(GatherNdOpTest, BatchedIndexingIntoMatrix2) {
+  GatherNdOpModel m({TensorType_FLOAT32, {2, 2}},
+                    {TensorType_INT32, {2, 1, 2}});
+  m.SetInput<float>({1.1, 1.2, 2.1, 2.2});
+  m.SetPositions<int32_t>({0, 0, 1, 1});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray({1.1, 2.2}));
+}
+
+TEST(GatherNdOpTest, DuplicateIndexingIntoMatrix) {
+  GatherNdOpModel m({TensorType_FLOAT32, {2, 2}}, {TensorType_INT32, {2, 2}});
+  m.SetInput<float>({1.1, 1.2, 2.1, 2.2});
+  m.SetPositions<int32_t>({0, 0, 0, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray({1.1, 1.1}));
+}
+
+TEST(GatherNdOpTest, ElementIndexingIntoRank3Tensor) {
+  GatherNdOpModel m({TensorType_FLOAT32, {3, 2, 3}},
+                    {TensorType_INT32, {1, 2, 3}});
+  m.SetInput<float>({1.1, -1.2, 1.3, -2.1, 2.2, 2.3,   //
+                     3.1, 3.2, -3.3, -4.1, -4.2, 4.3,  //
+                     5.1, -5.2, 5.3, 6.1, -6.2, 6.3});
+  m.SetPositions<int32_t>({0, 0, 1, 1, 1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray({-1.2, -4.1}));
+}
+
+TEST(GatherNdOpTest, SliceIndexingIntoRank3Tensor) {
+  GatherNdOpModel m({TensorType_FLOAT32, {3, 2, 3}},
+                    {TensorType_INT32, {2, 1}});
+  m.SetInput<float>({1.1, -1.2, 1.3, -2.1, 2.2, 2.3,   //
+                     3.1, 3.2, -3.3, -4.1, -4.2, 4.3,  //
+                     5.1, -5.2, 5.3, 6.1, -6.2, 6.3});
+  m.SetPositions<int32_t>({0, 2});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray({1.1, -1.2, 1.3, -2.1, 2.2, 2.3, 5.1, -5.2, 5.3,
+                                6.1, -6.2, 6.3}));
+}
+
+TEST(GatherNdOpTest, BatchedIndexingIntoRank3Tensor1) {
+  GatherNdOpModel m({TensorType_FLOAT32, {3, 2, 3}},
+                    {TensorType_INT32, {2, 1, 3}});
+  m.SetInput<float>({1.1, -1.2, 1.3, -2.1, 2.2, 2.3,   //
+                     3.1, 3.2, -3.3, -4.1, -4.2, 4.3,  //
+                     5.1, -5.2, 5.3, 6.1, -6.2, 6.3});
+  m.SetPositions<int32_t>({0, 0, 1, 1, 1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray({-1.2, -4.1}));
+}
+
+TEST(GatherNdOpTest, BatchedIndexingIntoRank3Tensor2) {
+  GatherNdOpModel m({TensorType_FLOAT32, {3, 2, 3}},
+                    {TensorType_INT32, {2, 1, 1}});
+  m.SetInput<float>({1.1, -1.2, 1.3, -2.1, 2.2, 2.3,   //
+                     3.1, 3.2, -3.3, -4.1, -4.2, 4.3,  //
+                     5.1, -5.2, 5.3, 6.1, -6.2, 6.3});
+  m.SetPositions<int32_t>({1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray({3.1, 3.2, -3.3, -4.1, -4.2, 4.3, 1.1, -1.2, 1.3,
+                                -2.1, 2.2, 2.3}));
+}
+
+TEST(GatherNdOpTest, BatchedIndexingIntoRank3Tensor3) {
+  GatherNdOpModel m({TensorType_FLOAT32, {3, 2, 3}},
+                    {TensorType_INT32, {2, 2, 2}});
+  m.SetInput<float>({1.1, -1.2, 1.3, -2.1, 2.2, 2.3,   //
+                     3.1, 3.2, -3.3, -4.1, -4.2, 4.3,  //
+                     5.1, -5.2, 5.3, 6.1, -6.2, 6.3});
+  m.SetPositions<int32_t>({0, 1, 1, 0, 0, 0, 2, 1});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray({-2.1, 2.2, 2.3, 3.1, 3.2, -3.3, 1.1, -1.2, 1.3,
+                                6.1, -6.2, 6.3}));
+}
+
+TEST(GatherNdOpTest, BatchedIndexingIntoRank3Tensor4) {
+  GatherNdOpModel m({TensorType_FLOAT32, {3, 2, 3}},
+                    {TensorType_INT32, {2, 2, 3}});
+  m.SetInput<float>({1.1, -1.2, 1.3, -2.1, 2.2, 2.3,   //
+                     3.1, 3.2, -3.3, -4.1, -4.2, 4.3,  //
+                     5.1, -5.2, 5.3, 6.1, -6.2, 6.3});
+  m.SetPositions<int32_t>({0, 0, 1, 1, 0, 1, 1, 1, 2, 2, 1, 2});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray({-1.2, 3.2, 4.3, 6.3}));
+}
+
+TEST(GatherNdOpTest, DuplicateIndexingIntoRank3Tensor) {
+  GatherNdOpModel m({TensorType_FLOAT32, {3, 2, 3}},
+                    {TensorType_INT32, {2, 2}});
+  m.SetInput<float>({1.1, -1.2, 1.3, -2.1, 2.2, 2.3,   //
+                     3.1, 3.2, -3.3, -4.1, -4.2, 4.3,  //
+                     5.1, -5.2, 5.3, 6.1, -6.2, 6.3});
+  m.SetPositions<int32_t>({0, 1, 0, 1});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray({-2.1, 2.2, 2.3, -2.1, 2.2, 2.3}));
+}
+
+TEST(GatherNdOpTest, Float32Int32) {
+  GatherNdOpModel m({TensorType_FLOAT32, {3, 2, 3}},
+                    {TensorType_INT32, {2, 2}});
+  m.SetInput<float>({1.1, -1.2, 1.3, -2.1, 2.2, 2.3,   //
+                     3.1, 3.2, -3.3, -4.1, -4.2, 4.3,  //
+                     5.1, -5.2, 5.3, 6.1, -6.2, 6.3});
+  m.SetPositions<int32_t>({0, 1, 1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray({-2.1, 2.2, 2.3, 3.1, 3.2, -3.3}));
+}
+
+TEST(GatherNdOpTest, Float32Int64) {
+  GatherNdOpModel m({TensorType_FLOAT32, {3, 2, 3}},
+                    {TensorType_INT64, {2, 2}});
+  m.SetInput<float>({1.1, -1.2, 1.3, -2.1, 2.2, 2.3,   //
+                     3.1, 3.2, -3.3, -4.1, -4.2, 4.3,  //
+                     5.1, -5.2, 5.3, 6.1, -6.2, 6.3});
+  m.SetPositions<int64_t>({0LL, 1LL, 1LL, 0LL});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray({-2.1, 2.2, 2.3, 3.1, 3.2, -3.3}));
+}
+
+TEST(GatherNdOpTest, Int32Int32) {
+  GatherNdOpModel m({TensorType_INT32, {3, 2, 3}}, {TensorType_INT32, {2, 2}});
+  m.SetInput<int32_t>({1, -1, 1, -2, 2, 2,   //
+                       3, 3, -3, -4, -4, 4,  //
+                       5, -5, 5, 6, -6, 6});
+  m.SetPositions<int32_t>({0, 1, 1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<int32_t>(), ElementsAreArray({-2, 2, 2, 3, 3, -3}));
+}
+
+TEST(GatherNdOpTest, Int32Int64) {
+  GatherNdOpModel m({TensorType_INT32, {3, 2, 3}}, {TensorType_INT64, {2, 2}});
+  m.SetInput<int32_t>({1, -1, 1, -2, 2, 2,   //
+                       3, 3, -3, -4, -4, 4,  //
+                       5, -5, 5, 6, -6, 6});
+  m.SetPositions<int64_t>({0LL, 1LL, 1LL, 0LL});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<int32_t>(), ElementsAreArray({-2, 2, 2, 3, 3, -3}));
+}
+
+TEST(GatherNdOpTest, Uint8Int32) {
+  GatherNdOpModel m({TensorType_UINT8, {3, 2, 3}}, {TensorType_INT32, {2, 2}});
+  m.SetInput<uint8_t>({1, 1, 1, 2, 2, 2,  //
+                       3, 3, 3, 4, 4, 4,  //
+                       5, 5, 5, 6, 6, 6});
+  m.SetPositions<int32_t>({0, 1, 1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<uint8_t>(), ElementsAreArray({2, 2, 2, 3, 3, 3}));
+}
+
+TEST(GatherNdOpTest, Uint8Int64) {
+  GatherNdOpModel m({TensorType_UINT8, {3, 2, 3}}, {TensorType_INT64, {2, 2}});
+  m.SetInput<uint8_t>({1, 1, 1, 2, 2, 2,  //
+                       3, 3, 3, 4, 4, 4,  //
+                       5, 5, 5, 6, 6, 6});
+  m.SetPositions<int64_t>({0, 1, 1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<uint8_t>(), ElementsAreArray({2, 2, 2, 3, 3, 3}));
+}
+
+TEST(GatherNdOpTest, Int8Int32) {
+  GatherNdOpModel m({TensorType_INT8, {3, 2, 3}}, {TensorType_INT32, {2, 2}});
+  m.SetInput<int8_t>({1, -1, 1, -2, 2, 2,   //
+                      3, 3, -3, -4, -4, 4,  //
+                      5, -5, 5, 6, -6, 6});
+  m.SetPositions<int32_t>({0, 1, 1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAreArray({-2, 2, 2, 3, 3, -3}));
+}
+
+TEST(GatherNdOpTest, Int8Int64) {
+  GatherNdOpModel m({TensorType_INT8, {3, 2, 3}}, {TensorType_INT64, {2, 2}});
+  m.SetInput<int8_t>({1, -1, 1, -2, 2, 2,   //
+                      3, 3, -3, -4, -4, 4,  //
+                      5, -5, 5, 6, -6, 6});
+  m.SetPositions<int64_t>({0LL, 1LL, 1LL, 0LL});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAreArray({-2, 2, 2, 3, 3, -3}));
+}
+
+TEST(GatherNdOpTest, Int64Int32) {
+  GatherNdOpModel m({TensorType_INT64, {3, 2, 3}}, {TensorType_INT32, {2, 2}});
+  m.SetInput<int64_t>({1LL, -1LL, 1LL, -2LL, 2LL, 2LL,   //
+                       3LL, 3LL, -3LL, -4LL, -4LL, 4LL,  //
+                       5LL, -5LL, 5LL, 6LL, -6LL, 6LL});
+  m.SetPositions<int32_t>({0, 1, 1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<int64_t>(),
+              ElementsAreArray({-2LL, 2LL, 2LL, 3LL, 3LL, -3LL}));
+}
+
+TEST(GatherNdOpTest, Int64Int64) {
+  GatherNdOpModel m({TensorType_INT64, {3, 2, 3}}, {TensorType_INT64, {2, 2}});
+  m.SetInput<int64_t>({1LL, -1LL, 1LL, -2LL, 2LL, 2LL,   //
+                       3LL, 3LL, -3LL, -4LL, -4LL, 4LL,  //
+                       5LL, -5LL, 5LL, 6LL, -6LL, 6LL});
+  m.SetPositions<int64_t>({0LL, 1LL, 1LL, 0LL});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<int64_t>(),
+              ElementsAreArray({-2LL, 2LL, 2LL, 3LL, 3LL, -3LL}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/kernels/gather_test.cc b/tensorflow/lite/kernels/gather_test.cc
index 7b5f84348903a3cc436f1bd6cf32b3175b2f5815..b5461c204f3f11cdcc6f54cdc1b0aff40265ce54 100644
--- a/tensorflow/lite/kernels/gather_test.cc
+++ b/tensorflow/lite/kernels/gather_test.cc
@@ -205,6 +205,24 @@ TEST(TypesGatherOpTest, Uint8Int64) {
   EXPECT_THAT(m.GetOutput<uint8_t>(), ElementsAreArray({14, 15, 133, 134}));
 }
 
+TEST(TypesGatherOpTest, Int8Int32) {
+  GatherOpModel m({TensorType_INT8, {2, 2}}, {TensorType_INT32, {2}});
+  m.SetInput<int8_t>({-13, -120, 14, 15});
+  m.SetPositions<int32_t>({1, 0});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAreArray({14, 15, -13, -120}));
+}
+
+TEST(TypesGatherOpTest, Int8Int64) {
+  GatherOpModel m({TensorType_INT8, {2, 2}}, {TensorType_INT64, {2}});
+  m.SetInput<int8_t>({-13, -120, 14, 15});
+  m.SetPositions<int64_t>({1LL, 0LL});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAreArray({14, 15, -13, -120}));
+}
+
 TEST(TypesGatherOpTest, Int64Int32) {
   GatherOpModel m({TensorType_INT64, {2, 2}}, {TensorType_INT32, {2}});
   m.SetInput<int64_t>({-(1LL << 34), 134LL, 14LL, 15LL});
diff --git a/tensorflow/lite/kernels/if.cc b/tensorflow/lite/kernels/if.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1bd394e980073b73674ca972d28fafe04f7b8adf
--- /dev/null
+++ b/tensorflow/lite/kernels/if.cc
@@ -0,0 +1,200 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/core/subgraph.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+
+namespace tflite {
+namespace ops {
+namespace custom {
+namespace if_kernel {
+
+struct OpData {
+  int then_subgraph_index;
+  int else_subgraph_index;
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* op_data = new OpData;
+  const uint8_t* buffer_t = reinterpret_cast<const uint8_t*>(buffer);
+  const flexbuffers::Map& m = flexbuffers::GetRoot(buffer_t, length).AsMap();
+  op_data->then_subgraph_index = m["then_subgraph_index"].AsInt32();
+  op_data->else_subgraph_index = m["else_subgraph_index"].AsInt32();
+  return op_data;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<OpData*>(buffer);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  const OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
+
+  TF_LITE_ENSURE(context, node->inputs->size > 0);
+
+  // The first input is the condition.
+  const TfLiteTensor* cond = GetInput(context, node, 0);
+  // Currently only bool is supported.
+  // TODO(ycling): Support other types since TensorFlow also support
+  // non-bool types as condition.
+  TF_LITE_ENSURE_EQ(context, cond->type, kTfLiteBool);
+  TF_LITE_ENSURE_EQ(context, NumElements(cond), 1);
+
+  // The first input of the node is the condition. The rest of inputs are
+  // passed to the branch subgraphs. Therefore, the number of subgraph inputs
+  // will be the number of node inputs - 1.
+  int num_inputs = node->inputs->size - 1;
+  int num_outputs = node->outputs->size;
+
+  Subgraph* this_subgraph = reinterpret_cast<Subgraph*>(context->impl_);
+  auto* subgraphs = this_subgraph->GetSubgraphs();
+  TF_LITE_ENSURE(context, op_data->then_subgraph_index < subgraphs->size());
+  TF_LITE_ENSURE(context, op_data->else_subgraph_index < subgraphs->size());
+
+  Subgraph* then_subgraph = (*subgraphs)[op_data->then_subgraph_index].get();
+  Subgraph* else_subgraph = (*subgraphs)[op_data->else_subgraph_index].get();
+
+  for (auto* subgraph : {then_subgraph, else_subgraph}) {
+    TF_LITE_ENSURE_EQ(context, num_inputs, subgraph->inputs().size());
+    TF_LITE_ENSURE_EQ(context, num_outputs, subgraph->outputs().size());
+  }
+
+  bool has_dynamic_output_tensors = false;
+  for (auto* subgraph : {then_subgraph, else_subgraph}) {
+    for (int i = 0; i < num_inputs; ++i) {
+      // The first input of the node is the condition. The indices of the inputs
+      // passed to the subgraphs are offset by 1.
+      const TfLiteTensor* input = GetInput(context, node, i + 1);
+      std::vector<int> dims(input->dims->data,
+                            input->dims->data + input->dims->size);
+      subgraph->ResizeInputTensor(i, dims);
+      TfLiteTensor* subgraph_input = subgraph->tensor(subgraph->inputs()[i]);
+      TF_LITE_ENSURE_EQ(context, input->type, subgraph_input->type);
+    }
+    // Note: The `Prepare` function is responsible to run `AllocateTensors` on
+    // both subgraphs. It's intentionally not to break out of the loop when
+    // finding a dynamic output tensor.
+    TF_LITE_ENSURE_OK(context, subgraph->AllocateTensors());
+    has_dynamic_output_tensors |= subgraph->HasDynamicTensors();
+  }
+
+  if (!has_dynamic_output_tensors) {
+    for (int i = 0; i < num_outputs; ++i) {
+      TfLiteTensor* then_output =
+          then_subgraph->tensor(then_subgraph->outputs()[i]);
+      TfLiteTensor* else_output =
+          else_subgraph->tensor(else_subgraph->outputs()[i]);
+      // If the 2 subgraphs have static but different output shapes, the output
+      // tensors of the IF op have dynamic sizes.
+      if (!TfLiteIntArrayEqual(then_output->dims, else_output->dims)) {
+        has_dynamic_output_tensors = true;
+        break;
+      }
+    }
+  }
+
+  for (int i = 0; i < num_outputs; ++i) {
+    TfLiteTensor* output = GetOutput(context, node, i);
+    if (has_dynamic_output_tensors) {
+      SetTensorToDynamic(output);
+    } else {
+      // When there's no dynamic output tensors, the 2 subgraph has exactly
+      // the same static sized outputs.
+      TfLiteTensor* then_output =
+          then_subgraph->tensor(then_subgraph->outputs()[i]);
+      TfLiteIntArray* output_size = TfLiteIntArrayCopy(then_output->dims);
+      TF_LITE_ENSURE_OK(context,
+                        context->ResizeTensor(context, output, output_size));
+    }
+  }
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
+
+  const TfLiteTensor* cond = GetInput(context, node, 0);
+  bool cond_value = cond->data.b[0];
+
+  Subgraph* this_subgraph = reinterpret_cast<Subgraph*>(context->impl_);
+  auto* subgraphs = this_subgraph->GetSubgraphs();
+
+  // Currently we copy the input / output between the subgraphs. This isn't
+  // optimized yet.
+  // TODO(b/120234921): Optimize and avoid copying tensors between subgraphs.
+  int active_branch_subgraph_index =
+      cond_value ? op_data->then_subgraph_index : op_data->else_subgraph_index;
+  Subgraph& active_branch_subgraph =
+      *(*subgraphs)[active_branch_subgraph_index];
+  for (int i = 0; i < active_branch_subgraph.inputs().size(); ++i) {
+    const TfLiteTensor* input = GetInput(context, node, i + 1);
+    TfLiteTensor* subgraph_input =
+        active_branch_subgraph.tensor(active_branch_subgraph.inputs()[i]);
+    TF_LITE_ENSURE_EQ(context, input->bytes, subgraph_input->bytes);
+    memcpy(subgraph_input->data.raw, input->data.raw, input->bytes);
+  }
+
+  // Note: It's guaranteed that the subgraphs' `AllocateTensors` are called
+  // in `Prepare`, so we don't need to do it here again.
+  TF_LITE_ENSURE_OK(context, active_branch_subgraph.Invoke());
+
+  for (int tensor_index : active_branch_subgraph.outputs()) {
+    active_branch_subgraph.EnsureTensorDataIsReadable(tensor_index);
+  }
+
+  bool has_dynamic_output_tensors = false;
+  for (int i = 0; i < node->outputs->size; ++i) {
+    TfLiteTensor* output = GetOutput(context, node, i);
+    if (IsDynamicTensor(output)) {
+      has_dynamic_output_tensors = true;
+      break;
+    }
+  }
+
+  if (has_dynamic_output_tensors) {
+    for (int i = 0; i < node->outputs->size; ++i) {
+      TfLiteTensor* output = GetOutput(context, node, i);
+      TfLiteTensor* subgraph_output =
+          active_branch_subgraph.tensor(active_branch_subgraph.outputs()[i]);
+      TfLiteIntArray* output_size = TfLiteIntArrayCopy(subgraph_output->dims);
+      TF_LITE_ENSURE_OK(context,
+                        context->ResizeTensor(context, output, output_size));
+    }
+  }
+
+  for (int i = 0; i < active_branch_subgraph.outputs().size(); ++i) {
+    const TfLiteTensor* subgraph_output =
+        active_branch_subgraph.tensor(active_branch_subgraph.outputs()[i]);
+    TfLiteTensor* output = GetOutput(context, node, i);
+    TF_LITE_ENSURE_EQ(context, output->bytes, subgraph_output->bytes);
+    memcpy(output->data.raw, subgraph_output->data.raw, output->bytes);
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace if_kernel
+
+TfLiteRegistration* Register_IF() {
+  static TfLiteRegistration r = {if_kernel::Init, if_kernel::Free,
+                                 if_kernel::Prepare, if_kernel::Eval};
+  return &r;
+}
+
+}  // namespace custom
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/if_test.cc b/tensorflow/lite/kernels/if_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0f90db131b0bc335b54f4f8c24fa5d8dd02862f4
--- /dev/null
+++ b/tensorflow/lite/kernels/if_test.cc
@@ -0,0 +1,113 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <gtest/gtest.h>
+#include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/subgraph_test_util.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+
+using subgraph_test_util::CheckIntTensor;
+using subgraph_test_util::ControlFlowOpTest;
+using subgraph_test_util::FillIntTensor;
+
+namespace {
+
+// A simple test that performs `ADD` if condition is true, and `MUL` otherwise.
+// The computation is: `cond ? a + b : a * b`.
+class SimpleIfTest : public ControlFlowOpTest {
+ protected:
+  void SetUp() override {
+    interpreter_->AddSubgraphs(2);
+    builder_->BuildAddSubgraph(interpreter_->subgraph(1));
+    builder_->BuildMulSubgraph(interpreter_->subgraph(2));
+    builder_->BuildIfSubgraph(&interpreter_->primary_subgraph());
+
+    interpreter_->ResizeInputTensor(interpreter_->inputs()[0], {1});
+    interpreter_->ResizeInputTensor(interpreter_->inputs()[1], {2});
+    interpreter_->ResizeInputTensor(interpreter_->inputs()[2], {1, 2});
+    ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk);
+
+    FillIntTensor(interpreter_->tensor(interpreter_->inputs()[1]), {5, 7});
+    FillIntTensor(interpreter_->tensor(interpreter_->inputs()[2]), {1, 2});
+  }
+};
+
+TEST_F(SimpleIfTest, TestIfTrue) {
+  interpreter_->typed_input_tensor<bool>(0)[0] = true;
+  ASSERT_EQ(interpreter_->Invoke(), kTfLiteOk);
+  TfLiteTensor* output = interpreter_->tensor(interpreter_->outputs()[0]);
+  CheckIntTensor(output, {1, 2}, {6, 9});
+}
+
+TEST_F(SimpleIfTest, TestIfFalse) {
+  interpreter_->typed_input_tensor<bool>(0)[0] = false;
+  ASSERT_EQ(interpreter_->Invoke(), kTfLiteOk);
+  TfLiteTensor* output = interpreter_->tensor(interpreter_->outputs()[0]);
+  CheckIntTensor(output, {1, 2}, {5, 14});
+}
+
+// Test IF op using subgraphs with dynamically sized outputs.
+// The computation is: `cond ? a + b : pad(a, b)`.
+class DynamicSubgraphIfTest : public ControlFlowOpTest {
+ protected:
+  void SetUp() override {
+    interpreter_->AddSubgraphs(2);
+    builder_->BuildAddSubgraph(interpreter_->subgraph(1));
+    builder_->BuildPadSubgraph(interpreter_->subgraph(2));
+    builder_->BuildIfSubgraph(&interpreter_->primary_subgraph());
+
+    interpreter_->ResizeInputTensor(interpreter_->inputs()[0], {1});
+    interpreter_->ResizeInputTensor(interpreter_->inputs()[1], {2});
+    interpreter_->ResizeInputTensor(interpreter_->inputs()[2], {1, 2});
+    ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk);
+
+    FillIntTensor(interpreter_->tensor(interpreter_->inputs()[1]), {5, 7});
+    FillIntTensor(interpreter_->tensor(interpreter_->inputs()[2]), {1, 2});
+  }
+};
+
+TEST_F(DynamicSubgraphIfTest, TestIfTrue) {
+  interpreter_->typed_input_tensor<bool>(0)[0] = true;
+  ASSERT_EQ(interpreter_->Invoke(), kTfLiteOk);
+  TfLiteTensor* output = interpreter_->tensor(interpreter_->outputs()[0]);
+  // Even if the true branch has a static type output, the output of the
+  // if op is dynamic because the other branch has dynamic output.
+  EXPECT_TRUE(IsDynamicTensor(output));
+  CheckIntTensor(output, {1, 2}, {6, 9});
+}
+
+TEST_F(DynamicSubgraphIfTest, TestIfFalse) {
+  interpreter_->typed_input_tensor<bool>(0)[0] = false;
+  ASSERT_EQ(interpreter_->Invoke(), kTfLiteOk);
+  TfLiteTensor* output = interpreter_->tensor(interpreter_->outputs()[0]);
+  // The false branch has dynamic output.
+  EXPECT_TRUE(IsDynamicTensor(output));
+  CheckIntTensor(output, {5}, {0, 5, 7, 0, 0});
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/kernels/internal/BUILD b/tensorflow/lite/kernels/internal/BUILD
index 69816583f5020843aeff76890f51c6c306f11a4f..816b88d675ca3861e2042fdaedfe8a029d7a37aa 100644
--- a/tensorflow/lite/kernels/internal/BUILD
+++ b/tensorflow/lite/kernels/internal/BUILD
@@ -1,12 +1,13 @@
+load("//tensorflow:tensorflow.bzl", "tf_cc_test", "transitive_hdrs")
+load("//tensorflow/lite:build_def.bzl", "tflite_copts")
+load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
+
 package(default_visibility = [
     "//visibility:public",
 ])
 
 licenses(["notice"])  # Apache 2.0
 
-load("//tensorflow/lite:build_def.bzl", "tflite_copts")
-load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
-
 tflite_deps_intel = [
     "@arm_neon_2_x86_sse",
 ]
@@ -45,7 +46,6 @@ cc_library(
     ],
     deps = [
         "//tensorflow/lite/kernels:op_macros",
-        "@com_google_absl//absl/base:core_headers",
     ],
 )
 
@@ -59,7 +59,6 @@ cc_library(
     ],
     deps = [
         "//tensorflow/lite/kernels:op_macros",
-        "@com_google_absl//absl/base:core_headers",
     ],
 )
 
@@ -253,9 +252,6 @@ cc_library(
 cc_test(
     name = "tensor_test",
     srcs = ["tensor_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",  # TODO(b/117786830)
-    ],
     deps = [
         ":tensor",
         "@com_google_googletest//:gtest",
@@ -285,9 +281,6 @@ cc_library(
 cc_test(
     name = "quantization_util_test",
     srcs = ["quantization_util_test.cc"],
-    tags = [
-        "tflite_not_portable_ios",  # TODO(b/117786830)
-    ],
     deps = [
         ":quantization_util",
         "@com_google_googletest//:gtest",
@@ -313,7 +306,17 @@ cc_library(
         "reference/depthwiseconv_float.h",
         "reference/depthwiseconv_uint8.h",
         "reference/fully_connected.h",
+        "reference/integer_ops/add.h",
+        "reference/integer_ops/conv.h",
+        "reference/integer_ops/depthwise_conv.h",
         "reference/integer_ops/dequantize.h",
+        "reference/integer_ops/fully_connected.h",
+        "reference/integer_ops/log_softmax.h",
+        "reference/integer_ops/logistic.h",
+        "reference/integer_ops/mul.h",
+        "reference/integer_ops/pooling.h",
+        "reference/integer_ops/softmax.h",
+        "reference/integer_ops/tanh.h",
         "reference/reference_ops.h",
         "reference/softmax.h",
     ],
@@ -410,6 +413,7 @@ cc_library(
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/kernels:activation_functor",
         "//tensorflow/lite/kernels:op_macros",
+        "//tensorflow/lite/kernels/internal:types",
     ],
 )
 
@@ -543,16 +547,20 @@ cc_library(
     name = "test_util",
     srcs = ["test_util.cc"],
     hdrs = ["test_util.h"],
+    linkopts = select({
+        "//tensorflow:windows": [],
+        "//conditions:default": ["-lm"],
+    }),
     deps = [
         ":types",
-        "//tensorflow/lite:string",
     ],
 )
 
-cc_test(
+# TODO(b/122597976): Eliminate TF dependency from lite/kernels:test_util,
+# in turn eliminating the need to use tf_cc_test for any dependent tests.
+tf_cc_test(
     name = "tensor_utils_test",
     srcs = ["tensor_utils_test.cc"],
-    copts = NEON_FLAGS_IF_APPLICABLE,
     linkopts = select({
         "//tensorflow:android": [
             "-fPIE -pie",
@@ -560,9 +568,6 @@ cc_test(
         "//conditions:default": [],
     }),
     linkstatic = 1,
-    tags = [
-        "tflite_not_portable_ios",
-    ],
     deps = [
         ":tensor_utils",
         "//tensorflow/lite/c:c_api_internal",
@@ -585,26 +590,25 @@ cc_test(
 
 cc_test(
     name = "depthwiseconv_quantized_test",
-    srcs = ["depthwiseconv_quantized_test.cc"],
-    shard_count = 2,
-    tags = [
-        "tflite_not_portable_ios",
+    srcs = [
+        "depthwiseconv_quantized_test.cc",
+        "optimized/depthwiseconv_uint8_transitional.h",
     ],
+    shard_count = 2,
     deps = [
         ":optimized_base",
         ":reference_base",
         ":test_util",
         ":types",
+        "@com_google_absl//absl/strings",
         "@com_google_googletest//:gtest_main",
+        "@gemmlowp",
     ],
 )
 
 cc_test(
     name = "resize_bilinear_test",
     srcs = ["resize_bilinear_test.cc"],
-    tags = [
-        "tflite_not_portable",
-    ],
     deps = [
         ":optimized_base",
         ":reference_base",
@@ -617,9 +621,6 @@ cc_test(
 cc_test(
     name = "resize_nearest_neighbor_test",
     srcs = ["resize_nearest_neighbor_test.cc"],
-    tags = [
-        "tflite_not_portable",
-    ],
     deps = [
         ":optimized_base",
         ":reference_base",
@@ -635,6 +636,7 @@ cc_test(
     srcs = [
         "softmax_quantized_test.cc",
     ],
+    shard_count = 3,
     deps = [
         ":optimized_base",
         ":quantization_util",
@@ -651,7 +653,10 @@ cc_test(
     srcs = [
         "logsoftmax_quantized_test.cc",
     ],
+    shard_count = 4,
     tags = [
+        # TODO(b/122242739): Reenable after fixing the flakiness?
+        "nomac",
         "tflite_not_portable",
     ],
     deps = [
@@ -667,6 +672,10 @@ cc_test(
 cc_test(
     name = "log_quantized_test",
     srcs = ["log_quantized_test.cc"],
+    linkopts = select({
+        "//tensorflow:windows": [],
+        "//conditions:default": ["-lm"],
+    }),
     deps = [
         ":optimized_base",
         ":reference_base",
@@ -702,4 +711,78 @@ cc_test(
 
 exports_files(["optimized/eigen_tensor_reduced_instantiations_oss.h"])
 
+filegroup(
+    name = "optimized_op_headers",
+    srcs = glob([
+        "optimized/*.h",
+    ]),
+    visibility = ["//tensorflow/lite:__subpackages__"],
+)
+
+filegroup(
+    name = "reference_op_headers",
+    srcs = glob([
+        "reference/*.h",
+    ]),
+    visibility = ["//tensorflow/lite:__subpackages__"],
+)
+
+filegroup(
+    name = "headers",
+    srcs = glob([
+        "*.h",
+    ]),
+    visibility = ["//tensorflow/lite:__subpackages__"],
+)
+
+transitive_hdrs(
+    name = "nnapi_external_headers",
+    visibility = ["//tensorflow/lite:__subpackages__"],
+    deps = [
+        "//third_party/eigen3",
+        "@gemmlowp",
+    ],
+)
+
+# ---------------------------------------------------------
+# The public target "install_nnapi_extra_headers" is only
+# used for external targets that requires exporting optmized
+# and reference op headers.
+
+genrule(
+    name = "install_nnapi_extra_headers",
+    srcs = [
+        ":nnapi_external_headers",
+        ":headers",
+        ":optimized_op_headers",
+        ":reference_op_headers",
+    ],
+    outs = ["include"],
+    cmd = """
+    mkdir $@
+    for f in $(SRCS); do
+      d="$${f%/*}"
+      d="$${d#bazel-out*genfiles/}"
+      d="$${d#*external/eigen_archive/}"
+
+      if [[ $${d} == *local_config_* ]]; then
+        continue
+      fi
+
+      if [[ $${d} == external* ]]; then
+        extname="$${d#*external/}"
+        extname="$${extname%%/*}"
+        if [[ $${TF_SYSTEM_LIBS:-} == *$${extname}* ]]; then
+          continue
+        fi
+      fi
+
+      mkdir -p "$@/$${d}"
+      cp "$${f}" "$@/$${d}/"
+    done
+    """,
+    tags = ["manual"],
+    visibility = ["//visibility:private"],
+)
+
 tflite_portable_test_suite()
diff --git a/tensorflow/lite/kernels/internal/common.h b/tensorflow/lite/kernels/internal/common.h
index fdb72037f84e4cea9018516ef70eb8c8fa039082..e00a3f405e071df7034da9c54770c17397a3670f 100644
--- a/tensorflow/lite/kernels/internal/common.h
+++ b/tensorflow/lite/kernels/internal/common.h
@@ -131,6 +131,238 @@ int CountLeadingZeros(T integer_input) {
 #endif
 }
 
+// TODO(b/77858996): Add these to gemmlowp.
+template <typename IntegerType>
+IntegerType SaturatingAddNonGemmlowp(IntegerType a, IntegerType b) {
+  static_assert(std::is_same<IntegerType, void>::value, "unimplemented");
+  return a;
+}
+
+template <>
+inline std::int32_t SaturatingAddNonGemmlowp(std::int32_t a, std::int32_t b) {
+  std::int64_t a64 = a;
+  std::int64_t b64 = b;
+  std::int64_t sum = a64 + b64;
+  return static_cast<std::int32_t>(std::min(
+      static_cast<std::int64_t>(std::numeric_limits<std::int32_t>::max()),
+      std::max(
+          static_cast<std::int64_t>(std::numeric_limits<std::int32_t>::min()),
+          sum)));
+}
+
+template <typename tRawType, int tIntegerBits>
+gemmlowp::FixedPoint<tRawType, tIntegerBits> SaturatingAddNonGemmlowp(
+    gemmlowp::FixedPoint<tRawType, tIntegerBits> a,
+    gemmlowp::FixedPoint<tRawType, tIntegerBits> b) {
+  return gemmlowp::FixedPoint<tRawType, tIntegerBits>::FromRaw(
+      SaturatingAddNonGemmlowp(a.raw(), b.raw()));
+}
+
+template <typename IntegerType>
+IntegerType SaturatingSub(IntegerType a, IntegerType b) {
+  static_assert(std::is_same<IntegerType, void>::value, "unimplemented");
+  return a;
+}
+
+template <>
+inline std::int16_t SaturatingSub(std::int16_t a, std::int16_t b) {
+  std::int32_t a32 = a;
+  std::int32_t b32 = b;
+  std::int32_t diff = a32 - b32;
+  return static_cast<std::int16_t>(std::min(32767, std::max(-32768, diff)));
+}
+
+template <>
+inline std::int32_t SaturatingSub(std::int32_t a, std::int32_t b) {
+  std::int64_t a64 = a;
+  std::int64_t b64 = b;
+  std::int64_t diff = a64 - b64;
+  return static_cast<std::int32_t>(std::min(
+      static_cast<std::int64_t>(std::numeric_limits<std::int32_t>::max()),
+      std::max(
+          static_cast<std::int64_t>(std::numeric_limits<std::int32_t>::min()),
+          diff)));
+}
+
+template <typename tRawType, int tIntegerBits>
+gemmlowp::FixedPoint<tRawType, tIntegerBits> SaturatingSub(
+    gemmlowp::FixedPoint<tRawType, tIntegerBits> a,
+    gemmlowp::FixedPoint<tRawType, tIntegerBits> b) {
+  return gemmlowp::FixedPoint<tRawType, tIntegerBits>::FromRaw(
+      SaturatingSub(a.raw(), b.raw()));
+}
+// End section to be moved to gemmlowp.
+
+template <typename IntegerType>
+IntegerType SaturatingRoundingMultiplyByPOTParam(IntegerType x, int exponent) {
+  if (exponent == 0) {
+    return x;
+  }
+  using ScalarIntegerType =
+      typename gemmlowp::FixedPointRawTypeTraits<IntegerType>::ScalarRawType;
+  const IntegerType min =
+      gemmlowp::Dup<IntegerType>(std::numeric_limits<ScalarIntegerType>::min());
+  const IntegerType max =
+      gemmlowp::Dup<IntegerType>(std::numeric_limits<ScalarIntegerType>::max());
+  const int ScalarIntegerTypeBits = 8 * sizeof(ScalarIntegerType);
+
+  const std::int32_t threshold =
+      ((1 << (ScalarIntegerTypeBits - 1 - exponent)) - 1);
+  const IntegerType positive_mask =
+      gemmlowp::MaskIfGreaterThan(x, gemmlowp::Dup<IntegerType>(threshold));
+  const IntegerType negative_mask =
+      gemmlowp::MaskIfLessThan(x, gemmlowp::Dup<IntegerType>(-threshold));
+
+  IntegerType result = gemmlowp::ShiftLeft(x, exponent);
+  result = gemmlowp::SelectUsingMask(positive_mask, max, result);
+  result = gemmlowp::SelectUsingMask(negative_mask, min, result);
+  return result;
+}
+
+// If we want to leave IntegerBits fixed, then multiplication
+// by a power of two has to be saturating/rounding, not exact anymore.
+template <typename tRawType, int tIntegerBits>
+gemmlowp::FixedPoint<tRawType, tIntegerBits>
+SaturatingRoundingMultiplyByPOTParam(
+    gemmlowp::FixedPoint<tRawType, tIntegerBits> a, int exponent) {
+  return gemmlowp::FixedPoint<tRawType, tIntegerBits>::FromRaw(
+      SaturatingRoundingMultiplyByPOTParam(a.raw(), exponent));
+}
+
+// Minimum output bits to accommodate log of maximum input range.  It actually
+// does not matter if one considers, say, [-64,64] or [-64,64).
+//
+// For example, run this through Octave:
+// [0:127; ...
+//  ceil(log(abs( log(2.^(0:127))+1 ))/log(2)); ...
+//  ceil(log(abs( log(2.^(0:127))+1 ))/log(2))]
+constexpr int min_log_x_output_bits(int input_bits) {
+  return input_bits > 90
+             ? 7
+             : input_bits > 44
+                   ? 6
+                   : input_bits > 21
+                         ? 5
+                         : input_bits > 10
+                               ? 4
+                               : input_bits > 4 ? 3 : input_bits > 1 ? 2 : 1;
+}
+
+// Although currently the name of this function says that it cannot handle
+// values less than 1, in practice it can handle as low as 1/x_max, where
+// x_max is the largest representable input.  In other words, the output range
+// is symmetric.
+template <int OutputIntegerBits, int InputIntegerBits>
+inline gemmlowp::FixedPoint<int32, OutputIntegerBits>
+log_x_for_x_greater_than_or_equal_to_1_impl(
+    gemmlowp::FixedPoint<int32, InputIntegerBits> input_val) {
+  // assert(__builtin_clz(0u) >= std::numeric_limits<uint32>::digits - 1);
+  // assert(__builtin_clz(0u) <= std::numeric_limits<uint32>::digits);
+  using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
+  // The reason for accumulating the result with an extra bit of headroom is
+  // that z_pow_2_adj * log_2 might be saturated, and adding num_scaled *
+  // recip_denom will otherwise introduce an error.
+  static constexpr int kAccumIntegerBits = OutputIntegerBits + 1;
+  using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumIntegerBits>;
+
+  const FixedPoint0 log_2 = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
+      FixedPoint0, 1488522236, std::log(2.0));
+  const FixedPoint0 sqrt_sqrt_half = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
+      FixedPoint0, 1805811301, std::sqrt(std::sqrt(0.5)));
+  const FixedPoint0 sqrt_half = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
+      FixedPoint0, 1518500250, std::sqrt(0.5));
+  const FixedPoint0 one_quarter =
+      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(FixedPoint0, 536870912, 1.0 / 4.0);
+
+  const FixedPoint0 alpha_n = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
+      FixedPoint0, 117049297, 11.0 / 240.0 * std::sqrt(std::sqrt(2.0)));
+  const FixedPoint0 alpha_d = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
+      FixedPoint0, 127690142, 1.0 / 20.0 * std::sqrt(std::sqrt(2.0)));
+  const FixedPoint0 alpha_i = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
+      FixedPoint0, 1057819769,
+      2.0 / std::sqrt(std::sqrt(2.0)) - std::sqrt(std::sqrt(2.0)));
+  const FixedPoint0 alpha_f = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
+      FixedPoint0, 638450708, 1.0 / 4.0 * std::sqrt(std::sqrt(2.0)));
+
+  const FixedPointAccum shifted_quarter =
+      gemmlowp::Rescale<kAccumIntegerBits>(one_quarter);
+
+  // Reinterpret the input value as Q0.31, because we will figure out the
+  // required shift "ourselves" instead of using, say, Rescale.
+  FixedPoint0 z_a = FixedPoint0::FromRaw(input_val.raw());
+  // z_a_pow_2 = input_integer_bits - z_a_headroom;
+  int z_a_headroom_plus_1 = CountLeadingZeros(static_cast<uint32>(z_a.raw()));
+  FixedPoint0 r_a_tmp =
+      SaturatingRoundingMultiplyByPOTParam(z_a, (z_a_headroom_plus_1 - 1));
+  const int32 r_a_raw =
+      SaturatingRoundingMultiplyByPOTParam((r_a_tmp * sqrt_half).raw(), 1);
+  // z_pow_2_adj = max(z_pow_2_a - 0.75, z_pow_2_b - 0.25);
+  // z_pow_2_adj = max(InputIntegerBits - z_a_headroom_plus_1 + 0.25,
+  //                   InputIntegerBits - z_b_headroom - 0.25);
+  const FixedPointAccum z_a_pow_2_adj = SaturatingAddNonGemmlowp(
+      FixedPointAccum::FromRaw(SaturatingRoundingMultiplyByPOTParam(
+          InputIntegerBits - z_a_headroom_plus_1, 31 - kAccumIntegerBits)),
+      shifted_quarter);
+
+  // z_b is treated like z_a, but premultiplying by sqrt(0.5).
+  FixedPoint0 z_b = z_a * sqrt_half;
+  int z_b_headroom = CountLeadingZeros(static_cast<uint32>(z_b.raw())) - 1;
+  const int32 r_b_raw =
+      SaturatingRoundingMultiplyByPOTParam(z_a.raw(), z_b_headroom);
+  const FixedPointAccum z_b_pow_2_adj = SaturatingSub(
+      FixedPointAccum::FromRaw(SaturatingRoundingMultiplyByPOTParam(
+          InputIntegerBits - z_b_headroom, 31 - kAccumIntegerBits)),
+      shifted_quarter);
+
+  const FixedPoint0 r = FixedPoint0::FromRaw(std::min(r_a_raw, r_b_raw));
+  const FixedPointAccum z_pow_2_adj = FixedPointAccum::FromRaw(
+      std::max(z_a_pow_2_adj.raw(), z_b_pow_2_adj.raw()));
+
+  const FixedPoint0 p = gemmlowp::RoundingHalfSum(r, sqrt_sqrt_half);
+  FixedPoint0 q = r - sqrt_sqrt_half;
+  q = q + q;
+
+  const FixedPoint0 common_sq = q * q;
+  const FixedPoint0 num = q * r + q * common_sq * alpha_n;
+  const FixedPoint0 denom_minus_one_0 =
+      p * (alpha_i + q + alpha_d * common_sq) + alpha_f * q;
+  const FixedPoint0 recip_denom =
+      one_over_one_plus_x_for_x_in_0_1(denom_minus_one_0);
+
+  const FixedPointAccum num_scaled = gemmlowp::Rescale<kAccumIntegerBits>(num);
+  return gemmlowp::Rescale<OutputIntegerBits>(z_pow_2_adj * log_2 +
+                                              num_scaled * recip_denom);
+}
+
+template <int OutputIntegerBits, int InputIntegerBits>
+inline gemmlowp::FixedPoint<int32, OutputIntegerBits>
+log_x_for_x_greater_than_or_equal_to_1(
+    gemmlowp::FixedPoint<int32, InputIntegerBits> input_val) {
+  static_assert(
+      OutputIntegerBits >= min_log_x_output_bits(InputIntegerBits),
+      "Output integer bits must be sufficent to accommodate logs of inputs.");
+  return log_x_for_x_greater_than_or_equal_to_1_impl<OutputIntegerBits,
+                                                     InputIntegerBits>(
+      input_val);
+}
+
+inline int32 GetReciprocal(int32 x, int x_integer_digits,
+                           int* num_bits_over_unit) {
+  int headroom_plus_one = CountLeadingZeros(static_cast<uint32>(x));
+  // This is the number of bits to the left of the binary point above 1.0.
+  // Consider x=1.25.  In that case shifted_scale=0.8 and
+  // no later adjustment will be needed.
+  *num_bits_over_unit = x_integer_digits - headroom_plus_one;
+  const int32 shifted_sum_minus_one =
+      static_cast<int32>((static_cast<uint32>(x) << headroom_plus_one) -
+                         (static_cast<uint32>(1) << 31));
+
+  gemmlowp::FixedPoint<int32, 0> shifted_scale =
+      gemmlowp::one_over_one_plus_x_for_x_in_0_1(
+          gemmlowp::FixedPoint<int32, 0>::FromRaw(shifted_sum_minus_one));
+  return shifted_scale.raw();
+}
+
 // DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
 // BROADCASTING.
 //
diff --git a/tensorflow/lite/kernels/internal/depthwiseconv_quantized_test.cc b/tensorflow/lite/kernels/internal/depthwiseconv_quantized_test.cc
index 3682499d494cc4e63712b6c57d80482899b2185d..5a0539064755d9cd93205d680723de3550a177b9 100644
--- a/tensorflow/lite/kernels/internal/depthwiseconv_quantized_test.cc
+++ b/tensorflow/lite/kernels/internal/depthwiseconv_quantized_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <cstdlib>
 #include <iterator>
 #include <limits>
+#include <string>
 #include <vector>
 
 #include <gtest/gtest.h>
@@ -26,33 +27,82 @@ limitations under the License.
 #include "tensorflow/lite/kernels/internal/types.h"
 
 #define ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
+#include "absl/strings/substitute.h"
 #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
 #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h"
+#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_transitional.h"
 #include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
 
 namespace tflite {
 namespace {
 
-enum class ForceKernelInvocation {
-  // Run all tests against kUseStandardEntry even if also testing another
-  // kernel, since we need to be sure that the main DepthwiseConv() function in
-  // optimized_ops.h dispatches to a correctly-executing kernel.
-  kNone = 0,  // The "default" option: use the normal DepthwiseConv
-              // kernel (entry) function.
-  kUseGenericKernel,
-  kUseNeon3x3,            // 3x3 kernel that uses NEON when available.
-  kUseNeon3x3DotProduct,  // 3x3 kernel that uses dot-product enabled NEON when
-                          // available.
+using optimized_ops::depthwise_conv::DotProduct3x3KernelType;
+using ::testing::Bool;
+using ::testing::Values;
+
+// Currently, this is used in place of a Boolean "is symmetric?".
+enum class ParamsSpecialization {
+  kNone = 0,
+  kSymmetric,  // Symmetric quantization: zero represented by 128.
+};
+
+static constexpr int kSymmetricZeroPoint = 128;
+
+// Extend coverage distribution in a specific aspect, either explicitly chosen
+// or randomly chosen as in a mixture distribution.
+enum class CoverageExtension {
+  kNone = 0,
+  kLargeHeights = 1,
+  kLargeWidths = 2,
+  kNumOptions
+};
+
+// The TestParam structure below is the preferred parameterization of tests. A
+// tuple version is defined in order to support value-parameterized tests.
+typedef std::tuple<DepthwiseConvImplementation, int, bool, bool, bool,
+                   DepthwiseConvOutputRounding, bool>
+    TestParamTuple;
+
+struct TestParam {
+  TestParam() = default;
+
+  explicit TestParam(TestParamTuple param_tuple)
+      : forced_invocation(::testing::get<0>(param_tuple)),
+        tests_to_run(::testing::get<1>(param_tuple)),
+        test_stride(::testing::get<2>(param_tuple)),
+        test_pad(::testing::get<3>(param_tuple)),
+        test_depth_multiplier(::testing::get<4>(param_tuple)),
+        output_rounding(::testing::get<5>(param_tuple)),
+        loose_tolerance(::testing::get<6>(param_tuple)) {}
+
+  static std::string TestNameSuffix(
+      const ::testing::TestParamInfo<TestParamTuple>& info) {
+    const TestParam param(info.param);
+    return absl::Substitute("invocation_$0_stride_$1_pad_$2_depth_mult_$3",
+                            static_cast<int>(param.forced_invocation),
+                            param.test_stride, param.test_pad,
+                            param.test_depth_multiplier);
+  }
+
+  DepthwiseConvImplementation forced_invocation =
+      DepthwiseConvImplementation::kNone;
+  int tests_to_run = 0;
+  bool test_stride = false;
+  bool test_pad = false;
+  bool test_depth_multiplier = false;
+  DepthwiseConvOutputRounding output_rounding =
+      DepthwiseConvOutputRounding::kNone;
+  bool loose_tolerance = false;
 };
 
 inline void DispatchDepthwiseConv(
-    ForceKernelInvocation forced_invocation, const DepthwiseParams& params,
+    const TestParam& test_param, const DepthwiseParams& params,
     const RuntimeShape& input_shape, const uint8* input_data,
     const RuntimeShape& filter_shape, const uint8* filter_data,
     const RuntimeShape& bias_shape, const int32* bias_data,
     const RuntimeShape& output_shape, uint8* output_data) {
-  switch (forced_invocation) {
-    case ForceKernelInvocation::kUseNeon3x3: {
+  switch (test_param.forced_invocation) {
+    case DepthwiseConvImplementation::kUseNeon3x3: {
 // Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on
 // Jetson TX-2. This compiler does not support the offsetof() macro.
 #if defined(__aarch64__) && !defined(GOOGLE_L4T)
@@ -67,20 +117,20 @@ inline void DispatchDepthwiseConv(
 
       // Check that parameter combination is supported.
       const bool basic_3x3_kernel_supported =
-          optimized_ops::Fast3x3FilterKernelSupported(
+          optimized_ops::depthwise_conv::Fast3x3FilterKernelSupported(
               input_shape, filter_shape, stride_width, stride_height,
               dilation_width_factor, dilation_height_factor, pad_width,
               pad_height, depth_multiplier, output_shape, output_shift);
       ASSERT_TRUE(basic_3x3_kernel_supported)
           << "pad_width = " << params.padding_values.width
           << " pad_height = " << params.padding_values.height
-          << " input_width = " << input_shape.Dims(1)
-          << " input_height = " << input_shape.Dims(2)
-          << " output_width = " << output_shape.Dims(1)
-          << " output_height = " << output_shape.Dims(2);
+          << " input_width = " << input_shape.Dims(2)
+          << " input_height = " << input_shape.Dims(1)
+          << " output_width = " << output_shape.Dims(2)
+          << " output_height = " << output_shape.Dims(1);
 
       // Call kernel optimized for depthwise convolutions using 3x3 filters.
-      optimized_ops::DepthwiseConv3x3Filter(
+      optimized_ops::depthwise_conv::DepthwiseConv3x3Filter(
           params, input_shape, input_data, filter_shape, filter_data,
           bias_shape, bias_data, output_shape, output_data);
       return;
@@ -88,56 +138,42 @@ inline void DispatchDepthwiseConv(
       break;
 #endif
     }
-    case ForceKernelInvocation::kUseNeon3x3DotProduct: {
-// Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on
-// Jetson TX-2. This compiler does not support the offsetof() macro.
-#if defined(__ARM_FEATURE_DOTPROD) && defined(__aarch64__) && \
-    !defined(GOOGLE_L4T)
-      using optimized_ops::DotProduct3x3KernelType;
-      DotProduct3x3KernelType kernel_type =
-          optimized_ops::CategorizeDotProductKernel(params);
-      switch (kernel_type) {
-        case DotProduct3x3KernelType::kPlain:
-          // TODO(b/118430534): Implement optimized kernel.
-          optimized_ops::DepthwiseConv3x3Filter(
-              params, input_shape, input_data, filter_shape, filter_data,
-              bias_shape, bias_data, output_shape, output_data);
-          return;
-        case DotProduct3x3KernelType::kWithDepthMultiplication:
-          // TODO(b/118430338): Implement optimized kernel.
-          optimized_ops::DepthwiseConvGeneral(
-              params, input_shape, input_data, filter_shape, filter_data,
-              bias_shape, bias_data, output_shape, output_data);
-          return;
-        case DotProduct3x3KernelType::kWithPad0Stride2:
-          // TODO(b/118430338): Implement optimized kernel.
-          optimized_ops::DepthwiseConv3x3Filter(
-              params, input_shape, input_data, filter_shape, filter_data,
-              bias_shape, bias_data, output_shape, output_data);
-          return;
-        case DotProduct3x3KernelType::kWithPad1Stride1:
-          // TODO(b/118430338): Implement optimized kernel.
-          optimized_ops::DepthwiseConvGeneral(
-              params, input_shape, input_data, filter_shape, filter_data,
-              bias_shape, bias_data, output_shape, output_data);
-          return;
-        case DotProduct3x3KernelType::kNone:
-        default:
-          break;
-      }
-#endif
+    case DepthwiseConvImplementation::kUseNeon3x3DotProduct:
+    case DepthwiseConvImplementation::kUseUnwound3x3DotProduct:
+    case DepthwiseConvImplementation::kUseIntrinsics3x3DotProduct:
+      // TODO(b/118426582) Placeholder for future dispatches.
       break;
+    case DepthwiseConvImplementation::kUseCModel3x3DotProduct: {
+      DotProduct3x3KernelType kernel_type =
+          optimized_ops::depthwise_conv::CategorizeDotProductKernel(params);
+
+      ASSERT_TRUE(
+          kernel_type == DotProduct3x3KernelType::kPlain ||
+          kernel_type == DotProduct3x3KernelType::kStride2 ||
+          kernel_type ==
+              DotProduct3x3KernelType::kWithDepthMultiplicationStride1 ||
+          kernel_type ==
+              DotProduct3x3KernelType::kWithDepthMultiplicationStride2)
+          << "Kernel type = " << static_cast<int>(kernel_type);
+
+      optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3<
+          DepthwiseConvImplementation::kUseCModel3x3DotProduct>(
+          params, input_shape, input_data, filter_shape, filter_data,
+          bias_shape, bias_data, output_shape, output_data);
+      return;
     }
-    case ForceKernelInvocation::kUseGenericKernel: {
-      optimized_ops::DepthwiseConvGeneral(params, input_shape, input_data,
-                                          filter_shape, filter_data, bias_shape,
-                                          bias_data, output_shape, output_data);
+    case DepthwiseConvImplementation::kUseGenericKernel: {
+      optimized_ops::depthwise_conv::DepthwiseConvGeneral(
+          params, input_shape, input_data, filter_shape, filter_data,
+          bias_shape, bias_data, output_shape, output_data);
       return;
     }
-    case ForceKernelInvocation::kNone:
+    case DepthwiseConvImplementation::kNone:
     default:
       break;
   }
+  EXPECT_EQ(test_param.forced_invocation, DepthwiseConvImplementation::kNone)
+      << "TODO(b/118426582) requested kernel was not invoked / available yet";
   optimized_ops::DepthwiseConv(params, input_shape, input_data, filter_shape,
                                filter_data, bias_shape, bias_data, output_shape,
                                output_data);
@@ -145,7 +181,7 @@ inline void DispatchDepthwiseConv(
 
 // Runs the DepthwiseConv and compares against the reference implementation.
 int TestOneDepthwiseConvWithGivenOutputShift(
-    ForceKernelInvocation forced_invocation, const std::uint8_t* input_data,
+    const TestParam& test_param, const std::uint8_t* input_data,
     const RuntimeShape& input_shape, std::int32_t input_offset,
     const std::uint8_t* filter_data, const RuntimeShape& filter_shape,
     std::int32_t filter_offset, const std::int32_t* bias_data,
@@ -174,10 +210,31 @@ int TestOneDepthwiseConvWithGivenOutputShift(
   op_params.output_offset = output_offset;
   op_params.output_multiplier = output_multiplier;
   op_params.output_shift = -output_shift;
-  reference_ops::DepthwiseConv(op_params, input_shape, input_data, filter_shape,
-                               filter_data, bias_shape, bias_data, output_shape,
-                               reference_output_data.data());
-  DispatchDepthwiseConv(forced_invocation, op_params, input_shape, input_data,
+  switch (test_param.output_rounding) {
+    case DepthwiseConvOutputRounding::kUpward:
+      reference_ops::depthwise_conv::DepthwiseConvBasicKernel<
+          DepthwiseConvOutputRounding::kAwayFromZero>::Run(op_params,
+                                                           input_shape,
+                                                           input_data,
+                                                           filter_shape,
+                                                           filter_data,
+                                                           bias_shape,
+                                                           bias_data,
+                                                           output_shape,
+                                                           reference_output_data
+                                                               .data());
+      break;
+    case DepthwiseConvOutputRounding::kAwayFromZero:
+      reference_ops::DepthwiseConv(
+          op_params, input_shape, input_data, filter_shape, filter_data,
+          bias_shape, bias_data, output_shape, reference_output_data.data());
+      break;
+    case DepthwiseConvOutputRounding::kNone:
+    default:
+      EXPECT_NE(test_param.output_rounding, DepthwiseConvOutputRounding::kNone);
+      break;
+  }
+  DispatchDepthwiseConv(test_param, op_params, input_shape, input_data,
                         filter_shape, filter_data, bias_shape, bias_data,
                         output_shape, output_data.data());
   int saturated_min = 0;
@@ -201,15 +258,26 @@ int TestOneDepthwiseConvWithGivenOutputShift(
   const float mean_diff = static_cast<float>(sum_diff) / output_buffer_size;
   const float mean_abs_diff =
       static_cast<float>(sum_abs_diff) / output_buffer_size;
+
+  constexpr int diff_mean_tolerance = 1;
+  constexpr int diff_median_tolerance = 0;
+  // The tolerance that we apply to means is tight, but we allow for a rounding
+  // difference in one pixel, and loosen by another 1% for float comparison.
+  const float mean_tolerance =
+      std::max(1e-5f, 1.01f * 2.f / output_buffer_size *
+                          std::sqrt(1.f * depth_multiplier));
+
   // Normally we should require bit-for-bit exact results. Unfortunately a bug
   // in the Intel arm_neon_sse.h translation header that we use for x86 tests
-  // causes 1-bit inaccuracy in
-  // the vqrdmulh_n_s32 intrinsic, which causes off-by-1 errors in quantized
-  // DepthwiseConv ops. So we have to live with a few off-by-one errors for now,
-  // yet still ensure that no more than a small minority of values are wrong.
-  EXPECT_TRUE(std::abs(mean_diff) < 1e-5f && mean_abs_diff < 1e-5f &&
-              std::abs(median_diff) == 0 && std::abs(min_diff) <= 1 &&
-              std::abs(max_diff) <= 1);
+  // causes 1-bit inaccuracy in the vqrdmulh_n_s32 intrinsic, which causes
+  // off-by-1 errors in quantized DepthwiseConv ops. So we have to live with a
+  // few off-by-one errors for now, yet still ensure that no more than a small
+  // minority of values are wrong.
+  EXPECT_LT(std::abs(mean_diff), mean_tolerance);
+  EXPECT_LT(mean_abs_diff, mean_tolerance);
+  EXPECT_LE(std::abs(median_diff), diff_median_tolerance);
+  EXPECT_LE(std::abs(min_diff), diff_mean_tolerance);
+  EXPECT_LE(std::abs(max_diff), diff_mean_tolerance);
   if (saturated_min > 2 * saturated_max) {
     return -1;
   }
@@ -221,13 +289,12 @@ int TestOneDepthwiseConvWithGivenOutputShift(
 
 // The point of this function is that we can't practically know which
 // output_shift value to pass to test DepthwiseConv. It's not easy to guess (we
-// could do some
-// statistics for large size, but they would be fragile at smaller sizes), and
-// guessing wrong would mean that all the values get saturated so the test
-// becomes
-// vacuous. So we just bisect our way to reasonable output_shift values.
+// could do some statistics for large size, but they would be fragile at smaller
+// sizes), and guessing wrong would mean that all the values get saturated so
+// the test becomes vacuous. So we just bisect our way to reasonable
+// output_shift values.
 void TestOneDepthwiseConvBisectOutputShift(
-    ForceKernelInvocation forced_invocation, const std::uint8_t* input_data,
+    const TestParam& test_param, const std::uint8_t* input_data,
     const RuntimeShape& input_shape, std::int32_t input_offset,
     const std::uint8_t* filter_data, const RuntimeShape& filter_shape,
     std::int32_t filter_offset, const std::int32_t* bias_data,
@@ -242,7 +309,7 @@ void TestOneDepthwiseConvBisectOutputShift(
   int output_shift_bisect_midpoint =
       (output_activation_bisect_start + output_activation_bisect_end) / 2;
   int bisect_result = TestOneDepthwiseConvWithGivenOutputShift(
-      forced_invocation, input_data, input_shape, input_offset, filter_data,
+      test_param, input_data, input_shape, input_offset, filter_data,
       filter_shape, filter_offset, bias_data, bias_shape, stride, padding_type,
       pad_width, pad_height, depth_multiplier, output_offset, output_multiplier,
       output_shift_bisect_midpoint, output_activation_min,
@@ -269,7 +336,7 @@ void TestOneDepthwiseConvBisectOutputShift(
                                              ? output_activation_bisect_end
                                              : output_shift_bisect_midpoint;
   TestOneDepthwiseConvBisectOutputShift(
-      forced_invocation, input_data, input_shape, input_offset, filter_data,
+      test_param, input_data, input_shape, input_offset, filter_data,
       filter_shape, filter_offset, bias_data, bias_shape, stride, padding_type,
       pad_width, pad_height, depth_multiplier, output_offset, output_multiplier,
       new_output_activation_bisect_start, new_output_activation_bisect_end,
@@ -277,7 +344,7 @@ void TestOneDepthwiseConvBisectOutputShift(
 }
 
 void TestOneDepthwiseConv(
-    ForceKernelInvocation forced_invocation, const std::uint8_t* input_data,
+    const TestParam& test_param, const std::uint8_t* input_data,
     const RuntimeShape& input_shape, std::int32_t input_offset,
     const std::uint8_t* filter_data, const RuntimeShape& filter_shape,
     std::int32_t filter_offset, const std::int32_t* bias_data,
@@ -287,13 +354,14 @@ void TestOneDepthwiseConv(
     std::int32_t output_activation_min, std::int32_t output_activation_max,
     const RuntimeShape& output_shape) {
   TestOneDepthwiseConvBisectOutputShift(
-      forced_invocation, input_data, input_shape, input_offset, filter_data,
+      test_param, input_data, input_shape, input_offset, filter_data,
       filter_shape, filter_offset, bias_data, bias_shape, stride, padding_type,
       pad_width, pad_height, depth_multiplier, output_offset, output_multiplier,
       0, 32, output_activation_min, output_activation_max, output_shape);
 }
 
-bool TryTestDepthwiseConv(ForceKernelInvocation forced_invocation, int batch,
+bool TryTestDepthwiseConv(const TestParam& test_param,
+                          ParamsSpecialization params_specialization, int batch,
                           int input_depth, int input_width, int input_height,
                           int filter_width, int filter_height,
                           int depth_multiplier, int stride,
@@ -318,9 +386,12 @@ bool TryTestDepthwiseConv(ForceKernelInvocation forced_invocation, int batch,
   }
   const std::int32_t output_multiplier =
       UniformRandomInt(1 << 29, std::numeric_limits<std::int32_t>::max());
-  const std::int32_t input_offset = UniformRandomInt(-256, 0);
-  const std::int32_t filter_offset = UniformRandomInt(-256, 0);
-  const std::int32_t output_offset = UniformRandomInt(-256, 0);
+  std::int32_t filter_offset = -kSymmetricZeroPoint;
+  if (params_specialization != ParamsSpecialization::kSymmetric) {
+    filter_offset = UniformRandomInt(-255, 0);
+  }
+  const std::int32_t input_offset = UniformRandomInt(-255, 0);
+  const std::int32_t output_offset = UniformRandomInt(0, 255);
   RuntimeShape input_shape_inference(
       {batch, input_height, input_width, input_depth});
   RuntimeShape output_shape_inference;
@@ -343,7 +414,7 @@ bool TryTestDepthwiseConv(ForceKernelInvocation forced_invocation, int batch,
   FillRandom(&filter_data);
   FillRandom(&bias_data, -10000, 10000);
   TestOneDepthwiseConv(
-      forced_invocation, input_data.data(), input_shape_inference, input_offset,
+      test_param, input_data.data(), input_shape_inference, input_offset,
       filter_data.data(), filter_shape_inference, filter_offset,
       bias_data.data(), bias_shape_inference, stride, padding_type, pad_width,
       pad_height, depth_multiplier, output_offset, output_multiplier,
@@ -355,7 +426,8 @@ bool TryTestDepthwiseConv(ForceKernelInvocation forced_invocation, int batch,
 // be legal. If they're not legal, it returns false. If they're legal,
 // it runs the DepthwiseConv test and returns true. This allows the caller
 // to loop until a test has been run.
-bool TryTestOneDepthwiseConv(ForceKernelInvocation forced_invocation) {
+bool TryTestOneDepthwiseConv(const TestParam& test_param,
+                             ParamsSpecialization params_specialization) {
   // We have to pick a lot of positive values, where we are particularly
   // interested in small values because they are most likely to be special
   // cases in optimized implementations, and secondarily because they allow
@@ -375,13 +447,14 @@ bool TryTestOneDepthwiseConv(ForceKernelInvocation forced_invocation) {
       UniformRandomInt(0, 1) ? PaddingType::kSame : PaddingType::kValid;
 
   return TryTestDepthwiseConv(
-      forced_invocation, batch, input_depth, input_width, input_height,
-      filter_width, filter_height, depth_multiplier, stride,
+      test_param, params_specialization, batch, input_depth, input_width,
+      input_height, filter_width, filter_height, depth_multiplier, stride,
       dilation_width_factor, dilation_height_factor, padding_type);
 }
 
 // Tests parameters for the 3x3 filter kernel.
-bool TryTestOneDepthwiseConv3x3Filter(ForceKernelInvocation forced_invocation) {
+bool TryTestOneDepthwiseConv3x3Filter(
+    const TestParam& test_param, ParamsSpecialization params_specialization) {
   const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20);
   const int input_depth = 8 * ExponentialRandomPositiveInt(0.9f, 10, 50);
   int input_width = ExponentialRandomPositiveInt(0.9f, 20, 200);
@@ -397,7 +470,7 @@ bool TryTestOneDepthwiseConv3x3Filter(ForceKernelInvocation forced_invocation) {
       UniformRandomInt(0, 1) ? PaddingType::kSame : PaddingType::kValid;
 
   // Adjust for, or reject, special cases.
-  if (forced_invocation != ForceKernelInvocation::kNone) {
+  if (test_param.forced_invocation != DepthwiseConvImplementation::kNone) {
     // With stride == 2 and SAME, padding width and height are the left and top
     // padding amounts. When there is an even input dimension, padding + 1 is
     // required on the right / bottom. This is not handled by these kernels, so
@@ -416,59 +489,77 @@ bool TryTestOneDepthwiseConv3x3Filter(ForceKernelInvocation forced_invocation) {
   }
 
   return TryTestDepthwiseConv(
-      forced_invocation, batch, input_depth, input_width, input_height,
-      filter_width, filter_height, depth_multiplier, stride,
+      test_param, params_specialization, batch, input_depth, input_width,
+      input_height, filter_width, filter_height, depth_multiplier, stride,
       dilation_width_factor, dilation_height_factor, padding_type);
 }
 
 // Tests with parameters suited to dot-product-NEON 3x3 filter kernels.
-bool TryTestOneNeonDot3x3(ForceKernelInvocation forced_invocation,
-                          bool test_stride, bool test_pad,
-                          bool test_depth_multiplier) {
+bool TryTestOneNeonDot3x3(const TestParam& test_param,
+                          ParamsSpecialization params_specialization) {
+  const CoverageExtension coverage_extension = static_cast<CoverageExtension>(
+      UniformRandomInt(0, static_cast<int>(CoverageExtension::kNumOptions)));
+
   const int batch = 1;
-  const int input_depth = test_depth_multiplier
+  const int input_depth = test_param.test_depth_multiplier
                               ? 1
-                              : 8 * ExponentialRandomPositiveInt(0.9f, 10, 50);
-  const int input_width = ExponentialRandomPositiveInt(0.9f, 20, 200);
-  const int input_height = ExponentialRandomPositiveInt(0.9f, 20, 200);
+                              : 8 * ExponentialRandomPositiveInt(0.9f, 3, 50);
+  const int input_width = coverage_extension == CoverageExtension::kLargeWidths
+                              ? ExponentialRandomPositiveInt(0.9f, 50, 200)
+                              : ExponentialRandomPositiveInt(0.9f, 20, 60);
+  const int input_height =
+      coverage_extension == CoverageExtension::kLargeHeights
+          ? ExponentialRandomPositiveInt(0.9f, 50, 200)
+          : ExponentialRandomPositiveInt(0.9f, 20, 60);
   const int filter_width = 3;
   const int filter_height = 3;
   const int depth_multiplier =
-      test_depth_multiplier ? 8 * ExponentialRandomPositiveInt(0.8f, 1, 6) : 1;
-  const int stride = test_stride ? 2 : 1;
+      test_param.test_depth_multiplier
+          ? 8 * ExponentialRandomPositiveInt(0.2f, 1, 9)
+          : 1;
+  const int stride = test_param.test_stride ? 2 : 1;
   // We don't support dilations in the 3x3 filter.
   const int dilation_width_factor = 1;
   const int dilation_height_factor = 1;
-  const auto padding_type = test_pad ? PaddingType::kSame : PaddingType::kValid;
+  const auto padding_type =
+      test_param.test_pad ? PaddingType::kSame : PaddingType::kValid;
 
   return TryTestDepthwiseConv(
-      forced_invocation, batch, input_depth, input_width, input_height,
-      filter_width, filter_height, depth_multiplier, stride,
+      test_param, params_specialization, batch, input_depth, input_width,
+      input_height, filter_width, filter_height, depth_multiplier, stride,
       dilation_width_factor, dilation_height_factor, padding_type);
 }
 
-void TestOneDepthwiseConv(ForceKernelInvocation forced_invocation) {
-  while (!TryTestOneDepthwiseConv(forced_invocation)) {
+void TestOneDepthwiseConv(DepthwiseConvImplementation forced_invocation,
+                          DepthwiseConvOutputRounding output_rounding) {
+  TestParam test_param;
+  test_param.forced_invocation = forced_invocation;
+  test_param.output_rounding = output_rounding;
+  while (!TryTestOneDepthwiseConv(test_param, ParamsSpecialization::kNone)) {
   }
 }
 
-void TestOneDepthwiseConv3x3Filter(ForceKernelInvocation forced_invocation) {
-  while (!TryTestOneDepthwiseConv3x3Filter(forced_invocation)) {
+void TestOneDepthwiseConv3x3Filter(
+    DepthwiseConvImplementation forced_invocation,
+    DepthwiseConvOutputRounding output_rounding) {
+  TestParam test_param;
+  test_param.forced_invocation = forced_invocation;
+  test_param.output_rounding = output_rounding;
+  while (!TryTestOneDepthwiseConv3x3Filter(test_param,
+                                           ParamsSpecialization::kNone)) {
   }
 }
 
-void TestOneNeonDot3x3(ForceKernelInvocation forced_invocation,
-                       bool test_stride, bool test_pad,
-                       bool test_depth_multiplier) {
-  while (!TryTestOneNeonDot3x3(forced_invocation, test_stride, test_pad,
-                               test_depth_multiplier)) {
+void TestOneNeonDot3x3(const TestParam& test_param) {
+  while (!TryTestOneNeonDot3x3(test_param, ParamsSpecialization::kSymmetric)) {
   }
 }
 
 TEST(TestDepthwiseConv, TestDepthwiseConv) {
   const int kTestsToRun = 10 * 1000;
   for (int i = 0; i < kTestsToRun; i++) {
-    TestOneDepthwiseConv(ForceKernelInvocation::kNone);
+    TestOneDepthwiseConv(DepthwiseConvImplementation::kNone,
+                         DepthwiseConvOutputRounding::kAwayFromZero);
   }
 }
 
@@ -476,69 +567,93 @@ TEST(TestDepthwiseConv, TestDepthwiseConv) {
 TEST(TestDepthwiseConv, TestGenericKernel) {
   const int kTestsToRun = 10 * 1000;
   for (int i = 0; i < kTestsToRun; i++) {
-    TestOneDepthwiseConv(ForceKernelInvocation::kUseGenericKernel);
+    TestOneDepthwiseConv(DepthwiseConvImplementation::kUseGenericKernel,
+                         DepthwiseConvOutputRounding::kAwayFromZero);
   }
 }
 
 TEST(TestDepthwiseConv, TestKernel3x3Filter) {
   const int kTestsToRun = 1000;
   for (int i = 0; i < kTestsToRun; i++) {
-    TestOneDepthwiseConv3x3Filter(ForceKernelInvocation::kNone);
+    TestOneDepthwiseConv3x3Filter(DepthwiseConvImplementation::kNone,
+                                  DepthwiseConvOutputRounding::kAwayFromZero);
   }
 }
 
-// While the 3x3 coverage test is primarily targeted at specialized kernels, we
-// also run it against the generic kernel, optionally with fewer invocations.
+// While 3x3 coverage tests are primarily targeted at specialized kernels, we
+// also run it against the generic kernel.
 TEST(TestDepthwiseConv, TestGenericKernel3x3Filter) {
-  const int kTestsToRun = 1000;
+  const int kTestsToRun = 100;
   for (int i = 0; i < kTestsToRun; i++) {
-    TestOneDepthwiseConv3x3Filter(ForceKernelInvocation::kUseGenericKernel);
+    TestOneDepthwiseConv3x3Filter(
+        DepthwiseConvImplementation::kUseGenericKernel,
+        DepthwiseConvOutputRounding::kAwayFromZero);
   }
 }
 
+#if defined(__aarch64__) && !defined(GOOGLE_L4T)
 TEST(TestDepthwiseConv, TestNeon3x3Filter) {
   const int kTestsToRun = 3 * 1000;
   for (int i = 0; i < kTestsToRun; i++) {
-    TestOneDepthwiseConv3x3Filter(ForceKernelInvocation::kUseNeon3x3);
+    TestOneDepthwiseConv3x3Filter(DepthwiseConvImplementation::kUseNeon3x3,
+                                  DepthwiseConvOutputRounding::kAwayFromZero);
   }
 }
+#endif
 
-// No stride, no depth multiplier, no pad.
-TEST(TestDepthwiseConv, TestNeonDot3x3Plain) {
-  const int kTestsToRun = 3 * 1000;
-  for (int i = 0; i < kTestsToRun; i++) {
-    TestOneNeonDot3x3(ForceKernelInvocation::kUseNeon3x3DotProduct,
-                      /*test_stride=*/false, /*test_pad=*/false,
-                      /*test_depth_multiplier=*/false);
-  }
-}
+class DepthwiseConvTest : public ::testing::TestWithParam<TestParamTuple> {};
 
-TEST(TestDepthwiseConv, TestNeonDot3x3DepthMultiplier) {
-  const int kTestsToRun = 3 * 1000;
-  for (int i = 0; i < kTestsToRun; i++) {
-    TestOneNeonDot3x3(ForceKernelInvocation::kUseNeon3x3DotProduct,
-                      /*test_stride=*/false, /*test_pad=*/false,
-                      /*test_depth_multiplier=*/true);
+TEST_P(DepthwiseConvTest, NeonDot3x3) {
+  const TestParam param(GetParam());
+  for (int i = 0; i < param.tests_to_run; i++) {
+    TestOneNeonDot3x3(param);
   }
 }
 
-TEST(TestDepthwiseConv, TestNeonDot3x3Stride2) {
-  const int kTestsToRun = 3 * 1000;
-  for (int i = 0; i < kTestsToRun; i++) {
-    TestOneNeonDot3x3(ForceKernelInvocation::kUseNeon3x3DotProduct,
-                      /*test_stride=*/true, /*test_pad=*/false,
-                      /*test_depth_multiplier=*/false);
-  }
-}
+#if defined(__aarch64__) && !defined(GOOGLE_L4T)
+INSTANTIATE_TEST_SUITE_P(
+    Neon3x3Kernel, DepthwiseConvTest,
+    testing::Combine(
+        Values(DepthwiseConvImplementation::kUseNeon3x3),  // forced_invocation
+        Values(1000),                                      // tests_to_run
+        Bool(),                                            // test_stride
+        Values(false),                                     // test_pad
+        Values(false),  // test_depth_multiplier
+        Values(DepthwiseConvOutputRounding::kAwayFromZero),  // output_rounding
+        Values(false)                                        // loose_tolerance
+        ),
+    TestParam::TestNameSuffix);
+#endif
 
-TEST(TestDepthwiseConv, TestNeonDot3x3Pad1) {
-  const int kTestsToRun = 3 * 1000;
-  for (int i = 0; i < kTestsToRun; i++) {
-    TestOneNeonDot3x3(ForceKernelInvocation::kUseNeon3x3DotProduct,
-                      /*test_stride=*/false, /*test_pad=*/true,
-                      /*test_depth_multiplier=*/false);
-  }
-}
+// While 3x3 coverage tests are primarily targeted at specialized kernels, we
+// also run it against the generic kernel.
+INSTANTIATE_TEST_SUITE_P(
+    GenericKernel, DepthwiseConvTest,
+    testing::Combine(
+        Values(DepthwiseConvImplementation::
+                   kUseGenericKernel),                 // forced_invocation
+        Values(100),                                   // tests_to_run
+        Bool(),                                        // test_stride
+        Bool(),                                        // test_pad
+        Bool(),                                        // test_depth_multiplier
+        Values(DepthwiseConvOutputRounding::kUpward),  // output_rounding
+        Values(false)                                  // loose_tolerance
+        ),
+    TestParam::TestNameSuffix);
+
+INSTANTIATE_TEST_SUITE_P(
+    CModel, DepthwiseConvTest,
+    testing::Combine(
+        Values(DepthwiseConvImplementation::
+                   kUseCModel3x3DotProduct),           // forced_invocation
+        Values(1000),                                  // tests_to_run
+        Bool(),                                        // test_stride
+        Bool(),                                        // test_pad
+        Bool(),                                        // test_depth_multiplier
+        Values(DepthwiseConvOutputRounding::kUpward),  // output_rounding
+        Values(false)                                  // loose_tolerance
+        ),
+    TestParam::TestNameSuffix);
 
 }  // namespace
 }  // namespace tflite
diff --git a/tensorflow/lite/kernels/internal/log_quantized_test.cc b/tensorflow/lite/kernels/internal/log_quantized_test.cc
index 8c39350ab1dd8996799e6539755f040399974106..c31c8e307751bcf1030e121eec23ac6cb217f461 100644
--- a/tensorflow/lite/kernels/internal/log_quantized_test.cc
+++ b/tensorflow/lite/kernels/internal/log_quantized_test.cc
@@ -121,8 +121,7 @@ void RunSingleTest(const std::vector<int32>& test_input,
                    const string& check_label, int tolerance) {
   const int n = test_input.size();
   std::vector<int32> float_gen_output(n, 0);
-  std::vector<int32> reference_output(n, 0);
-  std::vector<int32> optimized_output(n, 0);
+  std::vector<int32> quantized_output(n, 0);
 
   // Workaround the stupid things that intelligent humans do.
   // Consequence of __builtin_clz(0u) may equal 31 instead of 32.
@@ -132,45 +131,21 @@ void RunSingleTest(const std::vector<int32>& test_input,
   }
 
   for (int i = 0; i < n; ++i) {
-    reference_output[i] =
-        tflite::reference_ops::log_x_for_x_greater_than_or_equal_to_1_impl<
-            OutputIntegerBits, InputIntegerBits>(
-            gemmlowp::FixedPoint<int32, InputIntegerBits>::FromRaw(
-                fudged_input[i]))
-            .raw();
-    optimized_output[i] =
-        tflite::optimized_ops::log_x_for_x_greater_than_or_equal_to_1_impl<
-            OutputIntegerBits, InputIntegerBits>(
+    quantized_output[i] =
+        tflite::log_x_for_x_greater_than_or_equal_to_1_impl<OutputIntegerBits,
+                                                            InputIntegerBits>(
             gemmlowp::FixedPoint<int32, InputIntegerBits>::FromRaw(
                 fudged_input[i]))
             .raw();
     float_gen_output[i] = LogPositiveValuesViaFloat(
         fudged_input[i], InputIntegerBits, OutputIntegerBits);
   }
-  // Note that first check is intolerant.
-  {
-    std::ostringstream label;
-    label << check_label << " / optimized vs reference / InputIntegerBits="
-          << InputIntegerBits << ", OutputIntegerBits=" << OutputIntegerBits;
-    CheckOutputData(
-        optimized_output, reference_output, test_input, label.str(),
-        InputIntegerBits, OutputIntegerBits, 0);
-  }
   {
     std::ostringstream label;
     label << check_label << " / reference vs float-gen / InputIntegerBits="
           << InputIntegerBits << ", OutputIntegerBits=" << OutputIntegerBits;
-    CheckOutputData(
-        reference_output, float_gen_output, test_input, label.str(),
-        InputIntegerBits, OutputIntegerBits, tolerance);
-  }
-  {
-    std::ostringstream label;
-    label << check_label << " optimized vs float-gen / InputIntegerBits="
-          << InputIntegerBits << ", OutputIntegerBits=" << OutputIntegerBits;
-    CheckOutputData(
-        optimized_output, float_gen_output, test_input, label.str(),
-        InputIntegerBits, OutputIntegerBits, tolerance);
+    CheckOutputData(quantized_output, float_gen_output, test_input, label.str(),
+                    InputIntegerBits, OutputIntegerBits, tolerance);
   }
 }
 
diff --git a/tensorflow/lite/kernels/internal/logsoftmax_quantized_test.cc b/tensorflow/lite/kernels/internal/logsoftmax_quantized_test.cc
index 889a726f3a915fb592511d34c036b9726542fee9..d0d2654d4123e5025d000a796907f675ca29b05c 100644
--- a/tensorflow/lite/kernels/internal/logsoftmax_quantized_test.cc
+++ b/tensorflow/lite/kernels/internal/logsoftmax_quantized_test.cc
@@ -1,4 +1,4 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -25,6 +25,8 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/dequantize.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/log_softmax.h"
 #include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
 #include "tensorflow/lite/kernels/internal/test_util.h"
 #include "tensorflow/lite/string.h"
@@ -61,7 +63,42 @@ void RunLogSoftmaxFloatReference(const uint8* input_data,
   }
 }
 
-void CheckOutputData(const uint8* test_output, const uint8* reference_output,
+// Same as above except for the following change:
+// - input and output data type
+// - Dequnatize function
+// - clamping values
+void RunLogSoftmaxFloatReference(const int8* input_data,
+                                 const RuntimeShape& shape_common,
+                                 int32 input_offset, const double input_scale,
+                                 int stride, float beta,
+                                 int8* reference_output_data) {
+  const int ref_buffer_size = shape_common.FlatSize();
+  std::vector<float> reference_dequant_data(ref_buffer_size);
+  std::vector<float> reference_output_float_data(ref_buffer_size);
+
+  // Reference data generated via Dequant of input into float, and then applying
+  // float LogSoftmax.
+  DequantizationParams dq_params;
+  dq_params.zero_point = input_offset;
+  dq_params.scale = input_scale;
+  reference_integer_ops::Dequantize(dq_params, shape_common, input_data,
+                                    shape_common,
+                                    reference_dequant_data.data());
+  SoftmaxParams sm_params;
+  optimized_ops::LogSoftmax(sm_params, shape_common,
+                            reference_dequant_data.data(), shape_common,
+                            reference_output_float_data.data());
+  // Work with quantized scaling for LogSoftmax, under which 255 represents 0,
+  // and -16 gets nudged up to 0.
+  for (int i = 0; i < ref_buffer_size; i++) {
+    reference_output_data[i] = std::max(
+        -128, static_cast<int>(
+                  127 + std::round(16.0f * reference_output_float_data[i])));
+  }
+}
+
+template <typename T>
+void CheckOutputData(const T* test_output, const T* reference_output,
                      const RuntimeShape& shape_common,
                      const string& check_label, bool be_exacting) {
   const int buffer_size = shape_common.FlatSize();
@@ -144,15 +181,58 @@ void RunOneLogSoftmaxTest(const uint8* input_data,
   reference_ops::LogSoftmax(params, shape_common, input_data, shape_common,
                             reference_quant_logsoftmax_output.data());
 
-  CheckOutputData(optimized_logsoftmax_output.data(),
-                  reference_float_logsoftmax_output.data(), shape_common,
-                  "Optimized vs float reference", false);
-  CheckOutputData(optimized_logsoftmax_output.data(),
-                  reference_quant_logsoftmax_output.data(), shape_common,
-                  "Optimized vs quant reference", true);
-  CheckOutputData(reference_quant_logsoftmax_output.data(),
-                  reference_float_logsoftmax_output.data(), shape_common,
-                  "Quant reference vs float reference", false);
+  CheckOutputData<uint8_t>(optimized_logsoftmax_output.data(),
+                           reference_float_logsoftmax_output.data(),
+                           shape_common, "Optimized vs float reference", false);
+  CheckOutputData<uint8_t>(optimized_logsoftmax_output.data(),
+                           reference_quant_logsoftmax_output.data(),
+                           shape_common, "Optimized vs quant reference", true);
+  CheckOutputData<uint8_t>(reference_quant_logsoftmax_output.data(),
+                           reference_float_logsoftmax_output.data(),
+                           shape_common, "Quant reference vs float reference",
+                           false);
+}
+
+// Runs the LogSoftmax and compares against the float reference implementation
+// and the int8 quantized reference implementation.
+void RunOneLogSoftmaxTest(const int8* input_data,
+                          const RuntimeShape& shape_common, int32 input_offset,
+                          const double input_scale, int stride, float beta) {
+  const int buffer_size = shape_common.FlatSize();
+  std::vector<int8> quantized_logsoftmax_reference_implementation(buffer_size);
+  std::vector<int8> float_logsoftmax_optimized_implementation(buffer_size);
+
+  RunLogSoftmaxFloatReference(input_data, shape_common, input_offset,
+                              input_scale, stride, beta,
+                              float_logsoftmax_optimized_implementation.data());
+
+  int32 input_beta_multiplier;
+  int input_beta_left_shift;
+  int32 reverse_scaling_divisor;
+  int reverse_scaling_right_shift;
+  static const int kScaledDiffIntegerBits = 5;
+  tflite::PreprocessLogSoftmaxScalingExp(
+      beta, input_scale, kScaledDiffIntegerBits, &input_beta_multiplier,
+      &input_beta_left_shift, &reverse_scaling_divisor,
+      &reverse_scaling_right_shift);
+  reverse_scaling_right_shift *= -1;
+  // diff_min has a negative value, and is used to limit the maximum magnitude
+  // of the diffs, which are <= 0.
+  const int diff_min = -tflite::CalculateInputRadius(kScaledDiffIntegerBits,
+                                                     input_beta_left_shift);
+
+  const int outer_size =
+      shape_common.Dims(0) * shape_common.Dims(1) * shape_common.Dims(2);
+  const int inner_size = shape_common.Dims(3);
+  reference_integer_ops::LogSoftmax(
+      input_beta_multiplier, input_beta_left_shift, reverse_scaling_divisor,
+      reverse_scaling_right_shift, diff_min, outer_size, inner_size, input_data,
+      quantized_logsoftmax_reference_implementation.data());
+
+  CheckOutputData<int8_t>(quantized_logsoftmax_reference_implementation.data(),
+                          float_logsoftmax_optimized_implementation.data(),
+                          shape_common, "Quant reference vs float reference",
+                          false);
 }
 
 // This function picks some random LogSoftmax params, which are checked for
@@ -161,6 +241,7 @@ void RunOneLogSoftmaxTest(const uint8* input_data,
 // to loop until a test has been run.
 //
 // Currently we do not reject for any reason.
+template <typename T>
 bool TryOneUniformLogSoftmax() {
   // We pick mostly positive values, on the whole emphasizing smaller values and
   // therefore faster tests.  We test a wider range of depths.  In the case of
@@ -178,7 +259,7 @@ bool TryOneUniformLogSoftmax() {
       RuntimeShape({batch, input_height, input_width, input_depth});
   const int buffer_size = shape_common.FlatSize();
 
-  std::vector<uint8> input_data(buffer_size);
+  std::vector<T> input_data(buffer_size);
   FillRandom(&input_data);
   RunOneLogSoftmaxTest(input_data.data(), shape_common, input_offset,
                        input_scale, stride, beta);
@@ -224,24 +305,32 @@ bool TryOneSkyscraperLogSoftmax(bool small_depth) {
   return true;
 }
 
-TEST(TestQuantizedLogSoftmax, UniformLogSoftmaxTests) {
-  const int kTestsToRun = 1000;
+TEST(TestQuantizedLogSoftmax, UniformLogSoftmaxUint8Tests) {
+  const int kTestsToRun = 100;
+  for (int i = 0; i < kTestsToRun; i++) {
+    while (!TryOneUniformLogSoftmax<uint8_t>()) {
+    }
+  }
+}
+
+TEST(TestQuantizedLogSoftmax, UniformLogSoftmaxUint8Int8Tests) {
+  const int kTestsToRun = 100;
   for (int i = 0; i < kTestsToRun; i++) {
-    while (!TryOneUniformLogSoftmax()) {
+    while (!TryOneUniformLogSoftmax<int8_t>()) {
     }
   }
 }
 
-TEST(TestQuantizedLogSoftmax, SkyscraperLogSoftmaxTests) {
-  const int kTestsToRun = 1000;
+TEST(TestQuantizedLogSoftmax, SkyscraperLogSoftmaxUint8Tests) {
+  const int kTestsToRun = 100;
   for (int i = 0; i < kTestsToRun; i++) {
     while (!TryOneSkyscraperLogSoftmax(false)) {
     }
   }
 }
 
-TEST(TestQuantizedLogSoftmax, SmallSkyscraperLogSoftmaxTests) {
-  const int kTestsToRun = 1000;
+TEST(TestQuantizedLogSoftmax, SmallSkyscraperLogSoftmaxUint8Tests) {
+  const int kTestsToRun = 100;
   for (int i = 0; i < kTestsToRun; i++) {
     while (!TryOneSkyscraperLogSoftmax(true)) {
     }
diff --git a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h
index d3dca799a7cca4a3048cd2d19477ba2b57fbcdac..84d701676b8f4122cb4d66b11969675549cee60f 100644
--- a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h
+++ b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h
@@ -19,10 +19,12 @@ limitations under the License.
 #include "public/gemmlowp.h"
 #include "tensorflow/lite/kernels/internal/common.h"
 #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h"
+#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
 #include "tensorflow/lite/kernels/internal/types.h"
 
 namespace tflite {
 namespace optimized_ops {
+namespace depthwise_conv {
 
 // Implementation of quantized DepthwiseConv
 
@@ -1945,6 +1947,8 @@ inline void DepthwiseConvGeneral(
   }
 }
 
+}  // namespace depthwise_conv
+
 inline void DepthwiseConv(
     const DepthwiseParams& params, const RuntimeShape& input_shape,
     const uint8* input_data, const RuntimeShape& filter_shape,
@@ -1979,20 +1983,23 @@ inline void DepthwiseConv(
 
   // Call kernel optimized for depthwise convolutions using 3x3 filters if
   // parameters are supported.
-  if (Fast3x3FilterKernelSupported(
+  if (depthwise_conv::Fast3x3FilterKernelSupported(
           input_shape, filter_shape, stride_width, stride_height,
           dilation_width_factor, dilation_height_factor, pad_width, pad_height,
           depth_multiplier, output_shape, output_shift)) {
-    DepthwiseConv3x3Filter(params, input_shape, input_data, filter_shape,
-                           filter_data, bias_shape, bias_data, output_shape,
-                           output_data);
+    gemmlowp::ScopedProfilingLabel specialized_label("DepthwiseConv/8bit/3x3");
+    depthwise_conv::DepthwiseConv3x3Filter(
+        params, input_shape, input_data, filter_shape, filter_data, bias_shape,
+        bias_data, output_shape, output_data);
     return;
   }
 #endif
 
-  DepthwiseConvGeneral(params, input_shape, input_data, filter_shape,
-                       filter_data, bias_shape, bias_data, output_shape,
-                       output_data);
+  gemmlowp::ScopedProfilingLabel specialized_label(
+      "DepthwiseConv/8bit/General");
+  depthwise_conv::DepthwiseConvGeneral(params, input_shape, input_data,
+                                       filter_shape, filter_data, bias_shape,
+                                       bias_data, output_shape, output_data);
 }
 
 }  // namespace optimized_ops
diff --git a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
index 5859bcaed4ac2b991ca22e7d9c17d34d3267a120..8a87d424e8329fe060425ee85ef175351c6ec61d 100644
--- a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
+++ b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
@@ -18,55 +18,65 @@ limitations under the License.
 #include "fixedpoint/fixedpoint.h"
 #include "public/gemmlowp.h"
 #include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
 #include "tensorflow/lite/kernels/internal/types.h"
 
 namespace tflite {
 namespace optimized_ops {
+namespace depthwise_conv {
+
+constexpr int kDepthwiseConvScratchWorkspaceSize = 10 * 10 * 64;
+constexpr int kDepthwiseConvAdjustedBiasLimit = 256;
+// In cases such as depth multiplication, we want to be able to load data from
+// the workspace that is beyond the valid range. Macro-block sizes are adjusted
+// to allow for this.
+constexpr int kWorkspaceExtension = 16;
 
 // See CategorizeDotProductKernel for definitive taxonomy.
 enum class DotProduct3x3KernelType {
   kNone = 0,  // Parameter combination is not supported for dot product kernels.
   kPlain,
-  kWithDepthMultiplication,
-  kWithPad0Stride2,
-  kWithPad1Stride1,
+  kWithDepthMultiplicationStride1,
+  kWithDepthMultiplicationStride2,
+  kStride2,
 };
 
 inline DotProduct3x3KernelType CategorizeDotProductKernel(
     const DepthwiseParams& params) {
-  const int padding = params.padding_values.width;
+  const int padding =
+      std::max(params.padding_values.width, params.padding_values.height);
   const int stride = params.stride_width;
-  if (padding != params.padding_values.height ||
-      stride != params.stride_height) {
+  if (stride != params.stride_height || padding > 1) {
     return DotProduct3x3KernelType::kNone;
   }
 
   if (params.depth_multiplier == 1) {
-    if (padding == 0 && stride == 1) {
+    if (stride == 1) {
       return DotProduct3x3KernelType::kPlain;
-    } else if (padding == 0 && stride == 2) {
-      return DotProduct3x3KernelType::kWithPad0Stride2;
-    } else if (padding == 1 && stride == 1) {
-      return DotProduct3x3KernelType::kWithPad1Stride1;
+    } else if (stride == 2) {
+      return DotProduct3x3KernelType::kStride2;
     } else {
       return DotProduct3x3KernelType::kNone;
     }
   } else {
-    if (padding == 0 && stride == 1) {
-      return DotProduct3x3KernelType::kWithDepthMultiplication;
+    if (stride == 1) {
+      return DotProduct3x3KernelType::kWithDepthMultiplicationStride1;
+    } else if (stride == 2) {
+      return DotProduct3x3KernelType::kWithDepthMultiplicationStride2;
     } else {
       return DotProduct3x3KernelType::kNone;
     }
   }
 }
 
+#define STR(s) STR_UNEXPANDED(s)
+#define STR_UNEXPANDED(s) #s
+
 // Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on
 // Jetson TX-2. This compiler does not support the offsetof() macro.
 #if defined(__aarch64__) && !defined(GOOGLE_L4T)
 #include <stddef.h>
 
-#define DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE 10 * 10 * 64
-
 // Encapsulates constant parameters used in DepthwiseConv.
 // 64-bit is used for types that will be added to 64-bit addresses in asm.
 struct DepthwiseConvParams {
@@ -90,9 +100,6 @@ struct DepthwiseConvParams {
   int32 output_height;
 };
 
-#define STR(s) STR_UNEXPANDED(s)
-#define STR_UNEXPANDED(s) #s
-
 // Represents the number of bytes offset from the start of the
 // DepthwiseConvParams struct. This is used in the asm to load parameters.
 // Keep these values in sync with the static_asserts below.
@@ -167,7 +174,46 @@ static_assert(offsetof(DepthwiseConvParams, output_width) ==
 static_assert(offsetof(DepthwiseConvParams, output_height) ==
                   OFFSET_OUTPUT_HEIGHT,
               "");
+#endif
 
+// Encapsulates constant parameters used in DepthwiseConv using dot-product ops.
+// 64-bit is used for types that will be added to 64-bit addresses in asm.
+//
+// This structure is specifically designed for use in asm.
+struct DepthwiseConvDotProdParams {
+  int64_t input_depth;
+  int64_t output_depth;
+  int32 workspace_height_stride;
+  int32 input_width_overall_micro_repeats;
+  int32 input_width_micro_repeats;
+  int32 depth_micro_repeats;
+  int32 inbound_block_height;
+  int32 residual_width;
+  int32 input_height_stride;
+  int32 stride;
+  int32 output_width_overall_micro_repeats;
+  int32 output_width_micro_repeats;
+  int32 output_residual_width;
+  int32 output_height_stride;
+  int32 bias_increment;
+  int32 padding_left;
+  int32 padding_right;
+  int32 padding_top;
+  int32 padding_bottom;
+  int32 height_macro_count;
+  int32 width_macro_count;
+  int32 outbound_block_height;
+  int32 workspace_width_micro_repeats;
+  int32 input_offset;
+  int32 output_offset;
+  int32 output_multiplier;
+  int32 output_shift;
+  int32 quantized_activation_min;
+  int32 quantized_activation_max;
+  int32 four_over_stride;
+};
+
+#if defined(__aarch64__) && !defined(GOOGLE_L4T)
 template <int32 kDepth, int32 kStrideWidth, int32 kStrideHeight>
 struct DepthwiseConvWindow {};
 
@@ -2963,8 +3009,6 @@ struct DepthwiseConvPartial<EdgeType::kVertical, 1, 1> {
 #undef OFFSET_INPUT_HEIGHT
 #undef OFFSET_OUTPUT_WIDTH
 #undef OFFSET_OUTPUT_HEIGHT
-#undef STR
-#undef STR_UNEXPANDED
 
 // Copies a subset of the input designated by |input_ptr| into |output_ptr|
 // with the specified output dimensions. Supports output depths of 64 only as
@@ -3047,7 +3091,7 @@ struct DepthwiseConvMultiRow {
         get_shuffle_input_size(kStrideWidth, shuffle_params.output_width));
     TFLITE_DCHECK(64 * shuffle_params.input_width *
                       shuffle_params.input_height <=
-                  DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE);
+                  kDepthwiseConvScratchWorkspaceSize);
 
     int32 out_x = start_x;
 
@@ -3375,7 +3419,7 @@ inline void DepthwiseConv3x3Filter(
   // allocated on the stack. Eventually we will want to move it to the heap
   // and have it allocated outside of this function, like the im2col_array
   // used in gemmlowp.
-  uint8 shuffle_workspace[DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE];
+  uint8 shuffle_workspace[kDepthwiseConvScratchWorkspaceSize];
 
   for (int32 b = 0; b < batches; ++b) {
     const uint8* input_ptr = input_data + b * input_batch_size;
@@ -3454,9 +3498,12 @@ inline void DepthwiseConv3x3Filter(
     }
   }
 }
-
 #endif  // __aarch64__
 
+#undef STR
+#undef STR_UNEXPANDED
+
+}  // namespace depthwise_conv
 }  // namespace optimized_ops
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_transitional.h b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_transitional.h
new file mode 100644
index 0000000000000000000000000000000000000000..1940353f74c05f6ff059d9c62d5c157228406dd5
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_transitional.h
@@ -0,0 +1,1395 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_TRANSITIONAL_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_TRANSITIONAL_H_
+
+// This file provides kernel implementations that are not used in shipped
+// inference code, but rather (a) show how model C++ code is designed and then
+// transformed into asm code, and (b) aid with maintenance and later development
+// of variations. Many projects (even including, say, the classic NAG libraries)
+// develop highly optimized code, but do not maintain intermediate versions.
+// Often the result is incomprehensible final-version code.
+
+#include <algorithm>
+
+#include "fixedpoint/fixedpoint.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
+#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h"
+#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace optimized_ops {
+namespace depthwise_conv {
+
+// Permute filter data, and adjust bias data to account for symmetric input
+// offset. Details are provided in the implementation of the
+// kUseCModel3x3DotProduct version.
+//
+// See the comments preceding DepthwiseConvDotProduct3x3() for further notes.
+template <DepthwiseConvImplementation implementation>
+struct ProcessPerDepth {
+  // Routine is contained in a static Run() method. No default template version
+  // is supplied, so that all implementations are deliberate choices of template
+  // specialization.
+  //
+  // Note that the signature of the Run() method will be designed for the asm
+  // implementation rather than conforming to style.
+};
+
+template <>
+struct ProcessPerDepth<DepthwiseConvImplementation::kUseCModel3x3DotProduct> {
+  // Filter data is provided as filter_block[3][3][depth/8][2][4]: height 3,
+  // width 3,  sub-block 0 or 1, depth 4. Filter data is written as
+  // filter_bank[3][2][4][4]; height 3, sub-block, depth 4, width 4.
+  //
+  // Note that this rearrangement is much like that performed on input data when
+  // filling the workspace, and optimized versions will be similar.
+  static inline void FillFilterBank(int depth, const uint8* filter_block,
+                                    int8 filter_bank[3][2][4][4]) {
+    constexpr int kSymmetricZeroPoint = 128;
+    // Load filter data in, 8-bytes down depth / sub-block at a time.
+    //
+    // loaded_filter has dimensions height 3, width 4, sub-block 0 or 1,
+    // depth 4.
+    uint8 loaded_filter[3][4][2][4];
+    for (int y = 0; y < 3; ++y) {
+      for (int x = 0; x < 3; ++x) {
+        memcpy(loaded_filter[y][x][0], &filter_block[3 * y * depth + x * depth],
+               8);
+      }
+      // Pad the filter with symmetric representation of 0, so that the values
+      // become 0 when the zero-poing is added below. Thus these filter taps are
+      // effectively disregarded in later filtering.
+      memset(loaded_filter[y][3][0], kSymmetricZeroPoint, 8);
+    }
+    for (int y = 0; y < 3; ++y) {
+      for (int z = 0; z < 4; ++z) {
+        for (int x = 0; x < 4; ++x) {
+          filter_bank[y][0][z][x] =
+              loaded_filter[y][x][0][z] - kSymmetricZeroPoint;
+          filter_bank[y][1][z][x] =
+              loaded_filter[y][x][1][z] - kSymmetricZeroPoint;
+        }
+      }
+    }
+  }
+
+  // Adjust the bias (weights) data according to the input offset.
+  //
+  // The output calculation is
+  // out[h][w][d] = bias[d] + sum_ij (in[h+i][w+j][d] + in_offset) *
+  //                                 (filter[i][j][d] + filter_offset)
+  // (where offsets are expressed as differences from 128).
+  //
+  // Since we cannot efficiently handle varying offsets / bias across the image,
+  // we insist on filter_offset = 0.
+  //
+  // This function calculates
+  // adjusted_bias[d] = bias[d] + sum_ij in_offset * filter[i][j][d]
+  // which accounts for input offset. If the bias is constant over the depth,
+  // the adjusted bias will vary.
+  static inline void AdjustBias(int32 input_offset,
+                                const int8 filter_bank[3][2][4][4],
+                                const int32* bias_data,
+                                int32 adjusted_bias_block[2][4]) {
+    constexpr int kSymmetricZeroPoint = 128;
+    TFLITE_DCHECK_GE(input_offset, -255);
+    TFLITE_DCHECK_LE(input_offset, 0);
+    // For instance, if input_offset == 128, no adjustment is needed.
+    const int32 input_offset_difference = input_offset + kSymmetricZeroPoint;
+
+    for (int s = 0; s < 2; ++s) {
+      for (int z = 0; z < 4; ++z) {
+        adjusted_bias_block[s][z] = bias_data[4 * s + z];
+        for (int i = 0; i < 9; ++i) {
+          adjusted_bias_block[s][z] +=
+              input_offset_difference * filter_bank[i % 3][s][z][i / 3];
+        }
+      }
+    }
+  }
+
+  static void Run(const uint8* filter_data, const int32* bias_data,
+                  int8* shuffled_filter_data, int32* adjusted_bias_data,
+                  const DepthwiseConvDotProdParams* function_params) {
+    constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
+    const int depth = function_params->output_depth;
+    const int bias_increment = function_params->bias_increment;
+    const int32 input_offset = function_params->input_offset;
+
+    int8 filter_bank[3][2][4][4];
+    int32 adjusted_bias_block[2][4];
+
+    for (int j_depth = 0; j_depth < (depth >> 3); ++j_depth) {
+      FillFilterBank(depth, filter_data + 8 * j_depth, filter_bank);
+      AdjustBias(input_offset, filter_bank,
+                 bias_data + 2 * j_depth * bias_increment, adjusted_bias_block);
+
+      memcpy(shuffled_filter_data, filter_bank[0][0][0],
+             shuffled_filter_increment);
+      shuffled_filter_data += shuffled_filter_increment;
+      memcpy(adjusted_bias_data, adjusted_bias_block[0],
+             8 * sizeof(adjusted_bias_block[0][0]));
+      adjusted_bias_data += 8;
+    }
+  }
+};
+
+// Copy a macro block of data from the input buffer into the workspace,
+// permuting data within each micro block.
+//
+// (a) Copy a macro block of data, padding as required along the width and
+//     height.
+// (b) Transpose the data within each micro block.
+//
+// See the comments preceding DepthwiseConvDotProduct3x3() for further notes.
+template <DepthwiseConvImplementation implementation,
+          DepthwiseConvDepthMultiplication depth_multiplication,
+          int32 max_padding>
+struct PackMacroBlock {
+  // Routine is contained in a static Run() method. No default template version
+  // is supplied, so that all implementations are deliberate choices of template
+  // specialization.
+  //
+  // Note that the signature of the Run() method will be designed for the asm
+  // implementation rather than conforming to style.
+};
+
+template <int32 max_padding>
+struct PackMacroBlock<DepthwiseConvImplementation::kUseCModel3x3DotProduct,
+                      DepthwiseConvDepthMultiplication::kNoMultiplication,
+                      max_padding> {
+  // A straight copy of a macro block of input data into a scratch buffer.
+  //
+  // Requirement: depth_micro_repeats > 0.
+  static inline void CopyMacroBlock(
+      int32 height_block_number, int32 width_block_number,
+      const DepthwiseConvDotProdParams& function_params,
+      const uint8* input_block_data, int8* scratch_block_data) {
+    TFLITE_DCHECK_LE(max_padding, 1);
+
+    // Strides.
+    // The input depth and count of micro blocks provide the width strides.
+    const int input_height_stride = function_params.input_height_stride;
+    const int workspace_height_stride = function_params.workspace_height_stride;
+    const int input_depth = function_params.input_depth;
+    const int depth_micro_repeats = function_params.depth_micro_repeats;
+    TFLITE_DCHECK_GT(depth_micro_repeats, 0);
+
+    // Remaining iteration and dimension parameters.
+    //
+    // If width_overall_micro_repeats = input_width_micro_repeats + 1, then the
+    // final micro block is incomplete.
+    const int width_overall_micro_repeats =
+        function_params.input_width_overall_micro_repeats;
+    int input_width_micro_repeats = function_params.input_width_micro_repeats;
+    const int residual_width = function_params.residual_width;
+    const int block_height = function_params.inbound_block_height;
+
+    const int padding_left = function_params.padding_left;
+    const int padding_right = function_params.padding_right;
+    const int padding_top = function_params.padding_top;
+    const int padding_bottom = function_params.padding_bottom;
+
+    const bool leading_width_padding =
+        padding_left > 0 && width_block_number == 0;
+    const bool trailing_width_padding =
+        padding_right > 0 &&
+        width_block_number == (function_params.width_macro_count - 1);
+    const bool leading_height_padding =
+        padding_top > 0 && height_block_number < 0;
+    const bool trailing_height_padding =
+        padding_bottom > 0 &&
+        height_block_number == (function_params.height_macro_count - 1);
+
+    // Modify the trailing case to reflect the input width.
+    int input_residual_width =
+        input_width_micro_repeats < width_overall_micro_repeats ? residual_width
+                                                                : 4;
+    if (trailing_width_padding) {
+      input_residual_width -= 1;
+      input_width_micro_repeats = width_overall_micro_repeats - 1;
+    }
+
+    constexpr int kSymmetricZeroPoint = 128;
+    const int32 input_offset_difference =
+        function_params.input_offset + kSymmetricZeroPoint;
+
+    // We load data into a temporary buffer and then save, to match subsequent
+    // processing. This will make it easier to combine stages into one ASM
+    // routine.
+    int8 tmp_load[4][2][4];
+
+    int copy_block_height = block_height;
+    if (leading_height_padding) {
+      memset(scratch_block_data, -input_offset_difference,
+             workspace_height_stride);
+      scratch_block_data += workspace_height_stride;
+      input_block_data += input_height_stride;
+      copy_block_height -= 1;
+    }
+    if (trailing_height_padding) {
+      copy_block_height -= 1;
+    }
+
+    // The outer 3 loops go through all the micro blocks in a macro block.
+    for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+      for (int j_width = 0; j_width < width_overall_micro_repeats; ++j_width) {
+        // Figure out division of work (available input vs trailing padding).
+        int adjusted_residual_width =
+            j_width == input_width_micro_repeats ? input_residual_width : 4;
+
+        int start_width = 0;
+        if (leading_width_padding && j_width == 0) {
+          start_width = 1;
+          memset(tmp_load[0][0], -input_offset_difference, 8);
+        }
+        if (adjusted_residual_width < 4) {
+          for (int x = adjusted_residual_width; x < 4; ++x) {
+            memset(tmp_load[x][0], -input_offset_difference, 8);
+          }
+        }
+
+        for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
+          // The inner 3 loops go through the sub-block, depth and width within
+          // each micro block.
+
+          // Load, and apply symmetric offset.
+          int8* scratch_data =
+              scratch_block_data + k_height * workspace_height_stride +
+              j_width * 4 * 8 + i_depth * 4 * 8 * width_overall_micro_repeats;
+          const uint8* input_data = input_block_data +
+                                    k_height * input_height_stride +
+                                    j_width * 4 * input_depth + i_depth * 8;
+          // Full-size macro blocks are 2*4*4 = 32 bytes.
+          for (int x = start_width; x < adjusted_residual_width; ++x) {
+            for (int s = 0; s < 2; ++s) {
+              for (int d = 0; d < 4; ++d) {
+                tmp_load[x][s][d] = input_data[x * input_depth + 4 * s + d] -
+                                    kSymmetricZeroPoint;
+              }
+            }
+          }
+
+          // Save results.
+          memcpy(&scratch_data[0], tmp_load[0][0], 8);
+          memcpy(&scratch_data[8], tmp_load[1][0], 8);
+          memcpy(&scratch_data[16], tmp_load[2][0], 8);
+          memcpy(&scratch_data[24], tmp_load[3][0], 8);
+        }
+      }
+    }
+
+    if (trailing_height_padding) {
+      memset(scratch_block_data + copy_block_height * workspace_height_stride,
+             -input_offset_difference, workspace_height_stride);
+    }
+  }
+
+  // Transpose 4x4 blocks within each sub-micro-block.
+  //
+  // Implemented somewhat like NEON register manipulation, so that we can see
+  // equivalence of the two approaches.
+  static inline void MicroTransposeBlocks(
+      const DepthwiseConvDotProdParams& function_params,
+      int8* scratch_block_data) {
+    const int workspace_height_stride = function_params.workspace_height_stride;
+    const int width_overall_micro_repeats =
+        function_params.input_width_overall_micro_repeats;
+    const int depth_micro_repeats = function_params.depth_micro_repeats;
+    const int block_height = function_params.inbound_block_height;
+
+    // Transpositions are 4x4, but doing 2 at a time is more efficient in the
+    // NEON code we are simulating.
+    int8 tmp_load[4][2][4];         // [width][sub-block][depth]
+    int8 tmp_transposed[4][2][4];   // [depth][sub-block][width]
+    int8 tmp_interleaved[2][4][4];  // [sub-block][depth][width]
+
+    // The outer 3 loops go through all the micro blocks in a macro block.
+    for (int k_height = 0; k_height < block_height; ++k_height) {
+      for (int j_width = 0; j_width < width_overall_micro_repeats; ++j_width) {
+        for (int i_depth = 0; i_depth < depth_micro_repeats; ++i_depth) {
+          int8* scratch_data =
+              scratch_block_data + k_height * workspace_height_stride +
+              j_width * 4 * 8 + i_depth * 4 * 8 * width_overall_micro_repeats;
+          // A. Load data
+          memcpy(tmp_load[0][0], &scratch_data[0], 8);
+          memcpy(tmp_load[1][0], &scratch_data[8], 8);
+          memcpy(tmp_load[2][0], &scratch_data[16], 8);
+          memcpy(tmp_load[3][0], &scratch_data[24], 8);
+
+          // B. Simulate between-register transposition.
+          for (int x = 0; x < 4; ++x) {
+            for (int y = 0; y < 4; ++y) {
+              tmp_transposed[x][0][y] = tmp_load[y][0][x];
+              tmp_transposed[x][1][y] = tmp_load[y][1][x];
+            }
+          }
+
+          // C. Simulate between-register interleaving.
+          for (int x = 0; x < 4; ++x) {
+            for (int y = 0; y < 4; ++y) {
+              tmp_interleaved[0][x][y] = tmp_transposed[x][0][y];
+              tmp_interleaved[1][x][y] = tmp_transposed[x][1][y];
+            }
+          }
+          // D. Simulate mangled storage arrangement.
+          memcpy(&scratch_data[0], tmp_interleaved[0][0], 16);
+          memcpy(&scratch_data[16], tmp_interleaved[1][0], 16);
+        }
+      }
+    }
+  }
+
+  static inline void Run(int32 height_block_number, int32 width_block_number,
+                         const uint8* input_block_data,
+                         int8* scratch_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    CopyMacroBlock(height_block_number, width_block_number, *function_params,
+                   input_block_data, scratch_block_data);
+    MicroTransposeBlocks(*function_params, scratch_block_data);
+  }
+};
+
+template <int32 max_padding>
+struct PackMacroBlock<DepthwiseConvImplementation::kUseCModel3x3DotProduct,
+                      DepthwiseConvDepthMultiplication::kUnitInputDepth,
+                      max_padding> {
+  static inline void Run(int32 height_block_number, int32 width_block_number,
+                         const uint8* input_block_data,
+                         int8* scratch_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    // Currently support for padding is limited to 1 on any side.
+    TFLITE_DCHECK_LE(max_padding, 1);
+
+    // Strides.
+    // The count of micro blocks (below) provides the width strides.
+    const int input_height_stride = function_params->input_height_stride;
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+
+    // Remaining iteration and dimension parameters.
+    //
+    // If width_overall_micro_repeats = input_width_micro_repeats + 1, then the
+    // final micro block is incomplete.
+    const int width_overall_micro_repeats =
+        function_params->input_width_overall_micro_repeats;
+    const int input_width_micro_repeats =
+        function_params->input_width_micro_repeats;
+    const int residual_width = function_params->residual_width;
+    const int block_height = function_params->inbound_block_height;
+    TFLITE_DCHECK_GE(workspace_height_stride, 4 * width_overall_micro_repeats);
+
+    const int padding_left = function_params->padding_left;
+    const int padding_right = function_params->padding_right;
+    const int padding_top = function_params->padding_top;
+    const int padding_bottom = function_params->padding_bottom;
+
+    const bool leading_width_padding =
+        padding_left > 0 && width_block_number == 0;
+    const bool trailing_width_padding =
+        padding_right > 0 &&
+        width_block_number == (function_params->width_macro_count - 1);
+    const bool leading_height_padding =
+        padding_top > 0 && height_block_number < 0;
+    const bool trailing_height_padding =
+        padding_bottom > 0 &&
+        height_block_number == (function_params->height_macro_count - 1);
+
+    constexpr int kSymmetricZeroPoint = 128;
+    const int32 input_offset_difference =
+        function_params->input_offset + kSymmetricZeroPoint;
+
+    int copy_block_height = block_height;
+    if (leading_height_padding) {
+      memset(scratch_block_data, -input_offset_difference,
+             workspace_height_stride + kWorkspaceExtension);
+      scratch_block_data += workspace_height_stride;
+      input_block_data += input_height_stride;
+      copy_block_height -= 1;
+    }
+    if (trailing_height_padding) {
+      copy_block_height -= 1;
+    }
+
+    int adjusted_residual_width =
+        input_width_micro_repeats < width_overall_micro_repeats ? residual_width
+                                                                : 4;
+
+    if (trailing_width_padding) {
+      adjusted_residual_width -= 1;
+    }
+    int start_width = 0;
+    if (leading_width_padding) {
+      start_width = 1;
+      input_block_data += 1;
+    }
+
+    const int copy_size = (width_overall_micro_repeats - 1) * 4 +
+                          adjusted_residual_width - start_width;
+
+    TFLITE_DCHECK_LE(
+        copy_size,
+        input_height_stride - width_block_number * input_width_micro_repeats);
+    // We may drop up to stride-1 of trailing input.
+    TFLITE_DCHECK_GE(copy_size, input_height_stride - 1);
+
+    // When there is unit input depth, the micro-block iteration need only be
+    // through the height. The micro blocks are contiguous across the width.
+    for (int k_height = 0; k_height < copy_block_height; ++k_height) {
+      const uint8* input_data =
+          input_block_data + k_height * input_height_stride;
+      int8* scratch_data =
+          scratch_block_data + k_height * workspace_height_stride;
+
+      // Handle leading padding. This is overwritten if there is no padding.
+      scratch_data[0] = -input_offset_difference;
+
+      memcpy(&scratch_data[start_width], input_data, copy_size);
+      for (int i = 0; i < copy_size; ++i) {
+        scratch_data[start_width + i] += -kSymmetricZeroPoint;
+      }
+
+      // Handle trailing padding, and fill in remainder of micro block.
+      memset(&scratch_data[start_width + copy_size], -input_offset_difference,
+             4 - adjusted_residual_width + kWorkspaceExtension);
+    }
+
+    if (trailing_height_padding) {
+      memset(scratch_block_data + copy_block_height * workspace_height_stride,
+             -input_offset_difference,
+             workspace_height_stride + kWorkspaceExtension);
+    }
+  }
+};
+
+// Apply filter to macro block of input data and store results. Details are
+// provided in the implementation of the kUseCModel3x3DotProduct version.
+//
+// Parameters for repeats and residual sizes are in terms of outputs.
+//
+// See the comments preceding DepthwiseConvDotProduct3x3() for further notes.
+template <DepthwiseConvImplementation implementation,
+          DepthwiseConvDepthMultiplication depth_multiplication, int32 stride>
+struct KernelMacroBlock {
+  // Routine is contained in a static Run() method. No default template version
+  // is supplied, so that all implementations are deliberate choices of template
+  // specialization.
+  //
+  // Note that the signature of the Run() method will be designed for the asm
+  // implementation rather than conforming to style.
+};
+
+// Apply filter to macro block of input data and store results.
+//
+// Requirement: depth_micro_repeats > 0 || residual_depth > 0.
+template <int32 stride>
+struct KernelMacroBlock<DepthwiseConvImplementation::kUseCModel3x3DotProduct,
+                        DepthwiseConvDepthMultiplication::kNoMultiplication,
+                        stride> {
+  // Construct a width-shifted combination of two input sub-blocks, effectively
+  // concatenating them.
+  //
+  // The filter is applied using sub-blocks. These are in the needed form for
+  // the first (width) offset. For subsequent offsets, the filter is applied to
+  // shifted and combined data. The concatentation and shifting herein is fairly
+  // straightforward, but in the optimized code is an area of creativity in
+  // design because NEON instructions do not directly support the required
+  // between-register permutation.
+  //
+  // In NEON optimized code, input data is grouped in 4-byte blocks. In order to
+  // move along the width for each output point calculation, data is shifted, in
+  // essence between two such blocks.
+  //
+  // selected_data has format height 3, depth 4, width 4.
+  //
+  // When the micro block is trailing (the last across the macro-block width),
+  // it would be illegal to load the right (next) block, and the no_right_block
+  // indicates this scenario.
+  static inline void ConcatenateInputSubBlocks(int offset, int sub_block,
+                                               int workspace_height_stride,
+                                               int width_micro_stride,
+                                               bool no_right_block,
+                                               const int8* input_block,
+                                               int8 selected_data[3][4][4]) {
+    TFLITE_DCHECK_GE(offset, 0);
+    TFLITE_DCHECK_LT(offset, 4);
+
+    // The input banks have same format as selected_data.
+    int8 left_bank[3][4][4];
+    int8 right_bank[3][4][4];
+
+    // Work through one slice, by row, at a time.
+    for (int k_height = 0; k_height < 3; ++k_height) {
+      // Simulate demangling of mangled storage arrangement.
+      const int8* left_input_block =
+          &input_block[k_height * workspace_height_stride + sub_block * 2 * 8];
+      memcpy(left_bank[k_height][0], left_input_block, 16);
+      if (no_right_block) {
+        memset(right_bank[k_height][0], 0, 16);
+      } else {
+        const int8* right_input_block =
+            &input_block[k_height * workspace_height_stride +
+                         sub_block * 2 * 8 + width_micro_stride];
+        memcpy(right_bank[k_height][0], right_input_block, 16);
+      }
+      for (int depth_index = 0; depth_index < 4; ++depth_index) {
+        memcpy(selected_data[k_height][depth_index],
+               &left_bank[k_height][depth_index][offset], 4 - offset);
+        memcpy(&selected_data[k_height][depth_index][4 - offset],
+               right_bank[k_height][depth_index], offset);
+      }
+    }
+  }
+
+  // Straight implementation of 3x3 filter within sub-micro block.
+  static inline void Calculate3x3FilterOutput(
+      const DepthwiseConvDotProdParams& params, int sub_block,
+      const int8 selected_data[3][4][4], const int8 filter_bank[3][2][4][4],
+      const int32* bias_data, uint8 output_values[4]) {
+    const int32 output_activation_min = params.quantized_activation_min;
+    const int32 output_activation_max = params.quantized_activation_max;
+    const int32 output_multiplier = params.output_multiplier;
+    const int32 output_shift = params.output_shift;
+    const int32 output_offset = params.output_offset;
+    for (int d = 0; d < 4; ++d) {
+      int32 acc = 0;
+      for (int y = 0; y < 3; ++y) {
+        for (int x = 0; x < 4; ++x) {
+          int32 input_val = selected_data[y][d][x];
+          int32 filter_val = filter_bank[y][sub_block][d][x];
+          acc += filter_val * input_val;
+        }
+      }
+      acc += bias_data[d];
+      acc = reference_ops::depthwise_conv::DepthwiseConvRound<
+          DepthwiseConvOutputRounding::kUpward>(acc, output_multiplier,
+                                                output_shift);
+      acc += output_offset;
+      acc = std::max(acc, output_activation_min);
+      acc = std::min(acc, output_activation_max);
+      output_values[d] = static_cast<uint8>(acc);
+    }
+  }
+
+  static inline void Run(const int8* scratch_block_data,
+                         const int8* filter_workspace, const int32* bias_data,
+                         uint8* output_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+    const int input_width_overall_micro_repeats =
+        function_params->input_width_overall_micro_repeats;
+    const int output_width_micro_repeats =
+        function_params->output_width_micro_repeats;
+    const int depth_micro_repeats = function_params->depth_micro_repeats;
+    const int depth = function_params->input_depth;
+    const int stride_val = function_params->stride;
+    const int four_over_stride = function_params->four_over_stride;
+
+    const int output_width_overall_micro_repeats =
+        function_params->output_width_overall_micro_repeats;
+    const int block_height = function_params->outbound_block_height;
+    const int residual_width = function_params->output_residual_width;
+    const int output_height_stride = function_params->output_height_stride;
+    constexpr int bias_increment = 4;
+    TFLITE_DCHECK_EQ(function_params->bias_increment, bias_increment);
+
+    TFLITE_DCHECK(depth_micro_repeats > 0);
+    const int width_micro_stride = 4 * 8;
+    const int depth_micro_stride =
+        width_micro_stride * input_width_overall_micro_repeats;
+
+    constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
+
+    // Simulate NEON-register transposition of subset of filter.
+    int8 filter_bank[3][2][4][4];  // Height 3, sub-block,  depth 4, width 4.
+    // Simulate NEON-register input data concatenation + sub-selection.
+    int8 sub_selected_input_data[3][4][4];  // Height 3, depth 4, width 4.
+    uint8 output_values[4];                 // Depth 4.
+
+    // The outer 3 loops go through all the micro blocks in a macro block, and
+    // separately treat the two sub-blocks within each micro block.
+    for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
+      memcpy(filter_bank[0][0][0],
+             filter_workspace + j_depth * shuffled_filter_increment,
+             shuffled_filter_increment);
+
+      for (int s = 0; s < 2; ++s) {
+        for (int k_height = 0; k_height < block_height; ++k_height) {
+          const int8* scratch_data =
+              scratch_block_data +
+              workspace_height_stride * k_height * stride_val +
+              depth_micro_stride * j_depth;
+          uint8* output_data =
+              output_block_data + output_height_stride * k_height + 8 * j_depth;
+
+          for (int i_width = 0; i_width < output_width_overall_micro_repeats;
+               ++i_width) {
+            const int output_width = i_width == output_width_micro_repeats
+                                         ? residual_width
+                                         : four_over_stride;
+            const bool no_right_block = (output_width - 1) * stride_val < 2;
+            TFLITE_DCHECK_LE(output_width * stride_val, 4);
+            const int8* input_data =
+                scratch_data + width_micro_stride * i_width;
+            // Iterate over input width shifts within sub-micro blocks.
+            for (int x = 0; x < output_width; ++x) {
+              ConcatenateInputSubBlocks(x * stride_val, s,
+                                        workspace_height_stride,
+                                        width_micro_stride, no_right_block,
+                                        input_data, sub_selected_input_data);
+              Calculate3x3FilterOutput(
+                  *function_params, s, sub_selected_input_data, filter_bank,
+                  bias_data + (2 * j_depth + s) * bias_increment,
+                  output_values);
+              for (int d = 0; d < 4; ++d) {
+                output_data[depth * (four_over_stride * i_width + x) + 4 * s +
+                            d] = output_values[d];
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+// Apply filter to macro block of input data and store results.
+//
+// Parameters for repeats and residual sizes are in terms of outputs.
+//
+// Requirement: depth_micro_repeats > 0 || residual_depth > 0.
+template <int32 stride>
+struct KernelMacroBlock<DepthwiseConvImplementation::kUseCModel3x3DotProduct,
+                        DepthwiseConvDepthMultiplication::kUnitInputDepth,
+                        stride> {
+  // Construct a width-shifted combination of two input sub-blocks, effectively
+  // concatenating them.
+  //
+  // The filter is applied using sub-blocks. These are in the needed form for
+  // the first (width) offset. For subsequent offsets, the filter is applied to
+  // shifted and combined data. The concatentation and shifting herein is fairly
+  // straightforward, but in the optimized code is an area of creativity in
+  // design because NEON instructions do not directly support the required
+  // between-register permutation.
+  //
+  // In NEON optimized code, input data is grouped in 4-byte blocks. In order to
+  // move along the width for each output point calculation, data is shifted, in
+  // essence between two such blocks.
+  //
+  // selected_data has format height 3, width 4.
+  //
+  // When the micro block is trailing (the last across the macro-block width),
+  // it would be illegal to load the right (next) block, and the no_right_block
+  // indicates this scenario.
+  static inline void ConcatenateInputSubBlocks(int offset,
+                                               int workspace_height_stride,
+                                               bool no_right_block,
+                                               const int8* input_block,
+                                               int8 selected_data[3][4]) {
+    TFLITE_DCHECK_GE(offset, 0);
+    TFLITE_DCHECK_LT(offset, 4);
+    if (no_right_block) {
+      for (int k_height = 0; k_height < 3; ++k_height) {
+        memcpy(selected_data[k_height],
+               &input_block[k_height * workspace_height_stride + offset],
+               4 - offset);
+      }
+    } else {
+      for (int k_height = 0; k_height < 3; ++k_height) {
+        memcpy(selected_data[k_height],
+               &input_block[k_height * workspace_height_stride + offset], 4);
+      }
+    }
+  }
+
+  // Straight implementation of 3x3 filter within sub-micro block.
+  static inline void Calculate3x3FilterOutput(
+      const DepthwiseConvDotProdParams& function_params, int sub_block,
+      const int8 selected_data[3][4], const int8 filter_bank[3][2][4][4],
+      const int32* bias_data, uint8 output_values[4]) {
+    const int32 output_activation_min =
+        function_params.quantized_activation_min;
+    const int32 output_activation_max =
+        function_params.quantized_activation_max;
+    const int32 output_multiplier = function_params.output_multiplier;
+    const int32 output_shift = function_params.output_shift;
+    const int32 output_offset = function_params.output_offset;
+    for (int d = 0; d < 4; ++d) {
+      int32 acc = 0;
+      for (int y = 0; y < 3; ++y) {
+        for (int x = 0; x < 4; ++x) {
+          int32 input_val = selected_data[y][x];
+          int32 filter_val = filter_bank[y][sub_block][d][x];
+          acc += filter_val * input_val;
+        }
+      }
+      acc += bias_data[d];
+      acc = reference_ops::depthwise_conv::DepthwiseConvRound<
+          DepthwiseConvOutputRounding::kUpward>(acc, output_multiplier,
+                                                output_shift);
+      acc += output_offset;
+      acc = std::max(acc, output_activation_min);
+      acc = std::min(acc, output_activation_max);
+      output_values[d] = static_cast<uint8>(acc);
+    }
+  }
+
+  static inline void Run(const int8* scratch_block_data,
+                         const int8* filter_workspace, const int32* bias_data,
+                         uint8* output_block_data,
+                         const DepthwiseConvDotProdParams* function_params) {
+    const int workspace_height_stride =
+        function_params->workspace_height_stride;
+    const int output_width_micro_repeats =
+        function_params->output_width_micro_repeats;
+    const int depth_micro_repeats = function_params->depth_micro_repeats;
+    const int depth = function_params->output_depth;
+    const int stride_val = function_params->stride;
+    const int four_over_stride = function_params->four_over_stride;
+
+    const int workspace_width_micro_repeats =
+        function_params->workspace_width_micro_repeats;
+    const int output_width_overall_micro_repeats =
+        function_params->output_width_overall_micro_repeats;
+    const int block_height = function_params->outbound_block_height;
+    const int residual_width = function_params->output_residual_width;
+    const int output_height_stride = function_params->output_height_stride;
+    constexpr int bias_increment = 4;
+    TFLITE_DCHECK_EQ(function_params->bias_increment, bias_increment);
+
+    TFLITE_DCHECK(depth_micro_repeats > 0);
+
+    constexpr int shuffled_filter_increment = 2 * 3 * 4 * 4;
+
+    // Simulate NEON-register transposition of subset of filter.
+    int8 filter_bank[3][2][4][4];  // Height 3, sub-block,  depth 4, width 4.
+    // Simulate NEON-register input data concatenation + sub-selection.
+    int8 sub_selected_input_data[3][4];  // Height 3, depth 4, width 4.
+    uint8 output_values[4];              // Depth 4.
+
+    // The outer 3 loops go through all the micro blocks in a macro block, and
+    // separately treat the two sub-blocks within each micro block.
+    for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
+      memcpy(filter_bank[0][0][0],
+             filter_workspace + j_depth * shuffled_filter_increment,
+             shuffled_filter_increment);
+
+      for (int s = 0; s < 2; ++s) {
+        for (int k_height = 0; k_height < block_height; ++k_height) {
+          const int8* scratch_data =
+              scratch_block_data +
+              workspace_height_stride * k_height * stride_val;
+          uint8* output_data =
+              output_block_data + output_height_stride * k_height + 8 * j_depth;
+
+          for (int i_width = 0; i_width < output_width_overall_micro_repeats;
+               ++i_width) {
+            const int output_width = i_width == output_width_micro_repeats
+                                         ? residual_width
+                                         : four_over_stride;
+            const bool no_right_block = i_width == output_width_micro_repeats &&
+                                        output_width_overall_micro_repeats ==
+                                            workspace_width_micro_repeats;
+            TFLITE_DCHECK_LE(output_width * stride_val, 4);
+            const int8* input_data = scratch_data + 4 * i_width;
+            // Iterate over input width shifts within 4x4 blocks.
+            for (int x = 0; x < output_width; ++x) {
+              ConcatenateInputSubBlocks(x * stride_val, workspace_height_stride,
+                                        no_right_block, input_data,
+                                        sub_selected_input_data);
+              Calculate3x3FilterOutput(
+                  *function_params, s, sub_selected_input_data, filter_bank,
+                  bias_data + (2 * j_depth + s) * bias_increment,
+                  output_values);
+              for (int d = 0; d < 4; ++d) {
+                output_data[depth * (four_over_stride * i_width + x) + 4 * s +
+                            d] = output_values[d];
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+// Top-level implementation function for 3x3 depthwise convolution using
+// NEON dot-product instructions.
+//
+// MACRO & MICRO BLOCKS
+//
+// The task is divided into macro blocks. Data is copied first into a macro
+// block in a workspace. This has two purposes: (a) bringing data into
+// cache, and (b) permuting data so that it can be used much more easily in
+// a dot-product filter.
+//
+// When there is no depth multiplication:
+//
+// The permutations required for dot-products are local, within 4 data points
+// down the depth and 4 across the width. We want to pull in input data at least
+// 8-bytes at a time, down the depth, and so we divide the macro blocks into
+// 1x4x8 (height, width, depth) and further divide the micro blocks into
+// sub-blocks with shape (1x4x4).
+//
+// Each macro-block is constructed from micro-blocks that are internally
+// rearranged during loading into the macro-block workspace.
+//
+// In other words, the micro-block shape is
+//     {1, 1, 4, 8}
+// Each macro block is typically shape
+//     {1, height_block_size, 4 * workspace_width_micro_repeats, 64}
+// and workspace_width_micro_repeats is chosen so it fits into the
+// workspace.
+//
+// However, if depth < 64, we decrease the macro block depth, enabling us to
+// increase the macro-block width.
+//
+// When there is depth multiplication:
+//
+// We require input-depth = 1 and exploit that instead.  Note that output data
+// is still full-depth, *as is the filter and bias data after certain
+// adjustments*, and so the filter stage in this case still proceeds in
+// terms of sub-blocks.
+//
+// The Magic of these numbers:
+//     4 is the number of input elements used in each dot-product.
+//     8 is the number of inputs we load at a time into a register.
+//     64 is min amount of data to be loaded in a stretch (when possible).
+//
+// FILTER DATA PREPARATION
+//
+// Filter data needs to be permuted in a fashion like that of input data, and
+// this is done in a preprocessing stage. In addition, this stage extends the
+// filter in the direction of width from 3 to 4. The extra filter taps are set
+// to zero so that input data does not have to be zeroed before applying
+// dot-products.
+//
+// OVERALL COUNTS: HANDLING TRAILING ITERATION
+//
+// Often it is necessary to handle the last iteration in a loop differently,
+// generally because the final item is shorter. The logic to detect the
+// special case can be a bit expensive. We use a scheme in which there are
+// two counts, in a pattern like xxx_yyy_repeats and
+// xxx_overall_yyy_repeats. The first gives the count of "normal"
+// iterations. The loop iterates over the second count, and the induction
+// variable is checked to see if it reaches xxx_yyy_repeats. If there is no
+// special trailing iteration, xxx_yyy_repeats = xxx_overall_yyy_repeats,
+// and the special code is not executed.
+//
+// Example:
+// Suppose that we characterize a size s as
+// f(s) -> (block-4-repetitions, remainder, overall_repetitions):
+// f(11) -> (2, 3, 3)
+// f(12) -> (3, 0, 3)
+// f(13) -> (3, 1, 4)
+//
+// POINTING OUTSIDE OF INPUT ARRAY.
+//
+// When there is padding, the input data pointer passed to the fill routines
+// points outside of the input array and into a kind-of virtual padded
+// margin. It turns out that this simplifies the code and removes
+// conditional statements. It is hard to explain why without comparing two
+// versions of the code. In summary, this way the adjustment into the margin
+// can be made unconditionally, and the correction back into the input array
+// is done where there is a conditional already.
+//
+// OVERLAP
+//
+// Since this is *depthwise* conv, neither the batch nor the depth have overlap.
+// The height and depth overlap by (filter_size - 1). Thus some data is used
+// twice on the borders of macro blocks.
+//
+template <DepthwiseConvImplementation implementation>
+inline void DepthwiseConvDotProduct3x3(
+    const DepthwiseParams& params, const RuntimeShape& input_shape,
+    const uint8* input_data, const RuntimeShape& filter_shape,
+    const uint8* filter_data, const RuntimeShape& bias_shape,
+    const int32* bias_data, const RuntimeShape& output_shape,
+    uint8* output_data) {
+  // Check kernel restrictions.
+  constexpr int filter_size = 3;
+  constexpr int kSymmetricZeroPoint = 128;
+  constexpr int kMaxStride = 2;
+  constexpr int kMaxPadding = 1;
+  TFLITE_DCHECK_EQ(params.weights_offset, -kSymmetricZeroPoint);
+  TFLITE_DCHECK_LE(params.stride_width, kMaxStride);
+  TFLITE_DCHECK_EQ(params.stride_height, params.stride_width);
+  TFLITE_DCHECK_EQ(params.dilation_width_factor, 1);
+  TFLITE_DCHECK_EQ(params.dilation_height_factor, 1);
+  TFLITE_DCHECK_LE(params.padding_values.width, kMaxPadding);
+  TFLITE_DCHECK_LE(params.padding_values.height, kMaxPadding);
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+
+  // Key kernel parameters (along with padding handled later).
+  const int stride = params.stride_width;
+  const int depth_multiplier = params.depth_multiplier;
+  const bool has_depth_multiplication = depth_multiplier > 1;
+
+  // Extract task dimensions.
+  const int input_depth = input_shape.Dims(3);
+  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  TFLITE_DCHECK(!has_depth_multiplication || input_depth == 1);
+  TFLITE_DCHECK(has_depth_multiplication || input_depth == output_depth);
+  TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+  TFLITE_DCHECK_EQ(input_depth * depth_multiplier, output_depth);
+  TFLITE_DCHECK_EQ(MatchingDim(filter_shape, 1, filter_shape, 2), filter_size);
+
+  // Return now if nothing to do.
+  if (output_width == 0 || output_height == 0) {
+    return;
+  }
+
+  // Kernel parameter structure: set basic fields.
+  //
+  // In asm it is easier to pass a structure than more than, say, 8 parameters.
+  DepthwiseConvDotProdParams function_params;
+  function_params.input_depth = input_depth;
+  function_params.output_depth = output_depth;
+  function_params.input_offset = params.input_offset;
+  function_params.output_offset = params.output_offset;
+  function_params.output_multiplier = params.output_multiplier;
+  function_params.output_shift = params.output_shift;
+  function_params.quantized_activation_min = params.quantized_activation_min;
+  function_params.quantized_activation_max = params.quantized_activation_max;
+  function_params.stride = stride;
+
+  // Handle inbound bias data.
+  //
+  // Note that this data is adjusted in a per-depth process before the main
+  // filters. The adjustment accounts for a non-symmetric input offset.
+  //
+  // Kernel subroutines need to be able to operate consistently on an bias
+  // array. Where there is no bias, we provide one filled with zeros.
+  constexpr int kMinBiasLoad = 8;
+  int32 zero_bias_data[kMinBiasLoad];
+  if (bias_data) {
+    function_params.bias_increment = 4;
+  } else {
+    memset(zero_bias_data, 0, sizeof(zero_bias_data));
+    bias_data = &zero_bias_data[0];
+    function_params.bias_increment = 0;
+  }
+  TFLITE_DCHECK_LE(2 * function_params.bias_increment, kMinBiasLoad);
+
+  // Process padding.
+  //
+  // Whether "correct" or not, this matches ComputeConvSizes. When there is
+  // stride > 1 there can be padding on the bottom or top, and therefore
+  // we need to consider padding. This is true even if one or other of the
+  // padding_values is 0.
+  const int padded_width = (output_width - 1) * stride + filter_size;
+  {
+    const int padding_left = params.padding_values.width;
+    // Right padding would be -1 if discarding input because of stride.
+    const int padding_right =
+        std::max(padded_width - input_width - padding_left, 0);
+    const int padding_top = params.padding_values.height;
+    const int padded_height = (output_height - 1) * stride + filter_size;
+    const int padding_bottom =
+        std::max(padded_height - input_height - padding_top, 0);
+
+    function_params.padding_left = padding_left;
+    function_params.padding_right = padding_right;
+    function_params.padding_top = padding_top;
+    function_params.padding_bottom = padding_bottom;
+
+    TFLITE_DCHECK_LE(padding_left, padding_right);
+    TFLITE_DCHECK_LE(padding_top, padding_bottom);
+  }
+  // When stride == 1 left or top padding may only be non-zero.
+  // This is when padding is specified but not needed on a trailing dimension.
+  // When stride == 2 right or bottom padding may only be non-zero.
+  // This is a result of the details of the padding calculations.
+  const bool padding_required =
+      params.padding_type == tflite::PaddingType::kSame ||
+      function_params.padding_right > 0 || function_params.padding_bottom > 0;
+
+  // Choose parameter-specific kernel subroutines.
+  //
+  // The main part of the kernel has two stages. First, a temporary workspace is
+  // filled with padded and permuted data. Second, the filter is applied to the
+  // workspace data to generate output.
+  //
+  // The workspace fill stage handles padding so that the filter stage does not
+  // need to account for it. The workspace fill stage does not need to
+  // understand striding, and implicitly handles striding through the parameters
+  // that it is given.
+  using pack_macro_block_func_t = decltype(
+      &PackMacroBlock<implementation,
+                      DepthwiseConvDepthMultiplication::kNoMultiplication,
+                      0>::Run);
+  using kernel_macro_block_func_t = decltype(
+      &KernelMacroBlock<implementation,
+                        DepthwiseConvDepthMultiplication::kNoMultiplication,
+                        1>::Run);
+  pack_macro_block_func_t pack_macro_block_func;
+  kernel_macro_block_func_t kernel_macro_block_func;
+  {
+    if (has_depth_multiplication) {
+      if (padding_required) {
+        pack_macro_block_func =
+            PackMacroBlock<implementation,
+                           DepthwiseConvDepthMultiplication::kUnitInputDepth,
+                           /*max_padding=*/1>::Run;
+      } else {
+        pack_macro_block_func =
+            PackMacroBlock<implementation,
+                           DepthwiseConvDepthMultiplication::kUnitInputDepth,
+                           /*max_padding=*/0>::Run;
+      }
+      if (stride == 1) {
+        kernel_macro_block_func =
+            KernelMacroBlock<implementation,
+                             DepthwiseConvDepthMultiplication::kUnitInputDepth,
+                             /*stride=*/1>::Run;
+      } else {
+        kernel_macro_block_func =
+            KernelMacroBlock<implementation,
+                             DepthwiseConvDepthMultiplication::kUnitInputDepth,
+                             /*stride=*/2>::Run;
+      }
+    } else {
+      if (padding_required) {
+        pack_macro_block_func =
+            PackMacroBlock<implementation,
+                           DepthwiseConvDepthMultiplication::kNoMultiplication,
+                           /*max_padding=*/1>::Run;
+      } else {
+        pack_macro_block_func =
+            PackMacroBlock<implementation,
+                           DepthwiseConvDepthMultiplication::kNoMultiplication,
+                           /*max_padding=*/0>::Run;
+      }
+      if (stride == 1) {
+        kernel_macro_block_func = KernelMacroBlock<
+            implementation, DepthwiseConvDepthMultiplication::kNoMultiplication,
+            /*stride=*/1>::Run;
+      } else {
+        kernel_macro_block_func = KernelMacroBlock<
+            implementation, DepthwiseConvDepthMultiplication::kNoMultiplication,
+            /*stride=*/2>::Run;
+      }
+    }
+  }
+
+  // Stride-only variables.
+  //
+  // stride == 1 ? 4 : 2:
+  const int output_height_per_macro = 6 - 2 * stride;
+  // output_height_per_macro * stride:
+  constexpr int input_height_per_macro = 4;
+  // Number of rows per micro block (= rows per macro block) is
+  //   (output_height_per_macro - 1) * stride + 1 + (filter_size - 1)
+  //   = stride == 1 ? 3 + filter_size : 2 + filter_size:
+  const int height_block_size = 4 + filter_size - stride;
+  const int input_height_overlap = filter_size - stride;
+  // stride == 1 ? 4 : 2:
+  function_params.four_over_stride = output_height_per_macro;
+
+  TFLITE_DCHECK_EQ(stride * function_params.four_over_stride, 4);
+  TFLITE_DCHECK_EQ(height_block_size,
+                   input_height_per_macro + input_height_overlap);
+
+  // Create workspaces.
+  //
+  // Filter workspace is for shuffle: only first depth/8 is used.
+  // indexed as [depth/8][sub-block][height][depth][width].
+  TFLITE_DCHECK_LE(output_depth, kDepthwiseConvAdjustedBiasLimit);
+  TFLITE_DCHECK_EQ(kDepthwiseConvAdjustedBiasLimit % 8, 0);
+  int8 macroblock_workspace[kDepthwiseConvScratchWorkspaceSize];
+  int32 adjusted_bias_data[kDepthwiseConvAdjustedBiasLimit];
+  int8 filter_workspace[kDepthwiseConvAdjustedBiasLimit >> 3][3][2][4][4];
+
+  // Output depth characterization.
+  //
+  const int depth_macro_count = output_depth / 64;
+  const int depth_overall_macro_count = (output_depth + 63) / 64;
+  // Number of micro blocks down the depth in a final incomplete macro block.
+  const int depth_trailing_micro_repeats = output_depth / 8 % 8;
+  // The output_depth may not have a remainder: it must be a multiple of 8.
+  TFLITE_DCHECK_EQ(output_depth,
+                   64 * depth_macro_count + 8 * depth_trailing_micro_repeats);
+
+  // Characterize the first macro block depth, the largest.
+  //
+  // We base treatment of the width on the trailing macro block if there are
+  // no full blocks, in order to do more work together (that is, increase
+  // workspace_width_micro_repeats when largest_macro_depth < 64).
+  const int largest_macro_depth =
+      has_depth_multiplication
+          ? 1
+          : (depth_macro_count > 0 ? 64 : 8 * depth_trailing_micro_repeats);
+
+  // Characterize width, consumption of input and generation of output.
+  //
+  // In the case of depth multiplication, we ensure that some of the workspace
+  // at the end remains unused. This enables the filter routines to load the
+  // "next" data, of at least 16 bytes, even when at the end of the workspace.
+  // It is relatively expensive to detect the end micro block. It is also very
+  // difficult to test for (to trigger) erroneous reads (past end of array) in
+  // the depth multplication case.
+  int workspace_width_micro_repeats =
+      (has_depth_multiplication
+           ? kDepthwiseConvScratchWorkspaceSize - kWorkspaceExtension
+           : kDepthwiseConvScratchWorkspaceSize) /
+      (4 * largest_macro_depth * height_block_size);
+  // When there is no depth multiplication, the workspace depth is a multiple of
+  // 8, which ensures that workspace rows are 16-byte aligned. (Actually 32,
+  // because of the micro width of 4.) This is not necessarily the case under
+  // depth multiplication, so we adjust now to impose this restriction.
+  if (has_depth_multiplication) {
+    workspace_width_micro_repeats = (workspace_width_micro_repeats / 4) * 4;
+  }
+  TFLITE_DCHECK_EQ((workspace_width_micro_repeats * largest_macro_depth) % 4,
+                   0);
+  // Discount 1 of the micro-block repeats in each macro block to account for
+  // overlap.
+  const int consumed_width_per_macro_block =
+      4 * (workspace_width_micro_repeats - 1);
+  const int output_width_per_macro_block =
+      function_params.four_over_stride * (workspace_width_micro_repeats - 1);
+  TFLITE_DCHECK_GT(workspace_width_micro_repeats, 1);
+  TFLITE_DCHECK_EQ(output_width_per_macro_block * stride,
+                   consumed_width_per_macro_block);
+
+  // Width repetitions and residuals.
+  //
+  // Use of the workspace is characterized primarily in terms of *padded input*.
+  // Striding only matters in a few places.
+  //
+  // Simplifications: We require that there always be at least one full
+  // micro-block across the width. Since the maximum padding is 1, the trailing
+  // padding cannot span two micro blocks.
+  const int residual_micro_width = padded_width % 4;
+  // We base the count of macro blocks on the amount of padded input data each
+  // one consumes.
+  int width_overall_macro_count = (padded_width - residual_micro_width +
+                                   consumed_width_per_macro_block - 1) /
+                                  consumed_width_per_macro_block;
+  // Recall that we left a micro block at the end of each macro block for use as
+  // overlap. There is a special case in which we can use one fewer macro
+  // blocks, with the last one consuming extra input. (But not if the
+  // calculation thinks that we can use zero blocks.)
+  if (padded_width <=
+      ((width_overall_macro_count - 1) * consumed_width_per_macro_block + 4)) {
+    width_overall_macro_count -= 1;
+  }
+  width_overall_macro_count = std::max(width_overall_macro_count, 1);
+  // We always have to treat the final macro block along width as trailing,
+  // because even if it is full in terms of padded input, it will be incomplete
+  // in terms of output.
+  const int width_macro_count = width_overall_macro_count - 1;
+  // Micro blocks are traversed in terms of input in fill routines.
+  const int width_trailing_micro_repeats =
+      (padded_width - consumed_width_per_macro_block * width_macro_count) / 4;
+  const int width_overall_trailing_micro_repeats =
+      (padded_width - consumed_width_per_macro_block * width_macro_count + 3) /
+      4;
+  // Micro blocks are traversed in terms of output in filtering routines.
+  const int residual_output_micro_width =
+      (output_width - 1) % function_params.four_over_stride + 1;
+  const int output_width_trailing_micro_repeats =
+      residual_micro_width > (filter_size - 1)
+          ? width_trailing_micro_repeats
+          : width_trailing_micro_repeats - 1;
+  // Check results.
+  TFLITE_DCHECK_GT(width_overall_trailing_micro_repeats, 0);
+  TFLITE_DCHECK_EQ(padded_width,
+                   residual_micro_width +
+                       consumed_width_per_macro_block * width_macro_count +
+                       4 * width_trailing_micro_repeats);
+  TFLITE_DCHECK_LE(width_overall_macro_count, width_macro_count + 1);
+  TFLITE_DCHECK_GE(width_overall_macro_count, width_macro_count);
+
+  // Height repetitions and residuals.
+  //
+  const int height_macro_count = output_height / output_height_per_macro;
+  const int residual_output_height = output_height % output_height_per_macro;
+  const int height_overall_macro_count =
+      (output_height + output_height_per_macro - 1) / output_height_per_macro;
+  TFLITE_DCHECK_EQ(
+      output_height,
+      residual_output_height + output_height_per_macro * height_macro_count);
+  TFLITE_DCHECK_LE(height_overall_macro_count, height_macro_count + 1);
+  TFLITE_DCHECK_GE(height_overall_macro_count, height_macro_count);
+
+  // Data strides.
+  //
+  const int input_height_stride = input_width * input_depth;
+  const int output_height_stride = output_width * output_depth;
+  const int input_batch_stride = input_height_stride * input_height;
+  const int output_batch_stride = output_height_stride * output_height;
+  const int input_depth_macro_stride = has_depth_multiplication ? 0 : 64;
+  const int input_width_macro_stride =
+      input_depth * consumed_width_per_macro_block;
+  const int output_width_macro_stride =
+      output_depth * output_width_per_macro_block;
+
+  // Store parameters that do not vary across macro blocks.
+  //
+  function_params.workspace_width_micro_repeats = workspace_width_micro_repeats;
+  function_params.height_macro_count = height_overall_macro_count;
+  function_params.width_macro_count = width_overall_macro_count;
+  function_params.input_height_stride = input_height_stride;
+  function_params.output_height_stride = output_height_stride;
+  function_params.residual_width = residual_micro_width;
+
+  // Preprocess filter and bias data.
+  //
+  ProcessPerDepth<implementation>::Run(filter_data, bias_data,
+                                       filter_workspace[0][0][0][0],
+                                       adjusted_bias_data, &function_params);
+  function_params.bias_increment = 4;  // Adjusted bias data always spans depth.
+
+  // Main process.
+  //
+  // Most kernels are nested batch-height-width-depth. Here we proceed over
+  // macro blocks batch-width-depth-height.
+  //
+  // Example of handling of trailing iteration: when there is trailing depth,
+  // depth_overall_macro_count = depth_macro_count + 1, so we can adjust the
+  // dimensions for trailing macro blocks by looking for
+  // j_depth == depth_macro_count.
+  for (int b = 0; b < batches; ++b) {
+    for (int k_width = 0; k_width < width_overall_macro_count; ++k_width) {
+      // Figure out the work to be done for this macro block. If it trails in
+      // any dimension, the work in that dimension is adjusted.
+      // The work to be done across widths has 3 cases:
+      // (a) A full macro block,
+      // (b) Partial terminal macro block, with input and output ending in
+      //     same micro block, and
+      // (c) Partial terminal macro block, with output corresponding to one
+      //     fewer micro blocks, because filter extends across micro-block
+      //     boundary.
+      if (k_width != width_macro_count) {
+        function_params.output_residual_width = 0;
+        function_params.input_width_micro_repeats =
+            workspace_width_micro_repeats;
+        function_params.input_width_overall_micro_repeats =
+            workspace_width_micro_repeats;
+        function_params.output_width_micro_repeats =
+            workspace_width_micro_repeats - 1;
+      } else {
+        function_params.output_residual_width = residual_output_micro_width;
+        function_params.input_width_micro_repeats =
+            width_trailing_micro_repeats;
+        function_params.input_width_overall_micro_repeats =
+            width_overall_trailing_micro_repeats;
+        function_params.output_width_micro_repeats =
+            output_width_trailing_micro_repeats;
+      }
+      function_params.output_width_overall_micro_repeats =
+          function_params.output_residual_width == 0
+              ? function_params.output_width_micro_repeats
+              : function_params.output_width_micro_repeats + 1;
+
+      for (int j_depth = 0; j_depth < depth_overall_macro_count; ++j_depth) {
+        const uint8* input_data_block =
+            input_data + b * input_batch_stride +
+            j_depth * input_depth_macro_stride +
+            k_width * input_width_macro_stride -
+            function_params.padding_left * input_depth -
+            function_params.padding_top * input_height_stride;
+        uint8* output_data_block = output_data + b * output_batch_stride +
+                                   j_depth * 64 +
+                                   k_width * output_width_macro_stride;
+
+        function_params.depth_micro_repeats =
+            j_depth == depth_macro_count ? depth_trailing_micro_repeats : 8;
+        // Under depth multiplication the workspace_height_stride does not have
+        // to depend on input_width_overall_micro_repeats, but this improves the
+        // compactness of workspace use.
+        const int workspace_height_stride =
+            has_depth_multiplication
+                ? 16 * ((function_params.input_width_overall_micro_repeats +
+                         3) >>
+                        2)
+                : 4 * function_params.input_width_overall_micro_repeats * 8 *
+                      function_params.depth_micro_repeats;
+        TFLITE_DCHECK_EQ(workspace_height_stride % 16, 0);
+        function_params.workspace_height_stride = workspace_height_stride;
+
+        // For the first macro block for output rows we fill in the first few
+        // rows.  After this we will copy them (see below in loop.)
+        function_params.inbound_block_height = input_height_overlap;
+        pack_macro_block_func(-1, k_width, input_data_block,
+                              macroblock_workspace, &function_params);
+        input_data_block += input_height_stride * input_height_overlap;
+
+        for (int i_height = 0; i_height < height_overall_macro_count;
+             ++i_height) {
+          if (i_height != height_macro_count) {
+            function_params.inbound_block_height = input_height_per_macro;
+            function_params.outbound_block_height = output_height_per_macro;
+          } else {
+            function_params.inbound_block_height =
+                residual_output_height * stride;
+            function_params.outbound_block_height = residual_output_height;
+          }
+          TFLITE_DCHECK_LT(i_height * output_height_per_macro, output_height);
+          TFLITE_DCHECK_LT(i_height * input_height_per_macro, input_height);
+          TFLITE_DCHECK_LT(k_width * output_width_per_macro_block,
+                           output_width);
+          TFLITE_DCHECK_LT(k_width * consumed_width_per_macro_block,
+                           input_width);
+
+          // Macro blocks overlap by input_height_overlap rows, so we copy
+          // those instead of filling in afresh.  The first macro block across
+          // output rows was filled in outside of the loop (above).
+          if (i_height > 0) {
+            memcpy(macroblock_workspace,
+                   macroblock_workspace +
+                       input_height_per_macro * workspace_height_stride,
+                   input_height_overlap * workspace_height_stride);
+          }
+
+          pack_macro_block_func(
+              i_height, k_width, input_data_block,
+              macroblock_workspace +
+                  input_height_overlap * workspace_height_stride,
+              &function_params);
+
+          kernel_macro_block_func(macroblock_workspace,
+                                  filter_workspace[8 * j_depth][0][0][0],
+                                  adjusted_bias_data + 64 * j_depth,
+                                  output_data_block, &function_params);
+
+          input_data_block += input_height_stride * input_height_per_macro;
+          output_data_block += output_height_stride * output_height_per_macro;
+        }
+      }
+    }
+  }
+}
+
+}  // namespace depthwise_conv
+}  // namespace optimized_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_TRANSITIONAL_H_
diff --git a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc
index cf40ebb241d013a4853854f57fd55ebbce8a1752..a69a547cb9f15268d60919f4b4cb718e832d08bd 100644
--- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc
+++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc
@@ -90,20 +90,28 @@ void NeonMatrixBatchVectorMultiplyAccumulate(
     int n_batch, float* __restrict__ result, int result_stride) {
   const int kWeightsPerUint32 = 4;
   const int kWeightsPerNeonLane = 16;
-  // If the number of rows is not divisible by kWeightsPerUint32, we set a
-  // flag and allocate an aligned memory block. The flag is used to use the
-  // aligned memory block later in the kernel loop.
+  // Assuming *matrix is kWeightsPerUint32-byte aligned,
+  // every row of the matrix is also
+  // kWeightsPerUint32-byte aligned as long as cols is
+  // a multiple of kWeightsPerUint32. The assumption
+  // is currently satisfied by TFLite's 16-byte memory
+  // alignment scheme.
+  //
+  // Otherwise, we allocate an aligned memory block and set
+  // a flag to later copy rows from matrix to the block
+  // for aligned multiplication.
   bool unaligned = false;
-  int8* aligned_row = nullptr;
+  int8_t* aligned_row = nullptr;
   void* aligned_row_free = nullptr;
   if ((m_cols & (kWeightsPerUint32 - 1)) != 0) {
     unaligned = true;
-    aligned_row = (int8*)aligned_alloc(kWeightsPerUint32, m_cols,  // NOLINT
-                                       &aligned_row_free);
+    aligned_row = (int8_t*)aligned_alloc(kWeightsPerUint32, m_cols,  // NOLINT
+                                         &aligned_row_free);
   }
   void* aligned_vec_free = nullptr;
-  int8* aligned_vec = (int8*)aligned_alloc(kWeightsPerUint32, m_cols,  // NOLINT
-                                           &aligned_vec_free);
+  int8_t* aligned_vec =
+      (int8_t*)aligned_alloc(kWeightsPerUint32, m_cols,  // NOLINT
+                             &aligned_vec_free);
 
   // If m_cols is not at least kWeightsPerNeonLane, we cannot use the main
   // vectorized loop, and we need to process sequentially. postamble_start shows
@@ -114,13 +122,13 @@ void NeonMatrixBatchVectorMultiplyAccumulate(
   for (batch = 0; batch < n_batch; ++batch) {
     const float batch_scaling_factor = scaling_factors[batch];
     // Copy the vector data to an aligned vector.
-    memcpy(aligned_vec, vectors + batch * m_cols, sizeof(int8) * m_cols);
+    memcpy(aligned_vec, vectors + batch * m_cols, sizeof(int8_t) * m_cols);
     // Compute dot-product for every column.
     for (row = 0; row < m_rows; ++row, result += result_stride) {
       // Get the address of the first element of the row.
-      int8* row_ptr = (int8*)matrix + row * m_cols;  // NOLINT
+      int8_t* row_ptr = (int8_t*)matrix + row * m_cols;  // NOLINT
       if (unaligned) {
-        memcpy(aligned_row, row_ptr, sizeof(int8) * m_cols);
+        memcpy(aligned_row, row_ptr, sizeof(int8_t) * m_cols);
         row_ptr = aligned_row;
       }
 
@@ -135,16 +143,17 @@ void NeonMatrixBatchVectorMultiplyAccumulate(
       col = 0;
       for (; col < postamble_start; col += kWeightsPerNeonLane) {
         // Load 16 8-bit values from the row and vector, each, to operate on.
-        // Here the assumption is that each buffer is 4-byte aligned.
-        TFLITE_CHECK_EQ((uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1),
-                        0);
+        // Here the assumption is that each buffer is 4-byte aligned. Otherwise,
+        // performance may suffer significantly.
+        TFLITE_DCHECK_EQ(  // NOLINT
+            (uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1), 0);
         const int8x16_t s1_8x16 = vld1q_s8((const int8_t*)(aligned_vec + col));
         const int8x16_t s2_8x16 = vld1q_s8((const int8_t*)(row_ptr + col));
         // Multiply the low bits (i.e. the lower 8 8bit numbers in the
         // registers).
         int16x8_t prod_16x8 =
             vmull_s8(vget_low_s8(s1_8x16), vget_low_s8(s2_8x16));
-        // Multiply the high bits (i.e. the lower 8 8bit numbers in the
+        // Multiply the high bits (i.e. the higher 8 8bit numbers in the
         // registers), and accumulate with the result of the low bits product.
         // The assumption here is that overflow will not happen as we quantize
         // our values to be in the range [-127, 127]. As such the sum of the 2
@@ -164,8 +173,9 @@ void NeonMatrixBatchVectorMultiplyAccumulate(
         if ((m_cols - postamble_start) >= (kWeightsPerNeonLane >> 1)) {
           // Load 8 8-bit values from the row and column each to operate on.
           // Here the assumption is that each buffer is 4-bytes aligned.
-          TFLITE_CHECK_EQ((uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1),
-                          0);
+          // Otherwise, performance may suffer significantly.
+          TFLITE_DCHECK_EQ(  // NOLINT
+              (uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1), 0);
           const int8x8_t s1_8x8 = vld1_s8((const int8_t*)(aligned_vec + col));
           const int8x8_t s2_8x8 = vld1_s8((const int8_t*)(row_ptr + col));
           const int16x8_t prod_16x8 = vmull_s8(s1_8x8, s2_8x8);
@@ -192,6 +202,118 @@ void NeonMatrixBatchVectorMultiplyAccumulate(
   free(aligned_vec_free);
 }
 
+void NeonSparseMatrixBatchVectorMultiplyAccumulate(
+    const float* matrix, const uint8_t* ledger, int m_rows, int m_cols,
+    const float* vector, int n_batch, float* result, int result_stride) {
+  const int kBlockSize = 16;
+  const int kNeonLanesPerBlock = 4;
+  TFLITE_DCHECK_EQ(  // NOLINT
+      m_cols % kBlockSize, 0);
+
+  float* result_in_batch = result;
+  for (int b = 0; b < n_batch; b++) {
+    const float* matrix_ptr = matrix;
+    const uint8_t* ledger_ptr = ledger;
+    for (int r = 0; r < m_rows; r++) {
+      int num_nonzero_blocks = *ledger_ptr++;
+      if (num_nonzero_blocks > 0) {
+        float32x4_t acc_32x4 = vmovq_n_f32(0.0);
+        const float* vector_in_batch = vector + b * m_cols;
+
+        for (int i = 0; i < num_nonzero_blocks; i++) {
+          const int block_start_index = *ledger_ptr++ * kBlockSize;
+          const float* vector_block_in_batch_ptr =
+              vector_in_batch + block_start_index;
+
+          for (int c = 0; c < kNeonLanesPerBlock; c++) {
+            // Load 4 float values from the vector and matrix row.
+            float32x4_t vector_f32x4 = vld1q_f32(vector_block_in_batch_ptr +
+                                                 c * kFloatWeightsPerNeonLane);
+            float32x4_t matrix_f32x4 =
+                vld1q_f32(matrix_ptr + c * kFloatWeightsPerNeonLane);
+            // Multiply the vector and matrix row and add to accumulator.
+            acc_32x4 = vmlaq_f32(acc_32x4, matrix_f32x4, vector_f32x4);
+          }
+          matrix_ptr += kBlockSize;
+        }
+        *result_in_batch +=
+            (vgetq_lane_f32(acc_32x4, 0) + vgetq_lane_f32(acc_32x4, 1) +
+             vgetq_lane_f32(acc_32x4, 2) + vgetq_lane_f32(acc_32x4, 3));
+      }
+      result_in_batch += result_stride;
+    }
+  }
+}
+
+void NeonSparseMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
+    const int m_cols, const int8_t* __restrict__ vectors,
+    const float* scaling_factors, int n_batch, float* __restrict__ result,
+    int result_stride) {
+  const int kWeightsPerUint32 = 4;
+  const int kWeightsPerNeonLane = 16;
+  const int kBlockSize = kWeightsPerNeonLane;
+  TFLITE_DCHECK_EQ(  // NOLINT
+      m_cols % kBlockSize, 0);
+  void* aligned_vec_free = nullptr;
+  int8_t* aligned_vec =
+      (int8_t*)aligned_alloc(kWeightsPerUint32, m_cols,  // NOLINT
+                             &aligned_vec_free);
+
+  int batch, row;
+  for (batch = 0; batch < n_batch; ++batch) {
+    const float batch_scaling_factor = scaling_factors[batch];
+    // Copy the vector data to an aligned vector.
+    memcpy(aligned_vec, vectors + batch * m_cols, sizeof(int8) * m_cols);
+
+    const uint8_t* ledger_ptr = ledger;
+    const int8_t* row_ptr = matrix;
+    for (row = 0; row < m_rows; ++row, result += result_stride) {
+      // Initialize the dot product sum for the row to 0.
+      int32x4_t dotprod = vmovq_n_s32(0);
+      int num_nonzero_blocks = *ledger_ptr++;
+      if (num_nonzero_blocks > 0) {
+        // Prefetch the row to cache.
+        __builtin_prefetch(row_ptr, 0 /* prefetch for read */,
+                           3 /* temporal locality */);
+        for (int i = 0; i < num_nonzero_blocks; i++) {
+          const int col_index = *ledger_ptr++ * kBlockSize;
+          // Load 16 8-bit values from the row and vector, each, to operate on.
+          // Here the assumption is that each buffer is 4-byte aligned.
+          // Otherwise, performance may suffer significantly.
+          TFLITE_DCHECK_EQ(  // NOLINT
+              (uintptr_t)(&row_ptr) & (kWeightsPerUint32 - 1), 0);
+          const int8x16_t s1_8x16 =
+              vld1q_s8((const int8_t*)(aligned_vec + col_index));
+          const int8x16_t s2_8x16 = vld1q_s8((const int8_t*)(row_ptr));
+          // Multiply the low bits (i.e. the lower 8 8bit numbers in the
+          // registers).
+          int16x8_t prod_16x8 =
+              vmull_s8(vget_low_s8(s1_8x16), vget_low_s8(s2_8x16));
+          // Multiply the high bits (i.e. the lower 8 8bit numbers in the
+          // registers), and accumulate with the result of the low bits product.
+          // The assumption here is that overflow will not happen as we quantize
+          // our values to be in the range [-127, 127]. As such the sum of the 2
+          // products is always strictly smaller than 15-bits (32767 in absolute
+          // value).
+          prod_16x8 =
+              vmlal_s8(prod_16x8, vget_high_s8(s1_8x16), vget_high_s8(s2_8x16));
+
+          dotprod = vpadalq_s16(dotprod, prod_16x8);
+          row_ptr += kBlockSize;
+        }
+        // Add the 4 intermediate sum values to get the final dot-prod value for
+        // this row.
+        int64x2_t pairwiseAdded = vpaddlq_s32(dotprod);
+        int32 neon_sum =
+            vgetq_lane_s64(pairwiseAdded, 0) + vgetq_lane_s64(pairwiseAdded, 1);
+        *result += neon_sum * batch_scaling_factor;
+      }
+    }  // for row
+  }    // for batch
+  free(aligned_vec_free);
+}
+
 void NeonVectorVectorCwiseProduct(const float* vector1, const float* vector2,
                                   int v_size, float* result) {
   // If v_size is not divisible by kWeightsPerNeonLane, we cannot use the main
diff --git a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h
index 903f4c80139cd326b354ef6292a393c75af11608..a86457dba745dbe94ce3e1dc718012545f258804 100644
--- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h
+++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h
@@ -40,6 +40,24 @@ void MatrixBatchVectorMultiplyAccumulate(
                    vectors, scaling_factors, n_batch, result, result_stride);
 }
 
+void SparseMatrixBatchVectorMultiplyAccumulate(
+    const float* matrix, const uint8_t* ledger, const int m_rows,
+    const int m_cols, const float* vector, int n_batch, float* result,
+    int result_stride) {
+  NeonSparseMatrixBatchVectorMultiplyAccumulate(
+      matrix, ledger, m_rows, m_cols, vector, n_batch, result, result_stride);
+}
+
+void SparseMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
+    const int m_cols, const int8_t* __restrict__ vectors,
+    const float* scaling_factors, int n_batch, float* __restrict__ result,
+    int result_stride) {
+  NeonSparseMatrixBatchVectorMultiplyAccumulate(matrix, ledger, m_rows, m_cols,
+                                                vectors, scaling_factors,
+                                                n_batch, result, result_stride);
+}
+
 void VectorVectorCwiseProduct(const float* vector1, const float* vector2,
                               int v_size, float* result) {
   NEON_OR_PORTABLE(VectorVectorCwiseProduct, vector1, vector2, v_size, result);
diff --git a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
index c79b69a22e4dcdac5c32d03c0edd9f3cfb09a0ae..6e8390ce5509c836a5dfd4859fc6495301e77a8e 100644
--- a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
@@ -63,6 +63,7 @@ using reference_ops::ConcatenationWithScaling;
 using reference_ops::DepthConcatenation;
 using reference_ops::Dequantize;
 using reference_ops::Div;
+using reference_ops::Elu;
 using reference_ops::FakeQuant;
 using reference_ops::Fill;
 using reference_ops::Gather;
@@ -85,6 +86,7 @@ using reference_ops::Select;
 using reference_ops::SpaceToBatchND;
 using reference_ops::Split;
 using reference_ops::StridedSlice;
+using reference_ops::Sub16;
 using reference_ops::Transpose;
 
 // TODO(b/80247582) Remove this constant.
@@ -181,45 +183,6 @@ MatrixMap<Scalar> MapAsMatrixWithGivenNumberOfRows(Scalar* data,
   return MatrixMap<Scalar>(data, rows, cols);
 }
 
-// This is like the template-parameter version, except that the power-of-two is
-// passed as a function parameter. The template version is to be preferred,
-// since some target hardware optimizations depend on the range of the exponent.
-template <typename IntegerType>
-IntegerType SaturatingRoundingMultiplyByPOTParam(IntegerType x, int exponent) {
-  if (exponent == 0) {
-    return x;
-  }
-  using ScalarIntegerType =
-      typename gemmlowp::FixedPointRawTypeTraits<IntegerType>::ScalarRawType;
-  const IntegerType min =
-      gemmlowp::Dup<IntegerType>(std::numeric_limits<ScalarIntegerType>::min());
-  const IntegerType max =
-      gemmlowp::Dup<IntegerType>(std::numeric_limits<ScalarIntegerType>::max());
-  const int ScalarIntegerTypeBits = 8 * sizeof(ScalarIntegerType);
-
-  const std::int32_t threshold =
-      ((1 << (ScalarIntegerTypeBits - 1 - exponent)) - 1);
-  const IntegerType positive_mask =
-      gemmlowp::MaskIfGreaterThan(x, gemmlowp::Dup<IntegerType>(threshold));
-  const IntegerType negative_mask =
-      gemmlowp::MaskIfLessThan(x, gemmlowp::Dup<IntegerType>(-threshold));
-
-  IntegerType result = gemmlowp::ShiftLeft(x, exponent);
-  result = gemmlowp::SelectUsingMask(positive_mask, max, result);
-  result = gemmlowp::SelectUsingMask(negative_mask, min, result);
-  return result;
-}
-
-// This is like the template-parameter version, except that the power-of-two is
-// passed as a function parameter. See raw-integer version for further comments.
-template <typename tRawType, int tIntegerBits>
-gemmlowp::FixedPoint<tRawType, tIntegerBits>
-SaturatingRoundingMultiplyByPOTParam(
-    gemmlowp::FixedPoint<tRawType, tIntegerBits> a, int exponent) {
-  return gemmlowp::FixedPoint<tRawType, tIntegerBits>::FromRaw(
-      SaturatingRoundingMultiplyByPOTParam(a.raw(), exponent));
-}
-
 inline void AddBiasAndEvalActivationFunction(float output_activation_min,
                                              float output_activation_max,
                                              const RuntimeShape& bias_shape,
@@ -840,24 +803,21 @@ inline void FullyConnected(
 }
 
 #ifdef USE_NEON
-inline void FullyConnectedAsGEMV(
+inline void FullyConnectedAsGEMVWorkerImpl(
     const RuntimeShape& input_shape, const uint8* input_data,
     int32 input_offset, const RuntimeShape& filter_shape,
     const uint8* filter_data, int32 filter_offset,
     const RuntimeShape& bias_shape, const int32* bias_data, int32 output_offset,
     int32 output_multiplier, int output_shift, int32 output_activation_min,
     int32 output_activation_max, const RuntimeShape& output_shape,
-    uint8* output_data) {
+    uint8* output_data, int row_start, int row_end) {
   gemmlowp::ScopedProfilingLabel label("FullyConnectedAsGEMV/8bit");
   TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1);
   TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
   TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
   const int output_dim_count = output_shape.DimensionsCount();
-  const int filter_dim_count = filter_shape.DimensionsCount();
   TFLITE_DCHECK_EQ(FlatSizeSkipDim(output_shape, output_dim_count - 1), 1);
   const int input_size = FlatSizeSkipDim(input_shape, 0);
-  const int output_size = MatchingDim(filter_shape, filter_dim_count - 2,
-                                      output_shape, output_dim_count - 1);
   static constexpr int kPeel = 4;
   const bool shift_left = (output_shift > 0);
   for (int k = 0; k < input_size; k += 64) {
@@ -866,81 +826,139 @@ inline void FullyConnectedAsGEMV(
   for (int k = 0; k < kPeel * input_size; k += 64) {
     optimized_ops_preload_l1_stream(filter_data + k);
   }
-  TFLITE_DCHECK(!(output_size % kPeel));
-  const int32* bias_ptr = bias_data;
-  uint8* output_ptr = output_data;
-  for (int out = 0; out < output_size; out += kPeel) {
-    int32x4_t acc[kPeel];
-    for (int k = 0; k < kPeel; k++) {
-      acc[k] = vdupq_n_s32(0);
-    }
+
+  TFLITE_DCHECK_GE(row_end - row_start, kPeel);
+
+  for (int out = row_start; out < row_end; out += kPeel) {
+    out = std::min(out, row_end - kPeel);
+    int32x4_t acc0 = vdupq_n_s32(0);
+    int32x4_t acc1 = acc0;
+    int32x4_t acc2 = acc0;
+    int32x4_t acc3 = acc0;
     const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
     const int16x8_t filter_offset_vec = vdupq_n_s16(filter_offset);
     int in = 0;
     for (; in <= input_size - 16; in += 16) {
       const uint8x16_t input_val_u8 = vld1q_u8(input_data + in);
-      uint8x16_t filter_val_u8[kPeel];
-      for (int k = 0; k < kPeel; k++) {
-        const uint8* filter_ptr = filter_data + in + (out + k) * input_size;
-        filter_val_u8[k] = vld1q_u8(filter_ptr);
-        optimized_ops_preload_l1_stream(filter_ptr + 64);
-      }
-      int16x8_t input_val[2];
-      const uint8x8_t low = vget_low_u8(input_val_u8);
-      const uint8x8_t high = vget_high_u8(input_val_u8);
-      input_val[0] = vreinterpretq_s16_u16(vmovl_u8(low));
-      input_val[1] = vreinterpretq_s16_u16(vmovl_u8(high));
-      input_val[0] = vaddq_s16(input_val[0], input_offset_vec);
-      input_val[1] = vaddq_s16(input_val[1], input_offset_vec);
-      int16x8_t filter_val[kPeel][2];
-      for (int k = 0; k < kPeel; k++) {
-        const uint8x8_t low = vget_low_u8(filter_val_u8[k]);
-        const uint8x8_t high = vget_high_u8(filter_val_u8[k]);
-        filter_val[k][0] = vreinterpretq_s16_u16(vmovl_u8(low));
-        filter_val[k][1] = vreinterpretq_s16_u16(vmovl_u8(high));
-        filter_val[k][0] = vaddq_s16(filter_val[k][0], filter_offset_vec);
-        filter_val[k][1] = vaddq_s16(filter_val[k][1], filter_offset_vec);
-      }
-      for (int p = 0; p < 2; p++) {
-        for (int k = 0; k < kPeel; k++) {
-          acc[k] = vmlal_s16(acc[k], vget_low_s16(filter_val[k][p]),
-                             vget_low_s16(input_val[p]));
-        }
-        for (int k = 0; k < kPeel; k++) {
-          acc[k] = vmlal_s16(acc[k], vget_high_s16(filter_val[k][p]),
-                             vget_high_s16(input_val[p]));
-        }
-      }
+      const uint8* filter_ptr = filter_data + in + out * input_size;
+      uint8x16_t filter_val_u8_0 = vld1q_u8(filter_ptr);
+      optimized_ops_preload_l1_stream(filter_ptr + 64);
+      filter_ptr += input_size;
+      uint8x16_t filter_val_u8_1 = vld1q_u8(filter_ptr);
+      optimized_ops_preload_l1_stream(filter_ptr + 64);
+      filter_ptr += input_size;
+      uint8x16_t filter_val_u8_2 = vld1q_u8(filter_ptr);
+      optimized_ops_preload_l1_stream(filter_ptr + 64);
+      filter_ptr += input_size;
+      uint8x16_t filter_val_u8_3 = vld1q_u8(filter_ptr);
+      optimized_ops_preload_l1_stream(filter_ptr + 64);
+      int16x8_t input_val_0, input_val_1;
+      uint8x8_t low = vget_low_u8(input_val_u8);
+      uint8x8_t high = vget_high_u8(input_val_u8);
+      input_val_0 = vreinterpretq_s16_u16(vmovl_u8(low));
+      input_val_1 = vreinterpretq_s16_u16(vmovl_u8(high));
+      input_val_0 = vaddq_s16(input_val_0, input_offset_vec);
+      input_val_1 = vaddq_s16(input_val_1, input_offset_vec);
+      low = vget_low_u8(filter_val_u8_0);
+      high = vget_high_u8(filter_val_u8_0);
+      int16x8_t filter_val_0_0 = vreinterpretq_s16_u16(vmovl_u8(low));
+      int16x8_t filter_val_0_1 = vreinterpretq_s16_u16(vmovl_u8(high));
+      filter_val_0_0 = vaddq_s16(filter_val_0_0, filter_offset_vec);
+      filter_val_0_1 = vaddq_s16(filter_val_0_1, filter_offset_vec);
+      low = vget_low_u8(filter_val_u8_1);
+      high = vget_high_u8(filter_val_u8_1);
+      int16x8_t filter_val_1_0 = vreinterpretq_s16_u16(vmovl_u8(low));
+      int16x8_t filter_val_1_1 = vreinterpretq_s16_u16(vmovl_u8(high));
+      filter_val_1_0 = vaddq_s16(filter_val_1_0, filter_offset_vec);
+      filter_val_1_1 = vaddq_s16(filter_val_1_1, filter_offset_vec);
+      low = vget_low_u8(filter_val_u8_2);
+      high = vget_high_u8(filter_val_u8_2);
+      int16x8_t filter_val_2_0 = vreinterpretq_s16_u16(vmovl_u8(low));
+      int16x8_t filter_val_2_1 = vreinterpretq_s16_u16(vmovl_u8(high));
+      filter_val_2_0 = vaddq_s16(filter_val_2_0, filter_offset_vec);
+      filter_val_2_1 = vaddq_s16(filter_val_2_1, filter_offset_vec);
+      low = vget_low_u8(filter_val_u8_3);
+      high = vget_high_u8(filter_val_u8_3);
+      int16x8_t filter_val_3_0 = vreinterpretq_s16_u16(vmovl_u8(low));
+      int16x8_t filter_val_3_1 = vreinterpretq_s16_u16(vmovl_u8(high));
+      filter_val_3_0 = vaddq_s16(filter_val_3_0, filter_offset_vec);
+      filter_val_3_1 = vaddq_s16(filter_val_3_1, filter_offset_vec);
+      acc0 = vmlal_s16(acc0, vget_low_s16(filter_val_0_0),
+                       vget_low_s16(input_val_0));
+      acc1 = vmlal_s16(acc1, vget_low_s16(filter_val_1_0),
+                       vget_low_s16(input_val_0));
+      acc2 = vmlal_s16(acc2, vget_low_s16(filter_val_2_0),
+                       vget_low_s16(input_val_0));
+      acc3 = vmlal_s16(acc3, vget_low_s16(filter_val_3_0),
+                       vget_low_s16(input_val_0));
+      acc0 = vmlal_s16(acc0, vget_low_s16(filter_val_0_1),
+                       vget_low_s16(input_val_1));
+      acc1 = vmlal_s16(acc1, vget_low_s16(filter_val_1_1),
+                       vget_low_s16(input_val_1));
+      acc2 = vmlal_s16(acc2, vget_low_s16(filter_val_2_1),
+                       vget_low_s16(input_val_1));
+      acc3 = vmlal_s16(acc3, vget_low_s16(filter_val_3_1),
+                       vget_low_s16(input_val_1));
+      acc0 = vmlal_s16(acc0, vget_high_s16(filter_val_0_0),
+                       vget_high_s16(input_val_0));
+      acc1 = vmlal_s16(acc1, vget_high_s16(filter_val_1_0),
+                       vget_high_s16(input_val_0));
+      acc2 = vmlal_s16(acc2, vget_high_s16(filter_val_2_0),
+                       vget_high_s16(input_val_0));
+      acc3 = vmlal_s16(acc3, vget_high_s16(filter_val_3_0),
+                       vget_high_s16(input_val_0));
+      acc0 = vmlal_s16(acc0, vget_high_s16(filter_val_0_1),
+                       vget_high_s16(input_val_1));
+      acc1 = vmlal_s16(acc1, vget_high_s16(filter_val_1_1),
+                       vget_high_s16(input_val_1));
+      acc2 = vmlal_s16(acc2, vget_high_s16(filter_val_2_1),
+                       vget_high_s16(input_val_1));
+      acc3 = vmlal_s16(acc3, vget_high_s16(filter_val_3_1),
+                       vget_high_s16(input_val_1));
     }
     for (; in <= input_size - 8; in += 8) {
       const uint8x8_t input_val_u8 = vld1_u8(input_data + in);
-      uint8x8_t filter_val_u8[kPeel];
-      for (int k = 0; k < kPeel; k++) {
-        const uint8* filter_ptr = filter_data + in + (out + k) * input_size;
-        filter_val_u8[k] = vld1_u8(filter_ptr);
-      }
-      int16x8_t input_val;
-      input_val = vreinterpretq_s16_u16(vmovl_u8(input_val_u8));
+      const uint8* filter_ptr = filter_data + in + out * input_size;
+      uint8x8_t filter_val_u8_0 = vld1_u8(filter_ptr);
+      filter_ptr += input_size;
+      uint8x8_t filter_val_u8_1 = vld1_u8(filter_ptr);
+      filter_ptr += input_size;
+      uint8x8_t filter_val_u8_2 = vld1_u8(filter_ptr);
+      filter_ptr += input_size;
+      uint8x8_t filter_val_u8_3 = vld1_u8(filter_ptr);
+      int16x8_t input_val = vreinterpretq_s16_u16(vmovl_u8(input_val_u8));
       input_val = vaddq_s16(input_val, input_offset_vec);
-      int16x8_t filter_val[kPeel];
-      for (int k = 0; k < kPeel; k++) {
-        filter_val[k] = vreinterpretq_s16_u16(vmovl_u8(filter_val_u8[k]));
-        filter_val[k] = vaddq_s16(filter_val[k], filter_offset_vec);
-      }
-      for (int k = 0; k < kPeel; k++) {
-        acc[k] = vmlal_s16(acc[k], vget_low_s16(filter_val[k]),
-                           vget_low_s16(input_val));
-      }
-      for (int k = 0; k < kPeel; k++) {
-        acc[k] = vmlal_s16(acc[k], vget_high_s16(filter_val[k]),
-                           vget_high_s16(input_val));
-      }
+      int16x8_t filter_val_0 = vreinterpretq_s16_u16(vmovl_u8(filter_val_u8_0));
+      filter_val_0 = vaddq_s16(filter_val_0, filter_offset_vec);
+      int16x8_t filter_val_1 = vreinterpretq_s16_u16(vmovl_u8(filter_val_u8_1));
+      filter_val_1 = vaddq_s16(filter_val_1, filter_offset_vec);
+      int16x8_t filter_val_2 = vreinterpretq_s16_u16(vmovl_u8(filter_val_u8_2));
+      filter_val_2 = vaddq_s16(filter_val_2, filter_offset_vec);
+      int16x8_t filter_val_3 = vreinterpretq_s16_u16(vmovl_u8(filter_val_u8_3));
+      filter_val_3 = vaddq_s16(filter_val_3, filter_offset_vec);
+      acc0 =
+          vmlal_s16(acc0, vget_low_s16(filter_val_0), vget_low_s16(input_val));
+      acc1 =
+          vmlal_s16(acc1, vget_low_s16(filter_val_1), vget_low_s16(input_val));
+      acc2 =
+          vmlal_s16(acc2, vget_low_s16(filter_val_2), vget_low_s16(input_val));
+      acc3 =
+          vmlal_s16(acc3, vget_low_s16(filter_val_3), vget_low_s16(input_val));
+      acc0 = vmlal_s16(acc0, vget_high_s16(filter_val_0),
+                       vget_high_s16(input_val));
+      acc1 = vmlal_s16(acc1, vget_high_s16(filter_val_1),
+                       vget_high_s16(input_val));
+      acc2 = vmlal_s16(acc2, vget_high_s16(filter_val_2),
+                       vget_high_s16(input_val));
+      acc3 = vmlal_s16(acc3, vget_high_s16(filter_val_3),
+                       vget_high_s16(input_val));
     }
     if (in < input_size) {
-      int32 buf[4 * kPeel];
-      for (int k = 0; k < 4; k++) {
-        vst1q_s32(buf + 4 * k, acc[k]);
-      }
+      int32 buf[16];
+      vst1q_s32(buf + 0, acc0);
+      vst1q_s32(buf + 4, acc1);
+      vst1q_s32(buf + 8, acc2);
+      vst1q_s32(buf + 12, acc3);
       for (; in < input_size; in++) {
         int lane = (in + 8 - input_size) % 4;
         const int32 input_val = input_data[in] + input_offset;
@@ -950,26 +968,28 @@ inline void FullyConnectedAsGEMV(
           buf[lane + 4 * k] += filter_val * input_val;
         }
       }
-      for (int k = 0; k < 4; k++) {
-        acc[k] = vld1q_s32(buf + 4 * k);
-      }
+      acc0 = vld1q_s32(buf + 0);
+      acc1 = vld1q_s32(buf + 4);
+      acc2 = vld1q_s32(buf + 8);
+      acc3 = vld1q_s32(buf + 12);
     }
 
     // Horizontally reduce accumulators
-    int32x2_t pairwise_reduced_acc[kPeel];
-    for (int k = 0; k < kPeel; k++) {
-      pairwise_reduced_acc[k] =
-          vpadd_s32(vget_low_s32(acc[k]), vget_high_s32(acc[k]));
-    }
-    static_assert(kPeel == 4, "the code below currently assumes kPeel = 4");
+    int32x2_t pairwise_reduced_acc_0 =
+        vpadd_s32(vget_low_s32(acc0), vget_high_s32(acc0));
+    int32x2_t pairwise_reduced_acc_1 =
+        vpadd_s32(vget_low_s32(acc1), vget_high_s32(acc1));
+    int32x2_t pairwise_reduced_acc_2 =
+        vpadd_s32(vget_low_s32(acc2), vget_high_s32(acc2));
+    int32x2_t pairwise_reduced_acc_3 =
+        vpadd_s32(vget_low_s32(acc3), vget_high_s32(acc3));
     const int32x2_t reduced_lo =
-        vpadd_s32(pairwise_reduced_acc[0], pairwise_reduced_acc[1]);
+        vpadd_s32(pairwise_reduced_acc_0, pairwise_reduced_acc_1);
     const int32x2_t reduced_hi =
-        vpadd_s32(pairwise_reduced_acc[2], pairwise_reduced_acc[3]);
+        vpadd_s32(pairwise_reduced_acc_2, pairwise_reduced_acc_3);
     int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi);
     // Add bias values.
-    int32x4_t bias_vec = vld1q_s32(bias_ptr);
-    bias_ptr += 4;
+    int32x4_t bias_vec = vld1q_s32(bias_data + out);
     reduced = vaddq_s32(reduced, bias_vec);
     if (shift_left) {
       const int32 multiplier_power_of_two = 1 << output_shift;
@@ -992,11 +1012,116 @@ inline void FullyConnectedAsGEMV(
     // Apply the clamping from the activation function
     res8 = vmax_u8(res8, vdup_n_u8(output_activation_min));
     res8 = vmin_u8(res8, vdup_n_u8(output_activation_max));
-    // Store results to destination. Assumes 32bit alignment.
-    vst1_lane_u32(reinterpret_cast<uint32*>(output_ptr),
-                  vreinterpret_u32_u8(res8), 0);
-    output_ptr += kPeel;
+    // Store results to destination.
+    vst1_lane_u8(output_data + out + 0, res8, 0);
+    vst1_lane_u8(output_data + out + 1, res8, 1);
+    vst1_lane_u8(output_data + out + 2, res8, 2);
+    vst1_lane_u8(output_data + out + 3, res8, 3);
+  }
+}
+
+struct FullyConnectedAsGEMVWorkerTask : public gemmlowp::Task {
+  FullyConnectedAsGEMVWorkerTask(const RuntimeShape& input_shape,
+                                 const uint8* input_data, int32 input_offset,
+                                 const RuntimeShape& filter_shape,
+                                 const uint8* filter_data, int32 filter_offset,
+                                 const RuntimeShape& bias_shape,
+                                 const int32* bias_data, int32 output_offset,
+                                 int32 output_multiplier, int output_shift,
+                                 int32 output_activation_min,
+                                 int32 output_activation_max,
+                                 const RuntimeShape& output_shape,
+                                 uint8* output_data, int row_start, int row_end)
+      : input_shape_(input_shape),
+        input_data_(input_data),
+        input_offset_(input_offset),
+        filter_shape_(filter_shape),
+        filter_data_(filter_data),
+        filter_offset_(filter_offset),
+        bias_shape_(bias_shape),
+        bias_data_(bias_data),
+        output_offset_(output_offset),
+        output_multiplier_(output_multiplier),
+        output_shift_(output_shift),
+        output_activation_min_(output_activation_min),
+        output_activation_max_(output_activation_max),
+        output_shape_(output_shape),
+        output_data_(output_data),
+        row_start_(row_start),
+        row_end_(row_end) {}
+
+  void Run() override {
+    FullyConnectedAsGEMVWorkerImpl(
+        input_shape_, input_data_, input_offset_, filter_shape_, filter_data_,
+        filter_offset_, bias_shape_, bias_data_, output_offset_,
+        output_multiplier_, output_shift_, output_activation_min_,
+        output_activation_max_, output_shape_, output_data_, row_start_,
+        row_end_);
+  }
+
+  const RuntimeShape& input_shape_;
+  const uint8* input_data_;
+  int32 input_offset_;
+  const RuntimeShape& filter_shape_;
+  const uint8* filter_data_;
+  int32 filter_offset_;
+  const RuntimeShape& bias_shape_;
+  const int32* bias_data_;
+  int32 output_offset_;
+  int32 output_multiplier_;
+  int output_shift_;
+  int32 output_activation_min_;
+  int32 output_activation_max_;
+  const RuntimeShape& output_shape_;
+  uint8* output_data_;
+  gemmlowp::GemmContext* gemm_context_;
+  int row_start_;
+  int row_end_;
+};
+
+inline void FullyConnectedAsGEMV(
+    const RuntimeShape& input_shape, const uint8* input_data,
+    int32 input_offset, const RuntimeShape& filter_shape,
+    const uint8* filter_data, int32 filter_offset,
+    const RuntimeShape& bias_shape, const int32* bias_data, int32 output_offset,
+    int32 output_multiplier, int output_shift, int32 output_activation_min,
+    int32 output_activation_max, const RuntimeShape& output_shape,
+    uint8* output_data, gemmlowp::GemmContext* gemm_context) {
+  const int output_dim_count = output_shape.DimensionsCount();
+  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
+  const int output_rows = output_shape.Dims(output_dim_count - 1);
+  const int input_size = FlatSizeSkipDim(input_shape, 0);
+  static constexpr int kKernelRows = 4;
+  const int thread_count = gemmlowp::HowManyThreads<kKernelRows>(
+      gemm_context->max_num_threads(), output_rows, batches, input_size);
+  if (thread_count == 1) {
+    // Single-thread case: do the computation on the current thread, don't
+    // use a threadpool
+    FullyConnectedAsGEMVWorkerImpl(
+        input_shape, input_data, input_offset, filter_shape, filter_data,
+        filter_offset, bias_shape, bias_data, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max,
+        output_shape, output_data, 0, output_rows);
+    return;
   }
+
+  // Multi-threaded case: use the gemmlowp context's threadpool.
+  TFLITE_DCHECK_GT(thread_count, 1);
+  std::vector<gemmlowp::Task*> tasks(thread_count);
+  const int kRowsPerWorker =
+      gemmlowp::RoundUp<kKernelRows>(output_rows / thread_count);
+  int row_start = 0;
+  for (int i = 0; i < thread_count; ++i) {
+    int row_end = std::min(output_rows, row_start + kRowsPerWorker);
+    tasks[i] = new FullyConnectedAsGEMVWorkerTask(
+        input_shape, input_data, input_offset, filter_shape, filter_data,
+        filter_offset, bias_shape, bias_data, output_offset, output_multiplier,
+        output_shift, output_activation_min, output_activation_max,
+        output_shape, output_data, row_start, row_end);
+    row_start = row_end;
+  }
+  TFLITE_DCHECK_EQ(row_start, output_rows);
+  gemm_context->workers_pool()->Execute(tasks);
 }
 #endif  // USE_NEON
 
@@ -1053,14 +1178,16 @@ inline void FullyConnected(
   const int filter_dim_count = filter_shape.DimensionsCount();
   const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
 #ifdef USE_NEON
-  const int output_size = MatchingDim(filter_shape, filter_dim_count - 2,
-                                      output_shape, output_dim_count - 1);
-  if (batches == 1 && !(output_size % 4)) {
-    return FullyConnectedAsGEMV(
-        input_shape, input_data, input_offset, filter_shape, filter_data,
-        filter_offset, bias_shape, bias_data, output_offset, output_multiplier,
-        output_shift, output_activation_min, output_activation_max,
-        output_shape, output_data);
+  if (batches == 1) {
+    const int output_size = MatchingDim(filter_shape, filter_dim_count - 2,
+                                        output_shape, output_dim_count - 1);
+    if (output_size >= 4) {
+      return FullyConnectedAsGEMV(
+          input_shape, input_data, input_offset, filter_shape, filter_data,
+          filter_offset, bias_shape, bias_data, output_offset,
+          output_multiplier, output_shift, output_activation_min,
+          output_activation_max, output_shape, output_data, gemm_context);
+    }
   }
 #endif  // USE_NEON
   const int filter_rows = filter_shape.Dims(filter_dim_count - 2);
@@ -1906,7 +2033,20 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
   MatrixRef matrix_c(c, m, n);
   ConstMatrixRef matrix_a(a, m, k);
   ConstMatrixRef matrix_b(b, n, k);
-  matrix_c.noalias() = matrix_a * matrix_b.transpose();
+
+  // The following special casing for when a or b is a vector is required
+  // as Eigen seem to fail to make this optimization on its own.
+  if (n == 1) {
+    gemmlowp::ScopedProfilingLabel label("GEMV");
+    matrix_c.col(0).noalias() = matrix_a * matrix_b.row(0).transpose();
+  } else if (m == 1) {
+    gemmlowp::ScopedProfilingLabel label("GEMV");
+    matrix_c.row(0).noalias() = matrix_a.row(0) * matrix_b.transpose();
+  } else {
+    gemmlowp::ScopedProfilingLabel label("GEMM");
+    matrix_c.noalias() = matrix_a * matrix_b.transpose();
+  }
+
 #endif  //  defined(TF_LITE_USE_CBLAS) && defined(__APPLE__)
 
   optimized_ops::AddBiasAndEvalActivationFunction(
@@ -2070,6 +2210,21 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
   TFLITE_DCHECK_EQ(output_cols, gemm_input_cols);
   TFLITE_DCHECK_EQ(filter_cols, gemm_input_rows);
   TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_rows);
+
+#ifdef USE_NEON
+  if (gemm_input_cols == 1 && output_rows >= 4) {
+    RuntimeShape fc_filter_shape{
+        filter_shape.Dims(0),
+        filter_shape.Dims(filter_shape.DimensionsCount() - 1)};
+
+    return FullyConnectedAsGEMV(
+        *gemm_input_shape, gemm_input_data, input_offset, fc_filter_shape,
+        filter_data, filter_offset, bias_shape, bias_data, output_offset,
+        output_multiplier, output_shift, output_activation_min,
+        output_activation_max, output_shape, output_data, gemm_context);
+  }
+#endif
+
   gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> filter_matrix(
       filter_data, filter_rows, filter_cols);
   gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix(
@@ -2347,36 +2502,37 @@ inline void Add(const ArithmeticParams& params,
 inline void AddElementwise(int size, const ArithmeticParams& params,
                            const uint8* input1_data, const uint8* input2_data,
                            uint8* output_data) {
+  gemmlowp::ScopedProfilingLabel label("AddElementwise/8bit");
   int i = 0;
   TFLITE_DCHECK_GT(params.input1_offset, -256);
   TFLITE_DCHECK_GT(params.input2_offset, -256);
   TFLITE_DCHECK_LT(params.input1_offset, 256);
   TFLITE_DCHECK_LT(params.input2_offset, 256);
 #ifdef USE_NEON
-  const auto output_activation_min_vector =
+  const uint8x8_t output_activation_min_vector =
       vdup_n_u8(params.quantized_activation_min);
-  const auto output_activation_max_vector =
+  const uint8x8_t output_activation_max_vector =
       vdup_n_u8(params.quantized_activation_max);
   for (; i <= size - 8; i += 8) {
-    const auto input1_val_original = vld1_u8(input1_data + i);
-    const auto input2_val_original = vld1_u8(input2_data + i);
-    const auto input1_val_s16 =
+    const uint8x8_t input1_val_original = vld1_u8(input1_data + i);
+    const uint8x8_t input2_val_original = vld1_u8(input2_data + i);
+    const int16x8_t input1_val_s16 =
         vreinterpretq_s16_u16(vmovl_u8(input1_val_original));
-    const auto input2_val_s16 =
+    const int16x8_t input2_val_s16 =
         vreinterpretq_s16_u16(vmovl_u8(input2_val_original));
-    const auto input1_val =
+    const int16x8_t input1_val =
         vaddq_s16(input1_val_s16, vdupq_n_s16(params.input1_offset));
-    const auto input2_val =
+    const int16x8_t input2_val =
         vaddq_s16(input2_val_s16, vdupq_n_s16(params.input2_offset));
-    const auto input1_val_high = vget_high_s16(input1_val);
-    const auto input1_val_low = vget_low_s16(input1_val);
-    const auto input2_val_high = vget_high_s16(input2_val);
-    const auto input2_val_low = vget_low_s16(input2_val);
-    auto x11 = vmovl_s16(input1_val_low);
-    auto x12 = vmovl_s16(input1_val_high);
-    auto x21 = vmovl_s16(input2_val_low);
-    auto x22 = vmovl_s16(input2_val_high);
-    const auto left_shift_dup = vdupq_n_s32(params.left_shift);
+    const int16x4_t input1_val_high = vget_high_s16(input1_val);
+    const int16x4_t input1_val_low = vget_low_s16(input1_val);
+    const int16x4_t input2_val_high = vget_high_s16(input2_val);
+    const int16x4_t input2_val_low = vget_low_s16(input2_val);
+    int32x4_t x11 = vmovl_s16(input1_val_low);
+    int32x4_t x12 = vmovl_s16(input1_val_high);
+    int32x4_t x21 = vmovl_s16(input2_val_low);
+    int32x4_t x22 = vmovl_s16(input2_val_high);
+    const int32x4_t left_shift_dup = vdupq_n_s32(params.left_shift);
     x11 = vshlq_s32(x11, left_shift_dup);
     x12 = vshlq_s32(x12, left_shift_dup);
     x21 = vshlq_s32(x21, left_shift_dup);
@@ -2385,24 +2541,24 @@ inline void AddElementwise(int size, const ArithmeticParams& params,
     x12 = vqrdmulhq_n_s32(x12, params.input1_multiplier);
     x21 = vqrdmulhq_n_s32(x21, params.input2_multiplier);
     x22 = vqrdmulhq_n_s32(x22, params.input2_multiplier);
-    const auto input1_shift_dup = vdupq_n_s32(params.input1_shift);
-    const auto input2_shift_dup = vdupq_n_s32(params.input2_shift);
+    const int32x4_t input1_shift_dup = vdupq_n_s32(params.input1_shift);
+    const int32x4_t input2_shift_dup = vdupq_n_s32(params.input2_shift);
     x11 = vshlq_s32(x11, input1_shift_dup);
     x12 = vshlq_s32(x12, input1_shift_dup);
     x21 = vshlq_s32(x21, input2_shift_dup);
     x22 = vshlq_s32(x22, input2_shift_dup);
-    auto s1 = vaddq_s32(x11, x21);
-    auto s2 = vaddq_s32(x12, x22);
+    int32x4_t s1 = vaddq_s32(x11, x21);
+    int32x4_t s2 = vaddq_s32(x12, x22);
     s1 = vqrdmulhq_n_s32(s1, params.output_multiplier);
     s2 = vqrdmulhq_n_s32(s2, params.output_multiplier);
     using gemmlowp::RoundingDivideByPOT;
     s1 = RoundingDivideByPOT(s1, -params.output_shift);
     s2 = RoundingDivideByPOT(s2, -params.output_shift);
-    const auto s1_narrowed = vmovn_s32(s1);
-    const auto s2_narrowed = vmovn_s32(s2);
-    const auto s = vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed),
-                             vdupq_n_s16(params.output_offset));
-    const auto clamped =
+    const int16x4_t s1_narrowed = vmovn_s32(s1);
+    const int16x4_t s2_narrowed = vmovn_s32(s2);
+    const int16x8_t s = vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed),
+                                  vdupq_n_s16(params.output_offset));
+    const uint8x8_t clamped =
         vmax_u8(output_activation_min_vector,
                 vmin_u8(output_activation_max_vector, vqmovun_s16(s)));
     vst1_u8(output_data + i, clamped);
@@ -2432,6 +2588,109 @@ inline void AddElementwise(int size, const ArithmeticParams& params,
   }
 }
 
+// Scalar-broadcast add that can be used for inner loop of more general
+// broadcast add, so that, for example, scalar-broadcast with batch will still
+// be fast.
+inline void AddScalarBroadcast(int size, const ArithmeticParams& params,
+                               uint8 input1_data, const uint8* input2_data,
+                               uint8* output_data) {
+  using gemmlowp::RoundingDivideByPOT;
+
+  gemmlowp::ScopedProfilingLabel label("AddScalarBroadcast/8bit");
+  TFLITE_DCHECK_GT(params.input1_offset, -256);
+  TFLITE_DCHECK_GT(params.input2_offset, -256);
+  TFLITE_DCHECK_LT(params.input1_offset, 256);
+  TFLITE_DCHECK_LT(params.input2_offset, 256);
+
+  int i = 0;
+
+#ifdef USE_NEON
+  const int32x4_t left_shift_dup = vdupq_n_s32(params.left_shift);
+  const uint8x8_t output_activation_min_vector =
+      vdup_n_u8(params.quantized_activation_min);
+  const uint8x8_t output_activation_max_vector =
+      vdup_n_u8(params.quantized_activation_max);
+
+  // Process broadcast scalar.
+  const uint8x8_t input1_val_original = vdup_n_u8(input1_data);
+  const int16x8_t input1_val_s16 =
+      vreinterpretq_s16_u16(vmovl_u8(input1_val_original));
+  const int16x8_t input1_val =
+      vaddq_s16(input1_val_s16, vdupq_n_s16(params.input1_offset));
+  const int16x4_t input1_val_high = vget_high_s16(input1_val);
+  const int16x4_t input1_val_low = vget_low_s16(input1_val);
+  int32x4_t x11 = vmovl_s16(input1_val_low);
+  int32x4_t x12 = vmovl_s16(input1_val_high);
+  x11 = vshlq_s32(x11, left_shift_dup);
+  x12 = vshlq_s32(x12, left_shift_dup);
+  x11 = vqrdmulhq_n_s32(x11, params.input1_multiplier);
+  x12 = vqrdmulhq_n_s32(x12, params.input1_multiplier);
+  const int32x4_t input1_shift_dup = vdupq_n_s32(params.input1_shift);
+  x11 = vshlq_s32(x11, input1_shift_dup);
+  x12 = vshlq_s32(x12, input1_shift_dup);
+
+  for (; i <= size - 8; i += 8) {
+    const uint8x8_t input2_val_original = vld1_u8(input2_data + i);
+    const int16x8_t input2_val_s16 =
+        vreinterpretq_s16_u16(vmovl_u8(input2_val_original));
+    const int16x8_t input2_val =
+        vaddq_s16(input2_val_s16, vdupq_n_s16(params.input2_offset));
+    const int16x4_t input2_val_high = vget_high_s16(input2_val);
+    const int16x4_t input2_val_low = vget_low_s16(input2_val);
+    int32x4_t x21 = vmovl_s16(input2_val_low);
+    int32x4_t x22 = vmovl_s16(input2_val_high);
+    x21 = vshlq_s32(x21, left_shift_dup);
+    x22 = vshlq_s32(x22, left_shift_dup);
+    x21 = vqrdmulhq_n_s32(x21, params.input2_multiplier);
+    x22 = vqrdmulhq_n_s32(x22, params.input2_multiplier);
+    const int32x4_t input2_shift_dup = vdupq_n_s32(params.input2_shift);
+    x21 = vshlq_s32(x21, input2_shift_dup);
+    x22 = vshlq_s32(x22, input2_shift_dup);
+    int32x4_t s1 = vaddq_s32(x11, x21);
+    int32x4_t s2 = vaddq_s32(x12, x22);
+    s1 = vqrdmulhq_n_s32(s1, params.output_multiplier);
+    s2 = vqrdmulhq_n_s32(s2, params.output_multiplier);
+    s1 = RoundingDivideByPOT(s1, -params.output_shift);
+    s2 = RoundingDivideByPOT(s2, -params.output_shift);
+    const int16x4_t s1_narrowed = vmovn_s32(s1);
+    const int16x4_t s2_narrowed = vmovn_s32(s2);
+    const int16x8_t s = vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed),
+                                  vdupq_n_s16(params.output_offset));
+    const uint8x8_t clamped =
+        vmax_u8(output_activation_min_vector,
+                vmin_u8(output_activation_max_vector, vqmovun_s16(s)));
+    vst1_u8(output_data + i, clamped);
+  }
+#endif  // NEON
+
+  if (i < size) {
+    // Process broadcast scalar.
+    const int32 input1_val = params.input1_offset + input1_data;
+    const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
+    const int32 scaled_input1_val =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            shifted_input1_val, params.input1_multiplier, params.input1_shift);
+
+    for (; i < size; ++i) {
+      const int32 input2_val = params.input2_offset + input2_data[i];
+      const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
+      const int32 scaled_input2_val =
+          MultiplyByQuantizedMultiplierSmallerThanOneExp(
+              shifted_input2_val, params.input2_multiplier,
+              params.input2_shift);
+      const int32 raw_sum = scaled_input1_val + scaled_input2_val;
+      const int32 raw_output =
+          MultiplyByQuantizedMultiplierSmallerThanOneExp(
+              raw_sum, params.output_multiplier, params.output_shift) +
+          params.output_offset;
+      const int32 clamped_output =
+          std::min(params.quantized_activation_max,
+                   std::max(params.quantized_activation_min, raw_output));
+      output_data[i] = static_cast<uint8>(clamped_output);
+    }
+  }
+}
+
 inline void Add(const ArithmeticParams& params,
                 const RuntimeShape& input1_shape, const uint8* input1_data,
                 const RuntimeShape& input2_shape, const uint8* input2_data,
@@ -2546,26 +2805,63 @@ inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
   uint8* output_data_ptr = output_data;
   const uint8* input1_data_ptr = input1_data;
   const uint8* input2_data_reset = input2_data;
+  // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared
+  // between input shapes. y3 for input 1 is always broadcast, and so the
+  // dimension there is 1, whereas optionally y1 might be broadcast for input 2.
+  // Put another way,
+  // input1.shape.FlatSize = y0 * y1 * y2 * y4,
+  // input2.shape.FlatSize = y0 * y2 * y3 * y4.
   int y0 = params.broadcast_shape[0];
   int y1 = params.broadcast_shape[1];
   int y2 = params.broadcast_shape[2];
   int y3 = params.broadcast_shape[3];
   int y4 = params.broadcast_shape[4];
-  for (int i0 = 0; i0 < y0; ++i0) {
-    const uint8* input2_data_ptr;
-    for (int i1 = 0; i1 < y1; ++i1) {
-      input2_data_ptr = input2_data_reset;
-      for (int i2 = 0; i2 < y2; ++i2) {
-        for (int i3 = 0; i3 < y3; ++i3) {
-          AddElementwise(y4, params, input1_data_ptr, input2_data_ptr,
-                         output_data_ptr);
-          input2_data_ptr += y4;
-          output_data_ptr += y4;
+  if (y4 > 1) {
+    // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner
+    // dimension.
+    for (int i0 = 0; i0 < y0; ++i0) {
+      const uint8* input2_data_ptr = nullptr;
+      for (int i1 = 0; i1 < y1; ++i1) {
+        input2_data_ptr = input2_data_reset;
+        for (int i2 = 0; i2 < y2; ++i2) {
+          for (int i3 = 0; i3 < y3; ++i3) {
+            AddElementwise(y4, params, input1_data_ptr, input2_data_ptr,
+                           output_data_ptr);
+            input2_data_ptr += y4;
+            output_data_ptr += y4;
+          }
+          // We have broadcast y4 of input1 data y3 times, and now move on.
+          input1_data_ptr += y4;
         }
-        input1_data_ptr += y4;
       }
+      // We have broadcast y2*y3*y4 of input2 data y1 times, and now move on.
+      input2_data_reset = input2_data_ptr;
+    }
+  } else {
+    // Special case of y4 == 1, in which the innermost loop is a single element
+    // and can be combined with the next (y3) as an inner broadcast.
+    //
+    // Note that this handles the case of pure scalar broadcast when
+    // y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar
+    // broadcast with batch (as y2 > 1).
+    //
+    // NOTE The process is the same as the above general case except simplified
+    // for y4 == 1 and the loop over y3 is contained within the
+    // AddScalarBroadcast function.
+    for (int i0 = 0; i0 < y0; ++i0) {
+      const uint8* input2_data_ptr = nullptr;
+      for (int i1 = 0; i1 < y1; ++i1) {
+        input2_data_ptr = input2_data_reset;
+        for (int i2 = 0; i2 < y2; ++i2) {
+          AddScalarBroadcast(y3, params, *input1_data_ptr, input2_data_ptr,
+                             output_data_ptr);
+          input2_data_ptr += y3;
+          output_data_ptr += y3;
+          input1_data_ptr += 1;
+        }
+      }
+      input2_data_reset = input2_data_ptr;
     }
-    input2_data_reset = input2_data_ptr;
   }
 }
 
@@ -2910,7 +3206,7 @@ inline void BroadcastMulFivefold(const ArithmeticParams& unswitched_params,
   int y4 = params.broadcast_shape[4];
   if (y4 > 1) {
     for (int i0 = 0; i0 < y0; ++i0) {
-      const uint8* input2_data_ptr;
+      const uint8* input2_data_ptr = nullptr;
       for (int i1 = 0; i1 < y1; ++i1) {
         input2_data_ptr = input2_data_reset;
         for (int i2 = 0; i2 < y2; ++i2) {
@@ -2927,7 +3223,7 @@ inline void BroadcastMulFivefold(const ArithmeticParams& unswitched_params,
     }
   } else {
     for (int i0 = 0; i0 < y0; ++i0) {
-      const uint8* input2_data_ptr;
+      const uint8* input2_data_ptr = nullptr;
       for (int i1 = 0; i1 < y1; ++i1) {
         input2_data_ptr = input2_data_reset;
         for (int i2 = 0; i2 < y2; ++i2) {
@@ -3555,6 +3851,14 @@ inline void AveragePool(const PoolParams& params,
                         const uint8* input_data,
                         const RuntimeShape& output_shape, uint8* output_data) {
   gemmlowp::ScopedProfilingLabel label("AveragePool/8bit");
+
+  // Here, and in other pooling ops, in order to maintain locality of reference,
+  // to minimize some recalculations, and to load into NEON vector registers, we
+  // use an inner loop down the depth. Since depths can be large and hence we
+  // would need arbitrarily large temporary storage, we divide the work up into
+  // depth tranches just within the batch loop.
+  static constexpr int kPoolingAccTrancheSize = 256;
+
   TFLITE_DCHECK_LE(params.quantized_activation_min,
                    params.quantized_activation_max);
   TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
@@ -3567,69 +3871,76 @@ inline void AveragePool(const PoolParams& params,
   const int output_width = output_shape.Dims(2);
   const int stride_height = params.stride_height;
   const int stride_width = params.stride_width;
+
+  uint16 acc[kPoolingAccTrancheSize];
   for (int batch = 0; batch < batches; ++batch) {
-    for (int out_y = 0; out_y < output_height; ++out_y) {
-      for (int out_x = 0; out_x < output_width; ++out_x) {
-        const int in_x_origin =
-            (out_x * stride_width) - params.padding_values.width;
-        const int in_y_origin =
-            (out_y * stride_height) - params.padding_values.height;
-        const int filter_x_start = std::max(0, -in_x_origin);
-        const int filter_x_end =
-            std::min(params.filter_width, input_width - in_x_origin);
-        const int filter_y_start = std::max(0, -in_y_origin);
-        const int filter_y_end =
-            std::min(params.filter_height, input_height - in_y_origin);
-        const int filter_count =
-            (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start);
-        // 2560 is required by MobileNetV2 with depth multiplier 2.
-        static constexpr int kAccBufferMaxSize = 4096;
-        TFLITE_DCHECK_LE(depth, kAccBufferMaxSize);
-        uint16 acc[kAccBufferMaxSize];
-        memset(acc, 0, depth * sizeof(acc[0]));
-        const uint8* input_ptr =
-            input_data +
-            depth * (in_x_origin +
-                     input_width * (in_y_origin + input_height * batch));
-        for (int fy = filter_y_start; fy < filter_y_end; fy++) {
-          const uint8* input_row_ptr =
-              input_ptr + depth * (fy * input_width + filter_x_start);
-          for (int fx = filter_x_start; fx < filter_x_end; fx++) {
-            int channel = 0;
+    // We proceed through the depth in tranches (see comment above). The
+    // depth_base is the depth at the beginning of the tranche. The
+    // tranche_depth is the depth dimension of the tranche.
+    for (int depth_base = 0; depth_base < depth;
+         depth_base += kPoolingAccTrancheSize) {
+      const int tranche_depth =
+          std::min(depth - depth_base, kPoolingAccTrancheSize);
+      for (int out_y = 0; out_y < output_height; ++out_y) {
+        for (int out_x = 0; out_x < output_width; ++out_x) {
+          const int in_x_origin =
+              (out_x * stride_width) - params.padding_values.width;
+          const int in_y_origin =
+              (out_y * stride_height) - params.padding_values.height;
+          const int filter_x_start = std::max(0, -in_x_origin);
+          const int filter_x_end =
+              std::min(params.filter_width, input_width - in_x_origin);
+          const int filter_y_start = std::max(0, -in_y_origin);
+          const int filter_y_end =
+              std::min(params.filter_height, input_height - in_y_origin);
+          const int filter_count =
+              (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start);
+          memset(acc, 0, tranche_depth * sizeof(acc[0]));
+          const uint8* input_ptr =
+              input_data + depth_base +
+              depth * (in_x_origin +
+                       input_width * (in_y_origin + input_height * batch));
+          for (int fy = filter_y_start; fy < filter_y_end; fy++) {
+            const uint8* input_row_ptr =
+                input_ptr + depth * (fy * input_width + filter_x_start);
+            for (int fx = filter_x_start; fx < filter_x_end; fx++) {
+              const uint8* input_channel_ptr = input_row_ptr;
+              int channel = 0;
 #ifdef USE_NEON
-            for (; channel <= depth - 16; channel += 16) {
-              uint16x8_t acc_reg[2];
-              for (int i = 0; i < 2; i++) {
-                acc_reg[i] = vld1q_u16(acc + channel + 8 * i);
+              for (; channel <= tranche_depth - 16; channel += 16) {
+                uint16x8_t acc_reg[2];
+                for (int i = 0; i < 2; i++) {
+                  acc_reg[i] = vld1q_u16(acc + channel + 8 * i);
+                }
+                uint8x16_t input_reg = vld1q_u8(input_channel_ptr);
+                input_channel_ptr += 16;
+                acc_reg[0] = vaddw_u8(acc_reg[0], vget_low_u8(input_reg));
+                acc_reg[1] = vaddw_u8(acc_reg[1], vget_high_u8(input_reg));
+                for (int i = 0; i < 2; i++) {
+                  vst1q_u16(acc + channel + 8 * i, acc_reg[i]);
+                }
               }
-              uint8x16_t input_reg = vld1q_u8(input_row_ptr);
-              input_row_ptr += 16;
-              acc_reg[0] = vaddw_u8(acc_reg[0], vget_low_u8(input_reg));
-              acc_reg[1] = vaddw_u8(acc_reg[1], vget_high_u8(input_reg));
-              for (int i = 0; i < 2; i++) {
-                vst1q_u16(acc + channel + 8 * i, acc_reg[i]);
+              for (; channel <= tranche_depth - 8; channel += 8) {
+                uint16x8_t acc_reg = vld1q_u16(acc + channel);
+                uint8x8_t input_reg = vld1_u8(input_channel_ptr);
+                input_channel_ptr += 8;
+                acc_reg = vaddw_u8(acc_reg, input_reg);
+                vst1q_u16(acc + channel, acc_reg);
               }
-            }
-            for (; channel <= depth - 8; channel += 8) {
-              uint16x8_t acc_reg = vld1q_u16(acc + channel);
-              uint8x8_t input_reg = vld1_u8(input_row_ptr);
-              input_row_ptr += 8;
-              acc_reg = vaddw_u8(acc_reg, input_reg);
-              vst1q_u16(acc + channel, acc_reg);
-            }
 #endif
-            for (; channel < depth; ++channel) {
-              acc[channel] += *input_row_ptr++;
+              for (; channel < tranche_depth; ++channel) {
+                acc[channel] += *input_channel_ptr++;
+              }
+              input_row_ptr += depth;
             }
           }
-        }
-        uint8* output_ptr =
-            output_data + Offset(output_shape, batch, out_y, out_x, 0);
-        int channel = 0;
+          uint8* output_ptr = output_data + Offset(output_shape, batch, out_y,
+                                                   out_x, depth_base);
+          int channel = 0;
 #ifdef USE_NEON
 #define AVGPOOL_DIVIDING_BY(FILTER_COUNT)                               \
   if (filter_count == FILTER_COUNT) {                                   \
-    for (; channel <= depth - 8; channel += 8) {                        \
+    for (; channel <= tranche_depth - 8; channel += 8) {                \
       uint16 buf[8];                                                    \
       for (int i = 0; i < 8; i++) {                                     \
         buf[i] = (acc[channel + i] + FILTER_COUNT / 2) / FILTER_COUNT;  \
@@ -3640,25 +3951,26 @@ inline void AveragePool(const PoolParams& params,
       vst1_u8(output_ptr + channel, buf8);                              \
     }                                                                   \
   }
-        AVGPOOL_DIVIDING_BY(9)
-        AVGPOOL_DIVIDING_BY(15)
+          AVGPOOL_DIVIDING_BY(9)
+          AVGPOOL_DIVIDING_BY(15)
 #undef AVGPOOL_DIVIDING_BY
-        for (; channel <= depth - 8; channel += 8) {
-          uint16 buf[8];
-          for (int i = 0; i < 8; i++) {
-            buf[i] = (acc[channel + i] + filter_count / 2) / filter_count;
+          for (; channel <= tranche_depth - 8; channel += 8) {
+            uint16 buf[8];
+            for (int i = 0; i < 8; i++) {
+              buf[i] = (acc[channel + i] + filter_count / 2) / filter_count;
+            }
+            uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf));
+            buf8 = vmin_u8(buf8, vdup_n_u8(params.quantized_activation_max));
+            buf8 = vmax_u8(buf8, vdup_n_u8(params.quantized_activation_min));
+            vst1_u8(output_ptr + channel, buf8);
           }
-          uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf));
-          buf8 = vmin_u8(buf8, vdup_n_u8(params.quantized_activation_max));
-          buf8 = vmax_u8(buf8, vdup_n_u8(params.quantized_activation_min));
-          vst1_u8(output_ptr + channel, buf8);
-        }
 #endif
-        for (; channel < depth; ++channel) {
-          uint16 a = (acc[channel] + filter_count / 2) / filter_count;
-          a = std::max<uint16>(a, params.quantized_activation_min);
-          a = std::min<uint16>(a, params.quantized_activation_max);
-          output_ptr[channel] = static_cast<uint8>(a);
+          for (; channel < tranche_depth; ++channel) {
+            uint16 a = (acc[channel] + filter_count / 2) / filter_count;
+            a = std::max<uint16>(a, params.quantized_activation_min);
+            a = std::min<uint16>(a, params.quantized_activation_max);
+            output_ptr[channel] = static_cast<uint8>(a);
+          }
         }
       }
     }
@@ -3723,6 +4035,14 @@ inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
                     const uint8* input_data, const RuntimeShape& output_shape,
                     uint8* output_data) {
   gemmlowp::ScopedProfilingLabel label("MaxPool/8bit");
+
+  // Here, and in other pooling ops, in order to maintain locality of reference,
+  // to minimize some recalculations, and to load into NEON vector registers, we
+  // use an inner loop down the depth. Since depths can be large and hence we
+  // would need arbitrarily large temporary storage, we divide the work up into
+  // depth tranches just within the batch loop.
+  static constexpr int kPoolingAccTrancheSize = 256;
+
   TFLITE_DCHECK_LE(params.quantized_activation_min,
                    params.quantized_activation_max);
   TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
@@ -3735,77 +4055,85 @@ inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
   const int output_width = output_shape.Dims(2);
   const int stride_height = params.stride_height;
   const int stride_width = params.stride_width;
+
+  uint8 acc[kPoolingAccTrancheSize];
   for (int batch = 0; batch < batches; ++batch) {
-    for (int out_y = 0; out_y < output_height; ++out_y) {
-      for (int out_x = 0; out_x < output_width; ++out_x) {
-        const int in_x_origin =
-            (out_x * stride_width) - params.padding_values.width;
-        const int in_y_origin =
-            (out_y * stride_height) - params.padding_values.height;
-        const int filter_x_start = std::max(0, -in_x_origin);
-        const int filter_x_end =
-            std::min(params.filter_width, input_width - in_x_origin);
-        const int filter_y_start = std::max(0, -in_y_origin);
-        const int filter_y_end =
-            std::min(params.filter_height, input_height - in_y_origin);
-        // 2560 is required by MobileNetV2 with depth multiplier 2.
-        static constexpr int kAccBufferMaxSize = 4096;
-        TFLITE_DCHECK_LE(depth, kAccBufferMaxSize);
-        uint8 acc[kAccBufferMaxSize];
-        memset(acc, 0, depth * sizeof(acc[0]));
-        const uint8* input_ptr =
-            input_data +
-            depth * (in_x_origin +
-                     input_width * (in_y_origin + input_height * batch));
-        for (int fy = filter_y_start; fy < filter_y_end; fy++) {
-          const uint8* input_row_ptr =
-              input_ptr + depth * (fy * input_width + filter_x_start);
-          for (int fx = filter_x_start; fx < filter_x_end; fx++) {
-            int channel = 0;
+    // We proceed through the depth in tranches (see comment above). The
+    // depth_base is the depth at the beginning of the tranche. The
+    // tranche_depth is the depth dimension of the tranche.
+    for (int depth_base = 0; depth_base < depth;
+         depth_base += kPoolingAccTrancheSize) {
+      const int tranche_depth =
+          std::min(depth - depth_base, kPoolingAccTrancheSize);
+      for (int out_y = 0; out_y < output_height; ++out_y) {
+        for (int out_x = 0; out_x < output_width; ++out_x) {
+          const int in_x_origin =
+              (out_x * stride_width) - params.padding_values.width;
+          const int in_y_origin =
+              (out_y * stride_height) - params.padding_values.height;
+          const int filter_x_start = std::max(0, -in_x_origin);
+          const int filter_x_end =
+              std::min(params.filter_width, input_width - in_x_origin);
+          const int filter_y_start = std::max(0, -in_y_origin);
+          const int filter_y_end =
+              std::min(params.filter_height, input_height - in_y_origin);
+          memset(acc, 0, tranche_depth * sizeof(acc[0]));
+          const uint8* input_ptr =
+              input_data + depth_base +
+              depth * (in_x_origin +
+                       input_width * (in_y_origin + input_height * batch));
+          for (int fy = filter_y_start; fy < filter_y_end; fy++) {
+            const uint8* input_row_ptr =
+                input_ptr + depth * (fy * input_width + filter_x_start);
+            for (int fx = filter_x_start; fx < filter_x_end; fx++) {
+              const uint8* input_channel_ptr = input_row_ptr;
+              int channel = 0;
 #ifdef USE_NEON
-            for (; channel <= depth - 16; channel += 16) {
-              uint8x16_t acc_reg = vld1q_u8(acc + channel);
-              uint8x16_t input_reg = vld1q_u8(input_row_ptr);
-              input_row_ptr += 16;
-              acc_reg = vmaxq_u8(acc_reg, input_reg);
-              vst1q_u8(acc + channel, acc_reg);
-            }
+              for (; channel <= tranche_depth - 16; channel += 16) {
+                uint8x16_t acc_reg = vld1q_u8(acc + channel);
+                uint8x16_t input_reg = vld1q_u8(input_channel_ptr);
+                input_channel_ptr += 16;
+                acc_reg = vmaxq_u8(acc_reg, input_reg);
+                vst1q_u8(acc + channel, acc_reg);
+              }
 
-            for (; channel <= depth - 8; channel += 8) {
-              uint8x8_t acc_reg = vld1_u8(acc + channel);
-              uint8x8_t input_reg = vld1_u8(input_row_ptr);
-              input_row_ptr += 8;
-              acc_reg = vmax_u8(acc_reg, input_reg);
-              vst1_u8(acc + channel, acc_reg);
-            }
+              for (; channel <= tranche_depth - 8; channel += 8) {
+                uint8x8_t acc_reg = vld1_u8(acc + channel);
+                uint8x8_t input_reg = vld1_u8(input_channel_ptr);
+                input_channel_ptr += 8;
+                acc_reg = vmax_u8(acc_reg, input_reg);
+                vst1_u8(acc + channel, acc_reg);
+              }
 #endif
-            for (; channel < depth; ++channel) {
-              acc[channel] = std::max(acc[channel], *input_row_ptr++);
+              for (; channel < tranche_depth; ++channel) {
+                acc[channel] = std::max(acc[channel], *input_channel_ptr++);
+              }
+              input_row_ptr += depth;
             }
           }
-        }
-        uint8* output_ptr =
-            output_data + Offset(output_shape, batch, out_y, out_x, 0);
-        int channel = 0;
+          uint8* output_ptr = output_data + Offset(output_shape, batch, out_y,
+                                                   out_x, depth_base);
+          int channel = 0;
 #ifdef USE_NEON
-        for (; channel <= depth - 16; channel += 16) {
-          uint8x16_t a = vld1q_u8(acc + channel);
-          a = vminq_u8(a, vdupq_n_u8(params.quantized_activation_max));
-          a = vmaxq_u8(a, vdupq_n_u8(params.quantized_activation_min));
-          vst1q_u8(output_ptr + channel, a);
-        }
-        for (; channel <= depth - 8; channel += 8) {
-          uint8x8_t a = vld1_u8(acc + channel);
-          a = vmin_u8(a, vdup_n_u8(params.quantized_activation_max));
-          a = vmax_u8(a, vdup_n_u8(params.quantized_activation_min));
-          vst1_u8(output_ptr + channel, a);
-        }
+          for (; channel <= tranche_depth - 16; channel += 16) {
+            uint8x16_t a = vld1q_u8(acc + channel);
+            a = vminq_u8(a, vdupq_n_u8(params.quantized_activation_max));
+            a = vmaxq_u8(a, vdupq_n_u8(params.quantized_activation_min));
+            vst1q_u8(output_ptr + channel, a);
+          }
+          for (; channel <= tranche_depth - 8; channel += 8) {
+            uint8x8_t a = vld1_u8(acc + channel);
+            a = vmin_u8(a, vdup_n_u8(params.quantized_activation_max));
+            a = vmax_u8(a, vdup_n_u8(params.quantized_activation_min));
+            vst1_u8(output_ptr + channel, a);
+          }
 #endif
-        for (; channel < depth; ++channel) {
-          uint8 a = acc[channel];
-          a = std::max<uint8>(a, params.quantized_activation_min);
-          a = std::min<uint8>(a, params.quantized_activation_max);
-          output_ptr[channel] = static_cast<uint8>(a);
+          for (; channel < tranche_depth; ++channel) {
+            uint8 a = acc[channel];
+            a = std::max<uint8>(a, params.quantized_activation_min);
+            a = std::min<uint8>(a, params.quantized_activation_max);
+            output_ptr[channel] = static_cast<uint8>(a);
+          }
         }
       }
     }
@@ -4191,119 +4519,6 @@ inline void LogSoftmax(const SoftmaxParams& params,
   }
 }
 
-template <int OutputIntegerBits, int InputIntegerBits>
-inline gemmlowp::FixedPoint<int32, OutputIntegerBits>
-log_x_for_x_greater_than_or_equal_to_1_impl(
-    gemmlowp::FixedPoint<int32, InputIntegerBits> input_val) {
-  // assert(__builtin_clz(0u) >= std::numeric_limits<uint32>::digits - 1);
-  // assert(__builtin_clz(0u) <= std::numeric_limits<uint32>::digits);
-  using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
-  // The reason for accumulating the result with an extra bit of headroom is
-  // that z_pow_2_adj * log_2 might be saturated, and adding num_scaled *
-  // recip_denom will otherwise introduce an error.
-  static constexpr int kAccumIntegerBits = OutputIntegerBits + 1;
-  using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumIntegerBits>;
-
-  const FixedPoint0 log_2 = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 1488522236, std::log(2.0));
-  const FixedPoint0 sqrt_sqrt_half = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 1805811301, std::sqrt(std::sqrt(0.5)));
-  const FixedPoint0 sqrt_half = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 1518500250, std::sqrt(0.5));
-  const FixedPoint0 one_quarter =
-      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(FixedPoint0, 536870912, 1.0 / 4.0);
-
-  const FixedPoint0 alpha_n = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 117049297, 11.0 / 240.0 * std::sqrt(std::sqrt(2.0)));
-  const FixedPoint0 alpha_d = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 127690142, 1.0 / 20.0 * std::sqrt(std::sqrt(2.0)));
-  const FixedPoint0 alpha_i = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 1057819769,
-      2.0 / std::sqrt(std::sqrt(2.0)) - std::sqrt(std::sqrt(2.0)));
-  const FixedPoint0 alpha_f = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 638450708, 1.0 / 4.0 * std::sqrt(std::sqrt(2.0)));
-
-  const FixedPointAccum shifted_quarter =
-      gemmlowp::Rescale<kAccumIntegerBits>(one_quarter);
-
-  // Reinterpret the input value as Q0.31, because we will figure out the
-  // required shift "ourselves" instead of using, say, Rescale.
-  FixedPoint0 z_a = FixedPoint0::FromRaw(input_val.raw());
-  // z_a_pow_2 = input_integer_bits - z_a_headroom;
-  int z_a_headroom_plus_1 = CountLeadingZeros(static_cast<uint32>(z_a.raw()));
-  FixedPoint0 r_a_tmp =
-      SaturatingRoundingMultiplyByPOTParam(z_a, (z_a_headroom_plus_1 - 1));
-  const int32 r_a_raw =
-      SaturatingRoundingMultiplyByPOTParam((r_a_tmp * sqrt_half).raw(), 1);
-  // z_pow_2_adj = max(z_pow_2_a - 0.75, z_pow_2_b - 0.25);
-  // z_pow_2_adj = max(InputIntegerBits - z_a_headroom_plus_1 + 0.25,
-  //                   InputIntegerBits - z_b_headroom - 0.25);
-  const FixedPointAccum z_a_pow_2_adj = SaturatingAddNonGemmlowp(
-      FixedPointAccum::FromRaw(SaturatingRoundingMultiplyByPOTParam(
-          InputIntegerBits - z_a_headroom_plus_1, 31 - kAccumIntegerBits)),
-      shifted_quarter);
-
-  // z_b is treated like z_a, but premultiplying by sqrt(0.5).
-  FixedPoint0 z_b = z_a * sqrt_half;
-  int z_b_headroom = CountLeadingZeros(static_cast<uint32>(z_b.raw())) - 1;
-  const int32 r_b_raw =
-      SaturatingRoundingMultiplyByPOTParam(z_a.raw(), z_b_headroom);
-  const FixedPointAccum z_b_pow_2_adj = SaturatingSub(
-      FixedPointAccum::FromRaw(SaturatingRoundingMultiplyByPOTParam(
-          InputIntegerBits - z_b_headroom, 31 - kAccumIntegerBits)),
-      shifted_quarter);
-
-  const FixedPoint0 r = FixedPoint0::FromRaw(std::min(r_a_raw, r_b_raw));
-  const FixedPointAccum z_pow_2_adj = FixedPointAccum::FromRaw(
-      std::max(z_a_pow_2_adj.raw(), z_b_pow_2_adj.raw()));
-
-  const FixedPoint0 p = gemmlowp::RoundingHalfSum(r, sqrt_sqrt_half);
-  FixedPoint0 q = r - sqrt_sqrt_half;
-  q = q + q;
-
-  const FixedPoint0 common_sq = q * q;
-  const FixedPoint0 num = q * r + q * common_sq * alpha_n;
-  const FixedPoint0 denom_minus_one_0 =
-      p * (alpha_i + q + alpha_d * common_sq) + alpha_f * q;
-  const FixedPoint0 recip_denom =
-      one_over_one_plus_x_for_x_in_0_1(denom_minus_one_0);
-
-  const FixedPointAccum num_scaled = gemmlowp::Rescale<kAccumIntegerBits>(num);
-  return gemmlowp::Rescale<OutputIntegerBits>(z_pow_2_adj * log_2 +
-                                              num_scaled * recip_denom);
-}
-
-// Minimum output bits to accommodate log of maximum input range.  It actually
-// does not matter if one considers, say, [-64,64] or [-64,64).
-//
-// For example, run this through Octave:
-// [0:127; ...
-//  ceil(log(abs( log(2.^(0:127))+1 ))/log(2)); ...
-//  ceil(log(abs( log(2.^(0:127))+1 ))/log(2))]
-constexpr int min_log_x_output_bits(int input_bits) {
-  return input_bits > 90
-             ? 7
-             : input_bits > 44
-                   ? 6
-                   : input_bits > 21
-                         ? 5
-                         : input_bits > 10
-                               ? 4
-                               : input_bits > 4 ? 3 : input_bits > 1 ? 2 : 1;
-}
-
-template <int OutputIntegerBits, int InputIntegerBits>
-inline gemmlowp::FixedPoint<int32, OutputIntegerBits>
-log_x_for_x_greater_than_or_equal_to_1(
-    gemmlowp::FixedPoint<int32, InputIntegerBits> input_val) {
-  static_assert(
-      OutputIntegerBits >= min_log_x_output_bits(InputIntegerBits),
-      "Output integer bits must be sufficent to accommodate logs of inputs.");
-  return log_x_for_x_greater_than_or_equal_to_1_impl<OutputIntegerBits,
-                                                     InputIntegerBits>(
-      input_val);
-}
-
 // Currently just a copy of the reference code.
 inline void LogSoftmax(const SoftmaxParams& params,
                        const RuntimeShape& input_shape, const uint8* input_data,
@@ -4898,6 +5113,14 @@ inline void Floor(const RuntimeShape& input_shape, const float* input_data,
   output_map.array() = Eigen::floor(input_map.array());
 }
 
+inline void Ceil(const RuntimeShape& input_shape, const float* input_data,
+                 const RuntimeShape& output_shape, float* output_data) {
+  gemmlowp::ScopedProfilingLabel label("Ceil");
+  auto input_map = MapAsVector(input_data, input_shape);
+  auto output_map = MapAsVector(output_data, output_shape);
+  output_map.array() = Eigen::ceil(input_map.array());
+}
+
 #ifdef USE_NEON
 inline void ResizeBilinearKernel(const float* input_ptr, int32 depth,
                                  float scale, float* output_ptr) {
@@ -5233,9 +5456,6 @@ inline void ResizeBilinearGenericSmallChannel(
     int32 output_height, int32 output_width, float height_scale,
     float width_scale, const RuntimeShape& input_shape, const T* input_data,
     const RuntimeShape& output_shape, T* output_data) {
-  memset(output_data, 0,
-         batches * output_height * output_width * depth * sizeof(T));
-
   T* output_ptr = &output_data[0];
   for (int b = 0; b < batches; ++b) {
     for (int y = 0; y < output_height; ++y) {
@@ -5244,7 +5464,7 @@ inline void ResizeBilinearGenericSmallChannel(
       int32 y1 = std::min(y0 + 1, input_height - 1);
       for (int x = 0; x < output_width; ++x) {
         float input_x = x * width_scale;
-        int32 x0 = static_cast<int32>(input_x);
+        int32 x0 = static_cast<int32>(std::floor((input_x)));
         int32 x1 = std::min(x0 + 1, input_width - 1);
 
         int32 input_offset[4] = {Offset(input_shape, b, y0, x0, 0),
@@ -5928,7 +6148,27 @@ inline void TransposeConv(
     const float* filter_data, const RuntimeShape& output_shape,
     float* output_data, const RuntimeShape& im2col_shape, float* im2col_data) {
   gemmlowp::ScopedProfilingLabel label("TransposeConv");
-
+  // The complexity of the reference implementation is input.flat_size() *
+  // filter.flat_size() / in_channel.
+  //
+  // While the complexity of im2col->gemm
+  // implmentation is batch * output_height * output_width *
+  // (filter.flat_size() / out_channel)^2 * out_channel.
+  //
+  // so if input.flat_size() * out_channel^2 is much smaller than
+  // output.flat_size() * filter.size() * in_channel we should fall back to the
+  // reference implementation.
+  //
+  // TODO(b/122331966): optimize the intuitive implementation.
+  const int out_channel = output_shape.Dims(3);
+  const int in_channel = input_shape.Dims(3);
+  if ((input_shape.FlatSize() * out_channel * out_channel * 4) <
+      (filter_shape.FlatSize() * output_shape.FlatSize() * in_channel)) {
+    reference_ops::TransposeConv(params, input_shape, input_data, filter_shape,
+                                 filter_data, output_shape, output_data,
+                                 im2col_shape, im2col_data);
+    return;
+  }
   // Note we could use transposed weights with forward conv for unstrided
   // cases. But we are already getting good performance with this code as-is.
   TFLITE_DCHECK(im2col_data);
diff --git a/tensorflow/lite/kernels/internal/optimized/tensor_utils_impl.h b/tensorflow/lite/kernels/internal/optimized/tensor_utils_impl.h
index 8f52ef131dedf4d0270c0346b1094add57f52dfc..00b2d7e063254e2941fd3453f15dbaf2dbd4451e 100644
--- a/tensorflow/lite/kernels/internal/optimized/tensor_utils_impl.h
+++ b/tensorflow/lite/kernels/internal/optimized/tensor_utils_impl.h
@@ -54,6 +54,25 @@ void NeonMatrixBatchVectorMultiplyAccumulate(
     const int8_t* __restrict__ vectors, const float* scaling_factors,
     int n_batch, float* __restrict__ result, int result_stride);
 
+void PortableSparseMatrixBatchVectorMultiplyAccumulate(
+    const float* matrix, const uint8_t* ledger, int m_rows, int m_cols,
+    const float* vector, int n_batch, float* result, int result_stride);
+void NeonSparseMatrixBatchVectorMultiplyAccumulate(
+    const float* matrix, const uint8_t* ledger, int m_rows, int m_cols,
+    const float* vector, int n_batch, float* result, int result_stride);
+
+// Matrix multiplication for quantized values using symmetric quantization.
+void PortableSparseMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
+    const int m_cols, const int8_t* __restrict__ vectors,
+    const float* scaling_factors, int n_batch, float* __restrict__ result,
+    int result_stride);
+void NeonSparseMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
+    const int m_cols, const int8_t* __restrict__ vectors,
+    const float* scaling_factors, int n_batch, float* __restrict__ result,
+    int result_stride);
+
 // Cwise product of two vectors.
 void PortableVectorVectorCwiseProduct(const float* vector1,
                                       const float* vector2, int v_size,
diff --git a/tensorflow/lite/kernels/internal/quantization_util.cc b/tensorflow/lite/kernels/internal/quantization_util.cc
index 0279d2a9229e02721c01d15d380db1919b7bfd23..71eef71372c0afd17c0dd3e416648dd20e983ba3 100644
--- a/tensorflow/lite/kernels/internal/quantization_util.cc
+++ b/tensorflow/lite/kernels/internal/quantization_util.cc
@@ -366,4 +366,13 @@ bool CheckedLog2(const float x, int* log2_result) {
   return std::abs(x_log2_fracpart) < 1e-3;
 }
 
+void QuantizeMultiplierArray(const double* effective_scales, size_t size,
+                             int32_t* effective_scale_significand,
+                             int* effective_shift) {
+  for (size_t i = 0; i < size; ++i) {
+    QuantizeMultiplier(effective_scales[i], &effective_scale_significand[i],
+                       &effective_shift[i]);
+  }
+}
+
 }  // namespace tflite
diff --git a/tensorflow/lite/kernels/internal/quantization_util.h b/tensorflow/lite/kernels/internal/quantization_util.h
index bf313f39cd8b407f6fb57dcbdf0540e98d96b7e8..5d67c0d0277b84f5c1a74871d9acdd652beef83b 100644
--- a/tensorflow/lite/kernels/internal/quantization_util.h
+++ b/tensorflow/lite/kernels/internal/quantization_util.h
@@ -275,6 +275,17 @@ void FakeQuantizeArray(const float nudged_scale, const float nudged_min,
 // returns false.
 bool CheckedLog2(const float x, int* log2_result);
 
+// Decomposes an array of double multipliers into a Q0.31 int32 representation
+// of its significand, and shift representation of its exponent.
+//
+// Handles an arbitrary multiplier. The 'shift' output-value is
+// basically the 'floating-point exponent' of the multiplier:
+// Negative for a right-shift (when the multiplier is <1), positive for a
+// left-shift (when the multiplier is >1)
+void QuantizeMultiplierArray(const double* effective_scales, size_t size,
+                             int32_t* effective_scale_significand,
+                             int* effective_shift);
+
 }  // namespace tflite
 
 #endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_QUANTIZATION_UTIL_H_
diff --git a/tensorflow/lite/kernels/internal/quantization_util_test.cc b/tensorflow/lite/kernels/internal/quantization_util_test.cc
index 2f8f7713795bf0e736fe85fcb582744974654b9e..ca4ff370ad4dff4bc6c58a074ce96a8a52029d9e 100644
--- a/tensorflow/lite/kernels/internal/quantization_util_test.cc
+++ b/tensorflow/lite/kernels/internal/quantization_util_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 namespace tflite {
 namespace {
 
+using ::testing::ElementsAreArray;
 using ::testing::Pair;
 
 template <class FloatIn, class IntOut>
@@ -406,6 +407,52 @@ TEST(QuantizationUtilTest, CalculateInputRadius) {
   EXPECT_EQ(CalculateInputRadius(4, 2), 503316480);
 }
 
+TEST(QuantizationUtilTest, QuantizeMultiplierArray) {
+  const std::vector<double> weights = {-4,    -2,   -1,  -0.5, -0.25, -0.125, 0,
+                                       0.125, 0.25, 0.5, 1,    2,     4};
+  const int size = weights.size();
+  std::vector<int32> effective_scale_significand(size);
+  std::vector<int> effective_scale_shift(size);
+  QuantizeMultiplierArray(weights.data(), size,
+                          effective_scale_significand.data(),
+                          effective_scale_shift.data());
+  const std::vector<int32> expected_effective_scale_significand = {
+      -1073741824,  // float scale = -4
+      -1073741824,  // float scale = -2
+      -1073741824,  // float scale = -1
+      -1073741824,  // float scale = -0.5
+      -1073741824,  // float scale = -0.25
+      -1073741824,  // float scale = -0.125
+      0,            // float scale = 0
+      1073741824,   // float scale = 0.125
+      1073741824,   // float scale = 0.25
+      1073741824,   // float scale = 0.5
+      1073741824,   // float scale = 1
+      1073741824,   // float scale = 2
+      1073741824,   // float scale = 4
+  };
+
+  const std::vector<int> expected_effective_scale_shift = {
+      3,   // float scale = -4
+      2,   // float scale = -2
+      1,   // float scale = -1
+      0,   // float scale = -0.5
+      -1,  // float scale = -0.25
+      -2,  // float scale = -0.125
+      0,   // float scale = 0
+      -2,  // float scale = 0.125
+      -1,  // float scale = 0.25
+      0,   // float scale = 0.5
+      1,   // float scale = 1
+      2,   // float scale = 2
+      3,   // float scale = 4
+  };
+  EXPECT_THAT(effective_scale_significand,
+              ElementsAreArray(expected_effective_scale_significand));
+  EXPECT_THAT(effective_scale_shift,
+              ElementsAreArray(expected_effective_scale_shift));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h b/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h
index 002444b6810925910a651dd5c919a46ac8e5fb47..c38f37416dde30cf16a41d6cc6f08dc40f3dfe7d 100644
--- a/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h
+++ b/tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h
@@ -23,90 +23,173 @@ limitations under the License.
 #include "tensorflow/lite/kernels/internal/types.h"
 
 namespace tflite {
+
+// Used in tests and template parameters to control which version of depthwise
+// convolution is called. Primarily for reference code, and specializations
+// forced in tests.
+enum class DepthwiseConvImplementation {
+  // Run all tests against kUseStandardEntry even if also testing another
+  // kernel, since we need to be sure that the main DepthwiseConv() function in
+  // optimized_ops.h dispatches to a correctly-executing kernel.
+  kNone = 0,                 // The "default" option: use the normal
+                             // DepthwiseConv kernel (entry) function.
+  kUseGenericKernel,         // Forced use of generic kernel.
+  kUseNeon3x3,               // 3x3 kernel that uses NEON when available.
+  kUseNeon3x3DotProduct,     // 3x3 kernel that uses dot-product enabled NEON
+                             // when available.
+  kUseCModel3x3DotProduct,   // 3x3 kernel, reference C model that is intended
+                             // to match overall design NEON code.
+  kUseUnwound3x3DotProduct,  // 3x3 kernel, reference C model with unwound loops
+                             // and some arrays.
+  kUseIntrinsics3x3DotProduct,  // 3x3 kernel using NEON intrinsics.
+};
+
+// Category of depthwise convolution output rounding.
+enum class DepthwiseConvOutputRounding {
+  kNone = 0,      // Invalid: specific method must be specified.
+  kAwayFromZero,  // Original method: exact halves rounded away from zero.
+  kUpward,        // Halves towards +infinity: adds 0.5 before truncate.
+  // This is where a future kNearestEven would be placed.
+};
+
+// Category of depthwise convolution depth multiplication.
+enum class DepthwiseConvDepthMultiplication {
+  kNoMultiplication = 0,  // Depth multiplier = 1.
+  kUnitInputDepth,        // Input depth = 1, output depth = depth multiplier.
+};
+
 namespace reference_ops {
+namespace depthwise_conv {
 
-inline void DepthwiseConv(
-    const DepthwiseParams& params, const RuntimeShape& input_shape,
-    const uint8* input_data, const RuntimeShape& filter_shape,
-    const uint8* filter_data, const RuntimeShape& bias_shape,
-    const int32* bias_data, const RuntimeShape& output_shape,
-    uint8* output_data) {
-  const int stride_width = params.stride_width;
-  const int stride_height = params.stride_height;
-  const int dilation_width_factor = params.dilation_width_factor;
-  const int dilation_height_factor = params.dilation_height_factor;
-  const int pad_width = params.padding_values.width;
-  const int pad_height = params.padding_values.height;
-  const int depth_multiplier = params.depth_multiplier;
-  const int32 output_activation_min = params.quantized_activation_min;
-  const int32 output_activation_max = params.quantized_activation_max;
-  const int32 input_offset = params.input_offset;
-  const int32 filter_offset = params.weights_offset;
-  const int32 output_offset = params.output_offset;
-  const int32 output_multiplier = params.output_multiplier;
-  const int output_shift = params.output_shift;
-  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
-  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
-  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
-
-  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
-  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
-  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
-  const int input_height = input_shape.Dims(1);
-  const int input_width = input_shape.Dims(2);
-  const int input_depth = input_shape.Dims(3);
-  const int filter_height = filter_shape.Dims(1);
-  const int filter_width = filter_shape.Dims(2);
-  const int output_height = output_shape.Dims(1);
-  const int output_width = output_shape.Dims(2);
-  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
-  TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
-
-  for (int b = 0; b < batches; ++b) {
-    for (int out_y = 0; out_y < output_height; ++out_y) {
-      for (int out_x = 0; out_x < output_width; ++out_x) {
-        for (int ic = 0; ic < input_depth; ++ic) {
-          for (int m = 0; m < depth_multiplier; m++) {
-            const int oc = m + ic * depth_multiplier;
-            const int in_x_origin = (out_x * stride_width) - pad_width;
-            const int in_y_origin = (out_y * stride_height) - pad_height;
-            int32 acc = 0;
-            for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
-              for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
-                const int in_x = in_x_origin + dilation_width_factor * filter_x;
-                const int in_y =
-                    in_y_origin + dilation_height_factor * filter_y;
-                // If the location is outside the bounds of the input image,
-                // use zero as a default value.
-                if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
-                    (in_y < input_height)) {
-                  int32 input_val =
-                      input_data[Offset(input_shape, b, in_y, in_x, ic)];
-                  int32 filter_val = filter_data[Offset(
-                      filter_shape, 0, filter_y, filter_x, oc)];
-                  acc +=
-                      (filter_val + filter_offset) * (input_val + input_offset);
+template <DepthwiseConvOutputRounding output_rounding>
+inline int32 DepthwiseConvRound(int32 x, int32 quantized_multiplier,
+                                int shift) {
+  TFLITE_DCHECK_NE(output_rounding, DepthwiseConvOutputRounding::kNone);
+  return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
+}
+
+template <>
+inline int32 DepthwiseConvRound<DepthwiseConvOutputRounding::kAwayFromZero>(
+    int32 x, int32 quantized_multiplier, int shift) {
+  return MultiplyByQuantizedMultiplier(x, quantized_multiplier, shift);
+}
+
+template <>
+inline int32 DepthwiseConvRound<DepthwiseConvOutputRounding::kUpward>(
+    int32 x, int32 quantized_multiplier, int shift) {
+  using gemmlowp::SaturatingRoundingDoublingHighMul;
+  const int left_shift = shift > 0 ? shift : 0;
+  const int right_shift = shift > 0 ? 0 : -shift;
+  const int rounding_offset = right_shift > 0 ? 1 << (right_shift - 1) : 0;
+  return (SaturatingRoundingDoublingHighMul(x * (1 << left_shift),
+                                            quantized_multiplier) +
+          rounding_offset) >>
+         right_shift;
+}
+
+template <DepthwiseConvOutputRounding output_rounding>
+struct DepthwiseConvBasicKernel {
+  static inline void Run(const DepthwiseParams& params,
+                         const RuntimeShape& input_shape,
+                         const uint8* input_data,
+                         const RuntimeShape& filter_shape,
+                         const uint8* filter_data,
+                         const RuntimeShape& bias_shape, const int32* bias_data,
+                         const RuntimeShape& output_shape, uint8* output_data) {
+    const int stride_width = params.stride_width;
+    const int stride_height = params.stride_height;
+    const int dilation_width_factor = params.dilation_width_factor;
+    const int dilation_height_factor = params.dilation_height_factor;
+    const int pad_width = params.padding_values.width;
+    const int pad_height = params.padding_values.height;
+    const int depth_multiplier = params.depth_multiplier;
+    const int32 output_activation_min = params.quantized_activation_min;
+    const int32 output_activation_max = params.quantized_activation_max;
+    const int32 input_offset = params.input_offset;
+    const int32 filter_offset = params.weights_offset;
+    const int32 output_offset = params.output_offset;
+    const int32 output_multiplier = params.output_multiplier;
+    const int output_shift = params.output_shift;
+    TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+    TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+    TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+
+    TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+    const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+    const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+    const int input_height = input_shape.Dims(1);
+    const int input_width = input_shape.Dims(2);
+    const int input_depth = input_shape.Dims(3);
+    const int filter_height = filter_shape.Dims(1);
+    const int filter_width = filter_shape.Dims(2);
+    const int output_height = output_shape.Dims(1);
+    const int output_width = output_shape.Dims(2);
+    TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
+    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+
+    for (int b = 0; b < batches; ++b) {
+      for (int out_y = 0; out_y < output_height; ++out_y) {
+        for (int out_x = 0; out_x < output_width; ++out_x) {
+          for (int ic = 0; ic < input_depth; ++ic) {
+            for (int m = 0; m < depth_multiplier; m++) {
+              const int oc = m + ic * depth_multiplier;
+              const int in_x_origin = (out_x * stride_width) - pad_width;
+              const int in_y_origin = (out_y * stride_height) - pad_height;
+              int32 acc = 0;
+              for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+                for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+                  const int in_x =
+                      in_x_origin + dilation_width_factor * filter_x;
+                  const int in_y =
+                      in_y_origin + dilation_height_factor * filter_y;
+                  // If the location is outside the bounds of the input image,
+                  // use zero as a default value.
+                  if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                      (in_y < input_height)) {
+                    int32 input_val =
+                        input_data[Offset(input_shape, b, in_y, in_x, ic)];
+                    int32 filter_val = filter_data[Offset(
+                        filter_shape, 0, filter_y, filter_x, oc)];
+                    acc += (filter_val + filter_offset) *
+                           (input_val + input_offset);
+                  }
                 }
               }
+              if (bias_data) {
+                acc += bias_data[oc];
+              }
+              acc = DepthwiseConvRound<output_rounding>(acc, output_multiplier,
+                                                        output_shift);
+              acc += output_offset;
+              acc = std::max(acc, output_activation_min);
+              acc = std::min(acc, output_activation_max);
+              output_data[Offset(output_shape, b, out_y, out_x, oc)] =
+                  static_cast<uint8>(acc);
             }
-            if (bias_data) {
-              acc += bias_data[oc];
-            }
-            acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
-                                                output_shift);
-            acc += output_offset;
-            acc = std::max(acc, output_activation_min);
-            acc = std::min(acc, output_activation_max);
-            output_data[Offset(output_shape, b, out_y, out_x, oc)] =
-                static_cast<uint8>(acc);
           }
         }
       }
     }
   }
+};
+
+}  // namespace depthwise_conv
+
+inline void DepthwiseConv(
+    const DepthwiseParams& params, const RuntimeShape& input_shape,
+    const uint8* input_data, const RuntimeShape& filter_shape,
+    const uint8* filter_data, const RuntimeShape& bias_shape,
+    const int32* bias_data, const RuntimeShape& output_shape,
+    uint8* output_data) {
+  return depthwise_conv::DepthwiseConvBasicKernel<
+      DepthwiseConvOutputRounding::kAwayFromZero>::Run(params, input_shape,
+                                                       input_data, filter_shape,
+                                                       filter_data, bias_shape,
+                                                       bias_data, output_shape,
+                                                       output_data);
 }
 
-}  // end namespace reference_ops
+}  // namespace reference_ops
 }  // end namespace tflite
 
 #endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_DEPTHWISECONV_UINT8_H_
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/add.h b/tensorflow/lite/kernels/internal/reference/integer_ops/add.h
new file mode 100644
index 0000000000000000000000000000000000000000..a694ba2aaa993b0631958e0b338a7a62e154de75
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/add.h
@@ -0,0 +1,144 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_ADD_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_ADD_H_
+
+#include <limits>
+#include "public/gemmlowp.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+
+// Element-wise add that can often be used for inner loop of broadcast add as
+// well as the non-broadcast add.
+inline void AddElementwise(int size, const ArithmeticParams& params,
+                           const int8_t* input1_data, const int8_t* input2_data,
+                           int8_t* output_data) {
+  const int32_t int8_max_value = std::numeric_limits<int8_t>::max();
+  TFLITE_DCHECK_GE(params.input1_offset, -1 * int8_max_value);
+  TFLITE_DCHECK_GE(params.input2_offset, -1 * int8_max_value);
+  TFLITE_DCHECK_LE(params.input1_offset, int8_max_value);
+  TFLITE_DCHECK_LE(params.input2_offset, int8_max_value);
+
+  for (int i = 0; i < size; ++i) {
+    const int32 input1_val = params.input1_offset + input1_data[i];
+    const int32 input2_val = params.input2_offset + input2_data[i];
+    const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
+    const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
+    const int32 scaled_input1_val =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            shifted_input1_val, params.input1_multiplier, params.input1_shift);
+    const int32 scaled_input2_val =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            shifted_input2_val, params.input2_multiplier, params.input2_shift);
+    const int32 raw_sum = scaled_input1_val + scaled_input2_val;
+    const int32 raw_output =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            raw_sum, params.output_multiplier, params.output_shift) +
+        params.output_offset;
+    const int32 clamped_output =
+        std::min(params.quantized_activation_max,
+                 std::max(params.quantized_activation_min, raw_output));
+    output_data[i] = static_cast<int8_t>(clamped_output);
+  }
+}
+
+inline void Add(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const int8_t* input1_data,
+                const RuntimeShape& input2_shape, const int8_t* input2_data,
+                const RuntimeShape& output_shape, int8_t* output_data) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+  const int flat_size =
+      MatchingFlatSize(input1_shape, input2_shape, output_shape);
+
+  const int32_t int8_max_value = std::numeric_limits<int8_t>::max();
+  TFLITE_DCHECK_GE(params.input1_offset, -1 * int8_max_value);
+  TFLITE_DCHECK_GE(params.input2_offset, -1 * int8_max_value);
+  TFLITE_DCHECK_LE(params.input1_offset, int8_max_value);
+  TFLITE_DCHECK_LE(params.input2_offset, int8_max_value);
+  AddElementwise(flat_size, params, input1_data, input2_data, output_data);
+}
+
+inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
+                               const RuntimeShape& input1_shape,
+                               const int8_t* input1_data,
+                               const RuntimeShape& input2_shape,
+                               const int8_t* input2_data,
+                               const RuntimeShape& output_shape,
+                               int8_t* output_data) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastAdd4DSlow/int8");
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+                                      &desc2);
+  const RuntimeShape extended_output_shape =
+      RuntimeShape::ExtendedShape(4, output_shape);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
+    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
+      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
+        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
+          const int32_t input1_val =
+              params.input1_offset +
+              input1_data[SubscriptToIndex(desc1, b, y, x, c)];
+          const int32_t input2_val =
+              params.input2_offset +
+              input2_data[SubscriptToIndex(desc2, b, y, x, c)];
+          const int32_t shifted_input1_val =
+              input1_val * (1 << params.left_shift);
+          const int32_t shifted_input2_val =
+              input2_val * (1 << params.left_shift);
+          const int32_t scaled_input1_val =
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  shifted_input1_val, params.input1_multiplier,
+                  params.input1_shift);
+          const int32_t scaled_input2_val =
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  shifted_input2_val, params.input2_multiplier,
+                  params.input2_shift);
+          const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
+          const int32_t raw_output =
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  raw_sum, params.output_multiplier, params.output_shift) +
+              params.output_offset;
+          const int32_t clamped_output =
+              std::min(params.quantized_activation_max,
+                       std::max(params.quantized_activation_min, raw_output));
+          output_data[Offset(extended_output_shape, b, y, x, c)] =
+              static_cast<int8_t>(clamped_output);
+        }
+      }
+    }
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_ADD_H_
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h b/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h
new file mode 100644
index 0000000000000000000000000000000000000000..14e449e2bb32de6bcb54d7a9d2442764aa49cbe8
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/conv.h
@@ -0,0 +1,128 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_CONV_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_CONV_H_
+
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+
+// Fixed-point per-channel-quantization convolution reference kernel.
+inline void ConvPerChannel(
+    const ConvParams& params, const int32* output_multiplier,
+    const int32* output_shift, const RuntimeShape& input_shape,
+    const int8* input_data, const RuntimeShape& filter_shape,
+    const int8* filter_data, const RuntimeShape& bias_shape,
+    const int32* bias_data, const RuntimeShape& output_shape,
+    int8* output_data) {
+  // Get parameters.
+  const int32 input_offset = params.input_offset;  // r = s(q - Z)
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const int32 output_offset = params.output_offset;
+
+  // Set min and max value of the output.
+  const int32 output_activation_min = std::numeric_limits<int8_t>::min();
+  const int32 output_activation_max = std::numeric_limits<int8_t>::max();
+
+  // Sanity check.
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
+  const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+  if (bias_data) {
+    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+  }
+
+  // Check dimensions of the tensors.
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
+          const int in_x_origin = (out_x * stride_width) - pad_width;
+          const int in_y_origin = (out_y * stride_height) - pad_height;
+          int32 acc = 0;
+          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+              for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
+                const int in_x = in_x_origin + dilation_width_factor * filter_x;
+                const int in_y =
+                    in_y_origin + dilation_height_factor * filter_y;
+                // Zero padding by omitting the areas outside the image.
+                const bool is_point_inside_image =
+                    (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                    (in_y < input_height);
+                if (is_point_inside_image) {
+                  int32 input_val = input_data[Offset(input_shape, batch, in_y,
+                                                      in_x, in_channel)];
+                  int32 filter_val =
+                      filter_data[Offset(filter_shape, out_channel, filter_y,
+                                         filter_x, in_channel)];
+                  // Accumulate with 32 bits accumulator.
+                  // In the nudging process during model quantization, we force
+                  // real value of 0.0 be represented by a quantized value. This
+                  // guarantees that the input_offset is a int8, even though it
+                  // is represented using int32.
+                  // int32 += int8 * (int8 - int8) so the highest value we can
+                  // get from each accumulation is [-127, 127] * ([-128, 127] -
+                  // [-128, 127]), which is [-32512, 32512]. log2(32512)
+                  // = 14.98, which means we can accumulate at least 2^16
+                  // multiplications without overflow. The accumulator is
+                  // applied to a filter so the accumation logic will hold as
+                  // long as the filter size (filter_y * filter_x * in_channel)
+                  // does not exceed 2^16, which is the case in all the models
+                  // we have seen so far.
+                  // TODO(jianlijianli): Add a check to make sure the
+                  // accumulator depth is smaller than 2^16.
+                  acc += filter_val * (input_val - input_offset);
+                }
+              }
+            }
+          }
+
+          if (bias_data) {
+            acc += bias_data[out_channel];
+          }
+          acc = MultiplyByQuantizedMultiplier(
+              acc, output_multiplier[out_channel], output_shift[out_channel]);
+          acc += output_offset;
+          acc = std::max(acc, output_activation_min);
+          acc = std::min(acc, output_activation_max);
+          output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
+              static_cast<int8_t>(acc);
+        }
+      }
+    }
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_CONV_H_
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h b/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h
new file mode 100644
index 0000000000000000000000000000000000000000..90a7d613a92df70f54e989705d077c6660e66db1
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h
@@ -0,0 +1,125 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_
+
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+inline void DepthwiseConvPerChannel(
+    const DepthwiseParams& params, const int32* output_multiplier,
+    const int32* output_shift, const RuntimeShape& input_shape,
+    const int8* input_data, const RuntimeShape& filter_shape,
+    const int8* filter_data, const RuntimeShape& bias_shape,
+    const int32* bias_data, const RuntimeShape& output_shape,
+    int8* output_data) {
+  // Get parameters.
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const int depth_multiplier = params.depth_multiplier;
+  const int32 input_offset = params.input_offset;
+  const int32 output_offset = params.output_offset;
+
+  // Set min and max value of the output.
+  const int32 output_activation_min = std::numeric_limits<int8_t>::min();
+  const int32 output_activation_max = std::numeric_limits<int8_t>::max();
+
+  // Check dimensions of the tensors.
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int input_depth = input_shape.Dims(3);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
+  TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
+
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
+          for (int m = 0; m < depth_multiplier; ++m) {
+            const int output_channel = m + in_channel * depth_multiplier;
+            const int in_x_origin = (out_x * stride_width) - pad_width;
+            const int in_y_origin = (out_y * stride_height) - pad_height;
+            int32 acc = 0;
+            for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+              for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+                const int in_x = in_x_origin + dilation_width_factor * filter_x;
+                const int in_y =
+                    in_y_origin + dilation_height_factor * filter_y;
+                // Zero padding by omitting the areas outside the image.
+                const bool is_point_inside_image =
+                    (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                    (in_y < input_height);
+                if (is_point_inside_image) {
+                  int32 input_val = input_data[Offset(input_shape, batch, in_y,
+                                                      in_x, in_channel)];
+                  int32 filter_val = filter_data[Offset(
+                      filter_shape, 0, filter_y, filter_x, output_channel)];
+                  // Accumulate with 32 bits accumulator.
+                  // In the nudging process during model quantization, we force
+                  // real value of 0.0 be represented by a quantized value. This
+                  // guarentees that the input_offset is a int8, even though it
+                  // is represented using int32.
+                  // int32 += int8 * (int8 - int8) so the highest value we can
+                  // get from each accumulation is [-127, 127] * ([-128, 127] -
+                  // [-128, 127]), which is [-32512, 32512]. log2(32512)
+                  // = 14.98, which means we can accumulate at least 2^16
+                  // multiplications without overflow. The accumulator is
+                  // applied to a filter so the accumation logic will hold as
+                  // long as the filter size (filter_y * filter_x * in_channel)
+                  // does not exceed 2^16, which is the case in all the models
+                  // we have seen so far.
+                  // TODO(jianlijianli): Add a check to make sure the
+                  // accumulator depth is smaller than 2^16.
+                  acc += filter_val * (input_val - input_offset);
+                }
+              }
+            }
+            if (bias_data) {
+              acc += bias_data[output_channel];
+            }
+            acc = MultiplyByQuantizedMultiplier(
+                acc, output_multiplier[output_channel],
+                output_shift[output_channel]);
+            acc += output_offset;
+            acc = std::max(acc, output_activation_min);
+            acc = std::min(acc, output_activation_max);
+            output_data[Offset(output_shape, batch, out_y, out_x,
+                               output_channel)] = static_cast<int8_t>(acc);
+          }
+        }
+      }
+    }
+  }
+}
+}  // namespace reference_integer_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h b/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h
new file mode 100644
index 0000000000000000000000000000000000000000..36b349f4d49bb66d6f60ed92629945640e24cb9c
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h
@@ -0,0 +1,69 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_FULLY_CONNECTED_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_FULLY_CONNECTED_H_
+
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+
+inline void FullyConnected(
+    const FullyConnectedParams& params, const RuntimeShape& input_shape,
+    const int8_t* input_data, const RuntimeShape& filter_shape,
+    const int8_t* filter_data, const RuntimeShape& bias_shape,
+    const int32* bias_data, const RuntimeShape& output_shape,
+    int8_t* output_data, void* gemm_context) {
+  (void)gemm_context;  // only used in optimized code.
+  const int32 input_offset = params.input_offset;
+  const int32 filter_offset = params.weights_offset;
+  const int32 output_offset = params.output_offset;
+  const int32 output_multiplier = params.output_multiplier;
+  const int output_shift = params.output_shift;
+  const int32 output_activation_min = params.quantized_activation_min;
+  const int32 output_activation_max = params.quantized_activation_max;
+  TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 2);
+
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  const int filter_dim_count = filter_shape.DimensionsCount();
+  const int batches = output_shape.Dims(0);
+  const int output_depth = output_shape.Dims(1);
+  TFLITE_DCHECK_LE(output_depth, filter_shape.Dims(filter_dim_count - 2));
+  const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
+  for (int b = 0; b < batches; ++b) {
+    for (int out_c = 0; out_c < output_depth; ++out_c) {
+      int32 acc = 0;
+      for (int d = 0; d < accum_depth; ++d) {
+        int32 input_val = input_data[b * accum_depth + d];
+        int32 filter_val = filter_data[out_c * accum_depth + d];
+        acc += (filter_val + filter_offset) * (input_val + input_offset);
+      }
+      if (bias_data) {
+        acc += bias_data[out_c];
+      }
+      acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
+      acc += output_offset;
+      acc = std::max(acc, output_activation_min);
+      acc = std::min(acc, output_activation_max);
+      output_data[out_c + output_depth * b] = static_cast<int8_t>(acc);
+    }
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_FULLY_CONNECTED_H_
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/log_softmax.h b/tensorflow/lite/kernels/internal/reference/integer_ops/log_softmax.h
new file mode 100644
index 0000000000000000000000000000000000000000..f22bb4f13803cf4e14c8b4fd18b9c301fab07359
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/log_softmax.h
@@ -0,0 +1,111 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_LOG_SOFTMAX_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_LOG_SOFTMAX_H_
+
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+
+inline void LogSoftmax(int32_t input_multiplier, int32_t input_shift,
+                       int32_t reverse_multiplier, int32_t reverse_shift,
+                       int32_t diff_min, int32_t outer_size, int32_t depth,
+                       const int8* input_data, int8* output_data) {
+  static constexpr int8_t kMinInt8 = std::numeric_limits<int8_t>::min();
+  static constexpr int8_t kMaxInt8 = std::numeric_limits<int8_t>::max();
+  static constexpr int32_t kMinInt32 = std::numeric_limits<int32_t>::min();
+
+  // [-16, 0] is mapped to [-128, 127] with 1/16 as scale and 127 as zero
+  // point. This nudges the output to [-255/16, 0].
+  static constexpr int32_t kOutputZeroPoint = 127;
+
+  // All IntegerBits must agree with Prepare function.
+  // Input is chosen as Q5.26 so exp(-1 * 2^5 * 2^-1) = exp(-16) is negligible.
+  static constexpr int kInputIntegerBits = 5;
+  static constexpr int kAccumulationIntegerBits = 12;
+  static constexpr int kOutputIntegerBits = 4;
+  using F5 = gemmlowp::FixedPoint<int32, kInputIntegerBits>;
+  using F12 = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
+
+  for (int outer_index = 0; outer_index < outer_size; ++outer_index) {
+    int8 max_in_row = kMinInt8;
+    for (int inner_index = 0; inner_index < depth; ++inner_index) {
+      max_in_row =
+          std::max(max_in_row, input_data[outer_index * depth + inner_index]);
+    }
+
+    // Accumulator "sum_of_exps_in_q12" is safe from overflowing in 2^12 steps.
+    F12 sum_of_exps_in_q12 = F12::FromRaw(0);
+    for (int inner_index = 0; inner_index < depth; ++inner_index) {
+      int32_t input_diff =
+          static_cast<int32_t>(input_data[outer_index * depth + inner_index]) -
+          max_in_row;
+      if (input_diff >= diff_min) {
+        const int32_t input_diff_in_q5 = MultiplyByQuantizedMultiplier(
+            input_diff, input_multiplier, input_shift);
+        sum_of_exps_in_q12 =
+            sum_of_exps_in_q12 +
+            gemmlowp::Rescale<kAccumulationIntegerBits>(
+                exp_on_negative_values(F5::FromRaw(input_diff_in_q5)));
+      }
+    }
+
+    const int32_t log_sum_of_exps_in_q5 =
+        log_x_for_x_greater_than_or_equal_to_1<kInputIntegerBits>(
+            sum_of_exps_in_q12)
+            .raw();
+
+    // Potentially reduced the valid range. shifted_log_sum_of_exps_in_q5 is
+    // smallest representable in Q5.26 plus the log_sum_of_exps.
+    const int32_t shifted_log_sum_of_exps_in_q5 =
+        log_sum_of_exps_in_q5 + kMinInt32;
+    const int32_t adjusted_diff_min = std::max(
+        diff_min - 1,
+        MultiplyByQuantizedMultiplier(shifted_log_sum_of_exps_in_q5,
+                                      reverse_multiplier, -reverse_shift));
+
+    for (int inner_index = 0; inner_index < depth; ++inner_index) {
+      int32_t input_diff =
+          static_cast<int32_t>(input_data[outer_index * depth + inner_index]) -
+          max_in_row;
+      // Note use of > below instead of >= above.
+      if (input_diff > adjusted_diff_min) {
+        const int32_t input_diff_in_q5 = MultiplyByQuantizedMultiplier(
+            input_diff, input_multiplier, input_shift);
+
+        // Rescale and downcast.
+        int32_t output_in_q27 =
+            gemmlowp::RoundingDivideByPOT(
+                (input_diff_in_q5 - log_sum_of_exps_in_q5),
+                31 - kInputIntegerBits - kOutputIntegerBits) +
+            kOutputZeroPoint;
+
+        output_in_q27 =
+            std::max(std::min(output_in_q27, static_cast<int32_t>(kMaxInt8)),
+                     static_cast<int32_t>(kMinInt8));
+        output_data[outer_index * depth + inner_index] =
+            static_cast<int8_t>(output_in_q27);
+      } else {
+        output_data[outer_index * depth + inner_index] = kMinInt8;
+      }
+    }
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_LOG_SOFTMAX_H_
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h b/tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h
new file mode 100644
index 0000000000000000000000000000000000000000..8277c3b3d565d845da4cc8931a4256c1005db77c
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h
@@ -0,0 +1,64 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_LOGISTIC_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_LOGISTIC_H_
+
+#include <limits>
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+
+inline void Logistic(int32_t input_zero_point, int32_t input_range_radius,
+                     int32_t input_multiplier, int32_t input_left_shift,
+                     int32_t input_size, const int8_t* input_data,
+                     int8_t* output_data) {
+  // Integer bits must be in sync with Prepare() function.
+  static constexpr int32_t kInputIntegerBits = 4;
+  static constexpr int32_t kOutputIntegerBits = 8;
+  static constexpr int8_t kMinInt8 = std::numeric_limits<int8_t>::min();
+  static constexpr int8_t kMaxInt8 = std::numeric_limits<int8_t>::max();
+  static constexpr int32_t kOutputZeroPoint = -128;
+
+  for (int i = 0; i < input_size; ++i) {
+    const int32_t input =
+        static_cast<int32_t>(input_data[i]) - input_zero_point;
+    if (input <= -input_range_radius) {
+      output_data[i] = kMinInt8;
+    } else if (input >= input_range_radius) {
+      output_data[i] = kMaxInt8;
+    } else {
+      const int32_t input_in_q4 = MultiplyByQuantizedMultiplier(
+          input, input_multiplier, input_left_shift);
+      using FixedPoint4 = gemmlowp::FixedPoint<int32_t, kInputIntegerBits>;
+      const int32_t output_in_q0 =
+          gemmlowp::logistic(FixedPoint4::FromRaw(input_in_q4)).raw();
+
+      // Rescale and downcast.
+      using gemmlowp::RoundingDivideByPOT;
+      int32_t output_in_q23 =
+          RoundingDivideByPOT(output_in_q0, 31 - kOutputIntegerBits);
+      output_in_q23 = std::min(std::max(output_in_q23 + kOutputZeroPoint,
+                                        static_cast<int32_t>(kMinInt8)),
+                               static_cast<int32_t>(kMaxInt8));
+      output_data[i] = static_cast<int8_t>(output_in_q23);
+    }
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_LOGISTIC_H_
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h b/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h
new file mode 100644
index 0000000000000000000000000000000000000000..5e33d089945a2907e489c51c117eec77b194ed7e
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/mul.h
@@ -0,0 +1,130 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MUL_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MUL_H_
+
+#include "public/gemmlowp.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+
+inline void MulElementwise(int size, const ArithmeticParams& params,
+                           const int8_t* input1_data, const int8_t* input2_data,
+                           int8_t* output_data) {
+  for (int i = 0; i < size; ++i) {
+    const int32 input1_val = params.input1_offset + input1_data[i];
+    const int32 input2_val = params.input2_offset + input2_data[i];
+    const int32 unclamped_result =
+        params.output_offset +
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(input1_val * input2_val,
+                                                       params.output_multiplier,
+                                                       params.output_shift);
+    const int32 clamped_output =
+        std::min(params.quantized_activation_max,
+                 std::max(params.quantized_activation_min, unclamped_result));
+    output_data[i] = static_cast<int8_t>(clamped_output);
+  }
+}
+
+inline void Mul(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const int8_t* input1_data,
+                const RuntimeShape& input2_shape, const int8_t* input2_data,
+                const RuntimeShape& output_shape, int8_t* output_data) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+  gemmlowp::ScopedProfilingLabel label("Mul/8bit");
+  const int flat_size =
+      MatchingFlatSize(input1_shape, input2_shape, output_shape);
+
+  MulElementwise(flat_size, params, input1_data, input2_data, output_data);
+}
+
+// Mul with 16 bit inputs and int8_t outputs.
+inline void Mul(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const int16* input1_data,
+                const RuntimeShape& input2_shape, const int16* input2_data,
+                const RuntimeShape& output_shape, int8_t* output_data) {
+  gemmlowp::ScopedProfilingLabel label("Mul/Int16Int8");
+  int32 output_offset = params.output_offset;
+  int32 output_activation_min = params.quantized_activation_min;
+  int32 output_activation_max = params.quantized_activation_max;
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+
+  const int flat_size =
+      MatchingFlatSize(input1_shape, input2_shape, output_shape);
+
+  for (int i = 0; i < flat_size; i++) {
+    // F0 uses 0 integer bits, range [-1, 1].
+    using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+
+    F0 unclamped_result =
+        F0::FromRaw(input1_data[i]) * F0::FromRaw(input2_data[i]);
+    int16 rescaled_result =
+        gemmlowp::RoundingDivideByPOT(unclamped_result.raw(), 8);
+    int16 clamped_result =
+        std::min<int16>(output_activation_max - output_offset, rescaled_result);
+    clamped_result =
+        std::max<int16>(output_activation_min - output_offset, clamped_result);
+    output_data[i] = output_offset + clamped_result;
+  }
+}
+
+inline void BroadcastMul4DSlow(const ArithmeticParams& params,
+                               const RuntimeShape& input1_shape,
+                               const int8_t* input1_data,
+                               const RuntimeShape& input2_shape,
+                               const int8_t* input2_data,
+                               const RuntimeShape& output_shape,
+                               int8_t* output_data) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastMul4DSlow/8bit");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  // The input shapes are extended as part of NdArrayDesc initialization.
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+                                      &desc2);
+  const RuntimeShape extended_output_shape =
+      RuntimeShape::ExtendedShape(4, output_shape);
+
+  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
+    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
+      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
+        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
+          const int32 input1_val =
+              params.input1_offset +
+              input1_data[SubscriptToIndex(desc1, b, y, x, c)];
+          const int32 input2_val =
+              params.input2_offset +
+              input2_data[SubscriptToIndex(desc2, b, y, x, c)];
+          const int32 unclamped_result =
+              params.output_offset +
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  input1_val * input2_val, params.output_multiplier,
+                  params.output_shift);
+          const int32 clamped_output = std::min(
+              params.quantized_activation_max,
+              std::max(params.quantized_activation_min, unclamped_result));
+          output_data[Offset(extended_output_shape, b, y, x, c)] =
+              static_cast<int8_t>(clamped_output);
+        }
+      }
+    }
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MUL_H_
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h b/tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h
new file mode 100644
index 0000000000000000000000000000000000000000..2762bec8e6c3c8d69198456cbd16b04dc45ef2ab
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h
@@ -0,0 +1,141 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_POOLING_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_POOLING_H_
+
+#include <limits>
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+
+inline void AveragePool(const PoolParams& params,
+                        const RuntimeShape& input_shape, const int8* input_data,
+                        const RuntimeShape& output_shape, int8* output_data) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int depth = MatchingDim(input_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int stride_height = params.stride_height;
+  const int stride_width = params.stride_width;
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int channel = 0; channel < depth; ++channel) {
+          const int in_x_origin =
+              (out_x * stride_width) - params.padding_values.width;
+          const int in_y_origin =
+              (out_y * stride_height) - params.padding_values.height;
+          // Compute the boundaries of the filter region clamped so as to
+          // ensure that the filter window fits in the input array.
+          const int filter_x_start = std::max(0, -in_x_origin);
+          const int filter_x_end =
+              std::min(params.filter_width, input_width - in_x_origin);
+          const int filter_y_start = std::max(0, -in_y_origin);
+          const int filter_y_end =
+              std::min(params.filter_height, input_height - in_y_origin);
+          int32 acc = 0;
+          int filter_count = 0;
+          for (int filter_y = filter_y_start; filter_y < filter_y_end;
+               ++filter_y) {
+            for (int filter_x = filter_x_start; filter_x < filter_x_end;
+                 ++filter_x) {
+              const int in_x = in_x_origin + filter_x;
+              const int in_y = in_y_origin + filter_y;
+              acc +=
+                  input_data[Offset(input_shape, batch, in_y, in_x, channel)];
+              filter_count++;
+            }
+          }
+          // Round to the closest integer value.
+          acc = acc > 0 ? (acc + filter_count / 2) / filter_count
+                        : (acc - filter_count / 2) / filter_count;
+          acc = std::max(acc, params.quantized_activation_min);
+          acc = std::min(acc, params.quantized_activation_max);
+          output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
+              static_cast<int8>(acc);
+        }
+      }
+    }
+  }
+}
+
+inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
+                    const int8* input_data, const RuntimeShape& output_shape,
+                    int8* output_data) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+  TFLITE_DCHECK_GE(params.quantized_activation_min,
+                   std::numeric_limits<int8_t>::min());
+  TFLITE_DCHECK_LE(params.quantized_activation_max,
+                   std::numeric_limits<int8_t>::max());
+  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int depth = MatchingDim(input_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int stride_height = params.stride_height;
+  const int stride_width = params.stride_width;
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int channel = 0; channel < depth; ++channel) {
+          const int in_x_origin =
+              (out_x * stride_width) - params.padding_values.width;
+          const int in_y_origin =
+              (out_y * stride_height) - params.padding_values.height;
+          // Compute the boundaries of the filter region clamped so as to
+          // ensure that the filter window fits in the input array.
+          const int filter_x_start = std::max(0, -in_x_origin);
+          const int filter_x_end =
+              std::min(params.filter_width, input_width - in_x_origin);
+          const int filter_y_start = std::max(0, -in_y_origin);
+          const int filter_y_end =
+              std::min(params.filter_height, input_height - in_y_origin);
+          int8_t max = std::numeric_limits<int8_t>::lowest();
+          for (int filter_y = filter_y_start; filter_y < filter_y_end;
+               ++filter_y) {
+            for (int filter_x = filter_x_start; filter_x < filter_x_end;
+                 ++filter_x) {
+              const int in_x = in_x_origin + filter_x;
+              const int in_y = in_y_origin + filter_y;
+              max = std::max(
+                  max,
+                  input_data[Offset(input_shape, batch, in_y, in_x, channel)]);
+            }
+          }
+          max = std::max<int8_t>(max, params.quantized_activation_min);
+          max = std::min<int8_t>(max, params.quantized_activation_max);
+          output_data[Offset(output_shape, batch, out_y, out_x, channel)] =
+              static_cast<int8_t>(max);
+        }
+      }
+    }
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_POOLING_H_
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/softmax.h b/tensorflow/lite/kernels/internal/reference/integer_ops/softmax.h
new file mode 100644
index 0000000000000000000000000000000000000000..3f6bf1cb73e40b2bc396a59f5b47cefaea071d02
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/softmax.h
@@ -0,0 +1,102 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_SOFTMAX_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_SOFTMAX_H_
+
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+
+// Quantized softmax with int8 input and output.
+inline void Softmax(const SoftmaxParams& params,
+                    const RuntimeShape& input_shape, const int8* input_data,
+                    const RuntimeShape& output_shape, int8* output_data) {
+  const int32 input_beta_multiplier = params.input_multiplier;
+  const int32 input_beta_left_shift = params.input_left_shift;
+  const int diff_min = params.diff_min;
+  // The representation chosen for the input to the exp() function is Q5.26.
+  // We need to leave extra space since values that we skip might be as large as
+  // -32 before multiplying by input_beta_multiplier, and therefore as large as
+  // -16 afterwards.  Note that exp(-8) is definitely not insignificant to
+  // accumulation, but exp(-16) definitely is.
+  static const int kScaledDiffIntegerBits = 5;
+  static const int kAccumulationIntegerBits = 12;
+  using FixedPointScaledDiff =
+      gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>;
+  using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
+  using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
+
+  const int trailing_dim = input_shape.DimensionsCount() - 1;
+  const int outer_size =
+      MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+  const int depth =
+      MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+
+  for (int i = 0; i < outer_size; ++i) {
+    int8 max_in_row = -128;
+    for (int c = 0; c < depth; ++c) {
+      max_in_row = std::max(max_in_row, input_data[i * depth + c]);
+    }
+
+    FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
+    for (int c = 0; c < depth; ++c) {
+      int32 input_diff =
+          static_cast<int32>(input_data[i * depth + c]) - max_in_row;
+      if (input_diff >= diff_min) {
+        const int32 input_diff_rescaled =
+            MultiplyByQuantizedMultiplierGreaterThanOne(
+                input_diff, input_beta_multiplier, input_beta_left_shift);
+        const FixedPointScaledDiff scaled_diff_f8 =
+            FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+        sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
+                                        exp_on_negative_values(scaled_diff_f8));
+      }
+    }
+
+    int num_bits_over_unit;
+    FixedPoint0 shifted_scale = FixedPoint0::FromRaw(GetReciprocal(
+        sum_of_exps.raw(), kAccumulationIntegerBits, &num_bits_over_unit));
+
+    for (int c = 0; c < depth; ++c) {
+      int32 input_diff =
+          static_cast<int32>(input_data[i * depth + c]) - max_in_row;
+      if (input_diff >= diff_min) {
+        const int32 input_diff_rescaled =
+            MultiplyByQuantizedMultiplierGreaterThanOne(
+                input_diff, input_beta_multiplier, input_beta_left_shift);
+        const FixedPointScaledDiff scaled_diff_f8 =
+            FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+
+        FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
+        const int32 unsat_output = gemmlowp::RoundingDivideByPOT(
+            (shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - 8);
+        const int32 shifted_output = unsat_output - 128;
+
+        output_data[i * depth + c] = static_cast<int8>(
+            std::max(std::min(shifted_output, static_cast<int32>(127)),
+                     static_cast<int32>(-128)));
+
+      } else {
+        output_data[i * depth + c] = -128;
+      }
+    }
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_SOFTMAX_H_
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h b/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h
new file mode 100644
index 0000000000000000000000000000000000000000..081928bc88d9c59e15b5ed857daf4a144abe2ad7
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h
@@ -0,0 +1,63 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TANH_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TANH_H_
+
+#include <limits>
+#include "tensorflow/lite/kernels/internal/common.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+
+inline void Tanh(int32_t input_zero_point, int32_t input_range_radius,
+                 int32_t input_multiplier, int32_t input_shift,
+                 int32_t input_size, const int8_t* input_data,
+                 int8_t* output_data) {
+  // Integer bits must be in sync with Prepare() function.
+  static constexpr int32_t kInputIntegerBits = 4;
+  static constexpr int32_t kOutputScale = 7;
+  static constexpr int8_t kMinInt8 = std::numeric_limits<int8_t>::min();
+  static constexpr int8_t kMaxInt8 = std::numeric_limits<int8_t>::max();
+  using F4 = gemmlowp::FixedPoint<int32_t, kInputIntegerBits>;
+
+  for (int i = 0; i < input_size; ++i) {
+    const int32_t input =
+        static_cast<int32_t>(input_data[i]) - input_zero_point;
+    if (input <= -input_range_radius) {
+      output_data[i] = kMinInt8;
+    } else if (input >= input_range_radius) {
+      output_data[i] = kMaxInt8;
+    } else {
+      const int32_t input_in_q4 =
+          MultiplyByQuantizedMultiplier(input, input_multiplier, input_shift);
+      const int32_t output_in_q0 =
+          gemmlowp::tanh(F4::FromRaw(input_in_q4)).raw();
+
+      // Rescale and downcast.
+      using gemmlowp::RoundingDivideByPOT;
+      int32_t output_in_q24 =
+          RoundingDivideByPOT(output_in_q0, 31 - kOutputScale);
+      output_in_q24 =
+          std::min(std::max(output_in_q24, static_cast<int32_t>(kMinInt8)),
+                   static_cast<int32_t>(kMaxInt8));
+      output_data[i] = static_cast<int8_t>(output_in_q24);
+    }
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TANH_H_
diff --git a/tensorflow/lite/kernels/internal/reference/legacy_reference_ops.h b/tensorflow/lite/kernels/internal/reference/legacy_reference_ops.h
index 380fc8f98ebbdd90bb68144a46903640734bff08..390bf08e30300625471f8fe0bfceac21fc43756d 100644
--- a/tensorflow/lite/kernels/internal/reference/legacy_reference_ops.h
+++ b/tensorflow/lite/kernels/internal/reference/legacy_reference_ops.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h"
 #include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
 #include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/types.h"
 
 namespace tflite {
 
@@ -2033,7 +2034,16 @@ template <typename T1, typename T2, typename T3>
 void ArgMax(const T3* axis, const T1* input_data,
             const tflite::Dims<4>& input_dims, T2* output_data,
             const tflite::Dims<4>& output_dims) {
-  ArgMinMax(DimsToShape(input_dims), input_data, axis, DimsToShape(output_dims),
+  // Assumes the input always has 4 dimensions, and therefore,
+  // output always has three dimensions.
+  auto output_shape = RuntimeShape(
+      {output_dims.sizes[2], output_dims.sizes[1], output_dims.sizes[0]});
+  // Another way to interpret this is that output_dims.sizes[4] is always 1.
+  TFLITE_DCHECK_EQ(output_shape.FlatSize(),
+                   DimsToShape(output_dims).FlatSize());
+  // Legacy path only supported this.
+  TFLITE_DCHECK_EQ(axis[0], 3);
+  ArgMinMax(DimsToShape(input_dims), input_data, axis, output_shape,
             output_data, std::greater<T1>());
 }
 
diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc
index d692063a968dab654eaf46b9956ddcd338b64410..f5c4b78dc1429f45e477ecc9528e976aeda2ab1f 100644
--- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc
+++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/kernels/activation_functor.h"
+#include "tensorflow/lite/kernels/internal/compatibility.h"
 #include "tensorflow/lite/kernels/internal/round.h"
 #include "tensorflow/lite/kernels/op_macros.h"
 
@@ -101,7 +102,6 @@ void PortableMatrixBatchVectorMultiplyAccumulate(
       __builtin_prefetch(row_ptr, 0 /* prefetch for read */,
                          3 /* temporal locality */);
 #endif
-      // For every block of 16 8-bit elements (128-bit register) from each row.
       for (col = 0; col < m_cols; ++col, ++row_ptr) {
         dotprod += (*row_ptr) * (vectors[col]);
       }  // for col
@@ -110,6 +110,73 @@ void PortableMatrixBatchVectorMultiplyAccumulate(
   }    // for batch
 }
 
+void PortableSparseMatrixBatchVectorMultiplyAccumulate(
+    const float* matrix, const uint8_t* ledger, int m_rows, int m_cols,
+    const float* vector, int n_batch, float* result, int result_stride) {
+  const int kBlockSize = 16;
+  TFLITE_DCHECK_EQ(  // NOLINT
+      m_cols % kBlockSize, 0);
+  float* result_in_batch = result;
+  for (int b = 0; b < n_batch; b++) {
+    const float* matrix_ptr = matrix;
+    const uint8_t* ledger_ptr = ledger;
+    for (int r = 0; r < m_rows; r++) {
+      float dot_prod = 0.0f;
+      int num_nonzero_blocks = *ledger_ptr++;
+      if (num_nonzero_blocks > 0) {
+        const float* vector_in_batch = vector + b * m_cols;
+        for (int i = 0; i < num_nonzero_blocks; i++) {
+          const int block_start_index = *ledger_ptr++ * kBlockSize;
+          const float* vector_block_in_batch_ptr =
+              vector_in_batch + block_start_index;
+          for (int c = 0; c < kBlockSize; c++) {
+            dot_prod += *matrix_ptr++ * *vector_block_in_batch_ptr++;
+          }
+        }
+      }
+      *result_in_batch += dot_prod;
+      result_in_batch += result_stride;
+    }
+  }
+}
+
+void PortableSparseMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
+    const int m_cols, const int8_t* __restrict__ vectors,
+    const float* scaling_factors, int n_batch, float* __restrict__ result,
+    int result_stride) {
+  const int kBlockSize = 16;
+  TFLITE_DCHECK_EQ(  // NOLINT
+      m_cols % kBlockSize, 0);
+  int batch, row;
+  for (batch = 0; batch < n_batch; ++batch, vectors += m_cols) {
+    const float batch_scaling_factor = scaling_factors[batch];
+    // Get the address of the first row.
+    const int8_t* row_ptr = matrix;
+    const uint8_t* ledger_ptr = ledger;
+    for (row = 0; row < m_rows; ++row, result += result_stride) {
+      // Initialize the dot product sum for the row to 0.
+      int32_t dotprod = 0;
+#if defined(__GNUC__)
+      // Prefetch the row to cache.
+      __builtin_prefetch(row_ptr, 0 /* prefetch for read */,
+                         3 /* temporal locality */);
+#endif
+      int num_nonzero_blocks = *ledger_ptr++;
+      if (num_nonzero_blocks > 0) {
+        for (int i = 0; i < num_nonzero_blocks; i++) {
+          const int block_start_index = *ledger_ptr++ * kBlockSize;
+          const int8_t* vector_block_ptr = vectors + block_start_index;
+          for (int c = 0; c < kBlockSize; c++) {
+            dotprod += (*row_ptr++) * (*vector_block_ptr++);
+          }  // for block
+        }
+      }
+      *result += (dotprod * batch_scaling_factor);
+    }  // for row
+  }    // for batch
+}
+
 void PortableVectorVectorCwiseProduct(const float* vector1,
                                       const float* vector2, int v_size,
                                       float* result) {
diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h
index a06ebc1600d4fe47cf054b4e157bc21a5f70ddfc..49b59da0bbaf7aec6ba1b66b499df8d5426f5951 100644
--- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h
+++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h
@@ -48,6 +48,16 @@ void PortableMatrixBatchVectorMultiplyAccumulate(
     const int8_t* __restrict__ vectors, const float* scaling_factors,
     int n_batch, float* __restrict__ result, int result_stride);
 
+void PortableSparseMatrixBatchVectorMultiplyAccumulate(
+    const float* matrix, const uint8_t* ledger, int m_rows, int m_cols,
+    const float* vector, int n_batch, float* result, int result_stride);
+
+void PortableSparseMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
+    const int m_cols, const int8_t* __restrict__ vectors,
+    const float* scaling_factors, int n_batch, float* __restrict__ result,
+    int result_stride);
+
 // Cwise product of two vectors.
 void PortableVectorVectorCwiseProduct(const float* vector1,
                                       const float* vector2, int v_size,
@@ -165,6 +175,23 @@ void MatrixBatchVectorMultiplyAccumulate(
                                               result_stride);
 }
 
+void SparseMatrixBatchVectorMultiplyAccumulate(
+    const float* matrix, const uint8_t* ledger, int m_rows, int m_cols,
+    const float* vector, int n_batch, float* result, int result_stride) {
+  PortableSparseMatrixBatchVectorMultiplyAccumulate(
+      matrix, ledger, m_rows, m_cols, vector, n_batch, result, result_stride);
+}
+
+void SparseMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
+    const int m_cols, const int8_t* __restrict__ vectors,
+    const float* scaling_factors, int n_batch, float* __restrict__ result,
+    int result_stride) {
+  PortableSparseMatrixBatchVectorMultiplyAccumulate(
+      matrix, ledger, m_rows, m_cols, vectors, scaling_factors, n_batch, result,
+      result_stride);
+}
+
 void VectorVectorCwiseProduct(const float* vector1, const float* vector2,
                               int v_size, float* result) {
   PortableVectorVectorCwiseProduct(vector1, vector2, v_size, result);
diff --git a/tensorflow/lite/kernels/internal/reference/reference_ops.h b/tensorflow/lite/kernels/internal/reference/reference_ops.h
index ea3ab06da1f775b5ea0771bbb3f32c91c9caacd0..d2f5d987b9c2a481d807e9e11975b1f543cec678 100644
--- a/tensorflow/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/lite/kernels/internal/reference/reference_ops.h
@@ -36,68 +36,6 @@ limitations under the License.
 
 namespace tflite {
 
-// TODO(b/77858996): Add these to gemmlowp.
-template <typename IntegerType>
-IntegerType SaturatingAddNonGemmlowp(IntegerType a, IntegerType b) {
-  static_assert(std::is_same<IntegerType, void>::value, "unimplemented");
-  return a;
-}
-
-template <>
-inline std::int32_t SaturatingAddNonGemmlowp(std::int32_t a, std::int32_t b) {
-  std::int64_t a64 = a;
-  std::int64_t b64 = b;
-  std::int64_t sum = a64 + b64;
-  return static_cast<std::int32_t>(std::min(
-      static_cast<std::int64_t>(std::numeric_limits<std::int32_t>::max()),
-      std::max(
-          static_cast<std::int64_t>(std::numeric_limits<std::int32_t>::min()),
-          sum)));
-}
-
-template <typename tRawType, int tIntegerBits>
-gemmlowp::FixedPoint<tRawType, tIntegerBits> SaturatingAddNonGemmlowp(
-    gemmlowp::FixedPoint<tRawType, tIntegerBits> a,
-    gemmlowp::FixedPoint<tRawType, tIntegerBits> b) {
-  return gemmlowp::FixedPoint<tRawType, tIntegerBits>::FromRaw(
-      SaturatingAddNonGemmlowp(a.raw(), b.raw()));
-}
-
-template <typename IntegerType>
-IntegerType SaturatingSub(IntegerType a, IntegerType b) {
-  static_assert(std::is_same<IntegerType, void>::value, "unimplemented");
-  return a;
-}
-
-template <>
-inline std::int16_t SaturatingSub(std::int16_t a, std::int16_t b) {
-  std::int32_t a32 = a;
-  std::int32_t b32 = b;
-  std::int32_t diff = a32 - b32;
-  return static_cast<std::int16_t>(std::min(32767, std::max(-32768, diff)));
-}
-
-template <>
-inline std::int32_t SaturatingSub(std::int32_t a, std::int32_t b) {
-  std::int64_t a64 = a;
-  std::int64_t b64 = b;
-  std::int64_t diff = a64 - b64;
-  return static_cast<std::int32_t>(std::min(
-      static_cast<std::int64_t>(std::numeric_limits<std::int32_t>::max()),
-      std::max(
-          static_cast<std::int64_t>(std::numeric_limits<std::int32_t>::min()),
-          diff)));
-}
-
-template <typename tRawType, int tIntegerBits>
-gemmlowp::FixedPoint<tRawType, tIntegerBits> SaturatingSub(
-    gemmlowp::FixedPoint<tRawType, tIntegerBits> a,
-    gemmlowp::FixedPoint<tRawType, tIntegerBits> b) {
-  return gemmlowp::FixedPoint<tRawType, tIntegerBits>::FromRaw(
-      SaturatingSub(a.raw(), b.raw()));
-}
-// End section to be moved to gemmlowp.
-
 namespace reference_ops {
 
 // Return true for broadcast case, false otherwise.
@@ -192,59 +130,6 @@ inline bool ProcessBroadcastShapes(const RuntimeShape& shape0,
   return true;
 }
 
-template <typename T>
-int CountLeadingZeros(T integer_input) {
-  static_assert(std::is_unsigned<T>::value,
-                "Only unsigned integer types handled.");
-  if (integer_input == 0) {
-    return std::numeric_limits<T>::digits;
-  }
-  const T one_in_leading_positive = static_cast<T>(1)
-                                    << (std::numeric_limits<T>::digits - 1);
-  int leading_zeros = 0;
-  while (integer_input < one_in_leading_positive) {
-    integer_input <<= 1;
-    ++leading_zeros;
-  }
-  return leading_zeros;
-}
-
-template <typename IntegerType>
-IntegerType SaturatingRoundingMultiplyByPOTParam(IntegerType x, int exponent) {
-  if (exponent == 0) {
-    return x;
-  }
-  using ScalarIntegerType =
-      typename gemmlowp::FixedPointRawTypeTraits<IntegerType>::ScalarRawType;
-  const IntegerType min =
-      gemmlowp::Dup<IntegerType>(std::numeric_limits<ScalarIntegerType>::min());
-  const IntegerType max =
-      gemmlowp::Dup<IntegerType>(std::numeric_limits<ScalarIntegerType>::max());
-  const int ScalarIntegerTypeBits = 8 * sizeof(ScalarIntegerType);
-
-  const std::int32_t threshold =
-      ((1 << (ScalarIntegerTypeBits - 1 - exponent)) - 1);
-  const IntegerType positive_mask =
-      gemmlowp::MaskIfGreaterThan(x, gemmlowp::Dup<IntegerType>(threshold));
-  const IntegerType negative_mask =
-      gemmlowp::MaskIfLessThan(x, gemmlowp::Dup<IntegerType>(-threshold));
-
-  IntegerType result = gemmlowp::ShiftLeft(x, exponent);
-  result = gemmlowp::SelectUsingMask(positive_mask, max, result);
-  result = gemmlowp::SelectUsingMask(negative_mask, min, result);
-  return result;
-}
-
-// If we want to leave IntegerBits fixed, then multiplication
-// by a power of two has to be saturating/rounding, not exact anymore.
-template <typename tRawType, int tIntegerBits>
-gemmlowp::FixedPoint<tRawType, tIntegerBits>
-SaturatingRoundingMultiplyByPOTParam(
-    gemmlowp::FixedPoint<tRawType, tIntegerBits> a, int exponent) {
-  return gemmlowp::FixedPoint<tRawType, tIntegerBits>::FromRaw(
-      SaturatingRoundingMultiplyByPOTParam(a.raw(), exponent));
-}
-
 inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
                  const float* input_data, const RuntimeShape& filter_shape,
                  const float* filter_data, const RuntimeShape& bias_shape,
@@ -506,6 +391,15 @@ inline void SpaceToDepth(const tflite::SpaceToDepthParams& op_params,
   }
 }
 
+inline void Elu(const RuntimeShape& input_shape, const float* input_data,
+                const RuntimeShape& output_shape, float* output_data) {
+  const int flat_size = MatchingFlatSize(input_shape, output_shape);
+  for (int i = 0; i < flat_size; ++i) {
+    const float val = input_data[i];
+    output_data[i] = val < 0.0 ? std::exp(val) - 1 : val;
+  }
+}
+
 inline void Relu(const RuntimeShape& input_shape, const float* input_data,
                  const RuntimeShape& output_shape, float* output_data) {
   const int flat_size = MatchingFlatSize(input_shape, output_shape);
@@ -543,16 +437,17 @@ inline void Relu6(const RuntimeShape& input_shape, const float* input_data,
   }
 }
 
+template <typename T>
 inline void ReluX(const tflite::ActivationParams& params,
-                  const RuntimeShape& input_shape, const uint8* input_data,
-                  const RuntimeShape& output_shape, uint8* output_data) {
+                  const RuntimeShape& input_shape, const T* input_data,
+                  const RuntimeShape& output_shape, T* output_data) {
   gemmlowp::ScopedProfilingLabel label("Quantized ReluX (not fused)");
   const int flat_size = MatchingFlatSize(input_shape, output_shape);
-  const uint8 max_value = params.quantized_activation_max;
-  const uint8 min_value = params.quantized_activation_min;
+  const T max_value = params.quantized_activation_max;
+  const T min_value = params.quantized_activation_min;
   for (int i = 0; i < flat_size; ++i) {
-    const uint8 val = input_data[i];
-    const uint8 clamped =
+    const T val = input_data[i];
+    const T clamped =
         val > max_value ? max_value : val < min_value ? min_value : val;
     output_data[i] = clamped;
   }
@@ -702,6 +597,22 @@ inline void Add(const ArithmeticParams& params,
   }
 }
 
+// T is expected to be either float or int.
+template <typename T>
+inline void AddN(const RuntimeShape& input_shape, const size_t num_inputs,
+                 T* const* input_data, T* output_data) {
+  // All inputs and output should have the same shape, this is checked during
+  // Prepare stage.
+  const size_t size = input_shape.FlatSize();
+  for (int i = 0; i < size; ++i) {
+    T x = 0;
+    for (int j = 0; j < num_inputs; ++j) {
+      x += input_data[j][i];
+    }
+    output_data[i] = x;
+  }
+}
+
 // Element-wise add that can often be used for inner loop of broadcast add as
 // well as the non-broadcast add.
 inline void AddElementwise(int size, const ArithmeticParams& params,
@@ -735,6 +646,40 @@ inline void AddElementwise(int size, const ArithmeticParams& params,
   }
 }
 
+// Scalar-broadcast add that can be used for inner loop of more general
+// broadcast add, so that, for example, scalar-broadcast with batch will still
+// be fast.
+inline void AddScalarBroadcast(int size, const ArithmeticParams& params,
+                               uint8 input1_data, const uint8* input2_data,
+                               uint8* output_data) {
+  TFLITE_DCHECK_GT(params.input1_offset, -256);
+  TFLITE_DCHECK_GT(params.input2_offset, -256);
+  TFLITE_DCHECK_LT(params.input1_offset, 256);
+  TFLITE_DCHECK_LT(params.input2_offset, 256);
+
+  const int32 input1_val = params.input1_offset + input1_data;
+  const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
+  const int32 scaled_input1_val =
+      MultiplyByQuantizedMultiplierSmallerThanOneExp(
+          shifted_input1_val, params.input1_multiplier, params.input1_shift);
+  for (int i = 0; i < size; ++i) {
+    const int32 input2_val = params.input2_offset + input2_data[i];
+    const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
+    const int32 scaled_input2_val =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            shifted_input2_val, params.input2_multiplier, params.input2_shift);
+    const int32 raw_sum = scaled_input1_val + scaled_input2_val;
+    const int32 raw_output =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            raw_sum, params.output_multiplier, params.output_shift) +
+        params.output_offset;
+    const int32 clamped_output =
+        std::min(params.quantized_activation_max,
+                 std::max(params.quantized_activation_min, raw_output));
+    output_data[i] = static_cast<uint8>(clamped_output);
+  }
+}
+
 inline void Add(const ArithmeticParams& params,
                 const RuntimeShape& input1_shape, const uint8* input1_data,
                 const RuntimeShape& input2_shape, const uint8* input2_data,
@@ -975,26 +920,63 @@ inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
   uint8* output_data_ptr = output_data;
   const uint8* input1_data_ptr = input1_data;
   const uint8* input2_data_reset = input2_data;
+  // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared
+  // between input shapes. y3 for input 1 is always broadcast, and so the
+  // dimension there is 1, whereas optionally y1 might be broadcast for input 2.
+  // Put another way,
+  // input1.shape.FlatSize = y0 * y1 * y2 * y4,
+  // input2.shape.FlatSize = y0 * y2 * y3 * y4.
   int y0 = params.broadcast_shape[0];
   int y1 = params.broadcast_shape[1];
   int y2 = params.broadcast_shape[2];
   int y3 = params.broadcast_shape[3];
   int y4 = params.broadcast_shape[4];
-  for (int i0 = 0; i0 < y0; ++i0) {
-    const uint8* input2_data_ptr;
-    for (int i1 = 0; i1 < y1; ++i1) {
-      input2_data_ptr = input2_data_reset;
-      for (int i2 = 0; i2 < y2; ++i2) {
-        for (int i3 = 0; i3 < y3; ++i3) {
-          AddElementwise(y4, params, input1_data_ptr, input2_data_ptr,
-                         output_data_ptr);
-          input2_data_ptr += y4;
-          output_data_ptr += y4;
+  if (y4 > 1) {
+    // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner
+    // dimension.
+    for (int i0 = 0; i0 < y0; ++i0) {
+      const uint8* input2_data_ptr;
+      for (int i1 = 0; i1 < y1; ++i1) {
+        input2_data_ptr = input2_data_reset;
+        for (int i2 = 0; i2 < y2; ++i2) {
+          for (int i3 = 0; i3 < y3; ++i3) {
+            AddElementwise(y4, params, input1_data_ptr, input2_data_ptr,
+                           output_data_ptr);
+            input2_data_ptr += y4;
+            output_data_ptr += y4;
+          }
+          // We have broadcast y4 of input1 data y3 times, and now move on.
+          input1_data_ptr += y4;
         }
-        input1_data_ptr += y4;
       }
+      // We have broadcast y2*y3*y4 of input2 data y1 times, and now move on.
+      input2_data_reset = input2_data_ptr;
+    }
+  } else {
+    // Special case of y4 == 1, in which the innermost loop is a single element
+    // and can be combined with the next (y3) as an inner broadcast.
+    //
+    // Note that this handles the case of pure scalar broadcast when
+    // y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar
+    // broadcast with batch (as y2 > 1).
+    //
+    // NOTE The process is the same as the above general case except simplified
+    // for y4 == 1 and the loop over y3 is contained within the
+    // AddScalarBroadcast function.
+    for (int i0 = 0; i0 < y0; ++i0) {
+      const uint8* input2_data_ptr;
+      for (int i1 = 0; i1 < y1; ++i1) {
+        input2_data_ptr = input2_data_reset;
+        for (int i2 = 0; i2 < y2; ++i2) {
+          AddScalarBroadcast(y3, params, *input1_data_ptr, input2_data_ptr,
+                             output_data_ptr);
+          input2_data_ptr += y3;
+          output_data_ptr += y3;
+          input1_data_ptr += 1;
+        }
+      }
+      input2_data_reset = input2_data_ptr;
     }
-    input2_data_reset = input2_data_ptr;
   }
 }
 
@@ -1591,6 +1573,7 @@ inline void SubWithActivation(const ArithmeticParams& params,
                               const int32* input2_data,
                               const RuntimeShape& output_shape,
                               int32* output_data) {
+  gemmlowp::ScopedProfilingLabel label("SubWithActivation");
   const int flat_size =
       MatchingFlatSize(input1_shape, input2_shape, input2_shape);
   for (int i = 0; i < flat_size; ++i) {
@@ -1616,12 +1599,61 @@ inline void SubWithActivation(const ArithmeticParams& params,
   }
 }
 
+inline void Sub16(const ArithmeticParams& params,
+                  const RuntimeShape& input1_shape, const int16_t* input1_data,
+                  const RuntimeShape& input2_shape, const int16_t* input2_data,
+                  const RuntimeShape& output_shape, int16_t* output_data) {
+  gemmlowp::ScopedProfilingLabel label("Sub/Int16");
+  const int input1_shift = params.input1_shift;
+  const int flat_size =
+      MatchingFlatSize(output_shape, input1_shape, input2_shape);
+  const int16 output_activation_min = params.quantized_activation_min;
+  const int16 output_activation_max = params.quantized_activation_max;
+
+  TFLITE_DCHECK(input1_shift == 0 || params.input2_shift == 0);
+  TFLITE_DCHECK_LE(input1_shift, 0);
+  TFLITE_DCHECK_LE(params.input2_shift, 0);
+  const int16* not_shift_input = input1_shift == 0 ? input1_data : input2_data;
+  const int16* shift_input = input1_shift == 0 ? input2_data : input1_data;
+  const int input_right_shift =
+      input1_shift == 0 ? -params.input2_shift : -input1_shift;
+
+  if (input1_shift == 0) {
+    // F0 uses 0 integer bits, range [-1, 1].
+    using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+    for (int i = 0; i < flat_size; ++i) {
+      F0 input_ready_scaled = F0::FromRaw(not_shift_input[i]);
+      F0 scaled_input = F0::FromRaw(
+          gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift));
+      F0 result = SaturatingSub(input_ready_scaled, scaled_input);
+      const int16 raw_output = result.raw();
+      const int16 clamped_output = std::min(
+          output_activation_max, std::max(output_activation_min, raw_output));
+      output_data[i] = clamped_output;
+    }
+  } else {
+    // F0 uses 0 integer bits, range [-1, 1].
+    using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+    for (int i = 0; i < flat_size; ++i) {
+      F0 input_ready_scaled = F0::FromRaw(not_shift_input[i]);
+      F0 scaled_input = F0::FromRaw(
+          gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift));
+      F0 result = SaturatingSub(scaled_input, input_ready_scaled);
+      const int16 raw_output = result.raw();
+      const int16 clamped_output = std::min(
+          output_activation_max, std::max(output_activation_min, raw_output));
+      output_data[i] = clamped_output;
+    }
+  }
+}
+
 template <typename Scalar>
 inline void Concatenation(const ConcatenationParams& params,
                           const RuntimeShape* const* input_shapes,
                           const Scalar* const* input_data,
                           const RuntimeShape& output_shape,
                           Scalar* output_data) {
+  gemmlowp::ScopedProfilingLabel label("Concatenation");
   int axis = params.axis;
   int inputs_count = params.inputs_count;
   const int concat_dimensions = output_shape.DimensionsCount();
@@ -1669,6 +1701,7 @@ inline void ConcatenationWithScaling(const ConcatenationParams& params,
                                      const uint8* const* input_data,
                                      const RuntimeShape& output_shape,
                                      uint8* output_data) {
+  gemmlowp::ScopedProfilingLabel label("ConcatenationWithScaling/Uint8");
   int axis = params.axis;
   const int32* input_zeropoint = params.input_zeropoint;
   const float* input_scale = params.input_scale;
@@ -1730,6 +1763,7 @@ template <typename Scalar>
 void Pack(const PackParams& params, const RuntimeShape* const* input_shapes,
           const Scalar* const* input_data, const RuntimeShape& output_shape,
           Scalar* output_data) {
+  gemmlowp::ScopedProfilingLabel label("Pack");
   const int dimensions = output_shape.DimensionsCount();
   int axis = params.axis;
   int inputs_count = params.inputs_count;
@@ -1757,15 +1791,22 @@ template <typename Scalar>
 void Unpack(const UnpackParams& params, const RuntimeShape& input_shape,
             const Scalar* input_data, const RuntimeShape& output_shape,
             Scalar* const* output_datas) {
+  gemmlowp::ScopedProfilingLabel label("Unpack");
   const int dimensions = input_shape.DimensionsCount();
   const int outputs_count = params.num_split;
 
   int outer_size = 1;
-  for (int i = 0; i < params.axis; i++) {
+  int axis = params.axis;
+  if (axis < 0) {
+    axis += dimensions;
+  }
+  TFLITE_DCHECK_GE(axis, 0);
+  TFLITE_DCHECK_LT(axis, dimensions);
+  for (int i = 0; i < axis; ++i) {
     outer_size *= input_shape.Dims(i);
   }
   int copy_size = 1;
-  for (int i = params.axis + 1; i < dimensions; i++) {
+  for (int i = axis + 1; i < dimensions; ++i) {
     copy_size *= input_shape.Dims(i);
   }
   TFLITE_DCHECK_EQ(output_shape.FlatSize(), copy_size * outer_size);
@@ -1784,6 +1825,7 @@ void PackWithScaling(const PackParams& params,
                      const RuntimeShape* const* input_shapes,
                      const uint8* const* input_data,
                      const RuntimeShape& output_shape, uint8* output_data) {
+  gemmlowp::ScopedProfilingLabel label("PackWithScaling");
   const int dimensions = output_shape.DimensionsCount();
   int axis = params.axis;
   const int32* input_zeropoint = params.input_zeropoint;
@@ -1833,6 +1875,7 @@ void DepthConcatenation(const ConcatenationParams& params,
                         const RuntimeShape* const* input_shapes,
                         const Scalar* const* input_data,
                         const RuntimeShape& output_shape, Scalar* output_data) {
+  gemmlowp::ScopedProfilingLabel label("DepthConcatenation");
   auto params_copy = params;
   params_copy.axis = 3;
   Concatenation(params_copy, input_shapes, input_data, output_shape,
@@ -2234,6 +2277,7 @@ template <typename Scalar>
 void Split(const SplitParams& params, const RuntimeShape& input_shape,
            const Scalar* input_data, const RuntimeShape* const* output_shapes,
            Scalar* const* output_data) {
+  gemmlowp::ScopedProfilingLabel label("Split");
   const int concat_dimensions = input_shape.DimensionsCount();
   int axis = params.axis < 0 ? params.axis + concat_dimensions : params.axis;
   int outputs_count = params.num_split;
@@ -2602,124 +2646,10 @@ inline void LogSoftmax(const SoftmaxParams& params,
   }
 }
 
-// Although currently the name of this function says that it cannot handle
-// values less than 1, in practice it can handle as low as 1/x_max, where
-// x_max is the largest representable input.  In other words, the output range
-// is symmetric.
-template <int OutputIntegerBits, int InputIntegerBits>
-inline gemmlowp::FixedPoint<int32, OutputIntegerBits>
-log_x_for_x_greater_than_or_equal_to_1_impl(
-    gemmlowp::FixedPoint<int32, InputIntegerBits> input_val) {
-  using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
-  // The reason for accumulating the result with an extra bit of headroom is
-  // that z_pow_2_adj * log_2 might be saturated, and adding num_scaled *
-  // recip_denom will otherwise introduce an error.
-  static constexpr int kAccumIntegerBits = OutputIntegerBits + 1;
-  using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumIntegerBits>;
-
-  const FixedPoint0 log_2 = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 1488522236, std::log(2.0));
-  const FixedPoint0 sqrt_sqrt_half = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 1805811301, std::sqrt(std::sqrt(0.5)));
-  const FixedPoint0 sqrt_half = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 1518500250, std::sqrt(0.5));
-  const FixedPoint0 one_quarter =
-      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(FixedPoint0, 536870912, 1.0 / 4.0);
-
-  const FixedPoint0 alpha_n = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 117049297, 11.0 / 240.0 * std::sqrt(std::sqrt(2.0)));
-  const FixedPoint0 alpha_d = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 127690142, 1.0 / 20.0 * std::sqrt(std::sqrt(2.0)));
-  const FixedPoint0 alpha_i = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 1057819769,
-      2.0 / std::sqrt(std::sqrt(2.0)) - std::sqrt(std::sqrt(2.0)));
-  const FixedPoint0 alpha_f = GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(
-      FixedPoint0, 638450708, 1.0 / 4.0 * std::sqrt(std::sqrt(2.0)));
-
-  const FixedPointAccum shifted_quarter =
-      gemmlowp::Rescale<kAccumIntegerBits>(one_quarter);
-
-  // Reinterpret the input value as Q0.31, because we will figure out the
-  // required shift "ourselves" instead of using, say, Rescale.
-  FixedPoint0 z_a = FixedPoint0::FromRaw(input_val.raw());
-  // z_a_pow_2 = input_integer_bits - z_a_headroom;
-  int z_a_headroom_plus_1 = CountLeadingZeros(static_cast<uint32>(z_a.raw()));
-  FixedPoint0 r_a_tmp =
-      SaturatingRoundingMultiplyByPOTParam(z_a, (z_a_headroom_plus_1 - 1));
-  const int32 r_a_raw =
-      SaturatingRoundingMultiplyByPOTParam((r_a_tmp * sqrt_half).raw(), 1);
-  // z_pow_2_adj = max(z_pow_2_a - 0.75, z_pow_2_b - 0.25);
-  // z_pow_2_adj = max(InputIntegerBits - z_a_headroom_plus_1 + 0.25,
-  //                   InputIntegerBits - z_b_headroom - 0.25);
-  const FixedPointAccum z_a_pow_2_adj = SaturatingAddNonGemmlowp(
-      FixedPointAccum::FromRaw(SaturatingRoundingMultiplyByPOTParam(
-          InputIntegerBits - z_a_headroom_plus_1, 31 - kAccumIntegerBits)),
-      shifted_quarter);
-
-  // z_b is treated like z_a, but premultiplying by sqrt(0.5).
-  FixedPoint0 z_b = z_a * sqrt_half;
-  int z_b_headroom = CountLeadingZeros(static_cast<uint32>(z_b.raw())) - 1;
-  const int32 r_b_raw =
-      SaturatingRoundingMultiplyByPOTParam(z_a.raw(), z_b_headroom);
-  const FixedPointAccum z_b_pow_2_adj = SaturatingSub(
-      FixedPointAccum::FromRaw(SaturatingRoundingMultiplyByPOTParam(
-          InputIntegerBits - z_b_headroom, 31 - kAccumIntegerBits)),
-      shifted_quarter);
-
-  const FixedPoint0 r = FixedPoint0::FromRaw(std::min(r_a_raw, r_b_raw));
-  const FixedPointAccum z_pow_2_adj = FixedPointAccum::FromRaw(
-      std::max(z_a_pow_2_adj.raw(), z_b_pow_2_adj.raw()));
-
-  const FixedPoint0 p = gemmlowp::RoundingHalfSum(r, sqrt_sqrt_half);
-  FixedPoint0 q = r - sqrt_sqrt_half;
-  q = q + q;
-
-  const FixedPoint0 common_sq = q * q;
-  const FixedPoint0 num = q * r + q * common_sq * alpha_n;
-  const FixedPoint0 denom_minus_one_0 =
-      p * (alpha_i + q + alpha_d * common_sq) + alpha_f * q;
-  const FixedPoint0 recip_denom =
-      one_over_one_plus_x_for_x_in_0_1(denom_minus_one_0);
-
-  const FixedPointAccum num_scaled = gemmlowp::Rescale<kAccumIntegerBits>(num);
-  return gemmlowp::Rescale<OutputIntegerBits>(z_pow_2_adj * log_2 +
-                                              num_scaled * recip_denom);
-}
-
-// Minimum output bits to accommodate log of maximum input range.  It actually
-// does not matter if one considers, say, [-64,64] or [-64,64).
-//
-// For example, run this through Octave:
-// [0:127; ...
-//  ceil(log(abs( log(2.^(0:127))+1 ))/log(2)); ...
-//  ceil(log(abs( log(2.^(0:127))+1 ))/log(2))]
-constexpr int min_log_x_output_bits(int input_bits) {
-  return input_bits > 90
-             ? 7
-             : input_bits > 44
-                   ? 6
-                   : input_bits > 21
-                         ? 5
-                         : input_bits > 10
-                               ? 4
-                               : input_bits > 4 ? 3 : input_bits > 1 ? 2 : 1;
-}
-
-template <int OutputIntegerBits, int InputIntegerBits>
-inline gemmlowp::FixedPoint<int32, OutputIntegerBits>
-log_x_for_x_greater_than_or_equal_to_1(
-    gemmlowp::FixedPoint<int32, InputIntegerBits> input_val) {
-  static_assert(
-      OutputIntegerBits >= min_log_x_output_bits(InputIntegerBits),
-      "Output integer bits must be sufficent to accommodate logs of inputs.");
-  return log_x_for_x_greater_than_or_equal_to_1_impl<OutputIntegerBits,
-                                                     InputIntegerBits>(
-      input_val);
-}
-
 inline void LogSoftmax(const SoftmaxParams& params,
                        const RuntimeShape& input_shape, const uint8* input_data,
                        const RuntimeShape& output_shape, uint8* output_data) {
+  gemmlowp::ScopedProfilingLabel label("LogSoftmax/8bit");
   const int32 input_multiplier = params.input_multiplier;
   const int32 input_left_shift = params.input_left_shift;
   const int32 reverse_scaling_divisor = params.reverse_scaling_divisor;
@@ -2985,6 +2915,7 @@ inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape,
 inline void Dequantize(const tflite::DequantizationParams& op_params,
                        const RuntimeShape& input_shape, const uint8* input_data,
                        const RuntimeShape& output_shape, float* output_data) {
+  gemmlowp::ScopedProfilingLabel label("Dequantize");
   int32 zero_point = op_params.zero_point;
   double scale = op_params.scale;
   const int flat_size = MatchingFlatSize(input_shape, output_shape);
@@ -2999,6 +2930,7 @@ inline void Dequantize(const tflite::DequantizationParams& op_params,
 inline void FakeQuant(const tflite::FakeQuantParams& op_params,
                       const RuntimeShape& input_shape, const float* input_data,
                       const RuntimeShape& output_shape, float* output_data) {
+  gemmlowp::ScopedProfilingLabel label("FakeQuant");
   float rmin = op_params.minmax.min;
   float rmax = op_params.minmax.max;
   int num_bits = op_params.num_bits;
@@ -3040,11 +2972,22 @@ inline void Floor(const RuntimeShape& input_shape, const float* input_data,
   }
 }
 
+inline void Ceil(const RuntimeShape& input_shape, const float* input_data,
+                 const RuntimeShape& output_shape, float* output_data) {
+  const int flat_size = MatchingFlatSize(input_shape, output_shape);
+
+  for (int i = 0; i < flat_size; i++) {
+    int offset = i;
+    output_data[offset] = std::ceil(input_data[offset]);
+  }
+}
+
 template <typename T, typename CoordsT = int32>
 inline void Gather(const tflite::GatherParams& op_params,
                    const RuntimeShape& input_shape, const T* input_data,
                    const RuntimeShape& coords_shape, const CoordsT* coords_data,
                    const RuntimeShape& output_shape, T* output_data) {
+  gemmlowp::ScopedProfilingLabel label("Gather");
   int axis = op_params.axis;
   if (axis < 0) {
     axis += input_shape.DimensionsCount();
@@ -3076,6 +3019,43 @@ inline void Gather(const tflite::GatherParams& op_params,
   }
 }
 
+template <typename ParamsT, typename IndicesT = int32>
+inline void GatherNd(const RuntimeShape& params_shape,
+                     const ParamsT* params_data,
+                     const RuntimeShape& indices_shape,
+                     const IndicesT* indices_data,
+                     const RuntimeShape& output_shape, ParamsT* output_data) {
+  gemmlowp::ScopedProfilingLabel label("GatherNd");
+
+  int n_slices = 1;
+  int slice_size = 1;
+  const int indices_dims = indices_shape.DimensionsCount();
+  const int indices_nd = indices_shape.Dims(indices_dims - 1);
+  const int params_dims = params_shape.DimensionsCount();
+  for (int i = 0; i < indices_dims - 1; ++i) {
+    n_slices *= indices_shape.Dims(i);
+  }
+  for (int i = indices_nd; i < params_dims; ++i) {
+    slice_size *= params_shape.Dims(i);
+  }
+
+  int remain_flat_size = params_shape.FlatSize();
+  std::vector<int> dims_to_count(indices_nd, 0);
+  for (int i = 0; i < indices_nd; ++i) {
+    dims_to_count[i] = remain_flat_size / params_shape.Dims(i);
+    remain_flat_size = dims_to_count[i];
+  }
+
+  for (int i = 0; i < n_slices; ++i) {
+    int from_pos = 0;
+    for (int j = 0; j < indices_nd; ++j) {
+      from_pos += indices_data[i * indices_nd + j] * dims_to_count[j];
+    }
+    std::memcpy(output_data + i * slice_size, params_data + from_pos,
+                sizeof(ParamsT) * slice_size);
+  }
+}
+
 template <typename T>
 inline void ResizeBilinear(const tflite::ResizeBilinearParams& op_params,
                            const RuntimeShape& unextended_input_shape,
@@ -3148,6 +3128,7 @@ inline void SpaceToBatchND(
     const RuntimeShape& unextended_input2_shape, const int32* block_shape_data,
     const RuntimeShape& unextended_input3_shape, const int32* paddings_data,
     const RuntimeShape& unextended_output_shape, T* output_data) {
+  gemmlowp::ScopedProfilingLabel label("SpaceToBatchND");
   TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
   TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
   const RuntimeShape input1_shape =
@@ -3205,6 +3186,7 @@ inline void BatchToSpaceND(
     const RuntimeShape& unextended_input2_shape, const int32* block_shape_data,
     const RuntimeShape& unextended_input3_shape, const int32* crops_data,
     const RuntimeShape& unextended_output_shape, T* output_data) {
+  gemmlowp::ScopedProfilingLabel label("BatchToSpaceND");
   TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
   TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
   const RuntimeShape input1_shape =
@@ -3478,6 +3460,7 @@ inline void Slice(const tflite::SliceParams& op_params,
 template <typename T>
 inline void Exp(const T* input_data, const size_t num_elements,
                 T* output_data) {
+  gemmlowp::ScopedProfilingLabel label("Exp");
   for (size_t idx = 0; idx < num_elements; ++idx) {
     output_data[idx] = exp(input_data[idx]);
   }
@@ -3608,6 +3591,7 @@ inline bool Mean(const T* input_data, const int* input_dims,
                  const int* output_dims, const int output_num_dims,
                  const int* axis, const int num_axis_dimensions, bool keep_dims,
                  int* temp_index, int* resolved_axis, U* temp_sum) {
+  gemmlowp::ScopedProfilingLabel label("Mean");
   // Reset output data.
   size_t num_outputs = 1;
   for (int idx = 0; idx < output_num_dims; ++idx) {
@@ -3661,7 +3645,7 @@ inline void Mean(const tflite::MeanParams& op_params,
                  const RuntimeShape& unextended_input_shape,
                  const T* input_data,
                  const RuntimeShape& unextended_output_shape, T* output_data) {
-  gemmlowp::ScopedProfilingLabel label("Mean");
+  gemmlowp::ScopedProfilingLabel label("Mean4D");
 
   // Current implementation only supports dimension equals 4 and simultaneous
   // reduction over width and height.
@@ -3700,6 +3684,65 @@ inline void Mean(const tflite::MeanParams& op_params,
   }
 }
 
+inline void Mean(const tflite::MeanParams& op_params,
+                 const RuntimeShape& unextended_input_shape,
+                 const uint8_t* input_data, int32 input_zero_point,
+                 float input_scale, const RuntimeShape& unextended_output_shape,
+                 uint8_t* output_data, int32 output_zero_point,
+                 float output_scale) {
+  gemmlowp::ScopedProfilingLabel label("Mean4D/Uint8");
+
+  // Current implementation only supports dimension equals 4 and simultaneous
+  // reduction over width and height.
+  TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4);
+  TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4);
+  const RuntimeShape input_shape =
+      RuntimeShape::ExtendedShape(4, unextended_input_shape);
+  const RuntimeShape output_shape =
+      RuntimeShape::ExtendedShape(4, unextended_output_shape);
+  const int output_batch = output_shape.Dims(0);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  const int output_depth = output_shape.Dims(3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const float num_elements_in_axis = input_width * input_height;
+
+  TFLITE_DCHECK_EQ(op_params.axis_count, 2);
+  TFLITE_DCHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
+                (op_params.axis[0] == 2 && op_params.axis[1] == 1));
+  TFLITE_DCHECK_EQ(output_height, 1);
+  TFLITE_DCHECK_EQ(output_width, 1);
+
+  const bool ordinary_mean =
+      (input_zero_point == output_zero_point && input_scale == output_scale);
+  float scale, bias;
+  if (!ordinary_mean) {
+    scale = input_scale / output_scale;
+    bias = -input_zero_point * scale + 0.5;
+  }
+  for (int out_b = 0; out_b < output_batch; ++out_b) {
+    for (int out_d = 0; out_d < output_depth; ++out_d) {
+      float temp_value = 0;
+      for (int in_h = 0; in_h < input_height; ++in_h) {
+        for (int in_w = 0; in_w < input_width; ++in_w) {
+          temp_value +=
+              input_data[Offset(input_shape, out_b, in_h, in_w, out_d)];
+        }
+      }
+      temp_value = temp_value / num_elements_in_axis;
+      if (ordinary_mean) {
+        output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
+            static_cast<uint8_t>(round(temp_value));
+      } else {
+        output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
+            static_cast<uint8_t>(round(temp_value * scale + bias)) +
+            output_zero_point;
+      }
+    }
+  }
+}
+
 // Computes the mean of elements across dimensions given in axis.
 // It does so in two stages, first calculates the sum of elements along the axis
 // then divides it by the number of element in axis for quantized values.
@@ -3713,6 +3756,8 @@ inline bool QuantizedMeanOrSum(const T* input_data, int32 input_zero_point,
                                const int num_axis_dimensions, bool keep_dims,
                                int* temp_index, int* resolved_axis, U* temp_sum,
                                bool compute_sum) {
+  gemmlowp::ScopedProfilingLabel label(compute_sum ? "Sum/Uint8"
+                                                   : "Mean/Uint8");
   // Reset output data.
   size_t num_outputs = 1;
   for (int idx = 0; idx < output_num_dims; ++idx) {
@@ -3828,6 +3873,7 @@ void MaximumMinimumBroadcast4DSlow(const RuntimeShape& unextended_input1_shape,
                                    const T* input2_data,
                                    const RuntimeShape& unextended_output_shape,
                                    T* output_data, Op op) {
+  gemmlowp::ScopedProfilingLabel label("MaximumMinimumBroadcast4DSlow");
   TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
   TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
   TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
@@ -3859,11 +3905,9 @@ template <typename T1, typename T2, typename T3, typename Cmp>
 void ArgMinMax(const RuntimeShape& input1_shape, const T1* input1_data,
                const T3* input2_data, const RuntimeShape& output_shape,
                T2* output_data, const Cmp& cmp) {
-  // For ArgMax, the number of output dimensions = (number of input dimensions -
-  // 1). For the sake of simplicity, the output dimensions are equal to the
-  // input dimensions here. We enforce the constraint that the axis dimension
-  // must always be 1.
-  TFLITE_DCHECK_EQ(input1_shape.DimensionsCount(),
+  gemmlowp::ScopedProfilingLabel label("ArgMinMax");
+  TFLITE_DCHECK_GT(input1_shape.DimensionsCount(), 0);
+  TFLITE_DCHECK_EQ(input1_shape.DimensionsCount() - 1,
                    output_shape.DimensionsCount());
 
   int axis = input2_data[0];
@@ -3872,7 +3916,6 @@ void ArgMinMax(const RuntimeShape& input1_shape, const T1* input1_data,
   }
 
   const int axis_size = input1_shape.Dims(axis);
-  TFLITE_DCHECK_EQ(output_shape.Dims(axis), 1);
 
   int outer_size = 1;
   for (int i = 0; i < axis; ++i) {
@@ -3883,7 +3926,7 @@ void ArgMinMax(const RuntimeShape& input1_shape, const T1* input1_data,
   int inner_size = 1;
   const int dims_count = input1_shape.DimensionsCount();
   for (int i = axis + 1; i < dims_count; ++i) {
-    TFLITE_DCHECK_EQ(input1_shape.Dims(i), output_shape.Dims(i));
+    TFLITE_DCHECK_EQ(input1_shape.Dims(i), output_shape.Dims(i - 1));
     inner_size *= input1_shape.Dims(i);
   }
 
@@ -4328,6 +4371,34 @@ void RankOneSelect(const RuntimeShape& input_condition_shape,
   }
 }
 
+template <typename D, typename T>
+void SelectTrueCoords(const RuntimeShape& input_condition_shape,
+                      const D* input_condition_data, T* output_data) {
+  const size_t size = input_condition_shape.FlatSize();
+  const size_t cond_rank = input_condition_shape.DimensionsCount();
+
+  std::vector<int> dims_to_count(cond_rank, 0);
+  int cur_flat_size = size;
+  for (int i = 0; i < cond_rank; ++i) {
+    dims_to_count[i] = cur_flat_size / input_condition_shape.Dims(i);
+    cur_flat_size = dims_to_count[i];
+  }
+
+  int output_index = 0;
+  for (int i = 0; i < size; ++i) {
+    if (input_condition_data[i]) {
+      // Insert the coordinate of the current item (row major) into output.
+      int flat_index = i;
+      for (int j = 0; j < cond_rank; ++j) {
+        int coord_j = flat_index / dims_to_count[j];
+        output_data[output_index * cond_rank + j] = coord_j;
+        flat_index %= dims_to_count[j];
+      }
+      output_index++;
+    }
+  }
+}
+
 // For easy implementation, the indices is always a vector of size-4 vectors.
 template <typename T, typename TI>
 inline void SparseToDense(const std::vector<std::vector<TI>>& indices,
@@ -4623,6 +4694,33 @@ void Fill(const RuntimeShape& value_shape, const T* value_data,
   }
 }
 
+template <typename Scalar>
+void Reverse(int axis, const RuntimeShape& input_shape,
+             const Scalar* input_data, const RuntimeShape& output_shape,
+             Scalar* output_data) {
+  gemmlowp::ScopedProfilingLabel label("Reverse");
+
+  int outer_size = 1;
+  for (int i = 0; i < axis; ++i) {
+    outer_size *= input_shape.Dims(i);
+  }
+
+  int copy_size = 1;
+  for (int i = axis + 1; i < input_shape.DimensionsCount(); ++i) {
+    copy_size *= input_shape.Dims(i);
+  }
+
+  const int dims_at_axis = input_shape.Dims(axis);
+  for (int i = 0; i < outer_size; ++i) {
+    for (int j = 0; j < dims_at_axis; ++j) {
+      const int start_pos = (i * dims_at_axis + j) * copy_size;
+      Scalar* output_ptr = output_data + start_pos;
+      int loc = (i * dims_at_axis + dims_at_axis - j - 1) * copy_size;
+      memcpy(output_ptr, input_data + loc, copy_size * sizeof(Scalar));
+    }
+  }
+}
+
 }  // namespace reference_ops
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/internal/reference/softmax.h b/tensorflow/lite/kernels/internal/reference/softmax.h
index 51de6b51aa5308b69dd5b9ad6bf29cd18c0550ba..45a18cdb47f64b4a8f5f0c7cd53cb9b13956b151 100644
--- a/tensorflow/lite/kernels/internal/reference/softmax.h
+++ b/tensorflow/lite/kernels/internal/reference/softmax.h
@@ -102,19 +102,9 @@ inline void Softmax(const SoftmaxParams& params,
       }
     }
 
-    int32 fixed_sum_of_exps = sum_of_exps.raw();
-    int headroom_plus_one =
-        CountLeadingZeros(static_cast<uint32>(fixed_sum_of_exps));
-    // This is the number of bits to the left of the binary point above 1.0.
-    // Consider fixed_sum_of_exps=1.25.  In that case shifted_scale=0.8 and
-    // no later adjustment will be needed.
-    int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one;
-    int32 shifted_sum_minus_one = static_cast<int32>(
-        (static_cast<uint32>(fixed_sum_of_exps) << headroom_plus_one) -
-        (static_cast<uint32>(1) << 31));
-
-    FixedPoint0 shifted_scale = gemmlowp::one_over_one_plus_x_for_x_in_0_1(
-        FixedPoint0::FromRaw(shifted_sum_minus_one));
+    int num_bits_over_unit;
+    FixedPoint0 shifted_scale = FixedPoint0::FromRaw(GetReciprocal(
+        sum_of_exps.raw(), kAccumulationIntegerBits, &num_bits_over_unit));
 
     for (int c = 0; c < depth; ++c) {
       int32 input_diff =
diff --git a/tensorflow/lite/kernels/internal/resize_bilinear_test.cc b/tensorflow/lite/kernels/internal/resize_bilinear_test.cc
index 1c5ac1992f0f649ca47e2a5bc81ea332abc46bf5..4a19b69a7c9dfc70192d446f922052606c516365 100644
--- a/tensorflow/lite/kernels/internal/resize_bilinear_test.cc
+++ b/tensorflow/lite/kernels/internal/resize_bilinear_test.cc
@@ -76,6 +76,7 @@ void TestOneResizeBilinear(int batch, int depth, int input_width,
 }
 
 TEST(ResizeBilinear, TestResizeBilinear8Bit) {
+  RandomEngine().seed(38291);
   const int kTestsToRun = 100 * 1000;
   for (int i = 0; i < kTestsToRun; i++) {
     const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20);
@@ -91,6 +92,7 @@ TEST(ResizeBilinear, TestResizeBilinear8Bit) {
 }
 
 TEST(ResizeBilinear2x2, TestResizeBilinear8Bit) {
+  RandomEngine().seed(38291);
   const int kTestsToRun = 100 * 1000;
   for (int i = 0; i < kTestsToRun; i++) {
     const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20);
@@ -106,6 +108,7 @@ TEST(ResizeBilinear2x2, TestResizeBilinear8Bit) {
 }
 
 TEST(ResizeBilinear, TestResizeBilinear) {
+  RandomEngine().seed(38291);
   const int kTestsToRun = 100 * 1000;
   for (int i = 0; i < kTestsToRun; i++) {
     const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20);
@@ -121,6 +124,7 @@ TEST(ResizeBilinear, TestResizeBilinear) {
 }
 
 TEST(ResizeBilinear2x2, TestResizeBilinear) {
+  RandomEngine().seed(38291);
   const int kTestsToRun = 100 * 1000;
   for (int i = 0; i < kTestsToRun; i++) {
     const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20);
diff --git a/tensorflow/lite/kernels/internal/round.h b/tensorflow/lite/kernels/internal/round.h
index cb494bfd5374d90bac0c8f444e186f137f45a91f..135deced448afa63468bb018705e61bd03694a25 100644
--- a/tensorflow/lite/kernels/internal/round.h
+++ b/tensorflow/lite/kernels/internal/round.h
@@ -21,7 +21,8 @@ namespace tflite {
 
 // TODO(aselle): See if we can do this only on jdk. Also mikecase, check
 // if you need this for java host build.
-#if defined(__ANDROID__) && !defined(__NDK_MAJOR__)
+#if defined(TF_LITE_USE_GLOBAL_ROUND) || \
+    (defined(__ANDROID__) && !defined(__NDK_MAJOR__))
 template <class T>
 inline float TfLiteRound(const float x) {
   return ::round(x);
diff --git a/tensorflow/lite/kernels/internal/softmax_quantized_test.cc b/tensorflow/lite/kernels/internal/softmax_quantized_test.cc
index 743ce0355c96fd2766fd2315299c2419703f11b7..8ac62d9af787b2846a0f2031a3c9bcd9f2ab44d7 100644
--- a/tensorflow/lite/kernels/internal/softmax_quantized_test.cc
+++ b/tensorflow/lite/kernels/internal/softmax_quantized_test.cc
@@ -210,7 +210,7 @@ bool TryOneSkyscraperSoftmax(bool small_depth) {
 }
 
 TEST(TestQuantizedSoftmax, UniformSoftmaxTests) {
-  const int kTestsToRun = 1000;
+  const int kTestsToRun = 100;
   for (int i = 0; i < kTestsToRun; i++) {
     while (!TryOneUniformSoftmax()) {
     }
@@ -218,7 +218,7 @@ TEST(TestQuantizedSoftmax, UniformSoftmaxTests) {
 }
 
 TEST(TestQuantizedSoftmax, SkyscraperSoftmaxTests) {
-  const int kTestsToRun = 1000;
+  const int kTestsToRun = 100;
   for (int i = 0; i < kTestsToRun; i++) {
     while (!TryOneSkyscraperSoftmax(false)) {
     }
@@ -226,7 +226,7 @@ TEST(TestQuantizedSoftmax, SkyscraperSoftmaxTests) {
 }
 
 TEST(TestQuantizedSoftmax, SmallSkyscraperSoftmaxTests) {
-  const int kTestsToRun = 1000;
+  const int kTestsToRun = 100;
   for (int i = 0; i < kTestsToRun; i++) {
     while (!TryOneSkyscraperSoftmax(true)) {
     }
diff --git a/tensorflow/lite/kernels/internal/tensor_utils.h b/tensorflow/lite/kernels/internal/tensor_utils.h
index 71ae69522f9a45745a9ed9eae211db3d048ba43d..4f18f283b6094c66fb89080115d359ffce776dd8 100644
--- a/tensorflow/lite/kernels/internal/tensor_utils.h
+++ b/tensorflow/lite/kernels/internal/tensor_utils.h
@@ -55,6 +55,21 @@ void MatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
                                          int n_batch, float* result,
                                          int result_stride);
 
+// Same as the function above, but the matrix is stored in block compressed
+// sparse row format with block pattern 1x16 which consists of two arrays:
+//   1. A matrix array stores non-zero blocks of the matrix in row major.
+//   2. A ledger array stores nrows groups, one group per row. Each group starts
+//   with
+//      an integer representing the number of non-zero blocks for the
+//      corresponding row and follows with column indexes of the first element
+//      of each non-zero block.
+// This function assumes that
+//   1. m_cols is a multiple of 16 so that all blocks are full blocks.
+//   2. m_cols < 254 * 16 so that block index can be represented by uint8.
+void SparseMatrixBatchVectorMultiplyAccumulate(
+    const float* matrix, const uint8_t* ledger, int m_rows, int m_cols,
+    const float* vector, int n_batch, float* result, int result_stride);
+
 // Same as the function above, but for values quantized using symmetric
 // quantization (e.g. by calling SymmetricQuantizeFloats).
 // The passed scaling factors is a buffer of the quantization scaling factors
@@ -67,6 +82,23 @@ void MatrixBatchVectorMultiplyAccumulate(
     const int8_t* __restrict__ vectors, const float* scaling_factors,
     int n_batch, float* __restrict__ result, int result_stride);
 
+// Same as the function above, but the matrix is stored in block compressed
+// sparse row format with block pattern 1x16 which consists of two arrays:
+//   1. A matrix array stores non-zero blocks of the matrix in row major.
+//   2. A ledger array stores nrows groups, one group per row. Each group starts
+//   with
+//      an integer representing the number of non-zero blocks for the
+//      corresponding row followed by column index of the first element of
+//      each non-zero block.
+// This function assumes that
+//   1. m_cols is a multiple of 16 so that all blocks are full blocks.
+//   2. m_cols < 254 * 16 so that block index can be represented by uint8.
+void SparseMatrixBatchVectorMultiplyAccumulate(
+    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
+    const int m_cols, const int8_t* __restrict__ vectors,
+    const float* scaling_factors, int n_batch, float* __restrict__ result,
+    int result_stride);
+
 // Cwise product of two vectors.
 void VectorVectorCwiseProduct(const float* vector1, const float* vector2,
                               int v_size, float* result);
diff --git a/tensorflow/lite/kernels/internal/tensor_utils_test.cc b/tensorflow/lite/kernels/internal/tensor_utils_test.cc
index 29866d066406e58e06e6caa2e5b410460564c966..3ba4af7c468421cbc8d559e3f8777854ba2fc53b 100644
--- a/tensorflow/lite/kernels/internal/tensor_utils_test.cc
+++ b/tensorflow/lite/kernels/internal/tensor_utils_test.cc
@@ -149,6 +149,7 @@ TEST(uKernels, MatrixBatchVectorMultiplyAccumulateSymmetricQuantizedTest) {
   // 16-block SIMD code, the 8-block postamble, and the leftover postamble.
   const int a_rows = 4, a_cols = 29;
   const int kWeightsPerUint32 = 4;
+  /* clang-format off */
   const float a_float_data[] = {
       /* 1st row */
       1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9, 10.1, 11.11, 12.12, 13.13,
@@ -174,126 +175,18 @@ TEST(uKernels, MatrixBatchVectorMultiplyAccumulateSymmetricQuantizedTest) {
   SymmetricQuantizeFloats(a_float_data, a_rows * a_cols, a_int8_data, &a_min,
                           &a_max, &scaling_factor_a);
   const int8_t expected_a_int8_data[] = {
-      /* 1st row */
-      5,
-      10,
-      15,
-      20,
-      25,
-      30,
-      35,
-      40,
-      44,
-      45,
-      50,
-      54,
-      59,
-      64,
-      68,
-      73,
-      77,
-      82,
-      86,
-      91,
-      95,
-      100,
-      104,
-      109,
-      113,
-      118,
-      122,
-      127,
-      0,
-      /* 2nd row */
-      -5,
-      -10,
-      -15,
-      -20,
-      -25,
-      -30,
-      -35,
-      -40,
-      -44,
-      -45,
-      -50,
-      -54,
-      -59,
-      -64,
-      -68,
-      -73,
-      -77,
-      -82,
-      -86,
-      -91,
-      -95,
-      -100,
-      -104,
-      -109,
-      -113,
-      -118,
-      -122,
-      -127,
-      0,
-      /* 3rd row */
-      5,
-      -10,
-      15,
-      -20,
-      25,
-      -30,
-      35,
-      -40,
-      44,
-      -45,
-      50,
-      -54,
-      59,
-      -64,
-      68,
-      -73,
-      77,
-      -82,
-      86,
-      -91,
-      95,
-      -100,
-      104,
-      -109,
-      113,
-      -118,
-      122,
-      -127,
-      0,
-      /* 4th row */
-      -5,
-      10,
-      -15,
-      20,
-      -25,
-      30,
-      -35,
-      40,
-      -44,
-      45,
-      -50,
-      54,
-      -59,
-      64,
-      -68,
-      73,
-      -77,
-      82,
-      -86,
-      91,
-      -95,
-      100,
-      -104,
-      109,
-      -113,
-      118,
-      -122,
-      127,
-      0,
+    /* 1st row */
+    5, 10, 15, 20, 25, 30, 35, 40, 44, 45, 50, 54, 59, 64, 68, 73, 77, 82, 86,
+    91, 95, 100, 104, 109, 113, 118, 122, 127, 0,
+    /* 2nd row */
+    -5, -10, -15, -20, -25, -30, -35, -40, -44, -45, -50, -54, -59, -64, -68,
+    -73, -77, -82, -86, -91, -95, -100, -104, -109, -113, -118, -122, -127, 0,
+    /* 3rd row */
+    5, -10, 15, -20, 25, -30, 35, -40, 44, -45, 50, -54, 59, -64, 68, -73, 77,
+    -82, 86, -91, 95, -100, 104, -109, 113, -118, 122, -127, 0,
+    /* 4th row */
+    -5, 10, -15, 20, -25, 30, -35, 40, -44, 45, -50, 54, -59, 64, -68, 73, -77,
+    82, -86, 91, -95, 100, -104, 109, -113, 118, -122, 127, 0,
   };
   for (int i = 0; i < a_rows * a_cols; ++i) {
     EXPECT_EQ(expected_a_int8_data[i], a_int8_data[i]);
@@ -301,66 +194,14 @@ TEST(uKernels, MatrixBatchVectorMultiplyAccumulateSymmetricQuantizedTest) {
 
   const int b_rows = 29, b_cols = 1, batches = 2;
   const float b_float_data[] = {
-      /* batch 1 */
-      1.0,
-      -1.0,
-      1.0,
-      -1.0,
-      1.0,
-      -1.0,
-      1.0,
-      -1.0,
-      1.0,
-      -1.0,
-      1.0,
-      -1.0,
-      1.0,
-      -1.0,
-      1.0,
-      -1.0,
-      1.0,
-      -1.0,
-      1.0,
-      -1.0,
-      1.0,
-      -1.0,
-      1.0,
-      -1.0,
-      1.0,
-      -1.0,
-      1.0,
-      -1.0,
-      1.0,
-      /* batch 2 */
-      2.5,
-      -2.1,
-      3.0,
-      -1.3,
-      1.3,
-      -1.1,
-      2.0,
-      -1.7,
-      1.9,
-      -1.5,
-      0.5,
-      -0.7,
-      0.8,
-      -0.3,
-      2.8,
-      -2.8,
-      1.1,
-      -2.3,
-      1.9,
-      -1.9,
-      2.1,
-      -0.5,
-      2.4,
-      -0.1,
-      1.0,
-      -2.5,
-      0.7,
-      -1.9,
-      0.2,
+    /* batch 1 */
+    1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
+    1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
+    1.0,
+    /* batch 2 */
+    2.5, -2.1, 3.0, -1.3, 1.3, -1.1, 2.0, -1.7, 1.9, -1.5, 0.5, -0.7, 0.8, -0.3,
+    2.8, -2.8, 1.1, -2.3, 1.9, -1.9, 2.1, -0.5, 2.4, -0.1, 1.0, -2.5, 0.7, -1.9,
+    0.2,
   };
 
   // Quantized values of B:
@@ -374,67 +215,15 @@ TEST(uKernels, MatrixBatchVectorMultiplyAccumulateSymmetricQuantizedTest) {
                           &scaling_factor_b[1]);
 
   const int8_t expected_b_int8_data[] = {
-      /* batch 1 */
-      127,
-      -127,
-      127,
-      -127,
-      127,
-      -127,
-      127,
-      -127,
-      127,
-      -127,
-      127,
-      -127,
-      127,
-      -127,
-      127,
-      -127,
-      127,
-      -127,
-      127,
-      -127,
-      127,
-      -127,
-      127,
-      -127,
-      127,
-      -127,
-      127,
-      -127,
-      127,
-      /* batch 2 */
-      106,
-      -89,
-      127,
-      -55,
-      55,
-      -47,
-      85,
-      -72,
-      80,
-      -64,
-      21,
-      -30,
-      34,
-      -13,
-      119,
-      -119,
-      47,
-      -97,
-      80,
-      -80,
-      89,
-      -21,
-      102,
-      -4,
-      42,
-      -106,
-      30,
-      -80,
-      8,
+    /* batch 1 */
+    127, -127, 127, -127, 127, -127, 127, -127, 127, -127, 127, -127, 127, -127,
+    127, -127, 127, -127, 127, -127, 127, -127, 127, -127, 127, -127, 127, -127,
+    127,
+    /* batch 2 */
+    106, -89, 127, -55, 55, -47, 85, -72, 80, -64, 21, -30, 34, -13, 119, -119,
+    47, -97, 80, -80, 89, -21, 102, -4, 42, -106, 30, -80, 8,
   };
+  /* clang-format on */
   for (int i = 0; i < b_rows * b_cols * batches; ++i) {
     EXPECT_EQ(expected_b_int8_data[i], b_int8_data[i]);
   }
@@ -468,6 +257,161 @@ TEST(uKernels, MatrixBatchVectorMultiplyAccumulateSymmetricQuantizedTest) {
 }
 #endif  // __ANDROID__
 
+TEST(uKernels, SparseMatrixBatchVectorMultiplyAccumulateTest) {
+  const int kRow = 4;
+  const int kCol = 48;
+  const int kBatch = 2;
+  /* clang-format off */
+  float matrix[kRow * kCol] = {
+      /* 1st row */
+      1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9, 10.1, 11.11, 12.12, 13.13,
+      14.14, 15.15, 16.16, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 33.33, 34.34, 35.35, 36.36, 37.37, 38.38,
+      39.39, 40.40, 41.41, 42.42, 43.43, 44.44, 0, 0, 0, 0,
+      /* 2nd row */
+      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+      0.0, -17.17, -18.18, -19.19, -20.2, -21.21, -22.22, -23.23, -24.24,
+      -25.25, -26.26, -27.27, -28.28, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0, 0,
+      /* 3rd row */
+      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+      0.0, 17.17, -18.18, 19.19, -20.2, 21.21, -22.22, 23.23, -24.24, 25.25,
+      -26.26, 27.27, -28.28, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0, 0,
+      /* 4th row */
+      -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, -9.9, 10.1, -11.11, 12.12,
+      -13.13, 14.14, -15.15, 16.16, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -33.33, 34.34, -35.35, 36.36, -37.37,
+      38.38, -39.39, 40.40, -41.41, 42.42, -43.43, 44.44, 0, 0, 0, 0};
+
+  // BCSR format of the above matrix.
+  float matrix_values[] = {
+      /* 1st row */
+      1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9, 10.1, 11.11, 12.12, 13.13,
+      14.14, 15.15, 16.16, 33.33, 34.34, 35.35, 36.36, 37.37, 38.38, 39.39,
+      40.40, 41.41, 42.42, 43.43, 44.44, 0, 0, 0, 0,
+      /* 2nd row */
+      -17.17, -18.18, -19.19, -20.2, -21.21, -22.22, -23.23, -24.24, -25.25,
+      -26.26, -27.27, -28.28, 0, 0.0, 0.0, 0.0,
+      /* 3rd row */
+      17.17, -18.18, 19.19, -20.2, 21.21, -22.22, 23.23, -24.24, 25.25, -26.26,
+      27.27, -28.28, 0, 0.0, 0.0, 0.0,
+      /* 4th row */
+      -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8, -9.9, 10.1, -11.11, 12.12,
+      -13.13, 14.14, -15.15, 16.16, -33.33, 34.34, -35.35, 36.36, -37.37, 38.38,
+      -39.39, 40.40, -41.41, 42.42, -43.43, 44.44, 0, 0, 0, 0};
+  uint8_t ledger[] = {
+      2, 0,  2,  // 1st row
+      1, 1,      // 2nd row
+      1, 1,      // 3rd row
+      2, 0,  2   // 4th row
+  };
+
+  float vector[kBatch * kCol] = {
+    /* 1st batch */
+    1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
+    1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
+    1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
+    1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
+    /* 2nd batch */
+    2.5, 0.0, -2.1, 0.0, 3.0, 0.0, -1.3, 0.0, 1.3, 0.0, -1.1, 0.0, 2.0, 0.0,
+    -1.7, 0.0, 1.9, 0.0, -1.5, 0.0, 0.5, 0.0, -0.7, 0.0, 0.8, 0.0, -0.3, 0.0,
+    2.8, 0.0, -2.8, 0.0, 1.1, -2.3, 1.9, -1.9, 2.1, -0.5, 2.4, -0.1, 1.0, -2.5,
+    0.7, -1.9, 0.2, 0.0, 0.1, 0.2,
+  };
+  /* clang-format on */
+
+  std::vector<float> dense_output(kRow * kBatch, 0.0);
+  MatrixBatchVectorMultiplyAccumulate(matrix, kRow, kCol, vector, kBatch,
+                                      dense_output.data(), /*result_stride=*/1);
+
+  std::vector<float> sparse_output(kRow * kBatch, 0.0);
+  SparseMatrixBatchVectorMultiplyAccumulate(
+      matrix_values, ledger, kRow, kCol, vector, kBatch, sparse_output.data(),
+      /*result_stride=*/1);
+
+  EXPECT_THAT(sparse_output,
+              ElementsAreArray(ArrayFloatNear(dense_output, 1e-4)));
+}
+
+#ifdef __ANDROID__
+TEST(uKernels,
+     SparseMatrixBatchVectorMultiplyAccumulateSymmetricQuantizedTest) {
+  const int kRow = 4;
+  const int kCol = 48;
+  const int kBatch = 2;
+  /* clang-format off */
+  const int8_t quantized_matrix[] = {
+      /* 1st row */
+      3, 6, 9, 13, 16, 19, 22, 25, 28, 29, 32, 35, 38, 40, 43, 46, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 95, 98, 101, 104, 107, 110, 113, 115,
+      118, 121, 124, 127, 0, 0, 0, 0,
+      /* 2nd row */
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -49, -52, -55, -58, -61,
+      -64, -66, -69, -72, -75, -78, -81, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0,
+      /* 3rd row */
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 49, -52, 55, -58, 61, -64,
+      66, -69, 72, -75, 78, -81, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0,
+      /* 4th row */
+      -3, 6, -9, 13, -16, 19, -22, 25, -28, 29, -32, 35, -38, 40, -43, 46, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -95, 98, -101, 104, -107, 110,
+      -113, 115, -118, 121, -124, 127, 0, 0, 0, 0,
+  };
+  const int8_t quantized_matrix_values[] = {
+      /* 1st row */
+      3, 6, 9, 13, 16, 19, 22, 25, 28, 29, 32, 35, 38, 40, 43, 46, 95, 98, 101,
+      104, 107, 110, 113, 115, 118, 121, 124, 127, 0, 0, 0, 0,
+      /* 2nd row */
+      -49, -52, -55, -58, -61, -64, -66, -69, -72, -75, -78, -81, 0, 0, 0, 0,
+      /* 3rd row */
+      49, -52, 55, -58, 61, -64, 66, -69, 72, -75, 78, -81, 0, 0, 0, 0,
+      /* 4th row */
+      -3, 6, -9, 13, -16, 19, -22, 25, -28, 29, -32, 35, -38, 40, -43, 46, -95,
+      98, -101, 104, -107, 110, -113, 115, -118, 121, -124, 127, 0, 0, 0, 0,
+  };
+  uint8_t ledger[] = {
+      2, 0,  2,  // 1st row
+      1, 1,      // 2nd row
+      1, 1,      // 3rd row
+      2, 0,  2   // 4th row
+  };
+
+  float matrix_scaling_factor = 0.349921;
+
+  const int8_t quantized_vector[] = {
+      /* 1st batch */
+      127, -127, 127, -127, 127, -127, 127, -127, 127, -127, 127, -127, 127,
+      -127, 127, -127, 127, -127, 127, -127, 127, -127, 127, -127, 127, -127,
+      127, -127, 127, -127, 127, -127, 127, -127, 127, -127, 127, -127, 127,
+      -127, 127, -127, 127, -127, 127, -127, 127, -127,
+      /* 2nd batch */
+      106, 0, -89, 0, 127, 0, -55, 0, 55, 0, -47, 0, 85, 0, -72, 0, 80, 0,
+      -64, 0, 21, 0, -30, 0, 34, 0, -13, 0, 119, 0, -119, 0, 47, -97, 80, -80,
+      89, -21, 102, -4, 42, -106, 30, -80, 8, 1, 2, 3,
+  };
+  float vector_scaling_factor[2] = {0.00787402, 0.023622};
+
+  /* clang-format on */
+  float result_scaling_factor[2] = {
+      matrix_scaling_factor * vector_scaling_factor[0],
+      matrix_scaling_factor * vector_scaling_factor[1],
+  };
+  std::vector<float> dense_output(kRow * kBatch, 0.0);
+  MatrixBatchVectorMultiplyAccumulate(quantized_matrix, kRow, kCol,
+                                      quantized_vector, result_scaling_factor,
+                                      kBatch, dense_output.data(),
+                                      /*result_stride=*/1);
+  std::vector<float> sparse_output(kRow * kBatch, 0.0);
+  SparseMatrixBatchVectorMultiplyAccumulate(
+      quantized_matrix_values, ledger, kRow, kCol, quantized_vector,
+      result_scaling_factor, kBatch, sparse_output.data(),
+      /*result_stride=*/1);
+  EXPECT_THAT(sparse_output, ElementsAreArray(ArrayFloatNear(dense_output)));
+}
+#endif  // __ANDROID__
+
 TEST(uKernels, VectorVectorCwiseProductTest) {
   constexpr int kVectorSize = 10;
   static float input1[kVectorSize] = {0.0,  -0.5, 1.0,  -1.5, 2.0,
diff --git a/tensorflow/lite/kernels/kernel_util.cc b/tensorflow/lite/kernels/kernel_util.cc
index e39890e3320eb4d1e2dcd0c8256bb96631e75011..d6984e9ec22958a4bb7d299aaa20bd39bd36604d 100644
--- a/tensorflow/lite/kernels/kernel_util.cc
+++ b/tensorflow/lite/kernels/kernel_util.cc
@@ -18,10 +18,78 @@ limitations under the License.
 #include <cmath>
 #include <memory>
 
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
 #include "tensorflow/lite/kernels/internal/round.h"
 
 namespace tflite {
 
+TfLiteStatus PopulateConvolutionQuantizationParams(
+    TfLiteContext* context, const TfLiteTensor* input,
+    const TfLiteTensor* filter, const TfLiteTensor* bias, TfLiteTensor* output,
+    const TfLiteFusedActivation& activation, int32_t* multiplier, int* shift,
+    int32_t* output_activation_min, int32_t* output_activation_max,
+    int32_t* per_channel_multiplier, int* per_channel_shift) {
+  TF_LITE_ENSURE_EQ(context, input->quantization.type,
+                    kTfLiteAffineQuantization);
+  TF_LITE_ENSURE_EQ(context, filter->quantization.type,
+                    kTfLiteAffineQuantization);
+  // TODO(jianlijianli): Enable bias type check and bias scale == input scale
+  // * filter scale for each channel in affine quantization once bias
+  // quantization is properly populated.
+  // TF_LITE_ENSURE_EQ(context, bias->quantization.type,
+  // kTfLiteAffineQuantization);
+
+  // Check data type.
+  const auto* affine_quantization =
+      reinterpret_cast<TfLiteAffineQuantization*>(filter->quantization.params);
+  TF_LITE_ENSURE(context, affine_quantization);
+  TF_LITE_ENSURE(context, affine_quantization->scale);
+  const bool is_per_channel = affine_quantization->scale->size > 1;
+  if (is_per_channel) {
+    //  Currently only Int8 is supported for per channel quantization.
+    TF_LITE_ENSURE_EQ(context, input->type, kTfLiteInt8);
+    TF_LITE_ENSURE_EQ(context, filter->type, kTfLiteInt8);
+    TF_LITE_ENSURE_EQ(
+        context, affine_quantization->scale->size,
+        filter->dims->data[affine_quantization->quantized_dimension]);
+  }
+
+  // Populate multiplier and shift using affine quantization.
+  const int num_channels = affine_quantization->scale->size;
+  const float input_scale = input->params.scale;
+  const float output_scale = output->params.scale;
+  const float* filter_scales = affine_quantization->scale->data;
+  for (int i = 0; i < num_channels; ++i) {
+    const double filter_scale = static_cast<double>(filter_scales[i]);
+    const double effective_output_scale = static_cast<double>(input_scale) *
+                                          filter_scale /
+                                          static_cast<double>(output_scale);
+    int32_t significand;
+    int shift;
+    QuantizeMultiplier(effective_output_scale, &significand, &shift);
+    per_channel_multiplier[i] = significand;
+    per_channel_shift[i] = shift;
+  }
+
+  // Populate scalar quantization parameters.
+  // This check on legacy quantization parameters is kept only for backward
+  // compatibility.
+  if (input->type == kTfLiteUInt8) {
+    // Check bias scale == input scale * filter scale.
+    double real_multiplier = 0.0;
+    TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
+        context, input, filter, bias, output, &real_multiplier));
+    int exponent;
+
+    // Populate quantization parameteters with multiplier and shift.
+    QuantizeMultiplier(real_multiplier, multiplier, &exponent);
+    *shift = -exponent;
+    CalculateActivationRangeUint8(activation, output, output_activation_min,
+                                  output_activation_max);
+  }
+  return kTfLiteOk;
+}
+
 TfLiteStatus GetQuantizedConvolutionMultipler(TfLiteContext* context,
                                               const TfLiteTensor* input,
                                               const TfLiteTensor* filter,
@@ -81,6 +149,9 @@ TfLiteStatus CalculateActivationRangeQuantized(TfLiteContext* context,
   if (output->type == kTfLiteUInt8) {
     qmin = std::numeric_limits<uint8_t>::min();
     qmax = std::numeric_limits<uint8_t>::max();
+  } else if (output->type == kTfLiteInt8) {
+    qmin = std::numeric_limits<int8_t>::min();
+    qmax = std::numeric_limits<int8_t>::max();
   } else if (output->type == kTfLiteInt16) {
     qmin = std::numeric_limits<int16_t>::min();
     qmax = std::numeric_limits<int16_t>::max();
@@ -103,6 +174,16 @@ void CalculateActivationRangeUint8(TfLiteFusedActivation activation,
                                         act_max);
 }
 
+void CalculateActivationRangeInt8(TfLiteFusedActivation activation,
+                                  TfLiteTensor* output, int32_t* act_min,
+                                  int32_t* act_max) {
+  const int32_t qmin = std::numeric_limits<int8_t>::min();
+  const int32_t qmax = std::numeric_limits<int8_t>::max();
+
+  CalculateActivationRangeQuantizedImpl(activation, qmin, qmax, output, act_min,
+                                        act_max);
+}
+
 bool HaveSameShapes(const TfLiteTensor* input1, const TfLiteTensor* input2) {
   return TfLiteIntArrayEqual(input1->dims, input2->dims);
 }
diff --git a/tensorflow/lite/kernels/kernel_util.h b/tensorflow/lite/kernels/kernel_util.h
index 3cc00588d63feddc90d17997cebe2c8d063c45eb..57ff65fcea0237081e1e848f40d7850abf5569d6 100644
--- a/tensorflow/lite/kernels/kernel_util.h
+++ b/tensorflow/lite/kernels/kernel_util.h
@@ -84,6 +84,14 @@ inline void SetTensorToDynamic(TfLiteTensor* tensor) {
   }
 }
 
+// Check dimensionality match and populate OpData for Conv and DepthwiseConv.
+TfLiteStatus PopulateConvolutionQuantizationParams(
+    TfLiteContext* context, const TfLiteTensor* input,
+    const TfLiteTensor* filter, const TfLiteTensor* bias, TfLiteTensor* output,
+    const TfLiteFusedActivation& activation, int32_t* multiplier, int* shift,
+    int32_t* output_activation_min, int32_t* output_activation_max,
+    int32_t* per_channel_multiplier, int* per_channel_shift);
+
 // Calculates the multiplication factor for a quantized convolution (or
 // quantized depthwise convolution) involving the given tensors. Returns an
 // error if the scales of the tensors are not compatible.
@@ -104,6 +112,9 @@ TfLiteStatus CalculateActivationRangeQuantized(TfLiteContext* context,
 void CalculateActivationRangeUint8(TfLiteFusedActivation activation,
                                    TfLiteTensor* output, int32_t* act_min,
                                    int32_t* act_max);
+void CalculateActivationRangeInt8(TfLiteFusedActivation activation,
+                                  TfLiteTensor* output, int32_t* act_min,
+                                  int32_t* act_max);
 // Calculates the useful range of an activation layer given its activation
 // tensor.a
 template <typename T>
diff --git a/tensorflow/lite/kernels/kernel_util_test.cc b/tensorflow/lite/kernels/kernel_util_test.cc
index 70eb18365891097686d579bde4a5457703e84aee..4e792542a19eafbe8703e5a5472a7a9851080ef7 100644
--- a/tensorflow/lite/kernels/kernel_util_test.cc
+++ b/tensorflow/lite/kernels/kernel_util_test.cc
@@ -28,6 +28,8 @@ class KernelUtilTest : public ::testing::Test {
   KernelUtilTest() {
     context_.ReportError = ReportError;
 
+    memset(&tensor1_, 0, sizeof(TfLiteTensor));
+    memset(&tensor2_, 0, sizeof(TfLiteTensor));
     tensor1_.dims = nullptr;
     tensor2_.dims = nullptr;
     tensor1_.allocation_type = kTfLiteMmapRo;
@@ -142,6 +144,113 @@ TEST_F(KernelUtilTest, BroadcastShapeDifferentSizes) {
   TfLiteIntArrayFree(output);
 }
 
+// TODO(jianlijianli): Add more test cases.
+TEST_F(KernelUtilTest, CheckAndPopulate) {
+  // Create input.
+  TfLiteTensor input;
+  input.type = kTfLiteInt8;
+  input.allocation_type = kTfLiteArenaRw;
+  input.dims = TfLiteIntArrayCreate(1);
+  input.dims->data[0] = 2;
+  TfLiteQuantizationParams input_quant = {0.5, 5};
+  input.params = input_quant;
+  input.quantization.type = kTfLiteAffineQuantization;
+  auto* input_params = reinterpret_cast<TfLiteAffineQuantization*>(
+      malloc(sizeof(TfLiteAffineQuantization)));
+  input_params->scale = TfLiteFloatArrayCreate(1);
+  input_params->scale->data[0] = 0.5;
+  input_params->zero_point = TfLiteIntArrayCreate(1);
+  input_params->zero_point->data[0] = 5;
+  input.quantization.params = reinterpret_cast<void*>(input_params);
+
+  // Create filter.
+  TfLiteTensor filter;
+  filter.type = kTfLiteInt8;
+  filter.allocation_type = kTfLiteArenaRw;
+  filter.dims = TfLiteIntArrayCreate(4);
+  filter.dims->data[0] = 3;
+  filter.dims->data[1] = 4;
+  filter.dims->data[2] = 5;
+  filter.dims->data[3] = 6;
+  TfLiteQuantizationParams filter_quant = {0.25, 0};
+  filter.params = filter_quant;
+  filter.quantization.type = kTfLiteAffineQuantization;
+  auto* filter_params = reinterpret_cast<TfLiteAffineQuantization*>(
+      malloc(sizeof(TfLiteAffineQuantization)));
+  filter_params->scale = TfLiteFloatArrayCreate(3);
+  filter_params->scale->data[0] = 0.25;
+  filter_params->scale->data[1] = 0.125;
+  filter_params->scale->data[2] = 0.25;
+  filter_params->zero_point = TfLiteIntArrayCreate(3);
+  filter_params->zero_point->data[0] = 0;
+  filter_params->zero_point->data[1] = 0;
+  filter_params->zero_point->data[2] = 0;
+  filter_params->quantized_dimension = 0;
+  filter.quantization.params = reinterpret_cast<void*>(filter_params);
+
+  // Create bias.
+  TfLiteTensor bias;
+  bias.type = kTfLiteInt32;
+  bias.allocation_type = kTfLiteArenaRw;
+  bias.dims = TfLiteIntArrayCreate(4);
+  TfLiteQuantizationParams bias_quant = {0.125, 9};
+  bias.params = bias_quant;
+  bias.quantization.type = kTfLiteAffineQuantization;
+  auto* bias_params = reinterpret_cast<TfLiteAffineQuantization*>(
+      malloc(sizeof(TfLiteAffineQuantization)));
+  bias_params->scale = TfLiteFloatArrayCreate(3);
+  bias_params->scale->data[0] = 0.125;
+  bias_params->scale->data[1] = 0.0625;
+  bias_params->scale->data[2] = 0.125;
+  bias_params->zero_point = TfLiteIntArrayCreate(3);
+  bias_params->zero_point->data[0] = 11;
+  bias_params->zero_point->data[1] = 12;
+  bias_params->zero_point->data[2] = 15;
+  bias.quantization.params = reinterpret_cast<void*>(bias_params);
+
+  // Create output.
+  TfLiteTensor output;
+  output.type = kTfLiteInt8;
+  output.allocation_type = kTfLiteArenaRw;
+  output.dims = nullptr;
+  TfLiteQuantizationParams output_quant = {0.5, -128};
+  output.params = output_quant;
+  output.quantization.type = kTfLiteAffineQuantization;
+  auto* output_params = reinterpret_cast<TfLiteAffineQuantization*>(
+      malloc(sizeof(TfLiteAffineQuantization)));
+  output_params->scale = TfLiteFloatArrayCreate(1);
+  output_params->scale->data[0] = 0.5;
+  output_params->zero_point = TfLiteIntArrayCreate(1);
+  output_params->zero_point->data[0] = -128;
+  output.quantization.params = reinterpret_cast<void*>(output_params);
+
+  // Create call parameters.
+  TfLiteContext context;
+  int32_t multiplier;
+  int shift;
+  int32_t output_activation_min;
+  int32_t output_activation_max;
+  std::vector<int32_t> per_channel_multiplier(3);
+  std::vector<int> per_channel_shift(3);
+
+  // Call and verify results for per channel case.
+  EXPECT_EQ(
+      kTfLiteOk,
+      PopulateConvolutionQuantizationParams(
+          &context, &input, &filter, &bias, &output, kTfLiteActRelu,
+          &multiplier, &shift, &output_activation_min, &output_activation_max,
+          per_channel_multiplier.data(), per_channel_shift.data()));
+  EXPECT_THAT(per_channel_multiplier,
+              ::testing::ElementsAre(1073741824, 1073741824, 1073741824));
+  EXPECT_THAT(per_channel_shift, ::testing::ElementsAre(-1, -2, -1));
+
+  // Release.
+  TfLiteTensorFree(&input);
+  TfLiteTensorFree(&filter);
+  TfLiteTensorFree(&bias);
+  TfLiteTensorFree(&output);
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/layer_norm_lstm.cc b/tensorflow/lite/kernels/layer_norm_lstm.cc
deleted file mode 100644
index 49e8a53c829a0c4a8ae355f8e7a6b97e3bbb81e1..0000000000000000000000000000000000000000
--- a/tensorflow/lite/kernels/layer_norm_lstm.cc
+++ /dev/null
@@ -1,1320 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Layer Normalization LSTM op that applies normalization by mean and standard
-// deviation to the activation of the LSTM layers. Please see
-// https://arxiv.org/abs/1607.06450 for details.
-#include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
-#include "tensorflow/lite/context.h"
-#include "tensorflow/lite/kernels/internal/tensor_utils.h"
-#include "tensorflow/lite/kernels/kernel_util.h"
-
-namespace tflite {
-namespace ops {
-namespace custom {
-namespace layer_norm_lstm {
-
-// Struct to hold Layer Norm LSTM option data.
-struct OpData {
-  TfLiteFusedActivation activation;
-  float cell_clip;
-  float proj_clip;
-  int scratch_tensor_index;
-};
-
-// Input Tensors of size {n_batch, n_input}
-constexpr int kInputTensor = 0;
-
-// Input weight tensors of size: {n_cell, n_input}
-constexpr int kInputToInputWeightsTensor = 1;  // Optional
-constexpr int kInputToForgetWeightsTensor = 2;
-constexpr int kInputToCellWeightsTensor = 3;
-constexpr int kInputToOutputWeightsTensor = 4;
-
-// Recurrent weight tensors of size {n_cell, n_output}
-constexpr int kRecurrentToInputWeightsTensor = 5;  // Optional
-constexpr int kRecurrentToForgetWeightsTensor = 6;
-constexpr int kRecurrentToCellWeightsTensor = 7;
-constexpr int kRecurrentToOutputWeightsTensor = 8;
-
-// Peephole weights tensors of size {n_cell}, representing a diagonal matrix.
-constexpr int kCellToInputWeightsTensor = 9;    // Optional
-constexpr int kCellToForgetWeightsTensor = 10;  // Optional
-constexpr int kCellToOutputWeightsTensor = 11;  // Optional
-
-// Layer norm weights tensors of size {n_cell}, representing a diagonal matrix.
-constexpr int kInputLayerNormWeightsTensor = 12;  // Optional
-constexpr int kForgetLayerNormWeightsTensor = 13;
-constexpr int kCellLayerNormWeightsTensor = 14;
-constexpr int kOutputLayerNormWeightsTensor = 15;
-
-// Gates bias tensors of size {n_cell}
-constexpr int kInputGateBiasTensor = 16;  // Optional
-constexpr int kForgetGateBiasTensor = 17;
-constexpr int kCellGateBiasTensor = 18;
-constexpr int kOutputGateBiasTensor = 19;
-
-// Projection weight tensor of size {n_output, n_cell}
-constexpr int kProjectionWeightsTensor = 20;  // Optional
-// Projection bias tensor of size {n_output}
-constexpr int kProjectionBiasTensor = 21;  // Optional
-
-// State tensors.
-constexpr int kInputActivationStateTensor = 22;
-constexpr int kInputCellStateTensor = 23;
-
-// Output tensor.
-constexpr int kOutputTensor = 0;
-
-// Total number of scratch tensors for hybrid Op.
-constexpr int kTensorsToAdd = 7;
-
-// Small float to avoid divergence during calculation of deviation.
-const float kLayerNormEpsilon = 1e-8;
-
-void* Init(TfLiteContext* context, const char* buffer, size_t length) {
-  auto* data = new OpData;
-
-  // Turn custom option data into flexbuffer map format.
-  const uint8_t* buffer_t = reinterpret_cast<const uint8_t*>(buffer);
-  const flexbuffers::Map& m = flexbuffers::GetRoot(buffer_t, length).AsMap();
-
-  // Get activation function, cell_clip and proj_clip from the flexbuffer.
-  // TODO(b/113824099): make activation more generic.
-  assert(m["fused_activation_function"].ToString() == "TANH");
-  data->activation = kTfLiteActTanh;
-  data->cell_clip = m["cell_clip"].AsFloat();
-  data->proj_clip = m["proj_clip"].AsFloat();
-
-  // Populate scratch_tensor_index.
-  context->AddTensors(context, /*tensors_to_add=*/kTensorsToAdd,
-                      &data->scratch_tensor_index);
-  return data;
-}
-
-// Check that input tensor dimensions matches with each other.
-TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
-                                        TfLiteNode* node, int n_input,
-                                        int n_output, int n_cell) {
-  const OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
-
-  // Making sure clipping parameters have valid values.
-  // == 0 means no clipping
-  //  > 0 means clipping
-  TF_LITE_ENSURE(context, op_data->cell_clip >= 0);
-  TF_LITE_ENSURE(context, op_data->proj_clip >= 0);
-
-  const TfLiteTensor* input_to_input_weights =
-      GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
-  const bool use_cifg = (input_to_input_weights == nullptr);
-  if (!use_cifg) {
-    TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->size, 2);
-    TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[0], n_cell);
-    TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[1], n_input);
-  }
-
-  const TfLiteTensor* input_to_forget_weights =
-      GetInput(context, node, kInputToForgetWeightsTensor);
-  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->size, 2);
-  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[0], n_cell);
-  TF_LITE_ENSURE_EQ(context, input_to_forget_weights->dims->data[1], n_input);
-
-  const TfLiteTensor* input_to_cell_weights =
-      GetInput(context, node, kInputToCellWeightsTensor);
-  TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->size, 2);
-  TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[0], n_cell);
-  TF_LITE_ENSURE_EQ(context, input_to_cell_weights->dims->data[1], n_input);
-
-  const TfLiteTensor* recurrent_to_input_weights =
-      GetOptionalInputTensor(context, node, kRecurrentToInputWeightsTensor);
-  if (use_cifg) {
-    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights, nullptr);
-  } else {
-    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->size, 2);
-    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->data[0],
-                      n_cell);
-    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->data[1],
-                      n_output);
-  }
-
-  const TfLiteTensor* recurrent_to_forget_weights =
-      GetInput(context, node, kRecurrentToForgetWeightsTensor);
-  TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->size, 2);
-  TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->data[0],
-                    n_cell);
-  TF_LITE_ENSURE_EQ(context, recurrent_to_forget_weights->dims->data[1],
-                    n_output);
-
-  const TfLiteTensor* recurrent_to_cell_weights =
-      GetInput(context, node, kRecurrentToCellWeightsTensor);
-  TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->size, 2);
-  TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[0], n_cell);
-  TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[1],
-                    n_output);
-
-  const TfLiteTensor* cell_to_input_weights =
-      GetOptionalInputTensor(context, node, kCellToInputWeightsTensor);
-  if (cell_to_input_weights) {
-    TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->size, 1);
-    TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->data[0], n_cell);
-  }
-
-  const TfLiteTensor* cell_to_forget_weights =
-      GetOptionalInputTensor(context, node, kCellToForgetWeightsTensor);
-  if (cell_to_forget_weights) {
-    TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->size, 1);
-    TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->data[0], n_cell);
-  }
-
-  const TfLiteTensor* cell_to_output_weights =
-      GetOptionalInputTensor(context, node, kCellToOutputWeightsTensor);
-  if (cell_to_output_weights) {
-    TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->size, 1);
-    TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->data[0], n_cell);
-  }
-
-  // Making sure the peephole weights are there all or none.
-  const bool peephole_weights_all_or_none =
-      ((cell_to_input_weights != nullptr || use_cifg) &&
-       (cell_to_forget_weights != nullptr) &&
-       (cell_to_output_weights != nullptr)) ||
-      ((cell_to_input_weights == nullptr) &&
-       (cell_to_forget_weights == nullptr) &&
-       (cell_to_output_weights == nullptr));
-  TF_LITE_ENSURE(context, peephole_weights_all_or_none == true);
-
-  // Making sure layer norm weights are not null and have the right dimension.
-  const TfLiteTensor* input_layer_norm_weights =
-      GetOptionalInputTensor(context, node, kInputLayerNormWeightsTensor);
-  if (use_cifg) {
-    TF_LITE_ENSURE_EQ(context, input_layer_norm_weights, nullptr);
-  } else {
-    TF_LITE_ENSURE(context, input_layer_norm_weights != nullptr);
-    TF_LITE_ENSURE_EQ(context, input_layer_norm_weights->dims->size, 1);
-    TF_LITE_ENSURE_EQ(context, input_layer_norm_weights->dims->data[0], n_cell);
-  }
-
-  const TfLiteTensor* forget_layer_norm_weights =
-      GetInput(context, node, kForgetLayerNormWeightsTensor);
-  TF_LITE_ENSURE(context, forget_layer_norm_weights != nullptr);
-  TF_LITE_ENSURE_EQ(context, forget_layer_norm_weights->dims->size, 1);
-  TF_LITE_ENSURE_EQ(context, forget_layer_norm_weights->dims->data[0], n_cell);
-
-  const TfLiteTensor* cell_layer_norm_weights =
-      GetInput(context, node, kCellLayerNormWeightsTensor);
-  TF_LITE_ENSURE(context, cell_layer_norm_weights != nullptr);
-  TF_LITE_ENSURE_EQ(context, cell_layer_norm_weights->dims->size, 1);
-  TF_LITE_ENSURE_EQ(context, cell_layer_norm_weights->dims->data[0], n_cell);
-
-  const TfLiteTensor* output_layer_norm_weights =
-      GetInput(context, node, kOutputLayerNormWeightsTensor);
-  TF_LITE_ENSURE(context, output_layer_norm_weights != nullptr);
-  TF_LITE_ENSURE_EQ(context, output_layer_norm_weights->dims->size, 1);
-  TF_LITE_ENSURE_EQ(context, output_layer_norm_weights->dims->data[0], n_cell);
-
-  // Make sure the input gate bias is present only when not a CIFG-LSTM.
-  const TfLiteTensor* input_gate_bias =
-      GetOptionalInputTensor(context, node, kInputGateBiasTensor);
-  if (use_cifg) {
-    TF_LITE_ENSURE_EQ(context, input_gate_bias, nullptr);
-  } else {
-    TF_LITE_ENSURE_EQ(context, input_gate_bias->dims->size, 1);
-    TF_LITE_ENSURE_EQ(context, input_gate_bias->dims->data[0], n_cell);
-  }
-
-  const TfLiteTensor* forget_gate_bias =
-      GetInput(context, node, kForgetGateBiasTensor);
-  TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->size, 1);
-  TF_LITE_ENSURE_EQ(context, forget_gate_bias->dims->data[0], n_cell);
-
-  const TfLiteTensor* cell_bias = GetInput(context, node, kCellGateBiasTensor);
-  TF_LITE_ENSURE_EQ(context, cell_bias->dims->size, 1);
-  TF_LITE_ENSURE_EQ(context, cell_bias->dims->data[0], n_cell);
-
-  const TfLiteTensor* output_gate_bias =
-      GetInput(context, node, kOutputGateBiasTensor);
-  TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->size, 1);
-  TF_LITE_ENSURE_EQ(context, output_gate_bias->dims->data[0], n_cell);
-
-  const TfLiteTensor* projection_weights =
-      GetOptionalInputTensor(context, node, kProjectionWeightsTensor);
-  if (projection_weights != nullptr) {
-    TF_LITE_ENSURE_EQ(context, projection_weights->dims->size, 2);
-    TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[0], n_output);
-    TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[1], n_cell);
-  }
-
-  const TfLiteTensor* projection_bias =
-      GetOptionalInputTensor(context, node, kProjectionBiasTensor);
-  if (projection_bias != nullptr) {
-    TF_LITE_ENSURE_EQ(context, projection_bias->dims->size, 1);
-    TF_LITE_ENSURE_EQ(context, projection_bias->dims->data[0], n_output);
-  }
-
-  // Making sure the projection tensors are consistent:
-  // 1) If projection weight is not present, then projection bias should not be
-  // present.
-  // 2) If projection weight is present, then projection bias is optional.
-  const bool projection_tensors_consistent =
-      ((projection_weights != nullptr) || (projection_bias == nullptr));
-  TF_LITE_ENSURE(context, projection_tensors_consistent == true);
-
-  return kTfLiteOk;
-}
-
-// Resize the output, state tensors based on the sizes of the input tensors.
-// Allocate a temporary scratch tensor. Also check that the sizes of the input
-// tensors match each other.
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
-  TF_LITE_ENSURE_EQ(context, node->inputs->size, 24);
-  TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
-
-  // Inferring batch size, number of outputs and number of cells from the
-  // input tensors.
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
-  TF_LITE_ENSURE(context, input->dims->size > 1);
-  const int n_batch = input->dims->data[0];
-  const int n_input = input->dims->data[1];
-
-  const TfLiteTensor* input_to_output_weights =
-      GetInput(context, node, kInputToOutputWeightsTensor);
-  const int n_cell = input_to_output_weights->dims->data[0];
-  TF_LITE_ENSURE_EQ(context, input_to_output_weights->dims->size, 2);
-  TF_LITE_ENSURE_EQ(context, input_to_output_weights->dims->data[1], n_input);
-
-  const TfLiteTensor* recurrent_to_output_weights =
-      GetInput(context, node, kRecurrentToOutputWeightsTensor);
-  TF_LITE_ENSURE_EQ(context, recurrent_to_output_weights->dims->size, 2);
-  TF_LITE_ENSURE_EQ(context, recurrent_to_output_weights->dims->data[0],
-                    n_cell);
-  const int n_output = recurrent_to_output_weights->dims->data[1];
-
-  // Check that input tensor dimensions matches with each other.
-  TF_LITE_ENSURE_OK(context, CheckInputTensorDimensions(context, node, n_input,
-                                                        n_output, n_cell));
-
-  // Get the pointer to output, activation_state and cell_state tensors.
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-
-  const TfLiteTensor* activation_state =
-      GetInput(context, node, kInputActivationStateTensor);
-  const TfLiteTensor* cell_state =
-      GetInput(context, node, kInputCellStateTensor);
-
-  // Check the shape of input state tensors.
-  // These tensor may be 1D or 2D. It's fine as long as the total size is
-  // correct.
-  TF_LITE_ENSURE_EQ(context, NumElements(activation_state), n_batch * n_output);
-  TF_LITE_ENSURE_EQ(context, NumElements(cell_state), n_batch * n_cell);
-  // Resize the output tensors.
-  TfLiteIntArray* output_size = TfLiteIntArrayCreate(2);
-  output_size->data[0] = n_batch;
-  output_size->data[1] = n_output;
-  TF_LITE_ENSURE_OK(context,
-                    context->ResizeTensor(context, output, output_size));
-
-  // The weights are of consistent type, so it suffices to check one.
-  const bool is_hybrid_op = (input_to_output_weights->type == kTfLiteUInt8 &&
-                             input->type == kTfLiteFloat32);
-
-  TfLiteIntArrayFree(node->temporaries);
-  if (is_hybrid_op) {
-    node->temporaries = TfLiteIntArrayCreate(7);
-  } else {
-    node->temporaries = TfLiteIntArrayCreate(1);
-  }
-  node->temporaries->data[0] = op_data->scratch_tensor_index;
-
-  // Create a scratch buffer tensor.
-  TfLiteTensor* scratch_buffer = GetTemporary(context, node, /*index=*/0);
-  scratch_buffer->type = input->type;
-  scratch_buffer->allocation_type = kTfLiteArenaRw;
-
-  const TfLiteTensor* input_to_input_weights =
-      GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
-  const bool use_cifg = (input_to_input_weights == nullptr);
-  TfLiteIntArray* scratch_buffer_size = TfLiteIntArrayCreate(2);
-  scratch_buffer_size->data[0] = n_batch;
-  if (use_cifg) {
-    // Reserving space for Cell, Forget, Output gates
-    scratch_buffer_size->data[1] = n_cell * 3;
-  } else {
-    // Reserving space for Input, Cell, Forget, Output gates
-    scratch_buffer_size->data[1] = n_cell * 4;
-  }
-  TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scratch_buffer,
-                                                   scratch_buffer_size));
-
-  if (is_hybrid_op) {
-    // Allocate temporary tensors to store quantized values of input,
-    // activation_state and cell_state tensors.
-    node->temporaries->data[1] = op_data->scratch_tensor_index + 1;
-    TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/1);
-    input_quantized->type = kTfLiteUInt8;
-    input_quantized->allocation_type = kTfLiteArenaRw;
-    if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) {
-      TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims);
-      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_quantized,
-                                                       input_quantized_size));
-    }
-    node->temporaries->data[2] = op_data->scratch_tensor_index + 2;
-    TfLiteTensor* activation_state_quantized =
-        GetTemporary(context, node, /*index=*/2);
-    activation_state_quantized->type = kTfLiteUInt8;
-    activation_state_quantized->allocation_type = kTfLiteArenaRw;
-    if (!TfLiteIntArrayEqual(activation_state_quantized->dims,
-                             activation_state->dims)) {
-      TfLiteIntArray* activation_state_quantized_size =
-          TfLiteIntArrayCopy(activation_state->dims);
-      TF_LITE_ENSURE_OK(
-          context, context->ResizeTensor(context, activation_state_quantized,
-                                         activation_state_quantized_size));
-    }
-    node->temporaries->data[3] = op_data->scratch_tensor_index + 3;
-    TfLiteTensor* cell_state_quantized =
-        GetTemporary(context, node, /*index=*/3);
-    cell_state_quantized->type = kTfLiteUInt8;
-    cell_state_quantized->allocation_type = kTfLiteArenaRw;
-    if (!TfLiteIntArrayEqual(cell_state_quantized->dims, cell_state->dims)) {
-      TfLiteIntArray* cell_state_quantized_size =
-          TfLiteIntArrayCopy(cell_state->dims);
-      TF_LITE_ENSURE_OK(context,
-                        context->ResizeTensor(context, cell_state_quantized,
-                                              cell_state_quantized_size));
-    }
-
-    // Allocate temporary tensors to store scaling factors and product scaling
-    // factors. The latter is a convenience storage which allows to quantize
-    // a vector once (which produces the scaling factors) and multiply it with
-    // different matrices (which requires multiplying the scaling factors with
-    // the scaling factor of the matrix).
-    node->temporaries->data[4] = op_data->scratch_tensor_index + 4;
-    TfLiteTensor* scaling_factors = GetTemporary(context, node, /*index=*/4);
-    scaling_factors->type = kTfLiteFloat32;
-    scaling_factors->allocation_type = kTfLiteArenaRw;
-    int scaling_dims[1] = {n_batch};
-    if (!TfLiteIntArrayEqualsArray(scaling_factors->dims, 1, scaling_dims)) {
-      TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1);
-      scaling_factors_size->data[0] = n_batch;
-      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors,
-                                                       scaling_factors_size));
-    }
-    node->temporaries->data[5] = op_data->scratch_tensor_index + 5;
-    TfLiteTensor* prod_scaling_factors =
-        GetTemporary(context, node, /*index=*/5);
-    prod_scaling_factors->type = kTfLiteFloat32;
-    prod_scaling_factors->allocation_type = kTfLiteArenaRw;
-    if (!TfLiteIntArrayEqualsArray(prod_scaling_factors->dims, 1,
-                                   scaling_dims)) {
-      TfLiteIntArray* prod_scaling_factors_size = TfLiteIntArrayCreate(1);
-      prod_scaling_factors_size->data[0] = n_batch;
-      TF_LITE_ENSURE_OK(context,
-                        context->ResizeTensor(context, prod_scaling_factors,
-                                              prod_scaling_factors_size));
-    }
-
-    // Allocate a temporary tensor to store the recovered weights. Since
-    // this is used for diagonal matrices, only need to store n_cell values.
-    node->temporaries->data[6] = op_data->scratch_tensor_index + 6;
-    TfLiteTensor* recovered_weights = GetTemporary(context, node, /*index=*/6);
-    recovered_weights->type = kTfLiteFloat32;
-    recovered_weights->allocation_type = kTfLiteArenaRw;
-    int recovered_dims[1] = {n_cell};
-    if (!TfLiteIntArrayEqualsArray(recovered_weights->dims, 1,
-                                   recovered_dims)) {
-      TfLiteIntArray* recovered_weights_size = TfLiteIntArrayCreate(1);
-      recovered_weights_size->data[0] = n_cell;
-      TF_LITE_ENSURE_OK(context,
-                        context->ResizeTensor(context, recovered_weights,
-                                              recovered_weights_size));
-    }
-  }
-  return kTfLiteOk;
-}
-
-void LayerNormLstmStep(
-    const float* input_ptr_batch, const float* input_to_input_weights_ptr,
-    const float* input_to_forget_weights_ptr,
-    const float* input_to_cell_weights_ptr,
-    const float* input_to_output_weights_ptr,
-    const float* recurrent_to_input_weights_ptr,
-    const float* recurrent_to_forget_weights_ptr,
-    const float* recurrent_to_cell_weights_ptr,
-    const float* recurrent_to_output_weights_ptr,
-    const float* cell_to_input_weights_ptr,
-    const float* cell_to_forget_weights_ptr,
-    const float* cell_to_output_weights_ptr,
-    const float* input_layer_norm_weight_ptr,
-    const float* forget_layer_norm_weight_ptr,
-    const float* cell_layer_norm_weight_ptr,
-    const float* output_layer_norm_weight_ptr, const float* input_gate_bias_ptr,
-    const float* forget_gate_bias_ptr, const float* cell_bias_ptr,
-    const float* output_gate_bias_ptr, const float* projection_weights_ptr,
-    const float* projection_bias_ptr, float cell_clip, float proj_clip,
-    const TfLiteFusedActivation& activation, int n_batch, int n_cell,
-    int n_input, int n_output, float* output_state_ptr, float* cell_state_ptr,
-    float* input_gate_scratch, float* forget_gate_scratch, float* cell_scratch,
-    float* output_gate_scratch, float* output_ptr_batch) {
-  // Since we have already checked that weights are all there or none, we can
-  // check the existense of only one to the get the condition.
-  const bool use_cifg = (input_to_input_weights_ptr == nullptr);
-  const bool use_peephole = (cell_to_output_weights_ptr != nullptr);
-
-  // Initialize scratch buffers with 0.
-  if (!use_cifg) {
-    tensor_utils::ZeroVector(input_gate_scratch, n_cell * n_batch);
-  }
-  tensor_utils::ZeroVector(forget_gate_scratch, n_cell * n_batch);
-  tensor_utils::ZeroVector(cell_scratch, n_cell * n_batch);
-  tensor_utils::ZeroVector(output_gate_scratch, n_cell * n_batch);
-
-  // For each batch and cell: compute input_weight * input.
-  if (!use_cifg) {
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        input_to_input_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch,
-        input_gate_scratch, /*result_stride=*/1);
-  }
-
-  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-      input_to_forget_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch,
-      forget_gate_scratch, /*result_stride=*/1);
-  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-      input_to_cell_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch,
-      cell_scratch, /*result_stride=*/1);
-  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-      input_to_output_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch,
-      output_gate_scratch, /*result_stride=*/1);
-
-  // For each batch and cell: compute recurrent_weight * output_state.
-  if (!use_cifg) {
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        recurrent_to_input_weights_ptr, n_cell, n_output, output_state_ptr,
-        n_batch, input_gate_scratch, /*result_stride=*/1);
-  }
-  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-      recurrent_to_forget_weights_ptr, n_cell, n_output, output_state_ptr,
-      n_batch, forget_gate_scratch,
-      /*result_stride=*/1);
-  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-      recurrent_to_cell_weights_ptr, n_cell, n_output, output_state_ptr,
-      n_batch, cell_scratch, /*result_stride=*/1);
-  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-      recurrent_to_output_weights_ptr, n_cell, n_output, output_state_ptr,
-      n_batch, output_gate_scratch,
-      /*result_stride=*/1);
-
-  // For each batch and cell: update input gate.
-  if (!use_cifg) {
-    if (use_peephole) {
-      tensor_utils::VectorBatchVectorCwiseProductAccumulate(
-          cell_to_input_weights_ptr, n_cell, cell_state_ptr, n_batch,
-          input_gate_scratch);
-    }
-    tensor_utils::MeanStddevNormalization(input_gate_scratch,
-                                          input_gate_scratch, n_cell, n_batch,
-                                          kLayerNormEpsilon);
-    tensor_utils::VectorBatchVectorCwiseProduct(input_layer_norm_weight_ptr,
-                                                n_cell, input_gate_scratch,
-                                                n_batch, input_gate_scratch);
-    tensor_utils::VectorBatchVectorAdd(input_gate_bias_ptr, n_cell, n_batch,
-                                       input_gate_scratch);
-    tensor_utils::ApplySigmoidToVector(input_gate_scratch, n_cell * n_batch,
-                                       input_gate_scratch);
-  }
-
-  // For each batch and cell: update forget gate.
-  if (use_peephole) {
-    tensor_utils::VectorBatchVectorCwiseProductAccumulate(
-        cell_to_forget_weights_ptr, n_cell, cell_state_ptr, n_batch,
-        forget_gate_scratch);
-  }
-  tensor_utils::MeanStddevNormalization(forget_gate_scratch,
-                                        forget_gate_scratch, n_cell, n_batch,
-                                        kLayerNormEpsilon);
-  tensor_utils::VectorBatchVectorCwiseProduct(forget_layer_norm_weight_ptr,
-                                              n_cell, forget_gate_scratch,
-                                              n_batch, forget_gate_scratch);
-  tensor_utils::VectorBatchVectorAdd(forget_gate_bias_ptr, n_cell, n_batch,
-                                     forget_gate_scratch);
-  tensor_utils::ApplySigmoidToVector(forget_gate_scratch, n_cell * n_batch,
-                                     forget_gate_scratch);
-
-  // For each batch and cell: update the cell.
-  tensor_utils::MeanStddevNormalization(cell_scratch, cell_scratch, n_cell,
-                                        n_batch, kLayerNormEpsilon);
-  tensor_utils::VectorBatchVectorCwiseProduct(
-      cell_layer_norm_weight_ptr, n_cell, cell_scratch, n_batch, cell_scratch);
-  tensor_utils::VectorBatchVectorAdd(cell_bias_ptr, n_cell, n_batch,
-                                     cell_scratch);
-  tensor_utils::VectorVectorCwiseProduct(forget_gate_scratch, cell_state_ptr,
-                                         n_batch * n_cell, cell_state_ptr);
-  tensor_utils::ApplyActivationToVector(cell_scratch, n_batch * n_cell,
-                                        activation, cell_scratch);
-  if (use_cifg) {
-    tensor_utils::Sub1Vector(forget_gate_scratch, n_batch * n_cell,
-                             forget_gate_scratch);
-    tensor_utils::VectorVectorCwiseProductAccumulate(
-        cell_scratch, forget_gate_scratch, n_batch * n_cell, cell_state_ptr);
-  } else {
-    tensor_utils::VectorVectorCwiseProductAccumulate(
-        cell_scratch, input_gate_scratch, n_batch * n_cell, cell_state_ptr);
-  }
-  if (cell_clip > 0.0) {
-    tensor_utils::ClipVector(cell_state_ptr, n_batch * n_cell, cell_clip,
-                             cell_state_ptr);
-  }
-
-  // For each batch and cell: update the output gate.
-  if (use_peephole) {
-    tensor_utils::VectorBatchVectorCwiseProductAccumulate(
-        cell_to_output_weights_ptr, n_cell, cell_state_ptr, n_batch,
-        output_gate_scratch);
-  }
-  tensor_utils::MeanStddevNormalization(output_gate_scratch,
-                                        output_gate_scratch, n_cell, n_batch,
-                                        kLayerNormEpsilon);
-  tensor_utils::VectorBatchVectorCwiseProduct(output_layer_norm_weight_ptr,
-                                              n_cell, output_gate_scratch,
-                                              n_batch, output_gate_scratch);
-  tensor_utils::VectorBatchVectorAdd(output_gate_bias_ptr, n_cell, n_batch,
-                                     output_gate_scratch);
-  tensor_utils::ApplySigmoidToVector(output_gate_scratch, n_batch * n_cell,
-                                     output_gate_scratch);
-  tensor_utils::ApplyActivationToVector(cell_state_ptr, n_batch * n_cell,
-                                        activation, cell_scratch);
-  tensor_utils::VectorVectorCwiseProduct(output_gate_scratch, cell_scratch,
-                                         n_batch * n_cell, output_gate_scratch);
-
-  // For each batch: update the projection and output_state.
-  const bool use_projection_weight = (projection_weights_ptr != nullptr);
-  const bool use_projection_bias = (projection_bias_ptr != nullptr);
-  if (use_projection_weight) {
-    if (use_projection_bias) {
-      tensor_utils::VectorBatchVectorAssign(projection_bias_ptr, n_output,
-                                            n_batch, output_ptr_batch);
-    } else {
-      tensor_utils::ZeroVector(output_ptr_batch, n_batch * n_output);
-    }
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        projection_weights_ptr, n_output, n_cell, output_gate_scratch, n_batch,
-        output_ptr_batch, /*result_stride=*/1);
-    if (proj_clip > 0.0) {
-      tensor_utils::ClipVector(output_ptr_batch, n_batch * n_output, proj_clip,
-                               output_ptr_batch);
-    }
-  } else {
-    tensor_utils::CopyVector(output_gate_scratch, n_batch * n_output,
-                             output_ptr_batch);
-  }
-  tensor_utils::CopyVector(output_ptr_batch, n_batch * n_output,
-                           output_state_ptr);
-}
-
-void LayerNormLstmStep(
-    const float* input_ptr_batch, const int8_t* input_to_input_weights_ptr,
-    float input_to_input_weights_scale,
-    const int8_t* input_to_forget_weights_ptr,
-    float input_to_forget_weights_scale,
-    const int8_t* input_to_cell_weights_ptr, float input_to_cell_weights_scale,
-    const int8_t* input_to_output_weights_ptr,
-    float input_to_output_weights_scale,
-    const int8_t* recurrent_to_input_weights_ptr,
-    float recurrent_to_input_weights_scale,
-    const int8_t* recurrent_to_forget_weights_ptr,
-    float recurrent_to_forget_weights_scale,
-    const int8_t* recurrent_to_cell_weights_ptr,
-    float recurrent_to_cell_weights_scale,
-    const int8_t* recurrent_to_output_weights_ptr,
-    float recurrent_to_output_weights_scale,
-    const int8_t* cell_to_input_weights_ptr, float cell_to_input_weights_scale,
-    const int8_t* cell_to_forget_weights_ptr,
-    float cell_to_forget_weights_scale,
-    const int8_t* cell_to_output_weights_ptr,
-    float cell_to_output_weights_scale,
-    const float* input_layer_norm_weight_ptr,
-    const float* forget_layer_norm_weight_ptr,
-    const float* cell_layer_norm_weight_ptr,
-    const float* output_layer_norm_weight_ptr, const float* input_gate_bias_ptr,
-    const float* forget_gate_bias_ptr, const float* cell_bias_ptr,
-    const float* output_gate_bias_ptr, const int8_t* projection_weights_ptr,
-    float projection_weights_scale, const float* projection_bias_ptr,
-    float cell_clip, float proj_clip, const TfLiteFusedActivation& activation,
-    int n_batch, int n_cell, int n_input, int n_output,
-    float* input_gate_scratch, float* forget_gate_scratch, float* cell_scratch,
-    float* output_gate_scratch, float* scaling_factors,
-    float* product_scaling_factors, float* recovered_weights,
-    int8_t* quantized_input_ptr_batch, int8_t* quantized_output_state_ptr,
-    int8_t* quantized_cell_state_ptr, float* output_state_ptr,
-    float* cell_state_ptr, float* output_ptr_batch) {
-  // Since we have already checked that weights are all there or none, we can
-  // check the existense of only one to the get the condition.
-  const bool use_cifg = (input_to_input_weights_ptr == nullptr);
-  const bool use_peephole = (cell_to_output_weights_ptr != nullptr);
-
-  // Initialize scratch buffers with 0.
-  if (!use_cifg) {
-    tensor_utils::ZeroVector(input_gate_scratch, n_cell * n_batch);
-  }
-  tensor_utils::ZeroVector(forget_gate_scratch, n_cell * n_batch);
-  tensor_utils::ZeroVector(cell_scratch, n_cell * n_batch);
-  tensor_utils::ZeroVector(output_gate_scratch, n_cell * n_batch);
-
-  if (!tensor_utils::IsZeroVector(input_ptr_batch, n_batch * n_input)) {
-    // Save quantization and matmul computation for all zero input.
-    float unused_min, unused_max;
-    for (int b = 0; b < n_batch; ++b) {
-      const int offset = b * n_input;
-      tensor_utils::SymmetricQuantizeFloats(
-          input_ptr_batch + offset, n_input, quantized_input_ptr_batch + offset,
-          &unused_min, &unused_max, &scaling_factors[b]);
-    }
-    // For each batch and cell: compute input_weight * input.
-    if (!use_cifg) {
-      for (int b = 0; b < n_batch; ++b) {
-        product_scaling_factors[b] =
-            scaling_factors[b] * input_to_input_weights_scale;
-      }
-      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-          input_to_input_weights_ptr, n_cell, n_input,
-          quantized_input_ptr_batch, product_scaling_factors, n_batch,
-          input_gate_scratch, /*result_stride=*/1);
-    }
-
-    for (int b = 0; b < n_batch; ++b) {
-      product_scaling_factors[b] =
-          scaling_factors[b] * input_to_forget_weights_scale;
-    }
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        input_to_forget_weights_ptr, n_cell, n_input, quantized_input_ptr_batch,
-        product_scaling_factors, n_batch, forget_gate_scratch,
-        /*result_stride=*/1);
-
-    for (int b = 0; b < n_batch; ++b) {
-      product_scaling_factors[b] =
-          scaling_factors[b] * input_to_cell_weights_scale;
-    }
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        input_to_cell_weights_ptr, n_cell, n_input, quantized_input_ptr_batch,
-        product_scaling_factors, n_batch, cell_scratch, /*result_stride=*/1);
-
-    for (int b = 0; b < n_batch; ++b) {
-      product_scaling_factors[b] =
-          scaling_factors[b] * input_to_output_weights_scale;
-    }
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        input_to_output_weights_ptr, n_cell, n_input, quantized_input_ptr_batch,
-        product_scaling_factors, n_batch, output_gate_scratch,
-        /*result_stride=*/1);
-  }
-
-  if (!tensor_utils::IsZeroVector(output_state_ptr, n_batch * n_output)) {
-    // Save quantization and matmul computation for all zero input.
-    float unused_min, unused_max;
-    for (int b = 0; b < n_batch; ++b) {
-      const int offset = b * n_output;
-      tensor_utils::SymmetricQuantizeFloats(output_state_ptr + offset, n_output,
-                                            quantized_output_state_ptr + offset,
-                                            &unused_min, &unused_max,
-                                            &scaling_factors[b]);
-    }
-    // For each batch and cell: compute recurrent_weight * output_state.
-    if (!use_cifg) {
-      for (int b = 0; b < n_batch; ++b) {
-        product_scaling_factors[b] =
-            scaling_factors[b] * recurrent_to_input_weights_scale;
-      }
-      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-          recurrent_to_input_weights_ptr, n_cell, n_output,
-          quantized_output_state_ptr, product_scaling_factors, n_batch,
-          input_gate_scratch, /*result_stride=*/1);
-    }
-
-    for (int b = 0; b < n_batch; ++b) {
-      product_scaling_factors[b] =
-          scaling_factors[b] * recurrent_to_forget_weights_scale;
-    }
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        recurrent_to_forget_weights_ptr, n_cell, n_output,
-        quantized_output_state_ptr, product_scaling_factors, n_batch,
-        forget_gate_scratch, /*result_stride=*/1);
-
-    for (int b = 0; b < n_batch; ++b) {
-      product_scaling_factors[b] =
-          scaling_factors[b] * recurrent_to_cell_weights_scale;
-    }
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        recurrent_to_cell_weights_ptr, n_cell, n_output,
-        quantized_output_state_ptr, product_scaling_factors, n_batch,
-        cell_scratch, /*result_stride=*/1);
-
-    for (int b = 0; b < n_batch; ++b) {
-      product_scaling_factors[b] =
-          scaling_factors[b] * recurrent_to_output_weights_scale;
-    }
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        recurrent_to_output_weights_ptr, n_cell, n_output,
-        quantized_output_state_ptr, product_scaling_factors, n_batch,
-        output_gate_scratch, /*result_stride=*/1);
-  }
-
-  // Save quantization and matmul computation for all zero input.
-  bool is_cell_state_all_zeros =
-      tensor_utils::IsZeroVector(cell_state_ptr, n_batch * n_cell);
-
-  // For each batch and cell: update input gate.
-  if (!use_cifg) {
-    if (use_peephole && !is_cell_state_all_zeros) {
-      tensor_utils::VectorScalarMultiply(cell_to_input_weights_ptr, n_cell,
-                                         cell_to_input_weights_scale,
-                                         recovered_weights);
-      tensor_utils::VectorBatchVectorCwiseProductAccumulate(
-          recovered_weights, n_cell, cell_state_ptr, n_batch,
-          input_gate_scratch);
-    }
-    tensor_utils::MeanStddevNormalization(input_gate_scratch,
-                                          input_gate_scratch, n_cell, n_batch,
-                                          kLayerNormEpsilon);
-    tensor_utils::VectorBatchVectorCwiseProduct(input_layer_norm_weight_ptr,
-                                                n_cell, input_gate_scratch,
-                                                n_batch, input_gate_scratch);
-    tensor_utils::VectorBatchVectorAdd(input_gate_bias_ptr, n_cell, n_batch,
-                                       input_gate_scratch);
-    tensor_utils::ApplySigmoidToVector(input_gate_scratch, n_cell * n_batch,
-                                       input_gate_scratch);
-  }
-
-  // For each batch and cell: update forget gate.
-  if (use_peephole && !is_cell_state_all_zeros) {
-    tensor_utils::VectorScalarMultiply(cell_to_forget_weights_ptr, n_cell,
-                                       cell_to_forget_weights_scale,
-                                       recovered_weights);
-    tensor_utils::VectorBatchVectorCwiseProductAccumulate(
-        recovered_weights, n_cell, cell_state_ptr, n_batch,
-        forget_gate_scratch);
-  }
-  tensor_utils::MeanStddevNormalization(forget_gate_scratch,
-                                        forget_gate_scratch, n_cell, n_batch,
-                                        kLayerNormEpsilon);
-  tensor_utils::VectorBatchVectorCwiseProduct(forget_layer_norm_weight_ptr,
-                                              n_cell, forget_gate_scratch,
-                                              n_batch, forget_gate_scratch);
-  tensor_utils::VectorBatchVectorAdd(forget_gate_bias_ptr, n_cell, n_batch,
-                                     forget_gate_scratch);
-  tensor_utils::ApplySigmoidToVector(forget_gate_scratch, n_cell * n_batch,
-                                     forget_gate_scratch);
-
-  // For each batch and cell: update the cell.
-  tensor_utils::MeanStddevNormalization(cell_scratch, cell_scratch, n_cell,
-                                        n_batch, kLayerNormEpsilon);
-  tensor_utils::VectorBatchVectorCwiseProduct(
-      cell_layer_norm_weight_ptr, n_cell, cell_scratch, n_batch, cell_scratch);
-  tensor_utils::VectorBatchVectorAdd(cell_bias_ptr, n_cell, n_batch,
-                                     cell_scratch);
-  tensor_utils::VectorVectorCwiseProduct(forget_gate_scratch, cell_state_ptr,
-                                         n_batch * n_cell, cell_state_ptr);
-  tensor_utils::ApplyActivationToVector(cell_scratch, n_batch * n_cell,
-                                        activation, cell_scratch);
-  if (use_cifg) {
-    tensor_utils::Sub1Vector(forget_gate_scratch, n_batch * n_cell,
-                             forget_gate_scratch);
-    tensor_utils::VectorVectorCwiseProductAccumulate(
-        cell_scratch, forget_gate_scratch, n_batch * n_cell, cell_state_ptr);
-  } else {
-    tensor_utils::VectorVectorCwiseProductAccumulate(
-        cell_scratch, input_gate_scratch, n_batch * n_cell, cell_state_ptr);
-  }
-  if (cell_clip > 0.0) {
-    tensor_utils::ClipVector(cell_state_ptr, n_batch * n_cell, cell_clip,
-                             cell_state_ptr);
-  }
-
-  is_cell_state_all_zeros =
-      tensor_utils::IsZeroVector(cell_state_ptr, n_batch * n_cell);
-  // For each batch and cell: update the output gate.
-  if (use_peephole && !is_cell_state_all_zeros) {
-    tensor_utils::VectorScalarMultiply(cell_to_output_weights_ptr, n_cell,
-                                       cell_to_output_weights_scale,
-                                       recovered_weights);
-    tensor_utils::VectorBatchVectorCwiseProductAccumulate(
-        recovered_weights, n_cell, cell_state_ptr, n_batch,
-        output_gate_scratch);
-  }
-  tensor_utils::MeanStddevNormalization(output_gate_scratch,
-                                        output_gate_scratch, n_cell, n_batch,
-                                        kLayerNormEpsilon);
-  tensor_utils::VectorBatchVectorCwiseProduct(output_layer_norm_weight_ptr,
-                                              n_cell, output_gate_scratch,
-                                              n_batch, output_gate_scratch);
-  tensor_utils::VectorBatchVectorAdd(output_gate_bias_ptr, n_cell, n_batch,
-                                     output_gate_scratch);
-  tensor_utils::ApplySigmoidToVector(output_gate_scratch, n_batch * n_cell,
-                                     output_gate_scratch);
-  tensor_utils::ApplyActivationToVector(cell_state_ptr, n_batch * n_cell,
-                                        activation, cell_scratch);
-  tensor_utils::VectorVectorCwiseProduct(output_gate_scratch, cell_scratch,
-                                         n_batch * n_cell, output_gate_scratch);
-
-  // For each batch: update the projection and output_state.
-  const bool use_projection_weight = (projection_weights_ptr != nullptr);
-  const bool use_projection_bias = (projection_bias_ptr != nullptr);
-  if (use_projection_weight) {
-    if (use_projection_bias) {
-      tensor_utils::VectorBatchVectorAssign(projection_bias_ptr, n_output,
-                                            n_batch, output_ptr_batch);
-    } else {
-      tensor_utils::ZeroVector(output_ptr_batch, n_batch * n_output);
-    }
-    if (!tensor_utils::IsZeroVector(output_gate_scratch, n_batch * n_cell)) {
-      // Save quantization and matmul computation for all zero input.
-      float unused_min, unused_max;
-      for (int b = 0; b < n_batch; ++b) {
-        const int offset = b * n_cell;
-        tensor_utils::SymmetricQuantizeFloats(
-            output_gate_scratch + offset, n_cell,
-            quantized_cell_state_ptr + offset, &unused_min, &unused_max,
-            &scaling_factors[b]);
-      }
-      for (int b = 0; b < n_batch; ++b) {
-        product_scaling_factors[b] =
-            scaling_factors[b] * projection_weights_scale;
-      }
-      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-          projection_weights_ptr, n_output, n_cell, quantized_cell_state_ptr,
-          product_scaling_factors, n_batch, output_ptr_batch,
-          /*result_stride=*/1);
-    }
-    if (proj_clip > 0.0) {
-      tensor_utils::ClipVector(output_ptr_batch, n_batch * n_output, proj_clip,
-                               output_ptr_batch);
-    }
-  } else {
-    tensor_utils::CopyVector(output_gate_scratch, n_batch * n_output,
-                             output_ptr_batch);
-  }
-  tensor_utils::CopyVector(output_ptr_batch, n_batch * n_output,
-                           output_state_ptr);
-}
-
-// The LayerNormLSTM Op engine.
-TfLiteStatus EvalFloat(
-    const TfLiteTensor* input, const TfLiteTensor* input_to_input_weights,
-    const TfLiteTensor* input_to_forget_weights,
-    const TfLiteTensor* input_to_cell_weights,
-    const TfLiteTensor* input_to_output_weights,
-    const TfLiteTensor* recurrent_to_input_weights,
-    const TfLiteTensor* recurrent_to_forget_weights,
-    const TfLiteTensor* recurrent_to_cell_weights,
-    const TfLiteTensor* recurrent_to_output_weights,
-    const TfLiteTensor* cell_to_input_weights,
-    const TfLiteTensor* cell_to_forget_weights,
-    const TfLiteTensor* cell_to_output_weights,
-    const TfLiteTensor* input_layer_norm_weights,
-    const TfLiteTensor* forget_layer_norm_weights,
-    const TfLiteTensor* cell_layer_norm_weights,
-    const TfLiteTensor* output_layer_norm_weights,
-    const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias,
-    const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias,
-    const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias,
-    float cell_clip, float proj_clip, const TfLiteFusedActivation& activation,
-    TfLiteTensor* scratch_buffer, TfLiteTensor* activation_state,
-    TfLiteTensor* cell_state, TfLiteTensor* output) {
-  const int n_batch = input->dims->data[0];
-  const int n_input = input->dims->data[1];
-  // n_cell and n_output will be the same size when there is no projection.
-  const int n_cell = input_to_output_weights->dims->data[0];
-  const int n_output = recurrent_to_output_weights->dims->data[1];
-
-  // Since we have already checked that weights are all there or none, we can
-  // check the existence of only one to get the condition.
-  const bool use_cifg = (input_to_input_weights == nullptr);
-  const bool use_peephole = (cell_to_output_weights != nullptr);
-
-  float* input_gate_scratch = nullptr;
-  float* cell_scratch = nullptr;
-  float* forget_gate_scratch = nullptr;
-  float* output_gate_scratch = nullptr;
-  if (use_cifg) {
-    cell_scratch = scratch_buffer->data.f;
-    forget_gate_scratch = scratch_buffer->data.f + n_cell * n_batch;
-    output_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
-  } else {
-    input_gate_scratch = scratch_buffer->data.f;
-    cell_scratch = scratch_buffer->data.f + n_cell * n_batch;
-    forget_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
-    output_gate_scratch = scratch_buffer->data.f + 3 * n_cell * n_batch;
-  }
-
-  // Check optional tensors, the respective pointers can be null.
-  const float* input_to_input_weights_ptr =
-      (use_cifg) ? nullptr : input_to_input_weights->data.f;
-  const float* recurrent_to_input_weights_ptr =
-      (use_cifg) ? nullptr : recurrent_to_input_weights->data.f;
-  const float* input_gate_bias_ptr =
-      (use_cifg) ? nullptr : input_gate_bias->data.f;
-  const float* cell_to_input_weights_ptr =
-      (use_peephole && !use_cifg) ? cell_to_input_weights->data.f : nullptr;
-  const float* cell_to_forget_weights_ptr =
-      (use_peephole) ? cell_to_forget_weights->data.f : nullptr;
-  const float* cell_to_output_weights_ptr =
-      (use_peephole) ? cell_to_output_weights->data.f : nullptr;
-  const float* projection_weights_ptr =
-      (projection_weights == nullptr) ? nullptr : projection_weights->data.f;
-  const float* projection_bias_ptr =
-      (projection_bias == nullptr) ? nullptr : projection_bias->data.f;
-  const float* input_layer_norm_weight_ptr =
-      (input_layer_norm_weights == nullptr) ? nullptr
-                                            : input_layer_norm_weights->data.f;
-
-  // Required tensors, pointers are non-null.
-  const float* input_ptr_batch = input->data.f;
-  const float* input_to_forget_weights_ptr = input_to_forget_weights->data.f;
-  const float* input_to_cell_weights_ptr = input_to_cell_weights->data.f;
-  const float* input_to_output_weights_ptr = input_to_output_weights->data.f;
-  const float* recurrent_to_forget_weights_ptr =
-      recurrent_to_forget_weights->data.f;
-  const float* recurrent_to_cell_weights_ptr =
-      recurrent_to_cell_weights->data.f;
-  const float* recurrent_to_output_weights_ptr =
-      recurrent_to_output_weights->data.f;
-  const float* forget_layer_norm_weight_ptr = forget_layer_norm_weights->data.f;
-  const float* cell_layer_norm_weight_ptr = cell_layer_norm_weights->data.f;
-  const float* output_layer_norm_weight_ptr = output_layer_norm_weights->data.f;
-  const float* forget_gate_bias_ptr = forget_gate_bias->data.f;
-  const float* cell_bias_ptr = cell_bias->data.f;
-  const float* output_gate_bias_ptr = output_gate_bias->data.f;
-
-  float* activation_state_ptr = activation_state->data.f;
-  float* cell_state_ptr = cell_state->data.f;
-  float* output_ptr_batch = output->data.f;
-
-  LayerNormLstmStep(
-      input_ptr_batch, input_to_input_weights_ptr, input_to_forget_weights_ptr,
-      input_to_cell_weights_ptr, input_to_output_weights_ptr,
-      recurrent_to_input_weights_ptr, recurrent_to_forget_weights_ptr,
-      recurrent_to_cell_weights_ptr, recurrent_to_output_weights_ptr,
-      cell_to_input_weights_ptr, cell_to_forget_weights_ptr,
-      cell_to_output_weights_ptr, input_layer_norm_weight_ptr,
-      forget_layer_norm_weight_ptr, cell_layer_norm_weight_ptr,
-      output_layer_norm_weight_ptr, input_gate_bias_ptr, forget_gate_bias_ptr,
-      cell_bias_ptr, output_gate_bias_ptr, projection_weights_ptr,
-      projection_bias_ptr, cell_clip, proj_clip, activation, n_batch, n_cell,
-      n_input, n_output, activation_state_ptr, cell_state_ptr,
-      input_gate_scratch, forget_gate_scratch, cell_scratch,
-      output_gate_scratch, output_ptr_batch);
-
-  return kTfLiteOk;
-}
-
-TfLiteStatus EvalHybrid(
-    const TfLiteTensor* input, const TfLiteTensor* input_to_input_weights,
-    const TfLiteTensor* input_to_forget_weights,
-    const TfLiteTensor* input_to_cell_weights,
-    const TfLiteTensor* input_to_output_weights,
-    const TfLiteTensor* recurrent_to_input_weights,
-    const TfLiteTensor* recurrent_to_forget_weights,
-    const TfLiteTensor* recurrent_to_cell_weights,
-    const TfLiteTensor* recurrent_to_output_weights,
-    const TfLiteTensor* cell_to_input_weights,
-    const TfLiteTensor* cell_to_forget_weights,
-    const TfLiteTensor* cell_to_output_weights,
-    const TfLiteTensor* input_layer_norm_weights,
-    const TfLiteTensor* forget_layer_norm_weights,
-    const TfLiteTensor* cell_layer_norm_weights,
-    const TfLiteTensor* output_layer_norm_weights,
-    const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias,
-    const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias,
-    const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias,
-    float cell_clip, float proj_clip, const TfLiteFusedActivation& activation,
-    TfLiteTensor* scratch_buffer, TfLiteTensor* scaling_factors,
-    TfLiteTensor* prod_scaling_factors, TfLiteTensor* recovered_weights,
-    TfLiteTensor* input_quantized, TfLiteTensor* activation_state_quantized,
-    TfLiteTensor* cell_state_quantized, TfLiteTensor* activation_state,
-    TfLiteTensor* cell_state, TfLiteTensor* output) {
-  const int n_batch = input->dims->data[0];
-  const int n_input = input->dims->data[1];
-  // n_cell and n_output will be the same size when there is no projection.
-  const int n_cell = input_to_output_weights->dims->data[0];
-  const int n_output = recurrent_to_output_weights->dims->data[1];
-
-  // Since we have already checked that weights are all there or none, we can
-  // check the existence of only one to get the condition.
-  const bool use_cifg = (input_to_input_weights == nullptr);
-  const bool use_peephole = (cell_to_output_weights != nullptr);
-
-  float* input_gate_scratch = nullptr;
-  float* cell_scratch = nullptr;
-  float* forget_gate_scratch = nullptr;
-  float* output_gate_scratch = nullptr;
-  if (use_cifg) {
-    cell_scratch = scratch_buffer->data.f;
-    forget_gate_scratch = scratch_buffer->data.f + n_cell * n_batch;
-    output_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
-  } else {
-    input_gate_scratch = scratch_buffer->data.f;
-    cell_scratch = scratch_buffer->data.f + n_cell * n_batch;
-    forget_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
-    output_gate_scratch = scratch_buffer->data.f + 3 * n_cell * n_batch;
-  }
-
-  // Check optional tensors, the respective pointers can be null.
-  int8_t* input_to_input_weights_ptr = nullptr;
-  float input_to_input_weights_scale = 1.0f;
-  int8_t* recurrent_to_input_weights_ptr = nullptr;
-  float recurrent_to_input_weights_scale = 1.0f;
-  float* input_gate_bias_ptr = nullptr;
-  if (!use_cifg) {
-    input_to_input_weights_ptr =
-        reinterpret_cast<int8_t*>(input_to_input_weights->data.uint8);
-    recurrent_to_input_weights_ptr =
-        reinterpret_cast<int8_t*>(recurrent_to_input_weights->data.uint8);
-    input_gate_bias_ptr = input_gate_bias->data.f;
-    input_to_input_weights_scale = input_to_input_weights->params.scale;
-    recurrent_to_input_weights_scale = recurrent_to_input_weights->params.scale;
-  }
-
-  int8_t* cell_to_input_weights_ptr = nullptr;
-  int8_t* cell_to_forget_weights_ptr = nullptr;
-  int8_t* cell_to_output_weights_ptr = nullptr;
-  float cell_to_input_weights_scale = 1.0f;
-  float cell_to_forget_weights_scale = 1.0f;
-  float cell_to_output_weights_scale = 1.0f;
-  if (use_peephole) {
-    if (!use_cifg) {
-      cell_to_input_weights_ptr =
-          reinterpret_cast<int8_t*>(cell_to_input_weights->data.uint8);
-      cell_to_input_weights_scale = cell_to_input_weights->params.scale;
-    }
-    cell_to_forget_weights_ptr =
-        reinterpret_cast<int8_t*>(cell_to_forget_weights->data.uint8);
-    cell_to_output_weights_ptr =
-        reinterpret_cast<int8_t*>(cell_to_output_weights->data.uint8);
-    cell_to_forget_weights_scale = cell_to_forget_weights->params.scale;
-    cell_to_output_weights_scale = cell_to_output_weights->params.scale;
-  }
-
-  const int8_t* projection_weights_ptr =
-      (projection_weights == nullptr)
-          ? nullptr
-          : reinterpret_cast<int8_t*>(projection_weights->data.uint8);
-  const float projection_weights_scale =
-      (projection_weights == nullptr) ? 1.0f : projection_weights->params.scale;
-  const float* projection_bias_ptr =
-      (projection_bias == nullptr) ? nullptr : projection_bias->data.f;
-  const float* input_layer_norm_weight_ptr =
-      (input_layer_norm_weights == nullptr) ? nullptr
-                                            : input_layer_norm_weights->data.f;
-
-  // Required tensors, pointers are non-null.
-  const float* input_ptr_batch = input->data.f;
-  const int8_t* input_to_forget_weights_ptr =
-      reinterpret_cast<int8_t*>(input_to_forget_weights->data.uint8);
-  const float input_to_forget_weights_scale =
-      input_to_forget_weights->params.scale;
-  const int8_t* input_to_cell_weights_ptr =
-      reinterpret_cast<int8_t*>(input_to_cell_weights->data.uint8);
-  const float input_to_cell_weights_scale = input_to_cell_weights->params.scale;
-  const int8_t* input_to_output_weights_ptr =
-      reinterpret_cast<int8_t*>(input_to_output_weights->data.uint8);
-  const float input_to_output_weights_scale =
-      input_to_output_weights->params.scale;
-  const int8_t* recurrent_to_forget_weights_ptr =
-      reinterpret_cast<int8_t*>(recurrent_to_forget_weights->data.uint8);
-  const float recurrent_to_forget_weights_scale =
-      recurrent_to_forget_weights->params.scale;
-  const int8_t* recurrent_to_cell_weights_ptr =
-      reinterpret_cast<int8_t*>(recurrent_to_cell_weights->data.uint8);
-  const float recurrent_to_cell_weights_scale =
-      recurrent_to_cell_weights->params.scale;
-  const int8_t* recurrent_to_output_weights_ptr =
-      reinterpret_cast<int8_t*>(recurrent_to_output_weights->data.uint8);
-  const float recurrent_to_output_weights_scale =
-      recurrent_to_output_weights->params.scale;
-  const float* forget_layer_norm_weight_ptr = forget_layer_norm_weights->data.f;
-  const float* cell_layer_norm_weight_ptr = cell_layer_norm_weights->data.f;
-  const float* output_layer_norm_weight_ptr = output_layer_norm_weights->data.f;
-  const float* forget_gate_bias_ptr = forget_gate_bias->data.f;
-  const float* cell_bias_ptr = cell_bias->data.f;
-  const float* output_gate_bias_ptr = output_gate_bias->data.f;
-
-  float* activation_state_ptr = activation_state->data.f;
-  float* cell_state_ptr = cell_state->data.f;
-  float* output_ptr_batch = output->data.f;
-
-  // Temporary storage for quantized values and scaling factors.
-  int8_t* quantized_input_ptr =
-      reinterpret_cast<int8_t*>(input_quantized->data.uint8);
-  int8_t* quantized_activation_state_ptr =
-      reinterpret_cast<int8_t*>(activation_state_quantized->data.uint8);
-  int8_t* quantized_cell_state_ptr =
-      reinterpret_cast<int8_t*>(cell_state_quantized->data.uint8);
-  float* scaling_factors_ptr = scaling_factors->data.f;
-  float* prod_scaling_factors_ptr = prod_scaling_factors->data.f;
-  float* recovered_weights_ptr = recovered_weights->data.f;
-
-  LayerNormLstmStep(
-      input_ptr_batch, input_to_input_weights_ptr, input_to_input_weights_scale,
-      input_to_forget_weights_ptr, input_to_forget_weights_scale,
-      input_to_cell_weights_ptr, input_to_cell_weights_scale,
-      input_to_output_weights_ptr, input_to_output_weights_scale,
-      recurrent_to_input_weights_ptr, recurrent_to_input_weights_scale,
-      recurrent_to_forget_weights_ptr, recurrent_to_forget_weights_scale,
-      recurrent_to_cell_weights_ptr, recurrent_to_cell_weights_scale,
-      recurrent_to_output_weights_ptr, recurrent_to_output_weights_scale,
-      cell_to_input_weights_ptr, cell_to_input_weights_scale,
-      cell_to_forget_weights_ptr, cell_to_forget_weights_scale,
-      cell_to_output_weights_ptr, cell_to_output_weights_scale,
-      input_layer_norm_weight_ptr, forget_layer_norm_weight_ptr,
-      cell_layer_norm_weight_ptr, output_layer_norm_weight_ptr,
-      input_gate_bias_ptr, forget_gate_bias_ptr, cell_bias_ptr,
-      output_gate_bias_ptr, projection_weights_ptr, projection_weights_scale,
-      projection_bias_ptr, cell_clip, proj_clip, activation, n_batch, n_cell,
-      n_input, n_output, input_gate_scratch, forget_gate_scratch, cell_scratch,
-      output_gate_scratch, scaling_factors_ptr, prod_scaling_factors_ptr,
-      recovered_weights_ptr, quantized_input_ptr,
-      quantized_activation_state_ptr, quantized_cell_state_ptr,
-      activation_state_ptr, cell_state_ptr, output_ptr_batch);
-
-  return kTfLiteOk;
-}
-
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  const OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
-
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-
-  const TfLiteTensor* input_to_input_weights =
-      GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
-  const TfLiteTensor* input_to_forget_weights =
-      GetInput(context, node, kInputToForgetWeightsTensor);
-  const TfLiteTensor* input_to_cell_weights =
-      GetInput(context, node, kInputToCellWeightsTensor);
-  const TfLiteTensor* input_to_output_weights =
-      GetInput(context, node, kInputToOutputWeightsTensor);
-
-  const TfLiteTensor* recurrent_to_input_weights =
-      GetOptionalInputTensor(context, node, kRecurrentToInputWeightsTensor);
-  const TfLiteTensor* recurrent_to_forget_weights =
-      GetInput(context, node, kRecurrentToForgetWeightsTensor);
-  const TfLiteTensor* recurrent_to_cell_weights =
-      GetInput(context, node, kRecurrentToCellWeightsTensor);
-  const TfLiteTensor* recurrent_to_output_weights =
-      GetInput(context, node, kRecurrentToOutputWeightsTensor);
-
-  const TfLiteTensor* cell_to_input_weights =
-      GetOptionalInputTensor(context, node, kCellToInputWeightsTensor);
-  const TfLiteTensor* cell_to_forget_weights =
-      GetOptionalInputTensor(context, node, kCellToForgetWeightsTensor);
-  const TfLiteTensor* cell_to_output_weights =
-      GetOptionalInputTensor(context, node, kCellToOutputWeightsTensor);
-
-  const TfLiteTensor* input_layer_norm_weights =
-      GetOptionalInputTensor(context, node, kInputLayerNormWeightsTensor);
-  const TfLiteTensor* forget_layer_norm_weights =
-      GetInput(context, node, kForgetLayerNormWeightsTensor);
-  const TfLiteTensor* cell_layer_norm_weights =
-      GetInput(context, node, kCellLayerNormWeightsTensor);
-  const TfLiteTensor* output_layer_norm_weights =
-      GetInput(context, node, kOutputLayerNormWeightsTensor);
-
-  const TfLiteTensor* input_gate_bias =
-      GetOptionalInputTensor(context, node, kInputGateBiasTensor);
-  const TfLiteTensor* forget_gate_bias =
-      GetInput(context, node, kForgetGateBiasTensor);
-  const TfLiteTensor* cell_bias = GetInput(context, node, kCellGateBiasTensor);
-  const TfLiteTensor* output_gate_bias =
-      GetInput(context, node, kOutputGateBiasTensor);
-
-  const TfLiteTensor* projection_weights =
-      GetOptionalInputTensor(context, node, kProjectionWeightsTensor);
-  const TfLiteTensor* projection_bias =
-      GetOptionalInputTensor(context, node, kProjectionBiasTensor);
-
-  // Index the scratch buffers pointers to the global scratch buffer.
-  TfLiteTensor* scratch_buffer = GetTemporary(context, node, /*index=*/0);
-
-  TfLiteTensor* activation_state =
-      &context->tensors[node->inputs->data[kInputActivationStateTensor]];
-  TfLiteTensor* cell_state =
-      &context->tensors[node->inputs->data[kInputCellStateTensor]];
-
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-
-  switch (input_to_output_weights->type) {
-    case kTfLiteFloat32: {
-      return EvalFloat(input, input_to_input_weights, input_to_forget_weights,
-                       input_to_cell_weights, input_to_output_weights,
-                       recurrent_to_input_weights, recurrent_to_forget_weights,
-                       recurrent_to_cell_weights, recurrent_to_output_weights,
-                       cell_to_input_weights, cell_to_forget_weights,
-                       cell_to_output_weights, input_layer_norm_weights,
-                       forget_layer_norm_weights, cell_layer_norm_weights,
-                       output_layer_norm_weights, input_gate_bias,
-                       forget_gate_bias, cell_bias, output_gate_bias,
-                       projection_weights, projection_bias, op_data->cell_clip,
-                       op_data->proj_clip, op_data->activation, scratch_buffer,
-                       activation_state, cell_state, output);
-    }
-    case kTfLiteUInt8: {
-      TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/1);
-      TfLiteTensor* activation_state_quantized =
-          GetTemporary(context, node, /*index=*/2);
-      TfLiteTensor* cell_state_quantized =
-          GetTemporary(context, node, /*index=*/3);
-      TfLiteTensor* scaling_factors = GetTemporary(context, node, /*index=*/4);
-      TfLiteTensor* prod_scaling_factors =
-          GetTemporary(context, node, /*index=*/5);
-      TfLiteTensor* recovered_weights =
-          GetTemporary(context, node, /*index=*/6);
-      return EvalHybrid(
-          input, input_to_input_weights, input_to_forget_weights,
-          input_to_cell_weights, input_to_output_weights,
-          recurrent_to_input_weights, recurrent_to_forget_weights,
-          recurrent_to_cell_weights, recurrent_to_output_weights,
-          cell_to_input_weights, cell_to_forget_weights, cell_to_output_weights,
-          input_layer_norm_weights, forget_layer_norm_weights,
-          cell_layer_norm_weights, output_layer_norm_weights, input_gate_bias,
-          forget_gate_bias, cell_bias, output_gate_bias, projection_weights,
-          projection_bias, op_data->cell_clip, op_data->proj_clip,
-          op_data->activation, scratch_buffer, scaling_factors,
-          prod_scaling_factors, recovered_weights, input_quantized,
-          activation_state_quantized, cell_state_quantized, activation_state,
-          cell_state, output);
-    }
-    default:
-      context->ReportError(context, "Type %d is not currently supported.",
-                           input_to_output_weights->type);
-      return kTfLiteError;
-  }
-  return kTfLiteOk;
-}
-
-void Free(TfLiteContext* context, void* buffer) {
-  delete reinterpret_cast<OpData*>(buffer);
-}
-
-}  // namespace layer_norm_lstm
-
-TfLiteRegistration* Register_LAYER_NORM_LSTM() {
-  static TfLiteRegistration r = {layer_norm_lstm::Init, layer_norm_lstm::Free,
-                                 layer_norm_lstm::Prepare,
-                                 layer_norm_lstm::Eval};
-  return &r;
-}
-
-}  // namespace custom
-}  // namespace ops
-}  // namespace tflite
diff --git a/tensorflow/lite/kernels/layer_norm_lstm_test.cc b/tensorflow/lite/kernels/layer_norm_lstm_test.cc
deleted file mode 100644
index 1c13cee1c3f66ed2a3459cd2bcc32211c3b1f00e..0000000000000000000000000000000000000000
--- a/tensorflow/lite/kernels/layer_norm_lstm_test.cc
+++ /dev/null
@@ -1,883 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// Unit test for TFLite Layer Norm LSTM op.
-
-#include <memory>
-#include <vector>
-
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
-#include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
-#include "tensorflow/lite/interpreter.h"
-#include "tensorflow/lite/kernels/register.h"
-#include "tensorflow/lite/kernels/test_util.h"
-#include "tensorflow/lite/model.h"
-
-namespace tflite {
-namespace ops {
-namespace custom {
-
-TfLiteRegistration* Register_LAYER_NORM_LSTM();
-
-namespace {
-
-using ::testing::ElementsAreArray;
-
-class LayerNormLSTMOpModel : public SingleOpModel {
- public:
-  LayerNormLSTMOpModel(int n_batch, int n_input, int n_cell, int n_output,
-                       bool use_cifg, bool use_peephole,
-                       bool use_projection_weights, bool use_projection_bias,
-                       float cell_clip, float proj_clip,
-                       const std::vector<std::vector<int>>& input_shapes,
-                       const TensorType& weight_type = TensorType_FLOAT32)
-      : n_batch_(n_batch),
-        n_input_(n_input),
-        n_cell_(n_cell),
-        n_output_(n_output) {
-    input_ = AddInput(TensorType_FLOAT32);
-
-    if (use_cifg) {
-      input_to_input_weights_ = AddNullInput();
-    } else {
-      input_to_input_weights_ = AddInput(weight_type);
-    }
-
-    input_to_forget_weights_ = AddInput(weight_type);
-    input_to_cell_weights_ = AddInput(weight_type);
-    input_to_output_weights_ = AddInput(weight_type);
-
-    if (use_cifg) {
-      recurrent_to_input_weights_ = AddNullInput();
-    } else {
-      recurrent_to_input_weights_ = AddInput(weight_type);
-    }
-
-    recurrent_to_forget_weights_ = AddInput(weight_type);
-    recurrent_to_cell_weights_ = AddInput(weight_type);
-    recurrent_to_output_weights_ = AddInput(weight_type);
-
-    if (use_peephole) {
-      if (use_cifg) {
-        cell_to_input_weights_ = AddNullInput();
-      } else {
-        cell_to_input_weights_ = AddInput(weight_type);
-      }
-      cell_to_forget_weights_ = AddInput(weight_type);
-      cell_to_output_weights_ = AddInput(weight_type);
-    } else {
-      cell_to_input_weights_ = AddNullInput();
-      cell_to_forget_weights_ = AddNullInput();
-      cell_to_output_weights_ = AddNullInput();
-    }
-
-    if (use_cifg) {
-      input_layer_norm_weights_ = AddNullInput();
-    } else {
-      input_layer_norm_weights_ = AddInput(TensorType_FLOAT32);
-    }
-    forget_layer_norm_weights_ = AddInput(TensorType_FLOAT32);
-    cell_layer_norm_weights_ = AddInput(TensorType_FLOAT32);
-    output_layer_norm_weights_ = AddInput(TensorType_FLOAT32);
-
-    if (use_cifg) {
-      input_gate_bias_ = AddNullInput();
-    } else {
-      input_gate_bias_ = AddInput(TensorType_FLOAT32);
-    }
-    forget_gate_bias_ = AddInput(TensorType_FLOAT32);
-    cell_bias_ = AddInput(TensorType_FLOAT32);
-    output_gate_bias_ = AddInput(TensorType_FLOAT32);
-
-    if (use_projection_weights) {
-      projection_weights_ = AddInput(weight_type);
-      if (use_projection_bias) {
-        projection_bias_ = AddInput(TensorType_FLOAT32);
-      } else {
-        projection_bias_ = AddNullInput();
-      }
-    } else {
-      projection_weights_ = AddNullInput();
-      projection_bias_ = AddNullInput();
-    }
-
-    // Adding the 2 state tensors.
-    output_state_ =
-        AddInput(TensorData{TensorType_FLOAT32, {n_output_ * n_batch_}}, true);
-    cell_state_ =
-        AddInput(TensorData{TensorType_FLOAT32, {n_cell_ * n_batch_}}, true);
-
-    output_ = AddOutput(TensorType_FLOAT32);
-
-    // Set up and pass in custom options using flexbuffer.
-    flexbuffers::Builder fbb;
-    fbb.Map([&]() {
-      fbb.Int("cell_clip", cell_clip);
-      fbb.Int("proj_clip", proj_clip);
-      fbb.String("fused_activation_function", "TANH");
-    });
-    fbb.Finish();
-    SetCustomOp("LAYER_NORM_LSTM", fbb.GetBuffer(), Register_LAYER_NORM_LSTM);
-    BuildInterpreter(input_shapes);
-  }
-
-  void SetInputToInputWeights(std::vector<float> f) {
-    PopulateTensor(input_to_input_weights_, f);
-  }
-
-  void SetInputToForgetWeights(std::vector<float> f) {
-    PopulateTensor(input_to_forget_weights_, f);
-  }
-
-  void SetInputToCellWeights(std::vector<float> f) {
-    PopulateTensor(input_to_cell_weights_, f);
-  }
-
-  void SetInputToOutputWeights(std::vector<float> f) {
-    PopulateTensor(input_to_output_weights_, f);
-  }
-
-  void SetRecurrentToInputWeights(std::vector<float> f) {
-    PopulateTensor(recurrent_to_input_weights_, f);
-  }
-
-  void SetRecurrentToForgetWeights(std::vector<float> f) {
-    PopulateTensor(recurrent_to_forget_weights_, f);
-  }
-
-  void SetRecurrentToCellWeights(std::vector<float> f) {
-    PopulateTensor(recurrent_to_cell_weights_, f);
-  }
-
-  void SetRecurrentToOutputWeights(std::vector<float> f) {
-    PopulateTensor(recurrent_to_output_weights_, f);
-  }
-
-  void SetCellToInputWeights(std::vector<float> f) {
-    PopulateTensor(cell_to_input_weights_, f);
-  }
-
-  void SetCellToForgetWeights(std::vector<float> f) {
-    PopulateTensor(cell_to_forget_weights_, f);
-  }
-
-  void SetCellToOutputWeights(std::vector<float> f) {
-    PopulateTensor(cell_to_output_weights_, f);
-  }
-
-  void SetInputLayerNormWeights(std::vector<float> f) {
-    PopulateTensor(input_layer_norm_weights_, f);
-  }
-
-  void SetForgetLayerNormWeights(std::vector<float> f) {
-    PopulateTensor(forget_layer_norm_weights_, f);
-  }
-
-  void SetCellLayerNormWeights(std::vector<float> f) {
-    PopulateTensor(cell_layer_norm_weights_, f);
-  }
-
-  void SetOutputLayerNormWeights(std::vector<float> f) {
-    PopulateTensor(output_layer_norm_weights_, f);
-  }
-
-  void SetInputGateBias(std::vector<float> f) {
-    PopulateTensor(input_gate_bias_, f);
-  }
-
-  void SetForgetGateBias(std::vector<float> f) {
-    PopulateTensor(forget_gate_bias_, f);
-  }
-
-  void SetCellBias(std::vector<float> f) { PopulateTensor(cell_bias_, f); }
-
-  void SetOutputGateBias(std::vector<float> f) {
-    PopulateTensor(output_gate_bias_, f);
-  }
-
-  void SetProjectionWeights(std::vector<float> f) {
-    PopulateTensor(projection_weights_, f);
-  }
-
-  void SetProjectionBias(std::vector<float> f) {
-    PopulateTensor(projection_bias_, f);
-  }
-
-  void SetInput(int offset, const float* begin, const float* end) {
-    PopulateTensor(input_, offset, const_cast<float*>(begin),
-                   const_cast<float*>(end));
-  }
-
-  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
-
-  int num_inputs() { return n_input_; }
-  int num_outputs() { return n_output_; }
-  int num_cells() { return n_cell_; }
-  int num_batches() { return n_batch_; }
-
- protected:
-  int input_;
-  int input_to_input_weights_;
-  int input_to_forget_weights_;
-  int input_to_cell_weights_;
-  int input_to_output_weights_;
-
-  int recurrent_to_input_weights_;
-  int recurrent_to_forget_weights_;
-  int recurrent_to_cell_weights_;
-  int recurrent_to_output_weights_;
-
-  int cell_to_input_weights_;
-  int cell_to_forget_weights_;
-  int cell_to_output_weights_;
-
-  int input_layer_norm_weights_;
-  int forget_layer_norm_weights_;
-  int cell_layer_norm_weights_;
-  int output_layer_norm_weights_;
-
-  int input_gate_bias_;
-  int forget_gate_bias_;
-  int cell_bias_;
-  int output_gate_bias_;
-
-  int projection_weights_;
-  int projection_bias_;
-
-  int output_state_;
-  int cell_state_;
-
-  int output_;
-
-  int n_batch_;
-  int n_input_;
-  int n_cell_;
-  int n_output_;
-};
-
-class HybridLayerNormLSTMOpModel : public LayerNormLSTMOpModel {
- public:
-  HybridLayerNormLSTMOpModel(int n_batch, int n_input, int n_cell, int n_output,
-                             bool use_cifg, bool use_peephole,
-                             bool use_projection_weights,
-                             bool use_projection_bias, float cell_clip,
-                             float proj_clip,
-                             const std::vector<std::vector<int>>& input_shapes)
-      : LayerNormLSTMOpModel(n_batch, n_input, n_cell, n_output, use_cifg,
-                             use_peephole, use_projection_weights,
-                             use_projection_bias, cell_clip, proj_clip,
-                             input_shapes, TensorType_UINT8) {}
-
-  void SetInputToInputWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(input_to_input_weights_, f);
-  }
-
-  void SetInputToForgetWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(input_to_forget_weights_, f);
-  }
-
-  void SetInputToCellWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(input_to_cell_weights_, f);
-  }
-
-  void SetInputToOutputWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(input_to_output_weights_, f);
-  }
-
-  void SetRecurrentToInputWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(recurrent_to_input_weights_, f);
-  }
-
-  void SetRecurrentToForgetWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(recurrent_to_forget_weights_, f);
-  }
-
-  void SetRecurrentToCellWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(recurrent_to_cell_weights_, f);
-  }
-
-  void SetRecurrentToOutputWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(recurrent_to_output_weights_, f);
-  }
-
-  void SetCellToInputWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(cell_to_input_weights_, f);
-  }
-
-  void SetCellToForgetWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(cell_to_forget_weights_, f);
-  }
-
-  void SetCellToOutputWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(cell_to_output_weights_, f);
-  }
-
-  void SetInputLayerNormWeights(std::vector<float> f) {
-    PopulateTensor(input_layer_norm_weights_, f);
-  }
-
-  void SetForgetLayerNormWeights(std::vector<float> f) {
-    PopulateTensor(forget_layer_norm_weights_, f);
-  }
-
-  void SetCellLayerNormWeights(std::vector<float> f) {
-    PopulateTensor(cell_layer_norm_weights_, f);
-  }
-
-  void SetOutputLayerNormWeights(std::vector<float> f) {
-    PopulateTensor(output_layer_norm_weights_, f);
-  }
-
-  void SetProjectionWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(projection_weights_, f);
-  }
-};
-
-class BaseLayerNormLstmTest : public ::testing::Test {
- protected:
-  // Weights of the Layer Norm LSTM model. Some are optional.
-  std::vector<float> input_to_input_weights_;
-  std::vector<float> input_to_cell_weights_;
-  std::vector<float> input_to_forget_weights_;
-  std::vector<float> input_to_output_weights_;
-  std::vector<float> input_gate_bias_;
-  std::vector<float> cell_gate_bias_;
-  std::vector<float> forget_gate_bias_;
-  std::vector<float> output_gate_bias_;
-  std::vector<float> recurrent_to_input_weights_;
-  std::vector<float> recurrent_to_cell_weights_;
-  std::vector<float> recurrent_to_forget_weights_;
-  std::vector<float> recurrent_to_output_weights_;
-  std::vector<float> cell_to_input_weights_;
-  std::vector<float> cell_to_forget_weights_;
-  std::vector<float> cell_to_output_weights_;
-  std::vector<float> input_layer_norm_weights_;
-  std::vector<float> forget_layer_norm_weights_;
-  std::vector<float> cell_layer_norm_weights_;
-  std::vector<float> output_layer_norm_weights_;
-  std::vector<float> projection_weights_;
-
-  // Layer Norm LSTM input is stored as num_batch x num_inputs vector.
-  std::vector<std::vector<float>> layer_norm_lstm_input_;
-
-  // Compares output up to tolerance to the result of the layer_norm_lstm given
-  // the input.
-  void VerifyGoldens(const std::vector<std::vector<float>>& input,
-                     const std::vector<std::vector<float>>& output,
-                     LayerNormLSTMOpModel* layer_norm_lstm,
-                     float tolerance = 1e-5) {
-    const int num_batches = input.size();
-    EXPECT_GT(num_batches, 0);
-    const int num_inputs = layer_norm_lstm->num_inputs();
-    EXPECT_GT(num_inputs, 0);
-    const int input_sequence_size = input[0].size() / num_inputs;
-    EXPECT_GT(input_sequence_size, 0);
-    for (int i = 0; i < input_sequence_size; ++i) {
-      for (int b = 0; b < num_batches; ++b) {
-        const float* batch_start = input[b].data() + i * num_inputs;
-        const float* batch_end = batch_start + num_inputs;
-
-        layer_norm_lstm->SetInput(b * layer_norm_lstm->num_inputs(),
-                                  batch_start, batch_end);
-      }
-
-      layer_norm_lstm->Invoke();
-
-      const int num_outputs = layer_norm_lstm->num_outputs();
-      std::vector<float> expected;
-      for (int b = 0; b < num_batches; ++b) {
-        const float* golden_start_batch = output[b].data() + i * num_outputs;
-        const float* golden_end_batch = golden_start_batch + num_outputs;
-        expected.insert(expected.end(), golden_start_batch, golden_end_batch);
-      }
-      EXPECT_THAT(layer_norm_lstm->GetOutput(),
-                  ElementsAreArray(ArrayFloatNear(expected, tolerance)));
-    }
-  }
-};
-
-class NoCifgPeepholeProjectionNoClippingLayerNormLstmTest
-    : public BaseLayerNormLstmTest {
-  void SetUp() override {
-    input_to_input_weights_ = {0.5,  0.6,  0.7,  -0.8, -0.9, 0.1,  0.2,
-                               0.3,  -0.4, 0.5,  -0.8, 0.7,  -0.6, 0.5,
-                               -0.4, -0.5, -0.4, -0.3, -0.2, -0.1};
-
-    input_to_forget_weights_ = {-0.6, -0.1, 0.3,  0.2,  0.9,  -0.5, -0.2,
-                                -0.4, 0.3,  -0.8, -0.4, 0.3,  -0.5, -0.4,
-                                -0.6, 0.3,  -0.4, -0.6, -0.5, -0.5};
-
-    input_to_cell_weights_ = {-0.4, -0.3, -0.2, -0.1, -0.5, 0.5,  -0.2,
-                              -0.3, -0.2, -0.6, 0.6,  -0.1, -0.4, -0.3,
-                              -0.7, 0.7,  -0.9, -0.5, 0.8,  0.6};
-
-    input_to_output_weights_ = {-0.8, -0.4, -0.2, -0.9, -0.1, -0.7, 0.3,
-                                -0.3, -0.8, -0.2, 0.6,  -0.2, 0.4,  -0.7,
-                                -0.3, -0.5, 0.1,  0.5,  -0.6, -0.4};
-
-    input_gate_bias_ = {0.03, 0.15, 0.22, 0.38};
-
-    forget_gate_bias_ = {0.1, -0.3, -0.2, 0.1};
-
-    cell_gate_bias_ = {-0.05, 0.72, 0.25, 0.08};
-
-    output_gate_bias_ = {0.05, -0.01, 0.2, 0.1};
-
-    recurrent_to_input_weights_ = {-0.2, -0.3, 0.4,  0.1,  -0.5, 0.9,
-                                   -0.2, -0.3, -0.7, 0.05, -0.2, -0.6};
-
-    recurrent_to_cell_weights_ = {-0.3, 0.2, 0.1, -0.3, 0.8,  -0.08,
-                                  -0.2, 0.3, 0.8, -0.6, -0.1, 0.2};
-
-    recurrent_to_forget_weights_ = {-0.5, -0.3, -0.5, -0.2, 0.6, 0.4,
-                                    0.9,  0.3,  -0.1, 0.2,  0.5, 0.2};
-
-    recurrent_to_output_weights_ = {0.3,  -0.1, 0.1,  -0.2, -0.5, -0.7,
-                                    -0.2, -0.6, -0.1, -0.4, -0.7, -0.2};
-
-    cell_to_input_weights_ = {0.05, 0.1, 0.25, 0.15};
-
-    cell_to_forget_weights_ = {-0.02, -0.15, -0.25, -0.03};
-
-    cell_to_output_weights_ = {0.1, -0.1, -0.5, 0.05};
-
-    input_layer_norm_weights_ = {0.1, 0.2, 0.3, 0.5};
-    forget_layer_norm_weights_ = {0.2, 0.2, 0.4, 0.3};
-    cell_layer_norm_weights_ = {0.7, 0.2, 0.3, 0.8};
-    output_layer_norm_weights_ = {0.6, 0.2, 0.2, 0.5};
-
-    projection_weights_ = {-0.1, 0.2,  0.01, -0.2, 0.1,  0.5,
-                           0.3,  0.08, 0.07, 0.2,  -0.4, 0.2};
-
-    layer_norm_lstm_input_ = {
-        {// Batch0: 3 (input_sequence_size) * 5 (n_input)
-         0.7, 0.8, 0.1, 0.2, 0.3,   // seq 0
-         0.8, 0.1, 0.2, 0.4, 0.5,   // seq 1
-         0.2, 0.7, 0.7, 0.1, 0.7},  // seq 2
-
-        {// Batch1: 3 (input_sequence_size) * 5 (n_input)
-         0.3, 0.2, 0.9, 0.8, 0.1,   // seq 0
-         0.1, 0.5, 0.2, 0.4, 0.2,   // seq 1
-         0.6, 0.9, 0.2, 0.5, 0.7},  // seq 2
-    };
-  }
-};
-
-TEST_F(NoCifgPeepholeProjectionNoClippingLayerNormLstmTest,
-       LayerNormLstmBlackBoxTest) {
-  const int n_batch = 2;
-  const int n_input = 5;
-  const int n_cell = 4;
-  const int n_output = 3;
-  const float ceil_clip = 0.0;
-  const float proj_clip = 0.0;
-
-  LayerNormLSTMOpModel layer_norm_lstm(
-      n_batch, n_input, n_cell, n_output,
-      /*use_cifg=*/false, /*use_peephole=*/true,
-      /*use_projection_weights=*/true,
-      /*use_projection_bias=*/false, ceil_clip, proj_clip,
-      {
-          {n_batch, n_input},  // input tensor
-
-          {n_cell, n_input},  // input_to_input_weight tensor
-          {n_cell, n_input},  // input_to_forget_weight tensor
-          {n_cell, n_input},  // input_to_cell_weight tensor
-          {n_cell, n_input},  // input_to_output_weight tensor
-
-          {n_cell, n_output},  // recurrent_to_input_weight tensor
-          {n_cell, n_output},  // recurrent_to_forget_weight tensor
-          {n_cell, n_output},  // recurrent_to_cell_weight tensor
-          {n_cell, n_output},  // recurrent_to_output_weight tensor
-
-          {n_cell},  // cell_to_input_weight tensor
-          {n_cell},  // cell_to_forget_weight tensor
-          {n_cell},  // cell_to_output_weight tensor
-
-          {n_cell},  // input_layer_norm_weight tensor
-          {n_cell},  // forget_layer_norm_weight tensor
-          {n_cell},  // cell_layer_norm_weight tensor
-          {n_cell},  // output_layer_norm_weight tensor
-
-          {n_cell},  // input_gate_bias tensor
-          {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
-          {n_cell},  // output_gate_bias tensor
-
-          {n_output, n_cell},  // projection_weight tensor
-          {0},                 // projection_bias tensor
-      });
-
-  layer_norm_lstm.SetInputToInputWeights(input_to_input_weights_);
-  layer_norm_lstm.SetInputToCellWeights(input_to_cell_weights_);
-  layer_norm_lstm.SetInputToForgetWeights(input_to_forget_weights_);
-  layer_norm_lstm.SetInputToOutputWeights(input_to_output_weights_);
-
-  layer_norm_lstm.SetInputGateBias(input_gate_bias_);
-  layer_norm_lstm.SetCellBias(cell_gate_bias_);
-  layer_norm_lstm.SetForgetGateBias(forget_gate_bias_);
-  layer_norm_lstm.SetOutputGateBias(output_gate_bias_);
-
-  layer_norm_lstm.SetRecurrentToInputWeights(recurrent_to_input_weights_);
-  layer_norm_lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
-  layer_norm_lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
-  layer_norm_lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
-
-  layer_norm_lstm.SetCellToInputWeights(cell_to_input_weights_);
-  layer_norm_lstm.SetCellToForgetWeights(cell_to_forget_weights_);
-  layer_norm_lstm.SetCellToOutputWeights(cell_to_output_weights_);
-
-  layer_norm_lstm.SetInputLayerNormWeights(input_layer_norm_weights_);
-  layer_norm_lstm.SetForgetLayerNormWeights(forget_layer_norm_weights_);
-  layer_norm_lstm.SetCellLayerNormWeights(cell_layer_norm_weights_);
-  layer_norm_lstm.SetOutputLayerNormWeights(output_layer_norm_weights_);
-
-  layer_norm_lstm.SetProjectionWeights(projection_weights_);
-
-  // Verify the final output.
-  const std::vector<std::vector<float>> layer_norm_lstm_golden_output = {
-      {
-          // Batch0: 3 (input_sequence_size) * 3 (n_output)
-          0.0244077, 0.128027, -0.00170918,  // seq 0
-          0.0137642, 0.140751, 0.0395835,    // seq 1
-          -0.00459231, 0.155278, 0.0837377,  // seq 2
-      },
-      {
-          // Batch1: 3 (input_sequence_size) * 3 (n_output)
-          -0.00692428, 0.0848741, 0.063445,  // seq 0
-          -0.00403912, 0.139963, 0.072681,   // seq 1
-          0.00752706, 0.161903, 0.0561371,   // seq 2
-      }};
-
-  VerifyGoldens(layer_norm_lstm_input_, layer_norm_lstm_golden_output,
-                &layer_norm_lstm);
-}
-
-TEST_F(NoCifgPeepholeProjectionNoClippingLayerNormLstmTest,
-       HybridLayerNormLstmBlackBoxTest) {
-  const int n_batch = 2;
-  const int n_input = 5;
-  const int n_cell = 4;
-  const int n_output = 3;
-  const float ceil_clip = 0.0;
-  const float proj_clip = 0.0;
-
-  HybridLayerNormLSTMOpModel layer_norm_lstm(
-      n_batch, n_input, n_cell, n_output,
-      /*use_cifg=*/false, /*use_peephole=*/true,
-      /*use_projection_weights=*/true,
-      /*use_projection_bias=*/false, ceil_clip, proj_clip,
-      {
-          {n_batch, n_input},  // input tensor
-
-          {n_cell, n_input},  // input_to_input_weight tensor
-          {n_cell, n_input},  // input_to_forget_weight tensor
-          {n_cell, n_input},  // input_to_cell_weight tensor
-          {n_cell, n_input},  // input_to_output_weight tensor
-
-          {n_cell, n_output},  // recurrent_to_input_weight tensor
-          {n_cell, n_output},  // recurrent_to_forget_weight tensor
-          {n_cell, n_output},  // recurrent_to_cell_weight tensor
-          {n_cell, n_output},  // recurrent_to_output_weight tensor
-
-          {n_cell},  // cell_to_input_weight tensor
-          {n_cell},  // cell_to_forget_weight tensor
-          {n_cell},  // cell_to_output_weight tensor
-
-          {n_cell},  // input_layer_norm_weight tensor
-          {n_cell},  // forget_layer_norm_weight tensor
-          {n_cell},  // cell_layer_norm_weight tensor
-          {n_cell},  // output_layer_norm_weight tensor
-
-          {n_cell},  // input_gate_bias tensor
-          {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
-          {n_cell},  // output_gate_bias tensor
-
-          {n_output, n_cell},  // projection_weight tensor
-          {0},                 // projection_bias tensor
-      });
-
-  layer_norm_lstm.SetInputToInputWeights(input_to_input_weights_);
-  layer_norm_lstm.SetInputToCellWeights(input_to_cell_weights_);
-  layer_norm_lstm.SetInputToForgetWeights(input_to_forget_weights_);
-  layer_norm_lstm.SetInputToOutputWeights(input_to_output_weights_);
-
-  layer_norm_lstm.SetInputGateBias(input_gate_bias_);
-  layer_norm_lstm.SetCellBias(cell_gate_bias_);
-  layer_norm_lstm.SetForgetGateBias(forget_gate_bias_);
-  layer_norm_lstm.SetOutputGateBias(output_gate_bias_);
-
-  layer_norm_lstm.SetRecurrentToInputWeights(recurrent_to_input_weights_);
-  layer_norm_lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
-  layer_norm_lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
-  layer_norm_lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
-
-  layer_norm_lstm.SetCellToInputWeights(cell_to_input_weights_);
-  layer_norm_lstm.SetCellToForgetWeights(cell_to_forget_weights_);
-  layer_norm_lstm.SetCellToOutputWeights(cell_to_output_weights_);
-
-  layer_norm_lstm.SetInputLayerNormWeights(input_layer_norm_weights_);
-  layer_norm_lstm.SetForgetLayerNormWeights(forget_layer_norm_weights_);
-  layer_norm_lstm.SetCellLayerNormWeights(cell_layer_norm_weights_);
-  layer_norm_lstm.SetOutputLayerNormWeights(output_layer_norm_weights_);
-
-  layer_norm_lstm.SetProjectionWeights(projection_weights_);
-
-  const std::vector<std::vector<float>> layer_norm_lstm_golden_output = {
-      {
-          // Batch0: 3 (input_sequence_size) * 3 (n_output)
-          0.0244576, 0.127847, -0.00181765,  // seq 0
-          0.0137518, 0.140892, 0.0402234,    // seq 1
-          -0.0048839, 0.155096, 0.0840309,   // seq 2
-      },
-      {
-          // Batch1: 3 (input_sequence_size) * 3 (n_output)
-          -0.00728636, 0.0843957, 0.0634786,  // seq 0
-          -0.00448382, 0.139278, 0.0737372,   // seq 1
-          0.00734616, 0.161793, 0.0560238,    // seq 2
-      }};
-
-  VerifyGoldens(layer_norm_lstm_input_, layer_norm_lstm_golden_output,
-                &layer_norm_lstm);
-}
-
-class CifgPeepholeProjectionNoClippingLayerNormLstmTest
-    : public BaseLayerNormLstmTest {
-  void SetUp() override {
-    input_to_forget_weights_ = {-0.6, -0.1, 0.3,  0.2,  0.9,  -0.5, -0.2,
-                                -0.4, 0.3,  -0.8, -0.4, 0.3,  -0.5, -0.4,
-                                -0.6, 0.3,  -0.4, -0.6, -0.5, -0.5};
-    input_to_cell_weights_ = {-0.4, -0.3, -0.2, -0.1, -0.5, 0.5,  -0.2,
-                              -0.3, -0.2, -0.6, 0.6,  -0.1, -0.4, -0.3,
-                              -0.7, 0.7,  -0.9, -0.5, 0.8,  0.6};
-    input_to_output_weights_ = {-0.8, -0.4, -0.2, -0.9, -0.1, -0.7, 0.3,
-                                -0.3, -0.8, -0.2, 0.6,  -0.2, 0.4,  -0.7,
-                                -0.3, -0.5, 0.1,  0.5,  -0.6, -0.4};
-
-    forget_gate_bias_ = {0.1, -0.3, -0.2, 0.1};
-    cell_gate_bias_ = {-0.05, 0.72, 0.25, 0.08};
-    output_gate_bias_ = {0.05, -0.01, 0.2, 0.1};
-
-    recurrent_to_cell_weights_ = {-0.3, 0.2, 0.1, -0.3, 0.8,  -0.08,
-                                  -0.2, 0.3, 0.8, -0.6, -0.1, 0.2};
-    recurrent_to_forget_weights_ = {-0.5, -0.3, -0.5, -0.2, 0.6, 0.4,
-                                    0.9,  0.3,  -0.1, 0.2,  0.5, 0.2};
-    recurrent_to_output_weights_ = {0.3,  -0.1, 0.1,  -0.2, -0.5, -0.7,
-                                    -0.2, -0.6, -0.1, -0.4, -0.7, -0.2};
-
-    cell_to_forget_weights_ = {-0.02, -0.15, -0.25, -0.03};
-    cell_to_output_weights_ = {0.1, -0.1, -0.5, 0.05};
-
-    forget_layer_norm_weights_ = {0.2, 0.2, 0.4, 0.3};
-    cell_layer_norm_weights_ = {0.7, 0.2, 0.3, 0.8};
-    output_layer_norm_weights_ = {0.6, 0.2, 0.2, 0.5};
-    projection_weights_ = {-0.1, 0.2,  0.01, -0.2, 0.1,  0.5,
-                           0.3,  0.08, 0.07, 0.2,  -0.4, 0.2};
-
-    layer_norm_lstm_input_ = {
-        {// Batch0: 3 (input_sequence_size) * 5 (n_input)
-         0.7, 0.8, 0.1, 0.2, 0.3,   // seq 0
-         0.8, 0.1, 0.2, 0.4, 0.5,   // seq 1
-         0.2, 0.7, 0.7, 0.1, 0.7},  // seq 2
-
-        {// Batch1: 3 (input_sequence_size) * 5 (n_input)
-         0.3, 0.2, 0.9, 0.8, 0.1,   // seq 0
-         0.1, 0.5, 0.2, 0.4, 0.2,   // seq 1
-         0.6, 0.9, 0.2, 0.5, 0.7},  // seq 2
-    };
-  }
-};
-
-TEST_F(CifgPeepholeProjectionNoClippingLayerNormLstmTest,
-       LayerNormLstmBlackBoxTest) {
-  const int n_batch = 2;
-  const int n_input = 5;
-  const int n_cell = 4;
-  const int n_output = 3;
-  const float ceil_clip = 0.0;
-  const float proj_clip = 0.0;
-
-  LayerNormLSTMOpModel layer_norm_lstm(
-      n_batch, n_input, n_cell, n_output,
-      /*use_cifg=*/true, /*use_peephole=*/true,
-      /*use_projection_weights=*/true,
-      /*use_projection_bias=*/false, ceil_clip, proj_clip,
-      {
-          {n_batch, n_input},  // input tensor
-
-          {0, 0},             // input_to_input_weight tensor
-          {n_cell, n_input},  // input_to_forget_weight tensor
-          {n_cell, n_input},  // input_to_cell_weight tensor
-          {n_cell, n_input},  // input_to_output_weight tensor
-
-          {0, 0},              // recurrent_to_input_weight tensor
-          {n_cell, n_output},  // recurrent_to_forget_weight tensor
-          {n_cell, n_output},  // recurrent_to_cell_weight tensor
-          {n_cell, n_output},  // recurrent_to_output_weight tensor
-
-          {0},       // cell_to_input_weight tensor
-          {n_cell},  // cell_to_forget_weight tensor
-          {n_cell},  // cell_to_output_weight tensor
-
-          {0},       // input_layer_norm_weight tensor
-          {n_cell},  // forget_layer_norm_weight tensor
-          {n_cell},  // cell_layer_norm_weight tensor
-          {n_cell},  // output_layer_norm_weight tensor
-
-          {0},       // input_gate_bias tensor
-          {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
-          {n_cell},  // output_gate_bias tensor
-
-          {n_output, n_cell},  // projection_weight tensor
-          {0},                 // projection_bias tensor
-      });
-
-  layer_norm_lstm.SetInputToCellWeights(input_to_cell_weights_);
-  layer_norm_lstm.SetInputToForgetWeights(input_to_forget_weights_);
-  layer_norm_lstm.SetInputToOutputWeights(input_to_output_weights_);
-
-  layer_norm_lstm.SetCellBias(cell_gate_bias_);
-  layer_norm_lstm.SetForgetGateBias(forget_gate_bias_);
-  layer_norm_lstm.SetOutputGateBias(output_gate_bias_);
-
-  layer_norm_lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
-  layer_norm_lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
-  layer_norm_lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
-
-  layer_norm_lstm.SetCellToForgetWeights(cell_to_forget_weights_);
-  layer_norm_lstm.SetCellToOutputWeights(cell_to_output_weights_);
-
-  layer_norm_lstm.SetForgetLayerNormWeights(forget_layer_norm_weights_);
-  layer_norm_lstm.SetCellLayerNormWeights(cell_layer_norm_weights_);
-  layer_norm_lstm.SetOutputLayerNormWeights(output_layer_norm_weights_);
-
-  layer_norm_lstm.SetProjectionWeights(projection_weights_);
-
-  // Verify the final output.
-  const std::vector<std::vector<float>> layer_norm_lstm_golden_output = {
-      {
-          // Batch0: 3 (input_sequence_size) * 3 (n_output)
-          0.02129706, 0.140816242, 0.0112733059,     // seq 0
-          0.0132302344, 0.152308047, 0.0346313119,   // seq 1
-          -0.0123688057, 0.165790111, 0.0893077999,  // seq 2
-      },
-      {
-          // Batch1: 3 (input_sequence_size) * 3 (n_output)
-          -0.0226350538, 0.0916948169, 0.0769175813,  // seq 0
-          -0.0269966982, 0.149707705, 0.094149217,    // seq 1
-          -0.0103429332, 0.173016444, 0.0720508844,   // seq 2
-      }};
-
-  VerifyGoldens(layer_norm_lstm_input_, layer_norm_lstm_golden_output,
-                &layer_norm_lstm);
-}
-
-TEST_F(CifgPeepholeProjectionNoClippingLayerNormLstmTest,
-       HybridLayerNormLstmBlackBoxTest) {
-  const int n_batch = 2;
-  const int n_input = 5;
-  const int n_cell = 4;
-  const int n_output = 3;
-  const float ceil_clip = 0.0;
-  const float proj_clip = 0.0;
-
-  HybridLayerNormLSTMOpModel layer_norm_lstm(
-      n_batch, n_input, n_cell, n_output,
-      /*use_cifg=*/true, /*use_peephole=*/true,
-      /*use_projection_weights=*/true,
-      /*use_projection_bias=*/false, ceil_clip, proj_clip,
-      {
-          {n_batch, n_input},  // input tensor
-
-          {0, 0},             // input_to_input_weight tensor
-          {n_cell, n_input},  // input_to_forget_weight tensor
-          {n_cell, n_input},  // input_to_cell_weight tensor
-          {n_cell, n_input},  // input_to_output_weight tensor
-
-          {0, 0},              // recurrent_to_input_weight tensor
-          {n_cell, n_output},  // recurrent_to_forget_weight tensor
-          {n_cell, n_output},  // recurrent_to_cell_weight tensor
-          {n_cell, n_output},  // recurrent_to_output_weight tensor
-
-          {0},       // cell_to_input_weight tensor
-          {n_cell},  // cell_to_forget_weight tensor
-          {n_cell},  // cell_to_output_weight tensor
-
-          {0},       // input_layer_norm_weight tensor
-          {n_cell},  // forget_layer_norm_weight tensor
-          {n_cell},  // cell_layer_norm_weight tensor
-          {n_cell},  // output_layer_norm_weight tensor
-
-          {0},       // input_gate_bias tensor
-          {n_cell},  // forget_gate_bias tensor
-          {n_cell},  // cell_bias tensor
-          {n_cell},  // output_gate_bias tensor
-
-          {n_output, n_cell},  // projection_weight tensor
-          {0},                 // projection_bias tensor
-      });
-
-  layer_norm_lstm.SetInputToCellWeights(input_to_cell_weights_);
-  layer_norm_lstm.SetInputToForgetWeights(input_to_forget_weights_);
-  layer_norm_lstm.SetInputToOutputWeights(input_to_output_weights_);
-
-  layer_norm_lstm.SetCellBias(cell_gate_bias_);
-  layer_norm_lstm.SetForgetGateBias(forget_gate_bias_);
-  layer_norm_lstm.SetOutputGateBias(output_gate_bias_);
-
-  layer_norm_lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
-  layer_norm_lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
-  layer_norm_lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
-
-  layer_norm_lstm.SetCellToForgetWeights(cell_to_forget_weights_);
-  layer_norm_lstm.SetCellToOutputWeights(cell_to_output_weights_);
-
-  layer_norm_lstm.SetForgetLayerNormWeights(forget_layer_norm_weights_);
-  layer_norm_lstm.SetCellLayerNormWeights(cell_layer_norm_weights_);
-  layer_norm_lstm.SetOutputLayerNormWeights(output_layer_norm_weights_);
-
-  layer_norm_lstm.SetProjectionWeights(projection_weights_);
-
-  // Verify the final output.
-  const std::vector<std::vector<float>> layer_norm_lstm_golden_output = {
-      {
-          // Batch0: 3 (input_sequence_size) * 3 (n_output)
-          0.0212250091, 0.140474007, 0.0115012666,   // seq 0
-          0.0130806509, 0.152660668, 0.0347516984,   // seq 1
-          -0.0124010444, 0.166042402, 0.0898982584,  // seq 2
-      },
-      {
-          // Batch1: 3 (input_sequence_size) * 3 (n_output)
-          -0.0228835996, 0.0917588323, 0.0778886303,  // seq 0
-          -0.0275101066, 0.148769245, 0.0938384682,   // seq 1
-          -0.0103605557, 0.172605693, 0.0728750974,   // seq 2
-      }};
-
-  VerifyGoldens(layer_norm_lstm_input_, layer_norm_lstm_golden_output,
-                &layer_norm_lstm);
-}
-
-}  // namespace
-}  // namespace custom
-}  // namespace ops
-}  // namespace tflite
-
-int main(int argc, char** argv) {
-  ::tflite::LogToStderr();
-  ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/lite/kernels/lstm.cc b/tensorflow/lite/kernels/lstm.cc
index b57e2883b05232325d55ae9e6a08ed142b9a2dbb..470c74d207d51688c3c48de0fc8bdecda43097a7 100644
--- a/tensorflow/lite/kernels/lstm.cc
+++ b/tensorflow/lite/kernels/lstm.cc
@@ -38,17 +38,24 @@ namespace builtin {
 namespace lstm {
 
 struct OpData {
-  // Which kernel type to use. Full kernel (20 inputs) or basic kernel
-  // (5 inputs).
+  // Which kernel type to use. Full kernel (24 inputs) or basic kernel (5
+  // inputs).
+  // Please note the 20-input full kernel is deprecated and only kept
+  // here for backward compatibility.
   TfLiteLSTMKernelType kernel_type;
 
+  // If the lstm is layer norm.
+  bool is_layer_norm_lstm;
+
   // These fields are only used by full kernel.
   int activation_state_tensor_index;
   int cell_state_tensor_index;
   int scratch_tensor_index;
 };
 
-// For full inputs kernel (20-inputs).
+// For full inputs kernel (24-inputs).
+// Please note the 20-input full kernel is deprecated and only kept
+// here for backward compatibility.
 namespace full {
 
 // Input Tensors of size {n_batch, n_input}
@@ -87,6 +94,13 @@ constexpr int kProjectionBiasTensor = 17;  // Optional
 constexpr int kInputActivationStateTensor = 18;
 constexpr int kInputCellStateTensor = 19;
 
+// Layer norm coefficient tensors of size {n_cell}, representing a diagonal
+// matrix.
+constexpr int kInputLayerNormCoefficientsTensor = 20;   // Optional
+constexpr int kForgetLayerNormCoefficientsTensor = 21;  // Optional
+constexpr int kCellLayerNormCoefficientsTensor = 22;    // Optional
+constexpr int kOutputLayerNormCoefficientsTensor = 23;  // Optional
+
 // Output tensors.
 constexpr int kOutputTensor = 0;
 
@@ -101,7 +115,8 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) {
 // Check that input tensor dimensions matches with each other.
 TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
                                         TfLiteNode* node, int n_input,
-                                        int n_output, int n_cell) {
+                                        int n_output, int n_cell,
+                                        bool is_layer_norm_lstm) {
   const auto* params = reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data);
 
   // Making sure clipping parameters have valid values.
@@ -112,7 +127,8 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
 
   const TfLiteTensor* input_to_input_weights =
       GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
-  if (input_to_input_weights != nullptr) {
+  const bool use_cifg = (input_to_input_weights == nullptr);
+  if (!use_cifg) {
     TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->size, 2);
     TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[0], n_cell);
     TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[1], n_input);
@@ -186,7 +202,6 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
   }
 
   // Making sure the peephole weights are there all or none.
-  const bool use_cifg = (input_to_input_weights == nullptr);
   const bool peephole_weights_all_or_none =
       ((cell_to_input_weights != nullptr || use_cifg) &&
        (cell_to_forget_weights != nullptr) &&
@@ -244,6 +259,40 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
       ((projection_weights != nullptr) || (projection_bias == nullptr));
   TF_LITE_ENSURE(context, projection_tensors_consistent == true);
 
+  if (is_layer_norm_lstm) {
+    const TfLiteTensor* input_layer_norm_coefficients = GetOptionalInputTensor(
+        context, node, kInputLayerNormCoefficientsTensor);
+    if (use_cifg) {
+      TF_LITE_ENSURE_EQ(context, input_layer_norm_coefficients, nullptr);
+    } else {
+      TF_LITE_ENSURE(context, input_layer_norm_coefficients != nullptr);
+      TF_LITE_ENSURE_EQ(context, input_layer_norm_coefficients->dims->size, 1);
+      TF_LITE_ENSURE_EQ(context, input_layer_norm_coefficients->dims->data[0],
+                        n_cell);
+    }
+
+    const TfLiteTensor* forget_layer_norm_coefficients =
+        GetInput(context, node, kForgetLayerNormCoefficientsTensor);
+    TF_LITE_ENSURE(context, forget_layer_norm_coefficients != nullptr);
+    TF_LITE_ENSURE_EQ(context, forget_layer_norm_coefficients->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, forget_layer_norm_coefficients->dims->data[0],
+                      n_cell);
+
+    const TfLiteTensor* cell_layer_norm_coefficients =
+        GetInput(context, node, kCellLayerNormCoefficientsTensor);
+    TF_LITE_ENSURE(context, cell_layer_norm_coefficients != nullptr);
+    TF_LITE_ENSURE_EQ(context, cell_layer_norm_coefficients->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, cell_layer_norm_coefficients->dims->data[0],
+                      n_cell);
+
+    const TfLiteTensor* output_layer_norm_coefficients =
+        GetInput(context, node, kOutputLayerNormCoefficientsTensor);
+    TF_LITE_ENSURE(context, output_layer_norm_coefficients != nullptr);
+    TF_LITE_ENSURE_EQ(context, output_layer_norm_coefficients->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, output_layer_norm_coefficients->dims->data[0],
+                      n_cell);
+  }
+
   return kTfLiteOk;
 }
 
@@ -254,8 +303,32 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
 
   TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
-  TF_LITE_ENSURE_EQ(context, node->inputs->size, 20);
+  // Logic for determining regular lstm and layer norm lstm:
+  // input_size, forget_gate_layer_norm_tensor (20) null? is_layer_norm?
+  // 20,         N/A,                                     No.
+  // 24,         null,                                    No.
+  // 24,         not null,                                Yes.
+  // 20-inputs lstm are deprecated and is only kept here for backward
+  // compatibility.
+  if (node->inputs->size == 24) {
+    const TfLiteTensor* forget_layer_norm_coefficients =
+        GetInput(context, node, kForgetLayerNormCoefficientsTensor);
+    if (forget_layer_norm_coefficients == nullptr) {
+      op_data->is_layer_norm_lstm = false;
+    } else {
+      op_data->is_layer_norm_lstm = true;
+    }
+  } else if (node->inputs->size == 20) {
+    // This is deprecated and is only kept here for backward compatibility.
+    op_data->is_layer_norm_lstm = false;
+  } else {
+    context->ReportError(
+        context, "The LSTM Full kernel expects 20 or 24 inputs. Got %d inputs",
+        node->inputs->size);
+    return kTfLiteError;
+  }
 
+  const bool is_layer_norm_lstm = op_data->is_layer_norm_lstm;
   op_data->activation_state_tensor_index =
       node->inputs->data[kInputActivationStateTensor];
   op_data->cell_state_tensor_index = node->inputs->data[kInputCellStateTensor];
@@ -282,8 +355,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   const int n_output = recurrent_to_output_weights->dims->data[1];
 
   // Check that input tensor dimensions matches with each other.
-  TF_LITE_ENSURE_OK(context, CheckInputTensorDimensions(context, node, n_input,
-                                                        n_output, n_cell));
+  TF_LITE_ENSURE_OK(context,
+                    CheckInputTensorDimensions(context, node, n_input, n_output,
+                                               n_cell, is_layer_norm_lstm));
 
   // Get the pointer to output, activation_state and cell_state tensors.
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
@@ -308,7 +382,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
   // The weights are of consistent type, so it suffices to check one.
   // TODO(mirkov): create a utility/macro for this check, so all Ops can use it.
-  const bool is_hybrid_op = (input_to_output_weights->type == kTfLiteUInt8 &&
+  const bool is_hybrid_op = ((input_to_output_weights->type == kTfLiteUInt8 ||
+                              input_to_output_weights->type == kTfLiteInt8) &&
                              input->type == kTfLiteFloat32);
 
   TfLiteIntArrayFree(node->temporaries);
@@ -344,7 +419,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     // activation_state and cell_state tensors.
     node->temporaries->data[1] = op_data->scratch_tensor_index + 1;
     TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/1);
-    input_quantized->type = kTfLiteUInt8;
+    input_quantized->type = input_to_output_weights->type;
     input_quantized->allocation_type = kTfLiteArenaRw;
     if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) {
       TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims);
@@ -354,7 +429,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     node->temporaries->data[2] = op_data->scratch_tensor_index + 2;
     TfLiteTensor* activation_state_quantized =
         GetTemporary(context, node, /*index=*/2);
-    activation_state_quantized->type = kTfLiteUInt8;
+    activation_state_quantized->type = input_to_output_weights->type;
     activation_state_quantized->allocation_type = kTfLiteArenaRw;
     if (!TfLiteIntArrayEqual(activation_state_quantized->dims,
                              activation_state->dims)) {
@@ -367,7 +442,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     node->temporaries->data[3] = op_data->scratch_tensor_index + 3;
     TfLiteTensor* cell_state_quantized =
         GetTemporary(context, node, /*index=*/3);
-    cell_state_quantized->type = kTfLiteUInt8;
+    cell_state_quantized->type = input_to_output_weights->type;
     cell_state_quantized->allocation_type = kTfLiteArenaRw;
     if (!TfLiteIntArrayEqual(cell_state_quantized->dims, cell_state->dims)) {
       TfLiteIntArray* cell_state_quantized_size =
@@ -430,6 +505,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   const auto* params = reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data);
   OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
+  const bool is_layer_norm_lstm = op_data->is_layer_norm_lstm;
 
   const TfLiteTensor* input = GetInput(context, node, kInputTensor);
 
@@ -458,6 +534,23 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* cell_to_output_weights =
       GetOptionalInputTensor(context, node, kCellToOutputWeightsTensor);
 
+  const TfLiteTensor* input_layer_norm_coefficients =
+      is_layer_norm_lstm ? GetOptionalInputTensor(
+                               context, node, kInputLayerNormCoefficientsTensor)
+                         : nullptr;
+  const TfLiteTensor* forget_layer_norm_coefficients =
+      is_layer_norm_lstm
+          ? GetInput(context, node, kForgetLayerNormCoefficientsTensor)
+          : nullptr;
+  const TfLiteTensor* cell_layer_norm_coefficients =
+      is_layer_norm_lstm
+          ? GetInput(context, node, kCellLayerNormCoefficientsTensor)
+          : nullptr;
+  const TfLiteTensor* output_layer_norm_coefficients =
+      is_layer_norm_lstm
+          ? GetInput(context, node, kOutputLayerNormCoefficientsTensor)
+          : nullptr;
+
   const TfLiteTensor* input_gate_bias =
       GetOptionalInputTensor(context, node, kInputGateBiasTensor);
   const TfLiteTensor* forget_gate_bias =
@@ -490,6 +583,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
           recurrent_to_input_weights, recurrent_to_forget_weights,
           recurrent_to_cell_weights, recurrent_to_output_weights,
           cell_to_input_weights, cell_to_forget_weights, cell_to_output_weights,
+          input_layer_norm_coefficients, forget_layer_norm_coefficients,
+          cell_layer_norm_coefficients, output_layer_norm_coefficients,
           /*aux_input=*/nullptr,
           /*aux_input_to_input_weights=*/nullptr,
           /*aux_input_to_forget_weights=*/nullptr,
@@ -501,7 +596,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
           /*output_offset=*/0, scratch_buffer, activation_state, cell_state,
           output);
     }
-    case kTfLiteUInt8: {
+    case kTfLiteUInt8:
+    case kTfLiteInt8: {
       TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/1);
       TfLiteTensor* activation_state_quantized =
           GetTemporary(context, node, /*index=*/2);
@@ -518,6 +614,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
           recurrent_to_input_weights, recurrent_to_forget_weights,
           recurrent_to_cell_weights, recurrent_to_output_weights,
           cell_to_input_weights, cell_to_forget_weights, cell_to_output_weights,
+          input_layer_norm_coefficients, forget_layer_norm_coefficients,
+          cell_layer_norm_coefficients, output_layer_norm_coefficients,
           /*aux_input=*/nullptr,
           /*aux_input_to_input_weights=*/nullptr,
           /*aux_input_to_forget_weights=*/nullptr,
diff --git a/tensorflow/lite/kernels/lstm_eval.cc b/tensorflow/lite/kernels/lstm_eval.cc
index f179ecb195e4dd999cb6e3ed0582e6385a3436b0..244cfae4a20b93b32022bee412f241397df53c49 100644
--- a/tensorflow/lite/kernels/lstm_eval.cc
+++ b/tensorflow/lite/kernels/lstm_eval.cc
@@ -14,8 +14,9 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/lite/kernels/lstm_eval.h"
 
-#include <stdint.h>
+#include <cstdint>
 
+#include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/kernels/internal/kernel_utils.h"
 #include "tensorflow/lite/kernels/internal/tensor_utils.h"
 #include "tensorflow/lite/kernels/op_macros.h"
@@ -27,6 +28,10 @@ namespace lstm_eval {
 
 namespace {
 
+// Small float to avoid divergence during calculation of deviation for layer
+// norm lstm.
+const float kLayerNormEpsilon = 1e-8;
+
 // Performs an LSTM batch inference step for input specified by input_ptr_batch.
 // The LSTM cell is specified by the pointers to its weights (*_weights_ptr) and
 // biases (*_bias_ptr), and buffers (*_scratch), along with additional
@@ -35,9 +40,44 @@ namespace {
 //  - n_batch: size of batch,
 //  - n_cell: number of cells (or units),
 //  - n_input: the input size,
+//  - n_aux_input: the auxilary input size.
 //  - n_output: the output size.
 //  - output_batch_leading_dim: the leading dimension of the output buffer.
 //
+// LSTM weights:
+// Input weights of size 'n_cell * n_input':
+//   input_to_input_weights            - optional (can be nullptr)
+//   input_to_forget_weights
+//   input_to_cell_weights
+//   input_to_output_weights
+// Auxilary input weights of size 'n_cell * n_aux_input':
+//   aux_input_to_input_weights        - optional
+//   aux_input_to_forget_weights       - optional
+//   aux_input_to_cell_weights         - optional
+//   aux_input_to_output_weights       - optional
+// Recurrent weights of size 'n_cell * n_output':
+//   recurrent_to_input_weights        - optional
+//   recurrent_to_forget_weights
+//   recurrent_to_cell_weights
+//   recurrent_to_input_weights
+// Peephole weights of size 'n_cell', representing diagonal matrices.
+//   cell_to_input_weights             - optional
+//   cell_to_cell_weights              - optional
+//   cell_to_output_weights            - optional
+// Projection weights of size 'n_output * n_cell'
+//   projection_weights_ptr            - optional
+// Gate biases of size 'n_cell':
+//   input_gate_bias_ptr               - optional
+//   forget_gate_bias_ptr
+//   cell_gate_bias_ptr
+//   output_gate_bias_ptr
+//
+// Layer norm coefficients of size 'n_cell', representing diagonal matrices.
+//   input_layer_norm_coefficients_ptr  - optional
+//   forget_layer_norm_coefficients_ptr - optional
+//   cell_layer_norm_coefficients_ptr   - optional
+//   output_layer_norm_coefficients_ptr - optional
+//
 // The pointers to the cell and output state and the output are updated.
 //
 // The pointers with the suffix "_batch" point to data aligned in batch_major
@@ -65,30 +105,47 @@ inline void LstmStepWithAuxInput(
     const float* recurrent_to_output_weights_ptr,
     const float* cell_to_input_weights_ptr,
     const float* cell_to_forget_weights_ptr,
-    const float* cell_to_output_weights_ptr, const float* input_gate_bias_ptr,
-    const float* forget_gate_bias_ptr, const float* cell_bias_ptr,
-    const float* output_gate_bias_ptr, const float* projection_weights_ptr,
-    const float* projection_bias_ptr, const TfLiteLSTMParams* params,
-    int n_batch, int n_cell, int n_input, int n_aux_input, int n_output,
-    int output_batch_leading_dim, float* output_state_ptr,
-    float* cell_state_ptr, float* input_gate_scratch,
+    const float* cell_to_output_weights_ptr,
+    const float* input_layer_norm_coefficients_ptr,
+    const float* forget_layer_norm_coefficients_ptr,
+    const float* cell_layer_norm_coefficients_ptr,
+    const float* output_layer_norm_coefficients_ptr,
+    const float* input_gate_bias_ptr, const float* forget_gate_bias_ptr,
+    const float* cell_bias_ptr, const float* output_gate_bias_ptr,
+    const float* projection_weights_ptr, const float* projection_bias_ptr,
+    const TfLiteLSTMParams* params, int n_batch, int n_cell, int n_input,
+    int n_aux_input, int n_output, int output_batch_leading_dim,
+    float* output_state_ptr, float* cell_state_ptr, float* input_gate_scratch,
     float* forget_gate_scratch, float* cell_scratch, float* output_gate_scratch,
     float* output_ptr_batch) {
   // Since we have already checked that weights are all there or none, we can
   // check the existense of only one to the get the condition.
   const bool use_cifg = (input_to_input_weights_ptr == nullptr);
   const bool use_peephole = (cell_to_output_weights_ptr != nullptr);
-  // Initialize scratch buffers with bias.
-  if (!use_cifg) {
-    tensor_utils::VectorBatchVectorAssign(input_gate_bias_ptr, n_cell, n_batch,
-                                          input_gate_scratch);
+  const bool is_layer_norm_lstm =
+      (forget_layer_norm_coefficients_ptr != nullptr);
+
+  // Initialize scratch buffers with bias for regular lstm or initialize with
+  // zero for layer norm lstm.
+  if (is_layer_norm_lstm) {
+    if (!use_cifg) {
+      tensor_utils::ZeroVector(input_gate_scratch, n_cell * n_batch);
+    }
+    tensor_utils::ZeroVector(forget_gate_scratch, n_cell * n_batch);
+    tensor_utils::ZeroVector(cell_scratch, n_cell * n_batch);
+    tensor_utils::ZeroVector(output_gate_scratch, n_cell * n_batch);
+  } else {
+    if (!use_cifg) {
+      tensor_utils::VectorBatchVectorAssign(input_gate_bias_ptr, n_cell,
+                                            n_batch, input_gate_scratch);
+    }
+    tensor_utils::VectorBatchVectorAssign(forget_gate_bias_ptr, n_cell, n_batch,
+                                          forget_gate_scratch);
+    tensor_utils::VectorBatchVectorAssign(cell_bias_ptr, n_cell, n_batch,
+                                          cell_scratch);
+    tensor_utils::VectorBatchVectorAssign(output_gate_bias_ptr, n_cell, n_batch,
+                                          output_gate_scratch);
   }
-  tensor_utils::VectorBatchVectorAssign(forget_gate_bias_ptr, n_cell, n_batch,
-                                        forget_gate_scratch);
-  tensor_utils::VectorBatchVectorAssign(cell_bias_ptr, n_cell, n_batch,
-                                        cell_scratch);
-  tensor_utils::VectorBatchVectorAssign(output_gate_bias_ptr, n_cell, n_batch,
-                                        output_gate_scratch);
 
   // For each batch and cell: compute input_weight * input.
   if (!use_cifg) {
@@ -152,6 +209,16 @@ inline void LstmStepWithAuxInput(
           cell_to_input_weights_ptr, n_cell, cell_state_ptr, n_batch,
           input_gate_scratch);
     }
+    if (is_layer_norm_lstm) {
+      tensor_utils::MeanStddevNormalization(input_gate_scratch,
+                                            input_gate_scratch, n_cell, n_batch,
+                                            kLayerNormEpsilon);
+      tensor_utils::VectorBatchVectorCwiseProduct(
+          input_layer_norm_coefficients_ptr, n_cell, input_gate_scratch,
+          n_batch, input_gate_scratch);
+      tensor_utils::VectorBatchVectorAdd(input_gate_bias_ptr, n_cell, n_batch,
+                                         input_gate_scratch);
+    }
     tensor_utils::ApplySigmoidToVector(input_gate_scratch, n_cell * n_batch,
                                        input_gate_scratch);
   }
@@ -162,12 +229,31 @@ inline void LstmStepWithAuxInput(
         cell_to_forget_weights_ptr, n_cell, cell_state_ptr, n_batch,
         forget_gate_scratch);
   }
+  if (is_layer_norm_lstm) {
+    tensor_utils::MeanStddevNormalization(forget_gate_scratch,
+                                          forget_gate_scratch, n_cell, n_batch,
+                                          kLayerNormEpsilon);
+    tensor_utils::VectorBatchVectorCwiseProduct(
+        forget_layer_norm_coefficients_ptr, n_cell, forget_gate_scratch,
+        n_batch, forget_gate_scratch);
+    tensor_utils::VectorBatchVectorAdd(forget_gate_bias_ptr, n_cell, n_batch,
+                                       forget_gate_scratch);
+  }
   tensor_utils::ApplySigmoidToVector(forget_gate_scratch, n_cell * n_batch,
                                      forget_gate_scratch);
 
   // For each batch and cell: update the cell.
   tensor_utils::VectorVectorCwiseProduct(forget_gate_scratch, cell_state_ptr,
                                          n_batch * n_cell, cell_state_ptr);
+  if (is_layer_norm_lstm) {
+    tensor_utils::MeanStddevNormalization(cell_scratch, cell_scratch, n_cell,
+                                          n_batch, kLayerNormEpsilon);
+    tensor_utils::VectorBatchVectorCwiseProduct(
+        cell_layer_norm_coefficients_ptr, n_cell, cell_scratch, n_batch,
+        cell_scratch);
+    tensor_utils::VectorBatchVectorAdd(cell_bias_ptr, n_cell, n_batch,
+                                       cell_scratch);
+  }
   tensor_utils::ApplyActivationToVector(cell_scratch, n_batch * n_cell,
                                         params->activation, cell_scratch);
   if (use_cifg) {
@@ -190,6 +276,16 @@ inline void LstmStepWithAuxInput(
         cell_to_output_weights_ptr, n_cell, cell_state_ptr, n_batch,
         output_gate_scratch);
   }
+  if (is_layer_norm_lstm) {
+    tensor_utils::MeanStddevNormalization(output_gate_scratch,
+                                          output_gate_scratch, n_cell, n_batch,
+                                          kLayerNormEpsilon);
+    tensor_utils::VectorBatchVectorCwiseProduct(
+        output_layer_norm_coefficients_ptr, n_cell, output_gate_scratch,
+        n_batch, output_gate_scratch);
+    tensor_utils::VectorBatchVectorAdd(output_gate_bias_ptr, n_cell, n_batch,
+                                       output_gate_scratch);
+  }
   tensor_utils::ApplySigmoidToVector(output_gate_scratch, n_batch * n_cell,
                                      output_gate_scratch);
   tensor_utils::ApplyActivationToVector(cell_state_ptr, n_batch * n_cell,
@@ -275,6 +371,11 @@ inline void LstmStepWithAuxInput(
 //   input_to_forget_weights
 //   input_to_cell_weights
 //   input_to_input_weights
+// Quantized auxilary input weights of size 'n_cell * n_aux_input':
+//   aux_input_to_input_weights        - optional
+//   aux_input_to_forget_weights       - optional
+//   aux_input_to_cell_weights         - optional
+//   aux_input_to_output_weights       - optional
 // Quantized recurrent weights of size 'n_cell * n_output':
 //   recurrent_to_input_weights        - optional
 //   recurrent_to_forget_weights
@@ -291,6 +392,10 @@ inline void LstmStepWithAuxInput(
 //   input_to_forget_weights_scale
 //   input_to_cell_weights_scale
 //   input_to_output_weights_scale
+//   aux_input_to_input_weights_scale  - optional
+//   aux_input_to_forget_weights_scale - optional
+//   aux_input_to_cell_weights_scale   - optional
+//   aux_input_to_output_weights_scale - optional
 //   recurrent_to_input_weights_scale  - optional
 //   recurrent_to_forget_weights_scale
 //   recurrent_to_cell_weights_scale
@@ -305,6 +410,12 @@ inline void LstmStepWithAuxInput(
 //   cell_gate_bias_ptr
 //   output_gate_bias_ptr
 //
+// Layer norm coefficients of size 'n_cell', representing diagonal matrices.
+//   input_layer_norm_coefficients_ptr  - optional
+//   forget_layer_norm_coefficients_ptr - optional
+//   cell_layer_norm_coefficients_ptr   - optional
+//   output_layer_norm_coefficients_ptr - optional
+//
 // Temporary pre-allocated storage for quantized values:
 //   quantized_input_ptr_batch (same size as input_ptr_batch)
 //   quantized_output_state_ptr (same size as output_state_ptr)
@@ -344,33 +455,50 @@ inline void LstmStepWithAuxInput(
     const int8_t* cell_to_forget_weights_ptr,
     float cell_to_forget_weights_scale,
     const int8_t* cell_to_output_weights_ptr,
-    float cell_to_output_weights_scale, const float* input_gate_bias_ptr,
-    const float* forget_gate_bias_ptr, const float* cell_bias_ptr,
-    const float* output_gate_bias_ptr, const int8_t* projection_weights_ptr,
-    float projection_weights_scale, const float* projection_bias_ptr,
-    const TfLiteLSTMParams* params, int n_batch, int n_cell, int n_input,
-    int n_aux_input, int n_output, int output_batch_leading_dim,
-    float* input_gate_scratch, float* forget_gate_scratch, float* cell_scratch,
-    float* output_gate_scratch, float* scaling_factors,
-    float* product_scaling_factors, float* recovered_cell_weights,
-    int8_t* quantized_input_ptr_batch, int8_t* quantized_aux_input_ptr_batch,
-    int8_t* quantized_output_state_ptr, int8_t* quantized_cell_state_ptr,
-    float* output_state_ptr, float* cell_state_ptr, float* output_ptr_batch) {
+    float cell_to_output_weights_scale,
+    const float* input_layer_norm_coefficients_ptr,
+    const float* forget_layer_norm_coefficients_ptr,
+    const float* cell_layer_norm_coefficients_ptr,
+    const float* output_layer_norm_coefficients_ptr,
+    const float* input_gate_bias_ptr, const float* forget_gate_bias_ptr,
+    const float* cell_bias_ptr, const float* output_gate_bias_ptr,
+    const int8_t* projection_weights_ptr, float projection_weights_scale,
+    const float* projection_bias_ptr, const TfLiteLSTMParams* params,
+    int n_batch, int n_cell, int n_input, int n_aux_input, int n_output,
+    int output_batch_leading_dim, float* input_gate_scratch,
+    float* forget_gate_scratch, float* cell_scratch, float* output_gate_scratch,
+    float* scaling_factors, float* product_scaling_factors,
+    float* recovered_cell_weights, int8_t* quantized_input_ptr_batch,
+    int8_t* quantized_aux_input_ptr_batch, int8_t* quantized_output_state_ptr,
+    int8_t* quantized_cell_state_ptr, float* output_state_ptr,
+    float* cell_state_ptr, float* output_ptr_batch) {
   // Since we have already checked that weights are all there or none, we
   // can check the existense of only one to the get the condition.
   const bool use_cifg = (input_to_input_weights_ptr == nullptr);
   const bool use_peephole = (cell_to_output_weights_ptr != nullptr);
+  const bool is_layer_norm_lstm =
+      (forget_layer_norm_coefficients_ptr != nullptr);
+
   // Initialize scratch buffers with bias.
-  if (!use_cifg) {
-    tensor_utils::VectorBatchVectorAssign(input_gate_bias_ptr, n_cell, n_batch,
-                                          input_gate_scratch);
+  if (is_layer_norm_lstm) {
+    if (!use_cifg) {
+      tensor_utils::ZeroVector(input_gate_scratch, n_cell * n_batch);
+    }
+    tensor_utils::ZeroVector(forget_gate_scratch, n_cell * n_batch);
+    tensor_utils::ZeroVector(cell_scratch, n_cell * n_batch);
+    tensor_utils::ZeroVector(output_gate_scratch, n_cell * n_batch);
+  } else {
+    if (!use_cifg) {
+      tensor_utils::VectorBatchVectorAssign(input_gate_bias_ptr, n_cell,
+                                            n_batch, input_gate_scratch);
+    }
+    tensor_utils::VectorBatchVectorAssign(forget_gate_bias_ptr, n_cell, n_batch,
+                                          forget_gate_scratch);
+    tensor_utils::VectorBatchVectorAssign(cell_bias_ptr, n_cell, n_batch,
+                                          cell_scratch);
+    tensor_utils::VectorBatchVectorAssign(output_gate_bias_ptr, n_cell, n_batch,
+                                          output_gate_scratch);
   }
-  tensor_utils::VectorBatchVectorAssign(forget_gate_bias_ptr, n_cell, n_batch,
-                                        forget_gate_scratch);
-  tensor_utils::VectorBatchVectorAssign(cell_bias_ptr, n_cell, n_batch,
-                                        cell_scratch);
-  tensor_utils::VectorBatchVectorAssign(output_gate_bias_ptr, n_cell, n_batch,
-                                        output_gate_scratch);
 
   if (!tensor_utils::IsZeroVector(input_ptr_batch, n_batch * n_input)) {
     // Save quantization and matmul computation for all zero input.
@@ -535,6 +663,16 @@ inline void LstmStepWithAuxInput(
           recovered_cell_weights, n_cell, cell_state_ptr, n_batch,
           input_gate_scratch);
     }
+    if (is_layer_norm_lstm) {
+      tensor_utils::MeanStddevNormalization(input_gate_scratch,
+                                            input_gate_scratch, n_cell, n_batch,
+                                            kLayerNormEpsilon);
+      tensor_utils::VectorBatchVectorCwiseProduct(
+          input_layer_norm_coefficients_ptr, n_cell, input_gate_scratch,
+          n_batch, input_gate_scratch);
+      tensor_utils::VectorBatchVectorAdd(input_gate_bias_ptr, n_cell, n_batch,
+                                         input_gate_scratch);
+    }
     tensor_utils::ApplySigmoidToVector(input_gate_scratch, n_cell * n_batch,
                                        input_gate_scratch);
   }
@@ -548,12 +686,31 @@ inline void LstmStepWithAuxInput(
         recovered_cell_weights, n_cell, cell_state_ptr, n_batch,
         forget_gate_scratch);
   }
+  if (is_layer_norm_lstm) {
+    tensor_utils::MeanStddevNormalization(forget_gate_scratch,
+                                          forget_gate_scratch, n_cell, n_batch,
+                                          kLayerNormEpsilon);
+    tensor_utils::VectorBatchVectorCwiseProduct(
+        forget_layer_norm_coefficients_ptr, n_cell, forget_gate_scratch,
+        n_batch, forget_gate_scratch);
+    tensor_utils::VectorBatchVectorAdd(forget_gate_bias_ptr, n_cell, n_batch,
+                                       forget_gate_scratch);
+  }
   tensor_utils::ApplySigmoidToVector(forget_gate_scratch, n_cell * n_batch,
                                      forget_gate_scratch);
 
   // For each batch and cell: update the cell.
   tensor_utils::VectorVectorCwiseProduct(forget_gate_scratch, cell_state_ptr,
                                          n_batch * n_cell, cell_state_ptr);
+  if (is_layer_norm_lstm) {
+    tensor_utils::MeanStddevNormalization(cell_scratch, cell_scratch, n_cell,
+                                          n_batch, kLayerNormEpsilon);
+    tensor_utils::VectorBatchVectorCwiseProduct(
+        cell_layer_norm_coefficients_ptr, n_cell, cell_scratch, n_batch,
+        cell_scratch);
+    tensor_utils::VectorBatchVectorAdd(cell_bias_ptr, n_cell, n_batch,
+                                       cell_scratch);
+  }
   tensor_utils::ApplyActivationToVector(cell_scratch, n_batch * n_cell,
                                         params->activation, cell_scratch);
   if (use_cifg) {
@@ -581,6 +738,16 @@ inline void LstmStepWithAuxInput(
         recovered_cell_weights, n_cell, cell_state_ptr, n_batch,
         output_gate_scratch);
   }
+  if (is_layer_norm_lstm) {
+    tensor_utils::MeanStddevNormalization(output_gate_scratch,
+                                          output_gate_scratch, n_cell, n_batch,
+                                          kLayerNormEpsilon);
+    tensor_utils::VectorBatchVectorCwiseProduct(
+        output_layer_norm_coefficients_ptr, n_cell, output_gate_scratch,
+        n_batch, output_gate_scratch);
+    tensor_utils::VectorBatchVectorAdd(output_gate_bias_ptr, n_cell, n_batch,
+                                       output_gate_scratch);
+  }
   tensor_utils::ApplySigmoidToVector(output_gate_scratch, n_batch * n_cell,
                                      output_gate_scratch);
   tensor_utils::ApplyActivationToVector(cell_state_ptr, n_batch * n_cell,
@@ -689,6 +856,15 @@ inline void LstmStepWithAuxInput(
     }
   }
 }
+
+int8_t* GetInt8DataPtr(const TfLiteTensor* tensor, const bool is_uint8) {
+  if (is_uint8) {
+    return reinterpret_cast<int8_t*>(tensor->data.uint8);
+  } else {
+    return tensor->data.int8;
+  }
+}
+
 }  // namespace
 
 TfLiteStatus EvalFloat(
@@ -702,7 +878,12 @@ TfLiteStatus EvalFloat(
     const TfLiteTensor* recurrent_to_output_weights,
     const TfLiteTensor* cell_to_input_weights,
     const TfLiteTensor* cell_to_forget_weights,
-    const TfLiteTensor* cell_to_output_weights, const TfLiteTensor* aux_input,
+    const TfLiteTensor* cell_to_output_weights,
+    const TfLiteTensor* input_layer_norm_coefficients,
+    const TfLiteTensor* forget_layer_norm_coefficients,
+    const TfLiteTensor* cell_layer_norm_coefficients,
+    const TfLiteTensor* output_layer_norm_coefficients,
+    const TfLiteTensor* aux_input,
     const TfLiteTensor* aux_input_to_input_weights,
     const TfLiteTensor* aux_input_to_forget_weights,
     const TfLiteTensor* aux_input_to_cell_weights,
@@ -735,6 +916,7 @@ TfLiteStatus EvalFloat(
   // check the existense of only one to the get the condition.
   const bool use_cifg = (input_to_input_weights == nullptr);
   const bool use_peephole = (cell_to_output_weights != nullptr);
+  const bool is_layer_norm_lstm = (forget_layer_norm_coefficients != nullptr);
 
   // Index the scratch buffers pointers to the global scratch buffer.
   float* input_gate_scratch = nullptr;
@@ -765,6 +947,15 @@ TfLiteStatus EvalFloat(
       (use_peephole) ? cell_to_forget_weights->data.f : nullptr;
   const float* cell_to_output_weights_ptr =
       (use_peephole) ? cell_to_output_weights->data.f : nullptr;
+  const float* input_layer_norm_coefficients_ptr =
+      (is_layer_norm_lstm && !use_cifg) ? input_layer_norm_coefficients->data.f
+                                        : nullptr;
+  const float* forget_layer_norm_coefficients_ptr =
+      is_layer_norm_lstm ? forget_layer_norm_coefficients->data.f : nullptr;
+  const float* cell_layer_norm_coefficients_ptr =
+      is_layer_norm_lstm ? cell_layer_norm_coefficients->data.f : nullptr;
+  const float* output_layer_norm_coefficients_ptr =
+      is_layer_norm_lstm ? output_layer_norm_coefficients->data.f : nullptr;
   const float* projection_weights_ptr =
       (projection_weights == nullptr) ? nullptr : projection_weights->data.f;
   const float* projection_bias_ptr =
@@ -794,7 +985,7 @@ TfLiteStatus EvalFloat(
       // If this is the forward_sequence, step forward, otherwise step
       // backwards.
       const int t_rel = forward_sequence ? t : max_time - t - 1;
-      const float* input_ptr = input->data.f + t_rel * input_step;
+      const float* input_ptr_batch = input->data.f + t_rel * input_step;
       if (aux_input) {
         aux_input_ptr = aux_input->data.f + t_rel * input_step;
       }
@@ -802,7 +993,7 @@ TfLiteStatus EvalFloat(
           output->data.f + t_rel * output_step + output_offset;
 
       LstmStepWithAuxInput(
-          input_ptr, input_to_input_weights_ptr,
+          input_ptr_batch, input_to_input_weights_ptr,
           input_to_forget_weights->data.f, input_to_cell_weights->data.f,
           input_to_output_weights->data.f, aux_input_ptr,
           aux_input_to_input_weights_ptr, aux_input_to_forget_weights_ptr,
@@ -811,6 +1002,8 @@ TfLiteStatus EvalFloat(
           recurrent_to_cell_weights->data.f,
           recurrent_to_output_weights->data.f, cell_to_input_weights_ptr,
           cell_to_forget_weights_ptr, cell_to_output_weights_ptr,
+          input_layer_norm_coefficients_ptr, forget_layer_norm_coefficients_ptr,
+          cell_layer_norm_coefficients_ptr, output_layer_norm_coefficients_ptr,
           input_gate_bias_ptr, forget_gate_bias->data.f, cell_bias->data.f,
           output_gate_bias->data.f, projection_weights_ptr, projection_bias_ptr,
           params, n_batch, n_cell, n_input, aux_input_size, n_output,
@@ -826,12 +1019,24 @@ TfLiteStatus EvalFloat(
         // If this is the forward_sequence, step forward, otherwise step
         // backwards.
         const int t_rel = forward_sequence ? t : max_time - t - 1;
-        const float* input_ptr = input->data.f + t_rel * input_step;
+        const int time_offset = b * max_time + t_rel;
+        const float* input_ptr = input->data.f + time_offset * input_step;
         if (aux_input) {
-          aux_input_ptr = aux_input->data.f + t_rel * input_step;
+          aux_input_ptr = aux_input->data.f + time_offset * input_step;
         }
-        float* output_ptr_time =
-            output->data.f + t_rel * output_step + output_offset;
+        float* output_ptr =
+            output->data.f + time_offset * output_step + output_offset;
+
+        // Offset the {activation,cell}_state pointers to the right batch.
+        float* activation_state_ptr =
+            activation_state->data.f + b * output_batch_leading_dim;
+        float* cell_state_ptr = cell_state->data.f + b * n_cell;
+        // Offset the scratch pointers to the right batch.
+        float* input_gate_scratch_ptr =
+            input_gate_scratch ? input_gate_scratch + b * n_cell : nullptr;
+        float* forget_gate_scratch_ptr = forget_gate_scratch + b * n_cell;
+        float* cell_scratch_ptr = cell_scratch + b * n_cell;
+        float* output_gate_scratch_ptr = output_gate_scratch + b * n_cell;
 
         LstmStepWithAuxInput(
             input_ptr, input_to_input_weights_ptr,
@@ -843,13 +1048,17 @@ TfLiteStatus EvalFloat(
             recurrent_to_cell_weights->data.f,
             recurrent_to_output_weights->data.f, cell_to_input_weights_ptr,
             cell_to_forget_weights_ptr, cell_to_output_weights_ptr,
-            input_gate_bias_ptr, forget_gate_bias->data.f, cell_bias->data.f,
+            input_layer_norm_coefficients_ptr,
+            forget_layer_norm_coefficients_ptr,
+            cell_layer_norm_coefficients_ptr,
+            output_layer_norm_coefficients_ptr, input_gate_bias_ptr,
+            forget_gate_bias->data.f, cell_bias->data.f,
             output_gate_bias->data.f, projection_weights_ptr,
             projection_bias_ptr, params, /*n_batch=*/1, n_cell, n_input,
             aux_input_size, n_output, output_batch_leading_dim,
-            activation_state->data.f, cell_state->data.f, input_gate_scratch,
-            forget_gate_scratch, cell_scratch, output_gate_scratch,
-            output_ptr_time);
+            activation_state_ptr, cell_state_ptr, input_gate_scratch_ptr,
+            forget_gate_scratch_ptr, cell_scratch_ptr, output_gate_scratch_ptr,
+            output_ptr);
       }
     }
   }
@@ -867,7 +1076,12 @@ TfLiteStatus EvalHybrid(
     const TfLiteTensor* recurrent_to_output_weights,
     const TfLiteTensor* cell_to_input_weights,
     const TfLiteTensor* cell_to_forget_weights,
-    const TfLiteTensor* cell_to_output_weights, const TfLiteTensor* aux_input,
+    const TfLiteTensor* cell_to_output_weights,
+    const TfLiteTensor* input_layer_norm_coefficients,
+    const TfLiteTensor* forget_layer_norm_coefficients,
+    const TfLiteTensor* cell_layer_norm_coefficients,
+    const TfLiteTensor* output_layer_norm_coefficients,
+    const TfLiteTensor* aux_input,
     const TfLiteTensor* aux_input_to_input_weights,
     const TfLiteTensor* aux_input_to_forget_weights,
     const TfLiteTensor* aux_input_to_cell_weights,
@@ -882,6 +1096,9 @@ TfLiteStatus EvalHybrid(
     TfLiteTensor* aux_input_quantized, TfLiteTensor* output_state_quantized,
     TfLiteTensor* cell_state_quantized, TfLiteTensor* output_state,
     TfLiteTensor* cell_state, TfLiteTensor* output) {
+  // For operations that use int8 instead of uint8 we need to fetch raw data
+  // from the tensor different. We use this bool for that condition.
+  const bool is_uint8_hybrid = input_to_output_weights->type == kTfLiteUInt8;
   TF_LITE_ASSERT(input->dims->size >= 2 && input->dims->size <= 3);
   const int n_input = input->dims->data[input->dims->size - 1];
   int max_time, n_batch;
@@ -902,6 +1119,7 @@ TfLiteStatus EvalHybrid(
   // check the existence of only one to get the condition.
   const bool use_cifg = (input_to_input_weights == nullptr);
   const bool use_peephole = (cell_to_output_weights != nullptr);
+  const bool is_layer_norm_lstm = (forget_layer_norm_coefficients != nullptr);
 
   float* input_gate_scratch = nullptr;
   float* cell_scratch = nullptr;
@@ -926,9 +1144,9 @@ TfLiteStatus EvalHybrid(
   float* input_gate_bias_ptr = nullptr;
   if (!use_cifg) {
     input_to_input_weights_ptr =
-        reinterpret_cast<int8_t*>(input_to_input_weights->data.uint8);
+        GetInt8DataPtr(input_to_input_weights, is_uint8_hybrid);
     recurrent_to_input_weights_ptr =
-        reinterpret_cast<int8_t*>(recurrent_to_input_weights->data.uint8);
+        GetInt8DataPtr(recurrent_to_input_weights, is_uint8_hybrid);
     input_gate_bias_ptr = input_gate_bias->data.f;
     input_to_input_weights_scale = input_to_input_weights->params.scale;
     recurrent_to_input_weights_scale = recurrent_to_input_weights->params.scale;
@@ -943,21 +1161,31 @@ TfLiteStatus EvalHybrid(
   if (use_peephole) {
     if (!use_cifg) {
       cell_to_input_weights_ptr =
-          reinterpret_cast<int8_t*>(cell_to_input_weights->data.uint8);
+          GetInt8DataPtr(cell_to_input_weights, is_uint8_hybrid);
       cell_to_input_weights_scale = cell_to_input_weights->params.scale;
     }
     cell_to_forget_weights_ptr =
-        reinterpret_cast<int8_t*>(cell_to_forget_weights->data.uint8);
+        GetInt8DataPtr(cell_to_forget_weights, is_uint8_hybrid);
     cell_to_output_weights_ptr =
-        reinterpret_cast<int8_t*>(cell_to_output_weights->data.uint8);
+        GetInt8DataPtr(cell_to_output_weights, is_uint8_hybrid);
     cell_to_forget_weights_scale = cell_to_forget_weights->params.scale;
     cell_to_output_weights_scale = cell_to_output_weights->params.scale;
   }
 
+  const float* input_layer_norm_coefficients_ptr =
+      (is_layer_norm_lstm && !use_cifg) ? input_layer_norm_coefficients->data.f
+                                        : nullptr;
+  const float* forget_layer_norm_coefficients_ptr =
+      is_layer_norm_lstm ? forget_layer_norm_coefficients->data.f : nullptr;
+  const float* cell_layer_norm_coefficients_ptr =
+      is_layer_norm_lstm ? cell_layer_norm_coefficients->data.f : nullptr;
+  const float* output_layer_norm_coefficients_ptr =
+      is_layer_norm_lstm ? output_layer_norm_coefficients->data.f : nullptr;
+
   const int8_t* projection_weights_ptr =
       (projection_weights == nullptr)
           ? nullptr
-          : reinterpret_cast<int8_t*>(projection_weights->data.uint8);
+          : GetInt8DataPtr(projection_weights, is_uint8_hybrid);
   const float projection_weights_scale =
       (projection_weights == nullptr) ? 1.0f : projection_weights->params.scale;
   const float* projection_bias_ptr =
@@ -965,46 +1193,43 @@ TfLiteStatus EvalHybrid(
 
   // Required tensors, pointers are non-null.
   const int8_t* input_to_forget_weights_ptr =
-      reinterpret_cast<int8_t*>(input_to_forget_weights->data.uint8);
+      GetInt8DataPtr(input_to_forget_weights, is_uint8_hybrid);
   const float input_to_forget_weights_scale =
       input_to_forget_weights->params.scale;
   const int8_t* input_to_cell_weights_ptr =
-      reinterpret_cast<int8_t*>(input_to_cell_weights->data.uint8);
+      GetInt8DataPtr(input_to_cell_weights, is_uint8_hybrid);
   const float input_to_cell_weights_scale = input_to_cell_weights->params.scale;
   const int8_t* input_to_output_weights_ptr =
-      reinterpret_cast<int8_t*>(input_to_output_weights->data.uint8);
+      GetInt8DataPtr(input_to_output_weights, is_uint8_hybrid);
   const float input_to_output_weights_scale =
       input_to_output_weights->params.scale;
   const int8_t* recurrent_to_forget_weights_ptr =
-      reinterpret_cast<int8_t*>(recurrent_to_forget_weights->data.uint8);
+      GetInt8DataPtr(recurrent_to_forget_weights, is_uint8_hybrid);
   const float recurrent_to_forget_weights_scale =
       recurrent_to_forget_weights->params.scale;
   const int8_t* recurrent_to_cell_weights_ptr =
-      reinterpret_cast<int8_t*>(recurrent_to_cell_weights->data.uint8);
+      GetInt8DataPtr(recurrent_to_cell_weights, is_uint8_hybrid);
   const float recurrent_to_cell_weights_scale =
       recurrent_to_cell_weights->params.scale;
   const int8_t* recurrent_to_output_weights_ptr =
-      reinterpret_cast<int8_t*>(recurrent_to_output_weights->data.uint8);
+      GetInt8DataPtr(recurrent_to_output_weights, is_uint8_hybrid);
   const float recurrent_to_output_weights_scale =
       recurrent_to_output_weights->params.scale;
   const float* forget_gate_bias_ptr = forget_gate_bias->data.f;
   const float* cell_bias_ptr = cell_bias->data.f;
   const float* output_gate_bias_ptr = output_gate_bias->data.f;
 
-  float* output_state_ptr = output_state->data.f;
-  float* cell_state_ptr = cell_state->data.f;
-
   // Temporary storage for quantized values and scaling factors.
   int8_t* quantized_input_ptr =
-      reinterpret_cast<int8_t*>(input_quantized->data.uint8);
+      GetInt8DataPtr(input_quantized, is_uint8_hybrid);
   int8_t* quantized_aux_input_ptr =
       (aux_input_quantized == nullptr)
           ? nullptr
-          : reinterpret_cast<int8_t*>(aux_input_quantized->data.uint8);
+          : GetInt8DataPtr(aux_input_quantized, is_uint8_hybrid);
   int8_t* quantized_output_state_ptr =
-      reinterpret_cast<int8_t*>(output_state_quantized->data.uint8);
+      GetInt8DataPtr(output_state_quantized, is_uint8_hybrid);
   int8_t* quantized_cell_state_ptr =
-      reinterpret_cast<int8_t*>(cell_state_quantized->data.uint8);
+      GetInt8DataPtr(cell_state_quantized, is_uint8_hybrid);
   float* scaling_factors_ptr = scaling_factors->data.f;
   float* prod_scaling_factors_ptr = prod_scaling_factors->data.f;
   float* recovered_cell_weights_ptr = recovered_cell_weights->data.f;
@@ -1022,14 +1247,14 @@ TfLiteStatus EvalHybrid(
   if (aux_input_size > 0) {
     if (!use_cifg) {
       aux_input_to_input_weights_ptr =
-          reinterpret_cast<int8_t*>(aux_input_to_input_weights->data.uint8);
+          GetInt8DataPtr(aux_input_to_input_weights, is_uint8_hybrid);
     }
     aux_input_to_forget_weights_ptr =
-        reinterpret_cast<int8_t*>(aux_input_to_forget_weights->data.uint8);
+        GetInt8DataPtr(aux_input_to_forget_weights, is_uint8_hybrid);
     aux_input_to_cell_weights_ptr =
-        reinterpret_cast<int8_t*>(aux_input_to_cell_weights->data.uint8);
+        GetInt8DataPtr(aux_input_to_cell_weights, is_uint8_hybrid);
     aux_input_to_output_weights_ptr =
-        reinterpret_cast<int8_t*>(aux_input_to_output_weights->data.uint8);
+        GetInt8DataPtr(aux_input_to_output_weights, is_uint8_hybrid);
     if (!use_cifg) {
       aux_input_to_input_weights_scale =
           aux_input_to_input_weights->params.scale;
@@ -1051,38 +1276,42 @@ TfLiteStatus EvalHybrid(
       // If this is the forward_sequence, step forward, otherwise step
       // backwards.
       const int t_rel = forward_sequence ? t : max_time - t - 1;
-      const float* input_ptr = input->data.f + t_rel * input_step;
+      const float* input_ptr_batch = input->data.f + t_rel * input_step;
       if (aux_input) {
         aux_input_ptr = aux_input->data.f + t_rel * input_step;
       }
-      float* output_ptr = output->data.f + t_rel * output_step + output_offset;
+      float* output_ptr_batch =
+          output->data.f + t_rel * output_step + output_offset;
 
       LstmStepWithAuxInput(
-          input_ptr, input_to_input_weights_ptr, input_to_input_weights_scale,
-          input_to_forget_weights_ptr, input_to_forget_weights_scale,
-          input_to_cell_weights_ptr, input_to_cell_weights_scale,
-          input_to_output_weights_ptr, input_to_output_weights_scale,
-          aux_input_ptr, aux_input_to_input_weights_ptr,
-          aux_input_to_input_weights_scale, aux_input_to_forget_weights_ptr,
-          aux_input_to_forget_weights_scale, aux_input_to_cell_weights_ptr,
-          aux_input_to_cell_weights_scale, aux_input_to_output_weights_ptr,
-          aux_input_to_output_weights_scale, recurrent_to_input_weights_ptr,
-          recurrent_to_input_weights_scale, recurrent_to_forget_weights_ptr,
-          recurrent_to_forget_weights_scale, recurrent_to_cell_weights_ptr,
-          recurrent_to_cell_weights_scale, recurrent_to_output_weights_ptr,
-          recurrent_to_output_weights_scale, cell_to_input_weights_ptr,
-          cell_to_input_weights_scale, cell_to_forget_weights_ptr,
-          cell_to_forget_weights_scale, cell_to_output_weights_ptr,
-          cell_to_output_weights_scale, input_gate_bias_ptr,
-          forget_gate_bias_ptr, cell_bias_ptr, output_gate_bias_ptr,
-          projection_weights_ptr, projection_weights_scale, projection_bias_ptr,
-          params, n_batch, n_cell, n_input, aux_input_size, n_output,
-          output_batch_leading_dim, input_gate_scratch, forget_gate_scratch,
-          cell_scratch, output_gate_scratch, scaling_factors_ptr,
-          prod_scaling_factors_ptr, recovered_cell_weights_ptr,
-          quantized_input_ptr, quantized_aux_input_ptr,
-          quantized_output_state_ptr, quantized_cell_state_ptr,
-          output_state_ptr, cell_state_ptr, output_ptr);
+          input_ptr_batch, input_to_input_weights_ptr,
+          input_to_input_weights_scale, input_to_forget_weights_ptr,
+          input_to_forget_weights_scale, input_to_cell_weights_ptr,
+          input_to_cell_weights_scale, input_to_output_weights_ptr,
+          input_to_output_weights_scale, aux_input_ptr,
+          aux_input_to_input_weights_ptr, aux_input_to_input_weights_scale,
+          aux_input_to_forget_weights_ptr, aux_input_to_forget_weights_scale,
+          aux_input_to_cell_weights_ptr, aux_input_to_cell_weights_scale,
+          aux_input_to_output_weights_ptr, aux_input_to_output_weights_scale,
+          recurrent_to_input_weights_ptr, recurrent_to_input_weights_scale,
+          recurrent_to_forget_weights_ptr, recurrent_to_forget_weights_scale,
+          recurrent_to_cell_weights_ptr, recurrent_to_cell_weights_scale,
+          recurrent_to_output_weights_ptr, recurrent_to_output_weights_scale,
+          cell_to_input_weights_ptr, cell_to_input_weights_scale,
+          cell_to_forget_weights_ptr, cell_to_forget_weights_scale,
+          cell_to_output_weights_ptr, cell_to_output_weights_scale,
+          input_layer_norm_coefficients_ptr, forget_layer_norm_coefficients_ptr,
+          cell_layer_norm_coefficients_ptr, output_layer_norm_coefficients_ptr,
+          input_gate_bias_ptr, forget_gate_bias_ptr, cell_bias_ptr,
+          output_gate_bias_ptr, projection_weights_ptr,
+          projection_weights_scale, projection_bias_ptr, params, n_batch,
+          n_cell, n_input, aux_input_size, n_output, output_batch_leading_dim,
+          input_gate_scratch, forget_gate_scratch, cell_scratch,
+          output_gate_scratch, scaling_factors_ptr, prod_scaling_factors_ptr,
+          recovered_cell_weights_ptr, quantized_input_ptr,
+          quantized_aux_input_ptr, quantized_output_state_ptr,
+          quantized_cell_state_ptr, output_state->data.f, cell_state->data.f,
+          output_ptr_batch);
     }
   } else {
     for (int b = 0; b < n_batch; b++) {
@@ -1092,12 +1321,24 @@ TfLiteStatus EvalHybrid(
         // If this is the forward_sequence, step forward, otherwise step
         // backwards.
         const int t_rel = forward_sequence ? t : max_time - t - 1;
-        const float* input_ptr = input->data.f + t_rel * input_step;
+        const int time_offset = b * max_time + t_rel;
+        const float* input_ptr = input->data.f + time_offset * input_step;
         if (aux_input) {
-          aux_input_ptr = aux_input->data.f + t_rel * input_step;
+          aux_input_ptr = aux_input->data.f + time_offset * input_step;
         }
         float* output_ptr =
-            output->data.f + t_rel * output_step + output_offset;
+            output->data.f + time_offset * output_step + output_offset;
+
+        // Offset the {output,cell}_state pointers to the right batch.
+        float* output_state_ptr =
+            output_state->data.f + b * output_batch_leading_dim;
+        float* cell_state_ptr = cell_state->data.f + b * n_cell;
+        // Offset the scratch pointers to the right batch.
+        float* input_gate_scratch_ptr =
+            input_gate_scratch ? input_gate_scratch + b * n_cell : nullptr;
+        float* forget_gate_scratch_ptr = forget_gate_scratch + b * n_cell;
+        float* cell_scratch_ptr = cell_scratch + b * n_cell;
+        float* output_gate_scratch_ptr = output_gate_scratch + b * n_cell;
 
         LstmStepWithAuxInput(
             input_ptr, input_to_input_weights_ptr, input_to_input_weights_scale,
@@ -1115,13 +1356,17 @@ TfLiteStatus EvalHybrid(
             recurrent_to_output_weights_scale, cell_to_input_weights_ptr,
             cell_to_input_weights_scale, cell_to_forget_weights_ptr,
             cell_to_forget_weights_scale, cell_to_output_weights_ptr,
-            cell_to_output_weights_scale, input_gate_bias_ptr,
+            cell_to_output_weights_scale, input_layer_norm_coefficients_ptr,
+            forget_layer_norm_coefficients_ptr,
+            cell_layer_norm_coefficients_ptr,
+            output_layer_norm_coefficients_ptr, input_gate_bias_ptr,
             forget_gate_bias_ptr, cell_bias_ptr, output_gate_bias_ptr,
             projection_weights_ptr, projection_weights_scale,
-            projection_bias_ptr, params, n_batch, n_cell, n_input,
-            aux_input_size, n_output, output_batch_leading_dim,
-            input_gate_scratch, forget_gate_scratch, cell_scratch,
-            output_gate_scratch, scaling_factors_ptr, prod_scaling_factors_ptr,
+            projection_bias_ptr, params,
+            /*n_batch=*/1, n_cell, n_input, aux_input_size, n_output,
+            output_batch_leading_dim, input_gate_scratch_ptr,
+            forget_gate_scratch_ptr, cell_scratch_ptr, output_gate_scratch_ptr,
+            scaling_factors_ptr, prod_scaling_factors_ptr,
             recovered_cell_weights_ptr, quantized_input_ptr,
             quantized_aux_input_ptr, quantized_output_state_ptr,
             quantized_cell_state_ptr, output_state_ptr, cell_state_ptr,
diff --git a/tensorflow/lite/kernels/lstm_eval.h b/tensorflow/lite/kernels/lstm_eval.h
index c8a4d284f3c431e88fd0d52c98807161de14cba9..33e5bc0781925d714ec519b4548601db238e1e03 100644
--- a/tensorflow/lite/kernels/lstm_eval.h
+++ b/tensorflow/lite/kernels/lstm_eval.h
@@ -34,7 +34,12 @@ TfLiteStatus EvalFloat(
     const TfLiteTensor* recurrent_to_output_weights,
     const TfLiteTensor* cell_to_input_weights,
     const TfLiteTensor* cell_to_forget_weights,
-    const TfLiteTensor* cell_to_output_weights, const TfLiteTensor* aux_input,
+    const TfLiteTensor* cell_to_output_weights,
+    const TfLiteTensor* input_layer_norm_coefficients,
+    const TfLiteTensor* forget_layer_norm_coefficients,
+    const TfLiteTensor* cell_layer_norm_coefficients,
+    const TfLiteTensor* output_layer_norm_coefficients,
+    const TfLiteTensor* aux_input,
     const TfLiteTensor* aux_input_to_input_weights,
     const TfLiteTensor* aux_input_to_forget_weights,
     const TfLiteTensor* aux_input_to_cell_weights,
@@ -58,7 +63,12 @@ TfLiteStatus EvalHybrid(
     const TfLiteTensor* recurrent_to_output_weights,
     const TfLiteTensor* cell_to_input_weights,
     const TfLiteTensor* cell_to_forget_weights,
-    const TfLiteTensor* cell_to_output_weights, const TfLiteTensor* aux_input,
+    const TfLiteTensor* cell_to_output_weights,
+    const TfLiteTensor* input_layer_norm_coefficients,
+    const TfLiteTensor* forget_layer_norm_coefficients,
+    const TfLiteTensor* cell_layer_norm_coefficients,
+    const TfLiteTensor* output_layer_norm_coefficients,
+    const TfLiteTensor* aux_input,
     const TfLiteTensor* aux_input_to_input_weights,
     const TfLiteTensor* aux_input_to_forget_weights,
     const TfLiteTensor* aux_input_to_cell_weights,
diff --git a/tensorflow/lite/kernels/lstm_test.cc b/tensorflow/lite/kernels/lstm_test.cc
index 03ad2e899d29b17d430bf51721e9b8b75cdb79d4..40ee94888136207eddcb38577377027c718a0a58 100644
--- a/tensorflow/lite/kernels/lstm_test.cc
+++ b/tensorflow/lite/kernels/lstm_test.cc
@@ -38,7 +38,8 @@ class LSTMOpModel : public SingleOpModel {
               bool use_peephole, bool use_projection_weights,
               bool use_projection_bias, float cell_clip, float proj_clip,
               const std::vector<std::vector<int>>& input_shapes,
-              const TensorType& weight_type = TensorType_FLOAT32)
+              const TensorType& weight_type = TensorType_FLOAT32,
+              bool is_layer_norm = false)
       : n_batch_(n_batch),
         n_input_(n_input),
         n_cell_(n_cell),
@@ -106,6 +107,18 @@ class LSTMOpModel : public SingleOpModel {
     input_cell_state_ =
         AddInput(TensorData{TensorType_FLOAT32, {n_cell_ * n_batch_}}, true);
 
+    // Layer norm weights.
+    if (is_layer_norm) {
+      if (use_cifg) {
+        input_layer_norm_coefficients_ = AddNullInput();
+      } else {
+        input_layer_norm_coefficients_ = AddInput(TensorType_FLOAT32);
+      }
+      forget_layer_norm_coefficients_ = AddInput(TensorType_FLOAT32);
+      cell_layer_norm_coefficients_ = AddInput(TensorType_FLOAT32);
+      output_layer_norm_coefficients_ = AddInput(TensorType_FLOAT32);
+    }
+
     output_ = AddOutput(TensorType_FLOAT32);
 
     SetBuiltinOp(BuiltinOperator_LSTM, BuiltinOptions_LSTMOptions,
@@ -116,69 +129,87 @@ class LSTMOpModel : public SingleOpModel {
     BuildInterpreter(input_shapes);
   }
 
-  void SetInputToInputWeights(std::vector<float> f) {
+  void SetInputToInputWeights(const std::vector<float>& f) {
     PopulateTensor(input_to_input_weights_, f);
   }
 
-  void SetInputToForgetWeights(std::vector<float> f) {
+  void SetInputToForgetWeights(const std::vector<float>& f) {
     PopulateTensor(input_to_forget_weights_, f);
   }
 
-  void SetInputToCellWeights(std::vector<float> f) {
+  void SetInputToCellWeights(const std::vector<float>& f) {
     PopulateTensor(input_to_cell_weights_, f);
   }
 
-  void SetInputToOutputWeights(std::vector<float> f) {
+  void SetInputToOutputWeights(const std::vector<float>& f) {
     PopulateTensor(input_to_output_weights_, f);
   }
 
-  void SetRecurrentToInputWeights(std::vector<float> f) {
+  void SetRecurrentToInputWeights(const std::vector<float>& f) {
     PopulateTensor(recurrent_to_input_weights_, f);
   }
 
-  void SetRecurrentToForgetWeights(std::vector<float> f) {
+  void SetRecurrentToForgetWeights(const std::vector<float>& f) {
     PopulateTensor(recurrent_to_forget_weights_, f);
   }
 
-  void SetRecurrentToCellWeights(std::vector<float> f) {
+  void SetRecurrentToCellWeights(const std::vector<float>& f) {
     PopulateTensor(recurrent_to_cell_weights_, f);
   }
 
-  void SetRecurrentToOutputWeights(std::vector<float> f) {
+  void SetRecurrentToOutputWeights(const std::vector<float>& f) {
     PopulateTensor(recurrent_to_output_weights_, f);
   }
 
-  void SetCellToInputWeights(std::vector<float> f) {
+  void SetCellToInputWeights(const std::vector<float>& f) {
     PopulateTensor(cell_to_input_weights_, f);
   }
 
-  void SetCellToForgetWeights(std::vector<float> f) {
+  void SetCellToForgetWeights(const std::vector<float>& f) {
     PopulateTensor(cell_to_forget_weights_, f);
   }
 
-  void SetCellToOutputWeights(std::vector<float> f) {
+  void SetCellToOutputWeights(const std::vector<float>& f) {
     PopulateTensor(cell_to_output_weights_, f);
   }
 
-  void SetInputGateBias(std::vector<float> f) {
+  void SetInputLayerNormCoefficients(const std::vector<float>& f) {
+    PopulateTensor(input_layer_norm_coefficients_, f);
+  }
+
+  void SetForgetLayerNormCoefficients(const std::vector<float>& f) {
+    PopulateTensor(forget_layer_norm_coefficients_, f);
+  }
+
+  void SetCellLayerNormCoefficients(const std::vector<float>& f) {
+    PopulateTensor(cell_layer_norm_coefficients_, f);
+  }
+
+  void SetOutputLayerNormCoefficients(const std::vector<float>& f) {
+    PopulateTensor(output_layer_norm_coefficients_, f);
+  }
+
+  void SetInputGateBias(const std::vector<float>& f) {
     PopulateTensor(input_gate_bias_, f);
   }
 
-  void SetForgetGateBias(std::vector<float> f) {
+  void SetForgetGateBias(const std::vector<float>& f) {
     PopulateTensor(forget_gate_bias_, f);
   }
 
-  void SetCellBias(std::vector<float> f) { PopulateTensor(cell_bias_, f); }
+  void SetCellBias(const std::vector<float>& f) {
+    PopulateTensor(cell_bias_, f);
+  }
 
-  void SetOutputGateBias(std::vector<float> f) {
+  void SetOutputGateBias(const std::vector<float>& f) {
     PopulateTensor(output_gate_bias_, f);
   }
 
-  void SetProjectionWeights(std::vector<float> f) {
+  void SetProjectionWeights(const std::vector<float>& f) {
     PopulateTensor(projection_weights_, f);
   }
 
-  void SetProjectionBias(std::vector<float> f) {
+  void SetProjectionBias(const std::vector<float>& f) {
     PopulateTensor(projection_bias_, f);
   }
 
@@ -210,6 +241,11 @@ class LSTMOpModel : public SingleOpModel {
   int cell_to_forget_weights_;
   int cell_to_output_weights_;
 
+  int input_layer_norm_coefficients_;
+  int forget_layer_norm_coefficients_;
+  int cell_layer_norm_coefficients_;
+  int output_layer_norm_coefficients_;
+
   int input_gate_bias_;
   int forget_gate_bias_;
   int cell_bias_;
@@ -236,57 +272,70 @@ class HybridLSTMOpModel : public LSTMOpModel {
                     bool use_cifg, bool use_peephole,
                     bool use_projection_weights, bool use_projection_bias,
                     float cell_clip, float proj_clip,
-                    const std::vector<std::vector<int>>& input_shapes)
+                    const std::vector<std::vector<int>>& input_shapes,
+                    TensorType tensor_type)
       : LSTMOpModel(n_batch, n_input, n_cell, n_output, use_cifg, use_peephole,
                     use_projection_weights, use_projection_bias, cell_clip,
-                    proj_clip, input_shapes, TensorType_UINT8) {}
+                    proj_clip, input_shapes, tensor_type) {
+    tensor_type_ = tensor_type;
+  }
 
-  void SetInputToInputWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(input_to_input_weights_, f);
+  TensorType tensor_type_;
+
+  void SetWeights(int weights_idx, const std::vector<float>& f) {
+    if (tensor_type_ == TensorType_UINT8) {
+      SymmetricQuantizeAndPopulate(weights_idx, f);
+    } else {
+      SignedSymmetricQuantizeAndPopulate(weights_idx, f);
+    }
+  }
+
+  void SetInputToInputWeights(const std::vector<float>& f) {
+    SetWeights(input_to_input_weights_, f);
   }
 
-  void SetInputToForgetWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(input_to_forget_weights_, f);
+  void SetInputToForgetWeights(const std::vector<float>& f) {
+    SetWeights(input_to_forget_weights_, f);
   }
 
-  void SetInputToCellWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(input_to_cell_weights_, f);
+  void SetInputToCellWeights(const std::vector<float>& f) {
+    SetWeights(input_to_cell_weights_, f);
   }
 
-  void SetInputToOutputWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(input_to_output_weights_, f);
+  void SetInputToOutputWeights(const std::vector<float>& f) {
+    SetWeights(input_to_output_weights_, f);
   }
 
-  void SetRecurrentToInputWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(recurrent_to_input_weights_, f);
+  void SetRecurrentToInputWeights(const std::vector<float>& f) {
+    SetWeights(recurrent_to_input_weights_, f);
   }
 
-  void SetRecurrentToForgetWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(recurrent_to_forget_weights_, f);
+  void SetRecurrentToForgetWeights(const std::vector<float>& f) {
+    SetWeights(recurrent_to_forget_weights_, f);
   }
 
-  void SetRecurrentToCellWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(recurrent_to_cell_weights_, f);
+  void SetRecurrentToCellWeights(const std::vector<float>& f) {
+    SetWeights(recurrent_to_cell_weights_, f);
   }
 
-  void SetRecurrentToOutputWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(recurrent_to_output_weights_, f);
+  void SetRecurrentToOutputWeights(const std::vector<float>& f) {
+    SetWeights(recurrent_to_output_weights_, f);
   }
 
-  void SetCellToInputWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(cell_to_input_weights_, f);
+  void SetCellToInputWeights(const std::vector<float>& f) {
+    SetWeights(cell_to_input_weights_, f);
   }
 
-  void SetCellToForgetWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(cell_to_forget_weights_, f);
+  void SetCellToForgetWeights(const std::vector<float>& f) {
+    SetWeights(cell_to_forget_weights_, f);
   }
 
-  void SetCellToOutputWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(cell_to_output_weights_, f);
+  void SetCellToOutputWeights(const std::vector<float>& f) {
+    SetWeights(cell_to_output_weights_, f);
   }
 
-  void SetProjectionWeights(std::vector<float> f) {
-    SymmetricQuantizeAndPopulate(projection_weights_, f);
+  void SetProjectionWeights(const std::vector<float>& f) {
+    SetWeights(projection_weights_, f);
   }
 };
 
@@ -453,7 +502,8 @@ TEST_F(NoCifgNoPeepholeNoProjectionNoClippingLstmTest, LstmBlackBoxTest) {
   VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm);
 }
 
-TEST_F(NoCifgNoPeepholeNoProjectionNoClippingLstmTest, HybridLstmBlackBoxTest) {
+TEST_F(NoCifgNoPeepholeNoProjectionNoClippingLstmTest,
+       HybridLstmBlackBoxTestUint8) {
   const int n_batch = 1;
   const int n_input = 2;
   // n_cell and n_output have the same size when there is no projection.
@@ -489,7 +539,67 @@ TEST_F(NoCifgNoPeepholeNoProjectionNoClippingLstmTest, HybridLstmBlackBoxTest) {
 
           {0, 0},  // projection_weight tensor
           {0},     // projection_bias tensor
-      });
+      },
+      TensorType_UINT8);
+
+  lstm.SetInputToInputWeights(input_to_input_weights_);
+  lstm.SetInputToCellWeights(input_to_cell_weights_);
+  lstm.SetInputToForgetWeights(input_to_forget_weights_);
+  lstm.SetInputToOutputWeights(input_to_output_weights_);
+
+  lstm.SetInputGateBias(input_gate_bias_);
+  lstm.SetCellBias(cell_gate_bias_);
+  lstm.SetForgetGateBias(forget_gate_bias_);
+  lstm.SetOutputGateBias(output_gate_bias_);
+
+  lstm.SetRecurrentToInputWeights(recurrent_to_input_weights_);
+  lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
+  lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
+  lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
+
+  VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm,
+                /*tolerance=*/0.0157651);
+}
+
+TEST_F(NoCifgNoPeepholeNoProjectionNoClippingLstmTest,
+       HybridLstmBlackBoxTestInt8) {
+  const int n_batch = 1;
+  const int n_input = 2;
+  // n_cell and n_output have the same size when there is no projection.
+  const int n_cell = 4;
+  const int n_output = 4;
+
+  HybridLSTMOpModel lstm(
+      n_batch, n_input, n_cell, n_output,
+      /*use_cifg=*/false, /*use_peephole=*/false,
+      /*use_projection_weights=*/false,
+      /*use_projection_bias=*/false, /*cell_clip=*/0.0, /*proj_clip=*/0.0,
+      {
+          {n_batch, n_input},  // input tensor
+
+          {n_cell, n_input},  // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {n_cell, n_output},  // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {0},  // cell_to_input_weight tensor
+          {0},  // cell_to_forget_weight tensor
+          {0},  // cell_to_output_weight tensor
+
+          {n_cell},  // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {0, 0},  // projection_weight tensor
+          {0},     // projection_bias tensor
+      },
+      TensorType_INT8);
 
   lstm.SetInputToInputWeights(input_to_input_weights_);
   lstm.SetInputToCellWeights(input_to_cell_weights_);
@@ -613,7 +723,8 @@ TEST_F(CifgNoPeepholeNoProjectionNoClippingLstmTest, LstmBlackBoxTest) {
   VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm);
 }
 
-TEST_F(CifgNoPeepholeNoProjectionNoClippingLstmTest, HybridLstmBlackBoxTest) {
+TEST_F(CifgNoPeepholeNoProjectionNoClippingLstmTest,
+       HybridLstmBlackBoxTestUint8) {
   const int n_batch = 1;
   const int n_input = 2;
   // n_cell and n_output have the same size when there is no projection.
@@ -650,7 +761,67 @@ TEST_F(CifgNoPeepholeNoProjectionNoClippingLstmTest, HybridLstmBlackBoxTest) {
 
           {0, 0},  // projection_weight tensor
           {0},     // projection_bias tensor
-      });
+      },
+      TensorType_UINT8);
+
+  lstm.SetInputToCellWeights(input_to_cell_weights_);
+  lstm.SetInputToForgetWeights(input_to_forget_weights_);
+  lstm.SetInputToOutputWeights(input_to_output_weights_);
+
+  lstm.SetCellBias(cell_gate_bias_);
+  lstm.SetForgetGateBias(forget_gate_bias_);
+  lstm.SetOutputGateBias(output_gate_bias_);
+
+  lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
+  lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
+  lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
+
+  lstm.SetCellToForgetWeights(cell_to_forget_weights_);
+  lstm.SetCellToOutputWeights(cell_to_output_weights_);
+
+  VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm, /*tolerance=*/0.03573);
+}
+
+TEST_F(CifgNoPeepholeNoProjectionNoClippingLstmTest,
+       HybridLstmBlackBoxTestInt8) {
+  const int n_batch = 1;
+  const int n_input = 2;
+  // n_cell and n_output have the same size when there is no projection.
+  const int n_cell = 4;
+  const int n_output = 4;
+
+  HybridLSTMOpModel lstm(
+      n_batch, n_input, n_cell, n_output,
+      /*use_cifg=*/true, /*use_peephole=*/true,
+      /*use_projection_weights=*/false,
+      /*use_projection_bias=*/false,
+      /*cell_clip=*/0.0, /*proj_clip=*/0.0,
+      {
+          {n_batch, n_input},  // input tensor
+
+          {0, 0},             // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {0, 0},              // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {0},       // cell_to_input_weight tensor
+          {n_cell},  // cell_to_forget_weight tensor
+          {n_cell},  // cell_to_output_weight tensor
+
+          {0},       // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {0, 0},  // projection_weight tensor
+          {0},     // projection_bias tensor
+      },
+      TensorType_INT8);
 
   lstm.SetInputToCellWeights(input_to_cell_weights_);
   lstm.SetInputToForgetWeights(input_to_forget_weights_);
@@ -1330,7 +1501,7 @@ TEST_F(NoCifgPeepholeProjectionNoClippingLstmTest, LstmBlackBoxTest) {
   VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm);
 }
 
-TEST_F(NoCifgPeepholeProjectionNoClippingLstmTest, HybridLstmBlackBoxTest) {
+TEST_F(NoCifgPeepholeProjectionNoClippingLstmTest, HybridLstmBlackBoxTesInt8) {
   const int n_batch = 2;
   const int n_input = 5;
   const int n_cell = 20;
@@ -1366,7 +1537,72 @@ TEST_F(NoCifgPeepholeProjectionNoClippingLstmTest, HybridLstmBlackBoxTest) {
 
           {n_output, n_cell},  // projection_weight tensor
           {0},                 // projection_bias tensor
-      });
+      },
+      TensorType_UINT8);
+
+  lstm.SetInputToInputWeights(input_to_input_weights_);
+  lstm.SetInputToCellWeights(input_to_cell_weights_);
+  lstm.SetInputToForgetWeights(input_to_forget_weights_);
+  lstm.SetInputToOutputWeights(input_to_output_weights_);
+
+  lstm.SetInputGateBias(input_gate_bias_);
+  lstm.SetCellBias(cell_gate_bias_);
+  lstm.SetForgetGateBias(forget_gate_bias_);
+  lstm.SetOutputGateBias(output_gate_bias_);
+
+  lstm.SetRecurrentToInputWeights(recurrent_to_input_weights_);
+  lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
+  lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
+  lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
+
+  lstm.SetCellToInputWeights(cell_to_input_weights_);
+  lstm.SetCellToForgetWeights(cell_to_forget_weights_);
+  lstm.SetCellToOutputWeights(cell_to_output_weights_);
+
+  lstm.SetProjectionWeights(projection_weights_);
+
+  VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm, /*tolerance=*/0.00467);
+}
+
+TEST_F(NoCifgPeepholeProjectionNoClippingLstmTest,
+       HybridLstmBlackBoxTestUint8) {
+  const int n_batch = 2;
+  const int n_input = 5;
+  const int n_cell = 20;
+  const int n_output = 16;
+
+  HybridLSTMOpModel lstm(
+      n_batch, n_input, n_cell, n_output,
+      /*use_cifg=*/false, /*use_peephole=*/true,
+      /*use_projection_weights=*/true,
+      /*use_projection_bias=*/false,
+      /*cell_clip=*/0.0, /*proj_clip=*/0.0,
+      {
+          {n_batch, n_input},  // input tensor
+
+          {n_cell, n_input},  // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {n_cell, n_output},  // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {n_cell},  // cell_to_input_weight tensor
+          {n_cell},  // cell_to_forget_weight tensor
+          {n_cell},  // cell_to_output_weight tensor
+
+          {n_cell},  // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {n_output, n_cell},  // projection_weight tensor
+          {0},                 // projection_bias tensor
+      },
+      TensorType_INT8);
 
   lstm.SetInputToInputWeights(input_to_input_weights_);
   lstm.SetInputToCellWeights(input_to_cell_weights_);
@@ -1392,6 +1628,845 @@ TEST_F(NoCifgPeepholeProjectionNoClippingLstmTest, HybridLstmBlackBoxTest) {
   VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm, /*tolerance=*/0.00467);
 }
 
+class LayerNormLSTMOpModel : public LSTMOpModel {
+ public:
+  LayerNormLSTMOpModel(int n_batch, int n_input, int n_cell, int n_output,
+                       bool use_cifg, bool use_peephole,
+                       bool use_projection_weights, bool use_projection_bias,
+                       float cell_clip, float proj_clip,
+                       const std::vector<std::vector<int>>& input_shapes,
+                       const TensorType& weight_type = TensorType_FLOAT32)
+      : LSTMOpModel(n_batch, n_input, n_cell, n_output, use_cifg, use_peephole,
+                    use_projection_weights, use_projection_bias, cell_clip,
+                    proj_clip, input_shapes, weight_type,
+                    /*is_layer_norm*/ true) {}
+};
+
+class HybridLayerNormLSTMOpModel : public LayerNormLSTMOpModel {
+ public:
+  HybridLayerNormLSTMOpModel(int n_batch, int n_input, int n_cell, int n_output,
+                             bool use_cifg, bool use_peephole,
+                             bool use_projection_weights,
+                             bool use_projection_bias, float cell_clip,
+                             float proj_clip,
+                             const std::vector<std::vector<int>>& input_shapes,
+                             TensorType tensor_type)
+      : LayerNormLSTMOpModel(n_batch, n_input, n_cell, n_output, use_cifg,
+                             use_peephole, use_projection_weights,
+                             use_projection_bias, cell_clip, proj_clip,
+                             input_shapes, tensor_type) {
+    tensor_type_ = tensor_type;
+  }
+
+  TensorType tensor_type_;
+
+  void SetWeights(int weights_idx, const std::vector<float>& f) {
+    if (tensor_type_ == TensorType_UINT8) {
+      SymmetricQuantizeAndPopulate(weights_idx, f);
+    } else {
+      SignedSymmetricQuantizeAndPopulate(weights_idx, f);
+    }
+  }
+
+  void SetInputToInputWeights(const std::vector<float>& f) {
+    SetWeights(input_to_input_weights_, f);
+  }
+
+  void SetInputToForgetWeights(const std::vector<float>& f) {
+    SetWeights(input_to_forget_weights_, f);
+  }
+
+  void SetInputToCellWeights(const std::vector<float>& f) {
+    SetWeights(input_to_cell_weights_, f);
+  }
+
+  void SetInputToOutputWeights(const std::vector<float>& f) {
+    SetWeights(input_to_output_weights_, f);
+  }
+
+  void SetRecurrentToInputWeights(const std::vector<float>& f) {
+    SetWeights(recurrent_to_input_weights_, f);
+  }
+
+  void SetRecurrentToForgetWeights(const std::vector<float>& f) {
+    SetWeights(recurrent_to_forget_weights_, f);
+  }
+
+  void SetRecurrentToCellWeights(const std::vector<float>& f) {
+    SetWeights(recurrent_to_cell_weights_, f);
+  }
+
+  void SetRecurrentToOutputWeights(const std::vector<float>& f) {
+    SetWeights(recurrent_to_output_weights_, f);
+  }
+
+  void SetCellToInputWeights(const std::vector<float>& f) {
+    SetWeights(cell_to_input_weights_, f);
+  }
+
+  void SetCellToForgetWeights(const std::vector<float>& f) {
+    SetWeights(cell_to_forget_weights_, f);
+  }
+
+  void SetCellToOutputWeights(const std::vector<float>& f) {
+    SetWeights(cell_to_output_weights_, f);
+  }
+
+  void SetInputLayerNormCoefficients(const std::vector<float>& f) {
+    PopulateTensor(input_layer_norm_coefficients_, f);
+  }
+
+  void SetForgetLayerNormCoefficients(const std::vector<float>& f) {
+    PopulateTensor(forget_layer_norm_coefficients_, f);
+  }
+
+  void SetCellLayerNormCoefficients(const std::vector<float>& f) {
+    PopulateTensor(cell_layer_norm_coefficients_, f);
+  }
+
+  void SetOutputLayerNormCoefficients(const std::vector<float>& f) {
+    PopulateTensor(output_layer_norm_coefficients_, f);
+  }
+
+  void SetProjectionWeights(const std::vector<float>& f) {
+    SetWeights(projection_weights_, f);
+  }
+};
+
+class BaseLayerNormLstmTest : public ::testing::Test {
+ protected:
+  // Weights of the Layer Norm LSTM model. Some are optional.
+  std::vector<float> input_to_input_weights_;
+  std::vector<float> input_to_cell_weights_;
+  std::vector<float> input_to_forget_weights_;
+  std::vector<float> input_to_output_weights_;
+  std::vector<float> input_gate_bias_;
+  std::vector<float> cell_gate_bias_;
+  std::vector<float> forget_gate_bias_;
+  std::vector<float> output_gate_bias_;
+  std::vector<float> recurrent_to_input_weights_;
+  std::vector<float> recurrent_to_cell_weights_;
+  std::vector<float> recurrent_to_forget_weights_;
+  std::vector<float> recurrent_to_output_weights_;
+  std::vector<float> cell_to_input_weights_;
+  std::vector<float> cell_to_forget_weights_;
+  std::vector<float> cell_to_output_weights_;
+  std::vector<float> projection_weights_;
+  std::vector<float> input_layer_norm_coefficients_;
+  std::vector<float> forget_layer_norm_coefficients_;
+  std::vector<float> cell_layer_norm_coefficients_;
+  std::vector<float> output_layer_norm_coefficients_;
+
+  // Layer Norm LSTM input is stored as num_batch x num_inputs vector.
+  std::vector<std::vector<float>> layer_norm_lstm_input_;
+
+  // Compares output up to tolerance to the result of the layer_norm_lstm given
+  // the input.
+  void VerifyGoldens(const std::vector<std::vector<float>>& input,
+                     const std::vector<std::vector<float>>& output,
+                     LayerNormLSTMOpModel* layer_norm_lstm,
+                     float tolerance = 1e-5) {
+    const int num_batches = input.size();
+    EXPECT_GT(num_batches, 0);
+    const int num_inputs = layer_norm_lstm->num_inputs();
+    EXPECT_GT(num_inputs, 0);
+    const int input_sequence_size = input[0].size() / num_inputs;
+    EXPECT_GT(input_sequence_size, 0);
+    for (int i = 0; i < input_sequence_size; ++i) {
+      for (int b = 0; b < num_batches; ++b) {
+        const float* batch_start = input[b].data() + i * num_inputs;
+        const float* batch_end = batch_start + num_inputs;
+
+        layer_norm_lstm->SetInput(b * layer_norm_lstm->num_inputs(),
+                                  batch_start, batch_end);
+      }
+
+      layer_norm_lstm->Invoke();
+
+      const int num_outputs = layer_norm_lstm->num_outputs();
+      std::vector<float> expected;
+      for (int b = 0; b < num_batches; ++b) {
+        const float* golden_start_batch = output[b].data() + i * num_outputs;
+        const float* golden_end_batch = golden_start_batch + num_outputs;
+        expected.insert(expected.end(), golden_start_batch, golden_end_batch);
+      }
+      EXPECT_THAT(layer_norm_lstm->GetOutput(),
+                  ElementsAreArray(ArrayFloatNear(expected, tolerance)));
+    }
+  }
+};
+
+class NoCifgPeepholeProjectionNoClippingLayerNormLstmTest
+    : public BaseLayerNormLstmTest {
+  void SetUp() override {
+    input_to_input_weights_ = {0.5,  0.6,  0.7,  -0.8, -0.9, 0.1,  0.2,
+                               0.3,  -0.4, 0.5,  -0.8, 0.7,  -0.6, 0.5,
+                               -0.4, -0.5, -0.4, -0.3, -0.2, -0.1};
+
+    input_to_forget_weights_ = {-0.6, -0.1, 0.3,  0.2,  0.9,  -0.5, -0.2,
+                                -0.4, 0.3,  -0.8, -0.4, 0.3,  -0.5, -0.4,
+                                -0.6, 0.3,  -0.4, -0.6, -0.5, -0.5};
+
+    input_to_cell_weights_ = {-0.4, -0.3, -0.2, -0.1, -0.5, 0.5,  -0.2,
+                              -0.3, -0.2, -0.6, 0.6,  -0.1, -0.4, -0.3,
+                              -0.7, 0.7,  -0.9, -0.5, 0.8,  0.6};
+
+    input_to_output_weights_ = {-0.8, -0.4, -0.2, -0.9, -0.1, -0.7, 0.3,
+                                -0.3, -0.8, -0.2, 0.6,  -0.2, 0.4,  -0.7,
+                                -0.3, -0.5, 0.1,  0.5,  -0.6, -0.4};
+
+    input_gate_bias_ = {0.03, 0.15, 0.22, 0.38};
+
+    forget_gate_bias_ = {0.1, -0.3, -0.2, 0.1};
+
+    cell_gate_bias_ = {-0.05, 0.72, 0.25, 0.08};
+
+    output_gate_bias_ = {0.05, -0.01, 0.2, 0.1};
+
+    recurrent_to_input_weights_ = {-0.2, -0.3, 0.4,  0.1,  -0.5, 0.9,
+                                   -0.2, -0.3, -0.7, 0.05, -0.2, -0.6};
+
+    recurrent_to_cell_weights_ = {-0.3, 0.2, 0.1, -0.3, 0.8,  -0.08,
+                                  -0.2, 0.3, 0.8, -0.6, -0.1, 0.2};
+
+    recurrent_to_forget_weights_ = {-0.5, -0.3, -0.5, -0.2, 0.6, 0.4,
+                                    0.9,  0.3,  -0.1, 0.2,  0.5, 0.2};
+
+    recurrent_to_output_weights_ = {0.3,  -0.1, 0.1,  -0.2, -0.5, -0.7,
+                                    -0.2, -0.6, -0.1, -0.4, -0.7, -0.2};
+
+    cell_to_input_weights_ = {0.05, 0.1, 0.25, 0.15};
+
+    cell_to_forget_weights_ = {-0.02, -0.15, -0.25, -0.03};
+
+    cell_to_output_weights_ = {0.1, -0.1, -0.5, 0.05};
+
+    input_layer_norm_coefficients_ = {0.1, 0.2, 0.3, 0.5};
+    forget_layer_norm_coefficients_ = {0.2, 0.2, 0.4, 0.3};
+    cell_layer_norm_coefficients_ = {0.7, 0.2, 0.3, 0.8};
+    output_layer_norm_coefficients_ = {0.6, 0.2, 0.2, 0.5};
+
+    projection_weights_ = {-0.1, 0.2,  0.01, -0.2, 0.1,  0.5,
+                           0.3,  0.08, 0.07, 0.2,  -0.4, 0.2};
+
+    layer_norm_lstm_input_ = {
+        {// Batch0: 3 (input_sequence_size) * 5 (n_input)
+         0.7, 0.8, 0.1, 0.2, 0.3,   // seq 0
+         0.8, 0.1, 0.2, 0.4, 0.5,   // seq 1
+         0.2, 0.7, 0.7, 0.1, 0.7},  // seq 2
+
+        {// Batch1: 3 (input_sequence_size) * 5 (n_input)
+         0.3, 0.2, 0.9, 0.8, 0.1,   // seq 0
+         0.1, 0.5, 0.2, 0.4, 0.2,   // seq 1
+         0.6, 0.9, 0.2, 0.5, 0.7},  // seq 2
+    };
+  }
+};
+
+TEST_F(NoCifgPeepholeProjectionNoClippingLayerNormLstmTest,
+       LayerNormLstmBlackBoxTest) {
+  const int n_batch = 2;
+  const int n_input = 5;
+  const int n_cell = 4;
+  const int n_output = 3;
+  const float ceil_clip = 0.0;
+  const float proj_clip = 0.0;
+
+  LayerNormLSTMOpModel layer_norm_lstm(
+      n_batch, n_input, n_cell, n_output,
+      /*use_cifg=*/false, /*use_peephole=*/true,
+      /*use_projection_weights=*/true,
+      /*use_projection_bias=*/false, ceil_clip, proj_clip,
+      {
+          {n_batch, n_input},  // input tensor
+
+          {n_cell, n_input},  // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {n_cell, n_output},  // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {n_cell},  // cell_to_input_weight tensor
+          {n_cell},  // cell_to_forget_weight tensor
+          {n_cell},  // cell_to_output_weight tensor
+
+          {n_cell},  // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {n_output, n_cell},  // projection_weight tensor
+          {0},                 // projection_bias tensor
+
+          {n_batch, n_output},  // activation_state tensor
+          {n_batch, n_cell},    // cell_state tensor
+
+          {n_cell},  // input_layer_norm_coefficient tensor
+          {n_cell},  // forget_layer_norm_coefficient tensor
+          {n_cell},  // cell_layer_norm_coefficient tensor
+          {n_cell},  // output_layer_norm_coefficient tensor
+      });
+
+  layer_norm_lstm.SetInputToInputWeights(input_to_input_weights_);
+  layer_norm_lstm.SetInputToCellWeights(input_to_cell_weights_);
+  layer_norm_lstm.SetInputToForgetWeights(input_to_forget_weights_);
+  layer_norm_lstm.SetInputToOutputWeights(input_to_output_weights_);
+
+  layer_norm_lstm.SetInputGateBias(input_gate_bias_);
+  layer_norm_lstm.SetCellBias(cell_gate_bias_);
+  layer_norm_lstm.SetForgetGateBias(forget_gate_bias_);
+  layer_norm_lstm.SetOutputGateBias(output_gate_bias_);
+
+  layer_norm_lstm.SetRecurrentToInputWeights(recurrent_to_input_weights_);
+  layer_norm_lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
+  layer_norm_lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
+  layer_norm_lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
+
+  layer_norm_lstm.SetCellToInputWeights(cell_to_input_weights_);
+  layer_norm_lstm.SetCellToForgetWeights(cell_to_forget_weights_);
+  layer_norm_lstm.SetCellToOutputWeights(cell_to_output_weights_);
+
+  layer_norm_lstm.SetInputLayerNormCoefficients(input_layer_norm_coefficients_);
+  layer_norm_lstm.SetForgetLayerNormCoefficients(
+      forget_layer_norm_coefficients_);
+  layer_norm_lstm.SetCellLayerNormCoefficients(cell_layer_norm_coefficients_);
+  layer_norm_lstm.SetOutputLayerNormCoefficients(
+      output_layer_norm_coefficients_);
+
+  layer_norm_lstm.SetProjectionWeights(projection_weights_);
+
+  // Verify the final output.
+  const std::vector<std::vector<float>> layer_norm_lstm_golden_output = {
+      {
+          // Batch0: 3 (input_sequence_size) * 3 (n_output)
+          0.0244077, 0.128027, -0.00170918,  // seq 0
+          0.0137642, 0.140751, 0.0395835,    // seq 1
+          -0.00459231, 0.155278, 0.0837377,  // seq 2
+      },
+      {
+          // Batch1: 3 (input_sequence_size) * 3 (n_output)
+          -0.00692428, 0.0848741, 0.063445,  // seq 0
+          -0.00403912, 0.139963, 0.072681,   // seq 1
+          0.00752706, 0.161903, 0.0561371,   // seq 2
+      }};
+
+  VerifyGoldens(layer_norm_lstm_input_, layer_norm_lstm_golden_output,
+                &layer_norm_lstm);
+}
+
+TEST_F(NoCifgPeepholeProjectionNoClippingLayerNormLstmTest,
+       HybridLayerNormLstmBlackBoxTestUint8) {
+  const int n_batch = 2;
+  const int n_input = 5;
+  const int n_cell = 4;
+  const int n_output = 3;
+  const float ceil_clip = 0.0;
+  const float proj_clip = 0.0;
+
+  HybridLayerNormLSTMOpModel layer_norm_lstm(
+      n_batch, n_input, n_cell, n_output,
+      /*use_cifg=*/false, /*use_peephole=*/true,
+      /*use_projection_weights=*/true,
+      /*use_projection_bias=*/false, ceil_clip, proj_clip,
+      {
+          {n_batch, n_input},  // input tensor
+
+          {n_cell, n_input},  // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {n_cell, n_output},  // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {n_cell},  // cell_to_input_weight tensor
+          {n_cell},  // cell_to_forget_weight tensor
+          {n_cell},  // cell_to_output_weight tensor
+
+          {n_cell},  // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {n_output, n_cell},  // projection_weight tensor
+          {0},                 // projection_bias tensor
+
+          {n_batch, n_output},  // activation_state tensor
+          {n_batch, n_cell},    // cell_state tensor
+
+          {n_cell},  // input_layer_norm_coefficient tensor
+          {n_cell},  // forget_layer_norm_coefficient tensor
+          {n_cell},  // cell_layer_norm_coefficient tensor
+          {n_cell},  // output_layer_norm_coefficient tensor
+      },
+      TensorType_UINT8);
+
+  layer_norm_lstm.SetInputToInputWeights(input_to_input_weights_);
+  layer_norm_lstm.SetInputToCellWeights(input_to_cell_weights_);
+  layer_norm_lstm.SetInputToForgetWeights(input_to_forget_weights_);
+  layer_norm_lstm.SetInputToOutputWeights(input_to_output_weights_);
+
+  layer_norm_lstm.SetInputGateBias(input_gate_bias_);
+  layer_norm_lstm.SetCellBias(cell_gate_bias_);
+  layer_norm_lstm.SetForgetGateBias(forget_gate_bias_);
+  layer_norm_lstm.SetOutputGateBias(output_gate_bias_);
+
+  layer_norm_lstm.SetRecurrentToInputWeights(recurrent_to_input_weights_);
+  layer_norm_lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
+  layer_norm_lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
+  layer_norm_lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
+
+  layer_norm_lstm.SetCellToInputWeights(cell_to_input_weights_);
+  layer_norm_lstm.SetCellToForgetWeights(cell_to_forget_weights_);
+  layer_norm_lstm.SetCellToOutputWeights(cell_to_output_weights_);
+
+  layer_norm_lstm.SetInputLayerNormCoefficients(input_layer_norm_coefficients_);
+  layer_norm_lstm.SetForgetLayerNormCoefficients(
+      forget_layer_norm_coefficients_);
+  layer_norm_lstm.SetCellLayerNormCoefficients(cell_layer_norm_coefficients_);
+  layer_norm_lstm.SetOutputLayerNormCoefficients(
+      output_layer_norm_coefficients_);
+
+  layer_norm_lstm.SetProjectionWeights(projection_weights_);
+
+  const std::vector<std::vector<float>> layer_norm_lstm_golden_output = {
+      {
+          // Batch0: 3 (input_sequence_size) * 3 (n_output)
+          0.0244576, 0.127847, -0.00181765,  // seq 0
+          0.0137518, 0.140892, 0.0402234,    // seq 1
+          -0.0048839, 0.155096, 0.0840309,   // seq 2
+      },
+      {
+          // Batch1: 3 (input_sequence_size) * 3 (n_output)
+          -0.00728636, 0.0843957, 0.0634786,  // seq 0
+          -0.00448382, 0.139278, 0.0737372,   // seq 1
+          0.00734616, 0.161793, 0.0560238,    // seq 2
+      }};
+
+  VerifyGoldens(layer_norm_lstm_input_, layer_norm_lstm_golden_output,
+                &layer_norm_lstm);
+}
+
+TEST_F(NoCifgPeepholeProjectionNoClippingLayerNormLstmTest,
+       HybridLayerNormLstmBlackBoxTestInt8) {
+  const int n_batch = 2;
+  const int n_input = 5;
+  const int n_cell = 4;
+  const int n_output = 3;
+  const float ceil_clip = 0.0;
+  const float proj_clip = 0.0;
+
+  HybridLayerNormLSTMOpModel layer_norm_lstm(
+      n_batch, n_input, n_cell, n_output,
+      /*use_cifg=*/false, /*use_peephole=*/true,
+      /*use_projection_weights=*/true,
+      /*use_projection_bias=*/false, ceil_clip, proj_clip,
+      {
+          {n_batch, n_input},  // input tensor
+
+          {n_cell, n_input},  // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {n_cell, n_output},  // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {n_cell},  // cell_to_input_weight tensor
+          {n_cell},  // cell_to_forget_weight tensor
+          {n_cell},  // cell_to_output_weight tensor
+
+          {n_cell},  // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {n_output, n_cell},  // projection_weight tensor
+          {0},                 // projection_bias tensor
+
+          {n_batch, n_output},  // activation_state tensor
+          {n_batch, n_cell},    // cell_state tensor
+
+          {n_cell},  // input_layer_norm_coefficient tensor
+          {n_cell},  // forget_layer_norm_coefficient tensor
+          {n_cell},  // cell_layer_norm_coefficient tensor
+          {n_cell},  // output_layer_norm_coefficient tensor
+      },
+      TensorType_INT8);
+
+  layer_norm_lstm.SetInputToInputWeights(input_to_input_weights_);
+  layer_norm_lstm.SetInputToCellWeights(input_to_cell_weights_);
+  layer_norm_lstm.SetInputToForgetWeights(input_to_forget_weights_);
+  layer_norm_lstm.SetInputToOutputWeights(input_to_output_weights_);
+
+  layer_norm_lstm.SetInputGateBias(input_gate_bias_);
+  layer_norm_lstm.SetCellBias(cell_gate_bias_);
+  layer_norm_lstm.SetForgetGateBias(forget_gate_bias_);
+  layer_norm_lstm.SetOutputGateBias(output_gate_bias_);
+
+  layer_norm_lstm.SetRecurrentToInputWeights(recurrent_to_input_weights_);
+  layer_norm_lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
+  layer_norm_lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
+  layer_norm_lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
+
+  layer_norm_lstm.SetCellToInputWeights(cell_to_input_weights_);
+  layer_norm_lstm.SetCellToForgetWeights(cell_to_forget_weights_);
+  layer_norm_lstm.SetCellToOutputWeights(cell_to_output_weights_);
+
+  layer_norm_lstm.SetInputLayerNormCoefficients(input_layer_norm_coefficients_);
+  layer_norm_lstm.SetForgetLayerNormCoefficients(
+      forget_layer_norm_coefficients_);
+  layer_norm_lstm.SetCellLayerNormCoefficients(cell_layer_norm_coefficients_);
+  layer_norm_lstm.SetOutputLayerNormCoefficients(
+      output_layer_norm_coefficients_);
+
+  layer_norm_lstm.SetProjectionWeights(projection_weights_);
+
+  const std::vector<std::vector<float>> layer_norm_lstm_golden_output = {
+      {
+          // Batch0: 3 (input_sequence_size) * 3 (n_output)
+          0.0244576, 0.127847, -0.00181765,  // seq 0
+          0.0137518, 0.140892, 0.0402234,    // seq 1
+          -0.0048839, 0.155096, 0.0840309,   // seq 2
+      },
+      {
+          // Batch1: 3 (input_sequence_size) * 3 (n_output)
+          -0.00728636, 0.0843957, 0.0634786,  // seq 0
+          -0.00448382, 0.139278, 0.0737372,   // seq 1
+          0.00734616, 0.161793, 0.0560238,    // seq 2
+      }};
+
+  VerifyGoldens(layer_norm_lstm_input_, layer_norm_lstm_golden_output,
+                &layer_norm_lstm);
+}
+
+class CifgPeepholeProjectionNoClippingLayerNormLstmTest
+    : public BaseLayerNormLstmTest {
+  void SetUp() override {
+    input_to_forget_weights_ = {-0.6, -0.1, 0.3,  0.2,  0.9,  -0.5, -0.2,
+                                -0.4, 0.3,  -0.8, -0.4, 0.3,  -0.5, -0.4,
+                                -0.6, 0.3,  -0.4, -0.6, -0.5, -0.5};
+    input_to_cell_weights_ = {-0.4, -0.3, -0.2, -0.1, -0.5, 0.5,  -0.2,
+                              -0.3, -0.2, -0.6, 0.6,  -0.1, -0.4, -0.3,
+                              -0.7, 0.7,  -0.9, -0.5, 0.8,  0.6};
+    input_to_output_weights_ = {-0.8, -0.4, -0.2, -0.9, -0.1, -0.7, 0.3,
+                                -0.3, -0.8, -0.2, 0.6,  -0.2, 0.4,  -0.7,
+                                -0.3, -0.5, 0.1,  0.5,  -0.6, -0.4};
+
+    forget_gate_bias_ = {0.1, -0.3, -0.2, 0.1};
+    cell_gate_bias_ = {-0.05, 0.72, 0.25, 0.08};
+    output_gate_bias_ = {0.05, -0.01, 0.2, 0.1};
+
+    recurrent_to_cell_weights_ = {-0.3, 0.2, 0.1, -0.3, 0.8,  -0.08,
+                                  -0.2, 0.3, 0.8, -0.6, -0.1, 0.2};
+    recurrent_to_forget_weights_ = {-0.5, -0.3, -0.5, -0.2, 0.6, 0.4,
+                                    0.9,  0.3,  -0.1, 0.2,  0.5, 0.2};
+    recurrent_to_output_weights_ = {0.3,  -0.1, 0.1,  -0.2, -0.5, -0.7,
+                                    -0.2, -0.6, -0.1, -0.4, -0.7, -0.2};
+
+    cell_to_forget_weights_ = {-0.02, -0.15, -0.25, -0.03};
+    cell_to_output_weights_ = {0.1, -0.1, -0.5, 0.05};
+
+    forget_layer_norm_coefficients_ = {0.2, 0.2, 0.4, 0.3};
+    cell_layer_norm_coefficients_ = {0.7, 0.2, 0.3, 0.8};
+    output_layer_norm_coefficients_ = {0.6, 0.2, 0.2, 0.5};
+    projection_weights_ = {-0.1, 0.2,  0.01, -0.2, 0.1,  0.5,
+                           0.3,  0.08, 0.07, 0.2,  -0.4, 0.2};
+
+    layer_norm_lstm_input_ = {
+        {// Batch0: 3 (input_sequence_size) * 5 (n_input)
+         0.7, 0.8, 0.1, 0.2, 0.3,   // seq 0
+         0.8, 0.1, 0.2, 0.4, 0.5,   // seq 1
+         0.2, 0.7, 0.7, 0.1, 0.7},  // seq 2
+
+        {// Batch1: 3 (input_sequence_size) * 5 (n_input)
+         0.3, 0.2, 0.9, 0.8, 0.1,   // seq 0
+         0.1, 0.5, 0.2, 0.4, 0.2,   // seq 1
+         0.6, 0.9, 0.2, 0.5, 0.7},  // seq 2
+    };
+  }
+};
+
+TEST_F(CifgPeepholeProjectionNoClippingLayerNormLstmTest,
+       LayerNormLstmBlackBoxTest) {
+  const int n_batch = 2;
+  const int n_input = 5;
+  const int n_cell = 4;
+  const int n_output = 3;
+  const float ceil_clip = 0.0;
+  const float proj_clip = 0.0;
+
+  LayerNormLSTMOpModel layer_norm_lstm(
+      n_batch, n_input, n_cell, n_output,
+      /*use_cifg=*/true, /*use_peephole=*/true,
+      /*use_projection_weights=*/true,
+      /*use_projection_bias=*/false, ceil_clip, proj_clip,
+      {
+          {n_batch, n_input},  // input tensor
+
+          {0, 0},             // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {0, 0},              // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {0},       // cell_to_input_weight tensor
+          {n_cell},  // cell_to_forget_weight tensor
+          {n_cell},  // cell_to_output_weight tensor
+
+          {0},       // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {n_output, n_cell},  // projection_weight tensor
+          {0},                 // projection_bias tensor
+
+          {n_batch, n_output},  // activation_state tensor
+          {n_batch, n_cell},    // cell_state tensor
+
+          {0},       // input_layer_norm_coefficient tensor
+          {n_cell},  // forget_layer_norm_coefficient tensor
+          {n_cell},  // cell_layer_norm_coefficient tensor
+          {n_cell},  // output_layer_norm_coefficient tensor
+      });
+
+  layer_norm_lstm.SetInputToCellWeights(input_to_cell_weights_);
+  layer_norm_lstm.SetInputToForgetWeights(input_to_forget_weights_);
+  layer_norm_lstm.SetInputToOutputWeights(input_to_output_weights_);
+
+  layer_norm_lstm.SetCellBias(cell_gate_bias_);
+  layer_norm_lstm.SetForgetGateBias(forget_gate_bias_);
+  layer_norm_lstm.SetOutputGateBias(output_gate_bias_);
+
+  layer_norm_lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
+  layer_norm_lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
+  layer_norm_lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
+
+  layer_norm_lstm.SetCellToForgetWeights(cell_to_forget_weights_);
+  layer_norm_lstm.SetCellToOutputWeights(cell_to_output_weights_);
+
+  layer_norm_lstm.SetForgetLayerNormCoefficients(
+      forget_layer_norm_coefficients_);
+  layer_norm_lstm.SetCellLayerNormCoefficients(cell_layer_norm_coefficients_);
+  layer_norm_lstm.SetOutputLayerNormCoefficients(
+      output_layer_norm_coefficients_);
+
+  layer_norm_lstm.SetProjectionWeights(projection_weights_);
+
+  // Verify the final output.
+  const std::vector<std::vector<float>> layer_norm_lstm_golden_output = {
+      {
+          // Batch0: 3 (input_sequence_size) * 3 (n_output)
+          0.02129706, 0.140816242, 0.0112733059,     // seq 0
+          0.0132302344, 0.152308047, 0.0346313119,   // seq 1
+          -0.0123688057, 0.165790111, 0.0893077999,  // seq 2
+      },
+      {
+          // Batch1: 3 (input_sequence_size) * 3 (n_output)
+          -0.0226350538, 0.0916948169, 0.0769175813,  // seq 0
+          -0.0269966982, 0.149707705, 0.094149217,    // seq 1
+          -0.0103429332, 0.173016444, 0.0720508844,   // seq 2
+      }};
+
+  VerifyGoldens(layer_norm_lstm_input_, layer_norm_lstm_golden_output,
+                &layer_norm_lstm);
+}
+
+TEST_F(CifgPeepholeProjectionNoClippingLayerNormLstmTest,
+       HybridLayerNormLstmBlackBoxTestUint8) {
+  const int n_batch = 2;
+  const int n_input = 5;
+  const int n_cell = 4;
+  const int n_output = 3;
+  const float ceil_clip = 0.0;
+  const float proj_clip = 0.0;
+
+  HybridLayerNormLSTMOpModel layer_norm_lstm(
+      n_batch, n_input, n_cell, n_output,
+      /*use_cifg=*/true, /*use_peephole=*/true,
+      /*use_projection_weights=*/true,
+      /*use_projection_bias=*/false, ceil_clip, proj_clip,
+      {
+          {n_batch, n_input},  // input tensor
+
+          {0, 0},             // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {0, 0},              // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {0},       // cell_to_input_weight tensor
+          {n_cell},  // cell_to_forget_weight tensor
+          {n_cell},  // cell_to_output_weight tensor
+
+          {0},       // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {n_output, n_cell},  // projection_weight tensor
+          {0},                 // projection_bias tensor
+
+          {n_batch, n_output},  // activation_state tensor
+          {n_batch, n_cell},    // cell_state tensor
+
+          {0},       // input_layer_norm_coefficient tensor
+          {n_cell},  // forget_layer_norm_coefficient tensor
+          {n_cell},  // cell_layer_norm_coefficient tensor
+          {n_cell},  // output_layer_norm_coefficient tensor
+      },
+      TensorType_UINT8);
+
+  layer_norm_lstm.SetInputToCellWeights(input_to_cell_weights_);
+  layer_norm_lstm.SetInputToForgetWeights(input_to_forget_weights_);
+  layer_norm_lstm.SetInputToOutputWeights(input_to_output_weights_);
+
+  layer_norm_lstm.SetCellBias(cell_gate_bias_);
+  layer_norm_lstm.SetForgetGateBias(forget_gate_bias_);
+  layer_norm_lstm.SetOutputGateBias(output_gate_bias_);
+
+  layer_norm_lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
+  layer_norm_lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
+  layer_norm_lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
+
+  layer_norm_lstm.SetCellToForgetWeights(cell_to_forget_weights_);
+  layer_norm_lstm.SetCellToOutputWeights(cell_to_output_weights_);
+
+  layer_norm_lstm.SetForgetLayerNormCoefficients(
+      forget_layer_norm_coefficients_);
+  layer_norm_lstm.SetCellLayerNormCoefficients(cell_layer_norm_coefficients_);
+  layer_norm_lstm.SetOutputLayerNormCoefficients(
+      output_layer_norm_coefficients_);
+
+  layer_norm_lstm.SetProjectionWeights(projection_weights_);
+
+  // Verify the final output.
+  const std::vector<std::vector<float>> layer_norm_lstm_golden_output = {
+      {
+          // Batch0: 3 (input_sequence_size) * 3 (n_output)
+          0.0212250091, 0.140474007, 0.0115012666,   // seq 0
+          0.0130806509, 0.152660668, 0.0347516984,   // seq 1
+          -0.0124010444, 0.166042402, 0.0898982584,  // seq 2
+      },
+      {
+          // Batch1: 3 (input_sequence_size) * 3 (n_output)
+          -0.0228835996, 0.0917588323, 0.0778886303,  // seq 0
+          -0.0275101066, 0.148769245, 0.0938384682,   // seq 1
+          -0.0103605557, 0.172605693, 0.0728750974,   // seq 2
+      }};
+
+  VerifyGoldens(layer_norm_lstm_input_, layer_norm_lstm_golden_output,
+                &layer_norm_lstm);
+}
+
+TEST_F(CifgPeepholeProjectionNoClippingLayerNormLstmTest,
+       HybridLayerNormLstmBlackBoxTestInt8) {
+  const int n_batch = 2;
+  const int n_input = 5;
+  const int n_cell = 4;
+  const int n_output = 3;
+  const float ceil_clip = 0.0;
+  const float proj_clip = 0.0;
+
+  HybridLayerNormLSTMOpModel layer_norm_lstm(
+      n_batch, n_input, n_cell, n_output,
+      /*use_cifg=*/true, /*use_peephole=*/true,
+      /*use_projection_weights=*/true,
+      /*use_projection_bias=*/false, ceil_clip, proj_clip,
+      {
+          {n_batch, n_input},  // input tensor
+
+          {0, 0},             // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {0, 0},              // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {0},       // cell_to_input_weight tensor
+          {n_cell},  // cell_to_forget_weight tensor
+          {n_cell},  // cell_to_output_weight tensor
+
+          {0},       // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {n_output, n_cell},  // projection_weight tensor
+          {0},                 // projection_bias tensor
+
+          {n_batch, n_output},  // activation_state tensor
+          {n_batch, n_cell},    // cell_state tensor
+
+          {0},       // input_layer_norm_coefficient tensor
+          {n_cell},  // forget_layer_norm_coefficient tensor
+          {n_cell},  // cell_layer_norm_coefficient tensor
+          {n_cell},  // output_layer_norm_coefficient tensor
+      },
+      TensorType_INT8);
+
+  layer_norm_lstm.SetInputToCellWeights(input_to_cell_weights_);
+  layer_norm_lstm.SetInputToForgetWeights(input_to_forget_weights_);
+  layer_norm_lstm.SetInputToOutputWeights(input_to_output_weights_);
+
+  layer_norm_lstm.SetCellBias(cell_gate_bias_);
+  layer_norm_lstm.SetForgetGateBias(forget_gate_bias_);
+  layer_norm_lstm.SetOutputGateBias(output_gate_bias_);
+
+  layer_norm_lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
+  layer_norm_lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
+  layer_norm_lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
+
+  layer_norm_lstm.SetCellToForgetWeights(cell_to_forget_weights_);
+  layer_norm_lstm.SetCellToOutputWeights(cell_to_output_weights_);
+
+  layer_norm_lstm.SetForgetLayerNormCoefficients(
+      forget_layer_norm_coefficients_);
+  layer_norm_lstm.SetCellLayerNormCoefficients(cell_layer_norm_coefficients_);
+  layer_norm_lstm.SetOutputLayerNormCoefficients(
+      output_layer_norm_coefficients_);
+
+  layer_norm_lstm.SetProjectionWeights(projection_weights_);
+
+  // Verify the final output.
+  const std::vector<std::vector<float>> layer_norm_lstm_golden_output = {
+      {
+          // Batch0: 3 (input_sequence_size) * 3 (n_output)
+          0.0212250091, 0.140474007, 0.0115012666,   // seq 0
+          0.0130806509, 0.152660668, 0.0347516984,   // seq 1
+          -0.0124010444, 0.166042402, 0.0898982584,  // seq 2
+      },
+      {
+          // Batch1: 3 (input_sequence_size) * 3 (n_output)
+          -0.0228835996, 0.0917588323, 0.0778886303,  // seq 0
+          -0.0275101066, 0.148769245, 0.0938384682,   // seq 1
+          -0.0103605557, 0.172605693, 0.0728750974,   // seq 2
+      }};
+
+  VerifyGoldens(layer_norm_lstm_input_, layer_norm_lstm_golden_output,
+                &layer_norm_lstm);
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/maximum_minimum.cc b/tensorflow/lite/kernels/maximum_minimum.cc
index 3bcaabf675eba4f528fe73b01610d915e7780f85..0e15254f477f0bbc4ba15c4c4ac189fe1c8d3da0 100644
--- a/tensorflow/lite/kernels/maximum_minimum.cc
+++ b/tensorflow/lite/kernels/maximum_minimum.cc
@@ -108,6 +108,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       case kTfLiteUInt8:
         TFLiteOperation<uint8_t, OpType>(context, node, op_context);
         break;
+      case kTfLiteInt8:
+        TFLiteOperation<int8_t, OpType>(context, node, op_context);
+        break;
       case kTfLiteInt32:
        TFLiteOperation<int32_t, OpType>(context, node, op_context);
         break;
diff --git a/tensorflow/lite/kernels/maximum_minimum_test.cc b/tensorflow/lite/kernels/maximum_minimum_test.cc
index acb74e09d3fb47c33c6c146af4d0b1b1030491be..6567c8f3611204af3bdeecbdb11a07f6f16be908 100644
--- a/tensorflow/lite/kernels/maximum_minimum_test.cc
+++ b/tensorflow/lite/kernels/maximum_minimum_test.cc
@@ -112,6 +112,17 @@ TEST(MaxMinOpTest, Uint8Test) {
                      {0, 0, 1, 11, 2, 1});
 }
 
+TEST(MaxMinOpTest, Int8Test) {
+  std::initializer_list<int8_t> data1 = {1, 0, 2, 11, 2, 23};
+  std::initializer_list<int8_t> data2 = {0, 0, 1, 12, 123, 1};
+  TestModel<int8_t>(BuiltinOperator_MAXIMUM, {TensorType_INT8, {3, 1, 2}},
+                    {TensorType_INT8, {3, 1, 2}}, {TensorType_INT8, {3, 1, 2}},
+                    data1, data2, {1, 0, 2, 12, 123, 23});
+  TestModel<int8_t>(BuiltinOperator_MINIMUM, {TensorType_INT8, {3, 1, 2}},
+                    {TensorType_INT8, {3, 1, 2}}, {TensorType_INT8, {3, 1, 2}},
+                    data1, data2, {0, 0, 1, 11, 2, 1});
+}
+
 TEST(MaximumOpTest, FloatWithBroadcastTest) {
   std::initializer_list<float> data1 = {1.0, 0.0, -1.0, -2.0, -1.44, 11.0};
   std::initializer_list<float> data2 = {0.5, 2.0};
diff --git a/tensorflow/lite/kernels/mirror_pad.cc b/tensorflow/lite/kernels/mirror_pad.cc
index e74e47f7a37b0f449fb2a63237e95066bb452de6..c0924da2a185758b7c508c6d39185b337ce7d90f 100644
--- a/tensorflow/lite/kernels/mirror_pad.cc
+++ b/tensorflow/lite/kernels/mirror_pad.cc
@@ -39,8 +39,8 @@ struct PaddedTensor {
   const void* value = nullptr;
   // If this tensor is not one value, then this vector will have
   // all the tensors that belongs to this tensor.
-  // Pointers are owned.
-  std::vector<std::unique_ptr<PaddedTensor>> values;
+  // Pointers are not owned.
+  std::vector<PaddedTensor*> values;
   // Pointers to PaddedTensors that are padded on the left of the current
   // tensor.
   std::vector<PaddedTensor*> left_pad_ptrs;
@@ -55,24 +55,47 @@ struct PaddedTensor {
       if (indices[i] >= result->values.size()) {
         return nullptr;
       }
-      result = result->values[indices[i]].get();
+      result = result->values[indices[i]];
       if (result == nullptr) break;
     }
     return result;
   }
 };
 
+// Wrapper for all intermediate data used by the op.
+struct OpData {
+  // Holds intermediate data structure of the padded tensor.
+  std::vector<PaddedTensor> pad_tensor_buffer;
+  // Total number of intermediate elements in the pad_tensor_buffer.
+  int num_elements;
+};
+
 // Util method to initialize the memory of the padded tensor.
-void InitializeTensorMemory(const TfLiteIntArray* const dims, int dim_index,
-                            int dims_size, PaddedTensor* padded_tensor) {
-  if (dim_index >= dims_size) {
-    return;
-  }
-  padded_tensor->values.reserve(dims->data[dim_index]);
-  for (int i = 0; i < dims->data[dim_index]; ++i) {
-    padded_tensor->values.emplace_back(new PaddedTensor());
-    InitializeTensorMemory(dims, dim_index + 1, dims_size,
-                           padded_tensor->values.back().get());
+void InitializeTensorMemory(const TfLiteIntArray* const dims, int dims_size,
+                            std::vector<PaddedTensor>* padded_tensor_buffer) {
+  int dimension_index = 0;
+  int element_index = 0;
+  // We hold 2 vectors with values for nodes in current level, and
+  // nodes in the next level, and swap while moving on dimensions of the tensor.
+  std::vector<PaddedTensor*> current_nodes, next_level;
+  current_nodes.push_back(&(*padded_tensor_buffer)[element_index]);
+  element_index++;
+  int next_level_size = 1;
+  while (!current_nodes.empty() && dimension_index < dims_size) {
+    next_level_size *= dims->data[dimension_index];
+    next_level.resize(next_level_size);
+    // Index of elements in next level.
+    int index = 0;
+    for (auto* padded_tensor : current_nodes) {
+      padded_tensor->values.resize(dims->data[dimension_index]);
+      for (int i = 0; i < dims->data[dimension_index]; ++i) {
+        padded_tensor->values[i] = &(*padded_tensor_buffer)[element_index];
+        next_level[index++] = padded_tensor->values[i];
+        element_index++;
+      }
+    }
+    std::swap(current_nodes, next_level);
+    dimension_index++;
   }
 }
 
@@ -101,20 +124,6 @@ inline const void* GetValuePointerAtIndex(const void* data, int index,
   return nullptr;
 }
 
-// Util method that increment index in the N-d array.
-void IncrementTensorIndex(const TfLiteIntArray* dims,
-                          std::vector<int>* tensor_index_ptr) {
-  int dimension_index = dims->size - 1;
-  auto& tensor_index = *tensor_index_ptr;
-  tensor_index[dimension_index]++;
-  while (dimension_index >= 0 &&
-         tensor_index[dimension_index] == dims->data[dimension_index]) {
-    tensor_index[dimension_index] = 0;
-    dimension_index--;
-    if (dimension_index >= 0) tensor_index[dimension_index]++;
-  }
-}
-
 // Fills the 'padded_tensor' with data from 'input_tensor'.
 TfLiteStatus InitFromInputTensor(const TfLiteTensor* input_tensor,
                                  PaddedTensor* padded_tensor) {
@@ -129,13 +138,13 @@ TfLiteStatus InitFromInputTensor(const TfLiteTensor* input_tensor,
   std::vector<int> tensor_index(dims->size, 0);
   int flat_index = 0;
   const int num_elements = NumElements(input_tensor);
+  auto* tensor = padded_tensor->GetMutable(tensor_index);
   while (flat_index < num_elements) {
-    auto* tensor = padded_tensor->GetMutable(tensor_index);
     if (tensor == nullptr) {
       return kTfLiteError;
     }
     tensor->value = GetValuePointerAtIndex(data, flat_index, data_type);
-    IncrementTensorIndex(dims, &tensor_index);
+    ++tensor;
     ++flat_index;
   }
 
@@ -191,7 +200,7 @@ TfLiteStatus ValidateTensor(const TfLiteTensor* padding_matrix, int offset,
   }
   if (!padded_tensor->values.empty()) {
     ValidateTensor(padding_matrix, offset, dimension_index + 1,
-                   padded_tensor->values[0].get(), context);
+                   padded_tensor->values[0], context);
   }
   return kTfLiteOk;
 }
@@ -208,18 +217,20 @@ TfLiteStatus PadTensor(const TfLiteTensor* padding_matrix, int offset,
   TF_LITE_ENSURE_STATUS(
       GetPadding(padding_matrix, dimension_index, &left_pad, &right_pad));
 
+  padded_tensor->left_pad_ptrs.clear();
   for (int i = left_pad + offset - 1; i >= offset && left_pad > 0;
        --i, --left_pad) {
-    padded_tensor->left_pad_ptrs.push_back(padded_tensor->values[i].get());
+    padded_tensor->left_pad_ptrs.push_back(padded_tensor->values[i]);
   }
+  padded_tensor->right_pad_ptrs.clear();
   for (int i = padded_tensor->values.size() - (1 + offset);
        i >= 0 && right_pad > 0; --i, --right_pad) {
-    padded_tensor->right_pad_ptrs.push_back(padded_tensor->values[i].get());
+    padded_tensor->right_pad_ptrs.push_back(padded_tensor->values[i]);
   }
 
   for (auto& tensor : padded_tensor->values) {
     TF_LITE_ENSURE_STATUS(PadTensor(padding_matrix, offset, dimension_index + 1,
-                                    tensor.get(), context));
+                                    tensor, context));
   }
   return kTfLiteOk;
 }
@@ -241,7 +252,7 @@ int FillOutput(const PaddedTensor* padded_tensor, T* output_data,
     index_in_output = FillOutput(tensor, output_data, index_in_output);
   }
   for (const auto& tensor : padded_tensor->values) {
-    index_in_output = FillOutput(tensor.get(), output_data, index_in_output);
+    index_in_output = FillOutput(tensor, output_data, index_in_output);
   }
   for (const auto* tensor : padded_tensor->right_pad_ptrs) {
     index_in_output = FillOutput(tensor, output_data, index_in_output);
@@ -271,6 +282,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* padding_matrix = GetInput(context, node, 1);
   auto* params =
       reinterpret_cast<TfLiteMirrorPaddingParams*>(node->builtin_data);
+  OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
 
   if (params == nullptr) {
     return kTfLiteError;
@@ -287,12 +299,12 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
         context->ResizeTensor(context, output_tensor, output_size.release()));
   }
 
-  PaddedTensor padded_tensor;
+  PaddedTensor& padded_tensor = op_data->pad_tensor_buffer[0];
   // Initialize memory.
-  InitializeTensorMemory(input_tensor->dims, 0, input_dims, &padded_tensor);
+  InitializeTensorMemory(input_tensor->dims, input_dims,
+                         &op_data->pad_tensor_buffer);
   // Set the values from the input_tensor.
   TF_LITE_ENSURE_STATUS(InitFromInputTensor(input_tensor, &padded_tensor));
-
   const int offset =
       params->mode != TfLiteMirrorPaddingMode::kTfLiteMirrorPaddingReflect ? 0
                                                                            : 1;
@@ -335,20 +347,34 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 }
 
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
-  return nullptr;
+  return new OpData();
 }
 
-void Free(TfLiteContext* context, void* buffer) {}
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<OpData*>(buffer);
+}
 
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* input_tensor = GetInput(context, node, 0);
   const TfLiteTensor* padding_matrix = GetInput(context, node, 1);
   TfLiteTensor* output_tensor = GetOutput(context, node, 0);
+  OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
 
   TF_LITE_ENSURE_EQ(context, NumDimensions(padding_matrix), 2);
   TF_LITE_ENSURE_EQ(context, SizeOfDimension(padding_matrix, 0),
                     NumDimensions(input_tensor));
 
+  // Calculate total number of nodes in the tree structure of a tensor
+  // and pre-allocates it.
+  int num_elements = NumElements(input_tensor) + 1;
+  int extra_nodes = 1;
+  for (int i = 0; i < NumDimensions(input_tensor) - 1; ++i) {
+    extra_nodes *= input_tensor->dims->data[i];
+    num_elements += extra_nodes;
+  }
+  op_data->pad_tensor_buffer.resize(num_elements);
+  op_data->num_elements = num_elements;
+
   if (!IsConstantTensor(padding_matrix)) {
     SetTensorToDynamic(output_tensor);
     return kTfLiteOk;
diff --git a/tensorflow/lite/kernels/mul.cc b/tensorflow/lite/kernels/mul.cc
index 01039a705438af2a92a68b01c2146daf69c46250..e0ff6724ea2f3ea0fd4693571d6c509f5385a5d4 100644
--- a/tensorflow/lite/kernels/mul.cc
+++ b/tensorflow/lite/kernels/mul.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/mul.h"
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
@@ -87,8 +88,14 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
                                   &data->output_activation_min,
                                   &data->output_activation_max);
   }
+  if (output->type == kTfLiteInt8) {
+    CalculateActivationRangeInt8(params->activation, output,
+                                 &data->output_activation_min,
+                                 &data->output_activation_max);
+  }
 
-  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt16) {
+  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
+      output->type == kTfLiteInt16) {
     double real_multiplier =
         input1->params.scale * input2->params.scale / output->params.scale;
     QuantizeMultiplierSmallerThanOneExp(
@@ -151,8 +158,8 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
                            TfLiteMulParams* params, const OpData* data,
                            const TfLiteTensor* input1,
                            const TfLiteTensor* input2, TfLiteTensor* output) {
-  if (input1->type == kTfLiteUInt8 && input2->type == kTfLiteUInt8 &&
-      output->type == kTfLiteUInt8) {
+  if (input1->type == input2->type && input1->type == output->type &&
+      (input1->type == kTfLiteUInt8 || input1->type == kTfLiteInt8)) {
     tflite::ArithmeticParams op_params;
     SetActivationParams(data->output_activation_min,
                         data->output_activation_max, &op_params);
@@ -163,23 +170,31 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
     op_params.output_shift = data->output_shift;
     bool need_broadcast = optimized_ops::ProcessBroadcastShapes(
         GetTensorShape(input1), GetTensorShape(input2), &op_params);
-#define TF_LITE_MUL(type, opname)                                      \
-  type::opname(op_params, GetTensorShape(input1),                      \
-               GetTensorData<uint8_t>(input1), GetTensorShape(input2), \
-               GetTensorData<uint8_t>(input2), GetTensorShape(output), \
-               GetTensorData<uint8_t>(output))
-
-    if (kernel_type == kReference) {
+#define TF_LITE_MUL(type, opname, dtype)                             \
+  type::opname(op_params, GetTensorShape(input1),                    \
+               GetTensorData<dtype>(input1), GetTensorShape(input2), \
+               GetTensorData<dtype>(input2), GetTensorShape(output), \
+               GetTensorData<dtype>(output))
+    if (input1->type == kTfLiteInt8) {
       if (need_broadcast) {
-        TF_LITE_MUL(reference_ops, BroadcastMul4DSlow);
+        TF_LITE_MUL(reference_integer_ops, BroadcastMul4DSlow, int8_t);
       } else {
-        TF_LITE_MUL(reference_ops, Mul);
+        TF_LITE_MUL(reference_integer_ops, Mul, int8_t);
       }
     } else {
-      if (need_broadcast) {
-        TF_LITE_MUL(optimized_ops, BroadcastMulFivefold);
+      // type == kTfLiteUInt8
+      if (kernel_type == kReference) {
+        if (need_broadcast) {
+          TF_LITE_MUL(reference_ops, BroadcastMul4DSlow, uint8_t);
+        } else {
+          TF_LITE_MUL(reference_ops, Mul, uint8_t);
+        }
       } else {
-        TF_LITE_MUL(optimized_ops, Mul);
+        if (need_broadcast) {
+          TF_LITE_MUL(optimized_ops, BroadcastMulFivefold, uint8_t);
+        } else {
+          TF_LITE_MUL(optimized_ops, Mul, uint8_t);
+        }
       }
     }
 #undef TF_LITE_MUL
@@ -198,8 +213,8 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
     }
 #undef TF_LITE_MUL
   } else if (input1->type == kTfLiteInt16 && input2->type == kTfLiteInt16 &&
-             output->type == kTfLiteUInt8) {
-#define TF_LITE_MUL(type, opname)                                      \
+             (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8)) {
+#define TF_LITE_MUL(type, opname, output_dtype)                        \
   tflite::ArithmeticParams op_params;                                  \
   SetActivationParams(data->output_activation_min,                     \
                       data->output_activation_max, &op_params);        \
@@ -207,11 +222,15 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node,
   type::opname(op_params, GetTensorShape(input1),                      \
                GetTensorData<int16_t>(input1), GetTensorShape(input2), \
                GetTensorData<int16_t>(input2), GetTensorShape(output), \
-               GetTensorData<uint8_t>(output))
-    if (kernel_type == kReference) {
-      TF_LITE_MUL(reference_ops, Mul);
+               GetTensorData<output_dtype>(output))
+    if (output->type == kTfLiteInt8) {
+      TF_LITE_MUL(reference_integer_ops, Mul, int8_t);
     } else {
-      TF_LITE_MUL(optimized_ops, Mul);
+      if (kernel_type == kReference) {
+        TF_LITE_MUL(reference_ops, Mul, uint8_t);
+      } else {
+        TF_LITE_MUL(optimized_ops, Mul, uint8_t);
+      }
     }
 #undef TF_LITE_MUL
   } else {
@@ -233,14 +252,15 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 
   if (output->type == kTfLiteFloat32 || output->type == kTfLiteInt32) {
     EvalMul<kernel_type>(context, node, params, data, input1, input2, output);
-  } else if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt16) {
+  } else if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
+             output->type == kTfLiteInt16) {
     TF_LITE_ENSURE_OK(
         context, EvalQuantized<kernel_type>(context, node, params, data, input1,
                                             input2, output));
   } else {
     context->ReportError(context,
-                         "Mul only supports FLOAT32, INT32 and quantized UINT8 "
-                         "and INT16 now, got %d.",
+                         "Mul only supports FLOAT32, INT32 and quantized UINT8,"
+                         " INT8 and INT16 now, got %d.",
                          output->type);
     return kTfLiteError;
   }
diff --git a/tensorflow/lite/kernels/mul_test.cc b/tensorflow/lite/kernels/mul_test.cc
index 200cc26dadc3527813a7dabd3b9ca4811d4c8856..96f5a8a0e07e730394510f432b3313724e6c9172 100644
--- a/tensorflow/lite/kernels/mul_test.cc
+++ b/tensorflow/lite/kernels/mul_test.cc
@@ -73,9 +73,10 @@ class QuantizedMulOpModel : public BaseMulOpModel {
  public:
   using BaseMulOpModel::BaseMulOpModel;
 
+  template <typename integer_dtype>
   std::vector<float> GetDequantizedOutput() {
-    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
-                               GetScale(output_), GetZeroPoint(output_));
+    return Dequantize<integer_dtype>(ExtractVector<integer_dtype>(output_),
+                                     GetScale(output_), GetZeroPoint(output_));
   }
 
   std::vector<float> GetDequantizedOutputInt16() {
@@ -191,19 +192,28 @@ TEST(IntegerMulOpTest, WithBroadcast) {
   }
 }
 
-TEST(QuantizedMulOpTest, NoActivation) {
-  QuantizedMulOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                        {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                        {TensorType_UINT8, {}, -1.0, 1.0},
+template <TensorType tensor_type, typename integer_dtype>
+void NoActivation() {
+  QuantizedMulOpModel m({tensor_type, {1, 2, 2, 1}, -1.0, 1.0},
+                        {tensor_type, {1, 2, 2, 1}, -1.0, 1.0},
+                        {tensor_type, {}, -1.0, 1.0},
                         ActivationFunctionType_NONE);
-  m.QuantizeAndPopulate<uint8_t>(m.input1(), {-0.8, 0.2, 0.9, 0.7});
-  m.QuantizeAndPopulate<uint8_t>(m.input2(), {0.6, 0.4, 0.9, 0.8});
+  m.QuantizeAndPopulate<integer_dtype>(m.input1(), {-0.8, 0.2, 0.9, 0.7});
+  m.QuantizeAndPopulate<integer_dtype>(m.input2(), {0.6, 0.4, 0.9, 0.8});
   m.Invoke();
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<integer_dtype>(),
               ElementsAreArray(ArrayFloatNear({-0.48, 0.08, 0.81, 0.56},
                                               kQuantizedTolerance)));
 }
 
+TEST(QuantizedMulOpTest, NoActivationUInt8) {
+  NoActivation<TensorType_UINT8, uint8_t>();
+}
+
+TEST(QuantizedMulOpTest, NoActivationInt8) {
+  NoActivation<TensorType_INT8, int8_t>();
+}
+
 TEST(QuantizedMulOpTest, NoActivationInt16) {
   const float kMin = -1.f;
   const float kMax = 32767.f / 32768.f;
@@ -219,23 +229,32 @@ TEST(QuantizedMulOpTest, NoActivationInt16) {
                                               kQuantizedToleranceInt16)));
 }
 
-TEST(QuantizedMulOpTest, NoActivationInt16WithUint8Output) {
+template <TensorType tensor_type, typename integer_dtype>
+void NoActivationInt16With8BitOutput() {
   const float kMinInt16 = -1.f;
   const float kMaxInt16 = 32767.f / 32768.f;
   const float kMinUint8 = -1.f;
   const float kMaxUint8 = 127.f / 128.f;
   QuantizedMulOpModel m({TensorType_INT16, {1, 2, 2, 1}, kMinInt16, kMaxInt16},
                         {TensorType_INT16, {1, 2, 2, 1}, kMinInt16, kMaxInt16},
-                        {TensorType_UINT8, {}, kMinUint8, kMaxUint8},
+                        {tensor_type, {}, kMinUint8, kMaxUint8},
                         ActivationFunctionType_NONE);
   m.QuantizeAndPopulate<int16_t>(m.input1(), {-0.8, 0.2, 0.9, 0.7});
   m.QuantizeAndPopulate<int16_t>(m.input2(), {0.6, 0.4, 0.9, 0.8});
   m.Invoke();
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<integer_dtype>(),
               ElementsAreArray(ArrayFloatNear({-0.48, 0.08, 0.81, 0.56},
                                               kQuantizedTolerance)));
 }
 
+TEST(QuantizedMulOpTest, NoActivationInt16WithUint8Output) {
+  NoActivationInt16With8BitOutput<TensorType_UINT8, uint8_t>();
+}
+
+TEST(QuantizedMulOpTest, NoActivationInt16Withint8Output) {
+  NoActivationInt16With8BitOutput<TensorType_INT8, int8_t>();
+}
+
 // for quantized Mul, the error shouldn't exceed 2*step
 float GetTolerance(int min, int max) {
   float kQuantizedStep = (max - min) / 255.0;
@@ -243,25 +262,35 @@ float GetTolerance(int min, int max) {
   return kQuantizedTolerance;
 }
 
-TEST(QuantizedMulOpTest, WithBroadcast) {
+template <TensorType tensor_type, typename integer_dtype>
+void WithBroadcast() {
   float kQuantizedTolerance = GetTolerance(-3.0, 3.0);
   std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
-    QuantizedMulOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0},
-                          {TensorType_UINT8, {}, -3.0, 3.0},  // always a scalar
-                          {TensorType_UINT8, {}, -3.0, 3.0},
+    QuantizedMulOpModel m({tensor_type, test_shapes[i], -3.0, 3.0},
+                          {tensor_type, {}, -3.0, 3.0},  // always a scalar
+                          {tensor_type, {}, -3.0, 3.0},
                           ActivationFunctionType_NONE);
-    m.QuantizeAndPopulate<uint8_t>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
-    m.QuantizeAndPopulate<uint8_t>(m.input2(), {0.1});
+    m.QuantizeAndPopulate<integer_dtype>(m.input1(),
+                                         {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
+    m.QuantizeAndPopulate<integer_dtype>(m.input2(), {0.1});
     m.Invoke();
-    EXPECT_THAT(m.GetDequantizedOutput(),
+    EXPECT_THAT(m.GetDequantizedOutput<integer_dtype>(),
                 ElementsAreArray(ArrayFloatNear(
                     {-0.2, 0.02, 0.07, 0.08, 0.11, 0.2}, kQuantizedTolerance)))
         << "With shape number " << i;
   }
 }
 
+TEST(QuantizedMulOpTest, WithBroadcastUInt8) {
+  WithBroadcast<TensorType_UINT8, uint8_t>();
+}
+
+TEST(QuantizedMulOpTest, WithBroadcastInt8) {
+  WithBroadcast<TensorType_INT8, int8_t>();
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/pack.cc b/tensorflow/lite/kernels/pack.cc
index 479495c875dac5d4e827864548c6b4a188e284ee..e26abaaff1e5c9e460621048eb15d0549b81fb36 100644
--- a/tensorflow/lite/kernels/pack.cc
+++ b/tensorflow/lite/kernels/pack.cc
@@ -35,13 +35,12 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
   const TfLiteTensor* input0 = GetInput(context, node, 0);
-  TF_LITE_ENSURE(context, NumDimensions(input0) < 4);
   TF_LITE_ENSURE(context, NumDimensions(input0) >= data->axis);
   // TODO(renjieliu): Support negative axis.
   TF_LITE_ENSURE(context, data->axis >= 0);
   if (input0->type != kTfLiteInt32 && input0->type != kTfLiteFloat32 &&
-      input0->type != kTfLiteUInt8 && input0->type != kTfLiteInt16 &&
-      input0->type != kTfLiteInt64) {
+      input0->type != kTfLiteUInt8 && input0->type != kTfLiteInt8 &&
+      input0->type != kTfLiteInt16 && input0->type != kTfLiteInt64) {
     context->ReportError(context, "Type '%s' is not supported by pack.",
                          TfLiteTypeGetName(input0->type));
     return kTfLiteError;
@@ -107,6 +106,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       PackImpl<uint8_t>(context, node, output, data->values_count, data->axis);
       break;
     }
+    case kTfLiteInt8: {
+      PackImpl<int8_t>(context, node, output, data->values_count, data->axis);
+      break;
+    }
     case kTfLiteInt32: {
       PackImpl<int32_t>(context, node, output, data->values_count, data->axis);
       break;
diff --git a/tensorflow/lite/kernels/pack_test.cc b/tensorflow/lite/kernels/pack_test.cc
index 4f58debc5c872ea640ed97cd51884a39b412ff2f..f44111567fc34f17912af7db352b47e57f8704f3 100644
--- a/tensorflow/lite/kernels/pack_test.cc
+++ b/tensorflow/lite/kernels/pack_test.cc
@@ -82,6 +82,19 @@ TEST(PackOpTest, FloatMultilDimensions) {
               ElementsAreArray({1, 2, 3, 7, 8, 9, 4, 5, 6, 10, 11, 12}));
 }
 
+TEST(PackOpTest, FloatFiveDimensions) {
+  PackOpModel<float> model({TensorType_FLOAT32, {2, 2, 2, 2}}, 1, 2);
+  model.SetInput(0, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  model.SetInput(
+      1, {17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(2, 2, 2, 2, 2));
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({1,  2,  3,  4,  5,  6,  7,  8,  17, 18, 19,
+                                20, 21, 22, 23, 24, 9,  10, 11, 12, 13, 14,
+                                15, 16, 25, 26, 27, 28, 29, 30, 31, 32}));
+}
+
 // int32 tests.
 TEST(PackOpTest, Int32ThreeInputs) {
   PackOpModel<int32_t> model({TensorType_INT32, {2}}, 0, 3);
@@ -178,6 +191,37 @@ TEST(PackOpTest, Uint8MultilDimensions) {
               ElementsAreArray({1, 2, 3, 7, 8, 9, 4, 5, 6, 10, 11, 12}));
 }
 
+// int8
+TEST(PackOpTest, Int8ThreeInputs) {
+  PackOpModel<int8_t> model({TensorType_INT8, {2}}, 0, 3);
+  model.SetInput(0, {1, 4});
+  model.SetInput(1, {2, 5});
+  model.SetInput(2, {3, 6});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(3, 2));
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 4, 2, 5, 3, 6}));
+}
+
+TEST(PackOpTest, Int8ThreeInputsDifferentAxis) {
+  PackOpModel<int8_t> model({TensorType_INT8, {2}}, 1, 3);
+  model.SetInput(0, {1, 4});
+  model.SetInput(1, {2, 5});
+  model.SetInput(2, {3, 6});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(2, 3));
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6}));
+}
+
+TEST(PackOpTest, Int8MultilDimensions) {
+  PackOpModel<int8_t> model({TensorType_INT8, {2, 3}}, 1, 2);
+  model.SetInput(0, {1, 2, 3, 4, 5, 6});
+  model.SetInput(1, {7, 8, 9, 10, 11, 12});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(2, 2, 3));
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({1, 2, 3, 7, 8, 9, 4, 5, 6, 10, 11, 12}));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/pad.cc b/tensorflow/lite/kernels/pad.cc
index 8e6ed6e741f782f070714164a7af7b4f98a1558f..b60b3dd9c871bf864492505dd9fa4aabf496364c 100644
--- a/tensorflow/lite/kernels/pad.cc
+++ b/tensorflow/lite/kernels/pad.cc
@@ -214,6 +214,31 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
         }
       }
     } break;
+    case kTfLiteInt8: {
+      int8_t pad_value;
+      if (op_context.constant_values == nullptr) {
+        // Quantized Pad requires that 0 is represented in the quantized
+        // range.
+        TF_LITE_ENSURE(context, op_context.output->params.zero_point >=
+                                    std::numeric_limits<int8_t>::min());
+        TF_LITE_ENSURE(context, op_context.output->params.zero_point <=
+                                    std::numeric_limits<int8_t>::max());
+        pad_value = static_cast<int8_t>(op_context.output->params.zero_point);
+      } else {
+        // Quantized Pad requires that 'constant_values' is represented in the
+        // same quantized range as the input and output tensors.
+        TF_LITE_ENSURE_EQ(context, op_context.output->params.zero_point,
+                          op_context.constant_values->params.zero_point);
+        TF_LITE_ENSURE_EQ(context, op_context.output->params.scale,
+                          op_context.constant_values->params.scale);
+        pad_value = *GetTensorData<int8_t>(op_context.constant_values);
+      }
+      if (op_context.resizing_category == ResizingCategory::kImageStyle) {
+        TF_LITE_PAD(reference_ops, PadImageStyle, int8_t, pad_value);
+      } else {
+        TF_LITE_PAD(reference_ops, Pad, int8_t, pad_value);
+      }
+    } break;
     case kTfLiteInt32: {
       int32_t pad_value =
           op_context.constant_values == nullptr
diff --git a/tensorflow/lite/kernels/pad_test.cc b/tensorflow/lite/kernels/pad_test.cc
index 415a285c707e6aa7a5a2029822cdf54d57692839..97f95264f1a376b502be1db76e2f84c392d6c1cf 100644
--- a/tensorflow/lite/kernels/pad_test.cc
+++ b/tensorflow/lite/kernels/pad_test.cc
@@ -24,31 +24,34 @@ namespace {
 using ::testing::ElementsAreArray;
 using ::testing::Matcher;
 
-template <typename T>
+template <typename RegularInputOuput, typename QuantizedInputOuput>
 class PadOpModel : public SingleOpModel {
  public:
-  void SetInput(std::initializer_list<T> data) {
-    PopulateTensor<T>(input_, data);
+  void SetInput(std::initializer_list<RegularInputOuput> data) {
+    PopulateTensor<RegularInputOuput>(input_, data);
   }
 
   void SetQuantizedInput(std::initializer_list<float> data) {
-    QuantizeAndPopulate<uint8_t>(input_, data);
+    QuantizeAndPopulate<QuantizedInputOuput>(input_, data);
   }
 
   void SetQuantizedPadValue(float data) {
-    QuantizeAndPopulate<uint8_t>(constant_values_, {data});
+    QuantizeAndPopulate<QuantizedInputOuput>(constant_values_, {data});
   }
 
   void SetPaddings(std::initializer_list<int> paddings) {
     PopulateTensor<int>(paddings_, paddings);
   }
 
-  std::vector<T> GetOutput() { return ExtractVector<T>(output_); }
+  std::vector<RegularInputOuput> GetOutput() {
+    return ExtractVector<RegularInputOuput>(output_);
+  }
   std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
 
   std::vector<float> GetDequantizedOutput() {
-    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
-                               GetScale(output_), GetZeroPoint(output_));
+    return Dequantize<QuantizedInputOuput>(
+        ExtractVector<QuantizedInputOuput>(output_), GetScale(output_),
+        GetZeroPoint(output_));
   }
 
  protected:
@@ -59,18 +62,18 @@ class PadOpModel : public SingleOpModel {
 };
 
 // Tests case where paddings is a const tensor. Type T is the dtype.
-template <typename T>
-class PadV2OpConstModel : public PadOpModel<T> {
+template <typename T1, typename T2>
+class PadV2OpConstModel : public PadOpModel<T1, T2> {
  public:
   PadV2OpConstModel(const TensorData& input,
                     std::initializer_list<int> paddings_shape,
-                    std::initializer_list<int> paddings, T constant_values,
+                    std::initializer_list<int> paddings, T1 constant_values,
                     const TensorData& output) {
     this->input_ = this->AddInput(input);
     this->paddings_ =
         this->AddConstInput(TensorType_INT32, paddings, paddings_shape);
     this->constant_values_ =
-        this->AddConstInput(GetTensorType<T>(), {constant_values}, {1});
+        this->AddConstInput(GetTensorType<T1>(), {constant_values}, {1});
 
     this->output_ = this->AddOutput(output);
 
@@ -103,7 +106,7 @@ class PadV2OpConstModel : public PadOpModel<T> {
 //    PadOpDynamicModel m(input_shape, paddings_shape, paddings_data);
 //    m.SetInput(input_data);
 //    m.Invoke();
-class PadOpConstModel : public PadOpModel<float> {
+class PadOpConstModel : public PadOpModel<float, uint8_t> {
  public:
   PadOpConstModel(const TensorData& input,
                   std::initializer_list<int> paddings_shape,
@@ -121,16 +124,18 @@ class PadOpConstModel : public PadOpModel<float> {
 };
 
 // Test case where paddings is a non-const tensor.
-template <typename T>
-class PadV2OpDynamicModel : public PadOpModel<T> {
+template <typename RegularInputOuput, typename QuantizedInputOuput>
+class PadV2OpDynamicModel
+    : public PadOpModel<RegularInputOuput, QuantizedInputOuput> {
  public:
   PadV2OpDynamicModel(const TensorData& input,
                       std::initializer_list<int> paddings_shape,
-                      T constant_values, const TensorData& output) {
+                      RegularInputOuput constant_values,
+                      const TensorData& output) {
     this->input_ = this->AddInput(input);
     this->paddings_ = this->AddInput(TensorType_INT32);
-    this->constant_values_ =
-        this->AddConstInput(GetTensorType<T>(), {constant_values}, {1});
+    this->constant_values_ = this->AddConstInput(
+        GetTensorType<RegularInputOuput>(), {constant_values}, {1});
     this->output_ = this->AddOutput(output);
 
     this->SetBuiltinOp(BuiltinOperator_PADV2, BuiltinOptions_PadV2Options,
@@ -159,7 +164,7 @@ class PadV2OpDynamicModel : public PadOpModel<T> {
 //    m.SetInput(input_data);
 //    m.SetPaddings(paddings_data);
 //    m.Invoke();
-class PadOpDynamicModel : public PadOpModel<float> {
+class PadOpDynamicModel : public PadOpModel<float, uint8_t> {
  public:
   PadOpDynamicModel(const TensorData& input,
                     std::initializer_list<int> paddings_shape,
@@ -175,6 +180,7 @@ class PadOpDynamicModel : public PadOpModel<float> {
   }
 };
 
+#ifdef GTEST_HAS_DEATH_TEST
 TEST(PadOpTest, TooManyDimensions) {
   EXPECT_DEATH(
       PadOpConstModel({TensorType_FLOAT32, {1, 2, 3, 4, 5, 6, 7, 8, 9}}, {9, 2},
@@ -195,6 +201,7 @@ TEST(PadOpTest, InvalidPadValue) {
                       {0, 0, 1, -1, 2, -1, 0, 0}, {TensorType_FLOAT32}),
       "Pad value has to be greater than equal to 0.");
 }
+#endif
 
 TEST(PadOpTest, SimpleConstTest) {
   // Padding is represented as four 2-D lists representing above padding and
@@ -306,6 +313,7 @@ class QuantizedPadOpTest : public ::testing::Test {
   }
 };
 
+#ifdef GTEST_HAS_DEATH_TEST
 TEST_F(QuantizedPadOpTest, ZeroNotInQuantizationRange) {
   // The test_util and actual quantization code currently ensure that the range
   // must include zero, but if that ever changes, this test will catch it.
@@ -314,6 +322,7 @@ TEST_F(QuantizedPadOpTest, ZeroNotInQuantizationRange) {
                                  {TensorType_UINT8, {}, 1.0, 2.0}),
                ".*Check failed: f_min <= 0.*");
 }
+#endif
 
 TEST_F(QuantizedPadOpTest, SimpleConstTest) {
   // Padding is represented as four 2-D lists representing above padding and
@@ -371,34 +380,49 @@ TEST_F(QuantizedPadOpTest, AdvancedDynamicTest) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 7, 1}));
 }
 
+#ifdef GTEST_HAS_DEATH_TEST
 TEST(PadV2OpTest, TooManyDimensions) {
-  EXPECT_DEATH(PadV2OpConstModel<float>(
-                   {TensorType_FLOAT32, {1, 2, 3, 4, 5, 6, 7, 8, 9}}, {9, 2},
-                   {1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9}, 0.0,
-                   {TensorType_FLOAT32}),
+  typedef PadV2OpConstModel<float, uint8_t> f;
+  EXPECT_DEATH(f({TensorType_FLOAT32, {1, 2, 3, 4, 5, 6, 7, 8, 9}}, {9, 2},
+                 {1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9}, 0.0,
+                 {TensorType_FLOAT32}),
                "dims <= 4");
 }
 
 TEST(PadV2OpTest, UnequalDimensions) {
-  EXPECT_DEATH(
-      PadV2OpConstModel<float>({TensorType_FLOAT32, {1, 1, 2, 1}}, {3, 2},
-                               {1, 1, 2, 2, 3, 3}, 0.0, {TensorType_FLOAT32}),
-      "3 != 4");
+  typedef PadV2OpConstModel<float, uint8_t> f;
+  EXPECT_DEATH(f({TensorType_FLOAT32, {1, 1, 2, 1}}, {3, 2}, {1, 1, 2, 2, 3, 3},
+                 0.0, {TensorType_FLOAT32}),
+               "3 != 4");
 }
 
 TEST(PadV2OpTest, InvalidPadValue) {
-  EXPECT_DEATH(PadV2OpConstModel<float>({TensorType_FLOAT32, {1, 1, 2, 1}},
-                                        {4, 2}, {0, 0, 1, -1, 2, -1, 0, 0}, 0.0,
-                                        {TensorType_FLOAT32}),
+  typedef PadV2OpConstModel<float, uint8_t> f;
+  EXPECT_DEATH(f({TensorType_FLOAT32, {1, 1, 2, 1}}, {4, 2},
+                 {0, 0, 1, -1, 2, -1, 0, 0}, 0.0, {TensorType_FLOAT32}),
                "Pad value has to be greater than equal to 0.");
 }
+#endif
+
+TEST(PadV2OpTest, SimpleConstTestUint8) {
+  // Padding is represented as four 2-D lists representing above padding and
+  // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
+  PadV2OpConstModel<float, uint8_t> m({TensorType_FLOAT32, {1, 2, 2, 1}},
+                                      {4, 2}, {0, 0, 1, 1, 1, 1, 0, 0}, 0.0,
+                                      {TensorType_FLOAT32});
+  m.SetInput({1, 2, 3, 4});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0, 0, 0, 0, 1, 2, 0, 0, 3, 4,
+                                               0, 0, 0, 0, 0}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+}
 
-TEST(PadV2OpTest, SimpleConstTest) {
+TEST(PadV2OpTest, SimpleConstTestInt8) {
   // Padding is represented as four 2-D lists representing above padding and
   // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
-  PadV2OpConstModel<float> m({TensorType_FLOAT32, {1, 2, 2, 1}}, {4, 2},
-                             {0, 0, 1, 1, 1, 1, 0, 0}, 0.0,
-                             {TensorType_FLOAT32});
+  PadV2OpConstModel<float, int8_t> m({TensorType_FLOAT32, {1, 2, 2, 1}}, {4, 2},
+                                     {0, 0, 1, 1, 1, 1, 0, 0}, 0.0,
+                                     {TensorType_FLOAT32});
   m.SetInput({1, 2, 3, 4});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0, 0, 0, 0, 1, 2, 0, 0, 3, 4,
@@ -406,11 +430,25 @@ TEST(PadV2OpTest, SimpleConstTest) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
 }
 
-TEST(PadV2OpTest, SimpleConstFloat32ValuedTest) {
+TEST(PadV2OpTest, SimpleConstFloat32ValuedTestUint8) {
   // Padding is represented as four 2-D lists representing above padding and
   // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
-  PadV2OpConstModel<float> m({TensorType_FLOAT32, {1, 2, 2, 1}}, {4, 2},
-                             {0, 0, 1, 1, 1, 1, 0, 0}, 5, {TensorType_FLOAT32});
+  PadV2OpConstModel<float, uint8_t> m({TensorType_FLOAT32, {1, 2, 2, 1}},
+                                      {4, 2}, {0, 0, 1, 1, 1, 1, 0, 0}, 5,
+                                      {TensorType_FLOAT32});
+  m.SetInput({1, 2, 3, 4});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({5, 5, 5, 5, 5, 1, 2, 5, 5, 3, 4,
+                                               5, 5, 5, 5, 5}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+}
+
+TEST(PadV2OpTest, SimpleConstFloat32ValuedTestInt8) {
+  // Padding is represented as four 2-D lists representing above padding and
+  // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
+  PadV2OpConstModel<float, int8_t> m({TensorType_FLOAT32, {1, 2, 2, 1}}, {4, 2},
+                                     {0, 0, 1, 1, 1, 1, 0, 0}, 5,
+                                     {TensorType_FLOAT32});
   m.SetInput({1, 2, 3, 4});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({5, 5, 5, 5, 5, 1, 2, 5, 5, 3, 4,
@@ -421,8 +459,9 @@ TEST(PadV2OpTest, SimpleConstFloat32ValuedTest) {
 TEST(PadV2OpTest, Simple4DConstFloat32ValuedTest) {
   // Padding is represented as four 2-D lists representing above padding and
   // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
-  PadV2OpConstModel<float> m({TensorType_FLOAT32, {1, 1, 2, 1}}, {4, 2},
-                             {0, 1, 0, 0, 0, 0, 0, 1}, 5, {TensorType_FLOAT32});
+  PadV2OpConstModel<float, uint8_t> m({TensorType_FLOAT32, {1, 1, 2, 1}},
+                                      {4, 2}, {0, 1, 0, 0, 0, 0, 0, 1}, 5,
+                                      {TensorType_FLOAT32});
   m.SetInput({3, 3});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({3, 5, 3, 5, 5, 5, 5, 5}));
@@ -432,8 +471,9 @@ TEST(PadV2OpTest, Simple4DConstFloat32ValuedTest) {
 TEST(PadV2OpTest, SimpleConstInt32ValuedTest) {
   // Padding is represented as four 2-D lists representing above padding and
   // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
-  PadV2OpConstModel<int32_t> m({TensorType_INT32, {1, 2, 2, 1}}, {4, 2},
-                               {0, 0, 1, 1, 1, 1, 0, 0}, 5, {TensorType_INT32});
+  PadV2OpConstModel<int32_t, uint8_t> m({TensorType_INT32, {1, 2, 2, 1}},
+                                        {4, 2}, {0, 0, 1, 1, 1, 1, 0, 0}, 5,
+                                        {TensorType_INT32});
   m.SetInput({1, 2, 3, 4});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({5, 5, 5, 5, 5, 1, 2, 5, 5, 3, 4,
@@ -442,8 +482,8 @@ TEST(PadV2OpTest, SimpleConstInt32ValuedTest) {
 }
 
 TEST(PadV2OpTest, SimpleDynamicTest) {
-  PadV2OpDynamicModel<float> m({TensorType_FLOAT32, {1, 2, 2, 1}}, {4, 2}, 0.0,
-                               {TensorType_FLOAT32});
+  PadV2OpDynamicModel<float, uint8_t> m({TensorType_FLOAT32, {1, 2, 2, 1}},
+                                        {4, 2}, 0.0, {TensorType_FLOAT32});
   m.SetInput({1, 2, 3, 4});
   m.SetPaddings({0, 0, 1, 1, 1, 1, 0, 0});
   m.Invoke();
@@ -453,8 +493,8 @@ TEST(PadV2OpTest, SimpleDynamicTest) {
 }
 
 TEST(PadV2OpTest, SimpleDynamicValuedTest) {
-  PadV2OpDynamicModel<float> m({TensorType_FLOAT32, {1, 2, 2, 1}}, {4, 2}, 5,
-                               {TensorType_FLOAT32});
+  PadV2OpDynamicModel<float, uint8_t> m({TensorType_FLOAT32, {1, 2, 2, 1}},
+                                        {4, 2}, 5, {TensorType_FLOAT32});
   m.SetInput({1, 2, 3, 4});
   m.SetPaddings({0, 0, 1, 1, 1, 1, 0, 0});
   m.Invoke();
@@ -464,8 +504,9 @@ TEST(PadV2OpTest, SimpleDynamicValuedTest) {
 }
 
 TEST(PadV2OpTest, AdvancedConstTest) {
-  PadV2OpConstModel<float> m({TensorType_FLOAT32, {1, 2, 3, 1}}, {4, 2},
-                             {0, 0, 0, 2, 1, 3, 0, 0}, 0, {TensorType_FLOAT32});
+  PadV2OpConstModel<float, uint8_t> m({TensorType_FLOAT32, {1, 2, 3, 1}},
+                                      {4, 2}, {0, 0, 0, 2, 1, 3, 0, 0}, 0,
+                                      {TensorType_FLOAT32});
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(),
@@ -475,8 +516,8 @@ TEST(PadV2OpTest, AdvancedConstTest) {
 }
 
 TEST(PadV2OpTest, AdvancedDynamicTest) {
-  PadV2OpDynamicModel<float> m({TensorType_FLOAT32, {1, 2, 3, 1}}, {4, 2}, 0,
-                               {TensorType_FLOAT32});
+  PadV2OpDynamicModel<float, uint8_t> m({TensorType_FLOAT32, {1, 2, 3, 1}},
+                                        {4, 2}, 0, {TensorType_FLOAT32});
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetPaddings({0, 0, 0, 2, 1, 3, 0, 0});
   m.Invoke();
@@ -495,23 +536,24 @@ class QuantizedPadV2OpTest : public ::testing::Test {
   }
 };
 
+#ifdef GTEST_HAS_DEATH_TEST
 TEST_F(QuantizedPadV2OpTest, ZeroNotInQuantizationRange) {
   // The test_util and actual quantization code currently ensure that the range
   // must include zero, but if that ever changes, this test will catch it.
-  EXPECT_DEATH(
-      PadV2OpConstModel<float> m({TensorType_UINT8, {1, 2, 2, 1}, 1.0, 2.0},
-                                 {4, 2}, {0, 0, 1, 1, 1, 1, 0, 0}, 0,
-                                 {TensorType_UINT8, {}, 1.0, 2.0}),
-      ".*Check failed: f_min <= 0.*");
+  typedef PadV2OpConstModel<float, uint8_t> f;
+  EXPECT_DEATH(f({TensorType_UINT8, {1, 2, 2, 1}, 1.0, 2.0}, {4, 2},
+                 {0, 0, 1, 1, 1, 1, 0, 0}, 0, {TensorType_UINT8, {}, 1.0, 2.0}),
+               ".*Check failed: f_min <= 0.*");
 }
+#endif
 
 TEST_F(QuantizedPadV2OpTest, SimpleConstTest) {
   // Padding is represented as four 2-D lists representing above padding and
   // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
-  PadV2OpConstModel<uint8_t> m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                               {4, 2}, {0, 0, 1, 1, 1, 1, 0, 0},
-                               {TensorType_UINT8, {1}, -1.0, 1.0},
-                               {TensorType_UINT8, {}, -1.0, 1.0});
+  PadV2OpConstModel<uint8_t, uint8_t> m(
+      {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0}, {4, 2},
+      {0, 0, 1, 1, 1, 1, 0, 0}, {TensorType_UINT8, {1}, -1.0, 1.0},
+      {TensorType_UINT8, {}, -1.0, 1.0});
   m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7});
   m.SetQuantizedPadValue(0);
   m.Invoke();
@@ -523,9 +565,9 @@ TEST_F(QuantizedPadV2OpTest, SimpleConstTest) {
 }
 
 TEST_F(QuantizedPadV2OpTest, SimpleDynamicTest) {
-  PadV2OpDynamicModel<uint8_t> m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                                 {4, 2}, {TensorType_UINT8, {1}, -1.0, 1.0},
-                                 {TensorType_UINT8, {}, -1.0, 1.0});
+  PadV2OpDynamicModel<uint8_t, uint8_t> m(
+      {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0}, {4, 2},
+      {TensorType_UINT8, {1}, -1.0, 1.0}, {TensorType_UINT8, {}, -1.0, 1.0});
   m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7});
   m.SetQuantizedPadValue(0);
   m.SetPaddings({0, 0, 1, 1, 1, 1, 0, 0});
@@ -538,10 +580,10 @@ TEST_F(QuantizedPadV2OpTest, SimpleDynamicTest) {
 }
 
 TEST_F(QuantizedPadV2OpTest, AdvancedConstTest) {
-  PadV2OpConstModel<uint8_t> m({TensorType_UINT8, {1, 2, 3, 1}, -1.0, 1.0},
-                               {4, 2}, {0, 0, 0, 2, 1, 3, 0, 0},
-                               {TensorType_UINT8, {1}, -1.0, 1.0},
-                               {TensorType_UINT8, {}, -1.0, 1.0});
+  PadV2OpConstModel<uint8_t, uint8_t> m(
+      {TensorType_UINT8, {1, 2, 3, 1}, -1.0, 1.0}, {4, 2},
+      {0, 0, 0, 2, 1, 3, 0, 0}, {TensorType_UINT8, {1}, -1.0, 1.0},
+      {TensorType_UINT8, {}, -1.0, 1.0});
   m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7, 0.1, -0.3});
   m.SetQuantizedPadValue(0);
   m.Invoke();
@@ -554,9 +596,9 @@ TEST_F(QuantizedPadV2OpTest, AdvancedConstTest) {
 }
 
 TEST_F(QuantizedPadV2OpTest, AdvancedDynamicTest) {
-  PadV2OpDynamicModel<uint8_t> m({TensorType_UINT8, {1, 2, 3, 1}, -1.0, 1.0},
-                                 {4, 2}, {TensorType_UINT8, {1}, -1.0, 1.0},
-                                 {TensorType_UINT8, {}, -1.0, 1.0});
+  PadV2OpDynamicModel<uint8_t, uint8_t> m(
+      {TensorType_UINT8, {1, 2, 3, 1}, -1.0, 1.0}, {4, 2},
+      {TensorType_UINT8, {1}, -1.0, 1.0}, {TensorType_UINT8, {}, -1.0, 1.0});
   m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7, 0.1, -0.3});
   m.SetQuantizedPadValue(0);
   m.SetPaddings({0, 0, 0, 2, 1, 3, 0, 0});
@@ -572,10 +614,10 @@ TEST_F(QuantizedPadV2OpTest, AdvancedDynamicTest) {
 TEST_F(QuantizedPadV2OpTest, SimpleConstValuedTest) {
   // Padding is represented as four 2-D lists representing above padding and
   // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
-  PadV2OpConstModel<uint8_t> m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                               {4, 2}, {0, 0, 1, 1, 1, 1, 0, 0},
-                               {TensorType_UINT8, {1}, -1.0, 1.0},
-                               {TensorType_UINT8, {}, -1.0, 1.0});
+  PadV2OpConstModel<uint8_t, uint8_t> m(
+      {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0}, {4, 2},
+      {0, 0, 1, 1, 1, 1, 0, 0}, {TensorType_UINT8, {1}, -1.0, 1.0},
+      {TensorType_UINT8, {}, -1.0, 1.0});
   m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7});
   m.SetQuantizedPadValue(-0.5);
   m.Invoke();
@@ -588,9 +630,9 @@ TEST_F(QuantizedPadV2OpTest, SimpleConstValuedTest) {
 }
 
 TEST_F(QuantizedPadV2OpTest, SimpleDynamicValuedTest) {
-  PadV2OpDynamicModel<uint8_t> m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                                 {4, 2}, {TensorType_UINT8, {1}, -1.0, 1.0},
-                                 {TensorType_UINT8, {}, -1.0, 1.0});
+  PadV2OpDynamicModel<uint8_t, uint8_t> m(
+      {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0}, {4, 2},
+      {TensorType_UINT8, {1}, -1.0, 1.0}, {TensorType_UINT8, {}, -1.0, 1.0});
   m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7});
   m.SetQuantizedPadValue(-0.5);
   m.SetPaddings({0, 0, 1, 1, 1, 1, 0, 0});
@@ -604,10 +646,10 @@ TEST_F(QuantizedPadV2OpTest, SimpleDynamicValuedTest) {
 }
 
 TEST_F(QuantizedPadV2OpTest, AdvancedConstValuedTest) {
-  PadV2OpConstModel<uint8_t> m({TensorType_UINT8, {1, 2, 3, 1}, -1.0, 1.0},
-                               {4, 2}, {0, 0, 0, 2, 1, 3, 0, 0},
-                               {TensorType_UINT8, {1}, -1.0, 1.0},
-                               {TensorType_UINT8, {}, -1.0, 1.0});
+  PadV2OpConstModel<uint8_t, uint8_t> m(
+      {TensorType_UINT8, {1, 2, 3, 1}, -1.0, 1.0}, {4, 2},
+      {0, 0, 0, 2, 1, 3, 0, 0}, {TensorType_UINT8, {1}, -1.0, 1.0},
+      {TensorType_UINT8, {}, -1.0, 1.0});
   m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7, 0.1, -0.3});
   m.SetQuantizedPadValue(-0.5);
   m.Invoke();
@@ -621,9 +663,9 @@ TEST_F(QuantizedPadV2OpTest, AdvancedConstValuedTest) {
 }
 
 TEST_F(QuantizedPadV2OpTest, AdvancedDynamicValuedTest) {
-  PadV2OpDynamicModel<uint8_t> m({TensorType_UINT8, {1, 2, 3, 1}, -1.0, 1.0},
-                                 {4, 2}, {TensorType_UINT8, {1}, -1.0, 1.0},
-                                 {TensorType_UINT8, {}, -1.0, 1.0});
+  PadV2OpDynamicModel<uint8_t, uint8_t> m(
+      {TensorType_UINT8, {1, 2, 3, 1}, -1.0, 1.0}, {4, 2},
+      {TensorType_UINT8, {1}, -1.0, 1.0}, {TensorType_UINT8, {}, -1.0, 1.0});
   m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7, 0.1, -0.3});
   m.SetQuantizedPadValue(-0.5);
   m.SetPaddings({0, 0, 0, 2, 1, 3, 0, 0});
diff --git a/tensorflow/lite/kernels/pooling.cc b/tensorflow/lite/kernels/pooling.cc
index 694a36ffbcf3c8c9d8fe65e1b922ca03921883b3..bdf736dcfb278ad93f43c25b9ae1c0b4038b695f 100644
--- a/tensorflow/lite/kernels/pooling.cc
+++ b/tensorflow/lite/kernels/pooling.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h"
 #include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
 #include "tensorflow/lite/kernels/internal/tensor.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
@@ -98,7 +99,7 @@ TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) {
   data->padding.width = ComputePadding(params->stride_width, 1, width,
                                        params->filter_width, out_width);
 
-  if (input->type == kTfLiteUInt8) {
+  if (input->type == kTfLiteUInt8 || input->type == kTfLiteInt8) {
     if (pool_type == kAverage || pool_type == kMax) {
       TF_LITE_ENSURE_EQ(context, input->params.scale, output->params.scale);
       TF_LITE_ENSURE_EQ(context, input->params.zero_point,
@@ -147,9 +148,10 @@ void AverageEvalFloat(TfLiteContext* context, TfLiteNode* node,
 }
 
 template <KernelType kernel_type>
-void AverageEvalQuantized(TfLiteContext* context, TfLiteNode* node,
-                          TfLitePoolParams* params, OpData* data,
-                          const TfLiteTensor* input, TfLiteTensor* output) {
+void AverageEvalQuantizedUint8(TfLiteContext* context, TfLiteNode* node,
+                               TfLitePoolParams* params, OpData* data,
+                               const TfLiteTensor* input,
+                               TfLiteTensor* output) {
   int32_t activation_min;
   int32_t activation_max;
   CalculateActivationRangeUint8(params->activation, output, &activation_min,
@@ -175,6 +177,27 @@ void AverageEvalQuantized(TfLiteContext* context, TfLiteNode* node,
 #undef TF_LITE_AVERAGE_POOL
 }
 
+void AverageEvalQuantizedInt8(TfLiteContext* context, TfLiteNode* node,
+                              TfLitePoolParams* params, OpData* data,
+                              const TfLiteTensor* input, TfLiteTensor* output) {
+  int32_t activation_min;
+  int32_t activation_max;
+  CalculateActivationRangeInt8(params->activation, output, &activation_min,
+                               &activation_max);
+  tflite::PoolParams op_params;
+  op_params.stride_height = params->stride_height;
+  op_params.stride_width = params->stride_width;
+  op_params.filter_height = params->filter_height;
+  op_params.filter_width = params->filter_width;
+  op_params.padding_values.height = data->padding.height;
+  op_params.padding_values.width = data->padding.width;
+  op_params.quantized_activation_min = activation_min;
+  op_params.quantized_activation_max = activation_max;
+  reference_integer_ops::AveragePool(
+      op_params, GetTensorShape(input), GetTensorData<int8_t>(input),
+      GetTensorShape(output), GetTensorData<int8_t>(output));
+}
+
 template <KernelType kernel_type>
 void MaxEvalFloat(TfLiteContext* context, TfLiteNode* node,
                   TfLitePoolParams* params, OpData* data,
@@ -203,9 +226,9 @@ void MaxEvalFloat(TfLiteContext* context, TfLiteNode* node,
 }
 
 template <KernelType kernel_type>
-void MaxEvalQuantized(TfLiteContext* context, TfLiteNode* node,
-                      TfLitePoolParams* params, OpData* data,
-                      const TfLiteTensor* input, TfLiteTensor* output) {
+void MaxEvalQuantizedUInt8(TfLiteContext* context, TfLiteNode* node,
+                           TfLitePoolParams* params, OpData* data,
+                           const TfLiteTensor* input, TfLiteTensor* output) {
   int32_t activation_min;
   int32_t activation_max;
   CalculateActivationRangeUint8(params->activation, output, &activation_min,
@@ -231,6 +254,31 @@ void MaxEvalQuantized(TfLiteContext* context, TfLiteNode* node,
 #undef TF_LITE_MAX_POOL
 }
 
+template <KernelType kernel_type>
+void MaxEvalQuantizedInt8(TfLiteContext* context, TfLiteNode* node,
+                          TfLitePoolParams* params, OpData* data,
+                          const TfLiteTensor* input, TfLiteTensor* output) {
+  int32_t activation_min;
+  int32_t activation_max;
+  CalculateActivationRangeInt8(params->activation, output, &activation_min,
+                               &activation_max);
+#define TF_LITE_MAX_POOL(type)                                        \
+  tflite::PoolParams op_params;                                       \
+  op_params.stride_height = params->stride_height;                    \
+  op_params.stride_width = params->stride_width;                      \
+  op_params.filter_height = params->filter_height;                    \
+  op_params.filter_width = params->filter_width;                      \
+  op_params.padding_values.height = data->padding.height;             \
+  op_params.padding_values.width = data->padding.width;               \
+  op_params.quantized_activation_min = activation_min;                \
+  op_params.quantized_activation_max = activation_max;                \
+  type::MaxPool(op_params, GetTensorShape(input),                     \
+                GetTensorData<int8_t>(input), GetTensorShape(output), \
+                GetTensorData<int8_t>(output))
+  TF_LITE_MAX_POOL(reference_integer_ops);
+#undef TF_LITE_MAX_POOL
+}
+
 template <KernelType kernel_type>
 void L2EvalFloat(TfLiteContext* context, TfLiteNode* node,
                  TfLitePoolParams* params, OpData* data,
@@ -272,8 +320,11 @@ TfLiteStatus AverageEval(TfLiteContext* context, TfLiteNode* node) {
       AverageEvalFloat<kernel_type>(context, node, params, data, input, output);
       break;
     case kTfLiteUInt8:
-      AverageEvalQuantized<kernel_type>(context, node, params, data, input,
-                                        output);
+      AverageEvalQuantizedUint8<kernel_type>(context, node, params, data, input,
+                                             output);
+      break;
+    case kTfLiteInt8:
+      AverageEvalQuantizedInt8(context, node, params, data, input, output);
       break;
     default:
       context->ReportError(context, "Type %d not currently supported.",
@@ -295,7 +346,12 @@ TfLiteStatus MaxEval(TfLiteContext* context, TfLiteNode* node) {
       MaxEvalFloat<kernel_type>(context, node, params, data, input, output);
       break;
     case kTfLiteUInt8:
-      MaxEvalQuantized<kernel_type>(context, node, params, data, input, output);
+      MaxEvalQuantizedUInt8<kernel_type>(context, node, params, data, input,
+                                         output);
+      break;
+    case kTfLiteInt8:
+      MaxEvalQuantizedInt8<kernel_type>(context, node, params, data, input,
+                                        output);
       break;
     default:
       context->ReportError(context, "Type %d not currently supported.",
diff --git a/tensorflow/lite/kernels/pooling_test.cc b/tensorflow/lite/kernels/pooling_test.cc
index 98777f1c13ff97551c05cddc1d319918ea6ed69a..4627d7a5f0c2803635b9df85dd9275cc7851e8fb 100644
--- a/tensorflow/lite/kernels/pooling_test.cc
+++ b/tensorflow/lite/kernels/pooling_test.cc
@@ -78,6 +78,43 @@ class QuantizedPoolingOpModel : public BasePoolingOpModel {
   }
 };
 
+class SymmetricQuantizedPoolingOpModel : public BasePoolingOpModel {
+ public:
+  using BasePoolingOpModel::BasePoolingOpModel;
+
+  void SetInput(std::initializer_list<float> data) {
+    QuantizeAndPopulate<int8_t>(input_, data);
+  }
+
+  void SetInput(const std::vector<float>& data) {
+    QuantizeAndPopulate<int8_t>(input_, data);
+  }
+
+  std::vector<int8_t> GetOutput() { return ExtractVector<int8_t>(output_); }
+  std::vector<float> GetDequantizedOutput() {
+    return Dequantize<int8_t>(ExtractVector<int8_t>(output_), GetScale(output_),
+                              GetZeroPoint(output_));
+  }
+};
+
+// Replicate each entry in a vector n times along depth (innermost dimension).
+// The values are incremented by delta, creating ramps offset by each input
+// value. This is used to create simple and predicatable variation.
+std::vector<float> ReplicateDepthRamp(const std::vector<float>& image_plane,
+                                      int n, float delta) {
+  const int size = image_plane.size();
+  std::vector<float> ramped_data(n * size);
+  // The input is treated as a 1-D even if logically it is multi-dimensional.
+  for (int input_index = 0; input_index < size; ++input_index) {
+    for (int depth = 0; depth < n; ++depth) {
+      ramped_data[n * input_index + depth] =
+          image_plane[input_index] + depth * delta;
+    }
+  }
+
+  return ramped_data;
+}
+
 TEST(FloatPoolingOpTest, AveragePool) {
   FloatPoolingOpModel m(BuiltinOperator_AVERAGE_POOL_2D,
                         /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}},
@@ -128,6 +165,54 @@ TEST(QuantizedPoolingOpTest, AveragePoolImageSize16) {
   EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear({16})));
 }
 
+TEST(QuantizedPoolingOpTest, AveragePoolLargeDepth) {
+  // Test with a larger depth that is not a multiple of the tranche size, or of
+  // any register-oriented multiples such as 8 and 16.
+  constexpr int depth = 1999;  // Prime number.
+  QuantizedPoolingOpModel m(
+      BuiltinOperator_AVERAGE_POOL_2D,
+      /*input=*/{TensorType_UINT8, {1, 2, 4, depth}, 0, 15.9375},
+      /*filter_width=*/2, /*filter_height=*/2,
+      /*output=*/{TensorType_UINT8, {}, 0, 15.9375});
+
+  std::vector<float> input_image_plane({
+      0.f, 6.f, 2.f, 4.f,   //
+      3.f, 2.f, 10.f, 7.f,  //
+  });
+  std::vector<float> output_image_plane({2.75f, 5.75f});
+
+  m.SetInput(ReplicateDepthRamp(input_image_plane, depth, 1.f / 512.f));
+  m.Invoke();
+
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear(
+                  ReplicateDepthRamp(output_image_plane, depth, 1.f / 512.f),
+                  1. / 32.f)));
+}
+
+// Test quantized AveragePool with int8 input and output. The input is the same
+// as the uint8 test QuantizedPoolingOpTest.AveragePool. The float output is
+// identical to uint8 test and quantized output is identical to uint8 test with
+// a 128 shift.
+TEST(QuantizedPoolingOpTest, SymmetricAveragePool) {
+  // Choose the input ranges carefully so that the dequantized output matches
+  // the results of the float model above.
+  SymmetricQuantizedPoolingOpModel m(
+      BuiltinOperator_AVERAGE_POOL_2D,
+      /*input=*/{TensorType_INT8, {1, 2, 4, 1}, 0, 15.9375},
+      /*filter_width=*/2, /*filter_height=*/2,
+      /*output=*/{TensorType_INT8, {}, 0, 15.9375});
+  m.SetInput({
+      0, 6, 2, 4,   //
+      3, 2, 10, 7,  //
+  });
+  m.Invoke();
+
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear({2.75, 5.75})));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({44 - 128, 92 - 128}));
+}
+
 // Send in a white image, expect something other than a white pixel, due to
 // overflow.
 TEST(QuantizedPoolingOpTest, AveragePoolImageSize17) {
@@ -162,7 +247,7 @@ TEST(FloatPoolingOpTest, MaxPool) {
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({6, 10}));
 }
 
-TEST(QuantizedPoolingOpTest, MaxPool) {
+TEST(QuantizedUInt8PoolingOpTest, MaxPool) {
   // Choose the input ranges carefully so that the dequantized output matches
   // the results of the float model above.
   QuantizedPoolingOpModel m(
@@ -181,6 +266,50 @@ TEST(QuantizedPoolingOpTest, MaxPool) {
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({96, 160}));
 }
 
+TEST(QuantizedPoolingOpTest, MaxPoolLargeDepth) {
+  // Test with a larger depth that is not a multiple of the tranche size, or of
+  // any register-oriented multiples such as 8 and 16.
+  constexpr int depth = 1999;  // Prime number.
+  QuantizedPoolingOpModel m(
+      BuiltinOperator_MAX_POOL_2D,
+      /*input=*/{TensorType_UINT8, {1, 2, 4, depth}, 0, 15.9375},
+      /*filter_width=*/2, /*filter_height=*/2,
+      /*output=*/{TensorType_UINT8, {}, 0, 15.9375});
+
+  std::vector<float> input_image_plane({
+      0.f, 6.f, 2.f, 4.f,   //
+      3.f, 2.f, 10.f, 7.f,  //
+  });
+  std::vector<float> output_image_plane({6.f, 10.f});
+
+  m.SetInput(ReplicateDepthRamp(input_image_plane, depth, 1.f / 512.f));
+  m.Invoke();
+
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear(
+                  ReplicateDepthRamp(output_image_plane, depth, 1.f / 512.f),
+                  1. / 32.f)));
+}
+
+TEST(QuantizedInt8PoolingOpTest, MaxPool) {
+  // Choose the input ranges carefully so that the dequantized output matches
+  // the results of the float model above.
+  SymmetricQuantizedPoolingOpModel m(
+      BuiltinOperator_MAX_POOL_2D,
+      /*input=*/{TensorType_INT8, {1, 2, 4, 1}, 0, 15.9375},
+      /*filter_width=*/2, /*filter_height=*/2,
+      /*output=*/{TensorType_INT8, {}, 0, 15.9375});
+  m.SetInput({
+      0, -6, 2, 4,   //
+      3, 2, -10, 7,  //
+  });
+  m.Invoke();
+
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear({3, 7})));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({-80, -16}));
+}
+
 TEST(FloatPoolingOpTest, L2Pool) {
   FloatPoolingOpModel m(BuiltinOperator_L2_POOL_2D,
                         /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}},
diff --git a/tensorflow/lite/kernels/rank.cc b/tensorflow/lite/kernels/rank.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8cef1f53a024b833034deb497909beac4b4753e6
--- /dev/null
+++ b/tensorflow/lite/kernels/rank.cc
@@ -0,0 +1,65 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace rank {
+
+constexpr int kInputTensor = 0;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  output->type = kTfLiteInt32;
+
+  // Rank produces a 0-D int32 Tensor representing the rank of input.
+  TfLiteIntArray* output_size = TfLiteIntArrayCreate(0);
+  return context->ResizeTensor(context, output, output_size);
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(output), 0);
+
+  if (output->type == kTfLiteInt32) {
+    int32_t* output_data = GetTensorData<int32_t>(output);
+    *output_data = NumDimensions(input);
+  } else {
+    return kTfLiteError;
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace rank
+
+TfLiteRegistration* Register_RANK() {
+  static TfLiteRegistration r = {nullptr, nullptr, rank::Prepare, rank::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/rank_test.cc b/tensorflow/lite/kernels/rank_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3c31fc5866931708eb8155c2dc88026b623039ed
--- /dev/null
+++ b/tensorflow/lite/kernels/rank_test.cc
@@ -0,0 +1,91 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <initializer_list>
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class RankOpModel : public SingleOpModel {
+ public:
+  RankOpModel(std::initializer_list<int> input_shape, TensorType input_type) {
+    TensorType output_type = TensorType_INT32;
+    input_ = AddInput(input_type);
+    output_ = AddOutput(output_type);
+    SetBuiltinOp(BuiltinOperator_RANK, BuiltinOptions_RankOptions,
+                 CreateRankOptions(builder_).Union());
+    BuildInterpreter({input_shape});
+  }
+
+  TfLiteStatus InvokeWithResult() { return interpreter_->Invoke(); }
+
+  int input() { return input_; }
+
+  std::vector<int32_t> GetOutput() { return ExtractVector<int32_t>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  int input_;
+  int output_;
+};
+
+TEST(RankOpTest, InputTypeFloat) {
+  RankOpModel model({1, 3, 1, 3, 5}, TensorType_FLOAT32);
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({5}));
+  EXPECT_TRUE(model.GetOutputShape().empty());
+}
+
+TEST(RankOpTest, InputTypeInt) {
+  RankOpModel model({1, 3, 1, 3, 5}, TensorType_INT32);
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({5}));
+  EXPECT_TRUE(model.GetOutputShape().empty());
+}
+
+TEST(RankOpTest, ScalarTensor) {
+  RankOpModel model({}, TensorType_FLOAT32);
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({0}));
+  EXPECT_TRUE(model.GetOutputShape().empty());
+}
+
+TEST(RankOpTest, EmptyTensor) {
+  RankOpModel model({1, 0}, TensorType_FLOAT32);
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({2}));
+  EXPECT_TRUE(model.GetOutputShape().empty());
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/kernels/reduce.cc b/tensorflow/lite/kernels/reduce.cc
index 336e827ca4c76abf3a08492249dfc0ce9cd81439..3fb2715e48fabeee09b5d6670b6a45cf26d89bc8 100644
--- a/tensorflow/lite/kernels/reduce.cc
+++ b/tensorflow/lite/kernels/reduce.cc
@@ -180,6 +180,9 @@ TfLiteStatus InitializeTemporaries(TfLiteContext* context, TfLiteNode* node,
     case kTfLiteUInt8:
       temp_sum->type = kTfLiteInt32;
       break;
+    case kTfLiteInt8:
+      temp_sum->type = kTfLiteInt32;
+      break;
     case kTfLiteBool:
       temp_sum->type = kTfLiteBool;
       break;
@@ -257,6 +260,35 @@ TfLiteStatus EvalMean(TfLiteContext* context, TfLiteNode* node) {
     TF_LITE_ENSURE_OK(context, ResizeTempSum(context, &op_context, temp_sum));
   }
 
+  // Defer to specialized implementation for 4D Mean across axes 1 & 2.
+  if (op_context.input->type == kTfLiteFloat32 ||
+      op_context.input->type == kTfLiteUInt8) {
+    tflite::MeanParams op_params;
+    op_params.axis_count = num_axis;
+    ResolveAxis(GetTensorData<int>(op_context.axis), num_axis, &op_params);
+    const TfLiteTensor* input = op_context.input;
+    if (op_context.params->keep_dims && NumDimensions(input) == 4 &&
+        op_params.axis_count == 2 &&
+        ((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
+         (op_params.axis[0] == 2 && op_params.axis[1] == 1))) {
+      if (op_context.input->type == kTfLiteUInt8) {
+        reference_ops::Mean(
+            op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
+            op_context.input->params.zero_point, op_context.input->params.scale,
+            GetTensorShape(op_context.output),
+            GetTensorData<uint8_t>(op_context.output),
+            op_context.output->params.zero_point,
+            op_context.output->params.scale);
+      } else {
+        reference_ops::Mean(op_params, GetTensorShape(input),
+                            GetTensorData<float>(input),
+                            GetTensorShape(op_context.output),
+                            GetTensorData<float>(op_context.output));
+      }
+      return kTfLiteOk;
+    }
+  }
+
 #define TF_LITE_MEAN(kernel_type, data_type, temp_data_type)        \
   kernel_type::Mean<>(                                              \
       GetTensorData<data_type>(op_context.input),                   \
@@ -436,6 +468,9 @@ TfLiteStatus EvalGeneric(TfLiteContext* context, TfLiteNode* node) {
     case kTfLiteUInt8:
       return EvalType<uint8_t>(context, node, &op_context, reduce_type);
       break;
+    case kTfLiteInt8:
+      return EvalType<int8_t>(context, node, &op_context, reduce_type);
+      break;
     case kTfLiteBool:
       return EvalType<bool>(context, node, &op_context, reduce_type);
       break;
diff --git a/tensorflow/lite/kernels/reduce_test.cc b/tensorflow/lite/kernels/reduce_test.cc
index c1526bddb719e74a6396dc4aeac4b5827220a65a..373fa56fc1983baec55ed6421950012f4564e9c6 100644
--- a/tensorflow/lite/kernels/reduce_test.cc
+++ b/tensorflow/lite/kernels/reduce_test.cc
@@ -38,9 +38,10 @@ class BaseOpModel : public SingleOpModel {
     return ExtractVector<T>(output_);
   }
 
+  template <typename T>
   std::vector<float> GetDequantizedOutput() {
-    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
-                               GetScale(output_), GetZeroPoint(output_));
+    return Dequantize<T>(ExtractVector<T>(output_), GetScale(output_),
+                         GetZeroPoint(output_));
   }
 
   std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
@@ -256,6 +257,49 @@ TEST(ConstFloatMeanOpTest, KeepDims) {
               ElementsAreArray(ArrayFloatNear({10.5, 12.5, 14.5})));
 }
 
+// Uses a set of reduction conditions that trigger the specialized 4D version
+// of Mean.
+TEST(ConstFloatMeanOpTest, KeepDims_4DMean) {
+  std::vector<float> data = {1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,
+                             9.0,  10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+                             17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0};
+  MeanOpConstModel m({TensorType_FLOAT32, {2, 2, 3, 2}},
+                     {TensorType_FLOAT32, {3}}, {2}, {1, 2}, true);
+  m.SetInput(data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 1, 1, 2}));
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray(ArrayFloatNear({6, 7, 18, 19})));
+}
+
+TEST(ConstFloatMeanOpTest, KeepDims_4DMean_UInt8) {
+  float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
+  std::vector<float> data = {0.1, 0.2, 0.3, 0.4, 0.1, 0.2,
+                             0.3, 0.4, 0.1, 0.2, 0.3, 0.4};
+  MeanOpConstModel m({TensorType_UINT8, {1, 2, 2, 3}, -1.0, 1.0},
+                     {TensorType_UINT8, {2}, -1.0, 1.0}, {2}, {1, 2}, true);
+  m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 1, 1, 3}));
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(ArrayFloatNear({0.25098, 0.25098, 0.25098},
+                                              kQuantizedTolerance)));
+}
+
+TEST(ConstFloatMeanOpTest, KeepDims_4DMean_Quantized) {
+  float kQuantizedTolerance = GetTolerance(-5.0, 5.0);
+  std::vector<float> data = {0.1, 0.2, 0.3, 0.4, 0.1, 0.2,
+                             0.3, 0.4, 0.1, 0.2, 0.3, 0.4};
+  MeanOpConstModel m({TensorType_UINT8, {1, 2, 3, 2}, 0.0, 1.0},
+                     {TensorType_UINT8, {3}, -5.0, 5.0}, {2}, {1, 2}, true);
+  m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 1, 1, 2}));
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(
+                  ArrayFloatNear({0.235294, 0.313726}, kQuantizedTolerance)));
+}
+
 TEST(ConstFloatMeanOpTest, Scalar) {
   std::vector<float> data = {3.27};
   MeanOpConstModel m({TensorType_FLOAT32, {}}, {TensorType_FLOAT32, {}}, {},
@@ -318,8 +362,9 @@ TEST(ConstUint8MeanOpTest, NotKeepDims) {
   m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
-  EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear(
-                                            {0.4, 0.4}, kQuantizedTolerance)));
+  EXPECT_THAT(
+      m.GetDequantizedOutput<uint8_t>(),
+      ElementsAreArray(ArrayFloatNear({0.4, 0.4}, kQuantizedTolerance)));
 }
 
 TEST(ConstUint8MeanOpTest, KeepDims) {
@@ -331,7 +376,7 @@ TEST(ConstUint8MeanOpTest, KeepDims) {
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1}));
   EXPECT_THAT(
-      m.GetDequantizedOutput(),
+      m.GetDequantizedOutput<uint8_t>(),
       ElementsAreArray(ArrayFloatNear({0.3, 0.35, 0.55}, kQuantizedTolerance)));
 }
 
@@ -347,7 +392,7 @@ TEST(DynamicUint8MeanOpTest, NotKeepDims) {
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
   EXPECT_THAT(
-      m.GetDequantizedOutput(),
+      m.GetDequantizedOutput<uint8_t>(),
       ElementsAreArray(ArrayFloatNear({-1.75, -1.68}, kQuantizedTolerance)));
 }
 
@@ -363,7 +408,7 @@ TEST(DynamicUint8MeanOpTest, KeepDims) {
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
   EXPECT_THAT(
-      m.GetDequantizedOutput(),
+      m.GetDequantizedOutput<uint8_t>(),
       ElementsAreArray(ArrayFloatNear({9.2815, 0.3695}, kQuantizedTolerance)));
 }
 
@@ -377,7 +422,7 @@ TEST(DynamicUint8MeanOpTest, QuantizedScalar) {
   m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), IsEmpty());
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
               ElementsAreArray(ArrayFloatNear({0.643}, kQuantizedTolerance)));
 }
 
@@ -390,7 +435,7 @@ TEST(ConstUint8MeanOpTest, QuantizedKeepDims) {
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1}));
   EXPECT_THAT(
-      m.GetDequantizedOutput(),
+      m.GetDequantizedOutput<uint8_t>(),
       ElementsAreArray(ArrayFloatNear({0.3, 0.35, 0.55}, kQuantizedTolerance)));
 }
 
@@ -483,7 +528,7 @@ TEST(ConstUint8SumOpTest, NotKeepDims) {
   m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
               ElementsAreArray(
                   ArrayFloatNear({-0.823529, -0.815686}, kQuantizedTolerance)));
 }
@@ -496,8 +541,9 @@ TEST(ConstUint8SumOpTest, NotKeepDimsRescaling) {
   m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
-  EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear(
-                                            {1.2, 1.2}, kQuantizedTolerance)));
+  EXPECT_THAT(
+      m.GetDequantizedOutput<uint8_t>(),
+      ElementsAreArray(ArrayFloatNear({1.2, 1.2}, kQuantizedTolerance)));
 }
 
 TEST(ConstUint8SumOpTest, KeepDims) {
@@ -508,7 +554,7 @@ TEST(ConstUint8SumOpTest, KeepDims) {
   m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1}));
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
               ElementsAreArray(ArrayFloatNear({-0.407843, -0.313726, 0.0941177},
                                               kQuantizedTolerance)));
 }
@@ -524,7 +570,7 @@ TEST(DynamicUint8SumOpTest, NotKeepDims) {
   m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
               ElementsAreArray(
                   ArrayFloatNear({1.48235, 1.64706}, kQuantizedTolerance)));
 }
@@ -541,7 +587,7 @@ TEST(DynamicUint8SumOpTest, KeepDims) {
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
   EXPECT_THAT(
-      m.GetDequantizedOutput(),
+      m.GetDequantizedOutput<uint8_t>(),
       ElementsAreArray(ArrayFloatNear({6.47059, 10.698}, kQuantizedTolerance)));
 }
 
@@ -698,7 +744,20 @@ TEST(ConstUint8MaxOpTest, NotKeepDims) {
   m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(
+                  ArrayFloatNear({0.501961, 0.603922}, kQuantizedTolerance)));
+}
+
+TEST(ConstInt8MaxOpTest, NotKeepDims) {
+  float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
+  std::vector<float> data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6};
+  MaxOpConstModel m({TensorType_INT8, {1, 3, 2}, -1.0, 1.0},
+                    {TensorType_INT8, {2}, -1.0, 1.0}, {1}, {1}, false);
+  m.QuantizeAndPopulate<int8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
               ElementsAreArray(
                   ArrayFloatNear({0.501961, 0.603922}, kQuantizedTolerance)));
 }
@@ -711,7 +770,20 @@ TEST(ConstUint8MaxOpTest, KeepDims) {
   m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1}));
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(
+                  ArrayFloatNear({0.4, 0.4, 0.603922}, kQuantizedTolerance)));
+}
+
+TEST(ConstInt8MaxOpTest, KeepDims) {
+  float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
+  std::vector<float> data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6};
+  MaxOpConstModel m({TensorType_INT8, {3, 2}, -1.0, 1.0},
+                    {TensorType_INT8, {3}, -1.0, 1.0}, {1}, {1}, true);
+  m.QuantizeAndPopulate<int8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1}));
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
               ElementsAreArray(
                   ArrayFloatNear({0.4, 0.4, 0.603922}, kQuantizedTolerance)));
 }
@@ -727,7 +799,23 @@ TEST(DynamicUint8MaxOpTest, NotKeepDims) {
   m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(
+                  ArrayFloatNear({1.2902, 0.247059}, kQuantizedTolerance)));
+}
+
+TEST(DynamicInt8MaxOpTest, NotKeepDims) {
+  float kQuantizedTolerance = GetTolerance(-5.0, 2.0);
+  std::vector<float> data = {1.3, -4.8, -3.6, 0.24};
+  MaxOpDynamicModel m({TensorType_INT8, {2, 2}, -5.0, 2.0},
+                      {TensorType_INT8, {2}, -5.0, 2.0},
+                      {TensorType_INT32, {1}}, false);
+  std::vector<int> axis = {1};
+  m.SetAxis(axis);
+  m.QuantizeAndPopulate<int8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
               ElementsAreArray(
                   ArrayFloatNear({1.2902, 0.247059}, kQuantizedTolerance)));
 }
@@ -743,7 +831,23 @@ TEST(DynamicUint8MaxOpTest, KeepDims) {
   m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(
+                  ArrayFloatNear({11.1294, 0.862745}, kQuantizedTolerance)));
+}
+
+TEST(DynamicInt8MaxOpTest, KeepDims) {
+  float kQuantizedTolerance = GetTolerance(-10.0, 12.0);
+  std::vector<float> data = {11.14, -0.14, 7.423, 0.879};
+  MaxOpDynamicModel m({TensorType_INT8, {2, 2}, -10.0, 12.0},
+                      {TensorType_INT8, {2}, -10.0, 12.0},
+                      {TensorType_INT32, {1}}, true);
+  std::vector<int> axis = {0};
+  m.SetAxis(axis);
+  m.QuantizeAndPopulate<int8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
               ElementsAreArray(
                   ArrayFloatNear({11.1294, 0.862745}, kQuantizedTolerance)));
 }
@@ -758,7 +862,21 @@ TEST(DynamicUint8MaxOpTest, Scalar) {
   m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), IsEmpty());
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(ArrayFloatNear({11.1294}, kQuantizedTolerance)));
+}
+
+TEST(DynamicInt8MaxOpTest, Scalar) {
+  float kQuantizedTolerance = GetTolerance(-10.0, 12.0);
+  std::vector<float> data = {11.14};
+  MaxOpDynamicModel m({TensorType_INT8, {}, -10.0, 12.0},
+                      {TensorType_INT8, {}, -10.0, 12.0},
+                      {TensorType_INT32, {1}}, true);
+  std::vector<int> axis = {0};
+  m.QuantizeAndPopulate<int8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), IsEmpty());
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
               ElementsAreArray(ArrayFloatNear({11.1294}, kQuantizedTolerance)));
 }
 
@@ -840,7 +958,20 @@ TEST(ConstUint8MinOpTest, NotKeepDims) {
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
   EXPECT_THAT(
-      m.GetDequantizedOutput(),
+      m.GetDequantizedOutput<uint8_t>(),
+      ElementsAreArray(ArrayFloatNear({0.294117, 0.2}, kQuantizedTolerance)));
+}
+
+TEST(ConstInt8MinOpTest, NotKeepDims) {
+  float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
+  std::vector<float> data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6};
+  MinOpConstModel m({TensorType_INT8, {1, 3, 2}, -1.0, 1.0},
+                    {TensorType_INT8, {2}, -1.0, 1.0}, {1}, {1}, false);
+  m.QuantizeAndPopulate<int8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
+  EXPECT_THAT(
+      m.GetDequantizedOutput<int8_t>(),
       ElementsAreArray(ArrayFloatNear({0.294117, 0.2}, kQuantizedTolerance)));
 }
 
@@ -853,7 +984,20 @@ TEST(ConstUint8MinOpTest, KeepDims) {
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1}));
   EXPECT_THAT(
-      m.GetDequantizedOutput(),
+      m.GetDequantizedOutput<uint8_t>(),
+      ElementsAreArray(ArrayFloatNear({0.2, 0.3, 0.5}, kQuantizedTolerance)));
+}
+
+TEST(ConstInt8MinOpTest, KeepDims) {
+  float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
+  std::vector<float> data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6};
+  MinOpConstModel m({TensorType_INT8, {3, 2}, -1.0, 1.0},
+                    {TensorType_INT8, {3}, -1.0, 1.0}, {1}, {1}, true);
+  m.QuantizeAndPopulate<int8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1}));
+  EXPECT_THAT(
+      m.GetDequantizedOutput<int8_t>(),
       ElementsAreArray(ArrayFloatNear({0.2, 0.3, 0.5}, kQuantizedTolerance)));
 }
 
@@ -869,7 +1013,23 @@ TEST(DynamicUint8MinOpTest, NotKeepDims) {
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
   EXPECT_THAT(
-      m.GetDequantizedOutput(),
+      m.GetDequantizedOutput<uint8_t>(),
+      ElementsAreArray(ArrayFloatNear({-4.807843, -3.6}, kQuantizedTolerance)));
+}
+
+TEST(DynamicInt8MinOpTest, NotKeepDims) {
+  float kQuantizedTolerance = GetTolerance(-5.0, 2.0);
+  std::vector<float> data = {1.3, -4.8, -3.6, 0.24};
+  MinOpDynamicModel m({TensorType_INT8, {2, 2}, -5.0, 2.0},
+                      {TensorType_INT8, {2}, -5.0, 2.0},
+                      {TensorType_INT32, {1}}, false);
+  std::vector<int> axis = {1};
+  m.SetAxis(axis);
+  m.QuantizeAndPopulate<int8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
+  EXPECT_THAT(
+      m.GetDequantizedOutput<int8_t>(),
       ElementsAreArray(ArrayFloatNear({-4.807843, -3.6}, kQuantizedTolerance)));
 }
 
@@ -884,7 +1044,23 @@ TEST(DynamicUint8MinOpTest, KeepDims) {
   m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(
+                  ArrayFloatNear({7.427451, -0.164706}, kQuantizedTolerance)));
+}
+
+TEST(DynamicInt8MinOpTest, KeepDims) {
+  float kQuantizedTolerance = GetTolerance(-10.0, 12.0);
+  std::vector<float> data = {11.14, -0.14, 7.423, 0.879};
+  MinOpDynamicModel m({TensorType_INT8, {2, 2}, -10.0, 12.0},
+                      {TensorType_INT8, {2}, -10.0, 12.0},
+                      {TensorType_INT32, {1}}, true);
+  std::vector<int> axis = {0};
+  m.SetAxis(axis);
+  m.QuantizeAndPopulate<int8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
               ElementsAreArray(
                   ArrayFloatNear({7.427451, -0.164706}, kQuantizedTolerance)));
 }
@@ -899,7 +1075,21 @@ TEST(DynamicUint8MinOpTest, Scalar) {
   m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), IsEmpty());
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(ArrayFloatNear({11.1294}, kQuantizedTolerance)));
+}
+
+TEST(DynamicInt8MinOpTest, Scalar) {
+  float kQuantizedTolerance = GetTolerance(-10.0, 12.0);
+  std::vector<float> data = {11.14};
+  MinOpDynamicModel m({TensorType_INT8, {}, -10.0, 12.0},
+                      {TensorType_INT8, {}, -10.0, 12.0},
+                      {TensorType_INT32, {1}}, true);
+  std::vector<int> axis = {0};
+  m.QuantizeAndPopulate<int8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), IsEmpty());
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
               ElementsAreArray(ArrayFloatNear({11.1294}, kQuantizedTolerance)));
 }
 
diff --git a/tensorflow/lite/kernels/register.cc b/tensorflow/lite/kernels/register.cc
index c0e6f6994fd2334917b178d4d3b16d73c27121c4..2a4654a8a38b336aca1840487b73f4b4716de4bd 100644
--- a/tensorflow/lite/kernels/register.cc
+++ b/tensorflow/lite/kernels/register.cc
@@ -22,10 +22,10 @@ namespace ops {
 namespace custom {
 
 TfLiteRegistration* Register_AUDIO_SPECTROGRAM();
-TfLiteRegistration* Register_LAYER_NORM_LSTM();
 TfLiteRegistration* Register_MFCC();
 TfLiteRegistration* Register_DETECTION_POSTPROCESS();
-TfLiteRegistration* Register_RELU_1();
+TfLiteRegistration* Register_IF();
+TfLiteRegistration* Register_WHILE();
 
 }  // namespace custom
 
@@ -94,6 +94,7 @@ TfLiteRegistration* Register_GREATER_EQUAL();
 TfLiteRegistration* Register_LESS();
 TfLiteRegistration* Register_LESS_EQUAL();
 TfLiteRegistration* Register_FLOOR();
+TfLiteRegistration* Register_CEIL();
 TfLiteRegistration* Register_TILE();
 TfLiteRegistration* Register_NEG();
 TfLiteRegistration* Register_SUM();
@@ -104,6 +105,7 @@ TfLiteRegistration* Register_REDUCE_ANY();
 TfLiteRegistration* Register_SELECT();
 TfLiteRegistration* Register_SLICE();
 TfLiteRegistration* Register_SIN();
+TfLiteRegistration* Register_COS();
 TfLiteRegistration* Register_TRANSPOSE_CONV();
 TfLiteRegistration* Register_EXPAND_DIMS();
 TfLiteRegistration* Register_SPARSE_TO_DENSE();
@@ -112,6 +114,7 @@ TfLiteRegistration* Register_NOT_EQUAL();
 TfLiteRegistration* Register_SQRT();
 TfLiteRegistration* Register_RSQRT();
 TfLiteRegistration* Register_SHAPE();
+TfLiteRegistration* Register_RANK();
 TfLiteRegistration* Register_POW();
 TfLiteRegistration* Register_FAKE_QUANT();
 TfLiteRegistration* Register_PACK();
@@ -129,6 +132,12 @@ TfLiteRegistration* Register_LEAKY_RELU();
 TfLiteRegistration* Register_SQUARED_DIFFERENCE();
 TfLiteRegistration* Register_FILL();
 TfLiteRegistration* Register_MIRROR_PAD();
+TfLiteRegistration* Register_UNIQUE();
+TfLiteRegistration* Register_REVERSE_V2();
+TfLiteRegistration* Register_ADD_N();
+TfLiteRegistration* Register_GATHER_ND();
+TfLiteRegistration* Register_WHERE();
+TfLiteRegistration* Register_ELU();
 
 TfLiteStatus UnsupportedTensorFlowOp(TfLiteContext* context, TfLiteNode* node) {
   context->ReportError(
@@ -161,101 +170,184 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_ABS, Register_ABS());
   AddBuiltin(BuiltinOperator_RELU, Register_RELU());
   AddBuiltin(BuiltinOperator_RELU_N1_TO_1, Register_RELU_N1_TO_1());
-  AddBuiltin(BuiltinOperator_RELU6, Register_RELU6());
-  AddBuiltin(BuiltinOperator_TANH, Register_TANH());
-  AddBuiltin(BuiltinOperator_LOGISTIC, Register_LOGISTIC());
-  AddBuiltin(BuiltinOperator_AVERAGE_POOL_2D, Register_AVERAGE_POOL_2D());
-  AddBuiltin(BuiltinOperator_MAX_POOL_2D, Register_MAX_POOL_2D());
+  AddBuiltin(BuiltinOperator_RELU6, Register_RELU6(), /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_TANH, Register_TANH(), /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_LOGISTIC, Register_LOGISTIC(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_AVERAGE_POOL_2D, Register_AVERAGE_POOL_2D(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_MAX_POOL_2D, Register_MAX_POOL_2D(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_L2_POOL_2D, Register_L2_POOL_2D());
-  AddBuiltin(BuiltinOperator_CONV_2D, Register_CONV_2D());
+  AddBuiltin(BuiltinOperator_CONV_2D, Register_CONV_2D(),
+             /* min_version */ 1,
+             /* max_version */ 3);
   AddBuiltin(BuiltinOperator_DEPTHWISE_CONV_2D, Register_DEPTHWISE_CONV_2D(),
+             /* min_version */ 1,
+             /* max_version */ 3);
+  AddBuiltin(BuiltinOperator_SVDF, Register_SVDF(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_RNN, Register_RNN(),
              /* min_version */ 1,
              /* max_version */ 2);
-  AddBuiltin(BuiltinOperator_SVDF, Register_SVDF());
-  AddBuiltin(BuiltinOperator_RNN, Register_RNN());
   AddBuiltin(BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN,
-             Register_BIDIRECTIONAL_SEQUENCE_RNN());
+             Register_BIDIRECTIONAL_SEQUENCE_RNN(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN,
-             Register_UNIDIRECTIONAL_SEQUENCE_RNN());
-  AddBuiltin(BuiltinOperator_EMBEDDING_LOOKUP, Register_EMBEDDING_LOOKUP());
+             Register_UNIDIRECTIONAL_SEQUENCE_RNN(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_EMBEDDING_LOOKUP, Register_EMBEDDING_LOOKUP(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_EMBEDDING_LOOKUP_SPARSE,
              Register_EMBEDDING_LOOKUP_SPARSE());
   AddBuiltin(BuiltinOperator_FULLY_CONNECTED, Register_FULLY_CONNECTED(),
              /* min_version */ 1,
-             /* max_version */ 2);
+             /* max_version */ 4);
   AddBuiltin(BuiltinOperator_LSH_PROJECTION, Register_LSH_PROJECTION());
   AddBuiltin(BuiltinOperator_HASHTABLE_LOOKUP, Register_HASHTABLE_LOOKUP());
-  AddBuiltin(BuiltinOperator_SOFTMAX, Register_SOFTMAX());
+  AddBuiltin(BuiltinOperator_SOFTMAX, Register_SOFTMAX(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_CONCATENATION, Register_CONCATENATION());
-  AddBuiltin(BuiltinOperator_ADD, Register_ADD());
-  AddBuiltin(BuiltinOperator_SPACE_TO_BATCH_ND, Register_SPACE_TO_BATCH_ND());
-  AddBuiltin(BuiltinOperator_BATCH_TO_SPACE_ND, Register_BATCH_TO_SPACE_ND());
+  AddBuiltin(BuiltinOperator_ADD, Register_ADD(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_SPACE_TO_BATCH_ND, Register_SPACE_TO_BATCH_ND(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_BATCH_TO_SPACE_ND, Register_BATCH_TO_SPACE_ND(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_MUL, Register_MUL());
   AddBuiltin(BuiltinOperator_L2_NORMALIZATION, Register_L2_NORMALIZATION());
   AddBuiltin(BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION,
              Register_LOCAL_RESPONSE_NORMALIZATION());
   AddBuiltin(BuiltinOperator_LSTM, Register_LSTM(), /* min_version */ 1,
-             /* max_version */ 2);
+             /* max_version */ 3);
   AddBuiltin(BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM,
-             Register_BIDIRECTIONAL_SEQUENCE_LSTM());
+             Register_BIDIRECTIONAL_SEQUENCE_LSTM(), /* min_version */ 1,
+             /* max_version */ 3);
   AddBuiltin(BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM,
-             Register_UNIDIRECTIONAL_SEQUENCE_LSTM());
-  AddBuiltin(BuiltinOperator_PAD, Register_PAD());
-  AddBuiltin(BuiltinOperator_PADV2, Register_PADV2());
+             Register_UNIDIRECTIONAL_SEQUENCE_LSTM(), /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_PAD, Register_PAD(), /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_PADV2, Register_PADV2(), /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_RESHAPE, Register_RESHAPE());
-  AddBuiltin(BuiltinOperator_RESIZE_BILINEAR, Register_RESIZE_BILINEAR());
+  AddBuiltin(BuiltinOperator_RESIZE_BILINEAR, Register_RESIZE_BILINEAR(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_RESIZE_NEAREST_NEIGHBOR,
-             Register_RESIZE_NEAREST_NEIGHBOR());
+             Register_RESIZE_NEAREST_NEIGHBOR(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_SKIP_GRAM, Register_SKIP_GRAM());
-  AddBuiltin(BuiltinOperator_SPACE_TO_DEPTH, Register_SPACE_TO_DEPTH());
-  AddBuiltin(BuiltinOperator_GATHER, Register_GATHER());
-  AddBuiltin(BuiltinOperator_TRANSPOSE, Register_TRANSPOSE());
+  AddBuiltin(BuiltinOperator_SPACE_TO_DEPTH, Register_SPACE_TO_DEPTH(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_GATHER, Register_GATHER(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_TRANSPOSE, Register_TRANSPOSE(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_MEAN, Register_MEAN());
   AddBuiltin(BuiltinOperator_DIV, Register_DIV());
-  AddBuiltin(BuiltinOperator_SUB, Register_SUB());
-  AddBuiltin(BuiltinOperator_SPLIT, Register_SPLIT());
+  AddBuiltin(BuiltinOperator_SUB, Register_SUB(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_SPLIT, Register_SPLIT(), /* min_version */ 1,
+             /* max_version */ 3);
   AddBuiltin(BuiltinOperator_SPLIT_V, Register_SPLIT_V());
   AddBuiltin(BuiltinOperator_SQUEEZE, Register_SQUEEZE());
-  AddBuiltin(BuiltinOperator_STRIDED_SLICE, Register_STRIDED_SLICE());
+  AddBuiltin(BuiltinOperator_STRIDED_SLICE, Register_STRIDED_SLICE(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_EXP, Register_EXP());
-  AddBuiltin(BuiltinOperator_TOPK_V2, Register_TOPK_V2());
+  AddBuiltin(BuiltinOperator_TOPK_V2, Register_TOPK_V2(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_LOG, Register_LOG());
-  AddBuiltin(BuiltinOperator_LOG_SOFTMAX, Register_LOG_SOFTMAX());
+  AddBuiltin(BuiltinOperator_LOG_SOFTMAX, Register_LOG_SOFTMAX(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_CAST, Register_CAST());
   AddBuiltin(BuiltinOperator_DEQUANTIZE, Register_DEQUANTIZE(),
              /* min_version */ 1,
              /* max_version */ 2);
   AddBuiltin(BuiltinOperator_PRELU, Register_PRELU());
-  AddBuiltin(BuiltinOperator_MAXIMUM, Register_MAXIMUM());
-  AddBuiltin(BuiltinOperator_MINIMUM, Register_MINIMUM());
-  AddBuiltin(BuiltinOperator_ARG_MAX, Register_ARG_MAX());
-  AddBuiltin(BuiltinOperator_ARG_MIN, Register_ARG_MIN());
-  AddBuiltin(BuiltinOperator_GREATER, Register_GREATER());
-  AddBuiltin(BuiltinOperator_GREATER_EQUAL, Register_GREATER_EQUAL());
-  AddBuiltin(BuiltinOperator_LESS, Register_LESS());
-  AddBuiltin(BuiltinOperator_LESS_EQUAL, Register_LESS_EQUAL());
+  AddBuiltin(BuiltinOperator_MAXIMUM, Register_MAXIMUM(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_MINIMUM, Register_MINIMUM(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_ARG_MAX, Register_ARG_MAX(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_ARG_MIN, Register_ARG_MIN(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_GREATER, Register_GREATER(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_GREATER_EQUAL, Register_GREATER_EQUAL(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_LESS, Register_LESS(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_LESS_EQUAL, Register_LESS_EQUAL(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_FLOOR, Register_FLOOR());
+  AddBuiltin(BuiltinOperator_CEIL, Register_CEIL());
   AddBuiltin(BuiltinOperator_NEG, Register_NEG());
-  AddBuiltin(BuiltinOperator_SELECT, Register_SELECT());
-  AddBuiltin(BuiltinOperator_SLICE, Register_SLICE());
+  AddBuiltin(BuiltinOperator_SELECT, Register_SELECT(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_SLICE, Register_SLICE(), /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_SIN, Register_SIN());
+  AddBuiltin(BuiltinOperator_COS, Register_COS());
   AddBuiltin(BuiltinOperator_TRANSPOSE_CONV, Register_TRANSPOSE_CONV());
   AddBuiltin(BuiltinOperator_TILE, Register_TILE());
   AddBuiltin(BuiltinOperator_SUM, Register_SUM());
   AddBuiltin(BuiltinOperator_REDUCE_PROD, Register_REDUCE_PROD());
-  AddBuiltin(BuiltinOperator_REDUCE_MAX, Register_REDUCE_MAX());
-  AddBuiltin(BuiltinOperator_REDUCE_MIN, Register_REDUCE_MIN());
+  AddBuiltin(BuiltinOperator_REDUCE_MAX, Register_REDUCE_MAX(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_REDUCE_MIN, Register_REDUCE_MIN(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_REDUCE_ANY, Register_REDUCE_ANY());
   AddBuiltin(BuiltinOperator_EXPAND_DIMS, Register_EXPAND_DIMS());
   AddBuiltin(BuiltinOperator_SPARSE_TO_DENSE, Register_SPARSE_TO_DENSE());
-  AddBuiltin(BuiltinOperator_EQUAL, Register_EQUAL());
-  AddBuiltin(BuiltinOperator_NOT_EQUAL, Register_NOT_EQUAL());
+  AddBuiltin(BuiltinOperator_EQUAL, Register_EQUAL(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_NOT_EQUAL, Register_NOT_EQUAL(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_SQRT, Register_SQRT());
   AddBuiltin(BuiltinOperator_RSQRT, Register_RSQRT());
   AddBuiltin(BuiltinOperator_SHAPE, Register_SHAPE());
+  AddBuiltin(BuiltinOperator_RANK, Register_RANK());
   AddBuiltin(BuiltinOperator_POW, Register_POW());
   AddBuiltin(BuiltinOperator_FAKE_QUANT, Register_FAKE_QUANT(), 1, 2);
-  AddBuiltin(BuiltinOperator_PACK, Register_PACK());
+  AddBuiltin(BuiltinOperator_PACK, Register_PACK(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_ONE_HOT, Register_ONE_HOT());
   AddBuiltin(BuiltinOperator_LOGICAL_OR, Register_LOGICAL_OR());
   AddBuiltin(BuiltinOperator_LOGICAL_AND, Register_LOGICAL_AND());
@@ -270,16 +362,24 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_SQUARED_DIFFERENCE, Register_SQUARED_DIFFERENCE());
   AddBuiltin(BuiltinOperator_FILL, Register_FILL());
   AddBuiltin(BuiltinOperator_MIRROR_PAD, Register_MIRROR_PAD());
+  AddBuiltin(BuiltinOperator_UNIQUE, Register_UNIQUE());
+  AddBuiltin(BuiltinOperator_REVERSE_V2, Register_REVERSE_V2());
+  AddBuiltin(BuiltinOperator_ADD_N, Register_ADD_N());
+  AddBuiltin(BuiltinOperator_GATHER_ND, Register_GATHER_ND());
+  AddBuiltin(BuiltinOperator_WHERE, Register_WHERE());
+  AddBuiltin(BuiltinOperator_ELU, Register_ELU());
 
   // TODO(andrewharp, ahentz): Move these somewhere more appropriate so that
   // custom ops aren't always included by default.
   AddCustom("Mfcc", tflite::ops::custom::Register_MFCC());
   AddCustom("AudioSpectrogram",
             tflite::ops::custom::Register_AUDIO_SPECTROGRAM());
-  AddCustom("LayerNormLstm", tflite::ops::custom::Register_LAYER_NORM_LSTM());
-  AddCustom("Relu1", tflite::ops::custom::Register_RELU_1());
   AddCustom("TFLite_Detection_PostProcess",
             tflite::ops::custom::Register_DETECTION_POSTPROCESS());
+
+  // WARNING: Control flow ops are experimental and subject to change.
+  AddCustom("Experimental_If", tflite::ops::custom::Register_IF());
+  AddCustom("Experimental_While", tflite::ops::custom::Register_WHILE());
 }
 
 }  // namespace builtin
diff --git a/tensorflow/lite/kernels/register_ref.cc b/tensorflow/lite/kernels/register_ref.cc
new file mode 100644
index 0000000000000000000000000000000000000000..faa864b0e236e4a61453c6fcecafd2ca09f41ce1
--- /dev/null
+++ b/tensorflow/lite/kernels/register_ref.cc
@@ -0,0 +1,293 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/kernels/register_ref.h"
+#include "tensorflow/lite/util.h"
+
+namespace tflite {
+namespace ops {
+
+namespace custom {
+
+TfLiteRegistration* Register_AUDIO_SPECTROGRAM();
+TfLiteRegistration* Register_MFCC();
+TfLiteRegistration* Register_DETECTION_POSTPROCESS();
+
+}  // namespace custom
+
+namespace builtin {
+
+// TODO(yunluli): Some of the registries, e.g. Tanh(), could only invoke
+// optimized kernels. Add a _REF() variant for them.
+TfLiteRegistration* Register_ABS();
+TfLiteRegistration* Register_RELU();
+TfLiteRegistration* Register_RELU_N1_TO_1();
+TfLiteRegistration* Register_RELU6();
+TfLiteRegistration* Register_TANH_REF();
+TfLiteRegistration* Register_LOGISTIC_REF();
+TfLiteRegistration* Register_AVERAGE_POOL_REF();
+TfLiteRegistration* Register_MAX_POOL_REF();
+TfLiteRegistration* Register_L2_POOL_REF();
+TfLiteRegistration* Register_CONVOLUTION_REF();
+TfLiteRegistration* Register_DEPTHWISE_CONVOLUTION_REF();
+TfLiteRegistration* Register_SVDF();
+TfLiteRegistration* Register_RNN();
+TfLiteRegistration* Register_BIDIRECTIONAL_SEQUENCE_RNN();
+TfLiteRegistration* Register_UNIDIRECTIONAL_SEQUENCE_RNN();
+TfLiteRegistration* Register_EMBEDDING_LOOKUP();
+TfLiteRegistration* Register_EMBEDDING_LOOKUP_SPARSE();
+TfLiteRegistration* Register_FULLY_CONNECTED_REF();
+TfLiteRegistration* Register_LSH_PROJECTION();
+TfLiteRegistration* Register_HASHTABLE_LOOKUP();
+TfLiteRegistration* Register_SOFTMAX();
+TfLiteRegistration* Register_CONCATENATION_REF();
+TfLiteRegistration* Register_ADD_REF();
+TfLiteRegistration* Register_SPACE_TO_BATCH_ND_REF();
+TfLiteRegistration* Register_DIV_REF();
+TfLiteRegistration* Register_SUB_REF();
+TfLiteRegistration* Register_BATCH_TO_SPACE_ND_REF();
+TfLiteRegistration* Register_MUL_REF();
+TfLiteRegistration* Register_L2NORM_REF();
+TfLiteRegistration* Register_LOCAL_RESPONSE_NORM_REF();
+TfLiteRegistration* Register_LSTM();
+TfLiteRegistration* Register_BIDIRECTIONAL_SEQUENCE_LSTM();
+TfLiteRegistration* Register_UNIDIRECTIONAL_SEQUENCE_LSTM();
+TfLiteRegistration* Register_PAD_REF();
+TfLiteRegistration* Register_PADV2_REF();
+TfLiteRegistration* Register_RESHAPE();
+TfLiteRegistration* Register_RESIZE_BILINEAR_REF();
+TfLiteRegistration* Register_RESIZE_NEAREST_NEIGHBOR_REF();
+TfLiteRegistration* Register_SKIP_GRAM();
+TfLiteRegistration* Register_SPACE_TO_DEPTH_REF();
+TfLiteRegistration* Register_GATHER();
+TfLiteRegistration* Register_TRANSPOSE_REF();
+TfLiteRegistration* Register_MEAN_REF();
+TfLiteRegistration* Register_SPLIT();
+TfLiteRegistration* Register_SPLIT_V();
+TfLiteRegistration* Register_SQUEEZE();
+TfLiteRegistration* Register_STRIDED_SLICE_REF();
+TfLiteRegistration* Register_EXP();
+TfLiteRegistration* Register_TOPK_V2();
+TfLiteRegistration* Register_LOG();
+TfLiteRegistration* Register_LOG_SOFTMAX_REF();
+TfLiteRegistration* Register_CAST();
+TfLiteRegistration* Register_DEQUANTIZE();
+TfLiteRegistration* Register_PRELU();
+TfLiteRegistration* Register_MAXIMUM();
+TfLiteRegistration* Register_MINIMUM();
+TfLiteRegistration* Register_ARG_MAX();
+TfLiteRegistration* Register_ARG_MIN();
+TfLiteRegistration* Register_GREATER();
+TfLiteRegistration* Register_GREATER_EQUAL();
+TfLiteRegistration* Register_LESS();
+TfLiteRegistration* Register_LESS_EQUAL();
+TfLiteRegistration* Register_FLOOR_REF();
+TfLiteRegistration* Register_TILE();
+TfLiteRegistration* Register_NEG();
+TfLiteRegistration* Register_SUM();
+TfLiteRegistration* Register_REDUCE_PROD();
+TfLiteRegistration* Register_REDUCE_MAX();
+TfLiteRegistration* Register_REDUCE_MIN();
+TfLiteRegistration* Register_REDUCE_ANY();
+TfLiteRegistration* Register_SELECT();
+TfLiteRegistration* Register_SLICE_REF();
+TfLiteRegistration* Register_SIN();
+TfLiteRegistration* Register_TRANSPOSECONV_REF();
+TfLiteRegistration* Register_EXPAND_DIMS();
+TfLiteRegistration* Register_SPARSE_TO_DENSE();
+TfLiteRegistration* Register_EQUAL();
+TfLiteRegistration* Register_NOT_EQUAL();
+TfLiteRegistration* Register_SQRT();
+TfLiteRegistration* Register_RSQRT();
+TfLiteRegistration* Register_SHAPE();
+TfLiteRegistration* Register_POW();
+TfLiteRegistration* Register_FAKE_QUANT();
+TfLiteRegistration* Register_PACK();
+TfLiteRegistration* Register_ONE_HOT();
+TfLiteRegistration* Register_LOGICAL_OR();
+TfLiteRegistration* Register_LOGICAL_AND();
+TfLiteRegistration* Register_LOGICAL_NOT();
+TfLiteRegistration* Register_UNPACK();
+TfLiteRegistration* Register_FLOOR_DIV();
+TfLiteRegistration* Register_SQUARE();
+TfLiteRegistration* Register_ZEROS_LIKE();
+TfLiteRegistration* Register_FLOOR_MOD();
+TfLiteRegistration* Register_RANGE();
+TfLiteRegistration* Register_LEAKY_RELU();
+TfLiteRegistration* Register_SQUARED_DIFFERENCE();
+TfLiteRegistration* Register_FILL();
+TfLiteRegistration* Register_MIRROR_PAD();
+
+namespace {
+
+TfLiteStatus UnsupportedTensorFlowOp(TfLiteContext* context, TfLiteNode* node) {
+  context->ReportError(
+      context,
+      "Regular TensorFlow ops are not supported by this interpreter. Make sure "
+      "you invoke the Flex delegate before inference.");
+  return kTfLiteError;
+}
+
+}  // namespace
+
+const TfLiteRegistration* BuiltinRefOpResolver::FindOp(
+    tflite::BuiltinOperator op, int version) const {
+  return MutableOpResolver::FindOp(op, version);
+}
+
+const TfLiteRegistration* BuiltinRefOpResolver::FindOp(const char* op,
+                                                       int version) const {
+  // Return the NULL Op for all ops whose name start with "Flex", allowing
+  // the interpreter to delegate their execution.
+  if (IsFlexOp(op)) {
+    static TfLiteRegistration null_op{
+        nullptr, nullptr, &UnsupportedTensorFlowOp,
+        nullptr, nullptr, BuiltinOperator_CUSTOM,
+        "Flex",  1};
+    return &null_op;
+  }
+  return MutableOpResolver::FindOp(op, version);
+}
+
+BuiltinRefOpResolver::BuiltinRefOpResolver() {
+  AddBuiltin(BuiltinOperator_ABS, Register_ABS());
+  AddBuiltin(BuiltinOperator_RELU, Register_RELU());
+  AddBuiltin(BuiltinOperator_RELU_N1_TO_1, Register_RELU_N1_TO_1());
+  AddBuiltin(BuiltinOperator_RELU6, Register_RELU6());
+  AddBuiltin(BuiltinOperator_TANH, Register_TANH_REF());
+  AddBuiltin(BuiltinOperator_LOGISTIC, Register_LOGISTIC_REF());
+  AddBuiltin(BuiltinOperator_AVERAGE_POOL_2D, Register_AVERAGE_POOL_REF());
+  AddBuiltin(BuiltinOperator_MAX_POOL_2D, Register_MAX_POOL_REF());
+  AddBuiltin(BuiltinOperator_L2_POOL_2D, Register_L2_POOL_REF());
+  AddBuiltin(BuiltinOperator_CONV_2D, Register_CONVOLUTION_REF());
+  AddBuiltin(BuiltinOperator_DEPTHWISE_CONV_2D,
+             Register_DEPTHWISE_CONVOLUTION_REF(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_SVDF, Register_SVDF());
+  AddBuiltin(BuiltinOperator_RNN, Register_RNN());
+  AddBuiltin(BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN,
+             Register_BIDIRECTIONAL_SEQUENCE_RNN());
+  AddBuiltin(BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN,
+             Register_UNIDIRECTIONAL_SEQUENCE_RNN());
+  AddBuiltin(BuiltinOperator_EMBEDDING_LOOKUP, Register_EMBEDDING_LOOKUP());
+  AddBuiltin(BuiltinOperator_EMBEDDING_LOOKUP_SPARSE,
+             Register_EMBEDDING_LOOKUP_SPARSE());
+  AddBuiltin(BuiltinOperator_FULLY_CONNECTED, Register_FULLY_CONNECTED_REF(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_LSH_PROJECTION, Register_LSH_PROJECTION());
+  AddBuiltin(BuiltinOperator_HASHTABLE_LOOKUP, Register_HASHTABLE_LOOKUP());
+  AddBuiltin(BuiltinOperator_SOFTMAX, Register_SOFTMAX());
+  AddBuiltin(BuiltinOperator_CONCATENATION, Register_CONCATENATION_REF());
+  AddBuiltin(BuiltinOperator_ADD, Register_ADD_REF());
+  AddBuiltin(BuiltinOperator_SPACE_TO_BATCH_ND,
+             Register_SPACE_TO_BATCH_ND_REF());
+  AddBuiltin(BuiltinOperator_BATCH_TO_SPACE_ND,
+             Register_BATCH_TO_SPACE_ND_REF());
+  AddBuiltin(BuiltinOperator_MUL, Register_MUL_REF());
+  AddBuiltin(BuiltinOperator_L2_NORMALIZATION, Register_L2NORM_REF());
+  AddBuiltin(BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION,
+             Register_LOCAL_RESPONSE_NORM_REF());
+  AddBuiltin(BuiltinOperator_LSTM, Register_LSTM(), /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM,
+             Register_BIDIRECTIONAL_SEQUENCE_LSTM(), /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM,
+             Register_UNIDIRECTIONAL_SEQUENCE_LSTM());
+  AddBuiltin(BuiltinOperator_PAD, Register_PAD_REF());
+  AddBuiltin(BuiltinOperator_PADV2, Register_PADV2_REF());
+  AddBuiltin(BuiltinOperator_RESHAPE, Register_RESHAPE());
+  AddBuiltin(BuiltinOperator_RESIZE_BILINEAR, Register_RESIZE_BILINEAR_REF());
+  AddBuiltin(BuiltinOperator_RESIZE_NEAREST_NEIGHBOR,
+             Register_RESIZE_NEAREST_NEIGHBOR_REF());
+  AddBuiltin(BuiltinOperator_SKIP_GRAM, Register_SKIP_GRAM());
+  AddBuiltin(BuiltinOperator_SPACE_TO_DEPTH, Register_SPACE_TO_DEPTH_REF());
+  AddBuiltin(BuiltinOperator_GATHER, Register_GATHER());
+  AddBuiltin(BuiltinOperator_TRANSPOSE, Register_TRANSPOSE_REF());
+  AddBuiltin(BuiltinOperator_MEAN, Register_MEAN_REF());
+  AddBuiltin(BuiltinOperator_DIV, Register_DIV_REF());
+  AddBuiltin(BuiltinOperator_SUB, Register_SUB_REF());
+  AddBuiltin(BuiltinOperator_SPLIT, Register_SPLIT());
+  AddBuiltin(BuiltinOperator_SPLIT_V, Register_SPLIT_V());
+  AddBuiltin(BuiltinOperator_SQUEEZE, Register_SQUEEZE());
+  AddBuiltin(BuiltinOperator_STRIDED_SLICE, Register_STRIDED_SLICE_REF());
+  AddBuiltin(BuiltinOperator_EXP, Register_EXP());
+  AddBuiltin(BuiltinOperator_TOPK_V2, Register_TOPK_V2());
+  AddBuiltin(BuiltinOperator_LOG, Register_LOG());
+  AddBuiltin(BuiltinOperator_LOG_SOFTMAX, Register_LOG_SOFTMAX_REF());
+  AddBuiltin(BuiltinOperator_CAST, Register_CAST());
+  AddBuiltin(BuiltinOperator_DEQUANTIZE, Register_DEQUANTIZE(),
+             /* min_version */ 1,
+             /* max_version */ 2);
+  AddBuiltin(BuiltinOperator_PRELU, Register_PRELU());
+  AddBuiltin(BuiltinOperator_MAXIMUM, Register_MAXIMUM());
+  AddBuiltin(BuiltinOperator_MINIMUM, Register_MINIMUM());
+  AddBuiltin(BuiltinOperator_ARG_MAX, Register_ARG_MAX());
+  AddBuiltin(BuiltinOperator_ARG_MIN, Register_ARG_MIN());
+  AddBuiltin(BuiltinOperator_GREATER, Register_GREATER());
+  AddBuiltin(BuiltinOperator_GREATER_EQUAL, Register_GREATER_EQUAL());
+  AddBuiltin(BuiltinOperator_LESS, Register_LESS());
+  AddBuiltin(BuiltinOperator_LESS_EQUAL, Register_LESS_EQUAL());
+  AddBuiltin(BuiltinOperator_FLOOR, Register_FLOOR_REF());
+  AddBuiltin(BuiltinOperator_NEG, Register_NEG());
+  AddBuiltin(BuiltinOperator_SELECT, Register_SELECT());
+  AddBuiltin(BuiltinOperator_SLICE, Register_SLICE_REF());
+  AddBuiltin(BuiltinOperator_SIN, Register_SIN());
+  AddBuiltin(BuiltinOperator_TRANSPOSE_CONV, Register_TRANSPOSECONV_REF());
+  AddBuiltin(BuiltinOperator_TILE, Register_TILE());
+  AddBuiltin(BuiltinOperator_SUM, Register_SUM());
+  AddBuiltin(BuiltinOperator_REDUCE_PROD, Register_REDUCE_PROD());
+  AddBuiltin(BuiltinOperator_REDUCE_MAX, Register_REDUCE_MAX());
+  AddBuiltin(BuiltinOperator_REDUCE_MIN, Register_REDUCE_MIN());
+  AddBuiltin(BuiltinOperator_REDUCE_ANY, Register_REDUCE_ANY());
+  AddBuiltin(BuiltinOperator_EXPAND_DIMS, Register_EXPAND_DIMS());
+  AddBuiltin(BuiltinOperator_SPARSE_TO_DENSE, Register_SPARSE_TO_DENSE());
+  AddBuiltin(BuiltinOperator_EQUAL, Register_EQUAL());
+  AddBuiltin(BuiltinOperator_NOT_EQUAL, Register_NOT_EQUAL());
+  AddBuiltin(BuiltinOperator_SQRT, Register_SQRT());
+  AddBuiltin(BuiltinOperator_RSQRT, Register_RSQRT());
+  AddBuiltin(BuiltinOperator_SHAPE, Register_SHAPE());
+  AddBuiltin(BuiltinOperator_POW, Register_POW());
+  AddBuiltin(BuiltinOperator_FAKE_QUANT, Register_FAKE_QUANT(), 1, 2);
+  AddBuiltin(BuiltinOperator_PACK, Register_PACK());
+  AddBuiltin(BuiltinOperator_ONE_HOT, Register_ONE_HOT());
+  AddBuiltin(BuiltinOperator_LOGICAL_OR, Register_LOGICAL_OR());
+  AddBuiltin(BuiltinOperator_LOGICAL_AND, Register_LOGICAL_AND());
+  AddBuiltin(BuiltinOperator_LOGICAL_NOT, Register_LOGICAL_NOT());
+  AddBuiltin(BuiltinOperator_UNPACK, Register_UNPACK());
+  AddBuiltin(BuiltinOperator_FLOOR_DIV, Register_FLOOR_DIV());
+  AddBuiltin(BuiltinOperator_SQUARE, Register_SQUARE());
+  AddBuiltin(BuiltinOperator_ZEROS_LIKE, Register_ZEROS_LIKE());
+  AddBuiltin(BuiltinOperator_FLOOR_MOD, Register_FLOOR_MOD());
+  AddBuiltin(BuiltinOperator_RANGE, Register_RANGE());
+  AddBuiltin(BuiltinOperator_LEAKY_RELU, Register_LEAKY_RELU());
+  AddBuiltin(BuiltinOperator_SQUARED_DIFFERENCE, Register_SQUARED_DIFFERENCE());
+  AddBuiltin(BuiltinOperator_FILL, Register_FILL());
+  AddBuiltin(BuiltinOperator_MIRROR_PAD, Register_MIRROR_PAD());
+
+  // TODO(andrewharp, ahentz): Move these somewhere more appropriate so that
+  // custom ops aren't always included by default.
+  AddCustom("Mfcc", tflite::ops::custom::Register_MFCC());
+  AddCustom("AudioSpectrogram",
+            tflite::ops::custom::Register_AUDIO_SPECTROGRAM());
+  AddCustom("TFLite_Detection_PostProcess",
+            tflite::ops::custom::Register_DETECTION_POSTPROCESS());
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/register_ref.h b/tensorflow/lite/kernels/register_ref.h
new file mode 100644
index 0000000000000000000000000000000000000000..c66d4a25bc43a9e336f071ce6058ccd7ecce4d31
--- /dev/null
+++ b/tensorflow/lite/kernels/register_ref.h
@@ -0,0 +1,39 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_KERNELS_REGISTER_REF_H_
+#define TENSORFLOW_LITE_KERNELS_REGISTER_REF_H_
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/mutable_op_resolver.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+
+class BuiltinRefOpResolver : public MutableOpResolver {
+ public:
+  BuiltinRefOpResolver();
+
+  const TfLiteRegistration* FindOp(tflite::BuiltinOperator op,
+                                   int version) const override;
+  const TfLiteRegistration* FindOp(const char* op, int version) const override;
+};
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_REGISTER_REF_H_
diff --git a/tensorflow/lite/kernels/relu1_test.cc b/tensorflow/lite/kernels/relu1_test.cc
deleted file mode 100644
index f52d10b0b7f32af3444c702835f0674d7181bb7a..0000000000000000000000000000000000000000
--- a/tensorflow/lite/kernels/relu1_test.cc
+++ /dev/null
@@ -1,79 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <gtest/gtest.h>
-#include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
-#include "tensorflow/lite/kernels/register.h"
-#include "tensorflow/lite/kernels/test_util.h"
-
-namespace tflite {
-namespace ops {
-namespace custom {
-
-TfLiteRegistration* Register_RELU_1();
-
-namespace {
-
-using ::testing::ElementsAreArray;
-
-class BaseActivationsOpModel : public SingleOpModel {
- public:
-  explicit BaseActivationsOpModel(const TensorData& input) {
-    input_ = AddInput(input);
-    output_ = AddOutput({input.type, {}});
-    flexbuffers::Builder fbb;
-    fbb.Map([&]() {});
-    fbb.Finish();
-    SetCustomOp("RELU_1", fbb.GetBuffer(), Register_RELU_1);
-    BuildInterpreter({GetShape(input_)});
-  }
-
- protected:
-  int input_;
-  int output_;
-};
-
-class FloatActivationsOpModel : public BaseActivationsOpModel {
- public:
-  using BaseActivationsOpModel::BaseActivationsOpModel;
-
-  void SetInput(std::initializer_list<float> data) {
-    PopulateTensor(input_, data);
-  }
-  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
-};
-
-TEST(FloatActivationsOpTest, Relu1) {
-  FloatActivationsOpModel m(/*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}});
-  m.SetInput({
-      0.0, -0.6, 0.2, -0.4,  //
-      0.3, -2.0, 1.1, -0.1,  //
-  });
-  m.Invoke();
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
-                                 0.0, 0.0, 0.2, 0.0,  //
-                                 0.3, 0.0, 1.0, 0.0,  //
-                             }));
-}
-
-}  // namespace
-}  // namespace custom
-}  // namespace ops
-}  // namespace tflite
-
-int main(int argc, char** argv) {
-  ::tflite::LogToStderr();
-  ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/lite/kernels/reshape_test.cc b/tensorflow/lite/kernels/reshape_test.cc
index 00bbbef57eccef67d043e85c02ebe80c3f9387ef..e9d12a9def7e1a33bc0b6db47d7b2f09036b84f2 100644
--- a/tensorflow/lite/kernels/reshape_test.cc
+++ b/tensorflow/lite/kernels/reshape_test.cc
@@ -123,6 +123,7 @@ class ReshapeOpModel : public SingleOpModel {
   int output_;
 };
 
+#ifdef GTEST_HAS_DEATH_TEST
 TEST_P(ReshapeOpTest, MismatchedDimensions) {
   if (GetParam() == kAsTensor) {
     ReshapeOpModel<float> m({1, 2, 4, 1}, {2}, {2, 1}, GetParam());
@@ -133,23 +134,17 @@ TEST_P(ReshapeOpTest, MismatchedDimensions) {
                  "num_input_elements != num_output_elements");
   }
 }
+#endif
 
 TEST_P(ReshapeOpTest, TooManyDimensions) {
-  if (GetParam() == kAsReshapeOption) {
+#ifdef GTEST_HAS_DEATH_TEST
     EXPECT_DEATH(ReshapeOpModel<float>({1, 1, 2, 1, 1, 1, 1, 1, 1}, {9},
                                        {1, 1, 1, 1, 1, 1, 1, 1, 2}, GetParam()),
                  "Found too many dimensions");
-  } else {
-    ReshapeOpModel<float> m({1, 1, 2, 1, 1, 1, 1, 1, 1}, {9},
-                            {1, 1, 1, 1, 1, 1, 1, 1, 2}, GetParam());
-    m.SetInput({3, 4});
-    m.Invoke();
-    EXPECT_THAT(m.GetOutput(), ElementsAreArray({3, 4}));
-    EXPECT_THAT(m.GetOutputShape(),
-                ElementsAreArray({1, 1, 1, 1, 1, 1, 1, 1, 2}));
-  }
+#endif
 }
 
+#ifdef GTEST_HAS_DEATH_TEST
 TEST_P(ReshapeOpTest, TooManySpecialDimensions) {
   if (GetParam() != kAsTensor) {
     EXPECT_DEATH(
@@ -160,6 +155,7 @@ TEST_P(ReshapeOpTest, TooManySpecialDimensions) {
     EXPECT_DEATH(m.Invoke(), "stretch_dim != -1");
   }
 }
+#endif
 
 // Create the model with a 2x2 shape. Processing still works because the new
 // shape ends up being hardcoded as a flat vector.
@@ -202,12 +198,16 @@ TEST_P(ReshapeOpTest, ScalarOutput) {
 // and output are scalars.
 TEST_P(ReshapeOpTest, LegacyScalarOutput) {
   if (GetParam() == kAsConstantTensor) {
+#ifdef GTEST_HAS_DEATH_TEST
     EXPECT_DEATH(ReshapeOpModel<float>({1}, {1}, {0}, GetParam()),
                  "num_input_elements != num_output_elements");
+#endif
   } else if (GetParam() == kAsTensor) {
+#ifdef GTEST_HAS_DEATH_TEST
     ReshapeOpModel<float> m({1}, {1}, {0}, GetParam());
     m.SetInput({3});
     EXPECT_DEATH(m.Invoke(), "num_input_elements != num_output_elements");
+#endif
   } else {
     ReshapeOpModel<float> m({1}, {1}, {0}, GetParam());
     m.SetInput({3});
@@ -226,9 +226,9 @@ TEST_P(ReshapeOpTest, Strings) {
               ElementsAreArray({"1", "2", "3", "4", "5", "6", "7", "8"}));
 }
 
-INSTANTIATE_TEST_CASE_P(VariedShapeSpec, ReshapeOpTest,
-                        ::testing::Values(kAsReshapeOption, kAsConstantTensor,
-                                          kAsTensor));
+INSTANTIATE_TEST_SUITE_P(VariedShapeSpec, ReshapeOpTest,
+                         ::testing::Values(kAsReshapeOption, kAsConstantTensor,
+                                           kAsTensor));
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/resize_bilinear.cc b/tensorflow/lite/kernels/resize_bilinear.cc
index d42cb188669587a957dd085f9ecb123f44b59437..7383d03438c65a710efbfe30f3d3c0ce261f0ca8 100644
--- a/tensorflow/lite/kernels/resize_bilinear.cc
+++ b/tensorflow/lite/kernels/resize_bilinear.cc
@@ -109,6 +109,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     if (kernel_type == kGenericOptimized || kernel_type == kNeonOptimized) {
       TF_LITE_RESIZE_BILINEAR(optimized_ops, uint8_t);
     }
+  } else if (output->type == kTfLiteInt8) {
+    TF_LITE_RESIZE_BILINEAR(reference_ops, int8_t);
 #undef TF_LITE_RESIZE_BILINEAR
   } else {
     context->ReportError(context, "Output type is %d, requires float.",
diff --git a/tensorflow/lite/kernels/resize_bilinear_test.cc b/tensorflow/lite/kernels/resize_bilinear_test.cc
index d3f4837a287accd93c23e17fa3a361efd4120101..a5ead9c874d5cf1e4e1abb7998150df44eced3c5 100644
--- a/tensorflow/lite/kernels/resize_bilinear_test.cc
+++ b/tensorflow/lite/kernels/resize_bilinear_test.cc
@@ -78,7 +78,7 @@ TEST(ResizeBilinearOpTest, HorizontalResize) {
               ElementsAreArray(ArrayFloatNear({3, 5, 6})));
 }
 
-TEST(ResizeBilinearOpTest, HorizontalResize8Bit) {
+TEST(ResizeBilinearOpTest, HorizontalResizeUInt8) {
   ResizeBilinearOpModel m({TensorType_UINT8, {1, 1, 2, 1}});
   m.SetInput<uint8>({3, 6});
   m.SetSize({1, 3});
@@ -93,6 +93,21 @@ TEST(ResizeBilinearOpTest, HorizontalResize8Bit) {
               ElementsAreArray(ArrayFloatNear({3, 5, 6})));
 }
 
+TEST(ResizeBilinearOpTest, HorizontalResizeInt8) {
+  ResizeBilinearOpModel m({TensorType_INT8, {1, 1, 2, 1}});
+  m.SetInput<int8_t>({3, 6});
+  m.SetSize({1, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear({3, 5, 6})));
+
+  ResizeBilinearOpModel const_m({TensorType_INT8, {1, 1, 2, 1}}, {1, 3});
+  const_m.SetInput<int8_t>({3, 6});
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear({3, 5, 6})));
+}
+
 TEST(ResizeBilinearOpTest, VerticalResize) {
   ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 2, 1, 1}});
   m.SetInput<float>({3, 9});
@@ -108,7 +123,7 @@ TEST(ResizeBilinearOpTest, VerticalResize) {
               ElementsAreArray(ArrayFloatNear({3, 7, 9})));
 }
 
-TEST(ResizeBilinearOpTest, VerticalResize8Bit) {
+TEST(ResizeBilinearOpTest, VerticalResizeUInt8) {
   ResizeBilinearOpModel m({TensorType_UINT8, {1, 2, 1, 1}});
   m.SetInput<uint8>({3, 9});
   m.SetSize({3, 1});
@@ -123,6 +138,21 @@ TEST(ResizeBilinearOpTest, VerticalResize8Bit) {
               ElementsAreArray(ArrayFloatNear({3, 7, 9})));
 }
 
+TEST(ResizeBilinearOpTest, VerticalResizeInt8) {
+  ResizeBilinearOpModel m({TensorType_INT8, {1, 2, 1, 1}});
+  m.SetInput<int8_t>({3, 9});
+  m.SetSize({3, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear({3, 7, 9})));
+
+  ResizeBilinearOpModel const_m({TensorType_INT8, {1, 2, 1, 1}}, {3, 1});
+  const_m.SetInput<int8_t>({3, 9});
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear({3, 7, 9})));
+}
+
 TEST(ResizeBilinearOpTest, TwoDimensionalResize) {
   ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}});
   m.SetInput<float>({
@@ -150,7 +180,7 @@ TEST(ResizeBilinearOpTest, TwoDimensionalResize) {
                                           })));
 }
 
-TEST(ResizeBilinearOpTest, TwoDimensionalResize8Bit) {
+TEST(ResizeBilinearOpTest, TwoDimensionalResizeUInt8) {
   ResizeBilinearOpModel m({TensorType_UINT8, {1, 2, 2, 1}});
   m.SetInput<uint8>({
       3, 6,  //
@@ -177,6 +207,33 @@ TEST(ResizeBilinearOpTest, TwoDimensionalResize8Bit) {
                                           })));
 }
 
+TEST(ResizeBilinearOpTest, TwoDimensionalResizeInt8) {
+  ResizeBilinearOpModel m({TensorType_INT8, {1, 2, 2, 1}});
+  m.SetInput<int8_t>({
+      3, 6,  //
+      9, 12  //
+  });
+  m.SetSize({3, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAreArray(ArrayFloatNear({
+                                         3, 5, 6,    //
+                                         7, 9, 10,   //
+                                         9, 11, 12,  //
+                                     })));
+
+  ResizeBilinearOpModel const_m({TensorType_INT8, {1, 2, 2, 1}}, {3, 3});
+  const_m.SetInput<int8_t>({
+      3, 6,  //
+      9, 12  //
+  });
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<int8_t>(), ElementsAreArray(ArrayFloatNear({
+                                               3, 5, 6,    //
+                                               7, 9, 10,   //
+                                               9, 11, 12,  //
+                                           })));
+}
+
 TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches) {
   ResizeBilinearOpModel m({TensorType_FLOAT32, {2, 2, 2, 1}});
   m.SetInput<float>({
@@ -241,7 +298,7 @@ TEST(ResizeBilinearOpTest, ThreeDimensionalResize) {
                                           })));
 }
 
-TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches8Bit) {
+TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatchesUInt8) {
   ResizeBilinearOpModel m({TensorType_UINT8, {2, 2, 2, 1}});
   m.SetInput<uint8>({
       3, 6,   //
@@ -278,7 +335,44 @@ TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches8Bit) {
                                           })));
 }
 
-TEST(ResizeBilinearOpTest, ThreeDimensionalResize8Bit) {
+TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatchesInt8) {
+  ResizeBilinearOpModel m({TensorType_INT8, {2, 2, 2, 1}});
+  m.SetInput<int8_t>({
+      3, 6,   //
+      9, 12,  //
+      4, 10,  //
+      12, 16  //
+  });
+  m.SetSize({3, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAreArray(ArrayFloatNear({
+                                         3, 5, 6,     //
+                                         7, 9, 10,    //
+                                         9, 11, 12,   //
+                                         4, 8, 10,    //
+                                         9, 12, 13,   //
+                                         12, 14, 16,  //
+                                     })));
+
+  ResizeBilinearOpModel const_m({TensorType_INT8, {2, 2, 2, 1}}, {3, 3});
+  const_m.SetInput<int8_t>({
+      3, 6,   //
+      9, 12,  //
+      4, 10,  //
+      12, 16  //
+  });
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<int8_t>(), ElementsAreArray(ArrayFloatNear({
+                                               3, 5, 6,     //
+                                               7, 9, 10,    //
+                                               9, 11, 12,   //
+                                               4, 8, 10,    //
+                                               9, 12, 13,   //
+                                               12, 14, 16,  //
+                                           })));
+}
+
+TEST(ResizeBilinearOpTest, ThreeDimensionalResizeUInt8) {
   ResizeBilinearOpModel m({TensorType_UINT8, {1, 2, 2, 2}});
   m.SetInput<uint8>({
       3, 4, 6, 10,     //
@@ -304,6 +398,33 @@ TEST(ResizeBilinearOpTest, ThreeDimensionalResize8Bit) {
                                               10, 12, 12, 14, 14, 16,  //
                                           })));
 }
+
+TEST(ResizeBilinearOpTest, ThreeDimensionalResizeInt8) {
+  ResizeBilinearOpModel m({TensorType_INT8, {1, 2, 2, 2}});
+  m.SetInput<int8_t>({
+      3, 4, 6, 10,     //
+      10, 12, 14, 16,  //
+  });
+  m.SetSize({3, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAreArray(ArrayFloatNear({
+                                         3, 4, 5, 8, 6, 10,       //
+                                         7, 9, 10, 12, 11, 13,    //
+                                         10, 12, 12, 14, 14, 16,  //
+                                     })));
+
+  ResizeBilinearOpModel const_m({TensorType_INT8, {1, 2, 2, 2}}, {3, 3});
+  const_m.SetInput<int8_t>({
+      3, 4, 6, 10,     //
+      10, 12, 14, 16,  //
+  });
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<int8_t>(), ElementsAreArray(ArrayFloatNear({
+                                               3, 4, 5, 8, 6, 10,       //
+                                               7, 9, 10, 12, 11, 13,    //
+                                               10, 12, 12, 14, 14, 16,  //
+                                           })));
+}
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/resize_nearest_neighbor.cc b/tensorflow/lite/kernels/resize_nearest_neighbor.cc
index a48d8004f8b6cead177286328082310237af515a..3030a4f28e22396cbc51e55ff04562fa76a0264e 100644
--- a/tensorflow/lite/kernels/resize_nearest_neighbor.cc
+++ b/tensorflow/lite/kernels/resize_nearest_neighbor.cc
@@ -106,8 +106,14 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
           GetTensorShape(size), GetTensorData<int32>(size),
           GetTensorShape(output), GetTensorData<uint8_t>(output));
     }
+  } else if (output->type == kTfLiteInt8) {
+    reference_ops::ResizeNearestNeighbor(
+        op_params, GetTensorShape(input), GetTensorData<int8_t>(input),
+        GetTensorShape(size), GetTensorData<int32>(size),
+        GetTensorShape(output), GetTensorData<int8_t>(output));
   } else {
-    context->ReportError(context, "Output type is %d, requires float or uint8.",
+    context->ReportError(context,
+                         "Output type is %d, requires float, uint8 or int8.",
                          output->type);
     return kTfLiteError;
   }
diff --git a/tensorflow/lite/kernels/resize_nearest_neighbor_test.cc b/tensorflow/lite/kernels/resize_nearest_neighbor_test.cc
index 03e2effd84c4adb13db1bb3ada4f5cfe1c0b12c9..63b4f13d643e1da312b835f4bf3257b39d103b5a 100644
--- a/tensorflow/lite/kernels/resize_nearest_neighbor_test.cc
+++ b/tensorflow/lite/kernels/resize_nearest_neighbor_test.cc
@@ -79,7 +79,7 @@ TEST(ResizeNearestNeighborOpTest, HorizontalResize) {
               ElementsAreArray(ArrayFloatNear({3, 3, 6})));
 }
 
-TEST(ResizeNearestNeighborOpTest, HorizontalResize8Bit) {
+TEST(ResizeNearestNeighborOpTest, HorizontalResizeUInt8) {
   ResizeNearestNeighborOpModel m({TensorType_UINT8, {1, 1, 2, 1}});
   m.SetInput<uint8>({3, 6});
   m.SetSize({1, 3});
@@ -95,6 +95,21 @@ TEST(ResizeNearestNeighborOpTest, HorizontalResize8Bit) {
               ElementsAreArray(ArrayFloatNear({3, 3, 6})));
 }
 
+TEST(ResizeNearestNeighborOpTest, HorizontalResizeInt8) {
+  ResizeNearestNeighborOpModel m({TensorType_INT8, {1, 1, 2, 1}});
+  m.SetInput<int8_t>({-3, 6});
+  m.SetSize({1, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear({-3, -3, 6})));
+
+  ResizeNearestNeighborOpModel const_m({TensorType_INT8, {1, 1, 2, 1}}, {1, 3});
+  const_m.SetInput<int8_t>({-3, 6});
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear({-3, -3, 6})));
+}
+
 TEST(ResizeNearestNeighborOpTest, VerticalResize) {
   ResizeNearestNeighborOpModel m({TensorType_FLOAT32, {1, 2, 1, 1}});
   m.SetInput<float>({3, 9});
@@ -111,7 +126,7 @@ TEST(ResizeNearestNeighborOpTest, VerticalResize) {
               ElementsAreArray(ArrayFloatNear({3, 3, 9})));
 }
 
-TEST(ResizeNearestNeighborOpTest, VerticalResize8Bit) {
+TEST(ResizeNearestNeighborOpTest, VerticalResizeUInt8) {
   ResizeNearestNeighborOpModel m({TensorType_UINT8, {1, 2, 1, 1}});
   m.SetInput<uint8>({3, 9});
   m.SetSize({3, 1});
@@ -127,6 +142,21 @@ TEST(ResizeNearestNeighborOpTest, VerticalResize8Bit) {
               ElementsAreArray(ArrayFloatNear({3, 3, 9})));
 }
 
+TEST(ResizeNearestNeighborOpTest, VerticalResizeInt8) {
+  ResizeNearestNeighborOpModel m({TensorType_INT8, {1, 2, 1, 1}});
+  m.SetInput<int8_t>({3, -9});
+  m.SetSize({3, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear({3, 3, -9})));
+
+  ResizeNearestNeighborOpModel const_m({TensorType_INT8, {1, 2, 1, 1}}, {3, 1});
+  const_m.SetInput<int8_t>({3, -9});
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<int8_t>(),
+              ElementsAreArray(ArrayFloatNear({3, 3, -9})));
+}
+
 TEST(ResizeNearestNeighborOpTest, TwoDimensionalResize) {
   ResizeNearestNeighborOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}});
   m.SetInput<float>({
@@ -155,7 +185,7 @@ TEST(ResizeNearestNeighborOpTest, TwoDimensionalResize) {
                                           })));
 }
 
-TEST(ResizeNearestNeighborOpTest, TwoDimensionalResize8Bit) {
+TEST(ResizeNearestNeighborOpTest, TwoDimensionalResizeUInt8) {
   ResizeNearestNeighborOpModel m({TensorType_UINT8, {1, 2, 2, 1}});
   m.SetInput<uint8>({
       3, 6,  //
@@ -183,6 +213,33 @@ TEST(ResizeNearestNeighborOpTest, TwoDimensionalResize8Bit) {
                                           })));
 }
 
+TEST(ResizeNearestNeighborOpTest, TwoDimensionalResizeInt8) {
+  ResizeNearestNeighborOpModel m({TensorType_INT8, {1, 2, 2, 1}});
+  m.SetInput<int8_t>({
+      3, -6,  //
+      9, 12   //
+  });
+  m.SetSize({3, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAreArray(ArrayFloatNear({
+                                         3, 3, -6,  //
+                                         3, 3, -6,  //
+                                         9, 9, 12,  //
+                                     })));
+
+  ResizeNearestNeighborOpModel const_m({TensorType_INT8, {1, 2, 2, 1}}, {3, 3});
+  const_m.SetInput<int8_t>({
+      3, -6,  //
+      9, 12   //
+  });
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<int8_t>(), ElementsAreArray(ArrayFloatNear({
+                                               3, 3, -6,  //
+                                               3, 3, -6,  //
+                                               9, 9, 12,  //
+                                           })));
+}
+
 TEST(ResizeNearestNeighborOpTest, TwoDimensionalResizeWithTwoBatches) {
   ResizeNearestNeighborOpModel m({TensorType_FLOAT32, {2, 2, 2, 1}});
   m.SetInput<float>({
@@ -249,7 +306,7 @@ TEST(ResizeNearestNeighborOpTest, ThreeDimensionalResize) {
                                           })));
 }
 
-TEST(ResizeNearestNeighborOpTest, TwoDimensionalResizeWithTwoBatches8Bit) {
+TEST(ResizeNearestNeighborOpTest, TwoDimensionalResizeWithTwoBatchesUInt8) {
   ResizeNearestNeighborOpModel m({TensorType_UINT8, {2, 2, 2, 1}});
   m.SetInput<uint8>({
       3, 6,   //
@@ -287,7 +344,44 @@ TEST(ResizeNearestNeighborOpTest, TwoDimensionalResizeWithTwoBatches8Bit) {
                                           })));
 }
 
-TEST(ResizeNearestNeighborOpTest, ThreeDimensionalResize8Bit) {
+TEST(ResizeNearestNeighborOpTest, TwoDimensionalResizeWithTwoBatchesInt8) {
+  ResizeNearestNeighborOpModel m({TensorType_INT8, {2, 2, 2, 1}});
+  m.SetInput<int8_t>({
+      3, 6,    //
+      9, -12,  //
+      -4, 10,  //
+      12, 16   //
+  });
+  m.SetSize({3, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAreArray(ArrayFloatNear({
+                                         3, 3, 6,     //
+                                         3, 3, 6,     //
+                                         9, 9, -12,   //
+                                         -4, -4, 10,  //
+                                         -4, -4, 10,  //
+                                         12, 12, 16,  //
+                                     })));
+
+  ResizeNearestNeighborOpModel const_m({TensorType_INT8, {2, 2, 2, 1}}, {3, 3});
+  const_m.SetInput<int8_t>({
+      3, 6,    //
+      9, -12,  //
+      -4, 10,  //
+      12, 16   //
+  });
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<int8_t>(), ElementsAreArray(ArrayFloatNear({
+                                               3, 3, 6,     //
+                                               3, 3, 6,     //
+                                               9, 9, -12,   //
+                                               -4, -4, 10,  //
+                                               -4, -4, 10,  //
+                                               12, 12, 16,  //
+                                           })));
+}
+
+TEST(ResizeNearestNeighborOpTest, ThreeDimensionalResizeUInt8) {
   ResizeNearestNeighborOpModel m({TensorType_UINT8, {1, 2, 2, 2}});
   m.SetInput<uint8>({
       3, 4, 6, 10,     //
@@ -315,6 +409,33 @@ TEST(ResizeNearestNeighborOpTest, ThreeDimensionalResize8Bit) {
                                           })));
 }
 
+TEST(ResizeNearestNeighborOpTest, ThreeDimensionalResizeInt8) {
+  ResizeNearestNeighborOpModel m({TensorType_INT8, {1, 2, 2, 2}});
+  m.SetInput<int8_t>({
+      3, 4, -6, 10,     //
+      10, 12, -14, 16,  //
+  });
+  m.SetSize({3, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAreArray(ArrayFloatNear({
+                                         3, 4, 3, 4, -6, 10,       //
+                                         3, 4, 3, 4, -6, 10,       //
+                                         10, 12, 10, 12, -14, 16,  //
+                                     })));
+
+  ResizeNearestNeighborOpModel const_m({TensorType_INT8, {1, 2, 2, 2}}, {3, 3});
+  const_m.SetInput<int8_t>({
+      3, 4, -6, 10,     //
+      10, 12, -14, 16,  //
+  });
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<int8_t>(), ElementsAreArray(ArrayFloatNear({
+                                               3, 4, 3, 4, -6, 10,       //
+                                               3, 4, 3, 4, -6, 10,       //
+                                               10, 12, 10, 12, -14, 16,  //
+                                           })));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/reverse.cc b/tensorflow/lite/kernels/reverse.cc
new file mode 100644
index 0000000000000000000000000000000000000000..855aee8df1c0969bba9ec7d32bee78e04aeccbca
--- /dev/null
+++ b/tensorflow/lite/kernels/reverse.cc
@@ -0,0 +1,127 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace reverse {
+namespace {
+
+constexpr int kInputTensor = 0;
+constexpr int kAxisTensor = 1;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* axis = GetInput(context, node, kAxisTensor);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(axis), 1);
+  TF_LITE_ENSURE(context, NumDimensions(input) >= NumElements(axis));
+
+  if (input->type != kTfLiteInt32 && input->type != kTfLiteFloat32 &&
+      input->type != kTfLiteUInt8 && input->type != kTfLiteInt16 &&
+      input->type != kTfLiteInt64) {
+    context->ReportError(context, "Type '%s' is not supported by reverse.",
+                         TfLiteTypeGetName(input->type));
+    return kTfLiteError;
+  }
+
+  if (axis->type != kTfLiteInt32) {
+    context->ReportError(context, "Axis Type '%s' is not supported by reverse.",
+                         TfLiteTypeGetName(axis->type));
+    return kTfLiteError;
+  }
+
+  // TODO(renjieliu): support multi-axis case.
+  if (NumElements(axis) > 1) {
+    context->ReportError(context, "Current does not support more than 1 axis.");
+  }
+
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  TfLiteIntArray* output_shape = TfLiteIntArrayCopy(input->dims);
+  TF_LITE_ENSURE_EQ(context, output->type, input->type);
+
+  return context->ResizeTensor(context, output, output_shape);
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  const TfLiteTensor* axis_tensor = GetInput(context, node, kAxisTensor);
+  int axis = GetTensorData<int32_t>(axis_tensor)[0];
+
+  TF_LITE_ENSURE(context, axis >= 0 && axis < NumDimensions(input));
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  switch (output->type) {
+    case kTfLiteFloat32: {
+      reference_ops::Reverse<float>(
+          axis, GetTensorShape(input), GetTensorData<float>(input),
+          GetTensorShape(output), GetTensorData<float>(output));
+      break;
+    }
+    case kTfLiteUInt8: {
+      reference_ops::Reverse<uint8_t>(
+          axis, GetTensorShape(input), GetTensorData<uint8_t>(input),
+          GetTensorShape(output), GetTensorData<uint8_t>(output));
+      break;
+    }
+    case kTfLiteInt16: {
+      reference_ops::Reverse<int16_t>(
+          axis, GetTensorShape(input), GetTensorData<int16_t>(input),
+          GetTensorShape(output), GetTensorData<int16_t>(output));
+      break;
+    }
+    case kTfLiteInt32: {
+      reference_ops::Reverse<int32_t>(
+          axis, GetTensorShape(input), GetTensorData<int32_t>(input),
+          GetTensorShape(output), GetTensorData<int32_t>(output));
+      break;
+    }
+    case kTfLiteInt64: {
+      reference_ops::Reverse<int64_t>(
+          axis, GetTensorShape(input), GetTensorData<int64_t>(input),
+          GetTensorShape(output), GetTensorData<int64_t>(output));
+      break;
+    }
+    default: {
+      context->ReportError(context, "Type '%s' is not supported by reverse.",
+                           TfLiteTypeGetName(output->type));
+      return kTfLiteError;
+    }
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace
+}  // namespace reverse
+
+TfLiteRegistration* Register_REVERSE_V2() {
+  static TfLiteRegistration r = {nullptr, nullptr, reverse::Prepare,
+                                 reverse::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/reverse_test.cc b/tensorflow/lite/kernels/reverse_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9bc0c24b64c197d5c9a60ff74bdd53c5ae0352b9
--- /dev/null
+++ b/tensorflow/lite/kernels/reverse_test.cc
@@ -0,0 +1,199 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
+
+template <typename T>
+class ReverseOpModel : public SingleOpModel {
+ public:
+  ReverseOpModel(const TensorData& input, const TensorData& axis) {
+    input_ = AddInput(input);
+    axis_ = AddInput(axis);
+
+    output_ = AddOutput({input.type, {}});
+
+    SetBuiltinOp(BuiltinOperator_REVERSE_V2, BuiltinOptions_ReverseV2Options,
+                 CreateReverseV2Options(builder_).Union());
+    BuildInterpreter({GetShape(input_)});
+  }
+
+  int input() { return input_; }
+  int axis() { return axis_; }
+
+  std::vector<T> GetOutput() { return ExtractVector<T>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  int input_;
+  int axis_;
+  int output_;
+};
+
+// float32 tests.
+TEST(ReverseOpTest, FloatOneDimension) {
+  ReverseOpModel<float> model({TensorType_FLOAT32, {4}},
+                              {TensorType_INT32, {1}});
+  model.PopulateTensor<float>(model.input(), {1, 2, 3, 4});
+  model.PopulateTensor<int32_t>(model.axis(), {0});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4));
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({4, 3, 2, 1}));
+}
+
+TEST(ReverseOpTest, FloatMultiDimensions) {
+  ReverseOpModel<float> model({TensorType_FLOAT32, {4, 3, 2}},
+                              {TensorType_INT32, {1}});
+  model.PopulateTensor<float>(model.input(),
+                              {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+                               13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24});
+  model.PopulateTensor<int32_t>(model.axis(), {1});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4, 3, 2));
+  EXPECT_THAT(
+      model.GetOutput(),
+      ElementsAreArray({5,  6,  3,  4,  1,  2,  11, 12, 9,  10, 7,  8,
+                        17, 18, 15, 16, 13, 14, 23, 24, 21, 22, 19, 20}));
+}
+
+// int32 tests
+TEST(ReverseOpTest, Int32OneDimension) {
+  ReverseOpModel<int32_t> model({TensorType_INT32, {4}},
+                                {TensorType_INT32, {1}});
+  model.PopulateTensor<int32_t>(model.input(), {1, 2, 3, 4});
+  model.PopulateTensor<int32_t>(model.axis(), {0});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4));
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({4, 3, 2, 1}));
+}
+
+TEST(ReverseOpTest, Int32MultiDimensions) {
+  ReverseOpModel<int32_t> model({TensorType_INT32, {4, 3, 2}},
+                                {TensorType_INT32, {1}});
+  model.PopulateTensor<int32_t>(
+      model.input(), {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+                      13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24});
+  model.PopulateTensor<int32_t>(model.axis(), {1});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4, 3, 2));
+  EXPECT_THAT(
+      model.GetOutput(),
+      ElementsAreArray({5,  6,  3,  4,  1,  2,  11, 12, 9,  10, 7,  8,
+                        17, 18, 15, 16, 13, 14, 23, 24, 21, 22, 19, 20}));
+}
+
+// int64 tests
+TEST(ReverseOpTest, Int64OneDimension) {
+  ReverseOpModel<int64_t> model({TensorType_INT64, {4}},
+                                {TensorType_INT32, {1}});
+  model.PopulateTensor<int64_t>(model.input(), {1, 2, 3, 4});
+  model.PopulateTensor<int32_t>(model.axis(), {0});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4));
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({4, 3, 2, 1}));
+}
+
+TEST(ReverseOpTest, Int64MultiDimensions) {
+  ReverseOpModel<int64_t> model({TensorType_INT64, {4, 3, 2}},
+                                {TensorType_INT32, {1}});
+  model.PopulateTensor<int64_t>(
+      model.input(), {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+                      13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24});
+  model.PopulateTensor<int32_t>(model.axis(), {1});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4, 3, 2));
+  EXPECT_THAT(
+      model.GetOutput(),
+      ElementsAreArray({5,  6,  3,  4,  1,  2,  11, 12, 9,  10, 7,  8,
+                        17, 18, 15, 16, 13, 14, 23, 24, 21, 22, 19, 20}));
+}
+
+// uint8 tests
+TEST(ReverseOpTest, Uint8OneDimension) {
+  ReverseOpModel<uint8_t> model({TensorType_UINT8, {4}},
+                                {TensorType_INT32, {1}});
+  model.PopulateTensor<uint8_t>(model.input(), {1, 2, 3, 4});
+  model.PopulateTensor<int32_t>(model.axis(), {0});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4));
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({4, 3, 2, 1}));
+}
+
+TEST(ReverseOpTest, Uint8MultiDimensions) {
+  ReverseOpModel<uint8_t> model({TensorType_UINT8, {4, 3, 2}},
+                                {TensorType_INT32, {1}});
+  model.PopulateTensor<uint8_t>(
+      model.input(), {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+                      13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24});
+  model.PopulateTensor<int32_t>(model.axis(), {1});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4, 3, 2));
+  EXPECT_THAT(
+      model.GetOutput(),
+      ElementsAreArray({5,  6,  3,  4,  1,  2,  11, 12, 9,  10, 7,  8,
+                        17, 18, 15, 16, 13, 14, 23, 24, 21, 22, 19, 20}));
+}
+
+// int16 tests
+TEST(ReverseOpTest, Int16OneDimension) {
+  ReverseOpModel<int16_t> model({TensorType_INT16, {4}},
+                                {TensorType_INT32, {1}});
+  model.PopulateTensor<int16_t>(model.input(), {1, 2, 3, 4});
+  model.PopulateTensor<int32_t>(model.axis(), {0});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4));
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({4, 3, 2, 1}));
+}
+
+TEST(ReverseOpTest, Int16MultiDimensions) {
+  ReverseOpModel<int16_t> model({TensorType_INT16, {4, 3, 2}},
+                                {TensorType_INT32, {1}});
+  model.PopulateTensor<int16_t>(
+      model.input(), {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
+                      13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24});
+  model.PopulateTensor<int32_t>(model.axis(), {1});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(4, 3, 2));
+  EXPECT_THAT(
+      model.GetOutput(),
+      ElementsAreArray({5,  6,  3,  4,  1,  2,  11, 12, 9,  10, 7,  8,
+                        17, 18, 15, 16, 13, 14, 23, 24, 21, 22, 19, 20}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/kernels/select.cc b/tensorflow/lite/kernels/select.cc
index 4687ab44171fab73ff1b4ef93592b25680f3a59f..d1c63d887db00143fb6b154306313411643cf2b8 100644
--- a/tensorflow/lite/kernels/select.cc
+++ b/tensorflow/lite/kernels/select.cc
@@ -89,6 +89,9 @@ TfLiteStatus SelectEval(TfLiteContext* context, TfLiteNode* node) {
     case kTfLiteUInt8:                                                         \
       TF_LITE_SELECT(uint8_t, op);                                             \
       break;                                                                   \
+    case kTfLiteInt8:                                                          \
+      TF_LITE_SELECT(int8_t, op);                                              \
+      break;                                                                   \
     case kTfLiteInt16:                                                         \
       TF_LITE_SELECT(int16_t, op);                                             \
       break;                                                                   \
diff --git a/tensorflow/lite/kernels/select_test.cc b/tensorflow/lite/kernels/select_test.cc
index 5111300e479a92ad9cbf00628750dc61effc50d3..d7cadeb51eb3ee0645eaccc1bbcea59bd279e0d7 100644
--- a/tensorflow/lite/kernels/select_test.cc
+++ b/tensorflow/lite/kernels/select_test.cc
@@ -96,6 +96,19 @@ TEST(SelectOpTest, SelectUInt8) {
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
 }
 
+TEST(SelectOpTest, SelectInt8) {
+  SelectOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, {1, 1, 1, 4},
+                      TensorType_INT8);
+
+  model.PopulateTensor<bool>(model.input1(), {false, true, false, false});
+  model.PopulateTensor<int8_t>(model.input2(), {1, -2, 3, 4});
+  model.PopulateTensor<int8_t>(model.input3(), {5, 6, 7, -8});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput<int8_t>(), ElementsAreArray({5, -2, 7, -8}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+}
+
 TEST(SelectOpTest, SelectInt16) {
   SelectOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, {1, 1, 1, 4},
                       TensorType_INT16);
diff --git a/tensorflow/lite/kernels/slice.cc b/tensorflow/lite/kernels/slice.cc
index 116c81e4d57a9a27dfb0581fe0096f461aa6ab81..650c65d7dc01efb77f28dfc29b674726954c9edf 100644
--- a/tensorflow/lite/kernels/slice.cc
+++ b/tensorflow/lite/kernels/slice.cc
@@ -28,6 +28,11 @@ namespace ops {
 namespace builtin {
 namespace slice {
 
+enum KernelType {
+  kReference,
+  kGenericOptimized,
+};
+
 constexpr int kInputTensor = 0;
 constexpr int kBeginTensor = 1;
 constexpr int kSizeTensor = 2;
@@ -126,6 +131,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   return ResizeOutputShape(context, input, begin, size, output);
 }
 
+template <KernelType kernel_type>
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   const TfLiteTensor* begin = GetInput(context, node, kBeginTensor);
@@ -165,38 +171,47 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   // The dimensions in the kernel used to be in reverse-order, and TFLite
   // arranged the begins and sizes vectors accordingly. This macro incorporates
   // the needed reversing.
-#define TF_LITE_SLICE(data_type)                                           \
-  {                                                                        \
-    TF_LITE_ENSURE_EQ(context, begins.size(), 4);                          \
-    TF_LITE_ENSURE_EQ(context, sizes.size(), 4);                           \
-    tflite::SliceParams op_params;                                         \
-    op_params.begin_count = 4;                                             \
-    op_params.size_count = 4;                                              \
-    for (int i = 0; i < 4; ++i) {                                          \
-      op_params.begin[i] = begins[3 - i];                                  \
-      op_params.size[i] = sizes[3 - i];                                    \
-    }                                                                      \
-                                                                           \
-    optimized_ops::Slice<data_type>(                                       \
-        op_params, GetTensorShape(input), GetTensorData<data_type>(input), \
-        GetTensorShape(output), GetTensorData<data_type>(output));         \
+#define TF_LITE_SLICE(data_type, kernel_type)                                \
+  {                                                                          \
+    TF_LITE_ENSURE_EQ(context, begins.size(), 4);                            \
+    TF_LITE_ENSURE_EQ(context, sizes.size(), 4);                             \
+    tflite::SliceParams op_params;                                           \
+    op_params.begin_count = 4;                                               \
+    op_params.size_count = 4;                                                \
+    for (int i = 0; i < 4; ++i) {                                            \
+      op_params.begin[i] = begins[3 - i];                                    \
+      op_params.size[i] = sizes[3 - i];                                      \
+    }                                                                        \
+                                                                             \
+    if (kernel_type == kGenericOptimized) {                                  \
+      optimized_ops::Slice<data_type>(                                       \
+          op_params, GetTensorShape(input), GetTensorData<data_type>(input), \
+          GetTensorShape(output), GetTensorData<data_type>(output));         \
+    } else {                                                                 \
+      reference_ops::Slice<data_type>(                                       \
+          op_params, GetTensorShape(input), GetTensorData<data_type>(input), \
+          GetTensorShape(output), GetTensorData<data_type>(output));         \
+    }                                                                        \
   }
 
   switch (input->type) {
     case kTfLiteFloat32:
-      TF_LITE_SLICE(float);
+      TF_LITE_SLICE(float, kernel_type);
       break;
     case kTfLiteInt32:
-      TF_LITE_SLICE(int32_t);
+      TF_LITE_SLICE(int32_t, kernel_type);
       break;
     case kTfLiteInt64:
-      TF_LITE_SLICE(int64_t);
+      TF_LITE_SLICE(int64_t, kernel_type);
+      break;
+    case kTfLiteInt8:
+      TF_LITE_SLICE(int8_t, kernel_type);
       break;
     case kTfLiteUInt8:
-      TF_LITE_SLICE(uint8_t);
+      TF_LITE_SLICE(uint8_t, kernel_type);
       break;
     case kTfLiteBool:
-      TF_LITE_SLICE(bool);
+      TF_LITE_SLICE(bool, kernel_type);
       break;
     default:
       context->ReportError(
@@ -209,8 +224,15 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 
 }  // namespace slice
 
+TfLiteRegistration* Register_SLICE_REF() {
+  static TfLiteRegistration r = {nullptr, nullptr, slice::Prepare,
+                                 slice::Eval<slice::kReference>};
+  return &r;
+}
+
 TfLiteRegistration* Register_SLICE() {
-  static TfLiteRegistration r = {nullptr, nullptr, slice::Prepare, slice::Eval};
+  static TfLiteRegistration r = {nullptr, nullptr, slice::Prepare,
+                                 slice::Eval<slice::kGenericOptimized>};
   return &r;
 }
 
diff --git a/tensorflow/lite/kernels/slice_test.cc b/tensorflow/lite/kernels/slice_test.cc
index 563329ddb164d3aa5f13c8ee0d6482d79b84ed32..102218ba23c105014ee6d501d2941f8b4755a44e 100644
--- a/tensorflow/lite/kernels/slice_test.cc
+++ b/tensorflow/lite/kernels/slice_test.cc
@@ -163,6 +163,28 @@ TEST(SliceOpTest, SizeMinus1) {
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({3, 3, 3, 5, 5, 5}));
 }
 
+TEST(SliceOpTest, SliceUint8) {
+  SliceOpModel<uint8_t, int32_t> m({3, 2, 3, 1}, {4}, {4}, TensorType_INT32,
+                                   TensorType_UINT8);
+  m.SetInput({1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6});
+  m.SetBegin({1, 0, 0, 0});
+  m.SetSize({2, 1, -1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 1, 3, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({3, 3, 3, 5, 5, 5}));
+}
+
+TEST(SliceOpTest, SliceInt8) {
+  SliceOpModel<int8_t, int32_t> m({3, 2, 3, 1}, {4}, {4}, TensorType_INT32,
+                                  TensorType_INT8);
+  m.SetInput({1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6});
+  m.SetBegin({1, 0, 0, 0});
+  m.SetSize({2, 1, -1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 1, 3, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({3, 3, 3, 5, 5, 5}));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/space_to_batch_nd.cc b/tensorflow/lite/kernels/space_to_batch_nd.cc
index 1c61b2ef30379e808085f3b0d16a5b1157bea314..2fb7198cd67e8b9d13873d25a2eaa04fd2ff2ae0 100644
--- a/tensorflow/lite/kernels/space_to_batch_nd.cc
+++ b/tensorflow/lite/kernels/space_to_batch_nd.cc
@@ -141,6 +141,15 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
                                   op_context.output->params.zero_point);
       }
       break;
+    case kTfLiteInt8:
+      if (kernel_type == kReference) {
+        TF_LITE_SPACE_TO_BATCH_ND(reference_ops, int8_t,
+                                  op_context.output->params.zero_point);
+      } else {
+        TF_LITE_SPACE_TO_BATCH_ND(optimized_ops, int8_t,
+                                  op_context.output->params.zero_point);
+      }
+      break;
     case kTfLiteInt32:
       if (kernel_type == kReference) {
         TF_LITE_SPACE_TO_BATCH_ND(reference_ops, int32_t, 0);
diff --git a/tensorflow/lite/kernels/space_to_batch_nd_test.cc b/tensorflow/lite/kernels/space_to_batch_nd_test.cc
index 4d55ba56b71c5e0c44f0145981db56cbef6ec99a..52a77984d935d04a79807707729754abd21d3be6 100644
--- a/tensorflow/lite/kernels/space_to_batch_nd_test.cc
+++ b/tensorflow/lite/kernels/space_to_batch_nd_test.cc
@@ -31,8 +31,9 @@ class SpaceToBatchNDOpModel : public SingleOpModel {
     PopulateTensor<float>(input_, data);
   }
 
+  template <typename T>
   void SetQuantizedInput(std::initializer_list<float> data) {
-    QuantizeAndPopulate<uint8_t>(input_, data);
+    QuantizeAndPopulate<T>(input_, data);
   }
 
   void SetBlockShape(std::initializer_list<int> data) {
@@ -46,9 +47,10 @@ class SpaceToBatchNDOpModel : public SingleOpModel {
   std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
   std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
 
+  template <typename T>
   std::vector<float> GetDequantizedOutput() {
-    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
-                               GetScale(output_), GetZeroPoint(output_));
+    return Dequantize<T>(ExtractVector<T>(output_), GetScale(output_),
+                         GetZeroPoint(output_));
   }
 
  protected:
@@ -106,12 +108,14 @@ class SpaceToBatchNDOpDynamicModel : public SpaceToBatchNDOpModel {
   }
 };
 
+#ifdef GTEST_HAS_DEATH_TEST
 TEST(SpaceToBatchNDOpTest, InvalidShapeTest) {
   EXPECT_DEATH(
       SpaceToBatchNDOpConstModel({TensorType_FLOAT32, {1, 3, 3, 1}}, {2, 2},
                                  {0, 0, 0, 0}, {TensorType_FLOAT32}),
       "Cannot allocate tensors");
 }
+#endif
 
 TEST(SpaceToBatchNDOpTest, SimpleConstTest) {
   SpaceToBatchNDOpConstModel m({TensorType_FLOAT32, {1, 4, 4, 1}}, {2, 2},
@@ -220,6 +224,7 @@ class QuantizedSpaceToBatchNDOpTest : public ::testing::Test {
   }
 };
 
+#ifdef GTEST_HAS_DEATH_TEST
 TEST_F(QuantizedSpaceToBatchNDOpTest, ZeroNotInQuantizationRange) {
   // The test_util and actual quantization code currently ensure that the range
   // must include zero, but if that ever changes, this test will catch it.
@@ -228,30 +233,64 @@ TEST_F(QuantizedSpaceToBatchNDOpTest, ZeroNotInQuantizationRange) {
                    {0, 0, 1, 1, 1, 1, 0, 0}, {TensorType_UINT8, {}, 1.0, 2.0}),
                ".*Check failed: f_min <= 0.*");
 }
+#endif
 
-TEST_F(QuantizedSpaceToBatchNDOpTest, SimplePaddingConstTest) {
+TEST_F(QuantizedSpaceToBatchNDOpTest, SimplePaddingConstTestUint8) {
   SpaceToBatchNDOpConstModel m({TensorType_UINT8, {1, 5, 2, 1}, -1.0, 1.0},
                                {3, 2}, {1, 0, 2, 0},
                                {TensorType_UINT8, {}, -1.0, 1.0});
-  m.SetQuantizedInput({-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8, -0.9, 0.1});
+  m.SetQuantizedInput<uint8_t>(
+      {-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8, -0.9, 0.1});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 2, 1}));
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
               ElementsAreArray(DequantizedArrayNear(
                   {0, 0,   0, -0.5, 0, 0,    0, 0.6,  0, -0.1, 0, -0.7,
                    0, 0.2, 0, 0.8,  0, -0.3, 0, -0.9, 0, 0.4,  0, 0.1},
                   -1.0, 1.0)));
 }
 
-TEST_F(QuantizedSpaceToBatchNDOpTest, SimplePaddingDynamicTest) {
+TEST_F(QuantizedSpaceToBatchNDOpTest, SimplePaddingConstTestInt8) {
+  SpaceToBatchNDOpConstModel m({TensorType_INT8, {1, 5, 2, 1}, -1.0, 1.0},
+                               {3, 2}, {1, 0, 2, 0},
+                               {TensorType_INT8, {}, -1.0, 1.0});
+  m.SetQuantizedInput<int8_t>(
+      {-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8, -0.9, 0.1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 2, 1}));
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
+              ElementsAreArray(DequantizedArrayNear(
+                  {0, 0,   0, -0.5, 0, 0,    0, 0.6,  0, -0.1, 0, -0.7,
+                   0, 0.2, 0, 0.8,  0, -0.3, 0, -0.9, 0, 0.4,  0, 0.1},
+                  -1.0, 1.0)));
+}
+
+TEST_F(QuantizedSpaceToBatchNDOpTest, SimplePaddingDynamicTestUint8) {
   SpaceToBatchNDOpDynamicModel m({TensorType_UINT8, {1, 5, 2, 1}, -1.0, 1.0},
                                  {TensorType_UINT8, {}, -1.0, 1.0});
-  m.SetQuantizedInput({-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8, -0.9, 0.1});
+  m.SetQuantizedInput<uint8_t>(
+      {-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8, -0.9, 0.1});
+  m.SetBlockShape({3, 2});
+  m.SetPaddings({1, 0, 2, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 2, 1}));
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(DequantizedArrayNear(
+                  {0, 0,   0, -0.5, 0, 0,    0, 0.6,  0, -0.1, 0, -0.7,
+                   0, 0.2, 0, 0.8,  0, -0.3, 0, -0.9, 0, 0.4,  0, 0.1},
+                  -1.0, 1.0)));
+}
+
+TEST_F(QuantizedSpaceToBatchNDOpTest, SimplePaddingDynamicTestInt8) {
+  SpaceToBatchNDOpDynamicModel m({TensorType_INT8, {1, 5, 2, 1}, -1.0, 1.0},
+                                 {TensorType_INT8, {}, -1.0, 1.0});
+  m.SetQuantizedInput<int8_t>(
+      {-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8, -0.9, 0.1});
   m.SetBlockShape({3, 2});
   m.SetPaddings({1, 0, 2, 0});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 2, 1}));
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<int8_t>(),
               ElementsAreArray(DequantizedArrayNear(
                   {0, 0,   0, -0.5, 0, 0,    0, 0.6,  0, -0.1, 0, -0.7,
                    0, 0.2, 0, 0.8,  0, -0.3, 0, -0.9, 0, 0.4,  0, 0.1},
@@ -262,10 +301,10 @@ TEST_F(QuantizedSpaceToBatchNDOpTest, ComplexPaddingConstTest) {
   SpaceToBatchNDOpConstModel m({TensorType_UINT8, {1, 4, 2, 1}, -1.0, 1.0},
                                {3, 2}, {1, 1, 2, 4},
                                {TensorType_UINT8, {}, -1.0, 1.0});
-  m.SetQuantizedInput({-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8});
+  m.SetQuantizedInput<uint8_t>({-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 4, 1}));
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
               ElementsAreArray(DequantizedArrayNear(
                   {
                       0, 0,    0, 0, 0, -0.5, 0, 0, 0, 0,   0, 0, 0, 0.6, 0, 0,
@@ -278,12 +317,12 @@ TEST_F(QuantizedSpaceToBatchNDOpTest, ComplexPaddingConstTest) {
 TEST_F(QuantizedSpaceToBatchNDOpTest, ComplexPaddingDynamicTest) {
   SpaceToBatchNDOpDynamicModel m({TensorType_UINT8, {1, 4, 2, 1}, -1.0, 1.0},
                                  {TensorType_UINT8, {}, -1.0, 1.0});
-  m.SetQuantizedInput({-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8});
+  m.SetQuantizedInput<uint8_t>({-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8});
   m.SetBlockShape({3, 2});
   m.SetPaddings({1, 1, 2, 4});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 4, 1}));
-  EXPECT_THAT(m.GetDequantizedOutput(),
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
               ElementsAreArray(DequantizedArrayNear(
                   {
                       0, 0,    0, 0, 0, -0.5, 0, 0, 0, 0,   0, 0, 0, 0.6, 0, 0,
diff --git a/tensorflow/lite/kernels/space_to_depth.cc b/tensorflow/lite/kernels/space_to_depth.cc
index 79e28bf47d98b64572d9e7404f8d69788cd30e08..cf6b0bd4d3d4b61b87a14d1090a7e89d9b77a0f2 100644
--- a/tensorflow/lite/kernels/space_to_depth.cc
+++ b/tensorflow/lite/kernels/space_to_depth.cc
@@ -50,7 +50,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   auto data_type = output->type;
   TF_LITE_ENSURE(context,
                  data_type == kTfLiteFloat32 || data_type == kTfLiteUInt8 ||
-                     data_type == kTfLiteInt32 || data_type == kTfLiteInt64);
+                     data_type == kTfLiteInt8 || data_type == kTfLiteInt32 ||
+                     data_type == kTfLiteInt64);
   TF_LITE_ENSURE_EQ(context, input->type, output->type);
 
   const int block_size = params->block_size;
@@ -100,6 +101,13 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
         TF_LITE_SPACE_TO_DEPTH(optimized_ops, uint8_t);
       }
       break;
+    case kTfLiteInt8:
+      if (kernel_type == kReference) {
+        TF_LITE_SPACE_TO_DEPTH(reference_ops, int8_t);
+      } else {
+        TF_LITE_SPACE_TO_DEPTH(optimized_ops, int8_t);
+      }
+      break;
     case kTfLiteInt32:
       if (kernel_type == kReference) {
         TF_LITE_SPACE_TO_DEPTH(reference_ops, int32_t);
diff --git a/tensorflow/lite/kernels/space_to_depth_test.cc b/tensorflow/lite/kernels/space_to_depth_test.cc
index 5744669b6d62af61a0b20e7723b78c72f6db952d..58665fc9d83007d7bed638418cba058e4ff189c5 100644
--- a/tensorflow/lite/kernels/space_to_depth_test.cc
+++ b/tensorflow/lite/kernels/space_to_depth_test.cc
@@ -50,10 +50,12 @@ class SpaceToDepthOpModel : public SingleOpModel {
   int output_;
 };
 
+#ifdef GTEST_HAS_DEATH_TEST
 TEST(SpaceToDepthOpModel, BadBlockSize) {
   EXPECT_DEATH(SpaceToDepthOpModel({TensorType_FLOAT32, {1, 2, 2, 1}}, 3),
                "Cannot allocate tensors");
 }
+#endif
 
 TEST(SpaceToDepthOpModel, Float32) {
   SpaceToDepthOpModel m({TensorType_FLOAT32, {1, 2, 2, 2}}, 2);
@@ -72,6 +74,14 @@ TEST(SpaceToDepthOpModel, Uint8) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAre(1, 1, 1, 4));
 }
 
+TEST(SpaceToDepthOpModel, int8) {
+  SpaceToDepthOpModel m({TensorType_INT8, {1, 2, 2, 1}}, 2);
+  m.SetInput<int8_t>({1, 2, 3, 4});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<int8_t>(), ElementsAreArray({1, 2, 3, 4}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAre(1, 1, 1, 4));
+}
+
 TEST(SpaceToDepthOpModel, Int32) {
   SpaceToDepthOpModel m({TensorType_INT32, {1, 2, 2, 3}}, 2);
   m.SetInput<int32_t>({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
diff --git a/tensorflow/lite/kernels/sparse_output_fully_connected.cc b/tensorflow/lite/kernels/sparse_output_fully_connected.cc
deleted file mode 100644
index 73d850f0e2d094e9cc620f4f4733354d603b2a77..0000000000000000000000000000000000000000
--- a/tensorflow/lite/kernels/sparse_output_fully_connected.cc
+++ /dev/null
@@ -1,243 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// SparseOutputFullyConnected is a fully connected layer that uses a single
-// row in the weights and bias via a lookup.
-#include "tensorflow/lite/context.h"
-#include "tensorflow/lite/kernels/internal/tensor.h"
-#include "tensorflow/lite/kernels/internal/tensor_utils.h"
-#include "tensorflow/lite/kernels/kernel_util.h"
-
-namespace tflite {
-namespace ops {
-namespace custom {
-namespace sparse_output_fully_connected {
-
-// Input tensors of size {n_batch, n_input}
-constexpr int kInputTensor = 0;
-// Auxiliary input tensor of size { 1 }
-constexpr int kInputLookupTensor = 1;
-
-// Weights tensor of size { n_embeddings , n_input }
-constexpr int kWeightsTensor = 2;
-// Bias tensor of size { n_embeddings }
-constexpr int kBiasTensor = 3;
-
-// Output tensor.
-constexpr int kOutputTensor = 0;
-
-// Temporary tensors.
-enum TemporaryTensor {
-  kInputQuantized = 0,
-  kScalingFactors = 1,
-  kNumTemporaryTensors = 2
-};
-
-// Struct to hold op data.
-struct OpData {
-  int scratch_tensor_index;
-};
-
-void* Init(TfLiteContext* context, const char* buffer, size_t length) {
-  auto* data = new OpData;
-  context->AddTensors(context, /*tensors_to_add=*/kNumTemporaryTensors,
-                      &data->scratch_tensor_index);
-  return data;
-}
-
-void Free(TfLiteContext* context, void* buffer) {
-  delete reinterpret_cast<OpData*>(buffer);
-}
-
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
-
-  TF_LITE_ENSURE_EQ(context, node->inputs->size, 4);
-  TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
-
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
-  TF_LITE_ENSURE_EQ(context, NumDimensions(input), 2);
-  const int n_batch = SizeOfDimension(input, 0);
-  const int n_input = SizeOfDimension(input, 1);
-
-  const TfLiteTensor* lookup = GetInput(context, node, kInputLookupTensor);
-  TF_LITE_ENSURE_EQ(context, lookup->type, kTfLiteInt32);
-  TF_LITE_ENSURE_EQ(context, NumDimensions(lookup), 1);
-  // Only support single lookup.
-  TF_LITE_ENSURE_EQ(context, SizeOfDimension(lookup, 0), 1);
-
-  const TfLiteTensor* weights = GetInput(context, node, kWeightsTensor);
-  TF_LITE_ENSURE_EQ(context, NumDimensions(weights), 2);
-  TF_LITE_ENSURE_EQ(context, SizeOfDimension(weights, 1), n_input);
-
-  const TfLiteTensor* bias = GetInput(context, node, kBiasTensor);
-  TF_LITE_ENSURE_EQ(context, NumElements(bias), SizeOfDimension(weights, 0));
-
-  const bool is_hybrid_op =
-      (weights->type == kTfLiteUInt8 && input->type == kTfLiteFloat32);
-
-  // Resize output.
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-  TfLiteIntArray* output_size_array = TfLiteIntArrayCreate(1);
-  output_size_array->data[0] = 1;
-  TF_LITE_ENSURE_OK(context,
-                    context->ResizeTensor(context, output, output_size_array));
-
-  if (is_hybrid_op) {
-    TfLiteIntArrayFree(node->temporaries);
-    node->temporaries = TfLiteIntArrayCreate(kNumTemporaryTensors);
-
-    // Allocate temporary tensors to store quantized values of input.
-    node->temporaries->data[kInputQuantized] = op_data->scratch_tensor_index;
-    TfLiteTensor* input_quantized =
-        GetTemporary(context, node, /*index=*/kInputQuantized);
-    input_quantized->type = kTfLiteUInt8;
-    input_quantized->allocation_type = kTfLiteArenaRw;
-    if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) {
-      TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims);
-      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_quantized,
-                                                       input_quantized_size));
-    }
-
-    // Tell interpreter to allocate temporary tensors to store scaling factors.
-    node->temporaries->data[kScalingFactors] =
-        op_data->scratch_tensor_index + kScalingFactors;
-    TfLiteTensor* scaling_factors =
-        GetTemporary(context, node, /*index=*/kScalingFactors);
-    scaling_factors->type = kTfLiteFloat32;
-    scaling_factors->allocation_type = kTfLiteArenaRw;
-    int scaling_dims[1] = {n_batch};
-    if (!TfLiteIntArrayEqualsArray(scaling_factors->dims, 1, scaling_dims)) {
-      TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1);
-      scaling_factors_size->data[0] = n_batch;
-      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors,
-                                                       scaling_factors_size));
-    }
-  }
-  return kTfLiteOk;
-}
-
-TfLiteStatus EvalFloat(const TfLiteTensor* input, const TfLiteTensor* lookup,
-                       const TfLiteTensor* weights, const TfLiteTensor* bias,
-                       TfLiteTensor* output) {
-  const int n_batch = SizeOfDimension(input, 0);
-  const int n_input = SizeOfDimension(input, 1);
-
-  const float* input_ptr_batch = input->data.f;
-
-  // Initialize pointer to right row according to lookup value.
-  int32 lookup_index = lookup->data.i32[0];
-  const float* weights_ptr = weights->data.f + lookup_index * n_input;
-
-  // Initialize output to bias.
-  if (bias) {
-    float* bias_ptr = bias->data.f + lookup_index;
-    tensor_utils::VectorBatchVectorAssign(bias_ptr, 1, n_batch, output->data.f);
-  } else {
-    tensor_utils::ZeroVector(output->data.f, n_batch * 1);
-  }
-
-  tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-      weights_ptr, /*m_rows=*/1, n_input, input_ptr_batch, n_batch,
-      output->data.f, /*result_stride=*/1);
-
-  return kTfLiteOk;
-}
-
-TfLiteStatus EvalHybrid(const TfLiteTensor* input, const TfLiteTensor* lookup,
-                        const TfLiteTensor* weights, const TfLiteTensor* bias,
-                        TfLiteTensor* scaling_factors,
-                        TfLiteTensor* input_quantized, TfLiteTensor* output) {
-  const int n_batch = SizeOfDimension(input, 0);
-  const int n_input = SizeOfDimension(input, 1);
-
-  const float* input_ptr_batch = input->data.f;
-  // Initialize the pointer to storage for quantized values and
-  // scaling factors.
-  int8_t* quantized_input_ptr_batch =
-      reinterpret_cast<int8_t*>(input_quantized->data.uint8);
-  float* scaling_factors_ptr = scaling_factors->data.f;
-
-  // Initialize pointer to right row according to lookup value.
-  int32 lookup_index = lookup->data.i32[0];
-  int8_t* weights_ptr =
-      reinterpret_cast<int8_t*>(weights->data.uint8) + lookup_index * n_input;
-
-  // Initialize output to bias.
-  if (bias) {
-    float* bias_ptr = bias->data.f + lookup_index;
-    tensor_utils::VectorBatchVectorAssign(bias_ptr, 1, n_batch, output->data.f);
-  } else {
-    tensor_utils::ZeroVector(output->data.f, n_batch * 1);
-  }
-
-  if (!tensor_utils::IsZeroVector(input_ptr_batch, n_batch * n_input)) {
-    // Quantize input from float to int8.
-    float unused_min, unused_max;
-    for (int b = 0; b < n_batch; ++b) {
-      const int offset = b * n_input;
-      tensor_utils::SymmetricQuantizeFloats(
-          input_ptr_batch + offset, n_input, quantized_input_ptr_batch + offset,
-          &unused_min, &unused_max, &scaling_factors_ptr[b]);
-      scaling_factors_ptr[b] *= weights->params.scale;
-    }
-
-    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
-        weights_ptr, /*m_rows=*/1, n_input, quantized_input_ptr_batch,
-        scaling_factors_ptr, n_batch, output->data.f, /*result_stride=*/1);
-  }
-  return kTfLiteOk;
-}
-
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-  const TfLiteTensor* lookup = GetInput(context, node, kInputLookupTensor);
-  const TfLiteTensor* weights = GetInput(context, node, kWeightsTensor);
-  const TfLiteTensor* bias = GetInput(context, node, kBiasTensor);
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-
-  switch (weights->type) {
-    case kTfLiteFloat32: {
-      return EvalFloat(input, lookup, weights, bias, output);
-    }
-    case kTfLiteUInt8: {
-      TfLiteTensor* input_quantized =
-          GetTemporary(context, node, /*index=*/kInputQuantized);
-      TfLiteTensor* scaling_factors =
-          GetTemporary(context, node, /*index=*/kScalingFactors);
-      return EvalHybrid(input, lookup, weights, bias, scaling_factors,
-                        input_quantized, output);
-    }
-    default:
-      context->ReportError(context, "Type %d is not currently supported.",
-                           weights->type);
-      return kTfLiteError;
-  }
-  return kTfLiteOk;
-}
-
-}  // namespace sparse_output_fully_connected
-
-TfLiteRegistration* Register_SPARSE_OUTPUT_FULLY_CONNECTED() {
-  static TfLiteRegistration r = {sparse_output_fully_connected::Init,
-                                 sparse_output_fully_connected::Free,
-                                 sparse_output_fully_connected::Prepare,
-                                 sparse_output_fully_connected::Eval};
-  return &r;
-}
-
-}  // namespace custom
-}  // namespace ops
-}  // namespace tflite
diff --git a/tensorflow/lite/kernels/sparse_output_fully_connected_test.cc b/tensorflow/lite/kernels/sparse_output_fully_connected_test.cc
deleted file mode 100644
index c25a32bde001e632afff2a34ad168467c092bcf5..0000000000000000000000000000000000000000
--- a/tensorflow/lite/kernels/sparse_output_fully_connected_test.cc
+++ /dev/null
@@ -1,158 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-// Unit test for TFLite sparse output fully connected op.
-#include <iomanip>
-#include <random>
-#include <vector>
-
-#include <gtest/gtest.h>
-#include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
-#include "tensorflow/lite/kernels/register.h"
-#include "tensorflow/lite/kernels/test_util.h"
-
-namespace tflite {
-
-namespace ops {
-namespace custom {
-
-TfLiteRegistration* Register_SPARSE_OUTPUT_FULLY_CONNECTED();
-
-namespace {
-
-using ::testing::ElementsAreArray;
-
-class BaseSparseOutputFullyConnectedOpModel : public SingleOpModel {
- public:
-  BaseSparseOutputFullyConnectedOpModel(const TensorData& input,
-                                        const TensorData& weights,
-                                        const TensorData& output = {
-                                            TensorType_FLOAT32}) {
-    input_ = AddInput(input);
-    lookup_ = AddInput({TensorType_INT32, {1}});
-    weights_ = AddInput(weights);
-    int bias_size = GetShape(weights_)[0];
-    bias_ = AddInput({TensorType_FLOAT32, {bias_size}});
-    output_ = AddOutput(output);
-
-    // Create empty (required) options map.
-    flexbuffers::Builder fbb;
-    fbb.Map([&]() {});
-    fbb.Finish();
-
-    SetCustomOp("SPARSE_OUTPUT_FULLY_CONNECTED", fbb.GetBuffer(),
-                Register_SPARSE_OUTPUT_FULLY_CONNECTED);
-    BuildInterpreter({GetShape(input_), GetShape(lookup_), GetShape(weights_),
-                      GetShape(bias_)});
-  }
-
-  void SetInput(const std::vector<float>& data) {
-    PopulateTensor(input_, data);
-  }
-
-  void SetLookup(const std::vector<int32>& f) { PopulateTensor(lookup_, f); }
-
-  void SetBias(const std::vector<float>& f) { PopulateTensor(bias_, f); }
-
-  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
-
- protected:
-  int input_;
-  int lookup_;
-  int weights_;
-  int bias_;
-  int output_;
-};
-
-class FloatSparseOutputFullyConnectedOpModel
-    : public BaseSparseOutputFullyConnectedOpModel {
- public:
-  using BaseSparseOutputFullyConnectedOpModel::
-      BaseSparseOutputFullyConnectedOpModel;
-
-  void SetWeights(const std::vector<float>& f) { PopulateTensor(weights_, f); }
-};
-
-class HybridSparseOutputFullyConnectedOpModel
-    : public BaseSparseOutputFullyConnectedOpModel {
- public:
-  using BaseSparseOutputFullyConnectedOpModel::
-      BaseSparseOutputFullyConnectedOpModel;
-
-  void SetWeights(const std::vector<float>& f) {
-    SymmetricQuantizeAndPopulate(weights_, f);
-  }
-};
-
-TEST(SparseOutputFullyConnectedOpTest, SimpleTestFloat) {
-  FloatSparseOutputFullyConnectedOpModel m({TensorType_FLOAT32, {1, 5}},
-                                           {TensorType_FLOAT32, {3, 5}},
-                                           {TensorType_FLOAT32, {}});
-
-  m.SetInput({-1.0, 0.0, 1.0, 2.0, 3.0});
-
-  m.SetLookup({2});
-
-  m.SetWeights({
-      -1.0, 0.0, 1.0, 2.0, 3.0,  //
-      0.0, 1.0, 2.0, 3.0, 4.0,   //
-      1.0, 2.0, 3.0, 4.0, 5.0,   //
-  });
-
-  m.SetBias({1.0, 2.0, 3.0});
-
-  m.Invoke();
-
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({28}));
-}
-
-TEST(SparseOutputFullyConnectedOpTest, SimpleTestHybrid) {
-  HybridSparseOutputFullyConnectedOpModel m({TensorType_FLOAT32, {1, 5}},
-                                            {TensorType_UINT8, {3, 5}},
-                                            {TensorType_FLOAT32, {}});
-
-  m.SetInput({-1.0, 0.0, 1.0, 2.0, 3.0});
-
-  m.SetLookup({2});
-
-  m.SetWeights({
-      -1.0, 0.0, 1.0, 2.0, 3.0,  //
-      0.0, 1.0, 2.0, 3.0, 4.0,   //
-      1.0, 2.0, 3.0, 4.0, 5.0,   //
-  });
-
-  m.SetBias({1.0, 2.0, 3.0});
-
-  m.Invoke();
-
-  // We get 28.0552 instead of 28.
-  //
-  // Input -> -42, 0, 42, 85, 127 with scale factor of 127/3.
-  // Looked up weights ->  25, 51, 76, 102, 127 with scale factor of 127/5.
-  //
-  // (-42 * 25 + 0 * 51 + 42 * 76 + 85 * 102 + 127 * 127) * (3*5/127^2) + 3.0
-  // gives us the expected result.
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({28}, 0.0553)));
-}
-
-}  // namespace
-}  // namespace custom
-}  // namespace ops
-}  // namespace tflite
-
-int main(int argc, char** argv) {
-  ::tflite::LogToStderr();
-  ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/lite/kernels/split.cc b/tensorflow/lite/kernels/split.cc
index 7902ed2a46d297cca6f076bf1bb48580f3c4bf40..c0f701f55dd096279e1e9f1e54817490cb0c230b 100644
--- a/tensorflow/lite/kernels/split.cc
+++ b/tensorflow/lite/kernels/split.cc
@@ -76,9 +76,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), op_context.params->num_splits);
 
   auto input_type = op_context.input->type;
-  TF_LITE_ENSURE(context, input_type == kTfLiteFloat32 ||
-                              input_type == kTfLiteUInt8 ||
-                              input_type == kTfLiteInt16);
+  TF_LITE_ENSURE(context,
+                 input_type == kTfLiteFloat32 || input_type == kTfLiteUInt8 ||
+                     input_type == kTfLiteInt8 || input_type == kTfLiteInt16 ||
+                     input_type == kTfLiteInt32);
   for (int i = 0; i < NumOutputs(node); ++i) {
     GetOutput(context, node, i)->type = input_type;
   }
@@ -137,15 +138,23 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       TF_LITE_SPLIT(uint8_t);
       break;
     }
+    case kTfLiteInt8: {
+      TF_LITE_SPLIT(int8_t);
+      break;
+    }
     case kTfLiteInt16: {
       TF_LITE_SPLIT(int16_t);
       break;
     }
+    case kTfLiteInt32: {
+      TF_LITE_SPLIT(int32_t);
+      break;
+    }
     default:
-      context->ReportError(
-          context,
-          "Only float32, uint8 and int16 are currently supported, got %d.",
-          op_context.input->type);
+      context->ReportError(context,
+                           "Only float32, uint8, int8, int16 and int32 are "
+                           "currently supported, got %d.",
+                           op_context.input->type);
       return kTfLiteError;
   }
 #undef TF_LITE_SPLIT
diff --git a/tensorflow/lite/kernels/split_test.cc b/tensorflow/lite/kernels/split_test.cc
index f3d9ea3bf4158dd51b5102b942125b7561024c19..fa313d4b18f803dc5060425d8162af25129dd5d9 100644
--- a/tensorflow/lite/kernels/split_test.cc
+++ b/tensorflow/lite/kernels/split_test.cc
@@ -47,13 +47,15 @@ class SplitOpModel : public SingleOpModel {
     }
   }
 
-  void SetInput(std::initializer_list<float> data) {
+  template <typename T>
+  void SetInput(std::initializer_list<T> data) {
     PopulateTensor(input_, data);
   }
   void SetAxis(int axis) { PopulateTensor(axis_, {axis}); }
 
-  std::vector<float> GetOutput(int i) {
-    return ExtractVector<float>(outputs_[i]);
+  template <typename T>
+  std::vector<T> GetOutput(int i) {
+    return ExtractVector<T>(outputs_[i]);
   }
   std::vector<int> GetOutputShape(int i) { return GetTensorShape(outputs_[i]); }
 
@@ -63,33 +65,34 @@ class SplitOpModel : public SingleOpModel {
   std::vector<int> outputs_;
 };
 
-using TensorValues = std::initializer_list<float>;
-
+template <typename T>
 void Check(int axis, int num_splits, std::initializer_list<int> input_shape,
            std::initializer_list<int> output_shape,
-           const TensorValues& input_data,
-           const std::vector<TensorValues>& output_data) {
+           const std::initializer_list<T>& input_data,
+           const std::vector<std::initializer_list<T>>& output_data,
+           const TensorType& type = TensorType_FLOAT32) {
   auto debug = [&](int i) {
     std::stringstream ss;
     ss << "for output tensor " << i << " axis=" << axis
        << " and num_splits=" << num_splits;
     return ss.str();
   };
-  SplitOpModel m({TensorType_FLOAT32, input_shape}, num_splits);
+  SplitOpModel m({type, input_shape}, num_splits);
   m.SetInput(input_data);
   m.SetAxis(axis);
   m.Invoke();
   for (int i = 0; i < num_splits; ++i) {
-    EXPECT_THAT(m.GetOutput(i), ElementsAreArray(output_data[i])) << debug(i);
+    EXPECT_THAT(m.GetOutput<T>(i), ElementsAreArray(output_data[i]))
+        << debug(i);
     EXPECT_THAT(m.GetOutputShape(i), ElementsAreArray(output_shape))
         << debug(i);
   }
 
-  SplitOpModel const_m({TensorType_FLOAT32, input_shape}, num_splits, axis);
+  SplitOpModel const_m({type, input_shape}, num_splits, axis);
   const_m.SetInput(input_data);
   const_m.Invoke();
   for (int i = 0; i < num_splits; ++i) {
-    EXPECT_THAT(const_m.GetOutput(i), ElementsAreArray(output_data[i]))
+    EXPECT_THAT(const_m.GetOutput<T>(i), ElementsAreArray(output_data[i]))
         << debug(i);
     EXPECT_THAT(const_m.GetOutputShape(i), ElementsAreArray(output_shape))
         << debug(i);
@@ -97,44 +100,106 @@ void Check(int axis, int num_splits, std::initializer_list<int> input_shape,
 }
 
 TEST(SplitOpTest, FourDimensional) {
-  Check(/*axis=*/0, /*num_splits=*/2, {2, 2, 2, 2}, {1, 2, 2, 2},
-        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
-        {
-            {1, 2, 3, 4, 5, 6, 7, 8},
-            {9, 10, 11, 12, 13, 14, 15, 16},
-        });
-  Check(/*axis=*/1, /*num_splits=*/2, {2, 2, 2, 2}, {2, 1, 2, 2},
-        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
-        {
-            {1, 2, 3, 4, 9, 10, 11, 12},
-            {5, 6, 7, 8, 13, 14, 15, 16},
-        });
-  Check(/*axis=*/2, /*num_splits=*/2, {2, 2, 2, 2}, {2, 2, 1, 2},
-        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
-        {
-            {1, 2, 5, 6, 9, 10, 13, 14},
-            {3, 4, 7, 8, 11, 12, 15, 16},
-        });
-  Check(/*axis=*/3, /*num_splits=*/2, {2, 2, 2, 2}, {2, 2, 2, 1},
-        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
-        {
-            {1, 3, 5, 7, 9, 11, 13, 15},
-            {2, 4, 6, 8, 10, 12, 14, 16},
-        });
+  Check<float>(/*axis=*/0, /*num_splits=*/2, {2, 2, 2, 2}, {1, 2, 2, 2},
+               {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+               {
+                   {1, 2, 3, 4, 5, 6, 7, 8},
+                   {9, 10, 11, 12, 13, 14, 15, 16},
+               });
+  Check<float>(/*axis=*/1, /*num_splits=*/2, {2, 2, 2, 2}, {2, 1, 2, 2},
+               {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+               {
+                   {1, 2, 3, 4, 9, 10, 11, 12},
+                   {5, 6, 7, 8, 13, 14, 15, 16},
+               });
+  Check<float>(/*axis=*/2, /*num_splits=*/2, {2, 2, 2, 2}, {2, 2, 1, 2},
+               {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+               {
+                   {1, 2, 5, 6, 9, 10, 13, 14},
+                   {3, 4, 7, 8, 11, 12, 15, 16},
+               });
+  Check<float>(/*axis=*/3, /*num_splits=*/2, {2, 2, 2, 2}, {2, 2, 2, 1},
+               {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+               {
+                   {1, 3, 5, 7, 9, 11, 13, 15},
+                   {2, 4, 6, 8, 10, 12, 14, 16},
+               });
+}
+
+TEST(SplitOpTest, FourDimensionalInt8) {
+  Check<int8_t>(/*axis=*/0, /*num_splits=*/2, {2, 2, 2, 2}, {1, 2, 2, 2},
+                {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+                {
+                    {1, 2, 3, 4, 5, 6, 7, 8},
+                    {9, 10, 11, 12, 13, 14, 15, 16},
+                },
+                TensorType_INT8);
+  Check<int8_t>(/*axis=*/1, /*num_splits=*/2, {2, 2, 2, 2}, {2, 1, 2, 2},
+                {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+                {
+                    {1, 2, 3, 4, 9, 10, 11, 12},
+                    {5, 6, 7, 8, 13, 14, 15, 16},
+                },
+                TensorType_INT8);
+  Check<int8_t>(/*axis=*/2, /*num_splits=*/2, {2, 2, 2, 2}, {2, 2, 1, 2},
+                {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+                {
+                    {1, 2, 5, 6, 9, 10, 13, 14},
+                    {3, 4, 7, 8, 11, 12, 15, 16},
+                },
+                TensorType_INT8);
+  Check<int8_t>(/*axis=*/3, /*num_splits=*/2, {2, 2, 2, 2}, {2, 2, 2, 1},
+                {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+                {
+                    {1, 3, 5, 7, 9, 11, 13, 15},
+                    {2, 4, 6, 8, 10, 12, 14, 16},
+                },
+                TensorType_INT8);
+}
+
+TEST(SplitOpTest, FourDimensionalInt32) {
+  Check<int32_t>(/*axis=*/0, /*num_splits=*/2, {2, 2, 2, 2}, {1, 2, 2, 2},
+                 {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+                 {
+                     {1, 2, 3, 4, 5, 6, 7, 8},
+                     {9, 10, 11, 12, 13, 14, 15, 16},
+                 },
+                 TensorType_INT32);
+  Check<int32_t>(/*axis=*/1, /*num_splits=*/2, {2, 2, 2, 2}, {2, 1, 2, 2},
+                 {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+                 {
+                     {1, 2, 3, 4, 9, 10, 11, 12},
+                     {5, 6, 7, 8, 13, 14, 15, 16},
+                 },
+                 TensorType_INT32);
+  Check<int32_t>(/*axis=*/2, /*num_splits=*/2, {2, 2, 2, 2}, {2, 2, 1, 2},
+                 {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+                 {
+                     {1, 2, 5, 6, 9, 10, 13, 14},
+                     {3, 4, 7, 8, 11, 12, 15, 16},
+                 },
+                 TensorType_INT32);
+  Check<int32_t>(/*axis=*/3, /*num_splits=*/2, {2, 2, 2, 2}, {2, 2, 2, 1},
+                 {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+                 {
+                     {1, 3, 5, 7, 9, 11, 13, 15},
+                     {2, 4, 6, 8, 10, 12, 14, 16},
+                 },
+                 TensorType_INT32);
 }
 
 TEST(SplitOpTest, OneDimensional) {
-  Check(/*axis=*/0, /*num_splits=*/8, {8}, {1}, {1, 2, 3, 4, 5, 6, 7, 8},
-        {{1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}});
+  Check<float>(/*axis=*/0, /*num_splits=*/8, {8}, {1}, {1, 2, 3, 4, 5, 6, 7, 8},
+               {{1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}});
 }
 
 TEST(SplitOpTest, NegativeAxis) {
-  Check(/*axis=*/-4, /*num_splits=*/2, {2, 2, 2, 2}, {1, 2, 2, 2},
-        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
-        {
-            {1, 2, 3, 4, 5, 6, 7, 8},
-            {9, 10, 11, 12, 13, 14, 15, 16},
-        });
+  Check<float>(/*axis=*/-4, /*num_splits=*/2, {2, 2, 2, 2}, {1, 2, 2, 2},
+               {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+               {
+                   {1, 2, 3, 4, 5, 6, 7, 8},
+                   {9, 10, 11, 12, 13, 14, 15, 16},
+               });
 }
 
 }  // namespace
diff --git a/tensorflow/lite/kernels/squared_difference.cc b/tensorflow/lite/kernels/squared_difference.cc
index 59b53a6287dbbc863a61875be82090c1b9c6d442..3661cf9f98c5d0133090ae926f8d76e54f428eba 100644
--- a/tensorflow/lite/kernels/squared_difference.cc
+++ b/tensorflow/lite/kernels/squared_difference.cc
@@ -105,10 +105,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   } else if (output->type == kTfLiteInt32) {
     EvalSquaredDifference<int32_t>(context, node, data, input1, input2, output);
   } else {
-    context->ReportError(context,
-                         "SquaredDifference only supports FLOAT32, INT32 and "
-                         "quantized UINT8 now, got %d.",
-                         output->type);
+    context->ReportError(
+        context,
+        "SquaredDifference only supports FLOAT32 and INT32 now, got %d.",
+        output->type);
     return kTfLiteError;
   }
 
diff --git a/tensorflow/lite/kernels/strided_slice.cc b/tensorflow/lite/kernels/strided_slice.cc
index c797a98e9f1bda8595e6822638949bab48cb2eab..8c25ffa3a1a669684d9fb1b552893de3a450264f 100644
--- a/tensorflow/lite/kernels/strided_slice.cc
+++ b/tensorflow/lite/kernels/strided_slice.cc
@@ -234,6 +234,11 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
         TF_LITE_STRIDED_SLICE(reference_ops, uint8_t);
       }
       break;
+    case kTfLiteInt8:
+      if (kernel_type == kReference) {
+        TF_LITE_STRIDED_SLICE(reference_ops, int8_t);
+      }
+      break;
     default:
       context->ReportError(context,
                            "Type %d is currently not supported "
diff --git a/tensorflow/lite/kernels/strided_slice_test.cc b/tensorflow/lite/kernels/strided_slice_test.cc
index 122e01b99ecbed1255ea4b2d29e82b57f04be80c..cac9e1672f871268d6d37b3488d00a0c1399aaa7 100644
--- a/tensorflow/lite/kernels/strided_slice_test.cc
+++ b/tensorflow/lite/kernels/strided_slice_test.cc
@@ -72,6 +72,7 @@ class StridedSliceOpModel : public SingleOpModel {
   int output_;
 };
 
+#ifdef GTEST_HAS_DEATH_TEST
 TEST(StridedSliceOpTest, UnsupportedInputSize) {
   EXPECT_DEATH(
       StridedSliceOpModel<>({2, 2, 2, 2, 2}, {5}, {5}, {5}, 0, 0, 0, 0, 0),
@@ -84,6 +85,7 @@ TEST(StridedSliceOpTest, UnssupportedArgs) {
   EXPECT_DEATH(StridedSliceOpModel<>({3, 2}, {2}, {2}, {2}, 0, 0, 0, 1, 0),
                "new_axis_mask is not implemented yet.");
 }
+#endif
 
 TEST(StridedSliceOpTest, In1D) {
   StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
@@ -575,6 +577,18 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis1Uint8) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 2}));
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6}));
 }
+
+TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis1int8) {
+  StridedSliceOpModel<int8_t, TensorType_INT8> m({2, 3, 2}, {3}, {3}, {3}, 0, 0,
+                                                 0, 0, 1);
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  m.SetBegin({0, 0, 0});
+  m.SetEnd({1, 3, 2});
+  m.SetStrides({1, 1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6}));
+}
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/sub.cc b/tensorflow/lite/kernels/sub.cc
index 06a3b3499a005f19bfd1461dfe861835f8331b96..8bd6052307cc0e032a566e437923cac2f16be69e 100644
--- a/tensorflow/lite/kernels/sub.cc
+++ b/tensorflow/lite/kernels/sub.cc
@@ -12,10 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include <limits>
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/add.h"
 #include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
 #include "tensorflow/lite/kernels/internal/tensor.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
@@ -39,6 +41,23 @@ constexpr int kOutputTensor = 0;
 
 struct OpData {
   bool requires_broadcast;
+
+  // These fields are used in both the general 8-bit -> 8bit quantized path,
+  // and the special 16-bit -> 16bit quantized path
+  int input1_shift;
+  int input2_shift;
+  int32 output_activation_min;
+  int32 output_activation_max;
+
+  // These fields are used only in the general 8-bit -> 8bit quantized path
+  int32 input1_multiplier;
+  int32 input2_multiplier;
+  int32 output_multiplier;
+  int output_shift;
+  int left_shift;
+  int32 input1_offset;
+  int32 input2_offset;
+  int32 output_offset;
 };
 
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
@@ -51,8 +70,126 @@ void Free(TfLiteContext* context, void* buffer) {
   delete reinterpret_cast<OpData*>(buffer);
 }
 
+TfLiteStatus Prepare8BitSubOp(TfLiteContext* context,
+                              const TfLiteTensor* input_1,
+                              const TfLiteTensor* input_2, TfLiteTensor* output,
+                              TfLiteSubParams* params, OpData* op_params,
+                              int op_sign) {
+  TF_LITE_ENSURE(context,
+                 output->type == kTfLiteUInt8 || output->type == kTfLiteInt8);
+  const auto& input1_quantization_params = input_1->params;
+  const auto& input2_quantization_params = input_2->params;
+  const auto& output_quantization_params = output->params;
+  int32_t integer_type_min = 0;
+  int32_t integer_type_max = 0;
+  if (output->type == kTfLiteUInt8) {
+    integer_type_min = std::numeric_limits<uint8_t>::min();
+    integer_type_max = std::numeric_limits<uint8_t>::max();
+  } else {
+    // output->type == kTfLiteInt8
+    integer_type_min = std::numeric_limits<int8_t>::min();
+    integer_type_max = std::numeric_limits<int8_t>::max();
+  }
+
+  TF_LITE_ENSURE(context,
+                 input1_quantization_params.zero_point >= integer_type_min);
+  TF_LITE_ENSURE(context,
+                 input1_quantization_params.zero_point <= integer_type_max);
+  TF_LITE_ENSURE(context,
+                 input2_quantization_params.zero_point >= integer_type_min);
+  TF_LITE_ENSURE(context,
+                 input2_quantization_params.zero_point <= integer_type_max);
+  TF_LITE_ENSURE(context,
+                 output_quantization_params.zero_point >= integer_type_min);
+  TF_LITE_ENSURE(context,
+                 output_quantization_params.zero_point <= integer_type_max);
+
+  op_params->input1_offset = -input1_quantization_params.zero_point;
+  op_params->input2_offset = -input2_quantization_params.zero_point;
+  op_params->output_offset = output_quantization_params.zero_point;
+  op_params->left_shift = 20;
+  const double twice_max_input_scale =
+      2 * std::max(input1_quantization_params.scale,
+                   input2_quantization_params.scale);
+  const double real_input1_multiplier =
+      input1_quantization_params.scale / twice_max_input_scale;
+  const double real_input2_multiplier =
+      input2_quantization_params.scale / twice_max_input_scale;
+  const double real_output_multiplier =
+      twice_max_input_scale /
+      ((1 << op_params->left_shift) * output_quantization_params.scale);
+
+  tflite::QuantizeMultiplierSmallerThanOneExp(real_input1_multiplier,
+                                              &op_params->input1_multiplier,
+                                              &op_params->input1_shift);
+  tflite::QuantizeMultiplierSmallerThanOneExp(real_input2_multiplier,
+                                              &op_params->input2_multiplier,
+                                              &op_params->input2_shift);
+  op_params->input2_multiplier *= op_sign;
+  tflite::QuantizeMultiplierSmallerThanOneExp(real_output_multiplier,
+                                              &op_params->output_multiplier,
+                                              &op_params->output_shift);
+  if (output->type == kTfLiteUInt8) {
+    CalculateActivationRangeUint8(params->activation, output,
+                                  &op_params->output_activation_min,
+                                  &op_params->output_activation_max);
+  } else {
+    CalculateActivationRangeInt8(params->activation, output,
+                                 &op_params->output_activation_min,
+                                 &op_params->output_activation_max);
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus PrepareInt16SubOp(TfLiteContext* context,
+                               const TfLiteTensor* input1,
+                               const TfLiteTensor* input2, TfLiteTensor* output,
+                               TfLiteSubParams* params, OpData* data) {
+  // 16bit -> 16bit special quantized path, supporting only a rather
+  // narrow case of quantization parameters: zero_points must all be 0
+  // ("symmetric quantization") and scales must be power-of-two (which
+  // we abbreviate as "POT" below). The intended use case for this path
+  // is in LSTM cells, where, due to the constraints of implementing
+  // some of the math in these LSTM cells in fixed-point arithmetic,
+  // we need to have such symmetric, power-of-two quantization
+  // (Fixed-point formats are inherently symmetric, power-of-two).
+  TF_LITE_ENSURE_EQ(context, input1->params.zero_point, 0);
+  TF_LITE_ENSURE_EQ(context, input2->params.zero_point, 0);
+  TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
+
+  int input1_scale_log2_rounded;
+  bool input1_scale_is_pot =
+      CheckedLog2(input1->params.scale, &input1_scale_log2_rounded);
+  TF_LITE_ENSURE(context, input1_scale_is_pot);
+
+  int input2_scale_log2_rounded;
+  bool input2_scale_is_pot =
+      CheckedLog2(input2->params.scale, &input2_scale_log2_rounded);
+  TF_LITE_ENSURE(context, input2_scale_is_pot);
+
+  int output_scale_log2_rounded;
+  bool output_scale_is_pot =
+      CheckedLog2(output->params.scale, &output_scale_log2_rounded);
+  TF_LITE_ENSURE(context, output_scale_is_pot);
+
+  data->input1_shift = input1_scale_log2_rounded - output_scale_log2_rounded;
+  data->input2_shift = input2_scale_log2_rounded - output_scale_log2_rounded;
+
+  // Shifting of one input is supported. The graph quantization should ensure
+  // that the other input matches the output.
+  TF_LITE_ENSURE(context, data->input1_shift == 0 || data->input2_shift == 0);
+  TF_LITE_ENSURE(context, data->input1_shift <= 0);
+  TF_LITE_ENSURE(context, data->input2_shift <= 0);
+
+  CalculateActivationRangeQuantized(context, params->activation, output,
+                                    &data->output_activation_min,
+                                    &data->output_activation_max);
+  return kTfLiteOk;
+}
+
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   OpData* data = reinterpret_cast<OpData*>(node->user_data);
+  auto* params = reinterpret_cast<TfLiteSubParams*>(node->builtin_data);
 
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
@@ -74,6 +211,14 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     output_size = TfLiteIntArrayCopy(input1->dims);
   }
 
+  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
+    TF_LITE_ENSURE_OK(context, Prepare8BitSubOp(context, input1, input2, output,
+                                                params, data, -1));
+  } else if (output->type == kTfLiteInt16) {
+    TF_LITE_ENSURE_OK(context, PrepareInt16SubOp(context, input1, input2,
+                                                 output, params, data));
+  }
+
   return context->ResizeTensor(context, output, output_size);
 }
 
@@ -129,60 +274,67 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
                    TfLiteSubParams* params, const OpData* data,
                    const TfLiteTensor* input1, const TfLiteTensor* input2,
                    TfLiteTensor* output) {
-  auto input1_offset = -input1->params.zero_point;
-  auto input2_offset = -input2->params.zero_point;
-  auto output_offset = output->params.zero_point;
-  const int left_shift = 20;
-  const double twice_max_input_scale =
-      2 * std::max(input1->params.scale, input2->params.scale);
-  const double real_input1_multiplier =
-      input1->params.scale / twice_max_input_scale;
-  const double real_input2_multiplier =
-      input2->params.scale / twice_max_input_scale;
-  const double real_output_multiplier =
-      twice_max_input_scale / ((1 << left_shift) * output->params.scale);
+  tflite::ArithmeticParams op_params;
+  op_params.left_shift = data->left_shift;
+  op_params.input1_offset = data->input1_offset;
+  op_params.input1_multiplier = data->input1_multiplier;
+  op_params.input1_shift = data->input1_shift;
+  op_params.input2_offset = data->input2_offset;
+  op_params.input2_multiplier = data->input2_multiplier;
+  op_params.input2_shift = data->input2_shift;
+  op_params.output_offset = data->output_offset;
+  op_params.output_multiplier = data->output_multiplier;
+  op_params.output_shift = data->output_shift;
+  SetActivationParams(data->output_activation_min, data->output_activation_max,
+                      &op_params);
 
-  int32 input1_multiplier;
-  int input1_shift;
-  QuantizeMultiplierSmallerThanOneExp(real_input1_multiplier,
-                                      &input1_multiplier, &input1_shift);
-  int32 input2_multiplier;
-  int input2_shift;
-  QuantizeMultiplierSmallerThanOneExp(real_input2_multiplier,
-                                      &input2_multiplier, &input2_shift);
-  int32 output_multiplier;
-  int output_shift;
-  QuantizeMultiplierSmallerThanOneExp(real_output_multiplier,
-                                      &output_multiplier, &output_shift);
-
-  int32 output_activation_min, output_activation_max;
-  CalculateActivationRangeUint8(params->activation, output,
-                                &output_activation_min, &output_activation_max);
-
-#define TF_LITE_SUB(type, opname)                                      \
-  tflite::ArithmeticParams op_params;                                  \
-  op_params.left_shift = left_shift;                                   \
-  op_params.input1_offset = input1_offset;                             \
-  op_params.input1_multiplier = input1_multiplier;                     \
-  op_params.input1_shift = input1_shift;                               \
-  op_params.input2_offset = input2_offset;                             \
-  op_params.input2_multiplier = input2_multiplier;                     \
-  op_params.input2_shift = input2_shift;                               \
-  op_params.output_offset = output_offset;                             \
-  op_params.output_multiplier = output_multiplier;                     \
-  op_params.output_shift = output_shift;                               \
-  SetActivationParams(output_activation_min, output_activation_max,    \
-                      &op_params);                                     \
-  type::opname(op_params, GetTensorShape(input1),                      \
-               GetTensorData<uint8_t>(input1), GetTensorShape(input2), \
-               GetTensorData<uint8_t>(input2), GetTensorShape(output), \
-               GetTensorData<uint8_t>(output))
-  // The quantized version of Sub doesn't support activations, so we
-  // always use BroadcastSub.
-  if (kernel_type == kReference) {
-    TF_LITE_SUB(reference_ops, BroadcastSub4DSlow);
+  const bool need_broadcast = optimized_ops::ProcessBroadcastShapes(
+      GetTensorShape(input1), GetTensorShape(input2), &op_params);
+
+#define TF_LITE_SUB(type, opname, data_type)                             \
+  type::opname(op_params, GetTensorShape(input1),                        \
+               GetTensorData<data_type>(input1), GetTensorShape(input2), \
+               GetTensorData<data_type>(input2), GetTensorShape(output), \
+               GetTensorData<data_type>(output))
+    // NOTE: We are using the add kernels. This is possible as the second values
+    // multiplier is negated before being passed down.
+  if (output->type == kTfLiteInt8) {
+    if (need_broadcast) {
+      TF_LITE_SUB(reference_integer_ops, BroadcastAdd4DSlow, int8_t);
+    } else {
+      TF_LITE_SUB(reference_integer_ops, Add, int8_t);
+    }
+  } else if (output->type == kTfLiteUInt8) {
+    if (kernel_type == kReference) {
+      if (need_broadcast) {
+        TF_LITE_SUB(reference_ops, BroadcastAdd4DSlow, uint8_t);
+      } else {
+        TF_LITE_SUB(reference_ops, Add, uint8_t);
+      }
+    } else {
+      if (op_params.broadcast_category ==
+          BroadcastableOpCategory::kGenericBroadcast) {
+        TF_LITE_SUB(optimized_ops, BroadcastAdd4DSlow, uint8_t);
+      } else if (need_broadcast) {
+        TF_LITE_SUB(optimized_ops, BroadcastAddFivefold, uint8_t);
+      } else {
+        TF_LITE_SUB(optimized_ops, Add, uint8_t);
+      }
+    }
   } else {
-    TF_LITE_SUB(optimized_ops, BroadcastSub4DSlow);
+    if (kernel_type == kReference) {
+      if (need_broadcast) {
+        TF_LITE_SUB(reference_ops, BroadcastSub4DSlow, int16_t);
+      } else {
+        TF_LITE_SUB(reference_ops, Sub16, int16_t);
+      }
+    } else {
+      if (need_broadcast) {
+        TF_LITE_SUB(optimized_ops, BroadcastSub4DSlow, int16_t);
+      } else {
+        TF_LITE_SUB(optimized_ops, Sub16, int16_t);
+      }
+    }
   }
 #undef TF_LITE_SUB
 }
@@ -198,7 +350,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 
   if (output->type == kTfLiteFloat32 || output->type == kTfLiteInt32) {
     EvalSub<kernel_type>(context, node, params, data, input1, input2, output);
-  } else if (output->type == kTfLiteUInt8) {
+  } else if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
+             output->type == kTfLiteInt16) {
     EvalQuantized<kernel_type>(context, node, params, data, input1, input2,
                                output);
   } else {
diff --git a/tensorflow/lite/kernels/sub_test.cc b/tensorflow/lite/kernels/sub_test.cc
index 41503300ab599fbfcfee425c41033dd3bc10d2ea..3c19678b20f21894461f5ef79b1df6c45e1cac5a 100644
--- a/tensorflow/lite/kernels/sub_test.cc
+++ b/tensorflow/lite/kernels/sub_test.cc
@@ -63,17 +63,27 @@ class QuantizedSubOpModel : public BaseSubOpModel {
  public:
   using BaseSubOpModel::BaseSubOpModel;
 
+  template <typename integer_dtype>
   std::vector<float> GetDequantizedOutput() {
-    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+    return Dequantize<integer_dtype>(ExtractVector<integer_dtype>(output_),
+                                     GetScale(output_), GetZeroPoint(output_));
+  }
+
+  std::vector<float> GetDequantizedOutputInt16() {
+    return Dequantize<int16_t>(ExtractVector<int16_t>(output_),
                                GetScale(output_), GetZeroPoint(output_));
   }
 };
 
-// for quantized Sub, the error shouldn't exceed 2*step
+// for quantized Sub, the error shouldn't exceed step
 float GetTolerance(int min, int max) {
   float kQuantizedStep = (max - min) / 255.0;
-  float kQuantizedTolerance = 2.0 * kQuantizedStep;
-  return kQuantizedTolerance;
+  return kQuantizedStep;
+}
+
+float GetToleranceInt16(float min, float max) {
+  float kQuantizedStep = (max - min) / std::numeric_limits<int16_t>::max();
+  return kQuantizedStep;
 }
 
 TEST(FloatSubOpModel, NoActivation) {
@@ -183,7 +193,8 @@ TEST(IntegerSubOpModel, WithBroadcast) {
   }
 }
 
-TEST(QuantizedSubOpModel, QuantizedTestsNoActivation) {
+template <TensorType tensor_type, typename integer_dtype>
+void QuantizedTestsNoActivation() {
   float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
   std::vector<std::vector<float>> inputs1 = {
       {0.1, 0.2, 0.3, 0.4}, {-0.2, 0.2, 0.4, 0.7}, {-0.01, 0.2, 0.7, 0.3}};
@@ -193,20 +204,30 @@ TEST(QuantizedSubOpModel, QuantizedTestsNoActivation) {
                                              {-0.8, -0.2, -0.1, 0.9},
                                              {-0.61, -0.2, 0.88, -0.2}};
   for (int i = 0; i < inputs1.size(); ++i) {
-    QuantizedSubOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                          {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                          {TensorType_UINT8, {}, -1.0, 1.0},
+    QuantizedSubOpModel m({tensor_type, {1, 2, 2, 1}, -1.0, 1.0},
+                          {tensor_type, {1, 2, 2, 1}, -1.0, 1.0},
+                          {tensor_type, {}, -1.0, 1.0},
                           ActivationFunctionType_NONE);
-    m.QuantizeAndPopulate<uint8_t>(m.input1(), inputs1[i]);
-    m.QuantizeAndPopulate<uint8_t>(m.input2(), inputs2[i]);
+    m.QuantizeAndPopulate<integer_dtype>(m.input1(), inputs1[i]);
+    m.QuantizeAndPopulate<integer_dtype>(m.input2(), inputs2[i]);
     m.Invoke();
-    EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear(
-                                              results[i], kQuantizedTolerance)))
+    EXPECT_THAT(
+        m.GetDequantizedOutput<integer_dtype>(),
+        ElementsAreArray(ArrayFloatNear(results[i], kQuantizedTolerance)))
         << "With test number " << i;
   }
 }
 
-TEST(QuantizedSubOpModel, QuantizedTestsActivationRELU_N1_TO_1) {
+TEST(QuantizedSubOpModel, QuantizedTestsNoActivationUInt8) {
+  QuantizedTestsNoActivation<TensorType_UINT8, uint8_t>();
+}
+
+TEST(QuantizedSubOpModel, QuantizedTestsNoActivationInt8) {
+  QuantizedTestsNoActivation<TensorType_INT8, int8_t>();
+}
+
+template <TensorType tensor_type, typename integer_dtype>
+void QuantizedTestsActivationRELU_N1_TO_1() {
   float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
   std::vector<std::vector<float>> inputs1 = {{-0.8, 0.2, 0.9, 0.7},
                                              {-0.8, 0.2, 0.7, 0.5}};
@@ -215,57 +236,185 @@ TEST(QuantizedSubOpModel, QuantizedTestsActivationRELU_N1_TO_1) {
   std::vector<std::vector<float>> results = {{-1.0, -0.2, 0.0, 1.0},
                                              {-1.0, -0.2, 1.0, 0.2}};
   for (int i = 0; i < inputs1.size(); ++i) {
-    QuantizedSubOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                          {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0},
-                          {TensorType_UINT8, {}, -1.0, 1.0},
+    QuantizedSubOpModel m({tensor_type, {1, 2, 2, 1}, -1.0, 1.0},
+                          {tensor_type, {1, 2, 2, 1}, -1.0, 1.0},
+                          {tensor_type, {}, -1.0, 1.0},
                           ActivationFunctionType_RELU_N1_TO_1);
-    m.QuantizeAndPopulate<uint8_t>(m.input1(), inputs1[i]);
-    m.QuantizeAndPopulate<uint8_t>(m.input2(), inputs2[i]);
+    m.QuantizeAndPopulate<integer_dtype>(m.input1(), inputs1[i]);
+    m.QuantizeAndPopulate<integer_dtype>(m.input2(), inputs2[i]);
     m.Invoke();
-    EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear(
-                                              results[i], kQuantizedTolerance)))
+    EXPECT_THAT(
+        m.GetDequantizedOutput<integer_dtype>(),
+        ElementsAreArray(ArrayFloatNear(results[i], kQuantizedTolerance)))
         << "With test number " << i;
   }
 }
+TEST(QuantizedSubOpModel, QuantizedTestsActivationRELUN1TO1UInt8) {
+  QuantizedTestsActivationRELU_N1_TO_1<TensorType_UINT8, uint8_t>();
+}
+
+TEST(QuantizedSubOpModel, QuantizedTestsActivationRELUN1TO1Int8) {
+  QuantizedTestsActivationRELU_N1_TO_1<TensorType_INT8, int8_t>();
+}
 
-TEST(QuantizedSubOpModel, QuantizedVariousInputShapes) {
+template <TensorType tensor_type, typename integer_dtype>
+void QuantizedVariousInputShapes() {
   float kQuantizedTolerance = GetTolerance(-3.0, 3.0);
   std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
-    QuantizedSubOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0},
-                          {TensorType_UINT8, test_shapes[i], -3.0, 3.0},
-                          {TensorType_UINT8, {}, -3.0, 3.0},
+    QuantizedSubOpModel m({tensor_type, test_shapes[i], -3.0, 3.0},
+                          {tensor_type, test_shapes[i], -3.0, 3.0},
+                          {tensor_type, {}, -3.0, 3.0},
                           ActivationFunctionType_NONE);
-    m.QuantizeAndPopulate<uint8_t>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
-    m.QuantizeAndPopulate<uint8_t>(m.input2(), {0.1, 0.3, 0.3, 0.5, 1.1, 0.1});
+    m.QuantizeAndPopulate<integer_dtype>(m.input1(),
+                                         {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
+    m.QuantizeAndPopulate<integer_dtype>(m.input2(),
+                                         {0.1, 0.3, 0.3, 0.5, 1.1, 0.1});
     m.Invoke();
-    EXPECT_THAT(m.GetDequantizedOutput(),
+    EXPECT_THAT(m.GetDequantizedOutput<integer_dtype>(),
                 ElementsAreArray(ArrayFloatNear(
                     {-2.1, -0.1, 0.4, 0.3, 0.0, 1.9}, kQuantizedTolerance)))
         << "With shape number " << i;
   }
 }
 
-TEST(QuantizedSubOpModel, QuantizedWithBroadcast) {
+TEST(QuantizedSubOpModel, QuantizedVariousInputShapesUInt8) {
+  QuantizedVariousInputShapes<TensorType_UINT8, uint8_t>();
+}
+
+TEST(QuantizedSubOpModel, QuantizedVariousInputShapesInt8) {
+  QuantizedVariousInputShapes<TensorType_INT8, int8_t>();
+}
+
+template <TensorType tensor_type, typename integer_dtype>
+void QuantizedWithBroadcast() {
   float kQuantizedTolerance = GetTolerance(-3.0, 3.0);
   std::vector<std::vector<int>> test_shapes = {
       {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
   for (int i = 0; i < test_shapes.size(); ++i) {
-    QuantizedSubOpModel m({TensorType_UINT8, test_shapes[i], -3.0, 3.0},
-                          {TensorType_UINT8, {}, -3.0, 3.0},
-                          {TensorType_UINT8, {}, -3.0, 3.0},
-                          ActivationFunctionType_NONE);
-    m.QuantizeAndPopulate<uint8_t>(m.input1(), {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
-    m.QuantizeAndPopulate<uint8_t>(m.input2(), {0.7});
+    QuantizedSubOpModel m(
+        {tensor_type, test_shapes[i], -3.0, 3.0}, {tensor_type, {}, -3.0, 3.0},
+        {tensor_type, {}, -3.0, 3.0}, ActivationFunctionType_NONE);
+    m.QuantizeAndPopulate<integer_dtype>(m.input1(),
+                                         {-2.0, 0.2, 0.7, 0.8, 1.1, 2.0});
+    m.QuantizeAndPopulate<integer_dtype>(m.input2(), {0.7});
     m.Invoke();
-    EXPECT_THAT(m.GetDequantizedOutput(),
+    EXPECT_THAT(m.GetDequantizedOutput<integer_dtype>(),
                 ElementsAreArray(ArrayFloatNear(
                     {-2.7, -0.5, 0.0, 0.1, 0.4, 1.3}, kQuantizedTolerance)))
         << "With shape number " << i;
   }
 }
 
+TEST(QuantizedSubOpModel, QuantizedWithBroadcastUInt8) {
+  QuantizedWithBroadcast<TensorType_UINT8, uint8_t>();
+}
+
+TEST(QuantizedSubOpModel, QuantizedWithBroadcastInt8) {
+  QuantizedWithBroadcast<TensorType_INT8, int8_t>();
+}
+
+TEST(QuantizedSubOpModel, QuantizedTestsNoActivationInt16) {
+  const float kMin = -1.f;
+  const float kMax =
+      static_cast<float>(std::numeric_limits<int16_t>::max() - 1) /
+      std::numeric_limits<int16_t>::max();
+  float kQuantizedTolerance = GetToleranceInt16(kMin, kMax);
+  std::vector<std::vector<float>> inputs1 = {
+      {0.7, 0.6, 0.6, 0.5}, {-0.2, 0.6, 0.9, -0.1}, {-0.2, 0.6, -0.3, 0.8}};
+  std::vector<std::vector<float>> inputs2 = {
+      {0.6, 0.4, 0.3, 0.1}, {0.6, 0.4, 0.5, -0.8}, {0.6, 0.4, 0.8, 0.5}};
+  std::vector<std::vector<float>> results = {
+      {0.1, 0.2, 0.3, 0.4}, {-0.8, 0.2, 0.4, 0.7}, {-0.8, 0.2, -1.0, 0.3}};
+  for (int i = 0; i < inputs1.size(); ++i) {
+    QuantizedSubOpModel m({TensorType_INT16, {1, 2, 2, 1}, kMin, kMax},
+                          {TensorType_INT16, {1, 2, 2, 1}, kMin, kMax},
+                          {TensorType_INT16, {}, kMin, kMax},
+                          ActivationFunctionType_NONE);
+    m.QuantizeAndPopulate<int16_t>(m.input1(), inputs1[i]);
+    m.QuantizeAndPopulate<int16_t>(m.input2(), inputs2[i]);
+    m.Invoke();
+    EXPECT_THAT(
+        m.GetDequantizedOutputInt16(),
+        ElementsAreArray(ArrayFloatNear(results[i], kQuantizedTolerance)))
+        << "With test number " << i;
+  }
+}
+
+TEST(QuantizedSubOpModel, QuantizedTestsReluActivationInt16) {
+  const float kMin = -2.f;
+  const float kMax = 2.0 * (std::numeric_limits<int16_t>::max() - 1) /
+                     std::numeric_limits<int16_t>::max();
+  float kQuantizedTolerance = GetToleranceInt16(kMin, kMax);
+  std::vector<std::vector<float>> inputs1 = {{-0.8, 0.2, 0.9, 0.7},
+                                             {-0.8, 0.2, 0.7, 0.5}};
+  std::vector<std::vector<float>> inputs2 = {{0.6, 0.4, 0.9, -0.8},
+                                             {0.6, 0.4, -0.8, 0.3}};
+  std::vector<std::vector<float>> results = {{-1.0, -0.2, 0.0, 1.0},
+                                             {-1.0, -0.2, 1.0, 0.2}};
+  for (int i = 0; i < inputs1.size(); ++i) {
+    QuantizedSubOpModel m({TensorType_INT16, {1, 2, 2, 1}, kMin, kMax},
+                          {TensorType_INT16, {1, 2, 2, 1}, kMin, kMax},
+                          {TensorType_INT16, {}, kMin, kMax},
+                          ActivationFunctionType_RELU_N1_TO_1);
+    m.QuantizeAndPopulate<int16_t>(m.input1(), inputs1[i]);
+    m.QuantizeAndPopulate<int16_t>(m.input2(), inputs2[i]);
+    m.Invoke();
+    EXPECT_THAT(
+        m.GetDequantizedOutputInt16(),
+        ElementsAreArray(ArrayFloatNear(results[i], kQuantizedTolerance)))
+        << "With test number " << i;
+  }
+}
+
+TEST(QuantizedSubOpModel, QuantizedTestsNoActivationBroadcastInt16) {
+  const float kMin = -1.f;
+  const float kMax =
+      static_cast<float>(std::numeric_limits<int16_t>::max() - 1) /
+      std::numeric_limits<int16_t>::max();
+  float kQuantizedTolerance = GetToleranceInt16(kMin, kMax);
+  std::vector<std::vector<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    QuantizedSubOpModel m({TensorType_INT16, test_shapes[i], kMin, kMax},
+                          {TensorType_INT16, {}, kMin, kMax},
+                          {TensorType_INT16, {}, kMin, kMax},
+                          ActivationFunctionType_NONE);
+    m.QuantizeAndPopulate<int16_t>(m.input1(),
+                                   {-0.9, -0.7, -0.3, 0.0, 0.3, 0.5});
+    m.QuantizeAndPopulate<int16_t>(m.input2(), {0.2});
+    m.Invoke();
+    EXPECT_THAT(m.GetDequantizedOutputInt16(),
+                ElementsAreArray(ArrayFloatNear(
+                    {-1.0, -0.9, -0.5, -0.2, 0.1, 0.3}, kQuantizedTolerance)))
+        << "With shape number " << i;
+  }
+}
+
+TEST(QuantizedSubOpModel, QuantizedTestsReluActivationBroadcastInt16) {
+  const float kMin = -2.f;
+  const float kMax = 2.0 * (std::numeric_limits<int16_t>::max() - 1) /
+                     std::numeric_limits<int16_t>::max();
+  float kQuantizedTolerance = GetToleranceInt16(kMin, kMax);
+  std::vector<std::vector<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    QuantizedSubOpModel m({TensorType_INT16, test_shapes[i], kMin, kMax},
+                          {TensorType_INT16, {}, kMin, kMax},
+                          {TensorType_INT16, {}, kMin, kMax},
+                          ActivationFunctionType_RELU_N1_TO_1);
+    m.QuantizeAndPopulate<int16_t>(m.input1(),
+                                   {-0.9, -0.7, -0.3, 0.0, 0.3, 0.5});
+    m.QuantizeAndPopulate<int16_t>(m.input2(), {0.2});
+    m.Invoke();
+    EXPECT_THAT(m.GetDequantizedOutputInt16(),
+                ElementsAreArray(ArrayFloatNear(
+                    {-1.0, -0.9, -0.5, -0.2, 0.1, 0.3}, kQuantizedTolerance)))
+        << "With shape number " << i;
+  }
+}
+
 }  // namespace
 }  // namespace tflite
 int main(int argc, char** argv) {
diff --git a/tensorflow/lite/kernels/subgraph_test_util.cc b/tensorflow/lite/kernels/subgraph_test_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e712be1b516ed0dca5097e66c2d1f20e63a78038
--- /dev/null
+++ b/tensorflow/lite/kernels/subgraph_test_util.cc
@@ -0,0 +1,409 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/kernels/subgraph_test_util.h"
+
+#include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
+#include "tensorflow/lite/core/subgraph.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+
+namespace ops {
+namespace builtin {
+// ADD and MUL are used to test simple branch.
+TfLiteRegistration* Register_ADD();
+TfLiteRegistration* Register_MUL();
+// ADD and MUL are used to test dynamic sized subgraphs.
+TfLiteRegistration* Register_PAD();
+TfLiteRegistration* Register_LESS_EQUAL();
+}  // namespace builtin
+namespace custom {
+TfLiteRegistration* Register_IF();
+TfLiteRegistration* Register_WHILE();
+}  // namespace custom
+}  // namespace ops
+
+namespace subgraph_test_util {
+
+namespace {
+
+void SetupTensor(Subgraph* subgraph, int tensor_index, TfLiteType type) {
+  ASSERT_EQ(subgraph->SetTensorParametersReadWrite(tensor_index, type, "", 0,
+                                                   nullptr, {}, false),
+            kTfLiteOk);
+}
+
+}  // namespace
+
+SubgraphBuilder::~SubgraphBuilder() {
+  for (auto buffer : buffers_) {
+    free(buffer);
+  }
+}
+
+void SubgraphBuilder::BuildAddSubgraph(Subgraph* subgraph) {
+  const int kInput1 = 0;
+  const int kInput2 = 1;
+  const int kOutput = 2;
+  const int kTensorCount = 3;
+  // kInput1(0) --> +---+
+  //                |ADD| --> kOutput(2)
+  // kInput2(1) --> +---+
+
+  int first_new_tensor_index;
+  ASSERT_EQ(subgraph->AddTensors(kTensorCount, &first_new_tensor_index),
+            kTfLiteOk);
+  ASSERT_EQ(first_new_tensor_index, 0);
+  ASSERT_EQ(subgraph->SetInputs({kInput1, kInput2}), kTfLiteOk);
+  ASSERT_EQ(subgraph->SetOutputs({kOutput}), kTfLiteOk);
+
+  SetupTensor(subgraph, kInput1, kTfLiteInt32);
+  SetupTensor(subgraph, kInput2, kTfLiteInt32);
+  SetupTensor(subgraph, kOutput, kTfLiteInt32);
+
+  TfLiteAddParams* params =
+      reinterpret_cast<TfLiteAddParams*>(malloc(sizeof(TfLiteAddParams)));
+  params->activation = kTfLiteActNone;
+  int node_index;
+  subgraph->AddNodeWithParameters(
+      {kInput1, kInput2}, {kOutput}, nullptr, 0, params,
+      ::tflite::ops::builtin::Register_ADD(), &node_index);
+}
+
+// Build a subgraph with an mul op. Helper function for testing.
+void SubgraphBuilder::BuildMulSubgraph(Subgraph* subgraph) {
+  const int kInput1 = 0;
+  const int kInput2 = 1;
+  const int kOutput = 2;
+  const int kTensorCount = 3;
+  // kInput1(0) --> +---+
+  //                |MUL| --> kOutput(2)
+  // kInput2(1) --> +---+
+
+  int first_new_tensor_index;
+  ASSERT_EQ(subgraph->AddTensors(kTensorCount, &first_new_tensor_index),
+            kTfLiteOk);
+  ASSERT_EQ(first_new_tensor_index, 0);
+  ASSERT_EQ(subgraph->SetInputs({kInput1, kInput2}), kTfLiteOk);
+  ASSERT_EQ(subgraph->SetOutputs({kOutput}), kTfLiteOk);
+
+  SetupTensor(subgraph, kInput1, kTfLiteInt32);
+  SetupTensor(subgraph, kInput2, kTfLiteInt32);
+  SetupTensor(subgraph, kOutput, kTfLiteInt32);
+
+  TfLiteMulParams* params =
+      reinterpret_cast<TfLiteMulParams*>(malloc(sizeof(TfLiteMulParams)));
+  params->activation = kTfLiteActNone;
+  int node_index;
+  subgraph->AddNodeWithParameters(
+      {kInput1, kInput2}, {kOutput}, nullptr, 0, params,
+      ::tflite::ops::builtin::Register_MUL(), &node_index);
+}
+
+// Build a subgraph with a pad op. Helper function for testing.
+void SubgraphBuilder::BuildPadSubgraph(Subgraph* subgraph) {
+  const int kInput1 = 0;
+  const int kInput2 = 1;
+  const int kOutput = 2;
+  const int kTensorCount = 3;
+  // kInput1(0) --> +---+
+  //                |PAD| --> kOutput(2)
+  // kInput2(1) --> +---+
+
+  int first_new_tensor_index;
+  ASSERT_EQ(subgraph->AddTensors(kTensorCount, &first_new_tensor_index),
+            kTfLiteOk);
+  ASSERT_EQ(first_new_tensor_index, 0);
+  ASSERT_EQ(subgraph->SetInputs({kInput1, kInput2}), kTfLiteOk);
+  ASSERT_EQ(subgraph->SetOutputs({kOutput}), kTfLiteOk);
+
+  SetupTensor(subgraph, kInput1, kTfLiteInt32);
+  SetupTensor(subgraph, kInput2, kTfLiteInt32);
+  SetupTensor(subgraph, kOutput, kTfLiteInt32);
+
+  TfLitePadParams* params =
+      reinterpret_cast<TfLitePadParams*>(malloc(sizeof(TfLitePadParams)));
+  int node_index;
+  subgraph->AddNodeWithParameters(
+      {kInput1, kInput2}, {kOutput}, nullptr, 0, params,
+      ::tflite::ops::builtin::Register_PAD(), &node_index);
+}
+
+void SubgraphBuilder::BuildIfSubgraph(Subgraph* subgraph) {
+  const int kCondInput = 0;
+  const int kInput1 = 1;
+  const int kInput2 = 2;
+  const int kOutput = 3;
+  const int kTensorCount = 4;
+
+  // kCondInput(0) --> +----+
+  // kInput1(1)  ----> | IF | --> kOutput(3)
+  // kInput2(2)  ----> +----+
+
+  int first_new_tensor_index;
+  ASSERT_EQ(subgraph->AddTensors(kTensorCount, &first_new_tensor_index),
+            kTfLiteOk);
+  ASSERT_EQ(first_new_tensor_index, 0);
+  ASSERT_EQ(subgraph->SetInputs({kCondInput, kInput1, kInput2}), kTfLiteOk);
+  ASSERT_EQ(subgraph->SetOutputs({kOutput}), kTfLiteOk);
+
+  SetupTensor(subgraph, kCondInput, kTfLiteBool);
+  SetupTensor(subgraph, kInput1, kTfLiteInt32);
+  SetupTensor(subgraph, kInput2, kTfLiteInt32);
+  SetupTensor(subgraph, kOutput, kTfLiteInt32);
+
+  flexbuffers::Builder fbb;
+  fbb.Map([&]() {
+    fbb.Int("then_subgraph_index", 1);
+    fbb.Int("else_subgraph_index", 2);
+  });
+  fbb.Finish();
+  const auto& buffer = fbb.GetBuffer();
+
+  int node_index;
+  subgraph->AddNodeWithParameters(
+      {kCondInput, kInput1, kInput2}, {kOutput},
+      reinterpret_cast<const char*>(buffer.data()), buffer.size(), nullptr,
+      ::tflite::ops::custom::Register_IF(), &node_index);
+}
+
+void SubgraphBuilder::BuildLessEqualCondSubgraph(Subgraph* subgraph, int rhs) {
+  const int kInput1 = 0;
+  const int kInput2 = 1;
+  const int kOutput = 2;
+  const int kConstRhs = 3;
+  const int kTensorCount = 4;
+
+  // kInput1(0) ----> +------------+
+  //                  | LESS_EQUAL | --> kOutput(2)
+  // kConstRhs(3) --> +------------+
+  //
+  // kInput2(1) --> (unused)
+
+  int first_new_tensor_index;
+  ASSERT_EQ(subgraph->AddTensors(kTensorCount, &first_new_tensor_index),
+            kTfLiteOk);
+  ASSERT_EQ(first_new_tensor_index, 0);
+  ASSERT_EQ(subgraph->SetInputs({kInput1, kInput2}), kTfLiteOk);
+  ASSERT_EQ(subgraph->SetOutputs({kOutput}), kTfLiteOk);
+
+  SetupTensor(subgraph, kInput1, kTfLiteInt32);
+  SetupTensor(subgraph, kInput2, kTfLiteInt32);
+  SetupTensor(subgraph, kOutput, kTfLiteBool);
+
+  CreateConstantInt32Tensor(subgraph, kConstRhs, {1}, {rhs});
+  int node_index;
+  subgraph->AddNodeWithParameters(
+      {kInput1, kConstRhs}, {kOutput}, nullptr, 0, nullptr,
+      ::tflite::ops::builtin::Register_LESS_EQUAL(), &node_index);
+}
+
+void SubgraphBuilder::BuildAccumulateLoopBodySubgraph(Subgraph* subgraph) {
+  const int kInputCounter = 0;
+  const int kInputValue = 1;
+  const int kOutputCounter = 2;
+  const int kOutputValue = 3;
+  const int kConstStep = 4;
+  const int kTensorCount = 5;
+
+  // kInputCounter(0) --> +-----+
+  //                      | ADD | --> kOutputCounter(2)
+  // kConstStep(4) -----> +-----+            |
+  //                                         |
+  //                                         v
+  //                                      +-----+
+  //                                      | ADD | --> kOutputValue(3)
+  // kInputValue(1) ----------------------+-----+
+
+  int first_new_tensor_index;
+  ASSERT_EQ(subgraph->AddTensors(kTensorCount, &first_new_tensor_index),
+            kTfLiteOk);
+  ASSERT_EQ(first_new_tensor_index, 0);
+  ASSERT_EQ(subgraph->SetInputs({kInputCounter, kInputValue}), kTfLiteOk);
+  ASSERT_EQ(subgraph->SetOutputs({kOutputCounter, kOutputValue}), kTfLiteOk);
+
+  SetupTensor(subgraph, kInputCounter, kTfLiteInt32);
+  SetupTensor(subgraph, kInputValue, kTfLiteInt32);
+  SetupTensor(subgraph, kOutputCounter, kTfLiteInt32);
+  SetupTensor(subgraph, kOutputValue, kTfLiteInt32);
+  CreateConstantInt32Tensor(subgraph, kConstStep, {1}, {1});
+
+  int node_index;
+  TfLiteAddParams* params =
+      reinterpret_cast<TfLiteAddParams*>(malloc(sizeof(TfLiteAddParams)));
+  params->activation = kTfLiteActNone;
+  subgraph->AddNodeWithParameters({0, 4}, {2}, nullptr, 0, params,
+                                  ::tflite::ops::builtin::Register_ADD(),
+                                  &node_index);
+  params = reinterpret_cast<TfLiteAddParams*>(malloc(sizeof(TfLiteAddParams)));
+  params->activation = kTfLiteActNone;
+  subgraph->AddNodeWithParameters({2, 1}, {3}, nullptr, 0, params,
+                                  ::tflite::ops::builtin::Register_ADD(),
+                                  &node_index);
+}
+
+void SubgraphBuilder::BuildPadLoopBodySubgraph(Subgraph* subgraph,
+                                               const std::vector<int> padding) {
+  const int kInputCounter = 0;
+  const int kInputValue = 1;
+  const int kOutputCounter = 2;
+  const int kOutputValue = 3;
+  const int kConstStep = 4;
+  const int kConstPadding = 5;
+  const int kTensorCount = 6;
+
+  // kInputCounter(0) --> +-----+
+  //                      | ADD | --> kOutputCounter(2)
+  // kConstStep(4) -----> +-----+
+  //
+  // kInputValue(1) ----> +-----+
+  //                      | PAD | --> kOutputValue(3)
+  // kConstPadding(5) --> +-----+
+
+  int first_new_tensor_index;
+  ASSERT_EQ(subgraph->AddTensors(kTensorCount, &first_new_tensor_index),
+            kTfLiteOk);
+  ASSERT_EQ(first_new_tensor_index, 0);
+  ASSERT_EQ(subgraph->SetInputs({kInputCounter, kInputValue}), kTfLiteOk);
+  ASSERT_EQ(subgraph->SetOutputs({kOutputCounter, kOutputValue}), kTfLiteOk);
+
+  SetupTensor(subgraph, kInputCounter, kTfLiteInt32);
+  SetupTensor(subgraph, kInputValue, kTfLiteInt32);
+  SetupTensor(subgraph, kOutputCounter, kTfLiteInt32);
+  SetupTensor(subgraph, kOutputValue, kTfLiteInt32);
+
+  CreateConstantInt32Tensor(subgraph, kConstStep, {1}, {1});
+  ASSERT_EQ(padding.size() % 2, 0);
+  int padding_dims = padding.size();
+  CreateConstantInt32Tensor(subgraph, kConstPadding, {1, padding_dims},
+                            padding);
+
+  int node_index;
+  TfLiteAddParams* add_params =
+      reinterpret_cast<TfLiteAddParams*>(malloc(sizeof(TfLiteAddParams)));
+  add_params->activation = kTfLiteActNone;
+  subgraph->AddNodeWithParameters(
+      {kInputCounter, kConstStep}, {kOutputCounter}, nullptr, 0, add_params,
+      ::tflite::ops::builtin::Register_ADD(), &node_index);
+  TfLitePadParams* pad_params =
+      reinterpret_cast<TfLitePadParams*>(malloc(sizeof(TfLiteAddParams)));
+  subgraph->AddNodeWithParameters(
+      {kInputValue, kConstPadding}, {kOutputValue}, nullptr, 0, pad_params,
+      ::tflite::ops::builtin::Register_PAD(), &node_index);
+}
+
+void SubgraphBuilder::BuildWhileSubgraph(Subgraph* subgraph) {
+  const int kInput1 = 0;
+  const int kInput2 = 1;
+  const int kOutput1 = 2;
+  const int kOutput2 = 3;
+  const int kTensorCount = 4;
+
+  // kInput1(0) --> +-------+ --> kOutput1(2)
+  //                | WHILE |
+  // kInput2(1) --> +-------+ --> kOutput2(3)
+
+  int first_new_tensor_index;
+  ASSERT_EQ(subgraph->AddTensors(kTensorCount, &first_new_tensor_index),
+            kTfLiteOk);
+  ASSERT_EQ(first_new_tensor_index, 0);
+  ASSERT_EQ(subgraph->SetInputs({kInput1, kInput2}), kTfLiteOk);
+  ASSERT_EQ(subgraph->SetOutputs({kOutput1, kOutput2}), kTfLiteOk);
+
+  SetupTensor(subgraph, kInput1, kTfLiteInt32);
+  SetupTensor(subgraph, kInput2, kTfLiteInt32);
+  SetupTensor(subgraph, kOutput1, kTfLiteInt32);
+  SetupTensor(subgraph, kOutput2, kTfLiteInt32);
+
+  flexbuffers::Builder fbb;
+  fbb.Map([&]() {
+    fbb.Int("cond_subgraph_index", 1);
+    fbb.Int("body_subgraph_index", 2);
+  });
+  fbb.Finish();
+  const auto& buffer = fbb.GetBuffer();
+
+  int node_index;
+  subgraph->AddNodeWithParameters(
+      {0, 1}, {2, 3}, reinterpret_cast<const char*>(buffer.data()),
+      buffer.size(), nullptr, ::tflite::ops::custom::Register_WHILE(),
+      &node_index);
+}
+
+void SubgraphBuilder::CreateConstantInt32Tensor(Subgraph* subgraph,
+                                                int tensor_index,
+                                                const std::vector<int>& shape,
+                                                const std::vector<int>& data) {
+  ASSERT_GT(shape.size(), 0);
+  int num_elements = 1;
+  for (int dim : shape) {
+    num_elements *= dim;
+  }
+  ASSERT_EQ(data.size(), num_elements);
+  size_t size_in_bytes = sizeof(int32_t) * num_elements;
+  // Maybe aligned.
+  int32_t* buffer = reinterpret_cast<int32_t*>(malloc(size_in_bytes));
+  for (int i = 0; i < num_elements; ++i) {
+    buffer[i] = data[i];
+  }
+  buffers_.push_back(buffer);
+  ASSERT_EQ(subgraph->SetTensorParametersReadOnly(
+                tensor_index, kTfLiteInt32, "", shape, {},
+                reinterpret_cast<const char*>(buffer), size_in_bytes),
+            kTfLiteOk);
+}
+
+void FillIntTensor(TfLiteTensor* tensor, const std::vector<int32_t>& data) {
+  int count = NumElements(tensor);
+  ASSERT_EQ(count, data.size());
+  for (int i = 0; i < count; ++i) {
+    tensor->data.i32[i] = data[i];
+  }
+}
+
+void CheckIntTensor(const TfLiteTensor* tensor, const std::vector<int>& shape,
+                    const std::vector<int32_t>& data) {
+  ASSERT_EQ(tensor->dims->size, shape.size());
+  for (int i = 0; i < tensor->dims->size; ++i) {
+    ASSERT_EQ(tensor->dims->data[i], shape[i]);
+  }
+  ASSERT_EQ(tensor->type, kTfLiteInt32);
+  int count = NumElements(tensor);
+  ASSERT_EQ(count, data.size());
+  for (int i = 0; i < count; ++i) {
+    EXPECT_EQ(tensor->data.i32[i], data[i]);
+  }
+}
+
+void CheckBoolTensor(const TfLiteTensor* tensor, const std::vector<int>& shape,
+                     const std::vector<bool>& data) {
+  ASSERT_EQ(tensor->dims->size, shape.size());
+  for (int i = 0; i < tensor->dims->size; ++i) {
+    ASSERT_EQ(tensor->dims->data[i], shape[i]);
+  }
+  ASSERT_EQ(tensor->type, kTfLiteBool);
+  int count = NumElements(tensor);
+  ASSERT_EQ(count, data.size());
+  for (int i = 0; i < count; ++i) {
+    EXPECT_EQ(tensor->data.b[i], data[i]);
+  }
+}
+
+}  // namespace subgraph_test_util
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/subgraph_test_util.h b/tensorflow/lite/kernels/subgraph_test_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..972f1381af2804252461bf81dfbce3563be41c3b
--- /dev/null
+++ b/tensorflow/lite/kernels/subgraph_test_util.h
@@ -0,0 +1,123 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This module provides helper functions for testing the interaction between
+// control flow ops and subgraphs.
+// For convenience, we mostly only use `kTfLiteInt32` in this module.
+
+#ifndef TENSORFLOW_LITE_KERNELS_SUBGRAPH_TEST_UTIL_H_
+#define TENSORFLOW_LITE_KERNELS_SUBGRAPH_TEST_UTIL_H_
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/core/subgraph.h"
+#include "tensorflow/lite/interpreter.h"
+
+namespace tflite {
+namespace subgraph_test_util {
+
+// TODO(ycling): This file should be renamed as
+// `control_flow_test_util` to avoid confusion. I'll do it immediately
+// in a separated change.
+class SubgraphBuilder {
+ public:
+  ~SubgraphBuilder();
+
+  // Build a subgraph with a single Add op.
+  // 2 inputs. 1 output.
+  void BuildAddSubgraph(Subgraph* subgraph);
+
+  // Build a subgraph with a single Mul op.
+  // 2 inputs. 1 output.
+  void BuildMulSubgraph(Subgraph* subgraph);
+
+  // Build a subgraph with a single Pad op.
+  // 2 inputs. 1 output.
+  void BuildPadSubgraph(Subgraph* subgraph);
+
+  // Build a subgraph with a single If op.
+  // 3 inputs:
+  //   The 1st input is condition with boolean type.
+  //   The 2nd and 3rd inputs are feed input the branch subgraphs.
+  // 1 output.
+  void BuildIfSubgraph(Subgraph* subgraph);
+
+  // Build a subgraph with a single Less op.
+  // The subgraph is used as the condition subgraph for testing `While` op.
+  // 2 inputs:
+  //   The 1st input is a counter with `kTfLiteInt32` type.
+  //   The 2nd input is ignored in this subgraph.
+  // 1 output with `kTfLiteBool` type.
+  //   Equivalent to (input < rhs).
+  void BuildLessEqualCondSubgraph(Subgraph* subgraph, int rhs);
+
+  // An accumulate loop body subgraph. Used to produce triangle number
+  // seqeuence. 2 inputs and 2 outpus
+  //   Equivalent to (counter, value) -> (counter + 1, counter + 1 + value)
+  void BuildAccumulateLoopBodySubgraph(Subgraph* subgraph);
+
+  // A pad loop body subgraph. When used in a loop it will repeatively enlarge
+  // the
+  //   tensor.
+  // 2 inputs and 2 outputs.
+  //   Equivalent to (counter, value) -> (counter + 1, tf.pad(value, padding))
+  // Note the padding is created as a constant tensor.
+  void BuildPadLoopBodySubgraph(Subgraph* subgraph,
+                                const std::vector<int> padding);
+
+  // Build a subgraph with a single While op.
+  // 2 inputs, 2 outputs.
+  void BuildWhileSubgraph(Subgraph* subgraph);
+
+ private:
+  void CreateConstantInt32Tensor(Subgraph* subgraph, int tensor_index,
+                                 const std::vector<int>& shape,
+                                 const std::vector<int>& data);
+  std::vector<void*> buffers_;
+};
+
+class ControlFlowOpTest : public ::testing::Test {
+ public:
+  ControlFlowOpTest()
+      : interpreter_(new Interpreter), builder_(new SubgraphBuilder) {}
+
+  ~ControlFlowOpTest() override {
+    interpreter_.reset();
+    builder_.reset();
+  }
+
+ protected:
+  std::unique_ptr<Interpreter> interpreter_;
+  std::unique_ptr<SubgraphBuilder> builder_;
+};
+
+// Fill a `TfLiteTensor` with a 32-bits integer vector.
+// Preconditions:
+// * The tensor must have `kTfLiteInt32` type.
+// * The tensor must be allocated.
+// * The element count of the tensor must be equal to the length or
+//   the vector.
+void FillIntTensor(TfLiteTensor* tensor, const std::vector<int32_t>& data);
+
+// Check if the shape and int32 data of a tensor is as expected.
+void CheckIntTensor(const TfLiteTensor* tensor, const std::vector<int>& shape,
+                    const std::vector<int32_t>& data);
+// Check if the shape and bool data of a tensor is as expected.
+void CheckBoolTensor(const TfLiteTensor* tensor, const std::vector<int>& shape,
+                     const std::vector<bool>& data);
+
+}  // namespace subgraph_test_util
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_SUBGRAPH_TEST_UTIL_H_
diff --git a/tensorflow/lite/kernels/subgraph_test_util_test.cc b/tensorflow/lite/kernels/subgraph_test_util_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..04e5118b543c1723e1de1875ffd9315991a4dd69
--- /dev/null
+++ b/tensorflow/lite/kernels/subgraph_test_util_test.cc
@@ -0,0 +1,157 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/kernels/subgraph_test_util.h"
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/test_util.h"
+
+namespace tflite {
+
+namespace subgraph_test_util {
+
+namespace {
+
+class SubgraphBuilderTest : public ::testing::Test {
+ public:
+  SubgraphBuilderTest()
+      : interpreter_(new Interpreter), builder_(new SubgraphBuilder) {}
+
+  ~SubgraphBuilderTest() override {
+    interpreter_.reset();
+    builder_.reset();
+  }
+
+ protected:
+  void TestAccumelateLoopBody(int input1, int input2, int output1,
+                              int output2) {
+    interpreter_.reset(new Interpreter);
+    builder_->BuildAccumulateLoopBodySubgraph(
+        &interpreter_->primary_subgraph());
+
+    interpreter_->ResizeInputTensor(interpreter_->inputs()[0], {1});
+    interpreter_->ResizeInputTensor(interpreter_->inputs()[1], {1});
+    ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk);
+
+    FillIntTensor(interpreter_->tensor(interpreter_->inputs()[0]), {input1});
+    FillIntTensor(interpreter_->tensor(interpreter_->inputs()[1]), {input2});
+
+    ASSERT_EQ(interpreter_->Invoke(), kTfLiteOk);
+    TfLiteTensor* output_tensor1 =
+        interpreter_->tensor(interpreter_->outputs()[0]);
+    CheckIntTensor(output_tensor1, {1}, {output1});
+    TfLiteTensor* output_tensor2 =
+        interpreter_->tensor(interpreter_->outputs()[1]);
+    CheckIntTensor(output_tensor2, {1}, {output2});
+  }
+
+  std::unique_ptr<Interpreter> interpreter_;
+  std::unique_ptr<SubgraphBuilder> builder_;
+};
+
+TEST_F(SubgraphBuilderTest, TestBuildAddSubgraph) {
+  builder_->BuildAddSubgraph(&interpreter_->primary_subgraph());
+
+  interpreter_->ResizeInputTensor(interpreter_->inputs()[0], {2});
+  interpreter_->ResizeInputTensor(interpreter_->inputs()[1], {1, 2});
+  ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk);
+
+  FillIntTensor(interpreter_->tensor(interpreter_->inputs()[0]), {5, 7});
+  FillIntTensor(interpreter_->tensor(interpreter_->inputs()[1]), {1, 2});
+  ASSERT_EQ(interpreter_->Invoke(), kTfLiteOk);
+
+  TfLiteTensor* output = interpreter_->tensor(interpreter_->outputs()[0]);
+  CheckIntTensor(output, {1, 2}, {6, 9});
+}
+
+TEST_F(SubgraphBuilderTest, TestBuildMulSubgraph) {
+  builder_->BuildMulSubgraph(&interpreter_->primary_subgraph());
+
+  interpreter_->ResizeInputTensor(interpreter_->inputs()[0], {2});
+  interpreter_->ResizeInputTensor(interpreter_->inputs()[1], {1, 2});
+  ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk);
+
+  FillIntTensor(interpreter_->tensor(interpreter_->inputs()[0]), {5, 7});
+  FillIntTensor(interpreter_->tensor(interpreter_->inputs()[1]), {1, 2});
+  ASSERT_EQ(interpreter_->Invoke(), kTfLiteOk);
+
+  TfLiteTensor* output = interpreter_->tensor(interpreter_->outputs()[0]);
+  CheckIntTensor(output, {1, 2}, {5, 14});
+}
+
+TEST_F(SubgraphBuilderTest, TestBuildPadSubgraph) {
+  builder_->BuildPadSubgraph(&interpreter_->primary_subgraph());
+
+  interpreter_->ResizeInputTensor(interpreter_->inputs()[0], {2});
+  interpreter_->ResizeInputTensor(interpreter_->inputs()[1], {1, 2});
+  ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk);
+
+  FillIntTensor(interpreter_->tensor(interpreter_->inputs()[0]), {5, 7});
+  FillIntTensor(interpreter_->tensor(interpreter_->inputs()[1]), {1, 2});
+  ASSERT_EQ(interpreter_->Invoke(), kTfLiteOk);
+
+  TfLiteTensor* output = interpreter_->tensor(interpreter_->outputs()[0]);
+  CheckIntTensor(output, {5}, {0, 5, 7, 0, 0});
+}
+
+TEST_F(SubgraphBuilderTest, TestBuildLessEqualCondSubgraph) {
+  builder_->BuildLessEqualCondSubgraph(&interpreter_->primary_subgraph(), 3);
+
+  interpreter_->ResizeInputTensor(interpreter_->inputs()[0], {5});
+  interpreter_->ResizeInputTensor(interpreter_->inputs()[1], {10, 10});
+  ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk);
+
+  // Test [1, 2, 3, 4, 5] <= 3 == [true, true, true, false, false]
+  // (with broadcasting).
+  FillIntTensor(interpreter_->tensor(interpreter_->inputs()[0]),
+                {1, 2, 3, 4, 5});
+  ASSERT_EQ(interpreter_->Invoke(), kTfLiteOk);
+  TfLiteTensor* output = interpreter_->tensor(interpreter_->outputs()[0]);
+  CheckBoolTensor(output, {5}, {true, true, true, false, false});
+}
+
+TEST_F(SubgraphBuilderTest, TestBuildAccumulateLoopBodySubgraph) {
+  TestAccumelateLoopBody(1, 1, 2, 3);
+  TestAccumelateLoopBody(2, 3, 3, 6);
+  TestAccumelateLoopBody(3, 6, 4, 10);
+}
+
+TEST_F(SubgraphBuilderTest, TestBuildPadLoopBodySubgraph) {
+  builder_->BuildPadLoopBodySubgraph(&interpreter_->primary_subgraph(), {1, 2});
+
+  interpreter_->ResizeInputTensor(interpreter_->inputs()[0], {1});
+  interpreter_->ResizeInputTensor(interpreter_->inputs()[1], {5});
+  ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk);
+
+  FillIntTensor(interpreter_->tensor(interpreter_->inputs()[0]), {1});
+  FillIntTensor(interpreter_->tensor(interpreter_->inputs()[1]),
+                {0, 5, 7, 0, 0});
+
+  ASSERT_EQ(interpreter_->Invoke(), kTfLiteOk);
+  TfLiteTensor* output1 = interpreter_->tensor(interpreter_->outputs()[0]);
+  CheckIntTensor(output1, {1}, {2});
+  TfLiteTensor* output2 = interpreter_->tensor(interpreter_->outputs()[1]);
+  CheckIntTensor(output2, {8}, {0, 0, 5, 7, 0, 0, 0, 0});
+}
+
+}  // namespace
+}  // namespace subgraph_test_util
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/kernels/svdf.cc b/tensorflow/lite/kernels/svdf.cc
index f07937140e9ac4abfbae47a1679ddbfba4d30938..d8fc7ce1cea6f8bbf7b4f08fa80e635b0735d08c 100644
--- a/tensorflow/lite/kernels/svdf.cc
+++ b/tensorflow/lite/kernels/svdf.cc
@@ -176,8 +176,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
                     context->ResizeTensor(context, output, output_size_array));
 
   // The weights are of consistent type, so it suffices to check one.
-  const bool is_hybrid_op =
-      (input->type == kTfLiteFloat32 && weights_feature->type == kTfLiteUInt8);
+  const bool is_hybrid_op = (input->type == kTfLiteFloat32 &&
+                             (weights_feature->type == kTfLiteUInt8 ||
+                              weights_feature->type == kTfLiteInt8));
 
   // Resize scratch.
   TfLiteIntArrayFree(node->temporaries);
@@ -203,7 +204,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     // of input tensors.
     node->temporaries->data[1] = scratch_tensor_index + 1;
     TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/1);
-    input_quantized->type = kTfLiteUInt8;
+    input_quantized->type = weights_feature->type;
     input_quantized->allocation_type = kTfLiteArenaRw;
     if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) {
       TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims);
@@ -297,16 +298,24 @@ TfLiteStatus EvalHybrid(
   // Initialize the pointer to input.
   const float* input_ptr_batch = input->data.f;
 
-  // Initialize the pointer to storage for quantized values and
-  // scaling factors.
-  int8_t* quantized_input_ptr_batch =
-      reinterpret_cast<int8_t*>(input_quantized->data.uint8);
+  // Initialize the pointer to storage for quantized values and the weights
+  // feature.
+  int8_t* quantized_input_ptr_batch;
+  const int8_t* weights_feature_ptr;
+  if (weights_feature->type == kTfLiteUInt8) {
+    quantized_input_ptr_batch =
+        reinterpret_cast<int8_t*>(input_quantized->data.uint8);
+    weights_feature_ptr =
+        reinterpret_cast<int8_t*>(weights_feature->data.uint8);
+  } else {
+    quantized_input_ptr_batch = input_quantized->data.int8;
+    weights_feature_ptr = weights_feature->data.int8;
+  }
 
+  // Initialize the pointer to storage for scaling factors.
   float* scaling_factors_ptr = scaling_factors->data.f;
 
-  // Other initializations.
-  const int8_t* weights_feature_ptr =
-      reinterpret_cast<int8_t*>(weights_feature->data.uint8);
+  // Initialize the weights scale.
   const float weights_feature_scale = weights_feature->params.scale;
 
   // Clear the activation (state left most column).
@@ -374,7 +383,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
                        bias, params, scratch, activation_state, output);
       break;
     }
-    case kTfLiteUInt8: {
+    case kTfLiteUInt8:
+    case kTfLiteInt8: {
       TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/1);
       TfLiteTensor* scaling_factors = GetTemporary(context, node, /*index=*/2);
       TfLiteTensor* float_weights_time =
@@ -388,8 +398,13 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       // TODO(alanchiao): refactor logic out into dequantize function.
       if (!op_data->float_weights_time_initialized) {
         const float dequantization_scale = weights_time->params.scale;
-        const int8_t* weights_time_ptr =
-            reinterpret_cast<int8_t*>(weights_time->data.uint8);
+        const int8_t* weights_time_ptr;
+        if (weights_feature->type == kTfLiteUInt8) {
+          weights_time_ptr =
+              reinterpret_cast<int8_t*>(weights_time->data.uint8);
+        } else {
+          weights_time_ptr = weights_time->data.int8;
+        }
         for (int i = 0; i < NumElements(float_weights_time); ++i) {
           float_weights_time->data.f[i] =
               weights_time_ptr[i] * dequantization_scale;
diff --git a/tensorflow/lite/kernels/svdf_test.cc b/tensorflow/lite/kernels/svdf_test.cc
index 8accaa465ca8a51f2b6e00648a6195f31039d3f7..c420260bf51bd45944a7b77a81e20e56999c8fbb 100644
--- a/tensorflow/lite/kernels/svdf_test.cc
+++ b/tensorflow/lite/kernels/svdf_test.cc
@@ -203,17 +203,30 @@ class SVDFOpModel : public BaseSVDFOpModel {
 class HybridSVDFOpModel : public BaseSVDFOpModel {
  public:
   HybridSVDFOpModel(int batches, int units, int input_size, int memory_size,
-                    int rank)
+                    int rank, TensorType tensor_type)
       : BaseSVDFOpModel(batches, units, input_size, memory_size, rank,
-                        TensorType_UINT8, TensorType_UINT8) {}
+                        tensor_type, tensor_type) {
+    tensor_type_ = tensor_type;
+  }
+
+  void SetWeights(int weights_idx, const std::vector<float>& f) {
+    if (tensor_type_ == TensorType_UINT8) {
+      SymmetricQuantizeAndPopulate(weights_idx, f);
+    } else {
+      SignedSymmetricQuantizeAndPopulate(weights_idx, f);
+    }
+  }
 
   void SetWeightsFeature(std::initializer_list<float> f) {
-    SymmetricQuantizeAndPopulate(weights_feature_, f);
+    SetWeights(weights_feature_, f);
   }
 
   void SetWeightsTime(std::initializer_list<float> f) {
-    SymmetricQuantizeAndPopulate(weights_time_, f);
+    SetWeights(weights_time_, f);
   }
+
+ protected:
+  TensorType tensor_type_;
 };
 
 class SVDFOpTest : public ::testing::Test {
@@ -312,9 +325,74 @@ TEST_F(SVDFOpTest, BlackBoxTestRank2) {
                 &svdf);
 }
 
-TEST_F(SVDFOpTest, BlackBoxTestHybridRank1) {
+TEST_F(SVDFOpTest, BlackBoxTestHybridRank1Uint8) {
+  HybridSVDFOpModel svdf(/*batches=*/2, /*units=*/4, /*input_size=*/3,
+                         /*memory_size=*/10, /*rank=*/1, TensorType_UINT8);
+  svdf.SetWeightsFeature({-0.31930989, -0.36118156, 0.0079667, 0.37613347,
+                          0.22197971, 0.12416199, 0.27901134, 0.27557442,
+                          0.3905206, -0.36137494, -0.06634006, -0.10640851});
+
+  svdf.SetWeightsTime(
+      {-0.31930989, 0.37613347,  0.27901134,  -0.36137494, -0.36118156,
+       0.22197971,  0.27557442,  -0.06634006, 0.0079667,   0.12416199,
+
+       0.3905206,   -0.10640851, -0.0976817,  0.15294972,  0.39635518,
+       -0.02702999, 0.39296314,  0.15785322,  0.21931258,  0.31053296,
+
+       -0.36916667, 0.38031587,  -0.21580373, 0.27072677,  0.23622236,
+       0.34936687,  0.18174365,  0.35907319,  -0.17493086, 0.324846,
+
+       -0.10781813, 0.27201805,  0.14324132,  -0.23681851, -0.27115166,
+       -0.01580888, -0.14943552, 0.15465137,  0.09784451,  -0.0337657});
+
+  VerifyGoldens(svdf_input, svdf_golden_output_rank_1, sizeof(svdf_input),
+                &svdf,
+                /*tolerance=*/0.002945);
+}
+
+TEST_F(SVDFOpTest, BlackBoxTestHybridRank2Uint8) {
+  HybridSVDFOpModel svdf(/*batches=*/2, /*units=*/4, /*input_size=*/3,
+                         /*memory_size=*/10, /*rank=*/2, TensorType_UINT8);
+  svdf.SetWeightsFeature({-0.31930989, 0.0079667,   0.39296314,  0.37613347,
+                          0.12416199,  0.15785322,  0.27901134,  0.3905206,
+                          0.21931258,  -0.36137494, -0.10640851, 0.31053296,
+                          -0.36118156, -0.0976817,  -0.36916667, 0.22197971,
+                          0.15294972,  0.38031587,  0.27557442,  0.39635518,
+                          -0.21580373, -0.06634006, -0.02702999, 0.27072677});
+
+  svdf.SetWeightsTime(
+      {-0.31930989, 0.37613347,  0.27901134,  -0.36137494, -0.36118156,
+       0.22197971,  0.27557442,  -0.06634006, 0.0079667,   0.12416199,
+
+       0.3905206,   -0.10640851, -0.0976817,  0.15294972,  0.39635518,
+       -0.02702999, 0.39296314,  0.15785322,  0.21931258,  0.31053296,
+
+       -0.36916667, 0.38031587,  -0.21580373, 0.27072677,  0.23622236,
+       0.34936687,  0.18174365,  0.35907319,  -0.17493086, 0.324846,
+
+       -0.10781813, 0.27201805,  0.14324132,  -0.23681851, -0.27115166,
+       -0.01580888, -0.14943552, 0.15465137,  0.09784451,  -0.0337657,
+
+       -0.14884081, 0.19931212,  -0.36002168, 0.34663299,  -0.11405486,
+       0.12672701,  0.39463779,  -0.07886535, -0.06384811, 0.08249187,
+
+       -0.26816407, -0.19905911, 0.29211238,  0.31264046,  -0.28664589,
+       0.05698794,  0.11613581,  0.14078894,  0.02187902,  -0.21781836,
+
+       -0.15567942, 0.08693647,  -0.38256618, 0.36580828,  -0.22922277,
+       -0.0226903,  0.12878349,  -0.28122205, -0.10850525, -0.11955214,
+
+       0.27179423,  -0.04710215, 0.31069002,  0.22672787,  0.09580326,
+       0.08682203,  0.1258215,   0.1851041,   0.29228821,  0.12366763});
+
+  VerifyGoldens(svdf_input, svdf_golden_output_rank_2, sizeof(svdf_input),
+                &svdf,
+                /*tolerance=*/0.00625109);
+}
+
+TEST_F(SVDFOpTest, BlackBoxTestHybridRank1Int8) {
   HybridSVDFOpModel svdf(/*batches=*/2, /*units=*/4, /*input_size=*/3,
-                         /*memory_size=*/10, /*rank=*/1);
+                         /*memory_size=*/10, /*rank=*/1, TensorType_INT8);
   svdf.SetWeightsFeature({-0.31930989, -0.36118156, 0.0079667, 0.37613347,
                           0.22197971, 0.12416199, 0.27901134, 0.27557442,
                           0.3905206, -0.36137494, -0.06634006, -0.10640851});
@@ -337,9 +415,9 @@ TEST_F(SVDFOpTest, BlackBoxTestHybridRank1) {
                 /*tolerance=*/0.002945);
 }
 
-TEST_F(SVDFOpTest, BlackBoxTestHybridRank2) {
+TEST_F(SVDFOpTest, BlackBoxTestHybridRank2Int8) {
   HybridSVDFOpModel svdf(/*batches=*/2, /*units=*/4, /*input_size=*/3,
-                         /*memory_size=*/10, /*rank=*/2);
+                         /*memory_size=*/10, /*rank=*/2, TensorType_INT8);
   svdf.SetWeightsFeature({-0.31930989, 0.0079667,   0.39296314,  0.37613347,
                           0.12416199,  0.15785322,  0.27901134,  0.3905206,
                           0.21931258,  -0.36137494, -0.10640851, 0.31053296,
diff --git a/tensorflow/lite/kernels/test_util.cc b/tensorflow/lite/kernels/test_util.cc
index 549ea78f5b45b20139b023552a98c3dcb0d75610..295204f62e56488b06f8d5ed23a1ae62a4d1b106 100644
--- a/tensorflow/lite/kernels/test_util.cc
+++ b/tensorflow/lite/kernels/test_util.cc
@@ -47,7 +47,12 @@ std::vector<Matcher<std::complex<float>>> ArrayComplex64Near(
 }
 
 int SingleOpModel::AddInput(const TensorData& t, bool is_variable) {
-  int id = AddTensor<float>(t, {}, is_variable);
+  int id = 0;
+  if (t.per_channel_quantization) {
+    id = AddTensorPerChannelQuant(t);
+  } else {
+    id = AddTensor<float>(t, {}, is_variable);
+  }
   inputs_.push_back(id);
   return id;
 }
@@ -119,10 +124,10 @@ void SingleOpModel::BuildInterpreter(std::vector<std::vector<int>> input_shapes,
 
   CHECK(interpreter_ != nullptr);
 
-  int i = 0;
-  for (const auto& shape : input_shapes) {
-    int input_idx = interpreter_->inputs()[i++];
+  for (int i = 0; i < input_shapes.size(); ++i) {
+    const int input_idx = interpreter_->inputs()[i];
     if (input_idx == kOptionalTensor) continue;
+    const auto& shape = input_shapes[i];
     if (shape.empty()) continue;
     CHECK(interpreter_->ResizeInputTensor(input_idx, shape) == kTfLiteOk);
   }
diff --git a/tensorflow/lite/kernels/test_util.h b/tensorflow/lite/kernels/test_util.h
index dadabb86abbe3b13da74fda9224e693d310ada26..9bec8ce3ce230f1c694b12cb836265885a18173e 100644
--- a/tensorflow/lite/kernels/test_util.h
+++ b/tensorflow/lite/kernels/test_util.h
@@ -21,13 +21,14 @@ limitations under the License.
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 
+#include "tensorflow/core/platform/logging.h"
 #include "tensorflow/lite/interpreter.h"
 #include "tensorflow/lite/kernels/internal/tensor_utils.h"
 #include "tensorflow/lite/kernels/register.h"
 #include "tensorflow/lite/model.h"
 #include "tensorflow/lite/string_util.h"
 #include "tensorflow/lite/testing/util.h"
-#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/lite/tools/optimize/quantization_utils.h"
 
 namespace tflite {
 
@@ -82,14 +83,36 @@ inline std::vector<float> Dequantize(const std::vector<T>& data, float scale,
 // A helper struct to construct test tensors. This is particularly useful for
 // quantized tensor which must have their scale and zero_point defined before
 // the actual data is known. This mimics what happens in practice: quantization
-// parameters are calculated during training.
+// parameters are calculated during training or post training..
 struct TensorData {
+  TensorData(TensorType type = TensorType_FLOAT32, std::vector<int> shape = {},
+             float min = 0.0f, float max = 0.0f, float scale = 0.0f,
+             int32_t zero_point = 0, bool per_channel_quantization = false,
+             std::vector<float> per_channel_quantization_scales = {},
+             std::vector<int64_t> per_channel_quantization_offsets = {},
+             int32_t channel_index = 0)
+      : type(type),
+        shape(shape),
+        min(min),
+        max(max),
+        scale(scale),
+        zero_point(zero_point),
+        per_channel_quantization(per_channel_quantization),
+        per_channel_quantization_scales(
+            std::move(per_channel_quantization_scales)),
+        per_channel_quantization_offsets(
+            std::move(per_channel_quantization_offsets)),
+        channel_index(channel_index) {}
   TensorType type;
   std::vector<int> shape;
   float min;
   float max;
   float scale;
   int32_t zero_point;
+  bool per_channel_quantization;
+  std::vector<float> per_channel_quantization_scales;
+  std::vector<int64_t> per_channel_quantization_offsets;
+  int32_t channel_index;
 };
 
 class SingleOpResolver : public OpResolver {
@@ -161,19 +184,57 @@ class SingleOpModel {
   }
 
   void SymmetricQuantizeAndPopulate(int index, const std::vector<float>& data) {
-    TfLiteTensor* t = interpreter_->tensor(index);
-    const int length = data.size();
-    std::vector<int8_t> q(length);
-    float min, max, scaling_factor;
-    tensor_utils::SymmetricQuantizeFloats(data.data(), length, q.data(), &min,
-                                          &max, &scaling_factor);
-    // Update quantization params.
-    t->params.scale = scaling_factor;
-    t->params.zero_point = 0;
+    std::vector<int8_t> q = QuantizeTensor(index, data);
     PopulateTensor(index, /*offset=*/0, reinterpret_cast<uint8_t*>(q.data()),
                    reinterpret_cast<uint8_t*>(q.data() + q.size()));
   }
 
+  void SignedSymmetricQuantizeAndPopulate(int index,
+                                          const std::vector<float>& data) {
+    std::vector<int8_t> q = QuantizeTensor(index, data);
+    PopulateTensor(index, /*offset=*/0, q.data(), q.data() + q.size());
+  }
+
+  // Quantize and populate data for filter with per channel quantization.
+  void PerChannelSymmetricQuantizeAndPopulate(
+      int index, const std::vector<float>& input_data) {
+    TfLiteTensor* t = interpreter_->tensor(index);
+    auto* params =
+        reinterpret_cast<TfLiteAffineQuantization*>(t->quantization.params);
+    const int channel_index = params->quantized_dimension;
+
+    std::vector<int32_t> shape(t->dims->size);
+    for (int i = 0; i < shape.size(); ++i) {
+      shape[i] = t->dims->data[i];
+    }
+    const int32_t num_inputs = input_data.size();
+    const int32_t num_channel = shape[channel_index];
+    std::vector<int8_t> quantized_output(num_inputs);
+    std::vector<float> scales_inv(num_channel);
+    for (int i = 0; i < num_channel; ++i) {
+      scales_inv[i] = 1.0f / params->scale->data[i];
+    }
+    optimize::utils::SymmetricPerChannelQuantizeValues(
+        input_data.data(), scales_inv, shape, channel_index, &quantized_output);
+
+    PopulateTensor(index, /*offset=*/0, quantized_output.data(),
+                   quantized_output.data() + quantized_output.size());
+  }
+
+  // Quantize and populate data for bias with per channel quantization.
+  void PerChannelQuantizeBias(int index, const std::vector<float>& input_data) {
+    const int32_t num_inputs = input_data.size();
+    std::vector<int32_t> quantized_output(num_inputs);
+    TfLiteTensor* t = interpreter_->tensor(index);
+    auto* params =
+        reinterpret_cast<TfLiteAffineQuantization*>(t->quantization.params);
+    for (int i = 0; i < num_inputs; ++i) {
+      quantized_output[i] = input_data[i] * params->scale->data[i];
+    }
+    PopulateTensor(index, /*offset=*/0, quantized_output.data(),
+                   quantized_output.data() + quantized_output.size());
+  }
+
   const std::vector<int>& GetShape(int id) { return tensor_data_.at(id).shape; }
 
   float GetScale(int id) { return tensor_data_.at(id).scale; }
@@ -294,6 +355,24 @@ class SingleOpModel {
     return {scale, zero_point};
   }
 
+  int AddTensorPerChannelQuant(TensorData t) {
+    const int id = tensors_.size();
+    flatbuffers::Offset<QuantizationParameters> q_params = 0;
+    q_params = CreateQuantizationParameters(
+        builder_, /*min=*/0, /*max=*/0,
+        /*scale=*/
+        builder_.CreateVector<float>(t.per_channel_quantization_scales),
+        /*zero point=*/
+        builder_.CreateVector<int64_t>(t.per_channel_quantization_offsets),
+        QuantizationDetails_NONE, 0, t.channel_index);
+    tensors_.push_back(
+        CreateTensor(builder_, builder_.CreateVector<int>(t.shape), t.type,
+                     /*buffer=*/0,
+                     /*name=*/0, q_params, /*is_variable=*/false));
+    tensor_data_[id] = t;
+    return id;
+  }
+
   template <typename T>
   int AddTensor(TensorData t, std::initializer_list<T> data,
                 bool is_variable = false) {
@@ -307,10 +386,12 @@ class SingleOpModel {
 
     if (is_quantized) {
       if (t.min != 0 || t.max != 0) {
-        // TODO(b/119422369): Handle signed int8 here.
         if (t.type == TensorType_UINT8) {
           std::tie(t.scale, t.zero_point) =
               QuantizationParams<uint8_t>(t.min, t.max);
+        } else if (t.type == TensorType_INT8) {
+          std::tie(t.scale, t.zero_point) =
+              QuantizationParams<int8_t>(t.min, t.max);
         } else if (t.type == TensorType_INT32) {
           std::tie(t.scale, t.zero_point) =
               QuantizationParams<int32_t>(t.min, t.max);
@@ -356,6 +437,31 @@ class SingleOpModel {
     return id;
   }
 
+  std::vector<int8_t> QuantizeTensor(int index,
+                                     const std::vector<float>& data) {
+    TfLiteTensor* t = interpreter_->tensor(index);
+    const int length = data.size();
+    std::vector<int8_t> q(length);
+    float min, max, scaling_factor;
+    tensor_utils::SymmetricQuantizeFloats(data.data(), length, q.data(), &min,
+                                          &max, &scaling_factor);
+    // Update quantization params.
+    t->params.scale = scaling_factor;
+    t->params.zero_point = 0;
+    // Populate the new quantization params.
+    TfLiteQuantizationFree(&t->quantization);
+    t->quantization.type = kTfLiteAffineQuantization;
+    auto* affine_quantization = reinterpret_cast<TfLiteAffineQuantization*>(
+        malloc(sizeof(TfLiteAffineQuantization)));
+    affine_quantization->quantized_dimension = 0;
+    affine_quantization->scale = TfLiteFloatArrayCreate(1);
+    affine_quantization->zero_point = TfLiteIntArrayCreate(1);
+    affine_quantization->scale->data[0] = scaling_factor;
+    affine_quantization->zero_point->data[0] = 0;
+    t->quantization.params = affine_quantization;
+    return q;
+  }
+
   std::map<int, TensorData> tensor_data_;
   std::vector<int32_t> inputs_;
   std::vector<int32_t> outputs_;
diff --git a/tensorflow/lite/kernels/topk_v2.cc b/tensorflow/lite/kernels/topk_v2.cc
index 444b01e7b2e055ab4e26a2ea1dce28642dc430b7..64973d7b860fc3089850cc3648ee4fb9da11047e 100644
--- a/tensorflow/lite/kernels/topk_v2.cc
+++ b/tensorflow/lite/kernels/topk_v2.cc
@@ -207,6 +207,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       TopK(row_size, num_rows, input->data.uint8, k, output_indexes->data.i32,
            output_values->data.uint8);
       break;
+    case kTfLiteInt8:
+      TopK(row_size, num_rows, input->data.int8, k, output_indexes->data.i32,
+           output_values->data.int8);
+      break;
     case kTfLiteInt32:
       TopK(row_size, num_rows, input->data.i32, k, output_indexes->data.i32,
            output_values->data.i32);
diff --git a/tensorflow/lite/kernels/topk_v2_test.cc b/tensorflow/lite/kernels/topk_v2_test.cc
index 108b8123666aaddcc8ba8438bac82c91ce98d50d..0097ae2f9aece116c963a4b460c2e3ff0fc127c4 100644
--- a/tensorflow/lite/kernels/topk_v2_test.cc
+++ b/tensorflow/lite/kernels/topk_v2_test.cc
@@ -46,6 +46,10 @@ class TopKV2OpModel : public SingleOpModel {
     PopulateTensor<uint8_t>(input_, data);
   }
 
+  void SetInputInt8(std::initializer_list<int8_t> data) {
+    PopulateTensor<int8_t>(input_, data);
+  }
+
   void SetInputInt32(std::initializer_list<int32_t> data) {
     PopulateTensor<int32_t>(input_, data);
   }
@@ -66,6 +70,10 @@ class TopKV2OpModel : public SingleOpModel {
     return ExtractVector<uint8_t>(output_values_);
   }
 
+  std::vector<int8_t> GetValuesInt8() {
+    return ExtractVector<int8_t>(output_values_);
+  }
+
   std::vector<int32_t> GetValuesInt32() {
     return ExtractVector<int32_t>(output_values_);
   }
@@ -128,6 +136,14 @@ TEST(TopKV2OpTest, TypeUint8) {
   EXPECT_THAT(m.GetValuesUInt8(), ElementsAreArray({3, 2, 251, 250}));
 }
 
+TEST(TopKV2OpTest, TypeInt8) {
+  TopKV2OpModel m({2, 3}, TensorType_INT8, 2);
+  m.SetInputInt8({1, 2, 3, -126, 125, -24});
+  m.Invoke();
+  EXPECT_THAT(m.GetIndexes(), ElementsAreArray({2, 1, 1, 2}));
+  EXPECT_THAT(m.GetValuesInt8(), ElementsAreArray({3, 2, 125, -24}));
+}
+
 // Check that int32_t works.
 TEST(TopKV2OpTest, TypeInt32) {
   TopKV2OpModel m({2, 3}, TensorType_INT32, 2);
diff --git a/tensorflow/lite/kernels/transpose.cc b/tensorflow/lite/kernels/transpose.cc
index 7a6d320674ad1c8302f8bf3a9d1d5153223deed3..0ef4972d1a856f84d3511657ec9d9f2f3cc36182 100644
--- a/tensorflow/lite/kernels/transpose.cc
+++ b/tensorflow/lite/kernels/transpose.cc
@@ -117,6 +117,11 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
         TF_LITE_TRANSPOSE(reference_ops, uint8_t);
       }
       break;
+    case kTfLiteInt8:
+      if (kernel_type == kReference) {
+        TF_LITE_TRANSPOSE(reference_ops, int8_t);
+      }
+      break;
     case kTfLiteInt32:
       if (kernel_type == kReference) {
         TF_LITE_TRANSPOSE(reference_ops, int32_t);
diff --git a/tensorflow/lite/kernels/transpose_conv.cc b/tensorflow/lite/kernels/transpose_conv.cc
index 59eee51068c0efcf26d66d933e13ee2f931463bc..343f2ca59bad5df9c55b129bbf317b0bf25d26f0 100644
--- a/tensorflow/lite/kernels/transpose_conv.cc
+++ b/tensorflow/lite/kernels/transpose_conv.cc
@@ -119,8 +119,8 @@ TfLiteStatus ResizeIm2ColTensor(TfLiteContext* context,
   im2col_shape_array->data[1] = output_shape->data.i32[1];
   im2col_shape_array->data[2] = output_shape->data.i32[2];
   const int input_depth = SizeOfDimension(input, 3);
-  const int filter_width = SizeOfDimension(weights, 1);
-  const int filter_height = SizeOfDimension(weights, 2);
+  const int filter_width = SizeOfDimension(weights, 2);
+  const int filter_height = SizeOfDimension(weights, 1);
   im2col_shape_array->data[3] = input_depth * filter_height * filter_width;
 
   im2col->type = input->type;
@@ -197,8 +197,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   // Get height and width of the output image.
   const int width = SizeOfDimension(output, 2);
   const int height = SizeOfDimension(output, 1);
-  const int filter_width = SizeOfDimension(weights, 1);
-  const int filter_height = SizeOfDimension(weights, 2);
+  const int filter_width = SizeOfDimension(weights, 2);
+  const int filter_height = SizeOfDimension(weights, 1);
 
   const int stride_width = params->stride_width;
   const int stride_height = params->stride_height;
diff --git a/tensorflow/lite/kernels/transpose_conv_test.cc b/tensorflow/lite/kernels/transpose_conv_test.cc
index 0520d84a30b50212bb3d86288236b49da523f4c2..44d1336b99fe03535451c7dbacfe77be58fd6fad 100644
--- a/tensorflow/lite/kernels/transpose_conv_test.cc
+++ b/tensorflow/lite/kernels/transpose_conv_test.cc
@@ -252,7 +252,7 @@ TEST_P(TransposeConvOpTest, AccuracyTest) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3, 4, 1}));
 }
 
-INSTANTIATE_TEST_CASE_P(
+INSTANTIATE_TEST_SUITE_P(
     TransposeConvOpTest, TransposeConvOpTest,
     ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
 
diff --git a/tensorflow/lite/kernels/transpose_test.cc b/tensorflow/lite/kernels/transpose_test.cc
index 3ebaf3ca27ffd285ef86a81b2e63409fde565ef1..71644159209cc289329f65d1cac929585f2f4200 100644
--- a/tensorflow/lite/kernels/transpose_test.cc
+++ b/tensorflow/lite/kernels/transpose_test.cc
@@ -25,16 +25,17 @@ namespace {
 
 using ::testing::ElementsAreArray;
 
+template <typename T>
 void RunTestPermutation(const std::vector<int>& shape,
                         const std::vector<int>& perms,
-                        std::vector<float>* input_transposed) {
+                        std::vector<T>* input_transposed) {
   // Count elements and allocate output.
   int count = 1;
   for (auto factor : shape) count *= factor;
   input_transposed->resize(count);
 
   // Create the dummy data
-  std::vector<float> input(count);
+  std::vector<T> input(count);
   for (int i = 0; i < input.size(); i++) {
     input[i] = i;
   }
@@ -64,8 +65,8 @@ void RunTestPermutation(const std::vector<int>& shape,
     params.perm[i] = perms[i];
   }
 
-  reference_ops::Transpose<float>(params, input_shape, input.data(),
-                                  output_shape, input_transposed->data());
+  reference_ops::Transpose<T>(params, input_shape, input.data(), output_shape,
+                              input_transposed->data());
 }
 
 TEST(TransposeTest, TestRefOps1D) {
@@ -125,6 +126,28 @@ TEST(TransposeTest, TestRefOps4D) {
   ASSERT_EQ(out, ref);
 }
 
+TEST(TransposeTest, TestRefOps4DInt8) {
+  std::vector<int8_t> out;
+  // Basic 4d.
+  RunTestPermutation({2, 3, 4, 5}, {2, 0, 1, 3}, &out);
+  ASSERT_EQ(
+      out,
+      std::vector<int8_t>(
+          {0,  1,  2,  3,  4,  20, 21, 22, 23, 24, 40,  41,  42,  43,  44,
+           60, 61, 62, 63, 64, 80, 81, 82, 83, 84, 100, 101, 102, 103, 104,
+           5,  6,  7,  8,  9,  25, 26, 27, 28, 29, 45,  46,  47,  48,  49,
+           65, 66, 67, 68, 69, 85, 86, 87, 88, 89, 105, 106, 107, 108, 109,
+           10, 11, 12, 13, 14, 30, 31, 32, 33, 34, 50,  51,  52,  53,  54,
+           70, 71, 72, 73, 74, 90, 91, 92, 93, 94, 110, 111, 112, 113, 114,
+           15, 16, 17, 18, 19, 35, 36, 37, 38, 39, 55,  56,  57,  58,  59,
+           75, 76, 77, 78, 79, 95, 96, 97, 98, 99, 115, 116, 117, 118, 119}));
+  RunTestPermutation({2, 3, 4, 5}, {0, 1, 2, 3}, &out);
+  // Basic identity.
+  std::vector<int8_t> ref(out.size());
+  for (int k = 0; k < ref.size(); k++) ref[k] = k;
+  ASSERT_EQ(out, ref);
+}
+
 class TransposeOpModel : public SingleOpModel {
  public:
   void SetInput(std::initializer_list<float> data) {
@@ -184,6 +207,7 @@ class TransposeOpDynamicModel : public TransposeOpModel {
   }
 };
 
+#ifdef GTEST_HAS_DEATH_TEST
 TEST(TransposeTest, TestUnequalPermSize) {
   EXPECT_DEATH(TransposeOpConstModel({1, 3, 3, 1}, {2}, {2, 2}), "2 != 4");
 }
@@ -194,6 +218,7 @@ TEST(TransposeTest, TestPermOutOfBounds) {
   EXPECT_DEATH(TransposeOpConstModel({1, 3, 3, 1}, {4}, {0, 1, 2, 4}),
                "Transpose op permutations array is out of bounds.");
 }
+#endif
 
 TEST(TransposeTest, Test1DInputConstTensor) {
   TransposeOpConstModel m({3}, {1}, {0});
@@ -252,10 +277,12 @@ TEST(TransposeTest, Test3DInputDynamicTensor) {
                                 2, 6, 10, 14, 18, 22, 3, 7, 11, 15, 19, 23}));
 }
 
+#ifdef GTEST_HAS_DEATH_TEST
 TEST(TransposeTest, Test5DInputTensor) {
   EXPECT_DEATH(TransposeOpConstModel({1, 2, 3, 4, 5}, {5}, {0, 1, 2, 3, 4}),
                "Transpose op only supports 1D-4D input arrays.");
 }
+#endif
 
 TEST(TransposeTest, SimpleTestNoReorderConstTensor) {
   TransposeOpConstModel m({1, 2, 3, 1}, {4}, {0, 1, 2, 3});
diff --git a/tensorflow/lite/kernels/unidirectional_sequence_lstm.cc b/tensorflow/lite/kernels/unidirectional_sequence_lstm.cc
index 497777b9aff6c6bd5084f2d36b505c998b12273b..e2fc73ba29b5c96ad83536fb8752c11d70191d4d 100644
--- a/tensorflow/lite/kernels/unidirectional_sequence_lstm.cc
+++ b/tensorflow/lite/kernels/unidirectional_sequence_lstm.cc
@@ -110,7 +110,7 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
 
   const TfLiteTensor* input_to_input_weights =
       GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
-  if (input_to_input_weights) {
+  if (input_to_input_weights != nullptr) {
     TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->size, 2);
     TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[0], n_cell);
     TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[1], n_input);
@@ -130,7 +130,7 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
 
   const TfLiteTensor* recurrent_to_input_weights =
       GetOptionalInputTensor(context, node, kRecurrentToInputWeightsTensor);
-  if (recurrent_to_input_weights) {
+  if (recurrent_to_input_weights != nullptr) {
     TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->size, 2);
     TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->data[0],
                       n_cell);
@@ -164,21 +164,21 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
 
   const TfLiteTensor* cell_to_input_weights =
       GetOptionalInputTensor(context, node, kCellToInputWeightsTensor);
-  if (cell_to_input_weights) {
+  if (cell_to_input_weights != nullptr) {
     TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->size, 1);
     TF_LITE_ENSURE_EQ(context, cell_to_input_weights->dims->data[0], n_cell);
   }
 
   const TfLiteTensor* cell_to_forget_weights =
       GetOptionalInputTensor(context, node, kCellToForgetWeightsTensor);
-  if (cell_to_forget_weights) {
+  if (cell_to_forget_weights != nullptr) {
     TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->size, 1);
     TF_LITE_ENSURE_EQ(context, cell_to_forget_weights->dims->data[0], n_cell);
   }
 
   const TfLiteTensor* cell_to_output_weights =
       GetOptionalInputTensor(context, node, kCellToOutputWeightsTensor);
-  if (cell_to_output_weights) {
+  if (cell_to_output_weights != nullptr) {
     TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->size, 1);
     TF_LITE_ENSURE_EQ(context, cell_to_output_weights->dims->data[0], n_cell);
   }
@@ -220,7 +220,7 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
 
   const TfLiteTensor* projection_weights =
       GetOptionalInputTensor(context, node, kProjectionWeightsTensor);
-  if (projection_weights) {
+  if (projection_weights != nullptr) {
     TF_LITE_ENSURE_EQ(context, projection_weights->dims->size, 2);
     TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[0], n_output);
     TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[1], n_cell);
@@ -228,7 +228,7 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
 
   const TfLiteTensor* projection_bias =
       GetOptionalInputTensor(context, node, kProjectionBiasTensor);
-  if (projection_bias) {
+  if (projection_bias != nullptr) {
     TF_LITE_ENSURE_EQ(context, projection_bias->dims->size, 1);
     TF_LITE_ENSURE_EQ(context, projection_bias->dims->data[0], n_output);
   }
@@ -306,7 +306,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
   // The weights are of consistent type, so it suffices to check one.
   // TODO(mirkov): create a utility/macro for this check, so all Ops can use it.
-  const bool is_hybrid_op = (input_to_output_weights->type == kTfLiteUInt8 &&
+  const bool is_hybrid_op = ((input_to_output_weights->type == kTfLiteUInt8 ||
+                              input_to_output_weights->type == kTfLiteInt8) &&
                              input->type == kTfLiteFloat32);
 
   TfLiteIntArrayFree(node->temporaries);
@@ -344,7 +345,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
         *scratch_tensor_index + kInputQuantized;
     TfLiteTensor* input_quantized =
         GetTemporary(context, node, kInputQuantized);
-    input_quantized->type = kTfLiteUInt8;
+    input_quantized->type = input_to_output_weights->type;
     input_quantized->allocation_type = kTfLiteArenaRw;
     if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) {
       TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims);
@@ -355,7 +356,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
         *scratch_tensor_index + kOutputStateQuantized;
     TfLiteTensor* activation_state_quantized =
         GetTemporary(context, node, kOutputStateQuantized);
-    activation_state_quantized->type = kTfLiteUInt8;
+    activation_state_quantized->type = input_to_output_weights->type;
     activation_state_quantized->allocation_type = kTfLiteArenaRw;
     if (!TfLiteIntArrayEqual(activation_state_quantized->dims,
                              activation_state->dims)) {
@@ -369,7 +370,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
         *scratch_tensor_index + kCellStateQuantized;
     TfLiteTensor* cell_state_quantized =
         GetTemporary(context, node, kCellStateQuantized);
-    cell_state_quantized->type = kTfLiteUInt8;
+    cell_state_quantized->type = input_to_output_weights->type;
     cell_state_quantized->allocation_type = kTfLiteArenaRw;
     if (!TfLiteIntArrayEqual(cell_state_quantized->dims, cell_state->dims)) {
       TfLiteIntArray* cell_state_quantized_size =
@@ -502,6 +503,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
           recurrent_to_input_weights, recurrent_to_forget_weights,
           recurrent_to_cell_weights, recurrent_to_output_weights,
           cell_to_input_weights, cell_to_forget_weights, cell_to_output_weights,
+          /*input_layer_norm_coefficients=*/nullptr,
+          /*forget_layer_norm_coefficients=*/nullptr,
+          /*cell_layer_norm_coefficients=*/nullptr,
+          /*output_layer_norm_coefficients=*/nullptr,
           /*aux_input=*/nullptr,
           /*aux_input_to_input_weights=*/nullptr,
           /*aux_input_to_forget_weights=*/nullptr,
@@ -512,7 +517,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
           /*output_offset=*/0, scratch_buffer, activation_state, cell_state,
           output);
     }
-    case kTfLiteUInt8: {
+    case kTfLiteUInt8:
+    case kTfLiteInt8: {
       TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/1);
       TfLiteTensor* activation_state_quantized =
           GetTemporary(context, node, /*index=*/2);
@@ -529,6 +535,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
           recurrent_to_input_weights, recurrent_to_forget_weights,
           recurrent_to_cell_weights, recurrent_to_output_weights,
           cell_to_input_weights, cell_to_forget_weights, cell_to_output_weights,
+          /*input_layer_norm_coefficients=*/nullptr,
+          /*forget_layer_norm_coefficients=*/nullptr,
+          /*cell_layer_norm_coefficients=*/nullptr,
+          /*output_layer_norm_coefficients=*/nullptr,
           /*aux_input=*/nullptr,
           /*aux_input_to_input_weights=*/nullptr,
           /*aux_input_to_forget_weights=*/nullptr,
diff --git a/tensorflow/lite/kernels/unidirectional_sequence_lstm_test.cc b/tensorflow/lite/kernels/unidirectional_sequence_lstm_test.cc
index ae7dd6b2bee1da06d9dc48f259585f541c72842f..bc35d90773b522d22e4373c60ca83121ff7fd09e 100644
--- a/tensorflow/lite/kernels/unidirectional_sequence_lstm_test.cc
+++ b/tensorflow/lite/kernels/unidirectional_sequence_lstm_test.cc
@@ -243,59 +243,73 @@ class HybridUnidirectionalLSTMOpModel : public UnidirectionalLSTMOpModel {
       int n_batch, int n_input, int n_cell, int n_output, int sequence_length,
       bool time_major, bool use_cifg, bool use_peephole,
       bool use_projection_weights, bool use_projection_bias, float cell_clip,
-      float proj_clip, const std::vector<std::vector<int>>& input_shapes)
+      float proj_clip, const std::vector<std::vector<int>>& input_shapes,
+      TensorType tensor_type)
       : UnidirectionalLSTMOpModel(
             n_batch, n_input, n_cell, n_output, sequence_length, time_major,
             use_cifg, use_peephole, use_projection_weights, use_projection_bias,
-            cell_clip, proj_clip, input_shapes, TensorType_UINT8) {}
+            cell_clip, proj_clip, input_shapes, tensor_type) {
+    tensor_type_ = tensor_type;
+  }
+
+  void SetWeights(int weights_idx, const std::vector<float>& f) {
+    if (tensor_type_ == TensorType_UINT8) {
+      SymmetricQuantizeAndPopulate(weights_idx, f);
+    } else {
+      SignedSymmetricQuantizeAndPopulate(weights_idx, f);
+    }
+  }
 
   void SetInputToInputWeights(const std::vector<float>& f) {
-    SymmetricQuantizeAndPopulate(input_to_input_weights_, f);
+    SetWeights(input_to_input_weights_, f);
   }
 
   void SetInputToForgetWeights(const std::vector<float>& f) {
-    SymmetricQuantizeAndPopulate(input_to_forget_weights_, f);
+    SetWeights(input_to_forget_weights_, f);
   }
 
   void SetInputToCellWeights(const std::vector<float>& f) {
-    SymmetricQuantizeAndPopulate(input_to_cell_weights_, f);
+    SetWeights(input_to_cell_weights_, f);
   }
 
   void SetInputToOutputWeights(const std::vector<float>& f) {
-    SymmetricQuantizeAndPopulate(input_to_output_weights_, f);
+    SetWeights(input_to_output_weights_, f);
   }
 
   void SetRecurrentToInputWeights(const std::vector<float>& f) {
-    SymmetricQuantizeAndPopulate(recurrent_to_input_weights_, f);
+    SetWeights(recurrent_to_input_weights_, f);
   }
 
   void SetRecurrentToForgetWeights(const std::vector<float>& f) {
-    SymmetricQuantizeAndPopulate(recurrent_to_forget_weights_, f);
+    SetWeights(recurrent_to_forget_weights_, f);
   }
 
   void SetRecurrentToCellWeights(const std::vector<float>& f) {
-    SymmetricQuantizeAndPopulate(recurrent_to_cell_weights_, f);
+    SetWeights(recurrent_to_cell_weights_, f);
   }
 
   void SetRecurrentToOutputWeights(const std::vector<float>& f) {
-    SymmetricQuantizeAndPopulate(recurrent_to_output_weights_, f);
+    SetWeights(recurrent_to_output_weights_, f);
   }
 
   void SetCellToInputWeights(const std::vector<float>& f) {
-    SymmetricQuantizeAndPopulate(cell_to_input_weights_, f);
+    SetWeights(cell_to_input_weights_, f);
   }
 
   void SetCellToForgetWeights(const std::vector<float>& f) {
-    SymmetricQuantizeAndPopulate(cell_to_forget_weights_, f);
+    SetWeights(cell_to_forget_weights_, f);
   }
 
   void SetCellToOutputWeights(const std::vector<float>& f) {
-    SymmetricQuantizeAndPopulate(cell_to_output_weights_, f);
+    SetWeights(cell_to_output_weights_, f);
   }
 
   void SetProjectionWeights(const std::vector<float>& f) {
-    SymmetricQuantizeAndPopulate(projection_weights_, f);
+    SetWeights(projection_weights_, f);
   }
+
+ protected:
+  TensorType tensor_type_;
 };
 
 class BaseLstmTest : public ::testing::Test {
@@ -561,7 +575,8 @@ TEST_F(NoCifgNoPeepholeNoProjectionNoClippingLstmTest,
                 /*time_major=*/false);
 }
 
-TEST_F(NoCifgNoPeepholeNoProjectionNoClippingLstmTest, HybridLstmBlackBoxTest) {
+TEST_F(NoCifgNoPeepholeNoProjectionNoClippingLstmTest,
+       HybridLstmBlackBoxTestUint8) {
   const int n_batch = 1;
   const int n_input = 2;
   // n_cell and n_output have the same size when there is no projection.
@@ -601,7 +616,71 @@ TEST_F(NoCifgNoPeepholeNoProjectionNoClippingLstmTest, HybridLstmBlackBoxTest) {
 
           {n_batch, n_output},  // activation_state tensor
           {n_batch, n_cell},    // cell_state tensor
-      });
+      },
+      TensorType_UINT8);
+
+  lstm.SetInputToInputWeights(input_to_input_weights_);
+  lstm.SetInputToCellWeights(input_to_cell_weights_);
+  lstm.SetInputToForgetWeights(input_to_forget_weights_);
+  lstm.SetInputToOutputWeights(input_to_output_weights_);
+
+  lstm.SetInputGateBias(input_gate_bias_);
+  lstm.SetCellBias(cell_gate_bias_);
+  lstm.SetForgetGateBias(forget_gate_bias_);
+  lstm.SetOutputGateBias(output_gate_bias_);
+
+  lstm.SetRecurrentToInputWeights(recurrent_to_input_weights_);
+  lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
+  lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
+  lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
+
+  VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm,
+                /*tolerance=*/0.0157651);
+}
+
+TEST_F(NoCifgNoPeepholeNoProjectionNoClippingLstmTest,
+       HybridLstmBlackBoxTestInt8) {
+  const int n_batch = 1;
+  const int n_input = 2;
+  // n_cell and n_output have the same size when there is no projection.
+  const int n_cell = 4;
+  const int n_output = 4;
+  const int sequence_length = 3;
+
+  HybridUnidirectionalLSTMOpModel lstm(
+      n_batch, n_input, n_cell, n_output, sequence_length,
+      /*time_major=*/true, /*use_cifg=*/false, /*use_peephole=*/false,
+      /*use_projection_weights=*/false,
+      /*use_projection_bias=*/false, /*cell_clip=*/0.0, /*proj_clip=*/0.0,
+      {
+          {sequence_length, n_batch, n_input},  // input tensor
+
+          {n_cell, n_input},  // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {n_cell, n_output},  // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {0},  // cell_to_input_weight tensor
+          {0},  // cell_to_forget_weight tensor
+          {0},  // cell_to_output_weight tensor
+
+          {n_cell},  // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {0, 0},  // projection_weight tensor
+          {0},     // projection_bias tensor
+
+          {n_batch, n_output},  // activation_state tensor
+          {n_batch, n_cell},    // cell_state tensor
+      },
+      TensorType_INT8);
 
   lstm.SetInputToInputWeights(input_to_input_weights_);
   lstm.SetInputToCellWeights(input_to_cell_weights_);
@@ -730,7 +809,8 @@ TEST_F(CifgPeepholeNoProjectionNoClippingLstmTest, LstmBlackBoxTest) {
   VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm);
 }
 
-TEST_F(CifgPeepholeNoProjectionNoClippingLstmTest, HybridLstmBlackBoxTest) {
+TEST_F(CifgPeepholeNoProjectionNoClippingLstmTest,
+       HybridLstmBlackBoxTestUint8) {
   const int n_batch = 1;
   const int n_input = 2;
   // n_cell and n_output have the same size when there is no projection.
@@ -771,7 +851,70 @@ TEST_F(CifgPeepholeNoProjectionNoClippingLstmTest, HybridLstmBlackBoxTest) {
 
           {n_batch, n_output},  // activation_state tensor
           {n_batch, n_cell},    // cell_state tensor
-      });
+      },
+      TensorType_UINT8);
+
+  lstm.SetInputToCellWeights(input_to_cell_weights_);
+  lstm.SetInputToForgetWeights(input_to_forget_weights_);
+  lstm.SetInputToOutputWeights(input_to_output_weights_);
+
+  lstm.SetCellBias(cell_gate_bias_);
+  lstm.SetForgetGateBias(forget_gate_bias_);
+  lstm.SetOutputGateBias(output_gate_bias_);
+
+  lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
+  lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
+  lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
+
+  lstm.SetCellToForgetWeights(cell_to_forget_weights_);
+  lstm.SetCellToOutputWeights(cell_to_output_weights_);
+
+  VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm, /*tolerance=*/0.03573);
+}
+
+TEST_F(CifgPeepholeNoProjectionNoClippingLstmTest, HybridLstmBlackBoxTestInt8) {
+  const int n_batch = 1;
+  const int n_input = 2;
+  // n_cell and n_output have the same size when there is no projection.
+  const int n_cell = 4;
+  const int n_output = 4;
+  const int sequence_length = 3;
+
+  HybridUnidirectionalLSTMOpModel lstm(
+      n_batch, n_input, n_cell, n_output, sequence_length,
+      /*time_major=*/true, /*use_cifg=*/true, /*use_peephole=*/true,
+      /*use_projection_weights=*/false,
+      /*use_projection_bias=*/false,
+      /*cell_clip=*/0.0, /*proj_clip=*/0.0,
+      {
+          {sequence_length, n_batch, n_input},  // input tensor
+
+          {0, 0},             // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {0, 0},              // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {0},       // cell_to_input_weight tensor
+          {n_cell},  // cell_to_forget_weight tensor
+          {n_cell},  // cell_to_output_weight tensor
+
+          {0},       // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {0, 0},  // projection_weight tensor
+          {0},     // projection_bias tensor
+
+          {n_batch, n_output},  // activation_state tensor
+          {n_batch, n_cell},    // cell_state tensor
+      },
+      TensorType_INT8);
 
   lstm.SetInputToCellWeights(input_to_cell_weights_);
   lstm.SetInputToForgetWeights(input_to_forget_weights_);
@@ -1456,7 +1599,7 @@ TEST_F(NoCifgPeepholeProjectionClippingLstmTest, LstmBlackBoxTest) {
   VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm);
 }
 
-TEST_F(NoCifgPeepholeProjectionClippingLstmTest, HybridLstmBlackBoxTest) {
+TEST_F(NoCifgPeepholeProjectionClippingLstmTest, HybridLstmBlackBoxTestUint8) {
   const int n_batch = 2;
   const int n_input = 5;
   const int n_cell = 20;
@@ -1496,7 +1639,75 @@ TEST_F(NoCifgPeepholeProjectionClippingLstmTest, HybridLstmBlackBoxTest) {
 
           {n_batch, n_output},  // activation_state tensor
           {n_batch, n_cell},    // cell_state tensor
-      });
+      },
+      TensorType_UINT8);
+
+  lstm.SetInputToInputWeights(input_to_input_weights_);
+  lstm.SetInputToCellWeights(input_to_cell_weights_);
+  lstm.SetInputToForgetWeights(input_to_forget_weights_);
+  lstm.SetInputToOutputWeights(input_to_output_weights_);
+
+  lstm.SetInputGateBias(input_gate_bias_);
+  lstm.SetCellBias(cell_gate_bias_);
+  lstm.SetForgetGateBias(forget_gate_bias_);
+  lstm.SetOutputGateBias(output_gate_bias_);
+
+  lstm.SetRecurrentToInputWeights(recurrent_to_input_weights_);
+  lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
+  lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
+  lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
+
+  lstm.SetCellToInputWeights(cell_to_input_weights_);
+  lstm.SetCellToForgetWeights(cell_to_forget_weights_);
+  lstm.SetCellToOutputWeights(cell_to_output_weights_);
+
+  lstm.SetProjectionWeights(projection_weights_);
+
+  VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm, /*tolerance=*/0.00467);
+}
+
+TEST_F(NoCifgPeepholeProjectionClippingLstmTest, HybridLstmBlackBoxTestInt8) {
+  const int n_batch = 2;
+  const int n_input = 5;
+  const int n_cell = 20;
+  const int n_output = 16;
+  const int sequence_length = 4;
+
+  HybridUnidirectionalLSTMOpModel lstm(
+      n_batch, n_input, n_cell, n_output, sequence_length,
+      /*time_major=*/true, /*use_cifg=*/false, /*use_peephole=*/true,
+      /*use_projection_weights=*/true,
+      /*use_projection_bias=*/false,
+      /*cell_clip=*/0.0, /*proj_clip=*/0.0,
+      {
+          {sequence_length, n_batch, n_input},  // input tensor
+
+          {n_cell, n_input},  // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {n_cell, n_output},  // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {n_cell},  // cell_to_input_weight tensor
+          {n_cell},  // cell_to_forget_weight tensor
+          {n_cell},  // cell_to_output_weight tensor
+
+          {n_cell},  // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {n_output, n_cell},  // projection_weight tensor
+          {0},                 // projection_bias tensor
+
+          {n_batch, n_output},  // activation_state tensor
+          {n_batch, n_cell},    // cell_state tensor
+      },
+      TensorType_INT8);
 
   lstm.SetInputToInputWeights(input_to_input_weights_);
   lstm.SetInputToCellWeights(input_to_cell_weights_);
diff --git a/tensorflow/lite/kernels/unidirectional_sequence_rnn.cc b/tensorflow/lite/kernels/unidirectional_sequence_rnn.cc
index 4c0fe00272a04ef3edc0787839f235f12aa546cb..3854695d0bfde5d6c3a14b0c3aa449f5ca2eb4fa 100644
--- a/tensorflow/lite/kernels/unidirectional_sequence_rnn.cc
+++ b/tensorflow/lite/kernels/unidirectional_sequence_rnn.cc
@@ -96,15 +96,19 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_OK(context,
                     context->ResizeTensor(context, output, output_size_array));
 
+  const bool is_hybrid =
+      input->type == kTfLiteFloat32 && (input_weights->type == kTfLiteUInt8 ||
+                                        input_weights->type == kTfLiteInt8);
+
   // Allocate temporary tensors to store quantized values of input and
   // hidden_state tensors.
-  if (input->type == kTfLiteFloat32 && input_weights->type == kTfLiteUInt8) {
+  if (is_hybrid) {
     int* scratch_tensor_index = reinterpret_cast<int*>(node->user_data);
     TfLiteIntArrayFree(node->temporaries);
     node->temporaries = TfLiteIntArrayCreate(3);
     node->temporaries->data[0] = *scratch_tensor_index;
     TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/0);
-    input_quantized->type = kTfLiteUInt8;
+    input_quantized->type = input_weights->type;
     input_quantized->allocation_type = kTfLiteArenaRw;
     if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) {
       TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims);
@@ -114,7 +118,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     node->temporaries->data[1] = *scratch_tensor_index + 1;
     TfLiteTensor* hidden_state_quantized =
         GetTemporary(context, node, /*index=*/1);
-    hidden_state_quantized->type = kTfLiteUInt8;
+    hidden_state_quantized->type = input_weights->type;
     hidden_state_quantized->allocation_type = kTfLiteArenaRw;
     if (!TfLiteIntArrayEqual(hidden_state_quantized->dims,
                              hidden_state->dims)) {
@@ -213,19 +217,31 @@ TfLiteStatus EvalHybrid(
 
   // Initialize the pointer bias.
   const float* bias_ptr = bias->data.f;
-  // Initialize input_weights and recurrent_weights.
-  const int8_t* input_weights_ptr =
-      reinterpret_cast<const int8_t*>(input_weights->data.uint8);
-  const int8_t* recurrent_weights_ptr =
-      reinterpret_cast<const int8_t*>(recurrent_weights->data.uint8);
+
+  // Initialize input_weights, recurrent_weights, and temporary storage for
+  // quantized values.
+  const int8_t* input_weights_ptr;
+  const int8_t* recurrent_weights_ptr;
+  int8_t* quantized_input_ptr;
+  int8_t* quantized_hidden_state_ptr;
+  if (input_weights->type == kTfLiteUInt8) {
+    input_weights_ptr =
+        reinterpret_cast<const int8_t*>(input_weights->data.uint8);
+    recurrent_weights_ptr =
+        reinterpret_cast<const int8_t*>(recurrent_weights->data.uint8);
+    quantized_input_ptr = reinterpret_cast<int8_t*>(input_scratch->data.uint8);
+    quantized_hidden_state_ptr =
+        reinterpret_cast<int8_t*>(hidden_state_scratch->data.uint8);
+  } else {
+    input_weights_ptr = input_weights->data.int8;
+    recurrent_weights_ptr = recurrent_weights->data.int8;
+    quantized_input_ptr = input_scratch->data.int8;
+    quantized_hidden_state_ptr = hidden_state_scratch->data.int8;
+  }
+
   // Get the scale of the quantized weights.
   float input_weights_scale = input_weights->params.scale;
   float recurrent_weights_scale = recurrent_weights->params.scale;
-  // Initialize temporary storage for quantized values.
-  int8_t* quantized_input_ptr =
-      reinterpret_cast<int8_t*>(input_scratch->data.uint8);
-  int8_t* quantized_hidden_state_ptr =
-      reinterpret_cast<int8_t*>(hidden_state_scratch->data.uint8);
   float* scaling_factors_ptr = scaling_factors->data.f;
 
   if (time_major) {
@@ -286,7 +302,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     case kTfLiteFloat32:
       return EvalFloat(input, input_weights, recurrent_weights, bias, params,
                        hidden_state, output);
-    case kTfLiteUInt8: {
+    case kTfLiteUInt8:
+    case kTfLiteInt8: {
       // TODO(mirkov): implement eval with quantized inputs as well.
       TfLiteTensor* input_quantized = GetTemporary(context, node, 0);
       TfLiteTensor* hidden_state_quantized = GetTemporary(context, node, 1);
diff --git a/tensorflow/lite/kernels/unidirectional_sequence_rnn_test.cc b/tensorflow/lite/kernels/unidirectional_sequence_rnn_test.cc
index a2f82ac67b1b22b226e7046af7158ed6095dcc8e..de1f7818bd0f2a1420b6f277c08670f7e70fef27 100644
--- a/tensorflow/lite/kernels/unidirectional_sequence_rnn_test.cc
+++ b/tensorflow/lite/kernels/unidirectional_sequence_rnn_test.cc
@@ -248,17 +248,29 @@ class UnidirectionalRNNOpModel : public SingleOpModel {
 class HybridUnidirectionalRNNOpModel : public UnidirectionalRNNOpModel {
  public:
   HybridUnidirectionalRNNOpModel(int batches, int sequence_len, int units,
-                                 int size, bool time_major)
+                                 int size, bool time_major,
+                                 TensorType tensor_type)
       : UnidirectionalRNNOpModel(batches, sequence_len, units, size, time_major,
-                                 TensorType_UINT8, TensorType_UINT8) {}
+                                 tensor_type, tensor_type) {
+    tensor_type_ = tensor_type;
+  }
 
-  void SetWeights(std::initializer_list<float> f) {
-    SymmetricQuantizeAndPopulate(weights_, f);
+  void SetWeights(int weights_idx, const std::vector<float>& f) {
+    if (tensor_type_ == TensorType_UINT8) {
+      SymmetricQuantizeAndPopulate(weights_idx, f);
+    } else {
+      SignedSymmetricQuantizeAndPopulate(weights_idx, f);
+    }
   }
 
+  void SetWeights(std::initializer_list<float> f) { SetWeights(weights_, f); }
+
   void SetRecurrentWeights(std::initializer_list<float> f) {
-    SymmetricQuantizeAndPopulate(recurrent_weights_, f);
+    SetWeights(recurrent_weights_, f);
   }
+
+ protected:
+  TensorType tensor_type_;
 };
 
 TEST(UnidirectionalRNNOpTest, BlackBoxTest) {
@@ -285,10 +297,36 @@ TEST(UnidirectionalRNNOpTest, BlackBoxTest) {
   EXPECT_THAT(rnn.GetOutput(), ElementsAreArray(ArrayFloatNear(expected)));
 }
 
-TEST(HybridUnidirectionalRNNOpModelOpTest, BlackBoxTest) {
+TEST(HybridUnidirectionalRNNOpModelOpTest, BlackBoxTestUint8) {
   HybridUnidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
                                      /*units=*/16, /*size=*/8,
-                                     /*time_major=*/false);
+                                     /*time_major=*/false, TensorType_UINT8);
+  rnn.SetWeights(rnn_weights);
+  rnn.SetBias(rnn_bias);
+  rnn.SetRecurrentWeights(rnn_recurrent_weights);
+
+  const int input_sequence_size = rnn.input_size() * rnn.sequence_len();
+  float* batch_start = rnn_input;
+  float* batch_end = batch_start + input_sequence_size;
+  rnn.SetInput(0, batch_start, batch_end);
+  rnn.SetInput(input_sequence_size, batch_start, batch_end);
+
+  rnn.Invoke();
+
+  float* golden_start = rnn_golden_output;
+  float* golden_end = golden_start + rnn.num_units() * rnn.sequence_len();
+  std::vector<float> expected;
+  expected.insert(expected.end(), golden_start, golden_end);
+  expected.insert(expected.end(), golden_start, golden_end);
+
+  EXPECT_THAT(rnn.GetOutput(), ElementsAreArray(ArrayFloatNear(
+                                   expected, /*max_abs_error=*/0.013)));
+}
+
+TEST(HybridUnidirectionalRNNOpModelOpTest, BlackBoxTestInt8) {
+  HybridUnidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
+                                     /*units=*/16, /*size=*/8,
+                                     /*time_major=*/false, TensorType_INT8);
   rnn.SetWeights(rnn_weights);
   rnn.SetBias(rnn_bias);
   rnn.SetRecurrentWeights(rnn_recurrent_weights);
@@ -340,10 +378,40 @@ TEST(UnidirectionalRNNOpTest, TimeMajorBlackBoxTest) {
   EXPECT_THAT(rnn.GetOutput(), ElementsAreArray(ArrayFloatNear(expected)));
 }
 
-TEST(HybridUnidirectionalRNNOpModelOpTest, TimeMajorBlackBoxTest) {
+TEST(HybridUnidirectionalRNNOpModelOpTest, TimeMajorBlackBoxTestUint8) {
+  HybridUnidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
+                                     /*units=*/16, /*size=*/8,
+                                     /*time_major=*/true, TensorType_UINT8);
+  rnn.SetWeights(rnn_weights);
+  rnn.SetBias(rnn_bias);
+  rnn.SetRecurrentWeights(rnn_recurrent_weights);
+
+  for (int i = 0; i < rnn.sequence_len(); i++) {
+    float* batch_start = rnn_input + i * rnn.input_size();
+    float* batch_end = batch_start + rnn.input_size();
+    // The two batches are identical.
+    rnn.SetInput(2 * i * rnn.input_size(), batch_start, batch_end);
+    rnn.SetInput((2 * i + 1) * rnn.input_size(), batch_start, batch_end);
+  }
+
+  rnn.Invoke();
+
+  std::vector<float> expected;
+  for (int i = 0; i < rnn.sequence_len(); i++) {
+    float* golden_batch_start = rnn_golden_output + i * rnn.num_units();
+    float* golden_batch_end = golden_batch_start + rnn.num_units();
+    expected.insert(expected.end(), golden_batch_start, golden_batch_end);
+    expected.insert(expected.end(), golden_batch_start, golden_batch_end);
+  }
+
+  EXPECT_THAT(rnn.GetOutput(), ElementsAreArray(ArrayFloatNear(
+                                   expected, /*max_abs_error=*/0.013)));
+}
+
+TEST(HybridUnidirectionalRNNOpModelOpTest, TimeMajorBlackBoxTestInt8) {
   HybridUnidirectionalRNNOpModel rnn(/*batches=*/2, /*sequence_len=*/16,
                                      /*units=*/16, /*size=*/8,
-                                     /*time_major=*/true);
+                                     /*time_major=*/true, TensorType_INT8);
   rnn.SetWeights(rnn_weights);
   rnn.SetBias(rnn_bias);
   rnn.SetRecurrentWeights(rnn_recurrent_weights);
diff --git a/tensorflow/lite/kernels/unique.cc b/tensorflow/lite/kernels/unique.cc
new file mode 100644
index 0000000000000000000000000000000000000000..80c033aa5ce1f0fb302f7b2f06d3e2cae69b9062
--- /dev/null
+++ b/tensorflow/lite/kernels/unique.cc
@@ -0,0 +1,164 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <map>
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace unique {
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  return nullptr;
+}
+
+void Free(TfLiteContext* context, void* buffer) {}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  static const int kOutputUniqueTensor = 0;
+  static const int kOutputIndexTensor = 1;
+
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 2);
+  const TfLiteTensor* input = GetInput(context, node, 0);
+  TfLiteTensor* output_unique_tensor =
+      GetOutput(context, node, kOutputUniqueTensor);
+  TfLiteTensor* output_index_tensor =
+      GetOutput(context, node, kOutputIndexTensor);
+
+  // The op only supports 1D input.
+  TF_LITE_ENSURE_EQ(context, NumDimensions(input), 1);
+  TfLiteIntArray* output_index_shape = TfLiteIntArrayCopy(input->dims);
+  // The unique values are determined during evaluation, so we don't know yet
+  // the size of the output tensor.
+  SetTensorToDynamic(output_unique_tensor);
+  return context->ResizeTensor(context, output_index_tensor,
+                               output_index_shape);
+}
+
+namespace {
+
+// Actual evaluation for the unique op.
+template <typename T, typename I>
+TfLiteStatus EvalImpl(TfLiteContext* context, const TfLiteTensor* input,
+                      TfLiteNode* node) {
+  // Map from value, to index in the unique elements vector.
+  // Note that we prefer to use map than unordered_map as it showed less
+  // increase in the binary size.
+  std::map<T, int> unique_values;
+  TfLiteTensor* output_indexes = GetOutput(context, node, 1);
+  I* indexes = GetTensorData<I>(output_indexes);
+  const T* data = GetTensorData<T>(input);
+  const int num_elements = NumElements(input);
+
+  for (int i = 0; i < num_elements; ++i) {
+    const auto element_it = unique_values.find(data[i]);
+    if (element_it != unique_values.end()) {
+      indexes[i] = element_it->second;
+    } else {
+      const int unique_index = unique_values.size();
+      unique_values[data[i]] = unique_index;
+      indexes[i] = unique_index;
+    }
+  }
+  // Allocate output tensor.
+  TfLiteTensor* unique_output = GetOutput(context, node, 0);
+  std::unique_ptr<TfLiteIntArray, void (*)(TfLiteIntArray*)> shape(
+      TfLiteIntArrayCreate(NumDimensions(input)), TfLiteIntArrayFree);
+  shape->data[0] = unique_values.size();
+  TF_LITE_ENSURE_STATUS(
+      context->ResizeTensor(context, unique_output, shape.release()));
+  // Set the values in the output tensor.
+  T* output_unique_values = GetTensorData<T>(unique_output);
+  for (int i = 0; i < unique_values.size(); ++i) {
+    output_unique_values[i] = data[indexes[i]];
+  }
+  return kTfLiteOk;
+}
+
+template <typename T>
+TfLiteStatus EvalImpl(TfLiteContext* context, const TfLiteTensor* input,
+                      TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLiteUniqueParams*>(node->builtin_data);
+  if (params == nullptr) {
+    context->ReportError(context, "Null params passed");
+    return kTfLiteError;
+  }
+  switch (params->index_out_type) {
+    case kTfLiteInt32:
+      return EvalImpl<T, int32_t>(context, input, node);
+    case kTfLiteInt64:
+      return EvalImpl<T, int64_t>(context, input, node);
+    default:
+      context->ReportError(
+          context,
+          "Unique index output array can only be Int32 or In64, requested: ",
+          TfLiteTypeGetName(params->index_out_type));
+  }
+  return kTfLiteError;
+}
+
+}  // namespace
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input = GetInput(context, node, 0);
+  TfLiteTensor* output_index_tensor = GetOutput(context, node, 1);
+  TF_LITE_ENSURE_EQ(context, NumElements(output_index_tensor),
+                    NumElements(input));
+
+  switch (input->type) {
+    case kTfLiteInt8:
+      TF_LITE_ENSURE_STATUS(EvalImpl<int8_t>(context, input, node));
+      break;
+    case kTfLiteInt16:
+      TF_LITE_ENSURE_STATUS(EvalImpl<int16_t>(context, input, node));
+      break;
+    case kTfLiteInt32:
+      TF_LITE_ENSURE_STATUS(EvalImpl<int32_t>(context, input, node));
+      break;
+    case kTfLiteInt64:
+      TF_LITE_ENSURE_STATUS(EvalImpl<int64_t>(context, input, node));
+      break;
+    case kTfLiteFloat32:
+      TF_LITE_ENSURE_STATUS(EvalImpl<float>(context, input, node));
+      break;
+    case kTfLiteUInt8:
+      TF_LITE_ENSURE_STATUS(EvalImpl<uint8_t>(context, input, node));
+      break;
+    default:
+      context->ReportError(context, "Currently Unique doesn't support type: %s",
+                           TfLiteTypeGetName(input->type));
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace unique
+
+TfLiteRegistration* Register_UNIQUE() {
+  static TfLiteRegistration r = {unique::Init, unique::Free, unique::Prepare,
+                                 unique::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/unique_test.cc b/tensorflow/lite/kernels/unique_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1df5e6b7967ea701c573e6d1f9abc04f0067b65a
--- /dev/null
+++ b/tensorflow/lite/kernels/unique_test.cc
@@ -0,0 +1,103 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+template <typename T, typename I>
+class UniqueOpModel : public SingleOpModel {
+ public:
+  UniqueOpModel(const TensorData& input, TensorType input_type,
+                TensorType index_out_type) {
+    input_id_ = AddInput(input);
+    output_id_ = AddOutput(input_type);
+    output_index_id_ = AddOutput(index_out_type);
+    SetBuiltinOp(BuiltinOperator_UNIQUE, BuiltinOptions_UniqueOptions,
+                 CreateUniqueOptions(builder_, index_out_type).Union());
+    BuildInterpreter({GetShape(input_id_)});
+  }
+
+  int input_tensor_id() { return input_id_; }
+
+  std::vector<T> GetOutput() { return ExtractVector<T>(output_id_); }
+  std::vector<I> GetIndexesOutput() {
+    return ExtractVector<I>(output_index_id_);
+  }
+
+ protected:
+  int input_id_;
+  int output_id_;
+  int output_index_id_;
+};
+
+TEST(UniqueOpModelTest, OneElement) {
+  UniqueOpModel<float, int32_t> model({TensorType_FLOAT32, {1}},
+                                      TensorType_FLOAT32, TensorType_INT32);
+  model.PopulateTensor<float>(model.input_tensor_id(), {5});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({5}));
+  EXPECT_THAT(model.GetIndexesOutput(), ElementsAreArray({0}));
+}
+
+TEST(UniqueOpModelTest, MultipleElements_AllUnique) {
+  UniqueOpModel<float, int32_t> model({TensorType_FLOAT32, {8}},
+                                      TensorType_FLOAT32, TensorType_INT32);
+  model.PopulateTensor<float>(model.input_tensor_id(),
+                              {5, 2, 3, 51, 6, 72, 7, 8});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({5, 2, 3, 51, 6, 72, 7, 8}));
+  EXPECT_THAT(model.GetIndexesOutput(),
+              ElementsAreArray({0, 1, 2, 3, 4, 5, 6, 7}));
+}
+
+TEST(UniqueOpModelTest, MultipleElements_AllDuplicates) {
+  UniqueOpModel<float, int32_t> model({TensorType_FLOAT32, {7}},
+                                      TensorType_FLOAT32, TensorType_INT32);
+  model.PopulateTensor<float>(model.input_tensor_id(), {5, 5, 5, 5, 5, 5, 5});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({5}));
+  EXPECT_THAT(model.GetIndexesOutput(),
+              ElementsAreArray({0, 0, 0, 0, 0, 0, 0}));
+}
+
+TEST(UniqueOpModelTest, MultipleElements_SomeDuplicates) {
+  UniqueOpModel<float, int32_t> model({TensorType_FLOAT32, {7}},
+                                      TensorType_FLOAT32, TensorType_INT32);
+  model.PopulateTensor<float>(model.input_tensor_id(), {2, 3, 5, 7, 2, 7, 3});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({2, 3, 5, 7}));
+  EXPECT_THAT(model.GetIndexesOutput(),
+              ElementsAreArray({0, 1, 2, 3, 0, 3, 1}));
+}
+
+TEST(UniqueOpModelTest, MultipleElements_SomeDuplicates_IndexInt64) {
+  UniqueOpModel<float, int64_t> model({TensorType_FLOAT32, {7}},
+                                      TensorType_FLOAT32, TensorType_INT64);
+  model.PopulateTensor<float>(model.input_tensor_id(), {2, 3, 5, 7, 2, 7, 3});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({2, 3, 5, 7}));
+  EXPECT_THAT(model.GetIndexesOutput(),
+              ElementsAreArray({0, 1, 2, 3, 0, 3, 1}));
+}
+
+}  // namespace
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/unpack.cc b/tensorflow/lite/kernels/unpack.cc
index 1caffe14f90b8ce9d13d8c781e87bf918c02b9f4..99ad4bb4e817ed435043fb17469381192db843ff 100644
--- a/tensorflow/lite/kernels/unpack.cc
+++ b/tensorflow/lite/kernels/unpack.cc
@@ -52,9 +52,11 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   TF_LITE_ENSURE(context, NumDimensions(input) <= 4);
   TF_LITE_ENSURE(context, NumDimensions(input) > 1);
-  TF_LITE_ENSURE(context, NumDimensions(input) > data->axis);
-  // TODO(renjieliu): Support negative axis.
-  TF_LITE_ENSURE(context, data->axis >= 0);
+  int axis = data->axis;
+  if (axis < 0) {
+    axis += NumDimensions(input);
+  }
+  TF_LITE_ENSURE(context, 0 <= axis && axis < NumDimensions(input));
   if (input->type != kTfLiteInt32 && input->type != kTfLiteFloat32) {
     context->ReportError(context,
                          "Currently pack only supports int32 and float32.");
@@ -67,12 +69,12 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TfLiteIntArray* output_shape = TfLiteIntArrayCreate(NumDimensions(input) - 1);
   int o = 0;
   for (int index = 0; index < NumDimensions(input); ++index) {
-    if (index != data->axis) {
+    if (index != axis) {
       output_shape->data[o++] = input_shape->data[index];
     }
   }
 
-  TF_LITE_ENSURE_EQ(context, data->num, input_shape->data[data->axis]);
+  TF_LITE_ENSURE_EQ(context, data->num, input_shape->data[axis]);
   for (int i = 0; i < data->num; ++i) {
     TfLiteIntArray* copied_output_shape = TfLiteIntArrayCopy(output_shape);
     TfLiteTensor* output = GetOutput(context, node, i);
diff --git a/tensorflow/lite/kernels/unpack_test.cc b/tensorflow/lite/kernels/unpack_test.cc
index 9b60cce549804a59e343f3e26f978679a1624c00..76f7dff93e395414f0e5a69fe4cef151a7517315 100644
--- a/tensorflow/lite/kernels/unpack_test.cc
+++ b/tensorflow/lite/kernels/unpack_test.cc
@@ -28,14 +28,16 @@ template <typename T>
 class UnpackOpModel : public SingleOpModel {
  public:
   UnpackOpModel(const TensorData& input, int axis) {
-    CHECK_LE(axis, input.shape.size());
+    if (axis < 0) {
+      axis += input.shape.size();
+    }
     const int num_outputs = input.shape[axis];
     input_ = AddInput(input);
     for (int i = 0; i < num_outputs; ++i) {
       outputs_.push_back(AddOutput(input.type));
     }
     SetBuiltinOp(BuiltinOperator_UNPACK, BuiltinOptions_UnpackOptions,
-                 CreatePackOptions(builder_, num_outputs, axis).Union());
+                 CreateUnpackOptions(builder_, num_outputs, axis).Union());
     BuildInterpreter({GetShape(input_)});
   }
 
@@ -104,6 +106,44 @@ TEST(UnpackOpTest, FloatThreeOutputsAxisOne) {
   EXPECT_THAT(output_datas[1], ElementsAre(2, 4, 6));
 }
 
+TEST(UnpackOpTest, FloatThreeOutputsNegativeAxisOne) {
+  UnpackOpModel<float> model({TensorType_FLOAT32, {3, 2}}, -1);
+  model.SetInput({1, 2, 3, 4, 5, 6});
+  model.Invoke();
+
+  // Check outputs shapes.
+  const std::vector<std::vector<int>>& output_shapes = model.GetOutputShapes();
+  EXPECT_EQ(output_shapes.size(), 2);
+  EXPECT_THAT(output_shapes[0], ElementsAre(3));
+  EXPECT_THAT(output_shapes[1], ElementsAre(3));
+
+  // Check outputs values.
+  const std::vector<std::vector<float>>& output_datas = model.GetOutputDatas();
+  EXPECT_EQ(output_datas.size(), 2);
+  EXPECT_THAT(output_datas[0], ElementsAre(1, 3, 5));
+  EXPECT_THAT(output_datas[1], ElementsAre(2, 4, 6));
+}
+
+TEST(UnpackOpTest, FloatThreeOutputsNegativeAxisTwo) {
+  UnpackOpModel<float> model({TensorType_FLOAT32, {3, 2}}, -2);
+  model.SetInput({1, 2, 3, 4, 5, 6});
+  model.Invoke();
+
+  // Check outputs shapes.
+  const std::vector<std::vector<int>>& output_shapes = model.GetOutputShapes();
+  EXPECT_EQ(output_shapes.size(), 3);
+  EXPECT_THAT(output_shapes[0], ElementsAre(2));
+  EXPECT_THAT(output_shapes[1], ElementsAre(2));
+  EXPECT_THAT(output_shapes[2], ElementsAre(2));
+
+  // Check outputs values.
+  const std::vector<std::vector<float>>& output_datas = model.GetOutputDatas();
+  EXPECT_EQ(output_datas.size(), 3);
+  EXPECT_THAT(output_datas[0], ElementsAre(1, 2));
+  EXPECT_THAT(output_datas[1], ElementsAre(3, 4));
+  EXPECT_THAT(output_datas[2], ElementsAre(5, 6));
+}
+
 TEST(UnpackOpTest, FloatOneOutput) {
   UnpackOpModel<float> model({TensorType_FLOAT32, {1, 6}}, 0);
   model.SetInput({1, 2, 3, 4, 5, 6});
diff --git a/tensorflow/lite/kernels/where.cc b/tensorflow/lite/kernels/where.cc
new file mode 100644
index 0000000000000000000000000000000000000000..96ee36f08bc0144ce0e4a66d3d7350a791d26d86
--- /dev/null
+++ b/tensorflow/lite/kernels/where.cc
@@ -0,0 +1,105 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace where {
+
+constexpr int kInputConditionTensor = 0;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus ResizeOutputTensor(TfLiteContext* context,
+                                const TfLiteTensor* cond_tensor,
+                                TfLiteTensor* output_tensor) {
+  // Output tensor should have shape:
+  // (num_true, cond_rank), where num_true denotes the number of true values
+  // in condition.
+  const RuntimeShape& cond_shape = GetTensorShape(cond_tensor);
+  const int size = cond_shape.FlatSize();
+  const int cond_rank = cond_shape.DimensionsCount();
+  const bool* cond_data = GetTensorData<bool>(cond_tensor);
+
+  int true_count = 0;
+  for (int i = 0; i < size; ++i) {
+    if (cond_data[i]) {
+      true_count++;
+    }
+  }
+  TfLiteIntArray* output_dims = TfLiteIntArrayCreate(2);
+  output_dims->data[0] = true_count;
+  output_dims->data[1] = cond_rank;
+  return context->ResizeTensor(context, output_tensor, output_dims);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  const TfLiteTensor* cond_tensor =
+      GetInput(context, node, kInputConditionTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  if (cond_tensor->type != kTfLiteBool) {
+    context->ReportError(context,
+                         "Condition tensor must be of type bool, but saw '%s'.",
+                         TfLiteTypeGetName(cond_tensor->type));
+    return kTfLiteError;
+  }
+
+  // As output will be a 2D tensor of indices, we use int32 as data type.
+  output->type = kTfLiteInt32;
+
+  // Exit early if cond is a non-const tensor. Set output tensor to dynamic so
+  // output size can be determined in Eval.
+  if (!IsConstantTensor(cond_tensor)) {
+    SetTensorToDynamic(output);
+    return kTfLiteOk;
+  }
+  return ResizeOutputTensor(context, cond_tensor, output);
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* cond_tensor =
+      GetInput(context, node, kInputConditionTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  if (IsDynamicTensor(output)) {
+    TF_LITE_ENSURE_OK(context,
+                      ResizeOutputTensor(context, cond_tensor, output));
+  }
+
+  reference_ops::SelectTrueCoords(GetTensorShape(cond_tensor),
+                                  GetTensorData<bool>(cond_tensor),
+                                  GetTensorData<int32_t>(output));
+  return kTfLiteOk;
+}
+}  // namespace where
+
+TfLiteRegistration* Register_WHERE() {
+  static TfLiteRegistration r = {/*init*/ nullptr, /*free*/ nullptr,
+                                 where::Prepare, where::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/where_test.cc b/tensorflow/lite/kernels/where_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..89bd7c43646f80f8b0adb4ef4026f1d9bc7b43c4
--- /dev/null
+++ b/tensorflow/lite/kernels/where_test.cc
@@ -0,0 +1,161 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class BaseWhereOpModel : public SingleOpModel {
+ public:
+  BaseWhereOpModel(const TensorData& input, const TensorData& output) {
+    input_ = AddInput(input);
+    output_ = AddOutput(output);
+    SetBuiltinOp(BuiltinOperator_WHERE, BuiltinOptions_WhereOptions,
+                 CreateWhereOptions(builder_).Union());
+    BuildInterpreter({GetShape(input_)});
+  }
+
+  int input() { return input_; }
+
+ protected:
+  int input_;
+  int output_;
+};
+
+class IntegerWhereOpModel : public BaseWhereOpModel {
+ public:
+  using BaseWhereOpModel::BaseWhereOpModel;
+
+  std::vector<int32_t> GetOutput() { return ExtractVector<int32_t>(output_); }
+};
+
+TEST(WhereOpTest, SelectFromVectorNoResult) {
+  IntegerWhereOpModel m({TensorType_BOOL, {3}}, {TensorType_INT32, {}});
+  m.PopulateTensor<bool>(m.input(), {false, false, false});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput().size(), 0);
+}
+
+TEST(WhereOpTest, SelectFromVector) {
+  IntegerWhereOpModel m({TensorType_BOOL, {3}}, {TensorType_INT32, {}});
+  m.PopulateTensor<bool>(m.input(), {true, false, true});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 2}));
+}
+
+TEST(WhereOpTest, SelectFromMatrixNoResult) {
+  IntegerWhereOpModel m({TensorType_BOOL, {3, 3}}, {TensorType_INT32, {}});
+  m.PopulateTensor<bool>(m.input(), {false, false, false,  //
+                                     false, false, false,  //
+                                     false, false, false});
+  m.Invoke();
+  EXPECT_EQ(m.GetOutput().size(), 0);
+}
+
+TEST(WhereOpTest, SelectFromMatrix1) {
+  IntegerWhereOpModel m({TensorType_BOOL, {3, 1}}, {TensorType_INT32, {}});
+  m.PopulateTensor<bool>(m.input(), {true, false, true});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0,  //
+                                               2, 0}));
+}
+
+TEST(WhereOpTest, SelectFromMatrix2) {
+  IntegerWhereOpModel m({TensorType_BOOL, {3, 3}}, {TensorType_INT32, {}});
+  m.PopulateTensor<bool>(m.input(), {true, true, false,   //
+                                     true, false, false,  //
+                                     true, false, true});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0,  //
+                                               0, 1,  //
+                                               1, 0,  //
+                                               2, 0,  //
+                                               2, 2}));
+}
+
+TEST(WhereOpTest, SelectFromMatrix3) {
+  IntegerWhereOpModel m({TensorType_BOOL, {3, 5}}, {TensorType_INT32, {}});
+  m.PopulateTensor<bool>(m.input(), {true, false, false, true, true,   //
+                                     false, true, true, false, false,  //
+                                     true, false, true, false, false});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0,  //
+                                               0, 3,  //
+                                               0, 4,  //
+                                               1, 1,  //
+                                               1, 2,  //
+                                               2, 0,  //
+                                               2, 2}));
+}
+
+TEST(WhereOpTest, SelectFromRank3TensorNoResult) {
+  IntegerWhereOpModel m({TensorType_BOOL, {2, 2, 2}}, {TensorType_INT32, {}});
+  m.PopulateTensor<bool>(m.input(), {false, false, false, false,  //
+                                     false, false, false, false});
+  m.Invoke();
+  EXPECT_EQ(m.GetOutput().size(), 0);
+}
+
+TEST(WhereOpTest, SelectFromRank3Tensor1) {
+  IntegerWhereOpModel m({TensorType_BOOL, {2, 1, 3}}, {TensorType_INT32, {}});
+  m.PopulateTensor<bool>(m.input(), {true, false, true,  //
+                                     false, false, true});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0, 0,  //
+                                               0, 0, 2,  //
+                                               1, 0, 2}));
+}
+
+TEST(WhereOpTest, SelectFromRank3Tensor2) {
+  IntegerWhereOpModel m({TensorType_BOOL, {2, 2, 2}}, {TensorType_INT32, {}});
+  m.PopulateTensor<bool>(m.input(), {true, true, false, true,  //
+                                     false, false, true, true});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0, 0,  //
+                                               0, 0, 1,  //
+                                               0, 1, 1,  //
+                                               1, 1, 0,  //
+                                               1, 1, 1}));
+}
+
+TEST(WhereOpTest, SelectFromRank3Tensor3) {
+  IntegerWhereOpModel m({TensorType_BOOL, {2, 3, 2}}, {TensorType_INT32, {}});
+  m.PopulateTensor<bool>(m.input(), {true, true, false, true, false, false,  //
+                                     false, false, true, false, true, true});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0, 0,  //
+                                               0, 0, 1,  //
+                                               0, 1, 1,  //
+                                               1, 1, 0,  //
+                                               1, 2, 0,  //
+                                               1, 2, 1}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/kernels/while.cc b/tensorflow/lite/kernels/while.cc
new file mode 100644
index 0000000000000000000000000000000000000000..07a48e67506978cea9a91f99c20b638de7ffbab9
--- /dev/null
+++ b/tensorflow/lite/kernels/while.cc
@@ -0,0 +1,312 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/context_util.h"
+#include "tensorflow/lite/core/subgraph.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+
+namespace tflite {
+namespace ops {
+namespace custom {
+namespace while_kernel {
+
+namespace {
+
+// Propagate tensor shapes and types from `src_tensor_indices` in `src_subgraph`
+// to `dst_tensor_indices` in `dst_subgraph`.
+template <typename SrcVector, typename DstVector>
+TfLiteStatus CopyTensorsShapeAndType(TfLiteContext* context,
+                                     Subgraph* src_subgraph,
+                                     const SrcVector& src_tensor_indices,
+                                     Subgraph* dst_subgraph,
+                                     const DstVector& dst_tensor_indices) {
+  TF_LITE_ENSURE_EQ(context, src_tensor_indices.size(),
+                    dst_tensor_indices.size());
+  for (int i = 0; i < src_tensor_indices.size(); ++i) {
+    const TfLiteTensor* src_tensor =
+        src_subgraph->tensor(src_tensor_indices[i]);
+    std::vector<int> dims(src_tensor->dims->data,
+                          src_tensor->dims->data + src_tensor->dims->size);
+    dst_subgraph->ResizeInputTensor(dst_tensor_indices[i], dims);
+    TfLiteTensor* dst_tensor = dst_subgraph->tensor(dst_tensor_indices[i]);
+    dst_tensor->type = src_tensor->type;
+  }
+  return kTfLiteOk;
+}
+
+// Copy the tensors data from tensors `src_tensor_indices` in `src_subgraph`
+// to `dst_tensor_indices` in `dst_subgraph`.
+template <typename SrcVector, typename DstVector>
+TfLiteStatus CopyTensorsData(TfLiteContext* context, Subgraph* src_subgraph,
+                             const SrcVector& src_tensor_indices,
+                             Subgraph* dst_subgraph,
+                             const DstVector& dst_tensor_indices) {
+  TF_LITE_ENSURE_EQ(context, src_tensor_indices.size(),
+                    dst_tensor_indices.size());
+  for (int i = 0; i < src_tensor_indices.size(); ++i) {
+    const TfLiteTensor* src_tensor =
+        src_subgraph->tensor(src_tensor_indices[i]);
+    TfLiteTensor* dst_tensor = dst_subgraph->tensor(dst_tensor_indices[i]);
+    TF_LITE_ENSURE_EQ(context, src_tensor->bytes, dst_tensor->bytes);
+    memcpy(dst_tensor->data.raw, src_tensor->data.raw, src_tensor->bytes);
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus CheckCondOutput(TfLiteContext* context,
+                             const TfLiteTensor* cond_output) {
+  // The condition output must be a single boolean value.
+  TF_LITE_ENSURE_EQ(context, cond_output->type, kTfLiteBool);
+  if (cond_output->dims->size == 0) {
+    // It's okay if it's a 0D scalar.
+    return kTfLiteOk;
+  }
+  // Otherwise it must be 1D with shape [1].
+  TF_LITE_ENSURE_EQ(context, cond_output->dims->size, 1);
+  TF_LITE_ENSURE_EQ(context, cond_output->dims->data[0], 1);
+  return kTfLiteOk;
+}
+
+}  // namespace
+
+struct OpData {
+  int cond_subgraph_index;
+  int body_subgraph_index;
+  bool cond_has_dynamic_output_tensors;
+  bool body_has_dynamic_output_tensors;
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* op_data = new OpData;
+  const uint8_t* buffer_t = reinterpret_cast<const uint8_t*>(buffer);
+  const flexbuffers::Map& m = flexbuffers::GetRoot(buffer_t, length).AsMap();
+  op_data->cond_subgraph_index = m["cond_subgraph_index"].AsInt32();
+  op_data->body_subgraph_index = m["body_subgraph_index"].AsInt32();
+  op_data->cond_has_dynamic_output_tensors = false;
+  op_data->body_has_dynamic_output_tensors = false;
+  return op_data;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<OpData*>(buffer);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
+  int num_inputs = node->inputs->size;
+  // The number of outputs should be the same as number of inputs.
+  TF_LITE_ENSURE_EQ(context, node->outputs->size, num_inputs);
+
+  // Check subgraph indices and get subgraphs.
+  Subgraph* this_subgraph = reinterpret_cast<Subgraph*>(context->impl_);
+  auto* subgraphs = this_subgraph->GetSubgraphs();
+  TF_LITE_ENSURE(context, op_data->cond_subgraph_index < subgraphs->size());
+  TF_LITE_ENSURE(context, op_data->body_subgraph_index < subgraphs->size());
+
+  Subgraph* cond_subgraph = (*subgraphs)[op_data->cond_subgraph_index].get();
+  Subgraph* body_subgraph = (*subgraphs)[op_data->body_subgraph_index].get();
+
+  // Check input & output count of the condition subgraph.
+  TF_LITE_ENSURE_EQ(context, cond_subgraph->inputs().size(), num_inputs);
+  TF_LITE_ENSURE_EQ(context, cond_subgraph->outputs().size(), 1);
+
+  // Check input & output count of the body subgraph.
+  TF_LITE_ENSURE_EQ(context, body_subgraph->inputs().size(), num_inputs);
+  TF_LITE_ENSURE_EQ(context, body_subgraph->outputs().size(), num_inputs);
+
+  // Prepare and check the condition subgraph.
+  TF_LITE_ENSURE_OK(
+      context, CopyTensorsShapeAndType(context, this_subgraph,
+                                       TfLiteIntArrayView(node->inputs),
+                                       cond_subgraph, cond_subgraph->inputs()));
+  TF_LITE_ENSURE_OK(context, cond_subgraph->AllocateTensors());
+  TfLiteTensor* cond_output =
+      cond_subgraph->tensor(cond_subgraph->outputs()[0]);
+  // TODO(ycling): Handle the case the cond subgraph has dynamic tensor outputs.
+  // This should rarely happens. In most cases the output is static with shape
+  // [1]. However theoretically intermediate tensors in the cond subgraph
+  // can be dynamic.
+  if (IsDynamicTensor(cond_output)) {
+    op_data->cond_has_dynamic_output_tensors = true;
+  } else {
+    TF_LITE_ENSURE_STATUS(CheckCondOutput(context, cond_output));
+  }
+
+  // Prepare and check the body subgraph.
+  TF_LITE_ENSURE_OK(
+      context, CopyTensorsShapeAndType(context, this_subgraph,
+                                       TfLiteIntArrayView(node->inputs),
+                                       body_subgraph, body_subgraph->inputs()));
+  TF_LITE_ENSURE_OK(context, body_subgraph->AllocateTensors());
+  if (body_subgraph->HasDynamicTensors()) {
+    op_data->body_has_dynamic_output_tensors = true;
+  } else {
+    for (int i = 0; i < num_inputs; ++i) {
+      TfLiteTensor* body_input =
+          body_subgraph->tensor(body_subgraph->inputs()[i]);
+      TfLiteTensor* body_output =
+          body_subgraph->tensor(body_subgraph->outputs()[i]);
+      TF_LITE_ENSURE_EQ(context, body_input->type, body_output->type);
+
+      // TODO(ycling): Support dynamic sized body subgraph.
+      TF_LITE_ENSURE(context, !IsDynamicTensor(body_output));
+      if (!TfLiteIntArrayEqual(body_input->dims, body_output->dims)) {
+        // If the output shape of the body subgraph is static w.r.t. a fixed
+        // input size, but it's different from input size, it's still considered
+        // dynamic. For example: If a subgraph keeps padding its input with a
+        // fixed padding, the output shape is static w.r.t the input shape and
+        // padding, but running it in a loop will keep bloating the tensor.
+        op_data->body_has_dynamic_output_tensors = true;
+        break;
+      }
+    }
+  }
+  for (int i = 0; i < num_inputs; ++i) {
+    TfLiteTensor* output = GetOutput(context, node, i);
+    if (op_data->body_has_dynamic_output_tensors) {
+      SetTensorToDynamic(output);
+    } else {
+      TfLiteTensor* body_output =
+          body_subgraph->tensor(body_subgraph->outputs()[i]);
+      TfLiteIntArray* output_size = TfLiteIntArrayCopy(body_output->dims);
+      TF_LITE_ENSURE_OK(context,
+                        context->ResizeTensor(context, output, output_size));
+    }
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
+  Subgraph* this_subgraph = reinterpret_cast<Subgraph*>(context->impl_);
+  auto* subgraphs = this_subgraph->GetSubgraphs();
+  Subgraph* cond_subgraph = (*subgraphs)[op_data->cond_subgraph_index].get();
+  Subgraph* body_subgraph = (*subgraphs)[op_data->body_subgraph_index].get();
+
+  // The follow graph illustrates the current implementation.
+  //
+  // This Subgraph          Cond Subgraph         Body Subgraph
+  // +-----------+   (1)   +------------+   (3)   +------------+
+  // |   WHILE   |-------->|  SUBGRAPH  |-------->|  SUBGRAPH  |
+  // |   INPUT   |        /|   INPUT    |<-----   |   INPUT    |
+  // +-----------+       / +------------+      \  +------------+
+  //                    /        |              \       |
+  //               (6) /         | (2)       (5) \      | (4)
+  //                  /          v                \     v
+  // +-----------+   /     +------------+         +------------+
+  // |   WHILE   |<--      |  SUBGRAPH  |         |  SUBGRAPH  |
+  // |   OUTPUT  |         |   OUTPUT   |         |   OUTPUT   |
+  // +-----------+         +------------+         +------------+
+  //
+  // (1) Copy the inputs of WHILE op to the inputs of condition subgraph.
+  // (2) Invoke condition subgraph.
+  //     Jump to step 5 if result is false.
+  // (3) Copy the inputs of condition subgraph to the inputs of body subgraph.
+  // (4) Invoke body subgraph.
+  // (5) Copy the outputs of body subgraph to the inputs condition subgraph.
+  //     Jump back to step 2!
+  // (6) Copy the inputs of condition subgraph to the outputs of WHILE op.
+  //
+  // If the body subgraph has dynamic sized outputs, it's required to resize the
+  // tensor before copying in step 1, 3, 4 and 6.
+  //
+  // Note the flow is carefully designed to handle the dynamic sized output
+  // case. The loop invariant is: The newest value is in the inputs of condition
+  // subgraph. This is always true before step 2.
+  //
+  // This is the best we can do without sharing tensor buffer across subgraph
+  // boundry. Currently we copy the input / output between the subgraphs. This
+  // isn't optimized yet and a lot of redundent copies are made.
+  // TODO(b/120234921): Optimize and avoid copying tensors between subgraphs.
+  TF_LITE_ENSURE_OK(
+      context,
+      CopyTensorsData(context, this_subgraph, TfLiteIntArrayView(node->inputs),
+                      cond_subgraph, cond_subgraph->inputs()));
+
+  while (true) {
+    TF_LITE_ENSURE_OK(context, cond_subgraph->Invoke());
+    int cond_subgraph_output_index = cond_subgraph->outputs()[0];
+    cond_subgraph->EnsureTensorDataIsReadable(cond_subgraph_output_index);
+    TfLiteTensor* cond_output =
+        cond_subgraph->tensor(cond_subgraph_output_index);
+    if (op_data->cond_has_dynamic_output_tensors) {
+      TF_LITE_ENSURE_STATUS(CheckCondOutput(context, cond_output));
+    }
+
+    if (!cond_output->data.b[0]) {
+      break;
+    }
+    if (op_data->body_has_dynamic_output_tensors) {
+      TF_LITE_ENSURE_OK(context,
+                        CopyTensorsShapeAndType(
+                            context, cond_subgraph, cond_subgraph->inputs(),
+                            body_subgraph, body_subgraph->inputs()));
+      TF_LITE_ENSURE_OK(context, body_subgraph->AllocateTensors());
+    }
+
+    TF_LITE_ENSURE_OK(
+        context,
+        CopyTensorsData(context, cond_subgraph, cond_subgraph->inputs(),
+                        body_subgraph, body_subgraph->inputs()));
+
+    TF_LITE_ENSURE_OK(context, body_subgraph->Invoke());
+
+    for (int tensor_index : body_subgraph->outputs()) {
+      body_subgraph->EnsureTensorDataIsReadable(tensor_index);
+    }
+
+    if (op_data->body_has_dynamic_output_tensors) {
+      TF_LITE_ENSURE_OK(context,
+                        CopyTensorsShapeAndType(
+                            context, body_subgraph, body_subgraph->outputs(),
+                            cond_subgraph, cond_subgraph->inputs()));
+      TF_LITE_ENSURE_OK(context, cond_subgraph->AllocateTensors());
+    }
+
+    TF_LITE_ENSURE_OK(
+        context,
+        CopyTensorsData(context, body_subgraph, body_subgraph->outputs(),
+                        cond_subgraph, cond_subgraph->inputs()));
+  }
+
+  // Note that copying from body's output will fail if body is never invoked.
+  // TODO(b/120234921): Optimize and avoid copying tensors between subgraphs.
+  if (op_data->body_has_dynamic_output_tensors) {
+    TF_LITE_ENSURE_OK(
+        context, CopyTensorsShapeAndType(context, cond_subgraph,
+                                         cond_subgraph->inputs(), this_subgraph,
+                                         TfLiteIntArrayView(node->outputs)));
+  }
+
+  TF_LITE_ENSURE_OK(
+      context,
+      CopyTensorsData(context, cond_subgraph, cond_subgraph->inputs(),
+                      this_subgraph, TfLiteIntArrayView(node->outputs)));
+  return kTfLiteOk;
+}
+
+}  // namespace while_kernel
+
+TfLiteRegistration* Register_WHILE() {
+  static TfLiteRegistration r = {while_kernel::Init, while_kernel::Free,
+                                 while_kernel::Prepare, while_kernel::Eval};
+  return &r;
+}
+
+}  // namespace custom
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/while_test.cc b/tensorflow/lite/kernels/while_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9946b4a3280116d7cb176f54b94b73bb956a5f71
--- /dev/null
+++ b/tensorflow/lite/kernels/while_test.cc
@@ -0,0 +1,94 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/subgraph_test_util.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+
+using subgraph_test_util::CheckIntTensor;
+using subgraph_test_util::ControlFlowOpTest;
+using subgraph_test_util::FillIntTensor;
+
+namespace {
+
+class WhileTest : public ControlFlowOpTest {};
+
+// The test builds a model that produces the i-th number of
+// triangular number sequence.
+//
+// TODO(ycling): Consider to improve this test case by adding a
+// concat into the body subgraph.
+TEST_F(WhileTest, TestTriangularNumberSequence) {
+  const std::vector<int> expected = {1, 3, 6, 10, 15, 21, 28};
+  for (int i = 0; i < expected.size(); ++i) {
+    interpreter_.reset(new Interpreter);
+    interpreter_->AddSubgraphs(2);
+    builder_->BuildLessEqualCondSubgraph(interpreter_->subgraph(1), i);
+    builder_->BuildAccumulateLoopBodySubgraph(interpreter_->subgraph(2));
+    builder_->BuildWhileSubgraph(&interpreter_->primary_subgraph());
+
+    interpreter_->ResizeInputTensor(interpreter_->inputs()[0], {1});
+    interpreter_->ResizeInputTensor(interpreter_->inputs()[1], {1});
+    ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk);
+    FillIntTensor(interpreter_->tensor(interpreter_->inputs()[0]), {1});
+    FillIntTensor(interpreter_->tensor(interpreter_->inputs()[1]), {1});
+
+    ASSERT_EQ(interpreter_->Invoke(), kTfLiteOk);
+    TfLiteTensor* output1 = interpreter_->tensor(interpreter_->outputs()[0]);
+    CheckIntTensor(output1, {1}, {i + 1});
+    TfLiteTensor* output2 = interpreter_->tensor(interpreter_->outputs()[1]);
+    CheckIntTensor(output2, {1}, {expected[i]});
+  }
+}
+
+// This requires dynamic sized subgraphs and it's not supported right now.
+// TODO(ycling): Support dynamic sized subgraphs.
+TEST_F(WhileTest, TestPadLoop) {
+  interpreter_.reset(new Interpreter);
+  interpreter_->AddSubgraphs(2);
+  builder_->BuildLessEqualCondSubgraph(interpreter_->subgraph(1), 3);
+  builder_->BuildPadLoopBodySubgraph(interpreter_->subgraph(2), {1, 2});
+  builder_->BuildWhileSubgraph(&interpreter_->primary_subgraph());
+
+  interpreter_->ResizeInputTensor(interpreter_->inputs()[0], {1});
+  interpreter_->ResizeInputTensor(interpreter_->inputs()[1], {2});
+  // This is not supported yet. The test ensures thatit doesn't crash and raises
+  // an error properly.
+  ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk);
+
+  FillIntTensor(interpreter_->tensor(interpreter_->inputs()[0]), {1});
+  FillIntTensor(interpreter_->tensor(interpreter_->inputs()[1]), {5, 7});
+
+  ASSERT_EQ(interpreter_->Invoke(), kTfLiteOk);
+  TfLiteTensor* output1 = interpreter_->tensor(interpreter_->outputs()[0]);
+  CheckIntTensor(output1, {1}, {4});
+  TfLiteTensor* output2 = interpreter_->tensor(interpreter_->outputs()[1]);
+  CheckIntTensor(output2, {11}, {0, 0, 0, 5, 7, 0, 0, 0, 0, 0, 0});
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/minimal_logging.cc b/tensorflow/lite/minimal_logging.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8768ef6e312ec80c9b3653983421e07c662f8e5e
--- /dev/null
+++ b/tensorflow/lite/minimal_logging.cc
@@ -0,0 +1,44 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/minimal_logging.h"
+
+#include <cstdarg>
+
+namespace tflite {
+namespace logging_internal {
+
+void MinimalLogger::Log(LogSeverity severity, const char* format, ...) {
+  va_list args;
+  va_start(args, format);
+  VLog(severity, format, args);
+  va_end(args);
+}
+
+const char* MinimalLogger::GetSeverityName(LogSeverity severity) {
+  switch (severity) {
+    case TFLITE_LOG_INFO:
+      return "INFO";
+    case TFLITE_LOG_WARNING:
+      return "WARNING";
+    case TFLITE_LOG_ERROR:
+      return "ERROR";
+    default:
+      return "<Unknown severity>";
+  }
+}
+
+}  // namespace logging_internal
+}  // namespace tflite
diff --git a/tensorflow/lite/minimal_logging.h b/tensorflow/lite/minimal_logging.h
new file mode 100644
index 0000000000000000000000000000000000000000..7682ed8edc401762613a9cae582fdf1e16e61f51
--- /dev/null
+++ b/tensorflow/lite/minimal_logging.h
@@ -0,0 +1,56 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_MINIMAL_LOGGING_H_
+#define TENSORFLOW_LITE_MINIMAL_LOGGING_H_
+
+#include <cstdarg>
+
+namespace tflite {
+
+enum LogSeverity {
+  TFLITE_LOG_INFO = 0,
+  TFLITE_LOG_WARNING = 1,
+  TFLITE_LOG_ERROR = 2,
+};
+
+namespace logging_internal {
+
+// Helper class for simple platform-specific console logging. Note that we
+// explicitly avoid the convenience of ostream-style logging to minimize binary
+// size impact.
+class MinimalLogger {
+ public:
+  // Logging hook that takes variadic args.
+  static void Log(LogSeverity severity, const char* format, ...);
+
+  // Logging hook that takes a formatted va_list.
+  static void VLog(LogSeverity severity, const char* format, va_list args);
+
+ private:
+  static const char* GetSeverityName(LogSeverity severity);
+};
+
+}  // namespace logging_internal
+}  // namespace tflite
+
+// Convenience macro for basic internal logging in production builds.
+// Note: This should never be used for debug-type logs, as it will *not* be
+// stripped in release optimized builds. In general, prefer the error reporting
+// APIs for developer-facing errors, and only use this for diagnostic output
+// that should always be logged in user builds.
+#define TFLITE_LOG_PROD(severity, format, ...) \
+  tflite::logging_internal::MinimalLogger::Log(severity, format, ##__VA_ARGS__);
+
+#endif  // TENSORFLOW_LITE_MINIMAL_LOGGING_H_
diff --git a/tensorflow/lite/minimal_logging_android.cc b/tensorflow/lite/minimal_logging_android.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f87e6fa18e1dac7e2b4e093f6d4e91a1e652ba3d
--- /dev/null
+++ b/tensorflow/lite/minimal_logging_android.cc
@@ -0,0 +1,55 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/minimal_logging.h"
+
+#include <android/log.h>
+#include <cstdio>
+
+namespace tflite {
+namespace logging_internal {
+namespace {
+
+int GetPlatformSeverity(LogSeverity severity) {
+  switch (severity) {
+    case TFLITE_LOG_INFO:
+      return ANDROID_LOG_INFO;
+    case TFLITE_LOG_WARNING:
+      return ANDROID_LOG_WARN;
+    case TFLITE_LOG_ERROR:
+      return ANDROID_LOG_ERROR;
+    default:
+      return ANDROID_LOG_DEBUG;
+  }
+}
+
+}  // namespace
+
+void MinimalLogger::VLog(LogSeverity severity, const char* format,
+                         va_list args) {
+  // First log to Android's explicit log(cat) API.
+  va_list args_for_android_log;
+  va_copy(args_for_android_log, args);
+  __android_log_vprint(GetPlatformSeverity(severity), "tflite", format, args);
+  va_end(args_for_android_log);
+
+  // Also print to stderr for standard console applications.
+  fprintf(stderr, "%s: ", GetSeverityName(severity));
+  vfprintf(stderr, format, args);
+  fputc('\n', stderr);
+}
+
+}  // namespace logging_internal
+}  // namespace tflite
diff --git a/tensorflow/lite/minimal_logging_default.cc b/tensorflow/lite/minimal_logging_default.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9fa13e47e63a01b5c15ada3b09498fdb755f6376
--- /dev/null
+++ b/tensorflow/lite/minimal_logging_default.cc
@@ -0,0 +1,31 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/minimal_logging.h"
+
+#include <cstdio>
+
+namespace tflite {
+namespace logging_internal {
+
+void MinimalLogger::VLog(LogSeverity severity, const char* format,
+                         va_list args) {
+  fprintf(stderr, "%s: ", GetSeverityName(severity));
+  vfprintf(stderr, format, args);
+  fputc('\n', stderr);
+}
+
+}  // namespace logging_internal
+}  // namespace tflite
diff --git a/tensorflow/lite/minimal_logging_ios.cc b/tensorflow/lite/minimal_logging_ios.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a774682a5b42f71d0cc77c49bbcf9a4ec6ef21b7
--- /dev/null
+++ b/tensorflow/lite/minimal_logging_ios.cc
@@ -0,0 +1,47 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/minimal_logging.h"
+
+#include <syslog.h>
+#include <cstdarg>
+
+namespace tflite {
+namespace logging_internal {
+namespace {
+
+int GetPlatformSeverity(LogSeverity severity) {
+  switch (severity) {
+    case TFLITE_LOG_INFO:
+      return LOG_INFO;
+    case TFLITE_LOG_WARNING:
+      return LOG_WARNING;
+    case TFLITE_LOG_ERROR:
+      return LOG_ERR;
+    default:
+      return LOG_DEBUG;
+  }
+}
+
+}  // namespace
+
+void MinimalLogger::VLog(LogSeverity severity, const char* format,
+                         va_list args) {
+  // TODO(b/123704468): Use os_log when available.
+  vsyslog(GetPlatformSeverity(severity), format, args);
+}
+
+}  // namespace logging_internal
+}  // namespace tflite
diff --git a/tensorflow/lite/minimal_logging_test.cc b/tensorflow/lite/minimal_logging_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e59425a2b264b72d44477c6484fc0ffea014a750
--- /dev/null
+++ b/tensorflow/lite/minimal_logging_test.cc
@@ -0,0 +1,60 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/minimal_logging.h"
+
+#include <string>
+
+#include <gtest/gtest.h>
+
+namespace tflite {
+
+TEST(MinimalLogging, Basic) {
+  testing::internal::CaptureStderr();
+  TFLITE_LOG_PROD(TFLITE_LOG_INFO, "Foo");
+  EXPECT_EQ("INFO: Foo\n", testing::internal::GetCapturedStderr());
+}
+
+TEST(MinimalLogging, BasicFormatted) {
+  testing::internal::CaptureStderr();
+  TFLITE_LOG_PROD(TFLITE_LOG_INFO, "Foo %s %s", "Bar", "Baz");
+  EXPECT_EQ("INFO: Foo Bar Baz\n", testing::internal::GetCapturedStderr());
+}
+
+TEST(MinimalLogging, Warn) {
+  testing::internal::CaptureStderr();
+  TFLITE_LOG_PROD(TFLITE_LOG_WARNING, "One", "");
+  EXPECT_EQ("WARNING: One\n", testing::internal::GetCapturedStderr());
+}
+
+TEST(MinimalLogging, Error) {
+  testing::internal::CaptureStderr();
+  TFLITE_LOG_PROD(TFLITE_LOG_ERROR, "Two");
+  EXPECT_EQ("ERROR: Two\n", testing::internal::GetCapturedStderr());
+}
+
+TEST(MinimalLogging, UnknownSeverity) {
+  testing::internal::CaptureStderr();
+  TFLITE_LOG_PROD(static_cast<LogSeverity>(-1), "Three");
+  EXPECT_EQ("<Unknown severity>: Three\n",
+            testing::internal::GetCapturedStderr());
+}
+
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/model.cc b/tensorflow/lite/model.cc
index 5ac0532afeffc0801a207c385be9816fa459b416..e333138fb78e81316e29a7c37b1fba2df7b1408a 100644
--- a/tensorflow/lite/model.cc
+++ b/tensorflow/lite/model.cc
@@ -21,6 +21,7 @@ limitations under the License.
 
 #include "tensorflow/lite/allocation.h"
 #include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/core/api/error_reporter.h"
 #include "tensorflow/lite/core/api/flatbuffer_conversions.h"
 #include "tensorflow/lite/model.h"
@@ -85,48 +86,79 @@ std::unique_ptr<FlatBufferModel> FlatBufferModel::BuildFromFile(
   std::unique_ptr<FlatBufferModel> model;
   auto allocation = GetAllocationFromFile(filename, /*mmap_file=*/true,
                                           error_reporter, /*use_nnapi=*/true);
-  model.reset(new FlatBufferModel(allocation.release(), error_reporter));
+  model.reset(new FlatBufferModel(std::move(allocation), error_reporter));
   if (!model->initialized()) model.reset();
   return model;
 }
 
 std::unique_ptr<FlatBufferModel> FlatBufferModel::VerifyAndBuildFromFile(
-    const char* filename, TfLiteVerifier* verifier,
+    const char* filename, TfLiteVerifier* extra_verifier,
     ErrorReporter* error_reporter) {
   error_reporter = ValidateErrorReporter(error_reporter);
 
   std::unique_ptr<FlatBufferModel> model;
   auto allocation = GetAllocationFromFile(filename, /*mmap_file=*/true,
                                           error_reporter, /*use_nnapi=*/true);
-  if (verifier &&
-      !verifier->Verify(static_cast<const char*>(allocation->base()),
-                        allocation->bytes(), error_reporter)) {
+
+  flatbuffers::Verifier base_verifier(
+      reinterpret_cast<const uint8_t*>(allocation->base()),
+      allocation->bytes());
+  if (!VerifyModelBuffer(base_verifier)) {
+    error_reporter->Report("The model is not a valid Flatbuffer file");
+    return nullptr;
+  }
+
+  if (extra_verifier &&
+      !extra_verifier->Verify(static_cast<const char*>(allocation->base()),
+                              allocation->bytes(), error_reporter)) {
     return model;
   }
-  model.reset(new FlatBufferModel(allocation.release(), error_reporter));
+  model.reset(new FlatBufferModel(std::move(allocation), error_reporter));
   if (!model->initialized()) model.reset();
   return model;
 }
 #endif
 
 std::unique_ptr<FlatBufferModel> FlatBufferModel::BuildFromBuffer(
-    const char* buffer, size_t buffer_size, ErrorReporter* error_reporter) {
+    const char* caller_owned_buffer, size_t buffer_size,
+    ErrorReporter* error_reporter) {
   error_reporter = ValidateErrorReporter(error_reporter);
 
   std::unique_ptr<FlatBufferModel> model;
-  Allocation* allocation =
-      new MemoryAllocation(buffer, buffer_size, error_reporter);
-  model.reset(new FlatBufferModel(allocation, error_reporter));
+  std::unique_ptr<Allocation> allocation(
+      new MemoryAllocation(caller_owned_buffer, buffer_size, error_reporter));
+  model.reset(new FlatBufferModel(std::move(allocation), error_reporter));
   if (!model->initialized()) model.reset();
   return model;
 }
 
+std::unique_ptr<FlatBufferModel> FlatBufferModel::VerifyAndBuildFromBuffer(
+    const char* buffer, size_t buffer_size, TfLiteVerifier* extra_verifier,
+    ErrorReporter* error_reporter) {
+  error_reporter = ValidateErrorReporter(error_reporter);
+
+  flatbuffers::Verifier base_verifier(reinterpret_cast<const uint8_t*>(buffer),
+                                      buffer_size);
+  if (!VerifyModelBuffer(base_verifier)) {
+    error_reporter->Report("The model is not a valid Flatbuffer buffer");
+    return nullptr;
+  }
+
+  if (extra_verifier &&
+      !extra_verifier->Verify(buffer, buffer_size, error_reporter)) {
+    return nullptr;
+  }
+
+  return BuildFromBuffer(buffer, buffer_size, error_reporter);
+}
+
 std::unique_ptr<FlatBufferModel> FlatBufferModel::BuildFromModel(
-    const tflite::Model* model_spec, ErrorReporter* error_reporter) {
+    const tflite::Model* caller_owned_model_spec,
+    ErrorReporter* error_reporter) {
   error_reporter = ValidateErrorReporter(error_reporter);
 
   std::unique_ptr<FlatBufferModel> model;
-  model.reset(new FlatBufferModel(model_spec, error_reporter));
+  model.reset(new FlatBufferModel(caller_owned_model_spec, error_reporter));
   if (!model->initialized()) model.reset();
   return model;
 }
@@ -144,20 +176,18 @@ bool FlatBufferModel::CheckModelIdentifier() const {
 
 FlatBufferModel::FlatBufferModel(const Model* model,
                                  ErrorReporter* error_reporter)
-    : error_reporter_(ValidateErrorReporter(error_reporter)) {
-  model_ = model;
-}
+    : model_(model), error_reporter_(ValidateErrorReporter(error_reporter)) {}
 
-FlatBufferModel::FlatBufferModel(Allocation* allocation,
+FlatBufferModel::FlatBufferModel(std::unique_ptr<Allocation> allocation,
                                  ErrorReporter* error_reporter)
-    : error_reporter_(ValidateErrorReporter(error_reporter)) {
-  allocation_ = allocation;
+    : error_reporter_(ValidateErrorReporter(error_reporter)),
+      allocation_(std::move(allocation)) {
   if (!allocation_->valid() || !CheckModelIdentifier()) return;
 
   model_ = ::tflite::GetModel(allocation_->base());
 }
 
-FlatBufferModel::~FlatBufferModel() { delete allocation_; }
+FlatBufferModel::~FlatBufferModel() {}
 
 InterpreterBuilder::InterpreterBuilder(const FlatBufferModel& model,
                                        const OpResolver& op_resolver)
@@ -216,11 +246,11 @@ class MallocDataAllocator : public BuiltinDataAllocator {
 
 TfLiteStatus InterpreterBuilder::ParseNodes(
     const flatbuffers::Vector<flatbuffers::Offset<Operator>>* operators,
-    Interpreter* interpreter) {
+    Subgraph* subgraph) {
   TfLiteStatus status = kTfLiteOk;
 
   // Reduce the number of redundant allocations
-  interpreter->ReserveNodes(operators->Length());
+  subgraph->ReserveNodes(operators->Length());
 
   for (int i = 0; i < operators->Length(); ++i) {
     const auto* op = operators->Get(i);
@@ -250,7 +280,7 @@ TfLiteStatus InterpreterBuilder::ParseNodes(
     }
 
     if (op->custom_options()) {
-      interpreter->AddNodeWithParameters(
+      subgraph->AddNodeWithParameters(
           FlatBufferIntArrayToVector(op->inputs()),
           FlatBufferIntArrayToVector(op->outputs()),
           reinterpret_cast<const char*>(op->custom_options()->data()),
@@ -260,24 +290,73 @@ TfLiteStatus InterpreterBuilder::ParseNodes(
       MallocDataAllocator malloc_allocator;
       TF_LITE_ENSURE_STATUS(ParseOpData(op, op_type, error_reporter_,
                                         &malloc_allocator, &builtin_data));
-      interpreter->AddNodeWithParameters(
-          FlatBufferIntArrayToVector(op->inputs()),
-          FlatBufferIntArrayToVector(op->outputs()), nullptr, 0, builtin_data,
-          registration);
+      subgraph->AddNodeWithParameters(FlatBufferIntArrayToVector(op->inputs()),
+                                      FlatBufferIntArrayToVector(op->outputs()),
+                                      nullptr, 0, builtin_data, registration);
     }
   }
 
   return status;
 }
 
+TfLiteStatus InterpreterBuilder::ParseQuantization(
+    const QuantizationParameters* src_quantization,
+    TfLiteQuantization* quantization) {
+  quantization->type = kTfLiteNoQuantization;
+  if (!src_quantization || !src_quantization->scale() ||
+      src_quantization->scale()->size() == 0) {
+    return kTfLiteOk;
+  }
+  if (!src_quantization->zero_point()) {
+    error_reporter_->Report(
+        "Quantization parameters has non-null scale but null zero_point.");
+    return kTfLiteError;
+  }
+
+  // Ensure that the number of scales matches the number of zero_points.
+  if (src_quantization->scale()->size() !=
+      src_quantization->zero_point()->size()) {
+    error_reporter_->Report(
+        "QuantizationParam has %d zero_point values and %d scale values. Must "
+        "have same number.",
+        src_quantization->zero_point()->size(),
+        src_quantization->scale()->size());
+    return kTfLiteError;
+  }
+
+  // Affine-quantization.
+  quantization->type = kTfLiteAffineQuantization;
+  auto* affine_quantization = reinterpret_cast<TfLiteAffineQuantization*>(
+      malloc(sizeof(TfLiteAffineQuantization)));
+  const size_t num_scales = src_quantization->scale()->size();
+  affine_quantization->scale = TfLiteFloatArrayCreate(num_scales);
+  affine_quantization->zero_point = TfLiteIntArrayCreate(num_scales);
+  for (size_t i = 0; i < num_scales; ++i) {
+    affine_quantization->scale->data[i] = src_quantization->scale()->Get(i);
+    affine_quantization->zero_point->data[i] =
+        src_quantization->zero_point()->Get(i);
+  }
+  if (src_quantization->quantized_dimension() < 0 ||
+      src_quantization->quantized_dimension() >= num_scales) {
+    error_reporter_->Report(
+        "quantized_dimension must be in range [0, %d). Was %d.", num_scales,
+        src_quantization->quantized_dimension());
+    return kTfLiteError;
+  }
+  affine_quantization->quantized_dimension =
+      src_quantization->quantized_dimension();
+  quantization->params = reinterpret_cast<void*>(affine_quantization);
+  return kTfLiteOk;
+}
+
 TfLiteStatus InterpreterBuilder::ParseTensors(
     const flatbuffers::Vector<flatbuffers::Offset<Buffer>>* buffers,
     const flatbuffers::Vector<flatbuffers::Offset<Tensor>>* tensors,
-    Interpreter* interpreter) {
+    Subgraph* subgraph) {
   TfLiteStatus status = kTfLiteOk;
 
   // A little helper to get the names of inputs and outputs. Note that they
-  // must outlive the interpreter.
+  // must outlive the subgraph.
   auto get_name = [](const tflite::Tensor* t) -> const char* {
     auto name = t->name();
     if (name) return name->c_str();
@@ -288,36 +367,11 @@ TfLiteStatus InterpreterBuilder::ParseTensors(
     const auto* tensor = tensors->Get(i);
     std::vector<int> dims = FlatBufferIntArrayToVector(tensor->shape());
 
-    TfLiteQuantizationParams quantization;
-    quantization.scale = 0;
-    quantization.zero_point = 0;
-    auto* q_params = tensor->quantization();
-    if (q_params) {
-      // Note that the schema could hold per-channel quantization parameters
-      // but we really only support one value for the whole tensor.
-      // TODO(aselle): This breaks as well if these are nullptr's.
-      // TODO(aselle): This assumes non per-channel quantization.
-
-      if (q_params->scale()) {
-        if (q_params->scale()->size() != 1) {
-          error_reporter_->Report(
-              "QuantizationParam has %d scale values (only 1 is supported).",
-              q_params->scale()->size());
-          return kTfLiteError;
-        }
-        quantization.scale = q_params->scale()->Get(0);
-      }
-
-      if (q_params->zero_point()) {
-        if (q_params->zero_point()->size() != 1) {
-          error_reporter_->Report(
-              "QuantizationParam has %d zero_point values"
-              " (only 1 is supported).",
-              q_params->zero_point()->size());
-          return kTfLiteError;
-        }
-        quantization.zero_point = q_params->zero_point()->Get(0);
-      }
+    const auto* src_quantization = tensor->quantization();
+    TfLiteQuantization quantization;
+    if (ParseQuantization(src_quantization, &quantization) != kTfLiteOk) {
+      status = kTfLiteError;
+      continue;
     }
 
     TfLiteType type;
@@ -363,7 +417,7 @@ TfLiteStatus InterpreterBuilder::ParseTensors(
         status = kTfLiteError;
       }
 
-      if (interpreter->SetTensorParametersReadOnly(
+      if (subgraph->SetTensorParametersReadOnly(
               i, type, get_name(tensor), dims, quantization, buffer_ptr,
               buffer_size, allocation_) != kTfLiteOk) {
         error_reporter_->Report("Tensor %d is invalidly specified in schema.\n",
@@ -371,9 +425,9 @@ TfLiteStatus InterpreterBuilder::ParseTensors(
         status = kTfLiteError;
       }
     } else {
-      if (interpreter->SetTensorParametersReadWrite(i, type, get_name(tensor),
-                                                    dims, quantization,
-                                                    is_variable) != kTfLiteOk) {
+      if (subgraph->SetTensorParametersReadWrite(i, type, get_name(tensor),
+                                                 dims, quantization,
+                                                 is_variable) != kTfLiteOk) {
         error_reporter_->Report("Tensor %d is invalidly specified in schema.\n",
                                 i);
         status = kTfLiteError;
@@ -455,42 +509,56 @@ TfLiteStatus InterpreterBuilder::operator()(
   // Construct interpreter with correct number of tensors and operators.
   auto* subgraphs = model_->subgraphs();
   auto* buffers = model_->buffers();
-  if (subgraphs->size() != 1) {
-    error_reporter_->Report("Only 1 subgraph is currently supported.\n");
-    return cleanup_and_error();
-  }
-  const tflite::SubGraph* subgraph = (*subgraphs)[0];
-  auto operators = subgraph->operators();
-  auto tensors = subgraph->tensors();
-  if (!operators || !tensors || !buffers) {
-    error_reporter_->Report(
-        "Did not get operators, tensors, or buffers in input flat buffer.\n");
+
+  if (subgraphs->size() == 0) {
+    error_reporter_->Report("No subgraph in the model.\n");
     return cleanup_and_error();
   }
+
   interpreter->reset(new Interpreter(error_reporter_));
-  if ((**interpreter).AddTensors(tensors->Length()) != kTfLiteOk) {
-    return cleanup_and_error();
+  (*interpreter)->SetNumThreads(num_threads);
+  if (subgraphs->Length() > 1) {
+    (*interpreter)->AddSubgraphs(subgraphs->Length() - 1);
   }
-  // Set num threads
-  (**interpreter).SetNumThreads(num_threads);
-  // Parse inputs/outputs
-  (**interpreter).SetInputs(FlatBufferIntArrayToVector(subgraph->inputs()));
-  (**interpreter).SetOutputs(FlatBufferIntArrayToVector(subgraph->outputs()));
-
-  // Finally setup nodes and tensors
-  if (ParseNodes(operators, interpreter->get()) != kTfLiteOk)
-    return cleanup_and_error();
-  if (ParseTensors(buffers, tensors, interpreter->get()) != kTfLiteOk)
-    return cleanup_and_error();
 
-  std::vector<int> variables;
-  for (int i = 0; i < (*interpreter)->tensors_size(); ++i) {
-    auto* tensor = (*interpreter)->tensor(i);
-    if (tensor->is_variable) {
-      variables.push_back(i);
+  for (int subgraph_index = 0; subgraph_index < subgraphs->Length();
+       ++subgraph_index) {
+    const tflite::SubGraph* subgraph = (*subgraphs)[subgraph_index];
+    tflite::Subgraph* modified_subgraph =
+        (*interpreter)->subgraph(subgraph_index);
+    auto operators = subgraph->operators();
+    auto tensors = subgraph->tensors();
+    if (!operators || !tensors || !buffers) {
+      error_reporter_->Report(
+          "Did not get operators, tensors, or buffers in subgraph %d.\n",
+          subgraph_index);
+      return cleanup_and_error();
+    }
+    if (modified_subgraph->AddTensors(tensors->Length()) != kTfLiteOk) {
+      return cleanup_and_error();
+    }
+    // Set num threads
+    // Parse inputs/outputs
+    modified_subgraph->SetInputs(
+        FlatBufferIntArrayToVector(subgraph->inputs()));
+    modified_subgraph->SetOutputs(
+        FlatBufferIntArrayToVector(subgraph->outputs()));
+
+    // Finally setup nodes and tensors
+    if (ParseNodes(operators, modified_subgraph) != kTfLiteOk)
+      return cleanup_and_error();
+    if (ParseTensors(buffers, tensors, modified_subgraph) != kTfLiteOk)
+      return cleanup_and_error();
+
+    std::vector<int> variables;
+    for (int i = 0; i < modified_subgraph->tensors_size(); ++i) {
+      auto* tensor = modified_subgraph->tensor(i);
+      if (tensor->is_variable) {
+        variables.push_back(i);
+      }
     }
+    modified_subgraph->SetVariables(std::move(variables));
   }
-  (**interpreter).SetVariables(std::move(variables));
 
   if (ApplyDelegates(interpreter->get()) != kTfLiteOk)
     return cleanup_and_error();
diff --git a/tensorflow/lite/model.h b/tensorflow/lite/model.h
index 01e7c682056b2b14155394f978545470c7748c2d..bae4229cbab672397392349437e3c1e7e871c5d9 100644
--- a/tensorflow/lite/model.h
+++ b/tensorflow/lite/model.h
@@ -35,6 +35,7 @@ limitations under the License.
 #define TENSORFLOW_LITE_MODEL_H_
 
 #include <memory>
+#include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/core/api/error_reporter.h"
 #include "tensorflow/lite/core/api/op_resolver.h"
 #include "tensorflow/lite/interpreter.h"
@@ -56,6 +57,9 @@ class TfLiteVerifier {
 
 // An RAII object that represents a read-only tflite model, copied from disk,
 // or mmapped. This uses flatbuffers as the serialization format.
+//
+// NOTE: The current API requires that a FlatBufferModel instance be kept alive
+// by the client as long as it is in use by any dependent Interpreter instances.
 class FlatBufferModel {
  public:
   // Builds a model based on a file.
@@ -68,29 +72,51 @@ class FlatBufferModel {
 
   // Verifies whether the content of the file is legit, then builds a model
   // based on the file.
+  // The extra_verifier argument is an additional optional verifier for the file
+  // contents. By default, we always check with tflite::VerifyModelBuffer. If
+  // extra_verifier is supplied, the file contents is also checked against the
+  // extra_verifier after the check against tflite::VerifyModelBuilder.
   // Caller retains ownership of `error_reporter` and must ensure its lifetime
   // is longer than the FlatBufferModel instance.
   // Returns a nullptr in case of failure.
   static std::unique_ptr<FlatBufferModel> VerifyAndBuildFromFile(
-      const char* filename, TfLiteVerifier* verifier = nullptr,
+      const char* filename, TfLiteVerifier* extra_verifier = nullptr,
       ErrorReporter* error_reporter = DefaultErrorReporter());
 
-  // Builds a model based on a pre-loaded flatbuffer. The caller retains
-  // ownership of the buffer and should keep it alive until the returned object
-  // is destroyed. Caller retains ownership of `error_reporter` and must ensure
-  // its lifetime is longer than the FlatBufferModel instance.
+  // Builds a model based on a pre-loaded flatbuffer.
+  // Caller retains ownership of the buffer and should keep it alive until
+  // the returned object is destroyed. Caller also retains ownership of
+  // `error_reporter` and must ensure its lifetime is longer than the
+  // FlatBufferModel instance.
   // Returns a nullptr in case of failure.
+  // NOTE: this does NOT validate the buffer so it should NOT be called on
+  // invalid/untrusted input. Use VerifyAndBuildFromBuffer in that case
   static std::unique_ptr<FlatBufferModel> BuildFromBuffer(
+      const char* caller_owned_buffer, size_t buffer_size,
+      ErrorReporter* error_reporter = DefaultErrorReporter());
+
+  // Verifies whether the content of the buffer is legit, then builds a model
+  // based on the pre-loaded flatbuffer.
+  // The extra_verifier argument is an additional optional verifier for the
+  // buffer. By default, we always check with tflite::VerifyModelBuffer. If
+  // extra_verifier is supplied, the buffer is checked against the
+  // extra_verifier after the check against tflite::VerifyModelBuilder. The
+  // caller retains ownership of the buffer and should keep it alive until the
+  // returned object is destroyed. Caller retains ownership of `error_reporter`
+  // and must ensure its lifetime is longer than the FlatBufferModel instance.
+  // Returns a nullptr in case of failure.
+  static std::unique_ptr<FlatBufferModel> VerifyAndBuildFromBuffer(
       const char* buffer, size_t buffer_size,
+      TfLiteVerifier* extra_verifier = nullptr,
       ErrorReporter* error_reporter = DefaultErrorReporter());
 
-  // Builds a model directly from a flatbuffer pointer. The caller retains
-  // ownership of the buffer and should keep it alive until the returned object
-  // is destroyed. Caller retains ownership of `error_reporter` and must ensure
-  // its lifetime is longer than the FlatBufferModel instance.
+  // Builds a model directly from a flatbuffer pointer
+  // Caller retains ownership of the buffer and should keep it alive until the
+  // returned object is destroyed. Caller retains ownership of `error_reporter`
+  // and must ensure its lifetime is longer than the FlatBufferModel instance.
   // Returns a nullptr in case of failure.
   static std::unique_ptr<FlatBufferModel> BuildFromModel(
-      const tflite::Model* model_spec,
+      const tflite::Model* caller_owned_model_spec,
       ErrorReporter* error_reporter = DefaultErrorReporter());
 
   // Releases memory or unmaps mmaped memory.
@@ -104,7 +130,7 @@ class FlatBufferModel {
   const tflite::Model* operator->() const { return model_; }
   const tflite::Model* GetModel() const { return model_; }
   ErrorReporter* error_reporter() const { return error_reporter_; }
-  const Allocation* allocation() const { return allocation_; }
+  const Allocation* allocation() const { return allocation_.get(); }
 
   // Returns true if the model identifier is correct (otherwise false and
   // reports an error).
@@ -116,7 +142,7 @@ class FlatBufferModel {
   // `error_reporter`remains with the caller and must have lifetime at least
   // as much as FlatBufferModel. This is to allow multiple models to use the
   // same ErrorReporter instance.
-  FlatBufferModel(Allocation* allocation,
+  FlatBufferModel(std::unique_ptr<Allocation> allocation,
                   ErrorReporter* error_reporter = DefaultErrorReporter());
 
   // Loads a model from Model flatbuffer. The `model` has to remain alive and
@@ -129,24 +155,28 @@ class FlatBufferModel {
   // The error reporter to use for model errors and subsequent errors when
   // the interpreter is created
   ErrorReporter* error_reporter_;
-  // The allocator used for holding memory of the model.
-  Allocation* allocation_ = nullptr;
+  // The allocator used for holding memory of the model. Note that this will
+  // be null if the client provides a tflite::Model directly.
+  std::unique_ptr<Allocation> allocation_;
 };
 
 // Build an interpreter capable of interpreting `model`.
 //
-// model: a scoped model whose lifetime must be at least as long as
-//   the interpreter. In principle multiple interpreters can be made from
-//   a single model.
-// op_resolver: An instance that implements the Resolver interface which maps
-//   custom op names and builtin op codes to op registrations.
-// reportError: a functor that is called to report errors that handles
-//   printf var arg semantics. The lifetime of the reportError object must
+// model: A model whose lifetime must be at least as long as any
+//   interpreter(s) created by the builder. In principle multiple interpreters
+//   can be made from a single model.
+// op_resolver: An instance that implements the OpResolver interface, which maps
+//   custom op names and builtin op codes to op registrations. The lifetime
+//   of the provided `op_resolver` object must be at least as long as the
+//   InterpreterBuilder; unlike `model` and `error_reporter`, the `op_resolver`
+//   does not need to exist for the duration of any created Interpreter objects.
+// error_reporter: a functor that is called to report errors that handles
+//   printf var arg semantics. The lifetime of the `error_reporter` object must
 //   be greater than or equal to the Interpreter created by operator().
 //
 // Returns a kTfLiteOk when successful and sets interpreter to a valid
-// Interpreter. Note: the user must ensure the model lifetime is at least as
-// long as interpreter's lifetime.
+// Interpreter. Note: The user must ensure the model lifetime (and error
+// reporter, if provided) is at least as long as interpreter's lifetime.
 class InterpreterBuilder {
  public:
   InterpreterBuilder(const FlatBufferModel& model,
@@ -168,12 +198,14 @@ class InterpreterBuilder {
   TfLiteStatus BuildLocalIndexToRegistrationMapping();
   TfLiteStatus ParseNodes(
       const flatbuffers::Vector<flatbuffers::Offset<Operator>>* operators,
-      Interpreter* interpreter);
+      Subgraph* subgraph);
   TfLiteStatus ParseTensors(
       const flatbuffers::Vector<flatbuffers::Offset<Buffer>>* buffers,
       const flatbuffers::Vector<flatbuffers::Offset<Tensor>>* tensors,
-      Interpreter* interpreter);
+      Subgraph* subgraph);
   TfLiteStatus ApplyDelegates(Interpreter* interpreter);
+  TfLiteStatus ParseQuantization(const QuantizationParameters* src_quantization,
+                                 TfLiteQuantization* quantization);
 
   const ::tflite::Model* model_;
   const OpResolver& op_resolver_;
diff --git a/tensorflow/lite/model_test.cc b/tensorflow/lite/model_test.cc
index e677ea94a71b979a01fd4b56e331d592cef76cd5..67d2380be0b748554de0ef3a91824fb670a530a0 100644
--- a/tensorflow/lite/model_test.cc
+++ b/tensorflow/lite/model_test.cc
@@ -87,20 +87,21 @@ TEST(BasicFlatBufferModel, TestEmptyModelsAndNullDestination) {
 
 // Make sure currently unsupported # of subgraphs are checked
 // TODO(aselle): Replace this test when multiple subgraphs are supported.
-TEST(BasicFlatBufferModel, TestZeroAndMultipleSubgraphs) {
-  auto m1 = FlatBufferModel::BuildFromFile(
+TEST(BasicFlatBufferModel, TestZeroSubgraphs) {
+  auto m = FlatBufferModel::BuildFromFile(
       "tensorflow/lite/testdata/0_subgraphs.bin");
-  ASSERT_TRUE(m1);
-  std::unique_ptr<Interpreter> interpreter1;
-  ASSERT_NE(InterpreterBuilder(*m1, TrivialResolver())(&interpreter1),
-            kTfLiteOk);
+  ASSERT_TRUE(m);
+  std::unique_ptr<Interpreter> interpreter;
+  ASSERT_NE(InterpreterBuilder(*m, TrivialResolver())(&interpreter), kTfLiteOk);
+}
 
-  auto m2 = FlatBufferModel::BuildFromFile(
+TEST(BasicFlatBufferModel, TestMultipleSubgraphs) {
+  auto m = FlatBufferModel::BuildFromFile(
       "tensorflow/lite/testdata/2_subgraphs.bin");
-  ASSERT_TRUE(m2);
-  std::unique_ptr<Interpreter> interpreter2;
-  ASSERT_NE(InterpreterBuilder(*m2, TrivialResolver())(&interpreter2),
-            kTfLiteOk);
+  ASSERT_TRUE(m);
+  std::unique_ptr<Interpreter> interpreter;
+  ASSERT_EQ(InterpreterBuilder(*m, TrivialResolver())(&interpreter), kTfLiteOk);
+  EXPECT_EQ(interpreter->subgraphs_size(), 2);
 }
 
 // Test what happens if we cannot bind any of the ops.
diff --git a/tensorflow/lite/models/smartreply/BUILD b/tensorflow/lite/models/smartreply/BUILD
index 078b8e6bc6a288542575293be66c19f7bb733fc4..5be2aaff1f2d39f961da9ae1d666b27f41ddb039 100644
--- a/tensorflow/lite/models/smartreply/BUILD
+++ b/tensorflow/lite/models/smartreply/BUILD
@@ -1,9 +1,14 @@
-package(default_visibility = ["//visibility:public"])
+package(default_visibility = [
+    "//visibility:public",
+])
 
-load("//tensorflow/lite:build_def.bzl", "tflite_copts", "gen_selected_ops")
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow/lite:build_def.bzl", "gen_selected_ops", "tflite_copts")
 
 licenses(["notice"])  # Apache 2.0
 
+exports_files(["LICENSE"])
+
 gen_selected_ops(
     name = "smartreply_ops",
     model = "@tflite_smartreply//:smartreply.tflite",
@@ -22,10 +27,12 @@ cc_library(
         "//tensorflow/lite:framework",
         "//tensorflow/lite:string_util",
         "//tensorflow/lite/kernels:builtin_ops",
+        "//tensorflow/lite/kernels:kernel_util",
         "@com_google_absl//absl/strings",
         "@com_googlesource_code_re2//:re2",
         "@farmhash_archive//:farmhash",
     ],
+    alwayslink = 1,
 )
 
 cc_library(
@@ -43,7 +50,25 @@ cc_library(
     ],
 )
 
-cc_test(
+tf_cc_test(
+    name = "predictor_test",
+    srcs = ["predictor_test.cc"],
+    data = [
+        "//tensorflow/lite/models:testdata/smartreply_samples.tsv",
+        "@tflite_smartreply//:smartreply.tflite",
+    ],
+    tags = ["no_oss"],
+    deps = [
+        ":predictor_lib",
+        "//tensorflow/core:test",
+        "//tensorflow/lite:string_util",
+        "//tensorflow/lite/testing:util",
+        "@com_google_absl//absl/strings",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
     name = "extract_feature_op_test",
     size = "small",
     srcs = ["ops/extract_feature_test.cc"],
@@ -58,7 +83,7 @@ cc_test(
     ],
 )
 
-cc_test(
+tf_cc_test(
     name = "normalize_op_test",
     size = "small",
     srcs = ["ops/normalize_test.cc"],
@@ -73,7 +98,7 @@ cc_test(
     ],
 )
 
-cc_test(
+tf_cc_test(
     name = "predict_op_test",
     size = "small",
     srcs = ["ops/predict_test.cc"],
diff --git a/tensorflow/lite/models/smartreply/ops/extract_feature_test.cc b/tensorflow/lite/models/smartreply/ops/extract_feature_test.cc
index efe59eeb4667cc55fb0a70d3005c1f9c2aaa73ce..914b47c1a9deba4e601fdc1b787f3a03179c2e6a 100644
--- a/tensorflow/lite/models/smartreply/ops/extract_feature_test.cc
+++ b/tensorflow/lite/models/smartreply/ops/extract_feature_test.cc
@@ -94,7 +94,7 @@ TEST(ExtractFeatureOpTest, AllBlacklistInput) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/lite/models/smartreply/ops/normalize_test.cc b/tensorflow/lite/models/smartreply/ops/normalize_test.cc
index 8c5131565d5892be946a9a115bb7c6cad8733214..46d2aebe756b84f067def401010e5ee4b37cfd8b 100644
--- a/tensorflow/lite/models/smartreply/ops/normalize_test.cc
+++ b/tensorflow/lite/models/smartreply/ops/normalize_test.cc
@@ -84,7 +84,7 @@ TEST(NormalizeOpTest, EmptyInput) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/lite/models/smartreply/ops/predict.cc b/tensorflow/lite/models/smartreply/ops/predict.cc
index bb2ed4a3153ceb2ef2e6b6d7f8c640f41616d4b0..24b7d5489756de36c2bcc8a47ef1c8e478c3a9c0 100644
--- a/tensorflow/lite/models/smartreply/ops/predict.cc
+++ b/tensorflow/lite/models/smartreply/ops/predict.cc
@@ -28,6 +28,7 @@ limitations under the License.
 //
 
 #include <algorithm>
+#include <cstdlib>
 #include <unordered_map>
 #include <vector>
 
diff --git a/tensorflow/lite/models/smartreply/ops/predict_test.cc b/tensorflow/lite/models/smartreply/ops/predict_test.cc
index ca64dcaad47108e346bd03f0b7b15edfbd6a50dc..6896a342c79a73390f1ad02a60db6cb70a1cf23b 100644
--- a/tensorflow/lite/models/smartreply/ops/predict_test.cc
+++ b/tensorflow/lite/models/smartreply/ops/predict_test.cc
@@ -177,7 +177,7 @@ TEST(PredictOpTest, NoLabelGenerated) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: tflite::LogToStderr();
+  ::tflite::LogToStderr();
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/lite/models/smartreply/predictor_test.cc b/tensorflow/lite/models/smartreply/predictor_test.cc
index 7eba26993e59172d8ae85a8961b6f3b171057a48..f4a9453b4220b45af937923a6b916c1516f9e22f 100644
--- a/tensorflow/lite/models/smartreply/predictor_test.cc
+++ b/tensorflow/lite/models/smartreply/predictor_test.cc
@@ -22,21 +22,24 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_split.h"
-//#include "tensorflow/lite/models/test_utils.h"
-#include "tensorflow/lite/string_util.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/lite/string_util.h"
+#include "tensorflow/lite/testing/util.h"
 
 namespace tflite {
 namespace custom {
 namespace smartreply {
 namespace {
 
-const char kModelName[] = "smartreply_ondevice_model.bin";
 const char kSamples[] = "smartreply_samples.tsv";
 
-string TestDataPath() {
+string GetModelFilePath() {
+  return "external/tflite_smartreply/smartreply.tflite";  // NOLINT
+}
+
+string GetSamplesFilePath() {
   return string(absl::StrCat(tensorflow::testing::TensorFlowSrcRoot(), "/",
-                             "lite/models/testdata/"));
+                             "lite/models/testdata/", kSamples));
 }
 
 MATCHER_P(IncludeAnyResponesIn, expected_response, "contains the response") {
@@ -53,13 +56,14 @@ MATCHER_P(IncludeAnyResponesIn, expected_response, "contains the response") {
 
 class PredictorTest : public ::testing::Test {
  protected:
-  PredictorTest() {
-    model_ = tflite::FlatBufferModel::BuildFromFile(
-        absl::StrCat(TestDataPath(), "/", kModelName).c_str());
-    CHECK(model_);
-  }
+  PredictorTest() {}
   ~PredictorTest() override {}
 
+  void SetUp() override {
+    model_ = tflite::FlatBufferModel::BuildFromFile(GetModelFilePath().c_str());
+    ASSERT_NE(model_.get(), nullptr);
+  }
+
   std::unique_ptr<::tflite::FlatBufferModel> model_;
 };
 
@@ -121,7 +125,7 @@ TEST_F(PredictorTest, BatchTest) {
   int total_triggers = 0;
 
   string line;
-  std::ifstream fin(absl::StrCat(TestDataPath(), "/", kSamples));
+  std::ifstream fin(GetSamplesFilePath());
   while (std::getline(fin, line)) {
     const std::vector<string> fields = absl::StrSplit(line, '\t');
     if (fields.empty()) {
@@ -151,3 +155,9 @@ TEST_F(PredictorTest, BatchTest) {
 }  // namespace smartreply
 }  // namespace custom
 }  // namespace tflite
+
+int main(int argc, char **argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/models/speech_test.cc b/tensorflow/lite/models/speech_test.cc
index 17b7e8f28e8fb0988ee2269d9d833626c2aec701..a3713c55312cb7cb6526b7e82606cb949e5c2af4 100644
--- a/tensorflow/lite/models/speech_test.cc
+++ b/tensorflow/lite/models/speech_test.cc
@@ -139,7 +139,7 @@ TEST_P(SpeechTest, DISABLED_SpeakerIdOkGoogleTest) {
       << test_driver.GetErrorMessage();
 }
 
-TEST_P(SpeechTest, DISABLED_AsrAmTest) {
+TEST_P(SpeechTest, AsrAmTest) {
   std::stringstream os;
   ASSERT_TRUE(
       ConvertCsvData("speech_asr_am_model.tflite", "speech_asr_am_model_in.csv",
@@ -152,6 +152,19 @@ TEST_P(SpeechTest, DISABLED_AsrAmTest) {
       << test_driver.GetErrorMessage();
 }
 
+TEST_P(SpeechTest, AsrAmQuantizedTest) {
+  std::stringstream os;
+  ASSERT_TRUE(ConvertCsvData(
+      "speech_asr_am_model_int8.tflite", "speech_asr_am_model_in.csv",
+      "speech_asr_am_model_int8_out.csv", /*input_tensor=*/"0",
+      /*output_tensor=*/"104",
+      /*persistent_tensors=*/"18,19,38,39,58,59,78,79,98,99",
+      /*sequence_size=*/320, &os));
+  testing::TfLiteDriver test_driver(/*use_nnapi=*/false);
+  ASSERT_TRUE(testing::ParseAndRunTests(&os, &test_driver, GetMaxInvocations()))
+      << test_driver.GetErrorMessage();
+}
+
 // The original version of speech_asr_lm_model_test.cc ran a few sequences
 // through the interpreter and stored the sum of all the output, which was them
 // compared for correctness. In this test we are comparing all the intermediate
@@ -196,10 +209,10 @@ TEST_P(SpeechTest, DISABLED_TtsTest) {
 // 200s just to bring up the Android emulator.)
 static const int kAllInvocations = -1;
 static const int kFirstFewInvocations = 10;
-INSTANTIATE_TEST_CASE_P(LongTests, SpeechTest,
-                        ::testing::Values(kAllInvocations));
-INSTANTIATE_TEST_CASE_P(ShortTests, SpeechTest,
-                        ::testing::Values(kFirstFewInvocations));
+INSTANTIATE_TEST_SUITE_P(LongTests, SpeechTest,
+                         ::testing::Values(kAllInvocations));
+INSTANTIATE_TEST_SUITE_P(ShortTests, SpeechTest,
+                         ::testing::Values(kFirstFewInvocations));
 
 }  // namespace
 }  // namespace tflite
diff --git a/tensorflow/lite/models/testdata/g3doc/README.md b/tensorflow/lite/models/testdata/g3doc/README.md
index 2a4f1c143a21722945e8e396b81bd23e3312e87e..afe5f16b383b26efd7aab866c3215a8d2a203f4c 100644
--- a/tensorflow/lite/models/testdata/g3doc/README.md
+++ b/tensorflow/lite/models/testdata/g3doc/README.md
@@ -3,6 +3,42 @@
 Sample test data has been provided for speech related models in Tensorflow Lite
 to help users working with speech models to verify and test their models.
 
+### Models and Inputs and Outputs:
+
+[ASR AM model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_asr_am_model.tflite)
+
+[ASR AM quantized model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_asr_am_model_int8.tflite)
+
+[ASR AM test inputs](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_asr_am_model_in.csv)
+
+[ASR AM test outputs](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_asr_am_model_out.csv)
+
+[ASR AM int8 test outputs](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_asr_am_model_int8_out.csv)
+
+The models below are not maintained.
+
+[Speech hotword model (Svdf
+rank=1)](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_hotword_model_rank1_2017_11_14.tflite)
+
+[Speech hotword model (Svdf
+rank=2)](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_hotword_model_rank2_2017_11_14.tflite)
+
+[Speaker-id
+model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_speakerid_model_2017_11_14.tflite)
+
+[TTS
+model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_tts_model_2017_11_14.tflite)
+
+### Test Bench
+
+[Model tests](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/models/speech_test.cc)
+
+Download the ASR AM test models and inputs and output files to the
+models/testdata directory to run the tests.
+
+
+## Speech Model Architectures
+
 For the hotword, speaker-id and automatic speech recognition sample models, the
 architecture assumes that the models receive their input from a speech
 pre-processing module. The speech pre-processing module receives the audio
@@ -87,57 +123,3 @@ The model consists of a convolutional layer, followed by a fully-connected
 layer, two LSTM layers, and two additional fully-connected layers.
 The corresponding parameters as shown in the figure.
 ![endpointer_model](endpointer.svg "Endpointer model")
-
-
-## Speech models test input/output generation
-
-As mentioned above the input to models are generated from a pre-processing
-module (output of a log-mel filterbank, or linguistic features), and the outputs
-are generated by running the equivalent TensorFlow model by feeding them the
-same input.
-
-## Link to the open source code
-
-### Models:
-
-[Speech hotword model (Svdf
-rank=1)](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_hotword_model_rank1_2017_11_14.tflite)
-
-[Speech hotword model (Svdf
-rank=2)](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_hotword_model_rank2_2017_11_14.tflite)
-
-[Speaker-id
-model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_speakerid_model_2017_11_14.tflite)
-
-[TTS
-model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_tts_model_2017_11_14.tflite)
-
-[ASR AM
-model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_terse_am_model_2017_11_14.tflite)
-
-### Test benches
-
-[Speech hotword model
-test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/models/speech_hotword_model_test.cc)
-
-[Speaker-id model
-test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/models/speech_speakerid_model_test.cc)
-
-[TTS model
-test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/models/speech_tts_model_test.cc)
-
-[ASR AM model
-test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/models/speech_asr_am_model_test.cc)
-
-[ASR LM model
-test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/models/speech_asr_lm_model_test.cc)
-
-[Endpointer model
-test](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/models/speech_endpointer_model_test.cc)
-
-## Android Support
-The models have been tested on Android phones, using the following tests:
-
-[Hotword] (https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/android/BUILD?rcl=172930882&l=25)
-
-[Speaker-id] (https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/android/BUILD?rcl=172930882&l=36)
diff --git a/tensorflow/lite/nnapi/BUILD b/tensorflow/lite/nnapi/BUILD
index 467a2b7a7bc9a40135428240585cd2c2a133cf9f..8ee9f3c221a10bef39d4b4e0798e7a453f8c3fcb 100644
--- a/tensorflow/lite/nnapi/BUILD
+++ b/tensorflow/lite/nnapi/BUILD
@@ -8,6 +8,44 @@ cc_library(
     name = "nnapi_lib",
     hdrs = [
         "NeuralNetworksShim.h",
+        "NeuralNetworksTypes.h",
     ],
     linkopts = ["-ldl"],
 )
+
+cc_library(
+    name = "nnapi_implementation",
+    srcs = select({
+        "//tensorflow:ios": [
+            "nnapi_implementation_disabled.cc",
+        ],
+        "//tensorflow:windows": [
+            "nnapi_implementation_disabled.cc",
+        ],
+        "//conditions:default": [
+            "nnapi_implementation.cc",
+        ],
+    }),
+    hdrs = [
+        "nnapi_implementation.h",
+    ],
+    linkopts = ["-ldl"] + select({
+        "//tensorflow:android": [],
+        "//tensorflow:darwin": [],
+        "//tensorflow:ios": [],
+        "//tensorflow:windows": [],
+        "//conditions:default": ["-lrt"],
+    }),
+    deps = [
+        "//tensorflow/lite/nnapi:nnapi_lib",
+    ],
+)
+
+cc_test(
+    name = "nnapi_implementation_test",
+    srcs = ["nnapi_implementation_test.cc"],
+    deps = [
+        "//tensorflow/lite/nnapi:nnapi_implementation",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
diff --git a/tensorflow/lite/nnapi/NeuralNetworksShim.h b/tensorflow/lite/nnapi/NeuralNetworksShim.h
index c39502f4acc5dc6262746a61688cd075861e6135..c48528fa2da5e7992beb9b029d2d112a8e48ba4c 100644
--- a/tensorflow/lite/nnapi/NeuralNetworksShim.h
+++ b/tensorflow/lite/nnapi/NeuralNetworksShim.h
@@ -20,6 +20,13 @@ limitations under the License.
 #include <stdio.h>
 #include <stdlib.h>
 
+#include "tensorflow/lite/nnapi/NeuralNetworksTypes.h"
+
+// This interface is now deprecated. You should use instead
+// nnapi_implementation.
+
+// TODO(b/123017568): Update all current usages of this file.
+
 // helpers
 
 #define NNAPI_LOG(format, ...) fprintf(stderr, format "\n", __VA_ARGS__);
@@ -44,8 +51,6 @@ inline void* loadLibrary(const char* name) {
   return handle;
 }
 
-typedef int (*ASharedMemory_create_fn)(const char* name, size_t size);
-
 // ASharedMemory_create was added in Android 8.0, so safe to use with NNAPI
 // which was added in 8.1.
 inline int ASharedMemory_create(const char* name, size_t size) {
@@ -54,7 +59,8 @@ inline int ASharedMemory_create(const char* name, size_t size) {
       handle != nullptr ? reinterpret_cast<ASharedMemory_create_fn>(
                               dlsym(handle, "ASharedMemory_create"))
                         : nullptr;
-  return fn(name, size);
+  int fd = fn != nullptr ? fn(name, size) : -1;
+  return fd;
 }
 
 inline void* getLibraryHandle() {
@@ -81,332 +87,6 @@ inline bool NNAPIExists() {
 // NN api types based on NNAPI header file
 // https://developer.android.com/ndk/reference/group/neural-networks
 
-/**
- * Operand types.
- *
- * The type of operands that can be added to a model.
- *
- * Although we define many types, most operators accept just a few
- * types.  Most used are ANEURALNETWORKS_TENSOR_FLOAT32,
- * ANEURALNETWORKS_TENSOR_QUANT8_ASYMM, and ANEURALNETWORKS_INT32.
- */
-enum {
-  ANEURALNETWORKS_FLOAT32 = 0,
-  ANEURALNETWORKS_INT32 = 1,
-  ANEURALNETWORKS_UINT32 = 2,
-  ANEURALNETWORKS_TENSOR_FLOAT32 = 3,
-  ANEURALNETWORKS_TENSOR_INT32 = 4,
-  ANEURALNETWORKS_TENSOR_QUANT8_ASYMM = 5,
-};
-
-/**
- * Operation types.
- *
- * The type of operations that can be added to a model.
- */
-enum {
-  ANEURALNETWORKS_ADD = 0,
-  ANEURALNETWORKS_AVERAGE_POOL_2D = 1,
-  ANEURALNETWORKS_CONCATENATION = 2,
-  ANEURALNETWORKS_CONV_2D = 3,
-  ANEURALNETWORKS_DEPTHWISE_CONV_2D = 4,
-  ANEURALNETWORKS_DEPTH_TO_SPACE = 5,
-  ANEURALNETWORKS_DEQUANTIZE = 6,
-  ANEURALNETWORKS_EMBEDDING_LOOKUP = 7,
-  ANEURALNETWORKS_FLOOR = 8,
-  ANEURALNETWORKS_FULLY_CONNECTED = 9,
-  ANEURALNETWORKS_HASHTABLE_LOOKUP = 10,
-  ANEURALNETWORKS_L2_NORMALIZATION = 11,
-  ANEURALNETWORKS_L2_POOL_2D = 12,
-  ANEURALNETWORKS_LOCAL_RESPONSE_NORMALIZATION = 13,
-  ANEURALNETWORKS_LOGISTIC = 14,
-  ANEURALNETWORKS_LSH_PROJECTION = 15,
-  ANEURALNETWORKS_LSTM = 16,
-  ANEURALNETWORKS_MAX_POOL_2D = 17,
-  ANEURALNETWORKS_MUL = 18,
-  ANEURALNETWORKS_RELU = 19,
-  ANEURALNETWORKS_RELU1 = 20,
-  ANEURALNETWORKS_RELU6 = 21,
-  ANEURALNETWORKS_RESHAPE = 22,
-  ANEURALNETWORKS_RESIZE_BILINEAR = 23,
-  ANEURALNETWORKS_RNN = 24,
-  ANEURALNETWORKS_SOFTMAX = 25,
-  ANEURALNETWORKS_SPACE_TO_DEPTH = 26,
-  ANEURALNETWORKS_SVDF = 27,
-  ANEURALNETWORKS_TANH = 28,
-  ANEURALNETWORKS_BATCH_TO_SPACE_ND = 29,
-  ANEURALNETWORKS_DIV = 30,
-  ANEURALNETWORKS_MEAN = 31,
-  ANEURALNETWORKS_PAD = 32,
-  ANEURALNETWORKS_SPACE_TO_BATCH_ND = 33,
-  ANEURALNETWORKS_SQUEEZE = 34,
-  ANEURALNETWORKS_STRIDED_SLICE = 35,
-  ANEURALNETWORKS_SUB = 36,
-  ANEURALNETWORKS_TRANSPOSE = 37,
-};
-
-/**
- * Fused activation function types.
- *
- */
-enum {
-  ANEURALNETWORKS_FUSED_NONE = 0,
-  ANEURALNETWORKS_FUSED_RELU = 1,
-  ANEURALNETWORKS_FUSED_RELU1 = 2,
-  ANEURALNETWORKS_FUSED_RELU6 = 3,
-};
-
-/**
- * Execution preferences.
- */
-enum {
-  ANEURALNETWORKS_PREFER_LOW_POWER = 0,
-  ANEURALNETWORKS_PREFER_FAST_SINGLE_ANSWER = 1,
-  ANEURALNETWORKS_PREFER_SUSTAINED_SPEED = 2,
-};
-
-/**
- * Result codes.
- */
-enum {
-  ANEURALNETWORKS_NO_ERROR = 0,
-  ANEURALNETWORKS_OUT_OF_MEMORY = 1,
-  ANEURALNETWORKS_INCOMPLETE = 2,
-  ANEURALNETWORKS_UNEXPECTED_NULL = 3,
-  ANEURALNETWORKS_BAD_DATA = 4,
-  ANEURALNETWORKS_OP_FAILED = 5,
-  ANEURALNETWORKS_UNMAPPABLE = 5,
-  ANEURALNETWORKS_BAD_STATE = 6,
-};
-
-/**
- * Implicit padding algorithms.
- */
-enum {
-  ANEURALNETWORKS_PADDING_SAME = 1,
-  ANEURALNETWORKS_PADDING_VALID = 2,
-};
-
-/**
- * ANeuralNetworksMemory is an opaque type that represents memory.
- *
- * This type is used to represent shared memory, memory mapped files,
- * and similar memories.
- *
- * By using shared memory, a program can efficiently communicate to the
- * runtime and drivers the tensors that define a model. See
- * {@link ANeuralNetworksModel_setOperandValueFromMemory}. An application
- * should typically create one shared memory object that contains every tensor
- * needed to define a model. {@link ANeuralNetworksMemory_createFromFd} can be
- * used to create shared memory from a file handle. {@link
- * ANeuralNetworksMemory_createShared} can be used to directly created shared
- * memory.
- *
- * Memory objects can also be used to specify the input and output arguments of
- * an execution. See {@link ANeuralNetworksExecution_setInputFromMemory}
- * and {@link ANeuralNetworksExecution_setOutputFromMemory}.
- */
-typedef struct ANeuralNetworksMemory ANeuralNetworksMemory;
-
-/**
- * ANeuralNetworksModel is an opaque type that contains a description of the
- * mathematical operations that constitute the model.
- *
- * <p>The model will be built by calling<ul>
- * <li>{@link ANeuralNetworksModel_create},</li>
- * <li>{@link ANeuralNetworksModel_addOperation},</li>
- * <li>{@link ANeuralNetworksModel_addOperand},</li>
- * </ul>
- *
- * A model is completed by calling {@link ANeuralNetworksModel_finish}.
- * A model is destroyed by calling {@link ANeuralNetworksModel_free}.
- *
- * <p>It is the application's responsibility to make sure that only one thread
- * modifies a model at a given time. It is however safe for more than one
- * thread to use the model once {@link ANeuralNetworksModel_finish} has
- * returned.</p>
- *
- * <p>It is also the application's responsibility to ensure that there are no
- * other uses of the model after calling {@link ANeuralNetworksModel_free}. This
- * includes any compilation or execution object created using the model.</p>
- */
-typedef struct ANeuralNetworksModel ANeuralNetworksModel;
-
-/**
- * ANeuralNetworksCompilation is an opaque type that can be used to compile
- * a machine learning model.
- *
- * <p>To use:<ul>
- *    <li>Create a new compilation instance by calling the
- *        {@link ANeuralNetworksCompilation_create} function.</li>
- *    <li>Perform the compilation with {@link
- * ANeuralNetworksCompilation_start}.</li> <li>Wait for the compilation to
- * complete with {@link ANeuralNetworksCompilation_wait}.</li> <li>Use the
- * compilation as many times as needed with {@link
- * ANeuralNetworksExecution_create}.</li> <li>Destroy the compilation with
- * {@link ANeuralNetworksCompilation_free} once all executions using the
- * compilation have completed.</li></ul></p>
- *
- * <p>A compilation cannot be modified once {@link
- * ANeuralNetworksCompilation_start} has been called on it.</p>
- *
- * <p>It is the application's responsibility to make sure that only one thread
- * modifies a compilation at a given time. It is however safe for more than one
- * thread to use {@link ANeuralNetworksCompilation_wait} at the same time.
- * It is also safe for multiple threads to use a compilation object once
- * {@link ANeuralNetworksCompilation_wait} has completed.</p>
- *
- * <p>It is also the application's responsibility to ensure that there are no
- * other uses of the compilation after calling {@link
- * ANeuralNetworksCompilation_free}. This includes any execution object created
- * using the compilation.</p>
- */
-typedef struct ANeuralNetworksCompilation ANeuralNetworksCompilation;
-
-/**
- * ANeuralNetworksExecution is an opaque type that can be used to apply a
- * machine learning model to a set of inputs.
- *
- * <p>To use:<ul>
- *    <li>Create a new execution instance by calling the
- *        {@link ANeuralNetworksExecution_create} function.</li>
- *    <li>Associate data to the model inputs with
- *        {@link ANeuralNetworksExecution_setInput} or
- *        {@link ANeuralNetworksExecution_setInputFromMemory}.</li>
- *    <li>Associate output buffers to the model outputs with
- *        {@link ANeuralNetworksExecution_setOutput} or
- *        {@link ANeuralNetworksExecution_setOutputFromMemory}.</li>
- *    <li>Apply the model with {@link
- * ANeuralNetworksExecution_startCompute}.</li> <li>Wait for the execution to
- * complete with {@link ANeuralNetworksExecution_wait}.</li> <li>Destroy the
- * execution with
- *        {@link ANeuralNetworksExecution_free}.</li></ul></p>
- *
- * <p>An execution cannot be modified once {@link
- * ANeuralNetworksExecution_start} has been called on it.</p>
- *
- * <p>An execution can be applied to a model with
- * {@link ANeuralNetworksExecution_startCompute} only once. Create new
- * executions to do new evaluations of the model.</p>
- *
- * <p>It is the application's responsibility to make sure that only one thread
- * modifies an execution at a given time. It is however safe for more than one
- * thread to use {@link ANeuralNetworksExecution_wait} at the same time.</p>
- *
- * <p>It is also the application's responsibility to ensure that there are no
- * other uses of the request after calling {@link
- * ANeuralNetworksRequest_free}.</p>
- */
-typedef struct ANeuralNetworksExecution ANeuralNetworksExecution;
-
-/**
- * ANeuralNetworksOperandType describes the type of an operand.
- * This structure is used to describe both scalars and tensors.
- */
-typedef struct ANeuralNetworksOperandType {
-  /** The data type, e.g ANEURALNETWORKS_INT8. */
-  int32_t type;
-  /** The number of dimensions. It should be 0 for scalars. */
-  uint32_t dimensionCount;
-  /** The dimensions of the tensor. It should be nullptr for scalars. */
-  const uint32_t* dimensions;
-  /** These two fields are only used for quantized tensors.
-   * They should be zero for scalars and non-fixed point tensors.
-   * The dequantized value of each entry is (value - offset) * scale.
-   */
-  float scale;
-  int32_t zeroPoint;
-} ANeuralNetworksOperandType;
-
-/**
- * ANeuralNetworksEvent is an opaque type that represents an event
- * that will be signaled once an execution completes.
- */
-typedef struct ANeuralNetworksEvent ANeuralNetworksEvent;
-
-typedef int32_t ANeuralNetworksOperationType;
-
-// nn api function types
-
-typedef int (*ANeuralNetworksMemory_createFromFd_fn)(
-    size_t size, int protect, int fd, size_t offset,
-    ANeuralNetworksMemory** memory);
-
-typedef void (*ANeuralNetworksMemory_free_fn)(ANeuralNetworksMemory* memory);
-
-typedef int (*ANeuralNetworksModel_create_fn)(ANeuralNetworksModel** model);
-
-typedef int (*ANeuralNetworksModel_finish_fn)(ANeuralNetworksModel* model);
-
-typedef void (*ANeuralNetworksModel_free_fn)(ANeuralNetworksModel* model);
-
-typedef int (*ANeuralNetworksCompilation_create_fn)(
-    ANeuralNetworksModel* model, ANeuralNetworksCompilation** compilation);
-
-typedef void (*ANeuralNetworksCompilation_free_fn)(
-    ANeuralNetworksCompilation* compilation);
-
-typedef int (*ANeuralNetworksCompilation_setPreference_fn)(
-    ANeuralNetworksCompilation* compilation, int32_t preference);
-
-typedef int (*ANeuralNetworksCompilation_finish_fn)(
-    ANeuralNetworksCompilation* compilation);
-
-typedef int (*ANeuralNetworksModel_addOperand_fn)(
-    ANeuralNetworksModel* model, const ANeuralNetworksOperandType* type);
-
-typedef int (*ANeuralNetworksModel_setOperandValue_fn)(
-    ANeuralNetworksModel* model, int32_t index, const void* buffer,
-    size_t length);
-
-typedef int (*ANeuralNetworksModel_setOperandValueFromMemory_fn)(
-    ANeuralNetworksModel* model, int32_t index,
-    const ANeuralNetworksMemory* memory, size_t offset, size_t length);
-
-typedef int (*ANeuralNetworksModel_addOperation_fn)(
-    ANeuralNetworksModel* model, ANeuralNetworksOperationType type,
-    uint32_t inputCount, const uint32_t* inputs, uint32_t outputCount,
-    const uint32_t* outputs);
-
-typedef int (*ANeuralNetworksModel_identifyInputsAndOutputs_fn)(
-    ANeuralNetworksModel* model, uint32_t inputCount, const uint32_t* inputs,
-    uint32_t outputCount, const uint32_t* outputs);
-
-typedef int (*ANeuralNetworksModel_relaxComputationFloat32toFloat16_fn)(
-    ANeuralNetworksModel* model, bool allow);
-
-typedef int (*ANeuralNetworksExecution_create_fn)(
-    ANeuralNetworksCompilation* compilation,
-    ANeuralNetworksExecution** execution);
-
-typedef void (*ANeuralNetworksExecution_free_fn)(
-    ANeuralNetworksExecution* execution);
-
-typedef int (*ANeuralNetworksExecution_setInput_fn)(
-    ANeuralNetworksExecution* execution, int32_t index,
-    const ANeuralNetworksOperandType* type, const void* buffer, size_t length);
-
-typedef int (*ANeuralNetworksExecution_setInputFromMemory_fn)(
-    ANeuralNetworksExecution* execution, int32_t index,
-    const ANeuralNetworksOperandType* type, const ANeuralNetworksMemory* memory,
-    size_t offset, size_t length);
-
-typedef int (*ANeuralNetworksExecution_setOutput_fn)(
-    ANeuralNetworksExecution* execution, int32_t index,
-    const ANeuralNetworksOperandType* type, void* buffer, size_t length);
-
-typedef int (*ANeuralNetworksExecution_setOutputFromMemory_fn)(
-    ANeuralNetworksExecution* execution, int32_t index,
-    const ANeuralNetworksOperandType* type, const ANeuralNetworksMemory* memory,
-    size_t offset, size_t length);
-
-typedef int (*ANeuralNetworksExecution_startCompute_fn)(
-    ANeuralNetworksExecution* execution, ANeuralNetworksEvent** event);
-
-typedef int (*ANeuralNetworksEvent_wait_fn)(ANeuralNetworksEvent* event);
-
-typedef void (*ANeuralNetworksEvent_free_fn)(ANeuralNetworksEvent* event);
-
 /**
  * Creates a shared memory object from a file descriptor.
  *
@@ -576,6 +256,32 @@ inline int ANeuralNetworksModel_setOperandValue(ANeuralNetworksModel* model,
   EXECUTE_FUNCTION_RETURN(model, index, buffer, length);
 }
 
+/**
+ * Sets an operand's per channel quantization parameters.
+ *
+ * Sets parameters required by a tensor of type
+ * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}.
+ * This function must be called for every tensor of type
+ * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} before
+ * calling {@link ANeuralNetworksModel_finish}.
+ *
+ * Available since API level 29.
+ *
+ * @param model The model to be modified.
+ * @param index The index of the model operand we're setting.
+ * @param channelQuant The per channel quantization parameters for the operand.
+ *                    No memory in this struct needs to outlive the call to
+ *                    this function.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ */
+inline int ANeuralNetworksModel_setOperandSymmPerChannelQuantParams(
+    ANeuralNetworksModel* model, int32_t index,
+    const ANeuralNetworksSymmPerChannelQuantParams* channelQuant) {
+  LOAD_FUNCTION(ANeuralNetworksModel_setOperandSymmPerChannelQuantParams);
+  EXECUTE_FUNCTION_RETURN(model, index, channelQuant);
+}
+
 /**
  * Sets an operand to a value stored in a memory object.
  *
@@ -1007,6 +713,445 @@ inline void ANeuralNetworksEvent_free(ANeuralNetworksEvent* event) {
   EXECUTE_FUNCTION(event);
 }
 
+/**
+ * Get the number of available devices.
+ *
+ * @param numDevices Used to return the number of devices.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ *
+ * Available since API level 29.
+ */
+inline int ANeuralNetworks_getDeviceCount(uint32_t* numDevices) {
+  LOAD_FUNCTION(ANeuralNetworks_getDeviceCount);
+  EXECUTE_FUNCTION_RETURN(numDevices);
+}
+
+/**
+ * Get the representation of the specified device.
+ *
+ * @param devIndex The index of the specified device. Must be less than the
+ *                 number of available devices.
+ * @param device The representation of the specified device.
+ *               The same representation will always be returned for the
+ *               specified device.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ *
+ * Available since API level 29.
+ */
+
+inline int ANeuralNetworks_getDevice(uint32_t devIndex,
+                                     ANeuralNetworksDevice** device) {
+  LOAD_FUNCTION(ANeuralNetworks_getDevice);
+  EXECUTE_FUNCTION_RETURN(devIndex, device);
+}
+
+/**
+ * Get the name of the specified device.
+ *
+ * @param device The representation of the specified device.
+ * @param name   The returned name of the specified device. The name will be in
+ *               UTF-8 and will be null-terminated. It will be recognizable as a
+ *               known device name rather than a cryptic string. For devices
+ *               with API level 29 and above, the format of the name is
+ *               {VENDOR}-{DEVICE}, e.g. “google-ipu”. For devices with feature
+ *               level 28 or lower, the name will always be “unknown-device”.
+ *               The name will remain valid for the duration of the application.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ *
+ * Available since API level 29.
+ */
+inline int ANeuralNetworksDevice_getName(const ANeuralNetworksDevice* device,
+                                         const char** name) {
+  LOAD_FUNCTION(ANeuralNetworksDevice_getName);
+  EXECUTE_FUNCTION_RETURN(device, name);
+}
+
+/**
+ * Get the version of the driver implementation of the specified device.
+ *
+ * It’s the responsibility of the driver implementor to insure that this version
+ * string uniquely distinguishes this implementation from all previous
+ * implementations.
+ *
+ * This version string must not be confused with the feature level which is
+ * solely defined by {@link ANeuralNetworksDevice_getFeatureLevel}. There is no
+ * implicit ordering of the versions. For example, it is not possible to filter
+ * all drivers older than a certain version.
+ *
+ * Application developers may use this version string to avoid or prefer
+ * specific driver implementations. For example, an application may want to do
+ * so because:
+ *     - A specific version of the driver does not provide the required
+ * performance, perhaps because of a performance regression.
+ *     - A specific version of the driver has a bug or returns results that
+ * don’t match the minimum precision requirement for the application.
+ *
+ * @param device  The representation of the specified device.
+ * @param version The returned version string of the driver for the specified
+ *                device. The string will be in UTF-8 and will be
+ *                null-terminated. For devices with feature level 28 or lower,
+ *                "UNKNOWN" will be returned. The version string will remain
+ *                valid for the duration of the application.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ *
+ * Available since API level 29.
+ */
+inline int ANeuralNetworksDevice_getVersion(const ANeuralNetworksDevice* device,
+                                            const char** version) {
+  LOAD_FUNCTION(ANeuralNetworksDevice_getVersion);
+  EXECUTE_FUNCTION_RETURN(device, version);
+}
+
+/**
+ * Get the supported NNAPI version of the specified device.
+ *
+ * Each device has a supported feature level, which is the most advanced feature
+ * this driver implements. For example, if the driver implements the features
+ * introduced in Android P, but does not implement the features introduced after
+ * Android P, the value would be 28. Developers could decide whether or not the
+ * specified device should be used for a Model that has certain feature
+ * requirements.
+ *
+ * @param device       The representation of the specified device.
+ * @param featureLevel The API level of the most advanced feature this driver
+ *                     implements.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ *
+ * Available since API level 29.
+ */
+inline int ANeuralNetworksDevice_getFeatureLevel(
+    const ANeuralNetworksDevice* device, int64_t* featureLevel) {
+  LOAD_FUNCTION(ANeuralNetworksDevice_getFeatureLevel);
+  EXECUTE_FUNCTION_RETURN(device, featureLevel);
+}
+
+/**
+ * Get the supported operations for a specified set of devices. If multiple
+ * devices are selected, the supported operation list is a union of supported
+ * operations of all selected devices.
+ *
+ * @param model        The model to be queried.
+ * @param devices      The set of devices. Must not contain duplicates.
+ * @param numDevices   The number of devices in the set.
+ * @param supportedOps The boolean array to be filled. True means supported. The
+ *                     size of the boolean array must be at least as large as
+ *                     the number of operations in the model. The order of
+ *                     elements in the supportedOps array matches the order in
+ *                     which the corresponding operations were added to the
+ *                     model.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ *
+ * Available since API level 29.
+ */
+inline int ANeuralNetworksModel_getSupportedOperationsForDevices(
+    const ANeuralNetworksModel* model,
+    const ANeuralNetworksDevice* const* devices, uint32_t numDevices,
+    bool* supportedOps) {
+  LOAD_FUNCTION(ANeuralNetworksModel_getSupportedOperationsForDevices);
+  EXECUTE_FUNCTION_RETURN(model, devices, numDevices, supportedOps);
+}
+
+/**
+ * Create a {@link ANeuralNetworksCompilation} to compile the given model for a
+ * specified set of devices. If more than one device is specified, the
+ * compilation will distribute the workload automatically across the devices.
+ * The model must be fully supported by the specified set of devices. This means
+ * that ANeuralNetworksModel_getSupportedOperationsForDevices() must have
+ * returned true for every operation for that model/devices pair.
+ *
+ * @param model       The {@link ANeuralNetworksModel} to be compiled.
+ * @param devices     The set of devices. Must not contain duplicates.
+ * @param numDevices  The number of devices in the set.
+ * @param compilation The newly created object or NULL if unsuccessful.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA
+ *         if the model is invalid.
+ *
+ * Available since API level 29.
+ */
+inline int ANeuralNetworksCompilation_createForDevices(
+    ANeuralNetworksModel* model, const ANeuralNetworksDevice* const* devices,
+    uint32_t numDevices, ANeuralNetworksCompilation** compilation) {
+  LOAD_FUNCTION(ANeuralNetworksCompilation_createForDevices);
+  EXECUTE_FUNCTION_RETURN(model, devices, numDevices, compilation);
+}
+
+/**
+ * Sets the compilation caching signature and the cache directory.
+ *
+ * Provides optional caching information to the runtime for faster repeated
+ * compilation.
+ *
+ * See {@link ANeuralNetworksCompilation} for information on multithreaded
+ * usage.
+ *
+ * @param compilation The compilation to be modified.
+ * @param cacheDir The cache directory to store and retrieve caching data. It is
+ *                 recommended to use the code_cache provided by the Android
+ *                 runtime. If not using the code_cache, the user should choose
+ *                 a directory local to the application, and is responsible to
+ *                 manage and clean the cache entries.
+ * @param token The token provided by the user to specify a model, must be of
+ *              length ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN. The user should
+ *              ensure that the token is unique to a model within the
+ *              application. The NNAPI runtime will not detected token
+ *              collisions. If there is a collision, the compilation outcome may
+ *              be incorrect without notifying with error.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ *
+ * Available since API level 29.
+ */
+inline int ANeuralNetworksCompilation_setCaching(
+    ANeuralNetworksCompilation* compilation, const char* cacheDir,
+    const uint8_t* token) {
+  LOAD_FUNCTION(ANeuralNetworksCompilation_setCaching);
+  EXECUTE_FUNCTION_RETURN(compilation, cacheDir, token);
+}
+
+/**
+ * Schedule synchronous evaluation of the execution.
+ *
+ * <p>Schedules synchronous evaluation of the execution. Returns once the
+ * execution has completed and the outputs are ready to be consumed.
+ * </p>
+ *
+ * See {@link ANeuralNetworksExecution} for information on multithreaded usage.
+ *
+ * See {@link ANeuralNetworksExecution_startCompute} for asynchronous execution.
+ * Synchronous execution incurs lower overhead than asynchronous execution.
+ *
+ * Available since API level 29.
+ *
+ * @param execution The execution to be scheduled and executed.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if the execution completed normally.
+ *         ANEURALNETWORKS_UNMAPPABLE if the execution input or output memory
+ *         cannot be properly mapped.
+ */
+inline int ANeuralNetworksExecution_compute(
+    ANeuralNetworksExecution* execution) {
+  LOAD_FUNCTION(ANeuralNetworksExecution_compute);
+  EXECUTE_FUNCTION_RETURN(execution);
+}
+
+/**
+ * Get the dimensional information of the specified output operand of the model
+ * of the
+ * {@link ANeuralNetworksExecution}.
+ *
+ * On asynchronous execution initiated by {@link
+ * ANeuralNetworksExecution_startCompute},
+ * {@link ANeuralNetworksEvent_wait} must be called prior to this function to
+ * recuperate the resources used by the execution.
+ *
+ * @param execution The execution to be queried.
+ * @param index The index of the output argument we are querying. It is
+ *              an index into the lists passed to
+ *              {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is not
+ *              the index associated with {@link
+ * ANeuralNetworksModel_addOperand}.
+ * @param rank The rank of the output operand.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful,
+ * ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE if the target output is provided an
+ * insufficient buffer at execution time, ANEURALNETWORKS_BAD_DATA if the index
+ * is invalid.
+ *
+ * Available since API level 29.
+ */
+inline int ANeuralNetworksExecution_getOutputOperandRank(
+    ANeuralNetworksExecution* execution, int32_t index, uint32_t* rank) {
+  LOAD_FUNCTION(ANeuralNetworksExecution_getOutputOperandRank);
+  EXECUTE_FUNCTION_RETURN(execution, index, rank);
+}
+
+/**
+ * Get the dimensional information of the specified output operand of the model
+ * of the
+ * {@link ANeuralNetworksExecution}. The target output operand cannot be a
+ * scalar.
+ *
+ * On asynchronous execution initiated by
+ * {@link ANeuralNetworksExecution_startCompute},
+ * {@link ANeuralNetworksEvent_wait} must be called prior to this function to
+ * recuperate the resources used by the execution.
+ *
+ * @param execution The execution to be queried.
+ * @param index The index of the output argument we are querying. It is an index
+ *              into the lists passed to
+ *              {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is not
+ *              the index associated with
+ *              {@link ANeuralNetworksModel_addOperand}.
+ * @param dimensions The dimension array to be filled. The size of the array
+ *                   must be exactly as large as the rank of the output operand
+ *                   to be queried in the model.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful,
+ * ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE if the target output is provided an
+ * insufficient buffer at execution time, ANEURALNETWORKS_BAD_DATA if the index
+ * is invalid or if the target is a scalar.
+ *
+ * Available since API level 29.
+ */
+inline int ANeuralNetworksExecution_getOutputOperandDimensions(
+    ANeuralNetworksExecution* execution, int32_t index, uint32_t* dimensions) {
+  LOAD_FUNCTION(ANeuralNetworksExecution_getOutputOperandDimensions);
+  EXECUTE_FUNCTION_RETURN(execution, index, dimensions);
+}
+
+/**
+ * Create a {@link ANeuralNetworksBurst} to apply the given compilation.
+ * This only creates the burst object. Computation is only performed once
+ * {@link ANeuralNetworksExecution_burstCompute} is invoked with a valid
+ * {@link ANeuralNetworksExecution} and {@link ANeuralNetworksBurst}.
+ *
+ * <p>The provided compilation must outlive the burst object.</p>
+ *
+ * Available since API level 29.
+ *
+ * @param compilation The {@link ANeuralNetworksCompilation} to be evaluated.
+ * @param burst The newly created object or NULL if unsuccessful.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA
+ *         if the compilation is invalid.
+ */
+inline int ANeuralNetworksBurst_create(ANeuralNetworksCompilation* compilation,
+                                       ANeuralNetworksBurst** burst) {
+  LOAD_FUNCTION(ANeuralNetworksBurst_create);
+  EXECUTE_FUNCTION_RETURN(compilation, burst);
+}
+
+/**
+ * Destroys the burst object.
+ *
+ * Available since API level 29.
+ *
+ * @param burst The burst object to be destroyed. Passing NULL is acceptable and
+ *              results in no operation.
+ */
+inline void ANeuralNetworksBurst_free(ANeuralNetworksBurst* burst) {
+  LOAD_FUNCTION(ANeuralNetworksBurst_free);
+  EXECUTE_FUNCTION(burst);
+}
+
+/**
+ * Schedule synchronous evaluation of the execution on a burst object.
+ *
+ * <p>Schedules synchronous evaluation of the execution. Returns once the
+ * execution has completed and the outputs are ready to be consumed.</p>
+ *
+ * <p>There must be at most one {@link ANeuralNetworksExecution} processing at
+ * any given time for any given burst object. Any
+ * {@link ANeuralNetworksExecution} launched before the previous has finished
+ * will result in ANEURALNETWORKS_BAD_STATE.</p>
+ *
+ * Available since API level 29.
+ *
+ * @param burst The burst object to execute on.
+ * @param execution The execution to be scheduled and executed. The execution
+ *                  must be created from the same {@link
+ *                  ANeuralNetworksCompilation} as the burst object.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if the execution completed normally.
+ */
+inline int ANeuralNetworksExecution_burstCompute(
+    ANeuralNetworksExecution* execution, ANeuralNetworksBurst* burst) {
+  LOAD_FUNCTION(ANeuralNetworksExecution_burstCompute);
+  EXECUTE_FUNCTION_RETURN(execution, burst);
+}
+
+/**
+ * Creates a shared memory object from an AHardwareBuffer handle.
+ *
+ * If the shared memory is backed by an AHardwareBuffer of
+ * AHARDWAREBUFFER_FORMAT_BLOB format, it can be used the same way as shared
+ * memory created from a file handle. See
+ * {@link ANeuralNetworksMemory} for a description on how to use this shared
+ * memory.
+ *
+ * If the shared memory is backed by an AHardwareBuffer of a format other than
+ * AHARDWAREBUFFER_FORMAT_BLOB, it can only be used for Model inputs and
+ * outputs. When calling {@link ANeuralNetworksExecution_setInputFromMemory} or
+ * {@link ANeuralNetworksExecution_setOutputFromMemory} with the shared memory,
+ * both offset and length must be set to zero and the entire memory region will
+ * be associated with the specified input or output operand. There is no
+ * guarantee that an arbitrary AHardwareBuffer_Format and
+ * AHardwareBuffer_UsageFlags combination can be used by arbitrary devices. The
+ * execution will fail if selected set of devices cannot consume the buffer.
+ *
+ * Calling {@link ANeuralNetworksModel_setOperandValueFromMemory} with shared
+ * memory backed by an AHardwareBuffer of a format other than
+ * AHARDWAREBUFFER_FORMAT_BLOB is disallowed.
+ *
+ * TODO(miaowang): add documentation about intended usage with introspection
+ * API.
+ *
+ * Available since API level 29.
+ *
+ * @param ahwb The AHardwareBuffer handle.
+ * @param memory The memory object to be created.
+ *               Set to NULL if unsuccessful.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if the request completed normally.
+ *
+ * @see AHardwareBuffer
+ */
+inline int ANeuralNetworksMemory_createFromAHardwareBuffer(
+    const AHardwareBuffer* ahwb, ANeuralNetworksMemory** memory) {
+  LOAD_FUNCTION(ANeuralNetworksMemory_createFromAHardwareBuffer);
+  EXECUTE_FUNCTION_RETURN(ahwb, memory);
+}
+
+/**
+ * Specifies whether duration of the {@link ANeuralNetworksExecution} is to be
+ * measured. By default, duration is not measured.
+ *
+ * The {@link ANeuralNetworksExecution} must have been created with
+ * {@link ANeuralNetworksCompilation_createForDevices} with numDevices = 1.
+ *
+ * See {@link ANeuralNetworksExecution} for information on multithreaded usage.
+ *
+ * Available since API level 29.
+ *
+ * @param execution The execution to be modified.
+ * @param measure 'true' if duration is to be measured, 'false' if not.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ */
+inline int ANeuralNetworksExecution_setMeasureTiming(
+    ANeuralNetworksExecution* execution, bool measure) {
+  LOAD_FUNCTION(ANeuralNetworksExecution_setMeasureTiming);
+  EXECUTE_FUNCTION_RETURN(execution, measure);
+}
+
+/**
+ * Get the time spent in the specified {@link ANeuralNetworksExecution}, in
+ * nanoseconds. The execution must have completed.
+ *
+ * @param execution The execution to be queried.
+ * @param durationCode The measurement to be queried, specified by {@link
+ * DurationCode}.
+ * @param duration The returned duration. If no measurement was requested by
+ *                 {@link ANeuralNetworksExecution_setMeasureTiming}, or for
+ * some other reason the duration is not available, UINT64_MAX will be returned.
+ *                 A particular device need not support any given measurement.
+ *
+ * @return ANEURALNETWORKS_NO_ERROR if successful.
+ */
+inline int ANeuralNetworksExecution_getDuration(
+    const ANeuralNetworksExecution* execution, int32_t durationCode,
+    uint64_t* duration) {
+  LOAD_FUNCTION(ANeuralNetworksExecution_getDuration);
+  EXECUTE_FUNCTION_RETURN(execution, durationCode, duration);
+}
+
 /**/
 
 #endif  // TENSORFLOW_LITE_NNAPI_NEURALNETWORKSSHIM_H_
diff --git a/tensorflow/lite/nnapi/NeuralNetworksTypes.h b/tensorflow/lite/nnapi/NeuralNetworksTypes.h
new file mode 100644
index 0000000000000000000000000000000000000000..ba7eaf67633c3d1d3cc8c4aac814ef2e5a1fcd8b
--- /dev/null
+++ b/tensorflow/lite/nnapi/NeuralNetworksTypes.h
@@ -0,0 +1,484 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_NNAPI_NEURALNETWORKSTYPES_H_
+#define TENSORFLOW_LITE_NNAPI_NEURALNETWORKSTYPES_H_
+
+#include <stdint.h>
+#include <stdio.h>
+
+typedef struct AHardwareBuffer AHardwareBuffer;
+
+// NN api types based on NNAPI header file
+// https://developer.android.com/ndk/reference/group/neural-networks
+
+/**
+ * Operand types.
+ *
+ * The type of operands that can be added to a model.
+ *
+ * Although we define many types, most operators accept just a few
+ * types.  Most used are ANEURALNETWORKS_TENSOR_FLOAT32,
+ * ANEURALNETWORKS_TENSOR_QUANT8_ASYMM, and ANEURALNETWORKS_INT32.
+ */
+enum {
+  ANEURALNETWORKS_FLOAT32 = 0,
+  ANEURALNETWORKS_INT32 = 1,
+  ANEURALNETWORKS_UINT32 = 2,
+  ANEURALNETWORKS_TENSOR_FLOAT32 = 3,
+  ANEURALNETWORKS_TENSOR_INT32 = 4,
+  ANEURALNETWORKS_TENSOR_QUANT8_ASYMM = 5,
+  ANEURALNETWORKS_TENSOR_QUANT8_SYMM = 13,
+};
+
+/**
+ * Operation types.
+ *
+ * The type of operations that can be added to a model.
+ */
+enum {
+  ANEURALNETWORKS_ADD = 0,
+  ANEURALNETWORKS_AVERAGE_POOL_2D = 1,
+  ANEURALNETWORKS_CONCATENATION = 2,
+  ANEURALNETWORKS_CONV_2D = 3,
+  ANEURALNETWORKS_DEPTHWISE_CONV_2D = 4,
+  ANEURALNETWORKS_DEPTH_TO_SPACE = 5,
+  ANEURALNETWORKS_DEQUANTIZE = 6,
+  ANEURALNETWORKS_EMBEDDING_LOOKUP = 7,
+  ANEURALNETWORKS_FLOOR = 8,
+  ANEURALNETWORKS_FULLY_CONNECTED = 9,
+  ANEURALNETWORKS_HASHTABLE_LOOKUP = 10,
+  ANEURALNETWORKS_L2_NORMALIZATION = 11,
+  ANEURALNETWORKS_L2_POOL_2D = 12,
+  ANEURALNETWORKS_LOCAL_RESPONSE_NORMALIZATION = 13,
+  ANEURALNETWORKS_LOGISTIC = 14,
+  ANEURALNETWORKS_LSH_PROJECTION = 15,
+  ANEURALNETWORKS_LSTM = 16,
+  ANEURALNETWORKS_MAX_POOL_2D = 17,
+  ANEURALNETWORKS_MUL = 18,
+  ANEURALNETWORKS_RELU = 19,
+  ANEURALNETWORKS_RELU1 = 20,
+  ANEURALNETWORKS_RELU6 = 21,
+  ANEURALNETWORKS_RESHAPE = 22,
+  ANEURALNETWORKS_RESIZE_BILINEAR = 23,
+  ANEURALNETWORKS_RNN = 24,
+  ANEURALNETWORKS_SOFTMAX = 25,
+  ANEURALNETWORKS_SPACE_TO_DEPTH = 26,
+  ANEURALNETWORKS_SVDF = 27,
+  ANEURALNETWORKS_TANH = 28,
+  ANEURALNETWORKS_BATCH_TO_SPACE_ND = 29,
+  ANEURALNETWORKS_DIV = 30,
+  ANEURALNETWORKS_MEAN = 31,
+  ANEURALNETWORKS_PAD = 32,
+  ANEURALNETWORKS_SPACE_TO_BATCH_ND = 33,
+  ANEURALNETWORKS_SQUEEZE = 34,
+  ANEURALNETWORKS_STRIDED_SLICE = 35,
+  ANEURALNETWORKS_SUB = 36,
+  ANEURALNETWORKS_TRANSPOSE = 37,
+};
+
+/**
+ * Fused activation function types.
+ *
+ */
+enum {
+  ANEURALNETWORKS_FUSED_NONE = 0,
+  ANEURALNETWORKS_FUSED_RELU = 1,
+  ANEURALNETWORKS_FUSED_RELU1 = 2,
+  ANEURALNETWORKS_FUSED_RELU6 = 3,
+};
+
+/**
+ * Execution preferences.
+ */
+enum {
+  ANEURALNETWORKS_PREFER_LOW_POWER = 0,
+  ANEURALNETWORKS_PREFER_FAST_SINGLE_ANSWER = 1,
+  ANEURALNETWORKS_PREFER_SUSTAINED_SPEED = 2,
+};
+
+/**
+ * Result codes.
+ */
+enum {
+  ANEURALNETWORKS_NO_ERROR = 0,
+  ANEURALNETWORKS_OUT_OF_MEMORY = 1,
+  ANEURALNETWORKS_INCOMPLETE = 2,
+  ANEURALNETWORKS_UNEXPECTED_NULL = 3,
+  ANEURALNETWORKS_BAD_DATA = 4,
+  ANEURALNETWORKS_OP_FAILED = 5,
+  ANEURALNETWORKS_BAD_STATE = 6,
+  ANEURALNETWORKS_UNMAPPABLE = 7,
+  ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE = 8,
+  ANEURALNETWORKS_UNAVAILABLE_DEVICE = 9,
+};
+
+/**
+ * Implicit padding algorithms.
+ */
+enum {
+  ANEURALNETWORKS_PADDING_SAME = 1,
+  ANEURALNETWORKS_PADDING_VALID = 2,
+};
+
+/**
+ * ANeuralNetworksMemory is an opaque type that represents memory.
+ *
+ * This type is used to represent shared memory, memory mapped files,
+ * and similar memories.
+ *
+ * By using shared memory, a program can efficiently communicate to the
+ * runtime and drivers the tensors that define a model. See
+ * {@link ANeuralNetworksModel_setOperandValueFromMemory}. An application
+ * should typically create one shared memory object that contains every tensor
+ * needed to define a model. {@link ANeuralNetworksMemory_createFromFd} can be
+ * used to create shared memory from a file handle. {@link
+ * ANeuralNetworksMemory_createShared} can be used to directly created shared
+ * memory.
+ *
+ * Memory objects can also be used to specify the input and output arguments of
+ * an execution. See {@link ANeuralNetworksExecution_setInputFromMemory}
+ * and {@link ANeuralNetworksExecution_setOutputFromMemory}.
+ */
+typedef struct ANeuralNetworksMemory ANeuralNetworksMemory;
+
+/**
+ * ANeuralNetworksModel is an opaque type that contains a description of the
+ * mathematical operations that constitute the model.
+ *
+ * <p>The model will be built by calling<ul>
+ * <li>{@link ANeuralNetworksModel_create},</li>
+ * <li>{@link ANeuralNetworksModel_addOperation},</li>
+ * <li>{@link ANeuralNetworksModel_addOperand},</li>
+ * </ul>
+ *
+ * A model is completed by calling {@link ANeuralNetworksModel_finish}.
+ * A model is destroyed by calling {@link ANeuralNetworksModel_free}.
+ *
+ * <p>It is the application's responsibility to make sure that only one thread
+ * modifies a model at a given time. It is however safe for more than one
+ * thread to use the model once {@link ANeuralNetworksModel_finish} has
+ * returned.</p>
+ *
+ * <p>It is also the application's responsibility to ensure that there are no
+ * other uses of the model after calling {@link ANeuralNetworksModel_free}. This
+ * includes any compilation or execution object created using the model.</p>
+ */
+typedef struct ANeuralNetworksModel ANeuralNetworksModel;
+
+/**
+ * ANeuralNetworksCompilation is an opaque type that can be used to compile
+ * a machine learning model.
+ *
+ * <p>To use:<ul>
+ *    <li>Create a new compilation instance by calling the
+ *        {@link ANeuralNetworksCompilation_create} function.</li>
+ *    <li>Perform the compilation with {@link
+ * ANeuralNetworksCompilation_start}.</li> <li>Wait for the compilation to
+ * complete with {@link ANeuralNetworksCompilation_wait}.</li> <li>Use the
+ * compilation as many times as needed with {@link
+ * ANeuralNetworksExecution_create}.</li> <li>Destroy the compilation with
+ * {@link ANeuralNetworksCompilation_free} once all executions using the
+ * compilation have completed.</li></ul></p>
+ *
+ * <p>A compilation cannot be modified once {@link
+ * ANeuralNetworksCompilation_start} has been called on it.</p>
+ *
+ * <p>It is the application's responsibility to make sure that only one thread
+ * modifies a compilation at a given time. It is however safe for more than one
+ * thread to use {@link ANeuralNetworksCompilation_wait} at the same time.
+ * It is also safe for multiple threads to use a compilation object once
+ * {@link ANeuralNetworksCompilation_wait} has completed.</p>
+ *
+ * <p>It is also the application's responsibility to ensure that there are no
+ * other uses of the compilation after calling {@link
+ * ANeuralNetworksCompilation_free}. This includes any execution object created
+ * using the compilation.</p>
+ */
+typedef struct ANeuralNetworksCompilation ANeuralNetworksCompilation;
+
+/**
+ * ANeuralNetworksExecution is an opaque type that can be used to apply a
+ * machine learning model to a set of inputs.
+ *
+ * <p>To use:<ul>
+ *    <li>Create a new execution instance by calling the
+ *        {@link ANeuralNetworksExecution_create} function.</li>
+ *    <li>Associate data to the model inputs with
+ *        {@link ANeuralNetworksExecution_setInput} or
+ *        {@link ANeuralNetworksExecution_setInputFromMemory}.</li>
+ *    <li>Associate output buffers to the model outputs with
+ *        {@link ANeuralNetworksExecution_setOutput} or
+ *        {@link ANeuralNetworksExecution_setOutputFromMemory}.</li>
+ *    <li>Apply the model with {@link
+ * ANeuralNetworksExecution_startCompute}.</li> <li>Wait for the execution to
+ * complete with {@link ANeuralNetworksExecution_wait}.</li> <li>Destroy the
+ * execution with
+ *        {@link ANeuralNetworksExecution_free}.</li></ul></p>
+ *
+ * <p>An execution cannot be modified once {@link
+ * ANeuralNetworksExecution_start} has been called on it.</p>
+ *
+ * <p>An execution can be applied to a model with
+ * {@link ANeuralNetworksExecution_startCompute} only once. Create new
+ * executions to do new evaluations of the model.</p>
+ *
+ * <p>It is the application's responsibility to make sure that only one thread
+ * modifies an execution at a given time. It is however safe for more than one
+ * thread to use {@link ANeuralNetworksExecution_wait} at the same time.</p>
+ *
+ * <p>It is also the application's responsibility to ensure that there are no
+ * other uses of the request after calling {@link
+ * ANeuralNetworksRequest_free}.</p>
+ */
+typedef struct ANeuralNetworksExecution ANeuralNetworksExecution;
+
+/**
+ * Parameters for ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL operand.
+ */
+typedef struct ANeuralNetworksSymmPerChannelQuantParams {
+  /* The index of the channel dimension. */
+  uint32_t channelDim;
+  /** The size of the scale array. Should be equal to dimension[channelDim] of
+   * the Operand. */
+  uint32_t scaleCount;
+  /** The array of scaling values for each channel. Each value must be greater
+   * than zero. */
+  const float* scales;
+} ANeuralNetworksSymmPerChannelQuantParams;
+
+/**
+ * ANeuralNetworksBurst is an opaque type that can be used to reduce the latency
+ * of a rapid sequence of executions. It will likely cause overhead if only used
+ * for a single execution.
+ *
+ * ANeuralNetworksBurst serves as a context object for any number of inferences
+ * using {@link ANeuralNetworksExecution} objects. An ANeuralNetworksBurst
+ * object and the {@link ANeuralNetworksExecution} objects used with it must all
+ * have been created from the same {@link ANeuralNetworksCompilation} object.
+ *
+ * This object is also used as a hint to drivers, providing insight to the
+ * lifetime of a rapid sequence of executions. For example, a driver may choose
+ * to increase the clock frequency of its accelerator for the lifetime of a
+ * burst object.
+ *
+ * <p>To use:<ul>
+ *    <li>Create a new burst object by calling the
+ *        {@link ANeuralNetworksBurst_create} function.</li>
+ *    <li>For each execution:</li><ul>
+ *        <li>Create {@link ANeuralNetworksExecution} and configure its
+ *            properties (see {@link ANeuralNetworksExecution} for
+ * details).</li> <li>Apply the model synchronously with
+ *            {@link ANeuralNetworksExecution_burstCompute}, reusing the same
+ *            {@link ANeuralNetworksBurst} with the new
+ *            {@link ANeuralNetworksExecution}.</li>
+ *        <li>Use and free the {@link ANeuralNetworksExecution}.</li></ul>
+ *    <li>Destroy the burst with
+ *        {@link ANeuralNetworksBurst_free}.</li></ul></p>
+ *
+ * Available since API level 29.
+ */
+typedef struct ANeuralNetworksBurst ANeuralNetworksBurst;
+
+/**
+ * ANeuralNetworksOperandType describes the type of an operand.
+ * This structure is used to describe both scalars and tensors.
+ */
+typedef struct ANeuralNetworksOperandType {
+  /** The data type, e.g ANEURALNETWORKS_INT8. */
+  int32_t type;
+  /** The number of dimensions. It should be 0 for scalars. */
+  uint32_t dimensionCount;
+  /** The dimensions of the tensor. It should be nullptr for scalars. */
+  const uint32_t* dimensions;
+  /** These two fields are only used for quantized tensors.
+   * They should be zero for scalars and non-fixed point tensors.
+   * The dequantized value of each entry is (value - offset) * scale.
+   */
+  float scale;
+  int32_t zeroPoint;
+} ANeuralNetworksOperandType;
+
+/**
+ * ANeuralNetworksEvent is an opaque type that represents an event
+ * that will be signaled once an execution completes.
+ */
+typedef struct ANeuralNetworksEvent ANeuralNetworksEvent;
+
+typedef int32_t ANeuralNetworksOperationType;
+
+/**
+ * ANeuralNetworksDevice is an opaque type that represents a device.
+ *
+ * This type is used to query basic properties and supported operations of the
+ * corresponding device, and control which device(s) a model is to be run on.
+ *
+ * Available since API level 29.
+ */
+typedef struct ANeuralNetworksDevice ANeuralNetworksDevice;
+
+// nn api function types
+
+typedef int (*ANeuralNetworksMemory_createFromFd_fn)(
+    size_t size, int protect, int fd, size_t offset,
+    ANeuralNetworksMemory** memory);
+
+typedef void (*ANeuralNetworksMemory_free_fn)(ANeuralNetworksMemory* memory);
+
+typedef int (*ANeuralNetworksModel_create_fn)(ANeuralNetworksModel** model);
+
+typedef int (*ANeuralNetworksModel_finish_fn)(ANeuralNetworksModel* model);
+
+typedef void (*ANeuralNetworksModel_free_fn)(ANeuralNetworksModel* model);
+
+typedef int (*ANeuralNetworksCompilation_create_fn)(
+    ANeuralNetworksModel* model, ANeuralNetworksCompilation** compilation);
+
+typedef void (*ANeuralNetworksCompilation_free_fn)(
+    ANeuralNetworksCompilation* compilation);
+
+typedef int (*ANeuralNetworksCompilation_setPreference_fn)(
+    ANeuralNetworksCompilation* compilation, int32_t preference);
+
+typedef int (*ANeuralNetworksCompilation_finish_fn)(
+    ANeuralNetworksCompilation* compilation);
+
+typedef int (*ANeuralNetworksModel_addOperand_fn)(
+    ANeuralNetworksModel* model, const ANeuralNetworksOperandType* type);
+
+typedef int (*ANeuralNetworksModel_setOperandValue_fn)(
+    ANeuralNetworksModel* model, int32_t index, const void* buffer,
+    size_t length);
+
+typedef int (*ANeuralNetworksModel_setOperandSymmPerChannelQuantParams_fn)(
+    ANeuralNetworksModel* model, int32_t index,
+    const ANeuralNetworksSymmPerChannelQuantParams* channelQuant);
+
+typedef int (*ANeuralNetworksModel_setOperandValueFromMemory_fn)(
+    ANeuralNetworksModel* model, int32_t index,
+    const ANeuralNetworksMemory* memory, size_t offset, size_t length);
+
+typedef int (*ANeuralNetworksModel_addOperation_fn)(
+    ANeuralNetworksModel* model, ANeuralNetworksOperationType type,
+    uint32_t inputCount, const uint32_t* inputs, uint32_t outputCount,
+    const uint32_t* outputs);
+
+typedef int (*ANeuralNetworksModel_identifyInputsAndOutputs_fn)(
+    ANeuralNetworksModel* model, uint32_t inputCount, const uint32_t* inputs,
+    uint32_t outputCount, const uint32_t* outputs);
+
+typedef int (*ANeuralNetworksModel_relaxComputationFloat32toFloat16_fn)(
+    ANeuralNetworksModel* model, bool allow);
+
+typedef int (*ANeuralNetworksExecution_create_fn)(
+    ANeuralNetworksCompilation* compilation,
+    ANeuralNetworksExecution** execution);
+
+typedef void (*ANeuralNetworksExecution_free_fn)(
+    ANeuralNetworksExecution* execution);
+
+typedef int (*ANeuralNetworksExecution_setInput_fn)(
+    ANeuralNetworksExecution* execution, int32_t index,
+    const ANeuralNetworksOperandType* type, const void* buffer, size_t length);
+
+typedef int (*ANeuralNetworksExecution_setInputFromMemory_fn)(
+    ANeuralNetworksExecution* execution, int32_t index,
+    const ANeuralNetworksOperandType* type, const ANeuralNetworksMemory* memory,
+    size_t offset, size_t length);
+
+typedef int (*ANeuralNetworksExecution_setOutput_fn)(
+    ANeuralNetworksExecution* execution, int32_t index,
+    const ANeuralNetworksOperandType* type, void* buffer, size_t length);
+
+typedef int (*ANeuralNetworksExecution_setOutputFromMemory_fn)(
+    ANeuralNetworksExecution* execution, int32_t index,
+    const ANeuralNetworksOperandType* type, const ANeuralNetworksMemory* memory,
+    size_t offset, size_t length);
+
+typedef int (*ANeuralNetworksExecution_startCompute_fn)(
+    ANeuralNetworksExecution* execution, ANeuralNetworksEvent** event);
+
+typedef int (*ANeuralNetworksEvent_wait_fn)(ANeuralNetworksEvent* event);
+
+typedef void (*ANeuralNetworksEvent_free_fn)(ANeuralNetworksEvent* event);
+
+typedef int (*ASharedMemory_create_fn)(const char* name, size_t size);
+
+typedef int (*ANeuralNetworks_getDeviceCount_fn)(uint32_t* numDevices);
+
+typedef int (*ANeuralNetworks_getDevice_fn)(uint32_t devIndex,
+                                            ANeuralNetworksDevice** device);
+
+typedef int (*ANeuralNetworksDevice_getName_fn)(
+    const ANeuralNetworksDevice* device, const char** name);
+
+typedef int (*ANeuralNetworksDevice_getType_fn)(
+    const ANeuralNetworksDevice* device, int32_t* type);
+
+typedef int (*ANeuralNetworksDevice_getVersion_fn)(
+    const ANeuralNetworksDevice* device, const char** version);
+
+typedef int (*ANeuralNetworksDevice_getFeatureLevel_fn)(
+    const ANeuralNetworksDevice* device, int64_t* featureLevel);
+
+typedef int (*ANeuralNetworksModel_getSupportedOperationsForDevices_fn)(
+    const ANeuralNetworksModel* model,
+    const ANeuralNetworksDevice* const* devices, uint32_t numDevices,
+    bool* supportedOps);
+
+typedef int (*ANeuralNetworksCompilation_createForDevices_fn)(
+    ANeuralNetworksModel* model, const ANeuralNetworksDevice* const* devices,
+    uint32_t numDevices, ANeuralNetworksCompilation** compilation);
+
+typedef int (*ANeuralNetworksCompilation_setCaching_fn)(
+    ANeuralNetworksCompilation* compilation, const char* cacheDir,
+    const uint8_t* token);
+
+typedef int (*ANeuralNetworksExecution_compute_fn)(
+    ANeuralNetworksExecution* execution);
+
+typedef int (*ANeuralNetworksExecution_getOutputOperandRank_fn)(
+    ANeuralNetworksExecution* execution, int32_t index, uint32_t* rank);
+
+typedef int (*ANeuralNetworksExecution_getOutputOperandDimensions_fn)(
+    ANeuralNetworksExecution* execution, int32_t index, uint32_t* dimensions);
+
+typedef int (*ANeuralNetworksBurst_create_fn)(
+    ANeuralNetworksCompilation* compilation, ANeuralNetworksBurst** burst);
+
+typedef void (*ANeuralNetworksBurst_free_fn)(ANeuralNetworksBurst* burst);
+
+typedef int (*ANeuralNetworksExecution_burstCompute_fn)(
+    ANeuralNetworksExecution* execution, ANeuralNetworksBurst* burst);
+
+typedef int (*ANeuralNetworksMemory_createFromAHardwareBuffer_fn)(
+    const AHardwareBuffer* ahwb, ANeuralNetworksMemory** memory);
+
+typedef int (*ANeuralNetworksExecution_setMeasureTiming_fn)(
+    ANeuralNetworksExecution* execution, bool measure);
+
+typedef enum {
+  // Execution time on hardware (not driver, which runs on host processor).
+  ANEURALNETWORKS_DURATION_ON_HARDWARE = 0,
+  // Execution time in driver (including time on hardware).  Excludes overhead
+  // such as that of the runtime itself and the IPC needed for the runtime to
+  // communicate with the driver.
+  ANEURALNETWORKS_DURATION_IN_DRIVER = 1,
+} DurationCode;
+
+typedef int (*ANeuralNetworksExecution_getDuration_fn)(
+    const ANeuralNetworksExecution* execution, int32_t durationCode,
+    uint64_t* duration);
+
+#endif  // TENSORFLOW_LITE_NNAPI_NEURALNETWORKSTYPES_H_
diff --git a/tensorflow/lite/nnapi/nnapi_implementation.cc b/tensorflow/lite/nnapi/nnapi_implementation.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bbc0c866e6352de40925d896886317d94814a308
--- /dev/null
+++ b/tensorflow/lite/nnapi/nnapi_implementation.cc
@@ -0,0 +1,202 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/nnapi/nnapi_implementation.h"
+
+#include <dlfcn.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <cstdlib>
+
+#ifdef __ANDROID__
+#include <sys/system_properties.h>
+#endif  // __ANDROID__
+
+#define NNAPI_LOG(format, ...) fprintf(stderr, format "\n", __VA_ARGS__);
+
+namespace {
+
+#ifdef __ANDROID__
+int32_t GetAndroidSdkVersion() {
+  const char* sdkProp = "ro.build.version.sdk";
+  char sdkVersion[PROP_VALUE_MAX];
+  int length = __system_property_get(sdkProp, sdkVersion);
+  if (length != 0) {
+    int32_t result = 0;
+    for (int i = 0; i < length; ++i) {
+      int digit = sdkVersion[i] - '0';
+      if (digit < 0 || digit > 9) {
+        // Non-numeric SDK version, assume it's higher than expected;
+        return 0xffff;
+      }
+      result = result * 10 + digit;
+    }
+    // TODO(levp): remove once SDK gets updated to 29th level
+    // Upgrade SDK version for pre-release Q to be able to test functionality
+    // available from SDK level 29.
+    if (result == 28) {
+      char versionCodename[PROP_VALUE_MAX];
+      const char* versionCodenameProp = "ro.build.version.codename";
+      length = __system_property_get(versionCodenameProp, versionCodename);
+      if (length != 0) {
+        if (versionCodename[0] == 'Q') {
+          return 29;
+        }
+      }
+    }
+    return result;
+  }
+  return 0;
+}
+#endif  // __ANDROID__
+
+void* LoadFunction(void* handle, const char* name, bool optional) {
+  if (handle == nullptr) {
+    return nullptr;
+  }
+  void* fn = dlsym(handle, name);
+  if (fn == nullptr && !optional) {
+    NNAPI_LOG("nnapi error: unable to open function %s", name);
+  }
+  return fn;
+}
+
+#ifndef __ANDROID__
+// Add /dev/shm implementation of shared memory for non-Android platforms
+int ASharedMemory_create(const char* name, size_t size) {
+  int fd = shm_open(name, O_RDWR | O_CREAT, 0644);
+  if (fd < 0) {
+    return fd;
+  }
+  int result = ftruncate(fd, size);
+  if (result < 0) {
+    close(fd);
+    return -1;
+  }
+  return fd;
+}
+#endif  // __ANDROID__
+
+#define LOAD_FUNCTION(handle, name)         \
+  nnapi.name = reinterpret_cast<name##_fn>( \
+      LoadFunction(handle, #name, /*optional*/ false));
+
+#define LOAD_FUNCTION_OPTIONAL(handle, name) \
+  nnapi.name = reinterpret_cast<name##_fn>(  \
+      LoadFunction(handle, #name, /*optional*/ true));
+
+const NnApi LoadNnApi() {
+  NnApi nnapi = {};
+  nnapi.android_sdk_version = 0;
+
+#ifdef __ANDROID__
+  void* libandroid = nullptr;
+  nnapi.android_sdk_version = GetAndroidSdkVersion();
+  if (nnapi.android_sdk_version < 27) {
+    NNAPI_LOG("nnapi error: requires android sdk version to be at least %d",
+              27);
+    nnapi.nnapi_exists = false;
+    return nnapi;
+  }
+  libandroid = dlopen("libandroid.so", RTLD_LAZY | RTLD_LOCAL);
+  if (libandroid == nullptr) {
+    NNAPI_LOG("nnapi error: unable to open library %s", "libandroid.so");
+  }
+#endif  // __ANDROID__
+
+  void* libneuralnetworks = nullptr;
+  // TODO(b/123243014): change RTLD_LOCAL? Assumes there can be multiple
+  // instances of nn api RT
+  libneuralnetworks = dlopen("libneuralnetworks.so", RTLD_LAZY | RTLD_LOCAL);
+  if (libneuralnetworks == nullptr) {
+    NNAPI_LOG("nnapi error: unable to open library %s", "libneuralnetworks.so");
+  }
+
+  nnapi.nnapi_exists = libneuralnetworks != nullptr;
+
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksMemory_createFromFd);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksMemory_free);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksModel_create);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksModel_free);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksModel_finish);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksModel_addOperand);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksModel_setOperandValue);
+  LOAD_FUNCTION_OPTIONAL(
+      libneuralnetworks,
+      ANeuralNetworksModel_setOperandSymmPerChannelQuantParams);
+  LOAD_FUNCTION(libneuralnetworks,
+                ANeuralNetworksModel_setOperandValueFromMemory);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksModel_addOperation);
+  LOAD_FUNCTION(libneuralnetworks,
+                ANeuralNetworksModel_identifyInputsAndOutputs);
+  LOAD_FUNCTION(libneuralnetworks,
+                ANeuralNetworksModel_relaxComputationFloat32toFloat16);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksCompilation_create);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksCompilation_free);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksCompilation_setPreference);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksCompilation_finish);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksExecution_create);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksExecution_free);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksExecution_setInput);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksExecution_setInputFromMemory);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksExecution_setOutput);
+  LOAD_FUNCTION(libneuralnetworks,
+                ANeuralNetworksExecution_setOutputFromMemory);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksExecution_startCompute);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksEvent_wait);
+  LOAD_FUNCTION(libneuralnetworks, ANeuralNetworksEvent_free);
+#ifdef __ANDROID__
+  LOAD_FUNCTION(libandroid, ASharedMemory_create);
+#else
+  nnapi.ASharedMemory_create = ASharedMemory_create;
+#endif  // __ANDROID__
+  LOAD_FUNCTION_OPTIONAL(libneuralnetworks, ANeuralNetworks_getDeviceCount);
+  LOAD_FUNCTION_OPTIONAL(libneuralnetworks, ANeuralNetworks_getDevice);
+  LOAD_FUNCTION_OPTIONAL(libneuralnetworks, ANeuralNetworksDevice_getName);
+  LOAD_FUNCTION_OPTIONAL(libneuralnetworks, ANeuralNetworksDevice_getVersion);
+  LOAD_FUNCTION_OPTIONAL(libneuralnetworks,
+                         ANeuralNetworksDevice_getFeatureLevel);
+  LOAD_FUNCTION_OPTIONAL(libneuralnetworks,
+                         ANeuralNetworksModel_getSupportedOperationsForDevices);
+  LOAD_FUNCTION_OPTIONAL(libneuralnetworks,
+                         ANeuralNetworksCompilation_createForDevices);
+  LOAD_FUNCTION_OPTIONAL(libneuralnetworks,
+                         ANeuralNetworksCompilation_setCaching);
+  LOAD_FUNCTION_OPTIONAL(libneuralnetworks, ANeuralNetworksExecution_compute);
+  LOAD_FUNCTION_OPTIONAL(libneuralnetworks,
+                         ANeuralNetworksExecution_getOutputOperandRank);
+  LOAD_FUNCTION_OPTIONAL(libneuralnetworks,
+                         ANeuralNetworksExecution_getOutputOperandDimensions);
+  LOAD_FUNCTION_OPTIONAL(libneuralnetworks, ANeuralNetworksBurst_create);
+  LOAD_FUNCTION_OPTIONAL(libneuralnetworks, ANeuralNetworksBurst_free);
+  LOAD_FUNCTION_OPTIONAL(libneuralnetworks,
+                         ANeuralNetworksExecution_burstCompute);
+  LOAD_FUNCTION_OPTIONAL(libneuralnetworks,
+                         ANeuralNetworksMemory_createFromAHardwareBuffer);
+  LOAD_FUNCTION_OPTIONAL(libneuralnetworks,
+                         ANeuralNetworksExecution_setMeasureTiming);
+  LOAD_FUNCTION_OPTIONAL(libneuralnetworks,
+                         ANeuralNetworksExecution_getDuration);
+  return nnapi;
+}
+
+}  // namespace
+
+const NnApi* NnApiImplementation() {
+  static const NnApi nnapi = LoadNnApi();
+  return &nnapi;
+}
diff --git a/tensorflow/lite/nnapi/nnapi_implementation.h b/tensorflow/lite/nnapi/nnapi_implementation.h
new file mode 100644
index 0000000000000000000000000000000000000000..66a36dbbc3ce6a9e9c199f6eb2327db6a8d55a53
--- /dev/null
+++ b/tensorflow/lite/nnapi/nnapi_implementation.h
@@ -0,0 +1,998 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_NNAPI_NNAPI_IMPLEMENTATION_H_
+#define TENSORFLOW_LITE_NNAPI_NNAPI_IMPLEMENTATION_H_
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "tensorflow/lite/nnapi/NeuralNetworksTypes.h"
+
+struct NnApi {
+  bool nnapi_exists;
+  int32_t android_sdk_version;
+
+  /**
+   * Creates a shared memory object from a file descriptor.
+   *
+   * The shared memory is backed by a file descriptor via mmap.
+   * See {@link ANeuralNetworksMemory} for a description on how to use
+   * this shared memory.
+   *
+   * @param size The requested size in bytes.
+   *             Must not be larger than the file size.
+   * @param prot The desired memory protection for the mapping.
+   *             It is either PROT_NONE or the bitwise OR of one or
+   *             more of the following flags: PROT_READ, PROT_WRITE.
+   * @param fd The requested file descriptor.
+   *           The file descriptor has to be mmap-able. The file
+   *           descriptor will be duplicated.
+   * @param offset The offset to the beginning of the file of the area to map.
+   *               The offset has to be aligned to a page size.
+   * @param memory The memory object to be created.
+   *               Set to NULL if unsuccessful.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if the request completed normally.
+   */
+  int (*ANeuralNetworksMemory_createFromFd)(size_t size, int protect, int fd,
+                                            size_t offset,
+                                            ANeuralNetworksMemory** memory);
+
+  /**
+   * Delete a memory object.
+   *
+   * Destroys the object used by the run time to keep track of the memory.
+   * This will free the underlying actual memory if no other code has open
+   * handles to this memory.
+   *
+   * @param memory The memory object to be freed.
+   */
+  void (*ANeuralNetworksMemory_free)(ANeuralNetworksMemory* memory);
+
+  /**
+   * Create an empty {@link ANeuralNetworksModel}.
+   *
+   * <p>This only creates the object. Computation is performed once
+   * {@link ANeuralNetworksExecution_startCompute} is invoked.
+   *
+   * The model should be constructed with calls to
+   * {@link ANeuralNetworksModel_addOperation} and
+   * {@link ANeuralNetworksModel_addOperand}
+   *
+   * <p>{@link ANeuralNetworksModel_finish} should be called once the model
+   * has been fully constructed.</p>
+   *
+   * <p>{@link ANeuralNetworksModel_free} should be called once the model
+   * is no longer needed.</p>
+   *
+   * @param model The {@link ANeuralNetworksModel} to be created.
+   *              Set to NULL if unsuccessful.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   */
+  int (*ANeuralNetworksModel_create)(ANeuralNetworksModel** model);
+
+  /**
+   * Destroy a model.
+   *
+   * The model need not have been finished by a call to
+   * {@link ANeuralNetworksModel_finish}.
+   *
+   * See {@link ANeuralNetworksModel} for information on multithreaded usage.
+   *
+   * @param model The model to be destroyed. Passing NULL is acceptable and
+   *              results in no operation.
+   */
+  void (*ANeuralNetworksModel_free)(ANeuralNetworksModel* model);
+
+  /**
+   * Indicate that we have finished modifying a model. Required before
+   * calling {@link ANeuralNetworksCompilation_compile}.
+   *
+   * An application is responsible to make sure that no other thread uses
+   * the model at the same time.
+   *
+   * See {@link ANeuralNetworksModel} for information on multithreaded usage.
+   *
+   * @param model The model to be finished.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   */
+  int (*ANeuralNetworksModel_finish)(ANeuralNetworksModel* model);
+
+  /**
+   * Add an operand to a model.
+   *
+   * The order in which the operands are added is important. The first one added
+   * to a model will have the index value 0, the second 1, etc. These indexes
+   * are used as operand identifiers in
+   * {@link ANeuralNetworksModel_addOperation},
+   * {@link ANeuralNetworksExecution_setInput},
+   * {@link ANeuralNetworksExecution_setInputFromMemory},
+   * {@link ANeuralNetworksExecution_setOutput},
+   * {@link ANeuralNetworksExecution_setOutputFromMemory} and
+   * {@link ANeuralNetworksExecution_setOperandValue}.
+   *
+   * To build a model that can accommodate inputs of various sizes, as you may
+   * want to do for a CNN, set the size of the dimensions that will vary at run
+   * time to 0. If you do so, provide the full dimensions when calling
+   * {@link ANeuralNetworksExecution_setInput} or {@link
+   * ANeuralNetworksExecution_setInputFromMemory}.
+   *
+   * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has
+   * been called will return an error.
+   *
+   * See {@link ANeuralNetworksModel} for information on multithreaded usage.
+   *
+   * @param model The model to be modified.
+   * @param type The {@link ANeuralNetworksOperandType} that describes the shape
+   * of the operand.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   */
+  int (*ANeuralNetworksModel_addOperand)(
+      ANeuralNetworksModel* model, const ANeuralNetworksOperandType* type);
+
+  /**
+   * Sets an operand to a constant value.
+   *
+   * For scalar values, the content of buffer is copied into the model.
+   *
+   * For tensor values, a pointer to the buffer is stored within the model.
+   * The application is responsible for not changing the content of this region
+   * until all executions using this model have completed. As the data may
+   * be copied during processing, modifying the data after this call yields
+   * undefined results.
+   *
+   * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has
+   * been called will return an error.
+   *
+   * See {@link ANeuralNetworksModel} for information on multithreaded usage.
+   *
+   * @param model The model to be modified.
+   * @param index The index of the model operand we're setting.
+   * @param buffer A pointer to the data to use.
+   * @param length The size in bytes of the data value.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   */
+  int (*ANeuralNetworksModel_setOperandValue)(ANeuralNetworksModel* model,
+                                              int32_t index, const void* buffer,
+                                              size_t length);
+
+  /**
+   * Sets an operand's per channel quantization parameters.
+   *
+   * Sets parameters required by a tensor of type
+   * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}.
+   * This function must be called for every tensor of type
+   * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} before
+   * calling {@link ANeuralNetworksModel_finish}.
+   *
+   * Available since API level 29.
+   *
+   * @param model The model to be modified.
+   * @param index The index of the model operand we're setting.
+   * @param channelQuant The per channel quantization parameters for the
+   *                     operand. No memory in this struct needs to outlive the
+   *                     call to this function.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   */
+  int (*ANeuralNetworksModel_setOperandSymmPerChannelQuantParams)(
+      ANeuralNetworksModel* model, int32_t index,
+      const ANeuralNetworksSymmPerChannelQuantParams* channelQuant);
+
+  /**
+   * Sets an operand to a value stored in a memory object.
+   *
+   * The content of the memory is not copied. A reference to that memory is
+   * stored inside the model. The application is responsible for not changing
+   * the content of the memory region until all executions using this model have
+   * completed.
+   * As the data may be copied during processing, modifying the data after this
+   * call yields undefined results.
+   *
+   * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has
+   * been called will return an error.
+   *
+   * See {@link ANeuralNetworksModel} for information on multithreaded usage.
+   *
+   * @param model The model to be modified.
+   * @param index The index of the model operand we're setting.
+   * @param buffer A pointer to the data to use.
+   * @param memory The memory containing the data.
+   * @param offset This specifies the location of the data within the memory.
+   *               The offset is in bytes from the start of memory.
+   * @param length The size in bytes of the data value.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   */
+  int (*ANeuralNetworksModel_setOperandValueFromMemory)(
+      ANeuralNetworksModel* model, int32_t index,
+      const ANeuralNetworksMemory* memory, size_t offset, size_t length);
+
+  /**
+   * Add an operation to a model.
+   *
+   * @param model The model to be modified.
+   * @param type The type of the operation.
+   * @param inputCount The number of entries in the inputs array.
+   * @param inputs An array of indexes identifying each operand.
+   * @param outputCount The number of entries in the outputs array.
+   * @param outputs An array of indexes identifying each operand.
+   *
+   * The operands specified by inputs and outputs must have been
+   * previously added by calls to {@link ANeuralNetworksModel_addOperand}.
+   *
+   * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has
+   * been called will return an error.
+   *
+   * See {@link ANeuralNetworksModel} for information on multithreaded usage.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   */
+  int (*ANeuralNetworksModel_addOperation)(ANeuralNetworksModel* model,
+                                           ANeuralNetworksOperationType type,
+                                           uint32_t inputCount,
+                                           const uint32_t* inputs,
+                                           uint32_t outputCount,
+                                           const uint32_t* outputs);
+
+  /**
+   * Specifies which operands will be the model's inputs and outputs.
+   *
+   * An operand cannot be used for both input and output. Doing so will
+   * return an error.
+   *
+   * @param model The model to be modified.
+   * @param inputCount The number of entries in the inputs array.
+   * @param inputs An array of indexes identifying the input operands.
+   * @param outputCount The number of entries in the outputs array.
+   * @param outputs An array of indexes identifying the output operands.
+   *
+   * The operands specified by inputs and outputs must have been
+   * previously added by calls to {@link ANeuralNetworksModel_addOperand}.
+   *
+   * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has
+   * been called will return an error.
+   *
+   * See {@link ANeuralNetworksModel} for information on multithreaded usage.
+   *
+   */
+  int (*ANeuralNetworksModel_identifyInputsAndOutputs)(
+      ANeuralNetworksModel* model, uint32_t inputCount, const uint32_t* inputs,
+      uint32_t outputCount, const uint32_t* outputs);
+
+  /**
+   * Specifies whether {@link ANEURALNETWORKS_TENSOR_FLOAT32} is allowed to be
+   * calculated with range and/or precision as low as that of the
+   * IEEE 754 16-bit floating-point format. By default,
+   * {@link ANEURALNETWORKS_TENSOR_FLOAT32} must be calculated using at least
+   * the range and precision of the IEEE 754 32-bit floating-point format.
+   *
+   * @param model The model to be modified.
+   * @param allow 'true' indicates {@link ANEURALNETWORKS_TENSOR_FLOAT32} may be
+   *              calculated with range and/or precision as low as that of the
+   *              IEEE 754 16-bit floating point format. 'false' indicates
+   *              {@link ANEURALNETWORKS_TENSOR_FLOAT32} must be calculated
+   *              using at least the range and precision of the IEEE 754 32-bit
+   *              floating point format.
+   *
+   * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has
+   * been called will return an error.
+   *
+   * Available since API level 28.
+   *
+   * See {@link ANeuralNetworksModel} for information on multithreaded usage.
+   */
+  int (*ANeuralNetworksModel_relaxComputationFloat32toFloat16)(
+      ANeuralNetworksModel* model, bool allow);
+
+  /**
+   * Create a {@link ANeuralNetworksCompilation} to compile the given model.
+   * This only creates the object. Compilation is only performed once
+   * {@link ANeuralNetworksCompilation_start} is invoked.
+   *
+   * <p>The provided model must outlive the compilation.</p>
+   *
+   * The model must already have been finished by a call to
+   * {@link ANeuralNetworksModel_finish}.
+   *
+   * See {@link ANeuralNetworksCompilation} for information on multithreaded
+   * usage.
+   *
+   * @param model The {@link ANeuralNetworksModel} to be compiled.
+   * @param compilation The newly created object or NULL if unsuccessful.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA
+   *         if the model is invalid.
+   */
+  int (*ANeuralNetworksCompilation_create)(
+      ANeuralNetworksModel* model, ANeuralNetworksCompilation** compilation);
+
+  /**
+   * Destroy a compilation.
+   *
+   * <p>If called on a compilation for which
+   * {@link ANeuralNetworksCompilation_start} has been called, the
+   * function will return immediately but will mark the compilation to be
+   * deleted once the compilation completes. The
+   * {@link ANeuralNetworksCompilation_wait} will return ERROR_DELETED.
+   *
+   * See {@link ANeuralNetworksCompilation} for information on multithreaded
+   * usage.
+   *
+   * @param compilation The compilation to be destroyed. Passing NULL is
+   * acceptable and results in no operation.
+   */
+  void (*ANeuralNetworksCompilation_free)(
+      ANeuralNetworksCompilation* compilation);
+
+  /**
+   * Sets the execution preference.
+   *
+   * <p>Provides guidance to the runtime when trade-offs are possible.</p>
+   *
+   * See {@link ANeuralNetworksCompilation} for information on multithreaded
+   * usage.
+   *
+   * @param compilation The compilation to be modified.
+   * @param preference Either {@link PREFER_LOW_POWER},
+   *                  {@link PREFER_SINGLE_FAST_ANSWER}, or
+   *                  {@link PREFER_SUSTAINED_SPEED}.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   */
+  int (*ANeuralNetworksCompilation_setPreference)(
+      ANeuralNetworksCompilation* compilation, int32_t preference);
+
+  /**
+   * Waits until the compilation completes.
+   *
+   * More than one thread can wait on a compilation. When the compilation
+   * completes, all threads will be released.
+   *
+   * See {@link ANeuralNetworksCompilation} for information on multithreaded
+   * usage.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if the compilation completed normally.
+   */
+  int (*ANeuralNetworksCompilation_finish)(
+      ANeuralNetworksCompilation* compilation);
+
+  /**
+   * Create a {@link ANeuralNetworksExecution} to apply the given compilation.
+   * This only creates the object. Computation is only performed once
+   * {@link ANeuralNetworksExecution_startCompute} is invoked.
+   *
+   * <p>The provided compilation must outlive the execution.</p>
+   *
+   * See {@link ANeuralNetworksExecution} for information on multithreaded
+   * usage.
+   *
+   * @param compilation The {@link ANeuralNetworksCompilation} to be evaluated.
+   * @param execution The newly created object or NULL if unsuccessful.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA
+   *         if the compilation is invalid.
+   */
+  int (*ANeuralNetworksExecution_create)(
+      ANeuralNetworksCompilation* compilation,
+      ANeuralNetworksExecution** execution);
+
+  /**
+   * Destroy an execution.
+   *
+   * <p>If called on an execution for which
+   * {@link ANeuralNetworksExecution_startCompute} has been called, the
+   * function will return immediately but will mark the execution to be deleted
+   * once the computation completes.   The {link ANeuralNetworksExecution_wait}
+   * will return ANEURALNETWORKS_ERROR_DELETED.
+   *
+   * See {@link ANeuralNetworksExecution} for information on multithreaded
+   * usage.
+   *
+   * @param execution The execution to be destroyed. Passing NULL is acceptable
+   * and results in no operation.
+   */
+  void (*ANeuralNetworksExecution_free)(ANeuralNetworksExecution* execution);
+
+  /**
+   * Associate a user buffer with an input of the model of the
+   * {@link ANeuralNetworksExecution}.
+   *
+   * <p>The provided buffer must outlive the execution.</p>
+   *
+   * See {@link ANeuralNetworksExecution} for information on multithreaded
+   * usage.
+   *
+   * @param execution The execution to be modified.
+   * @param index The index of the input argument we are setting. It is
+   *              an index into the lists passed to
+   *              {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is
+   *              not the index associated with {@link
+   * ANeuralNetworksModel_addOperand}.
+   * @param type The type of the operand. This should be used to specify the
+   *             dimensions that were set to 0 when the operand was added to the
+   *             model. All other properties of the type must be the same as
+   *             specified in the model. If the type is the same as specified
+   *             when the model was built, NULL can be passed.
+   * @param buffer The buffer containing the data.
+   * @param length The length in bytes of the buffer.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA if
+   * the name is not recognized or the buffer is too small for the input.
+   */
+  int (*ANeuralNetworksExecution_setInput)(
+      ANeuralNetworksExecution* execution, int32_t index,
+      const ANeuralNetworksOperandType* type, const void* buffer,
+      size_t length);
+
+  /**
+   * Associate part of a memory object with an input of the model of the
+   * {@link ANeuralNetworksExecution}.
+   *
+   * <p>The provided memory must outlive the execution.</p>
+   *
+   * See {@link ANeuralNetworksExecution} for information on multithreaded
+   * usage.
+   *
+   * @param execution The execution to be modified.
+   * @param index The index of the input argument we are setting. It is
+   *              an index into the lists passed to
+   *              {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is
+   *              not the index associated with {@link
+   * ANeuralNetworksModel_addOperand}.
+   * @param type The type of the operand. This can be used to specify the
+   *             dimensions that were set to 0 when the operand was added to the
+   *             model. All other values must be the same as specified in the
+   *             model. If the type is the same as specified when the model
+   *             was built, NULL can be passed.
+   * @param memory The memory containing the data.
+   * @param offset This specifies the location of the data within the memory.
+   *               The offset is in bytes from the start of memory.
+   * @param length The size in bytes of the data value.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA if
+   * the name is not recognized or the buffer is too small for the input.
+   */
+  int (*ANeuralNetworksExecution_setInputFromMemory)(
+      ANeuralNetworksExecution* execution, int32_t index,
+      const ANeuralNetworksOperandType* type,
+      const ANeuralNetworksMemory* memory, size_t offset, size_t length);
+
+  /**
+   * Associate a user buffer with an output of the model of the
+   * {@link ANeuralNetworksExecution}.
+   *
+   * <p>The provided buffer must outlive the execution.</p>
+   *
+   * See {@link ANeuralNetworksExecution} for information on multithreaded
+   * usage.
+   *
+   * @param execution The execution to be modified.
+   * @param index The index of the output argument we are setting. It is
+   *              an index into the lists passed to
+   *              {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is
+   *              not the index associated with {@link
+   * ANeuralNetworksModel_addOperand}.
+   * @param type The type of the operand. This can be used to specify the
+   *             dimensions that were set to 0 when the operand was added to the
+   *             model. All other values must be the same as specified in the
+   *             model. If the type is the same as specified when the model
+   *             was built, NULL can be passed.
+   * @param buffer The buffer where the data is to be written.
+   * @param length The length in bytes of the buffer.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA if
+   * the name is not recognized or the buffer is too small for the output.
+   */
+  int (*ANeuralNetworksExecution_setOutput)(
+      ANeuralNetworksExecution* execution, int32_t index,
+      const ANeuralNetworksOperandType* type, void* buffer, size_t length);
+
+  /**
+   * Associate part of a memory object with an output of the model of the
+   * {@link ANeuralNetworksExecution}.
+   *
+   * <p>The provided memory must outlive the execution.</p>
+   *
+   * See {@link ANeuralNetworksExecution} for information on multithreaded
+   * usage.
+   *
+   * @param execution The execution to be modified.
+   * @param index The index of the output argument we are setting. It is
+   *              an index into the lists passed to
+   *              {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is
+   *              not the index associated with {@link
+   * ANeuralNetworksModel_addOperand}.
+   * @param type The type of the operand. This can be used to specify the
+   *             dimensions that were set to 0 when the operand was added to the
+   *             model. All other values must be the same as specified in the
+   *             model. If the type is the same as specified when the model
+   *             was built, NULL can be passed.
+   * @param memory The memory where the data is to be stored.
+   * @param offset This specifies the location of the data within the memory.
+   *               The offset is in bytes from the start of memory.
+   * @param length The length in bytes of the data value.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA if
+   * the name is not recognized or the buffer is too small for the output.
+   */
+  int (*ANeuralNetworksExecution_setOutputFromMemory)(
+      ANeuralNetworksExecution* execution, int32_t index,
+      const ANeuralNetworksOperandType* type,
+      const ANeuralNetworksMemory* memory, size_t offset, size_t length);
+
+  /**
+   * Schedule evaluation of the execution.
+   *
+   * <p>Schedules evaluation of the execution. Once the model has been
+   * applied and the outputs are ready to be consumed, the execution will be
+   * signaled. Use {@link ANeuralNetworksExecution_wait} to wait for that
+   * signal.
+   * </p>
+   *
+   * Multiple executions can be scheduled and evaluated concurrently, and
+   * compilations can be performed concurrently with executions. The runtime
+   * makes no guarantee on the ordering of the completion of compilations and
+   * executions. If it's important to the application, the application should
+   * enforce the ordering by using {@link ANeuralNetworksCompilation_wait} and
+   * {@link ANeuralNetworksExecution_wait}.
+   *
+   * ANeuralNetworksExecution_wait must be called to recuperate the resources
+   * used by the execution.
+   *
+   * See {@link ANeuralNetworksExecution} for information on multithreaded
+   * usage.
+   *
+   * @param execution The execution to be scheduled and executed.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   */
+  int (*ANeuralNetworksExecution_startCompute)(
+      ANeuralNetworksExecution* execution, ANeuralNetworksEvent** event);
+
+  /**
+   * Waits until the execution completes.
+   *
+   * More than one thread can wait on an event. When the execution completes,
+   * all threads will be released.
+   *
+   * See {@link ANeuralNetworksExecution} for information on multithreaded
+   * usage.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if the execution completed normally.
+   */
+  int (*ANeuralNetworksEvent_wait)(ANeuralNetworksEvent* event);
+
+  /**
+   * Destroys the event.
+   *
+   * See {@link ANeuralNetworksExecution} for information on multithreaded
+   * usage.
+   */
+  void (*ANeuralNetworksEvent_free)(ANeuralNetworksEvent* event);
+
+  // ASharedMemory_create was added in Android 8.0, so safe to use with NNAPI
+  // which was added in 8.1.
+  int (*ASharedMemory_create)(const char* name, size_t size);
+
+  /**
+   * Get the number of available devices.
+   *
+   * @param numDevices Used to return the number of devices.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   *
+   * Available since API level 29.
+   */
+  int (*ANeuralNetworks_getDeviceCount)(uint32_t* numDevices);
+
+  /**
+   * Get the representation of the specified device.
+   *
+   * @param devIndex The index of the specified device. Must be less than the
+   *                 number of available devices.
+   * @param device The representation of the specified device.
+   *               The same representation will always be returned for the
+   *               specified device.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   *
+   * Available since API level 29.
+   */
+
+  int (*ANeuralNetworks_getDevice)(uint32_t devIndex,
+                                   ANeuralNetworksDevice** device);
+
+  /**
+   * Get the name of the specified device.
+   *
+   * @param device The representation of the specified device.
+   * @param name The returned name of the specified device. The name will be
+   *             in UTF-8 and will be null-terminated. It will be recognizable
+   *             as a known device name rather than a cryptic string. For
+   *             devices with API level 29 and above, the format of the name is
+   *             {VENDOR}-{DEVICE}, e.g. “google-ipu”. For devices with feature
+   *             level 28 or lower, the name will always be “unknown-device”.
+   *             The name will remain valid for the duration of the application.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   *
+   * Available since API level 29.
+   */
+  int (*ANeuralNetworksDevice_getName)(const ANeuralNetworksDevice* device,
+                                       const char** name);
+
+  /**
+   * Get the version of the driver implementation of the specified device.
+   *
+   * It’s the responsibility of the driver implementor to insure that this
+   * version string uniquely distinguishes this implementation from all previous
+   * implementations.
+   *
+   * This version string must not be confused with the feature level which is
+   * solely defined by {@link ANeuralNetworksDevice_getFeatureLevel}. There is
+   * no implicit ordering of the versions. For example, it is not possible to
+   * filter all drivers older than a certain version.
+   *
+   * Application developers may use this version string to avoid or prefer
+   * specific driver implementations. For example, an application may want to do
+   * so because:
+   *     - A specific version of the driver does not provide the required
+   * performance, perhaps because of a performance regression.
+   *     - A specific version of the driver has a bug or returns results that
+   * don’t match the minimum precision requirement for the application.
+   *
+   * @param device  The representation of the specified device.
+   * @param version The returned version string of the driver for the specified
+   *                device. The string will be in UTF-8 and will be
+   *                null-terminated. For devices with feature level 28 or lower,
+   *                "UNKNOWN" will be returned. The version string will remain
+   *                valid for the duration of the application.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   *
+   * Available since API level 29.
+   */
+  int (*ANeuralNetworksDevice_getVersion)(const ANeuralNetworksDevice* device,
+                                          const char** version);
+
+  /**
+   * Get the supported NNAPI version of the specified device.
+   *
+   * Each device has a supported feature level, which is the most advanced
+   * feature this driver implements. For example, if the driver implements the
+   * features introduced in Android P, but does not implement the features
+   * introduced after Android P, the value would be 28. Developers could decide
+   * whether or not the specified device should be used for a Model that has
+   * certain feature requirements.
+   *
+   * @param device       The representation of the specified device.
+   * @param featureLevel The API level of the most advanced feature this driver
+   *                     implements.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   *
+   * Available since API level 29.
+   */
+  int (*ANeuralNetworksDevice_getFeatureLevel)(
+      const ANeuralNetworksDevice* device, int64_t* featureLevel);
+
+  /**
+   * Get the supported operations for a specified set of devices. If multiple
+   * devices are selected, the supported operation list is a union of supported
+   * operations of all selected devices.
+   *
+   * @param model        The model to be queried.
+   * @param devices      The set of devices. Must not contain duplicates.
+   * @param numDevices   The number of devices in the set.
+   * @param supportedOps The boolean array to be filled. True means supported.
+   *                     The size of the boolean array must be at least as large
+   *                     as the number of operations in the model. The order of
+   *                     elements in the supportedOps array matches the order in
+   *                     which the corresponding operations were added to the
+   *                     model.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   *
+   * Available since API level 29.
+   */
+  int (*ANeuralNetworksModel_getSupportedOperationsForDevices)(
+      const ANeuralNetworksModel* model,
+      const ANeuralNetworksDevice* const* devices, uint32_t numDevices,
+      bool* supportedOps);
+
+  /**
+   * Create a {@link ANeuralNetworksCompilation} to compile the given model for
+   * a specified set of devices. If more than one device is specified, the
+   * compilation will distribute the workload automatically across the devices.
+   * The model must be fully supported by the specified set of devices. This
+   * means that ANeuralNetworksModel_getSupportedOperationsForDevices() must
+   * have returned true for every operation for that model/devices pair.
+   *
+   * @param model       The {@link ANeuralNetworksModel} to be compiled.
+   * @param devices     The set of devices. Must not contain duplicates.
+   * @param numDevices  The number of devices in the set.
+   * @param compilation The newly created object or NULL if unsuccessful.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA
+   *         if the model is invalid.
+   *
+   * Available since API level 29.
+   */
+  int (*ANeuralNetworksCompilation_createForDevices)(
+      ANeuralNetworksModel* model, const ANeuralNetworksDevice* const* devices,
+      uint32_t numDevices, ANeuralNetworksCompilation** compilation);
+
+  /**
+   * Sets the compilation caching signature and the cache directory.
+   *
+   * Provides optional caching information to the runtime for faster repeated
+   * compilation.
+   *
+   * See {@link ANeuralNetworksCompilation} for information on multithreaded
+   * usage.
+   *
+   * @param compilation The compilation to be modified.
+   * @param cacheDir The cache directory to store and retrieve caching data. It
+   *                 is recommended to use the code_cache provided by the
+   *                 Android runtime. If not using the code_cache, the user
+   *                 should choose a directory local to the application, and is
+   *                 responsible to manage and clean the cache entries.
+   * @param token The token provided by the user to specify a model, must be of
+   *              length ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN. The user
+   *              should ensure that the token is unique to a model within the
+   *              application. The NNAPI runtime will not detected token
+   *              collisions. If there is a collision, the compilation outcome
+   *              may be incorrect without notifying with error.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   *
+   * Available since API level 29.
+   */
+  int (*ANeuralNetworksCompilation_setCaching)(
+      ANeuralNetworksCompilation* compilation, const char* cacheDir,
+      const uint8_t* token);
+
+  /**
+   * Schedule synchronous evaluation of the execution.
+   *
+   * <p>Schedules synchronous evaluation of the execution. Returns once the
+   * execution has completed and the outputs are ready to be consumed.
+   * </p>
+   *
+   * See {@link ANeuralNetworksExecution} for information on multithreaded
+   * usage.
+   *
+   * See {@link ANeuralNetworksExecution_startCompute} for asynchronous
+   * execution. Synchronous execution incurs lower overhead than asynchronous
+   * execution.
+   *
+   * Available since API level 29.
+   *
+   * @param execution The execution to be scheduled and executed.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if the execution completed normally.
+   *         ANEURALNETWORKS_UNMAPPABLE if the execution input or output memory
+   *         cannot be properly mapped.
+   */
+  int (*ANeuralNetworksExecution_compute)(ANeuralNetworksExecution* execution);
+
+  /**
+   * Get the dimensional information of the specified output operand of the
+   * model of the
+   * {@link ANeuralNetworksExecution}.
+   *
+   * On asynchronous execution initiated by {@link
+   * ANeuralNetworksExecution_startCompute},
+   * {@link ANeuralNetworksEvent_wait} must be called prior to this function to
+   * recuperate the resources used by the execution.
+   *
+   * @param execution The execution to be queried.
+   * @param index The index of the output argument we are querying. It is
+   *              an index into the lists passed to
+   *              {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is
+   *              not the index associated with
+   *              {@link ANeuralNetworksModel_addOperand}.
+   * @param rank The rank of the output operand.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful,
+   *         ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE if the target output is
+   *         provided an insufficient buffer at execution time,
+   *         ANEURALNETWORKS_BAD_DATA if the index is invalid.
+   *
+   * Available since API level 29.
+   */
+  int (*ANeuralNetworksExecution_getOutputOperandRank)(
+      ANeuralNetworksExecution* execution, int32_t index, uint32_t* rank);
+
+  /**
+   * Get the dimensional information of the specified output operand of the
+   * model of the
+   * {@link ANeuralNetworksExecution}. The target output operand cannot be a
+   * scalar.
+   *
+   * On asynchronous execution initiated by {@link
+   * ANeuralNetworksExecution_startCompute},
+   * {@link ANeuralNetworksEvent_wait} must be called prior to this function to
+   * recuperate the resources used by the execution.
+   *
+   * @param execution The execution to be queried.
+   * @param index The index of the output argument we are querying. It is an
+   *              index into the lists passed to
+   *              {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is
+   *              not the index associated with
+   *              {@link ANeuralNetworksModel_addOperand}.
+   * @param dimensions The dimension array to be filled. The size of the array
+   *                   must be exactly as large as the rank of the output
+   *                   operand to be queried in the model.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful,
+   *         ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE if the target output is
+   *         provided an insufficient buffer at execution time,
+   *         ANEURALNETWORKS_BAD_DATA if the index is invalid or if the target
+   *         is a scalar.
+   *
+   * Available since API level 29.
+   */
+  int (*ANeuralNetworksExecution_getOutputOperandDimensions)(
+      ANeuralNetworksExecution* execution, int32_t index, uint32_t* dimensions);
+
+  /**
+   * Create a {@link ANeuralNetworksBurst} to apply the given compilation.
+   * This only creates the burst object. Computation is only performed once
+   * {@link ANeuralNetworksExecution_burstCompute} is invoked with a valid
+   * {@link ANeuralNetworksExecution} and {@link ANeuralNetworksBurst}.
+   *
+   * <p>The provided compilation must outlive the burst object.</p>
+   *
+   * Available since API level 29.
+   *
+   * @param compilation The {@link ANeuralNetworksCompilation} to be evaluated.
+   * @param burst The newly created object or NULL if unsuccessful.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA
+   *         if the compilation is invalid.
+   */
+  int (*ANeuralNetworksBurst_create)(ANeuralNetworksCompilation* compilation,
+                                     ANeuralNetworksBurst** burst);
+
+  /**
+   * Destroys the burst object.
+   *
+   * Available since API level 29.
+   *
+   * @param burst The burst object to be destroyed. Passing NULL is acceptable
+   * and results in no operation.
+   */
+  void (*ANeuralNetworksBurst_free)(ANeuralNetworksBurst* burst);
+
+  /**
+   * Schedule synchronous evaluation of the execution on a burst object.
+   *
+   * <p>Schedules synchronous evaluation of the execution. Returns once the
+   * execution has completed and the outputs are ready to be consumed.</p>
+   *
+   * <p>There must be at most one {@link ANeuralNetworksExecution} processing at
+   * any given time for any given burst object. Any
+   * {@link ANeuralNetworksExecution} launched before the previous has finished
+   * will result in ANEURALNETWORKS_BAD_STATE.</p>
+   *
+   * Available since API level 29.
+   *
+   * @param burst The burst object to execute on.
+   * @param execution The execution to be scheduled and executed. The execution
+   *                  must be created from the same {@link
+   *                  ANeuralNetworksCompilation} as the burst object.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if the execution completed normally.
+   */
+  int (*ANeuralNetworksExecution_burstCompute)(
+      ANeuralNetworksExecution* execution, ANeuralNetworksBurst* burst);
+
+  /**
+   * Creates a shared memory object from an AHardwareBuffer handle.
+   *
+   * If the shared memory is backed by an AHardwareBuffer of
+   * AHARDWAREBUFFER_FORMAT_BLOB format, it can be used the same way as
+   * shared memory created from a file handle. See
+   * {@link ANeuralNetworksMemory} for a description on how to use this
+   * shared memory.
+   *
+   * If the shared memory is backed by an AHardwareBuffer of a format other
+   * than AHARDWAREBUFFER_FORMAT_BLOB, it can only be used for Model inputs
+   * and outputs. When calling
+   * {@link ANeuralNetworksExecution_setInputFromMemory} or
+   * {@link ANeuralNetworksExecution_setOutputFromMemory} with the shared
+   * memory, both offset and length must be set to zero and the entire
+   * memory region will be associated with the specified input or output
+   * operand. There is no guarantee that an arbitrary AHardwareBuffer_Format
+   * and AHardwareBuffer_UsageFlags combination can be used by arbitrary
+   * devices. The execution will fail if selected set of devices cannot
+   * consume the buffer.
+   *
+   * Calling {@link ANeuralNetworksModel_setOperandValueFromMemory} with
+   * shared memory backed by an AHardwareBuffer of a format other than
+   * AHARDWAREBUFFER_FORMAT_BLOB is disallowed.
+   *
+   * TODO(miaowang): add documentation about intended usage with
+   * introspection API.
+   *
+   * Available since API level 29.
+   *
+   * @param ahwb The AHardwareBuffer handle.
+   * @param memory The memory object to be created.
+   *               Set to NULL if unsuccessful.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if the request completed normally.
+   *
+   * @see AHardwareBuffer
+   */
+  int (*ANeuralNetworksMemory_createFromAHardwareBuffer)(
+      const AHardwareBuffer* ahwb, ANeuralNetworksMemory** memory);
+
+  /**
+   * Specifies whether duration of the {@link ANeuralNetworksExecution} is to be
+   * measured. By default, duration is not measured.
+   *
+   * The {@link ANeuralNetworksExecution} must have been created with
+   * {@link ANeuralNetworksCompilation_createForDevices} with numDevices = 1.
+   *
+   * See {@link ANeuralNetworksExecution} for information on multithreaded
+   * usage.
+   *
+   * Available since API level 29.
+   *
+   * @param execution The execution to be modified.
+   * @param measure 'true' if duration is to be measured, 'false' if not.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   */
+  int (*ANeuralNetworksExecution_setMeasureTiming)(
+      ANeuralNetworksExecution* execution, bool measure);
+
+  /**
+   * Get the time spent in the specified {@link ANeuralNetworksExecution}, in
+   * nanoseconds. The execution must have completed.
+   *
+   * @param execution The execution to be queried.
+   * @param durationCode The measurement to be queried, specified by {@link
+   * DurationCode}.
+   * @param duration The returned duration. If no measurement was requested by
+   *                 {@link ANeuralNetworksExecution_setMeasureTiming}, or for
+   * some other reason the duration is not available, UINT64_MAX will be
+   * returned. A particular device need not support any given measurement.
+   *
+   * @return ANEURALNETWORKS_NO_ERROR if successful.
+   */
+  int (*ANeuralNetworksExecution_getDuration)(
+      const ANeuralNetworksExecution* execution, int32_t durationCode,
+      uint64_t* duration);
+
+  /**/
+};
+
+/**
+ * Load the NNAPI implementation from the shared libraries.
+ * The NnApi structure is filled with all the pointers. If one function doesn't
+ * exist, a null pointer is stored.
+ */
+const NnApi* NnApiImplementation();
+
+#endif  // TENSORFLOW_LITE_NNAPI_NNAPI_IMPLEMENTATION_H_
diff --git a/tensorflow/lite/nnapi/nnapi_implementation_disabled.cc b/tensorflow/lite/nnapi/nnapi_implementation_disabled.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6bc78e53da64b209d53bfcfc97e194e7430f016c
--- /dev/null
+++ b/tensorflow/lite/nnapi/nnapi_implementation_disabled.cc
@@ -0,0 +1,20 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/nnapi/nnapi_implementation.h"
+
+const NnApi* NnApiImplementation() {
+  static const NnApi nnapi = {};
+  return &nnapi;
+}
diff --git a/tensorflow/lite/nnapi/nnapi_implementation_test.cc b/tensorflow/lite/nnapi/nnapi_implementation_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9f30b95ec37e3c878d3bdbc1acc96026dfeef9e1
--- /dev/null
+++ b/tensorflow/lite/nnapi/nnapi_implementation_test.cc
@@ -0,0 +1,142 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/nnapi/nnapi_implementation.h"
+#include <gtest/gtest.h>
+
+namespace {
+
+TEST(NnapiLibTest, NnApiImplementation) {
+  const NnApi* nnapi = NnApiImplementation();
+  EXPECT_NE(nnapi, nullptr);
+#ifdef __ANDROID__
+  EXPECT_GT(nnapi->android_sdk_version, 0);
+  if (nnapi.android_sdk_version < 27) {
+    EXPECT_FALSE(nnapi->nnapi_exists);
+    EXPECT_EQ(nnapi->ANeuralNetworksMemory_createFromFd, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksMemory_free, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksModel_create, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksModel_free, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksModel_finish, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksModel_addOperand, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksModel_setOperandValue, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksModel_setOperandValueFromMemory, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksModel_addOperation, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksModel_identifyInputsAndOutputs, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksModel_relaxComputationFloat32toFloat16,
+              nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksCompilation_create, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksCompilation_free, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksCompilation_setPreference, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksCompilation_finish, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksExecution_create, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksExecution_free, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksExecution_setInput, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksExecution_setInputFromMemory, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksExecution_setOutput, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksExecution_setOutputFromMemory, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksExecution_startCompute, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksEvent_wait, nullptr);
+    EXPECT_EQ(nnapi->ANeuralNetworksEvent_free, nullptr);
+    EXPECT_EQ(nnapi->ASharedMemory_create, nullptr);
+  } else {
+    EXPECT_TRUE(nnapi->nnapi_exists);
+    EXPECT_NE(nnapi->ANeuralNetworksMemory_createFromFd, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksMemory_free, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksModel_create, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksModel_free, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksModel_finish, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksModel_addOperand, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksModel_setOperandValue, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksModel_setOperandValueFromMemory, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksModel_addOperation, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksModel_identifyInputsAndOutputs, nullptr);
+    if (nnapi->android_sdk_version >= 28) {
+      // relaxComputationFloat32toFloat16 only available with Android 9.0 (P).
+      EXPECT_NE(nnapi->ANeuralNetworksModel_relaxComputationFloat32toFloat16,
+                nullptr);
+    } else {
+      EXPECT_EQ(nnapi->ANeuralNetworksModel_relaxComputationFloat32toFloat16,
+                nullptr);
+    }
+    EXPECT_NE(nnapi->ANeuralNetworksCompilation_create, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksCompilation_free, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksCompilation_setPreference, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksCompilation_finish, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksExecution_create, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksExecution_free, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksExecution_setInput, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksExecution_setInputFromMemory, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksExecution_setOutput, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksExecution_setOutputFromMemory, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksExecution_startCompute, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksEvent_wait, nullptr);
+    EXPECT_NE(nnapi->ANeuralNetworksEvent_free, nullptr);
+    EXPECT_NE(nnapi->ASharedMemory_create, nullptr);
+    // TODO(b/123423795): Test Q-specific APIs after release.
+  }
+#else
+  EXPECT_FALSE(nnapi->nnapi_exists);
+  EXPECT_EQ(nnapi->android_sdk_version, 0);
+  EXPECT_EQ(nnapi->ANeuralNetworksMemory_createFromFd, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksMemory_free, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksModel_create, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksModel_free, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksModel_finish, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksModel_addOperand, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksModel_setOperandValue, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksModel_setOperandSymmPerChannelQuantParams,
+            nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksModel_setOperandValueFromMemory, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksModel_addOperation, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksModel_identifyInputsAndOutputs, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksModel_relaxComputationFloat32toFloat16,
+            nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksCompilation_create, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksCompilation_free, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksCompilation_setPreference, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksCompilation_finish, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksExecution_create, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksExecution_free, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksExecution_setInput, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksExecution_setInputFromMemory, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksExecution_setOutput, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksExecution_setOutputFromMemory, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksExecution_startCompute, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksEvent_wait, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksEvent_free, nullptr);
+  EXPECT_NE(nnapi->ASharedMemory_create, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworks_getDeviceCount, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworks_getDevice, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksDevice_getName, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksDevice_getVersion, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksDevice_getFeatureLevel, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksModel_getSupportedOperationsForDevices,
+            nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksCompilation_createForDevices, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksCompilation_setCaching, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksExecution_compute, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksExecution_getOutputOperandRank, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksExecution_getOutputOperandDimensions,
+            nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksBurst_create, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksBurst_free, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksExecution_burstCompute, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksMemory_createFromAHardwareBuffer, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksExecution_setMeasureTiming, nullptr);
+  EXPECT_EQ(nnapi->ANeuralNetworksExecution_getDuration, nullptr);
+#endif
+}
+
+}  // namespace
diff --git a/tensorflow/lite/nnapi_delegate.cc b/tensorflow/lite/nnapi_delegate.cc
index 26d75696a1c889d752f9715358701da6300f49df..d0605ef6b355c61ef85b55c601861771975e73c6 100644
--- a/tensorflow/lite/nnapi_delegate.cc
+++ b/tensorflow/lite/nnapi_delegate.cc
@@ -21,7 +21,7 @@ limitations under the License.
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/core/api/error_reporter.h"
 #include "tensorflow/lite/model.h"
-#include "tensorflow/lite/nnapi/NeuralNetworksShim.h"
+#include "tensorflow/lite/nnapi/nnapi_implementation.h"
 
 #ifdef __ANDROID__
 #include <android/log.h>
@@ -84,56 +84,27 @@ void logError(const char* format, ...) {
 static const int64_t kOperandIdNotSet = -1;
 static const int64_t kOperandNotNeeded = -2;
 
-namespace {
-
-int32_t GetAndroidSdkVersion() {
-#ifdef __ANDROID__
-  const char* sdkProp = "ro.build.version.sdk";
-  char sdkVersion[PROP_VALUE_MAX];
-  int length = __system_property_get(sdkProp, sdkVersion);
-  if (length != 0) {
-    for (int i = 0; i < length; ++i) {
-      int digit = sdkVersion[i] - '0';
-      if (digit < 0 || digit > 9) {
-        // Non-numeric SDK version, assume it's higher then expected;
-        return 0xFFFF;
-      }
-    }
-    return atoi(sdkVersion);
-  }
-  FATAL("No %s prop", sdkProp);
-#endif  // __ANDROID__
-  return 0;
-}
-
-int32_t GetAndroidSdkVersionCached() {
-  static int32_t androidSdkVersion = GetAndroidSdkVersion();
-  return androidSdkVersion;
-}
-
-}  // namespace
-
 NNAPIAllocation::NNAPIAllocation(const char* filename,
                                  ErrorReporter* error_reporter)
     : MMAPAllocation(filename, error_reporter) {
   if (mmapped_buffer_ != MAP_FAILED)
-    CHECK_NN(ANeuralNetworksMemory_createFromFd(buffer_size_bytes_, PROT_READ,
-                                                mmap_fd_, 0, &handle_));
+    CHECK_NN(NnApiImplementation()->ANeuralNetworksMemory_createFromFd(
+        buffer_size_bytes_, PROT_READ, mmap_fd_, 0, &handle_));
 }
 
 NNAPIAllocation::~NNAPIAllocation() {
   if (handle_) {
-    ANeuralNetworksMemory_free(handle_);
+    NnApiImplementation()->ANeuralNetworksMemory_free(handle_);
   }
 }
 
 NNAPIDelegate::~NNAPIDelegate() {
   if (nn_compiled_model_) {
-    ANeuralNetworksCompilation_free(nn_compiled_model_);
+    NnApiImplementation()->ANeuralNetworksCompilation_free(nn_compiled_model_);
     nn_compiled_model_ = nullptr;
   }
   if (nn_model_) {
-    ANeuralNetworksModel_free(nn_model_);
+    NnApiImplementation()->ANeuralNetworksModel_free(nn_model_);
     nn_model_ = nullptr;
     // TODO(aselle): Is this thread-safe and callable multiple times?
   }
@@ -145,6 +116,7 @@ TfLiteStatus addTensorOperands(tflite::Subgraph* subgraph,
                                ANeuralNetworksModel* nn_model,
                                uint32_t* no_of_operands_added,
                                std::vector<int64_t>* nnapi_ids) {
+  const NnApi* nnapi = NnApiImplementation();
   uint32_t next_id = 0;
   for (size_t i = 0; i < subgraph->tensors_size(); i++) {
     // Skip temporaries and RNN back-edges.
@@ -198,24 +170,24 @@ TfLiteStatus addTensorOperands(tflite::Subgraph* subgraph,
         nn_type, static_cast<uint32_t>(tensor->dims->size),
         reinterpret_cast<uint32_t*>(tensor->dims->data), scale, zeroPoint};
     RETURN_ERROR_IF_NN_FAILED(
-        ANeuralNetworksModel_addOperand(nn_model, &operand_type));
+        nnapi->ANeuralNetworksModel_addOperand(nn_model, &operand_type));
     // TODO(aselle): Based on Michael's suggestion, limiting this to read
     // only memory
     if (tensor->allocation_type == kTfLiteMmapRo) {
       if (const NNAPIAllocation* alloc = dynamic_cast<const NNAPIAllocation*>(
               static_cast<const Allocation*>(tensor->allocation))) {
         RETURN_ERROR_IF_NN_FAILED(
-            ANeuralNetworksModel_setOperandValueFromMemory(
+            nnapi->ANeuralNetworksModel_setOperandValueFromMemory(
                 nn_model, next_id, alloc->memory(),
                 alloc->offset(tensor->data.raw), tensor->bytes));
       } else {
-        RETURN_ERROR_IF_NN_FAILED(ANeuralNetworksModel_setOperandValue(
+        RETURN_ERROR_IF_NN_FAILED(nnapi->ANeuralNetworksModel_setOperandValue(
             nn_model, next_id, tensor->data.raw, tensor->bytes));
       }
     } else if (tensor->bytes == 0) {
       // These size 0 tensors are optional tensors reserved.
-      RETURN_ERROR_IF_NN_FAILED(
-          ANeuralNetworksModel_setOperandValue(nn_model, next_id, nullptr, 0));
+      RETURN_ERROR_IF_NN_FAILED(nnapi->ANeuralNetworksModel_setOperandValue(
+          nn_model, next_id, nullptr, 0));
     }
 
     ++next_id;
@@ -244,6 +216,7 @@ TfLiteStatus AddOpsAndParams(
     uint32_t next_id, std::vector<int>* model_state_inputs,
     std::vector<int>* model_state_outputs,
     const std::vector<int64_t>& tensor_id_to_nnapi_id) {
+  const NnApi* nnapi = NnApiImplementation();
   for (size_t i = 0; i < subgraph->nodes_size(); i++) {
     const auto* node_and_registration = subgraph->node_and_registration(i);
     const TfLiteNode& node = node_and_registration->first;
@@ -258,21 +231,21 @@ TfLiteStatus AddOpsAndParams(
     MapAndAddTensorIds(node.outputs->data, node.outputs->size,
                        &augmented_outputs, tensor_id_to_nnapi_id);
 
-    auto add_scalar_int32 = [&nn_model, &augmented_inputs,
+    auto add_scalar_int32 = [nnapi, &nn_model, &augmented_inputs,
                              &next_id](int value) {
       ANeuralNetworksOperandType operand_type{.type = ANEURALNETWORKS_INT32};
-      CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type))
-      CHECK_NN(ANeuralNetworksModel_setOperandValue(nn_model, next_id, &value,
-                                                    sizeof(int32_t)))
+      CHECK_NN(nnapi->ANeuralNetworksModel_addOperand(nn_model, &operand_type))
+      CHECK_NN(nnapi->ANeuralNetworksModel_setOperandValue(
+          nn_model, next_id, &value, sizeof(int32_t)))
       augmented_inputs.push_back(next_id++);
     };
 
-    auto add_scalar_float32 = [&nn_model, &augmented_inputs,
+    auto add_scalar_float32 = [nnapi, &nn_model, &augmented_inputs,
                                &next_id](float value) {
       ANeuralNetworksOperandType operand_type{.type = ANEURALNETWORKS_FLOAT32};
-      CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type))
-      CHECK_NN(ANeuralNetworksModel_setOperandValue(nn_model, next_id, &value,
-                                                    sizeof(float)))
+      CHECK_NN(nnapi->ANeuralNetworksModel_addOperand(nn_model, &operand_type))
+      CHECK_NN(nnapi->ANeuralNetworksModel_setOperandValue(
+          nn_model, next_id, &value, sizeof(float)))
       augmented_inputs.push_back(next_id++);
     };
 
@@ -281,8 +254,8 @@ TfLiteStatus AddOpsAndParams(
           .type = ANEURALNETWORKS_TENSOR_INT32,
           .dimensionCount = 1,
           .dimensions = &num_values};
-      CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type))
-      CHECK_NN(ANeuralNetworksModel_setOperandValue(
+      CHECK_NN(nnapi->ANeuralNetworksModel_addOperand(nn_model, &operand_type))
+      CHECK_NN(nnapi->ANeuralNetworksModel_setOperandValue(
           nn_model, next_id, values, sizeof(int32_t) * num_values));
       augmented_inputs.push_back(next_id++);
     };
@@ -291,15 +264,16 @@ TfLiteStatus AddOpsAndParams(
     // For each state_out tensor, a corresponding state_in operand needs to be
     // created for NNAPI.
     auto duplicate_state_tensor_float32 =
-        [subgraph, &nn_model, &next_id, &augmented_inputs, &model_state_inputs,
-         &model_state_outputs](int tensor_id) {
+        [nnapi, subgraph, &nn_model, &next_id, &augmented_inputs,
+         &model_state_inputs, &model_state_outputs](int tensor_id) {
           const TfLiteTensor* tensor = subgraph->tensor(tensor_id);
           ANeuralNetworksOperandType operand_type{
               ANEURALNETWORKS_TENSOR_FLOAT32,
               static_cast<uint32_t>(tensor->dims->size),
               reinterpret_cast<uint32_t*>(tensor->dims->data),
               tensor->params.scale, tensor->params.zero_point};
-          CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type));
+          CHECK_NN(
+              nnapi->ANeuralNetworksModel_addOperand(nn_model, &operand_type));
           augmented_inputs.push_back(next_id);
           model_state_inputs->push_back(next_id);
           model_state_outputs->push_back(tensor_id);
@@ -388,7 +362,7 @@ TfLiteStatus AddOpsAndParams(
     };
 
     // LSTM in NNAPI requires scratch tensor as an output operand.
-    auto add_lstm_scratch_tensor_float32 = [subgraph, &node, &nn_model,
+    auto add_lstm_scratch_tensor_float32 = [nnapi, subgraph, &node, &nn_model,
                                             &next_id, &augmented_outputs]() {
       if (node.temporaries->size == 0) return;
       int scratch_buffer_index = node.temporaries->data[0];
@@ -398,7 +372,7 @@ TfLiteStatus AddOpsAndParams(
           static_cast<uint32_t>(tensor->dims->size),
           reinterpret_cast<uint32_t*>(tensor->dims->data), tensor->params.scale,
           tensor->params.zero_point};
-      CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type));
+      CHECK_NN(nnapi->ANeuralNetworksModel_addOperand(nn_model, &operand_type));
       augmented_outputs.insert(augmented_outputs.begin(), next_id++);
     };
 
@@ -427,15 +401,16 @@ TfLiteStatus AddOpsAndParams(
     };
 
     // Handle optional input tensors.
-    auto add_optional_tensors = [&nn_model, &augmented_inputs,
+    auto add_optional_tensors = [nnapi, &nn_model, &augmented_inputs,
                                  &next_id](int nn_type) {
       for (size_t idx = 0; idx < augmented_inputs.size(); idx++) {
         if (augmented_inputs[idx] == kOptionalTensor) {
           const std::vector<uint32_t> dim = {0, 0};
           ANeuralNetworksOperandType operand_type{nn_type, 2, dim.data(), 0, 0};
-          CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type))
-          CHECK_NN(ANeuralNetworksModel_setOperandValue(nn_model, next_id,
-                                                        nullptr, 0))
+          CHECK_NN(
+              nnapi->ANeuralNetworksModel_addOperand(nn_model, &operand_type))
+          CHECK_NN(nnapi->ANeuralNetworksModel_setOperandValue(
+              nn_model, next_id, nullptr, 0))
           augmented_inputs[idx] = next_id++;
         }
       }
@@ -635,6 +610,7 @@ TfLiteStatus AddOpsAndParams(
       case tflite::BuiltinOperator_SPLIT:
       case tflite::BuiltinOperator_STRIDED_SLICE:
       case tflite::BuiltinOperator_EXP:
+      case tflite::BuiltinOperator_COS:
       case tflite::BuiltinOperator_LOG_SOFTMAX:
       case tflite::BuiltinOperator_DEQUANTIZE:
       case tflite::BuiltinOperator_DELEGATE:
@@ -686,6 +662,14 @@ TfLiteStatus AddOpsAndParams(
       case tflite::BuiltinOperator_MIRROR_PAD:
       case tflite::BuiltinOperator_ABS:
       case tflite::BuiltinOperator_SPLIT_V:
+      case tflite::BuiltinOperator_UNIQUE:
+      case tflite::BuiltinOperator_CEIL:
+      case tflite::BuiltinOperator_REVERSE_V2:
+      case tflite::BuiltinOperator_ADD_N:
+      case tflite::BuiltinOperator_GATHER_ND:
+      case tflite::BuiltinOperator_WHERE:
+      case tflite::BuiltinOperator_RANK:
+      case tflite::BuiltinOperator_ELU:
         logError("Op code %d is currently not delegated to NNAPI", builtin);
         return kTfLiteError;
         break;
@@ -695,13 +679,13 @@ TfLiteStatus AddOpsAndParams(
         break;
     }
 
-    if (nnapi_version == 11 && GetAndroidSdkVersionCached() < 28) {
+    if (nnapi_version == 11 && nnapi->android_sdk_version < 28) {
       logError("Op %d needs NNAPI1.1", builtin);
       return kTfLiteError;
     }
 
     // Add the operation.
-    RETURN_ERROR_IF_NN_FAILED(ANeuralNetworksModel_addOperation(
+    RETURN_ERROR_IF_NN_FAILED(nnapi->ANeuralNetworksModel_addOperation(
         nn_model, nn_op_type, static_cast<uint32_t>(augmented_inputs.size()),
         augmented_inputs.data(),
         static_cast<uint32_t>(augmented_outputs.size()),
@@ -713,9 +697,10 @@ TfLiteStatus AddOpsAndParams(
 TfLiteStatus NNAPIDelegate::BuildGraph(Subgraph* subgraph) {
   if (nn_model_ && nn_compiled_model_) return model_status_;
 
+  const NnApi* nnapi = NnApiImplementation();
   // TODO(aselle): This is not correct. need to handle resize invalidation.
   if (!nn_model_) {
-    CHECK_NN(ANeuralNetworksModel_create(&nn_model_));
+    CHECK_NN(nnapi->ANeuralNetworksModel_create(&nn_model_));
 
     // Find which tensors should be added to NNAPI. TFLite has temporaries
     // and RNN back-edges which are are not valid for NNAPI. We look through all
@@ -762,21 +747,22 @@ TfLiteStatus NNAPIDelegate::BuildGraph(Subgraph* subgraph) {
                        model_states_outputs_.size(), &augmented_outputs,
                        tensor_id_to_nnapi_id);
 
-    CHECK_NN(ANeuralNetworksModel_identifyInputsAndOutputs(
+    CHECK_NN(nnapi->ANeuralNetworksModel_identifyInputsAndOutputs(
         nn_model_, static_cast<uint32_t>(augmented_inputs.size()),
         reinterpret_cast<const uint32_t*>(augmented_inputs.data()),
         static_cast<uint32_t>(augmented_outputs.size()),
         reinterpret_cast<const uint32_t*>(augmented_outputs.data())));
 
-    if (GetAndroidSdkVersionCached() >= 28) {
-      CHECK_NN(ANeuralNetworksModel_relaxComputationFloat32toFloat16(
+    if (nnapi->android_sdk_version >= 28) {
+      CHECK_NN(nnapi->ANeuralNetworksModel_relaxComputationFloat32toFloat16(
           nn_model_, subgraph->GetAllowFp16PrecisionForFp32()));
     }
-    CHECK_NN(ANeuralNetworksModel_finish(nn_model_));
+    CHECK_NN(nnapi->ANeuralNetworksModel_finish(nn_model_));
   }
   if (!nn_compiled_model_) {
-    CHECK_NN(ANeuralNetworksCompilation_create(nn_model_, &nn_compiled_model_));
-    CHECK_NN(ANeuralNetworksCompilation_finish(nn_compiled_model_));
+    CHECK_NN(nnapi->ANeuralNetworksCompilation_create(nn_model_,
+                                                      &nn_compiled_model_));
+    CHECK_NN(nnapi->ANeuralNetworksCompilation_finish(nn_compiled_model_));
   }
   return kTfLiteOk;
 }
@@ -792,8 +778,10 @@ TfLiteStatus NNAPIDelegate::Invoke(Subgraph* subgraph) {
     return model_status_;
   }
 
+  const NnApi* nnapi = NnApiImplementation();
   ANeuralNetworksExecution* execution = nullptr;
-  CHECK_NN(ANeuralNetworksExecution_create(nn_compiled_model_, &execution));
+  CHECK_NN(
+      nnapi->ANeuralNetworksExecution_create(nn_compiled_model_, &execution));
 
   // Currently perform deep copy of input buffer
   for (size_t i = 0; i < subgraph->inputs().size(); i++) {
@@ -801,7 +789,7 @@ TfLiteStatus NNAPIDelegate::Invoke(Subgraph* subgraph) {
     // TODO(aselle): Is this what we want or do we want input instead?
     // TODO(aselle): This should be called setInputValue maybe to be cons.
     TfLiteTensor* tensor = subgraph->tensor(input);
-    CHECK_NN(ANeuralNetworksExecution_setInput(
+    CHECK_NN(nnapi->ANeuralNetworksExecution_setInput(
         execution, i, nullptr, tensor->data.raw, tensor->bytes));
   }
 
@@ -809,7 +797,7 @@ TfLiteStatus NNAPIDelegate::Invoke(Subgraph* subgraph) {
   for (size_t i = 0; i < subgraph->outputs().size(); i++) {
     int output = subgraph->outputs()[i];
     TfLiteTensor* tensor = subgraph->tensor(output);
-    CHECK_NN(ANeuralNetworksExecution_setOutput(
+    CHECK_NN(nnapi->ANeuralNetworksExecution_setOutput(
         execution, i, nullptr, tensor->data.raw, tensor->bytes));
   }
 
@@ -821,21 +809,21 @@ TfLiteStatus NNAPIDelegate::Invoke(Subgraph* subgraph) {
     // Here we are using a deep copy for state_in tensors so that we are not
     // reading and writing into the same buffer during a invocation.
     // TODO(miaowang): using double shared buffer to minimize the copies.
-    CHECK_NN(ANeuralNetworksExecution_setInput(
+    CHECK_NN(nnapi->ANeuralNetworksExecution_setInput(
         execution, i + subgraph->inputs().size(), nullptr, tensor->data.raw,
         tensor->bytes));
     // Tell NNAPI where to output the state_out.
-    CHECK_NN(ANeuralNetworksExecution_setOutput(
+    CHECK_NN(nnapi->ANeuralNetworksExecution_setOutput(
         execution, i + subgraph->outputs().size(), nullptr, tensor->data.raw,
         tensor->bytes));
   }
 
   // Currently use blocking compute.
   ANeuralNetworksEvent* event = nullptr;
-  CHECK_NN(ANeuralNetworksExecution_startCompute(execution, &event));
-  CHECK_NN(ANeuralNetworksEvent_wait(event));
-  ANeuralNetworksEvent_free(event);
-  ANeuralNetworksExecution_free(execution);
+  CHECK_NN(nnapi->ANeuralNetworksExecution_startCompute(execution, &event));
+  CHECK_NN(nnapi->ANeuralNetworksEvent_wait(event));
+  nnapi->ANeuralNetworksEvent_free(event);
+  nnapi->ANeuralNetworksExecution_free(execution);
 
 #if 0
   printf("From the NN API:\n");
@@ -853,6 +841,8 @@ TfLiteStatus NNAPIDelegate::Invoke(Subgraph* subgraph) {
   return kTfLiteOk;
 }
 
-bool NNAPIDelegate::IsSupported() { return NNAPIExists(); }
+bool NNAPIDelegate::IsSupported() {
+  return NnApiImplementation()->nnapi_exists;
+}
 
 }  // namespace tflite
diff --git a/tensorflow/lite/profiling/BUILD b/tensorflow/lite/profiling/BUILD
index 52ea6fe636247ec0a4d5fedb41c56fc095e6ac61..bbc252045baad0316333bf9bc19dd78b8bd58590 100644
--- a/tensorflow/lite/profiling/BUILD
+++ b/tensorflow/lite/profiling/BUILD
@@ -2,6 +2,7 @@ package(default_visibility = ["//visibility:public"])
 
 licenses(["notice"])  # Apache 2.0
 
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow/lite:build_def.bzl", "tflite_copts")
 
 common_copts = [
@@ -41,6 +42,17 @@ cc_library(
     copts = common_copts,
 )
 
+cc_test(
+    name = "time_test",
+    srcs = ["time_test.cc"],
+    copts = common_copts,
+    deps = [
+        ":time",
+        "//tensorflow/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 cc_library(
     name = "profile_summarizer",
     srcs = ["profile_summarizer.cc"],
@@ -54,15 +66,14 @@ cc_library(
     ],
 )
 
-cc_test(
+tf_cc_test(
     name = "profile_summarizer_test",
     srcs = ["profile_summarizer_test.cc"],
-    copts = common_copts,
+    extra_copts = common_copts,
     deps = [
         ":profile_summarizer",
         "//tensorflow/lite:framework",
         "//tensorflow/lite:schema_fbs_version",
-        "//tensorflow/lite/kernels:builtin_ops",
         "//tensorflow/lite/kernels:kernel_util",
         "//tensorflow/lite/kernels:test_util",
         "//tensorflow/lite/testing:util",
diff --git a/tensorflow/lite/profiling/profile_buffer.h b/tensorflow/lite/profiling/profile_buffer.h
index 247ebb37c53e7a1a7197155c0f63c877857289e1..9aa9e411314b2f389fda1bedaa290a87021ee254 100644
--- a/tensorflow/lite/profiling/profile_buffer.h
+++ b/tensorflow/lite/profiling/profile_buffer.h
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <cstddef>
 #include <cstdint>
+#include <cstdio>
 
 #include "tensorflow/lite/profiling/time.h"
 
@@ -78,6 +79,9 @@ class ProfileBuffer {
     }
     uint64_t timestamp = time::NowMicros();
     int index = current_index_ % event_buffer_.size();
+    if (current_index_ != 0 && index == 0) {
+      fprintf(stderr, "Warning: ProfileBuffer wrapping.\n");
+    }
     event_buffer_[index].tag = tag;
     event_buffer_[index].event_type = event_type;
     event_buffer_[index].event_metadata = event_metadata;
@@ -101,6 +105,7 @@ class ProfileBuffer {
     const uint32_t max_size = event_buffer_.size();
     if (current_index_ > (max_size + event_handle)) {
       // Ignore, buffer has already overflowed.
+      fprintf(stderr, "Warning: Dropping ProfileBuffer event.\n");
       return;
     }
 
diff --git a/tensorflow/lite/profiling/profile_summarizer.cc b/tensorflow/lite/profiling/profile_summarizer.cc
index 64b1bd7ad771c11412a2558bf4454ad2e06c0096..aaf35d64c4c472eb838c090933957011246c3411 100644
--- a/tensorflow/lite/profiling/profile_summarizer.cc
+++ b/tensorflow/lite/profiling/profile_summarizer.cc
@@ -123,14 +123,20 @@ void ProfileSummarizer::ProcessProfiles(
   int64_t base_start_us = events[0]->begin_timestamp_us;
   int node_num = 0;
   int64_t curr_total_us = 0;
+  auto tag_string = [](const string& s, const string& t) {
+    return t == "OpInvoke" ? s : s + "/" + t;
+  };
   for (auto event : events) {
     auto op_details = GetOperatorDetails(interpreter, event->event_metadata);
     auto node_name = ToString(op_details.outputs);
     int64_t start_us = event->begin_timestamp_us - base_start_us;
     int64_t node_exec_time =
         event->end_timestamp_us - event->begin_timestamp_us;
-    stats_calculator_->AddNodeStats(node_name, op_details.name, node_num,
-                                    start_us, node_exec_time, 0 /*memory */);
+    stats_calculator_->AddNodeStats(tag_string(node_name, event->tag),
+                                    tag_string(op_details.name, event->tag),
+                                    node_num, start_us, node_exec_time,
+                                    0 /*memory */);
+
     curr_total_us += node_exec_time;
     ++node_num;
   }
diff --git a/tensorflow/lite/profiling/profiler.h b/tensorflow/lite/profiling/profiler.h
index 89c05cba37b37a88b9d91db8f997e1fcecf43174..dd45518b5bfb3d84a5b2440c24dac707fcd02a78 100644
--- a/tensorflow/lite/profiling/profiler.h
+++ b/tensorflow/lite/profiling/profiler.h
@@ -153,9 +153,11 @@ class ScopedOperatorProfile {
 
 #define VARNAME_UNIQ(name, ctr) name##ctr
 
-#define SCOPED_OPERATOR_PROFILE(profiler, node_index)    \
-  tflite::profiling::ScopedOperatorProfile VARNAME_UNIQ( \
-      _profile_, __COUNTER__)((profiler), "OpInvoke", (node_index))
+#define SCOPED_TAGGED_OPERATOR_PROFILE(profiler, tag, node_index) \
+  tflite::profiling::ScopedOperatorProfile VARNAME_UNIQ(          \
+      _profile_, __COUNTER__)((profiler), (tag), (node_index))
+#define SCOPED_OPERATOR_PROFILE(profiler, node_index) \
+  SCOPED_TAGGED_OPERATOR_PROFILE((profiler), "OpInvoke", (node_index))
 #else
 
 namespace tflite {
@@ -172,6 +174,7 @@ class Profiler {
 }  // namespace profiling
 }  // namespace tflite
 
+#define SCOPED_TAGGED_OPERATOR_PROFILE(profiler, tag, node_index)
 #define SCOPED_OPERATOR_PROFILE(profiler, node_index)
 
 #endif  // TFLITE_PROFILING_ENABLED
diff --git a/tensorflow/lite/profiling/time.cc b/tensorflow/lite/profiling/time.cc
index 3e7db03d9d8df1eeb0c82d388324716c5e7d7896..32eb30070fb7d882cf7fd206fcfca2f81a09cfff 100644
--- a/tensorflow/lite/profiling/time.cc
+++ b/tensorflow/lite/profiling/time.cc
@@ -16,8 +16,10 @@ limitations under the License.
 
 #if defined(_MSC_VER)
 #include <chrono>  // NOLINT(build/c++11)
+#include <thread>  // NOLINT(build/c++11)
 #else
 #include <sys/time.h>
+#include <time.h>
 #endif
 
 namespace tflite {
@@ -32,12 +34,24 @@ uint64_t NowMicros() {
       .count();
 }
 
+void SleepForMicros(uint64_t micros) {
+  std::this_thread::sleep_for(std::chrono::microseconds(micros));
+}
+
 #else
 
 uint64_t NowMicros() {
   struct timeval tv;
   gettimeofday(&tv, nullptr);
-  return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
+  return static_cast<uint64_t>(tv.tv_sec) * 1e6 + tv.tv_usec;
+}
+
+void SleepForMicros(uint64_t micros) {
+  timespec sleep_time;
+  sleep_time.tv_sec = micros / 1e6;
+  micros -= sleep_time.tv_sec * 1e6;
+  sleep_time.tv_nsec = micros * 1e3;
+  nanosleep(&sleep_time, nullptr);
 }
 
 #endif  // defined(_MSC_VER)
diff --git a/tensorflow/lite/profiling/time.h b/tensorflow/lite/profiling/time.h
index 66233a480fd390619629e26a05284202057e0f4a..c7527ad0d2943e048518c78cf7375a65857c8dfe 100644
--- a/tensorflow/lite/profiling/time.h
+++ b/tensorflow/lite/profiling/time.h
@@ -21,6 +21,7 @@ namespace tflite {
 namespace profiling {
 namespace time {
 uint64_t NowMicros();
+void SleepForMicros(uint64_t micros);
 }  // namespace time
 }  // namespace profiling
 }  // namespace tflite
diff --git a/tensorflow/lite/profiling/time_test.cc b/tensorflow/lite/profiling/time_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6f08479adeb9311f7cf098f64edd8f3656928eeb
--- /dev/null
+++ b/tensorflow/lite/profiling/time_test.cc
@@ -0,0 +1,56 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/profiling/time.h"
+#include <gtest/gtest.h>
+#include "tensorflow/lite/testing/util.h"
+
+namespace tflite {
+namespace profiling {
+namespace time {
+
+TEST(TimeTest, NowMicros) {
+  auto now0 = NowMicros();
+  EXPECT_GT(now0, 0);
+  auto now1 = NowMicros();
+  EXPECT_GE(now1, now0);
+}
+
+TEST(TimeTest, SleepForMicros) {
+  // A zero sleep shouldn't cause issues.
+  SleepForMicros(0);
+
+  // Sleeping should be reflected in the current time.
+  auto now0 = NowMicros();
+  SleepForMicros(50);
+  auto now1 = NowMicros();
+  EXPECT_GE(now1, now0 + 50);
+
+  // Sleeping more than a second should function properly.
+  now0 = NowMicros();
+  SleepForMicros(1e6 + 50);
+  now1 = NowMicros();
+  EXPECT_GE(now1, now0 + 1e6 + 50);
+}
+
+}  // namespace time
+}  // namespace profiling
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/python/BUILD b/tensorflow/lite/python/BUILD
index acf827892bfd0081f1bbc7d0c3fa4f65af3a0817..a31f6cec707718d0a9c9ba5a96c7625f09cd724e 100644
--- a/tensorflow/lite/python/BUILD
+++ b/tensorflow/lite/python/BUILD
@@ -4,12 +4,6 @@ package(default_visibility = ["//tensorflow:internal"])
 
 load("//tensorflow:tensorflow.bzl", "py_test")
 
-filegroup(
-    name = "interpreter_test_data",
-    srcs = glob(["**/testdata/*"]),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 py_library(
     name = "interpreter",
     srcs = [
@@ -19,7 +13,6 @@ py_library(
     visibility = ["//visibility:public"],
     deps = [
         "//tensorflow/lite/python/interpreter_wrapper:tensorflow_wrap_interpreter_wrapper",
-        "//tensorflow/python:util",
         "//third_party/py/numpy",
     ],
 )
@@ -27,9 +20,11 @@ py_library(
 py_test(
     name = "interpreter_test",
     srcs = ["interpreter_test.py"],
-    data = [":interpreter_test_data"],
+    data = ["//tensorflow/lite/python/testdata:interpreter_test_data"],
     srcs_version = "PY2AND3",
-    tags = ["no_oss"],
+    tags = [
+        "no_windows",
+    ],
     deps = [
         ":interpreter",
         "//tensorflow/python:client_testlib",
@@ -44,6 +39,22 @@ py_binary(
     srcs = ["tflite_convert.py"],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
+    deps = [":tflite_convert_main_lib"],
+)
+
+py_library(
+    name = "tflite_convert_main_lib",
+    srcs = ["tflite_convert.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [":tflite_convert_lib"],
+)
+
+py_library(
+    name = "tflite_convert_lib",
+    srcs = ["tflite_convert.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
     deps = [
         ":lite",
     ],
@@ -60,7 +71,10 @@ py_library(
         ":interpreter",
         ":lite_constants",
         ":op_hint",
+        "//tensorflow/lite/experimental/examples/lstm:tflite_lstm_ops",
+        "//tensorflow/lite/python/optimize:calibrator",
         "//tensorflow/python:graph_util",
+        "//tensorflow/python:tf_optimizer",
         "//tensorflow/python/keras",
         "//tensorflow/python/saved_model:constants",
         "//tensorflow/python/saved_model:loader",
@@ -71,8 +85,39 @@ py_test(
     name = "lite_test",
     srcs = ["lite_test.py"],
     data = ["@tflite_mobilenet_ssd_quant_protobuf//:tflite_graph.pb"],
+    shard_count = 4,
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_windows",
+    ],
+    deps = [
+        ":lite",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+    ],
+)
+
+py_test(
+    name = "lite_v2_test",
+    srcs = ["lite_v2_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_windows",
+    ],
+    deps = [
+        ":lite",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+    ],
+)
+
+py_test(
+    name = "lite_flex_test",
+    srcs = ["lite_flex_test.py"],
     srcs_version = "PY2AND3",
     tags = [
+        # TODO(b/111881877): Enable in oss after resolving op registry issues.
         "no_oss",
         "no_windows",
     ],
@@ -115,8 +160,6 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/contrib/framework:framework_py",
-        "//tensorflow/contrib/graph_editor:graph_editor_py",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:framework",
         "//tensorflow/python:platform",
@@ -145,7 +188,6 @@ py_library(
     srcs = ["convert_saved_model.py"],
     srcs_version = "PY2AND3",
     visibility = [
-        "//tensorflow/contrib/lite:__subpackages__",
         "//tensorflow/lite:__subpackages__",
     ],
     deps = [
diff --git a/tensorflow/lite/python/convert.py b/tensorflow/lite/python/convert.py
index 9c603998717019ac8624868b16d720e300a30efd..4d38ee9be498beed9384e7ecf6be147c4086e7c0 100644
--- a/tensorflow/lite/python/convert.py
+++ b/tensorflow/lite/python/convert.py
@@ -443,7 +443,7 @@ def toco_convert_impl(input_data, input_tensors, output_tensors, *args,
   return data
 
 
-@_tf_export("lite.toco_convert")
+@_tf_export(v1=["lite.toco_convert"])
 @deprecation.deprecated(None, "Use `lite.TFLiteConverter` instead.")
 def toco_convert(input_data, input_tensors, output_tensors, *args, **kwargs):
   """Convert a model using TOCO.
diff --git a/tensorflow/lite/python/convert_saved_model_test.py b/tensorflow/lite/python/convert_saved_model_test.py
index 11bfcdc79548378a0cec8d13a089a8d505ccf7b0..fdcbc79ee9cfd4ccad15d59a0df4a7e520471b6c 100644
--- a/tensorflow/lite/python/convert_saved_model_test.py
+++ b/tensorflow/lite/python/convert_saved_model_test.py
@@ -93,7 +93,7 @@ class TensorFunctionsTest(test_util.TensorFlowTestCase):
         str(error.exception))
     self.assertEqual([None, 3, 5], tensor.shape.as_list())
 
-  @test_util.run_v1_only("b/120545219")
+  @test_util.run_deprecated_v1
   def testSetTensorShapeDimensionInvalid(self):
     # Tests set_tensor_shape where the shape passed in is incompatiable.
     tensor = array_ops.placeholder(shape=[None, 3, 5], dtype=dtypes.float32)
@@ -102,9 +102,8 @@ class TensorFunctionsTest(test_util.TensorFlowTestCase):
     with self.assertRaises(ValueError) as error:
       convert_saved_model.set_tensor_shapes([tensor],
                                             {"Placeholder": [1, 5, 5]})
-    self.assertIn(
-        "The shape of tensor 'Placeholder' cannot be changed from "
-        "(?, 3, 5) to [1, 5, 5].", str(error.exception))
+    self.assertIn("The shape of tensor 'Placeholder' cannot be changed",
+                  str(error.exception))
     self.assertEqual([None, 3, 5], tensor.shape.as_list())
 
   @test_util.run_v1_only("b/120545219")
diff --git a/tensorflow/lite/python/convert_test.py b/tensorflow/lite/python/convert_test.py
index cf49ee2b472d2c6617811cde0978eb8ae3a16f8e..e270abaa5afa0f2b3bb255e896c706794277c26e 100644
--- a/tensorflow/lite/python/convert_test.py
+++ b/tensorflow/lite/python/convert_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
 from tensorflow.python.framework.graph_util_impl import _bfs_for_reachable_nodes
 from tensorflow.python.framework.graph_util_impl import _extract_graph_summary
+from tensorflow.python.framework.graph_util_impl import _node_name
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
@@ -389,6 +390,29 @@ class ConvertTestOpHint(test_util.TensorFlowTestCase):
     with self.assertRaises(ValueError):
       convert.convert_dtype_to_tflite_type(dtypes.bool)
 
+  def testFindHintedOutputNodes(self):
+    """Test if all hinted output nodes are correctly found."""
+
+    def _build_ophinted_op(name, input1, input2):
+      custom_op = op_hint.OpHint(name)
+      input1 = custom_op.add_input(input1)
+      input2 = custom_op.add_input(input2)
+      output = math_ops.mul(input1, input2)
+      return custom_op.add_output(output)
+
+    output_1 = _build_ophinted_op("custom_op_1", array_ops.constant([1.]),
+                                  array_ops.constant([2.]))
+    output_2 = _build_ophinted_op("custom_op_2", array_ops.constant([3.]),
+                                  array_ops.constant([4.]))
+    with self.cached_session() as sess:
+      hinted_outputs_nodes = op_hint.find_all_hinted_output_nodes(sess)
+      expected_hinted_output_nodes = [
+          _node_name(output_1.name),
+          _node_name(output_2.name)
+      ]
+      self.assertEqual(
+          len(hinted_outputs_nodes), len(expected_hinted_output_nodes))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/lite/python/create_custom_op.py b/tensorflow/lite/python/create_custom_op.py
index 344cd28d160f2d3d4f277bbfb41aa21087659af5..e793f7fe2bc92d4eb5518adfd3a80a23d2195f29 100644
--- a/tensorflow/lite/python/create_custom_op.py
+++ b/tensorflow/lite/python/create_custom_op.py
@@ -62,7 +62,7 @@ def _read_graph_def(filename):
     raise ValueError("Input graph file '" + filename + "' does not exist!")
 
   graph_def = graph_pb2.GraphDef()
-  with gfile.FastGFile(filename, "rb") as f:
+  with gfile.GFile(filename, "rb") as f:
     graph_def.ParseFromString(f.read())
   return graph_def
 
diff --git a/tensorflow/lite/python/interpreter.py b/tensorflow/lite/python/interpreter.py
index a6183d13b56c787aac0d9d9fc190eff277eb4c8e..9b9516f6d0bdedb30e9ddcb419639920fe6e000f 100644
--- a/tensorflow/lite/python/interpreter.py
+++ b/tensorflow/lite/python/interpreter.py
@@ -19,20 +19,32 @@ from __future__ import print_function
 
 import sys
 import numpy as np
-from tensorflow.python.util.lazy_loader import LazyLoader
-from tensorflow.python.util.tf_export import tf_export as _tf_export
 
-# Lazy load since some of the performance benchmark skylark rules
-# break dependencies. Must use double quotes to match code internal rewrite
-# rule.
-# pylint: disable=g-inconsistent-quotes
-_interpreter_wrapper = LazyLoader(
-    "_interpreter_wrapper", globals(),
-    "tensorflow.lite.python.interpreter_wrapper."
-    "tensorflow_wrap_interpreter_wrapper")
-# pylint: enable=g-inconsistent-quotes
-
-del LazyLoader
+# pylint: disable=g-import-not-at-top
+try:
+  from tensorflow.python.util.lazy_loader import LazyLoader
+  from tensorflow.python.util.tf_export import tf_export as _tf_export
+
+  # Lazy load since some of the performance benchmark skylark rules
+  # break dependencies. Must use double quotes to match code internal rewrite
+  # rule.
+  # pylint: disable=g-inconsistent-quotes
+  _interpreter_wrapper = LazyLoader(
+      "_interpreter_wrapper", globals(),
+      "tensorflow.lite.python.interpreter_wrapper."
+      "tensorflow_wrap_interpreter_wrapper")
+  # pylint: enable=g-inconsistent-quotes
+
+  del LazyLoader
+except ImportError:
+  # When full Tensorflow Python PIP is not available do not use lazy load
+  # and instead uf the tflite_runtime path.
+  from tflite_runtime.lite.python import interpreter_wrapper as _interpreter_wrapper
+
+  def tf_export_dummy(*x, **kwargs):
+    del x, kwargs
+    return lambda x: x
+  _tf_export = tf_export_dummy
 
 
 @_tf_export('lite.Interpreter')
@@ -204,7 +216,8 @@ class Interpreter(object):
   def get_tensor(self, tensor_index):
     """Gets the value of the input tensor (get a copy).
 
-    If you wish to avoid the copy, use `tensor()`.
+    If you wish to avoid the copy, use `tensor()`. This function cannot be used
+    to read intermediate results.
 
     Args:
       tensor_index: Tensor index of tensor to get. This value can be gotten from
@@ -221,7 +234,8 @@ class Interpreter(object):
     This allows reading and writing to this tensors w/o copies. This more
     closely mirrors the C++ Interpreter class interface's tensor() member, hence
     the name. Be careful to not hold these output references through calls
-    to `allocate_tensors()` and `invoke()`.
+    to `allocate_tensors()` and `invoke()`. This function cannot be used to read
+    intermediate results.
 
     Usage:
 
diff --git a/tensorflow/lite/python/interpreter_test.py b/tensorflow/lite/python/interpreter_test.py
index 7ec56a21c9ffa82e1893d3846d92564539ac34ae..b21779226f62ead3fd4bde5aacdfc393a4d5bff9 100644
--- a/tensorflow/lite/python/interpreter_test.py
+++ b/tensorflow/lite/python/interpreter_test.py
@@ -91,6 +91,41 @@ class InterpreterTest(test_util.TensorFlowTestCase):
     output_data = interpreter.get_tensor(output_details[0]['index'])
     self.assertTrue((expected_output == output_data).all())
 
+  def testString(self):
+    interpreter = interpreter_wrapper.Interpreter(
+        model_path=resource_loader.get_path_to_datafile(
+            'testdata/gather_string.tflite'))
+    interpreter.allocate_tensors()
+
+    input_details = interpreter.get_input_details()
+    self.assertEqual(2, len(input_details))
+    self.assertEqual('input', input_details[0]['name'])
+    self.assertEqual(np.string_, input_details[0]['dtype'])
+    self.assertTrue(([10] == input_details[0]['shape']).all())
+    self.assertEqual((0.0, 0), input_details[0]['quantization'])
+    self.assertEqual('indices', input_details[1]['name'])
+    self.assertEqual(np.int64, input_details[1]['dtype'])
+    self.assertTrue(([3] == input_details[1]['shape']).all())
+    self.assertEqual((0.0, 0), input_details[1]['quantization'])
+
+    output_details = interpreter.get_output_details()
+    self.assertEqual(1, len(output_details))
+    self.assertEqual('output', output_details[0]['name'])
+    self.assertEqual(np.string_, output_details[0]['dtype'])
+    self.assertTrue(([3] == output_details[0]['shape']).all())
+    self.assertEqual((0.0, 0), output_details[0]['quantization'])
+
+    test_input = np.array([1, 2, 3], dtype=np.int64)
+    interpreter.set_tensor(input_details[1]['index'], test_input)
+
+    test_input = np.array(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'])
+    expected_output = np.array([b'b', b'c', b'd'])
+    interpreter.set_tensor(input_details[0]['index'], test_input)
+    interpreter.invoke()
+
+    output_data = interpreter.get_tensor(output_details[0]['index'])
+    self.assertTrue((expected_output == output_data).all())
+
 
 class InterpreterTestErrorPropagation(test_util.TensorFlowTestCase):
 
diff --git a/tensorflow/lite/python/interpreter_wrapper/BUILD b/tensorflow/lite/python/interpreter_wrapper/BUILD
index 767a9fc476398dd8fb60128f73f8ae7c518d9a21..6ec7ce497a51b9b7b66d680ea9a81ef47df51718 100644
--- a/tensorflow/lite/python/interpreter_wrapper/BUILD
+++ b/tensorflow/lite/python/interpreter_wrapper/BUILD
@@ -6,12 +6,26 @@ licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc")
 
+cc_library(
+    name = "numpy",
+    srcs = ["numpy.cc"],
+    hdrs = ["numpy.h"],
+    deps = [
+        "//third_party/py/numpy:headers",
+        "//third_party/python_runtime:headers",
+    ],
+)
+
 cc_library(
     name = "interpreter_wrapper_lib",
     srcs = ["interpreter_wrapper.cc"],
     hdrs = ["interpreter_wrapper.h"],
     deps = [
+        ":numpy",
+        ":python_error_reporter",
+        ":python_utils",
         "//tensorflow/lite:framework",
+        "//tensorflow/lite:string_util",
         "//tensorflow/lite/kernels:builtin_ops",
         "//third_party/py/numpy:headers",
         "//third_party/python_runtime:headers",
@@ -19,6 +33,29 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "python_error_reporter",
+    srcs = ["python_error_reporter.cc"],
+    hdrs = ["python_error_reporter.h"],
+    deps = [
+        "//tensorflow/lite/core/api",
+        "//third_party/python_runtime:headers",
+    ],
+)
+
+cc_library(
+    name = "python_utils",
+    srcs = ["python_utils.cc"],
+    hdrs = ["python_utils.h"],
+    deps = [
+        ":numpy",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:string_util",
+        "//third_party/py/numpy:headers",
+        "//third_party/python_runtime:headers",
+    ],
+)
+
 tf_py_wrap_cc(
     name = "tensorflow_wrap_interpreter_wrapper",
     srcs = [
diff --git a/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc b/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc
index d14af439ec0ab600ea260da17ef0041cca25d629..6023587d3b191d8c486dac78b889510ff1c22805 100644
--- a/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc
+++ b/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc
@@ -21,22 +21,10 @@ limitations under the License.
 #include "tensorflow/lite/interpreter.h"
 #include "tensorflow/lite/kernels/register.h"
 #include "tensorflow/lite/model.h"
-
-// Disallow Numpy 1.7 deprecated symbols.
-#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
-
-#include <Python.h>
-
-#include "numpy/arrayobject.h"
-#include "numpy/ufuncobject.h"
-
-#if PY_MAJOR_VERSION >= 3
-#define PY_TO_CPPSTRING PyBytes_AsStringAndSize
-#define CPP_TO_PYSTRING PyBytes_FromStringAndSize
-#else
-#define PY_TO_CPPSTRING PyString_AsStringAndSize
-#define CPP_TO_PYSTRING PyString_FromStringAndSize
-#endif
+#include "tensorflow/lite/python/interpreter_wrapper/numpy.h"
+#include "tensorflow/lite/python/interpreter_wrapper/python_error_reporter.h"
+#include "tensorflow/lite/python/interpreter_wrapper/python_utils.h"
+#include "tensorflow/lite/string_util.h"
 
 #define TFLITE_PY_CHECK(x)               \
   if ((x) != kTfLiteOk) {                \
@@ -60,43 +48,9 @@ limitations under the License.
 namespace tflite {
 namespace interpreter_wrapper {
 
-class PythonErrorReporter : public tflite::ErrorReporter {
- public:
-  PythonErrorReporter() {}
-
-  // Report an error message
-  int Report(const char* format, va_list args) override {
-    char buf[1024];
-    int formatted = vsnprintf(buf, sizeof(buf), format, args);
-    buffer_ << buf;
-    return formatted;
-  }
-
-  // Set's a Python runtime exception with the last error.
-  PyObject* exception() {
-    std::string last_message = message();
-    PyErr_SetString(PyExc_RuntimeError, last_message.c_str());
-    return nullptr;
-  }
-
-  // Gets the last error message and clears the buffer.
-  std::string message() {
-    std::string value = buffer_.str();
-    buffer_.clear();
-    return value;
-  }
-
- private:
-  std::stringstream buffer_;
-};
-
 namespace {
 
-// Calls PyArray's initialization to initialize all the API pointers. Note that
-// this usage implies only this translation unit can use the pointers. See
-// tensorflow/python/core/numpy.cc for a strategy if we ever need to extend
-// this further.
-void ImportNumpy() { import_array1(); }
+using python_utils::PyDecrefDeleter;
 
 std::unique_ptr<tflite::Interpreter> CreateInterpreter(
     const tflite::FlatBufferModel* model,
@@ -105,7 +59,7 @@ std::unique_ptr<tflite::Interpreter> CreateInterpreter(
     return nullptr;
   }
 
-  ImportNumpy();
+  ::tflite::python::ImportNumpy();
 
   std::unique_ptr<tflite::Interpreter> interpreter;
   if (tflite::InterpreterBuilder(*model, resolver)(&interpreter) != kTfLiteOk) {
@@ -114,65 +68,6 @@ std::unique_ptr<tflite::Interpreter> CreateInterpreter(
   return interpreter;
 }
 
-int TfLiteTypeToPyArrayType(TfLiteType tf_lite_type) {
-  switch (tf_lite_type) {
-    case kTfLiteFloat32:
-      return NPY_FLOAT32;
-    case kTfLiteInt32:
-      return NPY_INT32;
-    case kTfLiteInt16:
-      return NPY_INT16;
-    case kTfLiteUInt8:
-      return NPY_UINT8;
-    case kTfLiteInt8:
-      return NPY_INT8;
-    case kTfLiteInt64:
-      return NPY_INT64;
-    case kTfLiteString:
-      return NPY_OBJECT;
-    case kTfLiteBool:
-      return NPY_BOOL;
-    case kTfLiteComplex64:
-      return NPY_COMPLEX64;
-    case kTfLiteNoType:
-      return NPY_NOTYPE;
-      // Avoid default so compiler errors created when new types are made.
-  }
-  return NPY_NOTYPE;
-}
-
-TfLiteType TfLiteTypeFromPyArray(PyArrayObject* array) {
-  int pyarray_type = PyArray_TYPE(array);
-  switch (pyarray_type) {
-    case NPY_FLOAT32:
-      return kTfLiteFloat32;
-    case NPY_INT32:
-      return kTfLiteInt32;
-    case NPY_INT16:
-      return kTfLiteInt16;
-    case NPY_UINT8:
-      return kTfLiteUInt8;
-    case NPY_INT8:
-      return kTfLiteInt8;
-    case NPY_INT64:
-      return kTfLiteInt64;
-    case NPY_BOOL:
-      return kTfLiteBool;
-    case NPY_OBJECT:
-    case NPY_STRING:
-    case NPY_UNICODE:
-      return kTfLiteString;
-    case NPY_COMPLEX64:
-      return kTfLiteComplex64;
-      // Avoid default so compiler errors created when new types are made.
-  }
-  return kTfLiteNoType;
-}
-
-struct PyDecrefDeleter {
-  void operator()(PyObject* p) const { Py_DECREF(p); }
-};
-
 PyObject* PyArrayFromIntVector(const int* data, npy_intp size) {
   void* pydata = malloc(size * sizeof(int));
   memcpy(pydata, data, size * sizeof(int));
@@ -307,7 +202,7 @@ PyObject* InterpreterWrapper::TensorType(int i) const {
     return nullptr;
   }
 
-  int code = TfLiteTypeToPyArrayType(tensor->type);
+  int code = python_utils::TfLiteTypeToPyArrayType(tensor->type);
   if (code == -1) {
     PyErr_Format(PyExc_ValueError, "Invalid tflite type code %d", code);
     return nullptr;
@@ -350,38 +245,53 @@ PyObject* InterpreterWrapper::SetTensor(int i, PyObject* value) {
   }
 
   PyArrayObject* array = reinterpret_cast<PyArrayObject*>(array_safe.get());
-  const TfLiteTensor* tensor = interpreter_->tensor(i);
+  TfLiteTensor* tensor = interpreter_->tensor(i);
 
-  if (TfLiteTypeFromPyArray(array) != tensor->type) {
+  if (python_utils::TfLiteTypeFromPyArray(array) != tensor->type) {
     PyErr_Format(PyExc_ValueError,
                  "Cannot set tensor:"
                  " Got tensor of type %d"
                  " but expected type %d for input %d ",
-                 TfLiteTypeFromPyArray(array), tensor->type, i);
+                 python_utils::TfLiteTypeFromPyArray(array), tensor->type, i);
     return nullptr;
   }
 
   if (PyArray_NDIM(array) != tensor->dims->size) {
-    PyErr_SetString(PyExc_ValueError, "Cannot set tensor: Dimension mismatch");
+    PyErr_Format(PyExc_ValueError,
+                 "Cannot set tensor: Dimension mismatch."
+                 " Got %d"
+                 " but expected %d for input %d.",
+                 PyArray_NDIM(array), tensor->dims->size, i);
     return nullptr;
   }
 
   for (int j = 0; j < PyArray_NDIM(array); j++) {
     if (tensor->dims->data[j] != PyArray_SHAPE(array)[j]) {
-      PyErr_SetString(PyExc_ValueError,
-                      "Cannot set tensor: Dimension mismatch");
+      PyErr_Format(PyExc_ValueError,
+                   "Cannot set tensor: Dimension mismatch."
+                   " Got %ld"
+                   " but expected %d for dimension %d of input %d.",
+                   PyArray_SHAPE(array)[j], tensor->dims->data[j], j, i);
       return nullptr;
     }
   }
 
-  size_t size = PyArray_NBYTES(array);
-  if (size != tensor->bytes) {
-    PyErr_Format(PyExc_ValueError,
-                 "numpy array had %zu bytes but expected %zu bytes.", size,
-                 tensor->bytes);
-    return nullptr;
+  if (tensor->type != kTfLiteString) {
+    size_t size = PyArray_NBYTES(array);
+    if (size != tensor->bytes) {
+      PyErr_Format(PyExc_ValueError,
+                   "numpy array had %zu bytes but expected %zu bytes.", size,
+                   tensor->bytes);
+      return nullptr;
+    }
+    memcpy(tensor->data.raw, PyArray_DATA(array), size);
+  } else {
+    DynamicBuffer dynamic_buffer;
+    if (!python_utils::FillStringBufferWithPyArray(value, &dynamic_buffer)) {
+      return nullptr;
+    }
+    dynamic_buffer.WriteToTensor(tensor, nullptr);
   }
-  memcpy(tensor->data.raw, PyArray_DATA(array), size);
   Py_RETURN_NONE;
 }
 
@@ -400,7 +310,7 @@ PyObject* CheckGetTensorArgs(Interpreter* interpreter_, int tensor_index,
     return nullptr;
   }
 
-  *type_num = TfLiteTypeToPyArrayType((*tensor)->type);
+  *type_num = python_utils::TfLiteTypeToPyArrayType((*tensor)->type);
   if (*type_num == -1) {
     PyErr_SetString(PyExc_ValueError, "Unknown tensor type.");
     return nullptr;
@@ -428,19 +338,51 @@ PyObject* InterpreterWrapper::GetTensor(int i) const {
 
   std::vector<npy_intp> dims(tensor->dims->data,
                              tensor->dims->data + tensor->dims->size);
-  // Make a buffer copy but we must tell Numpy It owns that data or else
-  // it will leak.
-  void* data = malloc(tensor->bytes);
-  if (!data) {
-    PyErr_SetString(PyExc_ValueError, "Malloc to copy tensor failed.");
-    return nullptr;
+  if (tensor->type != kTfLiteString) {
+    // Make a buffer copy but we must tell Numpy It owns that data or else
+    // it will leak.
+    void* data = malloc(tensor->bytes);
+    if (!data) {
+      PyErr_SetString(PyExc_ValueError, "Malloc to copy tensor failed.");
+      return nullptr;
+    }
+    memcpy(data, tensor->data.raw, tensor->bytes);
+    PyObject* np_array =
+        PyArray_SimpleNewFromData(dims.size(), dims.data(), type_num, data);
+    PyArray_ENABLEFLAGS(reinterpret_cast<PyArrayObject*>(np_array),
+                        NPY_ARRAY_OWNDATA);
+    return PyArray_Return(reinterpret_cast<PyArrayObject*>(np_array));
+  } else {
+    // Create a C-order array so the data is contiguous in memory.
+    const int32_t kCOrder = 0;
+    PyObject* py_object =
+        PyArray_EMPTY(dims.size(), dims.data(), NPY_OBJECT, kCOrder);
+
+    if (py_object == nullptr) {
+      PyErr_SetString(PyExc_MemoryError, "Failed to allocate PyArray.");
+      return nullptr;
+    }
+
+    PyArrayObject* py_array = reinterpret_cast<PyArrayObject*>(py_object);
+    PyObject** data = reinterpret_cast<PyObject**>(PyArray_DATA(py_array));
+    auto num_strings = GetStringCount(tensor->data.raw);
+    for (int j = 0; j < num_strings; ++j) {
+      auto ref = GetString(tensor->data.raw, j);
+
+      PyObject* bytes = PyBytes_FromStringAndSize(ref.str, ref.len);
+      if (bytes == nullptr) {
+        Py_DECREF(py_object);
+        PyErr_Format(PyExc_ValueError,
+                     "Could not create PyBytes from string %d of input %d.", j,
+                     i);
+        return nullptr;
+      }
+      // PyArray_EMPTY produces an array full of Py_None, which we must decref.
+      Py_DECREF(data[j]);
+      data[j] = bytes;
+    }
+    return py_object;
   }
-  memcpy(data, tensor->data.raw, tensor->bytes);
-  PyObject* np_array =
-      PyArray_SimpleNewFromData(dims.size(), dims.data(), type_num, data);
-  PyArray_ENABLEFLAGS(reinterpret_cast<PyArrayObject*>(np_array),
-                      NPY_ARRAY_OWNDATA);
-  return PyArray_Return(reinterpret_cast<PyArrayObject*>(np_array));
 }
 
 PyObject* InterpreterWrapper::tensor(PyObject* base_object, int i) {
@@ -477,7 +419,8 @@ InterpreterWrapper* InterpreterWrapper::CreateWrapperCPPFromBuffer(
   char * buf = nullptr;
   Py_ssize_t length;
   std::unique_ptr<PythonErrorReporter> error_reporter(new PythonErrorReporter);
-  if (PY_TO_CPPSTRING(data, &buf, &length) == -1) {
+
+  if (python_utils::ConvertFromPyString(data, &buf, &length) == -1) {
     return nullptr;
   }
   std::unique_ptr<tflite::FlatBufferModel> model =
diff --git a/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.i b/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.i
index f52ef1eeca7db397d84d249b74445a3276bc65fb..ef4b28f04723ab8d7f4f395a028bb565b4ca9cf3 100644
--- a/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.i
+++ b/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.i
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/lite/interpreter.h"
 #include "tensorflow/lite/model.h"
 #include "tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.h"
+#include "tensorflow/lite/python/interpreter_wrapper/python_error_reporter.h"
 %}
 
 
diff --git a/tensorflow/lite/python/interpreter_wrapper/numpy.cc b/tensorflow/lite/python/interpreter_wrapper/numpy.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ff5403d2a60a66886681db73c4aa69bf43369170
--- /dev/null
+++ b/tensorflow/lite/python/interpreter_wrapper/numpy.cc
@@ -0,0 +1,25 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define TFLITE_IMPORT_NUMPY  // See numpy.h for explanation.
+#include "tensorflow/lite/python/interpreter_wrapper/numpy.h"
+
+namespace tflite {
+namespace python {
+
+void ImportNumpy() { import_array1(); }
+
+}  // namespace python
+}  // namespace tflite
diff --git a/tensorflow/lite/python/interpreter_wrapper/numpy.h b/tensorflow/lite/python/interpreter_wrapper/numpy.h
new file mode 100644
index 0000000000000000000000000000000000000000..a3b013fcb27ad1837dfb83efbcec2ae800850058
--- /dev/null
+++ b/tensorflow/lite/python/interpreter_wrapper/numpy.h
@@ -0,0 +1,62 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_PYTHON_INTERPRETER_WRAPPER_NUMPY_H_
+#define TENSORFLOW_LITE_PYTHON_INTERPRETER_WRAPPER_NUMPY_H_
+
+#ifdef PyArray_Type
+#error "Numpy cannot be included before numpy.h."
+#endif
+
+// Disallow Numpy 1.7 deprecated symbols.
+#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
+
+// To handle PyArray_* calles, numpy defines a static lookup table called
+// PyArray_API, or PY_ARRAY_UNIQUE_SYMBOL, if defined. This causes the
+// PyArray_* pointers to be different for different translation units, unless
+// we take care of selectivel defined NO_IMPORT_ARRAY.
+//
+// Virtually every usage will define NO_IMPORT_ARRAY, and will have access to
+// the lookup table via:
+//   extern void **PyArray_API;
+// In numpy.cc we will define TFLITE_IMPORT_NUMPY, effectively disabling that
+// and instead using:
+//   void **PyArray_API;
+// which is initialized when ImportNumpy() is called.
+//
+// If we don't define PY_ARRAY_UNIQUE_SYMBOL then PyArray_API is a static
+// variable, which causes strange crashes when the pointers are used across
+// translation unit boundaries.
+//
+// For mone info see https://sourceforge.net/p/numpy/mailman/message/5700519
+// See also tensorflow/python/lib/core/numpy.h for a similar approach.
+#define PY_ARRAY_UNIQUE_SYMBOL _tensorflow_numpy_api
+#ifndef TFLITE_IMPORT_NUMPY
+#define NO_IMPORT_ARRAY
+#endif
+
+#include <Python.h>
+
+#include "numpy/arrayobject.h"
+#include "numpy/ufuncobject.h"
+
+namespace tflite {
+namespace python {
+
+void ImportNumpy();
+
+}  // namespace python
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_PYTHON_INTERPRETER_WRAPPER_NUMPY_H_
diff --git a/tensorflow/lite/python/interpreter_wrapper/python_error_reporter.cc b/tensorflow/lite/python/interpreter_wrapper/python_error_reporter.cc
new file mode 100644
index 0000000000000000000000000000000000000000..803a4c29345a44bcdba41d851884fa86d6e87d3e
--- /dev/null
+++ b/tensorflow/lite/python/interpreter_wrapper/python_error_reporter.cc
@@ -0,0 +1,43 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/python/interpreter_wrapper/python_error_reporter.h"
+
+namespace tflite {
+namespace interpreter_wrapper {
+
+// Report an error message
+int PythonErrorReporter::Report(const char* format, va_list args) {
+  char buf[1024];
+  int formatted = vsnprintf(buf, sizeof(buf), format, args);
+  buffer_ << buf;
+  return formatted;
+}
+
+// Set's a Python runtime exception with the last error.
+PyObject* PythonErrorReporter::exception() {
+  std::string last_message = message();
+  PyErr_SetString(PyExc_RuntimeError, last_message.c_str());
+  return nullptr;
+}
+
+// Gets the last error message and clears the buffer.
+std::string PythonErrorReporter::message() {
+  std::string value = buffer_.str();
+  buffer_.clear();
+  return value;
+}
+}  // namespace interpreter_wrapper
+}  // namespace tflite
diff --git a/tensorflow/lite/python/interpreter_wrapper/python_error_reporter.h b/tensorflow/lite/python/interpreter_wrapper/python_error_reporter.h
new file mode 100644
index 0000000000000000000000000000000000000000..7d4e308834a21b795644f0c1f89607a3b75ad7ce
--- /dev/null
+++ b/tensorflow/lite/python/interpreter_wrapper/python_error_reporter.h
@@ -0,0 +1,49 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_PYTHON_INTERPRETER_WRAPPER_PYTHON_ERROR_REPORTER_H_
+#define TENSORFLOW_LITE_PYTHON_INTERPRETER_WRAPPER_PYTHON_ERROR_REPORTER_H_
+
+#include <Python.h>
+
+#include <sstream>
+#include <string>
+
+#include "tensorflow/lite/core/api/error_reporter.h"
+
+namespace tflite {
+namespace interpreter_wrapper {
+
+class PythonErrorReporter : public tflite::ErrorReporter {
+ public:
+  PythonErrorReporter() {}
+
+  // Report an error message
+  int Report(const char* format, va_list args) override;
+
+  // Sets a Python runtime exception with the last error and
+  // clears the error message buffer.
+  PyObject* exception();
+
+  // Gets the last error message and clears the buffer.
+  std::string message();
+
+ private:
+  std::stringstream buffer_;
+};
+
+}  // namespace interpreter_wrapper
+}  // namespace tflite
+#endif  // TENSORFLOW_LITE_PYTHON_INTERPRETER_WRAPPER_PYTHON_ERROR_REPORTER_H_
diff --git a/tensorflow/lite/python/interpreter_wrapper/python_utils.cc b/tensorflow/lite/python/interpreter_wrapper/python_utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e3d713630f6d39dd21b3c01cc4c75d4408243827
--- /dev/null
+++ b/tensorflow/lite/python/interpreter_wrapper/python_utils.cc
@@ -0,0 +1,180 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/python/interpreter_wrapper/python_utils.h"
+
+#include <memory>
+
+#include "tensorflow/lite/python/interpreter_wrapper/numpy.h"
+
+namespace tflite {
+namespace python_utils {
+
+struct PyObjectDereferencer {
+  void operator()(PyObject* py_object) const { Py_DECREF(py_object); }
+};
+
+using UniquePyObjectRef = std::unique_ptr<PyObject, PyObjectDereferencer>;
+
+int TfLiteTypeToPyArrayType(TfLiteType tf_lite_type) {
+  switch (tf_lite_type) {
+    case kTfLiteFloat32:
+      return NPY_FLOAT32;
+    case kTfLiteInt32:
+      return NPY_INT32;
+    case kTfLiteInt16:
+      return NPY_INT16;
+    case kTfLiteUInt8:
+      return NPY_UINT8;
+    case kTfLiteInt8:
+      return NPY_INT8;
+    case kTfLiteInt64:
+      return NPY_INT64;
+    case kTfLiteString:
+      return NPY_STRING;
+    case kTfLiteBool:
+      return NPY_BOOL;
+    case kTfLiteComplex64:
+      return NPY_COMPLEX64;
+    case kTfLiteNoType:
+      return NPY_NOTYPE;
+      // Avoid default so compiler errors created when new types are made.
+  }
+  return NPY_NOTYPE;
+}
+
+TfLiteType TfLiteTypeFromPyArray(PyArrayObject* array) {
+  int pyarray_type = PyArray_TYPE(array);
+  switch (pyarray_type) {
+    case NPY_FLOAT32:
+      return kTfLiteFloat32;
+    case NPY_INT32:
+      return kTfLiteInt32;
+    case NPY_INT16:
+      return kTfLiteInt16;
+    case NPY_UINT8:
+      return kTfLiteUInt8;
+    case NPY_INT8:
+      return kTfLiteInt8;
+    case NPY_INT64:
+      return kTfLiteInt64;
+    case NPY_BOOL:
+      return kTfLiteBool;
+    case NPY_OBJECT:
+    case NPY_STRING:
+    case NPY_UNICODE:
+      return kTfLiteString;
+    case NPY_COMPLEX64:
+      return kTfLiteComplex64;
+      // Avoid default so compiler errors created when new types are made.
+  }
+  return kTfLiteNoType;
+}
+
+#if PY_VERSION_HEX >= 0x03030000
+bool FillStringBufferFromPyUnicode(PyObject* value,
+                                   DynamicBuffer* dynamic_buffer) {
+  Py_ssize_t len = -1;
+  const char* buf = PyUnicode_AsUTF8AndSize(value, &len);
+  if (buf == NULL) {
+    PyErr_SetString(PyExc_ValueError, "PyUnicode_AsUTF8AndSize() failed.");
+    return false;
+  }
+  dynamic_buffer->AddString(buf, len);
+  return true;
+}
+#else
+bool FillStringBufferFromPyUnicode(PyObject* value,
+                                   DynamicBuffer* dynamic_buffer) {
+  UniquePyObjectRef utemp(PyUnicode_AsUTF8String(value));
+  if (!utemp) {
+    PyErr_SetString(PyExc_ValueError, "PyUnicode_AsUTF8String() failed.");
+    return false;
+  }
+  char* buf = nullptr;
+  Py_ssize_t len = -1;
+  if (PyBytes_AsStringAndSize(utemp.get(), &buf, &len) == -1) {
+    PyErr_SetString(PyExc_ValueError, "PyBytes_AsStringAndSize() failed.");
+    return false;
+  }
+  dynamic_buffer->AddString(buf, len);
+  return true;
+}
+#endif
+
+bool FillStringBufferFromPyString(PyObject* value,
+                                  DynamicBuffer* dynamic_buffer) {
+  if (PyUnicode_Check(value)) {
+    return FillStringBufferFromPyUnicode(value, dynamic_buffer);
+  }
+
+  char* buf = nullptr;
+  Py_ssize_t len = -1;
+  if (PyBytes_AsStringAndSize(value, &buf, &len) == -1) {
+    PyErr_SetString(PyExc_ValueError, "PyBytes_AsStringAndSize() failed.");
+    return false;
+  }
+  dynamic_buffer->AddString(buf, len);
+  return true;
+}
+
+bool FillStringBufferWithPyArray(PyObject* value,
+                                 DynamicBuffer* dynamic_buffer) {
+  PyArrayObject* array = reinterpret_cast<PyArrayObject*>(value);
+  switch (PyArray_TYPE(array)) {
+    case NPY_OBJECT:
+    case NPY_STRING:
+    case NPY_UNICODE: {
+      UniquePyObjectRef iter(PyArray_IterNew(value));
+      while (PyArray_ITER_NOTDONE(iter.get())) {
+        UniquePyObjectRef item(PyArray_GETITEM(
+            array, reinterpret_cast<char*>(PyArray_ITER_DATA(iter.get()))));
+
+        if (!FillStringBufferFromPyString(item.get(), dynamic_buffer)) {
+          return false;
+        }
+
+        PyArray_ITER_NEXT(iter.get());
+      }
+      return true;
+    }
+    default:
+      break;
+  }
+
+  PyErr_Format(PyExc_ValueError,
+               "Cannot use numpy array of type %d for string tensor.",
+               PyArray_TYPE(array));
+  return false;
+}
+
+int ConvertFromPyString(PyObject* obj, char** data, Py_ssize_t* length) {
+#if PY_MAJOR_VERSION >= 3
+  return PyBytes_AsStringAndSize(obj, data, length);
+#else
+  return PyString_AsStringAndSize(obj, data, length);
+#endif
+}
+
+PyObject* ConvertToPyString(const char* data, size_t length) {
+#if PY_MAJOR_VERSION >= 3
+  return PyBytes_FromStringAndSize(data, length);
+#else
+  return PyString_FromStringAndSize(data, length);
+#endif
+}
+
+}  // namespace python_utils
+}  // namespace tflite
diff --git a/tensorflow/lite/python/interpreter_wrapper/python_utils.h b/tensorflow/lite/python/interpreter_wrapper/python_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..f4677378cbc177b42c1b802b40beeba86ed605c4
--- /dev/null
+++ b/tensorflow/lite/python/interpreter_wrapper/python_utils.h
@@ -0,0 +1,42 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_PYTHON_INTERPRETER_WRAPPER_PYTHON_UTILS_H_
+#define TENSORFLOW_LITE_PYTHON_INTERPRETER_WRAPPER_PYTHON_UTILS_H_
+
+#include "tensorflow/lite/context.h"
+#include "tensorflow/lite/python/interpreter_wrapper/numpy.h"
+#include "tensorflow/lite/string_util.h"
+
+namespace tflite {
+namespace python_utils {
+
+struct PyDecrefDeleter {
+  void operator()(PyObject* p) const { Py_DECREF(p); }
+};
+
+int TfLiteTypeToPyArrayType(TfLiteType tf_lite_type);
+
+TfLiteType TfLiteTypeFromPyArray(PyArrayObject* array);
+
+bool FillStringBufferWithPyArray(PyObject* value,
+                                 DynamicBuffer* dynamic_buffer);
+
+int ConvertFromPyString(PyObject* obj, char** data, Py_ssize_t* length);
+PyObject* ConvertToPyString(const char* data, size_t length);
+
+}  // namespace python_utils
+}  // namespace tflite
+#endif  // TENSORFLOW_LITE_PYTHON_INTERPRETER_WRAPPER_PYTHON_UTILS_H_
diff --git a/tensorflow/lite/python/lite.py b/tensorflow/lite/python/lite.py
index 1b20ff2f92b6a84c21972ccccbc27ec6f999d74b..c8254518bd0a5a02769a9b503757342c2e9ac52c 100644
--- a/tensorflow/lite/python/lite.py
+++ b/tensorflow/lite/python/lite.py
@@ -12,31 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""TensorFlow Lite tooling helper functionality.
+"""TensorFlow Lite tooling helper functionality."""
 
-EXPERIMENTAL: APIs here are unstable and likely to change without notice.
-
-@@TocoConverter
-@@TFLiteConverter
-@@toco_convert
-@@toco_convert_protos
-@@Interpreter
-@@OpHint
-@@convert_op_hints_to_stubs
-@@build_toco_convert_protos
-
-@@TFLITE
-@@GRAPHVIZ_DOT
-
-"""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import warnings
+import enum
 from six import PY3
 
 from google.protobuf import text_format as _text_format
 from google.protobuf.message import DecodeError
+from tensorflow.lite.experimental.examples.lstm.rnn import dynamic_rnn  # pylint: disable=unused-import
+from tensorflow.lite.experimental.examples.lstm.rnn_cell import TFLiteLSTMCell  # pylint: disable=unused-import
+from tensorflow.lite.experimental.examples.lstm.rnn_cell import TfLiteRNNCell  # pylint: disable=unused-import
 from tensorflow.lite.python import lite_constants as constants
 from tensorflow.lite.python.convert import build_toco_convert_protos  # pylint: disable=unused-import
 from tensorflow.lite.python.convert import ConverterError  # pylint: disable=unused-import
@@ -52,20 +42,251 @@ from tensorflow.lite.python.convert_saved_model import set_tensor_shapes as _set
 from tensorflow.lite.python.interpreter import Interpreter  # pylint: disable=unused-import
 from tensorflow.lite.python.op_hint import convert_op_hints_to_stubs  # pylint: disable=unused-import
 from tensorflow.lite.python.op_hint import OpHint  # pylint: disable=unused-import
+from tensorflow.lite.python.optimize import calibrator as _calibrator
 from tensorflow.core.framework import graph_pb2 as _graph_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2 as _rewriter_config_pb2
+from tensorflow.core.protobuf import config_pb2 as _config_pb2
+from tensorflow.core.protobuf import meta_graph_pb2 as _meta_graph_pb2
 from tensorflow.python import keras as _keras
 from tensorflow.python.client import session as _session
+from tensorflow.python.eager import def_function as _def_function
+from tensorflow.python.eager import function as _function
+from tensorflow.python.framework import convert_to_constants as _convert_to_constants
+from tensorflow.python.framework import dtypes as _dtypes
 from tensorflow.python.framework import graph_util as _tf_graph_util
 from tensorflow.python.framework import ops as _ops
 from tensorflow.python.framework.errors_impl import NotFoundError as _NotFoundError
 from tensorflow.python.framework.importer import import_graph_def as _import_graph_def
+from tensorflow.python.grappler import tf_optimizer as _tf_optimizer
 from tensorflow.python.lib.io import file_io as _file_io
 from tensorflow.python.saved_model import signature_constants as _signature_constants
 from tensorflow.python.saved_model import tag_constants as _tag_constants
+from tensorflow.python.training.saver import export_meta_graph as _export_meta_graph
 from tensorflow.python.util import deprecation as _deprecation
 from tensorflow.python.util.tf_export import tf_export as _tf_export
 
 
+def _run_graph_optimizations(graph_def, input_arrays, output_arrays,
+                             graph=None):
+  """Apply standard TensorFlow optimizations to the graph_def.
+
+  Args:
+    graph_def: Frozen GraphDef to be optimized.
+    input_arrays: List of arrays that are considered inputs of the graph.
+    output_arrays: List of arrays that are considered outputs of the graph.
+    graph: TensorFlow Graph. Required when Eager mode is enabled. (default None)
+
+  Returns:
+    A new, optimized GraphDef.
+  """
+  meta_graph = _export_meta_graph(graph_def=graph_def, graph=graph)
+
+  # We need to add a collection called 'train_op' so that grappler
+  # knows what the outputs are.
+  fetch_collection = _meta_graph_pb2.CollectionDef()
+  for array in input_arrays + output_arrays:
+    fetch_collection.node_list.value.append(array.name)
+  meta_graph.collection_def["train_op"].CopyFrom(fetch_collection)
+
+  config = _config_pb2.ConfigProto()
+  rewrite_options = config.graph_options.rewrite_options
+  rewrite_options.layout_optimizer = _rewriter_config_pb2.RewriterConfig.ON
+  # Avoid remapping as it creates ops like _FusedConv2D, which are not
+  # supported by TF Lite.
+  rewrite_options.remapping = _rewriter_config_pb2.RewriterConfig.OFF
+  return _tf_optimizer.OptimizeGraph(config, meta_graph)
+
+
+@_tf_export("lite.Optimize")
+class Optimize(enum.Enum):
+  """Enum defining the optimizations to apply when generating tflite graphs.
+
+  Some optimizations may come at the cost of accuracy.
+  """
+
+  # Optimize for size.
+  #
+  # Optimizations that reduce the size of the model.
+  # The model size will be reduced. Optimizations can include quantizing the
+  # weights of the floating point model.
+  OPTIMIZE_FOR_SIZE = "OPTIMIZE_FOR_SIZE"
+
+  # Optimize for latency.
+  #
+  # Optimizations that reduce the latency of the model.
+  # The model latency will be reduced. Optimizations can include quantizing the
+  # weights of the floating point model.
+  OPTIMIZE_FOR_LATENCY = "OPTIMIZE_FOR_LATENCY"
+
+  def __str__(self):
+    return self.value
+
+
+@_tf_export("lite.RepresentativeDataset")
+class RepresentativeDataset(object):
+  """Representative dataset to evaluate optimizations.
+
+  A representative dataset that can be used to evaluate optimizations by the
+  converter. E.g. converter can use these examples to estimate (min, max) ranges
+  by calibrating the model on inputs. This can allow converter to quantize a
+  converted floating point model.
+  """
+
+  def __init__(self, input_gen, output_gen=None):
+    """Creates a representative dataset.
+
+    Args:
+      input_gen: an input generator that can be used to generate input samples
+        for the model. This must be a callable object that returns an object
+        that supports the `iter()` protocol (e.g. a generator function). The
+        elements generated must have same type and shape as inputs to the model.
+      output_gen: (optional) an output generator that can be used to generate
+        output samples for the model. This must be a callable object that
+        returns an object that supports the `iter()` protocol (e.g. a generator
+        function). The elements generated must have same type and shape as
+        outputs to the model. (default None)
+    """
+    self.input_gen = input_gen
+    self.output_gen = output_gen
+
+
+class TFLiteConverterV2(object):
+  """Converts a TensorFlow model into TensorFlow Lite model.
+
+  Attributes:
+    allow_custom_ops: Boolean indicating whether to allow custom operations.
+      When false any unknown operation is an error. When true, custom ops are
+      created for any op that is unknown. The developer will need to provide
+      these to the TensorFlow Lite runtime with a custom resolver. (default
+      False)
+    target_ops: Experimental flag, subject to change. Set of OpsSet options
+      indicating which converter to use. (default set([OpsSet.TFLITE_BUILTINS]))
+    optimizations: Experimental flag, subject to change, A list of optimizations
+      to apply when converting the model. The converter applies the
+      optimizations by giving priority to the optimizations specified earlier in
+      the list. E.g. `[optimize.OPTIMIZE_FOR_SIZE,
+      optimize.OPTIMIZE_FOR_LATENCY]` requires the converter to do both size and
+      latency optimizations giving priority to size optimizations over latency
+      optimizations.
+    representative_dataset: a representative dataset that can be used to
+      generate input and output samples for the model. The converter can use the
+      dataset to evaluate different optimizations.
+
+  Example usage:
+
+    ```python
+    # Converting a GraphDef from a ConcreteFunction.
+    converter = lite.TFLiteConverter.from_concrete_function(func)
+    tflite_model = converter.convert()
+    open("converted_model.tflite", "wb").write(tflite_model)
+    ```
+  """
+
+  def __init__(self, func):
+    """Constructor for TFLiteConverter.
+
+    Args:
+      func: TensorFlow ConcreteFunction.
+    """
+    self._func = func
+    self.allow_custom_ops = False
+    self.target_ops = set([OpsSet.TFLITE_BUILTINS])
+    self.representative_dataset = None
+    self.optimizations = []
+
+  @classmethod
+  def from_concrete_function(cls, func):
+    """Creates a TFLiteConverter class from a ConcreteFunction.
+
+    Args:
+      func: TensorFlow ConcreteFunction.
+
+    Returns:
+      TFLiteConverter class.
+    """
+    if not isinstance(func, _function.ConcreteFunction):
+      message = "This function takes in a ConcreteFunction."
+      if isinstance(func, _def_function.Function):
+        message += (" To get the ConcreteFunction from a Function,"
+                    " call from_concrete_function.")
+      raise ValueError(message)
+    return cls(func)
+
+  def convert(self):
+    """Converts a TensorFlow GraphDef based on instance variables.
+
+    Returns:
+      The converted data in serialized format.
+
+    Raises:
+      ValueError:
+        Input shape is not specified.
+        None value for dimension in input_tensor.
+    """
+    graph_def = _convert_to_constants.convert_variables_to_constants_v2(
+        self._func)
+    input_tensors = [
+        tensor for tensor in self._func.inputs
+        if tensor.dtype != _dtypes.resource
+    ]
+    output_tensors = self._func.outputs
+
+    # Run a Grappler pass.
+    graph_def = _run_graph_optimizations(graph_def, input_tensors,
+                                         output_tensors, self._func.graph)
+
+    # Checks dimensions in input tensor.
+    for tensor in input_tensors:
+      # Note that shape_list might be empty for scalar shapes.
+      shape_list = tensor.get_shape().as_list()
+      if None in shape_list[1:]:
+        raise ValueError(
+            "None is only supported in the 1st dimension. Tensor '{0}' has "
+            "invalid shape '{1}'.".format(_tensor_name(tensor), shape_list))
+      elif shape_list and shape_list[0] is None:
+        self._set_batch_size(batch_size=1)
+
+    if self.representative_dataset:
+      if not isinstance(self.representative_dataset, RepresentativeDataset):
+        raise TypeError("representative_dataset must be an instance of "
+                        "RepresentativeDataset")
+      if self.representative_dataset.input_gen is None:
+        raise ValueError(
+            "Provide an input generator for representative_dataset")
+
+    # TODO(shashishekhar): For now use optimizations order is ignored.
+    # Both size and latency optimizations decide whether to apply post
+    # training optimizations.
+    post_training_optimize = bool(
+        len(
+            set(self.optimizations)
+            & set([Optimize.OPTIMIZE_FOR_LATENCY, Optimize.OPTIMIZE_FOR_SIZE])))
+    # Do weights only quantization if there is no dataset for calibration.
+    weights_only_quantize_flag = (
+        post_training_optimize and (self.representative_dataset is None))
+
+    converter_kwargs = {
+        "input_format": constants.TENSORFLOW_GRAPHDEF,
+        "allow_custom_ops": self.allow_custom_ops,
+        "post_training_quantize": weights_only_quantize_flag,
+        "target_ops": self.target_ops,
+    }
+
+    # Converts model.
+    result = _toco_convert_impl(
+        input_data=graph_def,
+        input_tensors=input_tensors,
+        output_tensors=output_tensors,
+        **converter_kwargs)
+
+    if self.representative_dataset and post_training_optimize:
+      calibrate_quantize = _calibrator.Calibrator(result)
+      result = calibrate_quantize.calibrate_and_quantize(
+          self.representative_dataset.input_gen)
+
+    return result
+
+
 @_tf_export("lite.TFLiteConverter")
 class TFLiteConverter(object):
   """Convert a TensorFlow model into `output_format` using TOCO.
@@ -107,10 +328,11 @@ class TFLiteConverter(object):
       created for any op that is unknown. The developer will need to provide
       these to the TensorFlow Lite runtime with a custom resolver.
       (default False)
-    post_training_quantize: Boolean indicating whether to quantize the weights
-      of the converted float model. Model size will be reduced and there will be
-      latency improvements (at the cost of accuracy).
-      (default False)
+    post_training_quantize: deprecated, please specify
+     `[optimize.OPTIMIZE_FOR_SIZE]` for `optimizations` instead. Boolean
+     indicating whether to quantize the weights of the converted float model.
+     Model size will be reduced and there will be latency improvements
+     (at the cost of accuracy). (default False)
     dump_graphviz_dir: Full filepath of folder to dump the graphs at various
       stages of processing GraphViz .dot files. Preferred over
       --output_format=GRAPHVIZ_DOT in order to keep the requirements of the
@@ -120,6 +342,16 @@ class TFLiteConverter(object):
     target_ops: Experimental flag, subject to change. Set of OpsSet
       options indicating which converter to use.
       (default set([OpsSet.TFLITE_BUILTINS]))
+    optimizations: Experimental flag, subject to change, A list of
+      optimizations to apply when converting the model. The converter applies
+      the optimizations by giving priority to the optimizations specified
+      earlier in the list. E.g.
+      `[optimize.OPTIMIZE_FOR_SIZE, optimize.OPTIMIZE_FOR_LATENCY]` requires
+      the converter to do both size and latency optimizations giving priority
+      to size optimizations over latency optimizations.
+    representative_dataset: a representative dataset that can be used to
+      generate input and output samples for the model. The converter can use
+      the dataset to evaluate different optimizations.
 
   Example usage:
 
@@ -182,10 +414,12 @@ class TFLiteConverter(object):
     self.reorder_across_fake_quant = False
     self.change_concat_input_ranges = False
     self.allow_custom_ops = False
-    self.post_training_quantize = False
+    self._post_training_quantize = False
     self.dump_graphviz_dir = None
     self.dump_graphviz_video = False
     self.target_ops = set([OpsSet.TFLITE_BUILTINS])
+    self.representative_dataset = None
+    self.optimizations = []
 
     # Attributes are used by models that cannot be loaded into TensorFlow.
     if not self._has_valid_tensors():
@@ -385,6 +619,27 @@ class TFLiteConverter(object):
     graph_def = _freeze_graph(sess, output_tensors)
     return cls(graph_def, input_tensors, output_tensors)
 
+  def __setattr__(self, name, value):
+    if name == "post_training_quantize":
+      warnings.warn("Property %s is deprecated, "
+                    "please use optimizations=[Optimize.OPTIMIZE_FOR_SIZE]"
+                    " instead." % name)
+      if value:
+        # Use OPTIMIZE_FOR_SIZE for post training for now.
+        self.optimizations = [Optimize.OPTIMIZE_FOR_SIZE]
+      else:
+        self.optimizations = []
+      return
+    object.__setattr__(self, name, value)
+
+  def __getattribute__(self, name):
+    if name == "post_training_quantize":
+      warnings.warn("Property %s is deprecated, "
+                    "please use optimizations=[Optimize.OPTIMIZE_FOR_SIZE]"
+                    " instead." % name)
+      return Optimize.OPTIMIZE_FOR_SIZE in set(self.optimizations)
+    return object.__getattribute__(self, name)
+
   def convert(self):
     """Converts a TensorFlow GraphDef based on instance variables.
 
@@ -401,15 +656,16 @@ class TFLiteConverter(object):
     if self._has_valid_tensors():
       for tensor in self._input_tensors:
         shape = tensor.get_shape()
-        if not shape or not shape.as_list():
+        if not shape:
           raise ValueError("Provide an input shape for input array "
                            "'{0}'.".format(_tensor_name(tensor)))
+        # Note that shape_list might be empty for scalar shapes.
         shape_list = shape.as_list()
         if None in shape_list[1:]:
           raise ValueError(
               "None is only supported in the 1st dimension. Tensor '{0}' has "
               "invalid shape '{1}'.".format(_tensor_name(tensor), shape_list))
-        elif shape_list[0] is None:
+        elif shape_list and shape_list[0] is None:
           self._set_batch_size(batch_size=1)
 
     # Get quantization stats. Ensures there is one stat per name if the stats
@@ -428,6 +684,24 @@ class TFLiteConverter(object):
                          "tensors '{0}'.".format(",".join(invalid_stats)))
     else:
       quantized_stats = None
+    if self.representative_dataset:
+      if not isinstance(self.representative_dataset, RepresentativeDataset):
+        raise TypeError(
+            "representative_dataset must be an instance of "
+            "RepresentativeDataset")
+      if self.representative_dataset.input_gen is None:
+        raise ValueError(
+            "Provide an input generator for representative_dataset")
+
+    # TODO(shashishekhar): For now use optimizations order is ignored.
+    # Both size and latency optimizations decide whether to apply post
+    # training optimizations.
+    post_training_optimize = bool(
+        len(set(self.optimizations) & set([Optimize.OPTIMIZE_FOR_LATENCY,
+                                           Optimize.OPTIMIZE_FOR_SIZE])))
+    # Do weights only quantization if there is no dataset for calibration.
+    weights_only_quantize_flag = (
+        post_training_optimize and (self.representative_dataset is None))
 
     converter_kwargs = {
         "inference_type": self.inference_type,
@@ -440,25 +714,41 @@ class TFLiteConverter(object):
         "reorder_across_fake_quant": self.reorder_across_fake_quant,
         "change_concat_input_ranges": self.change_concat_input_ranges,
         "allow_custom_ops": self.allow_custom_ops,
-        "post_training_quantize": self.post_training_quantize,
+        "post_training_quantize": weights_only_quantize_flag,
         "target_ops": self.target_ops,
         "dump_graphviz_dir": self.dump_graphviz_dir,
         "dump_graphviz_video": self.dump_graphviz_video
     }
 
+    optimized_graph = None
+    if self.inference_type == constants.QUANTIZED_UINT8:
+      optimized_graph = self._graph_def
+    else:
+      try:
+        optimized_graph = _run_graph_optimizations(
+            self._graph_def, self._input_tensors, self._output_tensors)
+      except Exception:
+        optimized_graph = self._graph_def
+
     # Converts model.
     if self._has_valid_tensors():
       result = _toco_convert_impl(
-          input_data=self._graph_def,
+          input_data=optimized_graph,
           input_tensors=self._input_tensors,
           output_tensors=self._output_tensors,
           **converter_kwargs)
     else:
       result = _toco_convert_graph_def(
-          input_data=self._graph_def,
+          input_data=optimized_graph,
           input_arrays_with_shape=self._input_arrays_with_shape,
           output_arrays=self._output_arrays,
           **converter_kwargs)
+
+    if self.representative_dataset and post_training_optimize:
+      calibrate_quantize = _calibrator.Calibrator(result)
+      result = calibrate_quantize.calibrate_and_quantize(
+          self.representative_dataset.input_gen)
+
     return result
 
   def get_input_arrays(self):
@@ -500,7 +790,7 @@ class TFLiteConverter(object):
       tensor.set_shape(shape)
 
 
-@_tf_export("lite.TocoConverter")
+@_tf_export(v1=["lite.TocoConverter"])
 class TocoConverter(object):
   """Convert a TensorFlow model into `output_format` using TOCO.
 
diff --git a/tensorflow/lite/python/lite_flex_test.py b/tensorflow/lite/python/lite_flex_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5ae629413e782d011fafdb3b7e294cd884a301c
--- /dev/null
+++ b/tensorflow/lite/python/lite_flex_test.py
@@ -0,0 +1,58 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for lite.py functionality related to select TF op usage."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.lite.python import lite
+from tensorflow.lite.python.interpreter import Interpreter
+from tensorflow.python.client import session
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+@test_util.run_v1_only('b/120545219')
+class FromSessionTest(test_util.TensorFlowTestCase):
+
+  def testFlexMode(self):
+    in_tensor = array_ops.placeholder(
+        shape=[1, 16, 16, 3], dtype=dtypes.float32)
+    out_tensor = in_tensor + in_tensor
+    sess = session.Session()
+
+    # Convert model and ensure model is not None.
+    converter = lite.TFLiteConverter.from_session(sess, [in_tensor],
+                                                  [out_tensor])
+    converter.target_ops = set([lite.OpsSet.SELECT_TF_OPS])
+    tflite_model = converter.convert()
+    self.assertTrue(tflite_model)
+
+    # Ensures the model contains TensorFlow ops.
+    # TODO(nupurgarg): Check values once there is a Python delegate interface.
+    interpreter = Interpreter(model_content=tflite_model)
+    with self.assertRaises(RuntimeError) as error:
+      interpreter.allocate_tensors()
+    self.assertIn(
+        'Regular TensorFlow ops are not supported by this interpreter. Make '
+        'sure you invoke the Flex delegate before inference.',
+        str(error.exception))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/lite/python/lite_test.py b/tensorflow/lite/python/lite_test.py
index 1f9c768b4441cc1385d93285d26eeee9b651ca83..d41b7a75fd1d7523551a37baac8038af2624cb28 100644
--- a/tensorflow/lite/python/lite_test.py
+++ b/tensorflow/lite/python/lite_test.py
@@ -32,6 +32,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops.variables import global_variables_initializer as _global_variables_initializer
 from tensorflow.python.platform import gfile
@@ -113,6 +114,35 @@ class FromSessionTest(test_util.TensorFlowTestCase):
     self.assertTrue(([1, 16, 16, 3] == output_details[0]['shape']).all())
     self.assertEqual((0., 0.), output_details[0]['quantization'])
 
+  def testString(self):
+    in_tensor = array_ops.placeholder(shape=[4], dtype=dtypes.string)
+    out_tensor = array_ops.reshape(in_tensor, shape=[2, 2])
+    sess = session.Session()
+
+    # Convert model and ensure model is not None.
+    converter = lite.TFLiteConverter.from_session(sess, [in_tensor],
+                                                  [out_tensor])
+    tflite_model = converter.convert()
+    self.assertTrue(tflite_model)
+
+    # Check values from converted model.
+    interpreter = Interpreter(model_content=tflite_model)
+    interpreter.allocate_tensors()
+
+    input_details = interpreter.get_input_details()
+    self.assertEqual(1, len(input_details))
+    self.assertEqual('Placeholder', input_details[0]['name'])
+    self.assertEqual(np.string_, input_details[0]['dtype'])
+    self.assertTrue(([4] == input_details[0]['shape']).all())
+
+    output_details = interpreter.get_output_details()
+    self.assertEqual(1, len(output_details))
+    self.assertEqual('Reshape', output_details[0]['name'])
+    self.assertEqual(np.string_, output_details[0]['dtype'])
+    self.assertTrue(([2, 2] == output_details[0]['shape']).all())
+    # TODO(b/122659643): Test setting/getting string data via the python
+    # interpreter API after support has been added.
+
   def testQuantization(self):
     in_tensor_1 = array_ops.placeholder(
         shape=[1, 16, 16, 3], dtype=dtypes.float32, name='inputA')
@@ -223,18 +253,42 @@ class FromSessionTest(test_util.TensorFlowTestCase):
     self.assertEqual('Provide an input shape for input array \'Placeholder\'.',
                      str(error.exception))
 
-  def testSizeEmptyInvalid(self):
+  def testScalarValid(self):
+    # Construct a graph using a scalar (empty shape) input.
     in_tensor = array_ops.placeholder(dtype=dtypes.float32, shape=[])
     out_tensor = in_tensor + in_tensor
     sess = session.Session()
 
-    # Test empty shape.
+    # Test conversion with the scalar input shape.
     converter = lite.TFLiteConverter.from_session(sess, [in_tensor],
                                                   [out_tensor])
-    with self.assertRaises(ValueError) as error:
-      converter.convert()
-    self.assertEqual('Provide an input shape for input array \'Placeholder\'.',
-                     str(error.exception))
+    tflite_model = converter.convert()
+    self.assertTrue(tflite_model)
+
+    # Check values from converted model.
+    interpreter = Interpreter(model_content=tflite_model)
+    interpreter.allocate_tensors()
+
+    input_details = interpreter.get_input_details()
+    self.assertEqual(1, len(input_details))
+    self.assertEqual('Placeholder', input_details[0]['name'])
+    self.assertEqual(np.float32, input_details[0]['dtype'])
+    self.assertTrue(([] == input_details[0]['shape']).all())
+
+    output_details = interpreter.get_output_details()
+    self.assertEqual(1, len(output_details))
+    self.assertEqual('add', output_details[0]['name'])
+    self.assertEqual(np.float32, output_details[0]['dtype'])
+    self.assertTrue(([] == input_details[0]['shape']).all())
+
+    # Validate inference using the scalar inputs/outputs.
+    test_input = np.array(4.0, dtype=np.float32)
+    expected_output = np.array(8.0, dtype=np.float32)
+    interpreter.set_tensor(input_details[0]['index'], test_input)
+    interpreter.invoke()
+
+    output_data = interpreter.get_tensor(output_details[0]['index'])
+    self.assertTrue((expected_output == output_data).all())
 
   def testSizeInvalid(self):
     in_tensor = array_ops.placeholder(
@@ -428,6 +482,29 @@ class FromSessionTest(test_util.TensorFlowTestCase):
     self.assertTrue(([1, 16, 16, 3] == output_details[0]['shape']).all())
     self.assertTrue(output_details[0]['quantization'][0] > 0)  # scale
 
+  def testPostTrainingQuantizeDeprecatedAttribute(self):
+    in_tensor_1 = array_ops.placeholder(
+        shape=[33, 33], dtype=dtypes.float32, name='inputA')
+    in_tensor_2 = constant_op.constant(
+        np.random.uniform(low=-10., high=10., size=(33, 33)),
+        shape=[33, 33],
+        dtype=dtypes.float32,
+        name='inputB')
+    out_tensor = math_ops.matmul(in_tensor_1, in_tensor_2, name='output')
+    sess = session.Session()
+
+    quantized_converter = lite.TFLiteConverter.from_session(
+        sess, [in_tensor_1], [out_tensor])
+    self.assertFalse(quantized_converter.post_training_quantize)
+
+    quantized_converter.post_training_quantize = True
+    self.assertTrue(quantized_converter.post_training_quantize)
+    self.assertEqual(quantized_converter.optimizations,
+                     [lite.Optimize.OPTIMIZE_FOR_SIZE])
+
+    quantized_tflite = quantized_converter.convert()
+    self.assertTrue(quantized_tflite)
+
   def testPostTrainingQuantize(self):
     np.random.seed(0)
     # We need the tensor to have more than 1024 elements for quantize_weights
@@ -451,35 +528,58 @@ class FromSessionTest(test_util.TensorFlowTestCase):
     # Convert quantized weights model.
     quantized_converter = lite.TFLiteConverter.from_session(
         sess, [in_tensor_1], [out_tensor])
-    quantized_converter.post_training_quantize = True
+    quantized_converter.optimizations = [lite.Optimize.OPTIMIZE_FOR_SIZE]
     quantized_tflite = quantized_converter.convert()
     self.assertTrue(quantized_tflite)
 
     # Ensure that the quantized weights tflite model is smaller.
     self.assertTrue(len(quantized_tflite) < len(float_tflite))
 
-  def testFlexMode(self):
-    in_tensor = array_ops.placeholder(
-        shape=[1, 16, 16, 3], dtype=dtypes.float32)
-    out_tensor = in_tensor + in_tensor
+  def testPostTrainingCalibrateAndQuantize(self):
+    np.random.seed(0)
+    # Create a mobilenet like model.
+    output_channel = 16
+    depth_multiplier = 1
+    inp = array_ops.placeholder(dtype=dtypes.float32, shape=(1, 5, 5, 3))
+    conv = nn_ops.conv2d(
+        inp,
+        filter=array_ops.zeros([3, 3, 3, output_channel]),
+        strides=[1, 1, 1, 1],
+        padding='SAME')
+    dconv = nn_ops.depthwise_conv2d_native(
+        conv,
+        filter=array_ops.zeros(
+            [16, 16, output_channel, output_channel * depth_multiplier]),
+        strides=[1, 1, 1, 1],
+        padding='SAME')
+    pool = nn_ops.pool(
+        dconv, window_shape=[2, 2], pooling_type='AVG', padding='SAME')
+    max_pool = nn_ops.pool(
+        pool, window_shape=[2, 2], pooling_type='MAX', padding='SAME')
+    output = nn_ops.softmax(max_pool)
+
+    def calibration_gen():
+      for _ in range(10):
+        yield np.random.uniform(-1, 1, size=(1, 5, 5, 3)).astype(np.float32)
+
     sess = session.Session()
 
-    # Convert model and ensure model is not None.
-    converter = lite.TFLiteConverter.from_session(sess, [in_tensor],
-                                                  [out_tensor])
-    converter.target_ops = set([lite.OpsSet.SELECT_TF_OPS])
-    tflite_model = converter.convert()
-    self.assertTrue(tflite_model)
+    # Convert float model.
+    float_converter = lite.TFLiteConverter.from_session(sess, [inp], [output])
+    float_tflite = float_converter.convert()
+    self.assertTrue(float_tflite)
 
-    # Ensures the model contains TensorFlow ops.
-    # TODO(nupurgarg): Check values once there is a Python delegate interface.
-    interpreter = Interpreter(model_content=tflite_model)
-    with self.assertRaises(RuntimeError) as error:
-      interpreter.allocate_tensors()
-    self.assertIn(
-        'Regular TensorFlow ops are not supported by this interpreter. Make '
-        'sure you invoke the Flex delegate before inference.',
-        str(error.exception))
+    # Convert quantized weights model.
+    quantized_converter = lite.TFLiteConverter.from_session(
+        sess, [inp], [output])
+    quantized_converter.optimizations = [lite.Optimize.OPTIMIZE_FOR_SIZE]
+    quantized_converter.representative_dataset = lite.RepresentativeDataset(
+        calibration_gen)
+    quantized_tflite = quantized_converter.convert()
+    self.assertTrue(quantized_tflite)
+
+    # Ensure that the quantized weights tflite model is smaller.
+    self.assertTrue(len(quantized_tflite) < len(float_tflite))
 
   def testFloatTocoConverter(self):
     """Tests deprecated test TocoConverter."""
diff --git a/tensorflow/lite/python/lite_v2_test.py b/tensorflow/lite/python/lite_v2_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..126a7783486dde402fcf7957d8eaca6a71a93e01
--- /dev/null
+++ b/tensorflow/lite/python/lite_v2_test.py
@@ -0,0 +1,182 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for lite.py functionality related to TensorFlow 2.0."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.lite.python import lite
+from tensorflow.lite.python.interpreter import Interpreter
+from tensorflow.python.eager import def_function
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.saved_model.load import load
+from tensorflow.python.saved_model.save import save
+from tensorflow.python.training.tracking import tracking
+
+
+class FromConcreteFunctionTest(test_util.TensorFlowTestCase):
+
+  def _evaluateTFLiteModel(self, tflite_model, input_data):
+    """Evaluates the model on the `input_data`."""
+    interpreter = Interpreter(model_content=tflite_model)
+    interpreter.allocate_tensors()
+
+    input_details = interpreter.get_input_details()
+    output_details = interpreter.get_output_details()
+
+    for input_tensor, tensor_data in zip(input_details, input_data):
+      interpreter.set_tensor(input_tensor['index'], tensor_data.numpy())
+    interpreter.invoke()
+    return interpreter.get_tensor(output_details[0]['index'])
+
+  @test_util.run_v2_only
+  def testTypeInvalid(self):
+    root = tracking.AutoTrackable()
+    root.v1 = variables.Variable(3.)
+    root.v2 = variables.Variable(2.)
+    root.f = def_function.function(lambda x: root.v1 * root.v2 * x)
+
+    with self.assertRaises(ValueError) as error:
+      _ = lite.TFLiteConverterV2.from_concrete_function(root.f)
+    self.assertIn('call from_concrete_function', str(error.exception))
+
+  @test_util.run_v2_only
+  def testFloat(self):
+    input_data = constant_op.constant(1., shape=[1])
+    root = tracking.AutoTrackable()
+    root.v1 = variables.Variable(3.)
+    root.v2 = variables.Variable(2.)
+    root.f = def_function.function(lambda x: root.v1 * root.v2 * x)
+    concrete_func = root.f.get_concrete_function(input_data)
+
+    # Convert model.
+    converter = lite.TFLiteConverterV2.from_concrete_function(concrete_func)
+    tflite_model = converter.convert()
+
+    # Check values from converted model.
+    expected_value = root.f(input_data)
+    actual_value = self._evaluateTFLiteModel(tflite_model, [input_data])
+    self.assertEqual(expected_value.numpy(), actual_value)
+
+  @test_util.run_v2_only
+  def testSizeNone(self):
+    # Test with a shape of None
+    input_data = constant_op.constant(1., shape=None)
+    root = tracking.AutoTrackable()
+    root.v1 = variables.Variable(3.)
+    root.f = def_function.function(lambda x: root.v1 * x)
+    concrete_func = root.f.get_concrete_function(input_data)
+
+    # Convert model.
+    converter = lite.TFLiteConverterV2.from_concrete_function(concrete_func)
+    tflite_model = converter.convert()
+
+    # Check values from converted model.
+    expected_value = root.f(input_data)
+    actual_value = self._evaluateTFLiteModel(tflite_model, [input_data])
+    self.assertEqual(expected_value.numpy(), actual_value)
+
+  @test_util.run_v2_only
+  def testConstSavedModel(self):
+    """Test a basic model with functions to make sure functions are inlined."""
+    self.skipTest('b/124205572')
+    input_data = constant_op.constant(1., shape=[1])
+    root = tracking.AutoTrackable()
+    root.f = def_function.function(lambda x: 2. * x)
+    to_save = root.f.get_concrete_function(input_data)
+
+    save_dir = os.path.join(self.get_temp_dir(), 'saved_model')
+    save(root, save_dir, to_save)
+    saved_model = load(save_dir)
+    concrete_func = saved_model.signatures['serving_default']
+
+    # Convert model and ensure model is not None.
+    converter = lite.TFLiteConverterV2.from_concrete_function(concrete_func)
+    tflite_model = converter.convert()
+
+    # Check values from converted model.
+    expected_value = root.f(input_data)
+    actual_value = self._evaluateTFLiteModel(tflite_model, [input_data])
+    self.assertEqual(expected_value.numpy(), actual_value)
+
+  @test_util.run_v2_only
+  def testVariableSavedModel(self):
+    """Test a basic model with Variables with saving/loading the SavedModel."""
+    self.skipTest('b/124205572')
+    input_data = constant_op.constant(1., shape=[1])
+    root = tracking.AutoTrackable()
+    root.v1 = variables.Variable(3.)
+    root.v2 = variables.Variable(2.)
+    root.f = def_function.function(lambda x: root.v1 * root.v2 * x)
+    to_save = root.f.get_concrete_function(input_data)
+
+    save_dir = os.path.join(self.get_temp_dir(), 'saved_model')
+    save(root, save_dir, to_save)
+    saved_model = load(save_dir)
+    concrete_func = saved_model.signatures['serving_default']
+
+    # Convert model and ensure model is not None.
+    converter = lite.TFLiteConverterV2.from_concrete_function(concrete_func)
+    tflite_model = converter.convert()
+
+    # Check values from converted model.
+    expected_value = root.f(input_data)
+    actual_value = self._evaluateTFLiteModel(tflite_model, [input_data])
+    self.assertEqual(expected_value.numpy(), actual_value)
+
+  @test_util.run_v2_only
+  def testMultiFunctionModel(self):
+    """Test a basic model with Variables."""
+
+    class BasicModel(tracking.AutoTrackable):
+
+      def __init__(self):
+        self.y = None
+        self.z = None
+
+      @def_function.function
+      def add(self, x):
+        if self.y is None:
+          self.y = variables.Variable(2.)
+        return x + self.y
+
+      @def_function.function
+      def sub(self, x):
+        if self.z is None:
+          self.z = variables.Variable(3.)
+        return x - self.z
+
+    input_data = constant_op.constant(1., shape=[1])
+    root = BasicModel()
+    concrete_func = root.add.get_concrete_function(input_data)
+
+    # Convert model and ensure model is not None.
+    converter = lite.TFLiteConverterV2.from_concrete_function(concrete_func)
+    tflite_model = converter.convert()
+
+    # Check values from converted model.
+    expected_value = root.add(input_data)
+    actual_value = self._evaluateTFLiteModel(tflite_model, [input_data])
+    self.assertEqual(expected_value.numpy(), actual_value)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/lite/python/op_hint.py b/tensorflow/lite/python/op_hint.py
index 8d7f9316bfe81255510fc5aca9ffdf9671cd64df..90c87503f48e9307bdd79130953f5c1a12e2708f 100644
--- a/tensorflow/lite/python/op_hint.py
+++ b/tensorflow/lite/python/op_hint.py
@@ -36,9 +36,7 @@ Example:
   session = tf.Session()
 
   graphdef_to_convert = tf.lite.convert_op_hints_to_stubs(session)
-  tflite_graph = tf.lite.toco_convert(graphdef_to_convert,
-                                              [image], [output])
-                                              [image], [output])
+  tflite_graph = tf.lite.toco_convert(graphdef_to_convert, [image], [output])
   with open("/tmp/graph.fb", "wb") as fp:
     fp.write(tflite_graph)
 
@@ -73,6 +71,7 @@ from __future__ import print_function
 
 import collections as _collections
 import copy as _copy
+import json as _json
 import uuid as _uuid
 import six as _six
 
@@ -134,6 +133,14 @@ class OpHint(object):
   # "stuff", "foo", "bar", -1 (where -1 is unused). So you would set this
   # attribute to [2, 0, 1, -1].
   TFLITE_INPUT_INDICES = "_tflite_input_indices"
+  # OpHint level.
+  FUNCTION_LEVEL_ATTR = "_tflite_ophint_level"
+  # Ophint internal mapping, this is for high level Ophint only.
+  # This basically contains three kinds of mapping:
+  #   1) How parental ophinted inputs map to the first child ophinted inputs;
+  #   2) How internal children nodes are connected;
+  #   3) How parental ophinted outputs map to the last child ophinted outputs.
+  CHILDREN_INPUTS_MAPPINGS = "_tflite_children_ophint_inputs_mapping"
 
   # Types of aggregations
   #  stack: stacks all ophints with matching tags. i.e. for a static rnn.
@@ -151,10 +158,16 @@ class OpHint(object):
     """Conceptually tracks indices of arguments of "OpHint functions".
 
     The inputs and arguments of these functions both use an instance
-    of the class so they can have independent numbering."""
+    of the class so they can have independent numbering.
+    """
 
-    def __init__(self, function_name, unique_function_id, node_name_prefix,
-                 attr_name):
+    def __init__(self,
+                 function_name,
+                 unique_function_id,
+                 node_name_prefix,
+                 attr_name,
+                 level=1,
+                 children_inputs_mappings=None):
       """Initialize ophint argument.
 
       Args:
@@ -163,6 +176,8 @@ class OpHint(object):
         node_name_prefix: How identities that are created are named.
         attr_name: Name of attribute to use to store the index for this hint.
           i.e. FUNCTION_INPUT_INDEX or FUNCTION_OUTPUT_INDEX
+        level: Hierarchical level of the Ophint node, a number.
+        children_inputs_mappings: Inputs/Outputs mapping for children hints.
       """
 
       # The global index is the argument index of the op. This is in contrast
@@ -178,6 +193,8 @@ class OpHint(object):
       self._tag_to_next_sort_index = {}  # The current index for each tag
       self._node_name_prefix = node_name_prefix
       self._attr_name = attr_name
+      self._level = level
+      self._children_inputs_mappings = children_inputs_mappings
 
     def _get_new_global_index(self, index_override):
       """Return the next unused argument index in order or use an override.
@@ -253,6 +270,7 @@ class OpHint(object):
       uuid = self._unique_function_id
       name = "%s-%s-%s-%r-%r-%s" % (self._node_name_prefix, self._function_name,
                                     uuid, global_index, sort_index, name)
+
       identity_op = _array_ops.identity(arg, name=name)
 
       # pylint: disable=protected-access
@@ -266,6 +284,15 @@ class OpHint(object):
               s=_compat.as_bytes(self._unique_function_id)))
       identity_op.op._set_attr(
           self._attr_name, _attr_value_pb2.AttrValue(i=global_index))
+      identity_op.op._set_attr(OpHint.FUNCTION_LEVEL_ATTR,
+                               _attr_value_pb2.AttrValue(i=self._level))
+      if self._children_inputs_mappings:
+        identity_op.op._set_attr(
+            OpHint.CHILDREN_INPUTS_MAPPINGS,
+            _attr_value_pb2.AttrValue(
+                s=_compat.as_bytes(_json.dumps(
+                    self._children_inputs_mappings))))
+
       if sort_index is not None:
         identity_op.op._set_attr(
             OpHint.FUNCTION_SORT_INDEX_ATTR,
@@ -277,23 +304,74 @@ class OpHint(object):
       # pylint: enable=protected-access
       return identity_op
 
-  def __init__(self, function_name, **kwargs):
+  def __init__(self,
+               function_name,
+               level=1,
+               children_inputs_mappings=None,
+               **kwargs):
     """Create a OpHint.
 
     Args:
       function_name: Name of the function (the custom op name in tflite)
+      level: OpHint level.
+      children_inputs_mappings: Children OpHint inputs/outputs mapping.
+        children_inputs_mappings should like below:
+        "parent_first_child_input":
+            [{"parent_input_index": num, "child_input_index": num}, ...]
+        "parent_last_child_output":
+            [{"parent_output_index": num, "child_output_index": num}, ...]
+        "internal_children_input_output":
+            [{"child_input_index": num, "child_output_index": num}, ...]
       **kwargs: Keyword arguments of any constant attributes for the function.
     """
     self._function_name = function_name
+    self._level = level
+    if self._level == 1:
+      assert children_inputs_mappings is None
+    else:
+      assert isinstance(children_inputs_mappings, dict)
+    self._children_inputs_mappings = children_inputs_mappings
+    if self._children_inputs_mappings is not None:
+      self._validate_children_inputs_mappings(self._children_inputs_mappings)
     self._unique_function_id = _uuid.uuid1().hex  # TODO(aselle): Unique enough?
     self._attrs_to_store_later = kwargs
     self._stored_attrs = False
     self._inputs = OpHint.OpHintArgumentTracker(
         self._function_name, self._unique_function_id, "InputHint",
-        OpHint.FUNCTION_INPUT_INDEX_ATTR)
+        OpHint.FUNCTION_INPUT_INDEX_ATTR, level, self._children_inputs_mappings)
     self._outputs = OpHint.OpHintArgumentTracker(
         self._function_name, self._unique_function_id, "OutputHint",
-        OpHint.FUNCTION_OUTPUT_INDEX_ATTR)
+        OpHint.FUNCTION_OUTPUT_INDEX_ATTR, level,
+        self._children_inputs_mappings)
+
+  def _validate_children_inputs_mappings(self, children_inputs_mappings):
+    """Validate children inputs mappings is in the right format.
+
+    Args:
+      children_inputs_mappings: the Children ophint inputs/outputs mapping.
+    """
+    assert isinstance(children_inputs_mappings, dict)
+    assert "parent_first_child_input" in children_inputs_mappings
+    assert "parent_last_child_output" in children_inputs_mappings
+    assert "internal_children_input_output" in children_inputs_mappings
+
+    # validate parent_first_child_input.
+
+    def assert_dictlist_has_keys(dictlist, keys):
+      for dikt in dictlist:
+        assert isinstance(dikt, dict)
+        for key in keys:
+          assert key in dikt
+
+    assert_dictlist_has_keys(
+        children_inputs_mappings["parent_first_child_input"],
+        ["parent_ophint_input_index", "first_child_ophint_input_index"])
+    assert_dictlist_has_keys(
+        children_inputs_mappings["parent_last_child_output"],
+        ["parent_output_index", "child_output_index"])
+    assert_dictlist_has_keys(
+        children_inputs_mappings["internal_children_input_output"],
+        ["child_input_index", "child_output_index"])
 
   def _setattr(self, dest_op, name, value):
     tensor_value = _ops.convert_to_tensor(value)
@@ -384,7 +462,7 @@ class OpHint(object):
 
 
 class _LiteOperand(object):
-  """Abstract operand for a tflite hint function.
+  """Abstract operand for a tflite hint function._dynamic_rnn_loop.
 
   This is a base class that handles representing arguments to an OpHint.
   It also is able to serialize operands to the stubbed graph_def.
@@ -582,15 +660,18 @@ class _LiteFuncCall(object):
   This is uses to accumulate found hints in the graphdef into a single
   conceptual unit.
 
-  Properties:
-    self.inputs: inputs to the op (hash from index # to argument)
-    self.outputs: outputs to the op (hash from index # to argument)
-    self.function_name: the tflite custom op name to use
-    self.uuid: a unique call id for this particular call  (i.e.
+  Attributes:
+    inputs: inputs to the op (hash from index # to argument)
+    outputs: outputs to the op (hash from index # to argument)
+    function_name: the tflite custom op name to use
+    uuid: a unique call id for this particular call  (i.e.
       multiple function calls would have the same function_name but different
       uuids.
-    self.params: A param name to key value for op constant data. I.e. for
+    params: A param name to key value for op constant data. I.e. for
       axis on a reduction, strides on a convolution, etc.
+    level: Level of the OpHint.
+    children_inputs_mappings: If the Ophint has children, children inputs
+      mappings indicate how their inputs & outputs are mapped.
   """
 
   def __init__(self):
@@ -599,6 +680,8 @@ class _LiteFuncCall(object):
     self.function_name = None
     self.uuid = None
     self.params = {}
+    self.level = -1
+    self.children_inputs_mappings = {}
 
   def flattened_inputs_and_outputs(self):
     """Return a list of inputs and outputs in a flattened format.
@@ -624,22 +707,25 @@ class _LiteFuncCall(object):
     inputs_str = "\tInputs\n" + format_args(self.inputs)
     outputs_str = "\tOutputs\n" + format_args(self.outputs)
 
-    return ("tflite function %s call %s\n\tinputs:\n\t\t%s\n\toutputs:\n\t\t%s"
-            % (self.function_name, self.uuid, inputs_str, outputs_str))
+    return (
+        "tflite function %s call %s level %d "
+        "\n\tinputs:\n\t\t%s\n\toutputs:\n\t\t%s" %
+        (self.function_name, self.uuid, self.level, inputs_str, outputs_str))
 
 
-def _find_all_hints_in_graph_def(graphdef):
-  """Look at the current default graph and return a list of LiteFuncCall objs.
+def _find_all_hints_in_nodes(nodes):
+  """Look at the all the input nodes and return a list of LiteFuncCall objs.
 
   Args:
-    graphdef: A TensorFlow graph_def to look for LiteFuncCalls.
+    nodes: A TensorFlow graph_def to look for LiteFuncCalls.
+
   Returns:
     a list of `LifeFuncCall` objects in the form
 
   """
   func_calls = _collections.defaultdict(_LiteFuncCall)
 
-  for node in graphdef.node:
+  for node in nodes:
     attr = node.attr
     # This is an op hint if it has a FUNCTION_UUID_ATTR, otherwise skip
     uuid = attr[OpHint.FUNCTION_UUID_ATTR].s
@@ -651,6 +737,7 @@ def _find_all_hints_in_graph_def(graphdef):
     call_def = func_calls[uuid]
     call_def.uuid = uuid
     call_def.function_name = attr[OpHint.FUNCTION_NAME_ATTR].s
+    call_def.level = attr[OpHint.FUNCTION_LEVEL_ATTR].i
     # Get sorting and aggregation information
 
     sort = (attr[OpHint.FUNCTION_SORT_INDEX_ATTR].i
@@ -660,6 +747,10 @@ def _find_all_hints_in_graph_def(graphdef):
     if OpHint.FUNCTION_AGGREGATE_ATTR in attr:
       aggregation = _compat.as_text(attr[OpHint.FUNCTION_AGGREGATE_ATTR].s)
 
+    if OpHint.CHILDREN_INPUTS_MAPPINGS in attr:
+      call_def.children_inputs_mappings = _json.loads(
+          _compat.as_text(attr[OpHint.CHILDREN_INPUTS_MAPPINGS].s))
+
     # Add the input or output
     def put_operand(stuff, index, sort, operand, aggregation):
       """Add a given index into the function structure."""
@@ -685,6 +776,98 @@ def _find_all_hints_in_graph_def(graphdef):
   return func_calls
 
 
+def _extract_topology_sequence_mapping(nodes):
+  return dict(
+      (_tensor_name_base(node.name), idx) for idx, node in enumerate(nodes))
+
+
+def _find_children_hints_in_while_loop(function_def, nodes_mapping):
+  """Find children hints and all nodes inside the while loop.
+
+  Args:
+    function_def: Function def of the while loop.
+    nodes_mapping: While loop input_arg : real node name.
+
+  Returns:
+    Ordered children hints and all re-mapped nodes inside the while loop.
+  """
+  new_nodes = []
+
+  # Make nodes inside function def inputs point to the real nodes.
+  for node in function_def.node_def:
+    for i in range(len(node.input)):
+      if node.input[i] in nodes_mapping:
+        node.input[i] = nodes_mapping[node.input[i]]
+    new_nodes.append(_copy.deepcopy(node))
+  name_to_seq_num = _extract_topology_sequence_mapping(function_def.node_def)
+  children_hints = _find_all_hints_in_nodes(new_nodes)
+  children_hints_q = []
+  # Ordered by the outputs.
+  for hint in _six.itervalues(children_hints):
+    _, output_names = hint.flattened_inputs_and_outputs()
+    seq = name_to_seq_num[output_names[0]]
+    for output_name in output_names:
+      seq = min(seq, name_to_seq_num[output_name])
+    children_hints_q.append((seq, hint))
+  children_hints_q.sort(key=lambda tup: tup[0])
+  ordered_children_hints = [x[1] for x in children_hints_q]
+  return ordered_children_hints, new_nodes
+
+
+def _find_children_hints(call, graph_def):
+  """Find all children hints.
+
+  For a given OpHint, we find all children hints inside it, we also copy all the
+  nodes inside function defs (if applicable) to the original graph_def, they are
+  returned in a list as well.
+
+  Args:
+    call: Parent OpHint that contains children ophints.
+    graph_def: Original graph def.
+
+  Returns:
+    Ordered children hints inside the parent ophint; new graph def that contains
+    nodes inside function defs (if applicable); nodes inside function defs.
+  """
+  name_to_input_name, _, _ = _extract_graph_summary(graph_def)
+  input_names, output_names = call.flattened_inputs_and_outputs()
+
+  reachable_by_input = _bfs_for_reachable_nodes(input_names, name_to_input_name)
+  reachable_by_output = _bfs_for_reachable_nodes(output_names,
+                                                 name_to_input_name)
+  output_nodes_set = set(output_names)
+  children_hints = []
+  out = _graph_pb2.GraphDef()
+  out.library.CopyFrom(graph_def.library)
+  out.versions.CopyFrom(graph_def.versions)
+  function_def_nodes = set()
+  for node in graph_def.node:
+    out.node.extend([_copy.deepcopy(node)])
+    n = _tensor_name_base(node.name)
+    if n in reachable_by_output:
+      if n not in reachable_by_input and n not in output_nodes_set:
+        # special handle for while loop function def.
+        if node.op == "While":
+          body_name = node.attr["body"].func.name
+          inputs_outside_loop = node.input
+          for function_def in graph_def.library.function:
+            if function_def.signature.name == body_name:
+              function_inputs = function_def.signature.input_arg
+              assert len(inputs_outside_loop) == len(function_inputs)
+              nodes_mapping = {}
+              for i in range(len(function_inputs)):
+                nodes_mapping[function_inputs[i].name] = inputs_outside_loop[i]
+              # TODO(b/123050804): Consider use grappler.
+              (children_hints_in_loop,
+               new_nodes) = _find_children_hints_in_while_loop(
+                   function_def, nodes_mapping)
+              function_def_nodes.update([x.name for x in new_nodes])
+              children_hints.extend(children_hints_in_loop)
+              out.node.extend(new_nodes)
+
+  return children_hints, out, function_def_nodes
+
+
 def _tensor_name_base(full_tensor_name):
   """Removes the device assignment code from a tensor.
 
@@ -737,12 +920,20 @@ def _check_subgraph_closed(n, reachable_by_input, input_nodes_set,
 
 
 # TODO(aselle): This should be converted to grappler in the future.
-def _convert_single_op_hint_to_stub(call, graph_def):
+def _convert_single_op_hint_to_stub(call,
+                                    graph_def,
+                                    function_def_nodes=None,
+                                    is_last_run=True):
   """Given a graph_def, converts `call` into a stub and returns a new graph_def.
 
   Args:
     call: A single function call to be converted.
     graph_def: A graph_def to use as input (that hass call obviously).
+    function_def_nodes: Nodes inside the function def those are not connected to
+      the graph.
+    is_last_run: Whether it is the last run for a given pass (for OpHint has
+      children).
+
   Returns:
     A new transformed graph-def that has call as a stub (single op).
 
@@ -750,6 +941,8 @@ def _convert_single_op_hint_to_stub(call, graph_def):
       the tensorflow runtime, so all future manipulations are done in graph_def
       level.
   """
+  if function_def_nodes is None:
+    function_def_nodes = set()
   name_to_input_name, name_to_node, name_to_seq_num = _extract_graph_summary(
       graph_def)
   input_names, output_names = call.flattened_inputs_and_outputs()
@@ -757,7 +950,6 @@ def _convert_single_op_hint_to_stub(call, graph_def):
   reachable_by_input = _bfs_for_reachable_nodes(input_names, name_to_input_name)
   reachable_by_output = _bfs_for_reachable_nodes(output_names,
                                                  name_to_input_name)
-  input_nodes_set = set(input_names)
   output_nodes_set = set(output_names)
   nodes_after_fuse = []
   nodes_deleted_by_fuse = set()
@@ -768,19 +960,16 @@ def _convert_single_op_hint_to_stub(call, graph_def):
     n = _tensor_name_base(node.name)
     if n in reachable_by_output:
       if n not in reachable_by_input and n not in output_nodes_set:
-        # n is an internal node. Check to make sure it is really internal.
-        # TODO(aselle): this could be done more efficiently by flooding
-        # the graph first.
-        _check_subgraph_closed(n, reachable_by_input, input_nodes_set,
-                               name_to_input_name)
         nodes_deleted_by_fuse.add(n)
-    elif n not in reachable_by_input:
+    elif n not in reachable_by_input and n not in function_def_nodes:
       # n is a node that after all the fusings, so keep it.
       nodes_after_fuse.append(n)
     else:
-      # n is a node that is randomly in the graph but not connected to
-      # the chain of dependencies.
-      pass
+      # In the last run, n is a node that is randomly in the graph but not
+      # connected to the chain of dependencies, we will delete n, otherwise
+      # we keep them.
+      if not is_last_run:
+        nodes_after_fuse.append(n)
 
   # Make a new graphdef with all the pre-input and input nodes
   out = _graph_pb2.GraphDef()
@@ -802,7 +991,8 @@ def _convert_single_op_hint_to_stub(call, graph_def):
   # non-fused things.
   for input_index in sorted_input_indices:
     inputs = call.inputs[input_index]
-    new_node.input.append(inputs.aggregate_and_return_name_for_input(out))
+    input_name = inputs.aggregate_and_return_name_for_input(out)
+    new_node.input.append(input_name)
   new_node.attr[OpHint.TFLITE_INPUT_INDICES].list.i.extend(sorted_input_indices)
 
   # Ceate the function
@@ -938,6 +1128,18 @@ def _remove_redundant_stack_unstack(graph_def):
   return curr
 
 
+def _get_correct_mapping(original_index, nodes):
+  # Special handle for the index is -1 case.
+  # If it is -1, return the last index.
+  if original_index == -1:
+    node_indices = nodes.keys()
+    node_indices = sorted(node_indices)
+    return node_indices[-1]
+  else:
+    return original_index
+  return original_index
+
+
 @_tf_export("lite.convert_op_hints_to_stubs")
 def _convert_op_hints_to_stubs_helper(
     graph_def, write_callback=lambda sess, graph_def: None):
@@ -950,20 +1152,102 @@ def _convert_op_hints_to_stubs_helper(
   Returns:
     A new stubbed graph_def.
   """
+  hints = _find_all_hints_in_nodes(graph_def.node)
+
+  hints_q = []
+  for hint in _six.itervalues(hints):
+    hints_q.append((hint.level, hint.uuid))
+
+  hints_q.sort(key=lambda tup: tup[0])
+  for i in range(len(hints_q) - 1, -1, -1):
+    level, hint_uuid = hints_q[i]
 
-  hints = _find_all_hints_in_graph_def(graph_def)
   curr_graph_def = graph_def
   del graph_def  # prevent using graph_def again (common source of error)
-  for hint in _six.itervalues(hints):
-    curr_graph_def = _convert_single_op_hint_to_stub(
-        hint, curr_graph_def)
-    write_callback(curr_graph_def, "initial")
+  for i in range(len(hints_q) - 1, -1, -1):
+    level, hint_uuid = hints_q[i]
+    if level >= 2:
+      children_hints, curr_graph_def, function_def_nodes = _find_children_hints(
+          hints[hint_uuid], curr_graph_def)
+      # pylint: disable=superfluous-parens
+      assert (len(children_hints) > 0)  #  pylint: disable=g-explicit-length-test
+      # pylint: enable=superfluous-parens
+
+      # Re-wire the children hints inputs/outputs, so latter child's inputs
+      # connect to previous child node's outputs.
+      children_inputs_mappings = hints[hint_uuid].children_inputs_mappings
+      for j in range(len(children_hints)):
+        child_hint = children_hints[j]
+        if j == 0:
+          for mapping in children_inputs_mappings["parent_first_child_input"]:
+            parent_input_index = _get_correct_mapping(
+                mapping["parent_ophint_input_index"], hints[hint_uuid].inputs)
+            child_input_index = _get_correct_mapping(
+                mapping["first_child_ophint_input_index"], child_hint.inputs)
+            child_hint.inputs[child_input_index] = hints[hint_uuid].inputs[
+                parent_input_index]
+        else:
+          for mapping in children_inputs_mappings[
+              "internal_children_input_output"]:
+            input_index = _get_correct_mapping(mapping["child_input_index"],
+                                               child_hint.inputs)
+            output_index = _get_correct_mapping(mapping["child_output_index"],
+                                                children_hints[j - 1].outputs)
+            child_hint.inputs[input_index] = children_hints[
+                j - 1].outputs[output_index]
+        if j == len(children_hints) - 1:
+          for mapping in children_inputs_mappings["parent_last_child_output"]:
+            parent_output_index = _get_correct_mapping(
+                mapping["parent_output_index"], hints[hint_uuid].outputs)
+            child_output_index = _get_correct_mapping(
+                mapping["child_output_index"], child_hint.outputs)
+            child_hint.outputs[child_output_index] = hints[hint_uuid].outputs[
+                parent_output_index]
+
+      for j in range(len(children_hints)):
+        child_hint = children_hints[j]
+        curr_graph_def = _convert_single_op_hint_to_stub(
+            child_hint, curr_graph_def, function_def_nodes,
+            j == len(children_hints) - 1)
+    else:
+      curr_graph_def = _convert_single_op_hint_to_stub(hints[hint_uuid],
+                                                       curr_graph_def)
+      write_callback(curr_graph_def, "initial")
   # The stubbing process can create stacks/unstacks in the case of LSTMs
   # remove them.
   curr_graph_def = _remove_redundant_stack_unstack(curr_graph_def)
   return curr_graph_def
 
 
+def find_all_hinted_output_nodes(session=None, graph_def=None):
+  """Find all Ophints output nodes in the graph.
+
+  This is used to get all the output nodes those are ophinted, it is important
+  for operation like convert_variables_to_constants keep all ophints structure.
+  Note: only one of session or graph_def should be used, not both.
+
+  Args:
+    session: A TensorFlow session that contains the graph to convert.
+    graph_def: A graph def that we should convert.
+
+  Returns:
+    A list of OpHints output nodes.
+  Raises:
+    ValueError: If both session and graph_def are provided.
+  """
+  if session is not None and graph_def is not None:
+    raise ValueError("Provide only one of session and graph_def.")
+  hinted_outputs_nodes = []
+  if session is not None:
+    hints = _find_all_hints_in_nodes(session.graph_def.node)
+  elif graph_def is not None:
+    hints = _find_all_hints_in_nodes(graph_def.node)
+  for hint in _six.itervalues(hints):
+    _, ouput_nodes = hint.flattened_inputs_and_outputs()
+    hinted_outputs_nodes.extend(ouput_nodes)
+  return hinted_outputs_nodes
+
+
 def convert_op_hints_to_stubs(session=None,
                               graph_def=None,
                               write_callback=lambda graph_def, comments: None):
@@ -996,6 +1280,7 @@ def convert_op_hints_to_stubs(session=None,
 
 
 _allowed_symbols = [
-    "OpHint", "convert_op_hints_to_stubs", "convert_op_hints_to_stubs_new"
+    "OpHint", "convert_op_hints_to_stubs", "convert_op_hints_to_stubs_new",
+    "find_all_hinted_output_nodes"
 ]
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/lite/python/optimize/BUILD b/tensorflow/lite/python/optimize/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..51310af14a46ef66d8e968a11ceef55bd5799b81
--- /dev/null
+++ b/tensorflow/lite/python/optimize/BUILD
@@ -0,0 +1,67 @@
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc")
+
+cc_library(
+    name = "calibration_wrapper_lib",
+    srcs = ["calibration_wrapper.cc"],
+    hdrs = ["calibration_wrapper.h"],
+    deps = [
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "//tensorflow/lite/python/interpreter_wrapper:numpy",
+        "//tensorflow/lite/python/interpreter_wrapper:python_error_reporter",
+        "//tensorflow/lite/python/interpreter_wrapper:python_utils",
+        "//tensorflow/lite/tools/optimize:quantize_model",
+        "//tensorflow/lite/tools/optimize/calibration:calibration_reader",
+        "//tensorflow/lite/tools/optimize/calibration:calibrator_lib",
+        "//third_party/py/numpy:headers",
+        "//third_party/python_runtime:headers",
+        "@com_google_absl//absl/memory",
+    ],
+)
+
+tf_py_wrap_cc(
+    name = "tensorflow_lite_wrap_calibration_wrapper",
+    srcs = [
+        "calibration_wrapper.i",
+    ],
+    deps = [
+        ":calibration_wrapper_lib",
+        "//third_party/python_runtime:headers",
+    ],
+)
+
+py_library(
+    name = "calibrator",
+    srcs = [
+        "calibrator.py",
+    ],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/lite/python/optimize:tensorflow_lite_wrap_calibration_wrapper",
+        "//tensorflow/python:util",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "calibrator_test",
+    srcs = ["calibrator_test.py"],
+    data = [":test_data"],
+    srcs_version = "PY2AND3",
+    tags = ["no_oss"],
+    deps = [
+        ":calibrator",
+        "//tensorflow/lite/python/optimize:tensorflow_lite_wrap_calibration_wrapper",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform",
+        "//third_party/py/numpy",
+    ],
+)
diff --git a/tensorflow/lite/python/optimize/calibration_wrapper.cc b/tensorflow/lite/python/optimize/calibration_wrapper.cc
new file mode 100644
index 0000000000000000000000000000000000000000..12bcd6a6283ccc71f4df7758b46aec298a87bb7d
--- /dev/null
+++ b/tensorflow/lite/python/optimize/calibration_wrapper.cc
@@ -0,0 +1,212 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/python/optimize/calibration_wrapper.h"
+
+#include <memory>
+#include <sstream>
+#include <string>
+
+#include "absl/memory/memory.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/python/interpreter_wrapper/numpy.h"
+#include "tensorflow/lite/python/interpreter_wrapper/python_error_reporter.h"
+#include "tensorflow/lite/python/interpreter_wrapper/python_utils.h"
+#include "tensorflow/lite/tools/optimize/calibration/calibration_reader.h"
+#include "tensorflow/lite/tools/optimize/calibration/calibrator.h"
+#include "tensorflow/lite/tools/optimize/quantize_model.h"
+
+#define TFLITE_PY_CHECK(x)               \
+  if ((x) != kTfLiteOk) {                \
+    return error_reporter_->exception(); \
+  }
+
+#define TFLITE_PY_ENSURE_VALID_INTERPRETER()                               \
+  if (!interpreter_) {                                                     \
+    PyErr_SetString(PyExc_ValueError, "Interpreter was not initialized."); \
+    return nullptr;                                                        \
+  }
+
+namespace tflite {
+namespace calibration_wrapper {
+
+namespace {
+
+using python_utils::PyDecrefDeleter;
+
+std::unique_ptr<tflite::ModelT> CreateMutableModel(const tflite::Model& model) {
+  std::unique_ptr<tflite::ModelT> copied_model =
+      absl::make_unique<tflite::ModelT>();
+  model.UnPackTo(copied_model.get(), nullptr);
+  return copied_model;
+}
+
+}  // namespace
+
+CalibrationWrapper::CalibrationWrapper(
+    std::unique_ptr<tflite::Interpreter> interpreter,
+    std::unique_ptr<tflite::ops::builtin::BuiltinOpResolver> resolver,
+    std::unique_ptr<tflite::interpreter_wrapper::PythonErrorReporter>
+        error_reporter,
+    std::unique_ptr<tflite::FlatBufferModel> model,
+    std::unique_ptr<tflite::optimize::calibration::CalibrationReader> reader)
+    : interpreter_(std::move(interpreter)),
+      error_reporter_(std::move(error_reporter)),
+      resolver_(std::move(resolver)),
+      model_(std::move(model)),
+      reader_(std::move(reader)) {}
+
+CalibrationWrapper::~CalibrationWrapper() {}
+
+PyObject* CalibrationWrapper::Prepare() {
+  TFLITE_PY_ENSURE_VALID_INTERPRETER();
+  TFLITE_PY_CHECK(interpreter_->AllocateTensors());
+  TFLITE_PY_CHECK(interpreter_->ResetVariableTensors());
+  Py_RETURN_NONE;
+}
+
+PyObject* CalibrationWrapper::FeedTensor(PyObject* input_value) {
+  TFLITE_PY_ENSURE_VALID_INTERPRETER();
+  if (!PyList_Check(input_value)) {
+    PyErr_Format(PyExc_ValueError,
+                 "Invalid input type: expected input to be a list.");
+    return nullptr;
+  }
+
+  const size_t inputs_size = PyList_Size(input_value);
+
+  if (inputs_size != interpreter_->inputs().size()) {
+    PyErr_Format(PyExc_ValueError,
+                 "Invalid input size: expected %ld items got %ld items.",
+                 interpreter_->inputs().size(), inputs_size);
+    return nullptr;
+  }
+
+  for (size_t i = 0; i < inputs_size; i++) {
+    PyObject* input = PyList_GetItem(input_value, i);
+    if (!input) {
+      return nullptr;
+    }
+    int input_tensor_idx = interpreter_->inputs()[i];
+    if (!SetTensor(input_tensor_idx, input)) {
+      return nullptr;
+    }
+  }
+
+  TFLITE_PY_CHECK(interpreter_->Invoke());
+  Py_RETURN_NONE;
+}
+
+PyObject* CalibrationWrapper::SetTensor(int index, PyObject* value) {
+  TFLITE_PY_ENSURE_VALID_INTERPRETER();
+
+  std::unique_ptr<PyObject, PyDecrefDeleter> array_safe(
+      PyArray_FromAny(value, nullptr, 0, 0, NPY_ARRAY_CARRAY, nullptr));
+  if (!array_safe) {
+    PyErr_SetString(PyExc_ValueError,
+                    "Failed to convert value into readable tensor.");
+    return nullptr;
+  }
+
+  PyArrayObject* array = reinterpret_cast<PyArrayObject*>(array_safe.get());
+  const TfLiteTensor* tensor = interpreter_->tensor(index);
+
+  if (python_utils::TfLiteTypeFromPyArray(array) != tensor->type) {
+    PyErr_Format(PyExc_ValueError,
+                 "Cannot set tensor:"
+                 " Got tensor of type %d"
+                 " but expected type %d for input %d, name: %s ",
+                 python_utils::TfLiteTypeFromPyArray(array), tensor->type,
+                 index, tensor->name);
+    return nullptr;
+  }
+
+  if (PyArray_NDIM(array) != tensor->dims->size) {
+    PyErr_SetString(PyExc_ValueError, "Cannot set tensor: Dimension mismatch");
+    return nullptr;
+  }
+
+  for (int j = 0; j < PyArray_NDIM(array); j++) {
+    if (tensor->dims->data[j] != PyArray_SHAPE(array)[j]) {
+      PyErr_SetString(PyExc_ValueError,
+                      "Cannot set tensor: Dimension mismatch");
+      return nullptr;
+    }
+  }
+
+  size_t size = PyArray_NBYTES(array);
+  if (size != tensor->bytes) {
+    PyErr_Format(PyExc_ValueError,
+                 "numpy array had %zu bytes but expected %zu bytes.", size,
+                 tensor->bytes);
+    return nullptr;
+  }
+  memcpy(tensor->data.raw, PyArray_DATA(array), size);
+  Py_RETURN_NONE;
+}
+
+PyObject* CalibrationWrapper::QuantizeModel() {
+  auto tflite_model = CreateMutableModel(*model_->GetModel());
+  reader_->AddCalibrationToModel(tflite_model.get());
+  flatbuffers::FlatBufferBuilder builder;
+  auto status = tflite::optimize::QuantizeModel(&builder, tflite_model.get(),
+                                                error_reporter_.get());
+  if (status != kTfLiteOk) {
+    error_reporter_->exception();
+    return nullptr;
+  }
+
+  return python_utils::ConvertToPyString(
+      reinterpret_cast<const char*>(builder.GetCurrentBufferPointer()),
+      builder.GetSize());
+}
+
+/*static*/ CalibrationWrapper* CalibrationWrapper::CreateWrapperCPPFromBuffer(
+    PyObject* data) {
+  using tflite::interpreter_wrapper::PythonErrorReporter;
+  char* buf = nullptr;
+  Py_ssize_t length;
+  std::unique_ptr<PythonErrorReporter> error_reporter(new PythonErrorReporter);
+  ::tflite::python::ImportNumpy();
+
+  if (python_utils::ConvertFromPyString(data, &buf, &length) == -1) {
+    return nullptr;
+  }
+  std::unique_ptr<tflite::FlatBufferModel> model =
+      tflite::FlatBufferModel::BuildFromBuffer(buf, length,
+                                               error_reporter.get());
+  if (!model) {
+    PyErr_Format(PyExc_ValueError, "Invalid model");
+    return nullptr;
+  }
+  auto resolver = absl::make_unique<tflite::ops::builtin::BuiltinOpResolver>();
+  std::unique_ptr<tflite::Interpreter> interpreter;
+  std::unique_ptr<tflite::optimize::calibration::CalibrationReader> reader;
+  auto status = tflite::optimize::calibration::BuildLoggingInterpreter(
+      *model, *resolver, &interpreter, &reader);
+  if (status != kTfLiteOk) {
+    error_reporter->exception();
+    return nullptr;
+  }
+
+  auto wrapper = new CalibrationWrapper(
+      std::move(interpreter), std::move(resolver), std::move(error_reporter),
+      std::move(model), std::move(reader));
+  return wrapper;
+}
+
+}  // namespace calibration_wrapper
+}  // namespace tflite
diff --git a/tensorflow/lite/python/optimize/calibration_wrapper.h b/tensorflow/lite/python/optimize/calibration_wrapper.h
new file mode 100644
index 0000000000000000000000000000000000000000..213bc4a182d348e5a19b5c2624cca375d367aba7
--- /dev/null
+++ b/tensorflow/lite/python/optimize/calibration_wrapper.h
@@ -0,0 +1,90 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_PYTHON_OPTIMIZE_CALIBRATION_WRAPPER_H_
+#define TENSORFLOW_LITE_PYTHON_OPTIMIZE_CALIBRATION_WRAPPER_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+
+// Place `<locale>` before <Python.h> to avoid build failures in macOS.
+#include <locale>
+
+// The empty line above is on purpose as otherwise clang-format will
+// automatically move <Python.h> before <locale>.
+#include <Python.h>
+
+// We forward declare TFLite classes here to avoid exposing them to SWIG.
+namespace tflite {
+namespace ops {
+namespace builtin {
+class BuiltinOpResolver;
+}  // namespace builtin
+}  // namespace ops
+
+class FlatBufferModel;
+class Interpreter;
+
+namespace interpreter_wrapper {
+class PythonErrorReporter;
+}  // namespace interpreter_wrapper
+
+namespace optimize {
+namespace calibration {
+class CalibrationReader;
+}  // namespace calibration
+}  // namespace optimize
+
+namespace calibration_wrapper {
+
+class CalibrationWrapper {
+ public:
+  // SWIG caller takes ownership of pointer.
+  static CalibrationWrapper* CreateWrapperCPPFromBuffer(PyObject* data);
+  ~CalibrationWrapper();
+
+  PyObject* Prepare();
+
+  PyObject* FeedTensor(PyObject* input_value);
+
+  PyObject* QuantizeModel();
+
+ private:
+  // CalibrationWrapper is not copyable or assignable. We avoid the use of
+  // CalibrationWrapper() = delete here for SWIG compatibility.
+  CalibrationWrapper(
+      std::unique_ptr<tflite::Interpreter> interpreter,
+      std::unique_ptr<tflite::ops::builtin::BuiltinOpResolver> resolver,
+      std::unique_ptr<tflite::interpreter_wrapper::PythonErrorReporter>
+          error_reporter,
+      std::unique_ptr<tflite::FlatBufferModel> model,
+      std::unique_ptr<tflite::optimize::calibration::CalibrationReader> reader);
+
+  CalibrationWrapper(const CalibrationWrapper& rhs);
+
+  PyObject* SetTensor(int index, PyObject* value);
+
+  std::unique_ptr<tflite::Interpreter> interpreter_;
+  std::unique_ptr<tflite::interpreter_wrapper::PythonErrorReporter>
+      error_reporter_;
+  std::unique_ptr<tflite::ops::builtin::BuiltinOpResolver> resolver_;
+  std::unique_ptr<tflite::FlatBufferModel> model_;
+  std::unique_ptr<tflite::optimize::calibration::CalibrationReader> reader_;
+};
+
+}  // namespace calibration_wrapper
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_PYTHON_OPTIMIZE_CALIBRATION_WRAPPER_H_
diff --git a/tensorflow/lite/python/optimize/calibration_wrapper.i b/tensorflow/lite/python/optimize/calibration_wrapper.i
new file mode 100644
index 0000000000000000000000000000000000000000..094ac20733abc3797d5d325b838215c2909045ba
--- /dev/null
+++ b/tensorflow/lite/python/optimize/calibration_wrapper.i
@@ -0,0 +1,27 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+%include "std_string.i"
+
+
+%{
+#define SWIG_FILE_WITH_INIT
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/python/optimize/calibration_wrapper.h"
+%}
+
+
+%include "tensorflow/lite/python/optimize/calibration_wrapper.h"
\ No newline at end of file
diff --git a/tensorflow/lite/python/optimize/calibrator.py b/tensorflow/lite/python/optimize/calibrator.py
new file mode 100644
index 0000000000000000000000000000000000000000..18aabba96e7c7224405d4c345083331d795fbd4c
--- /dev/null
+++ b/tensorflow/lite/python/optimize/calibrator.py
@@ -0,0 +1,68 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Python wrapper for post training quantization with calibration."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.util.lazy_loader import LazyLoader
+
+# Lazy load since some of the performance benchmark skylark rules
+# break dependencies. Must use double quotes to match code internal rewrite
+# rule.
+_calibration_wrapper = LazyLoader(
+    "_calibration_wrapper", globals(),
+    "tensorflow.lite.python.optimize."
+    "tensorflow_lite_wrap_calibration_wrapper")
+
+
+class Calibrator(object):
+  """Calibrates a floating point model and then quantizes it.
+
+  This is an internal class, not a public interface.
+  """
+
+  def __init__(self, model_content):
+    """Constructor.
+
+    Args:
+      model_content: Content of a TF-Lite Flatbuffer file.
+
+    Raises:
+      ValueError: If the calibrator was unable to open the model.
+    """
+    if not model_content:
+      raise ValueError("`model_content` must be specified.")
+    try:
+      self._calibrator = (_calibration_wrapper.CalibrationWrapper
+                          .CreateWrapperCPPFromBuffer(model_content))
+    except Exception as e:
+      raise ValueError("Failed to parse the model: %s." % e)
+    if not self._calibrator:
+      raise ValueError("Failed to parse the model.")
+
+  def calibrate_and_quantize(self, dataset_gen):
+    """Calibrates the model with specified generator and then quantizes it.
+
+    Returns:
+      A quantized model.
+
+    Args:
+      dataset_gen: A generator that generates calibration samples.
+    """
+    self._calibrator.Prepare()
+    for calibration_sample in dataset_gen():
+      self._calibrator.FeedTensor([calibration_sample])
+    return self._calibrator.QuantizeModel()
diff --git a/tensorflow/lite/python/optimize/calibrator_test.py b/tensorflow/lite/python/optimize/calibrator_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..230b46b3dfe924cc9834eccfc3e90be661baae5c
--- /dev/null
+++ b/tensorflow/lite/python/optimize/calibrator_test.py
@@ -0,0 +1,93 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Calibrator."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import numpy as np
+
+from tensorflow.lite.python.optimize import calibrator as _calibrator
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import resource_loader
+from tensorflow.python.platform import test
+
+
+class CalibratorTest(test_util.TensorFlowTestCase):
+
+  def test_calibration_with_quantization(self):
+    model_path = resource_loader.get_path_to_datafile(
+        'test_data/mobilenet_like_model.bin')
+    float_model = open(model_path, 'rb').read()
+    quantizer = _calibrator.Calibrator(float_model)
+
+    # Input generator for the model.
+    def input_gen():
+      for _ in range(10):
+        yield np.ones(shape=(1, 5, 5, 3), dtype=np.float32)
+
+    quantized_model = quantizer.calibrate_and_quantize(input_gen)
+    self.assertIsNotNone(quantized_model)
+
+  def test_invalid_model_buffer(self):
+    float_model = b'\0' * 100
+    with self.assertRaisesWithRegexpMatch(ValueError,
+                                          'Failed to parse the model'):
+      _calibrator.Calibrator(float_model)
+
+  def test_empty_calibrator_gen(self):
+    model_path = resource_loader.get_path_to_datafile(
+        'test_data/mobilenet_like_model.bin')
+    float_model = open(model_path, 'rb').read()
+    quantizer = _calibrator.Calibrator(float_model)
+
+    def empty_input_gen():
+      for i in ():
+        yield i
+
+    with self.assertRaises(RuntimeError):
+      quantizer.calibrate_and_quantize(empty_input_gen)
+
+  def test_invalid_shape_calibrator_gen(self):
+    model_path = resource_loader.get_path_to_datafile(
+        'test_data/mobilenet_like_model.bin')
+    float_model = open(model_path, 'rb').read()
+    quantizer = _calibrator.Calibrator(float_model)
+
+    # Input generator with incorrect shape.
+    def input_gen():
+      for _ in range(10):
+        yield np.ones(shape=(1, 2, 2, 3), dtype=np.float32)
+
+    with self.assertRaisesWithRegexpMatch(ValueError, 'Dimension mismatch'):
+      quantizer.calibrate_and_quantize(input_gen)
+
+  def test_invalid_type_calibrator_gen(self):
+    model_path = resource_loader.get_path_to_datafile(
+        'test_data/mobilenet_like_model.bin')
+    float_model = open(model_path, 'rb').read()
+    quantizer = _calibrator.Calibrator(float_model)
+
+    # Input generator with incorrect shape.
+    def input_gen():
+      for _ in range(10):
+        yield np.ones(shape=(1, 5, 5, 3), dtype=np.int32)
+
+    with self.assertRaises(ValueError):
+      quantizer.calibrate_and_quantize(input_gen)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/lite/python/optimize/test_data/mobilenet_like_model.bin b/tensorflow/lite/python/optimize/test_data/mobilenet_like_model.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a2909249ffd2675fad9c0cd60a6ff75f940b3fb0
Binary files /dev/null and b/tensorflow/lite/python/optimize/test_data/mobilenet_like_model.bin differ
diff --git a/tensorflow/lite/python/testdata/BUILD b/tensorflow/lite/python/testdata/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..2fa08e5326990ecda1857fec8eb9caadac1f4102
--- /dev/null
+++ b/tensorflow/lite/python/testdata/BUILD
@@ -0,0 +1,53 @@
+licenses(["notice"])  # Apache 2.0
+
+package(default_visibility = ["//tensorflow:internal"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow/lite:build_def.bzl", "tf_to_tflite")
+
+exports_files(glob(["*.pb"]))
+
+tf_to_tflite(
+    name = "permute_float",
+    src = "permute.pbtxt",
+    out = "permute_float.tflite",
+    options = [
+        "--input_arrays=input",
+        "--output_arrays=output",
+    ],
+)
+
+tf_to_tflite(
+    name = "permute_uint8",
+    src = "permute.pbtxt",
+    out = "permute_uint8.tflite",
+    options = [
+        "--input_arrays=input",
+        "--output_arrays=output",
+        "--inference_type=QUANTIZED_UINT8",
+        "--std_values=1",
+        "--mean_values=0",
+        "--default_ranges_min=0",
+        "--default_ranges_max=255",
+    ],
+)
+
+tf_to_tflite(
+    name = "gather_string",
+    src = "gather.pbtxt",
+    out = "gather_string.tflite",
+    options = [
+        "--input_arrays=input,indices",
+        "--output_arrays=output",
+    ],
+)
+
+filegroup(
+    name = "interpreter_test_data",
+    srcs = [
+        ":gather_string",
+        ":permute_float",
+        ":permute_uint8",
+    ],
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/lite/python/testdata/gather.pbtxt b/tensorflow/lite/python/testdata/gather.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0b1193c475d3b4b663accf036753bfbe9d8adb7d
--- /dev/null
+++ b/tensorflow/lite/python/testdata/gather.pbtxt
@@ -0,0 +1,93 @@
+node {
+  name: "input"
+  op: "Placeholder"
+  device: "/device:CPU:0"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 10
+        }
+      }
+    }
+  }
+}
+node {
+  name: "indices"
+  op: "Placeholder"
+  device: "/device:CPU:0"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 3
+        }
+      }
+    }
+  }
+}
+node {
+  name: "axis"
+  op: "Const"
+  device: "/device:CPU:0"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_INT32
+        tensor_shape {
+        }
+        int_val: 0
+      }
+    }
+  }
+}
+node {
+  name: "output"
+  op: "GatherV2"
+  input: "input"
+  input: "indices"
+  input: "axis"
+  device: "/device:CPU:0"
+  attr {
+    key: "Taxis"
+    value {
+      type: DT_INT32
+    }
+  }
+  attr {
+    key: "Tindices"
+    value {
+      type: DT_INT64
+    }
+  }
+  attr {
+    key: "Tparams"
+    value {
+      type: DT_STRING
+    }
+  }
+}
+versions {
+  producer: 27
+}
diff --git a/tensorflow/lite/python/testdata/permute.pbtxt b/tensorflow/lite/python/testdata/permute.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..482b5c64828f4f5ef2057b4552a81425485d0841
--- /dev/null
+++ b/tensorflow/lite/python/testdata/permute.pbtxt
@@ -0,0 +1,98 @@
+node {
+  name: "input"
+  op: "Placeholder"
+  device: "/device:CPU:0"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        dim {
+          size: 1
+        }
+        dim {
+          size: 4
+        }
+      }
+    }
+  }
+}
+node {
+  name: "Const"
+  op: "Const"
+  device: "/device:CPU:0"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "value"
+    value {
+      tensor {
+        dtype: DT_FLOAT
+        tensor_shape {
+          dim {
+            size: 4
+          }
+          dim {
+            size: 4
+          }
+        }
+        float_val: 0.0
+        float_val: 0.0
+        float_val: 0.0
+        float_val: 1.0
+
+        float_val: 0.0
+        float_val: 0.0
+        float_val: 1.0
+        float_val: 0.0
+
+        float_val: 0.0
+        float_val: 1.0
+        float_val: 0.0
+        float_val: 0.0
+
+        float_val: 1.0
+        float_val: 0.0
+        float_val: 0.0
+        float_val: 0.0
+      }
+    }
+  }
+}
+node {
+  name: "output"
+  op: "MatMul"
+  input: "input"
+  input: "Const"
+  device: "/device:CPU:0"
+  attr {
+    key: "T"
+    value {
+      type: DT_FLOAT
+    }
+  }
+  attr {
+    key: "transpose_a"
+    value {
+      b: false
+    }
+  }
+  attr {
+    key: "transpose_b"
+    value {
+      b: false
+    }
+  }
+}
+versions {
+  producer: 27
+}
diff --git a/tensorflow/lite/python/tflite_convert.py b/tensorflow/lite/python/tflite_convert.py
index 341b539bead296ca28c1f5f8c17928e553ebabc4..401a592273c9c76f1f371bb8972f7f9a3d494278 100644
--- a/tensorflow/lite/python/tflite_convert.py
+++ b/tensorflow/lite/python/tflite_convert.py
@@ -343,13 +343,13 @@ def run_main(_):
             "floats. Used for quantized input tensors. (default None)"))
   parser.add_argument(
       "--default_ranges_min",
-      type=int,
+      type=float,
       help=("Default value for min bound of min/max range values used for all "
             "arrays without a specified range, Intended for experimenting with "
             "quantization via \"dummy quantization\". (default None)"))
   parser.add_argument(
       "--default_ranges_max",
-      type=int,
+      type=float,
       help=("Default value for max bound of min/max range values used for all "
             "arrays without a specified range, Intended for experimenting with "
             "quantization via \"dummy quantization\". (default None)"))
diff --git a/tensorflow/lite/schema/BUILD b/tensorflow/lite/schema/BUILD
index 69d5458c6e432a2370a2ca4998a5d4664398c528..e55419186e16f62f27f9df0201e814cb8936fc27 100644
--- a/tensorflow/lite/schema/BUILD
+++ b/tensorflow/lite/schema/BUILD
@@ -9,6 +9,12 @@ load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
 
 py_binary(
     name = "upgrade_schema",
+    srcs = ["upgrade_schema.py"],
+    deps = [":upgrade_schema_main_lib"],
+)
+
+py_library(
+    name = "upgrade_schema_main_lib",
     srcs = [
         "upgrade_schema.py",
     ],
@@ -39,7 +45,7 @@ py_test(
         "notap",
     ],
     deps = [
-        ":upgrade_schema",
+        ":upgrade_schema_main_lib",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_test_lib",
     ],
@@ -70,7 +76,6 @@ flatbuffer_cc_library(
         "--no-union-value-namespacing",
         "--gen-object-api",
     ],
-    gen_reflections = True,
     out_prefix = "reflection/",
 )
 
diff --git a/tensorflow/lite/schema/schema.fbs b/tensorflow/lite/schema/schema.fbs
index 980f13b19b4f6a32fe8b693c560be2b4f4f95fd9..69a0a0acebdc2fa75eed688751085dccc0713479 100644
--- a/tensorflow/lite/schema/schema.fbs
+++ b/tensorflow/lite/schema/schema.fbs
@@ -64,9 +64,20 @@ table QuantizationParameters {
   scale:[float];  // For dequantizing the tensor's values.
   zero_point:[long];
 
-  // If this is not none, the quantization parameters above are ignored and the
-  // value of the QuantizationDetails union below should be used.
+  // If this is not none, the other quantization parameters (i.e. min, max,
+  // scale, zero_point fields above) are ignored and the value of the
+  // QuantizationDetails union should be used.
   details:QuantizationDetails;
+
+  // Specifies the dimension of the Tensor's shape that the scales and
+  // zero_points correspond to. For example, a tensor t, with dims=[4, 3, 2, 1]
+  // with quantization params:
+  //   scale=[1.0, 2.0, 3.0], zero_point=[1, 2, 3], quantization_dimension=1
+  // will be quantized across the second dimension of t.
+  //   t[:, 0, :, :] will have scale[0]=1.0, zero_point[0]=1
+  //   t[:, 1, :, :] will have scale[1]=2.0, zero_point[0]=2
+  //   t[:, 2, :, :] will have scale[2]=3.0, zero_point[0]=3
+  quantized_dimension:int;
 }
 
 table Tensor {
@@ -205,6 +216,15 @@ enum BuiltinOperator : byte {
   MIRROR_PAD = 100,
   ABS = 101,
   SPLIT_V = 102,
+  UNIQUE = 103,
+  CEIL = 104,
+  REVERSE_V2 = 105,
+  ADD_N = 106,
+  GATHER_ND = 107,
+  COS = 108,
+  WHERE = 109,
+  RANK = 110,
+  ELU = 111,
 }
 
 // Options for the builtin operators.
@@ -288,6 +308,13 @@ union BuiltinOptions {
   MirrorPadOptions,
   AbsOptions,
   SplitVOptions,
+  UniqueOptions,
+  ReverseV2Options,
+  AddNOptions,
+  GatherNdOptions,
+  CosOptions,
+  WhereOptions,
+  RankOptions,
 }
 
 enum Padding : byte { SAME, VALID }
@@ -443,12 +470,19 @@ table UnidirectionalSequenceLSTMOptions {
 }
 
 table BidirectionalSequenceLSTMOptions {
+  // Parameters supported by version 1:
   fused_activation_function:ActivationFunctionType;
   cell_clip: float; // Optional, 0.0 means no clipping
   proj_clip: float; // Optional, 0.0 means no clipping
 
   // If true, store the outputs of both directions into the first output.
   merge_outputs: bool;
+
+  // Parameters supported by version 2:
+  // If true then first dimension is sequence, otherwise batch.
+  // Version 1 implementations assumed time_major to be true, so this default
+  // value should never change.
+  time_major: bool = true;
 }
 
 table ResizeBilinearOptions {
@@ -524,6 +558,9 @@ table TransposeOptions {
 table ExpOptions {
 }
 
+table CosOptions {
+}
+
 table ReducerOptions {
   keep_dims: bool;
 }
@@ -618,6 +655,9 @@ table ShapeOptions {
   out_type : TensorType;
 }
 
+table RankOptions {
+}
+
 table PowOptions {
 }
 
@@ -694,6 +734,22 @@ table MirrorPadOptions {
   mode:MirrorPadMode;
 }
 
+table UniqueOptions {
+  idx_out_type:TensorType = INT32;
+}
+
+table ReverseV2Options {
+}
+
+table AddNOptions {
+}
+
+table GatherNdOptions {
+}
+
+table WhereOptions {
+}
+
 // An OperatorCode can be an enum value (BuiltinOperator) if the operator is a
 // builtin, or a string if the operator is custom.
 table OperatorCode {
diff --git a/tensorflow/lite/schema/schema_generated.h b/tensorflow/lite/schema/schema_generated.h
index 637cbafabdad47892b1e3f4a93837b44d50a5b46..ad9ae192db4e5005631c57621d50664dcac132ed 100755
--- a/tensorflow/lite/schema/schema_generated.h
+++ b/tensorflow/lite/schema/schema_generated.h
@@ -139,6 +139,9 @@ struct TransposeOptionsT;
 struct ExpOptions;
 struct ExpOptionsT;
 
+struct CosOptions;
+struct CosOptionsT;
+
 struct ReducerOptions;
 struct ReducerOptionsT;
 
@@ -214,6 +217,9 @@ struct NotEqualOptionsT;
 struct ShapeOptions;
 struct ShapeOptionsT;
 
+struct RankOptions;
+struct RankOptionsT;
+
 struct PowOptions;
 struct PowOptionsT;
 
@@ -268,6 +274,21 @@ struct SquaredDifferenceOptionsT;
 struct MirrorPadOptions;
 struct MirrorPadOptionsT;
 
+struct UniqueOptions;
+struct UniqueOptionsT;
+
+struct ReverseV2Options;
+struct ReverseV2OptionsT;
+
+struct AddNOptions;
+struct AddNOptionsT;
+
+struct GatherNdOptions;
+struct GatherNdOptionsT;
+
+struct WhereOptions;
+struct WhereOptionsT;
+
 struct OperatorCode;
 struct OperatorCodeT;
 
@@ -520,11 +541,20 @@ enum BuiltinOperator {
   BuiltinOperator_MIRROR_PAD = 100,
   BuiltinOperator_ABS = 101,
   BuiltinOperator_SPLIT_V = 102,
+  BuiltinOperator_UNIQUE = 103,
+  BuiltinOperator_CEIL = 104,
+  BuiltinOperator_REVERSE_V2 = 105,
+  BuiltinOperator_ADD_N = 106,
+  BuiltinOperator_GATHER_ND = 107,
+  BuiltinOperator_COS = 108,
+  BuiltinOperator_WHERE = 109,
+  BuiltinOperator_RANK = 110,
+  BuiltinOperator_ELU = 111,
   BuiltinOperator_MIN = BuiltinOperator_ADD,
-  BuiltinOperator_MAX = BuiltinOperator_SPLIT_V
+  BuiltinOperator_MAX = BuiltinOperator_ELU
 };
 
-inline const BuiltinOperator (&EnumValuesBuiltinOperator())[102] {
+inline const BuiltinOperator (&EnumValuesBuiltinOperator())[111] {
   static const BuiltinOperator values[] = {
     BuiltinOperator_ADD,
     BuiltinOperator_AVERAGE_POOL_2D,
@@ -627,7 +657,16 @@ inline const BuiltinOperator (&EnumValuesBuiltinOperator())[102] {
     BuiltinOperator_SQUARED_DIFFERENCE,
     BuiltinOperator_MIRROR_PAD,
     BuiltinOperator_ABS,
-    BuiltinOperator_SPLIT_V
+    BuiltinOperator_SPLIT_V,
+    BuiltinOperator_UNIQUE,
+    BuiltinOperator_CEIL,
+    BuiltinOperator_REVERSE_V2,
+    BuiltinOperator_ADD_N,
+    BuiltinOperator_GATHER_ND,
+    BuiltinOperator_COS,
+    BuiltinOperator_WHERE,
+    BuiltinOperator_RANK,
+    BuiltinOperator_ELU
   };
   return values;
 }
@@ -737,6 +776,15 @@ inline const char * const *EnumNamesBuiltinOperator() {
     "MIRROR_PAD",
     "ABS",
     "SPLIT_V",
+    "UNIQUE",
+    "CEIL",
+    "REVERSE_V2",
+    "ADD_N",
+    "GATHER_ND",
+    "COS",
+    "WHERE",
+    "RANK",
+    "ELU",
     nullptr
   };
   return names;
@@ -828,11 +876,18 @@ enum BuiltinOptions {
   BuiltinOptions_MirrorPadOptions = 77,
   BuiltinOptions_AbsOptions = 78,
   BuiltinOptions_SplitVOptions = 79,
+  BuiltinOptions_UniqueOptions = 80,
+  BuiltinOptions_ReverseV2Options = 81,
+  BuiltinOptions_AddNOptions = 82,
+  BuiltinOptions_GatherNdOptions = 83,
+  BuiltinOptions_CosOptions = 84,
+  BuiltinOptions_WhereOptions = 85,
+  BuiltinOptions_RankOptions = 86,
   BuiltinOptions_MIN = BuiltinOptions_NONE,
-  BuiltinOptions_MAX = BuiltinOptions_SplitVOptions
+  BuiltinOptions_MAX = BuiltinOptions_RankOptions
 };
 
-inline const BuiltinOptions (&EnumValuesBuiltinOptions())[80] {
+inline const BuiltinOptions (&EnumValuesBuiltinOptions())[87] {
   static const BuiltinOptions values[] = {
     BuiltinOptions_NONE,
     BuiltinOptions_Conv2DOptions,
@@ -913,7 +968,14 @@ inline const BuiltinOptions (&EnumValuesBuiltinOptions())[80] {
     BuiltinOptions_SquaredDifferenceOptions,
     BuiltinOptions_MirrorPadOptions,
     BuiltinOptions_AbsOptions,
-    BuiltinOptions_SplitVOptions
+    BuiltinOptions_SplitVOptions,
+    BuiltinOptions_UniqueOptions,
+    BuiltinOptions_ReverseV2Options,
+    BuiltinOptions_AddNOptions,
+    BuiltinOptions_GatherNdOptions,
+    BuiltinOptions_CosOptions,
+    BuiltinOptions_WhereOptions,
+    BuiltinOptions_RankOptions
   };
   return values;
 }
@@ -1000,6 +1062,13 @@ inline const char * const *EnumNamesBuiltinOptions() {
     "MirrorPadOptions",
     "AbsOptions",
     "SplitVOptions",
+    "UniqueOptions",
+    "ReverseV2Options",
+    "AddNOptions",
+    "GatherNdOptions",
+    "CosOptions",
+    "WhereOptions",
+    "RankOptions",
     nullptr
   };
   return names;
@@ -1330,6 +1399,34 @@ template<> struct BuiltinOptionsTraits<SplitVOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_SplitVOptions;
 };
 
+template<> struct BuiltinOptionsTraits<UniqueOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_UniqueOptions;
+};
+
+template<> struct BuiltinOptionsTraits<ReverseV2Options> {
+  static const BuiltinOptions enum_value = BuiltinOptions_ReverseV2Options;
+};
+
+template<> struct BuiltinOptionsTraits<AddNOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_AddNOptions;
+};
+
+template<> struct BuiltinOptionsTraits<GatherNdOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_GatherNdOptions;
+};
+
+template<> struct BuiltinOptionsTraits<CosOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_CosOptions;
+};
+
+template<> struct BuiltinOptionsTraits<WhereOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_WhereOptions;
+};
+
+template<> struct BuiltinOptionsTraits<RankOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_RankOptions;
+};
+
 struct BuiltinOptionsUnion {
   BuiltinOptions type;
   void *value;
@@ -1993,6 +2090,62 @@ struct BuiltinOptionsUnion {
     return type == BuiltinOptions_SplitVOptions ?
       reinterpret_cast<const SplitVOptionsT *>(value) : nullptr;
   }
+  UniqueOptionsT *AsUniqueOptions() {
+    return type == BuiltinOptions_UniqueOptions ?
+      reinterpret_cast<UniqueOptionsT *>(value) : nullptr;
+  }
+  const UniqueOptionsT *AsUniqueOptions() const {
+    return type == BuiltinOptions_UniqueOptions ?
+      reinterpret_cast<const UniqueOptionsT *>(value) : nullptr;
+  }
+  ReverseV2OptionsT *AsReverseV2Options() {
+    return type == BuiltinOptions_ReverseV2Options ?
+      reinterpret_cast<ReverseV2OptionsT *>(value) : nullptr;
+  }
+  const ReverseV2OptionsT *AsReverseV2Options() const {
+    return type == BuiltinOptions_ReverseV2Options ?
+      reinterpret_cast<const ReverseV2OptionsT *>(value) : nullptr;
+  }
+  AddNOptionsT *AsAddNOptions() {
+    return type == BuiltinOptions_AddNOptions ?
+      reinterpret_cast<AddNOptionsT *>(value) : nullptr;
+  }
+  const AddNOptionsT *AsAddNOptions() const {
+    return type == BuiltinOptions_AddNOptions ?
+      reinterpret_cast<const AddNOptionsT *>(value) : nullptr;
+  }
+  GatherNdOptionsT *AsGatherNdOptions() {
+    return type == BuiltinOptions_GatherNdOptions ?
+      reinterpret_cast<GatherNdOptionsT *>(value) : nullptr;
+  }
+  const GatherNdOptionsT *AsGatherNdOptions() const {
+    return type == BuiltinOptions_GatherNdOptions ?
+      reinterpret_cast<const GatherNdOptionsT *>(value) : nullptr;
+  }
+  CosOptionsT *AsCosOptions() {
+    return type == BuiltinOptions_CosOptions ?
+      reinterpret_cast<CosOptionsT *>(value) : nullptr;
+  }
+  const CosOptionsT *AsCosOptions() const {
+    return type == BuiltinOptions_CosOptions ?
+      reinterpret_cast<const CosOptionsT *>(value) : nullptr;
+  }
+  WhereOptionsT *AsWhereOptions() {
+    return type == BuiltinOptions_WhereOptions ?
+      reinterpret_cast<WhereOptionsT *>(value) : nullptr;
+  }
+  const WhereOptionsT *AsWhereOptions() const {
+    return type == BuiltinOptions_WhereOptions ?
+      reinterpret_cast<const WhereOptionsT *>(value) : nullptr;
+  }
+  RankOptionsT *AsRankOptions() {
+    return type == BuiltinOptions_RankOptions ?
+      reinterpret_cast<RankOptionsT *>(value) : nullptr;
+  }
+  const RankOptionsT *AsRankOptions() const {
+    return type == BuiltinOptions_RankOptions ?
+      reinterpret_cast<const RankOptionsT *>(value) : nullptr;
+  }
 };
 
 bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *obj, BuiltinOptions type);
@@ -2314,7 +2467,9 @@ struct QuantizationParametersT : public flatbuffers::NativeTable {
   std::vector<float> scale;
   std::vector<int64_t> zero_point;
   QuantizationDetailsUnion details;
-  QuantizationParametersT() {
+  int32_t quantized_dimension;
+  QuantizationParametersT()
+      : quantized_dimension(0) {
   }
 };
 
@@ -2326,7 +2481,8 @@ struct QuantizationParameters FLATBUFFERS_FINAL_CLASS : private flatbuffers::Tab
     VT_SCALE = 8,
     VT_ZERO_POINT = 10,
     VT_DETAILS_TYPE = 12,
-    VT_DETAILS = 14
+    VT_DETAILS = 14,
+    VT_QUANTIZED_DIMENSION = 16
   };
   const flatbuffers::Vector<float> *min() const {
     return GetPointer<const flatbuffers::Vector<float> *>(VT_MIN);
@@ -2350,6 +2506,9 @@ struct QuantizationParameters FLATBUFFERS_FINAL_CLASS : private flatbuffers::Tab
   const CustomQuantization *details_as_CustomQuantization() const {
     return details_type() == QuantizationDetails_CustomQuantization ? static_cast<const CustomQuantization *>(details()) : nullptr;
   }
+  int32_t quantized_dimension() const {
+    return GetField<int32_t>(VT_QUANTIZED_DIMENSION, 0);
+  }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
            VerifyOffset(verifier, VT_MIN) &&
@@ -2363,6 +2522,7 @@ struct QuantizationParameters FLATBUFFERS_FINAL_CLASS : private flatbuffers::Tab
            VerifyField<uint8_t>(verifier, VT_DETAILS_TYPE) &&
            VerifyOffset(verifier, VT_DETAILS) &&
            VerifyQuantizationDetails(verifier, details(), details_type()) &&
+           VerifyField<int32_t>(verifier, VT_QUANTIZED_DIMENSION) &&
            verifier.EndTable();
   }
   QuantizationParametersT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
@@ -2395,6 +2555,9 @@ struct QuantizationParametersBuilder {
   void add_details(flatbuffers::Offset<void> details) {
     fbb_.AddOffset(QuantizationParameters::VT_DETAILS, details);
   }
+  void add_quantized_dimension(int32_t quantized_dimension) {
+    fbb_.AddElement<int32_t>(QuantizationParameters::VT_QUANTIZED_DIMENSION, quantized_dimension, 0);
+  }
   explicit QuantizationParametersBuilder(flatbuffers::FlatBufferBuilder &_fbb)
         : fbb_(_fbb) {
     start_ = fbb_.StartTable();
@@ -2414,8 +2577,10 @@ inline flatbuffers::Offset<QuantizationParameters> CreateQuantizationParameters(
     flatbuffers::Offset<flatbuffers::Vector<float>> scale = 0,
     flatbuffers::Offset<flatbuffers::Vector<int64_t>> zero_point = 0,
     QuantizationDetails details_type = QuantizationDetails_NONE,
-    flatbuffers::Offset<void> details = 0) {
+    flatbuffers::Offset<void> details = 0,
+    int32_t quantized_dimension = 0) {
   QuantizationParametersBuilder builder_(_fbb);
+  builder_.add_quantized_dimension(quantized_dimension);
   builder_.add_details(details);
   builder_.add_zero_point(zero_point);
   builder_.add_scale(scale);
@@ -2432,7 +2597,8 @@ inline flatbuffers::Offset<QuantizationParameters> CreateQuantizationParametersD
     const std::vector<float> *scale = nullptr,
     const std::vector<int64_t> *zero_point = nullptr,
     QuantizationDetails details_type = QuantizationDetails_NONE,
-    flatbuffers::Offset<void> details = 0) {
+    flatbuffers::Offset<void> details = 0,
+    int32_t quantized_dimension = 0) {
   return tflite::CreateQuantizationParameters(
       _fbb,
       min ? _fbb.CreateVector<float>(*min) : 0,
@@ -2440,7 +2606,8 @@ inline flatbuffers::Offset<QuantizationParameters> CreateQuantizationParametersD
       scale ? _fbb.CreateVector<float>(*scale) : 0,
       zero_point ? _fbb.CreateVector<int64_t>(*zero_point) : 0,
       details_type,
-      details);
+      details,
+      quantized_dimension);
 }
 
 flatbuffers::Offset<QuantizationParameters> CreateQuantizationParameters(flatbuffers::FlatBufferBuilder &_fbb, const QuantizationParametersT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
@@ -3963,11 +4130,13 @@ struct BidirectionalSequenceLSTMOptionsT : public flatbuffers::NativeTable {
   float cell_clip;
   float proj_clip;
   bool merge_outputs;
+  bool time_major;
   BidirectionalSequenceLSTMOptionsT()
       : fused_activation_function(ActivationFunctionType_NONE),
         cell_clip(0.0f),
         proj_clip(0.0f),
-        merge_outputs(false) {
+        merge_outputs(false),
+        time_major(true) {
   }
 };
 
@@ -3977,7 +4146,8 @@ struct BidirectionalSequenceLSTMOptions FLATBUFFERS_FINAL_CLASS : private flatbu
     VT_FUSED_ACTIVATION_FUNCTION = 4,
     VT_CELL_CLIP = 6,
     VT_PROJ_CLIP = 8,
-    VT_MERGE_OUTPUTS = 10
+    VT_MERGE_OUTPUTS = 10,
+    VT_TIME_MAJOR = 12
   };
   ActivationFunctionType fused_activation_function() const {
     return static_cast<ActivationFunctionType>(GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
@@ -3991,12 +4161,16 @@ struct BidirectionalSequenceLSTMOptions FLATBUFFERS_FINAL_CLASS : private flatbu
   bool merge_outputs() const {
     return GetField<uint8_t>(VT_MERGE_OUTPUTS, 0) != 0;
   }
+  bool time_major() const {
+    return GetField<uint8_t>(VT_TIME_MAJOR, 1) != 0;
+  }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
            VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
            VerifyField<float>(verifier, VT_CELL_CLIP) &&
            VerifyField<float>(verifier, VT_PROJ_CLIP) &&
            VerifyField<uint8_t>(verifier, VT_MERGE_OUTPUTS) &&
+           VerifyField<uint8_t>(verifier, VT_TIME_MAJOR) &&
            verifier.EndTable();
   }
   BidirectionalSequenceLSTMOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
@@ -4019,6 +4193,9 @@ struct BidirectionalSequenceLSTMOptionsBuilder {
   void add_merge_outputs(bool merge_outputs) {
     fbb_.AddElement<uint8_t>(BidirectionalSequenceLSTMOptions::VT_MERGE_OUTPUTS, static_cast<uint8_t>(merge_outputs), 0);
   }
+  void add_time_major(bool time_major) {
+    fbb_.AddElement<uint8_t>(BidirectionalSequenceLSTMOptions::VT_TIME_MAJOR, static_cast<uint8_t>(time_major), 1);
+  }
   explicit BidirectionalSequenceLSTMOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
         : fbb_(_fbb) {
     start_ = fbb_.StartTable();
@@ -4036,10 +4213,12 @@ inline flatbuffers::Offset<BidirectionalSequenceLSTMOptions> CreateBidirectional
     ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE,
     float cell_clip = 0.0f,
     float proj_clip = 0.0f,
-    bool merge_outputs = false) {
+    bool merge_outputs = false,
+    bool time_major = true) {
   BidirectionalSequenceLSTMOptionsBuilder builder_(_fbb);
   builder_.add_proj_clip(proj_clip);
   builder_.add_cell_clip(cell_clip);
+  builder_.add_time_major(time_major);
   builder_.add_merge_outputs(merge_outputs);
   builder_.add_fused_activation_function(fused_activation_function);
   return builder_.Finish();
@@ -4899,6 +5078,46 @@ inline flatbuffers::Offset<ExpOptions> CreateExpOptions(
 
 flatbuffers::Offset<ExpOptions> CreateExpOptions(flatbuffers::FlatBufferBuilder &_fbb, const ExpOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct CosOptionsT : public flatbuffers::NativeTable {
+  typedef CosOptions TableType;
+  CosOptionsT() {
+  }
+};
+
+struct CosOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef CosOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  CosOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(CosOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<CosOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const CosOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct CosOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit CosOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  CosOptionsBuilder &operator=(const CosOptionsBuilder &);
+  flatbuffers::Offset<CosOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<CosOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<CosOptions> CreateCosOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  CosOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<CosOptions> CreateCosOptions(flatbuffers::FlatBufferBuilder &_fbb, const CosOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct ReducerOptionsT : public flatbuffers::NativeTable {
   typedef ReducerOptions TableType;
   bool keep_dims;
@@ -6145,6 +6364,46 @@ inline flatbuffers::Offset<ShapeOptions> CreateShapeOptions(
 
 flatbuffers::Offset<ShapeOptions> CreateShapeOptions(flatbuffers::FlatBufferBuilder &_fbb, const ShapeOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct RankOptionsT : public flatbuffers::NativeTable {
+  typedef RankOptions TableType;
+  RankOptionsT() {
+  }
+};
+
+struct RankOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef RankOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  RankOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(RankOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<RankOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const RankOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct RankOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit RankOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  RankOptionsBuilder &operator=(const RankOptionsBuilder &);
+  flatbuffers::Offset<RankOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<RankOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<RankOptions> CreateRankOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  RankOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<RankOptions> CreateRankOptions(flatbuffers::FlatBufferBuilder &_fbb, const RankOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct PowOptionsT : public flatbuffers::NativeTable {
   typedef PowOptions TableType;
   PowOptionsT() {
@@ -7009,6 +7268,220 @@ inline flatbuffers::Offset<MirrorPadOptions> CreateMirrorPadOptions(
 
 flatbuffers::Offset<MirrorPadOptions> CreateMirrorPadOptions(flatbuffers::FlatBufferBuilder &_fbb, const MirrorPadOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct UniqueOptionsT : public flatbuffers::NativeTable {
+  typedef UniqueOptions TableType;
+  TensorType idx_out_type;
+  UniqueOptionsT()
+      : idx_out_type(TensorType_INT32) {
+  }
+};
+
+struct UniqueOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef UniqueOptionsT NativeTableType;
+  enum {
+    VT_IDX_OUT_TYPE = 4
+  };
+  TensorType idx_out_type() const {
+    return static_cast<TensorType>(GetField<int8_t>(VT_IDX_OUT_TYPE, 2));
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int8_t>(verifier, VT_IDX_OUT_TYPE) &&
+           verifier.EndTable();
+  }
+  UniqueOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(UniqueOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<UniqueOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const UniqueOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct UniqueOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_idx_out_type(TensorType idx_out_type) {
+    fbb_.AddElement<int8_t>(UniqueOptions::VT_IDX_OUT_TYPE, static_cast<int8_t>(idx_out_type), 2);
+  }
+  explicit UniqueOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  UniqueOptionsBuilder &operator=(const UniqueOptionsBuilder &);
+  flatbuffers::Offset<UniqueOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<UniqueOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<UniqueOptions> CreateUniqueOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    TensorType idx_out_type = TensorType_INT32) {
+  UniqueOptionsBuilder builder_(_fbb);
+  builder_.add_idx_out_type(idx_out_type);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<UniqueOptions> CreateUniqueOptions(flatbuffers::FlatBufferBuilder &_fbb, const UniqueOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct ReverseV2OptionsT : public flatbuffers::NativeTable {
+  typedef ReverseV2Options TableType;
+  ReverseV2OptionsT() {
+  }
+};
+
+struct ReverseV2Options FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef ReverseV2OptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  ReverseV2OptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(ReverseV2OptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<ReverseV2Options> Pack(flatbuffers::FlatBufferBuilder &_fbb, const ReverseV2OptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct ReverseV2OptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit ReverseV2OptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  ReverseV2OptionsBuilder &operator=(const ReverseV2OptionsBuilder &);
+  flatbuffers::Offset<ReverseV2Options> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<ReverseV2Options>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<ReverseV2Options> CreateReverseV2Options(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  ReverseV2OptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<ReverseV2Options> CreateReverseV2Options(flatbuffers::FlatBufferBuilder &_fbb, const ReverseV2OptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct AddNOptionsT : public flatbuffers::NativeTable {
+  typedef AddNOptions TableType;
+  AddNOptionsT() {
+  }
+};
+
+struct AddNOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef AddNOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  AddNOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(AddNOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<AddNOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const AddNOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct AddNOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit AddNOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  AddNOptionsBuilder &operator=(const AddNOptionsBuilder &);
+  flatbuffers::Offset<AddNOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<AddNOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<AddNOptions> CreateAddNOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  AddNOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<AddNOptions> CreateAddNOptions(flatbuffers::FlatBufferBuilder &_fbb, const AddNOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct GatherNdOptionsT : public flatbuffers::NativeTable {
+  typedef GatherNdOptions TableType;
+  GatherNdOptionsT() {
+  }
+};
+
+struct GatherNdOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef GatherNdOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  GatherNdOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(GatherNdOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<GatherNdOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const GatherNdOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct GatherNdOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit GatherNdOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  GatherNdOptionsBuilder &operator=(const GatherNdOptionsBuilder &);
+  flatbuffers::Offset<GatherNdOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<GatherNdOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<GatherNdOptions> CreateGatherNdOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  GatherNdOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<GatherNdOptions> CreateGatherNdOptions(flatbuffers::FlatBufferBuilder &_fbb, const GatherNdOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct WhereOptionsT : public flatbuffers::NativeTable {
+  typedef WhereOptions TableType;
+  WhereOptionsT() {
+  }
+};
+
+struct WhereOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef WhereOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  WhereOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(WhereOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<WhereOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const WhereOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct WhereOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit WhereOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  WhereOptionsBuilder &operator=(const WhereOptionsBuilder &);
+  flatbuffers::Offset<WhereOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<WhereOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<WhereOptions> CreateWhereOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  WhereOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<WhereOptions> CreateWhereOptions(flatbuffers::FlatBufferBuilder &_fbb, const WhereOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct OperatorCodeT : public flatbuffers::NativeTable {
   typedef OperatorCode TableType;
   BuiltinOperator builtin_code;
@@ -7379,6 +7852,27 @@ struct Operator FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   const SplitVOptions *builtin_options_as_SplitVOptions() const {
     return builtin_options_type() == BuiltinOptions_SplitVOptions ? static_cast<const SplitVOptions *>(builtin_options()) : nullptr;
   }
+  const UniqueOptions *builtin_options_as_UniqueOptions() const {
+    return builtin_options_type() == BuiltinOptions_UniqueOptions ? static_cast<const UniqueOptions *>(builtin_options()) : nullptr;
+  }
+  const ReverseV2Options *builtin_options_as_ReverseV2Options() const {
+    return builtin_options_type() == BuiltinOptions_ReverseV2Options ? static_cast<const ReverseV2Options *>(builtin_options()) : nullptr;
+  }
+  const AddNOptions *builtin_options_as_AddNOptions() const {
+    return builtin_options_type() == BuiltinOptions_AddNOptions ? static_cast<const AddNOptions *>(builtin_options()) : nullptr;
+  }
+  const GatherNdOptions *builtin_options_as_GatherNdOptions() const {
+    return builtin_options_type() == BuiltinOptions_GatherNdOptions ? static_cast<const GatherNdOptions *>(builtin_options()) : nullptr;
+  }
+  const CosOptions *builtin_options_as_CosOptions() const {
+    return builtin_options_type() == BuiltinOptions_CosOptions ? static_cast<const CosOptions *>(builtin_options()) : nullptr;
+  }
+  const WhereOptions *builtin_options_as_WhereOptions() const {
+    return builtin_options_type() == BuiltinOptions_WhereOptions ? static_cast<const WhereOptions *>(builtin_options()) : nullptr;
+  }
+  const RankOptions *builtin_options_as_RankOptions() const {
+    return builtin_options_type() == BuiltinOptions_RankOptions ? static_cast<const RankOptions *>(builtin_options()) : nullptr;
+  }
   const flatbuffers::Vector<uint8_t> *custom_options() const {
     return GetPointer<const flatbuffers::Vector<uint8_t> *>(VT_CUSTOM_OPTIONS);
   }
@@ -7726,6 +8220,34 @@ template<> inline const SplitVOptions *Operator::builtin_options_as<SplitVOption
   return builtin_options_as_SplitVOptions();
 }
 
+template<> inline const UniqueOptions *Operator::builtin_options_as<UniqueOptions>() const {
+  return builtin_options_as_UniqueOptions();
+}
+
+template<> inline const ReverseV2Options *Operator::builtin_options_as<ReverseV2Options>() const {
+  return builtin_options_as_ReverseV2Options();
+}
+
+template<> inline const AddNOptions *Operator::builtin_options_as<AddNOptions>() const {
+  return builtin_options_as_AddNOptions();
+}
+
+template<> inline const GatherNdOptions *Operator::builtin_options_as<GatherNdOptions>() const {
+  return builtin_options_as_GatherNdOptions();
+}
+
+template<> inline const CosOptions *Operator::builtin_options_as<CosOptions>() const {
+  return builtin_options_as_CosOptions();
+}
+
+template<> inline const WhereOptions *Operator::builtin_options_as<WhereOptions>() const {
+  return builtin_options_as_WhereOptions();
+}
+
+template<> inline const RankOptions *Operator::builtin_options_as<RankOptions>() const {
+  return builtin_options_as_RankOptions();
+}
+
 struct OperatorBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
@@ -8169,6 +8691,7 @@ inline void QuantizationParameters::UnPackTo(QuantizationParametersT *_o, const
   { auto _e = zero_point(); if (_e) { _o->zero_point.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->zero_point[_i] = _e->Get(_i); } } };
   { auto _e = details_type(); _o->details.type = _e; };
   { auto _e = details(); if (_e) _o->details.value = QuantizationDetailsUnion::UnPack(_e, details_type(), _resolver); };
+  { auto _e = quantized_dimension(); _o->quantized_dimension = _e; };
 }
 
 inline flatbuffers::Offset<QuantizationParameters> QuantizationParameters::Pack(flatbuffers::FlatBufferBuilder &_fbb, const QuantizationParametersT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
@@ -8185,6 +8708,7 @@ inline flatbuffers::Offset<QuantizationParameters> CreateQuantizationParameters(
   auto _zero_point = _o->zero_point.size() ? _fbb.CreateVector(_o->zero_point) : 0;
   auto _details_type = _o->details.type;
   auto _details = _o->details.Pack(_fbb);
+  auto _quantized_dimension = _o->quantized_dimension;
   return tflite::CreateQuantizationParameters(
       _fbb,
       _min,
@@ -8192,7 +8716,8 @@ inline flatbuffers::Offset<QuantizationParameters> CreateQuantizationParameters(
       _scale,
       _zero_point,
       _details_type,
-      _details);
+      _details,
+      _quantized_dimension);
 }
 
 inline TensorT *Tensor::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
@@ -8816,6 +9341,7 @@ inline void BidirectionalSequenceLSTMOptions::UnPackTo(BidirectionalSequenceLSTM
   { auto _e = cell_clip(); _o->cell_clip = _e; };
   { auto _e = proj_clip(); _o->proj_clip = _e; };
   { auto _e = merge_outputs(); _o->merge_outputs = _e; };
+  { auto _e = time_major(); _o->time_major = _e; };
 }
 
 inline flatbuffers::Offset<BidirectionalSequenceLSTMOptions> BidirectionalSequenceLSTMOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const BidirectionalSequenceLSTMOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
@@ -8830,12 +9356,14 @@ inline flatbuffers::Offset<BidirectionalSequenceLSTMOptions> CreateBidirectional
   auto _cell_clip = _o->cell_clip;
   auto _proj_clip = _o->proj_clip;
   auto _merge_outputs = _o->merge_outputs;
+  auto _time_major = _o->time_major;
   return tflite::CreateBidirectionalSequenceLSTMOptions(
       _fbb,
       _fused_activation_function,
       _cell_clip,
       _proj_clip,
-      _merge_outputs);
+      _merge_outputs,
+      _time_major);
 }
 
 inline ResizeBilinearOptionsT *ResizeBilinearOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
@@ -9265,6 +9793,29 @@ inline flatbuffers::Offset<ExpOptions> CreateExpOptions(flatbuffers::FlatBufferB
       _fbb);
 }
 
+inline CosOptionsT *CosOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new CosOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void CosOptions::UnPackTo(CosOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<CosOptions> CosOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const CosOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateCosOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<CosOptions> CreateCosOptions(flatbuffers::FlatBufferBuilder &_fbb, const CosOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const CosOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateCosOptions(
+      _fbb);
+}
+
 inline ReducerOptionsT *ReducerOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new ReducerOptionsT();
   UnPackTo(_o, _resolver);
@@ -9894,6 +10445,29 @@ inline flatbuffers::Offset<ShapeOptions> CreateShapeOptions(flatbuffers::FlatBuf
       _out_type);
 }
 
+inline RankOptionsT *RankOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new RankOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void RankOptions::UnPackTo(RankOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<RankOptions> RankOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const RankOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateRankOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<RankOptions> CreateRankOptions(flatbuffers::FlatBufferBuilder &_fbb, const RankOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const RankOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateRankOptions(
+      _fbb);
+}
+
 inline PowOptionsT *PowOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new PowOptionsT();
   UnPackTo(_o, _resolver);
@@ -10341,6 +10915,124 @@ inline flatbuffers::Offset<MirrorPadOptions> CreateMirrorPadOptions(flatbuffers:
       _mode);
 }
 
+inline UniqueOptionsT *UniqueOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new UniqueOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void UniqueOptions::UnPackTo(UniqueOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  { auto _e = idx_out_type(); _o->idx_out_type = _e; };
+}
+
+inline flatbuffers::Offset<UniqueOptions> UniqueOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const UniqueOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateUniqueOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<UniqueOptions> CreateUniqueOptions(flatbuffers::FlatBufferBuilder &_fbb, const UniqueOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const UniqueOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  auto _idx_out_type = _o->idx_out_type;
+  return tflite::CreateUniqueOptions(
+      _fbb,
+      _idx_out_type);
+}
+
+inline ReverseV2OptionsT *ReverseV2Options::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new ReverseV2OptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void ReverseV2Options::UnPackTo(ReverseV2OptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<ReverseV2Options> ReverseV2Options::Pack(flatbuffers::FlatBufferBuilder &_fbb, const ReverseV2OptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateReverseV2Options(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<ReverseV2Options> CreateReverseV2Options(flatbuffers::FlatBufferBuilder &_fbb, const ReverseV2OptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const ReverseV2OptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateReverseV2Options(
+      _fbb);
+}
+
+inline AddNOptionsT *AddNOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new AddNOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void AddNOptions::UnPackTo(AddNOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<AddNOptions> AddNOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const AddNOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateAddNOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<AddNOptions> CreateAddNOptions(flatbuffers::FlatBufferBuilder &_fbb, const AddNOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const AddNOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateAddNOptions(
+      _fbb);
+}
+
+inline GatherNdOptionsT *GatherNdOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new GatherNdOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void GatherNdOptions::UnPackTo(GatherNdOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<GatherNdOptions> GatherNdOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const GatherNdOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateGatherNdOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<GatherNdOptions> CreateGatherNdOptions(flatbuffers::FlatBufferBuilder &_fbb, const GatherNdOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const GatherNdOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateGatherNdOptions(
+      _fbb);
+}
+
+inline WhereOptionsT *WhereOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new WhereOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void WhereOptions::UnPackTo(WhereOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<WhereOptions> WhereOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const WhereOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateWhereOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<WhereOptions> CreateWhereOptions(flatbuffers::FlatBufferBuilder &_fbb, const WhereOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const WhereOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateWhereOptions(
+      _fbb);
+}
+
 inline OperatorCodeT *OperatorCode::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new OperatorCodeT();
   UnPackTo(_o, _resolver);
@@ -10915,6 +11607,34 @@ inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *ob
       auto ptr = reinterpret_cast<const SplitVOptions *>(obj);
       return verifier.VerifyTable(ptr);
     }
+    case BuiltinOptions_UniqueOptions: {
+      auto ptr = reinterpret_cast<const UniqueOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_ReverseV2Options: {
+      auto ptr = reinterpret_cast<const ReverseV2Options *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_AddNOptions: {
+      auto ptr = reinterpret_cast<const AddNOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_GatherNdOptions: {
+      auto ptr = reinterpret_cast<const GatherNdOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_CosOptions: {
+      auto ptr = reinterpret_cast<const CosOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_WhereOptions: {
+      auto ptr = reinterpret_cast<const WhereOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_RankOptions: {
+      auto ptr = reinterpret_cast<const RankOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
     default: return false;
   }
 }
@@ -11249,6 +11969,34 @@ inline void *BuiltinOptionsUnion::UnPack(const void *obj, BuiltinOptions type, c
       auto ptr = reinterpret_cast<const SplitVOptions *>(obj);
       return ptr->UnPack(resolver);
     }
+    case BuiltinOptions_UniqueOptions: {
+      auto ptr = reinterpret_cast<const UniqueOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_ReverseV2Options: {
+      auto ptr = reinterpret_cast<const ReverseV2Options *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_AddNOptions: {
+      auto ptr = reinterpret_cast<const AddNOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_GatherNdOptions: {
+      auto ptr = reinterpret_cast<const GatherNdOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_CosOptions: {
+      auto ptr = reinterpret_cast<const CosOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_WhereOptions: {
+      auto ptr = reinterpret_cast<const WhereOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_RankOptions: {
+      auto ptr = reinterpret_cast<const RankOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
     default: return nullptr;
   }
 }
@@ -11571,6 +12319,34 @@ inline flatbuffers::Offset<void> BuiltinOptionsUnion::Pack(flatbuffers::FlatBuff
       auto ptr = reinterpret_cast<const SplitVOptionsT *>(value);
       return CreateSplitVOptions(_fbb, ptr, _rehasher).Union();
     }
+    case BuiltinOptions_UniqueOptions: {
+      auto ptr = reinterpret_cast<const UniqueOptionsT *>(value);
+      return CreateUniqueOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_ReverseV2Options: {
+      auto ptr = reinterpret_cast<const ReverseV2OptionsT *>(value);
+      return CreateReverseV2Options(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_AddNOptions: {
+      auto ptr = reinterpret_cast<const AddNOptionsT *>(value);
+      return CreateAddNOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_GatherNdOptions: {
+      auto ptr = reinterpret_cast<const GatherNdOptionsT *>(value);
+      return CreateGatherNdOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_CosOptions: {
+      auto ptr = reinterpret_cast<const CosOptionsT *>(value);
+      return CreateCosOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_WhereOptions: {
+      auto ptr = reinterpret_cast<const WhereOptionsT *>(value);
+      return CreateWhereOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_RankOptions: {
+      auto ptr = reinterpret_cast<const RankOptionsT *>(value);
+      return CreateRankOptions(_fbb, ptr, _rehasher).Union();
+    }
     default: return 0;
   }
 }
@@ -11893,6 +12669,34 @@ inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) FL
       value = new SplitVOptionsT(*reinterpret_cast<SplitVOptionsT *>(u.value));
       break;
     }
+    case BuiltinOptions_UniqueOptions: {
+      value = new UniqueOptionsT(*reinterpret_cast<UniqueOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_ReverseV2Options: {
+      value = new ReverseV2OptionsT(*reinterpret_cast<ReverseV2OptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_AddNOptions: {
+      value = new AddNOptionsT(*reinterpret_cast<AddNOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_GatherNdOptions: {
+      value = new GatherNdOptionsT(*reinterpret_cast<GatherNdOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_CosOptions: {
+      value = new CosOptionsT(*reinterpret_cast<CosOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_WhereOptions: {
+      value = new WhereOptionsT(*reinterpret_cast<WhereOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_RankOptions: {
+      value = new RankOptionsT(*reinterpret_cast<RankOptionsT *>(u.value));
+      break;
+    }
     default:
       break;
   }
@@ -12295,6 +13099,41 @@ inline void BuiltinOptionsUnion::Reset() {
       delete ptr;
       break;
     }
+    case BuiltinOptions_UniqueOptions: {
+      auto ptr = reinterpret_cast<UniqueOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_ReverseV2Options: {
+      auto ptr = reinterpret_cast<ReverseV2OptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_AddNOptions: {
+      auto ptr = reinterpret_cast<AddNOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_GatherNdOptions: {
+      auto ptr = reinterpret_cast<GatherNdOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_CosOptions: {
+      auto ptr = reinterpret_cast<CosOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_WhereOptions: {
+      auto ptr = reinterpret_cast<WhereOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_RankOptions: {
+      auto ptr = reinterpret_cast<RankOptionsT *>(value);
+      delete ptr;
+      break;
+    }
     default: break;
   }
   value = nullptr;
diff --git a/tensorflow/lite/stderr_reporter.cc b/tensorflow/lite/stderr_reporter.cc
index 09eb1d254a608ba2d19c824a323f0b5173afe15f..366a1816ef2b2ef62e093bbe99690eae52fdc8c4 100644
--- a/tensorflow/lite/stderr_reporter.cc
+++ b/tensorflow/lite/stderr_reporter.cc
@@ -13,28 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/lite/stderr_reporter.h"
-#include <cstdarg>
-#include <cstdio>
 
-#ifdef __ANDROID__
-#include <android/log.h>
-#endif
+#include "tensorflow/lite/minimal_logging.h"
 
 namespace tflite {
 
 int StderrReporter::Report(const char* format, va_list args) {
-#ifdef __ANDROID__
-  // On Android stderr is not captured for applications, only for code run from
-  // the shell. Rather than assume all users will set up a custom error
-  // reporter, let's output to logcat here
-  va_list args_for_log;
-  va_copy(args_for_log, args);
-  __android_log_vprint(ANDROID_LOG_ERROR, "tflite", format, args_for_log);
-  va_end(args_for_log);
-#endif
-  const int result = vfprintf(stderr, format, args);
-  fputc('\n', stderr);
-  return result;
+  logging_internal::MinimalLogger::VLog(TFLITE_LOG_ERROR, format, args);
+  return 0;
 }
 
 ErrorReporter* DefaultErrorReporter() {
diff --git a/tensorflow/lite/string_util.h b/tensorflow/lite/string_util.h
index f076db76f2d4ef416e5f7ec98ac2ec0aa94d95c2..cb268ee805c89951e4ef302595339ea0e771b48a 100644
--- a/tensorflow/lite/string_util.h
+++ b/tensorflow/lite/string_util.h
@@ -35,7 +35,7 @@ limitations under the License.
 //   buf.AddString("AB", 2);
 //   # Write content of DynamicBuffer to tensor in format of string tensor
 //   # described above.
-//   buf.WriteToTensor(tensor)
+//   buf.WriteToTensor(tensor, nullptr)
 
 #ifndef TENSORFLOW_LITE_STRING_UTIL_H_
 #define TENSORFLOW_LITE_STRING_UTIL_H_
@@ -83,10 +83,6 @@ class DynamicBuffer {
   // Fill content into a string tensor. Set shape to {num_strings}.
   void WriteToTensorAsVector(TfLiteTensor* tensor);
 
-  // Deprecated. Use WriteToTensorAsVector() or pass in the new shpe.
-  // TODO(b/120230709): remove when people migrate away.
-  void WriteToTensor(TfLiteTensor* tensor) { WriteToTensorAsVector(tensor); }
-
  private:
   // Data buffer to store contents of strings, not including headers.
   std::vector<char> data_;
diff --git a/tensorflow/lite/string_util_test.cc b/tensorflow/lite/string_util_test.cc
index cbf1d7b226af20251d5f70a354a21f1eb40ae1c6..6fc7de90ea534f9c8c4f61b4607ff7d2d8647d00 100644
--- a/tensorflow/lite/string_util_test.cc
+++ b/tensorflow/lite/string_util_test.cc
@@ -35,8 +35,11 @@ TEST(StringUtil, TestStringUtil) {
 
   char data[] = {1, 0, 0, 0, 12, 0, 0, 0, 15, 0, 0, 0, 'X', 'Y', 'Z'};
 
-  interpreter.SetTensorParametersReadOnly(2, kTfLiteString, "", {1}, {}, data,
-                                          15);
+  TfLiteQuantization quant;
+  quant.type = kTfLiteNoQuantization;
+  quant.params = nullptr;
+  interpreter.SetTensorParametersReadOnly(2, kTfLiteString, "", {1}, quant,
+                                          data, 15);
   TfLiteTensor* t2 = interpreter.tensor(2);
   interpreter.AllocateTensors();
 
diff --git a/tensorflow/lite/testdata/multi_add.json b/tensorflow/lite/testdata/multi_add.json
index 97b931dba8b1050ecf91939d1d9dcea5e0ea56fb..ae559255a85300bfacf5c3658b2915ce7738f5b7 100644
--- a/tensorflow/lite/testdata/multi_add.json
+++ b/tensorflow/lite/testdata/multi_add.json
@@ -1,46 +1,131 @@
 {
-  "version": 1,
+  "version": 3,
   "operator_codes": [
     {
-      "builtin_code": "ADD"
     }
   ],
   "subgraphs": [
     {
       "tensors": [
-        { "shape": [ 1, 8, 8, 3 ], "name": "a" },
-        { "shape": [ 1, 8, 8, 3 ], "name": "b" },
-        { "shape": [ 1, 8, 8, 3 ], "name": "c" },
-        { "shape": [ 1, 8, 8, 3 ], "name": "d" },
-        { "shape": [ 1, 8, 8, 3 ], "name": "i" },
-        { "shape": [ 1, 8, 8, 3 ], "name": "x" },
-        { "shape": [ 1, 8, 8, 3 ], "name": "y" }
+        {
+          "shape": [
+            1,
+            8,
+            8,
+            3
+          ],
+          "name": "a"
+        },
+        {
+          "shape": [
+            1,
+            8,
+            8,
+            3
+          ],
+          "name": "b"
+        },
+        {
+          "shape": [
+            1,
+            8,
+            8,
+            3
+          ],
+          "name": "c"
+        },
+        {
+          "shape": [
+            1,
+            8,
+            8,
+            3
+          ],
+          "name": "d"
+        },
+        {
+          "shape": [
+            1,
+            8,
+            8,
+            3
+          ],
+          "name": "i"
+        },
+        {
+          "shape": [
+            1,
+            8,
+            8,
+            3
+          ],
+          "name": "x"
+        },
+        {
+          "shape": [
+            1,
+            8,
+            8,
+            3
+          ],
+          "name": "y"
+        }
+      ],
+      "inputs": [
+        0,
+        1,
+        2,
+        3
+      ],
+      "outputs": [
+        5,
+        6
       ],
-      "inputs": [ 0, 1, 2, 3 ],
-      "outputs": [ 5, 6 ],
       "operators": [
         {
-          "inputs": [ 1, 2 ],
-          "outputs": [ 4 ],
+          "inputs": [
+            1,
+            2
+          ],
+          "outputs": [
+            4
+          ],
           "builtin_options_type": "AddOptions",
           "builtin_options": {
           }
         },
         {
-          "inputs": [ 0, 4 ],
-          "outputs": [ 5 ],
+          "inputs": [
+            0,
+            4
+          ],
+          "outputs": [
+            5
+          ],
           "builtin_options_type": "AddOptions",
           "builtin_options": {
           }
         },
         {
-          "inputs": [ 3, 4 ],
-          "outputs": [ 6 ],
+          "inputs": [
+            3,
+            4
+          ],
+          "outputs": [
+            6
+          ],
           "builtin_options_type": "AddOptions",
           "builtin_options": {
           }
         }
       ]
     }
+  ],
+  "buffers": [
+    {
+      "data": [
+
+      ]
+    }
   ]
 }
diff --git a/tensorflow/lite/testdata/test_input.csv b/tensorflow/lite/testdata/test_input.csv
new file mode 100644
index 0000000000000000000000000000000000000000..33894d3063f35a885fb34c3c5b85bb6a4d8e711e
--- /dev/null
+++ b/tensorflow/lite/testdata/test_input.csv
@@ -0,0 +1 @@
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
\ No newline at end of file
diff --git a/tensorflow/lite/testing/BUILD b/tensorflow/lite/testing/BUILD
index 22ffed43cc0e08ac45a9a07077450d2642ba7f26..68512b952a9a34c67452b676db97534b1fb3c733 100644
--- a/tensorflow/lite/testing/BUILD
+++ b/tensorflow/lite/testing/BUILD
@@ -10,8 +10,10 @@ load(
     "generated_test_models_all",
 )
 load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
+load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc")
 load(
     "//tensorflow:tensorflow.bzl",
+    "tf_cc_binary",
     "tf_cc_test",
     "py_test",
 )
@@ -77,6 +79,7 @@ py_binary(
     srcs_version = "PY2AND3",
     deps = [
         ":generate_examples_report",
+        ":string_util_wrapper",
         "//tensorflow:tensorflow_py",
         "//tensorflow/python:graph_util",
         "//third_party/py/numpy",
@@ -158,6 +161,7 @@ cc_library(
     srcs = ["tflite_driver.cc"],
     hdrs = ["tflite_driver.h"],
     deps = [
+        ":join",
         ":split",
         ":test_runner",
         "//tensorflow/lite:builtin_op_data",
@@ -165,6 +169,7 @@ cc_library(
         "//tensorflow/lite:string_util",
         "//tensorflow/lite/delegates/flex:delegate",
         "//tensorflow/lite/kernels:builtin_ops",
+        "//tensorflow/lite/kernels:reference_ops",
         "@com_google_absl//absl/strings",
     ],
 )
@@ -229,13 +234,13 @@ cc_test(
     ],
 )
 
-cc_binary(
+tf_cc_binary(
     name = "nnapi_example",
     srcs = ["nnapi_example.cc"],
     deps = [
         ":parse_testdata_lib",
         ":tflite_driver",
-        "//tensorflow/lite/nnapi:nnapi_lib",
+        "//tensorflow/lite/nnapi:nnapi_implementation",
     ],
 )
 
@@ -256,7 +261,7 @@ cc_library(
     ],
 )
 
-cc_test(
+tf_cc_test(
     name = "tf_driver_test",
     size = "small",
     srcs = ["tf_driver_test.cc"],
@@ -285,7 +290,7 @@ cc_library(
     ],
 )
 
-cc_test(
+tf_cc_test(
     name = "generate_testspec_test",
     size = "small",
     srcs = ["generate_testspec_test.cc"],
@@ -379,7 +384,7 @@ tf_cc_test(
     ],
 )
 
-cc_binary(
+tf_cc_binary(
     name = "tflite_diff",
     srcs = ["tflite_diff_example_test.cc"],
     deps = [
@@ -389,4 +394,29 @@ cc_binary(
     ],
 )
 
+cc_library(
+    name = "string_util_lib",
+    srcs = ["string_util.cc"],
+    hdrs = ["string_util.h"],
+    deps = [
+        "//tensorflow/lite:string_util",
+        "//tensorflow/lite/python/interpreter_wrapper:numpy",
+        "//tensorflow/lite/python/interpreter_wrapper:python_utils",
+        "//third_party/py/numpy:headers",
+        "//third_party/python_runtime:headers",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+tf_py_wrap_cc(
+    name = "string_util_wrapper",
+    srcs = [
+        "string_util.i",
+    ],
+    deps = [
+        ":string_util_lib",
+        "//third_party/python_runtime:headers",
+    ],
+)
+
 tflite_portable_test_suite()
diff --git a/tensorflow/lite/testing/generate_examples.py b/tensorflow/lite/testing/generate_examples.py
index dd7b3d07456fbd9943e9f45b815e6015f4973a94..20c23017c53d0b8929e701ef505c70ba156be821 100644
--- a/tensorflow/lite/testing/generate_examples.py
+++ b/tensorflow/lite/testing/generate_examples.py
@@ -36,6 +36,7 @@ import operator
 import os
 import random
 import re
+import string
 import sys
 import tempfile
 import traceback
@@ -52,6 +53,7 @@ import tensorflow as tf
 from google.protobuf import text_format
 # TODO(aselle): switch to TensorFlow's resource_loader
 from tensorflow.lite.testing import generate_examples_report as report_lib
+from tensorflow.lite.testing import string_util_wrapper
 from tensorflow.python.framework import graph_util as tf_graph_util
 from tensorflow.python.ops import rnn
 
@@ -163,6 +165,16 @@ def toco_options(data_types,
   return s
 
 
+def format_result(t):
+  """Convert a tensor to a format that can be used in test specs."""
+  if t.dtype.kind not in [np.dtype(np.string_).kind, np.dtype(np.object_).kind]:
+    # Output 9 digits after the point to ensure the precision is good enough.
+    values = ["{:.9f}".format(value) for value in list(t.flatten())]
+    return ",".join(values)
+  else:
+    return string_util_wrapper.SerializeAsHexString(t.flatten())
+
+
 def write_examples(fp, examples):
   """Given a list `examples`, write a text format representation.
 
@@ -179,9 +191,7 @@ def write_examples(fp, examples):
     """Write tensor in file format supported by TFLITE example."""
     fp.write("dtype,%s\n" % x.dtype)
     fp.write("shape," + ",".join(map(str, x.shape)) + "\n")
-    # Output 9 digits after the point to ensure the precision is good enough.
-    values = ["{:.9f}".format(value) for value in list(x.flatten())]
-    fp.write("values," + ",".join(values) + "\n")
+    fp.write("values," + format_result(x) + "\n")
 
   fp.write("test_cases,%d\n" % len(examples))
   for example in examples:
@@ -214,11 +224,9 @@ def write_test_cases(fp, model_name, examples):
     fp.write("invoke {\n")
 
     for t in example["inputs"]:
-      values = ["{:.9f}".format(value) for value in list(t.flatten())]
-      fp.write("  input: \"" + ",".join(values) + "\"\n")
+      fp.write("  input: \"" + format_result(t) + "\"\n")
     for t in example["outputs"]:
-      values = ["{:.9f}".format(value) for value in list(t.flatten())]
-      fp.write("  output: \"" + ",".join(values) + "\"\n")
+      fp.write("  output: \"" + format_result(t) + "\"\n")
     fp.write("}\n")
 
 
@@ -230,6 +238,7 @@ _TF_TYPE_INFO = {
     tf.int16: (np.int16, "QUANTIZED_INT16"),
     tf.int64: (np.int64, "INT64"),
     tf.bool: (np.bool, "BOOL"),
+    tf.string: (np.string_, "STRING"),
 }
 
 
@@ -245,6 +254,10 @@ def create_tensor_data(dtype, shape, min_value=-100, max_value=100):
     value = np.random.randint(min_value, max_value+1, shape)
   elif dtype == tf.bool:
     value = np.random.choice([True, False], size=shape)
+  elif dtype == np.string_:
+    # Not the best strings, but they will do for some basic testing.
+    letters = list(string.ascii_uppercase)
+    return np.random.choice(letters, size=shape).astype(dtype)
   return np.dtype(dtype).type(value) if np.isscalar(value) else value.astype(
       dtype)
 
@@ -300,8 +313,13 @@ def make_control_dep_tests(zip_path):
 
   extra_toco_options = ExtraTocoOptions()
   extra_toco_options.drop_control_dependency = True
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs,
-                    extra_toco_options)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      extra_toco_options,
+      expected_tf_failures=3)
 
 
 def toco_convert(graph_def_str, input_tensors, output_tensors,
@@ -369,7 +387,7 @@ def make_zip_of_tests(zip_path,
                       make_test_inputs,
                       extra_toco_options=ExtraTocoOptions(),
                       use_frozen_graph=False,
-                      expected_tf_success=None):
+                      expected_tf_failures=0):
   """Helper to make a zip file of a bunch of TensorFlow models.
 
   This does a cartestian product of the dictionary of test_parameters and
@@ -389,8 +407,9 @@ def make_zip_of_tests(zip_path,
       `output_tensors` and returns tuple `(input_values, output_values)`.
     extra_toco_options: Additional toco options.
     use_frozen_graph: Whether or not freeze graph before toco converter.
-    expected_tf_success: Number of times tensorflow is supposed to succeed in
-      executing the input graphs. `None` means "unknown".
+    expected_tf_failures: Number of times tensorflow is expected to fail in
+      executing the input graphs. In some cases it is OK for TensorFlow to
+      fail because the one or more combination of parameters is invalid.
 
   Raises:
     RuntimeError: if there are toco errors that can't be ignored.
@@ -551,10 +570,17 @@ def make_zip_of_tests(zip_path,
                    " and %d TOCO converted graphs (%.1f%%"), zip_path,
                   total_conversions, tf_success, toco_success, percent)
 
-  if expected_tf_success is not None and tf_success != expected_tf_success:
-    raise RuntimeError(
-        "Expected TF to succeed %d times, but that happened %d times" %
-        (expected_tf_success, tf_success))
+  tf_failures = parameter_count - tf_success
+
+  if tf_failures / parameter_count > 0.8:
+    raise RuntimeError(("Test for '%s' is not very useful. "
+                        "TensorFlow fails in %d percent of the cases.") %
+                       (zip_path, int(100 * tf_failures / parameter_count)))
+
+  if tf_failures != expected_tf_failures:
+    raise RuntimeError(("Expected TF to fail %d times while generating '%s', "
+                        "but that happened %d times") % (expected_tf_failures,
+                                                         zip_path, tf_failures))
 
   if not FLAGS.ignore_toco_errors and toco_errors > 0:
     raise RuntimeError(
@@ -573,11 +599,12 @@ def make_pool_tests(pool_op_in):
 
   pool_op = pool_op_in
 
-  def f(zip_path):
+  def f(zip_path, expected_tf_failures=0):
     """Actual function that generates examples.
 
     Args:
       zip_path: path to write zip to.
+      expected_tf_failures: number of expected tensorflow failures.
     """
 
     # Chose a set of parameters
@@ -606,20 +633,26 @@ def make_pool_tests(pool_op_in):
       return [input_values], sess.run(
           outputs, feed_dict=dict(zip(inputs, [input_values])))
 
-    make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+    make_zip_of_tests(
+        zip_path,
+        test_parameters,
+        build_graph,
+        build_inputs,
+        expected_tf_failures=expected_tf_failures)
+
   return f
 
 
 def make_l2_pool_tests(zip_path):
-  make_pool_tests(make_l2_pool)(zip_path)
+  make_pool_tests(make_l2_pool)(zip_path, expected_tf_failures=80)
 
 
 def make_avg_pool_tests(zip_path):
-  make_pool_tests(tf.nn.avg_pool)(zip_path)
+  make_pool_tests(tf.nn.avg_pool)(zip_path, expected_tf_failures=80)
 
 
 def make_max_pool_tests(zip_path):
-  make_pool_tests(tf.nn.max_pool)(zip_path)
+  make_pool_tests(tf.nn.max_pool)(zip_path, expected_tf_failures=80)
 
 
 def make_abs_tests(zip_path):
@@ -645,6 +678,32 @@ def make_abs_tests(zip_path):
 
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
+def make_elu_tests(zip_path):
+  """Make a set of tests to do (float) tf.nn.elu."""
+
+  test_parameters = [
+      {
+          "input_shape": [[], [1], [2, 3], [1, 1, 1, 1], [1, 3, 4, 3],
+                          [3, 15, 14, 3], [3, 1, 2, 4, 6], [2, 2, 3, 4, 5, 6]],
+      },
+  ]
+
+  def build_graph(parameters):
+    """Build the graph for the test case."""
+
+    input_tensor = tf.placeholder(
+        dtype=tf.float32, name="input", shape=parameters["input_shape"])
+    out = tf.nn.elu(input_tensor)
+    return [input_tensor], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    """Build the inputs for the test case."""
+    input_values = create_tensor_data(
+        np.float32, parameters["input_shape"], min_value=-4, max_value=10)
+    return [input_values], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_values])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 def make_relu_tests(zip_path):
   """Make a set of tests to do relu."""
@@ -814,6 +873,9 @@ def make_constant_tests(zip_path):
       "dtype": [tf.float32, tf.int32],
       "input_shape": [[], [1], [2], [1, 1, 1, 1], [2, 2, 2, 2]],
       "constant_is_also_output": [True, False],
+      # This is a regression test for a bug where Toco rejects models with
+      # unread inputs.
+      "has_unread_input": [True, False],
   }]
 
   def build_graph(parameters):
@@ -823,22 +885,28 @@ def make_constant_tests(zip_path):
         shape=parameters["input_shape"])
     constant = tf.constant(
         create_tensor_data(parameters["dtype"], parameters["input_shape"]))
-    out = [tf.maximum(dummy_input, constant)]
+    outputs = [tf.maximum(dummy_input, constant)]
     if parameters["constant_is_also_output"]:
-      out.append(constant)
+      outputs.append(constant)
+    inputs = [dummy_input]
+    if parameters["has_unread_input"]:
+      unread_input = tf.placeholder(
+          dtype=parameters["dtype"],
+          name="unread_input",
+          shape=parameters["input_shape"])
+      inputs.append(unread_input)
 
-    return [dummy_input], out
+    return inputs, outputs
 
   def build_inputs(parameters, sess, inputs, outputs):
     dummy_input = np.zeros(
         parameters["input_shape"], dtype=_TF_TYPE_INFO[parameters["dtype"]][0])
     return [dummy_input], sess.run(outputs, feed_dict={inputs[0]: dummy_input})
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs,
-                    expected_tf_success=20)
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
-def make_binary_op_tests(zip_path, binary_operator):
+def make_binary_op_tests(zip_path, binary_operator, expected_tf_failures=0):
   """Make a set of tests to do binary ops with and without broadcast."""
 
   test_parameters = [
@@ -908,7 +976,12 @@ def make_binary_op_tests(zip_path, binary_operator):
             inputs[1]: input2
         })
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=expected_tf_failures)
 
 
 def make_reduce_tests(reduce_op,
@@ -1074,6 +1147,34 @@ def make_exp_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
+def make_cos_tests(zip_path):
+  """Make a set of tests to do cos."""
+
+  test_parameters = [{
+      "input_dtype": [tf.float32],
+      "input_shape": [[], [3], [1, 100], [4, 2, 3], [5, 224, 224, 3]],
+  }]
+
+  def build_graph(parameters):
+    """Build the cos op testing graph."""
+    input_tensor = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input",
+        shape=parameters["input_shape"])
+
+    out = tf.cos(input_tensor)
+    return [input_tensor], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    values = [
+        create_tensor_data(parameters["input_dtype"], parameters["input_shape"],
+                           min_value=-np.pi, max_value=np.pi)
+    ]
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 def make_log_softmax_tests(zip_path):
   """Make a set of tests to do log_softmax."""
 
@@ -1137,7 +1238,12 @@ def make_maximum_tests(zip_path):
     ]
     return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=8)
 
 
 def make_minimum_tests(zip_path):
@@ -1172,7 +1278,12 @@ def make_minimum_tests(zip_path):
     ]
     return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=8)
 
 
 def make_binary_op_tests_func(binary_operator):
@@ -1184,6 +1295,51 @@ def make_add_tests(zip_path):
   make_binary_op_tests(zip_path, tf.add)
 
 
+def make_add_n_tests(zip_path):
+  """Make a set of tests for AddN op."""
+
+  test_parameters = [
+      {
+          "dtype": [tf.float32, tf.int32],
+          "input_shape": [[2, 5, 3, 1]],
+          "num_inputs": [2, 3, 4, 5],
+      },
+      {
+          "dtype": [tf.float32, tf.int32],
+          "input_shape": [[5]],
+          "num_inputs": [2, 3, 4, 5],
+      },
+      {
+          "dtype": [tf.float32, tf.int32],
+          "input_shape": [[]],
+          "num_inputs": [2, 3, 4, 5],
+      },
+  ]
+
+  def build_graph(parameters):
+    """Builds the graph given the current parameters."""
+    input_tensors = []
+    for i in range(parameters["num_inputs"]):
+      input_tensors.append(
+          tf.placeholder(
+              dtype=parameters["dtype"],
+              name="input_{}".format(i),
+              shape=parameters["input_shape"]))
+    out = tf.add_n(input_tensors)
+    return input_tensors, [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    """Builds operand inputs for op."""
+    input_data = []
+    for i in range(parameters["num_inputs"]):
+      input_data.append(
+          create_tensor_data(parameters["dtype"], parameters["input_shape"]))
+    return input_data, sess.run(
+        outputs, feed_dict={i: d for i, d in zip(inputs, input_data)})
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 def make_div_tests(zip_path):
   make_binary_op_tests(zip_path, tf.div)
 
@@ -1197,7 +1353,7 @@ def make_mul_tests(zip_path):
 
 
 def make_pow_tests(zip_path):
-  make_binary_op_tests(zip_path, tf.pow)
+  make_binary_op_tests(zip_path, tf.pow, expected_tf_failures=7)
 
 
 def make_floor_div_tests(zip_path):
@@ -1215,16 +1371,23 @@ def make_squared_difference_tests(zip_path):
 def make_gather_tests(zip_path):
   """Make a set of tests to do gather."""
 
-  test_parameters = [{
-      # TODO(mgubin): add string tests when they are supported by Toco.
-      # TODO(mgubin): add tests for Nd indices when they are supported by
-      # TfLite.
-      "params_dtype": [tf.float32, tf.int32, tf.int64],
-      "params_shape": [[10], [1, 2, 20]],
-      "indices_dtype": [tf.int32, tf.int64],
-      "indices_shape": [[3], [5]],
-      "axis": [-1, 0, 1],
-  }]
+  test_parameters = [
+      {
+          "params_dtype": [tf.float32, tf.int32, tf.int64],
+          "params_shape": [[10], [1, 2, 20]],
+          "indices_dtype": [tf.int32, tf.int64],
+          "indices_shape": [[3], [5]],
+          "axis": [-1, 0, 1],
+      },
+      {
+          # TODO(b/123895910): add Nd support for strings.
+          "params_dtype": [tf.string],
+          "params_shape": [[8]],
+          "indices_dtype": [tf.int32],
+          "indices_shape": [[3]],
+          "axis": [0],
+      }
+  ]
 
   def build_graph(parameters):
     """Build the gather op testing graph."""
@@ -1255,7 +1418,56 @@ def make_gather_tests(zip_path):
       test_parameters,
       build_graph,
       build_inputs,
-      expected_tf_success=60)
+      expected_tf_failures=12)
+
+
+def make_gather_nd_tests(zip_path):
+  """Make a set of tests to do gather_nd."""
+
+  test_parameters = [
+      {
+          "params_dtype": [tf.float32, tf.int32, tf.int64],
+          "params_shape": [[5, 1]],
+          "indices_dtype": [tf.int32, tf.int64],
+          "indices_shape": [[1, 1]],
+      },
+      {
+          "params_dtype": [tf.float32, tf.int32, tf.int64],
+          "params_shape": [[5, 5]],
+          "indices_dtype": [tf.int32, tf.int64],
+          "indices_shape": [[2, 1], [2, 2]],
+      },
+      {
+          "params_dtype": [tf.float32, tf.int32, tf.int64],
+          "params_shape": [[5, 5, 10]],
+          "indices_dtype": [tf.int32, tf.int64],
+          "indices_shape": [[3, 1], [2, 2], [2, 3], [2, 1, 3]],
+      },
+  ]
+
+  def build_graph(parameters):
+    """Build the gather_nd op testing graph."""
+    params = tf.placeholder(
+        dtype=parameters["params_dtype"],
+        name="params",
+        shape=parameters["params_shape"])
+    indices = tf.placeholder(
+        dtype=parameters["indices_dtype"],
+        name="indices",
+        shape=parameters["indices_shape"])
+    out = tf.gather_nd(params, indices)
+    return [params, indices], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    params = create_tensor_data(parameters["params_dtype"],
+                                parameters["params_shape"])
+    indices = create_tensor_data(parameters["indices_dtype"],
+                                 parameters["indices_shape"], 0,
+                                 parameters["params_shape"][0] - 1)
+    return [params, indices], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [params, indices])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
 def make_gather_with_constant_tests(zip_path):
@@ -1284,8 +1496,7 @@ def make_gather_with_constant_tests(zip_path):
     return [reference_values], sess.run(
         outputs, feed_dict={inputs[0]: reference_values})
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs,
-                    expected_tf_success=2)
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
 def make_global_batch_norm_tests(zip_path):
@@ -1421,6 +1632,41 @@ def make_conv_tests(zip_path):
       values.append(create_tensor_data(np.float32, filter_shape))
     return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=40)
+
+
+# Note: This is a regression test for a bug (b/122651451) that Toco incorrectly
+# erases the reduction indices array while it's shared with other ops.
+def make_l2norm_shared_epsilon_tests(zip_path):
+  """Regression test for a bug (b/122651451)."""
+
+  # Chose a set of parameters
+  test_parameters = [{
+      "input_shape": [[5, 7]],
+      "dim": [1],
+      "epsilon": [1e-8],
+  }]
+
+  def build_graph(parameters):
+    input_tensor = tf.placeholder(
+        dtype=tf.float32, name="input", shape=parameters["input_shape"])
+    epsilon = tf.constant(parameters["epsilon"])
+    out1 = tf.nn.l2_normalize(input_tensor, parameters["dim"], epsilon=epsilon)
+    out2 = tf.nn.l2_normalize(input_tensor, parameters["dim"], epsilon=epsilon)
+    out = out1 + out2
+    return [input_tensor], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_values = create_tensor_data(
+        np.float32, parameters["input_shape"], min_value=-4, max_value=10)
+    return [input_values], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_values])))
+
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
@@ -1633,7 +1879,12 @@ def make_depthwiseconv_tests(zip_path):
       values.append(create_tensor_data(np.float32, filter_shape))
     return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=4)
 
 
 def make_split_tests(zip_path):
@@ -1656,7 +1907,12 @@ def make_split_tests(zip_path):
     values = [create_tensor_data(np.float32, parameters["input_shape"])]
     return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=112)
 
 
 def make_splitv_tests(zip_path):
@@ -1679,7 +1935,12 @@ def make_splitv_tests(zip_path):
     values = [create_tensor_data(np.float32, parameters["input_shape"])]
     return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=158)
 
 
 def make_concat_tests(zip_path):
@@ -1721,7 +1982,12 @@ def make_concat_tests(zip_path):
     return all_values, sess.run(
         outputs, feed_dict=dict(zip(inputs, all_values)))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=60)
 
 
 def make_fully_connected_tests(zip_path):
@@ -1782,7 +2048,12 @@ def make_fully_connected_tests(zip_path):
       values.append(create_tensor_data(np.float32, parameters["shape2"]))
     return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=10)
 
 
 def make_l2norm_tests(zip_path):
@@ -1812,7 +2083,12 @@ def make_l2norm_tests(zip_path):
     return [input_values], sess.run(
         outputs, feed_dict=dict(zip(inputs, [input_values])))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=9)
 
 
 def make_local_response_norm_tests(zip_path):
@@ -2038,6 +2314,29 @@ def make_shape_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
+def make_rank_tests(zip_path):
+  """Make a set of tests to do rank."""
+
+  test_parameters = [{
+      "input_dtype": [tf.float32, tf.int32],
+      "input_shape": [[], [0], [1, 1, 1, 3], [2, 3, 4, 5], [5, 5], [10]],
+  }]
+
+  def build_graph(parameters):
+    """Build the rank op testing graph."""
+    input_value = tf.placeholder(dtype=parameters["input_dtype"], name="input")
+    out = tf.rank(input_value)
+    return [input_value], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_value = create_tensor_data(parameters["input_dtype"],
+                                     parameters["input_shape"])
+    return [input_value], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_value])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 def make_one_hot_tests(zip_path):
   """Make a set of tests to do one_hot."""
 
@@ -2297,7 +2596,12 @@ def make_space_to_batch_nd_tests(zip_path):
       values.append(np.array(parameters["paddings"]))
     return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=56)
 
 
 def make_batch_to_space_nd_tests(zip_path):
@@ -2410,7 +2714,12 @@ def make_transpose_tests(zip_path):
       values.append(np.array(parameters["perm"]))
     return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=9)
 
 
 def make_squeeze_tests(zip_path):
@@ -2448,10 +2757,16 @@ def make_squeeze_tests(zip_path):
     return [input_values], sess.run(
         outputs, feed_dict=dict(zip(inputs, [input_values])))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=12)
 
 
-def _make_strided_slice_tests(zip_path, test_parameters):
+def _make_strided_slice_tests(zip_path, test_parameters,
+                              expected_tf_failures=0):
   """Utility function to make strided_slice_tests based on parameters."""
 
   def build_graph(parameters):
@@ -2511,7 +2826,12 @@ def _make_strided_slice_tests(zip_path, test_parameters):
 
     return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=expected_tf_failures)
 
 
 def make_strided_slice_tests(zip_path):
@@ -2585,7 +2905,7 @@ def make_strided_slice_tests(zip_path):
           "constant_indices": [False],
       },
   ]
-  _make_strided_slice_tests(zip_path, test_parameters)
+  _make_strided_slice_tests(zip_path, test_parameters, expected_tf_failures=2)
 
 
 def make_strided_slice_1d_exhaustive_tests(zip_path):
@@ -2608,7 +2928,10 @@ def make_strided_slice_1d_exhaustive_tests(zip_path):
   _make_strided_slice_tests(zip_path, test_parameters)
 
 
-def make_strided_slice_buggy_tests(zip_path):
+# For verifying https://github.com/tensorflow/tensorflow/issues/23599
+# TODO(chaomei): refactor the test to cover more cases, like negative stride,
+# negative array index etc.
+def make_resolve_constant_strided_slice_tests(zip_path):
   """Make a set of tests to show strided_slice yields incorrect results."""
 
   test_parameters = [{
@@ -2787,7 +3110,12 @@ def make_arg_min_max_tests(zip_path):
     return [input_value], sess.run(
         outputs, feed_dict=dict(zip(inputs, [input_value])))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=4)
 
 
 def make_equal_tests(zip_path):
@@ -2822,7 +3150,12 @@ def make_equal_tests(zip_path):
     return [input_value1, input_value2], sess.run(
         outputs, feed_dict=dict(zip(inputs, [input_value1, input_value2])))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=3)
 
 
 def make_not_equal_tests(zip_path):
@@ -2856,7 +3189,12 @@ def make_not_equal_tests(zip_path):
     return [input_value1, input_value2], sess.run(
         outputs, feed_dict=dict(zip(inputs, [input_value1, input_value2])))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=3)
 
 
 def make_greater_tests(zip_path):
@@ -2890,7 +3228,12 @@ def make_greater_tests(zip_path):
     return [input_value1, input_value2], sess.run(
         outputs, feed_dict=dict(zip(inputs, [input_value1, input_value2])))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=3)
 
 
 def make_greater_equal_tests(zip_path):
@@ -2924,7 +3267,12 @@ def make_greater_equal_tests(zip_path):
     return [input_value1, input_value2], sess.run(
         outputs, feed_dict=dict(zip(inputs, [input_value1, input_value2])))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=3)
 
 
 def make_less_tests(zip_path):
@@ -2958,7 +3306,12 @@ def make_less_tests(zip_path):
     return [input_value1, input_value2], sess.run(
         outputs, feed_dict=dict(zip(inputs, [input_value1, input_value2])))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=3)
 
 
 def make_less_equal_tests(zip_path):
@@ -2992,7 +3345,12 @@ def make_less_equal_tests(zip_path):
     return [input_value1, input_value2], sess.run(
         outputs, feed_dict=dict(zip(inputs, [input_value1, input_value2])))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=3)
 
 
 def make_floor_tests(zip_path):
@@ -3012,6 +3370,31 @@ def make_floor_tests(zip_path):
     out = tf.floor(input_value)
     return [input_value], [out]
 
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_value = create_tensor_data(parameters["input_dtype"],
+                                     parameters["input_shape"])
+    return [input_value], sess.run(outputs, feed_dict={inputs[0]: input_value})
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_ceil_tests(zip_path):
+  """Make a set of tests to do ceil."""
+
+  test_parameters = [{
+      "input_dtype": [tf.float32],
+      "input_shape": [[1], [1, 2], [5, 6, 7, 8], [3, 4, 5, 6]],
+  }]
+
+  def build_graph(parameters):
+    """Build the ceil op testing graph."""
+    input_value = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input1",
+        shape=parameters["input_shape"])
+    out = tf.ceil(input_value)
+    return [input_value], [out]
+
   def build_inputs(parameters, sess, inputs, outputs):
     input_value = create_tensor_data(parameters["input_dtype"],
                                      parameters["input_shape"])
@@ -3219,6 +3602,48 @@ def make_slice_tests(zip_path):
 
     return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=18)
+
+
+def make_conv2d_transpose_tests(zip_path):
+  """Make a set of tests to do transpose_conv."""
+
+  test_parameters = [{
+      "input_shape": [[1, 50, 54, 3]],
+      "filter_shape": [[1, 1, 8, 3], [1, 2, 8, 3], [1, 3, 8, 3], [1, 4, 8, 3]],
+      "output_shape": [[1, 100, 108, 8]],
+  }]
+
+  def build_graph(parameters):
+    """Build a transpose_conv graph given `parameters`."""
+    input_tensor = tf.placeholder(
+        dtype=tf.float32, name="input", shape=parameters["input_shape"])
+
+    filter_tensor = tf.placeholder(
+        dtype=tf.float32, name="filter", shape=parameters["filter_shape"])
+
+    out = tf.nn.conv2d_transpose(
+        input_tensor,
+        filter_tensor,
+        output_shape=parameters["output_shape"],
+        padding="SAME",
+        strides=(1, 2, 2, 1))
+
+    input_tensors = [input_tensor, filter_tensor]
+    return input_tensors, [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    values = [
+        create_tensor_data(np.float32, parameters["input_shape"]),
+        create_tensor_data(np.float32, parameters["filter_shape"])
+    ]
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
+
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
@@ -3469,7 +3894,12 @@ def make_pack_tests(zip_path):
     return all_values, sess.run(
         outputs, feed_dict=dict(zip(inputs, all_values)))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=72)
 
 
 def make_unpack_tests(zip_path):
@@ -3560,13 +3990,18 @@ def make_fill_tests(zip_path):
     return [input1, input2], sess.run(
         outputs, feed_dict=dict(zip(inputs, [input1, input2])))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_failures=12)
 
 
 def _make_logical_tests(op):
   """Make a set of tests to do logical operations."""
 
-  def logical(zip_path):
+  def logical(zip_path, expected_tf_failures=0):
     """Generate examples."""
     test_parameters = [{
         "input_shape_pair": [([], []), ([1, 1, 1, 3], [1, 1, 1, 3]),
@@ -3591,19 +4026,24 @@ def _make_logical_tests(op):
       return [input_value1, input_value2], sess.run(
           outputs, feed_dict=dict(zip(inputs, [input_value1, input_value2])))
 
-    make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+    make_zip_of_tests(
+        zip_path,
+        test_parameters,
+        build_graph,
+        build_inputs,
+        expected_tf_failures=expected_tf_failures)
 
   return logical
 
 
 def make_logical_or_tests(zip_path):
   """Make a set of tests to do logical_or."""
-  return _make_logical_tests(tf.logical_or)(zip_path)
+  return _make_logical_tests(tf.logical_or)(zip_path, expected_tf_failures=1)
 
 
 def make_logical_and_tests(zip_path):
   """Make a set of tests to do logical_and."""
-  return _make_logical_tests(tf.logical_and)(zip_path)
+  return _make_logical_tests(tf.logical_and)(zip_path, expected_tf_failures=1)
 
 
 def make_logical_xor_tests(zip_path):
@@ -3611,7 +4051,7 @@ def make_logical_xor_tests(zip_path):
 
     Test logical_not as well.
   """
-  return _make_logical_tests(tf.logical_xor)(zip_path)
+  return _make_logical_tests(tf.logical_xor)(zip_path, expected_tf_failures=1)
 
 
 def make_mirror_pad_tests(zip_path):
@@ -3660,6 +4100,12 @@ def make_mirror_pad_tests(zip_path):
           "mode": ["REFLECT"],
           "type": ["const"]
       },
+      {
+          "input_shape": [[3, 2, 4, 5]],
+          "padding_matrix": [[[1, 1], [2, 2], [1, 1], [1, 1]]],
+          "mode": ["SYMMETRIC"],
+          "type": ["placeholder"]
+      },
   ]
 
   def build_graph(parameters):
@@ -3688,18 +4134,23 @@ def make_mirror_pad_tests(zip_path):
     return input_values, sess.run(
         outputs, feed_dict=dict(zip(inputs, input_values)))
 
-  make_zip_of_tests(
-      zip_path,
-      test_parameters,
-      build_graph,
-      build_inputs,
-      expected_tf_success=7)
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
 def make_unroll_batch_matmul_tests(zip_path):
   """Make a set of tests to test unroll_batch_matmul."""
 
-  test_parameters = [{"dtype": [tf.float32], "shape": [[(2, 2, 3), (2, 3, 2)]]}]
+  test_parameters = [{
+      "dtype": [tf.float32],
+      "shape": [[(2, 2, 3), (2, 3, 2), False, False],
+                [(2, 2, 3), (2, 3, 2), True, True],
+                [(2, 2, 3), (2, 2, 3), False, True],
+                [(2, 2, 3), (2, 2, 3), True, False],
+                [(4, 2, 2, 3), (4, 2, 3, 2), False, False],
+                [(4, 2, 2, 3), (4, 2, 3, 2), True, True],
+                [(4, 2, 2, 3), (4, 2, 2, 3), False, True],
+                [(4, 2, 2, 3), (4, 2, 2, 3), True, False]]
+  }]
 
   def build_graph(parameters):
     """Build the batch_matmul op testing graph."""
@@ -3708,7 +4159,11 @@ def make_unroll_batch_matmul_tests(zip_path):
     input_tensor2 = tf.placeholder(
         dtype=parameters["dtype"], shape=parameters["shape"][1])
     # Should be unrolled and replaced with fully_connected ops in the end.
-    out = tf.matmul(input_tensor1, input_tensor2)
+    out = tf.matmul(
+        input_tensor1,
+        input_tensor2,
+        transpose_a=parameters["shape"][2],
+        transpose_b=parameters["shape"][3])
     return [input_tensor1, input_tensor2], [out]
 
   def build_inputs(parameters, sess, inputs, outputs):
@@ -3745,8 +4200,82 @@ def make_placeholder_with_default_tests(zip_path):
     return [input_value], sess.run(
         outputs, feed_dict=dict(zip(inputs, [input_value])))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs,
-                    expected_tf_success=3)
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_unique_tests(zip_path):
+  """Make a set of tests for Unique op."""
+
+  test_parameters = [
+      {
+          "input_shape": [[1]],
+          "index_type": [tf.int32, tf.int64, None],
+          "input_values": [3]
+      },
+      {
+          "input_shape": [[5]],
+          "index_type": [tf.int32, tf.int64],
+          "input_values": [[3, 2, 1, 2, 3]]
+      },
+      {
+          "input_shape": [[7]],
+          "index_type": [tf.int32, tf.int64],
+          "input_values": [[1, 1, 1, 1, 1, 1, 1]]
+      },
+      {
+          "input_shape": [[5]],
+          "index_type": [tf.int32, tf.int64],
+          "input_values": [[3, 2, 1, 0, -1]]
+      }]
+
+  def build_graph(parameters):
+    """Build the graph for the test case."""
+
+    input_tensor = tf.placeholder(
+        dtype=tf.int32, name="input", shape=parameters["input_shape"])
+    if parameters["index_type"] is None:
+      output = tf.unique(input_tensor)
+    else:
+      output = tf.unique(input_tensor, parameters["index_type"])
+
+    return [input_tensor], output
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_values = [create_tensor_data(tf.int32, parameters["input_shape"])]
+    return input_values, sess.run(
+        outputs, feed_dict=dict(zip(inputs, input_values)))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_reverse_v2_tests(zip_path):
+  """Make a set of tests to do reverse_v2."""
+
+  test_parameters = [{
+      "base_shape": [[3, 4, 3], [3, 4], [5, 6, 7, 8]],
+      "axis": [0, 1, 2, 3],
+  }]
+
+  def get_valid_axis(parameters):
+    """Return a tweaked version of 'axis'."""
+    axis = parameters["axis"]
+    shape = parameters["base_shape"][:]
+    while axis > len(shape) - 1:
+      axis -= 1
+    return axis
+
+  def build_graph(parameters):
+    input_tensor = tf.placeholder(
+        dtype=tf.float32, name=("input"), shape=parameters["base_shape"])
+    outs = tf.reverse(input_tensor, axis=[get_valid_axis(parameters)])
+    return [input_tensor], [outs]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_value = create_tensor_data(np.float32, shape=parameters["base_shape"])
+    return [input_value], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_value])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
 # Toco binary path provided by the generate rule.
diff --git a/tensorflow/lite/testing/generated_examples_zip_test.cc b/tensorflow/lite/testing/generated_examples_zip_test.cc
index a9a31ad088e6f4b0297ba313c585abbe6189728b..fb98cc9b1725f8295bb060ae60ceb151569616e6 100644
--- a/tensorflow/lite/testing/generated_examples_zip_test.cc
+++ b/tensorflow/lite/testing/generated_examples_zip_test.cc
@@ -102,9 +102,28 @@ std::map<string, string> kBrokenTests = {
     {R"(^\/add.*dtype=tf\.int64)", "119126484"},
     {R"(^\/floor_div.*dtype=tf\.int64)", "119126484"},
     {R"(^\/squared_difference.*dtype=tf\.int64)", "119126484"},
+};
 
-    // Strided Slice chooses the wrong dimension.
-    {R"(^\/strided_slice_buggy)", "119786029"},
+// Additional list of tests that are expected to fail when
+//   --test_arg=--ignore_known_bugs=false
+// and
+//   --test_arg=--use_nnapi=true
+// Note that issues related to lack of NNAPI support for a particular op are
+// handled separately; this list is specifically for broken cases where
+// execution produces broken output.
+// Key is a substring of the test name and value is a bug number.
+std::map<string, string> kBrokenNnapiTests = {
+    // Certain NNAPI kernels silently fail with int32 types.
+    {R"(^\/add.*dtype=tf\.int32)", "122987564"},
+    {R"(^\/concat.*dtype=tf\.int32)", "122987564"},
+    {R"(^\/mul.*dtype=tf\.int32)", "122987564"},
+    {R"(^\/space_to_depth.*dtype=tf\.int32)", "122987564"},
+
+    // Certain NNAPI fully_connected shape permutations fail.
+    {R"(^\/fully_connected_constant_filter=True.*shape1=\[3,3\])", "122987564"},
+    {R"(^\/fully_connected_constant_filter=True.*shape1=\[4,4\])", "122987564"},
+    {R"(^\/fully_connected.*shape1=\[3,3\].*transpose_b=True)", "122987564"},
+    {R"(^\/fully_connected.*shape1=\[4,4\].*shape2=\[4,1\])", "122987564"},
 };
 
 // Allows test data to be unarchived into a temporary directory and makes
@@ -242,8 +261,13 @@ TEST_P(OpsTest, RunZipTests) {
   tflite::testing::TfLiteDriver test_driver(FLAGS_use_nnapi);
   test_driver.SetModelBaseDir(tflite_dir);
 
+  auto broken_tests = kBrokenTests;
+  if (FLAGS_use_nnapi) {
+    broken_tests.insert(kBrokenNnapiTests.begin(), kBrokenNnapiTests.end());
+  }
+
   string bug_number;
-  for (const auto& p : kBrokenTests) {
+  for (const auto& p : broken_tests) {
     if (RE2::PartialMatch(test_name, p.first)) {
       bug_number = p.second;
     }
diff --git a/tensorflow/lite/testing/join.h b/tensorflow/lite/testing/join.h
index d1c314608687f045b346cc5526ea46c8149c2755..d10d2909b5ec4a269fd1a67d7a22f4c1e76f707e 100644
--- a/tensorflow/lite/testing/join.h
+++ b/tensorflow/lite/testing/join.h
@@ -24,7 +24,21 @@ limitations under the License.
 namespace tflite {
 namespace testing {
 
-// Join a list of data separated by delimiter.
+// Join a list of data with default precision separated by delimiter.
+template <typename T>
+string JoinDefault(T* data, size_t len, const string& delimiter) {
+  if (len == 0 || data == nullptr) {
+    return "";
+  }
+  std::stringstream result;
+  result << data[0];
+  for (int i = 1; i < len; i++) {
+    result << delimiter << data[i];
+  }
+  return result.str();
+}
+
+// Join a list of data with fixed precision separated by delimiter.
 template <typename T>
 string Join(T* data, size_t len, const string& delimiter) {
   if (len == 0 || data == nullptr) {
diff --git a/tensorflow/lite/testing/join_test.cc b/tensorflow/lite/testing/join_test.cc
index 0b3c07f37e14e3815ac1eb4acd0aefac3515064c..476a7f20591691ccddff6829c894c640608f6471 100644
--- a/tensorflow/lite/testing/join_test.cc
+++ b/tensorflow/lite/testing/join_test.cc
@@ -26,6 +26,11 @@ TEST(JoinTest, JoinInt) {
   EXPECT_EQ(Join(data.data(), data.size(), ","), "1,2,3");
 }
 
+TEST(JoinDefaultTest, JoinFloat) {
+  float data[] = {1.0, -3, 2.3, 1e-5};
+  EXPECT_EQ(JoinDefault(data, 4, " "), "1 -3 2.3 1e-05");
+}
+
 TEST(JoinTest, JoinFloat) {
   float data[] = {1.0, -3, 2.3, 1e-5};
   EXPECT_EQ(Join(data, 4, " "), "1 -3 2.29999995 9.99999975e-06");
diff --git a/tensorflow/lite/testing/kernel_test/BUILD b/tensorflow/lite/testing/kernel_test/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..c46e80cc360043158928544a54c0221a7b405ad0
--- /dev/null
+++ b/tensorflow/lite/testing/kernel_test/BUILD
@@ -0,0 +1,124 @@
+package(default_visibility = [
+    "//visibility:public",
+])
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cc_binary",
+    "tf_cc_test",
+)
+
+cc_library(
+    name = "util",
+    hdrs = ["util.h"],
+    deps = [
+        ":input_generator",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/testing:split",
+        "//tensorflow/lite/testing:tflite_driver",
+    ] + select({
+        "//conditions:default": [
+            "//tensorflow/core:framework_internal",
+            "//tensorflow/core:lib",
+        ],
+        "//tensorflow:android": [
+            "//tensorflow/core:android_tensorflow_lib",
+        ],
+    }),
+)
+
+tf_cc_test(
+    name = "util_test",
+    size = "small",
+    srcs = ["util_test.cc"],
+    data = [
+        "//tensorflow/lite:testdata/add.bin",
+        "//tensorflow/lite:testdata/test_input.csv",
+    ],
+    tags = [
+        "no_oss",
+    ],
+    deps = [
+        ":util",
+        "//tensorflow/lite/testing:tflite_driver",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+tf_cc_binary(
+    name = "tflite_kernel_runner",
+    srcs = ["tflite_kernel_runner.cc"],
+    deps = [
+        ":util",
+    ],
+)
+
+tf_cc_binary(
+    name = "generate_diff_report",
+    srcs = ["generate_diff_report.cc"],
+    deps = [
+        ":diff_analyzer",
+        "//tensorflow/core:framework_internal",
+    ],
+)
+
+cc_library(
+    name = "input_generator",
+    srcs = ["input_generator.cc"],
+    hdrs = ["input_generator.h"],
+    deps = [
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:string",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "//tensorflow/lite/testing:join",
+        "//tensorflow/lite/testing:split",
+    ],
+)
+
+tf_cc_test(
+    name = "input_generator_test",
+    size = "small",
+    srcs = ["input_generator_test.cc"],
+    data = [
+        "//tensorflow/lite:testdata/multi_add.bin",
+        "//tensorflow/lite:testdata/test_input.csv",
+    ],
+    tags = [
+        "no_oss",
+    ],
+    deps = [
+        ":input_generator",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+cc_library(
+    name = "diff_analyzer",
+    srcs = ["diff_analyzer.cc"],
+    hdrs = ["diff_analyzer.h"],
+    deps = [
+        "//tensorflow/lite:string",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/testing:split",
+    ],
+)
+
+tf_cc_test(
+    name = "diff_analyzer_test",
+    size = "small",
+    srcs = ["diff_analyzer_test.cc"],
+    data = [
+        "//tensorflow/lite:testdata/test_input.csv",
+    ],
+    tags = [
+        "no_oss",
+    ],
+    deps = [
+        ":diff_analyzer",
+        "//tensorflow/core:lib",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
diff --git a/tensorflow/lite/testing/kernel_test/diff_analyzer.cc b/tensorflow/lite/testing/kernel_test/diff_analyzer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7d6fcc80be17b4020f53dddb8215a083031fd501
--- /dev/null
+++ b/tensorflow/lite/testing/kernel_test/diff_analyzer.cc
@@ -0,0 +1,115 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/testing/kernel_test/diff_analyzer.h"
+
+#include <cmath>
+#include <fstream>
+#include "tensorflow/lite/testing/split.h"
+
+namespace tflite {
+namespace testing {
+
+namespace {
+float CalculateNormalizedMaxDiff(const std::vector<float>& base,
+                                 const std::vector<float>& test) {
+  float diff = 0;
+  // For numerical stability in case the tensor is all 0.
+  float base_max = 1e-6;
+
+  for (int i = 0; i < base.size(); i++) {
+    diff = std::max(diff, std::abs(base[i] - test[i]));
+    base_max = std::max(base_max, base[i]);
+  }
+
+  return diff / base_max;
+}
+
+float CalculateNormalizedL2Norm(const std::vector<float>& base,
+                                const std::vector<float>& test) {
+  float l2_error = 0;
+  // For numerical stability in case the tensor is all 0.
+  float base_max = 1e-6;
+
+  for (int i = 0; i < base.size(); i++) {
+    float diff = base[i] - test[i];
+    l2_error += diff * diff;
+    base_max = std::max(base_max, base[i]);
+  }
+
+  l2_error /= base.size();
+
+  return std::sqrt(l2_error) / base_max;
+}
+
+TfLiteStatus Populate(const string& filename,
+                      std::vector<std::vector<float>>* tensors) {
+  if (filename.empty()) {
+    fprintf(stderr, "Empty input file name.");
+    return kTfLiteError;
+  }
+
+  std::ifstream file(filename);
+  string content;
+  while (std::getline(file, content, '\n')) {
+    tensors->push_back(Split<float>(content, ","));
+  }
+
+  file.close();
+  return kTfLiteOk;
+}
+}  // namespace
+
+TfLiteStatus DiffAnalyzer::ReadFiles(const string& base, const string& test) {
+  TF_LITE_ENSURE_STATUS(Populate(base, &base_tensors_));
+  TF_LITE_ENSURE_STATUS(Populate(test, &test_tensors_));
+
+  if (base_tensors_.size() != test_tensors_.size()) {
+    fprintf(stderr, "Golden and test tensor dimensions don't match.");
+    return kTfLiteError;
+  }
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus DiffAnalyzer::WriteReport(const string& filename) {
+  if (filename.empty()) {
+    fprintf(stderr, "Empty output file name.");
+    return kTfLiteError;
+  }
+
+  std::ofstream output_file;
+  output_file.open(filename, std::fstream::out | std::fstream::trunc);
+  if (!output_file) {
+    fprintf(stderr, "Failed to open output file %s.", filename.c_str());
+    return kTfLiteError;
+  }
+
+  output_file << "Normalized L2 Error"
+              << ","
+              << "Normalized Max Diff"
+              << "\n";
+  for (int i = 0; i < base_tensors_.size(); i++) {
+    float l2_error =
+        CalculateNormalizedL2Norm(base_tensors_[i], test_tensors_[i]);
+    float max_diff =
+        CalculateNormalizedMaxDiff(base_tensors_[i], test_tensors_[i]);
+    output_file << l2_error << "," << max_diff << "\n";
+  }
+
+  output_file.close();
+  return kTfLiteOk;
+}
+}  // namespace testing
+}  // namespace tflite
diff --git a/tensorflow/lite/testing/kernel_test/diff_analyzer.h b/tensorflow/lite/testing/kernel_test/diff_analyzer.h
new file mode 100644
index 0000000000000000000000000000000000000000..aecbaea449bda3edd1e5176b9a91b4542afc64f3
--- /dev/null
+++ b/tensorflow/lite/testing/kernel_test/diff_analyzer.h
@@ -0,0 +1,42 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TESTING_KERNEL_TEST_DIFF_ANALYZER_H_
+#define TENSORFLOW_LITE_TESTING_KERNEL_TEST_DIFF_ANALYZER_H_
+
+#include <vector>
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/string.h"
+
+namespace tflite {
+namespace testing {
+
+// Reads the baseline and test files with output tensor values, and calculates
+// the diff metrics.
+class DiffAnalyzer {
+ public:
+  DiffAnalyzer() = default;
+  TfLiteStatus ReadFiles(const string& base, const string& test);
+  TfLiteStatus WriteReport(const string& filename);
+
+ private:
+  std::vector<std::vector<float>> base_tensors_;
+  std::vector<std::vector<float>> test_tensors_;
+};
+
+}  // namespace testing
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_TESTING_KERNEL_TEST_DIFF_ANALYZER_H_
diff --git a/tensorflow/lite/testing/kernel_test/diff_analyzer_test.cc b/tensorflow/lite/testing/kernel_test/diff_analyzer_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f92a3e6af756b932aa0a78ddd2ab5cfb48f9dc8b
--- /dev/null
+++ b/tensorflow/lite/testing/kernel_test/diff_analyzer_test.cc
@@ -0,0 +1,47 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/testing/kernel_test/diff_analyzer.h"
+
+#include <fstream>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/core/lib/io/path.h"
+
+namespace tflite {
+namespace testing {
+
+namespace {
+
+TEST(DiffAnalyzerTest, ZeroDiff) {
+  DiffAnalyzer diff_analyzer;
+  string filename = "third_party/tensorflow/lite/testdata/test_input.csv";
+  ASSERT_EQ(diff_analyzer.ReadFiles(filename, filename), kTfLiteOk);
+
+  string output_file =
+      tensorflow::io::JoinPath(FLAGS_test_tmpdir + "diff_report.csv");
+  ASSERT_EQ(diff_analyzer.WriteReport(output_file), kTfLiteOk);
+
+  std::string content;
+  std::ifstream file(output_file);
+  std::getline(file, content);
+  std::getline(file, content);
+  ASSERT_EQ(content, "0,0");
+}
+
+}  // namespace
+
+}  // namespace testing
+}  // namespace tflite
diff --git a/tensorflow/lite/testing/kernel_test/generate_diff_report.cc b/tensorflow/lite/testing/kernel_test/generate_diff_report.cc
new file mode 100644
index 0000000000000000000000000000000000000000..afa6a9a94ec2ffd824d66a363c53b69455706d06
--- /dev/null
+++ b/tensorflow/lite/testing/kernel_test/generate_diff_report.cc
@@ -0,0 +1,34 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <vector>
+
+#include "tensorflow/core/util/command_line_flags.h"
+#include "tensorflow/lite/testing/kernel_test/diff_analyzer.h"
+
+int main(int argc, char** argv) {
+  string base, test, output;
+  std::vector<tensorflow::Flag> flag_list = {
+      tensorflow::Flag("base", &base, "Path to the base serialized tensor."),
+      tensorflow::Flag("test", &test, "Path to the test serialized tensor."),
+      tensorflow::Flag("output", &output, "Path to the output file."),
+  };
+  tensorflow::Flags::Parse(&argc, argv, flag_list);
+
+  tflite::testing::DiffAnalyzer diff_analyzer;
+  diff_analyzer.ReadFiles(base, test);
+  diff_analyzer.WriteReport(output);
+  return 0;
+}
diff --git a/tensorflow/lite/testing/kernel_test/input_generator.cc b/tensorflow/lite/testing/kernel_test/input_generator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..897e18560d18584a4ba93957f372073de47b44d3
--- /dev/null
+++ b/tensorflow/lite/testing/kernel_test/input_generator.cc
@@ -0,0 +1,208 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/testing/kernel_test/input_generator.h"
+
+#include <fstream>
+#include <limits>
+#include <random>
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/testing/join.h"
+#include "tensorflow/lite/testing/split.h"
+
+namespace tflite {
+namespace testing {
+
+namespace {
+
+template <typename T>
+std::vector<T> GenerateRandomTensor(TfLiteIntArray* dims,
+                                    const std::function<T(int)>& random_func) {
+  int64_t num_elements = 1;
+  for (int i = 0; i < dims->size; i++) {
+    num_elements *= dims->data[i];
+  }
+
+  std::vector<T> result(num_elements);
+  for (int i = 0; i < num_elements; i++) {
+    result[i] = random_func(i);
+  }
+  return result;
+}
+
+template <typename T>
+std::vector<T> GenerateUniform(TfLiteIntArray* dims, float min, float max) {
+  auto random_float = [](float min, float max) {
+    // TODO(yunluli): Change seed for each invocation if needed.
+    static unsigned int seed;
+    return min + (max - min) * static_cast<float>(rand_r(&seed)) / RAND_MAX;
+  };
+
+  std::function<T(int)> random_t = [&](int) {
+    return static_cast<T>(random_float(min, max));
+  };
+  std::vector<T> data = GenerateRandomTensor(dims, random_t);
+  return data;
+}
+
+template <typename T>
+std::vector<T> GenerateGaussian(TfLiteIntArray* dims, float min, float max) {
+  auto random_float = [](float min, float max) {
+    static std::default_random_engine generator;
+    // We generate a float number within [0, 1) following a mormal distribution
+    // with mean = 0.5 and stddev = 1/3, and use it to scale the final random
+    // number into the desired range.
+    static std::normal_distribution<double> distribution(0.5, 1.0 / 3);
+    auto rand_n = distribution(generator);
+    while (rand_n < 0 || rand_n >= 1) {
+      rand_n = distribution(generator);
+    }
+
+    return min + (max - min) * static_cast<float>(rand_n);
+  };
+
+  std::function<T(int)> random_t = [&](int) {
+    return static_cast<T>(random_float(min, max));
+  };
+  std::vector<T> data = GenerateRandomTensor(dims, random_t);
+  return data;
+}
+
+}  // namespace
+
+TfLiteStatus InputGenerator::LoadModel(const string& model_dir) {
+  model_ = FlatBufferModel::BuildFromFile(model_dir.c_str());
+  if (!model_) {
+    fprintf(stderr, "Cannot load model %s", model_dir.c_str());
+    return kTfLiteError;
+  }
+
+  ::tflite::ops::builtin::BuiltinOpResolver builtin_ops;
+  InterpreterBuilder(*model_, builtin_ops)(&interpreter_);
+  if (!interpreter_) {
+    fprintf(stderr, "Failed to build interpreter.");
+    return kTfLiteError;
+  }
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus InputGenerator::ReadInputsFromFile(const string& filename) {
+  if (filename.empty()) {
+    fprintf(stderr, "Empty input file name.");
+    return kTfLiteError;
+  }
+
+  std::ifstream input_file(filename);
+  string input;
+  while (std::getline(input_file, input, '\n')) {
+    inputs_.push_back(input);
+  }
+  input_file.close();
+  return kTfLiteOk;
+}
+
+TfLiteStatus InputGenerator::WriteInputsToFile(const string& filename) {
+  if (filename.empty()) {
+    fprintf(stderr, "Empty input file name.");
+    return kTfLiteError;
+  }
+
+  std::ofstream output_file;
+  output_file.open(filename, std::fstream::out | std::fstream::trunc);
+  if (!output_file) {
+    fprintf(stderr, "Failed to open output file %s.", filename.c_str());
+    return kTfLiteError;
+  }
+
+  for (const auto& input : inputs_) {
+    output_file << input << "\n";
+  }
+  output_file.close();
+
+  return kTfLiteOk;
+}
+
+// TODO(yunluli): Support more tensor types when needed.
+TfLiteStatus InputGenerator::GenerateInput(const string& distribution) {
+  auto input_tensor_ids = interpreter_->inputs();
+  for (auto id : input_tensor_ids) {
+    auto* tensor = interpreter_->tensor(id);
+    if (distribution == "UNIFORM") {
+      switch (tensor->type) {
+        case kTfLiteInt8: {
+          auto data = GenerateUniform<int8_t>(
+              tensor->dims, std::numeric_limits<int8_t>::min(),
+              std::numeric_limits<int8_t>::max());
+          inputs_.push_back(Join(data.data(), data.size(), ","));
+          break;
+        }
+        case kTfLiteUInt8: {
+          auto data = GenerateUniform<uint8_t>(
+              tensor->dims, std::numeric_limits<uint8_t>::min(),
+              std::numeric_limits<uint8_t>::max());
+          inputs_.push_back(Join(data.data(), data.size(), ","));
+          break;
+        }
+        case kTfLiteFloat32: {
+          auto data = GenerateUniform<float>(tensor->dims, -1, 1);
+          inputs_.push_back(JoinDefault(data.data(), data.size(), ","));
+          break;
+        }
+        default:
+          fprintf(stderr, "Unsupported input tensor type %s.",
+                  TfLiteTypeGetName(tensor->type));
+          break;
+      }
+    } else if (distribution == "GAUSSIAN") {
+      switch (tensor->type) {
+        case kTfLiteInt8: {
+          auto data = GenerateGaussian<int8_t>(
+              tensor->dims, std::numeric_limits<int8_t>::min(),
+              std::numeric_limits<int8_t>::max());
+          inputs_.push_back(Join(data.data(), data.size(), ","));
+          break;
+        }
+        case kTfLiteUInt8: {
+          auto data = GenerateGaussian<uint8_t>(
+              tensor->dims, std::numeric_limits<uint8_t>::min(),
+              std::numeric_limits<uint8_t>::max());
+          inputs_.push_back(Join(data.data(), data.size(), ","));
+          break;
+        }
+        case kTfLiteFloat32: {
+          auto data = GenerateGaussian<float>(tensor->dims, -1, 1);
+          inputs_.push_back(JoinDefault(data.data(), data.size(), ","));
+          break;
+        }
+        default:
+          fprintf(stderr, "Unsupported input tensor type %s.",
+                  TfLiteTypeGetName(tensor->type));
+          break;
+      }
+    } else {
+      fprintf(stderr, "Unsupported distribution %s.", distribution.c_str());
+      return kTfLiteError;
+    }
+  }
+
+  return kTfLiteOk;
+}
+
+std::vector<string> InputGenerator::GetInputs() { return inputs_; }
+
+}  // namespace testing
+}  // namespace tflite
diff --git a/tensorflow/lite/testing/kernel_test/input_generator.h b/tensorflow/lite/testing/kernel_test/input_generator.h
new file mode 100644
index 0000000000000000000000000000000000000000..859c7068e5448c837580fe79e89918fbd34c2a66
--- /dev/null
+++ b/tensorflow/lite/testing/kernel_test/input_generator.h
@@ -0,0 +1,50 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TESTING_KERNEL_TEST_INPUT_GENERATOR_H_
+#define TENSORFLOW_LITE_TESTING_KERNEL_TEST_INPUT_GENERATOR_H_
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/string.h"
+
+namespace tflite {
+namespace testing {
+
+// Generate random input, or read input from a file for kernel diff test.
+// Needs to load the tflite graph to get information like tensor shape and
+// data type.
+class InputGenerator {
+ public:
+  InputGenerator() = default;
+  TfLiteStatus LoadModel(const string& model_dir);
+  TfLiteStatus ReadInputsFromFile(const string& filename);
+  TfLiteStatus GenerateInput(const string& distribution);
+  std::vector<string> GetInputs();
+  TfLiteStatus WriteInputsToFile(const string& filename);
+
+ private:
+  std::unique_ptr<FlatBufferModel> model_;
+  std::unique_ptr<Interpreter> interpreter_;
+  std::vector<string> inputs_;
+};
+
+}  // namespace testing
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_TESTING_KERNEL_TEST_INPUT_GENERATOR_H_
diff --git a/tensorflow/lite/testing/kernel_test/input_generator_test.cc b/tensorflow/lite/testing/kernel_test/input_generator_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b6bd8b94fbd0e8288f0f3beab4aa8a8f91563e90
--- /dev/null
+++ b/tensorflow/lite/testing/kernel_test/input_generator_test.cc
@@ -0,0 +1,81 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/testing/kernel_test/input_generator.h"
+
+#include <fstream>
+#include <map>
+
+#include <gmock/gmock.h>
+#include "testing/base/public/googletest.h"
+#include <gtest/gtest.h>
+
+namespace tflite {
+namespace testing {
+
+namespace {
+
+TEST(InputGeneratorTest, LoadModel) {
+  InputGenerator input_generator;
+  ASSERT_EQ(input_generator.LoadModel(
+                "third_party/tensorflow/lite/testdata/multi_add.bin"),
+            kTfLiteOk);
+}
+
+TEST(InputGeneratorTest, ReadWriteSimpleFile) {
+  InputGenerator input_generator;
+  ASSERT_EQ(input_generator.ReadInputsFromFile(
+                "third_party/tensorflow/lite/testdata/test_input.csv"),
+            kTfLiteOk);
+
+  std::vector<string> inputs;
+  std::string content = "1";
+  for (int i = 0; i < 1 * 8 * 8 * 3 - 1; i++) {
+    content.append(",1");
+  }
+  inputs.push_back(content);
+  ASSERT_EQ(input_generator.GetInputs(), inputs);
+
+  auto output_filename = FLAGS_test_tmpdir + "/out.csv";
+  ASSERT_EQ(input_generator.WriteInputsToFile(output_filename), kTfLiteOk);
+
+  std::ifstream in(output_filename);
+  std::string out;
+  std::getline(in, out, '\n');
+  ASSERT_EQ(out, content);
+}
+
+TEST(InputGeneratorTest, GenerateUniformInput) {
+  InputGenerator input_generator;
+  ASSERT_EQ(input_generator.LoadModel(
+                "third_party/tensorflow/lite/testdata/multi_add.bin"),
+            kTfLiteOk);
+  input_generator.GenerateInput("UNIFORM");
+  auto inputs = input_generator.GetInputs();
+  ASSERT_EQ(inputs.size(), 4);
+}
+
+TEST(InputGeneratorTest, GenerateGaussianInput) {
+  InputGenerator input_generator;
+  ASSERT_EQ(input_generator.LoadModel(
+                "third_party/tensorflow/lite/testdata/multi_add.bin"),
+            kTfLiteOk);
+  input_generator.GenerateInput("GAUSSIAN");
+  auto inputs = input_generator.GetInputs();
+  ASSERT_EQ(inputs.size(), 4);
+}
+
+}  // namespace
+}  // namespace testing
+}  // namespace tflite
diff --git a/tensorflow/lite/testing/kernel_test/tflite_kernel_runner.cc b/tensorflow/lite/testing/kernel_test/tflite_kernel_runner.cc
new file mode 100644
index 0000000000000000000000000000000000000000..34c1728ed1da6ec962989479dccfdc64bc8ca6cd
--- /dev/null
+++ b/tensorflow/lite/testing/kernel_test/tflite_kernel_runner.cc
@@ -0,0 +1,32 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/testing/kernel_test/util.h"
+
+int main(int argc, char** argv) {
+  tflite::testing::kernel_test::TestOptions options =
+      tflite::testing::kernel_test::ParseTfliteKernelTestFlags(&argc, argv);
+  const bool run_reference_kernel = options.kernel_type == "REFERENCE";
+  const bool use_nnapi = options.kernel_type == "NNAPI";
+
+  auto runner = absl::make_unique<tflite::testing::TfLiteDriver>(
+      use_nnapi, "", run_reference_kernel);
+  if (tflite::testing::kernel_test::RunKernelTest(options, runner.get()) ==
+      kTfLiteOk) {
+    return 0;
+  }
+
+  return -1;
+}
diff --git a/tensorflow/lite/testing/kernel_test/util.h b/tensorflow/lite/testing/kernel_test/util.h
new file mode 100644
index 0000000000000000000000000000000000000000..d940e5ad12f497ec827ce0dc6be9e6311078b1a9
--- /dev/null
+++ b/tensorflow/lite/testing/kernel_test/util.h
@@ -0,0 +1,122 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TESTING_KERNEL_TEST_UTIL_H_
+#define TENSORFLOW_LITE_TESTING_KERNEL_TEST_UTIL_H_
+
+#include <fstream>
+
+#include "tensorflow/core/util/command_line_flags.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/testing/kernel_test/input_generator.h"
+#include "tensorflow/lite/testing/split.h"
+#include "tensorflow/lite/testing/tflite_driver.h"
+
+namespace tflite {
+namespace testing {
+namespace kernel_test {
+
+struct TestOptions {
+  // Path of tensorflow lite model.
+  string tflite_model;
+  // Path of the input file. If empty, generate at runtime.
+  string read_input_from_file;
+  // Path to dump the input file.
+  string dump_input_to_file;
+  // Path to dump the output.
+  string dump_output_to_file;
+  // Input distribution.
+  string input_distribution;
+  // Kernel type.
+  string kernel_type;
+};
+
+TestOptions ParseTfliteKernelTestFlags(int* argc, char** argv) {
+  TestOptions options;
+  std::vector<tensorflow::Flag> flags = {
+      tensorflow::Flag("tflite_model", &options.tflite_model,
+                       "Path of tensorflow lite model."),
+      tensorflow::Flag("read_input_from_file", &options.read_input_from_file,
+                       "File to read input data from. If empty, generates "
+                       "input at runtime."),
+      tensorflow::Flag("dump_input_to_file", &options.dump_input_to_file,
+                       "File to dump randomly generated input."),
+      tensorflow::Flag("dump_output_to_file", &options.dump_output_to_file,
+                       "File to dump output."),
+      tensorflow::Flag("input_distribution", &options.input_distribution,
+                       "Input distribution. Default: Gaussian."),
+      tensorflow::Flag("kernel_type", &options.kernel_type, "Kernel type."),
+  };
+
+  tensorflow::Flags::Parse(argc, argv, flags);
+
+  return options;
+}
+
+TfLiteStatus RunKernelTest(const kernel_test::TestOptions& options,
+                           TestRunner* runner) {
+  InputGenerator input_generator;
+
+  if (options.read_input_from_file.empty()) {
+    TF_LITE_ENSURE_STATUS(input_generator.LoadModel(options.tflite_model));
+    TF_LITE_ENSURE_STATUS(
+        input_generator.GenerateInput(options.input_distribution));
+  } else {
+    TF_LITE_ENSURE_STATUS(
+        input_generator.ReadInputsFromFile(options.read_input_from_file));
+  }
+
+  runner->LoadModel(options.tflite_model);
+  runner->AllocateTensors();
+  if (!runner->IsValid()) return kTfLiteError;
+  auto input_tensor_ids = runner->GetInputs();
+  auto inputs = input_generator.GetInputs();
+  if (inputs.size() != input_tensor_ids.size()) {
+    fprintf(stderr,
+            "Number of input tensors generated doesn't match what the model "
+            "asks for.");
+  }
+  for (int i = 0; i < inputs.size(); i++) {
+    runner->SetInput(input_tensor_ids[i], inputs[i]);
+  }
+
+  runner->Invoke();
+
+  if (!options.dump_input_to_file.empty()) {
+    TF_LITE_ENSURE_STATUS(
+        input_generator.WriteInputsToFile(options.dump_input_to_file));
+  }
+
+  if (!options.dump_output_to_file.empty()) {
+    std::ofstream output_file;
+    output_file.open(options.dump_output_to_file,
+                     std::fstream::out | std::fstream::trunc);
+    if (!output_file) {
+      return kTfLiteError;
+    }
+
+    for (auto id : runner->GetOutputs()) {
+      output_file << runner->ReadOutput(id) << "\n";
+    }
+    output_file.close();
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace kernel_test
+}  // namespace testing
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_TESTING_KERNEL_TEST_UTIL_H_
diff --git a/tensorflow/lite/testing/kernel_test/util_test.cc b/tensorflow/lite/testing/kernel_test/util_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..751c77e13621e338e4aba628d36950956c326593
--- /dev/null
+++ b/tensorflow/lite/testing/kernel_test/util_test.cc
@@ -0,0 +1,52 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/testing/kernel_test/util.h"
+
+#include <fstream>
+#include <memory>
+
+#include <gmock/gmock.h>
+#include "testing/base/public/googletest.h"
+#include <gtest/gtest.h>
+#include "tensorflow/lite/testing/tflite_driver.h"
+
+namespace tflite {
+namespace testing {
+namespace kernel_test {
+namespace {
+
+TEST(UtilTest, SimpleE2ETest) {
+  TestOptions options;
+  options.tflite_model = "third_party/tensorflow/lite/testdata/add.bin";
+  options.read_input_from_file =
+      "third_party/tensorflow/lite/testdata/test_input.csv";
+  options.dump_output_to_file = FLAGS_test_tmpdir + "/test_out.csv";
+  options.kernel_type = "REFERENCE";
+  std::unique_ptr<TestRunner> runner(new TfLiteDriver(false, "", true));
+  RunKernelTest(options, runner.get());
+  std::string expected = "3";
+  for (int i = 0; i < 1 * 8 * 8 * 3 - 1; i++) {
+    expected.append(",3");
+  }
+  std::string content;
+  std::ifstream file(options.dump_output_to_file);
+  std::getline(file, content);
+  EXPECT_EQ(content, expected);
+}
+
+}  // namespace
+}  // namespace kernel_test
+}  // namespace testing
+}  // namespace tflite
diff --git a/tensorflow/lite/testing/nnapi_example.cc b/tensorflow/lite/testing/nnapi_example.cc
index 22df8dbd8821436ab9a960d0acd4423278c078d8..309cb19628cd54a39ea926a6f3506cf570ff3679 100644
--- a/tensorflow/lite/testing/nnapi_example.cc
+++ b/tensorflow/lite/testing/nnapi_example.cc
@@ -25,11 +25,14 @@ limitations under the License.
 #include <fstream>
 #include <iostream>
 #include <sstream>
-#include "tensorflow/lite/nnapi/NeuralNetworksShim.h"
+#include <string>
+#include "tensorflow/lite/nnapi/nnapi_implementation.h"
 #include "tensorflow/lite/testing/parse_testdata.h"
 #include "tensorflow/lite/testing/tflite_driver.h"
 
-string dirname(const string& s) { return s.substr(0, s.find_last_of("/")); }
+std::string dirname(const std::string& s) {
+  return s.substr(0, s.find_last_of("/"));
+}
 
 bool Interpret(const char* examples_filename, bool use_nnapi) {
   std::ifstream tflite_stream(examples_filename);
@@ -65,14 +68,14 @@ int main(int argc, char* argv[]) {
     return 1;
   }
 
-  string base_dir = dirname(argv[1]);
+  std::string base_dir = dirname(argv[1]);
   DIR* dir = opendir(base_dir.c_str());
   if (dir == nullptr) {
     fprintf(stderr, "Can't open dir %s\n", base_dir.c_str());
     return 1;
   }
   while (struct dirent* ent = readdir(dir)) {
-    string name = ent->d_name;
+    std::string name = ent->d_name;
     if (name.rfind(".txt") == name.length() - 4) {
       printf("%s: ", name.c_str());
       if (Interpret((base_dir + "/" + name).c_str(), use_nnapi)) {
diff --git a/tensorflow/lite/testing/string_util.cc b/tensorflow/lite/testing/string_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cf9d5087644cc52415a83dd80b457249b85765b5
--- /dev/null
+++ b/tensorflow/lite/testing/string_util.cc
@@ -0,0 +1,45 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+
+#include "tensorflow/lite/testing/string_util.h"
+
+#include "absl/strings/escaping.h"
+#include "tensorflow/lite/python/interpreter_wrapper/numpy.h"
+#include "tensorflow/lite/python/interpreter_wrapper/python_utils.h"
+#include "tensorflow/lite/string_util.h"
+
+namespace tflite {
+namespace testing {
+namespace python {
+
+PyObject* SerializeAsHexString(PyObject* value) {
+  DynamicBuffer dynamic_buffer;
+  if (!python_utils::FillStringBufferWithPyArray(value, &dynamic_buffer)) {
+    return nullptr;
+  }
+
+  char* char_buffer = nullptr;
+  size_t size = dynamic_buffer.WriteToBuffer(&char_buffer);
+  string s = absl::BytesToHexString({char_buffer, size});
+  free(char_buffer);
+
+  return python_utils::ConvertToPyString(s.data(), s.size());
+}
+
+}  // namespace python
+}  // namespace testing
+}  // namespace tflite
diff --git a/tensorflow/lite/testing/string_util.h b/tensorflow/lite/testing/string_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..56c024d918df37641c12851a2a02187d12e03b7d
--- /dev/null
+++ b/tensorflow/lite/testing/string_util.h
@@ -0,0 +1,33 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TESTING_STRING_UTIL_H_
+#define TENSORFLOW_LITE_TESTING_STRING_UTIL_H_
+
+#include <Python.h>
+#include <string>
+
+namespace tflite {
+namespace testing {
+namespace python {
+
+// Take a python string array, convert it to TF Lite dynamic buffer format and
+// serialize it as a HexString.
+PyObject* SerializeAsHexString(PyObject* value);
+
+}  // namespace python
+}  // namespace testing
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_TESTING_STRING_UTIL_H_
diff --git a/tensorflow/lite/testing/string_util.i b/tensorflow/lite/testing/string_util.i
new file mode 100644
index 0000000000000000000000000000000000000000..574abb79653ff858721e28d0d33225e3e24cbbfd
--- /dev/null
+++ b/tensorflow/lite/testing/string_util.i
@@ -0,0 +1,31 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+%{
+
+#define SWIG_FILE_WITH_INIT
+#include "tensorflow/lite/testing/string_util.h"
+
+%}
+
+namespace tflite {
+namespace testing {
+namespace python {
+
+PyObject* SerializeAsHexString(PyObject* string_tensor);
+
+}  // namespace python
+}  // namespace testing
+}  // namespace tflite
diff --git a/tensorflow/lite/testing/tf_driver_test.cc b/tensorflow/lite/testing/tf_driver_test.cc
index 363d162d56a1670821d29768bc36411bf22d61e9..e79704d616cf59585228851b91c2e93259d84c0b 100644
--- a/tensorflow/lite/testing/tf_driver_test.cc
+++ b/tensorflow/lite/testing/tf_driver_test.cc
@@ -93,7 +93,7 @@ TEST(TfDriverTest, SimpleTest) {
                    {"1,8,8,3", "1,8,8,3", "1,8,8,3", "1,8,8,3"}, {"x", "y"}));
 
   runner->LoadModel(
-      "third_party/tensorflow/lite/testdata/multi_add.pb");
+      "tensorflow/lite/testdata/multi_add.pb");
   EXPECT_TRUE(runner->IsValid()) << runner->GetErrorMessage();
 
   ASSERT_THAT(runner->GetInputs(), ElementsAre(0, 1, 2, 3));
diff --git a/tensorflow/lite/testing/tflite_driver.cc b/tensorflow/lite/testing/tflite_driver.cc
index 4e11d49f252818f9f7024b8bbafa8b17ad77ad48..55670858338bda0bfe04828c33da6c64982a6656 100644
--- a/tensorflow/lite/testing/tflite_driver.cc
+++ b/tensorflow/lite/testing/tflite_driver.cc
@@ -19,7 +19,10 @@ limitations under the License.
 #include "absl/strings/escaping.h"
 #include "tensorflow/lite/builtin_op_data.h"
 #include "tensorflow/lite/delegates/flex/delegate.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/register_ref.h"
 #include "tensorflow/lite/string_util.h"
+#include "tensorflow/lite/testing/join.h"
 #include "tensorflow/lite/testing/split.h"
 
 namespace tflite {
@@ -77,32 +80,7 @@ class TfLiteDriver::Expectation {
     SetTensorData(values, &data_);
   }
 
-  template <>
-  void SetData<string>(const string& csv_values) {
-    string s = absl::HexStringToBytes(csv_values);
-    data_.raw = new char[s.size()];
-    memcpy(data_.raw, s.data(), s.size());
-  }
-
-  bool Check(bool verbose, const TfLiteTensor& tensor) {
-    switch (tensor.type) {
-      case kTfLiteFloat32:
-        return TypedCheck<float>(verbose, tensor);
-      case kTfLiteInt32:
-        return TypedCheck<int32_t>(verbose, tensor);
-      case kTfLiteInt64:
-        return TypedCheck<int64_t>(verbose, tensor);
-      case kTfLiteUInt8:
-        return TypedCheck<uint8_t>(verbose, tensor);
-      case kTfLiteBool:
-        return TypedCheck<bool>(verbose, tensor);
-      case kTfLiteString:
-        return TypedCheck<string>(verbose, tensor);
-      default:
-        fprintf(stderr, "Unsupported type %d in Check\n", tensor.type);
-        return false;
-    }
-  }
+  bool Check(bool verbose, const TfLiteTensor& tensor);
 
  private:
   template <typename T>
@@ -144,52 +122,87 @@ class TfLiteDriver::Expectation {
     return good_output;
   }
 
-  template <>
-  bool TypedCheck<string>(bool verbose, const TfLiteTensor& tensor) {
-    if (tensor.data.raw == nullptr) {
+  TfLitePtrUnion data_;
+  size_t num_elements_;
+};
+
+template <>
+void TfLiteDriver::Expectation::SetData<string>(const string& csv_values) {
+  string s = absl::HexStringToBytes(csv_values);
+  data_.raw = new char[s.size()];
+  memcpy(data_.raw, s.data(), s.size());
+}
+
+template <>
+bool TfLiteDriver::Expectation::TypedCheck<string>(bool verbose,
+                                                   const TfLiteTensor& tensor) {
+  if (tensor.data.raw == nullptr) {
+    if (verbose) {
+      std::cerr << "  got empty string" << std::endl;
+    }
+    return false;
+  }
+  int expected_num_strings = GetStringCount(data_.raw);
+  int returned_num_strings = GetStringCount(tensor.data.raw);
+  if (expected_num_strings != returned_num_strings) {
+    if (verbose) {
+      std::cerr << "  string count differ: got " << returned_num_strings
+                << ", but expected " << expected_num_strings << std::endl;
+    }
+    return false;
+  }
+  for (int i = 0; i < returned_num_strings; ++i) {
+    auto expected_ref = GetString(data_.raw, i);
+    auto returned_ref = GetString(tensor.data.raw, i);
+    if (expected_ref.len != returned_ref.len) {
       if (verbose) {
-        std::cerr << "  got empty string" << std::endl;
+        std::cerr << "  index " << i << ": got string of size "
+                  << returned_ref.len << ", but expected size "
+                  << expected_ref.len << std::endl;
       }
       return false;
     }
-    int expected_num_strings = GetStringCount(data_.raw);
-    int returned_num_strings = GetStringCount(tensor.data.raw);
-    if (expected_num_strings != returned_num_strings) {
+    if (strncmp(expected_ref.str, returned_ref.str, returned_ref.len) != 0) {
       if (verbose) {
-        std::cerr << "  string count differ: got " << returned_num_strings
-                  << ", but expected " << expected_num_strings << std::endl;
+        std::cerr << "  index " << i << ": strings are different" << std::endl;
       }
       return false;
     }
-    for (int i = 0; i < returned_num_strings; ++i) {
-      auto expected_ref = GetString(data_.raw, i);
-      auto returned_ref = GetString(tensor.data.raw, i);
-      if (expected_ref.len != returned_ref.len) {
-        if (verbose) {
-          std::cerr << "  index " << i << ": got string of size "
-                    << returned_ref.len << ", but expected size "
-                    << expected_ref.len << std::endl;
-        }
-        return false;
-      }
-      if (strncmp(expected_ref.str, returned_ref.str, returned_ref.len) != 0) {
-        if (verbose) {
-          std::cerr << "  index " << i << ": strings are different"
-                    << std::endl;
-        }
-        return false;
-      }
-    }
-
-    return true;
   }
 
-  TfLitePtrUnion data_;
-  size_t num_elements_;
-};
+  return true;
+}
 
-TfLiteDriver::TfLiteDriver(bool use_nnapi, const string& delegate_name)
+bool TfLiteDriver::Expectation::Check(bool verbose,
+                                      const TfLiteTensor& tensor) {
+  switch (tensor.type) {
+    case kTfLiteFloat32:
+      return TypedCheck<float>(verbose, tensor);
+    case kTfLiteInt32:
+      return TypedCheck<int32_t>(verbose, tensor);
+    case kTfLiteInt64:
+      return TypedCheck<int64_t>(verbose, tensor);
+    case kTfLiteUInt8:
+      return TypedCheck<uint8_t>(verbose, tensor);
+    case kTfLiteBool:
+      return TypedCheck<bool>(verbose, tensor);
+    case kTfLiteString:
+      return TypedCheck<string>(verbose, tensor);
+    default:
+      fprintf(stderr, "Unsupported type %d in Check\n", tensor.type);
+      return false;
+  }
+}
+
+TfLiteDriver::TfLiteDriver(bool use_nnapi, const string& delegate_name,
+                           bool reference_kernel)
     : use_nnapi_(use_nnapi) {
+  if (reference_kernel) {
+    resolver_.reset(new ops::builtin::BuiltinRefOpResolver);
+  } else {
+    resolver_.reset(new ops::builtin::BuiltinOpResolver);
+  }
+
   if (delegate_name == "FLEX") {
     delegate_ = FlexDelegate::Create();
   }
@@ -221,8 +234,7 @@ void TfLiteDriver::LoadModel(const string& bin_file_path) {
     Invalidate("Failed to mmap model " + bin_file_path);
     return;
   }
-  ops::builtin::BuiltinOpResolver builtins;
-  InterpreterBuilder(*model_, builtins)(&interpreter_);
+  InterpreterBuilder(*model_, *resolver_)(&interpreter_);
   if (!interpreter_) {
     Invalidate("Failed build interpreter");
     return;
@@ -372,5 +384,34 @@ void TfLiteDriver::ResetLSTMStateTensors() {
   interpreter_->ResetVariableTensors();
 }
 
+string TfLiteDriver::ReadOutput(int id) {
+  auto* tensor = interpreter_->tensor(id);
+  int num_elements = 1;
+
+  for (int i = 0; i < tensor->dims->size; ++i) {
+    num_elements *= tensor->dims->data[i];
+  }
+
+  switch (tensor->type) {
+    case kTfLiteFloat32:
+      return JoinDefault(tensor->data.f, num_elements, ",");
+    case kTfLiteInt32:
+      return JoinDefault(tensor->data.i32, num_elements, ",");
+    case kTfLiteInt64:
+      return JoinDefault(tensor->data.i64, num_elements, ",");
+    case kTfLiteUInt8:
+      return Join(tensor->data.uint8, num_elements, ",");
+    case kTfLiteInt8:
+      return JoinDefault(tensor->data.int8, num_elements, ",");
+    case kTfLiteBool:
+      return JoinDefault(tensor->data.b, num_elements, ",");
+    default:
+      Invalidate(absl::StrCat("Unsupported tensor type ",
+                              TfLiteTypeGetName(tensor->type),
+                              " in TfLiteDriver::ReadOutput"));
+      return "";
+  }
+}
+
 }  // namespace testing
 }  // namespace tflite
diff --git a/tensorflow/lite/testing/tflite_driver.h b/tensorflow/lite/testing/tflite_driver.h
index 1da0533c57cf51f442253f28b6d9ba13078ef9a7..3cce6c4222ec36f5eac2f144062b5b850c326345 100644
--- a/tensorflow/lite/testing/tflite_driver.h
+++ b/tensorflow/lite/testing/tflite_driver.h
@@ -16,10 +16,12 @@ limitations under the License.
 #define TENSORFLOW_LITE_TESTING_TFLITE_DRIVER_H_
 
 #include <map>
+#include <memory>
 
 #include "tensorflow/lite/delegates/flex/delegate.h"
 #include "tensorflow/lite/interpreter.h"
 #include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/register_ref.h"
 #include "tensorflow/lite/model.h"
 #include "tensorflow/lite/testing/test_runner.h"
 
@@ -29,7 +31,8 @@ namespace testing {
 // A test runner that feeds inputs into TF Lite and verifies its outputs.
 class TfLiteDriver : public TestRunner {
  public:
-  explicit TfLiteDriver(bool use_nnapi, const string& delegate = "");
+  explicit TfLiteDriver(bool use_nnapi, const string& delegate = "",
+                        bool reference_kernel = false);
   ~TfLiteDriver() override;
 
   void LoadModel(const string& bin_file_path) override;
@@ -46,7 +49,7 @@ class TfLiteDriver : public TestRunner {
   void SetExpectation(int id, const string& csv_values) override;
   void Invoke() override;
   bool CheckResults() override;
-  string ReadOutput(int id) override { return "no-op"; }
+  string ReadOutput(int id) override;
 
  private:
   void DeallocateStringTensor(TfLiteTensor* t) {
@@ -65,6 +68,7 @@ class TfLiteDriver : public TestRunner {
 
   class Expectation;
 
+  std::unique_ptr<OpResolver> resolver_;
   std::unique_ptr<FlexDelegate> delegate_;
   bool use_nnapi_ = false;
   std::unique_ptr<FlatBufferModel> model_;
diff --git a/tensorflow/lite/testing/tflite_driver_test.cc b/tensorflow/lite/testing/tflite_driver_test.cc
index 6e953e5e19b8f6cac1a4349145b03a7f8b5e1969..e80816bdf5ecd21d4f147e824188dd3a206d68dd 100644
--- a/tensorflow/lite/testing/tflite_driver_test.cc
+++ b/tensorflow/lite/testing/tflite_driver_test.cc
@@ -54,6 +54,44 @@ TEST(TfliteDriverTest, SimpleTest) {
   ASSERT_TRUE(runner->IsValid());
 
   ASSERT_TRUE(runner->CheckResults());
+  EXPECT_EQ(runner->ReadOutput(5), "0.101,0.202,0.303,0.404");
+  EXPECT_EQ(runner->ReadOutput(6), "0.011,0.022,0.033,0.044");
+}
+
+TEST(TfliteDriverTest, SingleAddOpTest) {
+  std::unique_ptr<TestRunner> runner(new TfLiteDriver(
+      /*use_nnapi*/ false, /*delegate*/ "", /*reference_kernel*/ true));
+
+  runner->SetModelBaseDir("tensorflow/lite");
+  runner->LoadModel("testdata/multi_add.bin");
+  ASSERT_TRUE(runner->IsValid());
+
+  ASSERT_THAT(runner->GetInputs(), ElementsAre(0, 1, 2, 3));
+  ASSERT_THAT(runner->GetOutputs(), ElementsAre(5, 6));
+
+  for (int i : {0, 1, 2, 3}) {
+    runner->ReshapeTensor(i, "1,2,2,1");
+  }
+  ASSERT_TRUE(runner->IsValid());
+
+  runner->AllocateTensors();
+
+  runner->SetInput(0, "0.1,0.2,0.3,0.4");
+  runner->SetInput(1, "0.001,0.002,0.003,0.004");
+  runner->SetInput(2, "0.001,0.002,0.003,0.004");
+  runner->SetInput(3, "0.01,0.02,0.03,0.04");
+
+  runner->ResetTensor(2);
+
+  runner->SetExpectation(5, "0.101,0.202,0.303,0.404");
+  runner->SetExpectation(6, "0.011,0.022,0.033,0.044");
+
+  runner->Invoke();
+  ASSERT_TRUE(runner->IsValid());
+
+  ASSERT_TRUE(runner->CheckResults());
+  EXPECT_EQ(runner->ReadOutput(5), "0.101,0.202,0.303,0.404");
+  EXPECT_EQ(runner->ReadOutput(6), "0.011,0.022,0.033,0.044");
 }
 
 }  // namespace
diff --git a/tensorflow/lite/tflite_exported_symbols.lds b/tensorflow/lite/tflite_exported_symbols.lds
new file mode 100644
index 0000000000000000000000000000000000000000..b145204aa1e2b039aa7075047b1fd9ca73157320
--- /dev/null
+++ b/tensorflow/lite/tflite_exported_symbols.lds
@@ -0,0 +1,3 @@
+*TfLite*
+*tflite*
+*TFL_*
diff --git a/tensorflow/lite/tflite_version_script.lds b/tensorflow/lite/tflite_version_script.lds
new file mode 100644
index 0000000000000000000000000000000000000000..1df70705ebf4a85d2f4c9f2301c53d48e623dff7
--- /dev/null
+++ b/tensorflow/lite/tflite_version_script.lds
@@ -0,0 +1,8 @@
+VERS_1.0 {
+  global:
+    *TfLite*;
+    *tflite*;
+    *TFL_*;
+  local:
+    *;
+};
diff --git a/tensorflow/lite/toco/BUILD b/tensorflow/lite/toco/BUILD
index 93d41fcae14c8130de87471bdce64edad131c11f..c477e2f4c089d8165186671ea5f6cd703297c7fe 100644
--- a/tensorflow/lite/toco/BUILD
+++ b/tensorflow/lite/toco/BUILD
@@ -133,6 +133,7 @@ cc_library(
 cc_library(
     name = "model_cmdline_flags",
     srcs = [
+        "args.cc",
         "model_cmdline_flags.cc",
     ],
     hdrs = [
@@ -192,6 +193,7 @@ cc_library(
         "graph_transformations/fuse_binary_into_preceding_affine.cc",
         "graph_transformations/fuse_broadcast_into_following_binary.cc",
         "graph_transformations/graph_transformations.cc",
+        "graph_transformations/group_bidirectional_sequence_ops.cc",
         "graph_transformations/hardcode_min_max.cc",
         "graph_transformations/identify_dilated_conv.cc",
         "graph_transformations/identify_l2_normalization.cc",
@@ -342,13 +344,15 @@ tf_cc_test(
     name = "import_tensorflow_test",
     srcs = ["import_tensorflow_test.cc"],
     deps = [
+        ":toco_port",
         ":toco_tooling",
         "//tensorflow/core:framework",
         "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:ops",
         "//tensorflow/core:protos_all_cc",
-        "@com_google_googletest//:gtest_main",
+        "//tensorflow/lite/testing:util",
+        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -374,6 +378,7 @@ cc_library(
         ":types_proto_cc",
         "//tensorflow/core:lib",
         "//tensorflow/lite/kernels/internal:types",
+        "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_googlesource_code_re2//:re2",
         "@protobuf_archive//:protobuf_headers",
@@ -385,9 +390,11 @@ tf_cc_test(
     srcs = ["tooling_util_test.cc"],
     deps = [
         ":model",
+        ":toco_port",
         ":tooling_util",
         "//tensorflow/core:lib",
-        "@com_google_googletest//:gtest_main",
+        "//tensorflow/lite/testing:util",
+        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -451,12 +458,13 @@ tf_cc_test(
         ":toco_port",
         ":toco_tooling",
         ":types_proto_cc",
-        "@com_google_googletest//:gtest_main",
+        "@com_google_googletest//:gtest",
         "@com_google_absl//absl/strings",
         "//tensorflow/core:lib",
         # We cannot embed the core:ops dependency directly into :toco_tooling as
         # it can conflict with downstream deps when toco is used as a library.
         "//tensorflow/core:ops",
+        "//tensorflow/lite/testing:util",
     ],
 )
 
@@ -468,6 +476,20 @@ tf_cc_test(
     ],
     deps = [
         ":toco_port",
-        "@com_google_googletest//:gtest_main",
+        "//tensorflow/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "model_cmdline_flags_test",
+    srcs = [
+        "model_cmdline_flags_test.cc",
+    ],
+    deps = [
+        ":model_cmdline_flags",
+        ":model_flags_proto_cc",
+        "//tensorflow/lite/testing:util",
+        "@com_google_googletest//:gtest",
     ],
 )
diff --git a/tensorflow/lite/toco/args.cc b/tensorflow/lite/toco/args.cc
new file mode 100644
index 0000000000000000000000000000000000000000..da8debc49a697fb77832c93940b60c0bebe1a7f9
--- /dev/null
+++ b/tensorflow/lite/toco/args.cc
@@ -0,0 +1,169 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/toco/args.h"
+#include "absl/strings/str_split.h"
+
+namespace toco {
+namespace {
+
+// Helper class for SplitStructuredLine parsing.
+class ClosingSymbolLookup {
+ public:
+  explicit ClosingSymbolLookup(const char* symbol_pairs)
+      : closing_(), valid_closing_() {
+    // Initialize the opening/closing arrays.
+    for (const char* symbol = symbol_pairs; *symbol != 0; ++symbol) {
+      unsigned char opening = *symbol;
+      ++symbol;
+      // If the string ends before the closing character has been found,
+      // use the opening character as the closing character.
+      unsigned char closing = *symbol != 0 ? *symbol : opening;
+      closing_[opening] = closing;
+      valid_closing_[closing] = true;
+      if (*symbol == 0) break;
+    }
+  }
+
+  ClosingSymbolLookup(const ClosingSymbolLookup&) = delete;
+  ClosingSymbolLookup& operator=(const ClosingSymbolLookup&) = delete;
+
+  // Returns the closing character corresponding to an opening one,
+  // or 0 if the argument is not an opening character.
+  char GetClosingChar(char opening) const {
+    return closing_[static_cast<unsigned char>(opening)];
+  }
+
+  // Returns true if the argument is a closing character.
+  bool IsClosing(char c) const {
+    return valid_closing_[static_cast<unsigned char>(c)];
+  }
+
+ private:
+  // Maps an opening character to its closing. If the entry contains 0,
+  // the character is not in the opening set.
+  char closing_[256];
+  // Valid closing characters.
+  bool valid_closing_[256];
+};
+
+bool SplitStructuredLine(absl::string_view line, char delimiter,
+                         const char* symbol_pairs,
+                         std::vector<absl::string_view>* cols) {
+  ClosingSymbolLookup lookup(symbol_pairs);
+
+  // Stack of symbols expected to close the current opened expressions.
+  std::vector<char> expected_to_close;
+
+  ABSL_RAW_CHECK(cols != nullptr, "");
+  cols->push_back(line);
+  for (size_t i = 0; i < line.size(); ++i) {
+    char c = line[i];
+    if (expected_to_close.empty() && c == delimiter) {
+      // We don't have any open expression, this is a valid separator.
+      cols->back().remove_suffix(line.size() - i);
+      cols->push_back(line.substr(i + 1));
+    } else if (!expected_to_close.empty() && c == expected_to_close.back()) {
+      // Can we close the currently open expression?
+      expected_to_close.pop_back();
+    } else if (lookup.GetClosingChar(c)) {
+      // If this is an opening symbol, we open a new expression and push
+      // the expected closing symbol on the stack.
+      expected_to_close.push_back(lookup.GetClosingChar(c));
+    } else if (lookup.IsClosing(c)) {
+      // Error: mismatched closing symbol.
+      return false;
+    }
+  }
+  if (!expected_to_close.empty()) {
+    return false;  // Missing closing symbol(s)
+  }
+  return true;  // Success
+}
+
+inline bool TryStripPrefixString(absl::string_view str,
+                                 absl::string_view prefix, string* result) {
+  bool res = absl::ConsumePrefix(&str, prefix);
+  result->assign(str.begin(), str.end());
+  return res;
+}
+
+inline bool TryStripSuffixString(absl::string_view str,
+                                 absl::string_view suffix, string* result) {
+  bool res = absl::ConsumeSuffix(&str, suffix);
+  result->assign(str.begin(), str.end());
+  return res;
+}
+
+}  // namespace
+
+bool Arg<toco::IntList>::Parse(string text) {
+  parsed_value_.elements.clear();
+  specified_ = true;
+  // strings::Split("") produces {""}, but we need {} on empty input.
+  // TODO(aselle): Moved this from elsewhere, but ahentz recommends we could
+  // use absl::SplitLeadingDec32Values(text.c_str(), &parsed_values_.elements)
+  if (!text.empty()) {
+    int32 element;
+    for (absl::string_view part : absl::StrSplit(text, ',')) {
+      if (!SimpleAtoi(part, &element)) return false;
+      parsed_value_.elements.push_back(element);
+    }
+  }
+  return true;
+}
+
+bool Arg<toco::StringMapList>::Parse(string text) {
+  parsed_value_.elements.clear();
+  specified_ = true;
+
+  if (text.empty()) {
+    return true;
+  }
+
+  std::vector<absl::string_view> outer_vector;
+  absl::string_view text_disposable_copy = text;
+  // TODO(aselle): Change argument parsing when absl supports structuredline.
+  SplitStructuredLine(text_disposable_copy, ',', "{}", &outer_vector);
+  for (const absl::string_view& outer_member_stringpiece : outer_vector) {
+    string outer_member(outer_member_stringpiece);
+    if (outer_member.empty()) {
+      continue;
+    }
+    string outer_member_copy = outer_member;
+    absl::StripAsciiWhitespace(&outer_member);
+    if (!TryStripPrefixString(outer_member, "{", &outer_member)) return false;
+    if (!TryStripSuffixString(outer_member, "}", &outer_member)) return false;
+    const std::vector<string> inner_fields_vector =
+        absl::StrSplit(outer_member, ',');
+
+    std::unordered_map<string, string> element;
+    for (const string& member_field : inner_fields_vector) {
+      std::vector<string> outer_member_key_value =
+          absl::StrSplit(member_field, ':');
+      if (outer_member_key_value.size() != 2) return false;
+      string& key = outer_member_key_value[0];
+      string& value = outer_member_key_value[1];
+      absl::StripAsciiWhitespace(&key);
+      absl::StripAsciiWhitespace(&value);
+      if (element.count(key) != 0) return false;
+      element[key] = value;
+    }
+    parsed_value_.elements.push_back(element);
+  }
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/lite/toco/args.h b/tensorflow/lite/toco/args.h
index 188f2f7e7af61c6c9e94da42d528d3fcff4b5e39..c6eeb2859a91643c3e87bdeb25c32a8ef5611c87 100644
--- a/tensorflow/lite/toco/args.h
+++ b/tensorflow/lite/toco/args.h
@@ -22,10 +22,6 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 #include "tensorflow/lite/toco/toco_port.h"
-#if defined(PLATFORM_GOOGLE)
-#include "strings/split.h"
-#include "strings/strip.h"
-#endif
 #include "absl/strings/numbers.h"
 #include "absl/strings/str_split.h"
 #include "tensorflow/lite/toco/toco_types.h"
@@ -64,7 +60,7 @@ class Arg final {
   const T& value() const { return value_; }
 
   // Parsing callback for the tensorflow::Flags code
-  bool parse(T value_in) {
+  bool Parse(T value_in) {
     value_ = value_in;
     specified_ = true;
     return true;
@@ -72,7 +68,7 @@ class Arg final {
 
   // Bind the parse member function so tensorflow::Flags can call it.
   std::function<bool(T)> bind() {
-    return std::bind(&Arg::parse, this, std::placeholders::_1);
+    return std::bind(&Arg::Parse, this, std::placeholders::_1);
   }
 
  private:
@@ -90,24 +86,10 @@ class Arg<toco::IntList> final {
   // Return true if the command line argument was specified on the command line.
   bool specified() const { return specified_; }
   // Bind the parse member function so tensorflow::Flags can call it.
-  bool parse(string text) {
-    parsed_value_.elements.clear();
-    specified_ = true;
-    // strings::Split("") produces {""}, but we need {} on empty input.
-    // TODO(aselle): Moved this from elsewhere, but ahentz recommends we could
-    // use absl::SplitLeadingDec32Values(text.c_str(), &parsed_values_.elements)
-    if (!text.empty()) {
-      int32 element;
-      for (absl::string_view part : absl::StrSplit(text, ',')) {
-        if (!SimpleAtoi(part, &element)) return false;
-        parsed_value_.elements.push_back(element);
-      }
-    }
-    return true;
-  }
+  bool Parse(string text);
 
   std::function<bool(string)> bind() {
-    return std::bind(&Arg::parse, this, std::placeholders::_1);
+    return std::bind(&Arg::Parse, this, std::placeholders::_1);
   }
 
   const toco::IntList& value() const { return parsed_value_; }
@@ -126,57 +108,10 @@ class Arg<toco::StringMapList> final {
   bool specified() const { return specified_; }
   // Bind the parse member function so tensorflow::Flags can call it.
 
-  bool parse(string text) {
-    parsed_value_.elements.clear();
-    specified_ = true;
-
-    if (text.empty()) {
-      return true;
-    }
-
-#if defined(PLATFORM_GOOGLE)
-    std::vector<absl::string_view> outer_vector;
-    absl::string_view text_disposable_copy = text;
-    SplitStructuredLine(text_disposable_copy, ',', "{}", &outer_vector);
-    for (const absl::string_view& outer_member_stringpiece : outer_vector) {
-      string outer_member(outer_member_stringpiece);
-      if (outer_member.empty()) {
-        continue;
-      }
-      string outer_member_copy = outer_member;
-      absl::StripAsciiWhitespace(&outer_member);
-      if (!strings::TryStripPrefixString(outer_member, "{", &outer_member))
-        return false;
-      if (!strings::TryStripSuffixString(outer_member, "}", &outer_member))
-        return false;
-      const std::vector<string> inner_fields_vector =
-          absl::StrSplit(outer_member, ',');
-
-      std::unordered_map<string, string> element;
-      for (const string& member_field : inner_fields_vector) {
-        std::vector<string> outer_member_key_value =
-            absl::StrSplit(member_field, ':');
-        if (outer_member_key_value.size() != 2) return false;
-        string& key = outer_member_key_value[0];
-        string& value = outer_member_key_value[1];
-        absl::StripAsciiWhitespace(&key);
-        absl::StripAsciiWhitespace(&value);
-        if (element.count(key) != 0) return false;
-        element[key] = value;
-      }
-      parsed_value_.elements.push_back(element);
-    }
-    return true;
-#else
-    // TODO(aselle): Fix argument parsing when absl supports structuredline
-    fprintf(stderr, "%s:%d StringMapList arguments not supported\n", __FILE__,
-            __LINE__);
-    abort();
-#endif
-  }
+  bool Parse(string text);
 
   std::function<bool(string)> bind() {
-    return std::bind(&Arg::parse, this, std::placeholders::_1);
+    return std::bind(&Arg::Parse, this, std::placeholders::_1);
   }
 
   const toco::StringMapList& value() const { return parsed_value_; }
diff --git a/tensorflow/lite/toco/dump_graphviz.cc b/tensorflow/lite/toco/dump_graphviz.cc
index 8896893f3579abcefa87e3411f9b186ca7a45a1b..ad69e4f7b7a4285f36750c60291d7a6a97e7e9f7 100644
--- a/tensorflow/lite/toco/dump_graphviz.cc
+++ b/tensorflow/lite/toco/dump_graphviz.cc
@@ -15,17 +15,21 @@ limitations under the License.
 #include "tensorflow/lite/toco/dump_graphviz.h"
 
 #include <cmath>
+#include <functional>
 #include <memory>
 #include <vector>
 
+#include "absl/memory/memory.h"
 #include "absl/strings/str_replace.h"
+#include "absl/strings/str_split.h"
 #include "absl/strings/strip.h"
+#include "re2/re2.h"
+#include "tensorflow/core/platform/logging.h"
 #include "tensorflow/lite/toco/model_flags.pb.h"
 #include "tensorflow/lite/toco/toco_graphviz_dump_options.h"
 #include "tensorflow/lite/toco/toco_port.h"
 #include "tensorflow/lite/toco/toco_types.h"
 #include "tensorflow/lite/toco/tooling_util.h"
-#include "tensorflow/core/platform/logging.h"
 
 using toco::port::AppendF;
 using toco::port::StringF;
@@ -33,72 +37,158 @@ using toco::port::StringF;
 namespace toco {
 namespace {
 
+// 'nslimit' is a graphviz (dot) paramater that limits the iterations during
+// the layout phase. Omitting it allows infinite iterations, causing some
+// complex graphs to never finish. A value of 125 produces good graphs
+// while allowing complex graphs to finish.
+constexpr char kGraphFmt[] = R"CODE(digraph Computegraph { tooltip = "/"
+    nslimit=125 margin=36 ranksep = 2 labelloc="t" label=%s
+)CODE";
+// Note: tooltip's are only supported on SVGs in Chrome.
+constexpr char kSubgraphFmt[] =
+    R"CODE(    subgraph "cluster_%s" { style=rounded bgcolor="%s" penwidth=0.0 label=%s
+)CODE";
+constexpr char kArrayNodeFmt[] =
+    R"CODE(        "%s" [label=%s tooltip="%s" shape=%s style=filled fillcolor="%s" fontcolor="%sDD"];
+)CODE";
+constexpr char kOpNodeFmt[] =
+    R"CODE(        %s [label=%s tooltip=" " shape=box margin=0 style=filled fillcolor="%s" fontcolor="%sDD"];
+)CODE";
+constexpr char kInputEdgeFmt[] =
+    R"CODE(        "%s"%s -> %s:i%d:n [penwidth=%f weight=%f];
+)CODE";
+constexpr char kOutputEdgeFmt[] =
+    R"CODE(        %s:o%d:s -> "%s"%s [penwidth=%f weight=%f];
+)CODE";
+constexpr char kRNNBackEdgeFmt[] =
+    R"CODE(        "%s":s -> "%s":n [color="#0F9D58" constraint=false];
+)CODE";
+constexpr char kUnicodeMult[] = "\u00D7";
+constexpr char kUnicodeEllipsis[] = " \u2026 ";
+
 class Color {
  public:
   Color() {}
   Color(uint8 r, uint8 g, uint8 b) : r_(r), g_(g), b_(b) {}
+  explicit Color(uint32 word)
+      : r_((word & 0x00FF0000) >> 16),
+        g_((word & 0x0000FF00) >> 8),
+        b_((word & 0x000000FF) >> 0) {}
+
   // Returns the string serialization of this color in graphviz format,
   // for use as 'fillcolor' in boxes.
-  string FillColorString() const { return StringF("%.2X%.2X%.2X", r_, g_, b_); }
+  string AsHexString() const { return StringF("#%.2X%.2X%.2X", r_, g_, b_); }
+  // The color to use for this node; will be used as 'fillcolor'
+  // for its box. See Color::AsHexString. A suitable, different
+  // color will be chosen for the 'fontcolor' for the inside text
+  // label, see Color::TextColorString.
   // Returns the serialization in graphviz format of a suitable color to use
   // 'fontcolor' in the same boxes. It should black or white, whichever offers
-  // the better contrast from FillColorString().
+  // the better contrast from AsHexString().
   string TextColorString() const {
     // https://en.wikipedia.org/wiki/Relative_luminance
     const float luminance = 0.2126f * r_ + 0.7152f * g_ + 0.0722f * b_;
     const uint8 l = luminance > 128.f ? 0 : 255;
-    return StringF("%.2X%.2X%.2X", l, l, l);
+    return StringF("#%.2X%.2X%.2X", l, l, l);
   }
 
  private:
   uint8 r_ = 0, g_ = 0, b_ = 0;
 };
 
-struct NodeProperties {
-  // The text to display inside the box for this node.
-  string label;
-  // The color to use for this node; will be used as 'fillcolor'
-  // for its box. See Color::FillColorString. A suitable, different
-  // color will be chosen for the 'fontcolor' for the inside text
-  // label, see Color::TextColorString.
-  Color color;
-  float log2_buffer_size;
-};
-
-// All colors in this file are from:
-// https://material.io/guidelines/style/color.html
+Color HashStringToColor(string s) {
+  // Return a unique color for a name.
+  //
+  // This function removes Tensorflow anti-collision suffixes (eg "_2"), hashes
+  // the string to a uint_32, then twiddles some bits to get a light and subtle
+  // color. This seems to be a good heuristic for keeping enough of the name to
+  // hash to a unique color while still revealing structure through naming
+  // similarities.
+  //
+  // The regular expression "_\d+" matches any underscore followed by numbers,
+  // which we strip out. Examples:
+  //
+  //     "Conv"      -> "Conv"
+  //     "Conv_2"    -> "Conv"
+  //     "Conv_72"   -> "Conv"
+  //     "Pad_1_bias -> "Pad_bias"
+  //     "Conv_abc"  -> "Conv_abc"
+
+  RE2::GlobalReplace(&s, R"CODE(_\d+)CODE", "");
+  uint32 color_word = std::hash<std::string>{}(s);
+  color_word |= 0x00E0E0E0;
+  return Color(color_word);
+}
 
-Color GetColorForArray(const Model& model, const string& array_name) {
+void GetArrayColorAndShape(const Model& model, const string& array_name,
+                           Color* color, string* shape) {
+  // All colors in this file are from:
+  // https://material.io/guidelines/style/color.html
   // Arrays involved in RNN back-edges have a different color
   for (const auto& rnn_state : model.flags.rnn_states()) {
     // RNN state, fed by a back-edge. Bold color.
     if (array_name == rnn_state.state_array()) {
-      return Color(0x0F, 0x9D, 0x58);
+      *color = Color(0x0F, 0x9D, 0x58);
+      *shape = "invhouse";
+      return;
     }
     // RNN back-edge source, feeding a RNN state.
     // Light tone of the same color as RNN states.
     if (array_name == rnn_state.back_edge_source_array()) {
-      return Color(0xB7, 0xE1, 0xCD);
+      *color = Color(0xB7, 0xE1, 0xCD);
+      *shape = "house";
+      return;
     }
   }
   // Constant parameter arrays have their own bold color
   if (model.GetArray(array_name).buffer) {
-    return Color(0x42, 0x85, 0xF4);
+    *color = Color(0x42, 0x85, 0xF4);
+    *shape = "cylinder";
+    return;
   }
   // Remaining arrays are activations.
   // We use gray colors for them because they are the majority
   // of arrays so we want to highlight other arrays instead of them.
   // First, we use a bolder gray for input/output arrays:
   if (IsInputArray(model, array_name)) {
-    return Color(0x9E, 0x9E, 0x9E);
+    *color = Color(0x9E, 0x9E, 0x9E);
+    *shape = "invhouse";
+    return;
   }
   if (IsOutputArray(model, array_name)) {
-    return Color(0x9E, 0x9E, 0x9E);
+    *color = Color(0x9E, 0x9E, 0x9E);
+    *shape = "house";
+    return;
   }
   // Remaining arrays are intermediate activation arrays.
   // Lighter tone of the same grey as for input/output arrays:
   // We want these to be very discrete.
-  return Color(0xF5, 0xF5, 0xF5);
+  *color = Color(0xF5, 0xF5, 0xF5);
+  *shape = "box";
+}
+
+string GetArrayCompassPt(const Model& model, const string& array_name) {
+  // The "compass point" is the point on the node where edge connections are
+  // made. For most arrays we don't care, but input's and outputs look better
+  // connected at the tip of the "house" and "invhouse" shapes used. So we
+  // append ":n" and ":s" respectively for those.
+  for (const auto& rnn_state : model.flags.rnn_states()) {
+    // RNN state is essentially an input
+    if (array_name == rnn_state.state_array()) {
+      return ":s";
+    }
+    // RNN back-edge source is essentially an output
+    if (array_name == rnn_state.back_edge_source_array()) {
+      return ":n";
+    }
+  }
+  if (IsInputArray(model, array_name)) {
+    return ":s";
+  }
+  if (IsOutputArray(model, array_name)) {
+    return ":n";
+  }
+  return "";
 }
 
 void AppendArrayVal(string* string, Array const& array, int index) {
@@ -141,239 +231,550 @@ void AppendArrayVal(string* string, Array const& array, int index) {
   }
 }
 
-NodeProperties GetPropertiesForArray(const Model& model,
-                                     const string& array_name) {
-  NodeProperties node_properties;
-  node_properties.color = GetColorForArray(model, array_name);
-  node_properties.label = absl::StrReplaceAll(array_name, {{"/", "/\\n"}});
-  node_properties.log2_buffer_size = 0.0f;
+typedef std::map<string, string> Attributes;
+
+string AttributesToHtml(Attributes attributes) {
+  string html;
+  for (const auto& attr : attributes) {
+    html += R"CODE(<TR><TD CELLPADDING="1" ALIGN="RIGHT">)CODE";
+    html += attr.first;
+    html += R"CODE(:</TD><TD CELLPADDING="1" ALIGN="LEFT">)CODE";
+    html += attr.second;
+    html += "</TD></TR>";
+  }
+  return html;
+}
+
+string GetArrayLabel(const Model& model, const string& array_id) {
+  string html;
 
-  // Append array shape to the label.
-  auto& array = model.GetArray(array_name);
-  AppendF(&node_properties.label, "\\nType: %s",
-          ArrayDataTypeName(array.data_type));
+  // Use HTML-like labels (http://www.graphviz.org/doc/info/shapes.html#html)
+  html += "<";
 
+  // Begin Table
+  html += R"CODE(<FONT POINT-SIZE="10" FACE="Courier">)CODE";
+  html += R"CODE(<TABLE BORDER="0" CELLSPACING="2" CELLPADDING="0">)CODE";
+
+  auto& array = model.GetArray(array_id);
+  if (array.buffer) {
+    // "cylinder" shapes require some extra head room.
+    html += R"CODE(<TR><TD COLSPAN="2"> </TD></TR>)CODE";
+  }
+
+  // "Primary" name of array (last non-slash delimited group of characters).
+  html += R"CODE(<TR><TD COLSPAN="2" ALIGN="CENTER">)CODE";
+  html += R"CODE(<FONT POINT-SIZE="16" FACE="Helvetica"><I>)CODE";
+  AppendF(&html, R"CODE(%s)CODE",
+          std::vector<string>(absl::StrSplit(array_id, '/')).back());
+  html += R"CODE(</I></FONT>)CODE";
+  html += "</TD></TR>";
+
+  // Array data type and dimensions
+  html += R"CODE(<TR><TD COLSPAN="2" ALIGN="CENTER">)CODE";
+  html += R"CODE(<FONT POINT-SIZE="14" FACE="Courier"><B>)CODE";
+  // Type
+  html += ArrayDataTypeName(array.data_type);
+  // Shape
   if (array.has_shape()) {
     auto& array_shape = array.shape();
-    node_properties.label += "\\n[";
-    for (int id = 0; id < array_shape.dimensions_count(); id++) {
-      if (id == 0) {
-        AppendF(&node_properties.label, "%d", array_shape.dims(id));
-      } else {
-        // 0x00D7 is the unicode multiplication symbol
-        AppendF(&node_properties.label, "\u00D7%d", array_shape.dims(id));
+    html += "[";
+    for (int dim = 0; dim < array_shape.dimensions_count(); dim++) {
+      AppendF(&html, "%d", array_shape.dims(dim));
+      if (dim + 1 < array_shape.dimensions_count()) {
+        html += kUnicodeMult;
       }
     }
-    node_properties.label += "]";
+    html += "]";
+  }
 
-    int buffer_size = 0;
-    if (IsNonEmpty(array.shape())) {
-      buffer_size = RequiredBufferSizeForShape(array.shape());
-      node_properties.log2_buffer_size =
-          std::log2(static_cast<float>(buffer_size));
+  // Small buffer sample
+  int buffer_size = 0;
+  if (array.buffer) {
+    buffer_size = RequiredBufferSizeForShape(array.shape());
+  }
+  if ((buffer_size > 0) && (buffer_size <= 4)) {
+    html += " = ";
+    if (array.shape().dimensions_count() > 0) {
+      html += "{";
     }
-
-    if (array.buffer) {
-      const auto& array = model.GetArray(array_name);
-      if (buffer_size <= 4) {
-        AppendF(&node_properties.label, " = ");
-        if (array.shape().dimensions_count() > 0) {
-          AppendF(&node_properties.label, "{");
-        }
-        for (int i = 0; i < buffer_size; i++) {
-          AppendArrayVal(&node_properties.label, array, i);
-          if (i + 1 < buffer_size) {
-            AppendF(&node_properties.label, ", ");
-          }
-        }
-      } else {
-        AppendF(&node_properties.label, "\\n = ");
-        if (array.shape().dimensions_count() > 0) {
-          AppendF(&node_properties.label, "{");
-        }
-        AppendArrayVal(&node_properties.label, array, 0);
-        AppendF(&node_properties.label, ", ");
-        AppendArrayVal(&node_properties.label, array, 1);
-        // 0x2026 is the unicode ellipsis symbol
-        AppendF(&node_properties.label, " \u2026 ");
-        AppendArrayVal(&node_properties.label, array, buffer_size - 2);
-        AppendF(&node_properties.label, ", ");
-        AppendArrayVal(&node_properties.label, array, buffer_size - 1);
-      }
-      if (array.shape().dimensions_count() > 0) {
-        AppendF(&node_properties.label, "}");
+    for (int i = 0; i < buffer_size; i++) {
+      AppendArrayVal(&html, array, i);
+      if (i + 1 < buffer_size) {
+        html += ", ";
       }
     }
+    if (array.shape().dimensions_count() > 0) {
+      html += "}";
+    }
+  }
+  html += R"CODE(</B></FONT>)CODE";
+  html += "</TD></TR>";
+
+  // Large buffer samples get their own line
+  if (buffer_size > 4) {
+    html += R"CODE(<TR><TD COLSPAN="2" ALIGN="CENTER"> = {)CODE";
+    AppendArrayVal(&html, array, 0);
+    html += ", ";
+    AppendArrayVal(&html, array, 1);
+    html += kUnicodeEllipsis;
+    AppendArrayVal(&html, array, buffer_size - 2);
+    html += ", ";
+    AppendArrayVal(&html, array, buffer_size - 1);
+    html += "}</TD></TR>";
   }
 
+  // Other array properties
+  Attributes attrs;
   if (array.minmax) {
-    AppendF(&node_properties.label, "\\nMinMax: [%.7g, %.7g]",
-            array.minmax->min, array.minmax->max);
+    attrs["minmax"] =
+        StringF("[%.7g, %.7g]", array.minmax->min, array.minmax->max);
   }
-
   if (array.quantization_params) {
-    AppendF(&node_properties.label, "\\nQuantization: %7g * (x - %d)",
-            array.quantization_params->scale,
-            array.quantization_params->zero_point);
+    attrs["quant"] = StringF("%7g\u00B7(x-%d)",  // Unicode "cdot"
+                             array.quantization_params->scale,
+                             array.quantization_params->zero_point);
   }
-
   if (array.alloc) {
-    AppendF(&node_properties.label, "\\nTransient Alloc: [%d, %d)",
-            array.alloc->start, array.alloc->end);
+    attrs["alloc"] = StringF("[%d, %d)", array.alloc->start, array.alloc->end);
   }
-
-  return node_properties;
+  html += AttributesToHtml(attrs);
+
+  // output array_id in ultra-small font so it can be searched and copied.
+  html += R"CODE(<TR><TD COLSPAN="2" ALIGN="CENTER">)CODE";
+  html += R"CODE(<FONT POINT-SIZE="3" FACE="">)CODE";
+  AppendF(&html, R"CODE("%s")CODE", array_id);
+  html += R"CODE(</FONT>)CODE";
+  html += "</TD></TR>";
+
+  // End Table and HTML-like label
+  html += R"CODE(</TABLE></FONT>)CODE";
+  html += ">";
+  return html;
 }
 
-NodeProperties GetPropertiesForOperator(const Operator& op) {
-  NodeProperties node_properties;
-  if (op.type == OperatorType::kUnsupported) {
-    node_properties.label =
-        static_cast<const TensorFlowUnsupportedOperator&>(op).tensorflow_op;
-  } else {
-    node_properties.label =
-        string(absl::StripPrefix(OperatorTypeName(op.type), "TensorFlow"));
-  }
+Attributes GetOpAttributes(const Model& model, const Operator& op) {
+  Attributes attrs;
   switch (op.fused_activation_function) {
     case FusedActivationFunctionType::kRelu:
-      AppendF(&node_properties.label, "\\nReLU");
+      attrs["func"] = "ReLU";
       break;
     case FusedActivationFunctionType::kRelu6:
-      AppendF(&node_properties.label, "\\nReLU6");
+      attrs["func"] = "ReLU6";
       break;
     case FusedActivationFunctionType::kRelu1:
-      AppendF(&node_properties.label, "\\nReLU1");
+      attrs["func"] = "ReLU1";
       break;
     default:
       break;
   }
-  // Additional information for some of the operators.
+  // Output state of member vars on derived operators.
   switch (op.type) {
     case OperatorType::kConv: {
       const auto& conv_op = static_cast<const ConvOperator&>(op);
-      node_properties.color = Color(0xC5, 0x39, 0x29);  // Bolder color
-      AppendF(&node_properties.label, "\\n%dx%d/%s", conv_op.stride_width,
-              conv_op.stride_height,
-              conv_op.padding.type == PaddingType::kSame ? "S" : "V");
+      string stride;
+      AppendF(&stride, "%d", conv_op.stride_width);
+      stride += kUnicodeMult;
+      AppendF(&stride, "%d", conv_op.stride_height);
+      attrs["stride"] = stride;
+      attrs["padding"] =
+          (conv_op.padding.type == PaddingType::kSame) ? "same" : "valid";
       break;
     }
     case OperatorType::kDepthwiseConv: {
-      const auto& conv_op = static_cast<const DepthwiseConvOperator&>(op);
-      node_properties.color = Color(0xC5, 0x39, 0x29);  // Bolder color
-      AppendF(&node_properties.label, "\\n%dx%d/%s", conv_op.stride_width,
-              conv_op.stride_height,
-              conv_op.padding.type == PaddingType::kSame ? "S" : "V");
-      break;
-    }
-    case OperatorType::kFullyConnected: {
-      node_properties.color = Color(0xC5, 0x39, 0x29);  // Bolder color
+      const auto& depthconv_op = static_cast<const ConvOperator&>(op);
+      string stride;
+      AppendF(&stride, "%d", depthconv_op.stride_width);
+      stride += kUnicodeMult;
+      AppendF(&stride, "%d", depthconv_op.stride_height);
+      attrs["stride"] = stride;
+      attrs["padding"] =
+          (depthconv_op.padding.type == PaddingType::kSame) ? "same" : "valid";
       break;
     }
     case OperatorType::kFakeQuant: {
       const auto& fakequant_op = static_cast<const FakeQuantOperator&>(op);
-      node_properties.color = Color(0xC5, 0x39, 0x29);  // Bolder color
+      attrs["bits"] = StringF("%d", fakequant_op.num_bits);
       if (fakequant_op.minmax) {
-        AppendF(&node_properties.label, "\\n%dbit [%g,%g]",
-                fakequant_op.num_bits, fakequant_op.minmax->min,
-                fakequant_op.minmax->max);
+        attrs["range"] = StringF("[%g,%g]", fakequant_op.minmax->min,
+                                 fakequant_op.minmax->max);
       } else {
-        AppendF(&node_properties.label, "\\n%dbit [?,?]",
-                fakequant_op.num_bits);
+        attrs["range"] = "[?,?]";
       }
       break;
     }
     default:
-      node_properties.color = Color(0xDB, 0x44, 0x37);
       break;
   }
+  int64 math_ops_count;
+  if (EstimateArithmeticOpsCount(model, op, &math_ops_count) &&
+      (math_ops_count != 0)) {
+    attrs["math"] = FormattedNumber(math_ops_count) + "ops";
+  }
 
-  return node_properties;
+  return attrs;
 }
 
-}  // namespace
+Color GetOpColor(const Operator& op) {
+  if ((op.type == OperatorType::kDepthwiseConv) ||
+      (op.type == OperatorType::kConv) ||
+      (op.type == OperatorType::kFullyConnected) ||
+      (op.type == OperatorType::kFakeQuant)) {
+    // Give some ops a bolder red
+    return Color(0xC5, 0x39, 0x29);
+  } else {
+    return Color(0xDB, 0x44, 0x37);
+  }
+}
+
+string GetOpLabel(const Model& model, const Operator& op) {
+  // Use HTML-like labels (http://www.graphviz.org/doc/info/shapes.html#html)
+  string html;
+  html += "<";
+
+  // Begin Table
+  html += R"CODE(<FONT POINT-SIZE="10" FACE="Courier">)CODE";
+  html +=
+      R"CODE(<TABLE BORDER="0" CELLBORDER="0" CELLSPACING="0" CELLPADDING="0">)CODE";
+
+  // Input Ports
+  if (!op.inputs.empty()) {
+    html += R"CODE(<TR><TD COLSPAN="2" ALIGN="CENTER">)CODE";
+    // Distribute evenly using a sub-table
+    html += R"CODE(<TABLE BORDER="0" CELLBORDER="0" CELLSPACING="0">)CODE";
+    html += R"CODE(<TR>)CODE";
+    for (int i = 0; i < op.inputs.size(); i++) {
+      html += R"CODE(<TD PORT=")CODE";
+      AppendF(&html, "i%d", i);
+      html += R"CODE(">)CODE";
+      if (op.inputs.size() > 1) {
+        // Only number inputs when op has two or more inputs
+        AppendF(&html, "%d", i);
+      }
+      html += "</TD>";
+    }
+    html += "</TR>";
+    html += R"CODE(</TABLE></TD></TR>)CODE";
+  }
 
-void DumpGraphviz(const Model& model, string* output_file_contents) {
-  AppendF(output_file_contents, "digraph Computegraph {\n");
-  // 'nslimit' is a graphviz (dot) paramater that limits the iterations during
-  // the layout phase. Omitting it allows infinite iterations, causing some
-  // complex graphs to never finish. A value of 125 produces good graphs
-  // while allowing complex graphs to finish.
-  AppendF(output_file_contents, "\t nslimit=125;\n");
-
-  constexpr char kNodeFormat[] =
-      "\t \"%s\" [label=\"%s\", shape=%s, style=filled, fillcolor=\"#%s\", "
-      "fontcolor = \"#%sDD\"];\n";
-
-  constexpr char kEdgeFormat[] =
-      "\t \"%s\" -> \"%s\" [penwidth=%f, weight=%f];\n";
-
-  constexpr char kRNNBackEdgeFormat[] =
-      "\t \"%s\" -> \"%s\" [color=\"#0F9D58\"];\n";
-
-  for (const auto& array_kv : model.GetArrayMap()) {
-    // Add node for array.
-    const string& array_name = array_kv.first;
-    const auto& array_properties = GetPropertiesForArray(model, array_name);
-    AppendF(output_file_contents, kNodeFormat, array_name,
-            array_properties.label, "octagon",
-            array_properties.color.FillColorString().c_str(),
-            array_properties.color.TextColorString().c_str());
+  // Name
+  html += R"CODE(<TR><TD COLSPAN="2" CELLPADDING="3" ALIGN="CENTER">)CODE";
+  html += R"CODE(<FONT POINT-SIZE="16" FACE="Helvetica"><B>)CODE";
+  if (op.type == OperatorType::kUnsupported) {
+    html += static_cast<const TensorFlowUnsupportedOperator&>(op).tensorflow_op;
+  } else {
+    html += string(absl::StripPrefix(OperatorTypeName(op.type), "TensorFlow"));
+  }
+  html += R"CODE(</B></FONT>)CODE";
+  html += "</TD></TR>";
+
+  // Attributes
+  Attributes attrs = GetOpAttributes(model, op);
+  html += AttributesToHtml(attrs);
+
+  // Output Ports
+  if (!op.outputs.empty()) {
+    html += R"CODE(<TR><TD COLSPAN="2" ALIGN="CENTER">)CODE";
+    // Distribute evenly using a sub-table
+    html += R"CODE(<TABLE BORDER="0" CELLBORDER="0" CELLSPACING="0">)CODE";
+    html += R"CODE(<TR>)CODE";
+    for (int i = 0; i < op.outputs.size(); i++) {
+      html += R"CODE(<TD PORT=")CODE";
+      AppendF(&html, "o%d", i);
+      html += R"CODE(">)CODE";
+      if (op.outputs.size() > 1) {
+        // Only number outputs when op has two or more outputs
+        AppendF(&html, "%d", i);
+      }
+      html += "</TD>";
+    }
+    html += "</TR>";
+    html += R"CODE(</TABLE></TD></TR>)CODE";
   }
+
+  // End Table and HTML-like label
+  html += R"CODE(</TABLE></FONT>)CODE";
+  html += ">";
+
+  return html;
+}
+
+float GetLog2BufferSize(const Model& model, const string& array_id) {
+  auto& array = model.GetArray(array_id);
+  if (array.has_shape()) {
+    int buffer_size = 0;
+    if (IsNonEmpty(array.shape())) {
+      buffer_size = RequiredBufferSizeForShape(array.shape());
+      return std::log2(static_cast<float>(buffer_size));
+    }
+  }
+  return 0.0f;
+}
+
+string GetOpId(int op_index) { return StringF("op%05d", op_index); }
+
+void DumpOperator(const Model& model, string* output_file, int op_index) {
+  // Dump node for operator.
+  const Operator& op = *model.operators[op_index];
+  Color color = GetOpColor(op);
+  string label = GetOpLabel(model, op);
+  string op_id = GetOpId(op_index);
+  AppendF(output_file, kOpNodeFmt, op_id, label, color.AsHexString(),
+          color.TextColorString());
+}
+
+void DumpOperatorEdges(const Model& model, string* output_file, int op_index) {
+  // Inputs
+  const Operator& op = *model.operators[op_index];
+  string op_id = GetOpId(op_index);
+  for (int i = 0; i < op.inputs.size(); i++) {
+    const auto& input = op.inputs[i];
+    if (!model.HasArray(input)) {
+      // Connected arrays should _always_ exist. Except, perhaps, during
+      // development.
+      continue;
+    }
+    float log2_buffer_size = GetLog2BufferSize(model, input);
+    // Draw lines that transport more data thicker (Otherwise, where would the
+    // data fit? right?).
+    float line_width = std::max(0.5f, log2_buffer_size / 3.0f);
+    // Keep edges that transport more data shorter than those with less.
+    float weight = std::max(1.0f, log2_buffer_size);
+    if (!IsInputArray(model, input) &&
+        GetOpWithOutput(model, input) == nullptr) {
+      // Give the main line of data flow a straighter path by penalizing edges
+      // to standalone buffers. Weights are generally very large buffers that
+      // would otherwise skew the layout.
+      weight = 1.0f;
+    }
+    string compass_pt = GetArrayCompassPt(model, input);
+    AppendF(output_file, kInputEdgeFmt, input, compass_pt, op_id, i, line_width,
+            weight);
+  }
+  // Outputs
+  for (int i = 0; i < op.outputs.size(); i++) {
+    const auto& output = op.outputs[i];
+    if (!model.HasArray(output)) {
+      continue;
+    }
+    float log2_buffer_size = GetLog2BufferSize(model, output);
+    // See comments above regarding weight and line_width calculations.
+    float line_width = std::max(0.5f, log2_buffer_size / 3.0f);
+    float weight = std::max(1.0f, log2_buffer_size);
+    if (!IsArrayConsumed(model, output)) {
+      weight = 1.0f;
+    }
+    string compass_pt = GetArrayCompassPt(model, output);
+    AppendF(output_file, kOutputEdgeFmt, op_id, i, output, compass_pt,
+            line_width, weight);
+  }
+}
+
+struct Node {
+  Node() : math_ops(0) {}
+  // Name used as a key in the model's array map
+  string array_id;
+
+  // Estimated number of math ops incurred by this node (the sum of the op
+  // with this array as 1st output, plus all children nodes).
+  int64 math_ops;
+
+  // A map of child nodes keyed by name.
+  std::map<const string, std::unique_ptr<Node>> children;
+};
+
+string GetSubgraphLabel(Node const& node, const string& subgraph) {
+  // Use HTML-like labels (http://www.graphviz.org/doc/info/shapes.html#html)
+  string html;
+  html += "<";
+
+  // Begin Table
+  html += R"CODE(<FONT POINT-SIZE="12" FACE="Courier">)CODE";
+  html +=
+      R"CODE(<TABLE BORDER="0" CELLBORDER="0" CELLSPACING="0" CELLPADDING="0">)CODE";
+
+  // Name
+  html += R"CODE(<TR><TD COLSPAN="2" CELLPADDING="3" ALIGN="CENTER">)CODE";
+  html += R"CODE(<FONT POINT-SIZE="18" FACE="Helvetica"><I>)CODE";
+  html += subgraph;
+  html += R"CODE(</I></FONT>)CODE";
+  html += "</TD></TR>";
+
+  // Attributes
+  Attributes attrs;
+  if (node.math_ops > 0) {
+    attrs["math"] = FormattedNumber(node.math_ops) + "ops";
+  }
+  html += AttributesToHtml(attrs);
+
+  // End Table and HTML-like label
+  html += R"CODE(</TABLE></FONT>)CODE";
+  html += ">";
+
+  return html;
+}
+
+void DumpSubgraphHeader(string* output_file, Node const& node,
+                        const string& node_name) {
+  Color color = HashStringToColor(node_name);
+  string label = GetSubgraphLabel(node, node_name);
+  AppendF(output_file, kSubgraphFmt, node_name, color.AsHexString(), label);
+}
+
+void DumpArray(const Model& model, string* output_file,
+               const string& array_id) {
+  Color color;
+  string shape;
+  GetArrayColorAndShape(model, array_id, &color, &shape);
+  string label = GetArrayLabel(model, array_id);
+  AppendF(output_file, kArrayNodeFmt, array_id, label, array_id, shape,
+          color.AsHexString(), color.TextColorString());
+
+  // Ops are placed in the same subgraph as their first output.
   for (int op_index = 0; op_index < model.operators.size(); op_index++) {
     const Operator& op = *model.operators[op_index];
-    // Add node for operator.
-    auto op_properties = GetPropertiesForOperator(op);
-    string operator_id = StringF("op%05d", op_index);
-    AppendF(output_file_contents, kNodeFormat, operator_id, op_properties.label,
-            "box", op_properties.color.FillColorString().c_str(),
-            op_properties.color.TextColorString().c_str());
-    // Add edges for all inputs of the operator.
-    for (const auto& input : op.inputs) {
-      if (!model.HasArray(input)) {
-        // Arrays should _always_ exist. Except, perhaps, during development.
-        continue;
-      }
-      auto array_properties = GetPropertiesForArray(model, input);
-      // Draw lines that transport more data thicker (Otherwise, where would the
-      // data fit? right?).
-      float line_width =
-          std::max(0.5f, array_properties.log2_buffer_size / 3.0f);
-      // Keep edges that transport more data shorter than those with less.
-      float weight = std::max(1.0f, array_properties.log2_buffer_size);
-      if (!IsInputArray(model, input) &&
-          GetOpWithOutput(model, input) == nullptr) {
-        // Give the main line of data flow a straighter path by penalizing edges
-        // to standalone buffers. Weights are generally very large buffers that
-        // otherwise skew the layout without this.
-        weight = 1.0f;
-      }
-      AppendF(output_file_contents, kEdgeFormat, input, operator_id, line_width,
-              weight);
+    if (!op.outputs.empty() && (op.outputs[0] == array_id)) {
+      DumpOperator(model, output_file, op_index);
     }
-    // Add edges for all outputs of the operator.
-    for (const auto& output : op.outputs) {
-      if (!model.HasArray(output)) {
-        // Arrays should _always_ exist. Except, perhaps, during development.
-        continue;
-      }
-      auto array_properties = GetPropertiesForArray(model, output);
-      // See comments above regarding weight and line_width calculations.
-      float line_width =
-          std::max(0.5f, array_properties.log2_buffer_size / 3.0f);
-      float weight = std::max(1.0f, array_properties.log2_buffer_size);
-      if (!IsArrayConsumed(model, output)) {
-        weight = 1.0f;
+  }
+}
+
+void DumpNode(const Model& model, string* output_file, const string& node_name,
+              Node const& node) {
+  bool not_root = !node_name.empty();
+  if (not_root) {
+    DumpSubgraphHeader(output_file, node, node_name);
+  }
+
+  for (const auto& child : node.children) {
+    if (!child.second->array_id.empty()) {
+      // Dump array if this node posesses one.
+      DumpArray(model, output_file, child.second->array_id);
+    }
+    // Note that it is always possible to have children. Unlike a filesystem,
+    // the existence of array "foo/bar" does _not_ prevent other arrays, such as
+    // and "foo/bar/baz", from being nested beneath it.
+    DumpNode(model, output_file, child.first, *child.second);
+  }
+
+  if (not_root) {
+    // End subgraph
+    AppendF(output_file, "    }\n");
+  }
+}
+
+int64 GetArithmeticOpsCount(const Model& model, const string& array_id) {
+  for (const auto& op : model.operators) {
+    if (!op->outputs.empty() && op->outputs[0] == array_id) {
+      int64 count;
+      if (EstimateArithmeticOpsCount(model, *op, &count)) {
+        return count;
+      } else {
+        return 0;
       }
-      AppendF(output_file_contents, kEdgeFormat, operator_id, output,
-              line_width, weight);
     }
   }
+  return 0;
+}
 
-  for (const auto& rnn_state : model.flags.rnn_states()) {
-    AppendF(output_file_contents, kRNNBackEdgeFormat,
-            rnn_state.back_edge_source_array(), rnn_state.state_array());
+void InsertNode(const Model& model, const string& array_id, Node* node,
+                std::vector<string> prefixes, int64* math_ops) {
+  if (prefixes.empty()) {
+    // Base case: store array in this node.
+    node->array_id = array_id;
+    *math_ops = GetArithmeticOpsCount(model, array_id);
+  } else {
+    // Insert into the sub-tree for that prefix.
+    string prefix = prefixes.back();
+    prefixes.pop_back();
+    if (node->children.count(prefix) == 0) {
+      // Create a new node if this prefix is unseen.
+      node->children[prefix] = absl::make_unique<Node>();
+    }
+    InsertNode(model, array_id, node->children[prefix].get(), prefixes,
+               math_ops);
   }
+  // Sum estimated math ops into all nodes.
+  node->math_ops += *math_ops;
+}
 
-  AppendF(output_file_contents, "}\n");
+void BuildArrayTree(const Model& model, Node* tree) {
+  // Delimit array names by path "/", then place into a tree based on this path.
+  for (const auto& array_id : model.GetArrayMap()) {
+    std::vector<string> prefixes = absl::StrSplit(array_id.first, '/');
+    std::reverse(prefixes.begin(), prefixes.end());
+    int64 math_ops;  // Temporary storage for math ops used during recursion.
+    InsertNode(model, array_id.first, tree, prefixes, &math_ops);
+  }
+}
+
+string GetGraphLabel(const Model& model, const string& graph_name) {
+  // Use HTML-like labels (http://www.graphviz.org/doc/info/shapes.html#html)
+  string html;
+  html += "<";
+
+  // Begin Table
+  html += R"CODE(<FONT POINT-SIZE="36" FACE="Courier">)CODE";
+  html +=
+      R"CODE(<TABLE BORDER="0" CELLBORDER="0" CELLSPACING="0" CELLPADDING="0">)CODE";
+
+  // Name
+  html += R"CODE(<TR><TD COLSPAN="2" CELLPADDING="3" ALIGN="CENTER">)CODE";
+  html += R"CODE(<FONT POINT-SIZE="64" FACE="Helvetica"><B><I>)CODE";
+  html += graph_name;
+  html += R"CODE(</I></B></FONT>)CODE";
+  html += "</TD></TR>";
+
+  // Attributes
+  Attributes attrs;
+  attrs["arrays"] = StringF("%d", model.GetArrayMap().size());
+  if (!model.optional_arrays.empty()) {
+    attrs["optional arrays"] = StringF("%d", model.optional_arrays.size());
+  }
+  attrs["operators"] = StringF("%d", model.operators.size());
+  int64 ops_count;
+  if (EstimateArithmeticOpsCount(model, &ops_count) && (ops_count > 0)) {
+    attrs["math"] = FormattedNumber(ops_count) + "ops";
+  }
+  if (model.transient_data_size > 0) {
+    attrs["transient data size"] =
+        StringF("%d KiB", model.transient_data_size / 1024);
+  }
+  if (model.transient_data_alignment > 0) {
+    attrs["transient data alignment"] =
+        StringF("%d bytes", model.transient_data_alignment);
+  }
+  html += AttributesToHtml(attrs);
+
+  // End Table and HTML-like label
+  html += R"CODE(</TABLE></FONT>)CODE";
+  html += ">";
+
+  return html;
+}
+}  // namespace
+
+void DumpGraphviz(const Model& model, string* output_file,
+                  const string& graph_name) {
+  // Start graphviz format
+  AppendF(output_file, kGraphFmt, GetGraphLabel(model, graph_name));
+
+  // Organize arrays into a tree for subgraphing
+  Node tree;
+  BuildArrayTree(model, &tree);
+  DumpNode(model, output_file, "", tree);
+
+  // Dump edges outside all subgraphs (otherwise the referred-to nodes are
+  // implicitly included in that subgraph).
+  for (int op_index = 0; op_index < model.operators.size(); op_index++) {
+    DumpOperatorEdges(model, output_file, op_index);
+  }
+
+  // Dump RNN Backedges
+  for (const auto& rnn_state : model.flags.rnn_states()) {
+    AppendF(output_file, kRNNBackEdgeFmt, rnn_state.back_edge_source_array(),
+            rnn_state.state_array());
+  }
+  // End graphviz format
+  AppendF(output_file, "}\n");
 }
 }  // namespace toco
diff --git a/tensorflow/lite/toco/dump_graphviz.h b/tensorflow/lite/toco/dump_graphviz.h
index 9697bd6f0dc434aaf98762698c64fb60cb97f2ee..9bb74dac3f8fb34fb2a440e499c4ed0066ffea4d 100644
--- a/tensorflow/lite/toco/dump_graphviz.h
+++ b/tensorflow/lite/toco/dump_graphviz.h
@@ -21,7 +21,8 @@ limitations under the License.
 
 namespace toco {
 
-void DumpGraphviz(const Model& model, string* output_file_contents);
+void DumpGraphviz(const Model& model, string* output_file_contents,
+                  const string& graph_name);
 
 }  // namespace toco
 
diff --git a/tensorflow/lite/toco/export_tensorflow.cc b/tensorflow/lite/toco/export_tensorflow.cc
index 9fff0015527ebadf501f571bdd5ed0a7643d66e0..c2952c7dd1abc4d87b603db03b4d4ea4318cf870 100644
--- a/tensorflow/lite/toco/export_tensorflow.cc
+++ b/tensorflow/lite/toco/export_tensorflow.cc
@@ -1205,6 +1205,16 @@ void ConvertFloorOperator(const Model& model, const FloorOperator& src_op,
   (*floor_op->mutable_attr())["T"].set_type(DT_FLOAT);
 }
 
+void ConvertCeilOperator(const Model& model, const CeilOperator& src_op,
+                         GraphDef* tensorflow_graph) {
+  tensorflow::NodeDef* ceil_op = tensorflow_graph->add_node();
+  ceil_op->set_op("Ceil");
+  ceil_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 1);
+  *ceil_op->add_input() = src_op.inputs[0];
+  (*ceil_op->mutable_attr())["T"].set_type(DT_FLOAT);
+}
+
 void ConvertGatherOperator(const Model& model, const GatherOperator& src_op,
                            GraphDef* tensorflow_graph) {
   tensorflow::NodeDef* gather_op = tensorflow_graph->add_node();
@@ -1295,7 +1305,8 @@ void ConvertTensorFlowShapeOperator(const Model& model,
       GetTensorFlowDataType(model, src_op.outputs[0]));
 }
 
-void ConvertRankOperator(const Model& model, const RankOperator& src_op,
+void ConvertRankOperator(const Model& model,
+                         const TensorFlowRankOperator& src_op,
                          GraphDef* tensorflow_graph) {
   tensorflow::NodeDef* rank_op = tensorflow_graph->add_node();
   rank_op->set_op("Rank");
@@ -2052,6 +2063,20 @@ void ConvertZerosLikeOperator(const Model& model,
   (*zeros_like_op->mutable_attr())["T"].set_type(data_type);
 }
 
+void ConvertReverseV2Operator(const Model& model,
+                              const ReverseV2Operator& src_op,
+                              const char* op_name, GraphDef* tensorflow_graph) {
+  tensorflow::NodeDef* reverse_v2_op = tensorflow_graph->add_node();
+  reverse_v2_op->set_op(op_name);
+  reverse_v2_op->set_name(src_op.outputs[0]);
+  DCHECK_EQ(src_op.inputs.size(), 2);
+  *reverse_v2_op->add_input() = src_op.inputs[0];
+  *reverse_v2_op->add_input() = src_op.inputs[1];
+  const tensorflow::DataType data_type =
+      GetTensorFlowDataType(model, src_op.inputs[0]);
+  (*reverse_v2_op->mutable_attr())["T"].set_type(data_type);
+}
+
 void ConvertOperator(const Model& model, const Operator& src_op,
                      GraphDef* tensorflow_graph) {
   if (src_op.fused_activation_function != FusedActivationFunctionType::kNone) {
@@ -2169,6 +2194,9 @@ void ConvertOperator(const Model& model, const Operator& src_op,
   } else if (src_op.type == OperatorType::kFloor) {
     ConvertFloorOperator(model, static_cast<const FloorOperator&>(src_op),
                          tensorflow_graph);
+  } else if (src_op.type == OperatorType::kCeil) {
+    ConvertCeilOperator(model, static_cast<const CeilOperator&>(src_op),
+                        tensorflow_graph);
   } else if (src_op.type == OperatorType::kGather) {
     ConvertGatherOperator(model, static_cast<const GatherOperator&>(src_op),
                           tensorflow_graph);
@@ -2247,7 +2275,8 @@ void ConvertOperator(const Model& model, const Operator& src_op,
         model, static_cast<const TensorFlowShapeOperator&>(src_op),
         tensorflow_graph);
   } else if (src_op.type == OperatorType::kRank) {
-    ConvertRankOperator(model, static_cast<const RankOperator&>(src_op),
+    ConvertRankOperator(model,
+                        static_cast<const TensorFlowRankOperator&>(src_op),
                         tensorflow_graph);
   } else if (src_op.type == OperatorType::kRange) {
     ConvertRangeOperator(model, static_cast<const RangeOperator&>(src_op),
@@ -2328,6 +2357,10 @@ void ConvertOperator(const Model& model, const Operator& src_op,
     ConvertZerosLikeOperator(
         model, static_cast<const TensorFlowZerosLikeOperator&>(src_op),
         "ZerosLike", tensorflow_graph);
+  } else if (src_op.type == OperatorType::kReverseV2) {
+    ConvertReverseV2Operator(model,
+                             static_cast<const ReverseV2Operator&>(src_op),
+                             "Reverse_V2", tensorflow_graph);
   } else {
     LOG(FATAL) << "Unhandled operator type " << OperatorTypeName(src_op.type);
   }
diff --git a/tensorflow/lite/toco/graph_transformations/fuse_binary_into_following_affine.cc b/tensorflow/lite/toco/graph_transformations/fuse_binary_into_following_affine.cc
index 436b639253f2e190fcaab895cd077b06796c1ca1..9ea8d8fa5b9792ccc9a9402ddc132462251b00c2 100644
--- a/tensorflow/lite/toco/graph_transformations/fuse_binary_into_following_affine.cc
+++ b/tensorflow/lite/toco/graph_transformations/fuse_binary_into_following_affine.cc
@@ -218,6 +218,12 @@ void FuseMulOrDivParamsIntoFollowingAffine(Model* model, Operator* following_op,
     return ::tensorflow::Status::OK();
   }
 
+  if (CountOpsWithInput(*model, binary_op->outputs[0]) != 1) {
+    AddMessageF("Not fusing %s because it's consumed by multiple ops",
+                LogName(*binary_op));
+    return ::tensorflow::Status::OK();
+  }
+
   Operator* following_op = GetOpWithInput(*model, binary_op->outputs[0]);
 
   if (!following_op) {
@@ -287,9 +293,7 @@ void FuseMulOrDivParamsIntoFollowingAffine(Model* model, Operator* following_op,
   AddMessageF("Fusing %s into the following %s", LogName(*binary_op),
               LogName(*following_op));
 
-  if (CountOpsWithInput(*model, binary_op->outputs[0]) == 1) {
-    model->EraseArray(binary_op->outputs[0]);
-  }
+  model->EraseArray(binary_op->outputs[0]);
 
   following_op->inputs[0] = binary_op->inputs[index_of_variable_input];
   const auto& old_constant_param_name =
diff --git a/tensorflow/lite/toco/graph_transformations/graph_transformations.cc b/tensorflow/lite/toco/graph_transformations/graph_transformations.cc
index a0260e24013bfda8718e0dc04052abb49b65debf..e4eb7698597f588947bc19f5ab449c9d3ff14adc 100644
--- a/tensorflow/lite/toco/graph_transformations/graph_transformations.cc
+++ b/tensorflow/lite/toco/graph_transformations/graph_transformations.cc
@@ -128,7 +128,8 @@ void DiscardUselessConnectedComponentsAndRNNBackEdges(Model* model) {
 }
 
 bool GraphTransformationsPass(int increment, Model* model,
-                              const GraphTransformationsSet& transformations) {
+                              const GraphTransformationsSet& transformations,
+                              tensorflow::Status* status) {
   CHECK(increment == 1 || increment == -1);
   bool changed = false;
   if (model->operators.empty()) {
@@ -142,7 +143,10 @@ bool GraphTransformationsPass(int increment, Model* model,
     for (const auto& transformation : transformations) {
       CHECK(!changed_now);
       CHECK(transformation->Messages().empty());
-      CHECK(transformation->Run(model, op_index, &changed_now).ok());
+      *status = transformation->Run(model, op_index, &changed_now);
+      if (!status->ok()) {
+        return false;
+      }
       const char* made_a_change_msg =
           changed_now ? "made a change" : "did NOT make a change";
       const int log_level =
@@ -186,18 +190,21 @@ bool GraphTransformationsPass(int increment, Model* model,
 
 }  // namespace
 
-void RunGraphTransformations(Model* model, const string& msg,
-                             const GraphTransformationsSet& transformations) {
+tensorflow::Status RunGraphTransformationsWithStatus(
+    Model* model, const string& msg,
+    const GraphTransformationsSet& transformations) {
   PrintModelStats(toco::port::StringF("Before %s", msg), *model);
   int pass_index = 0;
+  tensorflow::Status status;
   while (GraphTransformationsPass((pass_index % 2) ? -1 : 1, model,
-                                  transformations)) {
+                                  transformations, &status)) {
     pass_index++;
     const auto& label =
         toco::port::StringF("After %s pass %d", msg, pass_index);
     PrintModelStats(label, *model);
     CheckInvariants(*model);
   }
+  return status;
 }
 
 }  // namespace toco
diff --git a/tensorflow/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/lite/toco/graph_transformations/graph_transformations.h
index 187b584b6989cc55894160fc5508c13474a1d2d3..d92733ba3b5490b0b77e88e3beb1bbe9d4508a3a 100644
--- a/tensorflow/lite/toco/graph_transformations/graph_transformations.h
+++ b/tensorflow/lite/toco/graph_transformations/graph_transformations.h
@@ -102,8 +102,16 @@ class GraphTransformationsSet {
 // construct GraphTransformation objects by using 'new', pass us
 // the resulting raw pointers, and this RunGraphTransformations
 // takes care of delete'ing these pointers.
-void RunGraphTransformations(Model* model, const string& message,
-                             const GraphTransformationsSet& transformations);
+tensorflow::Status RunGraphTransformationsWithStatus(
+    Model* model, const string& msg,
+    const GraphTransformationsSet& transformations);
+
+inline void RunGraphTransformations(
+    Model* model, const string& msg,
+    const GraphTransformationsSet& transformations) {
+  auto s = RunGraphTransformationsWithStatus(model, msg, transformations);
+  CHECK(s.ok()) << s.error_message();
+}
 
 #define DECLARE_GRAPH_TRANSFORMATION(GTName)                     \
   class GTName : public GraphTransformation {                    \
@@ -127,6 +135,10 @@ DECLARE_GRAPH_TRANSFORMATION(FuseActivationFunctions)
 DECLARE_GRAPH_TRANSFORMATION(FuseBinaryIntoFollowingAffine)
 DECLARE_GRAPH_TRANSFORMATION(FuseBinaryIntoPrecedingAffine)
 DECLARE_GRAPH_TRANSFORMATION(FuseBroadcastIntoFollowingBinary)
+DECLARE_GRAPH_TRANSFORMATION(GroupBidirectionalSequenceLstm)
+DECLARE_GRAPH_TRANSFORMATION(GroupBidirectionalSequenceRnn)
+DECLARE_GRAPH_TRANSFORMATION(GroupDynamicBidirectionalSequenceLstm)
+DECLARE_GRAPH_TRANSFORMATION(GroupDynamicBidirectionalSequenceRnn)
 DECLARE_GRAPH_TRANSFORMATION(IdentifyL2Normalization)
 DECLARE_GRAPH_TRANSFORMATION(IdentifyL2Pool)
 DECLARE_GRAPH_TRANSFORMATION(IdentifyLstmCell)
diff --git a/tensorflow/lite/toco/graph_transformations/group_bidirectional_sequence_ops.cc b/tensorflow/lite/toco/graph_transformations/group_bidirectional_sequence_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..05c85b33d2a2c947eac7ada775d520bd8d3a6641
--- /dev/null
+++ b/tensorflow/lite/toco/graph_transformations/group_bidirectional_sequence_ops.cc
@@ -0,0 +1,624 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cstdio>
+#include <iterator>
+#include <memory>
+#include <stack>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
+
+namespace toco {
+namespace {
+
+std::vector<std::unique_ptr<Operator>>::iterator FindOperator(
+    Model* model, const Operator& op) {
+  return std::find_if(
+      model->operators.begin(), model->operators.end(),
+      [&op](const std::unique_ptr<Operator>& ptr) { return ptr.get() == &op; });
+}
+
+bool MatchTwoUnpackOps(const Operator& op, const Model& model,
+                       Operator** fw_output, Operator** bw_output) {
+  if (op.inputs.size() != 2) {
+    return false;
+  }
+
+  *fw_output = GetOpWithOutput(model, op.inputs[0]);
+  *bw_output = GetOpWithOutput(model, op.inputs[1]);
+  if (*fw_output == nullptr || *bw_output == nullptr) {
+    return false;
+  }
+
+  if ((*fw_output)->type != OperatorType::kUnpack ||
+      (*bw_output)->type != OperatorType::kUnpack) {
+    return false;
+  }
+
+  // TODO(renjieliu): Check the shapes are matching.
+
+  return true;
+}
+
+bool MatchDynamicBidirectionalSequenceOutputs(Operator* op, const Model& model,
+                                              Operator** fw_output,
+                                              Operator** bw_output) {
+  if (op->inputs.size() != 2) {
+    return false;
+  }
+
+  // The concat op is already the fw_rnn_output.
+  *fw_output = op;
+  auto* reverse_output = GetOpWithOutput(model, op->inputs[1]);
+  if (*fw_output == nullptr || reverse_output == nullptr) {
+    return false;
+  }
+
+  if (reverse_output->type != OperatorType::kReverseV2) {
+    return false;
+  }
+
+  *bw_output = reverse_output;
+
+  return true;
+}
+
+bool FindUnidirectionalSequenceOp(const Model& model, const Operator& output_op,
+                                  OperatorType operator_type,
+                                  std::stack<Operator*>* sequence_ops,
+                                  Operator** input_op) {
+  Operator* op_it = nullptr;
+  op_it = GetOpWithOutput(model, output_op.inputs[0]);
+  if (op_it == nullptr) {
+    return false;
+  }
+
+  while (op_it->type == operator_type) {
+    sequence_ops->push(op_it);
+    // Check the first input of the unidirectional sequence op.
+    op_it = GetOpWithOutput(model, op_it->inputs[0]);
+    if (op_it == nullptr) {
+      return false;
+    }
+  }
+
+  *input_op = op_it;
+  return true;
+}
+
+bool CheckTwoUnidirectionalSequenceOpsAreValid(
+    const Model& model,
+    const std::stack<Operator*>& fw_unidirectional_sequence_ops,
+    const std::stack<Operator*>& bw_unidirectional_sequence_ops,
+    const Operator* first_fw_sequence_op_input,
+    const Operator* first_bw_sequence_op_input, bool is_dynamic_rnn) {
+  if (fw_unidirectional_sequence_ops.size() !=
+          bw_unidirectional_sequence_ops.size() ||
+      fw_unidirectional_sequence_ops.empty()) {
+    return false;
+  }
+
+  if (is_dynamic_rnn) {
+    // For dynamic bidirectional sequence ops, bw_sequence will have a reverse
+    // op.
+    if (first_bw_sequence_op_input->type != OperatorType::kReverseV2) {
+      return false;
+    }
+
+    const auto* bw_real_input_op =
+        GetOpWithOutput(model, first_bw_sequence_op_input->inputs[0]);
+    if (first_fw_sequence_op_input != bw_real_input_op) {
+      return false;
+    }
+
+  } else {
+    // For static bidirectional sequence ops, we should have two pack ops.
+    if (first_fw_sequence_op_input->type != OperatorType::kPack ||
+        first_bw_sequence_op_input->type != OperatorType::kPack) {
+      return false;
+    }
+
+    // fw_lstm & bw_lstm should point to the same input, but reversed sequence.
+    for (int i = 0; i < first_fw_sequence_op_input->inputs.size(); ++i) {
+      if (first_fw_sequence_op_input->inputs[i] !=
+          first_bw_sequence_op_input
+              ->inputs[first_fw_sequence_op_input->inputs.size() - i - 1]) {
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+void ConstructBidirectionalSequenceOp(
+    const Operator& fw_lstm_op, const Operator& bw_lstm_op, Model* model,
+    BidirectionalSequenceLstmOperator** bi_op) {
+  // TODO(renjieliu): Check the shapes & configurations are equal.
+  constexpr int kBidirectionalSequenceLstmInputsCount = 47;
+  constexpr int kFwLstmInputsStartIndex = 1;
+  constexpr int kBwLstmInputsStartIndex = 18;
+  constexpr int kFwInputActivationStartIndex = 35;
+  constexpr int kBwInputActivationStartIndex = 37;
+  constexpr int kAuxInputStartIndex = 39;
+  (*bi_op)->inputs.reserve(kBidirectionalSequenceLstmInputsCount);
+  const string& input_array_name =
+      AvailableArrayName(*model, "bidirectional_sequence_lstm_input_0");
+  model->GetOrCreateArray(input_array_name);
+  // The input will be changed later.
+  (*bi_op)->inputs.push_back(input_array_name);
+  int i = 1;
+  // Fill in the fw_lstm weights.
+  for (; i < kBwLstmInputsStartIndex; ++i) {
+    (*bi_op)->inputs.push_back(fw_lstm_op.inputs[i]);
+  }
+
+  // Fill in the bw_lstm weights. bidirectional lstm backward weights start
+  // from 18.
+  for (; i < kFwInputActivationStartIndex; ++i) {
+    (*bi_op)->inputs.push_back(
+        bw_lstm_op
+            .inputs[i - (kBwLstmInputsStartIndex - kFwLstmInputsStartIndex)]);
+  }
+
+  // Fill in fw_lstm previous states.
+  for (; i < kBwInputActivationStartIndex; ++i) {
+    (*bi_op)->inputs.push_back(
+        fw_lstm_op.inputs[i - (kFwInputActivationStartIndex -
+                               kBwLstmInputsStartIndex)]);
+  }
+
+  // Fill in bw_lstm previous states.
+  for (; i < kAuxInputStartIndex; ++i) {
+    (*bi_op)->inputs.push_back(
+        bw_lstm_op.inputs[i - (kBwInputActivationStartIndex -
+                               kBwLstmInputsStartIndex)]);
+  }
+
+  // TODO(renjieliu): Deal with Auxiliary input and weights for 39 - 47.
+  for (; i <= kBidirectionalSequenceLstmInputsCount; ++i) {
+    const string& temp_array_name = AvailableArrayName(
+        *model, "bidirectional_sequence_lstm_temp_" + std::to_string(i));
+    model->CreateOptionalArray(temp_array_name);
+    (*bi_op)->inputs.push_back(temp_array_name);
+  }
+
+  // Deal with outputs.
+  (*bi_op)->outputs.reserve(2);
+  const string& fw_output_array_name =
+      AvailableArrayName(*model, "bidirectional_sequence_lstm_fw_output_0");
+  const string& bw_output_array_name =
+      AvailableArrayName(*model, "bidirectional_sequence_lstm_bw_output_0");
+  model->GetOrCreateArray(fw_output_array_name);
+  model->GetOrCreateArray(bw_output_array_name);
+  (*bi_op)->outputs.push_back(fw_output_array_name);
+  (*bi_op)->outputs.push_back(bw_output_array_name);
+  (*bi_op)->merge_outputs = false;
+}
+
+void ConstructBidirectionalSequenceOp(
+    const Operator& fw_rnn_op, const Operator& bw_rnn_op, Model* model,
+    BidirectionalSequenceRnnOperator** bi_op) {
+  // TODO(renjieliu): Check the shapes & configurations are equal.
+  constexpr int kBidirectionalSequenceRnnInputsCount = 12;
+  constexpr int kFwInputsStartIndex = 1;
+  constexpr int kBwInputsStartIndex = 5;
+  constexpr int kAuxInputsStartIndex = 9;
+  (*bi_op)->inputs.reserve(kBidirectionalSequenceRnnInputsCount);
+  const string& input_array_name =
+      AvailableArrayName(*model, "bidirectional_sequence_rnn_input_0");
+  model->GetOrCreateArray(input_array_name);
+  // The input will be changed later.
+  (*bi_op)->inputs.push_back(input_array_name);
+  int i = 1;
+
+  // Fill in the fw_rnn weights.
+  for (; i < kBwInputsStartIndex; ++i) {
+    (*bi_op)->inputs.push_back(fw_rnn_op.inputs[i]);
+  }
+
+  // Fill in the bw_rnn weights.
+  for (; i < kAuxInputsStartIndex; ++i) {
+    (*bi_op)->inputs.push_back(
+        bw_rnn_op.inputs[i - (kBwInputsStartIndex - kFwInputsStartIndex)]);
+  }
+
+  // TODO(renjieliu): Deal with optional weights.
+  for (; i < kBidirectionalSequenceRnnInputsCount; ++i) {
+    const string& temp_array_name = AvailableArrayName(
+        *model, "bidirectional_sequence_rnn_temp_" + std::to_string(i));
+    model->CreateOptionalArray(temp_array_name);
+    (*bi_op)->inputs.push_back(temp_array_name);
+  }
+
+  // Deal with outputs.
+  (*bi_op)->outputs.reserve(2);
+  const string& fw_output_array_name =
+      AvailableArrayName(*model, "bidirectional_sequence_rnn_fw_output_0");
+  const string& bw_output_array_name =
+      AvailableArrayName(*model, "bidirectional_sequence_rnn_bw_output_0");
+  model->GetOrCreateArray(fw_output_array_name);
+  model->GetOrCreateArray(bw_output_array_name);
+  (*bi_op)->outputs.push_back(fw_output_array_name);
+  (*bi_op)->outputs.push_back(bw_output_array_name);
+  (*bi_op)->merge_outputs = false;
+}
+
+template <typename T>
+void GroupFwBwSequenceOps(Model* model, std::stack<Operator*> fw_sequence_ops,
+                          std::stack<Operator*> bw_sequence_ops,
+                          std::vector<T*>* bidirectional_sequence_ops) {
+  while (!fw_sequence_ops.empty()) {
+    Operator* fw_sequence_op = fw_sequence_ops.top();
+    Operator* bw_sequence_op = bw_sequence_ops.top();
+    T* bidirectional_sequence_op = new T;
+    ConstructBidirectionalSequenceOp(*fw_sequence_op, *bw_sequence_op, model,
+                                     &bidirectional_sequence_op);
+
+    bidirectional_sequence_ops->push_back(bidirectional_sequence_op);
+    fw_sequence_ops.pop();
+    bw_sequence_ops.pop();
+  }
+}
+
+template <typename T>
+void RewireBidirectionalSequenceSequenceOpsConnections(
+    OperatorType operator_type, const string& input_array_name,
+    const std::vector<T*>& bidirectional_sequence_ops,
+    std::vector<std::unique_ptr<Operator>>::iterator* op_it, Model* model) {
+  int aux_input_index = -1;
+  switch (operator_type) {
+    case OperatorType::kBidirectionalSequenceLstm:
+      aux_input_index = 39;
+      break;
+    case OperatorType::kBidirectionalSequenceRnn:
+      aux_input_index = 9;
+      break;
+    default:
+      // Should not reach here.
+      DCHECK(false);
+  }
+  string cur_fw_input = input_array_name;
+  string cur_bw_input = input_array_name;
+  for (int i = 0; i < bidirectional_sequence_ops.size(); ++i) {
+    DeleteArrayIfUsedOnce(bidirectional_sequence_ops[i]->inputs[0], model);
+    bidirectional_sequence_ops[i]->inputs[0] = cur_fw_input;
+    if (i != 0) {
+      DeleteArrayIfUsedOnce(
+          bidirectional_sequence_ops[i]->inputs[aux_input_index], model);
+      bidirectional_sequence_ops[i]->inputs[aux_input_index] = cur_bw_input;
+    }
+    cur_fw_input = bidirectional_sequence_ops[i]->outputs[0];
+    cur_bw_input = bidirectional_sequence_ops[i]->outputs[1];
+    if (i != (bidirectional_sequence_ops.size() - 1)) {
+      bidirectional_sequence_ops[i]->merge_outputs = false;
+    } else {
+      // TODO(renjieliu): We need to check whether the outputs of the last bidi
+      // lstms needs merged outputs or not.
+      bidirectional_sequence_ops[i]->merge_outputs = true;
+      DeleteArrayIfUnused(bidirectional_sequence_ops[i]->outputs[1], model);
+      bidirectional_sequence_ops[i]->outputs.pop_back();
+    }
+    model->operators.emplace(*op_it, bidirectional_sequence_ops[i]);
+    *op_it += 1;
+  }
+}
+
+template <typename T>
+void RewireFinalUnpackOutputs(const UnpackOperator& original_unpack_operator,
+                              UnpackOperator** final_unpack_operator,
+                              T** final_bidi_sequence_operator, Model* model) {
+  (*final_unpack_operator)
+      ->inputs.push_back((*final_bidi_sequence_operator)->outputs[0]);
+  (*final_unpack_operator)->axis = original_unpack_operator.axis;
+  (*final_unpack_operator)->num = original_unpack_operator.num;
+
+  for (int i = 0; i < original_unpack_operator.outputs.size(); ++i) {
+    const string& output_array_name = original_unpack_operator.outputs[i];
+    const string& final_unpack_output_array_name = AvailableArrayName(
+        *model, "bidirectional_sequence_unpack_" + std::to_string(i));
+    model->GetOrCreateArray(final_unpack_output_array_name);
+    (*final_unpack_operator)->outputs.push_back(final_unpack_output_array_name);
+    Operator* unpack_following_op = GetOpWithInput(*model, output_array_name);
+    if (unpack_following_op != nullptr) {
+      // If there's a following op after the unpack, it must be a concat op.
+      DCHECK(unpack_following_op->type == OperatorType::kConcatenation);
+      // For every output of the concat, rewire the outputs.
+      for (const string& concat_output : unpack_following_op->outputs) {
+        (*final_unpack_operator)->outputs[i] = concat_output;
+      }
+      // Remove the concat op.
+      model->operators.erase(FindOperator(model, *unpack_following_op));
+    }
+  }
+}
+
+void RemoveUnpackOperator(const Operator& unpack_op, Model* model) {
+  for (const string& output_array_name : unpack_op.outputs) {
+    DeleteArrayIfUnused(output_array_name, model);
+  }
+  model->operators.erase(FindOperator(model, unpack_op));
+}
+
+void RemoveUnidirectionalSequenceOps(std::stack<Operator*> uni_sequence_ops,
+                                     Model* model) {
+  while (!uni_sequence_ops.empty()) {
+    Operator* uni_sequence_op = uni_sequence_ops.top();
+    DeleteArrayIfUnused(uni_sequence_op->outputs[0], model);
+    model->operators.erase(FindOperator(model, *uni_sequence_op));
+    uni_sequence_ops.pop();
+  }
+}
+
+template <typename T>
+::tensorflow::Status GroupDynamicSequenceOps(Model* model, std::size_t op_index,
+                                             OperatorType operator_type,
+                                             bool* modified) {
+  *modified = false;
+
+  // We assume there's a concatenation right after the bidirectional sequence
+  // ops, it may not be the case.
+  auto op_it = model->operators.begin() + op_index;
+  Operator* final_concat_op = op_it->get();
+  if (final_concat_op->type != OperatorType::kConcatenation &&
+      final_concat_op->type != OperatorType::kConcat &&
+      final_concat_op->type != OperatorType::kConcatV2) {
+    return ::tensorflow::Status::OK();
+  }
+
+  // for bw, there will be a reverse op at the end.
+  Operator *fw_sequence_output, *bw_sequence_output;
+  if (!MatchDynamicBidirectionalSequenceOutputs(
+          final_concat_op, *model, &fw_sequence_output, &bw_sequence_output)) {
+    return ::tensorflow::Status::OK();
+  }
+
+  // Find all upstream unidirectional sequence ops.
+  std::stack<Operator*> fw_unidirectional_sequence_ops,
+      bw_unidirectional_sequence_ops;
+  OperatorType unidirectional_op_type;
+  if (operator_type == OperatorType::kBidirectionalSequenceLstm) {
+    unidirectional_op_type = OperatorType::kUnidirectionalSequenceLstm;
+  } else {
+    unidirectional_op_type = OperatorType::kUnidirectionalSequenceRnn;
+  }
+  Operator *first_fw_sequence_input, *first_bw_sequence_input;
+  if (!FindUnidirectionalSequenceOp(
+          *model, *fw_sequence_output, unidirectional_op_type,
+          &fw_unidirectional_sequence_ops, &first_fw_sequence_input) ||
+      !FindUnidirectionalSequenceOp(
+          *model, *bw_sequence_output, unidirectional_op_type,
+          &bw_unidirectional_sequence_ops, &first_bw_sequence_input)) {
+    return ::tensorflow::Status::OK();
+  }
+
+  if (!CheckTwoUnidirectionalSequenceOpsAreValid(
+          *model, fw_unidirectional_sequence_ops,
+          bw_unidirectional_sequence_ops, first_fw_sequence_input,
+          first_bw_sequence_input, /*is_dynamic_rnn=*/true)) {
+    return ::tensorflow::Status::OK();
+  }
+
+  // TODO(b/125143808): Before really group the fw & bw sequence ops and
+  // modified the model, we should check both the fw & bw sequence ops have the
+  // same data_type, inputs_shapes, output_shapes etc.
+  std::vector<T> bidirectional_sequence_ops;
+  GroupFwBwSequenceOps(model, fw_unidirectional_sequence_ops,
+                       bw_unidirectional_sequence_ops,
+                       &bidirectional_sequence_ops);
+
+  // Rewire the inputs & outputs.
+  string current_input = first_fw_sequence_input->outputs[0];
+  RewireBidirectionalSequenceSequenceOpsConnections(
+      operator_type, current_input, bidirectional_sequence_ops, &op_it, model);
+
+  // Change last bidirectional sequence rnn output to the concat output.
+  bidirectional_sequence_ops[bidirectional_sequence_ops.size() - 1]
+      ->outputs[0] = final_concat_op->outputs[0];
+
+  // Delete unused ops.
+  RemoveUnidirectionalSequenceOps(fw_unidirectional_sequence_ops, model);
+  RemoveUnidirectionalSequenceOps(bw_unidirectional_sequence_ops, model);
+
+  DeleteArrayIfUnused(final_concat_op->inputs[0], model);
+  DeleteArrayIfUnused(final_concat_op->inputs[1], model);
+  model->operators.erase(FindOperator(model, *final_concat_op));
+
+  // Only keep the fw lstm's input.
+  DeleteArrayIfUnused(first_bw_sequence_input->outputs[0], model);
+  model->operators.erase(FindOperator(model, *first_bw_sequence_input));
+  *modified = true;
+  return ::tensorflow::Status::OK();
+}
+
+}  // namespace
+
+::tensorflow::Status GroupBidirectionalSequenceLstm::Run(Model* model,
+                                                         std::size_t op_index,
+                                                         bool* modified) {
+  *modified = false;
+  // Bidirectional sequence lstm will generate two separate unidirectional
+  // sequence lstm ops, for static bidirectional sequence lstm, there will be
+  // a concatenation op at very end; for dynamic bidirectional sequence lstm,
+  // it is not guaranteed, but currently we do not support that.
+  auto op_it = model->operators.begin() + op_index;
+  Operator* final_concat_op = op_it->get();
+  if (final_concat_op->type != OperatorType::kConcatenation &&
+      final_concat_op->type != OperatorType::kConcat &&
+      final_concat_op->type != OperatorType::kConcatV2) {
+    return ::tensorflow::Status::OK();
+  }
+
+  // Match fw unidirectional lstm outputs and bw unidirectional lstm outputs:
+  // should be two unstack ops.
+  Operator *fw_lstm_output, *bw_lstm_output;
+  if (!MatchTwoUnpackOps(*final_concat_op, *model, &fw_lstm_output,
+                         &bw_lstm_output)) {
+    return ::tensorflow::Status::OK();
+  }
+
+  // Find all upstream unidirectional lstm ops.
+  std::stack<Operator*> fw_unidirectional_sequence_lstm_ops,
+      bw_unidirectional_sequence_lstm_ops;
+  Operator *first_fw_lstm_input, *first_bw_lstm_input;
+  if (!FindUnidirectionalSequenceOp(
+          *model, *fw_lstm_output, OperatorType::kUnidirectionalSequenceLstm,
+          &fw_unidirectional_sequence_lstm_ops, &first_fw_lstm_input) ||
+      !FindUnidirectionalSequenceOp(
+          *model, *bw_lstm_output, OperatorType::kUnidirectionalSequenceLstm,
+          &bw_unidirectional_sequence_lstm_ops, &first_bw_lstm_input)) {
+    return ::tensorflow::Status::OK();
+  }
+
+  if (!CheckTwoUnidirectionalSequenceOpsAreValid(
+          *model, fw_unidirectional_sequence_lstm_ops,
+          bw_unidirectional_sequence_lstm_ops, first_fw_lstm_input,
+          first_bw_lstm_input, /*is_dynamic_rnn=*/false)) {
+    return ::tensorflow::Status::OK();
+  }
+
+  // TODO(b/125143808): Before really group the fw & bw sequence ops and
+  // modified the model, we should check both the fw & bw sequence ops have the
+  // same data_type, inputs_shapes, output_shapes etc.
+  std::vector<BidirectionalSequenceLstmOperator*>
+      bidirectional_sequence_lstm_ops;
+  GroupFwBwSequenceOps(model, fw_unidirectional_sequence_lstm_ops,
+                       bw_unidirectional_sequence_lstm_ops,
+                       &bidirectional_sequence_lstm_ops);
+
+  // Rewire the inputs & outputs.
+  string current_input = first_fw_lstm_input->outputs[0];
+  RewireBidirectionalSequenceSequenceOpsConnections(
+      OperatorType::kBidirectionalSequenceLstm, current_input,
+      bidirectional_sequence_lstm_ops, &op_it, model);
+
+  // Insert a unpack op for the output.
+  UnpackOperator* unpack_operator = new UnpackOperator;
+
+  RewireFinalUnpackOutputs(
+      static_cast<const UnpackOperator&>(*fw_lstm_output), &unpack_operator,
+      &bidirectional_sequence_lstm_ops[bidirectional_sequence_lstm_ops.size() -
+                                       1],
+      model);
+  model->operators.emplace(op_it, unpack_operator);
+
+  // Delete unused ops.
+  RemoveUnpackOperator(*fw_lstm_output, model);
+  RemoveUnpackOperator(*bw_lstm_output, model);
+  RemoveUnidirectionalSequenceOps(fw_unidirectional_sequence_lstm_ops, model);
+  RemoveUnidirectionalSequenceOps(bw_unidirectional_sequence_lstm_ops, model);
+  // Only keep the fw lstm's pack input.
+  DeleteArrayIfUnused(first_bw_lstm_input->outputs[0], model);
+  model->operators.erase(FindOperator(model, *first_bw_lstm_input));
+  *modified = true;
+  return ::tensorflow::Status::OK();
+}
+
+::tensorflow::Status GroupBidirectionalSequenceRnn::Run(Model* model,
+                                                        std::size_t op_index,
+                                                        bool* modified) {
+  *modified = false;
+  // Bidirectional sequence rnn will generate two separate unidirectional
+  // sequence rnn ops, for static bidirectional sequence rnn, there will be
+  // a concatenation op at very end; for dynamic bidirectional sequence rnn,
+  // it is not guaranteed, but currently we do not support that.
+  auto op_it = model->operators.begin() + op_index;
+  Operator* final_concat_op = op_it->get();
+  if (final_concat_op->type != OperatorType::kConcatenation &&
+      final_concat_op->type != OperatorType::kConcat &&
+      final_concat_op->type != OperatorType::kConcatV2) {
+    return ::tensorflow::Status::OK();
+  }
+
+  // Match fw unidirectional rnn outputs and bw unidirectional rnn outputs:
+  // should be two unstack ops.
+  Operator *fw_rnn_output, *bw_rnn_output;
+  if (!MatchTwoUnpackOps(*final_concat_op, *model, &fw_rnn_output,
+                         &bw_rnn_output)) {
+    return ::tensorflow::Status::OK();
+  }
+
+  // Find all upstream unidirectional rnn ops.
+  std::stack<Operator*> fw_unidirectional_sequence_rnn_ops,
+      bw_unidirectional_sequence_rnn_ops;
+  Operator *first_fw_rnn_input, *first_bw_rnn_input;
+  if (!FindUnidirectionalSequenceOp(
+          *model, *fw_rnn_output, OperatorType::kUnidirectionalSequenceRnn,
+          &fw_unidirectional_sequence_rnn_ops, &first_fw_rnn_input) ||
+      !FindUnidirectionalSequenceOp(
+          *model, *bw_rnn_output, OperatorType::kUnidirectionalSequenceRnn,
+          &bw_unidirectional_sequence_rnn_ops, &first_bw_rnn_input)) {
+    return ::tensorflow::Status::OK();
+  }
+
+  if (!CheckTwoUnidirectionalSequenceOpsAreValid(
+          *model, fw_unidirectional_sequence_rnn_ops,
+          bw_unidirectional_sequence_rnn_ops, first_fw_rnn_input,
+          first_bw_rnn_input, /*is_dynamic_rnn=*/false)) {
+    return ::tensorflow::Status::OK();
+  }
+
+  // TODO(b/125143808): Before really group the fw & bw sequence ops and
+  // modified the model, we should check both the fw & bw sequence ops have the
+  // same data_type, inputs_shapes, output_shapes etc.
+  std::vector<BidirectionalSequenceRnnOperator*> bidirectional_sequence_rnn_ops;
+  GroupFwBwSequenceOps(model, fw_unidirectional_sequence_rnn_ops,
+                       bw_unidirectional_sequence_rnn_ops,
+                       &bidirectional_sequence_rnn_ops);
+
+  // Rewire the inputs & outputs.
+  string current_input = first_fw_rnn_input->outputs[0];
+  RewireBidirectionalSequenceSequenceOpsConnections(
+      OperatorType::kBidirectionalSequenceRnn, current_input,
+      bidirectional_sequence_rnn_ops, &op_it, model);
+
+  // Insert a unpack op for the output.
+  UnpackOperator* unpack_operator = new UnpackOperator;
+  RewireFinalUnpackOutputs(
+      static_cast<const UnpackOperator&>(*fw_rnn_output), &unpack_operator,
+      &bidirectional_sequence_rnn_ops[bidirectional_sequence_rnn_ops.size() -
+                                      1],
+      model);
+  model->operators.emplace(op_it, unpack_operator);
+
+  // Delete unused ops.
+  RemoveUnpackOperator(*fw_rnn_output, model);
+  RemoveUnpackOperator(*bw_rnn_output, model);
+  RemoveUnidirectionalSequenceOps(fw_unidirectional_sequence_rnn_ops, model);
+  RemoveUnidirectionalSequenceOps(bw_unidirectional_sequence_rnn_ops, model);
+  // Only keep the fw rnn's pack input.
+  DeleteArrayIfUnused(first_bw_rnn_input->outputs[0], model);
+  model->operators.erase(FindOperator(model, *first_bw_rnn_input));
+  *modified = true;
+  return ::tensorflow::Status::OK();
+}
+
+::tensorflow::Status GroupDynamicBidirectionalSequenceRnn::Run(
+    Model* model, std::size_t op_index, bool* modified) {
+  return GroupDynamicSequenceOps<BidirectionalSequenceRnnOperator*>(
+      model, op_index, OperatorType::kBidirectionalSequenceRnn, modified);
+}
+
+::tensorflow::Status GroupDynamicBidirectionalSequenceLstm::Run(
+    Model* model, std::size_t op_index, bool* modified) {
+  return GroupDynamicSequenceOps<BidirectionalSequenceLstmOperator*>(
+      model, op_index, OperatorType::kBidirectionalSequenceLstm, modified);
+}
+
+}  // namespace toco
diff --git a/tensorflow/lite/toco/graph_transformations/hardcode_min_max.cc b/tensorflow/lite/toco/graph_transformations/hardcode_min_max.cc
index 2e41767095fb3cde09a7fb5d690ac57b1cfcd762..6882a19801538f64e71e317d6c947dd2316815c1 100644
--- a/tensorflow/lite/toco/graph_transformations/hardcode_min_max.cc
+++ b/tensorflow/lite/toco/graph_transformations/hardcode_min_max.cc
@@ -440,6 +440,8 @@ bool HardcodeMinMaxForLstmCell(Model* model, Operator* op) {
     case OperatorType::kGather:
     case OperatorType::kTranspose:
     case OperatorType::kMean:
+    case OperatorType::kReduceMax:
+    case OperatorType::kReduceMin:
       changed = HardcodeMinMaxFromFirstInput(model, op);
       break;
     case OperatorType::kSum:
@@ -448,7 +450,7 @@ bool HardcodeMinMaxForLstmCell(Model* model, Operator* op) {
       // in special circumstances like when computing expected value using
       // reduce_sum the input range and the output range matches. Hence the
       // below code would act as a fallback. If a fake_quant node is observed in
-      // the output that takes precendence over the hard coding logic below.
+      // the output that takes precedence over the hard coding logic below.
       changed = HardcodeMinMaxFromFirstInput(model, op);
       if (changed) {
         LOG(WARNING) << "Using the input range for output in reduce_sum op."
diff --git a/tensorflow/lite/toco/graph_transformations/identify_l2_normalization.cc b/tensorflow/lite/toco/graph_transformations/identify_l2_normalization.cc
index dabd4bd209f450645d12b76c782b36fa5198f84a..3b7c88ac62e48e6a8a571cfd046cc50c2c35f813 100644
--- a/tensorflow/lite/toco/graph_transformations/identify_l2_normalization.cc
+++ b/tensorflow/lite/toco/graph_transformations/identify_l2_normalization.cc
@@ -151,20 +151,12 @@ std::vector<std::unique_ptr<Operator>>::iterator FindOperator(
 
   // Erase the subgraph that is now replaced by L2Normalization
   model->operators.erase(FindOperator(model, square_op));
-  model->EraseArray(sum_op->inputs[0]);
-  if (sum_op->inputs.size() > 1) {
-    model->EraseArray(sum_op->inputs[1]);
-  }
-  model->operators.erase(FindOperator(model, sum_op));
+  DeleteOpAndArraysIfUnused(model, sum_op);
   if (add_op) {
-    model->EraseArray(add_op->inputs[0]);
-    model->EraseArray(add_op->inputs[1]);
-    model->operators.erase(FindOperator(model, add_op));
+    DeleteOpAndArraysIfUnused(model, add_op);
   }
-  model->EraseArray(sqrt_or_rsqrt_op->inputs[0]);
-  model->operators.erase(FindOperator(model, sqrt_or_rsqrt_op));
-  model->EraseArray(div_or_mul_op->inputs[1]);
-  model->operators.erase(FindOperator(model, div_or_mul_op));
+  DeleteOpAndArraysIfUnused(model, sqrt_or_rsqrt_op);
+  DeleteOpAndArraysIfUnused(model, div_or_mul_op);
   *modified = true;
   return ::tensorflow::Status::OK();
 }
diff --git a/tensorflow/lite/toco/graph_transformations/identify_lstm.cc b/tensorflow/lite/toco/graph_transformations/identify_lstm.cc
index 089ecee959a3ab80474782a88fa176b7a9f42001..65dbb8a1766a6aae4347435b392ff4af49e3d44e 100644
--- a/tensorflow/lite/toco/graph_transformations/identify_lstm.cc
+++ b/tensorflow/lite/toco/graph_transformations/identify_lstm.cc
@@ -147,12 +147,26 @@ bool MatchOperatorInputs(const Operator& op, const Model& model,
   if (final_output_mul->type != OperatorType::kMul) {
     return ::tensorflow::Status::OK();
   }
+  // final_output_mul->outputs[0] would be one of the two outputs of our
+  // LstmCell. Exit if it does not already have a data type.
+  // We won't be able to propagate data types through a fused LstmCell.
+  if (model->GetArray(final_output_mul->outputs[0]).data_type ==
+      ArrayDataType::kNone) {
+    return ::tensorflow::Status::OK();
+  }
   Operator *state_output_tanh, *fc_output_sig;
   if (!MatchOperatorInputs(*final_output_mul, *model, OperatorType::kTanh,
                            &state_output_tanh, OperatorType::kLogistic,
                            &fc_output_sig)) {
     return ::tensorflow::Status::OK();
   }
+  // state_output_tanh->inputs[0] would be one of the two outputs of our
+  // LstmCell. Exit if it does not already have a data type.
+  // We won't be able to propagate data types through a fused LstmCell.
+  if (model->GetArray(state_output_tanh->inputs[0]).data_type ==
+      ArrayDataType::kNone) {
+    return ::tensorflow::Status::OK();
+  }
 
   // State output TanH
   // (We don't count an operator as ID'd until we verify it has the correct
@@ -262,11 +276,15 @@ bool MatchOperatorInputs(const Operator& op, const Model& model,
       lstm_cell_op->outputs[LstmCellOperator::ACTIV_OUTPUT]));
   const string& concat_temp_array_name =
       AvailableArrayName(*model, base_name + "concat_temp");
-  model->GetOrCreateArray(concat_temp_array_name);
+  auto& concat_temp_array = model->GetOrCreateArray(concat_temp_array_name);
+  concat_temp_array.data_type =
+      model->GetArray(concat_inputs->outputs[0]).data_type;
   lstm_cell_op->outputs[LstmCellOperator::CONCAT_TEMP] = concat_temp_array_name;
   const string& activ_temp_array_name =
       AvailableArrayName(*model, base_name + "activ_temp");
-  model->GetOrCreateArray(activ_temp_array_name);
+  auto& activ_temp_array = model->GetOrCreateArray(activ_temp_array_name);
+  activ_temp_array.data_type =
+      model->GetArray(fully_connected->outputs[0]).data_type;
   lstm_cell_op->outputs[LstmCellOperator::ACTIV_TEMP] = activ_temp_array_name;
   AddMessageF("Created temp outputs %s and %s on operator %s",
               concat_temp_array_name, activ_temp_array_name,
diff --git a/tensorflow/lite/toco/graph_transformations/propagate_array_data_types.cc b/tensorflow/lite/toco/graph_transformations/propagate_array_data_types.cc
index cbae6610d7f4703a898d8d6f35351a09cd70173c..cb66a2372fdd3edf484902c336821b35befae48d 100644
--- a/tensorflow/lite/toco/graph_transformations/propagate_array_data_types.cc
+++ b/tensorflow/lite/toco/graph_transformations/propagate_array_data_types.cc
@@ -252,6 +252,40 @@ void SetDataTypeForAllOutputs(Model* model, Operator* op,
       SetDataTypeForAllOutputs(model, op, data_type);
       break;
     }
+    case OperatorType::kUnidirectionalSequenceRnn: {
+      const ArrayDataType data_type = model->GetArray(op->inputs[0]).data_type;
+      if (data_type != ArrayDataType::kFloat) return ::tensorflow::Status::OK();
+      SetDataTypeForAllOutputs(model, op, data_type);
+      break;
+    }
+    case OperatorType::kUnique: {
+      CHECK_EQ(op->outputs.size(), 2);
+      const UniqueOperator* unique_op = static_cast<UniqueOperator*>(op);
+      const ArrayDataType data_type = model->GetArray(op->inputs[0]).data_type;
+      model->GetArray(op->outputs[0]).data_type = data_type;
+      model->GetArray(op->outputs[1]).data_type = unique_op->idx_out_type;
+      break;
+    }
+    case OperatorType::kBidirectionalSequenceLstm: {
+      const ArrayDataType data_type = model->GetArray(op->inputs[0]).data_type;
+      if (data_type != ArrayDataType::kFloat) return ::tensorflow::Status::OK();
+      SetDataTypeForAllOutputs(model, op, data_type);
+      break;
+    }
+    case OperatorType::kBidirectionalSequenceRnn: {
+      const ArrayDataType data_type = model->GetArray(op->inputs[0]).data_type;
+      if (data_type != ArrayDataType::kFloat) return ::tensorflow::Status::OK();
+      SetDataTypeForAllOutputs(model, op, data_type);
+      break;
+    }
+    case OperatorType::kLstmCell: {
+      // It's tricky to propagate data types through a LstmCell, as that has
+      // multiple inputs and outputs, and there are quantized cases with
+      // mixed (8bit vs 16bit) cases. Fortunately, that should never be needed,
+      // as the data formats, such as TFLITE, that have LstmCell nodes, also
+      // have data type fields for all their arrays.
+      break;
+    }
     default: {
       // These operators produce outputs with the same type as their 1st input
       CHECK_GT(op->inputs.size(), 0);
diff --git a/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index 0e653f08a04f237c861038639a1469eb62f35dfa..1ea4b298834b66c616a4b1a14d57a55481283ebb 100644
--- a/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -1109,6 +1109,154 @@ void ProcessUnidirectionalSequenceLstmOperator(
   output_shape->ReplaceDims({timestamp, batch_size, output_size});
 }
 
+void ProcessUnidirectionalSequenceRnnOperator(
+    Model* model, UnidirectionalSequenceRnnOperator* op) {
+  auto& output_array = model->GetArray(op->outputs[0]);
+  if (output_array.has_shape()) {
+    // Shape already propagated.
+    return;
+  }
+
+  if (output_array.data_type == ArrayDataType::kNone) {
+    // Yield until the output type has been set by PropagateArrayDataTypes
+    return;
+  }
+
+  // TODO(renjieliu): check the inputs, as well as all kinds of weights.
+  const auto& input_array = model->GetArray(op->inputs[0]);
+  // Yield until input dims have been resolved.
+  if (!input_array.has_shape()) {
+    return;
+  }
+  const auto& input_shape = input_array.shape();
+  const int batch_size = input_shape.dims(1);
+  const int timestamp = input_shape.dims(0);
+
+  const auto& bias_array = model->GetArray(op->inputs[3]);
+  // Yield until input dims have been resolved.
+  if (!bias_array.has_shape()) {
+    return;
+  }
+
+  constexpr int kHiddenStateTensor = 4;
+  // b(115961645): This is a hack to work around.
+  model->GetArray(op->inputs[kHiddenStateTensor]).buffer.reset();
+
+  const auto& bias_shape = bias_array.shape();
+  const int output_size = bias_shape.dims(0);
+
+  Shape* output_shape = output_array.mutable_shape();
+  output_shape->ReplaceDims({timestamp, batch_size, output_size});
+}
+
+void ProcessBidirectionalSequenceLstmOperator(
+    Model* model, BidirectionalSequenceLstmOperator* op) {
+  // We assume time major.
+  auto& fw_output_array = model->GetArray(op->outputs[0]);
+  auto& bw_output_array = model->GetArray(op->outputs[1]);
+  if (fw_output_array.has_shape()) {
+    // Shape already propagated
+    return;
+  }
+
+  if (fw_output_array.data_type == ArrayDataType::kNone) {
+    // Yield until the output type has been set by PropagateArrayDataTypes
+    return;
+  }
+
+  // TODO(renjieliu): check the inputs, as well as all kinds of weights.
+  const auto& input_array = model->GetArray(op->inputs[0]);
+  // Yield until input dims have been resolved.
+  if (!input_array.has_shape()) {
+    return;
+  }
+  const auto& input_shape = input_array.shape();
+  const int batch_size = input_shape.dims(1);
+  const int timestamp = input_shape.dims(0);
+
+  constexpr int kBwRecurrentToOutputWeightsTensor = 25;
+  const auto& recurrent_to_output_weights_array =
+      model->GetArray(op->inputs[kBwRecurrentToOutputWeightsTensor]);
+  // Yield until input dims have been resolved.
+  if (!recurrent_to_output_weights_array.has_shape()) {
+    return;
+  }
+
+  constexpr int kFwInputActivationStateTensor = 35;
+  constexpr int kFwInputCellStateTensor = 36;
+  constexpr int kBwInputActivationStateTensor = 37;
+  constexpr int kBwInputCellStateTensor = 38;
+  // b(115961645): This is a hack to work around.
+  model->GetArray(op->inputs[kFwInputActivationStateTensor]).buffer.reset();
+  model->GetArray(op->inputs[kFwInputCellStateTensor]).buffer.reset();
+  model->GetArray(op->inputs[kBwInputActivationStateTensor]).buffer.reset();
+  model->GetArray(op->inputs[kBwInputCellStateTensor]).buffer.reset();
+
+  const auto& output_weights_shape = recurrent_to_output_weights_array.shape();
+  const int output_size = output_weights_shape.dims(1);
+
+  Shape* fw_output_shape = fw_output_array.mutable_shape();
+  if (op->merge_outputs) {
+    fw_output_shape->ReplaceDims({timestamp, batch_size, 2 * output_size});
+  } else {
+    fw_output_shape->ReplaceDims({timestamp, batch_size, output_size});
+    Shape* bw_output_shape = bw_output_array.mutable_shape();
+    bw_output_shape->ReplaceDims({timestamp, batch_size, output_size});
+  }
+}
+
+void ProcessBidirectionalSequenceRnnOperator(
+    Model* model, BidirectionalSequenceRnnOperator* op) {
+  // We assume time major.
+  auto& fw_output_array = model->GetArray(op->outputs[0]);
+  auto& bw_output_array = model->GetArray(op->outputs[1]);
+  if (fw_output_array.has_shape()) {
+    // Shape already propagated
+    return;
+  }
+
+  if (fw_output_array.data_type == ArrayDataType::kNone) {
+    // Yield until the output type has been set by PropagateArrayDataTypes
+    return;
+  }
+
+  // TODO(renjieliu): check the inputs, as well as all kinds of weights.
+  const auto& input_array = model->GetArray(op->inputs[0]);
+  // Yield until input dims have been resolved.
+  if (!input_array.has_shape()) {
+    return;
+  }
+  const auto& input_shape = input_array.shape();
+  const int batch_size = input_shape.dims(1);
+  const int timestamp = input_shape.dims(0);
+
+  constexpr int kFwWeightsTensor = 1;
+  const auto& forward_weights_array =
+      model->GetArray(op->inputs[kFwWeightsTensor]);
+  // Yield until input dims have been resolved.
+  if (!forward_weights_array.has_shape()) {
+    return;
+  }
+
+  constexpr int kFwHiddenStateTensor = 4;
+  constexpr int kBwHiddenStateTensor = 8;
+  // b(115961645): This is a hack to work around.
+  model->GetArray(op->inputs[kFwHiddenStateTensor]).buffer.reset();
+  model->GetArray(op->inputs[kBwHiddenStateTensor]).buffer.reset();
+
+  const auto& output_weights_shape = forward_weights_array.shape();
+  const int output_size = output_weights_shape.dims(0);
+
+  Shape* fw_output_shape = fw_output_array.mutable_shape();
+  if (op->merge_outputs) {
+    fw_output_shape->ReplaceDims({timestamp, batch_size, 2 * output_size});
+  } else {
+    fw_output_shape->ReplaceDims({timestamp, batch_size, output_size});
+    Shape* bw_output_shape = bw_output_array.mutable_shape();
+    bw_output_shape->ReplaceDims({timestamp, batch_size, output_size});
+  }
+}
+
 void ProcessSpaceToBatchNDOperator(Model* model, SpaceToBatchNDOperator* op) {
   const auto& input_array = model->GetArray(op->inputs[0]);
   // Yield until input dims have been resolved.
@@ -1252,6 +1400,38 @@ void ProcessGatherOperator(Model* model, GatherOperator* op) {
   }
 }
 
+void ProcessGatherNdOperator(Model* model, GatherNdOperator* op) {
+  const auto& input_array = model->GetArray(op->inputs[0]);
+  const auto& indices_array = model->GetArray(op->inputs[1]);
+  auto& output_array = model->GetArray(op->outputs[0]);
+
+  // Bail if we already know the output shape.
+  if (output_array.has_shape()) {
+    return;
+  }
+
+  // Yield until input dims have been resolved.
+  if (!input_array.has_shape() || !indices_array.has_shape()) {
+    return;
+  }
+
+  const auto& input_shape = input_array.shape();
+  const auto& indices_shape = indices_array.shape();
+  QCHECK_GE(input_shape.dimensions_count(), 1);
+  QCHECK_GE(indices_shape.dimensions_count(), 1);
+  const int indices_nd =
+      indices_shape.dims(indices_shape.dimensions_count() - 1);
+  QCHECK_LE(indices_nd, input_shape.dimensions_count());
+
+  auto output_dims = output_array.mutable_shape()->mutable_dims();
+  for (int dim = 0; dim < indices_shape.dimensions_count() - 1; ++dim) {
+    output_dims->push_back(indices_shape.dims(dim));
+  }
+  for (int dim = indices_nd; dim < input_shape.dimensions_count(); ++dim) {
+    output_dims->push_back(input_shape.dims(dim));
+  }
+}
+
 void ProcessTopkV2Operator(Model* model, TopKV2Operator* op) {
   const auto& input_values = model->GetArray(op->inputs[0]);
   const auto& input_k = model->GetArray(op->inputs[1]);
@@ -1337,7 +1517,7 @@ void ProcessPadV2Operator(Model* model, PadV2Operator* op) {
   output_array.copy_shape(output_shape);
 }
 
-void ProcessRankOperator(Model* model, RankOperator* op) {
+void ProcessRankOperator(Model* model, TensorFlowRankOperator* op) {
   CHECK_GE(op->inputs.size(), 1);
   CHECK_EQ(op->outputs.size(), 1);
   auto& output_array = model->GetArray(op->outputs[0]);
@@ -1616,14 +1796,37 @@ void ProcessArgMinMaxOperator(Model* model, Op* op) {
     return;
   }
 
+  const Array& axis_array = model->GetArray(op->inputs[1]);
+  // Yield until input axis array shape has been resolved.
+  if (!axis_array.has_shape()) {
+    return;
+  }
+
   const std::vector<int>& input_dims = input_array.shape().dims();
+
+  CHECK(axis_array.data_type == ArrayDataType::kInt32 ||
+        axis_array.data_type == ArrayDataType::kInt64)
+      << "axis_array must be int32, int64";
+
+  CHECK_EQ(RequiredBufferSizeForShape(axis_array.shape()), 1)
+      << "Axis array must be scalar.";
+
+  int64 axis;
+  if (axis_array.data_type == ArrayDataType::kInt32) {
+    axis = axis_array.GetBuffer<ArrayDataType::kInt32>().data[0];
+  } else {
+    axis = axis_array.GetBuffer<ArrayDataType::kInt64>().data[0];
+  }
+
   std::vector<int> output_dims;
 
-  output_dims.reserve(input_dims.size());
-  for (int i = 0; i < input_dims.size() - 1; ++i) {
-    output_dims.push_back(input_dims[i]);
+  output_dims.reserve(input_dims.size() - 1);
+  for (int i = 0; i < input_dims.size(); ++i) {
+    if (i != axis) {
+      output_dims.push_back(input_dims[i]);
+    }
   }
-  output_dims.push_back(1);
+
   const string& output_name = op->outputs[0];
   auto& output_array = model->GetArray(output_name);
   if (output_array.has_shape()) {
@@ -1828,6 +2031,20 @@ void ProcessMirrorPadOperator(Model* model, MirrorPadOperator* op) {
   output_array.copy_shape(output_shape);
 }
 
+void ProcessUniqueOperator(Model* model, UniqueOperator* op) {
+  const auto& input_array = model->GetArray(op->inputs[0]);
+  // We have 2 outputs, the shape of the index tensor, is the same size
+  // as the input array. The unique values tensor, is unknown until runtime.
+  CHECK_EQ(op->outputs.size(), 2);
+  auto& idx_output_array = model->GetArray(op->outputs[1]);
+
+  // Yield until input dims have been resolved, or output already computed
+  if (!input_array.has_shape() || idx_output_array.has_shape()) {
+    return;
+  }
+  idx_output_array.copy_shape(input_array.shape());
+}
+
 }  // namespace
 
 ::tensorflow::Status PropagateFixedSizes::Run(Model* model,
@@ -1848,6 +2065,7 @@ void ProcessMirrorPadOperator(Model* model, MirrorPadOperator* op) {
     case OperatorType::kBatchNormalization:
     case OperatorType::kL2Normalization:
     case OperatorType::kDequantize:
+    case OperatorType::kElu:
     case OperatorType::kRelu:
     case OperatorType::kRelu1:
     case OperatorType::kRelu6:
@@ -1869,17 +2087,23 @@ void ProcessMirrorPadOperator(Model* model, MirrorPadOperator* op) {
     case OperatorType::kAssert:
     case OperatorType::kCast:
     case OperatorType::kFloor:
+    case OperatorType::kCeil:
     case OperatorType::kExp:
     case OperatorType::kSin:
+    case OperatorType::kCos:
     case OperatorType::kLogicalAnd:
     case OperatorType::kLogicalNot:
     case OperatorType::kLogicalOr:
     case OperatorType::kZerosLike:
+    case OperatorType::kReverseV2:
       ProcessSimpleOperator(model, op, 0);
       break;
     case OperatorType::kGather:
       ProcessGatherOperator(model, static_cast<GatherOperator*>(op));
       break;
+    case OperatorType::kGatherNd:
+      ProcessGatherNdOperator(model, static_cast<GatherNdOperator*>(op));
+      break;
     case OperatorType::kTopK_V2:
       ProcessTopkV2Operator(model, static_cast<TopKV2Operator*>(op));
       break;
@@ -1996,7 +2220,7 @@ void ProcessMirrorPadOperator(Model* model, MirrorPadOperator* op) {
       ProcessRangeOperator(model, static_cast<RangeOperator*>(op));
       break;
     case OperatorType::kRank:
-      ProcessRankOperator(model, static_cast<RankOperator*>(op));
+      ProcessRankOperator(model, static_cast<TensorFlowRankOperator*>(op));
       break;
     case OperatorType::kShape:
       ProcessShapeOperator(model, static_cast<TensorFlowShapeOperator*>(op));
@@ -2023,6 +2247,18 @@ void ProcessMirrorPadOperator(Model* model, MirrorPadOperator* op) {
       ProcessUnidirectionalSequenceLstmOperator(
           model, static_cast<UnidirectionalSequenceLstmOperator*>(op));
       break;
+    case OperatorType::kUnidirectionalSequenceRnn:
+      ProcessUnidirectionalSequenceRnnOperator(
+          model, static_cast<UnidirectionalSequenceRnnOperator*>(op));
+      break;
+    case OperatorType::kBidirectionalSequenceLstm:
+      ProcessBidirectionalSequenceLstmOperator(
+          model, static_cast<BidirectionalSequenceLstmOperator*>(op));
+      break;
+    case OperatorType::kBidirectionalSequenceRnn:
+      ProcessBidirectionalSequenceRnnOperator(
+          model, static_cast<BidirectionalSequenceRnnOperator*>(op));
+      break;
     case OperatorType::kLstmCell:
       ProcessLstmCellOperator(model, static_cast<LstmCellOperator*>(op));
       break;
@@ -2103,6 +2339,14 @@ void ProcessMirrorPadOperator(Model* model, MirrorPadOperator* op) {
     case OperatorType::kMirrorPad:
       ProcessMirrorPadOperator(model, static_cast<MirrorPadOperator*>(op));
       break;
+    case OperatorType::kUnique:
+      ProcessUniqueOperator(model, static_cast<UniqueOperator*>(op));
+      break;
+    case OperatorType::kWhere:
+      // The size of the output can only be known after evaluating the cond
+      // tensor. Ignore shape propagation here and defer that to the
+      // interpreter.
+      break;
     default:
       // Unimplemented, another graph transformation should drop it.
       LOG(FATAL) << "Unhandled operator type " << OperatorTypeName(op->type);
diff --git a/tensorflow/lite/toco/graph_transformations/quantize.cc b/tensorflow/lite/toco/graph_transformations/quantize.cc
index 1146078c301fd1b880c99da23e5be8223efe31e3..c7836f61ac30c2ba3784511193b0ed0217f64fcb 100644
--- a/tensorflow/lite/toco/graph_transformations/quantize.cc
+++ b/tensorflow/lite/toco/graph_transformations/quantize.cc
@@ -64,8 +64,10 @@ bool SupportsQuantization(const Operator& op) {
          type == OperatorType::kRelu1 || type == OperatorType::kRelu6 ||
          type == OperatorType::kShape || type == OperatorType::kExpandDims ||
          type == OperatorType::kPack || type == OperatorType::kTopK_V2 ||
+         type == OperatorType::kRandomUniform ||
          type == OperatorType::kResizeNearestNeighbor ||
-         type == OperatorType::kPRelu;
+         type == OperatorType::kPRelu || type == OperatorType::kReduceMax ||
+         type == OperatorType::kReduceMin;
 }
 
 // The quantized op allows output arrays of type float using
@@ -487,20 +489,20 @@ void FixMinMaxPostQuantization(GraphTransformation* transformation,
     }
   }
   if (!SupportsQuantization(op)) {
-    LOG(FATAL) << "Unimplemented: this graph contains an operator of type "
-               << HelpfulOperatorTypeName(op)
-               << " for which the quantized form is not yet implemented. "
-                  "Sorry, and patches welcome (that's a relatively fun patch "
-                  "to write, mostly providing the actual quantized arithmetic "
-                  "code for this op).";
+    return tensorflow::errors::InvalidArgument(
+        "Unimplemented: this graph contains an operator of type ",
+        HelpfulOperatorTypeName(op),
+        " for which the quantized form is not yet implemented. Sorry, and "
+        "patches welcome (that's a relatively fun patch to write, mostly "
+        "providing the actual quantized arithmetic code for this op).");
   }
 
   for (const auto& input : op.inputs) {
     const auto& array = model->GetArray(input);
     if (array.data_type == ArrayDataType::kFloat) {
       if (!array.minmax && !array.buffer) {
-        LOG(ERROR) << "Can't quantize input array " << input
-                   << " because it lacks min/max info";
+        LOG(WARNING) << "Can't quantize input array " << input
+                     << " because it lacks min/max info";
         return ::tensorflow::Status::OK();
       }
       const auto* other_op = GetOpWithOutput(*model, input);
diff --git a/tensorflow/lite/toco/graph_transformations/reorder_elementwise_unary.cc b/tensorflow/lite/toco/graph_transformations/reorder_elementwise_unary.cc
index 6a4b9198548956217d24693bceff2bd6b3b8f0a6..98105d384e176573b248ffc3fd75710768002750 100644
--- a/tensorflow/lite/toco/graph_transformations/reorder_elementwise_unary.cc
+++ b/tensorflow/lite/toco/graph_transformations/reorder_elementwise_unary.cc
@@ -30,6 +30,7 @@ namespace {
 bool IsElementwiseOperator(OperatorType optype) {
   switch (optype) {
     case OperatorType::kCast:
+    case OperatorType::kCeil:
     case OperatorType::kExp:
     case OperatorType::kFloor:
     case OperatorType::kNeg:
diff --git a/tensorflow/lite/toco/graph_transformations/reorder_reshape_transpose.cc b/tensorflow/lite/toco/graph_transformations/reorder_reshape_transpose.cc
index fdd411c84c2678bc483b00849d5142665e706fac..77803d580e98aea94f0a7191666212cb15f58a7a 100644
--- a/tensorflow/lite/toco/graph_transformations/reorder_reshape_transpose.cc
+++ b/tensorflow/lite/toco/graph_transformations/reorder_reshape_transpose.cc
@@ -218,6 +218,7 @@ std::vector<int> ComputeNewPerm(std::vector<int> input_dims,
   CHECK_EQ(input_dims.size(), new_perm.size());
 
   auto& transpose_array = model->GetOrCreateArray(transpose_op->inputs[1]);
+  transpose_array.data_type = ArrayDataType::kInt32;
   transpose_array.GetMutableBuffer<ArrayDataType::kInt32>().data = new_perm;
   *(transpose_array.mutable_shape()->mutable_dims()) = {
       static_cast<int>(new_perm.size())};
diff --git a/tensorflow/lite/toco/graph_transformations/resolve_constant_strided_slice.cc b/tensorflow/lite/toco/graph_transformations/resolve_constant_strided_slice.cc
index 0c9effee1fd364fa83f61339251e48070f503d1e..1c8f4619bc40d976717bba16ee3857531cac76e0 100644
--- a/tensorflow/lite/toco/graph_transformations/resolve_constant_strided_slice.cc
+++ b/tensorflow/lite/toco/graph_transformations/resolve_constant_strided_slice.cc
@@ -71,34 +71,29 @@ void StridedSlice(StridedSliceOperator const& op, Array const& input_array,
   // Each "digit" is incremented individually (by the stride). When it overflows
   // (becomes greater than the stop), that digit is reset and a carry flag is
   // used to increment the next digit.
-  int dst_offset = 0;
-  do {
+  for (int dst_offset = 0; dst_offset < output_data.size(); ++dst_offset) {
     // Copy element.
     output_data[dst_offset] = input_buffer.data[Offset(input_shape, src_coord)];
 
-    // Compute next source input coordinates.
-    bool carry = true;
-    for (int axis = 0; axis < num_input_axes; axis++) {
+    // Note we consider elements in the highest dimension are stored
+    // contiguously. So, we increment the stride starting from the highest
+    // dimension.
+    for (int axis = num_input_axes - 1; axis >= 0; --axis) {
       int stride = op.strides[axis];
-      // Increment this axis if we carried from the previous one
-      if (carry) {
-        src_coord[axis] += stride;
-      }
+      src_coord[axis] += stride;
 
-      // Check if we've overflowed.
+      // Check if we've overflowed. If not, we just break from the loop to
+      // continue w/ the element copy. Otherwise, reset the starting coordinate
+      // for this axis and move to the next lower axis.
       int stop = stop_for_axis[axis];
-      if (tflite::strided_slice::LoopCondition(src_coord[axis], stop, stride)) {
-        // Reset axis and set carry
-        src_coord[axis] = tflite::strided_slice::StartForAxis(
-            strided_slice_params, ToRuntimeShape(input_shape), axis);
-        carry = true;
-      } else {
-        carry = false;
+      if (!tflite::strided_slice::LoopCondition(src_coord[axis], stop,
+                                                stride)) {
+        break;
       }
+      src_coord[axis] = tflite::strided_slice::StartForAxis(
+          strided_slice_params, ToRuntimeShape(input_shape), axis);
     }
-    // increment destination buffer offset
-    dst_offset++;
-  } while (dst_offset < output_data.size());
+  }
 }
 
 }  // anonymous namespace
diff --git a/tensorflow/lite/toco/graph_transformations/resolve_fake_quant_args_from_vars.cc b/tensorflow/lite/toco/graph_transformations/resolve_fake_quant_args_from_vars.cc
index c0becaf7d39cdbc01217bbb9b5a6b50017cc2eaa..2c860c30974766a093ef1bf2d9a93fb29bb65949 100644
--- a/tensorflow/lite/toco/graph_transformations/resolve_fake_quant_args_from_vars.cc
+++ b/tensorflow/lite/toco/graph_transformations/resolve_fake_quant_args_from_vars.cc
@@ -61,11 +61,11 @@ namespace toco {
   minmax.max = max_array.GetBuffer<ArrayDataType::kFloat>().data[0];
   // We always want [min, max] to contain 0.
   if (minmax.min > 0 || minmax.max < 0) {
-    LOG(ERROR) << "For " << LogName(*fakequant_op) << " the MinMax range "
-               << "[" << minmax.min << ", " << minmax.max
-               << "] does not contain 0. "
-               << "Proceeding by tweaking it to contain 0, which will result "
-                  "in poor accuracy.";
+    LOG(WARNING) << "For " << LogName(*fakequant_op) << " the MinMax range "
+                 << "[" << minmax.min << ", " << minmax.max
+                 << "] does not contain 0. "
+                 << "Proceeding by tweaking it to contain 0, which will result "
+                    "in poor accuracy.";
   }
   minmax.min = std::min(minmax.min, 0.);
   minmax.max = std::max(minmax.max, 0.);
diff --git a/tensorflow/lite/toco/graph_transformations/tests/BUILD b/tensorflow/lite/toco/graph_transformations/tests/BUILD
index bbbedbe3a93065e3a7007073aad7f6e7600e2651..03d331226d885e86bf47d219691591a5a8c53d7a 100644
--- a/tensorflow/lite/toco/graph_transformations/tests/BUILD
+++ b/tensorflow/lite/toco/graph_transformations/tests/BUILD
@@ -40,3 +40,15 @@ tf_cc_test(
         "@com_google_googletest//:gtest_main",
     ],
 )
+
+tf_cc_test(
+    name = "fuse_binary_into_following_affine_test",
+    srcs = ["fuse_binary_into_following_affine_test.cc"],
+    deps = [
+        "//tensorflow/lite/toco:graph_transformations",
+        "//tensorflow/lite/toco:model",
+        "//tensorflow/lite/toco:tooling_util",
+        "@com_google_absl//absl/memory",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
diff --git a/tensorflow/lite/toco/graph_transformations/tests/fuse_binary_into_following_affine_test.cc b/tensorflow/lite/toco/graph_transformations/tests/fuse_binary_into_following_affine_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2cba6824cfbe55f05b92f70cc45fc87b58d56559
--- /dev/null
+++ b/tensorflow/lite/toco/graph_transformations/tests/fuse_binary_into_following_affine_test.cc
@@ -0,0 +1,148 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/tooling_util.h"
+
+namespace toco {
+
+namespace {
+// A gmock matcher that check that elements of a float vector match to a given
+// tolerance.
+std::vector<testing::Matcher<float>> ArrayFloatNear(
+    const std::vector<float>& values, float max_abs_error = 1e-5) {
+  std::vector<testing::Matcher<float>> matchers;
+  matchers.reserve(values.size());
+  for (const float& v : values) {
+    matchers.emplace_back(testing::FloatNear(v, max_abs_error));
+  }
+  return matchers;
+}
+}  // namespace
+
+class FuseBinaryIntoFollowingAffineTest : public ::testing::Test {
+ protected:
+  FuseBinaryIntoFollowingAffineTest() {}
+
+  void SetUp() override { model_.reset(new Model); }
+
+  void CreateArray(const string& name, const std::vector<int>& shape) {
+    Array& array = model_->GetOrCreateArray(name);
+    array.data_type = ArrayDataType::kFloat;
+    Shape* array_shape = array.mutable_shape();
+    *(array_shape->mutable_dims()) = shape;
+  }
+
+  void CreateConstantArray(const string& name, const std::vector<int>& shape,
+                           const std::vector<float>& data) {
+    CreateArray(name, shape);
+    Array& array = model_->GetOrCreateArray(name);
+    auto& array_buffer = array.GetMutableBuffer<ArrayDataType::kFloat>();
+    int bufsize = 1;
+    for (int dim : shape) {
+      bufsize *= dim;
+    }
+    array_buffer.data.resize(bufsize);
+    float* buf_ptr = array_buffer.data.data();
+    for (int i = 0; i < bufsize; ++i) {
+      buf_ptr[i] = data[i];
+    }
+  }
+
+  std::unique_ptr<Model> model_;
+};
+
+TEST_F(FuseBinaryIntoFollowingAffineTest, FuseMulIntoFullyConnected) {
+  // Creating a model.
+  {
+    CreateArray("Input", {2, 2});
+    CreateConstantArray("MulInput2", {1}, {2.0});
+    CreateArray("MulOutput", {2, 2});
+    CreateConstantArray("FCWeight", {2, 2}, {1.0, 2.0, 3.0, 4.0});
+    CreateConstantArray("FCBias", {1}, {1.0});
+    CreateArray("Output", {2, 2});
+
+    auto* mul_op = new MulOperator;
+    mul_op->inputs = {"Input", "MulInput2"};
+    mul_op->outputs = {"MulOutput"};
+    model_->operators.push_back(std::unique_ptr<Operator>(mul_op));
+
+    auto* fc_op = new FullyConnectedOperator;
+    fc_op->inputs = {"MulOutput", "FCWeight", "FCBias"};
+    fc_op->outputs = {"Output"};
+    model_->operators.push_back(std::unique_ptr<Operator>(fc_op));
+  }
+  toco::FuseBinaryIntoFollowingAffine transformation;
+  bool modified;
+  ASSERT_TRUE(transformation.Run(model_.get(), /*op_index=*/0, &modified).ok());
+  EXPECT_TRUE(modified);
+
+  // `Mul` should be fused into `FullyConnected`. Only 1 op is left.
+  ASSERT_EQ(model_->operators.size(), 1);
+  const auto& op = model_->operators[0];
+  ASSERT_EQ(op->type, OperatorType::kFullyConnected);
+  ASSERT_EQ(op->inputs.size(), 3);
+
+  auto& weights_array = model_->GetArray(op->inputs[1]);
+  EXPECT_THAT(weights_array.GetBuffer<toco::ArrayDataType::kFloat>().data,
+              ElementsAreArray(ArrayFloatNear({2.0, 4.0, 6.0, 8.0})));
+
+  auto& bias_array = model_->GetArray(op->inputs[2]);
+  EXPECT_THAT(bias_array.GetBuffer<toco::ArrayDataType::kFloat>().data,
+              ElementsAreArray(ArrayFloatNear({1.0})));
+}
+
+// This is a regression test of b/121287325. Toco crashes before the fix.
+TEST_F(FuseBinaryIntoFollowingAffineTest, DoNotFuseWithMultipleConsumers) {
+  // Creating a model.
+  {
+    CreateArray("Input", {2, 2});
+    CreateConstantArray("MulInput2", {1}, {2.0});
+    CreateArray("MulOutput", {2, 2});
+    CreateConstantArray("FCWeight", {2, 2}, {1.0, 2.0, 3.0, 4.0});
+    CreateConstantArray("FCBias", {1}, {1.0});
+    CreateArray("Output", {2, 2});
+    CreateArray("AnotherOutput", {2, 2});
+
+    auto* mul_op = new MulOperator;
+    mul_op->inputs = {"Input", "MulInput2"};
+    mul_op->outputs = {"MulOutput"};
+    model_->operators.push_back(std::unique_ptr<Operator>(mul_op));
+
+    auto* fc_op = new FullyConnectedOperator;
+    fc_op->inputs = {"MulOutput", "FCWeight", "FCBias"};
+    fc_op->outputs = {"Output"};
+    model_->operators.push_back(std::unique_ptr<Operator>(fc_op));
+
+    auto identity_op = new TensorFlowIdentityOperator;
+    identity_op->inputs = {"MulOutput"};
+    identity_op->outputs = {"AnotherOutput"};
+    model_->operators.push_back(std::unique_ptr<Operator>(identity_op));
+  }
+
+  toco::FuseBinaryIntoFollowingAffine transformation;
+  bool modified;
+  ASSERT_TRUE(transformation.Run(model_.get(), /*op_index=*/0, &modified).ok());
+  // Do not modify the graph if the binary operator has another output.
+  EXPECT_FALSE(modified);
+  EXPECT_EQ(model_->operators.size(), 3);
+}
+
+}  // namespace toco
diff --git a/tensorflow/lite/toco/graph_transformations/unroll_batch_matmul.cc b/tensorflow/lite/toco/graph_transformations/unroll_batch_matmul.cc
index 41a735394d714b65a4c9fc309927e34a7f610431..7492f3e116c60ca2c574bf8d2fd4b08f5914f3d0 100644
--- a/tensorflow/lite/toco/graph_transformations/unroll_batch_matmul.cc
+++ b/tensorflow/lite/toco/graph_transformations/unroll_batch_matmul.cc
@@ -13,17 +13,192 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <memory>
+#include <numeric>
 #include <string>
 #include <unordered_map>
 #include <vector>
 
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
+#include "tensorflow/core/platform/logging.h"
 #include "tensorflow/lite/toco/graph_transformations/graph_transformations.h"
 #include "tensorflow/lite/toco/model.h"
 #include "tensorflow/lite/toco/tooling_util.h"
-#include "tensorflow/core/platform/logging.h"
 
 namespace toco {
 
+namespace {
+
+void UnrollBatchMatMul3D(
+    const string& input_lhs, const string& input_rhs,
+    const BatchMatMulOperator* batch_op, const std::vector<int> batch,
+    Model* model, std::vector<std::unique_ptr<Operator>>::iterator* tail_it,
+    std::vector<string>* pack_inputs) {
+  const std::string batch_name =
+      absl::StrCat(batch_op->outputs[0], "_b", absl::StrJoin(batch, "-"));
+  const auto& input_array_a = model->GetArray(input_lhs);
+  const auto& input_array_b = model->GetArray(input_rhs);
+  const int dims_count = input_array_a.shape().dimensions_count();
+
+  // tf.slice(a, ...).
+  std::vector<int> begin_indices_a = batch;
+  begin_indices_a.resize(dims_count);
+  std::vector<int> slice_size_a = input_array_a.shape().dims();
+  for (int i = 0; i < batch.size(); ++i) {
+    slice_size_a[i] = 1;
+  }
+  auto* slice_a_op = new SliceOperator;
+  slice_a_op->inputs = {
+      input_lhs,
+      CreateInt32Array(model, batch_name + "/slice_a/slice/begin",
+                       begin_indices_a),
+      CreateInt32Array(model, batch_name + "/slice_a/slice/size", slice_size_a),
+  };
+  slice_a_op->outputs = {AvailableArrayName(*model, batch_name + "/slice_a")};
+  auto& slice_a_op_output = model->GetOrCreateArray(slice_a_op->outputs[0]);
+  slice_a_op_output.data_type = input_array_a.data_type;
+  *tail_it = model->operators.emplace(*tail_it, slice_a_op) + 1;
+
+  // Reshape to remove the first dimension ([1,M,N] -> [M,N]).
+  auto* slice_a_reshape_op = new TensorFlowReshapeOperator;
+  slice_a_reshape_op->inputs = {
+      slice_a_op->outputs[0],
+      CreateInt32Array(model, batch_name + "/slice_a/reshape/shape",
+                       {-1, input_array_a.shape().dims(dims_count - 1)})};
+  slice_a_reshape_op->outputs = {
+      AvailableArrayName(*model, batch_name + "/slice_a/reshape")};
+  auto& slice_a_reshape_op_output =
+      model->GetOrCreateArray(slice_a_reshape_op->outputs[0]);
+  slice_a_reshape_op_output.data_type = input_array_a.data_type;
+  *tail_it = model->operators.emplace(*tail_it, slice_a_reshape_op) + 1;
+
+  // tf.slice(b, ...).
+  std::vector<int> begin_indices_b = batch;
+  begin_indices_b.resize(dims_count);
+  std::vector<int> slice_size_b = input_array_b.shape().dims();
+  for (int i = 0; i < batch.size(); ++i) {
+    slice_size_b[i] = 1;
+  }
+  auto* slice_b_op = new SliceOperator;
+  slice_b_op->inputs = {
+      input_rhs,
+      CreateInt32Array(model, batch_name + "/slice_b/slice/begin",
+                       begin_indices_b),
+      CreateInt32Array(model, batch_name + "/slice_b/slice/size", slice_size_b),
+  };
+  slice_b_op->outputs = {AvailableArrayName(*model, batch_name + "/slice_b")};
+  auto& slice_b_op_output = model->GetOrCreateArray(slice_b_op->outputs[0]);
+  slice_b_op_output.data_type = input_array_b.data_type;
+  *tail_it = model->operators.emplace(*tail_it, slice_b_op) + 1;
+
+  // Reshape to remove the first dimension ([1,M,N] -> [M,N]).
+  auto* slice_b_reshape_op = new TensorFlowReshapeOperator;
+  slice_b_reshape_op->inputs = {
+      slice_b_op->outputs[0],
+      CreateInt32Array(model, batch_name + "/slice_b/reshape/shape",
+                       {-1, input_array_b.shape().dims(dims_count - 1)})};
+  slice_b_reshape_op->outputs = {
+      AvailableArrayName(*model, batch_name + "/slice_b/reshape")};
+  auto& slice_b_reshape_op_output =
+      model->GetOrCreateArray(slice_b_reshape_op->outputs[0]);
+  slice_b_reshape_op_output.data_type = input_array_b.data_type;
+  *tail_it = model->operators.emplace(*tail_it, slice_b_reshape_op) + 1;
+
+  // tf.matmul(slice_a, slice_b).
+  auto* matmul_op = new TensorFlowMatMulOperator;
+  matmul_op->inputs = {slice_a_reshape_op->outputs[0],
+                       slice_b_reshape_op->outputs[0]};
+  matmul_op->outputs = {AvailableArrayName(*model, batch_name)};
+  auto& matmul_op_output = model->GetOrCreateArray(matmul_op->outputs[0]);
+  matmul_op_output.data_type = input_array_a.data_type;
+  *tail_it = model->operators.emplace(*tail_it, matmul_op) + 1;
+
+  // Add to stack.
+  pack_inputs->push_back(matmul_op->outputs[0]);
+}
+
+std::vector<string> UnrollBatchMatMulRecursion(
+    const string& input_lhs, const string& input_rhs,
+    const BatchMatMulOperator* batch_op, Model* model,
+    std::vector<std::unique_ptr<Operator>>::iterator* tail_it,
+    const std::vector<int>& batch_prefix) {
+  const auto& input_array_a = model->GetArray(input_lhs);
+  const auto& dims_vec = input_array_a.shape().dims();
+  const int current_dim_size = dims_vec[batch_prefix.size()];
+  std::vector<string> batch_pack_inputs;
+
+  if (batch_prefix.size() + 3 == dims_vec.size()) {
+    // Base case
+    for (int batch = 0; batch < current_dim_size; ++batch) {
+      std::vector<int> new_batch_prefix = batch_prefix;
+      new_batch_prefix.emplace_back(batch);
+      UnrollBatchMatMul3D(input_lhs, input_rhs, batch_op, new_batch_prefix,
+                          model, tail_it, &batch_pack_inputs);
+    }
+  } else {
+    // Recursion
+    for (int batch = 0; batch < current_dim_size; ++batch) {
+      std::vector<int> new_batch_prefix = batch_prefix;
+      new_batch_prefix.emplace_back(batch);
+      std::vector<string> pack_inputs = UnrollBatchMatMulRecursion(
+          input_lhs, input_rhs, batch_op, model, tail_it, new_batch_prefix);
+
+      // The pack that will join all the individual matmul results together.
+      auto* pack_op = new PackOperator;
+      std::string batch_name = absl::StrCat(
+          batch_op->outputs[0], "_b", absl::StrJoin(new_batch_prefix, "-"));
+      pack_op->inputs = pack_inputs;
+      pack_op->outputs = {AvailableArrayName(*model, batch_name + "/pack")};
+      auto& pack_op_output = model->GetOrCreateArray(pack_op->outputs[0]);
+      pack_op_output.data_type = input_array_a.data_type;
+      pack_op->axis = 0;
+      pack_op->values_count = pack_inputs.size();
+      *tail_it = model->operators.emplace(*tail_it, pack_op) + 1;
+
+      batch_pack_inputs.push_back(pack_op->outputs[0]);
+    }
+  }
+  return batch_pack_inputs;
+}
+
+std::vector<int32> GetTransposePerm(const Array& input_array) {
+  const int32 dims = input_array.shape().dimensions_count();
+  std::vector<int32> perm_array_val(dims);
+  for (int i = 0; i < dims; ++i) {
+    perm_array_val[i] = i;
+  }
+  perm_array_val[dims - 2] = dims - 1;
+  perm_array_val[dims - 1] = dims - 2;
+  return perm_array_val;
+}
+
+std::vector<int32> GetTransposeShape(const Shape& input_shape,
+                                     const std::vector<int32>& perm_array_val) {
+  const int32 dims = input_shape.dimensions_count();
+  std::vector<int32> output_shape(dims);
+  for (int i = 0; i < dims; ++i) {
+    output_shape[i] = input_shape.dims(perm_array_val[i]);
+  }
+  return output_shape;
+}
+
+TransposeOperator* TransposeInput(const string& input, Model* model) {
+  const auto& input_array = model->GetArray(input);
+  const auto perm_array = GetTransposePerm(input_array);
+  const string perm_array_name = CreateInt32Array(
+      model, AvailableArrayName(*model, input + "/transpose/perm"), perm_array);
+  auto* transpose_op = new TransposeOperator;
+  transpose_op->inputs = {input, perm_array_name};
+  transpose_op->outputs = {AvailableArrayName(*model, input + "/transpose")};
+  auto& transpose_array = model->GetOrCreateArray(transpose_op->outputs[0]);
+  *transpose_array.mutable_shape()->mutable_dims() =
+      GetTransposeShape(input_array.shape(), perm_array);
+  model->GetOrCreateArray(transpose_op->outputs[0]);
+  return transpose_op;
+}
+
+}  // namespace
+
 // Unrolls a BatchMatMul on the batch dimension.
 // We need to slice each batch out of the inputs, matmul them individually, then
 // stack them all back together at the end.
@@ -46,115 +221,67 @@ namespace toco {
   const auto* batch_op =
       static_cast<const BatchMatMulOperator*>(batch_op_it->get());
 
-  // We must have the shape of at least one input to know our batch size.
-  const auto& input_array_a = model->GetArray(batch_op->inputs[0]);
-  const auto& input_array_b = model->GetArray(batch_op->inputs[1]);
-  if (!input_array_a.has_shape() || !input_array_b.has_shape())
+  auto& tail_it = batch_op_it;
+
+  string input_lhs = batch_op->inputs[0];
+  string input_rhs = batch_op->inputs[1];
+  const auto& input_lhs_array = model->GetArray(input_lhs);
+  const auto& input_rhs_array = model->GetArray(input_rhs);
+  if (!input_lhs_array.has_shape() || !input_rhs_array.has_shape())
     return ::tensorflow::Status::OK();
 
-  // We only support the rank 3 case. If you are batching on rank > 3 you'll
-  // have to figure that out.
-  CHECK_EQ(input_array_a.shape().dimensions_count(),
-           input_array_b.shape().dimensions_count())
-      << "Input dimensions must have the same rank";
-  if (input_array_a.shape().dimensions_count() == 2) {
+  // Transpose LHS input if necessary.
+  if (batch_op->adj_x) {
+    TransposeOperator* transpose_op = TransposeInput(input_lhs, model);
+    tail_it = model->operators.emplace(tail_it, transpose_op) + 1;
+    input_lhs = transpose_op->outputs[0];
+  }
+  const auto& input_array_a = model->GetArray(input_lhs);
+
+  // Transpose RHS input if necessary.
+  if (batch_op->adj_y) {
+    TransposeOperator* transpose_op = TransposeInput(input_rhs, model);
+    tail_it = model->operators.emplace(tail_it, transpose_op) + 1;
+    input_rhs = transpose_op->outputs[0];
+  }
+  const auto& input_array_b = model->GetArray(input_rhs);
+
+  const int dims = input_array_a.shape().dimensions_count();
+  for (int i = 0; i < dims - 2; ++i) {
+    CHECK_EQ(input_array_a.shape().dims(i), input_array_b.shape().dims(i))
+        << "input array not consistent at index " << i;
+  }
+  CHECK_EQ(input_array_a.shape().dims(dims - 1),
+           input_array_b.shape().dims(dims - 2))
+      << "Input dimensions must be compatible for multipication. shape a = ["
+      << absl::StrJoin(input_array_a.shape().dims(), ", ") << "], shape b = ["
+      << absl::StrJoin(input_array_b.shape().dims(), ", ") << "]";
+
+  if (dims == 2) {
     // This is really just a MatMul. This likely means that someone hand-crafted
     // a graphdef with a BatchMatMul when they really wanted a MatMul.
     AddMessageF("Replacing non-batch BatchMatMul %s by a MatMul operator",
                 LogName(*batch_op));
     auto* matmul_op = new TensorFlowMatMulOperator;
-    matmul_op->inputs = batch_op->inputs;
+    matmul_op->inputs = {input_lhs, input_rhs};
     matmul_op->outputs = batch_op->outputs;
-    const auto matmul_op_it = model->operators.emplace(batch_op_it, matmul_op);
-    batch_op_it = matmul_op_it + 1;
-    CHECK_EQ(batch_op_it->get(), batch_op);
-    model->operators.erase(batch_op_it);
+    tail_it = model->operators.emplace(tail_it, matmul_op) + 1;
+    CHECK_EQ(tail_it->get(), batch_op);
+    model->operators.erase(tail_it);
     *modified = true;
     return ::tensorflow::Status::OK();
   }
-  CHECK_EQ(input_array_a.shape().dimensions_count(), 3)
-      << "Input arrays must have rank 3";
 
-  // Perform the matmul for each slice of the batch.
-  int batch_count = input_array_a.shape().dims(0);
-  AddMessageF("Unrolling BatchMatMul %s %d times", LogName(*batch_op),
-              batch_count);
-  auto tail_it = batch_op_it;
-  std::vector<string> pack_inputs;
-  for (int batch = 0; batch < batch_count; ++batch) {
-    std::string batch_name =
-        std::string(batch_op->outputs[0]) + "_b" + std::to_string(batch);
-
-    // tf.slice(a, ...).
-    auto* slice_a_op = new SliceOperator;
-    slice_a_op->inputs = {
-        batch_op->inputs[0],
-        CreateInt32Array(model, batch_name + "/slice_a/slice/begin",
-                         {batch, 0, 0}),
-        CreateInt32Array(
-            model, batch_name + "/slice_a/slice/size",
-            {1, input_array_a.shape().dims(1), input_array_a.shape().dims(2)}),
-    };
-    slice_a_op->outputs = {AvailableArrayName(*model, batch_name + "/slice_a")};
-    auto& slice_a_op_output = model->GetOrCreateArray(slice_a_op->outputs[0]);
-    slice_a_op_output.data_type = input_array_a.data_type;
-    tail_it = model->operators.emplace(tail_it, slice_a_op) + 1;
-
-    // Reshape to remove the first dimension ([1,M,N] -> [M,N]).
-    auto* slice_a_reshape_op = new TensorFlowReshapeOperator;
-    slice_a_reshape_op->inputs = {
-        slice_a_op->outputs[0],
-        CreateInt32Array(model, batch_name + "/slice_a/reshape/shape",
-                         {-1, input_array_a.shape().dims(2)})};
-    slice_a_reshape_op->outputs = {
-        AvailableArrayName(*model, batch_name + "/slice_a/reshape")};
-    auto& slice_a_reshape_op_output =
-        model->GetOrCreateArray(slice_a_reshape_op->outputs[0]);
-    slice_a_reshape_op_output.data_type = input_array_a.data_type;
-    tail_it = model->operators.emplace(tail_it, slice_a_reshape_op) + 1;
-
-    // tf.slice(b, ...).
-    auto* slice_b_op = new SliceOperator;
-    slice_b_op->inputs = {
-        batch_op->inputs[1],
-        CreateInt32Array(model, batch_name + "/slice_b/slice/begin",
-                         {batch, 0, 0}),
-        CreateInt32Array(
-            model, batch_name + "/slice_b/slice/size",
-            {1, input_array_b.shape().dims(1), input_array_b.shape().dims(2)}),
-    };
-    slice_b_op->outputs = {AvailableArrayName(*model, batch_name + "/slice_b")};
-    auto& slice_b_op_output = model->GetOrCreateArray(slice_b_op->outputs[0]);
-    slice_b_op_output.data_type = input_array_b.data_type;
-    tail_it = model->operators.emplace(tail_it, slice_b_op) + 1;
-
-    // Reshape to remove the first dimension ([1,M,N] -> [M,N]).
-    auto* slice_b_reshape_op = new TensorFlowReshapeOperator;
-    slice_b_reshape_op->inputs = {
-        slice_b_op->outputs[0],
-        CreateInt32Array(model, batch_name + "/slice_b/reshape/shape",
-                         {-1, input_array_b.shape().dims(2)})};
-    slice_b_reshape_op->outputs = {
-        AvailableArrayName(*model, batch_name + "/slice_b/reshape")};
-    auto& slice_b_reshape_op_output =
-        model->GetOrCreateArray(slice_b_reshape_op->outputs[0]);
-    slice_b_reshape_op_output.data_type = input_array_b.data_type;
-    tail_it = model->operators.emplace(tail_it, slice_b_reshape_op) + 1;
-
-    // tf.matmul(slice_a, slice_b).
-    auto* matmul_op = new TensorFlowMatMulOperator;
-    matmul_op->inputs = {slice_a_reshape_op->outputs[0],
-                         slice_b_reshape_op->outputs[0]};
-    matmul_op->outputs = {AvailableArrayName(*model, batch_name)};
-    auto& matmul_op_output = model->GetOrCreateArray(matmul_op->outputs[0]);
-    matmul_op_output.data_type = input_array_a.data_type;
-    tail_it = model->operators.emplace(tail_it, matmul_op) + 1;
+  CHECK_GE(input_array_a.shape().dimensions_count(), 3)
+      << "Input arrays must have rank >= 3";
 
-    // Add to stack.
-    pack_inputs.push_back(matmul_op->outputs[0]);
-  }
+  const auto& dims_vec = input_array_a.shape().dims();
+  AddMessageF("Unrolling BatchMatMul %s %d times", LogName(*batch_op),
+              std::accumulate(dims_vec.begin(), dims_vec.end() - 2, 1,
+                              std::multiplies<int>()));
 
-  // The pack that will join all the individual matmul results together.
+  std::vector<string> pack_inputs = UnrollBatchMatMulRecursion(
+      input_lhs, input_rhs, batch_op, model, &tail_it, {});
   auto* pack_op = new PackOperator;
   pack_op->inputs = pack_inputs;
   pack_op->outputs = {batch_op->outputs[0]};
diff --git a/tensorflow/lite/toco/import_tensorflow.cc b/tensorflow/lite/toco/import_tensorflow.cc
index 0b2f810394311a33899b9242e73131e109a2b4c0..db7c5e6fd56549bb6e005c57db62edb23a56218d 100644
--- a/tensorflow/lite/toco/import_tensorflow.cc
+++ b/tensorflow/lite/toco/import_tensorflow.cc
@@ -235,6 +235,131 @@ tensorflow::Status ImportShape(
   return NumElements(input_dims_only_sizes, input_flat_size);
 }
 
+// Define ways to retrieve data from tensors of different types.
+// TODO(b/80208043): simply use tensorflow::Tensor::FromProto() instead.
+template <typename T>
+struct TensorTraits;
+
+template <>
+struct TensorTraits<float> {
+  static int size(const TensorProto& p) { return p.float_val_size(); }
+  static float get(const TensorProto& p, int i) { return p.float_val(i); }
+  static string accessor_name() { return "float_val"; }
+  static string type_name() { return "float"; }
+  static void CopyFromContent(const TensorProto& p, std::vector<float>* data) {
+    toco::port::CopyToBuffer(p.tensor_content(),
+                             reinterpret_cast<char*>(data->data()));
+  }
+};
+
+template <>
+struct TensorTraits<uint8_t> {
+  static int size(const TensorProto& p) { return p.int_val_size(); }
+  static uint8_t get(const TensorProto& p, int i) { return p.int_val(i); }
+  static string accessor_name() { return "int_val"; }
+  static string type_name() { return "uint8"; }
+  static void CopyFromContent(const TensorProto& p,
+                              std::vector<uint8_t>* data) {
+    toco::port::CopyToBuffer(p.tensor_content(),
+                             reinterpret_cast<char*>(data->data()));
+  }
+};
+
+template <>
+struct TensorTraits<std::complex<float>> {
+  static int size(const TensorProto& p) { return p.scomplex_val_size() / 2; }
+  static std::complex<float> get(const TensorProto& p, int i) {
+    return std::complex<float>(p.scomplex_val(2 * i),
+                               p.scomplex_val(2 * i + 1));
+  }
+  static string accessor_name() { return "scomplex_val"; }
+  static string type_name() { return "complex64"; }
+  static void CopyFromContent(const TensorProto& p,
+                              std::vector<std::complex<float>>* data) {
+    toco::port::CopyToBuffer(p.tensor_content(),
+                             reinterpret_cast<char*>(data->data()));
+  }
+};
+
+template <>
+struct TensorTraits<int32> {
+  static int size(const TensorProto& p) { return p.int_val_size(); }
+  static int32 get(const TensorProto& p, int i) { return p.int_val(i); }
+  static string accessor_name() { return "int_val"; }
+  static string type_name() { return "int32"; }
+  static void CopyFromContent(const TensorProto& p, std::vector<int32>* data) {
+    toco::port::CopyToBuffer(p.tensor_content(),
+                             reinterpret_cast<char*>(data->data()));
+  }
+};
+
+template <>
+struct TensorTraits<int64> {
+  static int size(const TensorProto& p) { return p.int64_val_size(); }
+  static int64 get(const TensorProto& p, int i) { return p.int64_val(i); }
+  static string accessor_name() { return "int64_val"; }
+  static string type_name() { return "int64"; }
+  static void CopyFromContent(const TensorProto& p, std::vector<int64>* data) {
+    toco::port::CopyToBuffer(p.tensor_content(),
+                             reinterpret_cast<char*>(data->data()));
+  }
+};
+
+template <>
+struct TensorTraits<bool> {
+  static int size(const TensorProto& p) { return p.bool_val_size(); }
+  static bool get(const TensorProto& p, int i) { return p.bool_val(i); }
+  static string accessor_name() { return "bool_val"; }
+  static string type_name() { return "bool"; }
+  static void CopyFromContent(const TensorProto& p, std::vector<bool>* data) {
+    std::vector<char> buf(p.tensor_content().size());
+    toco::port::CopyToBuffer(p.tensor_content(), buf.data());
+    for (int i = 0; i < p.tensor_content().size(); i++) {
+      (*data)[i] = static_cast<bool>(buf[i]);
+    }
+  }
+};
+
+template <typename T>
+tensorflow::Status ImportTensorData(const TensorProto& input_tensor,
+                                    int input_flat_size,
+                                    std::vector<T>* output_data) {
+  CHECK_GE(output_data->size(), input_flat_size);
+  int num_elements_in_tensor = TensorTraits<T>::size(input_tensor);
+  if (num_elements_in_tensor == input_flat_size) {
+    for (int i = 0; i < num_elements_in_tensor; i++) {
+      (*output_data)[i] = TensorTraits<T>::get(input_tensor, i);
+    }
+  } else if (input_tensor.tensor_content().size() ==
+             input_flat_size * sizeof(T)) {
+    TensorTraits<T>::CopyFromContent(input_tensor, output_data);
+  } else if (num_elements_in_tensor > 0 &&
+             num_elements_in_tensor < input_flat_size) {
+    // TODO(b/80208043): use tensorflow::Tensor::FromProto() which is the
+    // official way to import tensor data. This particular else-if handles a
+    // grappler optimization where the last few elements in a tensor are
+    // omitted if they are repeated.
+    int i = 0;
+    for (; i < num_elements_in_tensor; ++i) {
+      (*output_data)[i] = TensorTraits<T>::get(input_tensor, i);
+    }
+    auto last = (*output_data)[i - 1];
+    for (; i < input_flat_size; ++i) {
+      (*output_data)[i] = last;
+    }
+  } else {
+    string accessor_name = TensorTraits<T>::accessor_name();
+    string type_name = TensorTraits<T>::type_name();
+    return tensorflow::errors::InvalidArgument(
+        absl::StrCat("Neither input_content (",
+                     input_tensor.tensor_content().size() / sizeof(T), ") nor ",
+                     accessor_name, " (", num_elements_in_tensor,
+                     ") have the right dimensions (", input_flat_size,
+                     ") for this ", type_name, " tensor"));
+  }
+  return tensorflow::Status::OK();
+}
+
 tensorflow::Status ImportFloatArray(const TensorProto& input_tensor,
                                     Array* output_array) {
   CHECK_EQ(input_tensor.dtype(), DT_FLOAT);
@@ -249,28 +374,8 @@ tensorflow::Status ImportFloatArray(const TensorProto& input_tensor,
       output_array->GetMutableBuffer<ArrayDataType::kFloat>().data;
   output_float_data.resize(RequiredBufferSizeForShape(output_array->shape()),
                            0.f);
-  CHECK_GE(output_float_data.size(), input_flat_size);
-  if (input_tensor.float_val_size() == 1) {
-    for (int i = 0; i < input_flat_size; i++) {
-      output_float_data[i] = input_tensor.float_val(0);
-    }
-  } else if (input_tensor.float_val_size() == input_flat_size) {
-    for (int i = 0; i < input_tensor.float_val_size(); i++) {
-      output_float_data[i] = input_tensor.float_val(i);
-    }
-  } else if (input_tensor.tensor_content().size() ==
-             input_flat_size * sizeof(float)) {
-    toco::port::CopyToBuffer(input_tensor.tensor_content(),
-                             reinterpret_cast<char*>(output_float_data.data()));
-  } else {
-    return tensorflow::errors::InvalidArgument(
-        absl::StrCat("Neither input_content (",
-                     input_tensor.tensor_content().size() / sizeof(float),
-                     ") nor float_val (", input_tensor.float_val_size(),
-                     ") have the right dimensions (", input_flat_size,
-                     ") for this float tensor"));
-  }
-  return tensorflow::Status::OK();
+  return ImportTensorData<float>(input_tensor, input_flat_size,
+                                 &output_float_data);
 }
 
 tensorflow::Status ImportComplex64Array(const TensorProto& input_tensor,
@@ -287,32 +392,8 @@ tensorflow::Status ImportComplex64Array(const TensorProto& input_tensor,
       output_array->GetMutableBuffer<ArrayDataType::kComplex64>().data;
   output_complex_data.resize(RequiredBufferSizeForShape(output_array->shape()),
                              std::complex<float>(0.f, 0.f));
-  CHECK_GE(output_complex_data.size(), input_flat_size);
-  if (input_tensor.scomplex_val_size() == 2) {
-    for (int i = 0; i < input_flat_size; i++) {
-      output_complex_data[i] = std::complex<float>(
-          input_tensor.scomplex_val(0), input_tensor.scomplex_val(1));
-    }
-  } else if (input_tensor.scomplex_val_size() == 2 * input_flat_size) {
-    for (int i = 0; i < input_flat_size; ++i) {
-      output_complex_data[i] =
-          std::complex<float>(input_tensor.scomplex_val(2 * i),
-                              input_tensor.scomplex_val(2 * i + 1));
-    }
-  } else if (input_tensor.tensor_content().size() ==
-             input_flat_size * sizeof(std::complex<float>)) {
-    toco::port::CopyToBuffer(
-        input_tensor.tensor_content(),
-        reinterpret_cast<char*>(output_complex_data.data()));
-  } else {
-    return tensorflow::errors::InvalidArgument(absl::StrCat(
-        "Neither input_content (",
-        input_tensor.tensor_content().size() / sizeof(std::complex<float>),
-        ") nor scomplex_val (", input_tensor.scomplex_val_size(),
-        ") have the right dimensions (", input_flat_size,
-        ") for this complex64 tensor"));
-  }
-  return tensorflow::Status::OK();
+  return ImportTensorData<std::complex<float>>(input_tensor, input_flat_size,
+                                               &output_complex_data);
 }
 
 tensorflow::Status ImportQuint8Array(const TensorProto& input_tensor,
@@ -328,28 +409,8 @@ tensorflow::Status ImportQuint8Array(const TensorProto& input_tensor,
   auto& output_int_data =
       output_array->GetMutableBuffer<ArrayDataType::kUint8>().data;
   output_int_data.resize(RequiredBufferSizeForShape(output_array->shape()), 0);
-  CHECK_GE(output_int_data.size(), input_flat_size);
-  if (input_tensor.int_val_size() == 1) {
-    for (int i = 0; i < input_flat_size; i++) {
-      output_int_data[i] = input_tensor.int_val(0);
-    }
-  } else if (input_tensor.int_val_size() == input_flat_size) {
-    for (int i = 0; i < input_tensor.int_val_size(); i++) {
-      output_int_data[i] = input_tensor.int_val(i);
-    }
-  } else if (input_tensor.tensor_content().size() ==
-             input_flat_size * sizeof(uint8_t)) {
-    toco::port::CopyToBuffer(input_tensor.tensor_content(),
-                             reinterpret_cast<char*>(output_int_data.data()));
-  } else {
-    return tensorflow::errors::InvalidArgument(
-        absl::StrCat("Neither input_content (",
-                     input_tensor.tensor_content().size() / sizeof(uint8_t),
-                     ") nor int_val (", input_tensor.int_val_size(),
-                     ") have the right dimensions (", input_flat_size,
-                     ") for this uint8 tensor"));
-  }
-  return tensorflow::Status::OK();
+  return ImportTensorData<uint8_t>(input_tensor, input_flat_size,
+                                   &output_int_data);
 }
 
 tensorflow::Status ImportInt32Array(const TensorProto& input_tensor,
@@ -365,27 +426,8 @@ tensorflow::Status ImportInt32Array(const TensorProto& input_tensor,
   auto& output_int_data =
       output_array->GetMutableBuffer<ArrayDataType::kInt32>().data;
   output_int_data.resize(RequiredBufferSizeForShape(output_array->shape()), 0);
-  CHECK_GE(output_int_data.size(), input_flat_size);
-  if (input_tensor.int_val_size() == 1) {
-    for (int i = 0; i < input_flat_size; i++) {
-      output_int_data[i] = input_tensor.int_val(0);
-    }
-  } else if (input_tensor.int_val_size() == input_flat_size) {
-    for (int i = 0; i < input_tensor.int_val_size(); i++) {
-      output_int_data[i] = input_tensor.int_val(i);
-    }
-  } else if (input_tensor.tensor_content().size() ==
-             input_flat_size * sizeof(int32)) {
-    toco::port::CopyToBuffer(input_tensor.tensor_content(),
-                             reinterpret_cast<char*>(output_int_data.data()));
-  } else {
-    return tensorflow::errors::InvalidArgument(absl::StrCat(
-        "Neither input_content (",
-        input_tensor.tensor_content().size() / sizeof(int32), ") nor int_val (",
-        input_tensor.int_val_size(), ") have the right dimensions (",
-        input_flat_size, ") for this int32 tensor"));
-  }
-  return tensorflow::Status::OK();
+  return ImportTensorData<int32>(input_tensor, input_flat_size,
+                                 &output_int_data);
 }
 
 tensorflow::Status ImportInt64Array(const TensorProto& input_tensor,
@@ -401,28 +443,8 @@ tensorflow::Status ImportInt64Array(const TensorProto& input_tensor,
   auto& output_int_data =
       output_array->GetMutableBuffer<ArrayDataType::kInt64>().data;
   output_int_data.resize(RequiredBufferSizeForShape(output_array->shape()), 0);
-  CHECK_GE(output_int_data.size(), input_flat_size);
-  if (input_tensor.int64_val_size() == 1) {
-    for (int i = 0; i < input_flat_size; i++) {
-      output_int_data[i] = input_tensor.int64_val(0);
-    }
-  } else if (input_tensor.int64_val_size() == input_flat_size) {
-    for (int i = 0; i < input_tensor.float_val_size(); i++) {
-      output_int_data[i] = input_tensor.int64_val(i);
-    }
-  } else if (input_tensor.tensor_content().size() ==
-             input_flat_size * sizeof(int64)) {
-    toco::port::CopyToBuffer(input_tensor.tensor_content(),
-                             reinterpret_cast<char*>(output_int_data.data()));
-  } else {
-    return tensorflow::errors::InvalidArgument(
-        absl::StrCat("Neither input_content (",
-                     input_tensor.tensor_content().size() / sizeof(int64),
-                     ") nor int64_val (", input_tensor.int64_val_size(),
-                     ") have the right dimensions (", input_flat_size,
-                     ") for this int64 tensor"));
-  }
-  return tensorflow::Status::OK();
+  return ImportTensorData<int64>(input_tensor, input_flat_size,
+                                 &output_int_data);
 }
 
 tensorflow::Status ImportBoolArray(const TensorProto& input_tensor,
@@ -439,36 +461,17 @@ tensorflow::Status ImportBoolArray(const TensorProto& input_tensor,
       output_array->GetMutableBuffer<ArrayDataType::kBool>().data;
   output_bool_data.resize(RequiredBufferSizeForShape(output_array->shape()),
                           false);
-  CHECK_GE(output_bool_data.size(), input_flat_size);
-  if (input_tensor.bool_val_size() == 1) {
-    for (int i = 0; i < input_flat_size; i++) {
-      output_bool_data[i] = input_tensor.bool_val(0);
-    }
-  } else if (input_tensor.bool_val_size() == input_flat_size) {
-    for (int i = 0; i < input_tensor.bool_val_size(); i++) {
-      output_bool_data[i] = input_tensor.bool_val(i);
-    }
-  } else if (input_tensor.tensor_content().size() == input_flat_size) {
-    std::vector<char> buf(input_tensor.tensor_content().size());
-    toco::port::CopyToBuffer(input_tensor.tensor_content(), buf.data());
-    for (int i = 0; i < input_tensor.tensor_content().size(); i++) {
-      output_bool_data[i] = static_cast<bool>(buf[i]);
-    }
-  } else {
+  status =
+      ImportTensorData<bool>(input_tensor, input_flat_size, &output_bool_data);
+  if (!status.ok() && output_bool_data.size() == 1) {
     // Some graphs have bool const nodes without actual value...
     // assuming that 'false' is implied.
     // So far only encountered that in an array with 1 entry, let's
     // require that until we encounter a graph where that's not the case.
-    if (output_bool_data.size() != 1) {
-      return tensorflow::errors::InvalidArgument(absl::StrCat(
-          "Neither input_content (", input_tensor.tensor_content().size(),
-          ") nor bool_val (", input_tensor.bool_val_size(),
-          ") have the right dimensions (", input_flat_size,
-          ") for this bool tensor"));
-    }
     output_bool_data[0] = false;
+    return tensorflow::Status::OK();
   }
-  return tensorflow::Status::OK();
+  return status;
 }
 
 tensorflow::Status ImportStringArray(const TensorProto& input_tensor,
@@ -1089,11 +1092,14 @@ tensorflow::Status ConvertBatchMatMulOperator(
     Model* model) {
   TF_QCHECK_OK(CheckInputsCount(node, tf_import_flags, 2));
 
-  // https://www.tensorflow.org/versions/r0.12/api_docs/python/math_ops/matrix_math_functions
-  CHECK(!HasAttr(node, "adj_a") || (GetBoolAttr(node, "adj_a") == false));
-  CHECK(!HasAttr(node, "adj_b") || (GetBoolAttr(node, "adj_b") == false));
-
   auto* batch_matmul = new BatchMatMulOperator;
+  // https://www.tensorflow.org/versions/r0.12/api_docs/python/math_ops/matrix_math_functions
+  if (HasAttr(node, "adj_x")) {
+    batch_matmul->adj_x = GetBoolAttr(node, "adj_x");
+  }
+  if (HasAttr(node, "adj_y")) {
+    batch_matmul->adj_y = GetBoolAttr(node, "adj_y");
+  }
   batch_matmul->inputs = {node.input(0), node.input(1)};
   batch_matmul->outputs = {node.name()};
 
@@ -1187,7 +1193,7 @@ enum FlexSupport { kFlexOk, kFlexNotOk };
 // taken from the given NodeDef, and its number must match NumInputs, unless
 // kAnyNumInputs is passed in. If kFlexOk is passed in the resulting operator
 // will be eligible for being exported as a flex op.
-template <typename Op, int NumInputs, FlexSupport flex>
+template <typename Op, int NumInputs, int NumOutputs, FlexSupport flex>
 tensorflow::Status ConvertSimpleOperatorGeneric(
     const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
     Model* model) {
@@ -1200,6 +1206,11 @@ tensorflow::Status ConvertSimpleOperatorGeneric(
     op->inputs.push_back(node.input(i));
   }
   op->outputs.push_back(node.name());
+  if (NumOutputs > 1) {
+    for (int i = 1; i < NumOutputs; ++i) {
+      op->outputs.push_back(node.name() + ":" + std::to_string(i));
+    }
+  }
 
   if (flex == kFlexOk) {
     RetainTensorFlowNodeDef(node, op);
@@ -1210,20 +1221,20 @@ tensorflow::Status ConvertSimpleOperatorGeneric(
 }
 
 // Convert a simple operator which is not valid as a flex op.
-template <typename Op, int NumInputs = kAnyNumInputs>
+template <typename Op, int NumInputs, int NumOutputs>
 tensorflow::Status ConvertSimpleOperator(
     const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
     Model* model) {
-  return ConvertSimpleOperatorGeneric<Op, NumInputs, kFlexNotOk>(
+  return ConvertSimpleOperatorGeneric<Op, NumInputs, NumOutputs, kFlexNotOk>(
       node, tf_import_flags, model);
 }
 
 // Convert a simple operator which is valid as a flex op.
-template <typename Op, int NumInputs = kAnyNumInputs>
+template <typename Op, int NumInputs, int NumOutputs>
 tensorflow::Status ConvertSimpleOperatorFlexOk(
     const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
     Model* model) {
-  return ConvertSimpleOperatorGeneric<Op, NumInputs, kFlexOk>(
+  return ConvertSimpleOperatorGeneric<Op, NumInputs, NumOutputs, kFlexOk>(
       node, tf_import_flags, model);
 }
 
@@ -1338,7 +1349,7 @@ tensorflow::Status ConvertUnsupportedOperator(
   }
 
   // Parse outputs. Name them after the node's name, plus an ordinal suffix.
-  // Note that some outputs are to be multipled by a named attribute.
+  // Note that some outputs are to be multiplied by a named attribute.
   const tensorflow::OpDef* op_def = nullptr;
   if (tensorflow::OpRegistry::Global()->LookUpOpDef(node.op(), &op_def).ok()) {
     GetOutputNamesFromNodeDef(node, *op_def, op);
@@ -1399,6 +1410,36 @@ tensorflow::Status ConvertUnsupportedOperator(
   return tensorflow::Status::OK();
 }
 
+// Same as ConvertConstOperator, but revert to ConvertUnsupportedOperator if
+// the types are not supported. Converting Const operators here avoids
+// expensive copies of the protocol buffers downstream in the flex delegate.
+tensorflow::Status ConditionallyConvertConstOperator(
+    const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
+    Model* model) {
+  // We avoid incomplete and zero shapes because the resulting arrays
+  // are not completely compatible with Eager/TensorFlow.
+  const auto& tensor = GetTensorAttr(node, "value");
+  const auto& shape = tensor.tensor_shape();
+  for (const auto& dim : shape.dim()) {
+    if (dim.size() <= 0) {
+      return ConvertUnsupportedOperator(node, tf_import_flags, model);
+    }
+  }
+
+  switch (GetDataTypeAttr(node, "dtype")) {
+    case DT_FLOAT:
+    case DT_INT32:
+    case DT_QUINT8:
+    case DT_INT64:
+    case DT_STRING:
+    case DT_BOOL:
+    case DT_COMPLEX64:
+      return ConvertConstOperator(node, tf_import_flags, model);
+    default:
+      return ConvertUnsupportedOperator(node, tf_import_flags, model);
+  }
+}
+
 tensorflow::Status ConvertStridedSliceOperator(
     const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
     Model* model) {
@@ -1442,7 +1483,7 @@ tensorflow::Status ConvertPlaceholderOperator(
   if (node.attr().count("shape")) {
     const auto& shape = GetShapeAttr(node, "shape");
     auto num_dims = shape.dim_size();
-    // TODO(b/62716978): This logic needs to be revisted.  During dims
+    // TODO(b/62716978): This logic needs to be revisited.  During dims
     // refactoring it is an interim fix.
     if (num_dims > 0 && !HasWildcardDimension(shape)) {
       auto& dst_array_dims = *array.mutable_shape()->mutable_dims();
@@ -1491,6 +1532,20 @@ tensorflow::Status ConvertFloorOperator(
   return tensorflow::Status::OK();
 }
 
+tensorflow::Status ConvertCeilOperator(
+    const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
+    Model* model) {
+  CHECK_EQ(node.op(), "Ceil");
+  TF_QCHECK_OK(CheckInputsCount(node, tf_import_flags, 1));
+  const auto data_type = GetDataTypeAttr(node, "T");
+  CHECK(data_type == DT_FLOAT);
+  auto* op = new CeilOperator;
+  op->inputs.push_back(node.input(0));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+  return tensorflow::Status::OK();
+}
+
 tensorflow::Status ConvertGatherOperator(
     const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
     Model* model) {
@@ -1520,6 +1575,21 @@ tensorflow::Status ConvertGatherOperator(
   return tensorflow::Status::OK();
 }
 
+tensorflow::Status ConvertGatherNdOperator(
+    const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
+    Model* model) {
+  CHECK_EQ(node.op(), "GatherNd");
+  TF_QCHECK_OK(CheckInputsCount(node, tf_import_flags, 2));
+  const auto indices_data_type = GetDataTypeAttr(node, "Tindices");
+  CHECK(indices_data_type == DT_INT32 || indices_data_type == DT_INT64);
+  auto* op = new GatherNdOperator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op);
+  return tensorflow::Status::OK();
+}
+
 template <typename Op>
 tensorflow::Status ConvertArgMinMaxOperator(
     const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
@@ -2279,6 +2349,27 @@ tensorflow::Status ConvertLeakyReluOperator(
   return tensorflow::Status::OK();
 }
 
+tensorflow::Status ConvertUnidirectionalSequenceRnn(
+    const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
+    Model* model) {
+  DCHECK_EQ(node.op(), "UnidirectionalSequenceRnn");
+
+  auto* op = new UnidirectionalSequenceRnnOperator();
+  const auto& indices = GetListAttr(node, "_tflite_input_indices");
+  if (indices.i_size() != node.input().size()) {
+    return tensorflow::errors::InvalidArgument("Input size does not match.");
+  }
+
+  for (const string& input : node.input()) {
+    op->inputs.push_back(input);
+  }
+  // Only use the last one as input.
+  op->outputs.push_back(node.name() + ":1");
+  model->operators.emplace_back(op);
+
+  return tensorflow::Status::OK();
+}
+
 }  // namespace
 
 namespace internal {
@@ -2290,23 +2381,25 @@ using ConverterMapType = std::unordered_map<std::string, ConverterType>;
 
 ConverterMapType GetTensorFlowNodeConverterMapForFlex() {
   return std::unordered_map<std::string, ConverterType>({
-      // We need to let TCO convert Placeholder information into
+      // We need to let TOCO convert Placeholder information into
       // array data, so that the data types are correct.
       {"LegacyFedInput", ConvertPlaceholderOperator},
       {"Placeholder", ConvertPlaceholderOperator},
+      {"Const", ConditionallyConvertConstOperator},
   });
 }
 
 ConverterMapType GetTensorFlowNodeConverterMap() {
   return std::unordered_map<std::string, ConverterType>({
-      {"Abs", ConvertSimpleOperator<AbsOperator>},
-      {"Add", ConvertSimpleOperator<AddOperator, 2>},
-      {"AddN", ConvertSimpleOperatorFlexOk<AddNOperator>},
-      {"All", ConvertSimpleOperator<TensorFlowAllOperator>},
+      {"Abs", ConvertSimpleOperator<AbsOperator, kAnyNumInputs, 1>},
+      {"Add", ConvertSimpleOperator<AddOperator, 2, 1>},
+      {"AddN", ConvertSimpleOperator<AddNOperator, kAnyNumInputs, 1>},
+      {"All", ConvertSimpleOperator<TensorFlowAllOperator, kAnyNumInputs, 1>},
       {"Any", ConvertReduceOperator<TensorFlowAnyOperator>},
       {"ArgMax", ConvertArgMaxOperator},
       {"ArgMin", ConvertArgMinOperator},
-      {"Assert", ConvertSimpleOperator<TensorFlowAssertOperator>},
+      {"Assert",
+       ConvertSimpleOperator<TensorFlowAssertOperator, kAnyNumInputs, 1>},
       {"AvgPool", ConvertAvgPoolOperator},
       {"BatchMatMul", ConvertBatchMatMulOperator},
       {"BatchNormWithGlobalNormalization",
@@ -2314,107 +2407,115 @@ ConverterMapType GetTensorFlowNodeConverterMap() {
       {"BatchToSpaceND", ConvertBatchToSpaceNDOperator},
       {"BiasAdd", ConvertBiasAddOperator},
       {"Cast", ConvertCastOperator},
+      {"Ceil", ConvertCeilOperator},
       {"CheckNumerics", ConvertIdentityOperator},
       {"Concat", ConvertConcatOperator},
       {"ConcatV2", ConvertConcatOperator},
       {"Const", ConvertConstOperator},
       {"Conv2D", ConvertConvOperator},
       {"Conv2DBackpropInput", ConvertTransposeConvOperator},
+      {"Cos", ConvertSimpleOperator<CosOperator, 1, 1>},
       {"CTCBeamSearchDecoder", ConvertCTCBeamSearchDecoderOperator},
       {"DepthToSpace", ConvertDepthToSpaceOperator},
       {"DepthwiseConv2dNative", ConvertDepthwiseConvOperator},
-      {"Div", ConvertSimpleOperator<DivOperator, 2>},
+      {"Div", ConvertSimpleOperator<DivOperator, 2, 1>},
       {"DynamicPartition", ConvertDynamicPartitionOperator},
       {"DynamicStitch", ConvertDynamicStitchOperator},
-      {"Equal", ConvertSimpleOperator<TensorFlowEqualOperator, 2>},
-      {"Exp", ConvertSimpleOperator<ExpOperator, 1>},
-      {"ExpandDims", ConvertSimpleOperator<ExpandDimsOperator, 2>},
+      {"Elu", ConvertSimpleOperator<EluOperator, 1, 1>},
+      {"Equal", ConvertSimpleOperator<TensorFlowEqualOperator, 2, 1>},
+      {"Exp", ConvertSimpleOperator<ExpOperator, 1, 1>},
+      {"ExpandDims", ConvertSimpleOperator<ExpandDimsOperator, 2, 1>},
       {"FakeQuantWithMinMaxArgs", ConvertFakeQuantWithMinMaxArgs},
       {"FakeQuantWithMinMaxVars", ConvertFakeQuantWithMinMaxVars},
-      {"Fill", ConvertSimpleOperator<FillOperator, 2>},
+      {"Fill", ConvertSimpleOperator<FillOperator, 2, 1>},
       {"Floor", ConvertFloorOperator},
-      {"FloorDiv", ConvertSimpleOperator<FloorDivOperator, 2>},
-      {"FloorMod", ConvertSimpleOperator<FloorModOperator, 2>},
+      {"FloorDiv", ConvertSimpleOperator<FloorDivOperator, 2, 1>},
+      {"FloorMod", ConvertSimpleOperator<FloorModOperator, 2, 1>},
       {"FusedBatchNorm", ConvertFusedBatchNormOperator},
       {"Gather", ConvertGatherOperator},
       {"GatherV2", ConvertGatherOperator},
-      {"Greater", ConvertSimpleOperator<TensorFlowGreaterOperator, 2>},
+      {"GatherNd", ConvertGatherNdOperator},
+      {"Greater", ConvertSimpleOperator<TensorFlowGreaterOperator, 2, 1>},
       {"GreaterEqual",
-       ConvertSimpleOperator<TensorFlowGreaterEqualOperator, 2>},
+       ConvertSimpleOperator<TensorFlowGreaterEqualOperator, 2, 1>},
       {"Identity", ConvertIdentityOperator},
       {"LRN", ConvertLRNOperator},
       {"LeakyRelu", ConvertLeakyReluOperator},
       {"LegacyFedInput", ConvertPlaceholderOperator},
-      {"Less", ConvertSimpleOperator<TensorFlowLessOperator, 2>},
-      {"LessEqual", ConvertSimpleOperator<TensorFlowLessEqualOperator, 2>},
-      {"Log", ConvertSimpleOperator<LogOperator, 1>},
-      {"LogicalAnd", ConvertSimpleOperator<LogicalAndOperator, 2>},
-      {"LogicalOr", ConvertSimpleOperator<LogicalOrOperator, 2>},
-      {"LogicalNot", ConvertSimpleOperator<LogicalNotOperator, 1>},
-      {"LogSoftmax", ConvertSimpleOperator<LogSoftmaxOperator, 1>},
+      {"Less", ConvertSimpleOperator<TensorFlowLessOperator, 2, 1>},
+      {"LessEqual", ConvertSimpleOperator<TensorFlowLessEqualOperator, 2, 1>},
+      {"Log", ConvertSimpleOperator<LogOperator, 1, 1>},
+      {"LogicalAnd", ConvertSimpleOperator<LogicalAndOperator, 2, 1>},
+      {"LogicalOr", ConvertSimpleOperator<LogicalOrOperator, 2, 1>},
+      {"LogicalNot", ConvertSimpleOperator<LogicalNotOperator, 1, 1>},
+      {"LogSoftmax", ConvertSimpleOperator<LogSoftmaxOperator, 1, 1>},
       {"MatMul", ConvertMatMulOperator},
       {"Max", ConvertReduceOperator<TensorFlowMaxOperator>},
       {"MaxPool", ConvertMaxPoolOperator},
-      {"Maximum", ConvertSimpleOperator<TensorFlowMaximumOperator, 2>},
+      {"Maximum", ConvertSimpleOperator<TensorFlowMaximumOperator, 2, 1>},
       {"Mean", ConvertReduceOperator<MeanOperator>},
-      {"Merge", ConvertSimpleOperator<TensorFlowMergeOperator, 2>},
+      {"Merge", ConvertSimpleOperator<TensorFlowMergeOperator, 2, 1>},
       {"Min", ConvertReduceOperator<TensorFlowMinOperator>},
-      {"Minimum", ConvertSimpleOperator<TensorFlowMinimumOperator, 2>},
-      {"Mul", ConvertSimpleOperator<MulOperator, 2>},
-      {"Neg", ConvertSimpleOperator<NegOperator, 1>},
+      {"Minimum", ConvertSimpleOperator<TensorFlowMinimumOperator, 2, 1>},
+      {"Mul", ConvertSimpleOperator<MulOperator, 2, 1>},
+      {"Neg", ConvertSimpleOperator<NegOperator, 1, 1>},
       {"NextIteration", ConvertOperatorSpecialCasedAsRNNBackEdge},
       {"NoOp", ConvertNoOpOperator},
-      {"NotEqual", ConvertSimpleOperator<TensorFlowNotEqualOperator, 2>},
+      {"NotEqual", ConvertSimpleOperator<TensorFlowNotEqualOperator, 2, 1>},
       {"OneHot", ConvertOneHotOperator},
       {"Pack", ConvertPackOperator},
-      {"Pad", ConvertSimpleOperator<PadOperator, 2>},
-      {"PadV2", ConvertSimpleOperator<PadV2Operator, 3>},
+      {"Pad", ConvertSimpleOperator<PadOperator, 2, 1>},
+      {"PadV2", ConvertSimpleOperator<PadV2Operator, 3, 1>},
       {"ParallelDynamicStitch", ConvertDynamicStitchOperator},
       {"Placeholder", ConvertPlaceholderOperator},
       {"PlaceholderWithDefault", ConvertIdentityOperator},
-      {"Pow", ConvertSimpleOperator<PowOperator, 2>},
+      {"Pow", ConvertSimpleOperator<PowOperator, 2, 1>},
       {"Prod", ConvertReduceOperator<TensorFlowProdOperator>},
       {"RandomUniform", ConvertRandomUniform},
       {"Range", ConvertRangeOperator},
-      {"Rank", ConvertSimpleOperator<RankOperator, 1>},
-      {"RealDiv", ConvertSimpleOperator<DivOperator, 2>},
-      {"Relu", ConvertSimpleOperator<ReluOperator, 1>},
-      {"Relu6", ConvertSimpleOperator<Relu6Operator, 1>},
-      {"Reshape", ConvertSimpleOperator<TensorFlowReshapeOperator, 2>},
+      {"Rank", ConvertSimpleOperator<TensorFlowRankOperator, 1, 1>},
+      {"RealDiv", ConvertSimpleOperator<DivOperator, 2, 1>},
+      {"Relu", ConvertSimpleOperator<ReluOperator, 1, 1>},
+      {"Relu6", ConvertSimpleOperator<Relu6Operator, 1, 1>},
+      {"Reshape", ConvertSimpleOperator<TensorFlowReshapeOperator, 2, 1>},
       {"ResizeBilinear", ConvertResizeBilinearOperator},
       {"ResizeNearestNeighbor", ConvertResizeNearestNeighborOperator},
-      {"Rsqrt", ConvertSimpleOperator<TensorFlowRsqrtOperator, 1>},
-      {"Select", ConvertSimpleOperator<SelectOperator, 3>},
+      {"ReverseV2", ConvertSimpleOperator<ReverseV2Operator, 2, 1>},
+      {"Rsqrt", ConvertSimpleOperator<TensorFlowRsqrtOperator, 1, 1>},
+      {"Select", ConvertSimpleOperator<SelectOperator, 3, 1>},
       {"Shape", ConvertShapeOperator},
-      {"Sigmoid", ConvertSimpleOperator<LogisticOperator, 1>},
-      {"Sin", ConvertSimpleOperator<SinOperator, 1>},
-      {"Slice", ConvertSimpleOperator<SliceOperator, 3>},
+      {"Sigmoid", ConvertSimpleOperator<LogisticOperator, 1, 1>},
+      {"Sin", ConvertSimpleOperator<SinOperator, 1, 1>},
+      {"Slice", ConvertSimpleOperator<SliceOperator, 3, 1>},
       {"Softmax", ConvertSoftmaxOperator},
       {"SpaceToBatchND", ConvertSpaceToBatchNDOperator},
       {"SpaceToDepth", ConvertSpaceToDepthOperator},
       {"SparseToDense", ConvertSparseToDenseOperator},
       {"Split", ConvertSplitOperator},
       {"SplitV", ConvertSplitVOperator},
-      {"Sqrt", ConvertSimpleOperator<TensorFlowSqrtOperator, 1>},
-      {"Square", ConvertSimpleOperator<TensorFlowSquareOperator, 1>},
+      {"Sqrt", ConvertSimpleOperator<TensorFlowSqrtOperator, 1, 1>},
+      {"Square", ConvertSimpleOperator<TensorFlowSquareOperator, 1, 1>},
       {"SquaredDifference",
-       ConvertSimpleOperator<SquaredDifferenceOperator, 2>},
+       ConvertSimpleOperator<SquaredDifferenceOperator, 2, 1>},
       {"Squeeze", ConvertSqueezeOperator},
       {"StopGradient", ConvertIdentityOperator},
       {"StridedSlice", ConvertStridedSliceOperator},
-      {"Sub", ConvertSimpleOperator<SubOperator, 2>},
+      {"Sub", ConvertSimpleOperator<SubOperator, 2, 1>},
       {"Sum", ConvertReduceOperator<TensorFlowSumOperator>},
       {"Svdf", ConvertSvdfOperator},
       {"Switch", ConvertSwitchOperator},
-      {"Tanh", ConvertSimpleOperator<TanhOperator, 1>},
-      {"Tile", ConvertSimpleOperator<TensorFlowTileOperator, 2>},
+      {"Tanh", ConvertSimpleOperator<TanhOperator, 1, 1>},
+      {"Tile", ConvertSimpleOperator<TensorFlowTileOperator, 2, 1>},
       {"TopK", ConvertTopKV2Operator},
       {"TopKV2", ConvertTopKV2Operator},
-      {"Transpose", ConvertSimpleOperator<TransposeOperator, 2>},
+      {"Transpose", ConvertSimpleOperator<TransposeOperator, 2, 1>},
       {"Unpack", ConvertUnpackOperator},
-      {"ZerosLike", ConvertSimpleOperator<TensorFlowZerosLikeOperator, 1>},
+      {"ZerosLike", ConvertSimpleOperator<TensorFlowZerosLikeOperator, 1, 1>},
       {"UnidirectionalSequenceLstm", ConvertUnidirectionalSequenceLstm},
+      {"UnidirectionalSequenceRnn", ConvertUnidirectionalSequenceRnn},
       {"MirrorPad", ConvertMirrorPadOperator},
+      {"Unique", ConvertSimpleOperator<UniqueOperator, 1, 2>},
+      {"Where", ConvertSimpleOperator<WhereOperator, 1, 1>},
   });
 }
 
diff --git a/tensorflow/lite/toco/import_tensorflow_test.cc b/tensorflow/lite/toco/import_tensorflow_test.cc
index 0be358b1f7be2cc632322558eda3da86d16688af..8ff3f7733afb4355a8e7863594633a6555287c10 100644
--- a/tensorflow/lite/toco/import_tensorflow_test.cc
+++ b/tensorflow/lite/toco/import_tensorflow_test.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/lite/toco/import_tensorflow.h"
+#include "tensorflow/lite/toco/toco_port.h"
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
@@ -23,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/lite/testing/util.h"
 
 namespace toco {
 
@@ -32,10 +34,12 @@ using tensorflow::DT_COMPLEX64;
 using tensorflow::DT_FLOAT;
 using tensorflow::DT_INT32;
 using tensorflow::DT_INT64;
+using tensorflow::DT_INVALID;
 using tensorflow::DT_QUINT8;
 using tensorflow::DT_STRING;
 using tensorflow::NodeDef;
 using tensorflow::Status;
+using ::testing::ElementsAre;
 
 namespace internal {
 using ConverterType = tensorflow::Status (*)(
@@ -44,6 +48,7 @@ using ConverterType = tensorflow::Status (*)(
 using ConverterMapType = std::unordered_map<std::string, ConverterType>;
 
 ConverterMapType GetTensorFlowNodeConverterMap();
+ConverterMapType GetTensorFlowNodeConverterMapForFlex();
 Status ImportTensorFlowNode(const NodeDef&, const TensorFlowImportFlags&,
                             Model*, const ConverterMapType&);
 }  // namespace internal
@@ -114,35 +119,35 @@ void BuildConstNode(std::initializer_list<int64_t> shape,
   switch (dtype) {
     case DT_FLOAT:
       for (int64_t i = 0; i < num_elements; ++i) {
-        t.add_float_val(i / 10000.0);
+        t.add_float_val(i / 10000.0 + 1);
       }
       break;
     case DT_INT32:
       for (int64_t i = 0; i < num_elements; ++i) {
-        t.add_int_val(i % std::numeric_limits<int>::max());
+        t.add_int_val(i % std::numeric_limits<int>::max() + 1);
       }
       break;
     case DT_QUINT8:
       for (int64_t i = 0; i < num_elements; ++i) {
-        t.add_int_val(i % std::numeric_limits<uint8_t>::max());
+        t.add_int_val(i % std::numeric_limits<uint8_t>::max() + 1);
       }
       break;
     case DT_INT64:
       for (int64_t i = 0; i < num_elements; ++i) {
-        t.add_int64_val(i);
+        t.add_int64_val(i + 1);
       }
       break;
     case DT_STRING:
       break;
     case DT_BOOL:
       for (int64_t i = 0; i < num_elements; ++i) {
-        t.add_bool_val(i % 2);
+        t.add_bool_val((i % 2) == 0);
       }
       break;
     case DT_COMPLEX64:
       for (int64_t i = 0; i < num_elements; ++i) {
-        t.add_scomplex_val(i / 10000.0);
-        t.add_scomplex_val(-i / 10000.0);
+        t.add_scomplex_val(i / 10000.0 + 1);
+        t.add_scomplex_val(-i / 10000.0 - 1);
       }
       break;
     default:
@@ -155,6 +160,32 @@ void BuildConstNode(std::initializer_list<int64_t> shape,
 }
 }  //  namespace
 
+TEST(FlexImportTest, ConditionalConst) {
+  Model model;
+  auto build_and_import_node =
+      [&model](const string& name, std::initializer_list<int64_t> shape,
+               tensorflow::DataType dtype, int64_t num_elements) {
+        NodeDef node;
+        BuildConstNode(shape, dtype, num_elements, &node);
+        node.set_name(name);
+
+        const auto converter = internal::GetTensorFlowNodeConverterMapForFlex();
+        return internal::ImportTensorFlowNode(node, TensorFlowImportFlags(),
+                                              &model, converter);
+      };
+
+  EXPECT_TRUE(build_and_import_node("Known", {1, 2, 3}, DT_INT32, 6).ok());
+  EXPECT_TRUE(build_and_import_node("BadType", {1, 2, 3}, DT_INVALID, 6).ok());
+  EXPECT_TRUE(build_and_import_node("Unknown", {1, -2, 3}, DT_INT32, 6).ok());
+
+  // We expect the "Known" node to be converted into an array, while the
+  // "Unknown" and "BadType" nodes are kept as operators.
+  EXPECT_EQ(model.operators.size(), 2);
+  EXPECT_TRUE(model.HasArray("Known"));
+  EXPECT_FALSE(model.HasArray("Unknown"));
+  EXPECT_FALSE(model.HasArray("BadType"));
+}
+
 class ShapeImportTest : public ::testing::TestWithParam<tensorflow::DataType> {
 };
 
@@ -226,23 +257,126 @@ std::vector<tensorflow::DataType> TestTypes() {
   return {DT_FLOAT, DT_INT32, DT_INT64, DT_BOOL, DT_QUINT8, DT_COMPLEX64};
 }
 
-INSTANTIATE_TEST_CASE_P(ShapeImportTest, ShapeImportTest,
-                        ::testing::ValuesIn(TestTypes()));
+INSTANTIATE_TEST_SUITE_P(ShapeImportTest, ShapeImportTest,
+                         ::testing::ValuesIn(TestTypes()));
+
+class ContentImportTest : public ::testing::Test {
+ public:
+  template <ArrayDataType T>
+  std::vector<DataType<T>> ImportAndGetData(const NodeDef& node) {
+    Model model;
+    auto status = ImportNode(node, &model);
+    CHECK(status.ok()) << status.error_message();
+    const auto& array = model.GetArray("Node1");
+    return array.GetBuffer<T>().data;
+  }
+  void RemoveTrailingElements(NodeDef* node, int num) {
+    tensorflow::TensorProto* p =
+        node->mutable_attr()->at("value").mutable_tensor();
+    for (int i = 0; i < num; ++i) {
+      if (p->int_val_size() > 0) p->mutable_int_val()->RemoveLast();
+      if (p->int64_val_size() > 0) p->mutable_int64_val()->RemoveLast();
+      if (p->float_val_size() > 0) p->mutable_float_val()->RemoveLast();
+      if (p->bool_val_size() > 0) p->mutable_bool_val()->RemoveLast();
+      if (p->scomplex_val_size() > 0) p->mutable_scomplex_val()->RemoveLast();
+      if (p->scomplex_val_size() > 0) p->mutable_scomplex_val()->RemoveLast();
+    }
+  }
+};
+
+TEST_F(ContentImportTest, Int32) {
+  constexpr ArrayDataType kType = ArrayDataType::kInt32;
+
+  NodeDef node;
+  BuildConstNode({1, 2, 3}, DT_INT32, 6, &node);
+
+  EXPECT_THAT(ImportAndGetData<kType>(node), ElementsAre(1, 2, 3, 4, 5, 6));
+  RemoveTrailingElements(&node, 1);
+  EXPECT_THAT(ImportAndGetData<kType>(node), ElementsAre(1, 2, 3, 4, 5, 5));
+  RemoveTrailingElements(&node, 4);
+  EXPECT_THAT(ImportAndGetData<kType>(node), ElementsAre(1, 1, 1, 1, 1, 1));
+}
+
+TEST_F(ContentImportTest, Int64) {
+  constexpr ArrayDataType kType = ArrayDataType::kInt64;
+
+  NodeDef node;
+  BuildConstNode({1, 2, 3}, DT_INT64, 6, &node);
+
+  EXPECT_THAT(ImportAndGetData<kType>(node), ElementsAre(1, 2, 3, 4, 5, 6));
+  RemoveTrailingElements(&node, 1);
+  EXPECT_THAT(ImportAndGetData<kType>(node), ElementsAre(1, 2, 3, 4, 5, 5));
+  RemoveTrailingElements(&node, 4);
+  EXPECT_THAT(ImportAndGetData<kType>(node), ElementsAre(1, 1, 1, 1, 1, 1));
+}
+
+TEST_F(ContentImportTest, Quint8) {
+  constexpr ArrayDataType kType = ArrayDataType::kUint8;
+
+  NodeDef node;
+  BuildConstNode({1, 2, 3}, DT_QUINT8, 6, &node);
+
+  EXPECT_THAT(ImportAndGetData<kType>(node), ElementsAre(1, 2, 3, 4, 5, 6));
+  RemoveTrailingElements(&node, 1);
+  EXPECT_THAT(ImportAndGetData<kType>(node), ElementsAre(1, 2, 3, 4, 5, 5));
+  RemoveTrailingElements(&node, 4);
+  EXPECT_THAT(ImportAndGetData<kType>(node), ElementsAre(1, 1, 1, 1, 1, 1));
+}
+
+TEST_F(ContentImportTest, Bool) {
+  constexpr ArrayDataType kType = ArrayDataType::kBool;
+
+  NodeDef node;
+  BuildConstNode({1, 2, 3}, DT_BOOL, 6, &node);
+
+  EXPECT_THAT(ImportAndGetData<kType>(node), ElementsAre(1, 0, 1, 0, 1, 0));
+  RemoveTrailingElements(&node, 1);
+  EXPECT_THAT(ImportAndGetData<kType>(node), ElementsAre(1, 0, 1, 0, 1, 1));
+  RemoveTrailingElements(&node, 4);
+  EXPECT_THAT(ImportAndGetData<kType>(node), ElementsAre(1, 1, 1, 1, 1, 1));
+}
+
+TEST_F(ContentImportTest, Float) {
+  constexpr ArrayDataType kType = ArrayDataType::kFloat;
+
+  NodeDef node;
+  BuildConstNode({1, 2, 3}, DT_FLOAT, 6, &node);
+
+  EXPECT_THAT(ImportAndGetData<kType>(node),
+              ElementsAre(1.0000, 1.0001, 1.0002, 1.0003, 1.0004, 1.0005));
+  RemoveTrailingElements(&node, 1);
+  EXPECT_THAT(ImportAndGetData<kType>(node),
+              ElementsAre(1.0000, 1.0001, 1.0002, 1.0003, 1.0004, 1.0004));
+  RemoveTrailingElements(&node, 4);
+  EXPECT_THAT(ImportAndGetData<kType>(node),
+              ElementsAre(1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000));
+}
+
+TEST_F(ContentImportTest, Complex64) {
+  constexpr ArrayDataType kType = ArrayDataType::kComplex64;
 
-TEST(ImportTest, Complex64ConstNode) {
   NodeDef node;
   BuildConstNode({1, 2, 3}, DT_COMPLEX64, 6, &node);
-  Model model;
-  EXPECT_TRUE(ImportNode(node, &model).ok());
-  const auto& array = model.GetArray("Node1");
-  EXPECT_EQ(ArrayDataType::kComplex64, array.data_type);
-  EXPECT_EQ(6, array.GetBuffer<ArrayDataType::kComplex64>().Length());
-  int64_t i = 0;
-  for (const auto& datum : array.GetBuffer<ArrayDataType::kComplex64>().data) {
-    EXPECT_EQ(i / 10000.0f, std::real(datum));
-    EXPECT_EQ(-i / 10000.0f, std::imag(datum));
-    i++;
-  }
+
+  using cplx = std::complex<float>;
+  EXPECT_THAT(
+      ImportAndGetData<kType>(node),
+      ElementsAre(std::complex<float>(1.0000, -1.0000), cplx(1.0001, -1.0001),
+                  cplx(1.0002, -1.0002), cplx(1.0003, -1.0003),
+                  cplx(1.0004, -1.0004), cplx(1.0005, -1.0005)));
+  RemoveTrailingElements(&node, 1);
+  EXPECT_THAT(
+      ImportAndGetData<kType>(node),
+      ElementsAre(std::complex<float>(1.0000, -1.0000), cplx(1.0001, -1.0001),
+                  cplx(1.0002, -1.0002), cplx(1.0003, -1.0003),
+                  cplx(1.0004, -1.0004), cplx(1.0004, -1.0004)));
+
+  RemoveTrailingElements(&node, 4);
+  EXPECT_THAT(
+      ImportAndGetData<kType>(node),
+      ElementsAre(std::complex<float>(1.0000, -1.0000), cplx(1.0000, -1.0000),
+                  cplx(1.0000, -1.0000), cplx(1.0000, -1.0000),
+                  cplx(1.0000, -1.0000), cplx(1.0000, -1.0000)));
 }
 
 std::vector<std::pair<tensorflow::DataType, ArrayDataType>> UnaryTestTypes() {
@@ -284,8 +418,8 @@ TEST_P(TypeImportTest, BasicTypeInference) {
           model.operators[0].get());
   ASSERT_THAT(op->output_data_types, ::testing::ElementsAre(GetParam().second));
 }
-INSTANTIATE_TEST_CASE_P(BasicTypeInference, TypeImportTest,
-                        ::testing::ValuesIn(UnaryTestTypes()));
+INSTANTIATE_TEST_SUITE_P(BasicTypeInference, TypeImportTest,
+                         ::testing::ValuesIn(UnaryTestTypes()));
 
 TEST(ImportTest, TypeInferenceWithFixedOutputType) {
   // Create an op that has a fixed output type (bool).
@@ -432,3 +566,10 @@ TEST(ImportTest, UnsupportedOpWithMultipleOutputs) {
 
 }  // namespace
 }  // namespace toco
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  ::toco::port::InitGoogleWasDoneElsewhere();
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/toco/model.h b/tensorflow/lite/toco/model.h
index d392535f5c98cdd3532299064f2c6d9305214e71..d9909c11fa863d5ba7e3d7fbd755023bf56a1264 100644
--- a/tensorflow/lite/toco/model.h
+++ b/tensorflow/lite/toco/model.h
@@ -24,11 +24,11 @@ limitations under the License.
 #include <vector>
 
 #include "absl/types/optional.h"
+#include "tensorflow/core/platform/logging.h"
 #include "tensorflow/lite/toco/model_flags.pb.h"
 #include "tensorflow/lite/toco/runtime/types.h"
 #include "tensorflow/lite/toco/toco_port.h"
 #include "tensorflow/lite/toco/toco_types.h"
-#include "tensorflow/core/platform/logging.h"
 
 namespace toco {
 
@@ -42,8 +42,10 @@ enum class OperatorType : uint8 {
   kAveragePool,
   kBatchMatMul,
   kBatchNormalization,
+  kCeil,
   kConv,
   kConcatenation,
+  kCos,
   kDepthwiseConv,
   kDepthToSpace,
   kSpaceToDepth,
@@ -157,7 +159,15 @@ enum class OperatorType : uint8 {
   kResizeNearestNeighbor,
   kLeakyRelu,
   kAbs,
-  kMirrorPad
+  kMirrorPad,
+  kUnique,
+  kUnidirectionalSequenceRnn,
+  kBidirectionalSequenceLstm,
+  kReverseV2,
+  kBidirectionalSequenceRnn,
+  kGatherNd,
+  kWhere,
+  kElu
 };
 
 // Helper to deal with TensorFlow arrays using a different ordering of
@@ -376,7 +386,7 @@ struct Operator {
   // Output activation arrays. Same comments as for inputs apply here too.
   std::vector<string> outputs;
 
-  // If true, the array has more outputs than are listed in the 'outputs'
+  // If true, the operator has more outputs than are listed in the 'outputs'
   // member. These need to be resolved by some graph transformation.
   // This flag is only here to indicate that an operator should not be
   // discarded as unused, even if from its 'outputs' member alone it
@@ -647,6 +657,18 @@ struct UnidirectionalSequenceLstmOperator : Operator {
       : Operator(OperatorType::kUnidirectionalSequenceLstm) {}
 };
 
+struct BidirectionalSequenceLstmOperator : Operator {
+  BidirectionalSequenceLstmOperator()
+      : Operator(OperatorType::kBidirectionalSequenceLstm) {}
+  bool merge_outputs;
+};
+
+struct BidirectionalSequenceRnnOperator : Operator {
+  BidirectionalSequenceRnnOperator()
+      : Operator(OperatorType::kBidirectionalSequenceRnn) {}
+  bool merge_outputs;
+};
+
 // Element-wise multiplication operator.
 //
 // Inputs:
@@ -669,6 +691,17 @@ struct AbsOperator : Operator {
   AbsOperator() : Operator(OperatorType::kAbs) {}
 };
 
+// Elu
+//   f(x) -> exp(x) - 1 for x < 0, x for x >= 0.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Elu
+struct EluOperator : Operator {
+  EluOperator() : Operator(OperatorType::kElu) {}
+};
+
 // Element-wise Relu operator:
 //   x -> max(0, x)
 //
@@ -946,6 +979,8 @@ struct TensorFlowIdentityOperator : Operator {
 // TensorFlow equivalent: MatMul
 struct BatchMatMulOperator : Operator {
   BatchMatMulOperator() : Operator(OperatorType::kBatchMatMul) {}
+  bool adj_x = false;
+  bool adj_y = false;
 };
 
 // General matrix multiplication operator. We don't want to support general
@@ -1148,6 +1183,17 @@ struct ExpOperator : Operator {
   ExpOperator() : Operator(OperatorType::kExp) {}
 };
 
+// Given a tensor input, this operation calculates element-wise exponential
+// (y = cos(x)).
+//
+// Inputs:
+//   inputs[0]: required: input tensor
+//
+// TensorFlow equivalent: Cos
+struct CosOperator : Operator {
+  CosOperator() : Operator(OperatorType::kCos) {}
+};
+
 // Given a tensor input, this operation inserts a dimension of 1 at the
 // dimension index axis of input's shape. The dimension index axis starts at
 // zero; if you specify a negative number for axis it is counted backward from
@@ -1226,13 +1272,12 @@ struct RangeOperator : Operator {
 // Inputs:
 //   inputs[0]: required: the input array
 //
-// This operation outputs a 0-D integer tensor representing the rank of
-// the input.
+// This operation outputs a 0-D int32 Tensor representing the rank of input.
 //
-// TensorFlow equivalent: Rank.  We currently assume that the output is int32
-// and not int64.  The output type could be stored herein.
-struct RankOperator : Operator {
-  RankOperator() : Operator(OperatorType::kRank) {}
+// TensorFlow equivalent: Rank.
+struct TensorFlowRankOperator : Operator {
+  TensorFlowRankOperator() : Operator(OperatorType::kRank) {}
+  ArrayDataType output_data_type = ArrayDataType::kInt32;
 };
 
 // Element-wise negation (-x) operator.
@@ -1658,6 +1703,16 @@ struct FloorOperator : Operator {
   FloorOperator() : Operator(OperatorType::kFloor) {}
 };
 
+// Ceil operator.
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Ceil
+struct CeilOperator : Operator {
+  CeilOperator() : Operator(OperatorType::kCeil) {}
+};
+
 // Gather operator. It gathers slices from params according to indices.
 // Only 1-D indices are supported at the moment.
 //
@@ -1679,10 +1734,22 @@ struct GatherOperator : Operator {
   int input_rank = 0;
 };
 
+// GatherNd operator. It gathers slices from params according to indices.
+//
+// Inputs:
+//   inputs[0]: required: the params array
+//   inputs[1]: required: the indices to gather
+//
+// TensorFlow equivalent: GatherNd
+struct GatherNdOperator : Operator {
+  GatherNdOperator() : Operator(OperatorType::kGatherNd) {}
+};
+
 // ArgMax operator. It returns the index of the maximum value along axis.
 //
 // Inputs:
 //   inputs[0]: required: the input tensor
+//   inputs[1]: optional: 0-D (scalar) axis
 //
 // TensorFlow equivalent: ArgMax
 struct ArgMaxOperator : Operator {
@@ -1694,6 +1761,7 @@ struct ArgMaxOperator : Operator {
 //
 // Inputs:
 //   inputs[0]: required: the input tensor
+//   inputs[1]: optional: 0-D (scalar) axis
 //
 // TensorFlow equivalent: ArgMin
 struct ArgMinOperator : Operator {
@@ -1936,6 +2004,16 @@ struct TensorFlowZerosLikeOperator : Operator {
   TensorFlowZerosLikeOperator() : Operator(OperatorType::kZerosLike) {}
 };
 
+// ReverseV2 operator:
+//
+// Inputs:
+// Inputs[0]: required: the input array.
+//
+// TensorFlow equivalent: ReverseV2.
+struct ReverseV2Operator : Operator {
+  ReverseV2Operator() : Operator(OperatorType::kReverseV2) {}
+};
+
 enum class MirrorPadMode { kNone, kSymmetric, kReflect };
 
 // MirrorPad Operator:
@@ -1953,6 +2031,36 @@ struct MirrorPadOperator : Operator {
   MirrorPadMode mode;
 };
 
+// Unique Operator:
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Unique
+struct UniqueOperator : Operator {
+  UniqueOperator() : Operator(OperatorType::kUnique) {}
+  ArrayDataType idx_out_type = ArrayDataType::kInt32;
+};
+
+struct UnidirectionalSequenceRnnOperator : Operator {
+  UnidirectionalSequenceRnnOperator()
+      : Operator(OperatorType::kUnidirectionalSequenceRnn) {}
+  bool time_major;
+  FusedActivationFunctionType fused_activation_function;
+};
+
+// Where Operator:
+// Return the coordinates of the true values in condition tensor in row-major
+// order.
+//
+// Inputs:
+//  inputs[0]: required: boolean condition tensor
+//
+//  TensorFlow equivalent: Where
+struct WhereOperator : Operator {
+  WhereOperator() : Operator(OperatorType::kWhere) {}
+};
+
 // Alloc's are used for transient arrays only. An Alloc specifies which interval
 // of the "transient_data" workspace buffer passed to inference functions, is to
 // be used for the transient array at hand. The 'start' and 'end' values are
@@ -2208,6 +2316,16 @@ class Model {
   // addresses. See Operator::inputs, Operator::outputs.
   std::unordered_map<string, std::unique_ptr<Array>> arrays;
 };
+
+// OperatorSignature contains the information required to making versioning
+// decisions.
+struct OperatorSignature {
+  // The operator.
+  const Operator* op;
+
+  // The model in which the operator resides.
+  const Model* model;
+};
 }  // namespace toco
 
 #endif  // TENSORFLOW_LITE_TOCO_MODEL_H_
diff --git a/tensorflow/lite/toco/model_cmdline_flags_test.cc b/tensorflow/lite/toco/model_cmdline_flags_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b2f8dc59852d06af001c7e084f1eeedcb040b7a8
--- /dev/null
+++ b/tensorflow/lite/toco/model_cmdline_flags_test.cc
@@ -0,0 +1,67 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <string>
+#include <unordered_map>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/testing/util.h"
+#include "tensorflow/lite/toco/args.h"
+#include "tensorflow/lite/toco/model_cmdline_flags.h"
+
+namespace toco {
+namespace {
+
+TEST(ModelCmdlineFlagsTest, ParseArgsStringMapList) {
+  int args_count = 3;
+  const char* args[] = {
+      "toco",
+      "--input_arrays=input_1",
+      "--rnn_states={state_array:rnn/BasicLSTMCellZeroState/zeros,"
+      "back_edge_source_array:rnn/basic_lstm_cell/Add_1,size:4},"
+      "{state_array:rnn/BasicLSTMCellZeroState/zeros_1,"
+      "back_edge_source_array:rnn/basic_lstm_cell/Mul_2,size:4}",
+  };
+
+  string expected_input_arrays = "input_1";
+  std::vector<std::unordered_map<string, string>> expected_rnn_states;
+  expected_rnn_states.push_back(
+      {{"state_array", "rnn/BasicLSTMCellZeroState/zeros"},
+       {"back_edge_source_array", "rnn/basic_lstm_cell/Add_1"},
+       {"size", "4"}});
+  expected_rnn_states.push_back(
+      {{"state_array", "rnn/BasicLSTMCellZeroState/zeros_1"},
+       {"back_edge_source_array", "rnn/basic_lstm_cell/Mul_2"},
+       {"size", "4"}});
+
+  string message;
+  ParsedModelFlags result_flags;
+
+  EXPECT_TRUE(ParseModelFlagsFromCommandLineFlags(
+      &args_count, const_cast<char**>(args), &message, &result_flags));
+  EXPECT_EQ(result_flags.input_arrays.value(), expected_input_arrays);
+  EXPECT_EQ(result_flags.rnn_states.value().elements, expected_rnn_states);
+}
+
+}  // namespace
+}  // namespace toco
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  ::toco::port::InitGoogleWasDoneElsewhere();
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/toco/python/BUILD b/tensorflow/lite/toco/python/BUILD
index 07056f66c35536e82b8f1fdd7938161e216b850a..2f5654c56e0acca57a2d644a7c50e87c185f721b 100644
--- a/tensorflow/lite/toco/python/BUILD
+++ b/tensorflow/lite/toco/python/BUILD
@@ -1,4 +1,7 @@
-package(default_visibility = ["//visibility:public"])
+package(default_visibility = [
+    "//tensorflow/lite:__subpackages__",
+    "//tensorflow/tools/pip_package:__subpackages__",
+])
 
 licenses(["notice"])  # Apache 2.0
 
@@ -9,7 +12,10 @@ load("//tensorflow:tensorflow.bzl", "py_binary")
 config_setting(
     name = "tflite_convert_with_select_tf_ops",
     define_values = {"tflite_convert_with_select_tf_ops": "true"},
-    visibility = ["//visibility:public"],
+    visibility = [
+        "//tensorflow/contrib/lite:__subpackages__",
+        "//tensorflow/lite:__subpackages__",
+    ],
 )
 
 cc_library(
@@ -19,6 +25,7 @@ cc_library(
     deps = [
         "//third_party/python_runtime:headers",
         "//tensorflow/core:lib",
+        "//tensorflow/lite/python/interpreter_wrapper:python_utils",
         "//tensorflow/lite/toco:model_flags_proto_cc",
         "//tensorflow/lite/toco:toco_flags_proto_cc",
         "//tensorflow/lite/toco:toco_graphviz_dump_options",
@@ -37,6 +44,11 @@ cc_library(
 tf_py_wrap_cc(
     name = "tensorflow_wrap_toco",
     srcs = ["toco.i"],
+    visibility = [
+        "//learning/expander/pod/deep_pod/utils:__subpackages__",
+        "//research/handwriting/converters/tflite:__subpackages__",
+        "//tensorflow/lite:__subpackages__",
+    ],
     deps = [
         ":toco_python_api",
         "//tensorflow/lite/toco:model_flags_proto_cc",
diff --git a/tensorflow/lite/toco/python/toco_python_api.cc b/tensorflow/lite/toco/python/toco_python_api.cc
index ce8e3c9df88ba511fcca9d9a256896624194463b..6fad092f35aa386757885f9320f47e9f372e9f47 100644
--- a/tensorflow/lite/toco/python/toco_python_api.cc
+++ b/tensorflow/lite/toco/python/toco_python_api.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include <vector>
 #include "tensorflow/core/platform/logging.h"
 
+#include "tensorflow/lite/python/interpreter_wrapper/python_utils.h"
 #include "tensorflow/lite/toco/model_flags.pb.h"
 #include "tensorflow/lite/toco/python/toco_python_api.h"
 #include "tensorflow/lite/toco/toco_flags.pb.h"
@@ -26,14 +27,6 @@ limitations under the License.
 
 namespace toco {
 
-#if PY_MAJOR_VERSION >= 3
-#define TOCO_PY_TO_CPPSTRING PyBytes_AsStringAndSize
-#define TOCO_FROM_CPPSTRING_TO_PY PyBytes_FromStringAndSize
-#else
-#define TOCO_PY_TO_CPPSTRING PyString_AsStringAndSize
-#define TOCO_FROM_CPPSTRING_TO_PY PyString_FromStringAndSize
-#endif
-
 // NOTE(aselle): We are using raw PyObject's here because we want to make
 // sure we input and output bytes rather than unicode strings for Python3.
 PyObject* TocoConvert(PyObject* model_flags_proto_txt_raw,
@@ -44,7 +37,7 @@ PyObject* TocoConvert(PyObject* model_flags_proto_txt_raw,
   auto ConvertArg = [&](PyObject* obj, bool* error) {
     char* buf;
     Py_ssize_t len;
-    if (TOCO_PY_TO_CPPSTRING(obj, &buf, &len) == -1) {
+    if (::tflite::python_utils::ConvertFromPyString(obj, &buf, &len) == -1) {
       *error = true;
       return std::string();
     } else {
@@ -96,15 +89,15 @@ PyObject* TocoConvert(PyObject* model_flags_proto_txt_raw,
     PyObject* dict = PyDict_New();
     PyDict_SetItemString(
         dict, "flatbuffer",
-        TOCO_FROM_CPPSTRING_TO_PY(output_file_contents_txt.data(),
-                                  output_file_contents_txt.size()));
+        ::tflite::python_utils::ConvertToPyString(
+            output_file_contents_txt.data(), output_file_contents_txt.size()));
     PyDict_SetItemString(dict, "arithmetic_ops",
                          PyLong_FromLong(model->ArithmeticOpsCount()));
     return dict;
   }
   // Convert arguments back to byte (py3) or str (py2)
-  return TOCO_FROM_CPPSTRING_TO_PY(output_file_contents_txt.data(),
-                                   output_file_contents_txt.size());
+  return ::tflite::python_utils::ConvertToPyString(
+      output_file_contents_txt.data(), output_file_contents_txt.size());
 }
 
 }  // namespace toco
diff --git a/tensorflow/lite/toco/tflite/export.cc b/tensorflow/lite/toco/tflite/export.cc
index 8b9448486dfb60695cddda9dc320c4ab616e8217..1ce37d29018ffdb7ee185e002a1543b053e1eec0 100644
--- a/tensorflow/lite/toco/tflite/export.cc
+++ b/tensorflow/lite/toco/tflite/export.cc
@@ -63,12 +63,12 @@ bool IsControlFlowOp(const string& tensorflow_op) {
   return false;
 }
 
-// Check if a TensorFlow Op is unsupportred by the Flex runtime.
+// Check if a TensorFlow Op is unsupported by the Flex runtime.
 bool IsUnsupportedFlexOp(const string& tensorflow_op) {
   if (IsControlFlowOp(tensorflow_op)) {
     return true;
   }
-  // `HashTableV2` isn't supported for now since it requires an additinonal
+  // `HashTableV2` isn't supported for now since it requires an additional
   // initialization step.
   // TODO(b/117651199): Support `HashTableV2` with Flex runtime.
   if (tensorflow_op == "HashTableV2") {
@@ -106,16 +106,17 @@ void WriteModelToString(const flatbuffers::FlatBufferBuilder& builder,
 namespace details {
 
 OperatorKey::OperatorKey(
-    const ::toco::Operator& op,
+    const ::toco::OperatorSignature& op_signature,
     const std::map<OperatorType, std::unique_ptr<BaseOperator>>& ops_by_type,
     bool enable_select_tf_ops) {
   // Get the op name (by Toco definition).
+  const ::toco::Operator& op = *op_signature.op;
   string name = HelpfulOperatorTypeName(op);
 
   bool is_builtin = false;
   const auto& builtin_ops = GetBuiltinOpsMap();
   if (ops_by_type.count(op.type) != 0) {
-    version_ = ops_by_type.at(op.type)->GetVersion(op);
+    version_ = ops_by_type.at(op.type)->GetVersion(op_signature);
     name = ops_by_type.at(op.type)->name();
     is_builtin = (builtin_ops.count(name) > 0);
   }
@@ -156,7 +157,7 @@ OperatorKey::OperatorKey(
         string(::tflite::kFlexCustomCodePrefix) + flex_tensorflow_op_;
   } else {
     // If Flex is disabled or the original TensorFlow NodeDef isn't available,
-    // we produce a custom op. This gives developers a chance to implemenr
+    // we produce a custom op. This gives developers a chance to implement
     // custom ops.
     custom_code_ = name;
   }
@@ -190,7 +191,8 @@ void LoadOperatorsMap(
   // First find a list of unique operator types.
   std::set<OperatorKey> keys;
   for (const auto& op : model.operators) {
-    keys.insert(OperatorKey(*op, ops_by_type, enable_select_tf_ops));
+    const toco::OperatorSignature op_signature = {op.get(), &model};
+    keys.insert(OperatorKey(op_signature, ops_by_type, enable_select_tf_ops));
   }
   // Now assign indices to them and fill in the map.
   int index = 0;
@@ -220,7 +222,7 @@ Offset<Vector<Offset<Tensor>>> ExportTensors(
 
     std::vector<int> shape;
     if (array.has_shape()) {
-      for (int d : array.shape().dims()) {
+      for (const auto& d : array.shape().dims()) {
         shape.push_back(d);
       }
     }
@@ -301,8 +303,9 @@ Offset<Vector<Offset<OperatorCode>>> ExportOperatorCodes(
   std::map<int, Offset<OperatorCode>> ordered_opcodes;
 
   for (const auto& op : model.operators) {
-    const details::OperatorKey operator_key =
-        details::OperatorKey(*op, ops_by_type, params.enable_select_tf_ops);
+    const toco::OperatorSignature op_signature = {op.get(), &model};
+    const details::OperatorKey operator_key = details::OperatorKey(
+        op_signature, ops_by_type, params.enable_select_tf_ops);
     int op_index = operators_map.at(operator_key);
 
     flatbuffers::Offset<flatbuffers::String> custom_code = 0;
@@ -349,9 +352,9 @@ Offset<Vector<Offset<Operator>>> ExportOperators(
     for (const string& output : op->outputs) {
       outputs.push_back(tensors_map.at(output));
     }
-
-    const auto key =
-        details::OperatorKey(*op, ops_by_type, params.enable_select_tf_ops);
+    const toco::OperatorSignature op_signature = {op.get(), &model};
+    const auto key = details::OperatorKey(op_signature, ops_by_type,
+                                          params.enable_select_tf_ops);
     int op_index = operators_map.at(key);
 
     auto tflite_op_it = ops_by_type.find(op->type);
diff --git a/tensorflow/lite/toco/tflite/export.h b/tensorflow/lite/toco/tflite/export.h
index adf6757a3027e53912af03a51dbdfdcdca6b60e8..08d9c9567788b80ddf95aa2c2d20991c9f0fe06d 100644
--- a/tensorflow/lite/toco/tflite/export.h
+++ b/tensorflow/lite/toco/tflite/export.h
@@ -76,7 +76,7 @@ inline void Export(const Model& model, string* output_file_contents) {
 
 namespace details {
 
-// A maps from tensor name to its final position in the TF Lite buffer.
+// A map from tensor name to its final position in the TF Lite buffer.
 using TensorsMap = std::unordered_map<string, int>;
 
 // A key to identify an operator.
@@ -88,7 +88,7 @@ class OperatorKey {
 
   // Construct OperatorKey by Toco op.
   OperatorKey(
-      const ::toco::Operator& op,
+      const ::toco::OperatorSignature& op_signature,
       const std::map<OperatorType, std::unique_ptr<BaseOperator>>& ops_by_type,
       bool enable_select_tf_ops);
 
@@ -158,7 +158,7 @@ class OperatorKey {
   std::string flex_tensorflow_op_;
 };
 
-// A maps from operator type to its final position in the TF Lite buffer.
+// A map from OperatorKey to its final position in the TF Lite buffer.
 using OperatorsMap = std::unordered_map<OperatorKey, int, OperatorKey::Hash>;
 
 void LoadTensorsMap(const Model& model, TensorsMap* tensors_map);
diff --git a/tensorflow/lite/toco/tflite/export_test.cc b/tensorflow/lite/toco/tflite/export_test.cc
index b371296784a34e081ae9bc5c1497348d9eb925ba..fb640f776abdef3e5a59d075d3bc15e8d0f9565f 100644
--- a/tensorflow/lite/toco/tflite/export_test.cc
+++ b/tensorflow/lite/toco/tflite/export_test.cc
@@ -41,11 +41,37 @@ class ExportTest : public ::testing::Test {
       if (name == "Conv") {
         auto* op = new ConvOperator;
         op->padding.type = PaddingType::kSame;
+        op->inputs = {"input", "filter"};
+        op->outputs = {"output"};
+        Array& input_array = input_model_.GetOrCreateArray(op->inputs[0]);
+        Array& filter_array = input_model_.GetOrCreateArray(op->inputs[1]);
+        Array& output_array = input_model_.GetOrCreateArray(op->outputs[0]);
+        input_array.data_type = ArrayDataType::kFloat;
+        filter_array.data_type = ArrayDataType::kFloat;
+        output_array.data_type = ArrayDataType::kFloat;
         input_model_.operators.emplace_back(op);
       } else if (name == "Add") {
-        input_model_.operators.emplace_back(new AddOperator);
+        auto* op = new AddOperator;
+        op->inputs = {"input1", "input2"};
+        op->outputs = {"output"};
+        Array& input1_array = input_model_.GetOrCreateArray(op->inputs[0]);
+        Array& input2_array = input_model_.GetOrCreateArray(op->inputs[1]);
+        Array& output_array = input_model_.GetOrCreateArray(op->outputs[0]);
+        input1_array.data_type = ArrayDataType::kFloat;
+        input2_array.data_type = ArrayDataType::kFloat;
+        output_array.data_type = ArrayDataType::kFloat;
+        input_model_.operators.emplace_back(op);
       } else if (name == "Sub") {
-        input_model_.operators.emplace_back(new SubOperator);
+        auto* op = new SubOperator;
+        op->inputs = {"input1", "input2"};
+        op->outputs = {"output"};
+        Array& input1_array = input_model_.GetOrCreateArray(op->inputs[0]);
+        Array& input2_array = input_model_.GetOrCreateArray(op->inputs[1]);
+        Array& output_array = input_model_.GetOrCreateArray(op->outputs[0]);
+        input1_array.data_type = ArrayDataType::kFloat;
+        input2_array.data_type = ArrayDataType::kFloat;
+        output_array.data_type = ArrayDataType::kFloat;
+        input_model_.operators.emplace_back(op);
       } else if (name == "Assert") {
         auto* op = new TensorFlowAssertOperator;
 
@@ -97,9 +123,27 @@ class ExportTest : public ::testing::Test {
       auto* op = new ConvOperator;
       op->padding.type = PaddingType::kSame;
       op->inputs = {"inputs", "weights"};
+      op->outputs = {"output"};
+      Array& input_array = input_model_.GetArray(op->inputs[0]);
+      Array& filter_array = input_model_.GetArray(op->inputs[1]);
+      Array& output_array = input_model_.GetOrCreateArray(op->outputs[0]);
+      input_array.data_type = ArrayDataType::kFloat;
+      filter_array.data_type = ArrayDataType::kFloat;
+      output_array.data_type = ArrayDataType::kFloat;
+      input_model_.operators.emplace_back(op);
+    }
+    {
+      auto* op = new AddOperator;
+      op->inputs = {"input1", "input2"};
+      op->outputs = {"output"};
+      Array& input1_array = input_model_.GetOrCreateArray(op->inputs[0]);
+      Array& input2_array = input_model_.GetOrCreateArray(op->inputs[1]);
+      Array& output_array = input_model_.GetOrCreateArray(op->outputs[0]);
+      input1_array.data_type = ArrayDataType::kFloat;
+      input2_array.data_type = ArrayDataType::kFloat;
+      output_array.data_type = ArrayDataType::kFloat;
       input_model_.operators.emplace_back(op);
     }
-    input_model_.operators.emplace_back(new AddOperator);
   }
 
   std::vector<string> ExportAndSummarizeOperators(const ExportParams& params) {
@@ -301,8 +345,9 @@ class FakeConvolutionOperator
                         OperatorType::kConv) {}
 
   // Returning the op version according to the op parameters.
-  int GetVersion(const Operator& op) const override {
-    const TocoOperator& conv_op = static_cast<const TocoOperator&>(op);
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const TocoOperator& conv_op =
+        static_cast<const TocoOperator&>(*op_signature.op);
     if (conv_op.dilation_width_factor != 1 ||
         conv_op.dilation_height_factor != 1) {
       // Version 2 if dilation is used.
@@ -429,7 +474,7 @@ TEST_F(VersionedOpExportTest, Export) {
   auto* model = ::tflite::GetModel(result.data());
   auto operator_codes = model->operator_codes();
 
-  // Verify that 2 operator codes are populdated. Both are CONV_2D but with
+  // Verify that 2 operator codes are populated. Both are CONV_2D but with
   // different versions.
   EXPECT_EQ(2, operator_codes->size());
   EXPECT_EQ(::tflite::BuiltinOperator_CONV_2D,
@@ -448,22 +493,58 @@ TEST_F(VersionedOpExportTest, Export) {
 }
 
 TEST(OperatorKeyTest, TestBuiltinOp) {
+  Model model;
   auto op = absl::make_unique<ConvOperator>();
 
+  // Test a normal float operation.
+  op->inputs = {"input", "filter"};
+  op->outputs = {"output"};
+  Array& input_array = model.GetOrCreateArray(op->inputs[0]);
+  Array& filter_array = model.GetOrCreateArray(op->inputs[1]);
+  Array& output_array = model.GetOrCreateArray(op->outputs[0]);
+  input_array.data_type = ArrayDataType::kFloat;
+  filter_array.data_type = ArrayDataType::kFloat;
+  output_array.data_type = ArrayDataType::kFloat;
+
   const auto ops_by_type = BuildOperatorByTypeMap();
-  const auto key = details::OperatorKey(*op, ops_by_type, false);
+  const toco::OperatorSignature op_signature = {op.get(), &model};
+  const auto key = details::OperatorKey(op_signature, ops_by_type, false);
 
   EXPECT_EQ(key.type(), ::tflite::BuiltinOperator_CONV_2D);
   EXPECT_EQ(key.custom_code(), "");
   EXPECT_EQ(key.version(), 1);
 }
 
+TEST(OperatorKeyTest, TestBuiltinOpWithVersionedInputTypes) {
+  Model model;
+  auto op = absl::make_unique<DequantizeOperator>();
+
+  op->inputs = {"input"};
+  op->outputs = {"output"};
+  Array& input_array = model.GetOrCreateArray(op->inputs[0]);
+  Array& output_array = model.GetOrCreateArray(op->outputs[0]);
+  input_array.data_type = ArrayDataType::kInt8;
+  output_array.data_type = ArrayDataType::kFloat;
+
+  const auto ops_by_type = BuildOperatorByTypeMap();
+
+  // Test a signed int8 dequantize operation.
+  const toco::OperatorSignature op_signature = {op.get(), &model};
+  const auto key = details::OperatorKey(op_signature, ops_by_type, false);
+
+  EXPECT_EQ(key.type(), ::tflite::BuiltinOperator_DEQUANTIZE);
+  EXPECT_EQ(key.custom_code(), "");
+  EXPECT_EQ(key.version(), 2);
+}
+
 TEST(OperatorKeyTest, TestCustomOp) {
+  Model model;
   auto op = absl::make_unique<TensorFlowUnsupportedOperator>();
   op->tensorflow_op = "MyCrazyCustomOp";
 
   const auto ops_by_type = BuildOperatorByTypeMap();
-  const auto key = details::OperatorKey(*op, ops_by_type, false);
+  const toco::OperatorSignature op_signature = {op.get(), &model};
+  const auto key = details::OperatorKey(op_signature, ops_by_type, false);
 
   EXPECT_EQ(key.type(), ::tflite::BuiltinOperator_CUSTOM);
   EXPECT_EQ(key.custom_code(), "MyCrazyCustomOp");
@@ -471,12 +552,14 @@ TEST(OperatorKeyTest, TestCustomOp) {
 }
 
 TEST(OperatorKeyTest, TestFlexOp) {
+  Model model;
   auto op = absl::make_unique<TensorFlowUnsupportedOperator>();
   op->tensorflow_op = "BatchMatMul";
 
   const auto ops_by_type = BuildOperatorByTypeMap();
   {
-    const auto key = details::OperatorKey(*op, ops_by_type, false);
+    const toco::OperatorSignature op_signature = {op.get(), &model};
+    const auto key = details::OperatorKey(op_signature, ops_by_type, false);
     // It shouldn't be converted to Flex op if `allow_flex_op` is false.
     EXPECT_EQ(key.type(), ::tflite::BuiltinOperator_CUSTOM);
     EXPECT_EQ(key.custom_code(), "BatchMatMul");
@@ -488,7 +571,8 @@ TEST(OperatorKeyTest, TestFlexOp) {
   {
     // Verify that the custom op name is prefixed by "Flex" and `is_flex_op`
     // is true.
-    const auto key = details::OperatorKey(*op, ops_by_type, true);
+    const toco::OperatorSignature op_signature = {op.get(), &model};
+    const auto key = details::OperatorKey(op_signature, ops_by_type, true);
     EXPECT_EQ(key.type(), ::tflite::BuiltinOperator_CUSTOM);
     EXPECT_EQ(key.custom_code(), "FlexBatchMatMul");
     EXPECT_EQ(key.version(), 1);
@@ -498,11 +582,13 @@ TEST(OperatorKeyTest, TestFlexOp) {
 }
 
 TEST(OperatorKeyTest, TestFlexWithControlFlowOp) {
+  Model model;
   auto op = absl::make_unique<TensorFlowUnsupportedOperator>();
   op->tensorflow_op = "Merge";
 
   const auto ops_by_type = BuildOperatorByTypeMap();
-  const auto key = details::OperatorKey(*op, ops_by_type, true);
+  const toco::OperatorSignature op_signature = {op.get(), &model};
+  const auto key = details::OperatorKey(op_signature, ops_by_type, true);
 
   EXPECT_EQ(key.type(), ::tflite::BuiltinOperator_CUSTOM);
   EXPECT_EQ(key.custom_code(), "FlexMerge");
@@ -514,11 +600,13 @@ TEST(OperatorKeyTest, TestFlexWithControlFlowOp) {
 }
 
 TEST(OperatorKeyTest, TestFlexWithUnsupportedOp) {
+  Model model;
   auto op = absl::make_unique<TensorFlowUnsupportedOperator>();
   op->tensorflow_op = "HashTableV2";
 
   const auto ops_by_type = BuildOperatorByTypeMap();
-  const auto key = details::OperatorKey(*op, ops_by_type, true);
+  const toco::OperatorSignature op_signature = {op.get(), &model};
+  const auto key = details::OperatorKey(op_signature, ops_by_type, true);
 
   EXPECT_EQ(key.type(), ::tflite::BuiltinOperator_CUSTOM);
   EXPECT_EQ(key.custom_code(), "HashTableV2");
@@ -532,6 +620,7 @@ TEST(OperatorKeyTest, TestFlexWithUnsupportedOp) {
 
 TEST(OperatorKeyTest, TestFlexWithPartiallySupportedOps) {
   // Test Toco-supported/TFLite-unsupported operators.
+  Model model;
   // TODO(ycling): The test will be broken if TensorFlowAssert is implemented in
   // TFLite. Find a more robust way to test the fallback logic.
   auto op = absl::make_unique<TensorFlowAssertOperator>();
@@ -541,7 +630,8 @@ TEST(OperatorKeyTest, TestFlexWithPartiallySupportedOps) {
   {
     // If NodeDef isn't retained in the Toco op, a regular custom op
     // will be exported.
-    const auto key = details::OperatorKey(*op, ops_by_type, true);
+    const toco::OperatorSignature op_signature = {op.get(), &model};
+    const auto key = details::OperatorKey(op_signature, ops_by_type, true);
     EXPECT_EQ(key.type(), ::tflite::BuiltinOperator_CUSTOM);
     EXPECT_EQ(key.custom_code(), "Assert");
     EXPECT_EQ(key.version(), 1);
@@ -556,7 +646,8 @@ TEST(OperatorKeyTest, TestFlexWithPartiallySupportedOps) {
 
   {
     // If NodeDef is retained in the Toco op, a Flex op will be exported.
-    const auto key = details::OperatorKey(*op, ops_by_type, true);
+    const toco::OperatorSignature op_signature = {op.get(), &model};
+    const auto key = details::OperatorKey(op_signature, ops_by_type, true);
     EXPECT_EQ(key.type(), ::tflite::BuiltinOperator_CUSTOM);
     EXPECT_EQ(key.custom_code(), "FlexAssert");
     EXPECT_EQ(key.version(), 1);
diff --git a/tensorflow/lite/toco/tflite/import_test.cc b/tensorflow/lite/toco/tflite/import_test.cc
index 93ab5141abe81c4ed4c1ff0ac7ca5e89577c71fb..b00c4124d83ae558b4aa6f5ecc2ba9eb06e5dac0 100644
--- a/tensorflow/lite/toco/tflite/import_test.cc
+++ b/tensorflow/lite/toco/tflite/import_test.cc
@@ -60,7 +60,7 @@ class ImportTest : public ::testing::Test {
                                builder_.CreateString("tensor_one"), q);
     auto t2 =
         ::tflite::CreateTensor(builder_, builder_.CreateVector<int>({2, 1}),
-                               ::tflite::TensorType_FLOAT32, 2,
+                               ::tflite::TensorType_FLOAT32, 0,
                                builder_.CreateString("tensor_two"), q);
     return builder_.CreateVector(
         std::vector<Offset<::tflite::Tensor>>({t1, t2}));
diff --git a/tensorflow/lite/toco/tflite/operator.cc b/tensorflow/lite/toco/tflite/operator.cc
index 205af23da57b08c8c62367df1c154bea5e50cc57..6c83ef628800acc93d238dcd5edd0358f189f3f7 100644
--- a/tensorflow/lite/toco/tflite/operator.cc
+++ b/tensorflow/lite/toco/tflite/operator.cc
@@ -14,19 +14,22 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/lite/toco/tflite/operator.h"
 
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_def.pb.h"
+#include "tensorflow/core/util/ptr_util.h"
+
 // TODO(ycling): Consider refactoring to extract the LSTM definition out of
 // graph_transformation module.
+#include "tensorflow/lite/schema/schema_generated.h"
 #include "tensorflow/lite/toco/graph_transformations/lstm_utils.h"
+#include "tensorflow/lite/toco/model.h"
 #include "tensorflow/lite/toco/tflite/builtin_operator.h"
 #include "tensorflow/lite/toco/tflite/custom_operator.h"
 #include "tensorflow/lite/toco/tflite/simple_operator.h"
 #include "tensorflow/lite/toco/tflite/types.h"
 #include "tensorflow/lite/toco/tflite/whitelisted_flex_ops.h"
-#include "tensorflow/core/framework/attr_value.pb.h"
-#include "tensorflow/core/framework/node_def.pb.h"
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_def.pb.h"
-#include "tensorflow/core/util/ptr_util.h"
 
 namespace toco {
 
@@ -60,7 +63,14 @@ class AveragePool
         ActivationFunction::Deserialize(options.fused_activation_function());
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
 };
 
 class Convolution
@@ -92,7 +102,28 @@ class Convolution
         ActivationFunction::Deserialize(options.fused_activation_function());
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const string& filter_name = op_signature.op->inputs[1];
+    const string& output_name = op_signature.op->outputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    const Array& filter_array = op_signature.model->GetArray(filter_name);
+    const Array& output_array = op_signature.model->GetArray(output_name);
+    // If the op has signed int8 inputs and outputs, its version 3.
+    if (input_array.data_type == ArrayDataType::kInt8 &&
+        filter_array.data_type == ArrayDataType::kInt8 &&
+        output_array.data_type == ArrayDataType::kInt8) {
+      return 3;
+    }
+    // If the op is a signed int8 hybrid operation, we need to return
+    // version 2.
+    if (input_array.data_type == ArrayDataType::kFloat &&
+        filter_array.data_type == ArrayDataType::kInt8 &&
+        output_array.data_type == ArrayDataType::kFloat) {
+      return 2;
+    }
+    return 1;
+  }
 };
 
 class DepthwiseConvolution
@@ -126,8 +157,21 @@ class DepthwiseConvolution
     op->dilation_height_factor = options.dilation_h_factor();
   }
 
-  int GetVersion(const Operator& op) const override {
-    const auto& conv_op = static_cast<const DepthwiseConvOperator&>(op);
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const auto& conv_op =
+        static_cast<const DepthwiseConvOperator&>(*op_signature.op);
+    const string& input_name = op_signature.op->inputs[0];
+    const string& filter_name = op_signature.op->inputs[1];
+    const string& output_name = op_signature.op->outputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    const Array& filter_array = op_signature.model->GetArray(filter_name);
+    const Array& output_array = op_signature.model->GetArray(output_name);
+    // If the op has signed int8 inputs and outputs, its version 3.
+    if (input_array.data_type == ArrayDataType::kInt8 &&
+        filter_array.data_type == ArrayDataType::kInt8 &&
+        output_array.data_type == ArrayDataType::kInt8) {
+      return 3;
+    }
     if (conv_op.dilation_width_factor != 1 ||
         conv_op.dilation_height_factor != 1) {
       return 2;
@@ -155,7 +199,34 @@ class Add : public BuiltinOperator<AddOperator, ::tflite::AddOptions,
         ActivationFunction::Deserialize(options.fused_activation_function());
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
+class AddN : public BuiltinOperator<AddNOperator, ::tflite::AddNOptions,
+                                    ::tflite::BuiltinOptions_AddNOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateAddNOptions(*builder);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {}
+
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    return 1;
+  }
 };
 
 class SpaceToBatchND
@@ -174,7 +245,15 @@ class SpaceToBatchND
   void ReadOptions(const TfLiteOptions& options,
                    TocoOperator* op) const override {}
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op take int8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
 };
 
 class Sub : public BuiltinOperator<SubOperator, ::tflite::SubOptions,
@@ -196,7 +275,15 @@ class Sub : public BuiltinOperator<SubOperator, ::tflite::SubOptions,
         ActivationFunction::Deserialize(options.fused_activation_function());
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op take int8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
 };
 
 class Div : public BuiltinOperator<DivOperator, ::tflite::DivOptions,
@@ -218,7 +305,9 @@ class Div : public BuiltinOperator<DivOperator, ::tflite::DivOptions,
         ActivationFunction::Deserialize(options.fused_activation_function());
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    return 1;
+  }
 };
 
 class BatchToSpaceND
@@ -237,7 +326,15 @@ class BatchToSpaceND
   void ReadOptions(const TfLiteOptions& options,
                    TocoOperator* op) const override {}
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op take int8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
 };
 
 class Cast : public BuiltinOperator<CastOperator, ::tflite::CastOptions,
@@ -258,7 +355,9 @@ class Cast : public BuiltinOperator<CastOperator, ::tflite::CastOptions,
     op->dst_data_type = DataType::Deserialize(options.out_data_type());
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    return 1;
+  }
 };
 
 class Concatenation
@@ -278,7 +377,9 @@ class Concatenation
     op->axis = options.axis();
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    return 1;
+  }
 };
 
 class DepthToSpace : public CustomOperator<DepthToSpaceOperator> {
@@ -292,7 +393,9 @@ class DepthToSpace : public CustomOperator<DepthToSpaceOperator> {
     op->block_size = m["block_size"].AsInt64();
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    return 1;
+  }
 };
 
 class FakeQuant
@@ -315,9 +418,8 @@ class FakeQuant
     op->num_bits = options.num_bits();
     op->narrow_range = options.narrow_range();
   }
-
-  int GetVersion(const Operator& op) const override {
-    const auto& fq_op = static_cast<const FakeQuantOperator&>(op);
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const auto& fq_op = static_cast<const FakeQuantOperator&>(*op_signature.op);
     return fq_op.narrow_range ? 2 : 1;
   }
 };
@@ -369,10 +471,45 @@ class FullyConnected
     }
   }
 
-  int GetVersion(const Operator& op) const override {
-    const auto& fc_op = static_cast<const FullyConnectedOperator&>(op);
-    return fc_op.weights_format == FullyConnectedWeightsFormat::kDefault ? 1
-                                                                         : 2;
+  // +-----------------+--------------------+--------------------------+
+  // |                 |    Weight::Default | Weight::Shuffled4x16Int8 |
+  // +-----------------+--------------------+--------------------------+
+  // | Float           |                  1 |                        2 |
+  // | Quantized Uint8 |                  1 |                        2 |
+  // | Hybrid          |                  3 |                        3 |
+  // | Quantized Int8  |                  4 |                        4 |
+  // +-----------------+--------------------+--------------------------+
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const auto& fc_op =
+        static_cast<const FullyConnectedOperator&>(*op_signature.op);
+    const string& input_name = op_signature.op->inputs[0];
+    const string& weights_name = op_signature.op->inputs[1];
+    const string& output_name = op_signature.op->outputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    const Array& weights_array = op_signature.model->GetArray(weights_name);
+    const Array& output_array = op_signature.model->GetArray(output_name);
+    // Int8 fully fixed point kernel is at version 4.
+    if (input_array.data_type == ArrayDataType::kInt8 &&
+        weights_array.data_type == ArrayDataType::kInt8 &&
+        output_array.data_type == ArrayDataType::kInt8) {
+      return 4;
+    }
+    // If the op is a signed int8 hybrid operation, we need to return
+    // version 3.
+    if (input_array.data_type == ArrayDataType::kFloat &&
+        weights_array.data_type == ArrayDataType::kInt8 &&
+        output_array.data_type == ArrayDataType::kFloat) {
+      return 3;
+    }
+    // For float and uint8 fixed point kernels, if the weight is
+    // Shuffled4x16Int8, is is version 2.
+    if (fc_op.weights_format ==
+        FullyConnectedWeightsFormat::kShuffled4x16Int8) {
+      return 2;
+    }
+
+    // Otherwise (weight is default), the version is 1.
+    return 1;
   }
 };
 
@@ -392,7 +529,35 @@ class Gather : public BuiltinOperator<GatherOperator, ::tflite::GatherOptions,
     op->axis = {options.axis()};
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op take int8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
+class GatherNd
+    : public BuiltinOperator<GatherNdOperator, ::tflite::GatherNdOptions,
+                             ::tflite::BuiltinOptions_GatherNdOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateGatherNdOptions(*builder);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {}
+
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    return 1;
+  }
 };
 
 class Svdf : public BuiltinOperator<SvdfOperator, ::tflite::SVDFOptions,
@@ -414,7 +579,23 @@ class Svdf : public BuiltinOperator<SvdfOperator, ::tflite::SVDFOptions,
     op->rank = options.rank();
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const string& weights_feature_name = op_signature.op->inputs[1];
+    const string& output_name = op_signature.op->outputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    const Array& weights_feature_array =
+        op_signature.model->GetArray(weights_feature_name);
+    const Array& output_array = op_signature.model->GetArray(output_name);
+    // If the op is a signed int8 hybrid operation, we need to return
+    // version 2.
+    if (input_array.data_type == ArrayDataType::kFloat &&
+        weights_feature_array.data_type == ArrayDataType::kInt8 &&
+        output_array.data_type == ArrayDataType::kFloat) {
+      return 2;
+    }
+    return 1;
+  }
 };
 
 class L2Normalization
@@ -436,7 +617,9 @@ class L2Normalization
         ActivationFunction::Deserialize(options.fused_activation_function());
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    return 1;
+  }
 };
 
 class L2Pool : public BuiltinOperator<L2PoolOperator, ::tflite::Pool2DOptions,
@@ -465,7 +648,9 @@ class L2Pool : public BuiltinOperator<L2PoolOperator, ::tflite::Pool2DOptions,
         ActivationFunction::Deserialize(options.fused_activation_function());
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    return 1;
+  }
 };
 
 class LocalResponseNormalization
@@ -490,7 +675,9 @@ class LocalResponseNormalization
     op->beta = options.beta();
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    return 1;
+  }
 };
 
 class MaxPool : public BuiltinOperator<MaxPoolOperator, ::tflite::Pool2DOptions,
@@ -519,7 +706,42 @@ class MaxPool : public BuiltinOperator<MaxPoolOperator, ::tflite::Pool2DOptions,
         ActivationFunction::Deserialize(options.fused_activation_function());
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
+class Maximum : public SimpleOperator<TensorFlowMaximumOperator> {
+ public:
+  explicit Maximum() : SimpleOperator("MAXIMUM", OperatorType::kMaximum) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
+class Minimum : public SimpleOperator<TensorFlowMinimumOperator> {
+ public:
+  explicit Minimum() : SimpleOperator("MINIMUM", OperatorType::kMinimum) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
 };
 
 class Mul : public BuiltinOperator<MulOperator, ::tflite::MulOptions,
@@ -541,7 +763,15 @@ class Mul : public BuiltinOperator<MulOperator, ::tflite::MulOptions,
         ActivationFunction::Deserialize(options.fused_activation_function());
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
 };
 
 class Pad : public BuiltinOperator<PadOperator, ::tflite::PadOptions,
@@ -558,7 +788,15 @@ class Pad : public BuiltinOperator<PadOperator, ::tflite::PadOptions,
   void ReadOptions(const TfLiteOptions& options,
                    TocoOperator* op) const override {}
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op take int8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
 };
 
 class Tile
@@ -574,7 +812,9 @@ class Tile
 
   void ReadOptions(const TfLiteOptions& options,
                    TocoOperator* op) const override {}
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    return 1;
+  }
 };
 
 class PadV2 : public BuiltinOperator<PadV2Operator, ::tflite::PadV2Options,
@@ -591,7 +831,15 @@ class PadV2 : public BuiltinOperator<PadV2Operator, ::tflite::PadV2Options,
   void ReadOptions(const TfLiteOptions& options,
                    TocoOperator* op) const override {}
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op take int8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
 };
 
 class Reshape
@@ -614,7 +862,9 @@ class Reshape
                      options.new_shape()->end());
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    return 1;
+  }
 };
 
 class Softmax
@@ -633,7 +883,14 @@ class Softmax
     op->beta = options.beta();
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
 };
 
 class SpaceToDepth
@@ -653,7 +910,15 @@ class SpaceToDepth
     op->block_size = options.block_size();
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op take int8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
 };
 
 class Transpose
@@ -670,7 +935,15 @@ class Transpose
   void ReadOptions(const TfLiteOptions& options,
                    TocoOperator* op) const override {}
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op take int8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
 };
 
 class Lstm : public BuiltinOperator<LstmCellOperator, ::tflite::LSTMOptions,
@@ -713,12 +986,28 @@ class Lstm : public BuiltinOperator<LstmCellOperator, ::tflite::LSTMOptions,
     }
   }
 
-  int GetVersion(const Operator& op) const override {
-    const auto& lstm_op = static_cast<const LstmCellOperator&>(op);
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const auto& lstm_op =
+        static_cast<const LstmCellOperator&>(*op_signature.op);
     switch (lstm_op.kernel_type) {
-      case LstmCellOperator::KERNEL_FULL:
+      case LstmCellOperator::KERNEL_FULL: {
+        // If the input tensor is float and a weight is int8, this is a version
+        // 3 hybrid operation.
+        const string& input_name = op_signature.op->inputs[0];
+        const string& weights_name = op_signature.op->inputs[2];
+        const string& output_name = op_signature.op->outputs[0];
+        const Array& input_array = op_signature.model->GetArray(input_name);
+        const Array& weights_array = op_signature.model->GetArray(weights_name);
+        const Array& output_array = op_signature.model->GetArray(output_name);
+        if (input_array.data_type == ArrayDataType::kFloat &&
+            weights_array.data_type == ArrayDataType::kInt8 &&
+            output_array.data_type == ArrayDataType::kFloat) {
+          return 3;
+        }
         return 1;
+      }
       case LstmCellOperator::KERNEL_BASIC:
+        // KERNEL_BASIC was added in version 2.
         return 2;
     }
   }
@@ -770,7 +1059,22 @@ class UnidirectionalSequenceLstm
            ::tflite::ActivationFunctionType_TANH);
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    // If the input tensor is float and a weight is int8, this is a version
+    // 2 hybrid operation.
+    const string& input_name = op_signature.op->inputs[0];
+    const string& weights_name = op_signature.op->inputs[2];
+    const string& output_name = op_signature.op->outputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    const Array& weights_array = op_signature.model->GetArray(weights_name);
+    const Array& output_array = op_signature.model->GetArray(output_name);
+    if (input_array.data_type == ArrayDataType::kFloat &&
+        weights_array.data_type == ArrayDataType::kInt8 &&
+        output_array.data_type == ArrayDataType::kFloat) {
+      return 2;
+    }
+    return 1;
+  }
 
   std::vector<bool> GetMutatingInputVariables(
       const Operator& op) const override {
@@ -781,6 +1085,94 @@ class UnidirectionalSequenceLstm
   }
 };
 
+class BidirectionalSequenceLstm
+    : public BuiltinOperator<
+          BidirectionalSequenceLstmOperator,
+          ::tflite::BidirectionalSequenceLSTMOptions,
+          ::tflite::BuiltinOptions_BidirectionalSequenceLSTMOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    // Current toco converter only supports tanh, no clip.
+    return ::tflite::CreateBidirectionalSequenceLSTMOptions(
+        *builder, /*fused_activation_function=*/
+        ::tflite::ActivationFunctionType_TANH,
+        /*cell_clip=*/0.0,
+        /*proj_clip=*/0.0,
+        /*merge_outputs=*/op.merge_outputs,
+        /*time_major=*/true);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    // Only support tanh activation, so check that tflite type is tanh.
+    DCHECK(options.fused_activation_function() ==
+           ::tflite::ActivationFunctionType_TANH);
+    op->merge_outputs = options.merge_outputs();
+  }
+
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    return 1;
+  }
+
+  std::vector<bool> GetMutatingInputVariables(
+      const Operator& op) const override {
+    std::vector<bool> mutating_input_variables(op.inputs.size(), false);
+    // Forward input activation state.
+    mutating_input_variables[35] = true;
+    // Forward input cell state.
+    mutating_input_variables[36] = true;
+    // Backward input activation state.
+    mutating_input_variables[37] = true;
+    // Backward input cell state.
+    mutating_input_variables[38] = true;
+    return mutating_input_variables;
+  }
+};
+
+class BidirectionalSequenceRnn
+    : public BuiltinOperator<
+          BidirectionalSequenceRnnOperator,
+          ::tflite::BidirectionalSequenceRNNOptions,
+          ::tflite::BuiltinOptions_BidirectionalSequenceRNNOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    // Current toco converter only supports tanh, no clip.
+    return ::tflite::CreateBidirectionalSequenceRNNOptions(
+        *builder, /*time_major=*/true,
+        /*fused_activation_function=*/
+        ::tflite::ActivationFunctionType_TANH,
+        /*merge_outputs=*/op.merge_outputs);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    // Only support tanh activation, so check that tflite type is tanh.
+    DCHECK(options.fused_activation_function() ==
+           ::tflite::ActivationFunctionType_TANH);
+    op->merge_outputs = options.merge_outputs();
+  }
+
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    return 1;
+  }
+
+  std::vector<bool> GetMutatingInputVariables(
+      const Operator& op) const override {
+    std::vector<bool> mutating_input_variables(op.inputs.size(), false);
+    // Forward hidden state.
+    mutating_input_variables[4] = true;
+    // Backward hidden state.
+    mutating_input_variables[8] = true;
+    return mutating_input_variables;
+  }
+};
+
 class Mean : public BuiltinOperator<MeanOperator, ::tflite::ReducerOptions,
                                     ::tflite::BuiltinOptions_ReducerOptions> {
  public:
@@ -796,7 +1188,9 @@ class Mean : public BuiltinOperator<MeanOperator, ::tflite::ReducerOptions,
     op->keep_dims = options.keep_dims();
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    return 1;
+  }
 };
 
 class Sum
@@ -815,7 +1209,9 @@ class Sum
     op->keep_dims = options.keep_dims();
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    return 1;
+  }
 };
 
 class ReduceMax
@@ -834,7 +1230,15 @@ class ReduceMax
     op->keep_dims = options.keep_dims();
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op take int8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
 };
 
 class ReduceMin
@@ -853,7 +1257,15 @@ class ReduceMin
     op->keep_dims = options.keep_dims();
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op take int8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
 };
 
 class ReduceProd
@@ -872,7 +1284,9 @@ class ReduceProd
     op->keep_dims = options.keep_dims();
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    return 1;
+  }
 };
 
 class ReduceAny
@@ -891,7 +1305,23 @@ class ReduceAny
     op->keep_dims = options.keep_dims();
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    return 1;
+  }
+};
+
+class Relu6 : public SimpleOperator<Relu6Operator> {
+ public:
+  explicit Relu6() : SimpleOperator("RELU6", OperatorType::kRelu6) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
 };
 
 class ResizeBilinear
@@ -911,7 +1341,15 @@ class ResizeBilinear
     op->align_corners = options.align_corners();
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op takes int8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
 };
 
 class ResizeNearestNeighbor
@@ -932,7 +1370,15 @@ class ResizeNearestNeighbor
     op->align_corners = options.align_corners();
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
 };
 
 class Squeeze
@@ -955,7 +1401,9 @@ class Squeeze
                             options.squeeze_dims()->end());
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    return 1;
+  }
 };
 
 class Split
@@ -975,7 +1423,17 @@ class Split
     op->num_split = options.num_splits();
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op take int8 input, it is version 2, for int32 it's version 3.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    } else if (input_array.data_type == ArrayDataType::kInt32) {
+      return 3;
+    }
+    return 1;
+  }
 };
 
 class SplitV
@@ -995,7 +1453,9 @@ class SplitV
     op->num_split = options.num_splits();
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    return 1;
+  }
 };
 
 class StridedSlice
@@ -1021,7 +1481,15 @@ class StridedSlice
     op->shrink_axis_mask = options.shrink_axis_mask();
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op take int8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
 };
 
 class TopK_V2 : public BuiltinOperator<TopKV2Operator, ::tflite::TopKV2Options,
@@ -1037,7 +1505,14 @@ class TopK_V2 : public BuiltinOperator<TopKV2Operator, ::tflite::TopKV2Options,
   void ReadOptions(const TfLiteOptions& options,
                    TocoOperator* op) const override {}
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
 };
 
 class ArgMax : public BuiltinOperator<ArgMaxOperator, ::tflite::ArgMaxOptions,
@@ -1056,7 +1531,15 @@ class ArgMax : public BuiltinOperator<ArgMaxOperator, ::tflite::ArgMaxOptions,
     op->output_data_type = DataType::Deserialize(options.output_type());
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+
+    return 1;
+  }
 };
 
 class ArgMin : public BuiltinOperator<ArgMinOperator, ::tflite::ArgMinOptions,
@@ -1075,7 +1558,15 @@ class ArgMin : public BuiltinOperator<ArgMinOperator, ::tflite::ArgMinOptions,
     op->output_data_type = DataType::Deserialize(options.output_type());
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+
+    return 1;
+  }
 };
 
 class TransposeConv
@@ -1100,7 +1591,9 @@ class TransposeConv
     op->stride_height = options.stride_h();
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    return 1;
+  }
 };
 
 class SparseToDense
@@ -1121,7 +1614,9 @@ class SparseToDense
     op->validate_indices = options.validate_indices();
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    return 1;
+  }
 };
 
 class ExpandDims
@@ -1139,7 +1634,9 @@ class ExpandDims
   void ReadOptions(const TfLiteOptions& options,
                    TocoOperator* op) const override {}
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    return 1;
+  }
 };
 
 class Pack : public BuiltinOperator<PackOperator, ::tflite::PackOptions,
@@ -1159,7 +1656,15 @@ class Pack : public BuiltinOperator<PackOperator, ::tflite::PackOptions,
     op->axis = options.axis();
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // If the op take int8 input, it is version 2.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
 };
 
 class Shape
@@ -1179,7 +1684,37 @@ class Shape
     op->output_data_type = DataType::Deserialize(options.out_type());
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    return 1;
+  }
+};
+
+class Slice : public SimpleOperator<SliceOperator> {
+ public:
+  explicit Slice() : SimpleOperator("SLICE", OperatorType::kSlice) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
+class Tanh : public SimpleOperator<TanhOperator> {
+ public:
+  explicit Tanh() : SimpleOperator("TANH", OperatorType::kTanh) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
 };
 
 class OneHot : public BuiltinOperator<OneHotOperator, ::tflite::OneHotOptions,
@@ -1196,7 +1731,9 @@ class OneHot : public BuiltinOperator<OneHotOperator, ::tflite::OneHotOptions,
     op->axis = options.axis();
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    return 1;
+  }
 };
 
 class CTCBeamSearchDecoder
@@ -1217,7 +1754,9 @@ class CTCBeamSearchDecoder
     op->merge_repeated = m["merge_repeated"].AsBool();
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    return 1;
+  }
 };
 
 class Unpack : public BuiltinOperator<UnpackOperator, ::tflite::UnpackOptions,
@@ -1235,7 +1774,9 @@ class Unpack : public BuiltinOperator<UnpackOperator, ::tflite::UnpackOptions,
     op->axis = options.axis();
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    return 1;
+  }
 };
 
 class LeakyRelu
@@ -1253,7 +1794,38 @@ class LeakyRelu
     op->alpha = options.alpha();
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    return 1;
+  }
+};
+
+class Logistic : public SimpleOperator<LogisticOperator> {
+ public:
+  explicit Logistic() : SimpleOperator("LOGISTIC", OperatorType::kLogistic) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
+class LogSoftmax : public SimpleOperator<LogSoftmaxOperator> {
+ public:
+  explicit LogSoftmax()
+      : SimpleOperator("LOG_SOFTMAX", OperatorType::kLogSoftmax) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
 };
 
 class SquaredDifference
@@ -1272,7 +1844,9 @@ class SquaredDifference
   void ReadOptions(const TfLiteOptions& options,
                    TocoOperator* op) const override {}
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    return 1;
+  }
 };
 
 class MirrorPad
@@ -1295,7 +1869,86 @@ class MirrorPad
                    : MirrorPadMode::kSymmetric;
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op) const override { return 1; }
+};
+
+class Unique : public BuiltinOperator<UniqueOperator, ::tflite::UniqueOptions,
+                                      ::tflite::BuiltinOptions_UniqueOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    const UniqueOperator& unique_op = static_cast<const UniqueOperator&>(op);
+    return ::tflite::CreateUniqueOptions(
+        *builder, unique_op.idx_out_type == toco::ArrayDataType::kInt64
+                      ? ::tflite::TensorType::TensorType_INT64
+                      : ::tflite::TensorType_INT32);
+  }
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    UniqueOperator* unique_op = static_cast<UniqueOperator*>(op);
+    unique_op->idx_out_type =
+        options.idx_out_type() == ::tflite::TensorType_INT64
+            ? toco::ArrayDataType::kInt64
+            : toco::ArrayDataType::kInt32;
+  }
+
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    return 1;
+  }
+};
+
+class UnidirectionalSequenceRnn
+    : public BuiltinOperator<UnidirectionalSequenceRnnOperator,
+                             ::tflite::SequenceRNNOptions,
+                             ::tflite::BuiltinOptions_SequenceRNNOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateSequenceRNNOptions(
+        *builder, /*time_major=*/true,
+        /*fused_activation_function=*/
+        ::tflite::ActivationFunctionType_TANH);
+  }
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    // Only support tanh actication, so check that tflite type is tanh.
+    DCHECK(options.fused_activation_function() ==
+           ::tflite::ActivationFunctionType_TANH);
+  }
+
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    return 1;
+  }
+
+  std::vector<bool> GetMutatingInputVariables(
+      const Operator& op) const override {
+    std::vector<bool> mutating_input_variables(op.inputs.size(), false);
+    mutating_input_variables[4] = true;
+    return mutating_input_variables;
+  }
+};
+
+class Where : public BuiltinOperator<WhereOperator, ::tflite::WhereOptions,
+                                     ::tflite::BuiltinOptions_WhereOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateWhereOptions(*builder);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {}
+
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    return 1;
+  }
 };
 
 std::unique_ptr<flexbuffers::Builder> WriteFlexOpOptions(
@@ -1396,13 +2049,27 @@ class TensorFlowUnsupported : public BaseOperator {
           has_valid_attr = true;
           break;
         case tensorflow::AttrValue::kList:
-          if (attr.list().i_size() > 0) {
+          if (attr.list().s_size() > 0) {
+            auto start = fbb->StartVector(key);
+            for (const string& v : attr.list().s()) {
+              fbb->Add(v);
+            }
+            fbb->EndVector(start, /*typed=*/true, /*fixed=*/false);
+            has_valid_attr = true;
+          } else if (attr.list().i_size() > 0) {
             auto start = fbb->StartVector(key);
             for (const int64_t v : attr.list().i()) {
               fbb->Add(v);
             }
             fbb->EndVector(start, /*typed=*/true, /*fixed=*/false);
             has_valid_attr = true;
+          } else if (attr.list().f_size() > 0) {
+            auto start = fbb->StartVector(key);
+            for (const float v : attr.list().f()) {
+              fbb->Add(v);
+            }
+            fbb->EndVector(start, /*typed=*/true, /*fixed=*/false);
+            has_valid_attr = true;
           } else {
             LOG(WARNING)
                 << "Ignoring unsupported type in list attribute with key '"
@@ -1423,10 +2090,6 @@ class TensorFlowUnsupported : public BaseOperator {
     return std::unique_ptr<flexbuffers::Builder>(fbb.release());
   }
 
-// TODO(wvo): hack to make this code compile with 2 different API versions.
-// Please remove once OS/internal versions are in sync.
-// See hardcoded values in the switch below.
-
   void ReadOptions(const flexbuffers::Map& m,
                    TensorFlowUnsupportedOperator* op) const {
     ::tensorflow::NodeDef node_def;
@@ -1436,6 +2099,10 @@ class TensorFlowUnsupported : public BaseOperator {
     for (size_t i = 0; i < keys.size(); ++i) {
       const auto key = keys[i].AsKey();
       const auto& value = m[key];
+      // TODO(wvo): hack to make this code compile with 2 different API
+      // versions.
+      // Please remove once OS/internal versions are in sync.
+      // See hardcoded values in the switch below.
       switch (value.GetType()) {
         case 5:  // flexbuffers::FBT_STRING:
           (*attr)[key].set_s(value.AsString().c_str());
@@ -1463,6 +2130,22 @@ class TensorFlowUnsupported : public BaseOperator {
           }
           break;
         }
+        case 13: {  // flexbuffers::FBT_VECTOR_FLOAT: {
+          auto* list = (*attr)[key].mutable_list();
+          const auto& vector = value.AsTypedVector();
+          for (size_t i = 0; i < vector.size(); i++) {
+            list->add_f(vector[i].AsFloat());
+          }
+          break;
+        }
+        case 15: {  // flexbuffers::FBT_VECTOR_STRING: {
+          auto* list = (*attr)[key].mutable_list();
+          const auto& vector = value.AsTypedVector();
+          for (size_t i = 0; i < vector.size(); i++) {
+            list->add_s(vector[i].AsString().str());
+          }
+          break;
+        }
         default:
           LOG(WARNING) << "Ignoring unsupported attribute type with key '"
                        << key << "'";
@@ -1472,8 +2155,8 @@ class TensorFlowUnsupported : public BaseOperator {
     node_def.SerializeToString(&op->tensorflow_node_def);
   }
 
-  int GetVersion(const Operator& op) const override {
-    // TODO(ycling): Deisng and implement a way to plumb the version of
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    // TODO(ycling): Design and implement a way to plumb the version of
     // custom ops.
     return 1;
   }
@@ -1497,11 +2180,113 @@ class Dequantize
   void ReadOptions(const TfLiteOptions& options,
                    TocoOperator* op) const override {}
 
-  int GetVersion(const Operator& op) const override {
-    // TODO(suharshs): Dequantize now supports INT8 in addition to
-    // QUANTIZED_UINT8. When TOCO can create models with INT8, we need
-    // to find a way to see the type here and return version 2. Right now
-    // version 2 will only be added by post training quantization tools.
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
+class Equal : public SimpleOperator<TensorFlowEqualOperator> {
+ public:
+  explicit Equal() : SimpleOperator("EQUAL", OperatorType::kEqual) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
+class NotEqual : public SimpleOperator<TensorFlowNotEqualOperator> {
+ public:
+  explicit NotEqual() : SimpleOperator("NOT_EQUAL", OperatorType::kNotEqual) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
+class Greater : public SimpleOperator<TensorFlowGreaterOperator> {
+ public:
+  explicit Greater() : SimpleOperator("GREATER", OperatorType::kGreater) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
+class GreaterEqual : public SimpleOperator<TensorFlowGreaterEqualOperator> {
+ public:
+  explicit GreaterEqual()
+      : SimpleOperator("GREATER_EQUAL", OperatorType::kGreaterEqual) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
+class Less : public SimpleOperator<TensorFlowLessOperator> {
+ public:
+  explicit Less() : SimpleOperator("LESS", OperatorType::kLess) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
+class LessEqual : public SimpleOperator<TensorFlowLessEqualOperator> {
+ public:
+  explicit LessEqual()
+      : SimpleOperator("LESS_EQUAL", OperatorType::kLessEqual) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
+    return 1;
+  }
+};
+
+class Select : public SimpleOperator<SelectOperator> {
+ public:
+  explicit Select() : SimpleOperator("SELECT", OperatorType::kSelect) {}
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    const string& input_name = op_signature.op->inputs[0];
+    const Array& input_array = op_signature.model->GetArray(input_name);
+    // Version 2 supports signed int8 input types.
+    if (input_array.data_type == ArrayDataType::kInt8) {
+      return 2;
+    }
     return 1;
   }
 };
@@ -1515,6 +2300,8 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
   // Builtin Operators.
   ops.push_back(
       MakeUnique<Add>(::tflite::BuiltinOperator_ADD, OperatorType::kAdd));
+  ops.push_back(
+      MakeUnique<AddN>(::tflite::BuiltinOperator_ADD_N, OperatorType::kAddN));
   ops.push_back(
       MakeUnique<Div>(::tflite::BuiltinOperator_DIV, OperatorType::kDiv));
   ops.push_back(
@@ -1534,11 +2321,15 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
   ops.push_back(MakeUnique<DepthwiseConvolution>(
       ::tflite::BuiltinOperator_DEPTHWISE_CONV_2D,
       OperatorType::kDepthwiseConv));
+  ops.push_back(MakeUnique<Dequantize>(::tflite::BuiltinOperator_DEQUANTIZE,
+                                       OperatorType::kDequantize));
   ops.push_back(
       MakeUnique<FullyConnected>(::tflite::BuiltinOperator_FULLY_CONNECTED,
                                  OperatorType::kFullyConnected));
   ops.push_back(MakeUnique<Gather>(::tflite::BuiltinOperator_GATHER,
                                    OperatorType::kGather));
+  ops.push_back(MakeUnique<GatherNd>(::tflite::BuiltinOperator_GATHER_ND,
+                                     OperatorType::kGatherNd));
   ops.push_back(
       MakeUnique<L2Normalization>(::tflite::BuiltinOperator_L2_NORMALIZATION,
                                   OperatorType::kL2Normalization));
@@ -1619,6 +2410,12 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
   ops.emplace_back(MakeUnique<UnidirectionalSequenceLstm>(
       ::tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM,
       OperatorType::kUnidirectionalSequenceLstm));
+  ops.emplace_back(MakeUnique<BidirectionalSequenceLstm>(
+      ::tflite::BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM,
+      OperatorType::kBidirectionalSequenceLstm));
+  ops.emplace_back(MakeUnique<BidirectionalSequenceRnn>(
+      ::tflite::BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN,
+      OperatorType::kBidirectionalSequenceRnn));
   ops.push_back(MakeUnique<OneHot>(::tflite::BuiltinOperator_ONE_HOT,
                                    OperatorType::kOneHot));
   ops.push_back(MakeUnique<Unpack>(::tflite::BuiltinOperator_UNPACK,
@@ -1630,6 +2427,13 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
       OperatorType::kSquaredDifference));
   ops.push_back(MakeUnique<MirrorPad>(::tflite::BuiltinOperator_MIRROR_PAD,
                                       OperatorType::kMirrorPad));
+  ops.push_back(MakeUnique<Unique>(::tflite::BuiltinOperator_UNIQUE,
+                                   OperatorType::kUnique));
+  ops.push_back(MakeUnique<UnidirectionalSequenceRnn>(
+      ::tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN,
+      OperatorType::kUnidirectionalSequenceRnn));
+  ops.push_back(
+      MakeUnique<Where>(::tflite::BuiltinOperator_WHERE, OperatorType::kWhere));
 
   // Custom Operators.
   ops.push_back(
@@ -1645,48 +2449,38 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
   // when custom ops are exported but SimpleOperator bypasses thoses. To
   // prevent user confusion we are settling on using SimpleOperator only for
   // builtins.
-  ops.push_back(MakeUnique<SimpleOperator<DequantizeOperator>>(
-      "DEQUANTIZE", OperatorType::kDequantize));
   ops.push_back(
       MakeUnique<SimpleOperator<FloorOperator>>("FLOOR", OperatorType::kFloor));
+  ops.push_back(
+      MakeUnique<SimpleOperator<CeilOperator>>("CEIL", OperatorType::kCeil));
+  ops.push_back(
+      MakeUnique<SimpleOperator<EluOperator>>("ELU", OperatorType::kElu));
   ops.push_back(
       MakeUnique<SimpleOperator<ReluOperator>>("RELU", OperatorType::kRelu));
   ops.push_back(MakeUnique<SimpleOperator<Relu1Operator>>(
       "RELU_N1_TO_1", OperatorType::kRelu1));
-  ops.push_back(
-      MakeUnique<SimpleOperator<Relu6Operator>>("RELU6", OperatorType::kRelu6));
+  ops.push_back(MakeUnique<Relu6>());
   ops.push_back(
       MakeUnique<SimpleOperator<PReluOperator>>("PRELU", OperatorType::kPRelu));
-  ops.push_back(MakeUnique<SimpleOperator<LogisticOperator>>(
-      "LOGISTIC", OperatorType::kLogistic));
-  ops.push_back(
-      MakeUnique<SimpleOperator<TanhOperator>>("TANH", OperatorType::kTanh));
+  ops.push_back(MakeUnique<Logistic>());
+  ops.push_back(MakeUnique<Tanh>());
   ops.push_back(
       MakeUnique<SimpleOperator<ExpOperator>>("EXP", OperatorType::kExp));
-  ops.push_back(MakeUnique<SimpleOperator<LogSoftmaxOperator>>(
-      "LOG_SOFTMAX", OperatorType::kLogSoftmax));
-  ops.push_back(MakeUnique<SimpleOperator<TensorFlowMaximumOperator>>(
-      "MAXIMUM", OperatorType::kMaximum));  //  Element-wise Maximum
-  ops.push_back(MakeUnique<SimpleOperator<TensorFlowMinimumOperator>>(
-      "MINIMUM", OperatorType::kMinimum));  //  Element-wise Minimum
-  ops.push_back(MakeUnique<SimpleOperator<TensorFlowGreaterOperator>>(
-      "GREATER", OperatorType::kGreater));
-  ops.push_back(MakeUnique<SimpleOperator<TensorFlowGreaterEqualOperator>>(
-      "GREATER_EQUAL", OperatorType::kGreaterEqual));
-  ops.push_back(MakeUnique<SimpleOperator<TensorFlowLessOperator>>(
-      "LESS", OperatorType::kLess));
-  ops.push_back(MakeUnique<SimpleOperator<TensorFlowLessEqualOperator>>(
-      "LESS_EQUAL", OperatorType::kLessEqual));
-  ops.push_back(MakeUnique<SimpleOperator<TensorFlowEqualOperator>>(
-      "EQUAL", OperatorType::kEqual));
-  ops.push_back(MakeUnique<SimpleOperator<TensorFlowNotEqualOperator>>(
-      "NOT_EQUAL", OperatorType::kNotEqual));
   ops.push_back(
-      MakeUnique<SimpleOperator<NegOperator>>("NEG", OperatorType::kNeg));
-  ops.push_back(MakeUnique<SimpleOperator<SelectOperator>>(
-      "SELECT", OperatorType::kSelect));
+      MakeUnique<SimpleOperator<CosOperator>>("COS", OperatorType::kCos));
+  ops.push_back(MakeUnique<LogSoftmax>());
+  ops.push_back(MakeUnique<Maximum>());  //  Element-wise Maximum
+  ops.push_back(MakeUnique<Minimum>());  //  Element-wise Minimum
+  ops.push_back(MakeUnique<Greater>());
+  ops.push_back(MakeUnique<GreaterEqual>());
+  ops.push_back(MakeUnique<Less>());
+  ops.push_back(MakeUnique<LessEqual>());
+  ops.push_back(MakeUnique<Equal>());
+  ops.push_back(MakeUnique<NotEqual>());
   ops.push_back(
-      MakeUnique<SimpleOperator<SliceOperator>>("SLICE", OperatorType::kSlice));
+      MakeUnique<SimpleOperator<NegOperator>>("NEG", OperatorType::kNeg));
+  ops.push_back(MakeUnique<Select>());
+  ops.push_back(MakeUnique<Slice>());
   ops.push_back(
       MakeUnique<SimpleOperator<PowOperator>>("POW", OperatorType::kPow));
   ops.push_back(MakeUnique<SimpleOperator<LogicalOrOperator>>(
@@ -1718,6 +2512,10 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
       MakeUnique<SimpleOperator<AbsOperator>>("ABS", OperatorType::kAbs));
   ops.push_back(
       MakeUnique<SimpleOperator<FillOperator>>("FILL", OperatorType::kFill));
+  ops.push_back(MakeUnique<SimpleOperator<ReverseV2Operator>>(
+      "REVERSE_V2", OperatorType::kReverseV2));
+  ops.push_back(MakeUnique<SimpleOperator<TensorFlowRankOperator>>(
+      "RANK", OperatorType::kRank));
   return ops;
 }
 }  // namespace
diff --git a/tensorflow/lite/toco/tflite/operator.h b/tensorflow/lite/toco/tflite/operator.h
index 4ac531579c12c8f9c7e7904cbae261e74235e168..899db1a35931be9c3a29fdc1451e405f4c4083b4 100644
--- a/tensorflow/lite/toco/tflite/operator.h
+++ b/tensorflow/lite/toco/tflite/operator.h
@@ -87,15 +87,15 @@ class BaseOperator {
       const BuiltinOptions* builtin_options,
       const CustomOptions* custom_options) const = 0;
 
-  // Get the op version by op parameters.
-  // The function need to be overridden to return the op version based on the
+  // Get the op version using the OperatorSignature.
+  // The function needs to be overridden to return the op version based on the
   // parameters. Note:
   // * The first version for each op should be 1 (to be consistent with the
   //   default value in Flatbuffer. `return 1;` is okay for newly implemented
   //   ops.
-  // * When multiple versions are defined for an op, this function need to be
+  // * When multiple versions are defined for an op, this function needs to be
   //   overridden. (See example in `operator_test.cc`)
-  virtual int GetVersion(const Operator& op) const = 0;
+  virtual int GetVersion(const OperatorSignature& op_signature) const = 0;
 
   // Given a Toco `Operator`, return a list of booleans indicating the op
   // mutates which input variables.
diff --git a/tensorflow/lite/toco/tflite/operator_test.cc b/tensorflow/lite/toco/tflite/operator_test.cc
index 14ec89cd73f19fcd141640bda7bfba6435f59ac7..ac3b84777964583c17c7482855d942125c67c02f 100644
--- a/tensorflow/lite/toco/tflite/operator_test.cc
+++ b/tensorflow/lite/toco/tflite/operator_test.cc
@@ -111,15 +111,16 @@ class OperatorTest : public ::testing::Test {
 };
 
 TEST_F(OperatorTest, SimpleOperators) {
-  CheckSimpleOperator<DequantizeOperator>("DEQUANTIZE",
-                                          OperatorType::kDequantize);
   CheckSimpleOperator<FloorOperator>("FLOOR", OperatorType::kFloor);
+  CheckSimpleOperator<CeilOperator>("CEIL", OperatorType::kCeil);
+  CheckSimpleOperator<EluOperator>("ELU", OperatorType::kElu);
   CheckSimpleOperator<ReluOperator>("RELU", OperatorType::kRelu);
   CheckSimpleOperator<Relu1Operator>("RELU_N1_TO_1", OperatorType::kRelu1);
   CheckSimpleOperator<Relu6Operator>("RELU6", OperatorType::kRelu6);
   CheckSimpleOperator<LogisticOperator>("LOGISTIC", OperatorType::kLogistic);
   CheckSimpleOperator<TanhOperator>("TANH", OperatorType::kTanh);
   CheckSimpleOperator<ExpOperator>("EXP", OperatorType::kExp);
+  CheckSimpleOperator<CosOperator>("COS", OperatorType::kCos);
   CheckSimpleOperator<LogSoftmaxOperator>("LOG_SOFTMAX",
                                           OperatorType::kLogSoftmax);
   CheckSimpleOperator<TensorFlowMaximumOperator>(
@@ -152,6 +153,9 @@ TEST_F(OperatorTest, SimpleOperators) {
   CheckSimpleOperator<FloorModOperator>("FLOOR_MOD", OperatorType::kFloorMod);
   CheckSimpleOperator<RangeOperator>("RANGE", OperatorType::kRange);
   CheckSimpleOperator<FillOperator>("FILL", OperatorType::kFill);
+  CheckSimpleOperator<ReverseV2Operator>("REVERSE_V2",
+                                         OperatorType::kReverseV2);
+  CheckSimpleOperator<TensorFlowRankOperator>("RANK", OperatorType::kRank);
 }
 
 TEST_F(OperatorTest, BuiltinAdd) {
@@ -163,6 +167,13 @@ TEST_F(OperatorTest, BuiltinAdd) {
             output_toco_op->fused_activation_function);
 }
 
+TEST_F(OperatorTest, BuiltinAddN) {
+  AddNOperator op;
+  auto output_toco_op =
+      SerializeAndDeserialize(GetOperator("ADD_N", OperatorType::kAddN), op);
+  ASSERT_NE(output_toco_op.get(), nullptr);
+}
+
 TEST_F(OperatorTest, BuiltinReducerOps) {
   CheckReducerOperator<MeanOperator>("MEAN", OperatorType::kMean);
   CheckReducerOperator<TensorFlowSumOperator>("SUM", OperatorType::kSum);
@@ -231,6 +242,20 @@ TEST_F(OperatorTest, BuiltinGather) {
   ASSERT_NE(nullptr, output_toco_op.get());
 }
 
+TEST_F(OperatorTest, BuiltinGatherNd) {
+  GatherNdOperator op;
+  auto output_toco_op = SerializeAndDeserialize(
+      GetOperator("GATHER_ND", OperatorType::kGatherNd), op);
+  ASSERT_NE(output_toco_op.get(), nullptr);
+}
+
+TEST_F(OperatorTest, BuiltinWhere) {
+  WhereOperator op;
+  auto output_toco_op =
+      SerializeAndDeserialize(GetOperator("WHERE", OperatorType::kWhere), op);
+  ASSERT_NE(output_toco_op.get(), nullptr);
+}
+
 TEST_F(OperatorTest, BuiltinL2Pool) {
   L2PoolOperator op;
   op.stride_width = 123;
@@ -279,6 +304,44 @@ TEST_F(OperatorTest, BuiltinMaxPool) {
   EXPECT_EQ(op.kheight, output_toco_op->kheight);
 }
 
+TEST_F(OperatorTest, VersioningMaxTest) {
+  TensorFlowMaximumOperator max_op;
+  max_op.inputs = {"input1"};
+  auto operator_by_type_map = BuildOperatorByTypeMap(false /*enable_flex_ops*/);
+  const BaseOperator* op = operator_by_type_map.at(max_op.type).get();
+
+  Model uint8_model;
+  Array& uint8_array = uint8_model.GetOrCreateArray(max_op.inputs[0]);
+  uint8_array.data_type = ArrayDataType::kUint8;
+  OperatorSignature uint8_signature = {.model = &uint8_model, .op = &max_op};
+  EXPECT_EQ(op->GetVersion(uint8_signature), 1);
+
+  Model int8_model;
+  Array& int8_array = int8_model.GetOrCreateArray(max_op.inputs[0]);
+  int8_array.data_type = ArrayDataType::kInt8;
+  OperatorSignature int8_signature = {.model = &int8_model, .op = &max_op};
+  EXPECT_EQ(op->GetVersion(int8_signature), 2);
+}
+
+TEST_F(OperatorTest, VersioningMinTest) {
+  TensorFlowMinimumOperator min_op;
+  min_op.inputs = {"input1"};
+  auto operator_by_type_map = BuildOperatorByTypeMap(false /*enable_flex_ops*/);
+  const BaseOperator* op = operator_by_type_map.at(min_op.type).get();
+
+  Model uint8_model;
+  Array& uint8_array = uint8_model.GetOrCreateArray(min_op.inputs[0]);
+  uint8_array.data_type = ArrayDataType::kUint8;
+  OperatorSignature uint8_signature = {.model = &uint8_model, .op = &min_op};
+  EXPECT_EQ(op->GetVersion(uint8_signature), 1);
+
+  Model int8_model;
+  Array& int8_array = int8_model.GetOrCreateArray(min_op.inputs[0]);
+  int8_array.data_type = ArrayDataType::kInt8;
+  OperatorSignature int8_signature = {.model = &int8_model, .op = &min_op};
+  EXPECT_EQ(op->GetVersion(int8_signature), 2);
+}
+
 TEST_F(OperatorTest, BuiltinReshape) {
   TensorFlowReshapeOperator op;
   op.shape = {1, 2, 4, 5, 8};
@@ -469,6 +532,12 @@ TEST_F(OperatorTest, BuiltinArgMin) {
   EXPECT_EQ(op.output_data_type, output_toco_op->output_data_type);
 }
 
+TEST_F(OperatorTest, BuiltinDequantize) {
+  DequantizeOperator op;
+  auto output_toco_op = SerializeAndDeserialize(
+      GetOperator("DEQUANTIZE", OperatorType::kDequantize), op);
+}
+
 TEST_F(OperatorTest, BuiltinTransposeConv) {
   TransposeConvOperator op;
   op.stride_width = 123;
@@ -565,6 +634,20 @@ TEST_F(OperatorTest, TensorFlowUnsupported) {
   (*attr)["str_attr"].set_s("Hello World");
   (*attr)["int_attr"].set_i(17);
   (*attr)["bool_attr"].set_b(true);
+  {
+    auto* list = (*attr)["list_string_attr"].mutable_list();
+    list->add_s("abcde");
+    list->add_s("1234");
+    list->add_s("");
+    list->add_s("zyxwv");
+    list->add_s("!-.");
+  }
+  {
+    auto* list = (*attr)["list_float_attr"].mutable_list();
+    list->add_f(std::numeric_limits<float>::min());
+    list->add_f(2.0);
+    list->add_f(-std::numeric_limits<float>::max());
+  }
   {
     auto* list = (*attr)["list_int_attr"].mutable_list();
     list->add_i(1);
@@ -584,7 +667,22 @@ TEST_F(OperatorTest, TensorFlowUnsupported) {
   EXPECT_EQ("Hello World", output_attr.at("str_attr").s());
   EXPECT_EQ(17, output_attr.at("int_attr").i());
   EXPECT_EQ(true, output_attr.at("bool_attr").b());
-
+  {
+    const auto& list = output_attr.at("list_string_attr").list();
+    ASSERT_EQ(5, list.s_size());
+    EXPECT_EQ("abcde", list.s(0));
+    EXPECT_EQ("1234", list.s(1));
+    EXPECT_EQ("", list.s(2));
+    EXPECT_EQ("zyxwv", list.s(3));
+    EXPECT_EQ("!-.", list.s(4));
+  }
+  {
+    const auto& list = output_attr.at("list_float_attr").list();
+    ASSERT_EQ(3, list.f_size());
+    EXPECT_EQ(std::numeric_limits<float>::min(), list.f(0));
+    EXPECT_EQ(2.0, list.f(1));
+    EXPECT_EQ(-std::numeric_limits<float>::max(), list.f(2));
+  }
   {
     const auto& list = output_attr.at("list_int_attr").list();
     ASSERT_EQ(4, list.i_size());
@@ -610,10 +708,11 @@ TEST_F(OperatorTest, TestShouldExportAsFlexOp) {
   EXPECT_FALSE(ShouldExportAsFlexOp(false, "Conv2D"));
   EXPECT_TRUE(ShouldExportAsFlexOp(true, "Conv2D"));
   EXPECT_TRUE(ShouldExportAsFlexOp(true, "EluGrad"));
+  EXPECT_TRUE(ShouldExportAsFlexOp(true, "RFFT"));
   EXPECT_FALSE(ShouldExportAsFlexOp(true, "MyAwesomeCustomOp"));
-  // While the RFFT op is available on desktop, it is not in the kernel
+  // While the RandomShuffle op is available on desktop, it is not in the kernel
   // set available on mobile and should be excluded.
-  EXPECT_FALSE(ShouldExportAsFlexOp(true, "RFFT"));
+  EXPECT_FALSE(ShouldExportAsFlexOp(true, "RandomShuffle"));
 }
 
 TEST_F(OperatorTest, BuiltinMirrorPad) {
@@ -624,6 +723,154 @@ TEST_F(OperatorTest, BuiltinMirrorPad) {
   EXPECT_EQ(op.mode, output_toco_op->mode);
 }
 
+TEST_F(OperatorTest, BuiltinUnique) {
+  UniqueOperator op;
+  op.idx_out_type = ArrayDataType::kInt64;
+  auto output_toco_op =
+      SerializeAndDeserialize(GetOperator("UNIQUE", OperatorType::kUnique), op);
+  ASSERT_NE(nullptr, output_toco_op.get());
+  EXPECT_EQ(output_toco_op->idx_out_type, op.idx_out_type);
+}
+
+// Test version for a simple Op with 2 versions and the input type controls the
+// version.
+template <typename Op>
+void SimpleVersioningTest() {
+  Op op;
+  op.inputs = {"input1"};
+  auto operator_by_type_map = BuildOperatorByTypeMap(false /*enable_flex_ops*/);
+  const BaseOperator* base_op = operator_by_type_map.at(op.type).get();
+
+  Model uint8_model;
+  Array& uint8_array = uint8_model.GetOrCreateArray(op.inputs[0]);
+  uint8_array.data_type = ArrayDataType::kUint8;
+  OperatorSignature uint8_signature = {.model = &uint8_model, .op = &op};
+  EXPECT_EQ(base_op->GetVersion(uint8_signature), 1);
+
+  Model int8_model;
+  Array& int8_array = int8_model.GetOrCreateArray(op.inputs[0]);
+  int8_array.data_type = ArrayDataType::kInt8;
+  OperatorSignature int8_signature = {.model = &int8_model, .op = &op};
+  EXPECT_EQ(base_op->GetVersion(int8_signature), 2);
+}
+
+TEST_F(OperatorTest, VersioningEqualTest) {
+  SimpleVersioningTest<TensorFlowEqualOperator>();
+}
+
+TEST_F(OperatorTest, VersioningNotEqualTest) {
+  SimpleVersioningTest<TensorFlowNotEqualOperator>();
+}
+
+TEST_F(OperatorTest, VersioningLessTest) {
+  SimpleVersioningTest<TensorFlowLessOperator>();
+}
+
+TEST_F(OperatorTest, VersioningLessEqualTest) {
+  SimpleVersioningTest<TensorFlowLessEqualOperator>();
+}
+
+TEST_F(OperatorTest, VersioningGreaterTest) {
+  SimpleVersioningTest<TensorFlowGreaterOperator>();
+}
+
+TEST_F(OperatorTest, VersioningGreaterEqualTest) {
+  SimpleVersioningTest<TensorFlowGreaterEqualOperator>();
+}
+
+TEST_F(OperatorTest, VersioningSpaceToBatchNDTest) {
+  SimpleVersioningTest<SpaceToBatchNDOperator>();
+}
+
+TEST_F(OperatorTest, VersioningLogSoftmaxTest) {
+  SimpleVersioningTest<LogSoftmaxOperator>();
+}
+
+TEST_F(OperatorTest, VersioningPackTest) {
+  SimpleVersioningTest<PackOperator>();
+}
+
+TEST_F(OperatorTest, VersioningBatchToSpaceNDTest) {
+  SimpleVersioningTest<BatchToSpaceNDOperator>();
+}
+
+TEST_F(OperatorTest, VersioningTanhTest) {
+  SimpleVersioningTest<TanhOperator>();
+}
+
+TEST_F(OperatorTest, VersioningStridedSliceTest) {
+  SimpleVersioningTest<StridedSliceOperator>();
+}
+
+TEST_F(OperatorTest, VersioningSpaceToDepthTest) {
+  SimpleVersioningTest<SpaceToDepthOperator>();
+}
+
+TEST_F(OperatorTest, VersioningSliceTest) {
+  SimpleVersioningTest<SliceOperator>();
+}
+
+TEST_F(OperatorTest, VersioningLogisticTest) {
+  SimpleVersioningTest<LogisticOperator>();
+}
+
+TEST_F(OperatorTest, VersioningAddTest) { SimpleVersioningTest<AddOperator>(); }
+
+TEST_F(OperatorTest, VersioningSubTest) { SimpleVersioningTest<SubOperator>(); }
+
+TEST_F(OperatorTest, VersioningMulTest) { SimpleVersioningTest<MulOperator>(); }
+
+TEST_F(OperatorTest, VersioningPadTest) { SimpleVersioningTest<PadOperator>(); }
+
+TEST_F(OperatorTest, VersioningPadV2Test) {
+  SimpleVersioningTest<PadV2Operator>();
+}
+
+TEST_F(OperatorTest, VersioningSelectTest) {
+  SimpleVersioningTest<SelectOperator>();
+}
+
+TEST_F(OperatorTest, VersioningRelu6Test) {
+  SimpleVersioningTest<Relu6Operator>();
+}
+
+TEST_F(OperatorTest, VersioningFullyConnectedTest) {
+  FullyConnectedOperator fully_connected_op;
+  fully_connected_op.inputs = {"input", "weight"};
+  fully_connected_op.outputs = {"output"};
+  auto operator_by_type_map = BuildOperatorByTypeMap(false /*enable_flex_ops*/);
+  const BaseOperator* op =
+      operator_by_type_map.at(fully_connected_op.type).get();
+
+  Model uint8_model;
+  Array& input_uint8_array =
+      uint8_model.GetOrCreateArray(fully_connected_op.inputs[0]);
+  input_uint8_array.data_type = ArrayDataType::kUint8;
+  Array& weight_uint8_array =
+      uint8_model.GetOrCreateArray(fully_connected_op.inputs[1]);
+  weight_uint8_array.data_type = ArrayDataType::kUint8;
+  Array& output_uint8_array =
+      uint8_model.GetOrCreateArray(fully_connected_op.outputs[0]);
+  output_uint8_array.data_type = ArrayDataType::kUint8;
+  OperatorSignature uint8_signature = {.model = &uint8_model,
+                                       .op = &fully_connected_op};
+  EXPECT_EQ(op->GetVersion(uint8_signature), 1);
+
+  Model int8_model;
+  Array& input_int8_array =
+      int8_model.GetOrCreateArray(fully_connected_op.inputs[0]);
+  input_int8_array.data_type = ArrayDataType::kInt8;
+  Array& weight_int8_array =
+      int8_model.GetOrCreateArray(fully_connected_op.inputs[1]);
+  weight_int8_array.data_type = ArrayDataType::kInt8;
+  Array& output_int8_array =
+      int8_model.GetOrCreateArray(fully_connected_op.outputs[0]);
+  output_int8_array.data_type = ArrayDataType::kInt8;
+  OperatorSignature int8_signature = {.model = &int8_model,
+                                      .op = &fully_connected_op};
+  EXPECT_EQ(op->GetVersion(int8_signature), 4);
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/toco/tflite/simple_operator.h b/tensorflow/lite/toco/tflite/simple_operator.h
index e3e4c8551e931ff54f72c130cf1908ffa5e79514..290074831b888d2b624408a600e53a4356df5b12 100644
--- a/tensorflow/lite/toco/tflite/simple_operator.h
+++ b/tensorflow/lite/toco/tflite/simple_operator.h
@@ -42,7 +42,9 @@ class SimpleOperator : public BaseOperator {
     return std::unique_ptr<Operator>(new T);
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const OperatorSignature& op_signature) const override {
+    return 1;
+  }
 };
 
 }  // namespace tflite
diff --git a/tensorflow/lite/toco/tflite/whitelisted_flex_ops.cc b/tensorflow/lite/toco/tflite/whitelisted_flex_ops.cc
index 039a918af16019292214f982326fba3eb5695c62..1b337ebc85f627b2ee90824cacd2a1f9a090428c 100644
--- a/tensorflow/lite/toco/tflite/whitelisted_flex_ops.cc
+++ b/tensorflow/lite/toco/tflite/whitelisted_flex_ops.cc
@@ -68,6 +68,7 @@ bool IsWhitelistedFlexOp(const std::string& tensorflow_op_name) {
           "BroadcastArgs",
           "BroadcastGradientArgs",
           "Cast",
+          "Ceil",
           "CheckNumerics",
           "ComplexAbs",
           "Concat",
@@ -118,6 +119,9 @@ bool IsWhitelistedFlexOp(const std::string& tensorflow_op_name) {
           "FakeQuantWithMinMaxVarsPerChannel",
           "FakeQuantWithMinMaxVarsPerChannelGradient",
           "FakeQueue",
+          "FFT",
+          "FFT2D",
+          "FFT3D",
           "FIFOQueue",
           "FIFOQueueV2",
           "Fill",
@@ -143,6 +147,12 @@ bool IsWhitelistedFlexOp(const std::string& tensorflow_op_name) {
           "_HostSend",
           "Identity",
           "IdentityN",
+          "IFFT",
+          "IFFT2D",
+          "IFFT3D",
+          "IRFFT",
+          "IRFFT2D",
+          "IRFFT3D",
           "ImmutableConst",
           "InTopK",
           "InTopKV2",
@@ -311,6 +321,9 @@ bool IsWhitelistedFlexOp(const std::string& tensorflow_op_name) {
           "Reverse",
           "ReverseSequence",
           "ReverseV2",
+          "RFFT",
+          "RFFT2D",
+          "RFFT3D",
           "Round",
           "Rsqrt",
           "RsqrtGrad",
diff --git a/tensorflow/lite/toco/toco.cc b/tensorflow/lite/toco/toco.cc
index 4a3d6a5848751f4c1d526153bd6f6d08a9f882af..aa7e43350caca295e027a433da1d96af76bb6686 100644
--- a/tensorflow/lite/toco/toco.cc
+++ b/tensorflow/lite/toco/toco.cc
@@ -49,5 +49,10 @@ int main(int argc, char** argv) {
   }
   toco::port::InitGoogle(argv[0], effective_argc, &effective_argv, true);
   auto status = toco::Convert(parsed_toco_flags, parsed_model_flags);
-  return status.ok() ? 0 : -1;
+  if (!status.ok()) {
+    fprintf(stderr, "%s\n", status.error_message().c_str());
+    fflush(stderr);
+    return 1;
+  }
+  return 0;
 }
diff --git a/tensorflow/lite/toco/toco_convert.cc b/tensorflow/lite/toco/toco_convert.cc
index 28e7b10ecd056815c8ca6d7a74f324a18d307451..2adfc1dd236bfe3ba8ee1de70e0dbdba08d9f283 100644
--- a/tensorflow/lite/toco/toco_convert.cc
+++ b/tensorflow/lite/toco/toco_convert.cc
@@ -77,7 +77,7 @@ tensorflow::Status Convert(const string& graph_def_contents,
                            string* output_file_contents) {
   std::unique_ptr<Model> model =
       Import(toco_flags, model_flags, graph_def_contents);
-  Transform(toco_flags, model.get());
+  TF_RETURN_IF_ERROR(TransformWithStatus(toco_flags, model.get()));
   return Export(toco_flags, *model, toco_flags.allow_custom_ops(),
                 output_file_contents);
 }
diff --git a/tensorflow/lite/toco/toco_convert_test.cc b/tensorflow/lite/toco/toco_convert_test.cc
index c3c440db94396def2f8cfd40242642767d11a63a..739b924607e7aa60bcdb6f081de52aed65a87d58 100644
--- a/tensorflow/lite/toco/toco_convert_test.cc
+++ b/tensorflow/lite/toco/toco_convert_test.cc
@@ -15,6 +15,8 @@ limitations under the License.
 #include "tensorflow/lite/toco/toco_convert.h"
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "tensorflow/lite/testing/util.h"
+#include "tensorflow/lite/toco/toco_port.h"
 
 namespace toco {
 namespace {
@@ -171,3 +173,10 @@ TEST(TocoTest, TransientStringTensors) {
 
 }  // namespace
 }  // namespace toco
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  ::toco::port::InitGoogleWasDoneElsewhere();
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/toco/toco_port.cc b/tensorflow/lite/toco/toco_port.cc
index fb8c1b8337f1e509ed9c9ee2522e63e84d143927..b222032e61418224efddbae2c6ec2f110286ab0b 100644
--- a/tensorflow/lite/toco/toco_port.cc
+++ b/tensorflow/lite/toco/toco_port.cc
@@ -57,6 +57,11 @@ void InitGoogle(const char* usage, int* argc, char*** argv, bool remove_flags) {
   ::InitGoogle(usage, argc, argv, remove_flags);
 }
 
+void InitGoogleWasDoneElsewhere() {
+  // Nothing need be done since ::CheckInitGoogleIsDone() is aware of other
+  // possible initialization entry points.
+}
+
 void CheckInitGoogleIsDone(const char* message) {
   ::CheckInitGoogleIsDone(message);
 }
@@ -152,6 +157,8 @@ constexpr int kFileWriteFlags = O_CREAT | O_WRONLY;
 
 static bool port_initialized = false;
 
+void InitGoogleWasDoneElsewhere() { port_initialized = true; }
+
 void InitGoogle(const char* usage, int* argc, char*** argv, bool remove_flags) {
   if (!port_initialized) {
 #if defined(PLATFORM_GOOGLE)
diff --git a/tensorflow/lite/toco/toco_port.h b/tensorflow/lite/toco/toco_port.h
index 2f39e3d6d5c02457e9ade320e7525fbf881b5389..231612ecd43f3d77fc959a38642690ff6beed19b 100644
--- a/tensorflow/lite/toco/toco_port.h
+++ b/tensorflow/lite/toco/toco_port.h
@@ -55,6 +55,10 @@ double round(double x);
 namespace toco {
 namespace port {
 
+// Things like tests use other initialization routines that need control
+// of flags. However, for testing we still want to use toco_port.h facilities.
+// This function sets initialized flag trivially.
+void InitGoogleWasDoneElsewhere();
 void InitGoogle(const char* usage, int* argc, char*** argv, bool remove_flags);
 void CheckInitGoogleIsDone(const char* message);
 
diff --git a/tensorflow/lite/toco/toco_port_test.cc b/tensorflow/lite/toco/toco_port_test.cc
index f5fbb4caeb2882d51c4b586293eb202fcf60a9de..997da58b8f64386dfbf6e41ff5838373dd8d64c2 100644
--- a/tensorflow/lite/toco/toco_port_test.cc
+++ b/tensorflow/lite/toco/toco_port_test.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/lite/toco/toco_port.h"
+#include "tensorflow/lite/testing/util.h"
 #include "tensorflow/lite/toco/toco_types.h"
 
 #include <gmock/gmock.h>
@@ -56,3 +57,10 @@ TEST(TocoPortTest, JoinPath) {
 }  // namespace
 }  // namespace port
 }  // namespace toco
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  ::toco::port::InitGoogleWasDoneElsewhere();
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/toco/toco_tooling.cc b/tensorflow/lite/toco/toco_tooling.cc
index 55a454e66de4d0afce18421450d875911bea01f4..c66ef1db915b0b055982c06e24a9706b1943c804 100644
--- a/tensorflow/lite/toco/toco_tooling.cc
+++ b/tensorflow/lite/toco/toco_tooling.cc
@@ -178,6 +178,23 @@ void SetFinalDataTypeOnInputs(const TocoFlags& toco_flags, Model* model) {
       // Ignore non-real data types.
       continue;
     }
+    // The enum value QUANTIZED_UINT8 for --inference_type and
+    // --inference_input_type has long meant just 'QUANTIZED', being used as
+    // well in mixed 8-bit / 16-bit quantized models. However,
+    // ConvertIODataTypeToArrayDataType still interpretes it as meaning 8bit,
+    // and people have run into issues in the situation where they have an
+    // already mixed 8-bit / 16-bit quantized model in TFLITE format and
+    // want to run it again through toco, without having to re-specify all the
+    // extra array info that was used in the (complicated) process of initially
+    // quantizing that model. In order to have --inference_type=QUANTIZED_UINT8
+    // just work in that case, we implement the logic that when an array is
+    // already quantized, if  --inference_type is quantized (so we're not
+    // asking to dequantize here), no change of quantized data type is to be
+    // recorded.
+    if (array->data_type != toco::ArrayDataType::kFloat &&
+        type != toco::ArrayDataType::kFloat) {
+      continue;
+    }
 
     array->final_data_type = type;
   }
@@ -219,7 +236,8 @@ std::unique_ptr<Model> Import(const TocoFlags& toco_flags,
   return model;
 }
 
-void Transform(const TocoFlags& toco_flags, Model* model) {
+tensorflow::Status TransformWithStatus(const TocoFlags& toco_flags,
+                                       Model* model) {
   const FileFormat output_format = toco_flags.output_format();
   const IODataType inference_type = toco_flags.inference_type();
 
@@ -241,8 +259,8 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
   // stop optimizations from crossing the input/output boundaries. For example
   // this will stop BatchNorm fusing if the output node is in between a conv
   // and BatchNorm layers.
-  RunGraphTransformations(model, "Removing unused ops",
-                          {new toco::RemoveUnusedOp});
+  TF_RETURN_IF_ERROR(RunGraphTransformationsWithStatus(
+      model, "Removing unused ops", {new toco::RemoveUnusedOp}));
 
   GraphTransformationsSet transformations;
   MakeGeneralGraphTransformationsSet(&transformations);
@@ -290,22 +308,36 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
     identify_dilated_conv->set_identify_depthwise_conv(false);
   }
   transformations.Add(identify_dilated_conv);
-  RunGraphTransformations(model, "general graph transformations",
-                          transformations);
+  TF_RETURN_IF_ERROR(RunGraphTransformationsWithStatus(
+      model, "general graph transformations", transformations));
 
   if (quantize_output) {
     if (toco_flags.propagate_fake_quant_num_bits()) {
-      RunGraphTransformations(model,
-                              "fake quant propagation graph transformations",
-                              {new PropagateFakeQuantNumBits});
+      TF_RETURN_IF_ERROR(RunGraphTransformationsWithStatus(
+          model, "fake quant propagation graph transformations",
+          {new PropagateFakeQuantNumBits}));
     }
-    RunGraphTransformations(model, "pre-quantization graph transformations",
-                            {
-                                new HardcodeMinMax,
-                                new DropFakeQuant,
-                            });
+    TF_RETURN_IF_ERROR(RunGraphTransformationsWithStatus(
+        model, "pre-quantization graph transformations",
+        {
+            new HardcodeMinMax,
+            new DropFakeQuant,
+        }));
   }
 
+  // Try to merge bidirectional sequence lstm or rnn if present.
+  GraphTransformationsSet bidirectional_transformations;
+  bidirectional_transformations.Add(new RemoveUnusedOp);
+  bidirectional_transformations.Add(new toco::GroupBidirectionalSequenceLstm);
+  bidirectional_transformations.Add(new toco::GroupBidirectionalSequenceRnn);
+  bidirectional_transformations.Add(
+      new toco::GroupDynamicBidirectionalSequenceRnn);
+  bidirectional_transformations.Add(
+      new toco::GroupDynamicBidirectionalSequenceLstm);
+  TF_RETURN_IF_ERROR(RunGraphTransformationsWithStatus(
+      model, "Group bidirectional sequence lstm/rnn",
+      bidirectional_transformations));
+
   // Fix any issues with IO edges. This must happen after any transform that
   // may modify the structure of the edges.
   FixEdgeArrays(model);
@@ -332,12 +364,12 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
           toco_flags.default_int16_ranges_max());
     }
     if (propagate_default_min_max->has_any_ranges_defined()) {
-      RunGraphTransformations(
+      TF_RETURN_IF_ERROR(RunGraphTransformationsWithStatus(
           model, "default min-max range propagation graph transformations",
           {
               propagate_default_min_max.release(),
               new HardcodeMinMax,
-          });
+          }));
     }
 
     CheckIsReadyForQuantization(*model);
@@ -347,17 +379,18 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
         toco_flags.allow_nudging_weights_to_use_fast_gemm_kernel());
     ensure_safe_for_int8_kernels->set_has_default_ranges_flag(
         has_default_ranges_flag);
-    RunGraphTransformations(model, "quantization graph transformations",
-                            {
-                                new RemoveTrivialQuantizedActivationFunc,
-                                new RemoveTrivialQuantizedMinMax,
-                                new Quantize,
-                                new RemoveFinalDequantizeOp,
-                                ensure_safe_for_int8_kernels,
-                            });
+    TF_RETURN_IF_ERROR(RunGraphTransformationsWithStatus(
+        model, "quantization graph transformations",
+        {
+            new RemoveTrivialQuantizedActivationFunc,
+            new RemoveTrivialQuantizedMinMax,
+            new Quantize,
+            new RemoveFinalDequantizeOp,
+            ensure_safe_for_int8_kernels,
+        }));
     if (SupportsShuffledFCWeights(output_format)) {
-      RunGraphTransformations(model, "shuffling of FC weights",
-                              {new ShuffleFCWeights});
+      TF_RETURN_IF_ERROR(RunGraphTransformationsWithStatus(
+          model, "shuffling of FC weights", {new ShuffleFCWeights}));
     }
   } else {
     GraphTransformationsSet dequantization_transformations{new Dequantize};
@@ -367,8 +400,9 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
       dequantization_transformations.Add(new DropFakeQuant);
     }
 
-    RunGraphTransformations(model, "dequantization graph transformations",
-                            dequantization_transformations);
+    TF_RETURN_IF_ERROR(RunGraphTransformationsWithStatus(
+        model, "dequantization graph transformations",
+        dequantization_transformations));
   }
 
   if (output_format == TENSORFLOW_GRAPHDEF) {
@@ -400,6 +434,7 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
               << " billion (note that a multiply-add is counted as 2 ops).";
   }
   model->ops_count = ops_count;
+  return tensorflow::Status::OK();
 }
 
 tensorflow::Status Export(const TocoFlags& toco_flags, const Model& model,
@@ -423,7 +458,7 @@ tensorflow::Status Export(const TocoFlags& toco_flags, const Model& model,
       return status;
     } break;
     case GRAPHVIZ_DOT:
-      DumpGraphviz(model, output_file_contents);
+      DumpGraphviz(model, output_file_contents, "Computation Graph");
       break;
     default:
       LOG(FATAL) << "Unhandled output_format='"
diff --git a/tensorflow/lite/toco/toco_tooling.h b/tensorflow/lite/toco/toco_tooling.h
index 742e3769269859c62522707ba415cd509e8df629..369961519499027ee4e3b04e4ebee6aadfd7c21c 100644
--- a/tensorflow/lite/toco/toco_tooling.h
+++ b/tensorflow/lite/toco/toco_tooling.h
@@ -31,7 +31,12 @@ std::unique_ptr<Model> Import(const TocoFlags& toco_flags,
 
 // Transforms a Model. The resulting Model is ready to be passed
 // to Export with the exact same toco_flags.
-void Transform(const TocoFlags& toco_flags, Model* model);
+tensorflow::Status TransformWithStatus(const TocoFlags& toco_flags,
+                                       Model* model);
+inline void Transform(const TocoFlags& toco_flags, Model* model) {
+  auto s = TransformWithStatus(toco_flags, model);
+  CHECK(s.ok()) << s.error_message();
+}
 
 // Exports the Model, which must be of the 'lowered' form returned by
 // Transform, to a file of the format given by
diff --git a/tensorflow/lite/toco/tooling_util.cc b/tensorflow/lite/toco/tooling_util.cc
index af4cd386a209d82cb56a877410abe6fbdbf99c7b..41773356382f63b14c7162c33a733afcff0b9f36 100644
--- a/tensorflow/lite/toco/tooling_util.cc
+++ b/tensorflow/lite/toco/tooling_util.cc
@@ -66,29 +66,29 @@ string LogName(const Operator& op) {
 string ArrayDataTypeName(ArrayDataType data_type) {
   switch (data_type) {
     case ArrayDataType::kFloat:
-      return "Float";
+      return "float";
     case ArrayDataType::kInt8:
-      return "Int8";
+      return "int8";
     case ArrayDataType::kUint8:
-      return "Uint8";
+      return "uint8";
     case ArrayDataType::kInt16:
-      return "Int16";
+      return "int16";
     case ArrayDataType::kUint16:
-      return "Uint16";
+      return "uint16";
     case ArrayDataType::kInt32:
-      return "Int32";
+      return "int32";
     case ArrayDataType::kUint32:
-      return "Uint32";
+      return "uint32";
     case ArrayDataType::kInt64:
-      return "Int64";
+      return "int64";
     case ArrayDataType::kUint64:
-      return "Uint64";
+      return "uint64";
     case ArrayDataType::kString:
-      return "String";
+      return "string";
     case ArrayDataType::kBool:
-      return "Bool";
+      return "bool";
     case ArrayDataType::kComplex64:
-      return "Complex64";
+      return "complex64";
     case ArrayDataType::kNone:
       return "None";
     default:
@@ -173,7 +173,7 @@ bool DeleteArrayIfUsedOnce(const string& array_name, Model* model) {
   return false;
 }
 
-void DeleteOpAndArraysIfUnused(Model* model, Operator* op) {
+void DeleteOpAndArraysIfUnused(Model* model, const Operator* op) {
   for (const string& array_name : op->inputs) {
     DeleteArrayIfUsedOnce(array_name, model);
   }
@@ -331,6 +331,7 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(FakeQuant)
     HANDLE_OPERATORTYPENAME_CASE(Mul)
     HANDLE_OPERATORTYPENAME_CASE(RandomUniform)
+    HANDLE_OPERATORTYPENAME_CASE(Elu)
     HANDLE_OPERATORTYPENAME_CASE(Relu)
     HANDLE_OPERATORTYPENAME_CASE(Relu1)
     HANDLE_OPERATORTYPENAME_CASE(Relu6)
@@ -385,7 +386,9 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(ConcatV2)
     HANDLE_OPERATORTYPENAME_CASE(Cast)
     HANDLE_OPERATORTYPENAME_CASE(Floor)
+    HANDLE_OPERATORTYPENAME_CASE(Ceil)
     HANDLE_OPERATORTYPENAME_CASE(Gather)
+    HANDLE_OPERATORTYPENAME_CASE(GatherNd)
     HANDLE_OPERATORTYPENAME_CASE(ResizeBilinear)
     HANDLE_OPERATORTYPENAME_CASE(SpaceToBatchND)
     HANDLE_OPERATORTYPENAME_CASE(BatchToSpaceND)
@@ -412,10 +415,17 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(Unpack)
     HANDLE_OPERATORTYPENAME_CASE(ZerosLike)
     HANDLE_OPERATORTYPENAME_CASE(UnidirectionalSequenceLstm)
+    HANDLE_OPERATORTYPENAME_CASE(BidirectionalSequenceLstm)
+    HANDLE_OPERATORTYPENAME_CASE(BidirectionalSequenceRnn)
     HANDLE_OPERATORTYPENAME_CASE(ResizeNearestNeighbor)
     HANDLE_OPERATORTYPENAME_CASE(LeakyRelu)
     HANDLE_OPERATORTYPENAME_CASE(SquaredDifference)
     HANDLE_OPERATORTYPENAME_CASE(MirrorPad)
+    HANDLE_OPERATORTYPENAME_CASE(Unique)
+    HANDLE_OPERATORTYPENAME_CASE(UnidirectionalSequenceRnn)
+    HANDLE_OPERATORTYPENAME_CASE(ReverseV2)
+    HANDLE_OPERATORTYPENAME_CASE(Cos)
+    HANDLE_OPERATORTYPENAME_CASE(Where)
     default:
       LOG(FATAL) << "Unhandled op type";
 #undef HANDLE_OPERATORTYPENAME_CASE
@@ -532,7 +542,8 @@ void DumpGraphvizVideoFrame(const Model& model) {
   static int dump_id = 0;
   static std::unordered_set<std::size_t> dump_hashes;
   string graphviz_dump;
-  DumpGraphviz(model, &graphviz_dump);
+  DumpGraphviz(model, &graphviz_dump,
+               toco::port::StringF("VIDEO frame:%05d", dump_id));
   std::size_t hash = std::hash<string>{}(graphviz_dump);
   if (!dump_hashes.count(hash)) {
     LOG(INFO) << "DUMPING GRAPHVIZ VIDEO FRAME: " << dump_id;
@@ -555,7 +566,7 @@ void LogDump(int log_level, const string& message, const Model& model) {
   if (!dump_options.dump_graphviz.empty()) {
     string graphviz_dump;
 
-    DumpGraphviz(model, &graphviz_dump);
+    DumpGraphviz(model, &graphviz_dump, message);
     const auto result = port::file::SetContents(
         port::file::JoinPath(
             dump_options.dump_graphviz,
@@ -893,11 +904,6 @@ void CheckNonExistentIOArrays(const Model& model) {
   static constexpr char general_comment[] =
       "Is it a typo? To silence this message, pass this flag:  "
       "allow_nonexistent_arrays";
-  for (const auto& input_array : model.flags.input_arrays()) {
-    QCHECK(GetOpWithInput(model, input_array.name()))
-        << "Specified input array \"" << input_array.name()
-        << "\" is not consumed by any op in this graph. " << general_comment;
-  }
   for (const string& output_array : model.flags.output_arrays()) {
     if (IsConstantParameterArray(model, output_array)) {
       continue;  // It is OK to request that a constant be an output.
@@ -1857,119 +1863,140 @@ string CreateInt32Array(Model* model, const string& param_name,
   return param_array_name;
 }
 
-bool EstimateArithmeticOpsCount(const Model& model, int64* result) {
-  int64 total = 0;
-  for (const auto& op : model.operators) {
-    switch (op->type) {
-      case OperatorType::kFullyConnected:
-      case OperatorType::kConv:
-      case OperatorType::kDepthwiseConv: {
-        const auto& output_array = model.GetArray(op->outputs[0]);
-        const auto& weights_array = model.GetArray(op->inputs[1]);
-        if (!output_array.has_shape() || !weights_array.has_shape()) {
-          return false;
-        }
-        int cols = 1;
-        for (int i = 0; i < output_array.shape().dimensions_count() - 1; i++) {
-          cols *= output_array.shape().dims(i);
-        }
-        const int64 cost_per_col =
-            2 * RequiredBufferSizeForShape(weights_array.shape());
-        total += cost_per_col * cols;
-        if (op->inputs.size() > 2) {
-          // There is a bias vector. One more op per output value.
-          total += RequiredBufferSizeForShape(output_array.shape());
-        }
-        break;
+bool EstimateArithmeticOpsCount(const Model& model, const Operator& op,
+                                int64* result) {
+  switch (op.type) {
+    case OperatorType::kFullyConnected:
+    case OperatorType::kConv:
+    case OperatorType::kDepthwiseConv: {
+      const auto& output_array = model.GetArray(op.outputs[0]);
+      const auto& weights_array = model.GetArray(op.inputs[1]);
+      if (!output_array.has_shape() || !weights_array.has_shape()) {
+        return false;
       }
-      case OperatorType::kAdd:
-      case OperatorType::kSub:
-      case OperatorType::kMul: {
-        const auto& output_array = model.GetArray(op->outputs[0]);
-        if (!output_array.has_shape()) {
-          return false;
-        }
-        total += RequiredBufferSizeForShape(output_array.shape());
-        break;
+      int64 cols = 1;
+      for (int i = 0; i < output_array.shape().dimensions_count() - 1; i++) {
+        cols *= output_array.shape().dims(i);
       }
-      case OperatorType::kAddN: {
-        const auto& output_array = model.GetArray(op->outputs[0]);
-        if (!output_array.has_shape()) {
-          return false;
-        }
-        // AddN cost is roughly the same cost as N-1 Adds.
-        const int num_adds = op->inputs.size() - 1;
-        total += num_adds * RequiredBufferSizeForShape(output_array.shape());
-        break;
+      const int64 cost_per_col =
+          2 * RequiredBufferSizeForShape(weights_array.shape());
+      *result = cost_per_col * cols;
+      if (op.inputs.size() > 2) {
+        // There is a bias vector. One more op per output value.
+        *result += RequiredBufferSizeForShape(output_array.shape());
       }
-      case OperatorType::kLogistic:
-      case OperatorType::kSoftmax:
-      case OperatorType::kLogSoftmax:
-      case OperatorType::kTanh: {
-        const auto& output_array = model.GetArray(op->outputs[0]);
-        if (!output_array.has_shape()) {
-          return false;
-        }
-        // As a very rough ballpark, the cost of evaluating a math function
-        // such as tanh or logistic is about 32 multiplications, and about as
-        // many additions/subtractions. (Just a power-of-two order-of-magnitude
-        // from looking at actual implementations that we use in runtime/ code).
-        total += 64 * RequiredBufferSizeForShape(output_array.shape());
-        break;
+      break;
+    }
+    case OperatorType::kAdd:
+    case OperatorType::kSub:
+    case OperatorType::kMul: {
+      const auto& output_array = model.GetArray(op.outputs[0]);
+      if (!output_array.has_shape()) {
+        return false;
       }
-      case OperatorType::kMaxPool: {
-        const auto& maxpool = *static_cast<const MaxPoolOperator*>(op.get());
-        const auto& output_array = model.GetArray(op->outputs[0]);
-        if (!output_array.has_shape()) {
-          return false;
-        }
-        total += RequiredBufferSizeForShape(output_array.shape()) *
-                 maxpool.kheight * maxpool.kwidth;
-        break;
+      *result = RequiredBufferSizeForShape(output_array.shape());
+      break;
+    }
+    case OperatorType::kAddN: {
+      const auto& output_array = model.GetArray(op.outputs[0]);
+      if (!output_array.has_shape()) {
+        return false;
       }
-      case OperatorType::kAveragePool: {
-        const auto& avgpool =
-            *static_cast<const AveragePoolOperator*>(op.get());
-        const auto& output_array = model.GetArray(op->outputs[0]);
-        if (!output_array.has_shape()) {
-          return false;
-        }
-        total += RequiredBufferSizeForShape(output_array.shape()) *
-                 avgpool.kheight * avgpool.kwidth;
-        break;
+      // AddN cost is roughly the same cost as N-1 Adds.
+      const int64 num_adds = op.inputs.size() - 1;
+      *result = num_adds * RequiredBufferSizeForShape(output_array.shape());
+      break;
+    }
+    case OperatorType::kLogistic:
+    case OperatorType::kSoftmax:
+    case OperatorType::kLogSoftmax:
+    case OperatorType::kTanh: {
+      const auto& output_array = model.GetArray(op.outputs[0]);
+      if (!output_array.has_shape()) {
+        return false;
       }
-      case OperatorType::kL2Pool: {
-        const auto* maxpool = static_cast<const MaxPoolOperator*>(op.get());
-        const auto& output_array = model.GetArray(op->outputs[0]);
-        if (!output_array.has_shape()) {
-          return false;
-        }
-        // The sum of squares requires (kheight*kwidth) multiply-adds,
-        // and then there is the sqrt which we ballpark at 32 ops.
-        const int64 cost_per_val = 2 * maxpool->kheight * maxpool->kwidth + 32;
-        total +=
-            RequiredBufferSizeForShape(output_array.shape()) * cost_per_val;
-        break;
+      // As a very rough ballpark, the cost of evaluating a math function
+      // such as tanh or logistic is about 32 multiplications, and about as
+      // many additions/subtractions. (Just a power-of-two order-of-magnitude
+      // from looking at actual implementations that we use in runtime/ code).
+      *result = 64 * RequiredBufferSizeForShape(output_array.shape());
+      break;
+    }
+    case OperatorType::kMaxPool: {
+      const auto& maxpool = *static_cast<const MaxPoolOperator*>(&op);
+      const auto& output_array = model.GetArray(op.outputs[0]);
+      if (!output_array.has_shape()) {
+        return false;
       }
-      case OperatorType::kL2Normalization: {
-        const auto& output_array = model.GetArray(op->outputs[0]);
-        if (!output_array.has_shape()) {
-          return false;
-        }
-        // Computing the squared L2 norm is N multiply-adds so 2N ops,
-        // then the single inverse-sqrt is negligible, then we multiply each
-        // value by the resulting multiplier, so an extra N ops. Total 3N ops.
-        total += 3 * RequiredBufferSizeForShape(output_array.shape());
-        break;
+      *result = RequiredBufferSizeForShape(output_array.shape()) *
+                maxpool.kheight * maxpool.kwidth;
+      break;
+    }
+    case OperatorType::kAveragePool: {
+      const auto& avgpool = *static_cast<const AveragePoolOperator*>(&op);
+      const auto& output_array = model.GetArray(op.outputs[0]);
+      if (!output_array.has_shape()) {
+        return false;
       }
-      default:
-        break;
+      *result = RequiredBufferSizeForShape(output_array.shape()) *
+                avgpool.kheight * avgpool.kwidth;
+      break;
+    }
+    case OperatorType::kL2Pool: {
+      const auto* maxpool = static_cast<const MaxPoolOperator*>(&op);
+      const auto& output_array = model.GetArray(op.outputs[0]);
+      if (!output_array.has_shape()) {
+        return false;
+      }
+      // The sum of squares requires (kheight*kwidth) multiply-adds,
+      // and then there is the sqrt which we ballpark at 32 ops.
+      const int64 cost_per_val = 2 * maxpool->kheight * maxpool->kwidth + 32;
+      *result = RequiredBufferSizeForShape(output_array.shape()) * cost_per_val;
+      break;
+    }
+    case OperatorType::kL2Normalization: {
+      const auto& output_array = model.GetArray(op.outputs[0]);
+      if (!output_array.has_shape()) {
+        return false;
+      }
+      // Computing the squared L2 norm is N multiply-adds so 2N ops,
+      // then the single inverse-sqrt is negligible, then we multiply each
+      // value by the resulting multiplier, so an extra N ops. count 3N ops.
+      *result = 3 * RequiredBufferSizeForShape(output_array.shape());
+      break;
     }
+    default:
+      *result = 0;
+      break;
+  }
+  return true;
+}
+
+bool EstimateArithmeticOpsCount(const Model& model, int64* result) {
+  int64 total = 0;
+  for (const auto& op : model.operators) {
+    int64 num_ops;
+    if (!EstimateArithmeticOpsCount(model, *op, &num_ops)) {
+      return false;
+    }
+    total += num_ops;
   }
   *result = total;
   return true;
 }
 
+string FormattedNumber(int64 x) {
+  const int64 million = 1000000;
+  const int64 billion = 1000000000;
+  if (x < 10000) {
+    return toco::port::StringF("%d ", x);
+  } else if (x < billion) {
+    return toco::port::StringF("%.3f M", static_cast<double>(x) / million);
+  } else {
+    return toco::port::StringF("%.3f G", static_cast<double>(x) / billion);
+  }
+}
+
 void GetShuffleShape(AxesOrder input_axes_order, AxesOrder output_axes_order,
                      std::vector<int>* shuffle) {
   CHECK_EQ(AxesCount(input_axes_order), AxesCount(output_axes_order));
diff --git a/tensorflow/lite/toco/tooling_util.h b/tensorflow/lite/toco/tooling_util.h
index 53131824b532853afc1660354de92da40db0da86..fc4aac7740c64e06a7d762f3af4db504ea1213bd 100644
--- a/tensorflow/lite/toco/tooling_util.h
+++ b/tensorflow/lite/toco/tooling_util.h
@@ -72,7 +72,7 @@ bool DeleteArrayIfUsedOnce(const string& array_name, Model* model);
 
 // Deletes the op and any of its input and output arrays if they are unused
 // after the op has been deleted.
-void DeleteOpAndArraysIfUnused(Model* model, Operator* op);
+void DeleteOpAndArraysIfUnused(Model* model, const Operator* op);
 
 std::vector<std::unique_ptr<Operator>>::const_iterator FindOpWithOutput(
     const Model& model, const string& array_name);
@@ -267,7 +267,10 @@ void MakeArrayDims(int num_dims, int batch, int height, int width, int depth,
 string CreateInt32Array(Model* model, const string& param_name,
                         const std::vector<int>& value);
 
+bool EstimateArithmeticOpsCount(const Model& model, const Operator& op,
+                                int64* result);
 bool EstimateArithmeticOpsCount(const Model& model, int64* result);
+string FormattedNumber(int64 x);
 
 int AxesCount(AxesOrder axes_order);
 
diff --git a/tensorflow/lite/toco/tooling_util_test.cc b/tensorflow/lite/toco/tooling_util_test.cc
index 6f1c9c563ada01891b67094caa93cfd1847cdf6b..f063ce71e9156ce85b7b4fe1bfeb8ad5d57cda0c 100644
--- a/tensorflow/lite/toco/tooling_util_test.cc
+++ b/tensorflow/lite/toco/tooling_util_test.cc
@@ -16,9 +16,11 @@ limitations under the License.
 #include <vector>
 
 #include <gtest/gtest.h>
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/lite/testing/util.h"
 #include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/toco_port.h"
 #include "tensorflow/lite/toco/tooling_util.h"
-#include "tensorflow/core/lib/core/status.h"
 
 namespace toco {
 
@@ -94,8 +96,8 @@ TEST_P(ShapeTest, Agrees) {
   }
 }
 
-INSTANTIATE_TEST_CASE_P(AgreeBroadcast, ShapeTest,
-                        ::testing::ValuesIn(CreateShapePairs()));
+INSTANTIATE_TEST_SUITE_P(AgreeBroadcast, ShapeTest,
+                         ::testing::ValuesIn(CreateShapePairs()));
 
 static const char kNegativeValuesMessage[] =
     "Tensor shape should not include negative values";
@@ -203,3 +205,10 @@ TEST(FusedActivationTest, DefaultsToUnfused) {
 }
 
 }  // namespace toco
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  ::toco::port::InitGoogleWasDoneElsewhere();
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/tools/BUILD b/tensorflow/lite/tools/BUILD
index 1d141b5dd01a4a03c65d0c8a119ad62eea224d52..f67b3f98e9beafd1548a2033289ffbc9e3b86356 100644
--- a/tensorflow/lite/tools/BUILD
+++ b/tensorflow/lite/tools/BUILD
@@ -70,7 +70,9 @@ cc_library(
         "//tensorflow/lite:framework",
         "//tensorflow/lite:schema_fbs_version",
         "//tensorflow/lite:string_util",
+        "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/schema:schema_fbs",
+        "@com_google_absl//absl/container:flat_hash_set",
     ],
 )
 
@@ -86,6 +88,7 @@ cc_test(
         "//tensorflow/core:framework_lite",
         "//tensorflow/lite:framework",
         "//tensorflow/lite:schema_fbs_version",
+        "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/schema:schema_fbs",
         "//tensorflow/lite/testing:util",
         "@com_google_googletest//:gtest",
diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/README.md b/tensorflow/lite/tools/accuracy/ilsvrc/README.md
index ac3a1566e2a2c834260acbfbee8908cc13efa42a..28ad2e407f331023ebc22a5692693f5669feaff3 100644
--- a/tensorflow/lite/tools/accuracy/ilsvrc/README.md
+++ b/tensorflow/lite/tools/accuracy/ilsvrc/README.md
@@ -16,18 +16,25 @@ The binary takes the following parameters:
     The path to the directory containing ground truth images.
 
 *   `ground_truth_labels`: `string` \
-    Path to ground truth labels file. This file should contain the same number of labels as    the number images in the ground truth directory. The labels are assumed to be in the
-    same order as the sorted filename of images. See [ground truth label generation](#ground-truth-label-generation)
-    section for more information about how to generate labels for images.
+    Path to ground truth labels file. This file should contain the same number
+    of labels as the number images in the ground truth directory. The labels are
+    assumed to be in the same order as the sorted filename of images. See
+    [ground truth label generation](#ground-truth-label-generation) section for
+    more information about how to generate labels for images.
 
-*    `model_output_labels`: `string` \
+*   `model_output_labels`: `string` \
     Path to the file containing labels, that is used to interpret the output of
     the model. E.g. in case of mobilenets, this is the path to
     `mobilenet_labels.txt` where each label is in the same order as the output
     1001 dimension tensor.
 
 *   `output_path`: `string` \
-    This is the path to the output file. The output is a CSV file that has top-10 accuracies in each row. Each line of output file is the cumulative accuracy after processing images in a sorted order. So first line is accuracy after processing the first image, second line is accuracy after procesing first two images. The last line of the file is accuracy after processing the entire validation set.
+    This is the path to the output file. The output is a CSV file that has
+    top-10 accuracies in each row. Each line of output file is the cumulative
+    accuracy after processing images in a sorted order. So first line is
+    accuracy after processing the first image, second line is accuracy after
+    processing first two images. The last line of the file is accuracy after
+    processing the entire validation set.
 
 and the following optional parameters:
 
diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc
index 9a74e221c13e72c286512175a7f633c87f75eedd..129747fe4d5c93630f9f6552a9486cbe8f8c37b7 100644
--- a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc
+++ b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc
@@ -22,6 +22,12 @@ limitations under the License.
 
 #include "absl/memory/memory.h"
 #include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/core/lib/core/blocking_counter.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/public/session.h"
+#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/lite/tools/accuracy/eval_pipeline.h"
 #include "tensorflow/lite/tools/accuracy/eval_pipeline_builder.h"
 #include "tensorflow/lite/tools/accuracy/file_reader_stage.h"
@@ -29,12 +35,6 @@ limitations under the License.
 #include "tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.h"
 #include "tensorflow/lite/tools/accuracy/run_tflite_model_stage.h"
 #include "tensorflow/lite/tools/accuracy/utils.h"
-#include "tensorflow/core/lib/core/blocking_counter.h"
-#include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/platform/init_main.h"
-#include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 
 namespace {
 using tensorflow::string;
@@ -185,21 +185,17 @@ Status EvaluateModelForShard(const uint64_t shard_id,
   const TensorShape& input_shape = model_info.input_shapes[0];
   const int image_height = input_shape.dim_size(1);
   const int image_width = input_shape.dim_size(2);
-  const bool is_quantized = (model_info.input_types[0] == DT_UINT8);
 
   RunTFLiteModelStage::Params tfl_model_params;
   tfl_model_params.model_file_path = params.model_file_path;
-  if (is_quantized) {
-    tfl_model_params.input_type = {DT_UINT8};
-    tfl_model_params.output_type = {DT_UINT8};
-  } else {
-    tfl_model_params.input_type = {DT_FLOAT};
-    tfl_model_params.output_type = {DT_FLOAT};
-  }
+
+  tfl_model_params.input_type = {model_info.input_types[0]};
+  tfl_model_params.output_type = {model_info.input_types[0]};
 
   Scope root = Scope::NewRootScope();
   FileReaderStage reader;
-  InceptionPreprocessingStage inc(image_height, image_width, is_quantized);
+  InceptionPreprocessingStage inc(image_height, image_width,
+                                  model_info.input_types[0]);
   RunTFLiteModelStage tfl_model_stage(tfl_model_params);
   EvalPipelineBuilder builder;
 
diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.cc b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.cc
index 2b086cdf7075d7e6328ce0a41b17ca611ea3c4e2..f5642d52a89d86930023fd21a6d81e628073927c 100644
--- a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.cc
+++ b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_topk_eval.cc
@@ -67,11 +67,18 @@ Status ImagenetTopKAccuracy::ComputeEval(
     for (size_t i = 0; i < probs.size(); i++) {
       probabilities.push_back(probs(i));
     }
-  } else {
+  } else if (output.dtype() == DT_UINT8) {
     auto probs = output.flat<uint8>();
     for (size_t i = 0; i < probs.size(); i++) {
       probabilities.push_back(probs(i));
     }
+  } else if (output.dtype() == DT_INT8) {
+    auto probs = output.flat<int8>();
+    for (size_t i = 0; i < probs.size(); i++) {
+      probabilities.push_back(probs(i));
+    }
+  } else {
+    return errors::InvalidArgument("Invalid datatype");
   }
 
   CHECK_EQ(kNumCategories, probabilities.size());
diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.cc b/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.cc
index 9a889f0dd88bc4c51b2c060baf0e89c126c98c1f..b730b0804e0df3d559ec99552fb443efc3e867eb 100644
--- a/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.cc
+++ b/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.cc
@@ -19,7 +19,6 @@ limitations under the License.
 
 #include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/cc/ops/standard_ops.h"
-#include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -57,23 +56,33 @@ void InceptionPreprocessingStage::AddToGraph(const Scope& scope,
   tensorflow::Output cropped_image;
   CentralCropImage(s, decoded_jpeg, params_.cropping_fraction, &cropped_image);
   auto dims_expander = ops::ExpandDims(s, cropped_image, 0);
-  auto resized_image = ops::ResizeBilinear(
-      s, dims_expander,
-      ops::Const(s.WithOpName("size"), {image_height_, image_width_}));
-  if (is_quantized_) {
-    this->stage_output_ =
-        ops::Cast(s.WithOpName(output_name()), resized_image, DT_UINT8);
-  } else {
-    auto squeezed_image = ops::Squeeze(s, resized_image);
-    auto normalized_image =
-        ops::Div(s,
-                 ops::Sub(s, squeezed_image,
-                          {params_.input_means[0], params_.input_means[1],
-                           params_.input_means[2]}),
-                 {params_.scale});
-    this->stage_output_ =
-        ops::ExpandDims(s.WithOpName(output_name()), normalized_image, {0});
+  auto resized_image =
+      ops::ResizeBilinear(s.WithOpName("resize"), dims_expander,
+                          ops::Const(s, {image_height_, image_width_}));
+
+  ::tensorflow::Output preprocessed_image = resized_image;
+
+  if (!params_.input_means.empty()) {
+    preprocessed_image =
+        ops::Sub(s.WithOpName("sub"), preprocessed_image,
+                 {params_.input_means[0], params_.input_means[1],
+                  params_.input_means[2]});
+  }
+
+  if (std::abs(params_.scale) > 1e-7f) {
+    auto squeezed_image = ops::Squeeze(s, preprocessed_image);
+    preprocessed_image = ops::Div(s, squeezed_image, {params_.scale});
+    preprocessed_image = ops::ExpandDims(s, preprocessed_image, {0});
   }
+
+  // Cast the output from float to output datatype.
+  if (output_datatype_ != DT_FLOAT) {
+    preprocessed_image =
+        ops::Cast(s.WithOpName("cast"), preprocessed_image, output_datatype_);
+  }
+
+  this->stage_output_ =
+      ops::Identity(s.WithOpName(output_name()), preprocessed_image);
 }
 
 }  // namespace metrics
diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.h b/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.h
index 4a1d3ce4769d1a7d3f46f39941eb3e9bcde7785c..371feb3e76515a714286983a393c10dbaf4be3c8 100644
--- a/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.h
+++ b/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.h
@@ -13,14 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LITE_TOOLS_ACCURACY_INCEPTION_PREPROCESSING_H_
-#define TENSORFLOW_LITE_TOOLS_ACCURACY_INCEPTION_PREPROCESSING_H_
+#ifndef TENSORFLOW_LITE_TOOLS_ACCURACY_ILSVRC_INCEPTION_PREPROCESSING_H_
+#define TENSORFLOW_LITE_TOOLS_ACCURACY_ILSVRC_INCEPTION_PREPROCESSING_H_
 
 #include <utility>
 
-#include "tensorflow/lite/tools/accuracy/stage.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/lite/tools/accuracy/stage.h"
 
 namespace tensorflow {
 namespace metrics {
@@ -31,28 +31,53 @@ namespace metrics {
 // shape {1, image_height, image_width, 3}, where 3 is the number of channels.
 class InceptionPreprocessingStage : public Stage {
  public:
+  // Preprocessing params that govern scaling and normalization of channels of
+  // the image.
   struct Params {
+    // Input means are subtracted from each channel.
+    // In case of an empty vector this is skipped.
     std::vector<float> input_means;
+    // Scale is used to divide the input.
+    // A scale of 0 means divison is skipped.
     float scale;
     double cropping_fraction;
   };
 
-  static Params DefaultParams() {
-    return {.input_means = {127.5, 127.5, 127.5},
-            .scale = 127.5,
-            .cropping_fraction = 0.875};
+  // Default preprocessing for inception stage based on |output_type|
+  static Params DefaultParamsForType(DataType output_type) {
+    const float kCroppingFraction = 0.875;
+    Params params = {};
+    params.cropping_fraction = kCroppingFraction;
+    if (output_type == DT_UINT8) {
+    } else if (output_type == DT_INT8) {
+      params.input_means = {128.0, 128.0, 128.0};
+    } else {
+      // Assume floating point preprocessing.
+      params.input_means = {127.5, 127.5, 127.5};
+      params.scale = 127.5;
+    }
+    return params;
+  }
+
+  // Creates a new preprocessing stage object with provided |image_width|
+  // |image_height| as the size of output image.
+  // |output_datatype| is the datatype of output of the stage.
+  InceptionPreprocessingStage(int image_width, int image_height,
+                              DataType output_datatype)
+      : output_datatype_(output_datatype),
+        image_width_(image_width),
+        image_height_(image_height) {
+    params_ = DefaultParamsForType(output_datatype);
   }
 
   // Creates a new preprocessing stage object with provided |image_width|
   // |image_height| as the size of output image.
-  // If |is_quantized| is set to true then |params| is ignored since quantized
-  // images don't go through any preprocessing.
+  // |output_datatype| is the datatype of output of the stage.
   InceptionPreprocessingStage(int image_width, int image_height,
-                              bool is_quantized,
-                              Params params = DefaultParams())
-      : image_width_(image_width),
+                              DataType output_datatype, Params params)
+      : output_datatype_(output_datatype),
+        image_width_(image_width),
         image_height_(image_height),
-        is_quantized_(is_quantized),
         params_(std::move(params)) {}
 
   string name() const override { return "stage_inception_preprocess"; }
@@ -63,6 +88,7 @@ class InceptionPreprocessingStage : public Stage {
   void AddToGraph(const Scope& scope, const Input& input) override;
 
  private:
+  DataType output_datatype_;
   int image_width_;
   int image_height_;
   bool is_quantized_;
diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing_test.cc b/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing_test.cc
index 5d0e01d7d18c451b978edbd08fc27934c8379961..f88847035f21ee41eb7403aae99c9d7db1484499 100644
--- a/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing_test.cc
+++ b/tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing_test.cc
@@ -17,10 +17,10 @@ limitations under the License.
 #include <string>
 
 #include <gtest/gtest.h>
-#include "tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
 #include "tensorflow/core/util/command_line_flags.h"
+#include "tensorflow/lite/tools/accuracy/ilsvrc/inception_preprocessing.h"
 
 namespace {
 tensorflow::string* g_test_image_file = nullptr;
@@ -48,7 +48,7 @@ Status GetContents(const string& filename, string* output) {
   }
 }
 
-TEST(InceptionPreprocessingTest, TestImagePreprocessQuantized) {
+TEST(InceptionPreprocessingTest, TestImagePreprocessUInt8Quantized) {
   ASSERT_TRUE(g_test_image_file != nullptr);
   string image_contents;
   string image_path = *g_test_image_file;
@@ -56,8 +56,8 @@ TEST(InceptionPreprocessingTest, TestImagePreprocessQuantized) {
   ASSERT_TRUE(status.ok()) << status.error_message();
   const int width = 224;
   const int height = 224;
-  const bool is_quantized = true;
-  InceptionPreprocessingStage preprocess_stage(width, height, is_quantized);
+  auto params = InceptionPreprocessingStage::DefaultParamsForType(DT_UINT8);
+  InceptionPreprocessingStage preprocess_stage(width, height, DT_UINT8, params);
   Scope scope = Scope::NewRootScope();
   preprocess_stage.AddToGraph(scope, image_contents);
   TF_CHECK_OK(scope.status());
@@ -77,6 +77,35 @@ TEST(InceptionPreprocessingTest, TestImagePreprocessQuantized) {
   EXPECT_TRUE(outputs[0].shape().IsSameSize({1, 224, 224, 3}));
 }
 
+TEST(InceptionPreprocessingTest, TestImagePreprocessInt8Quantized) {
+  ASSERT_TRUE(g_test_image_file != nullptr);
+  string image_contents;
+  string image_path = *g_test_image_file;
+  auto status = GetContents(image_path, &image_contents);
+  ASSERT_TRUE(status.ok()) << status.error_message();
+  const int width = 224;
+  const int height = 224;
+  auto params = InceptionPreprocessingStage::DefaultParamsForType(DT_INT8);
+  InceptionPreprocessingStage preprocess_stage(width, height, DT_INT8, params);
+  Scope scope = Scope::NewRootScope();
+  preprocess_stage.AddToGraph(scope, image_contents);
+  TF_CHECK_OK(scope.status());
+
+  GraphDef graph_def;
+  TF_CHECK_OK(scope.ToGraphDef(&graph_def));
+  std::unique_ptr<Session> session(NewSession(SessionOptions()));
+  TF_CHECK_OK(session->Create(graph_def));
+  std::vector<Tensor> outputs;
+  auto run_status =
+      session->Run({},                                   /*inputs*/
+                   {preprocess_stage.output_name()}, {}, /*target node names */
+                   &outputs);
+  TF_CHECK_OK(run_status);
+  EXPECT_EQ(1, outputs.size());
+  EXPECT_EQ(DT_INT8, outputs[0].dtype());
+  EXPECT_TRUE(outputs[0].shape().IsSameSize({1, 224, 224, 3}));
+}
+
 TEST(InceptionPreprocessingTest, TestImagePreprocessFloat) {
   ASSERT_TRUE(g_test_image_file != nullptr);
   string image_contents;
@@ -85,8 +114,8 @@ TEST(InceptionPreprocessingTest, TestImagePreprocessFloat) {
   ASSERT_TRUE(status.ok()) << status.error_message();
   const int width = 224;
   const int height = 224;
-  const bool is_quantized = false;
-  InceptionPreprocessingStage preprocess_stage(width, height, is_quantized);
+  auto params = InceptionPreprocessingStage::DefaultParamsForType(DT_FLOAT);
+  InceptionPreprocessingStage preprocess_stage(width, height, DT_FLOAT, params);
   Scope scope = Scope::NewRootScope();
   preprocess_stage.AddToGraph(scope, image_contents);
   TF_CHECK_OK(scope.status());
diff --git a/tensorflow/lite/tools/accuracy/utils.cc b/tensorflow/lite/tools/accuracy/utils.cc
index c19dc1ff7cca10745a367c027bef1067d117eb4a..953892b8ddff2e60d2e1618df97d867b2d553c29 100644
--- a/tensorflow/lite/tools/accuracy/utils.cc
+++ b/tensorflow/lite/tools/accuracy/utils.cc
@@ -38,6 +38,12 @@ DataType GetTFDataType(TfLiteType tflite_type) {
       return DT_FLOAT;
     case kTfLiteUInt8:
       return DT_UINT8;
+    case kTfLiteInt8:
+      return DT_INT8;
+    case kTfLiteInt32:
+      return DT_INT32;
+    case kTfLiteInt64:
+      return DT_INT64;
     default:
       return DT_INVALID;
   }
diff --git a/tensorflow/lite/tools/benchmark/BUILD b/tensorflow/lite/tools/benchmark/BUILD
index bc47406cd92d406a0900743986ea67a4ba39240e..ce31eaf42f170b6ce52a961bb984197313e63f96 100644
--- a/tensorflow/lite/tools/benchmark/BUILD
+++ b/tensorflow/lite/tools/benchmark/BUILD
@@ -4,6 +4,7 @@ package(default_visibility = [
 
 licenses(["notice"])  # Apache 2.0
 
+load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
 load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
 load("//tensorflow/lite:build_def.bzl", "tflite_copts")
 load("//tensorflow/lite:build_def.bzl", "tflite_linkopts")
@@ -35,7 +36,7 @@ cc_binary(
     ],
 )
 
-cc_binary(
+tf_cc_binary(
     name = "benchmark_model_plus_flex",
     srcs = [
         "benchmark_plus_flex_main.cc",
@@ -140,10 +141,6 @@ cc_library(
         ":logging",
         "//tensorflow/core:stats_calculator_portable",
         "//tensorflow/lite:framework",
-        "//tensorflow/lite:string_util",
-        "//tensorflow/lite/kernels:builtin_ops",
-        "//tensorflow/lite/profiling:profile_summarizer",
-        "//tensorflow/lite/profiling:profiler",
         "//tensorflow/lite/profiling:time",
     ],
 )
diff --git a/tensorflow/lite/tools/benchmark/README.md b/tensorflow/lite/tools/benchmark/README.md
index a4d9c879eb645019a7626502207e9a3f4e89b1c1..e6ba818c71f23f39e511b7866ce2356848d46493 100644
--- a/tensorflow/lite/tools/benchmark/README.md
+++ b/tensorflow/lite/tools/benchmark/README.md
@@ -5,7 +5,7 @@
 A simple C++ binary to benchmark a TFLite model and its individual operators,
 both on desktop machines and on Android. The binary takes a TFLite model,
 generates random inputs and then repeatedly runs the model for specified number
-of runs. Aggregrate latency statistics are reported after running the benchmark.
+of runs. Aggregate latency statistics are reported after running the benchmark.
 
 The instructions below are for running the binary on Desktop and Android,
 for iOS please use the
diff --git a/tensorflow/lite/tools/benchmark/android/README.md b/tensorflow/lite/tools/benchmark/android/README.md
index f5b67e3f79aa669c5424d46c23f053213ad3a101..db82c59acd3de38bbd8ffcf1542f34adf02c9098 100644
--- a/tensorflow/lite/tools/benchmark/android/README.md
+++ b/tensorflow/lite/tools/benchmark/android/README.md
@@ -51,7 +51,7 @@ and can be appended to the `args` string alongside the required `--graph` flag
 args key).
 
 ```
-adb shell am start -S -n
+adb shell am start -S -n \
   org.tensorflow.lite.benchmark/org.tensorflow.lite.benchmark.BenchmarkModelActivity \
   --es args '"--graph=/data/local/tmp/mobilenet_quant_v1_224.tflite --num_threads=4"'
 ```
diff --git a/tensorflow/lite/tools/benchmark/benchmark_model.cc b/tensorflow/lite/tools/benchmark/benchmark_model.cc
index e9b485efcaa81b011c598d5dfa39d4f253090dc8..70f4c94d3588b1645ce6c8422ca3cfe94eddc8e6 100644
--- a/tensorflow/lite/tools/benchmark/benchmark_model.cc
+++ b/tensorflow/lite/tools/benchmark/benchmark_model.cc
@@ -15,8 +15,6 @@ limitations under the License.
 
 #include "tensorflow/lite/tools/benchmark/benchmark_model.h"
 
-#include <time.h>
-
 #include <iostream>
 #include <sstream>
 
@@ -28,18 +26,11 @@ void SleepForSeconds(double sleep_seconds) {
   if (sleep_seconds <= 0.0) {
     return;
   }
-  // Convert the run_delay string into a timespec.
-  timespec req;
-  req.tv_sec = static_cast<time_t>(sleep_seconds);
-  req.tv_nsec = (sleep_seconds - req.tv_sec) * 1000000000;
   // If requested, sleep between runs for an arbitrary amount of time.
   // This can be helpful to determine the effect of mobile processor
   // scaling and thermal throttling.
-#ifdef PLATFORM_WINDOWS
-  Sleep(sleep_seconds * 1000);
-#else
-  nanosleep(&req, nullptr);
-#endif
+  return tflite::profiling::time::SleepForMicros(
+      static_cast<uint64_t>(sleep_seconds * 1e6));
 }
 
 }  // namespace
diff --git a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc
index 32cf4e4292a57ebb73abfaeb3d73d5c1e5717f43..83e0ff1f872ef7849a92aa24a3ff052b61dc6fa8 100644
--- a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc
+++ b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc
@@ -316,18 +316,13 @@ void BenchmarkTfLiteModel::Init() {
   tflite::ops::builtin::BuiltinOpResolver resolver;
 #endif
 
-  tflite::InterpreterBuilder(*model, resolver)(&interpreter);
+  const int32_t num_threads = params_.Get<int32_t>("num_threads");
+  tflite::InterpreterBuilder(*model, resolver)(&interpreter, num_threads);
   if (!interpreter) {
     TFLITE_LOG(FATAL) << "Failed to construct interpreter";
   }
   profiling_listener_.SetInterpreter(interpreter.get());
 
-  const int32_t num_threads = params_.Get<int32_t>("num_threads");
-
-  if (num_threads != -1) {
-    interpreter->SetNumThreads(num_threads);
-  }
-
   bool use_nnapi = params_.Get<bool>("use_nnapi");
 
   interpreter->UseNNAPI(use_nnapi);
@@ -361,11 +356,23 @@ void BenchmarkTfLiteModel::Init() {
     }
   }
 
-  if (interpreter->AllocateTensors() != kTfLiteOk) {
+  // Don't allocate tensors if we have delegates.
+  if (delegates_.empty() && interpreter->AllocateTensors() != kTfLiteOk) {
     TFLITE_LOG(FATAL) << "Failed to allocate tensors!";
   }
 }
 
+void BenchmarkTfLiteModel::ApplyDelegates() {
+  for (int i = 0; i < delegates_.size(); ++i) {
+    if (interpreter->ModifyGraphWithDelegate(delegates_[i].get()) !=
+        kTfLiteOk) {
+      TFLITE_LOG(FATAL) << "Failed to apply delegate # " << i;
+    } else {
+      TFLITE_LOG(INFO) << "Applied Delegate # " << i;
+    }
+  }
+}
+
 void BenchmarkTfLiteModel::RunImpl() {
   if (interpreter->Invoke() != kTfLiteOk) {
     TFLITE_LOG(FATAL) << "Failed to invoke!";
diff --git a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h
index 83599e644d1f41f70fd96f3a73f9155d6e62deef..3532d2a5ddb104d413612e00001bf213e90e283e 100644
--- a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h
+++ b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h
@@ -70,6 +70,9 @@ class BenchmarkTfLiteModel : public BenchmarkModel {
   uint64_t ComputeInputBytes() override;
   void Init() override;
   void RunImpl() override;
+  void SetDelegates(std::vector<std::unique_ptr<TfLiteDelegate>> delegates) {
+    delegates_ = std::move(delegates);
+  }
 
   struct InputLayerInfo {
     std::string name;
@@ -81,7 +84,7 @@ class BenchmarkTfLiteModel : public BenchmarkModel {
   void PrepareInputsAndOutputs() override;
 
   // Allows installation of custom delegates during initialization
-  virtual void ApplyDelegates() {}
+  virtual void ApplyDelegates();
 
   std::unique_ptr<tflite::FlatBufferModel> model;
   std::unique_ptr<tflite::Interpreter> interpreter;
@@ -90,6 +93,7 @@ class BenchmarkTfLiteModel : public BenchmarkModel {
   std::vector<InputLayerInfo> inputs;
   ProfilingListener profiling_listener_;
   GemmlowpProfilingListener gemmlowp_profiling_listener_;
+  std::vector<std::unique_ptr<TfLiteDelegate>> delegates_;
 };
 
 }  // namespace benchmark
diff --git a/tensorflow/lite/tools/benchmark/ios/README.md b/tensorflow/lite/tools/benchmark/ios/README.md
index fed9e7ea7e8633e00413118fa3e9e4f12d5188a4..ee880f005dfaec1cd27d5dc093720f5de5433bfa 100644
--- a/tensorflow/lite/tools/benchmark/ios/README.md
+++ b/tensorflow/lite/tools/benchmark/ios/README.md
@@ -24,11 +24,12 @@ to build TFLite.
 Running
 
 ```bash
-tensorflow/lite/build_ios_universal_lib.sh
+tensorflow/lite/tools/make/build_ios_universal_lib.sh
 ```
-will also build `tensorflow/lite/gen/lib/benchmark-lib.a` .
 
-- Now copy the downloaded model file to `benchmark_data` directory. 
+will also build `tensorflow/lite/tools/make/gen/lib/benchmark-lib.a` .
+
+- Now copy the downloaded model file to `benchmark_data` directory.
 
 - Modify `benchmark_params.json` change the `input_layer`, `input_layer_shape`
 and other benchmark parameters.
@@ -36,8 +37,8 @@ and other benchmark parameters.
 - Change `Build Phases -> Copy Bundle Resources` and add the model file to the
 resources that need to be copied.
 
-- Ensure that `Build Phases -> Link Binary With Library` contains the 
-`Accelerate framework` and `tensorflow/lite/gen/lib/benchmark-lib.a`.
+- Ensure that `Build Phases -> Link Binary With Library` contains the
+`Accelerate framework` and `tensorflow/lite/tools/make/gen/lib/benchmark-lib.a`.
 
 - Now try running the app. The app has a single button that runs the benchmark
   on the model and displays results in a text view below.
@@ -47,7 +48,7 @@ resources that need to be copied.
 If you want detailed profiling, use the following command:
 
 ```bash
-tensorflow/lite/build_ios_universal_lib.sh -p
+tensorflow/lite/tools/make/build_ios_universal_lib.sh -p
 ```
 
 Then following the same steps above and run the benchmark app. You will see the
diff --git a/tensorflow/lite/tools/evaluation/BUILD b/tensorflow/lite/tools/evaluation/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..927a908ffa4a5fb1ea50ba62b568ea9fd9c9d3ed
--- /dev/null
+++ b/tensorflow/lite/tools/evaluation/BUILD
@@ -0,0 +1,118 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow/lite:build_def.bzl", "tflite_copts", "tflite_linkopts")
+
+common_linkopts = tflite_linkopts() + select({
+    "//conditions:default": [],
+    "//tensorflow:android": [
+        "-pie",
+        "-llog",
+    ],
+})
+
+cc_library(
+    name = "evaluation_stage",
+    srcs = ["evaluation_stage.cc"],
+    hdrs = ["evaluation_stage.h"],
+    copts = tflite_copts(),
+    deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/lite/tools/evaluation/proto:evaluation_config_proto_cc",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
+    name = "evaluation_stage_factory",
+    hdrs = ["evaluation_stage_factory.h"],
+    copts = tflite_copts(),
+    deps = [
+        ":evaluation_stage",
+        ":identity_stage",
+        "//tensorflow/lite/tools/evaluation/proto:evaluation_config_proto_cc",
+        "//tensorflow/lite/tools/evaluation/proto:evaluation_stages_proto_cc",
+        "@com_google_absl//absl/memory",
+    ],
+)
+
+cc_library(
+    name = "identity_stage",
+    srcs = ["identity_stage.cc"],
+    hdrs = ["identity_stage.h"],
+    copts = tflite_copts(),
+    deps = [
+        ":evaluation_stage",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:scope",
+        "//tensorflow/lite/tools/evaluation/proto:evaluation_config_proto_cc",
+    ] + select(
+        {
+            "//tensorflow:android": [
+                "//tensorflow/core:android_tensorflow_lib",
+            ],
+            "//conditions:default": [
+                "//tensorflow/core:tensorflow",
+                "//tensorflow/core:protos_all_cc",
+                "//tensorflow/core:core_cpu",
+                "//tensorflow/core:framework",
+                "//tensorflow/core:lib",
+                "//tensorflow/core:ops",
+            ],
+        },
+    ),
+    alwayslink = 1,
+)
+
+tf_cc_test(
+    name = "evaluation_stage_test",
+    srcs = ["evaluation_stage_test.cc"],
+    linkopts = common_linkopts,
+    linkstatic = 1,
+    tags = [
+        "tflite_not_portable_android",
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":evaluation_stage",
+        ":evaluation_stage_factory",
+        ":identity_stage",
+        "@com_google_googletest//:gtest_main",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/lite/tools/evaluation/proto:evaluation_config_proto_cc",
+        "//tensorflow/lite/tools/evaluation/proto:evaluation_stages_proto_cc",
+    ] + select(
+        {
+            "//tensorflow:android": [
+                "//tensorflow/core:android_tensorflow_lib",
+                "//tensorflow/core:android_tensorflow_test_lib",
+            ],
+            "//conditions:default": [
+                "//tensorflow/core:framework",
+                "//tensorflow/core:core_cpu",
+                "//tensorflow/core:ops",
+                "//tensorflow/core:tensorflow",
+            ],
+        },
+    ),
+)
diff --git a/tensorflow/lite/tools/evaluation/evaluation_stage.cc b/tensorflow/lite/tools/evaluation/evaluation_stage.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7c06142cbe0ba8dd89f31efb6f524ff0761470df
--- /dev/null
+++ b/tensorflow/lite/tools/evaluation/evaluation_stage.cc
@@ -0,0 +1,77 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/tools/evaluation/evaluation_stage.h"
+
+#include <string>
+
+#include "absl/strings/str_split.h"
+
+namespace tflite {
+namespace evaluation {
+
+bool EvaluationStage::Init(
+    absl::flat_hash_map<std::string, void*>& object_map) {
+  // Process & validate configuration of tags.
+  std::vector<std::string> initializers, inputs, outputs;
+  for (const auto& init : config_.initializers()) {
+    initializers.emplace_back(init);
+  }
+  for (const auto& in : config_.inputs()) {
+    inputs.emplace_back(in);
+  }
+  for (const auto& out : config_.outputs()) {
+    outputs.emplace_back(out);
+  }
+  if (!ProcessExpectedTags(GetInitializerTags(), initializers) ||
+      !ProcessExpectedTags(GetInputTags(), inputs) ||
+      !ProcessExpectedTags(GetOutputTags(), outputs)) {
+    return false;
+  }
+  // Class-specific stuff.
+  return DoInit(object_map);
+}
+
+bool EvaluationStage::ProcessExpectedTags(
+    const std::vector<std::string>& expected_tags,
+    std::vector<std::string>& tag_to_name_mappings) {
+  // Validate format of each TAG:name mapping in tag_to_name_mappings, and add
+  // it to tags_to_names_map_.
+  for (const std::string& tag_name_mapping : tag_to_name_mappings) {
+    if (!std::regex_match(tag_name_mapping, kTagNameMappingPattern)) {
+      LOG(ERROR) << "Invalid TAG:name mapping: " << tag_name_mapping;
+      return false;
+    }
+    std::vector<std::string> tag_and_name =
+        absl::StrSplit(tag_name_mapping, ':');
+    tags_to_names_map_[tag_and_name[0]] = tag_and_name[1];
+  }
+
+  // Ensure each expected TAG is valid & has been mapped to a name.
+  for (const std::string& tag : expected_tags) {
+    if (!std::regex_match(std::string(tag), kTagPattern)) {
+      LOG(ERROR) << "Invalid expected TAG: " << tag;
+      return false;
+    }
+    if (tags_to_names_map_.find(tag) == tags_to_names_map_.end()) {
+      LOG(ERROR) << "TAG " << tag << " has not been mapped to a name in config "
+                 << config_.name();
+      return false;
+    }
+  }
+  return true;
+}
+
+}  // namespace evaluation
+}  // namespace tflite
diff --git a/tensorflow/lite/tools/evaluation/evaluation_stage.h b/tensorflow/lite/tools/evaluation/evaluation_stage.h
new file mode 100644
index 0000000000000000000000000000000000000000..4ae1810f2387da08ef4e8a951776a1cbfdaec8e8
--- /dev/null
+++ b/tensorflow/lite/tools/evaluation/evaluation_stage.h
@@ -0,0 +1,165 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOOLS_EVALUATION_EVALUATION_STAGE_H_
+#define TENSORFLOW_LITE_TOOLS_EVALUATION_EVALUATION_STAGE_H_
+
+#include <regex>  // NOLINT
+#include <vector>
+
+#include "absl/container/flat_hash_map.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/lite/tools/evaluation/proto/evaluation_config.pb.h"
+
+namespace tflite {
+namespace evaluation {
+
+// Superclass for a single stage of an EvaluationPipeline.
+// Provides basic functionality for construction and accessing
+// initializers/inputs/outputs.
+// Every subclass of EvaluationStage will define its own behavior by specifying
+// appropriate accessor TAGs and implementing the Init, Run and Close methods.
+class EvaluationStage {
+ public:
+  // Initializes an EvaluationStage. Returns false if initialization failed,
+  // true otherwise.
+  // Should be called only once, before any call to Run().
+  // object_map should contain {initializer name : object pointer} mappings
+  // required for initialization.
+  //
+  // NOTE: EvaluationStage will not take ownership of any elements of
+  // object_map.
+  bool Init(absl::flat_hash_map<std::string, void*>& object_map);
+
+  // An individual run of the EvaluationStage. Returns false if there was a
+  // failure, true otherwise. Populates metrics into the EvaluationStageMetrics
+  // proto.
+  // Init() should be called before any calls to run().
+  // Inputs are acquired from and outputs are written to the incoming
+  // object_map, using appropriate TAGs.
+  //
+  // NOTE: The EvaluationStage should maintain ownership of outputs it
+  // populates into object_map. Ownership of inputs will be maintained
+  // elsewhere.
+  virtual bool Run(absl::flat_hash_map<std::string, void*>& object_map,
+                   EvaluationStageMetrics& metrics) = 0;
+
+  virtual ~EvaluationStage() = default;
+
+ protected:
+  // Constructs an EvaluationStage.
+  // Each subclass constructor must invoke this constructor.
+  //
+  // NOTE: Do NOT use constructors to obtain new EvaluationStages. Use
+  // tflite::evaluation::GetEvaluationStageFromConfig from
+  // evaluation_stage_factory.h instead.
+  explicit EvaluationStage(const EvaluationStageConfig& config)
+      : config_(config) {}
+
+  // Class-specific initialization, to be overridden by EvaluationStage
+  // sub-classes. Gets called in EvaluationStage::Init().
+  //
+  // NOTE: This object should not take ownership of any elements of object_map.
+  virtual bool DoInit(absl::flat_hash_map<std::string, void*>& object_map) = 0;
+
+  // The three following functions return the initializer/input/output TAGs used
+  // by an EvaluationStage. These should be mapped to meaningful names in the
+  // EvaluationStageConfig, and to required objects during calls to Init/Run.
+  // Format for TAGs: [A-Z0-9_]+ (Uppercase letters, numbers, "_")
+  // Refer docs in tflite.evaluation.EvaluationStageConfig for more information.
+
+  // Returns the expected initializer TAGs.
+  virtual std::vector<std::string> GetInitializerTags() = 0;
+
+  // Returns the expected input TAGs.
+  virtual std::vector<std::string> GetInputTags() = 0;
+
+  // Returns the expected output TAGs.
+  virtual std::vector<std::string> GetOutputTags() = 0;
+
+  // Populates a pointer to the object corresponding to provided TAG.
+  // Returns true if success, false otherwise.
+  // object_map must contain {name : object pointer} mappings, with one of the
+  // names being mapped to the expected TAG in the EvaluationStageConfig.
+  template <class T>
+  bool GetObjectFromTag(const std::string& tag,
+                        absl::flat_hash_map<std::string, void*>& object_map,
+                        T** object_ptr) {
+    *object_ptr = nullptr;
+    // Find name corresponding to TAG.
+    auto mapping_iter = tags_to_names_map_.find(tag);
+    if (mapping_iter == tags_to_names_map_.end()) {
+      LOG(ERROR) << "Unexpected TAG: " << tag;
+      return false;
+    }
+    const std::string& expected_name = mapping_iter->second;
+
+    // Find object from name.
+    auto object_iter = object_map.find(expected_name);
+    if (object_iter == object_map.end()) {
+      LOG(ERROR) << "Could not find object for name: " << expected_name;
+      return false;
+    }
+    *object_ptr = static_cast<T*>(object_iter->second);
+    return true;
+  }
+
+  // Maps the appropriate name to a given object in object_map. The name is
+  // derived from mappings provided in the EvaluationStageConfig.
+  // Returns false if tag is invalid, true otherwise.
+  //
+  // NOTE: The EvaluationStage must maintain ownership of object for the
+  // lifetime of object_map
+  bool AssignObjectToTag(const std::string& tag, void* object_ptr,
+                         absl::flat_hash_map<std::string, void*>& object_map) {
+    // Find name corresponding to TAG.
+    auto mapping_iter = tags_to_names_map_.find(tag);
+    if (mapping_iter == tags_to_names_map_.end()) {
+      LOG(ERROR) << "Unexpected TAG: " << tag;
+      return false;
+    }
+    const std::string& expected_name = mapping_iter->second;
+
+    object_map[expected_name] = object_ptr;
+    return true;
+  }
+
+  const EvaluationStageConfig config_;
+
+ private:
+  // Verifies that all TAGs from expected_tags are present in
+  // tag_to_name_mappings, and then populates tags_to_names_map_ with the
+  // appropriate entries. Returns false in case any TAG/mapping is invalid, true
+  // otherwise.
+  // expected_tags should be a list of TAG-strings.
+  // tag_to_name_mappings should be RepeatedPtrField of strings mapping TAGs to
+  // names in the form "SOME_TAG:some_name".
+  bool ProcessExpectedTags(const std::vector<std::string>& expected_tags,
+                           std::vector<std::string>& tag_to_name_mappings);
+
+  // Maps expected TAGs to their names as defined by the EvaluationStageConfig.
+  absl::flat_hash_map<std::string, std::string> tags_to_names_map_;
+
+  // To ensure correct formatting in the config.
+  const std::regex kTagNameMappingPattern{"^([A-Z0-9_]+):([a-z0-9_]+)$",
+                                          std::regex::optimize};
+
+  // To ensure correct formatting in TAG names.
+  const std::regex kTagPattern{"^[A-Z0-9_]+$", std::regex::optimize};
+};
+
+}  // namespace evaluation
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_TOOLS_EVALUATION_EVALUATION_STAGE_H_
diff --git a/tensorflow/lite/tools/evaluation/evaluation_stage_factory.h b/tensorflow/lite/tools/evaluation/evaluation_stage_factory.h
new file mode 100644
index 0000000000000000000000000000000000000000..ea70fae614d18d1097340e229493d9a87145c34f
--- /dev/null
+++ b/tensorflow/lite/tools/evaluation/evaluation_stage_factory.h
@@ -0,0 +1,48 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOOLS_EVALUATION_EVALUATION_STAGE_FACTORY_H_
+#define TENSORFLOW_LITE_TOOLS_EVALUATION_EVALUATION_STAGE_FACTORY_H_
+
+#include "absl/memory/memory.h"
+#include "tensorflow/lite/tools/evaluation/evaluation_stage.h"
+#include "tensorflow/lite/tools/evaluation/identity_stage.h"
+#include "tensorflow/lite/tools/evaluation/proto/evaluation_config.pb.h"
+#include "tensorflow/lite/tools/evaluation/proto/evaluation_stages.pb.h"
+
+namespace tflite {
+namespace evaluation {
+
+// The canonical way to generate EvaluationStages.
+// TODO(b/122482115): Implement a Factory class for registration of classes.
+std::unique_ptr<EvaluationStage> CreateEvaluationStageFromConfig(
+    const EvaluationStageConfig& config) {
+  if (!config.has_specification() ||
+      !config.specification().has_process_class()) {
+    LOG(ERROR) << "Process specification not present in config: "
+               << config.name();
+    return nullptr;
+  }
+  switch (config.specification().process_class()) {
+    case UNKNOWN:
+      return nullptr;
+    case IDENTITY:
+      return absl::make_unique<IdentityStage>(config);
+  }
+}
+
+}  // namespace evaluation
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_TOOLS_EVALUATION_EVALUATION_STAGE_FACTORY_H_
diff --git a/tensorflow/lite/tools/evaluation/evaluation_stage_test.cc b/tensorflow/lite/tools/evaluation/evaluation_stage_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..79cbf1e5e53f7159a5509641ee400cc32343cef7
--- /dev/null
+++ b/tensorflow/lite/tools/evaluation/evaluation_stage_test.cc
@@ -0,0 +1,145 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/tools/evaluation/evaluation_stage.h"
+
+#include <memory>
+
+#include <gtest/gtest.h>
+#include "absl/container/flat_hash_map.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/lite/tools/evaluation/evaluation_stage_factory.h"
+#include "tensorflow/lite/tools/evaluation/identity_stage.h"
+#include "tensorflow/lite/tools/evaluation/proto/evaluation_config.pb.h"
+#include "tensorflow/lite/tools/evaluation/proto/evaluation_stages.pb.h"
+
+namespace tflite {
+namespace evaluation {
+namespace {
+
+using ::tensorflow::DataType;
+using ::tensorflow::Tensor;
+
+constexpr char kIdentityStageName[] = "identity_stage";
+constexpr char kInputTypeName[] = "type";
+constexpr char kInputTensorsName[] = "in";
+constexpr char kOutputTensorsName[] = "out";
+constexpr char kInitializerMapping[] = "INPUT_TYPE:type";
+constexpr char kInputMapping[] = "INPUT_TENSORS:in";
+constexpr char kOutputMapping[] = "OUTPUT_TENSORS:out";
+
+EvaluationStageConfig GetIdentityStageConfig() {
+  EvaluationStageConfig config;
+  config.set_name(kIdentityStageName);
+  config.mutable_specification()->set_process_class(IDENTITY);
+  config.add_initializers(kInitializerMapping);
+  config.add_inputs(kInputMapping);
+  config.add_outputs(kOutputMapping);
+  return config;
+}
+
+TEST(EvaluationStage, IncompleteConfig) {
+  // Construct
+  EvaluationStageConfig config = GetIdentityStageConfig();
+  config.clear_inputs();
+  std::unique_ptr<EvaluationStage> stage_ptr =
+      CreateEvaluationStageFromConfig(config);
+  // Initialize
+  absl::flat_hash_map<std::string, void*> object_map;
+  DataType input_type = tensorflow::DT_FLOAT;
+  object_map[kInputTypeName] = &input_type;
+  EXPECT_FALSE(stage_ptr->Init(object_map));
+}
+
+TEST(EvaluationStage, IncorrectlyFormattedConfig) {
+  // Construct
+  EvaluationStageConfig config = GetIdentityStageConfig();
+  config.clear_initializers();
+  config.add_initializers("INPUT_TYPE-type");
+  std::unique_ptr<EvaluationStage> stage_ptr =
+      CreateEvaluationStageFromConfig(config);
+  // Initialize
+  absl::flat_hash_map<std::string, void*> object_map;
+  DataType input_type = tensorflow::DT_FLOAT;
+  object_map[kInputTypeName] = &input_type;
+  EXPECT_FALSE(stage_ptr->Init(object_map));
+}
+
+TEST(EvaluationStage, ConstructFromConfig_UnknownProcess) {
+  // Construct
+  EvaluationStageConfig config = GetIdentityStageConfig();
+  config.mutable_specification()->clear_process_class();
+  std::unique_ptr<EvaluationStage> stage_ptr =
+      CreateEvaluationStageFromConfig(config);
+  EXPECT_EQ(stage_ptr.get(), nullptr);
+}
+
+TEST(EvaluationStage, NoInitializer) {
+  // Construct
+  EvaluationStageConfig config = GetIdentityStageConfig();
+  std::unique_ptr<EvaluationStage> stage_ptr =
+      CreateEvaluationStageFromConfig(config);
+  // Initialize
+  absl::flat_hash_map<std::string, void*> object_map;
+  EXPECT_FALSE(stage_ptr->Init(object_map));
+}
+
+TEST(EvaluationStage, NoInputs) {
+  // Construct
+  EvaluationStageConfig config = GetIdentityStageConfig();
+  std::unique_ptr<EvaluationStage> stage_ptr =
+      CreateEvaluationStageFromConfig(config);
+  // Initialize
+  absl::flat_hash_map<std::string, void*> object_map;
+  DataType input_type = tensorflow::DT_FLOAT;
+  object_map[kInputTypeName] = &input_type;
+  EXPECT_TRUE(stage_ptr->Init(object_map));
+
+  // Run
+  EvaluationStageMetrics metrics;
+  EXPECT_FALSE(stage_ptr->Run(object_map, metrics));
+}
+
+TEST(EvaluationStage, ExpectedIdentityOutput) {
+  // Construct
+  EvaluationStageConfig config = GetIdentityStageConfig();
+  std::unique_ptr<EvaluationStage> stage_ptr =
+      CreateEvaluationStageFromConfig(config);
+  // Initialize
+  absl::flat_hash_map<std::string, void*> object_map;
+  DataType input_type = tensorflow::DT_FLOAT;
+  object_map[kInputTypeName] = &input_type;
+  EXPECT_TRUE(stage_ptr->Init(object_map));
+
+  // Input Data
+  float float_value = 5.6f;
+  Tensor input_tensor(float_value);
+  std::vector<Tensor> input_tensors = {input_tensor};
+  // Run
+  object_map[kInputTensorsName] = &input_tensors;
+  EvaluationStageMetrics metrics;
+  EXPECT_TRUE(stage_ptr->Run(object_map, metrics));
+
+  // Check output
+  std::vector<Tensor>* output_tensors_ptr =
+      static_cast<std::vector<Tensor>*>(object_map[kOutputTensorsName]);
+  EXPECT_TRUE(output_tensors_ptr != nullptr);
+  EXPECT_FLOAT_EQ(output_tensors_ptr->at(0).scalar<float>()(), float_value);
+  EXPECT_GE(metrics.total_latency_ms(), 0);
+}
+
+}  // namespace
+}  // namespace evaluation
+}  // namespace tflite
diff --git a/tensorflow/lite/tools/evaluation/identity_stage.cc b/tensorflow/lite/tools/evaluation/identity_stage.cc
new file mode 100644
index 0000000000000000000000000000000000000000..337372a475b3a1c4a35bd0e6dd67c173e7317fb9
--- /dev/null
+++ b/tensorflow/lite/tools/evaluation/identity_stage.cc
@@ -0,0 +1,91 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/tools/evaluation/identity_stage.h"
+
+#include <ctime>
+
+#include "tensorflow/cc/ops/array_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/public/session_options.h"
+
+namespace tflite {
+namespace evaluation {
+
+using ::tensorflow::Scope;
+using ::tensorflow::SessionOptions;
+using ::tensorflow::Tensor;
+using ::tensorflow::ops::Identity;
+using ::tensorflow::ops::Placeholder;
+
+IdentityStage::IdentityStage(const EvaluationStageConfig& config)
+    : EvaluationStage(config) {
+  stage_input_name_ = config_.name() + "_identity_input";
+  stage_output_name_ = config_.name() + "_identity_output";
+}
+
+bool IdentityStage::DoInit(
+    absl::flat_hash_map<std::string, void*>& object_map) {
+  // Initialize TF Graph.
+  const Scope scope = Scope::NewRootScope();
+  if (!GetObjectFromTag(kInputTypeTag, object_map, &input_type_)) {
+    return false;
+  }
+  auto input_placeholder =
+      Placeholder(scope.WithOpName(stage_input_name_), *input_type_);
+  stage_output_ =
+      Identity(scope.WithOpName(stage_output_name_), input_placeholder);
+  if (!scope.status().ok() || !scope.ToGraphDef(&graph_def_).ok()) {
+    return false;
+  }
+
+  // Initialize TF Session.
+  session_.reset(NewSession(SessionOptions()));
+  if (!session_->Create(graph_def_).ok()) {
+    return false;
+  }
+
+  return true;
+}
+
+bool IdentityStage::Run(absl::flat_hash_map<std::string, void*>& object_map,
+                        EvaluationStageMetrics& metrics) {
+  std::vector<Tensor>* input_tensors;
+  if (!GetObjectFromTag(kInputTensorsTag, object_map, &input_tensors)) {
+    return false;
+  }
+  tensor_outputs_.clear();
+  // TODO(b/122482115): Encapsulate timing into its own helper.
+  std::clock_t start = std::clock();
+  if (!session_
+           ->Run({{stage_input_name_, input_tensors->at(0)}},
+                 {stage_output_name_}, {}, &tensor_outputs_)
+           .ok()) {
+    return false;
+  }
+  metrics.set_total_latency_ms(
+      static_cast<float>((std::clock() - start) / (CLOCKS_PER_SEC / 1000)));
+
+  if (!AssignObjectToTag(kOutputTensorsTag, &tensor_outputs_, object_map)) {
+    return false;
+  }
+  return true;
+}
+
+const char IdentityStage::kInputTypeTag[] = "INPUT_TYPE";
+const char IdentityStage::kInputTensorsTag[] = "INPUT_TENSORS";
+const char IdentityStage::kOutputTensorsTag[] = "OUTPUT_TENSORS";
+
+}  // namespace evaluation
+}  // namespace tflite
diff --git a/tensorflow/lite/tools/evaluation/identity_stage.h b/tensorflow/lite/tools/evaluation/identity_stage.h
new file mode 100644
index 0000000000000000000000000000000000000000..8bdddb1d74de0bb768529c42f33973dcbae6bc8d
--- /dev/null
+++ b/tensorflow/lite/tools/evaluation/identity_stage.h
@@ -0,0 +1,72 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOOLS_EVALUATION_IDENTITY_STAGE_H_
+#define TENSORFLOW_LITE_TOOLS_EVALUATION_IDENTITY_STAGE_H_
+
+#include "absl/container/flat_hash_map.h"
+#include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/core/public/session.h"
+#include "tensorflow/lite/tools/evaluation/evaluation_stage.h"
+#include "tensorflow/lite/tools/evaluation/proto/evaluation_config.pb.h"
+
+namespace tflite {
+namespace evaluation {
+
+// Simple EvaluationStage subclass that encapsulates the functionality of
+// tensorflow::ops::Identity. Primarily used for tests.
+// Initializer TAGs (Object Class): INPUT_TYPE (DataType)
+// Input TAGs (Object Class): INPUT_TENSORS (std::vector<Tensor>)
+// Output TAGs (Object Class): OUTPUT_TENSORS (std::vector<Tensor>)
+// TODO(b/122482115): Migrate common TF-related code into an abstract class.
+class IdentityStage : public EvaluationStage {
+ public:
+  explicit IdentityStage(const EvaluationStageConfig& config);
+
+  bool Run(absl::flat_hash_map<std::string, void*>& object_map,
+           EvaluationStageMetrics& metrics) override;
+
+  ~IdentityStage() {}
+
+ protected:
+  bool DoInit(absl::flat_hash_map<std::string, void*>& object_map) override;
+
+  std::vector<std::string> GetInitializerTags() override {
+    return {kInputTypeTag};
+  }
+  std::vector<std::string> GetInputTags() override {
+    return {kInputTensorsTag};
+  }
+  std::vector<std::string> GetOutputTags() override {
+    return {kOutputTensorsTag};
+  }
+
+ private:
+  ::tensorflow::DataType* input_type_;
+  ::tensorflow::GraphDef graph_def_;
+  ::tensorflow::Output stage_output_;
+  std::unique_ptr<::tensorflow::Session> session_;
+  std::vector<::tensorflow::Tensor> tensor_outputs_;
+  std::string stage_input_name_;
+  std::string stage_output_name_;
+
+  static const char kInputTypeTag[];
+  static const char kInputTensorsTag[];
+  static const char kOutputTensorsTag[];
+};
+
+}  // namespace evaluation
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_TOOLS_EVALUATION_IDENTITY_STAGE_H_
diff --git a/tensorflow/lite/tools/evaluation/proto/BUILD b/tensorflow/lite/tools/evaluation/proto/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..6c747357040a968b35ff99ca97a3b8a6677340e6
--- /dev/null
+++ b/tensorflow/lite/tools/evaluation/proto/BUILD
@@ -0,0 +1,40 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+licenses(["notice"])  # Apache 2.0
+
+package(default_visibility = ["//visibility:public"])
+
+load(
+    "//tensorflow/core:platform/default/build_config.bzl",
+    "tf_proto_library_cc",
+)
+
+tf_proto_library_cc(
+    name = "evaluation_stages_proto",
+    srcs = [
+        "evaluation_stages.proto",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+tf_proto_library_cc(
+    name = "evaluation_config_proto",
+    srcs = [
+        "evaluation_config.proto",
+    ],
+    protodeps = [":evaluation_stages_proto"],
+    visibility = ["//visibility:public"],
+)
diff --git a/tensorflow/lite/tools/evaluation/proto/evaluation_config.proto b/tensorflow/lite/tools/evaluation/proto/evaluation_config.proto
new file mode 100644
index 0000000000000000000000000000000000000000..4f36ac83b87f6b6b30bfc3e1ecd9ad1c5319a57f
--- /dev/null
+++ b/tensorflow/lite/tools/evaluation/proto/evaluation_config.proto
@@ -0,0 +1,48 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+syntax = "proto2";
+
+package tflite.evaluation;
+
+import "tensorflow/lite/tools/evaluation/proto/evaluation_stages.proto";
+
+// Next ID: 6
+message EvaluationStageConfig {
+  optional string name = 1;
+
+  // Specification defining what this stage does, and any required parameters.
+  optional ProcessSpecification specification = 2;
+
+  // initializers, inputs and outputs are strings that define colon-separated
+  // mappings between TAGs and their corresponding names.
+  // These names help EvaluationStages communicate with each other during runs.
+  // Format for TAGs: [A-Z0-9_]+ (Uppercase letters, numbers, "_")
+  // Format for names: [a-z0-9_]+ (Lowercase letters, numbers, "_")
+  // Example mapping: "BITMAP1:image_in"
+  // It is up to individual EvaluationStage sub-classes to specify the
+  // initializer/input TAGs they require, and outputs TAGs they provide.
+  repeated string initializers = 3;
+  repeated string inputs = 4;
+  repeated string outputs = 5;
+}
+
+message EvaluationStageMetrics {
+  // Total latency in ms.
+  optional double total_latency_ms = 1;
+
+  // Process-specific numbers such as accuracy, step-latencies, etc.
+  optional ProcessMetrics process_metrics = 2;
+}
diff --git a/tensorflow/lite/tools/evaluation/proto/evaluation_stages.proto b/tensorflow/lite/tools/evaluation/proto/evaluation_stages.proto
new file mode 100644
index 0000000000000000000000000000000000000000..f45d96fafc3d712677a53d8f8321bb0753d60525
--- /dev/null
+++ b/tensorflow/lite/tools/evaluation/proto/evaluation_stages.proto
@@ -0,0 +1,38 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+syntax = "proto2";
+
+package tflite.evaluation;
+
+// All EvaluationStage sub-classes must add a value here.
+// A corresponding entry must also be present in EvaluationStage.FromConfig
+enum ProcessClass {
+  // Default/Unknown
+  UNKNOWN = 0;
+  // Identity
+  IDENTITY = 1;
+}
+
+// Defines the functionality executed by an EvaluationStage.
+// TODO(b/122482115): Add stage-specific options using oneof.
+message ProcessSpecification {
+  optional ProcessClass process_class = 1;
+}
+
+// Contains specific metrics, which may differ based on what an EvaluationStage
+// does.
+// TODO(b/122482115): Add stage-specific metrics using oneof.
+message ProcessMetrics {}
diff --git a/tensorflow/lite/tools/make/Makefile b/tensorflow/lite/tools/make/Makefile
index 994f660dba7742de162525dcf6a8c6a288ee71c6..4a22d7630adbdec4386034f045bd8b63e01f249a 100644
--- a/tensorflow/lite/tools/make/Makefile
+++ b/tensorflow/lite/tools/make/Makefile
@@ -109,17 +109,37 @@ $(wildcard tensorflow/lite/*test.cc) \
 $(wildcard tensorflow/lite/*/*test.cc) \
 $(wildcard tensorflow/lite/*/*/*test.cc) \
 $(wildcard tensorflow/lite/*/*/*/*test.cc) \
-$(wildcard tensorflow/lite/kernels/test_util.cc) \
+$(wildcard tensorflow/lite/kernels/*test_util.cc) \
 $(MINIMAL_SRCS)
+
+ifeq ($(BUILD_TYPE),micro)
+	CORE_CC_EXCLUDE_SRCS += tensorflow/lite/mmap_allocation.cc
+else
+	CORE_CC_EXCLUDE_SRCS += tensorflow/lite/mmap_allocation_disabled.cc
+endif
+
+BUILD_WITH_NNAPI=true
 ifeq ($(BUILD_TYPE),micro)
-CORE_CC_EXCLUDE_SRCS += \
-tensorflow/lite/mmap_allocation.cc \
-tensorflow/lite/nnapi_delegate.cc
+	BUILD_WITH_NNAPI=false
+endif
+ifeq ($(TARGET),ios)
+	BUILD_WITH_NNAPI=false
+endif
+ifeq ($(BUILD_WITH_NNAPI),true)
+	CORE_CC_EXCLUDE_SRCS += tensorflow/lite/nnapi_delegate_disabled.cc
+else
+	CORE_CC_EXCLUDE_SRCS += tensorflow/lite/nnapi_delegate.cc
+endif
+
+ifeq ($(TARGET),ios)
+	CORE_CC_EXCLUDE_SRCS += tensorflow/lite/minimal_logging_android.cc
+	CORE_CC_EXCLUDE_SRCS += tensorflow/lite/minimal_logging_default.cc
 else
-CORE_CC_EXCLUDE_SRCS += \
-tensorflow/lite/mmap_allocation_disabled.cc \
-tensorflow/lite/nnapi_delegate_disabled.cc
+	CORE_CC_EXCLUDE_SRCS += tensorflow/lite/minimal_logging_android.cc
+	CORE_CC_EXCLUDE_SRCS += tensorflow/lite/minimal_logging_ios.cc
 endif
+
+
 # Filter out all the excluded files.
 TF_LITE_CC_SRCS := $(filter-out $(CORE_CC_EXCLUDE_SRCS), $(CORE_CC_ALL_SRCS))
 
diff --git a/tensorflow/lite/tools/make/download_dependencies.sh b/tensorflow/lite/tools/make/download_dependencies.sh
index fa3d5d3d3b6657ff327dd6ec34bd65823da13cd2..8c4992a84304ded382e36e9e18e452100d94a391 100755
--- a/tensorflow/lite/tools/make/download_dependencies.sh
+++ b/tensorflow/lite/tools/make/download_dependencies.sh
@@ -100,5 +100,6 @@ replace_by_sed 's#static uint32x2_t p2ui_CONJ_XOR = vld1_u32( conj_XOR_DATA );#s
   "${DOWNLOADS_DIR}/eigen/Eigen/src/Core/arch/NEON/Complex.h"
 replace_by_sed 's#static uint64x2_t p2ul_CONJ_XOR = vld1q_u64( p2ul_conj_XOR_DATA );#static uint64x2_t p2ul_CONJ_XOR;// = vld1q_u64( p2ul_conj_XOR_DATA ); - Removed by script#' \
   "${DOWNLOADS_DIR}/eigen/Eigen/src/Core/arch/NEON/Complex.h"
+cat "$SCRIPT_DIR/../../../../third_party/eigen3/gebp_neon.patch" | patch "${DOWNLOADS_DIR}/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h"
 
 echo "download_dependencies.sh completed successfully." >&2
diff --git a/tensorflow/lite/tools/optimize/BUILD b/tensorflow/lite/tools/optimize/BUILD
index 0a0d5cc4123ba64c7208c5e74344248b28af6851..cf99cada4a2487bed08a05246b706c69edd9501d 100644
--- a/tensorflow/lite/tools/optimize/BUILD
+++ b/tensorflow/lite/tools/optimize/BUILD
@@ -1,25 +1,181 @@
-# TODO(suharshs): Write quantize_weights tests that use small exportable files.
-# Then we can remove this file.
-package(
-    default_visibility = ["//visibility:public"],
-)
+load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+
+package(default_visibility = [
+    "//visibility:public",
+])
 
 licenses(["notice"])  # Apache 2.0
 
-exports_files(["LICENSE"])
+exports_files(glob([
+    "testdata/*.bin",
+]))
 
-load("//tensorflow/lite:build_def.bzl", "tflite_copts")
+cc_library(
+    name = "quantization_utils",
+    srcs = ["quantization_utils.cc"],
+    hdrs = ["quantization_utils.h"],
+    deps = [
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/kernels/internal:round",
+        "//tensorflow/lite/kernels/internal:types",
+        "//tensorflow/lite/schema:schema_fbs",
+    ],
+)
+
+tf_cc_test(
+    name = "quantization_utils_test",
+    srcs = ["quantization_utils_test.cc"],
+    tags = [
+        "tflite_not_portable_android",
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":quantization_utils",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/schema:schema_fbs",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
+    ],
+)
 
 cc_library(
     name = "quantize_weights",
     srcs = ["quantize_weights.cc"],
     hdrs = ["quantize_weights.h"],
     deps = [
+        ":quantization_utils",
+        "@com_google_absl//absl/memory",
+        "@flatbuffers",
+        "//tensorflow/lite:framework",
+        # TODO(suharshs): Move the relevant quantization utils to a non-internal location.
+        "//tensorflow/lite/kernels/internal:tensor_utils",
+        "//tensorflow/lite/schema:schema_fbs",
         "//tensorflow/core:tflite_portable_logging",
+    ],
+)
+
+tf_cc_test(
+    name = "quantize_weights_test",
+    srcs = ["quantize_weights_test.cc"],
+    args = [
+        "--test_model_file=$(location //tensorflow/lite/tools/optimize:testdata/single_conv_weights_min_0_max_plus_10.bin)",
+    ],
+    data = [
+        "//tensorflow/lite/tools/optimize:testdata/single_conv_weights_min_0_max_plus_10.bin",
+        "//tensorflow/lite/tools/optimize:testdata/weight_shared_between_convs.bin",
+    ],
+    tags = [
+        "tflite_not_portable_android",
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":quantize_weights",
+        ":test_util",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
         "//tensorflow/lite:framework",
+        "//tensorflow/lite/schema:schema_fbs",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
+    ],
+)
+
+cc_library(
+    name = "subgraph_quantizer",
+    srcs = ["subgraph_quantizer.cc"],
+    hdrs = ["subgraph_quantizer.h"],
+    deps = [
+        ":quantization_utils",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/core/api",
+        "//tensorflow/lite/kernels/internal:round",
         "//tensorflow/lite/kernels/internal:tensor_utils",
         "//tensorflow/lite/schema:schema_fbs",
         "@com_google_absl//absl/memory",
         "@flatbuffers",
     ],
 )
+
+cc_library(
+    name = "test_util",
+    testonly = 1,
+    srcs = ["test_util.cc"],
+    hdrs = ["test_util.h"],
+    deps = [
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/core/api",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
+    ],
+)
+
+tf_cc_test(
+    name = "subgraph_quantizer_test",
+    srcs = ["subgraph_quantizer_test.cc"],
+    args = [
+        "--test_model_file=$(location //tensorflow/lite/tools/optimize:testdata/single_conv_weights_min_0_max_plus_10.bin)",
+    ],
+    data = [
+        "//tensorflow/lite/tools/optimize:testdata/single_avg_pool_min_minus_5_max_plus_5.bin",
+        "//tensorflow/lite/tools/optimize:testdata/single_conv_weights_min_0_max_plus_10.bin",
+        "//tensorflow/lite/tools/optimize:testdata/single_conv_weights_min_minus_127_max_plus_127.bin",
+        "//tensorflow/lite/tools/optimize:testdata/single_softmax_min_minus_5_max_plus_5.bin",
+    ],
+    tags = [
+        "tflite_not_portable_android",
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":subgraph_quantizer",
+        ":test_util",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/schema:schema_fbs",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
+    ],
+)
+
+cc_library(
+    name = "quantize_model",
+    srcs = ["quantize_model.cc"],
+    hdrs = ["quantize_model.h"],
+    deps = [
+        ":subgraph_quantizer",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/core/api",
+        "//tensorflow/lite/schema:schema_fbs",
+        "@com_google_absl//absl/memory",
+        "@flatbuffers",
+    ],
+)
+
+tf_cc_test(
+    name = "quantize_model_test",
+    srcs = ["quantize_model_test.cc"],
+    args = [
+        "--test_model_file=$(location //tensorflow/lite/tools/optimize:testdata/single_conv_weights_min_0_max_plus_10.bin)",
+    ],
+    data = [
+        "//tensorflow/lite/tools/optimize:testdata/single_conv_weights_min_0_max_plus_10.bin",
+    ],
+    tags = [
+        "tflite_not_portable_android",
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":quantize_model",
+        ":test_util",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/schema:schema_fbs",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
+    ],
+)
+
+tflite_portable_test_suite()
diff --git a/tensorflow/lite/tools/optimize/calibration/BUILD b/tensorflow/lite/tools/optimize/calibration/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..c1d2ad2bca8f76b1e07dfe6d6027ec69cd821c8a
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/calibration/BUILD
@@ -0,0 +1,138 @@
+load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite")
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+
+package(default_visibility = [
+    "//visibility:public",
+])
+
+licenses(["notice"])  # Apache 2.0
+
+cc_library(
+    name = "calibrator_lib",
+    srcs = ["calibrator.cc"],
+    hdrs = ["calibrator.h"],
+    deps = [
+        ":calibration_common",
+        ":calibration_logger",
+        ":calibration_reader",
+        ":logging_op_resolver",
+        ":node_info_delegate",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:string_util",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/core/api",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "//tensorflow/lite/schema:schema_fbs",
+        "@com_google_absl//absl/memory",
+        "@flatbuffers",
+    ],
+)
+
+tf_cc_test(
+    name = "calibrator_test",
+    srcs = ["calibrator_test.cc"],
+    args = [
+        "--test_model_file=$(location //tensorflow/lite:testdata/multi_add.bin)",
+    ],
+    data = [
+        "//tensorflow/lite:testdata/multi_add.bin",
+    ],
+    tags = [
+        "tflite_not_portable_android",
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":calibrator_lib",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_library(
+    name = "logging_op_resolver",
+    srcs = ["logging_op_resolver.cc"],
+    hdrs = ["logging_op_resolver.h"],
+    deps = [
+        ":calibration_common",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/core/api",
+        "@com_google_absl//absl/memory",
+    ],
+)
+
+cc_test(
+    name = "logging_op_resolver_test",
+    srcs = ["logging_op_resolver_test.cc"],
+    deps = [
+        ":logging_op_resolver",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_library(
+    name = "calibration_reader",
+    srcs = ["calibration_reader.cc"],
+    hdrs = ["calibration_reader.h"],
+    deps = [
+        ":calibration_logger",
+        "//tensorflow/lite:framework",
+        "@com_google_absl//absl/memory",
+    ],
+)
+
+cc_library(
+    name = "calibration_logger",
+    hdrs = ["calibration_logger.h"],
+    deps = [
+        "//tensorflow/lite/c:c_api_internal",
+    ],
+)
+
+cc_library(
+    name = "calibration_common",
+    hdrs = ["calibration_common.h"],
+    deps = [
+        "//tensorflow/lite:framework",
+    ],
+)
+
+cc_library(
+    name = "node_info_delegate",
+    srcs = ["node_info_delegate.cc"],
+    hdrs = ["node_info_delegate.h"],
+    deps = [
+        ":calibration_common",
+        "//tensorflow/lite:framework",
+    ],
+)
+
+tf_cc_test(
+    name = "node_info_delegate_test",
+    srcs = ["node_info_delegate_test.cc"],
+    args = [
+        "--test_model_file=$(location //tensorflow/lite/tools/optimize:testdata/single_conv_weights_min_0_max_plus_10.bin)",
+    ],
+    data = [
+        "//tensorflow/lite/tools/optimize:testdata/single_conv_weights_min_0_max_plus_10.bin",
+    ],
+    tags = [
+        "tflite_not_portable_android",
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":node_info_delegate",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "//tensorflow/lite/tools/optimize:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tflite_portable_test_suite()
diff --git a/tensorflow/lite/tools/optimize/calibration/calibration_common.h b/tensorflow/lite/tools/optimize/calibration/calibration_common.h
new file mode 100644
index 0000000000000000000000000000000000000000..1ff2d3f18a66ca4323727b8403515e857e54d8cc
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/calibration/calibration_common.h
@@ -0,0 +1,61 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOOLS_OPTIMIZE_CALIBRATION_COMMON_H_
+#define TENSORFLOW_LITE_TOOLS_OPTIMIZE_CALIBRATION_COMMON_H_
+
+#include <unordered_map>
+#include <unordered_set>
+
+#include "tensorflow/lite/mutable_op_resolver.h"
+
+namespace tflite {
+namespace optimize {
+namespace calibration {
+using BuiltinOperatorKey = std::pair<BuiltinOperator, int>;
+
+using BuiltinOpsSet = std::unordered_set<
+    BuiltinOperatorKey,
+    op_resolver_hasher::OperatorKeyHasher<BuiltinOperatorKey>>;
+
+template <typename T>
+class BuiltinOpsMap
+    : public std::unordered_map<
+          BuiltinOperatorKey, T,
+          op_resolver_hasher::OperatorKeyHasher<BuiltinOperatorKey>> {};
+
+// An alias for |TfLiteRegistration.invoke|.
+using KernelEvalFuncPtr = TfLiteStatus (*)(TfLiteContext*, TfLiteNode*);
+
+enum class OperatorTensorType { kNone, kInput, kOutput, kIntermediate };
+
+// Information about an operator in the TfLite graph.
+struct OperatorInfo {
+  int node_index;
+  std::string name;
+  BuiltinOperator builtin_op_code;
+  bool is_custom_op;
+  std::vector<int> inputs;
+  std::vector<int> outputs;
+  // Inputs that need to be logged.
+  std::vector<int> loggable_inputs;
+  // Outputs that need to be logged.
+  std::vector<int> loggable_outputs;
+  const TfLiteRegistration* registration;
+};
+
+}  // namespace calibration
+}  // namespace optimize
+}  // namespace tflite
+#endif  // TENSORFLOW_LITE_TOOLS_OPTIMIZE_CALIBRATION_COMMON_H_
diff --git a/tensorflow/lite/tools/optimize/calibration/calibration_logger.h b/tensorflow/lite/tools/optimize/calibration/calibration_logger.h
new file mode 100644
index 0000000000000000000000000000000000000000..8fd380423a3ee0e671fcedd5c3e2cdf566c993eb
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/calibration/calibration_logger.h
@@ -0,0 +1,85 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOOLS_OPTIMIZE_CALIBRATION_LOGGER_H_
+#define TENSORFLOW_LITE_TOOLS_OPTIMIZE_CALIBRATION_LOGGER_H_
+
+#include <unordered_map>
+
+#include "tensorflow/lite/c/c_api_internal.h"
+
+namespace tflite {
+namespace optimize {
+namespace calibration {
+class MinMax {
+ public:
+  void Update(const float* values, size_t tensor_size) {
+    // TODO(shashishekhar): Really slow implementation, optimize
+    if (tensor_size <= 0) return;
+
+    if (!has_values_) {
+      min_ = max_ = values[0];
+      has_values_ = true;
+      return;
+    }
+
+    // We are only logging absolute min/max here.
+    // TODO(shashishekhar): Make it possible to use weighted/moving average.
+    for (size_t i = 0; i < tensor_size; i++) {
+      float val = values[i];
+      if (min_ > val) {
+        min_ = val;
+      } else if (max_ < val) {
+        max_ = val;
+      }
+    }
+  }
+
+  bool HasValues() const { return has_values_; }
+
+  TfLiteStatus Get(float* min_val, float* max_val) const {
+    if (!has_values_) return kTfLiteError;
+    *min_val = min_;
+    *max_val = max_;
+    return kTfLiteOk;
+  }
+
+ private:
+  bool has_values_;
+  float min_, max_;
+};
+
+// Captures min max values for tensors.
+class Logger {
+ public:
+  // Log the value for tensor at |tensor_index| which has |tensor_values|
+  void LogTensorValue(int tensor_index, const float* tensor_values,
+                      size_t tensor_size) {
+    tensor_id_to_stats_map_[tensor_index].Update(tensor_values, tensor_size);
+  }
+
+  // Returns a map from tensor_index -> observed min max values.
+  const std::unordered_map<int, MinMax>& GetCalibrationValues() const {
+    return tensor_id_to_stats_map_;
+  }
+
+ private:
+  std::unordered_map<int, MinMax> tensor_id_to_stats_map_;
+};
+
+}  // namespace calibration
+}  // namespace optimize
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_TOOLS_OPTIMIZE_CALIBRATION_LOGGER_H_
diff --git a/tensorflow/lite/tools/optimize/calibration/calibration_reader.cc b/tensorflow/lite/tools/optimize/calibration/calibration_reader.cc
new file mode 100644
index 0000000000000000000000000000000000000000..69e9c5aed8dc3a6a27225fc55d87b900dc9d4730
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/calibration/calibration_reader.cc
@@ -0,0 +1,55 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/tools/optimize/calibration/calibration_reader.h"
+
+#include "absl/memory/memory.h"
+
+namespace tflite {
+namespace optimize {
+namespace calibration {
+TfLiteStatus CalibrationReader::GetTensorStatsAsMap(
+    std::unordered_map<int, CalibrationStats>* tensor_id_to_stats_map) const {
+  tensor_id_to_stats_map->clear();
+  for (const auto& tensorid_stat : logger_->GetCalibrationValues()) {
+    auto minmax = tensorid_stat.second;
+    CalibrationReader::CalibrationStats stats;
+    TF_LITE_ENSURE_STATUS(minmax.Get(&stats.min, &stats.max));
+    tensor_id_to_stats_map->insert({tensorid_stat.first, stats});
+  }
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus CalibrationReader::AddCalibrationToModel(ModelT* model) const {
+  if (!model || model->subgraphs.empty()) {
+    return kTfLiteError;
+  }
+  const auto& subgraph = model->subgraphs[0];
+  for (const auto& tensorid_stat : logger_->GetCalibrationValues()) {
+    auto minmax = tensorid_stat.second;
+    float min, max;
+    TF_LITE_ENSURE_STATUS(minmax.Get(&min, &max));
+    auto quant_params = absl::make_unique<tflite::QuantizationParametersT>();
+    quant_params->min.push_back(min);
+    quant_params->max.push_back(max);
+    subgraph->tensors[tensorid_stat.first]->quantization =
+        std::move(quant_params);
+  }
+
+  return kTfLiteOk;
+}
+}  // namespace calibration
+}  // namespace optimize
+}  // namespace tflite
diff --git a/tensorflow/lite/tools/optimize/calibration/calibration_reader.h b/tensorflow/lite/tools/optimize/calibration/calibration_reader.h
new file mode 100644
index 0000000000000000000000000000000000000000..0120d841900e4432fcee49e285ade46007bd3660
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/calibration/calibration_reader.h
@@ -0,0 +1,55 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOOLS_OPTIMIZE_CALIBRATION_READER_H_
+#define TENSORFLOW_LITE_TOOLS_OPTIMIZE_CALIBRATION_READER_H_
+
+#include <unordered_map>
+
+#include "tensorflow/lite/context.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/tools/optimize/calibration/calibration_logger.h"
+
+namespace tflite {
+namespace optimize {
+namespace calibration {
+// Warning: This is not a public API and subject to change.
+//
+// Reads calibrator data collected by running the interpreter through
+// a calibration set.
+class CalibrationReader {
+ public:
+  struct CalibrationStats {
+    float min;
+    float max;
+  };
+  explicit CalibrationReader(const Logger* logger) : logger_(logger) {}
+
+  // Gets a map from tensor index to recorded calibration values.
+  virtual TfLiteStatus GetTensorStatsAsMap(
+      std::unordered_map<int, CalibrationStats>* tensor_id_to_stats_map) const;
+
+  // Annotates the tensors in the given model with statistics captured during
+  // calibration.
+  virtual TfLiteStatus AddCalibrationToModel(ModelT* model) const;
+
+  virtual ~CalibrationReader() {}
+
+ private:
+  const Logger* logger_;
+};
+}  // namespace calibration
+}  // namespace optimize
+}  // namespace tflite
+#endif  // TENSORFLOW_LITE_TOOLS_OPTIMIZE_CALIBRATION_READER_H_
diff --git a/tensorflow/lite/tools/optimize/calibration/calibrator.cc b/tensorflow/lite/tools/optimize/calibration/calibrator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..eead4e590f8a42c5362b4efb952511b48e51d2de
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/calibration/calibrator.cc
@@ -0,0 +1,347 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/tools/optimize/calibration/calibrator.h"
+
+#include <fstream>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "absl/memory/memory.h"
+#include "tensorflow/lite/core/api/error_reporter.h"
+#include "tensorflow/lite/core/api/op_resolver.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/op_resolver.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/string_util.h"
+#include "tensorflow/lite/tools/optimize/calibration/calibration_common.h"
+#include "tensorflow/lite/tools/optimize/calibration/calibration_logger.h"
+#include "tensorflow/lite/tools/optimize/calibration/calibration_reader.h"
+#include "tensorflow/lite/tools/optimize/calibration/logging_op_resolver.h"
+#include "tensorflow/lite/tools/optimize/calibration/node_info_delegate.h"
+
+namespace tflite {
+namespace optimize {
+namespace calibration {
+
+namespace {
+
+// Calibrator is used to hold information that can be accessed during kernel
+// invocations.
+// TfLite kernel invocations are C functions and cannot look at the global
+// structure of the graph. Calibrator allows the kernel invoke functions to
+// access the global structure of graph and know which node is currently being
+// executed. This also allows us to write a simple kernel invoke wrapper
+// (see LoggingEval) that can work for most builtin ops.
+class Calibrator {
+ public:
+  Calibrator(const std::unordered_map<const TfLiteNode*, OperatorInfo>&
+                 node_ptr_opinfo_map,
+             std::unique_ptr<LoggingOpResolver> logging_op_resolver)
+      : node_ptr_opinfo_map_(node_ptr_opinfo_map),
+        logging_op_resolver_(std::move(logging_op_resolver)) {
+    logger_ = absl::make_unique<Logger>();
+  }
+
+  // Returns the wrapped kernel invoke function |TfLiteRegistration.invoke|.
+  KernelEvalFuncPtr GetKernelInvoke(const TfLiteNode* node) const;
+
+  // Gets the instance of logger associated with the current context.
+  Logger* GetLogger() const { return logger_.get(); }
+
+  // Gets the operator information about the given TfLiteNode.
+  const OperatorInfo& GetOpInfo(const TfLiteNode* node) const {
+    return node_ptr_opinfo_map_.at(node);
+  }
+
+ private:
+  std::unordered_map<const TfLiteNode*, OperatorInfo> node_ptr_opinfo_map_;
+  std::unique_ptr<LoggingOpResolver> logging_op_resolver_;
+  const std::unordered_map<int, OperatorInfo> index_opinfo_;
+  std::unique_ptr<Logger> logger_;
+};
+
+KernelEvalFuncPtr Calibrator::GetKernelInvoke(const TfLiteNode* node) const {
+  auto op_info = node_ptr_opinfo_map_.at(node);
+  return logging_op_resolver_->GetWrappedKernelInvoke(op_info.builtin_op_code,
+                                                      1);
+}
+
+// A registry of |Calibrator| objects per |TfLiteContext|.
+// This global registry is needed to access |Calibrator| objects in the kernel
+// invoke functions i.e. |TfLiteRegistration.invoke|.
+// Kernel invoke functions are C functions that have limited access to
+// |TfLiteContext|. Kernel invoke functions don't have access to global state of
+// graph. That means during a kernel invocation, the function cannot know which
+// node it was invoked for. E.g. in case of a model with |Conv| op at two
+// locations, there is no easy way for the Conv.invoke function to disambiguate
+// the calls.
+//
+// For calibration we solve this problem by creating a map of calibrators
+// per |TfLiteContext|. This map is |GlobalCalibrationRegistry|.
+//
+// This registry is then accessed using a global getter function:
+// |GetCalibratorRegistry|.
+// E.g.
+// TfLiteStatus SomeKernelInvokeFn(TfLiteContext* context, TfLiteNode* node) {
+//   .... code ....
+//   auto registry = GetCalibratorRegistry();
+//   auto calibrator = registry->GetCalibrator(context);
+//   ..... code ....
+//  }
+//
+// This way the kernel invoke functions can get the access to the Calibrator
+// object associated with the |TfLiteContext|.
+class GlobalCalibratorRegistry {
+ public:
+  // Get the |Calibrator| associated with given context, returns null if no
+  // calibrator is associated with the given context.
+  Calibrator* GetCalibrator(const TfLiteContext* context) const {
+    if (calibrator_registry_.find(context) == calibrator_registry_.cend()) {
+      return nullptr;
+    }
+    return calibrator_registry_.at(context).get();
+  }
+
+  // Removes the association between calibrator and context.
+  // Note: This deletes the calibrator as well.
+  void RemoveCalibrator(const TfLiteContext* context) {
+    calibrator_registry_.erase(context);
+  }
+
+  // Creates an instance of |Calibrator|.
+  // Registry owns the |Calibrator| object which can be deleted by calling
+  // |RemoveCalibrator|.
+  TfLiteStatus CreateCalibrator(
+      const TfLiteContext* context,
+      const std::unordered_map<const TfLiteNode*, OperatorInfo>& node_to_opinfo,
+      std::unique_ptr<LoggingOpResolver> logging_op_resolver,
+      Calibrator** calibrator_ptr, ErrorReporter* reporter) {
+    if (calibrator_registry_.find(context) != calibrator_registry_.cend()) {
+      reporter->Report(
+          "Failed to create calibrator, context already registered.");
+      return kTfLiteError;
+    }
+    std::unique_ptr<Calibrator> calibrator = absl::make_unique<Calibrator>(
+        node_to_opinfo, std::move(logging_op_resolver));
+    calibrator_registry_[context] = std::move(calibrator);
+    *calibrator_ptr = calibrator_registry_.at(context).get();
+    return kTfLiteOk;
+  }
+
+ private:
+  std::unordered_map<const TfLiteContext*, std::unique_ptr<Calibrator>>
+      calibrator_registry_;
+};
+
+GlobalCalibratorRegistry* GetCalibratorRegistry() {
+  static GlobalCalibratorRegistry* registry = new GlobalCalibratorRegistry();
+  return registry;
+}
+
+// A wrapper implementation for |TfLiteRegistration.invoke| that logs inputs,
+// invokes the wrapped implementation and then logs the outputs.
+TfLiteStatus LoggingEval(TfLiteContext* context, TfLiteNode* node) {
+  Calibrator* calibrator = GetCalibratorRegistry()->GetCalibrator(context);
+
+  if (!calibrator) {
+    context->ReportError(context, "No calibrator found for context.");
+    return kTfLiteError;
+  }
+
+  auto kernel_invoke = calibrator->GetKernelInvoke(node);
+  auto logger = calibrator->GetLogger();
+  auto op_info = calibrator->GetOpInfo(node);
+
+  for (int i : op_info.loggable_inputs) {
+    auto tensor = context->tensors[i];
+    logger->LogTensorValue(i, tensor.data.f, tensor.bytes / sizeof(float));
+  }
+
+  auto status = kernel_invoke(context, node);
+  // TODO(shashishekhar): An intermediate tensor in graph will get logged twice
+  // once as an input and second time as output. This doesn't change the min max
+  // values but is inefficient.
+  // Using moving average will also break this.
+
+  for (int i : op_info.loggable_outputs) {
+    auto tensor = context->tensors[i];
+    logger->LogTensorValue(i, tensor.data.f, tensor.bytes / sizeof(float));
+  }
+
+  return status;
+}
+
+// Returns the loggable tensors. Not all inputs and outputs need to be logged.
+// For example, const weight tensors which have buffers associated with them
+// don't need to be logged.
+std::vector<int> GetLoggableTensorIndices(
+    const std::vector<int>& tensor_indices,
+    const flatbuffers::Vector<flatbuffers::Offset<Tensor>>* tensors,
+    const flatbuffers::Vector<flatbuffers::Offset<Buffer>>* tensor_buffers) {
+  std::vector<int> loggable;
+  for (auto tensor_index : tensor_indices) {
+    auto tensor = tensors->Get(tensor_index);
+    auto buffer_index = tensor->buffer();
+    const bool has_no_buffer =
+        (tensor_buffers->Get(buffer_index) == nullptr) ||
+        (tensor_buffers->Get(buffer_index)->data() == nullptr) ||
+        (tensor_buffers->Get(buffer_index)->data()->size() == 0);
+    if (has_no_buffer && tensor->type() == tflite::TensorType_FLOAT32) {
+      loggable.push_back(tensor_index);
+    }
+  }
+  return loggable;
+}
+
+// Creates a mapping between the static model graph and the runtime TfLiteNode*
+// nodes in the graph for the given context.
+// This is done by querying the TfLiteContext for node and registrations using
+// the |NodeInfoDelegateObserver|.
+TfLiteStatus GetNodeOpInfoMapAndContext(
+    const std::unordered_map<int, OperatorInfo>& node_to_opinfo,
+    tflite::Interpreter* const interpreter,
+    std::unordered_map<const TfLiteNode*, OperatorInfo>* node_ptr_opinfo_map,
+    const TfLiteContext** context
+
+) {
+  NodeInfoDelegateObserver delegate_observer(node_to_opinfo,
+                                             node_ptr_opinfo_map);
+  NodeInfoDelegateParams delegate_params;
+  delegate_params.delegate_observer = &delegate_observer;
+  TfLiteDelegate logging_delegate = CreateNodeInfoDelegate(&delegate_params);
+
+  auto modify_status = interpreter->ModifyGraphWithDelegate(&logging_delegate);
+  if (modify_status != kTfLiteOk) {
+    return kTfLiteError;
+  }
+  *context = delegate_observer.GetContext();
+  return kTfLiteOk;
+}
+
+string GetOpName(const tflite::OperatorCode& opcode) {
+  if (opcode.custom_code() != nullptr) {
+    return opcode.custom_code()->str();
+  }
+  return tflite::EnumNamesBuiltinOperator()[opcode.builtin_code()];
+}
+
+// A |CalibrationReader| that owns the Calibrator.
+class Reader : public CalibrationReader {
+ public:
+  Reader(const TfLiteContext* context, const Logger* logger)
+      : CalibrationReader(logger), context_(context) {}
+
+  ~Reader() override { GetCalibratorRegistry()->RemoveCalibrator(context_); }
+
+ private:
+  const TfLiteContext* context_;
+};
+
+}  // namespace
+
+TfLiteStatus BuildLoggingInterpreter(
+    const FlatBufferModel& model, const OpResolver& op_resolver,
+    std::unique_ptr<Interpreter>* interpreter,
+    std::unique_ptr<CalibrationReader>* calibration_reader) {
+  auto tflite_model = model.GetModel();
+  auto subgraphs = tflite_model->subgraphs();
+  auto tensor_buffers = tflite_model->buffers();
+
+  if (subgraphs->size() != 1) {
+    model.error_reporter()->Report(
+        "Only models with a single subgraph are supported, model had %d "
+        "subgraphs",
+        subgraphs->size());
+    return kTfLiteError;
+  }
+
+  // Populate the node index to operator info map.
+  // We want to collect this information so we can use it during runtime to
+  // log details of which inputs and outputs.
+  // At runtime TFLite kernel invoke functions can only look into their
+  // own node in the graph (TFLiteNode*) and some limited context information.
+  auto primary_subgraph = subgraphs->Get(0);
+  auto operator_codes = tflite_model->operator_codes();
+  auto operators = primary_subgraph->operators();
+  auto tensors = primary_subgraph->tensors();
+  std::unordered_map<int, OperatorInfo> node_to_opinfo;
+  BuiltinOpsSet op_and_versions;
+
+  for (size_t i = 0; i < operators->size(); i++) {
+    OperatorInfo op_info;
+    op_info.node_index = i;
+    auto op = operators->Get(i);
+    auto operator_code = operator_codes->Get(op->opcode_index());
+    op_info.builtin_op_code = operator_code->builtin_code();
+    op_info.name = GetOpName(*operator_code);
+    op_info.is_custom_op = operator_code->custom_code() != nullptr;
+
+    auto op_inputs = op->inputs();
+    auto op_outputs = op->outputs();
+    op_info.inputs = std::vector<int>(op_inputs->begin(), op_inputs->end());
+    op_info.outputs = std::vector<int>(op_outputs->begin(), op_outputs->end());
+    op_info.loggable_inputs =
+        GetLoggableTensorIndices(op_info.inputs, tensors, tensor_buffers);
+    op_info.loggable_outputs =
+        GetLoggableTensorIndices(op_info.outputs, tensors, tensor_buffers);
+    if (!op_info.is_custom_op) {
+      op_info.registration = op_resolver.FindOp(operator_code->builtin_code(),
+                                                operator_code->version());
+    } else {
+      op_info.registration =
+          op_resolver.FindOp(op_info.name.c_str(), operator_code->version());
+    }
+    node_to_opinfo[i] = op_info;
+    op_and_versions.insert({op_info.builtin_op_code, operator_code->version()});
+  }
+
+  // Prepare the logging op resolver to use |LoggingEval| for kernel
+  // invocations.
+  auto logging_op_resolver = absl::make_unique<LoggingOpResolver>(
+      op_and_versions, op_resolver, LoggingEval);
+  tflite::InterpreterBuilder(model, *logging_op_resolver)(interpreter);
+
+  if (!(*interpreter)) {
+    model.error_reporter()->Report("Failed to construct interpreter");
+    return kTfLiteError;
+  }
+
+  // Compute the mapping between runtime and static graph structure, i.e.
+  // (TfLiteContext, TfLiteNode) -> OperatorInfo
+  std::unordered_map<const TfLiteNode*, OperatorInfo> node_ptr_opinfo_map;
+  const TfLiteContext* context = nullptr;
+  GetNodeOpInfoMapAndContext(node_to_opinfo, interpreter->get(),
+                             &node_ptr_opinfo_map, &context);
+
+  Calibrator* calibrator = nullptr;
+  // Register a calibrator object for the context. This can be accessed
+  // during invocations by the logging kernels.
+  TF_LITE_ENSURE_STATUS(GetCalibratorRegistry()->CreateCalibrator(
+      context, node_ptr_opinfo_map, std::move(logging_op_resolver), &calibrator,
+      model.error_reporter()));
+  *calibration_reader = std::unique_ptr<CalibrationReader>(
+      new Reader(context, calibrator->GetLogger()));
+
+  return kTfLiteOk;
+}
+
+}  // namespace calibration
+}  // namespace optimize
+}  // namespace tflite
diff --git a/tensorflow/lite/tools/optimize/calibration/calibrator.h b/tensorflow/lite/tools/optimize/calibration/calibrator.h
new file mode 100644
index 0000000000000000000000000000000000000000..fb7e03f5ce71f3601d6a1b0f8c912f570f67b1c9
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/calibration/calibrator.h
@@ -0,0 +1,64 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOOLS_OPTIMIZE_CALIBRATOR_H_
+#define TENSORFLOW_LITE_TOOLS_OPTIMIZE_CALIBRATOR_H_
+
+#include <unordered_map>
+
+#include "flatbuffers/flatbuffers.h"  // TF:flatbuffers
+#include "tensorflow/lite/core/api/op_resolver.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/tools/optimize/calibration/calibration_reader.h"
+
+namespace tflite {
+namespace optimize {
+namespace calibration {
+
+// Warning: This is not a public API and subject to change.
+
+// Builds a interpreter that logs the calibration data in memory.
+// The calibration data can be recovered using |calibration_reader|.
+//
+// Sample usage:
+// std::unique_ptr<Interpreter> interpreter;
+// std::unique_ptr<CalibrationReader> calibration_reader;
+// BuiltinOpResolver resolver = ...
+// FlatBufferModel model = ..
+//
+// BuildLoggingInterpreter(model, resolver, &interpreter,
+//  &calibration_reader);
+//
+//
+// * Allocate tensors...
+// * Call interpreter->invoke on calibration dataset.
+//
+// Calibration data can be read either directly by calling
+// std::unordered_map<int,  CalibrationStats>> tensor_index_to_stats;
+// calibration_reader->GetTensorStatsAsMap(&tensor_index_to_stats);
+//
+// or adding calibration data to model itself.
+// ModelT * original_floating_point_model = ...
+// calibration_reader->AddCalibrationToModel(original_floating_point_model);
+//
+TfLiteStatus BuildLoggingInterpreter(
+    const FlatBufferModel& model, const OpResolver& op_resolver,
+    std::unique_ptr<Interpreter>* interpreter,
+    std::unique_ptr<CalibrationReader>* calibration_reader);
+
+}  // namespace calibration
+}  // namespace optimize
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_TOOLS_OPTIMIZE_CALIBRATOR_H_
diff --git a/tensorflow/lite/tools/optimize/calibration/calibrator_test.cc b/tensorflow/lite/tools/optimize/calibration/calibrator_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..60e652ec7a1fcc0d3844f0254fa6ff6072a861ce
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/calibration/calibrator_test.cc
@@ -0,0 +1,212 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cstring>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/util/command_line_flags.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/tools/optimize/calibration/calibrator.h"
+
+namespace {
+tensorflow::string* g_test_model_file = nullptr;
+}  // namespace
+
+namespace tflite {
+namespace optimize {
+namespace calibration {
+namespace {
+
+std::unique_ptr<FlatBufferModel> ReadModel() {
+  if (g_test_model_file) {
+    return FlatBufferModel::BuildFromFile(g_test_model_file->c_str());
+  }
+  return nullptr;
+}
+
+TEST(CalibratorTest, CalibrationStatsAreCollected) {
+  auto model = ReadModel();
+  ASSERT_TRUE(model);
+  std::unique_ptr<Interpreter> interpreter;
+  std::unique_ptr<CalibrationReader> reader;
+  auto status = BuildLoggingInterpreter(
+      *model, ops::builtin::BuiltinOpResolver{}, &interpreter, &reader);
+  EXPECT_EQ(kTfLiteOk, status);
+
+  ASSERT_TRUE(interpreter);
+  ASSERT_TRUE(reader);
+  std::unordered_map<int, CalibrationReader::CalibrationStats> stats;
+  status = reader->GetTensorStatsAsMap(&stats);
+  EXPECT_EQ(kTfLiteOk, status);
+  EXPECT_TRUE(stats.empty());
+
+  status = interpreter->AllocateTensors();
+  ASSERT_EQ(kTfLiteOk, status);
+  // Model does the following:
+  // 0        1       2        3
+  // |        |__ ____|        |
+  // |           |             |
+  // |          Add(tensor:4)  |
+  // |____ ______|______ ______|
+  //      |             |
+  //      Add          Add
+  //      |             |
+  //    Output:5      Output:6
+
+  const size_t tensor_size = 1 * 8 * 8 * 3;
+
+  std::vector<float> ones(tensor_size, 1.0f);
+  // Fill input tensor i with i+1, i.e. input[0] = 1.0f, input[1] = 2.0f,
+  // input[2] = 3.0f
+
+  for (size_t i = 0; i < interpreter->inputs().size(); i++) {
+    int input_tensor_idx = interpreter->inputs()[i];
+    TfLiteTensor* tensor = interpreter->tensor(input_tensor_idx);
+    ASSERT_EQ(tensor->bytes, tensor_size * sizeof(float));
+    for (size_t j = 0; j < tensor_size; j++) {
+      tensor->data.f[j] = i + 1;
+    }
+  }
+  status = interpreter->Invoke();
+  ASSERT_EQ(kTfLiteOk, status);
+  const float eps = 1e-6f;
+  // Verify that tensor 5: is 6
+  // Verify that tensor 6: is 9
+  TfLiteTensor* tensor = interpreter->tensor(interpreter->outputs()[0]);
+  for (size_t i = 0; i < tensor_size; i++) {
+    EXPECT_NEAR(tensor->data.f[i], 6.0f, eps);
+  }
+  tensor = interpreter->tensor(interpreter->outputs()[1]);
+  for (size_t i = 0; i < tensor_size; i++) {
+    EXPECT_NEAR(tensor->data.f[i], 9.0f, eps);
+  }
+
+  // Verify that min max of tensors.
+  status = reader->GetTensorStatsAsMap(&stats);
+  EXPECT_EQ(kTfLiteOk, status);
+  EXPECT_EQ(7, stats.size());
+  // Check inputs
+  for (int tensor_idx = 0; tensor_idx < 4; tensor_idx++) {
+    EXPECT_NEAR(stats.at(tensor_idx).min, tensor_idx + 1, eps);
+    EXPECT_NEAR(stats.at(tensor_idx).max, tensor_idx + 1, eps);
+  }
+  // Check tensor 4 max.
+  EXPECT_NEAR(stats.at(4).min, 5, eps);
+  EXPECT_NEAR(stats.at(4).max, 5, eps);
+
+  // Check outputs
+  EXPECT_NEAR(stats.at(5).min, 6, eps);
+  EXPECT_NEAR(stats.at(5).max, 6, eps);
+
+  EXPECT_NEAR(stats.at(6).min, 9, eps);
+  EXPECT_NEAR(stats.at(6).max, 9, eps);
+}
+
+TEST(CalibratorTest, MultipleInvokes) {
+  auto model = ReadModel();
+  ASSERT_TRUE(model);
+  std::unique_ptr<Interpreter> interpreter;
+  std::unique_ptr<CalibrationReader> reader;
+  auto status = BuildLoggingInterpreter(
+      *model, ops::builtin::BuiltinOpResolver{}, &interpreter, &reader);
+  EXPECT_EQ(kTfLiteOk, status);
+
+  ASSERT_TRUE(interpreter);
+  ASSERT_TRUE(reader);
+  status = interpreter->AllocateTensors();
+
+  EXPECT_EQ(kTfLiteOk, status);
+  const size_t tensor_size = 1 * 8 * 8 * 3;
+  // Fill input tensor i with i+1, i.e. input[0] = 1.0f, input[1] = 2.0f,
+  // input[2] = 3.0f
+
+  for (size_t i = 0; i < interpreter->inputs().size(); i++) {
+    int input_tensor_idx = interpreter->inputs()[i];
+    TfLiteTensor* tensor = interpreter->tensor(input_tensor_idx);
+    ASSERT_EQ(tensor->bytes, tensor_size * sizeof(float));
+    for (size_t j = 0; j < tensor_size; j++) {
+      tensor->data.f[j] = i + 1;
+    }
+  }
+  status = interpreter->Invoke();
+  ASSERT_EQ(kTfLiteOk, status);
+  const float eps = 1e-6f;
+  // Verify that min max of tensors.
+  std::unordered_map<int, CalibrationReader::CalibrationStats> stats;
+  status = reader->GetTensorStatsAsMap(&stats);
+  EXPECT_EQ(kTfLiteOk, status);
+  EXPECT_EQ(7, stats.size());
+  const float expected_values[7] = {
+      1.0f,  // input 0
+      2.0f,  // input 1
+      3.0f,  // input 2
+      4.0f,  // input 3
+      5.0f,  // Add(1, 2)
+      6.0f,  // Output 5: Add(0, Add(1,2))
+      9.0f,  // Output 6: Add(Add(1,2), 3)
+  };
+  for (int tensor_idx = 0; tensor_idx < 7; tensor_idx++) {
+    EXPECT_NEAR(stats.at(tensor_idx).min, expected_values[tensor_idx], eps);
+    EXPECT_NEAR(stats.at(tensor_idx).max, expected_values[tensor_idx], eps);
+  }
+  // Set input[0][0] = 1.5 and input[0][1] = 0.5 this should change the values
+  // only for input[0] and tensor 4 and ouputs 5, 6.
+  TfLiteTensor* input0 = interpreter->tensor(0);
+  input0->data.f[0] = 1.5f;
+  input0->data.f[1] = 0.5f;
+  status = interpreter->Invoke();
+  ASSERT_EQ(kTfLiteOk, status);
+  status = reader->GetTensorStatsAsMap(&stats);
+  EXPECT_EQ(kTfLiteOk, status);
+  EXPECT_EQ(7, stats.size());
+  EXPECT_NEAR(stats.at(0).min, 0.5f, eps);
+  EXPECT_NEAR(stats.at(0).max, 1.5f, eps);
+
+  for (int tensor_idx = 1; tensor_idx < 5; tensor_idx++) {
+    EXPECT_NEAR(stats.at(tensor_idx).min, expected_values[tensor_idx], eps);
+    EXPECT_NEAR(stats.at(tensor_idx).max, expected_values[tensor_idx], eps);
+  }
+
+  EXPECT_NEAR(stats.at(5).min, 5.5f, eps);
+  EXPECT_NEAR(stats.at(5).max, 6.5f, eps);
+
+  EXPECT_NEAR(stats.at(6).min, 9.0f, eps);
+  EXPECT_NEAR(stats.at(6).max, 9.0f, eps);
+}
+
+}  // namespace
+}  // namespace calibration
+}  // namespace optimize
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  tensorflow::string model_file;
+  const std::vector<tensorflow::Flag> flag_list = {
+      tensorflow::Flag("test_model_file", &model_file,
+                       "Path to test tflite model file."),
+  };
+
+  const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
+  if (!parse_result) {
+    std::cerr << "Required test_model_file\n";
+    std::abort();
+  }
+  g_test_model_file = new tensorflow::string(model_file);
+  ::tensorflow::port::InitMain(argv[0], &argc, &argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/tools/optimize/calibration/logging_op_resolver.cc b/tensorflow/lite/tools/optimize/calibration/logging_op_resolver.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d2a09e898ae213c9a2aaa6e7e26adb6eda638a67
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/calibration/logging_op_resolver.cc
@@ -0,0 +1,61 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/tools/optimize/calibration/logging_op_resolver.h"
+
+#include "absl/memory/memory.h"
+
+namespace tflite {
+namespace optimize {
+namespace calibration {
+
+LoggingOpResolver::LoggingOpResolver(const BuiltinOpsSet& ops_to_replace,
+                                     const OpResolver& base_resolver,
+                                     KernelEvalFuncPtr logging_eval_fn) {
+  for (const auto& op_and_version : ops_to_replace) {
+    const TfLiteRegistration* base_registration =
+        base_resolver.FindOp(op_and_version.first, op_and_version.second);
+    BuiltinOperatorKey key = op_and_version;
+    builtin_op_evalfn_map_[key] = base_registration->invoke;
+    std::unique_ptr<TfLiteRegistration> logging_registation =
+        absl::make_unique<TfLiteRegistration>(*base_registration);
+    logging_registation->invoke = logging_eval_fn;
+    builtin_op_registration_map_[key] = std::move(logging_registation);
+  }
+}
+
+const TfLiteRegistration* LoggingOpResolver::FindOp(BuiltinOperator op,
+                                                    int version) const {
+  BuiltinOperatorKey key = {op, version};
+  if (builtin_op_registration_map_.find(key) !=
+      builtin_op_registration_map_.end()) {
+    return builtin_op_registration_map_.at(key).get();
+  }
+
+  return nullptr;
+}
+
+KernelEvalFuncPtr LoggingOpResolver::GetWrappedKernelInvoke(BuiltinOperator op,
+                                                            int version) const {
+  return builtin_op_evalfn_map_.at({op, version});
+}
+
+const TfLiteRegistration* LoggingOpResolver::FindOp(const char* op,
+                                                    int version) const {
+  // TODO(b/121374947): Support custom ops as well.
+  return nullptr;
+}
+}  // namespace calibration
+}  // namespace optimize
+}  // namespace tflite
diff --git a/tensorflow/lite/tools/optimize/calibration/logging_op_resolver.h b/tensorflow/lite/tools/optimize/calibration/logging_op_resolver.h
new file mode 100644
index 0000000000000000000000000000000000000000..af4127e42f76dcdcfff00bee4b811dd20111165d
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/calibration/logging_op_resolver.h
@@ -0,0 +1,59 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOOLS_OPTIMIZE_LOGGING_OP_RESOLVER_H_
+#define TENSORFLOW_LITE_TOOLS_OPTIMIZE_LOGGING_OP_RESOLVER_H_
+
+#include <set>
+#include <unordered_map>
+
+#include "tensorflow/lite/core/api/op_resolver.h"
+#include "tensorflow/lite/mutable_op_resolver.h"
+#include "tensorflow/lite/op_resolver.h"
+#include "tensorflow/lite/tools/optimize/calibration/calibration_common.h"
+
+namespace tflite {
+namespace optimize {
+namespace calibration {
+// A resolver that replaces the kernel invocations with a wrapper
+// eval function.
+class LoggingOpResolver : public OpResolver {
+ public:
+  // Creates an instance of |LoggingOpResolver|.
+  // All |TfLiteRegistration.invoke| functions are replaced by
+  // |logging_eval_fn|.
+  // TODO(shashishekhar): This interface needs to change for custom ops and
+  // BuiltinOps that need special logging implementations.
+  LoggingOpResolver(const BuiltinOpsSet& ops_to_replace,
+                    const OpResolver& base_resolver,
+                    KernelEvalFuncPtr logging_eval_fn);
+
+  const TfLiteRegistration* FindOp(BuiltinOperator op,
+                                   int version) const override;
+
+  KernelEvalFuncPtr GetWrappedKernelInvoke(BuiltinOperator op,
+                                           int version) const;
+  const TfLiteRegistration* FindOp(const char* op, int version) const override;
+
+ private:
+  BuiltinOpsMap<std::unique_ptr<TfLiteRegistration>>
+      builtin_op_registration_map_;
+  BuiltinOpsMap<KernelEvalFuncPtr> builtin_op_evalfn_map_;
+};
+
+}  // namespace calibration
+}  // namespace optimize
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_TOOLS_OPTIMIZE_LOGGING_OP_RESOLVER_H_
diff --git a/tensorflow/lite/tools/optimize/calibration/logging_op_resolver_test.cc b/tensorflow/lite/tools/optimize/calibration/logging_op_resolver_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d8d29ad8eff0cea0967a6d0e91e84714b5fbe80f
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/calibration/logging_op_resolver_test.cc
@@ -0,0 +1,141 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/tools/optimize/calibration/logging_op_resolver.h"
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/mutable_op_resolver.h"
+
+namespace tflite {
+namespace optimize {
+namespace calibration {
+namespace {
+
+TfLiteStatus ConvPrepare(TfLiteContext* context, TfLiteNode* node) {
+  return kTfLiteOk;
+}
+
+TfLiteStatus ConvEval(TfLiteContext* context, TfLiteNode* node) {
+  return kTfLiteOk;
+}
+
+TfLiteStatus AddPrepare(TfLiteContext* context, TfLiteNode* node) {
+  return kTfLiteOk;
+}
+
+TfLiteStatus AddEval(TfLiteContext* context, TfLiteNode* node) {
+  return kTfLiteOk;
+}
+
+TfLiteStatus WrappingInvoke(TfLiteContext* context, TfLiteNode* node) {
+  return kTfLiteOk;
+}
+
+TEST(LoggingOpResolverTest, KernelInvokesAreReplaced) {
+  MutableOpResolver base_resolver;
+  TfLiteRegistration conv_registration = {};
+  conv_registration.prepare = ConvPrepare;
+  conv_registration.invoke = ConvEval;
+
+  base_resolver.AddBuiltin(BuiltinOperator_CONV_2D, &conv_registration);
+
+  TfLiteRegistration add_registration = {};
+  add_registration.prepare = AddPrepare;
+  add_registration.invoke = AddEval;
+
+  base_resolver.AddBuiltin(BuiltinOperator_ADD, &add_registration);
+  BuiltinOpsSet ops_to_replace = {
+      {BuiltinOperator_CONV_2D, /*version*/ 1},
+      {BuiltinOperator_ADD, /*version*/ 1},
+  };
+
+  LoggingOpResolver resolver(ops_to_replace, base_resolver, WrappingInvoke);
+
+  auto reg = resolver.FindOp(BuiltinOperator_CONV_2D, 1);
+
+  EXPECT_EQ(reg->builtin_code, BuiltinOperator_CONV_2D);
+  EXPECT_TRUE(reg->prepare == ConvPrepare);
+  EXPECT_TRUE(reg->invoke == WrappingInvoke);
+
+  reg = resolver.FindOp(BuiltinOperator_ADD, 1);
+
+  EXPECT_EQ(reg->builtin_code, BuiltinOperator_ADD);
+  EXPECT_TRUE(reg->prepare == AddPrepare);
+  EXPECT_TRUE(reg->invoke == WrappingInvoke);
+}
+
+TEST(LoggingOpResolverTest, OriginalKernelInvokesAreRetained) {
+  MutableOpResolver base_resolver;
+  TfLiteRegistration conv_registration = {};
+  conv_registration.prepare = ConvPrepare;
+  conv_registration.invoke = ConvEval;
+
+  base_resolver.AddBuiltin(BuiltinOperator_CONV_2D, &conv_registration);
+
+  TfLiteRegistration add_registration = {};
+  add_registration.prepare = AddPrepare;
+  add_registration.invoke = AddEval;
+
+  base_resolver.AddBuiltin(BuiltinOperator_ADD, &add_registration);
+  BuiltinOpsSet ops_to_replace = {
+      {BuiltinOperator_CONV_2D, /*version*/ 1},
+      {BuiltinOperator_ADD, /*version*/ 1},
+  };
+
+  LoggingOpResolver resolver(ops_to_replace, base_resolver, WrappingInvoke);
+  auto kernel_invoke =
+      resolver.GetWrappedKernelInvoke(BuiltinOperator_CONV_2D, 1);
+  EXPECT_TRUE(kernel_invoke == ConvEval);
+  kernel_invoke = resolver.GetWrappedKernelInvoke(BuiltinOperator_ADD, 1);
+  EXPECT_TRUE(kernel_invoke == AddEval);
+}
+
+TEST(LoggingOpResolverTest, OnlyOpsInReplacementSetAreReplaces) {
+  MutableOpResolver base_resolver;
+  TfLiteRegistration conv_registration = {};
+  conv_registration.prepare = ConvPrepare;
+  conv_registration.invoke = ConvEval;
+
+  base_resolver.AddBuiltin(BuiltinOperator_CONV_2D, &conv_registration);
+
+  TfLiteRegistration add_registration = {};
+  add_registration.prepare = AddPrepare;
+  add_registration.invoke = AddEval;
+
+  base_resolver.AddBuiltin(BuiltinOperator_ADD, &add_registration);
+  // Only replace conv2d
+  BuiltinOpsSet ops_to_replace = {
+      {BuiltinOperator_CONV_2D, /*version*/ 1},
+  };
+
+  LoggingOpResolver resolver(ops_to_replace, base_resolver, WrappingInvoke);
+  auto reg = resolver.FindOp(BuiltinOperator_CONV_2D, 1);
+  EXPECT_EQ(reg->builtin_code, BuiltinOperator_CONV_2D);
+  EXPECT_TRUE(reg->prepare == ConvPrepare);
+  EXPECT_TRUE(reg->invoke == WrappingInvoke);
+
+  reg = resolver.FindOp(BuiltinOperator_ADD, 1);
+  EXPECT_EQ(nullptr, reg);
+}
+
+}  // namespace
+}  // namespace calibration
+}  // namespace optimize
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  // On Linux, add: FLAGS_logtostderr = true;
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/tools/optimize/calibration/node_info_delegate.cc b/tensorflow/lite/tools/optimize/calibration/node_info_delegate.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2b9197498b03dad6a37b7370ce2a0d2751ac9bcd
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/calibration/node_info_delegate.cc
@@ -0,0 +1,66 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/tools/optimize/calibration/node_info_delegate.h"
+
+namespace tflite {
+namespace optimize {
+namespace calibration {
+
+namespace {
+// The prepare function for delegate that forwards the prepare call to the
+// delegate observer in node info delegate params.
+// The function simply calls a delegate observer OnDelegatePrepareMethod.
+TfLiteStatus NodeInfoDelegatePrepare(TfLiteContext* context,
+                                     TfLiteDelegate* delegate) {
+  if (delegate == nullptr) return TfLiteStatus::kTfLiteError;
+
+  NodeInfoDelegateParams* params =
+      reinterpret_cast<NodeInfoDelegateParams*>(delegate->data_);
+  return params->delegate_observer->OnDelegatePrepareCalled(context);
+}
+}  // namespace
+
+TfLiteDelegate CreateNodeInfoDelegate(NodeInfoDelegateParams* params) {
+  return {/*data_ */ params,
+          /* Prepare */ NodeInfoDelegatePrepare,
+          /* CopyFromBufferHandle*/ nullptr,
+          /* CopyToBufferHandle*/ nullptr,
+          /* FreeBufferHandle*/ nullptr};
+}
+
+TfLiteStatus NodeInfoDelegateObserver::OnDelegatePrepareCalled(
+    TfLiteContext* context) {
+  context_ = context;
+  const size_t num_nodes = node_index_opinfo_map_.size();
+  for (size_t node_index = 0; node_index < num_nodes; node_index++) {
+    TfLiteNode* node = nullptr;
+    TfLiteRegistration* reg = nullptr;
+    TF_LITE_ENSURE_STATUS(
+        context->GetNodeAndRegistration(context, node_index, &node, &reg));
+    auto op_info = node_index_opinfo_map_.at(node_index);
+    op_info.registration = reg;
+    node_ptr_opinfo_map_->insert({node, op_info});
+  }
+
+  if (node_ptr_opinfo_map_->size() != node_index_opinfo_map_.size()) {
+    // Something wrong.
+    return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace calibration
+}  // namespace optimize
+}  // namespace tflite
diff --git a/tensorflow/lite/tools/optimize/calibration/node_info_delegate.h b/tensorflow/lite/tools/optimize/calibration/node_info_delegate.h
new file mode 100644
index 0000000000000000000000000000000000000000..56f6141f21dc3f807c53ac5e92833597f6cef4a9
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/calibration/node_info_delegate.h
@@ -0,0 +1,67 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOOLS_OPTIMIZE_NODE_INFO_DELEGATE_H_
+#define TENSORFLOW_LITE_TOOLS_OPTIMIZE_NODE_INFO_DELEGATE_H_
+
+#include <unordered_map>
+
+#include "tensorflow/lite/context.h"
+#include "tensorflow/lite/tools/optimize/calibration/calibration_common.h"
+
+namespace tflite {
+namespace optimize {
+namespace calibration {
+
+// An interface for delegate observer that can listen to TfLiteDelegate::Prepare
+// calls.
+class DelegateObserver {
+ public:
+  virtual TfLiteStatus OnDelegatePrepareCalled(TfLiteContext* context) = 0;
+  virtual ~DelegateObserver() {}
+};
+
+// The parameters for the node info delegate.
+struct NodeInfoDelegateParams {
+  DelegateObserver* delegate_observer;
+};
+
+// Creates a delegate with the given |params|.
+TfLiteDelegate CreateNodeInfoDelegate(NodeInfoDelegateParams* params);
+
+// A delegate observer that can construct the map from TfLiteNode* ->
+// OperatorInfo.
+class NodeInfoDelegateObserver : public DelegateObserver {
+ public:
+  NodeInfoDelegateObserver(
+      const std::unordered_map<int, OperatorInfo>& node_index_to_op,
+      std::unordered_map<const TfLiteNode*, OperatorInfo>* node_ptr_opinfo_map)
+      : node_index_opinfo_map_(node_index_to_op),
+        node_ptr_opinfo_map_(node_ptr_opinfo_map) {}
+
+  TfLiteStatus OnDelegatePrepareCalled(TfLiteContext* context) override;
+
+  // Returns the context that was used to called the prepare method.
+  const TfLiteContext* GetContext() const { return context_; }
+
+ private:
+  const TfLiteContext* context_ = nullptr;
+  const std::unordered_map<int, OperatorInfo>& node_index_opinfo_map_;
+  std::unordered_map<const TfLiteNode*, OperatorInfo>* node_ptr_opinfo_map_;
+};
+
+}  // namespace calibration
+}  // namespace optimize
+}  // namespace tflite
+#endif  // TENSORFLOW_LITE_TOOLS_OPTIMIZE_NODE_INFO_DELEGATE_H_
diff --git a/tensorflow/lite/tools/optimize/calibration/node_info_delegate_test.cc b/tensorflow/lite/tools/optimize/calibration/node_info_delegate_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b110174b6325a8daadacfd472e62321ef69425f7
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/calibration/node_info_delegate_test.cc
@@ -0,0 +1,178 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <unordered_map>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/util/command_line_flags.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/tools/optimize/calibration/node_info_delegate.h"
+#include "tensorflow/lite/tools/optimize/test_util.h"
+
+namespace {
+tensorflow::string* g_test_model_dir = nullptr;
+}  // namespace
+
+namespace tflite {
+namespace optimize {
+namespace calibration {
+namespace {
+
+std::unique_ptr<FlatBufferModel> ReadModel(const char* model) {
+  auto model_path = tensorflow::io::JoinPath(*g_test_model_dir, model);
+  return FlatBufferModel::BuildFromFile(model_path.c_str());
+}
+
+std::unique_ptr<FlatBufferModel> ReadModel() {
+  return ReadModel(internal::kConvModelWith0Plus10Weights);
+}
+
+class TestDelegateObserver : public DelegateObserver {
+ public:
+  explicit TestDelegateObserver(TfLiteStatus status_to_return)
+      : status_to_return_(status_to_return) {}
+
+  TfLiteStatus OnDelegatePrepareCalled(TfLiteContext* context) override {
+    num_times_called_++;
+    return status_to_return_;
+  }
+  int num_times_called() { return num_times_called_; }
+
+ private:
+  int num_times_called_ = 0;
+  TfLiteStatus status_to_return_;
+};
+
+TEST(NodeInfoDelegateTest, DelegateObserverIsCalled) {
+  TestDelegateObserver observer(kTfLiteOk);
+  NodeInfoDelegateParams params;
+  params.delegate_observer = &observer;
+  auto model = ReadModel();
+  ASSERT_TRUE(model);
+  std::unique_ptr<Interpreter> interpreter;
+  ASSERT_EQ(InterpreterBuilder(*model,
+                               ops::builtin::BuiltinOpResolver{})(&interpreter),
+            kTfLiteOk);
+  ASSERT_TRUE(interpreter);
+  EXPECT_EQ(0, observer.num_times_called());
+  TfLiteDelegate delegate = CreateNodeInfoDelegate(&params);
+
+  auto status = interpreter->ModifyGraphWithDelegate(&delegate);
+  EXPECT_EQ(kTfLiteOk, status);
+  EXPECT_EQ(1, observer.num_times_called());
+}
+
+TEST(NodeInfoDelegateTest, ObserverErrorCausesModifyGraphFailure) {
+  // Observer returns error
+  TestDelegateObserver observer(kTfLiteError);
+  NodeInfoDelegateParams params;
+  params.delegate_observer = &observer;
+  auto model = ReadModel();
+  ASSERT_TRUE(model);
+  std::unique_ptr<Interpreter> interpreter;
+  ASSERT_EQ(InterpreterBuilder(*model,
+                               ops::builtin::BuiltinOpResolver{})(&interpreter),
+            kTfLiteOk);
+  ASSERT_TRUE(interpreter);
+  TfLiteDelegate delegate = CreateNodeInfoDelegate(&params);
+
+  auto status = interpreter->ModifyGraphWithDelegate(&delegate);
+  EXPECT_EQ(kTfLiteError, status);
+}
+
+TEST(NodeInfoDelegateTest, NodeInfoDelegateObserver) {
+  auto model = ReadModel();
+  ASSERT_TRUE(model);
+
+  std::unordered_map<int, OperatorInfo> index_to_opinfo;
+  auto primary_subgraph = model->GetModel()->subgraphs()->Get(0);
+  auto operators = primary_subgraph->operators();
+  auto subgraph_tensors = primary_subgraph->tensors();
+  for (size_t i = 0; i < operators->size(); i++) {
+    OperatorInfo info;
+    auto op_inputs = operators->Get(i)->inputs();
+    auto op_outputs = operators->Get(i)->outputs();
+    info.inputs = std::vector<int>(op_inputs->begin(), op_inputs->end());
+    info.outputs = std::vector<int>(op_outputs->begin(), op_outputs->end());
+    index_to_opinfo[i] = info;
+  }
+
+  std::unordered_map<const TfLiteNode*, OperatorInfo> node_to_opinfo;
+  NodeInfoDelegateObserver observer(index_to_opinfo, &node_to_opinfo);
+  NodeInfoDelegateParams params;
+  params.delegate_observer = &observer;
+  std::unique_ptr<Interpreter> interpreter;
+  ASSERT_EQ(InterpreterBuilder(*model,
+                               ops::builtin::BuiltinOpResolver{})(&interpreter),
+            kTfLiteOk);
+  ASSERT_TRUE(interpreter);
+
+  TfLiteDelegate delegate = CreateNodeInfoDelegate(&params);
+
+  auto status = interpreter->ModifyGraphWithDelegate(&delegate);
+  EXPECT_EQ(kTfLiteOk, status);
+  EXPECT_EQ(index_to_opinfo.size(), node_to_opinfo.size());
+  EXPECT_EQ(interpreter->nodes_size(), node_to_opinfo.size());
+
+  for (const auto& node_and_opinfo : node_to_opinfo) {
+    const TfLiteNode* tflite_node = node_and_opinfo.first;
+    const OperatorInfo& info = node_and_opinfo.second;
+    ASSERT_EQ(tflite_node->inputs->size, info.inputs.size());
+    ASSERT_EQ(tflite_node->outputs->size, info.outputs.size());
+
+    for (size_t input_index = 0; input_index < info.inputs.size();
+         input_index++) {
+      const TfLiteTensor* tflite_tensor =
+          interpreter->tensor(tflite_node->inputs->data[input_index]);
+      EXPECT_EQ(tflite_tensor->name,
+                subgraph_tensors->Get(info.inputs[input_index])->name()->str());
+    }
+
+    for (size_t output_index = 0; output_index < info.outputs.size();
+         output_index++) {
+      const TfLiteTensor* tflite_tensor =
+          interpreter->tensor(tflite_node->outputs->data[output_index]);
+      EXPECT_EQ(
+          tflite_tensor->name,
+          subgraph_tensors->Get(info.outputs[output_index])->name()->str());
+    }
+  }
+}
+
+}  // namespace
+}  // namespace calibration
+}  // namespace optimize
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  tensorflow::string model_file;
+  const std::vector<tensorflow::Flag> flag_list = {
+      tensorflow::Flag("test_model_file", &model_file,
+                       "Path to test tflite model file."),
+  };
+
+  const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
+  if (!parse_result) {
+    std::cerr << "Required test_model_file\n";
+    std::abort();
+  }
+  g_test_model_dir =
+      new tensorflow::string(tensorflow::io::Dirname(model_file));
+  ::tensorflow::port::InitMain(argv[0], &argc, &argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/tools/optimize/quantization_utils.cc b/tensorflow/lite/tools/optimize/quantization_utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..445fffb8dd4256b001f72576902f47f425ef9161
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/quantization_utils.cc
@@ -0,0 +1,163 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/tools/optimize/quantization_utils.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/round.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+#include <cmath>
+#include <cstdint>
+
+namespace tflite {
+namespace optimize {
+namespace utils {
+
+namespace {
+const int8_t kMinQuantizedValue = -127;
+const int8_t kMaxQuantizedValue = 127;
+}  // namespace
+
+TfLiteStatus NumElements(const TensorT& tensor, uint64_t* num_elements) {
+  if (tensor.shape.empty()) {
+    return kTfLiteError;
+  }
+  *num_elements = 1;
+  for (const uint64_t dim : tensor.shape) {
+    *num_elements *= dim;
+  }
+  return kTfLiteOk;
+}
+
+// Nudge min and max so that floating point 0 falls exactly on a quantized
+// value, returning the nudges scale and zero_point.
+//
+// Although this code originates from FakeQuantization in quantized training,
+// we may deviate from that implementation as we please since we do not fine
+// tune the weights with quantized training.
+void GetAsymmetricQuantizationParams(
+    float min, float max, const int quant_min, const int quant_max,
+    QuantizationParametersT* quantization_params) {
+  const float quant_min_float = static_cast<float>(quant_min);
+  const float quant_max_float = static_cast<float>(quant_max);
+  // Adjust the boundaries to guarantee 0 is included.
+  min = std::min(static_cast<float>(min), 0.0f);
+  max = std::max(static_cast<float>(max), 0.0f);
+  const float scale = (max - min) / (quant_max_float - quant_min_float);
+  // Scale can be zero if min and max are exactly 0.0f.
+  float zero_point_from_min = quant_min_float;
+  if (scale != 0) {
+    zero_point_from_min = quant_min_float - min / scale;
+  }
+  int64_t zero_point;
+  if (zero_point_from_min < quant_min_float) {
+    zero_point = static_cast<int64_t>(quant_min);
+  } else if (zero_point_from_min > quant_max_float) {
+    zero_point = static_cast<int64_t>(quant_max);
+  } else {
+    zero_point = static_cast<int64_t>(std::round(zero_point_from_min));
+  }
+  quantization_params->min = std::vector<float>(1, min);
+  quantization_params->max = std::vector<float>(1, max);
+  quantization_params->scale = std::vector<float>(1, scale);
+  quantization_params->zero_point = std::vector<int64_t>(1, zero_point);
+}
+
+// Per-channel quantize a tensor at the given index and returns both scales and
+// quantized values.
+void SymmetricPerChannelQuantization(const float* const input,
+                                     const std::vector<int>& dimension,
+                                     int32_t channel_dim_index,
+                                     std::vector<float>* output_scales,
+                                     std::vector<int8_t>* output_value) {
+  const int32_t channel_dim_size = dimension[channel_dim_index];
+  std::vector<float> min_vals(channel_dim_size);
+  std::vector<float> max_vals(channel_dim_size);
+  std::vector<bool> has_min_max_value(channel_dim_size, false);
+  int indices[4];
+  RuntimeShape tensor_dims{dimension[0], dimension[1], dimension[2],
+                           dimension[3]};
+
+  // Compute min max ranges per channel
+  for (indices[0] = 0; indices[0] < dimension[0]; indices[0]++) {
+    for (indices[1] = 0; indices[1] < dimension[1]; indices[1]++) {
+      for (indices[2] = 0; indices[2] < dimension[2]; indices[2]++) {
+        for (indices[3] = 0; indices[3] < dimension[3]; indices[3]++) {
+          int channel_idx = indices[channel_dim_index];
+          const float val = input[Offset(tensor_dims, indices)];
+          if (has_min_max_value[channel_idx]) {
+            if (min_vals[channel_idx] > val) {
+              min_vals[channel_idx] = val;
+            } else if (max_vals[channel_idx] < val) {
+              max_vals[channel_idx] = val;
+            }
+          } else {
+            min_vals[channel_idx] = val;
+            max_vals[channel_idx] = val;
+            has_min_max_value[channel_idx] = true;
+          }
+        }
+      }
+    }
+  }
+
+  // Calculate scales per channel
+  std::vector<float> scale_invs(channel_dim_size);
+  const float half_scale = kMaxQuantizedValue;
+  for (size_t channel_idx = 0; channel_idx < channel_dim_size; channel_idx++) {
+    const float half_range = std::max(std::abs(min_vals[channel_idx]),
+                                      std::abs(max_vals[channel_idx]));
+    output_scales->at(channel_idx) = half_range / half_scale;
+    if (half_range == 0) {
+      scale_invs[channel_idx] = 0;
+    } else {
+      scale_invs[channel_idx] = half_scale / half_range;
+    }
+  }
+
+  // Quantize the values.
+  SymmetricPerChannelQuantizeValues(input, scale_invs, dimension,
+                                    channel_dim_index, output_value);
+}
+
+void SymmetricPerChannelQuantizeValues(const float* const input,
+                                       const std::vector<float>& scales_inv,
+                                       const std::vector<int>& dimension,
+                                       int32_t channel_dim_index,
+                                       std::vector<int8_t>* output_value) {
+  // Quantize the values.
+  int indices[4];
+  RuntimeShape tensor_dims{dimension[0], dimension[1], dimension[2],
+                           dimension[3]};
+  for (indices[0] = 0; indices[0] < dimension[0]; indices[0]++) {
+    for (indices[1] = 0; indices[1] < dimension[1]; indices[1]++) {
+      for (indices[2] = 0; indices[2] < dimension[2]; indices[2]++) {
+        for (indices[3] = 0; indices[3] < dimension[3]; indices[3]++) {
+          int channel_idx = indices[channel_dim_index];
+          int index = Offset(tensor_dims, indices);
+          const float val = input[index];
+          const int32_t quantized_value =
+              static_cast<int32_t>(TfLiteRound(val * scales_inv[channel_idx]));
+          output_value->at(index) = std::min<int8_t>(
+              kMaxQuantizedValue,
+              std::max<int8_t>(kMinQuantizedValue, quantized_value));
+        }
+      }
+    }
+  }
+}
+
+}  // namespace utils
+}  // namespace optimize
+}  // namespace tflite
diff --git a/tensorflow/lite/tools/optimize/quantization_utils.h b/tensorflow/lite/tools/optimize/quantization_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..d20b3176bf389be1a9661610426e3b1403a3ef4d
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/quantization_utils.h
@@ -0,0 +1,67 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOOLS_OPTIMIZE_QUANTIZATION_UTILS_H_
+#define TENSORFLOW_LITE_TOOLS_OPTIMIZE_QUANTIZATION_UTILS_H_
+
+#include <cstdint>
+
+#include "tensorflow/lite/context.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+
+namespace tflite {
+namespace optimize {
+namespace utils {
+
+// Returns the number of elements in the given tensor.
+TfLiteStatus NumElements(const TensorT& tensor, uint64_t* num_elements);
+
+// Populates the scale and zero point for quantization parameters.
+//
+// Nudges min and max so that floating point 0 falls exactly on a quantized
+// value, returning the nudges scale and zero_point.
+void GetAsymmetricQuantizationParams(
+    float min, float max, const int quant_min, const int quant_max,
+    QuantizationParametersT* quantization_params);
+
+// Per-channel quantize a tensor at the given index and returns both scales and
+// quantized values.
+// Parameters:
+// - input is the float input data to be quantized.
+// - dimension is the dimension of the input data. Only supports dimension of
+//   size 4.
+// - channel_dim_index is the channel index within "dimension".
+//   dimension[channel_dim_index] gives the number of channels.
+// - output_scale is the output scale, the size of which equals the number of
+//   channels.
+// - output_value is the output data, the size of which equals the number of
+//   inputs.
+void SymmetricPerChannelQuantization(const float* const input,
+                                     const std::vector<int>& dimension,
+                                     int32_t channel_dim_index,
+                                     std::vector<float>* output_scales,
+                                     std::vector<int8_t>* output_value);
+
+// Quantize the values given an array of scales.
+void SymmetricPerChannelQuantizeValues(const float* const input,
+                                       const std::vector<float>& scales_inv,
+                                       const std::vector<int>& dimension,
+                                       int32_t channel_dim_index,
+                                       std::vector<int8_t>* output_value);
+
+}  // namespace utils
+}  // namespace optimize
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_TOOLS_OPTIMIZE_QUANTIZATION_UTILS_H_
diff --git a/tensorflow/lite/tools/optimize/quantization_utils_test.cc b/tensorflow/lite/tools/optimize/quantization_utils_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ecad09ed61225c2b6e0ed5a20b52561e1e2c35ef
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/quantization_utils_test.cc
@@ -0,0 +1,212 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/tools/optimize/quantization_utils.h"
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+namespace tflite {
+namespace optimize {
+namespace utils {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+TEST(QuantizationUtilsTest, NumElements) {
+  TensorT tensor;
+  tensor.shape = {1, 2, 3, 4};
+  uint64_t num_elements;
+  EXPECT_EQ(kTfLiteOk, NumElements(tensor, &num_elements));
+  EXPECT_EQ(num_elements, 1 * 2 * 3 * 4);
+
+  tensor.shape = {5};
+  EXPECT_EQ(kTfLiteOk, NumElements(tensor, &num_elements));
+  EXPECT_EQ(num_elements, 5);
+
+  tensor.shape = {};
+  EXPECT_EQ(kTfLiteError, NumElements(tensor, &num_elements));
+}
+
+TEST(QuantizationUtilsTest, GetAsymmetricQuantizationParamsUnitRange) {
+  const float float_min = -128.0;
+  const float float_max = 127.0;
+  const int quant_min = -128;
+  const int quant_max = 127;
+  QuantizationParametersT params;
+  GetAsymmetricQuantizationParams(float_min, float_max, quant_min, quant_max,
+                                  &params);
+  ASSERT_EQ(params.max.size(), 1);
+  ASSERT_EQ(params.min.size(), 1);
+  ASSERT_EQ(params.scale.size(), 1);
+  ASSERT_EQ(params.zero_point.size(), 1);
+  EXPECT_EQ(params.max[0], float_max);
+  EXPECT_EQ(params.min[0], float_min);
+
+  int64_t zero_point = params.zero_point[0];
+  float scale = params.scale[0];
+  const float eps = 1e-7f;
+  EXPECT_EQ(zero_point, 0);
+  EXPECT_NEAR(scale, 1, eps);
+}
+
+TEST(QuantizationUtilsTest, AsymmetricQuantizationParamsWithAllPositiveRange) {
+  // The min should get nudged to include 0, so the effective range is [0, 6].
+  const float float_min = 1.0;
+  const float float_max = 6.0;
+  const int quant_min = -128;
+  const int quant_max = 127;
+  QuantizationParametersT params;
+  GetAsymmetricQuantizationParams(float_min, float_max, quant_min, quant_max,
+                                  &params);
+  ASSERT_EQ(params.max.size(), 1);
+  ASSERT_EQ(params.min.size(), 1);
+  ASSERT_EQ(params.scale.size(), 1);
+  ASSERT_EQ(params.zero_point.size(), 1);
+  EXPECT_EQ(params.max[0], float_max);
+  EXPECT_EQ(params.min[0], 0.0);
+  int64_t zero_point = params.zero_point[0];
+  float scale = params.scale[0];
+  const float eps = 1e-7f;
+  EXPECT_EQ(zero_point, -128);
+  EXPECT_NEAR(scale, 6 / 255.0f, eps);
+}
+
+TEST(QuantizationUtilsTest, AsymmetricQuantizationParamsWithAllNegativeRange) {
+  // The min should get nudged to include 0, so the effective range is [-6, 0].
+  const float float_min = -6.0;
+  const float float_max = -1.0;
+  const int quant_min = -128;
+  const int quant_max = 127;
+  QuantizationParametersT params;
+  GetAsymmetricQuantizationParams(float_min, float_max, quant_min, quant_max,
+                                  &params);
+  ASSERT_EQ(params.max.size(), 1);
+  ASSERT_EQ(params.min.size(), 1);
+  ASSERT_EQ(params.scale.size(), 1);
+  ASSERT_EQ(params.zero_point.size(), 1);
+  EXPECT_EQ(params.max[0], 0.0);
+  EXPECT_EQ(params.min[0], float_min);
+  int64_t zero_point = params.zero_point[0];
+  float scale = params.scale[0];
+  const float eps = 1e-7f;
+  EXPECT_EQ(zero_point, 127);
+  EXPECT_NEAR(scale, 6 / 255.0f, eps);
+}
+
+TEST(QuantizationUtilsTest, AsymmetricQuantizationParamsWithZeroInRange) {
+  const float float_min = -5.0;
+  const float float_max = 1.0;
+  const int quant_min = -128;
+  const int quant_max = 127;
+  QuantizationParametersT params;
+  GetAsymmetricQuantizationParams(float_min, float_max, quant_min, quant_max,
+                                  &params);
+  ASSERT_EQ(params.max.size(), 1);
+  ASSERT_EQ(params.min.size(), 1);
+  ASSERT_EQ(params.scale.size(), 1);
+  ASSERT_EQ(params.zero_point.size(), 1);
+  EXPECT_EQ(params.max[0], float_max);
+  EXPECT_EQ(params.min[0], float_min);
+  int64_t zero_point = params.zero_point[0];
+  float scale = params.scale[0];
+  const float eps = 1e-7f;
+  EXPECT_NEAR(scale, 6 / 255.0f, eps);
+  EXPECT_GT(zero_point, quant_min);
+  EXPECT_LT(zero_point, quant_max);
+}
+
+TEST(QuantizationUtilsTest, AsymmetricQuantizationParamsWithZeroMinMax) {
+  const float float_min = 0;
+  const float float_max = 0;
+  const int quant_min = -128;
+  const int quant_max = 127;
+  QuantizationParametersT params;
+  GetAsymmetricQuantizationParams(float_min, float_max, quant_min, quant_max,
+                                  &params);
+  ASSERT_EQ(params.max.size(), 1);
+  ASSERT_EQ(params.min.size(), 1);
+  ASSERT_EQ(params.scale.size(), 1);
+  ASSERT_EQ(params.zero_point.size(), 1);
+  EXPECT_EQ(params.max[0], float_max);
+  EXPECT_EQ(params.min[0], float_min);
+  int64_t zero_point = params.zero_point[0];
+  float scale = params.scale[0];
+  const float eps = 1e-7f;
+  EXPECT_NEAR(scale, 0, eps);
+  EXPECT_NEAR(zero_point, quant_min, eps);
+  EXPECT_LT(zero_point, quant_max);
+}
+
+TEST(QuantizationUtilsTest, SymmetricPerChannelQuantization) {
+  // Set up an input with [3, 2, 2, 2] size and 0 is the channel index.
+  const std::vector<float> input = {
+      3.0, 2.0, 5.0,  -2.0, 3.0,  2.0,  5.0,  -2.0,  // Channel 1.
+      1.0, 2.0, 3.0,  4.0,  5.0,  6.0,  7.0,  8.0,   // Channel 2.
+      1.0, 0.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0,  // Channel 3.
+  };
+  const std::vector<int32_t> dimension = {3, 2, 2, 2};
+  const int channel_index = 0;
+
+  // Create holder for output scale and data.
+  std::vector<float> output_scales(3);
+  std::vector<int8_t> output_data(3 * 2 * 2 * 2);
+
+  // Call SymmetricPerChannelQuantization and verify the result.
+  SymmetricPerChannelQuantization(input.data(), dimension, channel_index,
+                                  &output_scales, &output_data);
+  const std::vector<float> expected_output_scales = {0.0393700786, 0.0629921257,
+                                                     0.0472440943};
+  const std::vector<int8_t> expected_output_data = {
+      76, 51, 127, -51, 76,  51,  127,  -51,   // Channel 1.
+      16, 32, 48,  64,  79,  95,  111,  127,   // Channel 2.
+      21, 0,  -21, -42, -64, -85, -106, -127,  // Channel 3.
+  };
+  EXPECT_THAT(output_scales, ElementsAreArray(expected_output_scales));
+  EXPECT_THAT(output_data, ElementsAreArray(expected_output_data));
+}
+
+TEST(QuantizationUtilsTest, SymmetricPerChannelQuantizeValues) {
+  // Set up an input with [3, 1, 1, 2] size and 0 is the channel index.
+  const std::vector<float> input = {
+      13.0, 21.0,  // Channel 1.
+      21.0, 22.0,  // Channel 2.
+      31.0, 40.0,  // Channel 3.
+  };
+  const std::vector<float> scales_inv = {2, 0.5, 3};
+  const std::vector<int32_t> dimension = {3, 1, 1, 2};
+  const int channel_index = 0;
+
+  // Create holder for output data.
+  std::vector<int8_t> output_data(3 * 1 * 1 * 2);
+
+  // Call SymmetricPerChannelQuantizeValues and verify the result.
+  SymmetricPerChannelQuantizeValues(input.data(), scales_inv, dimension,
+                                    channel_index, &output_data);
+  const std::vector<int8_t> expected_output_data = {
+      26, 42,   // Channel 1.
+      11, 11,   // Channel 2.
+      93, 120,  // Channel 3.
+  };
+  EXPECT_THAT(output_data, ElementsAreArray(expected_output_data));
+}
+
+}  // namespace
+}  // namespace utils
+}  // namespace optimize
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/tools/optimize/quantize_model.cc b/tensorflow/lite/tools/optimize/quantize_model.cc
new file mode 100644
index 0000000000000000000000000000000000000000..55a9b1c580a4c08a2f9dabeee527dbc919c74467
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/quantize_model.cc
@@ -0,0 +1,63 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/tools/optimize/quantize_model.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "flatbuffers/flexbuffers.h"
+#include "tensorflow/lite/context.h"
+#include "tensorflow/lite/core/api/error_reporter.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/tools/optimize/subgraph_quantizer.h"
+
+namespace tflite {
+namespace optimize {
+
+TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
+                           ModelT* model, ErrorReporter* error_reporter) {
+  for (size_t subgraph_idx = 0; subgraph_idx < model->subgraphs.size();
+       subgraph_idx++) {
+    SubGraphT* subgraph = model->subgraphs.at(subgraph_idx).get();
+    internal::SubgraphQuantizer quantizer(model, subgraph, error_reporter);
+    for (int op_idx = 0; op_idx < subgraph->operators.size(); op_idx++) {
+      auto status = quantizer.QuantizeOperator(op_idx);
+      if (status != kTfLiteOk) {
+        OperatorT* op = subgraph->operators[op_idx].get();
+        const BuiltinOperator op_code =
+            model->operator_codes[op->opcode_index]->builtin_code;
+        error_reporter->Report(
+            "Failed to quantized operator: %s in subgraph %d, node: %d",
+            EnumNameBuiltinOperator(op_code), subgraph_idx, op_idx);
+        return kTfLiteError;
+      }
+    }
+  }
+
+  flatbuffers::Offset<Model> output_model_location =
+      Model::Pack(*builder, model);
+  FinishModelBuffer(*builder, output_model_location);
+
+  return kTfLiteOk;
+}
+
+}  // namespace optimize
+}  // namespace tflite
diff --git a/tensorflow/lite/tools/optimize/quantize_model.h b/tensorflow/lite/tools/optimize/quantize_model.h
new file mode 100644
index 0000000000000000000000000000000000000000..e4a62435d3f5c719aab60755e86928487f05e4f0
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/quantize_model.h
@@ -0,0 +1,39 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOOLS_OPTIMIZE_QUANTIZE_MODEL_H_
+#define TENSORFLOW_LITE_TOOLS_OPTIMIZE_QUANTIZE_MODEL_H_
+
+#include <memory>
+
+#include "tensorflow/lite/context.h"
+#include "tensorflow/lite/core/api/error_reporter.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+
+namespace tflite {
+namespace optimize {
+
+// Quantizes input_model and populates the provided builder with the new model.
+// input_model is required to have min/max information populated in its
+// quantization params.
+//
+// Note: This is a private API, subject to change.
+TfLiteStatus QuantizeModel(flatbuffers::FlatBufferBuilder* builder,
+                           ModelT* input_model, ErrorReporter* error_reporter);
+
+}  // namespace optimize
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_TOOLS_OPTIMIZE_QUANTIZE_MODEL_H_
diff --git a/tensorflow/lite/tools/optimize/quantize_model_test.cc b/tensorflow/lite/tools/optimize/quantize_model_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cf3eb2dde6c3aa95963178041545b9cd8a1909c7
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/quantize_model_test.cc
@@ -0,0 +1,153 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "flatbuffers/flatbuffers.h"  // TF:flatbuffers
+#include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/util/command_line_flags.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/tools/optimize/quantize_model.h"
+#include "tensorflow/lite/tools/optimize/test_util.h"
+
+namespace {
+tensorflow::string* g_test_model_dir = nullptr;
+}  // namespace
+
+namespace tflite {
+namespace optimize {
+namespace {
+
+std::unique_ptr<FlatBufferModel> ReadTestModel() {
+  auto model_path = tensorflow::io::JoinPath(
+      *g_test_model_dir, internal::kConvModelWith0Plus10Weights);
+  return FlatBufferModel::BuildFromFile(model_path.c_str());
+}
+
+template <typename T>
+std::vector<T> GetAsVector(const flatbuffers::Vector<T>* vec) {
+  return std::vector<T>(vec->begin(), vec->end());
+}
+
+class QuantizeModelTest : public testing::Test {
+ protected:
+  QuantizeModelTest() {
+    input_model_ = ReadTestModel();
+    readonly_model_ = input_model_->GetModel();
+    readonly_model_->UnPackTo(&model_);
+  }
+
+  std::unique_ptr<FlatBufferModel> input_model_;
+  const Model* readonly_model_;
+  tflite::ModelT model_;
+  flatbuffers::FlatBufferBuilder builder_;
+  internal::FailOnErrorReporter error_reporter_;
+};
+
+TEST_F(QuantizeModelTest, QuantizationSucceeds) {
+  auto status = QuantizeModel(&builder_, &model_, &error_reporter_);
+  EXPECT_EQ(status, kTfLiteOk);
+  const uint8_t* buffer = builder_.GetBufferPointer();
+  const Model* output_model = GetModel(buffer);
+  ASSERT_TRUE(output_model);
+}
+
+TEST_F(QuantizeModelTest, TensorShapesAndStructureIsUnchanged) {
+  auto status = QuantizeModel(&builder_, &model_, &error_reporter_);
+  EXPECT_EQ(status, kTfLiteOk);
+  ASSERT_EQ(model_.subgraphs.size(), readonly_model_->subgraphs()->size());
+  for (size_t subgraph_idx = 0; subgraph_idx < model_.subgraphs.size();
+       subgraph_idx++) {
+    const auto quantized_graph = model_.subgraphs[subgraph_idx].get();
+    const auto float_graph = readonly_model_->subgraphs()->Get(subgraph_idx);
+    ASSERT_EQ(quantized_graph->tensors.size(), float_graph->tensors()->size());
+    for (size_t i = 0; i < quantized_graph->tensors.size(); i++) {
+      const auto quant_tensor = quantized_graph->tensors[i].get();
+      const auto float_tensor = float_graph->tensors()->Get(i);
+      EXPECT_EQ(quant_tensor->buffer, float_tensor->buffer());
+      EXPECT_EQ(quant_tensor->is_variable, float_tensor->is_variable());
+      EXPECT_EQ(quant_tensor->shape, GetAsVector(float_tensor->shape()));
+      EXPECT_EQ(quant_tensor->name, float_tensor->name()->str());
+    }
+  }
+}
+
+TEST_F(QuantizeModelTest, OperatorsAreUnchanged) {
+  auto status = QuantizeModel(&builder_, &model_, &error_reporter_);
+  EXPECT_EQ(status, kTfLiteOk);
+  ASSERT_EQ(model_.operator_codes.size(),
+            readonly_model_->operator_codes()->size());
+  for (size_t i = 0; i < model_.operator_codes.size(); i++) {
+    const auto float_model_op = readonly_model_->operator_codes()->Get(i);
+    EXPECT_EQ(model_.operator_codes[i]->builtin_code,
+              float_model_op->builtin_code());
+    EXPECT_EQ(model_.operator_codes[i]->version, float_model_op->version());
+  }
+
+  ASSERT_EQ(model_.subgraphs.size(), readonly_model_->subgraphs()->size());
+  for (size_t subgraph_idx = 0; subgraph_idx < model_.subgraphs.size();
+       subgraph_idx++) {
+    const auto quantized_graph = model_.subgraphs[subgraph_idx].get();
+    const auto float_graph = readonly_model_->subgraphs()->Get(subgraph_idx);
+    ASSERT_EQ(quantized_graph->operators.size(),
+              float_graph->operators()->size());
+    for (size_t i = 0; i < quantized_graph->operators.size(); i++) {
+      const auto quant_op = quantized_graph->operators[i].get();
+      const auto float_op = float_graph->operators()->Get(i);
+      EXPECT_EQ(quant_op->inputs, GetAsVector(float_op->inputs()));
+      EXPECT_EQ(quant_op->outputs, GetAsVector(float_op->outputs()));
+      EXPECT_EQ(quant_op->opcode_index, float_op->opcode_index());
+    }
+  }
+}
+
+TEST_F(QuantizeModelTest, GraphIsFullyQuantized) {
+  auto status = QuantizeModel(&builder_, &model_, &error_reporter_);
+  EXPECT_EQ(status, kTfLiteOk);
+  for (const auto& subgraph : model_.subgraphs) {
+    for (const auto& tensor : subgraph->tensors) {
+      EXPECT_TRUE(tensor->type == TensorType_INT32 ||
+                  tensor->type == TensorType_INT8);
+    }
+  }
+}
+
+}  // namespace
+}  // namespace optimize
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  tensorflow::string model_file;
+  const std::vector<tensorflow::Flag> flag_list = {
+      tensorflow::Flag("test_model_file", &model_file,
+                       "Path to test tflite model file."),
+  };
+
+  const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
+  if (!parse_result) {
+    std::cerr << "Required test_model_file\n";
+    std::abort();
+  }
+  g_test_model_dir =
+      new tensorflow::string(tensorflow::io::Dirname(model_file));
+  ::tensorflow::port::InitMain(argv[0], &argc, &argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/tools/optimize/quantize_weights.cc b/tensorflow/lite/tools/optimize/quantize_weights.cc
index de3c0b03237c1c85d1cfbeafc2ce8db4faf70ff6..f0a280f1c1fc2a3ab45c8e5916d9c26254a8849b 100644
--- a/tensorflow/lite/tools/optimize/quantize_weights.cc
+++ b/tensorflow/lite/tools/optimize/quantize_weights.cc
@@ -21,11 +21,12 @@ limitations under the License.
 
 #include "flatbuffers/flexbuffers.h"
 #include "absl/memory/memory.h"
+#include "tensorflow/core/platform/logging.h"
 #include "tensorflow/lite/context.h"
 #include "tensorflow/lite/kernels/internal/tensor_utils.h"
 #include "tensorflow/lite/model.h"
 #include "tensorflow/lite/schema/schema_generated.h"
-#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/lite/tools/optimize/quantization_utils.h"
 
 namespace tflite {
 namespace optimize {
@@ -33,72 +34,36 @@ namespace optimize {
 namespace {
 
 typedef struct {
-  TensorT* tensor;
+  OperatorT* op;
+  // The index of the op in the operators vector.
+  int32_t op_idx;
   // The index of the tensor to quantize in subgraph->tensors.
-  int32_t tensor_idx;
-  // The index of the tensor of the weight tensor to be quantize in op->inputs.
   int32_t op_input_idx;
-  // True if the tensor supports hybrid evaluation.
-  bool eval_hybrid;
-} TensorInfo;
+} ConsumerOpInfo;
 
 // The default minimum number of elements a weights array must have to be
 // quantized by this transformation.
 const int kWeightsMinNumElementsDefault = 1024;
 
-// Nudge min and max so that floating point 0 falls exactly on a quantized
-// value, returning the nudges scale and zero_point.
-//
-// Although this code originates from FakeQuantization in quantized training,
-// we may deviate from that implementation as we please since we do not fine
-// tune the weights with quantized training.
-void GetAsymmetricQuantizationParams(
-    const float min, const float max, const int quant_min, const int quant_max,
-    QuantizationParametersT* quantization_params) {
-  // Adjust the boundaries to guarantee 0 is included.
-  const float quant_min_float = std::min(static_cast<float>(quant_min), 0.0f);
-  const float quant_max_float = std::max(static_cast<float>(quant_max), 0.0f);
-  const float scale = (max - min) / (quant_max_float - quant_min_float);
-  const float zero_point_from_min = quant_min_float - min / scale;
-  int64_t zero_point;
-  if (zero_point_from_min < quant_min_float) {
-    zero_point = static_cast<int64_t>(quant_min);
-  } else if (zero_point_from_min > quant_max_float) {
-    zero_point = static_cast<int64_t>(quant_max);
-  } else {
-    zero_point = static_cast<int64_t>(std::round(zero_point_from_min));
-  }
-  quantization_params->scale = std::vector<float>(1, scale);
-  quantization_params->zero_point = std::vector<int64_t>(1, zero_point);
-}
-
-// Returns the number of elements in tensor.
-uint64_t NumElements(const TensorT* tensor) {
-  if (tensor->shape.empty()) {
-    LOG(FATAL) << "Tensor has no shape information.";
-  }
-  uint64_t num_elements = 1;
-  for (const uint64_t dim : tensor->shape) {
-    num_elements *= dim;
-  }
-  return num_elements;
-}
-
-uint64_t CountTensorConsumers(const ModelT* model, const SubGraphT* subgraph,
-                              int32_t tensor_idx) {
-  uint64_t count = 0;
+// Gets the operators that consume tensor_idx.
+std::vector<ConsumerOpInfo> GetTensorConsumers(const ModelT* model,
+                                               const SubGraphT* subgraph,
+                                               int32_t tensor_idx) {
+  // TODO(suharshs): If this proves to be too slow, avoid calling it per tensor,
+  // instead doing one sweep for the entire model.
+  std::vector<ConsumerOpInfo> consumer_ops;
   for (int op_idx = 0; op_idx < subgraph->operators.size(); ++op_idx) {
-    const OperatorT* op = subgraph->operators[op_idx].get();
+    OperatorT* op = subgraph->operators[op_idx].get();
     if (op == nullptr) {
       continue;
     }
     for (int i = 0; i < op->inputs.size(); ++i) {
       if (op->inputs[i] == tensor_idx) {
-        count++;
+        consumer_ops.push_back({op, op_idx, i});
       }
     }
   }
-  return count;
+  return consumer_ops;
 }
 
 // Gets the list of op->inputs indices of the weights inputs to be quantized for
@@ -156,23 +121,39 @@ bool IsHybridEvaluationOp(const OperatorT* op, const BuiltinOperator& op_code) {
   return eval_hybrid;
 }
 
-// Returns a vector of TensorInfos for each input tensor of op that should be
-// quantized.
-std::vector<TensorInfo> GetQuantizableTensorsFromOperator(
+// Returns true if all of the op's inputs are quantized.
+bool CheckAllOpInputsQuantized(const SubGraphT* subgraph, const OperatorT* op,
+                               const BuiltinOperator& op_code) {
+  std::vector<int32_t> op_input_indices = GetWeightInputIndices(op_code);
+  for (const int32_t op_input_idx : op_input_indices) {
+    int32_t tensor_idx = op->inputs[op_input_idx];
+
+    if (tensor_idx == -1) {
+      // Optional tensor.
+      continue;
+    }
+
+    TensorT* tensor = subgraph->tensors[tensor_idx].get();
+
+    if (tensor->type != TensorType_INT8) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Inserts Tensors for each input tensor of op that should be
+// quantized into tensor_map.
+TfLiteStatus InsertQuantizableInputTensorsFromOperator(
     const ModelT* model, const OperatorT* op, uint64_t weights_min_num_elements,
-    bool use_hybrid_evaluation) {
+    std::unordered_map<int32_t, TensorT*>* tensor_map) {
   SubGraphT* subgraph = model->subgraphs.at(0).get();
   const BuiltinOperator op_code =
       model->operator_codes[op->opcode_index]->builtin_code;
 
-  std::vector<TensorInfo> tensor_infos;
-
-  bool eval_hybrid = use_hybrid_evaluation && IsHybridEvaluationOp(op, op_code);
-
   std::vector<int32_t> op_input_indices = GetWeightInputIndices(op_code);
   for (const int32_t op_input_idx : op_input_indices) {
     int32_t tensor_idx = op->inputs[op_input_idx];
-
     if (tensor_idx == -1) {
       LOG(INFO) << "Skipping optional tensor input " << op_input_idx
                 << " of operation " << EnumNameBuiltinOperator(op_code);
@@ -180,28 +161,18 @@ std::vector<TensorInfo> GetQuantizableTensorsFromOperator(
     }
 
     TensorT* tensor = subgraph->tensors[tensor_idx].get();
-    // TODO(suharshs): Support shared weights, i.e. If two tensors share the
-    // same weight array, things may break. (i.e. SSD object detection)
-    if (CountTensorConsumers(model, subgraph, tensor_idx) != 1) {
-      LOG(INFO) << "Skipping quantization of tensor " << tensor->name
-                << " that is shared between multiple multiple operations.";
-      continue;
-    }
-
     if (tensor->type != TensorType_FLOAT32) {
       LOG(INFO) << "Skipping quantization of tensor " << tensor->name
                 << " that is not type float.";
       continue;
     }
 
-    const uint64_t num_elements = NumElements(tensor);
+    uint64_t num_elements;
+    TF_LITE_ENSURE_STATUS(utils::NumElements(*tensor, &num_elements));
     if (num_elements < weights_min_num_elements) {
       LOG(INFO) << "Skipping quantization of tensor " << tensor->name
                 << " because it has fewer than " << weights_min_num_elements
                 << " elements (" << num_elements << ").";
-      // If one of the weights isn't quantized, then we cannot use the hybrid
-      // kernel for this operation, since it expects everything to be quantized.
-      eval_hybrid = false;
       continue;
     }
 
@@ -213,57 +184,8 @@ std::vector<TensorInfo> GetQuantizableTensorsFromOperator(
       continue;
     }
 
-    TensorInfo tensor_info;
-    tensor_info.eval_hybrid = eval_hybrid;
-    tensor_info.op_input_idx = op_input_idx;
-    tensor_info.tensor_idx = tensor_idx;
-    tensor_info.tensor = tensor;
-
-    tensor_infos.push_back(tensor_info);
-  }
-
-  return tensor_infos;
-}
-
-// Quantizes tensor using asymmetric quantization with the min and max elements
-// of the tensor. This is needed to pass to Dequantize operations.
-TfLiteStatus AsymmetricQuantizeTensor(ModelT* model, TensorT* tensor) {
-  BufferT* buffer = model->buffers[tensor->buffer].get();
-  float* float_data = reinterpret_cast<float*>(buffer->data.data());
-  const uint64_t num_elements = NumElements(tensor);
-  LOG(INFO) << "Quantizing tensor " << tensor->name << " with " << num_elements
-            << " elements for float evaluation.";
-
-  // Compute the quantization params.
-  float min_value = *std::min_element(float_data, float_data + num_elements);
-  float max_value = *std::max_element(float_data, float_data + num_elements);
-
-  if (tensor->quantization == nullptr) {
-    tensor->quantization = absl::make_unique<QuantizationParametersT>();
-  }
-  GetAsymmetricQuantizationParams(min_value, max_value, 0, 255,
-                                  tensor->quantization.get());
-
-  // Quantize the buffer.
-  std::vector<uint8_t> quantized_buffer;
-  quantized_buffer.resize(num_elements);
-  const double inverse_scale = 1. / tensor->quantization->scale[0];
-  for (std::size_t i = 0; i < num_elements; i++) {
-    const float src_val = float_data[i];
-    double scaled_val;
-    if (tensor->quantization->scale[0] == 0) {
-      scaled_val = tensor->quantization->zero_point[0];
-    } else {
-      scaled_val =
-          tensor->quantization->zero_point[0] + inverse_scale * src_val;
-    }
-    uint8_t integer_val = static_cast<uint8_t>(std::round(scaled_val));
-    quantized_buffer[i] = integer_val;
+    tensor_map->insert({tensor_idx, tensor});
   }
-  model->buffers[tensor->buffer]->data = quantized_buffer;
-
-  // Update the tensor type.
-  tensor->type = TensorType_UINT8;
 
   return kTfLiteOk;
 }
@@ -274,9 +196,10 @@ TfLiteStatus AsymmetricQuantizeTensor(ModelT* model, TensorT* tensor) {
 TfLiteStatus SymmetricQuantizeTensor(ModelT* model, TensorT* tensor) {
   BufferT* buffer = model->buffers[tensor->buffer].get();
   float* float_data = reinterpret_cast<float*>(buffer->data.data());
-  const uint64_t num_elements = NumElements(tensor);
+  uint64_t num_elements;
+  TF_LITE_ENSURE_STATUS(utils::NumElements(*tensor, &num_elements));
   LOG(INFO) << "Quantizing tensor " << tensor->name << " with " << num_elements
-            << " elements for hybrid evaluation.";
+            << " elements.";
 
   std::vector<int8_t> quantized_buffer;
   quantized_buffer.resize(num_elements);
@@ -297,7 +220,7 @@ TfLiteStatus SymmetricQuantizeTensor(ModelT* model, TensorT* tensor) {
                                               uint8_buffer + num_elements);
 
   // Update the tensor type.
-  tensor->type = TensorType_UINT8;
+  tensor->type = TensorType_INT8;
 
   return kTfLiteOk;
 }
@@ -313,7 +236,8 @@ int32_t GetOrInsertDequantizeOpCodeIndex(ModelT* model) {
   model->operator_codes.push_back(absl::make_unique<OperatorCodeT>());
   int op_code_idx = model->operator_codes.size() - 1;
   model->operator_codes[op_code_idx]->builtin_code = BuiltinOperator_DEQUANTIZE;
-  // TODO(suharshs): How should the version be set in this op_code?
+  // Version 2 and onwards supports INT8 inputs.
+  model->operator_codes[op_code_idx]->version = 2;
 
   // Return the index of the newly placed OperatorCodeT.
   return op_code_idx;
@@ -340,6 +264,26 @@ void MakeTensor(const string& name, const std::vector<int32_t>& shape,
   tensor->reset(tensor_raw);
 }
 
+// Updates operator code versions for the operators with INT8 inputs.
+void UpdateInt8OperatorVersions(ModelT* model) {
+  for (int i = 0; i < model->operator_codes.size(); ++i) {
+    const BuiltinOperator& op_code = model->operator_codes[i]->builtin_code;
+    if (op_code == BuiltinOperator_CONV_2D || op_code == BuiltinOperator_SVDF ||
+        op_code == BuiltinOperator_EMBEDDING_LOOKUP ||
+        op_code == BuiltinOperator_RNN ||
+        op_code == BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN ||
+        op_code == BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM ||
+        op_code == BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN) {
+      model->operator_codes[i]->version = 2;
+
+    } else if (op_code == BuiltinOperator_FULLY_CONNECTED ||
+               op_code == BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM ||
+               op_code == BuiltinOperator_LSTM) {
+      model->operator_codes[i]->version = 3;
+    }
+  }
+}
+
 TfLiteStatus QuantizeWeightsInternal(flatbuffers::FlatBufferBuilder* builder,
                                      const Model* input_model,
                                      bool use_hybrid_evaluation,
@@ -357,48 +301,82 @@ TfLiteStatus QuantizeWeightsInternal(flatbuffers::FlatBufferBuilder* builder,
   SubGraphT* subgraph = model->subgraphs.at(0).get();
 
   std::vector<std::unique_ptr<OperatorT>> new_operators;
+  std::unordered_map<int32_t, TensorT*> tensor_map;
   for (int i = 0; i < subgraph->operators.size(); ++i) {
     OperatorT* op = subgraph->operators[i].get();
+    TF_LITE_ENSURE_STATUS(InsertQuantizableInputTensorsFromOperator(
+        model.get(), op, weights_min_num_elements, &tensor_map));
+  }
+
+  // The unordered_map ensures that we quantize each tensor exactly once.
+  // TODO(suharshs): This map key isn't sufficient when we support multiple
+  // subgraphs.
+  for (std::pair<int32_t, TensorT*> tensor_pair : tensor_map) {
+    // Quantize the tensor.
+    TF_LITE_ENSURE_STATUS(
+        SymmetricQuantizeTensor(model.get(), tensor_pair.second));
+  }
 
-    std::vector<TensorInfo> tensor_infos = GetQuantizableTensorsFromOperator(
-        model.get(), op, weights_min_num_elements, use_hybrid_evaluation);
-
-    for (const TensorInfo& tensor_info : tensor_infos) {
-      if (tensor_info.eval_hybrid) {
-        // Quantize the tensor.
-        TF_LITE_ENSURE_STATUS(
-            SymmetricQuantizeTensor(model.get(), tensor_info.tensor));
-      } else {
-        // Quantize the tensor.
-        TF_LITE_ENSURE_STATUS(
-            AsymmetricQuantizeTensor(model.get(), tensor_info.tensor));
-
-        // Create a new tensor to be the output of the dequantize op.
-        std::unique_ptr<TensorT> dequantize_output;
-        MakeTensor(tensor_info.tensor->name + "_dequantize",
-                   tensor_info.tensor->shape, &dequantize_output);
-        const int32_t dequantize_output_idx = subgraph->tensors.size();
-        subgraph->tensors.push_back(std::move(dequantize_output));
-
-        // Create the Dequantize operation.
-        std::unique_ptr<OperatorT> dequantize_op;
-        MakeDequantizeOperator(model.get(), &dequantize_op,
-                               tensor_info.tensor_idx, dequantize_output_idx);
-
-        // Update the op_input of tensor_idx to dequantize_output_idx.
-        op->inputs[tensor_info.op_input_idx] = dequantize_output_idx;
-
-        // Insert the newly created Dequantize operation.
-        new_operators.push_back(std::move(dequantize_op));
+  // Examine the tensor consumers to determine which require dequantize ops.
+  for (const auto& tensor_pair : tensor_map) {
+    const int32_t tensor_idx = tensor_pair.first;
+    TensorT* tensor = tensor_pair.second;
+    std::vector<ConsumerOpInfo> consumer_op_infos =
+        GetTensorConsumers(model.get(), subgraph, tensor_idx);
+
+    std::vector<ConsumerOpInfo> dequant_op_infos;  // Ops that need dequants.
+    for (ConsumerOpInfo& consumer_op_info : consumer_op_infos) {
+      OperatorT* consumer_op = consumer_op_info.op;
+      const BuiltinOperator consumer_op_code =
+          model->operator_codes[consumer_op->opcode_index]->builtin_code;
+      // If the op is a hybrid op and all the required tensors are quantized,
+      // we have no further work to do, but for all ops that require
+      // dequantization we need to add a Dequantize op.
+      bool eval_hybrid =
+          use_hybrid_evaluation &&
+          IsHybridEvaluationOp(consumer_op, consumer_op_code) &&
+          CheckAllOpInputsQuantized(subgraph, consumer_op, consumer_op_code);
+      if (!eval_hybrid) {
+        dequant_op_infos.push_back(consumer_op_info);
       }
     }
-    // After (maybe) quantizing inputs, we copy the operator into the new list.
-    new_operators.push_back(std::move(subgraph->operators[i]));
+
+    // If no ops require dequant, we are done for this tensor.
+    if (dequant_op_infos.empty()) {
+      continue;
+    }
+
+    // Create a new tensor to be the output of the dequantize op.
+    std::unique_ptr<TensorT> dequantize_output;
+    const string dequant_name = tensor->name + "_dequantize";
+    MakeTensor(dequant_name, tensor->shape, &dequantize_output);
+    const int32_t dequantize_output_idx = subgraph->tensors.size();
+    subgraph->tensors.push_back(std::move(dequantize_output));
+
+    // Create the Dequantize operation.
+    std::unique_ptr<OperatorT> dequantize_op;
+    MakeDequantizeOperator(model.get(), &dequantize_op, tensor_idx,
+                           dequantize_output_idx);
+
+    LOG(INFO) << "Creating Dequantize op with name " << dequant_name << ".";
+
+    // Update the op_input of all the ops that need the created dequantize
+    // operation.
+    int32_t min_op_idx = 0;
+    for (ConsumerOpInfo& dequant_op_info : dequant_op_infos) {
+      dequant_op_info.op->inputs[dequant_op_info.op_input_idx] =
+          dequantize_output_idx;
+      min_op_idx = std::min(dequant_op_info.op_idx, min_op_idx);
+    }
+
+    // Insert the newly created Dequantize operation before the earliest
+    // consumer, since TFLite requires operators to be topo-sorted.
+    subgraph->operators.insert(subgraph->operators.begin() + min_op_idx,
+                               std::move(dequantize_op));
   }
 
-  // At this point all unique_ptrs in the original operators are invalid, and
-  // we need to replace it with the new_operators vector.
-  subgraph->operators = std::move(new_operators);
+  // Update the modified operator code versions.
+  UpdateInt8OperatorVersions(model.get());
 
   flatbuffers::Offset<Model> output_model_location =
       Model::Pack(*builder, model.get());
@@ -412,11 +390,12 @@ TfLiteStatus QuantizeWeightsInternal(flatbuffers::FlatBufferBuilder* builder,
 namespace internal {
 TfLiteStatus QuantizeWeights(flatbuffers::FlatBufferBuilder* builder,
                              const Model* input_model,
+                             uint64_t weights_min_num_elements,
                              bool use_hybrid_evaluation) {
   // By default we require that only weights with more than
   // kWeightsMinSizeDefault elements are quantized.
   return QuantizeWeightsInternal(builder, input_model, use_hybrid_evaluation,
-                                 kWeightsMinNumElementsDefault);
+                                 weights_min_num_elements);
 }
 }  // namespace internal
 
diff --git a/tensorflow/lite/tools/optimize/quantize_weights.h b/tensorflow/lite/tools/optimize/quantize_weights.h
index c2c0b0ce83435dc423a62cea598e35ba45a0561f..6baecc210fa0b52ddccace05a3fc7d6a9908712d 100644
--- a/tensorflow/lite/tools/optimize/quantize_weights.h
+++ b/tensorflow/lite/tools/optimize/quantize_weights.h
@@ -48,6 +48,7 @@ namespace internal {
 // evaluation disabled.
 TfLiteStatus QuantizeWeights(flatbuffers::FlatBufferBuilder* builder,
                              const Model* input_model,
+                             uint64_t weights_min_num_elements,
                              bool use_hybrid_evaluation);
 }  // namespace internal
 
diff --git a/tensorflow/lite/tools/optimize/quantize_weights_test.cc b/tensorflow/lite/tools/optimize/quantize_weights_test.cc
index 32725e5ee29c364d56754c08a2cb1084ef049fdb..a18b3bb7ffecfa71f24890fb0cbfbdc94d66c0c2 100644
--- a/tensorflow/lite/tools/optimize/quantize_weights_test.cc
+++ b/tensorflow/lite/tools/optimize/quantize_weights_test.cc
@@ -12,215 +12,346 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/lite/tools/optimize/quantize_weights.h"
-
+#include <cstddef>
+#include <cstdint>
 #include <memory>
 
-#include "flatbuffers/flexbuffers.h"
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "flatbuffers/flatbuffers.h"  // TF:flatbuffers
+#include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/lite/model.h"
 #include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/tools/optimize/quantize_weights.h"
+#include "tensorflow/lite/tools/optimize/test_util.h"
+
+namespace {
+tensorflow::string* g_test_model_dir = nullptr;
+}  // namespace
 
 namespace tflite {
 namespace optimize {
 namespace {
 
-class QuantizeWeightsTest : public ::testing::Test {
+std::unique_ptr<FlatBufferModel> ReadTestModel() {
+  auto model_path = tensorflow::io::JoinPath(
+      *g_test_model_dir, internal::kConvModelWith0Plus10Weights);
+  return FlatBufferModel::BuildFromFile(model_path.c_str());
+}
+
+std::unique_ptr<FlatBufferModel> ReadSharedWeightsTestModel() {
+  auto model_path = tensorflow::io::JoinPath(*g_test_model_dir,
+                                             internal::kModelWithSharedWeights);
+  return FlatBufferModel::BuildFromFile(model_path.c_str());
+}
+
+template <typename T>
+std::vector<T> GetAsVector(const flatbuffers::Vector<T>* vec) {
+  return std::vector<T>(vec->begin(), vec->end());
+}
+
+class QuantizeWeightsTest : public testing::Test {
  protected:
-  int GetElementsNum(const TensorT* tensor) {
-    int tensor_size = 1;
-    for (const int dim : tensor->shape) {
-      tensor_size *= dim;
-    }
-    return tensor_size;
+  QuantizeWeightsTest() {}
+
+  void LoadBasicModel() {
+    input_model_ = ReadTestModel();
+    model_ = input_model_->GetModel();
   }
 
-  const OperatorT* GetOpWithOutput(const SubGraphT* subgraph,
-                                   int32_t output_tensor_idx) {
-    for (int i = 0; i < subgraph->operators.size(); ++i) {
-      OperatorT* op = subgraph->operators[i].get();
-      if (std::find(op->outputs.begin(), op->outputs.end(),
-                    output_tensor_idx) != op->outputs.end()) {
-        return op;
-      }
-    }
-    return nullptr;
+  void LoadSharedWeightsModel() {
+    input_model_ = ReadSharedWeightsTestModel();
+    model_ = input_model_->GetModel();
   }
 
-  void SymmetricDequantizeAndCompare(const BufferT* input_buffer,
-                                     const BufferT* output_buffer,
-                                     float scale) {
-    const float* input_buffer_data =
-        reinterpret_cast<const float*>(input_buffer->data.data());
-    const int8_t* output_buffer_data =
-        reinterpret_cast<const int8_t*>(output_buffer->data.data());
-    for (int i = 0; i < output_buffer->data.size(); i++) {
-      float diff = input_buffer_data[i] - (output_buffer_data[i] * scale);
-      ASSERT_TRUE(std::abs(diff) <= scale);
+  std::unique_ptr<FlatBufferModel> input_model_;
+  const Model* model_;
+
+  bool IsModelInputOrOutput(const Model* model, uint32_t tensor_idx) {
+    for (size_t subgraph_idx = 0; subgraph_idx < model_->subgraphs()->size();
+         ++subgraph_idx) {
+      const auto subgraph = model->subgraphs()->Get(subgraph_idx);
+      for (size_t i = 0; i < subgraph->inputs()->size(); ++i) {
+        if (subgraph->inputs()->Get(i) == tensor_idx) {
+          return true;
+        }
+      }
+      for (size_t i = 0; i < subgraph->outputs()->size(); ++i) {
+        if (subgraph->outputs()->Get(i) == tensor_idx) {
+          return true;
+        }
+      }
     }
+    return false;
   }
 
-  void AsymmetricDequantizeAndCompare(const BufferT* input_buffer,
-                                      const BufferT* output_buffer, float scale,
-                                      int64_t zero_point) {
-    const float* input_buffer_data =
-        reinterpret_cast<const float*>(input_buffer->data.data());
-    const uint8_t* output_buffer_data = output_buffer->data.data();
-    for (int i = 0; i < output_buffer->data.size(); i++) {
-      float diff =
-          input_buffer_data[i] - ((output_buffer_data[i] - zero_point) * scale);
-      ASSERT_TRUE(std::abs(diff) <= scale);
+  // Returns the producer op code of the specified tensor_idx.
+  bool GetProducerOpCode(const Model* model, uint32_t subgraph_idx,
+                         uint32_t tensor_idx,
+                         tflite::BuiltinOperator* op_code) {
+    const auto subgraph = model->subgraphs()->Get(subgraph_idx);
+    for (size_t op_idx = 0; op_idx < subgraph->operators()->size(); ++op_idx) {
+      const auto op = subgraph->operators()->Get(op_idx);
+      for (size_t i = 0; i < op->outputs()->size(); ++i) {
+        if (op->outputs()->Get(i) == tensor_idx) {
+          const uint32_t op_code_idx = op->opcode_index();
+          *op_code = model->operator_codes()->Get(op_code_idx)->builtin_code();
+          return true;
+        }
+      }
     }
+    return false;
   }
+};
 
-  void CheckWeights(const Model* input_model_packed,
-                    const Model* output_model_packed,
-                    bool use_hybrid_evaluation,
-                    uint64_t weights_min_num_elements = 1024) {
-    std::unique_ptr<ModelT> input_model;
-    input_model.reset(input_model_packed->UnPack());
-
-    std::unique_ptr<ModelT> output_model;
-    output_model.reset(output_model_packed->UnPack());
-
-    SubGraphT* subgraph = output_model->subgraphs.at(0).get();
-
-    for (int i = 0; i < subgraph->operators.size(); ++i) {
-      OperatorT* op = subgraph->operators[i].get();
-      const BuiltinOperator op_code =
-          output_model->operator_codes[op->opcode_index]->builtin_code;
-
-      // These are the operations that should be quantized.
-      // TODO(suharshs): Right now this test only checks the relevant operations
-      // for the mobilenet v1 model used in the tests below.
-      int32_t tensor_idx;
-      if (op_code == BuiltinOperator_CONV_2D ||
-          op_code == BuiltinOperator_DEPTHWISE_CONV_2D ||
-          op_code == BuiltinOperator_FULLY_CONNECTED) {
-        tensor_idx = op->inputs[1];
-      } else {
-        continue;
-      }
+TEST_F(QuantizeWeightsTest, QuantizationSucceeds) {
+  LoadBasicModel();
+  flatbuffers::FlatBufferBuilder builder;
+  auto status = QuantizeWeights(&builder, model_, 0);
+  EXPECT_EQ(status, kTfLiteOk);
 
-      bool eval_hybrid = false;
-      // These are the ops that support hybrid evaluation.
-      if (op_code == BuiltinOperator_FULLY_CONNECTED ||
-          op_code == BuiltinOperator_CONV_2D) {
-        eval_hybrid = true;
-      }
+  const uint8_t* buffer = builder.GetBufferPointer();
+  const Model* output_model = GetModel(buffer);
+  ASSERT_TRUE(output_model);
+}
 
-      const TensorT* tensor = subgraph->tensors[tensor_idx].get();
-      int tensor_size = GetElementsNum(tensor);
-      // If the tensor_size is less than 1024 we expect the tensor to remain
-      // unquantized.
-      if (tensor_size < weights_min_num_elements) {
-        ASSERT_TRUE(tensor->type == TensorType_FLOAT32)
-            << tensor->name << " of type " << tensor->type;
-        const OperatorT* preceding_op = GetOpWithOutput(subgraph, tensor_idx);
-        // The weight tensor should not come from a dequantize op.
-        ASSERT_TRUE(preceding_op == nullptr);
-      } else if (use_hybrid_evaluation && eval_hybrid) {
-        // The input to the op should still be uint8.
-        ASSERT_TRUE(tensor->type == TensorType_UINT8) << tensor->name;
-        // The weight tensor should not come from a dequantize op.
-        const OperatorT* preceding_op = GetOpWithOutput(subgraph, tensor_idx);
-        ASSERT_TRUE(preceding_op == nullptr);
-
-        // Test symmetric quantization.
-        SymmetricDequantizeAndCompare(
-            input_model->buffers[tensor->buffer].get(),
-            output_model->buffers[tensor->buffer].get(),
-            tensor->quantization->scale[0]);
+TEST_F(QuantizeWeightsTest, WeightsMinNumElements) {
+  LoadBasicModel();
+  // Make weights_min_size sufficiently large such that no quantization should
+  // happen, i.e. the original model is the same size as the old one.
+  flatbuffers::FlatBufferBuilder builder;
+  const uint64_t kWeightsMinNumElements = 1000000;
+  EXPECT_EQ(QuantizeWeights(&builder, model_, kWeightsMinNumElements),
+            kTfLiteOk);
 
+  const uint8_t* buffer = builder.GetBufferPointer();
+  const Model* output_model = GetModel(buffer);
+  ASSERT_TRUE(output_model);
+
+  for (size_t subgraph_idx = 0; subgraph_idx < model_->subgraphs()->size();
+       subgraph_idx++) {
+    const auto quantized_graph = output_model->subgraphs()->Get(subgraph_idx);
+    const auto float_graph = model_->subgraphs()->Get(subgraph_idx);
+    ASSERT_EQ(quantized_graph->tensors()->size(),
+              float_graph->tensors()->size());
+    for (size_t i = 0; i < quantized_graph->tensors()->size(); i++) {
+      const auto quant_tensor = quantized_graph->tensors()->Get(i);
+      const auto float_tensor = float_graph->tensors()->Get(i);
+      // Everything should remain equal between the two graphs.
+      EXPECT_EQ(quant_tensor->buffer(), float_tensor->buffer());
+      EXPECT_EQ(quant_tensor->is_variable(), float_tensor->is_variable());
+      EXPECT_EQ(GetAsVector(quant_tensor->shape()),
+                GetAsVector(float_tensor->shape()));
+      EXPECT_EQ(quant_tensor->name()->str(), float_tensor->name()->str());
+      EXPECT_EQ(quant_tensor->type(), float_tensor->type());
+    }
+  }
+}
+
+TEST_F(QuantizeWeightsTest, HybridConv) {
+  LoadBasicModel();
+  flatbuffers::FlatBufferBuilder builder;
+  auto status = QuantizeWeights(&builder, model_, 0);
+  EXPECT_EQ(status, kTfLiteOk);
+
+  const uint8_t* buffer = builder.GetBufferPointer();
+  const Model* output_model = GetModel(buffer);
+  ASSERT_TRUE(output_model);
+
+  // Nothing should change.
+  ASSERT_EQ(output_model->subgraphs()->size(), model_->subgraphs()->size());
+  for (size_t subgraph_idx = 0; subgraph_idx < model_->subgraphs()->size();
+       subgraph_idx++) {
+    const auto quantized_graph = output_model->subgraphs()->Get(subgraph_idx);
+    const auto float_graph = model_->subgraphs()->Get(subgraph_idx);
+    ASSERT_EQ(quantized_graph->tensors()->size(),
+              float_graph->tensors()->size());
+    // Make sure the graph only has one Conv operation.
+    ASSERT_EQ(quantized_graph->operators()->size(), 1);
+    const auto op = quantized_graph->operators()->Get(0);
+    const uint32_t op_code_idx = op->opcode_index();
+    ASSERT_EQ(output_model->operator_codes()->Get(op_code_idx)->builtin_code(),
+              BuiltinOperator_CONV_2D);
+    for (size_t i = 0; i < quantized_graph->tensors()->size(); i++) {
+      const auto quant_tensor = quantized_graph->tensors()->Get(i);
+      const auto float_tensor = float_graph->tensors()->Get(i);
+      EXPECT_EQ(quant_tensor->buffer(), float_tensor->buffer());
+      EXPECT_EQ(quant_tensor->is_variable(), float_tensor->is_variable());
+      EXPECT_EQ(GetAsVector(quant_tensor->shape()),
+                GetAsVector(float_tensor->shape()));
+      EXPECT_EQ(quant_tensor->name()->str(), float_tensor->name()->str());
+      // If the tensor is a weight, it should have type INT8, otherwise it
+      // should stay with type FLOAT32.
+      // If the tensor is a bias, it should have type FLOAT32.
+      if (quant_tensor->name()->str() == "conv_bias") {
+        EXPECT_EQ(quant_tensor->type(), TensorType_FLOAT32);
+      } else if (IsModelInputOrOutput(output_model, i)) {
+        EXPECT_EQ(quant_tensor->type(), TensorType_FLOAT32);
+      } else if (quant_tensor->buffer() != 0) {
+        EXPECT_EQ(quant_tensor->type(), TensorType_INT8)
+            << quant_tensor->name()->str();
       } else {
-        // The input to the op should still be float.
-        ASSERT_TRUE(tensor->type == TensorType_FLOAT32) << tensor->name;
-        const OperatorT* preceding_op = GetOpWithOutput(subgraph, tensor_idx);
-        ASSERT_TRUE(preceding_op != nullptr);
-        // The float input should be the dequantize output.
-        ASSERT_TRUE(output_model->operator_codes[preceding_op->opcode_index]
-                        ->builtin_code == BuiltinOperator_DEQUANTIZE);
-        // Finally, ensure that the input to the dequantize operation is
-        // quantized.
-        const TensorT* quantized_tensor =
-            subgraph->tensors[preceding_op->inputs[0]].get();
-        ASSERT_TRUE(quantized_tensor->type == TensorType_UINT8);
-
-        // Test the assymetric quantization.
-        AsymmetricDequantizeAndCompare(
-            input_model->buffers[quantized_tensor->buffer].get(),
-            output_model->buffers[quantized_tensor->buffer].get(),
-            quantized_tensor->quantization->scale[0],
-            quantized_tensor->quantization->zero_point[0]);
+        EXPECT_EQ(quant_tensor->type(), TensorType_FLOAT32);
       }
     }
   }
-};
-
-TEST_F(QuantizeWeightsTest, SimpleTestWithHybrid) {
-  string model_path =
-      "third_party/tensorflow/lite/tools/optimize/testdata/"
-      "mobilenet_v1_0.25_128.tflite";
-  std::unique_ptr<FlatBufferModel> input_fb =
-      FlatBufferModel::BuildFromFile(model_path.data());
-  const Model* input_model = input_fb->GetModel();
+}
 
+TEST_F(QuantizeWeightsTest, DequantizeConv) {
+  LoadBasicModel();
   flatbuffers::FlatBufferBuilder builder;
-  EXPECT_EQ(QuantizeWeights(&builder, input_model), kTfLiteOk);
+  auto status = internal::QuantizeWeights(&builder, model_, 0,
+                                          /*use_hybrid_evaluation=*/false);
+  EXPECT_EQ(status, kTfLiteOk);
 
   const uint8_t* buffer = builder.GetBufferPointer();
   const Model* output_model = GetModel(buffer);
-
-  CheckWeights(input_model, output_model, true);
+  ASSERT_TRUE(output_model);
+
+  ASSERT_EQ(output_model->subgraphs()->size(), model_->subgraphs()->size());
+  for (size_t subgraph_idx = 0; subgraph_idx < model_->subgraphs()->size();
+       ++subgraph_idx) {
+    const auto quantized_graph = output_model->subgraphs()->Get(subgraph_idx);
+    const auto float_graph = model_->subgraphs()->Get(subgraph_idx);
+    // The output graph should have an extra tensor from the added dequantize
+    // op.
+    ASSERT_EQ(quantized_graph->tensors()->size(),
+              float_graph->tensors()->size() + 1);
+    // Check that a dequantize op exists.
+    int32_t dequant_input_idx = -1;
+    int32_t dequant_output_idx = -1;
+    for (size_t i = 0; i < quantized_graph->operators()->size(); ++i) {
+      const auto op = quantized_graph->operators()->Get(i);
+      const uint32_t op_code_idx = op->opcode_index();
+      if (output_model->operator_codes()->Get(op_code_idx)->builtin_code() ==
+          BuiltinOperator_DEQUANTIZE) {
+        dequant_input_idx = op->inputs()->Get(0);
+        dequant_output_idx = op->outputs()->Get(0);
+      }
+    }
+    ASSERT_GT(dequant_input_idx, -1);
+    ASSERT_GT(dequant_output_idx, -1);
+    for (size_t i = 0; i < quantized_graph->tensors()->size(); ++i) {
+      const auto quant_tensor = quantized_graph->tensors()->Get(i);
+      // If the tensor is a weight, it should have type INT8.
+      // If the tensor is a bias, it should have type FLOAT32.
+      // If the tensor is an input or output it should have type FLOAT32.
+      // The input to dequantize should be INT8, and all other tensors should be
+      // FLOAT32.
+      if (i == dequant_input_idx) {
+        EXPECT_EQ(quant_tensor->type(), TensorType_INT8);
+      } else if (i == dequant_output_idx) {
+        EXPECT_EQ(quant_tensor->type(), TensorType_FLOAT32);
+      } else if (IsModelInputOrOutput(output_model, i)) {
+        EXPECT_EQ(quant_tensor->type(), TensorType_FLOAT32);
+      } else if (quant_tensor->name()->str() == "conv_bias") {
+        EXPECT_EQ(quant_tensor->type(), TensorType_FLOAT32);
+      } else if (quant_tensor->buffer() != 0) {
+        // If its a non-bias constant tensor, is must be the weight.
+        EXPECT_EQ(quant_tensor->type(), TensorType_INT8);
+      } else {
+        EXPECT_EQ(quant_tensor->type(), TensorType_FLOAT32);
+      }
+    }
+  }
 }
 
-TEST_F(QuantizeWeightsTest, SimpleTestWithoutHybrid) {
-  string model_path =
-      "third_party/tensorflow/lite/tools/optimize/testdata/"
-      "mobilenet_v1_0.25_128.tflite";
-  std::unique_ptr<FlatBufferModel> input_fb =
-      FlatBufferModel::BuildFromFile(model_path.data());
-  const Model* input_model = input_fb->GetModel();
-
+TEST_F(QuantizeWeightsTest, SharedWeights_Hybrid) {
+  LoadSharedWeightsModel();
   flatbuffers::FlatBufferBuilder builder;
-  // Disable hybrid evaluation.
-  EXPECT_EQ(internal::QuantizeWeights(&builder, input_model, false), kTfLiteOk);
+  auto status = QuantizeWeights(&builder, model_, 0);
+  EXPECT_EQ(status, kTfLiteOk);
 
   const uint8_t* buffer = builder.GetBufferPointer();
   const Model* output_model = GetModel(buffer);
-
-  CheckWeights(input_model, output_model, false);
+  ASSERT_TRUE(output_model);
+
+  ASSERT_EQ(output_model->subgraphs()->size(), model_->subgraphs()->size());
+  uint32_t num_conv_ops = 0;
+  for (size_t subgraph_idx = 0; subgraph_idx < model_->subgraphs()->size();
+       ++subgraph_idx) {
+    const auto quantized_graph = output_model->subgraphs()->Get(subgraph_idx);
+    for (size_t i = 0; i < quantized_graph->operators()->size(); ++i) {
+      const auto op = quantized_graph->operators()->Get(i);
+      const uint32_t op_code_idx = op->opcode_index();
+      const auto op_code =
+          output_model->operator_codes()->Get(op_code_idx)->builtin_code();
+      if (op_code == BuiltinOperator_CONV_2D) {
+        num_conv_ops++;
+        // Ensure that each convolution's weights tensor is now INT8.
+        const auto weights_tensor =
+            quantized_graph->tensors()->Get(op->inputs()->Get(1));
+        EXPECT_EQ(weights_tensor->type(), TensorType_INT8);
+      }
+    }
+  }
+  // Ensure that there were exactly two convolutions in the model.
+  EXPECT_EQ(num_conv_ops, 2);
 }
 
-TEST_F(QuantizeWeightsTest, SimpleTestWithWeightsMinNumElements) {
-  string model_path =
-      "third_party/tensorflow/lite/tools/optimize/testdata/"
-      "mobilenet_v1_0.25_128.tflite";
-  std::unique_ptr<FlatBufferModel> input_fb =
-      FlatBufferModel::BuildFromFile(model_path.data());
-  const Model* input_model = input_fb->GetModel();
-
+TEST_F(QuantizeWeightsTest, SharedWeights_Dequantize) {
+  LoadSharedWeightsModel();
   flatbuffers::FlatBufferBuilder builder;
-  // Make weights_min_size sufficiently large such that no quantization should
-  // happen, i.e. the original model is the same size as the old one.
-  const uint64_t kWeightsMinNumElements = 1000000;
-  EXPECT_EQ(QuantizeWeights(&builder, input_model, kWeightsMinNumElements),
-            kTfLiteOk);
+  auto status = internal::QuantizeWeights(&builder, model_, 0,
+                                          /*use_hybrid_evaluation*/ false);
+  EXPECT_EQ(status, kTfLiteOk);
 
   const uint8_t* buffer = builder.GetBufferPointer();
   const Model* output_model = GetModel(buffer);
-  CheckWeights(input_model, output_model, true, kWeightsMinNumElements);
+  ASSERT_TRUE(output_model);
+
+  ASSERT_EQ(output_model->subgraphs()->size(), model_->subgraphs()->size());
+  uint32_t num_conv_ops = 0;
+  for (size_t subgraph_idx = 0; subgraph_idx < model_->subgraphs()->size();
+       ++subgraph_idx) {
+    const auto quantized_graph = output_model->subgraphs()->Get(subgraph_idx);
+    for (size_t i = 0; i < quantized_graph->operators()->size(); ++i) {
+      const auto op = quantized_graph->operators()->Get(i);
+      const uint32_t op_code_idx = op->opcode_index();
+      const auto op_code =
+          output_model->operator_codes()->Get(op_code_idx)->builtin_code();
+      if (op_code == BuiltinOperator_CONV_2D) {
+        num_conv_ops++;
+        // Ensure that each convolution's weights tensor is still FLOAT
+        // (the output of the dequantize).
+        uint32_t weights_tensor_index = op->inputs()->Get(1);
+        const auto weights_tensor =
+            quantized_graph->tensors()->Get(weights_tensor_index);
+        EXPECT_EQ(weights_tensor->type(), TensorType_FLOAT32);
+
+        // Check that it comes from a dequantize operation.
+        BuiltinOperator producer_op_code;
+        ASSERT_TRUE(GetProducerOpCode(output_model, subgraph_idx,
+                                      weights_tensor_index, &producer_op_code));
+        EXPECT_EQ(producer_op_code, BuiltinOperator_DEQUANTIZE);
+      }
+    }
+  }
+  // Ensure that there were exactly two convolutions in the model.
+  EXPECT_EQ(num_conv_ops, 2);
 }
 
-// TODO(suharshs): Add tests that run the resulting model.
-
 }  // namespace
 }  // namespace optimize
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  // On Linux, add: FLAGS_logtostderr = true;
-  ::testing::InitGoogleTest(&argc, argv);
+  tensorflow::string model_file;
+  const std::vector<tensorflow::Flag> flag_list = {
+      tensorflow::Flag("test_model_file", &model_file,
+                       "Path to test tflite model file."),
+  };
+
+  const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
+  if (!parse_result) {
+    std::cerr << "Required test_model_file\n";
+    std::abort();
+  }
+  g_test_model_dir =
+      new tensorflow::string(tensorflow::io::Dirname(model_file));
+  ::tensorflow::port::InitMain(argv[0], &argc, &argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/lite/tools/optimize/subgraph_quantizer.cc b/tensorflow/lite/tools/optimize/subgraph_quantizer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..05115e8775044cd2ae3e8e89f12ba23adff62250
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/subgraph_quantizer.cc
@@ -0,0 +1,378 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/tools/optimize/subgraph_quantizer.h"
+
+#include <algorithm>
+#include <limits>
+
+#include "flatbuffers/flexbuffers.h"
+#include "absl/memory/memory.h"
+#include "tensorflow/lite/context.h"
+#include "tensorflow/lite/core/api/error_reporter.h"
+#include "tensorflow/lite/kernels/internal/round.h"
+#include "tensorflow/lite/kernels/internal/tensor_utils.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/tools/optimize/quantization_utils.h"
+
+namespace tflite {
+namespace optimize {
+namespace internal {
+
+namespace {
+TfLiteStatus AddQuantizationParams(const std::vector<float>& scales,
+                                   const std::vector<int64_t>& zero_point,
+                                   int quantized_dimension,
+                                   const uint8_t* buffer_data,
+                                   size_t buffer_size, TensorType output_type,
+                                   ModelT* model, TensorT* tensor) {
+  tensor->quantization = absl::make_unique<QuantizationParametersT>();
+  tensor->quantization->scale.assign(scales.begin(), scales.end());
+  if (zero_point.size() != scales.size()) {
+    return kTfLiteError;
+  }
+  tensor->quantization->zero_point.assign(zero_point.begin(), zero_point.end());
+  tensor->quantization->quantized_dimension = quantized_dimension;
+  model->buffers[tensor->buffer]->data.assign(buffer_data,
+                                              buffer_data + buffer_size);
+  // Update the tensor type.
+  tensor->type = output_type;
+  return kTfLiteOk;
+}
+
+bool OpHasOptionalBiasTensor(BuiltinOperator op_code) {
+  return op_code == BuiltinOperator_CONV_2D ||
+         op_code == BuiltinOperator_DEPTHWISE_CONV_2D;
+}
+
+struct OpWithBiasTensors {
+  int activation_input_index;
+  int weights_input_index;
+  int bias_input_index;
+  int index_for_channel_in_weights;
+};
+
+const OpWithBiasTensors* GetInfoForOpWithBiasTensor(BuiltinOperator op_code) {
+  if (op_code == BuiltinOperator_CONV_2D) {
+    static OpWithBiasTensors op_info = {/* activation_input_index */ 0,
+                                        /* weights_input_index */ 1,
+                                        /* bias_input_index */ 2,
+                                        /* index_for_channel_in_weights */ 0};
+    return &op_info;
+  }
+  if (op_code == BuiltinOperator_DEPTHWISE_CONV_2D) {
+    static OpWithBiasTensors op_info = {/* bias_input_index */ 0,
+                                        /* bias_input_index */ 1,
+                                        /* bias_input_index */ 2,
+                                        /* index_for_channel_in_weights */ 3};
+    return &op_info;
+  }
+
+  return nullptr;
+}
+
+// Symmetrically Quantizes the given tensor as int8 values.
+TfLiteStatus SymmetricPerChannelQuantizeTensor(ModelT* model, TensorT* tensor,
+                                               int32_t channel_dim_index,
+                                               ErrorReporter* error_reporter) {
+  if (tensor->shape.size() != 4) {
+    error_reporter->Report("Only dims=4 is supported, tensor dims: %d",
+                           tensor->shape.size());
+    return kTfLiteError;
+  }
+
+  // Get dimensions.
+  uint64_t num_elements;
+  TF_LITE_ENSURE_STATUS(utils::NumElements(*tensor, &num_elements));
+  const int32_t channel_dim_size = tensor->shape[channel_dim_index];
+
+  // Get input float data.
+  BufferT* buffer = model->buffers[tensor->buffer].get();
+  float* float_input_data = reinterpret_cast<float*>(buffer->data.data());
+
+  // Create container for output scale and output data.
+  std::vector<float> scales(channel_dim_size);
+  std::vector<int8_t> final_buffer(num_elements);
+
+  // Quantize the input data with respect to channel_dim_index.
+  const std::vector<int> tensor_dims = {tensor->shape[0], tensor->shape[1],
+                                        tensor->shape[2], tensor->shape[3]};
+  utils::SymmetricPerChannelQuantization(
+      float_input_data, tensor_dims, channel_dim_index, &scales, &final_buffer);
+
+  // Set the buffers and output type.
+  uint8_t* uint8_buffer = reinterpret_cast<uint8_t*>(final_buffer.data());
+  const size_t buffer_size = num_elements * sizeof(int8_t);
+  std::vector<int64_t> zero_point(scales.size(), 0);
+  return AddQuantizationParams(scales, zero_point, channel_dim_index,
+                               uint8_buffer, buffer_size, TensorType_INT8,
+                               model, tensor);
+}
+
+// Symmetrically quantizes the bias for ops like Conv and DepthwiseConv.
+// The scale of bias if weight_per_channel_scale[channel] * input_scale
+TfLiteStatus SymmetricPerChannelBiasQuantize(const TensorT* input_tensor,
+                                             const TensorT* weight_tensor,
+                                             int channel_dim_index,
+                                             ModelT* model, TensorT* tensor,
+                                             ErrorReporter* error_reporter) {
+  if (tensor->shape.size() != 1) {
+    error_reporter->Report("Expected bias tensor shape to be 1.");
+    return kTfLiteError;
+  }
+
+  if (tensor->type != TensorType_FLOAT32) {
+    return kTfLiteOk;
+  }
+
+  // TODO(shashishekhar): Make this support scalar biases.
+  if (tensor->shape[0] != weight_tensor->shape[channel_dim_index]) {
+    error_reporter->Report(
+        "Channel mismatch between bias and weight tensors %d vs %d",
+        tensor->shape[0], weight_tensor->shape[channel_dim_index]);
+    return kTfLiteError;
+  }
+  int32_t channel_dim_size = tensor->shape[0];
+  if (!input_tensor->quantization ||
+      input_tensor->quantization->scale.size() != 1) {
+    error_reporter->Report("Input tensor missing quantization information");
+    return kTfLiteError;
+  }
+  TF_LITE_ENSURE(error_reporter, weight_tensor->quantization);
+  const std::vector<float>& weight_scales = weight_tensor->quantization->scale;
+
+  if (weight_scales.size() != channel_dim_size) {
+    error_reporter->Report("Mismatch weight scale dimension: %d",
+                           weight_scales.size());
+    return kTfLiteError;
+  }
+
+  // Compute scales.
+  std::vector<float> scales(channel_dim_size);
+  for (size_t i = 0; i < channel_dim_size; i++) {
+    scales[i] = input_tensor->quantization->scale[0] * weight_scales[i];
+  }
+
+  BufferT* buffer = model->buffers[tensor->buffer].get();
+  float* float_data = reinterpret_cast<float*>(buffer->data.data());
+  uint64_t num_elements;
+  TF_LITE_ENSURE_STATUS(utils::NumElements(*tensor, &num_elements));
+
+  std::vector<int32_t> final_buffer(num_elements);
+  const int32_t kScale = std::numeric_limits<int32_t>::max();
+
+  for (int32_t channel_idx = 0; channel_idx < channel_dim_size; channel_idx++) {
+    float scaling_factor = scales[channel_idx];
+    float scaling_factor_inv = (scaling_factor == 0) ? 0 : 1.0 / scaling_factor;
+    const int32_t quantized_value = static_cast<int32_t>(
+        TfLiteRound(float_data[channel_idx] * scaling_factor_inv));
+    final_buffer[channel_idx] =
+        std::min(kScale, std::max(-kScale, quantized_value));
+  }
+
+  // Set the buffers and output type.
+  uint8_t* uint8_buffer = reinterpret_cast<uint8_t*>(final_buffer.data());
+  size_t buffer_size = num_elements * sizeof(int32_t);
+  std::vector<int64_t> zero_point(scales.size(), 0);
+  return AddQuantizationParams(scales, zero_point, channel_dim_index,
+                               uint8_buffer, buffer_size, TensorType_INT32,
+                               model, tensor);
+}
+}  // namespace
+
+TfLiteStatus SubgraphQuantizer::AsymmetricQuantizeTensor(
+    BuiltinOperator op_code, int32_t tensor_idx) {
+  TensorT* tensor = subgraph_->tensors[tensor_idx].get();
+  if (tensor->type != TensorType_FLOAT32) {
+    return kTfLiteOk;
+  }
+
+  if (model_->buffers[tensor->buffer]->data.data() != nullptr) {
+    return kTfLiteError;
+  }
+  if (!tensor->quantization || tensor->quantization->min.empty() ||
+      tensor->quantization->max.empty()) {
+    error_reporter_->Report(
+        "Missing required min/max information for tensor_idx %d of operation: "
+        "%s",
+        tensor_idx, EnumNameBuiltinOperator(op_code));
+    return kTfLiteError;
+  }
+  utils::GetAsymmetricQuantizationParams(
+      tensor->quantization->min[0], tensor->quantization->max[0],
+      std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max(),
+      tensor->quantization.get());
+  tensor->type = TensorType_INT8;
+  return kTfLiteOk;
+}
+
+TfLiteStatus SubgraphQuantizer::QuantizeOpWithBias(BuiltinOperator op_code,
+                                                   OperatorT* op) {
+  auto op_tensor_info = GetInfoForOpWithBiasTensor(op_code);
+  if (!op_tensor_info) {
+    error_reporter_->Report("Cannot quantize op: %s",
+                            EnumNameBuiltinOperator(op_code));
+    return kTfLiteError;
+  }
+
+  // Conv/Depthwise conv have 2 inputs when there is no bias, 3 otherwise.
+  if (op->inputs.size() != 2 && op->inputs.size() != 3) {
+    return kTfLiteError;
+  }
+  auto input_tensor_idx = op->inputs[op_tensor_info->activation_input_index];
+  if (IsSubgraphInput(input_tensor_idx)) {
+    TF_LITE_ENSURE_STATUS(AsymmetricQuantizeTensor(op_code, input_tensor_idx));
+  }
+  auto weights_tensor_idx = op->inputs[op_tensor_info->weights_input_index];
+
+  TensorT* weights_tensor = subgraph_->tensors[weights_tensor_idx].get();
+  int weights_channel_index = op_tensor_info->index_for_channel_in_weights;
+
+  auto status = SymmetricPerChannelQuantizeTensor(
+      model_, weights_tensor, weights_channel_index, error_reporter_);
+  TF_LITE_ENSURE_STATUS(status);
+
+  // If there is bias, quantize it.
+  if (op->inputs.size() == 3) {
+    auto bias_tensor_idx = op->inputs[op_tensor_info->bias_input_index];
+    const TensorT* input_tensor = subgraph_->tensors[input_tensor_idx].get();
+    TensorT* bias_tensor = subgraph_->tensors[bias_tensor_idx].get();
+    TF_LITE_ENSURE_STATUS(SymmetricPerChannelBiasQuantize(
+        input_tensor, weights_tensor, weights_channel_index, model_,
+        bias_tensor, error_reporter_));
+  }
+
+  if (op->outputs.size() != 1) {
+    return kTfLiteError;
+  }
+  auto output_tensor_idx = op->outputs[0];
+  TF_LITE_ENSURE_STATUS(AsymmetricQuantizeTensor(op_code, output_tensor_idx));
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus SubgraphQuantizer::PropagateMinMaxForAvgAndMaxPool(
+    BuiltinOperator op_code, OperatorT* op) {
+  TF_LITE_ENSURE_EQ(this->error_reporter_, op->inputs.size(), 1);
+
+  if (IsSubgraphInput(op->inputs[0])) {
+    TF_LITE_ENSURE_STATUS(AsymmetricQuantizeTensor(op_code, op->inputs[0]));
+  }
+
+  auto output_tensor = subgraph_->tensors[op->outputs[0]].get();
+  if (output_tensor->type != TensorType_FLOAT32) {
+    return kTfLiteOk;
+  }
+  auto input_tensor = subgraph_->tensors[op->inputs[0]].get();
+  if (!input_tensor->quantization) {
+    error_reporter_->Report(
+        "Missing required min/max information for input of operation: %s",
+        EnumNameBuiltinOperator(op_code));
+    return kTfLiteError;
+  }
+  if (input_tensor->quantization->min.size() != 1 ||
+      input_tensor->quantization->max.size() != 1 ||
+      input_tensor->quantization->scale.size() != 1 ||
+      input_tensor->quantization->zero_point.size() != 1) {
+    error_reporter_->Report(
+        "Invalid quantization information for Op: %s, tensor: %s",
+        EnumNameBuiltinOperator(op_code), input_tensor->name.c_str());
+    return kTfLiteError;
+  }
+  auto quant_params = absl::make_unique<QuantizationParametersT>();
+  // Nudge min, max to include the floating point zero.
+  const float min = std::min(0.f, input_tensor->quantization->min[0]);
+  const float max = std::max(0.f, input_tensor->quantization->max[0]);
+  quant_params->min.push_back(min);
+  quant_params->max.push_back(max);
+  quant_params->scale.push_back(input_tensor->quantization->scale[0]);
+  quant_params->zero_point.push_back(input_tensor->quantization->zero_point[0]);
+  // TODO(shashishekhar): Log a warning here if overriding existing
+  // min/max/scales differ from input scales.
+  output_tensor->quantization = std::move(quant_params);
+  output_tensor->type = TensorType_INT8;
+  return kTfLiteOk;
+}
+
+TfLiteStatus SubgraphQuantizer::AsymmetricQuantizeSingleInputOutputOp(
+    BuiltinOperator op_code, OperatorT* op) {
+  TF_LITE_ENSURE_EQ(this->error_reporter_, op->inputs.size(), 1);
+  TF_LITE_ENSURE_EQ(this->error_reporter_, op->outputs.size(), 1);
+
+  if (IsSubgraphInput(op->inputs[0])) {
+    TF_LITE_ENSURE_STATUS(AsymmetricQuantizeTensor(op_code, op->inputs[0]));
+  }
+
+  auto output_tensor = subgraph_->tensors[op->outputs[0]].get();
+  if (output_tensor->type != TensorType_FLOAT32) {
+    return kTfLiteOk;
+  }
+  auto quant_params = absl::make_unique<QuantizationParametersT>();
+  TF_LITE_ENSURE_STATUS(AsymmetricQuantizeTensor(op_code, op->outputs[0]));
+  return kTfLiteOk;
+}
+
+TfLiteStatus SubgraphQuantizer::AsymmetricQuantizeSoftmax(
+    BuiltinOperator op_code, OperatorT* op) {
+  TF_LITE_ENSURE_EQ(this->error_reporter_, op->inputs.size(), 1);
+  TF_LITE_ENSURE_EQ(this->error_reporter_, op->outputs.size(), 1);
+
+  if (IsSubgraphInput(op->inputs[0])) {
+    TF_LITE_ENSURE_STATUS(AsymmetricQuantizeTensor(op_code, op->inputs[0]));
+  }
+
+  auto output_tensor = subgraph_->tensors[op->outputs[0]].get();
+  if (output_tensor->type != TensorType_FLOAT32) {
+    return kTfLiteOk;
+  }
+
+  // Softmax output is hardcoded to have 1/256 as scale and -128 as zero point.
+  output_tensor->type = TensorType_INT8;
+  output_tensor->quantization->scale = {1.0f / 256.0f};
+  output_tensor->quantization->zero_point = {-128};
+  return kTfLiteOk;
+}
+
+bool SubgraphQuantizer::IsSubgraphInput(int32_t tensor_idx) const {
+  return std::find(subgraph_->inputs.begin(), subgraph_->inputs.end(),
+                   tensor_idx) != subgraph_->inputs.end();
+}
+
+TfLiteStatus SubgraphQuantizer::QuantizeOperator(int op_idx) {
+  OperatorT* op = subgraph_->operators[op_idx].get();
+  const BuiltinOperator op_code =
+      model_->operator_codes[op->opcode_index]->builtin_code;
+  if (OpHasOptionalBiasTensor(op_code)) {
+    return QuantizeOpWithBias(op_code, op);
+  }
+  switch (op_code) {
+    case BuiltinOperator_AVERAGE_POOL_2D:
+    case BuiltinOperator_MAX_POOL_2D:
+      return PropagateMinMaxForAvgAndMaxPool(op_code, op);
+    case BuiltinOperator_SQUEEZE:
+      return AsymmetricQuantizeSingleInputOutputOp(op_code, op);
+    case BuiltinOperator_SOFTMAX:
+      return AsymmetricQuantizeSoftmax(op_code, op);
+    default:
+      return kTfLiteError;
+  }
+
+  return kTfLiteError;
+}
+
+}  // namespace internal
+}  // namespace optimize
+}  // namespace tflite
diff --git a/tensorflow/lite/tools/optimize/subgraph_quantizer.h b/tensorflow/lite/tools/optimize/subgraph_quantizer.h
new file mode 100644
index 0000000000000000000000000000000000000000..fd1c39253cd1e04c2737a3ceafa429cc15e80580
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/subgraph_quantizer.h
@@ -0,0 +1,73 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOOLS_OPTIMIZE_SUBGRAPH_QUANTIZER_H_
+#define TENSORFLOW_LITE_TOOLS_OPTIMIZE_SUBGRAPH_QUANTIZER_H_
+
+#include "tensorflow/lite/context.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+
+namespace tflite {
+namespace optimize {
+namespace internal {
+
+// Quantizes a given subgraph, the subgraph needs to min/max information
+// present.
+//
+// Assumes that some ops like Conv and Depthwise conv are quantized by
+// per channel symmetric quantization.
+class SubgraphQuantizer {
+ public:
+  SubgraphQuantizer(ModelT* model, SubGraphT* subgraph,
+                    ErrorReporter* error_reporter)
+      : model_(model), subgraph_(subgraph), error_reporter_(error_reporter) {}
+
+  // Quantize operator at the given index.
+  TfLiteStatus QuantizeOperator(int op_idx);
+
+ private:
+  // Quantizes ops with bias tensors.
+  TfLiteStatus QuantizeOpWithBias(BuiltinOperator op_code, OperatorT* op);
+
+  // Average and Max pool need special treatement. The scales are propagated
+  // from inputs to outputs.
+  TfLiteStatus PropagateMinMaxForAvgAndMaxPool(BuiltinOperator op_code,
+                                               OperatorT* op);
+
+  // Asymmetric quantizes inputs and outputs of an Op that has single input and
+  // single output. E.g. Squeeze.
+  TfLiteStatus AsymmetricQuantizeSingleInputOutputOp(BuiltinOperator op_code,
+                                                     OperatorT* op);
+
+  // Asymmetric quantizes inputs and outputs of an Softmax Op.
+  // Input is quantized with the min-max range and output is hardcoded to have
+  // 1/256 as scale and -128 as zero point.
+  TfLiteStatus AsymmetricQuantizeSoftmax(BuiltinOperator op_code,
+                                         OperatorT* op);
+
+  TfLiteStatus AsymmetricQuantizeTensor(BuiltinOperator op_code,
+                                        int32_t tensor_idx);
+
+  // Returns true if |tensor_idx| is one of the inputs in the subgraph.
+  bool IsSubgraphInput(int32_t tensor_idx) const;
+
+  ModelT* model_;
+  SubGraphT* subgraph_;
+  ErrorReporter* error_reporter_;
+};
+}  // namespace internal
+}  // namespace optimize
+}  // namespace tflite
+#endif  // TENSORFLOW_LITE_TOOLS_OPTIMIZE_SUBGRAPH_QUANTIZER_H_
diff --git a/tensorflow/lite/tools/optimize/subgraph_quantizer_test.cc b/tensorflow/lite/tools/optimize/subgraph_quantizer_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7261d224c3782107c69696d36519420adf725c32
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/subgraph_quantizer_test.cc
@@ -0,0 +1,402 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <algorithm>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/util/command_line_flags.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/tools/optimize/subgraph_quantizer.h"
+#include "tensorflow/lite/tools/optimize/test_util.h"
+
+namespace {
+tensorflow::string* g_test_model_dir = nullptr;
+}  // namespace
+
+namespace tflite {
+namespace optimize {
+namespace internal {
+namespace {
+
+std::unique_ptr<FlatBufferModel> ReadModel(const char* model) {
+  auto model_path = tensorflow::io::JoinPath(*g_test_model_dir, model);
+  return FlatBufferModel::BuildFromFile(model_path.c_str());
+}
+
+std::unique_ptr<FlatBufferModel> ReadConvModel1() {
+  return ReadModel(kConvModelWithMinus128Plus127Weights);
+}
+
+std::unique_ptr<FlatBufferModel> ReadConvModel2() {
+  return ReadModel(kConvModelWith0Plus10Weights);
+}
+
+std::unique_ptr<FlatBufferModel> ReadSoftmaxModel() {
+  return ReadModel(kSingleSoftmaxModelMinMinus5MaxPlus5);
+}
+
+std::unique_ptr<FlatBufferModel> ReadAvgPoolModel() {
+  return ReadModel(kSingleAvgPoolModelMinMinus5MaxPlus5);
+}
+
+TEST(SubgraphQuantizerTest, VerifyConvQuantizationWithUnitScale) {
+  ASSERT_TRUE(g_test_model_dir);
+  ASSERT_FALSE(g_test_model_dir->empty());
+  auto test_model = ReadConvModel1();
+  ASSERT_TRUE(test_model);
+  auto readonly_model = test_model->GetModel();
+  ASSERT_TRUE(readonly_model);
+  ASSERT_TRUE(readonly_model->subgraphs());
+  ASSERT_GE(readonly_model->subgraphs()->size(), 1);
+  tflite::ModelT model;
+  readonly_model->UnPackTo(&model);
+  auto subgraph = model.subgraphs[0].get();
+  FailOnErrorReporter error_reporter;
+  SubgraphQuantizer quantizer(&model, subgraph, &error_reporter);
+  auto status = quantizer.QuantizeOperator(0);
+  ASSERT_EQ(kTfLiteOk, status);
+
+  auto conv_op = subgraph->operators[0].get();
+  const int input_tensor_idx = 0;
+  const int weights_tensor_idx = 1;
+  const int bias_tensor_index = 2;
+  const int output_tensor_idx = 0;
+  const auto bias_tensor =
+      subgraph->tensors[conv_op->inputs[bias_tensor_index]].get();
+  const auto input_tensor =
+      subgraph->tensors[conv_op->inputs[input_tensor_idx]].get();
+  const auto weights_tensor =
+      subgraph->tensors[conv_op->inputs[weights_tensor_idx]].get();
+  const auto output_tensor =
+      subgraph->tensors[conv_op->outputs[output_tensor_idx]].get();
+
+  EXPECT_EQ(bias_tensor->type, TensorType_INT32);
+  EXPECT_EQ(input_tensor->type, TensorType_INT8);
+  EXPECT_EQ(weights_tensor->type, TensorType_INT8);
+
+  ASSERT_TRUE(weights_tensor->quantization);
+  const int out_channel_size = weights_tensor->shape[0];
+  ASSERT_TRUE(bias_tensor->quantization);
+  ASSERT_TRUE(weights_tensor->quantization);
+  const std::vector<float>& bias_scales = bias_tensor->quantization->scale;
+  const std::vector<float>& weights_scales =
+      weights_tensor->quantization->scale;
+
+  const std::vector<int64_t>& weights_zero_points =
+      weights_tensor->quantization->zero_point;
+
+  ASSERT_EQ(bias_scales.size(), out_channel_size);
+  ASSERT_EQ(weights_scales.size(), out_channel_size);
+  ASSERT_EQ(weights_zero_points.size(), out_channel_size);
+  ASSERT_EQ(input_tensor->quantization->scale.size(), 1);
+  ASSERT_EQ(output_tensor->quantization->scale.size(), 1);
+
+
+  for (size_t i = 0; i < out_channel_size; i++) {
+    EXPECT_EQ(weights_scales[i], 1);
+    EXPECT_EQ(bias_scales[i], 1);
+    EXPECT_EQ(weights_zero_points[i], 0);
+  }
+
+  EXPECT_EQ(input_tensor->quantization->scale[0], 1);
+  EXPECT_EQ(output_tensor->quantization->scale[0], 1);
+
+  const auto bias_buffer = model.buffers[bias_tensor->buffer].get();
+  ASSERT_EQ(bias_buffer->data.size(), sizeof(int32_t) * bias_tensor->shape[0]);
+  const int32_t* bias_values =
+      reinterpret_cast<int32_t*>(bias_buffer->data.data());
+  const auto original_bias_buffer =
+      readonly_model->buffers()->Get(bias_tensor->buffer);
+  const float* bias_float_buffer =
+      reinterpret_cast<const float*>(original_bias_buffer->data()->data());
+
+  const float eps = 1e-7;
+  for (size_t i = 0; i < bias_tensor->shape[0]; i++) {
+    const float bias_scale =
+        input_tensor->quantization->scale[0] * weights_scales[i];
+    auto dequantized_value = bias_values[i] * bias_scale;
+    EXPECT_NEAR(dequantized_value, bias_float_buffer[i], eps);
+  }
+
+  const auto weights_buffer = model.buffers[weights_tensor->buffer].get();
+  const auto original_weights_buffer =
+      readonly_model->buffers()->Get(weights_tensor->buffer);
+  const int8_t* weight_values =
+      reinterpret_cast<int8_t*>(weights_buffer->data.data());
+  const float* weights_float_buffer =
+      reinterpret_cast<const float*>(original_weights_buffer->data()->data());
+  ASSERT_EQ(sizeof(float) * weights_buffer->data.size(),
+            original_weights_buffer->data()->size());
+  int num_values_in_channel = weights_buffer->data.size() / out_channel_size;
+  for (size_t channel_idx = 0; channel_idx < out_channel_size; channel_idx++) {
+    for (size_t j = 0; j < num_values_in_channel; j++) {
+      size_t element_idx = channel_idx * out_channel_size + j;
+      auto dequantized_value =
+          weight_values[element_idx] * weights_scales[channel_idx];
+      EXPECT_NEAR(dequantized_value, weights_float_buffer[element_idx], eps);
+    }
+  }
+}
+
+TEST(SubgraphQuantizerTest, VerifyConvQuantization) {
+  ASSERT_TRUE(g_test_model_dir);
+  ASSERT_FALSE(g_test_model_dir->empty());
+  auto test_model = ReadConvModel2();
+  ASSERT_TRUE(test_model);
+  auto readonly_model = test_model->GetModel();
+  ASSERT_TRUE(readonly_model);
+  ASSERT_TRUE(readonly_model->subgraphs());
+  ASSERT_GE(readonly_model->subgraphs()->size(), 1);
+  tflite::ModelT model;
+  readonly_model->UnPackTo(&model);
+  auto subgraph = model.subgraphs[0].get();
+  FailOnErrorReporter error_reporter;
+  SubgraphQuantizer quantizer(&model, subgraph, &error_reporter);
+  auto status = quantizer.QuantizeOperator(0);
+  ASSERT_EQ(kTfLiteOk, status);
+
+  auto conv_op = subgraph->operators[0].get();
+  const int input_tensor_idx = 0;
+  const int weights_tensor_idx = 1;
+  const int bias_tensor_index = 2;
+  const int output_tensor_idx = 0;
+  const auto bias_tensor =
+      subgraph->tensors[conv_op->inputs[bias_tensor_index]].get();
+  const auto input_tensor =
+      subgraph->tensors[conv_op->inputs[input_tensor_idx]].get();
+  const auto weights_tensor =
+      subgraph->tensors[conv_op->inputs[weights_tensor_idx]].get();
+  const auto output_tensor =
+      subgraph->tensors[conv_op->outputs[output_tensor_idx]].get();
+
+  EXPECT_EQ(bias_tensor->type, TensorType_INT32);
+  EXPECT_EQ(input_tensor->type, TensorType_INT8);
+  EXPECT_EQ(weights_tensor->type, TensorType_INT8);
+
+  ASSERT_TRUE(weights_tensor->quantization);
+  const int out_channel_size = weights_tensor->shape[0];
+  ASSERT_TRUE(bias_tensor->quantization);
+  ASSERT_TRUE(weights_tensor->quantization);
+  const std::vector<float>& bias_scales = bias_tensor->quantization->scale;
+  const std::vector<float>& weights_scales =
+      weights_tensor->quantization->scale;
+  const std::vector<int64_t>& weights_zero_points =
+      weights_tensor->quantization->zero_point;
+
+  ASSERT_EQ(bias_scales.size(), out_channel_size);
+  ASSERT_EQ(weights_scales.size(), out_channel_size);
+  ASSERT_EQ(weights_zero_points.size(), out_channel_size);
+  ASSERT_EQ(input_tensor->quantization->scale.size(), 1);
+  ASSERT_EQ(output_tensor->quantization->scale.size(), 1);
+
+  const float eps = 1e-7;
+
+  // Bias scale should be input * per_channel_weight_scale.
+  for (size_t i = 0; i < out_channel_size; i++) {
+    EXPECT_NEAR(bias_scales[i],
+                input_tensor->quantization->scale[0] * weights_scales[i], eps);
+  }
+
+  const auto bias_buffer = model.buffers[bias_tensor->buffer].get();
+  ASSERT_EQ(bias_buffer->data.size(), sizeof(int32_t) * bias_tensor->shape[0]);
+  const int32_t* bias_values =
+      reinterpret_cast<int32_t*>(bias_buffer->data.data());
+  const auto original_bias_buffer =
+      readonly_model->buffers()->Get(bias_tensor->buffer);
+  const float* bias_float_buffer =
+      reinterpret_cast<const float*>(original_bias_buffer->data()->data());
+
+  for (size_t i = 0; i < out_channel_size; i++) {
+    auto dequantized_value = bias_values[i] * bias_scales[i];
+    EXPECT_NEAR(dequantized_value, bias_float_buffer[i], bias_scales[i] / 2);
+  }
+
+  const auto weights_buffer = model.buffers[weights_tensor->buffer].get();
+  const auto original_weights_buffer =
+      readonly_model->buffers()->Get(weights_tensor->buffer);
+  const int8_t* weight_values =
+      reinterpret_cast<int8_t*>(weights_buffer->data.data());
+  const float* weights_float_buffer =
+      reinterpret_cast<const float*>(original_weights_buffer->data()->data());
+  ASSERT_EQ(sizeof(float) * weights_buffer->data.size(),
+            original_weights_buffer->data()->size());
+  int num_values_in_channel = weights_buffer->data.size() / out_channel_size;
+  for (size_t channel_idx = 0; channel_idx < out_channel_size; channel_idx++) {
+    for (size_t j = 0; j < num_values_in_channel; j++) {
+      size_t element_idx = channel_idx * out_channel_size + j;
+      auto scale = weights_scales[channel_idx];
+      auto zero_point = weights_zero_points[channel_idx];
+      auto dequantized_value = weight_values[element_idx] * scale;
+      EXPECT_NEAR(dequantized_value, weights_float_buffer[element_idx],
+                  scale / 2);
+      EXPECT_EQ(zero_point, 0);
+    }
+  }
+}
+
+void VerifyAsymmetricQuantizationScale(
+    const QuantizationParameters& float_quant_params,
+    const QuantizationParametersT& quantized_quant_params) {
+  const float eps = 1e-7;
+  ASSERT_EQ(float_quant_params.min()->size(), 1);
+  ASSERT_EQ(float_quant_params.max()->size(), 1);
+  float float_min = std::min(0.f, float_quant_params.min()->Get(0));
+  float float_max = std::max(0.f, float_quant_params.max()->Get(0));
+
+  ASSERT_EQ(quantized_quant_params.scale.size(), 1);
+  ASSERT_EQ(quantized_quant_params.zero_point.size(), 1);
+
+  float scale = (float_max - float_min) / 255;
+  EXPECT_NEAR(scale, quantized_quant_params.scale[0], eps);
+}
+
+TEST(SubgraphQuantizerTest, VerifySoftmaxQuantization) {
+  ASSERT_TRUE(g_test_model_dir);
+  ASSERT_FALSE(g_test_model_dir->empty());
+  auto test_model = ReadSoftmaxModel();
+  ASSERT_TRUE(test_model);
+  auto readonly_model = test_model->GetModel();
+  ASSERT_TRUE(readonly_model);
+  ASSERT_TRUE(readonly_model->subgraphs());
+  ASSERT_GE(readonly_model->subgraphs()->size(), 1);
+  tflite::ModelT model;
+  readonly_model->UnPackTo(&model);
+  auto subgraph = model.subgraphs[0].get();
+  FailOnErrorReporter error_reporter;
+  SubgraphQuantizer quantizer(&model, subgraph, &error_reporter);
+  auto status = quantizer.QuantizeOperator(0);
+  ASSERT_EQ(kTfLiteOk, status);
+
+  auto op = subgraph->operators[0].get();
+  // Model has a single softmax op.
+  ASSERT_EQ(op->opcode_index, 0);
+  ASSERT_EQ(model.operator_codes[0].get()->builtin_code,
+            BuiltinOperator_SOFTMAX);
+
+  ASSERT_EQ(op->inputs.size(), 1);
+  ASSERT_EQ(op->outputs.size(), 1);
+  auto float_graph = readonly_model->subgraphs()->Get(0);
+
+  // Verify input.
+  ASSERT_EQ(float_graph->tensors()->Get(op->inputs[0])->type(),
+            TensorType_FLOAT32);
+  ASSERT_EQ(float_graph->tensors()->Get(op->outputs[0])->type(),
+            TensorType_FLOAT32);
+
+  EXPECT_EQ(subgraph->tensors[op->inputs[0]].get()->type, TensorType_INT8);
+  EXPECT_EQ(subgraph->tensors[op->outputs[0]].get()->type, TensorType_INT8);
+
+  auto float_input_quant_params =
+      float_graph->tensors()->Get(op->inputs[0])->quantization();
+  auto input_quant_params =
+      subgraph->tensors[op->inputs[0]]->quantization.get();
+  VerifyAsymmetricQuantizationScale(*float_input_quant_params,
+                                    *input_quant_params);
+
+  // Verify output.
+  auto float_output_quant_params =
+      float_graph->tensors()->Get(op->outputs[0])->quantization();
+  auto output_quant_params =
+      subgraph->tensors[op->outputs[0]]->quantization.get();
+  ASSERT_EQ(float_output_quant_params->min()->size(), 1);
+  ASSERT_EQ(float_output_quant_params->max()->size(), 1);
+
+  ASSERT_EQ(output_quant_params->scale.size(), 1);
+  ASSERT_EQ(output_quant_params->zero_point.size(), 1);
+  ASSERT_EQ(1.0f / 256.0f, output_quant_params->scale[0]);
+  ASSERT_EQ(-128, output_quant_params->zero_point[0]);
+}
+
+TEST(SubgraphQuantizerTest, VerifyAvgPoolQuantization) {
+  ASSERT_TRUE(g_test_model_dir);
+  ASSERT_FALSE(g_test_model_dir->empty());
+  auto test_model = ReadAvgPoolModel();
+  ASSERT_TRUE(test_model);
+  auto readonly_model = test_model->GetModel();
+  ASSERT_TRUE(readonly_model);
+  ASSERT_TRUE(readonly_model->subgraphs());
+  ASSERT_GE(readonly_model->subgraphs()->size(), 1);
+  tflite::ModelT model;
+  readonly_model->UnPackTo(&model);
+  auto subgraph = model.subgraphs[0].get();
+  FailOnErrorReporter error_reporter;
+  SubgraphQuantizer quantizer(&model, subgraph, &error_reporter);
+  auto status = quantizer.QuantizeOperator(0);
+  ASSERT_EQ(kTfLiteOk, status);
+
+  auto op = subgraph->operators[0].get();
+  // Model has a single AveragePool op.
+  ASSERT_EQ(op->opcode_index, 0);
+  ASSERT_EQ(model.operator_codes[0].get()->builtin_code,
+            BuiltinOperator_AVERAGE_POOL_2D);
+
+  ASSERT_EQ(op->inputs.size(), 1);
+  ASSERT_EQ(op->outputs.size(), 1);
+
+  auto float_graph = readonly_model->subgraphs()->Get(0);
+  ASSERT_EQ(float_graph->tensors()->Get(op->inputs[0])->type(),
+            TensorType_FLOAT32);
+  ASSERT_EQ(float_graph->tensors()->Get(op->outputs[0])->type(),
+            TensorType_FLOAT32);
+
+  EXPECT_EQ(subgraph->tensors[op->inputs[0]].get()->type, TensorType_INT8);
+  EXPECT_EQ(subgraph->tensors[op->outputs[0]].get()->type, TensorType_INT8);
+
+  auto float_input_quant_params =
+      float_graph->tensors()->Get(op->inputs[0])->quantization();
+  auto input_quant_params =
+      subgraph->tensors[op->inputs[0]]->quantization.get();
+  VerifyAsymmetricQuantizationScale(*float_input_quant_params,
+                                    *input_quant_params);
+
+  auto float_output_quant_params =
+      float_graph->tensors()->Get(op->outputs[0])->quantization();
+  auto output_quant_params =
+      subgraph->tensors[op->outputs[0]]->quantization.get();
+  ASSERT_EQ(float_output_quant_params->min()->size(), 1);
+  ASSERT_EQ(float_output_quant_params->max()->size(), 1);
+  ASSERT_EQ(output_quant_params->min.size(), 1);
+  ASSERT_EQ(output_quant_params->max.size(), 1);
+
+  // Make sure the input min/maxes are propagated to outputs.
+  EXPECT_EQ(input_quant_params->min[0], output_quant_params->min[0]);
+  EXPECT_EQ(input_quant_params->max[0], output_quant_params->max[0]);
+  EXPECT_EQ(input_quant_params->scale[0], output_quant_params->scale[0]);
+}
+
+}  // namespace
+}  // namespace internal
+}  // namespace optimize
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  tensorflow::string model_file;
+  const std::vector<tensorflow::Flag> flag_list = {
+      tensorflow::Flag("test_model_file", &model_file,
+                       "Path to test tflite model file."),
+  };
+
+  const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
+  if (!parse_result) {
+    std::cerr << "Required test_model_file\n";
+    std::abort();
+  }
+  g_test_model_dir =
+      new tensorflow::string(tensorflow::io::Dirname(model_file));
+  ::tensorflow::port::InitMain(argv[0], &argc, &argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/tools/optimize/test_util.cc b/tensorflow/lite/tools/optimize/test_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..190242402b37c74f123a1f24bc2980ce70da7ae7
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/test_util.cc
@@ -0,0 +1,44 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/tools/optimize/test_util.h"
+
+#include <gtest/gtest.h>
+
+namespace tflite {
+namespace optimize {
+namespace internal {
+const char* kConvModelWithMinus128Plus127Weights =
+    "single_conv_weights_min_minus_127_max_plus_127.bin";
+
+const char* kConvModelWith0Plus10Weights =
+    "single_conv_weights_min_0_max_plus_10.bin";
+
+const char* kSingleSoftmaxModelMinMinus5MaxPlus5 =
+    "single_softmax_min_minus_5_max_plus_5.bin";
+
+const char* kSingleAvgPoolModelMinMinus5MaxPlus5 =
+    "single_avg_pool_min_minus_5_max_plus_5.bin";
+
+const char* kModelWithSharedWeights = "weight_shared_between_convs.bin";
+
+int FailOnErrorReporter::Report(const char* format, va_list args) {
+  char buf[1024];
+  vsnprintf(buf, sizeof(buf), format, args);
+  EXPECT_TRUE(false) << "Error happened: " << buf;
+  return 0;
+}
+}  // namespace internal
+}  // namespace optimize
+}  // namespace tflite
diff --git a/tensorflow/lite/tools/optimize/test_util.h b/tensorflow/lite/tools/optimize/test_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..21f8b3ceb0ba48abc3c95810ee1b12a8c2b00b0c
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/test_util.h
@@ -0,0 +1,58 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOOLS_OPTIMIZE_TEST_UTIL_H_
+#define TENSORFLOW_LITE_TOOLS_OPTIMIZE_TEST_UTIL_H_
+
+#include "tensorflow/lite/core/api/error_reporter.h"
+
+namespace tflite {
+namespace optimize {
+namespace internal {
+// Test model with a single convolution.
+// Floating point weights of the model are all integers and lie in
+// range[-127, 127]. The weights have been put in such a way that each
+// channel has at least one weight as -127 and one weight as 127.
+// The activations are all in range: [-128, 127]
+// This means all bias computations should result in 1.0 scale.
+extern const char* kConvModelWithMinus128Plus127Weights;
+
+// Test model with single convolution where all weights are integers between
+// [0, 10] weights are randomly distributed. It is not guaranteed that min max
+// for weights are going to appear in each channel.
+// Activations have min = 0, max = 10.
+extern const char* kConvModelWith0Plus10Weights;
+
+// A floating point model with a single softmax. The input tensor has min
+// and max in range [-5, 5], not necessarily -5 or +5.
+extern const char* kSingleSoftmaxModelMinMinus5MaxPlus5;
+
+// A floating point model with a single average pool. The input tensor has min
+// and max in range [-5, 5], not necessarily -5 or +5.
+extern const char* kSingleAvgPoolModelMinMinus5MaxPlus5;
+
+// Test model with a weights variable that is shared between a convolution layer
+// and an add operation.
+extern const char* kModelWithSharedWeights;
+
+// An error reporter that fails on testing.
+class FailOnErrorReporter : public ErrorReporter {
+ public:
+  int Report(const char* format, va_list args) override;
+};
+}  // namespace internal
+}  // namespace optimize
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_TOOLS_OPTIMIZE_TEST_UTIL_H_
diff --git a/tensorflow/lite/tools/optimize/testdata/README.md b/tensorflow/lite/tools/optimize/testdata/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..21fcd32b1ee85a6a821e60c4336ecd7a32e677a0
--- /dev/null
+++ b/tensorflow/lite/tools/optimize/testdata/README.md
@@ -0,0 +1,25 @@
+# Test models for testing quantization
+
+This directory contains test models for testing quantization.
+
+## Models
+
+* `single_conv_weights_min_0_max_plus_10.bin` \
+   A floating point model with single convolution where all weights are
+   integers between [0, 10] weights are randomly distributed. It is not
+   guaranteed that min max for weights are going to appear in each channel.
+   All activations have min maxes and activations are in range [0,10].
+* `single_conv_weights_min_minus_127_max_plus_127.bin` \
+   A floating point model with a single convolution where weights of the model
+   are all integers that lie in range[-127, 127]. The weights have been put in
+   such a way that each channel has at least one weight as -127 and one weight
+   as 127. The activations are all in range: [-128, 127].
+   This means all bias computations should result in 1.0 scale.
+* `single_softmax_min_minus_5_max_5.bin` \
+   A floating point model with a single softmax. The input tensor has min
+   and max in range [-5, 5], not necessarily -5 or +5.
+* `single_avg_pool_input_min_minus_5_max_5.bin` \
+   A floating point model with a single average pool. The input tensor has min
+   and max in range [-5, 5], not necessarily -5 or +5.
+* `weight_shared_between_convs.tflite` \
+   A floating point model with two convs that have a use the same weight tensor.
diff --git a/tensorflow/lite/tools/optimize/testdata/single_avg_pool_min_minus_5_max_plus_5.bin b/tensorflow/lite/tools/optimize/testdata/single_avg_pool_min_minus_5_max_plus_5.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a65f39ee29514b27ea3af861c10dd452ab9e5ce2
Binary files /dev/null and b/tensorflow/lite/tools/optimize/testdata/single_avg_pool_min_minus_5_max_plus_5.bin differ
diff --git a/tensorflow/lite/tools/optimize/testdata/single_conv_weights_min_0_max_plus_10.bin b/tensorflow/lite/tools/optimize/testdata/single_conv_weights_min_0_max_plus_10.bin
new file mode 100644
index 0000000000000000000000000000000000000000..70cbc0620ad7222817cf241030acb98387083154
Binary files /dev/null and b/tensorflow/lite/tools/optimize/testdata/single_conv_weights_min_0_max_plus_10.bin differ
diff --git a/tensorflow/lite/tools/optimize/testdata/single_conv_weights_min_minus_127_max_plus_127.bin b/tensorflow/lite/tools/optimize/testdata/single_conv_weights_min_minus_127_max_plus_127.bin
new file mode 100644
index 0000000000000000000000000000000000000000..29b9f47097d466b65831514cec3a00f19f5cbdf3
Binary files /dev/null and b/tensorflow/lite/tools/optimize/testdata/single_conv_weights_min_minus_127_max_plus_127.bin differ
diff --git a/tensorflow/lite/tools/optimize/testdata/single_softmax_min_minus_5_max_plus_5.bin b/tensorflow/lite/tools/optimize/testdata/single_softmax_min_minus_5_max_plus_5.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3caba63492e174229ef605bfbb0d2ddeda2ba61d
Binary files /dev/null and b/tensorflow/lite/tools/optimize/testdata/single_softmax_min_minus_5_max_plus_5.bin differ
diff --git a/tensorflow/lite/tools/optimize/testdata/weight_shared_between_convs.bin b/tensorflow/lite/tools/optimize/testdata/weight_shared_between_convs.bin
new file mode 100644
index 0000000000000000000000000000000000000000..4c903c82eec32df8aa0d3462262b61daa30fc251
Binary files /dev/null and b/tensorflow/lite/tools/optimize/testdata/weight_shared_between_convs.bin differ
diff --git a/tensorflow/lite/tools/pip_package/setup.py b/tensorflow/lite/tools/pip_package/setup.py
index 64d62ee1f2d5d0cc1fa1d1804c637f8220937128..c5141c17537f355fc80b37b3a7e2ed3b2c0a2dfd 100644
--- a/tensorflow/lite/tools/pip_package/setup.py
+++ b/tensorflow/lite/tools/pip_package/setup.py
@@ -57,12 +57,22 @@ RELATIVE_MAKEFILE_PATH = os.path.join(RELATIVE_MAKE_DIR, 'Makefile')
 DOWNLOAD_SCRIPT_PATH = os.path.join(MAKE_DIR, 'download_dependencies.sh')
 
 
+# Check physical memory and if we are on a reasonable non small SOC machine
+# with more than 4GB, use all the CPUs, otherwisxe only 1.
+def get_build_cpus():
+  physical_bytes = os.sysconf('SC_PAGESIZE') * os.sysconf('SC_PHYS_PAGES')
+  if physical_bytes < (1<<30) * 4:
+    return 1
+  else:
+    return multiprocessing.cpu_count()
+
+
 def make_args(target='', quiet=True):
   """Construct make command line."""
   args = (['make', 'SHELL=/bin/bash', '-C', TENSORFLOW_DIR]
           + MAKE_CROSS_OPTIONS +
           ['-f', RELATIVE_MAKEFILE_PATH, '-j',
-           str(multiprocessing.cpu_count())])
+           str(get_build_cpus())])
   if quiet:
     args.append('--quiet')
   if target:
@@ -136,7 +146,7 @@ setup(
     long_description='\n'.join(DOCLINES[2:]),
     url='https://www.tensorflow.org/lite/',
     author='Google Inc.',
-    author_email='opensource@google.com',
+    author_email='packages@tensorflow.org',
     license='Apache 2.0',
     include_package_data=True,
     keywords='tflite tensorflow tensor machine learning',
diff --git a/tensorflow/lite/tools/verifier.cc b/tensorflow/lite/tools/verifier.cc
index 02d6e6b23cdd66c9dd87700e4be6bb2cfbee407f..ffc56c19658986e39c1d8085761120b8373c919e 100644
--- a/tensorflow/lite/tools/verifier.cc
+++ b/tensorflow/lite/tools/verifier.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/lite/tools/verifier.h"
 #include <climits>
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/lite/schema/schema_generated.h"
 #include "tensorflow/lite/string_util.h"
 #include "tensorflow/lite/version.h"
@@ -53,7 +54,7 @@ const uint32_t kMaxNumString = UINT_MAX / sizeof(int32_t) - 2;
 
 // Verifies string tensor has legit buffer contents that follow the schema
 // defined in lite/string_util.h
-bool VerifyStringTensorBuffer(const Buffer& buffer,
+bool VerifyStringTensorBuffer(const Tensor& tensor, const Buffer& buffer,
                               ErrorReporter* error_reporter) {
   uint32_t buffer_size = buffer.data()->size();
   const char* buffer_ptr = reinterpret_cast<const char*>(buffer.data()->data());
@@ -61,7 +62,8 @@ bool VerifyStringTensorBuffer(const Buffer& buffer,
   uint32_t num_strings = *GetIntPtr(buffer_ptr);
   if (num_strings > kMaxNumString) {
     ReportError(error_reporter,
-                "String tensor has invalid num of string set: %d", num_strings);
+                "String tensor %s has invalid num of string set: %d",
+                tensor.name()->c_str(), num_strings);
     return false;
   }
   uint32_t header_offsets =
@@ -69,9 +71,9 @@ bool VerifyStringTensorBuffer(const Buffer& buffer,
 
   if (buffer_size < header_offsets) {
     ReportError(error_reporter,
-                "String tensor buffer requires at least %d bytes, but is "
+                "String tensor %s buffer requires at least %d bytes, but is "
                 "allocated with %d bytes",
-                header_offsets, buffer_size);
+                tensor.name()->c_str(), header_offsets, buffer_size);
     return false;
   }
 
@@ -80,22 +82,24 @@ bool VerifyStringTensorBuffer(const Buffer& buffer,
 
   if (*GetIntPtr(buffer_ptr + offset) != header_offsets) {
     ReportError(error_reporter,
-                "String tensor buffer initial offset must be: %d",
-                header_offsets);
+                "String tensor %s buffer initial offset must be: %d",
+                tensor.name()->c_str(), header_offsets);
     return false;
   }
   offset += sizeof(int32_t);
   for (int i = 1; i <= num_strings; i++, offset += sizeof(int32_t)) {
     int string_offset = *GetIntPtr(buffer_ptr + offset);
     if (string_offset < prev_ptr || string_offset > buffer_size) {
-      ReportError(error_reporter, "String tensor buffer is invalid: index %d",
-                  i);
+      ReportError(error_reporter,
+                  "String tensor %s buffer is invalid: index %d",
+                  tensor.name()->c_str(), i);
       return false;
     }
   }
   if (*GetIntPtr(buffer_ptr + offset - sizeof(int32_t)) != buffer_size) {
-    ReportError(error_reporter, "String tensor buffer last offset must be %d",
-                buffer_size);
+    ReportError(error_reporter,
+                "String tensor %s buffer last offset must be %d",
+                tensor.name()->c_str(), buffer_size);
     return false;
   }
   return true;
@@ -105,10 +109,15 @@ bool VerifyStringTensorBuffer(const Buffer& buffer,
 bool VerifyNumericTensorBuffer(const Tensor& tensor, const Buffer& buffer,
                                ErrorReporter* error_reporter) {
   uint64_t bytes_required = 1;
+  if (!tensor.shape()) {
+    // Empty tensor. Avoid further checks.
+    return true;
+  }
   for (int dim : *tensor.shape()) {
     bytes_required *= dim;
     if (bytes_required > UINT_MAX) {
-      ReportError(error_reporter, "Tensor dimension overflow");
+      ReportError(error_reporter, "Tensor %s dimension overflow",
+                  tensor.name()->c_str());
       return false;
     }
   }
@@ -116,31 +125,36 @@ bool VerifyNumericTensorBuffer(const Tensor& tensor, const Buffer& buffer,
     case TensorType_FLOAT32:
       bytes_required *= sizeof(float);
       break;
-    case TensorType_INT32:
-      bytes_required *= sizeof(int32_t);
+    case TensorType_INT8:
+      bytes_required *= sizeof(int8_t);
       break;
     case TensorType_UINT8:
       bytes_required *= sizeof(uint8_t);
       break;
+    case TensorType_INT32:
+      bytes_required *= sizeof(int32_t);
+      break;
     case TensorType_INT64:
       bytes_required *= sizeof(int64_t);
       break;
     case TensorType_FLOAT16:
       // FALLTHROUGH_INTENDED;
     default:
-      ReportError(error_reporter, "Invalid tensor type: %d", tensor.type());
+      ReportError(error_reporter, "Tensor %s invalid type: %d",
+                  tensor.name()->c_str(), tensor.type());
       return false;
   }
   if (bytes_required > UINT_MAX) {
-    ReportError(error_reporter, "Tensor dimension overflow");
+    ReportError(error_reporter, "Tensor %s dimension overflow",
+                tensor.name()->c_str());
     return false;
   }
 
   if (bytes_required != buffer.data()->size()) {
     ReportError(
         error_reporter,
-        "Tensor requires %d bytes, but is allocated with %d bytes buffer",
-        bytes_required, buffer.data()->size());
+        "Tensor %s requires %d bytes, but is allocated with %d bytes buffer",
+        tensor.name()->c_str(), bytes_required, buffer.data()->size());
     return false;
   }
   return true;
@@ -166,6 +180,86 @@ bool VerifyOperators(const Vector<Offset<Operator>>& operators,
   return true;
 }
 
+bool IsConstantTensor(const Tensor& tensor, const Model& model) {
+  if (!tensor.buffer() || !model.buffers()) return false;
+  if (tensor.buffer() > 0 && tensor.buffer() < model.buffers()->size()) {
+    auto* buffer = model.buffers()->Get(tensor.buffer());
+    if (buffer && buffer->data()) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Performs basic consistency checks on a sub-graph.
+bool VerifySubGraphConsistency(const Model& model, const SubGraph& subgraph,
+                               ErrorReporter* error_reporter) {
+  absl::flat_hash_set<int> subgraph_input_tensors, constant_tensors,
+      variable_tensors, output_tensors;
+  for (int i = 0; i < subgraph.tensors()->Length(); ++i) {
+    const auto* tensor = subgraph.tensors()->Get(i);
+    if (IsConstantTensor(*tensor, model)) {
+      constant_tensors.insert(i);
+    } else if (tensor->is_variable()) {
+      variable_tensors.insert(i);
+    }
+  }
+  for (const int tensor_idx : *subgraph.inputs()) {
+    subgraph_input_tensors.insert(tensor_idx);
+  }
+
+  for (int op_idx = 0; op_idx < subgraph.operators()->Length(); ++op_idx) {
+    const auto* op = subgraph.operators()->Get(op_idx);
+    const auto& opcode = model.operator_codes()->Get(op->opcode_index());
+    // Check for invalid inputs by ensuring all exist in produced_tensors.
+    for (const int input_idx : *op->inputs()) {
+      if (input_idx == kOptionalTensor) continue;
+      if (constant_tensors.find(input_idx) == constant_tensors.end() &&
+          variable_tensors.find(input_idx) == variable_tensors.end() &&
+          subgraph_input_tensors.find(input_idx) ==
+              subgraph_input_tensors.end() &&
+          output_tensors.find(input_idx) == output_tensors.end()) {
+        ReportError(error_reporter,
+                    "Input tensor %d to op %d (%s) is not produced", input_idx,
+                    op_idx, EnumNameBuiltinOperator(opcode->builtin_code()));
+        return false;
+      }
+    }
+    // Check for cycles/invalid outputs by ensuring that none exist in
+    // produced_tensors.
+    for (const int output_idx : *op->outputs()) {
+      if (constant_tensors.find(output_idx) != constant_tensors.end()) {
+        ReportError(error_reporter,
+                    "Output tensor %d to op %d (%s) is a constant", output_idx,
+                    op_idx, EnumNameBuiltinOperator(opcode->builtin_code()));
+        return false;
+      } else if (variable_tensors.find(output_idx) != variable_tensors.end()) {
+        ReportError(error_reporter,
+                    "Output tensor %d to op %d (%s) is a variable", output_idx,
+                    op_idx, EnumNameBuiltinOperator(opcode->builtin_code()));
+        return false;
+      } else if (subgraph_input_tensors.find(output_idx) !=
+                 subgraph_input_tensors.end()) {
+        ReportError(error_reporter,
+                    "Output tensor %d to op %d (%s) is a subgraph input",
+                    output_idx, op_idx,
+                    EnumNameBuiltinOperator(opcode->builtin_code()));
+        return false;
+      } else if (output_tensors.find(output_idx) != output_tensors.end()) {
+        ReportError(error_reporter,
+                    "Output tensor %d to op %d (%s) is an output from "
+                    "another op. There is a cycle in the graph",
+                    output_idx, op_idx,
+                    EnumNameBuiltinOperator(opcode->builtin_code()));
+        return false;
+      }
+      // This can be an input to a subsequent op.
+      output_tensors.insert(output_idx);
+    }
+  }
+  return true;
+}
+
 bool VerifySubGraphs(const Model& model, ErrorReporter* error_reporter) {
   if (!model.subgraphs()) {
     ReportError(error_reporter, "Missing 'subgraphs' section.");
@@ -180,6 +274,10 @@ bool VerifySubGraphs(const Model& model, ErrorReporter* error_reporter) {
     if (!VerifyOperators(*subgraph->operators(), error_reporter)) {
       return false;
     }
+
+    if (!VerifySubGraphConsistency(model, *subgraph, error_reporter)) {
+      return false;
+    }
   }
   return true;
 }
@@ -203,14 +301,14 @@ bool VerifyTensors(const Model& model, ErrorReporter* error_reporter) {
         continue;
       }
       if (tensor->buffer() >= model.buffers()->size()) {
-        ReportError(error_reporter, "Invalid tensor buffer index: %d",
-                    tensor->buffer());
+        ReportError(error_reporter, "Tensor %s invalid buffer index: %d",
+                    tensor->name(), tensor->buffer());
         return false;
       }
       auto* buffer = model.buffers()->Get(tensor->buffer());
       if (!buffer) {
-        ReportError(error_reporter, "Tensor buffer %d not set",
-                    tensor->buffer());
+        ReportError(error_reporter, "Tensor %s buffer %d not set",
+                    tensor->name(), tensor->buffer());
         return false;
       }
 
@@ -218,7 +316,7 @@ bool VerifyTensors(const Model& model, ErrorReporter* error_reporter) {
       // buffers will be allocated by the interpreter at run-time.
       if (buffer->data()) {
         if (tensor->type() == TensorType_STRING) {
-          if (!VerifyStringTensorBuffer(*buffer, error_reporter)) {
+          if (!VerifyStringTensorBuffer(*tensor, *buffer, error_reporter)) {
             return false;
           }
         } else {
diff --git a/tensorflow/lite/tools/verifier_test.cc b/tensorflow/lite/tools/verifier_test.cc
index 98abafad927ae45cd7de428d0011e234f345dd6e..7d67d9430acf10bf1fadd0703f496e9bb3af0adc 100644
--- a/tensorflow/lite/tools/verifier_test.cc
+++ b/tensorflow/lite/tools/verifier_test.cc
@@ -17,7 +17,9 @@ limitations under the License.
 
 #include "flatbuffers/flatbuffers.h"
 #include "flatbuffers/util.h"
+#include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "tensorflow/core/framework/numeric_types.h"
 #include "tensorflow/lite/allocation.h"
 #include "tensorflow/lite/error_reporter.h"
 #include "tensorflow/lite/op_resolver.h"
@@ -25,13 +27,29 @@ limitations under the License.
 #include "tensorflow/lite/testing/util.h"
 #include "tensorflow/lite/tools/verifier.h"
 #include "tensorflow/lite/version.h"
-#include "tensorflow/core/framework/numeric_types.h"
 
 namespace tflite {
 
 using flatbuffers::FlatBufferBuilder;
 using flatbuffers::Offset;
 
+class MockErrorReporter : public ErrorReporter {
+ public:
+  MockErrorReporter() : buffer_size_(0) {}
+  int Report(const char* format, va_list args) override {
+    buffer_size_ = vsnprintf(buffer_, kBufferSize, format, args);
+    return buffer_size_;
+  }
+  int GetBufferSize() { return buffer_size_; }
+
+  string GetAsString() const { return string(buffer_, buffer_size_); }
+
+ private:
+  static constexpr int kBufferSize = 256;
+  char buffer_[kBufferSize];
+  int buffer_size_;
+};
+
 // Build single subgraph model.
 class TfLiteFlatbufferModelBuilder {
  public:
@@ -54,14 +72,22 @@ class TfLiteFlatbufferModelBuilder {
   }
 
   void AddTensor(const std::vector<int>& shape, tflite::TensorType type,
-                 const std::vector<uint8_t>& buffer, const char* name) {
+                 const std::vector<uint8_t>& buffer, const char* name,
+                 const bool is_variable = false) {
     int buffer_index = 0;
     if (!buffer.empty()) {
       buffer_index = buffers_.size();
       buffers_.push_back(CreateBuffer(builder_, builder_.CreateVector(buffer)));
     }
+    if (shape.empty()) {
+      tensors_.push_back(CreateTensorDirect(builder_, /*shape=*/nullptr, type,
+                                            buffer_index, name,
+                                            /*quantization=*/0, is_variable));
+      return;
+    }
     tensors_.push_back(CreateTensorDirect(builder_, &shape, type, buffer_index,
-                                          name, /*quantization=*/0));
+                                          name, /*quantization=*/0,
+                                          is_variable));
   }
 
   void AddOperator(const std::vector<int32_t>& inputs,
@@ -92,13 +118,16 @@ class TfLiteFlatbufferModelBuilder {
 
   bool Verify() {
     return tflite::Verify(builder_.GetBufferPointer(), builder_.GetSize(),
-                          resolver_, DefaultErrorReporter());
+                          resolver_, &mock_reporter_);
   }
 
+  string GetErrorString() { return mock_reporter_.GetAsString(); }
+
  private:
   FlatBufferBuilder builder_;
   MutableOpResolver resolver_;
   TfLiteRegistration fake_op_;
+  MockErrorReporter mock_reporter_;
   std::vector<Offset<Operator>> operators_;
   std::vector<Offset<OperatorCode>> operator_codes_;
   std::vector<Offset<Tensor>> tensors_;
@@ -112,8 +141,25 @@ TEST(VerifyModel, TestEmptyModel) {
                            /*description=*/0, /*buffers=*/0);
   ::tflite::FinishModelBuffer(builder, model);
 
+  MockErrorReporter mock_reporter;
   ASSERT_FALSE(Verify(builder.GetBufferPointer(), builder.GetSize(),
-                      MutableOpResolver{}, DefaultErrorReporter()));
+                      MutableOpResolver{}, &mock_reporter));
+  EXPECT_THAT(mock_reporter.GetAsString(),
+              ::testing::ContainsRegex("Missing 'subgraphs' section."));
+}
+
+TEST(VerifyModel, TestEmptyVector) {
+  TfLiteFlatbufferModelBuilder builder({}, {"test"});
+  builder.AddOperator({0, 1}, {3}, BuiltinOperator_CUSTOM, "test");
+  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4, 5, 6}, "input");
+  builder.AddTensor({}, TensorType_UINT8, {}, "empty_vector");
+  builder.AddTensor(
+      {2}, TensorType_STRING,
+      {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 19, 0, 0, 0, 'A', 'B', 'C'},
+      "data");
+  builder.AddTensor({2, 3}, TensorType_INT32, {}, "output");
+  builder.FinishModel({0, 1}, {3});
+  ASSERT_TRUE(builder.Verify());
 }
 
 TEST(VerifyModel, TestSimpleModel) {
@@ -127,12 +173,16 @@ TEST(VerifyModel, TestSimpleModel) {
   builder.AddTensor({2, 3}, TensorType_INT32, {}, "output");
   builder.FinishModel({0, 1}, {2});
   ASSERT_TRUE(builder.Verify());
+  EXPECT_EQ("", builder.GetErrorString());
 }
 
 TEST(VerifyModel, TestCorruptedData) {
   std::string model = "123";
-  ASSERT_FALSE(Verify(model.data(), model.size(), MutableOpResolver{},
-                      /*error_reporter=*/nullptr));
+  MockErrorReporter mock_reporter;
+  ASSERT_FALSE(
+      Verify(model.data(), model.size(), MutableOpResolver{}, &mock_reporter));
+  EXPECT_THAT(mock_reporter.GetAsString(),
+              ::testing::ContainsRegex("Invalid flatbuffer format"));
 }
 
 TEST(VerifyModel, TestUnsupportedVersion) {
@@ -140,8 +190,11 @@ TEST(VerifyModel, TestUnsupportedVersion) {
   auto model = CreateModel(builder, /*version=*/1, /*operator_codes=*/0,
                            /*subgraphs=*/0, /*description=*/0, /*buffers=*/0);
   ::tflite::FinishModelBuffer(builder, model);
+  MockErrorReporter mock_reporter;
   ASSERT_FALSE(Verify(builder.GetBufferPointer(), builder.GetSize(),
-                      MutableOpResolver{}, DefaultErrorReporter()));
+                      MutableOpResolver{}, &mock_reporter));
+  EXPECT_THAT(mock_reporter.GetAsString(),
+              ::testing::ContainsRegex("Invalid model version 1"));
 }
 
 TEST(VerifyModel, TestRandomModificationIsNotAllowed) {
@@ -166,6 +219,9 @@ TEST(VerifyModel, TestIntTensorShapeIsGreaterThanBuffer) {
   builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4}, "input");
   builder.FinishModel({}, {});
   ASSERT_FALSE(builder.Verify());
+  EXPECT_THAT(builder.GetErrorString(),
+              ::testing::ContainsRegex("Tensor input requires 6 bytes, but is "
+                                       "allocated with 4 bytes buffer"));
 }
 
 TEST(VerifyModel, TestIntTensorShapeIsSmallerThanBuffer) {
@@ -173,6 +229,9 @@ TEST(VerifyModel, TestIntTensorShapeIsSmallerThanBuffer) {
   builder.AddTensor({2, 1}, TensorType_UINT8, {1, 2, 3, 4}, "input");
   builder.FinishModel({}, {});
   ASSERT_FALSE(builder.Verify());
+  EXPECT_THAT(builder.GetErrorString(),
+              ::testing::ContainsRegex("Tensor input requires 2 bytes, but is "
+                                       "allocated with 4 bytes buffer"));
 }
 
 TEST(VerifyModel, TestIntTensorShapeOverflow) {
@@ -181,6 +240,8 @@ TEST(VerifyModel, TestIntTensorShapeOverflow) {
                     "input");
   builder.FinishModel({}, {});
   ASSERT_FALSE(builder.Verify());
+  EXPECT_THAT(builder.GetErrorString(),
+              ::testing::ContainsRegex("Tensor input dimension overflow"));
 }
 
 TEST(VerifyModel, TensorBufferIsNotValid) {
@@ -203,8 +264,12 @@ TEST(VerifyModel, TensorBufferIsNotValid) {
                            builder.CreateString("SmartReply"), buffers);
 
   ::tflite::FinishModelBuffer(builder, model);
+  MockErrorReporter mock_reporter;
   ASSERT_FALSE(Verify(builder.GetBufferPointer(), builder.GetSize(),
-                      MutableOpResolver{}, DefaultErrorReporter()));
+                      MutableOpResolver{}, &mock_reporter));
+  EXPECT_THAT(
+      mock_reporter.GetAsString(),
+      ::testing::ContainsRegex("Missing 'operators' section in subgraph."));
 }
 
 TEST(VerifyModel, StringTensorHasInvalidNumString) {
@@ -215,6 +280,11 @@ TEST(VerifyModel, StringTensorHasInvalidNumString) {
       "input");
   builder.FinishModel({}, {});
   ASSERT_FALSE(builder.Verify());
+  EXPECT_THAT(
+      builder.GetErrorString(),
+      ::testing::ContainsRegex(
+          "String tensor input buffer requires at least -2147483640 bytes, "
+          "but is allocated with 18 bytes"));
 }
 
 TEST(VerifyModel, StringTensorOffsetTooSmall) {
@@ -224,6 +294,9 @@ TEST(VerifyModel, StringTensorOffsetTooSmall) {
       {2, 0, 0, 0, 12, 0, 0, 0, 17, 0, 0, 0, 18, 0, 0, 0, 'A', 'B'}, "input");
   builder.FinishModel({}, {});
   ASSERT_FALSE(builder.Verify());
+  EXPECT_THAT(builder.GetErrorString(),
+              ::testing::ContainsRegex(
+                  "String tensor input buffer initial offset must be: 16"));
 }
 
 TEST(VerifyModel, StringTensorOffsetOutOfRange) {
@@ -233,6 +306,9 @@ TEST(VerifyModel, StringTensorOffsetOutOfRange) {
       {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 22, 0, 0, 0, 'A', 'B'}, "input");
   builder.FinishModel({}, {});
   ASSERT_FALSE(builder.Verify());
+  EXPECT_THAT(builder.GetErrorString(),
+              ::testing::ContainsRegex(
+                  "String tensor input buffer is invalid: index 2"));
 }
 
 TEST(VerifyModel, StringTensorIsLargerThanRequired) {
@@ -243,37 +319,144 @@ TEST(VerifyModel, StringTensorIsLargerThanRequired) {
       "input");
   builder.FinishModel({}, {});
   ASSERT_FALSE(builder.Verify());
+  EXPECT_THAT(builder.GetErrorString(),
+              ::testing::ContainsRegex(
+                  "String tensor input buffer last offset must be 19"));
 }
 
 TEST(VerifyModel, AllOpsAreSupported) {
   TfLiteFlatbufferModelBuilder builder({BuiltinOperator_ADD}, {"CustomOp"});
-  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4}, "input1");
-  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4}, "input2");
-  builder.AddTensor({2, 3}, TensorType_UINT8, {}, "output");
+  builder.AddTensor({2, 2}, TensorType_UINT8, {1, 2, 3, 4}, "input1");
+  builder.AddTensor({2, 2}, TensorType_UINT8, {1, 2, 3, 4}, "input2");
+  builder.AddTensor({2, 2}, TensorType_UINT8, {}, "output1");
+  builder.AddTensor({2, 2}, TensorType_UINT8, {}, "output2");
   builder.AddOperator({0, 1}, {2}, BuiltinOperator_ADD, nullptr);
-  builder.AddOperator({0, 1}, {2}, BuiltinOperator_CUSTOM, "CustomOp");
+  builder.AddOperator({0, 1}, {3}, BuiltinOperator_CUSTOM, "CustomOp");
   builder.FinishModel({}, {});
-  ASSERT_FALSE(builder.Verify());
+  ASSERT_TRUE(builder.Verify());
+  EXPECT_EQ("", builder.GetErrorString());
 }
 
 TEST(VerifyModel, UseUnsupportedBuiltinOps) {
   TfLiteFlatbufferModelBuilder builder({BuiltinOperator_SUB}, {"CustomOp"});
-  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4}, "input1");
-  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4}, "input2");
-  builder.AddTensor({2, 3}, TensorType_UINT8, {}, "output");
+  builder.AddTensor({2, 2}, TensorType_UINT8, {1, 2, 3, 4}, "input1");
+  builder.AddTensor({2, 2}, TensorType_UINT8, {1, 2, 3, 4}, "input2");
+  builder.AddTensor({2, 2}, TensorType_UINT8, {}, "output");
   builder.AddOperator({0, 1}, {2}, BuiltinOperator_ADD, nullptr);
   builder.FinishModel({}, {});
   ASSERT_FALSE(builder.Verify());
+  EXPECT_THAT(
+      builder.GetErrorString(),
+      ::testing::ContainsRegex("Unsupported builtin op: ADD, version: 1"));
 }
 
 TEST(VerifyModel, UseUnsupportedCustomOps) {
   TfLiteFlatbufferModelBuilder builder({BuiltinOperator_ADD}, {"NewOp"});
-  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4}, "input1");
-  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4}, "input2");
-  builder.AddTensor({2, 3}, TensorType_UINT8, {}, "output");
+  builder.AddTensor({2, 2}, TensorType_UINT8, {1, 2, 3, 4}, "input1");
+  builder.AddTensor({2, 2}, TensorType_UINT8, {1, 2, 3, 4}, "input2");
+  builder.AddTensor({2, 2}, TensorType_UINT8, {}, "output");
   builder.AddOperator({0, 1}, {2}, BuiltinOperator_CUSTOM, "Not supported");
   builder.FinishModel({}, {});
   ASSERT_FALSE(builder.Verify());
+  EXPECT_THAT(builder.GetErrorString(),
+              ::testing::ContainsRegex(
+                  "Unsupported custom op: Not supported, version: 1"));
+}
+
+TEST(VerifyModel, UnpopulatedInputToOp) {
+  TfLiteFlatbufferModelBuilder builder({}, {"test"});
+  builder.AddOperator({1, 2}, {3}, BuiltinOperator_CUSTOM, "test");
+  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4, 5, 6}, "input");
+  // This tensor will never be populated.
+  builder.AddTensor({2, 3}, TensorType_UINT8, {}, "invalid_input");
+  builder.AddTensor(
+      {2}, TensorType_STRING,
+      {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 19, 0, 0, 0, 'A', 'B', 'C'},
+      "data");
+  builder.AddTensor({2, 3}, TensorType_INT32, {}, "output");
+  builder.FinishModel({0, 2}, {3});
+  ASSERT_FALSE(builder.Verify());
+  EXPECT_EQ("Input tensor 1 to op 0 (CUSTOM) is not produced",
+            builder.GetErrorString());
+}
+
+TEST(VerifyModel, MultipleOpsOutputToSameTensor) {
+  TfLiteFlatbufferModelBuilder builder({BuiltinOperator_ADD}, {"CustomOp"});
+  builder.AddTensor({2, 2}, TensorType_UINT8, {1, 2, 3, 4}, "input1");
+  builder.AddTensor({2, 2}, TensorType_UINT8, {1, 2, 3, 4}, "input2");
+  builder.AddTensor({2, 2}, TensorType_UINT8, {}, "output1");
+  builder.AddOperator({0, 1}, {2}, BuiltinOperator_ADD, nullptr);
+  // This can't output to "output1", since the first operator does that.
+  builder.AddOperator({0, 1}, {2}, BuiltinOperator_CUSTOM, "CustomOp");
+  builder.FinishModel({}, {});
+  ASSERT_FALSE(builder.Verify());
+  EXPECT_EQ(
+      "Output tensor 2 to op 1 (CUSTOM) is an output from another op. "
+      "There is a cycle in the graph",
+      builder.GetErrorString());
+}
+
+TEST(VerifyModel, OutputIsAConstantTensor) {
+  TfLiteFlatbufferModelBuilder builder({}, {"test"});
+  builder.AddOperator({0, 1}, {2}, BuiltinOperator_CUSTOM, "test");
+  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4, 5, 6}, "input");
+  builder.AddTensor(
+      {2}, TensorType_STRING,
+      {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 19, 0, 0, 0, 'A', 'B', 'C'},
+      "data");
+  // Output shouldn't be populated with constant value.
+  builder.AddTensor({2, 3}, TensorType_INT32, {1, 2, 3, 4, 5, 6}, "output");
+  builder.FinishModel({0, 1}, {2});
+  ASSERT_FALSE(builder.Verify());
+  EXPECT_EQ("Output tensor 2 to op 0 (CUSTOM) is a constant",
+            builder.GetErrorString());
+}
+
+TEST(VerifyModel, OutputIsSubgraphInput) {
+  TfLiteFlatbufferModelBuilder builder({}, {"test"});
+  builder.AddOperator({0, 1}, {2}, BuiltinOperator_CUSTOM, "test");
+  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4, 5, 6}, "input");
+  builder.AddTensor(
+      {2}, TensorType_STRING,
+      {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 19, 0, 0, 0, 'A', 'B', 'C'},
+      "data");
+  builder.AddTensor({2, 3}, TensorType_INT32, {}, "output");
+  // Output shouldn't be a subgraph input.
+  builder.FinishModel({0, 1, 2}, {2});
+  ASSERT_FALSE(builder.Verify());
+  EXPECT_EQ("Output tensor 2 to op 0 (CUSTOM) is a subgraph input",
+            builder.GetErrorString());
+}
+
+TEST(VerifyModel, OutputIsAVariable) {
+  TfLiteFlatbufferModelBuilder builder({}, {"test"});
+  builder.AddOperator({0, 1}, {2}, BuiltinOperator_CUSTOM, "test");
+  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4, 5, 6}, "input");
+  builder.AddTensor(
+      {2}, TensorType_STRING,
+      {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 19, 0, 0, 0, 'A', 'B', 'C'},
+      "data");
+  // Output shouldn't be a variable.
+  builder.AddTensor({2, 3}, TensorType_INT32, {}, "output", /*variable*/ true);
+  builder.FinishModel({0, 1}, {2});
+  ASSERT_FALSE(builder.Verify());
+  EXPECT_EQ("Output tensor 2 to op 0 (CUSTOM) is a variable",
+            builder.GetErrorString());
+}
+
+TEST(VerifyModel, OpWithOptionalTensor) {
+  TfLiteFlatbufferModelBuilder builder({}, {"test"});
+  builder.AddOperator({kOptionalTensor, 0, 1}, {2}, BuiltinOperator_CUSTOM,
+                      "test");
+  builder.AddTensor({2, 3}, TensorType_UINT8, {1, 2, 3, 4, 5, 6}, "input");
+  builder.AddTensor(
+      {2}, TensorType_STRING,
+      {2, 0, 0, 0, 16, 0, 0, 0, 17, 0, 0, 0, 19, 0, 0, 0, 'A', 'B', 'C'},
+      "data");
+  builder.AddTensor({2, 3}, TensorType_INT32, {}, "output");
+  builder.FinishModel({0, 1}, {2});
+  ASSERT_TRUE(builder.Verify());
+  EXPECT_EQ("", builder.GetErrorString());
 }
 
 // TODO(yichengfan): make up malicious files to test with.
diff --git a/tensorflow/lite/tutorials/post_training_quant.ipynb b/tensorflow/lite/tutorials/post_training_quant.ipynb
index 394ab0760b5672978e0638c0ff01a8f00442302c..8bc02eedf68551036cf81eba568118e6f7e32639 100644
--- a/tensorflow/lite/tutorials/post_training_quant.ipynb
+++ b/tensorflow/lite/tutorials/post_training_quant.ipynb
@@ -301,7 +301,7 @@
         "id": "7BONhYtYocQY"
       },
       "source": [
-        "To quantize the model on export, set the `post_training_quantize` flag:"
+        "To quantize the model on export, set the `optimizations` flag to optimize for size:"
       ]
     },
     {
@@ -313,11 +313,11 @@
         "id": "g8PUvLWDlmmz"
       },
       "outputs": [],
-      "source": [
+     "source": [
         "# Note: If you don't have a recent tf-nightly installed, the\n",
-        "# \"post_training_quantize\" line will have no effect.\n",
+        "# \"optimizations\" line will have no effect.\n",
         "tf.logging.set_verbosity(tf.logging.INFO)\n",
-        "converter.post_training_quantize = True\n",
+        "converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]\n",
         "tflite_quant_model = converter.convert()\n",
         "tflite_model_quant_file = tflite_models_dir/\"mnist_model_quant.tflite\"\n",
         "tflite_model_quant_file.write_bytes(tflite_quant_model)"
@@ -329,8 +329,8 @@
         "colab_type": "text",
         "id": "PhMmUTl4sbkz"
       },
-      "source": [
-        "Note how the resulting file, with `post_training_quantize` set, is approximately `1/4` the size."
+    "source": [
+        "Note how the resulting file, is approximately `1/4` the size."
       ]
     },
     {
@@ -383,7 +383,7 @@
       "source": [
         "import numpy as np\n",
         "mnist_train, mnist_test = tf.keras.datasets.mnist.load_data()\n",
-        "images, labels = tf.to_float(mnist_test[0])/255.0, mnist_test[1]\n",
+        "images, labels = tf.cast(mnist_test[0], tf.float32)/255.0, mnist_test[1]\n",
         "\n",
         "# Note: If you change the batch size, then use \n",
         "# `tf.lite.Interpreter.resize_tensor_input` to also change it for\n",
@@ -489,7 +489,7 @@
         "plt.imshow(img[0])\n",
         "template = \"True:{true}, predicted:{predict}\"\n",
         "_ = plt.title(template.format(true= str(label[0].numpy()),\n",
-        "                              predict=str(predictions[0,0])))\n",
+        "                              predict=str(predictions[0])))\n",
         "plt.grid(False)"
       ]
     },
@@ -650,7 +650,7 @@
         "output_arrays = [\"output\"]\n",
         "converter = tf.lite.TFLiteConverter.from_frozen_graph(\n",
         "  str(graph_def_file), input_arrays, output_arrays, input_shapes={\"input\":[1,299,299,3]})\n",
-        "converter.post_training_quantize = True\n",
+        "converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]\n",
         "resnet_tflite_file = graph_def_file.parent/\"resnet_v2_101_quantized.tflite\"\n",
         "resnet_tflite_file.write_bytes(converter.convert())\n"
       ]
diff --git a/tensorflow/opensource_only.files b/tensorflow/opensource_only.files
index 688a837dac3fe7db6badfa9688ca7640c7658c7f..9264939b3b886858b53b2ac5e893732c80898095 100644
--- a/tensorflow/opensource_only.files
+++ b/tensorflow/opensource_only.files
@@ -1,3 +1,31 @@
+tensorflow/contrib/tpu/profiler/pip_package/BUILD
+tensorflow/contrib/tpu/profiler/pip_package/setup.py
+tensorflow/contrib/tpu/profiler/pip_package/README
+tensorflow/contrib/tpu/profiler/pip_package/build_pip_package.sh
+tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
+tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/__init__.py
+tensorflow/contrib/mpi/BUILD
+tensorflow/tools/ci_build/remote/BUILD
+tensorflow/tools/pip_package/README
+tensorflow/tools/pip_package/MANIFEST.in
+tensorflow/tools/pip_package/simple_console.py
+tensorflow/tools/pip_package/build_pip_package.sh
+tensorflow/tools/pip_package/check_load_py_test.py
+tensorflow/tools/pip_package/pip_smoke_test.py
+tensorflow/tools/pip_package/simple_console_for_windows.py
+tensorflow/tools/pip_package/setup.py
+tensorflow/tools/pip_package/BUILD
+tensorflow/tools/lib_package/concat_licenses.sh
+tensorflow/tools/lib_package/libtensorflow_test.c
+tensorflow/tools/lib_package/LibTensorFlowTest.java
+tensorflow/tools/lib_package/BUILD
+tensorflow/tools/lib_package/libtensorflow_test.sh
+tensorflow/tools/lib_package/README.md
+tensorflow/tools/lib_package/libtensorflow_java_test.sh
+tensorflow/tools/def_file_filter/def_file_filter_configure.bzl
+tensorflow/tools/def_file_filter/BUILD
+tensorflow/tools/def_file_filter/BUILD.tpl
+tensorflow/tools/def_file_filter/def_file_filter.py.tpl
 tensorflow/third_party/mkl/MKL_LICENSE
 tensorflow/third_party/mkl/LICENSE
 tensorflow/third_party/mkl/BUILD
@@ -9,6 +37,9 @@ tensorflow/third_party/toolchains/clang6/README.md
 tensorflow/third_party/toolchains/clang6/repo.bzl
 tensorflow/third_party/toolchains/clang6/CROSSTOOL.tpl
 tensorflow/third_party/toolchains/clang6/clang.BUILD
+tensorflow/third_party/toolchains/preconfig/ubuntu14.04/gcc7-nvcc-cuda10.0/BUILD
+tensorflow/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/BUILD
+tensorflow/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/build_defs.bzl
 tensorflow/third_party/toolchains/preconfig/ubuntu14.04/py3/BUILD
 tensorflow/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/BUILD
 tensorflow/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc/BUILD
@@ -17,11 +48,14 @@ tensorflow/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/BUI
 tensorflow/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/BUILD
 tensorflow/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/build_defs.bzl
 tensorflow/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/BUILD
-tensorflow/third_party/toolchains/preconfig/ubuntu14.04/nccl2/BUILD
 tensorflow/third_party/toolchains/preconfig/generate/workspace.bzl
 tensorflow/third_party/toolchains/preconfig/generate/containers.bzl
 tensorflow/third_party/toolchains/preconfig/generate/generate.bzl
+tensorflow/third_party/toolchains/preconfig/generate/archives.bzl
 tensorflow/third_party/toolchains/preconfig/generate/BUILD
+tensorflow/third_party/toolchains/preconfig/ubuntu16.04/clang/BUILD
+tensorflow/third_party/toolchains/preconfig/ubuntu16.04/clang/dummy_toolchain.bzl
+tensorflow/third_party/toolchains/preconfig/ubuntu16.04/py3/BUILD
 tensorflow/third_party/toolchains/preconfig/win_1803/bazel_018/BUILD
 tensorflow/third_party/toolchains/preconfig/win_1803/bazel_018/dummy_toolchain.bzl
 tensorflow/third_party/toolchains/preconfig/win_1803/py36/BUILD
@@ -37,30 +71,24 @@ tensorflow/third_party/toolchains/cpus/arm/CROSSTOOL.tpl
 tensorflow/third_party/toolchains/cpus/arm/BUILD
 tensorflow/third_party/toolchains/cpus/py3/BUILD
 tensorflow/third_party/toolchains/cpus/py/BUILD
+tensorflow/third_party/toolchains/remote/configure.bzl
+tensorflow/third_party/toolchains/remote/BUILD.tpl
+tensorflow/third_party/toolchains/remote/BUILD
+tensorflow/third_party/toolchains/remote/execution.bzl.tpl
 tensorflow/third_party/toolchains/BUILD
-tensorflow/third_party/nccl/remote.BUILD.tpl
-tensorflow/third_party/nccl/archive.BUILD
-tensorflow/third_party/nccl/LICENSE
-tensorflow/third_party/nccl/system.BUILD.tpl
-tensorflow/third_party/nccl/nccl_configure.bzl
-tensorflow/third_party/nccl/build_defs.bzl.tpl
-tensorflow/third_party/nccl/BUILD
 tensorflow/third_party/gpus/BUILD
 tensorflow/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl
 tensorflow/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl
 tensorflow/third_party/gpus/crosstool/CROSSTOOL.tpl
 tensorflow/third_party/gpus/crosstool/CROSSTOOL_hipcc.tpl
 tensorflow/third_party/gpus/crosstool/LICENSE
-tensorflow/third_party/gpus/crosstool/remote.BUILD.tpl
 tensorflow/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl
-tensorflow/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.bat.tpl
 tensorflow/third_party/gpus/crosstool/BUILD.tpl
 tensorflow/third_party/gpus/crosstool/BUILD
 tensorflow/third_party/gpus/cuda/LICENSE
 tensorflow/third_party/gpus/cuda/BUILD.tpl
 tensorflow/third_party/gpus/cuda/BUILD.windows.tpl
 tensorflow/third_party/gpus/cuda/cuda_config.h.tpl
-tensorflow/third_party/gpus/cuda/remote.BUILD.tpl
 tensorflow/third_party/gpus/cuda/BUILD
 tensorflow/third_party/gpus/cuda/build_defs.bzl.tpl
 tensorflow/third_party/gpus/rocm/rocm_config.h.tpl
@@ -92,6 +120,7 @@ tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProdu
 tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/ThreadPool
 tensorflow/third_party/eigen3/unsupported/Eigen/SpecialFunctions
 tensorflow/third_party/eigen3/unsupported/Eigen/MatrixFunctions
+tensorflow/third_party/eigen3/gpu_packet_math.patch
 tensorflow/third_party/eigen3/LICENSE
 tensorflow/third_party/eigen3/BUILD
 tensorflow/third_party/systemlibs/build_defs.bzl.tpl
@@ -145,6 +174,12 @@ tensorflow/third_party/llvm/expand_cmake_vars.py
 tensorflow/third_party/llvm/llvm.autogenerated.BUILD
 tensorflow/third_party/llvm/llvm.bzl
 tensorflow/third_party/icu/udata.patch
+tensorflow/third_party/nccl/archive.BUILD
+tensorflow/third_party/nccl/LICENSE
+tensorflow/third_party/nccl/system.BUILD.tpl
+tensorflow/third_party/nccl/nccl_configure.bzl
+tensorflow/third_party/nccl/build_defs.bzl.tpl
+tensorflow/third_party/nccl/BUILD
 tensorflow/third_party/fft2d/BUILD
 tensorflow/third_party/fft2d/fft.h
 tensorflow/third_party/fft2d/LICENSE
@@ -179,10 +214,10 @@ tensorflow/third_party/git/BUILD.tpl
 tensorflow/third_party/git/BUILD
 tensorflow/third_party/git/git_configure.bzl
 tensorflow/third_party/protobuf/BUILD
+tensorflow/third_party/enum34.BUILD
 tensorflow/third_party/tflite_mobilenet.BUILD
 tensorflow/third_party/py/BUILD
 tensorflow/third_party/py/BUILD.tpl
-tensorflow/third_party/py/remote.BUILD.tpl
 tensorflow/third_party/py/numpy/BUILD
 tensorflow/third_party/py/python_configure.bzl
 tensorflow/third_party/termcolor.BUILD
@@ -207,4 +242,10 @@ tensorflow/third_party/jsoncpp.BUILD
 tensorflow/third_party/tflite_ovic_testdata.BUILD
 tensorflow/third_party/libxsmm.BUILD
 tensorflow/third_party/zlib.BUILD
-tensorflow/third_party/eigen.BUILD
\ No newline at end of file
+tensorflow/third_party/eigen.BUILD
+tensorflow/stream_executor/build_defs.bzl
+tensorflow/api_template_v1.__init__.py
+tensorflow/compat_template_v1.__init__.py
+tensorflow/compat_template.__init__.py
+tensorflow/api_template.__init__.py
+tensorflow/__init__.py
\ No newline at end of file
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 0a3ee65bc48013971c857fc5fb04f397c3edd2aa..5a8935a6b9f5f6c18510ff83dfaa186034318ec4 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -6,10 +6,12 @@
 
 visibility = [
     "//engedu/ml/tf_from_scratch:__pkg__",
+    "//third_party/cloud_tpu/convergence_tools:__subpackages__",
     "//tensorflow:internal",
     "//tensorflow/lite/toco/python:__pkg__",
     "//tensorflow_models:__subpackages__",
     "//tensorflow_model_optimization:__subpackages__",
+    "//third_party/py/cleverhans:__subpackages__",
     # TODO(aselle): to pass open source test.
     "//bazel_pip/tensorflow/lite/toco/python:__pkg__",
 ]
@@ -22,6 +24,7 @@ exports_files(["LICENSE"])
 
 exports_files(["platform/base.i"])
 
+load("//tensorflow:tensorflow.bzl", "if_not_v2")
 load("//tensorflow:tensorflow.bzl", "if_not_windows")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_library")
 load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
@@ -68,8 +71,29 @@ py_library(
     ],
     deps = [
         ":no_contrib",
-        "//tensorflow/contrib:contrib_py",
         "//tensorflow/python/estimator:estimator_py",
+    ] + if_not_v2(["//tensorflow/contrib:contrib_py"]),
+)
+
+py_library(
+    name = "keras_lib",
+    srcs_version = "PY2AND3",
+    visibility = [
+        "//tensorflow:__pkg__",
+        "//tensorflow:internal",
+        "//tensorflow/python/estimator:__subpackages__",
+        "//tensorflow/python/keras:__subpackages__",
+        "//tensorflow/python/tools:__pkg__",
+        "//tensorflow/python/tools/api/generator:__pkg__",
+        "//tensorflow/tools/api/tests:__pkg__",
+        "//tensorflow/tools/compatibility/update:__pkg__",
+        "//tensorflow_estimator:__subpackages__",
+    ],
+    deps = [
+        ":rnn",
+        "//tensorflow/python:layers",
+        "//tensorflow/python/feature_column:feature_column_py",
+        "//tensorflow/python/keras",
     ],
 )
 
@@ -80,6 +104,7 @@ py_library(
     visibility = [
         "//tensorflow:__pkg__",
         "//tensorflow/python/estimator:__subpackages__",
+        "//tensorflow/python/keras:__subpackages__",
         "//tensorflow/python/tools:__pkg__",
         "//tensorflow/python/tools/api/generator:__pkg__",
         "//tensorflow/tools/api/tests:__pkg__",
@@ -87,11 +112,13 @@ py_library(
     ],
     deps = [
         ":array_ops",
+        ":audio_ops_gen",
         ":bitwise_ops",
         ":boosted_trees_ops",
         ":check_ops",
         ":client",
         ":client_testlib",
+        ":clustering_ops",
         ":collective_ops",
         ":cond_v2",
         ":confusion_matrix",
@@ -108,11 +135,12 @@ py_library(
         ":image_ops",
         ":initializers_ns",
         ":io_ops",
+        ":keras_lib",
         ":kernels",
-        ":layers",
         ":lib",
         ":list_ops",
         ":manip_ops",
+        ":map_fn",
         ":math_ops",
         ":metrics",
         ":nccl_ops",
@@ -144,12 +172,16 @@ py_library(
         "//tensorflow/core:protos_all_py",
         "//tensorflow/lite/python:lite",
         "//tensorflow/python/compat",
+        "//tensorflow/python/compat:v2_compat",
+        "//tensorflow/python/compiler",
         "//tensorflow/python/data",
         "//tensorflow/python/distribute",
         "//tensorflow/python/distribute:estimator_training",
         "//tensorflow/python/eager:def_function",
-        "//tensorflow/python/feature_column:feature_column_py",
-        "//tensorflow/python/keras",
+        "//tensorflow/python/eager:profiler",
+        "//tensorflow/python/eager:profiler_client",
+        "//tensorflow/python/eager:remote",
+        "//tensorflow/python/module",
         "//tensorflow/python/ops/distributions",
         "//tensorflow/python/ops/linalg",
         "//tensorflow/python/ops/losses",
@@ -184,6 +216,7 @@ py_library(
         ":pywrap_tensorflow",
         ":util",
         "//tensorflow/core:protos_all_py",
+        "@absl_py//absl:app",
         "@absl_py//absl/flags",
         "@six_archive//:six",
     ],
@@ -204,7 +237,10 @@ py_library(
     name = "platform_test",
     srcs = ["platform/googletest.py"],
     srcs_version = "PY2AND3",
-    deps = [":platform_benchmark"],
+    deps = [
+        ":platform_benchmark",
+        "@absl_py//absl/testing:absltest",
+    ],
 )
 
 tf_py_test(
@@ -470,19 +506,11 @@ tf_cc_shared_object(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "file_system_test",
     size = "small",
     srcs = ["framework/file_system_test.py"],
-    data = [":framework/test_file_system.so"],
-    main = "framework/file_system_test.py",
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",  # Path issues due to test environment
-        "no_windows",
-        "notap",
-    ],
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":data_flow_ops",
         ":framework",
@@ -491,57 +519,59 @@ py_test(
         ":platform",
         ":util",
     ],
+    data = [":framework/test_file_system.so"],
+    main = "framework/file_system_test.py",
+    tags = [
+        "no_pip",  # Path issues due to test environment
+        "no_windows",
+        "notap",
+    ],
 )
 
-py_test(
+tf_py_test(
     name = "decorator_utils_test",
     srcs = ["util/decorator_utils_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":platform",
         ":util",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "tf_export_test",
     srcs = ["util/tf_export_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":platform",
         ":util",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "deprecation_test",
     srcs = ["util/deprecation_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":platform",
         ":util",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "dispatch_test",
     srcs = ["util/dispatch_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":platform",
         ":util",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "keyword_args_test",
     srcs = ["util/keyword_args_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":util",
     ],
@@ -621,6 +651,7 @@ py_library(
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:core",
         "//tensorflow/python/eager:execute",
+        "//tensorflow/tools/docs:doc_controls",
     ],
 )
 
@@ -648,6 +679,8 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":common_shapes",
+        ":composite_tensor",
+        ":convert_to_constants",
         ":cpp_shape_inference_proto_py",
         ":errors",
         ":framework_fast_tensor_util",
@@ -731,6 +764,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":c_api_util",
+        ":error_interpolation",
         ":util",
     ],
 )
@@ -787,13 +821,11 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "function_def_to_graph_test",
     size = "small",
     srcs = ["framework/function_def_to_graph_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_pip"],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":constant_op",
@@ -805,6 +837,7 @@ py_test(
         ":math_ops",
         ":test_ops",
     ],
+    tags = ["no_pip"],
 )
 
 py_library(
@@ -823,6 +856,22 @@ py_library(
     ],
 )
 
+py_library(
+    name = "convert_to_constants",
+    srcs = [
+        "framework/convert_to_constants.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":dtypes",
+        ":framework_ops",
+        ":platform",
+        ":tensor_util",
+        ":tf_optimizer",
+        "//tensorflow/core:protos_all_py",
+    ],
+)
+
 py_library(
     name = "kernels",
     srcs = [
@@ -886,6 +935,24 @@ py_library(
     ],
 )
 
+py_library(
+    name = "map_fn",
+    srcs = ["ops/map_fn.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":array_ops",
+        ":constant_op",
+        ":control_flow_ops",
+        ":framework_ops",
+        ":sparse_tensor",
+        ":tensor_array_ops",
+        ":tensor_shape",
+        ":util",
+        ":variable_scope",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
 py_library(
     name = "func_graph",
     srcs = ["framework/func_graph.py"],
@@ -915,14 +982,14 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "auto_control_deps_test",
     size = "small",
     srcs = ["framework/auto_control_deps_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":auto_control_deps",
         ":client_testlib",
+        "//tensorflow/python/keras",
     ],
 )
 
@@ -955,12 +1022,11 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "smart_cond_test",
     size = "small",
     srcs = ["framework/smart_cond_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":constant_op",
         ":framework_ops",
@@ -974,6 +1040,18 @@ py_library(
     name = "sparse_tensor",
     srcs = ["framework/sparse_tensor.py"],
     srcs_version = "PY2AND3",
+    deps = [
+        ":composite_tensor",
+        ":dtypes",
+        ":framework_ops",
+        ":tensor_util",
+    ],
+)
+
+py_library(
+    name = "composite_tensor",
+    srcs = ["framework/composite_tensor.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":dtypes",
         ":framework_ops",
@@ -981,6 +1059,21 @@ py_library(
     ],
 )
 
+py_test(
+    name = "framework_composite_tensor_test",
+    srcs = ["framework/composite_tensor_test.py"],
+    main = "framework/composite_tensor_test.py",
+    srcs_version = "PY2AND3",
+    deps = [
+        ":composite_tensor",
+        ":framework",
+        ":framework_for_generated_wrappers",
+        ":framework_test_lib",
+        ":platform_test",
+        "//tensorflow/core:protos_all_py",
+    ],
+)
+
 # This target is maintained separately from :util to provide separate visibility
 # for legacy users who were granted visibility when the functions were private
 # members of ops.Graph.
@@ -1051,6 +1144,7 @@ py_library(
     name = "extra_py_tests_deps",
     srcs_version = "PY2AND3",
     deps = [
+        ":keras_lib",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -1084,6 +1178,14 @@ py_library(
     ],
 )
 
+# Including this as a dependency will result in tests using
+# :framework_test_lib to use XLA.
+py_library(
+    name = "is_xla_test_true",
+    srcs = ["framework/is_xla_test_true.py"],
+    srcs_version = "PY2AND3",
+)
+
 py_library(
     name = "distributed_framework_test_lib",
     srcs_version = "PY2AND3",
@@ -1106,52 +1208,48 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "framework_registry_test",
     size = "small",
     srcs = ["framework/registry_test.py"],
-    main = "framework/registry_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":framework_for_generated_wrappers",
+        "@absl_py//absl/testing:parameterized",
         "//tensorflow/python:client_testlib",
     ],
+    main = "framework/registry_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "framework_errors_test",
     size = "small",
     srcs = ["framework/errors_test.py"],
-    main = "framework/errors_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":errors",
         "//tensorflow/core:protos_all_py",
     ],
+    main = "framework/errors_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "framework_error_interpolation_test",
     size = "small",
     srcs = ["framework/error_interpolation_test.py"],
-    main = "framework/error_interpolation_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":constant_op",
         ":error_interpolation",
         ":traceable_stack",
     ],
+    main = "framework/error_interpolation_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "framework_subscribe_test",
     size = "small",
     srcs = ["framework/subscribe_test.py"],
-    main = "framework/subscribe_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":framework",
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
@@ -1160,50 +1258,48 @@ py_test(
         ":script_ops",
         ":subscribe",
     ],
+    main = "framework/subscribe_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "contrib_test",
     size = "small",
     srcs = ["framework/contrib_test.py"],
-    main = "framework/contrib_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         "//tensorflow:tensorflow_py",
         "//tensorflow/python:client_testlib",
     ],
+    main = "framework/contrib_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "build_info_test",
     size = "small",
     srcs = [
         "platform/build_info.py",
         "platform/build_info_test.py",
     ],
+    additional_deps = [
+        ":client_testlib",
+        ":platform",
+    ],
     main = "platform/build_info_test.py",
-    srcs_version = "PY2AND3",
     tags = [
         "no_pip",
         "notap",
     ],
-    deps = [
-        ":client_testlib",
-        ":platform",
-    ],
 )
 
-py_test(
+tf_py_test(
     name = "proto_test",
     size = "small",
     srcs = ["framework/proto_test.py"],
-    main = "framework/proto_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":framework_for_generated_wrappers",
         "//third_party/py/numpy",
     ],
+    main = "framework/proto_test.py",
 )
 
 tf_gen_op_wrapper_private_py(
@@ -1233,8 +1329,8 @@ py_library(
     ],
 )
 
-cuda_py_tests(
-    name = "framework_function_test",
+cuda_py_test(
+    name = "function_test",
     size = "medium",
     srcs = ["framework/function_test.py"],
     additional_deps = [
@@ -1264,27 +1360,25 @@ cuda_py_tests(
         "noasan",
         "optonly",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
-py_test(
+tf_py_test(
     name = "framework_versions_test",
     size = "small",
     srcs = ["framework/versions_test.py"],
-    main = "framework/versions_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":framework_for_generated_wrappers",
     ],
+    main = "framework/versions_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "framework_importer_test",
     size = "large",
     srcs = ["framework/importer_test.py"],
-    main = "framework/importer_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":framework",
@@ -1296,9 +1390,10 @@ py_test(
         ":random_ops",
         ":test_ops",
         ":variables",
-        "//tensorflow/core:protos_all_py",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
     ],
+    main = "framework/importer_test.py",
 )
 
 filegroup(
@@ -1309,18 +1404,11 @@ filegroup(
     visibility = ["//visibility:public"],
 )
 
-py_test(
+tf_py_test(
     name = "framework_meta_graph_test",
     size = "small",
     srcs = ["framework/meta_graph_test.py"],
-    data = ["//tensorflow/python:meta_graph_testdata"],
-    main = "framework/meta_graph_test.py",
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-        "no_windows",
-    ],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":control_flow_ops",
@@ -1335,21 +1423,26 @@ py_test(
         ":training",
         ":variables",
     ],
+    data = ["//tensorflow/python:meta_graph_testdata"],
+    main = "framework/meta_graph_test.py",
+    tags = [
+        "no_pip",
+        "no_windows",
+    ],
 )
 
-py_test(
+tf_py_test(
     name = "framework_traceable_stack_test",
     size = "small",
     srcs = ["framework/traceable_stack_test.py"],
-    main = "framework/traceable_stack_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":framework_test_lib",
         ":platform_test",
         ":test_ops",
         ":traceable_stack",
         ":util",
     ],
+    main = "framework/traceable_stack_test.py",
 )
 
 tf_gen_op_wrapper_py(
@@ -1384,29 +1477,25 @@ cc_library(
     alwayslink = 1,
 )
 
-py_test(
+tf_py_test(
     name = "framework_common_shapes_test",
     size = "small",
     srcs = ["framework/common_shapes_test.py"],
-    main = "framework/common_shapes_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":framework",
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
         ":platform_test",
         "//tensorflow/core:protos_all_py",
     ],
+    main = "framework/common_shapes_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "framework_ops_test",
     size = "small",
     srcs = ["framework/ops_test.py"],
-    main = "framework/ops_test.py",
-    srcs_version = "PY2AND3",
-    tags = ["no_pip"],  # test_ops_2 is not available in pip.
-    deps = [
+    additional_deps = [
         ":cond_v2",
         ":control_flow_ops",
         ":errors",
@@ -1427,114 +1516,106 @@ py_test(
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:function",
     ],
+    main = "framework/ops_test.py",
+    tags = ["no_pip"],  # test_ops_2 is not available in pip.
 )
 
-py_test(
+tf_py_test(
     name = "framework_ops_enable_eager_test",
     size = "small",
     srcs = ["framework/ops_enable_eager_test.py"],
-    main = "framework/ops_enable_eager_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":framework",
         ":platform_test",
         "//tensorflow/python/eager:context",
     ],
+    main = "framework/ops_enable_eager_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "framework_tensor_shape_test",
     size = "small",
     srcs = ["framework/tensor_shape_test.py"],
-    main = "framework/tensor_shape_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
         ":platform_test",
         "//tensorflow/core:protos_all_py",
     ],
+    main = "framework/tensor_shape_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "framework_tensor_spec_test",
     size = "small",
     srcs = ["framework/tensor_spec_test.py"],
-    main = "framework/tensor_spec_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
         ":platform_test",
         ":tensor_spec",
         "//third_party/py/numpy",
     ],
+    main = "framework/tensor_spec_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "framework_sparse_tensor_test",
     size = "small",
     srcs = ["framework/sparse_tensor_test.py"],
-    main = "framework/sparse_tensor_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":framework",
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
         ":platform_test",
         "//tensorflow/core:protos_all_py",
     ],
+    main = "framework/sparse_tensor_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "framework_device_test",
     size = "small",
     srcs = ["framework/device_test.py"],
-    main = "framework/device_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
         ":platform_test",
         "//tensorflow/core:protos_all_py",
     ],
+    main = "framework/device_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "framework_random_seed_test",
     size = "small",
     srcs = ["framework/random_seed_test.py"],
-    main = "framework/random_seed_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":framework",
     ],
+    main = "framework/random_seed_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "framework_tensor_shape_div_test",
     size = "small",
     srcs = ["framework/tensor_shape_div_test.py"],
-    main = "framework/tensor_shape_div_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
         ":platform_test",
-        "//tensorflow/core:protos_all_py",
         "@six_archive//:six",
+        "//tensorflow/core:protos_all_py",
     ],
+    main = "framework/tensor_shape_div_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "framework_tensor_util_test",
     size = "small",
     srcs = ["framework/tensor_util_test.py"],
-    main = "framework/tensor_util_test.py",
-    srcs_version = "PY2AND3",
-    tags = ["no_windows"],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":framework",
@@ -1544,16 +1625,15 @@ py_test(
         ":state_ops_gen",
         "//third_party/py/numpy",
     ],
+    main = "framework/tensor_util_test.py",
+    tags = ["no_windows"],
 )
 
-py_test(
+tf_py_test(
     name = "framework_test_util_test",
     size = "small",
     srcs = ["framework/test_util_test.py"],
-    main = "framework/test_util_test.py",
-    srcs_version = "PY2AND3",
-    tags = ["no_windows"],
-    deps = [
+    additional_deps = [
         ":control_flow_ops",
         ":errors",
         ":framework_for_generated_wrappers",
@@ -1564,35 +1644,35 @@ py_test(
         ":session",
         ":test_ops",
         ":variables",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/eager:context",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
     ],
+    main = "framework/test_util_test.py",
+    tags = ["no_windows"],
 )
 
-py_test(
+tf_py_test(
     name = "framework_dtypes_test",
     size = "small",
     srcs = ["framework/dtypes_test.py"],
-    main = "framework/dtypes_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
         ":platform_test",
+        "//third_party/py/numpy",
         "//tensorflow:tensorflow_py",
         "//tensorflow/core:protos_all_py",
-        "//third_party/py/numpy",
     ],
+    main = "framework/dtypes_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "op_def_library_test",
     size = "small",
     srcs = ["framework/op_def_library_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
         ":platform_test",
@@ -1600,18 +1680,17 @@ py_test(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "framework_kernels_test",
     size = "small",
     srcs = ["framework/kernels_test.py"],
-    main = "framework/kernels_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":framework_test_lib",
         ":kernels",
         ":platform_test",
         ":test_ops",
     ],
+    main = "framework/kernels_test.py",
 )
 
 tf_gen_op_wrapper_private_py(
@@ -1686,6 +1765,14 @@ tf_gen_op_wrapper_private_py(
     ],
 )
 
+tf_gen_op_wrapper_private_py(
+    name = "clustering_ops_gen",
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/core:clustering_ops_op_lib",
+    ],
+)
+
 tf_gen_op_wrapper_private_py(
     name = "collective_ops_gen",
     visibility = ["//tensorflow:internal"],
@@ -1746,7 +1833,7 @@ tf_gen_op_wrapper_private_py(
     visibility = [
         "//learning/brain/python/ops:__pkg__",
         "//tensorflow/python/kernel_tests:__pkg__",
-        "//tensorflow/python/training/checkpointable:__pkg__",
+        "//tensorflow/python/training/tracking:__pkg__",
     ],
 )
 
@@ -1821,6 +1908,11 @@ tf_gen_op_wrapper_private_py(
     visibility = ["//learning/brain/python/ops:__pkg__"],
 )
 
+tf_gen_op_wrapper_private_py(
+    name = "stateful_random_ops_gen",
+    visibility = ["//learning/brain/python/ops:__pkg__"],
+)
+
 tf_gen_op_wrapper_private_py(
     name = "resource_variable_ops_gen",
     visibility = [
@@ -1914,6 +2006,27 @@ tf_gen_op_wrapper_private_py(
     ],
 )
 
+tf_gen_op_wrapper_private_py(
+    name = "tpu_ops_gen",
+    visibility = [
+        "//smartass/brain/configure/python:__pkg__",
+        "//tensorflow/contrib/tpu:__pkg__",
+        "//tensorflow/python/tpu:__pkg__",
+    ],
+    deps = [
+        "//tensorflow/core:tpu_configuration_ops_op_lib",
+        "//tensorflow/core:tpu_cross_replica_ops_op_lib",
+        "//tensorflow/core:tpu_embedding_ops_op_lib",
+        "//tensorflow/core:tpu_functional_ops_op_lib",
+        "//tensorflow/core:tpu_heartbeat_ops_op_lib",
+        "//tensorflow/core:tpu_host_compute_ops_op_lib",
+        "//tensorflow/core:tpu_infeed_ops_op_lib",
+        "//tensorflow/core:tpu_ordinal_selector_ops_op_lib",
+        "//tensorflow/core:tpu_outfeed_ops_op_lib",
+        "//tensorflow/core:tpu_replication_ops_op_lib",
+    ],
+)
+
 py_library(
     name = "array_grad",
     srcs = ["ops/array_grad.py"],
@@ -2054,14 +2167,37 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "clip_ops_test",
     size = "small",
     srcs = ["ops/clip_ops_test.py"],
+    additional_deps = [
+        ":client_testlib",
+        ":clip_ops",
+        ":framework_for_generated_wrappers",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_library(
+    name = "clustering_ops",
+    srcs = ["ops/clustering_ops.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":clustering_ops_gen",
+        ":framework",
+        ":ops",
+        ":training",
+    ],
+)
+
+tf_py_test(
+    name = "clustering_ops_test",
+    size = "medium",
+    srcs = ["ops/clustering_ops_test.py"],
+    additional_deps = [
         ":client_testlib",
-        ":clip_ops",
+        ":clustering_ops",
         ":framework_for_generated_wrappers",
         "//third_party/py/numpy",
     ],
@@ -2077,12 +2213,11 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "collective_ops_test",
     size = "small",
     srcs = ["ops/collective_ops_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":collective_ops",
         ":framework_for_generated_wrappers",
@@ -2172,7 +2307,7 @@ py_library(
         ":function_def_to_graph",
         ":functional_ops_gen",
         ":gradients",
-        ":gradients_impl",
+        ":gradients_util",
         ":graph_to_function_def",
         ":pywrap_tensorflow",
         ":util",
@@ -2185,6 +2320,7 @@ py_library(
     name = "while_v2",
     srcs = [
         "ops/while_v2.py",
+        "ops/while_v2_indexed_slices_rewriter.py",
     ],
     srcs_version = "PY2AND3",
     deps = [
@@ -2197,7 +2333,7 @@ py_library(
         ":framework_ops",
         ":function_def_to_graph",
         ":functional_ops_gen",
-        ":gradients_impl",
+        ":gradients_util",
         ":list_ops",
         ":tensor_array_ops",
         ":tensor_shape",
@@ -2288,6 +2424,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":gradients_impl",
+        ":gradients_util",
         ":unconnected_gradients",
         "//tensorflow/python/eager:function",
         "//tensorflow/python/eager:tape",
@@ -2311,7 +2448,6 @@ py_library(
         ":framework",
         ":framework_for_generated_wrappers",
         ":framework_ops",
-        ":functional_ops",
         ":image_grad",
         ":linalg_grad",
         ":linalg_ops",
@@ -2323,15 +2459,34 @@ py_library(
         ":optional_grad",
         ":platform",
         ":random_grad",
-        ":resource_variable_ops",
         ":tensor_array_ops",
+        ":unconnected_gradients",
+        ":util",
+    ],
+)
+
+py_library(
+    name = "gradients_util",
+    srcs = [
+        "ops/gradients_util.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":array_ops",
+        ":control_flow_ops",
+        ":control_flow_util",
+        ":framework",
+        ":framework_for_generated_wrappers",
+        ":framework_ops",
+        ":functional_ops",
+        ":math_ops",
+        ":platform",
+        ":resource_variable_ops",
         ":tensor_util",
         ":unconnected_gradients",
         ":util",
-        ":variable_scope",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/eager:context",
-        "//tensorflow/python/eager:tape",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -2413,6 +2568,23 @@ py_library(
     ],
 )
 
+py_library(
+    name = "init_ops_v2",
+    srcs = ["ops/init_ops_v2.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":array_ops",
+        ":constant_op",
+        ":dtypes",
+        ":linalg_ops_gen",
+        ":linalg_ops_impl",
+        ":math_ops",
+        ":random_ops",
+        ":util",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_library(
     name = "initializers_ns",
     srcs = ["ops/initializers_ns.py"],
@@ -2456,9 +2628,9 @@ py_library(
         ":array_ops",
         ":dtypes",
         ":framework_ops",
-        ":functional_ops",
         ":linalg_ops_gen",
         ":linalg_ops_impl",
+        ":map_fn",
         ":math_ops",
         "//third_party/py/numpy",
     ],
@@ -2606,6 +2778,22 @@ py_library(
     ],
 )
 
+py_library(
+    name = "critical_section_ops",
+    srcs = ["ops/critical_section_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":array_ops",
+        ":control_flow_ops",
+        ":dtypes",
+        ":framework_ops",
+        ":resource_variable_ops_gen",
+        ":tensor_array_ops",
+        ":util",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
 py_library(
     name = "list_ops",
     srcs = ["ops/list_ops.py"],
@@ -2741,6 +2929,32 @@ py_library(
     ],
 )
 
+py_library(
+    name = "stateful_random_ops",
+    srcs = ["ops/stateful_random_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":dtypes",
+        ":framework_ops",
+        ":math_ops",
+        ":stateful_random_ops_gen",
+        ":variables",
+        "//third_party/py/numpy",
+    ],
+)
+
+cuda_py_test(
+    name = "stateful_random_ops_test",
+    size = "medium",
+    srcs = ["ops/stateful_random_ops_test.py"],
+    additional_deps = [
+        ":stateful_random_ops",
+        ":client_testlib",
+        ":logging_ops",
+        ":random_ops_gen",
+    ],
+)
+
 py_library(
     name = "stateless_random_ops",
     srcs = ["ops/stateless_random_ops.py"],
@@ -2860,11 +3074,10 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "sparse_ops_test",
     srcs = ["ops/sparse_ops_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":constant_op",
         ":dtypes",
         ":framework_test_lib",
@@ -2887,11 +3100,10 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "sort_ops_test",
     srcs = ["ops/sort_ops_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":framework",
@@ -2993,6 +3205,7 @@ py_library(
         ":clip_ops",
         ":confusion_matrix",
         ":control_flow_ops",
+        ":critical_section_ops",
         ":cudnn_rnn_grad",
         ":data_flow_grad",
         ":data_flow_ops",
@@ -3021,6 +3234,7 @@ py_library(
         ":special_math_ops",
         ":state_grad",
         ":state_ops",
+        ":stateful_random_ops",
         ":stateless_random_ops",
         ":string_ops",
         ":template",
@@ -3032,6 +3246,7 @@ py_library(
         "//tensorflow/python/eager:wrap_function",
         "//tensorflow/python/ops/distributions",
         "//tensorflow/python/ops/linalg",
+        "//tensorflow/python/ops/ragged",
     ],
 )
 
@@ -3175,7 +3390,7 @@ py_library(
         ":util",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/eager:context",
-        "//tensorflow/python/training/checkpointable:base",
+        "//tensorflow/python/training/tracking:base",
     ],
 )
 
@@ -3228,6 +3443,7 @@ cuda_py_test(
         ":framework_test_lib",
     ],
     tags = ["no_windows"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3304,7 +3520,6 @@ cuda_py_test(
         ":framework_test_lib",
         ":functional_ops",
         ":gradients",
-        ":layers",
         ":list_ops",
         ":math_grad",
         ":math_ops",
@@ -3347,6 +3562,7 @@ cuda_py_test(
         ":image_ops",
         "//third_party/py/numpy",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3372,6 +3588,7 @@ cuda_py_test(
     ],
     data = ["//tensorflow/core:image_testdata"],
     shard_count = 5,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3388,6 +3605,21 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "init_ops_v2_test",
+    size = "medium",
+    srcs = ["ops/init_ops_v2_test.py"],
+    additional_deps = [
+        ":array_ops",
+        ":client_testlib",
+        ":init_ops_v2",
+        ":random_ops",
+        ":framework_ops",
+        "//third_party/py/numpy",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
 cuda_py_test(
     name = "math_grad_test",
     size = "small",
@@ -3444,7 +3676,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "nn_fused_batchnorm_test",
-    size = "large",
+    size = "medium",
     srcs = ["ops/nn_fused_batchnorm_test.py"],
     additional_deps = [
         ":array_ops",
@@ -3456,6 +3688,7 @@ cuda_py_test(
         "//third_party/py/numpy",
     ],
     shard_count = 16,
+    tags = ["no_rocm"],
 )
 
 cuda_py_test(
@@ -3476,6 +3709,7 @@ cuda_py_test(
         "//third_party/py/numpy",
     ],
     tags = ["no_windows"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3506,6 +3740,7 @@ cuda_py_test(
         "//third_party/py/numpy",
     ],
     tags = ["no_windows_gpu"],
+    xla_enable_strict_auto_jit = True,
 )
 
 py_library(
@@ -3514,7 +3749,7 @@ py_library(
         ["training/**/*.py"],
         exclude = [
             "**/*test*",
-            "training/checkpointable/**/*.py",
+            "training/tracking/**/*.py",
             "training/saving/**/*.py",
             # The following targets have their own build rules (same name as the
             # file):
@@ -3546,7 +3781,7 @@ py_library(
         ":gradients",
         ":init_ops",
         ":io_ops",
-        ":layers_base",
+        ":layers_util",
         ":lookup_ops",
         ":math_ops",
         ":platform",
@@ -3567,19 +3802,18 @@ py_library(
         ":util",
         ":variable_scope",
         ":variables",
-        "//third_party/py/numpy",
-        "@six_archive//:six",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/distribute:reduce_util",
         "//tensorflow/python/distribute:distribute_coordinator_context",
+        "//tensorflow/python/distribute:reduce_util",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
-        # `layers` dependency only exists due to the use of a small utility.
-        "//tensorflow/python/keras:layers",
+        "//tensorflow/python/keras/optimizer_v2:learning_rate_schedule",
         "//tensorflow/python/ops/losses",
-        "//tensorflow/python/training/checkpointable:base",
-        "//tensorflow/python/training/checkpointable:util",
+        "//tensorflow/python/training/tracking:base",
+        "//tensorflow/python/training/tracking:util",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
     ],
 )
 
@@ -3657,9 +3891,9 @@ py_library(
         ":variables",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/eager:context",
-        "//tensorflow/python/training/checkpointable:base",
         "//tensorflow/python/training/saving:saveable_object",
         "//tensorflow/python/training/saving:saveable_object_util",
+        "//tensorflow/python/training/tracking:base",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -3677,24 +3911,17 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "evaluation_test",
     size = "small",
     srcs = ["training/evaluation_test.py"],
-    shard_count = 3,
-    srcs_version = "PY2AND3",
-    tags = [
-        "manual",
-        "notap",  # Disabling until b/33000128 and b/33040312 are fixed.
-    ],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client",
         ":client_testlib",
         ":framework",
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
-        ":layers",
         ":math_ops",
         ":metrics",
         ":platform",
@@ -3702,11 +3929,16 @@ py_test(
         ":summary",
         ":training",
         ":variables",
+        "//third_party/py/numpy",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/ops/losses",
-        "//third_party/py/numpy",
     ],
-)
+    shard_count = 3,
+    tags = [
+        "manual",
+        "notap",  # Disabling until b/33000128 and b/33040312 are fixed.
+    ],
+)
 
 py_library(
     name = "client",
@@ -3742,6 +3974,7 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = visibility + [
         "//tensorflow:__pkg__",
+        "//third_party/py/tf_agents:__subpackages__",
     ],
     deps = [
         "//third_party/py/numpy",
@@ -3752,76 +3985,68 @@ py_library(
 )
 
 # Placeholder for intenal nest_test comments.
-py_test(
+tf_py_test(
     name = "util_nest_test",
     size = "small",
     srcs = ["util/nest_test.py"],
-    main = "util/nest_test.py",
-    srcs_version = "PY2AND3",
-    visibility = visibility + [
-    ],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":framework",
         ":framework_for_generated_wrappers",
         ":math_ops",
         ":util",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
     ],
+    main = "util/nest_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "util_serialization_test",
     size = "small",
     srcs = ["util/serialization_test.py"],
-    main = "util/serialization_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":util",
     ],
+    main = "util/serialization_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "future_api_test",
     size = "small",
     srcs = ["util/future_api_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":util",
         "//tensorflow:tensorflow_py",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "function_utils_test",
     srcs = ["util/function_utils_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":util",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "tf_contextlib_test",
     size = "small",
     srcs = ["util/tf_contextlib_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":util",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "tf_decorator_test",
     size = "small",
     srcs = ["util/tf_decorator_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":util",
     ],
@@ -3839,23 +4064,21 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "tf_should_use_test",
     size = "small",
     srcs = ["util/tf_should_use_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":tf_should_use",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "tf_inspect_test",
     size = "small",
     srcs = ["util/tf_inspect_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":util",
     ],
@@ -3873,17 +4096,16 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "lock_util_test",
     size = "small",
     srcs = ["util/lock_util_test.py"],
-    main = "util/lock_util_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":util",
         "@absl_py//absl/testing:parameterized",
     ],
+    main = "util/lock_util_test.py",
 )
 
 tf_proto_library(
@@ -3912,28 +4134,25 @@ tf_proto_library(
     visibility = ["//tensorflow:internal"],
 )
 
-py_test(
+tf_py_test(
     name = "protobuf_compare_test",
     size = "small",
     srcs = ["util/protobuf/compare_test.py"],
-    main = "util/protobuf/compare_test.py",
-    srcs_version = "PY2AND3",
-    tags = ["no_pip"],  # compare_test_pb2 proto is not available in pip.
-    deps = [
+    additional_deps = [
         ":compare_test_proto_py",
         ":platform_test",
         ":util",
         "@six_archive//:six",
     ],
+    main = "util/protobuf/compare_test.py",
+    tags = ["no_pip"],  # compare_test_pb2 proto is not available in pip.
 )
 
-py_test(
+tf_py_test(
     name = "util_example_parser_configuration_test",
     size = "small",
     srcs = ["util/example_parser_configuration_test.py"],
-    main = "util/example_parser_configuration_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client",
         ":client_testlib",
@@ -3941,14 +4160,14 @@ py_test(
         ":parsing_ops",
         ":util_example_parser_configuration",
     ],
+    main = "util/example_parser_configuration_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "events_writer_test",
     size = "small",
     srcs = ["client/events_writer_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":errors",
         ":framework_test_lib",
         ":lib",
@@ -4109,10 +4328,12 @@ tf_py_wrap_cc(
         ":tf_session_helper",
         "//third_party/python_runtime:headers",
         "//tensorflow/c:c_api",
+        "//tensorflow/c:c_api_experimental",
         "//tensorflow/c:checkpoint_reader",
         "//tensorflow/c:python_api",
         "//tensorflow/c:tf_status_helper",
         "//tensorflow/c/eager:c_api",
+        "//tensorflow/c/eager:c_api_experimental",
         "//tensorflow/core/distributed_runtime/rpc:grpc_rpc_factory_registration",
         "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib",
         "//tensorflow/core/distributed_runtime/rpc:grpc_session",
@@ -4546,6 +4767,7 @@ cuda_py_test(
         ":math_ops",
         "//tensorflow/core:protos_all_py",
     ],
+    xla_enable_strict_auto_jit = False,  # Graph structure is different with autojit
 )
 
 cuda_py_test(
@@ -4562,24 +4784,22 @@ cuda_py_test(
     tags = ["no_windows_gpu"],
 )
 
-py_test(
+tf_py_test(
     name = "c_api_util_test",
     size = "small",
     srcs = ["framework/c_api_util_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":c_api_util",
         ":framework_test_lib",
         ":platform_test",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "graph_util_test",
     size = "small",
     srcs = ["framework/graph_util_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client",
         ":client_testlib",
         ":framework",
@@ -4592,37 +4812,45 @@ py_test(
     ],
 )
 
-py_test(
+tf_py_test(
+    name = "convert_to_constants_test",
+    size = "small",
+    srcs = ["framework/convert_to_constants_test.py"],
+    additional_deps = [
+        ":convert_to_constants",
+        "client_testlib",
+        "framework_test_lib",
+    ],
+)
+
+tf_py_test(
     name = "bfloat16_test",
     size = "small",
     srcs = ["lib/core/bfloat16_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":lib",
         ":pywrap_tensorflow",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "file_io_test",
     size = "small",
     srcs = ["lib/io/file_io_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_windows"],
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":errors",
         ":lib",
     ],
+    tags = ["no_windows"],
 )
 
-py_test(
+tf_py_test(
     name = "tf_record_test",
     size = "small",
     srcs = ["lib/io/tf_record_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":errors",
         ":lib",
@@ -4679,7 +4907,6 @@ cuda_py_tests(
         "training/ftrl_test.py",
         "training/gradient_descent_test.py",
         "training/learning_rate_decay_test.py",
-        "training/learning_rate_decay_v2_test.py",
         "training/momentum_test.py",
         "training/optimizer_test.py",
         "training/proximal_adagrad_test.py",
@@ -4812,17 +5039,11 @@ cuda_py_test(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "saver_large_variable_test",
     size = "medium",
     srcs = ["training/saver_large_variable_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "manual",
-        "noasan",  # http://b/30379628
-        "notsan",  # http://b/30379628
-    ],
-    deps = [
+    additional_deps = [
         ":client",
         ":client_testlib",
         ":errors",
@@ -4831,18 +5052,18 @@ py_test(
         ":variables",
         "//tensorflow/core:protos_all_py",
     ],
+    tags = [
+        "manual",
+        "noasan",  # http://b/30379628
+        "notsan",  # http://b/30379628
+    ],
 )
 
-py_test(
+tf_py_test(
     name = "saver_large_partitioned_variable_test",
     size = "medium",
     srcs = ["training/saver_large_partitioned_variable_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "noasan",  # http://b/30782289
-        "notsan",  # http://b/30782289
-    ],
-    deps = [
+    additional_deps = [
         ":client",
         ":client_testlib",
         ":framework_for_generated_wrappers",
@@ -4850,6 +5071,10 @@ py_test(
         ":training",
         ":variables",
     ],
+    tags = [
+        "noasan",  # http://b/30782289
+        "notsan",  # http://b/30782289
+    ],
 )
 
 cuda_py_test(
@@ -4895,16 +5120,11 @@ tf_py_test(
     tags = ["no_windows"],
 )
 
-py_test(
+tf_py_test(
     name = "basic_session_run_hooks_test",
     size = "medium",
     srcs = ["training/basic_session_run_hooks_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_windows",
-        "notsan",  # intermittent races on a few percent of runs
-    ],
-    deps = [
+    additional_deps = [
         ":client",
         ":client_testlib",
         ":control_flow_ops",
@@ -4921,21 +5141,17 @@ py_test(
         "//tensorflow/contrib/testing:testing_py",
         "//tensorflow/core:protos_all_py",
     ],
+    tags = [
+        "no_windows",
+        "notsan",  # intermittent races on a few percent of runs
+    ],
 )
 
-py_test(
+tf_py_test(
     name = "checkpoint_utils_test",
     size = "small",
     srcs = ["training/checkpoint_utils_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "manual",
-        "no_cuda_on_cpu_tap",
-        "no_oss",
-        "no_windows",
-        "notap",
-    ],
-    deps = [
+    additional_deps = [
         ":client",
         ":client_testlib",
         ":framework_for_generated_wrappers",
@@ -4949,14 +5165,20 @@ py_test(
         ":variable_scope",
         ":variables",
     ],
+    tags = [
+        "manual",
+        "no_cuda_on_cpu_tap",
+        "no_oss",
+        "no_windows",
+        "notap",
+    ],
 )
 
-py_test(
+tf_py_test(
     name = "checkpoint_ops_test",
     size = "small",
     srcs = ["training/checkpoint_ops_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":checkpoint_ops_gen",
         ":client",
         ":client_testlib",
@@ -4972,12 +5194,11 @@ py_test(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "warm_starting_util_test",
     size = "medium",
     srcs = ["training/warm_starting_util_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":dtypes",
@@ -4986,21 +5207,15 @@ py_test(
         ":training",
         ":variable_scope",
         ":variables",
-        "//tensorflow/python/feature_column:feature_column_py",
         "//third_party/py/numpy",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "monitored_session_test",
     size = "medium",
     srcs = ["training/monitored_session_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-        "notsan",  # b/67945581
-    ],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":checkpoint_management",
         ":client_testlib",
@@ -5019,6 +5234,10 @@ py_test(
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/distribute:distribute_coordinator",
     ],
+    tags = [
+        "no_pip",
+        "notsan",  # b/67945581
+    ],
 )
 
 py_library(
@@ -5040,12 +5259,11 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "training_util_test",
     size = "small",
     srcs = ["training/training_util_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":framework",
         ":platform",
@@ -5142,13 +5360,13 @@ py_library(
     srcs = [
         "layers/__init__.py",
         "layers/base.py",
-        "layers/utils.py",
     ],
     srcs_version = "PY2AND3",
     deps = [
         ":array_ops",
         ":control_flow_ops",
         ":framework_for_generated_wrappers",
+        ":layers_util",
         ":platform",
         ":smart_cond",
         ":tensor_util",
@@ -5161,6 +5379,20 @@ py_library(
     ],
 )
 
+py_library(
+    name = "layers_util",
+    srcs = [
+        "layers/utils.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":control_flow_ops",
+        ":smart_cond",
+        ":util",
+        ":variables",
+    ],
+)
+
 py_library(
     name = "layers",
     srcs = [
@@ -5198,13 +5430,11 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "layers_base_test",
     size = "small",
     srcs = ["layers/base_test.py"],
-    main = "layers/base_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":framework_for_generated_wrappers",
@@ -5217,15 +5447,14 @@ py_test(
         ":variable_scope",
         "//tensorflow/python/eager:context",
     ],
+    main = "layers/base_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "layers_core_test",
     size = "small",
     srcs = ["layers/core_test.py"],
-    main = "layers/core_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":framework_for_generated_wrappers",
@@ -5238,15 +5467,14 @@ py_test(
         ":variables",
         "//third_party/py/numpy",
     ],
+    main = "layers/core_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "layers_convolutional_test",
     size = "small",
     srcs = ["layers/convolutional_test.py"],
-    main = "layers/convolutional_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
@@ -5255,32 +5483,31 @@ py_test(
         ":nn_ops",
         ":random_ops",
     ],
+    main = "layers/convolutional_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "layers_utils_test",
     size = "small",
     srcs = ["layers/utils_test.py"],
-    main = "layers/utils_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":layers",
     ],
+    main = "layers/utils_test.py",
 )
 
-py_test(
+tf_py_test(
     name = "layers_pooling_test",
     size = "small",
     srcs = ["layers/pooling_test.py"],
-    main = "layers/pooling_test.py",
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":framework_test_lib",
         ":layers",
         ":random_ops",
     ],
+    main = "layers/pooling_test.py",
 )
 
 cuda_py_test(
@@ -5305,37 +5532,48 @@ cuda_py_test(
 # -----------------------------------------------------------------------------
 # Quantization
 
-py_test(
+tf_py_test(
     name = "dequantize_op_test",
     size = "small",
     srcs = ["ops/dequantize_op_test.py"],
-    srcs_version = "PY2AND3",
+    additional_deps = [
+        ":array_ops",
+        ":client_testlib",
+        ":framework_for_generated_wrappers",
+        "//third_party/py/numpy",
+    ],
     tags = ["no_windows"],
-    deps = [
+)
+
+tf_py_test(
+    name = "quantized_ops_test",
+    size = "small",
+    srcs = ["ops/quantized_ops_test.py"],
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":framework_for_generated_wrappers",
         "//third_party/py/numpy",
     ],
+    tags = ["no_windows"],
 )
 
-py_test(
+tf_py_test(
     name = "quantized_conv_ops_test",
     size = "small",
     srcs = ["ops/quantized_conv_ops_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["no_windows"],
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":framework_for_generated_wrappers",
         ":nn_ops",
         "//third_party/py/numpy",
     ],
+    tags = ["no_windows"],
 )
 
 cuda_py_test(
     name = "accumulate_n_benchmark",
-    size = "large",
+    size = "medium",
     srcs = ["ops/accumulate_n_benchmark.py"],
     additional_deps = [
         ":array_ops",
@@ -5350,6 +5588,7 @@ cuda_py_test(
         ":state_ops_gen",
     ],
     main = "ops/accumulate_n_benchmark.py",
+    shard_count = 6,
 )
 
 cuda_py_test(
@@ -5545,38 +5784,32 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "item_test",
     size = "small",
     srcs = [
         "grappler/item_test.py",
     ],
-    srcs_version = "PY2AND3",
-    tags = [
-        "grappler",
-        "no_pip",  # tf_optimizer is not available in pip.
-    ],
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":framework_for_generated_wrappers",
         ":math_ops",
         ":tf_item",
         "//tensorflow/core:protos_all_py",
     ],
+    tags = [
+        "grappler",
+        "no_pip",  # tf_optimizer is not available in pip.
+    ],
 )
 
-py_test(
+tf_py_test(
     name = "datasets_test",
     size = "small",
     srcs = [
         "grappler/datasets_test.py",
     ],
-    srcs_version = "PY2AND3",
-    tags = [
-        "grappler",
-        "no_pip",  # tf_optimizer is not available in pip.
-    ],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":framework_for_generated_wrappers",
@@ -5584,6 +5817,10 @@ py_test(
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/data",
     ],
+    tags = [
+        "grappler",
+        "no_pip",  # tf_optimizer is not available in pip.
+    ],
 )
 
 py_library(
@@ -5617,6 +5854,8 @@ cuda_py_test(
         "grappler",
         "no_pip",  # tf_optimizer is not available in pip.
     ],
+    # This test will not run on XLA because it primarily tests the TF Classic flow.
+    xla_enable_strict_auto_jit = False,
 )
 
 py_library(
@@ -5632,25 +5871,24 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "tf_optimizer_test",
     size = "small",
     srcs = [
         "grappler/tf_optimizer_test.py",
     ],
-    srcs_version = "PY2AND3",
-    tags = [
-        "grappler",
-        "no_pip",  # tf_optimizer is not available in pip.
-    ],
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":framework_for_generated_wrappers",
         ":math_ops",
         ":tf_item",
         ":tf_optimizer",
-        "//tensorflow/core:protos_all_py",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
+    ],
+    tags = [
+        "grappler",
+        "no_pip",  # tf_optimizer is not available in pip.
     ],
 )
 
@@ -5667,32 +5905,28 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "graph_placer_test",
     size = "large",
     srcs = ["grappler/graph_placer_test.py"],
-    tags = [
-        "grappler",
-        "no_pip",  # graph_placer is not available in pip.
-    ],
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":graph_placer",
         "//tensorflow/python:math_ops",
     ],
+    tags = [
+        "grappler",
+        "no_pip",  # graph_placer is not available in pip.
+    ],
 )
 
-py_test(
+tf_py_test(
     name = "memory_optimizer_test",
     size = "medium",
     srcs = [
         "grappler/memory_optimizer_test.py",
     ],
-    srcs_version = "PY2AND3",
-    tags = [
-        "grappler",
-    ],
-    deps = [
+    additional_deps = [
         ":client_testlib",
         ":framework_for_generated_wrappers",
         ":math_ops",
@@ -5703,8 +5937,11 @@ py_test(
         ":training",
         ":variable_scope",
         ":variables",
-        "//tensorflow/core:protos_all_py",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
+    ],
+    tags = [
+        "grappler",
     ],
 )
 
@@ -5744,7 +5981,6 @@ cuda_py_test(
         ":constant_op",
         ":dtypes",
         ":functional_ops",
-        ":layers",
         ":math_ops",
         ":nn",
         ":ops",
@@ -5760,6 +5996,8 @@ cuda_py_test(
     tags = [
         "grappler",
     ],
+    # This test will not run on XLA because it primarily tests the TF Classic flow.
+    xla_enable_strict_auto_jit = False,
 )
 
 py_library(
@@ -5789,17 +6027,11 @@ py_binary(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "cost_analyzer_test",
     size = "small",
     srcs = ["grappler/cost_analyzer_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "grappler",
-        "no_cuda_on_cpu_tap",
-        "no_pip",
-    ],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":cost_analyzer",
@@ -5811,8 +6043,13 @@ py_test(
         ":state_ops",
         ":training",
         ":variables",
-        "//tensorflow/core:protos_all_py",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
+    ],
+    tags = [
+        "grappler",
+        "no_cuda_on_cpu_tap",
+        "no_pip",
     ],
 )
 
@@ -5825,24 +6062,23 @@ py_library(
     deps = [":pywrap_tensorflow_internal"],
 )
 
-py_test(
+tf_py_test(
     name = "model_analyzer_test",
     size = "small",
     srcs = ["grappler/model_analyzer_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "grappler",
-        "no_pip",
-    ],
-    deps = [
+    additional_deps = [
         ":array_ops",
         ":client_testlib",
         ":framework_for_generated_wrappers",
         ":math_ops",
         ":model_analyzer",
         ":state_ops",
-        "//tensorflow/core:protos_all_py",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
+    ],
+    tags = [
+        "grappler",
+        "no_pip",
     ],
 )
 
@@ -5864,6 +6100,8 @@ py_library(
     deps = [
         ":framework_for_generated_wrappers",
         ":nccl_ops_gen",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:def_function",
     ],
 )
 
@@ -5912,3 +6150,11 @@ py_library(
     srcs = ["tf2.py"],
     srcs_version = "PY2AND3",
 )
+
+cuda_py_test(
+    name = "raw_ops_test",
+    srcs = ["ops/raw_ops_test.py"],
+    additional_deps = [
+        "//tensorflow/python:client_testlib",
+    ],
+)
diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py
index b2cc63bd1320700801d4aaf0a9b33c8da7821412..8538f8c5dc1d30ce10516effbb994830049630c3 100644
--- a/tensorflow/python/__init__.py
+++ b/tensorflow/python/__init__.py
@@ -82,7 +82,9 @@ from tensorflow.python import distribute
 from tensorflow.python import keras
 from tensorflow.python.feature_column import feature_column_lib as feature_column
 from tensorflow.python.layers import layers
+from tensorflow.python.module import module
 from tensorflow.python.ops import bitwise_ops as bitwise
+from tensorflow.python.ops import gradient_checker_v2
 from tensorflow.python.ops import image_ops as image
 from tensorflow.python.ops import manip_ops as manip
 from tensorflow.python.ops import metrics
@@ -99,6 +101,9 @@ from tensorflow.python.summary import summary
 from tensorflow.python.user_ops import user_ops
 from tensorflow.python.util import compat
 
+# Import audio ops to make sure the ops are registered.
+from tensorflow.python.ops import gen_audio_ops as _
+
 # Import boosted trees ops to make sure the ops are registered (but unused).
 from tensorflow.python.ops import gen_boosted_trees_ops as _gen_boosted_trees_ops
 
@@ -121,11 +126,14 @@ from tensorflow.python.platform import resource_loader
 from tensorflow.python.platform import sysconfig
 from tensorflow.python.platform import test
 
+from tensorflow.python.compat import v2_compat
+
 from tensorflow.python.util.all_util import make_all
 from tensorflow.python.util.tf_export import tf_export
 
 # Eager execution
 from tensorflow.python.eager.context import executing_eagerly
+from tensorflow.python.eager.remote import connect_to_remote_host
 from tensorflow.python.eager.def_function import function
 from tensorflow.python.framework.ops import enable_eager_execution
 
@@ -147,7 +155,7 @@ nn.rnn_cell = rnn_cell
 # pylint: disable=undefined-variable
 tf_export(v1=['AttrValue'])(AttrValue)
 tf_export(v1=['ConfigProto'])(ConfigProto)
-tf_export('Event', 'summary.Event')(Event)
+tf_export(v1=['Event', 'summary.Event'])(Event)
 tf_export(v1=['GPUOptions'])(GPUOptions)
 tf_export(v1=['GraphDef'])(GraphDef)
 tf_export(v1=['GraphOptions'])(GraphOptions)
@@ -160,10 +168,10 @@ tf_export(v1=['OptimizerOptions'])(OptimizerOptions)
 tf_export(v1=['RunMetadata'])(RunMetadata)
 tf_export(v1=['RunOptions'])(RunOptions)
 tf_export(v1=['SessionLog', 'summary.SessionLog'])(SessionLog)
-tf_export('Summary', 'summary.Summary')(Summary)
-tf_export('summary.SummaryDescription')(SummaryDescription)
-tf_export('SummaryMetadata')(SummaryMetadata)
-tf_export('summary.TaggedRunMetadata')(TaggedRunMetadata)
+tf_export(v1=['Summary', 'summary.Summary'])(Summary)
+tf_export(v1=['summary.SummaryDescription'])(SummaryDescription)
+tf_export(v1=['SummaryMetadata'])(SummaryMetadata)
+tf_export(v1=['summary.TaggedRunMetadata'])(TaggedRunMetadata)
 tf_export(v1=['TensorInfo'])(TensorInfo)
 # pylint: enable=undefined-variable
 
diff --git a/tensorflow/python/autograph/LIMITATIONS.md b/tensorflow/python/autograph/LIMITATIONS.md
index d8b1cb7616ac348981bf2b69d6e2fd8d8a6e6b78..b4e4ca661ad7a4c6d69019ce56a0832fd1cbb03f 100644
--- a/tensorflow/python/autograph/LIMITATIONS.md
+++ b/tensorflow/python/autograph/LIMITATIONS.md
@@ -8,39 +8,39 @@ Python is a large language, so hoping to convert arbitrary Python code directly
 
 Note: as more complex features in TensorFlow are made more accessible using AutoGraph, we expect to come across use cases that haven't been tried before, some of which might reveal rare bugs. If we do find any such bugs, we may add additional restrictions for the affected configurations, until those bugs are resolved.
 
- Construct | Supported now? | Plan to support? | Notes
- :--------- | :--------------: | :----------------: | :-----
-If statement | Yes |  | Converts to `tf.cond`. If variables are created in one branch that don’t exist in another, which is inexpressible in TF, we throw a clear error.
-For statement | Yes | | We will specialize `for` loops with unknown and known lengths, as well as for loops over TF datasets. Converts to `tf.while_loop`, with an additional `maximum_iterations` hint, if that is known. Creating variables inside the loop that are used later outside the loop is not supported, as the loop may have no iterations.
-While statement | Yes | | Converts to `tf.while_loop`. Creating variables inside the loop is not supported, as the loop may have no iterations.
-Continue and break | Yes | | Converts to boolean flags and extra predicates in loop tests.
-Composition of control flow | Yes | | Arbitrary composition of `if`, `while`, `for`, `break`, and `continue`, along with other supported language elements, is supported and tested.
-Iterators | Some | Yes | Not all iterators supported, but we plan to support everything that can be desugared, such as `enumerate` and `zip`.
-Multiple return values | Yes | | We desugar them into variables, boolean flags and conditionals so that the function has a single return value at the end, and provide a clear error if we are unable to do so.
-Print expression | Yes | | Wrapped in `PyFunc`, and given proper control dependencies. Optional support for using tf.Log when py_func is undesirable exists.
-Static function calls | Yes | | Non-recursive function calls
-Nested call trees | Yes | | For example, `f` calls `g` which calls `h`, all of which need conversion.
-Recursive function calls | No | Maybe | Based on available support in TF. Currently `function.Defun` is the best candidate, but it is not reentrant.
-Python built-ins | Some | Yes | `print`, `len`, `range`, `xrange`, `int`, `float` are supported, and we plan to support or clearly error on all [Python built-ins](https://docs.python.org/3/library/functions.html).
-List operations | Yes | | We convert list creation, append, pop and indexing to their TF TensorArray equivalents. However, we do need some extra type hints to fully convert correctly. We hope to remove this limitation.
-Function variables | Yes | | e.g. `f_new = f_orig; f_new()`
-Lambda functions | No | Yes | Planned feature.
-Classes | Yes | | Classes can be converted all at once, or method-by-method. Some limitations exist around static and class methods.
-Subclasses | Yes | | Subclassing library objects like tf.keras.Model is also supported.
-Dynamic types | Some | | `o = C1() if foo else C2(); o.bar()`. Some scenarios where types are data-dependent may not be supported. We will raise a meaningful error in that case.
-Dynamic code / exec | No | |
-Reflection | No | |
-Try / Except | No | No | No current sane TF equivalent.
-Global variables | Restricted | | In general, we only support read-only access to arguments or variables defined outside the converted code. A few exceptions include TensorFlow library code.
-Functions with side effects | Some | | Side effects are allowed, under certain circumstances.
-Collections | Some | Yes | We currently support lists. There are currently no TF equivalents of dictionaries or tuples.
-List Comprehensions | Yes | | We desugar `ListComp` into the appropriate combination of `For` and `If` statements. Other comprehensions are currently very low priority.
-Custom context managers | No | Yes | Currently low priority. Left unconverted currently.
-Generators | No | Maybe | Could be achievable using queues; very low priority.
-Assertions | Yes | | As `tf.Assert`
-Deletion | Yes | Maybe | Currently unconverted. If new semanti cs are required for `del`, we are able to add it in.
-Inline imports | No | Yes | For example, `import numpy as np; np.eye(3)`. Currently low priority.
-Async | No | No |
+Construct                   | Supported now? | Plan to support? | Notes
+:-------------------------- | :------------: | :--------------: | :----
+If statement                | Yes            |                  | Converts to `tf.cond`. If variables are created in one branch that don’t exist in another, which is inexpressible in TF, we throw a clear error.
+For statement               | Yes            |                  | We will specialize `for` loops with unknown and known lengths, as well as for loops over TF datasets. Converts to `tf.while_loop`, with an additional `maximum_iterations` hint, if that is known. Creating variables inside the loop that are used later outside the loop is not supported, as the loop may have no iterations.
+While statement             | Yes            |                  | Converts to `tf.while_loop`. Creating variables inside the loop is not supported, as the loop may have no iterations.
+Continue and break          | Yes            |                  | Converts to boolean flags and extra predicates in loop tests.
+Composition of control flow | Yes            |                  | Arbitrary composition of `if`, `while`, `for`, `break`, and `continue`, along with other supported language elements, is supported and tested.
+Iterators                   | Some           | Yes              | Not all iterators supported, but we plan to support everything that can be desugared, such as `enumerate` and `zip`.
+Multiple return values      | Yes            |                  | We desugar them into variables, boolean flags and conditionals so that the function has a single return value at the end, and provide a clear error if we are unable to do so.
+Print expression            | Yes            |                  | Wrapped in `PyFunc`, and given proper control dependencies. Optional support for using tf.Log when py_func is undesirable exists.
+Static function calls       | Yes            |                  | Non-recursive function calls
+Nested call trees           | Yes            |                  | For example, `f` calls `g` which calls `h`, all of which need conversion.
+Recursive function calls    | No             | Maybe            | Based on available support in TF. Currently `function.Defun` is the best candidate, but it is not reentrant.
+Python built-ins            | Some           | Yes              | `print`, `len`, `range`, `xrange`, `int`, `float` are supported, and we plan to support or clearly error on all [Python built-ins](https://docs.python.org/3/library/functions.html).
+List operations             | Yes            |                  | We convert list creation, append, pop and indexing to their TF TensorArray equivalents. However, we do need some extra type hints to fully convert correctly. We hope to remove this limitation.
+Function variables          | Yes            |                  | e.g. `f_new = f_orig; f_new()`
+Lambda functions            | No             | Yes              | Planned feature.
+Classes                     | Yes            |                  | Classes can be converted all at once, or method-by-method. Some limitations exist around static and class methods.
+Subclasses                  | Yes            |                  | Subclassing library objects like tf.keras.Model is also supported.
+Dynamic types               | Some           |                  | `o = C1() if foo else C2(); o.bar()`. Some scenarios where types are data-dependent may not be supported. We will raise a meaningful error in that case.
+Dynamic code / exec         | No             |                  |
+Reflection                  | No             |                  |
+Try / Except                | No             | No               | No current sane TF equivalent.
+Global variables            | Restricted     |                  | In general, we only support read-only access to arguments or variables defined outside the converted code. A few exceptions include TensorFlow library code.
+Functions with side effects | Some           |                  | Side effects are allowed, under certain circumstances.
+Collections                 | Some           | Yes              | We currently support lists. There are currently no TF equivalents of dictionaries or tuples.
+List Comprehensions         | Yes            |                  | We desugar `ListComp` into the appropriate combination of `For` and `If` statements. Other comprehensions are currently very low priority.
+Custom context managers     | No             | Yes              | Currently low priority. Left unconverted currently.
+Generators                  | No             | Maybe            | Could be achievable using queues; very low priority.
+Assertions                  | Yes            |                  | As `tf.Assert`
+Deletion                    | Yes            | Maybe            | Currently unconverted. If new semantics are required for `del`, we are able to add it in.
+Inline imports              | No             | Yes              | For example, `import numpy as np; np.eye(3)`. Currently low priority.
+Async                       | No             | No               |
 
 ## Extra capabilities
 
diff --git a/tensorflow/python/autograph/__init__.py b/tensorflow/python/autograph/__init__.py
index 7252e0d9bf92e430e224fe00d9a9a5ff4254b46f..3009cfffd4510e3d236edc31c70590fc00023235 100644
--- a/tensorflow/python/autograph/__init__.py
+++ b/tensorflow/python/autograph/__init__.py
@@ -12,10 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Autograph compiles Python code into equivalent TensorFlow code.
+"""Conversion of plain Python into TensorFlow graph code.
 
-Equivalent here means that they have the same effect when executed.
+NOTE: In TensorFlow 2.0, AutoGraph is automatically applied when using
+`tf.function`. This module contains lower-level APIs for advanced use.
+
+For more information, see the
+[AutoGraph guide](https://www.tensorflow.org/guide/autograph).
+
+By equivalent graph code we mean code that generates a TensorFlow graph when
+run. The generated graph has the same effects as the original code when executed
+(for example with `tf.function` or `tf.compat.v1.Session.run`). In other words,
+using AutoGraph can be thought of as running Python in TensorFlow.
 """
+# TODO(b/119833526): Link to the new tf.function + autograph tutorial.
 
 from __future__ import absolute_import
 from __future__ import division
@@ -39,10 +49,12 @@ from tensorflow.python.autograph.impl.api import to_graph
 from tensorflow.python.autograph.lang.directives import set_element_type
 from tensorflow.python.autograph.lang.directives import set_loop_options
 from tensorflow.python.autograph.lang.special_functions import stack
+from tensorflow.python.autograph.pyct.errors import AutoGraphError
 from tensorflow.python.autograph.lang.special_functions import tensor_list
-from tensorflow.python.autograph.pyct.transformer import AutographParseError
+from tensorflow.python.autograph.utils import ag_logging
 from tensorflow.python.util.all_util import remove_undocumented
 
+# TODO(mdan): Revisit this list once we finalize the generated code mechanism.
 _allowed_symbols = [
     # Main API
     'ConversionOptions',
@@ -66,7 +78,7 @@ _allowed_symbols = [
     'stack',
     'tensor_list',
     # Exceptions
-    'AutographParseError',
+    'AutoGraphError',
     # Utilities: to be removed
     'utils',
 ]
diff --git a/tensorflow/python/autograph/converters/BUILD b/tensorflow/python/autograph/converters/BUILD
index 3ac446db02c6ef1946e76a8b549a85c67fed2872..bafc5b0ca7c203255f098f6e03fa8b417b74d4f6 100644
--- a/tensorflow/python/autograph/converters/BUILD
+++ b/tensorflow/python/autograph/converters/BUILD
@@ -25,7 +25,6 @@ py_library(
         "conditional_expressions.py",
         "continue_statements.py",
         "control_flow.py",
-        "decorators.py",
         "directives.py",
         "error_handlers.py",
         "function_scopes.py",
@@ -139,21 +138,6 @@ py_test(
     ],
 )
 
-py_test(
-    name = "decorators_test",
-    srcs = ["decorators_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-        "no_windows",
-    ],
-    deps = [
-        ":converters",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python/autograph/core:test_lib",
-    ],
-)
-
 py_test(
     name = "directives_test",
     srcs = ["directives_test.py"],
diff --git a/tensorflow/python/autograph/converters/break_statements.py b/tensorflow/python/autograph/converters/break_statements.py
index e4e32ab9761aa13b5a7eefbc297ad3ea79412e99..c2ced26d8d7a40aff052232553ce0d374c0ffc57 100644
--- a/tensorflow/python/autograph/converters/break_statements.py
+++ b/tensorflow/python/autograph/converters/break_statements.py
@@ -53,7 +53,7 @@ class BreakTransformer(converter.Base):
       return block
 
     template = """
-        if not var_name:
+        if ag__.not_(var_name):
           block
       """
     node = templates.replace(
@@ -86,7 +86,7 @@ class BreakTransformer(converter.Base):
 
       template = """
         var_name = False
-        while test and not var_name:
+        while ag__.and_(lambda: test, lambda: ag__.not_(var_name)):
           body
         else:
           orelse
@@ -115,7 +115,7 @@ class BreakTransformer(converter.Base):
       # break did not trigger).
       guarded_orelse = self._guard_if_present(node.orelse, break_var)
       extra_test = templates.replace_as_expression(
-          'not var_name', var_name=break_var)
+          'ag__.not_(var_name)', var_name=break_var)
 
       # The extra test is hidden in the AST, which will confuse the static
       # analysis. To mitigate that, we insert a no-op statement that ensures
diff --git a/tensorflow/python/autograph/converters/builtin_functions_test.py b/tensorflow/python/autograph/converters/builtin_functions_test.py
index 2683be16ec7ffa91b1df3cd272336366502d9f4f..2e6cf16b9c5af5aad32e6746bf7c5503917200dd 100644
--- a/tensorflow/python/autograph/converters/builtin_functions_test.py
+++ b/tensorflow/python/autograph/converters/builtin_functions_test.py
@@ -55,7 +55,9 @@ class BuiltinFunctionsTest(converter_testing.TestCase):
     with self.converted(test_fn, builtin_functions, {'print': print}) as result:
       with self.session() as sess:
         with self.assertPrints('a\n'):
-          sess.run(result.test_fn('a'))
+          sess.run(result.test_fn(constant_op.constant('a')))
+      with self.assertPrints('a\n'):
+        result.test_fn('a')
 
   @test_util.run_deprecated_v1
   def test_print_multiple_values(self):
diff --git a/tensorflow/python/autograph/converters/call_trees.py b/tensorflow/python/autograph/converters/call_trees.py
index 9b85fc8367ceda77ab656bb889c88922cc52e173..a35ff16eca57a25269bafd99fd6742c01502cf60 100644
--- a/tensorflow/python/autograph/converters/call_trees.py
+++ b/tensorflow/python/autograph/converters/call_trees.py
@@ -22,318 +22,119 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-
 import gast
 
 from tensorflow.python.autograph.core import converter
 from tensorflow.python.autograph.pyct import anno
 from tensorflow.python.autograph.pyct import ast_util
-from tensorflow.python.autograph.pyct import inspect_utils
 from tensorflow.python.autograph.pyct import parser
 from tensorflow.python.autograph.pyct import templates
-from tensorflow.python.util import tf_inspect
-
-
-class FunctionInfo(collections.namedtuple('FunctionInfo', ('dtype',))):
-  pass
-
-
-# TODO(mdan): Move this to a separate transformer.
-KNOWN_NUMPY_FUNCTIONS = {
-    ('numpy', 'random', 'binomial'): FunctionInfo(dtype='tf.int64'),
-}
 
 
-# TODO(mdan): Get rid of these interfaces. Can now depend directly on Namer.
+# TODO(mdan): Rename to FunctionCallsTransformer.
 
 
-class FunctionNamer(object):
-  """Describes the interface for CallTreeTransformer's namer."""
+class _Function(object):
 
-  def compiled_function_name(self,
-                             original_fqn,
-                             live_entity=None,
-                             owner_type=None):
-    """Generate the name corresponding to the compiled version of a function.
-
-    Args:
-      original_fqn: string or tuple(string)
-      live_entity: Callable, the actual target function, if known.
-      owner_type: Optional object. If present, it indicates that the function is
-          a member of the given type.
-    Returns:
-      string, bool
-    """
-    raise NotImplementedError()
-
-  def compiled_class_name(self, original_fqn, live_entity=None):
-    """Generate the name corresponding to the compiled version of a class.
-
-    Args:
-      original_fqn: string or tuple(string)
-      live_entity: The actual target class, if known.
-    Returns:
-      string
-    """
-    raise NotImplementedError()
-
-
-# TODO(mdan): Rename to CallsTransformer.
+  no_root = True
 
 
 class CallTreeTransformer(converter.Base):
   """Transforms the call tree by renaming transformed symbols."""
 
-  def _resolve_decorator_name(self, node):
-    """Used to resolve decorator info."""
-    if isinstance(node, gast.Call):
-      return self._resolve_decorator_name(node.func)
-    if isinstance(node, gast.Name):
-      # TODO(mdan): Add test coverage for this branch.
-      return self.ctx.info.namespace.get(node.id)
-    if isinstance(node, gast.Attribute):
-      parent = self._resolve_decorator_name(node.value)
-      if parent is not None:
-        return getattr(parent, node.attr)
-      return None
-    raise ValueError(node)
-
-  def _try_resolve_target(self, node):
-    """Works for methods of objects of known type."""
-    if anno.hasanno(node, 'live_val'):
-      return anno.getanno(node, 'live_val')
-    if isinstance(node, gast.Attribute) and anno.hasanno(node, 'type'):
-      owner_type = anno.getanno(node, 'type')
-      if hasattr(owner_type, node.attr):
-        return getattr(owner_type, node.attr)
-      else:
-        # TODO(mdan): We should probably return None here rather than an error.
-        raise ValueError('Type "%s" has no attribute "%s". Is it dynamic?' %
-                         (owner_type, node.attr))
-    return None
-
-  def _function_is_compilable(self, target_entity):
-    """Determines whether an entity can be compiled at all."""
-    # TODO(mdan): Expand.
-
-    if target_entity.__module__ is None:
-      # Functions like builtins and NumPy don't expose a module.
-      # Those in general should not be compiled.
-      return False
-
-    if inspect_utils.isbuiltin(target_entity):
-      return False
-
-    if inspect_utils.isnamedtuple(target_entity):
-      # namedtuple doesn't expose its source code, making it uncompilable.
-      return False
-
-    return True
-
-  def _should_compile(self, node, fqn):
-    """Determines whether an entity should be compiled in the context."""
-    # TODO(mdan): Needs cleanup. We should remove the use of fqn altogether.
-    module_name = fqn[0]
-    for mod in self.ctx.program.uncompiled_modules:
-      if module_name.startswith(mod[0] + '.'):
-        return False
-
-    for i in range(1, len(fqn)):
-      if fqn[:i] in self.ctx.program.uncompiled_modules:
-        return False
-
-    target_entity = self._try_resolve_target(node.func)
-
-    if target_entity is not None:
-
-      # Currently, lambdas are always converted.
-      # TODO(mdan): Allow markers of the kind f = ag.do_not_convert(lambda: ...)
-      if inspect_utils.islambda(target_entity):
-        return True
-
-      # This may be reached when "calling" a callable attribute of an object.
-      # For example:
-      #
-      #   self.fc = tf.keras.layers.Dense()
-      #   self.fc()
-      #
-      for mod in self.ctx.program.uncompiled_modules:
-        if target_entity.__module__.startswith(mod[0] + '.'):
-          return False
-
-      # Inspect the target function decorators. If any include a @convert
-      # or @do_not_convert annotation, then they must be called as they are.
-      # TODO(mdan): This may be quite heavy. Perhaps always dynamically convert?
-      # To parse and re-analyze each function for every call site could be quite
-      # wasteful. Maybe we could cache the parsed AST?
-      try:
-        target_node, _ = parser.parse_entity(target_entity)
-        target_node = target_node.body[0]
-      except TypeError:
-        # Functions whose source we cannot access are compilable (e.g. wrapped
-        # to py_func).
-        return True
-
-      # This attribute is set when the decorator was applied before the
-      # function was parsed. See api.py.
-      if hasattr(target_entity, '__ag_compiled'):
-        return False
-
-      for dec in target_node.decorator_list:
-        decorator_fn = self._resolve_decorator_name(dec)
-        if (decorator_fn is not None and
-            decorator_fn in self.ctx.program.options.strip_decorators):
-          return False
+  def visit_FunctionDef(self, node):
+    self.state[_Function].enter()
+    node.args = self.visit(node.args)
+    node.body = self.visit_block(node.body)
 
-    return True
+    if self.state[_Function].level < 2:
+      # Top-level functions lose their decorator because the conversion is
+      # always just-in-time and by the time it happens the decorators are
+      # already set to be applied.
+      node.decorator_list = []
+    else:
+      # Inner functions are converted already, so we insert a decorator to
+      # prevent double conversion. Double conversion would work too, but this
+      # saves the overhead.
+      node.decorator_list.append(
+          parser.parse_expression('ag__.do_not_convert_internal'))
 
-  def _rename_compilable_function(self, node):
-    assert anno.hasanno(node.func, 'live_val')
-    assert anno.hasanno(node.func, 'fqn')
-    target_entity = anno.getanno(node.func, 'live_val')
-    target_fqn = anno.getanno(node.func, 'fqn')
+    if node.returns:
+      node.returns = self.visit(node.returns)
 
-    if anno.hasanno(node, 'is_constructor'):
-      new_name = self.ctx.namer.compiled_class_name(
-          target_fqn, live_entity=target_entity)
-      do_rename = True
-    else:
-      if anno.hasanno(node.func, 'parent_type'):
-        owner_type = anno.getanno(node.func, 'parent_type')
-      else:
-        # Fallback - not reliable.
-        owner_type = inspect_utils.getmethodclass(target_entity)
-      new_name, do_rename = self.ctx.namer.compiled_function_name(
-          target_fqn, live_entity=target_entity, owner_type=owner_type)
+    self.state[_Function].exit()
+    return node
 
-    if do_rename:
-      if target_entity is not None:
-        if tf_inspect.ismethod(target_entity):
-          # The renaming process will transform it into a regular function.
-          # TODO(mdan): Is this complete? How does it work with nested members?
-          node.args = [node.func.value] + node.args
-      node.func = templates.replace_as_expression(
-          'func_name', func_name=new_name)
+  def visit_With(self, node):
+    # Context manager calls (in node.items) are not converted.
+    node.body = self.visit_block(node.body)
     return node
 
-  def _wrap_to_py_func_single_return(self, node, dtype):
-    # TODO(mdan): Properly handle varargs, etc.
-    template = """
-      ag__.utils.wrap_py_func(func, dtype, (args,), kwargs, False)
-    """
-    return templates.replace_as_expression(
-        template,
-        func=node.func,
-        dtype=parser.parse_expression(dtype),
-        args=node.args,
-        kwargs=ast_util.keywords_to_dict(node.keywords))
+  def visit_Call(self, node):
+    # TODO(mdan): Refactor converted_call as a 'Call' operator.
+
+    # Calls to the internal 'ag__' module are never converted (though their
+    # arguments might be).
+    full_name = str(anno.getanno(node.func, anno.Basic.QN, default=''))
+    if full_name.startswith('ag__.'):
+      return self.generic_visit(node)
+    if (full_name == 'print' and
+        not self.ctx.program.options.uses(converter.Feature.BUILTIN_FUNCTIONS)):
+      return self.generic_visit(node)
 
-  def _insert_dynamic_conversion(self, node):
-    """Inlines a dynamic conversion for a dynamic function."""
-    # TODO(mdan): Pass information on the statically compiled functions.
-    # Having access to the statically compiled functions can help avoid
-    # unnecessary compilation.
-    # For example, this would lead to function `a` being compiled twice:
-    #
-    #   def a():
-    #     v = b
-    #     b()
-    #   def b():
-    #     a()
-    #
-    # This is really a problem with recursive calls, which currently can
-    # only be gated by a static condition, and should be rare.
-    # TODO(mdan): It probably makes sense to use dynamic conversion every time.
-    # Before we could convert all the time though, we'd need a reasonable
-    # caching mechanism.
-    template = """
-      ag__.converted_call(func, owner, options, args)
-    """
     if isinstance(node.func, gast.Attribute):
       func = gast.Str(node.func.attr)
       owner = node.func.value
     else:
       func = node.func
       owner = parser.parse_expression('None')
+
+    starred_arg = None
+    normal_args = []
+    for a in node.args:
+      if isinstance(a, gast.Starred):
+        assert starred_arg is None, 'Multiple *args should be impossible.'
+        starred_arg = a
+      else:
+        normal_args.append(a)
+    if starred_arg is None:
+      args = templates.replace_as_expression('(args,)', args=normal_args)
+    else:
+      args = templates.replace_as_expression(
+          '(args,) + tuple(stararg)',
+          stararg=starred_arg.value,
+          args=normal_args)
+
+    kwargs_arg = None
+    normal_keywords = []
+    for k in node.keywords:
+      if k.arg is None:
+        assert kwargs_arg is None, 'Multiple **kwargs should be impossible.'
+        kwargs_arg = k
+      else:
+        normal_keywords.append(k)
+    if kwargs_arg is None:
+      kwargs = ast_util.keywords_to_dict(normal_keywords)
+    else:
+      kwargs = templates.replace_as_expression(
+          'dict(kwargs, **keywords)',
+          kwargs=kwargs_arg.value,
+          keywords=ast_util.keywords_to_dict(normal_keywords))
+
+    template = """
+      ag__.converted_call(func, owner, options, args, kwargs)
+    """
     new_call = templates.replace_as_expression(
         template,
         func=func,
         owner=owner,
         options=self.ctx.program.options.to_ast(
-            self.ctx.info.namespace,
+            self.ctx,
             internal_convert_user_code=self.ctx.program.options.recursive),
-        args=node.args)
-    # TODO(mdan): Improve the template mechanism to better support this.
-    new_call.keywords = node.keywords
-    return new_call
+        args=args,
+        kwargs=kwargs)
 
-  def _visit_decorators(self, decorator_list):
-    if not self.ctx.program.options.uses(converter.Feature.DECORATORS):
-      # When not processing decorators, strip everything that is encountered.
-      return []
-
-    return self.visit_block(decorator_list)
-
-  def visit_FunctionDef(self, node):
-    node.args = self.visit(node.args)
-    node.body = self.visit_block(node.body)
-    node.decorator_list = self._visit_decorators(node.decorator_list)
-    node.returns = self.visit_block(node.returns)
-    return node
-
-  def visit_Call(self, node):
-    if anno.hasanno(node.func, 'live_val'):
-      target_entity = anno.getanno(node.func, 'live_val')
-
-      if anno.hasanno(node.func, 'fqn'):
-        target_fqn = anno.getanno(node.func, 'fqn')
-      else:
-        target_fqn = None
-
-      if self._function_is_compilable(target_entity):
-        if self._should_compile(node, target_fqn):
-          node = self._rename_compilable_function(node)
-        else:
-          node = self.generic_visit(node)
-          return node
-
-      elif target_fqn and target_fqn in KNOWN_NUMPY_FUNCTIONS:
-        # TODO(mdan): Should we replace these with equivalent TF ops instead?
-        node = self._wrap_to_py_func_single_return(
-            node, KNOWN_NUMPY_FUNCTIONS[target_fqn].dtype)
-
-      elif inspect_utils.isbuiltin(target_entity):
-        # Note: Any builtin that passed the builtins converter is assumed to be
-        # safe for graph mode.
-        return node
-
-      elif inspect_utils.isnamedtuple(target_entity):
-        # Although not compilable, we assume they are safe for graph mode.
-        node = self.generic_visit(node)
-        return node
-
-      else:
-        # TODO(mdan): Instert dynamic conversion here instead.
-        raise NotImplementedError(
-            'py_func with return values (unknown function)')
-    else:
-      # Special cases
-      # TODO(mdan): These need a systematic review - there may be more.
-
-      # 1. super() calls - these are preserved. The class conversion mechanism
-      # will ensure that they return the correct value.
-      if ast_util.matches(node, 'super(_)'):
-        return node
-
-      # 2. super().method calls - these are preserved as well, when the
-      # conversion processes the entire class.
-      if (ast_util.matches(node, 'super(_)._(_)') and
-          self.ctx.info.owner_type is not None):
-        return node
-
-      node = self._insert_dynamic_conversion(node)
-    return node
+    return new_call
 
 
 def transform(node, ctx):
diff --git a/tensorflow/python/autograph/converters/call_trees_test.py b/tensorflow/python/autograph/converters/call_trees_test.py
index 454d75d755c7273d11e1f89e4138cd997eb6e49a..654682edc737f8de291f50259c28a51c131aef58 100644
--- a/tensorflow/python/autograph/converters/call_trees_test.py
+++ b/tensorflow/python/autograph/converters/call_trees_test.py
@@ -18,147 +18,97 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-
-import numpy as np
-
 from tensorflow.python.autograph.converters import call_trees
 from tensorflow.python.autograph.core import converter_testing
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
 class CallTreesTest(converter_testing.TestCase):
 
-  def test_basic(self):
-
-    def test_fn_1(_):
-      raise ValueError('This should not be called in the compiled version.')
-
-    def other_test_fn_1(a):
-      return a + 1
-
-    def test_fn_2(a):
-      return test_fn_1(a) + 1
-
-    ns = {'test_fn_1': test_fn_1}
-    node, ctx = self.prepare(test_fn_2, ns)
-    node = call_trees.transform(node, ctx)
+  def test_normal_function(self):
 
-    with self.compiled(node, ns) as result:
-      new_name, _ = ctx.namer.compiled_function_name(('test_fn_1',))
-      setattr(result, new_name, other_test_fn_1)
-      self.assertEquals(result.test_fn_2(1), 3)
-
-  def test_dynamic_function(self):
-
-    def test_fn_1():
-      raise ValueError('This should be masked by the mock in self.compiled.')
-
-    def test_fn_2(f):
+    def test_fn(f):
       return f() + 3
 
-    with self.converted(test_fn_2, call_trees, {}) as result:
-      # 10 = 7 (from the mock) + 3 (from test_fn_2)
-      self.assertEquals(10, result.test_fn_2(test_fn_1))
+    with self.converted(test_fn, call_trees, {}) as result:
+      self.assertEquals(
+          result.test_fn(None),
+          converter_testing.RESULT_OF_MOCK_CONVERTED_CALL + 3)
+      self.assertListEqual(self.dynamic_calls, [((), {})])
 
-  def test_basic_method(self):
-
-    class TestClass(object):
+  def test_function_with_kwarg(self):
 
-      def test_fn_1(self, a):
-        return a + 1
+    def test_fn(f, a, b):
+      return f(a, c=b) + 3
 
-      def test_fn_2(self, a):
-        return self.test_fn_1(a) + 1
+    with self.converted(test_fn, call_trees, {}) as result:
+      self.assertEquals(
+          result.test_fn(None, 1, 2),
+          converter_testing.RESULT_OF_MOCK_CONVERTED_CALL + 3)
+      self.assertListEqual(self.dynamic_calls, [((1,), {'c': 2})])
 
-    ns = {'TestClass': TestClass}
-    node, ctx = self.prepare(
-        TestClass.test_fn_2,
-        ns,
-        namer=converter_testing.FakeNoRenameNamer(),
-        arg_types={'self': (TestClass.__name__, TestClass)})
-    node = call_trees.transform(node, ctx)
+  def test_function_with_kwargs_starargs(self):
 
-    with self.compiled(node, ns) as result:
-      tc = TestClass()
-      self.assertEquals(3, result.test_fn_2(tc, 1))
+    def test_fn(f, a, *args, **kwargs):
+      return f(a, *args, **kwargs) + 5
 
-  def test_known_called_lambda(self):
+    with self.converted(test_fn, call_trees, {}) as result:
+      self.assertEquals(
+          result.test_fn(None, 1, *[2, 3], **{'b': 4, 'c': 5}),
+          converter_testing.RESULT_OF_MOCK_CONVERTED_CALL + 5)
+      self.assertListEqual(self.dynamic_calls, [((1, 2, 3), {'b': 4, 'c': 5})])
 
-    l = lambda x: x
+  def test_function_with_kwargs_starargs_only(self):
 
-    def test_fn(a):
-      return l(a)
+    def f(*unused_args):  # Will not be called.
+      pass
 
-    ns = {'l': l}
-    node, ctx = self.prepare(test_fn, ns)
-    node = call_trees.transform(node, ctx)
-
-    with self.compiled(node, ns) as result:
-      self.assertEquals(1, result.test_fn(1))
-
-  def test_known_called_namedtuple(self):
-
-    nt = collections.namedtuple('TestNamedTuple', ['a'])
-
-    def test_fn(a):
-      return nt(a)
-
-    ns = {'nt': nt}
-    node, ctx = self.prepare(test_fn, ns)
-    node = call_trees.transform(node, ctx)
-
-    with self.compiled(node, ns) as result:
-      self.assertEquals(nt(1), result.test_fn(1))
+    def test_fn():
+      args = [1, 2, 3]
+      return f(*args) + 11
 
-  def test_py_func_known_function(self):
+    with self.converted(test_fn, call_trees, {'f': f}) as result:
+      self.assertEquals(
+          result.test_fn(),
+          converter_testing.RESULT_OF_MOCK_CONVERTED_CALL + 11)
+      self.assertListEqual(self.dynamic_calls, [((1, 2, 3), {})])
 
-    def test_fn():
-      return np.random.binomial(2, 0.5)
+  def test_function_with_kwargs_keywords(self):
 
-    with self.converted(test_fn, call_trees, {'np': np},
-                        dtypes.int64) as result:
-      with self.cached_session() as sess:
-        self.assertTrue(isinstance(result.test_fn(), ops.Tensor))
-        self.assertIn(self.evaluate(result.test_fn()), (0, 1, 2))
+    def test_fn(f, a, b, **kwargs):
+      return f(a, b=b, **kwargs) + 5
 
-  def test_uncompiled_modules(self):
+    with self.converted(test_fn, call_trees, {}) as result:
+      self.assertEquals(
+          result.test_fn(None, 1, 2, **{'c': 3}),
+          converter_testing.RESULT_OF_MOCK_CONVERTED_CALL + 5)
+      self.assertListEqual(self.dynamic_calls, [((1,), {'b': 2, 'c': 3})])
 
-    def test_fn(a):
-      a = math_ops.multiply(a, constant_op.constant(2))
-      a = math_ops.add(a, constant_op.constant(1))
-      return a
+  def test_class_method(self):
 
-    ns = {'math_ops': math_ops, 'constant_op': constant_op}
-    node, ctx = self.prepare(
-        test_fn,
-        ns,
-        arg_types=set(((math_ops.__name__,), (constant_op.__name__,))))
-    node = call_trees.transform(node, ctx)
+    class TestClass(object):
 
-    with self.compiled(node, ns) as result:
-      with self.cached_session() as sess:
-        result_tensor = result.test_fn(constant_op.constant(1))
-        self.assertEquals(self.evaluate(result_tensor), 3)
+      def test_method(self, a):
+        return self.other_method(a) + 1
 
-  def test_call_to_decorated_function(self):
+    tc = TestClass()
+    with self.converted(TestClass.test_method, call_trees, {}) as result:
+      self.assertEquals(converter_testing.RESULT_OF_MOCK_CONVERTED_CALL + 1,
+                        result.test_method(tc, 1))
+      self.assertListEqual(self.dynamic_calls, [((1,), {})])
 
-    def decorator(f):
-      return f
+  def test_object_method(self):
 
-    @decorator
-    def called_fn(a):
-      return a
+    class TestClass(object):
 
-    def test_fn(a):
-      return called_fn(a)
+      def test_method(self, a):
+        return self.other_method(a) + 1
 
-    node, ctx = self.prepare(test_fn, {'called_fn': called_fn})
-    node = call_trees.transform(node, ctx)
+    tc = TestClass()
+    with self.converted(tc.test_method, call_trees, {}) as result:
+      self.assertEquals(converter_testing.RESULT_OF_MOCK_CONVERTED_CALL + 1,
+                        result.test_method(tc, 1))
+      self.assertListEqual(self.dynamic_calls, [((1,), {})])
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/autograph/converters/continue_statements.py b/tensorflow/python/autograph/converters/continue_statements.py
index 05e19e59fc6701db618e925e1d305f299b270e33..780f837fa3966c68383ab0ba4acdfcb7b221d005 100644
--- a/tensorflow/python/autograph/converters/continue_statements.py
+++ b/tensorflow/python/autograph/converters/continue_statements.py
@@ -29,11 +29,17 @@ class _Continue(object):
   def __init__(self):
     self.used = False
     self.control_var_name = None
-    self.create_guard = False
-    self.guard_created = False
 
   def __repr__(self):
-    return 'used: %s, var: %s' % (self.used, self.control_var_name)
+    return '<_Continue(used: {}, var: {})>'.format(self.used,
+                                                   self.control_var_name)
+
+
+class _Block(object):
+
+  def __init__(self):
+    self.guard_created = False
+    self.create_guard = False
 
 
 class ContinueCanonicalizationTransformer(converter.Base):
@@ -68,17 +74,17 @@ class ContinueCanonicalizationTransformer(converter.Base):
     #    |                #         created if node)
 
     if self.state[_Continue].used:
-      if self.state[_Continue].guard_created:
+      if self.state[_Block].guard_created:
         return node, None
 
-      elif not self.state[_Continue].create_guard:
-        self.state[_Continue].create_guard = True
+      elif not self.state[_Block].create_guard:
+        self.state[_Block].create_guard = True
         return node, None
 
       else:
-        self.state[_Continue].guard_created = True
+        self.state[_Block].guard_created = True
         template = """
-          if not var_name:
+          if ag__.not_(var_name):
             original_node
         """
         cond, = templates.replace(
@@ -90,6 +96,7 @@ class ContinueCanonicalizationTransformer(converter.Base):
 
   def _visit_loop_body(self, node, nodes):
     self.state[_Continue].enter()
+    self.state[_Block].enter()
     scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
     continue_var = self.ctx.namer.new_symbol('continue_', scope.referenced)
     self.state[_Continue].control_var_name = continue_var
@@ -103,14 +110,21 @@ class ContinueCanonicalizationTransformer(converter.Base):
       control_var_init = templates.replace(template, var_name=continue_var)
       nodes = control_var_init + nodes
 
+    self.state[_Block].exit()
     self.state[_Continue].exit()
     return nodes
 
+  def _visit_non_loop_body(self, nodes):
+    self.state[_Block].enter()
+    nodes = self.visit_block(nodes, after_visit=self._postprocess_statement)
+    self.state[_Block].exit()
+    return nodes
+
   def visit_While(self, node):
     node.test = self.visit(node.test)
     node.body = self._visit_loop_body(node, node.body)
     # A continue in the else clause applies to the containing scope.
-    node.orelse = self.visit_block(node.orelse)
+    node.orelse = self._visit_non_loop_body(node.orelse)
     return node
 
   def visit_For(self, node):
@@ -118,7 +132,29 @@ class ContinueCanonicalizationTransformer(converter.Base):
     node.iter = self.generic_visit(node.iter)
     node.body = self._visit_loop_body(node, node.body)
     # A continue in the else clause applies to the containing scope.
-    node.orelse = self.visit_block(node.orelse)
+    node.orelse = self._visit_non_loop_body(node.orelse)
+    return node
+
+  def visit_If(self, node):
+    node.body = self.visit_block(node.body)
+    node.orelse = self._visit_non_loop_body(node.orelse)
+    return node
+
+  def visit_With(self, node):
+    node.items = self.visit_block(node.items)
+    node.body = self._visit_non_loop_body(node.body)
+    return node
+
+  def visit_Try(self, node):
+    node.body = self._visit_non_loop_body(node.body)
+    node.orelse = self._visit_non_loop_body(node.orelse)
+    # In Python 3.8 and later continue is allowed in finally blocks
+    node.finalbody = self._visit_non_loop_body(node.finalbody)
+    node.handlers = self.visit_block(node.handlers)
+    return node
+
+  def visit_ExceptHandler(self, node):
+    node.body = self._visit_non_loop_body(node.body)
     return node
 
 
diff --git a/tensorflow/python/autograph/converters/continue_statements_test.py b/tensorflow/python/autograph/converters/continue_statements_test.py
index d6aaa504436aa13007142bc87623605be15667d2..5a1828e3189db7c2ae81991951d153074ff4904c 100644
--- a/tensorflow/python/autograph/converters/continue_statements_test.py
+++ b/tensorflow/python/autograph/converters/continue_statements_test.py
@@ -20,15 +20,15 @@ from __future__ import print_function
 
 from tensorflow.python.autograph.converters import continue_statements
 from tensorflow.python.autograph.core import converter_testing
-from tensorflow.python.eager import context as tfe_ctx
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
 from tensorflow.python.platform import test
 
 
 class ContinueCanonicalizationTest(converter_testing.TestCase):
 
   def assertTransformedEquivalent(self, test_fn, *inputs):
-    with self.converted(test_fn, continue_statements, {},
+    with self.converted(test_fn, continue_statements, {'ops': ops},
                         constant_op.constant) as result:
       self.assertEqual(test_fn(*inputs), result.test_fn(*inputs))
 
@@ -43,11 +43,10 @@ class ContinueCanonicalizationTest(converter_testing.TestCase):
         v.append(x)
       return v
 
-    with tfe_ctx.eager_mode():
-      self.assertTransformedEquivalent(test_fn, 0)
-      self.assertTransformedEquivalent(test_fn, 1)
-      self.assertTransformedEquivalent(test_fn, 3)
-      self.assertTransformedEquivalent(test_fn, 4)
+    self.assertTransformedEquivalent(test_fn, 0)
+    self.assertTransformedEquivalent(test_fn, 1)
+    self.assertTransformedEquivalent(test_fn, 3)
+    self.assertTransformedEquivalent(test_fn, 4)
 
   def test_for_loop(self):
 
@@ -60,11 +59,89 @@ class ContinueCanonicalizationTest(converter_testing.TestCase):
         v.append(x)
       return v
 
-    with tfe_ctx.eager_mode():
-      self.assertTransformedEquivalent(test_fn, [])
-      self.assertTransformedEquivalent(test_fn, [1])
-      self.assertTransformedEquivalent(test_fn, [2])
-      self.assertTransformedEquivalent(test_fn, [1, 2, 3])
+    self.assertTransformedEquivalent(test_fn, [])
+    self.assertTransformedEquivalent(test_fn, [1])
+    self.assertTransformedEquivalent(test_fn, [2])
+    self.assertTransformedEquivalent(test_fn, [1, 2, 3])
+
+  def test_nested_with(self):
+
+    def test_fn(x):
+      v = []
+      while x > 0:
+        x -= 1
+        with ops.name_scope(''):
+          if x % 2 == 0:
+            continue
+        v.append(x)
+      return v
+
+    self.assertTransformedEquivalent(test_fn, 0)
+    self.assertTransformedEquivalent(test_fn, 1)
+    self.assertTransformedEquivalent(test_fn, 3)
+    self.assertTransformedEquivalent(test_fn, 4)
+
+  def test_nested_multiple_withs(self):
+
+    def test_fn(x):
+      v = []
+      while x > 0:
+        x -= 1
+        with ops.name_scope(''):
+          if x % 2 == 0:
+            continue
+        with ops.name_scope(''):
+          v.append(x)
+        v.append(x)
+      return v
+
+    self.assertTransformedEquivalent(test_fn, 0)
+    self.assertTransformedEquivalent(test_fn, 1)
+    self.assertTransformedEquivalent(test_fn, 3)
+    self.assertTransformedEquivalent(test_fn, 4)
+
+  def test_nested_multiple_withs_and_statements(self):
+
+    def test_fn(x):
+      v = []
+      while x > 0:
+        x -= 1
+        with ops.name_scope(''):
+          if x % 2 == 0:
+            continue
+          v.append(x)
+        v.append(x)
+        with ops.name_scope(''):
+          v.append(x)
+        v.append(x)
+      return v
+
+    self.assertTransformedEquivalent(test_fn, 0)
+    self.assertTransformedEquivalent(test_fn, 1)
+    self.assertTransformedEquivalent(test_fn, 3)
+    self.assertTransformedEquivalent(test_fn, 4)
+
+  def test_nested_multiple_withs_and_nested_withs(self):
+
+    def test_fn(x):
+      v = []
+      while x > 0:
+        x -= 1
+        with ops.name_scope(''):
+          if x % 2 == 0:
+            continue
+          with ops.name_scope(''):
+            v.append(x)
+        v.append(x)
+        with ops.name_scope(''):
+          v.append(x)
+        v.append(x)
+      return v
+
+    self.assertTransformedEquivalent(test_fn, 0)
+    self.assertTransformedEquivalent(test_fn, 1)
+    self.assertTransformedEquivalent(test_fn, 3)
+    self.assertTransformedEquivalent(test_fn, 4)
 
   def test_nested(self):
 
@@ -83,11 +160,10 @@ class ContinueCanonicalizationTest(converter_testing.TestCase):
         v.append(x)
       return v, u, w
 
-    with tfe_ctx.eager_mode():
-      self.assertTransformedEquivalent(test_fn, 0)
-      self.assertTransformedEquivalent(test_fn, 1)
-      self.assertTransformedEquivalent(test_fn, 3)
-      self.assertTransformedEquivalent(test_fn, 4)
+    self.assertTransformedEquivalent(test_fn, 0)
+    self.assertTransformedEquivalent(test_fn, 1)
+    self.assertTransformedEquivalent(test_fn, 3)
+    self.assertTransformedEquivalent(test_fn, 4)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/autograph/converters/control_flow.py b/tensorflow/python/autograph/converters/control_flow.py
index bef6cae1bb89908bd644115e31ca5662043b060c..15ccbf74084573198031999ba81d666bde68babb 100644
--- a/tensorflow/python/autograph/converters/control_flow.py
+++ b/tensorflow/python/autograph/converters/control_flow.py
@@ -23,7 +23,6 @@ import gast
 from tensorflow.python.autograph.core import converter
 from tensorflow.python.autograph.pyct import anno
 from tensorflow.python.autograph.pyct import ast_util
-from tensorflow.python.autograph.pyct import parser
 from tensorflow.python.autograph.pyct import templates
 from tensorflow.python.autograph.pyct.static_analysis import annos
 
@@ -49,7 +48,13 @@ class ControlFlowTransformer(converter.Base):
 
   def _create_cond_branch(self, body_name, aliased_orig_names,
                           aliased_new_names, body, returns):
-    if len(returns) == 1:
+    if not returns:
+      # TODO(b/110167197): Replace with a plain return.
+      template = """
+        return 1
+      """
+      return_stmt = templates.replace(template)
+    elif len(returns) == 1:
       template = """
         return retval
       """
@@ -167,14 +172,12 @@ class ControlFlowTransformer(converter.Base):
         s for s in created_in_body if not s.is_composite())
     basic_created_in_orelse = tuple(
         s for s in created_in_orelse if not s.is_composite())
-    if basic_created_in_body != basic_created_in_orelse:
-      raise ValueError(
-          'if statement may not initialize all variables: the true branch'
-          ' creates %s, while the false branch creates %s. Make sure all'
-          ' these variables are initialized either in both'
-          ' branches or before the if statement.' %
-          (self._fmt_symbols(basic_created_in_body),
-           self._fmt_symbols(basic_created_in_orelse)))
+
+    # These variables are defined only in a single branch. This is fine in
+    # Python so we pass them through. Another backend, e.g. Tensorflow, may need
+    # to handle these cases specially or throw an Error.
+    possibly_undefined = (set(basic_created_in_body) ^
+                          set(basic_created_in_orelse))
 
     # Alias the closure variables inside the conditional functions, to allow
     # the functions access to the respective variables.
@@ -220,7 +223,7 @@ class ControlFlowTransformer(converter.Base):
       # branch functions will return a dummy value that ensures cond
       # actually has some return value as well.
       cond_results = None
-      # TODO(mdan): This doesn't belong here; it's specific to the operator.
+      # TODO(mdan): Replace with None once side_effect_guards is retired.
       returned_from_body = (templates.replace_as_expression(
           'ag__.match_staging_level(1, cond_var_name)',
           cond_var_name=cond_var_name),)
@@ -241,10 +244,28 @@ class ControlFlowTransformer(converter.Base):
         aliased_new_names=aliased_orelse_new_names,
         body=node_orelse,
         returns=returned_from_orelse)
+    undefined_assigns = self._create_undefined_assigns(possibly_undefined)
+
     cond_expr = self._create_cond_expr(cond_results, cond_var_name, body_name,
                                        orelse_name)
 
-    return cond_assign + body_def + orelse_def + cond_expr
+    return (undefined_assigns
+            + cond_assign
+            + body_def
+            + orelse_def
+            + cond_expr)
+
+  def _create_undefined_assigns(self, undefined_symbols):
+    assignments = []
+    for s in undefined_symbols:
+      template = '''
+        var = ag__.Undefined(symbol_name)
+      '''
+      assignments += templates.replace(
+          template,
+          var=s,
+          symbol_name=gast.Str(s.ssf()))
+    return assignments
 
   def _get_loop_state(self, node):
     body_scope = anno.getanno(node, annos.NodeAnno.BODY_SCOPE)
@@ -259,34 +280,15 @@ class ControlFlowTransformer(converter.Base):
     # the loop state, regardless of whether they are later used or not.
     loop_state = body_scope.modified & live_in
 
-    undefined_lives = loop_state - defined_in
+    # Variable that are used or defined inside the loop, but not defined
+    # before entering the loop
+    undefined_lives = ((loop_state - defined_in)
+                       | ((body_scope.modified - live_in) & live_out))
     # Only simple variables must be defined. The composite ones will be
     # implicitly checked at runtime.
     undefined_simple_lives = {v for v in undefined_lives if v.is_simple()}
-    if undefined_simple_lives:
-      raise NameError(
-          'cannot convert loop: it includes symbols that are undefined'
-          ' when entering the loop: {}'.format(
-              self._fmt_symbols(undefined_simple_lives)))
-
-    live_defs_in_loop = (body_scope.modified - live_in) & live_out
-    if live_defs_in_loop:
-      # TODO(mdan): Include reference to explanation why.
-      raise NotImplementedError(
-          'cannot convert loop: it includes symbols that are defined'
-          ' inside the loop, but used later: {}. To fix, initialize'
-          ' these symbols before the loop'.format(
-              self._fmt_symbols(live_defs_in_loop)))
-
-    if not loop_state:
-      # TODO(mdan): Implement this properly.
-      # We need to check whether any variable created inside the body scope
-      # is used before being modified outside the scope. This should be done
-      # during activity analysis, and in general should cover the case where
-      # variables may not be initialized.
-      raise ValueError('cannot convert loop: no outputs')
-
-    return loop_state, reserved_symbols
+
+    return loop_state, reserved_symbols, undefined_simple_lives
 
   def _state_constructs(self, loop_state, reserved_symbols):
     loop_state = list(loop_state)
@@ -299,19 +301,18 @@ class ControlFlowTransformer(converter.Base):
         if str(name) != ssf
     }
 
+    state_ast_tuple = gast.Tuple([n.ast() for n in loop_state], None)
+
     if len(loop_state) == 1:
       loop_state = loop_state[0]
       state_ssf = state_ssf[0]
-      state_ast_tuple = loop_state
-    else:
-      state_ast_tuple = gast.Tuple([n.ast() for n in loop_state], None)
 
     return loop_state, state_ssf, state_ast_tuple, ssf_map
 
   def visit_While(self, node):
     self.generic_visit(node)
 
-    loop_state, reserved_symbols = self._get_loop_state(node)
+    loop_state, reserved_symbols, possibly_undef = self._get_loop_state(node)
 
     # Note: one might expect we can dispatch based on the loop condition.
     # But because that is dependent on the state, it cannot be evaluated ahead
@@ -329,75 +330,154 @@ class ControlFlowTransformer(converter.Base):
     cond_scope = anno.getanno(node, annos.NodeAnno.COND_SCOPE)
     cond_closure = set()
     for s in cond_scope.read:
-      cond_closure.update(s.support_set)
-    cond_closure -= loop_state
+      cond_closure |= s.support_set
 
     loop_state, state_ssf, state_ast_tuple, ssf_map = self._state_constructs(
         loop_state, reserved_symbols)
     node_body = ast_util.rename_symbols(node.body, ssf_map)
     test = ast_util.rename_symbols(node.test, ssf_map)
 
+    if loop_state:
+      template = """
+        def test_name(state_ssf):
+          return test
+        def body_name(state_ssf):
+          body
+          return state_ssf,
+        state_ast_tuple = ag__.while_stmt(
+            test_name, body_name, (state,), (extra_deps,))
+      """
+      node = templates.replace(
+          template,
+          state=loop_state,
+          state_ssf=state_ssf,
+          state_ast_tuple=state_ast_tuple,
+          test_name=self.ctx.namer.new_symbol('loop_test', reserved_symbols),
+          test=test,
+          body_name=self.ctx.namer.new_symbol('loop_body', reserved_symbols),
+          body=node_body,
+          extra_deps=tuple(s.ast() for s in cond_closure),
+      )
+    else:
+      template = """
+        def test_name():
+          return test
+        def body_name():
+          body
+          return ()
+        ag__.while_stmt(test_name, body_name, (), (extra_deps,))
+      """
+      node = templates.replace(
+          template,
+          test_name=self.ctx.namer.new_symbol('loop_test', reserved_symbols),
+          test=test,
+          body_name=self.ctx.namer.new_symbol('loop_body', reserved_symbols),
+          body=node_body,
+          extra_deps=tuple(s.ast() for s in cond_closure),
+      )
+
+    undefined_assigns = self._create_undefined_assigns(possibly_undef)
+    return undefined_assigns + node
+
+  def _create_for_loop_early_stopping(self, loop_state, state_ssf,
+                                      state_ast_tuple, original_node,
+                                      extra_test_name, extra_test,
+                                      body_name, loop_body):
+    """Create node for for-loop with early stopping (e.g. break or return)."""
     template = """
-      def test_name(state_ssf):
-        return test
-      def body_name(state_ssf):
+      def extra_test_name(state_ssf):
+        return extra_test_expr
+      def body_name(loop_vars, state_ssf):
+        # Workaround for PEP-3113
+        iterate = loop_vars
         body
         return state_ssf,
-      state_ast_tuple = ag__.while_stmt(
-          test_name, body_name, (state,), (extra_deps,))
+      state_ast_tuple = ag__.for_stmt(
+          iter_, extra_test_name, body_name, (state,))
     """
-    node = templates.replace(
+    return templates.replace(
         template,
         state=loop_state,
         state_ssf=state_ssf,
         state_ast_tuple=state_ast_tuple,
-        test_name=self.ctx.namer.new_symbol('loop_test', reserved_symbols),
-        test=test,
-        body_name=self.ctx.namer.new_symbol('loop_body', reserved_symbols),
-        body=node_body,
-        extra_deps=tuple(s.ast() for s in cond_closure),
-    )
-
-    return node
-
-  def visit_For(self, node):
-    self.generic_visit(node)
-
-    loop_state, reserved_symbols = self._get_loop_state(node)
-    loop_state, state_ssf, state_ast_tuple, ssf_map = self._state_constructs(
-        loop_state, reserved_symbols)
-    node_body = ast_util.rename_symbols(node.body, ssf_map)
-    if anno.hasanno(node, 'extra_test'):
-      extra_test = anno.getanno(node, 'extra_test')
-      extra_test = ast_util.rename_symbols(extra_test, ssf_map)
-    else:
-      extra_test = parser.parse_expression('True')
+        iter_=original_node.iter,
+        iterate=original_node.target,
+        extra_test_name=extra_test_name,
+        extra_test_expr=extra_test,
+        body_name=body_name,
+        body=loop_body)
 
+  def _create_for_loop_with_state(self, loop_state, state_ssf, state_ast_tuple,
+                                  original_node, body_name, loop_body):
+    """Create node for for-loop with loop-carried state, no early stopping."""
     template = """
-      def extra_test_name(state_ssf):
-        return extra_test_expr
       def body_name(loop_vars, state_ssf):
         # Workaround for PEP-3113
         iterate = loop_vars
         body
         return state_ssf,
       state_ast_tuple = ag__.for_stmt(
-          iter_, extra_test_name, body_name, (state,))
+          iter_, None, body_name, (state,))
     """
-    node = templates.replace(
+    return templates.replace(
         template,
         state=loop_state,
         state_ssf=state_ssf,
         state_ast_tuple=state_ast_tuple,
-        iter_=node.iter,
-        iterate=node.target,
-        extra_test_name=self.ctx.namer.new_symbol('extra_test',
-                                                  reserved_symbols),
-        extra_test_expr=extra_test,
-        body_name=self.ctx.namer.new_symbol('loop_body', reserved_symbols),
-        body=node_body)
+        iter_=original_node.iter,
+        iterate=original_node.target,
+        body_name=body_name,
+        body=loop_body)
+
+  def _create_for_loop_without_state(self, original_node, body_name, loop_body):
+    """Create node for for-loop with loop-carried state, no early stopping."""
+    template = """
+      def body_name(loop_vars):
+        # Workaround for PEP-3113
+        iterate = loop_vars
+        body
+        return ()
+      ag__.for_stmt(iter_, None, body_name, ())
+    """
+    return templates.replace(
+        template,
+        iter_=original_node.iter,
+        iterate=original_node.target,
+        body_name=body_name,
+        body=loop_body)
+
+  def visit_For(self, node):
+    self.generic_visit(node)
+
+    loop_state, reserved_symbols, possibly_undef = self._get_loop_state(node)
+    loop_state, state_ssf, state_ast_tuple, ssf_map = self._state_constructs(
+        loop_state, reserved_symbols)
+    node_body = ast_util.rename_symbols(node.body, ssf_map)
+    body_name = self.ctx.namer.new_symbol('loop_body', reserved_symbols)
+
+    has_extra_test = anno.hasanno(node, 'extra_test')
+    if loop_state:
+      if has_extra_test:
+        # Loop with early stopping (e.g. break or return)
+        extra_test = anno.getanno(node, 'extra_test')
+        extra_test = ast_util.rename_symbols(extra_test, ssf_map)
+        extra_test_name = self.ctx.namer.new_symbol('extra_test',
+                                                    reserved_symbols)
+        node = self._create_for_loop_early_stopping(
+            loop_state, state_ssf, state_ast_tuple, node, extra_test_name,
+            extra_test, body_name, node_body)
+      else:
+        # Loop with loop-carried state and no early stopping
+        node = self._create_for_loop_with_state(
+            loop_state, state_ssf, state_ast_tuple, node, body_name, node_body)
+    else:
+      # Loop with no loop-carried state and no early stopping
+      assert not has_extra_test, ('Early stoppiong (e.g. break and/or return) '
+                                  'should create state variables.')
+      node = self._create_for_loop_without_state(node, body_name, node_body)
 
-    return node
+    undefined_assigns = self._create_undefined_assigns(possibly_undef)
+    return undefined_assigns + node
 
 
 def transform(node, ctx):
diff --git a/tensorflow/python/autograph/converters/control_flow_test.py b/tensorflow/python/autograph/converters/control_flow_test.py
index 034fcbe3865cdd78cdaad19631da98359cb4690d..37ea4c2ae8e3a76954360d79b8a97f5c1ce362ae 100644
--- a/tensorflow/python/autograph/converters/control_flow_test.py
+++ b/tensorflow/python/autograph/converters/control_flow_test.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 from tensorflow.python.autograph.converters import control_flow
 from tensorflow.python.autograph.core import converter_testing
-from tensorflow.python.autograph.pyct import transformer
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
@@ -34,8 +33,7 @@ class ControlFlowTest(converter_testing.TestCase):
       inputs = (inputs,)
     with self.converted(test_fn, control_flow, {},
                         constant_op.constant) as result:
-      with self.cached_session() as sess:
-        self.assertEqual(sess.run(result.test_fn(*inputs)), expected)
+      self.assertEqual(self.evaluate(result.test_fn(*inputs)), expected)
 
   @test_util.run_deprecated_v1
   def test_while_basic(self):
@@ -79,16 +77,32 @@ class ControlFlowTest(converter_testing.TestCase):
 
     self.assertTransformedResult(test_fn, constant_op.constant(5), 0)
 
-  def test_while_variable_defined_in_body(self):
-    def bad_while_loop(n):
+  @test_util.run_deprecated_v1
+  def test_while_dispatches_by_cond_only(self):
+
+    class TensorIncompatibleNumeric(object):
+      """Works in arithmetic expression, but errors out with TF ops."""
+
+      def __init__(self, val):
+        self.val = val
+
+      def __add__(self, other):
+        return TensorIncompatibleNumeric(self.val + other)
+
+    def test_fn(n, s):
       while n > 0:
         n -= 1
-        s = n
+        s += n
       return s
 
-    node, ctx = self.prepare(bad_while_loop, {})
-    with self.assertRaises(NameError):
-      control_flow.transform(node, ctx)
+    self.assertTransformedResult(test_fn, (constant_op.constant(5), 0), 10)
+    with self.converted(test_fn, control_flow, {}) as result:
+      # n alone controls the staging. When the loop is not staged, Python
+      # knows how to add the two objects. But when staged, tf.while_loop will
+      # not know how to deal with the TensorIncompatibleNumeric object.
+      self.assertEqual(result.test_fn(5, TensorIncompatibleNumeric(0)).val, 10)
+      with self.assertRaises(TypeError):
+        result.test_fn(constant_op.constant(5), TensorIncompatibleNumeric(0))
 
   @test_util.run_deprecated_v1
   def test_if_basic(self):
@@ -124,11 +138,10 @@ class ControlFlowTest(converter_testing.TestCase):
       return obj
 
     with self.converted(test_fn, control_flow, {}) as result:
-      with self.cached_session() as sess:
-        res_obj = result.test_fn(constant_op.constant(1), TestClass(0, 0))
-        self.assertEqual(sess.run((res_obj.a, res_obj.b)), (-1, 0))
-        res_obj = result.test_fn(constant_op.constant(-1), TestClass(0, 0))
-        self.assertEqual(sess.run((res_obj.a, res_obj.b)), (0, -2))
+      res_obj = result.test_fn(constant_op.constant(1), TestClass(0, 0))
+      self.assertEqual(self.evaluate((res_obj.a, res_obj.b)), (-1, 0))
+      res_obj = result.test_fn(constant_op.constant(-1), TestClass(0, 0))
+      self.assertEqual(self.evaluate((res_obj.a, res_obj.b)), (0, -2))
 
   @test_util.run_deprecated_v1
   def test_if_single_output(self):
@@ -176,17 +189,6 @@ class ControlFlowTest(converter_testing.TestCase):
     self.assertTransformedResult(test_fn, constant_op.constant(1), 1)
     self.assertTransformedResult(test_fn, constant_op.constant(-1), -1)
 
-  def test_if_imbalanced_outputs(self):
-
-    def test_fn(n):
-      if n > 0:
-        b = 4
-      return b
-
-    node, ctx = self.prepare(test_fn, {})
-    with self.assertRaises(transformer.AutographParseError):
-      control_flow.transform(node, ctx)
-
   @test_util.run_deprecated_v1
   def test_simple_for(self):
 
@@ -237,16 +239,6 @@ class ControlFlowTest(converter_testing.TestCase):
       self.assertEqual(result.test_fn(5), 10)
       self.assertEqual(eval_count[0], 1)
 
-  def test_for_variable_defined_in_body(self):
-    def bad_for_loop(n):
-      for i in range(n):
-        s = i
-      return s
-
-    node, ctx = self.prepare(bad_for_loop, {})
-    with self.assertRaises(NameError):
-      control_flow.transform(node, ctx)
-
   @test_util.run_deprecated_v1
   def test_for_tuple_unpacking(self):
     def test_fn(x_list):
@@ -256,5 +248,7 @@ class ControlFlowTest(converter_testing.TestCase):
       return z
 
     self.assertTransformedResult(test_fn, [3, 3], 7)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/autograph/converters/decorators.py b/tensorflow/python/autograph/converters/decorators.py
deleted file mode 100644
index f0ea51277468499937089c89eedb344149cb1ae7..0000000000000000000000000000000000000000
--- a/tensorflow/python/autograph/converters/decorators.py
+++ /dev/null
@@ -1,108 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Handles decorators.
-
-Note: this module only deals with functions whose decorators are still recorded
-in the AST. This does not always happen. See the unit test for an example.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import gast
-
-from tensorflow.python.autograph.core import converter
-from tensorflow.python.autograph.pyct import anno
-from tensorflow.python.util import tf_inspect
-
-
-class DecoratorsTransformer(converter.Base):
-  """Converts or removes decorators."""
-
-  def visit_FunctionDef(self, node):
-    self.generic_visit(node)
-    kept_decorators = []
-    for dec in node.decorator_list:
-      if isinstance(dec, gast.Call):
-        dec_func = dec.func
-      else:
-        dec_func = dec
-
-      # Special cases.
-      # TODO(mdan): Is there any way we can treat these more generically?
-      # We may want to forego using decorators altogether if we can't
-      # properly support them.
-      if isinstance(dec_func, gast.Name) and dec_func.id in ('classmethod',):
-        # Assumption: decorators are only visible in the AST when converting
-        # a function inline (via another decorator).
-        # In that case, the converted function is no longer part of the
-        # original object that it was declared into.
-        # This is currently verified by tests.
-        continue
-
-      if not anno.hasanno(dec_func, 'live_val'):
-        raise ValueError('could not resolve the decorator "@%s"' %
-                         (anno.getanno(dec_func, anno.Basic.QN)))
-
-      original_dec = anno.getanno(dec_func, anno.Basic.QN)
-      dec_value = anno.getanno(dec_func, 'live_val')
-
-      if dec_value in self.ctx.program.options.strip_decorators:
-        continue
-
-      # When using foo.bar.baz, we only really need to grab foo and import
-      # that.
-      dec_support_node = dec_func
-      while isinstance(dec_support_node, gast.Attribute):
-        dec_support_node = dec_support_node.value
-
-      if not anno.hasanno(dec_support_node, 'live_val'):
-        raise ValueError(
-            'could not resolve symbol "%s" when looking up decorator "%s"' %
-            (anno.getanno(dec_support_node, anno.Basic.QN), original_dec))
-
-      dec_support = anno.getanno(dec_support_node, 'live_val')
-      # The tuple contains:
-      #  * the AST that represents the decorator
-      #  * the entity supporting the decorator (i.e., what we need to import)
-      #  * the name of the module that needs to be imported for this decorator
-      #    to properly resolve.
-      # Examples:
-      #  for foo.bar, the tuple is (<ast>, <module foo>, 'foo')
-      #  for baz, the tuple is (<ast>, <module baz.__module__>, 'baz')
-      kept_decorators.append((dec, dec_support,
-                              anno.getanno(dec_support_node, anno.Basic.QN)))
-
-    for _, dec_support, name in kept_decorators:
-      if tf_inspect.ismodule(dec_support):
-        self.ctx.program.additional_imports.add(
-            'import %s as %s' % (dec_support.__name__, name))
-      else:
-        if dec_support.__module__ == '__main__':
-          raise ValueError(
-              'decorator "%s" was not allowed because it is declared '
-              'in the module "%s". To fix this, declare it in a separate '
-              'module that we can import it from.' % (dec_support,
-                                                      dec_support.__module__))
-        self.ctx.program.additional_imports.add(
-            'from %s import %s' % (dec_support.__module__, name))
-
-    node.decorator_list = [dec for dec, _, _ in kept_decorators]
-    return node
-
-
-def transform(node, ctx):
-  return DecoratorsTransformer(ctx).visit(node)
diff --git a/tensorflow/python/autograph/converters/decorators_test.py b/tensorflow/python/autograph/converters/decorators_test.py
deleted file mode 100644
index abd76849d6eafd92c2d7fa540a30d699e3a57e52..0000000000000000000000000000000000000000
--- a/tensorflow/python/autograph/converters/decorators_test.py
+++ /dev/null
@@ -1,149 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for decorators module."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from functools import wraps
-import imp
-
-from tensorflow.python import autograph
-from tensorflow.python.autograph.converters import decorators
-from tensorflow.python.autograph.core import converter_testing
-from tensorflow.python.autograph.pyct import compiler
-from tensorflow.python.autograph.pyct import transformer
-from tensorflow.python.platform import test
-
-
-# The Python parser only briefly captures decorators into the AST.
-# The interpreter desugars them on load, and the decorated function loses any
-# trace of the decorator (which is normally what you would expect, since
-# they are meant to be transparent).
-# However, decorators are still visible when you analyze the function
-# from inside a decorator, before it was applied - as is the case
-# with our conversion decorators.
-
-
-def simple_decorator(f):
-  return lambda a: f(a) + 1
-
-
-def self_transform_decorator(transform):
-
-  def decorator(f):
-    @wraps(f)
-    def wrapper(*args):
-      # This removing wrapper is defined in the test below. This setup is so
-      # intricate in order to simulate how we use the transformer in practice.
-      transformed_f = transform(f, (self_transform_decorator,))
-      return transformed_f(*args) + 1
-    return wrapper
-  return decorator
-
-
-class DecoratorsTest(converter_testing.TestCase):
-
-  def _transform(self, f, strip_decorators):
-    namespace = {
-        'self_transform_decorator': self_transform_decorator,
-        'simple_decorator': simple_decorator,
-        'converter_testing': converter_testing,
-    }
-    node, ctx = self.prepare(
-        f, namespace, recursive=False, strip_decorators=strip_decorators)
-    node = decorators.transform(node, ctx)
-    import_line = '\n'.join(ctx.program.additional_imports)
-    result, _ = compiler.ast_to_object(node, source_prefix=import_line)
-    return getattr(result, f.__name__)
-
-  def test_noop(self):
-
-    def test_fn(a):
-      return a
-
-    with self.converted(test_fn, decorators, {}) as result:
-      self.assertEqual(1, result.test_fn(1))
-
-  def test_function(self):
-
-    @self_transform_decorator(self._transform)
-    def test_fn(a):
-      return a
-
-    # 2 = 1 (a) + 1 (decorator applied exactly once)
-    self.assertEqual(2, test_fn(1))
-
-  def test_method(self):
-
-    class TestClass(object):
-
-      @self_transform_decorator(self._transform)
-      def test_fn(self, a):
-        return a
-
-    # 2 = 1 (a) + 1 (decorator applied exactly once)
-    self.assertEqual(2, TestClass().test_fn(1))
-
-  def test_multiple_decorators(self):
-
-    class TestClass(object):
-
-      # Note that reversing the order of this two doesn't work.
-      @classmethod
-      @self_transform_decorator(self._transform)
-      def test_fn(cls, a):
-        return a
-
-    # 2 = 1 (a) + 1 (decorator applied exactly once)
-    self.assertEqual(2, TestClass.test_fn(1))
-
-  def test_nested_decorators_local(self):
-
-    @self_transform_decorator(self._transform)
-    def test_fn(a):
-      @simple_decorator
-      def inner_fn(b):
-        return b + 11
-      return inner_fn(a)
-
-    # Expected to fail because simple_decorator could not be imported.
-    with self.assertRaises(transformer.AutographParseError):
-      test_fn(1)
-
-  def test_nested_decorators_imported(self):
-
-    @self_transform_decorator(self._transform)
-    def test_fn(a):
-
-      @converter_testing.imported_decorator
-      def inner_fn(b):
-        return b + 11
-
-      return inner_fn(a)
-
-    # Work around TensorFlow's symbol suppression mechanism that causes core to
-    # be invisible in the generated code.
-    core_mod = imp.new_module('core')
-    core_mod.converter_testing = converter_testing
-    autograph.core = core_mod
-
-    # 14 = 1 (a) + 1 (simple_decorator) + 11 (inner_fn)
-    self.assertEqual(14, test_fn(1))
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/python/autograph/converters/logical_expressions.py b/tensorflow/python/autograph/converters/logical_expressions.py
index dfcaafdc9eba61bcb3c03432eadf309484d48dee..ea9740a22e1c065f04401fa3f15e8086349eb513 100644
--- a/tensorflow/python/autograph/converters/logical_expressions.py
+++ b/tensorflow/python/autograph/converters/logical_expressions.py
@@ -38,29 +38,29 @@ from tensorflow.python.autograph.pyct import templates
 SAFE_BOOLEAN_OPERAND = 'SAFE_BOOLEAN_OPERAND'
 
 
+OP_MAPPING = {
+    gast.And: 'ag__.and_',
+    gast.Eq: 'ag__.eq',
+    gast.NotEq: 'ag__.not_eq',
+    gast.Lt: 'ag__.lt',
+    gast.LtE: 'ag__.lt_e',
+    gast.Gt: 'ag__.gt',
+    gast.GtE: 'ag__.gt_e',
+    gast.Is: 'ag__.is_',
+    gast.IsNot: 'ag__.is_not',
+    gast.In: 'ag__.in_',
+    gast.Not: 'ag__.not_',
+    gast.NotIn: 'ag__.not_in',
+    gast.Or: 'ag__.or_',
+    gast.UAdd: 'ag__.u_add',
+    gast.USub: 'ag__.u_sub',
+    gast.Invert: 'ag__.invert',
+}
+
+
 class LogicalExpressionTransformer(converter.Base):
   """Converts logical expressions to corresponding TF calls."""
 
-  def __init__(self, ctx):
-    super(LogicalExpressionTransformer, self).__init__(ctx)
-    # TODO(mdan): For completeness and consistency, overload everything.
-    self.op_mapping = {
-        gast.And: 'ag__.and_',
-        gast.Eq: 'ag__.eq',
-        gast.NotEq: 'ag__.not_eq',
-        gast.Lt: 'ag__.lt',
-        gast.LtE: 'ag__.lt_e',
-        gast.Gt: 'ag__.gt',
-        gast.GtE: 'ag__.gt_e',
-        gast.Is: 'ag__.is_',
-        gast.IsNot: 'ag__.is_not',
-        gast.In: 'ag__.in_',
-        gast.Not: 'ag__.not_',
-        gast.NotIn: 'ag__.not_in',
-        gast.Or: 'ag__.or_',
-        gast.USub: 'ag__.u_sub',
-    }
-
   def _expect_simple_symbol(self, operand):
     if isinstance(operand, gast.Name):
       return
@@ -74,11 +74,11 @@ class LogicalExpressionTransformer(converter.Base):
 
   def _has_matching_func(self, operator):
     op_type = type(operator)
-    return op_type in self.op_mapping
+    return op_type in OP_MAPPING
 
   def _matching_func(self, operator):
     op_type = type(operator)
-    return self.op_mapping[op_type]
+    return OP_MAPPING[op_type]
 
   def _as_function(self, func_name, args, args_as_lambda=False):
     if args_as_lambda:
diff --git a/tensorflow/python/autograph/converters/logical_expressions_test.py b/tensorflow/python/autograph/converters/logical_expressions_test.py
index 687412750e0b2d3e7db275f6c25e5923ffaaa831..67ccd1fb47955053e0896df07e20903d4406370b 100644
--- a/tensorflow/python/autograph/converters/logical_expressions_test.py
+++ b/tensorflow/python/autograph/converters/logical_expressions_test.py
@@ -77,6 +77,13 @@ class LogicalExpressionTest(converter_testing.TestCase):
     with self.converted(test_fn, logical_expressions, {}) as result:
       self.assertTrue(result.test_fn('a', ('a',)))
 
+  def test_unary_ops(self):
+    def test_fn(a):
+      return ~a, -a, +a
+
+    with self.converted(test_fn, logical_expressions, {}) as result:
+      self.assertEqual(result.test_fn(1), (-2, -1, 1))
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/autograph/converters/return_statements.py b/tensorflow/python/autograph/converters/return_statements.py
index 496c99e3b5247c174f8a74e9b3f23517ddc649f3..3173e676e5dc383f399ca89cdc7814406afb28eb 100644
--- a/tensorflow/python/autograph/converters/return_statements.py
+++ b/tensorflow/python/autograph/converters/return_statements.py
@@ -22,310 +22,391 @@ import gast
 
 from tensorflow.python.autograph.core import converter
 from tensorflow.python.autograph.pyct import anno
-from tensorflow.python.autograph.pyct import ast_util
+from tensorflow.python.autograph.pyct import parser
 from tensorflow.python.autograph.pyct import templates
 from tensorflow.python.autograph.pyct.static_analysis.annos import NodeAnno
 
 
-# TODO(mdan): Move this logic into transformer_base.
-class BodyVisitor(converter.Base):
-  """Walks breadth- or depth-first the list-of-nodes bodies of AST nodes."""
+BODY_DEFINITELY_RETURNS = 'BODY_DEFINITELY_RETURNS'
+ORELSE_DEFINITELY_RETURNS = 'ORELSE_DEFINITELY_RETURNS'
+STMT_DEFINITELY_RETURNS = 'STMT_DEFINITELY_RETURNS'
 
-  def __init__(self, ctx, depth_first=False):
-    super(BodyVisitor, self).__init__(ctx)
-    self.depth_first = depth_first
-    self.changes_made = False
 
-  def visit_nodelist(self, nodelist):
-    for node in nodelist:
-      if isinstance(node, list):
-        node = self.visit_nodelist(node)
+class _Block(object):
+
+  def __init__(self):
+    self.definitely_returns = False
+
+
+class ConditionalReturnRewriter(converter.Base):
+  """Rewrites a a pattern where it's unbovious that all paths return a value.
+
+  This rewrite allows avoiding intermediate None return values.
+
+  The following pattern:
+
+      if cond:
+        <block 1>
+        return
       else:
-        node = self.generic_visit(node)
-    return nodelist
+        <block 2>
+      <block 3>
 
-  def visit_If(self, node):
-    if self.depth_first:
-      node = self.generic_visit(node)
-    node.body = self.visit_nodelist(node.body)
-    node.orelse = self.visit_nodelist(node.orelse)
-    if not self.depth_first:
-      node = self.generic_visit(node)
-    return node
+  is converted to:
 
-  def visit_For(self, node):
-    if self.depth_first:
-      node = self.generic_visit(node)
-    node.body = self.visit_nodelist(node.body)
-    node.orelse = self.visit_nodelist(node.orelse)
-    if not self.depth_first:
-      node = self.generic_visit(node)
+      if cond:
+        <block 1>
+        return
+      else:
+        <block 2>
+        <block 3>
+
+  and vice-versa (if the else returns, subsequent statements are moved under the
+  if branch).
+  """
+
+  def visit_Return(self, node):
+    self.state[_Block].definitely_returns = True
     return node
 
+  def _postprocess_statement(self, node):
+    # If the node definitely returns (e.g. it's a with statement with a
+    # return stateent in it), then the current block also definitely returns.
+    if anno.getanno(node, STMT_DEFINITELY_RETURNS, default=False):
+      self.state[_Block].definitely_returns = True
+
+    # The special case: collapse a typical conditional return pattern into
+    # a single conditional with possibly returns on both branches. This
+    # reduces the use of None return values, which don't work with TF
+    # conditionals.
+    if (isinstance(node, gast.If)
+        and anno.getanno(node, BODY_DEFINITELY_RETURNS, default=False)):
+      return node, node.orelse
+    elif (isinstance(node, gast.If)
+          and anno.getanno(node, ORELSE_DEFINITELY_RETURNS, default=False)):
+      return node, node.body
+
+    return node, None
+
+  def _visit_statement_block(self, node, nodes):
+    self.state[_Block].enter()
+    new_nodes = self.visit_block(nodes, after_visit=self._postprocess_statement)
+    block_definitely_returns = self.state[_Block].definitely_returns
+    self.state[_Block].exit()
+    return new_nodes, block_definitely_returns
+
   def visit_While(self, node):
-    if self.depth_first:
-      node = self.generic_visit(node)
-    node.body = self.visit_nodelist(node.body)
-    node.orelse = self.visit_nodelist(node.orelse)
-    if not self.depth_first:
-      node = self.generic_visit(node)
+    node.test = self.visit(node.test)
+    node.body, _ = self._visit_statement_block(node, node.body)
+    node.orelse, _ = self._visit_statement_block(node, node.orelse)
     return node
 
-  def visit_Try(self, node):
-    if self.depth_first:
-      node = self.generic_visit(node)
-    node.body = self.visit_nodelist(node.body)
-    node.orelse = self.visit_nodelist(node.orelse)
-    node.finalbody = self.visit_nodelist(node.finalbody)
-    for i in range(len(node.handlers)):
-      node.handlers[i].body = self.visit_nodelist(node.handlers[i].body)
-    if not self.depth_first:
-      node = self.generic_visit(node)
+  def visit_For(self, node):
+    node.iter = self.visit(node.iter)
+    node.target = self.visit(node.target)
+    node.body, _ = self._visit_statement_block(node, node.body)
+    node.orelse, _ = self._visit_statement_block(node, node.orelse)
     return node
 
   def visit_With(self, node):
-    if self.depth_first:
-      node = self.generic_visit(node)
-    node.body = self.visit_nodelist(node.body)
-    if not self.depth_first:
-      node = self.generic_visit(node)
+    node.items = self.visit_block(node.items)
+    node.body, definitely_returns = self._visit_statement_block(node, node.body)
+    if definitely_returns:
+      anno.setanno(node, STMT_DEFINITELY_RETURNS, True)
     return node
 
-  def visit_FunctionDef(self, node):
-    if self.depth_first:
-      node = self.generic_visit(node)
-    node.body = self.visit_nodelist(node.body)
-    self.generic_visit(node)
-    if not self.depth_first:
-      node = self.generic_visit(node)
+  def visit_Try(self, node):
+    # We could decide whether a 'try' DEFINITELY_RETURNS based on its components
+    # It is not clear whether we want to do anything with this given
+    # a 'try' is likely to throw an exception in some circumstances.
+    node.body, _ = self._visit_statement_block(node, node.body)
+    node.orelse, _ = self._visit_statement_block(node, node.orelse)
+    node.finalbody, _ = self._visit_statement_block(node, node.finalbody)
+    node.handlers = self.visit_block(node.handlers)
     return node
 
-
-class FoldElse(BodyVisitor):
-
-  def visit_nodelist(self, nodelist):
-    for i in range(len(nodelist)):
-      node = nodelist[i]
-      if isinstance(node, gast.If):
-        true_branch_returns = isinstance(node.body[-1], gast.Return)
-        false_branch_returns = len(node.orelse) and isinstance(
-            node.orelse[-1], gast.Return)
-        # If the last node in the if body is a return,
-        # then every line after this if statement effectively
-        # belongs in the else.
-        if true_branch_returns and not false_branch_returns:
-          for j in range(i + 1, len(nodelist)):
-            nodelist[i].orelse.append(ast_util.copy_clean(nodelist[j]))
-          if nodelist[i + 1:]:
-            self.changes_made = True
-          return nodelist[:i + 1]
-        elif not true_branch_returns and false_branch_returns:
-          for j in range(i + 1, len(nodelist)):
-            nodelist[i].body.append(ast_util.copy_clean(nodelist[j]))
-          if nodelist[i + 1:]:
-            self.changes_made = True
-          return nodelist[:i + 1]
-        elif true_branch_returns and false_branch_returns:
-          if nodelist[i + 1:]:
-            raise ValueError(
-                'Unreachable code after conditional where both branches return.'
-            )
-          return nodelist
-      elif isinstance(node, gast.Return) and nodelist[i + 1:]:
-        raise ValueError(
-            'Cannot have statements after a return in the same basic block')
-    return nodelist
-
-
-def contains_return(node):
-  for n in gast.walk(node):
-    if isinstance(n, gast.Return):
-      return True
-  return False
-
-
-class LiftReturn(converter.Base):
-  """Move return statements out of If and With blocks."""
-
-  def __init__(self, ctx):
-    super(LiftReturn, self).__init__(ctx)
-    self.changes_made = False
-    self.common_return_name = None
+  def visit_ExceptHandler(self, node):
+    # To determine whether `try` DEFINITELY_RETURNS we need to revisit this.
+    node.body, _ = self._visit_statement_block(node, node.body)
+    return node
 
   def visit_If(self, node):
-    # Depth-first traversal of if statements
-    node = self.generic_visit(node)
-
-    # We check if both branches return, and if so, lift the return out of the
-    # conditional. We don't enforce that the true and false branches either
-    # both return or both do not, because FoldElse might move a return
-    # into a branch after this transform completes. FoldElse and LiftReturn
-    # are alternately run until the code reaches a fixed point.
-    true_branch_returns = isinstance(node.body[-1], gast.Return)
-    false_branch_returns = len(node.orelse) and isinstance(
-        node.orelse[-1], gast.Return)
-    if true_branch_returns and false_branch_returns:
-      node.body[-1] = templates.replace(
-          'a = b', a=self.common_return_name, b=node.body[-1].value)[0]
-      node.orelse[-1] = templates.replace(
-          'a = b', a=self.common_return_name, b=node.orelse[-1].value)[0]
-      return_node = templates.replace('return a', a=self.common_return_name)[0]
-      self.changes_made = True
-      return [node, return_node]
-    else:
-      return node
+    node.test = self.visit(node.test)
 
-  def visit_With(self, node):
-    # Depth-first traversal of syntax
-    node = self.generic_visit(node)
-
-    # If the with statement returns, lift the return
-    if isinstance(node.body[-1], gast.Return):
-      node.body[-1] = templates.replace(
-          'a = b', a=self.common_return_name, b=node.body[-1].value)[0]
-      return_node = templates.replace('return a', a=self.common_return_name)[0]
-      node = self.generic_visit(node)
-      self.changes_made = True
-      return [node, return_node]
-    else:
-      return node
+    node.body, body_definitely_returns = self._visit_statement_block(
+        node, node.body)
+    if body_definitely_returns:
+      anno.setanno(node, BODY_DEFINITELY_RETURNS, True)
+
+    node.orelse, orelse_definitely_returns = self._visit_statement_block(
+        node, node.orelse)
+    if orelse_definitely_returns:
+      anno.setanno(node, ORELSE_DEFINITELY_RETURNS, True)
+
+    if body_definitely_returns and orelse_definitely_returns:
+      self.state[_Block].definitely_returns = True
+
+    return node
 
   def visit_FunctionDef(self, node):
-    # Ensure we're doing depth-first traversal
-    last_return_name = self.common_return_name
-    body_scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
-    referenced_names = body_scope.referenced
-    self.common_return_name = self.ctx.namer.new_symbol('return_',
-                                                        referenced_names)
-    node = self.generic_visit(node)
-    self.common_return_name = last_return_name
+    node.args = self.visit(node.args)
+    node.body, _ = self._visit_statement_block(node, node.body)
     return node
 
 
-class DetectReturnInUnsupportedControlFlow(gast.NodeVisitor):
-  """Throws an error if code returns inside loops or try/except."""
+class _Return(object):
+
+  def __init__(self):
+    self.used = False
+    self.create_guard = False
+    self.guard_created = False
+
+  def __repr__(self):
+    return 'used: {}'.format(
+        self.used)
 
-  # First, throw an error if we detect a return statement in a loop.
-  # TODO(alexbw): we need to learn to handle returns inside a loop,
-  # but don't currently have the TF constructs to do so (need something
-  # that looks vaguely like a goto).
+
+class _Function(object):
 
   def __init__(self):
-    self.cant_return = False
-    self.function_level = 0
-    super(DetectReturnInUnsupportedControlFlow, self).__init__()
+    self.do_return_var_name = None
+    self.retval_var_name = None
 
-  def visit_While(self, node):
-    self.cant_return = True
-    self.generic_visit(node)
-    self.cant_return = False
+  def __repr__(self):
+    return 'return control: {}, return value: {}'.format(
+        self.do_return_var_name, self.retval_var_name)
 
-  def visit_For(self, node):
-    self.cant_return = True
-    self.generic_visit(node)
-    self.cant_return = False
 
-  def visit_Try(self, node):
-    self.cant_return = True
-    self.generic_visit(node)
-    self.cant_return = False
+class ReturnStatementsTransformer(converter.Base):
+  """Lowers return statements into variables and conditionals.
 
-  def visit_FunctionDef(self, node):
-    if not self.function_level:
-      self.function_level += 1
-      self.generic_visit(node)
-      self.function_level -= 1
+  Specifically, the following pattern:
 
-  def visit_Return(self, node):
-    if self.cant_return:
-      raise ValueError(
-          '`return` statements are not supported in loops. '
-          'Try assigning to a variable in the while loop, and returning '
-          'outside of the loop')
+      <block 1>
+      return val
+      <block 2>
 
+  is converted to:
 
-class DetectReturnInConditional(gast.NodeVisitor):
-  """Assert that no return statements are present in conditionals."""
+      do_return = False
+      retval = None
 
-  def __init__(self):
-    self.cant_return = False
-    self.function_level = 0
-    super(DetectReturnInConditional, self).__init__()
+      <block 1>
 
-  def visit_If(self, node):
-    self.cant_return = True
-    self.generic_visit(node)
-    self.cant_return = False
+      do_return = True
+      retval = val
 
-  def visit_FunctionDef(self, node):
-    if not self.function_level:
-      self.function_level += 1
-      self.generic_visit(node)
-      self.function_level -= 1
+      if not do_return:
+        <block 2>
 
-  def visit_Return(self, node):
-    if self.cant_return:
-      raise ValueError(
-          'After transforms, a conditional contained a `return `statement, '
-          'which is not allowed. This is a bug, and should not happen.')
+      return retval
 
+  The conversion adjusts loops as well:
 
-class DetectReturnInFunctionDef(gast.NodeVisitor):
+      <block 1>
+      while cond:
+        <block 2>
+        return retval
 
-  def visit_FunctionDef(self, node):
-    self.generic_visit(node)
-    if not contains_return(node):
-      raise ValueError(
-          'Each function definition should contain at least one return.')
-
-
-def transform(node, ctx):
-  """Ensure a function has only a single return.
-
-  This transforms an AST node with multiple returns successively into containing
-  only a single return node.
-  There are a few restrictions on what we can handle:
-   - An AST being transformed must contain at least one return.
-   - No returns allowed in loops. We have to know the type of the return value,
-   and we currently don't have either a type inference system to discover it,
-   nor do we have a mechanism for late type binding in TensorFlow.
-   - After all transformations are finished, a Return node is not allowed inside
-   control flow. If we were unable to move a return outside of control flow,
-   this is an error.
-
-  Args:
-     node: ast.AST
-     ctx: converter.EntityContext
-
-  Returns:
-     new_node: an AST with a single return value
-
-  Raises:
-    ValueError: if the AST is structured so that we can't perform the
-   transform.
+  is converted to:
+
+      <block 1>
+      while not do_return and cond:
+        <block 2>
+        do_return = True
+        retval = val
   """
-  # Make sure that the function has at least one return statement
-  # TODO(alexbw): turning off this assertion for now --
-  # we need to not require this in e.g. class constructors.
-  # DetectReturnInFunctionDef().visit(node)
 
-  # Make sure there's no returns in unsupported locations (loops, try/except)
-  DetectReturnInUnsupportedControlFlow().visit(node)
+  def __init__(self, ctx, default_to_null_return):
+    super(ReturnStatementsTransformer, self).__init__(ctx)
+    self.default_to_null_return = default_to_null_return
+
+  def visit_Return(self, node):
+    self.state[_Return].used = True
+
+    retval = node.value if node.value else parser.parse_expression('None')
+
+    template = """
+      do_return_var_name = True
+      retval_var_name = retval
+    """
+    node = templates.replace(
+        template,
+        do_return_var_name=self.state[_Function].do_return_var_name,
+        retval_var_name=self.state[_Function].retval_var_name,
+        retval=retval)
+
+    return node
+
+  def _postprocess_statement(self, node):
+    # Example of how the state machine below works:
+    #
+    #   1| stmt           # State: _Return.used = False
+    #    |                # Action: none
+    #   3| return         # State: _Return.used = True,
+    #    |                #        _Return.guard_created = False,
+    #    |                #        _Return.create_guard = False
+    #    |                # Action: _Return.create_guard = True
+    #   4| stmt           # State: _Return.used = True,
+    #    |                #        _Return.guard_created = False,
+    #    |                #        _Return.create_guard = True
+    #    |                # Action: create `if not return_used`,
+    #    |                #         set _Return.guard_created = True
+    #   5| stmt           # State: _Return.used = True,
+    #    |                #        _Return.guard_created = True
+    #    |                # Action: none (will be wrapped under previously
+    #    |                #         created if node)
+    if self.state[_Return].used:
+      if self.state[_Return].guard_created:
+        return node, None
+
+      elif not self.state[_Return].create_guard:
+        self.state[_Return].create_guard = True
+        return node, None
+
+      elif (not self.state[_Return].guard_created and
+            self.state[_Return].create_guard):
+        self.state[_Return].guard_created = True
+        template = """
+          if ag__.not_(do_return_var_name):
+            original_node
+        """
+        cond, = templates.replace(
+            template,
+            do_return_var_name=self.state[_Function].do_return_var_name,
+            original_node=node)
+        return cond, cond.body
+
+      else:
+        assert False, 'should handle all states'
+
+    return node, None
+
+  def _visit_statement_block(self, node, nodes):
+    self.state[_Return].enter()
+    nodes = self.visit_block(nodes, after_visit=self._postprocess_statement)
+    return_used = self.state[_Return].used
+    self.state[_Return].exit()
+    if return_used:
+      self.state[_Return].used = True
+    return nodes
+
+  def visit_While(self, node):
+    node.test = self.visit(node.test)
+
+    # Add the check for return to the loop condition.
+    node.body = self._visit_statement_block(node, node.body)
+    if self.state[_Return].used:
+      node.test = templates.replace_as_expression(
+          'ag__.and_(lambda: ag__.not_(control_var), lambda: test)',
+          test=node.test,
+          control_var=self.state[_Function].do_return_var_name)
+
+    node.orelse = self._visit_statement_block(node, node.orelse)
+    return node
+
+  def visit_For(self, node):
+    node.iter = self.visit(node.iter)
+    node.target = self.visit(node.target)
+
+    # Add the check for return to the loop condition.
+    node.body = self._visit_statement_block(node, node.body)
+    if self.state[_Return].used:
+      extra_test = anno.getanno(node, 'extra_test', default=None)
+      if extra_test is not None:
+        extra_test = templates.replace_as_expression(
+            'ag__.and_(lambda: ag__.not_(control_var), lambda: extra_test)',
+            extra_test=extra_test,
+            control_var=self.state[_Function].do_return_var_name)
+      else:
+        extra_test = templates.replace_as_expression(
+            'ag__.not_(control_var)',
+            control_var=self.state[_Function].do_return_var_name)
+      anno.setanno(node, 'extra_test', extra_test)
+
+    node.orelse = self._visit_statement_block(node, node.orelse)
+    return node
+
+  def visit_With(self, node):
+    node.items = self.visit_block(node.items)
+    node.body = self._visit_statement_block(node, node.body)
+    return node
+
+  def visit_Try(self, node):
+    node.body = self._visit_statement_block(node, node.body)
+    node.orelse = self._visit_statement_block(node, node.orelse)
+    node.finalbody = self._visit_statement_block(node, node.finalbody)
+    node.handlers = self.visit_block(node.handlers)
+    return node
+
+  def visit_ExceptHandler(self, node):
+    node.body = self._visit_statement_block(node, node.body)
+    return node
+
+  def visit_If(self, node):
+    node.test = self.visit(node.test)
+    node.body = self._visit_statement_block(node, node.body)
+    node.orelse = self._visit_statement_block(node, node.orelse)
+    return node
+
+  def visit_FunctionDef(self, node):
+    self.state[_Function].enter()
+    self.state[_Return].enter()
+
+    scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
+    do_return_var_name = self.ctx.namer.new_symbol(
+        'do_return', scope.referenced)
+    retval_var_name = self.ctx.namer.new_symbol('retval_', scope.referenced)
+    self.state[_Function].do_return_var_name = do_return_var_name
+    self.state[_Function].retval_var_name = retval_var_name
+
+    converted_body = self._visit_statement_block(node, node.body)
+
+    # Avoid placing statements before any eventual docstring.
+    # TODO(mdan): Should a docstring even be included in the output?
+    docstring = None
+    if converted_body:
+      if (isinstance(converted_body[0], gast.Expr) and
+          isinstance(converted_body[0].value, gast.Str)):
+        docstring = converted_body[0]
+        converted_body = converted_body[1:]
+
+    if self.state[_Return].used:
+      if self.default_to_null_return:
+        template = """
+          do_return_var_name = False
+          retval_var_name = None
+          body
+          return retval_var_name
+        """
+      else:
+        template = """
+          body
+          return retval_var_name
+        """
+      node.body = templates.replace(
+          template,
+          body=converted_body,
+          do_return_var_name=do_return_var_name,
+          retval_var_name=retval_var_name)
+
+      if docstring:
+        node.body.insert(0, docstring)
+
+    self.state[_Return].exit()
+    self.state[_Function].exit()
+    return node
 
-  while True:
 
-    # Try to lift all returns out of if statements and with blocks
-    lr = LiftReturn(ctx)
-    node = lr.visit(node)
-    changes_made = lr.changes_made
-    fe = FoldElse(ctx)
-    node = fe.visit(node)
-    changes_made = changes_made or fe.changes_made
+def transform(node, ctx, default_to_null_return=True):
+  """Ensure a function has only a single return."""
+  # Note: Technically, these two could be merged into a single walk, but
+  # keeping them separate helps with readability.
 
-    if not changes_made:
-      break
+  node = ConditionalReturnRewriter(ctx).visit(node)
 
-  # Make sure we've scrubbed all returns from conditionals
-  DetectReturnInConditional().visit(node)
+  transformer = ReturnStatementsTransformer(
+      ctx, default_to_null_return=default_to_null_return)
+  node = transformer.visit(node)
 
   return node
diff --git a/tensorflow/python/autograph/converters/return_statements_test.py b/tensorflow/python/autograph/converters/return_statements_test.py
index 762fbc6f607f56ed6d80dd82f59f8c7653c7312a..b2d3d1b92055216d45071fef1fe9f36553a7fb42 100644
--- a/tensorflow/python/autograph/converters/return_statements_test.py
+++ b/tensorflow/python/autograph/converters/return_statements_test.py
@@ -49,17 +49,16 @@ class SingleReturnTest(converter_testing.TestCase):
     self.assertTransformedEquivalent(test_fn, 2)
     self.assertTransformedEquivalent(test_fn, -2)
 
-  def test_missing_orelse(self):
+  def test_missing_else(self):
 
     def test_fn(x):
       if x > 0:
         return x
 
-    node, ctx = self.prepare(test_fn, {})
-    with self.assertRaises(ValueError):
-      return_statements.transform(node, ctx)
+    self.assertTransformedEquivalent(test_fn, 2)
+    self.assertTransformedEquivalent(test_fn, -2)
 
-  def test_missing_orelse_recovrable(self):
+  def test_missing_else_then_default(self):
 
     def test_fn(x):
       if x > 0:
@@ -69,7 +68,7 @@ class SingleReturnTest(converter_testing.TestCase):
     self.assertTransformedEquivalent(test_fn, 2)
     self.assertTransformedEquivalent(test_fn, -2)
 
-  def test_missing_branch_return_recoverable(self):
+  def test_else_only_then_default(self):
 
     def test_fn(x):
       if x < 0:
@@ -136,7 +135,7 @@ class SingleReturnTest(converter_testing.TestCase):
 
     self.assertTransformedEquivalent(test_fn, 2)
 
-  def test_nested_functions(self):
+  def test_nested_function(self):
 
     def test_fn(x):
 
@@ -151,7 +150,7 @@ class SingleReturnTest(converter_testing.TestCase):
     self.assertTransformedEquivalent(test_fn, 2)
     self.assertTransformedEquivalent(test_fn, -2)
 
-  def test_nested_functions_in_control_flow(self):
+  def test_nested_function_in_control_flow(self):
 
     def test_fn(x):
 
@@ -163,16 +162,59 @@ class SingleReturnTest(converter_testing.TestCase):
     self.assertTransformedEquivalent(test_fn, 2)
     self.assertTransformedEquivalent(test_fn, -2)
 
-  def test_loop(self):
+  def test_for_loop(self):
 
-    def test_fn(x):
-      for _ in range(10):
-        return x
-      return x
+    def test_fn(n):
+      for _ in range(n):
+        return 1
 
-    node, ctx = self.prepare(test_fn, {})
-    with self.assertRaises(ValueError):
-      return_statements.transform(node, ctx)
+    self.assertTransformedEquivalent(test_fn, 2)
+    self.assertTransformedEquivalent(test_fn, 0)
+
+  def test_while_loop(self):
+
+    def test_fn(n):
+      i = 0
+      s = 0
+      while i < n:
+        i += 1
+        s += i
+        if s > 4:
+          return s
+      return -1
+
+    self.assertTransformedEquivalent(test_fn, 0)
+    self.assertTransformedEquivalent(test_fn, 2)
+    self.assertTransformedEquivalent(test_fn, 4)
+
+  def test_null_return(self):
+
+    def test_fn(n):
+      if n > 4:
+        return
+      return
+
+    self.assertTransformedEquivalent(test_fn, 4)
+    self.assertTransformedEquivalent(test_fn, 5)
+
+  def test_nested_multiple_withs(self):
+
+    def test_fn(x):
+      v = []
+      while x > 0:
+        x -= 1
+        with ops.name_scope(''):
+          if x % 2 == 0:
+            return v
+        with ops.name_scope(''):
+          v.append(x)
+        v.append(x)
+      return v
+
+    self.assertTransformedEquivalent(test_fn, 0)
+    self.assertTransformedEquivalent(test_fn, 1)
+    self.assertTransformedEquivalent(test_fn, 3)
+    self.assertTransformedEquivalent(test_fn, 4)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/autograph/converters/side_effect_guards.py b/tensorflow/python/autograph/converters/side_effect_guards.py
index 98e29ec8e1b27061371f0328402d8cb45a0f69e7..7e556d95139366cb9747544fbaafe4a4039d82cd 100644
--- a/tensorflow/python/autograph/converters/side_effect_guards.py
+++ b/tensorflow/python/autograph/converters/side_effect_guards.py
@@ -85,11 +85,26 @@ class SideEffectGuardTransformer(converter.Base):
         new_alias_map.update(alias_map)
         alias_map = new_alias_map
         current_dest = new_dest
-    if reindent_requested and not current_dest:
-      # TODO(mdan): There may still be something that could be done.
-      raise ValueError('Unable to insert statement into the computation flow: '
-                       'it is not followed by any computation which '
-                       'the statement could gate.')
+
+    if reindent_requested:
+      no_controls_to_gate = False
+      if not current_dest:
+        no_controls_to_gate = True
+      if len(current_dest) == 1:
+        if ast_util.matches(current_dest[0], 'return'):
+          no_controls_to_gate = True
+        if ast_util.matches(current_dest[0], 'return ()'):
+          no_controls_to_gate = True
+        if ast_util.matches(current_dest[0], 'return []'):
+          no_controls_to_gate = True
+        if ast_util.matches(current_dest[0], 'return {}'):
+          no_controls_to_gate = True
+      if no_controls_to_gate:
+        # TODO(mdan): There may still be something that could be done.
+        raise ValueError(
+            'Unable to insert statement into the computation flow: it is not'
+            ' followed by any computation which the statement could gate.')
+
     return new_nodes
 
   def visit_FunctionDef(self, node):
@@ -110,6 +125,10 @@ class SideEffectGuardTransformer(converter.Base):
     node.orelse = self._visit_and_reindent(node.orelse)
     return node
 
+  # TODO(b/123995141) Remove once ExceptionHandlers are in the CFG
+  def visit_ExceptHandler(self, node):
+    return node
+
   def visit_Expr(self, node):
     self.generic_visit(node)
     if isinstance(node.value, gast.Call):
diff --git a/tensorflow/python/autograph/converters/slices_test.py b/tensorflow/python/autograph/converters/slices_test.py
index bd049afdfcef4c839bcb3d9ba5444d885c3061cc..11e3736d4fb9e8d06d5f02c991ea66410b35b374 100644
--- a/tensorflow/python/autograph/converters/slices_test.py
+++ b/tensorflow/python/autograph/converters/slices_test.py
@@ -23,7 +23,6 @@ from tensorflow.python.autograph.core import converter_testing
 from tensorflow.python.autograph.lang import directives
 from tensorflow.python.autograph.pyct import anno
 from tensorflow.python.autograph.pyct import parser
-from tensorflow.python.autograph.pyct import transformer
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import list_ops
@@ -68,7 +67,7 @@ class SliceTest(converter_testing.TestCase):
     def_.directives[directives.set_element_type] = {
         'dtype': parser.parse_expression('tf.float32')
     }
-    with self.assertRaises(transformer.AutographParseError):
+    with self.assertRaises(ValueError):
       slices.transform(node, ctx)
 
 
diff --git a/tensorflow/python/autograph/core/BUILD b/tensorflow/python/autograph/core/BUILD
index 3ab2e7b1bcacf7efe136b01a10de2bb7728e2d90..fae327e50db57474f2f72fddbc57f04f90ca4f1e 100644
--- a/tensorflow/python/autograph/core/BUILD
+++ b/tensorflow/python/autograph/core/BUILD
@@ -22,6 +22,7 @@ py_library(
         "errors.py",
         "function_wrapping.py",
         "naming.py",
+        "unsupported_features_checker.py",
     ],
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:__subpackages__"],
@@ -30,6 +31,7 @@ py_library(
         "//tensorflow/python/autograph/pyct",
         "//tensorflow/python/autograph/pyct/static_analysis",
         "//tensorflow/python/autograph/utils",
+        "@gast_archive//:gast",
     ],
 )
 
diff --git a/tensorflow/python/autograph/core/config.py b/tensorflow/python/autograph/core/config.py
index 574f819504e526420dd1956359dc974869d735f3..f038704a0741ef31d8701b41566d236f7caff0d8 100644
--- a/tensorflow/python/autograph/core/config.py
+++ b/tensorflow/python/autograph/core/config.py
@@ -28,21 +28,34 @@ PYTHON_LITERALS = {
     'float': float,
 }
 
+
+def _internal_name(name):
+  """This function correctly resolves internal and external names."""
+  reference_name = utils.__name__
+
+  reference_root = 'tensorflow.'
+  # If the TF module is foo.tensorflow, then all other modules
+  # are then assumed to be prefixed by 'foo'.
+
+  if reference_name.startswith(reference_root):
+    return name
+
+  reference_begin = reference_name.find('.' + reference_root)
+  assert reference_begin > 0
+
+  root_prefix = reference_name[:reference_begin]
+  return root_prefix + '.' + name
+
+
 DEFAULT_UNCOMPILED_MODULES = set((
     ('tensorflow',),
-    (utils.__name__,),
-
-    # All of tensorflow's subpackages. Unlike the root tf module, they don't
-    # have well-known names. Not referring to the module directly to avoid
-    # circular imports.
-    (
-        utils.__name__[:-len('.python.autograph.utils')],),
+    (_internal_name('tensorflow'),),
+    # TODO(mdan): Remove once the conversion process is optimized.
+    ('tensorflow_probability',),
+    (_internal_name('tensorflow_probability'),),
 ))
 
-NO_SIDE_EFFECT_CONSTRUCTORS = set(('tensorflow',))
 
-# TODO(mdan): Also allow controlling the generated names.
-# TODO(mdan); Consolidate all internal imports into a single __ag module.
 COMPILED_IMPORT_STATEMENTS = (
     'from __future__ import print_function',
 )
diff --git a/tensorflow/python/autograph/core/converter.py b/tensorflow/python/autograph/core/converter.py
index e88c4674ee24867dec32d62589afdc2e48dfcace..3a084836249fbb2693b44285f8f9baa141230c71 100644
--- a/tensorflow/python/autograph/core/converter.py
+++ b/tensorflow/python/autograph/core/converter.py
@@ -63,8 +63,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from enum import Enum
-from enum import IntEnum
+import weakref
+
+import enum
 
 from tensorflow.python.autograph.core import config
 from tensorflow.python.autograph.core import naming
@@ -83,6 +84,7 @@ from tensorflow.python.autograph.pyct.static_analysis import liveness
 from tensorflow.python.autograph.pyct.static_analysis import reaching_definitions
 from tensorflow.python.autograph.pyct.static_analysis import type_info
 from tensorflow.python.eager import function
+from tensorflow.python.util.tf_export import tf_export
 
 # TODO(mdan): These contexts can be refactored into first class objects.
 # For example, we could define Program and Entity abstractions that hold on
@@ -91,37 +93,59 @@ from tensorflow.python.eager import function
 # TODO(mdan): Add a test specific to this converter.
 
 
-class Verbosity(IntEnum):
-  """Different levels of verbosity for printing errors.
+@tf_export('autograph.experimental.Verbosity')
+class Verbosity(enum.IntEnum):
+  """Represents conversion verbosity levels.
 
   Attributes:
-   * BRIEF: No logging, minimal error messages.
-   * VERBOSE: Detailed logging of generated code, detailed error messages.
+    BRIEF: No logging, minimal error messages.
+    VERBOSE: Detailed logging of generated code, detailed error messages.
   """
+
   BRIEF = 0
   VERBOSE = 1
 
 
-class Feature(Enum):
-  """Constants to use when selecting AutoGraph features."""
+@tf_export('autograph.experimental.Feature')
+class Feature(enum.Enum):
+  """Represents conversion options that can be toggled on or off.
 
-  ALL = 'Enable all features.'
+  Attributes:
+    ALL: Enable all features.
+    AUTO_CONTROL_DEPS: Insert of control dependencies in the generated code.
+    ASSERT_STATEMENTS: Convert Tensor-dependent assert statements to tf.Assert.
+    BUILTIN_FUNCTIONS: Convert builtin functions applied to Tensors to
+      their TF counterparts.
+    ERROR_REWRITING: Rewrite errors that occur in the generated code to
+      indicate the source code to which the failing code corresponds.
+    LISTS: Convert list idioms, like initializers, slices, append, etc.
+    LOGICAL_EXPRESSIONS: Convert data-dependent logical expressions applied to
+      Tensors to their TF counterparts.
+    NAME_SCOPES: Insert name scopes that name ops according to context, like the
+      function they were defined in.
+  """
 
-  AUTO_CONTROL_DEPS = (
-      'Insert of control dependencies in the generated code.')
-  DECORATORS = (
-      'Allow decorators in local functions. Note that special decorators,'
-      ' like ag.convert or tf.function are allowed regardless of this toggle.')
-  ERROR_REWRITING = (
-      'Rewrite errors that occur in the generated code to indicate the source'
-      ' code to which the failing code corresponds.')
-  LISTS = 'Convert list idioms, like initializers, slices, append, etc.'
-  NAME_SCOPES = (
-      'Insert name scopes that name ops according to context, like the'
-      ' function they were defined in.')
+  ALL = 'ALL'
 
-  def __repr__(self):
-    return self.name
+  AUTO_CONTROL_DEPS = 'AUTO_CONTROL_DEPS'
+  ASSERT_STATEMENTS = 'ASSERT_STATEMENTS'
+  BUILTIN_FUNCTIONS = 'BUILTIN_FUNCTIONS'
+  ERROR_REWRITING = 'ERROR_REWRITING'
+  LISTS = 'LISTS'
+  LOGICAL_EXPRESSIONS = 'LOGICAL_EXPRESSIONS'
+  NAME_SCOPES = 'NAME_SCOPES'
+
+  @classmethod
+  def all(cls):
+    """Returns a tuple that enables all options."""
+    return tuple(cls.__members__.values())
+
+  @classmethod
+  def all_but(cls, exclude):
+    """Returns a tuple that enables all but the excluded options."""
+    if not isinstance(exclude, (list, tuple, set)):
+      exclude = (exclude,)
+    return tuple(set(cls.all()) - set(exclude) - {cls.ALL})
 
 
 class ConversionOptions(object):
@@ -157,7 +181,9 @@ class ConversionOptions(object):
     # TODO(mdan): Rename to conversion_recursion_depth?
     self.internal_convert_user_code = internal_convert_user_code
 
-    if isinstance(optional_features, Feature):
+    if optional_features is None:
+      optional_features = ()
+    elif isinstance(optional_features, Feature):
       optional_features = (optional_features,)
     optional_features = frozenset(optional_features)
     self.optional_features = optional_features
@@ -168,19 +194,28 @@ class ConversionOptions(object):
     # TODO(mdan): Revert if function.defun becomes a public symbol.
     return self._strip_decorators + (function.defun,)
 
+  def should_strip(self, decorator):
+    for blacklisted in self.strip_decorators:
+      if blacklisted is decorator:
+        return True
+      if isinstance(blacklisted, weakref.ref):
+        blacklisted_deref = blacklisted()
+        if (blacklisted_deref is not None and blacklisted_deref is decorator):
+          return True
+    return False
+
   def uses(self, feature):
     return (Feature.ALL in self.optional_features or
             feature in self.optional_features)
 
-  def to_ast(self, namespace, internal_convert_user_code=None):
+  def to_ast(self, ctx, internal_convert_user_code=None):
     """Returns a representation of this object as an AST node.
 
     The AST node encodes a constructor that would create an object with the
     same contents.
 
     Args:
-      namespace: Dict[str, Any], the namespace to use when serializing values to
-        names.
+      ctx: EntityContext, the entity with which this AST needs to be consistent.
       internal_convert_user_code: Optional[bool], allows ovrriding the
         corresponding value.
 
@@ -188,7 +223,7 @@ class ConversionOptions(object):
       ast.Node
     """
     template = """
-      constructor_name(
+      ag__.ConversionOptions(
           recursive=recursive_val,
           verbose=verbose_val,
           strip_decorators=strip_decorators_val,
@@ -198,10 +233,16 @@ class ConversionOptions(object):
     """
 
     def as_qualified_name(o):
-      name = inspect_utils.getqualifiedname(namespace, o)
+      name = inspect_utils.getqualifiedname(ctx.info.namespace, o, max_depth=1)
       if not name:
-        raise ValueError('Could not locate entity {} in {}'.format(
-            o, namespace))
+        if isinstance(o, weakref.ref):
+          # `o` might already be a weak reference, if this object was
+          # constructed from code generated by `to_ast` itself.
+          # If so, unpack it.
+          o = o()
+        # TODO(mdan): This needs to account for the symbols defined locally.
+        name = ctx.namer.new_symbol(o.__name__, ())
+        ctx.program.add_symbol(name, weakref.ref(o))
       return name
 
     def list_of_names(values):
@@ -210,17 +251,13 @@ class ConversionOptions(object):
 
     def list_of_features(values):
       return parser.parse_expression('({})'.format(', '.join(
-          'ag__.Feature.{}'.format(v)
-          for v in Feature.__members__
-          if v in values)))
+          'ag__.{}'.format(str(v)) for v in values)))
 
-    if internal_convert_user_code is not None:
+    if internal_convert_user_code is None:
       internal_convert_user_code = self.internal_convert_user_code
 
     expr_ast = templates.replace(
         template,
-        constructor_name=parser.parse_expression(
-            as_qualified_name(ConversionOptions)),
         recursive_val=parser.parse_expression(str(self.recursive)),
         verbose_val=parser.parse_expression(str(int(self.verbose))),
         strip_decorators_val=list_of_names(self._strip_decorators),
@@ -254,6 +291,11 @@ class ProgramContext(object):
     required_imports: str, containing an import statement on each line. These
       are all the imports necessary for the compiled code to run, in addition to
       the closures of each entity, which are attached dynamically.
+    partial_types: Tuple[Type], deprecated.
+    conversion_order: Tuple[Any], deprecated.
+    additional_symbols: Dict[str, Any], a map of new symbols that have been
+      created under this context, and need to be added to the namespace of the
+      generated code.
   """
 
   def __init__(
@@ -272,6 +314,7 @@ class ProgramContext(object):
     self.dependency_cache = {}
     self.additional_imports = set()
     self.name_map = {}
+    self.additional_symbols = {}
 
   @property
   def required_imports(self):
@@ -314,12 +357,17 @@ class ProgramContext(object):
       else:
         self.name_map[o] = name
 
+  def add_symbol(self, name, value):
+    if name in self.additional_symbols:
+      assert self.additional_symbols[name] is value
+    self.additional_symbols[name] = value
+
   def add_to_cache(self, original_entity, converted_ast):
     self.conversion_order.append(original_entity)
     self.dependency_cache[original_entity] = converted_ast
 
 
-class EntityContext(object):
+class EntityContext(transformer.Context):
   """Tracks the conversion of a single entity.
 
   This object is mutable, and is updated during conversion. Not thread safe.
@@ -331,8 +379,8 @@ class EntityContext(object):
   """
 
   def __init__(self, namer, entity_info, program_ctx):
+    super(EntityContext, self).__init__(entity_info)
     self.namer = namer
-    self.info = entity_info
     self.program = program_ctx
 
 
@@ -344,8 +392,7 @@ class Base(transformer.Base):
   """
 
   def __init__(self, ctx):
-    super(Base, self).__init__(ctx.info)
-    self.ctx = ctx  # Keeping this short because it's used frequently.
+    super(Base, self).__init__(ctx)
 
     self._used = False
     self._ast_depth = 0
@@ -419,7 +466,7 @@ class AnnotatedDef(reaching_definitions.Definition):
     self.directives = {}
 
 
-class AgAnno(Enum):
+class AgAnno(enum.Enum):
   """Annotation labels specific to AutoGraph. See anno.py."""
 
   DIRECTIVES = 'User directives associated with the annotated statement.'
@@ -445,13 +492,13 @@ def standard_analysis(node, context, is_initial=False):
   # TODO(mdan): Don't return a node because it's modified by reference.
   graphs = cfg.build(node)
   node = qual_names.resolve(node)
-  node = activity.resolve(node, context.info, None)
-  node = reaching_definitions.resolve(node, context.info, graphs, AnnotatedDef)
-  node = liveness.resolve(node, context.info, graphs)
-  node = live_values.resolve(node, context.info, config.PYTHON_LITERALS)
-  node = type_info.resolve(node, context.info)
+  node = activity.resolve(node, context, None)
+  node = reaching_definitions.resolve(node, context, graphs, AnnotatedDef)
+  node = liveness.resolve(node, context, graphs)
+  node = live_values.resolve(node, context, config.PYTHON_LITERALS)
+  node = type_info.resolve(node, context)
   # This second call allows resolving first-order class attributes.
-  node = live_values.resolve(node, context.info, config.PYTHON_LITERALS)
+  node = live_values.resolve(node, context, config.PYTHON_LITERALS)
   if is_initial:
     anno.dup(
         node,
diff --git a/tensorflow/python/autograph/core/converter_test.py b/tensorflow/python/autograph/core/converter_test.py
index b73c67e337748e1f9f2729842c309e6263b444df..4050878b929b097bc61169040368a4d56876e45e 100644
--- a/tensorflow/python/autograph/core/converter_test.py
+++ b/tensorflow/python/autograph/core/converter_test.py
@@ -18,10 +18,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import weakref
+
 from tensorflow.python.autograph.core import converter
 from tensorflow.python.autograph.core import converter_testing
 from tensorflow.python.autograph.pyct import anno
+from tensorflow.python.autograph.pyct import compiler
 from tensorflow.python.autograph.pyct import parser
+from tensorflow.python.autograph.pyct import templates
+from tensorflow.python.autograph.pyct import transformer
 from tensorflow.python.platform import test
 
 
@@ -29,6 +34,75 @@ class TestConverter(converter.Base):
   pass
 
 
+class ConversionOptionsTest(converter_testing.TestCase):
+
+  def test_to_ast(self):
+    opts = converter.ConversionOptions()
+
+    namer = converter_testing.FakeNamer()
+    program_ctx = converter.ProgramContext(
+        options=opts,
+        partial_types=None,
+        autograph_module=None,
+        uncompiled_modules=())
+    entity_info = transformer.EntityInfo(
+        source_code='',
+        source_file='<fragment>',
+        namespace={},
+        arg_values=None,
+        arg_types={},
+        owner_type=None)
+    ctx = converter.EntityContext(namer, entity_info, program_ctx)
+    opts_ast = opts.to_ast(ctx)
+
+    template = '''
+    def test_fn():
+      return opts_ast
+    '''
+    opts_packed = templates.replace(template, opts_ast=opts_ast)
+
+    reparsed, _ = compiler.ast_to_object(opts_packed)
+    reparsed.__dict__['ag__'] = self.make_fake_mod(
+        'fake_ag', converter.ConversionOptions, converter.Feature)
+
+    reparsed_opts = reparsed.test_fn()
+
+    self.assertEqual(opts.recursive, reparsed_opts.recursive)
+    self.assertEqual(opts.verbose, reparsed_opts.verbose)
+    self.assertEqual(opts.force_conversion, reparsed_opts.force_conversion)
+    self.assertEqual(
+        opts.internal_convert_user_code,
+        reparsed_opts.internal_convert_user_code)
+    self.assertEqual(opts.optional_features, reparsed_opts.optional_features)
+
+  def test_should_strip_weakrefs(self):
+    def test_fn():
+      pass
+
+    def weak_test_fn_a():
+      pass
+
+    def weak_test_fn_b():
+      pass
+
+    def weak_test_fn_c():
+      pass
+
+    wr_a = weakref.ref(weak_test_fn_a)
+    # Create an extra weakref to check whether the existence of multiple weak
+    # references influences the process.
+    _ = weakref.ref(weak_test_fn_b)
+    wr_b = weakref.ref(weak_test_fn_b)
+    _ = weakref.ref(weak_test_fn_c)
+
+    opts = converter.ConversionOptions(strip_decorators=(test_fn, wr_a, wr_b))
+
+    self.assertTrue(opts.should_strip(test_fn))
+    self.assertTrue(opts.should_strip(weak_test_fn_a))
+    self.assertTrue(opts.should_strip(weak_test_fn_b))
+    self.assertFalse(opts.should_strip(weak_test_fn_c))
+
+
 class ConverterBaseTest(converter_testing.TestCase):
 
   def test_get_definition_directive_basic(self):
diff --git a/tensorflow/python/autograph/core/converter_testing.py b/tensorflow/python/autograph/core/converter_testing.py
index f1374081d3c6e0dd93c39d331c76404859b2f40a..56445dbd456eb07d5e3b5fec6a3da3023cd069f4 100644
--- a/tensorflow/python/autograph/core/converter_testing.py
+++ b/tensorflow/python/autograph/core/converter_testing.py
@@ -39,9 +39,7 @@ from tensorflow.python.autograph.pyct import pretty_printer
 from tensorflow.python.autograph.pyct import transformer
 from tensorflow.python.platform import test
 
-
-def imported_decorator(f):
-  return lambda a: f(a) + 1
+RESULT_OF_MOCK_CONVERTED_CALL = 7
 
 
 # TODO(mdan): We should use the real namer here.
@@ -50,6 +48,7 @@ class FakeNamer(object):
 
   def __init__(self):
     self.i = 0
+    self.partial_types = ()
 
   def new_symbol(self, name_root, used):
     while True:
@@ -95,8 +94,8 @@ class TestCase(test.TestCase):
     self.dynamic_calls = []
     def converted_call(*args):
       """Mock version of api.converted_call."""
-      self.dynamic_calls.append(args)
-      return 7
+      self.dynamic_calls.append(args[3:])  # args only; see api.converted_call
+      return RESULT_OF_MOCK_CONVERTED_CALL
 
     try:
       result, source = compiler.ast_to_object(node, include_source_map=True)
@@ -107,11 +106,13 @@ class TestCase(test.TestCase):
                                    converter.ConversionOptions)
       fake_ag.__dict__.update(operators.__dict__)
       fake_ag.__dict__.update(special_functions.__dict__)
-      fake_ag.__dict__['utils'] = utils
-      fake_ag.__dict__['rewrite_graph_construction_error'] = (
+      fake_ag.ConversionOptions = converter.ConversionOptions
+      fake_ag.Feature = converter.Feature
+      fake_ag.utils = utils
+      fake_ag.rewrite_graph_construction_error = (
           errors.rewrite_graph_construction_error)
-      fake_ag.__dict__['function_scope'] = function_wrapping.function_scope
-      result.__dict__['ag__'] = fake_ag
+      fake_ag.function_scope = function_wrapping.function_scope
+      result.ag__ = fake_ag
       for k, v in namespace.items():
         result.__dict__[k] = v
       yield result
diff --git a/tensorflow/python/autograph/core/naming.py b/tensorflow/python/autograph/core/naming.py
index b8d79daebaa6d6dcf5f324f637a3b496f3742b92..245795c3d2e1c8c33f7de6ee01e17f43433bd410 100644
--- a/tensorflow/python/autograph/core/naming.py
+++ b/tensorflow/python/autograph/core/naming.py
@@ -18,8 +18,16 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import enum
+
 from tensorflow.python.autograph.pyct import inspect_utils
 from tensorflow.python.autograph.pyct import qual_names
+from tensorflow.python.autograph.utils import misc
+
+
+class _NamingStyle(enum.Enum):
+  SNAKE = 1
+  CAMEL = 2
 
 
 class Namer(object):
@@ -46,17 +54,52 @@ class Namer(object):
 
     self.generated_names = set()
 
+  def _as_symbol_name(self, fqn, style=_NamingStyle.SNAKE):
+    """Returns a symbol name that matches a fully-qualified name.
+
+    The returned name is safe to use for Python symbols. Any special characters
+    present in fqn are replaced according to the style argument.
+
+    Examples:
+
+      self._as_symbol_name('foo.bar', style=_NamingStyle.CAMEL) == 'FooBar'
+      self._as_symbol_name('foo.bar', style=_NamingStyle.SNAKE) == 'foo_bar'
+
+    See the unit tests for more examples.
+
+    Args:
+      fqn: Union[Text, Tuple[Text]] a fully-qualified symbol name. The qualifier
+        may include module, class names, attributes, etc.
+      style: _NamingStyle
+    Returns:
+      Text
+    """
+    assert style in _NamingStyle
+
+    if isinstance(fqn, tuple):
+      cn = '.'.join(fqn)
+    else:
+      cn = fqn
+
+    # Until we clean up the whole FQN mechanism, `fqn` may not be
+    # canonical, that is, in can appear as ('foo.bar', 'baz')
+    # This replaces any characters that might remain because of that.
+    pieces = cn.split('.')
+
+    if style == _NamingStyle.CAMEL:
+      pieces = tuple(misc.capitalize_initial(p) for p in pieces)
+      return ''.join(pieces)
+    elif style == _NamingStyle.SNAKE:
+      return '_'.join(pieces)
+
   def compiled_class_name(self, original_fqn, live_entity=None):
     """See call_trees.FunctionNamer.compiled_class_name."""
     if live_entity is not None and live_entity in self.renamed_calls:
       return self.renamed_calls[live_entity]
 
-    if isinstance(original_fqn, tuple):
-      original_name = '__'.join(original_fqn)
-    else:
-      original_name = original_fqn
-
-    new_name_root = 'Tf%s' % original_name
+    canonical_name = self._as_symbol_name(
+        original_fqn, style=_NamingStyle.CAMEL)
+    new_name_root = 'Tf%s' % canonical_name
     new_name = new_name_root
     n = 0
     while new_name in self.global_namespace:
@@ -73,7 +116,6 @@ class Namer(object):
                              live_entity=None,
                              owner_type=None):
     """See call_trees.FunctionNamer.compiled_function_name."""
-
     if not self.recursive:
       return None, False
 
@@ -84,15 +126,12 @@ class Namer(object):
       # Members are not renamed when part of an entire converted class.
       return None, False
 
-    if isinstance(original_fqn, tuple):
-      original_name = '__'.join(original_fqn)
-    else:
-      original_name = original_fqn
-
     if live_entity is not None and live_entity in self.renamed_calls:
       return self.renamed_calls[live_entity], True
 
-    new_name_root = 'tf__%s' % original_name
+    canonical_name = self._as_symbol_name(
+        original_fqn, style=_NamingStyle.SNAKE)
+    new_name_root = 'tf__%s' % canonical_name
     new_name = new_name_root
     n = 0
     while new_name in self.global_namespace:
diff --git a/tensorflow/python/autograph/core/naming_test.py b/tensorflow/python/autograph/core/naming_test.py
index 2db98836d1e3bce73aacd736867c96d4d19390d2..cc8c4314a700ac43ff5d21ad32706a0c3d5be0f5 100644
--- a/tensorflow/python/autograph/core/naming_test.py
+++ b/tensorflow/python/autograph/core/naming_test.py
@@ -45,6 +45,22 @@ class NamerTest(test.TestCase):
     self.assertEqual(('tf__foo', True), namer.compiled_function_name(
         'foo', foo))
 
+  def test_compiled_function_name_unsanitized_fqn(self):
+    namer = naming.Namer({}, True, None, ())
+    self.assertEqual(('tf__foo_bar', True),
+                     namer.compiled_function_name('foo.bar'))
+    self.assertEqual(('tf__foo_bar_baz', True), namer.compiled_function_name(
+        ('foo.bar', 'baz')))
+
+  def test_compiled_class_name_basic(self):
+    namer = naming.Namer({}, True, None, ())
+    self.assertEqual('TfFooBar', namer.compiled_class_name(('foo', 'Bar')))
+
+  def test_compiled_class_name_unsanitized_fqn(self):
+    namer = naming.Namer({}, True, None, ())
+    self.assertEqual('TfFooBarBaz',
+                     namer.compiled_class_name(('foo.bar', 'Baz')))
+
   def test_compiled_function_name_avoids_global_conflicts(self):
     def foo():
       pass
diff --git a/tensorflow/python/autograph/core/unsupported_features_checker.py b/tensorflow/python/autograph/core/unsupported_features_checker.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ccbb76fea1c8c9068b1bc1f64cc0f00a0ca2e35
--- /dev/null
+++ b/tensorflow/python/autograph/core/unsupported_features_checker.py
@@ -0,0 +1,54 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Checkers for detecting unsupported Python features."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gast
+
+from tensorflow.python.autograph.pyct import errors
+
+
+class UnsupportedFeaturesChecker(gast.NodeTransformer):
+  """Quick check for Python features we know we don't support.
+
+  Any features detected will cause AutoGraph to not compile a function.
+  """
+
+  # TODO(b/124103128): Implement support for `global` statements
+  def visit_Global(self, node):
+    raise errors.AutoGraphError(
+        'The global keyword is not yet supported.')
+
+  def visit_Nonlocal(self, node):
+    raise errors.AutoGraphError(
+        'The nonlocal keyword is not yet supported.')
+
+  # These checks could potentially be replaced with inspect.isgeneratorfunction
+  # to avoid a getsource/parse/ast-walk round trip.
+  def visit_Yield(self, node):
+    raise errors.AutoGraphError(
+        'Generators are not supported by AutoGraph')
+
+  def visit_YieldFrom(self, node):
+    raise errors.AutoGraphError(
+        'Generators are not supported by AutoGraph')
+
+
+def verify(node):
+  UnsupportedFeaturesChecker().visit(node)
+
diff --git a/tensorflow/python/autograph/impl/BUILD b/tensorflow/python/autograph/impl/BUILD
index 201a88875413982b0f1a791f3408b403a3259eb8..66f7915696ec400675810b8b954e6812294f0760 100644
--- a/tensorflow/python/autograph/impl/BUILD
+++ b/tensorflow/python/autograph/impl/BUILD
@@ -1,6 +1,6 @@
 licenses(["notice"])  # Apache 2.0
 
-load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
 
 filegroup(
     name = "all_files",
@@ -37,25 +37,23 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "api_test",
     srcs = ["api_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":impl",
+        "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python/autograph/utils",
-        "//third_party/py/numpy",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "conversion_test",
     srcs = ["conversion_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":impl",
-        "//tensorflow/python:client_testlib",
         "@gast_archive//:gast",
+        "//tensorflow/python:client_testlib",
     ],
 )
diff --git a/tensorflow/python/autograph/impl/api.py b/tensorflow/python/autograph/impl/api.py
index f7774888c8a5ccb8a64186476d6e78b999e527ba..356cdc0e3bc3431a43ed78c981bff30380abec2f 100644
--- a/tensorflow/python/autograph/impl/api.py
+++ b/tensorflow/python/autograph/impl/api.py
@@ -18,7 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
+import copy
 import functools
+import pdb
 import sys
 
 from enum import Enum
@@ -33,13 +36,15 @@ from tensorflow.python.autograph.core import converter
 from tensorflow.python.autograph.impl import conversion
 from tensorflow.python.autograph.operators import py_builtins
 from tensorflow.python.autograph.pyct import compiler
+from tensorflow.python.autograph.pyct import errors
 from tensorflow.python.autograph.pyct import inspect_utils
+from tensorflow.python.autograph.utils import ag_logging as logging
 from tensorflow.python.autograph.utils import py_func
-from tensorflow.python.data.util import nest
 from tensorflow.python.framework import tensor_util
-from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
+from tensorflow.python.util.tf_export import tf_export
 
 # TODO(mdan): Properly document the type hints.
 # TODO(mdan): Reduce the type hint information to (module, type).
@@ -85,7 +90,7 @@ def convert(
               verbose=verbose,
               force_conversion=True,
               optional_features=optional_features,
-          ), *args, **kwargs)
+          ), args, kwargs)
 
     wrapper = tf_decorator.make_decorator(f, wrapper)
 
@@ -111,6 +116,12 @@ class RunMode(Enum):
   PY_FUNC = 2
 
 
+def do_not_convert_internal(f):
+  """Decorator that marks internal functions which do not need conversion."""
+  setattr(f, '__ag_compiled', True)
+  return f
+
+
 def do_not_convert(run_as=RunMode.GRAPH, return_dtypes=None):
   """Decorator that suppresses the conversion of a function.
 
@@ -149,19 +160,17 @@ def do_not_convert(run_as=RunMode.GRAPH, return_dtypes=None):
     else:
       raise ValueError('unknown value for run_as: %s' % run_as)
 
-    # Sometimes the decorator is just desugared, making it impossible to detect.
-    # This attribute makes detection easier.
     setattr(wrapper, '__ag_compiled', True)
     return wrapper
 
   return decorator
 
 
-# TODO(mdan): Move to a private, undocumented module.
-def converted_call(f, owner, options, *args, **kwargs):
+def converted_call(f, owner, options, args, kwargs):
   """Compiles a function call inline. For internal use only."""
-  if options.verbose >= converter.Verbosity.VERBOSE:
-    logging.info('Converted call: {}; owner: {}'.format(f, owner))
+  logging.log(1,
+              'Converted call: %s; owner: %s\n    args: %s\n    kwargs: %s\n',
+              f, owner, args, kwargs)
 
   if owner is not None:
     if not isinstance(f, str):
@@ -180,16 +189,47 @@ def converted_call(f, owner, options, *args, **kwargs):
   if inspect_utils.isbuiltin(f):
     return py_builtins.overload_of(f)(*args, **kwargs)
 
+  # TODO(b/122265385): Remove this bypass.
+  if ('wrapt' in sys.modules and
+      hasattr(sys.modules['wrapt'], 'FunctionWrapper') and
+      isinstance(f, sys.modules['wrapt'].FunctionWrapper)):
+    logging.warn(
+        'Entity {} appears to be decorated by wrapt, which is not yet supported'
+        ' by AutoGraph. The function will be called without transformation.'
+        ' You may however apply AutoGraph before the decorator.'.format(f), 1)
+    logging.log(2, 'Permanently whitelisted: %s: wrapt decorated', f)
+    return f(*args, **kwargs)
+
+  # Constructors are permanently whitelisted.
+  # TODO(mdan): Toggle as experimental feature instead.
+  # TODO(b/124016764): Remove this limitation.
+  if tf_inspect.isclass(f):
+    logging.log(2, 'Permanently whitelisted: %s: constructor', f)
+    return f(*args, **kwargs)
+
+  # Other built-in modules are permanently whitelisted.
+  # TODO(mdan): Figure out how to do this consistently for all stdlib modules.
+  # Note: TF linter disallows importing inspect.
+  if any(f in m.__dict__.values()
+         for m in (collections, pdb, copy, tf_inspect._inspect)):  # pylint:disable=protected-access
+    logging.log(2, 'Permanently whitelisted: %s: part of builtin module', f)
+    return f(*args, **kwargs)
+
   # TODO(mdan): This needs cleanup.
-  # In particular, we may want to avoid renaming functions altogether.
   if not options.force_conversion and conversion.is_whitelisted_for_graph(f):
 
+    # TODO(mdan): This may be inconsistent in certain situations.
+    # If the function had already been annotated with @tf.function, it
+    # may be bound to the incorrect object. It's unclear if those situations
+    # are possible, but if they happen, we need to check if f is bound
+    # to a shim like WeakrefSelf and unpack it.
+
     # Args typically include `self`, as required by the conversion process.
     # When conversion is skipped, `self` is not necessary, because the
     # original bound method is being executed. This code removes it.
     if tf_inspect.ismethod(f) and args:
-      f_class = inspect_utils.getmethodclass(f)
-      if args[0] is f_class:
+      f_self = inspect_utils.getmethodself(f)
+      if args[0] is f_self:
         args = args[1:]
 
     return f(*args, **kwargs)
@@ -201,91 +241,121 @@ def converted_call(f, owner, options, *args, **kwargs):
   if not options.internal_convert_user_code:
     return f(*args, **kwargs)
 
-  # Unwrap functools.partial objects
-  # TODO(allenl, mdan): Consider sharing unwrapping logic with tf_inspect.
-  while isinstance(f, functools.partial):
-    args = f.args + args
-    new_kwargs = {}
-    if f.keywords is not None:
-      new_kwargs.update(f.keywords)
-    new_kwargs.update(kwargs)
-    kwargs = new_kwargs
-    f = f.func
-
-  if tf_inspect.isfunction(f) or tf_inspect.ismethod(f):
-    # Regular functions
-    target_entity = f
-    arg_map_target = f
-    f_class = inspect_utils.getmethodclass(f)
-
-    # TODO(b/119246461): This may be more elegantly handled using __get__?
-    if f_class is not None:
-      # If this is a method call, it may or may not include self.
-      #
-      # Example when self is included:
-      #   converted_call(to_graph(foo.bar), foo)
-      #
-      # Example when self is not included:
-      #   super(...).foo(args)
-      #
-      if owner is not None and (not args or args[0] is not owner):
-        effective_args = (owner,) + args
-      else:
-        # When the owner is not specified, use the result of
-        # inspect_utils.getmethodclass.
-        # TODO(b/119246461): Make sure an owner is always specified.
-        if not args or args[0] is not f_class:
-          effective_args = (f_class,) + args
+  # TODO(mdan): Move this entire block inside to_graph.
+  try:  # Begin of transformation error guards
+
+    # Unwrap functools.partial objects
+    # TODO(mdan): Consider sharing unwrapping logic with tf_inspect.
+    while isinstance(f, functools.partial):
+      args = f.args + args
+      new_kwargs = {}
+      if f.keywords is not None:
+        new_kwargs.update(f.keywords)
+      new_kwargs.update(kwargs)
+      kwargs = new_kwargs
+      f = f.func
+
+    if tf_inspect.isfunction(f) or tf_inspect.ismethod(f):
+      # Regular functions
+      target_entity = f
+      arg_map_target = f
+      f_self = inspect_utils.getmethodself(f)
+
+      # TODO(b/119246461): This may be more elegantly handled using __get__?
+      if f_self is not None:
+        # If this is a method call, it may or may not include self.
+        #
+        # Example when self is included:
+        #   converted_call(to_graph(foo.bar), foo)
+        #
+        # Example when self is not included:
+        #   super(...).foo(args)
+        #
+        if owner is not None and (not args or args[0] is not owner):
+          effective_args = (owner,) + args
         else:
-          effective_args = (f_class,) + args[1:]
-      partial_types = (f_class,)
-    else:
+          # When the owner is not specified, use the result of
+          # inspect_utils.getmethodclass.
+          # TODO(b/119246461): Make sure an owner is always specified.
+          if not args or args[0] is not f_self:
+            effective_args = (f_self,) + args
+          else:
+            effective_args = (f_self,) + args[1:]
+        partial_types = (f_self,)
+      else:
+        effective_args = args
+        partial_types = ()
+
+    elif tf_inspect.isclass(f):
+      # Constructors
+      # Note: Until we support class constructurs, and enable whole-class
+      # conversion with an experimental flag, this branch is dead code.
+      # TODO(mdan): Consider removing unless there is a compelling use case.
+      target_entity = f
+      arg_map_target = f.__init__
       effective_args = args
       partial_types = ()
 
-  elif tf_inspect.isclass(f):
-    # Constructors
-    target_entity = f
-    arg_map_target = f.__init__
-    effective_args = args
-    partial_types = ()
-
-  elif hasattr(f, '__call__') and hasattr(f, '__class__'):
-    # Callable objects
-    target_entity = f.__call__
-    arg_map_target = f.__call__
-    effective_args = (f,) + args
-    partial_types = (f.__class__,)
-
-  else:
-    NotImplementedError('unknown callable type "%s"' % type(f))
-
-  arg_values = tf_inspect.getcallargs(arg_map_target, *args, **kwargs)
-  arg_types = {}
-  for name, arg in arg_values.items():
-    arg_class = arg.__class__
-    arg_types[name] = (arg_class.__name__, arg_class)
-
-  # When called from within a decorator, this is the only indication that
-  # the function is a method - it appears that the decorator is applied
-  # before the method is bound.
-  if not partial_types:
-    if 'self' in arg_values:
-      if tf_inspect.isclass(arg_values['self'].__class__):
-        partial_types = (arg_values['self'].__class__,)
-    elif 'cls' in arg_values:
-      if tf_inspect.isclass(arg_values['cls']):
-        partial_types = (arg_values['cls'],)
-
-  converted_f = to_graph(
-      target_entity,
-      recursive=options.recursive,
-      verbose=options.verbose,
-      arg_values=arg_values,
-      arg_types=arg_types,
-      partial_types=partial_types,
-      strip_decorators=options.strip_decorators,
-      optional_features=options.optional_features)
+    elif hasattr(f, '__call__') and hasattr(f, '__class__'):
+      # Callable objects
+      target_entity = f.__call__
+      arg_map_target = f.__call__
+      effective_args = (f,) + args
+      partial_types = (f.__class__,)
+
+    else:
+      raise NotImplementedError('unknown callable type "%s"' % type(f))
+
+    arg_values = tf_inspect.getcallargs(arg_map_target, *args, **kwargs)
+    arg_types = {}
+    for name, arg in arg_values.items():
+      arg_class = arg.__class__
+      arg_types[name] = (arg_class.__name__, arg_class)
+
+    # When called from within a decorator, this is the only indication that
+    # the function is a method - it appears that the decorator is applied
+    # before the method is bound.
+    if not partial_types:
+      if 'self' in arg_values:
+        if tf_inspect.isclass(arg_values['self'].__class__):
+          partial_types = (arg_values['self'].__class__,)
+      elif 'cls' in arg_values:
+        if tf_inspect.isclass(arg_values['cls']):
+          partial_types = (arg_values['cls'],)
+
+    logging.log(3, 'Partial types in conversion of %s: %s', target_entity,
+                partial_types)
+
+    converted_f = to_graph(
+        target_entity,
+        recursive=options.recursive,
+        arg_values=arg_values,
+        arg_types=arg_types,
+        experimental_optional_features=options.optional_features,
+        experimental_strip_decorators=options.strip_decorators,
+        experimental_verbose=options.verbose,
+        experimental_partial_types=partial_types)
+
+    if logging.has_verbosity(2):
+      logging.log(2, 'Defaults of %s : %s', converted_f,
+                  converted_f.__defaults__)
+      callargs = tf_inspect.getcallargs(converted_f, *effective_args, **kwargs)
+      formatted_callargs = '\n'.join(
+          '    {}: {}'.format(k, v) for k, v in callargs.items())
+      logging.log(2, 'Calling %s with\n%s\n', converted_f, formatted_callargs)
+
+  # TODO(mdan): Reduce this list.
+  except (errors.AutoGraphError, AssertionError, AttributeError, IndexError,
+          KeyError, NameError, NotImplementedError, SyntaxError, TypeError,
+          ValueError, IOError) as e:
+    logging.log(1, 'Error transforming entity %s', target_entity, exc_info=True)
+    logging.warn(
+        'Entity %s could not be transformed and will be staged without change.'
+        ' Error details can be found in the logs when running with the env'
+        ' variable AUTOGRAPH_VERBOSITY >= 1. Please report this to the'
+        ' AutoGraph team. Cause: %s', target_entity, e)
+
+    return f(*args, **kwargs)
 
   result = converted_f(*effective_args, **kwargs)
 
@@ -314,139 +384,204 @@ def _is_not_callable(obj):
   return False
 
 
-# TODO(mdan): Rename: to_ops?
-# TODO(mdan): Look into overloading as function and decorator, like tfe.defun?
-# TODO(mdan): Remove partial_types.
-def to_graph(e,
+@tf_export('autograph.to_graph')
+def to_graph(entity,
              recursive=True,
-             verbose=converter.Verbosity.VERBOSE,
              arg_values=None,
              arg_types=None,
-             partial_types=None,
-             strip_decorators=None,
-             optional_features=converter.Feature.ALL):
-  """Converts a Python entity into equivalent code that uses TensorFlow ops.
+             experimental_optional_features=converter.Feature.ALL,
+             experimental_strip_decorators=None,
+             experimental_verbose=converter.Verbosity.BRIEF,
+             experimental_partial_types=None):
+  """Converts a Python entity into a TensorFlow graph.
+
+  Also see: `tf.autograph.to_code`, `tf.function`.
+
+  Unlike `tf.function`, `to_graph` is a low-level transpiler that converts
+  Python code to TensorFlow graph code. It does not implement any caching,
+  variable management or create any actual ops, and is best used where greater
+  control over the generated TensorFlow graph is desired. Another difference
+  from `tf.function` is that `to_graph` will not wrap the graph into a
+  TensorFlow function or a Python callable. Internally, `tf.function` uses
+  `to_graph`.
+
+  _Example Usage_
+
+  ```python
+    def foo(x):
+      if x > 0:
+        y = x * x
+      else:
+        y = -x
+      return y
+
+    converted_foo = to_graph(foo)
+
+    x = tf.constant(1)
+    y = converted_foo(x)  # converted_foo is a TensorFlow Op-like.
+    assert is_tensor(y)
+  ```
 
   Supported Python entities include:
     * functions
     * classes
+    * object methods
+
+  Functions are converted into new functions with converted code.
 
-  Classes are converted by converting all their methods into a new class.
+  Classes are converted by generating a new class whose methods use converted
+  code.
+
+  Methods are converted into unbound function that have an additional first
+  argument called `self`.
 
   Args:
-    e: Union[Callable, Type], the Python entity to convert.
-    recursive: bool, whether to recursively convert any functions that the
+    entity: Python callable or class to convert.
+    recursive: Whether to recursively convert any functions that the
       converted function may call.
-    verbose: converter.Verbosity, the level of printing verbosity to use.
-    arg_values: Optional[Dict[Text, Any]], value hints for symbols including
-      function arguments.
-    arg_types: Optional[Dict[Text, Type]], type hints for symbols including
-      function arguments.
-    partial_types: Set[Type], reserved for internal use.
-    strip_decorators: Tuple[Callable], same as
-      ConversionOptions.strip_decorators.
-    optional_features: Union[Feature, Set[Feature]], same as
-      ConversionOptions.optional_features.
+    arg_values: Optional dict of value hints for symbols including
+      function arguments mapping string names to actual values. For example,
+      `arg_values={'a': 1}` will map the variable `a` to the value `1`.
+    arg_types: Optional dict of type hints for symbols including function
+      arguments. Type hints allow specifying just the type of a variable, rather
+      than a specific value.
+    experimental_optional_features: `None`, a tuple of, or a single
+      `tf.autograph.experimental.Feature` value. Controls the use of
+      optional features in the conversion process.
+    experimental_strip_decorators: A tuple specifying decorators that should be
+      excluded from the compiled output. By default, when converting a function
+      before the decorators are applied, the compiled output will include those
+      decorators.
+    experimental_verbose: The level of printing verbosity to use, as a
+      `tf.autograph.experimental.Verbosity` value.
+    experimental_partial_types: A `set` of `type` values, reserved for internal
+      use.
 
   Returns:
-    Union[Callable, Type], the converted entity, which is the same kind as e
-    (that is, a function is e is a function, a class if e is a class, etc.) but
-    its code has been converted to use TF ops.
+    Same as `entity`, the converted Python function or class.
 
   Raises:
     ValueError: If the entity could not be converted.
   """
-  if strip_decorators is None:
-    strip_decorators = ()
-  strip_decorators += (convert, do_not_convert, converted_call)
-
-  program_ctx = converter.ProgramContext(
-      options=converter.ConversionOptions(
-          recursive=recursive,
-          verbose=verbose,
-          strip_decorators=strip_decorators,
-          optional_features=optional_features),
-      partial_types=partial_types,
-      autograph_module=tf_inspect.getmodule(to_graph),
-      uncompiled_modules=config.DEFAULT_UNCOMPILED_MODULES)
-  _, name, namespace = conversion.entity_to_graph(e, program_ctx, arg_values,
-                                                  arg_types)
-
-  nodes = []
-  for dep in reversed(program_ctx.conversion_order):
-    nodes.extend(program_ctx.dependency_cache[dep])
-
-  compiled_module, _ = compiler.ast_to_object(
-      nodes,
-      source_prefix=program_ctx.required_imports,
-      include_source_map=True)
-
-  # The compiled code should see everything the entry entity saw.
-  # TODO(mdan): This might not work well if the call tree spans modules?
-  for key, val in namespace.items():
-    # Avoid overwriting entities that have been transformed.
-    if key not in compiled_module.__dict__:
-      compiled_module.__dict__[key] = val
-  compiled = getattr(compiled_module, name)
-
-  if tf_inspect.isfunction(e):
-    compiled.__defaults__ = e.__defaults__
-
-  if hasattr(compiled, '__globals__'):
-    # Remove self to avoid circular references. This will probably only work
-    # so long as the function is not reentrant.
-    del compiled.__globals__[name]
-
-  # Need this so the source_mapping attribute is available for the context
-  # manager to access for runtime errors.
-  #
-  # Note that compiler.ast_to_object attaches the source map 'ag_source_map__'
-  # symbol to the compiled module.
-  # TODO(mdan): Record this statically in the generated code.
-  # TODO(mdan): Rename this attribute to 'autograph_info__'
-  source_map_attribute_name = 'ag_source_map'
-  if getattr(compiled, source_map_attribute_name, None) is not None:
-    raise ValueError('cannot convert %s because is has an attribute '
-                     '"%s", which is reserved for AutoGraph.' %
-                     (compiled, source_map_attribute_name))
-  setattr(compiled, source_map_attribute_name,
-          compiled_module.__dict__['ag_source_map__'])
-
-  return compiled
-
-
-def to_code(e,
+  try:
+    if experimental_strip_decorators is None:
+      experimental_strip_decorators = ()
+    experimental_strip_decorators += (convert, do_not_convert, converted_call)
+
+    program_ctx = converter.ProgramContext(
+        options=converter.ConversionOptions(
+            recursive=recursive,
+            verbose=experimental_verbose,
+            strip_decorators=experimental_strip_decorators,
+            optional_features=experimental_optional_features),
+        partial_types=experimental_partial_types,
+        autograph_module=tf_inspect.getmodule(to_graph),
+        uncompiled_modules=config.DEFAULT_UNCOMPILED_MODULES)
+    _, name, namespace = conversion.entity_to_graph(entity, program_ctx,
+                                                    arg_values, arg_types)
+
+    nodes = []
+    for dep in reversed(program_ctx.conversion_order):
+      nodes.extend(program_ctx.dependency_cache[dep])
+
+    compiled_module, _ = compiler.ast_to_object(
+        nodes,
+        source_prefix=program_ctx.required_imports,
+        include_source_map=True)
+
+    # The compiled code should see everything the entry entity saw.
+    # TODO(mdan): This might not work well if the call tree spans modules?
+    for key, val in namespace.items():
+      # Avoid overwriting entities that have been transformed.
+      if key not in compiled_module.__dict__:
+        compiled_module.__dict__[key] = val
+    for key, val in program_ctx.additional_symbols.items():
+      if key not in compiled_module.__dict__:
+        compiled_module.__dict__[key] = val
+    compiled = getattr(compiled_module, name)
+
+    if hasattr(entity, '__defaults__'):
+      logging.log(3, 'Default args mapping: %s has: %s', entity,
+                  entity.__defaults__)
+      compiled.__defaults__ = entity.__defaults__
+    else:
+      logging.log(3, 'Default args mapping: %s has no __defaults__', entity)
+
+    logging.log(3, 'Namespace of %s includes: %s', compiled,
+                compiled_module.__dict__.keys())
+
+    if hasattr(compiled, '__globals__'):
+      # Remove self to avoid circular references. This will probably only work
+      # so long as the function is not reentrant.
+      del compiled.__globals__[name]
+
+    # Need this so the source_mapping attribute is available for the context
+    # manager to access for runtime errors.
+    #
+    # Note that compiler.ast_to_object attaches the source map 'ag_source_map__'
+    # symbol to the compiled module.
+    # TODO(mdan): Record this statically in the generated code.
+    # TODO(mdan): Rename this attribute to 'autograph_info__'
+    source_map_attribute_name = 'ag_source_map'
+    if getattr(compiled, source_map_attribute_name, None) is not None:
+      # TODO(znado): change input problem errors into TransformError
+      raise ValueError('cannot convert %s because is has an attribute '
+                       '"%s", which is reserved for AutoGraph.' %
+                       (compiled, source_map_attribute_name))
+    setattr(compiled, source_map_attribute_name,
+            compiled_module.__dict__['ag_source_map__'])
+
+    return compiled
+  except (ValueError, AttributeError, KeyError, NameError, AssertionError) as e:
+    errors.report_internal_error(entity, e)
+
+
+@tf_export('autograph.to_code')
+def to_code(entity,
             recursive=True,
             arg_values=None,
             arg_types=None,
-            partial_types=None,
-            indentation='  '):
-  """Returns the equivalent code that uses TensorFlow ops.
+            indentation='  ',
+            experimental_optional_features=converter.Feature.ALL,
+            experimental_partial_types=None):
+  """Similar to `to_graph`, but returns Python source code as a string.
+
+  Also see: `tf.autograph.to_graph`.
 
-  Also see: `to_graph`, `convert`
+  `to_graph` returns the Python source code that can be used to generate a
+  TensorFlow graph that is functionally identical to the input Python code.
 
   Args:
-    e: Union[Callable, Type], the Python entity to convert.
-    recursive: bool, whether to recursively convert any functions that the
+    entity: Python callable or class to convert.
+    recursive: Whether to recursively convert any functions that the
       converted function may call.
-    arg_values: Optional[Dict[Text, Any]], value hints for symbols including
-      function arguments.
-    arg_types: Optional[Dict[Text, Type]], type hints for symbols including
-      function arguments.
-    partial_types: Set[Type], reserved for internal use.
-    indentation: Text, when to use for each level of indentation.
+    arg_values: Optional dict of value hints for symbols including
+      function arguments mapping string names to actual values. For example,
+      `arg_values={'a': 1}` will map the variable `a` to the value `1`.
+    arg_types: Optional dict of type hints for symbols including function
+      arguments. Type hints allow specifying just the type of a variable, rather
+      than a specific value.
+    indentation: The string to use for indenting. Typically two or four spaces,
+      or just the tab character.
+    experimental_optional_features: `None`, a tuple of, or a single
+      `tf.autograph.experimental.Feature` value. Controls the use of
+      optional features in the conversion process.
+    experimental_partial_types: A `set` of `type` values, reserved for internal
+      use.
 
   Returns:
-    Text, the converted code.
+    The converted code as string.
   """
   program_ctx = converter.ProgramContext(
       options=converter.ConversionOptions(
           recursive=recursive,
-          strip_decorators=(convert, do_not_convert, converted_call)),
-      partial_types=partial_types,
+          verbose=converter.Verbosity.BRIEF,
+          strip_decorators=(convert, do_not_convert, converted_call),
+          optional_features=experimental_optional_features),
+      partial_types=experimental_partial_types,
       autograph_module=tf_inspect.getmodule(to_graph),
       uncompiled_modules=config.DEFAULT_UNCOMPILED_MODULES)
-  conversion.entity_to_graph(e, program_ctx, arg_values, arg_types)
+  conversion.entity_to_graph(entity, program_ctx, arg_values, arg_types)
 
   code = '\n'.join(
       compiler.ast_to_source(program_ctx.dependency_cache[dep], indentation)
diff --git a/tensorflow/python/autograph/impl/api_test.py b/tensorflow/python/autograph/impl/api_test.py
index d5561ba8249f539e720fa1ecb5800b76c61a8c2f..aa9ad0c882e9699e3c46f3ea2203b1ebf42fc3a3 100644
--- a/tensorflow/python/autograph/impl/api_test.py
+++ b/tensorflow/python/autograph/impl/api_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import functools
 import gc
 
@@ -26,6 +27,8 @@ import numpy as np
 from tensorflow.python.autograph import utils
 from tensorflow.python.autograph.core import converter
 from tensorflow.python.autograph.impl import api
+from tensorflow.python.autograph.pyct import errors
+from tensorflow.python.autograph.pyct import inspect_utils
 from tensorflow.python.autograph.pyct import parser
 from tensorflow.python.autograph.utils import py_func
 from tensorflow.python.framework import constant_op
@@ -39,6 +42,9 @@ from tensorflow.python.util import tf_inspect
 tf = utils.fake_tf()
 
 
+testing_global_numeric = 2
+
+
 class TestResource(str):
   pass
 
@@ -46,7 +52,7 @@ class TestResource(str):
 class ApiTest(test.TestCase):
 
   @test_util.run_deprecated_v1
-  def test_decorator_recurses(self):
+  def test_decorator_recursive(self):
 
     class TestClass(object):
 
@@ -69,7 +75,7 @@ class ApiTest(test.TestCase):
       self.assertListEqual([0, 1], self.evaluate(x).tolist())
 
   @test_util.run_deprecated_v1
-  def test_decorator_does_not_recurse(self):
+  def test_decorator_not_recursive(self):
 
     class TestClass(object):
 
@@ -90,7 +96,7 @@ class ApiTest(test.TestCase):
       self.assertListEqual([0, 1], self.evaluate(x).tolist())
 
   @test_util.run_deprecated_v1
-  def test_decorator_calls_unconverted_graph(self):
+  def test_convert_then_do_not_convert_graph(self):
 
     class TestClass(object):
 
@@ -105,14 +111,13 @@ class ApiTest(test.TestCase):
         return x
 
     tc = TestClass()
-    with self.cached_session() as sess:
-      x = tc.test_method(
-          constant_op.constant([2, 4]), constant_op.constant(1),
-          constant_op.constant(-2))
-      self.assertListEqual([0, 1], self.evaluate(x).tolist())
+    x = tc.test_method(
+        constant_op.constant((2, 4)), constant_op.constant(1),
+        constant_op.constant(-2))
+    self.assertAllEqual((0, 1), self.evaluate(x))
 
   @test_util.run_deprecated_v1
-  def test_decorator_calls_unconverted_py_func(self):
+  def test_convert_then_do_not_convert_py_func(self):
 
     class TestClass(object):
 
@@ -132,11 +137,10 @@ class ApiTest(test.TestCase):
         return x
 
     tc = TestClass()
-    with self.cached_session() as sess:
-      x = tc.test_method(
-          constant_op.constant([2, 4]), constant_op.constant(1),
-          constant_op.constant(-2))
-      self.assertListEqual([0, 1], self.evaluate(x).tolist())
+    x = tc.test_method(
+        constant_op.constant((2, 4)), constant_op.constant(1),
+        constant_op.constant(-2))
+    self.assertAllEqual((0, 1), self.evaluate(x))
 
   @test_util.run_deprecated_v1
   def test_decorator_calls_decorated(self):
@@ -192,18 +196,17 @@ class ApiTest(test.TestCase):
       def test_method(self, x, s, a):
         while tf.reduce_sum(x) > s:
           x //= api.converted_call(self.called_member, None,
-                                   converter.ConversionOptions(), self, a)
+                                   converter.ConversionOptions(), (self, a), {})
         return x
 
     tc = TestClass()
-    with self.cached_session() as sess:
-      x = tc.test_method(
-          constant_op.constant([2, 4]), constant_op.constant(1),
-          constant_op.constant(-2))
-      self.assertListEqual([0, 1], self.evaluate(x).tolist())
+    x = tc.test_method(
+        constant_op.constant([2, 4]), constant_op.constant(1),
+        constant_op.constant(-2))
+    self.assertListEqual([0, 1], self.evaluate(x).tolist())
 
   def test_converted_call_builtin(self):
-    x = api.converted_call(range, None, converter.ConversionOptions(), 3)
+    x = api.converted_call(range, None, converter.ConversionOptions(), (3,), {})
     self.assertEqual((0, 1, 2), tuple(x))
 
   def test_converted_call_function(self):
@@ -213,10 +216,9 @@ class ApiTest(test.TestCase):
         return -x
       return x
 
-    with self.cached_session() as sess:
-      x = api.converted_call(test_fn, None, converter.ConversionOptions(),
-                             constant_op.constant(-1))
-      self.assertEqual(1, self.evaluate(x))
+    x = api.converted_call(test_fn, None, converter.ConversionOptions(),
+                           (constant_op.constant(-1),), {})
+    self.assertEqual(1, self.evaluate(x))
 
   @test_util.run_v1_only('b/120545219')
   def test_converted_call_functools_partial(self):
@@ -227,16 +229,14 @@ class ApiTest(test.TestCase):
       return x, y, z
 
     x = api.converted_call(
-        functools.partial(test_fn, constant_op.constant(-1), z=-3),
-        None, converter.ConversionOptions(),
-        constant_op.constant(-2))
+        functools.partial(test_fn, constant_op.constant(-1), z=-3), None,
+        converter.ConversionOptions(), (constant_op.constant(-2),), {})
     self.assertEqual((1, 2, 3), self.evaluate(x))
 
     x = api.converted_call(
         functools.partial(
-            functools.partial(test_fn, constant_op.constant(-1)), z=-3),
-        None, converter.ConversionOptions(),
-        constant_op.constant(-2))
+            functools.partial(test_fn, constant_op.constant(-1)), z=-3), None,
+        converter.ConversionOptions(), (constant_op.constant(-2),), {})
     self.assertEqual((1, 2, 3), self.evaluate(x))
 
   def test_converted_call_method_explicit_owner(self):
@@ -259,11 +259,31 @@ class ApiTest(test.TestCase):
           return -self.x
         return self.x
 
-    with self.cached_session() as sess:
-      tc = TestClass(constant_op.constant(-1))
-      x = api.converted_call(tc.test_method, None,
-                             converter.ConversionOptions(), tc)
-      self.assertEqual(1, self.evaluate(x))
+    tc = TestClass(constant_op.constant(-1))
+    x = api.converted_call(tc.test_method, None, converter.ConversionOptions(),
+                           (tc,), {})
+    self.assertEqual(1, self.evaluate(x))
+
+  def test_converted_call_method_converts_recursively(self):
+
+    class TestClass(object):
+
+      def __init__(self, x):
+        self.x = x
+
+      def other_method(self):
+        if self.x < 0:
+          return -self.x
+        return self.x
+
+      def test_method(self):
+        return self.other_method()
+
+    tc = TestClass(constant_op.constant(-1))
+    x = api.converted_call(tc.test_method, None,
+                           converter.ConversionOptions(recursive=True), (tc,),
+                           {})
+    self.assertEqual(1, self.evaluate(x))
 
   def test_converted_call_method_by_class(self):
 
@@ -277,11 +297,10 @@ class ApiTest(test.TestCase):
           return -self.x
         return self.x
 
-    with self.cached_session() as sess:
-      tc = TestClass(constant_op.constant(-1))
-      x = api.converted_call(TestClass.test_method, None,
-                             converter.ConversionOptions(), tc)
-      self.assertEqual(1, self.evaluate(x))
+    tc = TestClass(constant_op.constant(-1))
+    x = api.converted_call(TestClass.test_method, None,
+                           converter.ConversionOptions(), (tc,), {})
+    self.assertEqual(1, self.evaluate(x))
 
   def test_converted_call_callable_object(self):
 
@@ -295,11 +314,11 @@ class ApiTest(test.TestCase):
           return -self.x
         return self.x
 
-    with self.cached_session() as sess:
-      tc = TestClass(constant_op.constant(-1))
-      x = api.converted_call(tc, None, converter.ConversionOptions())
-      self.assertEqual(1, self.evaluate(x))
+    tc = TestClass(constant_op.constant(-1))
+    x = api.converted_call(tc, None, converter.ConversionOptions(), (), {})
+    self.assertEqual(1, self.evaluate(x))
 
+  @test_util.run_deprecated_v1
   def test_converted_call_constructor(self):
 
     class TestClass(object):
@@ -312,27 +331,44 @@ class ApiTest(test.TestCase):
           return -self.x
         return self.x
 
-    with self.cached_session() as sess:
-      tc = api.converted_call(TestClass, None, converter.ConversionOptions(),
-                              constant_op.constant(-1))
-      # tc is now a converted object.
-      x = tc.test_method()
-      self.assertEqual(1, self.evaluate(x))
+    tc = api.converted_call(TestClass, None, converter.ConversionOptions(),
+                            (constant_op.constant(-1),), {})
+    # tc is still a TestClass - constructors are whitelisted.
+    # TODO(b/124016764): Support this use case.
+    # The error below is specific to the `if` statement not being converted.
+    with self.assertRaisesRegex(
+        TypeError, 'Using a `tf.Tensor` as a Python `bool`'):
+      tc.test_method()
 
   def test_converted_call_already_converted(self):
 
     def f(x):
       return x == 0
 
-    with self.cached_session() as sess:
-      x = api.converted_call(f, None, converter.ConversionOptions(),
-                             constant_op.constant(0))
-      self.assertTrue(self.evaluate(x))
+    x = api.converted_call(f, None, converter.ConversionOptions(),
+                           (constant_op.constant(0),), {})
+    self.assertTrue(self.evaluate(x))
+
+    converted_f = api.to_graph(f)
+    x = api.converted_call(converted_f, None, converter.ConversionOptions(),
+                           (constant_op.constant(0),), {})
+    self.assertTrue(self.evaluate(x))
+
+  def test_converted_call_then_already_converted_dynamic(self):
+
+    @api.convert()
+    def g(x):
+      if x > 0:
+        return x
+      else:
+        return -x
+
+    def f(g, x):
+      return g(x)
 
-      converted_f = api.to_graph(f)
-      x = api.converted_call(converted_f, None, converter.ConversionOptions(),
-                             constant_op.constant(0))
-      self.assertTrue(self.evaluate(x))
+    x = api.converted_call(f, None, converter.ConversionOptions(),
+                           (g, constant_op.constant(1)), {})
+    self.assertEqual(self.evaluate(x), 1)
 
   @test_util.run_deprecated_v1
   def test_converted_call_no_user_code(self):
@@ -345,10 +381,10 @@ class ApiTest(test.TestCase):
     # f should not be converted, causing len to error out.
     with self.assertRaisesRegexp(Exception,
                                  'object of type \'Tensor\' has no len()'):
-      api.converted_call(f, None, opts, constant_op.constant([0]))
+      api.converted_call(f, None, opts, (constant_op.constant([0]),), {})
 
     # len on the other hand should work fine.
-    x = api.converted_call(len, None, opts, constant_op.constant([0]))
+    x = api.converted_call(len, None, opts, (constant_op.constant([0]),), {})
     # The constant has static shape so the result is a primitive not a Tensor.
     self.assertEqual(x, 1)
 
@@ -361,11 +397,10 @@ class ApiTest(test.TestCase):
     ])
 
     x = api.converted_call(model.call, None, opts,
-                           constant_op.constant([[0.0]]), training=True)
+                           (constant_op.constant([[0.0]]),), {'training': True})
 
-    with self.cached_session() as sess:
-      self.evaluate(variables.global_variables_initializer())
-      self.assertAllEqual([[0.0, 0.0]], self.evaluate(x))
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllEqual([[0.0, 0.0]], self.evaluate(x))
 
   def test_converted_call_whitelisted_method_extra_self(self):
 
@@ -376,11 +411,11 @@ class ApiTest(test.TestCase):
     ])
 
     x = api.converted_call(model.call, None, opts,
-                           model, constant_op.constant([[0.0]]), training=True)
+                           (model, constant_op.constant([[0.0]])),
+                           {'training': True})
 
-    with self.cached_session() as sess:
-      self.evaluate(variables.global_variables_initializer())
-      self.assertAllEqual([[0.0, 0.0]], self.evaluate(x))
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllEqual([[0.0, 0.0]], self.evaluate(x))
 
   def test_converted_call_whitelisted_method_via_owner(self):
 
@@ -391,11 +426,28 @@ class ApiTest(test.TestCase):
     ])
 
     x = api.converted_call('call', model, opts,
-                           constant_op.constant([[0.0]]), training=True)
+                           (constant_op.constant([[0.0]]),), {'training': True})
 
-    with self.cached_session() as sess:
-      self.evaluate(variables.global_variables_initializer())
-      self.assertAllEqual([[0.0, 0.0]], self.evaluate(x))
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllEqual([[0.0, 0.0]], self.evaluate(x))
+
+  def test_converted_call_namedtuple(self):
+
+    opts = converter.ConversionOptions()
+
+    x = api.converted_call(collections.namedtuple, None, opts,
+                           ('TestNamedtuple', ('a', 'b')), {})
+
+    self.assertTrue(inspect_utils.isnamedtuple(x))
+
+  def test_converted_call_namedtuple_via_collections(self):
+
+    opts = converter.ConversionOptions()
+
+    x = api.converted_call('namedtuple', collections, opts, ('TestNamedtuple',
+                                                             ('a', 'b')), {})
+
+    self.assertTrue(inspect_utils.isnamedtuple(x))
 
   def test_converted_call_lambda(self):
 
@@ -403,11 +455,10 @@ class ApiTest(test.TestCase):
 
     l = lambda x: x == 0
 
-    x = api.converted_call(l, None, opts, constant_op.constant(0))
+    x = api.converted_call(l, None, opts, (constant_op.constant(0),), {})
 
-    with self.cached_session() as sess:
-      self.evaluate(variables.global_variables_initializer())
-      self.assertAllEqual(True, self.evaluate(x))
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllEqual(True, self.evaluate(x))
 
   @test_util.run_deprecated_v1
   def test_to_graph_basic(self):
@@ -439,6 +490,31 @@ class ApiTest(test.TestCase):
       x = compiled_fn(constant_op.constant([4, 8]))
       self.assertListEqual([1, 2], self.evaluate(x).tolist())
 
+  def test_to_graph_with_globals(self):
+
+    def test_fn(x):
+      global testing_global_numeric
+      testing_global_numeric = x + testing_global_numeric
+      return testing_global_numeric
+
+    # TODO(b/122368197)
+    with self.assertRaisesRegex(
+        errors.AutoGraphError, 'global keyword is not yet supported'):
+      api.to_graph(test_fn)
+
+  def test_to_graph_with_kwargs_clashing_converted_call(self):
+
+    def called_fn(**kwargs):
+      return kwargs['f'] + kwargs['owner']
+
+    def test_fn():
+      # These arg names intentionally match converted_call's
+      return called_fn(f=1, owner=2)
+
+    compiled_fn = api.to_graph(test_fn)
+
+    self.assertEqual(compiled_fn(), 3)
+
   def test_to_code_basic(self):
 
     def test_fn(x, s):
diff --git a/tensorflow/python/autograph/impl/conversion.py b/tensorflow/python/autograph/impl/conversion.py
index f8decd24e8e2eb5bcad22ba64d1865e8497363e3..4d93f02695f5b35704b96ccececc4d947ed8bb41 100644
--- a/tensorflow/python/autograph/impl/conversion.py
+++ b/tensorflow/python/autograph/impl/conversion.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import functools
 import imp
+import unittest
 
 import gast
 
@@ -33,7 +34,6 @@ from tensorflow.python.autograph.converters import call_trees
 from tensorflow.python.autograph.converters import conditional_expressions
 from tensorflow.python.autograph.converters import continue_statements
 from tensorflow.python.autograph.converters import control_flow
-from tensorflow.python.autograph.converters import decorators
 from tensorflow.python.autograph.converters import directives
 from tensorflow.python.autograph.converters import error_handlers
 from tensorflow.python.autograph.converters import function_scopes
@@ -44,18 +44,21 @@ from tensorflow.python.autograph.converters import side_effect_guards
 from tensorflow.python.autograph.converters import slices
 from tensorflow.python.autograph.core import config
 from tensorflow.python.autograph.core import converter
-from tensorflow.python.autograph.core import errors
+from tensorflow.python.autograph.core import errors as ag_errors
 from tensorflow.python.autograph.core import function_wrapping
+from tensorflow.python.autograph.core import unsupported_features_checker
 from tensorflow.python.autograph.lang import special_functions
 from tensorflow.python.autograph.pyct import ast_util
 from tensorflow.python.autograph.pyct import compiler
+from tensorflow.python.autograph.pyct import errors
 from tensorflow.python.autograph.pyct import inspect_utils
 from tensorflow.python.autograph.pyct import origin_info
 from tensorflow.python.autograph.pyct import parser
+from tensorflow.python.autograph.pyct import pretty_printer
 from tensorflow.python.autograph.pyct import qual_names
 from tensorflow.python.autograph.pyct import templates
 from tensorflow.python.autograph.pyct import transformer
-from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.autograph.utils import ag_logging as logging
 from tensorflow.python.util import tf_inspect
 
 
@@ -80,24 +83,74 @@ def is_whitelisted_for_graph(o):
     m = functools
   else:
     m = tf_inspect.getmodule(o)
-  for prefix, in config.DEFAULT_UNCOMPILED_MODULES:
-    if m.__name__.startswith(prefix):
+
+  if hasattr(m, '__name__'):
+    # Builtins typically have unnamed modules.
+    for prefix, in config.DEFAULT_UNCOMPILED_MODULES:
+      if m.__name__.startswith(prefix):
+        logging.log(2, 'Whitelisted: %s: name starts with "%s"', o, prefix)
+        return True
+
+    # Temporary -- whitelist tensorboard modules.
+    # TODO(b/122731813): Remove.
+    if m.__name__ == 'tensorboard' or '.tensorboard' in m.__name__:
+      logging.log(2, 'Whitelisted: %s: name contains "tensorboard"', o)
       return True
 
-  if hasattr(o, 'autograph_info__'):
+  if hasattr(o, 'autograph_info__') or hasattr(o, '__ag_compiled'):
+    logging.log(2, 'Whitelisted: %s: already converted', o)
     return True
 
+  if hasattr(o, '__call__'):
+    # Callable objects: whitelisted if their __call__ method is.
+    # The type check avoids infinite recursion around the __call__ method
+    # of function objects.
+    if (type(o) != type(o.__call__)) and is_whitelisted_for_graph(o.__call__):  # pylint: disable=unidiomatic-typecheck
+      logging.log(2, 'Whitelisted: %s: object __call__ whitelisted', o)
+      return True
+
+  owner_class = None
+  if tf_inspect.ismethod(o):
+    # Methods of whitelisted classes are also whitelisted, even if they are
+    # bound via user subclasses.
+    #
+    # For example, suppose `tf.Foo` has a method called `bar`, and `baz` is
+    # defined as below. `tf.Foo` is whitelisted. Then `baz.bar` is also
+    # whitelisted.
+    #
+    #   class Custom(tf.Foo):
+    #     pass
+    #
+    #   baz = Custom()
+    #
+    # For the example above, if `Custom` did overload `bar`, then it would no
+    # longer be whitelisted.
+
+    owner_class = inspect_utils.getmethodclass(o)
+    if owner_class is not None:
+      if issubclass(owner_class, unittest.TestCase):
+        logging.log(2, 'Whitelisted: %s: method of TestCase subclass', o)
+        return True
+
+      owner_class = inspect_utils.getdefiningclass(o, owner_class)
+      if is_whitelisted_for_graph(owner_class):
+        logging.log(2, 'Whitelisted: %s: owner is whitelisted %s', o,
+                    owner_class)
+        return True
+
   if inspect_utils.isnamedtuple(o):
     # Due to the way they're constructed, namedtuple types cannot be converted
     # because they don't expose source code. But we assume they are safe for
     # graph mode since they are just containers.
     if tf_inspect.isclass(o) and len(o.__bases__) > 1:
-      logging.log_first_n(
-          logging.level_warning(),
-          'Entity {} looks like a namedtuple subclass. If it has any custom'
-          ' methods, they will not be converted by AutoGraph.'.format(o), 1)
+      logging.warn(
+          'Entity {} looks like a namedtuple subclass. Its constructor will'
+          ' not be converted by AutoGraph, but if it has any custom methods,'
+          ' those will be.'.format(o), 1)
+    logging.log(2, 'Whitelisted: %s: named tuple', o)
     return True
 
+  logging.log(2, 'Not whitelisted: %s: default rule', o)
   return False
 
 
@@ -129,8 +182,7 @@ def entity_to_graph(o, program_ctx, arg_values, arg_types):
   Raises:
     ValueError: if the entity type is not supported.
   """
-  if program_ctx.options.verbose == converter.Verbosity.VERBOSE:
-    logging.info('Converting {}'.format(o))
+  logging.log(1, 'Converting %s', o)
 
   if tf_inspect.isclass(o):
     node, name, ns = class_to_graph(o, program_ctx)
@@ -164,9 +216,13 @@ def entity_to_graph(o, program_ctx, arg_values, arg_types):
 
   program_ctx.add_to_cache(o, node)
 
-  if program_ctx.options.verbose == converter.Verbosity.VERBOSE:
-    logging.info('Compiled output of {}:\n\n{}\n'.format(
-        o, compiler.ast_to_source(node)))
+  if logging.has_verbosity(2):
+    logging.log(2, 'Compiled output of %s:\n\n%s\n', o,
+                compiler.ast_to_source(node))
+  if logging.has_verbosity(4):
+    for n in node:
+      logging.log(4, 'Compiled AST of %s:\n\n%s\n\n', o,
+                  pretty_printer.fmt(n, color=False))
 
   if program_ctx.options.recursive:
     while True:
@@ -278,10 +334,12 @@ def _add_self_references(namespace, autograph_module):
     # internal modules.
     ag_internal = imp.new_module('autograph')
     ag_internal.__dict__.update(autograph_module.__dict__)
+    ag_internal.ConversionOptions = converter.ConversionOptions
+    ag_internal.Feature = converter.Feature
     ag_internal.utils = utils
     ag_internal.function_scope = function_wrapping.function_scope
     ag_internal.rewrite_graph_construction_error = (
-        errors.rewrite_graph_construction_error)
+        ag_errors.rewrite_graph_construction_error)
     # TODO(mdan): Add safeguards against name clashes.
     # We don't want to create a submodule because we want the operators to be
     # accessible as ag__.<operator>
@@ -299,27 +357,23 @@ def function_to_graph(f,
   """Specialization of `entity_to_graph` for callable functions."""
 
   node, source = parser.parse_entity(f)
+  logging.log(3, 'Source code of %s:\n\n%s\n', f, source)
   node = node.body[0]
 
-  # In general, the output of inspect.getsource is inexact because it uses
-  # regex matching to adjust the exact location around the line number that
-  # CPython records. This is particularly problematic for lambda functions,
-  # where the entire containing lines are returned.
-  nodes = ast_util.find_matching_definitions(node, f)
-  if len(nodes) != 1:
-    if f.__name__ == '<lambda>':
+  # In general, the output of inspect.getsource is inexact for lambdas because
+  # it uses regex matching to adjust the exact location around the line number
+  # that CPython records. Then, the entire containing line is returned, which
+  # we may have trouble disambiguating. For example:
+  # x, y = lambda: 1, lambda: 2
+  if f.__name__ == '<lambda>':
+    nodes = ast_util.find_matching_definitions(node, f)
+    if len(nodes) != 1:
       raise ValueError(
           'Unable to identify source code of lambda function {}. It was'
           ' defined on this line: {}, which must contain a single lambda with'
           ' matching signature. To avoid ambiguity, define each lambda'
           ' in a separate expression.'.format(f, source))
-    else:
-      raise ValueError(
-          'Unable to identify source code of function {}. The source code'
-          ' reported by Python did not include exactly one matching signature:'
-          '\n{}\n. This is an extremely rare occurrence. Please report it to'
-          ' the TensorFlow team.'.format(f, source))
-  node, = nodes
+    node, = nodes
 
   # TODO(znado): Place inside standard_analysis.
   origin_info.resolve(node, source, f)
@@ -335,7 +389,12 @@ def function_to_graph(f,
       arg_types=arg_types,
       owner_type=owner_type)
   context = converter.EntityContext(namer, entity_info, program_ctx)
-  node = node_to_graph(node, context)
+  try:
+    node = node_to_graph(node, context)
+  except (ValueError, AttributeError, KeyError, NotImplementedError) as e:
+    logging.error(1, 'Error converting %s', f, exc_info=True)
+    raise errors.InternalError('conversion', e)
+    # TODO(mdan): Catch and rethrow syntax errors.
 
   if isinstance(node, gast.Lambda):
     new_name = namer.new_symbol('tf__lambda', ())
@@ -372,19 +431,18 @@ def node_to_graph(node, context):
             dependencies that this node has.
   """
   # TODO(mdan): Insert list_comprehensions somewhere.
+  unsupported_features_checker.verify(node)
 
   node = converter.standard_analysis(node, context, is_initial=True)
   # Past this point, line numbers are no longer accurate so we ignore the
   # source.
   # TODO(mdan): Is it feasible to reconstruct intermediate source code?
   context.info.source_code = None
-
-  if context.program.options.uses(converter.Feature.DECORATORS):
-    node = converter.apply_(node, context, decorators)
   node = converter.apply_(node, context, arg_defaults)
   node = converter.apply_(node, context, directives)
   node = converter.apply_(node, context, break_statements)
-  node = converter.apply_(node, context, asserts)
+  if context.program.options.uses(converter.Feature.ASSERT_STATEMENTS):
+    node = converter.apply_(node, context, asserts)
   # Note: sequencing continue canonicalization before for loop one avoids
   # dealing with the extra loop increment operation that the for
   # canonicalization creates.
@@ -393,11 +451,13 @@ def node_to_graph(node, context):
   if context.program.options.uses(converter.Feature.LISTS):
     node = converter.apply_(node, context, lists)
     node = converter.apply_(node, context, slices)
-  node = converter.apply_(node, context, builtin_functions)
+  if context.program.options.uses(converter.Feature.BUILTIN_FUNCTIONS):
+    node = converter.apply_(node, context, builtin_functions)
   node = converter.apply_(node, context, call_trees)
   node = converter.apply_(node, context, control_flow)
   node = converter.apply_(node, context, conditional_expressions)
-  node = converter.apply_(node, context, logical_expressions)
+  if context.program.options.uses(converter.Feature.LOGICAL_EXPRESSIONS):
+    node = converter.apply_(node, context, logical_expressions)
   if context.program.options.uses(converter.Feature.AUTO_CONTROL_DEPS):
     node = converter.apply_(node, context, side_effect_guards)
   # TODO(mdan): If function scopes ever does more, the toggle will need moving.
diff --git a/tensorflow/python/autograph/impl/conversion_test.py b/tensorflow/python/autograph/impl/conversion_test.py
index 9a4fbdad8c1994d8c8cc534b6e0b4af45f5c4c80..ddda4089fd89465d4915963e7d57143e89ed2aef 100644
--- a/tensorflow/python/autograph/impl/conversion_test.py
+++ b/tensorflow/python/autograph/impl/conversion_test.py
@@ -92,12 +92,9 @@ class ConversionTest(test.TestCase):
     conversion.entity_to_graph(f, program_ctx, None, None)
 
     self.assertTrue(f in program_ctx.dependency_cache)
-    self.assertTrue(g in program_ctx.dependency_cache)
+    self.assertFalse(g in program_ctx.dependency_cache)
     f_node = program_ctx.dependency_cache[f][0]
-    g_node = program_ctx.dependency_cache[g][0]
     self.assertEqual('tf__f', f_node.name)
-    self.assertEqual('tf__g', f_node.body[0].body[0].body[0].value.func.id)
-    self.assertEqual('tf__g', g_node.name)
 
   def test_entity_to_graph_class_hierarchy(self):
 
diff --git a/tensorflow/python/autograph/operators/BUILD b/tensorflow/python/autograph/operators/BUILD
index aedb901845b97bbee5918902875b5023a8604dcd..07c860aa3f05876be81c25407407356164db917d 100644
--- a/tensorflow/python/autograph/operators/BUILD
+++ b/tensorflow/python/autograph/operators/BUILD
@@ -26,6 +26,7 @@ py_library(
         "logical.py",
         "py_builtins.py",
         "slices.py",
+        "special_values.py",
     ],
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:__subpackages__"],
@@ -38,6 +39,7 @@ py_library(
         "//tensorflow/python:list_ops",
         "//tensorflow/python:tensor_array_ops",
         "//tensorflow/python:tensor_util",
+        "//tensorflow/python:util",
         "//tensorflow/python:variables",
         "//tensorflow/python/autograph/utils",
         "//tensorflow/python/data/ops:dataset_ops",
@@ -104,3 +106,13 @@ py_test(
         "//tensorflow/python:client_testlib",
     ],
 )
+
+py_test(
+    name = "special_values_test",
+    srcs = ["special_values_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":operators",
+        "//tensorflow/python:client_testlib",
+    ],
+)
diff --git a/tensorflow/python/autograph/operators/__init__.py b/tensorflow/python/autograph/operators/__init__.py
index 7a580fe32475cbc32f20a1196c075fbf7f981d27..5b3f45de056bf0354c3864aa51fd485fbc891624 100644
--- a/tensorflow/python/autograph/operators/__init__.py
+++ b/tensorflow/python/autograph/operators/__init__.py
@@ -52,6 +52,7 @@ from tensorflow.python.autograph.operators.logical import eq
 from tensorflow.python.autograph.operators.logical import gt
 from tensorflow.python.autograph.operators.logical import gt_e
 from tensorflow.python.autograph.operators.logical import in_
+from tensorflow.python.autograph.operators.logical import invert
 from tensorflow.python.autograph.operators.logical import is_
 from tensorflow.python.autograph.operators.logical import is_not
 from tensorflow.python.autograph.operators.logical import lt
@@ -60,6 +61,7 @@ from tensorflow.python.autograph.operators.logical import not_
 from tensorflow.python.autograph.operators.logical import not_eq
 from tensorflow.python.autograph.operators.logical import not_in
 from tensorflow.python.autograph.operators.logical import or_
+from tensorflow.python.autograph.operators.logical import u_add
 from tensorflow.python.autograph.operators.logical import u_sub
 from tensorflow.python.autograph.operators.py_builtins import float_
 from tensorflow.python.autograph.operators.py_builtins import int_
@@ -69,3 +71,5 @@ from tensorflow.python.autograph.operators.py_builtins import range_
 from tensorflow.python.autograph.operators.slices import get_item
 from tensorflow.python.autograph.operators.slices import GetItemOpts
 from tensorflow.python.autograph.operators.slices import set_item
+from tensorflow.python.autograph.operators.special_values import is_undefined
+from tensorflow.python.autograph.operators.special_values import Undefined
diff --git a/tensorflow/python/autograph/operators/control_flow.py b/tensorflow/python/autograph/operators/control_flow.py
index 89f7b8522f569542fa935877cdd9de6a9797c2c4..adfde4623dd0c842a747a0d50272cfc8961fab24 100644
--- a/tensorflow/python/autograph/operators/control_flow.py
+++ b/tensorflow/python/autograph/operators/control_flow.py
@@ -19,10 +19,12 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.autograph.operators import py_builtins
+from tensorflow.python.autograph.operators import special_values
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.util import nest
 
 
 def for_stmt(iter_, extra_test, body, init_state):
@@ -61,6 +63,17 @@ def for_stmt(iter_, extra_test, body, init_state):
   if tensor_util.is_tensor(iter_):
     return _known_len_for_stmt(iter_, extra_test, body, init_state)
   elif isinstance(iter_, dataset_ops.DatasetV2):
+    # Check for undefined symbols and report an error. This prevents the error
+    # from propagating into the TF runtime. We have more information here and
+    # can provide a clearer error message.
+    undefined_symbols = _filter_undefined(init_state)
+
+    if undefined_symbols:
+      raise ValueError(
+          'TensorFlow requires that the following symbols must be initialized '
+          'to a Tensor, Variable or TensorArray before the loop: {}'
+          .format(tuple(undefined_symbols)))
+
     return _dataset_for_stmt(iter_, extra_test, body, init_state)
   else:
     return _py_for_stmt(iter_, extra_test, body, init_state)
@@ -70,13 +83,9 @@ def _py_for_stmt(iter_, extra_test, body, init_state):
   """Overload of for_stmt that executes a Python for loop."""
   state = init_state
   for target in iter_:
-    if not extra_test(*state):
+    if extra_test is not None and not extra_test(*state):
       break
     state = body(target, *state)
-
-  # TODO(mdan): Remove this special case.
-  if len(state) == 1:
-    return state[0]
   return state
 
 
@@ -87,10 +96,17 @@ def _known_len_for_stmt(iter_, extra_test, body, init_state):
   def while_body(iterate_index, *state):
     iterate = iter_[iterate_index]
     new_state = body(iterate, *state)
-    return (iterate_index + 1,) + new_state
+
+    state = (iterate_index + 1,)
+    if new_state:
+      state += new_state
+
+    return state
 
   def while_cond(iterate_index, *state):
-    return gen_math_ops.logical_and(iterate_index < n, extra_test(*state))
+    if extra_test is not None:
+      return gen_math_ops.logical_and(iterate_index < n, extra_test(*state))
+    return iterate_index < n
 
   results = while_stmt(
       while_cond,
@@ -98,21 +114,26 @@ def _known_len_for_stmt(iter_, extra_test, body, init_state):
       init_state=(0,) + init_state,
       extra_deps=(iter_,),
       opts=dict(maximum_iterations=n))
+
   # Dropping the iteration index because it's not syntactically visible.
   # TODO(mdan): Don't.
-  results = results[1:]
+  if isinstance(results, (tuple, list)):
+    assert len(results) >= 1  # Has at least the iterate.
+    if len(results) > 1:
+      results = results[1:]
+  else:
+    results = ()
 
-  # TODO(mdan): Remove this special case.
-  if len(results) == 1:
-    return results[0]
   return results
 
 
 def _dataset_for_stmt(ds, extra_test, body, init_state):
   """Overload of for_stmt that iterates over TF Datasets."""
-  if extra_test(*init_state) is not True:
+
+  if extra_test is not None:
     raise NotImplementedError(
-        'break statements are not yet supported in for/Dataset loops')
+        'break and return statements are not yet supported in '
+        'for/Dataset loops.')
 
   def reduce_body(state, iterate):
     new_state = body(iterate, *state)
@@ -120,9 +141,6 @@ def _dataset_for_stmt(ds, extra_test, body, init_state):
 
   results = ds.reduce(init_state, reduce_body)
 
-  # TODO(mdan): Remove this special case.
-  if len(results) == 1:
-    return results[0]
   return results
 
 
@@ -151,17 +169,43 @@ def while_stmt(test, body, init_state, extra_deps, opts=None):
   # TODO(mdan): Consider adding a generic mechanism for dynamic dispatch.
   # That could be something as simple as a collection of dispatch rules, with
   # some prioritization.
-  if any(tensor_util.is_tensor(v) for v in init_state + extra_deps):
+  if any(tensor_util.is_tensor(v) for v in nest.flatten(extra_deps)):
+    # Check for undefined symbols and report an error. This prevents the error
+    # from propagating into the TF runtime. We have more information here and
+    # can provide a clearer error message.
+    undefined_symbols = _filter_undefined(init_state)
+
+    if undefined_symbols:
+      raise ValueError(
+          'TensorFlow requires that the following symbols must be initialized '
+          'to a Tensor, Variable or TensorArray before the loop: {}'
+          .format(tuple(undefined_symbols)))
     return _tf_while_stmt(test, body, init_state, opts)
   else:
     return _py_while_stmt(test, body, init_state, opts)
 
 
+def _filter_undefined(all_symbols):
+  """Returns the names of undefined symbols contained in all_symbols."""
+  undefined_symbols = [
+      s.symbol_name
+      for s in all_symbols
+      if special_values.is_undefined(s)
+  ]
+  return undefined_symbols
+
+
 def _tf_while_stmt(test, body, init_state, opts):
   """Overload of while_stmt that stages a TF while_stmt."""
   if opts is None:
     opts = {}
-  return control_flow_ops.while_loop(test, body, init_state, **opts)
+
+  # Non-v2 while_loop unpacks the results when there is only one return value.
+  # This enforces consistency across versions.
+  opts['return_same_structure'] = True
+
+  retval = control_flow_ops.while_loop(test, body, init_state, **opts)
+  return retval
 
 
 def _py_while_stmt(test, body, init_state, opts):
@@ -194,7 +238,33 @@ def if_stmt(cond, body, orelse):
 
 def tf_if_stmt(cond, body, orelse):
   """Overload of if_stmt that stages a TF cond."""
-  return control_flow_ops.cond(cond, body, orelse)
+  protected_body = _wrap_in_protection_from_undefined(body, branch_name='if')
+  protected_orelse = _wrap_in_protection_from_undefined(orelse,
+                                                        branch_name='else')
+
+  return control_flow_ops.cond(cond, protected_body, protected_orelse)
+
+
+def _wrap_in_protection_from_undefined(func, branch_name):
+  """Wraps function to raise useful error when it returns undefined symbols."""
+  def protected_func():
+    """Calls function and raises an error if undefined symbols are returned."""
+    results = func()
+    undefined_symbols = None
+    if isinstance(results, tuple):
+      undefined_symbols = _filter_undefined(results)
+    elif special_values.is_undefined(results):
+      # Single return value
+      undefined_symbols = results.symbol_name
+
+    if undefined_symbols:
+      message = ('The following symbols must also be initialized in the %s '
+                 'branch: {}. Alternatively, you may initialize them before '
+                 'the if statement.') % branch_name
+      message = message.format(undefined_symbols)
+      raise ValueError(message)
+    return results
+  return protected_func
 
 
 def _py_if_stmt(cond, body, orelse):
diff --git a/tensorflow/python/autograph/operators/control_flow_test.py b/tensorflow/python/autograph/operators/control_flow_test.py
index 0a7d4b64022f583bae4effc7d0f7eb04f46cc048..c06c4bcb97034dc79415df274862be51dfabd525 100644
--- a/tensorflow/python/autograph/operators/control_flow_test.py
+++ b/tensorflow/python/autograph/operators/control_flow_test.py
@@ -36,7 +36,7 @@ class ForLoopTest(test.TestCase):
         extra_test=lambda s: True,
         body=lambda i, s: (s + i,),
         init_state=(0,))
-    with self.cached_session() as sess:
+    with self.cached_session():
       self.assertEqual((10,), self.evaluate(s))
 
   def test_python(self):
@@ -45,17 +45,17 @@ class ForLoopTest(test.TestCase):
         extra_test=lambda s: True,
         body=lambda i, s: (s + i,),
         init_state=(0,))
-    self.assertEqual(10, s)
+    self.assertEqual((10,), s)
 
   @test_util.run_deprecated_v1
   def test_dataset(self):
     to_int32 = lambda i: math_ops.cast(i, dtypes.int32)
     s = control_flow.for_stmt(
         dataset_ops.Dataset.range(5).map(to_int32),
-        extra_test=lambda s: True,
+        None,
         body=lambda i, s: (s + i,),
         init_state=(0,))
-    with self.cached_session() as sess:
+    with self.cached_session():
       self.assertEqual((10,), self.evaluate(s))
 
 
@@ -69,8 +69,31 @@ class WhileLoopTest(test.TestCase):
         body=lambda i, s: (i + 1, s + i,),
         init_state=(0, 0),
         extra_deps=(n,))
-    with self.cached_session() as sess:
-      self.assertEqual((5, 10), self.evaluate(results))
+    self.assertEqual((5, 10), self.evaluate(results))
+
+  @test_util.run_deprecated_v1
+  def test_python_with_tensor_state(self):
+    n = 5
+    results = control_flow.while_stmt(
+        test=lambda i, s: i < n,
+        body=lambda i, s: (i + 1, s + i),
+        init_state=(0, constant_op.constant(0)),
+        extra_deps=())
+    result_i, result_s = results
+    self.assertEqual(5, result_i)
+    self.assertEqual(10, self.evaluate(result_s))
+
+  @test_util.run_deprecated_v1
+  def test_python_due_to_hidden_cond_type(self):
+    n = 5
+
+    # TODO(b/124002646): Improve the error message.
+    with self.assertRaises(Exception):
+      control_flow.while_stmt(
+          test=lambda i, s: i < n,
+          body=lambda i, s: (i + 1, s + i),
+          init_state=(constant_op.constant(0), constant_op.constant(0)),
+          extra_deps=())
 
   def test_python(self):
     n = 5
@@ -93,7 +116,7 @@ class IfStmtTest(test.TestCase):
 
   @test_util.run_deprecated_v1
   def test_tensor(self):
-    with self.cached_session() as sess:
+    with self.cached_session():
       t = self.single_return_if_stmt(constant_op.constant(True))
       self.assertEqual(1, self.evaluate(t))
       t = self.single_return_if_stmt(constant_op.constant(False))
@@ -105,7 +128,7 @@ class IfStmtTest(test.TestCase):
 
   @test_util.run_deprecated_v1
   def test_tensor_multiple_returns(self):
-    with self.cached_session() as sess:
+    with self.cached_session():
       t = self.multi_return_if_stmt(constant_op.constant(True))
       self.assertAllEqual([1, 2], self.evaluate(t))
       t = self.multi_return_if_stmt(constant_op.constant(False))
diff --git a/tensorflow/python/autograph/operators/logical.py b/tensorflow/python/autograph/operators/logical.py
index 569db5b91bd7efb92ce2b8a8b8eb6eb773f4abcb..cafb0583e8f66841f0d905f5d98bfc3cb1780513 100644
--- a/tensorflow/python/autograph/operators/logical.py
+++ b/tensorflow/python/autograph/operators/logical.py
@@ -18,15 +18,32 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import operator
+
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_math_ops
 
 
+# Note: the implementations in this file are split into very small-grained
+# functions in preparation for the factoring out the more generic pyct library.
+# At that time, the py_* and tf_* functions will reside in different libraries.
+
+
 def not_(a):
   """Functional form of "not"."""
   if tensor_util.is_tensor(a):
-    return gen_math_ops.logical_not(a)
+    return _tf_not(a)
+  return _py_not(a)
+
+
+def _tf_not(a):
+  """Implementation of the "not_" operator for TensorFlow."""
+  return gen_math_ops.logical_not(a)
+
+
+def _py_not(a):
+  """Default Python implementation of the "not_" operator."""
   return not a
 
 
@@ -90,50 +107,28 @@ def not_eq(a, b):
   return not_(eq(a, b))
 
 
-# Default implementation for the remainings.
-
-
-def gt(a, b):
-  """Functional form of "less-than"."""
-  return a > b
-
+# Default implementation for the rest.
 
-def gt_e(a, b):
-  """Functional form of "less-than"."""
-  return a >= b
-
-
-def is_(a, b):
-  """Functional form of "less-than"."""
-  return a is b
-
-
-def is_not(a, b):
-  """Functional form of "less-than"."""
-  return a is not b
+is_ = operator.is_
+is_not = operator.is_not
 
 
 def in_(a, b):
-  """Functional form of "less-than"."""
+  """Functional form of "in"."""
   # TODO(mdan): in and not_in should probably be convertible for some types.
   return a in b
 
 
-def lt(a, b):
-  """Functional form of "less-than"."""
-  return a < b
-
-
-def lt_e(a, b):
-  """Functional form of "less-than"."""
-  return a <= b
-
-
 def not_in(a, b):
-  """Functional form of "less-than"."""
+  """Functional form of "not-in"."""
   return a not in b
 
+gt = operator.gt
+gt_e = operator.ge
+lt = operator.lt
+lt_e = operator.le
+
 
-def u_sub(a):
-  """Functional form of "unary-sub"."""
-  return -a
+u_add = operator.pos
+u_sub = operator.neg
+invert = operator.invert
diff --git a/tensorflow/python/autograph/operators/py_builtins.py b/tensorflow/python/autograph/operators/py_builtins.py
index ddf05f73f37821c6ff7e246051cd82a560f370e3..fe9486ca1ed41ce55f2219b3771639eb081a6afe 100644
--- a/tensorflow/python/autograph/operators/py_builtins.py
+++ b/tensorflow/python/autograph/operators/py_builtins.py
@@ -37,7 +37,7 @@ from tensorflow.python.ops import list_ops
 from tensorflow.python.ops import math_ops
 
 
-UNDEFINED = object()
+UNSPECIFIED = object()
 
 
 def overload_of(f):
@@ -77,14 +77,14 @@ def _py_float(x):
   return float(x)
 
 
-def int_(x=0, base=UNDEFINED):
+def int_(x=0, base=UNSPECIFIED):
   if tensor_util.is_tensor(x):
     return _tf_int(x, base)
   return _py_int(x, base)
 
 
 def _tf_int(x, base):
-  if base not in (10, UNDEFINED):
+  if base not in (10, UNSPECIFIED):
     raise NotImplementedError('base {} not supported for int'.format(base))
 
   # TODO(mdan): We shouldn't assume int32.
@@ -94,7 +94,7 @@ def _tf_int(x, base):
 
 
 def _py_int(x, base):
-  if base is UNDEFINED:
+  if base is UNSPECIFIED:
     return int(x)
   return int(x, base)
 
@@ -155,19 +155,28 @@ def _py_len(s):
 
 
 def print_(*objects, **kwargs):
+  """Overload of the print builtin."""
   # Note: Python 2.6 doesn't support explicit keywords after starargs.
   unknown_kwargs = tuple(
       set(kwargs.keys()) - set(('sep', 'end', 'file', 'flush')))
   if unknown_kwargs:
     raise ValueError('invalid keyword arguments: {}'.format(unknown_kwargs))
 
-  # TODO(mdan): use logging_ops.Print when py_func is not supported.
-  return _tf_py_func_print(objects, kwargs)
+  # TODO(mdan): Use next.flatten(objects) instead?
+  if any(tensor_util.is_tensor(o) for o in objects):
+    # TODO(mdan): use tf.print instead.
+    return _tf_py_func_print(objects, kwargs)
+  else:
+    _py_print(*objects, **kwargs)
+
+
+def _py_print(*objects, **kwargs):
+  print(*objects, **kwargs)
 
 
 def _tf_py_func_print(objects, kwargs):
   """Overload of print_ as a py_func implementation."""
-  override_kwargs = {k: v for k, v in kwargs.items() if v is not UNDEFINED}
+  override_kwargs = {k: v for k, v in kwargs.items() if v is not UNSPECIFIED}
   if 'flush' not in override_kwargs:
     # Defaulting to flushing the console in graph mode, which helps reduce
     # garbled output in IPython.
@@ -187,7 +196,7 @@ def _tf_py_func_print(objects, kwargs):
       print_wrapper, None, objects, use_dummy_return=True)
 
 
-def range_(start_or_stop, stop=UNDEFINED, step=UNDEFINED):
+def range_(start_or_stop, stop=UNSPECIFIED, step=UNSPECIFIED):
   if any(tensor_util.is_tensor(s) for s in (start_or_stop, stop, step)):
     return _tf_range(start_or_stop, stop, step)
   return _py_range(start_or_stop, stop, step)
@@ -200,10 +209,10 @@ def _tf_range(start_or_stop, stop, step):
   # graph construction error aligns the semantics with Python.
 
   # TODO(mdan): We should optimize this when a full tensor is not required.
-  if step is not UNDEFINED:
+  if step is not UNSPECIFIED:
     # TODO(mdan): Add argument coercion similar to other cases.
     return math_ops.range(start_or_stop, stop, step)
-  if stop is not UNDEFINED:
+  if stop is not UNSPECIFIED:
     stop = math_ops.maximum(start_or_stop, stop)
     return math_ops.range(start_or_stop, stop)
   start_or_stop = math_ops.maximum(start_or_stop, 0)
@@ -211,9 +220,9 @@ def _tf_range(start_or_stop, stop, step):
 
 
 def _py_range(start_or_stop, stop, step):
-  if step is not UNDEFINED:
+  if step is not UNSPECIFIED:
     return range(start_or_stop, stop, step)
-  if stop is not UNDEFINED:
+  if stop is not UNSPECIFIED:
     return range(start_or_stop, stop)
   return range(start_or_stop)
 
diff --git a/tensorflow/python/autograph/operators/special_values.py b/tensorflow/python/autograph/operators/special_values.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c1b3d1f30b36c98b969e92bd2587ab62fbfc2a9
--- /dev/null
+++ b/tensorflow/python/autograph/operators/special_values.py
@@ -0,0 +1,65 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities used to capture Python idioms."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+class Undefined(object):
+  """Represents an undefined symbol in Python.
+
+  This is used to reify undefined symbols, which is required to use the
+  functional form of loops.
+  Example:
+
+    while n > 0:
+      n = n - 1
+      s = n
+    return s  # Runtime error if n == 0
+
+  This is valid Python code and will not result in an error as long as n
+  is positive. The use of this class is to stay as close to Python semantics
+  as possible for staged code of this nature.
+
+  Converted version of the above showing the possible usage of this class:
+
+    s = Undefined('s')
+    init_state = (s,)
+    s = while_loop(cond, body, init_state)
+    return s  # s is an instance of Undefined if the loop never runs
+
+  Attributes:
+    symbol_name: Text, identifier for the undefined symbol
+  """
+
+  def __init__(self, symbol_name):
+    self.symbol_name = symbol_name
+
+
+def is_undefined(value):
+  """Checks whether Autograph has determined that a given value is undefined.
+
+  This only works in places where Autograph reifies undefined symbols. Note that
+  if this function is passed a truly undefined symbol the call-site will raise
+  NameError.
+
+  Args:
+    value: value to test for undefinedness
+  Returns:
+    Boolean, whether the input value is undefined.
+  """
+  return isinstance(value, Undefined)
diff --git a/tensorflow/python/autograph/operators/special_values_test.py b/tensorflow/python/autograph/operators/special_values_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e1e087a9f3f586b646c9a73877d9bb4470c6f3e
--- /dev/null
+++ b/tensorflow/python/autograph/operators/special_values_test.py
@@ -0,0 +1,38 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for python_lang_utils module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.autograph.operators import special_values
+from tensorflow.python.platform import test
+
+
+class SpecialValuesTest(test.TestCase):
+
+  def test_undefined(self):
+    undefined_symbol = special_values.Undefined('name')
+    self.assertEqual(undefined_symbol.symbol_name, 'name')
+
+    undefined_symbol2 = special_values.Undefined('name')
+    self.assertNotEqual(undefined_symbol, undefined_symbol2)
+
+    self.assertTrue(special_values.is_undefined(undefined_symbol))
+    self.assertTrue(special_values.is_undefined(undefined_symbol2))
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/autograph/pyct/BUILD b/tensorflow/python/autograph/pyct/BUILD
index ba8ec271394981ec878473205a8dbbd19d255f3b..e6bff2d719fa9921a42d0e57453af5f727a740e8 100644
--- a/tensorflow/python/autograph/pyct/BUILD
+++ b/tensorflow/python/autograph/pyct/BUILD
@@ -24,6 +24,7 @@ py_library(
         "ast_util.py",
         "cfg.py",
         "compiler.py",
+        "errors.py",
         "inspect_utils.py",
         "origin_info.py",
         "parser.py",
diff --git a/tensorflow/python/autograph/pyct/ast_util.py b/tensorflow/python/autograph/pyct/ast_util.py
index ea7eca6463a17d43f1a3536ebdd1770cfcf265f7..b091285cab6f2f643d7f99f3063a903c1e5efdb8 100644
--- a/tensorflow/python/autograph/pyct/ast_util.py
+++ b/tensorflow/python/autograph/pyct/ast_util.py
@@ -200,7 +200,8 @@ def matches(node, pattern):
     bool
   """
   if isinstance(pattern, str):
-    pattern = parser.parse_expression(pattern)
+    pattern, = parser.parse_str(pattern).body
+
   matcher = PatternMatcher(pattern)
   matcher.visit(node)
   return matcher.matches
@@ -282,13 +283,18 @@ def parallel_walk(node, other):
     n = node_stack.pop()
     o = other_stack.pop()
 
-    if (not isinstance(n, (ast.AST, gast.AST)) or
-        not isinstance(o, (ast.AST, gast.AST)) or
+    if (not isinstance(n, (ast.AST, gast.AST, str)) or
+        not isinstance(o, (ast.AST, gast.AST, str)) or
         n.__class__.__name__ != o.__class__.__name__):
-      raise ValueError('inconsistent nodes: {} and {}'.format(n, o))
+      raise ValueError('inconsistent nodes: {} ({}) and {} ({})'.format(
+          n, n.__class__.__name__, o, o.__class__.__name__))
 
     yield n, o
 
+    if isinstance(n, str):
+      assert isinstance(o, str), 'The check above should have ensured this'
+      continue
+
     for f in n._fields:
       n_child = getattr(n, f, None)
       o_child = getattr(o, f, None)
@@ -314,8 +320,8 @@ def parallel_walk(node, other):
                 f, n_child, o_child))
 
 
-class FunctionDefMatcher(gast.NodeVisitor):
-  """Finds nodes that match a given function's signature."""
+class LambdaDefinitionMatcher(gast.NodeVisitor):
+  """Finds lambda nodes that match a given lambda's signature."""
 
   def __init__(self, fn):
     self.fn = fn
@@ -358,18 +364,8 @@ class FunctionDefMatcher(gast.NodeVisitor):
 
     self.matching_nodes.append(node)
 
-  def visit_FunctionDef(self, node):
-    self.generic_visit(node)
-
-    if self.fn.__name__ != node.name:
-      return
-    if not self._argspec_matches(node):
-      return
-
-    self.matching_nodes.append(node)
-
 
 def find_matching_definitions(node, f):
-  matcher = FunctionDefMatcher(f)
+  matcher = LambdaDefinitionMatcher(f)
   matcher.visit(node)
   return tuple(matcher.matching_nodes)
diff --git a/tensorflow/python/autograph/pyct/ast_util_test.py b/tensorflow/python/autograph/pyct/ast_util_test.py
index 9fcbbe646c6e558b93fdafb6380ae0a46ee1d60a..c6c1132dd64351dde9274d8d3c408d4bc4988b03 100644
--- a/tensorflow/python/autograph/pyct/ast_util_test.py
+++ b/tensorflow/python/autograph/pyct/ast_util_test.py
@@ -159,11 +159,20 @@ class AstUtilTest(test.TestCase):
     })
 
   def test_parallel_walk(self):
-    node = parser.parse_str(
-        textwrap.dedent("""
+    src = """
       def f(a):
         return a + 1
-    """))
+    """
+    node = parser.parse_str(textwrap.dedent(src))
+    for child_a, child_b in ast_util.parallel_walk(node, node):
+      self.assertEqual(child_a, child_b)
+
+  def test_parallel_walk_string_leaves(self):
+    src = """
+      def f(a):
+        global g
+    """
+    node = parser.parse_str(textwrap.dedent(src))
     for child_a, child_b in ast_util.parallel_walk(node, node):
       self.assertEqual(child_a, child_b)
 
@@ -230,70 +239,6 @@ class AstUtilTest(test.TestCase):
     nodes = ast_util.find_matching_definitions(node, f)
     self.assertLambdaNodes(nodes, ('(2)',))
 
-  def assertFunctionDefNodes(self, matching_nodes, expected_bodies):
-    self.assertEqual(len(matching_nodes), len(expected_bodies))
-    for node in matching_nodes:
-      self.assertIsInstance(node, gast.FunctionDef)
-      self.assertIn(compiler.ast_to_source(node.body).strip(), expected_bodies)
-
-  def test_find_matching_definitions_function(self):
-    node = parser.parse_str(
-        textwrap.dedent("""
-      def f(x):
-        return 1
-    """))
-
-    def f(x):
-      return x
-
-    nodes = ast_util.find_matching_definitions(node, f)
-    self.assertFunctionDefNodes(nodes, ('return 1',))
-
-  def test_find_matching_definitions_nested_functions_same_name(self):
-    node = parser.parse_str(
-        textwrap.dedent("""
-      def f(x, *args, **kwargs):
-        def f(x, y):
-          return 1
-        return 2
-    """))
-
-    def f(x, y):
-      return x + y
-
-    nodes = ast_util.find_matching_definitions(node, f)
-    self.assertFunctionDefNodes(nodes, ('return 1',))
-
-  def test_find_matching_definitions_nested_functions_same_args(self):
-    node = parser.parse_str(
-        textwrap.dedent("""
-      def g(x):
-        def f(x):
-          return 1
-        return 2
-    """))
-
-    def f(x):
-      return x
-
-    nodes = ast_util.find_matching_definitions(node, f)
-    self.assertFunctionDefNodes(nodes, ('return 1',))
-
-  def test_find_matching_definitions_multiple_matches(self):
-    node = parser.parse_str(
-        textwrap.dedent("""
-      def f(x):
-        return 1
-      def f(x):
-        return 2
-    """))
-
-    def f(x):
-      return x
-
-    nodes = ast_util.find_matching_definitions(node, f)
-    self.assertFunctionDefNodes(nodes, ('return 1', 'return 2'))
-
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/autograph/pyct/cfg.py b/tensorflow/python/autograph/pyct/cfg.py
index fdfcd4dcc15b0c6238dcdc3fedef60f2984c33a4..0cedfa84ab33cc3d0931c3998214b017a6907cfc 100644
--- a/tensorflow/python/autograph/pyct/cfg.py
+++ b/tensorflow/python/autograph/pyct/cfg.py
@@ -393,6 +393,8 @@ class GraphBuilder(object):
   def _connect_jump_to_finally_sections(self, node):
     """Connects a jump node to the finally sections protecting it."""
     cursor = set((node,))
+    if node not in self.finally_sections:
+      return cursor
     for guard_section_id in self.finally_sections[node]:
       guard_begin, guard_ends = self.finally_section_subgraphs[guard_section_id]
       self._connect_nodes(cursor, guard_begin)
@@ -620,10 +622,10 @@ class AstToCfg(gast.NodeVisitor):
     leaving_node = self.lexical_scopes.pop()
     assert node == leaving_node
 
-  def _get_enclosing_scopes(self, include, stop_at):
+  def _get_enclosing_finally_scopes(self, stop_at):
     included = []
     for node in reversed(self.lexical_scopes):
-      if isinstance(node, include):
+      if isinstance(node, gast.Try) and node.finalbody:
         included.append(node)
       if isinstance(node, stop_at):
         return node, included
@@ -635,10 +637,8 @@ class AstToCfg(gast.NodeVisitor):
 
   def _process_exit_statement(self, node, *exits_nodes_of_type):
     # Note: this is safe because we process functions separately.
-    try_node, guards = self._get_enclosing_scopes(
-        include=(gast.Try,),
-        stop_at=tuple(exits_nodes_of_type),
-    )
+    try_node, guards = self._get_enclosing_finally_scopes(
+        tuple(exits_nodes_of_type))
     if try_node is None:
       raise ValueError(
           '%s that is not enclosed by any of %s' % (node, exits_nodes_of_type))
@@ -646,10 +646,8 @@ class AstToCfg(gast.NodeVisitor):
 
   def _process_continue_statement(self, node, *loops_to_nodes_of_type):
     # Note: this is safe because we process functions separately.
-    try_node, guards = self._get_enclosing_scopes(
-        include=(gast.Try,),
-        stop_at=tuple(loops_to_nodes_of_type),
-    )
+    try_node, guards = self._get_enclosing_finally_scopes(
+        tuple(loops_to_nodes_of_type))
     if try_node is None:
       raise ValueError('%s that is not enclosed by any of %s' %
                        (node, loops_to_nodes_of_type))
@@ -698,10 +696,7 @@ class AstToCfg(gast.NodeVisitor):
     self._process_basic_statement(node)
 
   def visit_Raise(self, node):
-    try_node, guards = self._get_enclosing_scopes(
-        include=(gast.Try,),
-        stop_at=(gast.FunctionDef,),
-    )
+    try_node, guards = self._get_enclosing_finally_scopes((gast.FunctionDef,))
     if try_node is None:
       raise ValueError('%s that is not enclosed by any FunctionDef' % node)
     self.builder.add_error_node(node, guards)
@@ -797,16 +792,13 @@ class AstToCfg(gast.NodeVisitor):
     for stmt in node.orelse:
       self.visit(stmt)
 
-    if node.handlers:
-      # TODO(mdan): Should we still support bare try/except? Might be confusing.
-      raise NotImplementedError('exceptions are not yet supported')
-
     self._exit_lexical_scope(node)
 
-    self.builder.enter_finally_section(node)
-    for stmt in node.finalbody:
-      self.visit(stmt)
-    self.builder.exit_finally_section(node)
+    if node.finalbody:
+      self.builder.enter_finally_section(node)
+      for stmt in node.finalbody:
+        self.visit(stmt)
+      self.builder.exit_finally_section(node)
 
   def visit_With(self, node):
     # TODO(mdan): Mark the context manager's exit call as exit guard.
diff --git a/tensorflow/python/autograph/pyct/common_transformers/anf.py b/tensorflow/python/autograph/pyct/common_transformers/anf.py
index 192621b1cd329acec56c9517f3c885ee622b62e9..246c26833f0c30c757526209b710ef6df90eebf0 100644
--- a/tensorflow/python/autograph/pyct/common_transformers/anf.py
+++ b/tensorflow/python/autograph/pyct/common_transformers/anf.py
@@ -36,10 +36,10 @@ from tensorflow.python.autograph.pyct import transformer
 class DummyGensym(object):
   """A dumb gensym that suffixes a stem by sequential numbers from 1000."""
 
-  def __init__(self, entity_info):
-    del entity_info
+  def __init__(self, ctx):
+    del ctx
     # A proper implementation needs to account for:
-    #   * entity_info.namespace
+    #   * ctx.info.namespace
     #   * all the symbols defined in the AST
     #   * the symbols generated so far
     self._idx = 0
@@ -68,19 +68,19 @@ class AnfTransformer(transformer.Base):
   # processing the `body` and the `orelse` need to be kept together with them,
   # and not accidentally lifted out of the `if`.
 
-  def __init__(self, entity_info, gensym_source=None):
+  def __init__(self, ctx, gensym_source=None):
     """Creates an ANF transformer.
 
     Args:
-      entity_info: transformer.EntityInfo
+      ctx: transformer.Context
       gensym_source: An optional object with the same interface as `DummyGensym`
         for generating unique names
     """
-    super(AnfTransformer, self).__init__(entity_info)
+    super(AnfTransformer, self).__init__(ctx)
     if gensym_source is None:
-      self._gensym = DummyGensym(entity_info)
+      self._gensym = DummyGensym(ctx)
     else:
-      self._gensym = gensym_source(entity_info)
+      self._gensym = gensym_source(ctx)
     self._pending_statements = []
 
   def _consume_pending_statements(self):
@@ -406,7 +406,7 @@ class AnfTransformer(transformer.Base):
     return node
 
 
-def transform(node, entity_info, gensym_source=None):
+def transform(node, ctx, gensym_source=None):
   """Converts the given node to A-normal form (ANF).
 
   The general idea of A-normal form: https://en.wikipedia.org/wiki/A-normal_form
@@ -416,9 +416,9 @@ def transform(node, entity_info, gensym_source=None):
 
   Args:
     node: The node to transform.
-    entity_info: transformer.EntityInfo.  TODO(mdan): What information does this
+    ctx: transformer.EntityInfo.  TODO(mdan): What information does this
       argument provide?
     gensym_source: An optional object with the same interface as `DummyGensym`
       for generating unique names.
   """
-  return AnfTransformer(entity_info, gensym_source=gensym_source).visit(node)
+  return AnfTransformer(ctx, gensym_source=gensym_source).visit(node)
diff --git a/tensorflow/python/autograph/pyct/common_transformers/anf_test.py b/tensorflow/python/autograph/pyct/common_transformers/anf_test.py
index 525d4886dee37c79d4087a293fa9ce5424a74c15..58663d21ff2626a6bad9f892263b8c721d82d004 100644
--- a/tensorflow/python/autograph/pyct/common_transformers/anf_test.py
+++ b/tensorflow/python/autograph/pyct/common_transformers/anf_test.py
@@ -30,10 +30,10 @@ from tensorflow.python.platform import test
 class DummyGensym(object):
   """A dumb gensym that suffixes a stem by sequential numbers from 1000."""
 
-  def __init__(self, entity_info):
-    del entity_info
+  def __init__(self, ctx):
+    del ctx
     # A proper implementation needs to account for:
-    #   * entity_info.namespace
+    #   * ctx.info.namespace
     #   * all the symbols defined in the AST
     #   * the symbols generated so far
     self._idx = 0
@@ -68,21 +68,22 @@ def exec_expected_result():
 
 class AnfTransformerTest(test.TestCase):
 
-  def _simple_source_info(self):
-    return transformer.EntityInfo(
+  def _simple_context(self):
+    entity_info = transformer.EntityInfo(
         source_code=None,
         source_file=None,
         namespace=None,
         arg_values=None,
         arg_types=None,
         owner_type=None)
+    return transformer.Context(entity_info)
 
   def test_basic(self):
     def test_function():
       a = 0
       return a
     node, _ = parser.parse_entity(test_function)
-    node = anf.transform(node.body[0], self._simple_source_info())
+    node = anf.transform(node.body[0], self._simple_context())
     result, _ = compiler.ast_to_object(node)
     self.assertEqual(test_function(), result.test_function())
 
@@ -100,7 +101,7 @@ class AnfTransformerTest(test.TestCase):
     exp_node, _ = parser.parse_entity(expected_fn)
     node, _ = parser.parse_entity(test_fn)
     node = anf.transform(
-        node, self._simple_source_info(), gensym_source=DummyGensym)
+        node, self._simple_context(), gensym_source=DummyGensym)
     exp_name = exp_node.body[0].name
     # Ignoring the function names in the result because they can't be
     # the same (because both functions have to exist in the same scope
@@ -109,7 +110,7 @@ class AnfTransformerTest(test.TestCase):
     self.assert_same_ast(exp_node, node)
     # Check that ANF is idempotent
     node_repeated = anf.transform(
-        node, self._simple_source_info(), gensym_source=DummyGensym)
+        node, self._simple_context(), gensym_source=DummyGensym)
     self.assert_same_ast(node_repeated, node)
 
   def test_binop_basic(self):
diff --git a/tensorflow/python/autograph/pyct/compiler.py b/tensorflow/python/autograph/pyct/compiler.py
index 06e66c5b5871d5528bccfcc9fe47268207594ea6..420f3bb22388801c54f27e8bf1701febb90ad34a 100644
--- a/tensorflow/python/autograph/pyct/compiler.py
+++ b/tensorflow/python/autograph/pyct/compiler.py
@@ -67,6 +67,13 @@ def ast_to_source(node, indentation='  '):
       trimmed_code_lines.append(l)
   code = '\n'.join(trimmed_code_lines)
 
+  # Work around the reference cycle generated by astor.
+  # See https://github.com/berkerpeksag/astor/blob/55dd323f7d8d696610c703c0296763c567685c31/astor/code_gen.py#L162  # pylint:disable=line-too-long
+  # Reference cycles are quite disliked by TensorFlow's tests.
+  if hasattr(generator, 'write'):
+    generator.write = None
+  del generator
+
   return code
 
 
diff --git a/tensorflow/python/autograph/pyct/errors.py b/tensorflow/python/autograph/pyct/errors.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f2049c40bb5fb72f7dd8d3191bc3163abdcf309
--- /dev/null
+++ b/tensorflow/python/autograph/pyct/errors.py
@@ -0,0 +1,48 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Code transformation exceptions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.autograph.utils import ag_logging
+
+
+class AutoGraphError(Exception):
+  pass
+
+
+class InternalError(AutoGraphError):
+  """Raised when AutoGraph finds an unexpected error."""
+
+  def __init__(self, message, original_exc):
+    super(InternalError, self).__init__()
+    self.message = message
+    self.original_exc = original_exc
+
+  def __str__(self):
+    return '{} during {}: {}'.format(
+        type(self.original_exc).__name__, self.message, self.original_exc)
+
+
+def report_internal_error(entity, exception):
+  ag_logging.log(1, 'Error transforming %s', entity, exc_info=True)
+  # TODO(znado): Add external bug reporting instructions.
+  raise AutoGraphError(
+      'Unexpected error transforming %s. If you believe this is due to a bug,'
+      ' please set the verbosity to 10 (on Linux, `export '
+      'AUTOGRAPH_VERBOSITY=10`) and attach the full output when filing the bug '
+      'report. Caused by: %s' % (entity, exception))
diff --git a/tensorflow/python/autograph/pyct/inspect_utils.py b/tensorflow/python/autograph/pyct/inspect_utils.py
index 7c819f364fa79d40c0fbb080b3b358b36bfd8c0c..eab01ee9cd613b25548412ea78f1fd07d3d432cb 100644
--- a/tensorflow/python/autograph/pyct/inspect_utils.py
+++ b/tensorflow/python/autograph/pyct/inspect_utils.py
@@ -31,15 +31,19 @@ from tensorflow.python.util import tf_inspect
 
 # These functions test negative for isinstance(*, types.BuiltinFunctionType)
 # and inspect.isbuiltin, and are generally not visible in globals().
+# TODO(mdan): Remove this.
 SPECIAL_BUILTINS = {
     'dict': dict,
+    'enumerate': enumerate,
     'float': float,
     'int': int,
     'len': len,
     'list': list,
     'print': print,
     'range': range,
-    'tuple': tuple
+    'tuple': tuple,
+    'type': type,
+    'zip': zip
 }
 
 if six.PY2:
@@ -70,7 +74,7 @@ def isnamedtuple(f):
 
 def isbuiltin(f):
   """Returns True if the argument is a built-in function."""
-  if f in SPECIAL_BUILTINS.values():
+  if f in six.moves.builtins.__dict__.values():
     return True
   if isinstance(f, types.BuiltinFunctionType):
     return True
@@ -101,7 +105,7 @@ def getnamespace(f):
   return namespace
 
 
-def getqualifiedname(namespace, object_, max_depth=2):
+def getqualifiedname(namespace, object_, max_depth=5, visited=None):
   """Returns the name by which a value can be referred to in a given namespace.
 
   If the object defines a parent module, the function attempts to use it to
@@ -115,16 +119,24 @@ def getqualifiedname(namespace, object_, max_depth=2):
     object_: Any, the value to search.
     max_depth: Optional[int], a limit to the recursion depth when searching
         inside modules.
+    visited: Optional[Set[int]], ID of modules to avoid visiting.
   Returns: Union[str, None], the fully-qualified name that resolves to the value
       o, or None if it couldn't be found.
   """
-  for name, value in namespace.items():
+  if visited is None:
+    visited = set()
+
+  # Copy the dict to avoid "changed size error" during concurrent invocations.
+  # TODO(mdan): This is on the hot path. Can we avoid the copy?
+  namespace = dict(namespace)
+
+  for name in namespace:
     # The value may be referenced by more than one symbol, case in which
     # any symbol will be fine. If the program contains symbol aliases that
     # change over time, this may capture a symbol that will later point to
     # something else.
     # TODO(mdan): Prefer the symbol that matches the value type name.
-    if object_ is value:
+    if object_ is namespace[name]:
       return name
 
   # If an object is not found, try to search its parent modules.
@@ -132,22 +144,25 @@ def getqualifiedname(namespace, object_, max_depth=2):
   if (parent is not None and parent is not object_ and
       parent is not namespace):
     # No limit to recursion depth because of the guard above.
-    parent_name = getqualifiedname(namespace, parent, max_depth=0)
+    parent_name = getqualifiedname(
+        namespace, parent, max_depth=0, visited=visited)
     if parent_name is not None:
-      name_in_parent = getqualifiedname(parent.__dict__, object_, max_depth=0)
+      name_in_parent = getqualifiedname(
+          parent.__dict__, object_, max_depth=0, visited=visited)
       assert name_in_parent is not None, (
           'An object should always be found in its owner module')
       return '{}.{}'.format(parent_name, name_in_parent)
 
-  # TODO(mdan): Use breadth-first search and avoid visiting modules twice.
   if max_depth:
     # Iterating over a copy prevents "changed size due to iteration" errors.
     # It's unclear why those occur - suspecting new modules may load during
     # iteration.
-    for name, value in namespace.copy().items():
-      if tf_inspect.ismodule(value):
+    for name in namespace.keys():
+      value = namespace[name]
+      if tf_inspect.ismodule(value) and id(value) not in visited:
+        visited.add(id(value))
         name_in_module = getqualifiedname(value.__dict__, object_,
-                                          max_depth - 1)
+                                          max_depth - 1, visited)
         if name_in_module is not None:
           return '{}.{}'.format(name, name_in_module)
   return None
@@ -176,6 +191,27 @@ def getdefiningclass(m, owner_class):
   return owner_class
 
 
+def isweakrefself(m):
+  """Tests whether an object is a "weakref self" wrapper, see getmethodself."""
+  return hasattr(m, '__self__') and hasattr(m.__self__, 'ag_self_weakref__')
+
+
+def getmethodself(m):
+  """An extended version of inspect.getmethodclass."""
+  if not hasattr(m, '__self__'):
+    return None
+  if m.__self__ is None:
+    return None
+
+  # A fallback allowing methods to be actually bound to a type different
+  # than __self__. This is useful when a strong reference from the method
+  # to the object is not desired, for example when caching is involved.
+  if isweakrefself(m):
+    return m.__self__.ag_self_weakref__()
+
+  return m.__self__
+
+
 def getmethodclass(m):
   """Resolves a function's owner, e.g. a method's class.
 
@@ -206,16 +242,12 @@ def getmethodclass(m):
     if isinstance(m.__class__, six.class_types):
       return m.__class__
 
-  # Instance method and class methods: should be bound to a non-null "self".
-  if hasattr(m, '__self__'):
-    if m.__self__ is not None:
-      # A fallback allowing methods to be actually bound to a type different
-      # than __self__. This is useful when a strong reference from the method
-      # to the object is not desired, for example when caching is involved.
-      if hasattr(m.__self__, 'ag_self_weakref__'):
-        return m.__self__.ag_self_weakref__()
-
-      return m.__self__
+  # Instance method and class methods: return the class of "self".
+  m_self = getmethodself(m)
+  if m_self is not None:
+    if tf_inspect.isclass(m_self):
+      return m_self
+    return m_self.__class__
 
   # Class, static and unbound methods: search all defined classes in any
   # namespace. This is inefficient but more robust method.
diff --git a/tensorflow/python/autograph/pyct/inspect_utils_test.py b/tensorflow/python/autograph/pyct/inspect_utils_test.py
index a2c39056d1b09dbae937915cf17de5c6f55d4886..4c4c0977b0fef2fdfee69d2e7c608ad1a412aa21 100644
--- a/tensorflow/python/autograph/pyct/inspect_utils_test.py
+++ b/tensorflow/python/autograph/pyct/inspect_utils_test.py
@@ -183,6 +183,63 @@ class InspectUtilsTest(test.TestCase):
     self.assertEqual(inspect_utils.getqualifiedname(ns, bar), 'bar')
     self.assertEqual(inspect_utils.getqualifiedname(ns, baz), 'bar.baz')
 
+  def test_getqualifiedname_efficiency(self):
+    foo = object()
+
+    # We create a densely connected graph consisting of a relatively small
+    # number of modules and hide our symbol in one of them. The path to the
+    # symbol is at least 10, and each node has about 10 neighbors. However,
+    # by skipping visited modules, the search should take much less.
+    ns = {}
+    prev_level = []
+    for i in range(10):
+      current_level = []
+      for j in range(10):
+        mod_name = 'mod_{}_{}'.format(i, j)
+        mod = imp.new_module(mod_name)
+        current_level.append(mod)
+        if i == 9 and j == 9:
+          mod.foo = foo
+      if prev_level:
+        # All modules at level i refer to all modules at level i+1
+        for prev in prev_level:
+          for mod in current_level:
+            prev.__dict__[mod.__name__] = mod
+      else:
+        for mod in current_level:
+          ns[mod.__name__] = mod
+      prev_level = current_level
+
+    self.assertIsNone(inspect_utils.getqualifiedname(ns, inspect_utils))
+    self.assertIsNotNone(
+        inspect_utils.getqualifiedname(ns, foo, max_depth=10000000000))
+
+  def test_getqualifiedname_cycles(self):
+    foo = object()
+
+    # We create a graph of modules that contains circular references. The
+    # search process should avoid them. The searched object is hidden at the
+    # bottom of a path of length roughly 10.
+    ns = {}
+    mods = []
+    for i in range(10):
+      mod = imp.new_module('mod_{}'.format(i))
+      if i == 9:
+        mod.foo = foo
+      # Module i refers to module i+1
+      if mods:
+        mods[-1].__dict__[mod.__name__] = mod
+      else:
+        ns[mod.__name__] = mod
+      # Module i refers to all modules j < i.
+      for prev in mods:
+        mod.__dict__[prev.__name__] = prev
+      mods.append(mod)
+
+    self.assertIsNone(inspect_utils.getqualifiedname(ns, inspect_utils))
+    self.assertIsNotNone(
+        inspect_utils.getqualifiedname(ns, foo, max_depth=10000000000))
+
   def test_getqualifiedname_finds_via_parent_module(self):
     # TODO(mdan): This test is vulnerable to change in the lib module.
     # A better way to forge modules should be found.
@@ -220,16 +277,16 @@ class InspectUtilsTest(test.TestCase):
     test_obj = TestClass()
     self.assertEqual(
         inspect_utils.getmethodclass(test_obj.member_function),
-        test_obj)
+        TestClass)
     self.assertEqual(
         inspect_utils.getmethodclass(test_obj.decorated_member),
-        test_obj)
+        TestClass)
     self.assertEqual(
         inspect_utils.getmethodclass(test_obj.fn_decorated_member),
-        test_obj)
+        TestClass)
     self.assertEqual(
         inspect_utils.getmethodclass(test_obj.wrap_decorated_member),
-        test_obj)
+        TestClass)
     self.assertEqual(
         inspect_utils.getmethodclass(test_obj.static_method),
         TestClass)
@@ -278,16 +335,16 @@ class InspectUtilsTest(test.TestCase):
     test_obj = LocalClass()
     self.assertEqual(
         inspect_utils.getmethodclass(test_obj.member_function),
-        test_obj)
+        LocalClass)
     self.assertEqual(
         inspect_utils.getmethodclass(test_obj.decorated_member),
-        test_obj)
+        LocalClass)
     self.assertEqual(
         inspect_utils.getmethodclass(test_obj.fn_decorated_member),
-        test_obj)
+        LocalClass)
     self.assertEqual(
         inspect_utils.getmethodclass(test_obj.wrap_decorated_member),
-        test_obj)
+        LocalClass)
 
   def test_getmethodclass_callables(self):
     class TestCallable(object):
@@ -310,12 +367,13 @@ class InspectUtilsTest(test.TestCase):
       return self
 
     bound_method = types.MethodType(test_fn, WeakrefWrapper())
-    self.assertEqual(inspect_utils.getmethodclass(bound_method), test_obj)
+    self.assertEqual(inspect_utils.getmethodclass(bound_method), TestClass)
 
   def test_getmethodclass_no_bool_conversion(self):
 
     tensor = constant_op.constant([1])
-    self.assertEqual(inspect_utils.getmethodclass(tensor.get_shape), tensor)
+    self.assertEqual(
+        inspect_utils.getmethodclass(tensor.get_shape), type(tensor))
 
   def test_getdefiningclass(self):
     class Superclass(object):
@@ -349,10 +407,12 @@ class InspectUtilsTest(test.TestCase):
         Superclass)
 
   def test_isbuiltin(self):
-    self.assertTrue(inspect_utils.isbuiltin(range))
+    self.assertTrue(inspect_utils.isbuiltin(enumerate))
     self.assertTrue(inspect_utils.isbuiltin(float))
     self.assertTrue(inspect_utils.isbuiltin(int))
     self.assertTrue(inspect_utils.isbuiltin(len))
+    self.assertTrue(inspect_utils.isbuiltin(range))
+    self.assertTrue(inspect_utils.isbuiltin(zip))
     self.assertFalse(inspect_utils.isbuiltin(function_decorator))
 
   def test_super_wrapper_for_dynamic_attrs(self):
diff --git a/tensorflow/python/autograph/pyct/origin_info.py b/tensorflow/python/autograph/pyct/origin_info.py
index 102bd42c91ca8189355fe39d014521151c0a6377..f41b3285d1ce1df0f51b2037d12125c49c9576a9 100644
--- a/tensorflow/python/autograph/pyct/origin_info.py
+++ b/tensorflow/python/autograph/pyct/origin_info.py
@@ -18,6 +18,8 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
+import difflib
+import os
 import tokenize
 
 import gast
@@ -26,6 +28,8 @@ import six
 from tensorflow.python.autograph.pyct import anno
 from tensorflow.python.autograph.pyct import ast_util
 from tensorflow.python.autograph.pyct import parser
+from tensorflow.python.autograph.pyct import pretty_printer
+from tensorflow.python.autograph.utils import ag_logging as logging
 from tensorflow.python.util import tf_inspect
 
 
@@ -73,6 +77,13 @@ class OriginInfo(
     return (self.loc.filename, self.loc.lineno, self.function_name,
             self.source_code_line)
 
+  def __repr__(self):
+    if self.loc.filename:
+      return '{}:{}:{}'.format(
+          os.path.split(self.loc.filename)[1], self.loc.lineno,
+          self.loc.col_offset)
+    return '<no file>:{}:{}'.format(self.loc.lineno, self.loc.col_offset)
+
 
 # TODO(mdan): This source map should be a class - easier to refer to.
 def create_source_map(nodes, code, filename, indices_in_code):
@@ -97,32 +108,47 @@ def create_source_map(nodes, code, filename, indices_in_code):
   resolve(reparsed_nodes, code)
   result = {}
 
-  for before, after in ast_util.parallel_walk(nodes, reparsed_nodes):
-    # Note: generated code might not be mapped back to its origin.
-    # TODO(mdan): Generated code should always be mapped to something.
-    origin_info = anno.getanno(before, anno.Basic.ORIGIN, default=None)
-    final_info = anno.getanno(after, anno.Basic.ORIGIN, default=None)
-    if origin_info is None or final_info is None:
-      continue
-
-    line_loc = LineLocation(filename, final_info.loc.lineno)
-
-    existing_origin = result.get(line_loc)
-    if existing_origin is not None:
-      # Overlaps may exist because of child nodes, but almost never to
-      # different line locations. Exception make decorated functions, where
-      # both lines are mapped to the same line in the AST.
-
-      # Line overlaps: keep bottom node.
-      if existing_origin.loc.line_loc == origin_info.loc.line_loc:
-        if existing_origin.loc.lineno >= origin_info.loc.lineno:
-          continue
-
-      # In case of overlaps, keep the leftmost node.
-      if existing_origin.loc.col_offset <= origin_info.loc.col_offset:
+  try:
+    for before, after in ast_util.parallel_walk(nodes, reparsed_nodes):
+      # Note: generated code might not be mapped back to its origin.
+      # TODO(mdan): Generated code should always be mapped to something.
+      origin_info = anno.getanno(before, anno.Basic.ORIGIN, default=None)
+      final_info = anno.getanno(after, anno.Basic.ORIGIN, default=None)
+      if origin_info is None or final_info is None:
         continue
 
-    result[line_loc] = origin_info
+      line_loc = LineLocation(filename, final_info.loc.lineno)
+
+      existing_origin = result.get(line_loc)
+      if existing_origin is not None:
+        # Overlaps may exist because of child nodes, but almost never to
+        # different line locations. Exception make decorated functions, where
+        # both lines are mapped to the same line in the AST.
+
+        # Line overlaps: keep bottom node.
+        if existing_origin.loc.line_loc == origin_info.loc.line_loc:
+          if existing_origin.loc.lineno >= origin_info.loc.lineno:
+            continue
+
+        # In case of overlaps, keep the leftmost node.
+        if existing_origin.loc.col_offset <= origin_info.loc.col_offset:
+          continue
+
+      result[line_loc] = origin_info
+  except ValueError:
+    if logging.has_verbosity(3):
+      for n, rn in zip(nodes, reparsed_nodes):
+        nodes_str = pretty_printer.fmt(n, color=False, noanno=True)
+        reparsed_nodes_str = pretty_printer.fmt(rn, color=False, noanno=True)
+        diff = difflib.context_diff(
+            nodes_str.split('\n'),
+            reparsed_nodes_str.split('\n'),
+            fromfile='Original nodes',
+            tofile='Reparsed nodes',
+            n=7)
+        diff = '\n'.join(diff)
+        logging.log(3, 'AST seems to lack integrity. Diff:\n%s', diff)
+    raise
 
   return result
 
diff --git a/tensorflow/python/autograph/pyct/parser.py b/tensorflow/python/autograph/pyct/parser.py
index 39fc1a7ed05c06da89efe505e439b307badb4b4e..f6b2a7863bd99bbc75886b2522f4e3a7d35ec0f1 100644
--- a/tensorflow/python/autograph/pyct/parser.py
+++ b/tensorflow/python/autograph/pyct/parser.py
@@ -21,7 +21,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import re
 import textwrap
+import threading
 
 import gast
 import six
@@ -29,11 +31,24 @@ import six
 from tensorflow.python.util import tf_inspect
 
 
+_parse_lock = threading.Lock()  # Prevents linecache concurrency errors.
+
+
 def parse_entity(entity):
   """Returns the AST of given entity."""
-  source = tf_inspect.getsource(entity)
-
-  def fail(comment):
+  try:
+    with _parse_lock:
+      source = tf_inspect.getsource_no_unwrap(entity)
+  except (IOError, OSError) as e:
+    raise ValueError(
+        'Unable to locate the source code of {}. Note that functions defined'
+        ' in certain environments, like the interactive Python shell do not'
+        ' expose their source code. If that is the case, you should to define'
+        ' them in a .py source file. If you are certain the code is'
+        ' graph-compatible, wrap the call using'
+        ' @tf.autograph.do_not_convert. Original error: {}'.format(entity, e))
+
+  def raise_parse_failure(comment):
     raise ValueError(
         'Failed to parse source code of {}, which Python reported as:\n{}\n'
         '{}'.format(entity, source, comment))
@@ -49,8 +64,9 @@ def parse_entity(entity):
   except IndentationError:
     # The text below lists the causes of this error known to us. There may
     # be more.
-    fail('This may be caused by multiline strings or comments not indented at'
-         'the same level as the code.')
+    raise_parse_failure(
+        'This may be caused by multiline strings or comments not indented at'
+        ' the same level as the code.')
 
   except SyntaxError as e:
     if not tf_inspect.isfunction(entity) or entity.__name__ != '<lambda>':
@@ -71,8 +87,9 @@ def parse_entity(entity):
 
     # Give up if there's nothing we can chip away.
     if len(lines) == lineno and len(lines[-1]) == offset:
-      fail('If this is a lambda function, the error may be avoided by creating'
-           ' the lambda in a standalone statement.')
+      raise_parse_failure(
+          'If this is a lambda function, the error may be avoided by creating'
+          ' the lambda in a standalone statement.')
 
     # Drop all lines following the error location
     # TODO(mdan): What's with the pylint errors?
@@ -84,16 +101,17 @@ def parse_entity(entity):
     try:
       return parse_str(new_source), new_source
     except SyntaxError as e:
-      fail('If this is a lambda function, the error may be avoided by creating'
-           ' the lambda in a standalone statement. Tried to strip down the'
-           ' source to:\n{}\nBut that did not work.'.format(new_source))
+      raise_parse_failure(
+          'If this is a lambda function, the error may be avoided by creating'
+          ' the lambda in a standalone statement. Tried to strip down the'
+          ' source to:\n{}\nBut that did not work.'.format(new_source))
 
 
 def parse_str(src):
   """Returns the AST of given piece of code."""
   # TODO(mdan): This should exclude the module things are autowrapped in.
 
-  if six.PY2 and '.print(' in src:
+  if six.PY2 and re.search('\\Wprint\\s*\\(', src):
     # This special treatment is required because gast.parse is not aware of
     # whether print_function was present in the original context.
     src = 'from __future__ import print_function\n' + src
@@ -117,7 +135,7 @@ def parse_expression(src):
   """
   node = parse_str(src)
   assert isinstance(node, gast.Module)
-  if len(node.body) != 1 and not isinstance(node.body[0], gast.Expr):
+  if len(node.body) != 1 or not isinstance(node.body[0], gast.Expr):
     raise ValueError(
         'Expected a single expression, found instead %s' % node.body)
   return node.body[0].value
diff --git a/tensorflow/python/autograph/pyct/parser_test.py b/tensorflow/python/autograph/pyct/parser_test.py
index d3a7b7a014646601339a79e6cf97461853bccbb2..e7fa3c7aeb53c984c40e8709b2ce8e49e8879acf 100644
--- a/tensorflow/python/autograph/pyct/parser_test.py
+++ b/tensorflow/python/autograph/pyct/parser_test.py
@@ -42,6 +42,24 @@ class ParserTest(test.TestCase):
     """))
     self.assertEqual('f', mod.body[0].name)
 
+  def test_parse_str_print(self):
+    mod = parser.parse_str(
+        textwrap.dedent("""
+            def f(x):
+              print(x)
+              return x + 1
+    """))
+    self.assertEqual('f', mod.body[0].name)
+
+  def test_parse_str_weird_print(self):
+    mod = parser.parse_str(
+        textwrap.dedent("""
+            def f(x):
+              print (x)
+              return x + 1
+    """))
+    self.assertEqual('f', mod.body[0].name)
+
   def test_parse_comments(self):
     def f():
 # unindented comment
diff --git a/tensorflow/python/autograph/pyct/pretty_printer.py b/tensorflow/python/autograph/pyct/pretty_printer.py
index bacc1e4a7774ec5b84495255042392fe089150d5..a92017f4142f671f337b15104b049581309bd290 100644
--- a/tensorflow/python/autograph/pyct/pretty_printer.py
+++ b/tensorflow/python/autograph/pyct/pretty_printer.py
@@ -25,10 +25,11 @@ import termcolor
 class PrettyPrinter(gast.NodeVisitor):
   """Print AST nodes."""
 
-  def __init__(self, color):
+  def __init__(self, color, noanno):
     self.indent_lvl = 0
     self.result = ''
     self.color = color
+    self.noanno = noanno
 
   def _color(self, string, color, attrs=None):
     if self.color:
@@ -55,6 +56,15 @@ class PrettyPrinter(gast.NodeVisitor):
     self.result += '\n'
 
   def generic_visit(self, node, name=None):
+    # In very rare instances, a list can contain something other than a Node.
+    # e.g. Global contains a list of strings.
+    if isinstance(node, str):
+      if name:
+        self._print('%s%s="%s"' % (self._indent(), name, node))
+      else:
+        self._print('%s"%s"' % (self._indent(), node))
+      return
+
     if node._fields:
       cont = ':'
     else:
@@ -68,6 +78,8 @@ class PrettyPrinter(gast.NodeVisitor):
 
     self.indent_lvl += 1
     for f in node._fields:
+      if self.noanno and f.startswith('__'):
+        continue
       if not hasattr(node, f):
         self._print('%s%s' % (self._indent(), self._warning('%s=<unset>' % f)))
         continue
@@ -103,8 +115,8 @@ class PrettyPrinter(gast.NodeVisitor):
     self.indent_lvl -= 1
 
 
-def fmt(node, color=True):
-  printer = PrettyPrinter(color)
+def fmt(node, color=True, noanno=False):
+  printer = PrettyPrinter(color, noanno)
   if isinstance(node, (list, tuple)):
     for n in node:
       printer.visit(n)
diff --git a/tensorflow/python/autograph/pyct/static_analysis/activity.py b/tensorflow/python/autograph/pyct/static_analysis/activity.py
index 4359e0a2682f0f6818a0c2e0aaffeaa12718c514..dd3d1d5d1365c6a5aa5f1a7f16a485d40c3da6a1 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/activity.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/activity.py
@@ -25,6 +25,7 @@ import copy
 import weakref
 
 import gast
+import six
 
 from tensorflow.python.autograph.pyct import anno
 from tensorflow.python.autograph.pyct import qual_names
@@ -149,6 +150,14 @@ class _Lambda(object):
     self.args = set()
 
 
+class _Comprehension(object):
+
+  no_root = True
+
+  def __init__(self):
+    self.targets = set()
+
+
 class ActivityAnalyzer(transformer.Base):
   """Annotates nodes with local scope information.
 
@@ -199,12 +208,27 @@ class ActivityAnalyzer(transformer.Base):
       if qn.owner_set & set(l.args):
         return
 
+    # When inside a comprehension, ignore any of the comprehensions's targets.
+    # This includes attributes or slices of those arguments.
+    # This is not true in Python2, which leaks symbols.
+    if six.PY3:
+      for l in self.state[_Comprehension]:
+        if qn in l.targets:
+          return
+        if qn.owner_set & set(l.targets):
+          return
+
     if isinstance(node.ctx, gast.Store):
-      self.scope.mark_modified(qn)
-      if qn.is_composite and composite_writes_alter_parent:
-        self.scope.mark_modified(qn.parent)
-      if self._in_aug_assign:
-        self.scope.mark_read(qn)
+      # In comprehensions, modified symbols are the comprehension targets.
+      if six.PY3 and self.state[_Comprehension].level > 0:
+        # Like a lambda's args, they are tracked separately in Python3.
+        self.state[_Comprehension].targets.add(qn)
+      else:
+        self.scope.mark_modified(qn)
+        if qn.is_composite and composite_writes_alter_parent:
+          self.scope.mark_modified(qn.parent)
+        if self._in_aug_assign:
+          self.scope.mark_read(qn)
     elif isinstance(node.ctx, gast.Load):
       self.scope.mark_read(qn)
     elif isinstance(node.ctx, gast.Param):
@@ -241,10 +265,10 @@ class ActivityAnalyzer(transformer.Base):
     self._exit_scope()
     return node
 
-  def visit_nonlocal(self, node):
+  def visit_Nonlocal(self, node):
     raise NotImplementedError()
 
-  def visit_global(self, node):
+  def visit_Global(self, node):
     raise NotImplementedError()
 
   def visit_Expr(self, node):
@@ -338,12 +362,41 @@ class ActivityAnalyzer(transformer.Base):
     self.state[_Lambda].exit()
     return node
 
+  def _process_iterable_comprehension(self, node):
+    # This handles ListComp, SetComp, GeneratorExp.
+    self.state[_Comprehension].enter()
+    # Note: it's important to visit the generators first to properly account
+    # for the variables local to these generators. Example: `x` is local to the
+    # expression `x for x in y`.
+    node.generators = self.visit_block(node.generators)
+    node.elt = self.visit(node.elt)
+    self.state[_Comprehension].exit()
+    return node
+
+  def visit_DictComp(self, node):
+    # Identical to _process_iterable_comprehension, different node names.
+    self.state[_Comprehension].enter()
+    node.generators = self.visit_block(node.generators)
+    node.key = self.visit(node.key)
+    node.value = self.visit(node.value)
+    self.state[_Comprehension].exit()
+    return node
+
+  def visit_ListComp(self, node):
+    return self._process_iterable_comprehension(node)
+
+  def visit_SetComp(self, node):
+    return self._process_iterable_comprehension(node)
+
+  def visit_GeneratorExp(self, node):
+    return self._process_iterable_comprehension(node)
+
   def visit_arguments(self, node):
     return self._process_statement(node)
 
   def visit_FunctionDef(self, node):
     # The FunctionDef node itself has a Scope object that tracks the creation
-    # of its name, along with the usage of any decorator accompany it.
+    # of its name, along with the usage of any decorator accompanying it.
     self._enter_scope(False)
     node.decorator_list = self.visit_block(node.decorator_list)
     self.scope.mark_modified(qual_names.QN(node.name))
diff --git a/tensorflow/python/autograph/pyct/static_analysis/activity_test.py b/tensorflow/python/autograph/pyct/static_analysis/activity_test.py
index 997d9a8aff111dfb0c223840da642ce8b2f138ce..595e95bed98f88b19a68c5ceb4ce1e2156e2b27d 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/activity_test.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/activity_test.py
@@ -121,7 +121,8 @@ class ActivityAnalyzerTest(test.TestCase):
         arg_types=None,
         owner_type=None)
     node = qual_names.resolve(node)
-    node = activity.resolve(node, entity_info)
+    ctx = transformer.Context(entity_info)
+    node = activity.resolve(node, ctx)
     return node, entity_info
 
   def assertSymbolSetsAre(self, expected, actual, name):
diff --git a/tensorflow/python/autograph/pyct/static_analysis/live_values.py b/tensorflow/python/autograph/pyct/static_analysis/live_values.py
index e8e3d229bea4bb505d58cdae24de87377b1b50e6..eca4571d38977905cc51387e47ee9a7d763f6703 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/live_values.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/live_values.py
@@ -39,7 +39,8 @@ class LiveValueResolver(transformer.Base):
 
   def visit_ClassDef(self, node):
     self.generic_visit(node)
-    anno.setanno(node, 'live_val', self.entity_info.namespace[node.name])
+    anno.setanno(
+        node, 'live_val', self.ctx.info.namespace[node.name])
     return node
 
   def visit_Name(self, node):
@@ -53,8 +54,8 @@ class LiveValueResolver(transformer.Base):
       if not is_defined:
         if node.id in self.literals:
           anno.setanno(node, 'live_val', self.literals[node.id])
-        elif node.id in self.entity_info.namespace:
-          obj = self.entity_info.namespace[node.id]
+        elif node.id in self.ctx.info.namespace:
+          obj = self.ctx.info.namespace[node.id]
           anno.setanno(node, 'live_val', obj)
           if hasattr(obj, '__name__'):
             anno.setanno(node, 'fqn', (obj.__name__,))
@@ -86,8 +87,8 @@ class LiveValueResolver(transformer.Base):
         def_, = defs
         # Note: param_of is a weakref.
         if def_.param_of and def_.param_of() is self.enclosing_entities[0]:
-          if node.id in self.entity_info.arg_values:
-            obj = self.entity_info.arg_values[node.id]
+          if node.id in self.ctx.info.arg_values:
+            obj = self.ctx.info.arg_values[node.id]
             anno.setanno(node, 'live_val', obj)
             anno.setanno(node, 'fqn', (obj.__class__.__name__,))
     return node
diff --git a/tensorflow/python/autograph/pyct/static_analysis/live_values_test.py b/tensorflow/python/autograph/pyct/static_analysis/live_values_test.py
index 882c380b7888250560e0bf69ca44c3e7f4264979..a8d4e25e3c6f221ad13cb62ebadb54b8c86e665c 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/live_values_test.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/live_values_test.py
@@ -51,12 +51,13 @@ class LiveValuesResolverTest(test.TestCase):
         owner_type=None)
     node = qual_names.resolve(node)
     graphs = cfg.build(node)
-    node = activity.resolve(node, entity_info)
-    node = reaching_definitions.resolve(node, entity_info, graphs,
+    ctx = transformer.Context(entity_info)
+    node = activity.resolve(node, ctx)
+    node = reaching_definitions.resolve(node, ctx, graphs,
                                         reaching_definitions.Definition)
-    node = live_values.resolve(node, entity_info, literals)
-    node = type_info.resolve(node, entity_info)
-    node = live_values.resolve(node, entity_info, literals)
+    node = live_values.resolve(node, ctx, literals)
+    node = type_info.resolve(node, ctx)
+    node = live_values.resolve(node, ctx, literals)
     return node
 
   def test_literals(self):
diff --git a/tensorflow/python/autograph/pyct/static_analysis/liveness.py b/tensorflow/python/autograph/pyct/static_analysis/liveness.py
index f8b8d7fa77c167e0ebf96dd533e3c42b0c30b8e5..ad567a0a4fc97e246461274f33fa403634638ed8 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/liveness.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/liveness.py
@@ -144,10 +144,10 @@ class WholeTreeAnalyzer(transformer.Base):
     self.current_analyzer = parent_analyzer
     return node
 
-  def visit_nonlocal(self, node):
+  def visit_Nonlocal(self, node):
     raise NotImplementedError()
 
-  def visit_global(self, node):
+  def visit_Global(self, node):
     raise NotImplementedError()
 
 
@@ -219,6 +219,10 @@ class Annotator(transformer.Base):
                  frozenset(self.current_analyzer.out[cfg_node]))
     return node
 
+  def visit_ExceptHandler(self, node):
+    # TODO(b/123995141) Add Exception Handlers to the CFG
+    return node
+
 
 def resolve(node, source_info, graphs):
   """Resolves the live symbols at the exit of control flow statements.
diff --git a/tensorflow/python/autograph/pyct/static_analysis/liveness_test.py b/tensorflow/python/autograph/pyct/static_analysis/liveness_test.py
index 4366808d4962394b98cb3d939abed9666899a6d3..f14b1a3e79de80d2218366e086d649fa5493be4f 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/liveness_test.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/liveness_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import six
+
 from tensorflow.python.autograph.pyct import anno
 from tensorflow.python.autograph.pyct import cfg
 from tensorflow.python.autograph.pyct import parser
@@ -40,9 +42,10 @@ class LivenessTest(test.TestCase):
         arg_types=None,
         owner_type=None)
     node = qual_names.resolve(node)
-    node = activity.resolve(node, entity_info)
+    ctx = transformer.Context(entity_info)
+    node = activity.resolve(node, ctx)
     graphs = cfg.build(node)
-    liveness.resolve(node, entity_info, graphs)
+    liveness.resolve(node, ctx, graphs)
     return node
 
   def assertHasLiveOut(self, node, expected):
@@ -242,6 +245,62 @@ class LivenessTest(test.TestCase):
 
     self.assertHasLiveIn(fn_body[0], ('a', 'x', 'y'))
 
+  def test_live_in_generator_comprehension(self):
+
+    def test_fn(y):
+      if all(x for x in y):
+        return
+
+    node = self._parse_and_analyze(test_fn)
+    fn_body = node.body[0].body
+
+    if six.PY2:
+      self.assertHasLiveIn(fn_body[0], ('all', 'x', 'y'))
+    else:
+      self.assertHasLiveIn(fn_body[0], ('all', 'y'))
+
+  def test_live_in_list_comprehension(self):
+
+    def test_fn(y):
+      if [x for x in y]:
+        return
+
+    node = self._parse_and_analyze(test_fn)
+    fn_body = node.body[0].body
+
+    if six.PY2:
+      self.assertHasLiveIn(fn_body[0], ('x', 'y'))
+    else:
+      self.assertHasLiveIn(fn_body[0], ('y',))
+
+  def test_live_in_set_comprehension(self):
+
+    def test_fn(y):
+      if {x for x in y}:
+        return
+
+    node = self._parse_and_analyze(test_fn)
+    fn_body = node.body[0].body
+
+    if six.PY2:
+      self.assertHasLiveIn(fn_body[0], ('x', 'y'))
+    else:
+      self.assertHasLiveIn(fn_body[0], ('y',))
+
+  def test_live_in_dict_comprehension(self):
+
+    def test_fn(y):
+      if {k: v for k, v in y}:
+        return
+
+    node = self._parse_and_analyze(test_fn)
+    fn_body = node.body[0].body
+
+    if six.PY2:
+      self.assertHasLiveIn(fn_body[0], ('k', 'v', 'y'))
+    else:
+      self.assertHasLiveIn(fn_body[0], ('y',))
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions.py b/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions.py
index d1587d81780780f56ab0ec1fb0dbb9942a3d4539..ce6f3c528477713bb3ac04af00baffb9a1b7a145 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions.py
@@ -217,12 +217,16 @@ class TreeAnnotator(transformer.Base):
 
     return node
 
-  def visit_nonlocal(self, node):
+  def visit_Nonlocal(self, node):
     raise NotImplementedError()
 
-  def visit_global(self, node):
+  def visit_Global(self, node):
     raise NotImplementedError()
 
+  def visit_ExceptHandler(self, node):
+    # TODO(b/123995141) Add Exception Handlers to the CFG
+    return node
+
   def visit_Name(self, node):
     if self.current_analyzer is None:
       # Names may appear outside function defs - for example in class
@@ -232,7 +236,8 @@ class TreeAnnotator(transformer.Base):
     analyzer = self.current_analyzer
     cfg_node = self.current_cfg_node
 
-    assert cfg_node is not None, 'name node outside of any statement?'
+    assert cfg_node is not None, ('name node, %s, outside of any statement?'
+                                  % node.id)
 
     qn = anno.getanno(node, anno.Basic.QN)
     if isinstance(node.ctx, gast.Load):
diff --git a/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions_test.py b/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions_test.py
index 8c0d51850770e90c6755951e4ca5b01bb0987c51..848c5460e6565281216a1b529060914288ff4572 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions_test.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import six
+
 from tensorflow.python.autograph.pyct import anno
 from tensorflow.python.autograph.pyct import cfg
 from tensorflow.python.autograph.pyct import parser
@@ -40,9 +42,10 @@ class DefinitionInfoTest(test.TestCase):
         arg_types=None,
         owner_type=None)
     node = qual_names.resolve(node)
-    node = activity.resolve(node, entity_info)
+    ctx = transformer.Context(entity_info)
+    node = activity.resolve(node, ctx)
     graphs = cfg.build(node)
-    node = reaching_definitions.resolve(node, entity_info, graphs,
+    node = reaching_definitions.resolve(node, ctx, graphs,
                                         reaching_definitions.Definition)
     return node
 
@@ -293,6 +296,24 @@ class DefinitionInfoTest(test.TestCase):
     self.assertNotSameDef(source, target)
     self.assertSameDef(target, retval)
 
+  def test_comprehension_leaking(self):
+
+    def test_fn(a):
+      all(x for x in a)
+      return x  # pylint:disable=undefined-variable
+
+    node = self._parse_and_analyze(test_fn)
+    fn_body = node.body[0].body
+
+    listcomp_target = fn_body[0].value.args[0].generators[0].target
+    retval = fn_body[1].value
+
+    # Python2 leaks comprehension symbols. Python3 doesn't.
+    if six.PY2:
+      self.assertSameDef(retval, listcomp_target)
+    else:
+      self.assertHasDefs(retval, 0)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/autograph/pyct/static_analysis/type_info.py b/tensorflow/python/autograph/pyct/static_analysis/type_info.py
index edb2ef0e274c53136560ce508bfa862781e380b8..68a53661d3701960f56033edfb75fabc2a6d6956 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/type_info.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/type_info.py
@@ -45,6 +45,7 @@ import gast
 
 from tensorflow.python.autograph.pyct import anno
 from tensorflow.python.autograph.pyct import ast_util
+from tensorflow.python.autograph.pyct import inspect_utils
 from tensorflow.python.autograph.pyct import transformer
 from tensorflow.python.util import tf_inspect
 
@@ -141,10 +142,11 @@ class TypeInfoResolver(transformer.Base):
     arg_name = str(qn)
     self.scope.setval(qn, arg_node)
     if (len(self.enclosing_entities) == 1 and
-        arg_name in self.entity_info.arg_types):
+        arg_name in self.ctx.info.arg_types):
       # Forge a node to hold the type information, so that method calls on
       # it can resolve the type.
-      type_string, type_obj = self.entity_info.arg_types[arg_name]
+      type_string, type_obj = self.ctx.info.arg_types[
+          arg_name]
       anno.setanno(arg_node, 'type', type_obj)
       anno.setanno(arg_node, 'type_fqn', tuple(type_string.split('.')))
 
@@ -177,7 +179,8 @@ class TypeInfoResolver(transformer.Base):
       func = value.func
       if anno.hasanno(func, 'live_val'):
         func_obj = anno.getanno(func, 'live_val')
-        if tf_inspect.isclass(func_obj):
+        if (tf_inspect.isclass(func_obj) and
+            not inspect_utils.isbuiltin(func_obj)):
           anno.setanno(value, 'is_constructor', True)
           anno.setanno(value, 'type', func_obj)
           anno.setanno(value, 'type_fqn', anno.getanno(func, 'fqn'))
diff --git a/tensorflow/python/autograph/pyct/static_analysis/type_info_test.py b/tensorflow/python/autograph/pyct/static_analysis/type_info_test.py
index 34ba3d2f13889273ac9351b6194a46762a4ac39b..c6cf91e06207e739868282d1d0e7c2aa6cb51b62 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/type_info_test.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/type_info_test.py
@@ -72,12 +72,13 @@ class TypeInfoResolverTest(test.TestCase):
         owner_type=None)
     node = qual_names.resolve(node)
     graphs = cfg.build(node)
-    node = activity.resolve(node, entity_info)
-    node = reaching_definitions.resolve(node, entity_info, graphs,
+    ctx = transformer.Context(entity_info)
+    node = activity.resolve(node, ctx)
+    node = reaching_definitions.resolve(node, ctx, graphs,
                                         reaching_definitions.Definition)
-    node = live_values.resolve(node, entity_info, {})
-    node = type_info.resolve(node, entity_info)
-    node = live_values.resolve(node, entity_info, {})
+    node = live_values.resolve(node, ctx, {})
+    node = type_info.resolve(node, ctx)
+    node = live_values.resolve(node, ctx, {})
     return node
 
   def test_constructor_detection(self):
@@ -88,11 +89,22 @@ class TypeInfoResolverTest(test.TestCase):
 
     node = self._parse_and_analyze(test_fn, {'training': training})
     call_node = node.body[0].body[0].value
+    self.assertTrue(anno.getanno(call_node, 'is_constructor'))
     self.assertEquals(training.GradientDescentOptimizer,
                       anno.getanno(call_node, 'type'))
     self.assertEquals((training.__name__, 'GradientDescentOptimizer'),
                       anno.getanno(call_node, 'type_fqn'))
 
+  def test_constructor_detection_builtin_class(self):
+
+    def test_fn(x):
+      res = zip(x)
+      return res
+
+    node = self._parse_and_analyze(test_fn, {})
+    call_node = node.body[0].body[0].value
+    self.assertFalse(anno.hasanno(call_node, 'is_constructor'))
+
   def test_class_members_of_detected_constructor(self):
 
     def test_fn():
diff --git a/tensorflow/python/autograph/pyct/templates.py b/tensorflow/python/autograph/pyct/templates.py
index 2272ea42086ff726eaf02f8fccacc6b661d6207e..b682a21bec16bcfae4c873dcd9c6ab8f0f3eb73b 100644
--- a/tensorflow/python/autograph/pyct/templates.py
+++ b/tensorflow/python/autograph/pyct/templates.py
@@ -91,6 +91,18 @@ class ContextAdjuster(gast.NodeTransformer):
     self._ctx_override = None
     return self.generic_visit(node)
 
+  def visit_comprehension(self, node):
+    # We may be able to override some of these, but for now it's simpler
+    # to just assert that they're set.
+    self._ctx_override = None
+    return self.generic_visit(node)
+
+  def visit_Lambda(self, node):
+    # We may be able to override some of these, but for now it's simpler
+    # to just assert that they're set.
+    self._ctx_override = None
+    return self.generic_visit(node)
+
 
 class ReplaceTransformer(gast.NodeTransformer):
   """Replace AST nodes."""
@@ -108,6 +120,7 @@ class ReplaceTransformer(gast.NodeTransformer):
         anno.Basic.ORIGIN,
         anno.Basic.SKIP_PROCESSING,
         anno.Static.ORIG_DEFINITIONS,
+        'extra_test',
     }
 
   def _prepare_replacement(self, replaced, key):
@@ -184,6 +197,9 @@ class ReplaceTransformer(gast.NodeTransformer):
 
     new_nodes = self._prepare_replacement(node, node.id)
 
+    if not new_nodes:
+      return new_nodes
+
     # Preserve the target context.
     adjuster = ContextAdjuster(type(node.ctx))
     for n in new_nodes:
diff --git a/tensorflow/python/autograph/pyct/templates_test.py b/tensorflow/python/autograph/pyct/templates_test.py
index cdb44b822e84ad5822c78d50c2f958b1fba9ec18..4762aaf3ff68391bf4cfdee46ba88ff69cd7e8c0 100644
--- a/tensorflow/python/autograph/pyct/templates_test.py
+++ b/tensorflow/python/autograph/pyct/templates_test.py
@@ -238,6 +238,26 @@ class TemplatesTest(test.TestCase):
     source = parser.parse_expression('[a(b(1))]')
     templates.replace_as_expression(template, bar=source)
 
+  def test_star_comprehension_in_function_call(self):
+    template = """
+      a = foo(func, args)
+    """
+    source = parser.parse_expression('bar(*[i for i in range(j)])')
+    node = templates.replace(template, func=source.func, args=source.args)
+    arg_node = node[0].value.args[1].value
+    self.assertIsInstance(arg_node.generators[0].target.ctx, gast.Store)
+    self.assertIsInstance(arg_node.elt.ctx, gast.Load)
+
+  def test_lambda_in_function_call(self):
+    template = """
+      a = foo(arg)
+    """
+    source = parser.parse_expression('[lambda i: i]')
+    node = templates.replace(template, arg=source)
+    lambda_arg = node[0].value.args[0].elts[0]
+    self.assertIsInstance(lambda_arg.args.args[0].ctx, gast.Param)
+    self.assertIsInstance(lambda_arg.body.ctx, gast.Load)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/autograph/pyct/transformer.py b/tensorflow/python/autograph/pyct/transformer.py
index b6830534b3dbf2e2815957b26d715d24dc002da7..d8e093c3eb1bab8d5e16972fd5d0d70637a6aa61 100644
--- a/tensorflow/python/autograph/pyct/transformer.py
+++ b/tensorflow/python/autograph/pyct/transformer.py
@@ -18,10 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import sys
 
 import gast
-import six
 
 from tensorflow.python.autograph.pyct import anno
 from tensorflow.python.autograph.pyct import compiler
@@ -29,21 +27,36 @@ from tensorflow.python.autograph.pyct import pretty_printer
 from tensorflow.python.autograph.pyct import templates
 
 
-class AutographParseError(SyntaxError):
-  pass
+# TODO(znado): Use namedtuple.
+class Context(object):
+  """Contains information about a source code transformation.
+
+  This object is mutable, and is updated during conversion. Not thread safe.
+
+  Attributes:
+    info: EntityInfo, immutable.
+    current_origin: origin_info.OriginInfo, holds the OriginInfo of the last
+      AST node to be processed successfully. Useful for error handling.
+  """
+
+  def __init__(self, info):
+    self.info = info
+    self.current_origin = None
 
 
 # TODO(mdan): Use namedtuple.
 class EntityInfo(object):
-  """Contains information about a Python entity. Immutable.
+  """Contains information about a Python entity.
+
+  Immutable.
 
   Examples of entities include functions and classes.
 
   Attributes:
     source_code: The entity's source code.
     source_file: The entity's source file.
-    namespace: Dict[str, ], containing symbols visible to the entity
-        (excluding parameters).
+    namespace: Dict[str, ], containing symbols visible to the entity (excluding
+      parameters).
     arg_values: dict[str->*], containing parameter values, if known.
     arg_types: dict[str->*], containing parameter types, if known.
     owner_type: The surrounding class type of the function, if present.
@@ -198,17 +211,17 @@ class Base(gast.NodeTransformer):
 
   # TODO(mdan): Document all extra features.
 
-  def __init__(self, entity_info):
-    """Initialize the transformer. Subclasses should call this.
+  def __init__(self, ctx):
+    """Initialize the transformer.
+
+    Subclasses should call this.
 
     Args:
-      entity_info: An EntityInfo object.
+      ctx: A Context object.
     """
-    self._current_origin = None
     self._lineno = 0
     self._col_offset = 0
-    # TODO(znado): remove this from the constructor of all Transformers.
-    self.entity_info = entity_info
+    self.ctx = ctx
     self._enclosing_entities = []
 
     # A stack that allows keeping mutable, scope-local state where scopes may be
@@ -232,13 +245,15 @@ class Base(gast.NodeTransformer):
     return len(self._local_scope_state)
 
   def enter_local_scope(self, inherit=None):
-    """Deprecated. Use self.state instead.
+    """Deprecated.
+
+    Use self.state instead.
 
     Marks entry into a new local scope.
 
     Args:
-      inherit: Optional enumerable of variable names to copy from the
-          parent scope.
+      inherit: Optional enumerable of variable names to copy from the parent
+        scope.
     """
     scope_entered = {}
     if inherit:
@@ -249,13 +264,15 @@ class Base(gast.NodeTransformer):
     self._local_scope_state.append(scope_entered)
 
   def exit_local_scope(self, keep=None):
-    """Deprecated. Use self.state instead.
+    """Deprecated.
+
+    Use self.state instead.
 
     Marks exit from the current local scope.
 
     Args:
-      keep: Optional enumerable of variable names to copy into the
-          parent scope.
+      keep: Optional enumerable of variable names to copy into the parent scope.
+
     Returns:
       A dict containing the scope that has just been exited.
     """
@@ -276,11 +293,17 @@ class Base(gast.NodeTransformer):
     return self._local_scope_state[-1].get(name, default)
 
   def debug_print(self, node):
-    """Helper method useful for debugging."""
+    """Helper method useful for debugging. Prints the AST."""
     if __debug__:
       print(pretty_printer.fmt(node))
     return node
 
+  def debug_print_src(self, node):
+    """Helper method useful for debugging. Prints the AST as code."""
+    if __debug__:
+      print(compiler.ast_to_source(node))
+    return node
+
   def create_assignment(self, target, expression):
     template = """
       target = expression
@@ -390,11 +413,11 @@ class Base(gast.NodeTransformer):
 
     Args:
       targets: list, tuple of or individual AST node. Should be used with the
-          targets field of an ast.Assign node.
+        targets field of an ast.Assign node.
       values: an AST node.
       apply_fn: a function of a single argument, which will be called with the
-          respective nodes of each single assignment. The signature is
-          apply_fn(target, value), no return value.
+        respective nodes of each single assignment. The signature is
+        apply_fn(target, value), no return value.
     """
     if not isinstance(targets, (list, tuple)):
       targets = (targets,)
@@ -429,75 +452,54 @@ class Base(gast.NodeTransformer):
       # call `visit`.  The error needs to be raised before the exception handler
       # below is installed, because said handler will mess up if `node` is not,
       # in fact, a node.
-      msg = (
-          'invalid value for "node": expected "ast.AST", got "{}"; to'
-          ' visit lists of nodes, use "visit_block" instead').format(type(node))
+      msg = ('invalid value for "node": expected "ast.AST", got "{}"; to'
+             ' visit lists of nodes, use "visit_block" instead').format(
+                 type(node))
       raise ValueError(msg)
 
     did_enter_function = False
     local_scope_size_at_entry = len(self._local_scope_state)
     processing_expr_node = False
 
-    try:
-      parent_origin = self._current_origin
-      if isinstance(node, (gast.FunctionDef, gast.ClassDef, gast.Lambda)):
-        did_enter_function = True
-      elif isinstance(node, gast.Expr):
-        processing_expr_node = True
-
-      if did_enter_function:
-        self._enclosing_entities.append(node)
-
-      if anno.hasanno(node, anno.Basic.ORIGIN):
-        self._current_origin = anno.getanno(node, anno.Basic.ORIGIN)
-
-      if processing_expr_node:
-        entry_expr_value = node.value
-
-      if not anno.hasanno(node, anno.Basic.SKIP_PROCESSING):
-        result = super(Base, self).visit(node)
-      self._current_origin = parent_origin
-
-      # Adjust for consistency: replacing the value of an Expr with
-      # an Assign node removes the need for the Expr node.
-      if processing_expr_node:
-        if isinstance(result, gast.Expr) and result.value != entry_expr_value:
-          # When the replacement is a list, it is assumed that the list came
-          # from a template that contained a number of statements, which
-          # themselves are standalone and don't require an enclosing Expr.
-          if isinstance(result.value,
-                        (list, tuple, gast.Assign, gast.AugAssign)):
-            result = result.value
-
-      # On exception, the local scope integrity is not guaranteed.
-      if did_enter_function:
-        self._enclosing_entities.pop()
-
-      if local_scope_size_at_entry != len(self._local_scope_state):
-        raise AssertionError(
-            'Inconsistent local scope stack. Before entering node %s, the'
-            ' stack had length %d, after exit it has length %d. This'
-            ' indicates enter_local_scope and exit_local_scope are not'
-            ' well paired.' % (node, local_scope_size_at_entry,
-                               len(self._local_scope_state)))
-      return result
-
-    except (ValueError, AttributeError, KeyError, NotImplementedError) as e:
-      if not self._current_origin:
-        raise e
-      original_file_path = self._current_origin.loc.filename
-      original_line_number = self._current_origin.loc.lineno
-      original_col_offset = self._current_origin.loc.col_offset
-      original_source_line = self._current_origin.source_code_line
-      msg = '%s: %s.' % (e.__class__.__name__, str(e))
-
-      # TODO(mdan): Avoid the printing of the original exception.
-      # In other words, we need to find how to suppress the "During handling
-      # of the above exception, another exception occurred" message.
-      six.reraise(
-          AutographParseError,
-          AutographParseError(msg, (original_file_path, original_line_number,
-                                    original_col_offset, original_source_line)),
-          sys.exc_info()[2])
-    finally:
-      self._current_origin = parent_origin
+    parent_origin = self.ctx.current_origin
+    if isinstance(node, (gast.FunctionDef, gast.ClassDef, gast.Lambda)):
+      did_enter_function = True
+    elif isinstance(node, gast.Expr):
+      processing_expr_node = True
+
+    if did_enter_function:
+      self._enclosing_entities.append(node)
+
+    if anno.hasanno(node, anno.Basic.ORIGIN):
+      self.ctx.current_origin = anno.getanno(node, anno.Basic.ORIGIN)
+
+    if processing_expr_node:
+      entry_expr_value = node.value
+
+    if not anno.hasanno(node, anno.Basic.SKIP_PROCESSING):
+      result = super(Base, self).visit(node)
+    self.ctx.current_origin = parent_origin
+
+    # Adjust for consistency: replacing the value of an Expr with
+    # an Assign node removes the need for the Expr node.
+    if processing_expr_node:
+      if isinstance(result, gast.Expr) and result.value != entry_expr_value:
+        # When the replacement is a list, it is assumed that the list came
+        # from a template that contained a number of statements, which
+        # themselves are standalone and don't require an enclosing Expr.
+        if isinstance(result.value,
+                      (list, tuple, gast.Assign, gast.AugAssign)):
+          result = result.value
+
+    # On exception, the local scope integrity is not guaranteed.
+    if did_enter_function:
+      self._enclosing_entities.pop()
+
+    if local_scope_size_at_entry != len(self._local_scope_state):
+      raise AssertionError(
+          'Inconsistent local scope stack. Before entering node %s, the'
+          ' stack had length %d, after exit it has length %d. This'
+          ' indicates enter_local_scope and exit_local_scope are not'
+          ' well paired.' % (node, local_scope_size_at_entry,
+                             len(self._local_scope_state)))
+    return result
diff --git a/tensorflow/python/autograph/pyct/transformer_test.py b/tensorflow/python/autograph/pyct/transformer_test.py
index 0c68d2a7648ccd3f44fb53db994bd0bb94a813eb..d97c1f0766a842f490d4874870441ad584ba22b2 100644
--- a/tensorflow/python/autograph/pyct/transformer_test.py
+++ b/tensorflow/python/autograph/pyct/transformer_test.py
@@ -28,14 +28,15 @@ from tensorflow.python.platform import test
 
 class TransformerTest(test.TestCase):
 
-  def _simple_source_info(self):
-    return transformer.EntityInfo(
+  def _simple_context(self):
+    entity_info = transformer.EntityInfo(
         source_code=None,
         source_file=None,
         namespace=None,
         arg_values=None,
         arg_types=None,
         owner_type=None)
+    return transformer.Context(entity_info)
 
   def test_entity_scope_tracking(self):
 
@@ -52,7 +53,7 @@ class TransformerTest(test.TestCase):
         anno.setanno(node, 'enclosing_entities', self.enclosing_entities)
         return self.generic_visit(node)
 
-    tr = TestTransformer(self._simple_source_info())
+    tr = TestTransformer(self._simple_context())
 
     def test_function():
       a = 0
@@ -126,7 +127,7 @@ class TransformerTest(test.TestCase):
         self.state[CondState].exit()
         return node
 
-    tr = TestTransformer(self._simple_source_info())
+    tr = TestTransformer(self._simple_context())
 
     def test_function(a):
       a = 1
@@ -192,7 +193,7 @@ class TransformerTest(test.TestCase):
       def visit_For(self, node):
         return self._annotate_result(node)
 
-    tr = TestTransformer(self._simple_source_info())
+    tr = TestTransformer(self._simple_context())
 
     def test_function(a):
       """Docstring."""
@@ -231,7 +232,7 @@ class TransformerTest(test.TestCase):
         self.exit_local_scope()
         return node
 
-    tr = TestTransformer(self._simple_source_info())
+    tr = TestTransformer(self._simple_context())
 
     def no_exit(a):
       if a > 0:
@@ -270,7 +271,7 @@ class TransformerTest(test.TestCase):
       z = y
       return z
 
-    tr = TestTransformer(self._simple_source_info())
+    tr = TestTransformer(self._simple_context())
 
     node, _ = parser.parse_entity(test_function)
     node = tr.visit(node)
@@ -301,7 +302,7 @@ class TransformerTest(test.TestCase):
       if x > 0:
         return x
 
-    tr = BrokenTransformer(self._simple_source_info())
+    tr = BrokenTransformer(self._simple_context())
 
     node, _ = parser.parse_entity(test_function)
     with self.assertRaises(ValueError) as cm:
@@ -332,7 +333,7 @@ class TransformerTest(test.TestCase):
       if x > 0:
         return x
 
-    tr = BrokenTransformer(self._simple_source_info())
+    tr = BrokenTransformer(self._simple_context())
 
     node, _ = parser.parse_entity(test_function)
     with self.assertRaises(ValueError) as cm:
diff --git a/tensorflow/python/autograph/utils/BUILD b/tensorflow/python/autograph/utils/BUILD
index 790c661661dabab7c5e1d5dd097a60562c8cc358..f5e0dbf00bf5ce35ae049755b32b47d12e5c9960 100644
--- a/tensorflow/python/autograph/utils/BUILD
+++ b/tensorflow/python/autograph/utils/BUILD
@@ -20,6 +20,7 @@ py_library(
     name = "utils",
     srcs = [
         "__init__.py",
+        "ag_logging.py",
         "context_managers.py",
         "misc.py",
         "py_func.py",
@@ -33,7 +34,9 @@ py_library(
     deps = [
         "//tensorflow/python:dtypes",
         "//tensorflow/python:list_ops",
+        "//tensorflow/python:platform",
         "//tensorflow/python:script_ops",
+        "//tensorflow/python:util",
         "//tensorflow/python/autograph/pyct",
         "//tensorflow/python/data/ops:dataset_ops",
         "@six_archive//:six",
diff --git a/tensorflow/python/autograph/utils/ag_logging.py b/tensorflow/python/autograph/utils/ag_logging.py
new file mode 100644
index 0000000000000000000000000000000000000000..8229b828d305dc5acd8e61ceacb325d3f681487f
--- /dev/null
+++ b/tensorflow/python/autograph/utils/ag_logging.py
@@ -0,0 +1,144 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Logging and debugging utilities."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+
+# TODO(mdan): Use a custom logger class.
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
+
+VERBOSITY_VAR_NAME = 'AUTOGRAPH_VERBOSITY'
+DEFAULT_VERBOSITY = 0
+
+verbosity_level = None  # vlog-like. Takes precedence over the env variable.
+echo_log_to_stdout = False
+
+# In interactive Python, logging echo is enabled by default.
+if hasattr(sys, 'ps1') or hasattr(sys, 'ps2'):
+  echo_log_to_stdout = True
+
+
+@tf_export('autograph.set_verbosity')
+def set_verbosity(level, alsologtostdout=False):
+  """Sets the AutoGraph verbosity level.
+
+  _Debug logging in AutoGraph_
+
+  More verbose logging is useful to enable when filing bug reports or doing
+  more in-depth debugging.
+
+  There are two controls that control the logging verbosity:
+
+   * The `set_verbosity` function
+
+   * The `AUTOGRAPH_VERBOSITY` environment variable
+
+  `set_verbosity` takes precedence over the environment variable.
+
+  For example:
+
+  ```python
+  import os
+  import tensorflow as tf
+
+  os.environ['AUTOGRAPH_VERBOSITY'] = 5
+  # Verbosity is now 5
+
+  tf.autograph.set_verbosity(0)
+  # Verbosity is now 0
+
+  os.environ['AUTOGRAPH_VERBOSITY'] = 1
+  # No effect, because set_verbosity was already called.
+  ```
+
+  Logs entries are output to [absl](https://abseil.io)'s default output,
+  with `INFO` level.
+  Logs can be mirrored to stdout by using the `alsologtostdout` argument.
+  Mirroring is enabled by default when Python runs in interactive mode.
+
+  Args:
+    level: int, the verbosity level; larger values specify increased verbosity;
+      0 means no logging. When reporting bugs, it is recommended to set this
+      value to a larges number, like 10.
+    alsologtostdout: bool, whether to also output log messages to `sys.stdout`.
+  """
+  global verbosity_level
+  global echo_log_to_stdout
+  verbosity_level = level
+  echo_log_to_stdout = alsologtostdout
+
+
+@tf_export('autograph.trace')
+def trace(*args):
+  """Traces argument information at compilation time.
+
+  `trace` is useful when debugging, and it always executes during the tracing
+  phase, that is, when the TF graph is constructed.
+
+  _Example usage_
+
+  ```python
+  import tensorflow as tf
+
+  for i in tf.range(10):
+    tf.autograph.trace(i)
+  # Output: <Tensor ...>
+  ```
+
+  Args:
+    *args: Arguments to print to `sys.stdout`.
+  """
+  print(*args)
+
+
+def get_verbosity():
+  global verbosity_level
+  if verbosity_level is not None:
+    return verbosity_level
+  return int(os.getenv(VERBOSITY_VAR_NAME, DEFAULT_VERBOSITY))
+
+
+def has_verbosity(level):
+  return get_verbosity() >= level
+
+
+def error(level, msg, *args, **kwargs):
+  if has_verbosity(level):
+    logging.error(msg, *args, **kwargs)
+    if echo_log_to_stdout:
+      print(msg % args)
+
+
+def log(level, msg, *args, **kwargs):
+  if has_verbosity(level):
+    logging.info(msg, *args, **kwargs)
+    if echo_log_to_stdout:
+      print(msg % args)
+
+
+def warn(msg, *args, **kwargs):
+  logging.warn(msg, *args, **kwargs)
+  if echo_log_to_stdout:
+    print('WARNING:', msg % args)
+
+
+def warn_first_n(msg, *args, **kwargs):
+  logging.log_first_n(logging.WARN, msg, *args, **kwargs)
diff --git a/tensorflow/python/autograph/utils/misc.py b/tensorflow/python/autograph/utils/misc.py
index 1b06caf0bdeb6f4a079e33f2e887d2dca017adc2..046e6cf97dcd40cea4f1601cf8e69259559f7adf 100644
--- a/tensorflow/python/autograph/utils/misc.py
+++ b/tensorflow/python/autograph/utils/misc.py
@@ -23,7 +23,7 @@ from tensorflow.python.ops import array_ops
 
 
 def alias_tensors(*args):
-  """Wrap any Tensor arguments with an identity op.
+  """Wraps any Tensor arguments with an identity op.
 
   Any other argument, including Variables, is returned unchanged.
 
@@ -48,3 +48,10 @@ def alias_tensors(*args):
     return alias_if_tensor(args[0])
 
   raise ValueError('at least one argument required')
+
+
+def capitalize_initial(s):
+  """Capitalizes the initial of a string only."""
+  if s:
+    return s[0].upper() + s[1:]
+  return s
diff --git a/tensorflow/python/autograph/utils/misc_test.py b/tensorflow/python/autograph/utils/misc_test.py
index c78df48d6263b121076c86198670222441e7fec7..24b5753a91a035da9edd6c8cba431a063fc3c8d6 100644
--- a/tensorflow/python/autograph/utils/misc_test.py
+++ b/tensorflow/python/autograph/utils/misc_test.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.autograph.utils.misc import alias_tensors
+from tensorflow.python.autograph.utils import misc
 from tensorflow.python.framework import test_util
 from tensorflow.python.framework.constant_op import constant
 from tensorflow.python.ops.variables import Variable
@@ -27,11 +27,20 @@ from tensorflow.python.platform import test
 
 class MiscTest(test.TestCase):
 
+  def test_capitalize_initial(self):
+    self.assertEqual('', misc.capitalize_initial(''))
+    self.assertEqual('A', misc.capitalize_initial('A'))
+    self.assertEqual('Ab', misc.capitalize_initial('Ab'))
+    self.assertEqual('AbC', misc.capitalize_initial('AbC'))
+    self.assertEqual('A', misc.capitalize_initial('a'))
+    self.assertEqual('Ab', misc.capitalize_initial('ab'))
+    self.assertEqual('AbC', misc.capitalize_initial('abC'))
+
   @test_util.run_deprecated_v1
   def test_alias_single_tensor(self):
     a = constant(1)
 
-    new_a = alias_tensors(a)
+    new_a = misc.alias_tensors(a)
     self.assertFalse(new_a is a)
     with self.cached_session() as sess:
       self.assertEqual(1, self.evaluate(new_a))
@@ -43,7 +52,7 @@ class MiscTest(test.TestCase):
     s = 'a'
     l = [1, 2, 3]
 
-    new_a, new_v, new_s, new_l = alias_tensors(a, v, s, l)
+    new_a, new_v, new_s, new_l = misc.alias_tensors(a, v, s, l)
 
     self.assertFalse(new_a is a)
     self.assertTrue(new_v is v)
diff --git a/tensorflow/python/build_defs.bzl b/tensorflow/python/build_defs.bzl
index b9056f86e6d0465a8521f054a459c06eb5aeb37c..244820f41a85778a01cd811d96c3e8228d8b7c8c 100644
--- a/tensorflow/python/build_defs.bzl
+++ b/tensorflow/python/build_defs.bzl
@@ -12,22 +12,26 @@ load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
 # consumers of the tf_gen_op_wrapper_py rule would be simplified if we don't
 # hard code the ops/ directory.
 
-def tf_gen_op_wrapper_private_py(name, out=None, deps=[],
-                                 require_shape_functions=True,
-                                 visibility=[]):
-  if not name.endswith("_gen"):
-    fail("name must end in _gen")
-  if not visibility:
-    visibility = ["//visibility:private"]
-  bare_op_name = name[:-4] # Strip off the _gen
-  tf_gen_op_wrapper_py(name=bare_op_name,
-    out=out,
-    visibility=visibility,
-    deps=deps,
-    require_shape_functions=require_shape_functions,
-    generated_target_name=name,
-    api_def_srcs = [
-        "//tensorflow/core/api_def:base_api_def",
-        "//tensorflow/core/api_def:python_api_def",
-    ],
-  )
+def tf_gen_op_wrapper_private_py(
+        name,
+        out = None,
+        deps = [],
+        require_shape_functions = True,
+        visibility = []):
+    if not name.endswith("_gen"):
+        fail("name must end in _gen")
+    if not visibility:
+        visibility = ["//visibility:private"]
+    bare_op_name = name[:-4]  # Strip off the _gen
+    tf_gen_op_wrapper_py(
+        name = bare_op_name,
+        out = out,
+        visibility = visibility,
+        deps = deps,
+        require_shape_functions = require_shape_functions,
+        generated_target_name = name,
+        api_def_srcs = [
+            "//tensorflow/core/api_def:base_api_def",
+            "//tensorflow/core/api_def:python_api_def",
+        ],
+    )
diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py
index 87a200ed336735f4b4abd9b0ac2352e36f7b84e4..4f3eb61d4fddbdad2758e0aef00727ede5d37b74 100644
--- a/tensorflow/python/client/session.py
+++ b/tensorflow/python/client/session.py
@@ -736,10 +736,11 @@ class BaseSession(SessionInterface):
     if self._session is not None:
       try:
         tf_session.TF_DeleteSession(self._session)
-      except AttributeError:
-        # At shutdown, `c_api_util` or `tf_session` may have been garbage
-        # collected, causing the above method calls to fail. In this case,
-        # silently leak since the program is about to terminate anyway.
+      except (AttributeError, TypeError):
+        # At shutdown, `c_api_util`, `tf_session`, or
+        # `tf_session.TF_DeleteSession` may have been garbage collected, causing
+        # the above method calls to fail. In this case, silently leak since the
+        # program is about to terminate anyway.
         pass
       self._session = None
 
@@ -1531,7 +1532,7 @@ class Session(BaseSession):
 
     If no `graph` argument is specified when constructing the session,
     the default graph will be launched in the session. If you are
-    using more than one graph (created with `tf.Graph()` in the same
+    using more than one graph (created with `tf.Graph()`) in the same
     process, you will have to use different sessions for each graph,
     but each graph can be used in multiple sessions. In this case, it
     is often clearer to pass the graph to be launched explicitly to
@@ -1589,7 +1590,21 @@ class Session(BaseSession):
     self._default_session_context_manager = None
     self._default_graph_context_manager = None
 
-    self.close()
+    # If we are closing due to an exception, set a time limit on our Close() to
+    # avoid blocking forever.
+    # TODO(b/120204635) remove this when deadlock is fixed.
+    if exec_type:
+      close_thread = threading.Thread(
+          name='SessionCloseThread', target=self.close)
+      close_thread.daemon = True
+      close_thread.start()
+      close_thread.join(30.0)
+      if close_thread.is_alive():
+        logging.error(
+            'Session failed to close after 30 seconds. Continuing after this '
+            'point may leave your program in an undefined state.')
+    else:
+      self.close()
 
   @staticmethod
   def reset(target, containers=None, config=None):
@@ -1674,7 +1689,7 @@ class InteractiveSession(BaseSession):
 
     If no `graph` argument is specified when constructing the session,
     the default graph will be launched in the session. If you are
-    using more than one graph (created with `tf.Graph()` in the same
+    using more than one graph (created with `tf.Graph()`) in the same
     process, you will have to use different sessions for each graph,
     but each graph can be used in multiple sessions. In this case, it
     is often clearer to pass the graph to be launched explicitly to
diff --git a/tensorflow/python/client/session_ref.cc b/tensorflow/python/client/session_ref.cc
index 4d361612b7624a23ff8c74de0d6d54bce8817139..6639cf506e0a2f3d53373959b47cf98e5fcb0887 100644
--- a/tensorflow/python/client/session_ref.cc
+++ b/tensorflow/python/client/session_ref.cc
@@ -109,21 +109,8 @@ class SessionLogger {
   }
 
   Status RecordNewSession(Session* session) {
-    LOG(INFO) << "New session discovered.  Capturing devices...";
     ReplayOp op;
     NewReplaySession* req = op.mutable_new_replay_session();
-
-    std::vector<DeviceAttributes> devices;
-    Status status = session->ListDevices(&devices);
-    if (status.ok()) {
-      LOG(INFO) << "Found: " << devices.size() << " devices.";
-      for (const DeviceAttributes& dev : devices) {
-        *req->mutable_devices()->add_local_device() = dev;
-      }
-    } else {
-      LOG(WARNING) << "Failed to list devices on session. Continuing.";
-    }
-
     req->set_session_handle(SessionToHandle(session));
     return Flush(op);
   }
diff --git a/tensorflow/python/client/session_test.py b/tensorflow/python/client/session_test.py
index c4a118a41406afc52586553b1d3f0b446005c46d..da6218663de8b02fcda3f3e67e68bb46e47e914a 100644
--- a/tensorflow/python/client/session_test.py
+++ b/tensorflow/python/client/session_test.py
@@ -2036,7 +2036,7 @@ class SessionTest(test_util.TensorFlowTestCase):
     with self.cached_session() as sess:
       a = array_ops.placeholder(dtype=dtypes.string)
       with self.assertRaisesRegexp(
-          TypeError, 'Type of feed value 1 with type <(\w+) \'int\'> is not'):
+          TypeError, r'Type of feed value 1 with type <(\w+) \'int\'> is not'):
         sess.run(a, feed_dict={a: 1})
 
 
diff --git a/tensorflow/python/client/tf_session.i b/tensorflow/python/client/tf_session.i
index ef7527d887f062621d1fb21511e08c5f7ea389c0..3b9677bf251cad98e1ed54403f93e9de2741e1b5 100644
--- a/tensorflow/python/client/tf_session.i
+++ b/tensorflow/python/client/tf_session.i
@@ -604,6 +604,27 @@ def TF_Reset(target, containers=None, config=None):
   }
 }
 
+// $input is a Python list of wrapped TF_Operations
+%typemap(in) (const std::vector<TF_Operation*>* control_outputs)
+    (std::vector<TF_Operation*> control_outputs) {
+  if ($input != Py_None) {
+    if (!PyList_Check($input)) {
+      SWIG_exception_fail(SWIG_TypeError, "$symname: expected list");
+    }
+    size_t size = PyList_Size($input);
+    for (int i = 0; i < size; ++i) {
+      PyObject* item = PyList_GetItem($input, i);
+      TF_Operation* oper_ptr;
+      SWIG_ConvertPtr(item, reinterpret_cast<void**>(&oper_ptr),
+                      $descriptor(TF_Operation*), 0);
+      control_outputs.push_back(oper_ptr);
+    }
+    $1 = &control_outputs;
+  } else {
+    $1 = nullptr;
+  }
+}
+
 // Typemaps for TF_GraphGetTensorShapeHelper.
 
 // Convert from C++ integer vector to Python list of ints.
diff --git a/tensorflow/python/client/tf_session_helper.cc b/tensorflow/python/client/tf_session_helper.cc
index dc0c10bab74635e240502e2f8e762b61e533b319..56b4eec98e314dd6474acec51b4208d5120f2fa4 100644
--- a/tensorflow/python/client/tf_session_helper.cc
+++ b/tensorflow/python/client/tf_session_helper.cc
@@ -590,7 +590,9 @@ TF_Function* TF_GraphToFunction_wrapper(
     const TF_Graph* fn_body, const char* fn_name, bool append_hash_to_fn_name,
     const std::vector<TF_Operation*>* opers,
     const std::vector<TF_Output>& inputs, const std::vector<TF_Output>& outputs,
-    const NameVector& output_names, const TF_FunctionOptions* opts,
+    const NameVector& output_names,
+    const std::vector<TF_Operation*>* control_outputs,
+    const NameVector& control_output_names, const TF_FunctionOptions* opts,
     const char* description, TF_Status* out_status) {
   if (!output_names.empty() && output_names.size() != outputs.size()) {
     Set_TF_Status_from_Status(
@@ -613,10 +615,18 @@ TF_Function* TF_GraphToFunction_wrapper(
       output_names.empty() ? nullptr
                            : const_cast<const char**>(output_names.data());
 
-  return TF_GraphToFunction(fn_body, fn_name, append_hash_to_fn_name, nopers,
-                            opers_array, inputs.size(), inputs.data(),
-                            outputs.size(), outputs.data(), output_names_ptr,
-                            opts, description, out_status);
+  const char** control_output_names_ptr =
+      control_output_names.empty()
+          ? nullptr
+          : const_cast<const char**>(control_output_names.data());
+
+  return TF_GraphToFunctionWithControlOutputs(
+      fn_body, fn_name, append_hash_to_fn_name, nopers, opers_array,
+      inputs.size(), inputs.data(), outputs.size(), outputs.data(),
+      output_names_ptr,
+      control_outputs == nullptr ? 0 : control_outputs->size(),
+      control_outputs == nullptr ? nullptr : control_outputs->data(),
+      control_output_names_ptr, opts, description, out_status);
 }
 
 void TF_GraphSetOutputHandleShapesAndTypes_wrapper(
diff --git a/tensorflow/python/client/tf_session_helper.h b/tensorflow/python/client/tf_session_helper.h
index dab7e71aac5a7f4cbf9f8825ad6dd5d3f556bd43..d2c7dc34d8d54f384a69954db37f7ba18b527197 100644
--- a/tensorflow/python/client/tf_session_helper.h
+++ b/tensorflow/python/client/tf_session_helper.h
@@ -208,7 +208,9 @@ TF_Function* TF_GraphToFunction_wrapper(
     const TF_Graph* fn_body, const char* fn_name, bool append_hash_to_fn_name,
     const std::vector<TF_Operation*>* opers,
     const std::vector<TF_Output>& inputs, const std::vector<TF_Output>& outputs,
-    const NameVector& output_names, const TF_FunctionOptions* opts,
+    const NameVector& output_names,
+    const std::vector<TF_Operation*>* control_outputs,
+    const NameVector& control_output_names, const TF_FunctionOptions* opts,
     const char* description, TF_Status* status);
 
 // Set the shapes and types for the output's handle.
diff --git a/tensorflow/python/compat/BUILD b/tensorflow/python/compat/BUILD
index 9f2ce8c676e77480106c525bdc9c6440c599acec..87dd5d7f669f2f1cfe8fb5068a96dbdab62897d4 100644
--- a/tensorflow/python/compat/BUILD
+++ b/tensorflow/python/compat/BUILD
@@ -4,13 +4,23 @@ exports_files(["LICENSE"])
 
 load("//tensorflow:tensorflow.bzl", "tf_py_test")
 
+py_library(
+    name = "v2_compat",
+    srcs = ["v2_compat.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/python:tf2",
+        "//tensorflow/python:util",
+    ],
+)
+
 py_library(
     name = "compat",
     srcs = ["compat.py"],
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:internal"],
     deps = [
-        "//tensorflow/python:tf2",
         "//tensorflow/python:util",
     ],
 )
@@ -24,3 +34,14 @@ tf_py_test(
         "//tensorflow/python:client_testlib",
     ],
 )
+
+tf_py_test(
+    name = "disable_v2_behavior_test",
+    size = "small",
+    srcs = ["disable_v2_behavior_test.py"],
+    additional_deps = [
+        ":v2_compat",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:client_testlib",
+    ],
+)
diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index 51cd68436add963e4a08d9ed7ad43400f27b83f0..c1df5c4f8b0295fbb73928083ee16c1f167a3bac 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -24,15 +24,10 @@ from __future__ import print_function
 
 import datetime
 
-from tensorflow.python import tf2
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import variable_scope
-
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util.tf_export import tf_export
 
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 12, 9)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2019, 2, 21)
 
 
 @tf_export("compat.forward_compatible")
@@ -138,40 +133,3 @@ def forward_compatibility_horizon(year, month, day):
     yield
   finally:
     _FORWARD_COMPATIBILITY_HORIZON = old_compat_date
-
-
-@tf_export(v1=["enable_v2_behavior"])
-def enable_v2_behavior():
-  """Enables TensorFlow 2.x behaviors.
-
-  This function can be called at the beginning of the program (before `Tensors`,
-  `Graphs` or other structures have been created, and before devices have been
-  initialized. It switches all global behaviors that are different between
-  TensorFlow 1.x and 2.x to behave as intended for 2.x.
-
-  This function is called in the main TensorFlow `__init__.py` file, user should
-  not need to call it, except during complex migrations.
-  """
-  tf2.enable()  # Switches TensorArrayV2 and control flow V2
-  ops.enable_eager_execution()
-  tensor_shape.enable_v2_tensorshape()  # Also switched by tf2
-  variable_scope.enable_resource_variables()
-
-
-@tf_export(v1=["disable_v2_behavior"])
-def disable_v2_behavior():
-  """Enables TensorFlow 2.x behaviors.
-
-  This function can be called at the beginning of the program (before `Tensors`,
-  `Graphs` or other structures have been created, and before devices have been
-  initialized. It switches all global behaviors that are different between
-  TensorFlow 1.x and 2.x to behave as intended for 1.x.
-
-  User can call this function to disable 2.x behavior during complex migrations.
-  """
-  tf2.disable()  # Switches TensorArrayV2 and control flow V2
-  ops.disable_eager_execution()
-  tensor_shape.disable_v2_tensorshape()  # Also switched by tf2
-  variable_scope.disable_resource_variables()
-
-
diff --git a/tensorflow/python/compat/disable_v2_behavior_test.py b/tensorflow/python/compat/disable_v2_behavior_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c247eac395ec3b71c2d1840964cc351b9b78de6d
--- /dev/null
+++ b/tensorflow/python/compat/disable_v2_behavior_test.py
@@ -0,0 +1,39 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for forward and backwards compatibility utilties."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.compat import v2_compat
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import test
+
+
+class DisableV2BehaviorTest(test.TestCase):
+
+  def test_basic(self):
+    t = constant_op.constant([1, 2, 3])  # creates a hidden context
+    self.assertTrue(isinstance(t, ops.EagerTensor))
+    v2_compat.disable_v2_behavior()
+    t = constant_op.constant([1, 2, 3])
+    self.assertFalse(isinstance(t, ops.EagerTensor))
+
+
+if __name__ == '__main__':
+  v2_compat.enable_v2_behavior()
+  test.main()
diff --git a/tensorflow/python/compat/v2_compat.py b/tensorflow/python/compat/v2_compat.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a94939ae11dbf28146ae12ab21d11990dbb2516
--- /dev/null
+++ b/tensorflow/python/compat/v2_compat.py
@@ -0,0 +1,64 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Switching v2 features on and off."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python import tf2
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras.layers import normalization
+from tensorflow.python.ops import variable_scope
+
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export(v1=["enable_v2_behavior"])
+def enable_v2_behavior():
+  """Enables TensorFlow 2.x behaviors.
+
+  This function can be called at the beginning of the program (before `Tensors`,
+  `Graphs` or other structures have been created, and before devices have been
+  initialized. It switches all global behaviors that are different between
+  TensorFlow 1.x and 2.x to behave as intended for 2.x.
+
+  This function is called in the main TensorFlow `__init__.py` file, user should
+  not need to call it, except during complex migrations.
+  """
+  tf2.enable()  # Switches TensorArrayV2 and control flow V2
+  ops.enable_eager_execution()
+  tensor_shape.enable_v2_tensorshape()  # Also switched by tf2
+  variable_scope.enable_resource_variables()
+  normalization.enable_v2_batch_normalization()
+
+
+@tf_export(v1=["disable_v2_behavior"])
+def disable_v2_behavior():
+  """Disables TensorFlow 2.x behaviors.
+
+  This function can be called at the beginning of the program (before `Tensors`,
+  `Graphs` or other structures have been created, and before devices have been
+  initialized. It switches all global behaviors that are different between
+  TensorFlow 1.x and 2.x to behave as intended for 1.x.
+
+  User can call this function to disable 2.x behavior during complex migrations.
+  """
+  tf2.disable()  # Switches TensorArrayV2 and control flow V2
+  ops.disable_eager_execution()
+  tensor_shape.disable_v2_tensorshape()  # Also switched by tf2
+  variable_scope.disable_resource_variables()
+  normalization.disable_v2_batch_normalization()
diff --git a/tensorflow/python/compiler/BUILD b/tensorflow/python/compiler/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..07209a9eca976e049f6e44eef6a75952dd8a1823
--- /dev/null
+++ b/tensorflow/python/compiler/BUILD
@@ -0,0 +1,19 @@
+# Description:
+# Python APIs for various Tensorflow backends.
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "if_not_windows")
+
+exports_files(["LICENSE"])
+
+py_library(
+    name = "compiler",
+    srcs = ["__init__.py"],
+    srcs_version = "PY2AND3",
+    deps = if_not_windows([
+        "//tensorflow/python/compiler/tensorrt:init_py",
+    ]),
+)
diff --git a/tensorflow/python/compiler/__init__.py b/tensorflow/python/compiler/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tensorflow/python/compiler/tensorrt/BUILD b/tensorflow/python/compiler/tensorrt/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..6c90a165cbeacd7f5ca9c00112e354c2d5b7ff20
--- /dev/null
+++ b/tensorflow/python/compiler/tensorrt/BUILD
@@ -0,0 +1,178 @@
+# Description:
+#   Wrap NVIDIA TensorRT (http://developer.nvidia.com/tensorrt) with tensorflow
+#   and provide TensorRT operators and converter package.
+#   APIs are meant to change over time.
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_copts",
+)
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+load("//tensorflow:tensorflow.bzl", "cuda_py_tests")
+load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc")
+load(
+    "@local_config_tensorrt//:build_defs.bzl",
+    "if_tensorrt",
+)
+
+exports_files(glob([
+    "test/testdata/*",
+]))
+
+py_library(
+    name = "init_py",
+    srcs = ["__init__.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":tf_trt_integration_test_base",
+        ":trt_convert_py",
+    ],
+)
+
+py_library(
+    name = "trt_ops_py",
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/compiler/tf2tensorrt:trt_ops",
+        "//tensorflow/compiler/tf2tensorrt:trt_ops_loader",
+    ],
+)
+
+py_library(
+    name = "trt_convert_py",
+    srcs = ["trt_convert.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":trt_ops_py",
+        ":wrap_conversion",
+        "//tensorflow/python:graph_util",
+        "//tensorflow/python:session",
+        "//tensorflow/python:tf_optimizer",
+        "//tensorflow/python/saved_model:builder",
+        "//tensorflow/python/saved_model:loader",
+        "//tensorflow/python/saved_model:tag_constants",
+    ],
+)
+
+# TODO(aaroey): this wrapper has been causing troubles of double linking, so
+# either get rid of it, or split to make it contain minimum dependencies.
+tf_py_wrap_cc(
+    name = "wrap_conversion",
+    srcs = ["trt_conversion.i"],
+    copts = tf_copts(),
+    swig_includes = [
+        "//tensorflow/python:platform/base.i",
+    ],
+    deps = [
+        "//tensorflow/compiler/tf2tensorrt:test_utils",
+        "//tensorflow/compiler/tf2tensorrt:trt_conversion",
+        "//tensorflow/compiler/tf2tensorrt:trt_op_kernels",
+        "//third_party/python_runtime:headers",
+    ],
+)
+
+py_library(
+    name = "tf_trt_integration_test_base",
+    srcs = ["test/tf_trt_integration_test_base.py"],
+    deps = [
+        ":trt_convert_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+    ],
+)
+
+cuda_py_test(
+    name = "trt_convert_test",
+    srcs = ["trt_convert_test.py"],
+    additional_deps = [
+        ":trt_convert_py",
+        ":trt_ops_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:graph_util",
+        "//tensorflow/python/saved_model:builder",
+        "//tensorflow/python/saved_model:loader",
+        "//tensorflow/python/saved_model:signature_constants",
+        "//tensorflow/python/saved_model:signature_def_utils",
+        "//tensorflow/python/saved_model:tag_constants",
+        "//tensorflow/python/saved_model:utils",
+        "//tensorflow/python/tools:freeze_graph_lib",
+        "//tensorflow/python/tools:saved_model_utils",
+    ],
+    tags = [
+        "no_cuda_on_cpu_tap",
+        "no_windows",
+        "nomac",
+    ],
+    xla_enable_strict_auto_jit = True,
+)
+
+cuda_py_tests(
+    name = "tf_trt_integration_test",
+    srcs = [
+        "test/base_test.py",
+        "test/batch_matmul_test.py",
+        "test/biasadd_matmul_test.py",
+        "test/binary_tensor_weight_broadcast_test.py",
+        "test/concatenation_test.py",
+        "test/const_broadcast_test.py",
+        "test/conv2d_test.py",
+        "test/dynamic_input_shapes_test.py",
+        "test/identity_output_test.py",
+        "test/int32_test.py",
+        "test/lru_cache_test.py",
+        "test/memory_alignment_test.py",
+        "test/multi_connection_neighbor_engine_test.py",
+        "test/neighboring_engine_test.py",
+        "test/quantization_test.py",
+        "test/rank_two_test.py",
+        "test/reshape_transpose_test.py",
+        "test/topk_test.py",
+        "test/unary_test.py",
+        "test/vgg_block_nchw_test.py",
+        "test/vgg_block_test.py",
+    ],
+    additional_deps = [
+        ":tf_trt_integration_test_base",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+    ],
+    tags = [
+        "no_cuda_on_cpu_tap",
+        "no_windows",
+        "nomac",
+    ],
+    xla_enable_strict_auto_jit = True,
+)
+
+cuda_py_test(
+    name = "quantization_mnist_test",
+    srcs = ["test/quantization_mnist_test.py"],
+    additional_deps = [
+        ":tf_trt_integration_test_base",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python/keras:keras",
+        "//tensorflow/python/estimator:estimator",
+    ],
+    data = [
+        "test/testdata/checkpoint",
+        "test/testdata/model.ckpt-46900.data-00000-of-00001",
+        "test/testdata/model.ckpt-46900.index",
+    ],
+    tags = [
+        "no_cuda_on_cpu_tap",
+        "no_oss",  # TODO(b/125290478): allow running in at least some OSS configurations.
+        "no_pip",
+        "no_tap",  # It is not able to download the mnist data.
+        "no_windows",
+        "nomac",
+    ],
+    xla_enable_strict_auto_jit = True,
+)
diff --git a/tensorflow/python/compiler/tensorrt/README.md b/tensorflow/python/compiler/tensorrt/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..a9c2ad78a3db409e6e8669c48c4df37c8db19c4b
--- /dev/null
+++ b/tensorflow/python/compiler/tensorrt/README.md
@@ -0,0 +1,58 @@
+# Using TensorRT in TensorFlow (TF-TRT)
+
+This module provides necessary bindings and introduces `TRTEngineOp` operator
+that wraps a subgraph in TensorRT. This module is under active development.
+
+## Installing TF-TRT
+
+Currently TensorFlow nightly builds include TF-TRT by default, which means you
+don't need to install TF-TRT separately. You can pull the latest TF containers
+from docker hub or install the latest TF pip package to get access to the latest
+TF-TRT.
+
+If you want to use TF-TRT on NVIDIA Jetson platform, you can find the download
+links for the relevant TensorFlow pip packages here:
+https://docs.nvidia.com/deeplearning/dgx/index.html#installing-frameworks-for-jetson
+
+## Installing TensorRT
+
+In order to make use of TF-TRT, you will need a local installation of TensorRT.
+Installation instructions for compatibility with TensorFlow are provided on the
+[TensorFlow GPU support](https://www.tensorflow.org/install/gpu) guide.
+
+## Examples
+
+You can find example scripts for running inference on deep learning models in
+this repository: https://github.com/tensorflow/tensorrt
+
+We have used these examples to verify the accuracy and performance of TF-TRT.
+For more information see
+[Verified Models](https://docs.nvidia.com/deeplearning/dgx/integrate-tf-trt/index.html#verified-models).
+
+## Documentation
+
+[TF-TRT documentation](https://docs.nvidia.com/deeplearning/dgx/integrate-tf-trt/index.html)
+gives an overview of the supported functionalities, provides tutorials and
+verified models, explains best practices with troubleshooting guides.
+
+## Tests
+
+TF-TRT includes both Python tests and C++ unit tests. Most of Python tests are
+located in the test directory and they can be executed using `bazel test` or
+directly with the Python command. Most of the C++ unit tests are used to test
+the conversion functions that convert each TF op to a number of TensorRT layers.
+
+## Compilation
+
+In order to compile the module, you need to have a local TensorRT installation
+(libnvinfer.so and respective include files). During the configuration step,
+TensorRT should be enabled and installation path should be set. If installed
+through package managers (deb,rpm), configure script should find the necessary
+components from the system automatically. If installed from tar packages, user
+has to set path to location where the library is installed during configuration.
+
+```shell
+bazel build --config=cuda --config=opt //tensorflow/tools/pip_package:build_pip_package
+bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/
+```
+
diff --git a/tensorflow/contrib/tensorrt/python/ops/trt_engine_op.py b/tensorflow/python/compiler/tensorrt/__init__.py
similarity index 57%
rename from tensorflow/contrib/tensorrt/python/ops/trt_engine_op.py
rename to tensorflow/python/compiler/tensorrt/__init__.py
index 31a313182be9a2fca7457a539670dbc911ccabb1..db3540ba45d8082079a04db9e9de5bf7aa178f93 100644
--- a/tensorflow/contrib/tensorrt/python/ops/trt_engine_op.py
+++ b/tensorflow/python/compiler/tensorrt/__init__.py
@@ -12,23 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # =============================================================================
-"""Exposes the Python wrapper of TRTEngineOp."""
+"""Exposes the python wrapper for TensorRT graph transforms."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import platform
-
-if platform.system() != "Windows":
-  # pylint: disable=wildcard-import,unused-import,g-import-not-at-top
-  from tensorflow.contrib.tensorrt.ops.gen_trt_engine_op import *
-
-  from tensorflow.contrib.util import loader
-  from tensorflow.python.platform import resource_loader
-  # pylint: enable=wildcard-import,unused-import,g-import-not-at-top
-
-  _trt_engine_op = loader.load_op_library(
-      resource_loader.get_path_to_datafile("_trt_engine_op.so"))
-else:
-  raise RuntimeError("Windows platforms are not supported")
+# pylint: disable=unused-import,line-too-long
+from tensorflow.python.compiler.tensorrt.trt_convert import create_inference_graph
+# pylint: enable=unused-import,line-too-long
diff --git a/tensorflow/contrib/tensorrt/test/base_test.py b/tensorflow/python/compiler/tensorrt/test/base_test.py
similarity index 93%
rename from tensorflow/contrib/tensorrt/test/base_test.py
rename to tensorflow/python/compiler/tensorrt/test/base_test.py
index ff317e43e1e6ff1c0b869ae8dc6d1fda8f0ce126..a1199c5040a51bf7db2cebaad1fe1523c3b6f421 100644
--- a/tensorflow/contrib/tensorrt/test/base_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/base_test.py
@@ -20,8 +20,9 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.tensorrt.python import trt_convert
-from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.compiler.tensorrt.wrap_conversion import add_test_value
+from tensorflow.python.compiler.tensorrt.wrap_conversion import clear_test_values
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -68,9 +69,9 @@ class SimpleSingleEngineTest(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
-        input_dims=[input_dims],
+        input_dims=[[input_dims]],
         output_names=[output_name],
-        expected_output_dims=[(100, 6, 6, 6)])
+        expected_output_dims=[[[100, 6, 6, 6]]])
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
@@ -125,9 +126,9 @@ class SimpleMultiEnginesTest(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
-        input_dims=[input_dims],
+        input_dims=[[input_dims]],
         output_names=[output_name],
-        expected_output_dims=[(100, 12, 12, 6)])
+        expected_output_dims=[[[100, 12, 12, 6]]])
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
@@ -154,7 +155,7 @@ class PartiallyConvertedTestA(trt_test.TfTrtIntegrationTestBase):
     """Setup method."""
     super(PartiallyConvertedTestA, self).setUp()
     # Let it fail to build the second engine.
-    trt_convert.add_test_value("TRTEngineOp_1:CreateTRTNode", "fail")
+    add_test_value("TRTEngineOp_1:CreateTRTNode", "fail")
 
   def GetParams(self):
     """Create a graph containing two segment."""
@@ -183,9 +184,9 @@ class PartiallyConvertedTestA(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
-        input_dims=[input_dims],
+        input_dims=[[input_dims]],
         output_names=[output_name],
-        expected_output_dims=[tuple(input_dims)])
+        expected_output_dims=[[input_dims]])
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
@@ -209,8 +210,8 @@ class PartiallyConvertedTestB(PartiallyConvertedTestA):
     """Setup method."""
     super(PartiallyConvertedTestB, self).setUp()
     # Let it fail to build the first engine.
-    trt_convert.clear_test_values("")
-    trt_convert.add_test_value("TRTEngineOp_0:CreateTRTNode", "fail")
+    clear_test_values("")
+    add_test_value("TRTEngineOp_0:CreateTRTNode", "fail")
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
@@ -253,9 +254,9 @@ class ConstInputTest(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
-        input_dims=[input_dims],
+        input_dims=[[input_dims]],
         output_names=[output_name],
-        expected_output_dims=[tuple(input_dims)])
+        expected_output_dims=[[input_dims]])
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
@@ -286,9 +287,9 @@ class ConstDataInputSingleEngineTest(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
-        input_dims=[input_dims],
+        input_dims=[[input_dims]],
         output_names=[output_name],
-        expected_output_dims=[tuple(input_dims)])
+        expected_output_dims=[[input_dims]])
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
@@ -320,9 +321,9 @@ class ConstDataInputMultipleEnginesTest(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
-        input_dims=[input_dims],
+        input_dims=[[input_dims]],
         output_names=[output_name],
-        expected_output_dims=[tuple(input_dims)])
+        expected_output_dims=[[input_dims]])
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
@@ -369,9 +370,9 @@ class ControlDependencyTest(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
-        input_dims=[input_dims],
+        input_dims=[[input_dims]],
         output_names=[output_name],
-        expected_output_dims=[tuple(input_dims)])
+        expected_output_dims=[[input_dims]])
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
diff --git a/tensorflow/contrib/tensorrt/test/batch_matmul_test.py b/tensorflow/python/compiler/tensorrt/test/batch_matmul_test.py
similarity index 75%
rename from tensorflow/contrib/tensorrt/test/batch_matmul_test.py
rename to tensorflow/python/compiler/tensorrt/test/batch_matmul_test.py
index f42308ecb7c8f8a107e78008abd3f470ddc85975..49ad09aea33eae0c750ca47804d647cd54fd26d5 100644
--- a/tensorflow/contrib/tensorrt/test/batch_matmul_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/batch_matmul_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -71,9 +71,9 @@ class BatchMatMulTest(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name, w1_name, w2_name],
-        input_dims=[input_dims, w1_dims, w2_dims],
+        input_dims=[[input_dims, w1_dims, w2_dims]],
         output_names=[output_name],
-        expected_output_dims=[(12, 5, 8, 7)])
+        expected_output_dims=[[[12, 5, 8, 7]]])
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
@@ -86,28 +86,6 @@ class BatchMatMulTest(trt_test.TfTrtIntegrationTestBase):
     """Return the expected engines to run."""
     return ["TRTEngineOp_1"]
 
-  def ShouldRunTest(self, run_params):
-    """Whether to run the test."""
-    # TODO(aaroey): Trt library will fail like:
-    #
-    # ../builder/cudnnBuilder2.cpp:685:
-    # virtual std::vector<nvinfer1::query::Ports<
-    #     nvinfer1::query::TensorRequirements>>
-    # nvinfer1::builder::Node::getSupportedFormats(
-    #     const nvinfer1::query::Ports<nvinfer1::query::AbstractTensor>&,
-    #     const nvinfer1::cudnn::HardwareContext&,
-    #     nvinfer1::builder::Format::Type,
-    #     const nvinfer1::builder::FormatTypeHack&) const:
-    # Assertion `sf' failed.
-    #
-    # To reproduce, run:
-    # bazel test -c opt --copt=-mavx \
-    #   --test_arg=BatchMatMulTest.testTfTrt_ToolConversion_INT8_DynamicEngine \
-    #   tensorflow/contrib/tensorrt:batch_matmul_test
-    #
-    # Investigate and fix it.
-    return not trt_test.IsQuantizationMode(run_params.precision_mode)
-
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py b/tensorflow/python/compiler/tensorrt/test/biasadd_matmul_test.py
similarity index 91%
rename from tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py
rename to tensorflow/python/compiler/tensorrt/test/biasadd_matmul_test.py
index 053b38ff1c0578c58f39dd6dc0630d1401a105af..2b7bbbc960558a5020eca48af855885f4251a748 100644
--- a/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/biasadd_matmul_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -111,9 +111,9 @@ class BiasaddMatMulTest(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
-        input_dims=[input_dims],
+        input_dims=[[input_dims]],
         output_names=[output_name],
-        expected_output_dims=[(4, 6680)])
+        expected_output_dims=[[[4, 6680]]])
 
   def GetConversionParams(self, run_params):
     """Return a ConversionParams for test."""
@@ -130,12 +130,6 @@ class BiasaddMatMulTest(trt_test.TfTrtIntegrationTestBase):
     """Return the expected engines to build."""
     return ["TRTEngineOp_0"]
 
-  def ShouldRunTest(self, run_params):
-    """Whether to run the test."""
-    # TODO(aaroey): Trt 4.0 forbids conversion for tensors with rank <3 in int8
-    # mode, which is a bug. Re-enable this when trt library is fixed.
-    return not trt_test.IsQuantizationMode(run_params.precision_mode)
-
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/tensorrt/test/binary_tensor_weight_broadcast_test.py b/tensorflow/python/compiler/tensorrt/test/binary_tensor_weight_broadcast_test.py
similarity index 93%
rename from tensorflow/contrib/tensorrt/test/binary_tensor_weight_broadcast_test.py
rename to tensorflow/python/compiler/tensorrt/test/binary_tensor_weight_broadcast_test.py
index 169835956c046dd675e967daa05fd81405662e38..7e1d3afdd9388813b2cf030274a3d2bd4a08b994 100644
--- a/tensorflow/contrib/tensorrt/test/binary_tensor_weight_broadcast_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/binary_tensor_weight_broadcast_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -63,9 +63,9 @@ class BinaryTensorWeightBroadcastTest(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
-        input_dims=[input_dims],
+        input_dims=[[input_dims]],
         output_names=[output_name],
-        expected_output_dims=[(5, 23040)])
+        expected_output_dims=[[[5, 23040]]])
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
diff --git a/tensorflow/contrib/tensorrt/test/concatenation_test.py b/tensorflow/python/compiler/tensorrt/test/concatenation_test.py
similarity index 94%
rename from tensorflow/contrib/tensorrt/test/concatenation_test.py
rename to tensorflow/python/compiler/tensorrt/test/concatenation_test.py
index c3576f81d97afe7e0e42cd10413971911e97774c..f30324e7dba2392fb0d2c1a058bf3bc53c8493c6 100644
--- a/tensorflow/contrib/tensorrt/test/concatenation_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/concatenation_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -73,9 +73,9 @@ class ConcatenationTest(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
-        input_dims=[input_dims],
+        input_dims=[[input_dims]],
         output_names=[output_name],
-        expected_output_dims=[(2, 126)])
+        expected_output_dims=[[[2, 126]]])
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
diff --git a/tensorflow/contrib/tensorrt/test/const_broadcast_test.py b/tensorflow/python/compiler/tensorrt/test/const_broadcast_test.py
similarity index 94%
rename from tensorflow/contrib/tensorrt/test/const_broadcast_test.py
rename to tensorflow/python/compiler/tensorrt/test/const_broadcast_test.py
index c1c883312d867b60b88ac14318041f9750ca41e6..2d764665beffa4198c87ef5816f352288310ec4f 100644
--- a/tensorflow/contrib/tensorrt/test/const_broadcast_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/const_broadcast_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -58,9 +58,9 @@ class ConstBroadcastTest(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
-        input_dims=[input_dims],
+        input_dims=[[input_dims]],
         output_names=[output_name],
-        expected_output_dims=[(5, 12, 12, 1)])
+        expected_output_dims=[[[5, 12, 12, 1]]])
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
diff --git a/tensorflow/python/compiler/tensorrt/test/conv2d_test.py b/tensorflow/python/compiler/tensorrt/test/conv2d_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..326cad529740335310a4851cdbbea8b21cdd244e
--- /dev/null
+++ b/tensorflow/python/compiler/tensorrt/test/conv2d_test.py
@@ -0,0 +1,233 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model script to test TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_nn_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.platform import test
+
+
+def conv2d_layer(inputs,
+                 filters,
+                 kernel_size,
+                 strides=(1, 1),
+                 padding="valid",
+                 data_format="channels_last",
+                 dilation_rate=(1, 1),
+                 name=None):
+  dtype = inputs.dtype
+  c_axis = -1 if data_format == "channels_last" else 1
+  nchan = inputs.shape[c_axis]
+  weights_shape = (kernel_size[0], kernel_size[1], nchan, filters)
+  weights = constant_op.constant(np.random.randn(*weights_shape), dtype=dtype)
+  padding = padding.upper()
+  if data_format == "channels_last":
+    strides = [1] + list(strides) + [1]
+    dilations = [1] + list(dilation_rate) + [1]
+    data_format = "NHWC"
+  else:
+    strides = [1, 1] + list(strides)
+    dilations = [1, 1] + list(dilation_rate)
+    data_format = "NCHW"
+  return gen_nn_ops.conv2d(
+      inputs,
+      weights,
+      strides=strides,
+      padding=padding,
+      dilations=dilations,
+      data_format=data_format)
+
+
+def div_round_up(n, d):
+  return (n - 1) // d + 1
+
+
+def build_graph(input_dims,
+                dtype,
+                num_filters,
+                data_format,
+                kernel_sizes,
+                dilation_rates,
+                padding="same"):
+  g = ops.Graph()
+  with g.as_default():
+    inp = array_ops.placeholder(
+        dtype=dtype, shape=[None] + input_dims[1:], name="input")
+    with g.device("/GPU:0"):
+      results = []
+      for kernel_size in kernel_sizes:
+        for dilation_rate in dilation_rates:
+          result = conv2d_layer(inp, num_filters, kernel_size, (1, 1), padding,
+                                data_format, dilation_rate)
+          results.append(result)
+      output = sum(results)
+      output = array_ops.identity(output, name="output")
+  return g
+
+
+class Conv2DNCHWTest(trt_test.TfTrtIntegrationTestBase):
+
+  def GetParams(self):
+    """Testing conversion of Conv2D (data_format=NCHW) in TF-TRT conversion."""
+    np.random.seed(1234)
+    input_dims = [13, 3, 7, 11]
+    g = build_graph(
+        input_dims=input_dims,
+        dtype=dtypes.float32,
+        num_filters=5,
+        data_format="channels_first",
+        kernel_sizes=[(3, 3), (3, 2)],
+        dilation_rates=[(1, 1), (2, 3)])
+    return trt_test.TfTrtIntegrationTestParams(
+        gdef=g.as_graph_def(),
+        input_names=["input"],
+        input_dims=[[input_dims]],
+        output_names=["output"],
+        expected_output_dims=[[[13, 5, 7, 11]]])
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return ["TRTEngineOp_0"]
+
+
+class Conv2DNHWCTest(trt_test.TfTrtIntegrationTestBase):
+
+  def GetParams(self):
+    """Testing conversion of Conv2D (data_format=NCHW) in TF-TRT conversion."""
+    np.random.seed(1234)
+    input_dims = [13, 7, 11, 3]
+    g = build_graph(
+        input_dims=input_dims,
+        dtype=dtypes.float32,
+        num_filters=5,
+        data_format="channels_last",
+        kernel_sizes=[(3, 3), (3, 2)],
+        dilation_rates=[(1, 1), (2, 3)])
+    return trt_test.TfTrtIntegrationTestParams(
+        gdef=g.as_graph_def(),
+        input_names=["input"],
+        input_dims=[[input_dims]],
+        output_names=["output"],
+        expected_output_dims=[[[13, 7, 11, 5]]])
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return ["TRTEngineOp_0"]
+
+
+class Conv2DStridedNCHWTest(trt_test.TfTrtIntegrationTestBase):
+
+  def GetParams(self):
+    """Testing conversion of strided Conv2D (data_format=NCHW) in TF-TRT
+
+    conversion.
+    """
+    np.random.seed(1234)
+    dtype = dtypes.float32
+    input_name = "input"
+    n, c, h, w = 13, 3, 7, 11
+    num_filters = 5
+    input_dims = [n, c, h, w]
+    output_name = "output"
+    g = ops.Graph()
+    with g.as_default():
+      inp = array_ops.placeholder(
+          dtype=dtype, shape=[None] + input_dims[1:], name=input_name)
+      with g.device("/GPU:0"):
+        output = inp
+        output = conv2d_layer(
+            output,
+            num_filters, (3, 2),
+            strides=(2, 2),
+            padding="same",
+            data_format="channels_first")
+        h = div_round_up(h, 2)
+        w = div_round_up(w, 2)
+        output = conv2d_layer(
+            output,
+            num_filters, (3, 3),
+            strides=(2, 2),
+            dilation_rate=(2, 3),
+            padding="same",
+            data_format="channels_first")
+        h = div_round_up(h, 2)
+        w = div_round_up(w, 2)
+        output = array_ops.identity(output, name=output_name)
+    return trt_test.TfTrtIntegrationTestParams(
+        gdef=g.as_graph_def(),
+        input_names=[input_name],
+        input_dims=[[input_dims]],
+        output_names=[output_name],
+        expected_output_dims=[[[n, num_filters, h, w]]])
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return ["TRTEngineOp_0"]
+
+
+class Conv2DTranposeTest(trt_test.TfTrtIntegrationTestBase):
+
+  def GetParams(self):
+    """Testing conversion of conv2d_transpose (AKA Conv2DBackpropInput)"""
+    np.random.seed(1234)
+    dtype = dtypes.float32
+    input_name = "input"
+    n, c, h, w = 13, 3, 7, 11
+    num_filters = 8
+    input_dims = [n, c, h, w]
+    output_name = "output"
+    g = ops.Graph()
+    with g.as_default():
+      inp = array_ops.placeholder(
+          dtype=dtype, shape=[None] + input_dims[1:], name=input_name)
+      with g.device("/GPU:0"):
+        weights_shape = [2, 2, num_filters, c]
+        weights = constant_op.constant(
+            np.random.randn(*weights_shape), dtype=dtype)
+        output_shape = constant_op.constant([n, num_filters, h * 2, w * 2],
+                                            dtype=dtypes.int32)
+        output = nn_ops.conv2d_transpose(
+            inp,
+            weights,
+            output_shape,
+            strides=[1, 1, 2, 2],
+            padding="SAME",
+            data_format="NCHW")
+        output = array_ops.identity(output, name=output_name)
+    return trt_test.TfTrtIntegrationTestParams(
+        gdef=g.as_graph_def(),
+        input_names=[input_name],
+        input_dims=[[input_dims]],
+        output_names=[output_name],
+        expected_output_dims=[[[n, num_filters, h * 2, w * 2]]])
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return ["TRTEngineOp_0"]
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/compiler/tensorrt/test/dynamic_input_shapes_test.py b/tensorflow/python/compiler/tensorrt/test/dynamic_input_shapes_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb358d4f9bd91ddd1d45c5e7555652c5c2bca157
--- /dev/null
+++ b/tensorflow/python/compiler/tensorrt/test/dynamic_input_shapes_test.py
@@ -0,0 +1,107 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Script to test TF-TRT INT8 conversion without calibration on Mnist model."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.platform import test
+
+
+class DynamicInputShapesTest(trt_test.TfTrtIntegrationTestBase):
+
+  def GetParams(self):
+    # TODO(laigd): we should test the following cases:
+    # - batch size is not changed, other dims are changing
+    # - batch size is decreasing, other dims are identical
+    # - batch size is decreasing, other dims are changing
+    # - batch size is increasing, other dims are identical
+    # - batch size is increasing, other dims are changing
+    input_dims = [[[1, 5, 5, 1]], [[10, 5, 5, 1]], [[3, 5, 5, 1]],
+                  [[1, 5, 5, 1]], [[1, 3, 1, 1]], [[2, 9, 9, 1]],
+                  [[1, 224, 224, 1]], [[1, 128, 224, 1]]]
+    expected_output_dims = input_dims
+
+    g = ops.Graph()
+    with g.as_default():
+      x = array_ops.placeholder(
+          shape=(None, None, None, 1), dtype=dtypes.float32, name="input")
+      conv_filter1 = constant_op.constant(
+          np.ones([3, 3, 1, 8]), name="weights1", dtype=dtypes.float32)
+      bias1 = constant_op.constant(np.random.randn(8), dtype=dtypes.float32)
+      x = nn.conv2d(
+          input=x,
+          filter=conv_filter1,
+          strides=[1, 1, 1, 1],
+          padding="SAME",
+          name="conv")
+      x = nn.bias_add(x, bias1)
+      x = nn.relu(x)
+      conv_filter2 = constant_op.constant(
+          np.ones([3, 3, 8, 1]), name="weights2", dtype=dtypes.float32)
+      bias2 = constant_op.constant(np.random.randn(1), dtype=dtypes.float32)
+      x = nn.conv2d(
+          input=x,
+          filter=conv_filter2,
+          strides=[1, 1, 1, 1],
+          padding="SAME",
+          name="conv")
+      x = nn.bias_add(x, bias2)
+      x = array_ops.identity(x, name="output")
+
+    return trt_test.TfTrtIntegrationTestParams(
+        gdef=g.as_graph_def(),
+        input_names=["input"],
+        input_dims=input_dims,
+        output_names=["output"],
+        expected_output_dims=expected_output_dims)
+
+  def GetConversionParams(self, run_params):
+    """Return a ConversionParams for test."""
+    conversion_params = super(DynamicInputShapesTest,
+                              self).GetConversionParams(run_params)
+    return conversion_params._replace(
+        maximum_cached_engines=10,
+        # Disable layout optimizer, since it will convert BiasAdd with NHWC
+        # format to NCHW format under four dimentional input.
+        rewriter_config=trt_test.OptimizerDisabledRewriterConfig())
+
+  def ExpectedEnginesToBuild(self, run_params):
+    return ["TRTEngineOp_0"]
+
+  def ShouldRunTest(self, run_params):
+    return (run_params.dynamic_engine and
+            not trt_test.IsQuantizationMode(run_params.precision_mode))
+
+  def ExpectedAbsoluteTolerance(self, run_params):
+    """The absolute tolerance to compare floating point results."""
+    return 1.e-03 if run_params.precision_mode == "FP32" else 1.e-01
+
+  def ExpectedRelativeTolerance(self, run_params):
+    """The relative tolerance to compare floating point results."""
+    return 1.e-03 if run_params.precision_mode == "FP32" else 1.e-01
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/compiler/tensorrt/test/identity_output_test.py b/tensorflow/python/compiler/tensorrt/test/identity_output_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..23a72c5b0b75994fb0662dd22618b95c02cfff55
--- /dev/null
+++ b/tensorflow/python/compiler/tensorrt/test/identity_output_test.py
@@ -0,0 +1,74 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""This test checks a situation where the same tensor is considered as an output
+
+multiple times because it has been duplicated by 2+ indentity ops. Previously,
+the tensor would be renamed multiple times, overwriting the output binding name
+which resulted in a runtime error when the binding would not be found.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class IdentityTest(trt_test.TfTrtIntegrationTestBase):
+
+  def _ConstOp(self, shape):
+    return constant_op.constant(np.random.randn(*shape), dtype=dtypes.float32)
+
+  def GetParams(self):
+    """Testing engine with the same tensor repeated as output via identity."""
+    input_name = 'input'
+    input_dims = [100, 32]
+    g = ops.Graph()
+    with g.as_default():
+      x = array_ops.placeholder(
+          dtype=dtypes.float32, shape=input_dims, name=input_name)
+
+      b = self._ConstOp((32, 4))
+      x1 = math_ops.matmul(x, b)
+      b = self._ConstOp((1, 4))
+      x1 = x1 + b
+
+      out1 = array_ops.identity(x1, name='output1')
+      out2 = array_ops.identity(x1, name='output2')
+      iden1 = array_ops.identity(x1)
+      out3 = array_ops.identity(iden1, name='output3')
+
+    return trt_test.TfTrtIntegrationTestParams(
+        gdef=g.as_graph_def(),
+        input_names=[input_name],
+        input_dims=[[input_dims]],
+        output_names=['output1', 'output2', 'output3'],
+        expected_output_dims=[[[100, 4]] * 3])
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return ['TRTEngineOp_0']
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/compiler/tensorrt/test/int32_test.py b/tensorflow/python/compiler/tensorrt/test/int32_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d4446940aadf252b3d81f9978a374e4aedbb247
--- /dev/null
+++ b/tensorflow/python/compiler/tensorrt/test/int32_test.py
@@ -0,0 +1,82 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test conversion of graphs involving INT32 tensors and operations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.platform import test
+
+
+class ExcludeUnsupportedInt32Test(trt_test.TfTrtIntegrationTestBase):
+
+  def _ConstOp(self, shape, dtype):
+    return constant_op.constant(np.random.randn(*shape), dtype=dtype)
+
+  def GetParams(self):
+    """Test exclusion of ops which are not supported in INT32 mode by TF-TRT"""
+    input_name = 'input'
+    output_name = 'output'
+    input_dims = [100, 4]
+    dtype = dtypes.int32
+    g = ops.Graph()
+    with g.as_default():
+      x = array_ops.placeholder(dtype=dtype, shape=input_dims, name=input_name)
+      b = self._ConstOp((4, 10), dtype)
+      x = math_ops.matmul(x, b)
+      b = self._ConstOp((10,), dtype)
+      x = nn.bias_add(x, b)
+      x = array_ops.identity(x, name=output_name)
+    return trt_test.TfTrtIntegrationTestParams(
+        gdef=g.as_graph_def(),
+        input_names=[input_name],
+        input_dims=[[input_dims]],
+        output_names=[output_name],
+        expected_output_dims=[[[100, 10]]])
+
+  def GetConversionParams(self, run_params):
+    """Return a ConversionParams for test."""
+    conversion_params = super(ExcludeUnsupportedInt32Test,
+                              self).GetConversionParams(run_params)
+    return conversion_params._replace(
+        max_batch_size=100,
+        maximum_cached_engines=1,
+        # Disable layout optimizer, since it will convert BiasAdd with NHWC
+        # format to NCHW format under four dimentional input.
+        rewriter_config=trt_test.OptimizerDisabledRewriterConfig())
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return []
+
+  def ShouldRunTest(self, run_params):
+    """Whether to run the test."""
+    # TODO(aaroey): Trt 4.0 forbids conversion for tensors with rank <3 in int8
+    # mode, which is a bug. Re-enable this when trt library is fixed.
+    return not trt_test.IsQuantizationMode(run_params.precision_mode)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/compiler/tensorrt/test/lru_cache_test.py b/tensorflow/python/compiler/tensorrt/test/lru_cache_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..18e6d32dfe5fde6ff6f70522df599d3a8fb142a5
--- /dev/null
+++ b/tensorflow/python/compiler/tensorrt/test/lru_cache_test.py
@@ -0,0 +1,78 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test LRUCache by running different input batch sizes on same network."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.platform import test
+
+
+class LRUCacheTest(trt_test.TfTrtIntegrationTestBase):
+
+  def GetParams(self):
+    dtype = dtypes.float32
+    input_name = "input"
+    input_dims = [[[1, 10, 10, 2]], [[2, 10, 10, 2]], [[4, 10, 10, 2]],
+                  [[2, 10, 10, 2]]]
+    expected_output_dims = [[[1, 10, 10, 1]], [[2, 10, 10, 1]], [[4, 10, 10,
+                                                                  1]],
+                            [[2, 10, 10, 1]]]
+    output_name = "output"
+    g = ops.Graph()
+    with g.as_default():
+      x = array_ops.placeholder(
+          dtype=dtype, shape=[None, 10, 10, 2], name=input_name)
+      conv_filter = constant_op.constant(
+          np.random.randn(3, 3, 2, 1), dtype=dtypes.float32)
+      x = nn.conv2d(
+          input=x,
+          filter=conv_filter,
+          strides=[1, 1, 1, 1],
+          padding="SAME",
+          name="conv")
+      bias = constant_op.constant(
+          np.random.randn(1, 10, 10, 1), dtype=dtypes.float32)
+      x = math_ops.add(x, bias)
+      x = nn.relu(x)
+      x = array_ops.identity(x, name="output")
+    return trt_test.TfTrtIntegrationTestParams(
+        gdef=g.as_graph_def(),
+        input_names=[input_name],
+        input_dims=input_dims,
+        output_names=[output_name],
+        expected_output_dims=expected_output_dims)
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return ["TRTEngineOp_0"]
+
+  def ShouldRunTest(self, run_params):
+    return (run_params.dynamic_engine and
+            not trt_test.IsQuantizationMode(run_params.precision_mode))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/tensorrt/test/memory_alignment_test.py b/tensorflow/python/compiler/tensorrt/test/memory_alignment_test.py
similarity index 94%
rename from tensorflow/contrib/tensorrt/test/memory_alignment_test.py
rename to tensorflow/python/compiler/tensorrt/test/memory_alignment_test.py
index 104bac43a0b1166dcddee9920991582f33e93316..89625aa629b1aa824bd95bfbf31e93174294faa8 100644
--- a/tensorflow/contrib/tensorrt/test/memory_alignment_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/memory_alignment_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -62,9 +62,9 @@ class MemoryAlignmentTest(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
-        input_dims=[input_dims],
+        input_dims=[[input_dims]],
         output_names=[output_name],
-        expected_output_dims=[(2, 15, 15, 10)])
+        expected_output_dims=[[[2, 15, 15, 10]]])
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
diff --git a/tensorflow/contrib/tensorrt/test/multi_connection_neighbor_engine_test.py b/tensorflow/python/compiler/tensorrt/test/multi_connection_neighbor_engine_test.py
similarity index 94%
rename from tensorflow/contrib/tensorrt/test/multi_connection_neighbor_engine_test.py
rename to tensorflow/python/compiler/tensorrt/test/multi_connection_neighbor_engine_test.py
index 293f93d8a78bc8ab06002d6fc01cb8d6a0738698..d04c6958fbc466faba7c4de4be53710aedc8b3b2 100644
--- a/tensorflow/contrib/tensorrt/test/multi_connection_neighbor_engine_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/multi_connection_neighbor_engine_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -75,9 +75,9 @@ class MultiConnectionNeighborEngineTest(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
-        input_dims=[input_dims],
+        input_dims=[[input_dims]],
         output_names=[output_name],
-        expected_output_dims=[(2, 4, 5, 4)])
+        expected_output_dims=[[[2, 4, 5, 4]]])
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
diff --git a/tensorflow/contrib/tensorrt/test/neighboring_engine_test.py b/tensorflow/python/compiler/tensorrt/test/neighboring_engine_test.py
similarity index 93%
rename from tensorflow/contrib/tensorrt/test/neighboring_engine_test.py
rename to tensorflow/python/compiler/tensorrt/test/neighboring_engine_test.py
index 3e1e4b088ba200db2184dd64092cbc642a17cb3a..1f7189f0eb2cd452882d79b1371e7e6baa9b629a 100644
--- a/tensorflow/contrib/tensorrt/test/neighboring_engine_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/neighboring_engine_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -59,9 +59,9 @@ class NeighboringEngineTest(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
-        input_dims=[input_dims],
+        input_dims=[[input_dims]],
         output_names=[output_name],
-        expected_output_dims=[(2, 4, 5, 4)])
+        expected_output_dims=[[[2, 4, 5, 4]]])
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
diff --git a/tensorflow/contrib/tensorrt/test/quantization_mnist_test.py b/tensorflow/python/compiler/tensorrt/test/quantization_mnist_test.py
similarity index 90%
rename from tensorflow/contrib/tensorrt/test/quantization_mnist_test.py
rename to tensorflow/python/compiler/tensorrt/test/quantization_mnist_test.py
index 31cbef89e23949ba5ceaab34e0f683fd906bf0ce..cdd25e3b981bb7944ece564ee4a9fa0d976fba66 100644
--- a/tensorflow/contrib/tensorrt/test/quantization_mnist_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/quantization_mnist_test.py
@@ -18,13 +18,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.tensorrt.python import trt_convert
-# pylint: disable=unused-import
-from tensorflow.contrib.tensorrt.python.ops import trt_engine_op
-# pylint: enable=unused-import
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python import data
 from tensorflow.python import keras
+from tensorflow.python.compiler.tensorrt import trt_convert
+from tensorflow.python.compiler.tensorrt.wrap_conversion import get_linked_tensorrt_version
+from tensorflow.python.compiler.tensorrt.wrap_conversion import is_tensorrt_enabled
 from tensorflow.python.estimator.estimator import Estimator
 from tensorflow.python.estimator.model_fn import EstimatorSpec
 from tensorflow.python.estimator.model_fn import ModeKeys
@@ -139,15 +138,19 @@ class QuantizationAwareTrainingMNISTTest(test_util.TensorFlowTestCase):
     if use_trt:
       logging.info('Number of nodes before TF-TRT conversion: %d',
                    len(graph_def.node))
-      graph_def = trt_convert.create_inference_graph(
-          graph_def,
-          outputs=[OUTPUT_NODE_NAME],
+      converter = trt_convert.TrtGraphConverter(
+          input_graph_def=graph_def,
+          nodes_blacklist=[OUTPUT_NODE_NAME],
           max_batch_size=max_batch_size,
           precision_mode='INT8',
-          max_workspace_size_bytes=4096 << 19,
+          # There is a 2GB GPU memory limit for each test, so we set
+          # max_workspace_size_bytes to 256MB to leave enough room for TF
+          # runtime to allocate GPU memory.
+          max_workspace_size_bytes=1 << 28,
           minimum_segment_size=2,
           use_calibration=False,
       )
+      graph_def = converter.convert()
       logging.info('Number of nodes after TF-TRT conversion: %d',
                    len(graph_def.node))
       num_engines = len(
@@ -191,7 +194,7 @@ class QuantizationAwareTrainingMNISTTest(test_util.TensorFlowTestCase):
               batch_size=batch_size,
               num_parallel_calls=8))
       dataset = dataset.repeat(count=1)
-      iterator = data.make_one_shot_iterator(dataset)
+      iterator = dataset.make_one_shot_iterator()
       features, labels = iterator.get_next()
       return features, labels
 
@@ -205,7 +208,7 @@ class QuantizationAwareTrainingMNISTTest(test_util.TensorFlowTestCase):
               batch_size=batch_size,
               num_parallel_calls=8))
       dataset = dataset.repeat(count=num_epochs)
-      iterator = data.make_one_shot_iterator(dataset)
+      iterator = dataset.make_one_shot_iterator()
       features, labels = iterator.get_next()
       return features, labels
 
@@ -260,9 +263,9 @@ class QuantizationAwareTrainingMNISTTest(test_util.TensorFlowTestCase):
   #     num_epochs=100,
   #     model_dir=model_dir)
   def testEval(self):
-    if not trt_convert.is_tensorrt_enabled():
+    if not is_tensorrt_enabled():
       return
-    model_dir = test.test_src_dir_path('contrib/tensorrt/test/testdata')
+    model_dir = test.test_src_dir_path('python/compiler/tensorrt/test/testdata')
 
     accuracy_tf_native = self._Run(
         is_training=False,
@@ -271,9 +274,9 @@ class QuantizationAwareTrainingMNISTTest(test_util.TensorFlowTestCase):
         num_epochs=None,
         model_dir=model_dir)['accuracy']
     logging.info('accuracy_tf_native: %f', accuracy_tf_native)
-    self.assertAllClose(accuracy_tf_native, 0.9662)
+    self.assertAllClose(0.9662, accuracy_tf_native, rtol=3e-3, atol=3e-3)
 
-    if trt_convert.get_linked_tensorrt_version()[0] < 5:
+    if get_linked_tensorrt_version()[0] < 5:
       return
 
     accuracy_tf_trt = self._Run(
@@ -283,7 +286,7 @@ class QuantizationAwareTrainingMNISTTest(test_util.TensorFlowTestCase):
         num_epochs=None,
         model_dir=model_dir)['accuracy']
     logging.info('accuracy_tf_trt: %f', accuracy_tf_trt)
-    self.assertAllClose(accuracy_tf_trt, 0.9677)
+    self.assertAllClose(0.9675, accuracy_tf_trt, rtol=1e-3, atol=1e-3)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/tensorrt/test/quantization_test.py b/tensorflow/python/compiler/tensorrt/test/quantization_test.py
similarity index 93%
rename from tensorflow/contrib/tensorrt/test/quantization_test.py
rename to tensorflow/python/compiler/tensorrt/test/quantization_test.py
index e425a3674635650d7292ab072178e98932e6b824..3e1c9ff8ddc70469ba3516111b9d3821f1bbb6bc 100644
--- a/tensorflow/contrib/tensorrt/test/quantization_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/quantization_test.py
@@ -20,8 +20,8 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.tensorrt.python import trt_convert
-from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.compiler.tensorrt.wrap_conversion import get_linked_tensorrt_version
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -60,9 +60,9 @@ def _GetParams(add_quantization_nodes, dtype=dtypes.float32):
   return trt_test.TfTrtIntegrationTestParams(
       gdef=g.as_graph_def(),
       input_names=[input_name],
-      input_dims=[input_dims],
+      input_dims=[[input_dims]],
       output_names=[output_name],
-      expected_output_dims=[(8, 1)])
+      expected_output_dims=[[[8, 1]]])
 
 
 class QuantizationMissingAllRangesTest(trt_test.TfTrtIntegrationTestBase):
@@ -72,7 +72,7 @@ class QuantizationMissingAllRangesTest(trt_test.TfTrtIntegrationTestBase):
     return _GetParams(add_quantization_nodes=False)
 
   def ShouldRunTest(self, run_params):
-    if trt_convert.get_linked_tensorrt_version()[0] < 5:
+    if get_linked_tensorrt_version()[0] < 5:
       return False
     # Only test static engine mode, with or without calibration.
     return (trt_test.IsQuantizationMode(run_params.precision_mode) and
@@ -96,7 +96,7 @@ class QuantizationWithRangesTest(trt_test.TfTrtIntegrationTestBase):
     return _GetParams(add_quantization_nodes=True)
 
   def ShouldRunTest(self, run_params):
-    if trt_convert.get_linked_tensorrt_version()[0] < 5:
+    if get_linked_tensorrt_version()[0] < 5:
       return False
     # Test static/dynamic engine with/without calibration.
     return (trt_test.IsQuantizationMode(run_params.precision_mode) and
diff --git a/tensorflow/contrib/tensorrt/test/rank_two_test.py b/tensorflow/python/compiler/tensorrt/test/rank_two_test.py
similarity index 87%
rename from tensorflow/contrib/tensorrt/test/rank_two_test.py
rename to tensorflow/python/compiler/tensorrt/test/rank_two_test.py
index 563232fc12675d9e1b32b7ab461591af57beadb9..a951638b5055b66255bc93291ae906220590e64a 100644
--- a/tensorflow/contrib/tensorrt/test/rank_two_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/rank_two_test.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -63,9 +63,9 @@ class RankTwoTest(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=input_names,
-        input_dims=input_dims,
+        input_dims=[input_dims],
         output_names=[output_name],
-        expected_output_dims=[tuple(input_dims[1])])
+        expected_output_dims=[[input_dims[1]]])
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
@@ -80,12 +80,6 @@ class RankTwoTest(trt_test.TfTrtIntegrationTestBase):
         ],
     }
 
-  def ShouldRunTest(self, run_params):
-    """Whether to run the test."""
-    # TODO(aaroey): Trt 4.0 forbids conversion for tensors with rank <3 in int8
-    # mode, which is a bug. Re-enable this when trt library is fixed.
-    return not trt_test.IsQuantizationMode(run_params.precision_mode)
-
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/tensorrt/test/reshape_transpose_test.py b/tensorflow/python/compiler/tensorrt/test/reshape_transpose_test.py
similarity index 95%
rename from tensorflow/contrib/tensorrt/test/reshape_transpose_test.py
rename to tensorflow/python/compiler/tensorrt/test/reshape_transpose_test.py
index 207944468ab0b038abfe01f0096d7dc220d064ed..423d70f2e4ed7a6728bc3f77a8d598566c209d41 100644
--- a/tensorflow/contrib/tensorrt/test/reshape_transpose_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/reshape_transpose_test.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -72,9 +72,9 @@ class ReshapeTest(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
-        input_dims=[input_dims],
+        input_dims=[[input_dims]],
         output_names=[output_name],
-        expected_output_dims=[tuple(input_dims)])
+        expected_output_dims=[[input_dims]])
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
@@ -129,9 +129,9 @@ class TransposeTest(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
-        input_dims=[input_dims],
+        input_dims=[[input_dims]],
         output_names=[output_name],
-        expected_output_dims=[(24, 100, 2, 24)])
+        expected_output_dims=[[[24, 100, 2, 24]]])
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
diff --git a/tensorflow/contrib/tensorrt/test/testdata/checkpoint b/tensorflow/python/compiler/tensorrt/test/testdata/checkpoint
similarity index 100%
rename from tensorflow/contrib/tensorrt/test/testdata/checkpoint
rename to tensorflow/python/compiler/tensorrt/test/testdata/checkpoint
diff --git a/tensorflow/contrib/tensorrt/test/testdata/model.ckpt-46900.data-00000-of-00001 b/tensorflow/python/compiler/tensorrt/test/testdata/model.ckpt-46900.data-00000-of-00001
similarity index 100%
rename from tensorflow/contrib/tensorrt/test/testdata/model.ckpt-46900.data-00000-of-00001
rename to tensorflow/python/compiler/tensorrt/test/testdata/model.ckpt-46900.data-00000-of-00001
diff --git a/tensorflow/contrib/tensorrt/test/testdata/model.ckpt-46900.index b/tensorflow/python/compiler/tensorrt/test/testdata/model.ckpt-46900.index
similarity index 100%
rename from tensorflow/contrib/tensorrt/test/testdata/model.ckpt-46900.index
rename to tensorflow/python/compiler/tensorrt/test/testdata/model.ckpt-46900.index
diff --git a/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py b/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py
similarity index 71%
rename from tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py
rename to tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py
index 495a9391a1e818a6078988161c9bf72f6143737f..3a14a1cf6d56df34364a0678a264f61c1bb25f2c 100644
--- a/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py
+++ b/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py
@@ -25,12 +25,13 @@ import warnings
 import numpy as np
 import six
 
-from tensorflow.contrib.tensorrt.python import trt_convert
-# pylint: disable=unused-import
-from tensorflow.contrib.tensorrt.python.ops import trt_engine_op
-# pylint: enable=unused-import
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.python.compiler.tensorrt import trt_convert
+from tensorflow.python.compiler.tensorrt.wrap_conversion import clear_test_values
+from tensorflow.python.compiler.tensorrt.wrap_conversion import enable_test_value
+from tensorflow.python.compiler.tensorrt.wrap_conversion import get_test_value
+from tensorflow.python.compiler.tensorrt.wrap_conversion import is_tensorrt_enabled
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import graph_io
 from tensorflow.python.framework import importer
@@ -39,9 +40,19 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import tf_logging as logging
 
-TfTrtIntegrationTestParams = namedtuple("TfTrtIntegrationTestParams", [
-    "gdef", "input_names", "input_dims", "output_names", "expected_output_dims"
-])
+TfTrtIntegrationTestParams = namedtuple(
+    "TfTrtIntegrationTestParams",
+    [
+        "gdef",
+        # A list of names of the input placeholder nodes.
+        "input_names",
+        # A list of list of output shapes of the input placeholder nodes.
+        "input_dims",
+        # A list of names of the output identity nodes.
+        "output_names",
+        # A list of list of expected output shapes of the output identity nodes.
+        "expected_output_dims"
+    ])
 
 RunParams = namedtuple("RunParams", [
     "use_optimizer", "precision_mode", "dynamic_engine", "test_name",
@@ -51,7 +62,7 @@ RunParams = namedtuple("RunParams", [
 ConversionParams = namedtuple("ConversionParams", [
     "max_batch_size", "max_workspace_size_bytes", "precision_mode",
     "minimum_segment_size", "is_dynamic_op", "maximum_cached_engines",
-    "cached_engine_batch_sizes", "rewriter_config", "use_calibration"
+    "cached_engine_batches", "rewriter_config", "use_calibration"
 ])
 
 PRECISION_MODES = ["FP32", "FP16", "INT8"]
@@ -100,7 +111,7 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
 
   @property
   def trt_incompatible_op(self):
-    return math_ops.sin
+    return math_ops.erf
 
   @property
   def precision_modes(self):
@@ -141,7 +152,7 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
   def setUpClass(cls):
     """Setup method for the module."""
     super(TfTrtIntegrationTestBase, cls).setUpClass()
-    trt_convert.enable_test_value()
+    enable_test_value()
 
   def __init__(self, methodName="runTest"):  # pylint: disable=invalid-name
     super(TfTrtIntegrationTestBase, self).__init__(methodName)
@@ -151,7 +162,7 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
     """Setup method."""
     super(TfTrtIntegrationTestBase, self).setUp()
     warnings.simplefilter("always")
-    trt_convert.clear_test_values("")
+    clear_test_values("")
 
   def GetParams(self):
     """Return a TfTrtIntegrationTestParams for test, implemented by subclass."""
@@ -159,16 +170,24 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
 
   def GetConversionParams(self, run_params):
     """Return a ConversionParams for test."""
+    batch_list = []
+    for dims_list in self._GetParamsCached().input_dims:
+      assert dims_list
+      # Each list of shapes should have same batch size.
+      input_batches = [dims[0] for dims in dims_list]
+      assert max(input_batches) == min(input_batches)
+      batch_list.append(input_batches[0])
     return ConversionParams(
-        max_batch_size=max([
-            dims[0] for dims in self._GetParamsCached().input_dims if len(dims)
-        ]),
+        # We use the minimum of all the batch sizes, so when multiple different
+        # input shapes are provided it'll always create new engines in the
+        # cache, and we can therefore test the cache behavior.
+        max_batch_size=min(batch_list),
         max_workspace_size_bytes=1 << 25,
         precision_mode=run_params.precision_mode,
         minimum_segment_size=2,
         is_dynamic_op=run_params.dynamic_engine,
         maximum_cached_engines=1,
-        cached_engine_batch_sizes=None,
+        cached_engine_batches=None,
         rewriter_config=None,
         use_calibration=run_params.use_calibration)
 
@@ -228,9 +247,9 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
   def _PrepareRun(self, graph_state):
     """Set up necessary testing environment before calling sess.run()."""
     # Clear test values added by TRTEngineOp.
-    trt_convert.clear_test_values("TRTEngineOp_.*:ExecuteTrtEngine")
-    trt_convert.clear_test_values("TRTEngineOp_.*:ExecuteCalibration")
-    trt_convert.clear_test_values("TRTEngineOp_.*:ExecuteNativeSegment")
+    clear_test_values("TRTEngineOp_.*:ExecuteTrtEngine")
+    clear_test_values("TRTEngineOp_.*:ExecuteCalibration")
+    clear_test_values("TRTEngineOp_.*:ExecuteNativeSegment")
 
   def _GetGPUOptions(self):
     gpu_options = config_pb2.GPUOptions()
@@ -239,21 +258,24 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
 
   def _GetConfigProto(self, run_params, graph_state):
     """Get config proto based on specific settings."""
-    if graph_state != GraphState.ORIGINAL and run_params.use_optimizer:
-      conversion_params = self.GetConversionParams(run_params)
-      rewriter_cfg = trt_convert.get_tensorrt_rewriter_config(
+    conversion_params = self.GetConversionParams(run_params)
+    if graph_state == GraphState.INFERENCE and run_params.use_optimizer:
+      rewriter_cfg = trt_convert.TrtGraphConverter.get_tensorrt_rewriter_config(
           conversion_params.rewriter_config, conversion_params.max_batch_size,
           conversion_params.max_workspace_size_bytes,
           conversion_params.precision_mode,
           conversion_params.minimum_segment_size,
           conversion_params.is_dynamic_op,
           conversion_params.maximum_cached_engines,
-          conversion_params.cached_engine_batch_sizes,
+          conversion_params.cached_engine_batches,
           conversion_params.use_calibration)
 
       graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_cfg)
     else:
       graph_options = config_pb2.GraphOptions()
+      if conversion_params.rewriter_config is not None:
+        graph_options.rewrite_options.CopyFrom(
+            conversion_params.rewriter_config)
 
     config = config_pb2.ConfigProto(
         gpu_options=self._GetGPUOptions(), graph_options=graph_options)
@@ -261,7 +283,7 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
 
   def _ExpectTestValue(self, engine_name, method, expected_value):
     label = "%s:%s" % (engine_name, method)
-    actual_value = trt_convert.get_test_value(label)
+    actual_value = get_test_value(label)
     self.assertEqual(
         expected_value,
         actual_value,
@@ -277,72 +299,112 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
   def _ExpectNativeSegment(self, engine_name, value):
     self._ExpectTestValue(engine_name, "ExecuteNativeSegment", value)
 
+  def _GetFeedNames(self):
+    params = self._GetParamsCached()
+    # Construct the feeds tensor names by appending :0 to the node names.
+    return [input_name + ":0" for input_name in params.input_names]
+
+  def _GetFetchNames(self):
+    params = self._GetParamsCached()
+    # Construct the fetches tensor names by appending :0 to the node names.
+    return [output_name + ":0" for output_name in params.output_names]
+
+  def _GetFeedDict(self, inputs_data, input_shape_index):
+    assert input_shape_index < len(inputs_data)
+    feeds = self._GetFeedNames()
+    return {
+        feeds[i]: inputs_data[input_shape_index][i] for i in range(len(feeds))
+    }
+
   def _RunGraph(self,
                 run_params,
                 gdef,
-                input_data,
+                inputs_data,
                 config,
                 graph_state,
                 num_runs=2):
     """Run given graphdef multiple times."""
     params = self._GetParamsCached()
-    assert len(params.input_names) == len(input_data)
+    for data in inputs_data:
+      assert len(params.input_names) == len(data)
+
+    fetches = self._GetFetchNames()
     g = ops.Graph()
     with g.as_default():
-      io_ops = importer.import_graph_def(
-          graph_def=gdef,
-          return_elements=params.input_names + params.output_names,
-          name="")
-      inputs = [op.outputs[0] for op in io_ops[:len(params.input_names)]]
-      assert len(inputs) == len(input_data)
-      outputs = [op.outputs[0] for op in io_ops[len(params.input_names):]]
-    with self.test_session(
-        graph=g, config=config, use_gpu=True, force_gpu=True) as sess:
-      val = None
-      # Defaults to 2 runs to verify result across multiple runs is same.
-      for _ in range(num_runs):
-        self._PrepareRun(graph_state)
-        new_val = sess.run(
-            outputs, {inputs[i]: input_data[i] for i in range(len(inputs))})
-        output_len = len(params.expected_output_dims)
-        self.assertEqual(output_len, len(new_val))
-        for i in range(output_len):
-          self.assertEqual(params.expected_output_dims[i], new_val[i].shape)
-        if val is not None:
-          self.assertAllClose(val, new_val, atol=1.e-06, rtol=1.e-06)
-        val = new_val
-        self.VerifyRun(run_params, graph_state)
-    return val
-
-  # Use real data that is representative of the inference dataset
-  # for calibration. For this test script it is random data.
-  def _RunCalibration(self, run_params, gdef, input_data, config):
-    """Run calibration on given graph."""
-    return self._RunGraph(
-        run_params, gdef, input_data, config, GraphState.CALIBRATE, num_runs=5)
-
-  def _GetTrtGraphDef(self, run_params, gdef):
-    """Return trt converted graphdef."""
+      importer.import_graph_def(graph_def=gdef, name="")
+      with self.session(
+          graph=g, config=config, use_gpu=True, force_gpu=True) as sess:
+        vals = []
+        # Run for each input(s) shape
+        for shape_index in range(len(inputs_data)):
+          val = None
+          for _ in range(num_runs):
+            self._PrepareRun(graph_state)
+            new_val = sess.run(fetches,
+                               self._GetFeedDict(inputs_data, shape_index))
+            output_len = len(params.expected_output_dims[shape_index])
+            self.assertEqual(output_len, len(new_val))
+            for i in range(output_len):
+              self.assertEqual(
+                  list(params.expected_output_dims[shape_index][i]),
+                  list(new_val[i].shape))
+            if val is not None:
+              self.assertAllClose(val, new_val, atol=1.e-06, rtol=1.e-06)
+            val = new_val
+            self.VerifyRun(run_params, graph_state)
+          vals.append(val)
+        return vals
+
+  def _CreateConverter(self, gdef, session_config, conversion_params):
+    """Return a TrtGraphConverter."""
     params = self._GetParamsCached()
-    conversion_params = self.GetConversionParams(run_params)
-    logging.info(conversion_params)
-
-    config_for_trt = config_pb2.ConfigProto(gpu_options=self._GetGPUOptions())
-    if conversion_params.rewriter_config is not None:
-      config_for_trt.graph_options.rewrite_options.CopyFrom(
-          conversion_params.rewriter_config)
-    return trt_convert.create_inference_graph(
+    converter = trt_convert.TrtGraphConverter(
         input_graph_def=gdef,
-        outputs=params.input_names + params.output_names,
+        nodes_blacklist=params.input_names + params.output_names,
+        session_config=session_config,
         max_batch_size=conversion_params.max_batch_size,
         max_workspace_size_bytes=conversion_params.max_workspace_size_bytes,
         precision_mode=conversion_params.precision_mode,
         minimum_segment_size=conversion_params.minimum_segment_size,
         is_dynamic_op=conversion_params.is_dynamic_op,
         maximum_cached_engines=conversion_params.maximum_cached_engines,
-        cached_engine_batch_sizes=conversion_params.cached_engine_batch_sizes,
-        use_calibration=conversion_params.use_calibration,
-        session_config=config_for_trt)
+        cached_engine_batches=conversion_params.cached_engine_batches,
+        use_calibration=conversion_params.use_calibration)
+    return converter
+
+  def _GetCalibratedInferGraph(self, run_params, gdef, inputs_data):
+    """Return trt converted graphdef in INT8 mode."""
+    conversion_params = self.GetConversionParams(run_params)
+    logging.info(conversion_params)
+    assert conversion_params.precision_mode == "INT8"
+    assert conversion_params.is_dynamic_op
+    assert conversion_params.maximum_cached_engines == 1
+    assert not conversion_params.cached_engine_batches
+    assert conversion_params.use_calibration
+    assert len(inputs_data) == 1  # We only support calibrating single engine.
+
+    session_config = self._GetConfigProto(run_params, GraphState.CALIBRATE)
+    logging.info("Running calibration graph, config:\n%s", str(session_config))
+
+    converter = self._CreateConverter(gdef, session_config, conversion_params)
+    int8_gdef = converter.convert()
+    self._VerifyGraphDef(run_params, int8_gdef, GraphState.CALIBRATE)
+
+    return converter.calibrate(
+        fetch_names=self._GetFetchNames(),
+        num_runs=5,
+        feed_dict_fn=lambda: self._GetFeedDict(inputs_data, 0))
+
+  def _GetInferGraph(self, run_params, gdef):
+    """Return trt converted graphdef."""
+    conversion_params = self.GetConversionParams(run_params)
+    logging.info(conversion_params)
+
+    session_config = self._GetConfigProto(run_params, GraphState.INFERENCE)
+    logging.info("Creating TRT graph for inference, config\n%s",
+                 str(session_config))
+    converter = self._CreateConverter(gdef, session_config, conversion_params)
+    return converter.convert()
 
   def _WriteGraph(self, run_params, gdef, graph_state):
     if graph_state == GraphState.ORIGINAL:
@@ -425,13 +487,14 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
 
     expected_engines = self.ExpectedEnginesToBuild(run_params)
     num_engines = 0
+    functions = [f.signature.name for f in gdef.library.function]
     for node in gdef.node:
       if node.op == "TRTEngineOp":
         logging.info("Found TRTEngineOp: " + node.name)
-    for node in gdef.node:
-      if node.op == "TRTEngineOp":
         num_engines += 1
-        self.assertTrue(node.name in expected_engines, node.name)
+        function_name = node.name + "_native_segment"
+        self.assertIn(function_name, functions)
+        self.assertIn(node.name, expected_engines)
         self.assertTrue(len(node.attr["serialized_segment"].s), node.name)
         self.assertTrue(len(node.attr["segment_funcdef_name"].s), node.name)
         self.assertEqual(
@@ -474,51 +537,43 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
             dtypes.as_dtype(node.attr["dtype"].type).as_numpy_dtype())
     assert len(params.input_names) == len(input_dtypes)
 
-    input_data = []
-    for i in range(len(params.input_names)):
-      dtype = input_dtypes[params.input_names[i]]
-      # Multiply the input by some constant to avoid all zeros input for integer
-      # types.
-      scale = 10.0 if np.issubdtype(dtype, np.integer) else 1.0
-      dims = params.input_dims[i]
-      # TODO(laigd): add debug options. E.g. we can set the input data to be
-      # continuous natural numbers:
-      # seq = np.arange(np.prod(dims))
-      # seq.resize(dims)
-      # input_data.append(scale * seq.astype(dtype))
-      input_data.append((scale * np.random.random_sample(dims)).astype(dtype))
+    inputs_data = []
+    for inp in params.input_dims:
+      current_input_data = []
+      for i in range(len(params.input_names)):
+        dtype = input_dtypes[params.input_names[i]]
+        # Multiply the input by some constant to avoid all zeros input for
+        # integer types.
+        scale = 10.0 if np.issubdtype(dtype, np.integer) else 1.0
+        dims = inp[i]
+        # TODO(laigd): add debug options. E.g. we can set the input data to be
+        # continuous natural numbers:
+        # seq = np.arange(np.prod(dims))
+        # seq.resize(dims)
+        # input_data.append(scale * seq.astype(dtype))
+        current_input_data.append(
+            (scale * np.random.random_sample(dims)).astype(dtype))
+      inputs_data.append(current_input_data)
+
+    # Verify original graph.
     self._VerifyGraphDef(run_params, input_gdef, GraphState.ORIGINAL)
 
-    # Get reference result without running trt.
+    # Run original graph without trt to get reference result.
     config_no_trt = self._GetConfigProto(run_params, GraphState.ORIGINAL)
     logging.info("Running original graph w/o trt, config:\n%s",
                  str(config_no_trt))
-    ref_result = self._RunGraph(run_params, input_gdef, input_data,
+    ref_result = self._RunGraph(run_params, input_gdef, inputs_data,
                                 config_no_trt, GraphState.ORIGINAL)
 
     # Run calibration if necessary.
     if (IsQuantizationMode(run_params.precision_mode) and
         run_params.use_calibration):
-
-      calib_config = self._GetConfigProto(run_params, GraphState.CALIBRATE)
-      logging.info("Running calibration graph, config:\n%s", str(calib_config))
-      if run_params.use_optimizer:
-        result = self._RunCalibration(run_params, input_gdef, input_data,
-                                      calib_config)
-      else:
-        calib_gdef = self._GetTrtGraphDef(run_params, input_gdef)
-        self._VerifyGraphDef(run_params, calib_gdef, GraphState.CALIBRATE)
-        result = self._RunCalibration(run_params, calib_gdef, input_data,
-                                      calib_config)
-      infer_gdef = trt_convert.calib_graph_to_infer_graph(
-          calib_gdef, run_params.dynamic_engine)
+      infer_gdef = self._GetCalibratedInferGraph(run_params, input_gdef,
+                                                 inputs_data)
+      self._VerifyGraphDef(run_params, infer_gdef, GraphState.INFERENCE)
+    elif not run_params.use_optimizer:
+      infer_gdef = self._GetInferGraph(run_params, input_gdef)
       self._VerifyGraphDef(run_params, infer_gdef, GraphState.INFERENCE)
-
-      self.assertAllClose(
-          ref_result,
-          result,
-          atol=self.ExpectedAbsoluteTolerance(run_params),
-          rtol=self.ExpectedRelativeTolerance(run_params))
     else:
       infer_gdef = input_gdef
 
@@ -526,11 +581,7 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
     infer_config = self._GetConfigProto(run_params, GraphState.INFERENCE)
     logging.info("Running final inference graph, config:\n%s",
                  str(infer_config))
-    if not run_params.use_optimizer:
-      infer_gdef = self._GetTrtGraphDef(run_params, infer_gdef)
-      self._VerifyGraphDef(run_params, infer_gdef, GraphState.INFERENCE)
-
-    result = self._RunGraph(run_params, infer_gdef, input_data, infer_config,
+    result = self._RunGraph(run_params, infer_gdef, inputs_data, infer_config,
                             GraphState.INFERENCE)
     self.assertAllClose(
         ref_result,
@@ -574,9 +625,8 @@ def _AddTests(test_class):
   for (use_optimizer, precision_mode, dynamic_engine, use_calibration) in opts:
     if IsQuantizationMode(precision_mode):
       if use_optimizer:
-        # TODO(aaroey): if use_optimizer is True we need to get the inference
-        # graphdef using custom python wrapper class, which is not currently
-        # supported yet.
+        # We ignore the use_optimizer option and always use TrtGraphConverter
+        # for INT8 mode, so no need to run it twice.
         continue
       if use_calibration and not dynamic_engine:
         # Static engine with use_calibration=False will be static, so we want to
@@ -603,5 +653,5 @@ def _AddTests(test_class):
     setattr(test_class, "testTfTrt_" + test_name, _GetTest(run_params))
 
 
-if trt_convert.is_tensorrt_enabled():
+if is_tensorrt_enabled():
   _AddTests(TfTrtIntegrationTestBase)
diff --git a/tensorflow/python/compiler/tensorrt/test/topk_test.py b/tensorflow/python/compiler/tensorrt/test/topk_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e2bf3b65c32054693e92a83d1ba2d9074387f2d
--- /dev/null
+++ b/tensorflow/python/compiler/tensorrt/test/topk_test.py
@@ -0,0 +1,89 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model script to test TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class TopKTest(trt_test.TfTrtIntegrationTestBase):
+
+  def GetParams(self):
+    """Testing Top-K in TF-TRT conversion."""
+    dtype = dtypes.float32
+    input_name = "input"
+    input_dims = [100, 100]
+    k = 5
+    g = ops.Graph()
+    with g.as_default():
+      x = array_ops.placeholder(dtype=dtype, shape=input_dims, name=input_name)
+      k_tensor = constant_op.constant(k, dtype=dtypes.int32, name="Const")
+      values, indices = nn_ops.top_k(x, k_tensor, name="TopK")
+      values = array_ops.identity(values, name="output_values")
+      indices = array_ops.identity(indices, name="output_indices")
+    return trt_test.TfTrtIntegrationTestParams(
+        gdef=g.as_graph_def(),
+        input_names=[input_name],
+        input_dims=[[input_dims]],
+        output_names=["output_values", "output_indices"],
+        expected_output_dims=[[[100, k], [100, k]]])
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return {"TRTEngineOp_0": ["Const", "TopK"]}
+
+
+class TopKOutputTypeTest(trt_test.TfTrtIntegrationTestBase):
+
+  def GetParams(self):
+    """Testing that output type of engine using Top-K is set correctly."""
+    dtype = dtypes.float32
+    input_name = "input"
+    input_dims = [100, 100]
+    k = 5
+    g = ops.Graph()
+    with g.as_default():
+      x = array_ops.placeholder(dtype=dtype, shape=input_dims, name=input_name)
+      k_tensor = constant_op.constant(k, dtype=dtypes.int32, name="Const")
+      values, indices = nn_ops.top_k(x, k_tensor, name="TopK")
+      # Reshape will act as a layer between the TopK output and the engine
+      # output, requiring the output tensor of reshape to be set explicitly to
+      # int32.
+      indices = array_ops.reshape(indices, [100, 1, 5], name="Reshape")
+      values = array_ops.identity(values, name="output_values")
+      indices = array_ops.identity(indices, name="output_indices")
+    return trt_test.TfTrtIntegrationTestParams(
+        gdef=g.as_graph_def(),
+        input_names=[input_name],
+        input_dims=[[input_dims]],
+        output_names=["output_values", "output_indices"],
+        expected_output_dims=[[[100, k], [100, 1, k]]])
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return {"TRTEngineOp_0": ["Const", "TopK", "Reshape", "Reshape/shape"]}
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/tensorrt/test/unary_test.py b/tensorflow/python/compiler/tensorrt/test/unary_test.py
similarity index 94%
rename from tensorflow/contrib/tensorrt/test/unary_test.py
rename to tensorflow/python/compiler/tensorrt/test/unary_test.py
index b6e5e32db1236684a06c2d44298b9a3d39667152..83569bcfbf12a27fec8590d18a1b016b92a9cf86 100644
--- a/tensorflow/contrib/tensorrt/test/unary_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/unary_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -100,9 +100,9 @@ class UnaryTest(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name, input2_name],
-        input_dims=[input_dims, input2_dims],
+        input_dims=[[input_dims, input2_dims]],
         output_names=[output_name],
-        expected_output_dims=[(12, 5, 8, 12)])
+        expected_output_dims=[[[12, 5, 8, 12]]])
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
diff --git a/tensorflow/contrib/tensorrt/test/vgg_block_nchw_test.py b/tensorflow/python/compiler/tensorrt/test/vgg_block_nchw_test.py
similarity index 94%
rename from tensorflow/contrib/tensorrt/test/vgg_block_nchw_test.py
rename to tensorflow/python/compiler/tensorrt/test/vgg_block_nchw_test.py
index b29626d2c28b4def716aef9e2703b669b5e46374..97ee11747e889e4821c64c1cbafcfcee78d4405b 100644
--- a/tensorflow/contrib/tensorrt/test/vgg_block_nchw_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/vgg_block_nchw_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -70,9 +70,9 @@ class VGGBlockNCHWTest(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
-        input_dims=[input_dims],
+        input_dims=[[input_dims]],
         output_names=[output_name],
-        expected_output_dims=[(5, 6, 2, 2)])
+        expected_output_dims=[[[5, 6, 2, 2]]])
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
diff --git a/tensorflow/contrib/tensorrt/test/vgg_block_test.py b/tensorflow/python/compiler/tensorrt/test/vgg_block_test.py
similarity index 93%
rename from tensorflow/contrib/tensorrt/test/vgg_block_test.py
rename to tensorflow/python/compiler/tensorrt/test/vgg_block_test.py
index 9b0b189626050f678c71e9abbf7eb5296440d879..a4fa1d67059093e93da1bda55a20ba75f45776ff 100644
--- a/tensorflow/contrib/tensorrt/test/vgg_block_test.py
+++ b/tensorflow/python/compiler/tensorrt/test/vgg_block_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -61,9 +61,9 @@ class VGGBlockTest(trt_test.TfTrtIntegrationTestBase):
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
         input_names=[input_name],
-        input_dims=[input_dims],
+        input_dims=[[input_dims]],
         output_names=[output_name],
-        expected_output_dims=[(5, 2, 2, 6)])
+        expected_output_dims=[[[5, 2, 2, 6]]])
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
diff --git a/tensorflow/contrib/tensorrt/trt_conversion.i b/tensorflow/python/compiler/tensorrt/trt_conversion.i
similarity index 53%
rename from tensorflow/contrib/tensorrt/trt_conversion.i
rename to tensorflow/python/compiler/tensorrt/trt_conversion.i
index 6ea15fb8eff13663625420288a37ba002d57fa47..35a6fa137d02d968f8929409709cb669f5c619cb 100644
--- a/tensorflow/contrib/tensorrt/trt_conversion.i
+++ b/tensorflow/python/compiler/tensorrt/trt_conversion.i
@@ -17,38 +17,10 @@ limitations under the License.
 %{
 #define SWIG_FILE_WITH_INIT
 %}
-%include "std_pair.i"
+%include "std_string.i"
 %include "tensorflow/python/platform/base.i"
 
 %{
-PyObject* pair_helper(std::pair<string, string>* in) {
-  PyObject *first(nullptr), *second(nullptr), *tuple(nullptr);
-  first = PyBytes_FromStringAndSize(in->first.data(), in->first.length());
-  if (!first) {
-    if (!PyErr_Occurred()) {
-      PyErr_SetString(PyExc_TypeError, "Pair conversion first argument failed");
-    }
-    return NULL;
-  }
-  second = PyBytes_FromStringAndSize(in->second.data(), in->second.length());
-  if (!second) {
-    if (!PyErr_Occurred()) {
-      PyErr_SetString(PyExc_TypeError,
-                      "Pair conversion second argument failed");
-    }
-    return NULL;
-  }
-  tuple = Py_BuildValue("(OO)", first, second);
-  if (!tuple) {
-    if (!PyErr_Occurred()) {
-      PyErr_SetString(PyExc_TypeError,
-                      "Tuple creation from pair<string,string> failed!");
-    }
-    return NULL;
-  }
-  return tuple;
-}
-
 struct version_struct{
   int vmajor;
   int vminor;
@@ -67,6 +39,7 @@ PyObject* version_helper(version_struct* in) {
   }
   return tuple;
 }
+
 /* Define converters for vector<int> */
 template<>
 bool _PyObjAs(PyObject *pyobj, int* dest) {
@@ -83,12 +56,6 @@ PyObject *_PyObjFrom(const int& src) {
 
 _LIST_OUTPUT_TYPEMAP(int, PyLong_FromLong);
 
-%typemap(out) std::pair<string, string> {
-  PyObject *tuple = pair_helper(&$1);
-  if (!tuple) SWIG_fail;
-  $result = tuple;
-}
-
 %typemap(out) version_struct {
   PyObject *tuple = version_helper(&$1);
   if (!tuple) SWIG_fail;
@@ -96,17 +63,13 @@ _LIST_OUTPUT_TYPEMAP(int, PyLong_FromLong);
 }
 
 %{
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/util/stat_summarizer.h"
-#include "tensorflow/contrib/tensorrt/convert/convert_graph.h"
-#include "tensorflow/contrib/tensorrt/convert/utils.h"
-#include "tensorflow/contrib/tensorrt/test/utils.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/convert_graph.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
+#include "tensorflow/compiler/tf2tensorrt/utils/test_utils.h"
 %}
 
 %ignoreall
 %unignore tensorflow;
-%unignore calib_convert;
 %unignore get_linked_tensorrt_version;
 %unignore get_loaded_tensorrt_version;
 %unignore is_tensorrt_enabled;
@@ -117,52 +80,6 @@ _LIST_OUTPUT_TYPEMAP(int, PyLong_FromLong);
 
 %{
 
-std::pair<string, string> calib_convert(
-    string graph_def_string, bool is_dyn_op
-    // unfortunately we can't use TF_Status here since it
-    // is in c/c_api and brings in a lot of other libraries
-    // which in turn declare ops. These ops are included
-    // statically in our library and cause an abort when
-    // module is loaded due to double registration
-    // until Tensorflow properly exposes these headers
-    // we have to work around this by returning a string
-    // and converting it to exception on python side.
-    //,TF_Status* out_status) {
-) {
-#if GOOGLE_CUDA && GOOGLE_TENSORRT
-  string out_status;
-
-  tensorflow::GraphDef graph_def;
-  if (!graph_def.ParseFromString(graph_def_string)) {
-    out_status = "InvalidArgument;Couldn't interpret input as a GraphDef";
-    return std::pair<string, string>{out_status, ""};
-  }
-  graph_def_string.resize(0);
-  tensorflow::GraphDef out_graph;
-  tensorflow::Status conversion_status =
-      tensorflow::tensorrt::convert::ConvertCalibGraphToInferGraph(
-          graph_def, &out_graph, is_dyn_op);
-  if (!conversion_status.ok()) {
-    auto retCode = (int)conversion_status.code();
-    char buff[2000];
-    snprintf(buff, 2000, "%d;%s", retCode,
-             conversion_status.error_message().c_str());
-    out_status = buff;
-    return std::pair<string, string>{out_status, ""};
-  }
-  string result;
-  if (!out_graph.SerializeToString(&result)) {
-    out_status = "InvalidArgument;Couldn't serialize output as a GraphDef";
-    return std::pair<string, string>{out_status, ""};
-  }
-  out_status = "OK;All good!";
-  return std::pair<string, string>{out_status, result};
-#else
-  // Returns FAILED_PRECONDITION.
-  return std::pair<string, string>{"9;TensorRT is not enabled!", ""};
-#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
-}
-
 version_struct get_linked_tensorrt_version() {
   // Return the version at the link time.
   version_struct s;
@@ -221,8 +138,6 @@ PyObject* get_test_value(PyObject* label) {
 
 %}
 
-std::pair<string, string> calib_convert(
-    string graph_def_string, bool is_dyn_op);
 version_struct get_linked_tensorrt_version();
 version_struct get_loaded_tensorrt_version();
 bool is_tensorrt_enabled();
diff --git a/tensorflow/python/compiler/tensorrt/trt_convert.py b/tensorflow/python/compiler/tensorrt/trt_convert.py
new file mode 100644
index 0000000000000000000000000000000000000000..0caa2bfaf3b29c26111b554cd30e0f1ab5cebe78
--- /dev/null
+++ b/tensorflow/python/compiler/tensorrt/trt_convert.py
@@ -0,0 +1,768 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Exposes the Python wrapper conversion to trt_graph."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six as _six
+from tensorflow.compiler.tf2tensorrt.python.ops import trt_ops
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import meta_graph_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.python.client import session
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import graph_util
+from tensorflow.python.framework import importer
+from tensorflow.python.framework import ops
+from tensorflow.python.grappler import tf_optimizer
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import tf_logging
+from tensorflow.python.saved_model import builder
+from tensorflow.python.saved_model import loader
+from tensorflow.python.saved_model import tag_constants
+from tensorflow.python.training import saver
+
+
+def _to_bytes(s):
+  """Encode s if it is a sequence of chars."""
+  if isinstance(s, _six.text_type):
+    return s.encode("utf-8", errors="surrogateescape")
+  return s
+
+
+def _to_string(s):
+  """Decode s if it is a sequence of bytes."""
+  if isinstance(s, _six.binary_type):
+    return s.decode("utf-8")
+  return s
+
+
+class GraphConverter(object):
+  """Base class for offline converters to optimize SavedModels/GraphDefs.
+
+  A `GraphConverter` object encapsulates the environment to convert (optimize) a
+  TensorFlow SavedModel or GraphDef.
+
+  To create a custom GraphConverter:
+
+  ```python
+  class MyGraphConverter(GraphConverter):
+    ...
+
+    def get_rewriter_config(self, rewriter_config_template=None):
+      my_rewriter_config = ...
+      return my_rewriter_config
+  ```
+
+  Then to run the conversion without quantization calibration:
+
+  ```python
+  my_converter = MyGraphConverter(input_saved_model_dir="my_dir")
+  converted_graph_def = my_converter.convert()
+  my_converter.save(output_saved_model_dir)  # Optional
+  ```
+
+  To run the conversion with quantization calibration:
+
+  ```python
+  my_converter = MyGraphConverter(input_saved_model_dir="my_dir")
+  my_converter.convert()
+
+  # Run calibration 10 times.
+  converted_graph_def = my_converter.calibrate(
+      fetch_names=['output:0'],
+      num_runs=10,
+      feed_dict_fn=lambda: {'input:0': my_next_data()})
+
+  my_converter.save(output_saved_model_dir)  # Optional
+  ```
+  """
+
+  def __init__(self,
+               input_saved_model_dir=None,
+               input_saved_model_tags=None,
+               input_graph_def=None,
+               nodes_blacklist=None,
+               session_config=None):
+    """Initialize the converter.
+
+    Args:
+      input_saved_model_dir: the directory to load the SavedModel which contains
+        the input graph to transforms. Used only when input_graph_def is None.
+      input_saved_model_tags: list of tags to load the SavedModel.
+      input_graph_def: a GraphDef object containing a model to be transformed.
+        If set to None, the graph will be read from the SavedModel loaded from
+        input_saved_model_dir.
+      nodes_blacklist: list of node names to prevent the converter from
+        touching. Only used when input_graph_def is not None.
+      session_config: the ConfigProto used to create a Session. It's also used
+        as a template to create a RewriterConfig for conversion. If not
+        specified, a default ConfigProto will be used.
+
+    Raises:
+      ValueError: if the combination of the parameters is invalid.
+    """
+    if input_graph_def and input_saved_model_dir:
+      raise ValueError(
+          "Can only specify one of input_graph_def and input_saved_model_dir")
+    if not input_graph_def and not input_saved_model_dir:
+      raise ValueError("Must specify one of input_graph_def and "
+                       "input_saved_model_dir")
+
+    self._input_graph_def = input_graph_def
+    self._nodes_blacklist = nodes_blacklist
+    self._input_saved_model_dir = input_saved_model_dir
+    self._converted = False
+    self._grappler_meta_graph_def = None
+
+    self._input_saved_model_tags = (
+        input_saved_model_tags or [tag_constants.SERVING])
+    self._session_config = session_config or config_pb2.ConfigProto()
+
+    # For calibration usage.
+    self._calibration_graph = None
+    self._calibration_sess = None
+    self._calibration_data_collected = False
+
+  def get_rewriter_config(self, rewriter_config_template=None):
+    """Returns a RewriterConfig proto for TRT transformation.
+
+    Args:
+      rewriter_config_template: a template RewriterConfig proto used to create a
+        RewriterConfig for the conversion. The implementation should not modify
+        the template. If None, it will use a default one.
+
+    Returns:
+      A RewriterConfig proto which will be used to run the conversion using
+      Grappler.
+    """
+    raise NotImplementedError("get_rewriter_config")
+
+  def _run_conversion(self):
+    """Run Grappler's OptimizeGraph() tool to convert the graph."""
+    # Create custom ConfigProto for Grappler.
+    grappler_session_config = config_pb2.ConfigProto()
+    grappler_session_config.CopyFrom(self._session_config)
+    rewriter_config = None
+    if (grappler_session_config.HasField("graph_options") and
+        grappler_session_config.graph_options.HasField("rewrite_options")):
+      rewriter_config = grappler_session_config.graph_options.rewrite_options
+    custom_rewriter_config = self.get_rewriter_config(rewriter_config)
+    grappler_session_config.graph_options.rewrite_options.CopyFrom(
+        custom_rewriter_config)
+
+    # Run Grappler.
+    self._converted_graph_def = tf_optimizer.OptimizeGraph(
+        grappler_session_config,
+        self._grappler_meta_graph_def,
+        graph_id=b"tf_graph")
+    self._converted = True
+
+  def _convert_graph_def(self):
+    """Convert the input GraphDef."""
+    graph = ops.Graph()
+    with graph.as_default():
+      importer.import_graph_def(self._input_graph_def, name="")
+    self._grappler_meta_graph_def = saver.export_meta_graph(
+        graph_def=graph.as_graph_def(add_shapes=True), graph=graph)
+    if self._nodes_blacklist:
+      output_collection = meta_graph_pb2.CollectionDef()
+      output_list = output_collection.node_list.value
+      for i in self._nodes_blacklist:
+        if isinstance(i, ops.Tensor):
+          output_list.append(_to_bytes(i.name))
+        else:
+          output_list.append(_to_bytes(i))
+      # TODO(laigd): use another key as the self._nodes_blacklist are really
+      # not train_op.
+      self._grappler_meta_graph_def.collection_def["train_op"].CopyFrom(
+          output_collection)
+
+    self._run_conversion()
+
+  def _convert_saved_model(self):
+    """Convert the input SavedModel."""
+    graph = ops.Graph()
+    with session.Session(graph=graph, config=self._session_config) as sess:
+      input_meta_graph_def = loader.load(sess, self._input_saved_model_tags,
+                                         self._input_saved_model_dir)
+
+      def _gather_names(tensor_info):
+        """Get the node names from a TensorInfo."""
+        return set([tensor_info[key].name.split(":")[0] for key in tensor_info])
+
+      # Get input and outputs from all SignatureDef.
+      output_node_names = set()
+      for key in input_meta_graph_def.signature_def:
+        signature_def = input_meta_graph_def.signature_def[key]
+        output_node_names.update(_gather_names(signature_def.inputs))
+        output_node_names.update(_gather_names(signature_def.outputs))
+
+      # Freeze the variables in the SavedModel graph and copy the frozen
+      # graph over.
+      frozen_graph_def = graph_util.convert_variables_to_constants(
+          sess, sess.graph.as_graph_def(add_shapes=True),
+          list(output_node_names))
+      self._grappler_meta_graph_def = meta_graph_pb2.MetaGraphDef()
+      self._grappler_meta_graph_def.graph_def.CopyFrom(frozen_graph_def)
+
+      # Copy the collections that are not variables.
+      for key in input_meta_graph_def.collection_def:
+        # TODO(laigd): currently we use the collection key to filter out
+        # collections that depend on variable ops, but this may miss some
+        # other user-defined collections. A better way would be to use
+        # CollectionDef::NodeList for the filtering.
+        if key not in [
+            "variables", "local_variables", "model_variables",
+            "trainable_variables", "train_op", "table_initializer"
+        ]:
+          self._grappler_meta_graph_def.collection_def[key].CopyFrom(
+              input_meta_graph_def.collection_def[key])
+
+      # Copy other information.
+      self._grappler_meta_graph_def.meta_info_def.CopyFrom(
+          input_meta_graph_def.meta_info_def)
+      for key in input_meta_graph_def.signature_def:
+        self._grappler_meta_graph_def.signature_def[key].CopyFrom(
+            input_meta_graph_def.signature_def[key])
+      # TODO(laigd): maybe add back AssetFileDef.
+
+    self._run_conversion()
+
+  def convert(self):
+    """Run the conversion.
+
+    Returns:
+      The converted GraphDef.
+    """
+    assert not self._converted
+
+    if self._input_graph_def:
+      self._convert_graph_def()
+    else:
+      self._convert_saved_model()
+    return self._converted_graph_def
+
+  def calibrate(self,
+                fetch_names,
+                num_runs,
+                feed_dict_fn=None,
+                input_map_fn=None):
+    """Run the calibration and return the calibrated GraphDef.
+
+    Args:
+      fetch_names: a list of output tensor name to fetch during calibration.
+      num_runs: number of runs of the graph during calibration.
+      feed_dict_fn: a function that returns a dictionary mapping input names (as
+        strings) in the GraphDef to be calibrated to values (e.g. Python list,
+        numpy arrays, etc). One and only one of `feed_dict_fn` and
+        `input_map_fn` should be specified.
+      input_map_fn: a function that returns a dictionary mapping input names (as
+        strings) in the GraphDef to be calibrated to Tensor objects. The values
+        of the named input tensors in the GraphDef to be calibrated will be
+        re-mapped to the respective `Tensor` values during calibration. One and
+        only one of `feed_dict_fn` and `input_map_fn` should be specified.
+
+    Raises:
+      ValueError: if the input combination is invalid.
+
+    Returns:
+      The GraphDef after the calibration.
+    """
+    assert self._converted
+    assert not self._calibration_sess
+    if (feed_dict_fn and input_map_fn) or (not feed_dict_fn and
+                                           not input_map_fn):
+      raise ValueError(
+          "Should specify one and only one of feed_dict_fn and input_map_fn.")
+
+    self._calibration_graph = ops.Graph()
+    with self._calibration_graph.as_default():
+      fetches = importer.import_graph_def(
+          self._converted_graph_def,
+          input_map=input_map_fn() if input_map_fn else None,
+          return_elements=fetch_names,
+          name="")
+    self._calibration_sess = session.Session(
+        graph=self._calibration_graph, config=self._session_config)
+
+    for _ in range(num_runs):
+      self._calibration_sess.run(
+          fetches, feed_dict=feed_dict_fn() if feed_dict_fn else None)
+
+    self.finalize_calibration()
+    return self._converted_graph_def
+
+  def finalize_calibration(self):
+    """Clean up calibration resources and finalize the calibration.
+
+    Implementations need to close self._calibration_sess before returning.
+    """
+    raise NotImplementedError("finalize_calibration")
+
+  def save(self, output_saved_model_dir):
+    """Save the converted graph as a SavedModel.
+
+    Args:
+      output_saved_model_dir: construct a SavedModel using the converted
+        GraphDef and save it to the specified directory. This option only works
+        when the input graph is loaded from a SavedModel, i.e. when
+        input_saved_model_dir is specified and input_graph_def is None in
+        __init__().
+
+    Raises:
+      ValueError: if the input to the converter is a GraphDef instead of a
+      SavedModel.
+    """
+    assert self._converted
+
+    if self._input_graph_def:
+      raise ValueError(
+          "Not able to save to a SavedModel since input is a GraphDef")
+
+    # Write the transformed graphdef as SavedModel.
+    saved_model_builder = builder.SavedModelBuilder(output_saved_model_dir)
+    with ops.Graph().as_default():
+      importer.import_graph_def(self._converted_graph_def, name="")
+      # We don't use any specific converter here.
+      with session.Session(config=self._session_config) as sess:
+        saved_model_builder.add_meta_graph_and_variables(
+            sess,
+            self._input_saved_model_tags,
+            signature_def_map=self._grappler_meta_graph_def.signature_def)
+    # Ignore other meta graphs from the input SavedModel.
+    saved_model_builder.save()
+
+
+class TrtPrecisionMode(object):
+  FP32 = "FP32"
+  FP16 = "FP16"
+  INT8 = "INT8"
+
+  @staticmethod
+  def supported_precision_modes():
+    return [TrtPrecisionMode.FP32, TrtPrecisionMode.FP16, TrtPrecisionMode.INT8]
+
+
+# Use a large enough number as the default max_workspace_size for TRT engines,
+# so it can produce reasonable performance results with the default.
+DEFAULT_TRT_MAX_WORKSPACE_SIZE_BYTES = 1 << 30
+
+
+class TrtGraphConverter(GraphConverter):
+  """A GraphConverter for TRT transformation."""
+
+  _TRT_CALIBRATION_RESOURCE_CONTAINER_NAME = "TF_TRT_Calibration"
+
+  @classmethod
+  def get_tensorrt_rewriter_config(
+      cls,
+      rewriter_config_template=None,
+      max_batch_size=1,
+      max_workspace_size_bytes=DEFAULT_TRT_MAX_WORKSPACE_SIZE_BYTES,
+      precision_mode=TrtPrecisionMode.FP32,
+      minimum_segment_size=3,
+      is_dynamic_op=False,
+      maximum_cached_engines=1,
+      cached_engine_batches=None,
+      use_calibration=True):
+    """Returns a RewriterConfig proto for TRT transformation.
+
+    Args:
+      rewriter_config_template: a template RewriterConfig proto used to create a
+        TRT-enabled RewriterConfig. If None, it will use a default one.
+      max_batch_size: max size for the input batch
+      max_workspace_size_bytes: the maximum GPU temporary memory which the TRT
+        engine can use at execution time. This corresponds to the
+        'workspaceSize' parameter of nvinfer1::IBuilder::setMaxWorkspaceSize().
+      precision_mode: one of TrtPrecisionMode.supported_precision_modes().
+      minimum_segment_size: the minimum number of nodes required for a subgraph
+        to be replaced by TRTEngineOp.
+      is_dynamic_op: whether to generate dynamic TRT ops which will build the
+        TRT network and engine at run time.
+      maximum_cached_engines: max number of cached TRT engines in dynamic TRT
+        ops. If the number of cached engines is already at max but none of them
+        can serve the input, the TRTEngineOp will fall back to run the TF
+        function based on which the TRTEngineOp is created.
+      cached_engine_batches: a list of batch sizes used to create cached
+        engines, only used when is_dynamic_op is True. The length of the list
+        should be <= maximum_cached_engines, and the dynamic TRT op will use
+        this list to determine the batch sizes of the cached engines, instead of
+        making the decision on the fly. This is useful when we know the most
+        common batch size(s) the application is going to generate.
+      use_calibration: this argument is ignored if precision_mode is not INT8.
+        If set to True, a calibration graph will be created to calibrate the
+        missing ranges. The calibration graph must be converted to an inference
+        graph using calib_graph_to_infer_graph() after running calibration. if
+        set to False, quantization nodes will be expected for every tensor in
+        the graph (exlcuding those which will be fused). If a range is missing,
+        an error will occur. Please note that accuracy may be negatively
+        affected if there is a mismatch between which tensors TRT quantizes and
+        which tensors were trained with fake quantization.
+
+    Returns:
+      A RewriterConfig proto which sets a TensorRTOptimizer to run Grappler.
+
+    Raises:
+      TypeError: if any of the parameters are of unexpected type.
+      ValueError: if any of the parameters are of unexpected value.
+    """
+    # Lazily load the TF-TRT C bindings, so `import tensorflow` doesn't complain
+    # even if it cannot find TensorRT library.
+    trt_ops.load_trt_ops()
+    # pylint: disable=g-import-not-at-top,unused-import,line-too-long,unused-variable
+    # Import a random symbol to trigger loading of TRT library.
+    from tensorflow.python.compiler.tensorrt.wrap_conversion import get_linked_tensorrt_version
+    # pylint: enable=g-import-not-at-top,unused-import,line-too-long,unused-variable
+
+    if rewriter_config_template is not None and not isinstance(
+        rewriter_config_template, rewriter_config_pb2.RewriterConfig):
+      raise TypeError(
+          "rewriter_config_template should be a RewriterConfig proto.")
+
+    rewriter_config_with_trt = rewriter_config_pb2.RewriterConfig()
+    if rewriter_config_template is None:
+      # Layout optimizer may add Const nodes followed by Reshape nodes, thus we
+      # need to run constant folding again.
+      rewriter_config_with_trt.optimizers.extend(
+          ["constfold", "layout", "constfold"])
+      rewriter_config_with_trt.meta_optimizer_iterations = (
+          rewriter_config_pb2.RewriterConfig.ONE)
+    else:
+      rewriter_config_with_trt.CopyFrom(rewriter_config_template)
+
+    optimizer = rewriter_config_with_trt.custom_optimizers.add()
+    optimizer.name = "TensorRTOptimizer"
+    optimizer.parameter_map["minimum_segment_size"].i = minimum_segment_size
+    optimizer.parameter_map["max_batch_size"].i = max_batch_size
+    optimizer.parameter_map["is_dynamic_op"].b = is_dynamic_op
+    optimizer.parameter_map[
+        "max_workspace_size_bytes"].i = max_workspace_size_bytes
+    optimizer.parameter_map["precision_mode"].s = _to_bytes(precision_mode)
+    optimizer.parameter_map["maximum_cached_engines"].i = maximum_cached_engines
+    if cached_engine_batches:
+      optimizer.parameter_map["cached_engine_batches"].list.i.extend(
+          cached_engine_batches)
+    optimizer.parameter_map["use_calibration"].b = use_calibration
+    return rewriter_config_with_trt
+
+  def __init__(self,
+               input_saved_model_dir=None,
+               input_saved_model_tags=None,
+               input_graph_def=None,
+               nodes_blacklist=None,
+               session_config=None,
+               max_batch_size=1,
+               max_workspace_size_bytes=DEFAULT_TRT_MAX_WORKSPACE_SIZE_BYTES,
+               precision_mode=TrtPrecisionMode.FP32,
+               minimum_segment_size=3,
+               is_dynamic_op=False,
+               maximum_cached_engines=1,
+               cached_engine_batches=None,
+               use_calibration=True):
+    """Initialize the converter.
+
+    Args:
+      input_saved_model_dir: the directory to load the SavedModel which contains
+        the input graph to transforms. Used only when input_graph_def is None.
+      input_saved_model_tags: list of tags to load the SavedModel.
+      input_graph_def: a GraphDef object containing a model to be transformed.
+        If set to None, the graph will be read from the SavedModel loaded from
+        input_saved_model_dir.
+      nodes_blacklist: list of node names to prevent the converter from
+        touching. Only used when input_graph_def is not None.
+      session_config: the ConfigProto used to create a Session. It's also used
+        as a template to create a TRT-enabled ConfigProto for conversion. If not
+        specified, a default ConfigProto will be used.
+      max_batch_size: max size for the input batch.
+      max_workspace_size_bytes: the maximum GPU temporary memory which the TRT
+        engine can use at execution time. This corresponds to the
+        'workspaceSize' parameter of nvinfer1::IBuilder::setMaxWorkspaceSize().
+      precision_mode: one of TrtPrecisionMode.supported_precision_modes().
+      minimum_segment_size: the minimum number of nodes required for a subgraph
+        to be replaced by TRTEngineOp.
+      is_dynamic_op: whether to generate dynamic TRT ops which will build the
+        TRT network and engine at run time.
+      maximum_cached_engines: max number of cached TRT engines in dynamic TRT
+        ops. If the number of cached engines is already at max but none of them
+        can serve the input, the TRTEngineOp will fall back to run the TF
+        function based on which the TRTEngineOp is created.
+      cached_engine_batches: a list of batch sizes used to create cached
+        engines, only used when is_dynamic_op is True. The length of the list
+        should be <= maximum_cached_engines, and the dynamic TRT op will use
+        this list to determine the batch sizes of the cached engines, instead of
+        making the decision on the fly. This is useful when we know the most
+        common batch size(s) the application is going to generate.
+      use_calibration: this argument is ignored if precision_mode is not INT8.
+        If set to True, a calibration graph will be created to calibrate the
+        missing ranges. The calibration graph must be converted to an inference
+        graph using calib_graph_to_infer_graph() after running calibration. if
+        set to False, quantization nodes will be expected for every tensor in
+        the graph (exlcuding those which will be fused). If a range is missing,
+        an error will occur. Please note that accuracy may be negatively
+        affected if there is a mismatch between which tensors TRT quantizes and
+        which tensors were trained with fake quantization.
+
+    Raises:
+      ValueError: if the combination of the parameters is invalid.
+      RuntimeError: if the TensorRT library version is incompatible.
+    """
+    super(TrtGraphConverter, self).__init__(
+        input_saved_model_dir=input_saved_model_dir,
+        input_saved_model_tags=input_saved_model_tags,
+        input_graph_def=input_graph_def,
+        nodes_blacklist=nodes_blacklist,
+        session_config=session_config)
+
+    # Lazily load the TF-TRT C bindings, so `import tensorflow` doesn't complain
+    # even if it cannot find TensorRT library.
+    trt_ops.load_trt_ops()
+    # pylint: disable=g-import-not-at-top,line-too-long
+    from tensorflow.python.compiler.tensorrt.wrap_conversion import get_linked_tensorrt_version
+    from tensorflow.python.compiler.tensorrt.wrap_conversion import get_loaded_tensorrt_version
+    # pylint: enable=g-import-not-at-top,line-too-long
+
+    # Check compatibility of TensorRT version.
+    compiled_version = get_linked_tensorrt_version()
+    loaded_version = get_loaded_tensorrt_version()
+    version_mismatch = False
+    if loaded_version[0] < compiled_version[0]:
+      tf_logging.error(
+          "TensorRT version mismatch. Tensorflow was compiled against " +
+          "TensorRT %s but library loaded from environment is TensorRT %s" %
+          (".".join([str(x) for x in compiled_version]),
+           ".".join([str(x) for x in loaded_version])) +
+          ". Please make sure that correct version of TensorRT " +
+          "is available in the system and added to ldconfig or LD_LIBRARY_PATH")
+      raise RuntimeError("Incompatible TensorRT library version")
+    for i in zip(loaded_version, compiled_version):
+      if i[0] != i[1]:
+        tf_logging.warn("TensorRT mismatch. Compiled against version " +
+                        "%s, but loaded %s. Things may not work" %
+                        (".".join([str(x) for x in compiled_version]),
+                         ".".join([str(x) for x in loaded_version])))
+        version_mismatch = True
+        break
+    if not version_mismatch:
+      tf_logging.info("Running against TensorRT version %s" %
+                      ".".join([str(x) for x in loaded_version]))
+
+    # Check input arguments.
+    if precision_mode not in TrtPrecisionMode.supported_precision_modes():
+      raise ValueError(("precision mode '{}' is not supported."
+                        "It should be one of {}").format(
+                            precision_mode,
+                            TrtPrecisionMode.supported_precision_modes))
+
+    if cached_engine_batches:
+      if not isinstance(cached_engine_batches, list):
+        raise TypeError("cached_engine_batches should be a list.")
+      if len(cached_engine_batches) > maximum_cached_engines:
+        raise ValueError("cached_engine_batches should not contain more than "
+                         "maximum_cached_engines items.")
+
+    self._need_calibration = (
+        precision_mode == TrtPrecisionMode.INT8 and use_calibration)
+
+    # TODO(laigd):
+    # - Get rid of is_dynamic_op option, it should always be True, and it should
+    #   accept N shapes as input.
+    # - Verify in int8 mode that maximum_cached_engines and
+    #   cached_engine_batches are set appropriately.
+    # - If it fails to build the int8 engine it should return error.
+    self._max_batch_size = max_batch_size
+    self._max_workspace_size_bytes = max_workspace_size_bytes
+    self._precision_mode = precision_mode
+    self._minimum_segment_size = minimum_segment_size
+    self._is_dynamic_op = is_dynamic_op
+    self._maximum_cached_engines = maximum_cached_engines
+    self._cached_engine_batches = cached_engine_batches
+
+  def get_rewriter_config(self, rewriter_config_template=None):
+    return TrtGraphConverter.get_tensorrt_rewriter_config(
+        rewriter_config_template,
+        max_batch_size=self._max_batch_size,
+        max_workspace_size_bytes=self._max_workspace_size_bytes,
+        precision_mode=self._precision_mode,
+        minimum_segment_size=self._minimum_segment_size,
+        is_dynamic_op=self._is_dynamic_op,
+        maximum_cached_engines=self._maximum_cached_engines,
+        cached_engine_batches=self._cached_engine_batches,
+        use_calibration=self._need_calibration)
+
+  def finalize_calibration(self):
+    assert self._need_calibration
+    assert self._converted
+    assert not self._calibration_data_collected
+
+    # Lazily load the op, since it's not available in cpu-only builds. Importing
+    # this at top will cause tests that imports TF-TRT fail when they're built
+    # and run without CUDA/GPU.
+    # pylint: disable=g-import-not-at-top,line-too-long
+    from tensorflow.compiler.tf2tensorrt.ops.gen_trt_ops import get_serialized_resource_op
+    # pylint: enable=g-import-not-at-top,line-too-long
+
+    # TODO(laigd): a better way would be to use self._calibration_sess to list
+    # all the devices, add one get_serialized_resource_op for each device, and
+    # fetch each such op for every resource until its found. This can work
+    # even when the device of the TRTEngineOp is empty or not fully specified.
+
+    # Maps device name to the corresponding get_serialized_resource_op.
+    device_to_get_resource_op_map = {}
+
+    with self._calibration_graph.as_default():
+      container_input = array_ops.placeholder(dtypes.string)
+      resource_name_input = array_ops.placeholder(dtypes.string)
+
+      for node in self._converted_graph_def.node:
+        if node.op == "TRTEngineOp":
+          # Adds the get_serialized_resource_op for the device if not done
+          # before. We only add one such op for each device.
+          # TODO(laigd): What if the device is empty?????
+          if node.device not in device_to_get_resource_op_map:
+            with self._calibration_graph.device(node.device):
+              serialized_resources_output = (
+                  get_serialized_resource_op(container_input,
+                                             resource_name_input))
+            device_to_get_resource_op_map[node.device] = (
+                serialized_resources_output)
+
+          # Get the calibration resource.
+          calibration_result = self._calibration_sess.run(
+              device_to_get_resource_op_map[node.device],
+              feed_dict={
+                  container_input:
+                      TrtGraphConverter
+                      ._TRT_CALIBRATION_RESOURCE_CONTAINER_NAME,
+                  resource_name_input:
+                      node.name
+              })
+          node.attr["calibration_data"].s = calibration_result
+
+    self._calibration_data_collected = True
+    self._calibration_sess.close()
+
+  def save(self, output_saved_model_dir):
+    """Save the converted graph as a SavedModel."""
+    if self._need_calibration:
+      assert self._calibration_data_collected
+    super(TrtGraphConverter, self).save(output_saved_model_dir)
+
+
+def create_inference_graph(
+    input_graph_def,
+    outputs,
+    max_batch_size=1,
+    max_workspace_size_bytes=DEFAULT_TRT_MAX_WORKSPACE_SIZE_BYTES,
+    precision_mode=TrtPrecisionMode.FP32,
+    minimum_segment_size=3,
+    is_dynamic_op=False,
+    maximum_cached_engines=1,
+    cached_engine_batches=None,
+    use_calibration=True,
+    input_saved_model_dir=None,
+    input_saved_model_tags=None,
+    output_saved_model_dir=None,
+    session_config=None):
+  """Python wrapper for the TRT transformation.
+
+  Args:
+    input_graph_def: a GraphDef object containing a model to be transformed. If
+      set to None, the graph will be read from the SavedModel loaded from
+      input_saved_model_dir.
+    outputs: list of tensors or node names for the model outputs. Only used when
+      input_graph_def is not None.
+    max_batch_size: max size for the input batch.
+    max_workspace_size_bytes: the maximum GPU temporary memory which the TRT
+      engine can use at execution time. This corresponds to the 'workspaceSize'
+      parameter of nvinfer1::IBuilder::setMaxWorkspaceSize().
+    precision_mode: one of TrtPrecisionMode.supported_precision_modes().
+    minimum_segment_size: the minimum number of nodes required for a subgraph to
+      be replaced by TRTEngineOp.
+    is_dynamic_op: whether to generate dynamic TRT ops which will build the TRT
+      network and engine at run time.
+    maximum_cached_engines: max number of cached TRT engines in dynamic TRT ops.
+      If the number of cached engines is already at max but none of them can
+      serve the input, the TRTEngineOp will fall back to run the TF function
+      based on which the TRTEngineOp is created.
+    cached_engine_batches: a list of batch sizes used to create cached engines,
+      only used when is_dynamic_op is True. The length of the list should be <=
+      maximum_cached_engines, and the dynamic TRT op will use this list to
+      determine the batch sizes of the cached engines, instead of making the
+      decision on the fly. This is useful when we know the most common batch
+      size(s) the application is going to generate.
+    use_calibration: this argument is ignored if precision_mode is not INT8. If
+      set to True, a calibration graph will be created to calibrate the missing
+      ranges. The calibration graph must be converted to an inference graph
+      using calib_graph_to_infer_graph() after running calibration. if set to
+      False, quantization nodes will be expected for every tensor in the graph
+      (exlcuding those which will be fused). If a range is missing, an error
+      will occur. Please note that accuracy may be negatively affected if there
+      is a mismatch between which tensors TRT quantizes and which tensors were
+      trained with fake quantization.
+    input_saved_model_dir: the directory to load the SavedModel which contains
+      the input graph to transforms. Used only when input_graph_def is None.
+    input_saved_model_tags: list of tags to load the SavedModel.
+    output_saved_model_dir: if not None, construct a SavedModel using the
+      returned GraphDef and save it to the specified directory. This option only
+      works when the input graph is loaded from a SavedModel, i.e. when
+      input_saved_model_dir is specified and input_graph_def is None.
+    session_config: the ConfigProto used to create a Session. It's also used as
+      a template to create a TRT-enabled ConfigProto for conversion. If not
+      specified, a default ConfigProto will be used.
+
+  Returns:
+    A GraphDef transformed from input_graph_def (or the SavedModel graph def
+    loaded from input_saved_model_dir, if input_graph_def is not present), where
+    all TRT compatible subgraphs are replaced with TRTEngineOps, and a TF
+    function is added for each of the subgraphs.
+
+    If is_dynamic_op is True, each TRTEngineOp will contain a serialized
+    subgraph GraphDef, which will be converted to a TRT engine at execution time
+    and the TRT engine will be cached for future usage. A new TRT engine will be
+    created each time when none of the cached engines match the input shapes. If
+    it fails to execute the TRT engine or the number of cached engines reaches
+    maximum_cached_engines, the op will fall back to call the corresponding TF
+    function.
+
+    If is_dynamic_op is False, each TRTEngineOp will contain a serialized TRT
+    engine created from the corresponding subgraph. No more engines will be
+    created on the fly, and the op will fall back to call the corresponding TF
+    function when it fails to execute the engine.
+
+  Raises:
+    ValueError: if the combination of the parameters is invalid.
+    RuntimeError: if the TensorRT library version is incompatible.
+  """
+  trt_converter = TrtGraphConverter(
+      input_saved_model_dir=input_saved_model_dir,
+      input_saved_model_tags=input_saved_model_tags,
+      input_graph_def=input_graph_def,
+      nodes_blacklist=outputs,
+      session_config=session_config,
+      max_batch_size=max_batch_size,
+      max_workspace_size_bytes=max_workspace_size_bytes,
+      precision_mode=precision_mode,
+      minimum_segment_size=minimum_segment_size,
+      is_dynamic_op=is_dynamic_op,
+      maximum_cached_engines=maximum_cached_engines,
+      cached_engine_batches=cached_engine_batches,
+      use_calibration=use_calibration)
+  converted_graph_def = trt_converter.convert()
+  if output_saved_model_dir:
+    trt_converter.save(output_saved_model_dir)
+  return converted_graph_def
diff --git a/tensorflow/contrib/tensorrt/python/trt_convert_test.py b/tensorflow/python/compiler/tensorrt/trt_convert_test.py
similarity index 61%
rename from tensorflow/contrib/tensorrt/python/trt_convert_test.py
rename to tensorflow/python/compiler/tensorrt/trt_convert_test.py
index a7b2d2ea50543ba85c5a13dd6ca320e794ca47f1..97dea1bfc8a69d8819de38f3c5d954f57a52b8de 100644
--- a/tensorflow/contrib/tensorrt/python/trt_convert_test.py
+++ b/tensorflow/python/compiler/tensorrt/trt_convert_test.py
@@ -20,13 +20,14 @@ from __future__ import print_function
 
 import os
 
-from tensorflow.contrib.tensorrt.python import trt_convert
-# pylint: disable=unused-import
-from tensorflow.contrib.tensorrt.python.ops import trt_engine_op
-# pylint: enable=unused-import
+from tensorflow.python.compiler.tensorrt.wrap_conversion import clear_test_values
+from tensorflow.python.compiler.tensorrt.wrap_conversion import enable_test_value
+from tensorflow.python.compiler.tensorrt.wrap_conversion import get_test_value
+from tensorflow.python.compiler.tensorrt.wrap_conversion import is_tensorrt_enabled
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.python.compiler.tensorrt import trt_convert
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import graph_util
 from tensorflow.python.framework import importer
@@ -47,17 +48,23 @@ from tensorflow.python.tools import saved_model_utils
 class TrtConvertTest(test_util.TensorFlowTestCase):
   """Class to test Tensorflow-TensorRT integration python API."""
 
+  # Use a small max_workspace_size for tests so they don't consume too much GPU
+  # memory.
+  _TRT_MAX_WORKSPACE_SIZE_BYTES = 2 << 20
+
   def testGetTensorrtRewriterConfig(self):
-    """Test case for trt_convert.get_tensorrt_rewriter_config()."""
-    rewriter_cfg = trt_convert.get_tensorrt_rewriter_config(
-        rewriter_config=None,
+    """Test case for TrtGraphConverter.get_tensorrt_rewriter_config()."""
+    if not is_tensorrt_enabled():
+      return
+    rewriter_cfg = trt_convert.TrtGraphConverter.get_tensorrt_rewriter_config(
+        rewriter_config_template=None,
         max_batch_size=128,
         max_workspace_size_bytes=1234,
         precision_mode="INT8",
         minimum_segment_size=10,
         is_dynamic_op=True,
         maximum_cached_engines=2,
-        cached_engine_batch_sizes=[1, 128])
+        cached_engine_batches=[1, 128])
     self.assertEqual(["constfold", "layout", "constfold"],
                      rewriter_cfg.optimizers)
     self.assertEqual(rewriter_config_pb2.RewriterConfig.ONE,
@@ -84,8 +91,7 @@ class TrtConvertTest(test_util.TensorFlowTestCase):
         trt_optimizer.parameter_map["precision_mode"].s)
     self.assertEqual(2, trt_optimizer.parameter_map["maximum_cached_engines"].i)
     self.assertEqual(
-        [1, 128],
-        trt_optimizer.parameter_map["cached_engine_batches"].list.i)
+        [1, 128], trt_optimizer.parameter_map["cached_engine_batches"].list.i)
 
   def _GetConfigProto(self):
     """Get ConfigProto for session creation."""
@@ -95,6 +101,17 @@ class TrtConvertTest(test_util.TensorFlowTestCase):
 
   def _GetGraph(self):
     """Get the graph for testing."""
+    # The graph computes (input+1)^2, it looks like:
+    #
+    # input (Placeholder)  v1 (Variable)
+    #               |   \ /
+    #                \   +
+    #                 \ / \
+    #                  *   |
+    #                   \ /
+    #                    +
+    #                    |
+    #                 output (Identity)
     g = ops.Graph()
     with g.as_default():
       with g.device("/GPU:0"):
@@ -141,18 +158,61 @@ class TrtConvertTest(test_util.TensorFlowTestCase):
           signature_def_map={"mypredict": signature_def})
     saved_model_builder.save()
 
-  def _TestCreateInferenceGraph(self,
-                                input_saved_model_dir=None,
-                                output_saved_model_dir=None):
-    """General method to test trt_convert.create_inference_graph()."""
-    input_graph_def = None if input_saved_model_dir else self._GetGraphDef()
-    output_graph_def = trt_convert.create_inference_graph(
-        input_graph_def, ["output"],
+  def _ConvertGraph(self,
+                    input_saved_model_dir=None,
+                    output_saved_model_dir=None,
+                    need_calibration=False,
+                    max_batch_size=1,
+                    minimum_segment_size=3,
+                    is_dynamic_op=False,
+                    maximum_cached_engines=1):
+    """Helper method to convert a GraphDef or SavedModel using TF-TRT."""
+    converter = trt_convert.TrtGraphConverter(
+        input_saved_model_dir=input_saved_model_dir,
+        input_graph_def=None if input_saved_model_dir else self._GetGraphDef(),
+        nodes_blacklist=["output"],
+        session_config=self._GetConfigProto(),
+        max_batch_size=max_batch_size,
+        max_workspace_size_bytes=TrtConvertTest._TRT_MAX_WORKSPACE_SIZE_BYTES,
+        precision_mode=(trt_convert.TrtPrecisionMode.INT8 if need_calibration
+                        else trt_convert.TrtPrecisionMode.FP32),
+        minimum_segment_size=minimum_segment_size,
+        is_dynamic_op=is_dynamic_op,
+        maximum_cached_engines=maximum_cached_engines)
+    output_graph_def = converter.convert()
+
+    if need_calibration:
+
+      class CalibrationData(object):
+
+        def __init__(self):
+          self._data = 0
+
+        def next(self):
+          self._data += 1
+          return {"input:0": [[[self._data]]]}
+
+      output_graph_def = converter.calibrate(
+          fetch_names=["output:0"],
+          num_runs=10,
+          feed_dict_fn=CalibrationData().next)
+
+    if output_saved_model_dir is not None:
+      converter.save(output_saved_model_dir=output_saved_model_dir)
+    return output_graph_def
+
+  def _TestTrtGraphConverter(self,
+                             input_saved_model_dir=None,
+                             output_saved_model_dir=None,
+                             need_calibration=False):
+    """General method to test trt_convert.TrtGraphConverter()."""
+    output_graph_def = self._ConvertGraph(
         input_saved_model_dir=input_saved_model_dir,
         output_saved_model_dir=output_saved_model_dir,
-        session_config=self._GetConfigProto())
+        need_calibration=need_calibration)
     graph_defs_to_verify = [output_graph_def]
-    if output_saved_model_dir is not None:
+
+    if output_saved_model_dir:
       saved_model_graph_def = saved_model_utils.get_meta_graph_def(
           output_saved_model_dir, tag_constants.SERVING).graph_def
       self.assertTrue(isinstance(saved_model_graph_def, graph_pb2.GraphDef))
@@ -166,42 +226,60 @@ class TrtConvertTest(test_util.TensorFlowTestCase):
           "output": "Identity"
       }, node_name_to_op)
 
-  def testCreateInferenceGraph_BasicConversion(self):
-    """Test case for trt_convert.create_inference_graph()."""
-    if not trt_convert.is_tensorrt_enabled():
-      return
+      if need_calibration:
+        trt_engine_nodes = [
+            node for node in graph_def.node if node.op == "TRTEngineOp"
+        ]
+        self.assertNotEmpty(trt_engine_nodes)
+        for node in trt_engine_nodes:
+          self.assertTrue(len(node.attr["calibration_data"].s))
+        # Run the calibrated graph.
+        # TODO(laigd): consider having some input where the answer is different.
+        with ops.Graph().as_default():
+          importer.import_graph_def(graph_def, name="")
+          with self.session(config=self._GetConfigProto()) as sess:
+            for test_data in range(10):
+              self.assertEqual((test_data + 1.0)**2,
+                               sess.run(
+                                   "output:0",
+                                   feed_dict={"input:0": [[[test_data]]]}))
 
-    # Use GraphDef as input.
-    self._TestCreateInferenceGraph()
+  def testTrtGraphConverter_BasicConversion(self):
+    """Test case for trt_convert.TrtGraphConverter()."""
+    if not is_tensorrt_enabled():
+      return
 
-    # Use SavedModel as input.
     tmp_dir = self.get_temp_dir()
     input_saved_model_dir = os.path.join(tmp_dir, "in_dir1")
-    output_saved_model_dir = os.path.join(tmp_dir, "out_dir1")
     self._WriteInputSavedModel(input_saved_model_dir)
-    self._TestCreateInferenceGraph(input_saved_model_dir,
-                                   output_saved_model_dir)
+
+    for need_calibration in [False, True]:
+      # Use GraphDef as input.
+      self._TestTrtGraphConverter()
+
+      # Use SavedModel as input.
+      output_saved_model_dir = os.path.join(
+          tmp_dir, "out_dir1%s" % ("_int8" if need_calibration else ""))
+      self._TestTrtGraphConverter(
+          input_saved_model_dir=input_saved_model_dir,
+          output_saved_model_dir=output_saved_model_dir,
+          need_calibration=need_calibration)
 
   def _TestRun(self, sess, batch_size, expect_engine_is_run):
-    trt_convert.clear_test_values("")
+    clear_test_values("")
     result = sess.run("output:0", feed_dict={"input:0": [[[1.0]]] * batch_size})
     self.assertAllEqual([[[4.0]]] * batch_size, result)
     execute_engine_test_value = ("done" if expect_engine_is_run else "")
     execute_native_segment_test_value = ("" if expect_engine_is_run else "done")
-    self.assertEqual(
-        execute_engine_test_value,
-        trt_convert.get_test_value("TRTEngineOp_0:ExecuteTrtEngine"))
-    self.assertEqual(
-        execute_native_segment_test_value,
-        trt_convert.get_test_value("TRTEngineOp_0:ExecuteNativeSegment"))
+    self.assertEqual(execute_engine_test_value,
+                     get_test_value("TRTEngineOp_0:ExecuteTrtEngine"))
+    self.assertEqual(execute_native_segment_test_value,
+                     get_test_value("TRTEngineOp_0:ExecuteNativeSegment"))
 
-  def testCreateInferenceGraph_MinimumSegmentSize(self):
-    if not trt_convert.is_tensorrt_enabled():
+  def testTrtGraphConverter_MinimumSegmentSize(self):
+    if not is_tensorrt_enabled():
       return
-    output_graph_def = trt_convert.create_inference_graph(
-        self._GetGraphDef(), ["output"],
-        minimum_segment_size=5,
-        is_dynamic_op=False)
+    output_graph_def = self._ConvertGraph(minimum_segment_size=5)
     node_name_to_op = {node.name: node.op for node in output_graph_def.node}
     self.assertEqual({
         "v1/read": "Const",
@@ -212,71 +290,63 @@ class TrtConvertTest(test_util.TensorFlowTestCase):
         "output": "Identity"
     }, node_name_to_op)
 
-  def testCreateInferenceGraph_DynamicOp(self):
-    if not trt_convert.is_tensorrt_enabled():
+  def testTrtGraphConverter_DynamicOp(self):
+    if not is_tensorrt_enabled():
       return
-    trt_convert.enable_test_value()
+    enable_test_value()
 
     tmp_dir = self.get_temp_dir()
     input_saved_model_dir = os.path.join(tmp_dir, "in_dir2")
     output_saved_model_dir = os.path.join(tmp_dir, "out_dir2")
     self._WriteInputSavedModel(input_saved_model_dir)
-    output_graph_def = trt_convert.create_inference_graph(
-        None,
-        None,
-        is_dynamic_op=True,
-        maximum_cached_engines=2,
+    output_graph_def = self._ConvertGraph(
         input_saved_model_dir=input_saved_model_dir,
         output_saved_model_dir=output_saved_model_dir,
-        session_config=self._GetConfigProto())
+        is_dynamic_op=True,
+        maximum_cached_engines=2)
 
     # Test the output GraphDef.
     with ops.Graph().as_default():
       importer.import_graph_def(output_graph_def, name="")
-      with self.test_session(config=self._GetConfigProto()) as sess:
+      with self.session(config=self._GetConfigProto()) as sess:
         # Run with batch size 1, a new engine is created and cached.
         self._TestRun(sess, 1, True)
         # Run with batch size 2, a new engine is created and cached.
         self._TestRun(sess, 2, True)
         # Run with batch size 3, since the number of cached engines has reached
-        # the max, it should fall back to TF function.
-        self._TestRun(sess, 3, False)
+        # the max, it should evict an old engine and create a new one.
+        self._TestRun(sess, 3, True)
 
     # Test the output SavedModel
     with ops.Graph().as_default():
-      with self.test_session(config=self._GetConfigProto()) as sess:
+      with self.session(config=self._GetConfigProto()) as sess:
         loader.load(sess, [tag_constants.SERVING], output_saved_model_dir)
         # Run with batch size 1, a new engine is created and cached.
         self._TestRun(sess, 1, True)
         # Run with batch size 2, a new engine is created and cached.
         self._TestRun(sess, 2, True)
         # Run with batch size 3, since the number of cached engines has reached
-        # the max, it should fall back to TF function.
-        self._TestRun(sess, 3, False)
+        # the max, it should evict an old engine and create a new one.
+        self._TestRun(sess, 3, True)
 
-  def testCreateInferenceGraph_StaticOp(self):
-    if not trt_convert.is_tensorrt_enabled():
+  def testTrtGraphConverter_StaticOp(self):
+    if not is_tensorrt_enabled():
       return
-    trt_convert.enable_test_value()
+    enable_test_value()
 
     tmp_dir = self.get_temp_dir()
     input_saved_model_dir = os.path.join(tmp_dir, "in_dir3")
     output_saved_model_dir = os.path.join(tmp_dir, "out_dir3")
     self._WriteInputSavedModel(input_saved_model_dir)
-    output_graph_def = trt_convert.create_inference_graph(
-        None,
-        None,
-        max_batch_size=1,
-        is_dynamic_op=False,
-        maximum_cached_engines=2,  # This is noop, added just for testing.
+    output_graph_def = self._ConvertGraph(
         input_saved_model_dir=input_saved_model_dir,
         output_saved_model_dir=output_saved_model_dir,
-        session_config=self._GetConfigProto())
+        maximum_cached_engines=2)  # This is noop, added just for testing.
 
     # Test the output GraphDef.
     with ops.Graph().as_default():
       importer.import_graph_def(output_graph_def, name="")
-      with self.test_session(config=self._GetConfigProto()) as sess:
+      with self.session(config=self._GetConfigProto()) as sess:
         # Run with batch size 1, the default engine embedded in the graphdef
         # will be used.
         self._TestRun(sess, 1, True)
@@ -286,7 +356,7 @@ class TrtConvertTest(test_util.TensorFlowTestCase):
 
     # Test the output SavedModel
     with ops.Graph().as_default():
-      with self.test_session(config=self._GetConfigProto()) as sess:
+      with self.session(config=self._GetConfigProto()) as sess:
         loader.load(sess, [tag_constants.SERVING], output_saved_model_dir)
         # Run with batch size 1, the default engine embedded in the graphdef
         # will be used.
diff --git a/tensorflow/python/data/benchmarks/BUILD b/tensorflow/python/data/benchmarks/BUILD
index 5b0500eae1970b4f183737d4fc0cd4171dd1ea15..031476100f448528503b5bc9b7c6c360caf9f8b1 100644
--- a/tensorflow/python/data/benchmarks/BUILD
+++ b/tensorflow/python/data/benchmarks/BUILD
@@ -6,15 +6,34 @@ exports_files(["LICENSE"])
 
 load("//tensorflow:tensorflow.bzl", "py_test")
 
+py_test(
+    name = "meta_benchmark",
+    srcs = ["meta_benchmark.py"],
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:session",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_library(
+    name = "benchmark_base",
+    srcs = ["benchmark_base.py"],
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:session",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "batch_benchmark",
     srcs = ["batch_benchmark.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:session",
+        ":benchmark_base",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
@@ -26,12 +45,8 @@ py_test(
     srcs = ["filter_benchmark.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:session",
+        ":benchmark_base",
         "//tensorflow/python/data/ops:dataset_ops",
-        "//third_party/py/numpy",
     ],
 )
 
@@ -40,8 +55,21 @@ py_test(
     srcs = ["from_tensor_slices_benchmark.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":benchmark_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "list_files_benchmark",
+    srcs = ["list_files_benchmark.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":benchmark_base",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
         "//tensorflow/python:session",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
@@ -53,11 +81,8 @@ py_test(
     srcs = ["map_benchmark.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:session",
+        ":benchmark_base",
         "//tensorflow/python/data/ops:dataset_ops",
-        "//third_party/py/numpy",
     ],
 )
 
@@ -66,8 +91,7 @@ py_test(
     srcs = ["range_benchmark.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:session",
+        ":benchmark_base",
         "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
diff --git a/tensorflow/python/data/benchmarks/batch_benchmark.py b/tensorflow/python/data/benchmarks/batch_benchmark.py
index e063849f70381b8244a8a916353a3cc3be15c230..8cad91212a7c3699be8fcc0140505a9c8824723e 100644
--- a/tensorflow/python/data/benchmarks/batch_benchmark.py
+++ b/tensorflow/python/data/benchmarks/batch_benchmark.py
@@ -17,69 +17,37 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import time
-
 import numpy as np
 
-from tensorflow.python.client import session
+from tensorflow.python.data.benchmarks import benchmark_base
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import array_ops
-from tensorflow.python.platform import test
 
 
-# TODO(b/119837791): Add eager benchmarks.
-class BatchBenchmark(test.Benchmark):
+class BatchBenchmark(benchmark_base.DatasetBenchmarkBase):
   """Benchmarks for `tf.data.Dataset.batch()`."""
 
-  def benchmarkBatchSparse(self):
+  def benchmark_batch_sparse(self):
     non_zeros_per_row_values = [0, 1, 5, 10, 100]
     batch_size_values = [1, 32, 64, 128, 1024]
 
-    sparse_placeholder = array_ops.sparse_placeholder(dtype=dtypes.int64)
-    batch_size_placeholder = array_ops.placeholder(dtype=dtypes.int64, shape=[])
-
-    dataset = dataset_ops.Dataset.from_tensors(sparse_placeholder).repeat(
-        ).batch(batch_size_placeholder)
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    next_element = iterator.get_next()
-
     for non_zeros_per_row in non_zeros_per_row_values:
 
-      sparse_value = sparse_tensor.SparseTensorValue(
+      tensor = sparse_tensor.SparseTensor(
           indices=np.arange(non_zeros_per_row, dtype=np.int64)[:, np.newaxis],
           values=np.arange(non_zeros_per_row, dtype=np.int64),
           dense_shape=[1000])
 
       for batch_size in batch_size_values:
-
-        with session.Session() as sess:
-          sess.run(iterator.initializer, feed_dict={
-              sparse_placeholder: sparse_value,
-              batch_size_placeholder: batch_size})
-          # Run five steps to warm up the session caches before taking the
-          # first measurement.
-          for _ in range(5):
-            sess.run(next_element.indices.op)
-          deltas = []
-          for _ in range(100):
-            start = time.time()
-            for _ in range(100):
-              sess.run(next_element.indices.op)
-            end = time.time()
-            deltas.append(end - start)
-
-        median_wall_time = np.median(deltas) / 100.0
-
-        print("Batch sparse dataset non-zeros per row: %d batch_size: %d "
-              "wall time: %f"
-              % (non_zeros_per_row, batch_size, median_wall_time))
-        self.report_benchmark(
-            iters=10000, wall_time=median_wall_time,
-            name="batch_sparse_dataset_nnz_%d_batch_size_%d" % (
-                non_zeros_per_row, batch_size))
+        dataset = dataset_ops.Dataset.from_tensors(tensor).repeat().batch(
+            batch_size)
+        self.run_and_report_benchmark(
+            dataset,
+            num_elements=100000 // batch_size,
+            iters=1,
+            name="sparse_num_elements_%d_batch_size_%d" % (non_zeros_per_row,
+                                                           batch_size))
 
 
 if __name__ == "__main__":
-  test.main()
+  benchmark_base.test.main()
diff --git a/tensorflow/python/data/benchmarks/benchmark_base.py b/tensorflow/python/data/benchmarks/benchmark_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..47f992d955ba49cf0de1bdd7cf1ee760b6af8ef6
--- /dev/null
+++ b/tensorflow/python/data/benchmarks/benchmark_base.py
@@ -0,0 +1,92 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test utilities for tf.data benchmarking functionality."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import nest
+from tensorflow.python.platform import test
+
+
+# TODO(b/119837791): Add eager benchmarks.
+class DatasetBenchmarkBase(test.Benchmark):
+  """Base class for dataset benchmarks."""
+
+  def run_benchmark(self, dataset, num_elements, iters=1):
+    """Benchmarks the dataset.
+
+    Runs the dataset `iters` times. In each iteration, the benchmark measures
+    the time it takes to go through `num_elements` elements of the dataset.
+
+    Args:
+      dataset: Dataset to benchmark.
+      num_elements: Number of dataset elements to iterate through each benchmark
+        iteration.
+      iters: Number of times to repeat the timing.
+
+    Returns:
+      A float, representing the per-element wall time of the dataset in seconds.
+      This is the median time (with respect to `iters`) it takes for the dataset
+      to go through `num_elements` elements, divided by `num_elements.`
+    """
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    dataset = dataset.with_options(options)
+    # NOTE: We use `dataset.skip()` to perform the iterations in C++, avoiding
+    # the overhead of multiple `session.run()` calls. Note that this relies on
+    # the underlying implementation of `skip`: if it is optimized in the future,
+    # we will have to change this code.
+    dataset = dataset.skip(num_elements - 1)
+    iterator = dataset_ops.make_initializable_iterator(dataset)
+    next_element = iterator.get_next()
+    next_element = nest.flatten(next_element)[0]
+
+    deltas = []
+    for _ in range(iters):
+      with session.Session() as sess:
+        # Run once to warm up the session caches.
+        sess.run(iterator.initializer)
+        sess.run(next_element)
+
+        sess.run(iterator.initializer)
+        start = time.time()
+        sess.run(next_element.op)
+        end = time.time()
+      deltas.append(end - start)
+    return np.median(deltas) / float(num_elements)
+
+  def run_and_report_benchmark(self,
+                               dataset,
+                               num_elements,
+                               name,
+                               iters=5,
+                               extras=None):
+    # Measure the per-element wall time.
+    wall_time = self.run_benchmark(dataset, num_elements, iters)
+
+    if extras is None:
+      extras = {}
+    extras["elements_per_second"] = 1 / wall_time
+    extras["num_elements"] = num_elements
+    # 'mode' represents the mechanism used for iterating over dataset elements.
+    name = "%s_mode_cpp" % name
+    self.report_benchmark(
+        wall_time=wall_time, iters=iters, name=name, extras=extras)
diff --git a/tensorflow/python/data/benchmarks/filter_benchmark.py b/tensorflow/python/data/benchmarks/filter_benchmark.py
index a6d86fe2218aec835e4f09f0c8c708596cf511f8..eb47b4089c7f57f9426fd5dcc15b2296fdb0bd25 100644
--- a/tensorflow/python/data/benchmarks/filter_benchmark.py
+++ b/tensorflow/python/data/benchmarks/filter_benchmark.py
@@ -17,53 +17,26 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import time
-
-import numpy as np
-
-from tensorflow.python.client import session
+from tensorflow.python.data.benchmarks import benchmark_base
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
-from tensorflow.python.platform import test
 
 
 # TODO(b/119837791): Add eager benchmarks.
-class FilterBenchmark(test.Benchmark):
+class FilterBenchmark(benchmark_base.DatasetBenchmarkBase):
   """Benchmarks for `tf.data.Dataset.filter()`."""
 
   def _benchmark(self, predicate, name):
-    with ops.Graph().as_default():
-      dataset = (
-          dataset_ops.Dataset.from_tensors(True).repeat(None).filter(predicate))
-      iterator = dataset_ops.make_one_shot_iterator(dataset)
-      next_element = iterator.get_next()
-
-      with session.Session() as sess:
-        for _ in range(5):
-          sess.run(next_element.op)
-        deltas = []
-        for _ in range(100):
-          start = time.time()
-          for _ in range(100):
-            sess.run(next_element.op)
-          end = time.time()
-          deltas.append(end - start)
-
-        median_wall_time = np.median(deltas) / 100
-        print("Filter dataset using %s. Median wall time: %f" %
-              (name, median_wall_time))
-        self.report_benchmark(
-            iters=100,
-            wall_time=median_wall_time,
-            name=name)
+    dataset = (
+        dataset_ops.Dataset.from_tensors(True).repeat(None).filter(predicate))
+    self.run_and_report_benchmark(dataset, num_elements=100000, name=name)
 
-  def benchmarkSimpleFunction(self):
+  def benchmark_simple_function(self):
     self._benchmark(array_ops.identity, "simple_function")
 
-  def benchmarkReturnComponentOptimization(self):
+  def benchmark_return_component_optimization(self):
     self._benchmark(lambda x: x, "return_component")
 
 
 if __name__ == "__main__":
-  test.main()
+  benchmark_base.test.main()
diff --git a/tensorflow/python/data/benchmarks/from_tensor_slices_benchmark.py b/tensorflow/python/data/benchmarks/from_tensor_slices_benchmark.py
index d7f1a4e7af5b00569e71900df8f2a7486d7c813b..3af174acc320186ae368f23145bb9700e4d3aaa1 100644
--- a/tensorflow/python/data/benchmarks/from_tensor_slices_benchmark.py
+++ b/tensorflow/python/data/benchmarks/from_tensor_slices_benchmark.py
@@ -17,172 +17,70 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import time
-
 import numpy as np
 
-from tensorflow.python.client import session
+from tensorflow.python.data.benchmarks import benchmark_base
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import errors
-from tensorflow.python.platform import test
 
 
 # TODO(b/119837791): Add eager benchmarks.
-class FromTensorSlicesBenchmark(test.Benchmark):
+class FromTensorSlicesBenchmark(benchmark_base.DatasetBenchmarkBase):
   """Benchmarks for `tf.data.Dataset.from_tensor_slices()`."""
 
-  def benchmarkSliceRepeatBatch(self):
+  def benchmark_slice_repeat_batch(self):
     input_size = 10000
     batch_size = 100
     num_epochs = 100
+    num_elements = input_size * num_epochs // batch_size
 
     input_data = np.random.randn(input_size)
 
     dataset = (
-        dataset_ops.Dataset.from_tensor_slices(input_data)
-        .repeat(num_epochs + 1).batch(batch_size))
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    next_element = iterator.get_next()
-
-    with session.Session() as sess:
-      sess.run(iterator.initializer)
-      # Run one whole epoch to burn in the computation.
-      for _ in range(input_size // batch_size):
-        sess.run(next_element)
-      deltas = []
-      try:
-        while True:
-          start = time.time()
-          sess.run(next_element)
-          deltas.append(time.time() - start)
-      except errors.OutOfRangeError:
-        pass
-
-    median_wall_time = np.median(deltas)
-    print("Slice/repeat/batch with sess.run() input size: %d batch size: %d "
-          "Median wall time per element: %f" % (input_size, batch_size,
-                                                median_wall_time))
-    self.report_benchmark(
-        iters=len(deltas),
-        wall_time=median_wall_time,
+        dataset_ops.Dataset.from_tensor_slices(input_data).repeat(
+            num_epochs).batch(batch_size))
+
+    self.run_and_report_benchmark(
+        dataset,
+        num_elements=num_elements,
         name="slice_repeat_batch_input_%d_batch_%d" % (input_size, batch_size))
 
-  def benchmarkSliceRepeatBatchCallable(self):
+  def benchmark_reshape_slice_repeat(self):
     input_size = 10000
-    batch_size = 100
+    reshape_dim = [100, 100]
     num_epochs = 100
 
+    num_elements = num_epochs * reshape_dim[0]
+
     input_data = np.random.randn(input_size)
 
     dataset = (
-        dataset_ops.Dataset.from_tensor_slices(input_data)
-        .repeat(num_epochs + 1).batch(batch_size))
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    next_element = iterator.get_next()
-
-    with session.Session() as sess:
-      sess.run(iterator.initializer)
-      get_next_element = sess.make_callable(next_element)
-      # Run one whole epoch to burn in the computation.
-      for _ in range(input_size // batch_size):
-        get_next_element()
-      deltas = []
-      try:
-        while True:
-          start = time.time()
-          get_next_element()
-          deltas.append(time.time() - start)
-      except errors.OutOfRangeError:
-        pass
-
-    median_wall_time = np.median(deltas)
-    print(
-        "Slice/repeat/batch with callable input size: %d batch size: %d Median"
-        " wall time per element: %f" % (input_size, batch_size,
-                                        median_wall_time))
-    self.report_benchmark(
-        iters=len(deltas),
-        wall_time=median_wall_time,
-        name="slice_repeat_batch_callable_input_%d_batch_%d" %
-        (input_size, batch_size))
-
-  def benchmarkReshapeSliceRepeatCallable(self):
-    input_size = 10000
-    batch_size = 100
-    num_epochs = 100
+        dataset_ops.Dataset.from_tensor_slices(
+            input_data.reshape(*reshape_dim)).repeat(num_epochs))
 
-    input_data = np.random.randn(input_size)
+    self.run_and_report_benchmark(
+        dataset,
+        num_elements=num_elements,
+        name="reshape_slice_repeat_input_%d" % input_size,
+    )
 
-    dataset = (
-        dataset_ops.Dataset.from_tensor_slices(input_data.reshape(100, 100))
-        .repeat(num_epochs + 1))
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    next_element = iterator.get_next()
-
-    with session.Session() as sess:
-      sess.run(iterator.initializer)
-      get_next_element = sess.make_callable(next_element)
-      # Run one whole epoch to burn in the computation.
-      for _ in range(input_size // batch_size):
-        get_next_element()
-      deltas = []
-      try:
-        while True:
-          start = time.time()
-          get_next_element()
-          deltas.append(time.time() - start)
-      except errors.OutOfRangeError:
-        pass
-
-    median_wall_time = np.median(deltas)
-    print("Reshape/slice/repeat with callable input size: %d batch size: %d "
-          "Median wall time per element: %f" % (input_size, batch_size,
-                                                median_wall_time))
-    self.report_benchmark(
-        iters=len(deltas),
-        wall_time=median_wall_time,
-        name="reshape_slice_repeat_callable_input_%d_batch_%d" %
-        (input_size, batch_size))
-
-  def benchmarkSliceBatchCacheRepeatCallable(self):
+  def benchmark_slice_batch_cache_repeat(self):
     input_size = 10000
     batch_size = 100
     num_epochs = 100
+    num_elements = input_size * num_epochs // batch_size
 
     input_data = np.random.randn(input_size)
 
     dataset = (
-        dataset_ops.Dataset.from_tensor_slices(input_data).batch(batch_size)
-        .cache().repeat(num_epochs + 1))
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    next_element = iterator.get_next()
-
-    with session.Session() as sess:
-      sess.run(iterator.initializer)
-      get_next_element = sess.make_callable(next_element)
-      # Run one whole epoch to burn in the computation.
-      for _ in range(input_size // batch_size):
-        get_next_element()
-      deltas = []
-      try:
-        while True:
-          start = time.time()
-          get_next_element()
-          deltas.append(time.time() - start)
-      except errors.OutOfRangeError:
-        pass
-
-    median_wall_time = np.median(deltas)
-    print(
-        "Slice/batch/cache/repeat with callable input size: %d batch size: %d "
-        "Median wall time per element: %f"
-        % (input_size, batch_size, median_wall_time))
-    self.report_benchmark(
-        iters=len(deltas),
-        wall_time=median_wall_time,
-        name="slice_batch_cache_repeat_callable_input_%d_batch_%d" %
-        (input_size, batch_size))
+        dataset_ops.Dataset.from_tensor_slices(input_data).batch(
+            batch_size).cache().repeat(num_epochs))
+
+    self.run_and_report_benchmark(
+        dataset,
+        num_elements=num_elements,
+        name="slice_batch_cache_repeat_input_%d_batch_%d" % (input_size,
+                                                             batch_size))
 
 
 if __name__ == "__main__":
-  test.main()
+  benchmark_base.test.main()
diff --git a/tensorflow/python/data/benchmarks/list_files_benchmark.py b/tensorflow/python/data/benchmarks/list_files_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..70f8eeec9e8ec66edb7da5c2c82d97c8fa8336bd
--- /dev/null
+++ b/tensorflow/python/data/benchmarks/list_files_benchmark.py
@@ -0,0 +1,94 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Benchmarks for `tf.data.Dataset.list_files()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from os import path
+from os import makedirs
+import shutil
+import time
+import tempfile
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import test
+
+
+class ListFilesBenchmark(test.Benchmark):
+  """Benchmarks for `tf.data.Dataset.list_files()`."""
+
+  def benchmarkNestedDirectories(self):
+    tmp_dir = tempfile.mkdtemp()
+    width = 1024
+    depth = 16
+    for i in range(width):
+      for j in range(depth):
+        new_base = path.join(tmp_dir, str(i),
+                             *[str(dir_name) for dir_name in range(j)])
+        makedirs(new_base)
+        child_files = ['a.py', 'b.pyc'] if j < depth - 1 else ['c.txt', 'd.log']
+        for f in child_files:
+          filename = path.join(new_base, f)
+          open(filename, 'w').close()
+    patterns = [
+        path.join(tmp_dir, path.join(*['**'
+                                       for _ in range(depth)]), suffix)
+        for suffix in ['*.txt', '*.log']
+    ]
+    deltas = []
+    iters = 3
+    for _ in range(iters):
+      with ops.Graph().as_default():
+        dataset = dataset_ops.Dataset.list_files(patterns)
+        options = dataset_ops.Options()
+        options.experimental_optimization.apply_default_optimizations = False
+        dataset = dataset.with_options(options)
+        next_element = dataset.make_one_shot_iterator().get_next()
+        with session.Session() as sess:
+          sub_deltas = []
+          while True:
+            try:
+              start = time.time()
+              sess.run(next_element)
+              end = time.time()
+              sub_deltas.append(end - start)
+            except errors.OutOfRangeError:
+              break
+          deltas.append(sub_deltas)
+    median_deltas = np.median(deltas, axis=0)
+    self.report_benchmark(
+        iters=iters,
+        wall_time=np.sum(median_deltas),
+        extras={
+            'read first file:':
+                median_deltas[0],
+            'read second file:':
+                median_deltas[1],
+            'avg time for reading %d more filenames:' %
+            (len(median_deltas) - 2):
+                np.average(median_deltas[2:])
+        },
+        name='nested_directory(%d*%d)' % (width, depth))
+    shutil.rmtree(tmp_dir, ignore_errors=True)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/data/benchmarks/map_benchmark.py b/tensorflow/python/data/benchmarks/map_benchmark.py
index 65d945cdae87aedad55351cfb63ad06e3521d570..75b71fffac4d493cb0d2e4d579597de302ad89c2 100644
--- a/tensorflow/python/data/benchmarks/map_benchmark.py
+++ b/tensorflow/python/data/benchmarks/map_benchmark.py
@@ -17,119 +17,51 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import time
-
-import numpy as np
-
-from tensorflow.python.client import session
+from tensorflow.python.data.benchmarks import benchmark_base
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import ops
-from tensorflow.python.platform import test
 
 
 # TODO(b/119837791): Add eager benchmarks.
-class MapBenchmark(test.Benchmark):
-  """Bechmarks for `tf.data.Dataset.map()`."""
-
-  def benchmarkChainOfMaps(self):
-    chain_lengths = [0, 1, 2, 5, 10, 20, 50]
-    for chain_length in chain_lengths:
-      for mode in ["general", "single-threaded", "short-circuit"]:
-        if mode == "general":
-          map_fn = lambda x: x + 1
-          use_inter_op_parallelism = True
-          print_label = ""
-          benchmark_label = ""
-        if mode == "single-threaded":
-          map_fn = lambda x: x + 1
-          use_inter_op_parallelism = False
-          print_label = " (single threaded mode)"
-          benchmark_label = "_single_threaded"
-        if mode == "short-circuit":
-          map_fn = lambda x: x
-          use_inter_op_parallelism = True  # should not have any significance
-          print_label = " (short circuit mode)"
-          benchmark_label = "_short_circuit"
+class MapBenchmark(benchmark_base.DatasetBenchmarkBase):
+  """Benchmarks for `tf.data.Dataset.map()`."""
 
-        with ops.Graph().as_default():
-          dataset = dataset_ops.Dataset.from_tensors(0).repeat(None)
-          for _ in range(chain_length):
-            dataset = dataset_ops.MapDataset(
-                dataset,
-                map_fn,
-                use_inter_op_parallelism=use_inter_op_parallelism)
-          iterator = dataset_ops.make_one_shot_iterator(dataset)
-          next_element = iterator.get_next()
+  def benchmark_chain_of_maps(self):
 
-          with session.Session() as sess:
-            for _ in range(5):
-              sess.run(next_element.op)
-            deltas = []
-            for _ in range(100):
-              start = time.time()
-              for _ in range(100):
-                sess.run(next_element.op)
-              end = time.time()
-              deltas.append(end - start)
+    def benchmark_helper(chain_length, map_fn, use_inter_op_parallelism, label):
+      dataset = dataset_ops.Dataset.from_tensors(0).repeat(None)
+      for _ in range(chain_length):
+        dataset = dataset_ops.MapDataset(
+            dataset, map_fn, use_inter_op_parallelism=use_inter_op_parallelism)
+      self.run_and_report_benchmark(
+          dataset,
+          num_elements=10000,
+          name="chain_length_%d%s" % (chain_length, label))
 
-            median_wall_time = np.median(deltas) / 100
-            print("Map dataset chain length%s: %d Median wall time: %f" %
-                  (print_label, chain_length, median_wall_time))
-            self.report_benchmark(
-                iters=1000,
-                wall_time=median_wall_time,
-                name="map_dataset_chain_length_%d%s" % (chain_length,
-                                                        benchmark_label))
+    chain_lengths = [0, 1, 2, 5, 10, 20, 50]
+    for chain_length in chain_lengths:
+      benchmark_helper(chain_length, lambda x: x + 1, True, "")
+      benchmark_helper(chain_length, lambda x: x + 1, False, "_single_threaded")
+      benchmark_helper(chain_length, lambda x: x, True, "_short_circuit")
 
-  def benchmarkMapFanOut(self):
+  def benchmark_map_fan_out(self):
     fan_outs = [1, 2, 5, 10, 20, 50, 100]
-    for fan_out in fan_outs:
-      for mode in ["general", "single-threaded", "short-circuit"]:
-        if mode == "general":
-          map_fn = lambda *xs: [x + 1 for x in xs]
-          use_inter_op_parallelism = True
-          print_label = ""
-          benchmark_label = ""
-        if mode == "single-threaded":
-          map_fn = lambda *xs: [x + 1 for x in xs]
-          use_inter_op_parallelism = False
-          print_label = " (single threaded mode)"
-          benchmark_label = "_single_threaded"
-        if mode == "short-circuit":
-          map_fn = lambda *xs: xs
-          use_inter_op_parallelism = True  # should not have any significance
-          print_label = " (short circuit mode)"
-          benchmark_label = "_short_circuit"
 
-        with ops.Graph().as_default():
-          dataset = dataset_ops.Dataset.from_tensors(
-              tuple(0 for _ in range(fan_out))).repeat(None)
-          dataset = dataset_ops.MapDataset(
-              dataset,
-              map_fn,
-              use_inter_op_parallelism=use_inter_op_parallelism)
-          iterator = dataset_ops.make_one_shot_iterator(dataset)
-          next_element = iterator.get_next()
+    def benchmark_helper(fan_out, map_fn, use_inter_op_parallelism, label):
+      dataset = dataset_ops.Dataset.from_tensors(
+          tuple(0 for _ in range(fan_out))).repeat(None)
+      dataset = dataset_ops.MapDataset(
+          dataset, map_fn, use_inter_op_parallelism=use_inter_op_parallelism)
+      self.run_and_report_benchmark(
+          dataset,
+          num_elements=10000,
+          name="fan_out_%d%s" % (fan_out, label))
 
-          with session.Session() as sess:
-            for _ in range(5):
-              sess.run(next_element[0].op)
-            deltas = []
-            for _ in range(100):
-              start = time.time()
-              for _ in range(100):
-                sess.run(next_element[0].op)
-              end = time.time()
-              deltas.append(end - start)
-
-            median_wall_time = np.median(deltas) / 100
-            print("Map dataset fan out%s: %d Median wall time: %f" %
-                  (print_label, fan_out, median_wall_time))
-            self.report_benchmark(
-                iters=1000,
-                wall_time=median_wall_time,
-                name="map_dataset_fan_out_%d%s" % (fan_out, benchmark_label))
+    for fan_out in fan_outs:
+      benchmark_helper(fan_out, lambda *xs: [x + 1 for x in xs], True, "")
+      benchmark_helper(fan_out, lambda *xs: [x + 1 for x in xs], False,
+                       "_single_threaded")
+      benchmark_helper(fan_out, lambda *xs: xs, True, "_short_circuit")
 
 
 if __name__ == "__main__":
-  test.main()
+  benchmark_base.test.main()
diff --git a/tensorflow/python/data/benchmarks/meta_benchmark.py b/tensorflow/python/data/benchmarks/meta_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6d888b2df0925140623c655d53ea473e08868af
--- /dev/null
+++ b/tensorflow/python/data/benchmarks/meta_benchmark.py
@@ -0,0 +1,151 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test utilities for tf.data benchmarking functionality."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import timeit
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.data.experimental.ops import sleep
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
+from tensorflow.python.platform import test
+
+
+class MetaBenchmark(test.Benchmark):
+  """Benchmark that compares various ways of running tf.data benchmarks."""
+
+  # Note that each of these benchmarks is a separate method so that we can
+  # run them independently and collect a performance profile.
+
+  def setup_fast_dataset(self):
+    self.num_reps = 15
+    self.iters = 100000
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    return dataset_ops.Dataset.range(10000**2).with_options(options)
+
+  def benchmarkFastDatasetWithOnlyCppIterations(self):
+    dataset = self.setup_fast_dataset()
+    self.run_benchmark_with_only_cpp_iterations(dataset)
+
+  def benchmarkFastDatasetWithSessionRun(self):
+    dataset = self.setup_fast_dataset()
+    self.run_benchmark_with_session_run(dataset)
+
+  def benchmarkFastDatasetWithSessionCallable(self):
+    dataset = self.setup_fast_dataset()
+    self.run_benchmark_with_session_run(dataset, make_callable=True)
+
+  def benchmarkFastDatasetInEager(self):
+    with context.eager_mode():
+      dataset = self.setup_fast_dataset()
+      self.run_benchmark_in_eager(dataset)
+
+  def setup_slow_dataset(self):
+    dataset = self.setup_fast_dataset()
+    self.iters = 1000
+    # sleep for 1e-3s per iteration
+    return dataset.apply(sleep.sleep(1000))
+
+  def benchmarkSlowDatasetWithOnlyCppIterations(self):
+    dataset = self.setup_slow_dataset()
+    self.run_benchmark_with_only_cpp_iterations(dataset)
+
+  def benchmarkSlowDatasetWithSessionRun(self):
+    dataset = self.setup_slow_dataset()
+    self.run_benchmark_with_session_run(dataset)
+
+  def benchmarkSlowDatasetWithSessionCallable(self):
+    dataset = self.setup_slow_dataset()
+    self.run_benchmark_with_session_run(dataset, make_callable=True)
+
+  def benchmarkSlowDatasetInEager(self):
+    with context.eager_mode():
+      dataset = self.setup_slow_dataset()
+      self.run_benchmark_in_eager(dataset)
+
+  def report(self, deltas):
+    # Each `delta` is the time taken for `self.iters` iterations. Divide by the
+    # number of iterations here to get per-element iteration time.
+    deltas = np.array(deltas) / self.iters
+    # Discard the first 5 results from "warming up" the session.
+    deltas = deltas[5:]
+
+    median = np.median(deltas)
+    mean = np.mean(deltas)
+    min_val = np.min(deltas)
+    max_val = np.max(deltas)
+    extras = {
+        "iters_per_second": 1 / median,
+        "median": median,
+        "mean": mean,
+        "min": min_val,
+        "max": max_val,
+        "num_reps": self.num_reps - 5,
+    }
+    self.report_benchmark(wall_time=median, iters=self.iters, extras=extras)
+
+  def run_benchmark_in_eager(self, dataset):
+    deltas = []
+    for _ in range(self.num_reps):
+      iterator = iter(dataset)
+      deltas.append(timeit.timeit(lambda: next(iterator), number=self.iters))  # pylint: disable=cell-var-from-loop
+
+    self.report(deltas)
+
+  def run_benchmark_with_session_run(self, dataset, make_callable=False):
+    iterator = dataset_ops.make_initializable_iterator(dataset)
+    next_element = iterator.get_next()
+
+    with session.Session() as sess:
+      deltas = []
+      for _ in range(self.num_reps):
+        if make_callable:
+          get_next_element = sess.make_callable(next_element)
+        else:
+          # Note: session.run(next_element.op) is more performant than
+          # session.run(next_element) because we avoid the cost of copying the
+          # tensor from C++ to python.
+          get_next_element = lambda: sess.run(next_element.op)
+
+        sess.run(iterator.initializer)
+        deltas.append(timeit.timeit(get_next_element, number=self.iters))
+    self.report(deltas)
+
+  def run_benchmark_with_only_cpp_iterations(self, dataset):
+    """Benchmarks the dataset with the iterations performed in C++."""
+    # NOTE: We use `dataset.skip()` to perform the iterations in C++, avoiding
+    # the overhead of multiple `session.run()` calls. Note that this relies on
+    # the underlying implementation of `skip`: if it is optimized in the future,
+    # we will have to change this code.
+    dataset = dataset.skip(self.iters - 1)
+    iterator = dataset_ops.make_initializable_iterator(dataset)
+    next_element = iterator.get_next()
+
+    with session.Session() as sess:
+      deltas = []
+      for _ in range(self.num_reps):
+        sess.run(iterator.initializer)
+        deltas.append(
+            timeit.timeit(lambda: sess.run(next_element.op), number=1))
+    self.report(deltas)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/benchmarks/range_benchmark.py b/tensorflow/python/data/benchmarks/range_benchmark.py
index a5020e2873063ea8b01801c0889a23cb60601ec3..80569e4e313c0ccc44aa7f0413e92cd4c529c167 100644
--- a/tensorflow/python/data/benchmarks/range_benchmark.py
+++ b/tensorflow/python/data/benchmarks/range_benchmark.py
@@ -17,53 +17,26 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import time
-
-from tensorflow.python.client import session
+from tensorflow.python.data.benchmarks import benchmark_base
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.platform import test
-
-_NUMPY_RANDOM_SEED = 42
 
 
-class RangeBenchmark(test.Benchmark):
+class RangeBenchmark(benchmark_base.DatasetBenchmarkBase):
   """Benchmarks for `tf.data.Dataset.range()`."""
 
-  def _benchmarkRangeHelper(self, modeling_enabled):
-    num_elements = 10000000 if modeling_enabled else 50000000
-    options = dataset_ops.Options()
-    options.experimental_autotune = modeling_enabled
-
-    # Use `Dataset.skip()` and `Dataset.take()` to perform the iteration in
-    # C++, and focus on the minimal overheads (excluding Python invocation
-    # costs).
-    dataset = dataset_ops.Dataset.range(num_elements).skip(
-        num_elements - 1).take(1).with_options(options)
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    next_element = iterator.get_next()
-
-    with session.Session() as sess:
-      # Run once to warm up the session caches.
-      sess.run(iterator.initializer)
-      sess.run(next_element)
-
-      # Run once for timing.
-      sess.run(iterator.initializer)
-      start = time.time()
-      sess.run(next_element)
-      end = time.time()
-
-      time_per_element = (end - start) / num_elements
-      print("Average time per element (%s modeling): %f nanoseconds" % (
-          "with" if modeling_enabled else "without", time_per_element * 1e9))
-      self.report_benchmark(iters=num_elements, wall_time=time_per_element,
-                            name="benchmark_tf_data_dataset_range%s"
-                            % ("_with_modeling" if modeling_enabled else ""))
-
-  def benchmarkRange(self):
+  def benchmark_range(self):
     for modeling_enabled in [False, True]:
-      self._benchmarkRangeHelper(modeling_enabled)
+      num_elements = 10000000 if modeling_enabled else 50000000
+      options = dataset_ops.Options()
+      options.experimental_autotune = modeling_enabled
+      dataset = dataset_ops.Dataset.range(num_elements)
+      dataset = dataset.with_options(options)
+
+      self.run_and_report_benchmark(
+          dataset,
+          num_elements=num_elements,
+          name="modeling_%s" % ("on" if modeling_enabled else "off"))
 
 
 if __name__ == "__main__":
-  test.main()
+  benchmark_base.test.main()
diff --git a/tensorflow/python/data/experimental/__init__.py b/tensorflow/python/data/experimental/__init__.py
index ffc2e5ef5fa239beada67687ec700437b2fc44ba..275bdf7ef44b1012bf443e23644ee71df9d31709 100644
--- a/tensorflow/python/data/experimental/__init__.py
+++ b/tensorflow/python/data/experimental/__init__.py
@@ -58,6 +58,7 @@ See [Importing Data](https://tensorflow.org/guide/datasets) for an overview.
 @@make_csv_dataset
 @@make_saveable_from_iterator
 @@map_and_batch
+@@map_and_batch_with_legacy_function
 @@parallel_interleave
 @@parse_example_dataset
 @@prefetch_to_device
@@ -65,6 +66,7 @@ See [Importing Data](https://tensorflow.org/guide/datasets) for an overview.
 @@sample_from_datasets
 @@scan
 @@shuffle_and_repeat
+@@take_while
 @@unbatch
 @@unique
 
@@ -81,6 +83,7 @@ from __future__ import print_function
 
 from tensorflow.python.data.experimental.ops.batching import dense_to_sparse_batch
 from tensorflow.python.data.experimental.ops.batching import map_and_batch
+from tensorflow.python.data.experimental.ops.batching import map_and_batch_with_legacy_function
 from tensorflow.python.data.experimental.ops.batching import unbatch
 from tensorflow.python.data.experimental.ops.cardinality import cardinality
 from tensorflow.python.data.experimental.ops.cardinality import INFINITE as INFINITE_CARDINALITY
@@ -115,6 +118,7 @@ from tensorflow.python.data.experimental.ops.shuffle_ops import shuffle_and_repe
 from tensorflow.python.data.experimental.ops.stats_aggregator import StatsAggregator
 from tensorflow.python.data.experimental.ops.stats_ops import latency_stats
 from tensorflow.python.data.experimental.ops.stats_options import StatsOptions
+from tensorflow.python.data.experimental.ops.take_while_ops import take_while
 from tensorflow.python.data.experimental.ops.threading_options import ThreadingOptions
 from tensorflow.python.data.experimental.ops.unique import unique
 from tensorflow.python.data.experimental.ops.writers import TFRecordWriter
diff --git a/tensorflow/python/data/experimental/benchmarks/BUILD b/tensorflow/python/data/experimental/benchmarks/BUILD
index 8175116c6eddf4a754202a2fbb22499c79a3f5b8..4f2117ec9b07a7d22391d8e856588fe34ed4086f 100644
--- a/tensorflow/python/data/experimental/benchmarks/BUILD
+++ b/tensorflow/python/data/experimental/benchmarks/BUILD
@@ -58,6 +58,22 @@ py_test(
     ],
 )
 
+py_test(
+    name = "map_defun_benchmark",
+    srcs = ["map_defun_benchmark.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:functional_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:tensor_spec",
+        "//tensorflow/python/data/experimental/ops:map_defun",
+        "//tensorflow/python/eager:function",
+    ],
+)
+
 py_test(
     name = "map_vectorization_benchmark",
     srcs = ["map_vectorization_benchmark.py"],
@@ -94,6 +110,20 @@ py_test(
     ],
 )
 
+py_test(
+    name = "choose_fastest_benchmark",
+    srcs = ["choose_fastest_benchmark.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "optimize_benchmark",
     srcs = ["optimize_benchmark.py"],
@@ -108,6 +138,36 @@ py_test(
     ],
 )
 
+py_test(
+    name = "parallel_interleave_benchmark",
+    srcs = ["parallel_interleave_benchmark.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python/data/experimental/ops:interleave_ops",
+        "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python/data/experimental/ops:sleep",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "rejection_resample_benchmark",
+    srcs = ["rejection_resample_benchmark.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python/data/experimental/ops:resampling",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+    ],
+)
+
 py_test(
     name = "unbatch_benchmark",
     srcs = ["unbatch_benchmark.py"],
diff --git a/tensorflow/python/data/experimental/benchmarks/autotune_benchmark.py b/tensorflow/python/data/experimental/benchmarks/autotune_benchmark.py
index e713494b526320f2c18774c7198406521c373033..bda7d38792a4aaaff6622f32f2101ad345eaa6da 100644
--- a/tensorflow/python/data/experimental/benchmarks/autotune_benchmark.py
+++ b/tensorflow/python/data/experimental/benchmarks/autotune_benchmark.py
@@ -33,12 +33,20 @@ class AutotuneBenchmark(test.Benchmark):
   """Benchmarks for autotuning performance knobs."""
 
   def benchmarkMap(self):
+    a = self._benchmarkMap(autotune=False)
+    b = self._benchmarkMap(autotune=True)
+    print("speedup: %f" % (a / b))
+
+  def _benchmarkMap(self, autotune):
     k = 1024 * 1024
     dataset = dataset_ops.Dataset.from_tensors((np.random.rand(1, 4 * k),
                                                 np.random.rand(4 * k,
                                                                1))).repeat()
     dataset = dataset.map(
         math_ops.matmul, num_parallel_calls=optimization.AUTOTUNE)
+    options = dataset_ops.Options()
+    options.experimental_autotune = autotune
+    dataset = dataset.with_options(options)
     iterator = dataset_ops.make_one_shot_iterator(dataset)
     get_next = iterator.get_next()
 
@@ -46,23 +54,24 @@ class AutotuneBenchmark(test.Benchmark):
     with session.Session() as sess:
       for _ in range(5):
         sess.run(get_next.op)
-      for _ in range(1000):
+      for _ in range(10000):
         start = time.time()
         sess.run(get_next.op)
         end = time.time()
         deltas.append(end - start)
 
-    print("%f (median), %f (mean), %f (stddev), %f (min), %f (max)\n" %
-          (np.median(deltas), np.mean(deltas), np.std(deltas), np.min(deltas),
-           np.max(deltas)))
     self.report_benchmark(
-        iters=1000, wall_time=np.median(deltas), name="map_autotune")
+        iters=10000,
+        wall_time=np.median(deltas),
+        name="map" + ("_autotune" if autotune else ""))
+    return np.median(deltas)
 
   def benchmarkMapAndBatch(self):
-    self._benchmarkMapAndBatch(numa_aware=False)
-    self._benchmarkMapAndBatch(numa_aware=True)
+    a = self._benchmarkMapAndBatch(autotune=False)
+    b = self._benchmarkMapAndBatch(autotune=True)
+    print("speedup: %f" % (a / b))
 
-  def _benchmarkMapAndBatch(self, numa_aware):
+  def _benchmarkMapAndBatch(self, autotune):
     batch_size = 16
     k = 1024 * 1024
     dataset = dataset_ops.Dataset.from_tensors((np.random.rand(1, 4 * k),
@@ -74,7 +83,8 @@ class AutotuneBenchmark(test.Benchmark):
             num_parallel_calls=optimization.AUTOTUNE,
             batch_size=batch_size))
     options = dataset_ops.Options()
-    options.experimental_numa_aware = numa_aware
+    options.experimental_autotune = autotune
+    options.experimental_optimization.apply_default_optimizations = False
     dataset = dataset.with_options(options)
     iterator = dataset_ops.make_one_shot_iterator(dataset)
     get_next = iterator.get_next()
@@ -83,22 +93,24 @@ class AutotuneBenchmark(test.Benchmark):
     with session.Session() as sess:
       for _ in range(5):
         sess.run(get_next.op)
-      for _ in range(100):
+      for _ in range(1000):
         start = time.time()
         sess.run(get_next.op)
         end = time.time()
         deltas.append(end - start)
 
-    print("%f (median), %f (mean), %f (stddev), %f (min), %f (max)\n" %
-          (np.median(deltas), np.mean(deltas), np.std(deltas), np.min(deltas),
-           np.max(deltas)))
-
     self.report_benchmark(
-        iters=100,
+        iters=1000,
         wall_time=np.median(deltas),
-        name=("numa_" if numa_aware else "") + "map_and_batch_autotune")
+        name="map_and_batch" + ("_autotune" if autotune else ""))
+    return np.median(deltas)
 
   def benchmarkInterleave(self):
+    a = self._benchmarkInterleave(autotune=False)
+    b = self._benchmarkInterleave(autotune=True)
+    print("speedup: %f" % (a / b))
+
+  def _benchmarkInterleave(self, autotune):
     k = 1024 * 1024
     dataset = dataset_ops.Dataset.from_tensors((np.random.rand(1, 4 * k),
                                                 np.random.rand(4 * k,
@@ -108,6 +120,10 @@ class AutotuneBenchmark(test.Benchmark):
         lambda _: dataset,
         cycle_length=10,
         num_parallel_calls=optimization.AUTOTUNE)
+    options = dataset_ops.Options()
+    options.experimental_autotune = autotune
+    options.experimental_optimization.apply_default_optimizations = False
+    dataset = dataset.with_options(options)
     iterator = dataset_ops.make_one_shot_iterator(dataset)
     get_next = iterator.get_next()
 
@@ -115,21 +131,24 @@ class AutotuneBenchmark(test.Benchmark):
     with session.Session() as sess:
       for _ in range(5):
         sess.run(get_next.op)
-      for _ in range(1000):
+      for _ in range(10000):
         start = time.time()
         sess.run(get_next.op)
         end = time.time()
         deltas.append(end - start)
 
-    print("%f (median), %f (mean), %f (stddev), %f (min), %f (max)\n" %
-          (np.median(deltas), np.mean(deltas), np.std(deltas), np.min(deltas),
-           np.max(deltas)))
     self.report_benchmark(
-        iters=1000,
+        iters=10000,
         wall_time=np.median(deltas),
-        name="interleave_autotune")
+        name="interleave" + ("_autotune" if autotune else ""))
+    return np.median(deltas)
 
   def benchmarkMapAndInterleave(self):
+    a = self._benchmarkMapAndInterleave(autotune=False)
+    b = self._benchmarkMapAndInterleave(autotune=True)
+    print("speedup: %f" % (a / b))
+
+  def _benchmarkMapAndInterleave(self, autotune):
     k = 1024 * 1024
     a = (np.random.rand(1, 8 * k), np.random.rand(8 * k, 1))
     b = (np.random.rand(1, 4 * k), np.random.rand(4 * k, 1))
@@ -161,6 +180,10 @@ class AutotuneBenchmark(test.Benchmark):
 
     dataset = dataset_ops.Dataset.zip((dataset, dataset_c))
     dataset = dataset.map(f2, num_parallel_calls=optimization.AUTOTUNE)
+    options = dataset_ops.Options()
+    options.experimental_autotune = autotune
+    options.experimental_optimization.apply_default_optimizations = False
+    dataset = dataset.with_options(options)
     iterator = dataset_ops.make_one_shot_iterator(dataset)
     get_next = iterator.get_next()
 
@@ -168,19 +191,17 @@ class AutotuneBenchmark(test.Benchmark):
     with session.Session() as sess:
       for _ in range(5):
         sess.run(get_next)
-      for _ in range(100):
+      for _ in range(1000):
         start = time.time()
         sess.run(get_next)
         end = time.time()
         deltas.append(end - start)
 
-    print("%f (median), %f (mean), %f (stddev), %f (min), %f (max)\n" %
-          (np.median(deltas), np.mean(deltas), np.std(deltas), np.min(deltas),
-           np.max(deltas)))
     self.report_benchmark(
-        iters=100,
+        iters=1000,
         wall_time=np.median(deltas),
-        name="map_and_interleave_autotune")
+        name="map_and_interleave" + ("_autotune" if autotune else ""))
+    return np.median(deltas)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/benchmarks/choose_fastest_benchmark.py b/tensorflow/python/data/experimental/benchmarks/choose_fastest_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a5a264c6f33196f882c3c2455339b7ba5a7e81c
--- /dev/null
+++ b/tensorflow/python/data/experimental/benchmarks/choose_fastest_benchmark.py
@@ -0,0 +1,105 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Benchmarks for static optimizations."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.data.experimental.ops import optimization
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.platform import test
+
+
+# TODO(b/119837791): Add eager benchmarks too.
+class ChooseFastestBenchmark(test.Benchmark):
+  """Benchmarks for static optimizations."""
+
+  def benchmarkChooseFastest(self):
+
+    dataset = dataset_ops.Dataset.range(1000**2).repeat()
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    dataset = dataset.with_options(options)
+    map_batch_dataset = dataset.map(lambda x: x + 1).batch(100)
+    batch_map_dataset = dataset.batch(100).map(lambda x: x + 1)
+
+    merge_dataset = optimization._ChooseFastestDataset(  # pylint: disable=protected-access
+        [batch_map_dataset, map_batch_dataset])
+    self._benchmark(map_batch_dataset, "map_batch_dataset")
+    self._benchmark(batch_map_dataset, "batch_map_dataset")
+    self._benchmark(merge_dataset, "merge_dataset")
+
+  def benchmarkChooseFastestFirstNIterations(self):
+
+    dataset = dataset_ops.Dataset.range(1000**2).repeat()
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    dataset = dataset.with_options(options)
+    map_batch_dataset = dataset.map(lambda x: x + 1).batch(100)
+    batch_map_dataset = dataset.batch(100).map(lambda x: x + 1)
+
+    merge_dataset = optimization._ChooseFastestDataset(  # pylint: disable=protected-access
+        [batch_map_dataset, map_batch_dataset])
+
+    self._benchmarkFirstN(map_batch_dataset, "map_batch_dataset")
+    self._benchmarkFirstN(batch_map_dataset, "batch_map_dataset")
+    self._benchmarkFirstN(merge_dataset, "merge_dataset")
+
+  def _benchmarkFirstN(self, dataset, name):
+    n = 10  # The default num_experiments for ChooseFastestDataset
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
+    next_element = iterator.get_next()
+
+    deltas = []
+    for _ in range(100):
+      with session.Session() as sess:
+        start = time.time()
+        for _ in range(n):
+          sess.run(next_element.op)
+        end = time.time()
+        deltas.append(end - start)
+    median_wall_time = np.median(deltas) / n
+    self.report_benchmark(
+        iters=n, wall_time=median_wall_time, name=name + "_first_%d" % n)
+
+  def _benchmark(self, dataset, name):
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
+    next_element = iterator.get_next()
+
+    with session.Session() as sess:
+      # Run 10 steps to warm up the session caches before taking the first
+      # measurement. Additionally, 10 is the default num_experiments for
+      # ChooseFastestDataset.
+      for _ in range(10):
+        sess.run(next_element.op)
+      deltas = []
+      for _ in range(50):
+        start = time.time()
+        for _ in range(50):
+          sess.run(next_element.op)
+        end = time.time()
+        deltas.append(end - start)
+
+      median_wall_time = np.median(deltas) / 100
+      self.report_benchmark(iters=100, wall_time=median_wall_time, name=name)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/benchmarks/csv_dataset_benchmark.py b/tensorflow/python/data/experimental/benchmarks/csv_dataset_benchmark.py
index 03345ce4e6648fecf47348806c55adba10aeed5a..2e91e08c79f2fcd990b6e3850f4539ea616c65fe 100644
--- a/tensorflow/python/data/experimental/benchmarks/csv_dataset_benchmark.py
+++ b/tensorflow/python/data/experimental/benchmarks/csv_dataset_benchmark.py
@@ -63,6 +63,9 @@ class CsvDatasetBenchmark(test.Benchmark):
 
   def _runBenchmark(self, dataset, num_cols, prefix):
     dataset = dataset.skip(self._num_per_iter - 1)
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    dataset = dataset.with_options(options)
     deltas = []
     for _ in range(10):
       next_element = dataset_ops.make_one_shot_iterator(dataset).get_next()
@@ -79,8 +82,6 @@ class CsvDatasetBenchmark(test.Benchmark):
       deltas.append(end - start)
     # Median wall time per CSV record read and decoded
     median_wall_time = np.median(deltas) / self._num_per_iter
-    print('%s num_cols: %d Median wall time: %f' % (prefix, num_cols,
-                                                    median_wall_time))
     self.report_benchmark(
         iters=self._num_per_iter,
         wall_time=median_wall_time,
diff --git a/tensorflow/python/data/experimental/benchmarks/map_and_batch_benchmark.py b/tensorflow/python/data/experimental/benchmarks/map_and_batch_benchmark.py
index b17f2bcd12b2b78c97e7c390d919331ac4ef5386..4b7c1737863d040763b8dc94952d0742c2c1027c 100644
--- a/tensorflow/python/data/experimental/benchmarks/map_and_batch_benchmark.py
+++ b/tensorflow/python/data/experimental/benchmarks/map_and_batch_benchmark.py
@@ -26,7 +26,6 @@ import numpy as np
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.data.experimental.ops import batching
-from tensorflow.python.data.experimental.ops.optimization_options import OptimizationOptions
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -41,7 +40,7 @@ _NUMPY_RANDOM_SEED = 42
 class MapAndBatchBenchmark(test.Benchmark):
   """Benchmarks for `tf.data.experimental.map_and_batch()`."""
 
-  def benchmarkMapAndBatchDense(self):
+  def benchmarkMapAndBatch(self):
     """Measures the performance of parallelized batching."""
     shapes = [(), (10,), (10, 10), (10, 10, 10), (224, 224, 3)]
     batch_size_values = [1, 32, 64, 128, 1024]
@@ -55,6 +54,9 @@ class MapAndBatchBenchmark(test.Benchmark):
 
     dataset = dataset.apply(batching.map_and_batch(
         lambda _: dense_value, batch_size_placeholder))
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    dataset = dataset.with_options(options)
     iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
 
@@ -88,13 +90,9 @@ class MapAndBatchBenchmark(test.Benchmark):
         median_wall_time = np.median(deltas) / 100.0
         iters = len(deltas) * 100
 
-        print("Map and batch dense dataset shape: %r batch_size: %d "
-              "wall time: %f (%d iters)"
-              % (shape, batch_size, median_wall_time, iters))
         self.report_benchmark(
             iters=iters, wall_time=median_wall_time,
-            name="benchmark_batch_dense_dataset_nnz_%d_batch_size_%d" % (
-                np.prod(shape), batch_size))
+            name="num_elements_%d_batch_size_%d" % (np.prod(shape), batch_size))
 
   def benchmarkMapAndBatchChainingVersusFusing(self):
     """Compares the performance of chaining and fusing map and batch.
@@ -128,49 +126,25 @@ class MapAndBatchBenchmark(test.Benchmark):
     def benchmark(label, series):
       """Runs benchmark the given series."""
 
-      print("%s:" % label)
-
-      def make_base_dataset(element_size):
+      def make_dataset(element_size, num_calls, batch_size):  # pylint: disable=missing-docstring
         k = 1024 * 1024
         x = constant_op.constant(np.random.rand(element_size, 4 * k))
         y = constant_op.constant(np.random.rand(4 * k, 1))
-        return dataset_ops.Dataset.range(1000000000000).map(lambda _: (x, y))
+        dataset = dataset_ops.Dataset.range(1000000000000).map(lambda _: (x, y))
+        dataset = dataset.map(
+            math_ops.matmul,
+            num_parallel_calls=num_calls).batch(batch_size=batch_size)
+        options = dataset_ops.Options()
+        options.experimental_optimization.apply_default_optimizations = False
+        return dataset.with_options(options)
 
       for num_calls, inter_op, element_size, batch_size in series:
-
         num_iters = 1024 // (
             (element_size * batch_size) // min(num_calls, inter_op))
-        fused_dataset = make_base_dataset(element_size)
-        fused_dataset = fused_dataset.map(
-            math_ops.matmul,
-            num_parallel_calls=num_calls).batch(batch_size=batch_size)
-
-        fused_iterator = dataset_ops.make_one_shot_iterator(fused_dataset)
-        fused_get_next = fused_iterator.get_next()
-
-        fused_deltas = []
-        with session.Session(
-            config=config_pb2.ConfigProto(
-                inter_op_parallelism_threads=inter_op,
-                use_per_session_threads=True)) as sess:
-
-          for _ in range(5):
-            sess.run(fused_get_next.op)
-          for _ in range(num_iters):
-            start = time.time()
-            sess.run(fused_get_next.op)
-            end = time.time()
-            fused_deltas.append(end - start)
-
-        # `map_and_batch_fusion` is optimized by default. To get the chained
-        # dataset, with have to disable it.
-        options = dataset_ops.Options()
-        options.experimental_optimization = OptimizationOptions()
-        options.experimental_optimization.map_and_batch_fusion = False
-        chained_dataset = fused_dataset.with_options(options)
+        # By default the chained map().batch() calls will not be fused.
+        chained_dataset = make_dataset(element_size, num_calls, batch_size)
         chained_iterator = dataset_ops.make_one_shot_iterator(chained_dataset)
         chained_get_next = chained_iterator.get_next()
-
         chained_deltas = []
         with session.Session(
             config=config_pb2.ConfigProto(
@@ -184,27 +158,32 @@ class MapAndBatchBenchmark(test.Benchmark):
             end = time.time()
             chained_deltas.append(end - start)
 
-        print(
-            "batch size: %d, num parallel calls: %d, inter-op parallelism: %d, "
-            "element size: %d, num iters: %d\nchained wall time: %f (median), "
-            "%f (mean), %f (stddev), %f (min), %f (max)\n  fused wall time: "
-            "%f (median), %f (mean), %f (stddev), %f (min), %f (max)\n    "
-            "chained/fused:    %.2fx (median),    %.2fx (mean)" %
-            (batch_size, num_calls, inter_op, element_size, num_iters,
-             np.median(chained_deltas), np.mean(chained_deltas),
-             np.std(chained_deltas), np.min(chained_deltas),
-             np.max(chained_deltas), np.median(fused_deltas),
-             np.mean(fused_deltas), np.std(fused_deltas), np.min(fused_deltas),
-             np.max(fused_deltas),
-             np.median(chained_deltas) / np.median(fused_deltas),
-             np.mean(chained_deltas) / np.mean(fused_deltas)))
-
         self.report_benchmark(
             iters=num_iters,
             wall_time=np.median(chained_deltas),
             name=name("chained", label, num_calls, inter_op, element_size,
                       batch_size))
 
+        # Apply an option to the default dataset that will fuse map().batch().
+        options = dataset_ops.Options()
+        options.experimental_optimization.map_and_batch_fusion = True
+        fused_dataset = chained_dataset.with_options(options)
+        fused_iterator = dataset_ops.make_one_shot_iterator(fused_dataset)
+        fused_get_next = fused_iterator.get_next()
+        fused_deltas = []
+        with session.Session(
+            config=config_pb2.ConfigProto(
+                inter_op_parallelism_threads=inter_op,
+                use_per_session_threads=True)) as sess:
+
+          for _ in range(5):
+            sess.run(fused_get_next.op)
+          for _ in range(num_iters):
+            start = time.time()
+            sess.run(fused_get_next.op)
+            end = time.time()
+            fused_deltas.append(end - start)
+
         self.report_benchmark(
             iters=num_iters,
             wall_time=np.median(fused_deltas),
diff --git a/tensorflow/python/data/experimental/benchmarks/map_defun_benchmark.py b/tensorflow/python/data/experimental/benchmarks/map_defun_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac6d7d0360292f74cdd8b57eeab7450e362a0f27
--- /dev/null
+++ b/tensorflow/python/data/experimental/benchmarks/map_defun_benchmark.py
@@ -0,0 +1,75 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Benchmarks for MapDefunOp."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+from tensorflow.python.client import session
+from tensorflow.python.data.experimental.ops import map_defun
+from tensorflow.python.eager import function
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import tensor_spec
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import map_fn
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+# TODO(b/119837791): Add eager benchmarks too.
+class MapDefunBenchmark(test.Benchmark):
+  """Benchmarks for MapDefunOp."""
+
+  def _run(self, op, name=None, num_iters=3000):
+    with session.Session() as sess:
+      for _ in range(5):
+        sess.run(op)
+      start = time.time()
+      for _ in range(num_iters):
+        sess.run(op)
+      end = time.time()
+      mean_us = (end - start) * 1e6 / num_iters
+      self.report_benchmark(
+          name=name,
+          iters=num_iters,
+          wall_time=mean_us,
+          extras={"examples_per_sec": num_iters / (end - start)})
+
+  def benchmarkDefunVsMapFn(self):
+    """Benchmarks to compare the performance of MapDefun vs tf.map_fn."""
+
+    @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.int32)])
+    def defun(x):
+      return array_ops.identity(x)
+
+    def fn(x):
+      return array_ops.identity(x)
+
+    base = math_ops.range(100)
+    for input_size in [10, 100, 1000, 10000]:
+      num_iters = 100000 // input_size
+      map_defun_op = map_defun.map_defun(defun, [base], [dtypes.int32], [()])
+      map_fn_op = map_fn.map_fn(fn, base)
+
+      self._run(
+          map_defun_op, "with_defun_size_%d" % input_size, num_iters=num_iters)
+      self._run(
+          map_fn_op, "without_defun_size_%d" % input_size, num_iters=num_iters)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/benchmarks/map_vectorization_benchmark.py b/tensorflow/python/data/experimental/benchmarks/map_vectorization_benchmark.py
index a60ba0a857ee18e88e912fc25000a479e4a86e72..8f8fbe86dff47be6e058979251b066e017a5ba7b 100644
--- a/tensorflow/python/data/experimental/benchmarks/map_vectorization_benchmark.py
+++ b/tensorflow/python/data/experimental/benchmarks/map_vectorization_benchmark.py
@@ -24,7 +24,6 @@ import numpy as np
 from tensorflow.core.example import example_pb2
 from tensorflow.core.example import feature_pb2
 from tensorflow.python.client import session
-from tensorflow.python.data.experimental.ops import optimization_options
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.framework import constant_op
@@ -115,31 +114,27 @@ class MapVectorizationBenchmark(test.Benchmark):
 
   def _compare(self, input_dataset, map_fn, batch_size, input_size, str_id):
     num_elems = int(np.sum([np.prod(x) for x in input_size]))
-    name_template = "{}__batch_size_{}_input_element_size_{}_{}"
+    name_template = "{}_batch_size_{}_input_element_size_{}_{}"
 
-    base_dataset = input_dataset.map(map_fn).batch(batch_size)
+    unoptimized_dataset = input_dataset.map(map_fn).batch(batch_size)
 
     options = dataset_ops.Options()
-    opt_options = optimization_options.OptimizationOptions()
-    # Disable default map_and_batch_fusion optimization
-    opt_options.map_and_batch_fusion = False
-    options.experimental_optimization = opt_options
-    base_dataset = base_dataset.with_options(options)
+    options.experimental_optimization.apply_default_optimizations = False
+    unoptimized_dataset = unoptimized_dataset.with_options(options)
+    unoptimized_next = dataset_ops.make_one_shot_iterator(
+        unoptimized_dataset).get_next()
 
-    unoptimized_op = dataset_ops.make_one_shot_iterator(base_dataset).get_next()
-
-    optimized_options = dataset_ops.Options()
-    opt_options = optimization_options.OptimizationOptions()
-    opt_options.map_vectorization = True
-    optimized_options.experimental_optimization = opt_options
-    optimized = base_dataset.with_options(optimized_options)
-    optimized_op = dataset_ops.make_one_shot_iterator(optimized).get_next()
+    options = dataset_ops.Options()
+    options.experimental_optimization.map_vectorization = True
+    optimized_dataset = unoptimized_dataset.with_options(options)
+    optimized_next = dataset_ops.make_one_shot_iterator(
+        optimized_dataset).get_next()
 
     unoptimized_time = self._run(
-        unoptimized_op,
+        unoptimized_next,
         name=name_template.format(str_id, batch_size, num_elems, "unoptimized"))
     optimized_time = self._run(
-        optimized_op,
+        optimized_next,
         name=name_template.format(str_id, batch_size, num_elems, "optimized"))
 
     print("Batch size: {}\n"
@@ -164,7 +159,7 @@ class MapVectorizationBenchmark(test.Benchmark):
 
   def benchmarkCast(self):
     self._benchmark_helper(
-        lambda *args: [math_ops.cast(x, dtypes.float64) for x in args], "cast")
+        lambda *args: [math_ops.cast(x, dtypes.float32) for x in args], "cast")
 
   def benchmarkReshape(self):
     self._benchmark_helper(
diff --git a/tensorflow/python/data/experimental/benchmarks/matching_files_benchmark.py b/tensorflow/python/data/experimental/benchmarks/matching_files_benchmark.py
index c53f8dd7c537fecbfcd551e2a4809aaf5447ff46..cb5bf2946d5d7dc8b802a9d32db4ec49e78a5e14 100644
--- a/tensorflow/python/data/experimental/benchmarks/matching_files_benchmark.py
+++ b/tensorflow/python/data/experimental/benchmarks/matching_files_benchmark.py
@@ -60,6 +60,9 @@ class MatchingFilesBenchmark(test.Benchmark):
     for _ in range(iters):
       with ops.Graph().as_default():
         dataset = matching_files.MatchingFilesDataset(patterns)
+        options = dataset_ops.Options()
+        options.experimental_optimization.apply_default_optimizations = False
+        dataset = dataset.with_options(options)
         next_element = dataset_ops.make_one_shot_iterator(dataset).get_next()
 
         with session.Session() as sess:
@@ -75,11 +78,6 @@ class MatchingFilesBenchmark(test.Benchmark):
           deltas.append(sub_deltas)
 
     median_deltas = np.median(deltas, axis=0)
-    print('Nested directory size (width*depth): %d*%d Median wall time: '
-          '%fs (read first filename), %fs (read second filename), avg %fs'
-          ' (read %d more filenames)' %
-          (width, depth, median_deltas[0], median_deltas[1],
-           np.average(median_deltas[2:]), len(median_deltas) - 2))
     self.report_benchmark(
         iters=iters,
         wall_time=np.sum(median_deltas),
@@ -92,7 +90,7 @@ class MatchingFilesBenchmark(test.Benchmark):
             (len(median_deltas) - 2):
                 np.average(median_deltas[2:])
         },
-        name='dataset_nested_directory(%d*%d)' %
+        name='nested_directory(%d*%d)' %
         (width, depth))
 
     shutil.rmtree(tmp_dir, ignore_errors=True)
diff --git a/tensorflow/python/data/experimental/benchmarks/optimize_benchmark.py b/tensorflow/python/data/experimental/benchmarks/optimize_benchmark.py
index 2f9b89111fcda9230062a4aa7d3477df5d2f36a5..395a529f853e17909fd3f094174cc8d82393d6da 100644
--- a/tensorflow/python/data/experimental/benchmarks/optimize_benchmark.py
+++ b/tensorflow/python/data/experimental/benchmarks/optimize_benchmark.py
@@ -28,6 +28,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
+# TODO(b/119837791): Add eager benchmarks too.
 class OptimizationBenchmark(test.Benchmark):
   """Benchmarks for static optimizations."""
 
@@ -46,7 +47,8 @@ class OptimizationBenchmark(test.Benchmark):
         dataset = dataset.map(lambda x: x)
       if optimize_dataset:
         options = dataset_ops.Options()
-        options.experimental_map_fusion = True
+        options.experimental_optimization.apply_default_optimizations = False
+        options.experimental_optimization.map_fusion = True
         dataset = dataset.with_options(options)
 
       iterator = dataset_ops.make_one_shot_iterator(dataset)
@@ -65,8 +67,6 @@ class OptimizationBenchmark(test.Benchmark):
 
         median_wall_time = np.median(deltas) / 100
         opt_mark = "opt" if optimize_dataset else "noopt"
-        print("Map dataset {} chain length: {} Median wall time: {}".format(
-            opt_mark, chain_length, median_wall_time))
         self.report_benchmark(
             iters=100,
             wall_time=median_wall_time,
@@ -89,7 +89,8 @@ class OptimizationBenchmark(test.Benchmark):
             lambda x: math_ops.greater_equal(x - 5, 0))
       if optimize_dataset:
         options = dataset_ops.Options()
-        options.experimental_map_and_filter_fusion = True
+        options.experimental_optimization.apply_default_optimizations = False
+        options.experimental_optimization.map_and_filter_fusion = True
         dataset = dataset.with_options(options)
       iterator = dataset_ops.make_one_shot_iterator(dataset)
       next_element = iterator.get_next()
@@ -107,14 +108,52 @@ class OptimizationBenchmark(test.Benchmark):
 
         median_wall_time = np.median(deltas) / 100
         opt_mark = "opt" if optimize_dataset else "noopt"
-        print("Map and filter dataset {} chain length: {} Median wall time: {}"
-              .format(opt_mark, chain_length, median_wall_time))
         self.report_benchmark(
             iters=100,
             wall_time=median_wall_time,
             name="map_and_filter_fusion_{}_chain_length_{}".format(
                 opt_mark, chain_length))
 
+  # This benchmark compares the performance of pipeline with multiple chained
+  # filter with and without filter fusion.
+  def benchmarkFilterFusion(self):
+    chain_lengths = [0, 1, 2, 5, 10, 20, 50]
+    for chain_length in chain_lengths:
+      self._benchmarkFilterFusion(chain_length, False)
+      self._benchmarkFilterFusion(chain_length, True)
+
+  def _benchmarkFilterFusion(self, chain_length, optimize_dataset):
+    with ops.Graph().as_default():
+      dataset = dataset_ops.Dataset.from_tensors(5).repeat(None)
+      for _ in range(chain_length):
+        dataset = dataset.filter(lambda x: math_ops.greater_equal(x - 5, 0))
+      if optimize_dataset:
+        options = dataset_ops.Options()
+        options.experimental_optimization.apply_default_optimizations = False
+        options.experimental_optimization.filter_fusion = True
+        dataset = dataset.with_options(options)
+
+      iterator = dataset_ops.make_one_shot_iterator(dataset)
+      next_element = iterator.get_next()
+
+      with session.Session() as sess:
+        for _ in range(10):
+          sess.run(next_element.op)
+        deltas = []
+        for _ in range(100):
+          start = time.time()
+          for _ in range(100):
+            sess.run(next_element.op)
+          end = time.time()
+          deltas.append(end - start)
+
+        median_wall_time = np.median(deltas) / 100
+        opt_mark = "opt" if optimize_dataset else "no-opt"
+        self.report_benchmark(
+            iters=1000,
+            wall_time=median_wall_time,
+            name="chain_length_{}_{}".format(opt_mark, chain_length))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/benchmarks/parallel_interleave_benchmark.py b/tensorflow/python/data/experimental/benchmarks/parallel_interleave_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..37375af27f4359764ec24aa0e5810a8b2a5b1ea7
--- /dev/null
+++ b/tensorflow/python/data/experimental/benchmarks/parallel_interleave_benchmark.py
@@ -0,0 +1,105 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Benchmarks for `tf.data.experimental.parallel_interleave()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.data.experimental.ops import interleave_ops
+from tensorflow.python.data.experimental.ops import optimization
+from tensorflow.python.data.experimental.ops import sleep
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import test
+
+
+def _make_fake_dataset_fn():
+  """Returns a dataset that emulates a remote storage data source.
+
+  Returns a dataset factory which creates a dataset with 100 elements that
+  emulates the performance characteristic of a file-based dataset stored in a
+  remote storage. In particular, the first element will take an order of
+  magnitude longer to produce than the remaining elements (1s vs. 1ms).
+  """
+
+  def fake_dataset_fn(unused):
+    del unused
+
+    def make_dataset(time_us, num_elements):
+      return dataset_ops.Dataset.range(num_elements).apply(sleep.sleep(time_us))
+
+    return make_dataset(1000 * 1000, 0).concatenate(make_dataset(1000,
+                                                                 100)).take(100)
+
+  return fake_dataset_fn
+
+
+class ParallelInterleaveBenchmark(test.Benchmark):
+  """Benchmarks for `tf.data.experimental.parallel_interleave()`."""
+
+  def _benchmark(self, dataset_fn, iters, num_elements):
+    with ops.Graph().as_default():
+      options = dataset_ops.Options()
+      options.experimental_optimization.apply_default_optimizations = False
+      dataset = dataset_fn().with_options(options)
+      next_element = dataset_ops.make_one_shot_iterator(dataset).get_next()
+      with session.Session() as sess:
+        deltas = []
+        for _ in range(iters):
+          start = time.time()
+          for _ in range(num_elements):
+            sess.run(next_element.op)
+          end = time.time()
+          deltas.append(end - start)
+
+    mean_wall_time = np.mean(deltas) / num_elements
+    self.report_benchmark(iters=iters, wall_time=mean_wall_time)
+
+  def benchmark_sequential_interleave(self):
+
+    def dataset_fn():
+      return dataset_ops.Dataset.range(1).repeat().interleave(
+          _make_fake_dataset_fn(), cycle_length=10)
+
+    self._benchmark(dataset_fn=dataset_fn, iters=10, num_elements=100)
+
+  def benchmark_parallel_interleave_v1(self):
+    """Benchmark for parallel interleave that does not support autotuning."""
+
+    def dataset_fn():
+      return dataset_ops.Dataset.range(1).repeat().apply(
+          interleave_ops.parallel_interleave(
+              _make_fake_dataset_fn(), cycle_length=10))
+
+    self._benchmark(dataset_fn=dataset_fn, iters=100, num_elements=1000)
+
+  def benchmark_parallel_interleave_v2(self):
+    """Benchmark for parallel interleave that supports autotuning."""
+
+    def dataset_fn():
+      return dataset_ops.Dataset.range(1).repeat().interleave(
+          _make_fake_dataset_fn(),
+          cycle_length=10, num_parallel_calls=optimization.AUTOTUNE)
+
+    self._benchmark(dataset_fn=dataset_fn, iters=100, num_elements=1000)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/benchmarks/rejection_resample_benchmark.py b/tensorflow/python/data/experimental/benchmarks/rejection_resample_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a8ac7ef655d56ebc11c1467b6ed82b5f943277c
--- /dev/null
+++ b/tensorflow/python/data/experimental/benchmarks/rejection_resample_benchmark.py
@@ -0,0 +1,74 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Benchmarks for `tf.data.experimental.rejection_resample()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.python.client import session
+from tensorflow.python.data.experimental.ops import resampling
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.platform import test
+
+
+def _time_resampling(data_np, target_dist, init_dist, num_to_sample):  # pylint: disable=missing-docstring
+  dataset = dataset_ops.Dataset.from_tensor_slices(data_np).repeat()
+
+  # Reshape distribution via rejection sampling.
+  dataset = dataset.apply(
+      resampling.rejection_resample(
+          class_func=lambda x: x,
+          target_dist=target_dist,
+          initial_dist=init_dist,
+          seed=142))
+
+  options = dataset_ops.Options()
+  options.experimental_optimization.apply_default_optimizations = False
+  dataset = dataset.with_options(options)
+  get_next = dataset_ops.make_one_shot_iterator(dataset).get_next()
+
+  with session.Session() as sess:
+    start_time = time.time()
+    for _ in xrange(num_to_sample):
+      sess.run(get_next)
+    end_time = time.time()
+
+  return end_time - start_time
+
+
+class RejectionResampleBenchmark(test.Benchmark):
+  """Benchmarks for `tf.data.experimental.rejection_resample()`."""
+
+  def benchmarkResamplePerformance(self):
+    init_dist = [0.25, 0.25, 0.25, 0.25]
+    target_dist = [0.0, 0.0, 0.0, 1.0]
+    num_classes = len(init_dist)
+    # We don't need many samples to test a dirac-delta target distribution
+    num_samples = 1000
+    data_np = np.random.choice(num_classes, num_samples, p=init_dist)
+
+    resample_time = _time_resampling(
+        data_np, target_dist, init_dist, num_to_sample=1000)
+
+    self.report_benchmark(iters=1000, wall_time=resample_time, name="resample")
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/benchmarks/unbatch_benchmark.py b/tensorflow/python/data/experimental/benchmarks/unbatch_benchmark.py
index c36a32534dddfc29e5f0d4253508e44f9ae4a899..3f5b9b91307f423ca78489b5f3ef824974a0a6fe 100644
--- a/tensorflow/python/data/experimental/benchmarks/unbatch_benchmark.py
+++ b/tensorflow/python/data/experimental/benchmarks/unbatch_benchmark.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for `tf.data.experimental.unbatch()`."""
+"""Benchmarks for `tf.data.experimental.unbatch()`."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -42,6 +42,9 @@ class UnbatchBenchmark(test.Benchmark):
       dataset = dataset.batch(batch_size_placeholder)
       dataset = dataset.apply(batching.unbatch())
       dataset = dataset.skip(elems_per_trial)
+      options = dataset_ops.Options()
+      options.experimental_optimization.apply_default_optimizations = False
+      dataset = dataset.with_options(options)
       iterator = dataset_ops.make_initializable_iterator(dataset)
       next_element = iterator.get_next()
 
@@ -58,8 +61,6 @@ class UnbatchBenchmark(test.Benchmark):
             deltas.append((end - start) / elems_per_trial)
 
           median_wall_time = np.median(deltas)
-          print("Unbatch (native) batch size: %d Median wall time per element:"
-                " %f microseconds" % (batch_size, median_wall_time * 1e6))
           self.report_benchmark(
               iters=10000,
               wall_time=median_wall_time,
@@ -78,6 +79,9 @@ class UnbatchBenchmark(test.Benchmark):
       dataset = dataset.batch(batch_size_placeholder)
       dataset = dataset.flat_map(dataset_ops.Dataset.from_tensor_slices)
       dataset = dataset.skip(elems_per_trial)
+      options = dataset_ops.Options()
+      options.experimental_optimization.apply_default_optimizations = False
+      dataset = dataset.with_options(options)
       iterator = dataset_ops.make_initializable_iterator(dataset)
       next_element = iterator.get_next()
 
@@ -94,8 +98,6 @@ class UnbatchBenchmark(test.Benchmark):
             deltas.append((end - start) / elems_per_trial)
 
           median_wall_time = np.median(deltas)
-          print("Unbatch (unfused) batch size: %d Median wall time per element:"
-                " %f microseconds" % (batch_size, median_wall_time * 1e6))
           self.report_benchmark(
               iters=10000,
               wall_time=median_wall_time,
diff --git a/tensorflow/python/data/experimental/kernel_tests/BUILD b/tensorflow/python/data/experimental/kernel_tests/BUILD
index 548eb422ed06de84447494391ad9e54d9b2df0b2..1733b9817b3348563ff67efba0ecbca9ff757963 100644
--- a/tensorflow/python/data/experimental/kernel_tests/BUILD
+++ b/tensorflow/python/data/experimental/kernel_tests/BUILD
@@ -1,12 +1,12 @@
+load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+
 package(default_visibility = ["//tensorflow:internal"])
 
 licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load("//tensorflow:tensorflow.bzl", "cuda_py_test")
-load("//tensorflow:tensorflow.bzl", "py_test")
-
 py_test(
     name = "bucket_by_sequence_length_test",
     size = "medium",
@@ -23,6 +23,7 @@ py_test(
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -129,26 +130,6 @@ py_test(
     ],
 )
 
-py_test(
-    name = "filter_dataset_op_test",
-    size = "medium",
-    srcs = ["filter_dataset_op_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:io_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:util",
-        "//tensorflow/python/data/experimental/ops:optimization",
-        "//tensorflow/python/data/kernel_tests:test_base",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//third_party/py/numpy",
-    ],
-)
-
 py_test(
     name = "get_single_element_test",
     size = "small",
@@ -341,6 +322,8 @@ py_test(
         "//tensorflow/python:functional_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:session",
+        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python/data/experimental/ops:map_defun",
         "//tensorflow/python/data/kernel_tests:test_base",
     ],
@@ -489,6 +472,21 @@ py_library(
     ],
 )
 
+py_test(
+    name = "rebatch_dataset_test",
+    size = "small",
+    srcs = ["rebatch_dataset_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python/data/experimental/ops:batching",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
 py_test(
     name = "rejection_resample_test",
     size = "medium",
@@ -608,7 +606,7 @@ py_library(
 
 py_test(
     name = "sql_dataset_test",
-    size = "small",
+    size = "medium",
     srcs = ["sql_dataset_test.py"],
     srcs_version = "PY2AND3",
     tags = ["no_pip"],
@@ -622,7 +620,7 @@ py_test(
 
 py_test(
     name = "stats_dataset_ops_test",
-    size = "medium",
+    size = "large",
     srcs = ["stats_dataset_ops_test.py"],
     srcs_version = "PY2AND3",
     tags = [
@@ -658,6 +656,28 @@ py_library(
     ],
 )
 
+py_test(
+    name = "take_while_test",
+    size = "small",
+    srcs = ["take_while_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python/data/experimental/ops:take_while_ops",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/eager:context",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
 py_test(
     name = "tf_record_writer_test",
     size = "small",
diff --git a/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py b/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py
index 8264dee3c15da3e1c10751b9c3db3d1e2bc3f1ee..0bbf0e9a12ba3170bd3c69e43824322b8b1eb059 100644
--- a/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py
@@ -19,13 +19,17 @@ from __future__ import print_function
 
 import random
 
+from absl.testing import parameterized
+
 from tensorflow.python.data.experimental.ops import grouping
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
@@ -69,9 +73,139 @@ def _get_record_shape(sparse):
   return tensor_shape.TensorShape([None])
 
 
-class BucketBySequenceLengthTest(test_base.DatasetTestBase):
+@test_util.run_all_in_graph_and_eager_modes
+class BucketBySequenceLengthTest(test_base.DatasetTestBase,
+                                 parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      ("WithoutPadding", True),
+      ("WithPadding", False),
+  )
+  def testBucketDropReminder(self, param_no_padding):
+
+    boundaries = [10, 20, 30]
+    batch_sizes = [10, 8, 4, 2]
+    lengths = [8, 13, 25, 35]
+
+    n_bucket_elements = [28, 7, 6, 5]
+    n_expected_batches = 5
+
+    # Expected sequence lengths of the individual batches.
+    expected_lengths = []
+
+    # Expected sum of all batches with an equal sequence length.
+    # <seq-length>: <expected-total-sum>
+    expected_sums = dict()
+
+    # Expected batch sizes of batches depending on the sequence length.
+    # <seq-length>: [batch1_size, ..., batchN_size]
+    expected_batch_sizes = dict()
+
+    for length, batch_size, bucket_elements in zip(lengths, batch_sizes,
+                                                   n_bucket_elements):
+      # Calculate the expected sum across all batches of a specific sequence length.
+      expected_sums[length] = \
+          (bucket_elements - bucket_elements % batch_size) * length
+      # Calculate the expected occurrence of individual batch sizes.
+      expected_batch_sizes[length] = \
+          [batch_size] * (bucket_elements // batch_size)
+      # Calculate the expected occurence of individual sequence lengths.
+      expected_lengths.extend([length] * (bucket_elements // batch_size))
+
+    def build_dataset(sparse):
+
+      def _generator():
+        # Produce 1 batch for each bucket
+        elements = []
+        for bucket_elements, length in zip(n_bucket_elements, lengths):
+          # Using only full sequences (opposed to the strategy employed in `testBucket`) makes
+          # checking the sum a lot easier.
+          record_len = length
+          for _ in range(bucket_elements):
+            elements.append([1] * record_len)
+        random.shuffle(elements)
+        for el in elements:
+          yield (_format_record(el, sparse),)
+
+      dataset = dataset_ops.Dataset.from_generator(
+          _generator, (_get_record_type(sparse),), (_get_record_shape(sparse),))
+      if sparse:
+        dataset = dataset.map(lambda x: (_to_sparse_tensor(x),))
+      return dataset
+
+    def _test_bucket_by_padding(no_padding):
+      dataset = build_dataset(sparse=no_padding)
+      dataset = dataset.apply(
+          grouping.bucket_by_sequence_length(
+              _element_length_fn,
+              boundaries,
+              batch_sizes,
+              no_padding=no_padding,
+              drop_remainder=True))
+
+      get_next = self.getNext(dataset)
+      batches = []
+      for _ in range(n_expected_batches):
+        batch, = self.evaluate(get_next())
+        batches.append(batch)
+
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(get_next())
+
+      generated_lengths = []
+
+      # <seq-length>: <total-sum>
+      generated_sums = dict()
+
+      # <seq-length>: [<batch_size>, ...]
+      generated_batch_sizes = dict()
 
-  def testBucket(self):
+      for length, batch_size, bucket_elements in zip(lengths, batch_sizes,
+                                                     n_bucket_elements):
+        # Initialize the sum across all batches.
+        generated_sums[length] = 0
+        # Initialize the individual batch sizes.
+        generated_batch_sizes[length] = []
+
+      for batch in batches:
+        shape = batch.dense_shape if no_padding else batch.shape
+        length = shape[1]
+        generated_lengths.append(length)
+
+        batch_size = shape[0]
+        generated_batch_sizes[length].append(batch_size)
+
+        batch_sum = batch.values.sum() if no_padding else batch.sum()
+        generated_sums[length] += batch_sum
+
+      for l in lengths:
+        # Make sure the sum of the batch contents is correct for the individual sequence lengths.
+        self.assertEqual(
+            generated_sums[l], expected_sums[l], "Tensor sums did not match! "
+            "expected: {}, generated: {}".format(expected_sums, generated_sums))
+
+        # Make sure the individual batch sizes are generated as expected.
+        self.assertEqual(
+            sorted(generated_batch_sizes[l]), sorted(expected_batch_sizes[l]),
+            "Batch-sizes did not match! "
+            "expected: {}, generated: {}".format(
+                sorted(expected_batch_sizes[l]),
+                sorted(generated_batch_sizes[l])))
+
+      # Make sure the generated sequence lengths appear as often as expected.
+      self.assertEqual(
+          sorted(generated_lengths), sorted(expected_lengths),
+          "The generated sequence lengths did not match! "
+          "expected: {}, generated: {}".format(
+              sorted(expected_lengths), sorted(generated_lengths)))
+
+    _test_bucket_by_padding(param_no_padding)
+
+  @parameterized.named_parameters(
+      ("WithoutPadding", True),
+      ("WithPadding", False),
+  )
+  def testBucket(self, param_no_padding):
 
     boundaries = [10, 20, 30]
     batch_sizes = [10, 8, 4, 2]
@@ -105,14 +239,14 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase):
               boundaries,
               batch_sizes,
               no_padding=no_padding))
-      batch, = dataset_ops.make_one_shot_iterator(dataset).get_next()
-
-      with self.cached_session() as sess:
-        batches = []
-        for _ in range(4):
-          batches.append(self.evaluate(batch))
-        with self.assertRaises(errors.OutOfRangeError):
-          self.evaluate(batch)
+      get_next = self.getNext(dataset)
+      batches = []
+      for _ in range(4):
+        batch, = self.evaluate(get_next())
+        batches.append(batch)
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(get_next())
+
       batch_sizes_val = []
       lengths_val = []
       for batch in batches:
@@ -121,14 +255,14 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase):
         length = shape[1]
         batch_sizes_val.append(batch_size)
         lengths_val.append(length)
-        sum_check = batch.values.sum() if no_padding else batch.sum()
-        self.assertEqual(sum_check, batch_size * length - 1)
+        if not context.executing_eagerly():
+          sum_check = batch.values.sum() if no_padding else batch.sum()
+          self.assertEqual(sum_check, batch_size * length - 1)
       self.assertEqual(sum(batch_sizes_val), sum(batch_sizes))
       self.assertEqual(sorted(batch_sizes), sorted(batch_sizes_val))
       self.assertEqual(sorted(lengths), sorted(lengths_val))
 
-    for no_padding in (True, False):
-      _test_bucket_by_padding(no_padding)
+    _test_bucket_by_padding(param_no_padding)
 
   def testPadToBoundary(self):
 
@@ -155,14 +289,15 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase):
             grouping.bucket_by_sequence_length(
                 element_len, boundaries, batch_sizes,
                 pad_to_bucket_boundary=True))
-    batch, = dataset_ops.make_one_shot_iterator(dataset).get_next()
+    get_next = self.getNext(dataset)
+
+    batches = []
+    for _ in range(3):
+      batch, = self.evaluate(get_next())
+      batches.append(batch)
+    with self.assertRaisesOpError("bucket_boundaries"):
+      self.evaluate(get_next())
 
-    with self.cached_session() as sess:
-      batches = []
-      for _ in range(3):
-        batches.append(self.evaluate(batch))
-      with self.assertRaisesOpError("bucket_boundaries"):
-        self.evaluate(batch)
     batch_sizes_val = []
     lengths_val = []
     for batch in batches:
@@ -192,14 +327,14 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase):
             grouping.bucket_by_sequence_length(
                 element_len, boundaries, batch_sizes,
                 pad_to_bucket_boundary=True))
-    batch, = dataset_ops.make_one_shot_iterator(dataset).get_next()
+    get_next = self.getNext(dataset)
 
-    with self.cached_session() as sess:
-      batches = []
-      for _ in range(5):
-        batches.append(self.evaluate(batch))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(batch)
+    batches = []
+    for _ in range(5):
+      batch, = self.evaluate(get_next())
+      batches.append(batch)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
     self.assertAllEqual(batches[0], [[1, 0],
                                      [1, 1]])
@@ -212,7 +347,11 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase):
     self.assertAllEqual(batches[4], [[1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
                                      [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
 
-  def testTupleElements(self):
+  @parameterized.named_parameters(
+      ("WithoutPadding", True),
+      ("WithPadding", False),
+  )
+  def testTupleElements(self, param_no_padding):
 
     def build_dataset(sparse):
       def _generator():
@@ -240,10 +379,13 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase):
       self.assertEqual([None, None], shapes[0].as_list())
       self.assertEqual([None], shapes[1].as_list())
 
-    for no_padding in (True, False):
-      _test_tuple_elements_by_padding(no_padding)
+    _test_tuple_elements_by_padding(param_no_padding)
 
-  def testBucketSparse(self):
+  @parameterized.named_parameters(
+      ("DoDropRemainder", True),
+      ("DoNotDropRemainder", False),
+  )
+  def testBucketSparse(self, param_drop_remainder):  # pylint: disable=g-doc-args
     """Tests bucketing of sparse tensors (case where `no_padding` == True).
 
     Test runs on following dataset:
@@ -274,11 +416,16 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase):
       dataset = dataset.map(_to_sparse_tensor)
       return dataset
 
-    def _compute_expected_batches():
+    def _compute_expected_batches(drop_remainder):
       """Computes expected batch outputs and stores in a set."""
       all_expected_sparse_tensors = set()
       for bucket_start_len in range(min_len, max_len, bucket_size):
-        for batch_offset in range(0, bucket_size, batch_size):
+        if drop_remainder:
+          batch_offsets = [0]
+        else:
+          batch_offsets = range(0, bucket_size, batch_size)
+
+        for batch_offset in batch_offsets:
           batch_start_len = bucket_start_len + batch_offset
           batch_end_len = min(batch_start_len + batch_size,
                               bucket_start_len + bucket_size)
@@ -295,26 +442,26 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase):
 
     def _compute_batches(dataset):
       """Computes actual batch outputs of dataset and stores in a set."""
-      batch = dataset_ops.make_one_shot_iterator(dataset).get_next()
+      batch = self.getNext(dataset)
       all_sparse_tensors = set()
-      with self.cached_session() as sess:
-        with self.assertRaises(errors.OutOfRangeError):
-          while True:
-            output = self.evaluate(batch)
-            sprs_tensor = (tuple([tuple(idx) for idx in output.indices]),
-                           tuple(output.values))
-            all_sparse_tensors.add(sprs_tensor)
-      return all_sparse_tensors
+      with self.assertRaises(errors.OutOfRangeError):
+        while True:
+          output = self.evaluate(batch())
+          sprs_tensor = (tuple([tuple(idx) for idx in output.indices]),
+                         tuple(output.values))
+          all_sparse_tensors.add(sprs_tensor)
 
+      return all_sparse_tensors
     dataset = _build_dataset()
     boundaries = range(min_len + bucket_size + 1, max_len, bucket_size)
-    dataset = dataset.apply(grouping.bucket_by_sequence_length(
-        _element_length_fn,
-        boundaries,
-        [batch_size] * (len(boundaries) + 1),
-        no_padding=True))
+    dataset = dataset.apply(
+        grouping.bucket_by_sequence_length(
+            _element_length_fn,
+            boundaries, [batch_size] * (len(boundaries) + 1),
+            no_padding=True,
+            drop_remainder=param_drop_remainder))
     batches = _compute_batches(dataset)
-    expected_batches = _compute_expected_batches()
+    expected_batches = _compute_expected_batches(param_drop_remainder)
     self.assertEqual(batches, expected_batches)
 
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/cardinality_test.py b/tensorflow/python/data/experimental/kernel_tests/cardinality_test.py
index 943f0f1f81272b334f0011a301636e9927c15b7c..993b511d5e3635b38e6e0a73f86c873a39a6c127 100644
--- a/tensorflow/python/data/experimental/kernel_tests/cardinality_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/cardinality_test.py
@@ -22,9 +22,11 @@ from absl.testing import parameterized
 from tensorflow.python.data.experimental.ops import cardinality
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class NumElementsTest(test_base.DatasetTestBase, parameterized.TestCase):
   """Tests for `tf.data.experimental.cardinality()`."""
 
@@ -47,8 +49,7 @@ class NumElementsTest(test_base.DatasetTestBase, parameterized.TestCase):
        lambda: dataset_ops.Dataset.range(5).filter(lambda _: True).concatenate(
            dataset_ops.Dataset.range(5)), cardinality.UNKNOWN),
       ("Concatenate3", lambda: dataset_ops.Dataset.range(5).repeat().
-       concatenate(dataset_ops.Dataset.range(5)),
-       cardinality.INFINITE),
+       concatenate(dataset_ops.Dataset.range(5)), cardinality.INFINITE),
       ("Concatenate4", lambda: dataset_ops.Dataset.range(5).concatenate(
           dataset_ops.Dataset.range(5).filter(lambda _: True)),
        cardinality.UNKNOWN),
@@ -68,8 +69,7 @@ class NumElementsTest(test_base.DatasetTestBase, parameterized.TestCase):
        lambda: dataset_ops.Dataset.range(5).repeat().concatenate(
            dataset_ops.Dataset.range(5).repeat()), cardinality.INFINITE),
       ("FlatMap", lambda: dataset_ops.Dataset.range(5).flat_map(
-          lambda _: dataset_ops.Dataset.from_tensors(0)),
-       cardinality.UNKNOWN),
+          lambda _: dataset_ops.Dataset.from_tensors(0)), cardinality.UNKNOWN),
       ("Filter", lambda: dataset_ops.Dataset.range(5).filter(lambda _: True),
        cardinality.UNKNOWN),
       ("FromTensors1", lambda: dataset_ops.Dataset.from_tensors(0), 1),
@@ -115,6 +115,13 @@ class NumElementsTest(test_base.DatasetTestBase, parameterized.TestCase):
        cardinality.INFINITE),
       ("Shuffle", lambda: dataset_ops.Dataset.range(5).shuffle(buffer_size=1),
        5),
+      ("Shard1", lambda: dataset_ops.Dataset.range(5).shard(2, 0), 3),
+      ("Shard2", lambda: dataset_ops.Dataset.range(5).shard(8, 7), 0),
+      ("Shard3",
+       lambda: dataset_ops.Dataset.range(5).filter(lambda _: True).shard(2, 0),
+       cardinality.UNKNOWN),
+      ("Shard4", lambda: dataset_ops.Dataset.range(5).repeat().shard(2, 0),
+       cardinality.INFINITE),
       ("Skip1", lambda: dataset_ops.Dataset.range(5).skip(2), 3),
       ("Skip2", lambda: dataset_ops.Dataset.range(5).skip(8), 0),
       ("Skip3",
@@ -136,15 +143,13 @@ class NumElementsTest(test_base.DatasetTestBase, parameterized.TestCase):
        5),
       ("Zip2", lambda: dataset_ops.Dataset.zip(
           (dataset_ops.Dataset.range(5), dataset_ops.Dataset.range(3))), 3),
-      ("Zip3", lambda: dataset_ops.Dataset.zip(
-          (dataset_ops.Dataset.range(5),
-           dataset_ops.Dataset.range(3).repeat())), 5),
-      ("Zip4", lambda: dataset_ops.Dataset.zip(
-          (dataset_ops.Dataset.range(5).repeat(),
-           dataset_ops.Dataset.range(3).repeat())), cardinality.INFINITE),
-      ("Zip5", lambda: dataset_ops.Dataset.zip(
-          (dataset_ops.Dataset.range(5),
-           dataset_ops.Dataset.range(3).filter(lambda _: True))),
+      ("Zip3", lambda: dataset_ops.Dataset.zip((dataset_ops.Dataset.range(
+          5), dataset_ops.Dataset.range(3).repeat())), 5),
+      ("Zip4", lambda: dataset_ops.Dataset.zip((dataset_ops.Dataset.range(
+          5).repeat(), dataset_ops.Dataset.range(3).repeat())),
+       cardinality.INFINITE),
+      ("Zip5", lambda: dataset_ops.Dataset.zip((dataset_ops.Dataset.range(
+          5), dataset_ops.Dataset.range(3).filter(lambda _: True))),
        cardinality.UNKNOWN),
       # pylint: enable=g-long-lambda
   )
diff --git a/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py b/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py
index b8166fe8334a5117005b7194cd582287eac74dd7..d9fbe9e0e18c526e7e0bf88b9c3b477bf0917fe5 100644
--- a/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py
@@ -33,6 +33,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.util import compat as util_compat
 
 
+# TODO(b/117581999): add eager coverage when supported.
 class CopyToDeviceTest(test_base.DatasetTestBase):
 
   @test_util.run_deprecated_v1
diff --git a/tensorflow/python/data/experimental/kernel_tests/counter_test.py b/tensorflow/python/data/experimental/kernel_tests/counter_test.py
index 49e1f2272b7bea8f2d245d678711a3879774ba06..436fa506c419dd73bf1836b9ba5486f9d435105b 100644
--- a/tensorflow/python/data/experimental/kernel_tests/counter_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/counter_test.py
@@ -19,35 +19,31 @@ from __future__ import print_function
 
 from tensorflow.python.data.experimental.ops import counter
 from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class CounterTest(test_base.DatasetTestBase):
 
-  @test_util.run_deprecated_v1
   def testCounter(self):
     """Test dataset construction using `count`."""
-    iterator = dataset_ops.make_one_shot_iterator(
-        counter.Counter(start=3, step=4))
-    get_next = iterator.get_next()
-    self.assertEqual([], get_next.shape.as_list())
-    self.assertEqual(dtypes.int64, get_next.dtype)
-
-    negative_iterator = dataset_ops.make_one_shot_iterator(
-        counter.Counter(start=0, step=-1))
-    negative_get_next = negative_iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.assertEqual(3, self.evaluate(get_next))
-      self.assertEqual(3 + 4, self.evaluate(get_next))
-      self.assertEqual(3 + 2 * 4, self.evaluate(get_next))
-
-      self.assertEqual(0, self.evaluate(negative_get_next))
-      self.assertEqual(-1, self.evaluate(negative_get_next))
-      self.assertEqual(-2, self.evaluate(negative_get_next))
+    dataset = counter.Counter(start=3, step=4)
+    self.assertEqual([], dataset.output_shapes.as_list())
+    self.assertEqual(dtypes.int64, dataset.output_types)
+    get_next = self.getNext(dataset)
+
+    negative_dataset = counter.Counter(start=0, step=-1)
+    negative_get_next = self.getNext(negative_dataset)
+
+    self.assertEqual(3, self.evaluate(get_next()))
+    self.assertEqual(3 + 4, self.evaluate(get_next()))
+    self.assertEqual(3 + 2 * 4, self.evaluate(get_next()))
+
+    self.assertEqual(0, self.evaluate(negative_get_next()))
+    self.assertEqual(-1, self.evaluate(negative_get_next()))
+    self.assertEqual(-2, self.evaluate(negative_get_next()))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/csv_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/csv_dataset_test.py
index b2f1b43ecf6f82725143c95af4d6f4df58e41903..e523f36639db78e7206b2ae989d5187e85c0f24b 100644
--- a/tensorflow/python/data/experimental/kernel_tests/csv_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/csv_dataset_test.py
@@ -89,14 +89,12 @@ class CsvDatasetTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         self.evaluate(nxt())
     else:
-      # Verify that OpError is produced as expected
-      with self.assertRaisesOpError(expected_err_re):
-        nxt = self.getNext(dataset)
-        while True:
-          try:
-            self.evaluate(nxt())
-          except errors.OutOfRangeError:
-            break
+      nxt = self.getNext(dataset)
+      while True:
+        try:
+          self.evaluate(nxt())
+        except errors.OutOfRangeError:
+          break
 
   def _test_dataset(
       self,
@@ -110,8 +108,14 @@ class CsvDatasetTest(test_base.DatasetTestBase):
     # Convert str type because py3 tf strings are bytestrings
     filenames = self._setup_files(inputs, linebreak, compression_type)
     kwargs['compression_type'] = compression_type
-    dataset = readers.CsvDataset(filenames, **kwargs)
-    self._verify_output_or_err(dataset, expected_output, expected_err_re)
+    if expected_err_re is not None:
+      # Verify that OpError is produced as expected
+      with self.assertRaisesOpError(expected_err_re):
+        dataset = readers.CsvDataset(filenames, **kwargs)
+        self._verify_output_or_err(dataset, expected_output, expected_err_re)
+    else:
+      dataset = readers.CsvDataset(filenames, **kwargs)
+      self._verify_output_or_err(dataset, expected_output, expected_err_re)
 
   def testCsvDataset_requiredFields(self):
     record_defaults = [[]] * 4
diff --git a/tensorflow/python/data/experimental/kernel_tests/dense_to_sparse_batch_test.py b/tensorflow/python/data/experimental/kernel_tests/dense_to_sparse_batch_test.py
index 22e057a2848fd154de0ad356f2238fb2028cd647..cca7ae073ee07124715725c5913036cb41a37950 100644
--- a/tensorflow/python/data/experimental/kernel_tests/dense_to_sparse_batch_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/dense_to_sparse_batch_test.py
@@ -22,105 +22,87 @@ import numpy as np
 from tensorflow.python.data.experimental.ops import batching
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class DenseToSparseBatchTest(test_base.DatasetTestBase):
 
-  @test_util.run_deprecated_v1
   def testDenseToSparseBatchDataset(self):
     components = np.random.randint(12, size=(100,)).astype(np.int32)
-    iterator = dataset_ops.make_initializable_iterator(
-        dataset_ops.Dataset.from_tensor_slices(components)
-        .map(lambda x: array_ops.fill([x], x)).apply(
-            batching.dense_to_sparse_batch(4, [12])))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.evaluate(init_op)
-
-      for start in range(0, len(components), 4):
-        results = self.evaluate(get_next)
-        self.assertAllEqual([[i, j]
-                             for i, c in enumerate(components[start:start + 4])
-                             for j in range(c)], results.indices)
-        self.assertAllEqual(
-            [c for c in components[start:start + 4] for _ in range(c)],
-            results.values)
-        self.assertAllEqual([min(4,
-                                 len(components) - start), 12],
-                            results.dense_shape)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
-
-  @test_util.run_deprecated_v1
+    dataset = dataset_ops.Dataset.from_tensor_slices(
+        components).map(lambda x: array_ops.fill([x], x)).apply(
+            batching.dense_to_sparse_batch(4, [12]))
+    get_next = self.getNext(dataset)
+
+    for start in range(0, len(components), 4):
+      results = self.evaluate(get_next())
+      self.assertAllEqual([[i, j]
+                           for i, c in enumerate(components[start:start + 4])
+                           for j in range(c)], results.indices)
+      self.assertAllEqual(
+          [c for c in components[start:start + 4] for _ in range(c)],
+          results.values)
+      self.assertAllEqual([min(4,
+                               len(components) - start), 12],
+                          results.dense_shape)
+
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
   def testDenseToSparseBatchDatasetWithUnknownShape(self):
     components = np.random.randint(5, size=(40,)).astype(np.int32)
-    iterator = dataset_ops.make_initializable_iterator(
-        dataset_ops.Dataset.from_tensor_slices(components)
-        .map(lambda x: array_ops.fill([x, x], x)).apply(
-            batching.dense_to_sparse_batch(4, [5, None])))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.evaluate(init_op)
-
-      for start in range(0, len(components), 4):
-        results = self.evaluate(get_next)
-        self.assertAllEqual([[i, j, z]
-                             for i, c in enumerate(components[start:start + 4])
-                             for j in range(c)
-                             for z in range(c)], results.indices)
-        self.assertAllEqual([
-            c
-            for c in components[start:start + 4] for _ in range(c)
-            for _ in range(c)
-        ], results.values)
-        self.assertAllEqual([
-            min(4,
-                len(components) - start), 5,
-            np.max(components[start:start + 4])
-        ], results.dense_shape)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
-
-  @test_util.run_deprecated_v1
+    dataset = dataset_ops.Dataset.from_tensor_slices(
+        components).map(lambda x: array_ops.fill([x, x], x)).apply(
+            batching.dense_to_sparse_batch(4, [5, None]))
+
+    get_next = self.getNext(dataset)
+
+    for start in range(0, len(components), 4):
+      results = self.evaluate(get_next())
+      self.assertAllEqual([[i, j, z]
+                           for i, c in enumerate(components[start:start + 4])
+                           for j in range(c)
+                           for z in range(c)], results.indices)
+      self.assertAllEqual([
+          c for c in components[start:start + 4] for _ in range(c)
+          for _ in range(c)
+      ], results.values)
+      self.assertAllEqual([
+          min(4,
+              len(components) - start), 5,
+          np.max(components[start:start + 4])
+      ], results.dense_shape)
+
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
   def testDenseToSparseBatchDatasetWithInvalidShape(self):
     input_tensor = array_ops.constant([[1]])
     with self.assertRaisesRegexp(ValueError, "Dimension -2 must be >= 0"):
-      dataset_ops.make_initializable_iterator(
-          dataset_ops.Dataset.from_tensors(input_tensor).apply(
-              batching.dense_to_sparse_batch(4, [-2])))
+      dataset_ops.Dataset.from_tensors(input_tensor).apply(
+          batching.dense_to_sparse_batch(4, [-2]))
 
-  @test_util.run_deprecated_v1
   def testDenseToSparseBatchDatasetShapeErrors(self):
-    input_tensor = array_ops.placeholder(dtypes.int32)
-    iterator = dataset_ops.make_initializable_iterator(
-        dataset_ops.Dataset.from_tensors(input_tensor).apply(
-            batching.dense_to_sparse_batch(4, [12])))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      # Initialize with an input tensor of incompatible rank.
-      sess.run(init_op, feed_dict={input_tensor: [[1]]})
-      with self.assertRaisesRegexp(errors.InvalidArgumentError,
-                                   "incompatible with the row shape"):
-        self.evaluate(get_next)
-
-      # Initialize with an input tensor that is larger than `row_shape`.
-      sess.run(init_op, feed_dict={input_tensor: range(13)})
-      with self.assertRaisesRegexp(errors.DataLossError,
-                                   "larger than the row shape"):
-        self.evaluate(get_next)
+
+    def dataset_fn(input_tensor):
+      return dataset_ops.Dataset.from_tensors(input_tensor).apply(
+          batching.dense_to_sparse_batch(4, [12]))
+
+    # Initialize with an input tensor of incompatible rank.
+    get_next = self.getNext(dataset_fn([[1]]))
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 "incompatible with the row shape"):
+      self.evaluate(get_next())
+
+    # Initialize with an input tensor that is larger than `row_shape`.
+    get_next = self.getNext(dataset_fn(np.int32(range(13))))
+    with self.assertRaisesRegexp(errors.DataLossError,
+                                 "larger than the row shape"):
+      self.evaluate(get_next())
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/directed_interleave_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/directed_interleave_dataset_test.py
index 214434206669299cf545d68bdc330b1a548b4710..df69a9dbb01b6f7049f76a83df682232d4012ead 100644
--- a/tensorflow/python/data/experimental/kernel_tests/directed_interleave_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/directed_interleave_dataset_test.py
@@ -28,9 +28,9 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class DirectedInterleaveDatasetTest(test_base.DatasetTestBase):
 
-  @test_util.run_deprecated_v1
   def testBasic(self):
     selector_dataset = dataset_ops.Dataset.range(10).repeat(100)
     input_datasets = [
@@ -38,16 +38,13 @@ class DirectedInterleaveDatasetTest(test_base.DatasetTestBase):
     ]
     dataset = interleave_ops._DirectedInterleaveDataset(selector_dataset,
                                                         input_datasets)
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    next_element = iterator.get_next()
+    next_element = self.getNext(dataset)
 
-    with self.cached_session() as sess:
-      self.evaluate(iterator.initializer)
-      for _ in range(100):
-        for i in range(10):
-          self.assertEqual(i, self.evaluate(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
+    for _ in range(100):
+      for i in range(10):
+        self.assertEqual(i, self.evaluate(next_element()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
   def _normalize(self, vec):
     return vec / vec.sum()
@@ -67,19 +64,16 @@ class DirectedInterleaveDatasetTest(test_base.DatasetTestBase):
         for i in range(num_datasets)
     ], weights)
     dataset = dataset.take(num_samples)
-    iterator = dataset_ops.make_one_shot_iterator(dataset)
-    next_element = iterator.get_next()
 
-    with self.cached_session() as sess:
-      freqs = np.zeros([num_datasets])
-      for _ in range(num_samples):
-        freqs[self.evaluate(next_element)] += 1
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
+    next_element = self.getNext(dataset)
+    freqs = np.zeros([num_datasets])
+    for _ in range(num_samples):
+      freqs[self.evaluate(next_element())] += 1
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
     return freqs
 
-  @test_util.run_deprecated_v1
   def testSampleFromDatasets(self):
     random_seed.set_random_seed(1619)
     num_samples = 5000
@@ -99,21 +93,17 @@ class DirectedInterleaveDatasetTest(test_base.DatasetTestBase):
       freqs = self._testSampleFromDatasetsHelper(probs_ds, classes, num_samples)
       self.assertLess(self._chi2(probs, freqs / num_samples), 1e-2)
 
-  @test_util.run_deprecated_v1
   def testSelectFromDatasets(self):
     words = [b"foo", b"bar", b"baz"]
     datasets = [dataset_ops.Dataset.from_tensors(w).repeat() for w in words]
     choice_array = np.random.randint(3, size=(15,), dtype=np.int64)
     choice_dataset = dataset_ops.Dataset.from_tensor_slices(choice_array)
     dataset = interleave_ops.choose_from_datasets(datasets, choice_dataset)
-    iterator = dataset_ops.make_one_shot_iterator(dataset)
-    next_element = iterator.get_next()
-
-    with self.cached_session() as sess:
-      for i in choice_array:
-        self.assertEqual(words[i], self.evaluate(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
+    next_element = self.getNext(dataset)
+    for i in choice_array:
+      self.assertEqual(words[i], self.evaluate(next_element()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
   def testErrors(self):
     with self.assertRaisesRegexp(ValueError,
diff --git a/tensorflow/python/data/experimental/kernel_tests/enumerate_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/enumerate_dataset_test.py
index 25742098f18787bc1d2e5bfd9c8717a777b8312c..cbc048e3ab460c9bc3bf4efa63221f814075f4ac 100644
--- a/tensorflow/python/data/experimental/kernel_tests/enumerate_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/enumerate_dataset_test.py
@@ -22,37 +22,28 @@ from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class EnumerateDatasetTest(test_base.DatasetTestBase):
 
-  @test_util.run_deprecated_v1
   def testEnumerateDataset(self):
     components = (["a", "b"], [1, 2], [37.0, 38])
     start = constant_op.constant(20, dtype=dtypes.int64)
 
-    iterator = dataset_ops.make_initializable_iterator(
-        dataset_ops.Dataset.from_tensor_slices(components).apply(
-            enumerate_ops.enumerate_dataset(start)))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    dataset = dataset_ops.Dataset.from_tensor_slices(components).apply(
+        enumerate_ops.enumerate_dataset(start))
 
-    self.assertEqual(dtypes.int64, get_next[0].dtype)
-    self.assertEqual((), get_next[0].shape)
+    self.assertEqual(dtypes.int64, dataset.output_types[0])
+    self.assertEqual((), dataset.output_shapes[0])
     self.assertEqual([tensor_shape.TensorShape([])] * 3,
-                     [t.shape for t in get_next[1]])
+                     [shape for shape in dataset.output_shapes[1]])
 
-    with self.cached_session() as sess:
-      self.evaluate(init_op)
-      self.assertEqual((20, (b"a", 1, 37.0)), self.evaluate(get_next))
-      self.assertEqual((21, (b"b", 2, 38.0)), self.evaluate(get_next))
-
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+    self.assertDatasetProduces(dataset, [(20, (b"a", 1, 37.0)),
+                                         (21, (b"b", 2, 38.0))])
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/filter_dataset_op_test.py b/tensorflow/python/data/experimental/kernel_tests/filter_dataset_op_test.py
deleted file mode 100644
index 357b5f1b49b9f75e187fc02a5a89907baa445a76..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/experimental/kernel_tests/filter_dataset_op_test.py
+++ /dev/null
@@ -1,76 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Benchmarks FilterDataset input pipeline op."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import time
-
-import numpy as np
-
-from tensorflow.python.client import session
-from tensorflow.python.data.experimental.ops import optimization
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.platform import test
-
-
-class FilterBenchmark(test.Benchmark):
-
-  # This benchmark compares the performance of pipeline with multiple chained
-  # filter with and without filter fusion.
-  def benchmarkFilters(self):
-    chain_lengths = [0, 1, 2, 5, 10, 20, 50]
-    for chain_length in chain_lengths:
-      self._benchmarkFilters(chain_length, False)
-      self._benchmarkFilters(chain_length, True)
-
-  def _benchmarkFilters(self, chain_length, optimize_dataset):
-    with ops.Graph().as_default():
-      dataset = dataset_ops.Dataset.from_tensors(5).repeat(None)
-      for _ in range(chain_length):
-        dataset = dataset.filter(lambda x: math_ops.greater_equal(x - 5, 0))
-      if optimize_dataset:
-        dataset = dataset.apply(optimization.optimize(["filter_fusion"]))
-
-      iterator = dataset_ops.make_one_shot_iterator(dataset)
-      next_element = iterator.get_next()
-
-      with session.Session() as sess:
-        for _ in range(10):
-          self.evaluate(next_element.op)
-        deltas = []
-        for _ in range(100):
-          start = time.time()
-          for _ in range(100):
-            self.evaluate(next_element.op)
-          end = time.time()
-          deltas.append(end - start)
-
-        median_wall_time = np.median(deltas) / 100
-        opt_mark = "opt" if optimize_dataset else "no-opt"
-        print("Filter dataset {} chain length: {} Median wall time: {}".format(
-            opt_mark, chain_length, median_wall_time))
-        self.report_benchmark(
-            iters=1000,
-            wall_time=median_wall_time,
-            name="benchmark_filter_dataset_chain_latency_{}_{}".format(
-                opt_mark, chain_length))
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py b/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py
index ef576563a15a7385d450e4f254e1cb579f79ce8c..f65740c56518c2c0baa1d1d56cac5e0314db4b97 100644
--- a/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py
@@ -22,14 +22,16 @@ from absl.testing import parameterized
 from tensorflow.python.data.experimental.ops import get_single_element
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import dtypes
+from tensorflow.python.eager import function
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class GetSingleElementTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   @parameterized.named_parameters(
@@ -40,34 +42,25 @@ class GetSingleElementTest(test_base.DatasetTestBase, parameterized.TestCase):
       ("MoreThanOne", 0, 2, errors.InvalidArgumentError,
        "Dataset had more than one element."),
   )
-  @test_util.run_deprecated_v1
   def testGetSingleElement(self, skip, take, error=None, error_msg=None):
-    skip_t = array_ops.placeholder(dtypes.int64, shape=[])
-    take_t = array_ops.placeholder(dtypes.int64, shape=[])
 
     def make_sparse(x):
       x_1d = array_ops.reshape(x, [1])
       x_2d = array_ops.reshape(x, [1, 1])
       return sparse_tensor.SparseTensor(x_2d, x_1d, x_1d)
 
-    dataset = dataset_ops.Dataset.range(100).skip(skip_t).map(
-        lambda x: (x * x, make_sparse(x))).take(take_t)
-    element = get_single_element.get_single_element(dataset)
-
-    with self.cached_session() as sess:
-      if error is None:
-        dense_val, sparse_val = sess.run(
-            element, feed_dict={
-                skip_t: skip,
-                take_t: take
-            })
-        self.assertEqual(skip * skip, dense_val)
-        self.assertAllEqual([[skip]], sparse_val.indices)
-        self.assertAllEqual([skip], sparse_val.values)
-        self.assertAllEqual([skip], sparse_val.dense_shape)
-      else:
-        with self.assertRaisesRegexp(error, error_msg):
-          sess.run(element, feed_dict={skip_t: skip, take_t: take})
+    dataset = dataset_ops.Dataset.range(100).skip(
+        skip).map(lambda x: (x * x, make_sparse(x))).take(take)
+    if error is None:
+      dense_val, sparse_val = self.evaluate(
+          get_single_element.get_single_element(dataset))
+      self.assertEqual(skip * skip, dense_val)
+      self.assertAllEqual([[skip]], sparse_val.indices)
+      self.assertAllEqual([skip], sparse_val.values)
+      self.assertAllEqual([skip], sparse_val.dense_shape)
+    else:
+      with self.assertRaisesRegexp(error, error_msg):
+        self.evaluate(get_single_element.get_single_element(dataset))
 
   def testWindow(self):
     """Test that `get_single_element()` can consume a nested dataset."""
@@ -80,6 +73,52 @@ class GetSingleElementTest(test_base.DatasetTestBase, parameterized.TestCase):
     self.assertDatasetProduces(
         dataset, [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]])
 
+  def testSideEffect(self):
+    counter_var = variables.Variable(0)
+
+    def increment_fn(x):
+      counter_var.assign_add(1)
+      return x
+
+    def dataset_fn():
+      return dataset_ops.Dataset.range(1).map(increment_fn)
+
+    @function.defun
+    def fn():
+      _ = get_single_element.get_single_element(dataset_fn())
+      return "hello"
+
+    self.evaluate(counter_var.initializer)
+    self.assertEqual(self.evaluate(fn()), b"hello")
+    self.assertEqual(self.evaluate(counter_var), 1)
+
+  def testAutomaticControlDependencies(self):
+    counter_var = variables.Variable(1)
+
+    def increment_fn(x):
+      counter_var.assign(counter_var + 1)
+      return x
+
+    def multiply_fn(x):
+      counter_var.assign(counter_var * 2)
+      return x
+
+    def dataset1_fn():
+      return dataset_ops.Dataset.range(1).map(increment_fn)
+
+    def dataset2_fn():
+      return dataset_ops.Dataset.range(1).map(multiply_fn)
+
+    @function.defun
+    def fn():
+      _ = get_single_element.get_single_element(dataset1_fn())
+      _ = get_single_element.get_single_element(dataset2_fn())
+      return "hello"
+
+    self.evaluate(counter_var.initializer)
+    self.assertEqual(self.evaluate(fn()), b"hello")
+    self.assertEqual(self.evaluate(counter_var), 4)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py b/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py
index 8507df3d3a27ea62c9d866c94af589fbc566317e..4194f06a34a8008ac2ed835b5300959bda9e3f78 100644
--- a/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py
@@ -33,19 +33,9 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class GroupByReducerTest(test_base.DatasetTestBase):
 
-  def checkResults(self, dataset, shapes, values):
-    self.assertEqual(shapes, dataset.output_shapes)
-    get_next = dataset_ops.make_one_shot_iterator(dataset).get_next()
-    with self.cached_session() as sess:
-      for expected in values:
-        got = self.evaluate(get_next)
-        self.assertEqual(got, expected)
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
-
-  @test_util.run_deprecated_v1
   def testSum(self):
     reducer = grouping.Reducer(
         init_func=lambda _: np.int64(0),
@@ -54,10 +44,11 @@ class GroupByReducerTest(test_base.DatasetTestBase):
     for i in range(1, 11):
       dataset = dataset_ops.Dataset.range(2 * i).apply(
           grouping.group_by_reducer(lambda x: x % 2, reducer))
-      self.checkResults(
-          dataset, shapes=tensor_shape.scalar(), values=[(i - 1) * i, i * i])
+      self.assertDatasetProduces(
+          dataset,
+          expected_shapes=tensor_shape.scalar(),
+          expected_output=[(i - 1) * i, i * i])
 
-  @test_util.run_deprecated_v1
   def testAverage(self):
 
     def reduce_fn(x, y):
@@ -72,10 +63,11 @@ class GroupByReducerTest(test_base.DatasetTestBase):
       dataset = dataset_ops.Dataset.range(2 * i).apply(
           grouping.group_by_reducer(
               lambda x: math_ops.cast(x, dtypes.int64) % 2, reducer))
-      self.checkResults(
-          dataset, shapes=tensor_shape.scalar(), values=[i - 1, i])
+      self.assertDatasetProduces(
+          dataset,
+          expected_shapes=tensor_shape.scalar(),
+          expected_output=[i - 1, i])
 
-  @test_util.run_deprecated_v1
   def testConcat(self):
     components = np.array(list("abcdefghijklmnopqrst")).view(np.chararray)
     reducer = grouping.Reducer(
@@ -87,12 +79,11 @@ class GroupByReducerTest(test_base.DatasetTestBase):
           (dataset_ops.Dataset.from_tensor_slices(components),
            dataset_ops.Dataset.range(2 * i))).apply(
                grouping.group_by_reducer(lambda x, y: y % 2, reducer))
-      self.checkResults(
+      self.assertDatasetProduces(
           dataset,
-          shapes=tensor_shape.scalar(),
-          values=[b"acegikmoqs" [:i], b"bdfhjlnprt" [:i]])
+          expected_shapes=tensor_shape.scalar(),
+          expected_output=[b"acegikmoqs" [:i], b"bdfhjlnprt" [:i]])
 
-  @test_util.run_deprecated_v1
   def testSparseSum(self):
     def _sparse(i):
       return sparse_tensor.SparseTensorValue(
@@ -107,10 +98,11 @@ class GroupByReducerTest(test_base.DatasetTestBase):
     for i in range(1, 11):
       dataset = dataset_ops.Dataset.range(2 * i).map(_sparse).apply(
           grouping.group_by_reducer(lambda x: x.values[0] % 2, reducer))
-      self.checkResults(
-          dataset, shapes=tensor_shape.scalar(), values=[(i - 1) * i, i * i])
+      self.assertDatasetProduces(
+          dataset,
+          expected_shapes=tensor_shape.scalar(),
+          expected_output=[(i - 1) * i, i * i])
 
-  @test_util.run_deprecated_v1
   def testChangingStateShape(self):
 
     def reduce_fn(x, _):
@@ -130,14 +122,12 @@ class GroupByReducerTest(test_base.DatasetTestBase):
           grouping.group_by_reducer(lambda x: x, reducer))
       self.assertEqual([None], dataset.output_shapes[0].as_list())
       self.assertIs(None, dataset.output_shapes[1].ndims)
-      iterator = dataset_ops.make_one_shot_iterator(dataset)
-      get_next = iterator.get_next()
-      with self.cached_session() as sess:
-        x, y = self.evaluate(get_next)
-        self.assertAllEqual([0] * (2**i), x)
-        self.assertAllEqual(np.array(1, ndmin=i), y)
-        with self.assertRaises(errors.OutOfRangeError):
-          self.evaluate(get_next)
+      get_next = self.getNext(dataset)
+      x, y = self.evaluate(get_next())
+      self.assertAllEqual([0] * (2**i), x)
+      self.assertAllEqual(np.array(1, ndmin=i), y)
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(get_next())
 
   def testTypeMismatch(self):
     reducer = grouping.Reducer(
@@ -194,11 +184,10 @@ class GroupByReducerTest(test_base.DatasetTestBase):
     dataset = dataset_ops.Dataset.zip(
         (dataset_ops.Dataset.range(10), dataset_ops.Dataset.range(10))).apply(
             grouping.group_by_reducer(lambda x, y: np.int64(0), reducer))
-    get_next = dataset_ops.make_one_shot_iterator(dataset).get_next()
-    with self.cached_session() as sess:
-      x, y = self.evaluate(get_next)
-      self.assertAllEqual(x, np.asarray([x for x in range(10)]))
-      self.assertEqual(y, 45)
+    get_next = self.getNext(dataset)
+    x, y = self.evaluate(get_next())
+    self.assertAllEqual(x, np.asarray([x for x in range(10)]))
+    self.assertEqual(y, 45)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py b/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py
index cbb79e55f507a41c0522163dc0b68c56835891a6..d1270703c56138ca8546b04ce0e16b6c5da41fe9 100644
--- a/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py
@@ -37,6 +37,7 @@ from tensorflow.python.platform import test
 # NOTE(mrry): These tests are based on the tests in bucket_ops_test.py.
 # Currently, they use a constant batch size, though should be made to use a
 # different batch size per key.
+@test_util.run_all_in_graph_and_eager_modes
 class GroupByWindowTest(test_base.DatasetTestBase):
 
   def _dynamicPad(self, bucket, window, window_size):
@@ -50,101 +51,87 @@ class GroupByWindowTest(test_base.DatasetTestBase):
              32, (tensor_shape.TensorShape([]), tensor_shape.TensorShape(
                  [None]), tensor_shape.TensorShape([3])))))
 
-  @test_util.run_deprecated_v1
   def testSingleBucket(self):
 
     def _map_fn(v):
       return (v, array_ops.fill([v], v),
               array_ops.fill([3], string_ops.as_string(v)))
 
-    input_dataset = (
-        dataset_ops.Dataset.from_tensor_slices(math_ops.range(32)).map(_map_fn))
+    input_dataset = dataset_ops.Dataset.from_tensor_slices(
+        math_ops.range(32)).map(_map_fn)
 
     bucketed_dataset = input_dataset.apply(
         grouping.group_by_window(
             lambda x, y, z: 0,
             lambda k, bucket: self._dynamicPad(k, bucket, 32), 32))
+    get_next = self.getNext(bucketed_dataset)
 
-    iterator = dataset_ops.make_initializable_iterator(bucketed_dataset)
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    which_bucket, bucketed_values = self.evaluate(get_next())
 
-    with self.cached_session() as sess:
-      self.evaluate(init_op)
+    self.assertEqual(0, which_bucket)
 
-      which_bucket, bucketed_values = self.evaluate(get_next)
+    expected_scalar_int = np.arange(32, dtype=np.int64)
+    expected_unk_int64 = np.zeros((32, 31)).astype(np.int64)
+    for i in range(32):
+      expected_unk_int64[i, :i] = i
+    expected_vec3_str = np.vstack(3 * [np.arange(32).astype(bytes)]).T
 
-      self.assertEqual(0, which_bucket)
+    self.assertAllEqual(expected_scalar_int, bucketed_values[0])
+    self.assertAllEqual(expected_unk_int64, bucketed_values[1])
+    self.assertAllEqual(expected_vec3_str, bucketed_values[2])
 
-      expected_scalar_int = np.arange(32, dtype=np.int64)
-      expected_unk_int64 = np.zeros((32, 31)).astype(np.int64)
-      for i in range(32):
-        expected_unk_int64[i, :i] = i
-      expected_vec3_str = np.vstack(3 * [np.arange(32).astype(bytes)]).T
-
-      self.assertAllEqual(expected_scalar_int, bucketed_values[0])
-      self.assertAllEqual(expected_unk_int64, bucketed_values[1])
-      self.assertAllEqual(expected_vec3_str, bucketed_values[2])
-
-  @test_util.run_deprecated_v1
   def testEvenOddBuckets(self):
 
     def _map_fn(v):
       return (v, array_ops.fill([v], v),
               array_ops.fill([3], string_ops.as_string(v)))
 
-    input_dataset = (
-        dataset_ops.Dataset.from_tensor_slices(math_ops.range(64)).map(_map_fn))
+    input_dataset = dataset_ops.Dataset.from_tensor_slices(
+        math_ops.range(64)).map(_map_fn)
 
     bucketed_dataset = input_dataset.apply(
         grouping.group_by_window(
             lambda x, y, z: math_ops.cast(x % 2, dtypes.int64),
             lambda k, bucket: self._dynamicPad(k, bucket, 32), 32))
 
-    iterator = dataset_ops.make_initializable_iterator(bucketed_dataset)
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.evaluate(init_op)
-
-      # Get two minibatches (one containing even values, one containing odds)
-      which_bucket_even, bucketed_values_even = self.evaluate(get_next)
-      which_bucket_odd, bucketed_values_odd = self.evaluate(get_next)
-
-      # Count number of bucket_tensors.
-      self.assertEqual(3, len(bucketed_values_even))
-      self.assertEqual(3, len(bucketed_values_odd))
-
-      # Ensure bucket 0 was used for all minibatch entries.
-      self.assertAllEqual(0, which_bucket_even)
-      self.assertAllEqual(1, which_bucket_odd)
-
-      # Test the first bucket outputted, the events starting at 0
-      expected_scalar_int = np.arange(0, 32 * 2, 2, dtype=np.int64)
-      expected_unk_int64 = np.zeros((32, 31 * 2)).astype(np.int64)
-      for i in range(0, 32):
-        expected_unk_int64[i, :2 * i] = 2 * i
-        expected_vec3_str = np.vstack(
-            3 * [np.arange(0, 32 * 2, 2).astype(bytes)]).T
-
-      self.assertAllEqual(expected_scalar_int, bucketed_values_even[0])
-      self.assertAllEqual(expected_unk_int64, bucketed_values_even[1])
-      self.assertAllEqual(expected_vec3_str, bucketed_values_even[2])
-
-      # Test the second bucket outputted, the odds starting at 1
-      expected_scalar_int = np.arange(1, 32 * 2 + 1, 2, dtype=np.int64)
-      expected_unk_int64 = np.zeros((32, 31 * 2 + 1)).astype(np.int64)
-      for i in range(0, 32):
-        expected_unk_int64[i, :2 * i + 1] = 2 * i + 1
-        expected_vec3_str = np.vstack(
-            3 * [np.arange(1, 32 * 2 + 1, 2).astype(bytes)]).T
-
-      self.assertAllEqual(expected_scalar_int, bucketed_values_odd[0])
-      self.assertAllEqual(expected_unk_int64, bucketed_values_odd[1])
-      self.assertAllEqual(expected_vec3_str, bucketed_values_odd[2])
-
-  @test_util.run_deprecated_v1
+    get_next = self.getNext(bucketed_dataset)
+
+    # Get two minibatches (one containing even values, one containing odds)
+    which_bucket_even, bucketed_values_even = self.evaluate(get_next())
+    which_bucket_odd, bucketed_values_odd = self.evaluate(get_next())
+
+    # Count number of bucket_tensors.
+    self.assertEqual(3, len(bucketed_values_even))
+    self.assertEqual(3, len(bucketed_values_odd))
+
+    # Ensure bucket 0 was used for all minibatch entries.
+    self.assertAllEqual(0, which_bucket_even)
+    self.assertAllEqual(1, which_bucket_odd)
+
+    # Test the first bucket outputted, the events starting at 0
+    expected_scalar_int = np.arange(0, 32 * 2, 2, dtype=np.int64)
+    expected_unk_int64 = np.zeros((32, 31 * 2)).astype(np.int64)
+    for i in range(0, 32):
+      expected_unk_int64[i, :2 * i] = 2 * i
+      expected_vec3_str = np.vstack(
+          3 * [np.arange(0, 32 * 2, 2).astype(bytes)]).T
+
+    self.assertAllEqual(expected_scalar_int, bucketed_values_even[0])
+    self.assertAllEqual(expected_unk_int64, bucketed_values_even[1])
+    self.assertAllEqual(expected_vec3_str, bucketed_values_even[2])
+
+    # Test the second bucket outputted, the odds starting at 1
+    expected_scalar_int = np.arange(1, 32 * 2 + 1, 2, dtype=np.int64)
+    expected_unk_int64 = np.zeros((32, 31 * 2 + 1)).astype(np.int64)
+    for i in range(0, 32):
+      expected_unk_int64[i, :2 * i + 1] = 2 * i + 1
+      expected_vec3_str = np.vstack(
+          3 * [np.arange(1, 32 * 2 + 1, 2).astype(bytes)]).T
+
+    self.assertAllEqual(expected_scalar_int, bucketed_values_odd[0])
+    self.assertAllEqual(expected_unk_int64, bucketed_values_odd[1])
+    self.assertAllEqual(expected_vec3_str, bucketed_values_odd[2])
+
   def testEvenOddBucketsFilterOutAllOdd(self):
 
     def _map_fn(v):
@@ -164,35 +151,28 @@ class GroupByWindowTest(test_base.DatasetTestBase):
                    "z": tensor_shape.TensorShape([3])
                })))
 
-    input_dataset = (
-        dataset_ops.Dataset.from_tensor_slices(math_ops.range(128)).map(_map_fn)
-        .filter(lambda d: math_ops.equal(d["x"] % 2, 0)))
+    input_dataset = dataset_ops.Dataset.from_tensor_slices(math_ops.range(
+        128)).map(_map_fn).filter(lambda d: math_ops.equal(d["x"] % 2, 0))
 
     bucketed_dataset = input_dataset.apply(
         grouping.group_by_window(
             lambda d: math_ops.cast(d["x"] % 2, dtypes.int64),
             lambda k, bucket: _dynamic_pad_fn(k, bucket, 32), 32))
 
-    iterator = dataset_ops.make_initializable_iterator(bucketed_dataset)
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.evaluate(init_op)
+    get_next = self.getNext(bucketed_dataset)
 
-      # Get two minibatches ([0, 2, ...] and [64, 66, ...])
-      which_bucket0, bucketed_values_even0 = self.evaluate(get_next)
-      which_bucket1, bucketed_values_even1 = self.evaluate(get_next)
+    # Get two minibatches ([0, 2, ...] and [64, 66, ...])
+    which_bucket0, bucketed_values_even0 = self.evaluate(get_next())
+    which_bucket1, bucketed_values_even1 = self.evaluate(get_next())
 
-      # Ensure that bucket 1 was completely filtered out
-      self.assertAllEqual(0, which_bucket0)
-      self.assertAllEqual(0, which_bucket1)
-      self.assertAllEqual(
-          np.arange(0, 64, 2, dtype=np.int64), bucketed_values_even0["x"])
-      self.assertAllEqual(
-          np.arange(64, 128, 2, dtype=np.int64), bucketed_values_even1["x"])
+    # Ensure that bucket 1 was completely filtered out
+    self.assertAllEqual(0, which_bucket0)
+    self.assertAllEqual(0, which_bucket1)
+    self.assertAllEqual(
+        np.arange(0, 64, 2, dtype=np.int64), bucketed_values_even0["x"])
+    self.assertAllEqual(
+        np.arange(64, 128, 2, dtype=np.int64), bucketed_values_even1["x"])
 
-  @test_util.run_deprecated_v1
   def testDynamicWindowSize(self):
     components = np.arange(100).astype(np.int64)
 
@@ -207,111 +187,81 @@ class GroupByWindowTest(test_base.DatasetTestBase):
     dataset = dataset_ops.Dataset.from_tensor_slices(components).apply(
         grouping.group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(20),
                                  None, window_size_func))
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.evaluate(init_op)
-      with self.assertRaises(errors.OutOfRangeError):
-        batches = 0
-        while True:
-          result = self.evaluate(get_next)
-          is_even = all(x % 2 == 0 for x in result)
-          is_odd = all(x % 2 == 1 for x in result)
-          self.assertTrue(is_even or is_odd)
-          expected_batch_size = 5 if is_even else 10
-          self.assertEqual(expected_batch_size, result.shape[0])
-          batches += 1
-
-      self.assertEqual(batches, 15)
-
-  @test_util.run_deprecated_v1
+
+    get_next = self.getNext(dataset)
+    with self.assertRaises(errors.OutOfRangeError):
+      batches = 0
+      while True:
+        result = self.evaluate(get_next())
+        is_even = all(x % 2 == 0 for x in result)
+        is_odd = all(x % 2 == 1 for x in result)
+        self.assertTrue(is_even or is_odd)
+        expected_batch_size = 5 if is_even else 10
+        self.assertEqual(expected_batch_size, result.shape[0])
+        batches += 1
+
+    self.assertEqual(batches, 15)
+
   def testSimple(self):
     components = np.random.randint(100, size=(200,)).astype(np.int64)
-    iterator = dataset_ops.make_initializable_iterator(
-        dataset_ops.Dataset.from_tensor_slices(components).map(lambda x: x * x)
-        .apply(
+    dataset = dataset_ops.Dataset.from_tensor_slices(
+        components).map(lambda x: x * x).apply(
             grouping.group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(4),
-                                     4)))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.evaluate(init_op)
-      counts = []
-      with self.assertRaises(errors.OutOfRangeError):
-        while True:
-          result = self.evaluate(get_next)
-          self.assertTrue(
-              all(x % 2 == 0
-                  for x in result) or all(x % 2 == 1)
-              for x in result)
-          counts.append(result.shape[0])
-
-      self.assertEqual(len(components), sum(counts))
-      num_full_batches = len([c for c in counts if c == 4])
-      self.assertGreaterEqual(num_full_batches, 24)
-      self.assertTrue(all(c == 4 for c in counts[:num_full_batches]))
-
-  @test_util.run_deprecated_v1
+                                     4))
+    get_next = self.getNext(dataset)
+    counts = []
+    with self.assertRaises(errors.OutOfRangeError):
+      while True:
+        result = self.evaluate(get_next())
+        self.assertTrue(
+            all(x % 2 == 0 for x in result) or all(x % 2 == 1) for x in result)
+        counts.append(result.shape[0])
+
+    self.assertEqual(len(components), sum(counts))
+    num_full_batches = len([c for c in counts if c == 4])
+    self.assertGreaterEqual(num_full_batches, 24)
+    self.assertTrue(all(c == 4 for c in counts[:num_full_batches]))
+
   def testImmediateOutput(self):
     components = np.array(
         [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 0, 0, 2, 2, 0, 0], dtype=np.int64)
-    iterator = dataset_ops.make_initializable_iterator(
-        dataset_ops.Dataset.from_tensor_slices(components).repeat(-1).apply(
+    dataset = dataset_ops.Dataset.from_tensor_slices(components).repeat(
+        -1).apply(
             grouping.group_by_window(lambda x: x % 3, lambda _, xs: xs.batch(4),
-                                     4)))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.evaluate(init_op)
-      # The input is infinite, so this test demonstrates that:
-      # 1. We produce output without having to consume the entire input,
-      # 2. Different buckets can produce output at different rates, and
-      # 3. For deterministic input, the output is deterministic.
-      for _ in range(3):
-        self.assertAllEqual([0, 0, 0, 0], self.evaluate(get_next))
-        self.assertAllEqual([1, 1, 1, 1], self.evaluate(get_next))
-        self.assertAllEqual([2, 2, 2, 2], self.evaluate(get_next))
-        self.assertAllEqual([0, 0, 0, 0], self.evaluate(get_next))
-
-  @test_util.run_deprecated_v1
+                                     4))
+    get_next = self.getNext(dataset)
+    # The input is infinite, so this test demonstrates that:
+    # 1. We produce output without having to consume the entire input,
+    # 2. Different buckets can produce output at different rates, and
+    # 3. For deterministic input, the output is deterministic.
+    for _ in range(3):
+      self.assertAllEqual([0, 0, 0, 0], self.evaluate(get_next()))
+      self.assertAllEqual([1, 1, 1, 1], self.evaluate(get_next()))
+      self.assertAllEqual([2, 2, 2, 2], self.evaluate(get_next()))
+      self.assertAllEqual([0, 0, 0, 0], self.evaluate(get_next()))
+
   def testSmallGroups(self):
     components = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0], dtype=np.int64)
-    iterator = dataset_ops.make_initializable_iterator(
-        dataset_ops.Dataset.from_tensor_slices(components).apply(
-            grouping.group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(4),
-                                     4)))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.evaluate(init_op)
-      self.assertAllEqual([0, 0, 0, 0], self.evaluate(get_next))
-      self.assertAllEqual([1, 1, 1, 1], self.evaluate(get_next))
-      # The small outputs at the end are deterministically produced in key
-      # order.
-      self.assertAllEqual([0, 0, 0], self.evaluate(get_next))
-      self.assertAllEqual([1], self.evaluate(get_next))
-
-  @test_util.run_deprecated_v1
+    dataset = dataset_ops.Dataset.from_tensor_slices(components).apply(
+        grouping.group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(4), 4))
+    get_next = self.getNext(dataset)
+    self.assertAllEqual([0, 0, 0, 0], self.evaluate(get_next()))
+    self.assertAllEqual([1, 1, 1, 1], self.evaluate(get_next()))
+    # The small outputs at the end are deterministically produced in key
+    # order.
+    self.assertAllEqual([0, 0, 0], self.evaluate(get_next()))
+    self.assertAllEqual([1], self.evaluate(get_next()))
+
   def testEmpty(self):
-    iterator = dataset_ops.make_initializable_iterator(
-        dataset_ops.Dataset.range(4).apply(
-            grouping.group_by_window(lambda _: 0, lambda _, xs: xs, 0)))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.evaluate(init_op)
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          "Window size must be greater than zero, but got 0."):
-        print(self.evaluate(get_next))
-
-  @test_util.run_deprecated_v1
+    dataset = dataset_ops.Dataset.range(4).apply(
+        grouping.group_by_window(lambda _: 0, lambda _, xs: xs, 0))
+
+    get_next = self.getNext(dataset)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "Window size must be greater than zero, but got 0."):
+      print(self.evaluate(get_next()))
+
   def testReduceFuncError(self):
     components = np.random.randint(100, size=(200,)).astype(np.int64)
 
@@ -323,19 +273,13 @@ class GroupByWindowTest(test_base.DatasetTestBase):
           padded_shapes=(tensor_shape.TensorShape([]),
                          constant_op.constant([5], dtype=dtypes.int64) * -1))
 
-    iterator = dataset_ops.make_initializable_iterator(
-        dataset_ops.Dataset.from_tensor_slices(components)
-        .map(lambda x: (x, ops.convert_to_tensor([x * x]))).apply(
-            grouping.group_by_window(lambda x, _: x % 2, reduce_func, 32)))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    dataset = dataset_ops.Dataset.from_tensor_slices(
+        components).map(lambda x: (x, ops.convert_to_tensor([x * x]))).apply(
+            grouping.group_by_window(lambda x, _: x % 2, reduce_func, 32))
+    get_next = self.getNext(dataset)
+    with self.assertRaises(errors.InvalidArgumentError):
+      self.evaluate(get_next())
 
-    with self.cached_session() as sess:
-      self.evaluate(init_op)
-      with self.assertRaises(errors.InvalidArgumentError):
-        self.evaluate(get_next)
-
-  @test_util.run_deprecated_v1
   def testConsumeWindowDatasetMoreThanOnce(self):
     components = np.random.randint(50, size=(200,)).astype(np.int64)
 
@@ -349,26 +293,23 @@ class GroupByWindowTest(test_base.DatasetTestBase):
               4, padded_shapes=ops.convert_to_tensor([(key + 1) * 10])),
       ))
 
-    iterator = dataset_ops.make_initializable_iterator(
-        dataset_ops.Dataset.from_tensor_slices(components)
-        .map(lambda x: array_ops.fill([math_ops.cast(x, dtypes.int32)], x))
-        .apply(grouping.group_by_window(
+    dataset = dataset_ops.Dataset.from_tensor_slices(
+        components
+    ).map(lambda x: array_ops.fill([math_ops.cast(x, dtypes.int32)], x)).apply(
+        grouping.group_by_window(
             lambda x: math_ops.cast(array_ops.shape(x)[0] // 10, dtypes.int64),
-            reduce_func, 4)))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.evaluate(init_op)
-      counts = []
-      with self.assertRaises(errors.OutOfRangeError):
-        while True:
-          tight_result, multiple_of_10_result = self.evaluate(get_next)
-          self.assertEqual(0, multiple_of_10_result.shape[1] % 10)
-          self.assertAllEqual(tight_result,
-                              multiple_of_10_result[:, :tight_result.shape[1]])
-          counts.append(tight_result.shape[0])
-      self.assertEqual(len(components), sum(counts))
+            reduce_func, 4))
+
+    get_next = self.getNext(dataset)
+    counts = []
+    with self.assertRaises(errors.OutOfRangeError):
+      while True:
+        tight_result, multiple_of_10_result = self.evaluate(get_next())
+        self.assertEqual(0, multiple_of_10_result.shape[1] % 10)
+        self.assertAllEqual(tight_result,
+                            multiple_of_10_result[:, :tight_result.shape[1]])
+        counts.append(tight_result.shape[0])
+    self.assertEqual(len(components), sum(counts))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py b/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py
index 81f580fccbd6b0053eaa865408b4f8c5f95ba94f..1d02f4fb773537de3800d4039d10112e465df285 100644
--- a/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py
@@ -34,9 +34,9 @@ from tensorflow.python.util import compat
 _NUMPY_RANDOM_SEED = 42
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class IgnoreErrorsTest(test_base.DatasetTestBase):
 
-  @test_util.run_deprecated_v1
   def testMapIgnoreError(self):
     components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
 
@@ -44,18 +44,13 @@ class IgnoreErrorsTest(test_base.DatasetTestBase):
         dataset_ops.Dataset.from_tensor_slices(components)
         .map(lambda x: array_ops.check_numerics(x, "message")).apply(
             error_ops.ignore_errors()))
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.evaluate(init_op)
-      for x in [1., 2., 3., 5.]:
-        self.assertEqual(x, self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
-
-  @test_util.run_deprecated_v1
+    get_next = self.getNext(dataset)
+
+    for x in [1., 2., 3., 5.]:
+      self.assertEqual(x, self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
   def testParallelMapIgnoreError(self):
     components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
 
@@ -63,18 +58,13 @@ class IgnoreErrorsTest(test_base.DatasetTestBase):
         dataset_ops.Dataset.from_tensor_slices(components).map(
             lambda x: array_ops.check_numerics(x, "message"),
             num_parallel_calls=2).prefetch(2).apply(error_ops.ignore_errors()))
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.evaluate(init_op)
-      for x in [1., 2., 3., 5.]:
-        self.assertEqual(x, self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
-
-  @test_util.run_deprecated_v1
+    get_next = self.getNext(dataset)
+
+    for x in [1., 2., 3., 5.]:
+      self.assertEqual(x, self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
   def testReadFileIgnoreError(self):
 
     def write_string_to_file(value, filename):
@@ -91,28 +81,24 @@ class IgnoreErrorsTest(test_base.DatasetTestBase):
         dataset_ops.Dataset.from_tensor_slices(filenames).map(
             io_ops.read_file,
             num_parallel_calls=2).prefetch(2).apply(error_ops.ignore_errors()))
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      # All of the files are present.
-      self.evaluate(init_op)
-      for filename in filenames:
-        self.assertEqual(compat.as_bytes(filename), self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
-
-      # Delete one of the files.
-      os.remove(filenames[0])
-
-      # Attempting to read filenames[0] will fail, but ignore_errors()
-      # will catch the error.
-      self.evaluate(init_op)
-      for filename in filenames[1:]:
-        self.assertEqual(compat.as_bytes(filename), self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+    get_next = self.getNext(dataset)
+
+    # All of the files are present.
+    for filename in filenames:
+      self.assertEqual(compat.as_bytes(filename), self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+    # Delete one of the files.
+    os.remove(filenames[0])
+
+    # Attempting to read filenames[0] will fail, but ignore_errors()
+    # will catch the error.
+    get_next = self.getNext(dataset)
+    for filename in filenames[1:]:
+      self.assertEqual(compat.as_bytes(filename), self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/indexed_dataset_ops_test.py b/tensorflow/python/data/experimental/kernel_tests/indexed_dataset_ops_test.py
index c3c4ccd07708d2c7cfdc57c2a6fcbf320f1dfb36..79b8c492c1f09d6ef6df49c2c1d27569b095b9a7 100644
--- a/tensorflow/python/data/experimental/kernel_tests/indexed_dataset_ops_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/indexed_dataset_ops_test.py
@@ -25,14 +25,13 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class IndexedDatasetOpsTest(test_base.DatasetTestBase):
 
-  @test_util.run_deprecated_v1
   def testLowLevelIndexedDatasetOps(self):
     identity = ged_ops.experimental_identity_indexed_dataset(
         ops.convert_to_tensor(16, dtype=dtypes.uint64))
@@ -43,40 +42,34 @@ class IndexedDatasetOpsTest(test_base.DatasetTestBase):
         output_shapes=[[]])
     materialize = ged_ops.experimental_indexed_dataset_materialize(
         identity, handle)
-    index = array_ops.placeholder(dtypes.uint64)
     get_op = ged_ops.experimental_indexed_dataset_get(
-        handle, index, output_types=[dtypes.uint64], output_shapes=[[]])
+        handle, 3, output_types=[dtypes.uint64], output_shapes=[[]])
 
-    with self.cached_session() as sess:
-      self.evaluate(materialize)
-      self.assertEqual([3], sess.run(get_op, feed_dict={index: 3}))
+    self.evaluate(materialize)
+    self.assertEqual([3], self.evaluate(get_op))
 
+  # TODO(b/117581999): Eager mode not supported.
   @test_util.run_deprecated_v1
-  def testIdentityIndexedDataset(self):
+  def testSkipEagerIdentityIndexedDataset(self):
     ds = indexed_dataset_ops.IdentityIndexedDataset(16)
     materialized = ds.materialize()
-    with self.cached_session() as sess:
-      self.evaluate(materialized.initializer)
-      placeholder = array_ops.placeholder(dtypes.uint64, shape=[])
-      for i in range(16):
-        output = sess.run(
-            materialized.get(placeholder), feed_dict={placeholder: i})
-        self.assertEqual([i], output)
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(materialized.get(placeholder), feed_dict={placeholder: 16})
+    self.evaluate(materialized.initializer)
+    for i in range(16):
+      output = self.evaluate(materialized.get(i))
+      self.assertEqual([i], output)
+    with self.assertRaises(errors.InvalidArgumentError):
+      self.evaluate(materialized.get(16))
 
   @unittest.skip("Requisite functionality currently unimplemented.")
   def testIdentityIndexedDatasetIterator(self):
     ds = indexed_dataset_ops.IdentityIndexedDataset(16)
-    itr = ds.make_initializable_iterator()
-    n = itr.get_next()
-    with self.cached_session() as sess:
-      self.evaluate(itr.initializer)
-      for i in range(16):
-        output = self.evaluate(n)
-        self.assertEqual(i, output)
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(n)
+    n = self.getNext(ds)
+
+    for i in range(16):
+      output = self.evaluate(n())
+      self.assertEqual(i, output)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(n())
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py
index 7c78810494866cbd4cac4201d23182e083037e1c..1fb6971ecdec90964a6f860a797d7bf8ddf8bfb8 100644
--- a/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py
@@ -21,7 +21,6 @@ import numpy as np
 
 from tensorflow.python.data.experimental.kernel_tests import reader_dataset_ops_test_base
 from tensorflow.python.data.experimental.ops import readers
-from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import readers as core_readers
 from tensorflow.python.data.util import nest
 from tensorflow.python.framework import dtypes
@@ -33,78 +32,58 @@ from tensorflow.python.ops import parsing_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class MakeBatchedFeaturesDatasetTest(
     reader_dataset_ops_test_base.MakeBatchedFeaturesDatasetTestBase):
 
   def testRead(self):
     for batch_size in [1, 2]:
       for num_epochs in [1, 10]:
-        with ops.Graph().as_default() as g:
-          with self.session(graph=g) as sess:
-            # Basic test: read from file 0.
-            self.outputs = dataset_ops.make_one_shot_iterator(
-                self.make_batch_feature(
-                    filenames=self.test_filenames[0],
-                    label_key="label",
-                    num_epochs=num_epochs,
-                    batch_size=batch_size)).get_next()
-            self.verify_records(
-                sess,
-                batch_size,
-                0,
+        # Basic test: read from file 0.
+        self.outputs = self.getNext(
+            self.make_batch_feature(
+                filenames=self.test_filenames[0],
+                label_key="label",
                 num_epochs=num_epochs,
-                label_key_provided=True)
-            with self.assertRaises(errors.OutOfRangeError):
-              self._next_actual_batch(sess, label_key_provided=True)
-
-        with ops.Graph().as_default() as g:
-          with self.session(graph=g) as sess:
-            # Basic test: read from file 1.
-            self.outputs = dataset_ops.make_one_shot_iterator(
-                self.make_batch_feature(
-                    filenames=self.test_filenames[1],
-                    label_key="label",
-                    num_epochs=num_epochs,
-                    batch_size=batch_size)).get_next()
-            self.verify_records(
-                sess,
-                batch_size,
-                1,
+                batch_size=batch_size))
+        self.verify_records(
+            batch_size, 0, num_epochs=num_epochs, label_key_provided=True)
+        with self.assertRaises(errors.OutOfRangeError):
+          self._next_actual_batch(label_key_provided=True)
+
+          # Basic test: read from file 1.
+        self.outputs = self.getNext(
+            self.make_batch_feature(
+                filenames=self.test_filenames[1],
+                label_key="label",
                 num_epochs=num_epochs,
-                label_key_provided=True)
-            with self.assertRaises(errors.OutOfRangeError):
-              self._next_actual_batch(sess, label_key_provided=True)
-
-        with ops.Graph().as_default() as g:
-          with self.session(graph=g) as sess:
-            # Basic test: read from both files.
-            self.outputs = dataset_ops.make_one_shot_iterator(
-                self.make_batch_feature(
-                    filenames=self.test_filenames,
-                    label_key="label",
-                    num_epochs=num_epochs,
-                    batch_size=batch_size)).get_next()
-            self.verify_records(
-                sess,
-                batch_size,
+                batch_size=batch_size))
+        self.verify_records(
+            batch_size, 1, num_epochs=num_epochs, label_key_provided=True)
+        with self.assertRaises(errors.OutOfRangeError):
+          self._next_actual_batch(label_key_provided=True)
+
+        # Basic test: read from both files.
+        self.outputs = self.getNext(
+            self.make_batch_feature(
+                filenames=self.test_filenames,
+                label_key="label",
                 num_epochs=num_epochs,
-                label_key_provided=True)
-            with self.assertRaises(errors.OutOfRangeError):
-              self._next_actual_batch(sess, label_key_provided=True)
-
-        with ops.Graph().as_default() as g:
-          with self.session(graph=g) as sess:
-            # Basic test: read from both files.
-            self.outputs = dataset_ops.make_one_shot_iterator(
-                self.make_batch_feature(
-                    filenames=self.test_filenames,
-                    num_epochs=num_epochs,
-                    batch_size=batch_size)).get_next()
-            self.verify_records(sess, batch_size, num_epochs=num_epochs)
-            with self.assertRaises(errors.OutOfRangeError):
-              self._next_actual_batch(sess)
+                batch_size=batch_size))
+        self.verify_records(
+            batch_size, num_epochs=num_epochs, label_key_provided=True)
+        with self.assertRaises(errors.OutOfRangeError):
+          self._next_actual_batch(label_key_provided=True)
+        # Basic test: read from both files.
+        self.outputs = self.getNext(
+            self.make_batch_feature(
+                filenames=self.test_filenames,
+                num_epochs=num_epochs,
+                batch_size=batch_size))
+        self.verify_records(batch_size, num_epochs=num_epochs)
+        with self.assertRaises(errors.OutOfRangeError):
+          self._next_actual_batch()
 
-  @test_util.run_deprecated_v1
   def testReadWithEquivalentDataset(self):
     features = {
         "file": parsing_ops.FixedLenFeature([], dtypes.int64),
@@ -114,120 +93,109 @@ class MakeBatchedFeaturesDatasetTest(
         core_readers.TFRecordDataset(self.test_filenames)
         .map(lambda x: parsing_ops.parse_single_example(x, features))
         .repeat(10).batch(2))
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    init_op = iterator.initializer
-    next_element = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.evaluate(init_op)
-      for file_batch, _, _, _, record_batch, _ in self._next_expected_batch(
-          range(self._num_files), 2, 10):
-        actual_batch = self.evaluate(next_element)
-        self.assertAllEqual(file_batch, actual_batch["file"])
-        self.assertAllEqual(record_batch, actual_batch["record"])
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
+    next_element = self.getNext(dataset)
+    for file_batch, _, _, _, record_batch, _ in self._next_expected_batch(
+        range(self._num_files), 2, 10):
+      actual_batch = self.evaluate(next_element())
+      self.assertAllEqual(file_batch, actual_batch["file"])
+      self.assertAllEqual(record_batch, actual_batch["record"])
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
   def testReadWithFusedShuffleRepeatDataset(self):
     num_epochs = 5
     total_records = num_epochs * self._num_records
     for batch_size in [1, 2]:
       # Test that shuffling with same seed produces the same result.
-      with ops.Graph().as_default() as g:
-        with self.session(graph=g) as sess:
-          outputs1 = dataset_ops.make_one_shot_iterator(self.make_batch_feature(
+      outputs1 = self.getNext(
+          self.make_batch_feature(
               filenames=self.test_filenames[0],
               num_epochs=num_epochs,
               batch_size=batch_size,
               shuffle=True,
-              shuffle_seed=5)).get_next()
-          outputs2 = dataset_ops.make_one_shot_iterator(self.make_batch_feature(
+              shuffle_seed=5))
+      outputs2 = self.getNext(
+          self.make_batch_feature(
               filenames=self.test_filenames[0],
               num_epochs=num_epochs,
               batch_size=batch_size,
               shuffle=True,
-              shuffle_seed=5)).get_next()
-          for _ in range(total_records // batch_size):
-            batch1 = self._run_actual_batch(outputs1, sess)
-            batch2 = self._run_actual_batch(outputs2, sess)
-            for i in range(len(batch1)):
-              self.assertAllEqual(batch1[i], batch2[i])
+              shuffle_seed=5))
+      for _ in range(total_records // batch_size):
+        batch1 = self._run_actual_batch(outputs1)
+        batch2 = self._run_actual_batch(outputs2)
+        for i in range(len(batch1)):
+          self.assertAllEqual(batch1[i], batch2[i])
 
       # Test that shuffling with different seeds produces a different order.
-      with ops.Graph().as_default() as g:
-        with self.session(graph=g) as sess:
-          outputs1 = dataset_ops.make_one_shot_iterator(self.make_batch_feature(
+      outputs1 = self.getNext(
+          self.make_batch_feature(
               filenames=self.test_filenames[0],
               num_epochs=num_epochs,
               batch_size=batch_size,
               shuffle=True,
-              shuffle_seed=5)).get_next()
-          outputs2 = dataset_ops.make_one_shot_iterator(self.make_batch_feature(
+              shuffle_seed=5))
+      outputs2 = self.getNext(
+          self.make_batch_feature(
               filenames=self.test_filenames[0],
               num_epochs=num_epochs,
               batch_size=batch_size,
               shuffle=True,
-              shuffle_seed=15)).get_next()
-          all_equal = True
-          for _ in range(total_records // batch_size):
-            batch1 = self._run_actual_batch(outputs1, sess)
-            batch2 = self._run_actual_batch(outputs2, sess)
-            for i in range(len(batch1)):
-              all_equal = all_equal and np.array_equal(batch1[i], batch2[i])
-          self.assertFalse(all_equal)
+              shuffle_seed=15))
+      all_equal = True
+      for _ in range(total_records // batch_size):
+        batch1 = self._run_actual_batch(outputs1)
+        batch2 = self._run_actual_batch(outputs2)
+        for i in range(len(batch1)):
+          all_equal = all_equal and np.array_equal(batch1[i], batch2[i])
+      self.assertFalse(all_equal)
 
   def testParallelReadersAndParsers(self):
     num_epochs = 5
     for batch_size in [1, 2]:
       for reader_num_threads in [2, 4]:
         for parser_num_threads in [2, 4]:
-          with ops.Graph().as_default() as g:
-            with self.session(graph=g) as sess:
-              self.outputs = dataset_ops.make_one_shot_iterator(
-                  self.make_batch_feature(
-                      filenames=self.test_filenames,
-                      label_key="label",
-                      num_epochs=num_epochs,
-                      batch_size=batch_size,
-                      reader_num_threads=reader_num_threads,
-                      parser_num_threads=parser_num_threads)).get_next()
-              self.verify_records(
-                  sess,
-                  batch_size,
+          self.outputs = self.getNext(
+              self.make_batch_feature(
+                  filenames=self.test_filenames,
+                  label_key="label",
                   num_epochs=num_epochs,
-                  label_key_provided=True,
-                  interleave_cycle_length=reader_num_threads)
-              with self.assertRaises(errors.OutOfRangeError):
-                self._next_actual_batch(sess, label_key_provided=True)
-
-          with ops.Graph().as_default() as g:
-            with self.session(graph=g) as sess:
-              self.outputs = dataset_ops.make_one_shot_iterator(
-                  self.make_batch_feature(
-                      filenames=self.test_filenames,
-                      num_epochs=num_epochs,
-                      batch_size=batch_size,
-                      reader_num_threads=reader_num_threads,
-                      parser_num_threads=parser_num_threads)).get_next()
-              self.verify_records(
-                  sess,
-                  batch_size,
+                  batch_size=batch_size,
+                  reader_num_threads=reader_num_threads,
+                  parser_num_threads=parser_num_threads))
+          self.verify_records(
+              batch_size,
+              num_epochs=num_epochs,
+              label_key_provided=True,
+              interleave_cycle_length=reader_num_threads)
+          with self.assertRaises(errors.OutOfRangeError):
+            self._next_actual_batch(label_key_provided=True)
+
+          self.outputs = self.getNext(
+              self.make_batch_feature(
+                  filenames=self.test_filenames,
                   num_epochs=num_epochs,
-                  interleave_cycle_length=reader_num_threads)
-              with self.assertRaises(errors.OutOfRangeError):
-                self._next_actual_batch(sess)
+                  batch_size=batch_size,
+                  reader_num_threads=reader_num_threads,
+                  parser_num_threads=parser_num_threads))
+          self.verify_records(
+              batch_size,
+              num_epochs=num_epochs,
+              interleave_cycle_length=reader_num_threads)
+          with self.assertRaises(errors.OutOfRangeError):
+            self._next_actual_batch()
 
   def testDropFinalBatch(self):
     for batch_size in [1, 2]:
       for num_epochs in [1, 10]:
         with ops.Graph().as_default():
           # Basic test: read from file 0.
-          outputs = dataset_ops.make_one_shot_iterator(self.make_batch_feature(
+          outputs = self.make_batch_feature(
               filenames=self.test_filenames[0],
               label_key="label",
               num_epochs=num_epochs,
               batch_size=batch_size,
-              drop_final_batch=True)).get_next()
+              drop_final_batch=True)
           for tensor in nest.flatten(outputs):
             if isinstance(tensor, ops.Tensor):  # Guard against SparseTensor.
               self.assertEqual(tensor.shape[0], batch_size)
diff --git a/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py
index 3b7b335e7066175fba6ef190b977362bc461ca1d..3f371434c047a32481ce38668ece1b1af0f00b1c 100644
--- a/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py
@@ -449,6 +449,28 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase):
         header=True,
     )
 
+  def testMakeCSVDataset_withNAValuesAndFieldDelim(self):
+    """Tests that datasets can be created from different delim and na_value."""
+    column_names = ["col%d" % i for i in range(5)]
+    inputs = [["0 1 2 3 4", "5 6 7 8 9"], ["10 11 12 13 14", "15 16 17 ? 19"]]
+    expected_output = [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9], [10, 11, 12, 13, 14],
+                       [15, 16, 17, 0, 19]]
+    label = "col0"
+
+    self._test_dataset(
+        inputs,
+        expected_output=expected_output,
+        expected_keys=column_names,
+        column_names=column_names,
+        label_name=label,
+        batch_size=1,
+        num_epochs=1,
+        shuffle=False,
+        header=False,
+        na_value="?",
+        field_delim=" ",
+    )
+
   def testMakeCSVDataset_withSelectCols(self):
     record_defaults = [
         constant_op.constant([], dtypes.int32),
diff --git a/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py
index ab2feb642629eef098162ca445f54e84fc0389a9..9f35aa69a834dc82d50550a99665d5d248e02e0f 100644
--- a/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py
@@ -19,14 +19,14 @@ from __future__ import print_function
 
 from tensorflow.python.data.experimental.kernel_tests import reader_dataset_ops_test_base
 from tensorflow.python.data.experimental.ops import readers
-from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class MakeTFRecordDatasetTest(
     reader_dataset_ops_test_base.TFRecordDatasetTestBase):
 
@@ -90,7 +90,6 @@ class MakeTFRecordDatasetTest(
       yield record_batch
 
   def _verify_records(self,
-                      sess,
                       outputs,
                       batch_size,
                       file_index,
@@ -106,7 +105,7 @@ class MakeTFRecordDatasetTest(
     for expected_batch in self._next_expected_batch(
         file_indices, batch_size, num_epochs, interleave_cycle_length,
         drop_final_batch, use_parser_fn):
-      actual_batch = self.evaluate(outputs)
+      actual_batch = self.evaluate(outputs())
       self.assertAllEqual(expected_batch, actual_batch)
 
   def _read_test(self, batch_size, num_epochs, file_index=None,
@@ -121,23 +120,25 @@ class MakeTFRecordDatasetTest(
     else:
       fn = None
 
-    with ops.Graph().as_default() as g:
-      with self.session(graph=g) as sess:
-        outputs = dataset_ops.make_one_shot_iterator(
-            readers.make_tf_record_dataset(
-                file_pattern=file_pattern,
-                num_epochs=num_epochs,
-                batch_size=batch_size,
-                parser_fn=fn,
-                num_parallel_reads=num_parallel_reads,
-                drop_final_batch=drop_final_batch,
-                shuffle=False)).get_next()
-        self._verify_records(
-            sess, outputs, batch_size, file_index, num_epochs=num_epochs,
-            interleave_cycle_length=num_parallel_reads,
-            drop_final_batch=drop_final_batch, use_parser_fn=parser_fn)
-        with self.assertRaises(errors.OutOfRangeError):
-          self.evaluate(outputs)
+    outputs = self.getNext(
+        readers.make_tf_record_dataset(
+            file_pattern=file_pattern,
+            num_epochs=num_epochs,
+            batch_size=batch_size,
+            parser_fn=fn,
+            num_parallel_reads=num_parallel_reads,
+            drop_final_batch=drop_final_batch,
+            shuffle=False))
+    self._verify_records(
+        outputs,
+        batch_size,
+        file_index,
+        num_epochs=num_epochs,
+        interleave_cycle_length=num_parallel_reads,
+        drop_final_batch=drop_final_batch,
+        use_parser_fn=parser_fn)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(outputs())
 
   def testRead(self):
     for batch_size in [1, 2]:
@@ -178,50 +179,46 @@ class MakeTFRecordDatasetTest(
 
   def _shuffle_test(self, batch_size, num_epochs, num_parallel_reads=1,
                     seed=None):
-    with ops.Graph().as_default() as g:
-      with self.session(graph=g) as sess:
-        dataset = readers.make_tf_record_dataset(
-            file_pattern=self.test_filenames,
-            num_epochs=num_epochs,
-            batch_size=batch_size,
-            num_parallel_reads=num_parallel_reads,
-            shuffle=True,
-            shuffle_seed=seed)
-        iterator = dataset_ops.make_initializable_iterator(dataset)
-        next_element = iterator.get_next()
-
-        self.evaluate(iterator.initializer)
-        first_batches = []
-        try:
-          while True:
-            first_batches.append(self.evaluate(next_element))
-        except errors.OutOfRangeError:
-          pass
-
-        self.evaluate(iterator.initializer)
-        second_batches = []
-        try:
-          while True:
-            second_batches.append(self.evaluate(next_element))
-        except errors.OutOfRangeError:
-          pass
-
-        self.assertEqual(len(first_batches), len(second_batches))
-        if seed is not None:
-          # if you set a seed, should get the same results
-          for i in range(len(first_batches)):
-            self.assertAllEqual(first_batches[i], second_batches[i])
-
-        expected = []
-        for f in range(self._num_files):
-          for r in range(self._num_records):
-            expected.extend([self._record(f, r)] * num_epochs)
-
-        for batches in (first_batches, second_batches):
-          actual = []
-          for b in batches:
-            actual.extend(b)
-          self.assertAllEqual(sorted(expected), sorted(actual))
+    dataset = readers.make_tf_record_dataset(
+        file_pattern=self.test_filenames,
+        num_epochs=num_epochs,
+        batch_size=batch_size,
+        num_parallel_reads=num_parallel_reads,
+        shuffle=True,
+        shuffle_seed=seed)
+
+    next_element = self.getNext(dataset)
+    first_batches = []
+    try:
+      while True:
+        first_batches.append(self.evaluate(next_element()))
+    except errors.OutOfRangeError:
+      pass
+
+    next_element = self.getNext(dataset)
+    second_batches = []
+    try:
+      while True:
+        second_batches.append(self.evaluate(next_element()))
+    except errors.OutOfRangeError:
+      pass
+
+    self.assertEqual(len(first_batches), len(second_batches))
+    if seed is not None:
+      # if you set a seed, should get the same results
+      for i in range(len(first_batches)):
+        self.assertAllEqual(first_batches[i], second_batches[i])
+
+    expected = []
+    for f in range(self._num_files):
+      for r in range(self._num_records):
+        expected.extend([self._record(f, r)] * num_epochs)
+
+    for batches in (first_batches, second_batches):
+      actual = []
+      for b in batches:
+        actual.extend(b)
+      self.assertAllEqual(sorted(expected), sorted(actual))
 
   def testShuffle(self):
     for batch_size in [1, 2]:
diff --git a/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py b/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py
index 5c115f7ae311ddabef1ff6d7279d724bb1e18f85..775dc61e480f56f60b54a1334e51e6e2c5a133e7 100644
--- a/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py
@@ -25,6 +25,7 @@ import numpy as np
 from tensorflow.python.data.experimental.ops import batching
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -32,11 +33,14 @@ from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import script_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   @parameterized.named_parameters(
@@ -49,7 +53,6 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       ("ParallelCallsNUMA", 2, None, True),
       ("ParallelBatchesNUMA", None, 10, True),
   )
-  @test_util.run_deprecated_v1
   def testMapAndBatch(self, num_parallel_calls, num_parallel_batches,
                       numa_aware):
     """Test a dataset that maps a TF function across its input elements."""
@@ -59,74 +62,66 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
                   np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
                   np.array(37.0) * np.arange(7))
 
-    count = array_ops.placeholder(dtypes.int64, shape=[])
-    batch_size = array_ops.placeholder(dtypes.int64, shape=[])
-
     def _map_fn(x, y, z):
       return math_ops.square(x), math_ops.square(y), math_ops.square(z)
 
-    dataset = (
-        dataset_ops.Dataset.from_tensor_slices(components).repeat(count).apply(
-            batching.map_and_batch(
-                map_func=_map_fn,
-                batch_size=batch_size,
-                num_parallel_calls=num_parallel_calls,
-                num_parallel_batches=num_parallel_batches)))
-
-    if numa_aware:
-      options = dataset_ops.Options()
-      options.experimental_numa_aware = True
-      dataset = dataset.with_options(options)
+    def dataset_fn(batch_size, count, numa_aware=numa_aware):
+      dataset = dataset_ops.Dataset.from_tensor_slices(components).repeat(
+          count).apply(
+              batching.map_and_batch(
+                  map_func=_map_fn,
+                  batch_size=batch_size,
+                  num_parallel_calls=num_parallel_calls,
+                  num_parallel_batches=num_parallel_batches))
+      if numa_aware:
+        options = dataset_ops.Options()
+        options.experimental_numa_aware = True
+        dataset = dataset.with_options(options)
+      return dataset
+
+    # Batch of a finite input, where the batch_size divides the
+    # total number of elements.
+    dataset = dataset_fn(14, 28)
+    get_next = self.getNext(dataset)
+    self.assertEqual([[None] + list(c.shape[1:]) for c in components],
+                     [shape.as_list() for shape in dataset.output_shapes])
+    num_batches = (28 * 7) // 14
+    for i in range(num_batches):
+      result = self.evaluate(get_next())
+      for component, result_component in zip(components, result):
+        for j in range(14):
+          self.assertAllEqual(component[(i * 14 + j) % 7]**2,
+                              result_component[j])
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    # Batch of a finite input, where the batch_size does not
+    # divide the total number of elements.
+    get_next = self.getNext(dataset_fn(8, 14))
 
-    self.assertEqual([[None] + list(c.shape[1:]) for c in components],
-                     [t.shape.as_list() for t in get_next])
-
-    with self.cached_session() as sess:
-      # Batch of a finite input, where the batch_size divides the
-      # total number of elements.
-      sess.run(init_op, feed_dict={count: 28, batch_size: 14})
-      num_batches = (28 * 7) // 14
-      for i in range(num_batches):
-        result = self.evaluate(get_next)
-        for component, result_component in zip(components, result):
-          for j in range(14):
-            self.assertAllEqual(component[(i * 14 + j) % 7]**2,
-                                result_component[j])
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
-
-      # Batch of a finite input, where the batch_size does not
-      # divide the total number of elements.
-      sess.run(init_op, feed_dict={count: 14, batch_size: 8})
-
-      # We expect (num_batches - 1) full-sized batches.
-      num_batches = int(math.ceil((14 * 7) / 8))
-      for i in range(num_batches - 1):
-        result = self.evaluate(get_next)
-        for component, result_component in zip(components, result):
-          for j in range(8):
-            self.assertAllEqual(component[(i * 8 + j) % 7]**2,
-                                result_component[j])
-      result = self.evaluate(get_next)
+    # We expect (num_batches - 1) full-sized batches.
+    num_batches = int(math.ceil((14 * 7) / 8))
+    for i in range(num_batches - 1):
+      result = self.evaluate(get_next())
       for component, result_component in zip(components, result):
-        for j in range((14 * 7) % 8):
-          self.assertAllEqual(component[((num_batches - 1) * 8 + j) % 7]**2,
+        for j in range(8):
+          self.assertAllEqual(component[(i * 8 + j) % 7]**2,
                               result_component[j])
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
 
-      # Batch of an empty input should fail straight away.
-      sess.run(init_op, feed_dict={count: 0, batch_size: 8})
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+    result = self.evaluate(get_next())
+    for component, result_component in zip(components, result):
+      for j in range((14 * 7) % 8):
+        self.assertAllEqual(component[((num_batches - 1) * 8 + j) % 7]**2,
+                            result_component[j])
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
-      # Empty batch should be an initialization time error.
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(init_op, feed_dict={count: 14, batch_size: 0})
+    # Batch of an empty input should fail straight away.
+    self.assertDatasetProduces(dataset_fn(8, 0), expected_output=[])
+
+    # Empty batch should be an initialization time error.
+    with self.assertRaises(errors.InvalidArgumentError):
+      self.assertDatasetProduces(dataset_fn(0, 14), expected_output=[])
 
   @parameterized.named_parameters(
       ("Even", False, False),
@@ -134,7 +129,6 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       ("EvenNUMA", False, True),
       ("UnevenNUMA", True, True),
   )
-  @test_util.run_deprecated_v1
   def testMapAndBatchPartialBatch(self, drop_remainder, numa_aware):
     dataset = (
         dataset_ops.Dataset.range(10).apply(
@@ -147,26 +141,20 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       options = dataset_ops.Options()
       options.experimental_numa_aware = True
       dataset = dataset.with_options(options)
-    iterator = dataset_ops.make_one_shot_iterator(dataset)
 
     if drop_remainder:
-      self.assertEqual([4, 1], iterator.output_shapes.as_list())
+      self.assertEqual([4, 1], dataset.output_shapes.as_list())
     else:
-      self.assertEqual([None, 1], iterator.output_shapes.as_list())
-    next_element = iterator.get_next()
-    with self.cached_session():
-      self.assertAllEqual([[0], [1], [4], [9]], self.evaluate(next_element))
-      self.assertAllEqual([[16], [25], [36], [49]], self.evaluate(next_element))
-      if not drop_remainder:
-        self.assertAllEqual([[64], [81]], self.evaluate(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
+      self.assertEqual([None, 1], dataset.output_shapes.as_list())
+    expected_output = [[[0], [1], [4], [9]], [[16], [25], [36], [49]]]
+    if not drop_remainder:
+      expected_output.append([[64], [81]])
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
 
   @parameterized.named_parameters(
       ("Normal", False),
       ("NUMA", True),
   )
-  @test_util.run_deprecated_v1
   def testMapAndBatchYieldsPartialBatch(self, numa_aware):
     dataset = (
         dataset_ops.Dataset.range(10).apply(
@@ -176,22 +164,15 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       options = dataset_ops.Options()
       options.experimental_numa_aware = True
       dataset = dataset.with_options(options)
-
-    iterator = dataset_ops.make_one_shot_iterator(dataset)
-    self.assertEqual([None, 1], iterator.output_shapes.as_list())
-    next_element = iterator.get_next()
-    with self.cached_session():
-      self.assertAllEqual([[0], [1], [4], [9]], self.evaluate(next_element))
-      self.assertAllEqual([[16], [25], [36], [49]], self.evaluate(next_element))
-      self.assertAllEqual([[64], [81]], self.evaluate(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
+    self.assertEqual([None, 1], dataset.output_shapes.as_list())
+    expected_output = [[[0], [1], [4], [9]], [[16], [25], [36], [49]],
+                       [[64], [81]]]
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
 
   @parameterized.named_parameters(
       ("Normal", False),
       ("NUMA", True),
   )
-  @test_util.run_deprecated_v1
   def testMapAndBatchParallelGetNext(self, numa_aware):
     dataset = dataset_ops.Dataset.range(50000).apply(
         batching.map_and_batch(lambda x: x, batch_size=100))
@@ -199,27 +180,32 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       options = dataset_ops.Options()
       options.experimental_numa_aware = True
       dataset = dataset.with_options(options)
-    iterator = dataset_ops.make_one_shot_iterator(dataset)
+
+    if context.executing_eagerly():
+      iterator = iter(dataset)
+      get_next = iterator._next_internal  # pylint: disable=protected-access
+    else:
+      iterator = dataset_ops.make_one_shot_iterator(dataset)
+      get_next = iterator.get_next
 
     elements = []
     for _ in range(100):
-      elements.append(iterator.get_next())
-    with self.cached_session():
-      for i in range(5):
-        got = self.evaluate(elements)
-        got.sort(key=lambda x: x[0])
-        expected = []
-        for j in range(100):
-          expected.append(range(i * 10000 + j * 100, i * 10000 + (j + 1) * 100))
-        self.assertAllEqual(got, expected)
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(elements)
+      elements.append(get_next)
+
+    for i in range(5):
+      got = self.evaluate([element() for element in elements])
+      got.sort(key=lambda x: x[0])
+      expected = []
+      for j in range(100):
+        expected.append(range(i * 10000 + j * 100, i * 10000 + (j + 1) * 100))
+      self.assertAllEqual(got, expected)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate([element() for element in elements])
 
   @parameterized.named_parameters(
       ("Normal", False),
       ("NUMA", True),
   )
-  @test_util.run_deprecated_v1
   def testMapAndBatchParallelGetNextDropRemainder(self, numa_aware):
     dataset = dataset_ops.Dataset.range(49999).apply(
         batching.map_and_batch(
@@ -229,27 +215,32 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       options = dataset_ops.Options()
       options.experimental_numa_aware = True
       dataset = dataset.with_options(options)
-    iterator = dataset_ops.make_one_shot_iterator(dataset)
+
+    if context.executing_eagerly():
+      iterator = iter(dataset)
+      get_next = iterator._next_internal  # pylint: disable=protected-access
+    else:
+      iterator = dataset_ops.make_one_shot_iterator(dataset)
+      get_next = iterator.get_next
 
     elements = []
     for _ in range(100):
-      elements.append(iterator.get_next())
-    with self.cached_session():
-      for i in range(4):
-        got = self.evaluate(elements)
-        got.sort(key=lambda x: x[0])
-        expected = []
-        for j in range(100):
-          expected.append(range(i * 10000 + j * 100, i * 10000 + (j + 1) * 100))
-        self.assertAllEqual(got, expected)
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(elements)
+      elements.append(get_next)
+
+    for i in range(4):
+      got = self.evaluate([element() for element in elements])
+      got.sort(key=lambda x: x[0])
+      expected = []
+      for j in range(100):
+        expected.append(range(i * 10000 + j * 100, i * 10000 + (j + 1) * 100))
+      self.assertAllEqual(got, expected)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate([element() for element in elements])
 
   @parameterized.named_parameters(
       ("Normal", False),
       ("NUMA", True),
   )
-  @test_util.run_deprecated_v1
   def testMapAndBatchSparse(self, numa_aware):
 
     def _sparse(i):
@@ -262,52 +253,39 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       options = dataset_ops.Options()
       options.experimental_numa_aware = True
       dataset = dataset.with_options(options)
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session():
-      self.evaluate(init_op)
-      for i in range(2):
-        actual = self.evaluate(get_next)
-        expected = sparse_tensor.SparseTensorValue(
-            indices=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 0]],
-            values=[i * 5, i * 5 + 1, i * 5 + 2, i * 5 + 3, i * 5 + 4],
-            dense_shape=[5, 1])
-        self.assertTrue(sparse_tensor.is_sparse(actual))
-        self.assertSparseValuesEqual(actual, expected)
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[
+            sparse_tensor.SparseTensorValue(
+                indices=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 0]],
+                values=[i * 5, i * 5 + 1, i * 5 + 2, i * 5 + 3, i * 5 + 4],
+                dense_shape=[5, 1]) for i in range(2)
+        ])
 
   @parameterized.named_parameters(
       ("Normal", False),
       ("NUMA", True),
   )
-  @test_util.run_deprecated_v1
   def testMapAndBatchFails(self, numa_aware):
     """Test a dataset that maps a TF function across its input elements."""
-    dataset = dataset_ops.Dataset.from_tensors(
-        array_ops.check_numerics(
-            constant_op.constant(1.0) / constant_op.constant(0.0), "oops"))
-    batch_size = array_ops.placeholder(dtypes.int64, shape=[])
-    dataset = dataset.apply(batching.map_and_batch(lambda x: x, batch_size))
-    if numa_aware:
-      options = dataset_ops.Options()
-      options.experimental_numa_aware = True
-      dataset = dataset.with_options(options)
-    iterator = dataset_ops.make_initializable_iterator(dataset)
 
-    init_op = iterator.initializer
-    with self.cached_session() as sess:
-      with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"):
-        sess.run(init_op, feed_dict={batch_size: 14})
+    with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"):
+      dataset = dataset_ops.Dataset.from_tensors(
+          array_ops.check_numerics(
+              constant_op.constant(1.0) / constant_op.constant(0.0), "oops"))
+      dataset = dataset.apply(batching.map_and_batch(lambda x: x, 14))
+      if numa_aware:
+        options = dataset_ops.Options()
+        options.experimental_numa_aware = True
+        dataset = dataset.with_options(options)
+      get_next = self.getNext(dataset)
+      self.evaluate(get_next())
 
   @parameterized.named_parameters(
       ("Normal", False),
       ("NUMA", True),
   )
-  @test_util.run_deprecated_v1
   def testMapAndBatchShapeMismatch(self, numa_aware):
     """Test a dataset that maps a TF function across its input elements."""
 
@@ -325,15 +303,10 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       options = dataset_ops.Options()
       options.experimental_numa_aware = True
       dataset = dataset.with_options(options)
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-    with self.cached_session():
-      self.evaluate(init_op)
-      with self.assertRaisesRegexp(errors.InvalidArgumentError,
-                                   "number of elements does not match"):
-        self.evaluate(get_next)
+    self.assertDatasetProduces(
+        dataset,
+        expected_error=(errors.InvalidArgumentError,
+                        "number of elements does not match"))
 
   @parameterized.named_parameters(
       ("Normal", False),
@@ -358,12 +331,9 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       options = dataset_ops.Options()
       options.experimental_numa_aware = True
       dataset = dataset.with_options(options)
-    iterator = dataset_ops.make_one_shot_iterator(dataset)
-    get_next = iterator.get_next()
-
-    with self.cached_session():
-      for _ in range(3):
-        self.evaluate(get_next)
+    get_next = self.getNext(dataset)
+    for _ in range(3):
+      self.evaluate(get_next())
 
   @parameterized.named_parameters(
       ("1", 0, False),
@@ -379,7 +349,6 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       ("5NUMA", 95, True),
       ("6NUMA", 99, True),
   )
-  @test_util.run_deprecated_v1
   def testMapAndBatchMapError(self, threshold, numa_aware):
 
     def raising_py_fn(i):
@@ -396,24 +365,22 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       options = dataset_ops.Options()
       options.experimental_numa_aware = True
       dataset = dataset.with_options(options)
-    iterator = dataset_ops.make_one_shot_iterator(dataset)
-    get_next = iterator.get_next()
 
-    with self.cached_session():
-      for i in range(threshold // 10):
-        self.assertAllEqual([i * 10 + j for j in range(10)],
-                            self.evaluate(get_next))
-      if numa_aware:
-        if threshold % 10 != 0:
-          self.assertAllEqual(
-              [threshold // 10 * 10 + j for j in range(threshold % 10)],
-              self.evaluate(get_next))
-      else:
-        for i in range(threshold // 10, 10):
-          with self.assertRaises(errors.InvalidArgumentError):
-            self.evaluate(get_next)
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+    get_next = self.getNext(dataset)
+    for i in range(threshold // 10):
+      self.assertAllEqual([i * 10 + j for j in range(10)],
+                          self.evaluate(get_next()))
+    if numa_aware:
+      if threshold % 10 != 0:
+        self.assertAllEqual(
+            [threshold // 10 * 10 + j for j in range(threshold % 10)],
+            self.evaluate(get_next()))
+    else:
+      for i in range(threshold // 10, 10):
+        with self.assertRaises(errors.InvalidArgumentError):
+          self.evaluate(get_next())
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   @parameterized.named_parameters(
       ("1", False, dtypes.bool, False),
@@ -452,12 +419,10 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       options.experimental_numa_aware = True
       dataset = dataset.with_options(options)
 
-    get_next = dataset_ops.make_one_shot_iterator(dataset).get_next()
-
-    with self.cached_session():
-      for _ in range(10):
-        self.assertAllEqual([element for _ in range(10)],
-                            self.evaluate(get_next))
+    get_next = self.getNext(dataset)
+    for _ in range(10):
+      self.assertAllEqual([element for _ in range(10)],
+                          self.evaluate(get_next()))
 
   @parameterized.named_parameters(
       ("Identity", None, lambda x: x, None),
@@ -465,45 +430,38 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       ("Swap", (None, None), lambda x, y: (y, x), None),
       ("Project", (None, None), lambda x, y: x, None),
   )
-  @test_util.run_deprecated_v1
   def testShortCircuit(self, structure, map_fn, num_parallel_calls):
     dataset = self.structuredDataset(structure).repeat().apply(
         batching.map_and_batch(map_fn, batch_size=10))
-    get_next = dataset_ops.make_one_shot_iterator(dataset).get_next()
+    get_next = self.getNext(dataset)
 
-    with self.cached_session() as sess:
-      if isinstance(structure, tuple):
-        expected = map_fn(
-            *sess.run(self.structuredElement(structure, shape=[10])))
-      else:
-        expected = map_fn(
-            sess.run(self.structuredElement(structure, shape=[10])))
-      self.assertAllEqual(expected, self.evaluate(get_next))
+    if isinstance(structure, tuple):
+      expected = map_fn(
+          *self.evaluate(self.structuredElement(structure, shape=[10])))
+    else:
+      expected = map_fn(
+          self.evaluate(self.structuredElement(structure, shape=[10])))
+    self.assertAllEqual(expected, self.evaluate(get_next()))
 
-  @test_util.run_deprecated_v1
   def testShortCircuitCapturedInput(self):
-    captured_t = array_ops.placeholder(dtypes.int64, shape=[])
+    captured_t = variables.Variable(42)
     dataset = self.structuredDataset(None).repeat().apply(
         batching.map_and_batch(lambda x: captured_t, batch_size=10))
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer, feed_dict={captured_t: 42})
-      self.assertAllEqual([42] * 10, self.evaluate(get_next))
+    self.evaluate(variables.global_variables_initializer())
+    get_next = self.getNext(dataset, requires_initialization=True)
+    self.assertAllEqual([42] * 10, self.evaluate(get_next()))
 
   @parameterized.named_parameters(
       ("Normal", False),
       ("NUMA", True),
   )
-  @test_util.run_deprecated_v1
   def testMapAndBatchControlFlow(self, numa_aware):
 
     def map_fn(x):
-      previous_cond_v2_value = control_flow_ops.ENABLE_COND_V2
-      control_flow_ops.ENABLE_COND_V2 = True
+      previous_control_flow_v2_value = control_flow_util.ENABLE_CONTROL_FLOW_V2
+      control_flow_util.ENABLE_CONTROL_FLOW_V2 = True
       return_value = control_flow_ops.cond(x < 50, lambda: x + 1, lambda: x * x)
-      control_flow_ops.ENABLE_COND_V2 = previous_cond_v2_value
+      control_flow_util.ENABLE_CONTROL_FLOW_V2 = previous_control_flow_v2_value
       return return_value
 
     dataset = dataset_ops.Dataset.range(100).apply(
@@ -512,19 +470,17 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       options = dataset_ops.Options()
       options.experimental_numa_aware = True
       dataset = dataset.with_options(options)
-    iterator = dataset_ops.make_one_shot_iterator(dataset)
-    get_next = iterator.get_next()
-    with self.cached_session():
-      for i in range(10):
-        if i < 5:
-          self.assertAllEqual([i * 10 + j + 1 for j in range(10)],
-                              self.evaluate(get_next))
-        else:
-          self.assertAllEqual(
-              [((i * 10) + j) * ((i * 10) + j) for j in range(10)],
-              self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+    get_next = self.getNext(dataset)
+    for i in range(10):
+      if i < 5:
+        self.assertAllEqual([i * 10 + j + 1 for j in range(10)],
+                            self.evaluate(get_next()))
+      else:
+        self.assertAllEqual(
+            [((i * 10) + j) * ((i * 10) + j) for j in range(10)],
+            self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py b/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py
index 6042ca1c63f561a20e58e63e7864e13e847d3b35..4e99189279c5333029a8c068a7f334b7c02b62a5 100644
--- a/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py
@@ -27,15 +27,18 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_spec
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import data_flow_ops
-from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_v1_only("b/123903858: Add eager and V2 test coverage")
 class MapDefunTest(test_base.DatasetTestBase):
 
   def testMapDefunSimple(self):
@@ -237,7 +240,7 @@ class MapDefunTest(test_base.DatasetTestBase):
       thread = self.checkedThread(
           self._assert_op_cancelled, args=(sess, map_defun_op))
       thread.start()
-      time.sleep(0.1)
+      time.sleep(0.2)
       sess.close()
       thread.join()
 
@@ -253,47 +256,70 @@ class MapDefunTest(test_base.DatasetTestBase):
     expected = x + c
     self.assertAllEqual(self.evaluate(expected), self.evaluate(map_defun_op))
 
+  def testMapDefunWithVariantTensor(self):
 
-class MapDefunBenchmark(test.Benchmark):
+    @function.defun(
+        input_signature=[tensor_spec.TensorSpec([], dtypes.variant)])
+    def fn(x):
+      return x
 
-  def _run(self, op, name=None, num_iters=3000):
-    with session.Session() as sess:
-      # Warm up the session
-      for _ in range(5):
-        self.evaluate(op)
-      start = time.time()
-      for _ in range(num_iters):
-        self.evaluate(op)
-      end = time.time()
-      mean_us = (end - start) * 1e6 / num_iters
-      self.report_benchmark(
-          name=name,
-          iters=num_iters,
-          wall_time=mean_us,
-          extras={"examples_per_sec": num_iters / (end - start)})
-
-  def benchmarkDefunVsMapFn(self):
-    """Benchmarks to compare the performance of MapDefun vs tf.map_fn."""
+    st = sparse_tensor.SparseTensor(
+        indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4])
+
+    serialized = sparse_ops.serialize_sparse_v2(st, out_type=dtypes.variant)
+    serialized = array_ops.stack([serialized, serialized])
+    map_defun_op = map_defun.map_defun(fn, [serialized], [dtypes.variant],
+                                       [None])[0]
+    deserialized = sparse_ops.deserialize_sparse(map_defun_op, dtypes.int32)
+    expected = sparse_tensor.SparseTensorValue(
+        indices=[[0, 0, 0], [0, 1, 2], [1, 0, 0], [1, 1, 2]],
+        values=[1, 2, 1, 2],
+        dense_shape=[2, 3, 4])
+    actual = self.evaluate(deserialized)
+    self.assertSparseValuesEqual(expected, actual)
+
+  def testMapDefunWithVariantTensorAsCaptured(self):
+
+    st = sparse_tensor.SparseTensor(
+        indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4])
+    serialized = sparse_ops.serialize_sparse_v2(st, out_type=dtypes.variant)
 
     @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.int32)])
-    def defun(x):
-      return array_ops.identity(x)
-
-    def map_fn(x):
-      return array_ops.identity(x)
-
-    base = math_ops.range(100)
-    for input_size in [10, 100, 1000, 10000]:
-      num_iters = 100000 // input_size
-      map_defun_op = map_defun.map_defun(defun, [base], [dtypes.int32], [()])
-      map_fn_op = functional_ops.map_fn(map_fn, base)
-
-      self._run(
-          map_defun_op,
-          "benchmarkMapDefun_size_%d" % input_size,
-          num_iters=num_iters)
-      self._run(
-          map_fn_op, "benchmarkMapFn_size_%d" % input_size, num_iters=num_iters)
+    def fn(x):
+      del x
+      return serialized
+
+    x = constant_op.constant([0, 0])
+    map_defun_op = map_defun.map_defun(fn, [x], [dtypes.variant], [None])[0]
+    deserialized = sparse_ops.deserialize_sparse(map_defun_op, dtypes.int32)
+    expected = sparse_tensor.SparseTensorValue(
+        indices=[[0, 0, 0], [0, 1, 2], [1, 0, 0], [1, 1, 2]],
+        values=[1, 2, 1, 2],
+        dense_shape=[2, 3, 4])
+    actual = self.evaluate(deserialized)
+    self.assertSparseValuesEqual(expected, actual)
+
+  def testMapDefunWithStrTensor(self):
+
+    @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)])
+    def fn(x):
+      return x
+
+    st = sparse_tensor.SparseTensor(
+        indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4])
+
+    serialized = sparse_ops.serialize_sparse_v2(st, out_type=dtypes.string)
+    serialized = array_ops.stack([serialized, serialized])
+    map_defun_op = map_defun.map_defun(fn, [serialized], [dtypes.string],
+                                       [None])[0]
+    deserialized = sparse_ops.deserialize_sparse(map_defun_op, dtypes.int32)
+    expected = sparse_tensor.SparseTensorValue(
+        indices=[[0, 0, 0], [0, 1, 2], [1, 0, 0], [1, 1, 2]],
+        values=[1, 2, 1, 2],
+        dense_shape=[2, 3, 4])
+    actual = self.evaluate(deserialized)
+    self.assertSparseValuesEqual(expected, actual)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/matching_files_test.py b/tensorflow/python/data/experimental/kernel_tests/matching_files_test.py
index 0ee7616d35e801743167865d8d8097064ef88126..fe83b4c66ec06fe5cd13caceb7c399036c4c4f5e 100644
--- a/tensorflow/python/data/experimental/kernel_tests/matching_files_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/matching_files_test.py
@@ -23,14 +23,14 @@ import tempfile
 
 from tensorflow.python.data.experimental.ops import matching_files
 from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 from tensorflow.python.util import compat
 
 
-class MatchingFilesTest(test_base.DatasetTestBase):
+@test_util.run_all_in_graph_and_eager_modes
+class MatchingFilesDatasetTest(test_base.DatasetTestBase):
 
   def setUp(self):
     self.tmp_dir = tempfile.mkdtemp()
@@ -42,30 +42,23 @@ class MatchingFilesTest(test_base.DatasetTestBase):
     for filename in filenames:
       open(os.path.join(self.tmp_dir, filename), 'a').close()
 
-  @test_util.run_deprecated_v1
   def testNonExistingDirectory(self):
     """Test the MatchingFiles dataset with a non-existing directory."""
 
     self.tmp_dir = os.path.join(self.tmp_dir, 'nonexistingdir')
     dataset = matching_files.MatchingFilesDataset(
         os.path.join(self.tmp_dir, '*'))
-    with self.cached_session() as sess:
-      next_element = dataset_ops.make_one_shot_iterator(dataset).get_next()
-      with self.assertRaises(errors.NotFoundError):
-        sess.run(next_element)
+    self.assertDatasetProduces(
+        dataset, expected_error=(errors.NotFoundError, ''))
 
-  @test_util.run_deprecated_v1
   def testEmptyDirectory(self):
     """Test the MatchingFiles dataset with an empty directory."""
 
     dataset = matching_files.MatchingFilesDataset(
         os.path.join(self.tmp_dir, '*'))
-    with self.cached_session() as sess:
-      next_element = dataset_ops.make_one_shot_iterator(dataset).get_next()
-      with self.assertRaises(errors.NotFoundError):
-        sess.run(next_element)
+    self.assertDatasetProduces(
+        dataset, expected_error=(errors.NotFoundError, ''))
 
-  @test_util.run_deprecated_v1
   def testSimpleDirectory(self):
     """Test the MatchingFiles dataset with a simple directory."""
 
@@ -74,21 +67,14 @@ class MatchingFilesTest(test_base.DatasetTestBase):
 
     dataset = matching_files.MatchingFilesDataset(
         os.path.join(self.tmp_dir, '*'))
-    with self.cached_session() as sess:
-      next_element = dataset_ops.make_one_shot_iterator(dataset).get_next()
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[
+            compat.as_bytes(os.path.join(self.tmp_dir, filename))
+            for filename in filenames
+        ],
+        assert_items_equal=True)
 
-      expected_filenames = []
-      actual_filenames = []
-      for filename in filenames:
-        expected_filenames.append(
-            compat.as_bytes(os.path.join(self.tmp_dir, filename)))
-        actual_filenames.append(compat.as_bytes(sess.run(next_element)))
-
-      self.assertItemsEqual(expected_filenames, actual_filenames)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-  @test_util.run_deprecated_v1
   def testFileSuffixes(self):
     """Test the MatchingFiles dataset using the suffixes of filename."""
 
@@ -97,20 +83,14 @@ class MatchingFilesTest(test_base.DatasetTestBase):
 
     dataset = matching_files.MatchingFilesDataset(
         os.path.join(self.tmp_dir, '*.py'))
-    with self.cached_session() as sess:
-      next_element = dataset_ops.make_one_shot_iterator(dataset).get_next()
-      expected_filenames = []
-      actual_filenames = []
-      for filename in filenames[1:-1]:
-        expected_filenames.append(
-            compat.as_bytes(os.path.join(self.tmp_dir, filename)))
-        actual_filenames.append(compat.as_bytes(sess.run(next_element)))
-
-      self.assertItemsEqual(expected_filenames, actual_filenames)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-  @test_util.run_deprecated_v1
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[
+            compat.as_bytes(os.path.join(self.tmp_dir, filename))
+            for filename in filenames[1:-1]
+        ],
+        assert_items_equal=True)
+
   def testFileMiddles(self):
     """Test the MatchingFiles dataset using the middles of filename."""
 
@@ -119,20 +99,14 @@ class MatchingFilesTest(test_base.DatasetTestBase):
 
     dataset = matching_files.MatchingFilesDataset(
         os.path.join(self.tmp_dir, 'b*.py*'))
-    with self.cached_session() as sess:
-      next_element = dataset_ops.make_one_shot_iterator(dataset).get_next()
-      expected_filenames = []
-      actual_filenames = []
-      for filename in filenames[1:3]:
-        expected_filenames.append(
-            compat.as_bytes(os.path.join(self.tmp_dir, filename)))
-        actual_filenames.append(compat.as_bytes(sess.run(next_element)))
-
-      self.assertItemsEqual(expected_filenames, actual_filenames)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-  @test_util.run_deprecated_v1
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[
+            compat.as_bytes(os.path.join(self.tmp_dir, filename))
+            for filename in filenames[1:3]
+        ],
+        assert_items_equal=True)
+
   def testNestedDirectories(self):
     """Test the MatchingFiles dataset with nested directories."""
 
@@ -156,21 +130,20 @@ class MatchingFilesTest(test_base.DatasetTestBase):
     ]
 
     dataset = matching_files.MatchingFilesDataset(patterns)
-    with self.cached_session() as sess:
-      next_element = dataset_ops.make_one_shot_iterator(dataset).get_next()
-      expected_filenames = [
-          compat.as_bytes(filename)
-          for filename in filenames
-          if filename.endswith('.txt') or filename.endswith('.log')
-      ]
-      actual_filenames = []
-      while True:
-        try:
-          actual_filenames.append(compat.as_bytes(sess.run(next_element)))
-        except errors.OutOfRangeError:
-          break
-
-      self.assertItemsEqual(expected_filenames, actual_filenames)
+    next_element = self.getNext(dataset)
+    expected_filenames = [
+        compat.as_bytes(filename)
+        for filename in filenames
+        if filename.endswith('.txt') or filename.endswith('.log')
+    ]
+    actual_filenames = []
+    while True:
+      try:
+        actual_filenames.append(compat.as_bytes(self.evaluate(next_element())))
+      except errors.OutOfRangeError:
+        break
+
+    self.assertItemsEqual(expected_filenames, actual_filenames)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD b/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD
index bf868ebe79339e3c36473711ece064210db5f47f..3bfe55244e575066356fa3f3dfcec16076fbadb6 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD
@@ -190,6 +190,7 @@ py_test(
     ],
     deps = [
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
@@ -206,7 +207,7 @@ py_test(
 
 py_test(
     name = "map_vectorization_test",
-    size = "medium",
+    size = "small",
     srcs = ["map_vectorization_test.py"],
     shard_count = 8,
     srcs_version = "PY2AND3",
@@ -232,6 +233,7 @@ py_test(
         "//tensorflow/python:nn",
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python/data/experimental/ops:batching",
         "//tensorflow/python/data/experimental/ops:optimization",
         "//tensorflow/python/data/experimental/ops:optimization_options",
         "//tensorflow/python/data/kernel_tests:test_base",
@@ -241,6 +243,26 @@ py_test(
     ],
 )
 
+py_test(
+    name = "choose_fastest_dataset_test",
+    size = "small",
+    srcs = ["choose_fastest_dataset_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
 py_test(
     name = "model_dataset_test",
     size = "medium",
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/assert_next_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/assert_next_dataset_test.py
index 9b8248a78da11d99e3cf6cd87ab69d30d4d369d6..e05dcbd9d582da05a4049e76d4f8c057a53b3161 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/assert_next_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/assert_next_dataset_test.py
@@ -31,11 +31,17 @@ class AssertNextDatasetTest(test_base.DatasetTestBase):
   def testAssertNext(self):
     dataset = dataset_ops.Dataset.from_tensors(0).apply(
         optimization.assert_next(["Map"])).map(lambda x: x)
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    dataset = dataset.with_options(options)
     self.assertDatasetProduces(dataset, expected_output=[0])
 
   def testAssertNextInvalid(self):
     dataset = dataset_ops.Dataset.from_tensors(0).apply(
         optimization.assert_next(["Whoops"])).map(lambda x: x)
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    dataset = dataset.with_options(options)
     self.assertDatasetProduces(
         dataset,
         expected_error=(
@@ -48,6 +54,7 @@ class AssertNextDatasetTest(test_base.DatasetTestBase):
         optimization.assert_next(["Map", "Whoops"])).map(lambda x: x)
     options = dataset_ops.Options()
     options.experimental_autotune = False
+    options.experimental_optimization.apply_default_optimizations = False
     dataset = dataset.with_options(options)
     self.assertDatasetProduces(
         dataset,
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/choose_fastest_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/choose_fastest_dataset_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec7a85ae113d0d517434827e5dae64804861070a
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/choose_fastest_dataset_test.py
@@ -0,0 +1,85 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.experimental._ChooseFastestDataset`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.data.experimental.ops import optimization
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class ChooseFastestDatasetTest(test_base.DatasetTestBase,
+                               parameterized.TestCase):
+
+  def testChooseFastestSimple(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2, 3, 4])
+    merge = optimization._ChooseFastestDataset([dataset, dataset])
+    self.assertDatasetProduces(
+        merge,
+        expected_output=[0, 1, 2, 3, 4],
+        expected_shapes=dataset.output_shapes)
+
+  def testChooseFastestManyInputs(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2, 3, 4])
+    merge = optimization._ChooseFastestDataset([dataset for _ in range(5)])
+    self.assertDatasetProduces(
+        merge,
+        expected_output=[0, 1, 2, 3, 4],
+        expected_shapes=dataset.output_shapes)
+
+  def testChooseFastest(self):
+    dataset = dataset_ops.Dataset.range(600)
+    f = lambda x: 2 * x
+    dataset_a = dataset.batch(50).map(f)
+    dataset_b = dataset.map(f).batch(50)
+    merge = optimization._ChooseFastestDataset([dataset_a, dataset_b])
+    self.assertDatasetProduces(
+        merge,
+        expected_output=[
+            [i * 2 for i in range(j * 50, (j + 1) * 50)] for j in range(12)
+        ],
+        expected_shapes=dataset_a.output_shapes)
+
+  @parameterized.named_parameters(
+      ("Shapes", [0], [[1, 2, 3]], "must have compatible output shapes."),
+      ("Types", [0], [0.0], "must have the same output types."),
+      ("NumComponents", [0], ([0], [1]), "must have the same output types."),
+      ("Cardinality", [1, 2, 3], [1], "must have compatible cardinalities."))
+  def testChooseFastestErrorWithIncompatibleInput(self, slices_a, slices_b,
+                                                  error_msg):
+    dataset_a = dataset_ops.Dataset.from_tensor_slices(slices_a)
+    dataset_b = dataset_ops.Dataset.from_tensor_slices(slices_b)
+
+    # The error is raised at dataset creation time.
+    if context.executing_eagerly():
+      with self.assertRaises(errors.InvalidArgumentError):
+        merge = optimization._ChooseFastestDataset([dataset_a, dataset_b])
+    else:
+      merge = optimization._ChooseFastestDataset([dataset_a, dataset_b])
+      self.assertDatasetProduces(
+          merge, expected_error=(errors.InvalidArgumentError, error_msg))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/filter_fusion_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/filter_fusion_test.py
index 7371cf31dff33a5de18f3268ecdfc91c6a08b29c..525ae2c54e41e68869964de9d2997b41c3ca8585 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/filter_fusion_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/filter_fusion_test.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 from absl.testing import parameterized
 
 from tensorflow.python.data.experimental.ops import optimization
-from tensorflow.python.data.experimental.ops.optimization_options import OptimizationOptions
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
@@ -72,7 +71,7 @@ class FilterFusionTest(test_base.DatasetTestBase, parameterized.TestCase):
 
     dataset = dataset.cache()
     options = dataset_ops.Options()
-    options.experimental_optimization = OptimizationOptions()
+    options.experimental_optimization.apply_default_optimizations = False
     options.experimental_optimization.filter_fusion = True
     dataset = dataset.with_options(options)
     expected_output = []
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/hoist_random_uniform_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/hoist_random_uniform_test.py
index 5f3a8683fbb6cb2b43a41ad6d738b4982755bbff..08a44e572b899c7f79af09c5a17448c9cd75a8b7 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/hoist_random_uniform_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/hoist_random_uniform_test.py
@@ -20,10 +20,8 @@ from __future__ import print_function
 from absl.testing import parameterized
 
 from tensorflow.python.data.experimental.ops import optimization
-from tensorflow.python.data.experimental.ops.optimization_options import OptimizationOptions
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -66,12 +64,7 @@ class HoistRandomUniformTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   def _testDataset(self, dataset):
     previous_result = 0
-    if context.executing_eagerly():
-      iterator = dataset.__iter__()
-      get_next = iterator._next_internal  # pylint: disable=protected-access
-    else:
-      iterator = dataset_ops.make_one_shot_iterator(dataset)
-      get_next = iterator.get_next
+    get_next = self.getNext(dataset)
     for _ in range(5):
       result = self.evaluate(get_next())
       self.assertLessEqual(1, result)
@@ -92,7 +85,7 @@ class HoistRandomUniformTest(test_base.DatasetTestBase, parameterized.TestCase):
             ["Zip[0]", "Map"] if will_optimize else ["Map"])).map(function)
 
     options = dataset_ops.Options()
-    options.experimental_optimization = OptimizationOptions()
+    options.experimental_optimization.apply_default_optimizations = False
     options.experimental_optimization.hoist_random_uniform = True
     dataset = dataset.with_options(options)
     self._testDataset(dataset)
@@ -109,7 +102,7 @@ class HoistRandomUniformTest(test_base.DatasetTestBase, parameterized.TestCase):
     dataset = dataset_ops.Dataset.range(5).apply(
         optimization.assert_next(["Zip[0]", "Map"])).map(random_with_capture)
     options = dataset_ops.Options()
-    options.experimental_optimization = OptimizationOptions()
+    options.experimental_optimization.apply_default_optimizations = False
     options.experimental_optimization.hoist_random_uniform = True
     dataset = dataset.with_options(options)
     self._testDataset(dataset)
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/latency_all_edges_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/latency_all_edges_test.py
index fc65f52704c3389a24e9f304cfa1cadd5686c7d6..4fd982d12278232eaa65e8269f49c816823566ba 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/latency_all_edges_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/latency_all_edges_test.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 from tensorflow.python.data.experimental.kernel_tests import stats_dataset_test_base
 from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.experimental.ops import stats_aggregator
-from tensorflow.python.data.experimental.ops import stats_options
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
@@ -36,7 +35,7 @@ class LatencyAllEdgesTest(stats_dataset_test_base.StatsDatasetTestBase):
             ["LatencyStats", "Map", "LatencyStats", "Prefetch",
              "LatencyStats"])).map(lambda x: x * x).prefetch(1)
     options = dataset_ops.Options()
-    options.experimental_stats = stats_options.StatsOptions()
+    options.experimental_optimization.apply_default_optimizations = False
     options.experimental_stats.latency_all_edges = True
     options.experimental_stats.aggregator = aggregator
     dataset = dataset.with_options(options)
@@ -53,29 +52,6 @@ class LatencyAllEdgesTest(stats_dataset_test_base.StatsDatasetTestBase):
     self._assertSummaryHasCount(summary_str,
                                 "record_latency_PrefetchDataset/_6", 1)
 
-  def testLatencyStatsOptimizationV2(self):
-    aggregator = stats_aggregator.StatsAggregator()
-    dataset = dataset_ops.Dataset.from_tensors(1).apply(
-        optimization.assert_next(
-            ["LatencyStats", "Map", "LatencyStats", "Prefetch",
-             "LatencyStats"])).map(lambda x: x * x).prefetch(1)
-    options = dataset_ops.Options()
-    options.experimental_stats = stats_options.StatsOptions()
-    options.experimental_stats.aggregator = aggregator
-    dataset = dataset.with_options(options)
-    self.assertDatasetProduces(
-        dataset,
-        expected_output=[1],
-        requires_initialization=True,
-        num_test_iterations=1)
-    summary_t = aggregator.get_summary()
-    summary_str = self.evaluate(summary_t)
-    self._assertSummaryHasCount(summary_str, "record_latency_TensorDataset/_1",
-                                1)
-    self._assertSummaryHasCount(summary_str, "record_latency_MapDataset/_4", 1)
-    self._assertSummaryHasCount(summary_str,
-                                "record_latency_PrefetchDataset/_6", 1)
-
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/make_numa_aware_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/make_numa_aware_test.py
index 2386dd5f116d660eb93213c935b662c05d90011d..d79ae4387c868d4821ac65787ba0bc04d47cc7d3 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/make_numa_aware_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/make_numa_aware_test.py
@@ -34,6 +34,7 @@ class MakeNumaAwareTest(test_base.DatasetTestBase):
             batching.map_and_batch(lambda x: x * x, 10))
     options = dataset_ops.Options()
     options.experimental_numa_aware = True
+    options.experimental_optimization.apply_default_optimizations = False
     dataset = dataset.with_options(options)
     self.assertDatasetProduces(
         dataset, expected_output=[[x * x for x in range(10)]])
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_batch_fusion_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_batch_fusion_test.py
index e2ff3116eccf2ccfb7ed72085f4727a1e0262164..dc7bb9d6a37b8da3bfe983ea3cf8c74dbe16ee86 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_batch_fusion_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_batch_fusion_test.py
@@ -31,6 +31,10 @@ class MapAndBatchFusionTest(test_base.DatasetTestBase):
     dataset = dataset_ops.Dataset.range(10).apply(
         optimization.assert_next(
             ["MapAndBatch"])).map(lambda x: x * x).batch(10)
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    options.experimental_optimization.map_and_batch_fusion = True
+    dataset = dataset.with_options(options)
     self.assertDatasetProduces(
         dataset, expected_output=[[x * x for x in range(10)]])
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_filter_fusion_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_filter_fusion_test.py
index db8f214fbfca1389af70df55518c885610984031..7b0cc569734b9bf14b210e3a637334bdb950c503 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_filter_fusion_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_filter_fusion_test.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 from absl.testing import parameterized
 
 from tensorflow.python.data.experimental.ops import optimization
-from tensorflow.python.data.experimental.ops.optimization_options import OptimizationOptions
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
@@ -84,7 +83,7 @@ class MapAndFilterFusionTest(test_base.DatasetTestBase, parameterized.TestCase):
         optimization.assert_next(
             ["Map", "FilterByLastComponent"])).map(function).filter(predicate)
     options = dataset_ops.Options()
-    options.experimental_optimization = OptimizationOptions()
+    options.experimental_optimization.apply_default_optimizations = False
     options.experimental_optimization.map_and_filter_fusion = True
     dataset = dataset.with_options(options)
     self._testMapAndFilter(dataset, function, predicate)
@@ -103,7 +102,7 @@ class MapAndFilterFusionTest(test_base.DatasetTestBase, parameterized.TestCase):
         optimization.assert_next(["Map",
                                   "Filter"])).map(function).filter(predicate)
     options = dataset_ops.Options()
-    options.experimental_optimization = OptimizationOptions()
+    options.experimental_optimization.apply_default_optimizations = False
     options.experimental_optimization.map_and_filter_fusion = True
     dataset = dataset.with_options(options)
     self._testMapAndFilter(dataset, function, predicate)
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/map_fusion_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_fusion_test.py
index d8d63903749d13b80f662c996ebf5c95f934a0b1..b3a7304b4e498fbcae01efc85281d3437061155e 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/map_fusion_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_fusion_test.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 from absl.testing import parameterized
 
 from tensorflow.python.data.experimental.ops import optimization
-from tensorflow.python.data.experimental.ops.optimization_options import OptimizationOptions
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import test_util
@@ -75,7 +74,7 @@ class MapFusionTest(test_base.DatasetTestBase, parameterized.TestCase):
 
     dataset = dataset.cache()
     options = dataset_ops.Options()
-    options.experimental_optimization = OptimizationOptions()
+    options.experimental_optimization.apply_default_optimizations = False
     options.experimental_optimization.map_fusion = True
     dataset = dataset.with_options(options)
     expected_output = []
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/map_parallelization_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_parallelization_test.py
index 0ff3fff4f8550a4221e54ab2b01ddcaf6c340145..60649cd3ede8ed5f5d13857c9182f6fc912325c5 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/map_parallelization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_parallelization_test.py
@@ -20,15 +20,15 @@ from __future__ import print_function
 from absl.testing import parameterized
 
 from tensorflow.python.data.experimental.ops import optimization
-from tensorflow.python.data.experimental.ops.optimization_options import OptimizationOptions
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
@@ -43,37 +43,58 @@ def _map_parallelization_test_cases():
     with ops.control_dependencies([assert_op]):
       return x
 
-  def random(_):
-    return random_ops.random_uniform([],
-                                     minval=0,
-                                     maxval=10,
-                                     dtype=dtypes.int64,
-                                     seed=42)
-
-  def assert_with_random(x):
-    x = assert_greater(x)
-    return random(x)
-
-  return (("Identity", identity, True), ("Increment", increment, True),
-          ("AssertGreater", assert_greater, True), ("Random", random, False),
-          ("AssertWithRandom", assert_with_random, False))
+  return (("Identity", identity, True),
+          ("Increment", increment, True),
+          ("AssertGreater", assert_greater, True))
 
 
 @test_util.run_all_in_graph_and_eager_modes
 class MapParallelizationTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   @parameterized.named_parameters(*_map_parallelization_test_cases())
-  def testMapParallelization(self, function, should_optimize):
-    next_nodes = ["ParallelMap"] if should_optimize else ["Map"]
+  def testMapParallelization(self, function, should_be_parallel):
+    next_nodes = ["ParallelMap"] if should_be_parallel else ["Map"]
     dataset = dataset_ops.Dataset.range(5).apply(
         optimization.assert_next(next_nodes)).map(function)
     options = dataset_ops.Options()
-    options.experimental_optimization = OptimizationOptions()
+    options.experimental_optimization.apply_default_optimizations = False
+    options.experimental_optimization.map_parallelization = True
+    dataset = dataset.with_options(options)
+    self.assertDatasetProduces(
+        dataset, expected_output=[function(x) for x in range(5)])
+
+  def testMapParallelizationWithCapturedConstant(self):
+    """Tests that functions with captured constants are parallelized."""
+
+    captured_t = constant_op.constant(42, dtype=dtypes.int64)
+    def fn(x):
+      return x + captured_t
+    dataset = dataset_ops.Dataset.range(5).apply(
+        optimization.assert_next(["ParallelMap"])).map(fn)
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    options.experimental_optimization.map_parallelization = True
+    dataset = dataset.with_options(options)
+    self.assertDatasetProduces(
+        dataset, expected_output=[x + 42 for x in range(5)])
+
+  def testMapParallelizationWithCapturedVariable(self):
+    """Tests that functions with captured variables are not parallelized."""
+
+    captured_t = variables.Variable(42, dtype=dtypes.int64)
+    def fn(x):
+      return x + captured_t
+    dataset = dataset_ops.Dataset.range(5).apply(
+        optimization.assert_next(["Map"])).map(fn)
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
     options.experimental_optimization.map_parallelization = True
     dataset = dataset.with_options(options)
-    if should_optimize:
-      self.assertDatasetProduces(
-          dataset, expected_output=[function(x) for x in range(5)])
+    self.evaluate(variables.global_variables_initializer())
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[x + 42 for x in range(5)],
+        requires_initialization=True)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py
index adc411bfb5996904a92fd5b565eb59a439303500..0e14f75eadce3d4872ea25ea9fab2ee59564d2d5 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py
@@ -22,10 +22,11 @@ import numpy as np
 
 from tensorflow.core.example import example_pb2
 from tensorflow.core.example import feature_pb2
+from tensorflow.python.data.experimental.ops import batching
 from tensorflow.python.data.experimental.ops import optimization
-from tensorflow.python.data.experimental.ops import optimization_options
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -226,6 +227,10 @@ def _generate_csv_test_case():
 
 
 def _generate_parse_single_example_test_case():
+  # When sparse tensors are used, map_vectorization is not
+  # attempted because the output_shapes of the map dataset are not defined.
+  # TODO(rachelim): Consider being more lax with checking the output_shapes of
+  # the map node.
 
   def parse_example_factory():
 
@@ -244,8 +249,6 @@ def _generate_parse_single_example_test_case():
                     feature={
                         "dense_int": _int64_feature(i),
                         "dense_str": _bytes_feature(str(i)),
-                        "sparse_int": _int64_feature(i, i * 2, i * 4, i * 8),
-                        "sparse_str": _bytes_feature(*["abc"] * i)
                     })).SerializeToString() for i in range(10)
         ]))
 
@@ -253,8 +256,6 @@ def _generate_parse_single_example_test_case():
     features = {
         "dense_int": parsing_ops.FixedLenFeature((), dtypes.int64, 0),
         "dense_str": parsing_ops.FixedLenFeature((), dtypes.string, ""),
-        "sparse_int": parsing_ops.VarLenFeature(dtypes.int64),
-        "sparse_str": parsing_ops.VarLenFeature(dtypes.string),
     }
     return parsing_ops.parse_single_example(x, features)
 
@@ -350,19 +351,19 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
       dataset = dataset.map(map_fn, num_parallel_calls)
       dataset = dataset.batch(100)
       options = dataset_ops.Options()
-      opt_options = optimization_options.OptimizationOptions()
-      opt_options.map_and_batch_fusion = False
-      options.experimental_optimization = opt_options
+      options.experimental_optimization.apply_default_optimizations = False
+      options.experimental_optimization.map_and_batch_fusion = False
       dataset = dataset.with_options(options)
       return dataset
 
     unoptimized = _make_dataset([map_node_name, "Batch"])
-    optimized = _make_dataset(["Batch", map_node_name]
-                              if expect_optimized else [map_node_name, "Batch"])
+    # Note that because of the `ChooseDataset` fork, we can't use `assert_next`
+    # to verify the optimization result.
+    optimized = _make_dataset(
+        [] if expect_optimized else [map_node_name, "Batch"])
     options = dataset_ops.Options()
-    opt_options = optimization_options.OptimizationOptions()
-    opt_options.map_vectorization = True
-    options.experimental_optimization = opt_options
+    options.experimental_optimization.apply_default_optimizations = False
+    options.experimental_optimization.map_vectorization = True
     optimized = optimized.with_options(options)
     return unoptimized, optimized
 
@@ -373,23 +374,22 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
                                                      num_parallel_calls)
     self.assertDatasetsEqual(unoptimized, optimized)
 
-  # TODO(b/117581999): Add eager coverage for the following tests.
-  def testSkipEagerOptimizationBadMapFn(self):
+  def testOptimizationBadMapFn(self):
     # Test map functions that give an error
     def map_fn(x):
       # x has leading dimension 5, this will raise an error
       return array_ops.gather(x, 10)
 
-    base_dataset = dataset_ops.Dataset.range(5).repeat(5).batch(
-        5, drop_remainder=True)
-    _, optimized = self._get_test_datasets(base_dataset, map_fn)
-    nxt = dataset_ops.make_one_shot_iterator(optimized).get_next()
     with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                  r"indices = 10 is not in \[0, 5\)"):
+      base_dataset = dataset_ops.Dataset.range(5).repeat(5).batch(
+          5, drop_remainder=True)
+      _, optimized = self._get_test_datasets(base_dataset, map_fn)
+      nxt = dataset_ops.make_one_shot_iterator(optimized).get_next()
       self.evaluate(nxt)
 
   def testOptimizationWithCapturedInputs(self):
-    # Tests that vectorization works with captured inputs
+    # Tests that vectorization works with captured inputs.
     y = constant_op.constant(1, shape=(2,))
     z = constant_op.constant(2, shape=(2,))
 
@@ -402,8 +402,85 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
         base_dataset, map_fn, expect_optimized=True)
     self.assertDatasetsEqual(optimized, unoptimized)
 
-  # TODO(b/117581999): Add eager coverage for the following tests.
-  def testSkipEagerOptimizationIgnoreStateful(self):
+  def testOptimizationWithMapAndBatchFusion(self):
+    # Tests that vectorization works on fused map and batch.
+    y = constant_op.constant(1, shape=(2,))
+    z = constant_op.constant(2, shape=(2,))
+
+    def map_fn(x):
+      return x, y, z
+
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    base_dataset = dataset_ops.Dataset.from_tensor_slices([[1, 2],
+                                                           [3, 4]]).repeat(5)
+    base_dataset = base_dataset.with_options(options)
+
+    def _make_dataset(node_names):
+      dataset = base_dataset.apply(optimization.assert_next(node_names))
+      dataset = dataset.apply(batching.map_and_batch(map_fn, 100))
+      return dataset
+
+    unoptimized = _make_dataset(["MapAndBatch"])
+    optimized = _make_dataset([])
+    options = dataset_ops.Options()
+    options.experimental_optimization.map_vectorization = True
+    optimized = optimized.with_options(options)
+    self.assertDatasetsEqual(optimized, unoptimized)
+
+  @parameterized.named_parameters(
+      ("1", True, True),
+      ("2", True, False),
+      ("3", False, True),
+      ("4", False, False),
+  )
+  def testOptimizationWithChainedMapAndBatch(self, fuse_first, fuse_second):
+    # Tests that vectorization works on chained map and batch functions.
+    def map_fn(x):
+      return x * 2
+
+    unoptimized_seq = []
+
+    def make_apply_fn(is_fused):
+      if is_fused:
+        unoptimized_seq.append("MapAndBatch")
+
+        def apply_fn(dataset):
+          return dataset.apply(
+              batching.map_and_batch(map_fn, 2, 12, drop_remainder=True))
+
+        return apply_fn
+      else:
+        unoptimized_seq.extend(["ParallelMap", "Batch"])
+
+        def apply_fn(dataset):
+          return dataset.map(map_fn, 12).batch(2, drop_remainder=True)
+
+        return apply_fn
+
+    base_dataset = dataset_ops.Dataset.range(1000)
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    base_dataset = base_dataset.with_options(options)
+
+    apply_fn_1 = make_apply_fn(fuse_first)
+    apply_fn_2 = make_apply_fn(fuse_second)
+
+    def make_dataset(node_names):
+      dataset = base_dataset.apply(optimization.assert_next(node_names))
+      dataset = apply_fn_1(dataset)
+      dataset = apply_fn_2(dataset)
+      return dataset
+
+    unoptimized = make_dataset(unoptimized_seq)
+    optimized = make_dataset([])
+    options = dataset_ops.Options()
+    options.experimental_optimization.map_vectorization = True
+    optimized = optimized.with_options(options)
+
+    self.assertDatasetsEqual(optimized, unoptimized)
+
+  def testOptimizationIgnoreStateful(self):
 
     def map_fn(x):
       with ops.control_dependencies([check_ops.assert_equal(x, 0)]):
@@ -413,10 +490,13 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
                                                            [3, 4]]).repeat(5)
     unoptimized, optimized = self._get_test_datasets(
         base_dataset, map_fn, expect_optimized=False)
-    self.assertDatasetsRaiseSameError(
-        unoptimized, optimized, errors.InvalidArgumentError,
-        [("OneShotIterator", "OneShotIterator_1", 1),
-         ("IteratorGetNext", "IteratorGetNext_1", 1)])
+    replacements = None
+    if not context.executing_eagerly():
+      # In graph mode, the ops have unique names.
+      replacements = [("OneShotIterator", "OneShotIterator_1", 1),
+                      ("IteratorGetNext", "IteratorGetNext_1", 1)]
+    self.assertDatasetsRaiseSameError(unoptimized, optimized,
+                                      errors.InvalidArgumentError, replacements)
 
   def testOptimizationIgnoreRagged(self):
     # Make sure we ignore inputs that might not be uniformly sized
@@ -429,8 +509,7 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
         base_dataset, map_fn, expect_optimized=False)
     self.assertDatasetsEqual(unoptimized, optimized)
 
-  # TODO(b/117581999): Add eager coverage for the following tests.
-  def testSkipEagerOptimizationIgnoreRaggedMap(self):
+  def testOptimizationIgnoreRaggedMap(self):
     # Don't optimize when the output of the map fn shapes are unknown.
     def map_fn(x):
       return array_ops.tile(x, x)
@@ -438,10 +517,48 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
     base_dataset = dataset_ops.Dataset.range(20).batch(1, drop_remainder=True)
     unoptimized, optimized = self._get_test_datasets(
         base_dataset, map_fn, expect_optimized=False)
-    self.assertDatasetsRaiseSameError(
-        unoptimized, optimized, errors.InvalidArgumentError,
-        [("OneShotIterator", "OneShotIterator_1", 1),
-         ("IteratorGetNext", "IteratorGetNext_1", 1)])
+    replacements = None
+    if not context.executing_eagerly():
+      # In graph mode, the ops have unique names.
+      replacements = [("OneShotIterator", "OneShotIterator_1", 1),
+                      ("IteratorGetNext", "IteratorGetNext_1", 1)]
+    self.assertDatasetsRaiseSameError(unoptimized, optimized,
+                                      errors.InvalidArgumentError, replacements)
+
+  def testOptimizationWithUnknownBatchShape(self):
+    tensor = sparse_tensor.SparseTensor(
+        indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4])
+
+    # Datasets with sparse tensors have unknown output shapes.
+    base_dataset = dataset_ops.Dataset.from_tensors(tensor)
+    unoptimized = base_dataset.apply(batching.map_and_batch(lambda x: x, 2))
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    unoptimized = unoptimized.with_options(options)
+
+    options = dataset_ops.Options()
+    options.experimental_optimization.map_vectorization = True
+    optimized = unoptimized.with_options(options)
+    self.assertDatasetsEqual(unoptimized, optimized)
+
+  def testOptimizationWithSparseTensor(self):
+    base_dataset = dataset_ops.Dataset.from_tensors(0)
+
+    def map_fn(x):
+      del x
+      return sparse_tensor.SparseTensor(
+          indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4])
+
+    # Datasets with sparse tensors have unknown output shapes.
+    unoptimized = base_dataset.apply(batching.map_and_batch(map_fn, 2))
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    unoptimized = unoptimized.with_options(options)
+
+    options = dataset_ops.Options()
+    options.experimental_optimization.map_vectorization = True
+    optimized = unoptimized.with_options(options)
+    self.assertDatasetsEqual(unoptimized, optimized)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/model_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/model_dataset_test.py
index 0f0274b41f2da1add8b2361b54e5c32a5974da41..5c1ae7a98a2326f61518b1550d0678da50e78401 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/model_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/model_dataset_test.py
@@ -23,10 +23,11 @@ from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
-# TODO(b/117581999): Add eager coverage for the following tests.
+@test_util.run_all_in_graph_and_eager_modes
 class ModelDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   def testAutotuneOption(self):
@@ -35,15 +36,13 @@ class ModelDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
         optimization.assert_next(["Model"]))
     options = dataset_ops.Options()
     options.experimental_autotune = True
+    options.experimental_optimization.apply_default_optimizations = False
     dataset = dataset.with_options(options)
+    get_next = self.getNext(dataset)
 
-    iterator = dataset_ops.make_one_shot_iterator(dataset)
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.assertEqual(0, self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+    self.assertEqual(0, self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/noop_elimination_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/noop_elimination_test.py
index 8058f53eea240831545444286fb2c6aa404e240a..74f620e37d5659bf4d2989d7a8a0b5d8359a91af 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/noop_elimination_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/noop_elimination_test.py
@@ -41,6 +41,10 @@ class NoopEliminationTest(test_base.DatasetTestBase):
             ["FiniteRepeat", "FiniteSkip", "Prefetch", "MemoryCacheImpl"]))
     dataset = dataset.repeat(some_tensor).skip(5).take(-1).skip(0).repeat(
         1).prefetch(0).prefetch(1).cache()
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    options.experimental_optimization.noop_elimination = True
+    dataset = dataset.with_options(options)
     self.assertDatasetProduces(dataset, expected_output=range(5))
 
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_test.py
index 230b74e9e8e0e3e26aeabe11faa84c651069c7b8..a85e0cf801cda08cfe997c4ebce6497ae806aecd 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_test.py
@@ -25,7 +25,6 @@ import numpy as np
 from tensorflow.python.data.experimental.ops import batching
 from tensorflow.python.data.experimental.ops import grouping
 from tensorflow.python.data.experimental.ops import optimization
-from tensorflow.python.data.experimental.ops import optimization_options
 from tensorflow.python.data.experimental.ops import scan_ops
 from tensorflow.python.data.experimental.ops import threadpool
 from tensorflow.python.data.kernel_tests import test_base
@@ -107,15 +106,19 @@ class OptimizeDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
   def testOptimizationStatefulFunction(self):
     dataset = dataset_ops.Dataset.range(
         10).map(lambda _: random_ops.random_uniform([])).batch(10)
-    dataset = dataset_ops._OptimizeDataset(dataset, [])
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    dataset = dataset.with_options(options)
     get_next = self.getNext(dataset)
     self.evaluate(get_next())
 
-  # TODO(b/117581999): Add eager coverage for the following tests.
+  @test_util.run_v1_only("b/123902160")
   def testSkipEagerOptimizationLargeInputFromTensor(self):
     input_t = array_ops.placeholder(dtypes.int32, (None, None, None))
     dataset = dataset_ops.Dataset.from_tensors(input_t)
-    dataset = dataset_ops._OptimizeDataset(dataset, [])
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    dataset = dataset.with_options(options)
     iterator = dataset_ops.make_initializable_iterator(dataset)
     init_op = iterator.initializer
     get_next = iterator.get_next()
@@ -124,11 +127,13 @@ class OptimizeDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       sess.run(init_op, {input_t: np.ones([512, 1024, 1025], np.int32)})
       self.evaluate(get_next)
 
-  # TODO(b/117581999): Add eager coverage for the following tests.
+  @test_util.run_v1_only("b/123902160")
   def testSkipEagerOptimizationLargeInputFromTensorSlices(self):
     input_t = array_ops.placeholder(dtypes.int32, (None, None, None, None))
     dataset = dataset_ops.Dataset.from_tensor_slices(input_t)
-    dataset = dataset_ops._OptimizeDataset(dataset, [])
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    dataset = dataset.with_options(options)
     iterator = dataset_ops.make_initializable_iterator(dataset)
     init_op = iterator.initializer
     get_next = iterator.get_next()
@@ -148,7 +153,10 @@ class OptimizeDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
     dataset = dataset_ops.Dataset.range(1)
     dataset = dataset.flat_map(flat_map_fn)
-    dataset = dataset_ops._OptimizeDataset(dataset, ["noop_elimination"])
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    options.experimental_optimization.noop_elimination = True
+    dataset = dataset.with_options(options)
     self.assertDatasetProduces(dataset, expected_output=[0])
 
   def testOptimizationNestedDatasetWithModifiedRetval(self):
@@ -164,13 +172,9 @@ class OptimizeDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     dataset = dataset_ops.Dataset.range(1)
     dataset = dataset.flat_map(flat_map_fn)
 
-    # TODO(b/120558523): We use Options instead of _OptimizeDataset directly
-    # here because of a bug with chaining _OptimizeDatasets when there are
-    # nested dataset functions
     options = dataset_ops.Options()
-    opt_options = optimization_options.OptimizationOptions()
-    opt_options.map_and_batch_fusion = True
-    options.experimental_optimization = opt_options
+    options.experimental_optimization.apply_default_optimizations = False
+    options.experimental_optimization.map_and_batch_fusion = True
     dataset = dataset.with_options(options)
     self.assertDatasetProduces(dataset, expected_output=[[0]])
 
@@ -182,7 +186,9 @@ class OptimizeDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
         threadpool.PrivateThreadPool(
             2, display_name="private_thread_pool_%d" % 2))
 
-    dataset = dataset_ops._OptimizeDataset(dataset, [])
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    dataset = dataset.with_options(options)
     self.assertDatasetProduces(
         dataset,
         expected_output=[list(range(10))],
@@ -196,49 +202,54 @@ class OptimizeDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     dataset = dataset.apply(optimization.assert_next(["MemoryCacheImpl"]))
     dataset = dataset.skip(0)  # Should be removed by noop elimination
     dataset = dataset.cache()
-    dataset = dataset_ops._OptimizeDataset(dataset, ["noop_elimination"])
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    options.experimental_optimization.noop_elimination = True
+    dataset = dataset.with_options(options)
     self.assertDatasetProduces(dataset, expected_output=[0])
 
   def testOptimizationNonSerializableAsDirectInput(self):
     """Tests that non-serializable dataset can be OptimizeDataset's input."""
     dataset = dataset_ops.Dataset.from_tensors(0)
     dataset = dataset.apply(optimization.non_serializable())
-    dataset = dataset_ops._OptimizeDataset(dataset, ["noop_elimination"])
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    options.experimental_optimization.noop_elimination = True
+    dataset = dataset.with_options(options)
     self.assertDatasetProduces(dataset, expected_output=[0])
 
   @parameterized.named_parameters(_generate_captured_refvar_test_cases())
-  # Skip eager because RefVariables are not supported in eager mode.
+  @test_util.run_v1_only("RefVariables are not supported in eager mode.")
   def testSkipEagerOptimizationWithCapturedRefVar(self, dataset_fn):
     """Tests that default optimizations are disabled with ref variables."""
     variable = variable_scope.get_variable(
         "v", initializer=0, use_resource=False)
     assign_op = variable.assign_add(1)
 
-    unoptimized_dataset = dataset_fn(variable)
-
-    options = dataset_ops.Options()
-    opt_options = optimization_options.OptimizationOptions()
-    opt_options.noop_elimination = True
-    opt_options.map_and_batch_fusion = True
-    options.experimental_optimization = opt_options
-    optimized_dataset = unoptimized_dataset.with_options(options)
-
     # Check that warning is logged.
     warnings.simplefilter("always")
     with warnings.catch_warnings(record=True) as w:
-      optimized_it = optimized_dataset.make_initializable_iterator()
+      unoptimized_dataset = dataset_fn(variable)
+
+      options = dataset_ops.Options()
+      options.experimental_optimization.apply_default_optimizations = False
+      options.experimental_optimization.noop_elimination = True
+      options.experimental_optimization.map_and_batch_fusion = True
+      optimized_dataset = unoptimized_dataset.with_options(options)
+      optimized_it = dataset_ops.make_initializable_iterator(optimized_dataset)
 
     self.assertGreaterEqual(len(w), 1)
     expected = ("tf.data static optimizations are not compatible with "
                 "tf.Variable. The following optimizations will be disabled: %s."
                 " To enable optimizations, use resource variables instead by "
                 "calling `tf.enable_resource_variables()` at the start of the "
-                "program." % (", ".join(opt_options._static_optimizations())))
+                "program." % (", ".join(options._static_optimizations())))
     self.assertTrue(any([expected in str(warning) for warning in w]))
 
     # Check that outputs are the same in the optimized and unoptimized cases,
     # when the variable value is changing.
-    unoptimized_it = unoptimized_dataset.make_initializable_iterator()
+    unoptimized_it = dataset_ops.make_initializable_iterator(
+        unoptimized_dataset)
     with ops.control_dependencies([assign_op]):
       unoptimized_output = unoptimized_it.get_next()
       optimized_output = optimized_it.get_next()
@@ -271,12 +282,15 @@ class OptimizeDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     only explicitly enabled optimizations will be applied.
     """
     options = dataset_ops.Options()
-    opt_options = optimization_options.OptimizationOptions()
-    opt_options.hoist_random_uniform = True
-    opt_options.apply_default_optimizations = False
-    options.experimental_optimization = opt_options
-    expected_optimizations = ["hoist_random_uniform"]
-    self.assertEqual(options._static_optimizations(), expected_optimizations)
+    options.experimental_optimization.apply_default_optimizations = False
+    options.experimental_optimization.hoist_random_uniform = True
+    options.experimental_optimization.noop_elimination = True
+    expected_optimizations = [
+        "hoist_random_uniform",
+        "noop_elimination",
+    ]
+    self.assertEqual(
+        set(options._static_optimizations()), set(expected_optimizations))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/shuffle_and_repeat_fusion_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/shuffle_and_repeat_fusion_test.py
index 594b59375febbba6c939dc5429ff59fe9c971a5f..824cc680abb9e574f77a544edb6e7fffa9a064c7 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/shuffle_and_repeat_fusion_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/shuffle_and_repeat_fusion_test.py
@@ -31,6 +31,10 @@ class ShuffleAndRepeatFusionTest(test_base.DatasetTestBase):
   def testShuffleAndRepeatFusion(self):
     dataset = dataset_ops.Dataset.range(10).apply(
         optimization.assert_next(["ShuffleAndRepeat"])).shuffle(10).repeat(2)
+    options = dataset_ops.Options()
+    options.experimental_optimization.apply_default_optimizations = False
+    options.experimental_optimization.shuffle_and_repeat_fusion = True
+    dataset = dataset.with_options(options)
     get_next = self.getNext(dataset)
 
     for _ in range(2):
diff --git a/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py b/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py
index aa81663a188cfee738acaedfd44e239909a4215e..811a58262efe6500784700518ac2bb1a20b03c63 100644
--- a/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py
@@ -35,6 +35,7 @@ from tensorflow.python.ops import script_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class OverrideThreadpoolTest(test_base.DatasetTestBase,
                              parameterized.TestCase):
 
@@ -53,14 +54,12 @@ class OverrideThreadpoolTest(test_base.DatasetTestBase,
             lambda x: script_ops.py_func(get_thread_id, [x], dtypes.int64),
             num_parallel_calls=32).apply(unique.unique()))
     dataset = override_threadpool_fn(dataset)
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    next_element = iterator.get_next()
+    next_element = self.getNext(dataset, requires_initialization=True)
 
-    self.evaluate(iterator.initializer)
     thread_ids = []
     try:
       while True:
-        thread_ids.append(self.evaluate(next_element))
+        thread_ids.append(self.evaluate(next_element()))
     except errors.OutOfRangeError:
       pass
     self.assertLen(thread_ids, len(set(thread_ids)))
@@ -82,7 +81,6 @@ class OverrideThreadpoolTest(test_base.DatasetTestBase,
       ("8", 4, 1),
       ("9", 4, 4),
   )
-  @test_util.run_deprecated_v1
   def testNumThreadsDeprecated(self, num_threads, max_intra_op_parallelism):
 
     def override_threadpool_fn(dataset):
@@ -109,7 +107,6 @@ class OverrideThreadpoolTest(test_base.DatasetTestBase,
       ("11", 4, 4),
       ("12", None, None),
   )
-  @test_util.run_deprecated_v1
   def testNumThreads(self, num_threads, max_intra_op_parallelism):
 
     def override_threadpool_fn(dataset):
diff --git a/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py b/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py
index 113326c028a53be5b6aa3889ace5013fc08843a4..9d535316619db395853e83e3c1b2a740965b9f7d 100644
--- a/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py
@@ -22,6 +22,7 @@ import math
 import threading
 import time
 
+import numpy as np
 from six.moves import zip_longest
 
 from tensorflow.python.data.experimental.ops import interleave_ops
@@ -30,24 +31,18 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import array_ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import script_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class ParallelInterleaveTest(test_base.DatasetTestBase):
 
   def setUp(self):
 
-    self.input_values = array_ops.placeholder(dtypes.int64, shape=[None])
-    self.cycle_length = array_ops.placeholder(dtypes.int64, shape=[])
-    self.block_length = array_ops.placeholder(dtypes.int64, shape=[])
-    self.sloppy = array_ops.placeholder(dtypes.bool, shape=[])
-    self.buffer_output_elements = array_ops.placeholder(dtypes.int64, shape=[])
-    self.prefetch_input_elements = array_ops.placeholder(dtypes.int64, shape=[])
-
     self.error = None
     self.repeat_count = 2
 
@@ -61,6 +56,9 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
       self.read_coordination_events[i] = threading.Semaphore(0)
       self.write_coordination_events[i] = threading.Event()
 
+  def dataset_fn(self, input_values, cycle_length, block_length, sloppy,
+                 buffer_output_elements, prefetch_input_elements):
+
     def map_py_fn(x):
       self.write_coordination_events[x].wait()
       self.write_coordination_events[x].clear()
@@ -79,16 +77,11 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
       dataset = dataset.repeat(x)
       return dataset.map(map_fn)
 
-    self.dataset = (
-        dataset_ops.Dataset.from_tensor_slices(self.input_values)
-        .repeat(self.repeat_count).apply(
-            interleave_ops.parallel_interleave(interleave_fn, self.cycle_length,
-                                               self.block_length, self.sloppy,
-                                               self.buffer_output_elements,
-                                               self.prefetch_input_elements)))
-    self.iterator = dataset_ops.make_initializable_iterator(self.dataset)
-    self.init_op = self.iterator.initializer
-    self.next_element = self.iterator.get_next()
+    return dataset_ops.Dataset.from_tensor_slices(input_values).repeat(
+        self.repeat_count).apply(
+            interleave_ops.parallel_interleave(
+                interleave_fn, cycle_length, block_length, sloppy,
+                buffer_output_elements, prefetch_input_elements))
 
   def _interleave(self, lists, cycle_length, block_length):
     """Python implementation of interleave used for testing."""
@@ -178,26 +171,22 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
   def _testSingleThreaded(self, sloppy=False, prefetch_input_elements=0):
     # cycle_length=1,block_length=1 acts like `Dataset.interleave()` and
     # `Dataset.flat_map()` and is single-threaded. No synchronization required.
-    with self.cached_session() as sess:
-      self._clear_coordination_events()
-      sess.run(
-          self.init_op,
-          feed_dict={
-              self.input_values: [4, 5, 6],
-              self.cycle_length: 1,
-              self.block_length: 1,
-              self.sloppy: sloppy,
-              self.buffer_output_elements: 1,
-              self.prefetch_input_elements: prefetch_input_elements,
-          })
-
-      for expected_element in self._interleave(
-          [[4] * 4, [5] * 5, [6] * 6] * self.repeat_count, 1, 1):
-        self.write_coordination_events[expected_element].set()
-        self.assertEqual(expected_element * expected_element,
-                         self.evaluate(self.next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(self.next_element)
+    self._clear_coordination_events()
+    next_element = self.getNext(
+        self.dataset_fn(
+            input_values=np.int64([4, 5, 6]),
+            cycle_length=1,
+            block_length=1,
+            sloppy=sloppy,
+            buffer_output_elements=1,
+            prefetch_input_elements=prefetch_input_elements))
+    for expected_element in self._interleave(
+        [[4] * 4, [5] * 5, [6] * 6] * self.repeat_count, 1, 1):
+      self.write_coordination_events[expected_element].set()
+      self.assertEqual(expected_element * expected_element,
+                       self.evaluate(next_element()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
   def testSingleThreaded(self):
     self._testSingleThreaded()
@@ -213,64 +202,59 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
 
   def testSingleThreadedRagged(self):
     # Tests a sequence with wildly different elements per iterator.
-    with self.cached_session() as sess:
-      self._clear_coordination_events()
-      sess.run(
-          self.init_op,
-          feed_dict={
-              self.input_values: [3, 7, 4],
-              self.cycle_length: 2,
-              self.block_length: 1,
-              self.sloppy: False,
-              self.buffer_output_elements: 1,
-              self.prefetch_input_elements: 1,
-          })
-
-      # Add coordination values for 3 and 7
-      self.read_coordination_events[3] = threading.Semaphore(0)
-      self.write_coordination_events[3] = threading.Event()
-      self.read_coordination_events[7] = threading.Semaphore(0)
-      self.write_coordination_events[7] = threading.Event()
-
-      for expected_element in self._interleave(
-          [[3] * 3, [7] * 7, [4] * 4] * self.repeat_count, 2, 1):
-        self.write_coordination_events[expected_element].set()
-        output = self.evaluate(self.next_element)
-        self.assertEqual(expected_element * expected_element, output)
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(self.next_element)
+    self._clear_coordination_events()
+    next_element = self.getNext(
+        self.dataset_fn(
+            input_values=np.int64([3, 7, 4]),
+            cycle_length=2,
+            block_length=1,
+            sloppy=False,
+            buffer_output_elements=1,
+            prefetch_input_elements=1))
+
+    # Add coordination values for 3 and 7
+    self.read_coordination_events[3] = threading.Semaphore(0)
+    self.write_coordination_events[3] = threading.Event()
+    self.read_coordination_events[7] = threading.Semaphore(0)
+    self.write_coordination_events[7] = threading.Event()
+
+    for expected_element in self._interleave(
+        [[3] * 3, [7] * 7, [4] * 4] * self.repeat_count, 2, 1):
+      self.write_coordination_events[expected_element].set()
+      output = self.evaluate(next_element())
+      self.assertEqual(expected_element * expected_element, output)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
   def _testTwoThreadsNoContention(self, sloppy=False):
     # num_threads > 1.
     # Explicit coordination should result in `Dataset.interleave()` behavior
-    with self.cached_session() as sess:
-      self._clear_coordination_events()
-      done_first_event = False
-      sess.run(
-          self.init_op,
-          feed_dict={
-              self.input_values: [4, 5, 6],
-              self.cycle_length: 2,
-              self.block_length: 1,
-              self.sloppy: sloppy,
-              self.buffer_output_elements: 1,
-              self.prefetch_input_elements: 1,
-          })
-      for i, expected_element in enumerate(
-          self._interleave([[4] * 4, [5] * 5, [6] * 6] * self.repeat_count, 2,
-                           1)):
-        self.write_coordination_events[expected_element].set()
-        if done_first_event:  # First event starts the worker threads.
-          self.read_coordination_events[expected_element].acquire()
-        actual_element = self.evaluate(self.next_element)
-        if not done_first_event:
-          self.read_coordination_events[expected_element].acquire()
-          done_first_event = True
-        self.assertEqual(expected_element * expected_element, actual_element,
-                         "At index %s: %s expected, got: %s" %
-                         (i, expected_element, actual_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(self.next_element)
+    self._clear_coordination_events()
+    done_first_event = False
+    next_element = self.getNext(
+        self.dataset_fn(
+            input_values=np.int64([4, 5, 6]),
+            cycle_length=2,
+            block_length=1,
+            sloppy=sloppy,
+            buffer_output_elements=1,
+            prefetch_input_elements=1))
+    for i, expected_element in enumerate(
+        self._interleave([[4] * 4, [5] * 5, [6] * 6] * self.repeat_count, 2,
+                         1)):
+      self.write_coordination_events[expected_element].set()
+      if done_first_event:  # First event starts the worker threads.
+        self.read_coordination_events[expected_element].acquire()
+      actual_element = self.evaluate(next_element())
+      if not done_first_event:
+        self.read_coordination_events[expected_element].acquire()
+        done_first_event = True
+      self.assertEqual(
+          expected_element * expected_element, actual_element,
+          "At index %s: %s expected, got: %s" % (i, expected_element,
+                                                 actual_element))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
   def testTwoThreadsNoContention(self):
     self._testTwoThreadsNoContention()
@@ -287,38 +271,36 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
     Args:
       sloppy: Whether to be sloppy or not.
     """
-    with self.cached_session() as sess:
-      self._clear_coordination_events()
-      done_first_event = False
-      sess.run(
-          self.init_op,
-          feed_dict={
-              self.input_values: [4, 5, 6],
-              self.cycle_length: 2,
-              self.block_length: 1,
-              self.sloppy: sloppy,
-              self.buffer_output_elements: 1,
-              self.prefetch_input_elements: 1,
-          })
-      for i, expected_element in enumerate(
-          self._interleave([[4] * 4, [5] * 5, [6] * 6] * self.repeat_count, 2,
-                           1)):
-        if done_first_event:  # First event starts the worker threads.
-          self._allow_all_map_threads()
-          self.read_coordination_events[expected_element].acquire()
-        else:
-          self.write_coordination_events[expected_element].set()
-        time.sleep(0.5)  # Sleep to consistently "avoid" the race condition.
-        actual_element = self.evaluate(self.next_element)
-        if not done_first_event:
-          done_first_event = True
-          self.assertTrue(
-              self.read_coordination_events[expected_element].acquire(False))
-        self.assertEqual(expected_element * expected_element, actual_element,
-                         "At index %s: %s expected, got: %s" %
-                         (i, expected_element, actual_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(self.next_element)
+    self._clear_coordination_events()
+    done_first_event = False
+    next_element = self.getNext(
+        self.dataset_fn(
+            input_values=np.int64([4, 5, 6]),
+            cycle_length=2,
+            block_length=1,
+            sloppy=sloppy,
+            buffer_output_elements=1,
+            prefetch_input_elements=1))
+    for i, expected_element in enumerate(
+        self._interleave([[4] * 4, [5] * 5, [6] * 6] * self.repeat_count, 2,
+                         1)):
+      if done_first_event:  # First event starts the worker threads.
+        self._allow_all_map_threads()
+        self.read_coordination_events[expected_element].acquire()
+      else:
+        self.write_coordination_events[expected_element].set()
+      time.sleep(0.5)  # Sleep to consistently "avoid" the race condition.
+      actual_element = self.evaluate(next_element())
+      if not done_first_event:
+        done_first_event = True
+        self.assertTrue(
+            self.read_coordination_events[expected_element].acquire(False))
+      self.assertEqual(
+          expected_element * expected_element, actual_element,
+          "At index %s: %s expected, got: %s" % (i, expected_element,
+                                                 actual_element))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
   def testTwoThreadsNoContentionWithRaces(self):
     self._testTwoThreadsNoContentionWithRaces()
@@ -329,34 +311,32 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
   def _testTwoThreadsNoContentionBlockLength(self, sloppy=False):
     # num_threads > 1.
     # Explicit coordination should result in `Dataset.interleave()` behavior
-    with self.cached_session() as sess:
-      self._clear_coordination_events()
-      done_first_event = False
-      sess.run(
-          self.init_op,
-          feed_dict={
-              self.input_values: [4, 5, 6],
-              self.cycle_length: 2,
-              self.block_length: 2,
-              self.sloppy: sloppy,
-              self.buffer_output_elements: 1,
-              self.prefetch_input_elements: 1,
-          })
-      for i, expected_element in enumerate(
-          self._interleave([[4] * 4, [5] * 5, [6] * 6] * self.repeat_count, 2,
-                           2)):
-        self.write_coordination_events[expected_element].set()
-        if done_first_event:  # First event starts the worker threads.
-          self.read_coordination_events[expected_element].acquire()
-        actual_element = self.evaluate(self.next_element)
-        if not done_first_event:
-          done_first_event = True
-          self.read_coordination_events[expected_element].acquire()
-        self.assertEqual(expected_element * expected_element, actual_element,
-                         "At index %s: %s expected, got: %s" %
-                         (i, expected_element, actual_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(self.next_element)
+    self._clear_coordination_events()
+    done_first_event = False
+    next_element = self.getNext(
+        self.dataset_fn(
+            input_values=np.int64([4, 5, 6]),
+            cycle_length=2,
+            block_length=2,
+            sloppy=sloppy,
+            buffer_output_elements=1,
+            prefetch_input_elements=1))
+    for i, expected_element in enumerate(
+        self._interleave([[4] * 4, [5] * 5, [6] * 6] * self.repeat_count, 2,
+                         2)):
+      self.write_coordination_events[expected_element].set()
+      if done_first_event:  # First event starts the worker threads.
+        self.read_coordination_events[expected_element].acquire()
+      actual_element = self.evaluate(next_element())
+      if not done_first_event:
+        done_first_event = True
+        self.read_coordination_events[expected_element].acquire()
+      self.assertEqual(
+          expected_element * expected_element, actual_element,
+          "At index %s: %s expected, got: %s" % (i, expected_element,
+                                                 actual_element))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
   def testTwoThreadsNoContentionBlockLength(self):
     self._testTwoThreadsNoContentionBlockLength()
@@ -374,38 +354,36 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
     Args:
       sloppy: Whether to be sloppy or not.
     """
-    with self.cached_session() as sess:
-      self._clear_coordination_events()
-      done_first_event = False
-      sess.run(
-          self.init_op,
-          feed_dict={
-              self.input_values: [4, 5, 6],
-              self.cycle_length: 2,
-              self.block_length: 2,
-              self.sloppy: sloppy,
-              self.buffer_output_elements: 1,
-              self.prefetch_input_elements: 1,
-          })
-      for i, expected_element in enumerate(
-          self._interleave([[4] * 4, [5] * 5, [6] * 6] * self.repeat_count, 2,
-                           2)):
-        if done_first_event:  # First event starts the worker threads.
-          self._allow_all_map_threads()
-          self.read_coordination_events[expected_element].acquire()
-        else:
-          self.write_coordination_events[expected_element].set()
-        time.sleep(0.5)  # Sleep to consistently "avoid" the race condition.
-        actual_element = self.evaluate(self.next_element)
-        if not done_first_event:
-          done_first_event = True
-          self.assertTrue(
-              self.read_coordination_events[expected_element].acquire(False))
-        self.assertEqual(expected_element * expected_element, actual_element,
-                         "At index %s: %s expected, got: %s" %
-                         (i, expected_element, actual_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(self.next_element)
+    self._clear_coordination_events()
+    done_first_event = False
+    next_element = self.getNext(
+        self.dataset_fn(
+            input_values=np.int64([4, 5, 6]),
+            cycle_length=2,
+            block_length=2,
+            sloppy=sloppy,
+            buffer_output_elements=1,
+            prefetch_input_elements=1))
+    for i, expected_element in enumerate(
+        self._interleave([[4] * 4, [5] * 5, [6] * 6] * self.repeat_count, 2,
+                         2)):
+      if done_first_event:  # First event starts the worker threads.
+        self._allow_all_map_threads()
+        self.read_coordination_events[expected_element].acquire()
+      else:
+        self.write_coordination_events[expected_element].set()
+      time.sleep(0.5)  # Sleep to consistently "avoid" the race condition.
+      actual_element = self.evaluate(next_element())
+      if not done_first_event:
+        done_first_event = True
+        self.assertTrue(
+            self.read_coordination_events[expected_element].acquire(False))
+      self.assertEqual(
+          expected_element * expected_element, actual_element,
+          "At index %s: %s expected, got: %s" % (i, expected_element,
+                                                 actual_element))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
   def testTwoThreadsNoContentionWithRacesAndBlocking(self):
     self._testTwoThreadsNoContentionWithRacesAndBlocking()
@@ -414,21 +392,18 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
     self._testTwoThreadsNoContentionWithRacesAndBlocking(sloppy=True)
 
   def _testEmptyInput(self, sloppy=False):
-    with self.cached_session() as sess:
-      # Empty input.
-      self._clear_coordination_events()
-      sess.run(
-          self.init_op,
-          feed_dict={
-              self.input_values: [],
-              self.cycle_length: 2,
-              self.block_length: 3,
-              self.sloppy: sloppy,
-              self.buffer_output_elements: 1,
-              self.prefetch_input_elements: 0,
-          })
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(self.next_element)
+    # Empty input.
+    self._clear_coordination_events()
+    next_element = self.getNext(
+        self.dataset_fn(
+            input_values=np.int64([]),
+            cycle_length=2,
+            block_length=3,
+            sloppy=sloppy,
+            buffer_output_elements=1,
+            prefetch_input_elements=0))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
   def testEmptyInput(self):
     self._testEmptyInput()
@@ -438,20 +413,17 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
 
   def _testNonEmptyInputIntoEmptyOutputs(self, sloppy=False):
     # Non-empty input leading to empty output.
-    with self.cached_session() as sess:
-      self._clear_coordination_events()
-      sess.run(
-          self.init_op,
-          feed_dict={
-              self.input_values: [0, 0, 0],
-              self.cycle_length: 2,
-              self.block_length: 3,
-              self.sloppy: sloppy,
-              self.buffer_output_elements: 1,
-              self.prefetch_input_elements: 0,
-          })
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(self.next_element)
+    self._clear_coordination_events()
+    next_element = self.getNext(
+        self.dataset_fn(
+            input_values=np.int64([0, 0, 0]),
+            cycle_length=2,
+            block_length=3,
+            sloppy=sloppy,
+            buffer_output_elements=1,
+            prefetch_input_elements=0))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
   def testNonEmptyInputIntoEmptyOutputs(self):
     self._testNonEmptyInputIntoEmptyOutputs()
@@ -462,35 +434,33 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
   def _testPartiallyEmptyOutputs(self, sloppy=False, prefetch_input_elements=1):
     race_indices = {2, 8, 14}  # Sequence points when sloppy mode has race conds
     # Mixture of non-empty and empty interleaved datasets.
-    with self.cached_session() as sess:
-      self._clear_coordination_events()
-      done_first_event = False
-      sess.run(
-          self.init_op,
-          feed_dict={
-              self.input_values: [4, 0, 6],
-              self.cycle_length: 2,
-              self.block_length: 1,
-              self.sloppy: sloppy,
-              self.buffer_output_elements: 1,
-              self.prefetch_input_elements: prefetch_input_elements,
-          })
-      for i, expected_element in enumerate(
-          self._interleave([[4] * 4, [], [6] * 6] * self.repeat_count, 2, 1)):
-        self.write_coordination_events[expected_element].set()
-        # First event starts the worker threads. Additionally, when running the
-        # sloppy case with prefetch_input_elements=0, we get stuck if we wait
-        # for the read coordination event for certain event orderings in the
-        # presence of finishing iterators.
-        if done_first_event and not (sloppy and (i in race_indices)):
-          self.read_coordination_events[expected_element].acquire()
-        actual_element = self.evaluate(self.next_element)
-        if not done_first_event or (sloppy and (i in race_indices)):
-          done_first_event = True
-          self.read_coordination_events[expected_element].acquire()
-        self.assertEqual(expected_element * expected_element, actual_element,
-                         "At index %s: %s expected, got: %s" %
-                         (i, expected_element, actual_element))
+    self._clear_coordination_events()
+    done_first_event = False
+    next_element = self.getNext(
+        self.dataset_fn(
+            input_values=np.int64([4, 0, 6]),
+            cycle_length=2,
+            block_length=1,
+            sloppy=sloppy,
+            buffer_output_elements=1,
+            prefetch_input_elements=prefetch_input_elements))
+    for i, expected_element in enumerate(
+        self._interleave([[4] * 4, [], [6] * 6] * self.repeat_count, 2, 1)):
+      self.write_coordination_events[expected_element].set()
+      # First event starts the worker threads. Additionally, when running the
+      # sloppy case with prefetch_input_elements=0, we get stuck if we wait
+      # for the read coordination event for certain event orderings in the
+      # presence of finishing iterators.
+      if done_first_event and not (sloppy and (i in race_indices)):
+        self.read_coordination_events[expected_element].acquire()
+      actual_element = self.evaluate(next_element())
+      if not done_first_event or (sloppy and (i in race_indices)):
+        done_first_event = True
+        self.read_coordination_events[expected_element].acquire()
+      self.assertEqual(
+          expected_element * expected_element, actual_element,
+          "At index %s: %s expected, got: %s" % (i, expected_element,
+                                                 actual_element))
 
   def testPartiallyEmptyOutputs(self):
     self._testPartiallyEmptyOutputs()
@@ -501,89 +471,81 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
   def testDelayedOutputSloppy(self):
     # Explicitly control the sequence of events to ensure we correctly avoid
     # head-of-line blocking.
-    with self.cached_session() as sess:
-      self._clear_coordination_events()
-      sess.run(
-          self.init_op,
-          feed_dict={
-              self.input_values: [4, 5, 6],
-              self.cycle_length: 2,
-              self.block_length: 1,
-              self.sloppy: True,
-              self.buffer_output_elements: 1,
-              self.prefetch_input_elements: 0,
-          })
-
-      mis_ordering = [
-          4, 4, 5, 4, 5, 5, 4, 5, 6, 6, 6, 5, 4, 4, 6, 6, 4, 4, 6, 5, 6, 6, 6,
-          6, 5, 5, 5, 5, 6, 6
-      ]
-      for element in mis_ordering:
-        self.write_coordination_events[element].set()
-        self.assertEqual(element * element, self.evaluate(self.next_element))
-        self.assertTrue(self.read_coordination_events[element].acquire(False))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(self.next_element)
+    self._clear_coordination_events()
+    next_element = self.getNext(
+        self.dataset_fn(
+            input_values=np.int64([4, 5, 6]),
+            cycle_length=2,
+            block_length=1,
+            sloppy=True,
+            buffer_output_elements=1,
+            prefetch_input_elements=0))
+
+    mis_ordering = [
+        4, 4, 5, 4, 5, 5, 4, 5, 6, 6, 6, 5, 4, 4, 6, 6, 4, 4, 6, 5, 6, 6, 6, 6,
+        5, 5, 5, 5, 6, 6
+    ]
+    for element in mis_ordering:
+      self.write_coordination_events[element].set()
+      self.assertEqual(element * element, self.evaluate(next_element()))
+      self.assertTrue(self.read_coordination_events[element].acquire(False))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
   def testBlockLengthWithContentionSloppy(self):
-    with self.cached_session() as sess:
-      self._clear_coordination_events()
-      done_first_event = False
-      sess.run(
-          self.init_op,
-          feed_dict={
-              self.input_values: [4, 5, 6],
-              self.cycle_length: 2,
-              self.block_length: 1,
-              self.sloppy: True,
-              self.buffer_output_elements: 1,
-              self.prefetch_input_elements: 1,
-          })
-      # Test against a generating sequence that differs from the uncontended
-      # case, in order to prove sloppy correctness.
-      for i, expected_element in enumerate(
-          self._interleave(
-              [[4] * 4, [5] * 5, [6] * 6] * self.repeat_count,
-              cycle_length=2,
-              block_length=3)):
-        self.write_coordination_events[expected_element].set()
-        if done_first_event:  # First event starts the worker threads.
-          self.read_coordination_events[expected_element].acquire()
-        actual_element = self.evaluate(self.next_element)
-        if not done_first_event:
-          self.read_coordination_events[expected_element].acquire()
-          done_first_event = True
-        self.assertEqual(expected_element * expected_element, actual_element,
-                         "At index %s: %s expected, got: %s" %
-                         (i, expected_element, actual_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(self.next_element)
+    self._clear_coordination_events()
+    done_first_event = False
+    next_element = self.getNext(
+        self.dataset_fn(
+            input_values=np.int64([4, 5, 6]),
+            cycle_length=2,
+            block_length=1,
+            sloppy=True,
+            buffer_output_elements=1,
+            prefetch_input_elements=1))
+    # Test against a generating sequence that differs from the uncontended
+    # case, in order to prove sloppy correctness.
+    for i, expected_element in enumerate(
+        self._interleave(
+            [[4] * 4, [5] * 5, [6] * 6] * self.repeat_count,
+            cycle_length=2,
+            block_length=3)):
+      self.write_coordination_events[expected_element].set()
+      if done_first_event:  # First event starts the worker threads.
+        self.read_coordination_events[expected_element].acquire()
+      actual_element = self.evaluate(next_element())
+      if not done_first_event:
+        self.read_coordination_events[expected_element].acquire()
+        done_first_event = True
+      self.assertEqual(
+          expected_element * expected_element, actual_element,
+          "At index %s: %s expected, got: %s" % (i, expected_element,
+                                                 actual_element))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
   def _testEarlyExit(self, sloppy=False):
     # Exiting without consuming all input should not block
-    with self.cached_session() as sess:
-      self._clear_coordination_events()
-      sess.run(
-          self.init_op,
-          feed_dict={
-              self.input_values: [4, 5, 6],
-              self.cycle_length: 3,
-              self.block_length: 2,
-              self.sloppy: sloppy,
-              self.buffer_output_elements: 1,
-              self.prefetch_input_elements: 0,
-          })
-      for i in range(4, 7):
-        self.write_coordination_events[i].set()
-      elem = self.evaluate(self.next_element)  # Start all workers
-      # Allow the one successful worker to progress beyond the py_func again.
-      elem = int(math.sqrt(elem))
-      self.write_coordination_events[elem].set()
-      self.read_coordination_events[elem].acquire()
-      # Allow the prefetch to succeed
-      for i in range(4, 7):
-        self.read_coordination_events[i].acquire()
-        self.write_coordination_events[i].set()
+    self._clear_coordination_events()
+    next_element = self.getNext(
+        self.dataset_fn(
+            input_values=np.int64([4, 5, 6]),
+            cycle_length=3,
+            block_length=2,
+            sloppy=sloppy,
+            buffer_output_elements=1,
+            prefetch_input_elements=0))
+    for i in range(4, 7):
+      self.write_coordination_events[i].set()
+    elem = self.evaluate(next_element())  # Start all workers
+    # Allow the one successful worker to progress beyond the py_func again.
+    elem = int(math.sqrt(elem))
+    self.write_coordination_events[elem].set()
+    self.read_coordination_events[elem].acquire()
+    # Allow the prefetch to succeed
+    for i in range(4, 7):
+      self.read_coordination_events[i].acquire()
+      self.write_coordination_events[i].set()
 
   def testEarlyExit(self):
     self._testEarlyExit()
@@ -603,12 +565,10 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
     dataset = dataset.apply(
         interleave_ops.parallel_interleave(
             interleave_fn, cycle_length=16, block_length=2, sloppy=sloppy))
-    iterator = dataset_ops.make_one_shot_iterator(dataset)
-
-    with self.cached_session() as sess:
-      output_values = []
-      for _ in range(30):
-        output_values.append(self.evaluate(iterator.get_next()))
+    get_next = self.getNext(dataset)
+    output_values = []
+    for _ in range(30):
+      output_values.append(self.evaluate(get_next()))
 
     expected_values = self._interleave(
         [[4] * 4, [5] * 5, [6] * 6] * self.repeat_count, 1, 2)
@@ -629,53 +589,47 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
       return dataset_ops.Dataset.from_tensor_slices(
           sparse_ops.sparse_to_dense(x.indices, x.dense_shape, x.values))
 
-    dataset = dataset_ops.Dataset.range(10).map(_map_fn)
-    iterator = dataset_ops.make_initializable_iterator(dataset.apply(
-        interleave_ops.parallel_interleave(_interleave_fn, cycle_length=1)))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.evaluate(init_op)
-      for i in range(10):
-        for j in range(2):
-          expected = [i, 0] if j % 2 == 0 else [0, -i]
-          self.assertAllEqual(expected, self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+    dataset = dataset_ops.Dataset.range(10).map(_map_fn).apply(
+        interleave_ops.parallel_interleave(_interleave_fn, cycle_length=1))
+    get_next = self.getNext(dataset)
+
+    for i in range(10):
+      for j in range(2):
+        expected = [i, 0] if j % 2 == 0 else [0, -i]
+        self.assertAllEqual(expected, self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   def testErrorsInOutputFn(self):
-    with self.cached_session() as sess:
-      self._clear_coordination_events()
-      sess.run(
-          self.init_op,
-          feed_dict={
-              self.input_values: [4, 5, 6],
-              self.cycle_length: 2,
-              self.block_length: 1,
-              self.sloppy: False,
-              self.buffer_output_elements: 1,
-              self.prefetch_input_elements: 0,
-          })
-
-      except_on_element_indices = set([3])
-
-      for i, expected_element in enumerate(
-          self._interleave([[4] * 4, [5] * 5, [6] * 6] * self.repeat_count, 2,
-                           1)):
-        if i in except_on_element_indices:
-          self.error = ValueError()
-          self.write_coordination_events[expected_element].set()
-          with self.assertRaises(errors.InvalidArgumentError):
-            self.evaluate(self.next_element)
-        else:
-          self.write_coordination_events[expected_element].set()
-          actual_element = self.evaluate(self.next_element)
-          self.assertEqual(expected_element * expected_element, actual_element,
-                           "At index %s: %s expected, got: %s" %
-                           (i, expected_element, actual_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(self.next_element)
+    self._clear_coordination_events()
+    next_element = self.getNext(
+        self.dataset_fn(
+            input_values=np.int64([4, 5, 6]),
+            cycle_length=2,
+            block_length=1,
+            sloppy=False,
+            buffer_output_elements=1,
+            prefetch_input_elements=0))
+
+    except_on_element_indices = set([3])
+
+    for i, expected_element in enumerate(
+        self._interleave([[4] * 4, [5] * 5, [6] * 6] * self.repeat_count, 2,
+                         1)):
+      if i in except_on_element_indices:
+        self.error = ValueError()
+        self.write_coordination_events[expected_element].set()
+        with self.assertRaises(errors.InvalidArgumentError):
+          self.evaluate(next_element())
+      else:
+        self.write_coordination_events[expected_element].set()
+        actual_element = self.evaluate(next_element())
+        self.assertEqual(
+            expected_element * expected_element, actual_element,
+            "At index %s: %s expected, got: %s" % (i, expected_element,
+                                                   actual_element))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
   def testErrorsInInputFn(self):
 
@@ -692,41 +646,35 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
       dataset = dataset.repeat(x)
       return dataset
 
-    self.dataset = (
-        dataset_ops.Dataset.from_tensor_slices(self.input_values).map(map_fn)
-        .repeat(self.repeat_count).apply(
-            interleave_ops.parallel_interleave(interleave_fn, self.cycle_length,
-                                               self.block_length, self.sloppy,
-                                               self.buffer_output_elements,
-                                               self.prefetch_input_elements)))
-
-    self.iterator = dataset_ops.make_initializable_iterator(self.dataset)
-    self.init_op = self.iterator.initializer
-    self.next_element = self.iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(
-          self.init_op,
-          feed_dict={
-              self.input_values: [4, 5, 6],
-              self.cycle_length: 2,
-              self.block_length: 1,
-              self.sloppy: False,
-              self.buffer_output_elements: 1,
-              self.prefetch_input_elements: 0,
-          })
-      for i, expected_element in enumerate(
-          self._interleave([[4] * 4, [5], [6] * 6] * self.repeat_count, 2, 1)):
-        if expected_element == 5:
-          with self.assertRaises(errors.InvalidArgumentError):
-            self.evaluate(self.next_element)
-        else:
-          actual_element = self.evaluate(self.next_element)
-          self.assertEqual(expected_element, actual_element,
-                           "At index %s: %s expected, got: %s" %
-                           (i, expected_element, actual_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(self.next_element)
+    def dataset_fn(input_values, cycle_length, block_length, sloppy,
+                   buffer_output_elements, prefetch_input_elements):
+      return dataset_ops.Dataset.from_tensor_slices(input_values).map(
+          map_fn).repeat(self.repeat_count).apply(
+              interleave_ops.parallel_interleave(
+                  interleave_fn, cycle_length, block_length, sloppy,
+                  buffer_output_elements, prefetch_input_elements))
+
+    next_element = self.getNext(
+        dataset_fn(
+            input_values=np.int64([4, 5, 6]),
+            cycle_length=2,
+            block_length=1,
+            sloppy=False,
+            buffer_output_elements=1,
+            prefetch_input_elements=0))
+    for i, expected_element in enumerate(
+        self._interleave([[4] * 4, [5], [6] * 6] * self.repeat_count, 2, 1)):
+      if expected_element == 5:
+        with self.assertRaises(errors.InvalidArgumentError):
+          self.evaluate(next_element())
+      else:
+        actual_element = self.evaluate(next_element())
+        self.assertEqual(
+            expected_element, actual_element,
+            "At index %s: %s expected, got: %s" % (i, expected_element,
+                                                   actual_element))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
   def testErrorsInInterleaveFn(self):
 
@@ -741,41 +689,35 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
       dataset = dataset.repeat(y)
       return dataset
 
-    self.dataset = (
-        dataset_ops.Dataset.from_tensor_slices(self.input_values)
-        .repeat(self.repeat_count).apply(
-            interleave_ops.parallel_interleave(interleave_fn, self.cycle_length,
-                                               self.block_length, self.sloppy,
-                                               self.buffer_output_elements,
-                                               self.prefetch_input_elements)))
-
-    self.iterator = dataset_ops.make_initializable_iterator(self.dataset)
-    self.init_op = self.iterator.initializer
-    self.next_element = self.iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(
-          self.init_op,
-          feed_dict={
-              self.input_values: [4, 5, 6],
-              self.cycle_length: 2,
-              self.block_length: 1,
-              self.sloppy: False,
-              self.buffer_output_elements: 1,
-              self.prefetch_input_elements: 0,
-          })
-      for i, expected_element in enumerate(
-          self._interleave([[4] * 4, [5], [6] * 6] * self.repeat_count, 2, 1)):
-        if expected_element == 5:
-          with self.assertRaises(errors.InvalidArgumentError):
-            self.evaluate(self.next_element)
-        else:
-          actual_element = self.evaluate(self.next_element)
-          self.assertEqual(expected_element, actual_element,
-                           "At index %s: %s expected, got: %s" %
-                           (i, expected_element, actual_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(self.next_element)
+    def dataset_fn(input_values, cycle_length, block_length, sloppy,
+                   buffer_output_elements, prefetch_input_elements):
+      return dataset_ops.Dataset.from_tensor_slices(input_values).repeat(
+          self.repeat_count).apply(
+              interleave_ops.parallel_interleave(
+                  interleave_fn, cycle_length, block_length, sloppy,
+                  buffer_output_elements, prefetch_input_elements))
+
+    next_element = self.getNext(
+        dataset_fn(
+            input_values=np.int64([4, 5, 6]),
+            cycle_length=2,
+            block_length=1,
+            sloppy=False,
+            buffer_output_elements=1,
+            prefetch_input_elements=0))
+    for i, expected_element in enumerate(
+        self._interleave([[4] * 4, [5], [6] * 6] * self.repeat_count, 2, 1)):
+      if expected_element == 5:
+        with self.assertRaises(errors.InvalidArgumentError):
+          self.evaluate(next_element())
+      else:
+        actual_element = self.evaluate(next_element())
+        self.assertEqual(
+            expected_element, actual_element,
+            "At index %s: %s expected, got: %s" % (i, expected_element,
+                                                   actual_element))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
   def testShutdownRace(self):
     dataset = dataset_ops.Dataset.range(20)
@@ -788,21 +730,17 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
             buffer_output_elements=1,
             prefetch_input_elements=0))
     dataset = dataset.batch(32)
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    next_element = iterator.get_next()
 
     results = []
-    with self.cached_session() as sess:
-      for _ in range(2):
-        elements = []
-        self.evaluate(iterator.initializer)
-        try:
-          while True:
-            elements.extend(self.evaluate(next_element))
-        except errors.OutOfRangeError:
-          pass
-        results.append(elements)
-
+    for _ in range(2):
+      elements = []
+      next_element = self.getNext(dataset)
+      try:
+        while True:
+          elements.extend(self.evaluate(next_element()))
+      except errors.OutOfRangeError:
+        pass
+      results.append(elements)
     self.assertAllEqual(results[0], results[1])
 
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py
index 76e0d4d72a6d22f24da9c762770d1592ba67b737..4dbb188f2cffa08ff47cb4bd85ea6d3672edd222 100644
--- a/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py
@@ -27,6 +27,7 @@ from tensorflow.core.example import feature_pb2
 from tensorflow.python.data.experimental.ops import parsing_ops as contrib_parsing_ops
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
@@ -671,8 +672,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
     for batch_size in (1, 10, 20, 100, 256):
       self._testSerializedContainingVarLenDenseLargerBatch(batch_size)
 
-  @test_util.run_deprecated_v1
-  def testSkipEagerSerializedShapeMismatch(self):
+  def testSerializedShapeMismatch(self):
     aname = "a"
     bname = "b"
     cname = "c"
@@ -695,19 +695,34 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
     ]
 
     serialized = [m.SerializeToString() for m in original]
-    self._test(
-        ops.convert_to_tensor(serialized), {
-            aname:
-                parsing_ops.FixedLenSequenceFeature((2, 1),
-                                                    dtype=dtypes.float32,
-                                                    allow_missing=True,
-                                                    default_value=[]),
-            bname:
-                parsing_ops.FixedLenSequenceFeature(
-                    (2, 1, 1), dtype=dtypes.string, allow_missing=True),
-        },
-        expected_err=(ValueError,
-                      "Cannot reshape a tensor with 0 elements to shape"))
+    if context.executing_eagerly():
+      self._test(
+          ops.convert_to_tensor(serialized), {
+              aname:
+                  parsing_ops.FixedLenSequenceFeature((2, 1),
+                                                      dtype=dtypes.float32,
+                                                      allow_missing=True,
+                                                      default_value=[]),
+              bname:
+                  parsing_ops.FixedLenSequenceFeature(
+                      (2, 1, 1), dtype=dtypes.string, allow_missing=True),
+          },
+          expected_err=(errors_impl.InvalidArgumentError,
+                        "Input to reshape is a tensor with 0 values"))
+    else:
+      self._test(
+          ops.convert_to_tensor(serialized), {
+              aname:
+                  parsing_ops.FixedLenSequenceFeature((2, 1),
+                                                      dtype=dtypes.float32,
+                                                      allow_missing=True,
+                                                      default_value=[]),
+              bname:
+                  parsing_ops.FixedLenSequenceFeature(
+                      (2, 1, 1), dtype=dtypes.string, allow_missing=True),
+          },
+          expected_err=(ValueError,
+                        "Cannot reshape a tensor with 0 elements to shape"))
 
   @test_util.run_deprecated_v1
   def testSerializedContainingVarLenDense(self):
diff --git a/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py b/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py
index 80bd43e9adee52afefc6a6c9866bab671aa4a731..238c5cd5060cafe7590fde72e4ac1e7b9b4ea6f4 100644
--- a/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
+# TODO(b/117581999): add eager coverage when supported.
 class PrefetchToDeviceTest(test_base.DatasetTestBase):
 
   @test_util.run_deprecated_v1
diff --git a/tensorflow/python/data/experimental/kernel_tests/reader_dataset_ops_test_base.py b/tensorflow/python/data/experimental/kernel_tests/reader_dataset_ops_test_base.py
index 77df8310d439b458c691ccbfb1d6015859c7d015..f36f94c02fec98f95d9cb718ae2d1dd19905b454 100644
--- a/tensorflow/python/data/experimental/kernel_tests/reader_dataset_ops_test_base.py
+++ b/tensorflow/python/data/experimental/kernel_tests/reader_dataset_ops_test_base.py
@@ -26,12 +26,9 @@ from tensorflow.core.example import example_pb2
 from tensorflow.core.example import feature_pb2
 from tensorflow.python.data.experimental.ops import readers
 from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.data.ops import readers as core_readers
-from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.lib.io import python_io
-from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.util import compat
 
@@ -150,26 +147,25 @@ class MakeBatchedFeaturesDatasetTestBase(test_base.DatasetTestBase):
       writer.close()
     return filenames
 
-  def _run_actual_batch(self, outputs, sess, label_key_provided=False):
+  def _run_actual_batch(self, outputs, label_key_provided=False):
     if label_key_provided:
       # outputs would be a tuple of (feature dict, label)
-      label_op = outputs[1]
-      features_op = outputs[0]
+      features, label = self.evaluate(outputs())
     else:
-      features_op = outputs
-      label_op = features_op["label"]
-    file_op = features_op["file"]
-    keywords_indices_op = features_op["keywords"].indices
-    keywords_values_op = features_op["keywords"].values
-    keywords_dense_shape_op = features_op["keywords"].dense_shape
-    record_op = features_op["record"]
-    return sess.run([
-        file_op, keywords_indices_op, keywords_values_op,
-        keywords_dense_shape_op, record_op, label_op
+      features = self.evaluate(outputs())
+      label = features["label"]
+    file_out = features["file"]
+    keywords_indices = features["keywords"].indices
+    keywords_values = features["keywords"].values
+    keywords_dense_shape = features["keywords"].dense_shape
+    record = features["record"]
+    return ([
+        file_out, keywords_indices, keywords_values, keywords_dense_shape,
+        record, label
     ])
 
-  def _next_actual_batch(self, sess, label_key_provided=False):
-    return self._run_actual_batch(self.outputs, sess, label_key_provided)
+  def _next_actual_batch(self, label_key_provided=False):
+    return self._run_actual_batch(self.outputs, label_key_provided)
 
   def _interleave(self, iterators, cycle_length):
     pending_iterators = iterators
@@ -251,7 +247,6 @@ class MakeBatchedFeaturesDatasetTestBase(test_base.DatasetTestBase):
       ]
 
   def verify_records(self,
-                     sess,
                      batch_size,
                      file_index=None,
                      num_epochs=1,
@@ -268,7 +263,7 @@ class MakeBatchedFeaturesDatasetTestBase(test_base.DatasetTestBase):
         num_epochs,
         cycle_length=interleave_cycle_length):
       actual_batch = self._next_actual_batch(
-          sess, label_key_provided=label_key_provided)
+          label_key_provided=label_key_provided)
       for i in range(len(expected_batch)):
         self.assertAllEqual(expected_batch[i], actual_batch[i])
 
@@ -323,21 +318,6 @@ class TFRecordDatasetTestBase(test_base.DatasetTestBase):
 
     self.test_filenames = self._createFiles()
 
-    self.filenames = array_ops.placeholder(dtypes.string, shape=[None])
-    self.num_epochs = array_ops.placeholder_with_default(
-        constant_op.constant(1, dtypes.int64), shape=[])
-    self.compression_type = array_ops.placeholder_with_default("", shape=[])
-    self.batch_size = array_ops.placeholder(dtypes.int64, shape=[])
-
-    repeat_dataset = core_readers.TFRecordDataset(
-        self.filenames, self.compression_type).repeat(self.num_epochs)
-    batch_dataset = repeat_dataset.batch(self.batch_size)
-
-    iterator = iterator_ops.Iterator.from_structure(batch_dataset.output_types)
-    self.init_op = iterator.make_initializer(repeat_dataset)
-    self.init_batch_op = iterator.make_initializer(batch_dataset)
-    self.get_next = iterator.get_next()
-
   def _record(self, f, r):
     return compat.as_bytes("Record %d of file %d" % (r, f))
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/rebatch_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/rebatch_dataset_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..28f9aad5b9dc0ad218ab71790c5c3c25c61824f3
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/rebatch_dataset_test.py
@@ -0,0 +1,320 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the private `_RebatchDataset` transformation."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.experimental.ops import scan_ops
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import nest
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+@parameterized.named_parameters(("WithDropRemainder", True),
+                                ("WithoutDropRemainder", False))
+@test_util.run_all_in_graph_and_eager_modes
+class RebatchDatasetTest(test_base.DatasetTestBase):
+
+  def testBasic(self, drop_remainder):
+    dataset = dataset_ops.Dataset.range(1024).batch(
+        32, drop_remainder=drop_remainder)
+    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    self.assertEqual(
+        [[32 if drop_remainder else None]],
+        [ts.as_list() for ts in nest.flatten(dataset.output_shapes)])
+    self.assertEqual(
+        [[8 if drop_remainder else None]],
+        [ts.as_list() for ts in nest.flatten(rebatched_dataset.output_shapes)])
+
+    expected_output = [[k for k in range(i, i + 8)] for i in range(0, 1024, 8)]  # pylint: disable=g-complex-comprehension
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+  def testScalarInputError(self, _):
+    dataset = dataset_ops.Dataset.range(1024)
+    with self.assertRaisesRegexp(ValueError, "at least one dimension"):
+      batching._RebatchDataset(dataset, num_workers=4)
+
+  def testNotDivisibleError(self, drop_remainder):
+    dataset = dataset_ops.Dataset.range(1024).batch(
+        32, drop_remainder=drop_remainder)
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 "not divisible by"):
+      rebatched_dataset = batching._RebatchDataset(dataset, num_workers=5)
+      next_element = self.getNext(rebatched_dataset)
+      self.evaluate(next_element())
+
+  def testTupleOutput(self, drop_remainder):
+    dataset = (
+        dataset_ops.Dataset.range(1024).map(lambda x: (x, x)).batch(
+            32, drop_remainder=drop_remainder))
+    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    expected_output = [([k for k in range(i, i + 8)],  # pylint: disable=g-complex-comprehension
+                        [k for k in range(i, i + 8)])
+                       for i in range(0, 1024, 8)]
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+  def testNestedDictionaryOutput(self, drop_remainder):
+    dataset = dataset_ops.Dataset.range(1024).map(
+        lambda x: {"a": x, "b": {"c": x}}).batch(
+            32, drop_remainder=drop_remainder)
+    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    expected_output = [{"a": [k for k in range(i, i + 8)],  # pylint: disable=g-complex-comprehension
+                        "b": {"c": [k for k in range(i, i + 8)]}}
+                       for i in range(0, 1024, 8)]
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+  def testFinalPartialBatchOriginal(self, drop_remainder):
+    dataset = dataset_ops.Dataset.range(1032).batch(
+        32, drop_remainder=drop_remainder)
+    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    self.assertEqual(
+        [[32 if drop_remainder else None]],
+        [ts.as_list() for ts in nest.flatten(dataset.output_shapes)])
+    self.assertEqual(
+        [[8 if drop_remainder else None]],
+        [ts.as_list() for ts in nest.flatten(rebatched_dataset.output_shapes)])
+
+    expected_output = [[k for k in range(i, i + 8)] for i in range(0, 1032, 8)]  # pylint: disable=g-complex-comprehension
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+  def testFinalPartialBatchAfterRebatch(self, drop_remainder):
+    dataset = dataset_ops.Dataset.range(34).batch(
+        32, drop_remainder=drop_remainder)
+    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    self.assertEqual(
+        [[32 if drop_remainder else None]],
+        [ts.as_list() for ts in nest.flatten(dataset.output_shapes)])
+    self.assertEqual(
+        [[8 if drop_remainder else None]],
+        [ts.as_list() for ts in nest.flatten(rebatched_dataset.output_shapes)])
+
+    expected_output = [[k for k in range(i, i + 8)] for i in range(0, 32, 8)]  # pylint: disable=g-complex-comprehension
+    if not drop_remainder:
+      expected_output += [[32, 33]]
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+  def testMultipleBatches(self, drop_remainder):
+    dataset = dataset_ops.Dataset.range(128).batch(
+        4, drop_remainder=drop_remainder)
+    dataset = dataset.batch(8, drop_remainder=drop_remainder)
+    self.assertEqual(
+        [[8, 4]] if drop_remainder else [[None, None]],
+        [ts.as_list() for ts in nest.flatten(dataset.output_shapes)])
+    # Each element is a list of 8 elements where each element is a list of 4.
+    expected_output = [[[j, j + 1, j + 2, j + 3]  # pylint: disable=g-complex-comprehension
+                        for j in range(i, i + 32, 4)]  # generates 8 elements
+                       for i in range(0, 128, 32)]
+    self.assertDatasetProduces(dataset, expected_output)
+
+    rebatched_dataset = batching._RebatchDataset(dataset, 4)
+    self.assertEqual(
+        [[2, 4]] if drop_remainder else [[None, None]],
+        [ts.as_list() for ts in nest.flatten(rebatched_dataset.output_shapes)])
+    # Each element is a list of 2 elements where each element is a list of 4.
+    expected_output = [[[j, j + 1, j + 2, j + 3]  # pylint: disable=g-complex-comprehension
+                        for j in range(i, i + 8, 4)]  # generates 2 elements
+                       for i in range(0, 128, 8)]
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+  def testMapAndBatch(self, drop_remainder):
+    dataset = dataset_ops.Dataset.range(1024).apply(
+        batching.map_and_batch(
+            math_ops.square, 32, drop_remainder=drop_remainder))
+    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    self.assertEqual(
+        [[32 if drop_remainder else None]],
+        [ts.as_list() for ts in nest.flatten(dataset.output_shapes)])
+    self.assertEqual(
+        [[8 if drop_remainder else None]],
+        [ts.as_list() for ts in nest.flatten(rebatched_dataset.output_shapes)])
+    expected_output = [[k**2 for k in range(i, i + 8)]  # pylint: disable=g-complex-comprehension
+                       for i in range(0, 1024, 8)]
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+  def testPaddedBatch(self, drop_remainder):
+    dataset = dataset_ops.Dataset.range(128).batch(4).padded_batch(
+        8, padded_shapes=[5], drop_remainder=drop_remainder)
+    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    self.assertEqual(
+        [[8, 5]] if drop_remainder else [[None, 5]],
+        [ts.as_list() for ts in nest.flatten(dataset.output_shapes)])
+    # Each element is a list of 8 elements in which each element is a list of 5
+    # elements, first four are numbers and the last one is a padded zero.
+    expected_output = [[[j, j + 1, j + 2, j + 3, 0]  # pylint: disable=g-complex-comprehension
+                        for j in range(i, i + 32, 4)]  # generates 8 elements
+                       for i in range(0, 128, 32)]
+    self.assertDatasetProduces(dataset, expected_output)
+    self.assertEqual(
+        [[2, 5]] if drop_remainder else [[None, 5]],
+        [ts.as_list() for ts in nest.flatten(rebatched_dataset.output_shapes)])
+    # Each element is a list of 2 elements in which each element is a list of 5
+    # elements, first four are numbers and the last one is a padded zero.
+    expected_output = [[[j, j + 1, j + 2, j + 3, 0]  # pylint: disable=g-complex-comprehension
+                        for j in range(i, i + 8, 4)]  # generates 2 elements
+                       for i in range(0, 128, 8)]
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+  def testConcatenate(self, drop_remainder):
+    dataset1 = dataset_ops.Dataset.range(64).batch(
+        8, drop_remainder=drop_remainder)
+    dataset2 = dataset_ops.Dataset.range(32).batch(
+        8, drop_remainder=drop_remainder)
+    dataset = dataset1.concatenate(dataset2)
+    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    self.assertEqual(
+        [[8 if drop_remainder else None]],
+        [ts.as_list() for ts in nest.flatten(dataset.output_shapes)])
+    self.assertEqual(
+        [[2 if drop_remainder else None]],
+        [ts.as_list() for ts in nest.flatten(rebatched_dataset.output_shapes)])
+    expected_output = ([[i, i + 1] for i in range(0, 64, 2)] +
+                       [[i, i + 1] for i in range(0, 32, 2)])
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+  def testConcatenateDifferentShapes(self, drop_remainder):
+    dataset1 = dataset_ops.Dataset.range(64).batch(
+        16, drop_remainder=drop_remainder)
+    dataset2 = dataset_ops.Dataset.range(32).batch(
+        8, drop_remainder=drop_remainder)
+    dataset = dataset1.concatenate(dataset2)
+    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    self.assertEqual(
+        [[None]], [ts.as_list() for ts in nest.flatten(dataset.output_shapes)])
+    self.assertEqual(
+        [[None]],
+        [ts.as_list() for ts in nest.flatten(rebatched_dataset.output_shapes)])
+    expected_output = ([[i, i + 1, i + 2, i + 3] for i in range(0, 64, 4)] +
+                       [[i, i + 1] for i in range(0, 32, 2)])
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+  def testZip(self, drop_remainder):
+    dataset1 = dataset_ops.Dataset.range(64).batch(
+        8, drop_remainder=drop_remainder)
+    dataset2 = dataset_ops.Dataset.range(32).batch(
+        8, drop_remainder=drop_remainder)
+    dataset = dataset_ops.Dataset.zip((dataset1, dataset2))
+    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    self.assertEqual(
+        [[8], [8]] if drop_remainder else [[None], [None]],
+        [ts.as_list() for ts in nest.flatten(dataset.output_shapes)])
+    self.assertEqual(
+        [[2], [2]] if drop_remainder else [[None], [None]],
+        [ts.as_list() for ts in nest.flatten(rebatched_dataset.output_shapes)])
+    expected_output = [([i, i + 1], [i, i + 1]) for i in range(0, 32, 2)]
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+  def testZipDifferentShapes(self, drop_remainder):
+    dataset1 = dataset_ops.Dataset.range(64).batch(
+        16, drop_remainder=drop_remainder)
+    dataset2 = dataset_ops.Dataset.range(32).batch(
+        8, drop_remainder=drop_remainder)
+    dataset = dataset_ops.Dataset.zip((dataset1, dataset2))
+    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    self.assertEqual(
+        [[16], [8]] if drop_remainder else [[None], [None]],
+        [ts.as_list() for ts in nest.flatten(dataset.output_shapes)])
+    self.assertEqual(
+        [[4], [2]] if drop_remainder else [[None], [None]],
+        [ts.as_list() for ts in nest.flatten(rebatched_dataset.output_shapes)])
+    expected_output = [([2 * i, 2 * i + 1, 2 * i + 2, 2 * i + 3], [i, i + 1])
+                       for i in range(0, 32, 2)]
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+  def testUnsupportedTransformError(self, drop_remainder):
+    dataset = dataset_ops.Dataset.range(1024).batch(
+        32, drop_remainder=drop_remainder).apply(
+            scan_ops.scan([0], lambda _, a: ([0], a)))
+    with self.assertRaises(errors.InvalidArgumentError):
+      rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+      next_element = self.getNext(rebatched_dataset)
+      self.evaluate(next_element())
+
+  def testFlatMapBatching(self, drop_remainder):
+    dataset = dataset_ops.Dataset.range(
+        2).flat_map(lambda _: dataset_ops.Dataset.range(32).batch(  # pylint: disable=g-long-lambda
+            32, drop_remainder=drop_remainder))
+    self.assertEqual(
+        [[32 if drop_remainder else None]],
+        [ts.as_list() for ts in nest.flatten(dataset.output_shapes)])
+    # Two elements where each element is range(32)
+    expected_output = [[k for k in range(32)] for _ in range(2)]  # pylint: disable=g-complex-comprehension
+    self.assertDatasetProduces(dataset, expected_output)
+
+    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    self.assertEqual(
+        [[8 if drop_remainder else None]],
+        [ts.as_list() for ts in nest.flatten(rebatched_dataset.output_shapes)])
+    # Two elements where each element is a list of 4 elements where each element
+    # is a list of 8.
+    expected_output = [[k for k in range(i, i + 8)]  # pylint: disable=g-complex-comprehension
+                       for _ in range(2)
+                       for i in range(0, 32, 8)]  # generates 4 elements
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+  def testInterleaveBatching(self, drop_remainder):
+    dataset = dataset_ops.Dataset.range(
+        2).interleave(lambda _: dataset_ops.Dataset.range(32).batch(  # pylint: disable=g-long-lambda
+            32, drop_remainder=drop_remainder), cycle_length=2)
+    self.assertEqual(
+        [[32 if drop_remainder else None]],
+        [ts.as_list() for ts in nest.flatten(dataset.output_shapes)])
+    # Two elements where each element is range(32)
+    expected_output = [[k for k in range(32)] for _ in range(2)]  # pylint: disable=g-complex-comprehension
+    self.assertDatasetProduces(dataset, expected_output)
+
+    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    self.assertEqual(
+        [[8 if drop_remainder else None]],
+        [ts.as_list() for ts in nest.flatten(rebatched_dataset.output_shapes)])
+    # List of 4 elements where each element is a list of 8 numbering from 0 to
+    # 31 repeated twice.
+    expected_output = [[k for k in range(i, i + 8)]  # pylint: disable=g-complex-comprehension
+                       for i in range(0, 32, 8)  # generates 4 elements
+                       for _ in range(2)]
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+  def testParallelInterleaveBatching(self, drop_remainder):
+    dataset = dataset_ops.Dataset.range(
+        2).interleave(lambda _: dataset_ops.Dataset.range(32).batch(  # pylint: disable=g-long-lambda
+            32, drop_remainder=drop_remainder), cycle_length=2,
+                      num_parallel_calls=2)
+    self.assertEqual(
+        [[32 if drop_remainder else None]],
+        [ts.as_list() for ts in nest.flatten(dataset.output_shapes)])
+    # Two elements where each element is range(32)
+    expected_output = [[k for k in range(32)] for _ in range(2)]  # pylint: disable=g-complex-comprehension
+    self.assertDatasetProduces(dataset, expected_output)
+
+    rebatched_dataset = batching._RebatchDataset(dataset, num_workers=4)
+    self.assertEqual(
+        [[8 if drop_remainder else None]],
+        [ts.as_list() for ts in nest.flatten(rebatched_dataset.output_shapes)])
+    # List of 4 elements where each element is a list of 8 numbering from 0 to
+    # 31 repeated twice in collated fashion i.e [0...8], [0...8] etc.
+    expected_output = [[k for k in range(i, i + 8)]  # pylint: disable=g-complex-comprehension
+                       for i in range(0, 32, 8)  # generates 4 elements
+                       for _ in range(2)]
+    self.assertDatasetProduces(rebatched_dataset, expected_output)
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/rejection_resample_test.py b/tensorflow/python/data/experimental/kernel_tests/rejection_resample_test.py
index 76f68f50c8188e58affc353e62b7ff8c952c4955..4d35b160fdc15e22b9b62718af9407978d20d7e2 100644
--- a/tensorflow/python/data/experimental/kernel_tests/rejection_resample_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/rejection_resample_test.py
@@ -17,11 +17,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import time
 
 from absl.testing import parameterized
 import numpy as np
-from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python.data.experimental.ops import resampling
 from tensorflow.python.data.kernel_tests import test_base
@@ -36,35 +34,12 @@ from tensorflow.python.platform import test
 from tensorflow.python.util import compat
 
 
-def _time_resampling(
-    test_obj, data_np, target_dist, init_dist, num_to_sample):
-  dataset = dataset_ops.Dataset.from_tensor_slices(data_np).repeat()
-
-  # Reshape distribution via rejection sampling.
-  dataset = dataset.apply(
-      resampling.rejection_resample(
-          class_func=lambda x: x,
-          target_dist=target_dist,
-          initial_dist=init_dist,
-          seed=142))
-
-  get_next = dataset_ops.make_one_shot_iterator(dataset).get_next()
-
-  with test_obj.test_session() as sess:
-    start_time = time.time()
-    for _ in xrange(num_to_sample):
-      sess.run(get_next)
-    end_time = time.time()
-
-  return end_time - start_time
-
-
+@test_util.run_all_in_graph_and_eager_modes
 class RejectionResampleTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   @parameterized.named_parameters(
       ("InitialDistributionKnown", True),
       ("InitialDistributionUnknown", False))
-  @test_util.run_deprecated_v1
   def testDistribution(self, initial_known):
     classes = np.random.randint(5, size=(20000,))  # Uniformly sampled
     target_dist = [0.9, 0.05, 0.05, 0.0, 0.0]
@@ -73,17 +48,17 @@ class RejectionResampleTest(test_base.DatasetTestBase, parameterized.TestCase):
     dataset = dataset_ops.Dataset.from_tensor_slices(classes).shuffle(
         200, seed=21).map(lambda c: (c, string_ops.as_string(c))).repeat()
 
-    get_next = dataset_ops.make_one_shot_iterator(dataset.apply(
-        resampling.rejection_resample(
-            target_dist=target_dist,
-            initial_dist=initial_dist,
-            class_func=lambda c, _: c,
-            seed=27))).get_next()
+    get_next = self.getNext(
+        dataset.apply(
+            resampling.rejection_resample(
+                target_dist=target_dist,
+                initial_dist=initial_dist,
+                class_func=lambda c, _: c,
+                seed=27)))
 
-    with self.cached_session() as sess:
-      returned = []
-      while len(returned) < 4000:
-        returned.append(sess.run(get_next))
+    returned = []
+    while len(returned) < 4000:
+      returned.append(self.evaluate(get_next()))
 
     returned_classes, returned_classes_and_data = zip(*returned)
     _, returned_data = zip(*returned_classes_and_data)
@@ -99,7 +74,6 @@ class RejectionResampleTest(test_base.DatasetTestBase, parameterized.TestCase):
   @parameterized.named_parameters(
       ("OnlyInitial", True),
       ("NotInitial", False))
-  @test_util.run_deprecated_v1
   def testEdgeCasesSampleFromInitialDataset(self, only_initial_dist):
     init_dist = [0.5, 0.5]
     target_dist = [0.5, 0.5] if only_initial_dist else [0.0, 1.0]
@@ -117,15 +91,13 @@ class RejectionResampleTest(test_base.DatasetTestBase, parameterized.TestCase):
             target_dist=target_dist,
             initial_dist=init_dist))
 
-    get_next = dataset_ops.make_one_shot_iterator(dataset).get_next()
+    get_next = self.getNext(dataset)
 
-    with self.cached_session() as sess:
-      returned = []
-      with self.assertRaises(errors.OutOfRangeError):
-        while True:
-          returned.append(sess.run(get_next))
+    returned = []
+    with self.assertRaises(errors.OutOfRangeError):
+      while True:
+        returned.append(self.evaluate(get_next()))
 
-  @test_util.run_deprecated_v1
   def testRandomClasses(self):
     init_dist = [0.25, 0.25, 0.25, 0.25]
     target_dist = [0.0, 0.0, 0.0, 1.0]
@@ -149,13 +121,12 @@ class RejectionResampleTest(test_base.DatasetTestBase, parameterized.TestCase):
             target_dist=target_dist,
             initial_dist=init_dist))
 
-    get_next = dataset_ops.make_one_shot_iterator(dataset).get_next()
+    get_next = self.getNext(dataset)
 
-    with self.cached_session() as sess:
-      returned = []
-      with self.assertRaises(errors.OutOfRangeError):
-        while True:
-          returned.append(sess.run(get_next))
+    returned = []
+    with self.assertRaises(errors.OutOfRangeError):
+      while True:
+        returned.append(self.evaluate(get_next()))
 
     classes, _ = zip(*returned)
     bincount = np.bincount(
@@ -165,22 +136,5 @@ class RejectionResampleTest(test_base.DatasetTestBase, parameterized.TestCase):
     self.assertAllClose(target_dist, bincount, atol=1e-2)
 
 
-class ResampleDatasetBenchmark(test.Benchmark):
-
-  def benchmarkResamplePerformance(self):
-    init_dist = [0.25, 0.25, 0.25, 0.25]
-    target_dist = [0.0, 0.0, 0.0, 1.0]
-    num_classes = len(init_dist)
-    # We don't need many samples to test a dirac-delta target distribution
-    num_samples = 1000
-    data_np = np.random.choice(num_classes, num_samples, p=init_dist)
-
-    resample_time = _time_resampling(
-        self, data_np, target_dist, init_dist, num_to_sample=1000)
-
-    self.report_benchmark(
-        iters=1000, wall_time=resample_time, name="benchmark_resample")
-
-
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/restructured_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/restructured_dataset_test.py
index 658e6120cf9e30d7f79e542c8df726d997b1abb9..ddac02b9e29fc54efd962d9697be66cd7e756354 100644
--- a/tensorflow/python/data/experimental/kernel_tests/restructured_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/restructured_dataset_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
+# TODO(b/117581999): Add eager specific test.
 class RestructuredDatasetTest(test_base.DatasetTestBase):
 
   @test_util.run_deprecated_v1
diff --git a/tensorflow/python/data/experimental/kernel_tests/scan_test.py b/tensorflow/python/data/experimental/kernel_tests/scan_test.py
index bd974b21e301806e5282c8970e091df684c85144..38e9b1e128157e4ff284ae0065ee474b20bad86c 100644
--- a/tensorflow/python/data/experimental/kernel_tests/scan_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/scan_test.py
@@ -24,7 +24,6 @@ import numpy as np
 from tensorflow.python.data.experimental.ops import scan_ops
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -35,48 +34,34 @@ from tensorflow.python.ops import script_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class ScanTest(test_base.DatasetTestBase):
 
   def _counting_dataset(self, start, scan_fn):
     return dataset_ops.Dataset.from_tensors(0).repeat().apply(
         scan_ops.scan(start, scan_fn))
 
-  @test_util.run_deprecated_v1
   def testCount(self):
     def make_scan_fn(step):
       return lambda state, _: (state + step, state)
 
-    start = array_ops.placeholder(dtypes.int32, shape=[])
-    step = array_ops.placeholder(dtypes.int32, shape=[])
-    take = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = dataset_ops.make_initializable_iterator(self._counting_dataset(
-        start, make_scan_fn(step)).take(take))
-    next_element = iterator.get_next()
-
-    with self.cached_session() as sess:
-
-      for start_val, step_val, take_val in [(0, 1, 10), (0, 1, 0), (10, 1, 10),
-                                            (10, 2, 10), (10, -1, 10),
-                                            (10, -2, 10)]:
-        sess.run(iterator.initializer,
-                 feed_dict={start: start_val, step: step_val, take: take_val})
-        for expected, _ in zip(
-            itertools.count(start_val, step_val), range(take_val)):
-          self.assertEqual(expected, self.evaluate(next_element))
-        with self.assertRaises(errors.OutOfRangeError):
-          self.evaluate(next_element)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testFibonacci(self):
-    iterator = dataset_ops.make_one_shot_iterator(
-        dataset_ops.Dataset.from_tensors(1).repeat(None).apply(
-            scan_ops.scan([0, 1], lambda a, _: ([a[1], a[0] + a[1]], a[1]))))
+    def dataset_fn(start, step, take):
+      return self._counting_dataset(start, make_scan_fn(step)).take(take)
+
+    for start_val, step_val, take_val in [(0, 1, 10), (0, 1, 0), (10, 1, 10),
+                                          (10, 2, 10), (10, -1, 10), (10, -2,
+                                                                      10)]:
+      next_element = self.getNext(dataset_fn(start_val, step_val, take_val))
+      for expected, _ in zip(
+          itertools.count(start_val, step_val), range(take_val)):
+        self.assertEqual(expected, self.evaluate(next_element()))
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(next_element())
 
-    if context.executing_eagerly():
-      next_element = iterator.get_next
-    else:
-      get_next = iterator.get_next()
-      next_element = lambda: get_next
+  def testFibonacci(self):
+    data = dataset_ops.Dataset.from_tensors(1).repeat(None).apply(
+        scan_ops.scan([0, 1], lambda a, _: ([a[1], a[0] + a[1]], a[1])))
+    next_element = self.getNext(data)
 
     self.assertEqual(1, self.evaluate(next_element()))
     self.assertEqual(1, self.evaluate(next_element()))
@@ -85,8 +70,8 @@ class ScanTest(test_base.DatasetTestBase):
     self.assertEqual(5, self.evaluate(next_element()))
     self.assertEqual(8, self.evaluate(next_element()))
 
-  @test_util.run_deprecated_v1
   def testSparseCount(self):
+
     def _sparse(i):
       return sparse_tensor.SparseTensorValue(
           indices=np.array([[0, 0]]),
@@ -96,27 +81,20 @@ class ScanTest(test_base.DatasetTestBase):
     def make_scan_fn(step):
       return lambda state, _: (_sparse(state.values[0] + step), state)
 
-    start = array_ops.placeholder(dtypes.int32, shape=[])
-    step = array_ops.placeholder(dtypes.int32, shape=[])
-    take = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = dataset_ops.make_initializable_iterator(self._counting_dataset(
-        _sparse(start), make_scan_fn(step)).take(take))
-    next_element = iterator.get_next()
-
-    with self.cached_session() as sess:
-
-      for start_val, step_val, take_val in [(0, 1, 10), (0, 1, 0), (10, 1, 10),
-                                            (10, 2, 10), (10, -1, 10),
-                                            (10, -2, 10)]:
-        sess.run(iterator.initializer,
-                 feed_dict={start: start_val, step: step_val, take: take_val})
-        for expected, _ in zip(
-            itertools.count(start_val, step_val), range(take_val)):
-          self.assertEqual(expected, self.evaluate(next_element).values[0])
-        with self.assertRaises(errors.OutOfRangeError):
-          self.evaluate(next_element)
-
-  @test_util.run_deprecated_v1
+    def dataset_fn(start, step, take):
+      return self._counting_dataset(_sparse(start),
+                                    make_scan_fn(step)).take(take)
+
+    for start_val, step_val, take_val in [(0, 1, 10), (0, 1, 0), (10, 1, 10),
+                                          (10, 2, 10), (10, -1, 10), (10, -2,
+                                                                      10)]:
+      next_element = self.getNext(dataset_fn(start_val, step_val, take_val))
+      for expected, _ in zip(
+          itertools.count(start_val, step_val), range(take_val)):
+        self.assertEqual(expected, self.evaluate(next_element()).values[0])
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(next_element())
+
   def testChangingStateShape(self):
     # Test the fixed-point shape invariant calculations: start with
     # initial values with known shapes, and use a scan function that
@@ -134,16 +112,14 @@ class ScanTest(test_base.DatasetTestBase):
     self.assertIs(None, dataset.output_shapes[0][1].ndims)
     self.assertEqual([], dataset.output_shapes[1].as_list())
 
-    iterator = dataset_ops.make_one_shot_iterator(dataset)
-    next_element = iterator.get_next()
+    next_element = self.getNext(dataset)
 
-    with self.cached_session() as sess:
-      for i in range(5):
-        (longer_vector_val, larger_rank_val), _ = self.evaluate(next_element)
-        self.assertAllEqual([0] * (2**i), longer_vector_val)
-        self.assertAllEqual(np.array(1, ndmin=i), larger_rank_val)
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
+    for i in range(5):
+      (longer_vector_val, larger_rank_val), _ = self.evaluate(next_element())
+      self.assertAllEqual([0] * (2**i), longer_vector_val)
+      self.assertAllEqual(np.array(1, ndmin=i), larger_rank_val)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
   def testIncorrectStateType(self):
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD b/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD
index 4a2e28f49649ea698e9d426d86dae4bb42cdebf9..caf571ef4ee643f39b39abd825a38e03dc34aaef 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD
@@ -93,6 +93,24 @@ py_test(
     ],
 )
 
+py_test(
+    name = "choose_fastest_dataset_serialization_test",
+    size = "small",
+    srcs = ["choose_fastest_dataset_serialization_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
+    deps = [
+        ":dataset_serialization_test_base",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
 py_test(
     name = "concatenate_dataset_serialization_test",
     size = "small",
@@ -390,6 +408,24 @@ py_test(
     ],
 )
 
+py_test(
+    name = "rebatch_dataset_serialization_test",
+    size = "small",
+    srcs = ["rebatch_dataset_serialization_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
+    deps = [
+        ":dataset_serialization_test_base",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python/data/experimental/ops:batching",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
 py_test(
     name = "padded_batch_dataset_serialization_test",
     size = "medium",
@@ -587,6 +623,24 @@ py_test(
     ],
 )
 
+py_test(
+    name = "shard_dataset_serialization_test",
+    size = "medium",
+    srcs = ["shard_dataset_serialization_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
+    deps = [
+        ":dataset_serialization_test_base",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
 py_test(
     name = "shuffle_and_repeat_dataset_serialization_test",
     size = "medium",
@@ -666,6 +720,25 @@ py_test(
     ],
 )
 
+py_test(
+    name = "take_while_dataset_serialization_test",
+    size = "small",
+    srcs = ["take_while_dataset_serialization_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_oss",
+        "no_pip",
+        "no_windows",
+    ],
+    deps = [
+        ":dataset_serialization_test_base",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python/data/experimental/ops:take_while_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
 py_test(
     name = "textline_dataset_serialization_test",
     size = "medium",
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/checkpoint_input_pipeline_hook_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/checkpoint_input_pipeline_hook_test.py
index 8cc66d0c29392b206015ad886780d854fb2b5d5c..84b8e5ca3647a0597f6823249743a678900751b8 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/checkpoint_input_pipeline_hook_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/checkpoint_input_pipeline_hook_test.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.data.experimental.ops import iterator_ops
-from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -35,7 +34,8 @@ from tensorflow_estimator.python.estimator import estimator
 from tensorflow_estimator.python.estimator import model_fn
 
 
-class CheckpointInputPipelineHookTest(test_base.DatasetTestBase):
+@test_util.run_v1_only('b/123904664')
+class CheckpointInputPipelineHookTest(test.TestCase):
 
   @staticmethod
   def _model_fn(features, labels, mode, config):
@@ -69,7 +69,6 @@ class CheckpointInputPipelineHookTest(test_base.DatasetTestBase):
   def _build_iterator_saver_hook(self, est):
     return iterator_ops.CheckpointInputPipelineHook(est)
 
-  @test_util.run_deprecated_v1
   def testReturnDatasetFromInputFn(self):
 
     def _input_fn():
@@ -82,7 +81,6 @@ class CheckpointInputPipelineHookTest(test_base.DatasetTestBase):
     est.train(_input_fn, steps=2, hooks=[self._build_iterator_saver_hook(est)])
     self.assertSequenceEqual(self._read_vars(est.model_dir), (4, 3))
 
-  @test_util.run_deprecated_v1
   def testBuildIteratorInInputFn(self):
 
     def _input_fn():
@@ -97,7 +95,6 @@ class CheckpointInputPipelineHookTest(test_base.DatasetTestBase):
     est.train(_input_fn, steps=2, hooks=[self._build_iterator_saver_hook(est)])
     self.assertSequenceEqual(self._read_vars(est.model_dir), (4, 3))
 
-  @test_util.run_deprecated_v1
   def testDoNotRestore(self):
 
     def _input_fn():
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/choose_fastest_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/choose_fastest_dataset_serialization_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..936dc2221490d32eb978cf3fe96de13b53b57f99
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/choose_fastest_dataset_serialization_test.py
@@ -0,0 +1,45 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the ZipDataset serialization."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.ops import optimization
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.platform import test
+
+
+class ChooseFastestDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def testCore(self):
+    num_outputs = 10
+    batch_size = 2
+
+    def build_ds():
+      dataset = dataset_ops.Dataset.range(num_outputs)
+      map_fn = lambda x: x * 2
+      return optimization._ChooseFastestDataset([  # pylint: disable=protected-access
+          dataset.map(map_fn).batch(batch_size),
+          dataset.batch(batch_size).map(map_fn)
+      ])
+
+    self.run_core_tests(build_ds, None, num_outputs // 2)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/dataset_serialization_test_base.py b/tensorflow/python/data/experimental/kernel_tests/serialization/dataset_serialization_test_base.py
index bdbd8702b7f8d315a730c5cd2b000218ea5e19be..ca45ecca4c734f6a896c093921844c425cbba7e6 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/dataset_serialization_test_base.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/dataset_serialization_test_base.py
@@ -23,7 +23,6 @@ import os
 import numpy as np
 
 from tensorflow.python.data.experimental.ops import iterator_ops as contrib_iterator_ops
-from tensorflow.python.data.experimental.ops.optimization_options import OptimizationOptions
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.framework import dtypes
@@ -78,7 +77,6 @@ class DatasetSerializationTestBase(test.TestCase):
     # NOTE: We disable all default optimizations in serialization tests in order
     # to test the actual dataset in question.
     options = dataset_ops.Options()
-    options.experimental_optimization = OptimizationOptions()
     options.experimental_optimization.apply_default_optimizations = False
 
     def ds_fn1_no_opt():
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/optimize_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/optimize_dataset_serialization_test.py
index ed4a1da59679c8c85141cb38e46ad95441b71b73..aaa46bacefed1865ff85bf5478fbd0f22c65c227 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/optimize_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/optimize_dataset_serialization_test.py
@@ -34,6 +34,20 @@ class OptimizeDatasetSerializationTest(
 
     self.run_core_tests(lambda: build_dataset(200, 10), None, 20)
 
+  def testWithNewFunction(self):
+    """Tests that optimized datasets with new functions work."""
+
+    def build_dataset():
+      dataset = dataset_ops.Dataset.range(100)
+      dataset = dataset.map(lambda x: x)
+      dataset = dataset.batch(5)
+      # map_vectorization adds a new vectorized function to the function
+      # library.
+      dataset = dataset.apply(optimization.optimize(["map_vectorization"]))
+      return dataset
+
+    self.run_core_tests(build_dataset, None, 20)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/rebatch_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/rebatch_dataset_serialization_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b30db589069a26cf9f5322e3bde498413ca39108
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/rebatch_dataset_serialization_test.py
@@ -0,0 +1,41 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the _RebatchDataset serialization."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.platform import test
+
+
+class RebatchDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def testCore(self):
+
+    def build_dataset(num_elements, batch_size):
+      return batching._RebatchDataset(
+          dataset_ops.Dataset.range(num_elements).batch(
+              4 * batch_size, drop_remainder=True),
+          num_workers=4)
+
+    self.run_core_tests(lambda: build_dataset(200, 10), None, 20)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/shard_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/shard_dataset_serialization_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..99674b6910312c35f065fc3dd2cdd738fe544615
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/shard_dataset_serialization_test.py
@@ -0,0 +1,42 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the ShardDataset serialization."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.platform import test
+
+
+class ShardDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase,
+    parameterized.TestCase):
+
+  def _build_dataset(self, num_elements, num_shards, index):
+    return dataset_ops.Dataset.range(num_elements).shard(num_shards, index)
+
+  @parameterized.parameters((10, 5, 2, 3), (10, 10, 0, 9), (100, 2, 0, 1))
+  def testCore(self, elems, num_shards, index1, index2):
+    self.run_core_tests(lambda: self._build_dataset(elems, num_shards, index1),
+                        lambda: self._build_dataset(elems, num_shards, index2),
+                        elems // num_shards)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/take_while_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/take_while_dataset_serialization_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..47899eab68cbe41ad0dcb7f4daddabda0071d488
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/take_while_dataset_serialization_test.py
@@ -0,0 +1,44 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the TakeWhileDataset serialization."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
+from tensorflow.python.data.experimental.ops import take_while_ops
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.platform import test
+
+
+class TakeWhileDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase,
+    parameterized.TestCase):
+
+  def _build_dataset(self, num_elements, upper_bound):
+    return dataset_ops.Dataset.range(num_elements).apply(
+        take_while_ops.take_while(lambda x: x < upper_bound))
+
+  @parameterized.parameters((23, 10, 7), (10, 50, 0), (25, 30, 25))
+  def testCore(self, num_elem1, num_elem2, upper_bound):
+    self.run_core_tests(lambda: self._build_dataset(num_elem1, upper_bound),
+                        lambda: self._build_dataset(num_elem2, upper_bound),
+                        upper_bound)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py b/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py
index 9528f83291f9e4b752a266499e9ec6d7e5239f7d..92ae528b940c60dd54e4d647ee0322997120605c 100644
--- a/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py
@@ -23,11 +23,11 @@ from tensorflow.python.data.experimental.ops import shuffle_ops
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class ShuffleAndRepeatTest(test_base.DatasetTestBase):
 
   def _build_ds(self, seed, count=5, num_elements=20):
@@ -35,17 +35,15 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase):
         shuffle_ops.shuffle_and_repeat(buffer_size=5, count=count, seed=seed))
 
   def _gen_outputs(self, ds_fn, num_outputs, verify_exhausted=True):
-    get_next = dataset_ops.make_one_shot_iterator(ds_fn()).get_next()
+    get_next = self.getNext(ds_fn())
     outputs = []
-    with self.cached_session() as sess:
-      for _ in range(num_outputs):
-        outputs.append(self.evaluate(get_next))
-      if verify_exhausted:
-        with self.assertRaises(errors.OutOfRangeError):
-          self.evaluate(get_next)
+    for _ in range(num_outputs):
+      outputs.append(self.evaluate(get_next()))
+    if verify_exhausted:
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(get_next())
     return outputs
 
-  @test_util.run_deprecated_v1
   def testCorrectOutput(self):
     output = self._gen_outputs(lambda: self._build_ds(10), 100)
     self.assertSequenceEqual(
@@ -54,7 +52,6 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase):
     for i in range(5):
       self.assertSequenceEqual(sorted(output[i * 20:(i + 1) * 20]), range(20))
 
-  @test_util.run_deprecated_v1
   def testReshuffling(self):
     # Check that the output orders of different epochs are indeed different.
     output = self._gen_outputs(lambda: self._build_ds(10), 100)
@@ -63,20 +60,17 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase):
       epoch2 = output[(i + 1) * 20:(i + 2) * 20]
       self.assertNotEqual(epoch1, epoch2)
 
-  @test_util.run_deprecated_v1
   def testSameOrderForSameSeeds(self):
     output1 = self._gen_outputs(lambda: self._build_ds(10), 100)
     output2 = self._gen_outputs(lambda: self._build_ds(10), 100)
     self.assertEqual(output1, output2)
 
-  @test_util.run_deprecated_v1
   def testDifferentOrderForDifferentSeeds(self):
     output1 = self._gen_outputs(lambda: self._build_ds(10), 100)
     output2 = self._gen_outputs(lambda: self._build_ds(20), 100)
     self.assertNotEqual(output1, output2)
     self.assertEqual(sorted(output1), sorted(output2))
 
-  @test_util.run_deprecated_v1
   def testCountNone(self):
     output1 = self._gen_outputs(
         lambda: self._build_ds(10, count=None), 100, verify_exhausted=False)
@@ -85,7 +79,6 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase):
     self.assertNotEqual(output1, output2)
     self.assertEqual(sorted(output1), sorted(output2))
 
-  @test_util.run_deprecated_v1
   def testCountMinusOne(self):
     output1 = self._gen_outputs(
         lambda: self._build_ds(10, count=-1), 100, verify_exhausted=False)
@@ -110,12 +103,24 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase):
                         100)
 
   def testLargeBufferSize(self):
-    with ops.Graph().as_default() as g:
-      ds = dataset_ops.Dataset.range(20).apply(
-          shuffle_ops.shuffle_and_repeat(buffer_size=21))
-      get_next_op = ds.make_one_shot_iterator().get_next()
-      with self.session(graph=g) as sess:
-        self.evaluate(get_next_op)
+    ds = dataset_ops.Dataset.range(20).apply(
+        shuffle_ops.shuffle_and_repeat(buffer_size=21))
+    get_next = self.getNext(ds)
+    self.evaluate(get_next())
+
+  def testVeryLargeBufferSize(self):
+    num_epochs = 1000 * 1000
+    # Each element being shuffled and repeated has shape (100,). This will OOM
+    # or timeout if we actually load everything into the buffer.
+    ds = dataset_ops.Dataset.range(500).batch(100).apply(
+        shuffle_ops.shuffle_and_repeat(
+            buffer_size=5 * num_epochs, count=num_epochs))
+    # Verify two epochs worth of output.
+    output = self._gen_outputs(lambda: ds, 2 * 5, verify_exhausted=False)
+    for i in range(2):
+      sorted_epoch = sorted(
+          output[i * 5:(i + 1) * 5], key=lambda batch: batch[0])
+      self.assertAllEqual(sorted_epoch, np.arange(500).reshape([5, 100]))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/sleep_test.py b/tensorflow/python/data/experimental/kernel_tests/sleep_test.py
index 46b22f80b6d5f918624dcc98b894fbc37e0e46bc..4733c2a8330c377a6860c4207f6b50b7d83dc9ef 100644
--- a/tensorflow/python/data/experimental/kernel_tests/sleep_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/sleep_test.py
@@ -29,25 +29,22 @@ from tensorflow.python.platform import test
 _NUMPY_RANDOM_SEED = 42
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class SleepTest(test_base.DatasetTestBase):
 
-  @test_util.run_deprecated_v1
   def testSleep(self):
+    self.skipTest("b/123597912")
     sleep_microseconds = 100
     dataset = dataset_ops.Dataset.range(10).apply(
         sleep.sleep(sleep_microseconds))
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    next_element = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.evaluate(iterator.initializer)
-      start_time = time.time()
-      for i in range(10):
-        self.assertEqual(i, self.evaluate(next_element))
-      end_time = time.time()
-      self.assertGreater(end_time - start_time, (10 * sleep_microseconds) / 1e6)
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
+    next_element = self.getNext(dataset)
+    start_time = time.time()
+    for i in range(10):
+      self.assertEqual(i, self.evaluate(next_element()))
+    end_time = time.time()
+    self.assertGreater(end_time - start_time, (10 * sleep_microseconds) / 1e6)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test.py
index eb66927ee5c73c67325f3764d29d5c8461c05cbb..e97c80627cf9f16f4f6865bb47f81de8e19bac21 100644
--- a/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test.py
@@ -21,574 +21,454 @@ from __future__ import print_function
 from tensorflow.python.data.experimental.kernel_tests import sql_dataset_test_base
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
 
   # Test that SqlDataset can read from a database table.
   def testReadResultSet(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.string,
-                                                dtypes.string), 2)
-    with self.cached_session() as sess:
-      for _ in range(2):  # Run twice to verify statelessness of db operations.
-        sess.run(
-            init_op,
-            feed_dict={
-                self.query: "SELECT first_name, last_name, motto FROM students "
-                            "ORDER BY first_name DESC"
-            })
-        for _ in range(2):  # Dataset is repeated. See setUp.
-          self.assertEqual((b"John", b"Doe", b"Hi!"), self.evaluate(get_next))
-          self.assertEqual((b"Jane", b"Moe", b"Hi again!"),
-                           self.evaluate(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          self.evaluate(get_next)
+    for _ in range(2):  # Run twice to verify statelessness of db operations.
+      dataset = self._createSqlDataset(
+          query="SELECT first_name, last_name, motto FROM students "
+          "ORDER BY first_name DESC",
+          output_types=(dtypes.string, dtypes.string, dtypes.string),
+          num_repeats=2)
+      self.assertDatasetProduces(
+          dataset,
+          expected_output=[(b"John", b"Doe", b"Hi!"),
+                           (b"Jane", b"Moe", b"Hi again!")] * 2,
+          num_test_iterations=2)
 
   # Test that SqlDataset works on a join query.
   def testReadResultSetJoinQuery(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.string,
-                                                dtypes.string))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query:
-                  "SELECT students.first_name, state, motto FROM students "
-                  "INNER JOIN people "
-                  "ON students.first_name = people.first_name "
-                  "AND students.last_name = people.last_name"
-          })
-      self.assertEqual((b"John", b"California", b"Hi!"),
-                       self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT students.first_name, state, motto FROM students "
+            "INNER JOIN people "
+            "ON students.first_name = people.first_name "
+            "AND students.last_name = people.last_name",
+            output_types=(dtypes.string, dtypes.string, dtypes.string)))
+
+    self.assertEqual((b"John", b"California", b"Hi!"),
+                     self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   # Test that SqlDataset can read a database entry with a null-terminator
   # in the middle of the text and place the entry in a `string` tensor.
   def testReadResultSetNullTerminator(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.string,
-                                                dtypes.string))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query:
-                  "SELECT first_name, last_name, favorite_nonsense_word "
-                  "FROM students ORDER BY first_name DESC"
-          })
-      self.assertEqual((b"John", b"Doe", b"n\0nsense"), self.evaluate(get_next))
-      self.assertEqual((b"Jane", b"Moe", b"nonsense\0"),
-                       self.evaluate(get_next))
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, last_name, favorite_nonsense_word "
+            "FROM students ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.string, dtypes.string)))
+
+    self.assertEqual((b"John", b"Doe", b"n\0nsense"), self.evaluate(get_next()))
+    self.assertEqual((b"Jane", b"Moe", b"nonsense\0"),
+                     self.evaluate(get_next()))
     with self.assertRaises(errors.OutOfRangeError):
-      self.evaluate(get_next)
+      self.evaluate(get_next())
 
   # Test that SqlDataset works when used on two different queries.
   # Because the output types of the dataset must be determined at graph-creation
   # time, the two queries must have the same number and types of columns.
   def testReadResultSetReuseSqlDataset(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.string,
-                                                dtypes.string))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query: "SELECT first_name, last_name, motto FROM students "
-                          "ORDER BY first_name DESC"
-          })
-      self.assertEqual((b"John", b"Doe", b"Hi!"), self.evaluate(get_next))
-      self.assertEqual((b"Jane", b"Moe", b"Hi again!"), self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query: "SELECT first_name, last_name, state FROM people "
-                          "ORDER BY first_name DESC"
-          })
-      self.assertEqual((b"John", b"Doe", b"California"),
-                       self.evaluate(get_next))
-      self.assertEqual((b"Benjamin", b"Franklin", b"Pennsylvania"),
-                       self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, last_name, motto FROM students "
+            "ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.string, dtypes.string)))
+    self.assertEqual((b"John", b"Doe", b"Hi!"), self.evaluate(get_next()))
+    self.assertEqual((b"Jane", b"Moe", b"Hi again!"), self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, last_name, state FROM people "
+            "ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.string, dtypes.string)))
+    self.assertEqual((b"John", b"Doe", b"California"),
+                     self.evaluate(get_next()))
+    self.assertEqual((b"Benjamin", b"Franklin", b"Pennsylvania"),
+                     self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   # Test that an `OutOfRangeError` is raised on the first call to
   # `get_next_str_only` if result set is empty.
   def testReadEmptyResultSet(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.string,
-                                                dtypes.string))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query: "SELECT first_name, last_name, motto FROM students "
-                          "WHERE first_name = 'Nonexistent'"
-          })
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, last_name, motto FROM students "
+            "WHERE first_name = 'Nonexistent'",
+            output_types=(dtypes.string, dtypes.string, dtypes.string)))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   # Test that an error is raised when `driver_name` is invalid.
   def testReadResultSetWithInvalidDriverName(self):
-    init_op = self._createSqlDataset((dtypes.string, dtypes.string,
-                                      dtypes.string))[0]
-    with self.cached_session() as sess:
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(
-            init_op,
-            feed_dict={
-                self.driver_name: "sqlfake",
-                self.query: "SELECT first_name, last_name, motto FROM students "
-                            "ORDER BY first_name DESC"
-            })
+    with self.assertRaises(errors.InvalidArgumentError):
+      dataset = self._createSqlDataset(
+          driver_name="sqlfake",
+          query="SELECT first_name, last_name, motto FROM students "
+          "ORDER BY first_name DESC",
+          output_types=(dtypes.string, dtypes.string, dtypes.string))
+      self.assertDatasetProduces(dataset, expected_output=[])
 
   # Test that an error is raised when a column name in `query` is nonexistent
   def testReadResultSetWithInvalidColumnName(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.string,
-                                                dtypes.string))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query:
-                  "SELECT first_name, last_name, fake_column FROM students "
-                  "ORDER BY first_name DESC"
-          })
-      with self.assertRaises(errors.UnknownError):
-        self.evaluate(get_next)
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, last_name, fake_column FROM students "
+            "ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.string, dtypes.string)))
+    with self.assertRaises(errors.UnknownError):
+      self.evaluate(get_next())
 
   # Test that an error is raised when there is a syntax error in `query`.
   def testReadResultSetOfQueryWithSyntaxError(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.string,
-                                                dtypes.string))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query:
-                  "SELEmispellECT first_name, last_name, motto FROM students "
-                  "ORDER BY first_name DESC"
-          })
-      with self.assertRaises(errors.UnknownError):
-        self.evaluate(get_next)
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELEmispellECT first_name, last_name, motto FROM students "
+            "ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.string, dtypes.string)))
+    with self.assertRaises(errors.UnknownError):
+      self.evaluate(get_next())
 
   # Test that an error is raised when the number of columns in `query`
-  # does not match the length of `output_types`.
+  # does not match the length of `, output_types`.
   def testReadResultSetWithMismatchBetweenColumnsAndOutputTypes(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.string,
-                                                dtypes.string))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query: "SELECT first_name, last_name FROM students "
-                          "ORDER BY first_name DESC"
-          })
-      with self.assertRaises(errors.InvalidArgumentError):
-        self.evaluate(get_next)
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, last_name FROM students "
+            "ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.string, dtypes.string)))
+    with self.assertRaises(errors.InvalidArgumentError):
+      self.evaluate(get_next())
 
   # Test that no results are returned when `query` is an insert query rather
   # than a select query. In particular, the error refers to the number of
   # output types passed to the op not matching the number of columns in the
   # result set of the query (namely, 0 for an insert statement.)
   def testReadResultSetOfInsertQuery(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.string,
-                                                dtypes.string))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query:
-                  "INSERT INTO students (first_name, last_name, motto) "
-                  "VALUES ('Foo', 'Bar', 'Baz'), ('Fizz', 'Buzz', 'Fizzbuzz')"
-          })
-      with self.assertRaises(errors.InvalidArgumentError):
-        self.evaluate(get_next)
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="INSERT INTO students (first_name, last_name, motto) "
+            "VALUES ('Foo', 'Bar', 'Baz'), ('Fizz', 'Buzz', 'Fizzbuzz')",
+            output_types=(dtypes.string, dtypes.string, dtypes.string)))
+    with self.assertRaises(errors.InvalidArgumentError):
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read an integer from a SQLite database table and
   # place it in an `int8` tensor.
   def testReadResultSetInt8(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.int8))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query: "SELECT first_name, desk_number FROM students "
-                          "ORDER BY first_name DESC"
-          })
-      self.assertEqual((b"John", 9), self.evaluate(get_next))
-      self.assertEqual((b"Jane", 127), self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, desk_number FROM students "
+            "ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.int8)))
+    self.assertEqual((b"John", 9), self.evaluate(get_next()))
+    self.assertEqual((b"Jane", 127), self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read a negative or 0-valued integer from a
   # SQLite database table and place it in an `int8` tensor.
   def testReadResultSetInt8NegativeAndZero(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.int8,
-                                                dtypes.int8))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query: "SELECT first_name, income, favorite_negative_number "
-                          "FROM students "
-                          "WHERE first_name = 'John' ORDER BY first_name DESC"
-          })
-      self.assertEqual((b"John", 0, -2), self.evaluate(get_next))
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, income, favorite_negative_number "
+            "FROM students "
+            "WHERE first_name = 'John' ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.int8, dtypes.int8)))
+    self.assertEqual((b"John", 0, -2), self.evaluate(get_next()))
     with self.assertRaises(errors.OutOfRangeError):
-      self.evaluate(get_next)
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read a large (positive or negative) integer from
   # a SQLite database table and place it in an `int8` tensor.
   def testReadResultSetInt8MaxValues(self):
-    init_op, get_next = self._createSqlDataset((dtypes.int8, dtypes.int8))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query:
-                  "SELECT desk_number, favorite_negative_number FROM students "
-                  "ORDER BY first_name DESC"
-          })
-      self.assertEqual((9, -2), self.evaluate(get_next))
-      # Max and min values of int8
-      self.assertEqual((127, -128), self.evaluate(get_next))
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT desk_number, favorite_negative_number FROM students "
+            "ORDER BY first_name DESC",
+            output_types=(dtypes.int8, dtypes.int8)))
+    self.assertEqual((9, -2), self.evaluate(get_next()))
+    # Max and min values of int8
+    self.assertEqual((127, -128), self.evaluate(get_next()))
     with self.assertRaises(errors.OutOfRangeError):
-      self.evaluate(get_next)
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read an integer from a SQLite database table and
   # place it in an `int16` tensor.
   def testReadResultSetInt16(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.int16))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query: "SELECT first_name, desk_number FROM students "
-                          "ORDER BY first_name DESC"
-          })
-      self.assertEqual((b"John", 9), self.evaluate(get_next))
-      self.assertEqual((b"Jane", 127), self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, desk_number FROM students "
+            "ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.int16)))
+    self.assertEqual((b"John", 9), self.evaluate(get_next()))
+    self.assertEqual((b"Jane", 127), self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read a negative or 0-valued integer from a
   # SQLite database table and place it in an `int16` tensor.
   def testReadResultSetInt16NegativeAndZero(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.int16,
-                                                dtypes.int16))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query: "SELECT first_name, income, favorite_negative_number "
-                          "FROM students "
-                          "WHERE first_name = 'John' ORDER BY first_name DESC"
-          })
-      self.assertEqual((b"John", 0, -2), self.evaluate(get_next))
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, income, favorite_negative_number "
+            "FROM students "
+            "WHERE first_name = 'John' ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.int16, dtypes.int16)))
+    self.assertEqual((b"John", 0, -2), self.evaluate(get_next()))
     with self.assertRaises(errors.OutOfRangeError):
-      self.evaluate(get_next)
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read a large (positive or negative) integer from
   # a SQLite database table and place it in an `int16` tensor.
   def testReadResultSetInt16MaxValues(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.int16))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query: "SELECT first_name, favorite_medium_sized_number "
-                          "FROM students ORDER BY first_name DESC"
-          })
-      # Max value of int16
-      self.assertEqual((b"John", 32767), self.evaluate(get_next))
-      # Min value of int16
-      self.assertEqual((b"Jane", -32768), self.evaluate(get_next))
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, favorite_medium_sized_number "
+            "FROM students ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.int16)))
+    # Max value of int16
+    self.assertEqual((b"John", 32767), self.evaluate(get_next()))
+    # Min value of int16
+    self.assertEqual((b"Jane", -32768), self.evaluate(get_next()))
     with self.assertRaises(errors.OutOfRangeError):
-      self.evaluate(get_next)
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read an integer from a SQLite database table and
   # place it in an `int32` tensor.
   def testReadResultSetInt32(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.int32))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query: "SELECT first_name, desk_number FROM students "
-                          "ORDER BY first_name DESC"
-          })
-      self.assertEqual((b"John", 9), self.evaluate(get_next))
-      self.assertEqual((b"Jane", 127), self.evaluate(get_next))
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, desk_number FROM students "
+            "ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.int32)))
+    self.assertEqual((b"John", 9), self.evaluate(get_next()))
+    self.assertEqual((b"Jane", 127), self.evaluate(get_next()))
 
   # Test that `SqlDataset` can read a negative or 0-valued integer from a
   # SQLite database table and place it in an `int32` tensor.
   def testReadResultSetInt32NegativeAndZero(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.int32))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query: "SELECT first_name, income FROM students "
-                          "ORDER BY first_name DESC"
-          })
-      self.assertEqual((b"John", 0), self.evaluate(get_next))
-      self.assertEqual((b"Jane", -20000), self.evaluate(get_next))
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, income FROM students "
+            "ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.int32)))
+    self.assertEqual((b"John", 0), self.evaluate(get_next()))
+    self.assertEqual((b"Jane", -20000), self.evaluate(get_next()))
     with self.assertRaises(errors.OutOfRangeError):
-      self.evaluate(get_next)
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read a large (positive or negative) integer from
   # a SQLite database table and place it in an `int32` tensor.
   def testReadResultSetInt32MaxValues(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.int32))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query: "SELECT first_name, favorite_number FROM students "
-                          "ORDER BY first_name DESC"
-          })
-      # Max value of int32
-      self.assertEqual((b"John", 2147483647), self.evaluate(get_next))
-      # Min value of int32
-      self.assertEqual((b"Jane", -2147483648), self.evaluate(get_next))
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, favorite_number FROM students "
+            "ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.int32)))
+    # Max value of int32
+    self.assertEqual((b"John", 2147483647), self.evaluate(get_next()))
+    # Min value of int32
+    self.assertEqual((b"Jane", -2147483648), self.evaluate(get_next()))
     with self.assertRaises(errors.OutOfRangeError):
-      self.evaluate(get_next)
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read a numeric `varchar` from a SQLite database
   # table and place it in an `int32` tensor.
   def testReadResultSetInt32VarCharColumnAsInt(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.int32))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query: "SELECT first_name, school_id FROM students "
-                          "ORDER BY first_name DESC"
-          })
-      self.assertEqual((b"John", 123), self.evaluate(get_next))
-      self.assertEqual((b"Jane", 1000), self.evaluate(get_next))
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, school_id FROM students "
+            "ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.int32)))
+    self.assertEqual((b"John", 123), self.evaluate(get_next()))
+    self.assertEqual((b"Jane", 1000), self.evaluate(get_next()))
     with self.assertRaises(errors.OutOfRangeError):
-      self.evaluate(get_next)
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read an integer from a SQLite database table
   # and place it in an `int64` tensor.
   def testReadResultSetInt64(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.int64))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query: "SELECT first_name, desk_number FROM students "
-                          "ORDER BY first_name DESC"
-          })
-      self.assertEqual((b"John", 9), self.evaluate(get_next))
-      self.assertEqual((b"Jane", 127), self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, desk_number FROM students "
+            "ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.int64)))
+    self.assertEqual((b"John", 9), self.evaluate(get_next()))
+    self.assertEqual((b"Jane", 127), self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read a negative or 0-valued integer from a
   # SQLite database table and place it in an `int64` tensor.
   def testReadResultSetInt64NegativeAndZero(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.int64))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query: "SELECT first_name, income FROM students "
-                          "ORDER BY first_name DESC"
-          })
-      self.assertEqual((b"John", 0), self.evaluate(get_next))
-      self.assertEqual((b"Jane", -20000), self.evaluate(get_next))
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, income FROM students "
+            "ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.int64)))
+    self.assertEqual((b"John", 0), self.evaluate(get_next()))
+    self.assertEqual((b"Jane", -20000), self.evaluate(get_next()))
     with self.assertRaises(errors.OutOfRangeError):
-      self.evaluate(get_next)
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read a large (positive or negative) integer from
   # a SQLite database table and place it in an `int64` tensor.
   def testReadResultSetInt64MaxValues(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.int64))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query:
-                  "SELECT first_name, favorite_big_number FROM students "
-                  "ORDER BY first_name DESC"
-          })
-      # Max value of int64
-      self.assertEqual((b"John", 9223372036854775807), self.evaluate(get_next))
-      # Min value of int64
-      self.assertEqual((b"Jane", -9223372036854775808), self.evaluate(get_next))
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, favorite_big_number FROM students "
+            "ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.int64)))
+    # Max value of int64
+    self.assertEqual((b"John", 9223372036854775807), self.evaluate(get_next()))
+    # Min value of int64
+    self.assertEqual((b"Jane", -9223372036854775808), self.evaluate(get_next()))
     with self.assertRaises(errors.OutOfRangeError):
-      self.evaluate(get_next)
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read an integer from a SQLite database table and
   # place it in a `uint8` tensor.
   def testReadResultSetUInt8(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.uint8))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query: "SELECT first_name, desk_number FROM students "
-                          "ORDER BY first_name DESC"
-          })
-      self.assertEqual((b"John", 9), self.evaluate(get_next))
-      self.assertEqual((b"Jane", 127), self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, desk_number FROM students "
+            "ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.uint8)))
+    self.assertEqual((b"John", 9), self.evaluate(get_next()))
+    self.assertEqual((b"Jane", 127), self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read the minimum and maximum uint8 values from a
   # SQLite database table and place them in `uint8` tensors.
   def testReadResultSetUInt8MinAndMaxValues(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.uint8))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query: "SELECT first_name, brownie_points FROM students "
-                          "ORDER BY first_name DESC"
-          })
-      # Min value of uint8
-      self.assertEqual((b"John", 0), self.evaluate(get_next))
-      # Max value of uint8
-      self.assertEqual((b"Jane", 255), self.evaluate(get_next))
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, brownie_points FROM students "
+            "ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.uint8)))
+    # Min value of uint8
+    self.assertEqual((b"John", 0), self.evaluate(get_next()))
+    # Max value of uint8
+    self.assertEqual((b"Jane", 255), self.evaluate(get_next()))
     with self.assertRaises(errors.OutOfRangeError):
-      self.evaluate(get_next)
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read an integer from a SQLite database table
   # and place it in a `uint16` tensor.
   def testReadResultSetUInt16(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.uint16))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query: "SELECT first_name, desk_number FROM students "
-                          "ORDER BY first_name DESC"
-          })
-      self.assertEqual((b"John", 9), self.evaluate(get_next))
-      self.assertEqual((b"Jane", 127), self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, desk_number FROM students "
+            "ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.uint16)))
+    self.assertEqual((b"John", 9), self.evaluate(get_next()))
+    self.assertEqual((b"Jane", 127), self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read the minimum and maximum uint16 values from a
   # SQLite database table and place them in `uint16` tensors.
   def testReadResultSetUInt16MinAndMaxValues(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.uint16))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query: "SELECT first_name, account_balance FROM students "
-                          "ORDER BY first_name DESC"
-          })
-      # Min value of uint16
-      self.assertEqual((b"John", 0), self.evaluate(get_next))
-      # Max value of uint16
-      self.assertEqual((b"Jane", 65535), self.evaluate(get_next))
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, account_balance FROM students "
+            "ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.uint16)))
+    # Min value of uint16
+    self.assertEqual((b"John", 0), self.evaluate(get_next()))
+    # Max value of uint16
+    self.assertEqual((b"Jane", 65535), self.evaluate(get_next()))
     with self.assertRaises(errors.OutOfRangeError):
-      self.evaluate(get_next)
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read a 0-valued and 1-valued integer from a
   # SQLite database table and place them as `True` and `False` respectively
   # in `bool` tensors.
   def testReadResultSetBool(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.bool))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query:
-                  "SELECT first_name, registration_complete FROM students "
-                  "ORDER BY first_name DESC"
-          })
-      self.assertEqual((b"John", True), self.evaluate(get_next))
-      self.assertEqual((b"Jane", False), self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, registration_complete FROM students "
+            "ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.bool)))
+    self.assertEqual((b"John", True), self.evaluate(get_next()))
+    self.assertEqual((b"Jane", False), self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read an integer that is not 0-valued or 1-valued
   # from a SQLite database table and place it as `True` in a `bool` tensor.
   def testReadResultSetBoolNotZeroOrOne(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.bool))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query: "SELECT first_name, favorite_medium_sized_number "
-                          "FROM students ORDER BY first_name DESC"
-          })
-      self.assertEqual((b"John", True), self.evaluate(get_next))
-      self.assertEqual((b"Jane", True), self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, favorite_medium_sized_number "
+            "FROM students ORDER BY first_name DESC",
+            output_types=(dtypes.string, dtypes.bool)))
+    self.assertEqual((b"John", True), self.evaluate(get_next()))
+    self.assertEqual((b"Jane", True), self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read a float from a SQLite database table
   # and place it in a `float64` tensor.
   def testReadResultSetFloat64(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.string,
-                                                dtypes.float64))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query:
-                  "SELECT first_name, last_name, victories FROM townspeople "
-                  "ORDER BY first_name"
-          })
-      self.assertEqual((b"George", b"Washington", 20.0),
-                       self.evaluate(get_next))
-      self.assertEqual((b"John", b"Adams", -19.95), self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, last_name, victories FROM townspeople "
+            "ORDER BY first_name",
+            output_types=(dtypes.string, dtypes.string, dtypes.float64)))
+    self.assertEqual((b"George", b"Washington", 20.0),
+                     self.evaluate(get_next()))
+    self.assertEqual((b"John", b"Adams", -19.95), self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read a float from a SQLite database table beyond
   # the precision of 64-bit IEEE, without throwing an error. Test that
   # `SqlDataset` identifies such a value as equal to itself.
   def testReadResultSetFloat64OverlyPrecise(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.string,
-                                                dtypes.float64))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query:
-                  "SELECT first_name, last_name, accolades FROM townspeople "
-                  "ORDER BY first_name"
-          })
-      self.assertEqual(
-          (b"George", b"Washington",
-           1331241.321342132321324589798264627463827647382647382643874),
-          self.evaluate(get_next))
-      self.assertEqual(
-          (b"John", b"Adams",
-           1331241321342132321324589798264627463827647382647382643874.0),
-          self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, last_name, accolades FROM townspeople "
+            "ORDER BY first_name",
+            output_types=(dtypes.string, dtypes.string, dtypes.float64)))
+    self.assertEqual(
+        (b"George", b"Washington",
+         1331241.321342132321324589798264627463827647382647382643874),
+        self.evaluate(get_next()))
+    self.assertEqual(
+        (b"John", b"Adams",
+         1331241321342132321324589798264627463827647382647382643874.0),
+        self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   # Test that `SqlDataset` can read a float from a SQLite database table,
   # representing the largest integer representable as a 64-bit IEEE float
   # such that the previous integer is also representable as a 64-bit IEEE float.
   # Test that `SqlDataset` can distinguish these two numbers.
   def testReadResultSetFloat64LargestConsecutiveWholeNumbersNotEqual(self):
-    init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.string,
-                                                dtypes.float64))
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              self.query:
-                  "SELECT first_name, last_name, triumphs FROM townspeople "
-                  "ORDER BY first_name"
-          })
-      self.assertNotEqual((b"George", b"Washington", 9007199254740992.0),
-                          self.evaluate(get_next))
-      self.assertNotEqual((b"John", b"Adams", 9007199254740991.0),
-                          self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next)
+    get_next = self.getNext(
+        self._createSqlDataset(
+            query="SELECT first_name, last_name, triumphs FROM townspeople "
+            "ORDER BY first_name",
+            output_types=(dtypes.string, dtypes.string, dtypes.float64)))
+    self.assertNotEqual((b"George", b"Washington", 9007199254740992.0),
+                        self.evaluate(get_next()))
+    self.assertNotEqual((b"John", b"Adams", 9007199254740991.0),
+                        self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test_base.py b/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test_base.py
index 809e09c80420979b84dc5e4706398f793466a059..90451b865f842e9f34b332ed6df45f1e4e85b9ff 100644
--- a/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test_base.py
+++ b/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test_base.py
@@ -24,28 +24,23 @@ import sqlite3
 
 from tensorflow.python.data.experimental.ops import readers
 from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import dtypes
-from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
 class SqlDatasetTestBase(test_base.DatasetTestBase):
   """Base class for setting up and testing SqlDataset."""
 
-  def _createSqlDataset(self, output_types, num_repeats=1):
-    dataset = readers.SqlDataset(self.driver_name, self.data_source_name,
-                                 self.query, output_types).repeat(num_repeats)
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-    return init_op, get_next
+  def _createSqlDataset(self,
+                        query,
+                        output_types,
+                        driver_name="sqlite",
+                        num_repeats=1):
+    dataset = readers.SqlDataset(driver_name, self.data_source_name, query,
+                                 output_types).repeat(num_repeats)
+    return dataset
 
   def setUp(self):
     self.data_source_name = os.path.join(test.get_temp_dir(), "tftest.sqlite")
-    self.driver_name = array_ops.placeholder_with_default(
-        array_ops.constant("sqlite", dtypes.string), shape=[])
-    self.query = array_ops.placeholder(dtypes.string, shape=[])
 
     conn = sqlite3.connect(self.data_source_name)
     c = conn.cursor()
diff --git a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py
index f19b08a2dde821124b6f5065eed4c825afa9f107..c53ac82c6e23a5cc26ca2fc8d22ce7e6970638a4 100644
--- a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py
@@ -26,7 +26,6 @@ from tensorflow.python.data.experimental.ops import batching
 from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.experimental.ops import stats_aggregator
 from tensorflow.python.data.experimental.ops import stats_ops
-from tensorflow.python.data.experimental.ops import stats_options
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
@@ -46,7 +45,6 @@ def function_set_stats_aggregator(dataset,
 
 def function_apply_options(dataset, aggregator, prefix="", counter_prefix=""):
   options = dataset_ops.Options()
-  options.experimental_stats = stats_options.StatsOptions()
   options.experimental_stats.aggregator = aggregator
   options.experimental_stats.prefix = prefix
   options.experimental_stats.counter_prefix = counter_prefix
@@ -54,140 +52,116 @@ def function_apply_options(dataset, aggregator, prefix="", counter_prefix=""):
   return dataset.with_options(options)
 
 
+@test_util.run_all_in_graph_and_eager_modes
 @parameterized.named_parameters(
     ("SetStatsAggregator", function_set_stats_aggregator),
     ("StatsOptions", function_apply_options),
 )
 class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
 
-  @test_util.run_deprecated_v1
-  def testBytesProduced(self, dataset_transformation):
+  @test_util.run_v1_only("b/123901126")
+  def testSkipEagerBytesProduced(self, dataset_transformation):
     aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).map(
         lambda x: array_ops.tile([x], ops.convert_to_tensor([x]))).apply(
             stats_ops.bytes_produced_stats("bytes_produced"))
     dataset = dataset_transformation(dataset, aggregator)
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    next_element = iterator.get_next()
-    summary_t = aggregator.get_summary()
-
-    with self.cached_session() as sess:
-      self.evaluate(iterator.initializer)
-      expected_sum = 0.0
-      for i in range(100):
-        self.assertAllEqual(
-            np.array([i] * i, dtype=np.int64), self.evaluate(next_element))
-        summary_str = self.evaluate(summary_t)
-        self._assertSummaryHasCount(summary_str, "bytes_produced", float(i + 1))
-        expected_sum += i * 8.0
-        self._assertSummaryHasSum(summary_str, "bytes_produced", expected_sum)
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
-      summary_str = self.evaluate(summary_t)
-      self._assertSummaryHasCount(summary_str, "bytes_produced", 100.0)
+    next_element = self.getNext(dataset, requires_initialization=True)
+
+    expected_sum = 0.0
+    for i in range(100):
+      self.assertAllEqual(
+          np.array([i] * i, dtype=np.int64), self.evaluate(next_element()))
+      summary_str = self.evaluate(aggregator.get_summary())
+      self._assertSummaryHasCount(summary_str, "bytes_produced", float(i + 1))
+      expected_sum += i * 8.0
       self._assertSummaryHasSum(summary_str, "bytes_produced", expected_sum)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
+    summary_str = self.evaluate(aggregator.get_summary())
+    self._assertSummaryHasCount(summary_str, "bytes_produced", 100.0)
+    self._assertSummaryHasSum(summary_str, "bytes_produced", expected_sum)
 
-  @test_util.run_deprecated_v1
   def testLatencyStats(self, dataset_transformation):
     aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
         stats_ops.latency_stats("record_latency"))
     dataset = dataset_transformation(dataset, aggregator)
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    next_element = iterator.get_next()
-    summary_t = aggregator.get_summary()
+    next_element = self.getNext(dataset, requires_initialization=True)
 
-    with self.cached_session() as sess:
-      self.evaluate(iterator.initializer)
-      for i in range(100):
-        self.assertEqual(i, self.evaluate(next_element))
-        self._assertSummaryHasCount(
-            self.evaluate(summary_t), "record_latency", float(i + 1))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
+    for i in range(100):
+      self.assertEqual(i, self.evaluate(next_element()))
       self._assertSummaryHasCount(
-          self.evaluate(summary_t), "record_latency", 100.0)
+          self.evaluate(aggregator.get_summary()), "record_latency",
+          float(i + 1))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
+    self._assertSummaryHasCount(
+        self.evaluate(aggregator.get_summary()), "record_latency", 100.0)
 
-  @test_util.run_deprecated_v1
   def testPrefetchBufferUtilization(self, dataset_transformation):
     aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).map(
         lambda x: array_ops.tile([x], ops.convert_to_tensor([x]))).prefetch(-1)
     dataset = dataset_transformation(dataset, aggregator)
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    next_element = iterator.get_next()
-    summary_t = aggregator.get_summary()
-
-    with self.cached_session() as sess:
-      self.evaluate(iterator.initializer)
-      for i in range(100):
-        self.assertAllEqual(
-            np.array([i] * i, dtype=np.int64), self.evaluate(next_element))
-        summary_str = self.evaluate(summary_t)
-        self._assertSummaryHasCount(summary_str, "Prefetch::buffer_utilization",
-                                    float(i + 1))
-        self._assertSummaryContains(summary_str, "Prefetch::buffer_capacity")
-        self._assertSummaryContains(summary_str, "Prefetch::buffer_size")
-        self._assertSummaryHasRange(summary_str, "Prefetch::buffer_utilization",
-                                    0, 1)
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
-      summary_str = self.evaluate(summary_t)
+    next_element = self.getNext(dataset, requires_initialization=True)
+    for i in range(100):
+      self.assertAllEqual(
+          np.array([i] * i, dtype=np.int64), self.evaluate(next_element()))
+      summary_str = self.evaluate(aggregator.get_summary())
       self._assertSummaryHasCount(summary_str, "Prefetch::buffer_utilization",
-                                  100)
+                                  float(i + 1))
+      self._assertSummaryContains(summary_str, "Prefetch::buffer_capacity")
+      self._assertSummaryContains(summary_str, "Prefetch::buffer_size")
+      self._assertSummaryHasRange(summary_str, "Prefetch::buffer_utilization",
+                                  0, 1)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
+    summary_str = self.evaluate(aggregator.get_summary())
+    self._assertSummaryHasCount(summary_str, "Prefetch::buffer_utilization",
+                                100)
 
-  @test_util.run_deprecated_v1
   def testPrefetchBufferScalars(self, dataset_transformation):
-    def map_fn(x):
-      return array_ops.tile([x], ops.convert_to_tensor([x]))
     aggregator = stats_aggregator.StatsAggregator()
-    dataset = dataset_ops.Dataset.range(10).map(map_fn).prefetch(1)
+    dataset = dataset_ops.Dataset.range(10).map(
+        lambda x: array_ops.tile([x], ops.convert_to_tensor([x]))).prefetch(1)
     dataset = dataset_transformation(dataset, aggregator)
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    next_element = iterator.get_next()
-    summary_t = aggregator.get_summary()
-
-    with self.cached_session() as sess:
-      self.evaluate(iterator.initializer)
-      for i in range(10):
-        self.assertAllEqual(
-            np.array([i] * i, dtype=np.int64), self.evaluate(next_element))
-        summary_str = self.evaluate(summary_t)
-        self._assertSummaryHasScalarValue(summary_str,
-                                          "Prefetch::buffer_capacity", 1)
-        self._assertSummaryHasScalarValue(summary_str, "Prefetch::buffer_size",
-                                          1)
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
+    next_element = self.getNext(dataset, requires_initialization=True)
+
+    for i in range(10):
+      self.assertAllEqual(
+          np.array([i] * i, dtype=np.int64), self.evaluate(next_element()))
+      summary_str = self.evaluate(aggregator.get_summary())
+      self._assertSummaryHasScalarValue(summary_str,
+                                        "Prefetch::buffer_capacity", 1)
+      self._assertSummaryHasScalarValue(summary_str, "Prefetch::buffer_size", 1)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
-  @test_util.run_deprecated_v1
   def testFilteredElementsStats(self, dataset_transformation):
     aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(101).filter(
         lambda x: math_ops.equal(math_ops.mod(x, 3), 0))
     dataset = dataset_transformation(dataset, aggregator)
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    next_element = iterator.get_next()
-    summary_t = aggregator.get_summary()
-
-    with self.test_session() as sess:
-      self.evaluate(iterator.initializer)
-      for i in range(34):
-        self.assertEqual(i * 3, self.evaluate(next_element))
-        if i is not 0:
-          self._assertSummaryHasScalarValue(
-              self.evaluate(summary_t), "Filter::dropped_elements",
-              float(i * 2))
-        self._assertSummaryHasScalarValue(
-            self.evaluate(summary_t), "Filter::filtered_elements", float(i + 1))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
-      self._assertSummaryHasScalarValue(
-          self.evaluate(summary_t), "Filter::dropped_elements", 67.0)
+    next_element = self.getNext(dataset, requires_initialization=True)
+
+    for i in range(34):
+      self.assertEqual(i * 3, self.evaluate(next_element()))
+      summary_str = self.evaluate(aggregator.get_summary())
+      if i is not 0:
+        self._assertSummaryHasScalarValue(summary_str,
+                                          "Filter::dropped_elements",
+                                          float(i * 2))
       self._assertSummaryHasScalarValue(
-          self.evaluate(summary_t), "Filter::filtered_elements", 34.0)
+          summary_str, "Filter::filtered_elements", float(i + 1))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
+    summary_str = self.evaluate(aggregator.get_summary())
+    self._assertSummaryHasScalarValue(summary_str, "Filter::dropped_elements",
+                                      67.0)
+    self._assertSummaryHasScalarValue(summary_str, "Filter::filtered_elements",
+                                      34.0)
 
-  @test_util.run_deprecated_v1
   def testMapBufferUtilization(self, dataset_transformation):
 
     def dataset_fn():
@@ -202,7 +176,6 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
         dataset_transformation,
         function_processing_time=True)
 
-  @test_util.run_deprecated_v1
   def testMapAutoTuneBufferUtilization(self, dataset_transformation):
 
     def dataset_fn():
@@ -220,14 +193,16 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
         dataset_transformation,
         function_processing_time=True)
 
-  @test_util.run_deprecated_v1
   def testInterleaveAutoTuneBufferUtilization(self, dataset_transformation):
 
     def dataset_fn():
-      dataset = dataset_ops.Dataset.range(10).map(
-          lambda x: array_ops.tile([x], ops.convert_to_tensor([x])))
+
+      def interleave_fn(_):
+        return dataset_ops.Dataset.range(
+            10).map(lambda x: array_ops.tile([x], ops.convert_to_tensor([x])))
+
       dataset = dataset_ops.Dataset.range(1).interleave(
-          lambda _: dataset,
+          interleave_fn,
           cycle_length=1,
           num_parallel_calls=optimization.AUTOTUNE)
       options = dataset_ops.Options()
@@ -237,7 +212,6 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
     self._testParallelCallsStats(dataset_fn, "ParallelInterleaveV2", 10,
                                  dataset_transformation)
 
-  @test_util.run_deprecated_v1
   def testMapAndBatchAutoTuneBufferUtilization(self, dataset_transformation):
 
     def dataset_fn():
@@ -259,114 +233,98 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
         check_elements=False,
         function_processing_time=True)
 
-  @test_util.run_deprecated_v1
   def testReinitialize(self, dataset_transformation):
     aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
         stats_ops.latency_stats("record_latency"))
     dataset = dataset_transformation(dataset, aggregator)
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    next_element = iterator.get_next()
-    summary_t = aggregator.get_summary()
-
-    with self.cached_session() as sess:
-      for j in range(5):
-        self.evaluate(iterator.initializer)
-        for i in range(100):
-          self.assertEqual(i, self.evaluate(next_element))
-          self._assertSummaryHasCount(
-              self.evaluate(summary_t), "record_latency",
-              float((j * 100) + i + 1))
-        with self.assertRaises(errors.OutOfRangeError):
-          self.evaluate(next_element)
+
+    for j in range(5):
+      next_element = self.getNext(dataset, requires_initialization=True)
+      for i in range(100):
+        self.assertEqual(i, self.evaluate(next_element()))
         self._assertSummaryHasCount(
-            self.evaluate(summary_t), "record_latency", (j + 1) * 100.0)
+            self.evaluate(aggregator.get_summary()), "record_latency",
+            float((j * 100) + i + 1))
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(next_element())
+      self._assertSummaryHasCount(
+          self.evaluate(aggregator.get_summary()), "record_latency",
+          (j + 1) * 100.0)
 
-  @test_util.run_deprecated_v1
   def testNoAggregatorRegistered(self, dataset_transformation):
     dataset = dataset_ops.Dataset.range(100).apply(
         stats_ops.latency_stats("record_latency"))
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    next_element = iterator.get_next()
 
-    with self.cached_session() as sess:
-      self.evaluate(iterator.initializer)
-      for i in range(100):
-        self.assertEqual(i, self.evaluate(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
+    next_element = self.getNext(dataset, requires_initialization=True)
+
+    for i in range(100):
+      self.assertEqual(i, self.evaluate(next_element()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
 
-  @test_util.run_deprecated_v1
   def testMultipleTags(self, dataset_transformation):
     aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
         stats_ops.latency_stats("record_latency")).apply(
             stats_ops.latency_stats("record_latency_2"))
     dataset = dataset_transformation(dataset, aggregator)
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    next_element = iterator.get_next()
-    summary_t = aggregator.get_summary()
 
-    with self.cached_session() as sess:
-      self.evaluate(iterator.initializer)
-      for i in range(100):
-        self.assertEqual(i, self.evaluate(next_element))
-        self._assertSummaryHasCount(
-            self.evaluate(summary_t), "record_latency", float(i + 1))
-        self._assertSummaryHasCount(
-            self.evaluate(summary_t), "record_latency_2", float(i + 1))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
+    next_element = self.getNext(dataset, requires_initialization=True)
+
+    for i in range(100):
+      self.assertEqual(i, self.evaluate(next_element()))
       self._assertSummaryHasCount(
-          self.evaluate(summary_t), "record_latency", 100.0)
+          self.evaluate(aggregator.get_summary()), "record_latency",
+          float(i + 1))
       self._assertSummaryHasCount(
-          self.evaluate(summary_t), "record_latency_2", 100.0)
+          self.evaluate(aggregator.get_summary()), "record_latency_2",
+          float(i + 1))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
+    self._assertSummaryHasCount(
+        self.evaluate(aggregator.get_summary()), "record_latency", 100.0)
+    self._assertSummaryHasCount(
+        self.evaluate(aggregator.get_summary()), "record_latency_2", 100.0)
 
-  @test_util.run_deprecated_v1
   def testRepeatedTags(self, dataset_transformation):
     aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
         stats_ops.latency_stats("record_latency")).apply(
             stats_ops.latency_stats("record_latency"))
     dataset = dataset_transformation(dataset, aggregator)
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    next_element = iterator.get_next()
-    summary_t = aggregator.get_summary()
+    next_element = self.getNext(dataset, requires_initialization=True)
 
-    with self.cached_session() as sess:
-      self.evaluate(iterator.initializer)
-      for i in range(100):
-        self.assertEqual(i, self.evaluate(next_element))
-        self._assertSummaryHasCount(
-            self.evaluate(summary_t), "record_latency", float(2 * (i + 1)))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
+    for i in range(100):
+      self.assertEqual(i, self.evaluate(next_element()))
       self._assertSummaryHasCount(
-          self.evaluate(summary_t), "record_latency", 200.0)
+          self.evaluate(aggregator.get_summary()), "record_latency",
+          float(2 * (i + 1)))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
+    self._assertSummaryHasCount(
+        self.evaluate(aggregator.get_summary()), "record_latency", 200.0)
 
-  @test_util.run_deprecated_v1
   def testMultipleIteratorsSameAggregator(self, dataset_transformation):
     aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
         stats_ops.latency_stats("record_latency"))
     dataset = dataset_transformation(dataset, aggregator)
-    iterator_0 = dataset_ops.make_initializable_iterator(dataset)
-    iterator_1 = dataset_ops.make_initializable_iterator(dataset)
-    next_element = iterator_0.get_next() + iterator_1.get_next()
-    summary_t = aggregator.get_summary()
+    next_element1 = self.getNext(dataset, requires_initialization=True)
+    next_element2 = self.getNext(dataset, requires_initialization=True)
 
-    with self.cached_session() as sess:
-      self.evaluate([iterator_0.initializer, iterator_1.initializer])
-      for i in range(100):
-        self.assertEqual(i * 2, self.evaluate(next_element))
-        self._assertSummaryHasCount(
-            self.evaluate(summary_t), "record_latency", float(2 * (i + 1)))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
+    for i in range(100):
+      self.assertEqual(i * 2, self.evaluate(next_element1() + next_element2()))
       self._assertSummaryHasCount(
-          self.evaluate(summary_t), "record_latency", 200.0)
+          self.evaluate(aggregator.get_summary()), "record_latency",
+          float(2 * (i + 1)))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element1())
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element2())
+    self._assertSummaryHasCount(
+        self.evaluate(aggregator.get_summary()), "record_latency", 200.0)
 
-  @test_util.run_deprecated_v1
   def testMultipleDatasetWithPrefixes(self, dataset_transformation):
     aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
@@ -375,39 +333,38 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
     dataset2 = dataset_ops.Dataset.range(100).apply(
         stats_ops.latency_stats("record_latency"))
     dataset2 = dataset_transformation(dataset2, aggregator, prefix="dataset2")
-    iterator_0 = dataset_ops.make_initializable_iterator(dataset)
-    iterator_1 = dataset_ops.make_initializable_iterator(dataset2)
-    next_element = iterator_0.get_next() + iterator_1.get_next()
-    summary_t = aggregator.get_summary()
+    next_element1 = self.getNext(dataset, requires_initialization=True)
+    next_element2 = self.getNext(dataset2, requires_initialization=True)
 
-    with self.test_session() as sess:
-      self.evaluate([iterator_0.initializer, iterator_1.initializer])
-      for i in range(100):
-        self.assertEqual(i * 2, self.evaluate(next_element))
-        self._assertSummaryHasCount(
-            self.evaluate(summary_t), "dataset1_record_latency", float(i + 1))
-        self._assertSummaryHasCount(
-            self.evaluate(summary_t), "dataset2_record_latency", float(i + 1))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
+    for i in range(100):
+      self.assertEqual(i * 2, self.evaluate(next_element1() + next_element2()))
       self._assertSummaryHasCount(
-          self.evaluate(summary_t), "dataset1_record_latency", 100.0)
+          self.evaluate(aggregator.get_summary()), "dataset1_record_latency",
+          float(i + 1))
       self._assertSummaryHasCount(
-          self.evaluate(summary_t), "dataset2_record_latency", 100.0)
-
-
+          self.evaluate(aggregator.get_summary()), "dataset2_record_latency",
+          float(i + 1))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element1())
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element2())
+    self._assertSummaryHasCount(
+        self.evaluate(aggregator.get_summary()), "dataset1_record_latency",
+        100.0)
+    self._assertSummaryHasCount(
+        self.evaluate(aggregator.get_summary()), "dataset2_record_latency",
+        100.0)
+
+
+@test_util.run_all_in_graph_and_eager_modes
 @parameterized.named_parameters(
-    dict(
-        testcase_name="SetStatsAggregator",
-        dataset_transformation=function_set_stats_aggregator),
-    dict(
-        testcase_name="StatsOptions",
-        dataset_transformation=function_apply_options))
+    ("SetStatsAggregator", function_set_stats_aggregator),
+    ("StatsOptions", function_apply_options)
+)
 class FeatureStatsDatasetTest(
     stats_dataset_test_base.StatsDatasetTestBase,
     reader_dataset_ops_test_base.MakeBatchedFeaturesDatasetTestBase):
 
-  @test_util.run_deprecated_v1
   def testFeaturesStats(self, dataset_transformation):
     num_epochs = 5
     total_records = num_epochs * self._num_records
@@ -436,27 +393,26 @@ class FeatureStatsDatasetTest(
 
     dataset = dataset_transformation(
         dataset_fn(), aggregator, prefix="record_stats")
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    next_element = iterator.get_next()
-    summary_t = aggregator.get_summary()
-
-    with self.test_session() as sess:
-      self.evaluate(iterator.initializer)
-      for _ in range(num_output):
-        self.evaluate(next_element)
 
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
-      self._assertSummaryHasCount(
-          self.evaluate(summary_t), "record_stats_features", total_records)
-      self._assertSummaryHasCount(
-          self.evaluate(summary_t), "record_stats_feature-values",
-          total_records)
-      self._assertSummaryHasSum(
-          self.evaluate(summary_t), "record_stats_features", total_records * 4)
-      self._assertSummaryHasSum(
-          self.evaluate(summary_t), "record_stats_feature-values",
-          self._sum_keywords(1) * num_epochs + 3 * total_records)
+    next_element = self.getNext(dataset, requires_initialization=True)
+
+    for _ in range(num_output):
+      self.evaluate(next_element())
+
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
+    self._assertSummaryHasCount(
+        self.evaluate(aggregator.get_summary()), "record_stats_features",
+        total_records)
+    self._assertSummaryHasCount(
+        self.evaluate(aggregator.get_summary()), "record_stats_feature-values",
+        total_records)
+    self._assertSummaryHasSum(
+        self.evaluate(aggregator.get_summary()), "record_stats_features",
+        total_records * 4)
+    self._assertSummaryHasSum(
+        self.evaluate(aggregator.get_summary()), "record_stats_feature-values",
+        self._sum_keywords(1) * num_epochs + 3 * total_records)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py
index ab1d1c3028a4ee99b99145c7296b7b0d5b8ea6b9..f5a15f4c848c536ac07636469ea1f8b762bd317e 100644
--- a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py
+++ b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py
@@ -22,7 +22,6 @@ import numpy as np
 from tensorflow.core.framework import summary_pb2
 from tensorflow.python.data.experimental.ops import stats_aggregator
 from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
 
 
@@ -94,27 +93,21 @@ class StatsDatasetTestBase(test_base.DatasetTestBase):
     aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_fn()
     dataset = dataset_transformation(dataset, aggregator)
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    next_element = iterator.get_next()
-    summary_t = aggregator.get_summary()
+    next_element = self.getNext(dataset, requires_initialization=True)
 
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer)
-      for i in range(num_output):
-        next_ = sess.run(next_element)
-        if check_elements:
-          self.assertAllEqual(np.array([i] * i, dtype=np.int64), next_)
-        summary_str = sess.run(summary_t)
-        if function_processing_time:
-          self._assertSummaryHasCountMoreOrEqualGeneralisedTag(
-              summary_str, "::execution_time", float(i + 1))
-        self._assertSummaryContains(summary_str,
-                                    dataset_name + "::num_parallel_calls")
-        self._assertSummaryContains(summary_str,
-                                    dataset_name + "::active_parallel_calls")
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+    for i in range(num_output):
+      next_ = self.evaluate(next_element())
+      if check_elements:
+        self.assertAllEqual(np.array([i] * i, dtype=np.int64), next_)
+      summary_str = self.evaluate(aggregator.get_summary())
       if function_processing_time:
-        summary_str = sess.run(summary_t)
         self._assertSummaryHasCountMoreOrEqualGeneralisedTag(
-            summary_str, "::execution_time", float(num_output))
+            summary_str, "::execution_time", float(i + 1))
+      self._assertSummaryContains(summary_str,
+                                  dataset_name + "::thread_utilization")
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
+    if function_processing_time:
+      summary_str = self.evaluate(aggregator.get_summary())
+      self._assertSummaryHasCountMoreOrEqualGeneralisedTag(
+          summary_str, "::execution_time", float(num_output))
diff --git a/tensorflow/python/data/experimental/kernel_tests/take_while_test.py b/tensorflow/python/data/experimental/kernel_tests/take_while_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..25ad6c7750e75ab92f8bb81c31ad4d60fea9a871
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/take_while_test.py
@@ -0,0 +1,103 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.experimental.take_while()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.data.experimental.ops import take_while_ops
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class TakeWhileTest(test_base.DatasetTestBase, parameterized.TestCase):
+
+  @parameterized.parameters((14, 2), (15, 2), (100, 3))
+  def testTakeWhileDataset(self, num_elements, window_size):
+
+    def _predicate_func(elem):
+      return array_ops.shape(elem)[0] > (window_size - 1)
+
+    take_while = take_while_ops.take_while(_predicate_func)
+
+    dataset = dataset_ops.Dataset.range(num_elements).batch(window_size)
+    dataset = dataset.apply(take_while).flat_map(
+        dataset_ops.Dataset.from_tensor_slices)
+
+    expected_num_elements = int(num_elements / window_size) * window_size
+    self.assertDatasetProduces(dataset, np.arange(expected_num_elements))
+
+  @parameterized.parameters((10, 2, False), (16, 7, False), (100, 99, False),
+                            (100, 101, True), (0, 1, True))
+  def testTakeWhileDatasetRange(self, num_elements, upper_bound, out_of_bounds):
+    dataset = dataset_ops.Dataset.range(num_elements).apply(
+        take_while_ops.take_while(lambda x: x < upper_bound))
+
+    if out_of_bounds:
+      with self.assertRaises(errors.OutOfRangeError):
+        self.assertDatasetProduces(dataset, np.arange(upper_bound))
+
+    else:
+      self.assertDatasetProduces(dataset, np.arange(upper_bound))
+
+  def testTakeWhileDatasetString(self):
+
+    def not_equal(string):
+      return lambda x: math_ops.not_equal(x, constant_op.constant(string))
+
+    string = ["this", "is", "the", "test", "for", "strings"]
+    dataset = dataset_ops.Dataset.from_tensor_slices(string).apply(
+        take_while_ops.take_while(not_equal("test")))
+
+    next_element = self.getNext(dataset)
+    self.assertEqual(b"this", self.evaluate(next_element()))
+    self.assertEqual(b"is", self.evaluate(next_element()))
+    self.assertEqual(b"the", self.evaluate(next_element()))
+
+    with self.assertRaises(errors.OutOfRangeError):
+      self.assertEqual(b"test", self.evaluate(next_element()))
+
+  @parameterized.parameters((5, 3), (10, 0), (100, 5), (8, 7))
+  def testTakewhileDatasetShortCircuit(self, size, index):
+
+    def _predicate_func(data_elem):
+      return data_elem
+
+    boolean_array = [True] * size
+    boolean_array[index] = False
+    dataset = dataset_ops.Dataset.from_tensor_slices(boolean_array).apply(
+        take_while_ops.take_while(_predicate_func))
+
+    next_element = self.getNext(dataset)
+
+    for _ in range(index):
+      self.assertTrue(self.evaluate(next_element()))
+
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/tf_record_writer_test.py b/tensorflow/python/data/experimental/kernel_tests/tf_record_writer_test.py
index 8fd0ad50c4483ab321f391d403a2c8bf6ab48b7d..783b2e6e22ae618f255673011d72201c993e0a85 100644
--- a/tensorflow/python/data/experimental/kernel_tests/tf_record_writer_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/tf_record_writer_test.py
@@ -23,26 +23,25 @@ from tensorflow.python.data.experimental.ops import writers
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import readers
-from tensorflow.python.framework import dtypes
+from tensorflow.python.eager import function
+from tensorflow.python.framework import test_util
 from tensorflow.python.lib.io import python_io
 from tensorflow.python.lib.io import tf_record
-from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 from tensorflow.python.util import compat
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class TFRecordWriterTest(test_base.DatasetTestBase):
 
   def setUp(self):
     super(TFRecordWriterTest, self).setUp()
     self._num_records = 7
-    self.filename = array_ops.placeholder(dtypes.string, shape=[])
-    self.compression_type = array_ops.placeholder_with_default("", shape=[])
 
-    input_dataset = readers.TFRecordDataset([self.filename],
-                                            self.compression_type)
-    self.writer = writers.TFRecordWriter(
-        self._outputFilename(), self.compression_type).write(input_dataset)
+  def writer_fn(self, filename, compression_type=""):
+    input_dataset = readers.TFRecordDataset([filename], compression_type)
+    return writers.TFRecordWriter(self._outputFilename(),
+                                  compression_type).write(input_dataset)
 
   def _record(self, i):
     return compat.as_bytes("Record %d" % (i))
@@ -62,56 +61,53 @@ class TFRecordWriterTest(test_base.DatasetTestBase):
     return os.path.join(self.get_temp_dir(), "tf_record.out.txt")
 
   def testWrite(self):
-    with self.cached_session() as sess:
-      sess.run(
-          self.writer, feed_dict={
-              self.filename: self._createFile(),
-          })
+    self.evaluate(self.writer_fn(self._createFile()))
     for i, r in enumerate(tf_record.tf_record_iterator(self._outputFilename())):
       self.assertAllEqual(self._record(i), r)
 
   def testWriteZLIB(self):
     options = tf_record.TFRecordOptions(tf_record.TFRecordCompressionType.ZLIB)
-    with self.cached_session() as sess:
-      sess.run(
-          self.writer,
-          feed_dict={
-              self.filename: self._createFile(options),
-              self.compression_type: "ZLIB",
-          })
+    self.evaluate(
+        self.writer_fn(self._createFile(options), compression_type="ZLIB"))
     for i, r in enumerate(
         tf_record.tf_record_iterator(self._outputFilename(), options=options)):
       self.assertAllEqual(self._record(i), r)
 
   def testWriteGZIP(self):
     options = tf_record.TFRecordOptions(tf_record.TFRecordCompressionType.GZIP)
-    with self.cached_session() as sess:
-      sess.run(
-          self.writer,
-          feed_dict={
-              self.filename: self._createFile(options),
-              self.compression_type: "GZIP",
-          })
+    self.evaluate(
+        self.writer_fn(self._createFile(options), compression_type="GZIP"))
     for i, r in enumerate(
         tf_record.tf_record_iterator(self._outputFilename(), options=options)):
       self.assertAllEqual(self._record(i), r)
 
   def testFailDataset(self):
     with self.assertRaises(TypeError):
-      writers.TFRecordWriter(self._outputFilename(),
-                             self.compression_type).write("whoops")
+      writers.TFRecordWriter(self._outputFilename(), "").write("whoops")
 
   def testFailDType(self):
     input_dataset = dataset_ops.Dataset.from_tensors(10)
     with self.assertRaises(TypeError):
-      writers.TFRecordWriter(self._outputFilename(),
-                             self.compression_type).write(input_dataset)
+      writers.TFRecordWriter(self._outputFilename(), "").write(input_dataset)
 
   def testFailShape(self):
     input_dataset = dataset_ops.Dataset.from_tensors([["hello"], ["world"]])
     with self.assertRaises(TypeError):
-      writers.TFRecordWriter(self._outputFilename(),
-                             self.compression_type).write(input_dataset)
+      writers.TFRecordWriter(self._outputFilename(), "").write(input_dataset)
+
+  def testSideEffect(self):
+    def writer_fn():
+      input_dataset = readers.TFRecordDataset(self._createFile())
+      return writers.TFRecordWriter(self._outputFilename()).write(input_dataset)
+
+    @function.defun
+    def fn():
+      _ = writer_fn()
+      return "hello"
+
+    self.assertEqual(self.evaluate(fn()), b"hello")
+    for i, r in enumerate(tf_record.tf_record_iterator(self._outputFilename())):
+      self.assertAllEqual(self._record(i), r)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/unbatch_test.py b/tensorflow/python/data/experimental/kernel_tests/unbatch_test.py
index cef5e8d269ce8d4db861b97efc1a75a1dbf2ff8e..613fe0da6b3d3db81a969a3cea261f238951fab4 100644
--- a/tensorflow/python/data/experimental/kernel_tests/unbatch_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/unbatch_test.py
@@ -36,24 +36,14 @@ from tensorflow.python.platform import test
 from tensorflow.python.util import compat
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
 
-  @test_util.run_deprecated_v1
   def testUnbatchWithUnknownRankInput(self):
-    placeholder = array_ops.placeholder(dtypes.int32)
-    dataset = dataset_ops.Dataset.from_tensors(placeholder).apply(
-        batching.unbatch())
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    next_elem = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer, feed_dict={placeholder: [0, 1, 2, 3]})
-      for i in range(4):
-        self.assertEqual(i, self.evaluate(next_elem))
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_elem)
+    dataset = dataset_ops.Dataset.from_tensors([0, 1, 2,
+                                                3]).apply(batching.unbatch())
+    self.assertDatasetProduces(dataset, range(4))
 
-  @test_util.run_deprecated_v1
   def testUnbatchScalarDataset(self):
     data = tuple([math_ops.range(10) for _ in range(3)])
     data = dataset_ops.Dataset.from_tensor_slices(data)
@@ -63,17 +53,8 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     data = data.apply(batching.unbatch())
     self.assertEqual(expected_types, data.output_types)
 
-    iterator = dataset_ops.make_one_shot_iterator(data)
-    op = iterator.get_next()
+    self.assertDatasetProduces(data, [(i,) * 3 for i in range(10)])
 
-    with self.cached_session() as sess:
-      for i in range(10):
-        self.assertEqual((i,) * 3, self.evaluate(op))
-
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(op)
-
-  @test_util.run_deprecated_v1
   def testUnbatchDatasetWithStrings(self):
     data = tuple([math_ops.range(10) for _ in range(3)])
     data = dataset_ops.Dataset.from_tensor_slices(data)
@@ -84,17 +65,9 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     data = data.apply(batching.unbatch())
     self.assertEqual(expected_types, data.output_types)
 
-    iterator = dataset_ops.make_one_shot_iterator(data)
-    op = iterator.get_next()
+    self.assertDatasetProduces(
+        data, [(i, compat.as_bytes(str(i)), i) for i in range(10)])
 
-    with self.cached_session() as sess:
-      for i in range(10):
-        self.assertEqual((i, compat.as_bytes(str(i)), i), self.evaluate(op))
-
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(op)
-
-  @test_util.run_deprecated_v1
   def testUnbatchDatasetWithSparseTensor(self):
     st = sparse_tensor.SparseTensorValue(
         indices=[[i, i] for i in range(10)],
@@ -104,19 +77,11 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     data = data.apply(batching.unbatch())
     data = data.batch(5)
     data = data.apply(batching.unbatch())
-    iterator = dataset_ops.make_one_shot_iterator(data)
-    next_element = iterator.get_next()
+    expected_output = [
+        sparse_tensor.SparseTensorValue([[i]], [i], [10]) for i in range(10)
+    ]
+    self.assertDatasetProduces(data, expected_output=expected_output)
 
-    with self.cached_session() as sess:
-      for i in range(10):
-        st_row = self.evaluate(next_element)
-        self.assertEqual([i], st_row.indices)
-        self.assertEqual([i], st_row.values)
-        self.assertEqual([10], st_row.dense_shape)
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
-
-  @test_util.run_deprecated_v1
   def testUnbatchDatasetWithDenseAndSparseTensor(self):
     st = sparse_tensor.SparseTensorValue(
         indices=[[i, i] for i in range(10)],
@@ -126,20 +91,10 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     data = data.apply(batching.unbatch())
     data = data.batch(5)
     data = data.apply(batching.unbatch())
-    iterator = dataset_ops.make_one_shot_iterator(data)
-    next_element = iterator.get_next()
+    expected_output = [(i, sparse_tensor.SparseTensorValue([[i]], [i], [10]))
+                       for i in range(10)]
+    self.assertDatasetProduces(data, expected_output=expected_output)
 
-    with self.cached_session() as sess:
-      for i in range(10):
-        dense_elem, st_row = self.evaluate(next_element)
-        self.assertEqual(i, dense_elem)
-        self.assertEqual([i], st_row.indices)
-        self.assertEqual([i], st_row.values)
-        self.assertEqual([10], st_row.dense_shape)
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
-
-  @test_util.run_deprecated_v1
   def testUnbatchSingleElementTupleDataset(self):
     data = tuple([(math_ops.range(10),) for _ in range(3)])
     data = dataset_ops.Dataset.from_tensor_slices(data)
@@ -149,17 +104,8 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     data = data.apply(batching.unbatch())
     self.assertEqual(expected_types, data.output_types)
 
-    iterator = dataset_ops.make_one_shot_iterator(data)
-    op = iterator.get_next()
+    self.assertDatasetProduces(data, [((i,),) * 3 for i in range(10)])
 
-    with self.cached_session() as sess:
-      for i in range(10):
-        self.assertEqual(((i,),) * 3, self.evaluate(op))
-
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(op)
-
-  @test_util.run_deprecated_v1
   def testUnbatchMultiElementTupleDataset(self):
     data = tuple([(math_ops.range(10 * i, 10 * i + 10),
                    array_ops.fill([10], "hi")) for i in range(3)])
@@ -170,29 +116,16 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     data = data.apply(batching.unbatch())
     self.assertAllEqual(expected_types, data.output_types)
 
-    iterator = dataset_ops.make_one_shot_iterator(data)
-    op = iterator.get_next()
+    self.assertDatasetProduces(
+        data,
+        [((i, b"hi"), (10 + i, b"hi"), (20 + i, b"hi")) for i in range(10)])
 
-    with self.cached_session() as sess:
-      for i in range(10):
-        self.assertEqual(((i, b"hi"), (10 + i, b"hi"), (20 + i, b"hi")),
-                         self.evaluate(op))
-
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(op)
-
-  @test_util.run_deprecated_v1
   def testUnbatchEmpty(self):
     data = dataset_ops.Dataset.from_tensors(
         (constant_op.constant([]), constant_op.constant([], shape=[0, 4]),
          constant_op.constant([], shape=[0, 4, 0])))
     data = data.apply(batching.unbatch())
-    iterator = dataset_ops.make_one_shot_iterator(data)
-    next_element = iterator.get_next()
-
-    with self.cached_session() as sess:
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(next_element)
+    self.assertDatasetProduces(data, [])
 
   def testUnbatchStaticShapeMismatch(self):
     data = dataset_ops.Dataset.from_tensors((np.arange(7), np.arange(8),
@@ -200,8 +133,9 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     with self.assertRaises(ValueError):
       data.apply(batching.unbatch())
 
+  # Note: dynamic shape mismatch is graph specific test.
   @test_util.run_deprecated_v1
-  def testUnbatchDynamicShapeMismatch(self):
+  def testSkipEagerUnbatchDynamicShapeMismatch(self):
     ph1 = array_ops.placeholder(dtypes.int32, shape=[None])
     ph2 = array_ops.placeholder(dtypes.int32, shape=None)
     data = dataset_ops.Dataset.from_tensors((ph1, ph2))
diff --git a/tensorflow/python/data/experimental/kernel_tests/unique_test.py b/tensorflow/python/data/experimental/kernel_tests/unique_test.py
index 1d9941d7f4d0729e5e0f62ebbac80d0d4d385f59..42d76a2eb3013625e7807d1f50dd19809a7cd3e4 100644
--- a/tensorflow/python/data/experimental/kernel_tests/unique_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/unique_test.py
@@ -21,12 +21,12 @@ from tensorflow.python.data.experimental.ops import unique
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 from tensorflow.python.util import compat
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class UniqueTest(test_base.DatasetTestBase):
 
   def _testSimpleHelper(self, dtype, test_cases):
@@ -44,19 +44,13 @@ class UniqueTest(test_base.DatasetTestBase):
     current_test_case = []
     dataset = dataset_ops.Dataset.from_generator(lambda: current_test_case,
                                                  dtype).apply(unique.unique())
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    next_element = iterator.get_next()
 
-    with self.cached_session() as sess:
-      for test_case, expected in test_cases:
-        current_test_case = test_case
-        self.evaluate(iterator.initializer)
-        for element in expected:
-          if dtype == dtypes.string:
-            element = compat.as_bytes(element)
-          self.assertAllEqual(element, self.evaluate(next_element))
-        with self.assertRaises(errors.OutOfRangeError):
-          self.evaluate(next_element)
+    for test_case, expected in test_cases:
+      current_test_case = test_case
+      self.assertDatasetProduces(dataset, [
+          compat.as_bytes(element) if dtype == dtypes.string else element
+          for element in expected
+      ])
 
   @test_util.run_deprecated_v1
   def testSimpleInt(self):
diff --git a/tensorflow/python/data/experimental/kernel_tests/wrap_unwrap_test.py b/tensorflow/python/data/experimental/kernel_tests/wrap_unwrap_test.py
index 9c734b65e056df954a8597ab6f23489353cc057b..e6e77575a6f98b5becc9fe4ceb3126e22403b471 100644
--- a/tensorflow/python/data/experimental/kernel_tests/wrap_unwrap_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/wrap_unwrap_test.py
@@ -20,33 +20,32 @@ from __future__ import print_function
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_dataset_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class WrapDatasetVariantTest(test_base.DatasetTestBase):
 
   def testBasic(self):
     ds = dataset_ops.Dataset.range(100)
-    ds_variant = ds._as_variant_tensor()  # pylint: disable=protected-access
+    ds_variant = ds._variant_tensor  # pylint: disable=protected-access
 
     wrapped_variant = gen_dataset_ops.wrap_dataset_variant(ds_variant)
     unwrapped_variant = gen_dataset_ops.unwrap_dataset_variant(wrapped_variant)
 
     variant_ds = dataset_ops._VariantDataset(unwrapped_variant,
                                              ds._element_structure)
-    iterator = dataset_ops.make_initializable_iterator(variant_ds)
-    get_next = iterator.get_next()
-
-    with self.cached_session():
-      self.evaluate(iterator.initializer)
-      for i in range(100):
-        self.assertEqual(i, self.evaluate(get_next))
+    get_next = self.getNext(variant_ds, requires_initialization=True)
+    for i in range(100):
+      self.assertEqual(i, self.evaluate(get_next()))
 
-  def testGPU(self):
+  @test_util.run_v1_only("b/123901304")
+  def testSkipEagerGPU(self):
     ds = dataset_ops.Dataset.range(100)
-    ds_variant = ds._as_variant_tensor()  # pylint: disable=protected-access
+    ds_variant = ds._variant_tensor  # pylint: disable=protected-access
     wrapped_variant = gen_dataset_ops.wrap_dataset_variant(ds_variant)
 
     with ops.device("/gpu:0"):
diff --git a/tensorflow/python/data/experimental/ops/BUILD b/tensorflow/python/data/experimental/ops/BUILD
index 60c20e0bcf2d875a15ffcc4c42d10cb6e0cc25ea..56bf59344f8881d96525c197268ad9dac988166a 100644
--- a/tensorflow/python/data/experimental/ops/BUILD
+++ b/tensorflow/python/data/experimental/ops/BUILD
@@ -354,6 +354,19 @@ py_library(
     ],
 )
 
+py_library(
+    name = "take_while_ops",
+    srcs = ["take_while_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:experimental_dataset_ops_gen",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:function",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
 py_library(
     name = "threading_options",
     srcs = ["threading_options.py"],
@@ -454,6 +467,7 @@ py_library(
         ":shuffle_ops",
         ":sleep",
         ":stats_ops",
+        ":take_while_ops",
         ":threadpool",
         ":unique",
         ":writers",
diff --git a/tensorflow/python/data/experimental/ops/batching.py b/tensorflow/python/data/experimental/ops/batching.py
index 29df98f4ea4c90d80f3518684febacc101ec2ba5..983f7640b895639195ac8f6ff91784023c226165 100644
--- a/tensorflow/python/data/experimental/ops/batching.py
+++ b/tensorflow/python/data/experimental/ops/batching.py
@@ -27,6 +27,7 @@ from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import structure
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
@@ -38,6 +39,7 @@ from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import sparse_ops
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -352,7 +354,6 @@ class _UnbatchDataset(dataset_ops.UnaryDataset):
 
   def __init__(self, input_dataset):
     """See `unbatch()` for more details."""
-    super(_UnbatchDataset, self).__init__(input_dataset)
     flat_shapes = nest.flatten(input_dataset.output_shapes)
     if any(s.ndims == 0 for s in flat_shapes):
       raise ValueError("Cannot unbatch an input with scalar components.")
@@ -370,10 +371,10 @@ class _UnbatchDataset(dataset_ops.UnaryDataset):
         nest.map_structure(lambda s: s[1:], input_dataset.output_shapes),
         input_dataset.output_classes)
 
-  def _as_variant_tensor(self):
-    return ged_ops.experimental_unbatch_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+    variant_tensor = ged_ops.experimental_unbatch_dataset(
+        self._input_dataset._variant_tensor,  # pylint: disable=protected-access
         **dataset_ops.flat_structure(self))
+    super(_UnbatchDataset, self).__init__(input_dataset, variant_tensor)
 
   @property
   def _element_structure(self):
@@ -440,7 +441,6 @@ class _DenseToSparseBatchDataset(dataset_ops.UnaryDataset):
 
   def __init__(self, input_dataset, batch_size, row_shape):
     """See `Dataset.dense_to_sparse_batch()` for more details."""
-    super(_DenseToSparseBatchDataset, self).__init__(input_dataset)
     if not isinstance(input_dataset.output_types, dtypes.DType):
       raise TypeError("DenseToSparseDataset requires an input whose elements "
                       "have a single component, whereas the input has %r." %
@@ -452,12 +452,13 @@ class _DenseToSparseBatchDataset(dataset_ops.UnaryDataset):
         input_dataset.output_types,
         tensor_shape.vector(None).concatenate(self._row_shape))
 
-  def _as_variant_tensor(self):
-    return ged_ops.experimental_dense_to_sparse_batch_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+    variant_tensor = ged_ops.experimental_dense_to_sparse_batch_dataset(
+        self._input_dataset._variant_tensor,  # pylint: disable=protected-access
         self._batch_size,
         row_shape=convert.partial_shape_to_tensor(self._row_shape),
         **dataset_ops.flat_structure(self))
+    super(_DenseToSparseBatchDataset, self).__init__(input_dataset,
+                                                     variant_tensor)
 
   @property
   def _element_structure(self):
@@ -499,7 +500,6 @@ class _RestructuredDataset(dataset_ops.UnaryDataset):
       ValueError: If either `output_types` or `output_shapes` is not compatible
         with the structure of `dataset`.
     """
-    super(_RestructuredDataset, self).__init__(dataset)
     self._input_dataset = dataset
 
     if not allow_unsafe_cast:
@@ -539,9 +539,8 @@ class _RestructuredDataset(dataset_ops.UnaryDataset):
 
     self._structure = structure.convert_legacy_structure(
         output_types, output_shapes, output_classes)
-
-  def _as_variant_tensor(self):
-    return self._input_dataset._as_variant_tensor()  # pylint: disable=protected-access
+    variant_tensor = self._input_dataset._variant_tensor  # pylint: disable=protected-access
+    super(_RestructuredDataset, self).__init__(dataset, variant_tensor)
 
   @property
   def _element_structure(self):
@@ -552,12 +551,15 @@ class _MapAndBatchDataset(dataset_ops.UnaryDataset):
   """A `Dataset` that maps a function over a batch of elements."""
 
   def __init__(self, input_dataset, map_func, batch_size, num_parallel_calls,
-               drop_remainder):
+               drop_remainder, use_legacy_function=False):
     """See `Dataset.map()` for details."""
-    super(_MapAndBatchDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
+
     self._map_func = dataset_ops.StructuredFunctionWrapper(
-        map_func, "tf.data.experimental.map_and_batch()", dataset=input_dataset)
+        map_func,
+        "tf.data.experimental.map_and_batch()",
+        dataset=input_dataset,
+        use_legacy_function=use_legacy_function)
     self._batch_size_t = ops.convert_to_tensor(
         batch_size, dtype=dtypes.int64, name="batch_size")
     self._num_parallel_calls_t = ops.convert_to_tensor(
@@ -573,14 +575,8 @@ class _MapAndBatchDataset(dataset_ops.UnaryDataset):
           tensor_util.constant_value(self._batch_size_t))
     else:
       self._structure = self._map_func.output_structure._batch(None)  # pylint: disable=protected-access
-
-  def _functions(self):
-    return [self._map_func]
-
-  def _as_variant_tensor(self):
-    # pylint: disable=protected-access
-    return ged_ops.experimental_map_and_batch_dataset(
-        self._input_dataset._as_variant_tensor(),
+    variant_tensor = ged_ops.experimental_map_and_batch_dataset(
+        self._input_dataset._variant_tensor,  # pylint: disable=protected-access
         self._map_func.function.captured_inputs,
         f=self._map_func.function,
         batch_size=self._batch_size_t,
@@ -588,12 +584,72 @@ class _MapAndBatchDataset(dataset_ops.UnaryDataset):
         drop_remainder=self._drop_remainder_t,
         preserve_cardinality=True,
         **dataset_ops.flat_structure(self))
+    super(_MapAndBatchDataset, self).__init__(input_dataset, variant_tensor)
+
+  def _functions(self):
+    return [self._map_func]
 
   @property
   def _element_structure(self):
     return self._structure
 
 
+@deprecation.deprecated(None, "Use `tf.data.experimental.map_and_batch()")
+@tf_export(v1=["data.experimental.map_and_batch_with_legacy_function"])
+def map_and_batch_with_legacy_function(map_func,
+                                       batch_size,
+                                       num_parallel_batches=None,
+                                       drop_remainder=False,
+                                       num_parallel_calls=None):
+  """Fused implementation of `map` and `batch`.
+
+  NOTE: This is an escape hatch for existing uses of `map_and_batch` that do not
+  work with V2 functions. New uses are strongly discouraged and existing uses
+  should migrate to `map_and_batch` as this method will not be removed in V2.
+
+  Args:
+    map_func: A function mapping a nested structure of tensors to another
+      nested structure of tensors.
+    batch_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
+      consecutive elements of this dataset to combine in a single batch.
+    num_parallel_batches: (Optional.) A `tf.int64` scalar `tf.Tensor`,
+      representing the number of batches to create in parallel. On one hand,
+      higher values can help mitigate the effect of stragglers. On the other
+      hand, higher values can increase contention if CPU is scarce.
+    drop_remainder: (Optional.) A `tf.bool` scalar `tf.Tensor`, representing
+      whether the last batch should be dropped in case its size is smaller than
+      desired; the default behavior is not to drop the smaller batch.
+    num_parallel_calls: (Optional.) A `tf.int32` scalar `tf.Tensor`,
+      representing the number of elements to process in parallel. If not
+      specified, `batch_size * num_parallel_batches` elements will be processed
+      in parallel. If the value `tf.data.experimental.AUTOTUNE` is used, then
+      the number of parallel calls is set dynamically based on available CPU.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    `tf.data.Dataset.apply`.
+
+  Raises:
+    ValueError: If both `num_parallel_batches` and `num_parallel_calls` are
+      specified.
+  """
+
+  if num_parallel_batches is None and num_parallel_calls is None:
+    num_parallel_calls = batch_size
+  elif num_parallel_batches is not None and num_parallel_calls is None:
+    num_parallel_calls = batch_size * num_parallel_batches
+  elif num_parallel_batches is not None and num_parallel_calls is not None:
+    raise ValueError("The `num_parallel_batches` and `num_parallel_calls` "
+                     "arguments are mutually exclusive.")
+
+  def _apply_fn(dataset):
+    return _MapAndBatchDataset(dataset, map_func, batch_size,
+                               num_parallel_calls, drop_remainder,
+                               use_legacy_function=True)
+
+  return _apply_fn
+
+
 @tf_export("data.experimental.map_and_batch")
 def map_and_batch(map_func,
                   batch_size,
@@ -650,3 +706,40 @@ def map_and_batch(map_func,
                                num_parallel_calls, drop_remainder)
 
   return _apply_fn
+
+
+class _RebatchDataset(dataset_ops.UnaryDataset):
+  """A `Dataset` that divides the batch size by `num_workers`."""
+
+  def __init__(self, input_dataset, num_workers):
+    self._input_dataset = input_dataset
+
+    def recalculate_output_shapes(output_shapes):
+      """Recalculates the output_shapes after dividing it by num_workers."""
+      if len(output_shapes) < 1:
+        raise ValueError("Input shape should have at least one dimension.")
+      if (tensor_shape.dimension_value(output_shapes[0]) and
+          tensor_shape.dimension_value(output_shapes[0]) % num_workers != 0):
+        raise errors.InvalidArgumentError(
+            None, None,
+            "First dim of input shape: %d is not divisible by num_workers: %d" %
+            (output_shapes[0], num_workers))
+      output_dims = [d for d in output_shapes.dims]
+      output_dims[0] = output_dims[0] // num_workers
+      return tensor_shape.TensorShape(output_dims)
+
+    output_shapes = nest.map_structure(recalculate_output_shapes,
+                                       input_dataset.output_shapes)
+
+    self._structure = structure.convert_legacy_structure(
+        self._input_dataset.output_types, output_shapes,
+        self._input_dataset.output_classes)
+    variant_tensor = ged_ops.experimental_rebatch_dataset(
+        self._input_dataset._variant_tensor,  # pylint: disable=protected-access
+        num_workers=num_workers,
+        **dataset_ops.flat_structure(self))
+    super(_RebatchDataset, self).__init__(input_dataset, variant_tensor)
+
+  @property
+  def _element_structure(self):
+    return self._structure
diff --git a/tensorflow/python/data/experimental/ops/cardinality.py b/tensorflow/python/data/experimental/ops/cardinality.py
index 9cf0a8801e8339f233eb61c8e0b1223b8b94358b..0d596f68dd544f6c21143b4e8d805bca4110306d 100644
--- a/tensorflow/python/data/experimental/ops/cardinality.py
+++ b/tensorflow/python/data/experimental/ops/cardinality.py
@@ -47,4 +47,4 @@ def cardinality(dataset):
     the cardinality is infinite or unknown, the operation returns the named
     constant `INFINITE_CARDINALITY` and `UNKNOWN_CARDINALITY` respectively.
   """
-  return ged_ops.experimental_dataset_cardinality(dataset._as_variant_tensor())  # pylint: disable=protected-access
+  return ged_ops.experimental_dataset_cardinality(dataset._variant_tensor)  # pylint: disable=protected-access
diff --git a/tensorflow/python/data/experimental/ops/error_ops.py b/tensorflow/python/data/experimental/ops/error_ops.py
index 879b13ce092f20c2a6cfc911ba4c6e11992e23a8..eab29c7d88fc6f6870091bb81662ad23544a7c00 100644
--- a/tensorflow/python/data/experimental/ops/error_ops.py
+++ b/tensorflow/python/data/experimental/ops/error_ops.py
@@ -57,10 +57,9 @@ class _IgnoreErrorsDataset(dataset_ops.UnaryUnchangedStructureDataset):
 
   def __init__(self, input_dataset):
     """See `Dataset.ignore_errors()` for details."""
-    super(_IgnoreErrorsDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
-
-  def _as_variant_tensor(self):
-    return gen_experimental_dataset_ops.experimental_ignore_errors_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        **dataset_ops.flat_structure(self))
+    variant_tensor = (
+        gen_experimental_dataset_ops.experimental_ignore_errors_dataset(
+            self._input_dataset._variant_tensor,  # pylint: disable=protected-access
+            **dataset_ops.flat_structure(self)))
+    super(_IgnoreErrorsDataset, self).__init__(input_dataset, variant_tensor)
diff --git a/tensorflow/python/data/experimental/ops/get_single_element.py b/tensorflow/python/data/experimental/ops/get_single_element.py
index d649a0701270c55d399af140f5e2bae79484fec2..46c215d6850eeea4ceed880144bc1b4d97fe714c 100644
--- a/tensorflow/python/data/experimental/ops/get_single_element.py
+++ b/tensorflow/python/data/experimental/ops/get_single_element.py
@@ -64,5 +64,4 @@ def get_single_element(dataset):
   # pylint: disable=protected-access
   return dataset._element_structure._from_compatible_tensor_list(
       gen_dataset_ops.dataset_to_single_element(
-          dataset._as_variant_tensor(),
-          **dataset_ops.flat_structure(dataset)))
+          dataset._variant_tensor, **dataset_ops.flat_structure(dataset)))
diff --git a/tensorflow/python/data/experimental/ops/grouping.py b/tensorflow/python/data/experimental/ops/grouping.py
index ef6b232429b872016842bcf513a851445b4d8a5e..4e83acf6bbadc065adae1a6fe3da81bc6ff19d0e 100644
--- a/tensorflow/python/data/experimental/ops/grouping.py
+++ b/tensorflow/python/data/experimental/ops/grouping.py
@@ -130,7 +130,8 @@ def bucket_by_sequence_length(element_length_func,
                               padded_shapes=None,
                               padding_values=None,
                               pad_to_bucket_boundary=False,
-                              no_padding=False):
+                              no_padding=False,
+                              drop_remainder=False):
   """A transformation that buckets elements in a `Dataset` by length.
 
   Elements of the `Dataset` are grouped together by length and then are padded
@@ -160,6 +161,10 @@ def bucket_by_sequence_length(element_length_func,
       any elements with length longer than `max(bucket_boundaries)`.
     no_padding: `bool`, indicates whether to pad the batch features (features
       need to be either of type `tf.SparseTensor` or of same shape).
+    drop_remainder: (Optional.) A `tf.bool` scalar `tf.Tensor`, representing
+      whether the last batch should be dropped in the case it has fewer than
+      `batch_size` elements; the default behavior is not to drop the smaller
+      batch.
 
   Returns:
     A `Dataset` transformation function, which can be passed to
@@ -209,7 +214,7 @@ def bucket_by_sequence_length(element_length_func,
       """Batch elements in dataset."""
       batch_size = window_size_fn(bucket_id)
       if no_padding:
-        return grouped_dataset.batch(batch_size)
+        return grouped_dataset.batch(batch_size, drop_remainder=drop_remainder)
       none_filler = None
       if pad_to_bucket_boundary:
         err_msg = ("When pad_to_bucket_boundary=True, elements must have "
@@ -227,7 +232,8 @@ def bucket_by_sequence_length(element_length_func,
       shapes = make_padded_shapes(
           padded_shapes or grouped_dataset.output_shapes,
           none_filler=none_filler)
-      return grouped_dataset.padded_batch(batch_size, shapes, padding_values)
+      return grouped_dataset.padded_batch(
+          batch_size, shapes, padding_values, drop_remainder=drop_remainder)
 
     def _apply_fn(dataset):
       return dataset.apply(
@@ -242,14 +248,23 @@ class _GroupByReducerDataset(dataset_ops.UnaryDataset):
 
   def __init__(self, input_dataset, key_func, reducer):
     """See `group_by_reducer()` for details."""
-    super(_GroupByReducerDataset, self).__init__(input_dataset)
-
     self._input_dataset = input_dataset
-
     self._make_key_func(key_func, input_dataset)
     self._make_init_func(reducer.init_func)
     self._make_reduce_func(reducer.reduce_func, input_dataset)
     self._make_finalize_func(reducer.finalize_func)
+    variant_tensor = ged_ops.experimental_group_by_reducer_dataset(
+        self._input_dataset._variant_tensor,  # pylint: disable=protected-access
+        self._key_func.function.captured_inputs,
+        self._init_func.function.captured_inputs,
+        self._reduce_func.function.captured_inputs,
+        self._finalize_func.function.captured_inputs,
+        key_func=self._key_func.function,
+        init_func=self._init_func.function,
+        reduce_func=self._reduce_func.function,
+        finalize_func=self._finalize_func.function,
+        **dataset_ops.flat_structure(self))
+    super(_GroupByReducerDataset, self).__init__(input_dataset, variant_tensor)
 
   def _make_key_func(self, key_func, input_dataset):
     """Make wrapping defun for key_func."""
@@ -347,19 +362,6 @@ class _GroupByReducerDataset(dataset_ops.UnaryDataset):
         self._key_func, self._init_func, self._reduce_func, self._finalize_func
     ]
 
-  def _as_variant_tensor(self):
-    return ged_ops.experimental_group_by_reducer_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        self._key_func.function.captured_inputs,
-        self._init_func.function.captured_inputs,
-        self._reduce_func.function.captured_inputs,
-        self._finalize_func.function.captured_inputs,
-        key_func=self._key_func.function,
-        init_func=self._init_func.function,
-        reduce_func=self._reduce_func.function,
-        finalize_func=self._finalize_func.function,
-        **dataset_ops.flat_structure(self))
-
   def _transformation_name(self):
     return "tf.data.experimental.group_by_reducer()"
 
@@ -369,13 +371,20 @@ class _GroupByWindowDataset(dataset_ops.UnaryDataset):
 
   def __init__(self, input_dataset, key_func, reduce_func, window_size_func):
     """See `group_by_window()` for details."""
-    super(_GroupByWindowDataset, self).__init__(input_dataset)
-
     self._input_dataset = input_dataset
-
     self._make_key_func(key_func, input_dataset)
     self._make_reduce_func(reduce_func, input_dataset)
     self._make_window_size_func(window_size_func)
+    variant_tensor = ged_ops.experimental_group_by_window_dataset(
+        self._input_dataset._variant_tensor,  # pylint: disable=protected-access
+        self._key_func.function.captured_inputs,
+        self._reduce_func.function.captured_inputs,
+        self._window_size_func.function.captured_inputs,
+        key_func=self._key_func.function,
+        reduce_func=self._reduce_func.function,
+        window_size_func=self._window_size_func.function,
+        **dataset_ops.flat_structure(self))
+    super(_GroupByWindowDataset, self).__init__(input_dataset, variant_tensor)
 
   def _make_window_size_func(self, window_size_func):
     """Make wrapping defun for window_size_func."""
@@ -426,17 +435,6 @@ class _GroupByWindowDataset(dataset_ops.UnaryDataset):
   def _functions(self):
     return [self._key_func, self._reduce_func, self._window_size_func]
 
-  def _as_variant_tensor(self):
-    return ged_ops.experimental_group_by_window_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        self._key_func.function.captured_inputs,
-        self._reduce_func.function.captured_inputs,
-        self._window_size_func.function.captured_inputs,
-        key_func=self._key_func.function,
-        reduce_func=self._reduce_func.function,
-        window_size_func=self._window_size_func.function,
-        **dataset_ops.flat_structure(self))
-
   def _transformation_name(self):
     return "tf.data.experimental.group_by_window()"
 
diff --git a/tensorflow/python/data/experimental/ops/interleave_ops.py b/tensorflow/python/data/experimental/ops/interleave_ops.py
index 5a719f8ed8f0176f628a89eb1b3e535064d9a72e..f4b7123df119dddd65ea07b0c3afab8ad05d202c 100644
--- a/tensorflow/python/data/experimental/ops/interleave_ops.py
+++ b/tensorflow/python/data/experimental/ops/interleave_ops.py
@@ -113,15 +113,15 @@ class _DirectedInterleaveDataset(dataset_ops.Dataset):
     self._structure = structure.convert_legacy_structure(
         data_inputs[0].output_types, output_shapes,
         data_inputs[0].output_classes)
+    super(_DirectedInterleaveDataset, self).__init__()
 
   def _as_variant_tensor(self):
     # pylint: disable=protected-access
     return (
         gen_experimental_dataset_ops.experimental_directed_interleave_dataset(
-            self._selector_input._as_variant_tensor(), [
-                data_input._as_variant_tensor()
-                for data_input in self._data_inputs
-            ], **dataset_ops.flat_structure(self)))
+            self._selector_input._variant_tensor,
+            [data_input._variant_tensor for data_input in self._data_inputs],
+            **dataset_ops.flat_structure(self)))
     # pylint: enable=protected-access
 
   def _inputs(self):
diff --git a/tensorflow/python/data/experimental/ops/matching_files.py b/tensorflow/python/data/experimental/ops/matching_files.py
index 63b99cb1e4533d165902893918d5aea2c6f02613..29beda9fc3a7705723ed47d6d0d4eba88170a56a 100644
--- a/tensorflow/python/data/experimental/ops/matching_files.py
+++ b/tensorflow/python/data/experimental/ops/matching_files.py
@@ -29,12 +29,10 @@ class MatchingFilesDataset(dataset_ops.DatasetSource):
   """A `Dataset` that list the files according to the input patterns."""
 
   def __init__(self, patterns):
-    super(MatchingFilesDataset, self).__init__()
     self._patterns = ops.convert_to_tensor(
         patterns, dtype=dtypes.string, name="patterns")
-
-  def _as_variant_tensor(self):
-    return ged_ops.experimental_matching_files_dataset(self._patterns)
+    variant_tensor = ged_ops.experimental_matching_files_dataset(self._patterns)
+    super(MatchingFilesDataset, self).__init__(variant_tensor)
 
   @property
   def _element_structure(self):
diff --git a/tensorflow/python/data/experimental/ops/optimization.py b/tensorflow/python/data/experimental/ops/optimization.py
index c6c7de9265c32245dfbc348a4e7c4fd06eda653b..984c820b17fcb2743b955f3fd3f6bbd0b1ba0860 100644
--- a/tensorflow/python/data/experimental/ops/optimization.py
+++ b/tensorflow/python/data/experimental/ops/optimization.py
@@ -105,18 +105,17 @@ class _AssertNextDataset(dataset_ops.UnaryUnchangedStructureDataset):
 
   def __init__(self, input_dataset, transformations):
     """See `assert_next()` for details."""
-    super(_AssertNextDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
     if transformations is None:
       raise ValueError("At least one transformation should be specified")
     self._transformations = ops.convert_to_tensor(
         transformations, dtype=dtypes.string, name="transformations")
-
-  def _as_variant_tensor(self):
-    return gen_experimental_dataset_ops.experimental_assert_next_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        self._transformations,
-        **dataset_ops.flat_structure(self))
+    variant_tensor = (
+        gen_experimental_dataset_ops.experimental_assert_next_dataset(
+            self._input_dataset._variant_tensor,  # pylint: disable=protected-access
+            self._transformations,
+            **dataset_ops.flat_structure(self)))
+    super(_AssertNextDataset, self).__init__(input_dataset, variant_tensor)
 
 
 class _NonSerializableDataset(dataset_ops.UnaryUnchangedStructureDataset):
@@ -124,10 +123,56 @@ class _NonSerializableDataset(dataset_ops.UnaryUnchangedStructureDataset):
 
   def __init__(self, input_dataset):
     """See `non_serializable()` for details."""
-    super(_NonSerializableDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
-
-  def _as_variant_tensor(self):
-    return gen_experimental_dataset_ops.experimental_non_serializable_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        **dataset_ops.flat_structure(self))
+    variant_tensor = (
+        gen_experimental_dataset_ops.experimental_non_serializable_dataset(
+            self._input_dataset._variant_tensor,  # pylint: disable=protected-access
+            **dataset_ops.flat_structure(self)))
+    super(_NonSerializableDataset, self).__init__(input_dataset, variant_tensor)
+
+
+class _ChooseFastestDataset(dataset_ops.DatasetV2):
+  """A `Dataset` that merges two input datasets."""
+
+  def __init__(self, datasets, num_experiments=10):
+    """Chooses the fastest of some input datasets.
+
+    Given input datasets, produces elements as quickly as the fastest of the
+    inputs. Note that this dataset assumes that input datasets have the same
+    elements in the same order, though this is not enforced besides checking
+    that the input datasets have compatible output types, output shapes, and
+    cardinality at runtime. The resulting dataset produces elements that are
+    identical to the input elements, and in the same order.
+
+    Note that the time to first iteration is longer when this dataset is used
+    due to the overhead of dynamically picking the faster dataset. Namely,
+    for the first num_experiments iterations, this dataset will pull from all
+    of its inputs simultaneously in order to determine which input is the
+    fastest. For all subsequent iterations, that input will be used.
+
+    Args:
+      datasets: A list of `Datasets` that all have the same elements in the same
+        order.
+      num_experiments: The number of experiments to run before deciding which
+        dataset is fastest. In each "experiment" iteration, the dataset will
+        call from all its inputs simultaneously, and update its knowledge of
+        which input is the fastest.
+
+    Returns:
+      A `Dataset` that has the same elements the inputs.
+    """
+    self._datasets = list(datasets)
+    self._structure = self._datasets[0]._element_structure  # pylint: disable=protected-access
+    variant_tensor = (
+        gen_experimental_dataset_ops.experimental_choose_fastest_dataset(
+            [dataset._variant_tensor for dataset in self._datasets],  # pylint: disable=protected-access
+            num_experiments=num_experiments,
+            **dataset_ops.flat_structure(self)))
+    super(_ChooseFastestDataset, self).__init__(variant_tensor)
+
+  def _inputs(self):
+    return self._datasets
+
+  @property
+  def _element_structure(self):
+    return self._datasets[0]._element_structure  # pylint: disable=protected-access
diff --git a/tensorflow/python/data/experimental/ops/optimization_options.py b/tensorflow/python/data/experimental/ops/optimization_options.py
index 11b8b86f64b204782030411cc533d57dcc348bd3..be1fb4c7cacdbfffa43fa801e2f30d9e1d16ade9 100644
--- a/tensorflow/python/data/experimental/ops/optimization_options.py
+++ b/tensorflow/python/data/experimental/ops/optimization_options.py
@@ -26,12 +26,14 @@ from tensorflow.python.util.tf_export import tf_export
 class OptimizationOptions(options.OptionsBase):
   """Represents options for dataset optimizations.
 
-  You can apply `OptimizationOptions` to a `dataset` object, as follows:
+  You can set the optimization options of a dataset through the
+  `experimental_optimization` property of `tf.data.Options`; the property is
+  an instance of `tf.data.experimental.OptimizationOptions`.
 
   ```python
   options = tf.data.Options()
-  options.optimization = tf.data.experimental.OptimizationOptions()
-  options.optimization.map_and_batch_fusion = True
+  options.experimental_optimization.map_vectorization = True
+  options.experimental_optimization.apply_default_optimizations = False
   dataset = dataset.with_options(options)
   ```
   """
@@ -45,43 +47,54 @@ class OptimizationOptions(options.OptionsBase):
   filter_fusion = options.create_option(
       name="filter_fusion",
       ty=bool,
-      docstring="Whether to fuse filter transformations.")
+      docstring=
+      "Whether to fuse filter transformations. If None, defaults to False.")
 
   hoist_random_uniform = options.create_option(
       name="hoist_random_uniform",
       ty=bool,
       docstring=
-      "Whether to hoist `tf.random_uniform()` ops out of map transformations.")
+      "Whether to hoist `tf.random_uniform()` ops out of map transformations. "
+      "If None, defaults to False.")
 
   map_and_batch_fusion = options.create_option(
       name="map_and_batch_fusion",
       ty=bool,
-      docstring="Whether to fuse map and batch transformations.")
+      docstring=
+      "Whether to fuse map and batch transformations. If None, defaults to "
+      "True.")
 
   map_and_filter_fusion = options.create_option(
       name="map_and_filter_fusion",
       ty=bool,
-      docstring="Whether to fuse map and filter transformations.")
+      docstring=
+      "Whether to fuse map and filter transformations. If None, defaults to "
+      "False.")
 
   map_fusion = options.create_option(
-      name="map_and_filter_fusion",
+      name="map_fusion",
       ty=bool,
-      docstring="Whether to fuse map transformations.")
+      docstring="Whether to fuse map transformations. If None, defaults to "
+      "False.")
 
   map_parallelization = options.create_option(
       name="map_parallelization",
       ty=bool,
-      docstring="Whether to parallelize stateless map transformations.")
+      docstring=
+      "Whether to parallelize stateless map transformations. If None, defaults "
+      "to False.")
 
   map_vectorization = options.create_option(
       name="map_vectorization",
       ty=bool,
-      docstring="Whether to vectorize map transformations.")
+      docstring=
+      "Whether to vectorize map transformations. If None, defaults to False.")
 
   noop_elimination = options.create_option(
       name="noop_elimination",
       ty=bool,
-      docstring="Whether to eliminate no-op transformations.")
+      docstring=
+      "Whether to eliminate no-op transformations. If None, defaults to True.")
 
   shuffle_and_repeat_fusion = options.create_option(
       name="shuffle_and_repeat_fusion",
@@ -91,18 +104,21 @@ class OptimizationOptions(options.OptionsBase):
 
   def _static_optimizations(self):
     """Produces the list of enabled static optimizations."""
-    result = []
-    optimizations_to_enable = [
+    result = set()
+    all_optimizations = [
         "filter_fusion",
         "hoist_random_uniform",
+        "map_and_batch_fusion",
         "map_and_filter_fusion",
-        "map_fusion",
         "map_parallelization",
+        "map_fusion",
         "map_vectorization",
+        "noop_elimination",
+        "shuffle_and_repeat_fusion",
     ]
-    for optimization in optimizations_to_enable:
+    for optimization in all_optimizations:
       if getattr(self, optimization):
-        result.append(optimization)
+        result.add(optimization)
 
     if self.apply_default_optimizations is not False:
       # The following optimizations are turned on by default, unless the
@@ -114,5 +130,5 @@ class OptimizationOptions(options.OptionsBase):
       ]
       for optimization in optimizations_to_disable:
         if getattr(self, optimization) is not False:
-          result.append(optimization)
-    return result
+          result.add(optimization)
+    return sorted(list(result))
diff --git a/tensorflow/python/data/experimental/ops/parsing_ops.py b/tensorflow/python/data/experimental/ops/parsing_ops.py
index deb20d61888adeeff078997fc8adfede604de8eb..a5ca96e89b5eb10160d59fd3e36489488d986422 100644
--- a/tensorflow/python/data/experimental/ops/parsing_ops.py
+++ b/tensorflow/python/data/experimental/ops/parsing_ops.py
@@ -31,7 +31,6 @@ class _ParseExampleDataset(dataset_ops.UnaryDataset):
   """A `Dataset` that parses `example` dataset into a `dict` dataset."""
 
   def __init__(self, input_dataset, features, num_parallel_calls):
-    super(_ParseExampleDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
     if not input_dataset._element_structure.is_compatible_with(  # pylint: disable=protected-access
         structure.TensorStructure(dtypes.string, [None])):
@@ -81,16 +80,17 @@ class _ParseExampleDataset(dataset_ops.UnaryDataset):
     self._structure = structure.convert_legacy_structure(
         output_types, output_shapes, output_classes)
 
-  def _as_variant_tensor(self):
-    return gen_experimental_dataset_ops.experimental_parse_example_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        self._num_parallel_calls,
-        self._dense_defaults,
-        self._sparse_keys,
-        self._dense_keys,
-        self._sparse_types,
-        self._dense_shapes,
-        **dataset_ops.flat_structure(self))
+    variant_tensor = (
+        gen_experimental_dataset_ops.experimental_parse_example_dataset(
+            self._input_dataset._variant_tensor,  # pylint: disable=protected-access
+            self._num_parallel_calls,
+            self._dense_defaults,
+            self._sparse_keys,
+            self._dense_keys,
+            self._sparse_types,
+            self._dense_shapes,
+            **dataset_ops.flat_structure(self)))
+    super(_ParseExampleDataset, self).__init__(input_dataset, variant_tensor)
 
   @property
   def _element_structure(self):
diff --git a/tensorflow/python/data/experimental/ops/prefetching_ops.py b/tensorflow/python/data/experimental/ops/prefetching_ops.py
index e46dfb6568d5d0c29187c233e503cef98eecece1..e21ff8e9daa46bbf8a4d730e4a58d7175e7bfabb 100644
--- a/tensorflow/python/data/experimental/ops/prefetching_ops.py
+++ b/tensorflow/python/data/experimental/ops/prefetching_ops.py
@@ -17,7 +17,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.data.experimental.ops import optimization_options
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.eager import function
@@ -72,9 +71,7 @@ def copy_to_device(target_device, source_device="/cpu:0"):
   def _apply_fn(dataset):
     options = dataset_ops.Options()
     options.experimental_autotune = False
-    opt_options = optimization_options.OptimizationOptions()
-    opt_options.apply_default_optimizations = False
-    options.experimental_optimization = opt_options
+    options.experimental_optimization.apply_default_optimizations = False
     return _CopyToDeviceDataset(
         dataset, target_device=target_device,
         source_device=source_device).with_options(options)
@@ -96,7 +93,6 @@ class _CopyToDeviceDataset(dataset_ops.UnaryUnchangedStructureDataset):
       target_device: The name of the device to which elements would be copied.
       source_device: Device where input_dataset would be placed.
     """
-    super(_CopyToDeviceDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
     self._target_device = target_device
     spec = framework_device.DeviceSpec().from_string(self._target_device)
@@ -104,6 +100,9 @@ class _CopyToDeviceDataset(dataset_ops.UnaryUnchangedStructureDataset):
     self._source_device_string = source_device
     self._source_device = ops.convert_to_tensor(source_device)
 
+    wrap_ds_variant = gen_dataset_ops.wrap_dataset_variant(
+        self._input_dataset._variant_tensor)  # pylint: disable=protected-access
+
     @function.defun()
     def _init_func():
       """Creates an iterator for the input dataset.
@@ -111,8 +110,7 @@ class _CopyToDeviceDataset(dataset_ops.UnaryUnchangedStructureDataset):
       Returns:
         A `string` tensor that encapsulates the iterator created.
       """
-      # pylint: disable=protected-access
-      ds_variant = self._input_dataset._as_variant_tensor()
+      ds_variant = gen_dataset_ops.unwrap_dataset_variant(wrap_ds_variant)
       resource = gen_dataset_ops.anonymous_iterator(
           **dataset_ops.flat_structure(self._input_dataset))
       with ops.control_dependencies(
@@ -149,7 +147,9 @@ class _CopyToDeviceDataset(dataset_ops.UnaryUnchangedStructureDataset):
 
     next_func_concrete = _next_func._get_concrete_function_internal()  # pylint: disable=protected-access
 
-    @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)])
+    @function.defun_with_attributes(
+        input_signature=[tensor_spec.TensorSpec([], dtypes.string)],
+        attributes={"experimental_ints_on_device": True})
     def _remote_next_func(string_handle):
       return functional_ops.remote_call(
           target=self._source_device,
@@ -198,6 +198,17 @@ class _CopyToDeviceDataset(dataset_ops.UnaryUnchangedStructureDataset):
     self._finalize_func.add_to_graph(g)
     # pylint: enable=protected-scope
 
+    with ops.device(self._target_device):
+      variant_tensor = gen_dataset_ops.generator_dataset(
+          self._init_captured_args,
+          self._next_captured_args,
+          self._finalize_captured_args,
+          init_func=self._init_func,
+          next_func=self._next_func,
+          finalize_func=self._finalize_func,
+          **dataset_ops.flat_structure(self._input_dataset))
+    super(_CopyToDeviceDataset, self).__init__(input_dataset, variant_tensor)
+
   # The one_shot_iterator implementation needs a 0 arg _make_dataset function
   # that thereby captures all the inputs required to create the dataset. Since
   # there are strings that are inputs to the GeneratorDataset which can't be
@@ -211,24 +222,12 @@ class _CopyToDeviceDataset(dataset_ops.UnaryUnchangedStructureDataset):
     else:
       return super(_CopyToDeviceDataset, self).make_one_shot_iterator()
 
-  def _as_variant_tensor(self):
-    with ops.device(self._target_device):
-      return gen_dataset_ops.generator_dataset(
-          self._init_captured_args,
-          self._next_captured_args,
-          self._finalize_captured_args,
-          init_func=self._init_func,
-          next_func=self._next_func,
-          finalize_func=self._finalize_func,
-          **dataset_ops.flat_structure(self._input_dataset))
-
 
 class _MapOnGpuDataset(dataset_ops.UnaryDataset):
   """A `Dataset` that maps a function over elements in its using a GPU."""
 
   def __init__(self, input_dataset, map_func, use_inter_op_parallelism=True):
     """See `Dataset.map()` for details."""
-    super(_MapOnGpuDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
     self._use_inter_op_parallelism = use_inter_op_parallelism
 
@@ -237,18 +236,16 @@ class _MapOnGpuDataset(dataset_ops.UnaryDataset):
         self._transformation_name(),
         dataset=input_dataset,
         defun_kwargs={"experimental_ints_on_device": True})
-
-  def _functions(self):
-    return [self._map_func]
-
-  def _as_variant_tensor(self):
-    input_t = self._input_dataset._as_variant_tensor()  # pylint: disable=protected-access
-    return ged_ops.experimental_map_dataset(
-        input_t,
+    variant_tensor = ged_ops.experimental_map_dataset(
+        self._input_dataset._variant_tensor,  # pylint: disable=protected-access
         self._map_func.function.captured_inputs,
         f=self._map_func.function,
         use_inter_op_parallelism=self._use_inter_op_parallelism,
         **dataset_ops.flat_structure(self))
+    super(_MapOnGpuDataset, self).__init__(input_dataset, variant_tensor)
+
+  def _functions(self):
+    return [self._map_func]
 
   @property
   def _element_structure(self):
diff --git a/tensorflow/python/data/experimental/ops/random_ops.py b/tensorflow/python/data/experimental/ops/random_ops.py
index cbdf367db6bd5b4ce27e636c08a19cd4fedda041..f96e4a84b4a21070f10c7b82ba4ca484bb613505 100644
--- a/tensorflow/python/data/experimental/ops/random_ops.py
+++ b/tensorflow/python/data/experimental/ops/random_ops.py
@@ -33,14 +33,10 @@ class RandomDatasetV2(dataset_ops.DatasetSource):
 
   def __init__(self, seed=None):
     """A `Dataset` of pseudorandom values."""
-    super(RandomDatasetV2, self).__init__()
     self._seed, self._seed2 = random_seed.get_seed(seed)
-
-  def _as_variant_tensor(self):
-    return gen_experimental_dataset_ops.experimental_random_dataset(
-        seed=self._seed,
-        seed2=self._seed2,
-        **dataset_ops.flat_structure(self))
+    variant_tensor = gen_experimental_dataset_ops.experimental_random_dataset(
+        seed=self._seed, seed2=self._seed2, **dataset_ops.flat_structure(self))
+    super(RandomDatasetV2, self).__init__(variant_tensor)
 
   @property
   def _element_structure(self):
diff --git a/tensorflow/python/data/experimental/ops/readers.py b/tensorflow/python/data/experimental/ops/readers.py
index c2d82aeb59174fb9d35c4cc2c3d850fb351d8a90..24a399ab4e4d117ced025ec0f801b774c668dd85 100644
--- a/tensorflow/python/data/experimental/ops/readers.py
+++ b/tensorflow/python/data/experimental/ops/readers.py
@@ -24,6 +24,7 @@ import functools
 import numpy as np
 
 from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.experimental.ops import error_ops
 from tensorflow.python.data.experimental.ops import interleave_ops
 from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.experimental.ops import parsing_ops
@@ -328,6 +329,7 @@ def make_csv_dataset_v2(
     sloppy=False,
     num_rows_for_inference=100,
     compression_type=None,
+    ignore_errors=False,
 ):
   """Reads CSV files into a dataset.
 
@@ -402,6 +404,10 @@ def make_csv_dataset_v2(
       the files. Defaults to 100.
     compression_type: (Optional.) A `tf.string` scalar evaluating to one of
       `""` (no compression), `"ZLIB"`, or `"GZIP"`. Defaults to no compression.
+    ignore_errors: (Optional.) If `True`, ignores errors with CSV file parsing,
+      such as malformed data or empty lines, and moves on to the next valid
+      CSV record. Otherwise, the dataset raises an error and stops processing
+      when encountering any invalid records. Defaults to `False`.
 
   Returns:
     A dataset, where each element is a (features, labels) tuple that corresponds
@@ -457,7 +463,7 @@ def make_csv_dataset_v2(
     raise ValueError("`label_name` provided must be one of the columns.")
 
   def filename_to_dataset(filename):
-    return CsvDataset(
+    dataset = CsvDataset(
         filename,
         record_defaults=column_defaults,
         field_delim=field_delim,
@@ -465,8 +471,11 @@ def make_csv_dataset_v2(
         na_value=na_value,
         select_cols=select_columns,
         header=header,
-        compression_type=compression_type,
+        compression_type=compression_type
     )
+    if ignore_errors:
+      dataset = dataset.apply(error_ops.ignore_errors())
+    return dataset
 
   def map_fn(*columns):
     """Organizes columns into a features dictionary.
@@ -528,13 +537,14 @@ def make_csv_dataset_v1(
     sloppy=False,
     num_rows_for_inference=100,
     compression_type=None,
+    ignore_errors=False,
 ):  # pylint: disable=missing-docstring
   return dataset_ops.DatasetV1Adapter(make_csv_dataset_v2(
       file_pattern, batch_size, column_names, column_defaults, label_name,
       select_columns, field_delim, use_quote_delim, na_value, header,
       num_epochs, shuffle, shuffle_buffer_size, shuffle_seed,
       prefetch_buffer_size, num_parallel_reads, sloppy, num_rows_for_inference,
-      compression_type))
+      compression_type, ignore_errors))
 make_csv_dataset_v1.__doc__ = make_csv_dataset_v2.__doc__
 
 
@@ -622,7 +632,6 @@ class CsvDatasetV2(dataset_ops.DatasetSource):
         the input data. If specified, only this subset of columns will be
         parsed. Defaults to parsing all columns.
     """
-    super(CsvDatasetV2, self).__init__()
     self._filenames = ops.convert_to_tensor(
         filenames, dtype=dtypes.string, name="filenames")
     self._compression_type = convert.optional_param_to_tensor(
@@ -655,10 +664,7 @@ class CsvDatasetV2(dataset_ops.DatasetSource):
     self._structure = structure.NestedStructure(
         tuple(structure.TensorStructure(d.dtype, [])
               for d in self._record_defaults))
-
-  def _as_variant_tensor(self):
-    # Constructs graph node for the dataset op.
-    return gen_experimental_dataset_ops.experimental_csv_dataset(
+    variant_tensor = gen_experimental_dataset_ops.experimental_csv_dataset(
         filenames=self._filenames,
         record_defaults=self._record_defaults,
         buffer_size=self._buffer_size,
@@ -668,8 +674,8 @@ class CsvDatasetV2(dataset_ops.DatasetSource):
         use_quote_delim=self._use_quote_delim,
         na_value=self._na_value,
         select_cols=self._select_cols,
-        compression_type=self._compression_type,
-    )
+        compression_type=self._compression_type)
+    super(CsvDatasetV2, self).__init__(variant_tensor)
 
   @property
   def _element_structure(self):
@@ -944,7 +950,6 @@ class SqlDatasetV2(dataset_ops.DatasetSource):
       output_types: A tuple of `tf.DType` objects representing the types of the
         columns returned by `query`.
     """
-    super(SqlDatasetV2, self).__init__()
     self._driver_name = ops.convert_to_tensor(
         driver_name, dtype=dtypes.string, name="driver_name")
     self._data_source_name = ops.convert_to_tensor(
@@ -954,11 +959,10 @@ class SqlDatasetV2(dataset_ops.DatasetSource):
     self._structure = structure.NestedStructure(
         nest.map_structure(
             lambda dtype: structure.TensorStructure(dtype, []), output_types))
-
-  def _as_variant_tensor(self):
-    return gen_experimental_dataset_ops.experimental_sql_dataset(
+    variant_tensor = gen_experimental_dataset_ops.experimental_sql_dataset(
         self._driver_name, self._data_source_name, self._query,
         nest.flatten(self.output_types), nest.flatten(self.output_shapes))
+    super(SqlDatasetV2, self).__init__(variant_tensor)
 
   @property
   def _element_structure(self):
diff --git a/tensorflow/python/data/experimental/ops/resampling.py b/tensorflow/python/data/experimental/ops/resampling.py
index 3a3040ae9a4b072ae5c1a2dc218863246b6310e6..6676085ae593bf98d7c7c3cc9bd7fdbdb1db90ff 100644
--- a/tensorflow/python/data/experimental/ops/resampling.py
+++ b/tensorflow/python/data/experimental/ops/resampling.py
@@ -168,8 +168,7 @@ def _filter_ds(dataset, acceptance_dist_ds, initial_dist_ds, class_values_ds,
 def _estimate_initial_dist_ds(
     target_dist_t, class_values_ds, dist_estimation_batch_size=32,
     smoothing_constant=10):
-  num_classes = (target_dist_t.shape[0].value or
-                 array_ops.shape(target_dist_t)[0])
+  num_classes = (target_dist_t.shape[0] or array_ops.shape(target_dist_t)[0])
   initial_examples_per_class_seen = array_ops.fill(
       [num_classes], np.int64(smoothing_constant))
 
@@ -207,7 +206,7 @@ def _estimate_data_distribution(c, num_examples_per_class_seen):
       `[num_classes]`.
     dist: The updated distribution.  Type `float32`, shape `[num_classes]`.
   """
-  num_classes = num_examples_per_class_seen.get_shape()[0].value
+  num_classes = num_examples_per_class_seen.get_shape()[0]
   # Update the class-count based on what labels are seen in batch.
   num_examples_per_class_seen = math_ops.add(
       num_examples_per_class_seen, math_ops.reduce_sum(
diff --git a/tensorflow/python/data/experimental/ops/scan_ops.py b/tensorflow/python/data/experimental/ops/scan_ops.py
index 5c77ad734348401ed666c562b36ef52ec8c5525b..7662626c3a0a5d28b07b7d0f6c77acfe92851aa7 100644
--- a/tensorflow/python/data/experimental/ops/scan_ops.py
+++ b/tensorflow/python/data/experimental/ops/scan_ops.py
@@ -33,7 +33,6 @@ class _ScanDataset(dataset_ops.UnaryDataset):
 
   def __init__(self, input_dataset, initial_state, scan_func):
     """See `scan()` for details."""
-    super(_ScanDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
 
     with ops.name_scope("initial_state"):
@@ -126,20 +125,18 @@ class _ScanDataset(dataset_ops.UnaryDataset):
 
     self._scan_func = wrapped_func
     self._scan_func.function.add_to_graph(ops.get_default_graph())
-
-  def _functions(self):
-    return [self._scan_func]
-
-  def _as_variant_tensor(self):
     # pylint: disable=protected-access
-    input_t = self._input_dataset._as_variant_tensor()
-    return gen_experimental_dataset_ops.experimental_scan_dataset(
-        input_t,
+    variant_tensor = gen_experimental_dataset_ops.experimental_scan_dataset(
+        self._input_dataset._variant_tensor,
         self._state_structure._to_tensor_list(self._initial_state),
         self._scan_func.function.captured_inputs,
         f=self._scan_func.function,
         preserve_cardinality=True,
         **dataset_ops.flat_structure(self))
+    super(_ScanDataset, self).__init__(input_dataset, variant_tensor)
+
+  def _functions(self):
+    return [self._scan_func]
 
   @property
   def _element_structure(self):
diff --git a/tensorflow/python/data/experimental/ops/shuffle_ops.py b/tensorflow/python/data/experimental/ops/shuffle_ops.py
index d12328a7145992880aedd939d7a02a8a12c61d4c..86a615d52400afca84b4c2537044f2adb35b574d 100644
--- a/tensorflow/python/data/experimental/ops/shuffle_ops.py
+++ b/tensorflow/python/data/experimental/ops/shuffle_ops.py
@@ -30,7 +30,6 @@ class _ShuffleAndRepeatDataset(dataset_ops.UnaryUnchangedStructureDataset):
   """A `Dataset` that fuses `shuffle` and `repeat`."""
 
   def __init__(self, input_dataset, buffer_size, count=None, seed=None):
-    super(_ShuffleAndRepeatDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
     self._buffer_size = ops.convert_to_tensor(
         buffer_size, dtype=dtypes.int64, name="buffer_size")
@@ -40,18 +39,15 @@ class _ShuffleAndRepeatDataset(dataset_ops.UnaryUnchangedStructureDataset):
       self._count = ops.convert_to_tensor(
           count, dtype=dtypes.int64, name="count")
     self._seed, self._seed2 = random_seed.get_seed(seed)
-
-  def _as_variant_tensor(self):
-    # pylint: disable=protected-access
-    input_resource = self._input_dataset._as_variant_tensor()
-    return gen_dataset_ops.shuffle_and_repeat_dataset(
-        input_resource,
+    variant_tensor = gen_dataset_ops.shuffle_and_repeat_dataset(
+        self._input_dataset._variant_tensor,  # pylint: disable=protected-access
         buffer_size=self._buffer_size,
         count=self._count,
         seed=self._seed,
         seed2=self._seed2,
         **dataset_ops.flat_structure(self))
-    # pylint: enable=protected-access
+    super(_ShuffleAndRepeatDataset, self).__init__(input_dataset,
+                                                   variant_tensor)
 
 
 @tf_export("data.experimental.shuffle_and_repeat")
diff --git a/tensorflow/python/data/experimental/ops/sleep.py b/tensorflow/python/data/experimental/ops/sleep.py
index 2da832395b2e665168c1cd9cd7f52fb13e50c830..b66edc7a194a2a7fd99eafad57d5be4f136f3ed1 100644
--- a/tensorflow/python/data/experimental/ops/sleep.py
+++ b/tensorflow/python/data/experimental/ops/sleep.py
@@ -25,15 +25,13 @@ class _SleepDataset(dataset_ops.UnaryUnchangedStructureDataset):
   """A `Dataset` that sleeps before producing each upstream element."""
 
   def __init__(self, input_dataset, sleep_microseconds):
-    super(_SleepDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
     self._sleep_microseconds = sleep_microseconds
-
-  def _as_variant_tensor(self):
-    return gen_experimental_dataset_ops.experimental_sleep_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+    variant_tensor = gen_experimental_dataset_ops.experimental_sleep_dataset(
+        self._input_dataset._variant_tensor,  # pylint: disable=protected-access
         self._sleep_microseconds,
         **dataset_ops.flat_structure(self))
+    super(_SleepDataset, self).__init__(input_dataset, variant_tensor)
 
 
 def sleep(sleep_microseconds):
diff --git a/tensorflow/python/data/experimental/ops/stats_aggregator.py b/tensorflow/python/data/experimental/ops/stats_aggregator.py
index d5fcc033ab7df34369e0680275df744c431ed069..0c6e68648115d94566598ac838628c77cd20865c 100644
--- a/tensorflow/python/data/experimental/ops/stats_aggregator.py
+++ b/tensorflow/python/data/experimental/ops/stats_aggregator.py
@@ -44,8 +44,8 @@ class StatsAggregator(object):
   dataset = ...
 
   # Apply `StatsOptions` to associate `dataset` with `aggregator`.
-  options = dataset_ops.Options()
-  options.experimental_stats = tf.data.experimental.StatsOptions(aggregator)
+  options = tf.data.Options()
+  options.experimental_stats.aggregator = aggregator
   dataset = dataset.with_options(options)
   ```
 
diff --git a/tensorflow/python/data/experimental/ops/stats_ops.py b/tensorflow/python/data/experimental/ops/stats_ops.py
index 15a9d24546e950543cc3274dbead26178620b5ed..13dcb92fa0643c0f89110307f2c13cb6e8425a56 100644
--- a/tensorflow/python/data/experimental/ops/stats_ops.py
+++ b/tensorflow/python/data/experimental/ops/stats_ops.py
@@ -102,13 +102,11 @@ class _StatsDataset(dataset_ops.UnaryUnchangedStructureDataset):
   """A `Dataset` that acts as an identity, and also records statistics."""
 
   def __init__(self, input_dataset, op_function, tag):
-    super(_StatsDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
     self._op_function = op_function
     self._tag = ops.convert_to_tensor(tag, dtype=dtypes.string)
-
-  def _as_variant_tensor(self):
-    return self._op_function(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+    variant_tensor = self._op_function(
+        self._input_dataset._variant_tensor,  # pylint: disable=protected-access
         self._tag,
         **dataset_ops.flat_structure(self))
+    super(_StatsDataset, self).__init__(input_dataset, variant_tensor)
diff --git a/tensorflow/python/data/experimental/ops/stats_options.py b/tensorflow/python/data/experimental/ops/stats_options.py
index 6e884aa08ae9173df0fda0e81e176644cd342bfa..c4c4b1cea0354ed35f60f56f3fdf73f9664d88b2 100644
--- a/tensorflow/python/data/experimental/ops/stats_options.py
+++ b/tensorflow/python/data/experimental/ops/stats_options.py
@@ -28,27 +28,19 @@ from tensorflow.python.util.tf_export import tf_export
 class StatsOptions(options.OptionsBase):
   """Represents options for collecting dataset stats using `StatsAggregator`.
 
-  To apply `StatsOptions` with a `tf.data.Dataset` object, use the following
-  pattern:
+  You can set the stats options of a dataset through the `experimental_stats`
+  property of `tf.data.Options`; the property is an instance of
+  `tf.data.experimental.StatsOptions`. For example, to collect latency stats
+  on all dataset edges, use the following pattern:
 
   ```python
   aggregator = tf.data.experimental.StatsAggregator()
 
   options = tf.data.Options()
-  options.experimental_stats = tf.data.experimental.StatsOptions()
   options.experimental_stats.aggregator = aggregator
+  options.experimental_stats.latency_all_edges = True
   dataset = dataset.with_options(options)
   ```
-
-  Note: a `StatsAggregator` object can be attached either duing construction or
-  can be provided later like in above example.
-
-  ```python
-  aggretator = tf.data.experimental.StatsAggregator()
-  # attach aggregator during construction
-  options.experimental_stats = tf.data.experimental.StatsOptions(aggregator)
-  .....
-  ```
   """
 
   aggregator = options.create_option(
@@ -62,18 +54,16 @@ class StatsOptions(options.OptionsBase):
       ty=str,
       docstring=
       "Prefix to prepend all statistics recorded for the input `dataset` with.",
-      default="")
+      default_factory=lambda: "")
 
   counter_prefix = options.create_option(
       name="counter_prefix",
       ty=str,
-      docstring=
-      "Prefix for the statistics recorded as counter.",
-      default="")
+      docstring="Prefix for the statistics recorded as counter.",
+      default_factory=lambda: "")
 
   latency_all_edges = options.create_option(
       name="latency_all_edges",
       ty=bool,
       docstring=
-      "Whether to add latency measurements on all edges.",
-      default=True)
+      "Whether to add latency measurements on all edges. Defaults to False.")
diff --git a/tensorflow/python/data/experimental/ops/take_while_ops.py b/tensorflow/python/data/experimental/ops/take_while_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f27a84edcc7306f5b8c7ec6866315c2490118a6
--- /dev/null
+++ b/tensorflow/python/data/experimental/ops/take_while_ops.py
@@ -0,0 +1,72 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""take-while dataset transformation."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import structure as structure_lib
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import gen_experimental_dataset_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+class _TakeWhileDataset(dataset_ops.UnaryUnchangedStructureDataset):
+  """A dataset that stops iteration when `predicate` returns false."""
+
+  def __init__(self, input_dataset, predicate):
+    """See `take_while()` for details."""
+
+    self._input_dataset = input_dataset
+    wrapped_func = dataset_ops.StructuredFunctionWrapper(
+        predicate,
+        "tf.data.experimental.take_while()",
+        dataset=self._input_dataset)
+
+    if not wrapped_func.output_structure.is_compatible_with(
+        structure_lib.TensorStructure(dtypes.bool, [])):
+      raise ValueError("`predicate` must return a scalar boolean tensor.")
+
+    self._predicate = wrapped_func
+    var_tensor = gen_experimental_dataset_ops.experimental_take_while_dataset(
+        self._input_dataset._variant_tensor,  # pylint: disable=protected-access
+        other_arguments=self._predicate.function.captured_inputs,
+        predicate=self._predicate.function,
+        **dataset_ops.flat_structure(self))
+    super(_TakeWhileDataset, self).__init__(input_dataset, var_tensor)
+
+  def _functions(self):
+    return [self._predicate]
+
+
+@tf_export("data.experimental.take_while")
+def take_while(predicate):
+  """A transformation that stops dataset iteration based on a `predicate`.
+
+  Args:
+    predicate: A function that maps a nested structure of tensors (having shapes
+      and types defined by `self.output_shapes` and `self.output_types`) to a
+      scalar `tf.bool` tensor.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    `tf.data.Dataset.apply`.
+  """
+
+  def _apply_fn(dataset):
+    return _TakeWhileDataset(dataset, predicate)
+
+  return _apply_fn
diff --git a/tensorflow/python/data/experimental/ops/threading_options.py b/tensorflow/python/data/experimental/ops/threading_options.py
index dbf662186f818a24a3b19ea678f87351ab45ed6e..d713b9ae0753d0c800a7212eccf99684218c193d 100644
--- a/tensorflow/python/data/experimental/ops/threading_options.py
+++ b/tensorflow/python/data/experimental/ops/threading_options.py
@@ -26,11 +26,12 @@ from tensorflow.python.util.tf_export import tf_export
 class ThreadingOptions(options.OptionsBase):
   """Represents options for dataset threading.
 
-  To apply `ThreadingOptions` to a `dataset` object, use the following pattern:
+  You can set the threading options of a dataset through the
+  `experimental_threading` property of `tf.data.Options`; the property is
+  an instance of `tf.data.experimental.ThreadingOptions`.
 
   ```python
   options = tf.data.Options()
-  options.experimental_threading = tf.data.experimental.ThreadingOptions()
   options.experimental_threading.private_threadpool_size = 10
   dataset = dataset.with_options(options)
   ```
@@ -46,5 +47,4 @@ class ThreadingOptions(options.OptionsBase):
       name="private_threadpool_size",
       ty=int,
       docstring=
-      "If set, the dataset will use a private threadpool of the given size.",
-      default=None)
+      "If set, the dataset will use a private threadpool of the given size.")
diff --git a/tensorflow/python/data/experimental/ops/threadpool.py b/tensorflow/python/data/experimental/ops/threadpool.py
index 69e8829d687fb54767bca1716c259efa150b4887..bc2c726822adf927c8d2d0255c4ce009b9f97207 100644
--- a/tensorflow/python/data/experimental/ops/threadpool.py
+++ b/tensorflow/python/data/experimental/ops/threadpool.py
@@ -64,15 +64,13 @@ class _ThreadPoolDataset(dataset_ops.UnaryUnchangedStructureDataset):
   """A `Dataset` that acts as an identity, and sets a custom threadpool."""
 
   def __init__(self, input_dataset, thread_pool):
-    super(_ThreadPoolDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
     self._thread_pool = thread_pool
-
-  def _as_variant_tensor(self):
-    return ged_ops.experimental_thread_pool_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+    variant_tensor = ged_ops.experimental_thread_pool_dataset(
+        self._input_dataset._variant_tensor,  # pylint: disable=protected-access
         self._thread_pool._resource,  # pylint: disable=protected-access
         **dataset_ops.flat_structure(self))
+    super(_ThreadPoolDataset, self).__init__(input_dataset, variant_tensor)
 
 
 # TODO(b/73383364): Properly export in the `tf.data.experimental` API when
diff --git a/tensorflow/python/data/experimental/ops/unique.py b/tensorflow/python/data/experimental/ops/unique.py
index 55ed98d8542187b1bd353e2ca581ef2fd2180875..dd26cfa4ee9fe19153a99fb3c732546d777ba12f 100644
--- a/tensorflow/python/data/experimental/ops/unique.py
+++ b/tensorflow/python/data/experimental/ops/unique.py
@@ -53,15 +53,13 @@ class _UniqueDataset(dataset_ops.UnaryUnchangedStructureDataset):
 
   def __init__(self, input_dataset):
     """See `unique()` for details."""
-    super(_UniqueDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
     if input_dataset.output_types not in (dtypes.int32, dtypes.int64,
                                           dtypes.string):
       raise TypeError(
           "`tf.data.experimental.unique()` only supports inputs with a single "
           "`tf.int32`, `tf.int64`, or `tf.string` component.")
-
-  def _as_variant_tensor(self):
-    return gen_experimental_dataset_ops.experimental_unique_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+    variant_tensor = gen_experimental_dataset_ops.experimental_unique_dataset(
+        self._input_dataset._variant_tensor,  # pylint: disable=protected-access
         **dataset_ops.flat_structure(self))
+    super(_UniqueDataset, self).__init__(input_dataset, variant_tensor)
diff --git a/tensorflow/python/data/experimental/ops/writers.py b/tensorflow/python/data/experimental/ops/writers.py
index aef6da51409dbe13f59408b650fc5947f088d89d..49eae14652377ed652e5bb71b57f38244ef25749 100644
--- a/tensorflow/python/data/experimental/ops/writers.py
+++ b/tensorflow/python/data/experimental/ops/writers.py
@@ -57,4 +57,4 @@ class TFRecordWriter(object):
           "produces shape {0} and types {1}".format(dataset.output_shapes,
                                                     dataset.output_types))
     return gen_experimental_dataset_ops.experimental_dataset_to_tf_record(
-        dataset._as_variant_tensor(), self._filename, self._compression_type)  # pylint: disable=protected-access
+        dataset._variant_tensor, self._filename, self._compression_type)  # pylint: disable=protected-access
diff --git a/tensorflow/python/data/kernel_tests/BUILD b/tensorflow/python/data/kernel_tests/BUILD
index 3390100bed5c6dbe937d26f008d794c0fbf3a753..af1c6ab97cb9791110c874e17809191f065d2773 100644
--- a/tensorflow/python/data/kernel_tests/BUILD
+++ b/tensorflow/python/data/kernel_tests/BUILD
@@ -108,8 +108,26 @@ tf_py_test(
     size = "small",
     srcs = ["filter_test.py"],
     additional_deps = [
+        ":filter_test_base",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+tf_py_test(
+    name = "filter_with_legacy_function_test",
+    size = "small",
+    srcs = ["filter_with_legacy_function_test.py"],
+    additional_deps = [
+        ":filter_test_base",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_library(
+    name = "filter_test_base",
+    srcs = ["filter_test_base.py"],
+    deps = [
         ":test_base",
-        "//third_party/py/numpy",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
@@ -118,6 +136,7 @@ tf_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -272,7 +291,7 @@ tf_py_test(
         ":test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/eager:context",
-        "//tensorflow/python/training/checkpointable:util",
+        "//tensorflow/python/training/tracking:util",
         "//tensorflow/python:checkpoint_management",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
@@ -287,6 +306,7 @@ tf_py_test(
     size = "small",
     srcs = ["iterator_cluster_test.py"],
     additional_deps = [
+        "//tensorflow/contrib/lookup:lookup_py",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/ops:iterator_ops",
@@ -324,7 +344,7 @@ cuda_py_test(
         "//tensorflow/python/data/ops:iterator_ops",
         "//tensorflow/python/data/util:structure",
         "//tensorflow/python/eager:context",
-        "//tensorflow/python/training/checkpointable:util",
+        "//tensorflow/python/training/tracking:util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
@@ -405,6 +425,7 @@ cuda_py_test(
     srcs = ["multi_device_iterator_test.py"],
     additional_deps = [
         ":test_base",
+        "@absl_py//absl/testing:parameterized",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/ops:multi_device_iterator_ops",
@@ -444,6 +465,19 @@ cuda_py_test(
     ],
 )
 
+tf_py_test(
+    name = "options_test",
+    size = "small",
+    srcs = ["options_test.py"],
+    additional_deps = [
+        ":test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/experimental/ops:optimization_options",
+        "//tensorflow/python/data/experimental/ops:threading_options",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
 tf_py_test(
     name = "padded_batch_test",
     size = "small",
diff --git a/tensorflow/python/data/kernel_tests/batch_test.py b/tensorflow/python/data/kernel_tests/batch_test.py
index 5b035e59173e6ee52be8ec0aab21c761093d07ce..2551250346745b6030d11e4af12ffd8e30ef6021 100644
--- a/tensorflow/python/data/kernel_tests/batch_test.py
+++ b/tensorflow/python/data/kernel_tests/batch_test.py
@@ -91,9 +91,9 @@ class BatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       result = self.evaluate(get_next())
 
   def testBatchDatasetInvalidBatchSize(self):
-    dataset = (dataset_ops.Dataset.range(10).batch(0))
-    self.assertDatasetProduces(
-        dataset, expected_error=(errors.InvalidArgumentError, ''))
+    with self.assertRaises(errors.InvalidArgumentError):
+      dataset = (dataset_ops.Dataset.range(10).batch(0))
+      self.evaluate(dataset._variant_tensor)
 
   def testBatchSparse(self):
 
diff --git a/tensorflow/python/data/kernel_tests/cache_test.py b/tensorflow/python/data/kernel_tests/cache_test.py
index b561cd58baf732f557d518e7eb237ab00512acc1..4806101d8c7e3dcaaf3d698727d863b3bcccc3ed 100644
--- a/tensorflow/python/data/kernel_tests/cache_test.py
+++ b/tensorflow/python/data/kernel_tests/cache_test.py
@@ -139,8 +139,8 @@ class FileCacheTest(test_base.DatasetTestBase):
       self.evaluate(get_next1())
 
     # Re-initialize
-    get_next1 = self.getNext(cache_dataset1)
-    get_next2 = self.getNext(cache_dataset2)
+    get_next1 = self.getNext(cache_dataset1, requires_initialization=True)
+    get_next2 = self.getNext(cache_dataset2, requires_initialization=True)
 
     # Reading concurrently should succeed.
     elements_itr1 = []
diff --git a/tensorflow/python/data/kernel_tests/dataset_test.py b/tensorflow/python/data/kernel_tests/dataset_test.py
index 2952c08be02b76fb221ee0f31f4b9fc34a14d659..1e764b3e25205d4fb369e8fb76f8908a76ed4c02 100644
--- a/tensorflow/python/data/kernel_tests/dataset_test.py
+++ b/tensorflow/python/data/kernel_tests/dataset_test.py
@@ -30,10 +30,12 @@ from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import structure
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging as logging
 
 
 @test_util.run_all_in_graph_and_eager_modes
@@ -90,15 +92,16 @@ class DatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       ("TFRecord", lambda: readers.TFRecordDataset(""), 1),
   )
   def testDatasetSimpleSourceInputs(self, dataset_fn, num_inputs=0):
-    self.assertEqual(num_inputs, len(dataset_fn()._inputs()))
+    self.assertLen(dataset_fn()._inputs(), num_inputs)
 
+  @test_util.run_v1_only("deprecated API, no eager or V2 test coverage")
   def testDatasetComplexSourceInputs(self):
     dataset_fn = dataset_ops.Dataset.from_sparse_tensor_slices(
         sparse_tensor.SparseTensor(
             indices=np.array([[0, 0], [1, 0], [2, 0]]),
             values=np.array([0, 0, 0]),
             dense_shape=np.array([3, 1])))
-    self.assertEqual(0, len(dataset_fn._inputs()))
+    self.assertEmpty(dataset_fn._inputs())
 
   @parameterized.named_parameters(
       ("Batch",
@@ -207,54 +210,6 @@ class DatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     self.assertEqual(2, inputs.count(ds2))
     self.assertEqual(1, inputs.count(ds3))
 
-  def testOptionsDefault(self):
-    ds = dataset_ops.Dataset.range(0)
-    self.assertEqual(dataset_ops.Options(), ds.options())
-
-  def testOptionsOnce(self):
-    options = dataset_ops.Options()
-    ds = dataset_ops.Dataset.range(0).with_options(options).cache()
-    self.assertEqual(options, ds.options())
-
-  def testOptionsTwiceSame(self):
-    options = dataset_ops.Options()
-    options.experimental_autotune = True
-    ds = dataset_ops.Dataset.range(0).with_options(options).with_options(
-        options)
-    self.assertEqual(options, ds.options())
-
-  def testOptionsTwiceDifferent(self):
-    options1 = dataset_ops.Options()
-    options1.experimental_autotune = True
-    options2 = dataset_ops.Options()
-    options2.experimental_deterministic = False
-    ds = dataset_ops.Dataset.range(0).with_options(options1).with_options(
-        options2)
-    self.assertTrue(ds.options().experimental_autotune)
-    # Explicitly check that flag is False since assertFalse allows None
-    self.assertIs(ds.options().experimental_deterministic, False)
-
-  def testOptionsTwiceDifferentError(self):
-    options1 = dataset_ops.Options()
-    options1.experimental_autotune = True
-    options2 = dataset_ops.Options()
-    options2.experimental_autotune = False
-    with self.assertRaisesRegexp(ValueError,
-                                 "Cannot merge incompatible values"):
-      dataset_ops.Dataset.range(0).with_options(options1).with_options(options2)
-
-  def testOptionsMergeOptionsFromMultipleInputs(self):
-    options1 = dataset_ops.Options()
-    options1.experimental_autotune = True
-    options2 = dataset_ops.Options()
-    options2.experimental_deterministic = True
-    ds = dataset_ops.Dataset.zip(
-        (dataset_ops.Dataset.range(0).with_options(options1),
-         dataset_ops.Dataset.range(0).with_options(options2)))
-    self.assertTrue(ds.options().experimental_autotune)
-    self.assertTrue(ds.options().experimental_deterministic)
-
-  # TODO(b/119882922): use-after-free bug in eager mode.
   # pylint: disable=g-long-lambda
   @parameterized.named_parameters(
       ("Tensor", lambda: constant_op.constant(37.0),
@@ -278,8 +233,7 @@ class DatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
        optional_ops.OptionalStructure(
            structure.TensorStructure(dtypes.float32, []))),
   )
-  def testSkipEagerDatasetStructure(self, tf_value_fn,
-                                    expected_element_structure):
+  def testDatasetStructure(self, tf_value_fn, expected_element_structure):
     dataset = dataset_ops.Dataset.from_tensors(0).map(lambda _: tf_value_fn())
     dataset_structure = structure.Structure.from_value(dataset)
     self.assertIsInstance(dataset_structure, dataset_ops.DatasetStructure)
@@ -313,5 +267,30 @@ class DatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
           round_trip_dataset, [self.evaluate(tf_value_fn())],
           requires_initialization=True)
 
+  @test_util.run_v1_only("graph mode specific, no eager or V2 test coverage")
+  def testSkipEagerSameGraphErrorOneShot(self):
+    dataset = dataset_ops.Dataset.range(10)
+    with ops.Graph().as_default():
+      with self.assertRaisesRegexp(ValueError, "must be from the same graph"):
+        dataset = dataset.batch(2)
+
+  @test_util.run_v1_only("graph mode specific, no eager or V2 test coverage")
+  def testSkipEagerSameGraphErrorOneShotSimple(self):
+    dataset = dataset_ops.Dataset.range(10)
+    with ops.Graph().as_default():
+      with test.mock.patch.object(logging, "warning") as mock_log:
+        _ = dataset_ops.make_one_shot_iterator(dataset)
+        self.assertRegexpMatches(
+            str(mock_log.call_args), "Please ensure that all datasets in the "
+            "pipeline are created in the same graph as the iterator.")
+
+  @test_util.run_v1_only("graph mode specific, no eager or V2 test coverage")
+  def testSkipEagerSameGraphErrorInitializable(self):
+    dataset = dataset_ops.Dataset.range(10)
+    with ops.Graph().as_default():
+      with self.assertRaisesRegexp(ValueError, "must be from the same graph"):
+        dataset = dataset.batch(2)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/kernel_tests/filter_test.py b/tensorflow/python/data/kernel_tests/filter_test.py
index afaf954cbc6a96984239cb22665bbe1f17d6d40d..b81e9a892dfbb0baded27cbfb36ec94a0101d78f 100644
--- a/tensorflow/python/data/kernel_tests/filter_test.py
+++ b/tensorflow/python/data/kernel_tests/filter_test.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -17,111 +17,16 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
-
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.data.kernel_tests import filter_test_base
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops import functional_ops
-from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class FilterTest(test_base.DatasetTestBase):
-
-  def testFilterDataset(self):
-    components = (
-        np.arange(7, dtype=np.int64),
-        np.array([[1, 2, 3]], dtype=np.int64) * np.arange(
-            7, dtype=np.int64)[:, np.newaxis],
-        np.array(37.0, dtype=np.float64) * np.arange(7)
-    )
-    def _map_fn(x, y, z):
-      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
-
-    def do_test(count, modulus):
-      dataset = dataset_ops.Dataset.from_tensor_slices(components).map(
-          _map_fn).repeat(count).filter(
-              lambda x, _y, _z: math_ops.equal(math_ops.mod(x, modulus), 0))
-      self.assertEqual([c.shape[1:] for c in components],
-                       [shape for shape in dataset.output_shapes])
-      get_next = self.getNext(dataset)
-      for _ in range(count):
-        for i in [x for x in range(7) if x**2 % modulus == 0]:
-          result = self.evaluate(get_next())
-          for component, result_component in zip(components, result):
-            self.assertAllEqual(component[i]**2, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        self.evaluate(get_next())
-
-    do_test(14, 2)
-    do_test(4, 18)
-
-    # Test an empty dataset.
-    do_test(0, 1)
-
-  def testFilterRange(self):
-    dataset = dataset_ops.Dataset.range(4).filter(
-        lambda x: math_ops.not_equal(math_ops.mod(x, 3), 2))
-    self.assertDatasetProduces(dataset, expected_output=[0, 1, 3])
-
-  def testFilterDict(self):
-    dataset = dataset_ops.Dataset.range(10).map(
-        lambda x: {"foo": x * 2, "bar": x ** 2}).filter(
-            lambda d: math_ops.equal(d["bar"] % 2, 0)).map(
-                lambda d: d["foo"] + d["bar"])
-    self.assertDatasetProduces(
-        dataset,
-        expected_output=[(i * 2 + i**2) for i in range(10) if not (i**2) % 2])
-
-  def testUseStepContainerInFilter(self):
-    input_data = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int64)
-
-    # Define a predicate that returns true for the first element of
-    # the sequence and not the second, and uses `tf.map_fn()`.
-    def _predicate(xs):
-      squared_xs = functional_ops.map_fn(lambda x: x * x, xs)
-      summed = math_ops.reduce_sum(squared_xs)
-      return math_ops.equal(summed, 1 + 4 + 9)
-
-    dataset = dataset_ops.Dataset.from_tensor_slices(
-        [[1, 2, 3], [4, 5, 6]]).filter(_predicate)
-    self.assertDatasetProduces(dataset, expected_output=[input_data[0]])
-
-  def testSparse(self):
-
-    def _map_fn(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=np.array([[0, 0]]),
-          values=(i * np.array([1])),
-          dense_shape=np.array([1, 1])), i
-
-    def _filter_fn(_, i):
-      return math_ops.equal(i % 2, 0)
-
-    dataset = dataset_ops.Dataset.range(10).map(_map_fn).filter(_filter_fn).map(
-        lambda x, i: x)
-    self.assertDatasetProduces(
-        dataset, expected_output=[_map_fn(i * 2)[0] for i in range(5)])
-
-  def testShortCircuit(self):
-    dataset = dataset_ops.Dataset.zip(
-        (dataset_ops.Dataset.range(10),
-         dataset_ops.Dataset.from_tensors(True).repeat(None)
-        )).filter(lambda x, y: y)
-    self.assertDatasetProduces(
-        dataset, expected_output=[(i, True) for i in range(10)])
+class FilterTest(filter_test_base.FilterTestBase):
 
-  def testParallelFilters(self):
-    dataset = dataset_ops.Dataset.range(10).filter(
-        lambda x: math_ops.equal(x % 2, 0))
-    next_elements = [self.getNext(dataset) for _ in range(10)]
-    self.assertEqual([0 for _ in range(10)],
-                     self.evaluate(
-                         [next_element() for next_element in next_elements]))
+  def apply_filter(self, input_dataset, predicate):
+    return input_dataset.filter(predicate)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/kernel_tests/filter_test_base.py b/tensorflow/python/data/kernel_tests/filter_test_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c53fa39906d4efee66a8f71762e780550d7cda9
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/filter_test_base.py
@@ -0,0 +1,134 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.filter()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import map_fn
+from tensorflow.python.ops import math_ops
+
+
+class FilterTestBase(test_base.DatasetTestBase):
+  """Base class for FilterDataset tests."""
+
+  def apply_filter(self, input_dataset, predicate):
+    raise NotImplementedError("FilterTestBase._apply_filter")
+
+  def testFilterDataset(self):
+    components = (
+        np.arange(7, dtype=np.int64),
+        np.array([[1, 2, 3]], dtype=np.int64) * np.arange(
+            7, dtype=np.int64)[:, np.newaxis],
+        np.array(37.0, dtype=np.float64) * np.arange(7)
+    )
+    def _map_fn(x, y, z):
+      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
+
+    def do_test(count, modulus):  # pylint: disable=missing-docstring
+      dataset = dataset_ops.Dataset.from_tensor_slices(components).map(
+          _map_fn).repeat(count)
+      # pylint: disable=g-long-lambda
+      dataset = self.apply_filter(
+          dataset, lambda x, _y, _z: math_ops.equal(
+              math_ops.mod(x, modulus), 0))
+      # pylint: enable=g-long-lambda
+      self.assertEqual([c.shape[1:] for c in components],
+                       [shape for shape in dataset.output_shapes])
+      get_next = self.getNext(dataset)
+      for _ in range(count):
+        for i in [x for x in range(7) if x**2 % modulus == 0]:
+          result = self.evaluate(get_next())
+          for component, result_component in zip(components, result):
+            self.assertAllEqual(component[i]**2, result_component)
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(get_next())
+
+    do_test(14, 2)
+    do_test(4, 18)
+
+    # Test an empty dataset.
+    do_test(0, 1)
+
+  def testFilterRange(self):
+    dataset = dataset_ops.Dataset.range(4)
+    dataset = self.apply_filter(
+        dataset, lambda x: math_ops.not_equal(math_ops.mod(x, 3), 2))
+    self.assertDatasetProduces(dataset, expected_output=[0, 1, 3])
+
+  def testFilterDict(self):
+    dataset = dataset_ops.Dataset.range(10).map(
+        lambda x: {"foo": x * 2, "bar": x ** 2})
+    dataset = self.apply_filter(
+        dataset, lambda d: math_ops.equal(d["bar"] % 2, 0))
+    dataset = dataset.map(lambda d: d["foo"] + d["bar"])
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[(i * 2 + i**2) for i in range(10) if not (i**2) % 2])
+
+  def testUseStepContainerInFilter(self):
+    input_data = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int64)
+
+    # Define a predicate that returns true for the first element of
+    # the sequence and not the second, and uses `tf.map_fn()`.
+    def _predicate(xs):
+      squared_xs = map_fn.map_fn(lambda x: x * x, xs)
+      summed = math_ops.reduce_sum(squared_xs)
+      return math_ops.equal(summed, 1 + 4 + 9)
+
+    dataset = dataset_ops.Dataset.from_tensor_slices(
+        [[1, 2, 3], [4, 5, 6]])
+    dataset = self.apply_filter(dataset, _predicate)
+    self.assertDatasetProduces(dataset, expected_output=[input_data[0]])
+
+  def testSparse(self):
+
+    def _map_fn(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=np.array([[0, 0]]),
+          values=(i * np.array([1])),
+          dense_shape=np.array([1, 1])), i
+
+    def _filter_fn(_, i):
+      return math_ops.equal(i % 2, 0)
+
+    dataset = dataset_ops.Dataset.range(10).map(_map_fn)
+    dataset = self.apply_filter(dataset, _filter_fn)
+    dataset = dataset.map(lambda x, i: x)
+    self.assertDatasetProduces(
+        dataset, expected_output=[_map_fn(i * 2)[0] for i in range(5)])
+
+  def testShortCircuit(self):
+    dataset = dataset_ops.Dataset.zip(
+        (dataset_ops.Dataset.range(10),
+         dataset_ops.Dataset.from_tensors(True).repeat(None)
+        ))
+    dataset = self.apply_filter(dataset, lambda x, y: y)
+    self.assertDatasetProduces(
+        dataset, expected_output=[(i, True) for i in range(10)])
+
+  def testParallelFilters(self):
+    dataset = dataset_ops.Dataset.range(10)
+    dataset = self.apply_filter(dataset, lambda x: math_ops.equal(x % 2, 0))
+    next_elements = [self.getNext(dataset) for _ in range(10)]
+    self.assertEqual([0 for _ in range(10)],
+                     self.evaluate(
+                         [next_element() for next_element in next_elements]))
diff --git a/tensorflow/python/data/kernel_tests/filter_with_legacy_function_test.py b/tensorflow/python/data/kernel_tests/filter_with_legacy_function_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a283fb3302318ca526c0d43f8b025749b52c2fc
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/filter_with_legacy_function_test.py
@@ -0,0 +1,33 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.filter_with_legacy_function()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.kernel_tests import filter_test_base
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+@test_util.run_v1_only
+class FilterWithLegacyFunctionTest(filter_test_base.FilterTestBase):
+
+  def apply_filter(self, input_dataset, predicate):
+    return input_dataset.filter_with_legacy_function(predicate)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/flat_map_test.py b/tensorflow/python/data/kernel_tests/flat_map_test.py
index ff52821b10740196286c30d19b0cda3b4b44bae5..69b5fd0d77fe743c02f441f1d65ae0bc9d731dae 100644
--- a/tensorflow/python/data/kernel_tests/flat_map_test.py
+++ b/tensorflow/python/data/kernel_tests/flat_map_test.py
@@ -65,11 +65,11 @@ class FlatMapTest(test_base.DatasetTestBase):
     repeats = [[1, 2], [3, 4], [5, 0], [1, 7]]
     components = np.array(repeats, dtype=np.int64)
     iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components)
-        .flat_map(lambda x: dataset_ops.Dataset.from_tensor_slices(x)
-                  .flat_map(lambda y: dataset_ops.Dataset.from_tensors(y)
-                            .repeat(y))).make_initializable_iterator(
-                                shared_name="shared_flat_map_iterator"))
+        dataset_ops.make_initializable_iterator(
+            dataset_ops.Dataset.from_tensor_slices(components).flat_map(
+                lambda x: dataset_ops.Dataset.from_tensor_slices(x).flat_map(
+                    lambda y: dataset_ops.Dataset.from_tensors(y).repeat(y))),
+            shared_name="shared_flat_map_iterator"))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
diff --git a/tensorflow/python/data/kernel_tests/from_generator_test.py b/tensorflow/python/data/kernel_tests/from_generator_test.py
index a6625534e7a1a0efc5e39dc53ef57666f601c05b..11919bdaeee3d8b27e0c7644c485be4809213934 100644
--- a/tensorflow/python/data/kernel_tests/from_generator_test.py
+++ b/tensorflow/python/data/kernel_tests/from_generator_test.py
@@ -21,7 +21,6 @@ import threading
 
 import numpy as np
 
-from tensorflow.python.client import session
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
@@ -32,43 +31,27 @@ from tensorflow.python.ops import script_ops
 from tensorflow.python.platform import test
 
 
-class FromGeneratorTest(test_base.DatasetTestBase):
+@test_util.run_all_in_graph_and_eager_modes
+class DatasetConstructorTest(test_base.DatasetTestBase):
 
   def _testFromGenerator(self, generator, elem_sequence, num_repeats,
                          output_types=None):
     if output_types is None:
       output_types = dtypes.int64
-    iterator = dataset_ops.make_initializable_iterator(
-        dataset_ops.Dataset.from_generator(generator, output_types=output_types)
-        .repeat(num_repeats)
-        .prefetch(5))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      for _ in range(2):  # Run twice to test reinitialization.
-        sess.run(init_op)
-        for _ in range(num_repeats):
-          for elem in elem_sequence:
-            self.assertAllEqual(elem, sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
+    dataset = dataset_ops.Dataset.from_generator(
+        generator, output_types=output_types).repeat(num_repeats).prefetch(5)
+    self.assertDatasetProduces(
+        dataset,
+        elem_sequence * num_repeats,
+        requires_initialization=True,
+        num_test_iterations=2)
 
   def _testFromGeneratorOneShot(self, generator, elem_sequence, num_repeats):
-    iterator = dataset_ops.make_one_shot_iterator(
-        dataset_ops.Dataset.from_generator(generator, output_types=dtypes.int64)
-        .repeat(num_repeats)
-        .prefetch(5))
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      for _ in range(num_repeats):
-        for elem in elem_sequence:
-          self.assertAllEqual(elem, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    dataset = dataset_ops.Dataset.from_generator(
+        generator, output_types=dtypes.int64).repeat(num_repeats).prefetch(5)
+    self.assertDatasetProduces(
+        dataset, elem_sequence * num_repeats, num_test_iterations=2)
 
-  @test_util.run_deprecated_v1
   def testFromGeneratorUsingFunction(self):
     def generator():
       for i in range(1, 100):
@@ -79,21 +62,18 @@ class FromGeneratorTest(test_base.DatasetTestBase):
     self._testFromGeneratorOneShot(generator, elem_sequence, 1)
     self._testFromGeneratorOneShot(generator, elem_sequence, 5)
 
-  @test_util.run_deprecated_v1
   def testFromGeneratorUsingList(self):
     generator = lambda: [[i] * i for i in range(1, 100)]
     elem_sequence = list(generator())
     self._testFromGenerator(generator, elem_sequence, 1)
     self._testFromGenerator(generator, elem_sequence, 5)
 
-  @test_util.run_deprecated_v1
   def testFromGeneratorUsingNdarray(self):
     generator = lambda: np.arange(100, dtype=np.int64)
     elem_sequence = list(generator())
     self._testFromGenerator(generator, elem_sequence, 1, output_types=np.int64)
     self._testFromGenerator(generator, elem_sequence, 5, output_types=np.int64)
 
-  @test_util.run_deprecated_v1
   def testFromGeneratorUsingGeneratorExpression(self):
     # NOTE(mrry): Generator *expressions* are not repeatable (or in
     # general reusable), because they eagerly evaluate the `for`
@@ -105,7 +85,6 @@ class FromGeneratorTest(test_base.DatasetTestBase):
     self._testFromGenerator(generator, elem_sequence, 1)
     self._testFromGenerator(generator, elem_sequence, 5)
 
-  @test_util.run_deprecated_v1
   def testFromMultipleConcurrentGenerators(self):
     num_inner_repeats = 5
     num_outer_repeats = 100
@@ -128,22 +107,16 @@ class FromGeneratorTest(test_base.DatasetTestBase):
           output_shapes=([None], [3]))
               .repeat(num_inner_repeats).prefetch(5))
 
-    iterator = dataset_ops.make_initializable_iterator(
-        dataset_ops.Dataset.range(num_outer_repeats)
-        .interleave(interleave_fn, cycle_length=10,
-                    block_length=len(input_list)))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for _ in range(num_inner_repeats * num_outer_repeats):
-        for elem in input_list:
-          val0, val1 = sess.run(get_next)
-          self.assertAllEqual(elem[0], val0)
-          self.assertAllEqual(elem[1], val1)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    dataset = dataset_ops.Dataset.range(num_outer_repeats).interleave(
+        interleave_fn, cycle_length=10, block_length=len(input_list))
+    get_next = self.getNext(dataset)
+    for _ in range(num_inner_repeats * num_outer_repeats):
+      for elem in input_list:
+        val0, val1 = self.evaluate(get_next())
+        self.assertAllEqual(elem[0], val0)
+        self.assertAllEqual(elem[1], val1)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   # TODO(b/67868766): Reenable this when the source of flakiness is discovered.
   def _testFromGeneratorsRunningInParallel(self):
@@ -186,22 +159,16 @@ class FromGeneratorTest(test_base.DatasetTestBase):
       return dataset_ops.Dataset.from_generator(
           generator, output_types=dtypes.int64, output_shapes=[]).prefetch(2)
 
-    iterator = dataset_ops.make_initializable_iterator(
-        dataset_ops.Dataset.range(num_parallel_iterators)
-        .interleave(
-            interleave_fn, cycle_length=num_parallel_iterators, block_length=1))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for elem in [0, 1]:
-        for _ in range(num_parallel_iterators):
-          self.assertAllEqual(elem, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    dataset = dataset_ops.Dataset.range(num_parallel_iterators).interleave(
+        interleave_fn, cycle_length=num_parallel_iterators, block_length=1)
+    get_next = self.getNext(dataset)
+
+    for elem in [0, 1]:
+      for _ in range(num_parallel_iterators):
+        self.assertAllEqual(elem, self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
-  @test_util.run_deprecated_v1
   def testFromGeneratorImplicitConversion(self):
     def generator():
       yield [1]
@@ -209,45 +176,28 @@ class FromGeneratorTest(test_base.DatasetTestBase):
       yield [3]
 
     for dtype in [dtypes.int8, dtypes.int32, dtypes.int64]:
-      iterator = dataset_ops.make_initializable_iterator(
-          dataset_ops.Dataset.from_generator(
-              generator, output_types=dtype, output_shapes=[1]))
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-
-      self.assertEqual(dtype, get_next.dtype)
-
-      with self.cached_session() as sess:
-        sess.run(init_op)
-        for expected in [[1], [2], [3]]:
-          next_val = sess.run(get_next)
-          self.assertEqual(dtype.as_numpy_dtype, next_val.dtype)
-          self.assertAllEqual(expected, next_val)
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-  @test_util.run_deprecated_v1
+      dataset = dataset_ops.Dataset.from_generator(
+          generator, output_types=dtype, output_shapes=[1])
+      get_next = self.getNext(dataset)
+
+      for expected in [[1], [2], [3]]:
+        next_val = self.evaluate(get_next())
+        self.assertEqual(dtype.as_numpy_dtype, next_val.dtype)
+        self.assertAllEqual(expected, next_val)
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(get_next())
+
   def testFromGeneratorString(self):
     def generator():
       yield "foo"
       yield b"bar"
       yield u"baz"
 
-    iterator = dataset_ops.make_initializable_iterator(
-        dataset_ops.Dataset.from_generator(
-            generator, output_types=dtypes.string, output_shapes=[]))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for expected in [b"foo", b"bar", b"baz"]:
-        next_val = sess.run(get_next)
-        self.assertAllEqual(expected, next_val)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    dataset = dataset_ops.Dataset.from_generator(
+        generator, output_types=dtypes.string, output_shapes=[])
+    self.assertDatasetProduces(
+        dataset, expected_output=[b"foo", b"bar", b"baz"])
 
-  @test_util.run_deprecated_v1
   def testFromGeneratorTypeError(self):
     def generator():
       yield np.array([1, 2, 3], dtype=np.int64)
@@ -255,23 +205,19 @@ class FromGeneratorTest(test_base.DatasetTestBase):
       yield "ERROR"
       yield np.array([7, 8, 9], dtype=np.int64)
 
-    iterator = dataset_ops.make_initializable_iterator(
-        dataset_ops.Dataset.from_generator(
-            generator, output_types=dtypes.int64, output_shapes=[3]))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual([1, 2, 3], sess.run(get_next))
-      self.assertAllEqual([4, 5, 6], sess.run(get_next))
-      with self.assertRaisesOpError("The expected type was int64"):
-        sess.run(get_next)
-      self.assertAllEqual([7, 8, 9], sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    dataset = dataset_ops.Dataset.from_generator(
+        generator, output_types=dtypes.int64, output_shapes=[3])
+
+    get_next = self.getNext(dataset)
+
+    self.assertAllEqual([1, 2, 3], self.evaluate(get_next()))
+    self.assertAllEqual([4, 5, 6], self.evaluate(get_next()))
+    with self.assertRaisesOpError("The expected type was int64"):
+      self.evaluate(get_next())
+    self.assertAllEqual([7, 8, 9], self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
-  @test_util.run_deprecated_v1
   def testFromGeneratorShapeError(self):
     def generator():
       yield np.array([1, 2, 3], dtype=np.int64)
@@ -279,23 +225,18 @@ class FromGeneratorTest(test_base.DatasetTestBase):
       yield np.array([7, 8, 9, 10], dtype=np.int64)
       yield np.array([11, 12, 13], dtype=np.int64)
 
-    iterator = dataset_ops.make_initializable_iterator(
-        dataset_ops.Dataset.from_generator(
-            generator, output_types=dtypes.int64, output_shapes=[3]))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual([1, 2, 3], sess.run(get_next))
-      self.assertAllEqual([4, 5, 6], sess.run(get_next))
-      with self.assertRaisesOpError(r"element of shape \(3,\) was expected"):
-        sess.run(get_next)
-      self.assertAllEqual([11, 12, 13], sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    dataset = dataset_ops.Dataset.from_generator(
+        generator, output_types=dtypes.int64, output_shapes=[3])
+    get_next = self.getNext(dataset)
+
+    self.assertAllEqual([1, 2, 3], self.evaluate(get_next()))
+    self.assertAllEqual([4, 5, 6], self.evaluate(get_next()))
+    with self.assertRaisesOpError(r"element of shape \(3,\) was expected"):
+      self.evaluate(get_next())
+    self.assertAllEqual([11, 12, 13], self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
-  @test_util.run_deprecated_v1
   def testFromGeneratorStructureError(self):
     def generator():
       yield 1, 2
@@ -304,46 +245,31 @@ class FromGeneratorTest(test_base.DatasetTestBase):
       yield 6, 7, 8
       yield 9, 10
 
-    iterator = dataset_ops.make_initializable_iterator(
-        dataset_ops.Dataset.from_generator(
-            generator, output_types=(dtypes.int64, dtypes.int64)))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      self.assertEqual((1, 2), sess.run(get_next))
-      self.assertEqual((3, 4), sess.run(get_next))
-      with self.assertRaisesOpError(
-          r"The expected structure was \(tf\.int64, tf\.int64\)"):
-        sess.run(get_next)
-      with self.assertRaisesOpError(
-          r"The expected structure was \(tf\.int64, tf\.int64\)"):
-        sess.run(get_next)
-      self.assertEqual((9, 10), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    dataset = dataset_ops.Dataset.from_generator(
+        generator, output_types=(dtypes.int64, dtypes.int64))
+    get_next = self.getNext(dataset)
+
+    self.assertEqual((1, 2), self.evaluate(get_next()))
+    self.assertEqual((3, 4), self.evaluate(get_next()))
+    with self.assertRaisesOpError(
+        r"The expected structure was \(tf\.int64, tf\.int64\)"):
+      self.evaluate(get_next())
+    with self.assertRaisesOpError(
+        r"The expected structure was \(tf\.int64, tf\.int64\)"):
+      self.evaluate(get_next())
+    self.assertEqual((9, 10), self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
-  @test_util.run_deprecated_v1
   def testFromGeneratorHeterogeneous(self):
     def generator():
       yield 1
       yield [2, 3]
 
-    iterator = dataset_ops.make_initializable_iterator(
-        dataset_ops.Dataset.from_generator(
-            generator, output_types=dtypes.int64))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    dataset = dataset_ops.Dataset.from_generator(
+        generator, output_types=dtypes.int64)
+    self.assertDatasetProduces(dataset, expected_output=[1, [2, 3]])
 
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual(1, sess.run(get_next))
-      self.assertAllEqual([2, 3], sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  @test_util.run_deprecated_v1
   def testFromGeneratorStopShort(self):
 
     def generator():
@@ -351,18 +277,12 @@ class FromGeneratorTest(test_base.DatasetTestBase):
       yield 1
       yield 2
 
-    iterator = dataset_ops.make_initializable_iterator(
-        dataset_ops.Dataset.from_generator(
-            generator, output_types=dtypes.int64))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual(0, sess.run(get_next))
-      self.assertAllEqual(1, sess.run(get_next))
+    dataset = dataset_ops.Dataset.from_generator(
+        generator, output_types=dtypes.int64)
+    get_next = self.getNext(dataset)
+    self.assertAllEqual(0, self.evaluate(get_next()))
+    self.assertAllEqual(1, self.evaluate(get_next()))
 
-  @test_util.run_deprecated_v1
   def testFromGeneratorDestructorCalled(self):
     # Use an `Event` to signal that the generator has been deleted.
     event = threading.Event()
@@ -381,23 +301,18 @@ class FromGeneratorTest(test_base.DatasetTestBase):
       def __del__(self):
         event.set()
 
-    iterator = dataset_ops.make_initializable_iterator(
-        dataset_ops.Dataset.from_generator(
-            GeneratorWrapper, output_types=dtypes.int64).take(2))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    dataset = dataset_ops.Dataset.from_generator(
+        GeneratorWrapper, output_types=dtypes.int64).take(2)
+    get_next = self.getNext(dataset)
 
-    with session.Session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual(42, sess.run(get_next))
-      self.assertAllEqual(42, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-      # Test that `GeneratorWrapper` object is destroyed when the
-      # iterator terminates (and the generator iterator is deleted).
-      self.assertTrue(event.is_set())
+    self.assertAllEqual(42, self.evaluate(get_next()))
+    self.assertAllEqual(42, self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+    # Test that `GeneratorWrapper` object is destroyed when the
+    # iterator terminates (and the generator iterator is deleted).
+    self.assertTrue(event.is_set())
 
-  @test_util.run_deprecated_v1
   def testFromGeneratorWithArgs(self):
 
     def flat_map_fn(elem):
@@ -410,20 +325,10 @@ class FromGeneratorTest(test_base.DatasetTestBase):
           generator_with_arg, output_types=dtypes.int64, output_shapes=(),
           args=(elem,))
 
-    iterator = dataset_ops.make_initializable_iterator(
-        dataset_ops.Dataset.range(5).flat_map(flat_map_fn))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      expected = [1, 2, 2, 3, 3, 3, 4, 4, 4, 4]
-      for x in expected:
-        self.assertEqual(x, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    dataset = dataset_ops.Dataset.range(5).flat_map(flat_map_fn)
+    self.assertDatasetProduces(
+        dataset, expected_output=[1, 2, 2, 3, 3, 3, 4, 4, 4, 4])
 
-  @test_util.run_deprecated_v1
   def testFromGeneratorWithTwoArgs(self):
 
     def flat_map_fn(elem, message):
@@ -436,26 +341,17 @@ class FromGeneratorTest(test_base.DatasetTestBase):
           generator_with_arg, output_types=(dtypes.int64, dtypes.string),
           output_shapes=((), ()), args=(elem, message))
 
-    iterator = dataset_ops.make_initializable_iterator(
-        dataset_ops.Dataset.zip(
-            (dataset_ops.Dataset.range(5),
-             dataset_ops.Dataset.from_tensors("Hi!").repeat(None)))
-        .flat_map(flat_map_fn))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      expected = [(0, b"Hi!"),
-                  (0, b"Hi!"), (1, b"Hi!"),
-                  (0, b"Hi!"), (1, b"Hi!"), (2, b"Hi!"),
-                  (0, b"Hi!"), (1, b"Hi!"), (2, b"Hi!"), (3, b"Hi!")]
-      for x in expected:
-        self.assertEqual(x, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    dataset = dataset_ops.Dataset.zip(
+        (dataset_ops.Dataset.range(5),
+         dataset_ops.Dataset.from_tensors("Hi!").repeat(None)
+        )).flat_map(flat_map_fn)
+
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[(0, b"Hi!"), (0, b"Hi!"), (1, b"Hi!"), (0, b"Hi!"),
+                         (1, b"Hi!"), (2, b"Hi!"), (0, b"Hi!"), (1, b"Hi!"),
+                         (2, b"Hi!"), (3, b"Hi!")])
 
-  @test_util.run_deprecated_v1
   def testGeneratorDatasetFinalizeFunctionCalled(self):
     # NOTE(mrry): This test tests the internal `_GeneratorDataset`,
     # which affords more control over what the finalize function can do than
@@ -472,19 +368,15 @@ class FromGeneratorTest(test_base.DatasetTestBase):
                                 stateful=True)
 
     dummy = constant_op.constant(37)
-    iterator = dataset_ops.make_initializable_iterator(
-        dataset_ops._GeneratorDataset(
-            dummy, lambda x: x, lambda x: x, finalize_fn).take(2))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual(37, sess.run(get_next))
-      self.assertAllEqual(37, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-        self.assertTrue(event.is_set())
+    dataset = dataset_ops._GeneratorDataset(dummy, lambda x: x, lambda x: x,
+                                            finalize_fn).take(2)
+    get_next = self.getNext(dataset)
+
+    self.assertAllEqual(37, self.evaluate(get_next()))
+    self.assertAllEqual(37, self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+      self.assertTrue(event.is_set())
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/kernel_tests/from_sparse_tensor_slices_test.py b/tensorflow/python/data/kernel_tests/from_sparse_tensor_slices_test.py
index ef608ebb67007c7605e7bea36058d0cd5c5d146f..2ce9c9a061c63b6acea899aef0518e516befb388 100644
--- a/tensorflow/python/data/kernel_tests/from_sparse_tensor_slices_test.py
+++ b/tensorflow/python/data/kernel_tests/from_sparse_tensor_slices_test.py
@@ -29,11 +29,10 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
-@test_util.run_all_in_graph_and_eager_modes
+@test_util.run_v1_only("deprecated API, no eager or V2 test coverage")
 class FromSparseTensorSlicesTest(test_base.DatasetTestBase):
 
-  @test_util.run_deprecated_v1
-  def testSkipEagerFromSparseTensorSlices(self):
+  def testFromSparseTensorSlices(self):
     """Test a dataset based on slices of a `tf.SparseTensor`."""
     st = array_ops.sparse_placeholder(dtypes.float64)
     iterator = dataset_ops.make_initializable_iterator(
diff --git a/tensorflow/python/data/kernel_tests/from_tensor_slices_test.py b/tensorflow/python/data/kernel_tests/from_tensor_slices_test.py
index 9a480e56789aee9198fc88201f0eecb2c2eaab52..72db6387718712b97442eb3f7ddc3befcbbf6a12 100644
--- a/tensorflow/python/data/kernel_tests/from_tensor_slices_test.py
+++ b/tensorflow/python/data/kernel_tests/from_tensor_slices_test.py
@@ -53,7 +53,7 @@ class FromTensorSlicesTest(test_base.DatasetTestBase):
     with self.assertRaises(errors.OutOfRangeError):
       results = self.evaluate(get_next())
 
-  def testSkipEagerFromTensorSlicesSparse(self):
+  def testFromTensorSlicesSparse(self):
     """Test a dataset that represents the slices from a tuple of tensors."""
     components = (sparse_tensor.SparseTensorValue(
         indices=np.array([[0, 0], [1, 0], [2, 0]]),
diff --git a/tensorflow/python/data/kernel_tests/from_tensors_test.py b/tensorflow/python/data/kernel_tests/from_tensors_test.py
index ab3c15263fdaa0829686f90450e0e79081299a2e..82ccdebc7ff7adec439791f205c30e3011afa996 100644
--- a/tensorflow/python/data/kernel_tests/from_tensors_test.py
+++ b/tensorflow/python/data/kernel_tests/from_tensors_test.py
@@ -50,7 +50,7 @@ class FromTensorsTest(test_base.DatasetTestBase):
 
     self.assertDatasetProduces(dataset, expected_output=[components])
 
-  def testSkipEagerFromTensorsSparse(self):
+  def testFromTensorsSparse(self):
     """Test a dataset that represents a single tuple of tensors."""
     components = (sparse_tensor.SparseTensorValue(
         indices=np.array([[0]]),
@@ -224,6 +224,7 @@ class FromTensorsTest(test_base.DatasetTestBase):
     self.assertEquals(dtypes.int64, get_next().dtype)
     self.assertEquals([3], get_next().shape)
 
+  # TODO(b/121264236): needs mechanism for multiple device in eager mode.
   def testSkipEagerSplitPipelineFailsWithPlacementError(self):
     with session.Session(
         target="",
diff --git a/tensorflow/python/data/kernel_tests/interleave_test.py b/tensorflow/python/data/kernel_tests/interleave_test.py
index 05a211afcc177faaeb1a00ad03d8f117448f8315..4b427ff5a4173d73171400a2d3f36cbdfd416cdd 100644
--- a/tensorflow/python/data/kernel_tests/interleave_test.py
+++ b/tensorflow/python/data/kernel_tests/interleave_test.py
@@ -17,19 +17,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import threading
-
 from absl.testing import parameterized
 import numpy as np
 
-from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import script_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import test
 
@@ -78,49 +74,6 @@ def _interleave(lists, cycle_length, block_length):
           break
 
 
-def _make_coordinated_sloppy_dataset(input_values, cycle_length, block_length,
-                                     num_parallel_calls):
-  """Produces a dataset iterator and events to control the order of elements.
-
-  Args:
-    input_values: the values to generate lists to interleave from
-    cycle_length: the length of the interleave cycle
-    block_length: the length of the interleave block
-    num_parallel_calls: the degree of interleave parallelism
-
-  Returns:
-    A dataset iterator (represented as `get_next` op) and events that can be
-    used to control the order of output elements.
-  """
-
-  # Set up threading events used to sequence when items are produced that
-  # are subsequently interleaved. These events allow us to deterministically
-  # simulate slowdowns and force sloppiness.
-  coordination_events = {i: threading.Event() for i in input_values}
-
-  def map_py_fn(x):
-    coordination_events[x].wait()
-    coordination_events[x].clear()
-    return x * x
-
-  def map_fn(x):
-    return script_ops.py_func(map_py_fn, [x], x.dtype)
-
-  def interleave_fn(x):
-    dataset = dataset_ops.Dataset.from_tensors(x)
-    dataset = dataset.repeat(x)
-    return dataset.map(map_fn)
-
-  options = dataset_ops.Options()
-  options.experimental_deterministic = False
-  dataset = dataset_ops.Dataset.from_tensor_slices(input_values).repeat(
-      2).interleave(interleave_fn, cycle_length, block_length,
-                    num_parallel_calls).with_options(options)
-  iterator = dataset_ops.make_one_shot_iterator(dataset)
-  get_next = iterator.get_next()
-  return get_next, coordination_events
-
-
 def _repeat(values, count):
   """Produces a list of lists suitable for testing interleave.
 
@@ -254,60 +207,37 @@ class InterleaveTest(test_base.DatasetTestBase, parameterized.TestCase):
       self.evaluate(get_next())
 
   @parameterized.named_parameters(
-      ("1", np.int64([4, 5, 6]), 2, 1, 1),
-      ("2", np.int64([4, 5, 6]), 2, 1, 2),
-      ("3", np.int64([4, 5, 6]), 2, 3, 1),
-      ("4", np.int64([4, 5, 6]), 2, 3, 2),
-      ("5", np.int64([4, 5, 6]), 3, 2, 1),
-      ("6", np.int64([4, 5, 6]), 3, 2, 2),
-      ("7", np.int64([4, 5, 6]), 3, 2, 3),
-      ("8", np.int64([4, 0, 6]), 2, 3, 1),
-      ("9", np.int64([4, 0, 6]), 2, 3, 2),
+      ("1", np.int64([4, 5, 6]), 1, 3, 1),
+      ("2", np.int64([4, 5, 6]), 2, 1, 1),
+      ("3", np.int64([4, 5, 6]), 2, 1, 2),
+      ("4", np.int64([4, 5, 6]), 2, 3, 1),
+      ("5", np.int64([4, 5, 6]), 2, 3, 2),
+      ("6", np.int64([4, 5, 6]), 7, 2, 1),
+      ("7", np.int64([4, 5, 6]), 7, 2, 3),
+      ("8", np.int64([4, 5, 6]), 7, 2, 5),
+      ("9", np.int64([4, 5, 6]), 7, 2, 7),
+      ("10", np.int64([4, 0, 6]), 2, 3, 1),
+      ("11", np.int64([4, 0, 6]), 2, 3, 2),
   )
-  @test_util.run_v1_only("b/120545219")
-  def testSkipEagerSloppyInterleaveInOrder(self, input_values, cycle_length,
-                                           block_length, num_parallel_calls):
-    get_next, coordination_events = _make_coordinated_sloppy_dataset(
-        input_values, cycle_length, block_length, num_parallel_calls)
-    config = config_pb2.ConfigProto(
-        inter_op_parallelism_threads=num_parallel_calls + 1,
-        use_per_session_threads=True)
-    with self.cached_session(config=config) as sess:
-      for expected_element in _interleave(
-          _repeat(input_values, 2), cycle_length, block_length):
-        coordination_events[expected_element].set()
-        self.assertEqual(expected_element * expected_element,
-                         self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  @parameterized.named_parameters(
-      ("1", np.int64([4, 5, 6]), 2, 1, 2),
-      ("2", np.int64([4, 5, 6]), 2, 3, 2),
-      ("3", np.int64([4, 5, 6]), 3, 2, 3),
-      ("4", np.int64([4, 0, 6]), 2, 3, 2),
-  )
-  @test_util.run_v1_only("b/120545219")
-  def testSkipEagerSloppyInterleaveOutOfOrder(self, input_values, cycle_length,
-                                              block_length, num_parallel_calls):
-    get_next, coordination_events = _make_coordinated_sloppy_dataset(
-        input_values, cycle_length, block_length, num_parallel_calls)
-    config = config_pb2.ConfigProto(
-        inter_op_parallelism_threads=num_parallel_calls + 1,
-        use_per_session_threads=True)
-    with self.cached_session(config=config) as sess:
-      elements = [
-          x for x in _interleave(
-              _repeat(input_values, 2), cycle_length, block_length)
-      ]
-      for i in [1, 4, 7]:
-        elements[i], elements[i + 1] = elements[i + 1], elements[i]
-
-      for element in elements:
-        coordination_events[element].set()
-        self.assertEqual(element * element, self.evaluate(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+  def testSloppyInterleaveDataset(self, input_values, cycle_length,
+                                  block_length, num_parallel_calls):
+    count = 2
+    dataset = dataset_ops.Dataset.from_tensor_slices(input_values).repeat(
+        count).interleave(
+            lambda x: dataset_ops.Dataset.from_tensors(x).repeat(x),
+            cycle_length, block_length, num_parallel_calls)
+    options = dataset_ops.Options()
+    options.experimental_deterministic = False
+    dataset = dataset.with_options(options)
+    expected_output = [
+        element for element in _interleave(
+            _repeat(input_values, count), cycle_length, block_length)
+    ]
+    get_next = self.getNext(dataset)
+    actual_output = []
+    for _ in range(len(expected_output)):
+      actual_output.append(self.evaluate(get_next()))
+    self.assertAllEqual(expected_output.sort(), actual_output.sort())
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/kernel_tests/iterator_checkpoint_test.py b/tensorflow/python/data/kernel_tests/iterator_checkpoint_test.py
index 91b356691b75eb337ad61643646ba717e4929ab9..dfb54b50ad6b2dd8f242fba09218d6eae871a49c 100644
--- a/tensorflow/python/data/kernel_tests/iterator_checkpoint_test.py
+++ b/tensorflow/python/data/kernel_tests/iterator_checkpoint_test.py
@@ -28,7 +28,7 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 from tensorflow.python.training import checkpoint_management
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import util as trackable_utils
 
 
 @test_util.run_all_in_graph_and_eager_modes
@@ -43,7 +43,7 @@ class IteratorCheckpointingTest(test_base.DatasetTestBase):
     ) else dataset_ops.make_one_shot_iterator(dataset)
     get_next = iterator.get_next if context.executing_eagerly(
     ) else functools.partial(self.evaluate, iterator.get_next())
-    checkpoint = checkpointable_utils.Checkpoint(iterator=iterator)
+    checkpoint = trackable_utils.Checkpoint(iterator=iterator)
     self.assertAllEqual([1, 4], get_next())
     save_path = checkpoint.save(checkpoint_prefix)
     self.assertAllEqual([9, 16], get_next())
@@ -73,7 +73,7 @@ class IteratorCheckpointingTest(test_base.DatasetTestBase):
     ) else dataset_ops.make_one_shot_iterator(dataset_2)
     get_next_3 = iterator_3.get_next if context.executing_eagerly(
     ) else functools.partial(self.evaluate, iterator_3.get_next())
-    checkpoint = checkpointable_utils.Checkpoint(
+    checkpoint = trackable_utils.Checkpoint(
         iterator_1=iterator_1, iterator_2=iterator_2, iterator_3=iterator_3)
     self.assertAllEqual([1, 4], get_next_1())
     self.assertAllEqual(0, get_next_3())
@@ -96,7 +96,7 @@ class IteratorCheckpointingTest(test_base.DatasetTestBase):
     ) else dataset_ops.make_one_shot_iterator(dataset)
     get_next = iterator.get_next if context.executing_eagerly(
     ) else functools.partial(self.evaluate, iterator.get_next())
-    checkpoint = checkpointable_utils.Checkpoint(iterator=iterator)
+    checkpoint = trackable_utils.Checkpoint(iterator=iterator)
     self.assertAllEqual(0, get_next())
     self.assertAllEqual(1, get_next())
     save_path = checkpoint.save(checkpoint_prefix)
@@ -115,7 +115,7 @@ class IteratorCheckpointingTest(test_base.DatasetTestBase):
     iterator = iter(dataset) if context.executing_eagerly(
     ) else dataset_ops.make_initializable_iterator(dataset)
     get_next = iterator.get_next
-    checkpoint = checkpointable_utils.Checkpoint(iterator=iterator)
+    checkpoint = trackable_utils.Checkpoint(iterator=iterator)
     for i in range(5):
       checkpoint.restore(
           checkpoint_management.latest_checkpoint(
diff --git a/tensorflow/python/data/kernel_tests/iterator_cluster_test.py b/tensorflow/python/data/kernel_tests/iterator_cluster_test.py
index 20088234953b1cdc8f85381ded45cf22aa93c75a..23d3b6a439857e229ebd1b3298db1c29e2b09849 100644
--- a/tensorflow/python/data/kernel_tests/iterator_cluster_test.py
+++ b/tensorflow/python/data/kernel_tests/iterator_cluster_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.contrib import lookup as lookup_ops
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.data.ops import dataset_ops
@@ -31,7 +32,6 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import functional_ops
-from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
diff --git a/tensorflow/python/data/kernel_tests/list_files_test.py b/tensorflow/python/data/kernel_tests/list_files_test.py
index a70c4b081d5c710082eb485a1dbb6179a90da2ce..03cec7efa50f3afcf844d6b9ed20952965e8c707 100644
--- a/tensorflow/python/data/kernel_tests/list_files_test.py
+++ b/tensorflow/python/data/kernel_tests/list_files_test.py
@@ -106,11 +106,13 @@ class ListFilesTest(test_base.DatasetTestBase):
     self.assertEqual(all_actual_filenames[0], all_actual_filenames[1])
     self.assertEqual(all_actual_filenames[0], all_actual_filenames[2])
 
-  # TODO(b/117581999): eager mode assertion fail wrapped, debug.
-  def tesSkipEagerEmptyDirectoryInitializer(self):
-    dataset = dataset_ops.Dataset.list_files(path.join(self.tmp_dir, '*'))
+  def tesEmptyDirectoryInitializer(self):
+
+    def dataset_fn():
+      return dataset_ops.Dataset.list_files(path.join(self.tmp_dir, '*'))
+
     self.assertDatasetProduces(
-        dataset,
+        dataset_fn(),
         expected_error=(errors.InvalidArgumentError,
                         'No files matched pattern'),
         requires_initialization=True)
diff --git a/tensorflow/python/data/kernel_tests/map_test.py b/tensorflow/python/data/kernel_tests/map_test.py
index e07706413dea9932c0b83f9eaedd62707b57e668..4badcffd463ae8d12c5704730d710beadc563040 100644
--- a/tensorflow/python/data/kernel_tests/map_test.py
+++ b/tensorflow/python/data/kernel_tests/map_test.py
@@ -26,8 +26,10 @@ import numpy as np
 
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.data.experimental.ops import threading_options
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -38,14 +40,15 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
-from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import lookup_ops
+from tensorflow.python.ops import map_fn
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import script_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
@@ -78,19 +81,23 @@ def _make_coordinated_sloppy_dataset(num_elements, num_parallel_calls):
   options.experimental_deterministic = False
   dataset = dataset_ops.Dataset.range(num_elements).map(
       map_fn, num_parallel_calls).with_options(options)
-  iterator = dataset_ops.make_one_shot_iterator(dataset)
-  next_element = iterator.get_next()
-  return next_element, coordination_events
+  return dataset, coordination_events
 
 
-@test_util.run_v1_only("b/120545219")
-class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
+# TODO(jsimsa): Add tests for `map_with_legacy_function`.
+@test_util.run_all_in_graph_and_eager_modes
+class MapTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   def _buildMapDataset(self, components, count):
+
     def _map_fn(x, y, z):
       return math_ops.square(x), math_ops.square(y), math_ops.square(z)
-    return (dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
-            .repeat(count))
+
+    dataset = dataset_ops.Dataset.from_tensor_slices(components).map(
+        _map_fn).repeat(count)
+    self.assertEqual([c.shape[1:] for c in components],
+                     [shape for shape in dataset.output_shapes])
+    return dataset
 
   def testMapDataset(self):
     """Test an dataset that maps a TF function across its input elements."""
@@ -99,34 +106,32 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     components = (np.arange(7),
                   np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
                   np.array(37.0) * np.arange(7))
-    count = array_ops.placeholder(dtypes.int64, shape=[])
 
-    dataset = self._buildMapDataset(components, count)
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape[1:] for c in components],
-                     [t.shape for t in get_next])
+    # Test single-threaded access to the iterator.
+    get_next = self.getNext(self._buildMapDataset(components, 14))
+    for _ in range(14):
+      for i in range(7):
+        result = self.evaluate(get_next())
+        for component, result_component in zip(components, result):
+          self.assertAllEqual(component[i]**2, result_component)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
+  # TODO(b/117581999): add eager coverage, different threads run in graph
+  # context.
+  @test_util.run_v1_only("b/120545219")
+  def testSkipEagerMapDatasetMultithreaded(self):
+    # Test multi-threaded access to the same iterator.
+    components = (np.arange(7),
+                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
+                  np.array(37.0) * np.arange(7))
+    get_next = self.getNext(self._buildMapDataset(components, 18))
+    results = []
     with self.cached_session() as sess:
-      # Test single-threaded access to the iterator.
-      sess.run(init_op, feed_dict={count: 14})
-      for _ in range(14):
-        for i in range(7):
-          result = sess.run(get_next)
-          for component, result_component in zip(components, result):
-            self.assertAllEqual(component[i]**2, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test multi-threaded access to the same iterator.
-      sess.run(init_op, feed_dict={count: 18})
-      results = []
       def iterator_thread():
         while True:
           try:
-            results.append(sess.run(get_next))
+            results.append(sess.run(get_next()))
           except errors.OutOfRangeError:
             return
       threads = [self.checkedThread(target=iterator_thread) for _ in range(8)]
@@ -148,59 +153,66 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   def _buildParallelMapDataset(self, components, count, num_parallel_calls,
                                output_buffer_size):
+
     def _map_fn(x, y, z):
       return math_ops.square(x), math_ops.square(y), math_ops.square(z)
-    return (dataset_ops.Dataset.from_tensor_slices(components)
-            .map(_map_fn, num_parallel_calls=num_parallel_calls)
-            .prefetch(output_buffer_size)
-            .repeat(count))
+
+    dataset = dataset_ops.Dataset.from_tensor_slices(components).map(
+        _map_fn, num_parallel_calls=num_parallel_calls).prefetch(
+            output_buffer_size).repeat(count)
+
+    self.assertEqual([c.shape[1:] for c in components],
+                     [shape for shape in dataset.output_shapes])
+    return dataset
 
   def testParallelMapDataset(self):
     """Test an dataset that maps a TF function across its input elements."""
+
     # The pipeline is TensorSliceDataset -> ParallelMapDataset(square_3) ->
     # RepeatDataset(count).
-    components = (np.arange(7),
-                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
-                  np.array(37.0) * np.arange(7))
-    count = array_ops.placeholder(dtypes.int64, shape=[])
-    num_parallel_calls = array_ops.placeholder(dtypes.int32, shape=[])
-    output_buffer_size = array_ops.placeholder(dtypes.int64, shape=[])
+    def do_test(num_parallel_calls, output_buffer_size):
 
-    dataset = self._buildParallelMapDataset(
-        components, count, num_parallel_calls, output_buffer_size)
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+      components = (np.arange(7),
+                    np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
+                    np.array(37.0) * np.arange(7))
+      # Test single-threaded access to the iterator.
+      get_next = self.getNext(
+          self._buildParallelMapDataset(components, 14, num_parallel_calls,
+                                        output_buffer_size))
+      for _ in range(14):
+        for i in range(7):
+          result = self.evaluate(get_next())
+          for component, result_component in zip(components, result):
+            self.assertAllEqual(component[i]**2, result_component)
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(get_next())
 
-    self.assertEqual([c.shape[1:] for c in components],
-                     [t.shape for t in get_next])
+    for num_parallel_calls_val, output_buffer_size_val in [(1, 1), (1, 2), (2,
+                                                                            2),
+                                                           (2, 4), (8, 8),
+                                                           (8, 16)]:
+      do_test(num_parallel_calls_val, output_buffer_size_val)
 
-    with self.cached_session() as sess:
+  # TODO(b/117581999): add eager coverage, different threads run in graph
+  # context.
+  @test_util.run_v1_only("b/120545219")
+  def testSkipEagerParallelMapDatasetMultithreaded(self):
+
+    def do_test(num_parallel_calls, output_buffer_size):
+      # Test multi-threaded access to the same iterator.
+      components = (np.arange(7),
+                    np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
+                    np.array(37.0) * np.arange(7))
+      get_next = self.getNext(
+          self._buildParallelMapDataset(components, 18, num_parallel_calls,
+                                        output_buffer_size))
+      results = []
+      with self.cached_session() as sess:
 
-      def do_test(num_parallel_calls_val, output_buffer_size_val):
-        # Test single-threaded access to the iterator.
-        sess.run(init_op, feed_dict={
-            count: 14,
-            num_parallel_calls: num_parallel_calls_val,
-            output_buffer_size: output_buffer_size_val})
-        for _ in range(14):
-          for i in range(7):
-            result = sess.run(get_next)
-            for component, result_component in zip(components, result):
-              self.assertAllEqual(component[i]**2, result_component)
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-        # Test multi-threaded access to the same iterator.
-        sess.run(init_op, feed_dict={
-            count: 18,
-            num_parallel_calls: num_parallel_calls_val,
-            output_buffer_size: output_buffer_size_val})
-        results = []
         def iterator_thread():
           while True:
             try:
-              results.append(sess.run(get_next))
+              results.append(sess.run(get_next()))
             except errors.OutOfRangeError:
               return
         threads = [self.checkedThread(target=iterator_thread)
@@ -237,14 +249,10 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     dataset = self._buildParallelMapDataset(components, 1000, 100, 100)
     # NOTE(mrry): Also test that the prefetching thread is cancelled correctly.
     dataset = dataset.prefetch(100)
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    get_next = self.getNext(dataset)
 
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for _ in range(3):
-        sess.run(get_next)
+    for _ in range(3):
+      self.evaluate(get_next())
 
   def testParallelMapUnspecifiedOutputSize(self):
     components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
@@ -252,14 +260,10 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     dataset = (dataset_ops.Dataset.from_tensor_slices(components)
                .map(lambda x: array_ops.check_numerics(x, "message"),
                     num_parallel_calls=2))
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    get_next = self.getNext(dataset)
 
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for _ in range(3):
-        sess.run(get_next)
+    for _ in range(3):
+      self.evaluate(get_next())
 
   def testParallelMapError(self):
     components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
@@ -267,20 +271,16 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     dataset = (dataset_ops.Dataset.from_tensor_slices(components)
                .map(lambda x: array_ops.check_numerics(x, "message"),
                     num_parallel_calls=2))
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    get_next = self.getNext(dataset)
 
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for _ in range(3):
-        sess.run(get_next)
-      # The 4th element is NaN, so `array_ops.check_numerics()` should fail.
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(get_next)
-      sess.run(get_next)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    for _ in range(3):
+      self.evaluate(get_next())
+    # The 4th element is NaN, so `array_ops.check_numerics()` should fail.
+    with self.assertRaises(errors.InvalidArgumentError):
+      self.evaluate(get_next())
+    self.evaluate(get_next())
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   def testPrefetchError(self):
     components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
@@ -288,20 +288,17 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     dataset = (dataset_ops.Dataset.from_tensor_slices(components)
                .map(lambda x: array_ops.check_numerics(x, "message"))
                .prefetch(2))
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
 
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for _ in range(3):
-        sess.run(get_next)
-      # The 4th element is NaN, so `array_ops.check_numerics()` should fail.
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(get_next)
-      sess.run(get_next)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    get_next = self.getNext(dataset)
+
+    for _ in range(3):
+      self.evaluate(get_next())
+    # The 4th element is NaN, so `array_ops.check_numerics()` should fail.
+    with self.assertRaises(errors.InvalidArgumentError):
+      self.evaluate(get_next())
+    self.evaluate(get_next())
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   def testCaptureIterator(self):
 
@@ -314,23 +311,22 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       return dataset_ops.Dataset.range(10).map(_map_fn)
 
     def _build_graph():
-      captured_iterator = dataset_ops.make_initializable_iterator(
-          dataset_ops.Dataset.range(10))
+      if context.executing_eagerly():
+        captured_iterator = iter(dataset_ops.Dataset.range(10))
+      else:
+        captured_iterator = dataset_ops.make_initializable_iterator(
+            dataset_ops.Dataset.range(10))
       ds = _build_ds(captured_iterator)
-      iterator = ds.make_initializable_iterator()
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-      return captured_iterator.initializer, init_op, get_next
-
-    with ops.Graph().as_default() as g:
-      captured_init_op, init_op, get_next = _build_graph()
-      with self.session(graph=g) as sess:
-        sess.run(captured_init_op)
-        sess.run(init_op)
-        for i in range(10):
-          self.assertEqual(i * i, sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
+      return captured_iterator, ds
+
+    captured_iter, ds = _build_graph()
+    if not context.executing_eagerly():
+      self.evaluate(captured_iter.initializer)
+    get_next = self.getNext(ds, requires_initialization=True)
+    for i in range(10):
+      self.assertEqual(i * i, self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   def testCaptureHashTable(self):
     # NOTE(mrry): We must use the V2 variants of `HashTable`
@@ -345,41 +341,38 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     input_sentences = dataset_ops.Dataset.from_tensor_slices(
         ["brain brain tank salad surgery", "surgery brain"])
 
-    iterator = dataset_ops.make_initializable_iterator(
-        input_sentences
-        .map(lambda x: string_ops.string_split([x]).values).map(table.lookup))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    dataset = input_sentences.map(lambda x: string_ops.string_split([x]).values
+                                 ).map(table.lookup)
 
-    with self.cached_session() as sess:
-      sess.run(table.initializer)
-      sess.run(init_op)
-      sess.run(get_next)
-      sess.run(get_next)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    get_next = self.getNext(dataset, requires_initialization=True)
 
+    self.evaluate(table.initializer)
+    self.evaluate(get_next())
+    self.evaluate(get_next())
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+  @test_util.run_v1_only("b/123904513")
   def testCaptureQueue(self):
     elements = np.random.randint(100, size=[200])
     queue = data_flow_ops.FIFOQueue(200, dtypes.int64, shapes=[])
     enqueue_op = queue.enqueue_many(elements)
     close_op = queue.close()
-    iterator = dataset_ops.make_initializable_iterator(
-        dataset_ops.Dataset.from_tensors(0).repeat(-1)
-        .map(lambda _: queue.dequeue()))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    dataset = dataset_ops.Dataset.from_tensors(0).repeat(
+        -1).map(lambda _: queue.dequeue())
 
-    with self.cached_session() as sess:
-      sess.run(enqueue_op)
-      sess.run(close_op)
-      sess.run(init_op)
-      for element in elements:
-        self.assertEqual(element, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    get_next = self.getNext(dataset, requires_initialization=True)
+    self.evaluate(enqueue_op)
+    self.evaluate(close_op)
 
-  def testCaptureSameResourceMultipleTimes(self):
+    for element in elements:
+      self.assertEqual(element, self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+  # TODO(b/117581999): Possible deadlock in eager mode, debug.
+  @test_util.run_v1_only("b/120545219")
+  def testSkipEagerCaptureSameResourceMultipleTimes(self):
     elements = np.random.randint(100, size=[200])
     queue = data_flow_ops.FIFOQueue(
         200, dtypes.int64, shapes=[], shared_name="shared_queue")
@@ -389,101 +382,149 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     enqueue_op = queue.enqueue_many(elements)
     close_op = queue.close()
 
-    iterator = dataset_ops.make_initializable_iterator(
-        dataset_ops.Dataset.from_tensors(0).repeat(-1)
-        .map(lambda _: (queue.dequeue(), queue_2.dequeue())))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    dataset = dataset_ops.Dataset.from_tensors(0).repeat(
+        -1).map(lambda _: (queue.dequeue(), queue_2.dequeue()))
 
-    with self.cached_session() as sess:
-      sess.run(enqueue_op)
-      sess.run(close_op)
-      sess.run(init_op)
-      for i in range(100):
-        self.assertEqual(sorted([elements[i * 2], elements[i * 2 + 1]]),
-                         sorted(sess.run(get_next)))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    self.evaluate(enqueue_op)
+    self.evaluate(close_op)
+    get_next = self.getNext(dataset, requires_initialization=True)
+    for i in range(100):
+      self.assertCountEqual([elements[i * 2], elements[i * 2 + 1]],
+                            self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+  # TODO(b/121264236): add eager mode coverage when we have multi-device setup.
+  @test_util.run_v1_only("b/121264236")
+  def testSkipEagerCaptureConstantsWithConflictingDevices(self):
+    config = config_pb2.ConfigProto(device_count={"CPU": 3})
+    with self.cached_session(config=config):
+      with ops.device("/device:CPU:0"):
+        a = constant_op.constant(3.0)
+      with ops.device("/device:CPU:1"):
+        b = constant_op.constant(5.0)
+
+      def func(_):
+        return math_ops.add(a, b)
+
+      dataset = dataset_ops.Dataset.from_tensors(0).repeat(10).map(func)
+      expected_output = [8.0] * 10
+      self.assertDatasetProduces(dataset, expected_output=expected_output)
+
+  # TODO(b/121264236): add eager mode coverage when we have multi-device setup.
+  @test_util.run_v1_only("b/121264236")
+  def testSkipEagerRefVariablesWithConflictingDevices(self):
+    config = config_pb2.ConfigProto(device_count={"CPU": 3})
+    with self.cached_session(config=config):
+
+      def func(_):
+        with ops.device("/device:CPU:0"):
+          a = variables.VariableV1(3.0)
+        with ops.device("/device:CPU:1"):
+          b = variables.VariableV1(5.0)
+        return math_ops.add(a, b)
+
+      # Use the legacy function implementation as eager function will convert
+      # RefVariables to ResourceVariables.
+      dataset = dataset_ops.Dataset.from_tensors(0).repeat(10)
+      dataset = dataset.map_with_legacy_function(func)
+      self.evaluate(variables.global_variables_initializer())
+      expected_output = [8.0] * 10
+      self.assertDatasetProduces(
+          dataset,
+          expected_output=expected_output,
+          requires_initialization=True)
+
+  # TODO(b/121264236): add eager mode coverage when we have multi-device setup.
+  @test_util.run_v1_only("b/121264236")
+  def testSkipEagerResourceVariablesWithConflictingDevices(self):
+    config = config_pb2.ConfigProto(device_count={"CPU": 3})
+    with self.cached_session(config=config):
+
+      def func(_):
+        with ops.device("/device:CPU:0"):
+          a = variables.Variable(3.0)
+        with ops.device("/device:CPU:1"):
+          b = variables.Variable(5.0)
+        return math_ops.add(a, b)
+
+      # The MapDataset node ends up with two ResourceVariable inputs, one on
+      # device CPU:0 and the other on device CPU:1. The placer cannot resolve
+      # this as it cannot place the MapDatasetOp on both devices.
+      dataset = dataset_ops.Dataset.from_tensors(0).repeat(10).map(func)
+      expected_error = (
+          errors.InvalidArgumentError,
+          "Cannot place the graph because a reference or resource edge "
+          "connects colocation groups with incompatible assigned devices")
+      self.assertDatasetProduces(
+          dataset, expected_error=expected_error, requires_initialization=True)
 
   def testCaptureVariable(self):
     counter_var = variable_scope.get_variable(
         "counter", (), dtypes.int32, use_resource=True)
-    iterator = dataset_ops.make_initializable_iterator(
-        dataset_ops.Dataset.from_tensors(0).repeat(10)
-        .map(lambda _: counter_var.assign_add(1)))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    dataset = dataset_ops.Dataset.from_tensors(0).repeat(
+        10).map(lambda _: counter_var.assign_add(1))
+    get_next = self.getNext(dataset, requires_initialization=True)
 
-    with self.cached_session() as sess:
-      sess.run(counter_var.initializer)
-      sess.run(init_op)
-      for i in range(10):
-        self.assertEqual(i, sess.run(counter_var))
-        self.assertEqual(i + 1, sess.run(get_next))
-      self.assertEqual(10, sess.run(counter_var))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-      self.assertEqual(10, sess.run(counter_var))
+    self.evaluate(counter_var.initializer)
+
+    for i in range(10):
+      self.assertEqual(i, self.evaluate(counter_var))
+      self.assertEqual(i + 1, self.evaluate(get_next()))
+    self.assertEqual(10, self.evaluate(counter_var))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+    self.assertEqual(10, self.evaluate(counter_var))
 
-  def testCaptureUninitializedVariableError(self):
+  # TODO(b/117581999): error not captured for eager mode, debug.
+  @test_util.run_v1_only("b/120545219")
+  def testSkipEagerCaptureUninitializedVariableError(self):
     counter_var = variable_scope.get_variable(
         "counter", (), dtypes.int32, use_resource=True)
-    iterator = dataset_ops.make_initializable_iterator(
-        dataset_ops.Dataset.from_tensors(0).repeat(10)
-        .map(lambda _: counter_var.assign_add(1)))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    dataset = dataset_ops.Dataset.from_tensors(0).repeat(
+        10).map(lambda _: counter_var.assign_add(1))
 
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      with self.assertRaises(errors.NotFoundError):
-        sess.run(get_next)
+    get_next = self.getNext(dataset, requires_initialization=True)
 
-  def testSeededStatefulOperatorIsProperlyStateful(self):
-    iterator = dataset_ops.make_initializable_iterator(
-        dataset_ops.Dataset.from_tensors(0).repeat(10)
-        .map(lambda _: random_ops.random_uniform((), seed=11)).batch(2))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      random_values = []
-      with self.assertRaises(errors.OutOfRangeError):
-        while True:
-          random_values.extend(sess.run(get_next))
-      self.assertEqual(10, len(random_values))
-      self.assertGreater(np.abs(np.diff(random_values)).max(), 1e-6)
-      sess.run(init_op)
-      random_values_2 = []
-      with self.assertRaises(errors.OutOfRangeError):
-        while True:
-          random_values_2.extend(sess.run(get_next))
+    with self.assertRaises(errors.NotFoundError):
+      self.evaluate(get_next())
 
-      # Randomness is repeatable given same seed
-      self.assertAllClose(random_values, random_values_2)
+  def testSeededStatefulOperatorIsProperlyStateful(self):
+    dataset = dataset_ops.Dataset.from_tensors(0).repeat(
+        10).map(lambda _: random_ops.random_uniform((), seed=11)).batch(2)
+
+    get_next = self.getNext(dataset, requires_initialization=True)
+    random_values = []
+    with self.assertRaises(errors.OutOfRangeError):
+      while True:
+        random_values.extend(self.evaluate(get_next()))
+    self.assertLen(random_values, 10)
+    self.assertGreater(np.abs(np.diff(random_values)).max(), 1e-6)
+
+    get_next = self.getNext(dataset, requires_initialization=True)
+    random_values_2 = []
+    with self.assertRaises(errors.OutOfRangeError):
+      while True:
+        random_values_2.extend(self.evaluate(get_next()))
+
+    # Randomness is repeatable given same seed
+    self.assertAllClose(random_values, random_values_2)
 
   def testStatefulMapKeepsStateAcrossIterators(self):
-    iterator = dataset_ops.make_initializable_iterator(
-        dataset_ops.Dataset.from_tensors(0).repeat(10)
-        .map(lambda _: random_ops.random_uniform((), seed=11))
-        .repeat(1000)
-        .batch(10))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    dataset = dataset_ops.Dataset.from_tensors(0).repeat(10).map(
+        lambda _: random_ops.random_uniform((), seed=11)).repeat(1000).batch(10)
 
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      random_values = sess.run(get_next)
-
-      # Assert that one of the next 99 batches yielded by the iterator is
-      # different from the first.
-      i = 0
-      while i < 99:
-        if np.any(random_values != sess.run(get_next)):
-          break
-        i += 1
-      self.assertLess(i, 99)
+    get_next = self.getNext(dataset)
+    random_values = self.evaluate(get_next())
+
+    # Assert that one of the next 99 batches yielded by the iterator is
+    # different from the first.
+    i = 0
+    while i < 99:
+      if np.any(random_values != self.evaluate(get_next())):
+        break
+      i += 1
+    self.assertLess(i, 99)
 
   def testStatefulOperationInShortCircuit(self):
     counter_var = variable_scope.get_variable(
@@ -493,36 +534,25 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       counter_var.assign_add(1)
       return x
 
-    iterator = dataset_ops.make_initializable_iterator(
-        dataset_ops.Dataset.range(10).map(increment_fn))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    dataset = dataset_ops.Dataset.range(10).map(increment_fn)
 
-    with self.cached_session() as sess:
-      sess.run(counter_var.initializer)
-      sess.run(init_op)
-      for i in range(10):
-        self.assertEqual(i, sess.run(counter_var))
-        self.assertEqual(i, sess.run(get_next))
-      self.assertEqual(10, sess.run(counter_var))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-      self.assertEqual(10, sess.run(counter_var))
+    get_next = self.getNext(dataset, requires_initialization=True)
 
-  def testMapDict(self):
-    iterator = dataset_ops.make_initializable_iterator(
-        dataset_ops.Dataset.range(10)
-        .map(lambda x: {"foo": x * 2, "bar": x ** 2})
-        .map(lambda d: d["foo"] + d["bar"]))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    self.evaluate(counter_var.initializer)
+    for i in range(10):
+      self.assertEqual(i, self.evaluate(counter_var))
+      self.assertEqual(i, self.evaluate(get_next()))
+    self.assertEqual(10, self.evaluate(counter_var))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+    self.assertEqual(10, self.evaluate(counter_var))
 
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        self.assertEqual(i * 2 + i**2, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+  def testMapDict(self):
+    dataset = dataset_ops.Dataset.range(10).map(
+        lambda x: {"foo": x * 2, "bar": x**2}).map(
+            lambda d: d["foo"] + d["bar"])
+    self.assertDatasetProduces(
+        dataset, expected_output=[i * 2 + i**2 for i in range(10)])
 
   def testMapNamedtuple(self, count=10):
     # construct dataset of tuples
@@ -545,33 +575,23 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     dataset_tuple = dataset_tuple.map(preprocess_tuple)
     dataset_namedtuple = dataset_namedtuple.map(preprocess_namedtuple)
 
-    next_tuple = dataset_ops.make_one_shot_iterator(dataset_tuple).get_next()
-    next_namedtuple = dataset_ops.make_one_shot_iterator(
-        dataset_namedtuple).get_next()
+    next_tuple = self.getNext(dataset_tuple)
+    next_namedtuple = self.getNext(dataset_namedtuple)
 
     # make sure both datasets contain the same data
-    with self.cached_session() as sess:
-      for i in range(count):
-        tuple_, namedtuple_ = sess.run([next_tuple, next_namedtuple])
-        self.assertEqual(tuple_, namedtuple_)
-        self.assertEqual(tuple_, (i, -2 * i))
+    for i in range(count):
+      tuple_, namedtuple_ = self.evaluate([next_tuple(), next_namedtuple()])
+      self.assertEqual(tuple_, namedtuple_)
+      self.assertEqual(tuple_, (i, -2 * i))
 
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_namedtuple)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_namedtuple())
 
   def testUseStepContainerInMap(self):
     row = np.arange(6)
-    iterator = dataset_ops.make_initializable_iterator(
-        dataset_ops.Dataset.from_tensors(row)
-        .map(lambda elems: functional_ops.map_fn(lambda x: x * x, elems)))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual(row**2, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    dataset = dataset_ops.Dataset.from_tensors(
+        row).map(lambda elems: map_fn.map_fn(lambda x: x * x, elems))
+    self.assertDatasetProduces(dataset, expected_output=[row**2])
 
   def testCaseAndCondInMap(self):
 
@@ -599,24 +619,19 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
           pred_fn_pairs, default=multiply, exclusive=True)
 
     def build_dataset(row, num):
-      iterator = dataset_ops.make_initializable_iterator(
-          dataset_ops.Dataset.from_tensor_slices(row).map(
-              lambda x: control_map_fn(x, num)))
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-      return init_op, get_next
+      dataset = dataset_ops.Dataset.from_tensor_slices(
+          row).map(lambda x: control_map_fn(x, num))
+      return self.getNext(dataset)
 
-    with self.cached_session() as sess:
-      row = np.arange(6)
-      for num in [2, 3, 4]:
-        init_op, get_next = build_dataset(row, num)
-        sess.run(init_op)
-        for i in range(6):
-          self.assertEqual(
-              (i // 2 if i % 2 else i * 2) if (num == 2 or num == 3) else i * 2,
-              sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
+    row = np.arange(6)
+    for num in [2, 3, 4]:
+      get_next = build_dataset(row, num)
+      for i in range(6):
+        self.assertEqual(
+            (i // 2 if i % 2 else i * 2) if (num == 2 or num == 3) else i * 2,
+            self.evaluate(get_next()))
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(get_next())
 
   def testCaseInWhileInMap(self):
 
@@ -638,24 +653,19 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
     def build_dataset(row, num):
       # pylint: disable=g-long-lambda
-      iterator = dataset_ops.make_initializable_iterator(
-          dataset_ops.Dataset.from_tensors(row).map(
-              lambda elems: functional_ops.map_fn(
-                  lambda x: control_map_fn(x, num), elems)))
-      init_op = iterator.initializer
-      get_next = iterator.get_next()
-      return init_op, get_next
+      dataset = dataset_ops.Dataset.from_tensors(
+          row).map(lambda elems: map_fn.map_fn(
+              lambda x: control_map_fn(x, num), elems))
+      return self.getNext(dataset)
 
-    with self.cached_session() as sess:
-      row = np.arange(6)
-      for num in [2, 3, 4]:
-        init_op, get_next = build_dataset(row, num)
-        sess.run(init_op)
-        self.assertAllEqual(
-            [x // 2 if (num == 2 or num == 3) else x * 2 for x in row],
-            sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
+    row = np.arange(6)
+    for num in [2, 3, 4]:
+      get_next = build_dataset(row, num)
+      self.assertAllEqual(
+          [x // 2 if (num == 2 or num == 3) else x * 2 for x in row],
+          self.evaluate(get_next()))
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(get_next())
 
   def testCaseAndCondInWhileInMap(self):
 
@@ -685,21 +695,24 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     row = np.arange(6)
     num = 2
     # pylint: disable=g-long-lambda
-    iterator = dataset_ops.make_initializable_iterator(
-        dataset_ops.Dataset.from_tensors(row).map(
-            lambda elems: functional_ops.map_fn(
-                lambda x: control_map_fn(x, num), elems)))
+    dataset = dataset_ops.Dataset.from_tensors(
+        row).map(lambda elems: map_fn.map_fn(
+            lambda x: control_map_fn(x, num), elems))
     # pylint: enable=g-long-lambda
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    get_next = self.getNext(dataset)
 
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual([(x // 2 if x % 2 else x * 2) if
-                           (num == 2 or num == 3) else x * 2 for x in row],
-                          sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    self.assertAllEqual([(x // 2 if x % 2 else x * 2) if
+                         (num == 2 or num == 3) else x * 2 for x in row],
+                        self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+  def testNestedListMapDataset(self):
+    dataset = dataset_ops.Dataset.from_tensors(
+        [0, 1, 2]).repeat(10).map(lambda a: ([a[1], a[0] + a[2]], a[1]))
+
+    expected_output = [(np.array([1, 2]), 1)] * 10
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
 
   def testPrefetch(self):
     # We will use this event to test that `_map_py_func()` has been
@@ -717,58 +730,54 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     def _map_fn(x):
       return script_ops.py_func(_map_py_func, [x], x.dtype)
 
-    buffer_size_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = dataset_ops.make_initializable_iterator(
-        dataset_ops.Dataset.range(100)
-        .map(_map_fn)
-        .prefetch(buffer_size_placeholder))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    def do_test(buffer_size):
+      dataset = dataset_ops.Dataset.range(100).map(_map_fn).prefetch(
+          buffer_size)
 
-    with self.cached_session() as sess:
+      get_next = self.getNext(dataset)
       # Simple test that prefetch yields the expected values in the
       # expected order.
-      for buffer_size in [1, 10, 100, 1000]:
-        sess.run(init_op, feed_dict={buffer_size_placeholder: buffer_size})
-        for i in range(100):
-          self.assertEqual(i * i, sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-      # We can indirectly observe that varying the buffer size has the
-      # intended effect by observing when `ev` is set (on the 6th
-      # invocation of `_map_py_func()`).
-      # NOTE(mrry): We do not test with `buffer_size ==
-      # set_event_during_invocation`, because we must consume at least
-      # one element to start the prefetching.
-      for buffer_size in range(1, set_event_during_invocation):
-        event_will_be_set_after_consuming = (
-            set_event_during_invocation - buffer_size + 1)
-
-        ev.clear()
-        sess.run(init_op, feed_dict={buffer_size_placeholder: buffer_size})
-        for i in range(event_will_be_set_after_consuming):
-          self.assertFalse(ev.is_set())
-          self.assertEqual(i * i, sess.run(get_next))
-        ev.wait()
-        for i in range(event_will_be_set_after_consuming, 100):
-          self.assertEqual(i * i, sess.run(get_next))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
+      for i in range(100):
+        self.assertEqual(i * i, self.evaluate(get_next()))
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(get_next())
+
+    for buffer_size in [1, 10, 100, 1000]:
+      do_test(buffer_size)
+
+    # We can indirectly observe that varying the buffer size has the
+    # intended effect by observing when `ev` is set (on the 6th
+    # invocation of `_map_py_func()`).
+    # NOTE(mrry): We do not test with `buffer_size ==
+    # set_event_during_invocation`, because we must consume at least
+    # one element to start the prefetching.
+    def do_test_ev(buffer_size):
+      dataset = dataset_ops.Dataset.range(100).map(_map_fn).prefetch(
+          buffer_size)
+
+      get_next = self.getNext(dataset)
+
+      event_will_be_set_after_consuming = (
+          set_event_during_invocation - buffer_size + 1)
+
+      ev.clear()
+      for i in range(event_will_be_set_after_consuming):
+        self.assertFalse(ev.is_set())
+        self.assertEqual(i * i, self.evaluate(get_next()))
+      ev.wait()
+      for i in range(event_will_be_set_after_consuming, 100):
+        self.assertEqual(i * i, self.evaluate(get_next()))
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(get_next())
 
-  def testReturnList(self):
-    iterator = dataset_ops.make_initializable_iterator(
-        dataset_ops.Dataset.range(10)
-        .map(lambda x: [x, constant_op.constant(37.0)]))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    for buffer_size in range(1, set_event_during_invocation):
+      do_test_ev(buffer_size)
 
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        self.assertEqual((i, 37.0), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+  def testReturnList(self):
+    dataset = dataset_ops.Dataset.range(
+        10).map(lambda x: [x, constant_op.constant(37.0)])
+    self.assertDatasetProduces(
+        dataset, expected_output=[(i, 37.0) for i in range(10)])
 
   def testMultiOutputPyFunc(self):
     # The `tf.py_func()` op returns a list of tensors for its outputs.
@@ -778,17 +787,9 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       return script_ops.py_func(
           _map_py_func, [x_tensor], [dtypes.int64, dtypes.float64])
 
-    iterator = dataset_ops.make_initializable_iterator(
-        dataset_ops.Dataset.range(10).map(_map_fn))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        self.assertEqual((i, 37.0), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    dataset = dataset_ops.Dataset.range(10).map(_map_fn)
+    self.assertDatasetProduces(
+        dataset, expected_output=[(i, 37.0) for i in range(10)])
 
   def testSparse(self):
 
@@ -798,19 +799,9 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
           values=(i * np.array([1])),
           dense_shape=np.array([1, 1]))
 
-    iterator = dataset_ops.make_initializable_iterator(
-        dataset_ops.Dataset.range(10).map(_sparse))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        actual = sess.run(get_next)
-        self.assertIsInstance(actual, sparse_tensor.SparseTensorValue)
-        self.assertSparseValuesEqual(actual, _sparse(i))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    dataset = dataset_ops.Dataset.range(10).map(_sparse)
+    self.assertDatasetProduces(
+        dataset, expected_output=[_sparse(i) for i in range(10)])
 
   def testSparseChain(self):
 
@@ -824,20 +815,13 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       self.assertTrue(sparse_tensor.is_sparse(i))
       return sparse_ops.sparse_concat(0, [i, i])
 
-    iterator = dataset_ops.make_initializable_iterator(
-        dataset_ops.Dataset.range(10).map(_sparse).map(_check))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
+    dataset = dataset_ops.Dataset.range(10).map(_sparse).map(_check)
 
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        actual = sess.run(get_next)
-        self.assertIsInstance(actual, sparse_tensor.SparseTensorValue)
-        self.assertSparseValuesEqual(actual, _check(_sparse(i)).eval())
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[self.evaluate(_check(_sparse(i))) for i in range(10)])
 
+  @test_util.run_v1_only("b/123904513")
   def testParallelMapOutOfRangeError(self):
     def raising_py_func(i):
       if i == 100:
@@ -845,32 +829,18 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       else:
         return i
 
-    iterator = dataset_ops.make_initializable_iterator(
-        dataset_ops.Dataset.range(105)
-        .map(lambda x: script_ops.py_func(raising_py_func, [x], dtypes.int64),
-             num_parallel_calls=2))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(100):
-        self.assertEqual(i, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    dataset = dataset_ops.Dataset.range(105).map(
+        lambda x: script_ops.py_func(raising_py_func, [x], dtypes.int64),
+        num_parallel_calls=2)
+    get_next = self.getNext(dataset)
+    for i in range(100):
+      self.assertEqual(i, self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   def testConstantOutput(self):
-    iterator = dataset_ops.make_initializable_iterator(
-        dataset_ops.Dataset.range(10).map(lambda x: [x, "hello", 10]))
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        self.assertEqual((i, b"hello", 10), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    dataset = dataset_ops.Dataset.range(10).map(lambda x: [x, "hello", 10])
+    self.assertDatasetProduces(dataset, [(i, b"hello", 10) for i in range(10)])
 
   def testWarnOnLookupTable(self):
     def collecting_function(x):
@@ -899,7 +869,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
         dataset_ops.Dataset.from_tensor_slices).map(
             lambda ds: ds.batch(3)).flat_map(lambda x: x)
 
-    self.assertDatasetProduces(dataset, [[1.0, 2.0, 3.0]])
+    self.assertDatasetProduces(dataset, expected_output=[[1.0, 2.0, 3.0]])
 
   def testReturnValueError(self):
     dataset = dataset_ops.Dataset.from_tensors([1.0, 2.0, 3.0])
@@ -932,11 +902,8 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       return const_tensor
 
     dataset = dataset.map(broken_function)
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-
-    with self.cached_session() as sess:
-      with self.assertRaisesRegexp(errors.InvalidArgumentError, "BrokenConst"):
-        sess.run(iterator.initializer)
+    self.assertDatasetProduces(
+        dataset, expected_error=(errors.InvalidArgumentError, "BrokenConst"))
 
 # pylint: disable=g-long-lambda
   @parameterized.named_parameters(
@@ -959,12 +926,10 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       return tids
 
     dataset = make_dataset_fn(dataset, _map_fn)
-    iterator = dataset_ops.make_one_shot_iterator(dataset)
-    get_next = iterator.get_next()
+    get_next = self.getNext(dataset)
 
-    with self.cached_session() as sess:
-      tids = sess.run(get_next)
-      self.assertTrue(all(tids[0] == tid for tid in tids))
+    tids = self.evaluate(get_next())
+    self.assertTrue(all(tids[0] == tid for tid in tids))
 # pylint: enable=g-long-lambda
 
   @parameterized.named_parameters(
@@ -980,29 +945,26 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
   def testShortCircuit(self, structure, map_fn, num_parallel_calls):
     dataset = self.structuredDataset(structure).repeat().map(
         map_fn, num_parallel_calls=num_parallel_calls)
-    get_next = dataset_ops.make_one_shot_iterator(dataset).get_next()
+    get_next = self.getNext(dataset)
 
-    with self.cached_session() as sess:
-      if isinstance(structure, tuple):
-        expected = map_fn(*sess.run(self.structuredElement(structure)))
-      else:
-        expected = map_fn(sess.run(self.structuredElement(structure)))
-      self.assertEqual(expected, sess.run(get_next))
+    if isinstance(structure, tuple):
+      expected = map_fn(*self.evaluate(self.structuredElement(structure)))
+    else:
+      expected = map_fn(self.evaluate(self.structuredElement(structure)))
+    self.assertEqual(expected, self.evaluate(get_next()))
 
   @parameterized.named_parameters(
       ("Sequential", None),
       ("Parallel", 10),
   )
   def testShortCircuitCapturedInput(self, num_parallel_calls):
-    captured_t = array_ops.placeholder(dtypes.int64, shape=[])
+    captured_t = variables.Variable(42)
     dataset = self.structuredDataset(None).repeat().map(
         lambda x: captured_t, num_parallel_calls=num_parallel_calls)
-    iterator = dataset_ops.make_initializable_iterator(dataset)
-    get_next = iterator.get_next()
+    self.evaluate(variables.global_variables_initializer())
+    get_next = self.getNext(dataset, requires_initialization=True)
 
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer, feed_dict={captured_t: 42})
-      self.assertEqual(42, sess.run(get_next))
+    self.assertEqual(42, self.evaluate(get_next()))
 
   @parameterized.named_parameters(
       ("1", 1, 1),
@@ -1013,17 +975,19 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       ("6", 100, 100),
   )
   def testSloppyInterleaveInOrder(self, num_elements, num_parallel_calls):
-    get_next, coordination_events = _make_coordinated_sloppy_dataset(
+    dataset, coordination_events = _make_coordinated_sloppy_dataset(
         num_elements, num_parallel_calls)
-    config = config_pb2.ConfigProto(
-        inter_op_parallelism_threads=num_parallel_calls + 1,
-        use_per_session_threads=True)
-    with self.cached_session(config=config) as sess:
-      for i in range(num_elements):
-        coordination_events[i].set()
-        self.assertEqual(i * i, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    options = dataset_ops.Options()
+    options.experimental_threading = threading_options.ThreadingOptions()
+    options.experimental_threading.private_threadpool_size = (
+        num_parallel_calls + 1)
+    dataset = dataset.with_options(options)
+    get_next = self.getNext(dataset, requires_initialization=True)
+    for i in range(num_elements):
+      coordination_events[i].set()
+      self.assertEqual(i * i, self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   @parameterized.named_parameters(
       ("1", 10, 10),
@@ -1031,21 +995,25 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       ("3", 100, 100),
   )
   def testSloppyInterleaveOutOfOrder(self, num_elements, num_parallel_calls):
-    get_next, coordination_events = _make_coordinated_sloppy_dataset(
+    dataset, coordination_events = _make_coordinated_sloppy_dataset(
         num_elements, num_parallel_calls)
-    config = config_pb2.ConfigProto(
-        inter_op_parallelism_threads=num_parallel_calls + 1,
-        use_per_session_threads=True)
-    with self.cached_session(config=config) as sess:
-      elements = [x for x in range(num_elements)]
-      for i in [1, 4, 7]:
-        elements[i], elements[i + 1] = elements[i + 1], elements[i]
-
-      for element in elements:
-        coordination_events[element].set()
-        self.assertEqual(element * element, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    options = dataset_ops.Options()
+    options.experimental_threading = threading_options.ThreadingOptions()
+    options.experimental_threading.private_threadpool_size = (
+        num_parallel_calls + 1)
+    dataset = dataset.with_options(options)
+
+    get_next = self.getNext(dataset, requires_initialization=True)
+
+    elements = [x for x in range(num_elements)]
+    for i in [1, 4, 7]:
+      elements[i], elements[i + 1] = elements[i + 1], elements[i]
+
+    for element in elements:
+      coordination_events[element].set()
+      self.assertEqual(element * element, self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   @parameterized.named_parameters(
       ("Map", None),
@@ -1063,7 +1031,18 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     with self.assertRaises(errors.InvalidArgumentError):
       self.evaluate(get_next())
 
+  # NOTE: collection test is specific to graph mode only, no eager coverage.
+  @test_util.run_v1_only("graph specific test")
+  def testSkipEagerCollectionCopy(self):
+    w = variable_scope.get_variable("w", [])
+    self.assertIn(w, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
+
+    def func(x):
+      self.assertIn(w, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
+      return x
 
+    dataset = dataset_ops.Dataset.from_tensors(constant_op.constant(1.0))
+    dataset.map(func)
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/kernel_tests/multi_device_iterator_test.py b/tensorflow/python/data/kernel_tests/multi_device_iterator_test.py
index 0322d1f2c604c3f9588eb8eaa39eb9829bb0a26e..a040a64013602b23ddeeb315a2fe3035f5f9be1d 100644
--- a/tensorflow/python/data/kernel_tests/multi_device_iterator_test.py
+++ b/tensorflow/python/data/kernel_tests/multi_device_iterator_test.py
@@ -18,111 +18,196 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import time
+from absl.testing import parameterized
+import six
+
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.data.experimental.ops import optimization
-from tensorflow.python.data.experimental.ops.optimization_options import OptimizationOptions
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import multi_device_iterator_ops
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
-
-
-# TODO(b/117581999): Add eager coverage.
-class MultiDeviceIteratorTest(test_base.DatasetTestBase):
-
-  @test_util.run_v1_only("b/120545219")
-  def testNoGetNext(self):
+from tensorflow.python.platform import tf_logging as logging
+
+
+# memory_profiler might not be available in the OSS version of TensorFlow.
+try:
+  import memory_profiler  # pylint:disable=g-import-not-at-top
+except ImportError:
+  memory_profiler = None
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class MultiDeviceIteratorTest(test_base.DatasetTestBase,
+                              parameterized.TestCase):
+
+  def assertNotIncreasingMemory(self,
+                                f,
+                                num_iters=100000,
+                                increase_threshold_absolute_mb=10):
+    """Assert memory usage doesn't increase beyond given threshold for f."""
+
+    with context.eager_mode():
+      # Warm up.
+      f()
+
+      # Wait for background threads to start up and take over memory.
+      # FIXME: The nature of this test leaves few other options. Maybe there
+      # is a better way to do this.
+      time.sleep(4)
+      initial = memory_profiler.memory_usage(-1)[0]
+      for _ in six.moves.range(num_iters):
+        f()
+      increase = memory_profiler.memory_usage(-1)[0] - initial
+      logging.info("Memory increase observed: %f MB" % increase)
+      assert increase < increase_threshold_absolute_mb, (
+          "Increase is too high. Initial memory usage: %f MB. Increase: %f MB. "
+          "Maximum allowed increase: %f") % (initial, increase,
+                                             increase_threshold_absolute_mb)
+
+  @parameterized.parameters(0, 1, 42,)
+  @test_util.run_v1_only("b/121264236")
+  def testInitOnly(self, num_inits):
     dataset = dataset_ops.Dataset.range(10)
     multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
         dataset, ["/cpu:1", "/cpu:2"])
 
     config = config_pb2.ConfigProto(device_count={"CPU": 3})
-    with self.test_session(config=config) as sess:
-      self.evaluate(multi_device_iterator.initializer)
+    with self.test_session(config=config):
+      for _ in range(num_inits):
+        self.evaluate(multi_device_iterator.initializer)
 
-  @test_util.run_v1_only("b/120545219")
+  @test_util.run_v1_only("b/121264236")
   def testBasic(self):
     dataset = dataset_ops.Dataset.range(10)
     multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
         dataset, ["/cpu:1", "/cpu:2"])
-    elem_on_1, elem_on_2 = multi_device_iterator.get_next()
 
     config = config_pb2.ConfigProto(device_count={"CPU": 3})
-    with self.test_session(config=config) as sess:
+    with self.test_session(config=config):
       self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 10, 2):
+        elem_on_1, elem_on_2 = multi_device_iterator.get_next()
         self.assertEqual(i, self.evaluate(elem_on_1))
         self.assertEqual(i + 1, self.evaluate(elem_on_2))
       with self.assertRaises(errors.OutOfRangeError):
+        elem_on_1, elem_on_2 = multi_device_iterator.get_next()
         self.evaluate(elem_on_1)
         self.evaluate(elem_on_2)
 
-  @test_util.run_v1_only("b/120545219")
+  @test_util.run_v1_only("b/121264236")
+  def testEagerMemoryUsageWithReset(self):
+    if not context.executing_eagerly():
+      self.skipTest("Only eager mode test")
+    if memory_profiler is None:
+      self.skipTest("memory_profiler required to run this test")
+
+    dataset = dataset_ops.Dataset.range(10)
+    multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
+        dataset, ["/cpu:1", "/cpu:2"])
+
+    def f():
+      self.evaluate(multi_device_iterator.get_next())
+      multi_device_iterator._eager_reset()
+
+    self.assertNotIncreasingMemory(
+        f, num_iters=100, increase_threshold_absolute_mb=50)
+
+  @test_util.run_v1_only("b/121264236")
+  def testEagerMemoryUsageWithRecreation(self):
+    if not context.executing_eagerly():
+      self.skipTest("Only eager mode test")
+    if memory_profiler is None:
+      self.skipTest("memory_profiler required to run this test")
+
+    dataset = dataset_ops.Dataset.range(10)
+
+    def f():
+      multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
+          dataset, ["/cpu:1", "/cpu:2"])
+      self.evaluate(multi_device_iterator.get_next())
+      del multi_device_iterator
+
+    # TODO(b/123316347): Reduce threshold once bug is fixed.
+    self.assertNotIncreasingMemory(
+        f, num_iters=100, increase_threshold_absolute_mb=500)
+
+  @test_util.run_v1_only("b/121264236")
   def testOneOnSameDevice(self):
     with ops.device("/cpu:0"):
       dataset = dataset_ops.Dataset.range(10)
     multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
         dataset, ["/cpu:0", "/cpu:1"])
-    elem_on_1, elem_on_2 = multi_device_iterator.get_next()
 
     config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=config) as sess:
+    with self.test_session(config=config):
       self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 10, 2):
+        elem_on_1, elem_on_2 = multi_device_iterator.get_next()
         self.assertEqual(i, self.evaluate(elem_on_1))
         self.assertEqual(i + 1, self.evaluate(elem_on_2))
       with self.assertRaises(errors.OutOfRangeError):
+        elem_on_1, elem_on_2 = multi_device_iterator.get_next()
         self.evaluate(elem_on_1)
         self.evaluate(elem_on_2)
 
-  @test_util.run_v1_only("b/120545219")
+  @test_util.run_v1_only("b/121264236")
   def testRepeatDevices(self):
     with ops.device("/cpu:0"):
       dataset = dataset_ops.Dataset.range(20)
     multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
         dataset, ["/cpu:1", "/cpu:2", "/cpu:1", "/cpu:2"])
-    elements = multi_device_iterator.get_next()
-    elem_on_1, elem_on_2, elem_on_3, elem_on_4 = elements
 
     config = config_pb2.ConfigProto(device_count={"CPU": 3})
-    with self.test_session(config=config) as sess:
+    with self.test_session(config=config):
       self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 20, 4):
+        elements = multi_device_iterator.get_next()
+        elem_on_1, elem_on_2, elem_on_3, elem_on_4 = elements
         self.assertEqual(i, self.evaluate(elem_on_1))
         self.assertEqual(i + 1, self.evaluate(elem_on_2))
         self.assertEqual(i + 2, self.evaluate(elem_on_3))
         self.assertEqual(i + 3, self.evaluate(elem_on_4))
       with self.assertRaises(errors.OutOfRangeError):
+        elements = multi_device_iterator.get_next()
+        elem_on_1, elem_on_2, elem_on_3, elem_on_4 = elements
         self.evaluate(elem_on_1)
         self.evaluate(elem_on_2)
         self.evaluate(elem_on_3)
         self.evaluate(elem_on_4)
 
-  @test_util.run_v1_only("b/120545219")
+  @test_util.run_v1_only("b/121264236")
   def testNotFullyDivisible(self):
     dataset = dataset_ops.Dataset.range(9)
     multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
         dataset, ["/cpu:1", "/cpu:2"])
-    elem_on_1, elem_on_2 = multi_device_iterator.get_next()
 
     config = config_pb2.ConfigProto(device_count={"CPU": 3})
-    with self.test_session(config=config) as sess:
+    with self.test_session(config=config):
       self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 8, 2):
+        elem_on_1, elem_on_2 = multi_device_iterator.get_next()
         self.assertEqual(i, self.evaluate(elem_on_1))
         self.assertEqual(i + 1, self.evaluate(elem_on_2))
+      elem_on_1 = multi_device_iterator.get_next("/cpu:1")
       self.assertEqual(8, self.evaluate(elem_on_1))
       with self.assertRaises(errors.OutOfRangeError):
+        elem_on_1, elem_on_2 = multi_device_iterator.get_next()
         self.evaluate(elem_on_1)
         self.evaluate(elem_on_2)
 
-  @test_util.run_v1_only("b/120545219")
+  @test_util.run_v1_only("b/121264236")
   def testGetNextAsOptional(self):
+    if context.executing_eagerly():
+      return
+
     dataset = dataset_ops.Dataset.range(9)
     multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
         dataset, ["/cpu:1", "/cpu:2"])
@@ -155,26 +240,31 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.InvalidArgumentError):
         self.evaluate(elem_on_2_t)
 
-  @test_util.run_v1_only("b/120545219")
+  @test_util.run_v1_only("b/121264236")
   def testUneven(self):
     dataset = dataset_ops.Dataset.range(10)
     multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
         dataset, ["/cpu:1", "/cpu:2"], max_buffer_size=4)
-    elem_on_1, elem_on_2 = multi_device_iterator.get_next()
 
     config = config_pb2.ConfigProto(device_count={"CPU": 3})
-    with self.test_session(config=config) as sess:
+    with self.test_session(config=config):
       self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 10, 2):
+        elem_on_1 = multi_device_iterator.get_next("/cpu:1")
         self.assertEqual(i, self.evaluate(elem_on_1))
       for i in range(0, 10, 2):
+        elem_on_2 = multi_device_iterator.get_next("/cpu:2")
         self.assertEqual(i + 1, self.evaluate(elem_on_2))
       with self.assertRaises(errors.OutOfRangeError):
+        elem_on_1, elem_on_2 = multi_device_iterator.get_next()
         self.evaluate(elem_on_1)
         self.evaluate(elem_on_2)
 
-  @test_util.run_v1_only("b/120545219")
-  def testMultipleInitializations(self):
+  @test_util.run_v1_only("b/121264236")
+  def testMultipleInitializationsGraph(self):
+    if context.executing_eagerly():
+      return
+
     with ops.device("/cpu:0"):
       epoch = array_ops.placeholder(dtypes.int64, shape=[])
       dataset1 = dataset_ops.Dataset.from_tensors(epoch).repeat(1000)
@@ -192,6 +282,23 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
         self.assertEqual([(i, 0), (i, 1)], self.evaluate([elem_on_1,
                                                           elem_on_2]))
 
+  @test_util.run_v1_only("b/121264236")
+  def testMultipleInitializationsEager(self):
+    if not context.executing_eagerly():
+      return
+
+    with ops.device("/cpu:0"):
+      dataset1 = dataset_ops.Dataset.range(1000)
+      dataset2 = dataset_ops.Dataset.range(1000)
+      dataset = dataset_ops.Dataset.zip((dataset1, dataset2))
+
+    for _ in range(5):
+      multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
+          dataset, ["/cpu:1", "/cpu:2"], prefetch_buffer_size=4)
+      elem_on_1, elem_on_2 = multi_device_iterator.get_next()
+      self.assertEqual([(0, 0), (1, 1)], self.evaluate([elem_on_1, elem_on_2]))
+
+  @test_util.run_v1_only("b/121264236")
   def testBasicGpu(self):
     if not test_util.is_gpu_available():
       self.skipTest("No GPU available")
@@ -199,18 +306,20 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
     dataset = dataset_ops.Dataset.range(10)
     multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
         dataset, ["/cpu:1", "/gpu:0"])
-    elem_on_1, elem_on_2 = multi_device_iterator.get_next()
 
     config = config_pb2.ConfigProto(device_count={"CPU": 2, "GPU": 1})
-    with self.test_session(config=config) as sess:
+    with self.test_session(config=config):
       self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 10, 2):
+        elem_on_1, elem_on_2 = multi_device_iterator.get_next()
         self.assertEqual(i, self.evaluate(elem_on_1))
         self.assertEqual(i + 1, self.evaluate(elem_on_2))
       with self.assertRaises(errors.OutOfRangeError):
+        elem_on_1, elem_on_2 = multi_device_iterator.get_next()
         self.evaluate(elem_on_1)
         self.evaluate(elem_on_2)
 
+  @test_util.run_v1_only("b/121264236")
   def testUnevenGpu(self):
     if not test_util.is_gpu_available():
       self.skipTest("No GPU available")
@@ -218,21 +327,24 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
     dataset = dataset_ops.Dataset.range(10)
     multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
         dataset, ["/cpu:1", "/gpu:0"], max_buffer_size=4)
-    elem_on_1, elem_on_2 = multi_device_iterator.get_next()
 
     config = config_pb2.ConfigProto(device_count={"CPU": 2, "GPU": 1})
-    with self.test_session(config=config) as sess:
+    with self.test_session(config=config):
       self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 10, 2):
+        elem_on_1 = multi_device_iterator.get_next("/cpu:1")
         self.assertEqual(i, self.evaluate(elem_on_1))
       for i in range(0, 10, 2):
+        elem_on_2 = multi_device_iterator.get_next("/gpu:0")
         self.assertEqual(i + 1, self.evaluate(elem_on_2))
       with self.assertRaises(errors.OutOfRangeError):
+        elem_on_1, elem_on_2 = multi_device_iterator.get_next()
         self.evaluate(elem_on_1)
         self.evaluate(elem_on_2)
 
+  @test_util.run_v1_only("b/121264236")
   def testGetNextAsOptionalGpu(self):
-    if not test_util.is_gpu_available():
+    if not test_util.is_gpu_available() or context.executing_eagerly():
       self.skipTest("No GPU available")
 
     dataset = dataset_ops.Dataset.range(9)
@@ -267,7 +379,7 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.InvalidArgumentError):
         self.evaluate(elem_on_2_t)
 
-  @test_util.run_v1_only("b/120545219")
+  @test_util.run_v1_only("b/121264236")
   def testOptimization(self):
     dataset = dataset_ops.Dataset.range(10)
     dataset = dataset.apply(optimization.assert_next(["MemoryCacheImpl"]))
@@ -275,24 +387,26 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
     dataset = dataset.cache()
 
     options = dataset_ops.Options()
-    options.experimental_optimization = OptimizationOptions()
     options.experimental_optimization.noop_elimination = True
     dataset = dataset.with_options(options)
 
     multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
         dataset, ["/cpu:1", "/cpu:2"])
-    elem_on_1, elem_on_2 = multi_device_iterator.get_next()
 
     config = config_pb2.ConfigProto(device_count={"CPU": 3})
-    with self.test_session(config=config) as sess:
+    with self.test_session(config=config):
       self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 10, 2):
+        elem_on_1, elem_on_2 = multi_device_iterator.get_next()
         self.assertEqual(i, self.evaluate(elem_on_1))
         self.assertEqual(i + 1, self.evaluate(elem_on_2))
       with self.assertRaises(errors.OutOfRangeError):
+        elem_on_1, elem_on_2 = multi_device_iterator.get_next()
         self.evaluate(elem_on_1)
         self.evaluate(elem_on_2)
 
 
 if __name__ == "__main__":
+  ops.enable_eager_execution(
+      config=config_pb2.ConfigProto(device_count={"CPU": 3, "GPU": 1}))
   test.main()
diff --git a/tensorflow/python/data/kernel_tests/optional_test.py b/tensorflow/python/data/kernel_tests/optional_test.py
index c2c62e9423e6e082fd6fc42668e2827cc06246e1..4fde0aa002b82a3f076eff21d7bc9e5b2b4aee33 100644
--- a/tensorflow/python/data/kernel_tests/optional_test.py
+++ b/tensorflow/python/data/kernel_tests/optional_test.py
@@ -25,6 +25,7 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.data.ops import optional_ops
 from tensorflow.python.data.util import structure
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -74,7 +75,6 @@ class OptionalTest(test_base.DatasetTestBase, parameterized.TestCase):
       self.assertAllEqual(expected.dense_shape,
                           self.evaluate(actual.dense_shape))
 
-  @test_util.run_deprecated_v1
   def testFromNone(self):
     value_structure = structure.TensorStructure(dtypes.float32, [])
     opt = optional_ops.Optional.none_from_structure(value_structure)
@@ -268,9 +268,7 @@ class OptionalTest(test_base.DatasetTestBase, parameterized.TestCase):
        optional_ops.OptionalStructure(
            structure.TensorStructure(dtypes.float32, []))),
   )
-  @test_util.run_deprecated_v1
-  def testSkipEagerOptionalStructure(self, tf_value_fn,
-                                     expected_value_structure):
+  def testOptionalStructure(self, tf_value_fn, expected_value_structure):
     tf_value = tf_value_fn()
     opt = optional_ops.Optional.from_value(tf_value)
 
@@ -305,6 +303,7 @@ class OptionalTest(test_base.DatasetTestBase, parameterized.TestCase):
       self.assertEqual(
           self.evaluate(tf_value), self.evaluate(round_trip_opt.get_value()))
 
+  # NOTE: This test is specific to graph mode and is skipped in eager mode.
   @parameterized.named_parameters(
       ("Tensor", np.array([1, 2, 3], dtype=np.int32),
        lambda: constant_op.constant([4, 5, 6], dtype=dtypes.int32), True),
@@ -330,7 +329,7 @@ class OptionalTest(test_base.DatasetTestBase, parameterized.TestCase):
     if not works_on_gpu and test.is_gpu_available():
       self.skipTest("Test case not yet supported on GPU.")
     ds = dataset_ops.Dataset.from_tensors(np_value).repeat(3)
-    iterator = ds.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(ds)
     next_elem = iterator_ops.get_next_as_optional(iterator)
     self.assertIsInstance(next_elem, optional_ops.Optional)
     self.assertTrue(
@@ -361,6 +360,25 @@ class OptionalTest(test_base.DatasetTestBase, parameterized.TestCase):
         with self.assertRaises(errors.InvalidArgumentError):
           sess.run(elem_value_t)
 
+  def testFunctionBoundaries(self):
+    @def_function.function
+    def get_optional():
+      x = constant_op.constant(1.0)
+      opt = optional_ops.Optional.from_value(x)
+      # TODO(skyewm): support returning Optionals from functions?
+      return opt._variant_tensor
+
+    # TODO(skyewm): support Optional arguments?
+    @def_function.function
+    def consume_optional(opt_tensor):
+      value_structure = structure.TensorStructure(dtypes.float32, [])
+      opt = optional_ops._OptionalImpl(opt_tensor, value_structure)
+      return opt.get_value()
+
+    opt_tensor = get_optional()
+    val = consume_optional(opt_tensor)
+    self.assertEqual(self.evaluate(val), 1.0)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/kernel_tests/options_test.py b/tensorflow/python/data/kernel_tests/options_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5bad3e7ae58885a5d013b0dc0f9dec41e0204c8
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/options_test.py
@@ -0,0 +1,96 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Options`."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.experimental.ops import optimization_options
+from tensorflow.python.data.experimental.ops import stats_options
+from tensorflow.python.data.experimental.ops import threading_options
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.platform import test
+
+
+class OptionsTest(test_base.DatasetTestBase):
+
+  def testOptionsDefault(self):
+    ds = dataset_ops.Dataset.range(0)
+    self.assertEqual(dataset_ops.Options(), ds.options())
+
+  def testOptionsOnce(self):
+    options = dataset_ops.Options()
+    ds = dataset_ops.Dataset.range(0).with_options(options).cache()
+    self.assertEqual(options, ds.options())
+
+  def testOptionsTwiceSame(self):
+    options = dataset_ops.Options()
+    options.experimental_autotune = True
+    ds = dataset_ops.Dataset.range(0).with_options(options).with_options(
+        options)
+    self.assertEqual(options, ds.options())
+
+  def testOptionsTwiceDifferent(self):
+    options1 = dataset_ops.Options()
+    options1.experimental_autotune = True
+    options2 = dataset_ops.Options()
+    options2.experimental_deterministic = False
+    ds = dataset_ops.Dataset.range(0).with_options(options1).with_options(
+        options2)
+    self.assertTrue(ds.options().experimental_autotune)
+    # Explicitly check that flag is False since assertFalse allows None
+    self.assertIs(ds.options().experimental_deterministic, False)
+
+  def testOptionsTwiceDifferentError(self):
+    options1 = dataset_ops.Options()
+    options1.experimental_autotune = True
+    options2 = dataset_ops.Options()
+    options2.experimental_autotune = False
+    with self.assertRaisesRegexp(ValueError,
+                                 "Cannot merge incompatible values"):
+      dataset_ops.Dataset.range(0).with_options(options1).with_options(options2)
+
+  def testOptionsMergeOptionsFromMultipleInputs(self):
+    options1 = dataset_ops.Options()
+    options1.experimental_autotune = True
+    options2 = dataset_ops.Options()
+    options2.experimental_deterministic = True
+    ds = dataset_ops.Dataset.zip(
+        (dataset_ops.Dataset.range(0).with_options(options1),
+         dataset_ops.Dataset.range(0).with_options(options2)))
+    self.assertTrue(ds.options().experimental_autotune)
+    self.assertTrue(ds.options().experimental_deterministic)
+
+  def testOptionsHaveDefaults(self):
+    options1 = dataset_ops.Options()
+    options2 = dataset_ops.Options()
+    self.assertIsNot(options1.experimental_optimization,
+                     options2.experimental_optimization)
+    self.assertIsNot(options1.experimental_stats,
+                     options2.experimental_stats)
+    self.assertIsNot(options1.experimental_threading,
+                     options2.experimental_threading)
+    self.assertEquals(options1.experimental_optimization,
+                      optimization_options.OptimizationOptions())
+    self.assertEquals(options1.experimental_stats,
+                      stats_options.StatsOptions())
+    self.assertEquals(options1.experimental_threading,
+                      threading_options.ThreadingOptions())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/padded_batch_test.py b/tensorflow/python/data/kernel_tests/padded_batch_test.py
index dcfb2f507bf1a7d91041eb5f24c95c6de2c18362..042af7a6f9fb19b25fd9b01c509ed267833720f9 100644
--- a/tensorflow/python/data/kernel_tests/padded_batch_test.py
+++ b/tensorflow/python/data/kernel_tests/padded_batch_test.py
@@ -156,6 +156,7 @@ class PaddedBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     next_element = self.getNext(padded_dataset)
     self.evaluate(next_element())
 
+  # NOTE: This test is specific to graph mode and is skipped in eager mode.
   @test_util.run_deprecated_v1
   def testSkipEagerPaddedBatchDatasetShapeSpecifications(self):
     int_placeholder = array_ops.placeholder(dtypes.int32)
@@ -228,6 +229,7 @@ class PaddedBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       _ = dataset_ops.Dataset.range(10).padded_batch(
           5, padded_shapes=shape_as_tensor)
 
+  # NOTE: This test is specific to graph mode and is skipped in eager mode.
   @test_util.run_deprecated_v1
   def testSkipEagerPaddedBatchShapeError(self):
     with self.assertRaisesRegexp(
diff --git a/tensorflow/python/data/kernel_tests/prefetch_test.py b/tensorflow/python/data/kernel_tests/prefetch_test.py
index a143ba0ac63d42667faa4cfdee6fa74cf0a82f57..8d076f6e685b9da6403ab3ad1680fb5f3dcf8550 100644
--- a/tensorflow/python/data/kernel_tests/prefetch_test.py
+++ b/tensorflow/python/data/kernel_tests/prefetch_test.py
@@ -36,9 +36,10 @@ class PrefetchTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   @parameterized.parameters((-2), (-42))
   def testInvalidBufferSize(self, buffer_size):
-    dataset = dataset_ops.Dataset.range(10).prefetch(buffer_size=buffer_size)
-    self.assertDatasetProduces(
-        dataset, expected_error=(errors.InvalidArgumentError, "buffer_size"))
+    with self.assertRaises(errors.InvalidArgumentError):
+      dataset = dataset_ops.Dataset.range(10).prefetch(buffer_size=buffer_size)
+      self.evaluate(dataset._variant_tensor)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/kernel_tests/range_test.py b/tensorflow/python/data/kernel_tests/range_test.py
index 3f5d25e7f3959eed70754db827052a91fd224dbc..b7ac60c3fff162a85ab0702418cb7eb45dfb5aad 100644
--- a/tensorflow/python/data/kernel_tests/range_test.py
+++ b/tensorflow/python/data/kernel_tests/range_test.py
@@ -43,9 +43,9 @@ class RangeTest(test_base.DatasetTestBase):
 
   def testZeroStep(self):
     start, stop, step = 2, 10, 0
-    dataset = dataset_ops.Dataset.range(start, stop, step)
-    self.assertDatasetProduces(
-        dataset, expected_error=(errors.InvalidArgumentError, ""))
+    with self.assertRaises(errors.InvalidArgumentError):
+      dataset = dataset_ops.Dataset.range(start, stop, step)
+      self.evaluate(dataset._variant_tensor)
 
   def testNegativeStep(self):
     start, stop, step = 2, 10, -1
diff --git a/tensorflow/python/data/kernel_tests/reduce_test.py b/tensorflow/python/data/kernel_tests/reduce_test.py
index 14bbc0bf72caa07445ca7d077845e2bc4569cc01..846d9a6cef9cd362eca269fa44824436766afa2a 100644
--- a/tensorflow/python/data/kernel_tests/reduce_test.py
+++ b/tensorflow/python/data/kernel_tests/reduce_test.py
@@ -22,12 +22,14 @@ import numpy as np
 
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
@@ -68,6 +70,7 @@ class ReduceTest(test_base.DatasetTestBase, parameterized.TestCase):
       self.assertEqual(((i + 1) * i) // 2, s)
       self.assertEqual(i, c)
 
+  # NOTE: This test is specific to graph mode and is skipped in eager mode.
   @test_util.run_deprecated_v1
   def testSkipEagerSquareUsingPlaceholder(self):
     delta = array_ops.placeholder(dtype=dtypes.int64)
@@ -122,6 +125,71 @@ class ReduceTest(test_base.DatasetTestBase, parameterized.TestCase):
       self.assertEqual(((i + 1) * i) // 2, result["dense"])
       self.assertSparseValuesEqual(make_sparse_fn(i), result["sparse"])
 
+  def testDatasetSideEffect(self):
+    counter_var = variables.Variable(0)
+
+    def increment_fn(x):
+      counter_var.assign_add(1)
+      return x
+
+    def dataset_fn():
+      return dataset_ops.Dataset.range(10).map(increment_fn)
+
+    def reduce_fn(state, value):
+      return state + value
+
+    @function.defun
+    def fn():
+      _ = dataset_fn().reduce(np.int64(0), reduce_fn)
+      return "hello"
+
+    self.evaluate(counter_var.initializer)
+    self.assertEqual(self.evaluate(fn()), b"hello")
+    self.assertEqual(self.evaluate(counter_var), 10)
+
+  def testSideEffect(self):
+    counter_var = variables.Variable(0)
+
+    def dataset_fn():
+      return dataset_ops.Dataset.range(10)
+
+    def reduce_fn(state, value):
+      counter_var.assign_add(1)
+      return state + value
+
+    @function.defun
+    def fn():
+      _ = dataset_fn().reduce(np.int64(0), reduce_fn)
+      return "hello"
+
+    self.evaluate(counter_var.initializer)
+    self.assertEqual(self.evaluate(fn()), b"hello")
+    self.assertEqual(self.evaluate(counter_var), 10)
+
+  def testAutomaticControlDependencies(self):
+    counter_var = variables.Variable(1)
+
+    def dataset_fn():
+      return dataset_ops.Dataset.range(1)
+
+    def reduce1_fn(state, value):
+      counter_var.assign(counter_var + 1)
+      return state + value
+
+    def reduce2_fn(state, value):
+      counter_var.assign(counter_var * 2)
+      return state + value
+
+    @function.defun
+    def fn():
+      _ = dataset_fn().reduce(np.int64(0), reduce1_fn)
+      _ = dataset_fn().reduce(np.int64(0), reduce2_fn)
+      return "hello"
+
+    self.evaluate(counter_var.initializer)
+    self.assertEqual(self.evaluate(fn()), b"hello")
+    self.assertEqual(self.evaluate(counter_var), 4)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/kernel_tests/shard_test.py b/tensorflow/python/data/kernel_tests/shard_test.py
index 928550676d5b05c2e5a459af355acebe2f1f1cc4..9fc70ff60752c02ec626ee5f89606b428fc183fd 100644
--- a/tensorflow/python/data/kernel_tests/shard_test.py
+++ b/tensorflow/python/data/kernel_tests/shard_test.py
@@ -19,11 +19,12 @@ from __future__ import print_function
 
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
-@test_util.run_all_in_graph_and_eager_modes
+@test_util.run_v1_only("deprecated API, no eager or V2 test coverage")
 class ShardTest(test_base.DatasetTestBase):
 
   def testSimpleCase(self):
@@ -41,20 +42,24 @@ class ShardTest(test_base.DatasetTestBase):
     self.assertDatasetProduces(dataset, expected_output=[0, 5])
 
   def testOffsetGreaterNumShards(self):
-    with self.assertRaises(ValueError):
-      dataset_ops.Dataset.range(10).shard(5, 7)
+    with self.assertRaises(errors.InvalidArgumentError):
+      dataset = dataset_ops.Dataset.range(10).shard(5, 7)
+      self.evaluate(self.getNext(dataset)())
 
   def testNegativeOffset(self):
-    with self.assertRaises(ValueError):
-      dataset_ops.Dataset.range(10).shard(5, -3)
+    with self.assertRaises(errors.InvalidArgumentError):
+      dataset = dataset_ops.Dataset.range(10).shard(5, -3)
+      self.evaluate(self.getNext(dataset)())
 
   def testNegativeNumShards(self):
-    with self.assertRaises(ValueError):
-      dataset_ops.Dataset.range(10).shard(-3, 1)
+    with self.assertRaises(errors.InvalidArgumentError):
+      dataset = dataset_ops.Dataset.range(10).shard(-3, 1)
+      self.evaluate(self.getNext(dataset)())
 
   def testZeroNumShards(self):
-    with self.assertRaises(ValueError):
-      dataset_ops.Dataset.range(10).shard(0, 1)
+    with self.assertRaises(errors.InvalidArgumentError):
+      dataset = dataset_ops.Dataset.range(10).shard(0, 1)
+      self.evaluate(self.getNext(dataset)())
 
   def testIteratorEndsBeforeFirstElem(self):
     dataset = dataset_ops.Dataset.range(1).shard(5, 2)
@@ -72,5 +77,10 @@ class ShardTest(test_base.DatasetTestBase):
     dataset = dataset_ops.Dataset.range(10).shard(4, 3)
     self.assertDatasetProduces(dataset, expected_output=[3, 7])
 
+  def testNumShardsLargerThanDataset(self):
+    dataset = dataset_ops.Dataset.range(10).shard(20, 5)
+    self.assertDatasetProduces(dataset, expected_output=[5])
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/kernel_tests/test_base.py b/tensorflow/python/data/kernel_tests/test_base.py
index 85f6c9de231a9054a2d7a6f434502dbecce1d601..57df29ead57b6f69af1d292fb1480ec90051fa54 100644
--- a/tensorflow/python/data/kernel_tests/test_base.py
+++ b/tensorflow/python/data/kernel_tests/test_base.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 import re
 
+from tensorflow.python import tf2
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.eager import context
@@ -32,6 +33,13 @@ from tensorflow.python.platform import test
 class DatasetTestBase(test.TestCase):
   """Base class for dataset tests."""
 
+  @classmethod
+  def setUpClass(cls):
+    if tf2.enabled():
+      dataset_ops.Dataset = dataset_ops.DatasetV2
+    else:
+      dataset_ops.Dataset = dataset_ops.DatasetV1
+
   def assertSparseValuesEqual(self, a, b):
     """Asserts that two SparseTensors/SparseTensorValues are equal."""
     self.assertAllEqual(a.indices, b.indices)
@@ -58,7 +66,7 @@ class DatasetTestBase(test.TestCase):
       A callable that returns the next element of `dataset`.
     """
     if context.executing_eagerly():
-      iterator = dataset.__iter__()
+      iterator = iter(dataset)
       return iterator._next_internal  # pylint: disable=protected-access
     else:
       if requires_initialization:
@@ -88,6 +96,7 @@ class DatasetTestBase(test.TestCase):
   def assertDatasetProduces(self,
                             dataset,
                             expected_output=None,
+                            expected_shapes=None,
                             expected_error=None,
                             requires_initialization=False,
                             num_test_iterations=1,
@@ -98,6 +107,8 @@ class DatasetTestBase(test.TestCase):
       dataset: A dataset to check for the expected output / error.
       expected_output: A list of elements that the dataset is expected to
         produce.
+      expected_shapes: A list of TensorShapes which is expected to match
+        output_shapes of dataset.
       expected_error: A tuple `(type, predicate)` identifying the expected error
         `dataset` should raise. The `type` should match the expected exception
         type, while `predicate` should either be 1) a unary function that inputs
@@ -126,6 +137,8 @@ class DatasetTestBase(test.TestCase):
             dataset, requires_initialization=requires_initialization)
         self.evaluate(get_next())
       return
+    if expected_shapes:
+      self.assertEqual(expected_shapes, dataset.output_shapes)
     self.assertGreater(num_test_iterations, 0)
     for _ in range(num_test_iterations):
       get_next = self.getNext(
@@ -173,6 +186,8 @@ class DatasetTestBase(test.TestCase):
                                    exception_class,
                                    replacements=None):
     """Checks that datasets raise the same error on the first get_next call."""
+    if replacements is None:
+      replacements = []
     next1 = self.getNext(dataset1)
     next2 = self.getNext(dataset2)
     try:
diff --git a/tensorflow/python/data/kernel_tests/window_test.py b/tensorflow/python/data/kernel_tests/window_test.py
index d083142ab6a1f300b9e51b50d0113474053af05e..a7b4d86fcf958b1ec06781380724c6f48dcf2a24 100644
--- a/tensorflow/python/data/kernel_tests/window_test.py
+++ b/tensorflow/python/data/kernel_tests/window_test.py
@@ -116,12 +116,11 @@ class WindowTest(test_base.DatasetTestBase, parameterized.TestCase):
       ("3", 14, 3, 3, 0),
   )
   def testWindowDatasetInvalid(self, count, size, shift, stride):
-    dataset = dataset_ops.Dataset.range(10).map(lambda x: x).repeat(
-        count).window(
-            size=size, shift=shift,
-            stride=stride).flat_map(lambda x: x.batch(batch_size=size))
-    self.assertDatasetProduces(
-        dataset, expected_error=(errors.InvalidArgumentError, ""))
+    with self.assertRaises(errors.InvalidArgumentError):
+      ds = dataset_ops.Dataset.range(10).map(lambda x: x).repeat(count).window(
+          size=size, shift=shift,
+          stride=stride).flat_map(lambda x: x.batch(batch_size=size))
+      self.evaluate(ds._variant_tensor)
 
   def testWindowSparse(self):
 
diff --git a/tensorflow/python/data/ops/BUILD b/tensorflow/python/data/ops/BUILD
index fbff7df9c379e04a2b12a14ed5f5534339cde543..a911d8c8195816456f1c0dc5eb422462c7a57b9e 100644
--- a/tensorflow/python/data/ops/BUILD
+++ b/tensorflow/python/data/ops/BUILD
@@ -35,6 +35,7 @@ py_library(
         "//tensorflow/python/data/util:random_seed",
         "//tensorflow/python/data/util:sparse",
         "//tensorflow/python/data/util:structure",
+        "//tensorflow/python/data/util:traverse",
         "//third_party/py/numpy",
     ],
 )
@@ -73,7 +74,7 @@ py_library(
         "//tensorflow/python/data/util:sparse",
         "//tensorflow/python/data/util:structure",
         "//tensorflow/python/eager:context",
-        "//tensorflow/python/training/checkpointable:base",
+        "//tensorflow/python/training/tracking:base",
     ],
 )
 
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index bee04aaef2b382ffce179bf7b44a699bd4c7b778..e185ba78a0c0c0a34e707ec32cc6ba87c4a22980 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -24,9 +24,10 @@ import warnings
 
 import numpy as np
 import six
+from six.moves import queue as Queue  # pylint: disable=redefined-builtin
+
 
 from tensorflow.python.compat import compat
-from tensorflow.python.data.experimental.ops import filter_for_shard_ops
 from tensorflow.python.data.experimental.ops import optimization_options
 from tensorflow.python.data.experimental.ops import stats_options
 from tensorflow.python.data.experimental.ops import threading_options
@@ -36,7 +37,9 @@ from tensorflow.python.data.util import options as options_lib
 from tensorflow.python.data.util import random_seed
 from tensorflow.python.data.util import sparse
 from tensorflow.python.data.util import structure as structure_lib
+from tensorflow.python.data.util import traverse
 from tensorflow.python.eager import context
+from tensorflow.python.eager import function as eager_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
@@ -45,6 +48,7 @@ from tensorflow.python.framework import random_seed as core_random_seed
 from tensorflow.python.framework import smart_cond
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -54,6 +58,7 @@ from tensorflow.python.ops import gen_io_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import script_ops
 from tensorflow.python.ops import string_ops
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import deprecation
 from tensorflow.python.util import function_utils
 from tensorflow.python.util.tf_export import tf_export
@@ -72,6 +77,27 @@ class DatasetV2(object):
   plan" of transformations that act on those elements.
   """
 
+  def __init__(self, variant_tensor):
+    """Creates a DatasetV2 object.
+
+    This is a difference between DatasetV1 and DatasetV2. DatasetV1 does not
+    take anything in its constructor whereas in the DatasetV2, we expect
+    subclasses to create a variant_tensor and pass it in to the super() call.
+
+    Args:
+      variant_tensor: A DT_VARIANT tensor that represents the dataset.
+    """
+    self._variant_tensor_attr = variant_tensor
+    self._graph_attr = ops.get_default_graph()
+
+  @property
+  def _variant_tensor(self):
+    return self._variant_tensor_attr
+
+  @_variant_tensor.setter
+  def _variant_tensor(self, _):
+    raise ValueError("The _variant_tensor property is read-only")
+
   def _as_serialized_graph(self):
     """Produces serialized graph representation of the dataset.
 
@@ -79,16 +105,7 @@ class DatasetV2(object):
       A scalar `tf.Tensor` of `tf.string` type, representing this dataset as a
       serialized graph.
     """
-    return gen_dataset_ops.dataset_to_graph(self._as_variant_tensor())
-
-  @abc.abstractmethod
-  def _as_variant_tensor(self):
-    """Creates a scalar `tf.Tensor` of `tf.variant` representing this dataset.
-
-    Returns:
-      A scalar `tf.Tensor` of `tf.variant` type, which represents this dataset.
-    """
-    raise NotImplementedError("Dataset._as_variant_tensor")
+    return gen_dataset_ops.dataset_to_graph(self._variant_tensor)
 
   @abc.abstractmethod
   def _inputs(self):
@@ -96,6 +113,14 @@ class DatasetV2(object):
 
     raise NotImplementedError("Dataset._inputs")
 
+  @property
+  def _graph(self):
+    return self._graph_attr
+
+  @_graph.setter
+  def _graph(self, _):
+    raise ValueError("The _graph property is read-only")
+
   def _has_captured_ref(self):
     """Whether this dataset uses a function that captures ref variables.
 
@@ -119,6 +144,8 @@ class DatasetV2(object):
     return any(
         [input_dataset._has_captured_ref() for input_dataset in self._inputs()])  # pylint: disable=protected-access
 
+  # TODO(jsimsa): Change this to be the transitive closure of functions used
+  # by this dataset and its inputs.
   def _functions(self):
     """Returns a list of functions associated with this dataset.
 
@@ -147,12 +174,12 @@ class DatasetV2(object):
     options = self.options()
     if options.experimental_threading is not None:
       t_options = options.experimental_threading
-      if t_options.private_threadpool_size is not None:
-        dataset = _PrivateThreadPoolDataset(dataset,
-                                            t_options.private_threadpool_size)
       if t_options.max_intra_op_parallelism is not None:
         dataset = _MaxIntraOpParallelismDataset(
             dataset, t_options.max_intra_op_parallelism)
+      if t_options.private_threadpool_size is not None:
+        dataset = _PrivateThreadPoolDataset(dataset,
+                                            t_options.private_threadpool_size)
     static_optimizations = options._static_optimizations()  # pylint: disable=protected-access
     if static_optimizations:
       if self._has_captured_ref():
@@ -723,6 +750,12 @@ class DatasetV2(object):
     elements. For perfect shuffling, a buffer size greater than or equal to the
     full size of the dataset is required.
 
+    For instance, if your dataset contains 10,000 elements but `buffer_size` is
+    set to 1,000, then `shuffle` will initially select a random element from
+    only the first 1,000 elements in the buffer. Once an element is selected,
+    its space in the buffer is replaced by the next (i.e. 1,001-st) element,
+    maintaining the 1,000 element buffer.
+
     Args:
       buffer_size: A `tf.int64` scalar `tf.Tensor`, representing the
         number of elements from this dataset from which the new
@@ -781,6 +814,59 @@ class DatasetV2(object):
     """
     return SkipDataset(self, count)
 
+  def shard(self, num_shards, index):
+    """Creates a `Dataset` that includes only 1/`num_shards` of this dataset.
+
+    This dataset operator is very useful when running distributed training, as
+    it allows each worker to read a unique subset.
+
+    When reading a single input file, you can skip elements as follows:
+
+    ```python
+    d = tf.data.TFRecordDataset(input_file)
+    d = d.shard(num_workers, worker_index)
+    d = d.repeat(num_epochs)
+    d = d.shuffle(shuffle_buffer_size)
+    d = d.map(parser_fn, num_parallel_calls=num_map_threads)
+    ```
+
+    Important caveats:
+
+    - Be sure to shard before you use any randomizing operator (such as
+      shuffle).
+    - Generally it is best if the shard operator is used early in the dataset
+      pipeline. For example, when reading from a set of TFRecord files, shard
+      before converting the dataset to input samples. This avoids reading every
+      file on every worker. The following is an example of an efficient
+      sharding strategy within a complete pipeline:
+
+    ```python
+    d = Dataset.list_files(pattern)
+    d = d.shard(num_workers, worker_index)
+    d = d.repeat(num_epochs)
+    d = d.shuffle(shuffle_buffer_size)
+    d = d.interleave(tf.data.TFRecordDataset,
+                     cycle_length=num_readers, block_length=1)
+    d = d.map(parser_fn, num_parallel_calls=num_map_threads)
+    ```
+
+    Args:
+      num_shards: A `tf.int64` scalar `tf.Tensor`, representing the number of
+        shards operating in parallel.
+      index: A `tf.int64` scalar `tf.Tensor`, representing the worker index.
+
+    Returns:
+      Dataset: A `Dataset`.
+
+    Raises:
+      InvalidArgumentError: if `num_shards` or `index` are illegal values.
+        Note: error checking is done on a best-effort basis, and errors aren't
+        guaranteed to be caught upon dataset creation. (e.g. providing in a
+        placeholder tensor bypasses the early checking, and will instead result
+        in an error during a session.run call.)
+    """
+    return ShardDataset(self, num_shards, index)
+
   def batch(self, batch_size, drop_remainder=False):
     """Combines consecutive elements of this dataset into batches.
 
@@ -946,8 +1032,8 @@ class DatasetV2(object):
         shapes and types defined by `self.output_shapes` and
        `self.output_types`) to another nested structure of tensors.
       num_parallel_calls: (Optional.) A `tf.int32` scalar `tf.Tensor`,
-        representing the number elements to process in parallel. If not
-        specified, elements will be processed sequentially. If the value
+        representing the number elements to process asynchronously in parallel.
+        If not specified, elements will be processed sequentially. If the value
         `tf.data.experimental.AUTOTUNE` is used, then the number of parallel
         calls is set dynamically based on available CPU.
 
@@ -1076,6 +1162,18 @@ class DatasetV2(object):
   def filter(self, predicate):
     """Filters this dataset according to `predicate`.
 
+    ```python
+    d = tf.data.Dataset.from_tensor_slices([1, 2, 3])
+    
+    d = d.filter(lambda x: x < 3) # [1, 2]
+
+    # `tf.math.equal(x, y)` is required for equality comparison
+    def filter_fn(x):
+      return tf.math.equal(x, 1)
+
+    d = d.filter(filter_fn) # [1]
+    ```
+
     Args:
       predicate: A function mapping a nested structure of tensors (having shapes
         and types defined by `self.output_shapes` and `self.output_types`) to a
@@ -1265,7 +1363,7 @@ class DatasetV2(object):
     # pylint: disable=protected-access
     return state_structure._from_compatible_tensor_list(
         gen_dataset_ops.reduce_dataset(
-            self._as_variant_tensor(),
+            self._variant_tensor,
             state_structure._to_tensor_list(initial_state),
             reduce_func.captured_inputs,
             f=reduce_func,
@@ -1301,7 +1399,30 @@ class DatasetV1(DatasetV2):
   """
 
   def __init__(self):
-    pass
+    try:
+      variant_tensor = self._as_variant_tensor()
+    except AttributeError as e:
+      if "_as_variant_tensor" in str(e):
+        raise AttributeError("Please use _variant_tensor instead of "
+                             "_as_variant_tensor() to obtain the variant "
+                             "associated with a dataset")
+      raise AttributeError("A likely cause of this error is that the super "
+                           "call for this dataset is not the last line of the "
+                           "__init__ method. The base class causes the "
+                           "_as_variant_tensor call in its constructor and "
+                           "if that uses attributes defined in the __init__ "
+                           "method, those attrs need to be defined before the "
+                           "super call.")
+    super(DatasetV1, self).__init__(variant_tensor)
+
+  @abc.abstractmethod
+  def _as_variant_tensor(self):
+    """Creates a scalar `tf.Tensor` of `tf.variant` representing this dataset.
+
+    Returns:
+      A scalar `tf.Tensor` of `tf.variant` type, which represents this dataset.
+    """
+    raise NotImplementedError("Dataset._as_variant_tensor")
 
   @deprecation.deprecated(
       None, "Use `for ... in dataset:` to iterate over a dataset. If using "
@@ -1317,14 +1438,26 @@ class DatasetV1(DatasetV2):
     Returns:
       An `Iterator` over the elements of this dataset.
     """
+    return self._make_one_shot_iterator()
+
+  def _make_one_shot_iterator(self):  # pylint: disable=missing-docstring
     if context.executing_eagerly():
       return iterator_ops.EagerIterator(self)
 
+    _ensure_same_dataset_graph(self)
+    # Now that we create datasets at python object creation time, the capture
+    # by value _make_dataset() function would try to capture these variant
+    # tensor dataset inputs, which are marked as stateful ops and would throw
+    # an error if we try and capture them. We therefore traverse the graph
+    # to find all these ops and whitelist them so that the capturing
+    # logic instead of throwing an error recreates these ops which is what was
+    # happening before.
+    all_ds_ops = traverse.obtain_all_variant_tensor_ops(self)
     graph_level_seed, op_level_seed = core_random_seed.get_seed(None)
 
     # NOTE(mrry): We capture by value here to ensure that `_make_dataset()` is
     # a 0-argument function.
-    @function.Defun(capture_by_value=True)
+    @function.Defun(capture_by_value=True, whitelisted_stateful_ops=all_ds_ops)
     def _make_dataset():
       """Factory function for a dataset."""
       # NOTE(mrry): `Defun` does not capture the graph-level seed from the
@@ -1336,7 +1469,7 @@ class DatasetV1(DatasetV2):
             (graph_level_seed + 87654321 * op_level_seed) % (2 ** 63 - 1))
 
       dataset = self._apply_options()
-      return dataset._as_variant_tensor()  # pylint: disable=protected-access
+      return dataset._variant_tensor  # pylint: disable=protected-access
 
     try:
       _make_dataset.add_to_graph(ops.get_default_graph())
@@ -1385,10 +1518,15 @@ class DatasetV1(DatasetV2):
     Raises:
       RuntimeError: If eager execution is enabled.
     """
+
+    return self._make_initializable_iterator(shared_name)
+
+  def _make_initializable_iterator(self, shared_name=None):  # pylint: disable=missing-docstring
     if context.executing_eagerly():
       raise RuntimeError(
           "dataset.make_initializable_iterator is not supported when eager "
           "execution is enabled.")
+    _ensure_same_dataset_graph(self)
     dataset = self._apply_options()
     if shared_name is None:
       shared_name = ""
@@ -1400,7 +1538,7 @@ class DatasetV1(DatasetV2):
           container="", shared_name=shared_name, **flat_structure(self))
     with ops.colocate_with(iterator_resource):
       initializer = gen_dataset_ops.make_iterator(
-          dataset._as_variant_tensor(),  # pylint: disable=protected-access
+          dataset._variant_tensor,  # pylint: disable=protected-access
           iterator_resource)
     return iterator_ops.Iterator(iterator_resource, initializer,
                                  dataset.output_types, dataset.output_shapes,
@@ -1486,60 +1624,9 @@ class DatasetV1(DatasetV2):
   def skip(self, count):
     return DatasetV1Adapter(super(DatasetV1, self).skip(count))
 
-  @deprecation.deprecated(
-      None, "Use `dataset.apply(tf.data.experimental.filter_for_shard(...))`.")
+  @functools.wraps(DatasetV2.shard)
   def shard(self, num_shards, index):
-    """Creates a `Dataset` that includes only 1/`num_shards` of this dataset.
-
-    This dataset operator is very useful when running distributed training, as
-    it allows each worker to read a unique subset.
-
-    When reading a single input file, you can skip elements as follows:
-
-    ```python
-    d = tf.data.TFRecordDataset(FLAGS.input_file)
-    d = d.shard(FLAGS.num_workers, FLAGS.worker_index)
-    d = d.repeat(FLAGS.num_epochs)
-    d = d.shuffle(FLAGS.shuffle_buffer_size)
-    d = d.map(parser_fn, num_parallel_calls=FLAGS.num_map_threads)
-    ```
-
-    Important caveats:
-
-    - Be sure to shard before you use any randomizing operator (such as
-      shuffle).
-    - Generally it is best if the shard operator is used early in the dataset
-      pipeline. For example, when reading from a set of TFRecord files, shard
-      before converting the dataset to input samples. This avoids reading every
-      file on every worker. The following is an example of an efficient
-      sharding strategy within a complete pipeline:
-
-    ```python
-    d = Dataset.list_files(FLAGS.pattern)
-    d = d.shard(FLAGS.num_workers, FLAGS.worker_index)
-    d = d.repeat(FLAGS.num_epochs)
-    d = d.shuffle(FLAGS.shuffle_buffer_size)
-    d = d.interleave(tf.data.TFRecordDataset,
-                     cycle_length=FLAGS.num_readers, block_length=1)
-    d = d.map(parser_fn, num_parallel_calls=FLAGS.num_map_threads)
-    ```
-
-    Args:
-      num_shards: A `tf.int64` scalar `tf.Tensor`, representing the number of
-        shards operating in parallel.
-      index: A `tf.int64` scalar `tf.Tensor`, representing the worker index.
-
-    Returns:
-      Dataset: A `Dataset`.
-
-    Raises:
-      ValueError: if `num_shards` or `index` are illegal values. Note: error
-        checking is done on a best-effort basis, and errors aren't guaranteed
-        to be caught upon dataset creation. (e.g. providing in a placeholder
-        tensor bypasses the early checking, and will instead result in an error
-        during a session.run call.)
-    """
-    return self.apply(filter_for_shard_ops.filter_for_shard(num_shards, index))
+    return DatasetV1Adapter(super(DatasetV1, self).shard(num_shards, index))
 
   @functools.wraps(DatasetV2.batch)
   def batch(self, batch_size, drop_remainder=False):
@@ -1565,6 +1652,43 @@ class DatasetV1(DatasetV2):
           ParallelMapDataset(
               self, map_func, num_parallel_calls, preserve_cardinality=False))
 
+  @deprecation.deprecated(None, "Use `tf.data.Dataset.map()")
+  def map_with_legacy_function(self, map_func, num_parallel_calls=None):
+    """Maps `map_func` across the elements of this dataset.
+
+    NOTE: This is an escape hatch for existing uses of `map` that do not work
+    with V2 functions. New uses are strongly discouraged and existing uses
+    should migrate to `map` as this method will be removed in V2.
+
+    Args:
+      map_func: A function mapping a nested structure of tensors (having shapes
+        and types defined by `self.output_shapes` and `self.output_types`) to
+        another nested structure of tensors.
+      num_parallel_calls: (Optional.) A `tf.int32` scalar `tf.Tensor`,
+        representing the number elements to process asynchronously in parallel.
+        If not specified, elements will be processed sequentially. If the value
+        `tf.data.experimental.AUTOTUNE` is used, then the number of parallel
+        calls is set dynamically based on available CPU.
+
+    Returns:
+      Dataset: A `Dataset`.
+    """
+    if num_parallel_calls is None:
+      return DatasetV1Adapter(
+          MapDataset(
+              self,
+              map_func,
+              preserve_cardinality=False,
+              use_legacy_function=True))
+    else:
+      return DatasetV1Adapter(
+          ParallelMapDataset(
+              self,
+              map_func,
+              num_parallel_calls,
+              preserve_cardinality=False,
+              use_legacy_function=True))
+
   @functools.wraps(DatasetV2.flat_map)
   def flat_map(self, map_func):
     return DatasetV1Adapter(super(DatasetV1, self).flat_map(map_func))
@@ -1582,6 +1706,25 @@ class DatasetV1(DatasetV2):
   def filter(self, predicate):
     return DatasetV1Adapter(super(DatasetV1, self).filter(predicate))
 
+  @deprecation.deprecated(None, "Use `tf.data.Dataset.filter()")
+  def filter_with_legacy_function(self, predicate):
+    """Filters this dataset according to `predicate`.
+
+    NOTE: This is an escape hatch for existing uses of `filter` that do not work
+    with V2 functions. New uses are strongly discouraged and existing uses
+    should migrate to `filter` as this method will be removed in V2.
+
+    Args:
+      predicate: A function mapping a nested structure of tensors (having shapes
+        and types defined by `self.output_shapes` and `self.output_types`) to a
+        scalar `tf.bool` tensor.
+
+    Returns:
+      Dataset: The `Dataset` containing the elements of this dataset for which
+          `predicate` is `True`.
+    """
+    return FilterDataset(self, predicate, use_legacy_function=True)
+
   @functools.wraps(DatasetV2.apply)
   def apply(self, transformation_func):
     return DatasetV1Adapter(super(DatasetV1, self).apply(transformation_func))
@@ -1605,11 +1748,11 @@ class DatasetV1Adapter(DatasetV1):
   """Wraps a V2 `Dataset` object in the `tf.compat.v1.data.Dataset` API."""
 
   def __init__(self, dataset):
-    super(DatasetV1Adapter, self).__init__()
     self._dataset = dataset
+    super(DatasetV1Adapter, self).__init__()
 
   def _as_variant_tensor(self):
-    return self._dataset._as_variant_tensor()  # pylint: disable=protected-access
+    return self._dataset._variant_tensor  # pylint: disable=protected-access
 
   def _has_captured_ref(self):
     return self._dataset._has_captured_ref()  # pylint: disable=protected-access
@@ -1628,6 +1771,32 @@ class DatasetV1Adapter(DatasetV1):
     return iter(self._dataset)
 
 
+def _ensure_same_dataset_graph(dataset):
+  """Walks the dataset graph to ensure all datasets come from the same graph."""
+  current_graph = ops.get_default_graph()
+  bfs_q = Queue.Queue()
+  bfs_q.put(dataset)  # pylint: disable=protected-access
+  visited = []
+  while not bfs_q.empty():
+    ds = bfs_q.get()
+    visited.append(ds)
+    ds_graph = ds._graph  # pylint: disable=protected-access
+    if current_graph != ds_graph:
+      logging.warning("The graph (" + str(current_graph) + ") of the iterator "
+                      "is different from the graph (" + str(ds_graph) + ") "
+                      "the dataset: " + str(ds._variant_tensor) + " was "  # pylint: disable=protected-access
+                      "created in. If you are using the Estimator API, "
+                      "make sure that no part of the dataset returned by the "
+                      "`input_fn` function is defined outside the `input_fn` "
+                      "function. Please ensure that all datasets in the "
+                      "pipeline are created in the same graph as the iterator. "
+                      "NOTE: This warning will become an error in future "
+                      "versions of TensorFlow.")
+    for input_ds in ds._inputs():  # pylint: disable=protected-access
+      if input_ds not in visited:
+        bfs_q.put(input_ds)
+
+
 @tf_export(v1=["data.make_one_shot_iterator"])
 def make_one_shot_iterator(dataset):
   """Creates a `tf.data.Iterator` for enumerating the elements of a dataset.
@@ -1642,15 +1811,15 @@ def make_one_shot_iterator(dataset):
     A `tf.data.Iterator` over the elements of this dataset.
   """
   try:
-    # Call the defined `make_one_shot_iterator()` if there is one, because some
+    # Call the defined `_make_one_shot_iterator()` if there is one, because some
     # datasets (e.g. for prefetching) override its behavior.
-    return dataset.make_one_shot_iterator()
+    return dataset._make_one_shot_iterator()  # pylint: disable=protected-access
   except AttributeError:
-    return DatasetV1Adapter(dataset).make_one_shot_iterator()
+    return DatasetV1Adapter(dataset)._make_one_shot_iterator()  # pylint: disable=protected-access
 
 
 @tf_export(v1=["data.make_initializable_iterator"])
-def make_initializable_iterator(dataset):
+def make_initializable_iterator(dataset, shared_name=None):
   """Creates a `tf.data.Iterator` for enumerating the elements of a dataset.
 
   Note: The returned iterator will be in an uninitialized state,
@@ -1658,13 +1827,16 @@ def make_initializable_iterator(dataset):
 
   ```python
   dataset = ...
-  iterator = dataset.make_initializable_iterator()
+  iterator = tf.data.make_initializable_iterator(dataset)
   # ...
   sess.run(iterator.initializer)
   ```
 
   Args:
     dataset: A `tf.data.Dataset`.
+    shared_name: (Optional.) If non-empty, the returned iterator will be
+      shared under the given name across multiple sessions that share the
+      same devices (e.g. when using a remote server).
 
   Returns:
     A `tf.data.Iterator` over the elements of `dataset`.
@@ -1673,11 +1845,11 @@ def make_initializable_iterator(dataset):
     RuntimeError: If eager execution is enabled.
   """
   try:
-    # Call the defined `make_one_shot_iterator()` if there is one, because some
-    # datasets (e.g. for prefetching) override its behavior.
-    return dataset.make_initializable_iterator()
+    # Call the defined `_make_initializable_iterator()` if there is one, because
+    # some datasets (e.g. for prefetching) override its behavior.
+    return dataset._make_initializable_iterator(shared_name)  # pylint: disable=protected-access
   except AttributeError:
-    return DatasetV1Adapter(dataset).make_initializable_iterator()
+    return DatasetV1Adapter(dataset)._make_initializable_iterator(shared_name)  # pylint: disable=protected-access
 
 
 @tf_export("data.Options")
@@ -1695,43 +1867,50 @@ class Options(options_lib.OptionsBase):
       ty=bool,
       docstring=
       "Whether to dynamically adjust the values of tunable parameters (e.g. "
-      "degrees of parallelism).")
+      "degrees of parallelism). If None, defaults to True.")
 
   experimental_deterministic = options_lib.create_option(
       name="experimental_deterministic",
       ty=bool,
       docstring=
-      "Whether the outputs need to be produced in deterministic order."
-  )
+      "Whether the outputs need to be produced in deterministic order. If None,"
+      " defaults to True.")
 
   experimental_numa_aware = options_lib.create_option(
       name="experimental_numa_aware",
       ty=bool,
-      docstring="Whether to use NUMA-aware operations.")
+      docstring=
+      "Whether to use NUMA-aware operations. If None, defaults to False.")
 
   experimental_optimization = options_lib.create_option(
       name="experimental_optimization",
       ty=optimization_options.OptimizationOptions,
-      docstring="Associates the given optimization options with the dataset.")
+      docstring=
+      "The optimization options associated with the dataset. See "
+      "`tf.data.experimental.OptimizationOptions` for more details.",
+      default_factory=optimization_options.OptimizationOptions)
 
   experimental_stats = options_lib.create_option(
       name="experimental_stats",
       ty=stats_options.StatsOptions,
-      docstring="Associates the given statistics options with the dataset.")
+      docstring=
+      "The statistics options associated with the dataset. See "
+      "`tf.data.experimental.StatsOptions` for more details.",
+      default_factory=stats_options.StatsOptions)
 
   experimental_threading = options_lib.create_option(
       name="experimental_threading",
       ty=threading_options.ThreadingOptions,
-      docstring="Associates the given threading options with the dataset.")
+      docstring=
+      "The threading options associated with the dataset. See "
+      "`tf.data.experimental.ThreadingOptions` for more details.",
+      default_factory=threading_options.ThreadingOptions)
 
   def _static_optimizations(self):
     """Produces the list of enabled static optimizations."""
 
     result = []
-    exp_optimization_options = (
-        self.experimental_optimization or
-        optimization_options.OptimizationOptions())  # If not set, use default
-    result.extend(exp_optimization_options._static_optimizations())  # pylint: disable=protected-access
+    result.extend(self.experimental_optimization._static_optimizations())  # pylint: disable=protected-access
 
     if self.experimental_numa_aware:
       result.append("make_numa_aware")
@@ -1771,9 +1950,9 @@ class DatasetSource(DatasetV2):
 class UnaryDataset(DatasetV2):
   """Abstract class representing a dataset with one input."""
 
-  def __init__(self, input_dataset):
-    super(UnaryDataset, self).__init__()
+  def __init__(self, input_dataset, variant_tensor):
     self._input_dataset = input_dataset
+    super(UnaryDataset, self).__init__(variant_tensor)
 
   def _inputs(self):
     return [self._input_dataset]
@@ -1782,6 +1961,11 @@ class UnaryDataset(DatasetV2):
 class UnaryUnchangedStructureDataset(UnaryDataset):
   """Represents a unary dataset with the same input and output structure."""
 
+  def __init__(self, input_dataset, variant_tensor):
+    self._input_dataset = input_dataset
+    super(UnaryUnchangedStructureDataset, self).__init__(
+        input_dataset, variant_tensor)
+
   @property
   def _element_structure(self):
     return self._input_dataset._element_structure  # pylint: disable=protected-access
@@ -1792,7 +1976,6 @@ class TensorDataset(DatasetSource):
 
   def __init__(self, tensors):
     """See `Dataset.from_tensors()` for details."""
-    super(TensorDataset, self).__init__()
     with ops.name_scope("tensors"):
       tensors = nest.pack_sequence_as(tensors, [
           sparse_tensor_lib.SparseTensor.from_value(t)
@@ -1803,9 +1986,9 @@ class TensorDataset(DatasetSource):
     self._structure = structure_lib.Structure.from_value(tensors)
     self._tensors = self._structure._to_tensor_list(tensors)  # pylint: disable=protected-access
 
-  def _as_variant_tensor(self):
-    return gen_dataset_ops.tensor_dataset(
+    variant_tensor = gen_dataset_ops.tensor_dataset(
         self._tensors, output_shapes=self._structure._flat_shapes)  # pylint: disable=protected-access
+    super(TensorDataset, self).__init__(variant_tensor)
 
   @property
   def _element_structure(self):
@@ -1817,7 +2000,6 @@ class TensorSliceDataset(DatasetSource):
 
   def __init__(self, tensors):
     """See `Dataset.from_tensor_slices()` for details."""
-    super(TensorSliceDataset, self).__init__()
     with ops.name_scope("tensors"):
       tensors = nest.pack_sequence_as(tensors, [
           sparse_tensor_lib.SparseTensor.from_value(t)
@@ -1838,9 +2020,9 @@ class TensorSliceDataset(DatasetSource):
       batch_dim.assert_is_compatible_with(tensor_shape.Dimension(
           tensor_shape.dimension_value(t.get_shape()[0])))
 
-  def _as_variant_tensor(self):
-    return gen_dataset_ops.tensor_slice_dataset(
+    variant_tensor = gen_dataset_ops.tensor_slice_dataset(
         self._tensors, output_shapes=self._structure._flat_shapes)  # pylint: disable=protected-access
+    super(TensorSliceDataset, self).__init__(variant_tensor)
 
   @property
   def _element_structure(self):
@@ -1852,7 +2034,6 @@ class SparseTensorSliceDataset(DatasetSource):
 
   def __init__(self, sparse_tensor):
     """See `Dataset.from_sparse_tensor_slices()` for details."""
-    super(SparseTensorSliceDataset, self).__init__()
     if not isinstance(sparse_tensor, sparse_tensor_lib.SparseTensor):
       raise TypeError("`sparse_tensor` must be a `tf.SparseTensor` object.")
     self._sparse_tensor = sparse_tensor
@@ -1865,10 +2046,10 @@ class SparseTensorSliceDataset(DatasetSource):
          structure_lib.TensorStructure(self._sparse_tensor.dtype, [None]),
          structure_lib.TensorStructure(dtypes.int64, [rank])))
 
-  def _as_variant_tensor(self):
-    return gen_dataset_ops.sparse_tensor_slice_dataset(
+    variant_tensor = gen_dataset_ops.sparse_tensor_slice_dataset(
         self._sparse_tensor.indices, self._sparse_tensor.values,
         self._sparse_tensor.dense_shape)
+    super(SparseTensorSliceDataset, self).__init__(variant_tensor)
 
   @property
   def _element_structure(self):
@@ -1879,12 +2060,8 @@ class _VariantDataset(DatasetV2):
   """A Dataset wrapper around a `tf.variant`-typed function argument."""
 
   def __init__(self, dataset_variant, structure):
-    super(_VariantDataset, self).__init__()
-    self._dataset_variant = dataset_variant
     self._structure = structure
-
-  def _as_variant_tensor(self):
-    return self._dataset_variant
+    super(_VariantDataset, self).__init__(dataset_variant)
 
   def _inputs(self):
     return []
@@ -1916,7 +2093,7 @@ class DatasetStructure(structure_lib.Structure):
                 other._element_structure))
 
   def _to_tensor_list(self, value):
-    return [value._as_variant_tensor()]  # pylint: disable=protected-access
+    return [value._variant_tensor]  # pylint: disable=protected-access
 
   def _to_batched_tensor_list(self, value):
     raise NotImplementedError("Unbatching for `tf.data.Dataset` objects.")
@@ -1959,9 +2136,9 @@ structure_lib.Structure._register_custom_converter(DatasetV2,
 
 
 class StructuredFunctionWrapper(object):
-  """A wrapper for `Defun` that supports structured arguments and return values.
-  """
+  """A function wrapper that supports structured arguments and return values."""
 
+  # pylint: disable=protected-access
   def __init__(self,
                func,
                transformation_name,
@@ -1971,6 +2148,7 @@ class StructuredFunctionWrapper(object):
                input_types=None,
                input_structure=None,
                add_to_graph=True,
+               use_legacy_function=False,
                defun_kwargs=None):
     """Creates a new `StructuredFunctionWrapper` for the given function.
 
@@ -1992,9 +2170,12 @@ class StructuredFunctionWrapper(object):
         defines the element types and structure for `func` arguments.
       add_to_graph: (Optional.) If `True`, the function will be added to the
         default graph.
+      use_legacy_function: (Optional.) A boolean that determines whether the
+        function be created using `tensorflow.python.eager.function.defun`
+        (default behavior) or `tensorflow.python.framework.function.Defun`
+        (legacy beheavior).
       defun_kwargs: (Optional.) A dictionary mapping string argument names to
-        values. If supplied, will be passed to `function.Defun()` as keyword
-        arguments.
+        values. If supplied, will be passed to `function` as keyword arguments.
 
     Raises:
       ValueError: If an invalid combination of `dataset`, `input_classes`,
@@ -2014,7 +2195,7 @@ class StructuredFunctionWrapper(object):
           raise ValueError("Either `dataset`, `input_structure` or all of "
                            "`input_classes`, `input_shapes`, and `input_types` "
                            "must be specified.")
-        self._input_structure = dataset._element_structure  # pylint: disable=protected-access
+        self._input_structure = dataset._element_structure
     else:
       if not (dataset is None and input_classes is None and input_shapes is None
               and input_types is None):
@@ -2023,24 +2204,38 @@ class StructuredFunctionWrapper(object):
                          "must be specified.")
       self._input_structure = input_structure
 
-    self._transformation_name = transformation_name
+    if defun_kwargs is None:
+      defun_kwargs = {}
+
     readable_transformation_name = transformation_name.replace(
         ".", "_")[:-2] if len(transformation_name) > 2 else ""
-    self._func_name = "_".join([
-        readable_transformation_name,
-        function_utils.get_func_name(func),
-        str(ops.uid())
-    ])
 
-    if defun_kwargs is None:
-      defun_kwargs = {}
+    func_name = "_".join(
+        [readable_transformation_name,
+         function_utils.get_func_name(func)])
+
+    def _warn_if_collections(transformation_name, graph, initial_length):
+      """Prints a warning if the given graph uses common graph collections.
 
-    @function.Defun(
-        *self._input_structure._flat_types, func_name=self._func_name,  # pylint: disable=protected-access
-        **defun_kwargs)
-    def tf_data_structured_function_wrapper(*args):
+      NOTE(mrry): Currently a warning is only generated for lookup tables. Any
+      variables created will be automatically hoisted out to the outermost scope
+      using `init_scope()`. Some collections (such as for control-flow contexts)
+      are benign and should not generate a warning.
+
+      Args:
+        transformation_name: A human-readable name for the transformation.
+        graph: The graph to check for collections.
+        initial_length: The initial length of the lookup table collection.
+      """
+      length = len(graph.get_collection(ops.GraphKeys.TABLE_INITIALIZERS))
+      if length != initial_length:
+        warnings.warn("Creating lookup tables inside a function passed to %s "
+                      "is not supported. Create each table outside the "
+                      "function, and capture it inside the function to use it."
+                      % transformation_name)
+
+    def _wrapper_helper(*args):
       """Wrapper for passing nested structures to and from tf.data functions."""
-      # pylint: disable=protected-access
       nested_args = self._input_structure._from_compatible_tensor_list(args)
       if not _should_unpack_args(nested_args):
         nested_args = (nested_args,)
@@ -2064,18 +2259,53 @@ class StructuredFunctionWrapper(object):
       except (ValueError, TypeError):
         raise TypeError("Unsupported return value from function passed to "
                         "%s: %s." % (transformation_name, ret))
+      return ret
+
+    if use_legacy_function:
+      func_name = func_name + "_" + str(ops.uid())
+
+      @function.Defun(
+          *self._input_structure._flat_types,
+          func_name=func_name,
+          **defun_kwargs)
+      def wrapper_fn(*args):
+        ret = _wrapper_helper(*args)
+        _warn_if_collections(transformation_name, ops.get_default_graph(), 0)
+        return self._output_structure._to_tensor_list(ret)
+
+      self._function = wrapper_fn
+      if add_to_graph:
+        self._function.add_to_graph(ops.get_default_graph())
+      else:
+        # Use the private method that will execute `wrapper_fn` but delay adding
+        # it to the graph in case (e.g.) we need to rerun the function.
+        self._function._create_definition_if_needed()
+    else:
+      defun_kwargs.update({"func_name": func_name})
+
+      @eager_function.defun_with_attributes(
+          input_signature=[
+              tensor_spec.TensorSpec(input_shape, input_type)  # pylint: disable=g-complex-comprehension
+              for input_shape, input_type in zip(
+                  self._input_structure._flat_shapes,
+                  self._input_structure._flat_types)
+          ],
+          attributes=defun_kwargs)
+      def wrapper_fn(*args):  # pylint: disable=missing-docstring
+        ret = _wrapper_helper(*args)
+        ret = self._output_structure._to_tensor_list(ret)
+        return [ops.convert_to_tensor(t) for t in ret]
 
-      _warn_if_collections(transformation_name)
-      return self._output_structure._to_tensor_list(ret)
+      initial_length = len(ops.get_default_graph().get_collection(
+          ops.GraphKeys.TABLE_INITIALIZERS))
 
-    self._function = tf_data_structured_function_wrapper
-    if add_to_graph:
-      self._function.add_to_graph(ops.get_default_graph())
-    else:
-      # Use the private method that will execute
-      # `tf_data_structured_function_wrapper` but delay adding it to the graph
-      # in case (e.g.) we need to rerun the function.
-      self._function._create_definition_if_needed()  # pylint: disable=protected-access
+      self._function = wrapper_fn._get_concrete_function_internal()
+      if add_to_graph:
+        self._function.add_to_graph(ops.get_default_graph())
+
+      _warn_if_collections(transformation_name, self._function.graph,
+                           initial_length)
+  # pylint: enable=protected-access
 
   @property
   def output_structure(self):
@@ -2104,7 +2334,7 @@ def flat_structure(dataset):
   Most Dataset op constructors expect `output_shapes` and `output_types`
   arguments that represent the flattened structure of an element. This helper
   function generates these attrs as a keyword argument dictionary, allowing
-  `Dataset._as_variant_tensor()` implementations to pass
+  `Dataset._variant_tensor` implementations to pass
   `**flat_structure(self)` to the op constructor.
 
   Args:
@@ -2140,7 +2370,6 @@ class _GeneratorDataset(DatasetSource):
         `init_func` immediately before a C++ iterator over this dataset is
         destroyed. The return value is ignored.
     """
-    super(_GeneratorDataset, self).__init__()
     self._init_args = init_args
 
     self._init_structure = structure_lib.Structure.from_value(init_args)
@@ -2159,9 +2388,7 @@ class _GeneratorDataset(DatasetSource):
         finalize_func,
         self._transformation_name(),
         input_structure=self._init_func.output_structure)
-
-  def _as_variant_tensor(self):
-    return gen_dataset_ops.generator_dataset(
+    variant_tensor = gen_dataset_ops.generator_dataset(
         self._init_structure._to_tensor_list(self._init_args)  # pylint: disable=protected-access
         + self._init_func.function.captured_inputs,
         self._next_func.function.captured_inputs,
@@ -2170,6 +2397,7 @@ class _GeneratorDataset(DatasetSource):
         next_func=self._next_func.function,
         finalize_func=self._finalize_func.function,
         **flat_structure(self))
+    super(_GeneratorDataset, self).__init__(variant_tensor)
 
   @property
   def _element_structure(self):
@@ -2184,7 +2412,6 @@ class ZipDataset(DatasetV2):
 
   def __init__(self, datasets):
     """See `Dataset.zip()` for details."""
-    super(ZipDataset, self).__init__()
     for ds in nest.flatten(datasets):
       if not isinstance(ds, DatasetV2):
         if isinstance(ds, list):
@@ -2201,12 +2428,12 @@ class ZipDataset(DatasetV2):
             self._datasets,
             [ds._element_structure for ds in nest.flatten(self._datasets)]))  # pylint: disable=protected-access
 
-  def _as_variant_tensor(self):
     # pylint: disable=protected-access
-    return gen_dataset_ops.zip_dataset(
-        [ds._as_variant_tensor() for ds in nest.flatten(self._datasets)],
+    variant_tensor = gen_dataset_ops.zip_dataset(
+        [ds._variant_tensor for ds in nest.flatten(self._datasets)],
         **flat_structure(self))
     # pylint: enable=protected-access
+    super(ZipDataset, self).__init__(variant_tensor)
 
   def _inputs(self):
     return nest.flatten(self._datasets)
@@ -2221,7 +2448,6 @@ class ConcatenateDataset(DatasetV2):
 
   def __init__(self, input_dataset, dataset_to_concatenate):
     """See `Dataset.concatenate()` for details."""
-    super(ConcatenateDataset, self).__init__()
     self._input_dataset = input_dataset
     self._dataset_to_concatenate = dataset_to_concatenate
 
@@ -2249,17 +2475,15 @@ class ConcatenateDataset(DatasetV2):
         output_types, output_shapes, output_classes)
 
     self._input_datasets = [input_dataset, dataset_to_concatenate]
-
-  def _as_variant_tensor(self):
     # pylint: disable=protected-access
-    return gen_dataset_ops.concatenate_dataset(
-        self._input_dataset._as_variant_tensor(),
-        self._dataset_to_concatenate._as_variant_tensor(),
+    variant_tensor = gen_dataset_ops.concatenate_dataset(
+        input_dataset._variant_tensor, dataset_to_concatenate._variant_tensor,
         **flat_structure(self))
     # pylint: enable=protected-access
+    super(ConcatenateDataset, self).__init__(variant_tensor)
 
   def _inputs(self):
-    return [self._input_dataset, self._dataset_to_concatenate]
+    return self._input_datasets
 
   @property
   def _element_structure(self):
@@ -2271,19 +2495,17 @@ class RepeatDataset(UnaryUnchangedStructureDataset):
 
   def __init__(self, input_dataset, count):
     """See `Dataset.repeat()` for details."""
-    super(RepeatDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
     if count is None:
       self._count = constant_op.constant(-1, dtype=dtypes.int64, name="count")
     else:
       self._count = ops.convert_to_tensor(
           count, dtype=dtypes.int64, name="count")
-
-  def _as_variant_tensor(self):
-    return gen_dataset_ops.repeat_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+    variant_tensor = gen_dataset_ops.repeat_dataset(
+        input_dataset._variant_tensor,  # pylint: disable=protected-access
         count=self._count,
         **flat_structure(self))
+    super(RepeatDataset, self).__init__(input_dataset, variant_tensor)
 
 
 class RangeDataset(DatasetSource):
@@ -2291,8 +2513,13 @@ class RangeDataset(DatasetSource):
 
   def __init__(self, *args):
     """See `Dataset.range()` for details."""
-    super(RangeDataset, self).__init__()
     self._parse_args(*args)
+    variant_tensor = gen_dataset_ops.range_dataset(
+        start=self._start,
+        stop=self._stop,
+        step=self._step,
+        **flat_structure(self))
+    super(RangeDataset, self).__init__(variant_tensor)
 
   def _parse_args(self, *args):
     """Parse arguments according to the same rules as the `range()` builtin."""
@@ -2314,13 +2541,6 @@ class RangeDataset(DatasetSource):
   def _build_tensor(self, int64_value, name):
     return ops.convert_to_tensor(int64_value, dtype=dtypes.int64, name=name)
 
-  def _as_variant_tensor(self):
-    return gen_dataset_ops.range_dataset(
-        start=self._start,
-        stop=self._stop,
-        step=self._step,
-        **flat_structure(self))
-
   @property
   def _element_structure(self):
     return structure_lib.TensorStructure(dtypes.int64, [])
@@ -2331,16 +2551,14 @@ class CacheDataset(UnaryUnchangedStructureDataset):
 
   def __init__(self, input_dataset, filename):
     """See `Dataset.cache()` for details."""
-    super(CacheDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
     self._filename = ops.convert_to_tensor(
         filename, dtype=dtypes.string, name="filename")
-
-  def _as_variant_tensor(self):
-    return gen_dataset_ops.cache_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+    variant_tensor = gen_dataset_ops.cache_dataset(
+        input_dataset._variant_tensor,  # pylint: disable=protected-access
         filename=self._filename,
         **flat_structure(self))
+    super(CacheDataset, self).__init__(input_dataset, variant_tensor)
 
 
 class ShuffleDataset(UnaryUnchangedStructureDataset):
@@ -2371,7 +2589,6 @@ class ShuffleDataset(UnaryUnchangedStructureDataset):
     Raises:
       ValueError: if invalid arguments are provided.
     """
-    super(ShuffleDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
     self._buffer_size = ops.convert_to_tensor(
         buffer_size, dtype=dtypes.int64, name="buffer_size")
@@ -2381,15 +2598,14 @@ class ShuffleDataset(UnaryUnchangedStructureDataset):
       self._reshuffle_each_iteration = True
     else:
       self._reshuffle_each_iteration = reshuffle_each_iteration
-
-  def _as_variant_tensor(self):
-    return gen_dataset_ops.shuffle_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+    variant_tensor = gen_dataset_ops.shuffle_dataset(
+        input_dataset._variant_tensor,  # pylint: disable=protected-access
         buffer_size=self._buffer_size,
         seed=self._seed,
         seed2=self._seed2,
         reshuffle_each_iteration=self._reshuffle_each_iteration,
         **flat_structure(self))
+    super(ShuffleDataset, self).__init__(input_dataset, variant_tensor)
 
 
 class TakeDataset(UnaryUnchangedStructureDataset):
@@ -2397,15 +2613,13 @@ class TakeDataset(UnaryUnchangedStructureDataset):
 
   def __init__(self, input_dataset, count):
     """See `Dataset.take()` for details."""
-    super(TakeDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
     self._count = ops.convert_to_tensor(count, dtype=dtypes.int64, name="count")
-
-  def _as_variant_tensor(self):
-    return gen_dataset_ops.take_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+    variant_tensor = gen_dataset_ops.take_dataset(
+        input_dataset._variant_tensor,  # pylint: disable=protected-access
         count=self._count,
         **flat_structure(self))
+    super(TakeDataset, self).__init__(input_dataset, variant_tensor)
 
 
 class SkipDataset(UnaryUnchangedStructureDataset):
@@ -2413,15 +2627,30 @@ class SkipDataset(UnaryUnchangedStructureDataset):
 
   def __init__(self, input_dataset, count):
     """See `Dataset.skip()` for details."""
-    super(SkipDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
     self._count = ops.convert_to_tensor(count, dtype=dtypes.int64, name="count")
-
-  def _as_variant_tensor(self):
-    return gen_dataset_ops.skip_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+    variant_tensor = gen_dataset_ops.skip_dataset(
+        input_dataset._variant_tensor,  # pylint: disable=protected-access
         count=self._count,
         **flat_structure(self))
+    super(SkipDataset, self).__init__(input_dataset, variant_tensor)
+
+
+class ShardDataset(UnaryUnchangedStructureDataset):
+  """A `Dataset` for sharding its input."""
+
+  def __init__(self, input_dataset, num_shards, index):
+    """See `Dataset.shard()` for details."""
+    self._input_dataset = input_dataset
+    self._num_shards = ops.convert_to_tensor(
+        num_shards, dtype=dtypes.int64, name="num_shards")
+    self._index = ops.convert_to_tensor(index, dtype=dtypes.int64, name="index")
+    variant_tensor = gen_dataset_ops.shard_dataset(
+        input_dataset._variant_tensor,  # pylint: disable=protected-access
+        num_shards=self._num_shards,
+        index=self._index,
+        **flat_structure(self))
+    super(ShardDataset, self).__init__(input_dataset, variant_tensor)
 
 
 class BatchDataset(UnaryDataset):
@@ -2429,7 +2658,6 @@ class BatchDataset(UnaryDataset):
 
   def __init__(self, input_dataset, batch_size, drop_remainder):
     """See `Dataset.batch()` for details."""
-    super(BatchDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
     self._batch_size = ops.convert_to_tensor(
         batch_size, dtype=dtypes.int64, name="batch_size")
@@ -2445,13 +2673,12 @@ class BatchDataset(UnaryDataset):
           tensor_util.constant_value(self._batch_size))
     else:
       self._structure = input_dataset._element_structure._batch(None)
-
-  def _as_variant_tensor(self):
-    return gen_dataset_ops.batch_dataset_v2(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+    variant_tensor = gen_dataset_ops.batch_dataset_v2(
+        input_dataset._variant_tensor,  # pylint: disable=protected-access
         batch_size=self._batch_size,
         drop_remainder=self._drop_remainder,
         **flat_structure(self))
+    super(BatchDataset, self).__init__(input_dataset, variant_tensor)
 
   @property
   def _element_structure(self):
@@ -2573,7 +2800,7 @@ class PaddedBatchDataset(UnaryDataset):
   def __init__(self, input_dataset, batch_size, padded_shapes, padding_values,
                drop_remainder):
     """See `Dataset.batch()` for details."""
-    super(PaddedBatchDataset, self).__init__(input_dataset)
+    self._input_dataset = input_dataset
     if sparse.any_sparse(input_dataset.output_classes):
       # TODO(b/63669786): support batching of sparse tensors
       raise TypeError(
@@ -2616,12 +2843,11 @@ class PaddedBatchDataset(UnaryDataset):
         self._input_dataset.output_types, output_shapes,
         self._input_dataset.output_classes)
 
-  def _as_variant_tensor(self):
     # pylint: disable=protected-access
     # TODO(jsimsa): Switch to using v2 only any time after 6/30/2018.
     if smart_cond.smart_constant_value(self._drop_remainder) is False:
-      return gen_dataset_ops.padded_batch_dataset(
-          self._input_dataset._as_variant_tensor(),
+      variant_tensor = gen_dataset_ops.padded_batch_dataset(
+          input_dataset._variant_tensor,  # pylint: disable=protected-access
           batch_size=self._batch_size,
           padded_shapes=[
               ops.convert_to_tensor(s, dtype=dtypes.int64)
@@ -2630,8 +2856,8 @@ class PaddedBatchDataset(UnaryDataset):
           padding_values=nest.flatten(self._padding_values),
           output_shapes=self._structure._flat_shapes)
     else:
-      return gen_dataset_ops.padded_batch_dataset_v2(
-          self._input_dataset._as_variant_tensor(),
+      variant_tensor = gen_dataset_ops.padded_batch_dataset_v2(
+          input_dataset._variant_tensor,  # pylint: disable=protected-access
           batch_size=self._batch_size,
           padded_shapes=[
               ops.convert_to_tensor(s, dtype=dtypes.int64)
@@ -2640,6 +2866,7 @@ class PaddedBatchDataset(UnaryDataset):
           padding_values=nest.flatten(self._padding_values),
           drop_remainder=self._drop_remainder,
           output_shapes=self._structure._flat_shapes)
+    super(PaddedBatchDataset, self).__init__(input_dataset, variant_tensor)
 
   @property
   def _element_structure(self):
@@ -2651,24 +2878,6 @@ def _should_unpack_args(args):
   return type(args) is tuple  # pylint: disable=unidiomatic-typecheck
 
 
-def _warn_if_collections(transformation_name):
-  """Prints warning message if the current graph uses common graph collections.
-
-  NOTE(mrry): Currently a warning is only generated for lookup tables. Any
-  variables created will be automatically hoisted out to the outermost scope
-  using `init_scope()`. Some collections (such as for control-flow contexts)
-  are benign and should not generate a warning.
-
-  Args:
-    transformation_name: A human-readable name for the transformation.
-  """
-  if ops.get_default_graph().get_collection(ops.GraphKeys.TABLE_INITIALIZERS):
-    warnings.warn("Creating lookup tables inside a function passed to %s is not"
-                  " supported. Create each table outside the function, and "
-                  "capture it inside the function to use it."
-                  % transformation_name)
-
-
 class MapDataset(UnaryDataset):
   """A `Dataset` that maps a function over elements in its input."""
 
@@ -2676,24 +2885,25 @@ class MapDataset(UnaryDataset):
                input_dataset,
                map_func,
                use_inter_op_parallelism=True,
-               preserve_cardinality=False):
+               preserve_cardinality=False,
+               use_legacy_function=False):
     """See `Dataset.map()` for details."""
-    super(MapDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
     self._use_inter_op_parallelism = use_inter_op_parallelism
     self._preserve_cardinality = preserve_cardinality
     self._map_func = StructuredFunctionWrapper(
-        map_func, self._transformation_name(), dataset=input_dataset)
-
-  def _as_variant_tensor(self):
-    input_t = self._input_dataset._as_variant_tensor()  # pylint: disable=protected-access
-    return gen_dataset_ops.map_dataset(
-        input_t,
+        map_func,
+        self._transformation_name(),
+        dataset=input_dataset,
+        use_legacy_function=use_legacy_function)
+    variant_tensor = gen_dataset_ops.map_dataset(
+        input_dataset._variant_tensor,  # pylint: disable=protected-access
         self._map_func.function.captured_inputs,
         f=self._map_func.function,
         use_inter_op_parallelism=self._use_inter_op_parallelism,
         preserve_cardinality=self._preserve_cardinality,
         **flat_structure(self))
+    super(MapDataset, self).__init__(input_dataset, variant_tensor)
 
   def _functions(self):
     return [self._map_func]
@@ -2706,7 +2916,7 @@ class MapDataset(UnaryDataset):
     return "Dataset.map()"
 
 
-class ParallelMapDataset(MapDataset):
+class ParallelMapDataset(UnaryDataset):
   """A `Dataset` that maps a function over elements in its input in parallel."""
 
   def __init__(self,
@@ -2714,25 +2924,38 @@ class ParallelMapDataset(MapDataset):
                map_func,
                num_parallel_calls,
                use_inter_op_parallelism=True,
-               preserve_cardinality=False):
+               preserve_cardinality=False,
+               use_legacy_function=False):
     """See `Dataset.map()` for details."""
-    super(ParallelMapDataset, self).__init__(
-        input_dataset, map_func, use_inter_op_parallelism, preserve_cardinality)
-
+    self._input_dataset = input_dataset
+    self._use_inter_op_parallelism = use_inter_op_parallelism
+    self._map_func = StructuredFunctionWrapper(
+        map_func,
+        self._transformation_name(),
+        dataset=input_dataset,
+        use_legacy_function=use_legacy_function)
     self._num_parallel_calls = ops.convert_to_tensor(
         num_parallel_calls, dtype=dtypes.int32, name="num_parallel_calls")
-
-  def _as_variant_tensor(self):
-    # pylint: disable=protected-access
-    input_t = self._input_dataset._as_variant_tensor()
-    return gen_dataset_ops.parallel_map_dataset(
-        input_t,
+    self._preserve_cardinality = preserve_cardinality
+    variant_tensor = gen_dataset_ops.parallel_map_dataset(
+        input_dataset._variant_tensor,  # pylint: disable=protected-access
         self._map_func.function.captured_inputs,
         f=self._map_func.function,
         num_parallel_calls=self._num_parallel_calls,
         use_inter_op_parallelism=self._use_inter_op_parallelism,
         preserve_cardinality=self._preserve_cardinality,
         **flat_structure(self))
+    super(ParallelMapDataset, self).__init__(input_dataset, variant_tensor)
+
+  def _functions(self):
+    return [self._map_func]
+
+  @property
+  def _element_structure(self):
+    return self._map_func.output_structure
+
+  def _transformation_name(self):
+    return "Dataset.map()"
 
 
 class FlatMapDataset(UnaryDataset):
@@ -2740,24 +2963,21 @@ class FlatMapDataset(UnaryDataset):
 
   def __init__(self, input_dataset, map_func):
     """See `Dataset.flat_map()` for details."""
-    super(FlatMapDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
-
     self._map_func = StructuredFunctionWrapper(
         map_func, self._transformation_name(), dataset=input_dataset)
     if not isinstance(self._map_func.output_structure, DatasetStructure):
       raise TypeError("`map_func` must return a `Dataset` object.")
     self._structure = self._map_func.output_structure._element_structure  # pylint: disable=protected-access
-
-  def _functions(self):
-    return [self._map_func]
-
-  def _as_variant_tensor(self):
-    return gen_dataset_ops.flat_map_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+    variant_tensor = gen_dataset_ops.flat_map_dataset(
+        input_dataset._variant_tensor,  # pylint: disable=protected-access
         self._map_func.function.captured_inputs,
         f=self._map_func.function,
         **flat_structure(self))
+    super(FlatMapDataset, self).__init__(input_dataset, variant_tensor)
+
+  def _functions(self):
+    return [self._map_func]
 
   @property
   def _element_structure(self):
@@ -2767,58 +2987,79 @@ class FlatMapDataset(UnaryDataset):
     return "Dataset.flat_map()"
 
 
-class InterleaveDataset(FlatMapDataset):
+class InterleaveDataset(UnaryDataset):
   """A `Dataset` that maps a function over its input and interleaves the result.
   """
 
   def __init__(self, input_dataset, map_func, cycle_length, block_length):
     """See `Dataset.interleave()` for details."""
-    super(InterleaveDataset, self).__init__(input_dataset, map_func)
+    self._input_dataset = input_dataset
+    self._map_func = StructuredFunctionWrapper(
+        map_func, self._transformation_name(), dataset=input_dataset)
+    if not isinstance(self._map_func.output_structure, DatasetStructure):
+      raise TypeError("`map_func` must return a `Dataset` object.")
+    self._structure = self._map_func.output_structure._element_structure  # pylint: disable=protected-access
     self._cycle_length = ops.convert_to_tensor(
         cycle_length, dtype=dtypes.int64, name="cycle_length")
     self._block_length = ops.convert_to_tensor(
         block_length, dtype=dtypes.int64, name="block_length")
 
-  def _as_variant_tensor(self):
-    # pylint: disable=protected-access
-    return gen_dataset_ops.interleave_dataset(
-        self._input_dataset._as_variant_tensor(),
-        self._map_func.function.captured_inputs,
+    variant_tensor = gen_dataset_ops.interleave_dataset(
+        input_dataset._variant_tensor,  # pylint: disable=protected-access
+        self._map_func.function.captured_inputs,  # pylint: disable=protected-access
         self._cycle_length,
         self._block_length,
         f=self._map_func.function,
         **flat_structure(self))
+    super(InterleaveDataset, self).__init__(input_dataset, variant_tensor)
+
+  def _functions(self):
+    return [self._map_func]
+
+  @property
+  def _element_structure(self):
+    return self._structure
 
   def _transformation_name(self):
     return "Dataset.interleave()"
 
 
-class ParallelInterleaveDataset(FlatMapDataset):
+class ParallelInterleaveDataset(UnaryDataset):
   """A `Dataset` that maps a function over its input and interleaves the result.
-
   """
 
   def __init__(self, input_dataset, map_func, cycle_length, block_length,
                num_parallel_calls):
     """See `Dataset.interleave()` for details."""
-    super(ParallelInterleaveDataset, self).__init__(input_dataset, map_func)
+    self._input_dataset = input_dataset
+    self._map_func = StructuredFunctionWrapper(
+        map_func, self._transformation_name(), dataset=input_dataset)
+    if not isinstance(self._map_func.output_structure, DatasetStructure):
+      raise TypeError("`map_func` must return a `Dataset` object.")
+    self._structure = self._map_func.output_structure._element_structure  # pylint: disable=protected-access
     self._cycle_length = ops.convert_to_tensor(
         cycle_length, dtype=dtypes.int64, name="cycle_length")
     self._block_length = ops.convert_to_tensor(
         block_length, dtype=dtypes.int64, name="block_length")
     self._num_parallel_calls = ops.convert_to_tensor(
         num_parallel_calls, dtype=dtypes.int64, name="num_parallel_calls")
-
-  def _as_variant_tensor(self):
-    # pylint: disable=protected-access
-    return gen_dataset_ops.parallel_interleave_dataset_v2(
-        self._input_dataset._as_variant_tensor(),
-        self._map_func.function.captured_inputs,
+    variant_tensor = gen_dataset_ops.parallel_interleave_dataset_v2(
+        input_dataset._variant_tensor,  # pylint: disable=protected-access
+        self._map_func.function.captured_inputs,  # pylint: disable=protected-access
         self._cycle_length,
         self._block_length,
         self._num_parallel_calls,
         f=self._map_func.function,
         **flat_structure(self))
+    super(ParallelInterleaveDataset, self).__init__(input_dataset,
+                                                    variant_tensor)
+
+  def _functions(self):
+    return [self._map_func]
+
+  @property
+  def _element_structure(self):
+    return self._structure
 
   def _transformation_name(self):
     return "Dataset.interleave()"
@@ -2827,26 +3068,27 @@ class ParallelInterleaveDataset(FlatMapDataset):
 class FilterDataset(UnaryUnchangedStructureDataset):
   """A `Dataset` that filters its input according to a predicate function."""
 
-  def __init__(self, input_dataset, predicate):
+  def __init__(self, input_dataset, predicate, use_legacy_function=False):
     """See `Dataset.filter()` for details."""
-    super(FilterDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
     wrapped_func = StructuredFunctionWrapper(
-        predicate, self._transformation_name(), dataset=input_dataset)
+        predicate,
+        self._transformation_name(),
+        dataset=input_dataset,
+        use_legacy_function=use_legacy_function)
     if not wrapped_func.output_structure.is_compatible_with(
         structure_lib.TensorStructure(dtypes.bool, [])):
       raise ValueError("`predicate` must return a scalar boolean tensor.")
     self._predicate = wrapped_func
-
-  def _functions(self):
-    return [self._predicate]
-
-  def _as_variant_tensor(self):
-    return gen_dataset_ops.filter_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+    variant_tensor = gen_dataset_ops.filter_dataset(
+        input_dataset._variant_tensor,  # pylint: disable=protected-access
         other_arguments=self._predicate.function.captured_inputs,
         predicate=self._predicate.function,
         **flat_structure(self))
+    super(FilterDataset, self).__init__(input_dataset, variant_tensor)
+
+  def _functions(self):
+    return [self._predicate]
 
   def _transformation_name(self):
     return "Dataset.filter()"
@@ -2857,18 +3099,16 @@ class PrefetchDataset(UnaryUnchangedStructureDataset):
 
   def __init__(self, input_dataset, buffer_size):
     """See `Dataset.prefetch()` for details."""
-    super(PrefetchDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
     if buffer_size is None:
       buffer_size = -1  # This is the sentinel for auto-tuning.
     self._buffer_size = ops.convert_to_tensor(
         buffer_size, dtype=dtypes.int64, name="buffer_size")
-
-  def _as_variant_tensor(self):
-    return gen_dataset_ops.prefetch_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+    variant_tensor = gen_dataset_ops.prefetch_dataset(
+        input_dataset._variant_tensor,  # pylint: disable=protected-access
         buffer_size=self._buffer_size,
         **flat_structure(self))
+    super(PrefetchDataset, self).__init__(input_dataset, variant_tensor)
 
 
 class WindowDataset(UnaryDataset):
@@ -2876,7 +3116,6 @@ class WindowDataset(UnaryDataset):
 
   def __init__(self, input_dataset, size, shift, stride, drop_remainder):
     """See `window_dataset()` for more details."""
-    super(WindowDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
     self._size = ops.convert_to_tensor(size, dtype=dtypes.int64, name="size")
     self._shift = ops.convert_to_tensor(shift, dtype=dtypes.int64, name="shift")
@@ -2895,15 +3134,14 @@ class WindowDataset(UnaryDataset):
                 nest.flatten(input_dataset.output_types))
         ])
     self._structure = structure_lib.NestedStructure(nest_of_structures)
-
-  def _as_variant_tensor(self):
-    return gen_dataset_ops.window_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+    variant_tensor = gen_dataset_ops.window_dataset(
+        input_dataset._variant_tensor,  # pylint: disable=protected-access
         self._size,
         self._shift,
         self._stride,
         self._drop_remainder,
         **flat_structure(self))
+    super(WindowDataset, self).__init__(input_dataset, variant_tensor)
 
   @property
   def _element_structure(self):
@@ -2914,16 +3152,14 @@ class _OptionsDataset(UnaryUnchangedStructureDataset):
   """An identity `Dataset` that stores options."""
 
   def __init__(self, input_dataset, options):
-    super(_OptionsDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
     self._options = input_dataset.options()
     if self._options:
       self._options = self._options.merge(options)
     else:
       self._options = options
-
-  def _as_variant_tensor(self):
-    return self._input_dataset._as_variant_tensor()  # pylint: disable=protected-access
+    variant_tensor = input_dataset._variant_tensor  # pylint: disable=protected-access
+    super(_OptionsDataset, self).__init__(input_dataset, variant_tensor)
 
   def options(self):
     return self._options
@@ -2933,83 +3169,74 @@ class _ModelDataset(UnaryUnchangedStructureDataset):
   """A `Dataset` that acts as an identity, and models performance."""
 
   def __init__(self, input_dataset):
-    """See `optimize()` for details."""
-    super(_ModelDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
-
-  def _as_variant_tensor(self):
-    return gen_dataset_ops.model_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+    variant_tensor = gen_dataset_ops.model_dataset(
+        input_dataset._variant_tensor,  # pylint: disable=protected-access
         **flat_structure(self))
+    super(_ModelDataset, self).__init__(input_dataset, variant_tensor)
 
 
 class _OptimizeDataset(UnaryUnchangedStructureDataset):
   """A `Dataset` that acts as an identity, and applies optimizations."""
 
   def __init__(self, input_dataset, optimizations):
-    """See `optimize()` for details."""
-    super(_OptimizeDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
     if optimizations is None:
       optimizations = []
     self._optimizations = ops.convert_to_tensor(
         optimizations, dtype=dtypes.string, name="optimizations")
-
-  def _as_variant_tensor(self):
-    return gen_dataset_ops.optimize_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+    variant_tensor = gen_dataset_ops.optimize_dataset(
+        input_dataset._variant_tensor,  # pylint: disable=protected-access
         self._optimizations,
         **flat_structure(self))
+    super(_OptimizeDataset, self).__init__(input_dataset, variant_tensor)
 
 
 class _SetStatsAggregatorDataset(UnaryUnchangedStructureDataset):
   """A `Dataset` that acts as an identity, and sets a stats aggregator."""
 
   def __init__(self, input_dataset, aggregator, prefix, counter_prefix):
-    super(_SetStatsAggregatorDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
     self._stats_aggregator = aggregator
     self._prefix = prefix
     self._counter_prefix = counter_prefix
-
-  def _as_variant_tensor(self):
-    return ged_ops.experimental_set_stats_aggregator_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+    variant_tensor = ged_ops.experimental_set_stats_aggregator_dataset(
+        input_dataset._variant_tensor,  # pylint: disable=protected-access
         self._stats_aggregator._resource,  # pylint: disable=protected-access
         self._prefix,
         self._counter_prefix,
         **flat_structure(self))
+    super(_SetStatsAggregatorDataset, self).__init__(input_dataset,
+                                                     variant_tensor)
 
 
 class _MaxIntraOpParallelismDataset(UnaryUnchangedStructureDataset):
   """A `Dataset` that acts as an identity, overriding intra-op parallelism."""
 
   def __init__(self, input_dataset, max_intra_op_parallelism):
-    super(_MaxIntraOpParallelismDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
     self._max_intra_op_parallelism = ops.convert_to_tensor(
         max_intra_op_parallelism,
         dtype=dtypes.int64,
         name="max_intra_op_parallelism")
-
-  def _as_variant_tensor(self):
-    return ged_ops.experimental_max_intra_op_parallelism_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+    variant_tensor = ged_ops.experimental_max_intra_op_parallelism_dataset(
+        input_dataset._variant_tensor,  # pylint: disable=protected-access
         self._max_intra_op_parallelism,
         **flat_structure(self))
+    super(_MaxIntraOpParallelismDataset, self).__init__(input_dataset,
+                                                        variant_tensor)
 
 
 class _PrivateThreadPoolDataset(UnaryUnchangedStructureDataset):
   """A `Dataset` that acts as an identity, setting a private threadpool."""
 
   def __init__(self, input_dataset, num_threads):
-    super(_PrivateThreadPoolDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
     self._num_threads = ops.convert_to_tensor(
         num_threads, dtype=dtypes.int64, name="num_threads")
-
-  def _as_variant_tensor(self):
-    return ged_ops.experimental_private_thread_pool_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+    variant_tensor = ged_ops.experimental_private_thread_pool_dataset(
+        input_dataset._variant_tensor,  # pylint: disable=protected-access
         self._num_threads,
         **flat_structure(self))
+    super(_PrivateThreadPoolDataset, self).__init__(input_dataset,
+                                                    variant_tensor)
diff --git a/tensorflow/python/data/ops/iterator_ops.py b/tensorflow/python/data/ops/iterator_ops.py
index d0e91b01f9138470cd2a06a8b353149b74af2497..efa90209512b09b2b9727275b6e6ef3dfe7df529 100644
--- a/tensorflow/python/data/ops/iterator_ops.py
+++ b/tensorflow/python/data/ops/iterator_ops.py
@@ -31,8 +31,8 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import gen_dataset_ops
 from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.training.saver import BaseSaverBuilder
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -68,7 +68,7 @@ def _device_stack_is_empty():
 
 
 @tf_export(v1=["data.Iterator"])
-class Iterator(checkpointable.CheckpointableBase):
+class Iterator(trackable.Trackable):
   """Represents the state of iterating through a `Dataset`."""
 
   def __init__(self, iterator_resource, initializer, output_types,
@@ -357,7 +357,7 @@ class Iterator(checkpointable.CheckpointableBase):
                           (self.output_shapes, dataset.output_shapes))
     with ops.colocate_with(self._iterator_resource):
       return gen_dataset_ops.make_iterator(
-          dataset._as_variant_tensor(), self._iterator_resource, name=name)  # pylint: disable=protected-access
+          dataset._variant_tensor, self._iterator_resource, name=name)  # pylint: disable=protected-access
 
   def get_next(self, name=None):
     """Returns a nested structure of `tf.Tensor`s representing the next element.
@@ -491,7 +491,7 @@ def _generate_shared_name(prefix):
   return "{}{}".format(prefix, uid)
 
 
-class EagerIterator(checkpointable.CheckpointableBase):
+class EagerIterator(trackable.Trackable):
   """An iterator producing tf.Tensor objects from a tf.data.Dataset."""
 
   def __init__(self, dataset):
@@ -524,7 +524,7 @@ class EagerIterator(checkpointable.CheckpointableBase):
     with ops.device("/cpu:0"):
       # pylint: disable=protected-access
       dataset = dataset._apply_options()
-      ds_variant = dataset._as_variant_tensor()
+      ds_variant = dataset._variant_tensor
       self._structure = structure_lib.convert_legacy_structure(
           dataset.output_types, dataset.output_shapes, dataset.output_classes)
       self._flat_output_types = self._structure._flat_types
@@ -641,7 +641,7 @@ class EagerIterator(checkpointable.CheckpointableBase):
     return {"ITERATOR": _saveable_factory}
 
 
-# TODO(b/71645805): Expose checkpointable stateful objects from dataset
+# TODO(b/71645805): Expose trackable stateful objects from dataset
 # attributes(potential).
 class _IteratorSaveable(BaseSaverBuilder.SaveableObject):
   """SaveableObject for saving/restoring iterator state."""
diff --git a/tensorflow/python/data/ops/multi_device_iterator_ops.py b/tensorflow/python/data/ops/multi_device_iterator_ops.py
index 7586012574d39d7409e28f0d830a5fdadb25b61c..2592282104877a13e3f28290eb80801382667249 100644
--- a/tensorflow/python/data/ops/multi_device_iterator_ops.py
+++ b/tensorflow/python/data/ops/multi_device_iterator_ops.py
@@ -17,7 +17,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.data.experimental.ops import optimization_options
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.eager import context
@@ -29,14 +28,14 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import resource_variable_ops
 
 
-class _PerDeviceGenerator(dataset_ops.Dataset):
+class _PerDeviceGenerator(dataset_ops.DatasetV2):
   """A `dummy` generator dataset."""
 
   def __init__(self, shard_num, multi_device_iterator_resource, incarnation_id,
-               source_device, target_device, element_structure):
-    self._target_device = target_device
+               source_device, element_structure):
     self._structure = element_structure
 
     multi_device_iterator_string_handle = (
@@ -90,6 +89,11 @@ class _PerDeviceGenerator(dataset_ops.Dataset):
     self._next_func = _remote_next_func._get_concrete_function_internal()  # pylint: disable=protected-access
     self._next_captured_args = self._next_func.captured_inputs
 
+    self._incarnation_id_index = -1
+    for i, arg in enumerate(self._next_captured_args):
+      if arg == incarnation_id:
+        self._incarnation_id_index = i
+
     @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)])
     def _finalize_func(unused_string_handle):
       return array_ops.constant(0, dtypes.int64)
@@ -108,16 +112,15 @@ class _PerDeviceGenerator(dataset_ops.Dataset):
     )
     self._finalize_captured_args = self._finalize_func.captured_inputs
 
-  def _as_variant_tensor(self):
-    with ops.device(self._target_device):
-      return gen_dataset_ops.generator_dataset(
-          self._init_captured_args,
-          self._next_captured_args,
-          self._finalize_captured_args,
-          init_func=self._init_func,
-          next_func=self._next_func,
-          finalize_func=self._finalize_func,
-          **dataset_ops.flat_structure(self))
+    variant_tensor = gen_dataset_ops.generator_dataset(
+        self._init_captured_args,
+        self._next_captured_args,
+        self._finalize_captured_args,
+        init_func=self._init_func,
+        next_func=self._next_func,
+        finalize_func=self._finalize_func,
+        **dataset_ops.flat_structure(self))
+    super(_PerDeviceGenerator, self).__init__(variant_tensor)
 
   def _inputs(self):
     # TODO(b/116506223): Determine which datasets should be used as inputs here.
@@ -128,15 +131,52 @@ class _PerDeviceGenerator(dataset_ops.Dataset):
     return self._structure
 
 
-class MultiDeviceIterator(object):
-  """An iterator over multiple devices.
+class _ReincarnatedPerDeviceGenerator(dataset_ops.DatasetV2):
+  """Creates a _PerDeviceGenerator-like dataset with a new incarnation_id.
 
-  @compatibility(eager)
-  MultiDeviceIterator isn't currently supported in Eager mode but support is
-  coming soon.
-  @end_compatibility
+  Re-uses the functions from the provided per_device_dataset and just switches
+  out the function argument corresponding to the incarnation_id.
   """
 
+  def __init__(self, per_device_dataset, incarnation_id):
+    # pylint: disable=protected-access
+    self._structure = per_device_dataset._structure
+
+    self._init_func = per_device_dataset._init_func
+    self._init_captured_args = self._init_func.captured_inputs
+
+    self._next_func = per_device_dataset._next_func
+    self._next_captured_args = per_device_dataset._next_captured_args
+    # The captured arguments to the next_func are string_handle, incarnation_id.
+    # We update the incarnation id to the new one.
+    self._next_captured_args[
+        per_device_dataset._incarnation_id_index] = incarnation_id
+
+    self._finalize_func = per_device_dataset._finalize_func
+    self._finalize_captured_args = per_device_dataset._finalize_captured_args
+
+    variant_tensor = gen_dataset_ops.generator_dataset(
+        self._init_captured_args,
+        self._next_captured_args,
+        self._finalize_captured_args,
+        init_func=self._init_func,
+        next_func=self._next_func,
+        finalize_func=self._finalize_func,
+        **dataset_ops.flat_structure(self))
+    super(_ReincarnatedPerDeviceGenerator, self).__init__(variant_tensor)
+
+  def _inputs(self):
+    # TODO(b/116506223): Determine which datasets should be used as inputs here.
+    return []
+
+  @property
+  def _element_structure(self):
+    return self._structure
+
+
+class MultiDeviceIterator(object):
+  """An iterator over multiple devices."""
+
   def __init__(self,
                dataset,
                devices,
@@ -153,33 +193,55 @@ class MultiDeviceIterator(object):
         to prefetch into.
       source_device: The host device to place the `dataset` on.
 
+      In order to prevent deadlocks, if the prefetch_buffer_size is greater
+      than the max_buffer_size, we set the max_buffer_size to
+      prefetch_buffer_size.
+
     Raises:
       RuntimeError: If run in Eager mode.
     """
-    if context.executing_eagerly():
-      # TODO(rohanj): Fix this. Tracking bug: b/116467184
-      raise RuntimeError("MultiDeviceIterator is not currently supported in "
-                         "Eager mode.")
     self._dataset = dataset._apply_options()  # pylint: disable=protected-access
     self._devices = devices
     self._source_device = source_device
     self._source_device_tensor = ops.convert_to_tensor(source_device)
+    self._max_buffer_size = max_buffer_size
+    self._prefetch_buffer_size = prefetch_buffer_size
+
+    if self._prefetch_buffer_size > self._max_buffer_size:
+      self._max_buffer_size = self._prefetch_buffer_size
 
     # Create the MultiDeviceIterator.
     with ops.device(self._source_device):
+      # TODO(b/121378567): Get rid of this shared_name hack.
+      shared_name = ""
+      if context.executing_eagerly():
+        shared_name = context.shared_name()
       self._multi_device_iterator_resource = (
           gen_dataset_ops.multi_device_iterator(
               devices=self._devices,
-              shared_name="",
+              shared_name=shared_name,
               container="",
-              **dataset_ops.flat_structure(dataset)))
+              **dataset_ops.flat_structure(self._dataset)))
+      if context.executing_eagerly():
+        # Delete the resource when this object is deleted
+        self._resource_deleter = resource_variable_ops.EagerResourceDeleter(
+            handle=self._multi_device_iterator_resource,
+            handle_device=self._source_device)
 
       # The incarnation ID is used to ensure consistency between the per-device
       # iterators and the multi-device iterator.
       self._incarnation_id = gen_dataset_ops.multi_device_iterator_init(
-          self._dataset._as_variant_tensor(),  # pylint: disable=protected-access
+          self._dataset._variant_tensor,  # pylint: disable=protected-access
           self._multi_device_iterator_resource,
-          max_buffer_size=max_buffer_size)
+          max_buffer_size=self._max_buffer_size)
+
+    self._prototype_device_datasets = []
+    for i, device in enumerate(self._devices):
+      with ops.device(device):
+        ds = _PerDeviceGenerator(
+            i, self._multi_device_iterator_resource, self._incarnation_id,
+            self._source_device_tensor, self._dataset._element_structure)  # pylint: disable=protected-access
+        self._prototype_device_datasets.append(ds)
 
     # TODO(rohanj): Explore the possibility of the MultiDeviceIterator to
     # initialize the device side of the pipeline. This would allow the
@@ -188,28 +250,40 @@ class MultiDeviceIterator(object):
     # Create the per device iterators.
     self._device_iterators = []
     for i, device in enumerate(self._devices):
-      ds = _PerDeviceGenerator(
-          i, self._multi_device_iterator_resource, self._incarnation_id,
-          self._source_device_tensor, device, dataset._element_structure)  # pylint: disable=protected-access
-      if prefetch_buffer_size > 0:
-        ds = ds.prefetch(prefetch_buffer_size)
-      # TODO(jsimsa): Enable auto-tuning and optimizations when supported for
-      # non-CPU devices.
-      options = dataset_ops.Options()
-      options.experimental_autotune = False
-      opt_options = optimization_options.OptimizationOptions()
-      opt_options.apply_default_optimizations = False
-      options.experimental_optimization = opt_options
-      ds = ds.with_options(options)
       with ops.device(device):
-        self._device_iterators.append(ds.make_initializable_iterator())
+        ds = self._create_device_dataset(i)
+        if context.executing_eagerly():
+          self._device_iterators.append(dataset_ops.make_one_shot_iterator(ds))
+        else:
+          self._device_iterators.append(
+              dataset_ops.make_initializable_iterator(ds))
+
+    if not context.executing_eagerly():
+      device_iterator_initializers = [
+          iterator.initializer for iterator in self._device_iterators
+      ]
+      self._initializer = control_flow_ops.group(*device_iterator_initializers)
+
+  def _create_device_dataset(self, i):
+    """Uses _prototype_device_datasets[i] to build a dataset for the device."""
+    ds = self._prototype_device_datasets[i]
+    ds = _ReincarnatedPerDeviceGenerator(ds, self._incarnation_id)
+    if self._prefetch_buffer_size > 0:
+      ds = ds.prefetch(self._prefetch_buffer_size)
+    # TODO(jsimsa): Enable auto-tuning and optimizations when supported for
+    # non-CPU devices.
+    options = dataset_ops.Options()
+    options.experimental_autotune = False
+    options.experimental_optimization.apply_default_optimizations = False
+    ds = ds.with_options(options)
+    return ds
+
+  def get_next(self, device=None):
+    """Returns the next element given a `device`, else returns all in a list."""
+    if device is not None:
+      index = self._devices.index(device)
+      return self._device_iterators[index].get_next()
 
-    device_iterator_initializers = [
-        iterator.initializer for iterator in self._device_iterators
-    ]
-    self._initializer = control_flow_ops.group(*device_iterator_initializers)
-
-  def get_next(self):
     result = []
     for i, device in enumerate(self._devices):
       with ops.device(device):
@@ -226,8 +300,27 @@ class MultiDeviceIterator(object):
 
   @property
   def initializer(self):
+    if context.executing_eagerly():
+      return control_flow_ops.no_op()
     return self._initializer
 
+  def _eager_reset(self):
+    """Resets the MultiDeviceIterator in eager mode."""
+    if not context.executing_eagerly():
+      raise ValueError("Eager reset is only supported in eager mode.")
+    # pylint: disable=protected-access
+    self._incarnation_id = gen_dataset_ops.multi_device_iterator_init(
+        self._dataset._variant_tensor,
+        self._multi_device_iterator_resource,
+        max_buffer_size=self._max_buffer_size)
+    for i, device in enumerate(self._devices):
+      with ops.device(device):
+        ds = self._create_device_dataset(i)
+        # Reset the device iterator resources with the new dataset.
+        ds_variant = ds._variant_tensor
+        gen_dataset_ops.make_iterator(ds_variant,
+                                      self._device_iterators[i]._resource)
+
   @property
   def output_types(self):
     return self._dataset.output_types
diff --git a/tensorflow/python/data/ops/readers.py b/tensorflow/python/data/ops/readers.py
index 0d6023dea28e3cefa13b32717e2aee87ac2c2bbf..5e61bcf6be0a099b75d9190aad17a6046e70c665 100644
--- a/tensorflow/python/data/ops/readers.py
+++ b/tensorflow/python/data/ops/readers.py
@@ -49,7 +49,6 @@ class TextLineDatasetV2(dataset_ops.DatasetSource):
         to buffer. A value of 0 results in the default buffering values chosen
         based on the compression type.
     """
-    super(TextLineDatasetV2, self).__init__()
     self._filenames = ops.convert_to_tensor(
         filenames, dtype=dtypes.string, name="filenames")
     self._compression_type = convert.optional_param_to_tensor(
@@ -59,10 +58,9 @@ class TextLineDatasetV2(dataset_ops.DatasetSource):
         argument_dtype=dtypes.string)
     self._buffer_size = convert.optional_param_to_tensor(
         "buffer_size", buffer_size, _DEFAULT_READER_BUFFER_SIZE_BYTES)
-
-  def _as_variant_tensor(self):
-    return gen_dataset_ops.text_line_dataset(
+    variant_tensor = gen_dataset_ops.text_line_dataset(
         self._filenames, self._compression_type, self._buffer_size)
+    super(TextLineDatasetV2, self).__init__(variant_tensor)
 
   @property
   def _element_structure(self):
@@ -100,7 +98,6 @@ class _TFRecordDataset(dataset_ops.DatasetSource):
       buffer_size: (Optional.) A `tf.int64` scalar representing the number of
         bytes in the read buffer. 0 means no buffering.
     """
-    super(_TFRecordDataset, self).__init__()
     # Force the type to string even if filenames is an empty list.
     self._filenames = ops.convert_to_tensor(
         filenames, dtypes.string, name="filenames")
@@ -113,24 +110,32 @@ class _TFRecordDataset(dataset_ops.DatasetSource):
         "buffer_size",
         buffer_size,
         argument_default=_DEFAULT_READER_BUFFER_SIZE_BYTES)
-
-  def _as_variant_tensor(self):
-    return gen_dataset_ops.tf_record_dataset(
+    variant_tensor = gen_dataset_ops.tf_record_dataset(
         self._filenames, self._compression_type, self._buffer_size)
+    super(_TFRecordDataset, self).__init__(variant_tensor)
 
   @property
   def _element_structure(self):
     return structure.TensorStructure(dtypes.string, [])
 
 
-class ParallelInterleaveDataset(dataset_ops.InterleaveDataset):
+class ParallelInterleaveDataset(dataset_ops.UnaryDataset):
   """A `Dataset` that maps a function over its input and flattens the result."""
 
   def __init__(self, input_dataset, map_func, cycle_length, block_length,
                sloppy, buffer_output_elements, prefetch_input_elements):
     """See `tf.data.experimental.parallel_interleave()` for details."""
-    super(ParallelInterleaveDataset, self).__init__(input_dataset, map_func,
-                                                    cycle_length, block_length)
+    self._input_dataset = input_dataset
+    self._map_func = dataset_ops.StructuredFunctionWrapper(
+        map_func, self._transformation_name(), dataset=input_dataset)
+    if not isinstance(self._map_func.output_structure,
+                      dataset_ops.DatasetStructure):
+      raise TypeError("`map_func` must return a `Dataset` object.")
+    self._structure = self._map_func.output_structure._element_structure  # pylint: disable=protected-access
+    self._cycle_length = ops.convert_to_tensor(
+        cycle_length, dtype=dtypes.int64, name="cycle_length")
+    self._block_length = ops.convert_to_tensor(
+        block_length, dtype=dtypes.int64, name="block_length")
     self._sloppy = ops.convert_to_tensor(
         sloppy, dtype=dtypes.bool, name="sloppy")
     self._buffer_output_elements = convert.optional_param_to_tensor(
@@ -141,11 +146,8 @@ class ParallelInterleaveDataset(dataset_ops.InterleaveDataset):
         "prefetch_input_elements",
         prefetch_input_elements,
         argument_default=2 * cycle_length)
-
-  def _as_variant_tensor(self):
-    # pylint: disable=protected-access
-    return ged_ops.experimental_parallel_interleave_dataset(
-        self._input_dataset._as_variant_tensor(),
+    variant_tensor = ged_ops.experimental_parallel_interleave_dataset(
+        self._input_dataset._variant_tensor,  # pylint: disable=protected-access
         self._map_func.function.captured_inputs,
         self._cycle_length,
         self._block_length,
@@ -154,7 +156,15 @@ class ParallelInterleaveDataset(dataset_ops.InterleaveDataset):
         self._prefetch_input_elements,
         f=self._map_func.function,
         **dataset_ops.flat_structure(self))
-    # pylint: enable=protected-access
+    super(ParallelInterleaveDataset, self).__init__(input_dataset,
+                                                    variant_tensor)
+
+  def _functions(self):
+    return [self._map_func]
+
+  @property
+  def _element_structure(self):
+    return self._structure
 
   def _transformation_name(self):
     return "tf.data.experimental.parallel_interleave()"
@@ -186,7 +196,6 @@ class TFRecordDatasetV2(dataset_ops.DatasetV2):
       TypeError: If any argument does not have the expected type.
       ValueError: If any argument does not have the expected shape.
     """
-    super(TFRecordDatasetV2, self).__init__()
     if isinstance(filenames, dataset_ops.DatasetV2):
       if filenames.output_types != dtypes.string:
         raise TypeError(
@@ -215,6 +224,8 @@ class TFRecordDatasetV2(dataset_ops.DatasetV2):
           filenames, read_one_file, cycle_length=num_parallel_reads,
           block_length=1, sloppy=False, buffer_output_elements=None,
           prefetch_input_elements=None)
+    variant_tensor = self._impl._variant_tensor  # pylint: disable=protected-access
+    super(TFRecordDatasetV2, self).__init__(variant_tensor)
 
   def _clone(self,
              filenames=None,
@@ -226,9 +237,6 @@ class TFRecordDatasetV2(dataset_ops.DatasetV2):
                              buffer_size or self._buffer_size,
                              num_parallel_reads or self._num_parallel_reads)
 
-  def _as_variant_tensor(self):
-    return self._impl._as_variant_tensor()  # pylint: disable=protected-access
-
   def _inputs(self):
     return self._impl._inputs()  # pylint: disable=protected-access
 
@@ -295,7 +303,6 @@ class FixedLengthRecordDatasetV2(dataset_ops.DatasetSource):
       compression_type: (Optional.) A `tf.string` scalar evaluating to one of
         `""` (no compression), `"ZLIB"`, or `"GZIP"`.
     """
-    super(FixedLengthRecordDatasetV2, self).__init__()
     self._filenames = ops.convert_to_tensor(
         filenames, dtype=dtypes.string, name="filenames")
     self._record_bytes = ops.convert_to_tensor(
@@ -312,17 +319,16 @@ class FixedLengthRecordDatasetV2(dataset_ops.DatasetSource):
         compression_type,
         argument_default="",
         argument_dtype=dtypes.string)
-
-  def _as_variant_tensor(self):
     if (self._compression_type is not None or
         compat.forward_compatible(2018, 11, 30)):
-      return gen_dataset_ops.fixed_length_record_dataset_v2(
+      variant_tensor = gen_dataset_ops.fixed_length_record_dataset_v2(
           self._filenames, self._header_bytes, self._record_bytes,
           self._footer_bytes, self._buffer_size, self._compression_type)
     else:
-      return gen_dataset_ops.fixed_length_record_dataset(
+      variant_tensor = gen_dataset_ops.fixed_length_record_dataset(
           self._filenames, self._header_bytes, self._record_bytes,
           self._footer_bytes, self._buffer_size)
+    super(FixedLengthRecordDatasetV2, self).__init__(variant_tensor)
 
   @property
   def _element_structure(self):
diff --git a/tensorflow/python/data/util/BUILD b/tensorflow/python/data/util/BUILD
index 04e80299e0d57965c21b88bd94250cb62e76d452..c98b1f17293334f9654a7cb5faa0accd1b7d8ac8 100644
--- a/tensorflow/python/data/util/BUILD
+++ b/tensorflow/python/data/util/BUILD
@@ -163,3 +163,24 @@ py_test(
         "//tensorflow/python:util",
     ],
 )
+
+py_library(
+    name = "traverse",
+    srcs = ["traverse.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+    ],
+)
+
+py_test(
+    name = "traverse_test",
+    size = "small",
+    srcs = ["traverse_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":traverse",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
diff --git a/tensorflow/python/data/util/nest.py b/tensorflow/python/data/util/nest.py
index e5abc654da77cd9409f52d3ba5c8868c0916c712..ebfd8af34233516d3f447d03735371b7e2be8f22 100644
--- a/tensorflow/python/data/util/nest.py
+++ b/tensorflow/python/data/util/nest.py
@@ -46,7 +46,7 @@ from tensorflow.python.framework import sparse_tensor as _sparse_tensor
 def _sorted(dict_):
   """Returns a sorted list of the dict keys, with error if keys not sortable."""
   try:
-    return sorted(_six.iterkeys(dict_))
+    return sorted(list(dict_))
   except TypeError:
     raise TypeError("nest only supports dicts with sortable keys.")
 
@@ -68,7 +68,7 @@ def _sequence_like(instance, args):
     # ordered and plain dicts (e.g., flattening a dict but using a
     # corresponding `OrderedDict` to pack it back).
     result = dict(zip(_sorted(instance), args))
-    return type(instance)((key, result[key]) for key in _six.iterkeys(instance))
+    return type(instance)((key, result[key]) for key in instance)
   elif (isinstance(instance, tuple) and
         hasattr(instance, "_fields") and
         isinstance(instance._fields, _collections.Sequence) and
@@ -317,8 +317,7 @@ def assert_shallow_structure(shallow_tree, input_tree, check_types=True):
         raise ValueError(
             "The two structures don't have the same keys. Input "
             "structure has keys %s, while shallow structure has keys %s." %
-            (list(_six.iterkeys(input_tree)),
-             list(_six.iterkeys(shallow_tree))))
+            (list(input_tree), list(shallow_tree)))
       input_tree = list(sorted(_six.iteritems(input_tree)))
       shallow_tree = list(sorted(_six.iteritems(shallow_tree)))
 
diff --git a/tensorflow/python/data/util/options.py b/tensorflow/python/data/util/options.py
index 9badba8e5670c749b833da7f1e2094f4f3548098..3c79197fae8d6df91ba477db8f9475dfd3fb61c9 100644
--- a/tensorflow/python/data/util/options.py
+++ b/tensorflow/python/data/util/options.py
@@ -31,7 +31,8 @@ class OptionsBase(object):
   """
 
   def __init__(self):
-    self._options = {}
+    # NOTE: Cannot use `self._options` here as we override `__setattr__`
+    object.__setattr__(self, "_options", {})
 
   def __eq__(self, other):
     if not isinstance(other, self.__class__):
@@ -47,28 +48,40 @@ class OptionsBase(object):
     else:
       return NotImplemented
 
+  def __setattr__(self, name, value):
+    if hasattr(self, name):
+      object.__setattr__(self, name, value)
+    else:
+      raise AttributeError(
+          "Cannot set the property %s on %s." % (name, type(self).__name__))
+
 
-def create_option(name, ty, docstring, default=None):
+def create_option(name, ty, docstring, default_factory=lambda: None):
   """Creates a type-checked property.
 
   Args:
-    name: the name to use
-    ty: the type to use
-    docstring: the docstring to use
-    default: the default value to use
+    name: The name to use.
+    ty: The type to use. The type of the property will be validated when it
+      is set.
+    docstring: The docstring to use.
+    default_factory: A callable that takes no arguments and returns a default
+      value to use if not set.
 
   Returns:
     A type-checked property.
   """
 
-  def get_fn(self):
-    return self._options.get(name, default)  # pylint: disable=protected-access
+  def get_fn(option):
+    # pylint: disable=protected-access
+    if name not in option._options:
+      option._options[name] = default_factory()
+    return option._options.get(name)
 
-  def set_fn(self, value):
+  def set_fn(option, value):
     if not isinstance(value, ty):
       raise TypeError("Property \"%s\" must be of type %s, got: %r (type: %r)" %
                       (name, ty, value, type(value)))
-    self._options[name] = value  # pylint: disable=protected-access
+    option._options[name] = value  # pylint: disable=protected-access
 
   return property(get_fn, set_fn, None, docstring)
 
diff --git a/tensorflow/python/data/util/options_test.py b/tensorflow/python/data/util/options_test.py
index c5169835a322923d7bf2d644717870d87bfab13f..b21afbd455db6c7f3da61df3e1dd8a4897603b85 100644
--- a/tensorflow/python/data/util/options_test.py
+++ b/tensorflow/python/data/util/options_test.py
@@ -24,9 +24,12 @@ from tensorflow.python.platform import test
 
 class _TestOptions(options.OptionsBase):
   x = options.create_option(
-      name="x", ty=int, docstring="the answer to everything", default=42)
+      name="x",
+      ty=int,
+      docstring="the answer to everything",
+      default_factory=lambda: 42)
   y = options.create_option(
-      name="y", ty=float, docstring="a tasty pie", default=3.14)
+      name="y", ty=float, docstring="a tasty pie", default_factory=lambda: 3.14)
 
 
 class _NestedTestOptions(options.OptionsBase):
@@ -91,6 +94,13 @@ class OptionsTest(test.TestCase):
     with self.assertRaises(TypeError):
       options.merge_options(options1, options2)
 
+  def testNoSpuriousAttrs(self):
+    test_options = _TestOptions()
+    with self.assertRaises(AttributeError):
+      test_options.wrong_attr = True
+    with self.assertRaises(AttributeError):
+      _ = test_options.wrong_attr
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/util/traverse.py b/tensorflow/python/data/util/traverse.py
new file mode 100644
index 0000000000000000000000000000000000000000..12e576fb41431740e360a038787c8217f6d398c1
--- /dev/null
+++ b/tensorflow/python/data/util/traverse.py
@@ -0,0 +1,56 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Helpers to traverse the Dataset dependency structure."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from six.moves import queue as Queue  # pylint: disable=redefined-builtin
+
+from tensorflow.python.framework import dtypes
+
+
+def obtain_all_variant_tensor_ops(dataset):
+  """Given an input dataset, finds all dataset ops used for construction.
+
+  A series of transformations would have created this dataset with each
+  transformation including zero or more Dataset ops, each producing a dataset
+  variant tensor. This method outputs all of them.
+
+  Args:
+    dataset: Dataset to find variant tensors for.
+
+  Returns:
+    A list of variant_tensor producing dataset ops used to construct this
+    dataset.
+  """
+  all_variant_tensor_ops = []
+  bfs_q = Queue.Queue()
+  bfs_q.put(dataset._variant_tensor.op)  # pylint: disable=protected-access
+  visited = []
+  while not bfs_q.empty():
+    op = bfs_q.get()
+    visited.append(op)
+    # We look for all ops that produce variant tensors as output. This is a bit
+    # of overkill but the other dataset _inputs() traversal strategies can't
+    # cover the case of function inputs that capture dataset variants.
+    # TODO(b/120873778): Make this more efficient.
+    if op.outputs[0].dtype == dtypes.variant:
+      all_variant_tensor_ops.append(op)
+    for i in op.inputs:
+      input_op = i.op
+      if input_op not in visited:
+        bfs_q.put(input_op)
+  return all_variant_tensor_ops
diff --git a/tensorflow/python/data/util/traverse_test.py b/tensorflow/python/data/util/traverse_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..53de1be897a3b2fe986558d9d4695c67f08d6ff0
--- /dev/null
+++ b/tensorflow/python/data/util/traverse_test.py
@@ -0,0 +1,109 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for utilities for traversing the dataset construction graph."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import traverse
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class _TestDataset(dataset_ops.UnaryUnchangedStructureDataset):
+
+  def __init__(self, input_dataset):
+    self._input_dataset = input_dataset
+    temp_variant_tensor = gen_dataset_ops.prefetch_dataset(
+        input_dataset._variant_tensor,
+        buffer_size=1,
+        **dataset_ops.flat_structure(self))
+    variant_tensor = gen_dataset_ops.model_dataset(
+        temp_variant_tensor, **dataset_ops.flat_structure(self))
+    super(_TestDataset, self).__init__(input_dataset, variant_tensor)
+
+
+class TraverseTest(test.TestCase):
+
+  @test_util.run_deprecated_v1
+  def testOnlySource(self):
+    ds = dataset_ops.Dataset.range(10)
+    variant_tensor_ops = traverse.obtain_all_variant_tensor_ops(ds)
+    self.assertAllEqual(["RangeDataset"], [x.name for x in variant_tensor_ops])
+
+  @test_util.run_deprecated_v1
+  def testSimplePipeline(self):
+    ds = dataset_ops.Dataset.range(10).map(math_ops.square)
+    variant_tensor_ops = traverse.obtain_all_variant_tensor_ops(ds)
+    self.assertSetEqual(
+        set(["MapDataset", "RangeDataset"]),
+        set([x.name for x in variant_tensor_ops]))
+
+  @test_util.run_deprecated_v1
+  def testConcat(self):
+    ds1 = dataset_ops.Dataset.range(10)
+    ds2 = dataset_ops.Dataset.range(10)
+    ds = ds1.concatenate(ds2)
+    variant_tensor_ops = traverse.obtain_all_variant_tensor_ops(ds)
+    self.assertSetEqual(
+        set(["ConcatenateDataset", "RangeDataset", "RangeDataset_1"]),
+        set([x.name for x in variant_tensor_ops]))
+
+  @test_util.run_deprecated_v1
+  def testZip(self):
+    ds1 = dataset_ops.Dataset.range(10)
+    ds2 = dataset_ops.Dataset.range(10)
+    ds = dataset_ops.Dataset.zip((ds1, ds2))
+    variant_tensor_ops = traverse.obtain_all_variant_tensor_ops(ds)
+    self.assertSetEqual(
+        set(["ZipDataset", "RangeDataset", "RangeDataset_1"]),
+        set([x.name for x in variant_tensor_ops]))
+
+  @test_util.run_deprecated_v1
+  def testMultipleVariantTensors(self):
+    ds = dataset_ops.Dataset.range(10)
+    ds = _TestDataset(ds)
+    variant_tensor_ops = traverse.obtain_all_variant_tensor_ops(ds)
+    self.assertSetEqual(
+        set(["RangeDataset", "ModelDataset", "PrefetchDataset"]),
+        set([x.name for x in variant_tensor_ops]))
+
+  @test_util.run_deprecated_v1
+  def testFlatMap(self):
+    ds1 = dataset_ops.Dataset.range(10).repeat(10)
+
+    def map_fn(ds):
+
+      def _map(x):
+        return ds.batch(x)
+
+      return _map
+
+    ds2 = dataset_ops.Dataset.range(20).prefetch(1)
+    ds2 = ds2.flat_map(map_fn(ds1))
+    variant_tensor_ops = traverse.obtain_all_variant_tensor_ops(ds2)
+    self.assertSetEqual(
+        set([
+            "FlatMapDataset", "PrefetchDataset", "RepeatDataset",
+            "RangeDataset", "RangeDataset_1"
+        ]), set([x.name for x in variant_tensor_ops]))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/debug/BUILD b/tensorflow/python/debug/BUILD
index c6abd476d9d274a3aab270a548f5b0ebd3b6d257..3b1b214a6f458430876f9467a51fae6b8054c46a 100644
--- a/tensorflow/python/debug/BUILD
+++ b/tensorflow/python/debug/BUILD
@@ -19,6 +19,7 @@ exports_files(["LICENSE"])
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 load("//tensorflow:tensorflow.bzl", "py_test")
 load("//tensorflow:tensorflow.bzl", "py_binary")
+load("//tensorflow:tensorflow.bzl", "if_not_v2")
 load("//tensorflow:tensorflow.bzl", "if_not_windows")
 
 py_library(
@@ -46,7 +47,7 @@ py_library(
         ":cli_test_utils",
         ":debug_py",
         ":grpc_debug_test_server",
-        ":offline_analyzer",
+        ":offline_analyzer_lib",
         ":session_debug_testlib",
         ":source_remote",
     ] + if_not_windows([
@@ -392,6 +393,13 @@ py_binary(
     name = "offline_analyzer",
     srcs = ["cli/offline_analyzer.py"],
     srcs_version = "PY2AND3",
+    deps = [":offline_analyzer_lib"],
+)
+
+py_library(
+    name = "offline_analyzer_lib",
+    srcs = ["cli/offline_analyzer.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":analyzer_cli",
         ":debug_data",
@@ -403,18 +411,26 @@ py_binary(
 py_library(
     name = "debug_examples",
     deps = [
-        ":debug_errors",
-        ":debug_fibonacci",
-        ":debug_keras",
-        ":debug_mnist",
-        ":debug_tflearn_iris",
-    ],
+        ":debug_errors_lib",
+        ":debug_fibonacci_lib",
+        ":debug_keras_lib",
+    ] + if_not_v2([
+        ":debug_mnist_lib",
+        ":debug_tflearn_iris_lib",
+    ]),
 )
 
 py_binary(
     name = "debug_fibonacci",
     srcs = ["examples/debug_fibonacci.py"],
     srcs_version = "PY2AND3",
+    deps = [":debug_fibonacci_lib"],
+)
+
+py_library(
+    name = "debug_fibonacci_lib",
+    srcs = ["examples/debug_fibonacci.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":debug_py",
         "//tensorflow:tensorflow_py",
@@ -427,6 +443,13 @@ py_binary(
     name = "debug_errors",
     srcs = ["examples/debug_errors.py"],
     srcs_version = "PY2AND3",
+    deps = [":debug_errors_lib"],
+)
+
+py_library(
+    name = "debug_errors_lib",
+    srcs = ["examples/debug_errors.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":debug_py",
         "//tensorflow:tensorflow_py",
@@ -438,6 +461,13 @@ py_binary(
     name = "debug_mnist",
     srcs = ["examples/debug_mnist.py"],
     srcs_version = "PY2AND3",
+    deps = [":debug_mnist_lib"],
+)
+
+py_library(
+    name = "debug_mnist_lib",
+    srcs = ["examples/debug_mnist.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":debug_py",
         "//tensorflow:tensorflow_py",
@@ -449,6 +479,13 @@ py_binary(
     name = "debug_tflearn_iris",
     srcs = ["examples/debug_tflearn_iris.py"],
     srcs_version = "PY2AND3",
+    deps = [":debug_tflearn_iris_lib"],
+)
+
+py_library(
+    name = "debug_tflearn_iris_lib",
+    srcs = ["examples/debug_tflearn_iris.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":debug_py",
         "//tensorflow:tensorflow_py",
@@ -460,6 +497,13 @@ py_binary(
     name = "debug_keras",
     srcs = ["examples/debug_keras.py"],
     srcs_version = "PY2AND3",
+    deps = [":debug_keras_lib"],
+)
+
+py_library(
+    name = "debug_keras_lib",
+    srcs = ["examples/debug_keras.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":debug_py",
         "//tensorflow:tensorflow_py",
@@ -525,6 +569,7 @@ cuda_py_test(
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
     ],
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
 
 py_test(
@@ -613,6 +658,7 @@ cuda_py_test(
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
     ],
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
 
 py_test(
@@ -772,6 +818,7 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     tags = ["notsan"],
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
 
 cuda_py_test(
@@ -789,6 +836,7 @@ cuda_py_test(
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
     ],
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
 
 cuda_py_test(
@@ -806,6 +854,7 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     tags = ["no_windows_gpu"],
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
 
 py_test(
@@ -931,6 +980,7 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     tags = ["no_windows"],  # TODO: needs investigation on Windows
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
 
 py_test(
@@ -971,6 +1021,13 @@ cuda_py_test(
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
     ],
+    tags = [
+        "manual",
+        "no_pip",
+        "no_windows",
+        "notap",
+    ],
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
 
 cuda_py_test(
@@ -998,6 +1055,7 @@ cuda_py_test(
         "notsan",
         "oss_serial",
     ],
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
 
 cuda_py_test(
@@ -1023,6 +1081,7 @@ cuda_py_test(
         "optonly",  # Test flaky (b/80130873)
         "oss_serial",
     ],
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
 
 # TODO(cais): Run the test in OSS, perhaps through a sh_test.
@@ -1051,6 +1110,7 @@ cuda_py_test(
         "no_windows",
         "notsan",
     ],
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
 
 py_test(
@@ -1132,4 +1192,7 @@ sh_test(
         ":debug_tflearn_iris",
         ":offline_analyzer",
     ],
+    tags = [
+        "no_windows",
+    ],
 )
diff --git a/tensorflow/python/debug/cli/cli_shared_test.py b/tensorflow/python/debug/cli/cli_shared_test.py
index 66a12efda53470b33edf4788984e632bfe55f2b9..535e8a262be329e25d6e9f4b22085ac4b91025e2 100644
--- a/tensorflow/python/debug/cli/cli_shared_test.py
+++ b/tensorflow/python/debug/cli/cli_shared_test.py
@@ -105,7 +105,7 @@ class TimeToReadableStrTest(test_util.TensorFlowTestCase):
       cli_shared.time_to_readable_str(100, force_time_unit="ks")
 
 
-@test_util.run_v1_only("b/120545219")
+@test_util.run_deprecated_v1
 class GetRunStartIntroAndDescriptionTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
@@ -119,7 +119,6 @@ class GetRunStartIntroAndDescriptionTest(test_util.TensorFlowTestCase):
   def tearDown(self):
     ops.reset_default_graph()
 
-  @test_util.run_deprecated_v1
   def testSingleFetchNoFeeds(self):
     run_start_intro = cli_shared.get_run_start_intro(12, self.const_a, None, {})
 
@@ -183,7 +182,6 @@ class GetRunStartIntroAndDescriptionTest(test_util.TensorFlowTestCase):
     run_start_intro = cli_shared.get_run_start_intro(1, self.sparse_d, None, {})
     self.assertEqual(str(self.sparse_d), run_start_intro.lines[4].strip())
 
-  @test_util.run_deprecated_v1
   def testTwoFetchesListNoFeeds(self):
     fetches = [self.const_a, self.const_b]
     run_start_intro = cli_shared.get_run_start_intro(1, fetches, None, {})
@@ -200,7 +198,6 @@ class GetRunStartIntroAndDescriptionTest(test_util.TensorFlowTestCase):
     description = cli_shared.get_run_short_description(1, fetches, None)
     self.assertEqual("run #1: 2 fetches; 0 feeds", description)
 
-  @test_util.run_deprecated_v1
   def testNestedListAsFetches(self):
     fetches = [self.const_c, [self.const_a, self.const_b]]
     run_start_intro = cli_shared.get_run_start_intro(1, fetches, None, {})
@@ -214,7 +211,6 @@ class GetRunStartIntroAndDescriptionTest(test_util.TensorFlowTestCase):
     description = cli_shared.get_run_short_description(1, fetches, None)
     self.assertEqual("run #1: 3 fetches; 0 feeds", description)
 
-  @test_util.run_deprecated_v1
   def testNestedDictAsFetches(self):
     fetches = {"c": self.const_c, "ab": {"a": self.const_a, "b": self.const_b}}
     run_start_intro = cli_shared.get_run_start_intro(1, fetches, None, {})
@@ -232,7 +228,6 @@ class GetRunStartIntroAndDescriptionTest(test_util.TensorFlowTestCase):
     description = cli_shared.get_run_short_description(1, fetches, None)
     self.assertEqual("run #1: 3 fetches; 0 feeds", description)
 
-  @test_util.run_deprecated_v1
   def testTwoFetchesAsTupleNoFeeds(self):
     fetches = (self.const_a, self.const_b)
     run_start_intro = cli_shared.get_run_start_intro(1, fetches, None, {})
@@ -249,7 +244,6 @@ class GetRunStartIntroAndDescriptionTest(test_util.TensorFlowTestCase):
     description = cli_shared.get_run_short_description(1, fetches, None)
     self.assertEqual("run #1: 2 fetches; 0 feeds", description)
 
-  @test_util.run_deprecated_v1
   def testTwoFetchesAsNamedTupleNoFeeds(self):
     fetches_namedtuple = namedtuple("fetches", "x y")
     fetches = fetches_namedtuple(self.const_b, self.const_c)
@@ -267,7 +261,6 @@ class GetRunStartIntroAndDescriptionTest(test_util.TensorFlowTestCase):
     description = cli_shared.get_run_short_description(1, fetches, None)
     self.assertEqual("run #1: 2 fetches; 0 feeds", description)
 
-  @test_util.run_deprecated_v1
   def testWithFeedDict(self):
     feed_dict = {
         self.const_a: 10.0,
@@ -291,7 +284,6 @@ class GetRunStartIntroAndDescriptionTest(test_util.TensorFlowTestCase):
                                                        feed_dict)
     self.assertEqual("run #1: 1 fetch (c:0); 2 feeds", description)
 
-  @test_util.run_deprecated_v1
   def testTensorFilters(self):
     feed_dict = {self.const_a: 10.0}
     tensor_filters = {
@@ -322,20 +314,18 @@ class GetRunStartIntroAndDescriptionTest(test_util.TensorFlowTestCase):
     command_set.add(annot[2].content)
     self.assertEqual({"run -f filter_a", "run -f filter_b"}, command_set)
 
-  @test_util.run_deprecated_v1
   def testGetRunShortDescriptionWorksForTensorFeedKey(self):
     short_description = cli_shared.get_run_short_description(
         1, self.const_a, {self.const_a: 42.0})
     self.assertEqual("run #1: 1 fetch (a:0); 1 feed (a:0)", short_description)
 
-  @test_util.run_deprecated_v1
   def testGetRunShortDescriptionWorksForUnicodeFeedKey(self):
     short_description = cli_shared.get_run_short_description(
         1, self.const_a, {u"foo": 42.0})
     self.assertEqual("run #1: 1 fetch (a:0); 1 feed (foo)", short_description)
 
 
-@test_util.run_v1_only("b/120545219")
+@test_util.run_deprecated_v1
 class GetErrorIntroTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
diff --git a/tensorflow/python/debug/cli/stepper_cli.py b/tensorflow/python/debug/cli/stepper_cli.py
index 94eb2754da21b2a6c66271f53a2a0917deb25515..fe1a012a5444a0140edd15b9e66a4de0449a5e47 100644
--- a/tensorflow/python/debug/cli/stepper_cli.py
+++ b/tensorflow/python/debug/cli/stepper_cli.py
@@ -251,6 +251,9 @@ class NodeStepperCLI(object):
       lines.extend(
           ["Topologically-sorted transitive input(s) and fetch(es):", ""])
 
+    output = debugger_cli_common.rich_text_lines_from_rich_line_list(lines)
+    self._add_deprecation_warning(output)
+
     for i, element_name in enumerate(self._sorted_nodes):
       if i < index_range[0] or i >= index_range[1]:
         continue
@@ -269,15 +272,36 @@ class NodeStepperCLI(object):
           override_names,
           dirty_variable_names)
 
-      lines.append(node_prefix + "] " + element_name)
-
-    output = debugger_cli_common.rich_text_lines_from_rich_line_list(lines)
+      output.append_rich_line(node_prefix + "] " + element_name)
 
     if verbose:
       output.extend(self._node_status_label_legend())
 
     return output
 
+  def _add_deprecation_warning(self, message):
+    """Add deprecation warning as RichTextLines."""
+    color = "yellow"
+    message.append_rich_line(
+        debugger_cli_common.RichLine(
+            "WARNING: the invoke_stepper feature of tfdbg has been deprecated ",
+            color))
+    message.append_rich_line(
+        debugger_cli_common.RichLine(
+            "and will be removed in the next release of TensorFlow.",
+            color))
+    message.append_rich_line(debugger_cli_common.RichLine("", color))
+    message.append_rich_line(
+        debugger_cli_common.RichLine(
+            "There now exist better alternatives of stepping debugging, "
+            "including:",
+            color))
+    message.append_rich_line(
+        debugger_cli_common.RichLine("- TensorBoard Debugger Plugin", color))
+    message.append_rich_line(
+        debugger_cli_common.RichLine("- Eager Execution", color))
+    message.append_rich_line(debugger_cli_common.RichLine("", color))
+
   def _get_status_labels(self,
                          element_name,
                          handle_node_names,
diff --git a/tensorflow/python/debug/cli/stepper_cli_test.py b/tensorflow/python/debug/cli/stepper_cli_test.py
index 5cf69d0168b70a4d03162512b5024736c50cf23a..c728373ae2bf75b216415034ec275fc2bd29b15a 100644
--- a/tensorflow/python/debug/cli/stepper_cli_test.py
+++ b/tensorflow/python/debug/cli/stepper_cli_test.py
@@ -235,6 +235,9 @@ class NodeStepperSimpleGraphTest(test_util.TensorFlowTestCase):
       ], output.lines)
 
   def testContToValidNodeShouldUpdateStatus(self):
+    if test_util.is_gpu_available():
+      self.skipTest("b/123446705 this causes a segfault on GPU")
+
     with stepper.NodeStepper(self.sess, self.e) as node_stepper:
       cli = stepper_cli.NodeStepperCLI(node_stepper)
 
@@ -275,6 +278,9 @@ class NodeStepperSimpleGraphTest(test_util.TensorFlowTestCase):
       self.assertIn(stepper_cli.NodeStepperCLI.STATE_CONT, stat_labels[index_d])
 
   def testSteppingOneStepAtATimeShouldUpdateStatus(self):
+    if test_util.is_gpu_available():
+      self.skipTest("b/123446705 this causes a segfault on GPU")
+
     with stepper.NodeStepper(self.sess, self.e) as node_stepper:
       cli = stepper_cli.NodeStepperCLI(node_stepper)
 
diff --git a/tensorflow/python/debug/lib/debug_gradients_test.py b/tensorflow/python/debug/lib/debug_gradients_test.py
index 885691c3ef71ba995ec3ab38e2d1bda7e1e30b1a..e592e46095c950123eaf07e20d89839c260d6fed 100644
--- a/tensorflow/python/debug/lib/debug_gradients_test.py
+++ b/tensorflow/python/debug/lib/debug_gradients_test.py
@@ -36,7 +36,7 @@ from tensorflow.python.platform import googletest
 from tensorflow.python.training import gradient_descent
 
 
-@test_util.run_v1_only("b/120545219")
+@test_util.run_deprecated_v1
 class IdentifyGradientTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
diff --git a/tensorflow/python/debug/lib/source_remote_test.py b/tensorflow/python/debug/lib/source_remote_test.py
index 29add425e946aadfe941c73e9f9cef4aef3c8a9c..dce400c9ab0b6be3cabaea7c465baa1a6d2f471d 100644
--- a/tensorflow/python/debug/lib/source_remote_test.py
+++ b/tensorflow/python/debug/lib/source_remote_test.py
@@ -48,7 +48,8 @@ class SendTracebacksTest(test_util.TensorFlowTestCase):
     test_util.TensorFlowTestCase.setUpClass()
     (cls._server_port, cls._debug_server_url, cls._server_dump_dir,
      cls._server_thread,
-     cls._server) = grpc_debug_test_server.start_server_on_separate_thread()
+     cls._server) = grpc_debug_test_server.start_server_on_separate_thread(
+         poll_server=True)
     cls._server_address = "localhost:%d" % cls._server_port
     (cls._server_port_2, cls._debug_server_url_2, cls._server_dump_dir_2,
      cls._server_thread_2,
diff --git a/tensorflow/python/debug/lib/stepper_test.py b/tensorflow/python/debug/lib/stepper_test.py
index 9e78e207b80a99f3812c5909cf3753d90eab3680..bec858a1ba6ce1df58a8fc8d18f7a4f802f7d87e 100644
--- a/tensorflow/python/debug/lib/stepper_test.py
+++ b/tensorflow/python/debug/lib/stepper_test.py
@@ -94,6 +94,9 @@ class StepperTest(test_util.TensorFlowTestCase):
       self.assertAllClose(6.0, stepper.cont("c"))
 
   def testUsingNamesNotUsingIntermediateTensors(self):
+    if test_util.is_gpu_available():
+      self.skipTest("b/123446705 this causes a segfault on GPU")
+
     with NodeStepper(self.sess, "e:0") as stepper:
       # The first cont() call should have used no feeds.
       result = stepper.cont("c:0")
@@ -119,6 +122,9 @@ class StepperTest(test_util.TensorFlowTestCase):
       }, stepper.last_feed_types())
 
   def testUsingNodesNotUsingIntermediateTensors(self):
+    if test_util.is_gpu_available():
+      self.skipTest("b/123446705 this causes a segfault on GPU")
+
     with NodeStepper(self.sess, self.e) as stepper:
       # There should be no handles before any cont() calls.
       self.assertEqual([], stepper.handle_names())
@@ -493,6 +499,9 @@ class StepperTestWithPlaceHolders(test_util.TensorFlowTestCase):
       self.assertSetEqual({"ph0", "ph1"}, set(stepper.placeholders()))
 
   def testContWithPlaceholders(self):
+    if test_util.is_gpu_available():
+      self.skipTest("b/123446705 this causes a segfault on GPU")
+
     with NodeStepper(
         self.sess,
         self.y,
@@ -739,6 +748,9 @@ class StepperBackwardRunTest(test_util.TensorFlowTestCase):
     ops.reset_default_graph()
 
   def testContToUpdateA(self):
+    if test_util.is_gpu_available():
+      self.skipTest("b/123446705 this causes a segfault on GPU")
+
     with NodeStepper(self.sess, "optim") as stepper:
       result = stepper.cont("a:0")
       self.assertAllClose(1.0, result)
@@ -887,6 +899,8 @@ class StepperBackwardRunTest(test_util.TensorFlowTestCase):
 
     "clean" means no Variables have been updated by preceding cont() calls.
     """
+    if test_util.is_gpu_available():
+      self.skipTest("b/123446705 this causes a segfault on GPU")
 
     with NodeStepper(self.sess, "optim") as stepper:
       # First, call cont() on the two tensors on the intermediate level: e and
@@ -979,6 +993,8 @@ class StepperBackwardRunTest(test_util.TensorFlowTestCase):
 
   def testOverrideThenContToUpdateThenRemoveOverrideThenUpdateAgain(self):
     """Test cont() to update nodes after overriding tensor values."""
+    if test_util.is_gpu_available():
+      self.skipTest("b/123446705 this causes a segfault on GPU")
 
     with NodeStepper(self.sess, "optim") as stepper:
       result = stepper.cont("d:0")
diff --git a/tensorflow/python/debug/wrappers/disk_usage_test.py b/tensorflow/python/debug/wrappers/disk_usage_test.py
index 88b1cd540de7a6a56db6e5165be53ae8c9c2df26..71c56b3310635fa4707f01d579c93e6190c9b6de 100644
--- a/tensorflow/python/debug/wrappers/disk_usage_test.py
+++ b/tensorflow/python/debug/wrappers/disk_usage_test.py
@@ -32,7 +32,7 @@ from tensorflow.python.platform import googletest
 from tensorflow.python.training import monitored_session
 
 
-@test_util.run_v1_only("b/120545219")
+@test_util.run_deprecated_v1
 class DumpingDebugWrapperDiskUsageLimitTest(test_util.TensorFlowTestCase):
 
   @classmethod
diff --git a/tensorflow/python/debug/wrappers/framework_test.py b/tensorflow/python/debug/wrappers/framework_test.py
index a50fa7cf4b870868a61ea4df173fc24bc8a8e110..aa070d442848582a3202bfc1d02c3161636871d7 100644
--- a/tensorflow/python/debug/wrappers/framework_test.py
+++ b/tensorflow/python/debug/wrappers/framework_test.py
@@ -141,7 +141,7 @@ class TestDebugWrapperSessionBadAction(framework.BaseDebugWrapperSession):
     return framework.OnRunEndResponse()
 
 
-@test_util.run_v1_only("b/120545219")
+@test_util.run_deprecated_v1
 class DebugWrapperSessionTest(test_util.TensorFlowTestCase):
 
   def _no_rewrite_session_config(self):
diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD
index 887c61cb8fd81c6be4d20ba6b25c2997cea8cb7f..feeae8d9f414c8be4ed053cc0150c98c4173ddd1 100644
--- a/tensorflow/python/distribute/BUILD
+++ b/tensorflow/python/distribute/BUILD
@@ -60,6 +60,7 @@ py_library(
         "//tensorflow/python:platform",
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python/eager:context",
+        "//tensorflow/tools/docs:doc_controls",
         "@six_archive//:six",
     ],
 )
@@ -110,8 +111,11 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":cross_device_ops",
         ":distribute_lib",
         ":mirrored_strategy",
+        ":one_device_strategy",
+        "//tensorflow/python/distribute/experimental",
     ],
 )
 
@@ -124,6 +128,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":device_util",
+        ":numpy_dataset",
         ":reduce_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
@@ -137,7 +142,6 @@ py_library(
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/data",
         "//tensorflow/python/distribute/cluster_resolver:cluster_resolver_lib",
-        "//tensorflow/python/ops/losses",
         "//tensorflow/tools/docs:doc_controls",
     ],
 )
@@ -184,9 +188,6 @@ py_test(
     name = "distribute_coordinator_test",
     srcs = ["distribute_coordinator_test.py"],
     srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-    ],
     deps = [
         ":distribute_coordinator",
         "//tensorflow/core:protos_all_py",
@@ -219,7 +220,9 @@ py_library(
         ":cross_device_ops",
         ":device_util",
         ":distribute_lib",
+        ":input_lib",
         ":multi_worker_util",
+        ":numpy_dataset",
         ":reduce_util",
         ":shared_variable_creator",
         ":values",
@@ -241,6 +244,70 @@ py_library(
     ],
 )
 
+py_library(
+    name = "parameter_server_strategy",
+    srcs = ["parameter_server_strategy.py"],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":input_lib",
+        ":mirrored_strategy",
+        ":numpy_dataset",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:util",
+        "//tensorflow/python/distribute:cross_device_ops",
+        "//tensorflow/python/distribute:multi_worker_util",
+        "//tensorflow/python/distribute:reduce_util",
+        "//tensorflow/python/distribute:values",
+        "//tensorflow/python/distribute/cluster_resolver:cluster_resolver_lib",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
+py_library(
+    name = "one_device_strategy",
+    srcs = ["one_device_strategy.py"],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":distribute_lib",
+        ":input_lib",
+        ":numpy_dataset",
+        ":reduce_util",
+        ":values",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python/eager:context",
+        "@six_archive//:six",
+    ],
+)
+
+py_library(
+    name = "collective_all_reduce_strategy",
+    srcs = ["collective_all_reduce_strategy.py"],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":mirrored_strategy",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:collective_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python/distribute:cross_device_ops",
+        "//tensorflow/python/distribute:cross_device_utils",
+        "//tensorflow/python/distribute:input_lib",
+        "//tensorflow/python/distribute:multi_worker_util",
+        "//tensorflow/python/distribute:numpy_dataset",
+        "//tensorflow/python/distribute:values",
+        "//tensorflow/python/distribute/cluster_resolver:cluster_resolver_lib",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
 py_library(
     name = "multi_worker_util",
     srcs = [
@@ -253,6 +320,49 @@ py_library(
     ],
 )
 
+py_library(
+    name = "numpy_dataset",
+    srcs = ["numpy_dataset.py"],
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/eager:context",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "numpy_dataset_test",
+    size = "small",
+    srcs = ["numpy_dataset_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":numpy_dataset",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/eager:test",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_library(
+    name = "input_lib",
+    srcs = ["input_lib.py"],
+    deps = [
+        ":device_util",
+        ":distribute_lib",
+        ":input_ops",
+        ":values",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python/data/ops:multi_device_iterator_ops",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
 py_library(
     name = "input_ops",
     srcs = ["input_ops.py"],
@@ -270,6 +380,7 @@ cuda_py_test(
         ":input_ops",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/ops:readers",
+        "//tensorflow/python/data/util:traverse",
         "//tensorflow/python:errors",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_ops",
@@ -277,16 +388,12 @@ cuda_py_test(
         "//tensorflow/python:io_ops",
         "//tensorflow/python:util",
     ],
-    tags = [
-        "no_pip",
-    ],
 )
 
 py_test(
     name = "multi_worker_util_test",
     srcs = ["multi_worker_util_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
     deps = [
         ":multi_worker_util",
         "//tensorflow/core:protos_all_py",
@@ -347,16 +454,14 @@ py_library(
     deps = [
         ":device_util",
         ":distribute_lib",
-        ":input_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:util",
-        "//tensorflow/python/data/ops:multi_device_iterator_ops",
         "//tensorflow/python/eager:context",
-        "//tensorflow/python/training/checkpointable:base",
+        "//tensorflow/python/training/tracking:base",
         "@six_archive//:six",
     ],
 )
diff --git a/tensorflow/python/distribute/__init__.py b/tensorflow/python/distribute/__init__.py
index 4ff912ae10d8336cfeeb42d060bd0d9c52e24482..f9d0a95ea580a8bb125e6610c232d1eabfe105a6 100644
--- a/tensorflow/python/distribute/__init__.py
+++ b/tensorflow/python/distribute/__init__.py
@@ -19,7 +19,12 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=unused-import
+from tensorflow.python.distribute import cluster_resolver
+from tensorflow.python.distribute import cross_device_ops
 from tensorflow.python.distribute import distribute_lib
 from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.distribute import mirrored_strategy
+from tensorflow.python.distribute import one_device_strategy
+from tensorflow.python.distribute.experimental import collective_all_reduce_strategy
+from tensorflow.python.distribute.experimental import parameter_server_strategy
 # pylint: enable=unused-import
diff --git a/tensorflow/python/distribute/cluster_resolver/__init__.py b/tensorflow/python/distribute/cluster_resolver/__init__.py
index ef87f59b7fd7ef1774ed97370c75e16f3ec4e295..39ea191fb04a9e6a8c091eabff9fb5aeec888dfd 100644
--- a/tensorflow/python/distribute/cluster_resolver/__init__.py
+++ b/tensorflow/python/distribute/cluster_resolver/__init__.py
@@ -18,40 +18,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.distribute.cluster_resolver import cluster_resolver
-from tensorflow.python.distribute.cluster_resolver import gce_cluster_resolver
-from tensorflow.python.distribute.cluster_resolver import kubernetes_cluster_resolver
-from tensorflow.python.distribute.cluster_resolver import slurm_cluster_resolver
-from tensorflow.python.distribute.cluster_resolver import tfconfig_cluster_resolver
-from tensorflow.python.distribute.cluster_resolver import tpu_cluster_resolver
-
 from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver
 from tensorflow.python.distribute.cluster_resolver.cluster_resolver import SimpleClusterResolver
 from tensorflow.python.distribute.cluster_resolver.cluster_resolver import UnionClusterResolver
-from tensorflow.python.distribute.cluster_resolver.gce_cluster_resolver import GceClusterResolver
+from tensorflow.python.distribute.cluster_resolver.gce_cluster_resolver import GCEClusterResolver
 from tensorflow.python.distribute.cluster_resolver.kubernetes_cluster_resolver import KubernetesClusterResolver
 from tensorflow.python.distribute.cluster_resolver.slurm_cluster_resolver import SlurmClusterResolver
 from tensorflow.python.distribute.cluster_resolver.tfconfig_cluster_resolver import TFConfigClusterResolver
 from tensorflow.python.distribute.cluster_resolver.tpu_cluster_resolver import TPUClusterResolver
-
-from tensorflow.python.util.all_util import remove_undocumented
-
-_allowed_symbols = [
-    'cluster_resolver',
-    'gce_cluster_resolver',
-    'kubernetes_cluster_resolver',
-    'slurm_cluster_resolver',
-    'tfconfig_cluster_resolver',
-    'tpu_cluster_resolver',
-    'ClusterResolver',
-    'SimpleClusterResolver',
-    'UnionClusterResolver',
-    'GceClusterResolver',
-    'KubernetesClusterResolver',
-    'TFConfigClusterResolver',
-    'TPUClusterResolver',
-    'SlurmClusterResolver',
-]
-
-remove_undocumented(__name__, _allowed_symbols)
-
diff --git a/tensorflow/python/distribute/cluster_resolver/cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/cluster_resolver.py
index ca40e60a557d8fb1a5db8565369d1d1ae7e0c136..22b93f033027e9ba17d705d9fa5774d63589fb89 100644
--- a/tensorflow/python/distribute/cluster_resolver/cluster_resolver.py
+++ b/tensorflow/python/distribute/cluster_resolver/cluster_resolver.py
@@ -20,9 +20,13 @@ from __future__ import print_function
 
 import abc
 
+import collections
 import six
 
+from tensorflow.python.client import session
+from tensorflow.python.framework import ops
 from tensorflow.python.training.server_lib import ClusterSpec
+from tensorflow.python.util.tf_export import tf_export
 
 
 def format_master_url(master, rpc_layer=None):
@@ -32,6 +36,15 @@ def format_master_url(master, rpc_layer=None):
     return master
 
 
+def get_accelerator_devices(master, config_proto):
+  # TODO(frankchn): Add support for eager mode as well as graph mode.
+  with ops.Graph().as_default():
+    with session.Session(master, config=config_proto) as s:
+      devices = s.list_devices()
+  return devices
+
+
+@tf_export('distribute.cluster_resolver.ClusterResolver')
 @six.add_metaclass(abc.ABCMeta)
 class ClusterResolver(object):
   """Abstract class for all implementations of ClusterResolvers.
@@ -46,13 +59,13 @@ class ClusterResolver(object):
   underlying machine failures and scale TensorFlow worker clusters up and down.
 
   Note to Implementors: In addition to these abstract methods, you must also
-  implement the task_type, task_index, and rpc_layer attributes. You may choose
+  implement the task_type, task_id, and rpc_layer attributes. You may choose
   to implement them either as properties with getters or setters or directly
   set the attributes.
 
   - task_type is the name of the server's current named job (e.g. 'worker',
      'ps' in a distributed parameterized training job).
-  - task_index is the ordinal index of the server within the task type.
+  - task_id is the ordinal index of the server within the task type.
   - rpc_layer is the protocol used by TensorFlow to communicate with other
       TensorFlow servers in a distributed environment.
   """
@@ -74,12 +87,12 @@ class ClusterResolver(object):
     raise NotImplementedError()
 
   @abc.abstractmethod
-  def master(self, task_type=None, task_index=None, rpc_layer=None):
+  def master(self, task_type=None, task_id=None, rpc_layer=None):
     """Retrieves the name or URL of the session master.
 
     Args:
       task_type: (Optional) The type of the TensorFlow task of the master.
-      task_index: (Optional) The index of the TensorFlow task of the master.
+      task_id: (Optional) The index of the TensorFlow task of the master.
       rpc_layer: (Optional) The RPC protocol for the given cluster.
 
     Returns:
@@ -91,53 +104,69 @@ class ClusterResolver(object):
     """
     raise NotImplementedError()
 
-  @abc.abstractmethod
   def num_accelerators(self,
                        task_type=None,
-                       task_index=None,
-                       accelerator_type='GPU',
+                       task_id=None,
                        config_proto=None):
     """Returns the number of accelerator cores per worker.
 
     This returns the number of accelerator cores (such as GPUs and TPUs)
-    available per worker. If workers only has CPU cores available, then this
-    should return 0. This method will query the master for this information
-    if it is not otherwise known.
+    available per worker.
 
-    Optionally, we allow callers to specify the task_type, task_index, and
-    rpc_layer, if they want to target a specific TensorFlow process to query
+    Optionally, we allow callers to specify the task_type, and task_id, for
+    if they want to target a specific TensorFlow process to query
     the number of accelerators. This is to support heterogenous environments,
     where the number of accelerators cores per host is different.
 
     Args:
       task_type: (Optional) The type of the TensorFlow task of the machine we
         want to query.
-      task_index: (Optional) The index of the TensorFlow task of the machine we
+      task_id: (Optional) The index of the TensorFlow task of the machine we
         want to query.
-      accelerator_type: (Optional) The type of accelerator we are trying to
-        query (defaults to 'GPU').
       config_proto: (Optional) Configuration for starting a new session to
         query how many accelerator cores it has.
+
+    Returns:
+      A map of accelerator types to number of cores.
     """
-    raise NotImplementedError()
+    master = self.master(task_type, task_id)
+    devices = get_accelerator_devices(master, config_proto)
+    mapping = collections.defaultdict(int)
+    for device in devices:
+      mapping[device.device_type] += 1
+    return mapping
 
-  @abc.abstractproperty
+  @property
   def environment(self):
-    """Returns the current environment which TensorFlow is running in."""
-    raise NotImplementedError()
+    """Returns the current environment which TensorFlow is running in.
+
+    There are two possible return values, "google" (when TensorFlow is running
+    in a Google-internal environment) or an empty string (when TensorFlow is
+    running elsewhere).
+
+    If you are implementing a ClusterResolver that works in both the Google
+    environment and the open-source world (for instance, a TPU ClusterResolver
+    or similar), you will have to return the appropriate string depending on the
+    environment, which you will have to detect.
+
+    Otherwise, if you are implementing a ClusterResolver that will only work
+    in open-source TensorFlow, you do not need to implement this property.
+    """
+    return ''
 
 
+@tf_export('distribute.cluster_resolver.SimpleClusterResolver')
 class SimpleClusterResolver(ClusterResolver):
   """Simple implementation of ClusterResolver that accepts a ClusterSpec."""
 
-  def __init__(self, cluster_spec, master='', task_type=None, task_index=None,
+  def __init__(self, cluster_spec, master='', task_type=None, task_id=None,
                environment='', num_accelerators=0,
                rpc_layer=None):
     """Creates a SimpleClusterResolver from a ClusterSpec."""
     super(SimpleClusterResolver, self).__init__()
 
     self._task_type = task_type
-    self._task_index = task_index
+    self._task_id = task_id
     self._environment = environment
     self._num_accelerators = num_accelerators
     self._rpc_layer = rpc_layer
@@ -154,22 +183,22 @@ class SimpleClusterResolver(ClusterResolver):
     """Returns the ClusterSpec passed into the constructor."""
     return self._cluster_spec
 
-  def master(self, task_type=None, task_index=None, rpc_layer=None):
+  def master(self, task_type=None, task_id=None, rpc_layer=None):
     """Returns the master address to use when creating a session.
 
     Args:
       task_type: (Optional) The type of the TensorFlow task of the master.
-      task_index: (Optional) The index of the TensorFlow task of the master.
+      task_id: (Optional) The index of the TensorFlow task of the master.
       rpc_layer: (Optional) The RPC used by distributed TensorFlow.
 
     Returns:
       The name or URL of the session master.
 
-    If a task_type and task_index is given, this will override the `master`
+    If a task_type and task_id is given, this will override the `master`
     string passed into the initialization function.
     """
-    if task_type is not None and task_index is not None:
-      master = self.cluster_spec().task_address(task_type, task_index)
+    if task_type is not None and task_id is not None:
+      master = self.cluster_spec().task_address(task_type, task_id)
     else:
       master = self._master
 
@@ -180,16 +209,16 @@ class SimpleClusterResolver(ClusterResolver):
     return self._task_type
 
   @property
-  def task_index(self):
-    return self._task_index
+  def task_id(self):
+    return self._task_id
 
   @task_type.setter
   def task_type(self, task_type):
     self._task_type = task_type
 
-  @task_index.setter
-  def task_index(self, task_index):
-    self._task_index = task_index
+  @task_id.setter
+  def task_id(self, task_id):
+    self._task_id = task_id
 
   @property
   def environment(self):
@@ -197,7 +226,7 @@ class SimpleClusterResolver(ClusterResolver):
 
   def num_accelerators(self,
                        task_type=None,
-                       task_index=None,
+                       task_id=None,
                        accelerator_type='GPU',
                        config_proto=None):
     """Returns the number of accelerator cores per worker.
@@ -209,12 +238,12 @@ class SimpleClusterResolver(ClusterResolver):
 
     Args:
       task_type: Unused.
-      task_index: Unused.
+      task_id: Unused.
       accelerator_type: Unused.
       config_proto: Unused.
     """
     # Unused
-    del task_type, task_index, accelerator_type, config_proto
+    del task_type, task_id, accelerator_type, config_proto
     return self._num_accelerators
 
   @property
@@ -226,6 +255,7 @@ class SimpleClusterResolver(ClusterResolver):
     self._rpc_layer = rpc_layer
 
 
+@tf_export('distribute.cluster_resolver.UnionResolver')
 class UnionClusterResolver(ClusterResolver):
   """Performs a union on underlying ClusterResolvers.
 
@@ -248,7 +278,7 @@ class UnionClusterResolver(ClusterResolver):
         rpc_layer - (Optional) Override value for the RPC layer used by
           TensorFlow.
         task_type - (Optional) Override value for the current task type.
-        task_index - (Optional) Override value for the current task index.
+        task_id - (Optional) Override value for the current task index.
 
     Raises:
       TypeError: If any argument is not a subclass of `ClusterResolvers`.
@@ -258,7 +288,7 @@ class UnionClusterResolver(ClusterResolver):
 
     self._rpc_layer = kwargs.pop('rpc_layer', None)
     self._task_type = kwargs.pop('task_type', None)
-    self._task_index = kwargs.pop('task_index', None)
+    self._task_id = kwargs.pop('task_id', None)
 
     if kwargs:
       raise ValueError('Unexpected kwargs provided {!r}'.format(kwargs))
@@ -346,22 +376,22 @@ class UnionClusterResolver(ClusterResolver):
 
     return ClusterSpec(merged_cluster)
 
-  def master(self, task_type=None, task_index=None, rpc_layer=None):
+  def master(self, task_type=None, task_id=None, rpc_layer=None):
     """Returns the master address to use when creating a session.
 
     This usually returns the master from the first ClusterResolver passed in,
-    but you can override this by specifying the task_type and task_index.
+    but you can override this by specifying the task_type and task_id.
 
     Args:
       task_type: (Optional) The type of the TensorFlow task of the master.
-      task_index: (Optional) The index of the TensorFlow task of the master.
+      task_id: (Optional) The index of the TensorFlow task of the master.
       rpc_layer: (Optional) The RPC protocol for the given cluster.
 
     Returns:
       The name or URL of the session master.
     """
-    if task_type is not None and task_index is not None:
-      master = self.cluster_spec().task_address(task_type, task_index)
+    if task_type is not None and task_id is not None:
+      master = self.cluster_spec().task_address(task_type, task_id)
       return format_master_url(master, rpc_layer or self._rpc_layer)
 
     return self._cluster_resolvers[0].master(rpc_layer=rpc_layer)
@@ -371,16 +401,16 @@ class UnionClusterResolver(ClusterResolver):
     return self._task_type or self._cluster_resolvers[0].task_type
 
   @property
-  def task_index(self):
-    return self._task_index or self._cluster_resolvers[0].task_index
+  def task_id(self):
+    return self._task_id or self._cluster_resolvers[0].task_id
 
   @task_type.setter
   def task_type(self, task_type):
     self._task_type = task_type
 
-  @task_index.setter
-  def task_index(self, task_index):
-    self._task_index = task_index
+  @task_id.setter
+  def task_id(self, task_id):
+    self._task_id = task_id
 
   @property
   def environment(self):
@@ -388,11 +418,11 @@ class UnionClusterResolver(ClusterResolver):
 
   def num_accelerators(self,
                        task_type=None,
-                       task_index=None,
+                       task_id=None,
                        accelerator_type='GPU',
                        config_proto=None):
     return self._cluster_resolvers[0].num_accelerators(
-        task_type, task_index, accelerator_type, config_proto)
+        task_type, task_id, accelerator_type, config_proto)
 
   @property
   def rpc_layer(self):
diff --git a/tensorflow/python/distribute/cluster_resolver/cluster_resolver_test.py b/tensorflow/python/distribute/cluster_resolver/cluster_resolver_test.py
index 3f7b46972746f46ee866a5891ed2ca9ef0722a0c..019d223eb02fd2d8ebe2bfbce3d0c02fe2c0cb17 100644
--- a/tensorflow/python/distribute/cluster_resolver/cluster_resolver_test.py
+++ b/tensorflow/python/distribute/cluster_resolver/cluster_resolver_test.py
@@ -18,11 +18,68 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.client import session
+from tensorflow.python.distribute.cluster_resolver import ClusterResolver
 from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver
 from tensorflow.python.distribute.cluster_resolver import UnionClusterResolver
 from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
 
+mock = test.mock
+
+
+class MockBaseClusterResolver(ClusterResolver):
+
+  def cluster_spec(self):
+    return None
+
+  def master(self, task_type=None, task_id=None, rpc_layer=None):
+    return ""
+
+  def environment(self):
+    return ""
+
+
+class BaseClusterResolverTest(test.TestCase):
+
+  @mock.patch.object(session.BaseSession, "list_devices")
+  def testNumAcceleratorsSuccess(self, mock_list_devices):
+    device_names = [
+        "/job:worker/task:0/device:GPU:0",
+        "/job:worker/task:0/device:GPU:1",
+        "/job:worker/task:0/device:GPU:2",
+        "/job:worker/task:0/device:GPU:3",
+    ]
+    device_list = [
+        session._DeviceAttributes(
+            name, "GPU", 1024, 0) for name in device_names
+    ]
+    mock_list_devices.return_value = device_list
+
+    resolver = MockBaseClusterResolver()
+    self.assertEqual(resolver.num_accelerators(), {"GPU": 4})
+
+  @mock.patch.object(session.BaseSession, "list_devices")
+  def testNumAcceleratorsMultiDeviceSuccess(self, mock_list_devices):
+    device_names = [
+        "/job:worker/task:0/device:TPU:0",
+        "/job:worker/task:0/device:TPU:1",
+        "/job:worker/task:0/device:TPU:2",
+        "/job:worker/task:0/device:TPU:3",
+        "/job:worker/task:0/device:GPU:0",
+        "/job:worker/task:0/device:GPU:1",
+        "/job:worker/task:0/device:GPU:2",
+        "/job:worker/task:0/device:GPU:3",
+    ]
+    device_list = [
+        session._DeviceAttributes(
+            name, name[26:29], 1024, 0) for name in device_names
+    ]
+    mock_list_devices.return_value = device_list
+
+    resolver = MockBaseClusterResolver()
+    self.assertEqual(resolver.num_accelerators(), {"TPU": 4, "GPU": 4})
+
 
 class UnionClusterResolverTest(test.TestCase):
   # TODO(frankchn): Transform to parameterized test after it is included in the
@@ -64,12 +121,12 @@ class UnionClusterResolverTest(test.TestCase):
     })
 
     simple_resolver = SimpleClusterResolver(base_cluster_spec, task_type="ps",
-                                            task_index=1, environment="cloud",
+                                            task_id=1, environment="cloud",
                                             num_accelerators=8,
                                             rpc_layer="grpc")
 
     self.assertEqual(simple_resolver.task_type, "ps")
-    self.assertEqual(simple_resolver.task_index, 1)
+    self.assertEqual(simple_resolver.task_id, 1)
     self.assertEqual(simple_resolver.environment, "cloud")
     self.assertEqual(simple_resolver.num_accelerators(), 8)
     self.assertEqual(simple_resolver.rpc_layer, "grpc")
@@ -81,16 +138,16 @@ class UnionClusterResolverTest(test.TestCase):
     })
 
     simple_resolver = SimpleClusterResolver(base_cluster_spec, task_type="ps",
-                                            task_index=1, environment="cloud",
+                                            task_id=1, environment="cloud",
                                             num_accelerators=8,
                                             rpc_layer="grpc")
 
     simple_resolver.task_type = "worker"
-    simple_resolver.task_index = 2
+    simple_resolver.task_id = 2
     simple_resolver.rpc_layer = "http"
 
     self.assertEqual(simple_resolver.task_type, "worker")
-    self.assertEqual(simple_resolver.task_index, 2)
+    self.assertEqual(simple_resolver.task_id, 2)
     self.assertEqual(simple_resolver.rpc_layer, "http")
 
   def testSimpleOverrideMasterWithTaskIndexZero(self):
@@ -129,7 +186,7 @@ class UnionClusterResolverTest(test.TestCase):
         "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
     })
     resolver1 = SimpleClusterResolver(cluster_spec_1, task_type="ps",
-                                      task_index=1, environment="cloud",
+                                      task_id=1, environment="cloud",
                                       num_accelerators=8,
                                       rpc_layer="grpc")
 
@@ -138,24 +195,24 @@ class UnionClusterResolverTest(test.TestCase):
         "worker": ["worker3:2222", "worker4:2222", "worker5:2222"]
     })
     resolver2 = SimpleClusterResolver(cluster_spec_2, task_type="worker",
-                                      task_index=2, environment="local",
+                                      task_id=2, environment="local",
                                       num_accelerators=16,
                                       rpc_layer="http")
 
     union_resolver = UnionClusterResolver(resolver1, resolver2)
 
     self.assertEqual(union_resolver.task_type, "ps")
-    self.assertEqual(union_resolver.task_index, 1)
+    self.assertEqual(union_resolver.task_id, 1)
     self.assertEqual(union_resolver.environment, "cloud")
     self.assertEqual(union_resolver.num_accelerators(), 8)
     self.assertEqual(union_resolver.rpc_layer, "grpc")
 
     union_resolver.task_type = "worker"
-    union_resolver.task_index = 2
+    union_resolver.task_id = 2
     union_resolver.rpc_layer = "http"
 
     self.assertEqual(union_resolver.task_type, "worker")
-    self.assertEqual(union_resolver.task_index, 2)
+    self.assertEqual(union_resolver.task_id, 2)
     self.assertEqual(union_resolver.rpc_layer, "http")
 
   def testTwoNonOverlappingJobMergedClusterResolver(self):
diff --git a/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver.py
index 2412f6dad095bb2282ba51b7edb1f293f57d428d..9d7dfdd1ea9078ae4fd5fcf1da0f56a3f8b91a1f 100644
--- a/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver.py
+++ b/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver.py
@@ -20,6 +20,8 @@ from __future__ import print_function
 
 from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver
 from tensorflow.python.training.server_lib import ClusterSpec
+from tensorflow.python.util.tf_export import tf_export
+
 
 _GOOGLE_API_CLIENT_INSTALLED = True
 try:
@@ -29,11 +31,8 @@ except ImportError:
   _GOOGLE_API_CLIENT_INSTALLED = False
 
 
-def _format_master_url(master, rpc_layer=None):
-  return '%s://%s' % (rpc_layer, master) if rpc_layer else master
-
-
-class GceClusterResolver(ClusterResolver):
+@tf_export('distribute.cluster_resolver.GCEClusterResolver')
+class GCEClusterResolver(ClusterResolver):
   """Cluster Resolver for Google Compute Engine.
 
   This is an implementation of cluster resolvers for the Google Compute Engine
@@ -49,14 +48,13 @@ class GceClusterResolver(ClusterResolver):
                instance_group,
                port,
                task_type='worker',
-               task_index=0,
+               task_id=0,
                rpc_layer='grpc',
-               num_accelerators=0,
                credentials='default',
                service=None):
-    """Creates a new GceClusterResolver object.
+    """Creates a new GCEClusterResolver object.
 
-    This takes in a few parameters and creates a GceClusterResolver project. It
+    This takes in a few parameters and creates a GCEClusterResolver project. It
     will then use these parameters to query the GCE API for the IP addresses of
     each instance in the instance group.
 
@@ -67,14 +65,12 @@ class GceClusterResolver(ClusterResolver):
       port: Port of the listening TensorFlow server (default: 8470)
       task_type: Name of the TensorFlow job this GCE instance group of VM
         instances belong to.
-      task_index: The task index for this particular VM, within the GCE
+      task_id: The task index for this particular VM, within the GCE
         instance group. In particular, every single instance should be assigned
         a unique ordinal index within an instance group manually so that they
         can be distinguished from each other.
       rpc_layer: The RPC layer TensorFlow should use to communicate across
         instances.
-      num_accelerators: Number of accelerators (GPUs) present per
-        instance.
       credentials: GCE Credentials. If nothing is specified, this defaults to
         GoogleCredentials.get_application_default().
       service: The GCE API object returned by the googleapiclient.discovery
@@ -88,9 +84,8 @@ class GceClusterResolver(ClusterResolver):
     self._zone = zone
     self._instance_group = instance_group
     self._task_type = task_type
-    self._task_index = task_index
+    self._task_id = task_id
     self._rpc_layer = rpc_layer
-    self._num_accelerators = num_accelerators
     self._port = port
     self._credentials = credentials
 
@@ -153,12 +148,12 @@ class GceClusterResolver(ClusterResolver):
     worker_list.sort()
     return ClusterSpec({self._task_type: worker_list})
 
-  def master(self, task_type=None, task_index=None, rpc_layer=None):
+  def master(self, task_type=None, task_id=None, rpc_layer=None):
     task_type = task_type if task_type is not None else self._task_type
-    task_index = task_index if task_index is not None else self._task_index
+    task_id = task_id if task_id is not None else self._task_id
 
-    if task_type is not None and task_index is not None:
-      master = self.cluster_spec().task_address(task_type, task_index)
+    if task_type is not None and task_id is not None:
+      master = self.cluster_spec().task_address(task_type, task_id)
       if rpc_layer or self._rpc_layer:
         return '%s://%s' % (rpc_layer or self._rpc_layer, master)
       else:
@@ -171,28 +166,18 @@ class GceClusterResolver(ClusterResolver):
     return self._task_type
 
   @property
-  def task_index(self):
-    return self._task_index
+  def task_id(self):
+    return self._task_id
 
   @task_type.setter
   def task_type(self, task_type):
     raise RuntimeError(
-        'You cannot reset the task_type of the GceClusterResolver after it has '
+        'You cannot reset the task_type of the GCEClusterResolver after it has '
         'been created.')
 
-  @task_index.setter
-  def task_index(self, task_index):
-    self._task_index = task_index
-
-  @property
-  def environment(self):
-    """Returns the current environment which TensorFlow is running in.
-
-    For users in the GCE environment, the environment property is always an
-    empty string, and Google users will not use this ClusterResolver for running
-    on internal systems.
-    """
-    return ''
+  @task_id.setter
+  def task_id(self, task_id):
+    self._task_id = task_id
 
   @property
   def rpc_layer(self):
@@ -201,12 +186,3 @@ class GceClusterResolver(ClusterResolver):
   @rpc_layer.setter
   def rpc_layer(self, rpc_layer):
     self._rpc_layer = rpc_layer
-
-  def num_accelerators(self,
-                       task_type=None,
-                       task_index=None,
-                       accelerator_type='GPU',
-                       config_proto=None):
-    # Unused
-    del task_type, task_index, accelerator_type, config_proto
-    return self._num_accelerators
diff --git a/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver_test.py b/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver_test.py
index d4f0660c922d593d81c0927dea0d6271e89c53e1..47d1cdc0da9689d78647d8a584267707c6e85e64 100644
--- a/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver_test.py
+++ b/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver_test.py
@@ -12,13 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for GceClusterResolver."""
+"""Tests for GCEClusterResolver."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.distribute.cluster_resolver import GceClusterResolver
+from tensorflow.python.distribute.cluster_resolver import GCEClusterResolver
 from tensorflow.python.distribute.cluster_resolver import UnionClusterResolver
 from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
@@ -27,7 +27,7 @@ from tensorflow.python.training import server_lib
 mock = test.mock
 
 
-class GceClusterResolverTest(test.TestCase):
+class GCEClusterResolverTest(test.TestCase):
 
   def _verifyClusterSpecEquality(self, cluster_spec, expected_proto):
     self.assertProtoEquals(expected_proto, cluster_spec.as_cluster_def())
@@ -121,7 +121,7 @@ class GceClusterResolverTest(test.TestCase):
     return self.standard_mock_service_client(mock_instance_group, mock_instance)
 
   def testSimpleSuccessfulRetrieval(self):
-    gce_cluster_resolver = GceClusterResolver(
+    gce_cluster_resolver = GCEClusterResolver(
         project='test-project',
         zone='us-east1-d',
         instance_group='test-instance-group',
@@ -136,11 +136,11 @@ class GceClusterResolverTest(test.TestCase):
     self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
 
   def testMasterRetrieval(self):
-    gce_cluster_resolver = GceClusterResolver(
+    gce_cluster_resolver = GCEClusterResolver(
         project='test-project',
         zone='us-east1-d',
         instance_group='test-instance-group',
-        task_index=0,
+        task_id=0,
         port=8470,
         credentials=None,
         service=self.standard_mock_service_client())
@@ -153,7 +153,7 @@ class GceClusterResolverTest(test.TestCase):
         {'name': 'instance3', 'ip': '10.3.4.5'},
     ]
 
-    gce_cluster_resolver = GceClusterResolver(
+    gce_cluster_resolver = GCEClusterResolver(
         project='test-project',
         zone='us-east1-d',
         instance_group='test-instance-group',
@@ -172,7 +172,7 @@ class GceClusterResolverTest(test.TestCase):
         {'name': 'instance3', 'ip': '10.3.4.5'},
     ]
 
-    gce_cluster_resolver = GceClusterResolver(
+    gce_cluster_resolver = GCEClusterResolver(
         project='test-project',
         zone='us-east1-d',
         instance_group='test-instance-group',
@@ -181,11 +181,11 @@ class GceClusterResolverTest(test.TestCase):
         credentials=None,
         service=self.gen_standard_mock_service_client(name_to_ip))
 
-    gce_cluster_resolver.task_index = 1
+    gce_cluster_resolver.task_id = 1
     gce_cluster_resolver.rpc_layer = 'test'
 
     self.assertEqual(gce_cluster_resolver.task_type, 'testworker')
-    self.assertEqual(gce_cluster_resolver.task_index, 1)
+    self.assertEqual(gce_cluster_resolver.task_id, 1)
     self.assertEqual(gce_cluster_resolver.rpc_layer, 'test')
     self.assertEqual(gce_cluster_resolver.master(), 'test://10.2.3.4:8470')
 
@@ -196,21 +196,21 @@ class GceClusterResolverTest(test.TestCase):
         {'name': 'instance3', 'ip': '10.3.4.5'},
     ]
 
-    gce_cluster_resolver = GceClusterResolver(
+    gce_cluster_resolver = GCEClusterResolver(
         project='test-project',
         zone='us-east1-d',
         instance_group='test-instance-group',
         task_type='',
-        task_index=1,
+        task_id=1,
         port=8470,
         credentials=None,
         service=self.gen_standard_mock_service_client(name_to_ip))
 
     self.assertEqual(gce_cluster_resolver.master(
-        task_type='', task_index=0), 'grpc://10.1.2.3:8470')
+        task_type='', task_id=0), 'grpc://10.1.2.3:8470')
 
   def testCustomJobNameAndPortRetrieval(self):
-    gce_cluster_resolver = GceClusterResolver(
+    gce_cluster_resolver = GCEClusterResolver(
         project='test-project',
         zone='us-east1-d',
         instance_group='test-instance-group',
@@ -232,7 +232,7 @@ class GceClusterResolverTest(test.TestCase):
         {'name': 'instance3', 'ip': '10.3.4.5'},
     ]
 
-    gce_cluster_resolver = GceClusterResolver(
+    gce_cluster_resolver = GCEClusterResolver(
         project='test-project',
         zone='us-east1-d',
         instance_group='test-instance-group',
@@ -266,7 +266,7 @@ class GceClusterResolverTest(test.TestCase):
         {'name': 'ps2', 'ip': '10.100.2.3'},
     ]
 
-    worker1_gce_cluster_resolver = GceClusterResolver(
+    worker1_gce_cluster_resolver = GCEClusterResolver(
         project='test-project',
         zone='us-east1-d',
         instance_group='test-instance-group',
@@ -275,7 +275,7 @@ class GceClusterResolverTest(test.TestCase):
         credentials=None,
         service=self.gen_standard_mock_service_client(worker1_name_to_ip))
 
-    worker2_gce_cluster_resolver = GceClusterResolver(
+    worker2_gce_cluster_resolver = GCEClusterResolver(
         project='test-project',
         zone='us-east1-d',
         instance_group='test-instance-group',
@@ -284,7 +284,7 @@ class GceClusterResolverTest(test.TestCase):
         credentials=None,
         service=self.gen_standard_mock_service_client(worker2_name_to_ip))
 
-    ps_gce_cluster_resolver = GceClusterResolver(
+    ps_gce_cluster_resolver = GCEClusterResolver(
         project='test-project',
         zone='us-east1-d',
         instance_group='test-instance-group',
diff --git a/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver.py
index b21c3676bee53e785474308435021885dc93377c..28b2712590d0519f1dbbdde1b43fab829238fa25 100644
--- a/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver.py
+++ b/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver.py
@@ -18,10 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.client import device_lib
 from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver
 from tensorflow.python.distribute.cluster_resolver.cluster_resolver import format_master_url
 from tensorflow.python.training import server_lib
+from tensorflow.python.util.tf_export import tf_export
 
 _KUBERNETES_API_CLIENT_INSTALLED = True
 try:
@@ -31,6 +31,7 @@ except ImportError:
   _KUBERNETES_API_CLIENT_INSTALLED = False
 
 
+@tf_export('distribute.cluster_resolver.KubernetesClusterResolver')
 class KubernetesClusterResolver(ClusterResolver):
   """Cluster Resolver for Kubernetes.
 
@@ -89,33 +90,31 @@ class KubernetesClusterResolver(ClusterResolver):
     self._override_client = override_client
 
     self.task_type = None
-    self.task_index = None
+    self.task_id = None
     self.rpc_layer = rpc_layer
 
-  def master(self, task_type=None, task_index=None, rpc_layer=None):
+  def master(self, task_type=None, task_id=None, rpc_layer=None):
     """Returns the master address to use when creating a session.
 
-    You must have set the task_type and task_index object properties before
-    calling this function, or pass in the `task_type` and `task_index`
+    You must have set the task_type and task_id object properties before
+    calling this function, or pass in the `task_type` and `task_id`
     parameters when using this function. If you do both, the function parameters
     will override the object properties.
 
     Args:
       task_type: (Optional) The type of the TensorFlow task of the master.
-      task_index: (Optional) The index of the TensorFlow task of the master.
+      task_id: (Optional) The index of the TensorFlow task of the master.
       rpc_layer: (Optional) The RPC protocol for the given cluster.
 
     Returns:
       The name or URL of the session master.
     """
-    if task_type is not None and task_index is not None:
-      return format_master_url(
-          self.cluster_spec().task_address(task_type, task_index),
-          rpc_layer or self.rpc_layer)
+    task_type = task_type if task_type is not None else self.task_type
+    task_id = task_id if task_id is not None else self.task_id
 
-    if self.task_type is not None and self.task_index is not None:
+    if task_type is not None and task_id is not None:
       return format_master_url(
-          self.cluster_spec().task_address(self.task_type, self.task_index),
+          self.cluster_spec().task_address(task_type, task_id),
           rpc_layer or self.rpc_layer)
 
     return ''
@@ -157,26 +156,3 @@ class KubernetesClusterResolver(ClusterResolver):
       cluster_map[tf_job] = all_pods
 
     return server_lib.ClusterSpec(cluster_map)
-
-  @property
-  def environment(self):
-    """Returns the current environment which TensorFlow is running in.
-
-    For users in the Cloud environment, the environment property is always an
-    empty string, and Google users will not use this ClusterResolver for running
-    on internal systems.
-    """
-    return ''
-
-  def num_accelerators(self,
-                       task_type=None,
-                       task_index=None,
-                       accelerator_type='GPU',
-                       config_proto=None):
-    # TODO(frankchn): Make querying non-local accelerators work
-    if task_type is not None or task_index is not None:
-      raise NotImplementedError('Querying non-local accelerators is not yet'
-                                'implemented.')
-
-    local_devices = device_lib.list_local_devices(config_proto)
-    return sum(d.device_type == accelerator_type for d in local_devices)
diff --git a/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver_test.py b/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver_test.py
index a9750fa60b993a3504bbd01f0663cfdf868a2f01..f4e4cd82129a807cc62b81e7b7ac07d6b7c8d92c 100644
--- a/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver_test.py
+++ b/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver_test.py
@@ -119,9 +119,9 @@ class KubernetesClusterResolverTest(test.TestCase):
         override_client=_mock_kubernetes_client(
             {'job-name=tensorflow': ret}))
     cluster_resolver.task_type = 'worker'
-    cluster_resolver.task_index = 0
+    cluster_resolver.task_id = 0
     self.assertEqual(cluster_resolver.task_type, 'worker')
-    self.assertEqual(cluster_resolver.task_index, 0)
+    self.assertEqual(cluster_resolver.task_id, 0)
     self.assertEqual(cluster_resolver.master(), 'grpc://10.1.2.3:8470')
     self.assertEqual(cluster_resolver.master('worker', 2),
                      'grpc://10.1.2.5:8470')
diff --git a/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver.py
index 1ab81731b7a111848608068220488a368d9b86ec..04675f4d176852dd84c05f2af9e53552a8175dff 100644
--- a/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver.py
+++ b/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver.py
@@ -23,9 +23,12 @@ import os
 import subprocess
 
 from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import format_master_url
 from tensorflow.python.training.server_lib import ClusterSpec
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export('distribute.cluster_resolver.SlurmClusterResolver')
 class SlurmClusterResolver(ClusterResolver):
   """Cluster Resolver for system with Slurm workload manager.
 
@@ -111,7 +114,7 @@ class SlurmClusterResolver(ClusterResolver):
 
     self._auto_set_gpu = auto_set_gpu
     self.task_type = None
-    self.task_index = None
+    self.task_id = None
     self.rpc_layer = rpc_layer
 
     self._gpu_allocation = []
@@ -169,7 +172,7 @@ class SlurmClusterResolver(ClusterResolver):
 
       if cluster_rank_offset_start <= self._rank < cluster_rank_offset_end:
         self.task_type = task_type
-        self.task_index = self._rank - cluster_rank_offset_start
+        self.task_id = self._rank - cluster_rank_offset_start
 
       cluster_rank_offset_start = cluster_rank_offset_end
 
@@ -179,7 +182,7 @@ class SlurmClusterResolver(ClusterResolver):
     return ClusterSpec(self._cluster_allocation)
 
   def get_task_info(self):
-    """Returns job name and task_index for the process which calls this.
+    """Returns job name and task_id for the process which calls this.
 
     This returns the job name and task index for the process which calls this
     function according to its rank and cluster specification. The job name and
@@ -190,14 +193,14 @@ class SlurmClusterResolver(ClusterResolver):
       A string specifying job name the process belongs to and an integner
         specifying the task index the process belongs to in that job.
     """
-    return self.task_type, self.task_index
+    return self.task_type, self.task_id
 
-  def master(self, task_type=None, task_index=None, rpc_layer=None):
+  def master(self, task_type=None, task_id=None, rpc_layer=None):
     """Returns the master string for connecting to a TensorFlow master.
 
     Args:
       task_type: (Optional) Overrides the default auto-selected task type.
-      task_index: (Optional) Overrides the default auto-slected task index.
+      task_id: (Optional) Overrides the default auto-slected task index.
       rpc_layer: (Optional) Overrides the default RPC protocol TensorFlow uses
         to communicate across nodes.
 
@@ -205,27 +208,20 @@ class SlurmClusterResolver(ClusterResolver):
       A connection string for connecting to a TensorFlow master.
     """
     task_type = task_type if task_type is not None else self.task_type
-    task_index = task_index if task_index is not None else self.task_index
-    rpc_layer = rpc_layer or self.rpc_layer
-    master = self.cluster_spec().task_address(task_type, task_index)
+    task_id = task_id if task_id is not None else self.task_id
 
-    return '%s://%s' % (rpc_layer, master) if rpc_layer else master
+    if task_type is not None and task_id is not None:
+      return format_master_url(
+          self.cluster_spec().task_address(task_type, task_id),
+          rpc_layer or self.rpc_layer)
 
-  @property
-  def environment(self):
-    """Returns the current environment which TensorFlow is running in.
-
-    For users in the Slurm environment, the environment property is always an
-    empty string, and Google users will not use this ClusterResolver for running
-    on internal systems.
-    """
     return ''
 
   def num_accelerators(self,
                        task_type=None,
-                       task_index=None,
+                       task_id=None,
                        accelerator_type='GPU',
                        config_proto=None):
     # Unused, since this is set in __init__ manually.
-    del task_type, task_index, accelerator_type, config_proto
+    del task_type, task_id, accelerator_type, config_proto
     return self._gpus_per_node
diff --git a/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver_test.py b/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver_test.py
index 076539d16f17d64a9a28052960b61a5b99a7c9c6..c641fe60853a4b131cb6035c48e3d9f6ef9ddadf 100644
--- a/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver_test.py
+++ b/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver_test.py
@@ -83,7 +83,7 @@ class SlurmClusterResolverTest(test.TestCase):
         auto_set_gpu=False)
 
     slurm_cluster_resolver.task_type = 'worker'
-    slurm_cluster_resolver.task_index = 1
+    slurm_cluster_resolver.task_id = 1
     self.assertEqual(slurm_cluster_resolver.master(), 'grpc://t02n43:8888')
 
     slurm_cluster_resolver.rpc_layer = 'ab'
diff --git a/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver.py
index b4465714b2679f616d8730205c7ad7c020b04da6..0cd823916022f037cf8c9669cc73dd242798d986 100644
--- a/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver.py
+++ b/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver.py
@@ -24,6 +24,7 @@ import os
 
 from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver
 from tensorflow.python.training.server_lib import ClusterSpec
+from tensorflow.python.util.tf_export import tf_export
 
 _TF_CONFIG_ENV = 'TF_CONFIG'
 _SESSION_MASTER_KEY = 'session_master'
@@ -47,60 +48,54 @@ def _get_value_in_tfconfig(key, default=None):
   return tf_config[key] if key in tf_config else default
 
 
+@tf_export('distribute.cluster_resolver.TFConfigClusterResolver')
 class TFConfigClusterResolver(ClusterResolver):
   """Implementation of a ClusterResolver which reads the TF_CONFIG EnvVar."""
 
   def __init__(self,
                task_type=None,
-               task_index=None,
+               task_id=None,
                rpc_layer=None,
-               environment=None,
-               num_accelerators=0):
+               environment=None):
     """Creates a new TFConfigClusterResolver.
 
     Args:
       task_type: (String, optional) Overrides the task type specified in the
         TF_CONFIG environment variable.
-      task_index: (Integer, optional) Overrides the task index specified in the
+      task_id: (Integer, optional) Overrides the task index specified in the
         TF_CONFIG environment variable.
       rpc_layer: (String, optional) Overrides the rpc layer TensorFlow uses.
       environment: (String, optional) Overrides the environment TensorFlow
         operates in.
-      num_accelerators: (Integer, optional) Specifies the number of
-        accelerators (e.g. GPUs, TPUs, others) that each node has.
     """
-    # TODO(frankchn): num_accelerators is a stop-gap and will be removed
-    # in favor of autodetection of devices soon.
-
     self._task_type = task_type
-    self._task_index = task_index
+    self._task_id = task_id
     self._rpc_layer = rpc_layer
     self._environment = environment
-    self._num_accelerators = num_accelerators
 
   @property
   def task_type(self):
     if self._task_type is None:
       task_info = _get_value_in_tfconfig(_TASK_KEY, {})
-      return task_info['type'] if 'type' in task_info else None
+      return str(task_info['type']) if 'type' in task_info else None
     else:
-      return self._task_type
+      return str(self._task_type)
 
   @property
-  def task_index(self):
+  def task_id(self):
     if self._task_type is None:
       task_info = _get_value_in_tfconfig(_TASK_KEY, {})
-      return task_info['index'] if 'index' in task_info else None
+      return int(task_info['index']) if 'index' in task_info else None
     else:
-      return self._task_index
+      return int(self._task_id)
 
   @task_type.setter
   def task_type(self, task_type):
     self._task_type = task_type
 
-  @task_index.setter
-  def task_index(self, task_index):
-    self._task_index = task_index
+  @task_id.setter
+  def task_id(self, task_id):
+    self._task_id = task_id
 
   @property
   def environment(self):
@@ -117,16 +112,6 @@ class TFConfigClusterResolver(ClusterResolver):
   def rpc_layer(self, rpc_layer):
     self._rpc_layer = rpc_layer
 
-  def num_accelerators(self,
-                       task_type=None,
-                       task_index=None,
-                       accelerator_type='GPU',
-                       config_proto=None):
-    # TODO(frankchn): Connect to server (w/ session_config) in the future.
-    # Unused, we do not connect to another server here right now.
-    del task_type, task_index, accelerator_type, config_proto
-    return self._num_accelerators
-
   def cluster_spec(self):
     """Returns a ClusterSpec based on the TF_CONFIG environment variable.
 
@@ -138,13 +123,13 @@ class TFConfigClusterResolver(ClusterResolver):
       return ClusterSpec({})
     return ClusterSpec(tf_config['cluster'])
 
-  def master(self, task_type=None, task_index=None, rpc_layer=None):
+  def master(self, task_type=None, task_id=None, rpc_layer=None):
     """Returns the master address to use when creating a TensorFlow session.
 
     Args:
       task_type: (String, optional) Overrides and sets the task_type of the
         master.
-      task_index: (Integer, optional) Overrides and sets the task id of the
+      task_id: (Integer, optional) Overrides and sets the task id of the
         master.
       rpc_layer: (String, optional) Overrides and sets the protocol over which
         TensorFlow nodes communicate with each other.
@@ -172,7 +157,7 @@ class TFConfigClusterResolver(ClusterResolver):
     # We try to auto-detect the task type and id, but uses the user-supplied one
     # where available
     task_type = task_type if task_type is not None else self.task_type
-    task_index = task_index if task_index is not None else self.task_index
+    task_id = task_id if task_id is not None else self.task_id
 
-    return format_master_url(cluster_spec.task_address(task_type, task_index),
+    return format_master_url(cluster_spec.task_address(task_type, task_id),
                              self.rpc_layer)
diff --git a/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver_test.py b/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver_test.py
index 197eba1739017e8665588618e6b64297b310b513..65e75d4dd3486c33868565bc4e8a34ec6bde3ce1 100644
--- a/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver_test.py
+++ b/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver_test.py
@@ -150,7 +150,7 @@ class TFConfigClusterResolverTest(test.TestCase):
 
     cluster_resolver = TFConfigClusterResolver()
     self.assertEqual('ps', cluster_resolver.task_type)
-    self.assertEqual(0, cluster_resolver.task_index)
+    self.assertEqual(0, cluster_resolver.task_id)
     self.assertEqual('grpc', cluster_resolver.rpc_layer)
 
   def testParameterOverrides(self):
@@ -168,23 +168,55 @@ class TFConfigClusterResolverTest(test.TestCase):
     }
     """
 
-    cluster_resolver = TFConfigClusterResolver(task_type='ps', task_index=0,
-                                               num_accelerators=8)
+    cluster_resolver = TFConfigClusterResolver(task_type='ps', task_id=0)
 
     self.assertEqual('grpc://ps0:2222', cluster_resolver.master())
     self.assertEqual('ps', cluster_resolver.task_type)
-    self.assertEqual(0, cluster_resolver.task_index)
-    self.assertEqual(8, cluster_resolver.num_accelerators())
+    self.assertEqual(0, cluster_resolver.task_id)
 
     cluster_resolver.task_type = 'worker'
-    cluster_resolver.task_index = 1
+    cluster_resolver.task_id = 1
     cluster_resolver.rpc_layer = 'test'
 
     self.assertEqual('test://worker1:2222', cluster_resolver.master())
     self.assertEqual('worker', cluster_resolver.task_type)
-    self.assertEqual(1, cluster_resolver.task_index)
+    self.assertEqual(1, cluster_resolver.task_id)
     self.assertEqual('test', cluster_resolver.rpc_layer)
 
+  def testTaskTypeCastToString(self):
+    os.environ['TF_CONFIG'] = """
+    {
+      "cluster": {
+        "123456": ["ps0:2222", "ps1:2222"],
+        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+      },
+      "rpc_layer": "grpc",
+      "task": {
+        "type": 123456,
+        "index": 0
+      }
+    }
+    """
+    cluster_resolver = TFConfigClusterResolver()
+    self.assertEqual('123456', cluster_resolver.task_type)
+
+  def testTaskIndexCastToInteger(self):
+    os.environ['TF_CONFIG'] = """
+    {
+      "cluster": {
+        "ps": ["ps0:2222", "ps1:2222"],
+        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+      },
+      "rpc_layer": "grpc",
+      "task": {
+        "type": "ps",
+        "index": "1"
+      }
+    }
+    """
+    cluster_resolver = TFConfigClusterResolver()
+    self.assertEqual(1, cluster_resolver.task_id)
+
   def testZeroItemsInClusterSpecMasterRead(self):
     os.environ['TF_CONFIG'] = """
     {}
diff --git a/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py
index e907d6fde4f7bb63553b85c580149a8cb51c9c3b..abf628be8202a940ed738f6cb08703d6dd3ea183 100644
--- a/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py
+++ b/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py
@@ -22,17 +22,19 @@ import collections
 import os
 import re
 
+from six.moves import urllib
+from six.moves.urllib.error import URLError
 from six.moves.urllib.request import Request
 from six.moves.urllib.request import urlopen
 
-from tensorflow.python.client import session
 from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver
 from tensorflow.python.distribute.cluster_resolver.cluster_resolver import format_master_url
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import get_accelerator_devices
 from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import server_lib
 from tensorflow.python.util import compat
+from tensorflow.python.util.tf_export import tf_export
 
 _GOOGLE_API_CLIENT_INSTALLED = True
 try:
@@ -41,7 +43,6 @@ try:
 except ImportError:
   _GOOGLE_API_CLIENT_INSTALLED = False
 
-
 _GKE_ENV_VARIABLE = 'KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS'
 _ENDPOINTS_SEPARATOR = ','
 _DEFAULT_ENV_VARIABLE = 'TPU_NAME'
@@ -55,38 +56,7 @@ DeviceDetails = collections.namedtuple(
     'DeviceDetails', ['device_map', 'total_cores'])
 
 
-def _get_device_dict_and_cores(devices):
-  """Returns a dict of hosts to cores and total cores given devices names.
-
-  Returns a namedtuple with two attributes:
-    device_map: A map of host_ids to a list of core_ids.
-    total_cores: The total number of cores within the TPU system.
-
-  Args:
-    devices: A list of devices returned by session.list_devices()
-  """
-  device_map = collections.defaultdict(list)
-  num_cores = 0
-  for device in devices:
-    match = _TPU_DEVICE_REGEX.match(device.name)
-    if match:
-      host_id = match.group('host_id')
-      core_id = match.group('core_id')
-      device_map[host_id].append(core_id)
-      num_cores += 1
-  return DeviceDetails(device_map, num_cores)
-
-
-def _verify_and_return_same_core_count(device_dict):
-  """Verifies that every device in device_dict has the same number of cores."""
-  num_cores_per_host_set = (
-      {len(core_ids) for core_ids in device_dict.values()})
-  if len(num_cores_per_host_set) != 1:
-    raise RuntimeError('TPU cores on each device is not the same. This '
-                       'should never happen. Devices: {}'.format(device_dict))
-  return num_cores_per_host_set.pop()
-
-
+@tf_export('distribute.cluster_resolver.TPUClusterResolver')
 class TPUClusterResolver(ClusterResolver):
   """Cluster Resolver for Google Cloud TPUs.
 
@@ -142,6 +112,38 @@ class TPUClusterResolver(ClusterResolver):
       return False
     return True
 
+  @staticmethod
+  def _get_device_dict_and_cores(devices):
+    """Returns a dict of hosts to cores and total cores given devices names.
+
+    Returns a namedtuple with two attributes:
+      device_map: A map of host_ids to a list of core_ids.
+      total_cores: The total number of cores within the TPU system.
+
+    Args:
+      devices: A list of devices returned by session.list_devices()
+    """
+    device_map = collections.defaultdict(list)
+    num_cores = 0
+    for device in devices:
+      match = _TPU_DEVICE_REGEX.match(device.name)
+      if match:
+        host_id = match.group('host_id')
+        core_id = match.group('core_id')
+        device_map[host_id].append(core_id)
+        num_cores += 1
+    return DeviceDetails(device_map, num_cores)
+
+  @staticmethod
+  def _verify_and_return_same_core_count(device_dict):
+    """Verifies that every device in device_dict has the same # of cores."""
+    num_cores_per_host_set = (
+        {len(core_ids) for core_ids in device_dict.values()})
+    if len(num_cores_per_host_set) != 1:
+      raise RuntimeError('TPU cores on each device is not the same. This '
+                         'should never happen. Devices: {}'.format(device_dict))
+    return num_cores_per_host_set.pop()
+
   @staticmethod
   def _inGke():
     """When running in GKE, the environment variable will be set."""
@@ -161,6 +163,20 @@ class TPUClusterResolver(ClusterResolver):
   def _environmentDiscoveryUrl():
     return os.environ.get(_DISCOVERY_SERVICE_URL_ENV_VARIABLE)
 
+  @staticmethod
+  def _isRunningInGCE():
+    """Checks for GCE presence by attempting to query the metadata service."""
+    try:
+      req = Request('http://metadata.google.internal/computeMetadata/v1',
+                    headers={'Metadata-Flavor': 'Google'})
+      resp = urllib.request.urlopen(req, timeout=1)
+      info = resp.info()
+      if 'Metadata-Flavor' in info and info['Metadata-Flavor'] == 'Google':
+        return True
+    except URLError:
+      pass
+    return False
+
   def __init__(self,
                tpu=None,
                zone=None,
@@ -177,11 +193,12 @@ class TPUClusterResolver(ClusterResolver):
     for the IP addresses and ports of each Cloud TPU listed.
 
     Args:
-      tpu: Either a string, or a list of strings corresponding to the TPUs to
-        use. If the single string is the empty string, the string 'local', or a
-        string that begins with 'grpc://' or '/bns', then it is assumed to not
-        correspond with a Cloud TPU and will instead be passed as the session
-        master and no ClusterSpec propagation will be done.
+      tpu: A string corresponding to the TPU to use. If the string is the empty
+        string, the string 'local', or a string that begins with 'grpc://' or
+        '/bns', then it is assumed to not correspond with a Cloud TPU and will
+        instead be passed as the session master and no ClusterSpec propagation
+        will be done. In the future, this may also support a list of strings
+        when multiple Cloud TPUs are used.
       zone: Zone where the TPUs are located. If omitted or empty, we will assume
         that the zone of the TPU is the same as the zone of the GCE VM, which we
         will try to discover from the GCE metadata service.
@@ -209,6 +226,8 @@ class TPUClusterResolver(ClusterResolver):
     Raises:
       ImportError: If the googleapiclient is not installed.
       ValueError: If no TPUs are specified.
+      RuntimeError: If an empty TPU name is specified and this is running in a
+        Google Cloud environment.
     """
     if isinstance(tpu, list):
       if not tpu:
@@ -231,10 +250,15 @@ class TPUClusterResolver(ClusterResolver):
 
     self._tpu = compat.as_bytes(tpu)  # self._tpu is always bytes
 
-    # By default the task_type is 'worker` and the task_index is 0 (which is the
+    # If we are running in Cloud and don't specify a TPU name
+    if self._isRunningInGCE() and not self._tpu:
+      raise RuntimeError('You need to specify a TPU Name if you are running in '
+                         'the Google Cloud environment.')
+
+    # By default the task_type is 'worker` and the task_id is 0 (which is the
     # first worker in the task).
     self.task_type = job_name
-    self.task_index = 0
+    self.task_id = 0
 
     if tpu.startswith('grpc://'):
       # Cloud environment, where we are using GRPC to communicate to TPUs.
@@ -262,7 +286,7 @@ class TPUClusterResolver(ClusterResolver):
     # in later in self.master().
     if self.rpc_layer is not None and tpu.startswith(self.rpc_layer + '://'):
       tpu = tpu[len(self.rpc_layer + '://'):]
-      self._tpu = tpu
+      self._tpu = compat.as_bytes(tpu)  # self._tpu is always bytes
       self._should_resolve_override = False
 
     # Whether we should actually attempt to contact Cloud APIs
@@ -304,7 +328,7 @@ class TPUClusterResolver(ClusterResolver):
     else:
       self._coordinator_address = coordinator_address
 
-  def master(self, task_type=None, task_index=None, rpc_layer=None):
+  def master(self, task_type=None, task_id=None, rpc_layer=None):
     """Get the Master string to be used for the session.
 
     In the normal case, this returns the grpc path (grpc://1.2.3.4:8470) of
@@ -318,7 +342,7 @@ class TPUClusterResolver(ClusterResolver):
     Args:
       task_type: (Optional, string) The type of the TensorFlow task of the
         master.
-      task_index: (Optional, integer) The index of the TensorFlow task of the
+      task_id: (Optional, integer) The index of the TensorFlow task of the
         master.
       rpc_layer: (Optional, string) The RPC protocol TensorFlow should use to
         communicate with TPUs.
@@ -332,12 +356,12 @@ class TPUClusterResolver(ClusterResolver):
     if self._shouldResolve():
       # We are going to communicate with the Cloud TPU APIs to get a Cluster.
       cluster_spec = self.cluster_spec()
-      if task_type is not None and task_index is not None:
-        # task_type and task_index is from the function parameter
-        master = cluster_spec.task_address(task_type, task_index)
-      elif self.task_type is not None and self.task_index is not None:
-        # task_type and task_index is from the object
-        master = cluster_spec.task_address(self.task_type, self.task_index)
+      if task_type is not None and task_id is not None:
+        # task_type and task_id is from the function parameter
+        master = cluster_spec.task_address(task_type, task_id)
+      elif self.task_type is not None and self.task_id is not None:
+        # task_type and task_id is from the object
+        master = cluster_spec.task_address(self.task_type, self.task_id)
       else:
         # by default we take the first item in the cluster with the right name
         job_tasks = cluster_spec.job_tasks(self.task_type)
@@ -346,7 +370,7 @@ class TPUClusterResolver(ClusterResolver):
         master = job_tasks[0]
     else:
       if isinstance(self._tpu, (bytes, bytearray)):
-        master = self._tpu.split(compat.as_bytes(_ENDPOINTS_SEPARATOR))[0]
+        master = compat.as_text(self._tpu).split(_ENDPOINTS_SEPARATOR)[0]
       else:
         master = self._tpu.split(_ENDPOINTS_SEPARATOR)[0]
     return format_master_url(master, rpc_layer or self.rpc_layer)
@@ -355,7 +379,8 @@ class TPUClusterResolver(ClusterResolver):
     return self.master()
 
   def get_job_name(self):
-    if self._shouldResolve():
+    if (self._shouldResolve() or
+        self._isRunningInGCE()):
       return self.task_type
 
   def cluster_spec(self):
@@ -415,7 +440,7 @@ class TPUClusterResolver(ClusterResolver):
         return None
       # Case 2.
       tpus = []
-      for tpu in self._tpu.split(_ENDPOINTS_SEPARATOR):
+      for tpu in compat.as_text(self._tpu).split(_ENDPOINTS_SEPARATOR):
         # We are working around the fact that GKE environment variable that is
         # supplied to us has the protocol string embedded in it, but we want
         # to strip it out for the ClusterSpec.
@@ -434,7 +459,7 @@ class TPUClusterResolver(ClusterResolver):
 
   def num_accelerators(self,
                        task_type=None,
-                       task_index=None,
+                       task_id=None,
                        accelerator_type='TPU',
                        config_proto=None):
     """Returns the number of TPU cores per worker.
@@ -445,23 +470,22 @@ class TPUClusterResolver(ClusterResolver):
 
     Args:
       task_type: Unused.
-      task_index: Unused.
+      task_id: Unused.
       accelerator_type: Unused.
       config_proto: Used to create a connection to a TPU master in order to
         retrieve the system metadata.
 
     Raises:
-      RuntimeError: If this is used with a non-TPU accelerator_type.
+      RuntimeError: If we cannot talk to a TPU worker after retrying or if the
+        number of TPU devices per host is different.
     """
     retry_count = 1
     # TODO(b/120564445): Replace with standard library for retries.
     while True:
       try:
-        with ops.Graph().as_default():
-          with session.Session(self.master(), config=config_proto) as s:
-            devices = s.list_devices()
-            device_details = _get_device_dict_and_cores(devices)
-            break
+        device_details = TPUClusterResolver._get_device_dict_and_cores(
+            get_accelerator_devices(self.master(), config_proto=config_proto))
+        break
       except errors.DeadlineExceededError:
         error_message = ('Failed to connect to master. The TPU might not be '
                          'ready (e.g. still scheduling) or the master '
@@ -474,7 +498,8 @@ class TPUClusterResolver(ClusterResolver):
           raise RuntimeError(error_message)
 
     if device_details.total_cores:
-      return _verify_and_return_same_core_count(device_details.device_map)
+      return TPUClusterResolver._verify_and_return_same_core_count(
+          device_details.device_map)
     return 0
 
   @property
@@ -483,7 +508,8 @@ class TPUClusterResolver(ClusterResolver):
     return self._environment
 
   def _start_local_server(self):
-    address = self._requestComputeMetadata('instance/network-interfaces/0/ip')
+    address = compat.as_text(self._requestComputeMetadata(
+        'instance/network-interfaces/0/ip'))
     self._server = server_lib.Server(
         {
             'local': ['0.0.0.0:0']
diff --git a/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver_test.py b/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver_test.py
index 27d92608fa2db95944c94160d716a033ab2f78a2..7f06dc168bfea5e2dc12b33ce6ac35cf2fcbccd2 100644
--- a/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver_test.py
+++ b/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver_test.py
@@ -20,9 +20,11 @@ from __future__ import print_function
 
 import os
 
+import six
+from six.moves.urllib.error import URLError
+
 from tensorflow.python.client import session
-from tensorflow.python.distribute import cluster_resolver
-from tensorflow.python.distribute.cluster_resolver import tpu_cluster_resolver
+from tensorflow.python.distribute.cluster_resolver import TPUClusterResolver
 from tensorflow.python.framework import errors
 from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
@@ -64,6 +66,28 @@ def mock_request_compute_metadata(cls, *args, **kwargs):
   return ''
 
 
+def mock_is_running_in_gce(cls, *args, **kwargs):
+  del cls, args, kwargs  # Unused.
+  return True
+
+
+def mock_is_not_running_in_gce(cls, *args, **kwargs):
+  del cls, args, kwargs  # Unused.
+  return False
+
+
+def mock_running_in_gce_urlopen(cls, *args, **kwargs):
+  del cls, args, kwargs  # Unused.
+  mock_response = mock.MagicMock()
+  mock_response.info.return_value = {'Metadata-Flavor': 'Google'}
+  return mock_response
+
+
+def mock_not_running_in_gce_urlopen(cls, *args, **kwargs):
+  del cls, args, kwargs  # Unused.
+  raise URLError(reason='Host does not exist.')
+
+
 class TPUClusterResolverTest(test.TestCase):
 
   def _verifyClusterSpecEquality(self, cluster_spec, expected_proto):
@@ -104,7 +128,26 @@ class TPUClusterResolverTest(test.TestCase):
 
     return mock_client
 
-  @mock.patch.object(cluster_resolver.TPUClusterResolver,
+  @mock.patch.object(TPUClusterResolver,
+                     '_isRunningInGCE',
+                     mock_is_running_in_gce)
+  def testCheckRunningInGceWithNoTpuName(self):
+    with self.assertRaisesRegexp(RuntimeError, '.*Google Cloud.*'):
+      TPUClusterResolver(tpu='')
+
+  @mock.patch.object(six.moves.urllib.request,
+                     'urlopen',
+                     mock_running_in_gce_urlopen)
+  def testIsRunningInGce(self):
+    self.assertTrue(TPUClusterResolver._isRunningInGCE())
+
+  @mock.patch.object(six.moves.urllib.request,
+                     'urlopen',
+                     mock_not_running_in_gce_urlopen)
+  def testIsNotRunningInGce(self):
+    self.assertFalse(TPUClusterResolver._isRunningInGCE())
+
+  @mock.patch.object(TPUClusterResolver,
                      '_requestComputeMetadata',
                      mock_request_compute_metadata)
   def testRetrieveProjectAndZoneFromMetadata(self):
@@ -116,7 +159,7 @@ class TPUClusterResolverTest(test.TestCase):
         }
     }
 
-    resolver = cluster_resolver.TPUClusterResolver(
+    resolver = TPUClusterResolver(
         project=None,
         zone=None,
         tpu=['test-tpu-1'],
@@ -138,7 +181,7 @@ class TPUClusterResolverTest(test.TestCase):
     self._verifyClusterSpecEquality(actual_cluster_spec, str(expected_proto))
     self.assertEqual(resolver.master(), 'grpc://10.1.2.3:8470')
 
-  @mock.patch.object(cluster_resolver.TPUClusterResolver,
+  @mock.patch.object(TPUClusterResolver,
                      '_requestComputeMetadata',
                      mock_request_compute_metadata)
   def testRetrieveProjectAndZoneFromMetadataNoCoordinator(self):
@@ -150,7 +193,7 @@ class TPUClusterResolverTest(test.TestCase):
         }
     }
 
-    resolver = cluster_resolver.TPUClusterResolver(
+    resolver = TPUClusterResolver(
         project=None,
         zone=None,
         tpu=['test-tpu-1'],
@@ -165,7 +208,7 @@ class TPUClusterResolverTest(test.TestCase):
     self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
     self.assertEqual(resolver.master(), 'grpc://10.1.2.3:8470')
 
-  @mock.patch.object(cluster_resolver.TPUClusterResolver,
+  @mock.patch.object(TPUClusterResolver,
                      '_requestComputeMetadata',
                      mock_request_compute_metadata)
   def testUnhealthyCloudTpu(self):
@@ -177,7 +220,7 @@ class TPUClusterResolverTest(test.TestCase):
         }
     }
 
-    resolver = cluster_resolver.TPUClusterResolver(
+    resolver = TPUClusterResolver(
         project=None,
         zone=None,
         tpu='test-tpu-1',
@@ -188,7 +231,7 @@ class TPUClusterResolverTest(test.TestCase):
     with self.assertRaises(RuntimeError):
       resolver.cluster_spec()
 
-  @mock.patch.object(cluster_resolver.TPUClusterResolver,
+  @mock.patch.object(TPUClusterResolver,
                      '_requestComputeMetadata',
                      mock_request_compute_metadata)
   def testNotReadyCloudTpu(self):
@@ -200,7 +243,7 @@ class TPUClusterResolverTest(test.TestCase):
         }
     }
 
-    resolver = cluster_resolver.TPUClusterResolver(
+    resolver = TPUClusterResolver(
         project=None,
         zone=None,
         tpu='test-tpu-1',
@@ -220,7 +263,7 @@ class TPUClusterResolverTest(test.TestCase):
         }
     }
 
-    resolver = cluster_resolver.TPUClusterResolver(
+    resolver = TPUClusterResolver(
         project='test-project',
         zone='us-central1-c',
         tpu=['test-tpu-1'],
@@ -248,7 +291,7 @@ class TPUClusterResolverTest(test.TestCase):
         }
     }
 
-    resolver = cluster_resolver.TPUClusterResolver(
+    resolver = TPUClusterResolver(
         project='test-project',
         zone='us-central1-c',
         tpu='test-tpu-1',
@@ -265,7 +308,7 @@ class TPUClusterResolverTest(test.TestCase):
     self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
     self.assertEqual('grpc://10.2.3.4:8470', resolver.master())
 
-  @mock.patch.object(cluster_resolver.TPUClusterResolver,
+  @mock.patch.object(TPUClusterResolver,
                      '_requestComputeMetadata',
                      mock_request_compute_metadata)
   def testPodResolution(self):
@@ -294,7 +337,7 @@ class TPUClusterResolverTest(test.TestCase):
         }
     }
 
-    resolver = cluster_resolver.TPUClusterResolver(
+    resolver = TPUClusterResolver(
         tpu='test-tpu-1',
         credentials=None,
         service=self.mock_service_client(tpu_map=tpu_map),
@@ -343,7 +386,7 @@ class TPUClusterResolverTest(test.TestCase):
         }
     }
 
-    resolver = cluster_resolver.TPUClusterResolver(
+    resolver = TPUClusterResolver(
         project='test-project',
         zone='us-central1-c',
         tpu='test-tpu-1',
@@ -368,7 +411,7 @@ class TPUClusterResolverTest(test.TestCase):
     tpu_map = {}
 
     with self.assertRaises(ValueError):
-      cluster_resolver.TPUClusterResolver(
+      TPUClusterResolver(
           project='test-project',
           zone='us-central1-c',
           tpu=[],
@@ -378,7 +421,7 @@ class TPUClusterResolverTest(test.TestCase):
 
   # TODO(saeta): Convert to parameterized test when included in OSS TF.
   def verifyShouldResolve(self, tpu, should_resolve):
-    resolver = cluster_resolver.TPUClusterResolver(
+    resolver = TPUClusterResolver(
         project='test-project',
         zone='us-central1-c',
         tpu=tpu,
@@ -388,6 +431,9 @@ class TPUClusterResolverTest(test.TestCase):
     self.assertEqual(should_resolve, resolver._shouldResolve(),
                      "TPU: '%s'" % tpu)
 
+  @mock.patch.object(TPUClusterResolver,
+                     '_isRunningInGCE',
+                     mock_is_not_running_in_gce)
   def testShouldResolveNoName(self):
     self.verifyShouldResolve('', False)
 
@@ -410,22 +456,21 @@ class TPUClusterResolverTest(test.TestCase):
     self.verifyShouldResolve('grpctpu', True)
 
   def testNoCallComputeMetadata(self):
-    resolver = cluster_resolver.TPUClusterResolver(
+    resolver = TPUClusterResolver(
         tpu='/bns/foo/bar')
-    self.assertEqual(
-        compat.as_bytes('/bns/foo/bar'), resolver.master())
+    self.assertEqual('/bns/foo/bar', resolver.master())
     self.assertEqual(None, resolver.cluster_spec())
 
   def testGkeEnvironmentForDonut(self):
     os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS'] = 'grpc://10.120.27.5:8470'
 
     self.assertIn('KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS', os.environ)
-    self.assertTrue(cluster_resolver.TPUClusterResolver._inGke())
+    self.assertTrue(TPUClusterResolver._inGke())
     self.assertEqual(
         compat.as_bytes('grpc://10.120.27.5:8470'),
-        compat.as_bytes(cluster_resolver.TPUClusterResolver._gkeEndpoints()))
+        compat.as_bytes(TPUClusterResolver._gkeEndpoints()))
 
-    resolver = cluster_resolver.TPUClusterResolver()
+    resolver = TPUClusterResolver()
     self.assertEqual(
         compat.as_bytes('grpc://10.120.27.5:8470'),
         compat.as_bytes(resolver.master()))
@@ -447,15 +492,15 @@ class TPUClusterResolverTest(test.TestCase):
                                                      'grpc://10.120.27.8:8470')
 
     self.assertIn('KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS', os.environ)
-    self.assertTrue(cluster_resolver.TPUClusterResolver._inGke())
+    self.assertTrue(TPUClusterResolver._inGke())
     self.assertEqual(
         compat.as_bytes('grpc://10.120.27.5:8470,'
                         'grpc://10.120.27.6:8470,'
                         'grpc://10.120.27.7:8470,'
                         'grpc://10.120.27.8:8470'),
-        compat.as_bytes(cluster_resolver.TPUClusterResolver._gkeEndpoints()))
+        compat.as_bytes(TPUClusterResolver._gkeEndpoints()))
 
-    resolver = cluster_resolver.TPUClusterResolver()
+    resolver = TPUClusterResolver()
     self.assertEqual(
         compat.as_bytes('grpc://10.120.27.5:8470'),
         compat.as_bytes(resolver.master()))
@@ -476,17 +521,17 @@ class TPUClusterResolverTest(test.TestCase):
   def testEnvironmentDiscoveryUrl(self):
     os.environ['TPU_API_DISCOVERY_URL'] = 'https://{api}.internal/{apiVersion}'
     self.assertEqual('https://{api}.internal/{apiVersion}',
-                     (cluster_resolver.TPUClusterResolver.
+                     (TPUClusterResolver.
                       _environmentDiscoveryUrl()))
 
   def testEnvironmentAndRpcDetectionForGoogle(self):
-    resolver = cluster_resolver.TPUClusterResolver(
+    resolver = TPUClusterResolver(
         tpu='/bns/ab/cd/ef')
     self.assertEqual(resolver.environment, 'google')
     self.assertEqual(resolver.rpc_layer, None)
 
   def testEnvironmentAndRpcDetectionForGrpcString(self):
-    resolver = cluster_resolver.TPUClusterResolver(
+    resolver = TPUClusterResolver(
         tpu='grpc://10.1.2.3:8470')
     self.assertEqual(resolver.environment, '')
     self.assertEqual(resolver.rpc_layer, 'grpc')
@@ -518,7 +563,7 @@ class TPUClusterResolverTest(test.TestCase):
         }
     }
 
-    resolver = cluster_resolver.TPUClusterResolver(
+    resolver = TPUClusterResolver(
         project='test-project',
         zone='us-central1-c',
         tpu='test-tpu-1',
@@ -529,12 +574,12 @@ class TPUClusterResolverTest(test.TestCase):
     self.assertEqual(resolver.master(), 'grpc://10.2.3.4:8470')
 
     resolver.task_type = 'worker'
-    resolver.task_index = 3
+    resolver.task_id = 3
     self.assertEqual(resolver.master(), 'grpc://10.2.3.7:8470')
 
     self.assertEqual(
         resolver.master(
-            task_type='worker', task_index=2, rpc_layer='test'),
+            task_type='worker', task_id=2, rpc_layer='test'),
         'test://10.2.3.6:8470')
 
   def testGetDeviceDictAndCoresWithTPUs(self):
@@ -553,7 +598,7 @@ class TPUClusterResolverTest(test.TestCase):
             name, 'TPU', 1024, 0) for name in device_names
     ]
 
-    device_details = tpu_cluster_resolver._get_device_dict_and_cores(
+    device_details = TPUClusterResolver._get_device_dict_and_cores(
         device_list)
     self.assertEqual(device_details.total_cores, 8)
     self.assertEqual(device_details.device_map,
@@ -578,23 +623,26 @@ class TPUClusterResolverTest(test.TestCase):
             name, 'XLA', 1024, 0) for name in device_names
     ]
 
-    device_dict, num_cores = tpu_cluster_resolver._get_device_dict_and_cores(
+    device_dict, num_cores = TPUClusterResolver._get_device_dict_and_cores(
         device_list)
     self.assertEqual(num_cores, 0)
     self.assertEqual(device_dict, {})
 
   def testVerifySameCoreCount(self):
     self.assertEqual(
-        tpu_cluster_resolver._verify_and_return_same_core_count(
+        TPUClusterResolver._verify_and_return_same_core_count(
             {0: [0, 1, 2, 3, 4, 5, 6, 7]}), 8)
     self.assertEqual(
-        tpu_cluster_resolver._verify_and_return_same_core_count(
+        TPUClusterResolver._verify_and_return_same_core_count(
             {0: [0, 1], 1: [2, 3]}), 2)
     with self.assertRaises(RuntimeError):
-      tpu_cluster_resolver._verify_and_return_same_core_count(
+      TPUClusterResolver._verify_and_return_same_core_count(
           {0: [0], 1: [1, 2]})
 
   @mock.patch.object(session.BaseSession, 'list_devices')
+  @mock.patch.object(TPUClusterResolver,
+                     '_isRunningInGCE',
+                     mock_is_not_running_in_gce)
   def testNumAcceleratorsSuccess(self, mock_list_devices):
     device_names = [
         '/job:tpu_worker/task:0/device:TPU:0',
@@ -612,12 +660,15 @@ class TPUClusterResolverTest(test.TestCase):
     ]
     mock_list_devices.return_value = device_list
 
-    resolver = cluster_resolver.TPUClusterResolver(tpu='')
+    resolver = TPUClusterResolver(tpu='')
     self.assertEqual(resolver.num_accelerators(), 2)
 
   @mock.patch.object(session.BaseSession, 'list_devices')
+  @mock.patch.object(TPUClusterResolver,
+                     '_isRunningInGCE',
+                     mock_is_not_running_in_gce)
   def testNumAcceleratorsRetryFailure(self, mock_list_devices):
-    resolver = cluster_resolver.TPUClusterResolver(tpu='')
+    resolver = TPUClusterResolver(tpu='')
     mock_list_devices.side_effect = errors.DeadlineExceededError(
         None, None, 'timeout')
     with self.assertRaises(RuntimeError):
diff --git a/tensorflow/python/distribute/collective_all_reduce_strategy.py b/tensorflow/python/distribute/collective_all_reduce_strategy.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9d092784d9c841124e2c8af6f2a39b53ad4f498
--- /dev/null
+++ b/tensorflow/python/distribute/collective_all_reduce_strategy.py
@@ -0,0 +1,413 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Class CollectiveAllReduceStrategy implementing DistributionStrategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+
+from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib
+from tensorflow.python.distribute import cross_device_utils
+from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import input_lib
+from tensorflow.python.distribute import mirrored_strategy
+from tensorflow.python.distribute import multi_worker_util
+from tensorflow.python.distribute import numpy_dataset
+from tensorflow.python.distribute import reduce_util
+from tensorflow.python.distribute import values
+from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver
+from tensorflow.python.distribute.cluster_resolver import TFConfigClusterResolver
+from tensorflow.python.eager import context
+from tensorflow.python.eager import tape
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import collective_ops
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
+
+
+# TODO(yuefengz): support in-graph replication.
+@tf_export("distribute.experimental.MultiWorkerMirroredStrategy")
+class CollectiveAllReduceStrategy(distribute_lib.DistributionStrategy):
+  """Distribution strategy that uses collective ops for all-reduce.
+
+  It is similar to MirroredStrategy but it uses collective ops for reduction.
+
+  By default it uses all local GPUs or CPU for single-worker training.
+
+  When 'TF_CONFIG' environment variable is given, it parses cluster_spec,
+  task_type and task_id from 'TF_CONFIG' and turns into a multi-worker strategy
+  which mirrores models on GPUs of all machines in a cluster. In the current
+  implementation, it uses all GPUs in a cluster and it assumes all workers have
+  the same number of GPUs.
+  """
+
+  def __init__(self):
+    """Initializes the object."""
+    super(CollectiveAllReduceStrategy, self).__init__(
+        CollectiveAllReduceExtended(self))
+
+
+class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended):
+  """Implementation of CollectiveAllReduceStrategy."""
+
+  def __init__(self,
+               container_strategy,
+               cluster_resolver=TFConfigClusterResolver()):
+    distribute_lib.DistributionStrategyExtended.__init__(
+        self, container_strategy)
+    self._cross_device_ops = None
+    self._initialize_strategy(cluster_resolver)
+    assert isinstance(self._get_cross_device_ops(),
+                      cross_device_ops_lib.CollectiveAllReduce)
+
+  def _initialize_strategy(self, cluster_resolver):
+    if cluster_resolver.cluster_spec().as_dict():
+      self._initialize_multi_worker(cluster_resolver)
+    else:
+      self._initialize_local(cluster_resolver)
+    # Save the num_gpus_per_worker for configure method.
+    self._num_gpus_per_worker = cluster_resolver.num_accelerators()
+
+  def _initialize_local(self, cluster_resolver):
+    """Initializes the object for local training."""
+    self._is_chief = True
+    self._num_workers = 1
+
+    num_gpus = cluster_resolver.num_accelerators()
+    if num_gpus:
+      local_devices = tuple("/device:GPU:%d" % i for i in range(num_gpus))
+    else:
+      local_devices = ("/device:CPU:0",)
+    self._worker_device = device_util.canonicalize("/device:CPU:0")
+    self._host_input_device = numpy_dataset.SingleDevice(self._worker_device)
+
+    self._collective_keys = cross_device_utils.CollectiveKeys()
+    super(CollectiveAllReduceExtended, self)._initialize_local(local_devices)
+    # TODO(yuefengz): remove num_gpus_per_worker from CollectiveAllReduce.
+    self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
+        num_workers=self._num_workers,
+        num_gpus_per_worker=num_gpus,
+        collective_keys=self._collective_keys)
+
+    self._cluster_spec = None
+    self._task_type = None
+    self._task_id = None
+
+    logging.info("CollectiveAllReduceStrategy with local_devices = %r",
+                 local_devices)
+
+  def _initialize_multi_worker(self, cluster_resolver):
+    """Initializes the object for multi-worker training."""
+    # TODO(yuefengz): The `num_gpus` is only for this particular task. It
+    # assumes all workers have the same number of GPUs. We should remove this
+    # assumption by querying all tasks for their numbers of GPUs.
+    num_gpus = cluster_resolver.num_accelerators()
+    cluster_spec = multi_worker_util.normalize_cluster_spec(
+        cluster_resolver.cluster_spec())
+    task_type = cluster_resolver.task_type
+    task_id = cluster_resolver.task_id
+    if task_type is None or task_id is None:
+      raise ValueError("When `cluster_spec` is given, you must also specify "
+                       "`task_type` and `task_id` in the `cluster_resolver`.")
+    if task_type not in ("chief", "worker"):
+      raise ValueError(
+          "Unrecognized task_type: %r, valid task types are: \"chief\", "
+          "\"worker\"." % task_type)
+
+    self._num_workers = multi_worker_util.worker_count(cluster_spec, task_type)
+    if not self._num_workers:
+      raise ValueError("No `worker` or `chief` tasks can be found in "
+                       "`cluster_spec`.")
+
+    self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
+                                                task_id)
+
+    self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
+    self._host_input_device = numpy_dataset.SingleDevice(self._worker_device)
+    if num_gpus:
+      local_devices = tuple("%s/device:GPU:%d" % (self._worker_device, i)
+                            for i in range(num_gpus))
+    else:
+      local_devices = (self._worker_device,)
+
+    self._collective_keys = cross_device_utils.CollectiveKeys()
+    super(CollectiveAllReduceExtended, self)._initialize_local(local_devices)
+    self._input_workers = input_lib.InputWorkers(
+        self._device_map, [(self._worker_device, self.worker_devices)])
+    self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
+        num_workers=self._num_workers,
+        num_gpus_per_worker=num_gpus,
+        collective_keys=self._collective_keys)
+
+    # Add a default device so that ops without specified devices will not end up
+    # on other workers.
+    self._default_device = "/job:%s/task:%d" % (task_type, task_id)
+
+    self._cluster_spec = cluster_spec
+    self._task_type = task_type
+    self._task_id = task_id
+
+    logging.info(
+        "Multi-worker CollectiveAllReduceStrategy with "
+        "cluster_spec = %r, task_type = %r, task_id = %r, "
+        "num_workers = %r, local_devices = %r", cluster_spec.as_dict(),
+        task_type, task_id, self._num_workers, local_devices)
+
+  def _create_variable(self, next_creator, *args, **kwargs):
+    colocate_with = kwargs.pop("colocate_with", None)
+    if colocate_with is None:
+      device_map = self._device_map
+      logical_device = 0  # TODO(josh11b): Get logical device from scope here.
+    elif isinstance(colocate_with, numpy_dataset.SingleDevice):
+      with ops.device(colocate_with.device):
+        return next_creator(*args, **kwargs)
+    else:
+      device_map = colocate_with.device_map
+      logical_device = colocate_with.logical_device
+
+    def _real_mirrored_creator(devices, *args, **kwargs):
+      """Creates one MirroredVariable on the current worker."""
+      unique_var_name = ops.get_default_graph().unique_name(
+          kwargs["name"], mark_as_used=False).rstrip("/")
+      # pylint: disable=protected-access
+      collective_instance_key = self._collective_keys.get_instance_key(
+          key_id=unique_var_name)
+      # Only the first device participles in the broadcast of initial values.
+      group_key = self._collective_keys.get_group_key([devices[0]])
+      group_size = self._num_workers
+      if "initial_value" not in kwargs:
+        raise ValueError("Initial value must be specified.")
+      initial_value = kwargs["initial_value"]
+      if callable(initial_value):
+        initial_value_fn = initial_value
+      else:
+        initial_value_fn = lambda: initial_value
+
+      value_list = []
+      for i, d in enumerate(devices):
+        with ops.init_scope(), ops.device(d):
+          if i == 0:
+            # The initial value fn makes sure variables all initialized to
+            # same values. The first device of the chief worker will send their
+            # variable values to other workers.
+            def _overridden_initial_value_fn(device=d, index=i):  # pylint: disable=g-missing-docstring
+              with ops.device(device):
+                initial_value = initial_value_fn()
+                assert not callable(initial_value)
+                initial_value = ops.convert_to_tensor(initial_value)
+
+                assert index == 0, index
+                if self._num_workers > 1:
+                  if self._is_chief:
+                    bcast_send = collective_ops.broadcast_send(
+                        initial_value, initial_value.shape, initial_value.dtype,
+                        group_size, group_key, collective_instance_key)
+                    with ops.control_dependencies([bcast_send]):
+                      return array_ops.identity(initial_value)
+                  else:
+                    return collective_ops.broadcast_recv(
+                        initial_value.shape, initial_value.dtype, group_size,
+                        group_key, collective_instance_key)
+                return initial_value
+          else:
+            # Give replicas meaningful distinct names:
+            var0name = value_list[0].name.split(":")[0]
+            # We append a / to variable names created on replicas with id > 0 to
+            # ensure that we ignore the name scope and instead use the given
+            # name as the absolute name of the variable.
+            kwargs["name"] = "%s/replica_%d/" % (var0name, i)
+
+            # Variables on non-first replica get initial values from the
+            # variables created on the first device of each worker.
+            def _overridden_initial_value_fn(device=d, index=i):
+              assert index > 0
+              with ops.device(device):
+                if context.executing_eagerly():
+                  return array_ops.identity(value_list[0].value())
+                else:
+                  return array_ops.identity(value_list[0].initial_value)
+
+          kwargs["initial_value"] = _overridden_initial_value_fn
+          with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
+            # Don't record operations (e.g. other variable reads) during
+            # variable creation.
+            with tape.stop_recording():
+              v = next_creator(*args, **kwargs)
+
+          if i == 0:
+            actual_var_name = v.name.split(":")[0]
+            assert unique_var_name == actual_var_name, "%r vs %r" % (
+                unique_var_name, actual_var_name)
+          assert not isinstance(v, values.DistributedVariable)
+          value_list.append(v)
+      return value_list
+
+    # pylint: disable=protected-access
+    return mirrored_strategy._create_mirrored_variable(
+        self._container_strategy(), device_map, logical_device,
+        _real_mirrored_creator, *args, **kwargs)
+
+  def _make_dataset_iterator(self, dataset):
+    return input_lib.DatasetIterator(dataset, self._input_workers,
+                                     self._num_replicas_in_sync)
+
+  def _make_input_fn_iterator(
+      self,
+      input_fn,
+      replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
+    """Distributes the dataset to each local GPU."""
+    if self._cluster_spec is None:
+      input_pipeline_id = 0
+    else:
+      input_pipeline_id = multi_worker_util.id_in_cluster(
+          self._cluster_spec, self._task_type, self._task_id)
+    input_context = distribute_lib.InputContext(
+        num_input_pipelines=self._num_workers,
+        input_pipeline_id=input_pipeline_id,
+        num_replicas_in_sync=self._num_replicas_in_sync)
+
+    return input_lib.InputFunctionIterator(
+        input_fn, self._input_workers, [input_context])
+
+  def _configure(self,
+                 session_config=None,
+                 cluster_spec=None,
+                 task_type=None,
+                 task_id=None):
+    """Configures the object.
+
+    Args:
+      session_config: a `tf.ConfigProto`
+      cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
+        cluster configurations.
+      task_type: the current task type, such as "worker".
+      task_id: the current task id.
+
+    Raises:
+      ValueError: if `task_type` is not in the `cluster_spec`.
+    """
+    if cluster_spec:
+      # Use the num_gpus_per_worker recorded in constructor since _configure
+      # doesn't take num_gpus.
+      cluster_resolver = SimpleClusterResolver(
+          cluster_spec=multi_worker_util.normalize_cluster_spec(cluster_spec),
+          task_type=task_type,
+          task_id=task_id,
+          num_accelerators=self._num_gpus_per_worker)
+      self._initialize_multi_worker(cluster_resolver)
+      assert isinstance(self._get_cross_device_ops(),
+                        cross_device_ops_lib.CollectiveAllReduce)
+
+    if session_config:
+      session_config.CopyFrom(self._update_config_proto(session_config))
+
+  def _update_config_proto(self, config_proto):
+    updated_config = copy.deepcopy(config_proto)
+    # Enable the scoped allocator optimization for CollectiveOps.  This
+    # optimization converts many small all-reduces into fewer larger
+    # all-reduces.
+    rewrite_options = updated_config.graph_options.rewrite_options
+    rewrite_options.scoped_allocator_optimization = (
+        rewriter_config_pb2.RewriterConfig.ON)
+    # We turn on ScopedAllocator only for CollectiveReduce op, i.e. enable_op =
+    # ["CollectiveReduce"].  Since we can't assign to a repeated proto field, we
+    # clear and then append.
+    del rewrite_options.scoped_allocator_opts.enable_op[:]
+    rewrite_options.scoped_allocator_opts.enable_op.append("CollectiveReduce")
+
+    if not self._cluster_spec:
+      return updated_config
+
+    assert self._task_type
+    assert self._task_id is not None
+
+    # Collective group leader is needed for collective ops to coordinate
+    # workers.
+    if "chief" in self._cluster_spec.jobs:
+      updated_config.experimental.collective_group_leader = (
+          "/job:chief/replica:0/task:0")
+    else:
+      if "worker" not in self._cluster_spec.jobs:
+        raise ValueError(
+            "You must have `chief` or `worker` jobs in the `cluster_spec`.")
+      updated_config.experimental.collective_group_leader = (
+          "/job:worker/replica:0/task:0")
+
+    # The device filters prevent communication between workers.
+    del updated_config.device_filters[:]
+    updated_config.device_filters.append(
+        "/job:%s/task:%d" % (self._task_type, self._task_id))
+
+    return updated_config
+
+  def _reduce_to(self, reduce_op, value, destinations):
+    if (isinstance(value, values.Mirrored) and
+        reduce_op == reduce_util.ReduceOp.MEAN):
+      return value
+    assert not isinstance(value, values.Mirrored)
+
+    if (isinstance(value, values.DistributedValues) and
+        len(self.worker_devices) == 1):
+      value = value.values[0]
+
+    # When there are multiple workers, we need to reduce across workers using
+    # collective ops.
+    if (not isinstance(value, values.DistributedValues) and
+        self._num_workers == 1):
+      # This function handles reducing values that are not PerReplica or
+      # Mirrored values. For example, the same value could be present on all
+      # replicas in which case `value` would be a single value or value could
+      # be 0.
+      return cross_device_ops_lib.reduce_non_distributed_value(
+          reduce_op, self._device_map, value, destinations)
+    return self._get_cross_device_ops().reduce(
+        reduce_op, value, destinations=destinations)
+
+  @property
+  def experimental_between_graph(self):
+    return True
+
+  @property
+  def experimental_should_init(self):
+    return True
+
+  @property
+  def should_checkpoint(self):
+    return self._is_chief
+
+  @property
+  def should_save_summary(self):
+    return self._is_chief
+
+  @property
+  def _num_replicas_in_sync(self):
+    return len(self.worker_devices) * self._num_workers
+
+  # TODO(priyag): Delete this once all strategies use global batch size.
+  @property
+  def _global_batch_size(self):
+    """`make_dataset_iterator` and `make_numpy_iterator` use global batch size.
+
+    `make_input_fn_iterator` assumes per-replica batching.
+
+    Returns:
+      Boolean.
+    """
+    return True
diff --git a/tensorflow/python/distribute/cross_device_ops.py b/tensorflow/python/distribute/cross_device_ops.py
index 57c552ca8f0abd36466932d800d9f1f802d9664c..afb7a35c0a29ed6e20816d2d062af8fb68b897cf 100644
--- a/tensorflow/python/distribute/cross_device_ops.py
+++ b/tensorflow/python/distribute/cross_device_ops.py
@@ -32,13 +32,15 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
+from tensorflow.tools.docs import doc_controls
 
 
 def check_destinations(destinations):
   """Checks whether `destinations` is not empty.
 
   Args:
-    destinations: a DistributedValues, Variable, string or a list of strings.
+    destinations: a `DistributedValues`, variable, or string object.
 
   Returns:
     Boolean which is True if `destinations` is not empty.
@@ -50,18 +52,48 @@ def check_destinations(destinations):
 
 
 def validate_destinations(destinations):
-  if not isinstance(
-      destinations,
-      (value_lib.DistributedValues, resource_variable_ops.ResourceVariable,
-       value_lib.AggregatingVariable, six.string_types, list, tuple)):
+  if not isinstance(destinations,
+                    (value_lib.DistributedValues,
+                     resource_variable_ops.ResourceVariable,
+                     value_lib.AggregatingVariable,
+                     six.string_types,
+                     value_lib.TPUMirroredVariable,
+                     # LogicalDeviceSpec is only used internally, e.g. as a
+                     # broadcast destination, never supplied by a user.
+                     value_lib.LogicalDeviceSpec)):
     raise ValueError("destinations must be one of a `DistributedValues` object,"
-                     " a tf.Variable object, a device string, a list or tuple "
-                     "of device strings")
+                     " a tf.Variable object, or a device string.")
 
   if not check_destinations(destinations):
     raise ValueError("destinations can not be empty")
 
 
+def reduce_non_distributed_value(reduce_op, device_map, value, destinations):
+  """Reduce a non-DistributedValue `value` to `destinations`."""
+  if isinstance(value, value_lib.DistributedValues):
+    raise ValueError("You are passing a `DistributedValue` to "
+                     "`reduce_non_distributed_value`, which is not allowed.")
+
+  # If the same value is present on all replicas then the PerReplica value will
+  # be a single value. We also handle the case when `value` is a single value
+  # and equal to 0.
+  if value == 0:
+    return 0
+  # If there is only a single value and the reduce op is MEAN,
+  # that value should be on all destinations.
+  if reduce_op == reduce_util.ReduceOp.MEAN:
+    return value
+
+  validate_destinations(destinations)
+  # We do not support a reduce op of SUM if the value is the same across
+  # all replicas. We call this as part of assign functions for MirroredVariables
+  # and summing up identical values across replicas is not clearly defined.
+  if device_map.num_replicas_in_graph != 1:
+    raise ValueError("A non-DistributedValues value %s cannot be reduced with "
+                     "the given reduce op %s." % (value, reduce_op))
+  return simple_broadcast(value, destinations)
+
+
 def _make_tensor_into_per_replica(input_tensor):
   """Converts a single tensor into a PerReplica object."""
   if isinstance(input_tensor, (tuple, list)):
@@ -77,12 +109,16 @@ def _make_tensor_into_per_replica(input_tensor):
     raise ValueError("Cannot convert `input_tensor` to a `PerReplica` object "
                      "because it doesn't have device set.")
 
-  return value_lib.PerReplica({device: input_tensor})
+  device_map = value_lib.SingleDeviceMap(device)
+  return value_lib.PerReplica(device_map, (input_tensor,))
 
 
 def _normalize_value_destination_pairs(value_destination_pairs):
   """Converts each tensor into a PerReplica object in the input list."""
   result = []
+
+  value_destination_pairs = list(value_destination_pairs)
+
   if not isinstance(value_destination_pairs, (list, tuple)):
     raise ValueError("`value_destination_pairs` should be a list or tuple")
   for pair in value_destination_pairs:
@@ -115,16 +151,24 @@ def _validate_value_destination_pairs(value_destination_pairs):
 # CrossDeviceOps.
 def get_devices_from(destinations):
   if isinstance(destinations, value_lib.DistributedValues):
-    return list(destinations.devices)
-  elif isinstance(destinations, (resource_variable_ops.ResourceVariable,
-                                 value_lib.AggregatingVariable)):
-    return [destinations.device]
+    return destinations.devices
+  elif isinstance(destinations, value_lib.LogicalDeviceSpec):
+    return destinations.device_map.logical_to_actual_devices(
+        destinations.logical_device)
   elif isinstance(destinations, six.string_types):
-    return [device_util.resolve(destinations)]
-  elif isinstance(destinations, (list, tuple)):
-    return [device_util.resolve(destination) for destination in destinations]
+    return (device_util.resolve(destinations),)
+  return (destinations.device,)
+
+
+def get_device_map_from(destinations):
+  if isinstance(destinations, (value_lib.DistributedValues,
+                               value_lib.LogicalDeviceSpec)):
+    return destinations.device_map, destinations.logical_device
+  if isinstance(destinations, six.string_types):
+    device = device_util.resolve(destinations)
   else:
-    return [destinations.device]
+    device = destinations.device
+  return value_lib.SingleDeviceMap(device), 0
 
 
 def _devices_match(left, right):
@@ -140,25 +184,29 @@ def _all_devices_match(value_destination_pairs):
   return True
 
 
-def _simple_broadcast(value, destinations):
-  index = {}
-  devices = get_devices_from(destinations)
-  for d in devices:
-    index[d] = cross_device_utils.copy_tensor_or_indexed_slices_to_device(
-        value, d)
-  return value_lib.Mirrored(index)
+def simple_broadcast(value, destinations, always_mirrored=False):
+  """Broadcast `value` to `destinations` using simple copies."""
+  device_map, logical_device = get_device_map_from(destinations)
+  devices = device_map.logical_to_actual_devices(logical_device)
+  if len(devices) == 1 and not always_mirrored:
+    return cross_device_utils.copy_tensor_or_indexed_slices_to_device(
+        value, devices[0])
+  else:
+    value_updates = []
+    for d in devices:
+      value_updates.append(
+          cross_device_utils.copy_tensor_or_indexed_slices_to_device(
+              value, d))
+    return value_lib.Mirrored(device_map, value_updates, logical_device)
 
 
 def _simple_reduce(per_replica_value, reduce_to_device, accumulation_fn,
                    reduce_op):
   # pylint: disable=g-missing-docstring
-  all_values = []
-  count = 0
-  for v in per_replica_value._index.values():  # pylint: disable=protected-access
-    count += 1
-    all_values.append(v)
+  all_values = per_replica_value.values
   if not all_values:
     raise ValueError("`per_replica_value` must be non-empty")
+  count = len(all_values)
 
   with ops.device(reduce_to_device):
     with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
@@ -172,6 +220,7 @@ def _simple_reduce(per_replica_value, reduce_to_device, accumulation_fn,
   return reduced
 
 
+@tf_export("distribute.CrossDeviceOps")
 class CrossDeviceOps(object):
   """Base class for cross-device reduction and broadcasting algorithms."""
 
@@ -194,13 +243,15 @@ class CrossDeviceOps(object):
       a Mirrored object.
 
     Raises:
-      ValueError: if per_replica_value is not a PerReplica object.
+      ValueError: if per_replica_value can't be converted to a PerReplica
+        object.
     """
     if not isinstance(per_replica_value, value_lib.PerReplica):
       per_replica_value = _make_tensor_into_per_replica(per_replica_value)
 
     validate_destinations(destinations)
-    return self._reduce(reduce_op, per_replica_value, destinations)
+    return self.reduce_implementation(reduce_op, per_replica_value,
+                                      destinations)
 
   def batch_reduce(self, reduce_op, value_destination_pairs):
     """Reduce PerReplica objects in a batch.
@@ -221,6 +272,8 @@ class CrossDeviceOps(object):
       ValueError: if `value_destination_pairs` is not a list or a tuple of
         tuples of PerReplica objects and destinations
     """
+    # TODO(yuefengz): if destinations are different, split into several
+    # `_batch_reduce` invocations.
     if not _validate_value_destination_pairs(value_destination_pairs):
       # If the first element of each pair is a tensor, we try to turn it into a
       # PerReplica object.
@@ -230,7 +283,7 @@ class CrossDeviceOps(object):
     for _, d in value_destination_pairs:
       validate_destinations(d)
 
-    return self._batch_reduce(reduce_op, value_destination_pairs)
+    return self.batch_reduce_implementation(reduce_op, value_destination_pairs)
 
   def broadcast(self, tensor, destinations):
     """Broadcast the `tensor` to destinations.
@@ -243,51 +296,104 @@ class CrossDeviceOps(object):
       a Mirrored object.
     """
     validate_destinations(destinations)
-    return self._broadcast(tensor, destinations)
+    return self.broadcast_implementation(tensor, destinations)
+
+  @doc_controls.for_subclass_implementers
+  def reduce_implementation(self, reduce_op, per_replica_value, destinations):
+    """The implementation of reduce of `per_replica_value` to `destinations`.
+
+    It runs the reduction operation defined by `reduce_op` and put the
+    result on `destinations`.
 
-  def _reduce(self, reduce_op, per_replica_value, destinations):
+    Args:
+      reduce_op: Indicates how per_replica_value will be reduced. Accepted
+        values are `tf.distribute.ReduceOp.SUM`, `tf.distribute.ReduceOp.MEAN`.
+      per_replica_value: a PerReplica object or a tensor with device set.
+      destinations: the reduction destinations.
+
+    Returns:
+      a Mirrored object.
+
+    Raises:
+      ValueError: if per_replica_value can't be converted to a PerReplica
+        object.
+    """
     raise NotImplementedError(
         "_reduce method must be implemented in descendants.")
 
-  def _batch_reduce(self, reduce_op, value_destination_pairs):
+  @doc_controls.for_subclass_implementers
+  def batch_reduce_implementation(self, reduce_op, value_destination_pairs):
+    """Implementation of reduce PerReplica objects in a batch.
+
+    Reduce each first element in `value_destination_pairs` to each second
+    element which indicates the destinations.
+
+    Args:
+      reduce_op: Indicates how per_replica_value will be reduced. Accepted
+        values are `tf.distribute.ReduceOp.SUM`, `tf.distribute.ReduceOp.MEAN`.
+      value_destination_pairs: a list or a tuple of tuples of PerReplica objects
+        (or tensors with device set if there is one device) and destinations.
+
+    Returns:
+      a list of Mirrored objects.
+
+    Raises:
+      ValueError: if `value_destination_pairs` is not a list or a tuple of
+        tuples of PerReplica objects and destinations
+    """
     raise NotImplementedError(
         "_batch_reduce method must be implemented in descendants.")
 
-  def _broadcast(self, tensor, destinations):
-    return _simple_broadcast(tensor, destinations)
+  @doc_controls.for_subclass_implementers
+  def broadcast_implementation(self, tensor, destinations):
+    """Implementation of broadcast the `tensor` to destinations.
+
+    Args:
+      tensor: the tensor to broadcast.
+      destinations: the broadcast destinations.
+
+    Returns:
+      a Mirrored object.
+    """
+    return simple_broadcast(tensor, destinations, always_mirrored=True)
 
 
-class ReductionToOneDeviceCrossDeviceOps(CrossDeviceOps):
+@tf_export("distribute.ReductionToOneDevice")
+class ReductionToOneDevice(CrossDeviceOps):
   """Always do reduction to one device first and then do broadcasting.
 
     Batch reduction is done by reduction on each element one by one.
   """
 
-  def __init__(self, reduce_to_device=None, accumulation_fn=math_ops.add_n):
+  def __init__(self, reduce_to_device=None, accumulation_fn=None):
     """Constructor.
 
     Args:
       reduce_to_device: the intermediate device to reduce to. If None, reduce
         to the first device in `destinations` of the reduce() method.
-      accumulation_fn: a function that does accumulation.
+      accumulation_fn: a function that does accumulation.  If None, then
+        `tf.math.add_n` is used.
     """
     self.reduce_to_device = reduce_to_device
-    self.accumulation_fn = accumulation_fn
-    super(ReductionToOneDeviceCrossDeviceOps, self).__init__()
+    self.accumulation_fn = accumulation_fn or math_ops.add_n
+    super(ReductionToOneDevice, self).__init__()
 
-  def _reduce(self, reduce_op, per_replica_value, destinations):
+  def reduce_implementation(self, reduce_op, per_replica_value, destinations):
     if check_destinations(destinations):
       devices = get_devices_from(destinations)
     else:
       devices = get_devices_from(per_replica_value)
     reduce_to_device = self.reduce_to_device or devices[0]
+    logging.log_first_n(
+        logging.INFO,
+        "Reduce to %s then broadcast to %r." % (reduce_to_device, devices), 10)
     reduced = _simple_reduce(per_replica_value, reduce_to_device,
                              self.accumulation_fn, reduce_op)
-    return self.broadcast(reduced, devices)
+    return self.broadcast(reduced, destinations)
 
-  def _batch_reduce(self, reduce_op, value_destination_pairs):
+  def batch_reduce_implementation(self, reduce_op, value_destination_pairs):
     return [
-        self._reduce(reduce_op, t, destinations=v)
+        self.reduce_implementation(reduce_op, t, destinations=v)
         for t, v in value_destination_pairs
     ]
 
@@ -314,7 +420,7 @@ def _group_value_by_device(per_replica_values):
   grouped = [[] for _ in range(len(destinations))]
   for per_replica_value in per_replica_values:
     # pylint: disable=protected-access
-    for i, v in enumerate(per_replica_value._index.values()):
+    for i, v in enumerate(per_replica_value.values):
       assert per_replica_value.devices == destinations
       grouped[i].append((v, None))
   return grouped
@@ -333,7 +439,7 @@ def _ungroup_and_make_mirrored(grouped_reduced,
     grouped_reduced: a list of lists, each sublist has components for each
       device, paired with a None. It is the result from
       cross_device_utils.aggregate_gradients_using*.
-    destinations: a list of device strings for returned Mirrored objects.
+    destinations: a value to colocate the result with.
     reduce_op: Indicates how values will be aggregated. Accepted values
       are `tf.distribute.ReduceOp.SUM`, `tf.distribute.ReduceOp.MEAN`.
     num_between_graph_workers: number of workers in the between-graph
@@ -342,22 +448,23 @@ def _ungroup_and_make_mirrored(grouped_reduced,
   Returns:
     a list of Mirrored objects.
   """
-  index = [{} for _ in range(len(grouped_reduced[0]))]
-  for d, per_replica_reduced in enumerate(grouped_reduced):
+  device_map, logical_device = get_device_map_from(destinations)
+  num_replicas = device_map.num_replicas_in_graph * num_between_graph_workers
+  index = [[] for _ in range(len(grouped_reduced[0]))]
+  for per_replica_reduced in grouped_reduced:
     for i, (v, _) in enumerate(per_replica_reduced):
       if reduce_op == reduce_util.ReduceOp.MEAN:
-        index[i][destinations[d]] = v / (
-            len(destinations) * num_between_graph_workers)
+        index[i].append(v / num_replicas)
       else:
-        index[i][destinations[d]] = v
-  return [value_lib.Mirrored(v) for v in index]
+        index[i].append(v)
+  return [value_lib.Mirrored(device_map, v, logical_device) for v in index]
 
 
-class ConcatAndSplitPacker(object):
+class _ConcatAndSplitPacker(object):
   """Concatenate and split tensors for reduction."""
 
   def __init__(self, num_packs=1):
-    """Initialize the ConcatAndSplitPacker object.
+    """Initialize the _ConcatAndSplitPacker object.
 
     Args:
       num_packs: specifies the number of split packs that will be
@@ -457,13 +564,13 @@ class ConcatAndSplitPacker(object):
     return aggregated_device_grads
 
 
-class AggregateSmallTensorPacker(object):
+class _AggregateSmallTensorPacker(object):
   """Concatenate small gradient tensors together for reduction."""
 
   def __init__(self,
                agg_small_grads_max_bytes=1048576,
                agg_small_grads_max_group=16):
-    """Initialize the AggregateSmallTensorPacker object.
+    """Initialize the _AggregateSmallTensorPacker object.
 
     Args:
       agg_small_grads_max_bytes: largest tensor eligible for aggregation,
@@ -503,11 +610,11 @@ def _pack_tensors(device_grads,
                   agg_small_grads_max_group=0):
   """Pack tensors if specified."""
   if num_packs > 0:
-    tensor_packer = ConcatAndSplitPacker(num_packs)
+    tensor_packer = _ConcatAndSplitPacker(num_packs)
     device_grad_packs = tensor_packer.pack(device_grads)
   elif agg_small_grads_max_bytes > 0 and agg_small_grads_max_group > 0:
-    tensor_packer = AggregateSmallTensorPacker(agg_small_grads_max_bytes,
-                                               agg_small_grads_max_group)
+    tensor_packer = _AggregateSmallTensorPacker(agg_small_grads_max_bytes,
+                                                agg_small_grads_max_group)
     device_grad_packs = tensor_packer.pack(device_grads)
   else:
     tensor_packer = None
@@ -523,7 +630,7 @@ def _unpack_tensors(reduced, tensor_packer=None):
 
 
 class AllReduceCrossDeviceOps(CrossDeviceOps):
-  """Reduction using all reduce."""
+  """Reduction using all-reduce."""
 
   def __init__(self,
                all_reduce_alg="nccl",
@@ -548,37 +655,22 @@ class AllReduceCrossDeviceOps(CrossDeviceOps):
       num_packs: see above.
       agg_small_grads_max_bytes: see above.
       agg_small_grads_max_group: see above.
-        tensors.
     """
     self._all_reduce_alg = all_reduce_alg
     self._num_packs = num_packs
     self._agg_small_grads_max_bytes = agg_small_grads_max_bytes
     self._agg_small_grads_max_group = agg_small_grads_max_group
+    self._simple_cross_replica_ops = ReductionToOneDevice()
     super(AllReduceCrossDeviceOps, self).__init__()
 
-  def _reduce(self, reduce_op, per_replica_value, destinations):
-    contains_indexed_slices = cross_device_utils.contains_indexed_slices(
-        per_replica_value)
-    if (_devices_match(per_replica_value, destinations)
-        and not context.executing_eagerly()
-        and not contains_indexed_slices):
+  def reduce_implementation(self, reduce_op, per_replica_value, destinations):
+    if _devices_match(per_replica_value, destinations):
       return self._batch_all_reduce(reduce_op, [per_replica_value])[0]
     else:
-      if contains_indexed_slices:
-        logging.log_first_n(
-            logging.WARN,
-            "Efficient allreduce is not supported for IndexedSlices.", 10)
+      return self._simple_cross_replica_ops.reduce(reduce_op, per_replica_value,
+                                                   destinations)
 
-      if check_destinations(destinations):
-        devices = get_devices_from(destinations)
-      else:
-        devices = get_devices_from(per_replica_value)
-      reduce_to_device = devices[0]
-      reduced = _simple_reduce(per_replica_value, reduce_to_device,
-                               math_ops.add_n, reduce_op)
-      return self.broadcast(reduced, devices)
-
-  def _batch_reduce(self, reduce_op, value_destination_pairs):
+  def batch_reduce_implementation(self, reduce_op, value_destination_pairs):
     all_devices_match = _all_devices_match(value_destination_pairs)
     contains_indexed_slices = cross_device_utils.contains_indexed_slices(
         value_destination_pairs)
@@ -594,20 +686,37 @@ class AllReduceCrossDeviceOps(CrossDeviceOps):
                             10)
 
       return [
-          self._reduce(reduce_op, t, destinations=v)
+          self.reduce_implementation(reduce_op, t, destinations=v)
           for t, v in value_destination_pairs
       ]
 
   def _batch_all_reduce(self, reduce_op, per_replica_values):
-    """All reduce algorithm in a batch."""
+    """All-reduce algorithm in a batch."""
+    dense_values, dense_indices, sparse_values, sparse_indices = (
+        cross_device_utils.split_by_sparsity(per_replica_values))
+    if dense_values:
+      dense_results = self._do_batch_all_reduce(reduce_op, dense_values)
+    else:
+      dense_results = []
+    if sparse_values:
+      sparse_results = self._do_batch_all_reduce_sparse(reduce_op,
+                                                        sparse_values)
+    else:
+      sparse_results = []
+    return cross_device_utils.stitch_values(((dense_results, dense_indices),
+                                             (sparse_results, sparse_indices)))
+
+  def _do_batch_all_reduce(self, reduce_op, dense_values):
+    """Run batch all-reduces."""
     logging.log_first_n(
         logging.INFO, "batch_all_reduce invoked for batches size = %d with "
         "algorithm = %s, num_packs = %d, agg_small_grads_max_bytes = %d and "
         "agg_small_grads_max_group = %d" %
-        (len(per_replica_values), self._all_reduce_alg, self._num_packs,
+        (len(dense_values), self._all_reduce_alg, self._num_packs,
          self._agg_small_grads_max_bytes, self._agg_small_grads_max_group), 10)
-    destinations = per_replica_values[0].devices
-    grouped = _group_value_by_device(per_replica_values)
+
+    destinations = dense_values[0].devices
+    grouped = _group_value_by_device(dense_values)
 
     device_grad_packs, tensor_packer = _pack_tensors(
         grouped, self._num_packs, self._agg_small_grads_max_bytes,
@@ -628,8 +737,18 @@ class AllReduceCrossDeviceOps(CrossDeviceOps):
               destinations, device_grad_packs))
 
     reduced = _unpack_tensors(reduced, tensor_packer)
-    return _ungroup_and_make_mirrored(reduced, per_replica_values[0].devices,
-                                      reduce_op)
+    return _ungroup_and_make_mirrored(reduced, dense_values[0], reduce_op)
+
+  def _do_batch_all_reduce_sparse(self, reduce_op, sparse_values):
+    """Run batch all-reduce for sparse values."""
+    logging.log_first_n(
+        logging.WARN,
+        "Efficient allreduce is not supported for %d IndexedSlices" %
+        len(sparse_values), 10)
+    # Use `sparse_values` as destinations to do all-reduces. It is effectively
+    # an allgather under the hood but not an efficient one.
+    return self._simple_cross_replica_ops.batch_reduce(
+        reduce_op, zip(sparse_values, sparse_values))
 
 
 # For compatibility with code using the old name of `AllReduceCrossDeviceOps`.
@@ -640,6 +759,49 @@ AllReduceSpecTuple = collections.namedtuple("AllReduceSpecTuple",
                                             "alg shards limit")
 
 
+@tf_export("distribute.NcclAllReduce")
+class NcclAllReduce(AllReduceCrossDeviceOps):
+  """Reduction using NCCL all-reduce."""
+
+  def __init__(self, num_packs=1):
+    """NCCL all-reduce implementation of CrossDeviceOps.
+
+    Before performing all-reduce, tensors will be repacked or aggregated for
+    more efficient cross-device transportation.
+
+    Args:
+      num_packs: values will be packed in this many splits.  `num_packs` should
+        be greater than 0.
+    """
+    assert num_packs > 0, (
+        "NCLL all-reduce requires num_packs > 0, but {} is specified".format(
+            num_packs))
+    super(NcclAllReduce, self).__init__(
+        all_reduce_alg="nccl", num_packs=num_packs)
+
+
+@tf_export("distribute.HierarchicalCopyAllReduce")
+class HierarchicalCopyAllReduce(AllReduceCrossDeviceOps):
+  """Reduction using hierarchical copy all-reduce.
+
+  This is a good reduction for configurations like Nvidia DGX-1.
+  """
+
+  def __init__(self, num_packs=1):
+    """Hierarchical copy all-reduce implementation of CrossDeviceOps.
+
+    Before performing all-reduce, tensors will be repacked or aggregated for
+    more efficient cross-device transportation.
+
+    Args:
+      num_packs: values will be packed in this many splits.  `num_packs` should
+        be greater than 0.
+    """
+    super(HierarchicalCopyAllReduce, self).__init__(
+        all_reduce_alg="hierarchical_copy",
+        num_packs=num_packs)
+
+
 class MultiWorkerAllReduce(AllReduceCrossDeviceOps):
   """All-reduce algorithms for distributed TensorFlow."""
 
@@ -713,7 +875,7 @@ class MultiWorkerAllReduce(AllReduceCrossDeviceOps):
       ]
 
   def _batch_all_reduce(self, reduce_op, per_replica_values):
-    """All reduce algorithm in a batch."""
+    """All-reduce algorithm in a batch."""
     logging.log_first_n(
         logging.INFO,
         "distributed batch_all_reduce invoked for batches size = %d with "
@@ -722,10 +884,9 @@ class MultiWorkerAllReduce(AllReduceCrossDeviceOps):
         (len(per_replica_values), self._all_reduce_spec, self._num_packs,
          self._agg_small_grads_max_bytes, self._agg_small_grads_max_group), 10)
 
-    destinations = sorted(per_replica_values[0].devices)
     device_grads = _group_value_by_device(per_replica_values)
 
-    # The all reduce library requires fully defined shapes.
+    # The all-reduce library requires fully defined shapes.
     # TODO(yuefengz): when tensor sharding is not needed, static shapes are not
     # required as well.
     for device_grad in device_grads:
@@ -759,7 +920,7 @@ class MultiWorkerAllReduce(AllReduceCrossDeviceOps):
             aggregated_grads[i] += range_agg_grads[i]
     assert not remaining_grads
 
-    return _ungroup_and_make_mirrored(aggregated_grads, destinations,
+    return _ungroup_and_make_mirrored(aggregated_grads, per_replica_values[0],
                                       reduce_op)
 
 
@@ -794,37 +955,33 @@ class CollectiveAllReduce(CrossDeviceOps):
     super(CollectiveAllReduce, self).__init__()
 
   # TODO(yuefengz, tucker): is indexed slices supported by collective ops?
-  def _reduce(self, reduce_op, per_replica_value, destinations):
+  def reduce_implementation(self, reduce_op, per_replica_value, destinations):
     if cross_device_utils.contains_indexed_slices(per_replica_value):
       raise ValueError(
           "`IndexSlices` is not supported for Collective All-Reduce.")
-    if context.executing_eagerly():
-      raise ValueError(
-          "Eager execution is not supported for Collective All-Reduce")
 
     all_reduced = self._batch_all_reduce(reduce_op, [per_replica_value])[0]
-    if _devices_match(per_replica_value, destinations):
+    device_map, logical_device = get_device_map_from(destinations)
+    if (all_reduced.device_map is device_map and
+        all_reduced.logical_device == logical_device):
       return all_reduced
-    else:
-      index = {}
-      for d in get_devices_from(destinations):
-        # pylint: disable=protected-access
-        if d in all_reduced._index:
-          index[d] = all_reduced._index[d]
-        else:
-          with ops.control_dependencies(list(
-              all_reduced._index.values())), ops.device(d):
-            index[d] = array_ops.identity(list(all_reduced._index.values())[0])
+    devices = device_map.logical_to_actual_devices(logical_device)
+    index = []
+    for d in devices:
+      if d in all_reduced.devices:
+        index.append(all_reduced.get(d))
+      else:
+        # TODO(josh11b): Once we add support for model parallelism, get the
+        # copy from the corresponding replica instead of the primary.
+        with ops.control_dependencies(all_reduced.values), ops.device(d):
+          index.append(array_ops.identity(all_reduced.primary))
 
-      return value_lib.Mirrored(index)
+    return value_lib.Mirrored(device_map, index, logical_device)
 
-  def _batch_reduce(self, reduce_op, value_destination_pairs):
+  def batch_reduce_implementation(self, reduce_op, value_destination_pairs):
     if cross_device_utils.contains_indexed_slices(value_destination_pairs):
       raise ValueError(
           "`IndexSlices` is not supported for Collective All-Reduce.")
-    if context.executing_eagerly():
-      raise ValueError(
-          "Eager execution is not supported for Collective All-Reduce")
 
     all_devices_match = _all_devices_match(value_destination_pairs)
     if all_devices_match:
@@ -837,15 +994,12 @@ class CollectiveAllReduce(CrossDeviceOps):
             "destinations are different.", 10)
 
       return [
-          self._reduce(reduce_op, t, destinations=v)
+          self.reduce_implementation(reduce_op, t, destinations=v)
           for t, v in value_destination_pairs
       ]
 
   def _batch_all_reduce(self, reduce_op, per_replica_values):
     """All-reduce across all workers in a batch."""
-    if context.executing_eagerly():
-      raise ValueError(
-          "Eager execution with collective ops is not supported yet.")
 
     logging.log_first_n(
         logging.INFO, "Collective All-reduce invoked with batches size = %d, "
@@ -881,7 +1035,7 @@ class CollectiveAllReduce(CrossDeviceOps):
     new_device_grads = [list(x) for x in zip(*reduced_gv_list)]
     return _ungroup_and_make_mirrored(
         new_device_grads,
-        per_replica_values[0].devices,
+        per_replica_values[0],
         reduce_op,
         num_between_graph_workers=self._num_workers)
 
@@ -893,7 +1047,7 @@ _dgx1_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7],
 def _has_dgx1_like_links(gpu_links):
   if not gpu_links:
     return False
-  # TODO(yuefengz): figure out the right topology for hierarchial copy if
+  # TODO(yuefengz): figure out the right topology for hierarchical copy if
   # number of gpus are less than 8.
   if len(gpu_links) < 8:
     return False
@@ -906,13 +1060,9 @@ def _has_dgx1_like_links(gpu_links):
 
 def _choose_all_reduce_algorithm(device_links):
   if _has_dgx1_like_links(device_links):
-    logging.info("Configured hierarchical_copy with num_packs=%d",
-                 len(device_links))
-    return AllReduceCrossDeviceOps(
-        "hierarchical_copy", num_packs=len(device_links))
+    return HierarchicalCopyAllReduce(num_packs=len(device_links))
   else:
-    logging.info("Configured nccl all-reduce.")
-    return AllReduceCrossDeviceOps("nccl", num_packs=1)
+    return NcclAllReduce(num_packs=1)
 
 
 def choose_the_best(devices, session_config=None):
@@ -939,12 +1089,12 @@ def choose_the_best(devices, session_config=None):
   if len(using_devices) != len(requested_devices):
     logging.warning("Not all devices in `tf.distribute.Strategy` are visible "
                     "to TensorFlow.")
-    return ReductionToOneDeviceCrossDeviceOps()
+    return ReductionToOneDevice()
 
   if any(d.device_type.lower() != "gpu" for d in using_devices):
     logging.warning("Not all devices in `tf.distribute.Strategy` are visible "
                     "to TensorFlow.")
-    return ReductionToOneDeviceCrossDeviceOps()
+    return ReductionToOneDevice()
 
   device_links = [[] for _ in range(len(using_devices))]
   for i, device in enumerate(using_devices):
diff --git a/tensorflow/python/distribute/cross_device_utils.py b/tensorflow/python/distribute/cross_device_utils.py
index 0faadd7e0cfe69bf8c80399574dd67be53ebcfe0..612a958ebba3c989c6f873a978b889061cdbe1b6 100644
--- a/tensorflow/python/distribute/cross_device_utils.py
+++ b/tensorflow/python/distribute/cross_device_utils.py
@@ -23,12 +23,14 @@ import threading
 
 from tensorflow.python.distribute import all_reduce
 from tensorflow.python.distribute import values as value_lib
+from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import collective_ops
-from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import gradients_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nccl_ops
 
@@ -348,20 +350,30 @@ def build_collective_reduce(input_tensors,
   """
   group_size = len(input_tensors) * num_workers
   if group_size < 2:
-    raise ValueError('num_workers * len(input_tensors) must be 2 or greater')
+    return input_tensors
   devices = [t.device for t in input_tensors]
   num_devices = len(devices)
   group_key = collective_keys.get_group_key(devices)
   instance_key = collective_keys.get_instance_key()
-  out_tensors = []
   subdiv_offsets = [0]  # TODO(tucker): maybe support non-default subdiv spec
-  for d in range(num_devices):
-    with ops.device(devices[d]):
-      reduce_op = collective_ops.all_reduce(
-          input_tensors[d], group_size, group_key, instance_key, reduction_op,
-          unary_op, subdiv_offsets)
-      out_tensors.append(reduce_op)
-  return out_tensors
+
+  def collective_all_reduce():
+    """Call collective allreduce."""
+    assert not context.executing_eagerly()
+    out_tensors = []
+    for d in range(num_devices):
+      with ops.device(devices[d]):
+        reduce_op = collective_ops.all_reduce(
+            input_tensors[d], group_size, group_key, instance_key, reduction_op,
+            unary_op, subdiv_offsets)
+        out_tensors.append(reduce_op)
+    return out_tensors
+
+  if context.executing_eagerly():
+    # Collective ops will block unless they are executed concurrently such as in
+    # a graph or a defun.
+    collective_all_reduce = def_function.function(collective_all_reduce)
+  return collective_all_reduce()
 
 
 def sum_grad_and_var_all_reduce(grad_and_vars,
@@ -633,14 +645,14 @@ def unpack_small_tensors(replica_grads, packing):
 def aggregate_tensors_or_indexed_slices(values, accumulation_fn=math_ops.add_n):
   """Aggregate tensors using `accumulation_fn` and IndexedSlices via concat."""
   if any(isinstance(v, ops.IndexedSlices) for v in values):
-    return gradients_impl._AggregateIndexedSlicesGradients(values)  # pylint: disable=protected-access
+    return gradients_util._AggregateIndexedSlicesGradients(values)  # pylint: disable=protected-access
   else:
     return accumulation_fn(values)
 
 
 def divide_by_n_tensors_or_indexed_slices(value, n):
   if isinstance(value, ops.IndexedSlices):
-    value = gradients_impl._HandleNestedIndexedSlices(value)  # pylint: disable=protected-access
+    value = gradients_util._HandleNestedIndexedSlices(value)  # pylint: disable=protected-access
     return ops.IndexedSlices(
         value.values / n, value.indices, value.dense_shape)
   else:
@@ -666,6 +678,61 @@ def contains_indexed_slices(value):
   elif isinstance(value, (list, tuple)) and value:
     return any(contains_indexed_slices(v) for v in value)
   elif isinstance(value, value_lib.DistributedValues):
-    return contains_indexed_slices(list(value._index.values()))  # pylint: disable=protected-access
+    return contains_indexed_slices(value.values)
   else:
     return False
+
+
+def is_indexed_slices(value):
+  if isinstance(value, ops.IndexedSlices):
+    return True
+  assert isinstance(value, value_lib.DistributedValues)
+  return all([isinstance(v, ops.IndexedSlices) for v in value.values])
+
+
+def split_by_sparsity(values):
+  """Split values into dense and sparse values.
+
+  Args:
+    values: a list of tensors or `PerReplica`s.
+
+  Returns:
+    Four lists:
+      a list of dense values, a list of their indices in `values` and
+      a list of sparse values, a list of their indices in `values`.
+  """
+  dense_values = []
+  dense_indices = []
+  sparse_values = []
+  sparse_indices = []
+  for i, v in enumerate(values):
+    if is_indexed_slices(v):
+      sparse_values.append(v)
+      sparse_indices.append(i)
+    else:
+      dense_values.append(v)
+      dense_indices.append(i)
+  return dense_values, dense_indices, sparse_values, sparse_indices
+
+
+def stitch_values(values_and_indices_list):
+  """Stitch values together according to their indices.
+
+  Args:
+    values_and_indices_list: a list of tuples of values and indices indicating
+      the values and postions in the returned list.
+
+  Returns:
+    a stitched list of values.
+  """
+  length = 0
+  for values_and_indices in values_and_indices_list:
+    length += len(values_and_indices[0])
+
+  result = [None] * length
+  for values_and_indices in values_and_indices_list:
+    if values_and_indices and values_and_indices[0]:
+      for v, i in zip(*values_and_indices):
+        assert result[i] is None
+        result[i] = v
+  return result
diff --git a/tensorflow/python/distribute/distribute_coordinator.py b/tensorflow/python/distribute/distribute_coordinator.py
index c0f9b8a1fdfdf8bd95375f489058cadcd63c9cb9..eb3fd1d82e98c0b1cf6743bdb277f65d6590d4cd 100644
--- a/tensorflow/python/distribute/distribute_coordinator.py
+++ b/tensorflow/python/distribute/distribute_coordinator.py
@@ -29,10 +29,14 @@ from tensorflow.python.client import session
 from tensorflow.python.distribute import distribute_coordinator_context
 from tensorflow.python.distribute import multi_worker_util
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import coordinator
 from tensorflow.python.training import monitored_session
 from tensorflow.python.training import server_lib
 
 
+_thread_local = threading.local()
+
+
 class _TaskType(object):
   PS = "ps"
   WORKER = "worker"
@@ -76,8 +80,6 @@ class _Barrier(object):
 
   def wait(self):
     """Waits until all other callers reach the same wait call."""
-    if not hasattr(self._local_sense, "value"):
-      self._local_sense.value = False
     self._local_sense.value = not self._flag
     with self._lock:
       self._counter += 1
@@ -209,8 +211,8 @@ class _WorkerContext(object):
       ValueError: if `worker_barrier` is not passed to the __init__ method.
     """
     if not self._worker_barrier:
-      raise ValueError("`worker_barrier is not set in the worker context.` \t" +
-                       self._debug_message())
+      # TODO(yuefengz): we should throw an error in independent worker mode.
+      return
     self._worker_barrier.wait()
 
   def session_creator(self,
@@ -328,7 +330,8 @@ def _run_single_worker(worker_fn,
                        task_id,
                        session_config,
                        rpc_layer="",
-                       worker_barrier=None):
+                       worker_barrier=None,
+                       coord=None):
   """Runs a single worker by calling `worker_fn` under context."""
   session_config = copy.deepcopy(session_config)
   strategy = copy.deepcopy(strategy)
@@ -350,7 +353,11 @@ def _run_single_worker(worker_fn,
       rpc_layer=rpc_layer,
       worker_barrier=worker_barrier)
   with context:
-    return worker_fn(strategy)
+    if coord:
+      with coord.stop_on_exception():
+        return worker_fn(strategy)
+    else:
+      return worker_fn(strategy)
 
 
 def _split_cluster_for_evaluator(cluster_spec, task_type):
@@ -379,6 +386,27 @@ def _run_std_server(cluster_spec=None,
                     rpc_layer=None,
                     environment=None):
   """Runs a standard server."""
+  # Check if the Server is already running. If so, assert that no configuration
+  # options have changed, and return the existing Server. This allows us to
+  # call `run_distribute_coordinator` multiple times.
+  if getattr(_thread_local, "server", None) is not None:
+    assert _thread_local.cluster_spec == cluster_spec
+    assert _thread_local.task_type == task_type
+    assert _thread_local.task_id == task_id
+    assert _thread_local.session_config_str == repr(session_config)
+    assert _thread_local.rpc_layer == rpc_layer
+    assert _thread_local.environment == environment
+    return _thread_local.server
+  else:
+    # This method is not thread-safe.
+    _thread_local.server_started = True
+    _thread_local.cluster_spec = cluster_spec
+    _thread_local.task_type = task_type
+    _thread_local.task_id = task_id
+    _thread_local.session_config_str = repr(session_config)
+    _thread_local.rpc_layer = rpc_layer
+    _thread_local.environment = environment
+
   assert cluster_spec
   target = cluster_spec.task_address(task_type, task_id)
   if rpc_layer:
@@ -400,8 +428,6 @@ def _run_std_server(cluster_spec=None,
 
   if environment == "google":
     server = _FakeServer()
-    server.start()
-    return server
   else:
     if session_config:
       logging.info(
@@ -416,13 +442,16 @@ def _run_std_server(cluster_spec=None,
         task_index=task_id,
         config=session_config,
         protocol=rpc_layer)
-    server.start()
-    return server
+
+  server.start()
+  _thread_local.server = server
+  return server
 
 
 def _run_between_graph_client(worker_fn, strategy, eval_fn, eval_strategy,
                               cluster_spec, session_config, rpc_layer):
   """Runs a standalone client for between-graph replication."""
+  coord = coordinator.Coordinator()
   eval_thread = None
   if _TaskType.EVALUATOR in cluster_spec.jobs:
     eval_thread = threading.Thread(
@@ -431,6 +460,7 @@ def _run_between_graph_client(worker_fn, strategy, eval_fn, eval_strategy,
               session_config),
         kwargs={
             "rpc_layer": rpc_layer,
+            "coord": coord,
         })
     eval_thread.start()
 
@@ -444,18 +474,18 @@ def _run_between_graph_client(worker_fn, strategy, eval_fn, eval_strategy,
                 session_config),
           kwargs={
               "rpc_layer": rpc_layer,
-              "worker_barrier": worker_barrier
+              "worker_barrier": worker_barrier,
+              "coord": coord,
           })
       t.start()
       threads.append(t)
 
-  # TODO(yuefengz): wrap threads into thread coordinator?
-  for t in threads:
-    t.join()
-
-  # TODO(yuefengz): is it necessary to join eval thread?
   if eval_thread:
-    eval_thread.join()
+    # TODO(yuefengz): is it necessary to join eval thread?
+    threads_to_join = threads + [eval_thread]
+  else:
+    threads_to_join = threads
+  coord.join(threads_to_join)
 
   # TODO(yuefengz): we probably want to return results from all workers?
   return None
@@ -464,6 +494,7 @@ def _run_between_graph_client(worker_fn, strategy, eval_fn, eval_strategy,
 def _run_in_graph_client(worker_fn, strategy, eval_fn, eval_strategy,
                          cluster_spec, session_config, rpc_layer):
   """Runs a standalone client for in-graph replication."""
+  coord = coordinator.Coordinator()
   eval_thread = None
   if _TaskType.EVALUATOR in cluster_spec.jobs:
     eval_thread = threading.Thread(
@@ -472,6 +503,7 @@ def _run_in_graph_client(worker_fn, strategy, eval_fn, eval_strategy,
               session_config),
         kwargs={
             "rpc_layer": rpc_layer,
+            "coord": coord,
         })
     eval_thread.start()
 
@@ -482,9 +514,12 @@ def _run_in_graph_client(worker_fn, strategy, eval_fn, eval_strategy,
       None,
       None,
       session_config,
-      rpc_layer=rpc_layer)
+      rpc_layer=rpc_layer,
+      coord=coord)
+
   if eval_thread:
-    eval_thread.join()
+    coord.join([eval_thread])
+
   return worker_result
 
 
@@ -637,7 +672,7 @@ def run_distribute_coordinator(worker_fn,
   for a task. The distribute coordinator will make a copy of the `strategy`
   object, call its `configure` method and pass it to `worker_fn` as an argument.
 
-  The `worker_fn` defines the training logic and is called under a its own
+  The `worker_fn` defines the training logic and is called under its own
   worker context which can be accessed to via `get_current_worker_context`. A
   worker context provides access to configurations for each task, e.g. the
   task_type, task_id, master target and so on. Since `worker_fn` will be called
@@ -663,7 +698,7 @@ def run_distribute_coordinator(worker_fn,
   the worker context.
 
   The `cluster_spec` can be either passed by the argument or parsed from the
-  "TF_CONFIG" envrionment variable. Example of a TF_CONFIG:
+  "TF_CONFIG" environment variable. Example of a TF_CONFIG:
   ```
     cluster = {'chief': ['host0:2222'],
                'ps': ['host1:2222', 'host2:2222'],
@@ -678,19 +713,19 @@ def run_distribute_coordinator(worker_fn,
   will be created to call `eval_fn` with its `task_type` set to "evaluator". If
   `eval_fn` is not defined, fall back to `worker_fn`. This implies that
   evaluation will be done on a single machine if there is an "evaluator" task.
-  If "evaluator" doesn't exit in the cluster_spec, it entirely depends on the
+  If "evaluator" doesn't exist in the cluster_spec, it entirely depends on the
   `worker_fn` for how to do evaluation.
 
   Args:
     worker_fn: the function to be called. The function should accept a
       `strategy` object and will be given access to a context object via a
       context manager scope.
-    strategy: a DistributionStrategy object which specifying whether it should
+    strategy: a DistributionStrategy object specifying whether it should
       run between-graph replicated training or not, whether to run init ops,
       etc. This object will also be configured given `session_config`,
       `cluster_spec`, `task_type` and `task_id`.
     eval_fn: optional function for "evaluator" task. If `eval_fn` is not passed
-      in but a "evaluator" task found in the `cluster_spec`, the `worker_fn`
+      in but a "evaluator" task is found in the `cluster_spec`, the `worker_fn`
       will be used for this task.
     eval_strategy: optional DistributionStrategy object for "evaluator" task.
     mode: in which mode this distribute coordinator runs.
@@ -708,7 +743,8 @@ def run_distribute_coordinator(worker_fn,
 
   Returns:
     In the client job, return the value returned by `worker_fn` if
-    it is in-graph replication; return None otherwise.
+    it is in-graph replication or INDEPENDENT_WORKER mode; return None
+    otherwise.
   """
   tf_config = json.loads(os.environ.get("TF_CONFIG", "{}"))
   if not cluster_spec:
@@ -725,7 +761,7 @@ def run_distribute_coordinator(worker_fn,
   rpc_layer = tf_config.get("rpc_layer", rpc_layer)
   environment = tf_config.get("environment", None)
 
-  # Setting the session config is necessary for some strategies such
+  # Setting the session config is necessary for some strategies such as
   # CollectiveAllReduceStrategy.
   session_config = session_config or config_pb2.ConfigProto(
       allow_soft_placement=True)
@@ -802,23 +838,22 @@ def run_distribute_coordinator(worker_fn,
         session_config=session_config,
         rpc_layer=rpc_layer,
         environment=environment)
-
     if task_type in [_TaskType.CHIEF, _TaskType.WORKER]:
       if strategy.extended.experimental_between_graph:
         # All jobs run `worker_fn` if between-graph.
-        _run_single_worker(worker_fn, strategy, cluster_spec, task_type,
-                           task_id, session_config, rpc_layer)
+        return _run_single_worker(worker_fn, strategy, cluster_spec, task_type,
+                                  task_id, session_config, rpc_layer)
       else:
         # Only one node runs `worker_fn` if in-graph.
         context = _WorkerContext(strategy, cluster_spec, task_type, task_id)
         if context.is_chief:
-          _run_single_worker(worker_fn, strategy, cluster_spec, None, None,
-                             session_config, rpc_layer)
+          return _run_single_worker(worker_fn, strategy, cluster_spec, None,
+                                    None, session_config, rpc_layer)
         else:
           server.join()
     elif task_type == _TaskType.EVALUATOR:
-      _run_single_worker(eval_fn, eval_strategy, cluster_spec, task_type,
-                         task_id, session_config, rpc_layer)
+      return _run_single_worker(eval_fn, eval_strategy, cluster_spec, task_type,
+                                task_id, session_config, rpc_layer)
     else:
       if task_type != _TaskType.PS:
         raise ValueError("Unexpected task_type: %r" % task_type)
diff --git a/tensorflow/python/distribute/distribute_coordinator_test.py b/tensorflow/python/distribute/distribute_coordinator_test.py
index 7598c105c2dd763c524e50e139fdd9984f1bd0c0..22997169fc1477c9cbf8753d1e36f21101ff8fe8 100644
--- a/tensorflow/python/distribute/distribute_coordinator_test.py
+++ b/tensorflow/python/distribute/distribute_coordinator_test.py
@@ -230,7 +230,7 @@ class DistributeCoordinatorTestBase(test.TestCase):
       with ops.device("/job:worker/task:0"):
         result = math_ops.add_n(xs)
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       result_value = sess.run(result)
     self.assertEqual(result_value, expected)
     if result_value == expected:
@@ -278,7 +278,7 @@ class DistributeCoordinatorTestBase(test.TestCase):
       train_op = control_flow_ops.group([x_add, y_sub])
 
       if context.is_chief:
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
 
       # Synchronize workers after initializaton.
       if context.has_barrier:
@@ -427,7 +427,7 @@ class DistributeCoordinatorTestStandaloneMode(DistributeCoordinatorTestBase):
     # Each finished worker will increment self._result_correct.
     self.assertEqual(self._result_correct, NUM_WORKERS)
 
-  @test_util.run_v1_only("b/120545219")
+  @test_util.run_v1_only("MonitoredSession removed from v2")
   def testBetweenGraphWithMonitoredSession(self):
     """Test monitored session in standalone client mode."""
     distribute_coordinator.run_distribute_coordinator(
@@ -601,7 +601,7 @@ class DistributeCoordinatorTestInpendentWorkerMode(
     # Each finished worker will increment self._result_correct.
     self.assertEqual(self._result_correct, NUM_WORKERS)
 
-  @test_util.run_v1_only("b/120545219")
+  @test_util.run_v1_only("MonitoredSession removed from v2")
   def testBetweenGraphWithMonitoredSession(self):
     cluster_spec = self._create_cluster_spec(
         num_workers=NUM_WORKERS, num_ps=NUM_PS)
@@ -864,6 +864,9 @@ class StrategyConfigureTest(test.TestCase):
     cluster_spec = {"worker": ["localhost:0"]}
     tf_config = {"cluster": cluster_spec}
 
+    # Reset the saved Server state.
+    distribute_coordinator._thread_local = threading.local()  # pylint: disable=protected-access
+
     with test.mock.patch.dict("os.environ",
                               {"TF_CONFIG": json.dumps(tf_config)}):
       distribute_coordinator.run_distribute_coordinator(
diff --git a/tensorflow/python/distribute/distribute_lib.py b/tensorflow/python/distribute/distribute_lib.py
index 87bf510ec549f6bf1ccabfba438d2c64fd5a88d9..3e4836448737ffa87eff4d4c9e5c02b26d17b70c 100644
--- a/tensorflow/python/distribute/distribute_lib.py
+++ b/tensorflow/python/distribute/distribute_lib.py
@@ -26,6 +26,7 @@ import enum
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.distribute import device_util
 from tensorflow.python.distribute import distribution_strategy_context
+from tensorflow.python.distribute import numpy_dataset
 from tensorflow.python.distribute import reduce_util
 from tensorflow.python.eager import context as eager_context
 from tensorflow.python.framework import constant_op
@@ -33,9 +34,9 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import custom_gradient
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops.losses import losses_impl
 from tensorflow.python.platform import tf_logging
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
@@ -77,14 +78,14 @@ class UpdateContext(object):
 # Public utility functions.
 
 
-@tf_export("distribute.get_loss_reduction")
+@tf_export(v1=["distribute.get_loss_reduction"])
 def get_loss_reduction():
-  """`tf.distribute.ReduceOp` corresponding to the last loss reduction."""
-  loss_reduction = ops.get_default_graph()._last_loss_reduction  # pylint: disable=protected-access
-  if (loss_reduction == losses_impl.Reduction.SUM or
-      loss_reduction == losses_impl.ReductionV2.SUM):
-    return reduce_util.ReduceOp.SUM
-  return reduce_util.ReduceOp.MEAN
+  """DEPRECATED: Now always returns `tf.distribute.ReduceOp.SUM`.
+
+  We now always make the complete adjustment when computing the loss, so
+  code should always add gradients/losses across replicas, never average.
+  """
+  return reduce_util.ReduceOp.SUM
 
 
 # ------------------------------------------------------------------------------
@@ -99,7 +100,7 @@ def _require_cross_replica_context_extended(extended):
     return
   strategy = extended._container_strategy()  # pylint: disable=protected-access
   # We have an error to report, figure out the right message.
-  if context.distribution_strategy is not strategy:
+  if context.strategy is not strategy:
     _wrong_strategy_scope(strategy, context)
   assert cross_replica is None
   raise RuntimeError("Method requires being in cross-replica context, use "
@@ -108,14 +109,14 @@ def _require_cross_replica_context_extended(extended):
 
 def _wrong_strategy_scope(strategy, context):
   # Figure out the right error message.
-  if not distribution_strategy_context.has_distribution_strategy():
+  if not distribution_strategy_context.has_strategy():
     raise RuntimeError(
         'Need to be inside "with strategy.scope()" for %s' %
         (strategy,))
   else:
     raise RuntimeError(
         "Mixing different tf.distribute.Strategy objects: %s is not %s" %
-        (context.distribution_strategy, strategy))
+        (context.strategy, strategy))
 
 
 def require_replica_context(replica_ctx):
@@ -125,25 +126,25 @@ def require_replica_context(replica_ctx):
   # We have an error to report, figure out the right message.
   if context.replica_context is None:
     raise RuntimeError("Need to be inside `call_for_each_replica()`")
-  if context.distribution_strategy is replica_ctx.distribution_strategy:
+  if context.strategy is replica_ctx.strategy:
     # Two different ReplicaContexts with the same tf.distribute.Strategy.
     raise RuntimeError("Mismatching ReplicaContext.")
   raise RuntimeError(
       "Mismatching tf.distribute.Strategy objects: %s is not %s." %
-      (context.distribution_strategy, replica_ctx.distribution_strategy))
+      (context.strategy, replica_ctx.strategy))
 
 
-def _require_distribution_strategy_scope_strategy(strategy):
+def _require_strategy_scope_strategy(strategy):
   """Verify in a `strategy.scope()` in this thread."""
   context = _get_per_thread_mode()
-  if context.distribution_strategy is strategy: return
+  if context.strategy is strategy: return
   _wrong_strategy_scope(strategy, context)
 
 
-def _require_distribution_strategy_scope_extended(extended):
+def _require_strategy_scope_extended(extended):
   """Verify in a `distribution_strategy.scope()` in this thread."""
   context = _get_per_thread_mode()
-  if context.distribution_strategy.extended is extended: return
+  if context.strategy.extended is extended: return
   # Report error.
   strategy = extended._container_strategy()  # pylint: disable=protected-access
   _wrong_strategy_scope(strategy, context)
@@ -181,7 +182,7 @@ class _CurrentDistributionContext(object):
     self._var_creator_scope.__enter__()
     if self._device_scope:
       self._device_scope.__enter__()
-    return self._context.distribution_strategy
+    return self._context.strategy
 
   def __exit__(self, exception_type, exception_value, traceback):
     if self._device_scope:
@@ -196,10 +197,10 @@ class _SameScopeAgainContext(object):
   """Trivial context manager when you are already in `scope()`."""
 
   def __init__(self, strategy):
-    self._distribution_strategy = strategy
+    self._strategy = strategy
 
   def __enter__(self):
-    return self._distribution_strategy
+    return self._strategy
 
   def __exit__(self, exception_type, exception_value, traceback):
     del exception_type, exception_value, traceback
@@ -208,12 +209,14 @@ class _SameScopeAgainContext(object):
 # TODO(yuefengz): add more replication modes.
 @tf_export("distribute.InputReplicationMode")
 class InputReplicationMode(enum.Enum):
-  """Replication mode for input function."""
+  """Replication mode for input function.
 
-  # The input function will be called on each worker independently, creating as
-  # many input pipelines as number of workers. Replicas will dequeue from the
-  # local Dataset on their worker. Distribution Strategy doesn't manage any
-  # state sharing between such separate input pipelines.
+  * `PER_WORKER`: The input function will be called on each worker
+    independently, creating as many input pipelines as number of workers.
+    Replicas will dequeue from the local Dataset on their worker.
+    `tf.distribute.Strategy` doesn't manage any state sharing between such
+    separate input pipelines.
+  """
   PER_WORKER = "PER_WORKER"
 
 
@@ -324,47 +327,13 @@ class DistributionStrategy(object):
     """
     return self._extended._scope(self)  # pylint: disable=protected-access
 
-  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
-  def read_var(self, v):
-    """DEPRECATED: use extended.read_var() instead."""
-    return self._extended.read_var(v)
-
   @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
   def colocate_vars_with(self, colocate_with_variable):
     """DEPRECATED: use extended.colocate_vars_with() instead."""
     return self._extended.colocate_vars_with(colocate_with_variable)
 
-  @doc_controls.do_not_generate_docs  # DEPRECATED
-  def distribute_dataset(self, dataset_fn):
-    """Return a `dataset` split across all replicas.  DEPRECATED.
-
-    DEPRECATED: Please use `make_dataset_iterator` or
-    `make_input_fn_iterator` instead.
-
-    Suitable for providing input to `extended.call_for_each_replica()` by
-    creating an iterator:
-
-    ```
-    def dataset_fn():
-      return tf.data.Dataset.from_tensors([[1.]]).repeat()
-
-    with strategy.scope():
-      distributed_dataset = strategy.distribute_dataset(dataset_fn)
-      iterator = distributed_dataset.make_initializable_iterator()
-      replica_results = strategy.extended.call_for_each_replica(
-          replica_fn, args=(iterator.get_next(),))
-    ```
-
-    Args:
-      dataset_fn: A function that returns a `tf.data.Dataset`.
-
-    Returns:
-      A `PerReplicaDataset` that will produce data for each replica.
-    """
-    return self._extended._distribute_dataset(dataset_fn)  # pylint: disable=protected-access
-
   def make_dataset_iterator(self, dataset):
-    """Makes an iterator for input provided via input_dataset.
+    """Makes an iterator for input provided via `dataset`.
 
     Data from the given dataset will be distributed evenly across all the
     compute replicas. We will assume that the input dataset is batched by the
@@ -393,28 +362,36 @@ class DistributionStrategy(object):
     """Returns an iterator split across replicas created from an input function.
 
     The `input_fn` should take an `tf.distribute.InputContext` object where
-    information about input sharding can be accessed:
+    information about batching and input sharding can be accessed:
 
     ```
     def input_fn(input_context):
-      d = tf.data.Dataset.from_tensors([[1.]]).repeat()
+      batch_size = input_context.get_per_replica_batch_size(global_batch_size)
+      d = tf.data.Dataset.from_tensors([[1.]]).repeat().batch(batch_size)
       return d.shard(input_context.num_input_pipelines,
                      input_context.input_pipeline_id)
     with strategy.scope():
-      iterator = strategy.make_input_fn_iterator(
-          input_fn)
-      replica_results = strategy.extended.call_for_each_replica(
-          replica_fn, iterator.get_next())
+      iterator = strategy.make_input_fn_iterator(input_fn)
+      replica_results = strategy.experimental_run(replica_fn, iterator)
     ```
 
+    The `tf.data.Dataset` returned by `input_fn` should have a per-replica
+    batch size, which may be computed using
+    `input_context.get_per_replica_batch_size`.
+
     Args:
-      input_fn: A function that returns a `tf.data.Dataset`. This function is
-        expected to take an `tf.distribute.InputContext` object.
+      input_fn: A function taking a `tf.distribute.InputContext` object and
+        returning a `tf.data.Dataset`.
       replication_mode: an enum value of `tf.distribute.InputReplicationMode`.
-        Only `PER_WORKER` is supported currently.
+        Only `PER_WORKER` is supported currently, which means there will be
+        a single call to `input_fn` per worker. Replicas will dequeue from the
+        local `tf.data.Dataset` on their worker.
 
     Returns:
-      An iterator object that can be initialized and fetched next element.
+      An iterator object that should first be `.initialize()`-ed. It may then
+      either be passed to `strategy.experimental_run()` or you can
+      `iterator.get_next()` to get the next value to pass to
+      `strategy.extended.call_for_each_replica()`.
     """
     if replication_mode != InputReplicationMode.PER_WORKER:
       raise ValueError(
@@ -422,73 +399,81 @@ class DistributionStrategy(object):
     return self.extended._make_input_fn_iterator(  # pylint: disable=protected-access
         input_fn, replication_mode=replication_mode)
 
-  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
-  def broadcast(self, tensor, destinations=None):
-    """DEPRECATED: use extended.broadcast_to() instead."""
-    return self._extended.broadcast_to(tensor, destinations)
-
-  @doc_controls.do_not_generate_docs  # Use experimental_initialize() instead.
-  def initialize(self):
-    """DEPRECATED: Use `experimental_initialize()` instead."""
-    return self._extended._initialize()  # pylint: disable=protected-access
-
-  def experimental_initialize(self):
-    """Any initialization to be done before running any computations.
+  def experimental_make_numpy_iterator(
+      self, numpy_input, batch_size, num_epochs=1, shuffle=1024, session=None):
+    """Makes an iterator for input provided via a nest of numpy arrays.
 
-    In eager mode, it executes any initialization as a side effect.
-    In graph mode, it creates the initialization ops and returns them.
-
-    For example, TPU initialize_system ops.
+    Args:
+      numpy_input: A nest of NumPy input arrays that will be distributed evenly
+        across all replicas. Note that lists of Numpy arrays are stacked,
+        as that is normal `tf.data.Dataset` behavior.
+      batch_size: The number of entries from the array we should consume in one
+        step of the computation, across all replicas. This is the global batch
+        size. It should be divisible by `num_replicas_in_sync`.
+      num_epochs: The number of times to iterate through the examples. A value
+        of `None` means repeat forever.
+      shuffle: Size of buffer to use for shuffling the input examples.
+        Use `None` to disable shuffling.
+      session: (TensorFlow v1.x graph execution only) A session used for
+        initialization.
 
     Returns:
-      A list of ops to execute.
+      An `tf.distribute.InputIterator` which returns inputs for each step of the
+      computation.  User should call `initialize` on the returned iterator.
     """
-    return self._extended._initialize()  # pylint: disable=protected-access
+    ds = self.extended.experimental_make_numpy_dataset(
+        numpy_input, session=session)
+    if shuffle:
+      ds = ds.shuffle(shuffle)
+    if num_epochs != 1:
+      ds = ds.repeat(num_epochs)
+    # We need to use the drop_remainder argument to get a known static
+    # input shape which is required for TPUs.
+    drop_remainder = self.extended.experimental_require_static_shapes
+    ds = ds.batch(batch_size, drop_remainder=drop_remainder)
+    return self.make_dataset_iterator(ds)
+
+  def experimental_run(self, fn, input_iterator=None):
+    """Runs ops in `fn` on each replica, with inputs from `input_iterator`.
+
+    When eager execution is enabled, executes ops specified by `fn` on each
+    replica.  Otherwise, builds a graph to execute the ops on each replica.
+
+    Each replica will take a single, different input from the inputs provided by
+    one `get_next` call on the input iterator.
+
+    `fn` may call `tf.distribute.get_replica_context()` to access members such
+    as `replica_id_in_sync_group`.
+
+    IMPORTANT: Depending on the `DistributionStrategy` being used, and whether
+    eager execution is enabled, `fn` may be called one or more times (once for
+    each replica).
 
-  @doc_controls.do_not_generate_docs  # Use experimental_finalize() instead.
-  def finalize(self):
-    """DEPRECATED: Use `experimental_finalize()` instead."""
-    return self._extended._finalize()  # pylint: disable=protected-access
-
-  def experimental_finalize(self):
-    """Any final actions to be done at the end of all computations.
-
-    In eager mode, it executes any finalize actions as a side effect.
-    In graph mode, it creates the finalize ops and returns them.
-
-    For example, TPU shutdown ops.
+    Args:
+      fn: function to run. The inputs to the function must match the outputs of
+        `input_iterator.get_next()`. The output must be a `tf.nest` of
+        `Tensor`s.
+      input_iterator: (Optional) input iterator from which the inputs are taken.
 
     Returns:
-      A list of ops to execute.
+      Merged return value of `fn` across replicas. The structure of the return
+      value is the same as the return value from `fn`. Each element in the
+      structure can either be `PerReplica` (if the values are unsynchronized),
+      `Mirrored` (if the values are kept in sync), or `Tensor` (if running on a
+      single replica).
     """
-    return self._extended._finalize()  # pylint: disable=protected-access
-
-  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
-  def run_steps_on_dataset(self, fn, iterator, iterations=1,
-                           initial_loop_values=None):
-    """DEPRECATED: use extended.experimental_run_steps_on_iterator() instead."""
-    return self._extended.experimental_run_steps_on_iterator(
-        fn, iterator, iterations, initial_loop_values)
+    with self.scope():
+      if input_iterator is None:
+        return self._extended.call_for_each_replica(fn)
+      else:
+        inputs = input_iterator.get_next()
+        return self._extended.call_for_each_replica(fn, args=(inputs,))
 
-  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
-  def call_for_each_replica(self, fn, *args, **kwargs):
-    """DEPRECATED: use extended.call_for_each_replica() instead."""
-    # Handle old *args, **kwargs, and new args=(...), kwargs={...}, to
-    # allow transition.
-    a = kwargs.pop("args", None)
-    if a is not None:
-      if args:
-        raise ValueError(
-            "Can't pass *args and args=... to call_for_each_replica")
-      args = a
-    k = kwargs.pop("kwargs", None)
-    if k is not None:
-      if kwargs:
-        raise ValueError(
-            "Can't pass **kwargs and kwargs=... to call_for_each_replica")
-      kwargs = k
-    kwargs.pop("run_concurrently", None)  # Ignore old option.
-    return self._extended.call_for_each_replica(fn, args, kwargs)
+  # TODO(b/121296772,b/121300973): Add logical_device argument (default of 0).
+  def broadcast(self, tensor):
+    """Broadcasts `tensor` to all replicas, returning a per-replica value."""
+    _require_cross_replica_context_extended(self._extended)
+    return self._extended._broadcast(tensor)  # pylint: disable=protected-access
 
   def reduce(self, reduce_op, value):
     """Reduce `value` across replicas.
@@ -504,58 +489,6 @@ class DistributionStrategy(object):
     _require_cross_replica_context_extended(self._extended)
     return self._extended._reduce(reduce_op, value)  # pylint: disable=protected-access
 
-  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
-  def batch_reduce(self, aggregation, value_destination_pairs):
-    """DEPRECATED: use extended.batch_reduce_to() instead."""
-    return self._extended.batch_reduce_to(aggregation, value_destination_pairs)
-
-  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
-  def update(self, var, fn, *args, **kwargs):
-    """DEPRECATED: use extended.update() instead."""
-    group = kwargs.pop("group", True)
-    # We temporarily support "grouped" in addition to "group" for backward-
-    # compatibility.
-    group = kwargs.pop("grouped", True) and group
-    # Handle old *args, **kwargs, and new args=(...), kwargs={...}, to
-    # allow transition.
-    a = kwargs.pop("args", None)
-    if a is not None:
-      if args:
-        raise ValueError(
-            "Can't pass *args and args=... to update")
-      args = a
-    k = kwargs.pop("kwargs", None)
-    if k is not None:
-      if kwargs:
-        raise ValueError(
-            "Can't pass **kwargs and kwargs=... to update")
-      kwargs = k
-    return self._extended.update(var, fn, args, kwargs, group)
-
-  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
-  def update_non_slot(self, colocate_with, fn, *args, **kwargs):
-    """DEPRECATED: use extended.update_non_slot() instead."""
-    group = kwargs.pop("group", True)
-    # We temporarily support "grouped" in addition to "group" for backward-
-    # compatibility.
-    group = kwargs.pop("grouped", True) and group
-    # Handle old *args, **kwargs, and new args=(...), kwargs={...}, to
-    # allow transition.
-    a = kwargs.pop("args", None)
-    if a is not None:
-      if args:
-        raise ValueError(
-            "Can't pass *args and args=... to update_non_slot")
-      args = a
-    k = kwargs.pop("kwargs", None)
-    if k is not None:
-      if kwargs:
-        raise ValueError(
-            "Can't pass **kwargs and kwargs=... to update_non_slot")
-      kwargs = k
-    return self._extended.update_non_slot(
-        colocate_with, fn, args, kwargs, group)
-
   @doc_controls.do_not_generate_docs  # DEPRECATED, -> `DistributedValues`
   def unwrap(self, value):
     """Returns the list of all per-replica values contained in `value`.
@@ -570,50 +503,16 @@ class DistributionStrategy(object):
     """
     return self._extended._unwrap(value)  # pylint: disable=protected-access
 
-  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
-  def value_container(self, value):
-    """DEPRECATED: use extended.value_container() instead."""
-    return self._extended.value_container(value)
-
   @doc_controls.do_not_generate_docs  # DEPRECATED, -> `DistributedValues`
   def group(self, value, name=None):
     """Shortcut for `tf.group(self.unwrap(value))`."""
     return self._extended._group(value, name)  # pylint: disable=protected-access
 
-  @property
-  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
-  def require_static_shapes(self):
-    """DEPRECATED: use extended.require_static_shapes instead."""
-    return self._extended.experimental_require_static_shapes
-
   @property
   def num_replicas_in_sync(self):
     """Returns number of replicas over which gradients are aggregated."""
     return self._extended._num_replicas_in_sync  # pylint: disable=protected-access
 
-  @property
-  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
-  def worker_devices(self):
-    """DEPRECATED: use extended.worker_devices instead."""
-    return self._extended.worker_devices
-
-  @property
-  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
-  def parameter_devices(self):
-    """DEPRECATED: use extended.parameter_devices instead."""
-    return self._extended.parameter_devices
-
-  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
-  def non_slot_devices(self, var_list):
-    """DEPRECATED: use extended.non_slot_devices instead."""
-    return self._extended.non_slot_devices(var_list)
-
-  @property
-  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
-  def between_graph(self):
-    """DEPRECATED: use extended.experimental_between_graph instead."""
-    return self._extended.experimental_between_graph
-
   @doc_controls.do_not_generate_docs  # DEPRECATED, being replaced by a new API.
   def configure(self,
                 session_config=None,
@@ -649,24 +548,6 @@ class DistributionStrategy(object):
     """
     return self._extended._update_config_proto(config_proto)  # pylint: disable=protected-access
 
-  @property
-  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
-  def should_init(self):
-    """DEPRECATED: use extended.should_init instead."""
-    return self._extended.experimental_should_init
-
-  @property
-  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
-  def should_checkpoint(self):
-    """DEPRECATED: use extended.should_checkpoint instead."""
-    return self._extended.should_checkpoint
-
-  @property
-  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
-  def should_save_summary(self):
-    """DEPRECATED: use extended.should_save_summary instead."""
-    return self._extended.should_save_summary
-
   def __deepcopy__(self, memo):
     # First do a regular deepcopy of `self`.
     cls = self.__class__
@@ -844,11 +725,9 @@ class DistributionStrategyExtended(object):
     a variable (which by definition will have locality V(`v`), though
     will match another locality if inside a `colocate_vars_with`
     scope).
-  * `d.make_dataset_iterator(dataset)` (or the deprecated
-    `d.distribute_dataset(dataset).make_one_shot_iterator()`): in cross-replica
+  * `d.make_dataset_iterator(dataset)`: in cross-replica
     context, produces an iterator with locality T
-  * `d.extended.broadcast_to(t)`: in cross-replica context, produces a value
-    with locality M
+  * `d.broadcast(t)`: in cross-replica context, produces a value with locality M
   * `d.extended.broadcast_to(t, v)`: in cross-replica context, produces a value
     with locality V(`v`)
   * `d.extended.call_for_each_replica(fn, ...)`: in cross-replica context, runs
@@ -933,13 +812,14 @@ class DistributionStrategyExtended(object):
 
   def _scope(self, strategy):
     """Implementation of DistributionStrategy.scope()."""
-    if distribution_strategy_context.has_distribution_strategy():
+    if distribution_strategy_context.has_strategy():
       _require_cross_replica_context_extended(self)
       return _SameScopeAgainContext(strategy)
 
     def creator_with_resource_vars(*args, **kwargs):
-      _require_distribution_strategy_scope_extended(self)
+      _require_strategy_scope_extended(self)
       kwargs["use_resource"] = True
+      kwargs["distribute_strategy"] = strategy
       return self._create_variable(*args, **kwargs)
 
     def distributed_getter(getter, *args, **kwargs):
@@ -964,6 +844,30 @@ class DistributionStrategyExtended(object):
     # Note: should support "colocate_with" argument.
     raise NotImplementedError("must be implemented in descendants")
 
+  def variable_created_in_scope(self, v):
+    """Tests whether `v` was created while this strategy scope was active.
+
+    Variables created inside the strategy scope are "owned" by it:
+
+    >>> with strategy.scope():
+    ...   v = tf.Variable(1.)
+    >>> strategy.variable_created_in_scope(v)
+    True
+
+    Variables created outside the strategy are not owned by it:
+
+    >>> v = tf.Variable(1.)
+    >>> strategy.variable_created_in_scope(v)
+    False
+
+    Args:
+      v: A `tf.Variable` instance.
+
+    Returns:
+      True if `v` was created inside the scope, False if not.
+    """
+    return v._distribute_strategy == self._container_strategy_weakref()  # pylint: disable=protected-access
+
   def read_var(self, v):
     """Reads the value of a variable.
 
@@ -994,7 +898,7 @@ class DistributionStrategyExtended(object):
     ```
     with strategy.scope():
       var1 = tf.get_variable(...)
-      with strategy.extended.colocate_vars_with(v1):
+      with strategy.extended.colocate_vars_with(var1):
         # var2 and var3 will be created on the same device(s) as var1
         var2 = tf.get_variable(...)
         var3 = tf.get_variable(...)
@@ -1002,41 +906,32 @@ class DistributionStrategyExtended(object):
       def fn(v1, v2, v3):
         # operates on v1 from var1, v2 from var2, and v3 from var3
 
-      # `fn` runs on every device `v1` is on, `v2` and `v3` will be there too.
-      strategy.extended.update(v1, fn, args=(v2, v3))
+      # `fn` runs on every device `var1` is on, `var2` and `var3` will be there
+      # too.
+      strategy.extended.update(var1, fn, args=(var2, var3))
     ```
 
     Args:
-      colocate_with_variable: A created in `self.scope()`. Variables created
-        while in the returned context manager will be on the same set of
-        devices as `colocate_with_variable`.
+      colocate_with_variable: A variable created in this strategy's `scope()`.
+        Variables created while in the returned context manager will be on the
+        same set of devices as `colocate_with_variable`.
 
     Returns:
       A context manager.
     """
     def create_colocated_variable(next_creator, *args, **kwargs):
-      _require_distribution_strategy_scope_extended(self)
+      _require_strategy_scope_extended(self)
       kwargs["use_resource"] = True
       kwargs["colocate_with"] = colocate_with_variable
       return next_creator(*args, **kwargs)
 
-    _require_distribution_strategy_scope_extended(self)
+    _require_strategy_scope_extended(self)
+    self._validate_colocate_with_variable(colocate_with_variable)
     return variable_scope.variable_creator_scope(create_colocated_variable)
 
-  def _call_dataset_fn(self, dataset_fn):
-    """Call the `dataset_fn` with `input_context` as argument."""
-    result = dataset_fn()
-    if not isinstance(result, dataset_ops.DatasetV2):
-      raise ValueError(
-          "dataset_fn() must return a tf.data.Dataset when using a "
-          "tf.distribute.Strategy.")
-    return result
-
-  # TODO(josh11b): `PerReplicaDataset` currently only implements a few methods of
-  # Dataset API such as make_one_shot_iterator and make_initializable_iterator.
-  # Extend to implement more functionality of datasets.
-  def _distribute_dataset(self, dataset_fn):
-    raise NotImplementedError("must be implemented in descendants")
+  def _validate_colocate_with_variable(self, colocate_with_variable):
+    """Validate `colocate_with_variable` argument to `colocate_vars_with`."""
+    pass
 
   def _make_dataset_iterator(self, dataset):
     raise NotImplementedError("must be implemented in descendants")
@@ -1044,6 +939,29 @@ class DistributionStrategyExtended(object):
   def _make_input_fn_iterator(self, input_fn, replication_mode):
     raise NotImplementedError("must be implemented in descendants")
 
+  def experimental_make_numpy_dataset(self, numpy_input, session=None):
+    """Makes a dataset for input provided via a numpy array.
+
+    This avoids adding `numpy_input` as a large constant in the graph,
+    and copies the data to the machine or machines that will be processing
+    the input.
+
+    Args:
+      numpy_input: A nest of NumPy input arrays that will be distributed evenly
+        across all replicas. Note that lists of Numpy arrays are stacked,
+        as that is normal `tf.data.Dataset` behavior.
+      session: (TensorFlow v1.x graph execution only) A session used for
+        initialization.
+
+    Returns:
+      A `tf.data.Dataset` representing `numpy_input`.
+    """
+    _require_cross_replica_context_extended(self)
+    return self._experimental_make_numpy_dataset(numpy_input, session=session)
+
+  def _experimental_make_numpy_dataset(self, numpy_input, session):
+    raise NotImplementedError("must be implemented in descendants")
+
   def broadcast_to(self, tensor, destinations):
     """Mirror a tensor on one device to all worker devices.
 
@@ -1060,15 +978,12 @@ class DistributionStrategyExtended(object):
     assert not isinstance(destinations, (list, tuple))
     return self._broadcast_to(tensor, destinations)
 
+  def _broadcast(self, tensor):
+    return self._broadcast_to(tensor, None)  # Default implementation
+
   def _broadcast_to(self, tensor, destinations):
     raise NotImplementedError("must be implemented in descendants")
 
-  def _initialize(self):
-    return []
-
-  def _finalize(self):
-    return []
-
   def experimental_run_steps_on_iterator(self, fn, iterator, iterations=1,
                                          initial_loop_values=None):
     """Run `fn` with input from `iterator` for `iterations` times.
@@ -1175,9 +1090,6 @@ class DistributionStrategyExtended(object):
 
     Args:
       reduce_op: Reduction type, an instance of `tf.distribute.ReduceOp` enum.
-        DEPRECATED but still accepted values:
-        `tf.VariableAggregation.SUM`,
-        `tf.VariableAggregation.MEAN`,
       value: A per-replica value with one value per replica.
       destinations: A mirrored variable, a per-replica tensor, or a device
         string. The return value will be copied to all destination devices (or
@@ -1190,14 +1102,7 @@ class DistributionStrategyExtended(object):
     # TODO(josh11b): More docstring
     _require_cross_replica_context_extended(self)
     assert not isinstance(destinations, (list, tuple))
-
-    # TODO(priyag): Remove this when all callers have been updated.
-    if isinstance(reduce_op, variable_scope.VariableAggregation):
-      assert reduce_op in (
-          variable_scope.VariableAggregation.SUM,
-          variable_scope.VariableAggregation.MEAN,
-      )
-      reduce_op = reduce_util.ReduceOp.from_variable_aggregation(reduce_op)
+    assert not isinstance(reduce_op, variable_scope.VariableAggregation)
     assert (reduce_op == reduce_util.ReduceOp.SUM or
             reduce_op == reduce_util.ReduceOp.MEAN)
     return self._reduce_to(reduce_op, value, destinations)
@@ -1210,9 +1115,6 @@ class DistributionStrategyExtended(object):
 
     Args:
       reduce_op: Reduction type, an instance of `tf.distribute.ReduceOp` enum.
-        DEPRECATED but still accepted values:
-        `tf.VariableAggregation.SUM`,
-        `tf.VariableAggregation.MEAN`,
       value_destination_pairs: A sequence of (value, destinations)
         pairs. See `reduce_to()` for a description.
 
@@ -1221,14 +1123,7 @@ class DistributionStrategyExtended(object):
     """
     # TODO(josh11b): More docstring
     _require_cross_replica_context_extended(self)
-
-    # TODO(priyag): Remove this when all callers have been updated.
-    if isinstance(reduce_op, variable_scope.VariableAggregation):
-      assert reduce_op in [
-          variable_scope.VariableAggregation.SUM,
-          variable_scope.VariableAggregation.MEAN,
-      ]
-      reduce_op = reduce_util.ReduceOp.from_variable_aggregation(reduce_op)
+    assert not isinstance(reduce_op, variable_scope.VariableAggregation)
     return self._batch_reduce_to(reduce_op, value_destination_pairs)
 
   def _batch_reduce_to(self, reduce_op, value_destination_pairs):
@@ -1434,7 +1329,7 @@ class ReplicaContext(object):
   """
 
   def __init__(self, strategy, replica_id_in_sync_group):
-    self._distribution_strategy = strategy
+    self._strategy = strategy
     self._thread_context = distribution_strategy_context._InReplicaThreadMode(  # pylint: disable=protected-access
         self)
     self._replica_id_in_sync_group = replica_id_in_sync_group
@@ -1482,17 +1377,16 @@ class ReplicaContext(object):
   def _merge_call(self, merge_fn, args, kwargs):
     """Default implementation for single replica."""
     _push_per_thread_mode(  # thread-local, so not needed with multiple threads
-        distribution_strategy_context._CrossReplicaThreadMode(  # pylint: disable=protected-access
-            self._distribution_strategy))
+        distribution_strategy_context._CrossReplicaThreadMode(self._strategy))  # pylint: disable=protected-access
     try:
-      return merge_fn(self._distribution_strategy, *args, **kwargs)
+      return merge_fn(self._strategy, *args, **kwargs)
     finally:
       _pop_per_thread_mode()
 
   @property
   def num_replicas_in_sync(self):
     """Returns number of replicas over which gradients are aggregated."""
-    return self._distribution_strategy.num_replicas_in_sync
+    return self._strategy.num_replicas_in_sync
 
   @property
   def replica_id_in_sync_group(self):
@@ -1500,16 +1394,10 @@ class ReplicaContext(object):
     require_replica_context(self)
     return self._replica_id_in_sync_group
 
-  @property
-  @doc_controls.do_not_generate_docs  # DEPRECATED, use `strategy`
-  def distribution_strategy(self):
-    """DEPRECATED: use `self.stratgey` instead."""
-    return self._distribution_strategy
-
   @property
   def strategy(self):
     """The current `tf.distribute.Strategy` object."""
-    return self._distribution_strategy
+    return self._strategy
 
   @property
   def devices(self):
@@ -1517,6 +1405,50 @@ class ReplicaContext(object):
     require_replica_context(self)
     return (device_util.current(),)
 
+  def all_reduce(self, reduce_op, value):
+    """All-reduces the given `Tensor` nest across replicas.
+
+    If `all_reduce` is called in any replica, it must be called in all replicas.
+    The nested structure and `Tensor` shapes must be identical in all replicas.
+
+    IMPORTANT: The ordering of communications must be identical in all replicas.
+
+    Example with two replicas:
+      Replica 0 `value`: {'a': 1, 'b': [40,  1]}
+      Replica 1 `value`: {'a': 3, 'b': [ 2, 98]}
+
+      If `reduce_op` == `SUM`:
+        Result (on all replicas): {'a': 4, 'b': [42, 99]}
+
+      If `reduce_op` == `MEAN`:
+        Result (on all replicas): {'a': 2, 'b': [21, 49.5]}
+
+    Args:
+      reduce_op: Reduction type, an instance of `tf.distribute.ReduceOp` enum.
+      value: The nested structure of `Tensor`s to all-reduced.
+        The structure must be compatible with `tf.nest`.
+
+    Returns:
+       A `Tensor` nest with the reduced `value`s from each replica.
+    """
+    def batch_all_reduce(strategy, *value_flat):
+      return strategy.extended.batch_reduce_to(
+          reduce_op, [(v, _batch_reduce_destination(v)) for v in value_flat])
+
+    if reduce_op in [reduce_util.ReduceOp.SUM, reduce_util.ReduceOp.MEAN]:
+      # TODO(cjfj): Work out why `batch_reduce` doesn't return the correct grad.
+      @custom_gradient.custom_gradient
+      def grad_wrapper(*xs):
+        ys = self.merge_call(batch_all_reduce, args=xs)
+        # The gradient of an all-sum is itself an all-sum (all-mean, likewise).
+        return ys, lambda *dy_s: self.all_reduce(reduce_op, dy_s)
+      return nest.pack_sequence_as(value, grad_wrapper(*nest.flatten(value)))
+    else:
+      # TODO(cjfj): Implement gradients for other reductions.
+      reduced = nest.pack_sequence_as(
+          value, self.merge_call(batch_all_reduce, args=nest.flatten(value)))
+      return nest.map_structure(array_ops.prevent_gradient, reduced)
+
   # TODO(josh11b): Implement `start_all_reduce(method, t)` for efficient
   # all-reduce. It would return a function returning the result of reducing `t`
   # across all replicas. The caller would wait to call this function until they
@@ -1527,6 +1459,15 @@ class ReplicaContext(object):
   #   to that point that the first result is needed. Most likely this can be
   #   implemented in terms of `merge_call()` and `batch_reduce_to()`.
 
+
+def _batch_reduce_destination(x):
+  """Returns the destinations for batch all-reduce."""
+  if isinstance(x, ops.Tensor):  # One device strategies.
+    return x.device
+  else:
+    return x
+
+
 # ------------------------------------------------------------------------------
 
 
@@ -1543,11 +1484,11 @@ class _DefaultDistributionExtended(DistributionStrategyExtended):
 
   def _scope(self, strategy):
     """Context manager setting a variable creator and `self` as current."""
-    if distribution_strategy_context.has_distribution_strategy():
+    if distribution_strategy_context.has_strategy():
       raise RuntimeError("Must not nest tf.distribute.Strategy scopes.")
 
     def creator(next_creator, *args, **kwargs):
-      _require_distribution_strategy_scope_strategy(strategy)
+      _require_strategy_scope_strategy(strategy)
       return next_creator(*args, **kwargs)
 
     return _CurrentDistributionContext(
@@ -1555,11 +1496,11 @@ class _DefaultDistributionExtended(DistributionStrategyExtended):
 
   def colocate_vars_with(self, colocate_with_variable):
     """Does not require `self.scope`."""
-    _require_distribution_strategy_scope_extended(self)
+    _require_strategy_scope_extended(self)
     return ops.colocate_with(colocate_with_variable)
 
-  def _distribute_dataset(self, dataset_fn):
-    return self._call_dataset_fn(dataset_fn)
+  def variable_created_in_scope(self, v):
+    return v._distribute_strategy is None  # pylint: disable=protected-access
 
   def _make_dataset_iterator(self, dataset):
     return _DefaultDistributionExtended.DefaultInputIterator(dataset)
@@ -1567,7 +1508,20 @@ class _DefaultDistributionExtended(DistributionStrategyExtended):
   def _make_input_fn_iterator(self,
                               input_fn,
                               replication_mode=InputReplicationMode.PER_WORKER):
-    return input_fn(InputContext()).make_initializable_iterator()
+    dataset = input_fn(InputContext())
+    return _DefaultDistributionExtended.DefaultInputIterator(dataset)
+
+  def _experimental_make_numpy_dataset(self, numpy_input, session):
+    numpy_flat = nest.flatten(numpy_input)
+    vars_flat = tuple(
+        variable_scope.variable(array_ops.zeros(i.shape, i.dtype),
+                                trainable=False, use_resource=True)
+        for i in numpy_flat
+    )
+    for v, i in zip(vars_flat, numpy_flat):
+      numpy_dataset.init_var_from_numpy(v, i, session)
+    vars_nested = nest.pack_sequence_as(numpy_input, vars_flat)
+    return dataset_ops.Dataset.from_tensor_slices(vars_nested)
 
   def _broadcast_to(self, tensor, destinations):
     if destinations is None:
@@ -1652,6 +1606,7 @@ class _DefaultDistributionExtended(DistributionStrategyExtended):
   # TODO(priyag): Delete this once all strategies use global batch size.
   @property
   def _global_batch_size(self):
+    """Global and per-replica batching are equivalent for this strategy."""
     return True
 
 
@@ -1664,7 +1619,7 @@ _original_from_proto = resource_variable_ops._from_proto_fn
 
 
 def _from_proto_fn(v, import_scope=None):
-  if distribution_strategy_context.has_distribution_strategy():
+  if distribution_strategy_context.has_strategy():
     raise NotImplementedError(
         "Deserialization of variables is not yet supported when using a "
         "tf.distribute.Strategy.")
diff --git a/tensorflow/python/distribute/distribute_lib_test.py b/tensorflow/python/distribute/distribute_lib_test.py
index d63d1fe3c323ac1e98afee52cf544c7c7da5fc65..6876af377515ff2c626d12cb5ad2cdf2a3014ddd 100644
--- a/tensorflow/python/distribute/distribute_lib_test.py
+++ b/tensorflow/python/distribute/distribute_lib_test.py
@@ -19,7 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.distribute import distribute_lib
-from tensorflow.python.distribute import distribution_strategy_context
+from tensorflow.python.distribute import distribution_strategy_context as ds_context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import variable_scope
@@ -60,13 +60,12 @@ class _TestExtended(distribute_lib.DistributionStrategyExtended):
 
 
 def _assert_in_default_state(t):
-  t.assertIs(distribution_strategy_context._get_default_replica_context(),
-             distribution_strategy_context.get_replica_context())
-  t.assertIs(None, distribution_strategy_context.get_cross_replica_context())
-  t.assertFalse(distribution_strategy_context.in_cross_replica_context())
-  t.assertIs(distribution_strategy_context._get_default_distribution_strategy(),
-             distribution_strategy_context.get_distribution_strategy())
-  t.assertFalse(distribution_strategy_context.has_distribution_strategy())
+  t.assertIs(ds_context._get_default_replica_context(),
+             ds_context.get_replica_context())
+  t.assertIs(None, ds_context.get_cross_replica_context())
+  t.assertFalse(ds_context.in_cross_replica_context())
+  t.assertIs(ds_context._get_default_strategy(), ds_context.get_strategy())
+  t.assertFalse(ds_context.has_strategy())
 
 
 class TestStrategyTest(test.TestCase):
@@ -76,14 +75,12 @@ class TestStrategyTest(test.TestCase):
     dist = _TestStrategy()
 
     def run_fn():
-      replica_context = distribution_strategy_context.get_replica_context()
+      replica_context = ds_context.get_replica_context()
       self.assertTrue(replica_context is not None)
-      self.assertIs(None,
-                    distribution_strategy_context.get_cross_replica_context())
-      self.assertFalse(distribution_strategy_context.in_cross_replica_context())
-      self.assertTrue(distribution_strategy_context.has_distribution_strategy())
-      self.assertIs(dist,
-                    distribution_strategy_context.get_distribution_strategy())
+      self.assertIs(None, ds_context.get_cross_replica_context())
+      self.assertFalse(ds_context.in_cross_replica_context())
+      self.assertTrue(ds_context.has_strategy())
+      self.assertIs(dist, ds_context.get_strategy())
       self.assertEqual("foo", replica_context.merge_call(None, test_arg="foo"))
       expected_value = _get_test_variable(
           "bar", variable_scope.VariableSynchronization.AUTO,
@@ -101,13 +98,11 @@ class TestStrategyTest(test.TestCase):
     _assert_in_default_state(self)
     dist = _TestStrategy()
     with dist.scope():
-      self.assertIs(None, distribution_strategy_context.get_replica_context())
-      self.assertIs(dist,
-                    distribution_strategy_context.get_cross_replica_context())
-      self.assertTrue(distribution_strategy_context.in_cross_replica_context())
-      self.assertTrue(distribution_strategy_context.has_distribution_strategy())
-      self.assertIs(dist,
-                    distribution_strategy_context.get_distribution_strategy())
+      self.assertIs(None, ds_context.get_replica_context())
+      self.assertIs(dist, ds_context.get_cross_replica_context())
+      self.assertTrue(ds_context.in_cross_replica_context())
+      self.assertTrue(ds_context.has_strategy())
+      self.assertIs(dist, ds_context.get_strategy())
       expected_value = _get_test_variable(
           "baz", variable_scope.VariableSynchronization.AUTO,
           variable_scope.VariableAggregation.NONE)
@@ -138,22 +133,16 @@ class DefaultDistributionStrategyTest(test.TestCase):
     _assert_in_default_state(self)
 
     def merge_fn(dist, s):
-      self.assertIs(
-          distribution_strategy_context._get_default_distribution_strategy(),
-          dist)
-      self.assertIs(None, distribution_strategy_context.get_replica_context())
-      self.assertIs(dist,
-                    distribution_strategy_context.get_cross_replica_context())
-      self.assertTrue(distribution_strategy_context.in_cross_replica_context())
-      self.assertIs(dist,
-                    distribution_strategy_context.get_distribution_strategy())
-      self.assertFalse(
-          distribution_strategy_context.has_distribution_strategy())
+      self.assertIs(ds_context._get_default_strategy(), dist)
+      self.assertIs(None, ds_context.get_replica_context())
+      self.assertIs(dist, ds_context.get_cross_replica_context())
+      self.assertTrue(ds_context.in_cross_replica_context())
+      self.assertIs(dist, ds_context.get_strategy())
+      self.assertFalse(ds_context.has_strategy())
       return "foo_" + s
 
-    replica_ctx = distribution_strategy_context.get_replica_context()
-    self.assertIs(distribution_strategy_context._get_default_replica_context(),
-                  replica_ctx)
+    replica_ctx = ds_context.get_replica_context()
+    self.assertIs(ds_context._get_default_replica_context(), replica_ctx)
     self.assertEqual("foo_bar", replica_ctx.merge_call(merge_fn, args=("bar",)))
     _assert_in_default_state(self)
 
diff --git a/tensorflow/python/distribute/distribution_strategy_context.py b/tensorflow/python/distribute/distribution_strategy_context.py
index 78e096e286727664830f18ac0236c3626c5733d9..6c1e250f9651412067d32291560b9d1135676067 100644
--- a/tensorflow/python/distribute/distribution_strategy_context.py
+++ b/tensorflow/python/distribute/distribution_strategy_context.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Utility to get distribution strategy related contexts."""
+"""Utility to get tf.distribute.Strategy related contexts."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -31,29 +31,27 @@ distribute_lib = LazyLoader(
 
 # ------------------------------------------------------------------------------
 # Internal API for setting the current thread mode as being either in a
-# replica or cross-replica context for a particular distribution strategy.
+# replica or cross-replica context for a particular tf.distribute.Strategy.
 
 
 class _ThreadMode(object):
 
   def __init__(self, dist, cross, replica):
-    self.distribution_strategy = dist
+    self.strategy = dist
     self.cross_replica_context = cross
     self.replica_context = replica
 
 
 class _CrossReplicaThreadMode(_ThreadMode):
 
-  def __init__(self, distribution_strategy):
-    _ThreadMode.__init__(
-        self, distribution_strategy, distribution_strategy, None)
+  def __init__(self, strategy):
+    _ThreadMode.__init__(self, strategy, strategy, None)
 
 
 class _InReplicaThreadMode(_ThreadMode):
 
   def __init__(self, replica_ctx):
-    _ThreadMode.__init__(
-        self, replica_ctx.distribution_strategy, None, replica_ctx)
+    _ThreadMode.__init__(self, replica_ctx.strategy, None, replica_ctx)
 
 
 def _push_per_thread_mode(context):
@@ -71,7 +69,7 @@ class _DefaultReplicaThreadMode(_ThreadMode):
   """
 
   def __init__(self):
-    _ThreadMode.__init__(self, _get_default_distribution_strategy(), None,
+    _ThreadMode.__init__(self, _get_default_strategy(), None,
                          _get_default_replica_context())
 
 
@@ -129,7 +127,7 @@ def get_cross_replica_context():
   """Returns the current tf.distribute.Strategy if in a cross-replica context.
 
   DEPRECATED: Please use `in_cross_replica_context()` and
-  `get_distribution_strategy()` instead.
+  `get_strategy()` instead.
 
   Note that execution:
 
@@ -174,7 +172,7 @@ def in_cross_replica_context():
 
 
 @tf_export("distribute.get_strategy")
-def get_distribution_strategy():
+def get_strategy():
   """Returns the current `tf.distribute.Strategy` object.
 
   Typically only used in a cross-replica context:
@@ -186,47 +184,50 @@ def get_distribution_strategy():
   ```
 
   Returns:
-    A `tf.distribute.Strategy` object. Inside a
-    `with distribution_strategy.scope()` block, it returns
-    `distribution_strategy`, otherwise it returns the default
-    (single-replica) `tf.distribute.Strategy` object.
+    A `tf.distribute.Strategy` object. Inside a `with strategy.scope()` block,
+    it returns `strategy`, otherwise it returns the default (single-replica)
+    `tf.distribute.Strategy` object.
   """
-  return _get_per_thread_mode().distribution_strategy
+  return _get_per_thread_mode().strategy
 
 
 @tf_export("distribute.has_strategy")
-def has_distribution_strategy():
+def has_strategy():
   """Return if there is a current non-default `tf.distribute.Strategy`.
 
   Returns:
     True if inside a `with strategy.scope():`.
   """
-  return get_distribution_strategy() is not _get_default_distribution_strategy()
+  return get_strategy() is not _get_default_strategy()
+
+
+def get_strategy_and_replica_context():
+  per_thread_mode = _get_per_thread_mode()
+  return (per_thread_mode.strategy, per_thread_mode.replica_context)
 
 
 # ------------------------------------------------------------------------------
-# Defaults that are used when no distribution strategy is explicitly created.
+# Defaults that are used when no tf.distribute.Strategy is explicitly created.
 # We create them lazily in a function so that we can workaround the circular
 # dependency on distribute_lib. See lazy loader at the top of this file.
 
 _defaults = {
-    "distribution_strategy": None,
+    "strategy": None,
     "replica_context": None,
     "replica_mode": None
 }
 
 
-def _get_default_distribution_strategy():
-  if _defaults["distribution_strategy"] is None:
-    _defaults["distribution_strategy"] = (
-        distribute_lib._DefaultDistributionStrategy())  # pylint: disable=protected-access
-  return _defaults["distribution_strategy"]
+def _get_default_strategy():
+  if _defaults["strategy"] is None:
+    _defaults["strategy"] = distribute_lib._DefaultDistributionStrategy()  # pylint: disable=protected-access
+  return _defaults["strategy"]
 
 
 def _get_default_replica_context():
   if _defaults["replica_context"] is None:
     _defaults["replica_context"] = distribute_lib.ReplicaContext(
-        _get_default_distribution_strategy(), replica_id_in_sync_group=0)
+        _get_default_strategy(), replica_id_in_sync_group=0)
   return _defaults["replica_context"]
 
 
@@ -234,3 +235,8 @@ def _get_default_replica_mode():
   if _defaults["replica_mode"] is None:
     _defaults["replica_mode"] = _DefaultReplicaThreadMode()
   return _defaults["replica_mode"]
+
+
+# Aliases for compatibility with old names.
+get_distribution_strategy = get_strategy
+has_distribution_strategy = has_strategy
diff --git a/tensorflow/python/distribute/estimator_training.py b/tensorflow/python/distribute/estimator_training.py
index 7d5f231c37da41f10f945adc468f40ffd0ecc743..0ec6703b8692fe313f12b6e9952e19f43e1e7adb 100644
--- a/tensorflow/python/distribute/estimator_training.py
+++ b/tensorflow/python/distribute/estimator_training.py
@@ -24,6 +24,7 @@ import six
 
 from tensorflow.python.distribute import distribute_coordinator as dc
 from tensorflow.python.distribute import distribute_coordinator_context as dc_context
+from tensorflow.python.distribute import multi_worker_util
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import server_lib
 
@@ -296,10 +297,11 @@ def estimator_train(estimator, train_distributed_fn, hooks):
   assert estimator._config._distribute_coordinator_mode
   run_config = estimator._config
   assert estimator._config.cluster_spec
-  cluster_spec = estimator._config.cluster_spec
+  cluster_spec = multi_worker_util.normalize_cluster_spec(
+      estimator._config.cluster_spec)
   assert estimator._config._train_distribute
 
-  if 'evaluator' in cluster_spec:
+  if 'evaluator' in cluster_spec.jobs:
     raise ValueError("'evaluator' job is not supported if you don't use "
                      '`train_and_evaluate`')
 
@@ -344,10 +346,11 @@ def estimator_evaluate(estimator, evaluate_distributed_fn, hooks):
   assert estimator._config._distribute_coordinator_mode
   run_config = estimator._config
   assert estimator._config.cluster_spec
-  cluster_spec = estimator._config.cluster_spec
+  cluster_spec = multi_worker_util.normalize_cluster_spec(
+      estimator._config.cluster_spec)
   assert estimator._config._eval_distribute
 
-  if 'evaluator' in cluster_spec:
+  if 'evaluator' in cluster_spec.jobs:
     raise ValueError("'evaluator' job is not supported if you don't use "
                      '`train_and_evaluate`')
 
diff --git a/tensorflow/python/distribute/experimental/BUILD b/tensorflow/python/distribute/experimental/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..18893290f3116d5ff7ba5dbf58e8e507fc7b854a
--- /dev/null
+++ b/tensorflow/python/distribute/experimental/BUILD
@@ -0,0 +1,19 @@
+package(
+    default_visibility = ["//tensorflow:internal"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+py_library(
+    name = "experimental",
+    srcs = [
+        "__init__.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python/distribute:collective_all_reduce_strategy",
+        "//tensorflow/python/distribute:parameter_server_strategy",
+    ],
+)
diff --git a/tensorflow/python/distribute/experimental/__init__.py b/tensorflow/python/distribute/experimental/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f74cac8b5ffce5f3982ce1e39a5108a4d841c8cf
--- /dev/null
+++ b/tensorflow/python/distribute/experimental/__init__.py
@@ -0,0 +1,24 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Experimental Distribution Strategy library."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=unused-import
+from tensorflow.python.distribute import collective_all_reduce_strategy
+from tensorflow.python.distribute import parameter_server_strategy
+# pylint: enable=unused-import
diff --git a/tensorflow/python/distribute/input_lib.py b/tensorflow/python/distribute/input_lib.py
new file mode 100644
index 0000000000000000000000000000000000000000..14b153b21933ee886af3858ca59ab53aa86b34b5
--- /dev/null
+++ b/tensorflow/python/distribute/input_lib.py
@@ -0,0 +1,506 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Various classes representing distributed inputs."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import multi_device_iterator_ops
+from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import distribution_strategy_context
+from tensorflow.python.distribute import input_ops
+from tensorflow.python.distribute import values
+from tensorflow.python.eager import context
+from tensorflow.python.framework import device as tf_device
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+
+
+class InputWorkers(object):
+  """A 1-to-many mapping from input worker devices to compute devices."""
+
+  def __init__(self, device_map, worker_device_pairs=None, logical_device=0):
+    """Initialize an `InputWorkers` object.
+
+    Args:
+      device_map: A `DeviceMap` with the computation devices fed by the
+        input workers.
+      worker_device_pairs: A sequence of pairs:
+        `(input device, a tuple of compute devices fed by that input device)`.
+      logical_device: The logical device of `device_map` to feed.
+    """
+    self._device_map = device_map
+    self._logical_device = logical_device
+    if worker_device_pairs is None:
+      worker_device_pairs = ((
+          device_util.canonicalize("/device:CPU:0"),
+          device_map.logical_to_actual_devices(logical_device)),)
+    self._input_worker_devices = tuple(d for d, _ in worker_device_pairs)
+    self._fed_devices = tuple(tuple(device_util.canonicalize(d) for d in f)
+                              for _, f in worker_device_pairs)
+    flattened = tuple(d for l in self._fed_devices for d in l)
+    assert (flattened ==
+            device_map.logical_to_actual_devices(logical_device)), (
+                "flattened: %s logical device %d: %s" %
+                (flattened, logical_device,
+                 device_map.logical_to_actual_devices(logical_device)))
+
+  @property
+  def device_map(self):
+    return self._device_map
+
+  @property
+  def logical_device(self):
+    return self._logical_device
+
+  @property
+  def num_workers(self):
+    return len(self._input_worker_devices)
+
+  @property
+  def worker_devices(self):
+    return self._input_worker_devices
+
+  def compute_devices_for_worker(self, worker_index):
+    return self._fed_devices[worker_index]
+
+  def __repr__(self):
+    devices = self.worker_devices
+    debug_repr = ",\n".join("  %d %s: %s" %
+                            (i, devices[i], self._fed_devices[i])
+                            for i in range(len(devices)))
+    return "%s:{\n%s\n  device_map: %s}" % (
+        self.__class__.__name__, debug_repr, self._device_map)
+
+
+class InputIterator(object):
+  """An input iterator, intended to be passed to `DistributionStrategy.run`."""
+
+  def get_next(self):
+    """Returns the next inputs for all replicas."""
+    raise NotImplementedError("must be implemented in descendants")
+
+  def initialize(self):
+    """Initialize the underlying input dataset, when applicable.
+
+    In eager mode, this will create a new iterator and return it.
+    In graph mode, this will initialize the same underlying iterator(s).
+
+    Users are required to call this if
+    - This iterator was returned from a call to `make_input_fn_iterator` with an
+      input function that returns a dataset.
+    - Or this iterator was returned from a call to `make_dataset_iterator`.
+
+    Returns:
+      A list of initialization ops to be executed.
+    """
+    raise NotImplementedError("must be implemented in descendants")
+
+
+class InputIteratorImpl(InputIterator):
+  """Common implementation for all input iterators."""
+
+  def __init__(self, input_workers, iterators):
+    assert isinstance(input_workers, InputWorkers)
+    if not input_workers.worker_devices:
+      raise ValueError("Should have at least one worker for input iterator.")
+
+    self._iterators = iterators
+    self._input_workers = input_workers
+
+  def get_next(self, name=None):
+    """Returns the next input from the iterator for all replicas."""
+    replicas = []
+    for i, worker in enumerate(self._input_workers.worker_devices):
+      if name is not None:
+        d = tf_device.DeviceSpec.from_string(worker)
+        new_name = "%s_%s_%d" % (name, d.job, d.task)
+      else:
+        new_name = None
+      with ops.device(worker):
+        # Make `replicas` a flat list of values across all replicas.
+        replicas.extend(self._iterators[i].get_next_as_list(new_name))
+
+    return values.regroup(self._input_workers.device_map, replicas)
+
+  def initialize(self):
+    """Initialze underlying iterators.
+
+    Returns:
+      A list of any initializer ops that should be run.
+    """
+    init_ops = []
+    for it in self._iterators:
+      init_ops.extend(it.initialize())
+    return init_ops
+
+  # TODO(priyag): Remove when we switch to using `MultiDeviceIterator` for TPUs.
+  @property
+  def output_classes(self):
+    return self._iterators[0].output_classes
+
+  # TODO(priyag): Remove when we switch to using `MultiDeviceIterator` for TPUs.
+  @property
+  def output_shapes(self):
+    return self._iterators[0].output_shapes
+
+  # TODO(priyag): Remove when we switch to using `MultiDeviceIterator` for TPUs.
+  @property
+  def output_types(self):
+    return self._iterators[0].output_types
+
+  # TODO(priyag): Remove when we switch to using `MultiDeviceIterator` for TPUs.
+  def get_iterator(self, worker):
+    for i, w in enumerate(self._input_workers.worker_devices):
+      if worker == w:
+        return self._iterators[i]
+    return None
+
+
+class InputFunctionIterator(InputIteratorImpl):
+  """Iterator created from input function."""
+
+  def __init__(self, input_fn, input_workers, input_contexts):
+    """Make an iterator for input provided via an input function.
+
+    Currently implements PER_WORKER mode, in which the `input_fn` is called
+    once on each worker.
+
+    TODO(priyag): Add other replication modes.
+
+    Args:
+      input_fn: Input function that returns a `tf.data.Dataset` object.
+      input_workers: an `InputWorkers` object.
+      input_contexts: A list of `InputContext` instances to be passed to call(s)
+        to `input_fn`. Length and order should match worker order in
+        `worker_device_pairs`.
+    """
+    assert isinstance(input_workers, InputWorkers)
+    if input_workers.num_workers != len(input_contexts):
+      raise ValueError(
+          "Number of input workers (%d) is not same as number of "
+          "input_contexts (%d)" %
+          (input_workers.num_workers, len(input_contexts)))
+
+    iterators = []
+    for i, ctx in enumerate(input_contexts):
+      worker = input_workers.worker_devices[i]
+      with ops.device(worker):
+        result = input_fn(ctx)
+        devices = input_workers.compute_devices_for_worker(i)
+        if isinstance(result, dataset_ops.DatasetV2):
+          iterator = _SingleWorkerDatasetIterator(result, worker, devices)
+        elif callable(result):
+          iterator = _SingleWorkerCallableIterator(result, worker, devices)
+        else:
+          raise ValueError(
+              "input_fn must return a tf.data.Dataset or a callable.")
+        iterators.append(iterator)
+
+    super(InputFunctionIterator, self).__init__(input_workers, iterators)
+
+
+class DatasetIterator(InputIteratorImpl):
+  """Iterator created from input dataset."""
+
+  def __init__(self, dataset, input_workers, split_batch_by=None):
+    """Make an iterator for the dataset on given devices.
+
+    If `split_batch_by` is not None, we "split" each batch of the
+    dataset by `split_batch_by` value. To achieve this, we first unbatch the
+    input dataset and then rebatch it with the per replica batch size that is
+    calculated using `global_batch_size // split_batch_by`.
+    The currently supported datasets are as follows:
+    `dataset.batch()` is the last operation on the dataset OR
+    `dataset.apply(map_and_batch)` is the last operation on the dataset OR
+    `dataset.batch().prefetch()` are the last 2 operations on the dataset OR
+    `dataset.apply(map_and_batch).prefetch()` are the last 2 operations.
+
+    TODO(priyag): Support multi worker / host cases properly by cloning
+    and sharding the dataset on each worker. Current setup will only work in
+    some cases, such as in-graph multi worker GPU case. If the input pipeline
+    has random shuffling (with a different seed on each worker), each worker
+    will see random input from the same overall dataset in each step. Otherwise,
+    each worker will see the same input in each step.
+
+    Args:
+      dataset: `tf.data.Dataset` that will be used as the input source.
+      input_workers: an `InputWorkers` object.
+      split_batch_by: Optional integer. If present, we "split" each batch of the
+        dataset by `split_batch_by` value.
+    """
+    assert isinstance(input_workers, InputWorkers)
+    if split_batch_by:
+      dataset = batching._RebatchDataset(dataset, split_batch_by)  # pylint: disable=protected-access
+
+    iterators = []
+    for i, worker in enumerate(input_workers.worker_devices):
+      with ops.device(worker):
+        worker_devices = input_workers.compute_devices_for_worker(i)
+        cloned_dataset = dataset
+        if not context.executing_eagerly():
+          cloned_dataset = input_ops._clone_dataset(dataset)  # pylint: disable=protected-access
+          cloned_dataset = cloned_dataset.with_options(dataset.options())
+        iterator = _SingleWorkerDatasetIterator(cloned_dataset, worker,
+                                                worker_devices)
+        iterators.append(iterator)
+
+    super(DatasetIterator, self).__init__(input_workers, iterators)
+
+
+class _SingleWorkerDatasetIterator(object):
+  """Iterator for a single `tf.data.Dataset`."""
+
+  def __init__(self, dataset, worker, devices):
+    """Create iterator for the `dataset` to fetch data to worker's `devices` .
+
+    `MultiDeviceIterator` is used to prefetch input to the devices on the
+    given worker.
+
+    Args:
+      dataset: A `tf.data.Dataset` instance.
+      worker: Worker on which ops should be created.
+      devices: Distribute data from `dataset` to these devices.
+    """
+    self._dataset = dataset
+    self._worker = worker
+    self._devices = devices
+    self._make_iterator()
+
+  def _make_iterator(self):
+    """Make appropriate iterator on the dataset."""
+    with ops.device(self._worker):
+      self._iterator = multi_device_iterator_ops.MultiDeviceIterator(
+          self._dataset, self._devices)
+
+  def get_next_as_list(self, name=None):
+    """Get next element from the underlying iterator."""
+    del name
+    with ops.device(self._worker):
+      data_list = self._iterator.get_next()
+      return data_list
+
+  def initialize(self):
+    """Initialze underlying iterator.
+
+    In eager execution, this simply recreates the underlying iterator.
+    In graph execution, it returns the initializer ops for the underlying
+    iterator.
+
+    Returns:
+      A list of any initializer ops that should be run.
+    """
+    if context.executing_eagerly():
+      self._iterator._eager_reset()  # pylint: disable=protected-access
+      return []
+    else:
+      return [self._iterator.initializer]
+
+  @property
+  def output_classes(self):
+    return self._iterator.output_classes
+
+  @property
+  def output_shapes(self):
+    return self._iterator.output_shapes
+
+  @property
+  def output_types(self):
+    return self._iterator.output_types
+
+
+class _SingleWorkerCallableIterator(object):
+  """Iterator for a single tensor-returning callable."""
+
+  def __init__(self, fn, worker, devices):
+    self._fn = fn
+    self._worker = worker
+    self._devices = devices
+
+  def get_next_as_list(self, name=None):
+    """Get next element from the callable."""
+    del name
+    with ops.device(self._worker):
+      data_list = [self._fn() for _ in self._devices]
+      return data_list
+
+  def initialize(self):
+    # TODO(petebu) Should this throw an exception instead?
+    return []
+
+
+# TODO(sourabhbajaj): Remove this in lieu of distributed datasets
+def _get_batched_dataset(d):
+  """Get the batched dataset from `d`."""
+  # pylint: disable=protected-access
+  if isinstance(d, dataset_ops.DatasetV1Adapter):
+    d = d._dataset
+
+  if isinstance(d, (dataset_ops.BatchDataset, batching._MapAndBatchDataset)):
+    return d
+  elif isinstance(d, (dataset_ops.PrefetchDataset,
+                      dataset_ops._OptionsDataset)):
+    return _get_batched_dataset(d._input_dataset)
+
+  raise ValueError(
+      "Unable to get batched dataset from the input dataset. `batch` "
+      "`map_and_batch` need to be the last operations on the dataset. "
+      "The batch operations can be followed by a prefetch.")
+
+
+def _get_batched_dataset_attributes(d):
+  """Get `batch_size`, `drop_remainder` of dataset."""
+  # pylint: disable=protected-access
+  assert isinstance(d,
+                    (dataset_ops.BatchDataset, batching._MapAndBatchDataset))
+  if isinstance(d, dataset_ops.BatchDataset):
+    batch_size = d._batch_size
+    drop_remainder = d._drop_remainder
+  elif isinstance(d, batching._MapAndBatchDataset):
+    batch_size = d._batch_size_t
+    drop_remainder = d._drop_remainder_t
+  # pylint: enable=protected-access
+
+  if tensor_util.is_tensor(batch_size):
+    batch_size = tensor_util.constant_value(batch_size)
+
+  if tensor_util.is_tensor(drop_remainder):
+    drop_remainder = tensor_util.constant_value(drop_remainder)
+
+  return batch_size, drop_remainder
+
+
+# TODO(sourabhbajaj): Remove this in lieu of distributed datasets
+def _get_dataset_attributes(dataset):
+  """Get the underlying attributes from the dataset object."""
+  # pylint: disable=protected-access
+
+  # First, get batch_size and drop_remainder from the dataset. We need
+  # to walk back the dataset creation process and find the batched version in
+  # order to get the attributes.
+  batched_dataset = _get_batched_dataset(dataset)
+  batch_size, drop_remainder = _get_batched_dataset_attributes(batched_dataset)
+
+  # Second, prefetch buffer should be get from the original dataset.
+  prefetch_buffer = None
+  if isinstance(dataset, dataset_ops.PrefetchDataset):
+    prefetch_buffer = dataset._buffer_size
+  elif (isinstance(dataset, dataset_ops.DatasetV1Adapter)
+        and isinstance(dataset._dataset, dataset_ops.PrefetchDataset)):
+    prefetch_buffer = dataset._dataset._buffer_size
+
+  return batch_size, drop_remainder, prefetch_buffer
+
+
+class MultiStepContext(object):
+  """A context object that can be used to capture things when running steps.
+
+  This context object is useful when running multiple steps at a time using the
+  `experimental_run_steps_on_iterator` API. For e.g. it allows the user's step
+  function to specify which outputs to emit at what frequency. Currently it
+  supports capturing output from the last step, as well as capturing non tensor
+  outputs.  In the future it will be augmented to support other use cases such
+  as output each N steps.
+  """
+
+  def __init__(self):
+    """Initialize an output context.
+
+    Returns:
+      A context object.
+    """
+    self._last_step_outputs = {}
+    self._last_step_outputs_reduce_ops = {}
+    self._non_tensor_outputs = {}
+
+  @property
+  def last_step_outputs(self):
+    """A dictionary consisting of outputs to be captured on last step.
+
+    Keys in the dictionary are names of tensors to be captured, as specified
+    when `set_last_step_output` is called.
+    Values in the dictionary are the tensors themselves. If
+    `set_last_step_output` was called with a `reduce_op` for this output,
+    then the value is the reduced value.
+
+    Returns:
+      A dictionary with last step outputs.
+    """
+    return self._last_step_outputs
+
+  def _set_last_step_outputs(self, outputs):
+    """Replace the entire dictionary of last step outputs."""
+    if not isinstance(outputs, dict):
+      raise ValueError("Need a dictionary to set last_step_outputs.")
+    self._last_step_outputs = outputs
+
+  def set_last_step_output(self, name, output, reduce_op=None):
+    """Set `output` with `name` to be outputted from the last step.
+
+    Args:
+      name: String, name to identify the output. Doesn't need to match tensor
+        name.
+      output: The tensors that should be outputted with `name`. See below for
+        actual types supported.
+      reduce_op: Reduction method to use to reduce outputs from multiple
+        replicas. Required if `set_last_step_output` is called in a replica
+        context. Optional in cross_replica_context.
+        When present, the outputs from all the replicas are reduced using the
+        current distribution strategy's `reduce` method. Hence, the type of
+        `output` must be what's supported by the corresponding `reduce` method.
+        For e.g. if using MirroredStrategy and reduction is set, output
+        must be a `PerReplica` value.
+        The reduce method is also recorded in a dictionary
+        `_last_step_outputs_reduce_ops` for later interpreting of the
+        outputs as already reduced or not.
+    """
+    if distribution_strategy_context.in_cross_replica_context():
+      self._last_step_outputs_reduce_ops[name] = reduce_op
+      if reduce_op is None:
+        self._last_step_outputs[name] = output
+      else:
+        distribution = distribution_strategy_context.get_strategy()
+        self._last_step_outputs[name] = distribution.reduce(reduce_op, output)
+    else:
+      assert reduce_op is not None
+      def merge_fn(distribution, value):
+        self._last_step_outputs[name] = distribution.reduce(reduce_op, value)
+        # Setting this inside the `merge_fn` because all replicas share the same
+        # context object, so it's more robust to set it only once (even if all
+        # the replicas are trying to set the same value).
+        self._last_step_outputs_reduce_ops[name] = reduce_op
+
+      distribution_strategy_context.get_replica_context().merge_call(
+          merge_fn, args=(output,))
+
+  @property
+  def non_tensor_outputs(self):
+    """A dictionary consisting of any non tensor outputs to be captured."""
+    return self._non_tensor_outputs
+
+  def set_non_tensor_output(self, name, output):
+    """Set `output` with `name` to be captured as a non tensor output."""
+    if distribution_strategy_context.in_cross_replica_context():
+      self._non_tensor_outputs[name] = output
+    else:
+      def merge_fn(distribution, value):
+        # NOTE(priyag): For non tensor outputs, we simply return all the values
+        # in a list as reduction doesn't make sense on non tensors.
+        self._non_tensor_outputs[name] = distribution.unwrap(value)
+      distribution_strategy_context.get_replica_context().merge_call(
+          merge_fn, args=(output,))
diff --git a/tensorflow/python/distribute/input_ops.py b/tensorflow/python/distribute/input_ops.py
index 2ded209701e74afe45fc96d66fab65b3ae250596..d9e833b6bc6b123b6875440df7c35b0af02d0941 100644
--- a/tensorflow/python/distribute/input_ops.py
+++ b/tensorflow/python/distribute/input_ops.py
@@ -18,15 +18,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.data.experimental.ops import filter_for_shard_ops
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.ops import readers
-from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import traverse
+from tensorflow.python.framework import op_def_registry
 from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import tf_logging
 
+
 # TODO(priyag): Any other reader datasets to consider here?
 _READER_DATASET_OPS = [
     "TextLineDataset", "TFRecordDataset", "FixedLengthRecordDataset",
@@ -53,100 +51,57 @@ def auto_shard_dataset(dataset, num_shards, index):
     determine a good way to shard the input dataset.
   """
 
-  # TODO(priyag): Clone datasets instead of updating in place, similar to the
-  # clone method for TFRecordDataset.
-  def _auto_shard_impl(dataset, found_reader_op):
-    """Recursive implementation of auto sharding."""
-
-    if not found_reader_op:
-      # TODO(priyag): Make this check more robust by enforcing some common
-      # property on reader datasets.
-      if (isinstance(dataset, readers.TextLineDataset) or
-          isinstance(dataset, readers.FixedLengthRecordDataset)):
-        filenames_tensor = dataset._filenames
-        num_files = array_ops.size(filenames_tensor)
-        sharded_filenames_tensor = array_ops.gather(
-            filenames_tensor, math_ops.range(index, num_files, num_shards))
-        dataset._filenames = sharded_filenames_tensor
-        return dataset
-      elif isinstance(dataset, readers.TFRecordDataset):
-        # `TFRecordDataset` needs to be handled separately than other readers
-        # because it converts filenames to a dataset first. Also, we clone it
-        # instead of updating in place because it has special logic in the
-        # constructor. Eventually we will change all cases to clone datasets
-        # instead of updating in-place.
-        return dataset._clone(
-            filenames=dataset._filenames.apply(
-                filter_for_shard_ops.filter_for_shard(num_shards, index)))
-      elif isinstance(dataset, dataset_ops.RangeDataset):
-        return dataset.apply(
-            filter_for_shard_ops.filter_for_shard(num_shards, index))
-      elif hasattr(dataset, "_map_func"):
-        # TODO(priyag): Make this check more robust by enforcing some common
-        # property on all map/flatmap/interleave datasets.
-        map_func_def = dataset._map_func.function.definition
-        for node in map_func_def.node_def:
-          if node.op in _READER_DATASET_OPS:
-            found_reader_op = True
-            break
-          elif node.op == "FlatMapDataset":
-            # TODO(priyag): Should this check for other map datasets? Should it
-            # be recursive? It is too specific to implementation of
-            # TFRecordDataset right now.
-            nested_func_name = node.attr["f"].func.name
-            nested_func = ops.get_default_graph()._functions[nested_func_name]
-            for nested_node in nested_func.definition.node_def:
-              if nested_node.op in _READER_DATASET_OPS:
-                found_reader_op = True
-                break
-            if found_reader_op:
-              break
-        if found_reader_op:
-          dataset._input_dataset = _auto_shard_impl(
-              dataset._input_dataset, found_reader_op)
-          return dataset
-
-    if isinstance(dataset, dataset_ops.DatasetV1Adapter):
-      dataset._dataset = _auto_shard_impl(
-          dataset._dataset, found_reader_op)
-      return dataset
-
-    # TODO(priyag): Make _input_dataset(s) a common property of all datasets to
-    # make this check more robust.
-    if hasattr(dataset, "_input_dataset"):
-      dataset._input_dataset = _auto_shard_impl(
-          dataset._input_dataset, found_reader_op)
-      if hasattr(dataset, "_dataset_to_concatenate"):
-        # Special case for `ConcatentateDataset`. We want to shard all input
-        # datasets.
-        dataset._dataset_to_concatenate = _auto_shard_impl(
-            dataset._dataset_to_concatenate, found_reader_op)
-      return dataset
-
-    if hasattr(dataset, "_datasets"):
-      # Special case for `ZipDataset`.
-      dataset._datasets = nest.pack_sequence_as(dataset._datasets, [
-          _auto_shard_impl(ds, found_reader_op)
-          for ds in nest.flatten(dataset._datasets)
-      ])
-      return dataset
-
-    if not found_reader_op:
-      tf_logging.warn(
-          "Could not find a standard reader in the input pipeline"
-          "(one of TextLineDataset, TFRecordDataset, FixedLengthRecordDataset)."
-          "So auto-sharding is not done. Please verify correctness of "
-          "auto-sharding for your input.")
-      # TODO(yuefengz): maybe still shard it?
-      return dataset
-
-    # TODO(priyag): What do we want to do if the number of filenames is
-    # uneven in the number of shards? By default, this will just return as
-    # many items it can before throwing OutOfRangeError.
-    # TODO(priyag): This will shard the filenames before any shuffling of the
-    # filename dataset. It might be desirable to shard after shuffling
-    # filenames? If so, how do we achieve that?
-    return dataset.apply(
-        filter_for_shard_ops.filter_for_shard(num_shards, index))
-
-  return _auto_shard_impl(dataset=dataset, found_reader_op=False)
+  # TODO(rohanj): b/120673685 to track re-enabling auto sharding.
+  tf_logging.warn("Autosharding is currently disabled. Please shard your input "
+                  "manually.")
+  del num_shards, index
+  return dataset
+
+
+def _clone_dataset(dataset):
+  """Returns a cloned version of `dataset`."""
+  variant_tensor_ops = traverse.obtain_all_variant_tensor_ops(dataset)
+  remap_dict = _clone_helper(dataset._variant_tensor.op, variant_tensor_ops)
+  new_variant_tensor = remap_dict[dataset._variant_tensor.op].outputs[0]
+  return dataset_ops._VariantDataset(new_variant_tensor,
+                                     dataset._element_structure)
+
+
+def _get_op_def(op):
+  return op.op_def or op_def_registry.get_registered_ops()[op.type]
+
+
+def _clone_helper(op_to_clone, variant_tensor_ops):
+  """Helper method that recursively clones `op_to_clone`.
+
+  Args:
+    op_to_clone: The op we want to clone.
+    variant_tensor_ops: A list of ops that we have to clone along the way.
+
+  Returns:
+    A dictionary mapping old_ops to new_ops created. Includes op_to_clone
+    as a key.
+  """
+  remap_dict = {}
+  for input_tensor in op_to_clone.inputs:
+    input_tensor_op = input_tensor.op
+    if input_tensor_op in variant_tensor_ops:
+      recursive_map = _clone_helper(input_tensor_op, variant_tensor_ops)
+      remap_dict.update(recursive_map)
+  inputs_list = []
+  for input_tensor in op_to_clone.inputs:
+    input_tensor_op = input_tensor.op
+    if input_tensor_op in remap_dict:
+      remapped_input = remap_dict[input_tensor_op].outputs[0]
+      inputs_list.append(remapped_input)
+    else:
+      inputs_list.append(input_tensor_op.outputs[input_tensor.value_index])
+  g = ops.get_default_graph()
+  new_op = g.create_op(
+      op_to_clone.type,
+      inputs_list, [o.dtype for o in op_to_clone.outputs],
+      name=op_to_clone.name,
+      attrs=op_to_clone.node_def.attr,
+      op_def=_get_op_def(op_to_clone))
+  remap_dict[op_to_clone] = new_op
+  return remap_dict
diff --git a/tensorflow/python/distribute/input_ops_test.py b/tensorflow/python/distribute/input_ops_test.py
index dcf946ba477635cda5ee3299abf163a2bb9e5bff..7db75163ed36ef35bfbd29d7ce9e03db5b6713a6 100644
--- a/tensorflow/python/distribute/input_ops_test.py
+++ b/tensorflow/python/distribute/input_ops_test.py
@@ -26,6 +26,8 @@ from tensorflow.python.distribute import input_ops
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.lib.io import python_io
+from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 from tensorflow.python.util import compat
 
@@ -90,7 +92,7 @@ class AutoShardDatasetTest(test.TestCase):
   def _verifySimpleShardingOutput(self, dataset, record_fn):
     iterator = dataset.make_one_shot_iterator()
     next_element = iterator.get_next()
-    with self.cached_session() as sess:
+    with self.cached_session():
       for f in range(self._shard_index, self._num_files, self._num_shards):
         for r in range(self._num_records):
           self.assertAllEqual(record_fn(r, f), self.evaluate(next_element))
@@ -98,7 +100,7 @@ class AutoShardDatasetTest(test.TestCase):
         self.evaluate(next_element)
 
   @test_util.run_deprecated_v1
-  def testTFRecordDataset(self):
+  def DISABLED_testTFRecordDataset(self):
     dataset = readers.TFRecordDataset(self._createTFRecordFiles())
     dataset = input_ops.auto_shard_dataset(
         dataset, self._num_shards, self._shard_index)
@@ -106,7 +108,7 @@ class AutoShardDatasetTest(test.TestCase):
     self._verifySimpleShardingOutput(dataset, self._record)
 
   @test_util.run_deprecated_v1
-  def testFlatMap(self):
+  def DISABLED_testFlatMap(self):
     dataset = dataset_ops.Dataset.from_tensor_slices(
         self._createTFRecordFiles())
     dataset = dataset.flat_map(readers.TFRecordDataset)
@@ -116,7 +118,7 @@ class AutoShardDatasetTest(test.TestCase):
     self._verifySimpleShardingOutput(dataset, self._record)
 
   @test_util.run_deprecated_v1
-  def testInterleave(self):
+  def DISABLED_testInterleave(self):
     dataset = dataset_ops.Dataset.from_tensor_slices(
         self._createTFRecordFiles())
     dataset = dataset.interleave(
@@ -129,7 +131,7 @@ class AutoShardDatasetTest(test.TestCase):
     self._verifySimpleShardingOutput(dataset, self._record)
 
   @test_util.run_deprecated_v1
-  def testListfiles(self):
+  def DISABLED_testListfiles(self):
     filenames = self._createTFRecordFiles()
     file_pattern = filenames[0].rsplit(os.sep, 1)[0] + "/tf_record.*.txt"
     dataset = dataset_ops.Dataset.list_files(file_pattern, shuffle=False)
@@ -139,7 +141,7 @@ class AutoShardDatasetTest(test.TestCase):
 
     iterator = dataset.make_one_shot_iterator()
     next_element = iterator.get_next()
-    with self.cached_session() as sess:
+    with self.cached_session():
       actual, expected = [], []
       for f in range(self._shard_index, self._num_files, self._num_shards):
         for r in range(self._num_records):
@@ -150,7 +152,7 @@ class AutoShardDatasetTest(test.TestCase):
       self.assertAllEqual(expected, actual)
 
   @test_util.run_deprecated_v1
-  def testComplexPipeline(self):
+  def DISABLED_testComplexPipeline(self):
     # Setup a complex input pipeline.
     batch_size = 2
     num_epochs = 5
@@ -172,7 +174,7 @@ class AutoShardDatasetTest(test.TestCase):
     # Verify output.
     iterator = dataset.make_one_shot_iterator()
     next_element = iterator.get_next()
-    with self.cached_session() as sess:
+    with self.cached_session():
       actual = []
       num_iterations = (self._num_files * self._num_records * num_epochs) // (
           self._num_shards * batch_size)
@@ -190,7 +192,7 @@ class AutoShardDatasetTest(test.TestCase):
       self.assertAllEqual(sorted(expected), sorted(actual))
 
   @test_util.run_deprecated_v1
-  def testZip(self):
+  def DISABLED_testZip(self):
     dataset1 = readers.TFRecordDataset(self._createTFRecordFiles())
     dataset2 = readers.TextLineDataset(self._createTextFiles())
     dataset = dataset_ops.Dataset.zip((dataset1, dataset2))
@@ -201,7 +203,7 @@ class AutoShardDatasetTest(test.TestCase):
     self._verifySimpleShardingOutput(dataset, record_fn)
 
   @test_util.run_deprecated_v1
-  def testConcat(self):
+  def DISABLED_testConcat(self):
     dataset1 = readers.TFRecordDataset(self._createTFRecordFiles())
     dataset2 = readers.TextLineDataset(self._createTextFiles())
     dataset = dataset1.concatenate(dataset2)
@@ -222,7 +224,7 @@ class AutoShardDatasetTest(test.TestCase):
         self.evaluate(next_element)
 
   @test_util.run_deprecated_v1
-  def testTextLineReader(self):
+  def DISABLED_testTextLineReader(self):
     dataset = readers.TextLineDataset(self._createTextFiles())
     dataset = input_ops.auto_shard_dataset(
         dataset, self._num_shards, self._shard_index)
@@ -230,7 +232,7 @@ class AutoShardDatasetTest(test.TestCase):
     self._verifySimpleShardingOutput(dataset, self._text_line)
 
   @test_util.run_deprecated_v1
-  def testTextLineReaderWithFlatMap(self):
+  def DISABLED_testTextLineReaderWithFlatMap(self):
     dataset = dataset_ops.Dataset.from_tensor_slices(self._createTextFiles())
     dataset = dataset.flat_map(readers.TextLineDataset)
     dataset = input_ops.auto_shard_dataset(
@@ -239,7 +241,7 @@ class AutoShardDatasetTest(test.TestCase):
     self._verifySimpleShardingOutput(dataset, self._text_line)
 
   @test_util.run_deprecated_v1
-  def testFixedLengthReader(self):
+  def DISABLED_testFixedLengthReader(self):
     dataset = readers.FixedLengthRecordDataset(
         self._createFixedLengthRecordFiles(), self._record_bytes)
     dataset = input_ops.auto_shard_dataset(
@@ -248,7 +250,7 @@ class AutoShardDatasetTest(test.TestCase):
     self._verifySimpleShardingOutput(dataset, self._fixed_length_record)
 
   @test_util.run_deprecated_v1
-  def testFixedLengthReaderWithFlatMap(self):
+  def DISABLED_testFixedLengthReaderWithFlatMap(self):
     dataset = dataset_ops.Dataset.from_tensor_slices(
         self._createFixedLengthRecordFiles())
     dataset = dataset.flat_map(
@@ -258,5 +260,77 @@ class AutoShardDatasetTest(test.TestCase):
 
     self._verifySimpleShardingOutput(dataset, self._fixed_length_record)
 
+
+# A dataset that creates two variant tensors.
+class _TestDataset(dataset_ops.UnaryUnchangedStructureDataset):
+
+  def __init__(self, input_dataset):
+    self._input_dataset = input_dataset
+    temp_variant_tensor = gen_dataset_ops.prefetch_dataset(
+        input_dataset._variant_tensor,
+        buffer_size=1,
+        **dataset_ops.flat_structure(self))
+    variant_tensor = gen_dataset_ops.model_dataset(
+        temp_variant_tensor, **dataset_ops.flat_structure(self))
+    super(_TestDataset, self).__init__(input_dataset, variant_tensor)
+
+
+class CloneDatasetTest(test.TestCase):
+
+  def _assert_datasets_equal(self, ds1, ds2):
+    # First lets assert the structure is the same.
+    self.assertTrue(
+        ds1._element_structure.is_compatible_with(ds2._element_structure))
+    self.assertTrue(
+        ds2._element_structure.is_compatible_with(ds1._element_structure))
+
+    # Now create iterators on both and assert they produce the same values.
+    it1 = dataset_ops.make_initializable_iterator(ds1)
+    it2 = dataset_ops.make_initializable_iterator(ds2)
+
+    get_next1 = it1.get_next()
+    get_next2 = it2.get_next()
+
+    with self.cached_session():
+      self.evaluate([it1.initializer, it2.initializer])
+      val1, val2 = self.evaluate([get_next1, get_next2])
+      self.assertEqual(val1, val2)
+
+  @test_util.run_deprecated_v1
+  def testOnlySource(self):
+    ds = dataset_ops.Dataset.range(10)
+    cloned_ds = input_ops._clone_dataset(ds)
+    self._assert_datasets_equal(ds, cloned_ds)
+
+  @test_util.run_deprecated_v1
+  def testSimplePipeline(self):
+    ds = dataset_ops.Dataset.range(10).map(math_ops.square)
+    cloned_ds = input_ops._clone_dataset(ds)
+    self._assert_datasets_equal(ds, cloned_ds)
+
+  @test_util.run_deprecated_v1
+  def testConcat(self):
+    ds1 = dataset_ops.Dataset.range(10)
+    ds2 = dataset_ops.Dataset.range(10)
+    ds = ds1.concatenate(ds2)
+    cloned_ds = input_ops._clone_dataset(ds)
+    self._assert_datasets_equal(ds, cloned_ds)
+
+  @test_util.run_deprecated_v1
+  def testZip(self):
+    ds1 = dataset_ops.Dataset.range(10)
+    ds2 = dataset_ops.Dataset.range(10)
+    ds = dataset_ops.Dataset.zip((ds1, ds2))
+    cloned_ds = input_ops._clone_dataset(ds)
+    self._assert_datasets_equal(ds, cloned_ds)
+
+  @test_util.run_deprecated_v1
+  def testMultipleVariantTensors(self):
+    ds = dataset_ops.Dataset.range(10)
+    ds = _TestDataset(ds)
+    cloned_ds = input_ops._clone_dataset(ds)
+    self._assert_datasets_equal(ds, cloned_ds)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/distribute/mirrored_strategy.py b/tensorflow/python/distribute/mirrored_strategy.py
index cb94dfcfbd206eb81bbb76b36ded23a4f3bc2515..96c7191652a2d93da400a5b14a01f1ea26e9079d 100644
--- a/tensorflow/python/distribute/mirrored_strategy.py
+++ b/tensorflow/python/distribute/mirrored_strategy.py
@@ -20,14 +20,15 @@ from __future__ import print_function
 
 import contextlib
 import copy
-import functools
 import threading
 
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib
 from tensorflow.python.distribute import device_util
 from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import input_lib
 from tensorflow.python.distribute import multi_worker_util
+from tensorflow.python.distribute import numpy_dataset
 from tensorflow.python.distribute import reduce_util
 from tensorflow.python.distribute import shared_variable_creator
 from tensorflow.python.distribute import values
@@ -50,12 +51,17 @@ from tensorflow.python.util.tf_export import tf_export
 
 
 @contextlib.contextmanager
-def _enter_graph(g):
-  if context.executing_eagerly():
+def _enter_graph(g, eager, creator_stack=None):
+  """Context manager for selecting a graph and maybe eager mode."""
+  if eager:
     with g.as_default(), context.eager_mode():
+      if creator_stack is not None:
+        g._variable_creator_stack = creator_stack  # pylint: disable=protected-access
       yield
   else:
     with g.as_default():
+      if creator_stack is not None:
+        g._variable_creator_stack = creator_stack  # pylint: disable=protected-access
       yield
 
 
@@ -69,20 +75,20 @@ class _RequestedStop(Exception):  # pylint: disable=g-bad-exception-name
   pass
 
 
-# _call_for_each_replica and _reduce_non_distributed_value are not members of
-# MirroredStrategy so that they are generally not allowed to use anything
-# specific to MirroredStrategy and thus can be shared with other distribution
-# strategies.
+# _call_for_each_replica is not a member of MirroredStrategy so that it is
+# not allowed to use anything specific to MirroredStrategy and thus
+# can be shared with other distribution strategies.
 
 
 # TODO(yuefengz): maybe create a common class for those who need to call this
 # _call_for_each_replica.
-def _call_for_each_replica(distribution, fn, args, kwargs):
+def _call_for_each_replica(distribution, device_map, fn, args, kwargs):
   """Run `fn` in separate threads, once per replica/worker device.
 
   Args:
     distribution: the DistributionStrategy object.
-    fn: function to run (will be run once per device, each in its own thread).
+    device_map: the DeviceMap with the devices to run `fn` on.
+    fn: function to run (will be run once per replica, each in its own thread).
     args: positional arguments for `fn`
     kwargs: keyword arguments for `fn`.
 
@@ -104,15 +110,15 @@ def _call_for_each_replica(distribution, fn, args, kwargs):
 
   shared_variable_store = {}
 
-  # TODO(isaprykin): Create these threads once instead of during every run()
-  # call.
+  # TODO(isaprykin): Create these threads once instead of during every call.
   threads = []
-  for index, d in enumerate(distribution.extended.worker_devices):
+  for index in range(device_map.num_replicas_in_graph):
     variable_creator_fn = shared_variable_creator.make_fn(
         shared_variable_store, index)
-    t = MirroredExtended._MirroredReplicaThread(  # pylint: disable=protected-access
-        distribution, coord, d, variable_creator_fn, fn,
-        *values.select_device(d, args), **values.select_device(d, kwargs))
+    t = _MirroredReplicaThread(
+        distribution, coord, index, device_map, variable_creator_fn, fn,
+        values.select_replica(index, args),
+        values.select_replica(index, kwargs))
     threads.append(t)
 
   for t in threads:
@@ -160,9 +166,10 @@ def _call_for_each_replica(distribution, fn, args, kwargs):
             raise RuntimeError("Some replicas made a different number of "
                                "replica_context().merge_call() calls.")
           # get_replica_context().merge_call() case
-          merge_args = values.regroup({t.device: t.merge_args for t in threads})
+          merge_args = values.regroup(
+              device_map, tuple(t.merge_args for t in threads))
           merge_kwargs = values.regroup(
-              {t.device: t.merge_kwargs for t in threads})
+              device_map, tuple(t.merge_kwargs for t in threads))
           # We capture the name_scope of the MRT when we call merge_fn
           # to ensure that if we have opened a name scope in the MRT,
           # it will be respected when executing the merge function. We only
@@ -177,54 +184,18 @@ def _call_for_each_replica(distribution, fn, args, kwargs):
               ops.control_dependencies(mtt_captured_control_deps):
             merge_result = threads[0].merge_fn(distribution, *merge_args,
                                                **merge_kwargs)
-          for t in threads:
-            t.merge_result = values.select_device(t.device, merge_result)
+          for r, t in enumerate(threads):
+            t.merge_result = values.select_replica(r, merge_result)
   finally:
     for t in threads:
       t.should_run.set()
     coord.join(threads)
 
-  return values.regroup({t.device: t.main_result for t in threads})
-
-
-def _reduce_non_distributed_value(extended, reduce_op, value, destinations):
-  """Reduce a non-DistributedValue `value` to `destinations`."""
-  if isinstance(value, values.DistributedValues):
-    raise ValueError("You are passing a `DistributedValue` to "
-                     "`_reduce_non_distributed_value`, which is not allowed.")
-
-  # If the same value is present on all replicas then the PerReplica value will
-  # be a single value. We also handle the case when `value` is a single value
-  # and equal to 0.
-  if value == 0:
-    return 0
-  # If there is only a single value and the reduce op is MEAN,
-  # that value should be on all destinations.
-  if reduce_op == reduce_util.ReduceOp.MEAN:
-    return value
-
-  cross_device_ops_lib.validate_destinations(destinations)
-  # We do not support a reduce op of SUM if the value is the same across
-  # all replicas. We call this as part of assign functions for MirroredVariables
-  # and summing up identical values across replicas is not clearly defined.
-  if (len(extended.worker_devices) != 1 or
-      not cross_device_ops_lib.check_destinations(destinations)):
-    raise ValueError("A non-DistributedValues value %s cannot be reduced with "
-                     "the given reduce op %s." % (value, reduce_op))
-  # TODO(anjalisridhar): Moves these methods to a device utility file?
-  devices = cross_device_ops_lib.get_devices_from(destinations)
-  if len(devices) == 1:
-    with ops.device(devices[0]):
-      return array_ops.identity(value)
-  else:
-    value_updates = {}
-    for d in devices:
-      with ops.device(d):
-        value_updates[d] = array_ops.identity(value)
-    return values.Mirrored(value_updates)
+  return values.regroup(device_map, tuple(t.main_result for t in threads))
 
 
-def _create_mirrored_variable(devices, real_mirrored_creator, *args, **kwargs):  # pylint: disable=g-missing-docstring
+def _create_mirrored_variable(strategy, device_map, logical_device,  # pylint: disable=missing-docstring
+                              real_mirrored_creator, *args, **kwargs):
   # Figure out what collections this variable should be added to.
   # We'll add the MirroredVariable to those collections instead.
   collections = kwargs.pop("collections", None)
@@ -271,13 +242,17 @@ def _create_mirrored_variable(devices, real_mirrored_creator, *args, **kwargs):
   # was never recorded on the tape instead of having to do this manually
   # here.
   with tape.stop_recording():
-    index = real_mirrored_creator(devices, *args, **kwargs)
+    devices = device_map.logical_to_actual_devices(logical_device)
+    value_list = real_mirrored_creator(devices, *args, **kwargs)
 
     if is_replica_local:
       result = values.ReplicaLocalVariable(
-          index, index[devices[0]], aggregation)
+          strategy, device_map, value_list, aggregation,
+          logical_device=logical_device)
     else:
-      result = values.MirroredVariable(index, index[devices[0]], aggregation)
+      result = values.MirroredVariable(
+          strategy, device_map, value_list, aggregation,
+          logical_device=logical_device)
 
   # Add the wrapped variable to the requested collections.
   # The handling of eager mode and the global step matches
@@ -292,7 +267,7 @@ def _create_mirrored_variable(devices, real_mirrored_creator, *args, **kwargs):
     if kwargs.get("trainable", True):
       collections.append(ops.GraphKeys.TRAINABLE_VARIABLES)
       l = g.get_collection_ref(ops.GraphKeys.TRAINABLE_VARIABLES)
-      for v in index.values():
+      for v in value_list:
         if v in l:
           l.remove(v)
     g.add_to_collections(collections, result)
@@ -380,6 +355,10 @@ def _group_device_list(devices):
   return device_dict
 
 
+def _is_gpu_device(device):
+  return tf_device.DeviceSpec().parse_from_string(device).device_type == "GPU"
+
+
 def _infer_num_gpus_per_worker(devices):
   """Infers the number of GPUs on each worker.
 
@@ -398,26 +377,25 @@ def _infer_num_gpus_per_worker(devices):
     consecutive and starting from 0.
   """
   if _is_device_list_local(devices):
-    return len([d for d in devices if "GPU" in d.upper()])
+    return sum(1 for d in devices if _is_gpu_device(d))
   else:
     device_dict = _group_device_list(devices)
     num_gpus = None
     for _, devices_in_task in device_dict.items():
       for device_in_task in devices_in_task:
         if num_gpus is None:
-          num_gpus = len([d for d in device_in_task if "GPU" in d.upper()])
+          num_gpus = sum(1 for d in device_in_task if _is_gpu_device(d))
 
         # Verify other workers have the same number of GPUs.
-        elif (
-            num_gpus != len([d for d in device_in_task if "GPU" in d.upper()])):
+        elif num_gpus != sum(1 for d in device_in_task if _is_gpu_device(d)):
           raise ValueError("All workers should have the same number of GPUs.")
 
         for d in device_in_task:
           d_spec = tf_device.DeviceSpec().parse_from_string(d)
-          if (d_spec.device_type.upper() == "GPU" and
+          if (d_spec.device_type == "GPU" and
               d_spec.device_index >= num_gpus):
-            raise ValueError("Device_index on a worker should be consecutive "
-                             "and start from 0.")
+            raise ValueError("GPU `device_index` on a worker should be "
+                             "consecutive and start from 0.")
     return num_gpus
 
 
@@ -435,7 +413,7 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
   This strategy uses one replica per device and sync replication for its
   multi-GPU version.
 
-  The multi-worker version will be added in the fture.
+  The multi-worker version will be added in the future.
 
   Args:
     devices: a list of device strings.
@@ -474,38 +452,34 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
     """Initializes the object for local training."""
     self._local_mode = True
     assert devices, "Must specify at least one device."
+    devices = tuple(device_util.resolve(d) for d in devices)
     assert len(set(devices)) == len(devices), (
-        "No duplicates allowed in `devices` argument.")
+        "No duplicates allowed in `devices` argument: %s" % (devices,))
     # TODO(josh11b): Require at least 2 devices?
-    self._devices = tuple(device_util.resolve(d) for d in devices)
-    self._canonical_device_set = set(self._devices)
-    self._device_index = values.PerReplica(
-        {d: i for i, d in enumerate(devices)})
-
+    self._device_map = values.ReplicaDeviceMap(devices)
+    self._input_workers = input_lib.InputWorkers(self._device_map)
     self._inferred_cross_device_ops = cross_device_ops_lib.choose_the_best(
         devices)
+    self._host_input_device = numpy_dataset.SingleDevice("/cpu:0")
 
   def _initialize_multi_worker(self, devices):
     """Initializes the object for multi-worker training."""
     self._local_mode = False
 
     assert devices, "Must specify at least one device."
+    devices = tuple(device_util.resolve(d) for d in devices)
     assert len(set(devices)) == len(devices), (
-        "No duplicates allowed in `devices` argument.")
+        "No duplicates allowed in `devices` argument: %s" % devices)
     # TODO(josh11b): Require at least 2 devices?
-    self._devices = tuple(device_util.resolve(d) for d in devices)
-    self._canonical_device_set = set(self._devices)
-    self._device_index = values.PerReplica(
-        {d: i for i, d in enumerate(devices)})
 
     device_dict = _group_device_list(devices)
-    self._workers = []
-    self._worker_devices = []
-    for job in ["chief", "worker"]:
+    workers = []
+    worker_devices = []
+    for job in ("chief", "worker"):
       for task in range(len(device_dict.get(job, []))):
         worker = "/job:%s/task:%d" % (job, task)
-        self._workers.append(worker)
-        self._worker_devices.append((worker, device_dict[job][task]))
+        workers.append(worker)
+        worker_devices.append((worker, device_dict[job][task]))
 
     # Setting `_default_device` will add a device scope in the
     # distribution.scope. We set the default device to the first worker. When
@@ -514,23 +488,35 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
     #     ...
     # their ops will end up on the cpu device of its first worker, e.g.
     # "/job:worker/task:0/device:CPU:0". Note this is not used in replica mode.
-    self._default_device = self._workers[0]
+    self._default_device = workers[0]
+    self._host_input_device = numpy_dataset.SingleDevice(workers[0])
 
+    self._device_map = values.ReplicaDeviceMap(devices)
+    self._input_workers = input_lib.InputWorkers(
+        self._device_map, worker_devices)
     self._inferred_cross_device_ops = cross_device_ops_lib.MultiWorkerAllReduce(
-        self._workers, _infer_num_gpus_per_worker(self._devices))
+        workers, _infer_num_gpus_per_worker(devices))
 
   def _create_variable(self, next_creator, *args, **kwargs):
     """Create a mirrored variable. See `DistributionStrategy.scope`."""
     colocate_with = kwargs.pop("colocate_with", None)
-    devices = self._get_devices_from(colocate_with)
+    if colocate_with is None:
+      device_map = self._device_map
+      logical_device = 0  # TODO(josh11b): Get logical device from scope here.
+    elif isinstance(colocate_with, numpy_dataset.SingleDevice):
+      with ops.device(colocate_with.device):
+        return next_creator(*args, **kwargs)
+    else:
+      device_map = colocate_with.device_map
+      logical_device = colocate_with.logical_device
 
     def _real_mirrored_creator(devices, *args, **kwargs):  # pylint: disable=g-missing-docstring
-      index = {}
+      value_list = []
       for i, d in enumerate(devices):
         with ops.init_scope(), ops.device(d):
           if i > 0:
             # Give replicas meaningful distinct names:
-            var0name = index[devices[0]].name.split(":")[0]
+            var0name = value_list[0].name.split(":")[0]
             # We append a / to variable names created on replicas with id > 0 to
             # ensure that we ignore the name scope and instead use the given
             # name as the absolute name of the variable.
@@ -538,11 +524,11 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
             # Initialize replicas with the same value:
             def initial_value_fn(device=d):
               if context.executing_eagerly():
-                init_value = index[devices[0]].value()
+                init_value = value_list[0].value()
                 return array_ops.identity(init_value)
               else:
                 with ops.device(device):
-                  init_value = index[devices[0]].initial_value
+                  init_value = value_list[0].initial_value
                   return array_ops.identity(init_value)
             kwargs["initial_value"] = initial_value_fn
           with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
@@ -551,52 +537,37 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
             with tape.stop_recording():
               v = next_creator(*args, **kwargs)
           assert not isinstance(v, values.DistributedVariable)
-          index[d] = v
-      return index
+          value_list.append(v)
+      return value_list
 
-    return _create_mirrored_variable(devices, _real_mirrored_creator, *args,
-                                     **kwargs)
+    return _create_mirrored_variable(
+        self._container_strategy(), device_map, logical_device,
+        _real_mirrored_creator, *args, **kwargs)
 
-  def _distribute_dataset(self, dataset_fn):
-    if self._local_mode:
-      return values.PerReplicaDataset(
-          self._call_dataset_fn(dataset_fn), self._devices)
-    else:
-      return values.MultiWorkerDataset(
-          functools.partial(self._call_dataset_fn, dataset_fn),
-          self._worker_devices,
-          auto_shard=False)
+  def _validate_colocate_with_variable(self, colocate_with_variable):
+    values.validate_colocate_distributed_variable(colocate_with_variable, self)
 
   def _make_dataset_iterator(self, dataset):
-    if self._local_mode:
-      worker = device_util.canonicalize("/device:CPU:0")
-      worker_device_pairs = [(worker, self._devices)]
-    else:
-      worker_device_pairs = self._worker_devices
-
-    return values.DatasetIterator(dataset, worker_device_pairs,
-                                  self._num_replicas_in_sync)
+    return input_lib.DatasetIterator(
+        dataset, self._input_workers, self._num_replicas_in_sync)
 
   def _make_input_fn_iterator(
       self,
       input_fn,
       replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
     input_contexts = []
-    if self._local_mode:
-      num_workers = 1
-      worker = device_util.canonicalize("/device:CPU:0")
-      worker_device_pairs = [(worker, self._devices)]
-    else:
-      num_workers = len(self._worker_devices)
-      worker_device_pairs = self._worker_devices
-
+    num_workers = self._input_workers.num_workers
     for i in range(num_workers):
       input_contexts.append(distribute_lib.InputContext(
           num_input_pipelines=num_workers,
           input_pipeline_id=i,
           num_replicas_in_sync=self._num_replicas_in_sync))
-    return values.InputFunctionIterator(
-        input_fn, worker_device_pairs, input_contexts)
+    return input_lib.InputFunctionIterator(
+        input_fn, self._input_workers, input_contexts)
+
+  def _experimental_make_numpy_dataset(self, numpy_input, session):
+    return numpy_dataset.one_host_numpy_dataset(
+        numpy_input, self._host_input_device, session)
 
   # TODO(priyag): Deal with OutOfRange errors once b/111349762 is fixed.
   def _experimental_run_steps_on_iterator(self, fn, iterator, iterations,
@@ -605,14 +576,11 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
       initial_loop_values = {}
     initial_loop_values = nest.flatten(initial_loop_values)
 
-    ctx = values.MultiStepContext()
+    ctx = input_lib.MultiStepContext()
     def body(i, *args):
       """A wrapper around `fn` to create the while loop body."""
       del args
-      fn_inputs = iterator.get_next()
-      if not isinstance(fn_inputs, tuple):
-        fn_inputs = (fn_inputs,)
-      fn_result = fn(ctx, fn_inputs)
+      fn_result = fn(ctx, iterator.get_next())
       for (name, output) in ctx.last_step_outputs.items():
         # Convert all outputs to tensors, potentially from `DistributedValues`.
         ctx.last_step_outputs[name] = self._unwrap(output)
@@ -649,8 +617,8 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
       # For outputs that have already been reduced, wrap them in a Mirrored
       # container, else in a PerReplica container.
       if reduce_op is None:
-        last_step_tensor_outputs_dict[name] = values.regroup(
-            {d: t for d, t in zip(self._devices, output)}, values.PerReplica)
+        last_step_tensor_outputs_dict[name] = values.regroup(self._device_map,
+                                                             output)
       else:
         assert len(output) == 1
         last_step_tensor_outputs_dict[name] = output[0]
@@ -667,11 +635,15 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
     if isinstance(tensor, (float, int)):
       return tensor
     # TODO(josh11b): In eager mode, use one thread per device, or async mode.
-    return self._get_cross_device_ops().broadcast(
-        tensor, destinations or self._devices)
+    if not destinations:
+      # TODO(josh11b): Use current logical device instead of 0 here.
+      destinations = values.LogicalDeviceSpec(
+          device_map=self._device_map, logical_device=0)
+    return self._get_cross_device_ops().broadcast(tensor, destinations)
 
   def _call_for_each_replica(self, fn, args, kwargs):
-    return _call_for_each_replica(self._container_strategy(), fn, args, kwargs)
+    return _call_for_each_replica(self._container_strategy(), self._device_map,
+                                  fn, args, kwargs)
 
   def _configure(self,
                  session_config=None,
@@ -686,7 +658,8 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
     if cluster_spec:
       # TODO(yuefengz): remove the following code once cluster_resolver is
       # added.
-      num_gpus_per_worker = _infer_num_gpus_per_worker(self._devices)
+      num_gpus_per_worker = _infer_num_gpus_per_worker(
+          self._device_map.all_devices)
       multi_worker_devices = _cluster_spec_to_device_list(
           cluster_spec, num_gpus_per_worker)
       self._initialize_multi_worker(multi_worker_devices)
@@ -709,38 +682,38 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
       # Mirrored values. For example, the same value could be present on all
       # replicas in which case `value` would be a single value or value could
       # be 0.
-      return _reduce_non_distributed_value(self, reduce_op, value,
-                                           destinations)
+      return cross_device_ops_lib.reduce_non_distributed_value(
+          reduce_op, self._device_map, value, destinations)
     return self._get_cross_device_ops().reduce(
         reduce_op, value, destinations=destinations)
 
   def _batch_reduce_to(self, reduce_op, value_destination_pairs):
-    return self._get_cross_device_ops().batch_reduce(reduce_op,
-                                                     value_destination_pairs)
+    return self._get_cross_device_ops().batch_reduce(
+        reduce_op, value_destination_pairs)
 
   def _update(self, var, fn, args, kwargs, group):
     # TODO(josh11b): In eager mode, use one thread per device.
     assert isinstance(var, values.DistributedVariable)
-    updates = {}
-    for d, v in var._index.items():  # pylint: disable=protected-access
-      name = "update_%d" % self._device_index.get(d)
+    updates = []
+    for i, (d, v) in enumerate(zip(var.devices, var.values)):
+      name = "update_%d" % i
       with ops.device(d), distribute_lib.UpdateContext(d), ops.name_scope(name):
         # If args and kwargs are not mirrored, the value is returned as is.
-        updates[d] = fn(v,
-                        *values.select_device_mirrored(d, args),
-                        **values.select_device_mirrored(d, kwargs))
-    return values.update_regroup(self, updates, group)
+        updates.append(fn(v,
+                          *values.select_device_mirrored(d, args),
+                          **values.select_device_mirrored(d, kwargs)))
+    return values.update_regroup(self, self._device_map, updates, group)
 
   def _update_non_slot(self, colocate_with, fn, args, kwargs, group):
     assert isinstance(colocate_with, tuple)
     # TODO(josh11b): In eager mode, use one thread per device.
-    updates = {}
-    for d in colocate_with:
-      name = "update_%d" % self._device_index.get(d)
+    updates = []
+    for i, d in enumerate(colocate_with):
+      name = "update_%d" % i
       with ops.device(d), distribute_lib.UpdateContext(d), ops.name_scope(name):
-        updates[d] = fn(*values.select_device_mirrored(d, args),
-                        **values.select_device_mirrored(d, kwargs))
-    return values.update_regroup(self, updates, group)
+        updates.append(fn(*values.select_device_mirrored(d, args),
+                          **values.select_device_mirrored(d, kwargs)))
+    return values.update_regroup(self, self._device_map, updates, group)
 
   def read_var(self, replica_local_var):
     """Read the aggregate value of a replica-local variable."""
@@ -751,10 +724,7 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
 
   def _unwrap(self, val):
     if isinstance(val, values.DistributedValues):
-      # Return in a deterministic order.
-      if set(val.devices) == self._canonical_device_set:
-        return tuple(val.get(device=d) for d in self._devices)
-      return tuple(val.get(device=d) for d in sorted(val.devices))
+      return val.values
     return (val,)
 
   def value_container(self, val):
@@ -762,15 +732,19 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
 
   @property
   def _num_replicas_in_sync(self):
-    return len(self._devices)
+    return self._device_map.num_replicas_in_graph
 
   @property
   def worker_devices(self):
-    return self._devices
+    return self._device_map.all_devices
+
+  @property
+  def worker_devices_by_replica(self):
+    return self._device_map.devices_by_replica
 
   @property
   def parameter_devices(self):
-    return self._devices
+    return self._device_map.all_devices
 
   @property
   def experimental_between_graph(self):
@@ -790,107 +764,116 @@ class MirroredExtended(distribute_lib.DistributionStrategyExtended):
 
   def non_slot_devices(self, var_list):
     del var_list
-    return tuple(self._devices)
-
-  def _get_devices_from(self, colocate_with=None):
-    if colocate_with is None:
-      return self._devices
-    else:
-      return cross_device_ops_lib.get_devices_from(colocate_with)
+    # TODO(josh11b): Should this be the last logical device instead?
+    return self._device_map.logical_to_actual_devices(0)
 
   # TODO(priyag): Delete this once all strategies use global batch size.
   @property
   def _global_batch_size(self):
+    """`make_dataset_iterator` and `make_numpy_iterator` use global batch size.
+
+    `make_input_fn_iterator` assumes per-replica batching.
+
+    Returns:
+      Boolean.
+    """
     return True
 
-  class _MirroredReplicaThread(threading.Thread):
-    """A thread that runs() a function on a device."""
-
-    def __init__(self, dist, coord, device, variable_creator_fn, fn, *args,
-                 **kwargs):
-      super(MirroredExtended._MirroredReplicaThread, self).__init__()  # pylint: disable=protected-access
-      self.coord = coord
-      self.distribution = dist
-      self.device = device
-      self.replica_id = dist.extended.worker_devices.index(device)
-      self.variable_creator_fn = variable_creator_fn
-      # State needed to run and return the results of `fn`.
-      self.main_fn = fn
-      self.main_args = args
-      self.main_kwargs = kwargs
-      self.main_result = None
-      self.done = False
-      # State needed to run the next merge_call() (if any) requested via
-      # ReplicaContext.
-      self.merge_fn = None
-      self.merge_args = None
-      self.merge_kwargs = None
-      self.merge_result = None
-      self.captured_name_scope = None
-      # We use a thread.Event for the main thread to signal when this
-      # thread should start running (`should_run`), and another for
-      # this thread to transfer control back to the main thread
-      # (`has_paused`, either when it gets to a
-      # `get_replica_context().merge_call` or when `fn` returns). In
-      # either case the event starts cleared, is signaled by calling
-      # set(). The receiving thread waits for the signal by calling
-      # wait() and then immediately clearing the event using clear().
-      self.should_run = threading.Event()
-      self.has_paused = threading.Event()
-      # These fields have to do with inheriting various contexts from the
-      # parent thread:
-      # pylint: disable=protected-access
-      self.context_mode = context.context()._eager_context.mode
-      if not context.context()._context_handle:
-        context.context()._initialize_handle_and_devices()
-      self.context_device_policy = (
-          pywrap_tensorflow.TFE_ContextGetDevicePlacementPolicy(
-              context.context()._context_handle))
-      self.graph = ops.get_default_graph()
-      self._variable_creator_stack = self.graph._variable_creator_stack[:]
-      self._captured_var_scope = variable_scope.get_variable_scope()
-      # Adding a "/" at end lets us re-enter this scope later.
-      self._name_scope = self.graph.get_name_scope()
-      if self._name_scope:
-        self._name_scope += "/"
-      if self.replica_id > 0:
-        if not self._name_scope:
-          self._name_scope = ""
-        self._name_scope += "replica_%d/" % self.replica_id
-
-    def run(self):
-      # pylint: disable=protected-access
-      self.graph._variable_creator_stack = self._variable_creator_stack
-      self.should_run.wait()
-      self.should_run.clear()
-      try:
-        if self.coord.should_stop():
-          return
-        with self.coord.stop_on_exception(), \
-            context.context()._mode(self.context_mode), \
-            context.context().device_policy(self.context_device_policy), \
-            _enter_graph(self.graph), \
-            MirroredReplicaContext(self.distribution, constant_op.constant(
-                self.replica_id, dtypes.int32)), \
-            ops.device(self.device), \
-            ops.name_scope(self._name_scope), \
-            variable_scope.variable_scope(
-                self._captured_var_scope, reuse=self.replica_id > 0), \
-            variable_scope.variable_creator_scope(self.variable_creator_fn):
-          self.main_result = self.main_fn(*self.main_args, **self.main_kwargs)
-          self.done = True
-      finally:
-        self.has_paused.set()
+
+class _MirroredReplicaThread(threading.Thread):
+  """A thread that runs() a function on a device."""
+
+  def __init__(self, dist, coord, replica_id, device_map, variable_creator_fn,
+               fn, args, kwargs):
+    super(_MirroredReplicaThread, self).__init__()
+    self.coord = coord
+    self.distribution = dist
+    self.device_map = device_map
+    self.replica_id = replica_id
+    self.variable_creator_fn = variable_creator_fn
+    # State needed to run and return the results of `fn`.
+    self.main_fn = fn
+    self.main_args = args
+    self.main_kwargs = kwargs
+    self.main_result = None
+    self.done = False
+    # State needed to run the next merge_call() (if any) requested via
+    # ReplicaContext.
+    self.merge_fn = None
+    self.merge_args = None
+    self.merge_kwargs = None
+    self.merge_result = None
+    self.captured_name_scope = None
+    # We use a thread.Event for the main thread to signal when this
+    # thread should start running (`should_run`), and another for
+    # this thread to transfer control back to the main thread
+    # (`has_paused`, either when it gets to a
+    # `get_replica_context().merge_call` or when `fn` returns). In
+    # either case the event starts cleared, is signaled by calling
+    # set(). The receiving thread waits for the signal by calling
+    # wait() and then immediately clearing the event using clear().
+    self.should_run = threading.Event()
+    self.has_paused = threading.Event()
+    # These fields have to do with inheriting various contexts from the
+    # parent thread:
+    ctx = context.context()
+    self.in_eager = ctx.executing_eagerly()
+    # pylint: disable=protected-access
+    if not ctx._context_handle:
+      ctx._initialize_handle_and_devices()
+    self.context_device_policy = (
+        pywrap_tensorflow.TFE_ContextGetDevicePlacementPolicy(
+            ctx._context_handle))
+    self.graph = ops.get_default_graph()
+    with ops.init_scope():
+      self._init_in_eager = context.executing_eagerly()
+      self._init_graph = ops.get_default_graph()
+
+    self._variable_creator_stack = self.graph._variable_creator_stack[:]
+    self._captured_var_scope = variable_scope.get_variable_scope()
+    # Adding a "/" at end lets us re-enter this scope later.
+    self._name_scope = self.graph.get_name_scope()
+    if self._name_scope:
+      self._name_scope += "/"
+    if self.replica_id > 0:
+      if not self._name_scope:
+        self._name_scope = ""
+      self._name_scope += "replica_%d/" % self.replica_id
+
+  def run(self):
+    self.should_run.wait()
+    self.should_run.clear()
+    try:
+      if self.coord.should_stop():
+        return
+      # TODO(josh11b): Use current logical device instead of 0 here.
+      with self.coord.stop_on_exception(), \
+          _enter_graph(self._init_graph, self._init_in_eager), \
+          _enter_graph(self.graph, self.in_eager,
+                       self._variable_creator_stack), \
+          context.context().device_policy(self.context_device_policy), \
+          MirroredReplicaContext(self.distribution, constant_op.constant(
+              self.replica_id, dtypes.int32)), \
+          ops.device(self.device_map.logical_to_actual_devices(0)[
+              self.replica_id]), \
+          ops.name_scope(self._name_scope), \
+          variable_scope.variable_scope(
+              self._captured_var_scope, reuse=self.replica_id > 0), \
+          variable_scope.variable_creator_scope(self.variable_creator_fn):
+        self.main_result = self.main_fn(*self.main_args, **self.main_kwargs)
+        self.done = True
+    finally:
+      self.has_paused.set()
 
 
 class MirroredReplicaContext(distribute_lib.ReplicaContext):
-  """ReplicaContext used in MirroredStrategy.call_for_each_replica().
+  """ReplicaContext used in MirroredStrategy.extended.call_for_each_replica().
 
   Opened in `_MirroredReplicaThread`, to allow the user to invoke
   `MirroredStrategy`'s specific implementation of `merge_call()`,
   which works by delegating the function and its arguments to
   the main thread (the one that invoked
-  `MirroredStrategy.call_for_each_replica()`).
+  `MirroredStrategy.extended.call_for_each_replica()`).
   """
 
   def _merge_call(self, fn, args, kwargs):
@@ -916,4 +899,4 @@ class MirroredReplicaContext(distribute_lib.ReplicaContext):
   def devices(self):
     distribute_lib.require_replica_context(self)
     replica_id = tensor_util.constant_value(self._replica_id_in_sync_group)
-    return [self._distribution_strategy.extended.worker_devices[replica_id]]
+    return [self._strategy.extended.worker_devices_by_replica[replica_id]]
diff --git a/tensorflow/python/distribute/numpy_dataset.py b/tensorflow/python/distribute/numpy_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..5881e4cd59e75ac5184e400bd0ac90443084635e
--- /dev/null
+++ b/tensorflow/python/distribute/numpy_dataset.py
@@ -0,0 +1,97 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Code for creating a dataset out of a NumPy array."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.util import nest
+
+
+def init_var_from_numpy(input_var, numpy_input, session):
+  """Initialize `input_var` to `numpy_input` using `session` in graph mode."""
+  with ops.init_scope():
+    if context.executing_eagerly():
+      input_var.assign(numpy_input)
+      return
+
+    assert session is not None
+    session.run(input_var.initializer)
+
+    start_placeholder = array_ops.placeholder(dtypes.int64, ())
+    end_placeholder = array_ops.placeholder(dtypes.int64, ())
+    slice_placeholder = array_ops.placeholder(input_var.dtype)
+    assign_slice_op = input_var[start_placeholder:end_placeholder].assign(
+        slice_placeholder)
+
+    # If each batch element is > 64 MB, then we copy each batch element
+    # individually. Otherwise, the slices will be < 128 MB. There might be
+    # padding which might mean that the slices are 128 MB even if the size of
+    # the tensor allocated is less than 128 MB.  This formula gives slices with
+    # size: ceil(64 MB / byte size per batch element) bytes.  Using ceil()
+    # guarantees we get a number >= 1.
+
+    # Calculate the size of each batch element.
+    byte_size_per_batch_element = (
+        np.prod(numpy_input.shape[1:]) * input_var.dtype.size)
+
+    # Calculate number of elements we want to copy per slice.
+    batch_size_per_slice = int(
+        np.ceil((64 << 20) / byte_size_per_batch_element))
+
+    # Copy slices of the above size starting at 0, except the last slice will be
+    # smaller.
+    start = 0
+    limit = numpy_input.shape[0]
+    while start < limit:
+      end = min(start + batch_size_per_slice, limit)
+      session.run(assign_slice_op, feed_dict={
+          start_placeholder: start,
+          end_placeholder: end,
+          slice_placeholder: numpy_input[start:end]})
+      start = end
+
+
+def one_host_numpy_dataset(numpy_input, colocate_with, session):
+  """Create a dataset on `colocate_with` from `numpy_input`."""
+  def create_colocated_variable(next_creator, *args, **kwargs):
+    kwargs["colocate_with"] = colocate_with
+    return next_creator(*args, **kwargs)
+
+  numpy_flat = nest.flatten(numpy_input)
+  with variable_scope.variable_creator_scope(create_colocated_variable):
+    vars_flat = tuple(variable_scope.variable(array_ops.zeros(i.shape, i.dtype),
+                                              trainable=False)
+                      for i in numpy_flat)
+  for v, i in zip(vars_flat, numpy_flat):
+    init_var_from_numpy(v, i, session)
+  vars_nested = nest.pack_sequence_as(numpy_input, vars_flat)
+  return dataset_ops.Dataset.from_tensor_slices(vars_nested)
+
+
+class SingleDevice(object):
+  """Used with `colocate_with` to create a non-mirrored variable."""
+
+  def __init__(self, device):
+    self.device = device
diff --git a/tensorflow/python/distribute/numpy_dataset_test.py b/tensorflow/python/distribute/numpy_dataset_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..04eae1daa2ee83040f4d9acb3a79baa6be16f402
--- /dev/null
+++ b/tensorflow/python/distribute/numpy_dataset_test.py
@@ -0,0 +1,44 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for numpy_dataset."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.distribute import numpy_dataset
+from tensorflow.python.eager import test
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import variable_scope
+
+
+class InitVarFromNumpyTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_creating_var_with_numpy_arrays(self):
+    with self.cached_session() as session:
+      x = np.asarray(np.random.random((64, 3)), dtype=np.float32)
+      initial = np.zeros_like(x)
+      var_x = variable_scope.variable(initial)
+      numpy_dataset.init_var_from_numpy(var_x, x, session)
+      val = self.evaluate(var_x.value())
+      # Verify that the numpy value is copied to the variable.
+      self.assertAllEqual(x, val)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/distribute/one_device_strategy.py b/tensorflow/python/distribute/one_device_strategy.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a255b2fe10e4cf56f8338e7f83e9258de2b12de
--- /dev/null
+++ b/tensorflow/python/distribute/one_device_strategy.py
@@ -0,0 +1,219 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Class OneDeviceStrategy implementing DistributionStrategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import input_lib
+from tensorflow.python.distribute import numpy_dataset
+from tensorflow.python.distribute import values
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
+
+
+# TODO(josh11b): Replace asserts in this file with if ...: raise ...
+
+
+@tf_export("distribute.OneDeviceStrategy")
+class OneDeviceStrategy(distribute_lib.DistributionStrategy):
+  """A distribution strategy for running on a single device."""
+  # TODO(josh11b): Do we wrap values in types to generate errors if you are
+  # doing something that won't work with other DistributionStrategy
+  # implementations?
+
+  def __init__(self, device):
+    super(OneDeviceStrategy, self).__init__(OneDeviceExtended(self, device))
+
+
+class OneDeviceExtended(distribute_lib.DistributionStrategyExtended):
+  """Implementation of OneDeviceStrategy."""
+
+  def __init__(self, container_strategy, device):
+    super(OneDeviceExtended, self).__init__(container_strategy)
+    self._device = device
+    self._input_device = device_util.canonicalize("/device:CPU:0")
+    worker_device_pairs = [(self._input_device, [self._device])]
+    device_map = values.SingleDeviceMap(device)
+    self._input_workers = input_lib.InputWorkers(
+        device_map, worker_device_pairs)
+
+  def _create_variable(self, next_creator, *args, **kwargs):
+    colocate_with = kwargs.pop("colocate_with", None)
+    if colocate_with is None:
+      with ops.device(self._device):
+        return next_creator(*args, **kwargs)
+    elif isinstance(colocate_with, numpy_dataset.SingleDevice):
+      with ops.device(colocate_with.device):
+        return next_creator(*args, **kwargs)
+    else:
+      with ops.colocate_with(colocate_with):
+        return next_creator(*args, **kwargs)
+
+  def _validate_colocate_with_variable(self, colocate_with_variable):
+    values.validate_colocate(colocate_with_variable, self)
+
+  def _make_dataset_iterator(self, dataset):
+    """Make iterator from dataset without splitting the batch."""
+    # Note that split_batch_by argument is not passed because it is always 1 in
+    # this strategy, and adding it adds unnecessary overhead to the dataset.
+    return input_lib.DatasetIterator(dataset, self._input_workers)
+
+  def _make_input_fn_iterator(
+      self,
+      input_fn,
+      replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
+    return input_lib.InputFunctionIterator(
+        input_fn, self._input_workers, [distribute_lib.InputContext()])
+
+  def _experimental_make_numpy_dataset(self, numpy_input, session):
+    return numpy_dataset.one_host_numpy_dataset(
+        numpy_input, numpy_dataset.SingleDevice(self._input_device), session)
+
+  def _broadcast_to(self, tensor, destinations):
+    del destinations
+    return tensor
+
+  # TODO(priyag): Deal with OutOfRange errors  once b/111349762 is fixed.
+  def _experimental_run_steps_on_iterator(self, fn, iterator, iterations,
+                                          initial_loop_values=None):
+    if initial_loop_values is None:
+      initial_loop_values = {}
+    initial_loop_values = nest.flatten(initial_loop_values)
+
+    ctx = input_lib.MultiStepContext()
+    def body(i, *args):
+      """A wrapper around `fn` to create the while loop body."""
+      del args
+      fn_result = fn(ctx, iterator.get_next())
+      flat_last_step_outputs = nest.flatten(ctx.last_step_outputs)
+      with ops.control_dependencies([fn_result]):
+        return [i + 1] + flat_last_step_outputs
+
+    # We capture the control_flow_context at this point, before we run `fn`
+    # inside a while_loop. This is useful in cases where we might need to exit
+    # these contexts and get back to the outer context to do some things, for
+    # e.g. create an op which should be evaluated only once at the end of the
+    # loop on the host. One such usage is in creating metrics' value op.
+    self._outer_control_flow_context = (
+        ops.get_default_graph()._get_control_flow_context())  # pylint: disable=protected-access
+
+    # TODO(priyag): Use max_iterations instead of an explicit counter.
+    cond = lambda i, *args: i < iterations
+    i = constant_op.constant(0)
+    loop_result = control_flow_ops.while_loop(
+        cond, body, [i] + initial_loop_values, name="",
+        parallel_iterations=1, back_prop=False, swap_memory=False,
+        return_same_structure=True)
+    del self._outer_control_flow_context
+
+    ctx.run_op = control_flow_ops.group(loop_result)
+
+    # Convert the last_step_outputs from a list to the original dict structure
+    # of last_step_outputs.
+    last_step_tensor_outputs = loop_result[1:]
+    last_step_tensor_outputs_dict = nest.pack_sequence_as(
+        ctx.last_step_outputs, last_step_tensor_outputs)
+
+    ctx._set_last_step_outputs(last_step_tensor_outputs_dict)  # pylint: disable=protected-access
+    return ctx
+
+  def _call_for_each_replica(self, fn, args, kwargs):
+    strategy = self._container_strategy()
+    with ops.device(self._device), _OneDeviceReplicaContext(strategy):
+      return fn(*args, **kwargs)
+
+  def _reduce_to(self, reduce_op, value, destinations):
+    del reduce_op, destinations
+    return value
+
+  def _update(self, var, fn, args, kwargs, group):
+    # The implementations of _update() and _update_non_slot() are identical
+    # except _update() passes `var` as the first argument to `fn()`.
+    return self._update_non_slot(var, fn, (var,) + tuple(args), kwargs, group)
+
+  def _update_non_slot(self, colocate_with, fn, args, kwargs, group):
+    del colocate_with
+    with ops.device(self._device), distribute_lib.UpdateContext(self._device):
+      result = fn(*args, **kwargs)
+      if group:
+        return result
+      else:
+        return nest.map_structure(self._unwrap, result)
+
+  def read_var(self, replica_local_var):
+    """Read the aggregate value of a replica-local variable."""
+    return array_ops.identity(replica_local_var)
+
+  def _unwrap(self, value):
+    return (value,)
+
+  def value_container(self, value):
+    return value
+
+  @property
+  def _num_replicas_in_sync(self):
+    return 1
+
+  @property
+  def worker_devices(self):
+    return (self._device,)
+
+  @property
+  def parameter_devices(self):
+    return (self._device,)
+
+  def non_slot_devices(self, var_list):
+    del var_list
+    return (self._device,)
+
+  @property
+  def experimental_should_init(self):
+    return True
+
+  @property
+  def should_checkpoint(self):
+    return True
+
+  @property
+  def should_save_summary(self):
+    return True
+
+  # TODO(priyag): Delete this once all strategies use global batch size.
+  @property
+  def _global_batch_size(self):
+    """Global and per-replica batching are equivalent for OneDeviceStrategy."""
+    return True
+
+
+class _OneDeviceReplicaContext(distribute_lib.ReplicaContext):
+  """ReplicaContext for OneDeviceStrategy."""
+
+  def __init__(self, strategy):
+    zero = constant_op.constant(0, dtypes.int32)
+    distribute_lib.ReplicaContext.__init__(
+        self, strategy, replica_id_in_sync_group=zero)
+
+  @property
+  def devices(self):
+    return self._strategy.extended.worker_devices
diff --git a/tensorflow/python/distribute/parameter_server_strategy.py b/tensorflow/python/distribute/parameter_server_strategy.py
new file mode 100644
index 0000000000000000000000000000000000000000..535327b9fe27b71688fa2cdcea1e2895cfa87016
--- /dev/null
+++ b/tensorflow/python/distribute/parameter_server_strategy.py
@@ -0,0 +1,541 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Classes implementing a multi-worker ps DistributionStrategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+
+
+from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib
+from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import input_lib
+from tensorflow.python.distribute import mirrored_strategy
+from tensorflow.python.distribute import multi_worker_util
+from tensorflow.python.distribute import numpy_dataset
+from tensorflow.python.distribute import values
+from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver
+from tensorflow.python.distribute.cluster_resolver import TFConfigClusterResolver
+from tensorflow.python.eager import context
+from tensorflow.python.framework import device as tf_device
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variable_scope as vs
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import device_setter
+from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
+
+_LOCAL_CPU = "/device:CPU:0"
+_LOCAL_GPU_0 = "/device:GPU:0"
+
+
+# TODO(yuefengz): maybe cache variables on local CPU.
+@tf_export("distribute.experimental.ParameterServerStrategy")
+class ParameterServerStrategy(distribute_lib.DistributionStrategy):
+  """A parameter server DistributionStrategy.
+
+  This strategy class works for both local training and between-graph replicated
+  training for multiple workers. It uses `TFConfigClusterResolver` to detect
+  configurations for multi-worker training. In multi-worker training mode, i.e.
+  `TFConfigClusterResolver` has detected 'TF_CONFIG' environment variable and
+  'TF_CONFIG' has a cluster spec, variables and updates to those variables are
+  assigned to parameter servers and other operations are assigned to workers.
+  In local training mode, variables are assigned to local CPU or the only GPU.
+  When each worker has more than one GPU, operations will be replicated on these
+  GPUs. In both cases, operations are replicated but variables are not and these
+  workers share a common view for which paramater server a variable is assigned
+  to.
+
+  This class assumes between-graph replication will be used and works on a graph
+  for a particular worker. Note that each graph and worker is independent.
+  This means that while each worker will synchronously compute a single gradient
+  update across all GPUs, updates between workers proceed asynchronously.
+  Operations that occur only on the first replica (such as incrementing the
+  global step), will occur on the first replica *of every worker*.
+
+  It is expected to call `call_for_each_replica(fn, ...)` for any
+  operations which potentially can be replicated across replicas (i.e. multiple
+  GPUs) even if there is only CPU or one GPU. When defining the `fn`, extra
+  caution needs to be taken:
+
+  1) It is generally not recommended to open a device scope under the strategy's
+  scope. A device scope (i.e. calling `tf.device`) will be merged with or
+  override the device for operations but will not change the device for
+  variables.
+
+  2) It is also not recommended to open a colocation scope (i.e. calling
+  `tf.colocate_with`) under the strategy's scope. For colocating variables, use
+  `strategy.extended.colocate_vars_with` instead. Colocation of ops will
+  possibly create conflicts of device assignment.
+  """
+
+  def __init__(self):
+    """Initializes this strategy with default TFConfigClusterResolver."""
+    super(ParameterServerStrategy, self).__init__(
+        ParameterServerStrategyExtended(self))
+
+
+class ParameterServerStrategyExtended(
+    distribute_lib.DistributionStrategyExtended):
+  """Implementation of ParameterServerStrategy."""
+
+  def __init__(self,
+               container_strategy,
+               cluster_resolver=TFConfigClusterResolver()):
+    super(ParameterServerStrategyExtended, self).__init__(container_strategy)
+    self._initialize_strategy(cluster_resolver)
+
+    # We typically don't need to do all-reduce in this strategy.
+    self._cross_device_ops = (
+        cross_device_ops_lib.ReductionToOneDevice(reduce_to_device=_LOCAL_CPU))
+
+  def _initialize_strategy(self, cluster_resolver):
+    if cluster_resolver.cluster_spec().as_dict():
+      self._initialize_multi_worker(cluster_resolver)
+    else:
+      self._initialize_local(cluster_resolver)
+    # Save the num_gpus_per_worker for configure method.
+    self._num_gpus_per_worker = cluster_resolver.num_accelerators()
+
+  def _initialize_multi_worker(self, cluster_resolver):
+    """Initialize devices for multiple workers.
+
+    It creates variable devices and compute devices. Variables and operations
+    will be assigned to them respectively. We have one compute device per
+    replica. The variable device is a device function or device string. The
+    default variable device assigns variables to parameter servers in a
+    round-robin fashion.
+
+    Args:
+      cluster_resolver: a descendant of `ClusterResolver` object.
+
+    Raises:
+      ValueError: if the cluster doesn't have ps jobs.
+    """
+    num_gpus = cluster_resolver.num_accelerators()
+    cluster_spec = cluster_resolver.cluster_spec()
+    task_type = cluster_resolver.task_type
+    task_id = cluster_resolver.task_id
+    if not task_type or task_id is None:
+      raise ValueError("When `cluster_spec` is given, you must also specify "
+                       "`task_type` and `task_id`")
+    cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
+    assert cluster_spec.as_dict()
+
+    worker_device = "/job:%s/task:%d" % (task_type, task_id)
+    self._input_host_device = numpy_dataset.SingleDevice(worker_device)
+
+    # Define compute devices which is a list of device strings and one for each
+    # replica. When there are GPUs, replicate operations on these GPUs.
+    # Otherwise, place operations on CPU.
+    if num_gpus > 0:
+      compute_devices = tuple(
+          "%s/device:GPU:%d" % (worker_device, i) for i in range(num_gpus))
+    else:
+      compute_devices = (worker_device,)
+
+    self._device_map = values.ReplicaDeviceMap(compute_devices)
+    self._input_workers = input_lib.InputWorkers(
+        self._device_map, [(worker_device, compute_devices)])
+
+    # In distributed mode, place variables on ps jobs in a round-robin fashion.
+    # Note that devices returned from `replica_device_setter` are not
+    # canonical and therefore we don't canonicalize all variable devices to
+    # make them consistent.
+    # TODO(yuefengz): support passing a strategy object to control variable
+    # assignment.
+    # TODO(yuefengz): merge the logic of replica_device_setter into this
+    # class.
+    num_ps_replicas = len(cluster_spec.as_dict().get("ps", []))
+    if num_ps_replicas == 0:
+      raise ValueError("The cluster spec needs to have `ps` jobs.")
+    self._variable_device = device_setter.replica_device_setter(
+        ps_tasks=num_ps_replicas,
+        worker_device=worker_device,
+        merge_devices=True,
+        cluster=cluster_spec)
+
+    # The `_parameter_devices` is needed for the `parameter_devices` property
+    # and is a list of all variable devices. Here parameter devices are all
+    # tasks of the "ps" job.
+    self._parameter_devices = tuple(map("/job:ps/task:{}".format,
+                                        range(num_ps_replicas)))
+
+    # Add a default device so that ops without specified devices will not end up
+    # on other workers.
+    self._default_device = worker_device
+
+    self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
+                                                task_id)
+    self._cluster_spec = cluster_spec
+    self._task_type = task_type
+    self._task_id = task_id
+
+    logging.info(
+        "Multi-worker ParameterServerStrategy with "
+        "cluster_spec = %r, task_type = %r, task_id = %r, "
+        "num_ps_replicas = %r, is_chief = %r, device_map = %r, "
+        "variable_device = %r", cluster_spec.as_dict(), task_type, task_id,
+        num_ps_replicas, self._is_chief, self._device_map,
+        self._variable_device)
+
+  def _initialize_local(self, cluster_resolver):
+    """Initialize internal devices for local training."""
+    worker_device = device_util.canonicalize("/device:CPU:0")
+    self._input_host_device = numpy_dataset.SingleDevice(worker_device)
+    num_gpus = cluster_resolver.num_accelerators()
+    # Define compute devices which is a list of device strings and one for each
+    # replica. When there are GPUs, replicate operations on these GPUs.
+    # Otherwise, place operations on CPU.
+    if num_gpus > 0:
+      compute_devices = tuple(map("/device:GPU:{}".format, range(num_gpus)))
+    else:
+      compute_devices = (_LOCAL_CPU,)
+
+    self._device_map = values.ReplicaDeviceMap(compute_devices)
+    self._input_workers = input_lib.InputWorkers(
+        self._device_map, [(worker_device, compute_devices)])
+
+    # If there is only one GPU, put everything on that GPU. Otherwise, place
+    # variables on CPU.
+    if num_gpus == 1:
+      assert len(compute_devices) == 1
+      self._variable_device = _LOCAL_GPU_0
+      self._parameter_devices = (_LOCAL_GPU_0,)
+    else:
+      self._variable_device = _LOCAL_CPU
+      self._parameter_devices = (_LOCAL_CPU,)
+
+    self._is_chief = True
+    self._cluster_spec = None
+    self._task_type = None
+    self._task_id = None
+
+    logging.info(
+        "ParameterServerStrategy with compute_devices = %r, "
+        "variable_device = %r", compute_devices, self._variable_device)
+
+  def _validate_colocate_with_variable(self, colocate_with_variable):
+    values.validate_colocate(colocate_with_variable, self)
+
+  def _make_dataset_iterator(self, dataset):
+    return input_lib.DatasetIterator(dataset, self._input_workers,
+                                     self._num_replicas_in_sync)
+
+  def _make_input_fn_iterator(
+      self,
+      input_fn,
+      replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
+    """Distributes the dataset to each local GPU."""
+    if self._cluster_spec:
+      input_pipeline_id = multi_worker_util.id_in_cluster(
+          self._cluster_spec, self._task_type, self._task_id)
+      num_input_pipelines = multi_worker_util.worker_count(
+          self._cluster_spec, self._task_type)
+    else:
+      input_pipeline_id = 0
+      num_input_pipelines = 1
+    input_context = distribute_lib.InputContext(
+        num_input_pipelines=num_input_pipelines,
+        input_pipeline_id=input_pipeline_id,
+        num_replicas_in_sync=self._num_replicas_in_sync)
+    return input_lib.InputFunctionIterator(input_fn, self._input_workers,
+                                           [input_context])
+
+  def _experimental_make_numpy_dataset(self, numpy_input, session):
+    return numpy_dataset.one_host_numpy_dataset(
+        numpy_input, self._input_host_device, session)
+
+  def _broadcast_to(self, tensor, destinations):
+    # This is both a fast path for Python constants, and a way to delay
+    # converting Python values to a tensor until we know what type it
+    # should be converted to. Otherwise we have trouble with:
+    #   global_step.assign_add(1)
+    # since the `1` gets broadcast as an int32 but global_step is int64.
+    if isinstance(tensor, (float, int)):
+      return tensor
+    if not cross_device_ops_lib.check_destinations(destinations):
+      # TODO(josh11b): Use current logical device instead of 0 here.
+      destinations = values.LogicalDeviceSpec(
+          device_map=self._device_map, logical_device=0)
+    return self._cross_device_ops.broadcast(tensor, destinations)
+
+  def _allow_variable_partition(self):
+    return not context.executing_eagerly()
+
+  # TODO(yuefengz): not all ops in device_setter.STANDARD_PS_OPS will go through
+  # this creator, such as "MutableHashTable".
+  def _create_variable(self, next_creator, *args, **kwargs):
+    if self._num_replicas_in_sync > 1:
+      aggregation = kwargs.pop("aggregation", vs.VariableAggregation.NONE)
+      if aggregation not in (
+          vs.VariableAggregation.NONE,
+          vs.VariableAggregation.SUM,
+          vs.VariableAggregation.MEAN,
+          vs.VariableAggregation.ONLY_FIRST_REPLICA
+      ):
+        raise ValueError("Invalid variable aggregation mode: " + aggregation +
+                         " for variable: " + kwargs["name"])
+
+      def var_creator(*args, **kwargs):
+        """Create an AggregatingVariable and fix up collections."""
+        # Record what collections this variable should be added to.
+        collections = kwargs.pop("collections", None)
+        if collections is None:
+          collections = [ops.GraphKeys.GLOBAL_VARIABLES]
+        kwargs["collections"] = []
+
+        # Create and wrap the variable.
+        v = next_creator(*args, **kwargs)
+        wrapped = values.AggregatingVariable(
+            self._container_strategy(), v, aggregation)
+
+        # Add the wrapped variable to the requested collections.
+        # The handling of eager mode and the global step matches
+        # ResourceVariable._init_from_args().
+        if not context.executing_eagerly():
+          g = ops.get_default_graph()
+          # If "trainable" is True, next_creator() will add the contained
+          # variable to the TRAINABLE_VARIABLES collection, so we manually
+          # remove it and replace with the wrapper. We can't set "trainable"
+          # to False for next_creator() since that causes functions like
+          # implicit_gradients to skip those variables.
+          if kwargs.get("trainable", True):
+            collections.append(ops.GraphKeys.TRAINABLE_VARIABLES)
+            l = g.get_collection_ref(ops.GraphKeys.TRAINABLE_VARIABLES)
+            if v in l:
+              l.remove(v)
+          g.add_to_collections(collections, wrapped)
+        elif ops.GraphKeys.GLOBAL_STEP in collections:
+          ops.add_to_collections(ops.GraphKeys.GLOBAL_STEP, wrapped)
+
+        return wrapped
+    else:
+      var_creator = next_creator
+
+    if "colocate_with" in kwargs:
+      colocate_with = kwargs["colocate_with"]
+      if isinstance(colocate_with, numpy_dataset.SingleDevice):
+        with ops.device(colocate_with.device):
+          return var_creator(*args, **kwargs)
+      with ops.device(None):
+        with ops.colocate_with(colocate_with):
+          return var_creator(*args, **kwargs)
+
+    with ops.colocate_with(None, ignore_existing=True):
+      with ops.device(self._variable_device):
+        return var_creator(*args, **kwargs)
+
+  def _call_for_each_replica(self, fn, args, kwargs):
+    # pylint: disable=protected-access
+    return mirrored_strategy._call_for_each_replica(
+        self._container_strategy(), self._device_map, fn, args, kwargs)
+
+  def _verify_destinations_not_different_worker(self, destinations):
+    if not self._cluster_spec:
+      return
+    if destinations is None:
+      return
+    for d in cross_device_ops_lib.get_devices_from(destinations):
+      d_spec = tf_device.DeviceSpec.from_string(d)
+      if d_spec.job == self._task_type and d_spec.task != self._task_id:
+        raise ValueError(
+            "Cannot reduce to another worker: %r, current worker is %r" %
+            (d, self._input_workers.worker_devices[0]))
+
+  def _reduce_to(self, reduce_op, value, destinations):
+    self._verify_destinations_not_different_worker(destinations)
+    if not isinstance(value, values.DistributedValues):
+      # pylint: disable=protected-access
+      return cross_device_ops_lib.reduce_non_distributed_value(
+          reduce_op, self._device_map, value, destinations)
+    return self._cross_device_ops.reduce(
+        reduce_op, value, destinations=destinations)
+
+  def _batch_reduce_to(self, reduce_op, value_destination_pairs):
+    for _, destinations in value_destination_pairs:
+      self._verify_destinations_not_different_worker(destinations)
+    return self._cross_device_ops.batch_reduce(reduce_op,
+                                               value_destination_pairs)
+
+  def _select_single_value(self, structured):
+    """Select any single values in `structured`."""
+
+    def _select_fn(x):  # pylint: disable=g-missing-docstring
+      if isinstance(x, values.Mirrored):
+        if len(x.devices) == 1:
+          return x.primary
+        else:
+          raise ValueError(
+              "You cannot update variable with a Mirrored object with multiple "
+              "components %r when using ParameterServerStrategy. You must "
+              "specify a single value or a Mirrored with a single value." % x)
+      elif isinstance(x, values.PerReplica):
+        raise ValueError(
+            "You cannot update variable with a PerReplica object %r when using "
+            "ParameterServerStrategy. You must specify a single value or a "
+            "Mirrored with a single value" % x)
+      else:
+        return x
+
+    return nest.map_structure(_select_fn, structured)
+
+  def _update(self, var, fn, args, kwargs, group):
+    if isinstance(var, values.AggregatingVariable):
+      var = var.get()
+    if not isinstance(var, resource_variable_ops.ResourceVariable):
+      raise ValueError(
+          "You can not update `var` %r. It must be a Variable." % var)
+    with ops.colocate_with(var), distribute_lib.UpdateContext(var.device):
+      result = fn(var, *self._select_single_value(args),
+                  **self._select_single_value(kwargs))
+      if group:
+        return result
+      else:
+        return nest.map_structure(self._unwrap, result)
+
+  # TODO(yuefengz): does it need to call _select_single_value?
+  def _update_non_slot(self, colocate_with, fn, args, kwargs, group):
+    with ops.device(
+        colocate_with.device), distribute_lib.UpdateContext(colocate_with):
+      result = fn(*args, **kwargs)
+      if group:
+        return result
+      else:
+        return nest.map_structure(self._unwrap, result)
+
+  def _unwrap(self, val):
+    if isinstance(val, values.DistributedValues):
+      return val.values
+    return (val,)
+
+  def value_container(self, val):
+    if (hasattr(val, "_aggregating_container") and
+        not isinstance(val, values.AggregatingVariable)):
+      wrapper = val._aggregating_container()  # pylint: disable=protected-access
+      if wrapper is not None:
+        return wrapper
+    return val
+
+  def read_var(self, var):
+    # No need to distinguish between normal variables and replica-local
+    # variables.
+    return array_ops.identity(var)
+
+  def _configure(self,
+                 session_config=None,
+                 cluster_spec=None,
+                 task_type=None,
+                 task_id=None):
+    """Configures the strategy class.
+
+    The strategy object will be re-initialized if `cluster_spec` is given but
+    was not passed in the constructor.
+
+    Args:
+      session_config: not used currently.
+      cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
+        cluster configurations.
+      task_type: the current task type.
+      task_id: the current task id.
+
+    Raises:
+      ValueError: if `cluster_spec` is given but `task_type` or `task_id` is
+        not.
+    """
+    if cluster_spec:
+      # Use the num_gpus_per_worker recorded in constructor since _configure
+      # doesn't take num_gpus.
+      cluster_resolver = SimpleClusterResolver(
+          cluster_spec=multi_worker_util.normalize_cluster_spec(cluster_spec),
+          task_type=task_type,
+          task_id=task_id,
+          num_accelerators=self._num_gpus_per_worker)
+      self._initialize_multi_worker(cluster_resolver)
+
+    if session_config:
+      session_config.CopyFrom(self._update_config_proto(session_config))
+
+  def _update_config_proto(self, config_proto):
+    updated_config = copy.deepcopy(config_proto)
+    if not self._cluster_spec:
+      updated_config.isolate_session_state = True
+      return updated_config
+
+    updated_config.isolate_session_state = False
+
+    assert self._task_type
+    assert self._task_id is not None
+
+    # The device filters prevent communication between workers.
+    if self._task_type not in ["chief", "worker"]:
+      return updated_config
+    del updated_config.device_filters[:]
+    updated_config.device_filters.extend(
+        ["/job:%s/task:%d" % (self._task_type, self._task_id), "/job:ps"])
+    return updated_config
+
+  @property
+  def _num_replicas_in_sync(self):
+    return self._device_map.num_replicas_in_graph
+
+  @property
+  def worker_devices(self):
+    return self._device_map.all_devices
+
+  @property
+  def worker_devices_by_replica(self):
+    return self._device_map.devices_by_replica
+
+  @property
+  def parameter_devices(self):
+    return self._parameter_devices
+
+  def non_slot_devices(self, var_list):
+    return min(var_list, key=lambda x: x.name)
+
+  @property
+  def experimental_between_graph(self):
+    # TODO(yuefengz): Should this return False in the local case?
+    return True
+
+  @property
+  def experimental_should_init(self):
+    return self._is_chief
+
+  @property
+  def should_checkpoint(self):
+    return self._is_chief
+
+  @property
+  def should_save_summary(self):
+    return self._is_chief
+
+  # TODO(priyag): Delete this once all strategies use global batch size.
+  @property
+  def _global_batch_size(self):
+    """`make_dataset_iterator` and `make_numpy_iterator` use global batch size.
+
+    `make_input_fn_iterator` assumes per-replica batching.
+
+    Returns:
+      Boolean.
+    """
+    return True
diff --git a/tensorflow/python/distribute/values.py b/tensorflow/python/distribute/values.py
index 01a1680a246b9beb34c4c5c1b6b3dfe6494c33f3..40a9f9b012c8f9959748d98bb3feccc3dcf55a6d 100644
--- a/tensorflow/python/distribute/values.py
+++ b/tensorflow/python/distribute/values.py
@@ -12,10 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Various classes representing distributed values.
-
-See go/tf-distribution-strategy.
-"""
+"""Various classes representing distributed values."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -23,21 +20,15 @@ from __future__ import print_function
 
 import collections
 import contextlib
-import operator
 import weakref
 import six
 
-from tensorflow.python.data.experimental.ops import batching
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.ops import multi_device_iterator_ops
 from tensorflow.python.distribute import device_util
 from tensorflow.python.distribute import distribute_lib
 from tensorflow.python.distribute import distribution_strategy_context
-from tensorflow.python.distribute import input_ops
 from tensorflow.python.distribute import reduce_util
 from tensorflow.python.eager import context
 from tensorflow.python.eager import tape
-from tensorflow.python.framework import device as tf_device
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
@@ -46,58 +37,266 @@ from tensorflow.python.ops import gen_resource_variable_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.training import saver
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import nest
 
 
-# pylint: disable=line-too-long
-# TODO(josh11b): Should device values be strings or DeviceSpec objects?
-# Not sure DeviceSpec objects are usable as a dict key.
+def _devices_match(d1, d2):
+  return device_util.canonicalize(d1) == device_util.canonicalize(d2)
+
+
+class DeviceMap(object):
+  """A mapping of replicas & logical device ids to devices."""
+
+  @property
+  def all_devices(self):
+    """Returns a tuple of strings with all devices in this DeviceMap."""
+    raise NotImplementedError("Required for DeviceMap implementations.")
+
+  @property
+  def devices_by_replica(self):
+    """Returns a tuple `t` where `t[replica]` is the devices for `replica`."""
+    raise NotImplementedError("Required for DeviceMap implementations.")
+
+  @property
+  def num_logical_devices(self):
+    """Count of the number of devices each replica may be defined across."""
+    raise NotImplementedError("Required for DeviceMap implementations.")
+
+  @property
+  def num_replicas_in_graph(self):
+    """Number of replicas defined in this graph."""
+    raise NotImplementedError("Required for DeviceMap implementations.")
+
+  def logical_device_from_values(self, values):
+    """Returns the logical device index `values` is on."""
+    raise NotImplementedError("Required for DeviceMap implementations.")
+
+  def logical_to_actual_devices(self, logical_device_id):
+    """Returns sequence of `num_replicas_in_graph` devices."""
+    raise NotImplementedError("Required for DeviceMap implementations.")
+
+  def select_for_current_replica(self, values, replica_context):
+    """Select the element of `values` for the current replica."""
+    raise NotImplementedError("Required for DeviceMap implementations.")
+
+  def replica_for_device(self, device):
+    """Return the replica id containing `device`."""
+    raise NotImplementedError("Required for DeviceMap implementations.")
+
+  def select_for_device(self, values, device):
+    """Select the element of `values` to access from `device`."""
+    raise NotImplementedError("Required for DeviceMap implementations.")
+
+  def is_device_in_replica(self, device, replica_id):
+    """Returns whether `device` is a member of replica `replica_id`."""
+    raise NotImplementedError("Required for DeviceMap implementations.")
+
+
+class SingleDeviceMap(DeviceMap):
+  """A device map for 1 non-computation device.
+
+  Use `SingleDeviceMap` when the device does not correspond to some replica of
+  the computation. For computation devices, use `ReplicaDeviceMap` below (even
+  if there is only a single device in the map).
+  """
+
+  def __init__(self, device):
+    """Initialize a `SingleDeviceMap`.
+
+    Args:
+      device: A string device.
+    """
+    assert isinstance(device, six.string_types)
+    self._device = device_util.canonicalize(device)
+    self._devices = (self._device,)
+
+  @property
+  def all_devices(self):
+    return self._devices
+
+  @property
+  def devices_by_replica(self):
+    raise ValueError("SingleDeviceMap not indexed by replicas")
+
+  @property
+  def num_logical_devices(self):
+    return 1
+
+  @property
+  def num_replicas_in_graph(self):
+    return 1
+
+  def logical_device_from_values(self, values):
+    del values
+    return 0
+
+  def logical_to_actual_devices(self, logical_device_id):
+    assert logical_device_id == 0
+    return self._devices
+
+  def select_for_current_replica(self, values, replica_context):
+    assert len(values) == 1
+    del replica_context
+    return values[0]
+
+  def replica_for_device(self, device):
+    raise ValueError("SingleDeviceMap not indexed by replicas")
+
+  def select_for_device(self, values, device):
+    assert len(values) == 1
+    if self._device != device:
+      raise ValueError("Device %s not found in %s (current device %s)" %
+                       (device, self._devices, device_util.current()))
+    return values[0]
+
+  def is_device_in_replica(self, device, replica_id):
+    raise ValueError("SingleDeviceMap not indexed by replicas")
+
+  def __repr__(self):
+    return "%s(%r)" % (self.__class__.__name__, self._device)
+
+
+class ReplicaDeviceMap(DeviceMap):
+  """A device map for 1 device per replica."""
+
+  def __init__(self, devices):
+    """Initialize a `ReplicaDeviceMap`.
+
+    Args:
+      devices: `devices[i]` is the string device for replica `i`.
+    """
+    self._devices = tuple(device_util.canonicalize(d) for d in devices)
+    if len(set(self._devices)) != len(self._devices):
+      raise ValueError("Duplicate devices in %s, after canonicalization: %s" %
+                       (devices, self._devices))
+    self._device_to_replica = {d: r for r, d in enumerate(self._devices)}
+
+  @property
+  def all_devices(self):
+    return self._devices
+
+  @property
+  def devices_by_replica(self):
+    return ((d,) for d in self._devices)
+
+  @property
+  def num_logical_devices(self):
+    return 1
+
+  @property
+  def num_replicas_in_graph(self):
+    return len(self._devices)
+
+  def logical_device_from_values(self, values):
+    del values
+    return 0
+
+  def logical_to_actual_devices(self, logical_device_id):
+    assert logical_device_id == 0
+    return self._devices
+
+  def select_for_current_replica(self, values, replica_context):
+    assert len(values) == len(self._devices)
+    replica_id = replica_context.replica_id_in_sync_group
+    if not isinstance(replica_id, int):
+      replica_id = tensor_util.constant_value(replica_id)
+    return values[replica_id]
+
+  def replica_for_device(self, device):
+    return self._device_to_replica.get(device)
+
+  def select_for_device(self, values, device):
+    assert len(values) == len(self._devices)
+    replica_id = self._device_to_replica.get(device)
+    if replica_id is None:
+      raise ValueError("Device %s not found in %s (current device %s)" %
+                       (device, self._devices, device_util.current()))
+    return values[replica_id]
+
+  def is_device_in_replica(self, device, replica_id):
+    return _devices_match(device, self._devices[replica_id])
+
+  def __str__(self):
+    return "[%s]" % (", ".join(self._devices))
+
+  def __repr__(self):
+    return "%s([%s])" % (self.__class__.__name__,
+                         ", ".join(repr(d) for d in self._devices))
+
+
+LogicalDeviceSpec = collections.namedtuple(
+    "LogicalDeviceSpec", ("device_map", "logical_device"))
+
+
 class DistributedValues(object):
   """Holds a map from device to values. Either PerReplica or Mirrored."""
 
-  def __init__(self, index):
-    self._index = {device_util.canonicalize(key): value
-                   for key, value in six.iteritems(index)}
+  def __init__(self, device_map, values, logical_device=None):
+    assert isinstance(device_map, DeviceMap)
+    self._device_map = device_map
+    self._values = tuple(values)
+    if logical_device is None:
+      logical_device = device_map.logical_device_from_values(self._values)
+    self._logical_device = logical_device
 
+  # TODO(josh11b): Split this into two functions, one with device, one without.
   def get(self, device=None):
     """Returns the value for the current device or raises a ValueError."""
     if device is None:
       replica_context = distribution_strategy_context.get_replica_context()
       if replica_context:
-        # TODO(josh11b): support model parallelism better here
-        device = replica_context.devices[0]
+        return self._device_map.select_for_current_replica(
+            self._values, replica_context)
       else:
         device = distribute_lib.get_update_device()
         if device is None:
           return self._get_cross_replica()
     device = device_util.canonicalize(device)
-    try:
-      return self._index[device]
-    except KeyError as e:
-      six.raise_from(
-          ValueError("Device %s not found in %s (current device %s)" %
-                     (device, self._index.keys(), device_util.current())), e)
+    return self._device_map.select_for_device(self._values, device)
+
+  @property
+  def primary(self):
+    """Returns a representative component."""
+    return self._values[0]
 
   @property
   def devices(self):
-    return list(self._index.keys())
+    return self._device_map.logical_to_actual_devices(self._logical_device)
+
+  @property
+  def logical_device(self):
+    return self._logical_device
+
+  @property
+  def device_map(self):
+    return self._device_map
+
+  # TODO(josh11b): Replace unwrap with this?
+  @property
+  def values(self):
+    return self._values
 
   @property
   def is_tensor_like(self):
-    for v in self._index.values():
+    for v in self._values:
       if not tensor_util.is_tensor(v):
         return False
     return True
 
   def __str__(self):
-    return "%s:%s" % (self.__class__.__name__, self._index)
+    devices = self.devices
+    assert len(self._values) == len(devices)
+    debug_str = ",\n".join("  %d %s: %s" % (i, devices[i], self._values[i])
+                           for i in range(len(devices)))
+    return "%s:{\n%s\n}" % (self.__class__.__name__, debug_str)
 
   def __repr__(self):
-    return "%s(%r)" % (self.__class__.__name__, self._index)
-
-  # TODO(josh11b): Possibly make an accessor for _index for use by
-  # DistributionStrategy implementations.
+    devices = self.devices
+    assert len(self._values) == len(devices)
+    debug_repr = ",\n".join("  %d %s: %r" % (i, devices[i], self._values[i])
+                            for i in range(len(devices)))
+    return "%s:{\n%s\n}" % (self.__class__.__name__, debug_repr)
 
 
 # NOTE(josh11b,apassos): It would be great if we could inspect the values this was
@@ -126,7 +325,10 @@ class DistributedDelegate(DistributedValues):
   def __rmul__(self, o): return o * self.get()
   def __truediv__(self, o): return self.get() / o
   def __rtruediv__(self, o): return o / self.get()
-  def __floordiv__(self, o): return self.get() // o
+
+  def __floordiv__(self, o):
+    return self.get() // o
+
   def __rfloordiv__(self, o): return o // self.get()
   def __mod__(self, o): return self.get() % o
   def __rmod__(self, o): return o % self.get()
@@ -190,9 +392,10 @@ class Mirrored(DistributedDelegate):
 
   def _get_cross_replica(self):
     device = device_util.canonicalize(device_util.current())
-    if device in self._index:
-      return self._index[device]
-    return list(self._index.values())[0]
+    replica_id = self._device_map.replica_for_device(device)
+    if replica_id is None:
+      return self.primary
+    return self._values[replica_id]
 
   def _as_graph_element(self):
     obj = self.get()
@@ -207,6 +410,18 @@ def _assign_on_device(device, variable, tensor):
     return variable.assign(array_ops.identity(tensor))
 
 
+def _assert_strategy(strategy):
+  if not distribution_strategy_context.has_strategy():
+    raise RuntimeError(
+        'Need to be inside "with strategy.scope()" for %s' %
+        (strategy,))
+  current_strategy = distribution_strategy_context.get_strategy()
+  if current_strategy is not strategy:
+    raise RuntimeError(
+        "Mixing different tf.distribute.Strategy objects: %s is not %s" %
+        (current_strategy, strategy))
+
+
 DistributedVarOp = collections.namedtuple(
     "DistributedVarOp", ["name", "graph", "type"])
 
@@ -216,13 +431,14 @@ class DistributedVariable(DistributedDelegate):
   # TODO(josh11b): Support changing the set of variables if e.g. if new
   # devices are joining or a device is to leave.
 
-  def __init__(self, index):
-    # Child class must set self._primary_var before calling
-    # super(...).__init__(index).
-    self._common_name = self._primary_var.name.split(":")[0]
+  def __init__(self, strategy, device_map, values, logical_device=None):
+    self._distribute_strategy = strategy
+    super(DistributedVariable, self).__init__(
+        device_map, values, logical_device=logical_device)
+    self._common_name = self.primary.name.split(":")[0]
     # Use a weakref to make it easy to map from the contained values
     # to the container without introducing a reference cycle.
-    for v in six.itervalues(index):
+    for v in values:
       v._distributed_container = weakref.ref(self)  # pylint: disable=protected-access
     # tf.keras keeps track of variables initialized using this attribute. When
     # tf.keras gets the default session, it initializes all uninitialized vars.
@@ -235,7 +451,6 @@ class DistributedVariable(DistributedDelegate):
     # when restoring from a checkpoint, we may set the _initializer_op
     # property on the entire `DistributedVariable`.
     self._initializer_op = None
-    super(DistributedVariable, self).__init__(index)
 
   def is_initialized(self, name=None):
     """Identifies if all the component variables are initialized.
@@ -247,18 +462,14 @@ class DistributedVariable(DistributedDelegate):
       The op that evaluates to True or False depending on if all the
       component variables are initialized.
     """
-    # We have to cast the self._index.values() to a `list` because when we
-    # use `model_to_estimator` to run tf.keras models, self._index.values() is
-    # of type `dict_values` and not `list`.
-    values_list = list(self._index.values())
-    result = values_list[0].is_initialized()
+    result = self.primary.is_initialized()
     # We iterate through the list of values except the last one to allow us to
     # name the final `logical_and` op the same name that is passed by the user
     # to the `is_initialized` op. For distributed variables, the
     # `is_initialized` op is a `logical_and` op.
-    for v in values_list[1:-1]:
+    for v in self._values[1:-1]:
       result = math_ops.logical_and(result, v.is_initialized())
-    result = math_ops.logical_and(result, values_list[-1].is_initialized(),
+    result = math_ops.logical_and(result, self._values[-1].is_initialized(),
                                   name=name)
     return result
 
@@ -269,13 +480,34 @@ class DistributedVariable(DistributedDelegate):
     else:
       # return grouped ops of all the var initializations of component values of
       # the mirrored variable
-      init_op = control_flow_ops.group(
-          [v.initializer for v in self._index.values()])
+      init_op = control_flow_ops.group(tuple(
+          v.initializer for v in self._values))
     return init_op
 
+  def _get_closest(self):
+    """Return member in the same replica if possible, else the primary."""
+    replica_context = distribution_strategy_context.get_replica_context()
+    if replica_context:
+      return self._device_map.select_for_current_replica(
+          self._values, replica_context)
+    device = distribute_lib.get_update_device()
+    if device is None:
+      device = device_util.canonicalize(device_util.current())
+    replica_id = self._device_map.replica_for_device(device)
+    if replica_id is None:
+      return self.primary
+    return self._values[replica_id]
+
+  def initialized_value(self):
+    return self._get_closest().initialized_value()
+
+  @property
+  def initial_value(self):
+    return self._get_closest().initial_value
+
   @property
   def graph(self):
-    return self._primary_var.graph
+    return self.primary.graph
 
   @property
   def _shared_name(self):
@@ -283,44 +515,55 @@ class DistributedVariable(DistributedDelegate):
 
   @property
   def _unique_id(self):
-    return self._primary_var._unique_id   # pylint: disable=protected-access
+    return self.primary._unique_id   # pylint: disable=protected-access
+
+  @property
+  def _graph_key(self):
+    """Lets Optimizers know which graph this variable is from."""
+    return self.primary._graph_key  # pylint: disable=protected-access
 
   @property
   def name(self):
-    return self._primary_var.name
+    return self.primary.name
 
   @property
   def dtype(self):
-    return self._primary_var.dtype
+    return self.primary.dtype
 
   @property
   def shape(self):
-    return self._primary_var.shape
+    return self.primary.shape
+
+  @property
+  def distribute_strategy(self):
+    return self._distribute_strategy
 
   def get_shape(self):
-    return self._primary_var.get_shape()
+    return self.primary.get_shape()
 
   def to_proto(self, export_scope=None):
-    return self._primary_var.to_proto(export_scope=export_scope)
+    return self.primary.to_proto(export_scope=export_scope)
 
   @property
   def op(self):
     # We want cross-replica code that does some var.op.X calls
     # to work (even if the current device isn't in self.devices), but
     # other uses of var.op in a cross-replica context to fail.
-    if distribution_strategy_context.get_cross_replica_context():
-      return DistributedVarOp(self._primary_var.op.name,
-                              self._primary_var.op.graph,
-                              self._primary_var.op.type)
+    if distribution_strategy_context.in_cross_replica_context():
+      return DistributedVarOp(self.primary.op.name,
+                              self.primary.op.graph,
+                              self.primary.op.type)
     return self.get().op
 
   @property
   def _in_graph_mode(self):
-    return self._primary_var._in_graph_mode   # pylint: disable=protected-access
+    return self.primary._in_graph_mode   # pylint: disable=protected-access
 
   def read_value(self):
-    return distribution_strategy_context.get_distribution_strategy().read_var(
-        self)
+    return self._distribute_strategy.extended.read_var(self)
+
+  def value(self):
+    return self._get_closest().value()
 
   def _should_act_as_resource_variable(self):
     """Pass resource_variable_ops.is_resource_variable check."""
@@ -330,10 +573,43 @@ class DistributedVariable(DistributedDelegate):
 ops.register_dense_tensor_like_type(DistributedVariable)
 
 
+def _validate_colocate_extended(v, extended):
+  variable_strategy = v._distribute_strategy  # pylint: disable=protected-access
+  if variable_strategy.extended is not extended:
+    raise ValueError(
+        "`colocate_vars_with` must only be passed a variable created in this "
+        "tf.distribute.Strategy.scope(), not %s created in scope: %s" %
+        (v, variable_strategy))
+
+
+def validate_colocate_distributed_variable(v, extended):
+  if not isinstance(v, DistributedVariable):
+    raise ValueError(
+        "`colocate_vars_with` must only be passed a variable created in this "
+        "tf.distribute.Strategy.scope(), not: %r" % (v,))
+  _validate_colocate_extended(v, extended)
+
+
+def validate_colocate_tpu_variable(v, extended):
+  if not isinstance(v, TPUMirroredVariable):
+    raise ValueError(
+        "`colocate_vars_with` must only be passed a variable created in this "
+        "tf.distribute.Strategy.scope(), not: %r" % (v,))
+  _validate_colocate_extended(v, extended)
+
+
+def validate_colocate(v, extended):
+  if not hasattr(v, "_distribute_strategy"):
+    raise ValueError(
+        "`colocate_vars_with` must only be passed a variable created in this "
+        "tf.distribute.Strategy.scope(), not: %r" % (v,))
+  _validate_colocate_extended(v, extended)
+
+
 def _apply_aggregation(strategy, value, aggregation, destinations):
   if aggregation == vs.VariableAggregation.ONLY_FIRST_REPLICA:
-    return strategy.broadcast(strategy.unwrap(value)[0],
-                              destinations=destinations)
+    return strategy.extended.broadcast_to(strategy.unwrap(value)[0],
+                                          destinations=destinations)
   reduce_op = reduce_util.ReduceOp.from_variable_aggregation(aggregation)
   return strategy.extended.reduce_to(reduce_op, value, destinations)
 
@@ -348,19 +624,20 @@ class _MirroredSaveable(saver.BaseSaverBuilder.ResourceVariableSaveable):
   def restore(self, restored_tensors, restored_shapes):
     """Restore the same value into all variables."""
     tensor, = restored_tensors
-    return control_flow_ops.group([
-        _assign_on_device(d, v, tensor)
-        for d, v in six.iteritems(self._mirrored_variable._index)])  # pylint: disable=protected-access
+    return control_flow_ops.group(tuple(
+        _assign_on_device(v.device, v, tensor)
+        for v in self._mirrored_variable.values))
 
 
 class MirroredVariable(DistributedVariable, Mirrored,
-                       checkpointable.CheckpointableBase):
+                       trackable.Trackable):
   """Holds a map from device to variables whose values are kept in sync."""
 
-  def __init__(self, index, primary_var, aggregation):
-    self._primary_var = primary_var
+  def __init__(
+      self, strategy, device_map, values, aggregation, logical_device=None):
+    super(MirroredVariable, self).__init__(
+        strategy, device_map, values, logical_device=logical_device)
     self._aggregation = aggregation
-    super(MirroredVariable, self).__init__(index)
 
   # The arguments to update() are automatically unwrapped so the update()
   # function would normally see regular variables, not MirroredVariables.
@@ -369,8 +646,9 @@ class MirroredVariable(DistributedVariable, Mirrored,
   # update_non_slot() function (like OptimizerV2._finish), which can
   # update several non-slot variables in one call.
   def _assign_func(self, *args, **kwargs):
+    _assert_strategy(self._distribute_strategy)
     f = kwargs.pop("f")
-    if distribution_strategy_context.get_cross_replica_context():
+    if distribution_strategy_context.in_cross_replica_context():
       update_device = distribute_lib.get_update_device()
       if update_device is not None:
         # We are calling an assign function on the mirrored variable in an
@@ -379,11 +657,11 @@ class MirroredVariable(DistributedVariable, Mirrored,
         return f(v, *args, **kwargs)
 
       # We are calling assign on the mirrored variable in cross replica context,
-      # use update to update the variable.
-      strategy = distribution_strategy_context.get_distribution_strategy()
-      return strategy.update(self, f, *args, **kwargs)
+      # use `strategy.extended.update()` to update the variable.
+      return self._distribute_strategy.extended.update(
+          self, f, args=args, kwargs=kwargs)
     else:
-      _assert_replica_context()
+      _assert_replica_context(self._distribute_strategy)
       # We are calling an assign function on the mirrored variable in replica
       # context.
       # We reduce the value we want to assign/add/sub. More details about how we
@@ -396,7 +674,8 @@ class MirroredVariable(DistributedVariable, Mirrored,
 
       def merge_fn(strategy, value, *other_args, **other_kwargs):
         v = _apply_aggregation(strategy, value, self._aggregation, self)
-        return strategy.update(self, f, v, *other_args, **other_kwargs)
+        return strategy.extended.update(
+            self, f, args=(v,) + other_args, kwargs=other_kwargs)
 
       return distribution_strategy_context.get_replica_context().merge_call(
           merge_fn, args=args, kwargs=kwargs)
@@ -419,18 +698,19 @@ class MirroredVariable(DistributedVariable, Mirrored,
 
   def _get_cross_replica(self):
     device = device_util.canonicalize(device_util.current())
-    if device in self._index:
-      return array_ops.identity(self._index[device])
-    return array_ops.identity(self._primary_var)
+    replica_id = self._device_map.replica_for_device(device)
+    if replica_id is None:
+      return array_ops.identity(self.primary)
+    return array_ops.identity(self._values[replica_id])
 
   def _as_graph_element(self):
     # pylint: disable=protected-access
-    if distribution_strategy_context.get_cross_replica_context():
-      return self._primary_var._as_graph_element()
+    if distribution_strategy_context.in_cross_replica_context():
+      return self.primary._as_graph_element()
     return self.get()._as_graph_element()
 
   def _gather_saveables_for_checkpoint(self):
-    """Overrides CheckpointableBase method.
+    """Overrides Trackable method.
 
     This allows both name-based and object-based save and restore of
     MirroredVariables.
@@ -439,15 +719,15 @@ class MirroredVariable(DistributedVariable, Mirrored,
       A dictionary mapping attribute names to `SaveableObject` factories.
     """
     def _saveable_factory(name=self._common_name):
-      return _MirroredSaveable(self, self._primary_var, name)
-    return {checkpointable.VARIABLE_VALUE_KEY: _saveable_factory}
+      return _MirroredSaveable(self, self.primary, name)
+    return {trackable.VARIABLE_VALUE_KEY: _saveable_factory}
 
 
 # Register a conversion function which reads the value of the variable,
 # allowing instances of the class to be used as tensors.
 def _tensor_conversion_mirrored(var, dtype=None, name=None, as_ref=False):
   # Try to avoid assignments to and other mutations of MirroredVariable
-  # state except through a DistributionStrategy.update() call.
+  # state except through a DistributionStrategy.extended.update() call.
   assert not as_ref
   return ops.internal_convert_to_tensor(
       var.get(), dtype=dtype, name=name, as_ref=as_ref)
@@ -472,21 +752,27 @@ def _enclosing_tpu_context():
 # tpu.replicate() because it assumes that you're in a device context where you
 # can operate on a single version of the variable, but a tpu.replicate()
 # operates on all variables and is replicated during a rewrite pass.
-class TPUMirroredVariable(checkpointable.CheckpointableBase):
+class TPUMirroredVariable(trackable.Trackable):
   """Holds a map from device to TPU variables whose values are kept in sync."""
 
-  def __init__(self, index, primary_var, aggregation):
+  def __init__(
+      self, strategy, device_map, values, aggregation, logical_device=None):
+    assert isinstance(device_map, DeviceMap)
+    self._distribute_strategy = strategy
+    self._device_map = device_map
+    self._values = tuple(values)
+    if logical_device is None:
+      logical_device = device_map.logical_device_from_values(self._values)
+    self._logical_device = logical_device
+
     # Use a weakref to make it easy to map from the contained values
     # to the container without introducing a reference cycle.
-    for v in six.itervalues(index):
+    for v in self._values:
       v._mirrored_container = weakref.ref(self)  # pylint: disable=protected-access
-    self._index = {device_util.canonicalize(key): value
-                   for key, value in six.iteritems(index)}
-    self._primary_var = primary_var
-    self._common_name = self._primary_var.name.split(":")[0]
+    self._common_name = self.primary.name.split(":")[0]
     self._aggregation = aggregation
     # Needed for GradientTape
-    self._trainable = self._primary_var.trainable
+    self._trainable = self.primary.trainable
     # Typically like `DistributedVariable`, a `TPUMirroredVariable`'s
     # initializer is composed of the initializers of the components variables.
     # However, in some cases, such as when restoring from a checkpoint, we may
@@ -498,19 +784,40 @@ class TPUMirroredVariable(checkpointable.CheckpointableBase):
     if device is None:
       replica_context = distribution_strategy_context.get_replica_context()
       if replica_context:
-        # TODO(josh11b): support model parallelism better here
-        device = replica_context.devices[0]
+        return self._device_map.select_for_current_replica(
+            self._values, replica_context)
       else:
         device = distribute_lib.get_update_device()
         if device is None:
           return self._get_cross_replica()
     device = device_util.canonicalize(device)
-    try:
-      return self._index[device]
-    except KeyError as e:
-      six.raise_from(
-          ValueError("Device %s not found in %s (current device %s)" %
-                     (device, self._index.keys(), device_util.current())), e)
+    return self._device_map.select_for_device(self._values, device)
+
+  @property
+  def primary(self):
+    """Returns a representative component."""
+    return self._values[0]
+
+  @property
+  def devices(self):
+    return self._device_map.logical_to_actual_devices(self._logical_device)
+
+  @property
+  def logical_device(self):
+    return self._logical_device
+
+  @property
+  def device_map(self):
+    return self._device_map
+
+  # TODO(josh11b): Replace unwrap with this?
+  @property
+  def values(self):
+    return self._values
+
+  @property
+  def distribute_strategy(self):
+    return self._distribute_strategy
 
   # pylint: disable=multiple-statements
   def __add__(self, o): return self.read_value() + o
@@ -570,29 +877,38 @@ class TPUMirroredVariable(checkpointable.CheckpointableBase):
       # See https://docs.python.org/3/library/constants.html#NotImplemented
       return NotImplemented
 
+  def __str__(self):
+    devices = self.devices
+    debug_str = ",\n".join("  %d %s: %s" % (i, devices[i], self._values[i])
+                           for i in range(len(devices)))
+    return "%s:{\n%s\n}" % (self.__class__.__name__, debug_str)
+
+  def __repr__(self):
+    devices = self.devices
+    debug_repr = ",\n".join("  %d %s: %r" % (i, devices[i], self._values[i])
+                            for i in range(len(devices)))
+    return "%s:{\n%s\n}" % (self.__class__.__name__, debug_repr)
+
   @property
   def handle(self):
     # If we're in a tpu.rewrite(), return the replicated handle.
     tpu_context = _enclosing_tpu_context()
     if tpu_context is not None:
       return tpu_context.get_replicated_var_handle(
-          self._common_name, nest.flatten(self._index))
+          self._common_name, self._values)
 
     device = distribute_lib.get_update_device()
     if device is None:
-      return self._primary_var.handle
-    device = device_util.canonicalize(device)
-    try:
-      return self._index[device].handle
-    except KeyError as e:
-      six.raise_from(
-          ValueError("Device %s not found in %s (current device %s)" %
-                     (device, self._index.keys(), device_util.current())), e)
+      return self.primary.handle
+    return self._get(device=device).handle
 
   @property
   def device(self):
     return self._get().device
 
+  def eval(self, session=None):
+    return self.primary.eval(session)
+
   # The arguments to update() are automatically unwrapped so the update()
   # function would normally see regular variables, not MirroredVariables.
   # However, the update function can still operate on wrapped MirroredVariables
@@ -600,15 +916,12 @@ class TPUMirroredVariable(checkpointable.CheckpointableBase):
   # update_non_slot() function (like OptimizerV2._finish), which can
   # update several non-slot variables in one call.
   def _assign_func(self, *args, **kwargs):
-    strategy = distribution_strategy_context.get_distribution_strategy()
-    if strategy.__class__.__name__ != "TPUStrategy":
-      raise ValueError("You may only assign to a TPUMirroredVariable within a "
-                       "TPUStrategy.")
+    _assert_strategy(self._distribute_strategy)
     f = kwargs.pop("f")
-    if distribution_strategy_context.get_cross_replica_context():
+    if distribution_strategy_context.in_cross_replica_context():
       if _enclosing_tpu_context() is not None:
-        return distribution_strategy_context.get_distribution_strategy().update(
-            self, f, *args, **kwargs)
+        return self._distribute_strategy.extended.update(
+            self, f, args=args, kwargs=kwargs)
 
       update_device = distribute_lib.get_update_device()
       # We are calling update on the mirrored variable in cross replica context.
@@ -618,10 +931,10 @@ class TPUMirroredVariable(checkpointable.CheckpointableBase):
         v = self._get(device=update_device)
         return f(v, *args, **kwargs)
 
-      return distribution_strategy_context.get_distribution_strategy().update(
-          self, f, *args, **kwargs)
+      return self._distribute_strategy.extended.update(
+          self, f, args=args, kwargs=kwargs)
     else:
-      _assert_replica_context()
+      _assert_replica_context(self._distribute_strategy)
       # We are calling an assign function on the mirrored variable in replica
       # context.
       # We reduce the value we want to assign/add/sub. More details about how we
@@ -634,7 +947,8 @@ class TPUMirroredVariable(checkpointable.CheckpointableBase):
 
       def merge_fn(strategy, value, *other_args, **other_kwargs):
         v = _apply_aggregation(strategy, value, self._aggregation, self)
-        return strategy.update(self, f, v, *other_args, **other_kwargs)
+        return strategy.extended.update(
+            self, f, args=(v,) + other_args, kwargs=other_kwargs)
 
       return distribution_strategy_context.get_replica_context().merge_call(
           merge_fn, args=args, kwargs=kwargs)
@@ -669,7 +983,8 @@ class TPUMirroredVariable(checkpointable.CheckpointableBase):
     return self._read_variable_op()
 
   def assign_sub(self, *args, **kwargs):
-    def assign_sub_fn(var, delta, **kw):
+    def assign_sub_fn(var, delta, *ar, **kw):
+      del ar
       name = kw.pop("name", None)
       read_value = kw.pop("read_value", True)
       with self._handle_graph(var.handle):
@@ -683,7 +998,8 @@ class TPUMirroredVariable(checkpointable.CheckpointableBase):
     return self._assign_func(f=assign_sub_fn, *args, **kwargs)
 
   def assign_add(self, *args, **kwargs):
-    def assign_add_fn(var, delta, **kw):
+    def assign_add_fn(var, delta, *ar, **kw):
+      del ar
       name = kw.pop("name", None)
       read_value = kw.pop("read_value", True)
       with self._handle_graph(var.handle):
@@ -697,7 +1013,8 @@ class TPUMirroredVariable(checkpointable.CheckpointableBase):
     return self._assign_func(f=assign_add_fn, *args, **kwargs)
 
   def assign(self, *args, **kwargs):
-    def assign_fn(var, value, **kw):
+    def assign_fn(var, value, *ar, **kw):
+      del ar
       name = kw.pop("name", None)
       read_value = kw.pop("read_value", True)
       with self._handle_graph(var.handle):
@@ -723,13 +1040,13 @@ class TPUMirroredVariable(checkpointable.CheckpointableBase):
     if self._initializer_op:
       init_op = self._initializer_op
     else:
-      init_op = control_flow_ops.group(
-          [v.initializer for v in self._index.values()])
+      init_op = control_flow_ops.group(tuple(
+          v.initializer for v in self._values))
     return init_op
 
   @property
   def graph(self):
-    return self._primary_var.graph
+    return self.primary.graph
 
   @property
   def _shared_name(self):
@@ -737,40 +1054,41 @@ class TPUMirroredVariable(checkpointable.CheckpointableBase):
 
   @property
   def _unique_id(self):
-    return self._primary_var._unique_id  # pylint: disable=protected-access
+    return self.primary._unique_id  # pylint: disable=protected-access
 
   @property
   def name(self):
-    return self._primary_var.name
+    return self.primary.name
 
   @property
   def dtype(self):
-    return self._primary_var.dtype
+    return self.primary.dtype
 
   @property
   def shape(self):
-    return self._primary_var.shape
+    return self.primary.shape
 
   def get_shape(self):
-    return self._primary_var.get_shape()
+    return self.primary.get_shape()
 
   def to_proto(self, export_scope=None):
-    return self._primary_var.to_proto(export_scope=export_scope)
+    return self.primary.to_proto(export_scope=export_scope)
 
   def _get_cross_replica(self):
     device = device_util.canonicalize(device_util.current())
-    if device in self._index:
-      return self._index[device]
-    return self._primary_var
+    replica = self._device_map.replica_for_device(device)
+    if replica is None:
+      return self.primary
+    return self._values[replica]
 
   def _as_graph_element(self):
     # pylint: disable=protected-access
-    if distribution_strategy_context.get_cross_replica_context():
-      return self._primary_var._as_graph_element()
+    if distribution_strategy_context.in_cross_replica_context():
+      return self.primary._as_graph_element()
     return self._read_variable_op()
 
   def _gather_saveables_for_checkpoint(self):
-    """Overrides CheckpointableBase method.
+    """Overrides Trackable method.
 
     This allows both name-based and object-based save and restore of
     MirroredVariables.
@@ -779,8 +1097,8 @@ class TPUMirroredVariable(checkpointable.CheckpointableBase):
       A dictionary mapping attribute names to `SaveableObject` factories.
     """
     def _saveable_factory(name=self._common_name):
-      return _MirroredSaveable(self, self._primary_var, name)
-    return {checkpointable.VARIABLE_VALUE_KEY: _saveable_factory}
+      return _MirroredSaveable(self, self.primary, name)
+    return {trackable.VARIABLE_VALUE_KEY: _saveable_factory}
 
   def _should_act_as_resource_variable(self):
     """Pass resource_variable_ops.is_resource_variable check."""
@@ -789,23 +1107,23 @@ class TPUMirroredVariable(checkpointable.CheckpointableBase):
   # Needed to pass ResourceVariable checks.
   @property
   def op(self):
-    return self._primary_var.op
+    return self.primary.op
 
   # pylint: disable=protected-access
   @property
   def _save_slice_info(self):
-    return self._primary_var._save_slice_info
+    return self.primary._save_slice_info
 
   def _get_save_slice_info(self):
-    return self._primary_var._get_save_slice_info()
+    return self.primary._get_save_slice_info()
 
   def _set_save_slice_info(self, save_slice_info):
-    return self._primary_var._set_save_slice_info(save_slice_info)
+    return self.primary._set_save_slice_info(save_slice_info)
   # pylint: enable=protected-access
 
   @property
   def _in_graph_mode(self):
-    return self._primary_var._in_graph_mode   # pylint: disable=protected-access
+    return self.primary._in_graph_mode   # pylint: disable=protected-access
 
   def _dense_var_to_tensor(self, dtype=None, name=None, as_ref=False):
     """Converts a variable to a tensor."""
@@ -832,18 +1150,14 @@ class TPUMirroredVariable(checkpointable.CheckpointableBase):
     """
     # TODO(jhseu): Do we need TPU context implementation?
 
-    # We have to cast the self._index.values() to a `list` because when we
-    # use `model_to_estimator` to run tf.keras models, self._index.values() is
-    # of type `dict_values` and not `list`.
-    values_list = nest.flatten(self._index)
-    result = values_list[0].is_initialized()
+    result = self.primary.is_initialized()
     # We iterate through the list of values except the last one to allow us to
     # name the final `logical_and` op the same name that is passed by the user
     # to the `is_initialized` op. For distributed variables, the
     # `is_initialized` op is a `logical_and` op.
-    for v in values_list[1:-1]:
+    for v in self._values[1:-1]:
       result = math_ops.logical_and(result, v.is_initialized())
-    result = math_ops.logical_and(result, values_list[-1].is_initialized(),
+    result = math_ops.logical_and(result, self._values[-1].is_initialized(),
                                   name=name)
     return result
 
@@ -867,8 +1181,9 @@ class _ReplicaLocalSaveable(saver.BaseSaverBuilder.SaveableObject):
     # We use a callable so that we don't have to evaluate this expression
     # in the case where we are trying to restore instead of save.
     def tensor():
-      return distribution_strategy_context.get_distribution_strategy().read_var(
-          replica_local_variable)
+      strategy = replica_local_variable._distribute_strategy  # pylint: disable=protected-access
+      return strategy.extended.read_var(replica_local_variable)
+
     spec = saver.BaseSaverBuilder.SaveSpec(
         tensor=tensor,
         slice_spec="",
@@ -882,42 +1197,47 @@ class _ReplicaLocalSaveable(saver.BaseSaverBuilder.SaveableObject):
     return self._replica_local_variable.assign(tensor)
 
 
-def _assert_replica_context():
-  if not distribution_strategy_context.get_replica_context():
+def _assert_replica_context(strategy):
+  replica_context = distribution_strategy_context.get_replica_context()
+  if not replica_context:
+    raise RuntimeError(
+        "Replica-local variables may only be assigned in a replica context.")
+  if replica_context.strategy is not strategy:
     raise RuntimeError(
         "Replica-local variables may only be assigned in a replica context.")
 
 
+# TODO(josh11b): Rename this to SyncOnReadVariable.
 class ReplicaLocalVariable(DistributedVariable, PerReplica,
-                           checkpointable.CheckpointableBase):
+                           trackable.Trackable):
   """Holds a map from device to variables whose values are reduced on save."""
 
-  def __init__(self, index, primary_var, aggregation):
-    self._primary_var = primary_var
+  def __init__(
+      self, strategy, device_map, values, aggregation, logical_device=None):
     self._aggregation = aggregation
-    super(ReplicaLocalVariable, self).__init__(index)
+    super(ReplicaLocalVariable, self).__init__(
+        strategy, device_map, values, logical_device=logical_device)
 
   def assign_sub(self, *args, **kwargs):
-    _assert_replica_context()
+    _assert_replica_context(self._distribute_strategy)
     return self.get().assign_sub(*args, **kwargs)
 
   def assign_add(self, *args, **kwargs):
-    _assert_replica_context()
+    _assert_replica_context(self._distribute_strategy)
     return self.get().assign_add(*args, **kwargs)
 
   def assign(self, *args, **kwargs):
-    if distribution_strategy_context.get_cross_replica_context():
+    if distribution_strategy_context.in_cross_replica_context():
       # To preserve the sum across save and restore, we have to divide the
       # total across all devices when restoring a variable that was summed
       # when saving.
       tensor = args[0]
       if self._aggregation == vs.VariableAggregation.SUM:
         tensor *= 1. / len(self.devices)
-      return control_flow_ops.group(
-          [_assign_on_device(d, v, tensor)
-           for d, v in six.iteritems(self._index)])
+      return control_flow_ops.group(tuple(
+          _assign_on_device(v.device, v, tensor) for v in self._values))
     else:
-      _assert_replica_context()
+      _assert_replica_context(self._distribute_strategy)
       return self.get().assign(*args, **kwargs)
 
   @property
@@ -926,22 +1246,18 @@ class ReplicaLocalVariable(DistributedVariable, PerReplica,
 
   def _get_cross_replica(self):
     if self._aggregation == vs.VariableAggregation.ONLY_FIRST_REPLICA:
-      return self._primary_var
-    all_components = tuple(self._index.values())
-    # TODO(josh11b): Use a strategy-specific method.
-    total = math_ops.add_n(all_components)
-    if self._aggregation == vs.VariableAggregation.MEAN:
-      return total * (1./ len(all_components))
-    return total
+      return self.primary
+    return self._distribute_strategy.reduce(
+        reduce_util.ReduceOp.from_variable_aggregation(self.aggregation), self)
 
   def _as_graph_element(self):
     # pylint: disable=protected-access
-    if distribution_strategy_context.get_cross_replica_context():
+    if distribution_strategy_context.in_cross_replica_context():
       return self._get_cross_replica()
     return self.get()._as_graph_element()
 
   def _gather_saveables_for_checkpoint(self):
-    """Overrides CheckpointableBase method.
+    """Overrides Trackable method.
 
     This allows both name-based and object-based save and restore of
     ReplicaLocalVariables.
@@ -951,7 +1267,7 @@ class ReplicaLocalVariable(DistributedVariable, PerReplica,
     """
     def _saveable_factory(name=self._common_name):
       return _ReplicaLocalSaveable(self, name)
-    return {checkpointable.VARIABLE_VALUE_KEY: _saveable_factory}
+    return {trackable.VARIABLE_VALUE_KEY: _saveable_factory}
 
 
 # Register a conversion function for ReplicaLocalVariable which allows as_ref to
@@ -965,30 +1281,27 @@ ops.register_tensor_conversion_function(ReplicaLocalVariable,
                                         _tensor_conversion_replica_local)
 
 
-def _devices_match(d1, d2):
-  return device_util.canonicalize(d1) == device_util.canonicalize(d2)
-
-
-def regroup(per_replica, wrap_class=PerReplica):
-  """Makes device->nest map into a nest of PerReplica/Mirrored values."""
-  items = list(per_replica.items())
-  assert items
-  v0 = items[0][1]  # First value
+def regroup(device_map, values, wrap_class=PerReplica):
+  """Makes a nest per-replica into a nest of PerReplica/Mirrored values."""
+  assert isinstance(device_map, DeviceMap)
+  assert len(values) == device_map.num_replicas_in_graph
+  v0 = values[0]
 
   if isinstance(v0, list):
-    for _, v in items[1:]:
+    for v in values[1:]:
       assert isinstance(v, list)
       assert len(v) == len(v0), ("len(v) == %d, len(v0) == %d, v: %s, v0: %s" %
                                  (len(v), len(v0), v, v0))
-    return [regroup({k: v[i] for k, v in items}, wrap_class)
+    return [regroup(device_map, tuple(v[i] for v in values), wrap_class)
             for i in range(len(v0))]
 
   if isinstance(v0, tuple):
-    for _, v in items[1:]:
+    for v in values[1:]:
       assert isinstance(v, tuple)
       assert len(v) == len(v0)
-    regrouped_tuple = tuple(regroup({k: v[i] for k, v in items}, wrap_class)
-                            for i in range(len(v0)))
+    regrouped_tuple = tuple(
+        regroup(device_map, tuple(v[i] for v in values), wrap_class)
+        for i in range(len(v0)))
     if hasattr(v0, "_fields"):
       # This tuple is in fact a namedtuple! Create a new namedtuple instance
       # and initialize it with the regrouped values:
@@ -999,15 +1312,16 @@ def regroup(per_replica, wrap_class=PerReplica):
 
   if isinstance(v0, dict):
     v0keys = set(v0.keys())
-    for _, v in items[1:]:
-      assert isinstance(v, dict)
-      assert set(v.keys()) == v0keys
-    return {key: regroup({k: v[key] for k, v in items}, wrap_class)
+    for v in values[1:]:
+      assert isinstance(v, dict), ("v[0]: %r  v[i]: %r" % (v0, v))
+      assert set(v.keys()) == v0keys, ("v[0].keys: %s  v[i].keys: %s" %
+                                       (v0keys, set(v.keys())))
+    return {key: regroup(device_map, tuple(v[key] for v in values), wrap_class)
             for key in v0keys}
 
   # If exactly the same object across all devices, return it unwrapped.
   same_id = True
-  for _, v in items[1:]:
+  for v in values[1:]:
     if v is not v0:
       same_id = False
       break
@@ -1036,25 +1350,26 @@ def regroup(per_replica, wrap_class=PerReplica):
   if hasattr(v0, "_distributed_container"):
     # pylint: disable=protected-access
     assert not isinstance(v0, MirroredVariable), (
-        "ids = %s, items = %s" % ([id(v[1]) for v in items], items))
-    assert _devices_match(v0.device, items[0][0]), (
-        "v0.device = %s, items = %s" % (v0.device, items))
+        "ids = %s, values = %s" % ([id(v) for v in values], values))
+    assert device_map.is_device_in_replica(v0.device, 0), (
+        "v0.device = %s, device_map = %s" % (v0.device, device_map))
     distributed_container = v0._distributed_container()
     assert distributed_container is not None
-    for d, v in items[1:]:
-      assert _devices_match(v.device, d), (
-          "v.device = %s, d = %s, items = %s" % (v.device, d, items))
+    for r, v in enumerate(values[1:]):
+      assert device_map.is_device_in_replica(v.device, r + 1), (
+          "v.device = %s, r = %d, device_map = %s" %
+          (v.device, r + 1, device_map))
       assert distributed_container is v._distributed_container()
     return distributed_container
   # pylint: enable=protected-access
 
-  return wrap_class(per_replica)
+  return wrap_class(device_map, values)
 
 
-def select_device(device, structured):
-  """Specialize a nest of regular & per-replica values for one device."""
+def select_replica(replica_id, structured):
+  """Specialize a nest of regular & per-replica values for one replica."""
   def _get(x):
-    return x.get(device) if isinstance(x, DistributedValues) else x
+    return x.values[replica_id] if isinstance(x, DistributedValues) else x
 
   return nest.map_structure(_get, structured)
 
@@ -1074,9 +1389,11 @@ def select_device_mirrored(device, structured):
   return nest.map_structure(_get_mirrored, structured)
 
 
-def update_regroup(extended, updates, group):
+def update_regroup(extended, device_map, updates, group):
   """Regroup for an update, with dependencies to ensure all updates execute."""
-  regrouped = regroup(updates, Mirrored)
+  # TODO(josh11b): Replace "Mirrored" here with a function that does the following
+  # so we can avoid all these nest operations.
+  regrouped = regroup(device_map, updates, Mirrored)
   if not group:
     return nest.map_structure(extended._unwrap, regrouped)  # pylint: disable=protected-access
   grouped_flat = []
@@ -1086,635 +1403,17 @@ def update_regroup(extended, updates, group):
       if u.is_tensor_like:
         # Make sure we run all updates. Without this, something like
         # session.run(extended.update(...)) may only update one replica.
-        index = {}
+        values = []
         for d in u.devices:
           with ops.device(d), ops.control_dependencies([g]):
-            index[d] = array_ops.identity(u.get(d))
-        g = Mirrored(index)
+            values.append(array_ops.identity(u.get(d)))
+        g = Mirrored(u.device_map, values)
     else:
       g = u
     grouped_flat.append(g)
   return nest.pack_sequence_as(regrouped, grouped_flat)
 
 
-class PerReplicaDataIterator(object):
-  """An iterator (like `tf.data.Iterator`) into a `PerReplicaDataset`."""
-
-  def __init__(self, iterator, devices, prefetch_on_device=None):
-    self._iterator = iterator
-    self._devices = devices
-    self._prefetch_on_device = prefetch_on_device
-
-  @property
-  def initializer(self):
-    return self._iterator.initializer
-
-  def get_next(self, name=None):
-    """Scatter the input across devices."""
-    if self._prefetch_on_device:
-      data_list = self._iterator.get_next()
-      index = dict(zip(self._devices, data_list))
-    else:
-      batch = self._iterator.get_next(name=name)
-      index = {}
-      def get_ith(i):
-        return lambda x: x[i]
-
-      for i, d in enumerate(self._devices):
-        index[d] = nest.map_structure(get_ith(i), batch)
-        if context.executing_eagerly():
-          with ops.device(d):
-            index[d] = nest.map_structure(array_ops.identity, index[d])
-
-    return regroup(index)
-
-  @property
-  def output_classes(self):
-    return self._iterator.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._iterator.output_shapes
-
-  @property
-  def output_types(self):
-    return self._iterator.output_types
-
-
-class PerReplicaDataset(object):
-  """Like `tf.data.Dataset` split devices, producing `PerReplica` data."""
-
-  def __init__(self, dataset, devices, prefetch_on_device=None):
-    self._devices = devices
-
-    # Default to using prefetching in graph mode, unless specified.
-    # TODO(rohanj): Enable prefetching in eager mode.
-    self._prefetch_on_device = prefetch_on_device
-    if self._prefetch_on_device is None:
-      self._prefetch_on_device = not context.executing_eagerly()
-    assert not (self._prefetch_on_device and context.executing_eagerly()), (
-        "Prefetching is only supported in graph mode currently")
-
-    self._dataset = dataset
-    if not self._prefetch_on_device:
-      # TODO(priyag): If dropping remainder is not appropriate, find another
-      # approach to distributing the dataset when not possible to divide evenly.
-      # Possibly not an issue when we start using PartitionedDataset.
-      self._dataset = dataset.batch(len(devices), drop_remainder=True)
-
-  def make_one_shot_iterator(self):
-    """Get a one time use iterator for the distributed PerReplicaDataset."""
-    # Graph mode with one shot iterator is disabled.
-    if not context.executing_eagerly():
-      raise ValueError("Cannot create a one shot iterator. Please use "
-                       "`make_initializable_iterator()` instead.")
-    # Eager mode prefetching would error out in constructor. Only remaining
-    # case is non-prefetching in eager mode. We delegate to
-    # PerReplicaDataIterator to handle that case.
-    dataset_iterator = dataset_ops.make_one_shot_iterator(self._dataset)
-    return PerReplicaDataIterator(
-        dataset_iterator, self._devices, prefetch_on_device=False)
-
-  def make_initializable_iterator(self):
-    """Get an initializable iterator for the distributed PerReplicaDataset."""
-    # Eager mode generates already initialized iterators. Hence we cannot create
-    # an initializable iterator.
-    if context.executing_eagerly():
-      raise ValueError("Cannot create initializable iterator in Eager mode. "
-                       "Please use `make_one_shot_iterator` instead.")
-    if self._prefetch_on_device:
-      dataset_iterator = multi_device_iterator_ops.MultiDeviceIterator(
-          self._dataset, self._devices)
-    else:
-      dataset_iterator = dataset_ops.make_initializable_iterator(self._dataset)
-    return PerReplicaDataIterator(
-        dataset_iterator,
-        self._devices,
-        prefetch_on_device=self._prefetch_on_device)
-
-
-class MultiWorkerDataIterator(object):
-  """An iterator (like `tf.data.Iterator`) into a `MultiWorkerDataset`."""
-
-  def __init__(self, iterators, worker_device_pairs):
-    """Initialize the MultiWorkerDataIterator object.
-
-    Args:
-      iterators: a list of worker, iterator pairs.
-      worker_device_pairs: a list of (worker's devices, a list of
-        devices that belong to this worker) pairs.
-
-    Raises:
-      ValueError: if iterators and worker_device_pairs are not compatible.
-    """
-    if [d for d, _ in iterators] != [d for d, _ in worker_device_pairs]:
-      raise ValueError("iterators and worker_device_pairs are not compatible.")
-    self._workers = [d for d, _ in iterators]
-    self._iterators = [i for _, i in iterators]
-    self._worker_devices = [l for _, l in worker_device_pairs]
-
-  @property
-  def initializer(self):
-    return control_flow_ops.group(
-        [iterator.initializer for iterator in self._iterators])
-
-  def get_iterator(self, worker):
-    for i, w in enumerate(self._workers):
-      if worker == w:
-        return self._iterators[i]
-    return None
-
-  @property
-  def output_shapes(self):
-    return self._iterators[0].output_shapes
-
-  @property
-  def output_types(self):
-    return self._iterators[0].output_types
-
-  def get_next(self, name=None):
-    """Scatter the input across hosts and devices."""
-    index = {}
-    worker_info = zip(self._workers, self._iterators, self._worker_devices)
-    for worker, iterator, worker_devices in worker_info:
-      if name is not None:
-        d = tf_device.DeviceSpec.from_string(worker)
-        new_name = "%s_%s_%d" % (name, d.job, d.task)
-      else:
-        new_name = None
-      with ops.device(worker):
-        data_per_worker = iterator.get_next(name=new_name)
-
-      # Ungroup these per-replica value so as to get a flat map from devices to
-      # values.
-      for d in worker_devices:
-        v = select_device(d, data_per_worker)
-        if d in index:
-          raise ValueError("Duplicated devices in worker_device_pairs: %r" % v)
-        index[d] = v
-
-    return regroup(index)
-
-
-class MultiWorkerDataset(object):
-  """Like a `tf.data.Dataset` that distributes data to different workers.
-
-  Each worker gets one shard of the input dataset. This currently does not work
-  in eager mode.
-  """
-
-  def __init__(self, dataset_fn, worker_device_pairs, prefetch_on_device=None,
-               auto_shard=False):
-    """Initialize the MultiWorkerDataset object.
-
-    Args:
-      dataset_fn: a function or a list of functions that returns a
-        `tf.data.Dataset`.
-      worker_device_pairs: a list of (worker, list of devices on that worker)
-        pairs; it must have same length with `dataset_fn` if `dataset_fn` is a
-        list.
-      prefetch_on_device: whether to prefetch to devices.
-      auto_shard: whether to auto-shard the dataset.
-    """
-    if isinstance(dataset_fn, list):
-      if len(dataset_fn) != len(worker_device_pairs):
-        raise ValueError("If `dataset_fn` is a list, it must have same length "
-                         "as `worker_device_pairs`")
-      if auto_shard:
-        raise ValueError(
-            "If `dataset_fn` is a list, `auto_shard` is not supported.")
-    self._worker_device_pairs = worker_device_pairs
-    self._datasets = []
-    # TODO(yuefengz, priyag): support different set of jobs for input
-    # processing.
-    for i, (worker, worker_devices) in enumerate(worker_device_pairs):
-      with ops.device(worker):
-        if isinstance(dataset_fn, list):
-          worker_input = dataset_fn[i]()
-        else:
-          worker_input = dataset_fn()
-          if auto_shard:
-            worker_input = input_ops.auto_shard_dataset(
-                worker_input, len(worker_device_pairs), i)
-        dataset = PerReplicaDataset(
-            worker_input, worker_devices, prefetch_on_device=prefetch_on_device)
-        self._datasets.append((worker, dataset))
-
-  def make_one_shot_iterator(self):
-    iterators = []
-    for worker, dataset in self._datasets:
-      with ops.device(worker):
-        iterators.append((worker, dataset_ops.make_one_shot_iterator(dataset)))
-    return MultiWorkerDataIterator(iterators, self._worker_device_pairs)
-
-  def make_initializable_iterator(self):
-    iterators = []
-    for worker, dataset in self._datasets:
-      with ops.device(worker):
-        iterators.append(
-            (worker, dataset_ops.make_initializable_iterator(dataset)))
-    return MultiWorkerDataIterator(iterators, self._worker_device_pairs)
-
-
-class InputIterator(object):
-  """An input iterator, intended to be passed to `DistributionStrategy.run`."""
-
-  def get_next(self):
-    """Returns the next inputs for all replicas."""
-    raise NotImplementedError("must be implemented in descendants")
-
-  def initialize(self):
-    """Initialize the underlying input dataset, when applicable.
-
-    In eager mode, this will create a new iterator and return it.
-    In graph mode, this will initialize the same underlying iterator(s).
-
-    Users are required to call this if
-    - This iterator was returned from a call to `make_input_fn_iterator` with an
-      input function that returns a dataset.
-    - Or this iterator was returned from a call to `make_dataset_iterator`.
-
-    Returns:
-      A list of initialization ops to be executed.
-    """
-    raise NotImplementedError("must be implemented in descendants")
-
-
-class InputIteratorImpl(InputIterator):
-  """Common implementation for all input iterators."""
-
-  def __init__(self, worker_device_pairs, iterators):
-    if not worker_device_pairs:
-      raise ValueError("Should have at least one worker for input iterator.")
-
-    self._iterators = iterators
-    self._worker_device_pairs = worker_device_pairs
-    self._is_eager = context.executing_eagerly()
-
-  def get_next(self, name=None):
-    """Returns the next input from the iterator for all replicas."""
-    assert self._is_eager == context.executing_eagerly(), (
-        "Iterator should be created and used in same execution mode.")
-
-    index = {}
-    for i, (worker, worker_devices) in enumerate(self._worker_device_pairs):
-      if name is not None:
-        d = tf_device.DeviceSpec.from_string(worker)
-        new_name = "%s_%s_%d" % (name, d.job, d.task)
-      else:
-        new_name = None
-      with ops.device(worker):
-        data_per_worker = self._iterators[i].get_next(new_name)
-
-      # Ungroup these per-replica value so as to get a flat map from devices to
-      # values.
-      for d in worker_devices:
-        v = select_device(d, data_per_worker)
-        if d in index:
-          raise ValueError("Duplicated devices in worker_device_pairs: %r" % v)
-        index[d] = v
-
-    return regroup(index)
-
-  def initialize(self):
-    """Initialze underlying iterators.
-
-    Returns:
-      A list of any initializer ops that should be run.
-    """
-    assert self._is_eager == context.executing_eagerly(), (
-        "Iterator should be created and used in same execution mode.")
-
-    init_ops = []
-    for it in self._iterators:
-      init_ops.extend(it.initialize())
-    return init_ops
-
-  # TODO(priyag): Remove when we switch to using `MultiDeviceIterator` for TPUs.
-  @property
-  def output_classes(self):
-    return self._iterators[0].output_classes
-
-  # TODO(priyag): Remove when we switch to using `MultiDeviceIterator` for TPUs.
-  @property
-  def output_shapes(self):
-    return self._iterators[0].output_shapes
-
-  # TODO(priyag): Remove when we switch to using `MultiDeviceIterator` for TPUs.
-  @property
-  def output_types(self):
-    return self._iterators[0].output_types
-
-  # TODO(priyag): Remove when we switch to using `MultiDeviceIterator` for TPUs.
-  def get_iterator(self, worker):
-    for i, (w, _) in enumerate(self._worker_device_pairs):
-      if worker == w:
-        return self._iterators[i]
-    return None
-
-
-class InputFunctionIterator(InputIteratorImpl):
-  """Iterator created from input function."""
-
-  def __init__(self, input_fn, worker_device_pairs, input_contexts):
-    """Make an iterator for input provided via an input function.
-
-    Currently implements PER_WORKER mode, in which the `input_fn` is called
-    once on each worker.
-
-    TODO(priyag): Add other replication modes.
-    TODO(priyag): Allow taking input function that returns a callable that
-    returns nest of tensors.
-
-    Args:
-      input_fn: Input function that returns a `tf.data.Dataset` object.
-      worker_device_pairs: A list of (worker, list of devices on that worker)
-        pairs.
-      input_contexts: A list of `InputContext` instances to be passed to call(s)
-        to `input_fn`. Length and order should match worker order in
-        `worker_device_pairs`.
-    """
-    if len(worker_device_pairs) != len(input_contexts):
-      raise ValueError(
-          "Number of worker_device_pairs (%d) is not same as number of"
-          "input_contexts (%d)" % (
-              len(worker_device_pairs), len(input_contexts)))
-
-    iterators = []
-    for (worker, devices), ctx in zip(worker_device_pairs, input_contexts):
-      # TODO(priyag): We should probably explicitly specify CPU device on worker.
-      with ops.device(worker):
-        result = input_fn(ctx)
-        if not isinstance(result, dataset_ops.DatasetV2):
-          raise ValueError("input_fn must return a tf.data.Dataset.")
-        iterator = _SingleWorkerDatasetIterator(result, worker, devices)
-        iterators.append(iterator)
-
-    super(InputFunctionIterator, self).__init__(
-        worker_device_pairs, iterators)
-
-
-class DatasetIterator(InputIteratorImpl):
-  """Iterator created from input dataset."""
-
-  def __init__(self, dataset, worker_device_pairs, split_batch_by=None):
-    """Make an iterator for the dataset on given devices.
-
-    If `split_batch_by` is not None, we "split" each batch of the
-    dataset by `split_batch_by` value. To achieve this, we first unbatch the
-    input dataset and then rebatch it with the per replica batch size that is
-    calculated using `global_batch_size // split_batch_by`.
-    The currently supported datasets are as follows:
-    `dataset.batch()` is the last operation on the dataset OR
-    `dataset.apply(map_and_batch)` is the last operation on the dataset OR
-    `dataset.batch().prefetch()` are the last 2 operations on the dataset OR
-    `dataset.apply(map_and_batch).prefetch()` are the last 2 operations.
-
-    TODO(priyag): Support multi worker / host cases properly by cloning
-    and sharding the dataset on each worker. Current setup will only work in
-    some cases, such as in-graph multi worker GPU case. If the input pipeline
-    has random shuffling (with a different seed on each worker), each worker
-    will see random input from the same overall dataset in each step. Otherwise,
-    each worker will see the same input in each step.
-
-    Args:
-      dataset: `tf.data.Dataset` that will be used as the input source.
-      worker_device_pairs: A list of (worker, list of devices on that worker)
-        pairs.
-      split_batch_by: Optional integer. If present, we "split" each batch of the
-        dataset by `split_batch_by` value.
-    """
-    if split_batch_by:
-      dataset = _split_dataset_batch(dataset, split_batch_by)
-
-    iterators = []
-    for worker, worker_devices in worker_device_pairs:
-      with ops.device(worker):
-        iterator = _SingleWorkerDatasetIterator(dataset, worker, worker_devices)
-        iterators.append(iterator)
-
-    super(DatasetIterator, self).__init__(worker_device_pairs, iterators)
-
-
-class _SingleWorkerDatasetIterator(object):
-  """Iterator for a single `tf.data.Dataset`."""
-
-  def __init__(self, dataset, worker, devices):
-    """Create iterator for the `dataset` to fetch data to worker's `devices` .
-
-    `MultiDeviceIterator` is used to prefetch input to the devices on the
-    given worker. `MultiDeviceIterator` doesn't work in eager mode yet.
-
-    Args:
-      dataset: A `tf.data.Dataset` instance.
-      worker: Worker on which ops should be created.
-      devices: Distribute data from `dataset` to these devices.
-    """
-    self._dataset = dataset
-    self._worker = worker
-    self._devices = devices
-    self._is_eager = context.executing_eagerly()
-    self._make_iterator()
-
-  def _make_iterator(self):
-    """Make appropriate iterator on the dataset."""
-    with ops.device(self._worker):
-      if self._is_eager:
-        # TODO(rohanj): Enable prefetching in eager mode.
-        # TODO(priyag): Measure the performance of this approach vs calling
-        # get_next on the original dataset N times.
-        dataset = self._dataset.batch(len(self._devices), drop_remainder=True)
-        iterator = dataset_ops.make_one_shot_iterator(dataset)
-      else:
-        iterator = multi_device_iterator_ops.MultiDeviceIterator(
-            self._dataset, self._devices)
-    self._iterator = iterator
-
-  def get_next(self, name=None):
-    """Get next element from the underlying iterator."""
-    with ops.device(self._worker):
-      if self._is_eager:
-        # Batched dataset case.
-        batch = self._iterator.get_next(name=name)
-        index = {}
-        for i, d in enumerate(self._devices):
-          index[d] = nest.map_structure(operator.itemgetter(i), batch)
-          with ops.device(d):
-            index[d] = nest.map_structure(array_ops.identity, index[d])
-      else:
-        # MultiDeviceIterator case.
-        data_list = self._iterator.get_next()
-        index = dict(zip(self._devices, data_list))
-
-      return regroup(index)
-
-  def initialize(self):
-    """Initialze underlying iterator.
-
-    In eager execution, this simply recreates the underlying iterator.
-    In graph execution, it returns the initializer ops for the underlying
-    iterator.
-
-    Returns:
-      A list of any initializer ops that should be run.
-    """
-    if self._is_eager:
-      self._make_iterator()
-      return []
-    else:
-      return [self._iterator.initializer]
-
-  @property
-  def output_classes(self):
-    return self._iterator.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._iterator.output_shapes
-
-  @property
-  def output_types(self):
-    return self._iterator.output_types
-
-
-def _split_dataset_batch(dataset, split_batch_by):
-  """Divide a batch-ed dataset's batches into smaller batches."""
-  # TODO(sourabhbajaj): Remove this in lieu of distributed datasets
-  # pylint: disable=protected-access
-  def _get_batch_dataset(d):
-    """Get the underlying batch dataset from the dataset object."""
-    if isinstance(d, dataset_ops.DatasetV1Adapter):
-      d = d._dataset
-
-    if isinstance(d, (dataset_ops.BatchDataset, batching._MapAndBatchDataset)):
-      return d
-    elif isinstance(d, dataset_ops.PrefetchDataset):
-      return _get_batch_dataset(d._input_dataset)
-    raise ValueError(
-        "Unable to get batched dataset from the input dataset. `batch` "
-        "`map_and_batch` need to be the last operations on the dataset. "
-        "The batch operations can be followed by a prefetch.")
-
-  batched_dataset = _get_batch_dataset(dataset)
-  batch_size = batched_dataset._batch_size
-  drop_remainder = batched_dataset._drop_remainder
-  # pylint: enable=protected-access
-
-  if tensor_util.is_tensor(batch_size):
-    batch_size = tensor_util.constant_value(batch_size)
-
-  if tensor_util.is_tensor(drop_remainder):
-    drop_remainder = tensor_util.constant_value(drop_remainder)
-
-  if batch_size % split_batch_by:
-    raise ValueError(
-        "Batch size %s cannot be sharded evenly across replicas %s" % (
-            batch_size, split_batch_by))
-  new_batch_size = batch_size // split_batch_by
-
-  dataset = dataset.apply(batching.unbatch())
-  return dataset.batch(new_batch_size, drop_remainder=drop_remainder)
-
-
-class MultiStepContext(object):
-  """A context object that can be used to capture things when running steps.
-
-  This context object is useful when running multiple steps at a time using the
-  `experimental_run_steps_on_iterator` API. For e.g. it allows the user's step
-  function to specify which outputs to emit at what frequency. Currently it
-  supports capturing output from the last step, as well as capturing non tensor
-  outputs.  In the future it will be augmented to support other use cases such
-  as output each N steps.
-  """
-
-  def __init__(self):
-    """Initialize an output context.
-
-    Returns:
-      A context object.
-    """
-    self._last_step_outputs = {}
-    self._last_step_outputs_reduce_ops = {}
-    self._non_tensor_outputs = {}
-
-  @property
-  def last_step_outputs(self):
-    """A dictionary consisting of outputs to be captured on last step.
-
-    Keys in the dictionary are names of tensors to be captured, as specified
-    when `set_last_step_output` is called.
-    Values in the dictionary are the tensors themselves. If
-    `set_last_step_output` was called with a `reduce_op` for this output,
-    then the value is the reduced value.
-
-    Returns:
-      A dictionary with last step outputs.
-    """
-    return self._last_step_outputs
-
-  def _set_last_step_outputs(self, outputs):
-    """Replace the entire dictionary of last step outputs."""
-    if not isinstance(outputs, dict):
-      raise ValueError("Need a dictionary to set last_step_outputs.")
-    self._last_step_outputs = outputs
-
-  def set_last_step_output(self, name, output, reduce_op=None):
-    """Set `output` with `name` to be outputted from the last step.
-
-    Args:
-      name: String, name to identify the output. Doesn't need to match tensor
-        name.
-      output: The tensors that should be outputted with `name`. See below for
-        actual types supported.
-      reduce_op: Reduction method to use to reduce outputs from multiple
-        replicas. Required if `set_last_step_output` is called in a replica
-        context. Optional in cross_replica_context.
-        When present, the outputs from all the replicas are reduced using the
-        current distribution strategy's `reduce` method. Hence, the type of
-        `output` must be what's supported by the corresponding `reduce` method.
-        For e.g. if using MirroredStrategy and reduction is set, output
-        must be a `PerReplica` value.
-        The reduce method is also recorded in a dictionary
-        `_last_step_outputs_reduce_ops` for later interpreting of the
-        outputs as already reduced or not.
-    """
-    if distribution_strategy_context.get_cross_replica_context():
-      self._last_step_outputs_reduce_ops[name] = reduce_op
-      if reduce_op is None:
-        self._last_step_outputs[name] = output
-      else:
-        distribution = distribution_strategy_context.get_distribution_strategy()
-        self._last_step_outputs[name] = distribution.reduce(reduce_op, output)
-    else:
-      assert reduce_op is not None
-      def merge_fn(distribution, value):
-        self._last_step_outputs[name] = distribution.reduce(reduce_op, value)
-        # Setting this inside the `merge_fn` because all replicas share the same
-        # context object, so it's more robust to set it only once (even if all
-        # the replicas are trying to set the same value).
-        self._last_step_outputs_reduce_ops[name] = reduce_op
-
-      distribution_strategy_context.get_replica_context().merge_call(
-          merge_fn, args=(output,))
-
-  @property
-  def non_tensor_outputs(self):
-    """A dictionary consisting of any non tensor outputs to be captured."""
-    return self._non_tensor_outputs
-
-  def set_non_tensor_output(self, name, output):
-    """Set `output` with `name` to be captured as a non tensor output."""
-    if distribution_strategy_context.get_cross_replica_context():
-      self._non_tensor_outputs[name] = output
-    else:
-      def merge_fn(distribution, value):
-        # NOTE(priyag): For non tensor outputs, we simply return all the values
-        # in a list as reduction doesn't make sense on non tensors.
-        self._non_tensor_outputs[name] = distribution.unwrap(value)
-      distribution_strategy_context.get_replica_context().merge_call(
-          merge_fn, args=(output,))
-
-
 def value_container(val):
   """Returns the container that this per-replica `value` belongs to.
 
@@ -1738,10 +1437,11 @@ def value_container(val):
 
 
 # TODO(josh11b): Descend from Variable.
-class AggregatingVariable(checkpointable.CheckpointableBase):
+class AggregatingVariable(trackable.Trackable):
   """A wrapper around a variable that aggregates updates across replicas."""
 
-  def __init__(self, v, aggregation):
+  def __init__(self, strategy, v, aggregation):
+    self._distribute_strategy = strategy
     self._v = v
     # NOTE: We don't use "_distributed_container" here because we don't want
     # to trigger that code path in regroup().
@@ -1751,12 +1451,17 @@ class AggregatingVariable(checkpointable.CheckpointableBase):
   def get(self):
     return self._v
 
+  @property
+  def distribute_strategy(self):
+    return self._distribute_strategy
+
   def __getattr__(self, name):
     return getattr(self._v, name)
 
   def _assign_func(self, *args, **kwargs):
+    _assert_strategy(self._distribute_strategy)
     f = kwargs.pop("f")
-    if distribution_strategy_context.get_cross_replica_context():
+    if distribution_strategy_context.in_cross_replica_context():
       update_device = distribute_lib.get_update_device()
       if update_device is not None:
         # We are calling an assign function in an update context.
@@ -1764,24 +1469,25 @@ class AggregatingVariable(checkpointable.CheckpointableBase):
 
       # We are calling an assign function in cross replica context, wrap it in
       # an update call.
-      return distribution_strategy_context.get_distribution_strategy().update(
-          self, f, *args, **kwargs)
+      return self._distribute_strategy.extended.update(
+          self, f, args=args, kwargs=kwargs)
     else:
-      assert distribution_strategy_context.get_replica_context()
+      replica_context = distribution_strategy_context.get_replica_context()
+      assert replica_context
       # We are calling an assign function in replica context.
       # We reduce the value we want to assign/add/sub. More details about how we
       # handle the different use cases can be found in the _reduce method.
       # We call the function with the reduced value.
       if self._aggregation == vs.VariableAggregation.NONE:
         raise ValueError("You must specify an aggregation method to update a "
-                         "a variable in Replica Context.")
+                         "a variable in replica context.")
 
       def merge_fn(strategy, value, *other_args, **other_kwargs):
         v = _apply_aggregation(strategy, value, self._aggregation, self)
-        return strategy.update(self, f, v, *other_args, **other_kwargs)
+        return strategy.extended.update(
+            self, f, args=(v,) + other_args, kwargs=other_kwargs)
 
-      return distribution_strategy_context.get_replica_context().merge_call(
-          merge_fn, args=args, kwargs=kwargs)
+      return replica_context.merge_call(merge_fn, args=args, kwargs=kwargs)
 
   def assign_sub(self, *args, **kwargs):
     assign_sub_fn = lambda var, *a, **kw: var.assign_sub(*a, **kw)
@@ -1809,7 +1515,7 @@ class AggregatingVariable(checkpointable.CheckpointableBase):
 
   # TODO(josh11b): Test saving & restoring.
   def _gather_saveables_for_checkpoint(self):
-    return {checkpointable.VARIABLE_VALUE_KEY: self._v}
+    return {trackable.VARIABLE_VALUE_KEY: self._v}
 
   # pylint: disable=multiple-statements
   def __add__(self, o): return self._v + o
diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD
index f43cf9327a1ad6b2b83ebcb2482ad3fc27515251..45e2682aac2d0a1b7591536ff55c78a4aad56b12 100644
--- a/tensorflow/python/eager/BUILD
+++ b/tensorflow/python/eager/BUILD
@@ -1,11 +1,12 @@
 licenses(["notice"])  # Apache 2.0
 
-load("//tensorflow:tensorflow.bzl", "py_test", "tf_cc_binary")
+load("//tensorflow:tensorflow.bzl", "tf_py_test", "tf_cc_binary")
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 load(
     "//tensorflow/tools/test:performance.bzl",
     "tf_py_logged_benchmark",
 )
+load("//tensorflow/compiler/tests:build_defs.bzl", "tf_xla_py_test")
 
 cc_library(
     name = "pywrap_tfe_lib",
@@ -25,6 +26,7 @@ cc_library(
         "//tensorflow/c:c_api",
         "//tensorflow/c:c_api_internal",
         "//tensorflow/c/eager:c_api",
+        "//tensorflow/c/eager:c_api_experimental",
         "//tensorflow/c/eager:c_api_internal",
         "//tensorflow/c/eager:tape",
         "//tensorflow/core:framework",
@@ -55,6 +57,8 @@ py_library(
         ":execute",
         ":function",
         ":graph_only_ops",
+        ":profiler",
+        ":profiler_client",
         ":tape",
         ":test",
         ":wrap_function",
@@ -89,6 +93,38 @@ py_library(
     ],
 )
 
+py_library(
+    name = "profiler",
+    srcs = ["profiler.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":context",
+        "//tensorflow/python:pywrap_tensorflow",
+    ],
+)
+
+cuda_py_test(
+    name = "profiler_test",
+    srcs = ["profiler_test.py"],
+    additional_deps = [
+        ":profiler",
+        ":test",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/core/profiler:protos_all_py",
+    ],
+)
+
+py_library(
+    name = "profiler_client",
+    srcs = ["profiler_client.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/python:pywrap_tensorflow",
+    ],
+)
+
 py_library(
     name = "tape",
     srcs = ["tape.py"],
@@ -126,6 +162,7 @@ cuda_py_test(
         "//tensorflow/python:nn_grad",
         "//tensorflow/python:training",
     ],
+    tags = ["no_rocm"],
 )
 
 cuda_py_test(
@@ -255,11 +292,10 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "execution_callbacks_test",
     srcs = ["execution_callbacks_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":execution_callbacks",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_ops",
@@ -360,6 +396,7 @@ cuda_py_test(
         ":context",
         ":function",
         ":test",
+        ":profiler",
         "//third_party/py/numpy",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:pywrap_tensorflow",
@@ -373,11 +410,10 @@ tf_py_logged_benchmark(
     target = "//tensorflow/python/eager:benchmarks_test",
 )
 
-py_test(
+tf_py_test(
     name = "tape_test",
     srcs = ["tape_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":backprop",
         ":context",
         ":test",
@@ -414,20 +450,19 @@ cuda_py_test(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "pywrap_tfe_test",
     srcs = ["pywrap_tfe_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":backprop",
         ":context",
         ":core",
         ":test",
+        "//third_party/py/numpy",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:pywrap_tensorflow",
         "//tensorflow/python:random_ops",
-        "//third_party/py/numpy",
     ],
 )
 
@@ -476,7 +511,7 @@ py_library(
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:while_v2",  # TODO(b/118513001): Imported via control_flow_ops; remove.
-        "//tensorflow/python/training/checkpointable:base",
+        "//tensorflow/python/training/tracking:base",
     ],
 )
 
@@ -491,12 +526,28 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "def_function_test",
     srcs = ["def_function_test.py"],
-    srcs_version = "PY2AND3",
+    additional_deps = [
+        ":def_function",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:framework_ops",
+    ],
+)
+
+tf_xla_py_test(
+    name = "def_function_xla_test",
+    srcs = ["def_function_xla_test.py"],
+    tags = [
+        "no_pip",
+        "no_rocm",
+        "nomac",
+    ],
     deps = [
         ":def_function",
+        "//tensorflow/compiler/tests:xla_test",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:framework_ops",
@@ -515,17 +566,28 @@ py_library(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:template",
         "//tensorflow/python:variable_scope",
-        "//tensorflow/python/training/checkpointable:base",
+        "//tensorflow/python/training/tracking:base",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "wrap_function_test",
     srcs = ["wrap_function_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":wrap_function",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_ops",
     ],
 )
+
+py_library(
+    name = "remote",
+    srcs = ["remote.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:platform",
+        "//tensorflow/python/eager:context",
+    ],
+)
diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py
index 29f9b2cda3aa2c6e7fff6c6df10fed81779d02c7..694b05c1eeaf969a08b37c0025b34eed6b47ee9d 100644
--- a/tensorflow/python/eager/backprop.py
+++ b/tensorflow/python/eager/backprop.py
@@ -80,6 +80,8 @@ def make_attr(attr_type, value):
     return tensor_shape.as_shape(value).as_proto()
   elif attr_type == [pywrap_tensorflow.TF_ATTR_SHAPE]:
     return [tensor_shape.as_shape(v).as_proto() for v in value]
+  elif isinstance(value, str):
+    return value.encode()
   return value
 
 
@@ -465,14 +467,16 @@ def val_and_grad_function(f, params=None):
 
 
 def make_vjp(f, params=None, persistent=True):
-  """Returns a function that computes f and is vjp w.r.t. params.
+  """Returns a function that computes f and its vjp w.r.t.
+
+  params.
 
   The term "vjp" here is an abbreviation for vector-jacobian product.
 
   Args:
     f: the function to be differentiated.
     params: the parameters (numbers or names) to differentiate with respect to.
-       A value of None will differentiate with respect to all parameters.
+      A value of None will differentiate with respect to all parameters.
     persistent: Boolean controlling whether the VJP function can be re-used.
       Must be True or False.
 
@@ -595,7 +599,9 @@ def _fast_fill(value, shape, dtype):
 
 def _zeros(shape, dtype):
   """Helper to return (possibly cached) zero tensors in eager mode."""
-  if dtype == dtypes.variant:
+  if (dtype == dtypes.variant
+      or dtype == dtypes.string
+      or dtype == dtypes.resource):
     # TODO(apassos): need to save enough information about variant tensors to do
     # a zeros
     return None
@@ -618,6 +624,9 @@ def _zeros(shape, dtype):
 
 
 def _ones(shape, dtype):
+  if dtypes.as_dtype(dtype) == dtypes.string:
+    return None
+
   if not context.context().executing_eagerly():
     return array_ops.ones(shape, dtype)
 
@@ -925,11 +934,12 @@ class GradientTape(object):
                             "gradient in order to compute higher order "
                             "derrivatives.", 1)
 
-    flat_targets = nest.flatten(target)
-    for t in flat_targets:
+    flat_targets = []
+    for t in nest.flatten(target):
       if resource_variable_ops.is_resource_variable(t):
-        raise ValueError("GradientTape.gradient is not supported for variable "
-                         "targets.")
+        with self:
+          t = ops.convert_to_tensor(t)
+      flat_targets.append(t)
 
     flat_sources = nest.flatten(sources)
     flat_sources = [_handle_or_self(x) for x in flat_sources]
@@ -1104,8 +1114,13 @@ class GradientTape(object):
         dimension of `target` and `source` do not match.
     """
     target_shape = target.shape
-    if not target_shape.with_rank_at_least(2)[0].is_compatible_with(
-        source.shape.with_rank_at_least(2)[0]):
+    if target_shape.rank is None:
+      dim = Dimension(None)
+    else:
+      dim = target_shape.dims[0]
+    if not (target_shape.with_rank_at_least(2) and
+            source.shape.with_rank_at_least(2) and
+            dim.is_compatible_with(source.shape[0])):
       raise ValueError(
           "Need first dimension of target shape (%s) and "
           "source shape (%s) to match." % (target.shape, source.shape))
diff --git a/tensorflow/python/eager/backprop_test.py b/tensorflow/python/eager/backprop_test.py
index 61c47a29fd2427850006cbe2dfe1e6bb69d988ab..5f4fda8897b3913ffeb165819a4b7859821ec3b8 100644
--- a/tensorflow/python/eager/backprop_test.py
+++ b/tensorflow/python/eager/backprop_test.py
@@ -270,6 +270,38 @@ class BackpropTest(test.TestCase):
       z = y * y
     self.assertAllEqual(t.gradient([x, y, z], [x, y]), [1.0, 11.0])
 
+  def testTapeGradientStringTarget(self):
+    s = constant_op.constant('unknown', dtype=dtypes.string)
+    x = constant_op.constant(3.0)
+
+    with backprop.GradientTape() as t:
+      t.watch(x)
+      t.watch(s)
+    grads = t.gradient(s, x)
+    self.assertEqual(grads, None)
+
+  def testTapeNoOpGradientStringSourceAndTarget(self):
+    s = constant_op.constant('unknown', dtype=dtypes.string)
+
+    with backprop.GradientTape() as t:
+      t.watch(s)
+    grads = t.gradient(s, s)
+    self.assertEqual(grads, None)
+
+  def testTapeNoOpGradientWithMultiTargetMultiSourceIncludeString(self):
+    x = constant_op.constant(3.0)
+    y = constant_op.constant(5.0)
+    s = constant_op.constant('unknown', dtype=dtypes.string)
+
+    with backprop.GradientTape() as t:
+      t.watch(x)
+      t.watch(y)
+      t.watch(s)
+      z = y * y
+    grads = t.gradient([x, y, z, s], [x, y, s])
+    self.assertAllEqual(grads[:2], [1.0, 11.0])
+    self.assertEqual(grads[2], None)
+
   def testTapeNoOpOnVariableIsIdentity(self):
     v0 = resource_variable_ops.ResourceVariable(1.0)
     with backprop.GradientTape() as t:
@@ -322,6 +354,16 @@ class BackpropTest(test.TestCase):
       loss += v * v
     self.assertAllEqual(t.gradient(loss, v), 2.0)
 
+  def testPythonMax(self):
+    x = [resource_variable_ops.ResourceVariable(2.),
+         resource_variable_ops.ResourceVariable(3.),
+         resource_variable_ops.ResourceVariable(5.)]
+    with backprop.GradientTape() as t:
+      f = max(x)
+    grad = t.gradient(f, x)
+    self.assertAllEqual(self.evaluate(f), 5.)
+    self.assertAllEqual(self.evaluate(grad), [None, None, 1.0])
+
   def testAutomaticWatchedVariables(self):
     with backprop.GradientTape() as t:
       self.assertEqual(0, len(t.watched_variables()))
@@ -642,10 +684,8 @@ class BackpropTest(test.TestCase):
     with backprop.GradientTape() as g:
       x = variables.Variable([3.0])
       y = variables.Variable([2.0])
-    with self.assertRaisesRegexp(
-        ValueError,
-        'GradientTape.gradient is not supported for variable targets.'):
-      g.gradient(x, y)
+    grad = g.gradient(x, y)
+    self.assertAllEqual(grad, None)
 
   @test_util.run_in_graph_and_eager_modes
   @test_util.run_v1_only('b/120545219')
@@ -1338,17 +1378,14 @@ class BatchJacobianTest(test.TestCase):
                               array_ops.diag(2 * x[1] * y[1])])
     return batch_jacobian, answer
 
-  @test_util.run_v1_only('b/120545219')
   def testPfor(self):
     batch_jacobian, answer = self._batch_jacobian(experimental_use_pfor=True)
     self.assertAllEqual(answer, batch_jacobian)
 
-  @test_util.run_v1_only('b/120545219')
   def testWhileLoop(self):
     batch_jacobian, answer = self._batch_jacobian(experimental_use_pfor=False)
     self.assertAllEqual(answer, batch_jacobian)
 
-  @test_util.run_v1_only('b/120545219')
   def testPforDefun(self):
 
     @function.defun
@@ -1358,7 +1395,6 @@ class BatchJacobianTest(test.TestCase):
     batch_jacobian, answer = _f()
     self.assertAllEqual(answer, batch_jacobian)
 
-  @test_util.run_v1_only('b/120545219')
   def testWhileLoopDefun(self):
 
     @function.defun
@@ -1368,7 +1404,6 @@ class BatchJacobianTest(test.TestCase):
     batch_jacobian, answer = _f()
     self.assertAllEqual(answer, batch_jacobian)
 
-  @test_util.run_v1_only('b/120545219')
   def testPersistentTape(self):
     if not context.executing_eagerly():
       return
@@ -1379,7 +1414,6 @@ class BatchJacobianTest(test.TestCase):
     with self.assertRaisesRegexp(RuntimeError, 'persistent'):
       g.batch_jacobian(y, x, experimental_use_pfor=False)
 
-  @test_util.run_v1_only('b/120545219')
   def testBadShape(self):
     x = random_ops.random_uniform([2, 3])
     with backprop.GradientTape() as g:
@@ -1387,7 +1421,6 @@ class BatchJacobianTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, 'Need first dimension'):
       g.batch_jacobian(y, x)
 
-  @test_util.run_v1_only('b/120545219')
   def testBadInputRank(self):
     x = random_ops.random_uniform([2])
     with backprop.GradientTape() as g:
@@ -1402,7 +1435,6 @@ class BatchJacobianTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, 'must have rank at least 2'):
       g.batch_jacobian(y, x)
 
-  @test_util.run_v1_only('b/120545219')
   def testPforException(self):
     var = variables.Variable([1.])
 
@@ -1423,7 +1455,6 @@ class BatchJacobianTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, 'No converter'):
       g.batch_jacobian(y, x, experimental_use_pfor=True)
 
-  @test_util.run_v1_only('b/120545219')
   def test_parallel_iterations(self):
     with backprop.GradientTape(persistent=True) as g:
       x = constant_op.constant([[1., 2], [3, 4]])
diff --git a/tensorflow/python/eager/benchmarks_test.py b/tensorflow/python/eager/benchmarks_test.py
index 31a7efca82b016bc193ab9985ea7603897edc7ac..f44185df0b5a016d1e1f14aabfcf3704d91752b4 100644
--- a/tensorflow/python/eager/benchmarks_test.py
+++ b/tensorflow/python/eager/benchmarks_test.py
@@ -38,6 +38,7 @@ from tensorflow.python.eager import backprop  # pylint: disable=unused-import
 from tensorflow.python.eager import context
 from tensorflow.python.eager import core
 from tensorflow.python.eager import function
+from tensorflow.python.eager import profiler
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -140,7 +141,7 @@ class MicroBenchmarks(test.Benchmark):
     self._m_2_by_2 = random_ops.random_uniform((2, 2))
     self._m_100_by_784 = random_ops.random_uniform((100, 784))
     self._num_iters_2_by_2 = 30000
-    self._num_iters_100_by_784 = 1000
+    self._num_iters_100_by_784 = 30000
 
   def _run(self, func, num_iters, execution_mode=None):
     # call func to maybe warm up the GPU
@@ -370,6 +371,19 @@ class MicroBenchmarks(test.Benchmark):
     func = lambda: f(m, m, transpose_b=transpose_b)
     self._run(func, num_iters, execution_mode=execution_mode)
 
+  def _benchmark_nested_defun_matmul(self, m, transpose_b, num_iters):
+    inner = function.defun(math_ops.matmul)
+
+    @function.defun
+    def outer(a, b, c, transpose_b):
+      return math_ops.matmul(inner(a, b, transpose_b=transpose_b), c)
+
+    func = lambda: outer(m, m, m, transpose_b=transpose_b)
+    # Warmup before benchmark
+    for _ in range(1000):
+      func()
+    self._run(func, num_iters)
+
   def _benchmark_defun_matmul_forward_backward(self,
                                                m,
                                                transpose_b,
@@ -525,6 +539,11 @@ class MicroBenchmarks(test.Benchmark):
           num_iters=self._num_iters_2_by_2,
           execution_mode=context.ASYNC)
 
+  def benchmark_nested_defun_matmul_2_by_2(self):
+    m = self._m_2_by_2.cpu()
+    self._benchmark_nested_defun_matmul(
+        m, transpose_b=False, num_iters=self._num_iters_2_by_2)
+
   # Benchmarks for AA.T, A of dimension 100 by 784.
   def benchmark_np_matmul_100_by_784(self):
     self._benchmark_np_matmul(
@@ -614,6 +633,11 @@ class MicroBenchmarks(test.Benchmark):
       self._benchmark_defun_matmul(
           m, transpose_b=True, num_iters=self._num_iters_100_by_784)
 
+  def benchmark_nested_defun_matmul_100_by_784(self):
+    m = self._m_100_by_784.gpu()
+    self._benchmark_nested_defun_matmul(
+        m, transpose_b=True, num_iters=self._num_iters_100_by_784)
+
   def benchmark_defun_without_signature(self):
 
     def func(t1, t2, t3, t4, t5, t6, t7, t8):
@@ -792,10 +816,26 @@ class MicroBenchmarks(test.Benchmark):
       model = make_keras_model(initializer="glorot_uniform")
       self._benchmark_keras_model_fit(model)
 
+  def benchmark_keras_model_functional_fit_graph_mode_with_profiler(self):
+    profiler.start()
+    with context.graph_mode():
+      model = make_keras_model(initializer="glorot_uniform")
+      self._benchmark_keras_model_fit(model)
+    result = profiler.stop()
+    assert result is not None
+
   def benchmark_keras_model_functional_fit_run_model_eagerly(self):
     model = make_keras_model(initializer="glorot_uniform")
     self._benchmark_keras_model_fit(model, run_eagerly=True)
 
+  def benchmark_keras_model_functional_fit_run_model_eagerly_with_profiler(
+      self):
+    profiler.start()
+    model = make_keras_model(initializer="glorot_uniform")
+    self._benchmark_keras_model_fit(model, run_eagerly=True)
+    result = profiler.stop()
+    assert result is not None
+
   def benchmark_keras_model_sequential_fit(self):
     model = make_sequential_keras_model(initializer="glorot_uniform")
     self._benchmark_keras_model_fit(model)
diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py
index cbbe5cf49e20afc63e7710e39dc37ecbc4ac5082..4364b4925823d8f18f7abefc45ac66116c7ac464 100644
--- a/tensorflow/python/eager/context.py
+++ b/tensorflow/python/eager/context.py
@@ -44,6 +44,7 @@ default_execution_mode = EAGER_MODE if tf2.enabled() else GRAPH_MODE
 # Note that we do not protect this with a lock and instead rely on python's GIL
 # and the idempotent nature of writes to provide thread safety.
 _device_parsing_cache = {}
+_starting_device_spec = pydev.DeviceSpec.from_string("")
 
 _MAXINT32 = 2**31 - 1
 
@@ -135,30 +136,57 @@ class _EagerContext(threading.local):
 
   def __init__(self, config=None):
     super(_EagerContext, self).__init__()
-    self.device_spec = pydev.DeviceSpec.from_string("")
-    self.device_name = self.device_spec.to_string()
+    self.device_spec = _starting_device_spec
+    self.device_name = ""
     self.mode = default_execution_mode
     self.is_eager = default_execution_mode == EAGER_MODE
     self.scope_name = ""
-    self.recording_summaries = False
     self.summary_writer_resource = None
+    self.recording_summaries = None
     self.scalar_cache = {}
-    self.ones_rank_cache = _EagerTensorCache()
-    self.zeros_cache = _EagerTensorCache()
+    self._ones_rank_cache = None
+    self._zeros_cache = None
     self.execution_mode = None
 
     # Default rewriter config corresponds to turning all default grappler
     # optimizations on.
-    base_config = config_pb2.ConfigProto()
+    self._config = config
 
-    if config is not None:
-      base_config.MergeFrom(config)
+    self._function_call_options = None
 
-    self.function_call_options = FunctionCallOptions(config_proto=base_config)
+  @property
+  def function_call_options(self):
+    if self._function_call_options is None:
+      base_config = config_pb2.ConfigProto()
+      if self._config is not None:
+        base_config.MergeFrom(self._config)
+      self._config = None
+      self._function_call_options = FunctionCallOptions(
+          config_proto=base_config)
+
+    return self._function_call_options
+
+  @function_call_options.setter
+  def function_call_options(self, function_call_options):
+    self._function_call_options = function_call_options
+    self._config = None
+
+  @property
+  def ones_rank_cache(self):
+    if not self._ones_rank_cache:
+      self._ones_rank_cache = _EagerTensorCache()
+    return self._ones_rank_cache
+
+  @property
+  def zeros_cache(self):
+    if not self._zeros_cache:
+      self._zeros_cache = _EagerTensorCache()
+    return self._zeros_cache
 
 
 ContextSwitch = collections.namedtuple(
-    "ContextSwitch", ["is_building_function", "enter_context_fn"])
+    "ContextSwitch", ["is_building_function", "enter_context_fn",
+                      "device_stack"])
 
 
 # `_ContextSwitchStack` is a `threading.local` to match the semantics of
@@ -175,23 +203,28 @@ class _ContextSwitchStack(threading.local):
       # across threads, since (1) `enable_eager_execution` modifies a
       # process-level flag (`default_execution_mode`) and (2) `__init__` is
       # called each time a threading.local object is used in a separate thread.
-      self.push(is_building_function=False, enter_context_fn=eager_mode)
+      self.push(is_building_function=False, enter_context_fn=eager_mode,
+                device_stack=None)
 
-  def push(self, is_building_function, enter_context_fn):
+  def push(self, is_building_function, enter_context_fn, device_stack):
     """Push metadata about a context switch onto the stack.
 
-    A context switch can take one of two forms: installing a graph as the
-    default graph, or entering the eager context. For each context switch,
+    A context switch can take any one of the two forms: installing a graph as
+    the default graph, or entering the eager context. For each context switch,
     we record whether or not the entered context is building a function.
 
     Args:
       is_building_function: (bool.) Whether the context is building a function.
       enter_context_fn: (function.) A callable that executes the context switch.
         For example, `graph.as_default` or `eager_mode`.
+      device_stack: If applicable, the device function stack for this
+        graph. When breaking out of graphs in init_scope, the innermost nonempty
+        device stack is used. Eager contexts put `None` here and the value is
+        never used.
     """
 
     self.stack.append(
-        ContextSwitch(is_building_function, enter_context_fn))
+        ContextSwitch(is_building_function, enter_context_fn, device_stack))
 
   def pop(self):
     """Pop the stack."""
@@ -265,6 +298,7 @@ class Context(object):
       execution_mode = SYNC
     self._execution_mode = execution_mode
     self._server_def = server_def
+    self._collective_ops_server_def = None
 
   # pylint: enable=redefined-outer-name
 
@@ -325,10 +359,17 @@ class Context(object):
         self._context_handle = pywrap_tensorflow.TFE_NewContext(opts)
       finally:
         pywrap_tensorflow.TFE_DeleteContextOptions(opts)
+      assert not (self._server_def and self._collective_ops_server_def), (
+          "Cannot enable remote execution as well as collective ops at the "
+          "moment. If this is important to you, please file an issue.")
       if self._server_def is not None:
         server_def_str = self._server_def.SerializeToString()
         pywrap_tensorflow.TFE_ContextSetServerDef(self._context_handle, 600,
                                                   server_def_str)
+      elif self._collective_ops_server_def is not None:
+        server_def_str = self._collective_ops_server_def.SerializeToString()
+        pywrap_tensorflow.TFE_EnableCollectiveOps(self._context_handle,
+                                                  server_def_str)
 
       self._initialize_devices()
 
@@ -370,6 +411,30 @@ class Context(object):
 
       self._initialize_devices()
 
+  def enable_collective_ops(self, server_def):
+    """Enable collective ops with an appropriate server_def.
+
+    If previously enabled, this cannot be re-enabled.
+
+    Args:
+      server_def: A tensorflow::ServerDef proto. Enables execution on remote
+        devices.
+
+    Raises:
+      ValueError: if server_def is None.
+    """
+    if not server_def:
+      raise ValueError("server_def is None.")
+    if not self._context_handle:
+      self._collective_ops_server_def = server_def
+    else:
+      server_def_str = server_def.SerializeToString()
+      pywrap_tensorflow.TFE_EnableCollectiveOps(self._context_handle,
+                                                server_def_str)
+
+      self._clear_caches()
+      self._initialize_devices()
+
   @property
   def _handle(self):
     ctx = self._context_handle
@@ -410,7 +475,7 @@ class Context(object):
       # Entering graph mode does not provide us with sufficient information to
       # record a context switch; graph-based context switches are only logged
       # when a graph is registered as the default graph.
-      self.context_switches.push(False, eager_mode)
+      self.context_switches.push(False, eager_mode, None)
     try:
       yield
     finally:
@@ -455,6 +520,16 @@ class Context(object):
     """Sets summary writer resource."""
     self._eager_context.summary_writer_resource = resource
 
+  @property
+  def recording_summaries(self):
+    """Returns summary recording condition."""
+    return self._eager_context.recording_summaries
+
+  @recording_summaries.setter
+  def recording_summaries(self, condition):
+    """Sets summary recording condition."""
+    self._eager_context.recording_summaries = condition
+
   @property
   def device_name(self):
     """Returns the device name for the current thread."""
@@ -611,6 +686,10 @@ class Context(object):
     pywrap_tensorflow.TFE_ContextAddFunctionDef(
         self._handle, fdef_string, len(fdef_string))
 
+  def has_function(self, name):
+    """Check if a function `name` is registered."""
+    return bool(pywrap_tensorflow.TFE_ContextHasFunction(self._handle, name))
+
   def add_post_execution_callback(self, callback):
     """Add a post-execution callback to the context.
 
@@ -646,14 +725,6 @@ class Context(object):
     """Get the list of post-execution callbacks added to the context."""
     return self._post_execution_callbacks
 
-  def enable_run_metadata(self):
-    """Enables tracing of op execution via RunMetadata.
-
-    To retrieve the accumulated metadata call context.export_run_metadata()
-    and to stop tracing call context.disable_run_metadata().
-    """
-    pywrap_tensorflow.TFE_ContextEnableRunMetadata(self._handle)
-
   @tf_contextlib.contextmanager
   def device_policy(self, policy):
     handle = self._handle
@@ -666,12 +737,34 @@ class Context(object):
       pywrap_tensorflow.TFE_ContextSetThreadLocalDevicePlacementPolicy(
           handle, old)
 
+  def enable_run_metadata(self):
+    """Enables tracing of op execution via RunMetadata.
+
+    To retrieve the accumulated metadata call context.export_run_metadata()
+    and to stop tracing call context.disable_run_metadata().
+    """
+    pywrap_tensorflow.TFE_ContextEnableRunMetadata(self._handle)
+
   def disable_run_metadata(self):
     """Disables tracing of op execution via RunMetadata."""
     if not self._context_handle:
       return
     pywrap_tensorflow.TFE_ContextDisableRunMetadata(self._context_handle)
 
+  def enable_graph_collection(self):
+    """Enables graph collection of executed functions.
+
+    To retrieve the accumulated graphs call context.export_run_metadata()
+    and to stop collecting graphs call context.disable_graph_collection().
+    """
+    pywrap_tensorflow.TFE_ContextEnableGraphCollection(self._handle)
+
+  def disable_graph_collection(self):
+    """Disables graph collections of executed functions."""
+    if not self._context_handle:
+      return
+    pywrap_tensorflow.TFE_ContextDisableGraphCollection(self._context_handle)
+
   def export_run_metadata(self):
     """Returns a RunMetadata proto with accumulated information.
 
@@ -756,6 +849,27 @@ def in_eager_mode():
   return executing_eagerly()
 
 
+def shared_name(name=None):
+  """Returns the anonymous shared name GUID if no shared name is specified.
+
+  In eager mode we need to use a unique shared name to avoid spurious sharing
+  issues. The runtime generates a unique name on our behalf when the reserved
+  GUID is used as a shared name.
+
+  Args:
+    name: Optional shared name
+
+  Returns:
+    Eager compatible shared name.
+  """
+  if name or not executing_eagerly():
+    return name
+
+  # Ensure a unique name when eager execution is enabled to avoid spurious
+  # sharing issues.
+  return "cd2c89b7-88b7-44c8-ad83-06c2a9158347"
+
+
 def graph_mode():
   """Context-manager to disable eager execution for the current thread."""
   return context()._mode(GRAPH_MODE)  # pylint: disable=protected-access
@@ -807,6 +921,7 @@ def device(name):
   return context().device(name)
 
 
+@tf_export("config.experimental_list_devices")
 def list_devices():
   """List the names of the available devices.
 
@@ -878,6 +993,20 @@ def disable_run_metadata():
   context().disable_run_metadata()
 
 
+def enable_graph_collection():
+  """Enables tracing of op execution via RunMetadata.
+
+  To retrieve the accumulated metadata call context.export_run_metadata()
+  and to stop tracing call context.disable_run_metadata().
+  """
+  context().enable_graph_collection()
+
+
+def disable_graph_collection():
+  """Disables tracing of op execution via RunMetadata."""
+  context().disable_graph_collection()
+
+
 def export_run_metadata():
   """Returns a RunMetadata proto with accumulated information.
 
@@ -923,6 +1052,10 @@ def add_function(fdef):
 # but they do all import this file.  Note that IS_IN_GRAPH_MODE and
 # in_graph_mode are both parameterless functions.
 def _tmp_in_graph_mode():
+  if context_safe() is None:
+    # Context not yet initialized. Assume graph mode following the
+    # default implementation in `is_in_graph_mode`.
+    return True
   return not executing_eagerly()
 
 
diff --git a/tensorflow/python/eager/core_test.py b/tensorflow/python/eager/core_test.py
index e601aa376fa2ef8e0e240e4da03bfcd9ea227bd9..5432abab6054629367e9596029bf2b9f885628da 100644
--- a/tensorflow/python/eager/core_test.py
+++ b/tensorflow/python/eager/core_test.py
@@ -631,7 +631,8 @@ class TFETest(test_util.TensorFlowTestCase):
     for t in tensors:
       self.assertIsInstance(t, ops.EagerTensor)
 
-  def testSmallIntegerOpsForcedToCPU(self):
+  # TODO(b/123637108): re-enable
+  def disabled_testSmallIntegerOpsForcedToCPU(self):
     if not context.context().num_gpus():
       self.skipTest('No GPUs found')
 
diff --git a/tensorflow/python/eager/def_function.py b/tensorflow/python/eager/def_function.py
index 6bacd7a962fdefb8caf11189b0681694d23b97f0..1d54973487ca4c6a0221e376954824d4eba2aacd 100644
--- a/tensorflow/python/eager/def_function.py
+++ b/tensorflow/python/eager/def_function.py
@@ -25,12 +25,14 @@ import weakref
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function as function_lib
 from tensorflow.python.eager import lift_to_graph
+from tensorflow.python.framework import func_graph as func_graph_module
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training.tracking import base as trackable
+from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util.tf_export import tf_export
 
@@ -53,6 +55,7 @@ class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable):
                dtype=None,
                constraint=None,
                add_initializers_to=None,
+               lifted_initializer_graph=None,
                **unused_kwargs):
     """Creates a variable.
 
@@ -84,15 +87,16 @@ class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable):
         (which must have the same shape). Constraints are not safe to
         use when doing asynchronous distributed training.
       add_initializers_to: if not None and not in legacy graph mode, the
-        initializer tensor will be added to this map instead of adding the
+        initializer tensor will be added to this map in addition to adding the
         assignment to the function.
+      lifted_initializer_graph: FuncGraph to try to lift initializers to.
 
     Raises:
       ValueError: If the initial value is not specified, or does not have a
         shape and `validate_shape` is `True`.
       RuntimeError: If called outside of a function definition.
     """
-    if context.executing_eagerly():
+    if not ops.inside_function():
       # If we've been init_scope()d out of the function definition nothing to do
       # here; we can't really do the capturing or conditional logic.
       resource_variable_ops.ResourceVariable.__init__(
@@ -109,8 +113,8 @@ class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable):
     if constraint is not None and not callable(constraint):
       raise ValueError("The `constraint` argument must be a callable.")
 
-    if isinstance(initial_value, checkpointable.CheckpointInitialValue):
-      self._maybe_initialize_checkpointable()
+    if isinstance(initial_value, trackable.CheckpointInitialValue):
+      self._maybe_initialize_trackable()
       self._update_uid = initial_value.checkpoint_position.restore_uid
       initial_value = initial_value.wrapped_value
 
@@ -130,30 +134,36 @@ class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable):
                         if init_from_fn else [initial_value]) as name:
       # pylint: disable=protected-access
       with ops.init_scope():
-        shared_name = ops._name_from_scope_name(name)
-        shared_name = "%s_%d" % (shared_name, ops.uid())
+        handle_name = ops._name_from_scope_name(name)
+        unique_id = "%s_%d" % (handle_name, ops.uid())
+        shared_name = context.shared_name(unique_id)
       with ops.name_scope("Initializer"), ops.device(None):
         initial_value = ops.convert_to_tensor(
             initial_value() if init_from_fn else initial_value,
             name="initial_value", dtype=dtype)
       with ops.init_scope():
         self._handle = resource_variable_ops.eager_safe_variable_handle(
-            shape=initial_value.get_shape(),
-            dtype=initial_value.dtype.base_dtype,
+            initial_value=initial_value,
             shared_name=shared_name,
             name=name,
             graph_mode=self._in_graph_mode)
       self._shape = initial_value.shape
-      self._unique_id = shared_name
-      self._handle_name = shared_name + ":0"
+      self._unique_id = unique_id
+      self._handle_name = handle_name + ":0"
       self._dtype = initial_value.dtype.base_dtype
       self._constraint = constraint
       assert initial_value is not None
       if self._in_graph_mode:
         with ops.init_scope():
           outer_graph = ops.get_default_graph()
+        func_graph = ops.get_default_graph()
+        function_placeholders = (
+            func_graph.inputs + func_graph.internal_captures)
+        placeholder_ops = set(
+            [tensor.op for tensor in function_placeholders])
         lifted_initializer = lift_to_graph.lift_to_graph(
-            initial_value, outer_graph)[initial_value]
+            [initial_value], outer_graph,
+            disallowed_placeholders=placeholder_ops)[initial_value]
         with ops.init_scope():
           self._initial_value = lifted_initializer
           with ops.name_scope("IsInitialized"):
@@ -173,22 +183,21 @@ class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable):
       else:
         if add_initializers_to is not None:
           add_initializers_to[self] = initial_value
-        else:
-          def assign_fn():
-            with ops.name_scope("Assign") as n, ops.colocate_with(self._handle):
-              resource_variable_ops.assign_variable_op(
-                  self._handle,
-                  initial_value,
-                  name=n)
-              # Returning values to keep tf.cond happy.
-            return ops.convert_to_tensor(1)
-          def not_assign_fn():
-            return ops.convert_to_tensor(0)
-          # Note: this cond is always guaranteed to run because we're inside a
-          # defun which will insert automatic control dependencies.
-          control_flow_ops.cond(
-              resource_variable_ops.var_is_initialized_op(self._handle),
-              not_assign_fn, assign_fn)
+        def assign_fn():
+          with ops.name_scope("Assign") as n, ops.colocate_with(self._handle):
+            resource_variable_ops.assign_variable_op(
+                self._handle,
+                initial_value,
+                name=n)
+            # Returning values to keep tf.cond happy.
+          return ops.convert_to_tensor(1)
+        def not_assign_fn():
+          return ops.convert_to_tensor(0)
+        # Note: this cond is always guaranteed to run because we're inside a
+        # defun which will insert automatic control dependencies.
+        control_flow_ops.cond(
+            resource_variable_ops.var_is_initialized_op(self._handle),
+            not_assign_fn, assign_fn)
 
     # After the handle has been created, set up a way to clean it up when
     # executing eagerly. We'll hold the only reference to the deleter, so that
@@ -201,13 +210,49 @@ class UnliftedInitializerVariable(resource_variable_ops.ResourceVariable):
     self._cached_shape_as_list = None
 
 
-class PolymorphicFunction(object):
+RUN_FUNCTIONS_EAGERLY = False
+
+
+@tf_export("config.experimental_run_functions_eagerly")
+def run_functions_eagerly(run_eagerly):
+  """Enables / disables eager execution of `tf.function`s.
+
+  After calling `tf.config.experimental_run_functions_eagerly(True)` all
+  invocations of tf.function will run eagerly instead of running through a graph
+  function.
+
+  This can be useful for debugging or profiling.
+
+  Similarly, calling `tf.config.experimental_run_functions_eagerly(False)` will
+  revert the behavior of all functions to graph functions.
+
+  Args:
+    run_eagerly: Boolean. Whether to run functions eagerly.
+  """
+  global RUN_FUNCTIONS_EAGERLY
+  RUN_FUNCTIONS_EAGERLY = bool(run_eagerly)
+
+
+class FunctionDeleter(object):
+
+  def __init__(self, func_graph):
+    self.func_graph = func_graph
+
+  def __del__(self):
+    try:
+      func_graph_module.dismantle_func_graph(self.func_graph)
+    except:  # pylint: disable=bare-except
+      # Note: bare except here because this can be noisy at shutdown time.
+      pass
+
+
+class Function(object):
   """Wrapper class for the graph functions defined for a Python function.
 
   See the documentation for `tf.function` for more information on the semantics
   of defined functions.
 
-  PolymorphicFunction is thread-compatible.
+  `Function` is thread-compatible.
   """
 
   def __init__(self,
@@ -216,7 +261,7 @@ class PolymorphicFunction(object):
                input_signature=None,
                autograph=True,
                experimental_autograph_options=None):
-    """Initializes a polymorphic function.
+    """Initializes a `Function`.
 
     Args:
       python_function: the function to be wrapped.
@@ -236,47 +281,93 @@ class PolymorphicFunction(object):
     """
     self._python_function = python_function
     self._input_signature = input_signature
+    # TODO(vbardiovsky): Both _stateful_fn and _stateless_fn are populating the
+    # same FunctionSpec. Consider removing it from both and passing in instead.
+    self._function_spec = function_lib.FunctionSpec.from_function_and_signature(
+        python_function, input_signature)
     self._autograph = autograph
     self._experimental_autograph_options = experimental_autograph_options
-    if self._experimental_autograph_options is not None:
-      raise NotImplementedError()
     self._created_variables = None
     self._stateful_fn = None
+    self._stateless_fn = None
     self._descriptor_cache = weakref.WeakKeyDictionary()
     self._name = name
 
   def _defun_with_scope(self, scope):
     """Creates a defun wrapped inside a variable creator scope."""
 
+    weak_wrapped_fn = None
     def wrapped_fn(*args, **kwds):
-      with variable_scope.variable_creator_scope(scope):
-        # __wrapped__ allows AutoGraph to swap in a converted function.
-        return wrapped_fn.__wrapped__(*args, **kwds)
+      """Wraps `self._python_function` in a variable creator scope."""
+      # We register a variable creator with reduced priority. If an outer
+      # variable creator is just modifying keyword arguments to the variable
+      # constructor, this will work harmoniously. Since the `scope` registered
+      # here actually creates the variable, it taking priority would otherwise
+      # ignore the outer creator.
+      #
+      # If an outer variable creator calls the variable constructor manually,
+      # for example creating a MirroredVariable, then they won't call our
+      # creator. This means we won't be able to trace the initialization graph,
+      # and so variable initializers can't depend on function arguments. This is
+      # better than the alternative, tracing the initialization graph but giving
+      # the user a variable type they didn't want.
+      with ops.get_default_graph()._variable_creator_scope(scope, priority=50):  # pylint: disable=protected-access
+        # __wrapped__ allows AutoGraph to swap in a converted function. We give
+        # the function a weak reference to itself to avoid a reference cycle.
+        return weak_wrapped_fn().__wrapped__(*args, **kwds)
+    weak_wrapped_fn = weakref.ref(wrapped_fn)
 
     # TODO(mdan): Pipe self._experimental_autograph_options through.
     return function_lib.defun(
         tf_decorator.make_decorator(self._python_function, wrapped_fn),
         input_signature=self._input_signature,
-        autograph=self._autograph)
+        autograph=self._autograph,
+        experimental_autograph_options=self._experimental_autograph_options)
+
+  def _canonicalize_function_inputs(self, args, kwds):
+    """Canonicalize the inputs to the Python function."""
+    if self._input_signature is None or args or kwds:
+      return self._function_spec.canonicalize_function_inputs(*args, **kwds)  # pylint: disable=protected-access
+    # If an input signature is defined, we may need to fetch a concrete function
+    # without any inputs specified. In this case args and kwds should be ignored
+    # but running _canonicalize_function_inputs would raise an exception.
+    return (), {}
 
   def _initialize(self, args, kwds, add_initializers_to=None):
-    """Initializes, on the first call."""
+    """Initializes, on the first call.
+
+    Creates two `Function`s, one that will allow creation of variables
+    and one that won't.
+
+    Additionally runs a trace for the `Function` that allows creation
+    of variables.
+
+    Args:
+      args: Arguments to the underlying python callable.
+      kwds: Keyword arguments to the python callable.
+      add_initializers_to: Where to collect variable initializers, if not None.
+    """
 
-    self._created_variables = []
+    created_variables = []
+    lifted_initializer_graph = func_graph_module.FuncGraph("initializer")
 
     def variable_capturing_scope(unused_next_creator, **kwds):
       """Creates UnliftedInitializerVariables and saves references to them."""
       v = UnliftedInitializerVariable(
-          add_initializers_to=add_initializers_to, **kwds)
-      self._created_variables.append(weakref.ref(v))
+          add_initializers_to=add_initializers_to,
+          lifted_initializer_graph=lifted_initializer_graph, **kwds)
+      created_variables.append(weakref.ref(v))
       return v
 
+    self._created_variables = created_variables
     self._stateful_fn = self._defun_with_scope(variable_capturing_scope)
     self._stateful_fn._name = self._name  # pylint: disable=protected-access
-
     # Force the definition of the function for these arguments
+    self._lifted_initializer_graph = lifted_initializer_graph
+    self._graph_deleter = FunctionDeleter(self._lifted_initializer_graph)
     self._concrete_stateful_fn = (
-        self._stateful_fn._get_concrete_function_internal(*args, **kwds))  # pylint: disable=protected-access
+        self._stateful_fn._get_concrete_function_internal_garbage_collected(  # pylint: disable=protected-access
+            *args, **kwds))
 
     def invalid_creator_scope(*unused_args, **unused_kwds):
       """Disables variable creation."""
@@ -286,14 +377,36 @@ class PolymorphicFunction(object):
 
     self._stateless_fn = self._defun_with_scope(invalid_creator_scope)
     self._stateless_fn._name = self._name  # pylint: disable=protected-access
-    if self._input_signature is None or args or kwds:
-      return self._stateful_fn._canonicalize_function_inputs(*args, **kwds)  # pylint: disable=protected-access
-    # If an input signature is defined, we may need to fetch a concrete function
-    # without any inputs specified. In this case args and kwds should be ignored
-    # but running _canonicalize_function_inputs would raise an exception.
-    return (), {}
+
+  def _decorate(self, decorator):
+    """Allows the captured Python function to be decorated in place.
+
+    This method is only safe to call when the Function has not been called by a
+    user. It makes sense to use this method to push a decorator into the
+    function rather than wrapping the function in the decorator.
+
+    We use this in tf.Module to allow user annotated `tf.functions` to remain as
+    `Function` objects but still automatically enter the Module name_scope
+    when they are evaluated like all other methods.
+
+    Args:
+      decorator: A callable accepting a single argument which is the function
+        to decorate and returning a callable result.
+
+    Raises:
+      ValueError: If the function has been called a ValueError is raised.
+    """
+    if self._stateful_fn is not None or self._stateless_fn is not None:
+      raise ValueError(
+          "Functions cannot be decorated after they have been traced.")
+
+    self._python_function = decorator(self._python_function)
+    self._function_spec = function_lib.FunctionSpec.from_function_and_signature(
+        self._python_function, self._input_signature)
 
   def __call__(self, *args, **kwds):
+    if RUN_FUNCTIONS_EAGERLY:
+      return self._python_function(*args, **kwds)
     """Calls the graph function."""
     if self._created_variables:
       # In this case we have created variables on the first call, so we run the
@@ -308,9 +421,23 @@ class PolymorphicFunction(object):
                          " decorated with tf.function.")
       return results
 
-    canon_args, canon_kwds = self._initialize(args, kwds)
-
-    if not self._created_variables:
+    # This is the first call of __call__, so we have to initialize.
+    initializer_map = {}
+    self._initialize(args, kwds, add_initializers_to=initializer_map)
+    if self._created_variables:
+      try:
+        # Attempt to initialize variables eagerly and without conds by lifting
+        # out initialization graphs. This is the only initialization strategy
+        # compatible with XLA at the moment.
+        self._initialize_uninitialized_variables(initializer_map)
+      except lift_to_graph.UnliftableError:
+        pass  # Fall through to cond-based initialization.
+      else:
+        # Lifting succeeded, so variables are initialized and we can run the
+        # stateless function.
+        return self._stateless_fn(*args, **kwds)
+    else:
+      canon_args, canon_kwds = self._canonicalize_function_inputs(args, kwds)
       # If we did not create any variables the trace we have is good enough.
       return self._concrete_stateful_fn._filtered_call(canon_args, canon_kwds)  # pylint: disable=protected-access
 
@@ -321,9 +448,39 @@ class PolymorphicFunction(object):
         variable = wr()
         if variable is None:
           raise ValueError(
-              "Variable created in a tf.function garbage-collected. Code needs"
-              " to keep python references to variables created in a"
-              " tf.function.")
+              "A tf.Variable created inside your tf.function has been"
+              " garbage-collected. Your code needs to keep Python references"
+              " to variables created inside `tf.function`s.\n"
+              "\n"
+              "A common way to raise this error is to create and return a"
+              " variable only referenced inside your function:\n"
+              "\n"
+              "@tf.function\n"
+              "def f():\n"
+              "  v = tf.Variable(1.0)\n"
+              "  return v\n"
+              "\n"
+              "v = f()  # Crashes with this error message!\n"
+              "\n"
+              "The reason this crashes is that @tf.function annotated"
+              " function returns a **`tf.Tensor`** with the **value** of the"
+              " variable when the function is called rather than the"
+              " variable instance itself. As such there is no code holding a"
+              " reference to the `v` created inside the function and Python"
+              " garbage collects it.\n"
+              "\n"
+              "The simplest way to fix this issue is to create variables"
+              " outside the function and capture them:\n"
+              "\n"
+              "v = tf.Variable(1.0)\n"
+              "\n"
+              "@tf.function\n"
+              "def f():\n"
+              "  return v\n"
+              "\n"
+              "f()  # <tf.Tensor: ... numpy=1.>\n"
+              "v.assign_add(1.)\n"
+              "f()  # <tf.Tensor: ... numpy=2.>")
         condition = math_ops.logical_and(
             condition, resource_variable_ops.var_is_initialized_op(
                 variable.handle))
@@ -335,6 +492,9 @@ class PolymorphicFunction(object):
           functools.partial(self._concrete_stateful_fn._filtered_call,  # pylint: disable=protected-access
                             inner_args, inner_kwds))
 
+    # We've created variables and are unable to lift the initialization graphs,
+    # so we fall back to initializing with conds while running the function.
+    canon_args, canon_kwds = self._canonicalize_function_inputs(args, kwds)
     return function_lib.defun(fn_with_cond)(*canon_args, **canon_kwds)
 
   @property
@@ -342,20 +502,49 @@ class PolymorphicFunction(object):
     """The python function wrapped in this tf.function."""
     return self._python_function
 
+  @property
+  def input_signature(self):
+    return self._input_signature
+
+  @property
+  def function_spec(self):
+    return self._function_spec
+
+  def _initialize_uninitialized_variables(self, initializer_map):
+    """Make and call a `ConcreteFunction` which initializes variables."""
+
+    # Note: using defun here avoids an infinite recursion.
+    @function_lib.defun
+    def initialize_variables():
+      for v, init in initializer_map.items():
+        with ops.init_scope():
+          if resource_variable_ops.var_is_initialized_op(v.handle):
+            # Ignore variables which are already initialized at trace time.
+            continue
+        v.assign(lift_to_graph.lift_to_graph(
+            [init], ops.get_default_graph())[init])
+
+    with ops.init_scope():
+      return initialize_variables.get_concrete_function()()
+
   def get_initialization_function(self, *args, **kwargs):
-    """Returns a `Function` object which initializes this function's variables.
+    """Returns a `ConcreteFunction` which initializes this function's variables.
 
     Requires that this function hasn't been accessed yet through either calling
     it or calling get_concrete_function. Fails if we cannot build an initializer
     function which does not depend on the concrete values of the inputs to this
     function.
 
+    Note that running this function will overwrite any values currently assigned
+    to variables, for example restores from a checkpoint.
+
     Args:
       *args: arguments to the underlying python callable.
       **kwargs: keyword arguments to the python callable.
 
     Returns:
-      A `Function` object which initializes the variables of this function.
+      A `ConcreteFunction` object which initializes the variables of this
+      function.
 
     Raises:
       RuntimeError: if called after the variables have been initialized.
@@ -374,22 +563,61 @@ class PolymorphicFunction(object):
     def initialize_variables():
       for v, init in initializer_map.items():
         v.assign(lift_to_graph.lift_to_graph(
-            init, ops.get_default_graph())[init])
+            [init], ops.get_default_graph())[init])
 
     return initialize_variables.get_concrete_function()
 
+  def _list_all_concrete_functions_for_serialization(self):
+    """Returns all concrete functions for serialization.
+
+    Returns:
+      A list of instances of `Function`.
+    """
+    if self._input_signature is not None:
+      self.get_concrete_function()
+    concrete_functions = []
+    # pylint: disable=protected-access
+    if self._stateful_fn:
+      concrete_functions.extend(
+          self._stateful_fn._function_cache.all_values())
+    if self._stateless_fn:
+      concrete_functions.extend(
+          self._stateless_fn._function_cache.all_values())
+    # pylint: enable=protected-access
+    deduplicated_concrete_functions = list()
+    seen_signatures = list()
+    # We are using a list so that:
+    #  - the returned collection is deterministic, and
+    #  - we can use a custom equality operator (is_same_structure).
+    # This is run only at serialization time on likely very small inputs so we
+    # are not concerned about O(n^2) runtime.
+    for concrete_function in concrete_functions:
+      signature, _ = concrete_function.structured_input_signature
+      flattened = nest.flatten(signature)
+      if any(
+          isinstance(arg, func_graph_module.UnknownArgument)
+          for arg in flattened):
+        logging.info("Unsupported signature for serialization: %s.", signature)
+        continue
+      equal_to_signature = functools.partial(
+          function_lib.is_same_structure, signature, check_values=True)
+      if not any(equal_to_signature(s) for s in seen_signatures):
+        deduplicated_concrete_functions.append(concrete_function)
+        seen_signatures.append(signature)
+    return deduplicated_concrete_functions
+
   def get_concrete_function(self, *args, **kwargs):
-    """Returns a `Function` object specialized to inputs and execution context.
+    """Returns a `ConcreteFunction` specialized to inputs and execution context.
 
-    If this `PolymorphicFunction` was created with an `input_signature`, `args`
-    and `kwargs` may be omitted. With an input signature there is only one
-    concrete function associated with this `PolymorphicFunction`.
+    If this `Function` was created with an `input_signature`, `args` and
+    `kwargs` may be omitted. With an input signature there is only one
+    concrete function associated with this `Function`.
 
     If there is no fixed `input_signature` associated with this
-    `PolymorphicFunction`, positional and keyword arguments to
-    `get_concrete_function` follow the same rules as input signature
-    specification, with `tf.TensorSpec` objects describing `tf.Tensor`s which
-    will be passed to the concrete function.
+    `Function`, positional and keyword arguments to `get_concrete_function`
+    follow the same rules as input signature specification, with `tf.TensorSpec`
+    objects describing `tf.Tensor`s which will be passed to the concrete
+    function.
 
     Each `tf.Tensor` argument to the concrete function must have a unique name,
     either because it is the only one associated with a named argument of the
@@ -454,9 +682,10 @@ class PolymorphicFunction(object):
     Raises:
       ValueError: if this object has not yet been called on concrete values.
     """
-    assert context.executing_eagerly()
     if self._stateful_fn is None:
-      self.get_initialization_function(*args, **kwargs)()
+      initializer_map = {}
+      self._initialize(args, kwargs, add_initializers_to=initializer_map)
+      self._initialize_uninitialized_variables(initializer_map)
 
     if self._created_variables:
       # In this case we have created variables on the first call, so we run the
@@ -474,8 +703,8 @@ class PolymorphicFunction(object):
   def __get__(self, instance, owner):
     """Makes it possible to defun instance methods."""
     del owner
-    # `instance` here is the instance that this `PolymorphicFunction` was
-    # accessed through; e.g., for
+    # `instance` here is the instance that this `Function` was accessed through
+    # e.g., for
     #
     #   class Foo(object):
     #
@@ -484,10 +713,10 @@ class PolymorphicFunction(object):
     #       ...
     #
     #   foo = Foo()
-    #   foo.bar()  # `foo.bar` is a `PolymorphicFunction` instance
+    #   foo.bar()  # `foo.bar` is a `Function` instance
     #
     # then `instance` will be `foo` (and `owner` will be `Foo`).  We create a
-    # new instance of PolymorphicFunction here to allow different instances each
+    # new instance of `Function` here to allow different instances each
     # to create variables once, thereby allowing methods to be decorated with
     # tf.function. Keeps a cache to avoid retracing the function every time the
     # descriptor is accessed.
@@ -499,8 +728,7 @@ class PolymorphicFunction(object):
     return self._descriptor_cache[instance]
 
 
-# In TensorFlow 1.x, exported as tf.contrib.eager.function
-@tf_export("function", v1=[])
+@tf_export("function")
 def function(func=None,
              input_signature=None,
              autograph=True,
@@ -528,14 +756,38 @@ def function(func=None,
   assert f(x, y).numpy() == g(x, y).numpy()
 
   # Tensors and tf.Variables used by the Python function are captured in the
-  # traced graph.
+  # graph.
   @tf.function
   def h():
     return f(x, y)
 
   assert (h().numpy() == f(x, y).numpy()).all()
+
+  # Data-dependent control flow is also captured in the graph. Supported
+  # control flow statements include `if`, `for`, `break`, `continue`, `return`.
+  @tf.function
+  def g(x):
+    if tf.reduce_sum(x) > 0:
+      return x * x
+    else:
+      return -x // 2
+
+  # print and TensorFlow side effects are supported, but exercise caution when
+  # using Python side effects like mutating objects, saving to files, etc.
+  l = []
+
+  @tf.function
+  def g(x):
+    for i in x:
+      print(i)                              # Works
+      tf.assign(v, i)                       # Works
+      tf.py_func(lambda i: l.append(i))(i)  # Works
+      l.append(i)                           # Caution! Doesn't work.
   ```
 
+  Note that unlike other TensorFlow operations, we don't convert python
+  numerical inputs to tensors.
+
   _Referencing `tf.Variable`s_
 
   The Python function `func` may reference stateful objects (such as
@@ -605,6 +857,7 @@ def function(func=None,
   ```
 
   _Input Signatures_
+
   `function` instantiates a separate graph for every unique set of input
   shapes and datatypes. For example, the following code snippet will result
   in three distinct graphs being traced, as each input has a different
@@ -635,12 +888,18 @@ def function(func=None,
   def f(x): return tf.add(x, 1.)
   ```
 
-  When an `input_signature` is specified, the callable will only accept `Tensor`
-  (or NumPy `ndarray`) objects as arguments.
+  When an `input_signature` is specified, the callable will convert the inputs
+  to the specified TensorSpecs.
 
-  _Tracing_
-  Note that `function` only traces TensorFlow operations, all the other
-  Python code that `func` executes will shape the _construction_ of the graph.
+  _Tracing and staging_
+
+  When `autograph` is `True`, all Python code that depends on `Tensor` values is
+  staged into a TensorFlow graph. When `autograph` is `False`, the function is
+  traced and control flow is not allowed to depend on data.
+
+  Note that `function` only stages TensorFlow operations, all Python code that
+  `func` executes and does not depend on data will shape the _construction_ of
+  the graph.
   For example, consider the following:
 
   ```python
@@ -653,21 +912,26 @@ def function(func=None,
   ```
 
   `add_noise()` will return a different output every time it is invoked.
-  However, `traced` will return the same value every time it is called, since a
-  particular random value generated by the `np.random.randn` call will be
-  inserted in the traced TensorFlow graph as a constant. In this particular
-  example, replacing `np.random.randn(5, 5)` with `tf.random_normal((5, 5))`
-  will result in the same behavior for `add_noise()` and `traced()`.
+  However, `traced()` will return the same value every time it is called,
+  since a particular random value generated by the `np.random.randn` call will
+  be inserted in the traced/staged TensorFlow graph as a constant. In this
+  particular example, replacing `np.random.randn(5, 5)` with
+  `tf.random_normal((5, 5))` will result in the same behavior for `add_noise()`
+  and `traced()`.
 
   _Python Side-Effects_
+
   A corollary of the previous discussion on tracing is the following: If a
   Python function `func` has Python side-effects, then executing `func` multiple
-  times
-  may not be semantically equivalent to executing `F = tf.function(func)`
+  times may not be semantically equivalent to executing `F = tf.function(func)`
   multiple times; this difference is due to the fact that `function` only
   captures the subgraph of TensorFlow operations that is constructed when `func`
   is invoked to trace a graph.
 
+  The same is true if code with Python side effects is used inside control flow,
+  such as a loop. If your code uses side effects that are not intended to
+  control graph construction, wrap them inside `tf.py_func`.
+
   Args:
     func: function to be compiled. If `func` is None, returns a decorator that
       can be invoked with a single argument - `func`. The end result is
@@ -710,7 +974,7 @@ def function(func=None,
       name = "function"
     return tf_decorator.make_decorator(
         inner_function,
-        PolymorphicFunction(
+        Function(
             inner_function,
             name,
             input_signature=input_signature,
diff --git a/tensorflow/python/eager/def_function_test.py b/tensorflow/python/eager/def_function_test.py
index 4100a10044c3c39763de8bb3eec645e278d94e19..fdf054a9990d9c12109088a509c6d762dea310ee 100644
--- a/tensorflow/python/eager/def_function_test.py
+++ b/tensorflow/python/eager/def_function_test.py
@@ -18,17 +18,26 @@ from __future__ import division
 from __future__ import print_function
 
 import functools
+import weakref
 
 from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
 from tensorflow.python.eager import def_function
+from tensorflow.python.eager import lift_to_graph
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_spec
+from tensorflow.python.framework import test_util
 from tensorflow.python.keras.engine import training
 from tensorflow.python.keras.layers import core
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import adam
@@ -53,6 +62,28 @@ class _ModelWithOptimizer(training.Model):
     return {'loss': loss}
 
 
+class _HasDecoratedMethod(object):
+
+  @def_function.function
+  def f(self, x):
+    return x * 3.
+
+# pylint: disable=bad-continuation,anomalous-backslash-in-string
+MIXING_GRAPH_EAGER_TENSORS_ERROR = (
+"""An op outside of the function building code is being passed
+a "Graph" tensor. It is possible to have Graph tensors
+leak out of the function building context by including a
+tf.init_scope in your function building code.
+For example, the following function will fail:
+  @tf.function
+  def has_init_scope\(\):
+    my_constant = tf.constant\(1.\)
+    with tf.init_scope\(\):
+      added = my_constant \* 2
+The graph tensor has name: Const:0""")
+# pylint: enable=bad-continuation,anomalous-backslash-in-string
+
+
 class DefFunctionTest(test.TestCase):
 
   def testNoVariables(self):
@@ -183,7 +214,8 @@ class DefFunctionTest(test.TestCase):
           state.append(variables.Variable(2.0 * x))
         return state[0] * x
 
-      with self.assertRaises(ValueError):
+      with self.assertRaisesRegexp(
+          lift_to_graph.UnliftableError, r'transitively.* mul .* x'):
         fn(constant_op.constant(3.0))
 
   def testMethod(self):
@@ -238,6 +270,233 @@ class DefFunctionTest(test.TestCase):
     concrete = compute.get_concrete_function(
         tensor_spec.TensorSpec(None, dtypes.float32))
     self.assertAllClose(4., concrete(constant_op.constant(2.)))
+    signature_args, _ = concrete.structured_input_signature
+    self.assertEqual(signature_args,
+                     (tensor_spec.TensorSpec(
+                         None, dtypes.float32, name='x'),))
+
+  def test_error_inner_capture(self):
+
+    @def_function.function
+    def f(inputs):
+      num_steps, _ = inputs.shape[:2]
+      outputs = []
+      for t in math_ops.range(num_steps):
+        outputs.append(inputs[t])
+      return outputs
+
+    with self.assertRaisesRegexp(ValueError, 'inner'):
+      f(array_ops.zeros(shape=(8, 42, 3)))
+
+  def testRuntimeErrorNotSticky(self):
+
+    @def_function.function
+    def fail(i):
+      control_flow_ops.Assert(math_ops.equal(i, 0), ['ick'])
+
+    fail(constant_op.constant(0))  # OK
+    with self.assertRaises(errors.InvalidArgumentError):
+      fail(constant_op.constant(1))  # InvalidArgument: "ick"
+    fail(constant_op.constant(0))  # OK
+
+  def testUnderscoreName(self):
+
+    @def_function.function
+    def f(_):
+      return _ + _
+
+    self.assertAllEqual(2.0, f(constant_op.constant(1.0)))
+
+  def test_serialization_signature_cache(self):
+
+    @def_function.function
+    def f(x, y):
+      return x, y
+
+    f(constant_op.constant([[3., 4.]]), constant_op.constant([2.]))
+    f(constant_op.constant([[3, 4, 5]]), constant_op.constant([2]))
+
+    signatures_args = set()
+    concrete_functions = f._list_all_concrete_functions_for_serialization()
+    for concrete_function in concrete_functions:
+      args, kwargs = concrete_function.structured_input_signature
+      signatures_args.add(args)
+      self.assertEqual(dict(), kwargs)
+
+    self.assertEqual(
+        signatures_args,
+        set(((tensor_spec.TensorSpec([1, 2], dtypes.float32, name='x'),
+              tensor_spec.TensorSpec([1], dtypes.float32, name='y')),
+             (tensor_spec.TensorSpec([1, 3], dtypes.int32, name='x'),
+              tensor_spec.TensorSpec([1], dtypes.int32, name='y')))))
+
+  @test_util.assert_no_garbage_created
+  def testFunctionReferenceCycles(self):
+    fn = def_function.function(lambda x: 2. * x)
+    fn(constant_op.constant(4.0))
+    weak_fn = weakref.ref(fn)
+    del fn
+    # Tests that the weak reference we made to the function is now dead, which
+    # means the object has been deleted. This should be true as long as the
+    # function itself is not involved in a reference cycle.
+    self.assertIs(None, weak_fn())
+
+  @test_util.assert_no_garbage_created
+  def testMethodReferenceCycles(self):
+    has_decorated_method = _HasDecoratedMethod()
+    has_decorated_method.f(constant_op.constant(5.))
+    weak_fn = weakref.ref(has_decorated_method.f)
+    del has_decorated_method
+    # Tests that the weak reference we made to the function is now dead, which
+    # means the object has been deleted. This should be true as long as the
+    # function itself is not involved in a reference cycle.
+    self.assertIs(None, weak_fn())
+
+  def testErrorMessageWhenGraphTensorIsPassedToEager(self):
+
+    @def_function.function
+    def failing_function():
+      a = constant_op.constant(1.)
+
+      with ops.init_scope():
+        _ = a + a
+
+    with self.assertRaisesRegexp(TypeError, MIXING_GRAPH_EAGER_TENSORS_ERROR):
+      failing_function()
+
+  def testVariableCreatorScope(self):
+    created_variables = []
+    captured_variables = []
+
+    @def_function.function
+    def f():
+      if not created_variables:
+        created_variables.append(variables.Variable(1.))
+      return created_variables[0] + 1.
+
+    def capture_creator(next_creator, **kwargs):
+      created = next_creator(**kwargs)
+      captured_variables.append(created)
+      return created
+
+    with variable_scope.variable_creator_scope(capture_creator):
+      f()
+    self.assertEqual(created_variables, captured_variables)
+
+  def testVarAlreadyInitializedNoClobbering(self):
+    v_holder = []
+
+    @def_function.function
+    def add_var(x):
+      if not v_holder:
+        v = variables.Variable([1., 2.])
+        v_holder.append(v)
+        already_initialized = variables.Variable(3.)
+        with ops.init_scope():
+          already_initialized.assign(10.)
+        v_holder.append(already_initialized)
+      return v_holder[0] + v_holder[1] + x
+
+    add_var.get_concrete_function(constant_op.constant(2.))
+    self.assertAllClose([13., 14.], add_var(constant_op.constant(2.)))
+
+  def testSameVariableTwice(self):
+
+    v = variables.Variable(1.0)
+
+    @def_function.function
+    def add(a, b):
+      return a + b
+
+    self.assertAllEqual(add(v, v), 2.0)
+
+  def testShapeCache(self):
+    @def_function.function
+    def func(x):
+      return 2 * x
+
+    func_a = func.get_concrete_function(
+        tensor_spec.TensorSpec([None], dtypes.int32))
+    func_b = func.get_concrete_function(
+        tensor_spec.TensorSpec([None], dtypes.int32))
+
+    self.assertIs(func_a, func_b)
+
+  def testInitializationInNestedCall(self):
+    v_holder = []
+
+    @def_function.function
+    def add_var(x):
+      if not v_holder:
+        v = variables.Variable([1., 2.])
+        v_holder.append(v)
+        already_initialized = variables.Variable(3.)
+        with ops.init_scope():
+          already_initialized.assign(10.)
+        v_holder.append(already_initialized)
+      return v_holder[0] + v_holder[1] + x
+
+    @def_function.function
+    def wrapper(x):
+      return add_var(x)
+
+    self.assertAllClose([13., 14.], wrapper(constant_op.constant(2.)))
+    v_holder[1].assign(11.)
+    self.assertAllClose([14., 15.], wrapper(constant_op.constant(2.)))
+
+  def testDeviceAnnotationRespected(self):
+    if not context.num_gpus():
+      self.skipTest("Needs multiple devices")
+
+    a = []
+
+    @def_function.function()
+    def create_variable():
+      with ops.init_scope():
+        initial_value = random_ops.random_uniform(
+            (2, 2), maxval=1000000, dtype=dtypes.int64)
+
+      if not a:
+        with ops.device("CPU:0"):
+          a.append(resource_variable_ops.ResourceVariable(initial_value))
+
+      return a[0].read_value()
+
+    created_variable_read = create_variable()
+    self.assertRegexpMatches(created_variable_read.device, "CPU")
+
+  def testDecorate(self):
+    func = def_function.function(lambda: 1)
+    def decorator(f):
+      return lambda: 1 + f()
+
+    func._decorate(decorator)
+    self.assertEqual(func().numpy(), 2)
+
+  def testLiftPlaceholderInitializedVariable(self):
+    with ops.Graph().as_default():
+      var_list = []
+
+      @def_function.function
+      def use_variable():
+        if not var_list:
+          initial_value = array_ops.placeholder(shape=[], dtype=dtypes.float32)
+          v = variables.Variable(initial_value)
+          var_list.append(v)
+        return var_list[0] + 1.
+
+      var_plus_one = use_variable()
+      with self.session() as session:
+        init_op = var_list[0].initializer
+        session.run(init_op, feed_dict={init_op.inputs[1]: 2.})
+        self.assertEqual(3., session.run(var_plus_one))
+
+  def testDecorate_rejectedAfterTrace(self):
+    func = def_function.function(lambda: 1)
+    self.assertEqual(func().numpy(), 1)
+    msg = 'Functions cannot be decorated after they have been traced.'
+    with self.assertRaisesRegexp(ValueError, msg):
+      func._decorate(lambda f: f)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/eager/def_function_xla_test.py b/tensorflow/python/eager/def_function_xla_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9115d8a6943532fb87f1514ee20354067015a7d8
--- /dev/null
+++ b/tensorflow/python/eager/def_function_xla_test.py
@@ -0,0 +1,49 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.compiler.tests import xla_test
+from tensorflow.python.eager import def_function
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class DefFunctionTests(xla_test.XLATestCase):
+
+  def testVarInitializedInFunction(self):
+    with self.test_scope():
+      v_holder = []
+
+      @def_function.function
+      def add_var(x):
+        if not v_holder:
+          v = variables.Variable([1., 2.])
+          v_holder.append(v)
+          already_initialized = variables.Variable(3.)
+          with ops.init_scope():
+            already_initialized.assign(10.)
+          v_holder.append(already_initialized)
+        return v_holder[0] + v_holder[1] + x
+
+      self.assertAllClose([13., 14.], add_var(constant_op.constant(2.)))
+
+
+if __name__ == "__main__":
+  ops.enable_eager_execution()
+  test.main()
diff --git a/tensorflow/python/eager/execute.py b/tensorflow/python/eager/execute.py
index 6f8c780170cc8e3bfe5aa23603c0448e70b5e49c..7415a0ae22a3492fc9179a0cae37d09e9c1ad9aa 100644
--- a/tensorflow/python/eager/execute.py
+++ b/tensorflow/python/eager/execute.py
@@ -66,12 +66,6 @@ def quick_execute(op_name, num_outputs, inputs, attrs, ctx, name=None):
     six.raise_from(core._status_to_exception(e.code, message), None)
   except TypeError as e:
     if any(ops._is_keras_symbolic_tensor(x) for x in inputs):
-      if any(isinstance(x, ops.EagerTensor) for x in inputs):
-        raise TypeError("You are attempting to mix computation of symbolic "
-                        "Tensors (computation rooted at tf.keras.Input()) "
-                        "and concrete values. This is not supported. "
-                        "If you need this support, file an issue on the "
-                        "TensorFlow GitHub repository.")
       raise core._SymbolicException
     raise e
   # pylint: enable=protected-access
diff --git a/tensorflow/python/eager/execution_callbacks.py b/tensorflow/python/eager/execution_callbacks.py
index 28b6b84a82c6550cd0e1b893b5002d13b306233d..34fa3da39abc2a7311723a38011785145f23792c 100644
--- a/tensorflow/python/eager/execution_callbacks.py
+++ b/tensorflow/python/eager/execution_callbacks.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import contextlib
 import functools
+import enum  # pylint: disable=g-bad-import-order
 
 import numpy as np
 
@@ -29,13 +30,25 @@ from tensorflow.python.eager import core
 from tensorflow.python.eager import execute
 from tensorflow.python.platform import tf_logging as logging
 
-IGNORE = "ignore"
-PRINT = "print"
-RAISE = "raise"
-WARN = "warn"
 
-_DEFAULT_CALLBACK_ACTION = RAISE
-_VALID_CALLBACK_ACTIONS = (None, IGNORE, PRINT, RAISE, WARN)
+class ExecutionCallback(enum.Enum):
+  """Valid callback actions.
+
+  These can be passed to `seterr` or `errstate` to create callbacks when
+  specific events occur (e.g. an operation produces `NaN`s).
+
+  IGNORE: take no action.
+  PRINT:  print a warning to `stdout`.
+  RAISE:  raise an error (e.g. `InfOrNanError`).
+  WARN:   print a warning using `tf.logging.warn`.
+  """
+
+  IGNORE = "ignore"
+  PRINT = "print"
+  RAISE = "raise"
+  WARN = "warn"
+
+_DEFAULT_CALLBACK_ACTION = ExecutionCallback.RAISE
 
 
 # TODO(cais): Consider moving this exception class to errors_impl.py.
@@ -51,7 +64,7 @@ class InfOrNanError(Exception):
     """Constructor of InfOrNanError.
 
     Args:
-      op_type: Type name of the op that generated the tensor that generated the
+      op_type: Type name of the op that generated the tensor with
         `inf`(s) or `nan`(s) (e.g., `Div`).
       op_name: Name of the op that generated the tensor with `inf`(s) or
         `nan`(s). This name is set by client and can be `None` if it is unset.
@@ -139,11 +152,8 @@ def inf_nan_callback(op_type,
       the output tensor values.
     check_nan: (`bool`) Whether this callback should check for `nan` values in
       the output tensor values.
-    action: (`str`) Action to be taken by the callback when `inf` or `nan`
-      values are detected. Possible values {"raise", "warn", "print"}
-      `"raise"`: Raise a `InfOrNanError`.
-      `"warn"`: Log a warning using `tf.logging.warn`.
-      `"print"`: Print a message to `sys.stdout`.
+    action: (`ExecutionCallback`) Action to be taken by the callback when
+      `inf` or `nan` values are detected.
 
   Raises:
     InfOrNanError: iff `inf` or `nan` values are seen in any of `outputs` and
@@ -152,6 +162,7 @@ def inf_nan_callback(op_type,
   """
   del attrs, inputs  # Not used.
 
+  action = ExecutionCallback(action)
   ctx = context.context()
 
   for index, output in enumerate(outputs):
@@ -180,16 +191,16 @@ def inf_nan_callback(op_type,
           continue
 
         error = InfOrNanError(op_type, op_name, index, len(outputs), value)
-        if action == "print":
+        if action == ExecutionCallback.PRINT:
           print("Warning: %s" % str(error))
-        elif action == "warn":
+        elif action == ExecutionCallback.WARN:
           logging.warn(str(error))
-        elif action == "raise":
+        elif action == ExecutionCallback.RAISE:
           raise error
         else:
           raise ValueError(
               "Invalid action for inf_nan_callback: %s. Valid actions are: "
-              "{print | warn | raise}" % action)
+              "{PRINT | WARN | RAISE}" % action)
 
 
 def inf_callback(op_type,
@@ -282,7 +293,7 @@ def seterr(inf_or_nan=None):
 
   Example:
   ```python
-  tfe.seterr(inf_or_nan="raise")
+  tfe.seterr(inf_or_nan=ExecutionCallback.RAISE)
   a = tf.constant(10.0)
   b = tf.constant(0.0)
   try:
@@ -290,18 +301,14 @@ def seterr(inf_or_nan=None):
   except Exception as e:
     print("Caught Exception: %s" % e)
 
-  tfe.seterr(inf_or_nan="ignore")
+  tfe.seterr(inf_or_nan=ExecutionCallback.IGNORE)
   c = a / b  # <-- Does NOT raise exception anymore.
   ```
 
   Args:
-    inf_or_nan: Set action for infinity (`inf`) and NaN (`nan`) values.
-      Possible values: `{"ignore", "print", "raise", "warn"}`.
-      `"ignore"`: take no action when `inf` values appear.
-      `"print"`: print a warning to `stdout`.
-      `"raise"`: raise an `InfOrNanError`.
-      `"warn"`: print a warning using `tf.logging.warn`.
-      A value of `None` leads to no change in the action of the condition.
+    inf_or_nan: An `ExecutionCallback` determining the action for infinity
+      (`inf`) and NaN (`nan`) values. A value of `None` leads to no change in
+      the action of the condition.
 
   Returns:
     A dictionary of old actions.
@@ -309,12 +316,8 @@ def seterr(inf_or_nan=None):
   Raises:
     ValueError: If the value of any keyword arguments is invalid.
   """
-  if inf_or_nan not in _VALID_CALLBACK_ACTIONS:
-    raise ValueError(
-        "Invalid action value for inf_or_nan: %s. "
-        "Valid actions are %s." % (inf_or_nan, _VALID_CALLBACK_ACTIONS))
-
-  old_settings = {"inf_or_nan": "ignore"}
+  inf_or_nan = ExecutionCallback(inf_or_nan) if inf_or_nan is not None else None
+  old_settings = {"inf_or_nan": ExecutionCallback.IGNORE}
   default_context = context.context()
 
   carryover_callbacks = []
@@ -336,7 +339,7 @@ def seterr(inf_or_nan=None):
     default_context.clear_post_execution_callbacks()
     for callback in carryover_callbacks:
       default_context.add_post_execution_callback(callback)
-    if inf_or_nan != "ignore":
+    if inf_or_nan != ExecutionCallback.IGNORE:
       default_context.add_post_execution_callback(
           functools.partial(inf_nan_callback, action=inf_or_nan))
 
@@ -351,18 +354,14 @@ def errstate(inf_or_nan=None):
   ```
   c = tf.log(0.)  # -inf
 
-  with errstate(inf_or_nan="raise"):
+  with errstate(inf_or_nan=ExecutionCallback.RAISE):
     tf.log(0.)  # <-- Raises InfOrNanError.
   ```
 
   Args:
-    inf_or_nan: Set action for infinity (`inf`) and NaN (`nan`) values.
-      Possible values: `{IGNORE, PRINT, RAISE, WARN}`.
-      `IGNORE`: take no action when `inf` values appear.
-      `PRINT`: print a warning to `stdout`.
-      `RAISE`: raise an `InfOrNanError`.
-      `WARN`: print a warning using `tf.logging.warn`.
-      A value of `None` leads to no change in the action of the condition.
+    inf_or_nan: An `ExecutionCallback` determining the action for infinity
+      (`inf`) and NaN (`nan`) values. A value of `None` leads to no change in
+      the action of the condition.
 
   Yields:
     None.
diff --git a/tensorflow/python/eager/execution_callbacks_test.py b/tensorflow/python/eager/execution_callbacks_test.py
index 5594ab5f12abffb1e2b3bb4d1d0fa4251eedf809..b8b786ad2eeff5513ab0c6b2072d7b91975ee1f4 100644
--- a/tensorflow/python/eager/execution_callbacks_test.py
+++ b/tensorflow/python/eager/execution_callbacks_test.py
@@ -24,6 +24,9 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
+RAISE = execution_callbacks.ExecutionCallback.RAISE
+IGNORE = execution_callbacks.ExecutionCallback.IGNORE
+
 
 def log_zero():
   """Computes `log(0.0)`."""
@@ -33,17 +36,17 @@ def log_zero():
 class ExecutionCallbacksTest(test.TestCase):
 
   def test_errstate_inf_raise(self):
-    with execution_callbacks.errstate(inf_or_nan=execution_callbacks.RAISE):
+    with execution_callbacks.errstate(inf_or_nan=RAISE):
       with self.assertRaises(execution_callbacks.InfOrNanError):
         log_zero()
 
   def test_errstate_inf_ignore(self):
-    with execution_callbacks.errstate(inf_or_nan=execution_callbacks.IGNORE):
+    with execution_callbacks.errstate(inf_or_nan=IGNORE):
       self.assertEqual(-float("inf"), log_zero().numpy())
 
   def test_errstate_nesting(self):
-    with execution_callbacks.errstate(inf_or_nan=execution_callbacks.RAISE):
-      with execution_callbacks.errstate(inf_or_nan=execution_callbacks.IGNORE):
+    with execution_callbacks.errstate(inf_or_nan=RAISE):
+      with execution_callbacks.errstate(inf_or_nan=IGNORE):
         self.assertEqual(-float("inf"), log_zero().numpy())
 
       with self.assertRaises(execution_callbacks.InfOrNanError):
diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index 520c85a2c2093436d8d99b4713f0ad5fcc92321d..487fd5a9728daa8b6ce6f859ec2dfb9ac0e4ca4b 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -21,8 +21,6 @@ from __future__ import print_function
 
 import collections
 import functools
-import re
-import sys
 import threading
 import types as types_lib
 import weakref
@@ -41,36 +39,134 @@ from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.framework import dtypes as dtypes_module
+from tensorflow.python.framework import error_interpolation
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import func_graph as func_graph_module
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_spec
 from tensorflow.python.ops import custom_gradient
 from tensorflow.python.ops import functional_ops
-from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import gradients_util
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
+from tensorflow.python.util import function_utils
+from tensorflow.python.util import memory
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
 
-# This is to avoid a circular dependency with gradients_impl
-gradients_impl._function = sys.modules[__name__]  # pylint: disable=protected-access
 
 FORWARD_FUNCTION_ATTRIBUTE_NAME = "forward_function_name"
 BACKWARD_FUNCTION_ATTRIBUTE_NAME = "backward_function_name"
 
-# TODO(scottzhu): Update this to allow arbitrary attribute names in future.
-WHITELIST_FUNCTION_ATTRIBUTE_REGEX = [
-    "experimental_.*",
-    FORWARD_FUNCTION_ATTRIBUTE_NAME,
-    BACKWARD_FUNCTION_ATTRIBUTE_NAME
-]
+class CacheKey(
+    collections.namedtuple("CacheKey", [
+        "input_signature", "parent_graph", "device_functions",
+        "colocation_stack", "uses_xla"])):
 
-CacheKey = collections.namedtuple("CacheKey", [
-    "input_signature", "parent_graph", "device_functions", "colocation_stack",
-    "uses_xla"
-])
+  def replace(self, *args, **kwargs):
+    return self._replace(*args, **kwargs)
+
+
+def _flat_shape_list(*params):
+  """Return a flat list of TensorShapes, one for each tensor[spec] in `*params`.
+
+  Args:
+    *params: Set of nested entries containing Tensors, TensorSpec, and
+      non-tensors.
+
+  Returns:
+    A list of entries containing either `None` or `TensorShape`.
+  """
+  return [tensor_shape.TensorShape(x.shape)
+          if isinstance(x, (ops.Tensor, tensor_spec.TensorSpec)) else None
+          for x in nest.flatten(params)]
+
+
+def _compatible_shapes(flat_x, flat_y):
+  """Check if lists of TensorShapes contain compatible shapes.
+
+  Args:
+    flat_x: List of TensorShape or None.
+    flat_y: List of TensorShape or None.
+
+  Returns:
+    A python bool.
+
+  Raises:
+    RuntimeError: if `len(flat_x) != len(flat_y)`.
+    RuntimeError: if `flat_x[i] is None != flat_y[i] is None` for any `i`.
+  """
+  if len(flat_x) != len(flat_y):
+    raise RuntimeError("Expected shape lists of identical lengths, but saw: "
+                       "%s and %s" % (flat_x, flat_y))
+  def is_compatible(x, y):
+    """Internal help function.
+
+    Args:
+      x: TensorShape or None.
+      y: TensorShape or None.
+
+    Returns:
+      Python bool.
+
+    Raises:
+      RuntimeError: If `x is None != y is None`.
+    """
+    # If both x and y are None, there is no shape to compare.  Otherwise check
+    # if they are compatible with each other.  Either way, both input signatures
+    # must have have Tensors in the same entries.  If not, raise an assertion
+    # error.
+    if x is None != y is None:
+      raise RuntimeError(
+          "Expected signature type matches between flattened input shapes "
+          "%s and %s; but saw that (%s is None) != (%s is None)"
+          % (flat_x, flat_y, x, y))
+    return x is None or x.is_compatible_with(y)
+  return all(is_compatible(x, y) for x, y in zip(flat_x, flat_y))
+
+
+def _common_shape(x, y):
+  """Find a `TensorShape` that is compatible with both `x` and `y`."""
+  if x is None != y is None:
+    raise RuntimeError(
+        "Cannot find a common shape when LHS shape is None but RHS shape "
+        "is not (or vice versa): %s vs. %s" % (x, y))
+  if x is None:
+    return None  # The associated input was not a Tensor, no shape generated.
+  if not isinstance(x, tensor_shape.TensorShape):
+    raise TypeError("Expected x to be a TensorShape but saw %s" % (x,))
+  if not isinstance(y, tensor_shape.TensorShape):
+    raise TypeError("Expected y to be a TensorShape but saw %s" % (y,))
+  if x.rank != y.rank or x.rank is None:
+    return tensor_shape.TensorShape(None)
+  dims = []
+  for dim_x, dim_y in zip(x.dims, y.dims):
+    if dim_x != dim_y or tensor_shape.dimension_value(dim_x) is None:
+      dims.append(None)
+    else:
+      dims.append(tensor_shape.dimension_value(dim_x))
+  return tensor_shape.TensorShape(dims)
+
+
+def is_same_structure(structure1,
+                      structure2,
+                      check_values=False):
+  """Check two structures for equality, optionally of types and of values."""
+  try:
+    nest.assert_same_structure(structure1, structure2)
+  except (ValueError, TypeError):
+    return False
+  if check_values:
+    flattened1 = nest.flatten(structure1)
+    flattened2 = nest.flatten(structure2)
+    # First check the types to avoid AttributeErrors.
+    if any(type(f1) != type(f2) for f1, f2 in zip(flattened1, flattened2)):
+      return False
+    return flattened1 == flattened2
+  return True
 
 
 def _parse_func_attrs(attributes):
@@ -89,12 +185,6 @@ def _parse_func_attrs(attributes):
   """
   attrs = {}
   for key, value in attributes.items():
-    if not any(re.match(reg, key)
-               for reg in WHITELIST_FUNCTION_ATTRIBUTE_REGEX):
-      raise ValueError("Attribute name is not whitelisted. "
-                       "Whitelisted: prefix %s, got: %s" %
-                       (WHITELIST_FUNCTION_ATTRIBUTE_REGEX, key))
-
     if isinstance(value, attr_value_pb2.AttrValue):
       attrs[key] = value
     # bool type check has to happen before int since bool is a subclass of int.
@@ -104,7 +194,7 @@ def _parse_func_attrs(attributes):
       attrs[key] = attr_value_pb2.AttrValue(i=value)
     elif isinstance(value, float):
       attrs[key] = attr_value_pb2.AttrValue(f=value)
-    elif isinstance(value, (str, bytes)):
+    elif isinstance(value, (str, bytes, six.text_type)):
       attrs[key] = attr_value_pb2.AttrValue(s=compat.as_bytes(value))
     else:
       raise ValueError("Unsupported attribute type for %s with type %s" %
@@ -112,6 +202,46 @@ def _parse_func_attrs(attributes):
   return attrs
 
 
+class _InterpolateFunctionError(object):
+  """Context Manager that interpolates the exception from 'top_level_func'."""
+
+  def __init__(self, top_level_func):
+    self._func = top_level_func
+
+  def __enter__(self):
+    pass
+
+  def __exit__(self, typ, exc, tb):
+    if not exc or not isinstance(exc, errors.OpError):
+      return False
+    message = compat.as_text(exc.message)
+    _, tags = error_interpolation.parse_message(message)
+    g = None
+    func_stack = []
+    # pylint: disable=protected-access
+    for t in tags:
+      if t.type == "function_node":
+        if t.name == compat.as_str(self._func.name):
+          g = self._func._graph
+        elif g:
+          next_func = g._get_function(t.name)
+          if next_func is not None and isinstance(next_func,
+                                                  _EagerDefinedFunction):
+            g = next_func._graph
+        if g:
+          func_stack.append(g.name)
+        else:
+          func_stack.append("<unknown>")
+    # pylint: enable=protected-access
+    if g:
+      message = error_interpolation.interpolate(message, g)
+      message += "\n\nFunction call stack:\n"
+      message += " -> ".join(func_stack)
+      message += "\n"
+      exc._message = message  # pylint: disable=protected-access
+    return False
+
+
 def _forward_name(n):
   """The name of a generated forward defun named n."""
   return "__forward_%s_%s" % (n, ops.uid())
@@ -131,7 +261,7 @@ def _inference_name(n):
 # so it doesn't have the definition-generating logic and is just a container for
 # an already-defined function.
 class _EagerDefinedFunction(object):
-  """Callable with the interface of `framework.function._DefinedFunction.`
+  """Callable with the interface of `framework.function._DefinedFunction`.
 
   `_EagerDefinedFunction` encapsulates a function definition and its properties,
   and it provides a method for calling the encapsulated function. Some Ops
@@ -149,10 +279,9 @@ class _EagerDefinedFunction(object):
       outputs: the tensors in the graph which will be outputs to the function
       attrs: dict mapping names of attributes to their AttrValue values
     """
-    operations = [
-        op for op in graph.get_operations()
-        if op not in set(arg.op for arg in inputs)
-    ]
+    input_ops = set(arg.op for arg in inputs)
+    operations = [op for op in graph.get_operations() if op not in input_ops]
+
     fn = pywrap_tensorflow.TF_GraphToFunction_wrapper(
         graph._c_graph,  # pylint: disable=protected-access
         compat.as_str(name),
@@ -161,6 +290,8 @@ class _EagerDefinedFunction(object):
         [t._as_tf_output() for t in inputs],  # pylint: disable=protected-access
         [t._as_tf_output() for t in outputs],  # pylint: disable=protected-access
         [],
+        [o._c_op for o in graph.control_outputs],  # pylint: disable=protected-access
+        [],  # control_output_names
         None,
         compat.as_str(""))
 
@@ -195,13 +326,16 @@ class _EagerDefinedFunction(object):
     self._graph = graph
     self._stateful_ops = tuple(op for op in operations if op.op_def.is_stateful)
 
-  def add_to_graph(self, g):
+  def add_to_graph(self, g=None):
     # pylint: disable=protected-access
-    if self.name not in g._functions:
-      g._add_function(self)
-    for f in self._graph._functions.values():
-      if f.name not in g._functions:
-        g._add_function(f)
+    if not g and context.executing_eagerly():
+      context.context().add_function_def(self.definition)
+    else:
+      if self.name not in g._functions:
+        g._add_function(self)
+      for f in self._graph._functions.values():
+        if f.name not in g._functions:
+          g._add_function(f)
     # pylint: enable=protected-access
 
   @property
@@ -211,8 +345,8 @@ class _EagerDefinedFunction(object):
   def call(self, ctx, args):
     """Calls this function with `args` as inputs.
 
-    Function execution respects device annotations only if the function won't
-    be compiled with xla.
+    `ConcreteFunction` execution respects device annotations only if the
+    function won't be compiled with xla.
 
     Args:
       ctx: a Context object
@@ -224,50 +358,60 @@ class _EagerDefinedFunction(object):
     Raises:
       ValueError: if the number of arguments is incorrect.
     """
+    if len(args) != len(self.signature.input_arg):
+      raise ValueError(
+          "Arguments and signature arguments do not match: %s %s " %
+          (len(args), len(list(self.signature.input_arg))))
+
+    function_call_options = ctx.get_function_call_options()
+    if function_call_options.config_proto_serialized is None:
+      config = function_utils.get_disabled_rewriter_config()
+    else:
+      config = function_call_options.config_proto_serialized
+    executor_type = function_call_options.executor_type or ""
 
     executing_eagerly = ctx.executing_eagerly()
-
-    if self._graph._xla_compile:  # pylint: disable=protected-access
-      # XLA compilation relies upon a custom kernel creator to run functions.
-      signature = self.signature
-      if executing_eagerly:
+    if executing_eagerly:
+      with _InterpolateFunctionError(self):
         outputs = execute.execute(
-            str(signature.name),
+            str(self.signature.name),
             num_outputs=self._num_outputs,
             inputs=args,
-            attrs=None,
+            attrs=("executor_type", executor_type,
+                   "config_proto", config),
             ctx=ctx)
+      # Replace empty list with None
+      outputs = outputs or None
+    elif self._graph._xla_compile:  # pylint: disable=protected-access
+      g = ops.get_default_graph()
+      self.add_to_graph(g)
+      signature = self.signature
+      op = g.create_op(
+          signature.name,
+          [ops.internal_convert_to_tensor(x, ctx=ctx) for x in args],
+          tuple(dtypes_module.DType(x.type) for x in signature.output_arg),
+          op_def=signature,
+          name="FunctionCall",
+          compute_shapes=False)
+      outputs = op.outputs
+      if not outputs:
+        return op
+      if isinstance(outputs, (ops.Tensor, type(None))):
+        outputs = [outputs]
       else:
-        g = ops.get_default_graph()
-        self.add_to_graph(g)
-        op = g.create_op(
-            signature.name,
-            [ops.internal_convert_to_tensor(x, ctx=ctx) for x in args],
-            tuple(dtypes_module.DType(x.type) for x in signature.output_arg),
-            op_def=signature,
-            name="FunctionCall",
-            compute_shapes=False)
-        outputs = op.outputs
-        if not outputs:
-          return op
-        outputs = [outputs] if isinstance(
-            outputs, (ops.Tensor, type(None))) else list(outputs)
+        outputs = list(outputs)
     else:
       # TODO(akshayka): Either remove this if the FunctionLibraryRuntime
       # creates `PartitionedCallOp` kernels by default, or remove the previous
       # branch if a TPU kernel is registered for `PartitionedCall`.
-      if len(args) != len(self.signature.input_arg):
-        raise ValueError(
-            "Arguments and signature arguments do not match: %s %s " %
-            (len(args), len(list(self.signature.input_arg))))
-      function_call_options = ctx.get_function_call_options()
-      outputs = functional_ops.partitioned_call(
-          args=args,
-          f=self,
-          tout=self._output_types,
-          executing_eagerly=executing_eagerly,
-          config=function_call_options.config_proto_serialized,
-          executor_type=function_call_options.executor_type)
+      with _InterpolateFunctionError(self):
+        outputs = functional_ops.partitioned_call(
+            args=args,
+            f=self,
+            tout=self._output_types,
+            executing_eagerly=executing_eagerly,
+            config=config,
+            executor_type=executor_type)
 
     if executing_eagerly:
       return outputs
@@ -279,15 +423,15 @@ class _EagerDefinedFunction(object):
       return outputs
 
 
-class Function(object):
+class ConcreteFunction(object):
   """Callable object encapsulating a function definition and its gradient.
 
-  `Function` is a callable that encapsulates a function definition and
+  `ConcreteFunction` is a callable that encapsulates a function definition and
   is differentiable under `tf.GradientTape` objects.
   """
 
   def __init__(self, func_graph, attrs=None, signature=None):
-    """Initialize a Function.
+    """Initialize a `ConcreteFunction`.
 
     Args:
       func_graph: An instance of FuncGraph: the function body to wrap.
@@ -296,6 +440,7 @@ class Function(object):
         definition.
      signature: a nested sequence of `TensorSpec` objects specifying the input
        signature of this function.
+
     Raises:
       ValueError: If number of input_placeholders is not equal to the number
         of function inputs.
@@ -323,8 +468,8 @@ class Function(object):
       *args: Tensors or Variables. Positional arguments are only accepted when
         they correspond one-to-one with arguments of the traced Python function.
       **kwargs: Tensors or Variables specified by name. When
-        `get_concrete_function` was called to create this `Function`, each
-        Tensor input was given a name, defaulting to the name of the Python
+        `get_concrete_function` was called to create this `ConcreteFunction`,
+        each Tensor input was given a name, defaulting to the name of the Python
         function's argument but possibly overridden by the `name=` argument to
         `tf.TensorSpec`. These names become the argument names for the concrete
         function.
@@ -333,14 +478,14 @@ class Function(object):
       The result of applying the TF function on the given Tensors.
 
     Raises:
-      AssertionError: If this `Function` was not created through
+      AssertionError: If this `ConcreteFunction` was not created through
         `get_concrete_function`.
       ValueError: If arguments contains anything other than Tensors or
         Variables.
       TypeError: For invalid positional/keyword argument combinations.
     """
     if self._arg_keywords is None or self._num_positional_args is None:
-      if self._signature:
+      if self._signature is not None:
         if kwargs:
           raise NotImplementedError(
               "Keyword arguments not supported when calling a "
@@ -351,21 +496,30 @@ class Function(object):
           "through the public interface. Use get_concrete_function instead.")
     if len(args) > self._num_positional_args:
       raise TypeError(
-          ("Expected at most {} positional arguments ({}), got {}. When "
-           "calling a concrete function, positional arguments may not be bound "
-           "to Tensors within nested structures.").format(
-               self._num_positional_args,
-               self._arg_keywords[:self._num_positional_args],
-               args))
+          ("Expected at most {} positional arguments (and the rest keywords, "
+           "of {}), got {}. When calling a concrete function, positional "
+           "arguments may not be bound to Tensors within nested structures."
+          ).format(self._num_positional_args, self._arg_keywords, args))
     args = list(args)
     for keyword in self._arg_keywords[len(args):]:
-      args.append(kwargs.pop(compat.as_str(keyword)))
+      try:
+        args.append(kwargs.pop(compat.as_str(keyword)))
+      except KeyError:
+        specified_keywords = (list(self._arg_keywords[:len(args)])
+                              + list(kwargs.keys()))
+        raise TypeError(
+            "Expected argument names {} but got values for {}. Missing: {}."
+            .format(
+                list(self._arg_keywords),
+                specified_keywords,
+                list(set(self._arg_keywords) - set(specified_keywords))))
     if kwargs:
       positional_arg_keywords = set(self._arg_keywords[:len(args)])
       for unused_key in kwargs:
         if unused_key in positional_arg_keywords:
           raise TypeError("Got two values for keyword '{}'.".format(unused_key))
-      raise TypeError("Keyword arguments {} unknown.".format(kwargs.keys()))
+      raise TypeError("Keyword arguments {} unknown. Expected {}.".format(
+          list(kwargs.keys()), list(self._arg_keywords)))
     return self._call_flat(args)
 
   def _filtered_call(self, args, kwargs):
@@ -383,8 +537,8 @@ class Function(object):
     """
     return self._call_flat(
         (t for t in nest.flatten((args, kwargs))
-         if isinstance(
-             t, (ops.Tensor, resource_variable_ops.ResourceVariable))))
+         if isinstance(t, (ops.Tensor,
+                           resource_variable_ops.ResourceVariable))))
 
   def _call_flat(self, args):
     """Executes the wrapped function.
@@ -400,16 +554,20 @@ class Function(object):
     """
     ctx = context.context()
 
-    for v in self._func_graph.variables:
-      if v.trainable:
-        tape.variable_accessed(v)
+    tape.variables_accessed(self._func_graph.variables)
 
     tensor_inputs = []
+    variables_used = set([])
     for i, arg in enumerate(args):
       if isinstance(arg, resource_variable_ops.ResourceVariable):
+        # We can pass a variable more than once, and in this case we need to
+        # pass its handle only once.
+        if arg.handle in variables_used:
+          continue
         if arg.trainable:
           tape.variable_accessed(arg)
         tensor_inputs.append(arg.handle)
+        variables_used.add(arg.handle)
       elif isinstance(arg, ops.Tensor):
         tensor_inputs.append(arg)
       elif (self._signature is not None and
@@ -417,7 +575,7 @@ class Function(object):
         tensor_inputs.append(
             ops.convert_to_tensor(arg, self._signature[i].dtype))
       else:
-        raise ValueError("All inputs to `Function`s must be Tensors; "
+        raise ValueError("All inputs to `ConcreteFunction`s must be Tensors; "
                          "on invocation of %s, the %d-th input (%s) was not a "
                          "Tensor." % (self._func_graph.name, i, str(arg)))
     args = tensor_inputs + self._captured_inputs
@@ -433,26 +591,25 @@ class Function(object):
     if context.executing_eagerly() or not self.outputs:
       outputs = self._inference_function.call(ctx, args)
     else:
-      if not self._gradient_name:
-        self._gradient_name = "PartitionedCall-%s" % ops.uid()
-        self._register_gradient(self._gradient_name)
+      self._register_gradient()
       with ops.get_default_graph().gradient_override_map(
           {"PartitionedCall": self._gradient_name,
            "StatefulPartitionedCall": self._gradient_name}):
         outputs = self._inference_function.call(ctx, args)
     return self._build_call_outputs(outputs)
 
-  def _register_gradient(self, name):
-    """Registers the gradient for the current Function under the given name.
+  def _register_gradient(self):
+    """Registers the gradient for this `ConcreteFunction`.
 
     The gradient rewrites an inference call op to a forward call op, but does
     not modify a pre-existing forward call op. It then computes the gradient
     from the output's gradients and the side outputs of the forward op.
-
-    Args:
-      name: The name to register the gradient as.
     """
-    @ops.RegisterGradient(name)
+    if self._gradient_name:
+      return
+    self._gradient_name = "PartitionedCall-%s" % ops.uid()
+
+    @ops.RegisterGradient(self._gradient_name)
     def _registered_grad_fn(op, *doutputs):  # pylint: disable=unused-variable
       return self._grad_fn(op, *doutputs)
 
@@ -484,7 +641,7 @@ class Function(object):
 
   @property
   def name(self):
-    """Function name."""
+    """`ConcreteFunction` name."""
     return self._inference_function.name
 
   @property
@@ -497,11 +654,21 @@ class Function(object):
     """Returns tensors in `self.graph` corresponding to arguments."""
     return self._func_graph.inputs
 
+  @property
+  def structured_input_signature(self):
+    """Returns structured signature of the original function."""
+    return self._func_graph.structured_input_signature
+
   @property
   def outputs(self):
-    """Returns tensors in `self.graph` corresponding to return values."""
+    """Returns tensors in `self.graph` corresponding to returned tensors."""
     return self._func_graph.outputs
 
+  @property
+  def structured_outputs(self):
+    """Returns outputs in `self.graph` as returned by the original function."""
+    return self._func_graph.structured_outputs
+
   @property
   def captured_inputs(self):
     """Returns external Tensors captured by this function.
@@ -555,27 +722,26 @@ class Function(object):
     # method's functionality better. Remove register_gradient_functions argument
     # and figure out if these needs to be registered.
 
-    if not context.executing_eagerly() or g:
-      if not g:
-        g = ops.get_default_graph()
-      self._inference_function.add_to_graph(g)  # pylint: disable=protected-access
-
-      # pylint: disable=protected-access
-      if register_gradient_functions:
-        # There are two situations for the actual call of a defun:
-        # 1. If none of the input args are resource variables or watch by any
-        #   tape, and it will run the _inference_function of concrete_func for
-        #   forward pass, the gradient will be generated by standard mechanism.
-        # 2. Otherwise, defun will create two functions, one for forward pass,
-        #   and the backward pass will be created via tape.
-        #   When registering the function, we register both cases.
-        if self._backward_graph_function is None:
-          self._construct_backprop_function()
-        forward_function = self._forward_function
-        backward_function = self._backward_graph_function._inference_function
-        # pylint: enable=protected-access
-        forward_function.add_to_graph(g)
-        backward_function.add_to_graph(g)
+    if not context.executing_eagerly() and not g:
+      g = ops.get_default_graph()
+    self._inference_function.add_to_graph(g)  # pylint: disable=protected-access
+
+    # pylint: disable=protected-access
+    if register_gradient_functions:
+      # There are two situations for the actual call of a defun:
+      # 1. If none of the input args are resource variables or watch by any
+      #   tape, and it will run the _inference_function of concrete_func for
+      #   forward pass, the gradient will be generated by standard mechanism.
+      # 2. Otherwise, defun will create two functions, one for forward pass,
+      #   and the backward pass will be created via tape.
+      #   When registering the function, we register both cases.
+      if self._backward_graph_function is None:
+        self._construct_backprop_function()
+      forward_function = self._forward_function
+      backward_function = self._backward_graph_function._inference_function
+      # pylint: enable=protected-access
+      forward_function.add_to_graph(g)
+      backward_function.add_to_graph(g)
 
   def _construct_backprop_function(self):
     """Constructs the backprop function object for this function."""
@@ -583,12 +749,12 @@ class Function(object):
         _backward_name(self._func_graph.name))
     forward_function_name = _forward_name(self._func_graph.name)
     outputs = [x for x in self._func_graph.outputs
-               if gradients_impl.IsTrainable(x)]
+               if gradients_util.IsTrainable(x)]
     with backwards_graph.as_default():
       gradients_wrt_outputs = [
           graph_placeholder(x.dtype, x.shape) for x in outputs
       ]
-      gradients_wrt_inputs = gradients_impl._GradientsHelper(  # pylint: disable=protected-access
+      gradients_wrt_inputs = gradients_util._GradientsHelper(  # pylint: disable=protected-access
           outputs,
           self._func_graph.inputs,
           grad_ys=gradients_wrt_outputs,
@@ -608,10 +774,11 @@ class Function(object):
     # Clear captures, since we pass them in as inputs.
     backwards_graph.captures = {}
     backwards_graph.outputs.extend(
-        grad for grad in func_graph_module.flatten(gradients_wrt_inputs)
+        grad
+        for grad in nest.flatten(gradients_wrt_inputs, expand_composites=True)
         if grad is not None)
     backwards_graph.structured_outputs = gradients_wrt_inputs
-    self._backward_graph_function = Function(
+    self._backward_graph_function = ConcreteFunction(
         backwards_graph, attrs=backward_function_attr)
 
     forward_function_attr = _parse_func_attrs({
@@ -642,9 +809,7 @@ class Function(object):
 
     ctx = context.context()
 
-    if not self._gradient_name:
-      self._gradient_name = "PartitionedCall-%s" % ops.uid()
-      self._register_gradient(self._gradient_name)
+    self._register_gradient()
     with ops.get_default_graph().gradient_override_map(
         {"PartitionedCall": self._gradient_name,
          "StatefulPartitionedCall": self._gradient_name}):
@@ -658,7 +823,7 @@ class Function(object):
     # the forward graph function so that we can compute its gradient.
     real_outputs = outputs[:self._num_outputs]
     skip_positions = [i for i, t in enumerate(real_outputs)
-                      if not gradients_impl.IsTrainable(t)]
+                      if not gradients_util.IsTrainable(t)]
     side_outputs = outputs[self._num_outputs:]
 
     def backward_function(*args):
@@ -691,9 +856,7 @@ class Function(object):
     """
     ctx = context.context()
 
-    if not self._gradient_name:
-      self._gradient_name = "PartitionedCall-%s" % ops.uid()
-      self._register_gradient(self._gradient_name)
+    self._register_gradient()
     with ops.get_default_graph().gradient_override_map(
         {"PartitionedCall": self._gradient_name,
          "StatefulPartitionedCall": self._gradient_name}):
@@ -756,16 +919,281 @@ def _deterministic_dict_values(dictionary):
   return tuple(dictionary[key] for key in sorted(dictionary))
 
 
-class PolymorphicFunction(object):
+class FunctionSpec(object):
+  """Specification of how to bind arguments to a function."""
+
+  @staticmethod
+  def from_function_and_signature(python_function, input_signature):
+    """Create a FunctionSpec instance given a python function and signature."""
+    if isinstance(python_function, functools.partial):
+      python_function_to_inspect = python_function.func
+      args_to_prepend = python_function.args or tuple()
+      kwargs_to_include = python_function.keywords or {}
+      if input_signature is not None:
+        # TODO(b/124441704): Add support for input_signature + partial.
+        raise NotImplementedError(
+            "Missing support for input_signature when using partial functions.")
+    else:
+      python_function_to_inspect = python_function
+      args_to_prepend = tuple()
+      kwargs_to_include = {}
+
+    fullargspec = tf_inspect.getfullargspec(python_function_to_inspect)
+    is_method = tf_inspect.ismethod(python_function_to_inspect)
+
+    return FunctionSpec(fullargspec, is_method, args_to_prepend,
+                        kwargs_to_include, input_signature)
+
+  def __init__(self, fullargspec, is_method, args_to_prepend, kwargs_to_include,
+               input_signature):
+    self._fullargspec = fullargspec
+    self._is_method = is_method
+    self._args_to_prepend = args_to_prepend
+    self._kwargs_to_include = kwargs_to_include
+    self._default_values = fullargspec.defaults
+
+    if self._is_method:
+      # Remove `self`: default arguments shouldn't be matched to it.
+      args = fullargspec.args[1:]
+    else:
+      args = fullargspec.args
+
+    # A cache mapping from argument name to index, for canonicalizing
+    # arguments that are called in a keyword-like fashion.
+    self._args_to_indices = {arg: i for i, arg in enumerate(args)}
+    self.arg_names = args
+    self.vararg_name = fullargspec.varargs
+
+    # A cache mapping from arg index to default value, for canonicalization.
+    offset = len(args) - len(fullargspec.defaults or [])
+    self._arg_indices_to_default_values = {
+        offset + index: default
+        for index, default in enumerate(fullargspec.defaults or [])
+    }
+    self._default_values_start_index = offset
+    if input_signature is None:
+      self._input_signature = None
+    else:
+      if fullargspec.varkw is not None or fullargspec.kwonlyargs:
+        raise ValueError("Cannot define a TensorFlow function from a Python "
+                         "function with keyword arguments when "
+                         "input_signature is provided.")
+
+      if not isinstance(input_signature, (tuple, list)):
+        raise TypeError("input_signature must be either a tuple or a "
+                        "list, received " + str(type(input_signature)))
+
+      self._input_signature = tuple(input_signature)
+      self._flat_input_signature = tuple(nest.flatten(input_signature))
+
+  @property
+  def fullargspec(self):
+    return self._fullargspec
+
+  @property
+  def is_method(self):
+    return self._is_method
+
+  @property
+  def args_to_prepend(self):
+    return self._args_to_prepend
+
+  @property
+  def kwargs_to_include(self):
+    return self._kwargs_to_include
+
+  @property
+  def input_signature(self):
+    return self._input_signature
+
+  @property
+  def flat_input_signature(self):
+    return self._flat_input_signature
+
+  def canonicalize_function_inputs(self, *args, **kwargs):
+    """Canonicalizes `args` and `kwargs`.
+
+    Canonicalize the inputs to the Python function using a `FunctionSpec`
+    instance. In particular, we parse the varags and kwargs that the
+    original function was called with into a tuple corresponding to the
+    Python function's positional (named) arguments and a dictionary
+    corresponding to its kwargs.
+
+    Args:
+      *args: The varargs this object was called with.
+      **kwargs: The keyword args this function was called with.
+
+    Returns:
+      A canonicalized ordering of the inputs representened by a tuple in the
+      form (args, kwargs). Here: `args` is a full list of bound arguments, and
+      `kwargs` contains only true keyword arguments, as opposed to named
+      arguments called in a keyword-like fashion.
+
+    Raises:
+      ValueError: If a keyword in `kwargs` cannot be matched with a positional
+        argument when an input signature is specified, or when the inputs
+        do not conform to the input signature.
+    """
+    if self._input_signature is not None:
+      if len(args) > len(self._input_signature):
+        raise TypeError(
+            "When input_signature is provided, only pass arguments "
+            "covered by it. Received %d argument(s)." % len(args))
+      for arg in six.iterkeys(kwargs):
+        index = self._args_to_indices.get(arg, None)
+        if index is None:
+          raise TypeError(
+              "Function got an unexpected keyword argument %s" % arg)
+        if index >= len(self._input_signature):
+          raise TypeError(
+              "When input_signature is provided, only pass arguments "
+              "covered by it. Received argument %s." % arg)
+
+    args = self._args_to_prepend + args
+    kwargs = dict(kwargs, **self._kwargs_to_include)
+    if not kwargs:
+      if self._default_values:
+        inputs = args + self._default_values[
+            len(args) - self._default_values_start_index:]
+      else:
+        inputs = args
+    else:
+      # Maps from index of arg to its corresponding value, according to `args`
+      # and `kwargs`; seeded with the default values for the named args that
+      # aren't in `args`.
+      arg_indices_to_values = {
+          index: default for index, default in six.iteritems(
+              self._arg_indices_to_default_values) if index >= len(args)
+      }
+      consumed_args = []
+      for arg, value in six.iteritems(kwargs):
+        index = self._args_to_indices.get(arg, None)
+        if index is not None:
+          arg_indices_to_values[index] = value
+          consumed_args.append(arg)
+        elif self._input_signature is not None:
+          raise ValueError("Cannot define a TensorFlow function from a Python "
+                           "function with keyword arguments when "
+                           "input_signature is provided.")
+      for arg in consumed_args:
+        # After this loop, `kwargs` will only contain true keyword arguments, as
+        # opposed to named arguments called in a keyword-like fashion.
+        kwargs.pop(arg)
+      inputs = args + _deterministic_dict_values(arg_indices_to_values)
+
+    if self._input_signature is None:
+      inputs = _convert_numpy_inputs(inputs)
+      return inputs, kwargs
+    else:
+      assert not kwargs
+      inputs = _convert_inputs_to_signature(
+          inputs,
+          self._input_signature,
+          self._flat_input_signature)
+      return inputs, {}
+
+
+def _convert_numpy_inputs(inputs):
+  """Convert numpy array inputs to tensors."""
+  flat_inputs = nest.flatten(inputs)
+
+  # Check for NumPy arrays in arguments and convert them to Tensors.
+  # TODO(nareshmodi): Skip ndarray conversion to tensor altogether, perhaps
+  # finding a way to store them directly in the cache key (currently not
+  # possible since ndarrays are not hashable).
+  need_packing = False
+  for index, value in enumerate(flat_inputs):
+    if type(value) == np.ndarray:
+      flat_inputs[index] = constant_op.constant(value)
+      need_packing = True
+  if need_packing:
+    return nest.pack_sequence_as(
+        structure=inputs, flat_sequence=flat_inputs)
+  else:
+    return inputs
+
+
+def _convert_inputs_to_signature(inputs, input_signature, flat_input_signature):
+  """Convert inputs to pass into a function with an explicit signature."""
+  try:
+    # TODO(b/124370185): Use all elements as inputs to throw an error if there
+    # are ignored arguments. Calling with arguments that are not part of the
+    # signature should throw an error.
+    flatten_inputs = nest.flatten_up_to(
+        input_signature,
+        inputs[:len(input_signature)])
+  except ValueError:
+    raise ValueError("Structure of Python function inputs does not match "
+                     "input_signature. Inputs (%s), input_signature(%s)." %
+                     (str(inputs), str(input_signature)))
+
+  need_packing = False
+  for index, (value, spec) in enumerate(zip(flatten_inputs,
+                                            flat_input_signature)):
+    if not pywrap_tensorflow.IsTensor(value):
+      try:
+        flatten_inputs[index] = ops.convert_to_tensor(
+            value, dtype_hint=spec.dtype)
+        need_packing = True
+      except ValueError:
+        raise ValueError("When input_signature is provided, all inputs to "
+                         "the Python function must be convertible to tensors."
+                         "Inputs (%s), input_signature(%s)." %
+                         (str(inputs), str(input_signature)))
+
+  if any(not spec.is_compatible_with(other) for spec, other in zip(
+      flat_input_signature,
+      flatten_inputs)):
+    raise ValueError("Python inputs incompatible with input_signature: "
+                     "inputs (%s), input_signature (%s)" %
+                     (str(inputs), str(input_signature)))
+
+  if need_packing:
+    inputs = nest.pack_sequence_as(
+        structure=input_signature,
+        flat_sequence=flatten_inputs)
+
+  return inputs
+
+
+class FunctionCache(object):
+  """A lightweight container for cached functions.
+  """
+
+  def __init__(self):
+    # The set of functions that have been missed; entries are CacheKey with
+    # input_signature `None` (e.g. a "call context key")
+    self.missed = set()
+    # The primary cache, mapping a fully shaped CacheKey to a function.
+    self.primary = collections.OrderedDict()
+    # A cache key lookup, mapping a CacheKey generated without shape info to a
+    # flat list of relaxed shapes (one for each argument).  Arguments that are
+    # not Tensors contain a `None` for the corresponding relaxed shape.
+    self.arg_relaxed_shapes = collections.OrderedDict()
+    # The secondary cache, mapping a CacheKey generated without shape info to a
+    # function.
+    self.arg_relaxed = collections.OrderedDict()
+    # All OrderedDicts require manual garbage collection.
+    self._garbage_collectors = [
+        _FunctionGarbageCollector(self.primary),
+        _FunctionGarbageCollector(self.arg_relaxed),
+        _FunctionGarbageCollector(self.arg_relaxed_shapes)]
+
+  def all_values(self):
+    """A set of all `ConcreteFunction` instances held by this cache."""
+    return set(self.primary.values()) | set(self.arg_relaxed.values())
+
+
+class Function(object):
   """Wrapper class for the graph functions defined for a Python function.
 
   See the documentation for `defun` for more information on the semantics of
   defined functions.
 
-  PolymorphicFunction class is thread-compatible meaning that minimal
-  usage of defuns (defining and calling) is thread-safe, but if users call other
-  methods or invoke the base `python_function` themselves, external
-  synchronization is necessary.
+  `Function` class is thread-compatible meaning that minimal usage of defuns
+  (defining and calling) is thread-safe, but if users call other methods or
+  invoke the base `python_function` themselves, external synchronization is
+  necessary.
   """
 
   def __init__(self,
@@ -773,8 +1201,10 @@ class PolymorphicFunction(object):
                name,
                input_signature=None,
                attributes=None,
-               autograph=True):
-    """Initializes a polymorphic function.
+               autograph=True,
+               autograph_options=None,
+               capture_by_value=None):
+    """Initializes a `Function`.
 
     Args:
       python_function: the function to be wrapped.
@@ -787,66 +1217,36 @@ class PolymorphicFunction(object):
       autograph: whether to use autograph to compile
         `python_function`. See https://www.tensorflow.org/guide/autograph for
         more information.
+      autograph_options: Experimental knobs to control behavior
+        `when autograph=True`. See https://www.tensorflow.org/guide/autograph
+        for more information.
+      capture_by_value: Experimental. Whether to capture resource variables by
+        value or reference. If None, will inherit from a parent context or
+        default to False.
 
     Raises:
       ValueError: if `input_signature` is not None and the `python_function`'s
         argspec has keyword arguments.
     """
-
     if isinstance(python_function, functools.partial):
       self._python_function = python_function.func
-      self._args_to_prepend = python_function.args or tuple()
-      self._kwargs_to_include = python_function.keywords or {}
     else:
       self._python_function = python_function
-      self._args_to_prepend = tuple()
-      self._kwargs_to_include = {}
+    self._function_spec = FunctionSpec.from_function_and_signature(
+        python_function, input_signature)
     self._name = name
     self._autograph = autograph
-    self._function_cache = collections.OrderedDict()
+    self._autograph_options = autograph_options
+    self._function_cache = FunctionCache()
     self._function_attributes = attributes or {}
+    self._capture_by_value = capture_by_value
 
     self._lock = threading.Lock()
     # _descriptor_cache is a of instance of a class to an instance-specific
-    # PolymorphicFunction, used to make sure defun-decorated methods create
-    # different functions for each instance.
+    # `Function`, used to make sure defun-decorated methods create different
+    # functions for each instance.
     self._descriptor_cache = weakref.WeakKeyDictionary()
 
-    fullargspec = tf_inspect.getfullargspec(self._python_function)
-    if tf_inspect.ismethod(self._python_function):
-      # Remove `self`: default arguments shouldn't be matched to it.
-      args = fullargspec.args[1:]
-    else:
-      args = fullargspec.args
-
-    # A cache mapping from argument name to index, for canonicalizing
-    # arguments that are called in a keyword-like fashion.
-    self._args_to_indices = {arg: i for i, arg in enumerate(args)}
-    self._arg_names = args
-    self._vararg_name = fullargspec.varargs
-    # A cache mapping from arg index to default value, for canonicalization.
-    offset = len(args) - len(fullargspec.defaults or [])
-    self._arg_indices_to_default_values = {
-        offset + index: default
-        for index, default in enumerate(fullargspec.defaults or [])
-    }
-    self._default_values = fullargspec.defaults
-    self._default_values_start_index = offset
-    if input_signature is None:
-      self._input_signature = None
-    else:
-      if fullargspec.varkw is not None or fullargspec.kwonlyargs:
-        raise ValueError("Cannot define a TensorFlow function from a Python "
-                         "function with keyword arguments when "
-                         "input_signature is provided.")
-
-      if not isinstance(input_signature, (tuple, list)):
-        raise TypeError("input_signature must be either a tuple or a "
-                        "list, received " + str(type(input_signature)))
-
-      self._input_signature = tuple(input_signature)
-      self._flat_input_signature = tuple(nest.flatten(input_signature))
-
   def __call__(self, *args, **kwargs):
     """Calls a graph function specialized to the inputs."""
     graph_function, args, kwargs = self._maybe_define_function(args, kwargs)
@@ -855,17 +1255,42 @@ class PolymorphicFunction(object):
   @property
   def python_function(self):
     """Returns the wrapped Python function."""
-    return self._python_function
+    return self._python_function  # pylint: disable=protected-access
 
-  def _get_concrete_function_internal(self, *args, **kwargs):
-    """Bypasses error checking when getting a graph function."""
+  @property
+  def function_spec(self):
+    return self._function_spec
+
+  @property
+  def _input_signature(self):
+    """Returns the input signature."""
+    return self._function_spec.input_signature  # pylint: disable=protected-access
+
+  @property
+  def _flat_input_signature(self):
+    """Returns the flattened input signature."""
+    return self._function_spec.flat_input_signature  # pylint: disable=protected-access
+
+  def _get_concrete_function_internal_garbage_collected(self, *args, **kwargs):
+    """Returns a concrete function which cleans up its graph function."""
     if self._input_signature:
       args, kwargs = None, None
     graph_function, _, _ = self._maybe_define_function(args, kwargs)
     return graph_function
 
+  def _get_concrete_function_internal(self, *args, **kwargs):
+    """Bypasses error checking when getting a graph function."""
+    graph_function = self._get_concrete_function_internal_garbage_collected(
+        *args, **kwargs)
+    # We're returning this concrete function to someone, and they may keep a
+    # reference to the FuncGraph without keeping a reference to the
+    # ConcreteFunction object. So we won't clean up the reference cycles
+    # manually and instead will leave them to Python's garbage collector.
+    graph_function._garbage_collector.release()  # pylint: disable=protected-access
+    return graph_function
+
   def get_concrete_function(self, *args, **kwargs):
-    """Returns a `Function` object specialized to inputs and execution context.
+    """Returns a `ConcreteFunction` specialized to inputs and execution context.
 
     Args:
       *args: inputs to specialize on.
@@ -878,9 +1303,7 @@ class PolymorphicFunction(object):
                          "input_signature is provided.")
       if args:
         # If args are provided, they must match the input signature.
-        try:
-          nest.assert_same_structure(self._input_signature, args)
-        except (ValueError, TypeError):
+        if not is_same_structure(self._input_signature, args):
           raise ValueError("Structure of Python function inputs does not match "
                            "input_signature.")
         flat_inputs = nest.flatten(args)
@@ -940,8 +1363,8 @@ class PolymorphicFunction(object):
   def __get__(self, instance, owner):
     """Makes it possible to defun instance methods."""
     del owner
-    # `instance` here is the instance that this `PolymorphicFunction` was
-    # accessed through; e.g., for
+    # `instance` here is the instance that this `Function` was accessed through
+    # e.g., for
     #
     #   class Foo(object):
     #
@@ -950,35 +1373,36 @@ class PolymorphicFunction(object):
     #       ...
     #
     #   foo = Foo()
-    #   foo.bar()  # `foo.bar` is a `PolymorphicFunction` instance
+    #   foo.bar()  # `foo.bar` is a `Function` instance
     #
     # then `instance` will be `foo` (and `owner` will be `Foo`).  We create a
-    # new instance of PolymorphicFunction here to allow different instances each
+    # new instance of `Function` here to allow different instances each
     # to create variables once, thereby allowing methods to be decorated with
     # defun. Keeps a cache to avoid retracing the function every time the
     # descriptor is accessed.
     if instance not in self._descriptor_cache:
       if instance is None:
         return self
-      # If there is no instance-specific polymorphic func in the cache,
-      # we construct an instance-specific polymorphic function
-      # that uses a weak reference to the instance (so that the instance will
-      # be correctly gc'd).
+      # If there is no instance-specific `Function` in the cache, we construct
+      # an instance-specific `Function` that uses a weak reference to the
+      # instance (so that the instance will be correctly gc'd).
 
       # And finally add the wrapped function to the description cache
       self._descriptor_cache[instance] = class_method_to_instance_method(
           self, instance)
 
-    # Return the cached polymorphic function for the instance
+    # Return the cached `Function` for the instance
     return self._descriptor_cache[instance]
 
-  def _cache_key(self, args, kwargs):
+  def _cache_key(self, args, kwargs, include_tensor_ranks_only=False):
     """Computes the cache key given inputs and execution context."""
     if self._input_signature is None:
       inputs = (args, kwargs) if kwargs else args
-      input_signature = pywrap_tensorflow.TFE_Py_EncodeArg(inputs)
+      input_signature = pywrap_tensorflow.TFE_Py_EncodeArg(
+          inputs, include_tensor_ranks_only)
     else:
       del args, kwargs
+      assert not include_tensor_ranks_only
       input_signature = self._flat_input_signature
 
     ctx = context.context()
@@ -1024,101 +1448,51 @@ class PolymorphicFunction(object):
     return CacheKey(input_signature, parent_graph, device_functions,
                     colocation_stack, uses_xla)
 
-  def _canonicalize_function_inputs(self, *args, **kwargs):
-    """Canonicalizes `args` and `kwargs`.
-
-    Canonicalize the inputs to the Python function using its fullargspec. In
-    particular, we parse the varags and kwargs that this
-    `PolymorphicFunction` was called with into a tuple corresponding to the
-    Python function's positional (named) arguments and a dictionary
-    corresponding to its kwargs.
-
-    Args:
-      *args: The varargs this object was called with.
-      **kwargs: The keyword args this function was called with.
-
-    Returns:
-      A canonicalized ordering of the inputs.
-
-    Raises:
-      ValueError: If a keyword in `kwargs` cannot be matched with a positional
-        argument when an input signature is specified, or when the inputs
-        do not conform to the input signature.
-    """
-    args = self._args_to_prepend + args
-    kwargs = dict(kwargs, **self._kwargs_to_include)
-    if not kwargs:
-      if self._default_values:
-        inputs = args + self._default_values[len(args) -
-                                             self._default_values_start_index:]
-      else:
-        inputs = args
-    else:
-      # Maps from index of arg to its corresponding value, according to `args`
-      # and `kwargs`; seeded with the default values for the named args that
-      # aren't in `args`.
-      arg_indices_to_values = {
-          index: default for index, default in six.iteritems(
-              self._arg_indices_to_default_values) if index >= len(args)
-      }
-      consumed_args = []
-      for arg, value in six.iteritems(kwargs):
-        index = self._args_to_indices.get(arg, None)
-        if index is not None:
-          arg_indices_to_values[index] = value
-          consumed_args.append(arg)
-        elif self._input_signature is not None:
-          raise ValueError("Cannot define a TensorFlow function from a Python "
-                           "function with keyword arguments when "
-                           "input_signature is provided.")
-      for arg in consumed_args:
-        # After this loop, `kwargs` will only contain true keyword arguments, as
-        # opposed to named arguments called in a keyword-like fashion.
-        kwargs.pop(arg)
-      inputs = args + _deterministic_dict_values(arg_indices_to_values)
-    flat_inputs = nest.flatten(inputs)
-
-    # Check for NumPy arrays in arguments and convert them to Tensors.
-    # TODO(nareshmodi): Skip ndarray conversion to tensor altogether, perhaps
-    # finding a way to store them directly in the cache key (currently not
-    # possible since ndarrays are not hashable).
-    need_packing = False
-    for index, value in enumerate(flat_inputs):
-      if type(value) == np.ndarray:
-        flat_inputs[index] = constant_op.constant(value)
-        need_packing = True
-    if need_packing:
-      inputs = nest.pack_sequence_as(structure=inputs,
-                                     flat_sequence=flat_inputs)
+  def _create_graph_function(self, args, kwargs, override_flat_arg_shapes=None):
+    """Create a `ConcreteFunction` from `args` and `kwargs`."""
     if self._input_signature is None:
-      return inputs, kwargs
+      arglen = len(args)
     else:
-      assert not kwargs
-      signature_relevant_inputs = inputs[:len(self._input_signature)]
-      try:
-        nest.assert_same_structure(self._input_signature,
-                                   signature_relevant_inputs)
-      except (ValueError, TypeError):
-        raise ValueError("Structure of Python function inputs does not match "
-                         "input_signature.")
-      signature_inputs_flat = nest.flatten(signature_relevant_inputs)
-      if any(not pywrap_tensorflow.IsTensor(arg)
-             for arg in signature_inputs_flat):
-        raise ValueError("When input_signature is provided, all inputs to "
-                         "the Python function must be Tensors.")
-      if any(not spec.is_compatible_with(other)
-             for spec, other in zip(self._flat_input_signature,
-                                    signature_inputs_flat)):
-        raise ValueError("Python inputs incompatible with input_signature: "
-                         "inputs (%s), input_signature (%s)" %
-                         (str(inputs), str(self._input_signature)))
-      return inputs, {}
+      arglen = len(self._input_signature)
+    base_arg_names = self._function_spec.arg_names[:arglen]
+    num_missing_args = arglen - len(self._function_spec.arg_names)
+    missing_arg_names = [self._function_spec.vararg_name] * num_missing_args
+    # Produce a list of missing args of the form ["arg_0", "arg_1", ...],
+    # where arg is based on the self._function_spec.vararg_name.
+    missing_arg_names = [
+        "%s_%d" % (arg, i) for i, arg in enumerate(missing_arg_names)
+    ]
+    arg_names = base_arg_names + missing_arg_names
+    graph_function = ConcreteFunction(
+        func_graph_module.func_graph_from_py_func(
+            self._name,
+            self._python_function,
+            args,
+            kwargs,
+            self._input_signature,
+            autograph=self._autograph,
+            autograph_options=self._autograph_options,
+            arg_names=arg_names,
+            override_flat_arg_shapes=override_flat_arg_shapes,
+            capture_by_value=self._capture_by_value),
+        self._function_attributes)
+
+    # pylint: disable=protected-access
+    # Tell the ConcreteFunction to clean up its graph once it goes out of
+    # scope. ConcreteFunction does not do this in its constructor since it
+    # gets used in some places (like Keras) where the FuncGraph lives
+    # longer than the ConcreteFunction.
+    graph_function._garbage_collector = ConcreteFunctionGarbageCollector(
+        graph_function.graph)
+    # pylint: enable=protected-access
+
+    return graph_function
 
   def _maybe_define_function(self, args, kwargs):
     """Gets a function for these inputs, defining it if necessary.
 
-    `args` and `kwargs` can be None if this `PolymorphicFunction` was created
-    with an `input_signature`.
+    `args` and `kwargs` can be None if this `Function` was created with an
+    `input_signature`.
 
     Args:
       args: The varargs for the Python function.
@@ -1131,61 +1505,98 @@ class PolymorphicFunction(object):
     Raises:
       ValueError: If inputs are incompatible with the input signature.
       TypeError: If the function inputs include non-hashable objects
+      RuntimeError: If there's an internal bug (inconsistency) in handling
+        shape relaxation retracing.
     """
     if self._input_signature is None or args is not None or kwargs is not None:
-      args, kwargs = self._canonicalize_function_inputs(*args, **kwargs)
+      args, kwargs = self._function_spec.canonicalize_function_inputs(
+          *args, **kwargs)
     cache_key = self._cache_key(args, kwargs)
+
+    try:
+      hash(cache_key)
+    except TypeError as e:
+      raise TypeError(
+          "Arguments supplied to `defun`-generated functions must be"
+          " hashable.  Original error: %s" % e)
+
     with self._lock:
-      try:
-        graph_function = self._function_cache.get(cache_key, None)
-      except TypeError:
-        raise TypeError("Arguments supplied to `defun`-generated functions "
-                        "must be hashable.")
-
-      if graph_function is None:
-        logging.vlog(1,
-                     "Creating new FuncGraph for Python function %r (key: %r)",
-                     self._python_function, cache_key)
-        if self._input_signature is None:
-          arglen = len(args)
-        else:
-          arglen = len(self._input_signature)
-        arg_names = (
-            self._arg_names[:arglen]
-            + [self._vararg_name] * (arglen - len(self._arg_names)))
-        graph_function = Function(
-            func_graph_module.func_graph_from_py_func(
-                self._name,
-                self._python_function,
-                args,
-                kwargs,
-                self._input_signature,
-                autograph=self._autograph,
-                arg_names=arg_names),
-            self._function_attributes)
-        self._function_cache[cache_key] = graph_function
+      graph_function = self._function_cache.primary.get(cache_key, None)
+      if graph_function is not None:
+        return graph_function, args, kwargs
+
+      logging.vlog(1,
+                   "Creating new FuncGraph for Python function %r (key: %r)",
+                   self._python_function, cache_key)
+      logging.vlog(2,
+                   "Python function signature [args: %s] [kwargs: %s]",
+                   str(args),
+                   str(kwargs))
+
+      call_context_key = cache_key.replace(input_signature=None)
+
+      # If there's a provided input signature, or XLA is being used, or
+      # there's no cache miss for this calling context so far, go ahead and
+      # build the function and bypass shape relaxation retracing.
+      if (self._input_signature is not None
+          or cache_key.uses_xla
+          or call_context_key not in self._function_cache.missed):
+        self._function_cache.missed.add(call_context_key)
+        graph_function = self._create_graph_function(args, kwargs)
+        self._function_cache.primary[cache_key] = graph_function
+        return graph_function, args, kwargs
+
+      rank_only_cache_key = self._cache_key(
+          args, kwargs, include_tensor_ranks_only=True)
+
+      arg_shapes = _flat_shape_list(args, kwargs)
+      relaxed_arg_shapes = self._function_cache.arg_relaxed_shapes.get(
+          rank_only_cache_key, None)
+      relaxed_arg_function = self._function_cache.arg_relaxed.get(
+          rank_only_cache_key, None)
+
+      if (relaxed_arg_function is not None
+          and _compatible_shapes(relaxed_arg_shapes, arg_shapes)):
+        return relaxed_arg_function, args, kwargs
+
+      if relaxed_arg_shapes is None:
+        relaxed_arg_shapes = arg_shapes
+      else:
+        if len(arg_shapes) != len(relaxed_arg_shapes):
+          raise RuntimeError("Expected arg_shapes len to match "
+                             "relaxed_arg_shapes len: %d vs. %d"
+                             % (len(arg_shapes), len(relaxed_arg_shapes)))
+        relaxed_arg_shapes = [
+            _common_shape(x, y) for (x, y) in zip(
+                arg_shapes, relaxed_arg_shapes)]
+      self._function_cache.arg_relaxed_shapes[rank_only_cache_key] = (
+          relaxed_arg_shapes)
+      graph_function = self._create_graph_function(
+          args, kwargs, override_flat_arg_shapes=relaxed_arg_shapes)
+      self._function_cache.arg_relaxed[rank_only_cache_key] = graph_function
+
       return graph_function, args, kwargs
 
 
 def register(func, *args, **kwargs):
-  """Register a specialization of a PolymorphicFunction into the graph.
+  """Register a specialization of a `Function` into the graph.
 
   This won't actually call the function with the inputs, and only put the
   function definition into graph. Register function with different input param
   will result into multiple version of functions registered in graph.
 
   Args:
-    func: the PolymorphicFunction instance that generated by a @defun
+    func: the `Function` instance that generated by a @defun
     *args: input arguments for the Python function.
     **kwargs: input keyword arguments for the Python function.
 
   Returns:
-    a `Function` object specialized to inputs and execution context.
+    a `ConcreteFunction` object specialized to inputs and execution context.
 
   Raises:
     ValueError: When the input function is not a defun wrapped python function.
   """
-  if not isinstance(func, PolymorphicFunction):
+  if not isinstance(func, Function):
     raise ValueError("Only defun function is allowed to be registered. "
                      "Got type: %s" % type(func))
   concrete_func = func.get_concrete_function(*args, **kwargs)
@@ -1200,22 +1611,24 @@ def validate_signature(signature):
                     "a possibly nested sequence of TensorSpec objects.")
 
 
-def defun(func=None, input_signature=None, autograph=True):
+def defun(func=None,
+          input_signature=None,
+          autograph=True,
+          experimental_autograph_options=None):
   """Compiles a Python function into a callable TensorFlow graph.
 
-  `defun` (short for "define function") trace-compiles a Python function
+  `defun` (short for "define function") compiles a Python function
   composed of TensorFlow operations into a callable that executes a `tf.Graph`
   containing those operations. The callable produced by `defun` contains only
   the subgraph of TensorFlow operations that were executed when the Python
   function was called with a particular input signature, defined as a list
   of the shapes and dtypes of the Python function's Tensor-valued arguments and
-  the values of its non-Tensor Python objects. In particular, `defun` is _not_ a
-  compiler for arbitrary Python code.
+  the values of its non-Tensor Python objects.
 
   When eager execution is enabled, the ability to create graphs from Python
   functions makes it possible to incrementally trade off debugability and
   interactivity for performance.  Functions compiled with `defun` cannot be
-  inspected with `pdb` and `print` statements; however, executing a graph
+  inspected with `pdb`; however, executing a graph
   generated by `defun` sometimes takes less time and memory than eagerly
   executing the corresponding Python function, since specifying computations as
   graphs allows for optimizations like automatic buffer reuse and
@@ -1306,6 +1719,7 @@ def defun(func=None, input_signature=None, autograph=True):
   outer graph otherwise.
 
   _Input Signatures_
+
   By default, `F = tf.contrib.eager.defun(f)` instantiates a separate graph
   for every unique sequence of the shapes and dtypes of Tensor arguments and
   the values of Python objects it is invoked with. For example, calling
@@ -1364,6 +1778,7 @@ def defun(func=None, input_signature=None, autograph=True):
   Tensors as arguments and must not take unnamed keyword arguments (**kwargs).
 
   _Tracing_
+
   Be aware that because `F` only logs TensorFlow operations, all the other
   Python code that `f` executes will only shape the _construction_ of the graphs
   that `F` executes: the Python code won't be executed when the graphs
@@ -1389,6 +1804,7 @@ def defun(func=None, input_signature=None, autograph=True):
   replace the call to `np.random.randn` with `tf.random_normal((5, 5))`.
 
   _Python Side-Effects_
+
   A corollary of the previous discussion on tracing is the following: If a
   Python function `f` has Python side-effects, then executing `f` multiple times
   will not necessarily be semantically equivalent to executing `F =
@@ -1396,7 +1812,8 @@ def defun(func=None, input_signature=None, autograph=True):
   that `defun` only captures the subgraph of TensorFlow operations that is
   constructed when `f` is called in a graph-building context.
 
-  _Python Control Flow_.
+  _Python Control Flow_
+
   The structure of many machine learning computations depend upon whether one is
   training or validating, and it is common to nest specialized logic under `if
   training:` blocks. By mapping each input signature to a unique graph, `defun`
@@ -1425,27 +1842,26 @@ def defun(func=None, input_signature=None, autograph=True):
   exact_outputs = lossy_matmul(W, x, training=False)
   ```
 
-  On the other hand, because `defun` generates graphs by tracing and not by
-  source code analysis, it fully unrolls Python `for` and `while` loops,
-  potentially creating large graphs. If your Python function has native loops
-  that run for many iterations, consider replacing them with `tf.while_loop`
-  operations.
+  _TensorFlow Control Flow_
 
-  When constructing graphs, `tf.Tensor` objects cannot be used as Python
-  `bool` objects. This means, for example, that you should replace code in `f`
-  resembling
+  When `autograph` is `True`, data-dependent control flow is allowed as well.
+  Control flow statements that depend on `Tensor` values are staged into
+  corresponding TensorFlow ops. For example, the following code will work as
+  expected:
 
   ```python
-
-  if tensor < 10:
-    true_fn()
-  else:
-    false_fn()
+  @tf.contrib.eager.defun
+  def dynamic_rnn_loop(cell, seq):
+    state, output = cell.zero_state()
+    for input in seq:
+      state, output = cell(input, state)
+    return output
   ```
 
-  with `tf.cond(tensor < 10, true_fn, false_fn)`.
+  For more information see `tf.autograph`.
 
   _Variables_
+
   TensorFlow operations related to variable creation and initialization are
   automatically lifted out of the graphs generated by `defun`. In practice, this
   implies that variable creation and initialization only happen the first time
@@ -1512,6 +1928,9 @@ def defun(func=None, input_signature=None, autograph=True):
     autograph: Whether `func` should be compiled before
       constructing the graph. See https://www.tensorflow.org/guide/autograph
       for more information.
+    experimental_autograph_options: Experimental knobs (in the form of a tuple
+      of tensorflow.autograph.Feature values) to control behavior when
+      autograph=True.
 
 
   Returns:
@@ -1527,13 +1946,15 @@ def defun(func=None, input_signature=None, autograph=True):
   return defun_with_attributes(
       func=func,
       input_signature=input_signature,
-      autograph=autograph)
+      autograph=autograph,
+      experimental_autograph_options=experimental_autograph_options)
 
 
 def defun_with_attributes(func=None,
                           input_signature=None,
                           attributes=None,
-                          autograph=True):
+                          autograph=True,
+                          experimental_autograph_options=None):
   """Compiles a Python function into a callable TensorFlow graph.
 
   This function supports adding extra function attributes. See detailed
@@ -1549,8 +1970,10 @@ def defun_with_attributes(func=None,
       whitelisted attribute name is allowed. Unwhitelisted attribute name or
       unsupported value will result into ValueError. `func_name` is also one of
       the whitelisted argument which is a python string, and sets the name for
-      this `Function` in the graph.
+      this `ConcreteFunction` in the graph.
     autograph: same as defun()'s autograph.
+    experimental_autograph_options: same as defun()'s
+      experimental_autograph_options.
 
   Returns:
     Same as the return value of defun, with attributes added to the function in
@@ -1570,12 +1993,13 @@ def defun_with_attributes(func=None,
       name = "function"
     return tf_decorator.make_decorator(
         function,
-        PolymorphicFunction(
+        Function(
             function,
             name,
             input_signature=input_signature,
             attributes=attributes,
-            autograph=autograph))
+            autograph=autograph,
+            autograph_options=experimental_autograph_options))
 
   # This code path is for the `foo = tfe.defun(foo, ...)` use case
   if func is not None:
@@ -1602,7 +2026,7 @@ class _WeakrefSelf(object):
 
 
 def class_method_to_instance_method(original_function, instance):
-  """Constructs a new PolymorphicFunction with `self` bound."""
+  """Constructs a new `Function` with `self` bound."""
   weak_instance = weakref.ref(instance)
 
   # Note: while we could bind to a weakref proxy instead, that causes the
@@ -1610,21 +2034,31 @@ def class_method_to_instance_method(original_function, instance):
   bound_method = types_lib.MethodType(original_function.python_function,
                                       _WeakrefSelf(weak_instance))
 
-  # original_function is expected to be of one of the two PolymorphicFunction
-  # types (defined either in function.py or def_function.py).
+  # original_function is expected to be of one of the two `Function` types
+  # (defined either in function.py or def_function.py).
   assert hasattr(original_function, "_name")
   assert hasattr(original_function, "_autograph")
   assert hasattr(original_function, "_input_signature")
   assert hasattr(original_function, "python_function")
 
+  weak_bound_method_wrapper = None
   def bound_method_wrapper(*args, **kwargs):
+    """Wraps either a dummy MethodType or a converted AutoGraph function."""
     # __wrapped__ allows AutoGraph to swap in a converted function.
-    wrapped_fn = bound_method_wrapper.__wrapped__
-    # If __wrapped__ was not replaced, then call original_function.
-    # TODO(b/119246461): This needs to be simplified.
-    if tf_inspect.ismethod(wrapped_fn):
+    strong_bound_method_wrapper = weak_bound_method_wrapper()
+    wrapped_fn = strong_bound_method_wrapper.__wrapped__
+
+    if wrapped_fn is strong_bound_method_wrapper.__original_wrapped__:
+      # If __wrapped__ was not replaced, then call original_function.
       wrapped_fn = original_function.python_function
+      if tf_inspect.ismethod(wrapped_fn):
+        wrapped_fn = six.get_unbound_function(wrapped_fn)
+      return wrapped_fn(weak_instance(), *args, **kwargs)
+
+    # If __wrapped__ was replaced, then it is always an unbound function
+    # that takes self as first argument.
     return wrapped_fn(weak_instance(), *args, **kwargs)
+  weak_bound_method_wrapper = weakref.ref(bound_method_wrapper)
 
   # pylint: disable=protected-access
   # We make a dummy MethodType object to generate the correct bound method
@@ -1641,3 +2075,39 @@ def class_method_to_instance_method(original_function, instance):
   wrapped_instance_func = tf_decorator.make_decorator(
       original_function.python_function, instance_func)
   return wrapped_instance_func
+
+
+class _FunctionGarbageCollector(object):
+  """Cleans up cycles when a defun goes out of scope."""
+
+  def __init__(self, cache):
+    self._cache = cache
+
+  def __del__(self):
+    if func_graph_module is None or memory is None:
+      return
+    try:
+      while self._cache:
+        self._cache.popitem()
+      memory.dismantle_ordered_dict(self._cache)
+    except:  # pylint: disable=bare-except
+      pass
+
+
+class ConcreteFunctionGarbageCollector(object):
+  """Cleans up reference cycles when a `ConcreteFunction` goes out of scope."""
+
+  def __init__(self, func_graph):
+    self._func_graph = func_graph
+
+  def release(self):
+    """Call off the FuncGraph deletion."""
+    self._func_graph = None
+
+  def __del__(self):
+    if func_graph_module is None or memory is None or self._func_graph is None:
+      return
+    try:
+      func_graph_module.dismantle_func_graph(self._func_graph)
+    except:  # pylint: disable=bare-except
+      pass
diff --git a/tensorflow/python/eager/function_argument_naming_test.py b/tensorflow/python/eager/function_argument_naming_test.py
index 9358c4fd07111f7adfbf60241727215f978b2a36..08a50a8f513425ff395b4b83de7a44183c12c757 100644
--- a/tensorflow/python/eager/function_argument_naming_test.py
+++ b/tensorflow/python/eager/function_argument_naming_test.py
@@ -220,10 +220,10 @@ class ArgumentNamingTests(test.TestCase, parameterized.TestCase):
         z=tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32),
         zz=tensor_spec.TensorSpec(shape=(), dtype=dtypes.float32, name='cust'))
     self.assertEqual(
-        ['x', 'y', 'args', 'second_variadic', 'z', 'cust'],
+        ['x', 'y', 'args_1', 'second_variadic', 'z', 'cust'],
         [inp.op.name for inp in variadic_op.inputs])
     self.assertEqual(
-        [b'x', b'y', b'args', b'second_variadic', b'z', b'cust'],
+        [b'x', b'y', b'args_1', b'second_variadic', b'z', b'cust'],
         [inp.op.get_attr('_user_specified_name')
          for inp in variadic_op.inputs])
 
@@ -244,10 +244,10 @@ class ArgumentNamingTests(test.TestCase, parameterized.TestCase):
     variadic_op = variadic_fn.get_concrete_function()
     self.assertIn(b'variadic_fn', variadic_op.name)
     self.assertEqual(
-        ['x', 'y', 'args', 'z'],
+        ['x', 'y', 'args_1', 'z'],
         [inp.op.name for inp in variadic_op.inputs])
     self.assertEqual(
-        [b'x', b'y', b'args', b'z'],
+        [b'x', b'y', b'args_1', b'z'],
         [inp.op.get_attr('_user_specified_name')
          for inp in variadic_op.inputs])
 
diff --git a/tensorflow/python/eager/function_gradients_test.py b/tensorflow/python/eager/function_gradients_test.py
index 98dec0b361b76eadbb107a7cd42e4deba6f2ea25..7cf77570e561b29a5b0f25782ceacc6b5ab17392 100644
--- a/tensorflow/python/eager/function_gradients_test.py
+++ b/tensorflow/python/eager/function_gradients_test.py
@@ -226,7 +226,8 @@ class FunctionGradientsTest(test.TestCase, parameterized.TestCase):
     self.assertAllEqual(g, 1.0)
 
   def testGradient(self):
-    matmul = def_function.function(math_ops.matmul)
+    # TODO(b/121134877): Remove the autograph override.
+    matmul = def_function.function(math_ops.matmul, autograph=False)
 
     def sq(x):
       return matmul(x, x, transpose_a=True)
@@ -696,7 +697,8 @@ class FunctionGradientsTest(test.TestCase, parameterized.TestCase):
     self.assertAllEqual(g2, 2.0)
 
   def testGradientWithKeywordArguments(self):
-    matmul = def_function.function(math_ops.matmul)
+    # TODO(b/121134877): Remove the autograph override.
+    matmul = def_function.function(math_ops.matmul, autograph=False)
 
     def sq(x):
       return matmul(a=x, b=x, transpose_a=True)
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py
index 50d1b4b6f77e203e1d9ebb278f1c356024a4226f..7a17eb6ee2196a695014196fb3f40b6a1296cc9c 100644
--- a/tensorflow/python/eager/function_test.py
+++ b/tensorflow/python/eager/function_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 import collections
 import functools
+import itertools
 from multiprocessing.pool import ThreadPool
 import sys
 import weakref
@@ -45,8 +46,12 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.keras.engine import training as keras_training
 from tensorflow.python.layers import convolutional
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_functional_ops
+from tensorflow.python.ops import gen_random_ops
+from tensorflow.python.ops import gen_resource_variable_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import list_ops
 from tensorflow.python.ops import math_ops
@@ -61,6 +66,13 @@ from tensorflow.python.util import nest
 from tensorflow.python.util import tf_inspect
 
 
+def total_function_cache(defined):
+  # pylint: disable=protected-access
+  return (set(defined._function_cache.primary)
+          | set(defined._function_cache.arg_relaxed))
+  # pylint: enable=protected-access
+
+
 class MiniModel(keras_training.Model):
   """Minimal model for mnist.
 
@@ -86,13 +98,102 @@ class DefunnedMiniModel(MiniModel):
 class FunctionTest(test.TestCase, parameterized.TestCase):
 
   def testBasic(self):
-    matmul = def_function.function(math_ops.matmul)
+    # TODO(b/121134877): Remove the autograph override.
+    matmul = def_function.function(math_ops.matmul, autograph=False)
     t = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
     sq = matmul(t, t, transpose_a=True)
     sq2 = matmul(sq, t, transpose_a=True)
     self.assertAllEqual(sq.numpy().reshape(-1), [10, 14, 14, 20])
     self.assertAllEqual(sq2.numpy().reshape(-1), [52, 76, 74, 108])
 
+  def testVariable(self):
+    v1 = variables.Variable(1.0)
+    add = def_function.function(lambda x, v: x + v1 + v)
+    v2 = variables.Variable(1.0)
+    x = constant_op.constant(1.0)
+    r = add(x, v2)
+    self.assertEqual(3.0, self.evaluate(r))
+
+  def testInputShapeFunctionRelaxation(self):
+    unknown_dim = [False]
+
+    @function.defun
+    def func(a):
+      if a._shape_tuple()[0] is None:
+        unknown_dim[0] = True
+      return a + 1
+
+    func(constant_op.constant([]))
+    self.assertFalse(unknown_dim[0])
+    self.assertLen(total_function_cache(func), 1)
+
+    func(constant_op.constant([1.0]))
+    self.assertFalse(unknown_dim[0])
+    self.assertLen(total_function_cache(func), 2)
+
+    func(constant_op.constant([1.0, 2.0]))
+    self.assertTrue(unknown_dim[0])
+    self.assertLen(total_function_cache(func), 2)
+
+  def testNestedInputShapeFunctionRelaxation(self):
+    unknown_dim = [False]
+
+    @function.defun
+    def func(a_, b_=None):
+      del a_  # Only used to check which cache is used.
+      self.assertEqual(b_[0]._shape_tuple(), ())
+      if b_[1]._shape_tuple()[0] is None:
+        unknown_dim[0] = True
+      return b_[0] + 1
+
+    a = 'hi'
+    b0 = constant_op.constant(1.0)
+    func(a, b_=[b0, constant_op.constant([])])
+    self.assertFalse(unknown_dim[0])
+    self.assertLen(total_function_cache(func), 1)
+
+    func(a, b_=[b0, constant_op.constant([1.0])])
+    self.assertFalse(unknown_dim[0])
+    self.assertLen(total_function_cache(func), 2)
+
+    func(a, b_=[b0, constant_op.constant([1.0, 1.0])])
+    self.assertTrue(unknown_dim[0])
+    self.assertLen(total_function_cache(func), 2)
+
+    unknown_dim[0] = False
+
+    # Now do the same except with a new a which is not a tensor; this should
+    # change the cache key.
+    a = 'bye'
+    func(a, b_=[b0, constant_op.constant([])])
+    self.assertFalse(unknown_dim[0])
+    self.assertLen(total_function_cache(func), 3)
+
+    # Since we already marked a cache miss for a function with the same
+    # non-input signatures, here we will immediately start relaxing shapes.
+    func(a, b_=[b0, constant_op.constant([1.0])])
+    self.assertTrue(unknown_dim[0])
+    self.assertLen(total_function_cache(func), 3)
+
+  def testFunctionRelaxationLosesInnerDimWithKerasLayer(self):
+    layer = keras.layers.Dense(1)
+    fn = def_function.function()(layer)
+
+    with self.captureWritesToStream(sys.stderr) as printed:
+      fn(array_ops.ones((3, 2)))
+      self.assertNotIn('ValueError', printed.contents())
+    with self.captureWritesToStream(sys.stderr) as printed:
+      # Use batch size 2 to trigger a second cache miss on the shape.
+      fn(array_ops.ones((2, 2)))
+      self.assertNotIn('ValueError', printed.contents())
+
+    # Shape relaxation passes TensorShape([None, None]), which causes layer
+    # matmul to fail, due to incompatible dims.  What would have been a graph
+    # build time error (layer would complain about the inner dim being 4).
+    with self.captureWritesToStream(sys.stderr) as printed:
+      with self.assertRaisesRegexp(errors.InvalidArgumentError, r'MatMul'):
+        fn(array_ops.ones((3, 4)))
+
   def testWastedAdd(self):
 
     @def_function.function()
@@ -107,6 +208,15 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
       t = constant_op.constant(1.0)
       self.assertAllEqual(add(t, t).numpy(), 2.0)
 
+  def testNoHash(self):
+
+    @def_function.function()
+    def f(_):
+      return 1.0
+
+    with self.assertRaisesRegexp(TypeError, 'set'):
+      f(set([]))
+
   def testFuncName(self):
 
     @function.defun_with_attributes(attributes={'func_name': 'multiply'})
@@ -123,7 +233,8 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     self.assertEqual(add_2._name, 'add_2')
 
   def testBasicGraphMode(self):
-    matmul = def_function.function(math_ops.matmul)
+    # TODO(b/121134877): Remove the autograph override.
+    matmul = def_function.function(math_ops.matmul, autograph=False)
 
     @def_function.function
     def sq(a):
@@ -134,7 +245,8 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     self.assertAllEqual(out, math_ops.matmul(t, t).numpy())
 
   def testNestedInputsGraphMode(self):
-    matmul = def_function.function(math_ops.matmul)
+    # TODO(b/121134877): Remove the autograph override.
+    matmul = def_function.function(math_ops.matmul, autograph=False)
 
     pair = collections.namedtuple('pair', ['a', 'b'])
 
@@ -148,7 +260,8 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     self.assertAllEqual(out, math_ops.matmul(t, t).numpy())
 
   def testNestedOutputsGraphMode(self):
-    matmul = def_function.function(math_ops.matmul)
+    # TODO(b/121134877): Remove the autograph override.
+    matmul = def_function.function(math_ops.matmul, autograph=False)
 
     pair = collections.namedtuple('pair', ['a', 'b'])
 
@@ -177,7 +290,8 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
       self.assertEqual(f().shape, ())
 
   def testBasicGraphFunction(self):
-    matmul = def_function.function(math_ops.matmul)
+    # TODO(b/121134877): Remove the autograph override.
+    matmul = def_function.function(math_ops.matmul, autograph=False)
 
     @def_function.function
     def sq(a):
@@ -191,7 +305,8 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     self.assertAllEqual(out, math_ops.matmul(t, t).numpy())
 
   def testInputSpecGraphFunction(self):
-    matmul = def_function.function(math_ops.matmul)
+    # TODO(b/121134877): Remove the autograph override.
+    matmul = def_function.function(math_ops.matmul, autograph=False)
 
     @def_function.function
     def sq(a):
@@ -210,7 +325,8 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     self.assertAllEqual(out2, math_ops.matmul(t2, t2).numpy())
 
   def testNestedInputSpecGraphFunction(self):
-    matmul = def_function.function(math_ops.matmul)
+    # TODO(b/121134877): Remove the autograph override.
+    matmul = def_function.function(math_ops.matmul, autograph=False)
 
     @def_function.function
     def sq(mats):
@@ -304,7 +420,8 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     self.assertAllEqual(f(), x)
 
   def testNestedInputsGraphFunction(self):
-    matmul = def_function.function(math_ops.matmul)
+    # TODO(b/121134877): Remove the autograph override.
+    matmul = def_function.function(math_ops.matmul, autograph=False)
 
     pair = collections.namedtuple('pair', ['a', 'b'])
 
@@ -321,7 +438,8 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     self.assertAllEqual(out, math_ops.matmul(t, t).numpy())
 
   def testNestedOutputGraphFunction(self):
-    matmul = def_function.function(math_ops.matmul)
+    # TODO(b/121134877): Remove the autograph override.
+    matmul = def_function.function(math_ops.matmul, autograph=False)
 
     @def_function.function
     def sq(a):
@@ -359,13 +477,13 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     x = random_ops.random_uniform([2, 2]).numpy()
     defined = function.defun(f)
     defined(x)
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
 
     x = random_ops.random_uniform([2, 2]).numpy()
     defined(x)
     # A NumPy array with different values but the same shape and dtype
     # shouldn't trigger another function definition.
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
 
     # Test that the numpy array is properly an argument to the graph function.
     self.assertEqual(1., defined(numpy.ones([])).numpy())
@@ -444,6 +562,50 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     value = tensor_init()
     self.assertAllEqual(value, 2.0)
 
+  @test_util.run_in_graph_and_eager_modes
+  def testGetConcreteFunctionCreatesVariables(self):
+
+    v_holder = []
+
+    @def_function.function
+    def tensor_init():
+      if not v_holder:
+        v_holder.append(variables.Variable(5.))
+      return v_holder[0].read_value()
+
+    concrete = tensor_init.get_concrete_function()
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllEqual(5., self.evaluate(concrete()))
+    self.assertAllEqual(5., self.evaluate(tensor_init()))
+
+  def testFuncGraphCaptureByValue(self):
+    v = variables.Variable(1.0)
+
+    def trivial_function():
+      return v.read_value()
+
+    graph_function = function.Function(
+        trivial_function, 'test', capture_by_value=True)
+
+    self.assertAllEqual(graph_function(), 1.0)
+    v.assign(2.0)
+    self.assertAllEqual(graph_function(), 1.0)
+
+  def testFuncGraphCaptureByValueNested(self):
+    v = variables.Variable(1.0)
+
+    def trivial_function():
+      return control_flow_ops.cond(
+          array_ops.placeholder_with_default(True, ()),
+          v.read_value, v.read_value)
+
+    graph_function = function.Function(
+        trivial_function, 'test', capture_by_value=True)
+
+    self.assertAllEqual(graph_function(), 1.0)
+    v.assign(2.0)
+    self.assertAllEqual(graph_function(), 1.0)
+
   def testDefunShapeInferenceWithCapturedResourceVariable(self):
     v = resource_variable_ops.ResourceVariable([[1, 2], [3, 4]])
 
@@ -462,6 +624,44 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     var_t = resource_variable_ops.read_variable_op(var_handle, dtype=v.dtype)
     self.assertEqual(var_t.shape, tensor_shape.TensorShape([2, 2]))
 
+  def testShapeInferenceForMoreSpecificInput(self):
+    self.skipTest('b/124219898')
+
+    def f(a):
+      return array_ops.reshape(a, [-1, 3])
+
+    signature = [tensor_spec.TensorSpec(None, dtypes.float32)]
+    compiled = def_function.function(f, input_signature=signature)
+
+    with ops.Graph().as_default():
+      inputs = array_ops.zeros([10, 10, 3])
+      self.assertAllEqual(f(inputs).shape, compiled(inputs).shape)
+
+  def testFuncListAttr(self):
+
+    @function.defun
+    def test_function(val):
+
+      def fn1():
+        return array_ops.ones([10])
+
+      fn2 = lambda: array_ops.ones([10]) * 2
+
+      def fn3(x=2):
+        return array_ops.ones([10]) * x
+      fn3 = functools.partial(fn3, x=3)
+
+      return gen_functional_ops.case(val, [], [dtypes.float32],
+                                     [function.defun(f).get_concrete_function()
+                                      for f in (fn1, fn2, fn3)])
+
+    ones = array_ops.ones([10])
+    self.assertAllEqual([ones], test_function(0))
+    self.assertAllEqual([ones * 2], test_function(1))
+    self.assertAllEqual([ones * 3], test_function(2))
+    self.assertAllEqual([ones * 3], test_function(22))  # default branch
+
+  @test_util.enable_control_flow_v2
   def testVariableInLoopInFunction(self):
 
     @function.defun
@@ -544,7 +744,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     self.assertIsInstance(
         self.v, resource_variable_ops.ResourceVariable)
 
-  def disabled_testRunMetadata(self):
+  def testRunMetadata(self):
 
     @def_function.function
     def f(x):
@@ -556,7 +756,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     run_metadata = context.export_run_metadata()
     context.disable_run_metadata()
     step_stats = run_metadata.step_stats
-    self.assertGreater(len(step_stats.dev_stats), 0)
+    self.assertNotEmpty(step_stats.dev_stats)
     cpu_stats = step_stats.dev_stats[0]
     self.assertEqual('/job:localhost/replica:0/task:0/device:CPU:0',
                      cpu_stats.device)
@@ -565,10 +765,10 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     # arbitrarily many (placeholders, return identities, etc, might be included
     # or not in the future, so shouldn't be tested for exactly.
     self.assertGreaterEqual(len(cpu_stats.node_stats), 2)
-    self.assertEqual(len(run_metadata.partition_graphs), 1)
+    self.assertLen(run_metadata.partition_graphs, 1)
 
   def testGraphModeCaptureVariable(self):
-    with context.graph_mode(), self.cached_session() as sess:
+    with context.graph_mode(), self.cached_session():
 
       class HasAVar(object):
 
@@ -579,7 +779,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
           return self.v * 2
 
       o = HasAVar()
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       call = def_function.function(o.call)
       op = call()
       self.assertAllEqual(self.evaluate(op), 2.0)
@@ -725,7 +925,8 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
       self.skipTest('No GPUs found')
 
     x = constant_op.constant([1.]).gpu()
-    f = def_function.function(math_ops.add)
+    # TODO(b/121134877): Remove the autograph override.
+    f = def_function.function(math_ops.add, autograph=False)
     y = f(x, x).cpu()
     self.assertAllEqual(y, [2.])
 
@@ -783,8 +984,9 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
       return None
 
     with self.assertRaisesRegexp(
-        errors.InvalidArgumentError, 'Could not colocate node with its '
-        'resource and reference inputs.*'):
+        errors.InvalidArgumentError,
+        'Cannot place the graph because a reference or resource edge connects '
+        'colocation groups with incompatible assigned devices'):
       if not context.executing_eagerly():
         self.evaluate(variables.global_variables_initializer())
       self.evaluate(resource_apply_adam())
@@ -794,7 +996,8 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
       self.skipTest('No GPUs found')
 
     # The Reshape op requires the shape tensor to be placed in host memory.
-    reshape = def_function.function(array_ops.reshape)
+    # TODO(b/121134877): Remove the autograph override.
+    reshape = def_function.function(array_ops.reshape, autograph=False)
     value = constant_op.constant([1., 2.]).gpu()
     shape = constant_op.constant([2, 1])
     reshaped = reshape(value, shape).cpu()
@@ -805,7 +1008,8 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
       self.skipTest('No GPUs found')
 
     # The Reshape op requires the shape tensor to be placed in host memory.
-    reshape = def_function.function(array_ops.reshape)
+    # TODO(b/121134877): Remove the autograph override.
+    reshape = def_function.function(array_ops.reshape, autograph=False)
     value = constant_op.constant([1., 2.])
     shape = constant_op.constant([2, 1]).gpu()
     reshape(value, shape)  # No error is raised
@@ -864,7 +1068,9 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
       self.assertEqual(1, int(self.evaluate(read())))
 
   def testSequenceInputs(self):
-    clip_by_global_norm = def_function.function(clip_ops.clip_by_global_norm)
+    # TODO(b/121134877): Remove the autograph override.
+    clip_by_global_norm = def_function.function(
+        clip_ops.clip_by_global_norm, autograph=False)
     t_list = [constant_op.constant(1.0), constant_op.constant(2.0)]
     clipped_list, global_norm = clip_by_global_norm(t_list,
                                                     constant_op.constant(.2))
@@ -887,7 +1093,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
                                   constant_op.constant(4)],
         constant_op.constant(5)
     ])
-    self.assertEqual(len(ret), 2)
+    self.assertLen(ret, 2)
     self.assertAllEqual(ret[0][0], 2)
     self.assertAllEqual(ret[0][1][0][0], 8)
     self.assertAllEqual(ret[0][1][0][1], 4)
@@ -936,8 +1142,31 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
 
     self.assertAllClose([[[[4.0]]]], self.evaluate(y))
 
-    # Remove reference cycles in model
-    test_util.dismantle_polymorphic_function(model)
+  # Variable lifting is somewhat different between defun/tf.function, so testing
+  # device placement on both makes sense.
+  @parameterized.named_parameters(
+      dict(testcase_name='Defun',
+           function_decorator=function.defun),
+      dict(testcase_name='DefFunction',
+           function_decorator=def_function.function))
+  @test_util.run_in_graph_and_eager_modes
+  def testVariablesPlacedOnOutsideDevice(self, function_decorator):
+
+    class _Obj(object):
+
+      def __init__(self):
+        self.v = None
+
+      @function_decorator
+      def f(self):
+        if self.v is None:
+          self.v = variables.Variable(1.)
+        return self.v + 1.
+
+    has_device = _Obj()
+    with ops.device('cpu:0'):
+      has_device.f()
+    self.assertIn('CPU', has_device.v.device)
 
   @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def testDefunKerasModelCall(self):
@@ -952,11 +1181,9 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
 
     self.assertAllEqual([[3.0]], self.evaluate(y))
 
-    # Remove reference cycles in defun.
-    test_util.dismantle_polymorphic_function(model.call)
     # Break the reference cycle between the MiniModel and the defun:
-    # MiniModel --(through its `call` method)--> PolymorphicFunction
-    # PolymorphicFunction --(instancemethod on MiniModel)--> MiniModel
+    # `MiniModel` --(through its `call` method)--> `Function`
+    # `Function` --(instancemethod on `MiniModel`)--> `MiniModel`
     del model.call
 
   # Note: The ConfigProto below unfortunately only configures graph
@@ -978,7 +1205,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
 
     defined = function.defun(multi_device_fn)
     outputs = self.evaluate(defined())
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
     self.assertIn(compat.as_bytes('CPU:0'), outputs[0])
     self.assertIn(compat.as_bytes('CPU:1'), outputs[1])
     self.assertIn(compat.as_bytes('CPU:2'), outputs[2])
@@ -986,7 +1213,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     with ops.device('/cpu:3'):
       outputs = self.evaluate(defined())
     # All function definitions are agnostic to call site devices.
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
     self.assertIn(compat.as_bytes('CPU:0'), outputs[0])
     self.assertIn(compat.as_bytes('CPU:1'), outputs[1])
     self.assertIn(compat.as_bytes('CPU:2'), outputs[2])
@@ -994,7 +1221,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
 
     with ops.device('/cpu:0'):
       outputs = self.evaluate(defined())
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
     self.assertIn(compat.as_bytes('CPU:0'), outputs[0])
     self.assertIn(compat.as_bytes('CPU:1'), outputs[1])
     self.assertIn(compat.as_bytes('CPU:2'), outputs[2])
@@ -1008,7 +1235,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     def func():
       return constant_op.constant(0)
 
-    defined = function.defun(func)
+    defined = def_function.function(func)
     with ops.device('cpu:0'):
       cpu_graph_function = defined.get_concrete_function()
 
@@ -1079,10 +1306,38 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
 
     defined = function.defun(func)
     defined(Foo())
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
 
     defined(Foo())
-    self.assertEqual(len(defined._function_cache), 2)
+    self.assertLen(total_function_cache(defined), 2)
+
+  def testCacheTensorDtypeCollision(self):
+
+    def func(t):
+      return t + t
+
+    defined = function.defun(func)
+    t = constant_op.constant([[1.0]], dtype=dtypes.complex64)
+    defined(t)
+    self.assertLen(total_function_cache(defined), 1)
+
+    t = constant_op.constant([[1.0]], dtype=dtypes.complex128)
+    defined(t)
+    self.assertLen(total_function_cache(defined), 2)
+
+  def testCacheTensorShapeCollision(self):
+
+    def func(t):
+      return t + t
+
+    defined = function.defun(func)
+    t = constant_op.constant([[1.0]], dtype=dtypes.complex64)
+    defined(t)
+    self.assertLen(total_function_cache(defined), 1)
+
+    t = constant_op.constant([1.0], dtype=dtypes.complex64)
+    defined(t)
+    self.assertLen(total_function_cache(defined), 2)
 
   def testCacheTensorShapeDtypeCollision(self):
 
@@ -1092,11 +1347,11 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     defined = function.defun(func)
     t = constant_op.constant([[1.0]], dtype=dtypes.complex64)
     defined(t)
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
 
     t = constant_op.constant([1.0], dtype=dtypes.complex128)
     defined(t)
-    self.assertEqual(len(defined._function_cache), 2)
+    self.assertLen(total_function_cache(defined), 2)
 
   def testCacheTensorUnknownShapesCollision(self):
 
@@ -1106,21 +1361,34 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     with context.graph_mode(), self.cached_session():
       defined = function.defun(func)
 
-      p = array_ops.placeholder(dtype=dtypes.float32, shape=None)
+      p = array_ops.placeholder(dtype=dtypes.float32, shape=[])
       defined(p)
-      self.assertEqual(len(defined._function_cache), 1)
+      self.assertLen(total_function_cache(defined), 1)
 
-      p = array_ops.placeholder(dtype=dtypes.float32, shape=[None])
+      p = array_ops.placeholder(dtype=dtypes.float32, shape=[1])
       defined(p)
-      self.assertEqual(len(defined._function_cache), 2)
+      self.assertLen(total_function_cache(defined), 2)
 
-      p = array_ops.placeholder(dtype=dtypes.float32, shape=[None, None])
+      p = array_ops.placeholder(dtype=dtypes.float32, shape=[2])
       defined(p)
-      self.assertEqual(len(defined._function_cache), 3)
-
-      t = constant_op.constant(1.0, dtype=dtypes.float32)
+      # Gradual shape relaxation is performed; and the common shape between
+      # [1] and [2] is one containing unknown dimensions.
+      self.assertLen(total_function_cache(defined), 2)
+
+      # pylint: disable=protected-access
+      self.assertLen(defined._function_cache.arg_relaxed_shapes, 1)
+      relaxed_shapes = (
+          list(defined._function_cache.arg_relaxed_shapes.values())[0])
+      self.assertEqual(len(relaxed_shapes), 1)
+      relaxed_shape = relaxed_shapes[0]
+      # pylint: enable=protected-access
+      self.assertEqual(relaxed_shape.rank, 1)
+      self.assertEqual(tensor_shape.dimension_value(relaxed_shape[0]), None)
+
+      t = constant_op.constant([1.0, 1.0, 1.0], dtype=dtypes.float32)
       defined(t)
-      self.assertEqual(len(defined._function_cache), 4)
+      # Shape (3,) matches the relaxed shape TensorShape([None])
+      self.assertLen(total_function_cache(defined), 2)
 
   def testPythonFunctionWithDefaultArgs(self):
 
@@ -1135,35 +1403,36 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
 
     def cache_keys():
       """Sanitizes cache keys of non-input metadata."""
-      return tuple(key[0] for key in defined._function_cache)
+      return tuple(key[0] for key in total_function_cache(defined))
 
     # `True` corresponds to the fact that we're executing eagerly
-    self.assertIn(('URRR', (0, 1, 20)), cache_keys())
+    self.assertIn(('URRRu', (0, 1, 20)), cache_keys())
 
     defined(1)  # bar=1, baz=2
-    self.assertIn(('URRR', (1, 1, 2)), cache_keys())
+    self.assertIn(('URRRu', (1, 1, 2)), cache_keys())
 
     # This matches the previous call.
     defined(foo=1)
-    self.assertEqual(len(defined._function_cache), 2)
+    self.assertLen(total_function_cache(defined), 2)
 
     defined(1, 2, 3)
-    self.assertIn(('URRR', (1, 2, 3)), cache_keys())
+    self.assertLen(total_function_cache(defined), 3)
+    self.assertIn(('URRRu', (1, 2, 3)), cache_keys())
 
     # This matches the previous call.
     defined(1, bar=2, baz=3)
-    self.assertEqual(len(defined._function_cache), 3)
+    self.assertLen(total_function_cache(defined), 3)
 
     # This matches the previous call.
     defined(1, baz=3, bar=2)
-    self.assertEqual(len(defined._function_cache), 3)
+    self.assertLen(total_function_cache(defined), 3)
 
   def testFunctoolsPartialUnwrappedCorrectly(self):
 
     def full_function(a, b, c=3):
       return a, b, c
 
-    partial = functools.partial(full_function, 1, c=3)
+    partial = functools.partial(full_function, 1, c=4)
     a, b, c = partial(2)
 
     defined = function.defun(partial)
@@ -1172,7 +1441,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     self.assertEqual(func_b.numpy(), b)
     self.assertEqual(func_c.numpy(), c)
 
-  def testInputSignatureWithCompatibleInputs(self):
+  def testInputSignatureWithMatchingInputs(self):
 
     def foo(a):
       self.assertEqual(a.shape, (2,))
@@ -1182,12 +1451,12 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     defined = function.defun(foo, input_signature=signature)
     a = array_ops.ones([2])
     self.assertAllEqual(a, defined(a))
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
     self.assertAllEqual(a, defined.get_concrete_function()(a))
     self.assertAllEqual(a, defined.get_concrete_function(a)(a))
     self.assertAllEqual(a, defined.get_concrete_function(
         tensor_spec.TensorSpec((2,), dtype=dtypes.float32))(a))
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
 
     def bar(a):
       self.assertEqual(a._shape_tuple(), (2, None))
@@ -1197,31 +1466,55 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     defined = function.defun(bar, input_signature=signature)
     a = array_ops.ones([2, 1])
     out = defined(a)
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
     self.assertAllEqual(out, a)
 
     # Changing the second dimension shouldn't create a new function.
     b = array_ops.ones([2, 3])
     out = defined(b)
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
     self.assertAllEqual(out, b)
 
+  def testInputSignatureWithCompatibleInputs(self):
+
+    rank2_spec = tensor_spec.TensorSpec(shape=(None, None),
+                                        dtype=dtypes.float32)
+
+    @function.defun(input_signature=[rank2_spec])
+    def func(a):
+      self.assertEqual([None, None], a.shape.as_list())
+      return array_ops.shape(a)
+
+    self.assertAllEqual([3, 1], func([[0], [1.0], [1]]))
+    self.assertAllEqual([2, 2], func(numpy.array([[1, 1], [2, 2]])))
+
+    with self.assertRaisesRegexp(ValueError, 'incompatible'):
+      func([0.0, 1.0, 2.0])  # Wrong shape.
+
+    with self.assertRaisesRegexp(ValueError, 'incompatible'):
+      func([['wrong dtype']])
+
   def testNestedInputSignatures(self):
 
+    def expected_foo(a, b):
+      return [a, b]
+
+    @function.defun(input_signature=[
+        [tensor_spec.TensorSpec((2, None), dtypes.float32)] * 2,
+        tensor_spec.TensorSpec((1,), dtypes.float32),
+    ])
     def foo(a, b):
       self.assertEqual(a[0]._shape_tuple(), (2, None))
       self.assertEqual(a[1]._shape_tuple(), (2, None))
       self.assertEqual(b._shape_tuple(), (1,))
       return [a, b]
 
-    signature = [[tensor_spec.TensorSpec((2, None), dtypes.float32)] * 2,
-                 tensor_spec.TensorSpec((1,), dtypes.float32)]
-    defined = function.defun(foo, input_signature=signature)
     a = array_ops.ones([2, 1])
     b = array_ops.ones([1])
-    out = defined([a, a], b)
-    self.assertEqual(len(defined._function_cache), 1)
-    nest.assert_same_structure(out, [[a, a], b])
+    expected = expected_foo([a, a], b)
+    out = foo([a, a], b)
+    self.assertLen(total_function_cache(foo), 1)
+    nest.assert_same_structure(out, expected)
     self.assertAllEqual(out[0][0], a)
     self.assertAllEqual(out[0][1], a)
     self.assertAllEqual(out[1], b)
@@ -1230,33 +1523,58 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     a = array_ops.ones([2, 3])
     b = array_ops.ones([2, 5])
     c = array_ops.ones([1])
-    out = defined([a, b], c)
-    self.assertEqual(len(defined._function_cache), 1)
-    nest.assert_same_structure(out, [[a, b], c])
+    expected = expected_foo([a, b], c)
+    out = foo([a, b], c)
+    self.assertLen(total_function_cache(foo), 1)
+    nest.assert_same_structure(out, expected)
     self.assertAllEqual(out[0][0], a)
     self.assertAllEqual(out[0][1], b)
     self.assertAllEqual(out[1], c)
 
+    # Passing compatible inputs should work.
+    a = a.numpy().tolist()
+    b = b.numpy().tolist()
+    c = c.numpy().tolist()
+    out = foo([a, b], c)
+    self.assertLen(total_function_cache(foo), 1)
+    nest.assert_same_structure(out, expected)
+    self.assertAllEqual(out[0][0], a)
+    self.assertAllEqual(out[0][1], b)
+    self.assertAllEqual(out[1], c)
+
+  def testNestedInputSignaturesWithDict(self):
+    def expected_bar(a):
+      return a
+
+    @function.defun(input_signature=[{
+        'a': tensor_spec.TensorSpec((2, None), dtypes.float32),
+        'b': tensor_spec.TensorSpec((2, None), dtypes.float32),
+        'c': tensor_spec.TensorSpec((1,), dtypes.float32)}])
     def bar(a):
       self.assertEqual(a['a']._shape_tuple(), (2, None))
       self.assertEqual(a['b']._shape_tuple(), (2, None))
       self.assertEqual(a['c']._shape_tuple(), (1,))
       return a
 
-    signature = [{
-        'a': tensor_spec.TensorSpec((2, None), dtypes.float32),
-        'b': tensor_spec.TensorSpec((2, None), dtypes.float32),
-        'c': tensor_spec.TensorSpec((1,), dtypes.float32)
-    }]
     a = array_ops.ones([2, 3])
     b = array_ops.ones([1])
     inputs = {'a': a, 'b': a, 'c': b}
-    defined = def_function.function(bar, input_signature=signature)
-    out = defined(inputs)
-    nest.assert_same_structure(out, inputs)
-    self.assertAllEqual(out['a'], inputs['a'])
-    self.assertAllEqual(out['b'], inputs['b'])
-    self.assertAllEqual(out['c'], inputs['c'])
+    expected = expected_bar(inputs)
+    out = bar(inputs)
+    nest.assert_same_structure(out, expected)
+    self.assertAllEqual(out['a'], expected['a'])
+    self.assertAllEqual(out['b'], expected['b'])
+    self.assertAllEqual(out['c'], expected['c'])
+
+    # Passing compatible inputs should work.
+    a = a.numpy().tolist()
+    b = b.numpy().tolist()
+    inputs = {'a': a, 'b': a, 'c': b}
+    out = bar(inputs)
+    nest.assert_same_structure(out, expected)
+    self.assertAllEqual(out['a'], expected['a'])
+    self.assertAllEqual(out['b'], expected['b'])
+    self.assertAllEqual(out['c'], expected['c'])
 
   def testInputSignatureMustBeSequenceOfTensorSpecs(self):
 
@@ -1275,6 +1593,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
                                  'tuple or a list.*'):
       function.defun(foo, input_signature=signature)
 
+  @test_util.run_in_graph_and_eager_modes
   def testInputsIncompatibleWithSignatureRaisesError(self):
 
     def foo(a):
@@ -1291,9 +1610,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
       defined(array_ops.ones([2, 1]))
 
     # Wrong number of arguments.
-    with self.assertRaisesRegexp(
-        ValueError,
-        'Arguments and signature arguments do not match.*'):
+    with self.assertRaisesRegexp(TypeError, 'Received 2 argument\(s\)'):
       defined(array_ops.ones([2]), array_ops.ones([2]))
     with self.assertRaisesRegexp(ValueError,
                                  'Structure of Python function inputs.*'):
@@ -1304,7 +1621,60 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
       defined.get_concrete_function(
           tensor_spec.TensorSpec(shape=(3,), dtype=dtypes.float32))
 
-  def testInputSignatureForFunctionWithNonTensorInputsNotAllowed(self):
+  def testInputsIncompatibleWithNestedSignatureRaisesError(self):
+
+    def foo(a, b):
+      return [a, b]
+
+    signature = [[tensor_spec.TensorSpec((1,), dtypes.float32)] * 2,
+                 [tensor_spec.TensorSpec((1,), dtypes.float32)] * 2]
+    defined = function.defun(foo, input_signature=signature)
+    a = array_ops.ones([1])
+
+    with self.assertRaisesRegexp(ValueError,
+                                 'Structure of Python function inputs.*'):
+      defined([a, a, a], [a])
+
+    with self.assertRaisesRegexp(ValueError,
+                                 'Structure of Python function inputs.*'):
+      defined([a], [a, a, a])
+    defined([a, a], [a, a])
+
+  def testUnderspecifiedInputSignature(self):
+    @function.defun(input_signature=[
+        tensor_spec.TensorSpec([], dtypes.float32),
+    ])
+    def foo(a, training=True):
+      if training:
+        return a
+      else:
+        return -1.0 * a
+
+    x = constant_op.constant(1.0)
+    with self.assertRaisesRegexp(TypeError, 'only pass arguments'):
+      foo(x, training=True)
+
+    with self.assertRaisesRegexp(TypeError, 'only pass arguments'):
+      foo(x, training=False)
+
+    self.assertAllEqual(x.numpy(), foo(x).numpy())
+
+  def testInputSignatureWithPartialFunction(self):
+    self.skipTest('b/124441704')
+    def full_function(a, b, c=3.0):
+      return a, b, c
+
+    partial = functools.partial(full_function, 1, c=4)
+    a, b, c = partial(2.0)
+    signature = [tensor_spec.TensorSpec([], dtypes.float32)]
+    defined = function.defun(partial, input_signature=signature)
+    x = constant_op.constant(2.0)
+    func_a, func_b, func_c = defined(x)
+    self.assertEqual(func_a.numpy(), a)
+    self.assertEqual(func_b.numpy(), b)
+    self.assertEqual(func_c.numpy(), c)
+
+  def testInputSignatureConversionWithDefaultArg(self):
 
     def foo(a, training=True):
       if training:
@@ -1318,11 +1688,9 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     ]
     defined = def_function.function(foo, input_signature=signature)
     a = constant_op.constant(1.0)
-    with self.assertRaisesRegexp(
-        ValueError,
-        'When input_signature is provided, all inputs to '
-        'the Python function must be Tensors.'):
-      defined(a, training=True)
+    self.assertAllEqual(a.numpy(), defined(a))
+    self.assertAllEqual(a.numpy(), defined(a, training=True))
+    self.assertAllEqual(-a.numpy(), defined(a, training=False))
 
   def testInputSignatureWithKeywordPositionalArgs(self):
 
@@ -1337,22 +1705,22 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     integer = constant_op.constant(2, dtypes.int64)
 
     out1, out2 = foo(flt, integer)
-    self.assertEqual(len(foo._function_cache), 1)
+    self.assertLen(total_function_cache(foo), 1)
     self.assertEqual(out1.numpy(), 1.0)
     self.assertEqual(out2.numpy(), 2)
 
     out1, out2 = foo(flt=flt, integer=integer)
-    self.assertEqual(len(foo._function_cache), 1)
+    self.assertLen(total_function_cache(foo), 1)
     self.assertEqual(out1.numpy(), 1.0)
     self.assertEqual(out2.numpy(), 2)
 
     out1, out2 = foo(integer=integer, flt=flt)
-    self.assertEqual(len(foo._function_cache), 1)
+    self.assertLen(total_function_cache(foo), 1)
     self.assertEqual(out1.numpy(), 1.0)
     self.assertEqual(out2.numpy(), 2)
 
     out1, out2 = foo(flt, integer=integer)
-    self.assertEqual(len(foo._function_cache), 1)
+    self.assertLen(total_function_cache(foo), 1)
     self.assertEqual(out1.numpy(), 1.0)
     self.assertEqual(out2.numpy(), 2)
 
@@ -1382,27 +1750,27 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     a = constant_op.constant(2.0)
     b = constant_op.constant([1.0, 2.0])
     one = defined(a, b)
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
 
     two = defined(a=a, b=b)
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
 
     three = defined(b=b, a=a)
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
 
     four = defined(a, b=b)
-    self.assertEqual(len(defined._function_cache), 1)
+    self.assertLen(total_function_cache(defined), 1)
 
     # The next call corresponds to a new input signature, hence
     # we expect another function to be defined.
     five = defined(b, a)
-    self.assertEqual(len(defined._function_cache), 2)
+    self.assertLen(total_function_cache(defined), 2)
 
     six = defined(a=b, b=a)
-    self.assertEqual(len(defined._function_cache), 2)
+    self.assertLen(total_function_cache(defined), 2)
 
     seven = defined(b=a, a=b)
-    self.assertEqual(len(defined._function_cache), 2)
+    self.assertLen(total_function_cache(defined), 2)
 
     self.assertAllEqual(one, [1.0, 2.0])
     self.assertAllEqual(two, [1.0, 2.0])
@@ -1484,35 +1852,24 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
 
         graph = ops.get_default_graph()
         # pylint: disable=protected-access
-        self.assertEqual(len(graph._functions), 2)
+        self.assertLen(graph._functions, 2)
         functions = list(graph._functions.values())
         self.assertRegexpMatches(
             functions[0].definition.signature.name, '.*matmul.*')
         attrs = functions[0].definition.attr
-        self.assertEqual(len(attrs), 2)
+        self.assertLen(attrs, 2)
         self.assertEqual(attrs['experimental_1'].s, b'value1')
         self.assertEqual(attrs['experimental_2'].i, 2)
 
         self.assertRegexpMatches(
             functions[1].definition.signature.name, '.*add.*')
         attrs = functions[1].definition.attr
-        self.assertEqual(len(attrs), 2)
+        self.assertLen(attrs, 2)
         self.assertEqual(attrs['experimental_3'].b, True)
         self.assertEqual(attrs['experimental_4'].f, 1.0)
         # pylint: enable=protected-access
 
   def testFunctionWithInvalidAttribute(self):
-    @function.defun_with_attributes(attributes={'attr1': 'value1'})
-    def matmul(x, y):
-      return math_ops.matmul(x, y)
-
-    with self.assertRaisesRegexp(ValueError,
-                                 '.*Attribute name is not whitelisted.*'):
-      with context.graph_mode(), self.cached_session():
-        with ops.get_default_graph().as_default():
-          t = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
-          matmul(t, t)
-
     @function.defun_with_attributes(attributes={'experimental_1': ['value1']})
     def add(x, y):
       return math_ops.add(x, y)
@@ -1524,7 +1881,8 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
           t = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
           add(t, t)
 
-  def testRegisterPolymorphicFunction(self):
+  def testRegisterFunction(self):
+
     @function.defun
     def add(x, y):
       return math_ops.add(x, y)
@@ -1541,7 +1899,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
 
         graph = ops.get_default_graph()
         # pylint: disable=protected-access
-        self.assertEqual(len(graph._functions), 6)
+        self.assertLen(graph._functions, 6)
         # two sets of functions, each of them are (inference, forward, backward)
         functions = list(graph._functions.values())
         captured_function_names = [
@@ -1580,7 +1938,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
         self.assertAllEqual(double.eval().reshape(-1), [2, 4, 6, 8])
         # Make sure the pre registered function is used, and no other function
         # is added.
-        self.assertEqual(len(graph._functions), 6)
+        self.assertLen(graph._functions, 6)
         functions = list(graph._functions.values())
         for i in range(len(functions)):
           self.assertEqual(captured_function_names[i],
@@ -1617,7 +1975,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
 
         graph = ops.get_default_graph()
         # pylint: disable=protected-access
-        self.assertEqual(len(graph._functions), 6)
+        self.assertLen(graph._functions, 6)
         # two sets of functions, each of them are (inference, forward, backward)
         functions = list(graph._functions.values())
         captured_function_names = [
@@ -1643,7 +2001,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
         self.assertAllEqual([[1, 2], [3, 4]], self.evaluate(composite_t))
         # Make sure the pre registered function is used, and no other function
         # is added.
-        self.assertEqual(len(graph._functions), 6)
+        self.assertLen(graph._functions, 6)
 
   def testRegisterFunctionWithInputSignature(self):
     def matmul(x, y):
@@ -1661,12 +2019,12 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
 
         graph = ops.get_default_graph()
         # pylint: disable=protected-access
-        self.assertEqual(len(graph._functions), 3)
+        self.assertLen(graph._functions, 3)
 
         # Test register function with cache, note inputs are ignored.
         function.register(defun_matmul)
         graph = ops.get_default_graph()
-        self.assertEqual(len(graph._functions), 3)
+        self.assertLen(graph._functions, 3)
 
   def testRegisterFunctionWithCache(self):
     def matmul(x, y):
@@ -1683,7 +2041,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
         graph = ops.get_default_graph()
         # Only one function is registered since the input param are in same type
         # pylint: disable=protected-access
-        self.assertEqual(len(graph._functions), 3)
+        self.assertLen(graph._functions, 3)
 
   def testCallingFunctionWithDifferentVariables(self):
 
@@ -1694,8 +2052,8 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
 
     v = resource_variable_ops.ResourceVariable(0.0)
     graph_function = foo.get_concrete_function(v)
-    self.assertEqual(len(graph_function.inputs), 1)
-    self.assertEqual(len(graph_function.captured_inputs), 0)
+    self.assertLen(graph_function.inputs, 1)
+    self.assertEmpty(graph_function.captured_inputs)
 
     self.assertEqual(float(graph_function(v)), 1.0)
     self.assertEqual(float(graph_function(v)), 2.0)
@@ -1718,39 +2076,35 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
       return x
 
     graph_function = foo.get_concrete_function(constant_op.constant(1.0))
-    with self.assertRaisesRegexp(ValueError, 'All inputs to `Function`s must '
-                                 'be Tensors;.*'):
+    with self.assertRaisesRegexp(
+        ValueError, 'All inputs to `ConcreteFunction`s must be Tensors;.*'):
       graph_function('Not a Tensor.')
 
   def testSwapImplementationWithGrapplerPlugin(self):
+    # Set the min_graph_nodes to -1 since the graph in this test is too small,
+    # and will be ignored by grappler if don't set this.
     rewrites = rewriter_config_pb2.RewriterConfig()
-    # function_optimizer has to be turn off, otherwise it will delete the
-    # registered function if it does not get called.
-    # TODO(scottzhu): Move the ExperimentalImplementationSelector to be called
-    # before function_optimizer in future.
-    rewrites.function_optimization = rewriter_config_pb2.RewriterConfig.OFF
-    customer_optimizer = rewrites.custom_optimizers.add()
-    customer_optimizer.name = 'ExperimentalImplementationSelector'
+    rewrites.implementation_selector = rewriter_config_pb2.RewriterConfig.ON
     rewrites.min_graph_nodes = -1
     graph_options = config_pb2.GraphOptions(
         rewrite_options=rewrites, build_cost_model=1)
     config = config_pb2.ConfigProto(graph_options=graph_options)
 
     with context.graph_mode(), self.cached_session(
-        config=config, graph=ops.Graph(), use_gpu=True) as sess:
+        config=config, graph=ops.Graph(), use_gpu=True):
 
       @function.defun_with_attributes(
           attributes={
-              'experimental_api_implements': 'random_boost',
-              'experimental_api_preferred_device': 'CPU'
+              'api_implements': 'random_boost',
+              'api_preferred_device': 'CPU'
           })
       def cpu_boost(x):
         return math_ops.add(x, 2.0)
 
       @function.defun_with_attributes(
           attributes={
-              'experimental_api_implements': 'random_boost',
-              'experimental_api_preferred_device': 'GPU'
+              'api_implements': 'random_boost',
+              'api_preferred_device': 'GPU'
           })
       def gpu_boost(x):
         return math_ops.add(x, 4.0)
@@ -1784,18 +2138,43 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
       with ops.Graph().as_default():
         x = constant_op.constant(11)
         maybe_add(x, True)
-        self.assertEqual(len(maybe_add._function_cache), 1)
-        self.assertEqual(len(add._function_cache), 1)
+        self.assertLen(total_function_cache(maybe_add), 1)
+        self.assertLen(total_function_cache(add), 1)
 
         maybe_add(x, False)
-        self.assertEqual(len(maybe_add._function_cache), 2)
-        self.assertEqual(len(add._function_cache), 1)
+        self.assertLen(total_function_cache(maybe_add), 2)
+        self.assertLen(total_function_cache(add), 1)
 
       with ops.Graph().as_default():
         x = constant_op.constant(11)
         maybe_add(x, True)
-        self.assertEqual(len(maybe_add._function_cache), 3)
-        self.assertEqual(len(add._function_cache), 2)
+        self.assertLen(total_function_cache(maybe_add), 3)
+        self.assertLen(total_function_cache(add), 2)
+
+  def testCacheKeyOverlappingShapes(self):
+    @function.defun
+    def defined(t):
+      return t
+
+    defined(array_ops.zeros([12, 1]))
+    self.assertLen(total_function_cache(defined), 1)
+
+    defined(array_ops.zeros([1, 21]))
+    self.assertLen(total_function_cache(defined), 2)
+
+  def testCacheKeyNestedLists(self):
+    @function.defun
+    def defined(l):
+      return l
+
+    a = constant_op.constant(1.)
+    b = constant_op.constant(2.)
+    c = constant_op.constant(3.)
+    defined([[a], b, c])
+    self.assertLen(total_function_cache(defined), 1)
+
+    defined([[a, b], c])
+    self.assertLen(total_function_cache(defined), 2)
 
   def testDecoratedMethod(self):
     m = DefunnedMiniModel()
@@ -1807,7 +2186,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     self.assertAllEqual(instance_call_one, instance_call_two)
     self.assertAllEqual(instance_call_one, class_call)
 
-  def testDecoratedMethodUniquePolymorphicFuncPerInstance(self):
+  def testDecoratedMethodUniqueFunctionPerInstance(self):
     m = DefunnedMiniModel()
     n = DefunnedMiniModel()
 
@@ -2017,7 +2396,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     m = DefunnedMiniModel()
     m(array_ops.ones([1, 2]))
     weak_variables = weakref.WeakSet(m.variables)
-    self.assertEqual(2, len(weak_variables))
+    self.assertLen(weak_variables, 2)
     del m
     self.assertEqual([], list(weak_variables))
 
@@ -2040,6 +2419,323 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
             5,
             add_five(constant_op.constant(0, dtype=dtypes.int32)).numpy())
 
+  @test_util.assert_no_garbage_created
+  def testReferenceCycles(self):
+
+    fn = function.defun(lambda x: 2. * x)
+
+    fn(constant_op.constant(4.0))
+    weak_fn = weakref.ref(fn)
+    del fn
+    # Tests that the weak reference we made to the function is now dead, which
+    # means the object has been deleted. This should be true as long as the
+    # function itself is not involved in a reference cycle.
+    self.assertIs(None, weak_fn())
+
+  def testFunctionStackInErrorMessage(self):
+    if context.executing_eagerly():
+      # TODO(b/122736651): Remove this skipTest once fixed.
+      self.skipTest('Error interpolation is not working when function is '
+                    'invoked without PartitionedCallOp.')
+
+    @def_function.function()
+    def fn3(x):
+      return x + 2
+
+    @def_function.function()
+    def fn2(x):
+      check_ops.assert_equal(fn3(x), 3)
+      return 2
+
+    @def_function.function()
+    def fn(x):
+      return fn2(x)
+
+    with self.assertRaises(errors.InvalidArgumentError) as cm:
+      fn(2)
+    e = cm.exception
+    self.assertIn('fn -> fn2', e.message)
+    self.assertIn('node assert_equal/Assert/Assert (defined at', e.message)
+    self.assertNotIn('fn3', e.message)
+
+  def testFunctionIsNotPinned(self):
+    """Tests that functions aren't pinned to the CPU by the eager runtime."""
+    if not context.context().num_gpus():
+      self.skipTest('No GPUs found.')
+    seed1, seed2 = 79, 25
+    shape = constant_op.constant([4, 7])
+    dtype = dtypes.float32
+
+    @def_function.function
+    def func():
+      with ops.device('GPU:0'):
+        return gen_random_ops.random_standard_normal(
+            shape, dtype=dtype, seed=seed1, seed2=seed2)
+
+    with ops.device('GPU:0'):
+      x = func()
+      self.assertRegexpMatches(x.device, 'GPU')
+
+
+class MultiDeviceTest(test.TestCase, parameterized.TestCase):
+
+  def testMultiDeviceOutput(self):
+    """Tests that functions can produce outputs on multiple devices."""
+    if not context.context().num_gpus():
+      self.skipTest('No GPUs found.')
+
+    @function.defun
+    def func(a, b, transpose_a):
+      with ops.device('/device:CPU:0'):
+        m1 = math_ops.matmul(a, b, transpose_a=transpose_a)
+      with ops.device('/device:GPU:0'):
+        m2 = math_ops.matmul(a, b, transpose_a=transpose_a)
+      return m1, m2
+
+    t = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
+    m1, m2 = func(t, t, transpose_a=True)
+    self.assertAllEqual(m1.numpy(), [[10, 14], [14, 20]])
+    self.assertRegexpMatches(m1.backing_device, 'CPU')
+    self.assertAllEqual(m2.numpy(), [[10, 14], [14, 20]])
+    self.assertRegexpMatches(m2.backing_device, 'GPU')
+
+  def testEmptyBody(self):
+    if not context.context().num_gpus():
+      self.skipTest('No GPUs found.')
+
+    @function.defun
+    def func(a, b):
+      return b, a
+
+    with ops.device('/device:CPU:0'):
+      a = constant_op.constant(3.0)
+    with ops.device('/device:GPU:0'):
+      b = constant_op.constant(5.0)
+
+    m1, m2 = func(a, b)
+    self.assertAllEqual(m1.numpy(), 5.0)
+    self.assertRegexpMatches(m1.backing_device, 'GPU')
+    self.assertAllEqual(m2.numpy(), 3.0)
+    self.assertRegexpMatches(m2.backing_device, 'CPU')
+
+  def testMultiDeviceInt32(self):
+    """Tests that multi-device functions can take and output INT32s.
+
+    When an INT32 device tensor is fed into a function, it is copied to CPU
+    by the eager runtime. The function sees all INT32 inputs on CPU.
+
+    We set allocator attribute 'on_host' for INT32 outputs. They can be
+    partitioned into the GPU component function, but will be allocated on
+    CPU nevertheless.
+
+    There is experimental support for `ints_on_device` in
+    FunctionLibraryRuntime now. We can try that.
+
+    """
+    if not context.context().num_gpus():
+      self.skipTest('No GPUs found.')
+
+    with ops.device('/device:CPU:0'):
+      int_cpu = constant_op.constant(3, dtype=dtypes.int32)
+      resource = resource_variable_ops.ResourceVariable(5, dtype=dtypes.int32)
+    with ops.device('/device:GPU:0'):
+      int_gpu = constant_op.constant(7, dtype=dtypes.int32)
+
+    @function.defun
+    def func(int_cpu, resource, int_gpu):
+      with ops.device('/device:CPU:0'):
+        m1 = int_cpu * resource + int_gpu
+      with ops.device('/device:GPU:0'):
+        # This computation will happen on GPU but m2 will be copied to CPU.
+        m2 = int_gpu * resource + int_cpu + 1
+      return m1, m2
+
+    m1, m2 = func(int_cpu, resource, int_gpu)
+    self.assertAllEqual(m1.numpy(), 22)
+    self.assertRegexpMatches(m1.backing_device, 'CPU')
+    self.assertAllEqual(m2.numpy(), 39)
+    self.assertRegexpMatches(m2.backing_device, 'CPU')
+
+    # flip arguments
+    m1, m2 = func(int_gpu, resource, int_cpu)
+    self.assertAllEqual(m1.numpy(), 38)
+    self.assertRegexpMatches(m1.backing_device, 'CPU')
+    self.assertAllEqual(m2.numpy(), 23)
+    self.assertRegexpMatches(m2.backing_device, 'CPU')
+
+  def testMultiDeviceColocateWith(self):
+    """Tests that function's outputs respect colocation constraints."""
+    if not context.context().num_gpus():
+      self.skipTest('No GPUs found.')
+
+    @function.defun
+    def func(a, b):
+      with ops.colocate_with(a):
+        ra = 2 * a
+      with ops.colocate_with(b):
+        rb = 3 * b
+      return ra, rb
+
+    devices = ['/device:CPU:0', '/device:GPU:0']
+    for dev1, dev2 in itertools.product(devices, devices):
+      with ops.device(dev1):
+        a = constant_op.constant(1.0)
+      with ops.device(dev2):
+        b = constant_op.constant(10.0)
+
+      ra, rb = func(a, b)
+      self.assertEqual(ra.numpy(), 2.0)
+      self.assertRegexpMatches(ra.backing_device, dev1)
+      self.assertEqual(rb.numpy(), 30.0)
+      self.assertRegexpMatches(rb.backing_device, dev2)
+
+  def testMultiDeviceResources(self):
+    if not context.context().num_gpus():
+      self.skipTest('No GPUs found.')
+
+    with ops.device('/device:CPU:0'):
+      c1 = resource_variable_ops.ResourceVariable(2.0)
+      c2 = resource_variable_ops.ResourceVariable(7.0)
+    with ops.device('/device:GPU:0'):
+      g1 = resource_variable_ops.ResourceVariable(3.0)
+      g2 = resource_variable_ops.ResourceVariable(5.0)
+
+    @function.defun
+    def func(resource1, resource2):
+      with ops.device('/device:CPU:0'):
+        result1 = resource1 * g2
+      with ops.device('/device:GPU:0'):
+        result2 = resource2 * c2
+      return result1, result2
+
+    r1, r2 = func(c1, g1)
+    self.assertEqual(r1.numpy(), 10.0)
+    self.assertRegexpMatches(r1.backing_device, 'CPU')
+    self.assertEqual(r2.numpy(), 21.0)
+    self.assertRegexpMatches(r2.backing_device, 'GPU')
+
+    # Call with flipped inputs. Check that we look at resource's
+    # device and reinstantiates the function when inputs' devices change.
+    r1, r2 = func(g1, c1)
+    self.assertEqual(r1.numpy(), 15.0)
+    self.assertRegexpMatches(r1.backing_device, 'CPU')
+    self.assertEqual(r2.numpy(), 14.0)
+    self.assertRegexpMatches(r2.backing_device, 'GPU')
+
+  def testOutputResources(self):
+    if not context.context().num_gpus():
+      self.skipTest('No GPUs found.')
+
+    with ops.device('/device:CPU:0'):
+      c1 = resource_variable_ops.ResourceVariable(2.0)
+    with ops.device('/device:GPU:0'):
+      g1 = resource_variable_ops.ResourceVariable(3.0)
+
+    @function.defun
+    def func(resource1, resource2):
+      with ops.device('/device:CPU:0'):
+        result1 = resource1 * 5
+      with ops.device('/device:GPU:0'):
+        result2 = resource2 * 7
+      return result1, resource1.handle, result2, resource2.handle
+
+    r1, res1, r2, res2 = func(c1, g1)
+    self.assertEqual(r1.numpy(), 10.0)
+    self.assertRegexpMatches(r1.backing_device, 'CPU')
+    self.assertEqual(r2.numpy(), 21.0)
+    self.assertRegexpMatches(r2.backing_device, 'GPU')
+
+    def check_handle(handle, expected_value):
+      self.assertRegexpMatches(handle.backing_device, 'CPU')
+      tensor = gen_resource_variable_ops.read_variable_op(
+          handle, dtypes.float32)
+      self.assertEqual(tensor.numpy(), expected_value)
+
+    # Check that handles returned from functions are on CPU and an op using
+    # the resource handle is correctly placed on the device backing the
+    # resource.
+    check_handle(res1, 2.0)
+    check_handle(res2, 3.0)
+
+    # Call with flipped inputs to make sure the same the function is
+    # reinstantiated and eager runtime does not mess up the device assignment
+    # for ops consuming handles returned from defuns.
+    r1, res1, r2, res2 = func(g1, c1)
+    self.assertEqual(r1.numpy(), 15.0)
+    self.assertRegexpMatches(r1.backing_device, 'CPU')
+    self.assertEqual(r2.numpy(), 14.0)
+    self.assertRegexpMatches(r2.backing_device, 'GPU')
+    check_handle(res1, 3.0)
+    check_handle(res2, 2.0)
+
+  def testComplexInputOutputDevicePattern(self):
+    """Tests input/output mapping logic in partitioning."""
+    if not context.context().num_gpus():
+      self.skipTest('No GPUs found.')
+
+    with ops.device('/device:CPU:0'):
+      rc0 = resource_variable_ops.ResourceVariable(2.0)
+      rc1 = resource_variable_ops.ResourceVariable(3.0)
+      cc0 = constant_op.constant(5.0)
+      cc1 = constant_op.constant(7.0)
+    with ops.device('/device:GPU:0'):
+      rg0 = resource_variable_ops.ResourceVariable(11.0)
+      rg1 = resource_variable_ops.ResourceVariable(13.0)
+      cg0 = constant_op.constant(17.0)
+      cg1 = constant_op.constant(19.0)
+
+    # Make sure tensors are on expected devices.
+    for tensor in [cc0, cc1]:
+      self.assertRegexpMatches(tensor.backing_device, 'CPU:0')
+    for tensor in [cg0, cg1]:
+      self.assertRegexpMatches(tensor.backing_device, 'GPU:0')
+
+    @function.defun
+    def func(rc0, cc0, cg0, rc1, cg1, rg0, rg1, cc1):
+      with ops.device('/device:CPU:0'):
+        m1 = rc0 * cg0
+      with ops.device('/device:GPU:0'):
+        m2 = rg0 * cc0
+
+      with ops.device('/device:CPU:0'):
+        r1 = 1000.0 * m2 + rc1 * cg1
+      with ops.device('/device:GPU:0'):
+        r2 = 1000.0 * m1 + rg1 * cc1
+
+      return r1, r2, m2, m1
+
+    r1, r2, m2, m1 = func(rc0, cc0, cg0, rc1, cg1, rg0, rg1, cc1)
+    self.assertRegexpMatches(m1.backing_device, 'CPU')
+    self.assertRegexpMatches(r1.backing_device, 'CPU')
+    self.assertRegexpMatches(m2.backing_device, 'GPU')
+    self.assertRegexpMatches(r2.backing_device, 'GPU')
+    self.assertEqual(m1.numpy(), 34.0)
+    self.assertEqual(r1.numpy(), 55000.0 + 3.0 * 19.0)
+    self.assertEqual(m2.numpy(), 55.0)
+    self.assertEqual(r2.numpy(), 34000.0 + 13.0 * 7.0)
+
+  def testArgumentPrunning(self):
+    """Tests functions taking unnecessary arguments."""
+    if not context.context().num_gpus():
+      self.skipTest('No GPUs found.')
+
+    with ops.device('/device:CPU:0'):
+      c1 = constant_op.constant(5.0)
+      c2 = constant_op.constant(7.0)
+
+    with ops.device('/device:GPU:0'):
+      g1 = constant_op.constant(11.0)
+      g2 = constant_op.constant(13.0)
+      g3 = constant_op.constant(17.0)
+
+    @function.defun
+    def func(g1, g2, c1, g3, c2):  # pylint: disable=unused-argument
+      # arguments g1 and g2 are unused and can be pruned by grappler.
+      return c1 * g3 * c2
+
+    result = func(g1, g2, c1, g3, c2)
+    self.assertEqual(result.numpy(), 5.0 * 7.0 * 17.0)
+
 
 if __name__ == '__main__':
   ops.enable_eager_execution(
diff --git a/tensorflow/python/eager/graph_only_ops.py b/tensorflow/python/eager/graph_only_ops.py
index 77a9e7db20b4ff24dd751fe81049fca64431f45f..a7374ab14afbb164918fa8de1eac1a862fd5d178 100644
--- a/tensorflow/python/eager/graph_only_ops.py
+++ b/tensorflow/python/eager/graph_only_ops.py
@@ -44,7 +44,6 @@ def graph_placeholder(dtype, shape, name=None):
   dtype_value = attr_value_pb2.AttrValue(type=dtype.as_datatype_enum)
   if isinstance(shape, (list, tuple)):
     shape = tensor_shape.TensorShape(shape)
-  assert isinstance(shape, tensor_shape.TensorShape)
   shape = attr_value_pb2.AttrValue(shape=shape.as_proto())
   g = ops.get_default_graph()
   with ops.name_scope(name, "placeholder", []) as name:
diff --git a/tensorflow/python/eager/lift_to_graph.py b/tensorflow/python/eager/lift_to_graph.py
index c231264047bedccbb11abf996ff9ac93f15964f9..2ed2d5882e2721c6650e1d978238100d98a59322 100644
--- a/tensorflow/python/eager/lift_to_graph.py
+++ b/tensorflow/python/eager/lift_to_graph.py
@@ -21,43 +21,247 @@ from __future__ import print_function
 
 import collections
 
+from tensorflow.python.framework import func_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import resource_variable_ops
 
 
 def _graph_inputs(op):
   return [x.op for x in op.inputs] + list(op.control_inputs)
 
 
-def lift_to_graph(init_tensor, graph, sources=None):
-  """Copies the tensor and all its inputs recursively to the outer graph."""
-  # Check that the initializer does not depend on any placeholders.
-  if sources is None:
-    sources = set([])
+def _as_operation(op_or_tensor):
+  if isinstance(op_or_tensor, ops.Tensor):
+    return op_or_tensor.op
+  return op_or_tensor
+
+
+class UnliftableError(Exception):
+  """Raised if a Tensor cannot be lifted from the graph."""
+  pass
+
+
+def _constant_inputs(op_or_tensor):
+  return all(_as_operation(i).type == u"Const"
+             and not _as_operation(i).control_inputs
+             for i in _graph_inputs(_as_operation(op_or_tensor)))
+
+
+def _path_from(from_op, tensor, sources):
+  """Find one path from `from_op` to `tensor`, ignoring `sources`.
+
+  Args:
+    from_op: A `tf.Operation`.
+    tensor: A `tf.Operation` or `tf.Tensor`.
+    sources: A list of `tf.Tensor`.
+
+  Returns:
+    A python string containing the path, or "??" if none is found.
+  """
   visited_ops = set([x.op for x in sources])
-  ops_to_visit = [init_tensor.op]
-  op_outputs = collections.defaultdict(set)
+  ops_to_visit = [_as_operation(tensor)]
+  some_op_output = {}
   while ops_to_visit:
     op = ops_to_visit.pop()
     if op in visited_ops:
       continue
     visited_ops.add(op)
-    # TODO(apassos) distinguish arg placeholders, capture placeholders,
-    # and placeholders the user might directly use to initialize
-    # variables.
-    if op.type == "Placeholder":
-      raise ValueError(
-          "Unable to lift tensor", init_tensor,
-          "because it depends transitively on placeholder ", op)
+    if op == from_op:
+      path_op = op
+      path = [path_op]
+      final_op = _as_operation(tensor)
+      while path_op != final_op:
+        path_op = some_op_output[path_op]
+        path.append(path_op)
+      return " <- ".join(["%s (%s)" % (x.name, x.type) for x in reversed(path)])
+    else:
+      for inp in _graph_inputs(op):
+        if inp not in visited_ops and inp not in sources:
+          some_op_output[inp] = op
+          ops_to_visit.append(inp)
+  return "??"
+
+
+def _map_subgraph(init_tensor, sources, disallowed_placeholders, visited_ops,
+                  op_outputs, add_sources):
+  """Walk a Graph and capture the subgraph between init_tensor and sources.
+
+  Note: This function mutates visited_ops and op_outputs.
+
+  Arguments:
+    init_tensor:  A Tensor or Operation where the subgraph terminates.
+    sources:  A set of Tensors where subgraph extraction should stop.
+    disallowed_placeholders: An optional set of ops which may not appear in the
+      lifted graph. Defaults to all placeholders.
+    visited_ops: A set of operations which were visited in a prior pass.
+    op_outputs: A defaultdict containing the outputs of an op which are to be
+      copied into the new subgraph.
+    add_sources: A boolean indicating whether placeholders which are not in
+      sources should be allowed.
+
+  Returns:
+    The set of placeholders upon which init_tensor depends and are not in
+    sources.
+
+  Raises:
+    UnliftableError: if init_tensor depends on a placeholder which is not in
+      sources and add_sources is False.
+  """
+  ops_to_visit = [_as_operation(init_tensor)]
+  extra_sources = set()
+  while ops_to_visit:
+    op = ops_to_visit.pop()
+    if op in visited_ops:
+      continue
+    visited_ops.add(op)
+
+    should_raise = False
+    if disallowed_placeholders is not None and op in disallowed_placeholders:
+      should_raise = True
+    elif op.type == "Placeholder":
+      if disallowed_placeholders is None and not add_sources:
+        should_raise = True
+      extra_sources.update(op.outputs)
+
+    if should_raise:
+      raise UnliftableError(
+          "Unable to lift tensor %s because it depends transitively on "
+          "placeholder %s via at least one path, e.g.: %s"
+          % (repr(init_tensor), repr(op), _path_from(op, init_tensor, sources)))
     for inp in _graph_inputs(op):
       op_outputs[inp].add(op)
-      if inp not in visited_ops and inp not in sources:
+      if inp not in visited_ops and inp not in (sources or extra_sources):
         ops_to_visit.append(inp)
+
+  return extra_sources
+
+
+def _copy_non_source(op, graph, op_map):
+  """Copy an op directly to a given graph.
+
+  This function assumes that all of the inputs to an op have already been
+  copied.
+
+  Args:
+    op: The op to be copied.
+    graph: The destination graph.
+    op_map: A dict mapping ops and tensors in the old graph to the new one.
+  """
+  copied_inputs = [op_map[x] for x in op.inputs]
+  copied_control_inputs = [op_map[x] for x in op.control_inputs]
+  with ops.control_dependencies(copied_control_inputs), ops.device(op.device):
+    copied_op = graph.create_op(
+        op_type=op.type,
+        inputs=copied_inputs,
+        dtypes=[x.dtype for x in op.outputs],
+        attrs=op.node_def.attr,
+        name=op.name)
+  op_map[op] = copied_op
+  for i, o in enumerate(op.outputs):
+    op_map[o] = copied_op.outputs[i]
+
+
+def _copy_source(s, graph, op_map, handle_captures, inverse_captures):
+  """Create a source in a graph based on a Tensor from a different graph.
+
+  This function creates a placeholder analog of `s` in a graph with the
+  following behavior:
+
+  1) If s is a captured Tensor or Variable and handle_captures is set to True,
+     simply capture it in the new graph as well.
+
+  2) If s is a PlaceholderWithDefault whose default is a constant, preserve
+     said default in the new graph.
+
+  3) When applicable, copy resource variable metadata from `s` to the newly
+     created placeholder.
+
+  Args:
+    s: The source of interest.
+    graph: The destination graph.
+    op_map: A dict mapping ops and tensors in the old graph to the new one.
+    handle_captures: A boolean indicating whether to re-capture s in the new
+      graph or simply create a vanilla placeholder.
+    inverse_captures: A dict mapping s back to the Tensor or Variable that it
+      captures.
+  """
+  if handle_captures and s in inverse_captures:
+    copied_placeholder = graph.capture(inverse_captures[s], name=s.op.name)
+  elif s.op.type == "PlaceholderWithDefault" and _constant_inputs(s):
+    # Copy the default value to the graph.
+    default_value = s.op.inputs[0]
+    _copy_non_source(op=default_value.op, graph=graph, op_map=op_map)
+
+    with ops.device(s.op.device):
+      copied_placeholder = array_ops.placeholder_with_default(
+          input=op_map[default_value], shape=s.shape, name=s.op.name)
+  else:
+    with ops.device(s.op.device):
+      copied_placeholder = array_ops.placeholder(
+          dtype=s.dtype, shape=s.shape, name=s.op.name)
+
+  base_handle = resource_variable_ops.get_resource_handle_data(s)
+  if base_handle.shape_and_type:
+    resource_variable_ops._set_handle_shapes_and_types(  # pylint: disable=protected-access
+        copied_placeholder,
+        base_handle,
+        graph_mode=True)
+
+  op_map[s] = copied_placeholder
+
+
+def lift_to_graph(init_tensors, graph, sources=None,
+                  disallowed_placeholders=None, add_sources=False,
+                  handle_captures=False, base_graph=None):
+  """Copies the tensor and all its inputs recursively to the outer graph.
+
+  Args:
+    init_tensors: The Tensor to lift.
+    graph: The graph to lift to.
+    sources: Optional sequence of nodes to start from. If omitted the whole
+      subgraph which feeds into `init_tensor` is lifted.
+    disallowed_placeholders: An optional set of ops which may not appear in the
+      lifted graph. Defaults to all placeholders.
+    add_sources: A boolean indicating whether placeholders which are not in
+      sources should be allowed.
+    handle_captures: A boolean indicating whether to re-capture s in the new
+      graph or simply create a vanilla placeholder.
+    base_graph: The graph from which to lift ops. This will be inferred if not
+      specified.
+
+  Returns:
+    A mapping from ops in the current default graph to ops in `graph`.
+
+  Raises:
+    UnliftableError: If a placeholder blocks lifting.
+  """
+  variable_init_tensors = {i for i in init_tensors if isinstance(
+      i, resource_variable_ops.ResourceVariable)}
+  init_tensors = set(init_tensors).difference(variable_init_tensors)
+  base_graph = base_graph or list(init_tensors)[0].graph
+
+  # Check that the initializer does not depend on any placeholders.
+  sources = set(sources or [])
+  visited_ops = set([x.op for x in sources])
+  op_outputs = collections.defaultdict(set)
+
+  # First we extract the subgraph between init_tensors and sources.
+  for init_tensor in init_tensors:
+    sources.update(_map_subgraph(
+        init_tensor=init_tensor,
+        sources=sources,
+        disallowed_placeholders=disallowed_placeholders,
+        visited_ops=visited_ops,
+        op_outputs=op_outputs,
+        add_sources=add_sources))
+
   # Topologically sort the nodes we've extracted. Now we know how many of their
   # outputs are part of this subgraph.
   ops_to_copy = []
   marked_ops = set([])
-  ops_to_visit = [init_tensor.op]
+  ops_to_visit = [_as_operation(t) for t in init_tensors
+                  if not op_outputs[_as_operation(t)]]
   while ops_to_visit:
     op = ops_to_visit.pop()
     if op in marked_ops:
@@ -65,24 +269,34 @@ def lift_to_graph(init_tensor, graph, sources=None):
     marked_ops.add(op)
     ops_to_copy.append(op)
     for inp in _graph_inputs(op):
-      if all(x in marked_ops for x in op_outputs[inp]) and inp not in sources:
+      if (all(x in marked_ops for x in op_outputs[inp]) and
+          inp not in sources):
         ops_to_visit.append(inp)
-  assert len(ops_to_copy) == len(visited_ops)
+
+  # When lifting from one FuncGraph to another, we will need to capture the
+  # relevant tensors as well.
+  captures = collections.OrderedDict()
+  if (isinstance(base_graph, func_graph.FuncGraph) and
+      isinstance(graph, func_graph.FuncGraph)):
+    captures = base_graph.captures
+  inverse_captures = {v: k for k, v in captures.items()}
+
   # ops_to_copy now holds a reverse topologically sorted list of ops which
   # ends in the initializer. We copy those to the outermost graph and
   # build the initialization op there.
   with graph.as_default():
-    op_map = {}
+    op_map = {i: i for i in variable_init_tensors}  # Pass through variables.
+    source_ops = set()
     for s in sources:
-      op_map[s] = array_ops.placeholder(dtype=s.dtype, shape=s.shape)
+      source_ops.add(s.op)
+      _copy_source(s=s, graph=graph, op_map=op_map,
+                   handle_captures=handle_captures,
+                   inverse_captures=inverse_captures)
+
     for op in reversed(ops_to_copy):
-      copied_inputs = [op_map[x] for x in op.inputs]
-      copied_control_inputs = [op_map[x] for x in op.control_inputs]
-      with ops.control_dependencies(copied_control_inputs):
-        copied_op = graph.create_op(
-            op.type, copied_inputs, [x.dtype for x in op.outputs],
-            attrs=op.node_def.attr)
-      op_map[op] = copied_op
-      for i, o in enumerate(op.outputs):
-        op_map[o] = copied_op.outputs[i]
+      if op in source_ops:
+        continue
+
+      _copy_non_source(op=op, graph=graph, op_map=op_map)
+
     return op_map
diff --git a/tensorflow/python/eager/memory_test.py b/tensorflow/python/eager/memory_test.py
index a1a59d511fdd4b831ea853b1f1cb3212322a3b84..9d29180379bd5bc48472f5c8638f01f667763111 100644
--- a/tensorflow/python/eager/memory_test.py
+++ b/tensorflow/python/eager/memory_test.py
@@ -24,6 +24,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import time
 import six
 
 from tensorflow.python import keras
@@ -32,6 +33,7 @@ from tensorflow.python.eager import context
 from tensorflow.python.eager import test
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.variables import Variable
 
 # memory_profiler might not be available in the OSS version of TensorFlow.
 try:
@@ -63,6 +65,11 @@ class MemoryTest(test.TestCase):
       # Warm up.
       f()
 
+      # Wait for background threads to start up and take over memory.
+      # FIXME: The nature of this test leaves few other options. Maybe there
+      # is a better way to do this.
+      time.sleep(4)
+
       initial = memory_profiler.memory_usage(-1)[0]
 
       for _ in six.moves.range(num_iters):
@@ -75,6 +82,16 @@ class MemoryTest(test.TestCase):
           "Maximum allowed increase: %f") % (initial, increase,
                                              increase_threshold_absolute_mb)
 
+  def testMemoryLeakAnonymousVariable(self):
+    if memory_profiler is None:
+      self.skipTest("memory_profiler required to run this test")
+
+    def f():
+      inputs = Variable(array_ops.zeros([32, 100], dtypes.float32))
+      del inputs
+
+    self.assertNotIncreasingMemory(f, num_iters=10000)
+
   def testMemoryLeakInSimpleModelForwardOnly(self):
     if memory_profiler is None:
       self.skipTest("memory_profiler required to run this test")
diff --git a/tensorflow/python/eager/ops_test.py b/tensorflow/python/eager/ops_test.py
index 17a090d5262f790c92dfa1a92d47f9b5ac6c07d9..ab4bdaa601d94bee077dd9567fef0415164eb821 100644
--- a/tensorflow/python/eager/ops_test.py
+++ b/tensorflow/python/eager/ops_test.py
@@ -18,6 +18,8 @@ from __future__ import division
 from __future__ import print_function
 
 import threading
+import weakref
+
 import numpy as np
 
 from tensorflow.core.protobuf import config_pb2
@@ -397,6 +399,32 @@ class OpsTest(test_util.TensorFlowTestCase):
     t1.start()
     t1.join()
 
+  def testWeakrefEagerTensor(self):
+    x = constant_op.constant([[1.]])
+    x.at1 = constant_op.constant([[2.]])
+    x.at2 = 3.
+    weak_x = weakref.ref(x)
+    weak_xat1 = weakref.ref(x.at1)
+    del x
+    self.assertIs(weak_x(), None)
+    self.assertIs(weak_xat1(), None)
+
+  def testWeakKeyDictionaryTensor(self):
+    weak_key_dict = weakref.WeakKeyDictionary()
+    strong_x = constant_op.constant([[1.]])
+    strong_y = constant_op.constant([[2.]])
+    weak_key_dict[strong_x] = constant_op.constant([[3.]])
+    weak_key_dict[strong_y] = constant_op.constant([[4.]])
+    strong_y.a = constant_op.constant([[5.]])
+    weak_x = weakref.ref(strong_x)
+    del strong_x
+    self.assertIs(weak_x(), None)
+    self.assertEqual([strong_y], list(weak_key_dict))
+    self.assertEqual(1, len(list(weak_key_dict)))
+    self.assertEqual(1, len(weak_key_dict))
+    del strong_y
+    self.assertEqual([], list(weak_key_dict))
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/eager/profiler.py b/tensorflow/python/eager/profiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..659c0cc6e69e25701b200d73020420a328a93d72
--- /dev/null
+++ b/tensorflow/python/eager/profiler.py
@@ -0,0 +1,133 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Profiler for eager mode."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import threading
+
+from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.eager import context
+from tensorflow.python.framework import c_api_util
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import tf_logging as logging
+
+LOGDIR_PLUGIN = 'plugins/profile'
+
+_profiler = None
+_profiler_lock = threading.Lock()
+_run_num = 0
+
+
+def start():
+  """Start profiling.
+
+  Only one active profiling session is allowed.
+
+  Raises:
+    AssertionError: If another profiling session is running.
+  """
+  global _profiler
+  if _profiler is not None:
+    raise AssertionError('Another profiler is running.')
+  with _profiler_lock:
+    profiler_context = pywrap_tensorflow.TFE_NewProfilerContext()
+    if context.default_execution_mode == context.EAGER_MODE:
+      pywrap_tensorflow.TFE_ProfilerContextSetEagerContext(
+          profiler_context,
+          context.context()._handle)  # pylint: disable=protected-access
+    _profiler = pywrap_tensorflow.TFE_NewProfiler(profiler_context)
+    pywrap_tensorflow.TFE_DeleteProfilerContext(profiler_context)
+    if not pywrap_tensorflow.TFE_ProfilerIsOk(_profiler):
+      logging.warning('Another profiler session is running which is probably '
+                      'created by profiler server. Please avoid using profiler '
+                      'server and profiler APIs at the same time.')
+
+
+def stop():
+  """Stop current profiling session and return its result.
+
+  Returns:
+    A binary string of tensorflow.tpu.Trace. User can write the string
+    to file for offline analysis by tensorboard.
+
+  Raises:
+    AssertionError: If there is no active profiling session.
+  """
+  global _profiler
+  global _run_num
+  if _profiler is None:
+    raise AssertionError('Cannot stop profiling. No profiler is running.')
+  with c_api_util.tf_buffer() as buffer_:
+    pywrap_tensorflow.TFE_ProfilerSerializeToString(
+        context.context()._handle,  # pylint: disable=protected-access
+        _profiler,
+        buffer_)
+    result = pywrap_tensorflow.TF_GetBuffer(buffer_)
+  with _profiler_lock:
+    pywrap_tensorflow.TFE_DeleteProfiler(_profiler)
+    _profiler = None
+    _run_num += 1
+  return result
+
+
+def start_profiler_server(port):
+  """Start a profiler grpc server that listens to given port.
+
+  The profiler server will keep the program running even the training finishes.
+  Please shutdown the server with CTRL-C. It can be used in both eager mode and
+  graph mode. The service defined in
+  tensorflow/contrib/tpu/profiler/tpu_profiler.proto. Please use
+  tensorflow/contrib/tpu/profiler/capture_tpu_profile to capture tracable
+  file following https://cloud.google.com/tpu/docs/cloud-tpu-tools#capture_trace
+
+  Args:
+    port: port profiler server listens to.
+  """
+  profiler_context = pywrap_tensorflow.TFE_NewProfilerContext()
+  if context.default_execution_mode == context.EAGER_MODE:
+    pywrap_tensorflow.TFE_ProfilerContextSetEagerContext(
+        profiler_context,
+        context.context()._handle)  # pylint: disable=protected-access
+  pywrap_tensorflow.TFE_StartProfilerServer(profiler_context, port)
+  pywrap_tensorflow.TFE_DeleteProfilerContext(profiler_context)
+
+
+class Profiler(object):
+  """Context-manager eager profiler api.
+
+  Example usage:
+  ```python
+  with Profiler("/path/to/logdir"):
+    # do some work
+  ```
+  """
+
+  def __init__(self, logdir):
+    self._logdir = logdir
+
+  def __enter__(self):
+    start()
+
+  def __exit__(self, typ, value, tb):
+    result = stop()
+    plugin_dir = os.path.join(self._logdir, LOGDIR_PLUGIN,
+                              'run{}'.format(_run_num))
+    gfile.MakeDirs(plugin_dir)
+    with gfile.Open(os.path.join(plugin_dir, 'local.trace'), 'wb') as f:
+      f.write(result)
diff --git a/tensorflow/python/eager/profiler_client.py b/tensorflow/python/eager/profiler_client.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f09d8b63419f4f837f74cd59fb1b3083b7d968b
--- /dev/null
+++ b/tensorflow/python/eager/profiler_client.py
@@ -0,0 +1,52 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Profiler client APIs."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.framework import errors
+
+
+def start_tracing(service_addr,
+                  logdir,
+                  duration_ms,
+                  worker_list='',
+                  include_dataset_ops=True,
+                  num_tracing_attempts=3):
+  """Sending grpc requests to profiler server to perform on-demand profiling.
+
+  Note: This method will block caller thread until receives tracing result.
+
+  Args:
+    service_addr: Address of profiler service e.g. localhost:6009.
+    logdir: Path of TensorBoard log directory e.g. /tmp/tb_log.
+    duration_ms: Duration of tracing or monitoring in ms.
+    worker_list: The list of worker TPUs that we are about to profile in the
+      current session. (TPU only)
+    include_dataset_ops: Set to false to profile longer traces.
+    num_tracing_attempts: Automatically retry N times when no trace event is
+      collected.
+
+  Raises:
+    UnavailableError: If no trace event is collected.
+  """
+  # TODO(fishx): Uses errors.raise_exception_on_not_ok_status instead.
+  if not pywrap_tensorflow.TFE_ProfilerClientStartTracing(
+      service_addr, logdir, worker_list, include_dataset_ops, duration_ms,
+      num_tracing_attempts):
+    raise errors.UnavailableError(None, None, 'No trace event is collected.')
diff --git a/tensorflow/python/eager/profiler_test.py b/tensorflow/python/eager/profiler_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba19e17e96ef9321ed8314bc4a96fab6ca2c1e02
--- /dev/null
+++ b/tensorflow/python/eager/profiler_test.py
@@ -0,0 +1,49 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for eager profiler."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.core.profiler import trace_events_pb2
+from tensorflow.python.eager import profiler
+from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
+
+
+class ProfilerTest(test_util.TensorFlowTestCase):
+
+  def test_profile(self):
+    profiler.start()
+    three = constant_op.constant(3)
+    five = constant_op.constant(5)
+    product = three * five
+    self.assertAllEqual(15, product)
+    with self.assertRaises(AssertionError):
+      profiler.start()
+
+    profile_result = profiler.stop()
+    profile_pb = trace_events_pb2.Trace()
+    profile_pb.ParseFromString(profile_result)
+    profile_pb_str = '%s' % profile_pb
+    self.assertTrue('Mul' in profile_pb_str)
+    with self.assertRaises(AssertionError):
+      profiler.stop()
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/eager/pywrap_tensor.cc b/tensorflow/python/eager/pywrap_tensor.cc
index 30a93fb0e421e0b26f517a03302d2e96913d8b9a..35040c5d5652c856f6b72062bb5d4d009c48aa7f 100644
--- a/tensorflow/python/eager/pywrap_tensor.cc
+++ b/tensorflow/python/eager/pywrap_tensor.cc
@@ -501,9 +501,7 @@ int EagerTensor_init(EagerTensor* self, PyObject* args, PyObject* kwds) {
 void EagerTensor_dealloc(EagerTensor* self) {
   // Clear weak references to self.
   // Needs to happen before any actual destruction.
-  if (self->weakreflist != nullptr) {
-    PyObject_ClearWeakRefs((PyObject*)self);
-  }
+  PyObject_ClearWeakRefs((PyObject*)self);
 
   TF_DeleteStatus(self->status);
   Py_DECREF(self->handle_data);
diff --git a/tensorflow/python/eager/pywrap_tfe.h b/tensorflow/python/eager/pywrap_tfe.h
index 8d6f212499f80513eeb2a20cee8b2e0d7be21e3f..1db1b23d4c94ad911a2ffbd475134615f370af22 100755
--- a/tensorflow/python/eager/pywrap_tfe.h
+++ b/tensorflow/python/eager/pywrap_tfe.h
@@ -231,7 +231,12 @@ PyObject* TFE_Py_TensorShapeSlice(PyObject* tensors, int slice_dim);
 PyObject* TFE_Py_TensorShapeOnDevice(PyObject* tensor);
 
 // Encodes the object as a tuple that is meant to be used as part of the key
-// for the defun function cache.
-PyObject* TFE_Py_EncodeArg(PyObject*);
+// for the defun function cache.  If `include_tensor_ranks_only` is true,
+// then the encoding only stores tensor ranks, and the key is
+// agnostic to dimension sizes.  Otherwise, full tensor shape encodings are
+// returned.
+PyObject* TFE_Py_EncodeArg(PyObject*, bool include_tensor_ranks_only);
+
+void TFE_Py_EnableInteractivePythonLogging();
 
 #endif  // TENSORFLOW_PYTHON_EAGER_PYWRAP_TFE_H_
diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index 9ce500bc08e478815f2dbe1d5d5353eefa4f17a8..3286e1add81d96172b249bae57beeaeea3399e28 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -16,7 +16,6 @@ limitations under the License.
 #include <cstring>
 #include <thread>
 
-#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/python/eager/pywrap_tfe.h"
 
 #include "absl/strings/str_cat.h"
@@ -25,6 +24,7 @@ limitations under the License.
 #include "tensorflow/c/c_api_internal.h"
 #include "tensorflow/c/eager/c_api_internal.h"
 #include "tensorflow/c/eager/tape.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/gtl/compactptrset.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
@@ -264,7 +264,8 @@ bool ParseTypeValue(const string& key, PyObject* py_value, TF_Status* status,
 }
 
 bool SetOpAttrList(
-    TFE_Op* op, const char* key, PyObject* py_list, TF_AttrType type,
+    TFE_Context* ctx, TFE_Op* op, const char* key, PyObject* py_list,
+    TF_AttrType type,
     tensorflow::gtl::FlatMap<string, tensorflow::int64>* attr_list_sizes,
     TF_Status* status) {
   if (!PySequence_Check(py_list)) {
@@ -369,6 +370,40 @@ bool SetOpAttrList(
     TFE_OpSetAttrShapeList(op, key, dims.get(), num_dims.get(), num_values,
                            status);
     if (TF_GetCode(status) != TF_OK) return false;
+  } else if (type == TF_ATTR_FUNC) {
+    std::unique_ptr<const TFE_Op*[]> funcs(new const TFE_Op*[num_values]);
+    for (int i = 0; i < num_values; ++i) {
+      tensorflow::Safe_PyObjectPtr py_value(PySequence_ITEM(py_list, i));
+      // Allow:
+      // (1) String function name, OR
+      // (2) A Python object with a .name attribute
+      //     (A crude test for being a
+      //     tensorflow.python.framework.function._DefinedFunction)
+      //     (which is what the various "defun" or "Defun" decorators do).
+      // And in the future also allow an object that can encapsulate
+      // the function name and its attribute values.
+      tensorflow::StringPiece func_name;
+      if (!ParseStringValue(key, py_value.get(), status, &func_name)) {
+        PyObject* name_attr = PyObject_GetAttrString(py_value.get(), "name");
+        if (name_attr == nullptr ||
+            !ParseStringValue(key, name_attr, status, &func_name)) {
+          TF_SetStatus(
+              status, TF_INVALID_ARGUMENT,
+              tensorflow::strings::StrCat(
+                  "unable to set function value attribute from a ",
+                  py_value.get()->ob_type->tp_name,
+                  " object. If you think this is an error, please file an "
+                  "issue at "
+                  "https://github.com/tensorflow/tensorflow/issues/new")
+                  .c_str());
+          return false;
+        }
+      }
+      funcs[i] = TFE_NewOp(ctx, func_name.data(), status);
+      if (TF_GetCode(status) != TF_OK) return false;
+    }
+    TFE_OpSetAttrFunctionList(op, key, funcs.get(), num_values);
+    if (TF_GetCode(status) != TF_OK) return false;
   } else {
     TF_SetStatus(status, TF_UNIMPLEMENTED,
                  tensorflow::strings::StrCat("Attr ", key,
@@ -619,7 +654,8 @@ void SetOpAttrs(TFE_Context* ctx, TFE_Op* op, PyObject* attrs, int start_index,
     const TF_AttrType type = TFE_OpGetAttrType(op, key, &is_list, out_status);
     if (TF_GetCode(out_status) != TF_OK) return;
     if (is_list != 0) {
-      if (!SetOpAttrList(op, key, py_value, type, nullptr, out_status)) return;
+      if (!SetOpAttrList(ctx, op, key, py_value, type, nullptr, out_status))
+        return;
     } else {
       if (!SetOpAttrScalar(ctx, op, key, py_value, type, nullptr, out_status))
         return;
@@ -649,7 +685,8 @@ void SetOpAttrWithDefaults(
     }
   } else {
     if (is_list != 0) {
-      SetOpAttrList(op, attr_name, attr_value, type, attr_list_sizes, status);
+      SetOpAttrList(ctx, op, attr_name, attr_value, type, attr_list_sizes,
+                    status);
     } else {
       SetOpAttrScalar(ctx, op, attr_name, attr_value, type, attr_list_sizes,
                       status);
@@ -835,15 +872,15 @@ int MaybeRaiseExceptionFromStatus(const tensorflow::Status& status,
 }
 
 const char* TFE_GetPythonString(PyObject* o) {
+#if PY_MAJOR_VERSION >= 3
   if (PyBytes_Check(o)) {
     return PyBytes_AsString(o);
-  }
-#if PY_MAJOR_VERSION >= 3
-  if (PyUnicode_Check(o)) {
+  } else {
     return PyUnicode_AsUTF8(o);
   }
+#else
+  return PyBytes_AsString(o);
 #endif
-  return nullptr;
 }
 
 int64_t get_uid() {
@@ -1011,8 +1048,18 @@ class PyVSpace : public tensorflow::eager::VSpace<PyObject, PyBackwardFunction,
   void MarkAsResult(PyObject* gradient) const final { Py_INCREF(gradient); }
 
   PyObject* Zeros(const PyTapeTensor& tensor) const final {
+    if (PyErr_Occurred()) {
+      return nullptr;
+    }
     PyObject* py_shape = tensor.GetShape();
+    if (PyErr_Occurred()) {
+      return nullptr;
+    }
     PyObject* py_dtype = tensor.GetDType();
+    if (PyErr_Occurred()) {
+      Py_DECREF(py_shape);
+      return nullptr;
+    }
     PyObject* arg_list = Py_BuildValue("OO", py_shape, py_dtype);
     PyObject* result = PyEval_CallObject(zeros_fn_, arg_list);
     Py_DECREF(arg_list);
@@ -1022,6 +1069,9 @@ class PyVSpace : public tensorflow::eager::VSpace<PyObject, PyBackwardFunction,
   }
 
   PyObject* Ones(const PyTapeTensor& tensor) const final {
+    if (PyErr_Occurred()) {
+      return nullptr;
+    }
     PyObject* py_shape = tensor.GetShape();
     PyObject* py_dtype = tensor.GetDType();
     PyObject* arg_list = Py_BuildValue("OO", py_shape, py_dtype);
@@ -2086,6 +2136,9 @@ PyObject* RecordGradient(PyObject* op_name, PyObject* inputs, PyObject* attrs,
         PyBackwardFunction* function =
             new PyBackwardFunction([op_name, attrs, num_inputs, op_inputs,
                                     op_outputs](PyObject* output_grads) {
+              if (PyErr_Occurred()) {
+                return static_cast<PyObject*>(nullptr);
+              }
               tensorflow::Safe_PyObjectPtr callback_args(
                   Py_BuildValue("OOOOOO", op_name, attrs, num_inputs, op_inputs,
                                 op_outputs, output_grads));
@@ -2411,14 +2464,14 @@ bool RaiseIfNotPySequence(PyObject* seq, const string& attr_name) {
 
 bool RunCallbacks(
     const FastPathOpExecInfo& op_exec_info, PyObject* args,
-    const std::vector<tensorflow::Safe_PyObjectPtr>& flattened_inputs,
-    const std::vector<tensorflow::Safe_PyObjectPtr>& flattened_attrs,
+    const std::vector<tensorflow::Safe_PyObjectPtr>* const flattened_inputs,
+    const std::vector<tensorflow::Safe_PyObjectPtr>* const flattened_attrs,
     PyObject* flattened_result) {
   if (!op_exec_info.run_callbacks) return true;
 
-  tensorflow::Safe_PyObjectPtr inputs(PyTuple_New(flattened_inputs.size()));
-  for (int i = 0; i < flattened_inputs.size(); i++) {
-    PyObject* input = flattened_inputs[i].get();
+  tensorflow::Safe_PyObjectPtr inputs(PyTuple_New(flattened_inputs->size()));
+  for (int i = 0; i < flattened_inputs->size(); i++) {
+    PyObject* input = (*flattened_inputs)[i].get();
     Py_INCREF(input);
     PyTuple_SET_ITEM(inputs.get(), i, input);
   }
@@ -2426,7 +2479,7 @@ bool RunCallbacks(
   int num_non_inferred_attrs = PyTuple_GET_SIZE(args) -
                                op_exec_info.op_def->input_arg_size() -
                                kFastPathExecuteInputStartIndex;
-  int num_attrs = flattened_attrs.size() + num_non_inferred_attrs;
+  int num_attrs = flattened_attrs->size() + num_non_inferred_attrs;
   tensorflow::Safe_PyObjectPtr attrs(PyTuple_New(num_attrs));
 
   for (int i = 0; i < num_non_inferred_attrs; i++) {
@@ -2438,7 +2491,7 @@ bool RunCallbacks(
   }
   for (int i = num_non_inferred_attrs; i < num_attrs; i++) {
     PyObject* attr_or_name =
-        flattened_attrs.at(i - num_non_inferred_attrs).get();
+        flattened_attrs->at(i - num_non_inferred_attrs).get();
     Py_INCREF(attr_or_name);
     PyTuple_SET_ITEM(attrs.get(), i, attr_or_name);
   }
@@ -2676,9 +2729,10 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
       for (Py_ssize_t j = 0; j < len; j++) {
         PyObject* py_input = PySequence_Fast_GET_ITEM(input, j);
         tensorflow::Safe_PyObjectPtr py_eager_tensor;
-        if (!ConvertToTensor(op_exec_info, py_input, &py_eager_tensor,
-                             []() { Py_RETURN_NONE; },
-                             [](const TF_DataType& dtype) {}, status)) {
+        if (!ConvertToTensor(
+                op_exec_info, py_input, &py_eager_tensor,
+                []() { Py_RETURN_NONE; }, [](const TF_DataType& dtype) {},
+                status)) {
           return nullptr;
         }
 
@@ -2757,8 +2811,8 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
     PyList_SET_ITEM(flat_result.get(), i, EagerTensorFromHandle(retvals[i]));
   }
 
-  if (!RunCallbacks(op_exec_info, args, *flattened_inputs, *flattened_attrs,
-                    flat_result.get())) {
+  if (!RunCallbacks(op_exec_info, args, flattened_inputs.get(),
+                    flattened_attrs.get(), flat_result.get())) {
     return nullptr;
   }
 
@@ -2823,10 +2877,13 @@ namespace {
 const char kTensor[] = "T";
 const char kIndexedSlices[] = "I";
 const char kList[] = "L";
+const char kListEnd[] = "l";
 const char kTuple[] = "U";
+const char kTupleEnd[] = "u";
 const char kDict[] = "D";
 const char kRaw[] = "R";
 const char kShape[] = "s";
+const char kShapeDelim[] = "-";
 const char kDType[] = "d";
 const char kNone[] = "n";
 
@@ -2856,7 +2913,9 @@ struct EncodeResult {
   }
 };
 
-tensorflow::Status TFE_Py_EncodeTensor(PyObject* arg, EncodeResult* result) {
+tensorflow::Status TFE_Py_EncodeTensor(PyObject* arg,
+                                       bool include_tensor_ranks_only,
+                                       EncodeResult* result) {
   if (EagerTensor_CheckExact(arg)) {
     TFE_TensorHandle* t = EagerTensor_Handle(arg);
     tensorflow::TensorShape tensor_shape;
@@ -2865,10 +2924,13 @@ tensorflow::Status TFE_Py_EncodeTensor(PyObject* arg, EncodeResult* result) {
     absl::StrAppend(&result->str, kDType, t->handle->dtype);
 
     absl::StrAppend(&result->str, kShape);
-    for (tensorflow::int64 dim_size : tensor_shape.dim_sizes()) {
-      absl::StrAppend(&result->str, dim_size);
+    if (include_tensor_ranks_only) {
+      absl::StrAppend(&result->str, tensor_shape.dim_sizes().size());
+    } else {
+      for (tensorflow::int64 dim_size : tensor_shape.dim_sizes()) {
+        absl::StrAppend(&result->str, dim_size, kShapeDelim);
+      }
     }
-
     return tensorflow::Status::OK();
   }
 
@@ -2892,6 +2954,7 @@ tensorflow::Status TFE_Py_EncodeTensor(PyObject* arg, EncodeResult* result) {
       static_cast<tensorflow::DataType>(MakeInt(dtype_enum.get()));
 
   absl::StrAppend(&result->str, kDType, dtype);
+
   static char _shape_tuple[] = "_shape_tuple";
   tensorflow::Safe_PyObjectPtr shape_tuple(
       PyObject_CallMethod(arg, _shape_tuple, nullptr));
@@ -2912,22 +2975,30 @@ tensorflow::Status TFE_Py_EncodeTensor(PyObject* arg, EncodeResult* result) {
       shape_tuple.get(), "shape_tuple didn't return a sequence"));
 
   int len = PySequence_Fast_GET_SIZE(shape_seq.get());
-  for (int i = 0; i < len; ++i) {
-    PyObject* item = PySequence_Fast_GET_ITEM(shape_seq.get(), i);
-    if (item == Py_None) {
-      absl::StrAppend(&result->str, kNone);
-    } else {
-      absl::StrAppend(&result->str, MakeInt(item));
+
+  if (include_tensor_ranks_only) {
+    absl::StrAppend(&result->str, len);
+  } else {
+    for (int i = 0; i < len; ++i) {
+      PyObject* item = PySequence_Fast_GET_ITEM(shape_seq.get(), i);
+      if (item == Py_None) {
+        absl::StrAppend(&result->str, kNone);
+      } else {
+        absl::StrAppend(&result->str, MakeInt(item));
+      }
     }
   }
-
   return tensorflow::Status::OK();
 }
 
-tensorflow::Status TFE_Py_EncodeArgHelper(PyObject* arg, EncodeResult* result);
+tensorflow::Status TFE_Py_EncodeArgHelper(PyObject* arg,
+                                          bool include_tensor_ranks_only,
+                                          EncodeResult* result);
 
 // This function doesn't set the type of sequence before
 tensorflow::Status TFE_Py_EncodeSequence(PyObject* arg, const char* type,
+                                         const char* end_type,
+                                         bool include_tensor_ranks_only,
                                          EncodeResult* result) {
   tensorflow::Safe_PyObjectPtr arg_seq(
       PySequence_Fast(arg, "unable to create seq from list/tuple"));
@@ -2939,17 +3010,22 @@ tensorflow::Status TFE_Py_EncodeSequence(PyObject* arg, const char* type,
     if (item == Py_None) {
       absl::StrAppend(&result->str, kNone);
     } else {
-      TF_RETURN_IF_ERROR(TFE_Py_EncodeArgHelper(item, result));
+      TF_RETURN_IF_ERROR(
+          TFE_Py_EncodeArgHelper(item, include_tensor_ranks_only, result));
     }
   }
+  absl::StrAppend(&result->str, end_type);
 
   return tensorflow::Status::OK();
 }
 
-tensorflow::Status TFE_Py_EncodeArgHelper(PyObject* arg, EncodeResult* result) {
+tensorflow::Status TFE_Py_EncodeArgHelper(PyObject* arg,
+                                          bool include_tensor_ranks_only,
+                                          EncodeResult* result) {
   if (tensorflow::swig::IsTensor(arg)) {
     absl::StrAppend(&result->str, kTensor);
-    TF_RETURN_IF_ERROR(TFE_Py_EncodeTensor(arg, result));
+    TF_RETURN_IF_ERROR(
+        TFE_Py_EncodeTensor(arg, include_tensor_ranks_only, result));
   } else if (tensorflow::swig::IsIndexedSlices(arg)) {
     absl::StrAppend(&result->str, kIndexedSlices);
     tensorflow::Safe_PyObjectPtr values(PyObject_GetAttrString(arg, "values"));
@@ -2958,7 +3034,8 @@ tensorflow::Status TFE_Py_EncodeArgHelper(PyObject* arg, EncodeResult* result) {
       return tensorflow::errors::InvalidArgument(
           "IndexedSlices does not have a values attr");
     }
-    TF_RETURN_IF_ERROR(TFE_Py_EncodeTensor(values.get(), result));
+    TF_RETURN_IF_ERROR(
+        TFE_Py_EncodeTensor(values.get(), include_tensor_ranks_only, result));
 
     tensorflow::Safe_PyObjectPtr indices(
         PyObject_GetAttrString(arg, "indices"));
@@ -2967,7 +3044,8 @@ tensorflow::Status TFE_Py_EncodeArgHelper(PyObject* arg, EncodeResult* result) {
       return tensorflow::errors::InvalidArgument(
           "IndexedSlices does not have a indices attr");
     }
-    TF_RETURN_IF_ERROR(TFE_Py_EncodeTensor(indices.get(), result));
+    TF_RETURN_IF_ERROR(
+        TFE_Py_EncodeTensor(indices.get(), include_tensor_ranks_only, result));
 
     tensorflow::Safe_PyObjectPtr dense_shape(
         PyObject_GetAttrString(arg, "dense_shape"));
@@ -2977,12 +3055,15 @@ tensorflow::Status TFE_Py_EncodeArgHelper(PyObject* arg, EncodeResult* result) {
           "IndexedSlices does not have a dense_shape attr");
     }
     if (dense_shape.get() != Py_None) {
-      TF_RETURN_IF_ERROR(TFE_Py_EncodeTensor(dense_shape.get(), result));
+      TF_RETURN_IF_ERROR(TFE_Py_EncodeTensor(
+          dense_shape.get(), include_tensor_ranks_only, result));
     }
   } else if (PyList_Check(arg)) {
-    TF_RETURN_IF_ERROR(TFE_Py_EncodeSequence(arg, kList, result));
+    TF_RETURN_IF_ERROR(TFE_Py_EncodeSequence(
+        arg, kList, kListEnd, include_tensor_ranks_only, result));
   } else if (PyTuple_Check(arg)) {
-    TF_RETURN_IF_ERROR(TFE_Py_EncodeSequence(arg, kTuple, result));
+    TF_RETURN_IF_ERROR(TFE_Py_EncodeSequence(
+        arg, kTuple, kTupleEnd, include_tensor_ranks_only, result));
   } else if (PyDict_Check(arg)) {
     tensorflow::Safe_PyObjectPtr keys(PyDict_Keys(arg));
     if (PyList_Sort(keys.get()) == -1) {
@@ -2994,9 +3075,11 @@ tensorflow::Status TFE_Py_EncodeArgHelper(PyObject* arg, EncodeResult* result) {
 
     for (int i = 0; i < len; i++) {
       PyObject* key = PyList_GetItem(keys.get(), i);
-      TF_RETURN_IF_ERROR(TFE_Py_EncodeArgHelper(key, result));
+      TF_RETURN_IF_ERROR(
+          TFE_Py_EncodeArgHelper(key, include_tensor_ranks_only, result));
       PyObject* value = PyDict_GetItem(arg, key);
-      TF_RETURN_IF_ERROR(TFE_Py_EncodeArgHelper(value, result));
+      TF_RETURN_IF_ERROR(
+          TFE_Py_EncodeArgHelper(value, include_tensor_ranks_only, result));
     }
   } else {
     PyObject* object = PyWeakref_NewRef(arg, nullptr);
@@ -3023,13 +3106,51 @@ tensorflow::Status TFE_Py_EncodeArgHelper(PyObject* arg, EncodeResult* result) {
 // on known shapes to produce slimmer graphs, and correctness, as some
 // high-level APIs require shapes to be fully-known.
 //
+// `include_tensor_ranks_only` allows caching on arguments excluding shape info,
+// so that a slow path using relaxed shape can rely on a cache key that excludes
+// shapes.
+//
 // TODO(nareshmodi): Add support for sparse tensors.
-PyObject* TFE_Py_EncodeArg(PyObject* arg) {
+PyObject* TFE_Py_EncodeArg(PyObject* arg, bool include_tensor_ranks_only) {
   EncodeResult result;
-  const auto status = TFE_Py_EncodeArgHelper(arg, &result);
+  const auto status =
+      TFE_Py_EncodeArgHelper(arg, include_tensor_ranks_only, &result);
   if (MaybeRaiseExceptionFromStatus(status, nullptr)) {
     return nullptr;
   }
 
   return result.ToPyTuple();
 }
+
+// A method prints incoming messages directly to Python's
+// stdout using Python's C API. This is necessary in Jupyter notebooks
+// and colabs where messages to the C stdout don't go to the notebook
+// cell outputs, but calls to Python's stdout do.
+void PrintToPythonStdout(const char* msg) {
+  if (Py_IsInitialized()) {
+    PyGILState_STATE py_threadstate;
+    py_threadstate = PyGILState_Ensure();
+
+    string string_msg = msg;
+    // PySys_WriteStdout truncates strings over 1000 bytes, so
+    // we write the message in chunks small enough to not be truncated.
+    int CHUNK_SIZE = 900;
+    auto len = string_msg.length();
+    for (int i = 0; i < len; i += CHUNK_SIZE) {
+      PySys_WriteStdout("%s", string_msg.substr(i, CHUNK_SIZE).c_str());
+    }
+    PySys_WriteStdout("\n");
+
+    PyGILState_Release(py_threadstate);
+  }
+}
+
+// Register PrintToPythonStdout as a log listener, to allow
+// printing in colabs and jupyter notebooks to work.
+void TFE_Py_EnableInteractivePythonLogging() {
+  static bool enabled_interactive_logging = false;
+  if (!enabled_interactive_logging) {
+    enabled_interactive_logging = true;
+    TF_RegisterLogListener(PrintToPythonStdout);
+  }
+}
diff --git a/tensorflow/python/eager/pywrap_tfe_test.py b/tensorflow/python/eager/pywrap_tfe_test.py
index 669fa084888a52da1601984fa11791f84add6170..445ffbc532d23bfe5fdd0aa5c31e941ee6eca527 100644
--- a/tensorflow/python/eager/pywrap_tfe_test.py
+++ b/tensorflow/python/eager/pywrap_tfe_test.py
@@ -22,6 +22,7 @@ from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.eager import core
+from tensorflow.python.eager import def_function
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -254,6 +255,21 @@ class Tests(test.TestCase):
         "Value for attr 'num_split' of -1 must be at least minimum 1"):
       array_ops.split(value=[1, 2, 3], num_or_size_splits=-1)
 
+    with self.assertRaisesRegexp(
+        Exception,
+        "Value for attr 'num_split' of 0 must be at least minimum 1"):
+      array_ops.split(value=[1, 2, 3], num_or_size_splits=0)
+
+  def testIsFunction(self):
+    ctx = context.context()
+    self.assertFalse(ctx.has_function("not_a_function"))
+
+    @def_function.function
+    def f():
+      return 1.
+
+    self.assertTrue(ctx.has_function(f.get_concrete_function().name))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/eager/python/remote.py b/tensorflow/python/eager/remote.py
similarity index 96%
rename from tensorflow/contrib/eager/python/remote.py
rename to tensorflow/python/eager/remote.py
index b74cf394f682b64327bc570ef8dbe79f5657902c..fdea95fa8038c7ce63257d5651f1ccd6fc3de3bd 100644
--- a/tensorflow/contrib/eager/python/remote.py
+++ b/tensorflow/python/eager/remote.py
@@ -23,8 +23,10 @@ import os
 from tensorflow.core.protobuf.cluster_pb2 import ClusterDef
 from tensorflow.core.protobuf.tensorflow_server_pb2 import ServerDef
 from tensorflow.python.eager import context
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("config.experimental_connect_to_host")
 def connect_to_remote_host(remote_host=None, job_name="worker"):
   """Connects to a single machine to enable remote execution on it.
 
diff --git a/tensorflow/python/eager/tape.py b/tensorflow/python/eager/tape.py
index e501b403a39144a673e8ac5155edf0498425bcd6..e5d6007b4892a739ed12e072738208880736ff23 100644
--- a/tensorflow/python/eager/tape.py
+++ b/tensorflow/python/eager/tape.py
@@ -27,8 +27,8 @@ from tensorflow.python.util.lazy_loader import LazyLoader
 # distribution_strategy_context.
 # TODO(b/117329403): Remove this circular dependency.
 distribution_strategy_context = LazyLoader(
-    "distribute_lib", globals(),
-    "tensorflow.python.training."
+    "distribution_strategy_context", globals(),
+    "tensorflow.python.distribute."
     "distribution_strategy_context")
 
 
@@ -61,8 +61,9 @@ def watch(tape, tensor):
 
 def watch_variable(tape, variable):
   """Marks this variable to be watched by the given tape."""
-  strategy = distribution_strategy_context.get_distribution_strategy()
-  if distribution_strategy_context.get_replica_context():
+  strategy, context = (
+      distribution_strategy_context.get_strategy_and_replica_context())
+  if context:
     variables = [strategy.extended.value_container(variable)]
   else:
     variables = strategy.unwrap(variable)
@@ -76,8 +77,9 @@ def variable_accessed(variable):
   Args:
     variable: variable to be watched.
   """
-  strategy = distribution_strategy_context.get_distribution_strategy()
-  if distribution_strategy_context.get_replica_context():
+  strategy, context = (
+      distribution_strategy_context.get_strategy_and_replica_context())
+  if context:
     variables = [strategy.extended.value_container(variable)]
   else:
     variables = strategy.unwrap(variable)
@@ -85,6 +87,29 @@ def variable_accessed(variable):
     pywrap_tensorflow.TFE_Py_TapeVariableAccessed(var)
 
 
+def variables_accessed(variables):
+  """Notifies all tapes in the stack that variables have been accessed.
+
+  Only trainable variables are marked as accessed.
+
+  Args:
+    variables: iterable of variables to mark as accessed.
+  """
+  strategy, context = (
+      distribution_strategy_context.get_strategy_and_replica_context())
+  accessed = []
+  if context:
+    accessed = [strategy.extended.value_container(variable)
+                for variable in variables if variable.trainable]
+  else:
+    for variable in variables:
+      if variable.trainable:
+        accessed.extend(strategy.unwrap(variable))
+
+  for var in accessed:
+    pywrap_tensorflow.TFE_Py_TapeVariableAccessed(var)
+
+
 def pop_tape(tape):
   """Pops the top tape in the stack, if any."""
   pywrap_tensorflow.TFE_Py_TapeSetRemove(tape._tape)  # pylint: disable=protected-access
diff --git a/tensorflow/python/eager/tensor_test.py b/tensorflow/python/eager/tensor_test.py
index 0ee2ff68c209aa13aaeb32be610302c11616b9d7..0d8845bd96f9dad6d9e110e5a9fd617647d18f64 100644
--- a/tensorflow/python/eager/tensor_test.py
+++ b/tensorflow/python/eager/tensor_test.py
@@ -339,6 +339,24 @@ class TFETensorTest(test_util.TensorFlowTestCase):
   def testConvertToTensorAllowsOverflow(self):
     _ = ops.convert_to_tensor(123456789, dtype=dtypes.uint8)
 
+  @test_util.assert_no_new_pyobjects_executing_eagerly
+  @test_util.run_in_graph_and_eager_modes
+  def testConvertToTensorNumpyZeroDim(self):
+    for np_type, dtype in [(np.int32, dtypes.int32),
+                           (np.half, dtypes.half),
+                           (np.float32, dtypes.float32)]:
+      x = ops.convert_to_tensor([np.array(65, dtype=np_type),
+                                 np.array(16, dtype=np_type)])
+      self.assertEqual(x.dtype, dtype)
+      self.assertAllEqual(x, [65, 16])
+
+  @test_util.assert_no_new_pyobjects_executing_eagerly
+  @test_util.run_in_graph_and_eager_modes
+  def testConvertToTensorNumpyScalar(self):
+    x = ops.convert_to_tensor([np.asscalar(np.array(321, dtype=np.int)),
+                               np.asscalar(np.array(16, dtype=np.int))])
+    self.assertAllEqual(x, [321, 16])
+
   def testEagerTensorError(self):
     with self.assertRaisesRegexp(
         TypeError,
@@ -347,7 +365,6 @@ class TFETensorTest(test_util.TensorFlowTestCase):
       _ = ops.convert_to_tensor(1., dtype=dtypes.int32)
 
 
-
 class TFETensorUtilTest(test_util.TensorFlowTestCase):
 
   def testListOfThree(self):
diff --git a/tensorflow/python/eager/wrap_function.py b/tensorflow/python/eager/wrap_function.py
index 2b39e99a4ea5d145f9bb8cef5c5931c306bcaeea..8eb02007bb0b82d5161819721d436f2994b61ded 100644
--- a/tensorflow/python/eager/wrap_function.py
+++ b/tensorflow/python/eager/wrap_function.py
@@ -19,11 +19,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import weakref
+
+from tensorflow.python.eager import def_function
 from tensorflow.python.eager import function
 from tensorflow.python.eager import lift_to_graph
 from tensorflow.python.framework import func_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
@@ -37,8 +41,20 @@ class VariableHolder(object):
     self._variables = []
 
   def variable_creator_scope(self, next_creator, **kwargs):
+    """Creates variables & adds them to collections to match legacy code."""
     v = next_creator(**kwargs)
     self._variables.append(v)
+
+    collections = kwargs.get("collections")
+    trainable = v.trainable
+
+    if collections is None:
+      collections = [ops.GraphKeys.GLOBAL_VARIABLES]
+    if trainable and ops.GraphKeys.TRAINABLE_VARIABLES not in collections:
+      collections = list(collections) + [ops.GraphKeys.TRAINABLE_VARIABLES]
+
+    ops.add_to_collections(collections, v)
+
     return v
 
   def __call__(self, *args, **kwargs):
@@ -46,20 +62,85 @@ class VariableHolder(object):
       return self._fn(*args, **kwargs)
 
 
-# TODO(allenl): make this checkpointable
-class WrappedFunction(function.Function):
+# TODO(allenl): make this trackable
+class WrappedFunction(function.ConcreteFunction):
   """Wraps a tf V1 piece of code in a function."""
 
   def __init__(self, fn_graph, variable_holder, attrs=None, signature=None):
     super(WrappedFunction, self).__init__(
         fn_graph, attrs=attrs, signature=signature)
     self._variable_holder = variable_holder
+    if ops.executing_eagerly_outside_functions():
+      # TODO(allenl): Make this work in 1.x?
+      self._lift_unlifted_variables()
+
+  def _lift_unlifted_variables(self):
+    """Finds resource variables and lifts them into the outer context.
+
+    When we import a GraphDef inside a wrap_function, no Python graph building
+    code runs. This means we get VarHandleOps which create variable resources,
+    but no corresponding Python objects. Leaving them like this works but gives
+    the user no way to interact with or modify the variables outside the graph.
+
+    This method searches for variables and lifts them out as regular variable
+    objects when possible, indicating to the FuncGraph that they are captures.
+    """
+    with self.graph.as_default():
+      collection_variables = (
+          ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+          + ops.get_collection(ops.GraphKeys.LOCAL_VARIABLES))
+      existing_captures = set(self.graph.internal_captures)
+      lifted_variables = {}
+      for old_variable in collection_variables:
+        if (old_variable._in_graph_mode  # pylint: disable=protected-access
+            and isinstance(old_variable,
+                           resource_variable_ops.ResourceVariable)):
+          if old_variable.handle in existing_captures:
+            continue
+          new_variable = def_function.UnliftedInitializerVariable(
+              array_ops.placeholder(
+                  name="unused_{}_initializer".format(old_variable.op.name),
+                  shape=old_variable.shape,
+                  dtype=old_variable.dtype),
+              name=old_variable.op.name,
+              trainable=old_variable.trainable)
+          self.graph.captures[new_variable.handle] = old_variable.handle
+          existing_captures.add(old_variable.handle)
+          lifted_variables[old_variable] = new_variable
+          # pylint: disable=protected-access
+          self._variable_holder._variables.append(new_variable)
+          self.graph._weak_variables.append(weakref.ref(new_variable))
+          # pylint: enable=protected-access
+      # Update the graph's collections, partly for the user and partly so this
+      # function is idempotent when it runs again in prune() calls.
+      for collection_name in [ops.GraphKeys.GLOBAL_VARIABLES,
+                              ops.GraphKeys.LOCAL_VARIABLES]:
+        mutable_collection = ops.get_collection_ref(collection_name)
+        for index, current in enumerate(mutable_collection):
+          mutable_collection[index] = lifted_variables.get(current, current)
 
   def prune(self, feeds, fetches):
     flat_feeds, flat_fetches = nest.flatten(feeds), nest.flatten(fetches)
-    for f in flat_feeds + flat_fetches:
+    for f in flat_feeds:
       if not isinstance(f, ops.Tensor):
-        raise ValueError("Feeds and fetches must be tensors.")
+        raise ValueError("Feeds must be tensors.")
+
+    # Ignoring all feeds that are captures allows prune to be called
+    # using wrapped_func.inputs even when it uses variables
+    internal_captures = self.graph.internal_captures
+    flat_feeds = [f for f in flat_feeds
+                  if f not in internal_captures]
+
+    tensor_fetches = []
+    operation_fetches = []
+    for f in flat_fetches:
+      if isinstance(f, ops.Tensor):
+        tensor_fetches.append(f)
+      elif isinstance(f, ops.Operation):
+        operation_fetches.append(f)
+      else:
+        raise ValueError("Fetches must be tensors or operations.")
+    for f in flat_feeds + flat_fetches:
       if f.graph is not self._func_graph:
         raise ValueError(
             "Can only prune function whose feeds and fetches "
@@ -67,11 +148,37 @@ class WrappedFunction(function.Function):
                 self._func_graph, f, f.graph))
     with self._func_graph.as_default():
       pruned_graph = func_graph.FuncGraph("pruned")
-      sink_tensor = array_ops.identity_n(flat_fetches)[0]
+      with ops.control_dependencies(operation_fetches):
+        if tensor_fetches:
+          identity_fetches = array_ops.identity_n(tensor_fetches)
+          sink_tensor = identity_fetches[0]
+        else:
+          identity_fetches = []
+          sink_tensor = array_ops.zeros([])
     lift_map = lift_to_graph.lift_to_graph(
-        sink_tensor, pruned_graph, sources=flat_feeds)
-    pruned_graph.outputs.extend(lift_map[x] for x in flat_fetches)
+        [sink_tensor], pruned_graph, sources=flat_feeds + internal_captures)
+    for original_fetch, identity_fetch in zip(
+        tensor_fetches, identity_fetches):
+      lift_map[original_fetch] = lift_map[identity_fetch]
+    pruned_graph.outputs.extend(
+        lift_map[x] for x in flat_fetches if isinstance(x, ops.Tensor))
+    if not tensor_fetches:
+      pruned_graph.outputs.append(lift_map[sink_tensor])
+    for external_capture, internal_capture in self.graph.captures.items():
+      pruned_graph.captures[external_capture] = lift_map[internal_capture]
     pruned_graph.inputs.extend(lift_map[x] for x in flat_feeds)
+    pruned_graph.inputs.extend(pruned_graph.captures.values())
+
+    pruned_graph.variables = self.graph.variables
+
+    def _structured_output_mapping(fetched):
+      lifted = lift_map[fetched]
+      if isinstance(lifted, ops.Operation):
+        return None
+      return lifted
+
+    pruned_graph.structured_outputs = nest.map_structure(
+        _structured_output_mapping, fetches)
     pruned_fn = WrappedFunction(
         pruned_graph, variable_holder=self._variable_holder)
     pruned_fn._num_positional_args = len(flat_feeds)  # pylint: disable=protected-access
@@ -137,11 +244,15 @@ def wrap_function(fn, signature, name=None):
     the wrapped graph function.
   """
   holder = VariableHolder(fn)
+  func_graph_name = "wrapped_function"
+  if name is not None:
+    func_graph_name = "wrapped_function_" + name
   return WrappedFunction(
       func_graph.func_graph_from_py_func(
-          name,
+          func_graph_name,
           holder,
           args=None, kwargs=None, signature=signature,
-          add_control_dependencies=False),
+          add_control_dependencies=False,
+          collections={}),
       variable_holder=holder,
       signature=signature)
diff --git a/tensorflow/python/eager/wrap_function_test.py b/tensorflow/python/eager/wrap_function_test.py
index b32b6ca42691a6261576da6b105a0afc97e0ec63..6225b13849f603a3881e2e740e2dfe09e77bd9d3 100644
--- a/tensorflow/python/eager/wrap_function_test.py
+++ b/tensorflow/python/eager/wrap_function_test.py
@@ -18,10 +18,13 @@ from __future__ import division
 from __future__ import print_function
 
 
+from tensorflow.python.eager import backprop
 from tensorflow.python.eager import wrap_function
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_spec
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
@@ -70,6 +73,175 @@ class WrapFunctionTest(test.TestCase):
     f_pruned = f_wrapped.prune(x_in[0], [x_out[0]])
     self.assertAllEqual(f_pruned(ops.convert_to_tensor(2.0)), [4.0])
 
+  def testNoArguments(self):
+
+    def f():
+      return constant_op.constant(1.)
+
+    f_wrapped = wrap_function.wrap_function(f, [])
+    self.assertAllEqual(1.0, f_wrapped())
+
+  def testPruneCaptures(self):
+
+    v1 = variables.Variable(2.)
+
+    def f():
+      v2 = variables.Variable(3.)
+      return array_ops.identity(v1 * v2 * constant_op.constant(1.), 'fetch')
+
+    f_wrapped = wrap_function.wrap_function(f, [])
+    self.assertAllEqual(6.0, f_wrapped())
+
+    # Test pruning directly on the inputs
+    pruned = f_wrapped.prune(
+        feeds=f_wrapped.inputs,
+        fetches=f_wrapped.graph.get_tensor_by_name('fetch:0'))
+    self.assertAllEqual(6.0, pruned())
+
+    # Test pruning with no inputs
+    pruned = f_wrapped.prune(
+        feeds=(),
+        fetches=f_wrapped.graph.get_tensor_by_name('fetch:0'))
+    self.assertAllEqual(6.0, pruned())
+
+  def testCollectionsIsolation(self):
+
+    v1 = variables.Variable(2.)
+    v2_holder = []
+    def f():
+      v2 = variables.Variable(3.)
+      v2_holder.append(v2)
+      ops.add_to_collection(ops.GraphKeys.LOSSES, v2 * constant_op.constant(3.))
+      return array_ops.identity(v1 * v2 * constant_op.constant(1.), 'fetch')
+
+    f_wrapped = wrap_function.wrap_function(f, [])
+    self.assertAllEqual(6.0, f_wrapped())
+    self.assertEqual(
+        len(f_wrapped.graph.get_collection(ops.GraphKeys.LOSSES)), 1)
+    f_var_collection = f_wrapped.graph.get_collection(
+        ops.GraphKeys.TRAINABLE_VARIABLES)
+    self.assertEqual(len(f_var_collection), 1)
+    self.assertIs(f_var_collection[0], v2_holder[0])
+
+    v3_holder = []
+    def g():
+      v3 = variables.Variable(4.)
+      v3_holder.append(v3)
+      ops.add_to_collection(ops.GraphKeys.LOSSES, v3 * constant_op.constant(3.))
+      return array_ops.identity(v1 * v3 * constant_op.constant(1.), 'fetch')
+
+    g_wrapped = wrap_function.wrap_function(g, [])
+    self.assertAllEqual(8.0, g_wrapped())
+    self.assertEqual(
+        len(g_wrapped.graph.get_collection(ops.GraphKeys.LOSSES)), 1)
+    g_var_collection = g_wrapped.graph.get_collection(
+        ops.GraphKeys.TRAINABLE_VARIABLES)
+    self.assertEqual(len(g_var_collection), 1)
+    self.assertIs(g_var_collection[0], v3_holder[0])
+
+    # Both have only one value, and their values aren't equal. So no sharing.
+    self.assertNotEqual(g_wrapped.graph.get_collection(ops.GraphKeys.LOSSES),
+                        f_wrapped.graph.get_collection(ops.GraphKeys.LOSSES))
+
+  def testGradientsOfPrune(self):
+
+    v1 = variables.Variable(2.)
+    v2_holder = []
+
+    def f(z):
+      v2 = variables.Variable(3.)
+      v2_holder.append(v2)
+      return array_ops.identity(v1 * v2 * z, 'fetch')
+
+    f_wrapped = wrap_function.wrap_function(
+        f, [tensor_spec.TensorSpec((), dtype=dtypes.float32)])
+
+    x = constant_op.constant(1.)
+    with backprop.GradientTape() as tape:
+      tape.watch(x)
+      out = f_wrapped(x)
+    grads = tape.gradient(out, [x, v1, v2_holder[0]])
+
+    self.assertAllEqual(6.0, out)
+    self.assertAllEqual([6.0, 3.0, 2.0], grads)
+
+    pruned = f_wrapped.prune(
+        feeds=f_wrapped.inputs,
+        fetches=f_wrapped.graph.get_tensor_by_name('fetch:0'))
+
+    x = constant_op.constant(1.)
+    with backprop.GradientTape() as tape:
+      tape.watch(x)
+      out = pruned(x)
+    grads = tape.gradient(out, [x, v1, v2_holder[0]])
+
+    self.assertAllEqual(6.0, out)
+    self.assertAllEqual([6.0, 3.0, 2.0], grads)
+
+  def testPruneOperations(self):
+
+    v = variables.Variable(0)
+
+    def f():
+      v.assign_add(1, name='increment', read_value=False)
+
+    f_wrapped = wrap_function.wrap_function(f, [])
+    pruned = f_wrapped.prune(
+        feeds=(),
+        fetches=(f_wrapped.graph.get_operation_by_name('increment'),))
+    self.assertEqual((None,), pruned())
+    self.assertEqual(1, self.evaluate(v))
+
+    del f, f_wrapped
+
+    def f1():
+      v.assign_add(
+          array_ops.placeholder(shape=[], dtype=dtypes.int32, name='step'),
+          name='increment', read_value=False)
+      return constant_op.constant(1, name='other')
+
+    f_wrapped = wrap_function.wrap_function(f1, [])
+    increments = f_wrapped.prune(
+        feeds=(f_wrapped.graph.get_tensor_by_name('step:0')),
+        fetches=(f_wrapped.graph.get_operation_by_name('increment'),
+                 f_wrapped.graph.get_tensor_by_name('other:0')))
+    first_output, second_output = increments(constant_op.constant(2))
+    self.assertEqual(['step:0', 'increment/resource:0'],
+                     [t.name for t in increments.inputs])
+    self.assertIs(None, first_output)
+    self.assertEqual(1, second_output.numpy())
+    self.assertEqual(3, v.numpy())
+    does_not_increment = f_wrapped.prune(
+        feeds=(f_wrapped.graph.get_tensor_by_name('step:0')),
+        fetches=f_wrapped.graph.get_tensor_by_name('other:0'))
+    self.assertEqual(1, does_not_increment(constant_op.constant(3)).numpy())
+    self.assertEqual(3, v.numpy())
+
+  def testPruneStatefulOpsFromWrappedFunc(self):
+
+    v0 = variables.Variable(0)
+    v1 = variables.Variable(0)
+
+    # When we wrap a function, we expect it to be executed with 'tf.Graph`
+    # rules: it's allowed to prune all ops that are not in transitive fanin of
+    # the fetches.
+    def f(x):
+      v0.assign_add(1, name='increment_v0')
+      v1.assign_add(1, name='increment_v1')
+      return x
+
+    f_wrapped = wrap_function.wrap_function(f, [1])
+
+    self.assertEqual(1, f_wrapped().numpy())
+    self.assertEqual(0, v0.numpy())
+    self.assertEqual(0, v1.numpy())
+
+    f_wrapped_with_name = wrap_function.wrap_function(f, [2], name='func')
+
+    self.assertEqual(2, f_wrapped_with_name().numpy())
+    self.assertEqual(0, v0.numpy())
+    self.assertEqual(0, v1.numpy())
+
 
 if __name__ == '__main__':
   ops.enable_eager_execution()
diff --git a/tensorflow/python/feature_column/BUILD b/tensorflow/python/feature_column/BUILD
index d24a7ae80c86d407ae3bb60ca55fff98be9f27a1..8caf46e3fa7f5b64a7b9d4683bc406b2fc213103 100644
--- a/tensorflow/python/feature_column/BUILD
+++ b/tensorflow/python/feature_column/BUILD
@@ -4,7 +4,7 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
-load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
 
 py_library(
     name = "feature_column_py",
@@ -13,6 +13,7 @@ py_library(
     deps = [
         ":feature_column",
         ":feature_column_v2",
+        ":sequence_feature_column",
         "//tensorflow/python:util",
     ],
 )
@@ -29,6 +30,7 @@ py_library(
         "//tensorflow/python:embedding_ops",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:init_ops",
+        "//tensorflow/python:layers_base",
         "//tensorflow/python:lookup_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn_ops",
@@ -94,19 +96,13 @@ filegroup(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "feature_column_test",
     srcs = ["feature_column_test.py"],
-    data = [":vocabulary_testdata"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_cuda_on_cpu_tap",
-        "no_pip",
-        "no_windows",
-    ],
-    deps = [
+    additional_deps = [
         ":feature_column",
         ":feature_column_py",
+        "//third_party/py/numpy",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -125,24 +121,22 @@ py_test(
         "//tensorflow/python:variables",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
-        "//third_party/py/numpy",
     ],
-)
-
-py_test(
-    name = "feature_column_v2_test",
-    srcs = ["feature_column_v2_test.py"],
     data = [":vocabulary_testdata"],
-    shard_count = 5,
-    srcs_version = "PY2AND3",
     tags = [
         "no_cuda_on_cpu_tap",
         "no_pip",
         "no_windows",
     ],
-    deps = [
+)
+
+tf_py_test(
+    name = "feature_column_v2_test",
+    srcs = ["feature_column_v2_test.py"],
+    additional_deps = [
         ":feature_column_py",
         ":feature_column_v2",
+        "//third_party/py/numpy",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -162,6 +156,68 @@ py_test(
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/estimator:estimator_py",
+    ],
+    data = [":vocabulary_testdata"],
+    shard_count = 5,
+    tags = [
+        "no_cuda_on_cpu_tap",
+        "no_pip",
+        "no_windows",
+    ],
+)
+
+py_library(
+    name = "sequence_feature_column",
+    srcs = ["sequence_feature_column.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":feature_column_v2",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:variable_scope",
+    ],
+)
+
+tf_py_test(
+    name = "sequence_feature_column_test",
+    srcs = ["sequence_feature_column_test.py"],
+    additional_deps = [
+        ":feature_column_v2",
+        ":feature_column_v2_test",
+        ":sequence_feature_column",
+        "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:training",
+    ],
+    tags = ["no_pip"],
+)
+
+py_test(
+    name = "sequence_feature_column_integration_test",
+    srcs = ["sequence_feature_column_integration_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        ":feature_column_v2",
+        ":sequence_feature_column",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:util",
+        "//tensorflow/python/keras:layers",
     ],
 )
diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index a858d92608db1a0d9d00b34f91860b7d4be01d68..42a07cd9275927f69d4795ffd51404998560672e 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -2361,7 +2361,7 @@ class _BucketizedColumn(_DenseColumn, _CategoricalColumn,
     del trainable
     input_tensor = inputs.get(self)
     return array_ops.one_hot(
-        indices=math_ops.to_int64(input_tensor),
+        indices=math_ops.cast(input_tensor, dtypes.int64),
         depth=len(self.boundaries) + 1,
         on_value=1.,
         off_value=0.)
@@ -2391,9 +2391,10 @@ class _BucketizedColumn(_DenseColumn, _CategoricalColumn,
         array_ops.reshape(input_tensor, (-1,)) +
         (len(self.boundaries) + 1) * i2)
 
-    indices = math_ops.to_int64(array_ops.transpose(array_ops.stack((i1, i2))))
-    dense_shape = math_ops.to_int64(array_ops.stack(
-        [batch_size, source_dimension]))
+    indices = math_ops.cast(
+        array_ops.transpose(array_ops.stack((i1, i2))), dtypes.int64)
+    dense_shape = math_ops.cast(
+        array_ops.stack([batch_size, source_dimension]), dtypes.int64)
     sparse_tensor = sparse_tensor_lib.SparseTensor(
         indices=indices,
         values=bucket_indices,
@@ -2829,7 +2830,7 @@ class _VocabularyFileCategoricalColumn(
     if input_tensor.dtype.is_integer:
       # `index_table_from_file` requires 64-bit integer keys.
       key_dtype = dtypes.int64
-      input_tensor = math_ops.to_int64(input_tensor)
+      input_tensor = math_ops.cast(input_tensor, dtypes.int64)
 
     return lookup_ops.index_table_from_file(
         vocabulary_file=self.vocabulary_file,
@@ -2881,7 +2882,7 @@ class _VocabularyListCategoricalColumn(
     if input_tensor.dtype.is_integer:
       # `index_table_from_tensor` requires 64-bit integer keys.
       key_dtype = dtypes.int64
-      input_tensor = math_ops.to_int64(input_tensor)
+      input_tensor = math_ops.cast(input_tensor, dtypes.int64)
 
     return lookup_ops.index_table_from_tensor(
         vocabulary_list=tuple(self.vocabulary_list),
@@ -2924,9 +2925,10 @@ class _IdentityCategoricalColumn(
           'Invalid input, not integer. key: {} dtype: {}'.format(
               self.key, input_tensor.dtype))
 
-    values = math_ops.to_int64(input_tensor.values, name='values')
-    num_buckets = math_ops.to_int64(self.num_buckets, name='num_buckets')
-    zero = math_ops.to_int64(0, name='zero')
+    values = math_ops.cast(input_tensor.values, dtypes.int64, name='values')
+    num_buckets = math_ops.cast(
+        self.num_buckets, dtypes.int64, name='num_buckets')
+    zero = math_ops.cast(0, dtypes.int64, name='zero')
     if self.default_value is None:
       # Fail if values are out-of-range.
       assert_less = check_ops.assert_less(
@@ -2944,9 +2946,8 @@ class _IdentityCategoricalColumn(
               values < zero, values >= num_buckets, name='out_of_range'),
           array_ops.fill(
               dims=array_ops.shape(values),
-              value=math_ops.to_int64(self.default_value),
-              name='default_values'),
-          values)
+              value=math_ops.cast(self.default_value, dtypes.int64),
+              name='default_values'), values)
 
     return sparse_tensor_lib.SparseTensor(
         indices=input_tensor.indices,
@@ -3256,7 +3257,8 @@ def _sequence_length_from_sparse_tensor(sp_tensor, num_elements=1):
     # Example: orig tensor [[1, 2], [3]], col_ids = (0, 1, 1),
     # row_ids = (0, 0, 1), seq_length = [2, 1]. If num_elements = 2,
     # these will get grouped, and the final seq_length is [1, 1]
-    seq_length = math_ops.to_int64(math_ops.ceil(seq_length / num_elements))
+    seq_length = math_ops.cast(
+        math_ops.ceil(seq_length / num_elements), dtypes.int64)
 
     # If the last n rows do not have ids, seq_length will have shape
     # [batch_size - n]. Pad the remaining values with zeros.
diff --git a/tensorflow/python/feature_column/feature_column_lib.py b/tensorflow/python/feature_column/feature_column_lib.py
index 68a2712425c56ae4b3e42c6bd7ae497c0358a074..15950403566b00025d93e643e6be880dac9bbb3d 100644
--- a/tensorflow/python/feature_column/feature_column_lib.py
+++ b/tensorflow/python/feature_column/feature_column_lib.py
@@ -21,4 +21,5 @@ from __future__ import print_function
 # pylint: disable=unused-import,line-too-long,wildcard-import
 from tensorflow.python.feature_column.feature_column import *
 from tensorflow.python.feature_column.feature_column_v2 import *
+from tensorflow.python.feature_column.sequence_feature_column import *
 # pylint: enable=unused-import,line-too-long
diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py
index daa0a3b3a4bb5fd067681c5ca91eaccdc64d3144..0ded2bf8c9fc9a7dcf1b100da3258b9e8f30a4b3 100644
--- a/tensorflow/python/feature_column/feature_column_test.py
+++ b/tensorflow/python/feature_column/feature_column_test.py
@@ -1832,7 +1832,7 @@ class LinearModelTest(test.TestCase):
       }
     with self.assertRaisesRegexp(
         ValueError,
-        'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+        r'Batch size \(first dimension\) of each feature must be same.'):
       fc.linear_model(features, [price1, price2])
 
   def test_subset_of_static_batch_size_mismatch(self):
@@ -1847,7 +1847,7 @@ class LinearModelTest(test.TestCase):
       }
       with self.assertRaisesRegexp(
           ValueError,
-          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+          r'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
         fc.linear_model(features, [price1, price2, price3])
 
   def test_runtime_batch_size_mismatch(self):
@@ -2467,7 +2467,7 @@ class _LinearModelTest(test.TestCase):
       }
     with self.assertRaisesRegexp(
         ValueError,
-        'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+        r'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
       get_keras_linear_model_predictions(features, [price1, price2])
 
   def test_subset_of_static_batch_size_mismatch(self):
@@ -2482,7 +2482,7 @@ class _LinearModelTest(test.TestCase):
       }
       with self.assertRaisesRegexp(
           ValueError,
-          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+          r'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
         get_keras_linear_model_predictions(features, [price1, price2, price3])
 
   def test_runtime_batch_size_mismatch(self):
@@ -2974,7 +2974,7 @@ class FunctionalInputLayerTest(test.TestCase):
       }
       with self.assertRaisesRegexp(
           ValueError,
-          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+          r'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
         fc.input_layer(features, [price1, price2])
 
   def test_subset_of_static_batch_size_mismatch(self):
@@ -2989,7 +2989,7 @@ class FunctionalInputLayerTest(test.TestCase):
       }
       with self.assertRaisesRegexp(
           ValueError,
-          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+          r'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
         fc.input_layer(features, [price1, price2, price3])
 
   def test_runtime_batch_size_mismatch(self):
diff --git a/tensorflow/python/feature_column/feature_column_v2.py b/tensorflow/python/feature_column/feature_column_v2.py
index 6308926494237f3546ddac0b893e4f6a23b116de..3b9f527061b6d71ac930cf921eac914ec94a6747 100644
--- a/tensorflow/python/feature_column/feature_column_v2.py
+++ b/tensorflow/python/feature_column/feature_column_v2.py
@@ -141,11 +141,11 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.keras.engine import training
-from tensorflow.python.keras.engine.base_layer import Layer
 # TODO(b/118385027): Dependency on keras can be problematic if Keras moves out
 # of the main repo.
 from tensorflow.python.keras import utils
+from tensorflow.python.keras.engine import training
+from tensorflow.python.keras.engine.base_layer import Layer
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
@@ -162,13 +162,14 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_utils
-from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.training.tracking import tracking
 from tensorflow.python.util import deprecation
 from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import keras_export
 from tensorflow.python.util.tf_export import tf_export
 
 
-_FEATURE_COLUMN_DEPRECATION_DATE = '2018-11-30'
+_FEATURE_COLUMN_DEPRECATION_DATE = None
 _FEATURE_COLUMN_DEPRECATION = ('The old _FeatureColumn APIs are being '
                                'deprecated. Please use the new FeatureColumn '
                                'APIs instead.')
@@ -303,8 +304,84 @@ class _StateManagerImpl(StateManager):
     raise ValueError('Variable does not exist.')
 
 
-@tf_export('keras.layers.DenseFeatures', v1=[])
-class DenseFeatures(Layer):
+class _BaseFeaturesLayer(Layer):
+  """Base class for DenseFeatures and SequenceFeatures.
+
+  Defines common methods and helpers.
+
+  Args:
+    feature_columns: An iterable containing the FeatureColumns to use as
+      inputs to your model.
+    expected_column_type: Expected class for provided feature columns.
+    trainable:  Boolean, whether the layer's variables will be updated via
+      gradient descent during training.
+    name: Name to give to the DenseFeatures.
+    **kwargs: Keyword arguments to construct a layer.
+
+  Raises:
+    ValueError: if an item in `feature_columns` doesn't match
+      `expected_column_type`.
+  """
+  def __init__(self, feature_columns, expected_column_type, trainable, name,
+               **kwargs):
+    super(_BaseFeaturesLayer, self).__init__(
+        name=name, trainable=trainable, **kwargs)
+    self._feature_columns = _normalize_feature_columns(feature_columns)
+    self._state_manager = _StateManagerImpl(self, self.trainable)
+    for column in self._feature_columns:
+      if not isinstance(column, expected_column_type):
+        raise ValueError(
+            'Items of feature_columns must be a {}. '
+            'You can wrap a categorical column with an '
+            'embedding_column or indicator_column. Given: {}'.format(
+                expected_column_type, column))
+
+  def build(self, _):
+    for column in self._feature_columns:
+      with variable_scope._pure_variable_scope(self.name):  # pylint: disable=protected-access
+        with variable_scope._pure_variable_scope(column.name):  # pylint: disable=protected-access
+          column.create_state(self._state_manager)
+    super(_BaseFeaturesLayer, self).build(None)
+
+  def _output_shape(self, input_shape, num_elements):
+    """Computes expected output shape of the layer or a column's dense tensor.
+
+    Args:
+      input_shape: Tensor or array with batch shape.
+      num_elements: Size of the last dimension of the output.
+
+    Returns:
+      Tuple with output shape.
+    """
+    raise NotImplementedError('Calling an abstract method.')
+
+  def compute_output_shape(self, input_shape):
+    total_elements = 0
+    for column in self._feature_columns:
+      total_elements += column.variable_shape.num_elements()
+    return self._target_shape(input_shape, total_elements)
+
+  def _process_dense_tensor(self, column, tensor):
+    """Reshapes the dense tensor output of a column based on expected shape.
+
+    Args:
+      column: A DenseColumn or SequenceDenseColumn object.
+      tensor: A dense tensor obtained from the same column.
+
+    Returns:
+      Reshaped dense tensor."""
+    num_elements = column.variable_shape.num_elements()
+    target_shape = self._target_shape(array_ops.shape(tensor), num_elements)
+    return array_ops.reshape(tensor, shape=target_shape)
+
+  def _verify_and_concat_tensors(self, output_tensors):
+    """Verifies and concatenates the dense output of several columns."""
+    _verify_static_batch_size_equality(output_tensors, self._feature_columns)
+    return array_ops.concat(output_tensors, -1)
+
+
+@keras_export('keras.layers.DenseFeatures')
+class DenseFeatures(_BaseFeaturesLayer):
   """A layer that produces a dense `Tensor` based on given `feature_columns`.
 
   Generally a single example in training data is described with FeatureColumns.
@@ -344,8 +421,8 @@ class DenseFeatures(Layer):
         `bucketized_column`, `indicator_column`. If you have categorical
         features, you can wrap them with an `embedding_column` or
         `indicator_column`.
-      trainable: If `True` also add the variable to the graph collection
-        `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+      trainable:  Boolean, whether the layer's variables will be updated via
+        gradient descent during training.
       name: Name to give to the DenseFeatures.
       **kwargs: Keyword arguments to construct a layer.
 
@@ -353,28 +430,18 @@ class DenseFeatures(Layer):
       ValueError: if an item in `feature_columns` is not a `DenseColumn`.
     """
     super(DenseFeatures, self).__init__(
-        name=name, trainable=trainable, **kwargs)
-
-    self._feature_columns = _normalize_feature_columns(feature_columns)
-    self._feature_columns = sorted(self._feature_columns, key=lambda x: x.name)
-    self._state_manager = _StateManagerImpl(self, self.trainable)
-    for column in self._feature_columns:
-      if not isinstance(column, DenseColumn):
-        raise ValueError(
-            'Items of feature_columns must be a DenseColumn. '
-            'You can wrap a categorical column with an '
-            'embedding_column or indicator_column. Given: {}'.format(column))
+        feature_columns=feature_columns,
+        trainable=trainable,
+        name=name,
+        expected_column_type=DenseColumn,
+        **kwargs)
 
   @property
   def _is_feature_layer(self):
     return True
 
-  def build(self, _):
-    for column in self._feature_columns:
-      with variable_scope._pure_variable_scope(self.name):  # pylint: disable=protected-access
-        with variable_scope._pure_variable_scope(column.name):  # pylint: disable=protected-access
-          column.create_state(self._state_manager)
-      super(DenseFeatures, self).build(None)
+  def _target_shape(self, input_shape, total_elements):
+    return (input_shape[0], total_elements)
 
   def call(self, features, cols_to_output_tensors=None):
     """Returns a dense tensor corresponding to the `feature_columns`.
@@ -400,27 +467,15 @@ class DenseFeatures(Layer):
                        features)
     transformation_cache = FeatureTransformationCache(features)
     output_tensors = []
-    ordered_columns = []
     for column in self._feature_columns:
       with ops.name_scope(column.name):
-        ordered_columns.append(column)
         tensor = column.get_dense_tensor(transformation_cache,
                                          self._state_manager)
-        num_elements = column.variable_shape.num_elements()
-        batch_size = array_ops.shape(tensor)[0]
-        tensor = array_ops.reshape(tensor, shape=(batch_size, num_elements))
-        output_tensors.append(tensor)
+        processed_tensors = self._process_dense_tensor(column, tensor)
         if cols_to_output_tensors is not None:
-          cols_to_output_tensors[column] = tensor
-
-    _verify_static_batch_size_equality(output_tensors, ordered_columns)
-    return array_ops.concat(output_tensors, 1)
-
-  def compute_output_shape(self, input_shape):
-    total_elements = 0
-    for column in self._feature_columns:
-      total_elements += column.variable_shape.num_elements()
-    return (input_shape[0], total_elements)
+          cols_to_output_tensors[column] = processed_tensors
+        output_tensors.append(processed_tensors)
+    return self._verify_and_concat_tensors(output_tensors)
 
 
 class _LinearModelLayer(Layer):
@@ -437,7 +492,6 @@ class _LinearModelLayer(Layer):
         name=name, trainable=trainable, **kwargs)
 
     self._feature_columns = _normalize_feature_columns(feature_columns)
-    self._feature_columns = sorted(self._feature_columns, key=lambda x: x.name)
     for column in self._feature_columns:
       if not isinstance(column, (DenseColumn, CategoricalColumn)):
         raise ValueError(
@@ -518,7 +572,7 @@ class _LinearModelLayer(Layer):
       return predictions
 
 
-@tf_export('keras.layers.LinearModel', v1=[])
+@keras_export('keras.layers.LinearModel', v1=[])
 class LinearModel(training.Model):
   """Produces a linear prediction `Tensor` based on given `feature_columns`.
 
@@ -693,7 +747,7 @@ def _transform_features_v2(features, feature_columns, state_manager):
   with ops.name_scope(
       None, default_name='transform_features', values=features.values()):
     transformation_cache = FeatureTransformationCache(features)
-    for column in sorted(feature_columns, key=lambda x: x.name):
+    for column in feature_columns:
       with ops.name_scope(None, default_name=column.name):
         outputs[column] = transformation_cache.get(column, state_manager)
   return outputs
@@ -1037,7 +1091,7 @@ def shared_embedding_columns(categorical_columns,
   return result
 
 
-@tf_export('feature_column.shared_embedding_columns', v1=[])
+@tf_export('feature_column.shared_embeddings', v1=[])
 def shared_embedding_columns_v2(categorical_columns,
                                 dimension,
                                 combiner='mean',
@@ -1354,8 +1408,9 @@ def bucketized_column(source_column, boundaries):
     raise ValueError(
         'source_column must be one-dimensional column. '
         'Given: {}'.format(source_column))
-  if (not boundaries or
-      not (isinstance(boundaries, list) or isinstance(boundaries, tuple))):
+  if not boundaries:
+    raise ValueError('boundaries must not be empty.')
+  if not (isinstance(boundaries, list) or isinstance(boundaries, tuple)):
     raise ValueError('boundaries must be a sorted list.')
   for i in range(len(boundaries) - 1):
     if boundaries[i] >= boundaries[i + 1]:
@@ -2658,7 +2713,7 @@ def _normalize_feature_columns(feature_columns):
                                                name_to_column[column.name]))
     name_to_column[column.name] = column
 
-  return feature_columns
+  return sorted(feature_columns, key=lambda x: x.name)
 
 
 class NumericColumn(
@@ -2778,12 +2833,8 @@ class NumericColumn(
     """See 'FeatureColumn` base class."""
     _check_config_keys(config, cls._fields)
     kwargs = config.copy()
-    # TODO(b/118820158): Simplify if deserialize_keras_object supports None.
-    if config['normalizer_fn']:
-      kwargs['normalizer_fn'] = utils.deserialize_keras_object(
-          config['normalizer_fn'], custom_objects=custom_objects)
-    else:
-      kwargs['normalizer_fn'] = None
+    kwargs['normalizer_fn'] = utils.deserialize_keras_object(
+        config['normalizer_fn'], custom_objects=custom_objects)
     kwargs['dtype'] = dtypes.as_dtype(config['dtype'])
     return cls(**kwargs)
 
@@ -3070,10 +3121,10 @@ class EmbeddingColumn(
       raise ValueError(
           'In embedding_column: {}. '
           'categorical_column must not be of type SequenceCategoricalColumn. '
-          'Suggested fix A: If you wish to use input_layer, use a '
+          'Suggested fix A: If you wish to use DenseFeatures, use a '
           'non-sequence categorical_column_with_*. '
           'Suggested fix B: If you wish to create sequence input, use '
-          'sequence_input_layer instead of input_layer. '
+          'SequenceFeatures instead of DenseFeatures. '
           'Given (type {}): {}'.format(self.name, type(self.categorical_column),
                                        self.categorical_column))
     # Get sparse IDs and weights.
@@ -3090,10 +3141,10 @@ class EmbeddingColumn(
       raise ValueError(
           'In embedding_column: {}. '
           'categorical_column must not be of type _SequenceCategoricalColumn. '
-          'Suggested fix A: If you wish to use input_layer, use a '
+          'Suggested fix A: If you wish to use DenseFeatures, use a '
           'non-sequence categorical_column_with_*. '
           'Suggested fix B: If you wish to create sequence input, use '
-          'sequence_input_layer instead of input_layer. '
+          'SequenceFeatures instead of DenseFeatures. '
           'Given (type {}): {}'.format(self.name, type(self.categorical_column),
                                        self.categorical_column))
     sparse_tensors = self.categorical_column._get_sparse_tensors(  # pylint: disable=protected-access
@@ -3107,11 +3158,11 @@ class EmbeddingColumn(
       raise ValueError(
           'In embedding_column: {}. '
           'categorical_column must be of type SequenceCategoricalColumn '
-          'to use sequence_input_layer. '
+          'to use SequenceFeatures. '
           'Suggested fix: Use one of sequence_categorical_column_with_*. '
           'Given (type {}): {}'.format(self.name, type(self.categorical_column),
                                        self.categorical_column))
-    sparse_tensors = self.categorical_column.get_sequence_sparse_tensors(
+    sparse_tensors = self.categorical_column.get_sparse_tensors(
         transformation_cache, state_manager)
     dense_tensor = self._get_dense_tensor_internal(sparse_tensors,
                                                    state_manager)
@@ -3131,8 +3182,8 @@ class EmbeddingColumn(
         (SequenceCategoricalColumn, fc_old._SequenceCategoricalColumn)):  # pylint: disable=protected-access
       raise ValueError(
           'In embedding_column: {}. '
-          'categorical_column must be of type _SequenceCategoricalColumn '
-          'to use sequence_input_layer. '
+          'categorical_column must be of type SequenceCategoricalColumn '
+          'to use SequenceFeatures. '
           'Suggested fix: Use one of sequence_categorical_column_with_*. '
           'Given (type {}): {}'.format(self.name, type(self.categorical_column),
                                        self.categorical_column))
@@ -3141,7 +3192,7 @@ class EmbeddingColumn(
         sparse_tensors,
         weight_collections=weight_collections,
         trainable=trainable)
-    sequence_length = fc_old._sequence_length_from_sparse_tensor(  # pylint: disable=protected-access
+    sequence_length = _sequence_length_from_sparse_tensor(
         sparse_tensors.id_tensor)
     return SequenceDenseColumn.TensorSequenceLengthPair(
         dense_tensor=dense_tensor, sequence_length=sequence_length)
@@ -3166,12 +3217,8 @@ class EmbeddingColumn(
     kwargs = config.copy()
     kwargs['categorical_column'] = deserialize_feature_column(
         config['categorical_column'], custom_objects, columns_by_name)
-    # TODO(b/118820158): Simplify if deserialize_keras_object supports None.
-    if config['initializer']:
-      kwargs['initializer'] = utils.deserialize_keras_object(
-          config['initializer'], custom_objects=custom_objects)
-    else:
-      kwargs['initializer'] = None
+    kwargs['initializer'] = utils.deserialize_keras_object(
+        config['initializer'], custom_objects=custom_objects)
     return cls(**kwargs)
 
 
@@ -3181,7 +3228,7 @@ def _raise_shared_embedding_column_error():
                    '`DenseFeatures` or `LinearModel` instead.')
 
 
-class SharedEmbeddingColumnCreator(tracking.Checkpointable):
+class SharedEmbeddingColumnCreator(tracking.AutoTrackable):
 
   def __init__(self,
                dimension,
@@ -3304,10 +3351,10 @@ class SharedEmbeddingColumn(
       raise ValueError(
           'In embedding_column: {}. '
           'categorical_column must not be of type SequenceCategoricalColumn. '
-          'Suggested fix A: If you wish to use input_layer, use a '
+          'Suggested fix A: If you wish to use DenseFeatures, use a '
           'non-sequence categorical_column_with_*. '
           'Suggested fix B: If you wish to create sequence input, use '
-          'sequence_input_layer instead of input_layer. '
+          'SequenceFeatures instead of DenseFeatures. '
           'Given (type {}): {}'.format(self.name, type(self.categorical_column),
                                        self.categorical_column))
     return self._get_dense_tensor_internal(transformation_cache, state_manager)
@@ -3321,15 +3368,15 @@ class SharedEmbeddingColumn(
       raise ValueError(
           'In embedding_column: {}. '
           'categorical_column must be of type SequenceCategoricalColumn '
-          'to use sequence_input_layer. '
+          'to use SequenceFeatures. '
           'Suggested fix: Use one of sequence_categorical_column_with_*. '
           'Given (type {}): {}'.format(self.name, type(self.categorical_column),
                                        self.categorical_column))
-    dense_tensor = self.get_dense_tensor_internal(transformation_cache,
-                                                  state_manager)
+    dense_tensor = self._get_dense_tensor_internal(transformation_cache,
+                                                   state_manager)
     sparse_tensors = self.categorical_column.get_sparse_tensors(
         transformation_cache, state_manager)
-    sequence_length = fc_old._sequence_length_from_sparse_tensor(  # pylint: disable=protected-access
+    sequence_length = _sequence_length_from_sparse_tensor(
         sparse_tensors.id_tensor)
     return SequenceDenseColumn.TensorSequenceLengthPair(
         dense_tensor=dense_tensor, sequence_length=sequence_length)
@@ -3975,13 +4022,9 @@ class WeightedCategoricalColumn(
 
   def transform_feature(self, transformation_cache, state_manager):
     """Applies weights to tensor generated from `categorical_column`'."""
-    print('WeightedCategoricalColumn.transform_feature: ', self.name)
-    print('Weight feature key: ', self.weight_feature_key)
     weight_tensor = transformation_cache.get(self.weight_feature_key,
                                              state_manager)
-    print('Weight tensor before: ', weight_tensor)
     weight_tensor = self._transform_weight_tensor(weight_tensor)
-    print('Weight tensor after: ', weight_tensor)
     return (transformation_cache.get(self.categorical_column, state_manager),
             weight_tensor)
 
@@ -3995,9 +4038,7 @@ class WeightedCategoricalColumn(
 
   def get_sparse_tensors(self, transformation_cache, state_manager):
     """See `CategoricalColumn` base class."""
-    print('WeightedCategoricalColumn.get_sparse_tensors: ', self.name)
     tensors = transformation_cache.get(self, state_manager)
-    print('tensors[1]: ', tensors[1])
     return CategoricalColumn.IdWeightPair(tensors[0], tensors[1])
 
   @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
@@ -4339,10 +4380,10 @@ class IndicatorColumn(
       raise ValueError(
           'In indicator_column: {}. '
           'categorical_column must not be of type SequenceCategoricalColumn. '
-          'Suggested fix A: If you wish to use input_layer, use a '
+          'Suggested fix A: If you wish to use DenseFeatures, use a '
           'non-sequence categorical_column_with_*. '
           'Suggested fix B: If you wish to create sequence input, use '
-          'sequence_input_layer instead of input_layer. '
+          'SequenceFeatures instead of DenseFeatures. '
           'Given (type {}): {}'.format(self.name, type(self.categorical_column),
                                        self.categorical_column))
     # Feature has been already transformed. Return the intermediate
@@ -4360,10 +4401,10 @@ class IndicatorColumn(
       raise ValueError(
           'In indicator_column: {}. '
           'categorical_column must not be of type _SequenceCategoricalColumn. '
-          'Suggested fix A: If you wish to use input_layer, use a '
+          'Suggested fix A: If you wish to use DenseFeatures, use a '
           'non-sequence categorical_column_with_*. '
           'Suggested fix B: If you wish to create sequence input, use '
-          'sequence_input_layer instead of input_layer. '
+          'SequenceFeatures instead of DenseFeatures. '
           'Given (type {}): {}'.format(self.name, type(self.categorical_column),
                                        self.categorical_column))
     # Feature has been already transformed. Return the intermediate
@@ -4376,7 +4417,7 @@ class IndicatorColumn(
       raise ValueError(
           'In indicator_column: {}. '
           'categorical_column must be of type SequenceCategoricalColumn '
-          'to use sequence_input_layer. '
+          'to use SequenceFeatures. '
           'Suggested fix: Use one of sequence_categorical_column_with_*. '
           'Given (type {}): {}'.format(self.name, type(self.categorical_column),
                                        self.categorical_column))
@@ -4385,7 +4426,7 @@ class IndicatorColumn(
     dense_tensor = transformation_cache.get(self, state_manager)
     sparse_tensors = self.categorical_column.get_sparse_tensors(
         transformation_cache, state_manager)
-    sequence_length = fc_old._sequence_length_from_sparse_tensor(  # pylint: disable=protected-access
+    sequence_length = _sequence_length_from_sparse_tensor(
         sparse_tensors.id_tensor)
     return SequenceDenseColumn.TensorSequenceLengthPair(
         dense_tensor=dense_tensor, sequence_length=sequence_length)
@@ -4406,7 +4447,7 @@ class IndicatorColumn(
       raise ValueError(
           'In indicator_column: {}. '
           'categorical_column must be of type _SequenceCategoricalColumn '
-          'to use sequence_input_layer. '
+          'to use SequenceFeatures. '
           'Suggested fix: Use one of sequence_categorical_column_with_*. '
           'Given (type {}): {}'.format(self.name, type(self.categorical_column),
                                        self.categorical_column))
@@ -4414,7 +4455,7 @@ class IndicatorColumn(
     # representation created by _transform_feature.
     dense_tensor = inputs.get(self)
     sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
-    sequence_length = fc_old._sequence_length_from_sparse_tensor(  # pylint: disable=protected-access
+    sequence_length = _sequence_length_from_sparse_tensor(
         sparse_tensors.id_tensor)
     return SequenceDenseColumn.TensorSequenceLengthPair(
         dense_tensor=dense_tensor, sequence_length=sequence_length)
@@ -4468,9 +4509,34 @@ def _verify_static_batch_size_equality(tensors, columns):
                 expected_batch_size, batch_size))
 
 
+def _sequence_length_from_sparse_tensor(sp_tensor, num_elements=1):
+  """Returns a [batch_size] Tensor with per-example sequence length."""
+  with ops.name_scope(None, 'sequence_length') as name_scope:
+    row_ids = sp_tensor.indices[:, 0]
+    column_ids = sp_tensor.indices[:, 1]
+    # Add one to convert column indices to element length
+    column_ids += array_ops.ones_like(column_ids)
+    # Get the number of elements we will have per example/row
+    seq_length = math_ops.segment_max(column_ids, segment_ids=row_ids)
+
+    # The raw values are grouped according to num_elements;
+    # how many entities will we have after grouping?
+    # Example: orig tensor [[1, 2], [3]], col_ids = (0, 1, 1),
+    # row_ids = (0, 0, 1), seq_length = [2, 1]. If num_elements = 2,
+    # these will get grouped, and the final seq_length is [1, 1]
+    seq_length = math_ops.cast(
+        math_ops.ceil(seq_length / num_elements), dtypes.int64)
+
+    # If the last n rows do not have ids, seq_length will have shape
+    # [batch_size - n]. Pad the remaining values with zeros.
+    n_pad = array_ops.shape(sp_tensor)[:1] - array_ops.shape(seq_length)[:1]
+    padding = array_ops.zeros(n_pad, dtype=seq_length.dtype)
+    return array_ops.concat([seq_length, padding], axis=0, name=name_scope)
+
+
 class SequenceCategoricalColumn(
-    FeatureColumn,
-    fc_old._CategoricalColumn,  # pylint: disable=protected-access
+    CategoricalColumn,
+    fc_old._SequenceCategoricalColumn,  # pylint: disable=protected-access
     collections.namedtuple('SequenceCategoricalColumn',
                            ('categorical_column'))):
   """Represents sequences of categorical data."""
@@ -4533,7 +4599,7 @@ class SequenceCategoricalColumn(
       weight_tensor = sparse_ops.sparse_reshape(weight_tensor, target_shape)
     return CategoricalColumn.IdWeightPair(id_tensor, weight_tensor)
 
-  def get_sequence_sparse_tensors(self, transformation_cache, state_manager):
+  def get_sparse_tensors(self, transformation_cache, state_manager):
     """Returns an IdWeightPair.
 
     `IdWeightPair` is a pair of `SparseTensor`s which represents ids and
@@ -4679,7 +4745,7 @@ def deserialize_feature_column(config,
           IdentityCategoricalColumn, IndicatorColumn, NumericColumn,
           SequenceCategoricalColumn, SequenceDenseColumn, SharedEmbeddingColumn,
           VocabularyFileCategoricalColumn, VocabularyListCategoricalColumn,
-          WeightedCategoricalColumn
+          WeightedCategoricalColumn, init_ops.TruncatedNormal
       ]
   }
   if columns_by_name is None:
diff --git a/tensorflow/python/feature_column/feature_column_v2_test.py b/tensorflow/python/feature_column/feature_column_v2_test.py
index 0755c0b6ac23f5ad73df855ab2bcbce11fec2653..2b150790c1d565ef963be34b0bd004101b7a02a7 100644
--- a/tensorflow/python/feature_column/feature_column_v2_test.py
+++ b/tensorflow/python/feature_column/feature_column_v2_test.py
@@ -40,6 +40,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.ops import partitioned_variables
@@ -485,7 +486,7 @@ class BucketizedColumnTest(test.TestCase):
   def test_invalid_boundaries(self):
     a = fc.numeric_column('aaa')
     with self.assertRaisesRegexp(ValueError,
-                                 'boundaries must be a sorted list'):
+                                 'boundaries must not be empty'):
       fc.bucketized_column(a, boundaries=None)
     with self.assertRaisesRegexp(ValueError,
                                  'boundaries must be a sorted list'):
@@ -2015,7 +2016,7 @@ class LinearModelTest(test.TestCase):
       }
       model(features)
       for var in model.variables:
-        self.assertTrue(isinstance(var, variables_lib.RefVariable))
+        self.assertIsInstance(var, variables_lib.VariableV1)
       variable_names = [var.name for var in model.variables]
       self.assertItemsEqual([
           'linear_model/dense_feature_bucketized/weights:0',
@@ -2052,7 +2053,7 @@ class LinearModelTest(test.TestCase):
       }
     with self.assertRaisesRegexp(
         ValueError,
-        'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+        r'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
       model = fc.LinearModel([price1, price2])
       model(features)
 
@@ -2068,7 +2069,7 @@ class LinearModelTest(test.TestCase):
       }
       with self.assertRaisesRegexp(
           ValueError,
-          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+          r'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
         model = fc.LinearModel([price1, price2, price3])
         model(features)
 
@@ -2818,7 +2819,7 @@ class OldLinearModelTest(test.TestCase):
       }
     with self.assertRaisesRegexp(
         ValueError,
-        'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+        r'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
       fc_old.linear_model(features, [price1, price2])
 
   def test_subset_of_static_batch_size_mismatch(self):
@@ -2833,7 +2834,7 @@ class OldLinearModelTest(test.TestCase):
       }
       with self.assertRaisesRegexp(
           ValueError,
-          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+          r'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
         fc_old.linear_model(features, [price1, price2, price3])
 
   def test_runtime_batch_size_mismatch(self):
@@ -3261,7 +3262,7 @@ class DenseFeaturesTest(test.TestCase):
       fc.DenseFeatures(feature_columns=[])(features={})
 
   def test_should_be_dense_column(self):
-    with self.assertRaisesRegexp(ValueError, 'must be a DenseColumn'):
+    with self.assertRaisesRegexp(ValueError, 'must be a .*DenseColumn'):
       fc.DenseFeatures(feature_columns=[
           fc.categorical_column_with_hash_bucket('wire_cast', 4)
       ])(
@@ -3422,7 +3423,7 @@ class DenseFeaturesTest(test.TestCase):
               sparse_tensor.SparseTensor(
                   indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2])
       }
-      with self.assertRaisesRegexp(Exception, 'must be a DenseColumn'):
+      with self.assertRaisesRegexp(Exception, 'must be a .*DenseColumn'):
         fc.DenseFeatures([animal])(features)
 
   def test_static_batch_size_mismatch(self):
@@ -3435,7 +3436,7 @@ class DenseFeaturesTest(test.TestCase):
       }
       with self.assertRaisesRegexp(
           ValueError,
-          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+          r'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
         fc.DenseFeatures([price1, price2])(features)
 
   def test_subset_of_static_batch_size_mismatch(self):
@@ -3450,7 +3451,7 @@ class DenseFeaturesTest(test.TestCase):
       }
       with self.assertRaisesRegexp(
           ValueError,
-          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+          r'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
         fc.DenseFeatures([price1, price2, price3])(features)
 
   def test_runtime_batch_size_mismatch(self):
@@ -4010,7 +4011,7 @@ class FunctionalInputLayerTest(test.TestCase):
       self.assertEqual(0, len(cols_to_vars[dense_feature_bucketized]))
       self.assertEqual(1, len(cols_to_vars[some_embedding_column]))
       self.assertIsInstance(cols_to_vars[some_embedding_column][0],
-                            variables_lib.Variable)
+                            variables_lib.VariableV1)
       self.assertAllEqual(cols_to_vars[some_embedding_column][0].shape, [5, 10])
 
   @test_util.run_deprecated_v1
@@ -4141,7 +4142,7 @@ class FunctionalInputLayerTest(test.TestCase):
       }
       with self.assertRaisesRegexp(
           ValueError,
-          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+          r'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
         fc_old.input_layer(features, [price1, price2])
 
   def test_subset_of_static_batch_size_mismatch(self):
@@ -4156,7 +4157,7 @@ class FunctionalInputLayerTest(test.TestCase):
       }
       with self.assertRaisesRegexp(
           ValueError,
-          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+          r'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
         fc_old.input_layer(features, [price1, price2, price3])
 
   def test_runtime_batch_size_mismatch(self):
@@ -6839,7 +6840,7 @@ class EmbeddingColumnTest(test.TestCase):
     self.assertItemsEqual(('dense_features/aaa_embedding/embedding_weights:0',),
                           tuple([v.name for v in global_vars]))
     for v in global_vars:
-      self.assertTrue(isinstance(v, variables_lib.RefVariable))
+      self.assertIsInstance(v, variables_lib.Variable)
     trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
     self.assertItemsEqual(('dense_features/aaa_embedding/embedding_weights:0',),
                           tuple([v.name for v in trainable_vars]))
@@ -7147,7 +7148,60 @@ class EmbeddingColumnTest(test.TestCase):
                           self.evaluate(predictions))
 
   @test_util.run_deprecated_v1
-  def test_serialization(self):
+  def test_serialization_with_default_initializer(self):
+
+    # Build columns.
+    categorical_column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    embedding_column = fc.embedding_column(categorical_column, dimension=2)
+
+    self.assertEqual([categorical_column], embedding_column.parents)
+
+    config = embedding_column._get_config()
+    self.assertEqual({
+        'categorical_column': {
+            'class_name': 'IdentityCategoricalColumn',
+            'config': {
+                'number_buckets': 3,
+                'key': 'aaa',
+                'default_value': None
+            }
+        },
+        'ckpt_to_load_from': None,
+        'combiner': 'mean',
+        'dimension': 2,
+        'initializer': {
+            'class_name': 'TruncatedNormal',
+            'config': {
+                'dtype': 'float32',
+                'stddev': 0.7071067811865475,
+                'seed': None,
+                'mean': 0.0
+            }
+        },
+        'max_norm': None,
+        'tensor_name_in_ckpt': None,
+        'trainable': True
+    }, config)
+
+    custom_objects = {'TruncatedNormal': init_ops.TruncatedNormal}
+    new_embedding_column = fc.EmbeddingColumn._from_config(
+        config, custom_objects=custom_objects)
+    self.assertEqual(embedding_column._get_config(),
+                     new_embedding_column._get_config())
+    self.assertIsNot(categorical_column,
+                     new_embedding_column.categorical_column)
+
+    new_embedding_column = fc.EmbeddingColumn._from_config(
+        config,
+        custom_objects=custom_objects,
+        columns_by_name={categorical_column.name: categorical_column})
+    self.assertEqual(embedding_column._get_config(),
+                     new_embedding_column._get_config())
+    self.assertIs(categorical_column, new_embedding_column.categorical_column)
+
+  @test_util.run_deprecated_v1
+  def test_serialization_with_custom_initializer(self):
 
     def _initializer(shape, dtype, partition_info):
       del shape, dtype, partition_info
@@ -7732,7 +7786,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
         ['aaa_bbb_shared_embedding:0', 'ccc_ddd_shared_embedding:0'],
         tuple([v.name for v in global_vars]))
     for v in global_vars:
-      self.assertTrue(isinstance(v, variables_lib.RefVariable))
+      self.assertIsInstance(v, variables_lib.Variable)
     trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
     if trainable:
       self.assertItemsEqual(
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2.py b/tensorflow/python/feature_column/sequence_feature_column.py
similarity index 68%
rename from tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2.py
rename to tensorflow/python/feature_column/sequence_feature_column.py
index 0d34ad161855476b6a4cd9a258521dbe122b4140..bc58c413fef8a69111faaa1edaae873aa9f76cd9 100644
--- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2.py
+++ b/tensorflow/python/feature_column/sequence_feature_column.py
@@ -25,8 +25,7 @@ from __future__ import print_function
 import collections
 
 
-from tensorflow.python.feature_column import feature_column as fc_old
-from tensorflow.python.feature_column import feature_column_lib as fc
+from tensorflow.python.feature_column import feature_column_v2 as fc
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
@@ -34,107 +33,118 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.ops import sparse_ops
-from tensorflow.python.ops import variable_scope
+from tensorflow.python.util.tf_export import keras_export
+from tensorflow.python.util.tf_export import tf_export
 
 # pylint: disable=protected-access
 
 
-def sequence_input_layer(
-    features,
-    feature_columns,
-    weight_collections=None,
-    trainable=True):
-  """"Builds input layer for sequence input.
+@keras_export('keras.experimental.SequenceFeatures')
+class SequenceFeatures(fc._BaseFeaturesLayer):
+  """A layer for sequence input.
 
-  All `feature_columns` must be sequence dense columns with the same
-  `sequence_length`. The output of this method can be fed into sequence
-  networks, such as RNN.
+    All `feature_columns` must be sequence dense columns with the same
+    `sequence_length`. The output of this method can be fed into sequence
+    networks, such as RNN.
 
-  The output of this method is a 3D `Tensor` of shape `[batch_size, T, D]`.
-  `T` is the maximum sequence length for this batch, which could differ from
-  batch to batch.
+    The output of this method is a 3D `Tensor` of shape `[batch_size, T, D]`.
+    `T` is the maximum sequence length for this batch, which could differ from
+    batch to batch.
 
-  If multiple `feature_columns` are given with `Di` `num_elements` each, their
-  outputs are concatenated. So, the final `Tensor` has shape
-  `[batch_size, T, D0 + D1 + ... + Dn]`.
+    If multiple `feature_columns` are given with `Di` `num_elements` each, their
+    outputs are concatenated. So, the final `Tensor` has shape
+    `[batch_size, T, D0 + D1 + ... + Dn]`.
 
-  Example:
+    Example:
 
-  ```python
-  rating = sequence_numeric_column('rating')
-  watches = sequence_categorical_column_with_identity(
-      'watches', num_buckets=1000)
-  watches_embedding = embedding_column(watches, dimension=10)
-  columns = [rating, watches]
+    ```python
+    rating = sequence_numeric_column('rating')
+    watches = sequence_categorical_column_with_identity(
+        'watches', num_buckets=1000)
+    watches_embedding = embedding_column(watches, dimension=10)
+    columns = [rating, watches_embedding]
 
-  features = tf.parse_example(..., features=make_parse_example_spec(columns))
-  input_layer, sequence_length = sequence_input_layer(features, columns)
+    sequence_input_layer = SequenceFeatures(columns)
+    features = tf.parse_example(..., features=make_parse_example_spec(columns))
+    sequence_input, sequence_length = sequence_input_layer(features)
+    sequence_length_mask = tf.sequence_mask(sequence_length)
 
-  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
-  outputs, state = tf.nn.dynamic_rnn(
-      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
-  ```
+    rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
+    rnn_layer = tf.keras.layers.RNN(rnn_cell)
+    outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
+    ```
+  """
 
-  Args:
-    features: A dict mapping keys to tensors.
-    feature_columns: An iterable of dense sequence columns. Valid columns are
-      - `embedding_column` that wraps a `sequence_categorical_column_with_*`
-      - `sequence_numeric_column`.
-    weight_collections: A list of collection names to which the Variable will be
-      added. Note that variables will also be added to collections
-      `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
-    trainable: If `True` also add the variable to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES`.
+  def __init__(
+      self,
+      feature_columns,
+      trainable=True,
+      name=None,
+      **kwargs):
+    """"Constructs a SequenceFeatures layer.
 
-  Returns:
-    An `(input_layer, sequence_length)` tuple where:
-    - input_layer: A float `Tensor` of shape `[batch_size, T, D]`.
-        `T` is the maximum sequence length for this batch, which could differ
-        from batch to batch. `D` is the sum of `num_elements` for all
-        `feature_columns`.
-    - sequence_length: An int `Tensor` of shape `[batch_size]`. The sequence
-        length for each example.
+    Args:
+      feature_columns: An iterable of dense sequence columns. Valid columns are
+        - `embedding_column` that wraps a `sequence_categorical_column_with_*`
+        - `sequence_numeric_column`.
+      trainable: Boolean, whether the layer's variables will be updated via
+        gradient descent during training.
+      name: Name to give to the SequenceFeatures.
+      **kwargs: Keyword arguments to construct a layer.
+
+    Raises:
+      ValueError: If any of the `feature_columns` is not a
+        `SequenceDenseColumn`.
+    """
+    super(SequenceFeatures, self).__init__(
+        feature_columns=feature_columns,
+        trainable=trainable,
+        name=name,
+        expected_column_type=fc.SequenceDenseColumn,
+        **kwargs)
 
-  Raises:
-    ValueError: If any of the `feature_columns` is the wrong type.
-  """
-  feature_columns = fc_old._normalize_feature_columns(feature_columns)
-  for c in feature_columns:
-    if not isinstance(c, fc_old._SequenceDenseColumn):
-      raise ValueError(
-          'All feature_columns must be of type _SequenceDenseColumn. '
-          'You can wrap a sequence_categorical_column with an embedding_column '
-          'or indicator_column. '
-          'Given (type {}): {}'.format(type(c), c))
-
-  with variable_scope.variable_scope(
-      None, default_name='sequence_input_layer', values=features.values()):
-    builder = fc_old._LazyBuilder(features)
+  def _target_shape(self, input_shape, total_elements):
+    return (input_shape[0], input_shape[1], total_elements)
+
+  def call(self, features):
+    """Returns sequence input corresponding to the `feature_columns`.
+
+    Args:
+      features: A dict mapping keys to tensors.
+
+    Returns:
+      An `(input_layer, sequence_length)` tuple where:
+      - input_layer: A float `Tensor` of shape `[batch_size, T, D]`.
+          `T` is the maximum sequence length for this batch, which could differ
+          from batch to batch. `D` is the sum of `num_elements` for all
+          `feature_columns`.
+      - sequence_length: An int `Tensor` of shape `[batch_size]`. The sequence
+          length for each example.
+
+    Raises:
+      ValueError: If features are not a dictionary.
+    """
+    if not isinstance(features, dict):
+      raise ValueError('We expected a dictionary here. Instead we got: ',
+                       features)
+    transformation_cache = fc.FeatureTransformationCache(features)
     output_tensors = []
     sequence_lengths = []
-    ordered_columns = []
-
-    for column in sorted(feature_columns, key=lambda x: x.name):
-      ordered_columns.append(column)
-      with variable_scope.variable_scope(
-          None, default_name=column._var_scope_name):
-        dense_tensor, sequence_length = column._get_sequence_dense_tensor(
-            builder,
-            weight_collections=weight_collections,
-            trainable=trainable)
+
+    for column in self._feature_columns:
+      with ops.name_scope(column.name):
+        dense_tensor, sequence_length = column.get_sequence_dense_tensor(
+            transformation_cache, self._state_manager)
         # Flattens the final dimension to produce a 3D Tensor.
-        num_elements = column._variable_shape.num_elements()
-        shape = array_ops.shape(dense_tensor)
-        target_shape = [shape[0], shape[1], num_elements]
-        output_tensors.append(
-            array_ops.reshape(dense_tensor, shape=target_shape))
+        output_tensors.append(self._process_dense_tensor(column, dense_tensor))
         sequence_lengths.append(sequence_length)
 
-    fc_old._verify_static_batch_size_equality(output_tensors, ordered_columns)
-    fc_old._verify_static_batch_size_equality(sequence_lengths, ordered_columns)
+    # Check and process sequence lengths.
+    fc._verify_static_batch_size_equality(sequence_lengths,
+                                          self._feature_columns)
     sequence_length = _assert_all_equal_and_return(sequence_lengths)
 
-    return array_ops.concat(output_tensors, -1), sequence_length
+    return self._verify_and_concat_tensors(output_tensors), sequence_length
 
 
 def concatenate_context_input(context_input, sequence_input):
@@ -186,6 +196,7 @@ def concatenate_context_input(context_input, sequence_input):
   return array_ops.concat([sequence_input, tiled_context_input], 2)
 
 
+@tf_export('feature_column.sequence_categorical_column_with_identity')
 def sequence_categorical_column_with_identity(
     key, num_buckets, default_value=None):
   """Returns a feature column that represents sequences of integers.
@@ -203,11 +214,13 @@ def sequence_categorical_column_with_identity(
   columns = [watches_embedding]
 
   features = tf.parse_example(..., features=make_parse_example_spec(columns))
-  input_layer, sequence_length = sequence_input_layer(features, columns)
+  sequence_feature_layer = SequenceFeatures(columns)
+  sequence_input, sequence_length = sequence_feature_layer(features)
+  sequence_length_mask = tf.sequence_mask(sequence_length)
 
-  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
-  outputs, state = tf.nn.dynamic_rnn(
-      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
+  rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
+  rnn_layer = tf.keras.layers.RNN(rnn_cell)
+  outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
   ```
 
   Args:
@@ -219,17 +232,20 @@ def sequence_categorical_column_with_identity(
       `[0, num_buckets)`, and will replace out-of-range inputs.
 
   Returns:
-    A `_SequenceCategoricalColumn`.
+    A `SequenceCategoricalColumn`.
 
   Raises:
     ValueError: if `num_buckets` is less than one.
     ValueError: if `default_value` is not in range `[0, num_buckets)`.
   """
-  return fc_old._SequenceCategoricalColumn(
-      fc_old._categorical_column_with_identity(
-          key=key, num_buckets=num_buckets, default_value=default_value))
+  return fc.SequenceCategoricalColumn(
+      fc.categorical_column_with_identity(
+          key=key,
+          num_buckets=num_buckets,
+          default_value=default_value))
 
 
+@tf_export('feature_column.sequence_categorical_column_with_hash_bucket')
 def sequence_categorical_column_with_hash_bucket(
     key, hash_bucket_size, dtype=dtypes.string):
   """A sequence of categorical terms where ids are set by hashing.
@@ -247,11 +263,13 @@ def sequence_categorical_column_with_hash_bucket(
   columns = [tokens_embedding]
 
   features = tf.parse_example(..., features=make_parse_example_spec(columns))
-  input_layer, sequence_length = sequence_input_layer(features, columns)
+  sequence_feature_layer = SequenceFeatures(columns)
+  sequence_input, sequence_length = sequence_feature_layer(features)
+  sequence_length_mask = tf.sequence_mask(sequence_length)
 
-  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
-  outputs, state = tf.nn.dynamic_rnn(
-      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
+  rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
+  rnn_layer = tf.keras.layers.RNN(rnn_cell)
+  outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
   ```
 
   Args:
@@ -260,17 +278,20 @@ def sequence_categorical_column_with_hash_bucket(
     dtype: The type of features. Only string and integer types are supported.
 
   Returns:
-    A `_SequenceCategoricalColumn`.
+    A `SequenceCategoricalColumn`.
 
   Raises:
     ValueError: `hash_bucket_size` is not greater than 1.
     ValueError: `dtype` is neither string nor integer.
   """
-  return fc_old._SequenceCategoricalColumn(
-      fc_old._categorical_column_with_hash_bucket(
-          key=key, hash_bucket_size=hash_bucket_size, dtype=dtype))
+  return fc.SequenceCategoricalColumn(
+      fc.categorical_column_with_hash_bucket(
+          key=key,
+          hash_bucket_size=hash_bucket_size,
+          dtype=dtype))
 
 
+@tf_export('feature_column.sequence_categorical_column_with_vocabulary_file')
 def sequence_categorical_column_with_vocabulary_file(
     key, vocabulary_file, vocabulary_size=None, num_oov_buckets=0,
     default_value=None, dtype=dtypes.string):
@@ -290,11 +311,13 @@ def sequence_categorical_column_with_vocabulary_file(
   columns = [states_embedding]
 
   features = tf.parse_example(..., features=make_parse_example_spec(columns))
-  input_layer, sequence_length = sequence_input_layer(features, columns)
+  sequence_feature_layer = SequenceFeatures(columns)
+  sequence_input, sequence_length = sequence_feature_layer(features)
+  sequence_length_mask = tf.sequence_mask(sequence_length)
 
-  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
-  outputs, state = tf.nn.dynamic_rnn(
-      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
+  rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
+  rnn_layer = tf.keras.layers.RNN(rnn_cell)
+  outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
   ```
 
   Args:
@@ -314,7 +337,7 @@ def sequence_categorical_column_with_vocabulary_file(
     dtype: The type of features. Only string and integer types are supported.
 
   Returns:
-    A `_SequenceCategoricalColumn`.
+    A `SequenceCategoricalColumn`.
 
   Raises:
     ValueError: `vocabulary_file` is missing or cannot be opened.
@@ -323,8 +346,8 @@ def sequence_categorical_column_with_vocabulary_file(
     ValueError: `num_oov_buckets` and `default_value` are both specified.
     ValueError: `dtype` is neither string nor integer.
   """
-  return fc_old._SequenceCategoricalColumn(
-      fc_old._categorical_column_with_vocabulary_file(
+  return fc.SequenceCategoricalColumn(
+      fc.categorical_column_with_vocabulary_file(
           key=key,
           vocabulary_file=vocabulary_file,
           vocabulary_size=vocabulary_size,
@@ -333,6 +356,7 @@ def sequence_categorical_column_with_vocabulary_file(
           dtype=dtype))
 
 
+@tf_export('feature_column.sequence_categorical_column_with_vocabulary_list')
 def sequence_categorical_column_with_vocabulary_list(
     key, vocabulary_list, dtype=None, default_value=-1, num_oov_buckets=0):
   """A sequence of categorical terms where ids use an in-memory list.
@@ -351,11 +375,13 @@ def sequence_categorical_column_with_vocabulary_list(
   columns = [colors_embedding]
 
   features = tf.parse_example(..., features=make_parse_example_spec(columns))
-  input_layer, sequence_length = sequence_input_layer(features, columns)
+  sequence_feature_layer = SequenceFeatures(columns)
+  sequence_input, sequence_length = sequence_feature_layer(features)
+  sequence_length_mask = tf.sequence_mask(sequence_length)
 
-  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
-  outputs, state = tf.nn.dynamic_rnn(
-      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
+  rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
+  rnn_layer = tf.keras.layers.RNN(rnn_cell)
+  outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
   ```
 
   Args:
@@ -375,7 +401,7 @@ def sequence_categorical_column_with_vocabulary_list(
       with `default_value`.
 
   Returns:
-    A `_SequenceCategoricalColumn`.
+    A `SequenceCategoricalColumn`.
 
   Raises:
     ValueError: if `vocabulary_list` is empty, or contains duplicate keys.
@@ -383,8 +409,8 @@ def sequence_categorical_column_with_vocabulary_list(
     ValueError: `num_oov_buckets` and `default_value` are both specified.
     ValueError: if `dtype` is not integer or string.
   """
-  return fc_old._SequenceCategoricalColumn(
-      fc_old._categorical_column_with_vocabulary_list(
+  return fc.SequenceCategoricalColumn(
+      fc.categorical_column_with_vocabulary_list(
           key=key,
           vocabulary_list=vocabulary_list,
           dtype=dtype,
@@ -392,6 +418,7 @@ def sequence_categorical_column_with_vocabulary_list(
           num_oov_buckets=num_oov_buckets))
 
 
+@tf_export('feature_column.sequence_numeric_column')
 def sequence_numeric_column(
     key,
     shape=(1,),
@@ -407,12 +434,13 @@ def sequence_numeric_column(
   columns = [temperature]
 
   features = tf.parse_example(..., features=make_parse_example_spec(columns))
-  sequence_feature_layer = SequenceFeatureLayer(columns)
-  input_layer, sequence_length = sequence_feature_layer(features)
+  sequence_feature_layer = SequenceFeatures(columns)
+  sequence_input, sequence_length = sequence_feature_layer(features)
+  sequence_length_mask = tf.sequence_mask(sequence_length)
 
-  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
-  outputs, state = tf.nn.dynamic_rnn(
-      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
+  rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
+  rnn_layer = tf.keras.layers.RNN(rnn_cell)
+  outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
   ```
 
   Args:
@@ -437,7 +465,7 @@ def sequence_numeric_column(
     ValueError: if any dimension in shape is not a positive integer.
     ValueError: if `dtype` is not convertible to `tf.float32`.
   """
-  shape = fc_old._check_shape(shape=shape, key=key)
+  shape = fc._check_shape(shape=shape, key=key)
   if not (dtype.is_integer or dtype.is_floating):
     raise ValueError('dtype must be convertible to float. '
                      'dtype: {}, key: {}'.format(dtype, key))
@@ -532,9 +560,11 @@ class SequenceNumericColumn(
     # For the 2D case, the raw values are grouped according to num_elements;
     # for the 3D case, the grouping happens in the third dimension, and
     # sequence length is not affected.
-    num_elements = (self.variable_shape.num_elements()
-                    if sp_tensor.shape.ndims == 2 else 1)
-    seq_length = fc_old._sequence_length_from_sparse_tensor(
+    if sp_tensor.shape.ndims == 2:
+      num_elements = self.variable_shape.num_elements()
+    else:
+      num_elements = 1
+    seq_length = fc._sequence_length_from_sparse_tensor(
         sp_tensor, num_elements=num_elements)
 
     return fc.SequenceDenseColumn.TensorSequenceLengthPair(
diff --git a/tensorflow/python/feature_column/sequence_feature_column_integration_test.py b/tensorflow/python/feature_column/sequence_feature_column_integration_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7c67945c6bc05f1f0ff6be356e3cf7e844ee29b
--- /dev/null
+++ b/tensorflow/python/feature_column/sequence_feature_column_integration_test.py
@@ -0,0 +1,283 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Integration test for sequence feature columns with SequenceExamples."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import string
+import tempfile
+
+from google.protobuf import text_format
+
+from tensorflow.core.example import example_pb2
+from tensorflow.core.example import feature_pb2
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.feature_column import feature_column_v2 as fc
+from tensorflow.python.feature_column import sequence_feature_column as sfc
+from tensorflow.python.keras.layers import recurrent
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.util import compat
+
+
+class SequenceFeatureColumnIntegrationTest(test.TestCase):
+
+  def _make_sequence_example(self):
+    example = example_pb2.SequenceExample()
+    example.context.feature['int_ctx'].int64_list.value.extend([5])
+    example.context.feature['float_ctx'].float_list.value.extend([123.6])
+    for val in range(0, 10, 2):
+      feat = feature_pb2.Feature()
+      feat.int64_list.value.extend([val] * val)
+      example.feature_lists.feature_list['int_list'].feature.extend([feat])
+    for val in range(1, 11, 2):
+      feat = feature_pb2.Feature()
+      feat.bytes_list.value.extend([compat.as_bytes(str(val))] * val)
+      example.feature_lists.feature_list['str_list'].feature.extend([feat])
+
+    return example
+
+  def _build_feature_columns(self):
+    col = fc.categorical_column_with_identity('int_ctx', num_buckets=100)
+    ctx_cols = [
+        fc.embedding_column(col, dimension=10),
+        fc.numeric_column('float_ctx')
+    ]
+
+    identity_col = sfc.sequence_categorical_column_with_identity(
+        'int_list', num_buckets=10)
+    bucket_col = sfc.sequence_categorical_column_with_hash_bucket(
+        'bytes_list', hash_bucket_size=100)
+    seq_cols = [
+        fc.embedding_column(identity_col, dimension=10),
+        fc.embedding_column(bucket_col, dimension=20)
+    ]
+
+    return ctx_cols, seq_cols
+
+  def test_sequence_example_into_input_layer(self):
+    examples = [_make_sequence_example().SerializeToString()] * 100
+    ctx_cols, seq_cols = self._build_feature_columns()
+
+    def _parse_example(example):
+      ctx, seq = parsing_ops.parse_single_sequence_example(
+          example,
+          context_features=fc.make_parse_example_spec_v2(ctx_cols),
+          sequence_features=fc.make_parse_example_spec_v2(seq_cols))
+      ctx.update(seq)
+      return ctx
+
+    ds = dataset_ops.Dataset.from_tensor_slices(examples)
+    ds = ds.map(_parse_example)
+    ds = ds.batch(20)
+
+    # Test on a single batch
+    features = ds.make_one_shot_iterator().get_next()
+
+    # Tile the context features across the sequence features
+    sequence_input_layer = sfc.SequenceFeatures(seq_cols)
+    seq_layer, _ = sequence_input_layer(features)
+    input_layer = fc.DenseFeatures(ctx_cols)
+    ctx_layer = input_layer(features)
+    input_layer = sfc.concatenate_context_input(ctx_layer, seq_layer)
+
+    rnn_layer = recurrent.RNN(recurrent.SimpleRNNCell(10))
+    output = rnn_layer(input_layer)
+
+    with self.cached_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      features_r = sess.run(features)
+      self.assertAllEqual(features_r['int_list'].dense_shape, [20, 3, 6])
+
+      output_r = sess.run(output)
+      self.assertAllEqual(output_r.shape, [20, 10])
+
+
+class SequenceExampleParsingTest(test.TestCase):
+
+  def test_seq_ex_in_sequence_categorical_column_with_identity(self):
+    self._test_parsed_sequence_example(
+        'int_list', sfc.sequence_categorical_column_with_identity,
+        10, [3, 6], [2, 4, 6])
+
+  def test_seq_ex_in_sequence_categorical_column_with_hash_bucket(self):
+    self._test_parsed_sequence_example(
+        'bytes_list', sfc.sequence_categorical_column_with_hash_bucket,
+        10, [3, 4], [compat.as_bytes(x) for x in 'acg'])
+
+  def test_seq_ex_in_sequence_categorical_column_with_vocabulary_list(self):
+    self._test_parsed_sequence_example(
+        'bytes_list', sfc.sequence_categorical_column_with_vocabulary_list,
+        list(string.ascii_lowercase), [3, 4],
+        [compat.as_bytes(x) for x in 'acg'])
+
+  def test_seq_ex_in_sequence_categorical_column_with_vocabulary_file(self):
+    _, fname = tempfile.mkstemp()
+    with open(fname, 'w') as f:
+      f.write(string.ascii_lowercase)
+    self._test_parsed_sequence_example(
+        'bytes_list', sfc.sequence_categorical_column_with_vocabulary_file,
+        fname, [3, 4], [compat.as_bytes(x) for x in 'acg'])
+
+  def _test_parsed_sequence_example(
+      self, col_name, col_fn, col_arg, shape, values):
+    """Helper function to check that each FeatureColumn parses correctly.
+
+    Args:
+      col_name: string, name to give to the feature column. Should match
+        the name that the column will parse out of the features dict.
+      col_fn: function used to create the feature column. For example,
+        sequence_numeric_column.
+      col_arg: second arg that the target feature column is expecting.
+      shape: the expected dense_shape of the feature after parsing into
+        a SparseTensor.
+      values: the expected values at index [0, 2, 6] of the feature
+        after parsing into a SparseTensor.
+    """
+    example = _make_sequence_example()
+    columns = [
+        fc.categorical_column_with_identity('int_ctx', num_buckets=100),
+        fc.numeric_column('float_ctx'),
+        col_fn(col_name, col_arg)
+    ]
+    context, seq_features = parsing_ops.parse_single_sequence_example(
+        example.SerializeToString(),
+        context_features=fc.make_parse_example_spec_v2(columns[:2]),
+        sequence_features=fc.make_parse_example_spec_v2(columns[2:]))
+
+    with self.cached_session() as sess:
+      ctx_result, seq_result = sess.run([context, seq_features])
+      self.assertEqual(list(seq_result[col_name].dense_shape), shape)
+      self.assertEqual(
+          list(seq_result[col_name].values[[0, 2, 6]]), values)
+      self.assertEqual(list(ctx_result['int_ctx'].dense_shape), [1])
+      self.assertEqual(ctx_result['int_ctx'].values[0], 5)
+      self.assertEqual(list(ctx_result['float_ctx'].shape), [1])
+      self.assertAlmostEqual(ctx_result['float_ctx'][0], 123.6, places=1)
+
+
+_SEQ_EX_PROTO = """
+context {
+  feature {
+    key: "float_ctx"
+    value {
+      float_list {
+        value: 123.6
+      }
+    }
+  }
+  feature {
+    key: "int_ctx"
+    value {
+      int64_list {
+        value: 5
+      }
+    }
+  }
+}
+feature_lists {
+  feature_list {
+    key: "bytes_list"
+    value {
+      feature {
+        bytes_list {
+          value: "a"
+        }
+      }
+      feature {
+        bytes_list {
+          value: "b"
+          value: "c"
+        }
+      }
+      feature {
+        bytes_list {
+          value: "d"
+          value: "e"
+          value: "f"
+          value: "g"
+        }
+      }
+    }
+  }
+  feature_list {
+    key: "float_list"
+    value {
+      feature {
+        float_list {
+          value: 1.0
+        }
+      }
+      feature {
+        float_list {
+          value: 3.0
+          value: 3.0
+          value: 3.0
+        }
+      }
+      feature {
+        float_list {
+          value: 5.0
+          value: 5.0
+          value: 5.0
+          value: 5.0
+          value: 5.0
+        }
+      }
+    }
+  }
+  feature_list {
+    key: "int_list"
+    value {
+      feature {
+        int64_list {
+          value: 2
+          value: 2
+        }
+      }
+      feature {
+        int64_list {
+          value: 4
+          value: 4
+          value: 4
+          value: 4
+        }
+      }
+      feature {
+        int64_list {
+          value: 6
+          value: 6
+          value: 6
+          value: 6
+          value: 6
+          value: 6
+        }
+      }
+    }
+  }
+}
+"""
+
+
+def _make_sequence_example():
+  example = example_pb2.SequenceExample()
+  return text_format.Parse(_SEQ_EX_PROTO, example)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2_test.py b/tensorflow/python/feature_column/sequence_feature_column_test.py
similarity index 68%
rename from tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2_test.py
rename to tensorflow/python/feature_column/sequence_feature_column_test.py
index ca4398a142065de0be7bee57cd7e54670bbae12e..0c8f37b107122882e1f72c0bbb10ebe2c2885f5e 100644
--- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2_test.py
+++ b/tensorflow/python/feature_column/sequence_feature_column_test.py
@@ -22,23 +22,24 @@ import os
 from absl.testing import parameterized
 import numpy as np
 
-from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column as sfc_old
-from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column_v2 as sfc
-from tensorflow.python.feature_column import feature_column as fc_old
-from tensorflow.python.feature_column import feature_column_lib as fc
-from tensorflow.python.feature_column.feature_column import _LazyBuilder
+from tensorflow.python.feature_column import feature_column_v2 as fc
+from tensorflow.python.feature_column import feature_column_v2_test as fc_test
+from tensorflow.python.feature_column import sequence_feature_column as sfc
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.engine.base_layer import Layer
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import sparse_ops
+from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import test
-from tensorflow.python.training import monitored_session
 
 
-class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
+class SequenceFeaturesTest(test.TestCase, parameterized.TestCase):
 
   @parameterized.named_parameters(
       {'testcase_name': '2D',
@@ -82,6 +83,7 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
            [[1., 2., 17., 18., 19.], [3., 4., 11., 12., 13.]]],
        'expected_sequence_length': [2, 2]},
       )
+  @test_util.run_in_graph_and_eager_modes
   def test_embedding_column(
       self, sparse_input_args_a, sparse_input_args_b, expected_input_layer,
       expected_sequence_length):
@@ -111,37 +113,36 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
 
     categorical_column_a = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column_a = fc_old._embedding_column(
+    embedding_column_a = fc.embedding_column(
         categorical_column_a,
         dimension=embedding_dimension_a,
         initializer=_get_initializer(embedding_dimension_a, embedding_values_a))
     categorical_column_b = sfc.sequence_categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    embedding_column_b = fc_old._embedding_column(
+    embedding_column_b = fc.embedding_column(
         categorical_column_b,
         dimension=embedding_dimension_b,
         initializer=_get_initializer(embedding_dimension_b, embedding_values_b))
 
-    input_layer, sequence_length = sfc.sequence_input_layer(
-        features={
-            'aaa': sparse_input_a,
-            'bbb': sparse_input_b,
-        },
-        # Test that columns are reordered alphabetically.
-        feature_columns=[embedding_column_b, embedding_column_a])
-
-    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual(
-        ('sequence_input_layer/aaa_embedding/embedding_weights:0',
-         'sequence_input_layer/bbb_embedding/embedding_weights:0'),
-        tuple([v.name for v in global_vars]))
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(embedding_values_a, global_vars[0].eval(session=sess))
-      self.assertAllEqual(embedding_values_b, global_vars[1].eval(session=sess))
-      self.assertAllEqual(expected_input_layer, input_layer.eval(session=sess))
-      self.assertAllEqual(
-          expected_sequence_length, sequence_length.eval(session=sess))
-
+    # Test that columns are reordered alphabetically.
+    sequence_input_layer = sfc.SequenceFeatures(
+        [embedding_column_b, embedding_column_a])
+    input_layer, sequence_length = sequence_input_layer({
+        'aaa': sparse_input_a, 'bbb': sparse_input_b,})
+
+    self.evaluate(variables_lib.global_variables_initializer())
+    weights = sequence_input_layer.weights
+    self.assertCountEqual(
+        ('sequence_features/aaa_embedding/embedding_weights:0',
+         'sequence_features/bbb_embedding/embedding_weights:0'),
+        tuple([v.name for v in weights]))
+    self.assertAllEqual(embedding_values_a, self.evaluate(weights[0]))
+    self.assertAllEqual(embedding_values_b, self.evaluate(weights[1]))
+    self.assertAllEqual(expected_input_layer, self.evaluate(input_layer))
+    self.assertAllEqual(
+        expected_sequence_length, self.evaluate(sequence_length))
+
+  @test_util.run_in_graph_and_eager_modes
   def test_embedding_column_with_non_sequence_categorical(self):
     """Tests that error is raised for non-sequence embedding column."""
     vocabulary_size = 3
@@ -152,86 +153,87 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
         values=(2, 0, 1),
         dense_shape=(2, 2))
 
-    categorical_column_a = fc_old._categorical_column_with_identity(
+    categorical_column_a = fc.categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column_a = fc_old._embedding_column(
+    embedding_column_a = fc.embedding_column(
         categorical_column_a, dimension=2)
 
     with self.assertRaisesRegexp(
         ValueError,
         r'In embedding_column: aaa_embedding\. categorical_column must be of '
-        r'type _SequenceCategoricalColumn to use sequence_input_layer\.'):
-      _, _ = sfc.sequence_input_layer(
-          features={'aaa': sparse_input},
-          feature_columns=[embedding_column_a])
+        r'type SequenceCategoricalColumn to use SequenceFeatures\.'):
+      sequence_input_layer = sfc.SequenceFeatures([embedding_column_a])
+      _, _ = sequence_input_layer({'aaa': sparse_input})
 
+  @test_util.run_in_graph_and_eager_modes
   def test_shared_embedding_column(self):
-    vocabulary_size = 3
-    sparse_input_a = sparse_tensor.SparseTensorValue(
-        # example 0, ids [2]
-        # example 1, ids [0, 1]
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(2, 0, 1),
-        dense_shape=(2, 2))
-    sparse_input_b = sparse_tensor.SparseTensorValue(
-        # example 0, ids [1]
-        # example 1, ids [2, 0]
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(1, 2, 0),
-        dense_shape=(2, 2))
-
-    embedding_dimension = 2
-    embedding_values = (
-        (1., 2.),  # id 0
-        (3., 4.),  # id 1
-        (5., 6.)  # id 2
-    )
-
-    def _get_initializer(embedding_dimension, embedding_values):
-
-      def _initializer(shape, dtype, partition_info):
-        self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
-        self.assertEqual(dtypes.float32, dtype)
-        self.assertIsNone(partition_info)
-        return embedding_values
-
-      return _initializer
-
-    expected_input_layer = [
-        # example 0, ids_a [2], ids_b [1]
-        [[5., 6., 3., 4.], [0., 0., 0., 0.]],
-        # example 1, ids_a [0, 1], ids_b [2, 0]
-        [[1., 2., 5., 6.], [3., 4., 1., 2.]],
-    ]
-    expected_sequence_length = [1, 2]
-
-    categorical_column_a = sfc.sequence_categorical_column_with_identity(
-        key='aaa', num_buckets=vocabulary_size)
-    categorical_column_b = sfc.sequence_categorical_column_with_identity(
-        key='bbb', num_buckets=vocabulary_size)
-    # Test that columns are reordered alphabetically.
-    shared_embedding_columns = fc.shared_embedding_columns(
-        [categorical_column_b, categorical_column_a],
-        dimension=embedding_dimension,
-        initializer=_get_initializer(embedding_dimension, embedding_values))
-
-    input_layer, sequence_length = sfc.sequence_input_layer(
-        features={
-            'aaa': sparse_input_a,
-            'bbb': sparse_input_b,
-        },
-        feature_columns=shared_embedding_columns)
-
-    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual(
-        ('sequence_input_layer/aaa_bbb_shared_embedding/embedding_weights:0',),
-        tuple([v.name for v in global_vars]))
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(embedding_values, global_vars[0].eval(session=sess))
-      self.assertAllEqual(expected_input_layer, input_layer.eval(session=sess))
-      self.assertAllEqual(
-          expected_sequence_length, sequence_length.eval(session=sess))
+    with ops.Graph().as_default():
+      vocabulary_size = 3
+      sparse_input_a = sparse_tensor.SparseTensorValue(
+          # example 0, ids [2]
+          # example 1, ids [0, 1]
+          indices=((0, 0), (1, 0), (1, 1)),
+          values=(2, 0, 1),
+          dense_shape=(2, 2))
+      sparse_input_b = sparse_tensor.SparseTensorValue(
+          # example 0, ids [1]
+          # example 1, ids [2, 0]
+          indices=((0, 0), (1, 0), (1, 1)),
+          values=(1, 2, 0),
+          dense_shape=(2, 2))
+
+      embedding_dimension = 2
+      embedding_values = (
+          (1., 2.),  # id 0
+          (3., 4.),  # id 1
+          (5., 6.)  # id 2
+      )
 
+      def _get_initializer(embedding_dimension, embedding_values):
+
+        def _initializer(shape, dtype, partition_info):
+          self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+          self.assertEqual(dtypes.float32, dtype)
+          self.assertIsNone(partition_info)
+          return embedding_values
+
+        return _initializer
+
+      expected_input_layer = [
+          # example 0, ids_a [2], ids_b [1]
+          [[5., 6., 3., 4.], [0., 0., 0., 0.]],
+          # example 1, ids_a [0, 1], ids_b [2, 0]
+          [[1., 2., 5., 6.], [3., 4., 1., 2.]],
+      ]
+      expected_sequence_length = [1, 2]
+
+      categorical_column_a = sfc.sequence_categorical_column_with_identity(
+          key='aaa', num_buckets=vocabulary_size)
+      categorical_column_b = sfc.sequence_categorical_column_with_identity(
+          key='bbb', num_buckets=vocabulary_size)
+      # Test that columns are reordered alphabetically.
+      shared_embedding_columns = fc.shared_embedding_columns_v2(
+          [categorical_column_b, categorical_column_a],
+          dimension=embedding_dimension,
+          initializer=_get_initializer(embedding_dimension, embedding_values))
+
+      sequence_input_layer = sfc.SequenceFeatures(shared_embedding_columns)
+      input_layer, sequence_length = sequence_input_layer({
+          'aaa': sparse_input_a, 'bbb': sparse_input_b})
+
+      global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+      self.assertCountEqual(
+          ('aaa_bbb_shared_embedding:0',),
+          tuple([v.name for v in global_vars]))
+      with fc_test._initialized_session() as sess:
+        self.assertAllEqual(embedding_values,
+                            global_vars[0].eval(session=sess))
+        self.assertAllEqual(expected_input_layer,
+                            input_layer.eval(session=sess))
+        self.assertAllEqual(
+            expected_sequence_length, sequence_length.eval(session=sess))
+
+  @test_util.run_deprecated_v1
   def test_shared_embedding_column_with_non_sequence_categorical(self):
     """Tests that error is raised for non-sequence shared embedding column."""
     vocabulary_size = 3
@@ -248,23 +250,20 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
         values=(2, 0, 1),
         dense_shape=(2, 2))
 
-    categorical_column_a = fc_old._categorical_column_with_identity(
+    categorical_column_a = fc.categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    categorical_column_b = fc_old._categorical_column_with_identity(
+    categorical_column_b = fc.categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    shared_embedding_columns = fc.shared_embedding_columns(
+    shared_embedding_columns = fc.shared_embedding_columns_v2(
         [categorical_column_a, categorical_column_b], dimension=2)
 
     with self.assertRaisesRegexp(
         ValueError,
         r'In embedding_column: aaa_shared_embedding\. categorical_column must '
-        r'be of type _SequenceCategoricalColumn to use sequence_input_layer\.'):
-      _, _ = sfc.sequence_input_layer(
-          features={
-              'aaa': sparse_input_a,
-              'bbb': sparse_input_b
-          },
-          feature_columns=shared_embedding_columns)
+        r'be of type SequenceCategoricalColumn to use SequenceFeatures\.'):
+      sequence_input_layer = sfc.SequenceFeatures(shared_embedding_columns)
+      _, _ = sequence_input_layer({'aaa': sparse_input_a,
+                                   'bbb': sparse_input_b})
 
   @parameterized.named_parameters(
       {'testcase_name': '2D',
@@ -308,6 +307,7 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
            [[2., 0., 0., 0., 1.], [0., 1., 0., 1., 0.]]],
        'expected_sequence_length': [2, 2]},
       )
+  @test_util.run_in_graph_and_eager_modes
   def test_indicator_column(
       self, sparse_input_args_a, sparse_input_args_b, expected_input_layer,
       expected_sequence_length):
@@ -319,23 +319,21 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
 
     categorical_column_a = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size_a)
-    indicator_column_a = fc_old._indicator_column(categorical_column_a)
+    indicator_column_a = fc.indicator_column(categorical_column_a)
     categorical_column_b = sfc.sequence_categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size_b)
-    indicator_column_b = fc_old._indicator_column(categorical_column_b)
-    input_layer, sequence_length = sfc.sequence_input_layer(
-        features={
-            'aaa': sparse_input_a,
-            'bbb': sparse_input_b,
-        },
-        # Test that columns are reordered alphabetically.
-        feature_columns=[indicator_column_b, indicator_column_a])
-
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(expected_input_layer, input_layer.eval(session=sess))
-      self.assertAllEqual(
-          expected_sequence_length, sequence_length.eval(session=sess))
+    indicator_column_b = fc.indicator_column(categorical_column_b)
+    # Test that columns are reordered alphabetically.
+    sequence_input_layer = sfc.SequenceFeatures(
+        [indicator_column_b, indicator_column_a])
+    input_layer, sequence_length = sequence_input_layer({
+        'aaa': sparse_input_a, 'bbb': sparse_input_b})
 
+    self.assertAllEqual(expected_input_layer, self.evaluate(input_layer))
+    self.assertAllEqual(
+        expected_sequence_length, self.evaluate(sequence_length))
+
+  @test_util.run_in_graph_and_eager_modes
   def test_indicator_column_with_non_sequence_categorical(self):
     """Tests that error is raised for non-sequence categorical column."""
     vocabulary_size = 3
@@ -346,17 +344,16 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
         values=(2, 0, 1),
         dense_shape=(2, 2))
 
-    categorical_column_a = fc_old._categorical_column_with_identity(
+    categorical_column_a = fc.categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    indicator_column_a = fc_old._indicator_column(categorical_column_a)
+    indicator_column_a = fc.indicator_column(categorical_column_a)
 
     with self.assertRaisesRegexp(
         ValueError,
         r'In indicator_column: aaa_indicator\. categorical_column must be of '
-        r'type _SequenceCategoricalColumn to use sequence_input_layer\.'):
-      _, _ = sfc.sequence_input_layer(
-          features={'aaa': sparse_input},
-          feature_columns=[indicator_column_a])
+        r'type SequenceCategoricalColumn to use SequenceFeatures\.'):
+      sequence_input_layer = sfc.SequenceFeatures([indicator_column_a])
+      _, _ = sequence_input_layer({'aaa': sparse_input})
 
   @parameterized.named_parameters(
       {'testcase_name': '2D',
@@ -375,27 +372,26 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
            # feature 0, ids [[20, 3], [5]]
            # feature 1, ids [[3], [8]]
            'indices': ((0, 0, 0), (0, 0, 1), (0, 1, 0), (1, 0, 0), (1, 1, 0)),
-           'values': (20, 3, 5., 3., 8.),
+           'values': (20., 3., 5., 3., 8.),
            'dense_shape': (2, 2, 2)},
        'expected_input_layer': [
            [[20.], [3.], [5.], [0.]],
            [[3.], [0.], [8.], [0.]]],
        'expected_sequence_length': [2, 2]},
       )
+  @test_util.run_in_graph_and_eager_modes
   def test_numeric_column(
       self, sparse_input_args, expected_input_layer, expected_sequence_length):
     sparse_input = sparse_tensor.SparseTensorValue(**sparse_input_args)
 
-    numeric_column = sfc_old.sequence_numeric_column('aaa')
+    numeric_column = sfc.sequence_numeric_column('aaa')
 
-    input_layer, sequence_length = sfc.sequence_input_layer(
-        features={'aaa': sparse_input},
-        feature_columns=[numeric_column])
+    sequence_input_layer = sfc.SequenceFeatures([numeric_column])
+    input_layer, sequence_length = sequence_input_layer({'aaa': sparse_input})
 
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(expected_input_layer, input_layer.eval(session=sess))
-      self.assertAllEqual(
-          expected_sequence_length, sequence_length.eval(session=sess))
+    self.assertAllEqual(expected_input_layer, self.evaluate(input_layer))
+    self.assertAllEqual(
+        expected_sequence_length, self.evaluate(sequence_length))
 
   @parameterized.named_parameters(
       {'testcase_name': '2D',
@@ -426,22 +422,22 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
            [[10., 11., 12., 13.], [0., 0., 0., 0.]]],
        'expected_sequence_length': [2, 1]},
       )
+  @test_util.run_in_graph_and_eager_modes
   def test_numeric_column_multi_dim(
       self, sparse_input_args, expected_input_layer, expected_sequence_length):
-    """Tests sequence_input_layer for multi-dimensional numeric_column."""
+    """Tests SequenceFeatures for multi-dimensional numeric_column."""
     sparse_input = sparse_tensor.SparseTensorValue(**sparse_input_args)
 
-    numeric_column = sfc_old.sequence_numeric_column('aaa', shape=(2, 2))
+    numeric_column = sfc.sequence_numeric_column('aaa', shape=(2, 2))
 
-    input_layer, sequence_length = sfc.sequence_input_layer(
-        features={'aaa': sparse_input},
-        feature_columns=[numeric_column])
+    sequence_input_layer = sfc.SequenceFeatures([numeric_column])
+    input_layer, sequence_length = sequence_input_layer({'aaa': sparse_input})
 
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(expected_input_layer, input_layer.eval(session=sess))
-      self.assertAllEqual(
-          expected_sequence_length, sequence_length.eval(session=sess))
+    self.assertAllEqual(expected_input_layer, self.evaluate(input_layer))
+    self.assertAllEqual(
+        expected_sequence_length, self.evaluate(sequence_length))
 
+  @test_util.run_in_graph_and_eager_modes
   def test_sequence_length_not_equal(self):
     """Tests that an error is raised when sequence lengths are not equal."""
     # Input a with sequence_length = [2, 1]
@@ -454,23 +450,17 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
         indices=((0, 0), (1, 0)),
         values=(1., 10.),
         dense_shape=(2, 2))
-    numeric_column_a = sfc_old.sequence_numeric_column('aaa')
-    numeric_column_b = sfc_old.sequence_numeric_column('bbb')
-
-    _, sequence_length = sfc.sequence_input_layer(
-        features={
-            'aaa': sparse_input_a,
-            'bbb': sparse_input_b,
-        },
-        feature_columns=[numeric_column_a, numeric_column_b])
-
-    with monitored_session.MonitoredSession() as sess:
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          r'\[Condition x == y did not hold element-wise:\] '
-          r'\[x \(sequence_input_layer/aaa/sequence_length:0\) = \] \[2 1\] '
-          r'\[y \(sequence_input_layer/bbb/sequence_length:0\) = \] \[1 1\]'):
-        sess.run(sequence_length)
+    numeric_column_a = sfc.sequence_numeric_column('aaa')
+    numeric_column_b = sfc.sequence_numeric_column('bbb')
+
+    sequence_input_layer = sfc.SequenceFeatures(
+        [numeric_column_a, numeric_column_b])
+
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError, r'Condition x == y did not hold.*'):
+      _, sequence_length = sequence_input_layer({
+          'aaa': sparse_input_a, 'bbb': sparse_input_b})
+      self.evaluate(sequence_length)
 
   @parameterized.named_parameters(
       {'testcase_name': '2D',
@@ -487,21 +477,21 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
            # example 0, values [[0., 1., 2., 3.]], [[4., 5., 6., 7.]]
            # example 1, [[10., 11., 12., 13.], []]
            'indices': ((0, 0, 0), (0, 0, 1), (0, 0, 2), (0, 0, 3),
-                       (0, 1, 0), (0, 1, 1), (0, 1, 2), (0, 1, 2),
+                       (0, 1, 0), (0, 1, 1), (0, 1, 2), (0, 1, 3),
                        (1, 0, 0), (1, 0, 1), (1, 0, 2), (1, 0, 3)),
            'values': (0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.),
            'dense_shape': (2, 2, 4)},
        'expected_shape': [2, 2, 4]},
       )
+  @test_util.run_in_graph_and_eager_modes
   def test_static_shape_from_tensors_numeric(
       self, sparse_input_args, expected_shape):
     """Tests that we return a known static shape when we have one."""
     sparse_input = sparse_tensor.SparseTensorValue(**sparse_input_args)
-    numeric_column = sfc_old.sequence_numeric_column('aaa', shape=(2, 2))
+    numeric_column = sfc.sequence_numeric_column('aaa', shape=(2, 2))
 
-    input_layer, _ = sfc.sequence_input_layer(
-        features={'aaa': sparse_input},
-        feature_columns=[numeric_column])
+    sequence_input_layer = sfc.SequenceFeatures([numeric_column])
+    input_layer, _ = sequence_input_layer({'aaa': sparse_input})
     shape = input_layer.get_shape()
     self.assertEqual(shape, expected_shape)
 
@@ -528,20 +518,58 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
            'dense_shape': (4, 2, 2)},
        'expected_shape': [4, 2, 3]}
       )
+  @test_util.run_in_graph_and_eager_modes
   def test_static_shape_from_tensors_indicator(
       self, sparse_input_args, expected_shape):
     """Tests that we return a known static shape when we have one."""
     sparse_input = sparse_tensor.SparseTensorValue(**sparse_input_args)
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=3)
-    indicator_column = fc_old._indicator_column(categorical_column)
+    indicator_column = fc.indicator_column(categorical_column)
 
-    input_layer, _ = sfc.sequence_input_layer(
-        features={'aaa': sparse_input}, feature_columns=[indicator_column])
+    sequence_input_layer = sfc.SequenceFeatures([indicator_column])
+    input_layer, _ = sequence_input_layer({'aaa': sparse_input})
     shape = input_layer.get_shape()
     self.assertEqual(shape, expected_shape)
 
-
+  @test_util.run_in_graph_and_eager_modes
+  def test_compute_output_shape(self):
+    price1 = sfc.sequence_numeric_column('price1', shape=2)
+    price2 = sfc.sequence_numeric_column('price2')
+    features = {
+        'price1': sparse_tensor.SparseTensor(
+            indices=[[0, 0, 0], [0, 0, 1],
+                     [0, 1, 0], [0, 1, 1],
+                     [1, 0, 0], [1, 0, 1],
+                     [2, 0, 0], [2, 0, 1],
+                     [3, 0, 0], [3, 0, 1]],
+            values=[0., 1., 10., 11., 100., 101., 200., 201., 300., 301.],
+            dense_shape=(4, 3, 2)),
+        'price2': sparse_tensor.SparseTensor(
+            indices=[[0, 0],
+                     [0, 1],
+                     [1, 0],
+                     [2, 0],
+                     [3, 0]],
+            values=[10., 11., 20., 30., 40.],
+            dense_shape=(4, 3))}
+    sequence_features = sfc.SequenceFeatures([price1, price2])
+    seq_input, seq_len = sequence_features(features)
+    self.assertEqual(
+        sequence_features.compute_output_shape((None, None)),
+        (None, None, 3))
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    self.assertAllClose([[[0., 1., 10.], [10., 11., 11.], [0., 0., 0.]],
+                         [[100., 101., 20.], [0., 0., 0.], [0., 0., 0.]],
+                         [[200., 201., 30.], [0., 0., 0.], [0., 0., 0.]],
+                         [[300., 301., 40.], [0., 0., 0.], [0., 0., 0.]]],
+                        self.evaluate(seq_input))
+    self.assertAllClose([2, 1, 1, 1], self.evaluate(seq_len))
+
+
+@test_util.run_all_in_graph_and_eager_modes
 class ConcatenateContextInputTest(test.TestCase, parameterized.TestCase):
   """Tests the utility fn concatenate_context_input."""
 
@@ -556,9 +584,8 @@ class ConcatenateContextInputTest(test.TestCase, parameterized.TestCase):
         [[0, 1, 0, 1, 2, 3, 4], [2, 3, 0, 1, 2, 3, 4], [4, 5, 0, 1, 2, 3, 4]],
         [[6, 7, 5, 6, 7, 8, 9], [8, 9, 5, 6, 7, 8, 9], [10, 11, 5, 6, 7, 8, 9]]
     ], dtype=np.float32)
-    with monitored_session.MonitoredSession() as sess:
-      output = sess.run(input_layer)
-      self.assertAllEqual(expected, output)
+    output = self.evaluate(input_layer)
+    self.assertAllEqual(expected, output)
 
   @parameterized.named_parameters(
       {'testcase_name': 'rank_lt_3',
@@ -605,8 +632,9 @@ class ConcatenateContextInputTest(test.TestCase, parameterized.TestCase):
       sfc.concatenate_context_input(context_input, seq_input)
 
 
-class InputLayerTest(test.TestCase):
-  """Tests input_layer with sequence feature columns."""
+@test_util.run_all_in_graph_and_eager_modes
+class DenseFeaturesTest(test.TestCase):
+  """Tests DenseFeatures with sequence feature columns."""
 
   def test_embedding_column(self):
     """Tests that error is raised for sequence embedding column."""
@@ -620,16 +648,15 @@ class InputLayerTest(test.TestCase):
 
     categorical_column_a = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column_a = fc_old._embedding_column(
+    embedding_column_a = fc.embedding_column(
         categorical_column_a, dimension=2)
 
     with self.assertRaisesRegexp(
         ValueError,
         r'In embedding_column: aaa_embedding\. categorical_column must not be '
-        r'of type _SequenceCategoricalColumn\.'):
-      _ = fc_old.input_layer(
-          features={'aaa': sparse_input},
-          feature_columns=[embedding_column_a])
+        r'of type SequenceCategoricalColumn\.'):
+      input_layer = fc.DenseFeatures([embedding_column_a])
+      _ = input_layer({'aaa': sparse_input})
 
   def test_indicator_column(self):
     """Tests that error is raised for sequence indicator column."""
@@ -643,15 +670,14 @@ class InputLayerTest(test.TestCase):
 
     categorical_column_a = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    indicator_column_a = fc_old._indicator_column(categorical_column_a)
+    indicator_column_a = fc.indicator_column(categorical_column_a)
 
     with self.assertRaisesRegexp(
         ValueError,
         r'In indicator_column: aaa_indicator\. categorical_column must not be '
-        r'of type _SequenceCategoricalColumn\.'):
-      _ = fc_old.input_layer(
-          features={'aaa': sparse_input},
-          feature_columns=[indicator_column_a])
+        r'of type SequenceCategoricalColumn\.'):
+      input_layer = fc.DenseFeatures([indicator_column_a])
+      _ = input_layer({'aaa': sparse_input})
 
 
 def _assert_sparse_tensor_value(test_case, expected, actual):
@@ -670,6 +696,25 @@ def _assert_sparse_tensor_indices_shape(test_case, expected, actual):
   test_case.assertAllEqual(expected.dense_shape, actual.dense_shape)
 
 
+def _get_sequence_dense_tensor(column, features):
+  return column.get_sequence_dense_tensor(
+      fc.FeatureTransformationCache(features), None)
+
+
+def _get_sequence_dense_tensor_state(column, features):
+  state_manager = fc._StateManagerImpl(Layer(), trainable=True)
+  column.create_state(state_manager)
+  dense_tensor, lengths = column.get_sequence_dense_tensor(
+      fc.FeatureTransformationCache(features), state_manager)
+  return dense_tensor, lengths, state_manager
+
+
+def _get_sparse_tensors(column, features):
+  return column.get_sparse_tensors(
+      fc.FeatureTransformationCache(features), None)
+
+
+@test_util.run_all_in_graph_and_eager_modes
 class SequenceCategoricalColumnWithIdentityTest(
     test.TestCase, parameterized.TestCase):
 
@@ -698,14 +743,14 @@ class SequenceCategoricalColumnWithIdentityTest(
     expected = sparse_tensor.SparseTensorValue(**expected_args)
     column = sfc.sequence_categorical_column_with_identity('aaa', num_buckets=9)
 
-    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    id_weight_pair = _get_sparse_tensors(column, {'aaa': inputs})
 
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with monitored_session.MonitoredSession() as sess:
-      _assert_sparse_tensor_value(
-          self, expected, id_weight_pair.id_tensor.eval(session=sess))
+    _assert_sparse_tensor_value(
+        self, expected, self.evaluate(id_weight_pair.id_tensor))
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class SequenceCategoricalColumnWithHashBucketTest(
     test.TestCase, parameterized.TestCase):
 
@@ -737,14 +782,14 @@ class SequenceCategoricalColumnWithHashBucketTest(
     column = sfc.sequence_categorical_column_with_hash_bucket(
         'aaa', hash_bucket_size=10)
 
-    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    id_weight_pair = _get_sparse_tensors(column, {'aaa': inputs})
 
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with monitored_session.MonitoredSession() as sess:
-      _assert_sparse_tensor_indices_shape(
-          self, expected, id_weight_pair.id_tensor.eval(session=sess))
+    _assert_sparse_tensor_indices_shape(
+        self, expected, self.evaluate(id_weight_pair.id_tensor))
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class SequenceCategoricalColumnWithVocabularyFileTest(
     test.TestCase, parameterized.TestCase):
 
@@ -790,41 +835,43 @@ class SequenceCategoricalColumnWithVocabularyFileTest(
         vocabulary_file=self._wire_vocabulary_file_name,
         vocabulary_size=self._wire_vocabulary_size)
 
-    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    id_weight_pair = _get_sparse_tensors(column, {'aaa': inputs})
 
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with monitored_session.MonitoredSession() as sess:
-      _assert_sparse_tensor_value(
-          self, expected, id_weight_pair.id_tensor.eval(session=sess))
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+    _assert_sparse_tensor_value(
+        self, expected, self.evaluate(id_weight_pair.id_tensor))
 
   def test_get_sparse_tensors_dynamic_zero_length(self):
     """Tests _get_sparse_tensors with a dynamic sequence length."""
-    inputs = sparse_tensor.SparseTensorValue(
-        indices=np.zeros((0, 2)), values=[], dense_shape=(2, 0))
-    expected = sparse_tensor.SparseTensorValue(
-        indices=np.zeros((0, 3)),
-        values=np.array((), dtype=np.int64),
-        dense_shape=(2, 0, 1))
-    column = sfc.sequence_categorical_column_with_vocabulary_file(
-        key='aaa',
-        vocabulary_file=self._wire_vocabulary_file_name,
-        vocabulary_size=self._wire_vocabulary_size)
-    input_placeholder_shape = list(inputs.dense_shape)
-    # Make second dimension (sequence length) dynamic.
-    input_placeholder_shape[1] = None
-    input_placeholder = array_ops.sparse_placeholder(
-        dtypes.string, shape=input_placeholder_shape)
-    id_weight_pair = column._get_sparse_tensors(
-        _LazyBuilder({'aaa': input_placeholder}))
-
-    self.assertIsNone(id_weight_pair.weight_tensor)
-    with monitored_session.MonitoredSession() as sess:
-      result = id_weight_pair.id_tensor.eval(
-          session=sess, feed_dict={input_placeholder: inputs})
-      _assert_sparse_tensor_value(
-          self, expected, result)
-
-
+    with ops.Graph().as_default():
+      inputs = sparse_tensor.SparseTensorValue(
+          indices=np.zeros((0, 2)), values=[], dense_shape=(2, 0))
+      expected = sparse_tensor.SparseTensorValue(
+          indices=np.zeros((0, 3)),
+          values=np.array((), dtype=np.int64),
+          dense_shape=(2, 0, 1))
+      column = sfc.sequence_categorical_column_with_vocabulary_file(
+          key='aaa',
+          vocabulary_file=self._wire_vocabulary_file_name,
+          vocabulary_size=self._wire_vocabulary_size)
+      input_placeholder_shape = list(inputs.dense_shape)
+      # Make second dimension (sequence length) dynamic.
+      input_placeholder_shape[1] = None
+      input_placeholder = array_ops.sparse_placeholder(
+          dtypes.string, shape=input_placeholder_shape)
+      id_weight_pair = _get_sparse_tensors(column, {'aaa': input_placeholder})
+
+      self.assertIsNone(id_weight_pair.weight_tensor)
+      with fc_test._initialized_session() as sess:
+        result = id_weight_pair.id_tensor.eval(
+            session=sess, feed_dict={input_placeholder: inputs})
+        _assert_sparse_tensor_value(
+            self, expected, result)
+
+
+@test_util.run_all_in_graph_and_eager_modes
 class SequenceCategoricalColumnWithVocabularyListTest(
     test.TestCase, parameterized.TestCase):
 
@@ -855,14 +902,16 @@ class SequenceCategoricalColumnWithVocabularyListTest(
         key='aaa',
         vocabulary_list=('omar', 'stringer', 'marlo'))
 
-    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
+    id_weight_pair = _get_sparse_tensors(column, {'aaa': inputs})
 
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with monitored_session.MonitoredSession() as sess:
-      _assert_sparse_tensor_value(
-          self, expected, id_weight_pair.id_tensor.eval(session=sess))
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+    _assert_sparse_tensor_value(
+        self, expected, self.evaluate(id_weight_pair.id_tensor))
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class SequenceEmbeddingColumnTest(
     test.TestCase, parameterized.TestCase):
 
@@ -922,20 +971,19 @@ class SequenceEmbeddingColumnTest(
 
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc_old._embedding_column(
-        categorical_column,
-        dimension=embedding_dimension,
+    embedding_column = fc.embedding_column(
+        categorical_column, dimension=embedding_dimension,
         initializer=_initializer)
 
-    embedding_lookup, _ = embedding_column._get_sequence_dense_tensor(
-        _LazyBuilder({'aaa': inputs}))
+    embedding_lookup, _, state_manager = _get_sequence_dense_tensor_state(
+        embedding_column, {'aaa': inputs})
 
-    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual(
-        ('embedding_weights:0',), tuple([v.name for v in global_vars]))
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(embedding_values, global_vars[0].eval(session=sess))
-      self.assertAllEqual(expected, embedding_lookup.eval(session=sess))
+    variables = state_manager._layer.weights
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.assertCountEqual(
+        ('embedding_weights:0',), tuple([v.name for v in variables]))
+    self.assertAllEqual(embedding_values, self.evaluate(variables[0]))
+    self.assertAllEqual(expected, self.evaluate(embedding_lookup))
 
   @parameterized.named_parameters(
       {'testcase_name': '2D',
@@ -961,15 +1009,15 @@ class SequenceEmbeddingColumnTest(
 
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc_old._embedding_column(categorical_column, dimension=2)
+    embedding_column = fc.embedding_column(
+        categorical_column, dimension=2)
 
-    _, sequence_length = embedding_column._get_sequence_dense_tensor(
-        _LazyBuilder({'aaa': inputs}))
+    _, sequence_length, _ = _get_sequence_dense_tensor_state(
+        embedding_column, {'aaa': inputs})
 
-    with monitored_session.MonitoredSession() as sess:
-      sequence_length = sess.run(sequence_length)
-      self.assertAllEqual(expected_sequence_length, sequence_length)
-      self.assertEqual(np.int64, sequence_length.dtype)
+    sequence_length = self.evaluate(sequence_length)
+    self.assertAllEqual(expected_sequence_length, sequence_length)
+    self.assertEqual(np.int64, sequence_length.dtype)
 
   def test_sequence_length_with_empty_rows(self):
     """Tests _sequence_length when some examples do not have ids."""
@@ -988,18 +1036,19 @@ class SequenceEmbeddingColumnTest(
 
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc_old._embedding_column(categorical_column, dimension=2)
+    embedding_column = fc.embedding_column(
+        categorical_column, dimension=2)
 
-    _, sequence_length = embedding_column._get_sequence_dense_tensor(
-        _LazyBuilder({'aaa': sparse_input}))
+    _, sequence_length, _ = _get_sequence_dense_tensor_state(
+        embedding_column, {'aaa': sparse_input})
 
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(
-          expected_sequence_length, sequence_length.eval(session=sess))
+    self.assertAllEqual(
+        expected_sequence_length, self.evaluate(sequence_length))
 
 
 class SequenceSharedEmbeddingColumnTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_get_sequence_dense_tensor(self):
     vocabulary_size = 3
     embedding_dimension = 2
@@ -1058,122 +1107,112 @@ class SequenceSharedEmbeddingColumnTest(test.TestCase):
         key='aaa', num_buckets=vocabulary_size)
     categorical_column_b = sfc.sequence_categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    shared_embedding_columns = fc.shared_embedding_columns(
+    shared_embedding_columns = fc.shared_embedding_columns_v2(
         [categorical_column_a, categorical_column_b],
         dimension=embedding_dimension,
         initializer=_initializer)
 
-    embedding_lookup_a = shared_embedding_columns[0]._get_sequence_dense_tensor(
-        _LazyBuilder({
-            'aaa': sparse_input_a
-        }))[0]
-    embedding_lookup_b = shared_embedding_columns[1]._get_sequence_dense_tensor(
-        _LazyBuilder({
-            'bbb': sparse_input_b
-        }))[0]
+    embedding_lookup_a = _get_sequence_dense_tensor(
+        shared_embedding_columns[0], {'aaa': sparse_input_a})[0]
+    embedding_lookup_b = _get_sequence_dense_tensor(
+        shared_embedding_columns[1], {'bbb': sparse_input_b})[0]
 
+    self.evaluate(variables_lib.global_variables_initializer())
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual(('embedding_weights:0',),
+    self.assertItemsEqual(('aaa_bbb_shared_embedding:0',),
                           tuple([v.name for v in global_vars]))
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(embedding_values, global_vars[0].eval(session=sess))
-      self.assertAllEqual(
-          expected_lookups_a, embedding_lookup_a.eval(session=sess))
-      self.assertAllEqual(
-          expected_lookups_b, embedding_lookup_b.eval(session=sess))
+    self.assertAllEqual(embedding_values, self.evaluate(global_vars[0]))
+    self.assertAllEqual(
+        expected_lookups_a, self.evaluate(embedding_lookup_a))
+    self.assertAllEqual(expected_lookups_b, self.evaluate(embedding_lookup_b))
 
   def test_sequence_length(self):
-    vocabulary_size = 3
-
-    sparse_input_a = sparse_tensor.SparseTensorValue(
-        # example 0, ids [2]
-        # example 1, ids [0, 1]
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(2, 0, 1),
-        dense_shape=(2, 2))
-    expected_sequence_length_a = [1, 2]
-    categorical_column_a = sfc.sequence_categorical_column_with_identity(
-        key='aaa', num_buckets=vocabulary_size)
-
-    sparse_input_b = sparse_tensor.SparseTensorValue(
-        # example 0, ids [0, 2]
-        # example 1, ids [1]
-        indices=((0, 0), (0, 1), (1, 0)),
-        values=(0, 2, 1),
-        dense_shape=(2, 2))
-    expected_sequence_length_b = [2, 1]
-    categorical_column_b = sfc.sequence_categorical_column_with_identity(
-        key='bbb', num_buckets=vocabulary_size)
-    shared_embedding_columns = fc.shared_embedding_columns(
-        [categorical_column_a, categorical_column_b], dimension=2)
-
-    sequence_length_a = shared_embedding_columns[0]._get_sequence_dense_tensor(
-        _LazyBuilder({
-            'aaa': sparse_input_a
-        }))[1]
-    sequence_length_b = shared_embedding_columns[1]._get_sequence_dense_tensor(
-        _LazyBuilder({
-            'bbb': sparse_input_b
-        }))[1]
-
-    with monitored_session.MonitoredSession() as sess:
-      sequence_length_a = sess.run(sequence_length_a)
-      self.assertAllEqual(expected_sequence_length_a, sequence_length_a)
-      self.assertEqual(np.int64, sequence_length_a.dtype)
-      sequence_length_b = sess.run(sequence_length_b)
-      self.assertAllEqual(expected_sequence_length_b, sequence_length_b)
-      self.assertEqual(np.int64, sequence_length_b.dtype)
+    with ops.Graph().as_default():
+      vocabulary_size = 3
+
+      sparse_input_a = sparse_tensor.SparseTensorValue(
+          # example 0, ids [2]
+          # example 1, ids [0, 1]
+          indices=((0, 0), (1, 0), (1, 1)),
+          values=(2, 0, 1),
+          dense_shape=(2, 2))
+      expected_sequence_length_a = [1, 2]
+      categorical_column_a = sfc.sequence_categorical_column_with_identity(
+          key='aaa', num_buckets=vocabulary_size)
+
+      sparse_input_b = sparse_tensor.SparseTensorValue(
+          # example 0, ids [0, 2]
+          # example 1, ids [1]
+          indices=((0, 0), (0, 1), (1, 0)),
+          values=(0, 2, 1),
+          dense_shape=(2, 2))
+      expected_sequence_length_b = [2, 1]
+      categorical_column_b = sfc.sequence_categorical_column_with_identity(
+          key='bbb', num_buckets=vocabulary_size)
+      shared_embedding_columns = fc.shared_embedding_columns_v2(
+          [categorical_column_a, categorical_column_b], dimension=2)
+
+      sequence_length_a = _get_sequence_dense_tensor(
+          shared_embedding_columns[0], {'aaa': sparse_input_a})[1]
+      sequence_length_b = _get_sequence_dense_tensor(
+          shared_embedding_columns[1], {'bbb': sparse_input_b})[1]
+
+      with fc_test._initialized_session() as sess:
+        sequence_length_a = sess.run(sequence_length_a)
+        self.assertAllEqual(expected_sequence_length_a, sequence_length_a)
+        self.assertEqual(np.int64, sequence_length_a.dtype)
+        sequence_length_b = sess.run(sequence_length_b)
+        self.assertAllEqual(expected_sequence_length_b, sequence_length_b)
+        self.assertEqual(np.int64, sequence_length_b.dtype)
 
   def test_sequence_length_with_empty_rows(self):
     """Tests _sequence_length when some examples do not have ids."""
-    vocabulary_size = 3
-    sparse_input_a = sparse_tensor.SparseTensorValue(
-        # example 0, ids []
-        # example 1, ids [2]
-        # example 2, ids [0, 1]
-        # example 3, ids []
-        # example 4, ids [1]
-        # example 5, ids []
-        indices=((1, 0), (2, 0), (2, 1), (4, 0)),
-        values=(2, 0, 1, 1),
-        dense_shape=(6, 2))
-    expected_sequence_length_a = [0, 1, 2, 0, 1, 0]
-    categorical_column_a = sfc.sequence_categorical_column_with_identity(
-        key='aaa', num_buckets=vocabulary_size)
-
-    sparse_input_b = sparse_tensor.SparseTensorValue(
-        # example 0, ids [2]
-        # example 1, ids []
-        # example 2, ids []
-        # example 3, ids []
-        # example 4, ids [1]
-        # example 5, ids [0, 1]
-        indices=((0, 0), (4, 0), (5, 0), (5, 1)),
-        values=(2, 1, 0, 1),
-        dense_shape=(6, 2))
-    expected_sequence_length_b = [1, 0, 0, 0, 1, 2]
-    categorical_column_b = sfc.sequence_categorical_column_with_identity(
-        key='bbb', num_buckets=vocabulary_size)
-
-    shared_embedding_columns = fc.shared_embedding_columns(
-        [categorical_column_a, categorical_column_b], dimension=2)
-
-    sequence_length_a = shared_embedding_columns[0]._get_sequence_dense_tensor(
-        _LazyBuilder({
-            'aaa': sparse_input_a
-        }))[1]
-    sequence_length_b = shared_embedding_columns[1]._get_sequence_dense_tensor(
-        _LazyBuilder({
-            'bbb': sparse_input_b
-        }))[1]
-
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(
-          expected_sequence_length_a, sequence_length_a.eval(session=sess))
-      self.assertAllEqual(
-          expected_sequence_length_b, sequence_length_b.eval(session=sess))
-
-
+    with ops.Graph().as_default():
+      vocabulary_size = 3
+      sparse_input_a = sparse_tensor.SparseTensorValue(
+          # example 0, ids []
+          # example 1, ids [2]
+          # example 2, ids [0, 1]
+          # example 3, ids []
+          # example 4, ids [1]
+          # example 5, ids []
+          indices=((1, 0), (2, 0), (2, 1), (4, 0)),
+          values=(2, 0, 1, 1),
+          dense_shape=(6, 2))
+      expected_sequence_length_a = [0, 1, 2, 0, 1, 0]
+      categorical_column_a = sfc.sequence_categorical_column_with_identity(
+          key='aaa', num_buckets=vocabulary_size)
+
+      sparse_input_b = sparse_tensor.SparseTensorValue(
+          # example 0, ids [2]
+          # example 1, ids []
+          # example 2, ids []
+          # example 3, ids []
+          # example 4, ids [1]
+          # example 5, ids [0, 1]
+          indices=((0, 0), (4, 0), (5, 0), (5, 1)),
+          values=(2, 1, 0, 1),
+          dense_shape=(6, 2))
+      expected_sequence_length_b = [1, 0, 0, 0, 1, 2]
+      categorical_column_b = sfc.sequence_categorical_column_with_identity(
+          key='bbb', num_buckets=vocabulary_size)
+
+      shared_embedding_columns = fc.shared_embedding_columns_v2(
+          [categorical_column_a, categorical_column_b], dimension=2)
+
+      sequence_length_a = _get_sequence_dense_tensor(
+          shared_embedding_columns[0], {'aaa': sparse_input_a})[1]
+      sequence_length_b = _get_sequence_dense_tensor(
+          shared_embedding_columns[1], {'bbb': sparse_input_b})[1]
+
+      with fc_test._initialized_session() as sess:
+        self.assertAllEqual(
+            expected_sequence_length_a, sequence_length_a.eval(session=sess))
+        self.assertAllEqual(
+            expected_sequence_length_b, sequence_length_b.eval(session=sess))
+
+
+@test_util.run_all_in_graph_and_eager_modes
 class SequenceIndicatorColumnTest(test.TestCase, parameterized.TestCase):
 
   @parameterized.named_parameters(
@@ -1221,13 +1260,12 @@ class SequenceIndicatorColumnTest(test.TestCase, parameterized.TestCase):
 
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    indicator_column = fc_old._indicator_column(categorical_column)
+    indicator_column = fc.indicator_column(categorical_column)
 
-    indicator_tensor, _ = indicator_column._get_sequence_dense_tensor(
-        _LazyBuilder({'aaa': inputs}))
+    indicator_tensor, _ = _get_sequence_dense_tensor(
+        indicator_column, {'aaa': inputs})
 
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(expected, indicator_tensor.eval(session=sess))
+    self.assertAllEqual(expected, self.evaluate(indicator_tensor))
 
   @parameterized.named_parameters(
       {'testcase_name': '2D',
@@ -1253,15 +1291,14 @@ class SequenceIndicatorColumnTest(test.TestCase, parameterized.TestCase):
 
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    indicator_column = fc_old._indicator_column(categorical_column)
+    indicator_column = fc.indicator_column(categorical_column)
 
-    _, sequence_length = indicator_column._get_sequence_dense_tensor(
-        _LazyBuilder({'aaa': inputs}))
+    _, sequence_length = _get_sequence_dense_tensor(
+        indicator_column, {'aaa': inputs})
 
-    with monitored_session.MonitoredSession() as sess:
-      sequence_length = sess.run(sequence_length)
-      self.assertAllEqual(expected_sequence_length, sequence_length)
-      self.assertEqual(np.int64, sequence_length.dtype)
+    sequence_length = self.evaluate(sequence_length)
+    self.assertAllEqual(expected_sequence_length, sequence_length)
+    self.assertEqual(np.int64, sequence_length.dtype)
 
   def test_sequence_length_with_empty_rows(self):
     """Tests _sequence_length when some examples do not have ids."""
@@ -1282,19 +1319,14 @@ class SequenceIndicatorColumnTest(test.TestCase, parameterized.TestCase):
         key='aaa', num_buckets=vocabulary_size)
     indicator_column = fc.indicator_column(categorical_column)
 
-    _, sequence_length = indicator_column._get_sequence_dense_tensor(
-        _LazyBuilder({'aaa': sparse_input}))
-
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(
-          expected_sequence_length, sequence_length.eval(session=sess))
-
+    _, sequence_length = _get_sequence_dense_tensor(
+        indicator_column, {'aaa': sparse_input})
 
-def _get_sequence_dense_tensor(column, features):
-  return column.get_sequence_dense_tensor(
-      fc.FeatureTransformationCache(features), None)
+    self.assertAllEqual(
+        expected_sequence_length, self.evaluate(sequence_length))
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class SequenceNumericColumnTest(test.TestCase, parameterized.TestCase):
 
   def test_defaults(self):
@@ -1355,8 +1387,7 @@ class SequenceNumericColumnTest(test.TestCase, parameterized.TestCase):
 
     dense_tensor, _ = _get_sequence_dense_tensor(
         numeric_column, {'aaa': inputs})
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(expected, dense_tensor.eval(session=sess))
+    self.assertAllEqual(expected, self.evaluate(dense_tensor))
 
   def test_get_sequence_dense_tensor_with_normalizer_fn(self):
 
@@ -1389,9 +1420,8 @@ class SequenceNumericColumnTest(test.TestCase, parameterized.TestCase):
     dense_tensor, _ = _get_sequence_dense_tensor(
         numeric_column, {'aaa': sparse_input})
 
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(
-          expected_dense_tensor, dense_tensor.eval(session=sess))
+    self.assertAllEqual(
+        expected_dense_tensor, self.evaluate(dense_tensor))
 
   @parameterized.named_parameters(
       {'testcase_name': '2D',
@@ -1427,9 +1457,8 @@ class SequenceNumericColumnTest(test.TestCase, parameterized.TestCase):
     dense_tensor, _ = _get_sequence_dense_tensor(
         numeric_column, {'aaa': sparse_input})
 
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(
-          expected_dense_tensor, dense_tensor.eval(session=sess))
+    self.assertAllEqual(
+        expected_dense_tensor, self.evaluate(dense_tensor))
 
   @parameterized.named_parameters(
       {'testcase_name': '2D',
@@ -1476,10 +1505,9 @@ class SequenceNumericColumnTest(test.TestCase, parameterized.TestCase):
     _, sequence_length = _get_sequence_dense_tensor(
         numeric_column, {'aaa': inputs})
 
-    with monitored_session.MonitoredSession() as sess:
-      sequence_length = sess.run(sequence_length)
-      self.assertAllEqual(expected_sequence_length, sequence_length)
-      self.assertEqual(np.int64, sequence_length.dtype)
+    sequence_length = self.evaluate(sequence_length)
+    self.assertAllEqual(expected_sequence_length, sequence_length)
+    self.assertEqual(np.int64, sequence_length.dtype)
 
   def test_sequence_length_with_empty_rows(self):
     """Tests _sequence_length when some examples do not have ids."""
@@ -1499,9 +1527,8 @@ class SequenceNumericColumnTest(test.TestCase, parameterized.TestCase):
     _, sequence_length = _get_sequence_dense_tensor(
         numeric_column, {'aaa': sparse_input})
 
-    with monitored_session.MonitoredSession() as sess:
-      self.assertAllEqual(
-          expected_sequence_length, sequence_length.eval(session=sess))
+    self.assertAllEqual(
+        expected_sequence_length, self.evaluate(sequence_length))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/framework/auto_control_deps.py b/tensorflow/python/framework/auto_control_deps.py
index 30dc959e9a9f717bdb5c56bfbdde5ffa9d48c257..a8ba4ea50d144854c9b38bca427ae9f820994fdd 100644
--- a/tensorflow/python/framework/auto_control_deps.py
+++ b/tensorflow/python/framework/auto_control_deps.py
@@ -29,13 +29,76 @@ from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
 
+# Op types that should not run in program order, e.g. because they need to run
+# asynchronously to avoid deadlock.
+ASYNC_STATEFUL_OPS = [
+    "CollectiveGather",
+    "CollectiveReduce",
+    "CollectiveBcastSend",
+    "CollectiveBcastRecv",
+    "NcclAllReduce",
+]
+
+LEGACY_RANDOM_OPS = [
+    # These may be used in variable initializers -- thus their execution should
+    # not be dependent on other stateful operations.  This is because although
+    # according to program order, tf.Variables may be created in sequence,
+    # their initialization happens outside of the program order (specifically,
+    # in graph mode their initialization happens by calling a grouped
+    # initializer operation or in eager mode, where initialization is lifted
+    # out of the tf.function and executed the first time the function is
+    # executed).
+    #
+    # Unless there is a specific dependency between the initializers
+    # themselves (e.g. one initializer depends on a Variable whose value depends
+    # on another initializer), the initialization can happen in any order so
+    # long as it's before the associated Variable read operations.
+    #
+    # Note that in general the randomness of legacy random operations is only
+    # guaranteed by providing a graph-level and op-level seed (and ordering of
+    # the same op across multiple iterations of a while_loop is specifically not
+    # guaranteed; see the discussion below).
+    #
+    # There is a possible race condition inside while_loop where the same
+    # random OpKernel instantiation is reused across multiple steps
+    # of the loop.  Since legacy Random OpKernels have an internal rng state,
+    # automatic dependency tracking across loop steps would likely
+    # fix this race; and for that case this blacklist is problematic.
+    # However, since automatic dependency tracking inside while loops is not
+    # currently supported, and there are no other examples of OpKernel reuse
+    # (each OpKernel is associated with a unique op in graph mode),
+    # this blacklist has no effect on the aforementioned behavior.
+    #
+    # TODO(ebrevdo,skyewm): Modify the check against this blacklist to
+    # only occur when the op is inside a "variable initialization scope"; and
+    # add proper autodeps inside while_loops that respects this updated check.
+    "RandomUniform",
+    "RandomUniformInt",
+    "RandomStandardNormal",
+    "ParameterizedTruncatedNormal",
+    "TruncatedNormal",
+    "RandomShuffle",
+    "Multinomial",
+    "RandomGamma",
+    "RandomGammaGrad",
+    "RandomPoisson",
+    "RandomPoissonV2",
+]
+
+_ALL_BLACKLISTED_OPS = set(ASYNC_STATEFUL_OPS) | set(LEGACY_RANDOM_OPS)
+
+
+def op_is_stateful(op_def):
+  return op_def.is_stateful and op_def.name not in _ALL_BLACKLISTED_OPS
+
 
 class AutomaticControlDependencies(object):
   """Context manager to automatically add control dependencies.
 
   Code under this context manager will act as if a sensible set of control
   dependencies were present. More specifically:
-    1. All stateful ops in the scope will execute
+    1. All stateful ops in the scope will execute (with the exception of ops in
+       ASYNC_STATEFUL_OPS and LEGACY_RANDOM_OPS)
     2. Stateful ops which modify the same resource will execute in program order
 
   Note: creating variables in an automatic control dependencies context is not
@@ -47,6 +110,7 @@ class AutomaticControlDependencies(object):
 
   def __init__(self):
     self._returned_tensors = set()
+    self.ops_which_must_run = set()
 
   def mark_as_return(self, tensor):
     """Acts like identity but marks the `Tensor` as a return value.
@@ -100,6 +164,7 @@ class AutomaticControlDependencies(object):
     # graph (but that would mess up devices and collections at least,
     # probably other things as well).
     self._graph = ops.get_default_graph()
+    self._graph._add_control_dependencies = True  # pylint: disable=protected-access
     self._n_operations = len(self._graph.get_operations())
     return self
 
@@ -170,6 +235,14 @@ class AutomaticControlDependencies(object):
       raise RuntimeError(
           "Graph changed while trying to add control dependencies.")
 
+    # pylint: disable=protected-access
+    if hasattr(self._graph, "outer_graph"):
+      outer_val = self._graph.outer_graph._add_control_dependencies
+      self._graph._add_control_dependencies = outer_val
+    else:
+      self._graph._add_control_dependencies = False
+    # pylint: enable=protected-access
+
     # map from resource tensor to the last op which used it
     last_op_using_resource_tensor = {}
     # set of conditional and loop exits
@@ -214,7 +287,7 @@ class AutomaticControlDependencies(object):
       control_inputs = set()
       # Ensure stateful ops run
       if (op.type not in self._graph._registered_ops  # pylint: disable=protected-access
-          or self._graph._registered_ops[op.type].is_stateful):  # pylint: disable=protected-access
+          or op_is_stateful(self._graph._registered_ops[op.type])):  # pylint: disable=protected-access
         ops_which_must_run.add(op)
       # Ignore switches (they're handled separately)
       if op.type == "Switch" and op.inputs[0].dtype == dtypes_module.resource:
@@ -229,24 +302,29 @@ class AutomaticControlDependencies(object):
         ops_which_must_run = set([op])
         continue
       found_resource = False
-      for inp in op.inputs:
-        if inp.dtype == dtypes_module.resource:
-          found_resource = True
-          # Deal with switches, finally.
-          if inp.op.type == "Switch":
-            self._process_switch(inp.op, ops_which_must_run,
-                                 last_op_using_resource_tensor,
-                                 merge_for_resource)
-          # Ensure uses of resources are serialized
-          if inp in last_op_using_resource_tensor:
-            if (last_op_using_resource_tensor[inp]._control_flow_context  # pylint: disable=protected-access
-                is op._control_flow_context):  # pylint: disable=protected-access
-              control_inputs.add(last_op_using_resource_tensor[inp])
-          # Ensure merges happen after the closing of a cond block
-          if inp in merge_for_resource:
-            merge_for_resource[inp]._add_control_input(op)  # pylint: disable=protected-access
-          last_op_using_resource_tensor[inp] = op
-      if (op.op_def.is_stateful and not found_resource
+      # Check for any resource inputs. If we find any, we update control_inputs
+      # and last_op_using_resource_tensor. Note that we dedup op.inputs in case
+      # op receives the same resource tensor twice as input, which would result
+      # in op getting a control dependency on itself.
+      for inp in set(op.inputs):
+        if inp.dtype != dtypes_module.resource:
+          continue
+        found_resource = True
+        # Deal with switches, finally.
+        if inp.op.type == "Switch":
+          self._process_switch(inp.op, ops_which_must_run,
+                               last_op_using_resource_tensor,
+                               merge_for_resource)
+        # Ensure uses of resources are serialized
+        if inp in last_op_using_resource_tensor:
+          if (last_op_using_resource_tensor[inp]._control_flow_context  # pylint: disable=protected-access
+              is op._control_flow_context):  # pylint: disable=protected-access
+            control_inputs.add(last_op_using_resource_tensor[inp])
+        # Ensure merges happen after the closing of a cond block
+        if inp in merge_for_resource:
+          merge_for_resource[inp]._add_control_input(op)  # pylint: disable=protected-access
+        last_op_using_resource_tensor[inp] = op
+      if (op_is_stateful(op.op_def) and not found_resource
           and op._control_flow_context is None):  # pylint: disable=protected-access
         if None in last_op_using_resource_tensor:
           op._add_control_input(last_op_using_resource_tensor[None])  # pylint: disable=protected-access
@@ -256,10 +334,11 @@ class AutomaticControlDependencies(object):
       op._add_control_inputs(control_inputs)  # pylint: disable=protected-access
 
     # Ensure all ops which must run do run
+    self.ops_which_must_run.update(ops_which_must_run)
     for r in self._returned_tensors:
-      if ops_which_must_run:
+      if self.ops_which_must_run:
         r.op._add_control_inputs(  # pylint: disable=protected-access
-            [o for o in ops_which_must_run
+            [o for o in self.ops_which_must_run
              if o._control_flow_context is r.op._control_flow_context])  # pylint: disable=protected-access
 
 
diff --git a/tensorflow/python/framework/auto_control_deps_test.py b/tensorflow/python/framework/auto_control_deps_test.py
index 5f5de45b9ee44da8a3440b5f3a5d55fbf7b8a02f..d9df96f6d70c36ddd9b942f66929b1033e4542f6 100644
--- a/tensorflow/python/framework/auto_control_deps_test.py
+++ b/tensorflow/python/framework/auto_control_deps_test.py
@@ -19,12 +19,15 @@ from __future__ import print_function
 
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.eager import function
 from tensorflow.python.framework import auto_control_deps as acd
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import test_util
+from tensorflow.python.keras.layers import core as keras_core
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -39,7 +42,7 @@ class AutomaticControlDependenciesTest(test.TestCase):
   def testBasic(self):
     with context.graph_mode(), self.cached_session():
       v = resource_variable_ops.ResourceVariable(1.0)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       with acd.AutomaticControlDependencies() as c:
         v.assign(v + 1)
         v.assign(2 * v)
@@ -51,7 +54,7 @@ class AutomaticControlDependenciesTest(test.TestCase):
   def testCondMustRun(self):
     with context.graph_mode(), self.cached_session():
       v = resource_variable_ops.ResourceVariable(1.0)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       p = array_ops.placeholder(dtype=dtypes.bool)
       with acd.AutomaticControlDependencies() as c:
 
@@ -73,7 +76,7 @@ class AutomaticControlDependenciesTest(test.TestCase):
   def testCondMustRunSeparateRead(self):
     with context.graph_mode(), self.cached_session():
       v = resource_variable_ops.ResourceVariable(1.0)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       p = array_ops.placeholder(dtype=dtypes.bool)
       with acd.AutomaticControlDependencies() as c:
 
@@ -97,7 +100,7 @@ class AutomaticControlDependenciesTest(test.TestCase):
   def testCondNested(self):
     with context.graph_mode(), self.cached_session():
       v = resource_variable_ops.ResourceVariable(1.0)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       p = array_ops.placeholder(dtype=dtypes.bool)
       q = array_ops.placeholder(dtype=dtypes.bool)
       with acd.AutomaticControlDependencies() as c:
@@ -132,7 +135,7 @@ class AutomaticControlDependenciesTest(test.TestCase):
   def testCondOneBranch(self):
     with context.graph_mode(), self.cached_session():
       v = resource_variable_ops.ResourceVariable(1.0)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       p = array_ops.placeholder(dtype=dtypes.bool)
       with acd.AutomaticControlDependencies() as c:
 
@@ -153,7 +156,7 @@ class AutomaticControlDependenciesTest(test.TestCase):
   def testCondOneBranchUpdateBefore(self):
     with context.graph_mode(), self.cached_session():
       v = resource_variable_ops.ResourceVariable(1.0)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       p = array_ops.placeholder(dtype=dtypes.bool)
       with acd.AutomaticControlDependencies() as c:
         v.assign(v * 2)
@@ -175,7 +178,7 @@ class AutomaticControlDependenciesTest(test.TestCase):
   def testCondOneBranchUpdateAfter(self):
     with context.graph_mode(), self.cached_session():
       v = resource_variable_ops.ResourceVariable(1.0)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       p = array_ops.placeholder(dtype=dtypes.bool)
       with acd.AutomaticControlDependencies() as c:
 
@@ -211,7 +214,7 @@ class AutomaticControlDependenciesTest(test.TestCase):
   def testDecorator(self):
     with context.graph_mode(), self.cached_session():
       v = resource_variable_ops.ResourceVariable(1.0)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       @acd.automatic_control_dependencies
       def f():
@@ -281,6 +284,44 @@ class AutomaticControlDependenciesTest(test.TestCase):
     train()
     self.assertEqual(v.numpy(), -1.0)
 
+  def testRepeatedResourceInput(self):
+    var = resource_variable_ops.ResourceVariable(1.0)
+
+    @def_function.function
+    def inner(var1, var2):
+      return (resource_variable_ops.read_variable_op(var1, dtypes.float32) +
+              resource_variable_ops.read_variable_op(var2, dtypes.float32))
+
+    @def_function.function
+    def outer():
+      return inner(var.handle, var.handle)
+
+    self.assertEqual(self.evaluate(outer()), 2.0)
+
+  def testVariableInitializersCanBeLifted(self):
+    # The initializer is a stateful op, but using it inside a function should
+    # *not* create additional dependencies.  That's what we're testing.
+    layer = keras_core.Dense(1, kernel_initializer="glorot_uniform")
+
+    @def_function.function
+    def fn(x):
+      # Stateful operation
+      control_flow_ops.Assert(x, ["Error"])
+      # Variable initialization should be lifted.  Prior to the change that
+      # added this test, the lifting would crash because of an auto control dep
+      # added on `x`.  Note, the error did not happen if we
+      # manually created a tf.Variable outside of function and used it
+      # here.  Alternatively, creating a tf.Variable inside fn() causes
+      # a different sort of error that is out of scope for this test.
+      return layer(ops.convert_to_tensor([[1.0, 1.0]]))
+
+    true = ops.convert_to_tensor(True)
+
+    concrete = fn.get_concrete_function(
+        tensor_spec.TensorSpec(shape=(), dtype=dtypes.bool))
+    self.evaluate(concrete(true))
+    self.evaluate(fn(True))
+
 
 if __name__ == '__main__':
   ops.enable_eager_execution()
diff --git a/tensorflow/python/framework/composite_tensor.py b/tensorflow/python/framework/composite_tensor.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c9a292e363e2e38be32a30ea95f1f122876bbb0
--- /dev/null
+++ b/tensorflow/python/framework/composite_tensor.py
@@ -0,0 +1,100 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tensor-like objects that are composed from tf.Tensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+
+import six
+
+from tensorflow.python import pywrap_tensorflow
+
+
+@six.add_metaclass(abc.ABCMeta)
+class CompositeTensor(object):
+  """Abstract base class for Tensor-like objects that are composed from Tensors.
+
+  Each `CompositeTensor` can be decomposed into a structured collection of
+  component `tf.Tensor`s, and reconstructed from those components.
+
+  The `tensorflow.python.util.nest` module has support for treating composite
+  tensors as structure, which makes it easy to flatten and reconstruct
+  composite tensors (or larger structures that contain composite tensors).
+  E.g.:
+
+  ```python
+  ct = ...  # Create a composite tensor.
+  flat_list_of_tensors = nest.flatten(ct, expand_composites=True)
+  transformed_list_of_tensors = ...  # do something with the flat tensors.
+  result = nest.pack_sequence_as(ct, transformed_list_of_tensors)
+  ```
+  """
+
+  @abc.abstractmethod
+  def _to_components(self):
+    """Decomposes this composite tensor into its components.
+
+    Returns:
+      The components that comprise this composite tensor: a nested structure
+      (as defined by `tf.python.util.nest`) whose values are `tf.Tensor`s or
+      `CompositeTensor`s.
+    """
+    raise NotImplementedError("CompositeTensor._to_components")
+
+  @abc.abstractmethod
+  def _from_components(cls, components):  # pylint: disable=no-self-argument
+    """Creates a composite tensor of type `cls` from components.
+
+    Args:
+      components: The components that should be used to form the
+        composite tensor: a nested structure (as defined by
+        `tf.python.util.nest`) whose values are tf.Tensors or composite
+        tensors.
+
+    Returns:
+      A `CompositeTensor` of type `cls`.
+    """
+    raise NotImplementedError("CompositeTensor._from_components")
+
+  @abc.abstractmethod
+  def _shape_invariant_to_components(self, shape=None):
+    """Converts a shape invariant into invariants for individual components.
+
+    Args:
+      shape: A `tf.TensorShape` object.  The shape invariant for this
+        `CompositeTensor`, or `None` if a default shape invariant should be
+        used (based on the value of this `CompositeTensor`).
+
+    Returns:
+      A nested structure whose values are `tf.TensorShape` objects, specifying
+      the shape invariants for the tensors that comprise this `CompositeTensor`.
+    """
+    raise NotImplementedError("CompositeTensor._shape_invariant_to_components")
+
+  @abc.abstractproperty
+  def _is_graph_tensor(self):
+    """Returns True if this tensor's components belong to a TF graph."""
+    raise NotImplementedError("CompositeTensor._is_symbolic_tensor")
+
+
+pywrap_tensorflow.RegisterType("CompositeTensor", CompositeTensor)
+
+
+# @TODO(edloper): Can we replace convert_to_tensor_or_xyz with just
+# convert_to_tensor_or_composite?  Alternatively, should composite tensors
+# register a dispatch override for tf.convert_to_tensor?
diff --git a/tensorflow/python/framework/composite_tensor_test.py b/tensorflow/python/framework/composite_tensor_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..f249faa5d685b411742a65025000e00c2edadbc5
--- /dev/null
+++ b/tensorflow/python/framework/composite_tensor_test.py
@@ -0,0 +1,101 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.python.framework.composite_tensor."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import composite_tensor
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import googletest
+from tensorflow.python.util import nest
+
+
+class TestCompositeTensor(composite_tensor.CompositeTensor):
+
+  def __init__(self, *components):
+    self._components = components
+
+  def _to_components(self):
+    return self._components
+
+  @classmethod
+  def _from_components(cls, components):
+    return cls(*components)
+
+  def _shape_invariant_to_components(self, shape=None):
+    raise NotImplementedError('CompositeTensor._shape_invariant_to_components')
+
+  def _is_graph_tensor(self):
+    return True
+
+
+class CompositeTensorTest(test_util.TensorFlowTestCase):
+
+  def assertNestEqual(self, a, b, expand_composites=False):
+    if isinstance(a, dict):
+      self.assertIsInstance(b, dict)
+      self.assertEqual(set(a), set(b))
+      for key in a:
+        self.assertNestEqual(a[key], b[key])
+    elif isinstance(a, (list, tuple)):
+      self.assertIsInstance(b, (list, tuple))
+      self.assertEqual(len(a), len(b))
+      for a_val, b_val in zip(a, b):
+        self.assertNestEqual(a_val, b_val)
+    elif expand_composites and isinstance(a, composite_tensor.CompositeTensor):
+      self.assertIsInstance(b, composite_tensor.CompositeTensor)
+      self.assertNestEqual(a._to_components(),
+                           b._to_components())
+
+  def testNestFlatten(self):
+    st1 = sparse_tensor.SparseTensor([[0, 3], [7, 2]], [1, 2], [10, 10])
+    st2 = sparse_tensor.SparseTensor([[1, 2, 3]], ['a'], [10, 10, 10])
+    structure = [[st1], 'foo', {'y': [st2]}]
+    x = nest.flatten(structure, expand_composites=True)
+    self.assertEqual(len(x), 7)
+    self.assertIs(x[0], st1.indices)
+    self.assertIs(x[1], st1.values)
+    self.assertIs(x[2], st1.dense_shape)
+    self.assertEqual(x[3], 'foo')
+    self.assertIs(x[4], st2.indices)
+    self.assertIs(x[5], st2.values)
+    self.assertIs(x[6], st2.dense_shape)
+
+  def testNestPackSequenceAs(self):
+    st1 = sparse_tensor.SparseTensor([[0, 3], [7, 2]], [1, 2], [10, 10])
+    st2 = sparse_tensor.SparseTensor([[1, 2, 3]], ['a'], [10, 10, 10])
+    structure1 = [[st1], 'foo', {'y': [st2]}]
+    flat = [st2.indices, st2.values, st2.dense_shape, 'bar',
+            st1.indices, st1.values, st1.dense_shape]
+    result = nest.pack_sequence_as(structure1, flat, expand_composites=True)
+    expected = [[st2], 'bar', {'y': [st1]}]
+    self.assertNestEqual(expected, result)
+
+  def testAssertSameStructure(self):
+    st1 = sparse_tensor.SparseTensor([[0]], [0], [100])
+    st2 = sparse_tensor.SparseTensor([[0, 3]], ['x'], [100, 100])
+    test = TestCompositeTensor(st1.indices, st1.values, st1.dense_shape)
+    nest.assert_same_structure(st1, st2, expand_composites=False)
+    nest.assert_same_structure(st1, st2, expand_composites=True)
+    nest.assert_same_structure(st1, test, expand_composites=False)
+    with self.assertRaises(TypeError):
+      nest.assert_same_structure(st1, test, expand_composites=True)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/framework/convert_to_constants.py b/tensorflow/python/framework/convert_to_constants.py
new file mode 100644
index 0000000000000000000000000000000000000000..f96e5991741e56e33cf99963873f12a51e3aad23
--- /dev/null
+++ b/tensorflow/python/framework/convert_to_constants.py
@@ -0,0 +1,142 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Helpers to convert variables to constants in TensorFlow 2.0."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.core.framework import graph_pb2
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import meta_graph_pb2
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.grappler import tf_optimizer
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training.saver import export_meta_graph
+
+
+def _run_inline_graph_optimization(func):
+  """Apply function inline optimization to the graph.
+
+  Returns the GraphDef after Grappler's function inlining optimization is
+  applied. This optimization does not work on models with control flow.
+
+  Args:
+    func: ConcreteFunction.
+
+  Returns:
+    GraphDef
+  """
+  meta_graph = export_meta_graph(
+      graph_def=func.graph.as_graph_def(), graph=func.graph)
+
+  # Add a collection 'train_op' so that Grappler knows the outputs.
+  fetch_collection = meta_graph_pb2.CollectionDef()
+  for array in func.inputs + func.outputs:
+    fetch_collection.node_list.value.append(array.name)
+  meta_graph.collection_def["train_op"].CopyFrom(fetch_collection)
+
+  # Initialize RewriterConfig with everything disabled except function inlining.
+  config = config_pb2.ConfigProto()
+  rewrite_options = config.graph_options.rewrite_options
+  rewrite_options.optimizers.append("function")
+  return tf_optimizer.OptimizeGraph(config, meta_graph)
+
+
+def convert_variables_to_constants_v2(func):
+  """Replaces all the variables in a graph with constants of the same values.
+
+  TensorFlow 2.0 function for converting all Variable ops into Const ops holding
+  the same values. This makes it possible to describe the network fully with a
+  single GraphDef file, and allows the removal of a lot of ops related to
+  loading and saving the variables. This function runs Grappler's function
+  inlining optimization in order to return a single subgraph.
+
+  The current implementation only works for graphs that do not contain any
+  control flow or embedding related ops.
+
+  Args:
+    func: ConcreteFunction.
+
+  Returns:
+    GraphDef containing a simplified version of the original.
+  """
+  # TODO(nupurgarg): Replace ResourceGather with Gather.
+  # TODO(nupurgarg): Change attr for Variables in control flow and functions.
+  graph_def = _run_inline_graph_optimization(func)
+
+  # Identify the ReadVariableOps.
+  get_name = lambda name: name.split(":")[0]
+  map_name_to_node = {get_name(node.name): node for node in graph_def.node}
+
+  variables_generator = func.graph.variables
+  resource_identities = {}
+  resource_placeholders = {}
+  for node in graph_def.node:
+    if node.op == "ReadVariableOp":
+      # Get name of Placeholder op associated with ReadVariableOp. There can be
+      # an Identity in between the ReadVariableOp and Placeholder. Store the
+      # Identity ops with the associated dtypes.
+      input_name = get_name(node.input[0])
+      while map_name_to_node[input_name].op == "Identity":
+        resource_identities[input_name] = node.attr["dtype"]
+        input_name = get_name(map_name_to_node[input_name].input[0])
+      if map_name_to_node[input_name].op != "Placeholder":
+        raise ValueError("Cannot find the Placeholder op that is an input "
+                         "to the ReadVariableOp.")
+      # Build a map of Placeholder ops that are inputs to ReadVariableOps to the
+      # variable's dtype and data.
+      # TODO(nupurgarg): Confirm relationship between variables in
+      # `func.graph.variables` and ReadVariableOps in `graph_def.nodes`.
+      resource_placeholders[input_name] = {
+          "dtype": node.attr["dtype"],
+          "data": next(variables_generator).numpy(),
+      }
+
+  # Reconstruct the graph with constants in place of variables.
+  output_graph_def = graph_pb2.GraphDef()
+  how_many_converted = 0
+
+  for input_node in graph_def.node:
+    output_node = output_graph_def.node.add()
+    # Convert Placeholder ops that are inputs to ReadVariableOps into Const ops.
+    if input_node.name in resource_placeholders:
+      dtype = resource_placeholders[input_node.name]["dtype"]
+      data = resource_placeholders[input_node.name]["data"]
+
+      output_node.op = "Const"
+      output_node.name = input_node.name
+      output_node.attr["dtype"].CopyFrom(dtype)
+      output_node.attr["value"].tensor.CopyFrom(
+          tensor_util.make_tensor_proto(
+              data, dtype=dtype.type, shape=data.shape))
+      how_many_converted += 1
+    # Change the dtype for Identity ops that are inputs to ReadVariableOps.
+    elif input_node.name in resource_identities:
+      output_node.CopyFrom(input_node)
+      output_node.attr["T"].CopyFrom(resource_identities[input_node.name])
+    # Convert ReadVariableOps into Identity ops.
+    elif input_node.op == "ReadVariableOp":
+      output_node.op = "Identity"
+      output_node.name = input_node.name
+      output_node.input.extend([input_node.input[0]])
+      output_node.attr["T"].CopyFrom(input_node.attr["dtype"])
+      if "_class" in input_node.attr:
+        output_node.attr["_class"].CopyFrom(input_node.attr["_class"])
+    else:
+      output_node.CopyFrom(input_node)
+
+  logging.info("Converted %d variables to const ops.", how_many_converted)
+  return output_graph_def
diff --git a/tensorflow/python/framework/convert_to_constants_test.py b/tensorflow/python/framework/convert_to_constants_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd28dbaf463e4547448683662cf45fef3c0b5837
--- /dev/null
+++ b/tensorflow/python/framework/convert_to_constants_test.py
@@ -0,0 +1,191 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for convert_to_constants.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.python.client import session
+from tensorflow.python.eager import def_function
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import convert_to_constants
+from tensorflow.python.framework import importer
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.saved_model.load import load
+from tensorflow.python.saved_model.save import save
+from tensorflow.python.training.tracking import tracking
+
+
+class VariablesToConstantsTest(test.TestCase):
+
+  def _hasStatefulPartitionedCallOp(self, graph_def):
+    """Determines if a StatefulPartitionedCall op exists in the graph."""
+    for node in graph_def.node:
+      if node.op == "StatefulPartitionedCall":
+        return True
+    return False
+
+  def _getNumVariables(self, graph_def):
+    """Returns the number of ReadVariableOp in the graph."""
+    return sum(node.op == "ReadVariableOp" for node in graph_def.node)
+
+  def _getTensors(self, sess, tensor_list):
+    """Returns a list of Tensor objects from the Session."""
+    return [
+        sess.graph.get_tensor_by_name(tensor.name) for tensor in tensor_list
+    ]
+
+  def _evaluateGraphDef(self, graph_def, func, input_data):
+    """Evaluates the GraphDef using Sessions."""
+    with ops.Graph().as_default() as graph:
+      importer.import_graph_def(graph_def, name="")
+      func.add_to_graph(graph)
+      sess = session.Session(graph=graph)
+
+    input_tensors = self._getTensors(sess, func.inputs)
+    output_tensors = self._getTensors(sess, func.outputs)
+    return sess.run(
+        output_tensors, feed_dict=dict(zip(input_tensors, input_data)))
+
+  @test_util.run_v2_only
+  def testConstSavedModel(self):
+    """Test a basic model with functions to make sure functions are inlined."""
+    input_data = constant_op.constant(1., shape=[1])
+    root = tracking.AutoTrackable()
+    root.f = def_function.function(lambda x: 2. * x)
+    to_save = root.f.get_concrete_function(input_data)
+
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    save(root, save_dir, to_save)
+    saved_model = load(save_dir)
+    concrete_func = saved_model.signatures["serving_default"]
+
+    variable_graph_def = concrete_func.graph.as_graph_def()
+    self.assertEqual(0, self._getNumVariables(variable_graph_def))
+    self.assertTrue(variable_graph_def.library.function)
+
+    constant_graph_def = convert_to_constants.convert_variables_to_constants_v2(
+        concrete_func)
+    self.assertEqual(0, self._getNumVariables(constant_graph_def))
+    self.assertFalse(constant_graph_def.library.function)
+
+    # Check value.
+    expected_value = root.f(input_data)
+    actual_value = self._evaluateGraphDef(constant_graph_def, concrete_func,
+                                          [input_data.numpy()])
+    self.assertEqual(expected_value.numpy(), actual_value)
+
+  @test_util.run_v2_only
+  def testVariableModel(self):
+    """Test a basic model with Variables."""
+    input_data = constant_op.constant(1., shape=[1])
+    root = tracking.AutoTrackable()
+    root.v1 = variables.Variable(3.)
+    root.v2 = variables.Variable(2.)
+    root.f = def_function.function(lambda x: root.v1 * root.v2 * x)
+    concrete_func = root.f.get_concrete_function(input_data)
+
+    variable_graph_def = concrete_func.graph.as_graph_def()
+    self.assertEqual(2, self._getNumVariables(variable_graph_def))
+
+    constant_graph_def = convert_to_constants.convert_variables_to_constants_v2(
+        concrete_func)
+    self.assertEqual(0, self._getNumVariables(constant_graph_def))
+    self.assertFalse(self._hasStatefulPartitionedCallOp(constant_graph_def))
+
+    # Check value.
+    expected_value = root.f(input_data)
+    actual_value = self._evaluateGraphDef(constant_graph_def, concrete_func,
+                                          [input_data.numpy()])
+    self.assertEqual(expected_value.numpy(), actual_value)
+
+  @test_util.run_v2_only
+  def testVariableSavedModel(self):
+    """Test a basic model with Variables with saving/loading the SavedModel."""
+    input_data = constant_op.constant(1., shape=[1])
+    root = tracking.AutoTrackable()
+    root.v1 = variables.Variable(3.)
+    root.v2 = variables.Variable(2.)
+    root.f = def_function.function(lambda x: root.v1 * root.v2 * x)
+    to_save = root.f.get_concrete_function(input_data)
+
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    save(root, save_dir, to_save)
+    saved_model = load(save_dir)
+    concrete_func = saved_model.signatures["serving_default"]
+
+    variable_graph_def = concrete_func.graph.as_graph_def()
+    self.assertTrue(self._hasStatefulPartitionedCallOp(variable_graph_def))
+
+    constant_graph_def = convert_to_constants.convert_variables_to_constants_v2(
+        concrete_func)
+    self.assertEqual(0, self._getNumVariables(constant_graph_def))
+    self.assertFalse(self._hasStatefulPartitionedCallOp(constant_graph_def))
+
+    # Check value.
+    expected_value = root.f(input_data)
+    actual_value = self._evaluateGraphDef(constant_graph_def, concrete_func,
+                                          [input_data.numpy()])
+    self.assertEqual(expected_value.numpy(), actual_value)
+
+  @test_util.run_v2_only
+  def testMultiFunctionModel(self):
+    """Test a basic model with Variables."""
+
+    class BasicModel(tracking.AutoTrackable):
+
+      def __init__(self):
+        self.y = None
+        self.z = None
+
+      @def_function.function
+      def add(self, x):
+        if self.y is None:
+          self.y = variables.Variable(2.)
+        return x + self.y
+
+      @def_function.function
+      def sub(self, x):
+        if self.z is None:
+          self.z = variables.Variable(3.)
+        return x - self.z
+
+    input_data = constant_op.constant(1., shape=[1])
+    root = BasicModel()
+    concrete_func = root.add.get_concrete_function(input_data)
+
+    variable_graph_def = concrete_func.graph.as_graph_def()
+    self.assertEqual(1, self._getNumVariables(variable_graph_def))
+
+    constant_graph_def = convert_to_constants.convert_variables_to_constants_v2(
+        concrete_func)
+    self.assertEqual(0, self._getNumVariables(constant_graph_def))
+    self.assertFalse(self._hasStatefulPartitionedCallOp(constant_graph_def))
+
+    # Check value.
+    expected_value = root.add(input_data)
+    actual_value = self._evaluateGraphDef(constant_graph_def, concrete_func,
+                                          [input_data.numpy()])
+    self.assertEqual(expected_value.numpy(), actual_value)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/framework/device_test.py b/tensorflow/python/framework/device_test.py
index 0859e956ffd5a2c905837c5f6e68658d11403ae5..cd4b4ea51e62dd1c022316b30cb9203f089a92d3 100644
--- a/tensorflow/python/framework/device_test.py
+++ b/tensorflow/python/framework/device_test.py
@@ -18,8 +18,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import device
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 
 
@@ -116,6 +119,20 @@ class DeviceTest(test_util.TensorFlowTestCase):
         "/job:muu/device:MyFunnyDevice:2"))
     self.assertEquals("/job:muu/task:1/device:MyFunnyDevice:2", d.to_string())
 
+    if not context.executing_eagerly():
+      with ops.device(device.merge_device("/device:GPU:0")):
+        var1 = variables.Variable(1.0)
+        self.assertEquals("/device:GPU:0", var1.device)
+        with ops.device(device.merge_device("/job:worker")):
+          var2 = variables.Variable(1.0)
+          self.assertEquals("/job:worker/device:GPU:0", var2.device)
+          with ops.device(device.merge_device("/device:CPU:0")):
+            var3 = variables.Variable(1.0)
+            self.assertEquals("/job:worker/device:CPU:0", var3.device)
+            with ops.device(device.merge_device("/job:ps")):
+              var4 = variables.Variable(1.0)
+              self.assertEquals("/job:ps/device:CPU:0", var4.device)
+
   def testCanonicalName(self):
     self.assertEqual("/job:foo/replica:0",
                      device.canonical_name("/job:foo/replica:0"))
diff --git a/tensorflow/python/framework/dtypes.py b/tensorflow/python/framework/dtypes.py
index 9a4fe4e93b32aeedcb74cf0f7b2703f64d9db23a..6638be219d00f2a2bc4981a3b32eea184f2ef31d 100644
--- a/tensorflow/python/framework/dtypes.py
+++ b/tensorflow/python/framework/dtypes.py
@@ -282,9 +282,6 @@ class DType(object):
     """Returns the string name for this `DType`."""
     return _TYPE_TO_STRING[self._type_enum]
 
-  def __int__(self):
-    return self._type_enum
-
   def __str__(self):
     return "<dtype: %r>" % self.name
 
@@ -535,29 +532,47 @@ _np_qint32 = np.dtype([("qint32", np.int32, 1)])
 np_resource = np.dtype([("resource", np.ubyte, 1)])
 
 # Standard mappings between types_pb2.DataType values and numpy.dtypes.
-_NP_TO_TF = frozenset([
-    (np.float16, float16),
-    (np.float32, float32),
-    (np.float64, float64),
-    (np.int32, int32),
-    (np.int64, int64),
-    (np.uint8, uint8),
-    (np.uint16, uint16),
-    (np.uint32, uint32),
-    (np.uint64, uint64),
-    (np.int16, int16),
-    (np.int8, int8),
-    (np.complex64, complex64),
-    (np.complex128, complex128),
-    (np.object_, string),
-    (np.bool_, bool),
-    (_np_qint8, qint8),
-    (_np_quint8, quint8),
-    (_np_qint16, qint16),
-    (_np_quint16, quint16),
-    (_np_qint32, qint32),
-    (_np_bfloat16, bfloat16),
-])
+_NP_TO_TF = {
+    np.float16: float16,
+    np.float32: float32,
+    np.float64: float64,
+    np.int32: int32,
+    np.int64: int64,
+    np.uint8: uint8,
+    np.uint16: uint16,
+    np.uint32: uint32,
+    np.uint64: uint64,
+    np.int16: int16,
+    np.int8: int8,
+    np.complex64: complex64,
+    np.complex128: complex128,
+    np.object_: string,
+    np.string_: string,
+    np.unicode_: string,
+    np.bool_: bool,
+    _np_qint8: qint8,
+    _np_quint8: quint8,
+    _np_qint16: qint16,
+    _np_quint16: quint16,
+    _np_qint32: qint32,
+    _np_bfloat16: bfloat16,
+}
+
+# Map (some) NumPy platform dtypes to TF ones using their fixed-width
+# synonyms. Note that platform dtypes are not always simples aliases,
+# i.e. reference equality is not guaranteed. See e.g. numpy/numpy#9799.
+for pdt in [
+    np.intc,
+    np.uintc,
+    np.int_,
+    np.uint,
+    np.longlong,
+    np.ulonglong,
+]:
+  if pdt not in _NP_TO_TF:
+    _NP_TO_TF[pdt] = next(
+        _NP_TO_TF[dt] for dt in _NP_TO_TF if dt == pdt().dtype)
+
 _TF_TO_NP = {
     types_pb2.DT_HALF:
         np.float16,
@@ -664,6 +679,20 @@ _PYTHON_TO_TF = {
     builtins.object: string
 }
 
+_ANY_TO_TF = {}
+_ANY_TO_TF.update(_INTERN_TABLE)
+_ANY_TO_TF.update(_STRING_TO_TF)
+_ANY_TO_TF.update(_PYTHON_TO_TF)
+_ANY_TO_TF.update(_NP_TO_TF)
+
+# Ensure no collisions.
+assert len(_ANY_TO_TF) == sum(len(d) for d in [
+    _INTERN_TABLE,
+    _STRING_TO_TF,
+    _PYTHON_TO_TF,
+    _NP_TO_TF
+])
+
 
 @tf_export("dtypes.as_dtype", "as_dtype")
 def as_dtype(type_value):
@@ -684,36 +713,16 @@ def as_dtype(type_value):
   if isinstance(type_value, DType):
     return type_value
 
-  try:
-    return _INTERN_TABLE[type_value]
-  except KeyError:
-    pass
-
-  try:
-    return _STRING_TO_TF[type_value]
-  except KeyError:
-    pass
+  if isinstance(type_value, np.dtype):
+    try:
+      return _NP_TO_TF[type_value.type]
+    except KeyError:
+      pass
 
   try:
-    return _PYTHON_TO_TF[type_value]
+    return _ANY_TO_TF[type_value]
   except KeyError:
     pass
 
-  if isinstance(type_value, np.dtype):
-    # The numpy dtype for strings is variable length. We can not compare
-    # dtype with a single constant (np.string does not exist) to decide
-    # dtype is a "string" type. We need to compare the dtype.type to be
-    # sure it's a string type.
-    if type_value.type == np.string_ or type_value.type == np.unicode_:
-      return string
-
-  if isinstance(type_value, (type, np.dtype)):
-    for key, val in _NP_TO_TF:
-      try:
-        if key == type_value:
-          return val
-      except TypeError as e:
-        raise TypeError("Cannot convert {} to a dtype. {}".format(
-            type_value, e))
-
-  raise TypeError("Cannot convert value %r to a TensorFlow DType." % type_value)
+  raise TypeError(
+      "Cannot convert value %r to a TensorFlow DType." % type_value)
diff --git a/tensorflow/python/framework/dtypes_test.py b/tensorflow/python/framework/dtypes_test.py
index 719fdc0953ae4d5bbe016b3dc2730f5601c3494e..7dd2a792d1254027401d03b9dacddbb815cf4858 100644
--- a/tensorflow/python/framework/dtypes_test.py
+++ b/tensorflow/python/framework/dtypes_test.py
@@ -295,6 +295,9 @@ class TypesTest(test_util.TensorFlowTestCase):
     self.assertNotEqual(dtypes.int32, int)
     self.assertNotEqual(dtypes.float64, 2.1)
 
+  def testPythonLongConversion(self):
+    self.assertIs(dtypes.int64, dtypes.as_dtype(np.array(2**32).dtype))
+
   def testPythonTypesConversion(self):
     self.assertIs(dtypes.float32, dtypes.as_dtype(float))
     self.assertIs(dtypes.bool, dtypes.as_dtype(bool))
diff --git a/tensorflow/python/framework/error_interpolation.py b/tensorflow/python/framework/error_interpolation.py
index 37a634d80679b095d319cabcd29208a35c4fe44f..b671dfbfaa12ed47b2ca5de0a923280af95de2ef 100644
--- a/tensorflow/python/framework/error_interpolation.py
+++ b/tensorflow/python/framework/error_interpolation.py
@@ -31,7 +31,7 @@ import six
 
 from tensorflow.python.util import tf_stack
 
-_NAME_REGEX = r"[A-Za-z0-9.][A-Za-z0-9_.\-/]*?"
+_NAME_REGEX = r"[A-Za-z0-9_.][A-Za-z0-9_.\-/]*?"
 _TAG_REGEX = r"{{{{({name}) ({name})}}}}".format(name=_NAME_REGEX)
 _INTERPOLATION_REGEX = r"^(.*?)({tag})".format(tag=_TAG_REGEX)
 _INTERPOLATION_PATTERN = re.compile(_INTERPOLATION_REGEX, re.DOTALL)
@@ -41,11 +41,13 @@ _ParseTag = collections.namedtuple("_ParseTag", ["type", "name"])
 _BAD_FILE_SUBSTRINGS = [
     os.path.join("tensorflow", "python"),
     os.path.join("tensorflow", "contrib"),
+    os.path.join("tensorflow_estimator", "python"),
+    os.path.join("tensorflow_estimator", "contrib"),
     "<embedded",
 ]
 
 
-def _parse_message(message):
+def parse_message(message):
   """Parses the message.
 
   Splits the message into separators and tags. Tags are named tuples
@@ -177,9 +179,9 @@ def _compute_colocation_summary_from_op(op, prefix=""):
 
 
 def _find_index_of_defining_frame_for_op(op):
-  """Return index in op._traceback with first 'useful' frame.
+  """Return index in op.traceback with first 'useful' frame.
 
-  This method reads through the stack stored in op._traceback looking for the
+  This method reads through the stack stored in op.traceback looking for the
   innermost frame which (hopefully) belongs to the caller.  It accomplishes this
   by rejecting frames whose filename appears to come from TensorFlow (see
   error_interpolation._BAD_FILE_SUBSTRINGS for the list of rejected substrings).
@@ -189,15 +191,13 @@ def _find_index_of_defining_frame_for_op(op):
         location.
 
   Returns:
-    Integer index into op._traceback where the first non-TF file was found
+    Integer index into op.traceback where the first non-TF file was found
     (innermost to outermost), or 0 (for the outermost stack frame) if all files
     came from TensorFlow.
   """
-  # pylint: disable=protected-access
   # Index 0 of tf_traceback is the outermost frame.
-  tf_traceback = tf_stack.convert_stack(op._traceback)
+  tf_traceback = op.traceback
   size = len(tf_traceback)
-  # pylint: enable=protected-access
   filenames = [frame[tf_stack.TB_FILENAME] for frame in tf_traceback]
   # We process the filenames from the innermost frame to outermost.
   for idx, filename in enumerate(reversed(filenames)):
@@ -210,17 +210,49 @@ def _find_index_of_defining_frame_for_op(op):
 def _get_defining_frame_from_op(op):
   """Find and return stack frame where op was defined."""
   frame_index = _find_index_of_defining_frame_for_op(op)
-  # pylint: disable=protected-access
-  frame = op._traceback[frame_index]
-  # pylint: enable=protected-access
-  return frame
+  return op.traceback[frame_index]
+
+def compute_useful_stack(op):
+  """Return a list of line name and lineno pairs, which form a 'useful' stack.
 
+  Starting from the defining frame to the outermost one, this method computes
+  the contiguous portion of the 'useful' stack trace and returns each line as
+  a line name and lineno pair.
 
-def compute_field_dict(op):
+  Args:
+    op: op.Operation object having a _traceback member.
+
+  Returns:
+    A list of line name and lineno pairs. Below is an example of returned list:
+    [("tool_utils.py", "124", "func1", "a={}"), ("tool_utils.py", "21", "func2",
+    "for i in range(10):"), ....]
+  """
+  defining_frame_index = _find_index_of_defining_frame_for_op(op)
+  stack_trace = []
+  # The stack trace is collected from the defining (included) to the outermost.
+  # Include `frame_num` frames at most.
+  # Two lines from the TensorFlow library are included to show the node
+  # definition.
+  frame_num = 10
+  innermost_excluded = min(defining_frame_index + 2 + 1, len(op.traceback))
+  outermost_included = max(innermost_excluded - frame_num, 0)
+  for index in reversed(range(outermost_included, innermost_excluded)):
+    frame = op.traceback[index]
+    filename = frame[tf_stack.TB_FILENAME]
+    lineno = frame[tf_stack.TB_LINENO]
+    func = frame[tf_stack.TB_FUNCNAME]
+    code = frame[tf_stack.TB_CODEDICT]
+    stack_trace.append((filename, lineno, func, code))
+  return stack_trace
+
+
+def compute_field_dict(op, strip_file_prefix=""):
   """Return a dictionary mapping interpolation tokens to values.
 
   Args:
     op: op.Operation object having a _traceback member.
+    strip_file_prefix: The common path in the stacktrace. We remove the prefix
+    from the file names.
 
   Returns:
     A dictionary mapping string tokens to string values.  The keys are shown
@@ -248,6 +280,8 @@ def compute_field_dict(op):
   """
   frame = _get_defining_frame_from_op(op)
   filename = frame[tf_stack.TB_FILENAME]
+  if filename.startswith(strip_file_prefix):
+    filename = filename[len(strip_file_prefix):]
   lineno = frame[tf_stack.TB_LINENO]
   defined_at = " (defined at %s:%d)" % (filename, lineno)
   colocation_summary = _compute_colocation_summary_from_op(op)
@@ -265,11 +299,95 @@ def compute_field_dict(op):
   return field_dict
 
 
+def traceback_files_common_prefix(all_ops):
+  """Determines the common prefix from the paths of the stacktrace of 'all_ops'.
+
+  For example, if the paths are '/foo/bar/baz/' and '/foo/car', this would
+  return '/foo'.
+
+  Args:
+    all_ops: All the input nodes in the form of a list of lists of ops.
+
+  Returns:
+    The common prefix.
+  """
+  files = set()
+  for ops in all_ops:
+    if ops is None:
+      continue
+    for op in ops:
+      for frame in op.traceback:
+        filename = frame[tf_stack.TB_FILENAME]
+        if "<embedded" not in filename:
+          files.add(filename)
+  return os.path.split(os.path.commonprefix(list(files)))[0]
+
+
+def _sources_for_node(node, graph):
+  """Gets the input op nodes for 'node'.
+
+  Args:
+    node: The node.
+    graph: The graph containing the node.
+
+  Returns:
+    The unique input nodes.
+  """
+  inputs = set()
+  for name in node.node_def.input:
+    if name.startswith("^"):
+      name = name[1:]
+    try:
+      tensor = graph.get_tensor_by_name(name)
+      op = tensor.op
+    except (KeyError, ValueError):
+      try:
+        op = graph.get_operation_by_name(name)
+      except KeyError:
+        continue
+    inputs.add(op)
+
+  return list(inputs)
+
+
+def _build_error_message(op, input_ops, common_prefix):
+  """Returns the formatted error message for the given op.
+
+  Args:
+    op: The node.
+    input_ops: The input nodes to the 'op' node
+    common_prefix: The prefix path common to the stacktrace of inputs.
+
+  Returns:
+    The formatted error message for the given op. The error message also
+    includes the information about the input sources for the given op.
+  """
+  field_dict = compute_field_dict(op, common_prefix)
+  msg = "node %s%s " % (op.name, field_dict["defined_at"])
+  input_debug_info = []
+  # This stores the line numbers that we have already printed.
+  done = set()
+  done.add(field_dict["defined_at"])
+  for op_inp in input_ops:
+    field_dict_inp = compute_field_dict(op_inp, common_prefix)
+    if field_dict_inp["defined_at"] not in done:
+      input_debug_info.append(
+          " %s%s" % (op_inp.name, field_dict_inp["defined_at"]))
+      done.add(field_dict_inp["defined_at"])
+  if input_debug_info:
+    end_msg = ("\nInput Source operations connected to node %s:\n") % (op.name)
+    end_msg += "\t\n".join(input_debug_info)
+  else:
+    end_msg = ""
+  return msg, end_msg
+
+
 def interpolate(error_message, graph):
   """Interpolates an error message.
 
   The error message can contain tags of the form `{{type name}}` which will be
-  replaced.
+  replaced. For example: "{{node <name>}}" would get expanded to:
+  "node <name>(defined at <path>)".
 
   Args:
     error_message: A string to interpolate.
@@ -279,27 +397,45 @@ def interpolate(error_message, graph):
   Returns:
     The string with tags of the form {{type name}} interpolated.
   """
-  seps, tags = _parse_message(error_message)
+  seps, tags = parse_message(error_message)
   subs = []
-  end_msg = ""
+  end_msg = collections.defaultdict(list)
+  tagged_ops = []
 
   for t in tags:
     try:
       op = graph.get_operation_by_name(t.name)
     except KeyError:
       op = None
-
-    msg = "{{%s %s}}" % (t.type, t.name)
-    if op is not None:
-      field_dict = compute_field_dict(op)
-      if t.type == "node":
-        msg = "node %s%s " % (t.name, field_dict["defined_at"])
-      elif t.type == "colocation_node":
-        msg = "node %s%s having device %s " % (t.name, field_dict["defined_at"],
-                                               field_dict["devices"])
-        end_msg += "\n\n" + field_dict["devs_and_colocs"]
+    if op is None:
+      tagged_ops.append(None)
+    else:
+      tagged_ops.append([op] + _sources_for_node(op, graph))
+
+  common_prefix = traceback_files_common_prefix(tagged_ops)
+  for tag, ops in zip(tags, tagged_ops):
+    msg = "{{%s %s}}" % (tag.type, tag.name)
+    if ops is not None:
+      if tag.type == "node":
+        msg, source_msg = _build_error_message(ops[0], ops[1:], common_prefix)
+        if source_msg:
+          end_msg["source_nodes"].append(source_msg)
+      elif tag.type == "colocation_node":
+        field_dict = compute_field_dict(ops[0], common_prefix)
+        msg = "node %s%s placed on device %s " % (
+            ops[0].name, field_dict["defined_at"], field_dict["devices"])
+        end_msg["colocations"].append(field_dict["devs_and_colocs"])
+    if tag.type == "function_node":
+      msg = ""
     subs.append(msg)
-  subs.append(end_msg)
+
+  if "source_nodes" in end_msg:
+    subs.append("\n\nErrors may have originated from an input operation.")
+    subs.append("\n".join(end_msg["source_nodes"]))
+    end_msg.pop("source_nodes", None)
+  for k, messages in end_msg.items():
+    subs.append("Additional information about %s:" % k)
+    subs.append("\n".join(messages))
 
   return "".join(
       itertools.chain(*six.moves.zip_longest(seps, subs, fillvalue="")))
diff --git a/tensorflow/python/framework/error_interpolation_test.py b/tensorflow/python/framework/error_interpolation_test.py
index 9eaa4a5f2d04c8baaf720d4b9a32c5c707d33772..5ddbac72ff36e8baab13b4a90f955da6d2ef4ca7 100644
--- a/tensorflow/python/framework/error_interpolation_test.py
+++ b/tensorflow/python/framework/error_interpolation_test.py
@@ -19,12 +19,14 @@ from __future__ import division
 from __future__ import print_function
 
 import os
+import re
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import error_interpolation
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.framework import traceable_stack
+from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 from tensorflow.python.util import tf_stack
 
@@ -113,7 +115,7 @@ class ComputeColocationSummaryFromOpTest(test.TestCase):
     self.assertIn("No node-device colocations", summary)
 
 
-@test_util.run_v1_only("b/120545219")
+@test_util.run_deprecated_v1
 class InterpolateFilenamesAndLineNumbersTest(test.TestCase):
 
   def setUp(self):
@@ -195,7 +197,45 @@ class InterpolateFilenamesAndLineNumbersTest(test.TestCase):
     self.assertRegexpMatches(interpolated_string, "constant_op.py:[0-9]+.*")
 
 
-@test_util.run_v1_only("b/120545219")
+@test_util.run_deprecated_v1
+class InputNodesTest(test.TestCase):
+
+  def setUp(self):
+    # Add nodes to the graph for retrieval by name later.
+    one = constant_op.constant(1, name="One")
+    two = constant_op.constant(2, name="Two")
+    three = math_ops.add(one, two, name="Three")
+    self.graph = three.graph
+
+    # Change the list of bad file substrings so that constant_op.py is chosen
+    # as the defining stack frame for constant_op.constant ops.
+    self.old_bad_strings = error_interpolation._BAD_FILE_SUBSTRINGS
+    error_interpolation._BAD_FILE_SUBSTRINGS = [
+        "%sops.py" % os.sep,
+        "%sutil" % os.sep,
+    ]
+
+  def tearDown(self):
+    error_interpolation._BAD_FILE_SUBSTRINGS = self.old_bad_strings
+
+  def testNoInputs(self):
+    two_tags_with_seps = ";;;{{node One}},,,{{node Two}};;;"
+    interpolated_string = error_interpolation.interpolate(
+        two_tags_with_seps, self.graph)
+    expected_regex = (
+        r"^;;;.*constant_op.py:[0-9]+\) ,,,.*constant_op.py:[0-9]+\) ;;;$")
+    self.assertRegexpMatches(interpolated_string, expected_regex)
+
+  def testBasicInputs(self):
+    tag = ";;;{{node Three}};;;"
+    interpolated_string = error_interpolation.interpolate(tag, self.graph)
+    expected_regex = re.compile(
+        r"^;;;.*op_def_library.py:[0-9]+\) ;;;.*Input.*constant_op.py:[0-9]+\)",
+        re.DOTALL)
+    self.assertRegexpMatches(interpolated_string, expected_regex)
+
+
+@test_util.run_deprecated_v1
 class InterpolateDeviceSummaryTest(test.TestCase):
 
   def _fancy_device_function(self, unused_op):
@@ -239,7 +279,7 @@ class InterpolateDeviceSummaryTest(test.TestCase):
     self.assertRegexpMatches(result, expected_re)
 
 
-@test_util.run_v1_only("b/120545219")
+@test_util.run_deprecated_v1
 class InterpolateColocationSummaryTest(test.TestCase):
 
   def setUp(self):
@@ -264,13 +304,11 @@ class InterpolateColocationSummaryTest(test.TestCase):
 
     self.graph = node_three.graph
 
-  @test_util.run_v1_only("b/120545219")
   def testNodeThreeHasColocationInterpolation(self):
     message = "{{colocation_node Three_with_one}}"
     result = error_interpolation.interpolate(message, self.graph)
     self.assertIn("colocate_with(One)", result)
 
-  @test_util.run_v1_only("b/120545219")
   def testNodeFourHasColocationInterpolationForNodeThreeOnly(self):
     message = "{{colocation_node Four_with_three}}"
     result = error_interpolation.interpolate(message, self.graph)
@@ -279,14 +317,12 @@ class InterpolateColocationSummaryTest(test.TestCase):
         "One", result,
         "Node One should not appear in Four_with_three's summary:\n%s" % result)
 
-  @test_util.run_v1_only("b/120545219")
   def testNodeFiveHasColocationInterpolationForNodeOneAndTwo(self):
     message = "{{colocation_node Five_with_one_with_two}}"
     result = error_interpolation.interpolate(message, self.graph)
     self.assertIn("colocate_with(One)", result)
     self.assertIn("colocate_with(Two)", result)
 
-  @test_util.run_v1_only("b/120545219")
   def testColocationInterpolationForNodeLackingColocation(self):
     message = "{{colocation_node One}}"
     result = error_interpolation.interpolate(message, self.graph)
diff --git a/tensorflow/python/framework/errors_impl.py b/tensorflow/python/framework/errors_impl.py
index faa4fa7c6fa47f4328c6c04569aacde48b51b6c0..c473dfeedf8d232d5b5211fe5982ab4f8ea41fee 100644
--- a/tensorflow/python/framework/errors_impl.py
+++ b/tensorflow/python/framework/errors_impl.py
@@ -24,12 +24,28 @@ import warnings
 from tensorflow.core.lib.core import error_codes_pb2
 from tensorflow.python import pywrap_tensorflow as c_api
 from tensorflow.python.framework import c_api_util
+from tensorflow.python.framework import error_interpolation
 from tensorflow.python.util import compat
 from tensorflow.python.util import deprecation
 from tensorflow.python.util import tf_inspect
+from tensorflow.python.util import tf_stack
 from tensorflow.python.util.tf_export import tf_export
 
 
+def _compact_stack_trace(op):
+  """Returns a traceback for `op` with common file prefixes stripped."""
+  compact_traces = []
+  common_prefix = error_interpolation.traceback_files_common_prefix([[op]])
+  for frame in op.traceback:
+    frame = list(frame)
+    filename = frame[tf_stack.TB_FILENAME]
+    if filename.startswith(common_prefix):
+      filename = filename[len(common_prefix):]
+      frame[tf_stack.TB_FILENAME] = filename
+    compact_traces.append(tuple(frame))
+  return compact_traces
+
+
 @tf_export("errors.OpError", v1=["errors.OpError", "OpError"])
 @deprecation.deprecated_endpoints("OpError")
 class OpError(Exception):
@@ -94,9 +110,10 @@ class OpError(Exception):
 
   def __str__(self):
     if self._op is not None:
-      output = ["%s\n\nCaused by op %r, defined at:\n" % (self.message,
+      output = ["%s\n\nOriginal stack trace for %r:\n" % (self.message,
                                                           self._op.name,)]
-      curr_traceback_list = traceback.format_list(self._op.traceback)
+      curr_traceback_list = traceback.format_list(
+          _compact_stack_trace(self._op))
       output.extend(curr_traceback_list)
       # pylint: disable=protected-access
       original_op = self._op._original_op
@@ -106,7 +123,8 @@ class OpError(Exception):
             "\n...which was originally created as op %r, defined at:\n"
             % (original_op.name,))
         prev_traceback_list = curr_traceback_list
-        curr_traceback_list = traceback.format_list(original_op.traceback)
+        curr_traceback_list = traceback.format_list(
+            _compact_stack_trace(original_op))
 
         # Attempt to elide large common subsequences of the subsequent
         # stack traces.
@@ -136,8 +154,6 @@ class OpError(Exception):
         # pylint: disable=protected-access
         original_op = original_op._original_op
         # pylint: enable=protected-access
-      output.append("\n%s (see above for traceback): %s\n" %
-                    (type(self).__name__, self.message))
       return "".join(output)
     else:
       return self.message
@@ -495,7 +511,11 @@ def exception_type_from_error_code(error_code):
 
 @tf_export("errors.error_code_from_exception_type")
 def error_code_from_exception_type(cls):
-  return _EXCEPTION_CLASS_TO_CODE[cls]
+  try:
+    return _EXCEPTION_CLASS_TO_CODE[cls]
+  except KeyError:
+    warnings.warn("Unknown class exception")
+    return UnknownError(None, None, "Unknown class exception", None)
 
 
 def _make_specific_exception(node_def, op, message, error_code):
diff --git a/tensorflow/python/framework/errors_test.py b/tensorflow/python/framework/errors_test.py
index 574b126caeef87c5e05f4f08a9432b22d2f8040d..c044202d92ad549d48fd0f4d9ace79b4e9a8ef97 100644
--- a/tensorflow/python/framework/errors_test.py
+++ b/tensorflow/python/framework/errors_test.py
@@ -70,6 +70,10 @@ class ErrorsTest(test.TestCase):
           isinstance(
               errors_impl._make_specific_exception(None, None, None,
                                                    error_code), exc_type))
+      # error_code_from_exception_type and exception_type_from_error_code should
+      # be consistent with operation result.
+      self.assertEqual(error_code,
+                       errors_impl.error_code_from_exception_type(exc_type))
       # pylint: enable=protected-access
 
   def testKnownErrorClassForEachErrorCodeInProto(self):
@@ -98,6 +102,14 @@ class ErrorsTest(test.TestCase):
     self.assertTrue("Unknown error code: 37" in str(w[0].message))
     self.assertTrue(isinstance(exc, errors_impl.OpError))
 
+    with warnings.catch_warnings(record=True) as w:
+      # pylint: disable=protected-access
+      exc = errors_impl.error_code_from_exception_type("Unknown")
+      # pylint: enable=protected-access
+    self.assertEqual(1, len(w))
+    self.assertTrue("Unknown class exception" in str(w[0].message))
+    self.assertTrue(isinstance(exc, errors_impl.OpError))
+
   def testStatusDoesNotLeak(self):
     try:
       with errors.raise_exception_on_not_ok_status() as status:
diff --git a/tensorflow/python/framework/fast_tensor_util.pyx b/tensorflow/python/framework/fast_tensor_util.pyx
index 2e3e15f53a919bac669b56e4a8f27c1808da345a..dc14361d666637f6fc37fb6b39ed2ea313b2286a 100644
--- a/tensorflow/python/framework/fast_tensor_util.pyx
+++ b/tensorflow/python/framework/fast_tensor_util.pyx
@@ -131,4 +131,4 @@ def AppendBoolArrayToTensorProto(tensor_proto, nparray):
   cdef long i, n
   n = nparray.size
   for i in range(n):
-    tensor_proto.bool_val.append(np.asscalar(nparray[i]))
+    tensor_proto.bool_val.append(nparray.item(i))
diff --git a/tensorflow/python/framework/func_graph.py b/tensorflow/python/framework/func_graph.py
index bd4ed5553e7b0b2445344d5c36c2209e59d64d14..9097a8dd1f05de0dc271df18721557f979c44c29 100644
--- a/tensorflow/python/framework/func_graph.py
+++ b/tensorflow/python/framework/func_graph.py
@@ -18,15 +18,17 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
+import collections as py_collections
+import itertools
 import weakref
 
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.python.eager import context
+from tensorflow.python.eager import execute
 from tensorflow.python.eager import tape
 from tensorflow.python.eager.graph_only_ops import graph_placeholder
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework.auto_control_deps import AutomaticControlDependencies
 from tensorflow.python.ops import array_ops
@@ -35,7 +37,9 @@ from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.util import compat
+from tensorflow.python.util import memory
 from tensorflow.python.util import nest
+from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util.lazy_loader import LazyLoader
 
@@ -56,6 +60,60 @@ WHITELIST_COLLECTIONS = [
 ]
 
 
+class UnknownArgument(object):
+  """Signifies an argument which is not currently handled."""
+  pass
+
+
+def convert_structure_to_signature(structure, arg_names=None):
+  """Convert a potentially nested structure to a signature.
+
+  Args:
+    structure: Structure to convert, where top level collection is a list or a
+      tuple.
+    arg_names: Optional list of arguments that has equal number of elements as
+      `structure` and is used for naming corresponding TensorSpecs.
+
+  Returns:
+    Identical structure that has TensorSpec objects instead of Tensors and
+    UknownArgument instead of any unsupported types.
+  """
+
+  def encode_arg(arg, name=None):
+    """A representation for this argument, for converting into signatures."""
+    if isinstance(arg, ops.Tensor):
+      return tensor_spec.TensorSpec(arg.shape, arg.dtype, name)
+    if isinstance(arg, (
+        int,
+        float,
+        bool,
+        type(None),
+        dtypes.DType,
+        tensor_spec.TensorSpec,
+    )):
+      return arg
+    return UnknownArgument()
+
+  # We are using the flattened paths to name the TensorSpecs. We need an
+  # explicit name for them downstream.
+  flattened = nest.flatten_with_tuple_paths(structure)
+  if arg_names:
+    if len(arg_names) != len(structure):
+      raise ValueError(
+          "Passed in arg_names don't match actual signature (%s)." % arg_names)
+    # Replace all top-level names with their actual arg_names. If a path before
+    # was "(2,'a',1)", it will become "(arg_names[2],'a',1)".
+    flattened = [
+        ((arg_names[path[0]],) + path[1:], arg) for path, arg in flattened
+    ]
+
+  mapped = [
+      encode_arg(arg, "/".join([str(p) for p in path]))
+      for path, arg in flattened
+  ]
+  return nest.pack_sequence_as(structure, mapped)
+
+
 class FuncGraph(ops.Graph):
   """Graph representing a function body.
 
@@ -67,6 +125,9 @@ class FuncGraph(ops.Graph):
       inputs coming first.
     outputs: Tensors that will be returned by this function. The tensors are in
       this FuncGraph.
+    structured_input_signature: A tuple of (args, kwargs), which are both
+      possibly-nested python objects that were received by this function. Note
+      that these structures might contain Python `None`s.
     structured_outputs: A possibly-nested python object which will be returned
       by this function. The Tensors in this structure are the same as those of
       self.outputs. Note that this structure might contain Python `None`s.
@@ -76,9 +137,11 @@ class FuncGraph(ops.Graph):
     captures: Maps external tensor -> internal tensor (i.e. input placeholder).
       The entries are in the order they were captured.
     seed: The graph-level random seed.
+    capture_by_value: If True, the func graph will capture Variables by value
+      instead of reference.
   """
 
-  def __init__(self, name, read_only_collections=True):
+  def __init__(self, name, collections=None, capture_by_value=None):
     """Construct a new FuncGraph.
 
     The graph will inherit its graph key, collections, seed, and distribution
@@ -86,19 +149,36 @@ class FuncGraph(ops.Graph):
 
     Args:
       name: the name of the function.
-      read_only_collections: whether to not write function graph collections
-        back to default graph. Defaults to True.
+      collections: a dictionary of collections this FuncGraph should start
+        with. If not specified (None), the FuncGraph will read (but not write
+        to) the outer graph's collections that are not whitelisted, and both
+        read and write to the outer graph's collections that are whitelisted.
+        The current whitelisted collections are the global variables, the
+        local variables, and the trainable variables.
+        Defaults to None.
+      capture_by_value: An optional boolean. If True, the func graph will
+        capture Variables by value instead of reference. By default inherit
+        from outer graphs, and failing that will default to False.
     """
     super(FuncGraph, self).__init__()
 
     self.name = name
     self.inputs = []
     self.outputs = []
+    self.control_outputs = []
+    self.structured_input_signature = None
     self.structured_outputs = None
-    self._read_only_collections = read_only_collections
     self._weak_variables = []
     self.outer_graph = ops.get_default_graph()
-    self.captures = collections.OrderedDict()
+    self.captures = py_collections.OrderedDict()
+    # Inherit capture-by-value from outer graph.
+    if capture_by_value is not None:
+      self.capture_by_value = capture_by_value
+    elif self.outer_graph is not None and isinstance(
+        self.outer_graph, FuncGraph):
+      self.capture_by_value = self.outer_graph.capture_by_value
+    else:
+      self.capture_by_value = False
 
     self._building_function = True
     # Map from resource tensor name to last op (in program order) which uses
@@ -108,39 +188,19 @@ class FuncGraph(ops.Graph):
 
     graph = self.outer_graph
 
-    # pylint: disable=protected-access
-    # TODO(b/112906995, nareshmodi): distribution strategy depends on inheriting
-    # this stack from the default graph even in eager mode. Maybe it should be
-    # part of the eager context? This would also allow us to remove a
-    # get_default_graph() call from the function cache lookup.
-    self._distribution_strategy_stack = list(graph._distribution_strategy_stack)
-    # We ignore device placements from any outer scopes while tracing the
-    # function when possible, to avoid hard-coding them in the function
-    # graph. "Default" placements come from the PartitionedCallOp's placement,
-    # so that the same trace of the Python function may be placed on several
-    # different devices and saved functions may be placed on new devices when
-    # restored.
     if context.executing_eagerly():
       self.seed = context.global_seed()
       device_type = context.context().device_spec.device_type
       self._xla_compile = (device_type == "TPU" or device_type == "XLA_GPU"
                            or device_type == "XLA_CPU")
-      if self._distribution_strategy_stack or self._xla_compile:
-        self._add_device_to_stack(context.context().device_name)
     else:
       self.seed = graph.seed
       self._xla_compile = getattr(graph, "_xla_compile", False)
       # TODO(allenl): Figure out if we can remove colocation stack
       # specialization (currently used in cond_v2), here and in the cache key.
-      self._colocation_stack = graph._colocation_stack.copy()
-      if (self._distribution_strategy_stack
-          or self._xla_compile
-          or device_stack_has_callable(graph._device_function_stack)):
-        # Hard-code devices from device functions in the function body
-        self._device_function_stack = graph._device_function_stack.copy()
-    if not self._read_only_collections:
-      self._collections = graph._collections
-    else:
+      self._colocation_stack = graph._colocation_stack.copy()  # pylint: disable=protected-access
+
+    if collections is None:
       for collection_name in graph.get_all_collection_keys():
         if collection_name not in WHITELIST_COLLECTIONS:
           self._collections[collection_name] = graph.get_collection(
@@ -148,12 +208,61 @@ class FuncGraph(ops.Graph):
       for collection_name in WHITELIST_COLLECTIONS:
         self._collections[collection_name] = graph.get_collection_ref(
             collection_name)
-
-    self._variable_creator_stack = graph._variable_creator_stack
-    # Inherit the graph key, since this is used for matching variables in
-    # optimizers.
-    self._graph_key = graph._graph_key
-    # pylint: enable=protected-access
+    else:
+      self._collections = collections
+
+  def __str__(self):
+    return "FuncGraph(name=%s, id=%s)" % (self.name, id(self))
+
+  def as_default(self):
+    outer_cm = super(FuncGraph, self).as_default()
+
+    @tf_contextlib.contextmanager
+    def inner_cm():
+      """Context manager for copying distribute.Strategy scope information."""
+      graph = ops.get_default_graph()
+      # pylint: disable=protected-access
+      # TODO(b/112906995, nareshmodi): distribution strategy depends on
+      # inheriting this stack from the default graph even in eager mode. Maybe
+      # it should be part of the eager context? This would also allow us to
+      # remove a get_default_graph() call from the function cache lookup.
+      old_strategy_stack = self._distribution_strategy_stack
+      self._distribution_strategy_stack = list(
+          graph._distribution_strategy_stack)
+      # We ignore device placements from any outer scopes while tracing the
+      # function when possible, to avoid hard-coding them in the function
+      # graph. "Default" placements come from the PartitionedCallOp's placement,
+      # so that the same trace of the Python function may be placed on several
+      # different devices and saved functions may be placed on new devices when
+      # restored.
+      old_device_stack = self._device_function_stack
+      if context.executing_eagerly():
+        if self._distribution_strategy_stack or self._xla_compile:
+          self._add_device_to_stack(context.context().device_name)
+      else:
+        if (self._distribution_strategy_stack
+            or self._xla_compile
+            or device_stack_has_callable(graph._device_function_stack)):
+          # Hard-code devices from device functions in the function body
+          self._device_function_stack = graph._device_function_stack.copy()
+
+      old_creator_stack = self._variable_creator_stack
+      self._variable_creator_stack = graph._variable_creator_stack
+      # Inherit the graph key, since this is used for matching variables in
+      # optimizers.
+      old_graph_key = self._graph_key
+      self._graph_key = graph._graph_key
+      # pylint: enable=protected-access
+
+      with outer_cm as g:
+        try:
+          yield g
+        finally:
+          self._distribution_strategy_stack = old_strategy_stack
+          self._device_function_stack = old_device_stack
+          self._variable_creator_stack = old_creator_stack
+          self._graph_key = old_graph_key
+    return inner_cm()
 
   @property
   def output_types(self):
@@ -188,11 +297,39 @@ class FuncGraph(ops.Graph):
   def variables(self, var_list):
     self._weak_variables = [weakref.ref(v) for v in var_list]
 
+  def _capture_by_value(
+      self,
+      op_type,
+      inputs,
+      dtypes,  # pylint: disable=redefined-outer-name
+      input_types=None,
+      name=None,
+      attrs=None,
+      op_def=None,
+      compute_shapes=True,
+      compute_device=True):
+    # When capturing by value, do the read outside
+    reverse_captures = dict((v, k) for k, v in self.captures.items())
+    uncaptured_inputs = [reverse_captures.get(t, t) for t in inputs]
+    with ops.init_scope():
+      if context.executing_eagerly():
+        attr_list = ("dtype", int(attrs["dtype"].type))
+        value, = execute.execute(
+            compat.as_bytes(op_type), 1, uncaptured_inputs, attr_list,
+            context.context())
+      else:
+        op = ops.get_default_graph().create_op(
+            op_type, uncaptured_inputs, dtypes, input_types, name, attrs,
+            op_def, compute_shapes, compute_device)
+        value = op.outputs[0]
+    captured_value = self.capture(value)
+    return captured_value.op
+
   def create_op(
       self,
       op_type,
       inputs,
-      dtypes,
+      dtypes,  # pylint: disable=redefined-outer-name
       input_types=None,
       name=None,
       attrs=None,
@@ -231,6 +368,12 @@ class FuncGraph(ops.Graph):
     Returns:
       An `Operation` object.
     """
+    if self.capture_by_value and op_type in ["ReadVariableOp",
+                                             "ResourceGather"]:
+      return self._capture_by_value(
+          op_type, inputs, dtypes, input_types, name, attrs, op_def,
+          compute_shapes, compute_device)
+
     # This capturing logic interacts poorly with control flow contexts which
     # want to replace inputs of ops far too late in the process. This can lead
     # the context to get confused and try to create an Enter for an Enter. We
@@ -276,6 +419,19 @@ class FuncGraph(ops.Graph):
     if tensor.graph is not self:
       if name is None:
         name = tensor.op.name
+      inner_graph = tensor.graph
+      while inner_graph is not None and isinstance(inner_graph, FuncGraph):
+        if inner_graph is self:
+          raise ValueError(
+              "Trying to capture a tensor from an inner function. This can be "
+              "caused by accessing a tensor defined inside a loop or "
+              "conditional body, or a subfunction, from a calling function, "
+              "without going through the proper return value mechanism. "
+              "Consider using TensorFlow mechanisms such as TensorArrays "
+              "to return tensors from inner functions or loop / conditional "
+              "bodies. Tensor: %s; tensor graph: %s; this graph: %s"
+              % (tensor, tensor.graph, self))
+        inner_graph = inner_graph.outer_graph
       return self._capture_helper(tensor, name)
     return tensor
 
@@ -308,9 +464,13 @@ def func_graph_from_py_func(name,
                             signature=None,
                             func_graph=None,
                             autograph=False,
+                            autograph_options=None,
                             add_control_dependencies=True,
                             arg_names=None,
-                            op_return_value=None):
+                            op_return_value=None,
+                            collections=None,
+                            capture_by_value=None,
+                            override_flat_arg_shapes=None):
   """Returns a `FuncGraph` generated from `python_func`.
 
   Args:
@@ -329,6 +489,8 @@ def func_graph_from_py_func(name,
       this graph else a new one is built and returned.
     autograph: whether to use autograph to compile `python_func`.
       See https://www.tensorflow.org/guide/autograph for more information.
+    autograph_options: additional knobs to control when `autograph=True`.
+      See https://www.tensorflow.org/guide/autograph for more information.
     add_control_dependencies: If True, automatically adds control dependencies
       to ensure program order matches execution order and stateful ops always
       execute.
@@ -337,6 +499,22 @@ def func_graph_from_py_func(name,
     op_return_value: Optional. A Tensor. If set and `python_func` returns
       Operations, those return values will be replaced with this value. If not
       set, returning an Operation triggers an error.
+    collections: a dictionary of collections this FuncGraph should start
+      with. If not specified (None), the FuncGraph will read (but not write to)
+      the outer graph's collections that are not whitelisted, and both
+      read and write to the outer graph's collections that are whitelisted.
+      The current whitelisted collections are the global variables, the
+      local variables, and the trainable variables.
+      Defaults to None.
+    capture_by_value: An optional boolean. If True, the func graph will capture
+      Variables by value instead of reference. By default inherit from outer
+      graphs, and failing that will default to False.
+    override_flat_arg_shapes: An optional list of instances that are either
+      `None` or `TensorShape`.  The length must match that of
+      `nest.flatten((args, kwargs))`.  The entries containing value `None`
+      must match entries in flattened arguments containing non-tensors, while
+      entries containing a `TensorShape` must match entries in the flattened
+      arguments containing tensors.
 
   Returns:
     A FuncGraph.
@@ -344,35 +522,67 @@ def func_graph_from_py_func(name,
   Raises:
     TypeError: If any of `python_func`'s return values is neither `None` nor a
       `Tensor`.
+    ValueError: If both `signature` and `override_flat_arg_shapes` are
+      passed in.
   """
   if op_return_value is not None:
     assert isinstance(op_return_value, ops.Tensor), op_return_value
   if func_graph is None:
-    func_graph = FuncGraph(name)
+    func_graph = FuncGraph(name, collections=collections,
+                           capture_by_value=capture_by_value)
   assert isinstance(func_graph, FuncGraph)
   if add_control_dependencies:
-    control_manager = AutomaticControlDependencies
+    control_manager = AutomaticControlDependencies()
   else:
-    control_manager = ops.NullContextmanager
-  with func_graph.as_default(), control_manager() as a:
+    control_manager = ops.NullContextmanager()
+  with func_graph.as_default(), control_manager as a:
     current_scope = variable_scope.get_variable_scope()
     default_use_recource = current_scope.use_resource
     current_scope.set_use_resource(True)
 
+    if signature is not None and override_flat_arg_shapes is not None:
+      raise ValueError(
+          "Passed both signature and override_flat_arg_shapes: %s and %s."
+          % (signature, override_flat_arg_shapes))
+
     if signature is not None:
       args = signature
       kwargs = {}
 
     # Creates and names placeholders for all arguments.
-    func_args = _get_defun_inputs_from_args(args, arg_names)
-    func_kwargs = _get_defun_inputs_from_kwargs(kwargs)
+    if override_flat_arg_shapes is not None:
+      flat_args = nest.flatten(args)
+      arg_shapes = override_flat_arg_shapes[:len(flat_args)]
+      kwarg_shapes = override_flat_arg_shapes[len(flat_args):]
+    else:
+      arg_shapes = None
+      kwarg_shapes = None
+    func_args = _get_defun_inputs_from_args(
+        args, arg_names, flat_shapes=arg_shapes)
+    func_kwargs = _get_defun_inputs_from_kwargs(
+        kwargs, flat_shapes=kwarg_shapes)
+
+    # Convert all Tensors into TensorSpecs before saving the structured inputs.
+    # If storing pure concrete functions that are not called through polymorphic
+    # functions, we don't have access to FunctionSpec, so we need to call the
+    # TensorSpecs by their `arg_names` for later binding.
+    func_graph.structured_input_signature = (
+        convert_structure_to_signature(func_args, arg_names),
+        convert_structure_to_signature(func_kwargs))
+
+    flat_func_args = nest.flatten(func_args)
+    flat_func_kwargs = nest.flatten(func_kwargs)
+    # Temporarily set inputs to allow graph building code to inspect
+    # them. Reassigned below.
+    func_graph.inputs = [arg for arg in flat_func_args + flat_func_kwargs
+                         if isinstance(arg, ops.Tensor)]
 
     # Note: `nest.flatten` sorts by keys, as does `_deterministic_dict_values`.
     # Variables to help check whether mutation happens in calling the function
     # Copy the recursive list, tuple and map structure, but not base objects
-    func_args_before = nest.pack_sequence_as(func_args, nest.flatten(func_args))
+    func_args_before = nest.pack_sequence_as(func_args, flat_func_args)
     func_kwargs_before = nest.pack_sequence_as(
-        func_kwargs, nest.flatten(func_kwargs))
+        func_kwargs, flat_func_kwargs)
 
     def convert(x):
       """Converts a function output to a Tensor."""
@@ -386,7 +596,7 @@ def func_graph_from_py_func(name,
           x = array_ops.identity(op_return_value)
       elif not isinstance(x, tensor_array_ops.TensorArray):
         try:
-          x = ops.convert_to_tensor_or_indexed_slices(x)
+          x = ops.convert_to_tensor_or_composite(x)
         except (ValueError, TypeError):
           raise TypeError(
               "To be compatible with tf.contrib.eager.defun, Python functions "
@@ -404,14 +614,21 @@ def func_graph_from_py_func(name,
         _, original_func = tf_decorator.unwrap(python_func)
 
         def wrapper(*args, **kwargs):
+          # Note: functions annotated with @tf.function should always be
+          # converted even though they would meet autograph's whitelisting
+          # criteria.
+          # If this assumption is ever broken, converted_call will need to
+          # handle the possibility of original_func still being a shim, e.g.
+          # bound to WeakrefSelf.
           return autograph.converted_call(
               original_func, None,
               autograph.ConversionOptions(
                   verbose=autograph.Verbosity.BRIEF,
                   recursive=True,
                   strip_decorators=(def_function.function,),
-                  optional_features=(),
-              ), *args, **kwargs)
+                  optional_features=autograph_options,
+                  force_conversion=True,
+              ), args, kwargs)
 
         # Wrapping around a decorator allows checks like tf_inspect.getargspec
         # to be accurate.
@@ -440,7 +657,9 @@ def func_graph_from_py_func(name,
         # Even if an argument variable was not used in the function, we've
         # already manually captured the resource Tensor when creating argument
         # placeholders.
-        resource_placeholder = func_graph.captures.pop(arg.handle)
+        resource_placeholder = func_graph.captures.pop(arg.handle, None)
+        if resource_placeholder is None:
+          continue
         arg_variables.add(arg)
         inputs.append(resource_placeholder)
       elif isinstance(arg, ops.Tensor):
@@ -457,7 +676,10 @@ def func_graph_from_py_func(name,
 
     func_graph.variables = variables
 
-  # Register any other functions defined in the graph.
+  if add_control_dependencies:
+    func_graph.control_outputs.extend(control_manager.ops_which_must_run)
+
+# Register any other functions defined in the graph.
   with ops.init_scope():
     if context.executing_eagerly():
       for f in func_graph._functions.values():  # pylint: disable=protected-access
@@ -513,36 +735,25 @@ def flatten(sequence):
   Flattens non-tensor objects into their constituent tensors.
 
   Args:
-    sequence: A nested structure of Tensors, IndexedSlices, SparseTensors and
+    sequence: A nested structure of Tensors, CompositeTensors, and
       TensorArrays.
 
   Returns:
     A list of tensors.
   """
   # TODO(akshayka): Support `SparseTensor` in a similar fashion.
-  flat_sequence = nest.flatten(sequence)
-  outputs = []
-  for item in flat_sequence:
-    if isinstance(item, ops.IndexedSlices):
-      if item.dense_shape is not None:
-        outputs.extend([item.values, item.indices, item.dense_shape])
-      else:
-        outputs.extend([item.values, item.indices])
-    elif isinstance(item, sparse_tensor.SparseTensor):
-      outputs.extend([item.indices, item.values, item.dense_shape])
-    elif isinstance(item, tensor_array_ops.TensorArray):
-      outputs.append(item.flow)
-    else:
-      outputs.append(item)
-  return outputs
+  flat_sequence = nest.flatten(sequence, expand_composites=True)
+  return [
+      item.flow if isinstance(item, tensor_array_ops.TensorArray) else item
+      for item in flat_sequence]
 
 
 def pack_sequence_as(structure, flat_sequence):
   """Like `nest.pack_sequence_as` but also packs other Tensor-like objects.
 
   Args:
-    structure: The structure to pack into. May contain Tensors, IndexedSlices,
-      TensorArrays or SparseTensors.
+    structure: The structure to pack into. May contain Tensors,
+      CompositeTensors, or TensorArrays.
     flat_sequence: An iterable containing tensors.
 
   Returns:
@@ -551,33 +762,16 @@ def pack_sequence_as(structure, flat_sequence):
   Raises:
     AssertionError if `structure` and `flat_sequence` are not compatible.
   """
-  flattened_structure = nest.flatten(structure)
-  flat_sequence_with_slices_and_tas = []
-  index = 0
-  for t in flattened_structure:
-    if isinstance(t, ops.IndexedSlices):
-      if t.dense_shape is not None:
-        flat_sequence_with_slices_and_tas.append(
-            ops.IndexedSlices(*flat_sequence[index:index + 3]))
-        index += 3
-      else:
-        flat_sequence_with_slices_and_tas.append(
-            ops.IndexedSlices(*flat_sequence[index:index + 2]))
-        index += 2
-    elif isinstance(t, sparse_tensor.SparseTensor):
-      flat_sequence_with_slices_and_tas.append(
-          sparse_tensor.SparseTensor(*flat_sequence[index:index + 3]))
-      index += 3
-    elif isinstance(t, tensor_array_ops.TensorArray):
-      flow = flat_sequence[index]
-      ta = tensor_array_ops.build_ta_with_new_flow(t, flow)
-      flat_sequence_with_slices_and_tas.append(ta)
-      index += 1
-    else:
-      flat_sequence_with_slices_and_tas.append(flat_sequence[index])
-      index += 1
-  assert len(flattened_structure) == len(flat_sequence_with_slices_and_tas)
-  return nest.pack_sequence_as(structure, flat_sequence_with_slices_and_tas)
+  flat_sequence = list(flat_sequence)
+  flattened_structure = nest.flatten(structure, expand_composites=True)
+  if len(flattened_structure) != len(flat_sequence):
+    raise ValueError("Mismatch in element count")
+  for i in range(len(flat_sequence)):
+    if isinstance(flattened_structure[i], tensor_array_ops.TensorArray):
+      flat_sequence[i] = tensor_array_ops.build_ta_with_new_flow(
+          old_ta=flattened_structure[i], flow=flat_sequence[i])
+  return nest.pack_sequence_as(structure, flat_sequence, expand_composites=True)
+
 
 
 def _create_substitute_placeholder(value, name=None, dtype=None):
@@ -591,37 +785,73 @@ def _create_substitute_placeholder(value, name=None, dtype=None):
   return placeholder
 
 
-def _get_defun_inputs_from_args(args, names):
+def _get_defun_inputs_from_args(args, names, flat_shapes=None):
   """Maps Python function positional args to graph-construction inputs."""
-  return _get_defun_inputs(args, names, structure=args)
+  return _get_defun_inputs(
+      args, names, structure=args, flat_shapes=flat_shapes)
 
 
-def _get_defun_inputs(flat_args, names, structure):
+def _get_defun_inputs(args, names, structure, flat_shapes=None):
   """Maps python function args to graph-construction inputs.
 
   Args:
-    flat_args: A flat list of user-specified arguments.
+    args: A flat list of user-specified arguments.
     names: A list of strings with user-specified argument names, same length as
-      `flat_args`. May be `None`, in which case a generic name is used.
+      `args`. May be `None`, in which case a generic name is used.
     structure: The original argument list or dictionary.
+    flat_shapes: A flat list of values that are either `None` or
+      instances of `TensorShape`.  If provided, then length must match
+      that of `nest.flatten(args)`; and locations where `args` are
+      instances of `Tensor` must have a corresponding `TensorShape` in
+      `flat_shapes`.  May be `None`, in which case exact shapes are read
+      directly from the args.
 
   Returns:
     Placeholders with the same structure as `structure`.
+
+  Raises:
+    RuntimeError: if `flat_shapes` is provided, but
+     `len(flat_shapes) != len(nest.flatten(args))`.
+    RuntimeError: if a shape from `flat_shapes` is not None
+     for an argument that is not a `Tensor`, `TensorSpec`,
+     or `ResourceVariable`.
   """
   func_graph = ops.get_default_graph()
   function_inputs = []
   if names is None:
-    names = [None] * len(flat_args)
-  for arg_value, name in zip(flat_args, names):
+    names = [None] * len(args)
+  if flat_shapes is None:
+    shapes_iter = itertools.repeat(None)
+  else:
+    len_flat_args = len(nest.flatten(args))
+    if len_flat_args != len(flat_shapes):
+      raise RuntimeError(
+          "Length of fully flat shapes (%d) must match that of "
+          "flatten(args) (%d).  args: %s, flat_shapes: %s"
+          % (len(flat_shapes),
+             len_flat_args,
+             args,
+             flat_shapes))
+    shapes_iter = iter(flat_shapes)
+  for arg_value, name in zip(args, names):
     for arg in nest.flatten(arg_value):
+      # We have a shape entry for each arg, regadless of whether it's a real
+      # Tensor or not.  For non-tensor entries it should be None.
+      shape = next(shapes_iter)
       if isinstance(arg, (ops.Tensor, tensor_spec.TensorSpec)):
         if isinstance(arg, tensor_spec.TensorSpec) and arg.name:
           requested_name = arg.name
         else:
           requested_name = name
-        placeholder = graph_placeholder(
-            arg.dtype, arg.shape,
-            name=requested_name)
+        placeholder_shape = shape if shape is not None else arg.shape
+        try:
+          placeholder = graph_placeholder(
+              arg.dtype, placeholder_shape,
+              name=requested_name)
+        except ValueError:
+          # Sometimes parameter names are not valid op names, so fall back to
+          # unnamed placeholders.
+          placeholder = graph_placeholder(arg.dtype, placeholder_shape)
         if name is not None:
           # Record the requested/user-specified name in case it's different than
           # the uniquified name, for validation when exporting signatures.
@@ -640,15 +870,40 @@ def _get_defun_inputs(flat_args, names, structure):
             attr_value_pb2.AttrValue(s=compat.as_bytes(name)))
         function_inputs.append(arg)
       else:
+        if shape is not None:
+          raise RuntimeError(
+              "Expected provided shape override to be None for arg that isn't "
+              "a Tensor, but saw arg: '%s', shape: '%s'.  args: %s"
+              % (arg, shape, args))
         function_inputs.append(arg)
   return nest.pack_sequence_as(structure, function_inputs)
 
 
-def _get_defun_inputs_from_kwargs(kwargs):
+def _get_defun_inputs_from_kwargs(kwargs, flat_shapes):
   """Maps Python function keyword args to graph-construction inputs."""
   if kwargs:
-    names, flat_args = zip(*sorted(kwargs.items()))
+    names, args = zip(*sorted(kwargs.items()))
   else:
     names = []
-    flat_args = []
-  return _get_defun_inputs(flat_args, names, structure=kwargs)
+    args = []
+  return _get_defun_inputs(
+      args, names, structure=kwargs, flat_shapes=flat_shapes)
+
+
+def dismantle_func_graph(func_graph):
+  """Removes reference cycles in `func_graph` FuncGraph.
+
+  Helpful for making sure the garbage collector doesn't need to run when
+  the FuncGraph goes out of scope, e.g. in tests using defun with
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True).
+
+  Args:
+    func_graph: A `FuncGraph` object to destroy. `func_graph` is unusable
+      after this function.
+  """
+  # TODO(b/115366440): Delete this method when a custom OrderedDict is added.
+  # Clearing captures using clear() leaves some cycles around.
+  while func_graph.captures:
+    func_graph.captures.popitem()
+  memory.dismantle_ordered_dict(func_graph.captures)
+  ops.dismantle_graph(func_graph)
diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py
index cfdc915a1b34930b8f5205550c547d0eec331e52..7002f163d131545a632cc90e06f31a8788901fae 100644
--- a/tensorflow/python/framework/function.py
+++ b/tensorflow/python/framework/function.py
@@ -129,13 +129,15 @@ class Defun(object):
   def __call__(self, func):
     # Various sanity checks on the callable func.
     if not callable(func):
-      raise ValueError("func %s must be callable" % func)
+      raise ValueError("function %s must be callable" % func)
 
     # Func should not use kwargs and defaults.
     argspec = tf_inspect.getargspec(func)
     if argspec.keywords or argspec.defaults:
-      raise ValueError("Functions with argument defaults or keywords "
-                       "arguments are not supported.")
+      raise ValueError(
+          "function with argument defaults or keywords arguments are not"
+          " supported. {} has defaults {} and keywords {}.".format(
+              func, argspec.defaults, argspec.keywords))
 
     # Computes how many arguments 'func' has.
     min_args = len(argspec.args)
@@ -210,6 +212,7 @@ class _DefinedFunction(object):
                shape_func=None,
                capture_by_value=False,
                whitelisted_stateful_ops=None,
+               capture_resource_var_by_value=True,
                **kwargs):
     """Creates _DefinedFunction.
 
@@ -232,6 +235,8 @@ class _DefinedFunction(object):
         will be copied into the function body.
       whitelisted_stateful_ops: A set of ops that if stateful we ignore and
         copy into the function body, when `capture_by_value` is True.
+      capture_resource_var_by_value: Boolean (defaults to True). If False,
+        captured resource variable returns the handle instead of value.
       **kwargs: The keyword arguments. **kwargs is passed to every call
         site of this function.
 
@@ -250,6 +255,7 @@ class _DefinedFunction(object):
     self._whitelisted_stateful_ops = whitelisted_stateful_ops
     if self._whitelisted_stateful_ops is None:
       self._whitelisted_stateful_ops = set()
+    self._capture_resource_var_by_value = capture_resource_var_by_value
     self._extra_kwargs = kwargs
     # Constructed only when C API is disabled, lazily
     self._definition = None
@@ -352,7 +358,8 @@ class _DefinedFunction(object):
         self._func_name,
         self._capture_by_value,
         self._caller_device,
-        whitelisted_stateful_ops=self._whitelisted_stateful_ops)
+        whitelisted_stateful_ops=self._whitelisted_stateful_ops,
+        capture_resource_var_by_value=self._capture_resource_var_by_value)
 
     self._extra_inputs = temp_graph.extra_inputs
     # pylint: disable=protected-access
@@ -407,6 +414,8 @@ class _DefinedFunction(object):
           [t._as_tf_output() for t in temp_graph.inputs],
           [t._as_tf_output() for t in temp_graph.outputs],
           output_names,
+          [], # control_outputs
+          [], # control_output_names
           None,  # opts
           description)
       self._c_func = c_api_util.ScopedTFFunction(c_func)
@@ -636,11 +645,12 @@ class _FuncGraph(ops.Graph):
   function argument and the caller passes in the captured tensor.
   """
 
-  def __init__(self, name, capture_by_value, whitelisted_stateful_ops, *args,
-               **kwargs):
+  def __init__(self, name, capture_by_value, whitelisted_stateful_ops,
+               capture_resource_var_by_value, *args, **kwargs):
     super(_FuncGraph, self).__init__(*args, **kwargs)
     self._capture_by_value = capture_by_value
     self._whitelisted_stateful_ops = whitelisted_stateful_ops
+    self._capture_resource_var_by_value = capture_resource_var_by_value
     self._building_function = True
     self._outer_graph = ops.get_default_graph()
     self._vscope = vs.get_variable_scope()
@@ -735,7 +745,8 @@ class _FuncGraph(ops.Graph):
           collections=collections,
           use_resource=use_resource)
       self.extra_vars.append(var)
-      if isinstance(var, resource_variable_ops.ResourceVariable):
+      if (isinstance(var, resource_variable_ops.ResourceVariable) and
+          self._capture_resource_var_by_value):
         # For resource-based variables read the variable outside the function
         # and pass in the value. This ensures that the function is pure and
         # differentiable. TODO(apassos) this may have performance problems if
@@ -830,7 +841,8 @@ def func_graph_from_py_func(func,
                             container=None,
                             collections_ref=None,
                             arg_shapes=None,
-                            whitelisted_stateful_ops=None):
+                            whitelisted_stateful_ops=None,
+                            capture_resource_var_by_value=True):
   """Returns a _FuncGraph generated from `func`.
 
   Args:
@@ -850,6 +862,8 @@ def func_graph_from_py_func(func,
     arg_shapes: A sequence of the function's argument shapes.
     whitelisted_stateful_ops: A set of ops that if stateful we ignore and
       re-create.
+    capture_resource_var_by_value: Boolean (defaults to True). If False,
+      captured resource variable returns the handle instead of value.
 
   Returns:
     A _FuncGraph.
@@ -859,7 +873,8 @@ def func_graph_from_py_func(func,
   """
   if not name:
     name = function_utils.get_func_name(func)
-  func_graph = _FuncGraph(name, capture_by_value, whitelisted_stateful_ops)
+  func_graph = _FuncGraph(name, capture_by_value, whitelisted_stateful_ops,
+                          capture_resource_var_by_value)
 
   with func_graph.as_default(), ops.device(device):
     # pylint: disable=protected-access
@@ -993,17 +1008,18 @@ def _call(sig, *inputs, **kwargs):
   name = kwargs.pop("name", None)
   g = ops.get_default_graph()
   func_name = sig.name
+  if name is None:
+    name = func_name
   attrs = _parse_kwargs_as_attrs(func_name, **kwargs)
   output_types = [dtypes.DType(x.type) for x in sig.output_arg]
-  with ops.name_scope(name, func_name, inputs) as name:
-    op = g.create_op(
-        func_name,
-        list(inputs),
-        output_types,
-        name=name,
-        attrs=attrs,
-        op_def=sig,
-        compute_shapes=False)
+  op = g.create_op(
+      func_name,
+      list(inputs),
+      output_types,
+      name=name,
+      attrs=attrs,
+      op_def=sig,
+      compute_shapes=False)
   if op.outputs:
     if len(op.outputs) == 1:
       ret = op.outputs[0]
@@ -1046,12 +1062,13 @@ def _from_definition(fdef, grad_func=None):
   c_func = c_api.TF_FunctionImportFunctionDef(serialized)
   result._c_func = c_api_util.ScopedTFFunction(c_func)
   result._extra_inputs = []
+  result._op_def = fdef.signature
   # pylint: enable=protected-access
 
   return result
 
 
-def _from_library(lib):
+def from_library(lib):
   """Creates _DefinedFunctions initialized from a FunctionDefLibrary proto.
 
   This method handles assigning the correct gradient functions to each
diff --git a/tensorflow/python/framework/function_def_to_graph.py b/tensorflow/python/framework/function_def_to_graph.py
index 4d1aabde06984ded2a6e04d549538bc0afdbdc75..aa670f1e37941689624797d31b64ffff63408c0b 100644
--- a/tensorflow/python/framework/function_def_to_graph.py
+++ b/tensorflow/python/framework/function_def_to_graph.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import types_pb2
 from tensorflow.core.framework import versions_pb2
+from tensorflow.python.eager import context
 from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import versions
@@ -76,6 +77,14 @@ def function_def_to_graph(fdef, input_shapes=None):
   return func_graph
 
 
+def _is_function(fname):
+  """Checks for a function definition with `fname` in the current context."""
+  if context.executing_eagerly():
+    return context.context().has_function(fname)
+  else:
+    return ops.get_default_graph()._is_function(fname)  # pylint: disable=protected-access
+
+
 def function_def_to_graph_def(fdef, input_shapes=None):
   """Convert a FunctionDef to a GraphDef.
 
@@ -147,12 +156,12 @@ def function_def_to_graph_def(fdef, input_shapes=None):
     for attr in op_def.attr:
       if attr.type == "func":
         fname = node_def.attr[attr.name].func.name
-        if not ops.get_default_graph()._is_function(fname):  # pylint: disable=protected-access
+        if not _is_function(fname):
           raise ValueError("%s function not found." % fname)
       elif attr.type == "list(func)":
         for fn in node_def.attr[attr.name].list.func:
           fname = fn.name
-          if not ops.get_default_graph()._is_function(fname):  # pylint: disable=protected-access
+          if not _is_function(fname):
             raise ValueError("%s function not found." % fname)
 
     # Iterate over output_args in op_def to build the map.
@@ -168,8 +177,8 @@ def function_def_to_graph_def(fdef, input_shapes=None):
         flat_name = "{}:{}".format(node_def.name, flattened_index)
         nested_to_flat_tensor_name[nested_name] = flat_name
         flattened_index += 1
-      control_name = "^" + node_def.name
-      nested_to_flat_tensor_name[control_name] = control_name
+    control_name = "^" + node_def.name
+    nested_to_flat_tensor_name[control_name] = control_name
 
   # Update inputs of all nodes in graph.
   for node_def in graph_def.node:
diff --git a/tensorflow/python/framework/function_def_to_graph_test.py b/tensorflow/python/framework/function_def_to_graph_test.py
index ddf1a6e74d2f7772c94dc5b39034a28ba0d715b2..d1dc46d6f823911fea35121cce0fed3d9b38d183 100644
--- a/tensorflow/python/framework/function_def_to_graph_test.py
+++ b/tensorflow/python/framework/function_def_to_graph_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.framework import test_ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
@@ -225,12 +226,15 @@ class FunctionDefToGraphDefTest(test.TestCase):
 
   def testControlDependencies(self):
 
+    v = variables.Variable(1)
+
     @function.defun
     def fn(inp):
+      assign = v.assign(3, name="assign", read_value=False)
       x = constant_op.constant(2.0, name="x")
       # TODO(b/79881896): Test external control dependency once that's
       # supported.
-      with ops.control_dependencies([x, inp]):
+      with ops.control_dependencies([x, inp, assign]):
         constant_op.constant(3.0, name="y")
       return 4.0
 
@@ -239,9 +243,10 @@ class FunctionDefToGraphDefTest(test.TestCase):
     func_graph = function_def_to_graph.function_def_to_graph(fdef)
 
     op = func_graph.get_operation_by_name("y")
-    self.assertEqual(len(op.control_inputs), 2)
+    self.assertEqual(len(op.control_inputs), 3)
     self.assertEqual(op.control_inputs[0].name, "x")
     self.assertEqual(op.control_inputs[1].name, "inp")
+    self.assertEqual(op.control_inputs[2].name, "assign")
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py
index 6ec71ba8e9053000629ce0cd0e020494adabfe2d..cd623223e32c29c48b4b338bf508a9cabd02c643 100644
--- a/tensorflow/python/framework/function_test.py
+++ b/tensorflow/python/framework/function_test.py
@@ -284,6 +284,7 @@ class FunctionTest(test.TestCase):
         out, = sess.run(dlogits, {logits: x, labels: y})
       self.assertAllClose(out, np.exp(prob - y))
 
+  @test_util.disable_xla("b/124286351")  # No error is raised
   def testCustomGradientError(self):
     dtype = dtypes.float32
 
@@ -1287,7 +1288,7 @@ class FunctionsFromProtos(test.TestCase):
       gradients_impl.gradients([f1, f2, f3, f4], c)
 
     library = g.as_graph_def().library
-    new_funcs = function._from_library(library)
+    new_funcs = function.from_library(library)
 
     def CheckNewFunc(func):
       new_func = [f for f in new_funcs if f.name == func.name]
@@ -1303,7 +1304,7 @@ class FunctionsFromProtos(test.TestCase):
 
   def testFromLibraryEmptyLib(self):
     library = function_pb2.FunctionDefLibrary()
-    self.assertEqual(len(function._from_library(library)), 0)
+    self.assertEqual(len(function.from_library(library)), 0)
 
   def testFromLibraryMissingFuncDef(self):
 
@@ -1327,7 +1328,7 @@ class FunctionsFromProtos(test.TestCase):
     with self.assertRaisesRegexp(
         ValueError,
         "FunctionDefLibrary missing 'G1_[0-9a-zA-Z]{8,11}' FunctionDef"):
-      function._from_library(library)
+      function.from_library(library)
 
     # Create invalid function def that is missing F1 function def
     library = function_pb2.FunctionDefLibrary()
@@ -1337,7 +1338,7 @@ class FunctionsFromProtos(test.TestCase):
     with self.assertRaisesRegexp(
         ValueError,
         "FunctionDefLibrary missing 'F1_[0-9a-zA-Z]{8,11}' FunctionDef"):
-      function._from_library(library)
+      function.from_library(library)
 
   def testFromLibraryCyclicGradFuncs(self):
 
@@ -1366,7 +1367,7 @@ class FunctionsFromProtos(test.TestCase):
 
     with self.assertRaisesRegexp(
         ValueError, "FunctionDefLibrary contains cyclic gradient functions!"):
-      function._from_library(library)
+      function.from_library(library)
 
   def testExperimentalAttrs(self):
 
diff --git a/tensorflow/python/framework/graph_util_impl.py b/tensorflow/python/framework/graph_util_impl.py
index 1b61ac925ce3d555525c9086172d43c75a3af10c..f0cd1647d0e2aa7fd993165384f616ce02228909 100644
--- a/tensorflow/python/framework/graph_util_impl.py
+++ b/tensorflow/python/framework/graph_util_impl.py
@@ -143,13 +143,14 @@ def _bfs_for_reachable_nodes(target_nodes, name_to_input_name):
   # Breadth first search to find all the nodes that we should keep.
   next_to_visit = target_nodes[:]
   while next_to_visit:
-    n = next_to_visit[0]
+    node = next_to_visit[0]
     del next_to_visit[0]
-    if n in nodes_to_keep:
+    if node in nodes_to_keep:
       # Already visited this node.
       continue
-    nodes_to_keep.add(n)
-    next_to_visit += name_to_input_name[n]
+    nodes_to_keep.add(node)
+    if node in name_to_input_name:
+      next_to_visit += name_to_input_name[node]
   return nodes_to_keep
 
 
@@ -196,7 +197,7 @@ def extract_sub_graph(graph_def, dest_nodes):
 
 @deprecation.deprecated(
     date=None,
-    instructions="Use tf.compat.v1.graph_util.remove_training_nodes")
+    instructions="Use tf.compat.v1.graph_util.tensor_shape_from_node_def_name")
 @tf_export(v1=["graph_util.tensor_shape_from_node_def_name"])
 def tensor_shape_from_node_def_name(graph, input_name):
   """Convenience function to get a shape from a NodeDef's input string."""
@@ -352,19 +353,27 @@ def remove_training_nodes(input_graph, protected_nodes=None):
     nodes_after_removal.append(new_node)
 
   types_to_splice = {"Identity": True}
+  control_input_names = set()
+  node_names_with_control_input = set()
+  for node in nodes_after_removal:
+    for node_input in node.input:
+      if "^" in node_input:
+        control_input_names.add(node_input.replace("^", ""))
+        node_names_with_control_input.add(node.name)
+
   names_to_splice = {}
   for node in nodes_after_removal:
     if node.op in types_to_splice and node.name not in protected_nodes:
       # We don't want to remove nodes that have control edge inputs, because
       # they might be involved in subtle dependency issues that removing them
       # will jeopardize.
-      has_control_edge = False
-      for input_name in node.input:
-        if re.match(r"^\^", input_name):
-          has_control_edge = True
-      if not has_control_edge:
+      if node.name not in node_names_with_control_input:
         names_to_splice[node.name] = node.input[0]
 
+  # We also don't want to remove nodes which are used as control edge inputs.
+  names_to_splice = {name: value for name, value in names_to_splice.items()
+                     if name not in control_input_names}
+
   nodes_after_splicing = []
   for node in nodes_after_removal:
     if node.name in names_to_splice:
diff --git a/tensorflow/python/framework/graph_util_test.py b/tensorflow/python/framework/graph_util_test.py
index dd26b8a78e9d2e13b34770775fcb1219745396e0..78777dc87724ab202e267a3aab4666c81465de59 100644
--- a/tensorflow/python/framework/graph_util_test.py
+++ b/tensorflow/python/framework/graph_util_test.py
@@ -308,8 +308,9 @@ class DeviceFunctionsTest(test.TestCase):
       new_node.input.extend([input_name])
     return new_node
 
-  def create_constant_node_def(self, name, value, dtype, shape=None):
-    node = self.create_node_def("Const", name, [])
+  def create_constant_node_def(self, name, value, dtype,
+                               shape=None, inputs=None):
+    node = self.create_node_def("Const", name, inputs or [])
     self.set_attr_dtype(node, "dtype", dtype)
     self.set_attr_tensor(node, "value", value, dtype, shape)
     return node
@@ -393,6 +394,18 @@ class DeviceFunctionsTest(test.TestCase):
     self.assertProtoEquals(expected_graph_def,
                            graph_util.remove_training_nodes(graph_def))
 
+  def testRemoveIdentityUsedAsControlInputInConst(self):
+    """Check that Identity nodes used as control inputs are not removed."""
+    graph_def = graph_pb2.GraphDef()
+    graph_def.node.extend([
+        self.create_constant_node_def("C", 1, dtypes.float32, inputs=["^I"]),
+        self.create_node_def("Identity", "I", ["Base"]),
+        self.create_node_def("BaseOp", "Base", [])
+    ])
+
+    self.assertProtoEquals(graph_def,
+                           graph_util.remove_training_nodes(graph_def))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/framework/importer.py b/tensorflow/python/framework/importer.py
index 98c7aeccc4b19edfc433a6556108ef8b77d12aa4..e6f86f7f932db2955479d785b1b39ebf3e0c7210 100644
--- a/tensorflow/python/framework/importer.py
+++ b/tensorflow/python/framework/importer.py
@@ -28,6 +28,7 @@ from tensorflow.python.framework import errors
 from tensorflow.python.framework import function
 from tensorflow.python.framework import op_def_registry
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_util
 from tensorflow.python.util import compat
 from tensorflow.python.util.deprecation import deprecated_args
 from tensorflow.python.util.tf_export import tf_export
@@ -266,7 +267,7 @@ def _ProcessNewOps(graph):
         coloc_op = graph._get_operation_by_name_unsafe(coloc_op_name)  # pylint: disable=protected-access
       except KeyError:
         # Do not error in TF2 if the colocation cannot be guaranteed
-        if tf2.enabled():
+        if tf2.enabled() or control_flow_util.EnableControlFlowV2(graph):
           continue
 
         raise ValueError('Specified colocation to an op that '
@@ -442,11 +443,9 @@ def import_graph_def(graph_def,
     _ProcessNewOps(graph)
 
   if graph_def.library and graph_def.library.function:
-    # pylint: disable=protected-access
-    functions = function._from_library(graph_def.library)
+    functions = function.from_library(graph_def.library)
     for f in functions:
       f.add_to_graph(graph)
-    # pylint: enable=protected-access
 
   # Treat input mappings that don't appear in the graph as an error, because
   # they are likely to be due to a typo.
diff --git a/tensorflow/python/framework/is_xla_test_true.py b/tensorflow/python/framework/is_xla_test_true.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ae1e68b36bdd38bf01eae7feb0d90db3cb9f197
--- /dev/null
+++ b/tensorflow/python/framework/is_xla_test_true.py
@@ -0,0 +1,29 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Including this as a dependency will result in Tensorflow tests using XLA.
+
+This function is defined by default in test_util.py to False. The test_util then
+attempts to import this module. If this file is made available through the BUILD
+rule, then this function is overridden and will instead cause Tensorflow graphs
+to be compiled with XLA.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+def is_xla_enabled():
+  """Returns true to state XLA should be enabled for Tensorflow tests."""
+  return True
diff --git a/tensorflow/python/framework/meta_graph.py b/tensorflow/python/framework/meta_graph.py
index ddf6f66e8ab5e17aa611cce40b01953fb7a5d3b1..fc566ce0b24fa52c712fe5f64357b066e5e41a08 100644
--- a/tensorflow/python/framework/meta_graph.py
+++ b/tensorflow/python/framework/meta_graph.py
@@ -29,10 +29,12 @@ from google.protobuf import text_format
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import op_def_pb2
+from tensorflow.core.protobuf import graph_debug_info_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.core.protobuf import saver_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import context
+from tensorflow.python.framework import error_interpolation
 from tensorflow.python.framework import graph_io
 from tensorflow.python.framework import importer
 from tensorflow.python.framework import op_def_registry
@@ -509,6 +511,53 @@ def strip_graph_default_valued_attrs(meta_graph_def):
   meta_graph_def.meta_info_def.stripped_default_attrs = True
 
 
+def create_graph_debug_info_def(operations):
+  """Construct and returns a `GraphDebugInfo` protocol buffer.
+
+  Args:
+    operations: An iterable of op.Operation objects having _traceback members.
+
+  Returns:
+    GraphDebugInfo protocol buffer.
+
+  Raises:
+    TypeError: If the arguments are not of the correct proto buffer type.
+  """
+  # Creates an empty GraphDebugInfoDef proto.
+  graph_debug_info_def = graph_debug_info_pb2.GraphDebugInfo()
+
+  # Gets the file names and line numbers for the exported node names. Also
+  # collects the unique file names.
+  all_file_names = set()
+  node_to_trace = {}
+  for op in operations:
+    # Gets the stack trace of the operation and then the file location.
+    node_name = op.name
+    node_to_trace[node_name] = error_interpolation.compute_useful_stack(op)
+    for trace in node_to_trace[node_name]:
+      all_file_names.add(trace[0])
+
+  # Sets the `files` field in the GraphDebugInfo proto
+  graph_debug_info_def.files.extend(all_file_names)
+
+  # Builds a mapping between file names and index of the `files` field, so we
+  # only store the indexes for the nodes in the GraphDebugInfo.
+  file_to_index = dict(
+      [(y, x) for x, y in enumerate(graph_debug_info_def.files)])
+
+  # Creates the FileLineCol proto for each node and sets the value in the
+  # GraphDebugInfo proto. We only store the file name index for each node to
+  # save the storage space.
+  for node_name, trace in node_to_trace.items():
+    trace_def = graph_debug_info_def.traces[node_name]
+    for file_name, line, func, code in trace:
+      file_index = file_to_index[file_name]
+      trace_def.file_line_cols.add(
+          file_index=file_index, line=line, func=func, code=code)
+
+  return graph_debug_info_def
+
+
 def create_meta_graph_def(meta_info_def=None,
                           graph_def=None,
                           saver_def=None,
@@ -881,6 +930,7 @@ def export_scoped_meta_graph(filename=None,
                              saver_def=None,
                              clear_extraneous_savers=False,
                              strip_default_attrs=False,
+                             save_debug_info=False,
                              **kwargs):
   """Returns `MetaGraphDef` proto. Optionally writes it to filename.
 
@@ -910,7 +960,10 @@ def export_scoped_meta_graph(filename=None,
         graph (both Save/Restore ops and SaverDefs) that are not associated
         with the provided SaverDef.
     strip_default_attrs: Set to true if default valued attributes must be
-        removed while exporting the GraphDef.
+      removed while exporting the GraphDef.
+    save_debug_info: If `True`, save the GraphDebugInfo to a separate file,
+      which in the same directory of filename and with `_debug` added before the
+      file extension.
     **kwargs: Optional keyed arguments, including meta_info_def and
         collection_list.
 
@@ -920,8 +973,11 @@ def export_scoped_meta_graph(filename=None,
 
   Raises:
     ValueError: When the `GraphDef` is larger than 2GB.
+    ValueError: When executing in Eager mode and either `graph_def` or `graph`
+      is undefined.
   """
-  if context.executing_eagerly():
+  if context.executing_eagerly() and not (graph_def is not None and
+                                          graph is not None):
     raise ValueError("Exporting/importing meta graphs is not supported when "
                      "Eager Execution is enabled.")
   graph = graph or ops.get_default_graph()
@@ -1005,6 +1061,24 @@ def export_scoped_meta_graph(filename=None,
         os.path.dirname(filename),
         os.path.basename(filename),
         as_text=as_text)
+    if save_debug_info:
+      name, _ = os.path.splitext(filename)
+      debug_filename = "{name}{ext}".format(name=name, ext=".debug")
+
+      # Gets the operation from the graph by the name. Exludes variable nodes,
+      # so only the nodes in the frozen models are included.
+      ops_to_export = []
+      for node in scoped_meta_graph_def.graph_def.node:
+        scoped_op_name = ops.prepend_name_scope(node.name, export_scope)
+        ops_to_export.append(graph.get_operation_by_name(scoped_op_name))
+
+      graph_debug_info = create_graph_debug_info_def(ops_to_export)
+
+      graph_io.write_graph(
+          graph_debug_info,
+          os.path.dirname(debug_filename),
+          os.path.basename(debug_filename),
+          as_text=as_text)
 
   return scoped_meta_graph_def, var_list
 
diff --git a/tensorflow/python/framework/meta_graph_test.py b/tensorflow/python/framework/meta_graph_test.py
index e6e87881649729ca65db8cba9914e29b5a0d064e..3a0f338e23a414862eda0ec0836ee6e4e18dfb32 100644
--- a/tensorflow/python/framework/meta_graph_test.py
+++ b/tensorflow/python/framework/meta_graph_test.py
@@ -707,6 +707,26 @@ class ScopedMetaGraphTest(test.TestCase):
     test_util.assert_meta_graph_protos_equal(self, orig_meta_graph,
                                              new_meta_graph)
 
+  def testExportDebugInfo(self):
+    graph1 = ops.Graph()
+    with graph1.as_default():
+      with ops.name_scope("hidden1/hidden2/hidden3"):
+        images = constant_op.constant(
+            1.0, dtypes.float32, shape=[3, 2], name="images")
+        weights1 = variables.Variable([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]],
+                                      name="weights")
+        biases1 = resource_variable_ops.ResourceVariable(
+            [0.1] * 3, name="biases")
+        nn_ops.relu(math_ops.matmul(images, weights1) + biases1, name="relu")
+    debug_info_def = meta_graph.create_graph_debug_info_def(
+        operations=graph1.get_operations())
+
+    # The unique file names in all the stack traces should be larger or equal
+    # than 1.
+    self.assertTrue(len(debug_info_def.files) >= 1)
+    # All the nodes from the exported graphdef are included.
+    self.assertEqual(len(debug_info_def.traces), len(graph1.get_operations()))
+
   # Verifies that we can export a subgraph in a nested name scope containing a
   # "hidden1/hidden2" and import it into "new_hidden1/new_hidden2" in a new
   # graph.
diff --git a/tensorflow/python/framework/op_def_library.py b/tensorflow/python/framework/op_def_library.py
index 2318b32ef10d67c48950061d2c489f6c7dfb20a0..372763a862b5c416458bdc6ea9ae0f247687c8fc 100644
--- a/tensorflow/python/framework/op_def_library.py
+++ b/tensorflow/python/framework/op_def_library.py
@@ -212,6 +212,22 @@ def _MakeTensor(v, arg_name):
       (repr(v), arg_name))
 
 
+def _MakeFunc(v, arg_name):
+  """Ensure v is a func."""
+  if isinstance(v, attr_value_pb2.NameAttrList):
+    return v
+  fn_attr = attr_value_pb2.NameAttrList()
+  if isinstance(v, compat.bytes_or_text_types):
+    fn_attr.name = v
+  elif hasattr(v, "add_to_graph"):
+    v.add_to_graph(ops.get_default_graph())
+    fn_attr.name = v.name
+  else:
+    raise TypeError("Don't know how to convert {} to a func for "
+                    "argument {}".format(v, arg_name))
+  return fn_attr
+
+
 class _OpInfo(object):
   """All per-Op state we would like to precompute/validate."""
 
@@ -515,9 +531,9 @@ class OpDefLibrary(object):
             else:
               raise TypeError(
                   "Expected %s passed to parameter '%s' of op '%s', got %s of "
-                  "type '%s' instead." %
+                  "type '%s' instead. Error: %s" %
                   (dtypes.as_dtype(dtype).name, input_arg.name, op_type_name,
-                   repr(values), type(values).__name__))
+                   repr(values), type(values).__name__, err))
           except ValueError:
             # What type does convert_to_tensor think it has?
             try:
@@ -733,13 +749,9 @@ class OpDefLibrary(object):
           attr_value.list.tensor.extend(
               [_MakeTensor(x, key) for x in value])
         elif attr_def.type == "func":
-          if isinstance(value, attr_value_pb2.NameAttrList):
-            attr_value.func.CopyFrom(value)
-          elif isinstance(value, compat.bytes_or_text_types):
-            attr_value.func.name = value
-          else:
-            value.add_to_graph(ops.get_default_graph())
-            attr_value.func.name = value.name
+          attr_value.func.CopyFrom(_MakeFunc(value, key))
+        elif attr_def.type == "list(func)":
+          attr_value.list.func.extend([_MakeFunc(x, key) for x in value])
         else:
           raise TypeError("Unrecognized Attr type " + attr_def.type)
 
diff --git a/tensorflow/python/framework/op_def_library_test.py b/tensorflow/python/framework/op_def_library_test.py
index 66cfe213b3cc943de4cd423e8e2ffffbe0b49f8b..71d708dd89ecfbda9d64240b707563cd3fb2a9e9 100644
--- a/tensorflow/python/framework/op_def_library_test.py
+++ b/tensorflow/python/framework/op_def_library_test.py
@@ -24,6 +24,7 @@ from google.protobuf import text_format
 from tensorflow.core.framework import op_def_pb2
 from tensorflow.core.framework import tensor_shape_pb2
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_ops
@@ -140,40 +141,43 @@ class OpDefLibraryTest(test_util.TensorFlowTestCase):
     with ops.Graph().as_default():
       with self.assertRaises(TypeError) as cm:
         self._lib.apply_op("Simple", a="Bad string")
-      self.assertEqual(str(cm.exception),
-                       "Expected int32 passed to parameter 'a' of op 'Simple', "
-                       "got 'Bad string' of type 'str' instead.")
+      self.assertTrue(
+          "Expected int32 passed to parameter 'a' of op 'Simple', "
+          "got 'Bad string' of type 'str' instead." in str(cm.exception))
 
       with self.assertRaises(TypeError) as cm:
         self._lib.apply_op("Simple", a=self.Tensor(dtypes.string))
-      self.assertEqual(str(cm.exception),
-                       "Input 'a' of 'Simple' Op has type string "
-                       "that does not match expected type of int32.")
+      self.assertTrue(
+          "Input 'a' of 'Simple' Op has type string "
+          "that does not match expected type of int32." in str(cm.exception))
 
       with self.assertRaises(TypeError) as cm:
         self._lib.apply_op("Simple", a=6, extra="bogus")
-      self.assertEqual(str(cm.exception),
-                       "apply_op() got unexpected keyword arguments: extra")
+      self.assertTrue(
+          "apply_op() got unexpected keyword arguments: extra"
+          in str(cm.exception))
 
       with self.assertRaises(TypeError) as cm:
         self._lib.apply_op("Simple", a=6, extra1="bogus", extra2="also_bogus")
-      self.assertEqual(str(cm.exception),
-                       "apply_op() got unexpected keyword arguments: extra1, "
-                       "extra2")
+      self.assertTrue(
+          "apply_op() got unexpected keyword arguments: extra1, "
+          "extra2" in str(cm.exception))
 
       with self.assertRaises(TypeError) as cm:
         self._lib.apply_op("Simple")
-      self.assertEqual(str(cm.exception), "No argument for input a")
+      self.assertTrue(
+          "No argument for input a" in str(cm.exception))
 
       with self.assertRaises(TypeError) as cm:
         self._lib.apply_op("Simple", wrong=7)
-      self.assertEqual(str(cm.exception), "No argument for input a")
+      self.assertTrue(
+          "No argument for input a" in str(cm.exception))
 
       with self.assertRaises(TypeError) as cm:
         self._lib.apply_op("Simple", a={"label": 1})
-      self.assertEqual(str(cm.exception),
-                       "Expected int32 passed to parameter 'a' of op 'Simple', "
-                       "got {'label': 1} of type 'dict' instead.")
+      self.assertTrue(
+          "Expected int32 passed to parameter 'a' of op 'Simple', "
+          "got {'label': 1} of type 'dict' instead." in str(cm.exception))
 
   def testReservedInput(self):
     with ops.Graph().as_default():
@@ -268,19 +272,13 @@ class OpDefLibraryTest(test_util.TensorFlowTestCase):
         attr { key: 'T' value { type: DT_STRING } }
         """, out.op.node_def)
 
-      with self.assertRaises(TypeError) as cm:
+      with self.assertRaises(TypeError):
         self._lib.apply_op("Binary", a="left", b=12)
-      self.assertEqual(str(cm.exception),
-                       "Expected string passed to parameter 'b' of op 'Binary',"
-                       " got 12 of type 'int' instead.")
 
-      with self.assertRaises(TypeError) as cm:
+      with self.assertRaises(TypeError):
         self._lib.apply_op("Binary",
                            a=self.Tensor(dtypes.string),
                            b=self.Tensor(dtypes.int32))
-      self.assertEqual(str(cm.exception),
-                       "Input 'b' of 'Binary' Op has type int32 "
-                       "that does not match type string of argument 'a'.")
 
   def testRestrict(self):
     with ops.Graph().as_default():
@@ -466,6 +464,46 @@ class OpDefLibraryTest(test_util.TensorFlowTestCase):
       self.assertEqual(str(cm.exception),
                        "Expected float for argument 'a' not 'bad'.")
 
+  def testAttrFunc(self):
+    with ops.Graph().as_default():
+      @function.Defun(dtypes.float32, func_name="MyFn")
+      def fn(x):
+        return 2 + x
+      op = self._lib.apply_op("FuncAttr", f=fn, name="t")
+      self.assertProtoEquals("""
+        name: 't' op: 'FuncAttr' attr { key: 'f'
+                                        value { func { name: 'MyFn' } } }
+        """, op.node_def)
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("FuncAttr", f=3)
+      self.assertEqual(str(cm.exception),
+                       "Don't know how to convert 3 to a func for argument f")
+
+  def testAttrFuncList(self):
+    with ops.Graph().as_default():
+      @function.Defun(dtypes.float32, func_name="MyFn")
+      def fn1(x):
+        return 2 + x
+      @function.Defun(dtypes.int32, dtypes.float32, func_name="MyFn2")
+      def fn2(x, y):
+        return 2 + x, y * 3
+      @function.Defun(dtypes.int32, func_name="MyFn3")
+      def fn3(y):
+        return 2 + y
+      op = self._lib.apply_op("FuncListAttr", f=[fn1, fn2, fn3], name="t")
+      self.assertProtoEquals("""
+        name: 't' op: 'FuncListAttr'
+        attr { key: 'f' value { list { func { name: 'MyFn' }
+                                       func { name: 'MyFn2' }
+                                       func { name: 'MyFn3' } } } }
+        """, op.node_def)
+
+      with self.assertRaises(TypeError) as cm:
+        self._lib.apply_op("FuncListAttr", f=[fn1, 3, fn2])
+      self.assertEqual(str(cm.exception),
+                       "Don't know how to convert 3 to a func for argument f")
+
   def testAttrBool(self):
     with ops.Graph().as_default():
       op = self._lib.apply_op("AttrBool", a=True, name="t")
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index fa306936d653b233bba3b54d4f9a03ea202684e6..41d6bdeec1ba35068e58eff948667e649e9893a6 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -41,6 +41,7 @@ from tensorflow.python.eager import context
 from tensorflow.python.eager import core
 from tensorflow.python.eager import tape
 from tensorflow.python.framework import c_api_util
+from tensorflow.python.framework import composite_tensor
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -94,14 +95,20 @@ class _UserDeviceSpec(object):
         lineno = -1
       self.display_name = "%s<%s, %d>" % (func_name, fname, lineno)
 
+    self.raw_string = None
+
     self.function = self._device_name_or_function
     if not (self._device_name_or_function is None or
             callable(self._device_name_or_function)):
+      self.raw_string = self._device_name_or_function
       self.function = pydev.merge_device(self._device_name_or_function)
 
 
 class NullContextmanager(object):
 
+  def __init__(self, *args, **kwargs):
+    pass
+
   def __enter__(self):
     pass
 
@@ -987,7 +994,8 @@ register_dense_tensor_like_type(Tensor)
 
 
 @tf_export(v1=["convert_to_tensor"])
-def convert_to_tensor(value, dtype=None, name=None, preferred_dtype=None):
+def convert_to_tensor(value, dtype=None, name=None, preferred_dtype=None,
+                      dtype_hint=None):
   """Converts the given `value` to a `Tensor`.
 
   This function converts Python objects of various types to `Tensor`
@@ -1027,15 +1035,18 @@ def convert_to_tensor(value, dtype=None, name=None, preferred_dtype=None):
       dtype in mind when converting to a tensor, so preferred_dtype
       can be used as a soft preference.  If the conversion to
       `preferred_dtype` is not possible, this argument has no effect.
+    dtype_hint: same meaning as preferred_dtype, and overrides it.
 
   Returns:
-    An `Tensor` based on `value`.
+    A `Tensor` based on `value`.
 
   Raises:
     TypeError: If no conversion function is registered for `value` to `dtype`.
     RuntimeError: If a registered conversion function returns an invalid value.
     ValueError: If the `value` is a tensor not of given `dtype` in graph mode.
   """
+  preferred_dtype = deprecation.deprecated_argument_lookup(
+      "dtype_hint", dtype_hint, "preferred_dtype", preferred_dtype)
   return convert_to_tensor_v2(value, dtype, preferred_dtype, name)
 
 
@@ -1082,7 +1093,7 @@ def convert_to_tensor_v2(value, dtype=None, dtype_hint=None, name=None):
     name: Optional name to use if a new `Tensor` is created.
 
   Returns:
-    An `Tensor` based on `value`.
+    A `Tensor` based on `value`.
 
   Raises:
     TypeError: If no conversion function is registered for `value` to `dtype`.
@@ -1225,7 +1236,7 @@ def internal_convert_n_to_tensor(values,
       value.
   """
   if not isinstance(values, collections.Sequence):
-    raise TypeError("values must be a list.")
+    raise TypeError("values must be a sequence.")
   ret = []
   if ctx is None: ctx = context.context()
   for i, value in enumerate(values):
@@ -1289,7 +1300,7 @@ def convert_to_tensor_or_indexed_slices(value, dtype=None, name=None):
     name: (Optional.) A name to use if a new `Tensor` is created.
 
   Returns:
-    An `Tensor`, `IndexedSlices`, or `SparseTensor` based on `value`.
+    A `Tensor`, `IndexedSlices`, or `SparseTensor` based on `value`.
 
   Raises:
     ValueError: If `dtype` does not match the element type of `value`.
@@ -1302,7 +1313,7 @@ def internal_convert_to_tensor_or_indexed_slices(value,
                                                  dtype=None,
                                                  name=None,
                                                  as_ref=False):
-  """Converts the given object to an `Tensor` or an `IndexedSlices`.
+  """Converts the given object to a `Tensor` or an `IndexedSlices`.
 
   If `value` is an `IndexedSlices` or `SparseTensor` it is returned
   unmodified. Otherwise, it is converted to a `Tensor` using
@@ -1317,7 +1328,7 @@ def internal_convert_to_tensor_or_indexed_slices(value,
     as_ref: True if the caller wants the results as ref tensors.
 
   Returns:
-    An `Tensor`, `IndexedSlices`, or `SparseTensor` based on `value`.
+    A `Tensor`, `IndexedSlices`, or `SparseTensor` based on `value`.
 
   Raises:
     ValueError: If `dtype` does not match the element type of `value`.
@@ -1348,7 +1359,7 @@ def internal_convert_n_to_tensor_or_indexed_slices(values,
   Args:
     values: A list of `None`, `IndexedSlices`, `SparseTensor`, or objects that
       can be consumed by `convert_to_tensor()`.
-    dtype: (Optional.) The required `DType` of the returned `Tensor`
+    dtype: (Optional.) The required `DType` of the returned `Tensor` or
       `IndexedSlices`.
     name: (Optional.) A name prefix to used when a new `Tensor` is
       created, in which case element `i` will be given the name `name
@@ -1356,7 +1367,7 @@ def internal_convert_n_to_tensor_or_indexed_slices(values,
     as_ref: True if the caller wants the results as ref tensors.
 
   Returns:
-    A list of `Tensor`, `IndexedSlices`, and/or `SparseTensor` objects.
+    A list of `Tensor`, `IndexedSlices`, `SparseTensor` and/or `None` objects.
 
   Raises:
     TypeError: If no conversion function is registered for an element in
@@ -1365,7 +1376,7 @@ def internal_convert_n_to_tensor_or_indexed_slices(values,
       value.
   """
   if not isinstance(values, collections.Sequence):
-    raise TypeError("values must be a list.")
+    raise TypeError("values must be a sequence.")
   ret = []
   for i, value in enumerate(values):
     if value is None:
@@ -1406,6 +1417,132 @@ def convert_n_to_tensor_or_indexed_slices(values, dtype=None, name=None):
       values=values, dtype=dtype, name=name, as_ref=False)
 
 
+def convert_to_tensor_or_composite(value, dtype=None, name=None):
+  """Converts the given object to a `Tensor` or `CompositeTensor`.
+
+  If `value` is a `CompositeTensor` it is returned unmodified. Otherwise, it
+  is converted to a `Tensor` using `convert_to_tensor()`.
+
+  Args:
+    value: A `CompositeTensor` or an object that can be consumed
+      by `convert_to_tensor()`.
+    dtype: (Optional.) The required `DType` of the returned `Tensor` or
+      `CompositeTensor`.
+    name: (Optional.) A name to use if a new `Tensor` is created.
+
+  Returns:
+    A `Tensor` or `CompositeTensor`, based on `value`.
+
+  Raises:
+    ValueError: If `dtype` does not match the element type of `value`.
+  """
+  return internal_convert_to_tensor_or_composite(
+      value=value, dtype=dtype, name=name, as_ref=False)
+
+
+def internal_convert_to_tensor_or_composite(value,
+                                            dtype=None,
+                                            name=None,
+                                            as_ref=False):
+  """Converts the given object to a `Tensor` or `CompositeTensor`.
+
+  If `value` is a `CompositeTensor` it is returned unmodified.  Otherwise, it
+  is converted to a `Tensor` using `convert_to_tensor()`.
+
+  Args:
+    value: A `CompositeTensor`, or an object that can be consumed
+      by `convert_to_tensor()`.
+    dtype: (Optional.) The required `DType` of the returned `Tensor` or
+      `CompositeTensor`.
+    name: (Optional.) A name to use if a new `Tensor` is created.
+    as_ref: True if the caller wants the results as ref tensors.
+
+  Returns:
+    A `Tensor` or `CompositeTensor`, based on `value`.
+
+  Raises:
+    ValueError: If `dtype` does not match the element type of `value`.
+  """
+  if isinstance(value, composite_tensor.CompositeTensor):
+    value_dtype = getattr(value, "dtype", None)
+    if dtype and not dtypes.as_dtype(dtype).is_compatible_with(value_dtype):
+      raise ValueError(
+          "Tensor conversion requested dtype %s for Tensor with dtype %s: %r" %
+          (dtypes.as_dtype(dtype).name, value.dtype.name, str(value)))
+    return value
+  else:
+    return internal_convert_to_tensor(
+        value, dtype=dtype, name=name, as_ref=as_ref)
+
+
+def internal_convert_n_to_tensor_or_composite(values,
+                                              dtype=None,
+                                              name=None,
+                                              as_ref=False):
+  """Converts `values` to a list of `Tensor` or `CompositeTensor` objects.
+
+  Any `CompositeTensor` objects in `values` are returned unmodified.
+
+  Args:
+    values: A list of `None`, `CompositeTensor`, or objects that
+      can be consumed by `convert_to_tensor()`.
+    dtype: (Optional.) The required `DType` of the returned `Tensor`s or
+      `CompositeTensor`s.
+    name: (Optional.) A name prefix to used when a new `Tensor` is
+      created, in which case element `i` will be given the name `name
+      + '_' + i`.
+    as_ref: True if the caller wants the results as ref tensors.
+
+  Returns:
+    A list of `Tensor`, `CompositeTensor`, and/or `None` objects.
+
+  Raises:
+    TypeError: If no conversion function is registered for an element in
+      `values`.
+    RuntimeError: If a registered conversion function returns an invalid
+      value.
+  """
+  if not isinstance(values, collections.Sequence):
+    raise TypeError("values must be a sequence.")
+  ret = []
+  for i, value in enumerate(values):
+    if value is None:
+      ret.append(value)
+    else:
+      n = None if name is None else "%s_%d" % (name, i)
+      ret.append(
+          internal_convert_to_tensor_or_composite(
+              value, dtype=dtype, name=n, as_ref=as_ref))
+  return ret
+
+
+def convert_n_to_tensor_or_composite(values, dtype=None, name=None):
+  """Converts `values` to a list of `Output` or `CompositeTensor` objects.
+
+  Any `CompositeTensor` objects in `values` are returned unmodified.
+
+  Args:
+    values: A list of `None`, `CompositeTensor``, or objects that
+      can be consumed by `convert_to_tensor()`.
+    dtype: (Optional.) The required `DType` of the returned `Tensor`s or
+      `CompositeTensor`s.
+    name: (Optional.) A name prefix to used when a new `Tensor` is
+      created, in which case element `i` will be given the name `name
+      + '_' + i`.
+
+  Returns:
+    A list of `Tensor` and/or `CompositeTensor` objects.
+
+  Raises:
+    TypeError: If no conversion function is registered for an element in
+      `values`.
+    RuntimeError: If a registered conversion function returns an invalid
+      value.
+  """
+  return internal_convert_n_to_tensor_or_composite(
+      values=values, dtype=dtype, name=name, as_ref=False)
+
+
 # TODO(josh11b): Add ctx argument to conversion_func() signature.
 @tf_export("register_tensor_conversion_function")
 def register_tensor_conversion_function(base_type,
@@ -1485,7 +1622,7 @@ def register_tensor_conversion_function(base_type,
 
 
 @tf_export("IndexedSlices")
-class IndexedSlices(_TensorLike):
+class IndexedSlices(_TensorLike, composite_tensor.CompositeTensor):
   """A sparse representation of a set of tensor slices at given indices.
 
   This class is a simple wrapper for a pair of `Tensor` objects:
@@ -1568,6 +1705,29 @@ class IndexedSlices(_TensorLike):
   def __neg__(self):
     return IndexedSlices(-self.values, self.indices, self.dense_shape)
 
+  def _to_components(self):
+    if self._dense_shape is None:
+      return (self._values, self._indices)
+    else:
+      return (self._values, self._indices, self._dense_shape)
+
+  @classmethod
+  def _from_components(cls, components):
+    return cls(*components)
+
+  def _shape_invariant_to_components(self, shape=None):
+    if shape is None:
+      shape = self._values.shape
+    if self._dense_shape is None:
+      return [shape, shape[:1]]  # values, indices
+    else:
+      # values, indices, dense_shape
+      return [shape, shape[:1], tensor_shape.TensorShape([shape.ndims])]
+
+  @property
+  def _is_graph_tensor(self):
+    return hasattr(self._values, 'graph')
+
 
 IndexedSlicesValue = collections.namedtuple(
     "IndexedSlicesValue", ["values", "indices", "dense_shape"])
@@ -2891,11 +3051,11 @@ class Graph(object):
     # being called inside function definitions behave as if they were seeing the
     # actual outside graph).
     self._graph_key = "grap-key-%d/" % (uid(),)
-    # A string with the last reduction method passed to
-    # losses.compute_weighted_loss(), or None.
-    self._last_loss_reduction = None
     self._container = ""
     self._registered_ops = op_def_registry.get_registered_ops()
+    # Set to True if this graph is being built in an
+    # AutomaticControlDependencies context.
+    self._add_control_dependencies = False
 
     # TODO(skyewm): fold as much of the above as possible into the C
     # implementation
@@ -2910,11 +3070,27 @@ class Graph(object):
   # Note: this method is private because the API of tf.Graph() is public and
   # frozen, and this functionality is still not ready for public visibility.
   @tf_contextlib.contextmanager
-  def _variable_creator_scope(self, creator):
+  def _variable_creator_scope(self, creator, priority=100):
+    """Scope which defines a variable creation function.
+
+    Args:
+      creator: A callable taking `next_creator` and `kwargs`. See the
+        `tf.variable_creator_scope` docstring.
+      priority: Creators with a higher `priority` are called first. Within the
+        same priority, creators are called inner-to-outer.
+
+    Yields:
+      `_variable_creator_scope` is a context manager with a side effect, but
+      doesn't return a value.
+    """
     # This step makes a copy of the existing stack, and it also initializes
     # self._thread_local._variable_creator_stack if it doesn't exist yet.
     old = list(self._variable_creator_stack)
-    self._thread_local._variable_creator_stack.append(creator)  # pylint: disable=protected-access
+    stack = self._thread_local._variable_creator_stack  # pylint: disable=protected-access
+    stack.append((priority, creator))
+    # Sorting is stable, so we'll put higher-priority creators later in the list
+    # but otherwise maintain registration order.
+    stack.sort(key=lambda item: item[0])
     try:
       yield
     finally:
@@ -4520,7 +4696,11 @@ class Graph(object):
     control_ops = []
     current = self._current_control_dependencies()
     for c in control_inputs:
-      if isinstance(c, IndexedSlices):
+      # The hasattr(handle) is designed to match ResourceVariables. This is so
+      # control dependencies on a variable or on an unread variable don't
+      # trigger reads.
+      if (isinstance(c, IndexedSlices) or
+          (hasattr(c, "_handle") and hasattr(c, "op"))):
         c = c.op
       c = self.as_graph_element(c)
       if isinstance(c, Tensor):
@@ -4868,6 +5048,48 @@ class Graph(object):
     self._thread_local._distribution_strategy_stack = (  # pylint: disable=protected-access
         _distribution_strategy_stack)
 
+  @property
+  def _auto_cast_variable_read_dtype(self):
+    """The dtype that instances of `AutoCastVariable` will be casted to.
+
+    This is None if `AutoCastVariables` should not be casted.
+
+    See `AutoCastVariable` for more information.
+
+    Returns:
+      The dtype that instances of `AutoCastVariable` will be casted to.
+    """
+    if not hasattr(self._thread_local, "_auto_cast_variable_read_dtype"):
+      self._thread_local._auto_cast_variable_read_dtype = None  # pylint: disable=protected-access
+    return self._thread_local._auto_cast_variable_read_dtype  # pylint: disable=protected-access
+
+  @_auto_cast_variable_read_dtype.setter
+  def _auto_cast_variable_read_dtype(self, _auto_cast_variable_read_dtype):
+    self._thread_local._auto_cast_variable_read_dtype = (  # pylint: disable=protected-access
+        _auto_cast_variable_read_dtype)
+
+  @tf_contextlib.contextmanager
+  def _enable_auto_casting_variables(self, dtype):
+    """Context manager to automatically cast AutoCastVariables.
+
+    If an AutoCastVariable `var` is used under this context manager, it will be
+    casted to `dtype` before being used.
+
+    See `AutoCastVariable` for more information.
+
+    Args:
+      dtype: The dtype that AutoCastVariables should be casted to.
+
+    Yields:
+      Nothing.
+    """
+    prev_read_dtype = self._auto_cast_variable_read_dtype
+    try:
+      self._auto_cast_variable_read_dtype = dtype
+      yield
+    finally:
+      self._auto_cast_variable_read_dtype = prev_read_dtype
+
   def _mutation_lock(self):
     """Returns a lock to guard code that creates & mutates ops.
 
@@ -4985,12 +5207,19 @@ def _colocate_with_for_gradient(op, gradient_uid, ignore_existing=False):
         op, gradient_uid=gradient_uid, ignore_existing=ignore_existing)
 
 
+# Internal interface to colocate_with. colocate_with has been deprecated from
+# public API. There are still a few internal uses of colocate_with. Add internal
+# only API for those uses to avoid deprecation warning.
+def colocate_with(op, ignore_existing=False):
+  return _colocate_with_for_gradient(op, None, ignore_existing=ignore_existing)
+
+
 @deprecation.deprecated(
     date=None,
     instructions="Colocations handled automatically by placer.")
 @tf_export(v1=["colocate_with"])
-def colocate_with(op, ignore_existing=False):
-  return _colocate_with_for_gradient(op, None, ignore_existing=ignore_existing)
+def _colocate_with(op, ignore_existing=False):
+  return colocate_with(op, ignore_existing)
 
 
 @tf_export("control_dependencies")
@@ -5243,7 +5472,8 @@ class _DefaultGraphStack(_DefaultStack):  # pylint: disable=protected-access
   @tf_contextlib.contextmanager
   def get_controller(self, default):
     context.context().context_switches.push(
-        default.building_function, default.as_default)
+        default.building_function, default.as_default,
+        default._device_function_stack)
     try:
       with super(_DefaultGraphStack, self).get_controller(
           default) as g, context.graph_mode():
@@ -5323,7 +5553,7 @@ def init_scope():
       # Names that end with trailing slashes are treated by `name_scope` as
       # absolute.
       scope = scope + "/"
-    inner_device_stack = default_graph._device_function_stack  # pylint: disable=protected-access
+    innermost_nonempty_device_stack = default_graph._device_function_stack  # pylint: disable=protected-access
 
     outer_context = None
     if not _default_graph_stack.stack:
@@ -5336,6 +5566,8 @@ def init_scope():
     else:
       # Find a context that is not building a function.
       for stack_entry in reversed(context.context().context_switches.stack):
+        if not innermost_nonempty_device_stack:
+          innermost_nonempty_device_stack = stack_entry.device_stack
         if not stack_entry.is_building_function:
           outer_context = stack_entry.enter_context_fn
           break
@@ -5357,6 +5589,8 @@ def init_scope():
     try:
       with outer_context(), name_scope(scope), control_dependencies(
           None), tape.stop_recording():
+        context_manager = NullContextmanager
+        context_manager_input = None
         if not context.executing_eagerly():
           # The device stack is preserved when lifting into a graph. Eager
           # execution doesn't implement device stacks and in particular it
@@ -5364,8 +5598,22 @@ def init_scope():
           # to do the same when lifting into the eager context.
           outer_graph = get_default_graph()
           outer_device_stack = outer_graph._device_function_stack  # pylint: disable=protected-access
-          outer_graph._device_function_stack = inner_device_stack  # pylint: disable=protected-access
-        yield
+          outer_graph._device_function_stack = innermost_nonempty_device_stack  # pylint: disable=protected-access
+        elif innermost_nonempty_device_stack is not None:
+          for device_spec in innermost_nonempty_device_stack.peek_objs():
+            if device_spec.function is None:
+              break
+            if device_spec.raw_string:
+              context_manager = context.device
+              context_manager_input = device_spec.raw_string
+              break
+            # It is currently not possible to have a device function in V2,
+            # but in V1 we are unable to apply device functions in eager mode.
+            # This means that we will silently skip some of the entries on the
+            # device stack in V1 + eager mode.
+
+        with context_manager(context_manager_input):
+          yield
     finally:
       # If an exception is raised here it may be hiding a related exception in
       # try-block (just above).
@@ -5467,6 +5715,9 @@ def disable_eager_execution():
   projects from TensorFlow 1.x to 2.x.
   """
   context.default_execution_mode = context.GRAPH_MODE
+  c = context.context_safe()
+  if c is not None:
+    c._eager_context.is_eager = False  # pylint: disable=protected-access
 
 
 def enable_eager_execution_internal(config=None,
@@ -5985,7 +6236,7 @@ name_scope_cache = {}
 # Named like a function for backwards compatibility with the
 # @tf_contextlib.contextmanager version, which was switched to a class to avoid
 # some object creation overhead.
-@tf_export("name_scope", "keras.backend.name_scope")
+@tf_export("name_scope")
 class name_scope(object):  # pylint: disable=invalid-name
   """A context manager for use when defining a Python op.
 
@@ -6019,7 +6270,15 @@ class name_scope(object):  # pylint: disable=invalid-name
       name: The name argument that is passed to the op function.
       default_name: The default name to use if the `name` argument is `None`.
       values: The list of `Tensor` arguments that are passed to the op function.
+
+    Raises:
+      TypeError: if `default_name` is passed in but not a string.
     """
+    if not (default_name is None or isinstance(default_name, six.string_types)):
+      raise TypeError(
+          "`default_name` type (%s) is not a string type. You likely meant to "
+          "pass this into the `values` kwarg."
+          % type(default_name))
     self._name = default_name if name is None else name
     self._default_name = default_name
     self._values = values
diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py
index 0fcbcd6ee4dd1f103c599dc4db26432b61879e83..7d9799a1a7e28c3317ddca1ce3ffada51517b508 100644
--- a/tensorflow/python/framework/ops_test.py
+++ b/tensorflow/python/framework/ops_test.py
@@ -615,6 +615,9 @@ class OperationTest(test_util.TensorFlowTestCase):
       self.assertEqual(while_op.type, "While")
       orig_num_inputs = len(while_op.inputs)
 
+      # Make sure we can handle the while op having a control input.
+      while_op._add_control_input(constant_op.constant(0).op)
+
       new_input1 = constant_op.constant(1.0)
       new_input2 = constant_op.constant(True)
 
@@ -1584,6 +1587,8 @@ class CollectionTest(test_util.TensorFlowTestCase):
     self.assertSequenceEqual(g.collections, ["key"])
     g.add_to_collection("other", "foo")
     self.assertSequenceEqual(sorted(g.collections), ["key", "other"])
+    self.assertSequenceEqual(
+        sorted(g.get_all_collection_keys()), ["key", "other"])
 
   def test_add_to_collection(self):
     g = ops.Graph()
@@ -2049,6 +2054,9 @@ class OpScopeTest(test_util.TensorFlowTestCase):
     with ops.name_scope(None, default_scope_name, [a, b]) as scope:
       self.assertEqual("%s/" % default_scope_name, scope)
       self.assertEqual(g0, ops.get_default_graph())
+    with self.assertRaises(TypeError):
+      with ops.name_scope(scope_name, [a, b]):
+        pass
 
   def _testGraphElements(self, graph_elements):
     scope_name = "my_scope"
@@ -2147,13 +2155,19 @@ class InitScopeTest(test_util.TensorFlowTestCase):
     with g0.as_default(), ops.device("CPU:0"):
       g1 = ops.Graph()
       g1._building_function = True  # pylint: disable=protected-access
-      with g1.as_default(), ops.device("GPU:0"):
+      with g1.as_default():
+        with ops.device("GPU:0"):
+          with ops.init_scope():
+            # init_scope should preserve device set under `g1`.
+            on_gpu = constant_op.constant(1.0)
+            self.assertEqual(on_gpu.device, "/device:GPU:0")
+          still_on_gpu = constant_op.constant(1.0)
+          self.assertEqual(still_on_gpu.device, "/device:GPU:0")
+        blank = constant_op.constant(1.0)
+        self.assertEqual(blank.device, "")
         with ops.init_scope():
-          # init_scope should preserve device set under `g1`.
-          on_gpu = constant_op.constant(1.0)
-          self.assertEqual(on_gpu.device, "/device:GPU:0")
-        still_on_gpu = constant_op.constant(1.0)
-        self.assertEqual(still_on_gpu.device, "/device:GPU:0")
+          now_on_cpu = constant_op.constant(1.0)
+          self.assertEqual(now_on_cpu.device, "/device:CPU:0")
       on_cpu = constant_op.constant(1.0)
       self.assertEqual(on_cpu.device, "/device:CPU:0")
 
@@ -2342,7 +2356,7 @@ class InitScopeTest(test_util.TensorFlowTestCase):
           math_ops.add(c, c)
         c2 = constant_op.constant(2.0)
       with self.assertRaisesRegexp(
-          TypeError, "contains objects other than 'EagerTensor'"):
+          TypeError, "Graph tensors"):
         math_ops.add(c2, c2)
 
   def testPreservesNameScopeInEagerExecution(self):
@@ -2402,17 +2416,22 @@ class GraphTest(test_util.TensorFlowTestCase):
 
   def testDefaultGraph(self):
     orig = ops.get_default_graph()
+    self.assertFalse(ops.has_default_graph())
     self._AssertDefault(orig)
     g0 = ops.Graph()
+    self.assertFalse(ops.has_default_graph())
     self._AssertDefault(orig)
     context_manager_0 = g0.as_default()
+    self.assertFalse(ops.has_default_graph())
     self._AssertDefault(orig)
     with context_manager_0 as g0:
       self._AssertDefault(g0)
       with ops.Graph().as_default() as g1:
+        self.assertTrue(ops.has_default_graph())
         self._AssertDefault(g1)
       self._AssertDefault(g0)
     self._AssertDefault(orig)
+    self.assertFalse(ops.has_default_graph())
 
   def testPreventFeeding(self):
     g = ops.Graph()
diff --git a/tensorflow/python/framework/python_op_gen.cc b/tensorflow/python/framework/python_op_gen.cc
index d460168631c3032bb91894c9997b2de29bf026e6..c27f0140144287cb715e92715343d1b1e69b009a 100644
--- a/tensorflow/python/framework/python_op_gen.cc
+++ b/tensorflow/python/framework/python_op_gen.cc
@@ -144,6 +144,8 @@ class GenEagerPythonOp : public python_op_gen_internal::GenPythonOp {
                        const string& num_outputs_expr);
   void AddDispatch(const string& prefix);
 
+  void AddRawOpExport();
+
   void AddAttrForArg(const string& attr, int arg_index) {
     gtl::InsertIfNotPresent(&inferred_attrs_, attr,
                             op_def_.input_arg(arg_index).name());
@@ -545,7 +547,7 @@ bool GenEagerPythonOp::GetEagerFunctionSetup(const string& indentation,
       strings::StrAppend(function_setup, indentation, attr_api_name,
                          " = [_execute.make_tensor(_t, \"", attr_api_name,
                          "\") for _t in ", attr_api_name, "]\n");
-    } else if (attr_type != "func") {
+    } else if (attr_type != "func" && attr_type != "list(func)") {
       *function_setup =
           strings::StrCat("# No definition for ", function_name_,
                           " since we don't support attrs with type\n"
@@ -649,7 +651,7 @@ bool GenEagerPythonOp::AddEagerFastPathAndGraphCode(
   strings::StrAppend(&result_, "  \"\"\"\n");
 
   strings::StrAppend(&result_,
-                     "  _ctx = _context._context\n"
+                     "  _ctx = _context._context or _context.context()\n"
                      "  if _ctx is not None and _ctx._eager_context.is_eager:",
                      "\n");
   if (eager_not_allowed_error.empty()) {
@@ -668,6 +670,7 @@ bool GenEagerPythonOp::AddEagerFastPathAndGraphCode(
   AddEagerFunctionTeardown("  ", output_sizes,
                            true /* execute_record_gradient */);
 
+  AddRawOpExport();
   strings::StrAppend(&result_, "\n\n");
   return true;
 }
@@ -921,6 +924,68 @@ void GenEagerPythonOp::AddDispatch(const string& prefix) {
   strings::StrAppend(&result_, prefix, "  raise\n");
 }
 
+void GenEagerPythonOp::AddRawOpExport() {
+  // Create function for python op.
+  string raw_parameters;
+  string function_call_parameters;
+  string inputs;
+  string attrs;
+
+  std::map<string, string> renames;
+
+  for (const auto& param_names : param_names_) {
+    renames.insert({param_names.GetName(), param_names.GetRenameTo()});
+  }
+
+  for (const auto& input_arg : op_def_.input_arg()) {
+    const string input_arg_name =
+        python_op_gen_internal::AvoidPythonReserved(input_arg.name());
+    if (!raw_parameters.empty()) strings::StrAppend(&raw_parameters, ", ");
+    strings::StrAppend(&raw_parameters, input_arg_name);
+
+    if (!inputs.empty()) strings::StrAppend(&inputs, ", ");
+    strings::StrAppend(&inputs, input_arg_name);
+
+    if (!function_call_parameters.empty()) {
+      strings::StrAppend(&function_call_parameters, ", ");
+    }
+    strings::StrAppend(&function_call_parameters, renames[input_arg.name()],
+                       "=", input_arg_name);
+  }
+  for (const auto& attr : op_def_.attr()) {
+    if (inferred_attrs_.find(attr.name()) != inferred_attrs_.end()) continue;
+
+    const string attr_name =
+        python_op_gen_internal::AvoidPythonReserved(attr.name());
+
+    if (!raw_parameters.empty()) strings::StrAppend(&raw_parameters, ", ");
+    strings::StrAppend(&raw_parameters, attr_name);
+
+    if (!attrs.empty()) strings::StrAppend(&attrs, ", ");
+    strings::StrAppend(&attrs, "\"", attr_name, "\", ", attr_name);
+
+    if (!function_call_parameters.empty()) {
+      strings::StrAppend(&function_call_parameters, ", ");
+    }
+    strings::StrAppend(&function_call_parameters, renames[attr.name()], "=",
+                       attr_name);
+  }
+
+  const string raw_function_name =
+      python_op_gen_internal::AvoidPythonReserved(op_def_.name());
+
+  strings::StrAppend(&result_,
+                     "@_doc_controls.do_not_generate_docs\n@_kwarg_only\ndef ",
+                     raw_function_name, "(", raw_parameters, "):\n");
+
+  // Function body.
+  strings::StrAppend(&result_, "  return ", function_name_, "(",
+                     function_call_parameters, ")\n");
+
+  strings::StrAppend(&result_, "tf_export(\"raw_ops.", raw_function_name,
+                     "\")(", raw_function_name, ")\n");
+}
+
 string GetPythonOps(const OpList& ops, const ApiDefMap& api_defs,
                     const std::vector<string>& hidden_ops, bool require_shapes,
                     const string& source_file_name = "") {
@@ -962,6 +1027,8 @@ from tensorflow.python.framework import op_def_library as _op_def_library
 from tensorflow.python.util.deprecation import deprecated_endpoints
 from tensorflow.python.util import dispatch as _dispatch
 from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import kwarg_only as _kwarg_only
+from tensorflow.tools.docs import doc_controls as _doc_controls
 
 )");
 
diff --git a/tensorflow/python/framework/registry.py b/tensorflow/python/framework/registry.py
index 4357c76bd6cc8ccac55b5e123fa0ce7cf3c0d19d..53c68b046192818da31ece0c3e9181986e671829 100644
--- a/tensorflow/python/framework/registry.py
+++ b/tensorflow/python/framework/registry.py
@@ -64,8 +64,12 @@ class Registry(object):
     # stack trace is [this_function, Register(), user_function,...]
     # so the user function is #2.
     stack = tf_stack.extract_stack()
-    user_function = stack[2]
-    location_tag = tf_stack.convert_stack([user_function])[0]
+    stack_index = min(2, len(stack)-1)
+    if stack_index >= 0:
+      user_function = stack[stack_index]
+      location_tag = tf_stack.convert_stack([user_function])[0]
+    else:
+      location_tag = "UNKNOWN"
     self._registry[name] = {_TYPE_TAG: candidate, _LOCATION_TAG: location_tag}
 
   def list(self):
diff --git a/tensorflow/python/framework/registry_test.py b/tensorflow/python/framework/registry_test.py
index 1a0d3f200d9427363ae36c19b6214ac6c9b75bec..5adf12fdacf5fa1e8ea096e3d6494824f26d282e 100644
--- a/tensorflow/python/framework/registry_test.py
+++ b/tensorflow/python/framework/registry_test.py
@@ -19,28 +19,33 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
+
 from tensorflow.python.framework import registry
 from tensorflow.python.platform import test
 
 
-class RegistryTest(test.TestCase):
+def bar():
+  pass
+
+
+class RegistryTest(test.TestCase, parameterized.TestCase):
 
   class Foo(object):
     pass
 
-  def testRegisterClass(self):
-    myreg = registry.Registry('testfoo')
+  # Test the registry basics on both classes (Foo) and functions (bar).
+  @parameterized.parameters([Foo, bar])
+  def testRegistryBasics(self, candidate):
+    myreg = registry.Registry('testRegistry')
     with self.assertRaises(LookupError):
-      myreg.lookup('Foo')
-    myreg.register(RegistryTest.Foo, 'Foo')
-    assert myreg.lookup('Foo') == RegistryTest.Foo
-
-  def testRegisterFunction(self):
-    myreg = registry.Registry('testbar')
-    with self.assertRaises(LookupError):
-      myreg.lookup('Bar')
-    myreg.register(bar, 'Bar')
-    assert myreg.lookup('Bar') == bar
+      myreg.lookup('testKey')
+    myreg.register(candidate)
+    self.assertEqual(myreg.lookup(candidate.__name__), candidate)
+    myreg.register(candidate, 'testKey')
+    self.assertEqual(myreg.lookup('testKey'), candidate)
+    self.assertEqual(
+        sorted(myreg.list()), sorted(['testKey', candidate.__name__]))
 
   def testDuplicate(self):
     myreg = registry.Registry('testbar')
@@ -51,9 +56,5 @@ class RegistryTest(test.TestCase):
       myreg.register(bar, 'Bar')
 
 
-def bar():
-  pass
-
-
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/framework/sparse_tensor.py b/tensorflow/python/framework/sparse_tensor.py
index 5e1a95a26be034bff0a1f5eb996ac6f16c61e282..c69fa41677bdc451d2de63a583bbea8b03fc0178 100644
--- a/tensorflow/python/framework/sparse_tensor.py
+++ b/tensorflow/python/framework/sparse_tensor.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Classes and functions used to construct graphs."""
+"""Sparse tensors."""
 # pylint: disable=g-bad-name
 from __future__ import absolute_import
 from __future__ import division
@@ -21,8 +21,10 @@ from __future__ import print_function
 import collections
 
 from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.framework import composite_tensor
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.util.tf_export import tf_export
 
@@ -34,7 +36,7 @@ _override_helper = ops._override_helper
 
 
 @tf_export("sparse.SparseTensor", "SparseTensor")
-class SparseTensor(_TensorLike):
+class SparseTensor(_TensorLike, composite_tensor.CompositeTensor):
   """Represents a sparse tensor.
 
   TensorFlow represents a sparse tensor as three separate dense tensors:
@@ -113,16 +115,12 @@ class SparseTensor(_TensorLike):
       dense_shape: A 1-D int64 tensor of shape `[ndims]`.
 
     """
-    with ops.name_scope(None, "SparseTensor",
-                        [indices, values, dense_shape]):
+    with ops.name_scope(None, "SparseTensor", [indices, values, dense_shape]):
       indices = ops.convert_to_tensor(
           indices, name="indices", dtype=dtypes.int64)
-      # Always pass as_ref=True because we want to be able to update
-      # values later if it is a VariableOp.
       # TODO(touts): Consider adding mutable_values() when 'values'
       # is a VariableOp and updating users of SparseTensor.
-      values = ops.internal_convert_to_tensor(
-          values, name="values", as_ref=True)
+      values = ops.internal_convert_to_tensor(values, name="values")
       dense_shape = ops.convert_to_tensor(
           dense_shape, name="dense_shape", dtype=dtypes.int64)
     self._indices = indices
@@ -241,6 +239,30 @@ class SparseTensor(_TensorLike):
   def _override_operator(operator, func):
     _override_helper(SparseTensor, operator, func)
 
+  def _to_components(self):
+    return (self._indices, self._values, self._dense_shape)
+
+  @classmethod
+  def _from_components(cls, components):
+    return cls(*components)
+
+  def _shape_invariant_to_components(self, shape=None):
+    if shape is None:
+      shape = self.dense_shape.shape
+    if shape.ndims is None:
+      shape = tensor_shape.TensorShape([None])
+    if shape.ndims != 1:
+      raise ValueError("Shape invariant for SparseTensor must have the form "
+                       "TensorShape([r]), got %r" % shape)
+    rank = tensor_shape.dimension_value(shape[0])
+    return [tensor_shape.TensorShape([None, rank]),  # indices
+            tensor_shape.TensorShape([None]),  # values
+            tensor_shape.TensorShape([rank])]  # dense_shape
+
+  @property
+  def _is_graph_tensor(self):
+    return hasattr(self._values, 'graph')
+
 
 SparseTensorValue = collections.namedtuple(
     "SparseTensorValue", ["indices", "values", "dense_shape"])
diff --git a/tensorflow/python/framework/tensor_shape.py b/tensorflow/python/framework/tensor_shape.py
index 960a3dad7389553955c999e444a9f98c1857f588..0dc3dde4f6e95dbe4156a29d03f465e95cb4a5f6 100644
--- a/tensorflow/python/framework/tensor_shape.py
+++ b/tensorflow/python/framework/tensor_shape.py
@@ -74,9 +74,8 @@ def enable_v2_tensorshape():
   # in `tensor_shape[i]`, but they would not be.
   ```
   """
-  global _TENSORSHAPE_V2_OVERRIDE, TensorShape  # pylint: disable=invalid-name
+  global _TENSORSHAPE_V2_OVERRIDE  # pylint: disable=invalid-name
   _TENSORSHAPE_V2_OVERRIDE = True
-  TensorShape = TensorShapeV2
 
 
 @tf_export(v1=["disable_v2_tensorshape"])
@@ -85,12 +84,12 @@ def disable_v2_tensorshape():
 
   See docstring for `enable_v2_tensorshape` for details about the new behavior.
   """
-  global _TENSORSHAPE_V2_OVERRIDE, TensorShape  # pylint: disable=invalid-name
+  global _TENSORSHAPE_V2_OVERRIDE  # pylint: disable=invalid-name
   _TENSORSHAPE_V2_OVERRIDE = False
-  TensorShape = TensorShapeV1
 
 
-@tf_export(v1=["dimension_value"])
+@tf_export("compat.dimension_value",
+           v1=["dimension_value", "compat.dimension_value"])
 def dimension_value(dimension):
   """Compatibility utility required to allow for both V1 and V2 behavior in TF.
 
@@ -122,7 +121,8 @@ def dimension_value(dimension):
   return dimension
 
 
-@tf_export(v1=["dimension_at_index"])
+@tf_export("compat.dimension_at_index",
+           v1=["dimension_at_index", "compat.dimension_at_index"])
 def dimension_at_index(shape, index):
   """Compatibility utility required to allow for both V1 and V2 behavior in TF.
 
@@ -269,10 +269,11 @@ class Dimension(object):
     Dimensions are combined as follows:
 
     ```python
-    tf.Dimension(n)   .merge_with(tf.Dimension(n))    == tf.Dimension(n)
-    tf.Dimension(n)   .merge_with(tf.Dimension(None)) == tf.Dimension(n)
-    tf.Dimension(None).merge_with(tf.Dimension(n))    == tf.Dimension(n)
-    tf.Dimension(None).merge_with(tf.Dimension(None)) == tf.Dimension(None)
+    tf.Dimension(n)   .merge_with(tf.Dimension(n))     == tf.Dimension(n)
+    tf.Dimension(n)   .merge_with(tf.Dimension(None))  == tf.Dimension(n)
+    tf.Dimension(None).merge_with(tf.Dimension(n))     == tf.Dimension(n)
+    # equivalent to tf.Dimension(None)
+    tf.Dimension(None).merge_with(tf.Dimension(None))
 
     # raises ValueError for n != m
     tf.Dimension(n)   .merge_with(tf.Dimension(m))
@@ -302,10 +303,10 @@ class Dimension(object):
     Dimensions are summed as follows:
 
     ```python
-    tf.Dimension(m)    + tf.Dimension(n)    == tf.Dimension(m + n)
-    tf.Dimension(m)    + tf.Dimension(None) == tf.Dimension(None)
-    tf.Dimension(None) + tf.Dimension(n)    == tf.Dimension(None)
-    tf.Dimension(None) + tf.Dimension(None) == tf.Dimension(None)
+    tf.Dimension(m)    + tf.Dimension(n)     == tf.Dimension(m + n)
+    tf.Dimension(m)    + tf.Dimension(None)  # equiv. to tf.Dimension(None)
+    tf.Dimension(None) + tf.Dimension(n)     # equiv. to tf.Dimension(None)
+    tf.Dimension(None) + tf.Dimension(None)  # equiv. to tf.Dimension(None)
     ```
 
     Args:
@@ -337,10 +338,10 @@ class Dimension(object):
     Dimensions are subtracted as follows:
 
     ```python
-    tf.Dimension(m)    - tf.Dimension(n)    == tf.Dimension(m - n)
-    tf.Dimension(m)    - tf.Dimension(None) == tf.Dimension(None)
-    tf.Dimension(None) - tf.Dimension(n)    == tf.Dimension(None)
-    tf.Dimension(None) - tf.Dimension(None) == tf.Dimension(None)
+    tf.Dimension(m)    - tf.Dimension(n)     == tf.Dimension(m - n)
+    tf.Dimension(m)    - tf.Dimension(None)  # equiv. to tf.Dimension(None)
+    tf.Dimension(None) - tf.Dimension(n)     # equiv. to tf.Dimension(None)
+    tf.Dimension(None) - tf.Dimension(None)  # equiv. to tf.Dimension(None)
     ```
 
     Args:
@@ -376,10 +377,10 @@ class Dimension(object):
     Dimensions are summed as follows:
 
     ```python
-    tf.Dimension(m)    * tf.Dimension(n)    == tf.Dimension(m * n)
-    tf.Dimension(m)    * tf.Dimension(None) == tf.Dimension(None)
-    tf.Dimension(None) * tf.Dimension(n)    == tf.Dimension(None)
-    tf.Dimension(None) * tf.Dimension(None) == tf.Dimension(None)
+    tf.Dimension(m)    * tf.Dimension(n)     == tf.Dimension(m * n)
+    tf.Dimension(m)    * tf.Dimension(None)  # equiv. to tf.Dimension(None)
+    tf.Dimension(None) * tf.Dimension(n)     # equiv. to tf.Dimension(None)
+    tf.Dimension(None) * tf.Dimension(None)  # equiv. to tf.Dimension(None)
     ```
 
     Args:
@@ -415,10 +416,10 @@ class Dimension(object):
     Dimensions are divided as follows:
 
     ```python
-    tf.Dimension(m)    // tf.Dimension(n)    == tf.Dimension(m // n)
-    tf.Dimension(m)    // tf.Dimension(None) == tf.Dimension(None)
-    tf.Dimension(None) // tf.Dimension(n)    == tf.Dimension(None)
-    tf.Dimension(None) // tf.Dimension(None) == tf.Dimension(None)
+    tf.Dimension(m)    // tf.Dimension(n)     == tf.Dimension(m // n)
+    tf.Dimension(m)    // tf.Dimension(None)  # equiv. to tf.Dimension(None)
+    tf.Dimension(None) // tf.Dimension(n)     # equiv. to tf.Dimension(None)
+    tf.Dimension(None) // tf.Dimension(None)  # equiv. to tf.Dimension(None)
     ```
 
     Args:
@@ -467,16 +468,64 @@ class Dimension(object):
     """
     return self // other
 
+  def __rdiv__(self, other):
+    """Use `__floordiv__` via `x // y` instead.
+
+    This function exists only to have a better error message. Instead of:
+    `TypeError: unsupported operand type(s) for /: 'int' and 'Dimension'`,
+    this function will explicitly call for usage of `//` instead.
+
+    Args:
+      other: Another `Dimension`.
+
+    Raises:
+      TypeError.
+    """
+    raise TypeError("unsupported operand type(s) for /: '{}' and 'Dimension', "
+                    "please use // instead".format(type(other).__name__))
+
+  def __truediv__(self, other):
+    """Use `__floordiv__` via `x // y` instead.
+
+    This function exists only to have a better error message. Instead of:
+    `TypeError: unsupported operand type(s) for /: 'Dimension' and 'int'`,
+    this function will explicitly call for usage of `//` instead.
+
+    Args:
+      other: Another `Dimension`.
+
+    Raises:
+      TypeError.
+    """
+    raise TypeError("unsupported operand type(s) for /: 'Dimension' and '{}', "
+                    "please use // instead".format(type(other).__name__))
+
+  def __rtruediv__(self, other):
+    """Use `__floordiv__` via `x // y` instead.
+
+    This function exists only to have a better error message. Instead of:
+    `TypeError: unsupported operand type(s) for /: 'int' and 'Dimension'`,
+    this function will explicitly call for usage of `//` instead.
+
+    Args:
+      other: Another `Dimension`.
+
+    Raises:
+      TypeError.
+    """
+    raise TypeError("unsupported operand type(s) for /: '{}' and 'Dimension', "
+                    "please use // instead".format(type(other).__name__))
+
   def __mod__(self, other):
     """Returns `self` modulo `other`.
 
     Dimension moduli are computed as follows:
 
     ```python
-    tf.Dimension(m)    % tf.Dimension(n)    == tf.Dimension(m % n)
-    tf.Dimension(m)    % tf.Dimension(None) == tf.Dimension(None)
-    tf.Dimension(None) % tf.Dimension(n)    == tf.Dimension(None)
-    tf.Dimension(None) % tf.Dimension(None) == tf.Dimension(None)
+    tf.Dimension(m)    % tf.Dimension(n)     == tf.Dimension(m % n)
+    tf.Dimension(m)    % tf.Dimension(None)  # equiv. to tf.Dimension(None)
+    tf.Dimension(None) % tf.Dimension(n)     # equiv. to tf.Dimension(None)
+    tf.Dimension(None) % tf.Dimension(None)  # equiv. to tf.Dimension(None)
     ```
 
     Args:
@@ -632,8 +681,8 @@ def as_dimension(value):
     return Dimension(value)
 
 
-@tf_export(v1=["TensorShape"])
-class TensorShapeV1(object):
+@tf_export("TensorShape")
+class TensorShape(object):
   """Represents the shape of a `Tensor`.
 
   A `TensorShape` represents a possibly-partial shape specification for a
@@ -692,7 +741,7 @@ class TensorShapeV1(object):
   @property
   def _v2_behavior(self):
     if _TENSORSHAPE_V2_OVERRIDE is None:
-      return False
+      return tf2.enabled()
     return _TENSORSHAPE_V2_OVERRIDE
 
   def __repr__(self):
@@ -1148,22 +1197,6 @@ def unknown_shape(rank=None, **kwargs):
     return TensorShape([Dimension(None)] * rank)
 
 
-@tf_export("TensorShape", v1=[])
-class TensorShapeV2(TensorShapeV1):
-
-  @property
-  def _v2_behavior(self):
-    if _TENSORSHAPE_V2_OVERRIDE is None:
-      return True
-    return _TENSORSHAPE_V2_OVERRIDE
-
-
-if tf2.enabled():
-  TensorShape = TensorShapeV2
-else:
-  TensorShape = TensorShapeV1
-
-
 def scalar():
   """Returns a shape representing a scalar."""
   return TensorShape([])
diff --git a/tensorflow/python/framework/tensor_shape_div_test.py b/tensorflow/python/framework/tensor_shape_div_test.py
index 8e63d7f54705bb5c8384315f068598a86c047599..5160c75e5272d9326a35a0813809387605cca1ea 100644
--- a/tensorflow/python/framework/tensor_shape_div_test.py
+++ b/tensorflow/python/framework/tensor_shape_div_test.py
@@ -35,6 +35,16 @@ class DimensionDivTest(test_util.TensorFlowTestCase):
         for y in values:
           self.assertEqual((x / y).value, (x // y).value)
 
+  def testRDivFail(self):
+    # Note: This test is related to GitHub issue 25790.
+    """Without from __future__ import division, __rdiv__ is used."""
+    if six.PY2:  # Old division exists only in Python 2
+      two = tensor_shape.Dimension(2)
+      message = (r"unsupported operand type\(s\) for /: "
+                 r"'int' and 'Dimension', please use // instead")
+      with self.assertRaisesRegexp(TypeError, message):
+        _ = 6 / two
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/framework/tensor_shape_test.py b/tensorflow/python/framework/tensor_shape_test.py
index 7d85e0a99e662512b29e4134091658190a3bc500..b4a37c05a83a578343114dd2e2f604af37e72fce 100644
--- a/tensorflow/python/framework/tensor_shape_test.py
+++ b/tensorflow/python/framework/tensor_shape_test.py
@@ -205,6 +205,23 @@ class DimensionTest(test_util.TensorFlowTestCase):
     reconstructed = ctor(*args)
     self.assertEquals(reconstructed, dim)
 
+  def testDiv(self):
+    # Note: This test is related to GitHub issue 25790.
+    six = tensor_shape.Dimension(6)
+    two = tensor_shape.Dimension(2)
+    message = (r"unsupported operand type\(s\) for /: "
+               r"'Dimension' and 'Dimension', please use // instead")
+    with self.assertRaisesRegexp(TypeError, message):
+      _ = six / two
+    message = (r"unsupported operand type\(s\) for /: "
+               r"'Dimension' and 'int', please use // instead")
+    with self.assertRaisesRegexp(TypeError, message):
+      _ = six / 2
+    message = (r"unsupported operand type\(s\) for /: "
+               r"'int' and 'Dimension', please use // instead")
+    with self.assertRaisesRegexp(TypeError, message):
+      _ = 6 / two
+
 
 class ShapeTest(test_util.TensorFlowTestCase):
 
diff --git a/tensorflow/python/framework/tensor_spec.py b/tensorflow/python/framework/tensor_spec.py
index c44636edc4ec5101c588766714c98a7da15793e4..2e847c7a35b946c4153304014d15f5ee96760142 100644
--- a/tensorflow/python/framework/tensor_spec.py
+++ b/tensorflow/python/framework/tensor_spec.py
@@ -108,7 +108,9 @@ class TensorSpec(object):
     return hash((self._shape_tuple, self.dtype))
 
   def __eq__(self, other):
-    return self.shape == other.shape and self.dtype == other.dtype
+    return (self._shape_tuple == other._shape_tuple  # pylint: disable=protected-access
+            and self.dtype == other.dtype
+            and self._name == other._name)  # pylint: disable=protected-access
 
   def __ne__(self, other):
     return not self == other
diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py
index f98f301b38a946146df3051db9b8d26c8b816b33..af943f09ab84b0032d06e071f5d2fa5652027c33 100644
--- a/tensorflow/python/framework/tensor_util.py
+++ b/tensorflow/python/framework/tensor_util.py
@@ -22,6 +22,8 @@ import six
 
 from tensorflow.core.framework import tensor_pb2
 from tensorflow.core.framework import tensor_shape_pb2
+from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.framework import composite_tensor
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.util import compat
@@ -42,7 +44,7 @@ from tensorflow.python.util.tf_export import tf_export
 
 
 def ExtractBitsFromFloat16(x):
-  return np.asscalar(np.asarray(x, dtype=np.float16).view(np.uint16))
+  return np.asarray(x, dtype=np.float16).view(np.uint16).item()
 
 
 def SlowAppendFloat16ArrayToTensorProto(tensor_proto, proto_values):
@@ -58,8 +60,8 @@ def _MediumAppendFloat16ArrayToTensorProto(tensor_proto, proto_values):
 
 
 def ExtractBitsFromBFloat16(x):
-  return np.asscalar(
-      np.asarray(x, dtype=dtypes.bfloat16.as_numpy_dtype).view(np.uint16))
+  return np.asarray(
+      x, dtype=dtypes.bfloat16.as_numpy_dtype).view(np.uint16).item()
 
 
 def SlowAppendBFloat16ArrayToTensorProto(tensor_proto, proto_values):
@@ -122,39 +124,39 @@ if _FAST_TENSOR_UTIL_AVAILABLE:
 else:
 
   def SlowAppendFloat32ArrayToTensorProto(tensor_proto, proto_values):
-    tensor_proto.float_val.extend([np.asscalar(x) for x in proto_values])
+    tensor_proto.float_val.extend([x.item() for x in proto_values])
 
   def SlowAppendFloat64ArrayToTensorProto(tensor_proto, proto_values):
-    tensor_proto.double_val.extend([np.asscalar(x) for x in proto_values])
+    tensor_proto.double_val.extend([x.item() for x in proto_values])
 
   def SlowAppendIntArrayToTensorProto(tensor_proto, proto_values):
-    tensor_proto.int_val.extend([np.asscalar(x) for x in proto_values])
+    tensor_proto.int_val.extend([x.item() for x in proto_values])
 
   def SlowAppendInt64ArrayToTensorProto(tensor_proto, proto_values):
-    tensor_proto.int64_val.extend([np.asscalar(x) for x in proto_values])
+    tensor_proto.int64_val.extend([x.item() for x in proto_values])
 
   def SlowAppendQIntArrayToTensorProto(tensor_proto, proto_values):
-    tensor_proto.int_val.extend([np.asscalar(x[0]) for x in proto_values])
+    tensor_proto.int_val.extend([x.item(0) for x in proto_values])
 
   def SlowAppendUInt32ArrayToTensorProto(tensor_proto, proto_values):
-    tensor_proto.uint32_val.extend([np.asscalar(x) for x in proto_values])
+    tensor_proto.uint32_val.extend([x.item() for x in proto_values])
 
   def SlowAppendUInt64ArrayToTensorProto(tensor_proto, proto_values):
-    tensor_proto.uint64_val.extend([np.asscalar(x) for x in proto_values])
+    tensor_proto.uint64_val.extend([x.item() for x in proto_values])
 
   def SlowAppendComplex64ArrayToTensorProto(tensor_proto, proto_values):
     tensor_proto.scomplex_val.extend(
-        [np.asscalar(v) for x in proto_values for v in [x.real, x.imag]])
+        [v.item() for x in proto_values for v in [x.real, x.imag]])
 
   def SlowAppendComplex128ArrayToTensorProto(tensor_proto, proto_values):
     tensor_proto.dcomplex_val.extend(
-        [np.asscalar(v) for x in proto_values for v in [x.real, x.imag]])
+        [v.item() for x in proto_values for v in [x.real, x.imag]])
 
   def SlowAppendObjectArrayToTensorProto(tensor_proto, proto_values):
     tensor_proto.string_val.extend([compat.as_bytes(x) for x in proto_values])
 
   def SlowAppendBoolArrayToTensorProto(tensor_proto, proto_values):
-    tensor_proto.bool_val.extend([np.asscalar(x) for x in proto_values])
+    tensor_proto.bool_val.extend([x.item() for x in proto_values])
 
   _NP_TO_APPEND_FN = {
       dtypes.bfloat16.as_numpy_dtype: SlowAppendBFloat16ArrayToTensorProto,
@@ -598,88 +600,53 @@ def MakeNdarray(tensor):
   dtype = tensor_dtype.as_numpy_dtype
 
   if tensor.tensor_content:
-    return (np.frombuffer(tensor.tensor_content, dtype=dtype).copy()
-            .reshape(shape))
-  elif tensor_dtype == dtypes.float16 or tensor_dtype == dtypes.bfloat16:
+    return (np.frombuffer(tensor.tensor_content,
+                          dtype=dtype).copy().reshape(shape))
+
+  if tensor_dtype == dtypes.string:
+    # np.pad throws on these arrays of type np.object.
+    values = list(tensor.string_val)
+    padding = num_elements - len(values)
+    if padding > 0:
+      last = values[-1] if values else ""
+      values.extend([last] * padding)
+    return np.array(values, dtype=dtype).reshape(shape)
+
+  if tensor_dtype == dtypes.float16 or tensor_dtype == dtypes.bfloat16:
     # the half_val field of the TensorProto stores the binary representation
     # of the fp16: we need to reinterpret this as a proper float16
-    if len(tensor.half_val) == 1:
-      tmp = np.array(tensor.half_val[0], dtype=np.uint16)
-      tmp.dtype = tensor_dtype.as_numpy_dtype
-      return np.repeat(tmp, num_elements).reshape(shape)
-    else:
-      tmp = np.fromiter(tensor.half_val, dtype=np.uint16)
-      tmp.dtype = tensor_dtype.as_numpy_dtype
-      return tmp.reshape(shape)
+    values = np.fromiter(tensor.half_val, dtype=np.uint16)
+    values.dtype = tensor_dtype.as_numpy_dtype
   elif tensor_dtype == dtypes.float32:
-    if len(tensor.float_val) == 1:
-      return np.repeat(
-          np.array(tensor.float_val[0], dtype=dtype),
-          num_elements).reshape(shape)
-    else:
-      return np.fromiter(tensor.float_val, dtype=dtype).reshape(shape)
+    values = np.fromiter(tensor.float_val, dtype=dtype)
   elif tensor_dtype == dtypes.float64:
-    if len(tensor.double_val) == 1:
-      return np.repeat(
-          np.array(tensor.double_val[0], dtype=dtype),
-          num_elements).reshape(shape)
-    else:
-      return np.fromiter(tensor.double_val, dtype=dtype).reshape(shape)
+    values = np.fromiter(tensor.double_val, dtype=dtype)
   elif tensor_dtype in [
       dtypes.int32, dtypes.uint8, dtypes.uint16, dtypes.int16, dtypes.int8,
       dtypes.qint32, dtypes.quint8, dtypes.qint8, dtypes.qint16, dtypes.quint16
   ]:
-    if len(tensor.int_val) == 1:
-      return np.repeat(np.array(tensor.int_val[0], dtype=dtype),
-                       num_elements).reshape(shape)
-    else:
-      return np.fromiter(tensor.int_val, dtype=dtype).reshape(shape)
+    values = np.fromiter(tensor.int_val, dtype=dtype)
   elif tensor_dtype == dtypes.int64:
-    if len(tensor.int64_val) == 1:
-      return np.repeat(
-          np.array(tensor.int64_val[0], dtype=dtype),
-          num_elements).reshape(shape)
-    else:
-      return np.fromiter(tensor.int64_val, dtype=dtype).reshape(shape)
-  elif tensor_dtype == dtypes.string:
-    if len(tensor.string_val) == 1:
-      return np.repeat(
-          np.array(tensor.string_val[0], dtype=dtype),
-          num_elements).reshape(shape)
-    else:
-      return np.array(
-          [x for x in tensor.string_val], dtype=dtype).reshape(shape)
+    values = np.fromiter(tensor.int64_val, dtype=dtype)
   elif tensor_dtype == dtypes.complex64:
     it = iter(tensor.scomplex_val)
-    if len(tensor.scomplex_val) == 2:
-      return np.repeat(
-          np.array(
-              complex(tensor.scomplex_val[0], tensor.scomplex_val[1]),
-              dtype=dtype), num_elements).reshape(shape)
-    else:
-      return np.array(
-          [complex(x[0], x[1]) for x in zip(it, it)],
-          dtype=dtype).reshape(shape)
+    values = np.array([complex(x[0], x[1]) for x in zip(it, it)], dtype=dtype)
   elif tensor_dtype == dtypes.complex128:
     it = iter(tensor.dcomplex_val)
-    if len(tensor.dcomplex_val) == 2:
-      return np.repeat(
-          np.array(
-              complex(tensor.dcomplex_val[0], tensor.dcomplex_val[1]),
-              dtype=dtype), num_elements).reshape(shape)
-    else:
-      return np.array(
-          [complex(x[0], x[1]) for x in zip(it, it)],
-          dtype=dtype).reshape(shape)
+    values = np.array([complex(x[0], x[1]) for x in zip(it, it)], dtype=dtype)
   elif tensor_dtype == dtypes.bool:
-    if len(tensor.bool_val) == 1:
-      return np.repeat(np.array(tensor.bool_val[0], dtype=dtype),
-                       num_elements).reshape(shape)
-    else:
-      return np.fromiter(tensor.bool_val, dtype=dtype).reshape(shape)
+    values = np.fromiter(tensor.bool_val, dtype=dtype)
   else:
     raise TypeError("Unsupported tensor type: %s" % tensor.dtype)
 
+  if values.size == 0:
+    return np.zeros(shape, dtype)
+
+  if values.size != num_elements:
+    values = np.pad(values, (0, num_elements - values.size), "edge")
+
+  return values.reshape(shape)
+
 
 def ShapeEquals(tensor_proto, shape):
   """Returns True if "tensor_proto" has the given "shape".
@@ -708,7 +675,7 @@ def ShapeEquals(tensor_proto, shape):
 def _ConstantValue(tensor, partial):
   # TODO(touts): Support Variables?
   if not isinstance(tensor, ops.Tensor):
-    raise TypeError("tensor is not a Tensor")
+    raise TypeError("%r is not a Tensor, has type %s" % (tensor, type(tensor)))
   if tensor.op.type == "Const":
     return MakeNdarray(tensor.op.get_attr("value"))
   elif tensor.op.type == "Shape":
@@ -816,19 +783,16 @@ def _ConstantValue(tensor, partial):
     return None
 
 
+@tf_export('get_static_value')
 def constant_value(tensor, partial=False):  # pylint: disable=invalid-name
   """Returns the constant value of the given tensor, if efficiently calculable.
 
   This function attempts to partially evaluate the given tensor, and
   returns its value as a numpy ndarray if this succeeds.
 
-  TODO(mrry): Consider whether this function should use a registration
-  mechanism like gradients and ShapeFunctions, so that it is easily
-  extensible.
-
-  NOTE: If `constant_value(tensor)` returns a non-`None` result, it will no
-  longer be possible to feed a different value for `tensor`. This allows the
-  result of this function to influence the graph that is constructed, and
+  Compatibility(V1): If `constant_value(tensor)` returns a non-`None` result, it
+  will no longer be possible to feed a different value for `tensor`. This allows
+  the result of this function to influence the graph that is constructed, and
   permits static shape optimizations.
 
   Args:
@@ -845,6 +809,8 @@ def constant_value(tensor, partial=False):  # pylint: disable=invalid-name
   """
   if isinstance(tensor, ops.EagerTensor):
     return tensor.numpy()
+  if not pywrap_tensorflow.IsTensor(tensor):
+    return tensor
   ret = _ConstantValue(tensor, partial)
   if ret is not None:
     # The caller may now depend on the constant value of `tensor`, so we
@@ -970,13 +936,15 @@ def constant_value_as_shape(tensor):  # pylint: disable=invalid-name
   return ret
 
 
+@tf_export("is_tensor")
 def is_tensor(x):  # pylint: disable=invalid-name
   """Check whether `x` is of tensor type.
 
-  Check whether an object is a tensor. This check is equivalent to calling
-  `isinstance(x, (tf.Tensor, tf.SparseTensor, tf.Variable))` and also checks
-  if all the component variables of a MirroredVariable or a ReplicaLocalVariable
-  are tensors.
+  Check whether an object is a tensor or a composite tensor. This check is
+  equivalent to calling
+  `isinstance(x, (tf.Tensor, tf.SparseTensor, tf.RaggedTensor, tf.Variable))`
+  and also checks if all the component variables of a MirroredVariable or a
+  ReplicaLocalVariable are tensors.
 
   Args:
     x: A python object to check.
@@ -985,4 +953,5 @@ def is_tensor(x):  # pylint: disable=invalid-name
     `True` if `x` is a tensor, `False` if not.
   """
   return (isinstance(x, ops._TensorLike) or ops.is_dense_tensor_like(x) or  # pylint: disable=protected-access
+          isinstance(x, composite_tensor.CompositeTensor) or
           (hasattr(x, "is_tensor_like") and x.is_tensor_like))
diff --git a/tensorflow/python/framework/tensor_util_test.py b/tensorflow/python/framework/tensor_util_test.py
index 00337546186d3a01313a49d11dd266e6dade3227..e73df390357107b155dd6bdfb7c3bedc713303a9 100644
--- a/tensorflow/python/framework/tensor_util_test.py
+++ b/tensorflow/python/framework/tensor_util_test.py
@@ -336,23 +336,16 @@ class TensorUtilTest(test.TestCase):
       self.assertAllClose(np.array([10, 20, 30], dtype=nptype), a)
 
   def testIntTypesWithImplicitRepeat(self):
-    for dtype, nptype in [(dtypes.int64, np.int64),
-                          (dtypes.int32, np.int32),
-                          (dtypes.uint8, np.uint8),
-                          (dtypes.uint16, np.uint16),
-                          (dtypes.int16, np.int16),
-                          (dtypes.int8, np.int8)]:
+    for dtype, nptype in [(dtypes.int64, np.int64), (dtypes.int32, np.int32),
+                          (dtypes.uint8, np.uint8), (dtypes.uint16, np.uint16),
+                          (dtypes.int16, np.int16), (dtypes.int8, np.int8)]:
       self.assertAllEqual(
-          np.array(
-              [[10, 10, 10, 10],
-               [10, 10, 10, 10],
-               [10, 10, 10, 10]],
-              dtype=nptype),
+          np.array([[10, 11, 12, 12], [12, 12, 12, 12], [12, 12, 12, 12]],
+                   dtype=nptype),
           tensor_util.MakeNdarray(
-              tensor_util.make_tensor_proto(
-                  [10],
-                  shape=[3, 4],
-                  dtype=dtype)))
+              tensor_util.make_tensor_proto([10, 11, 12],
+                                            shape=[3, 4],
+                                            dtype=dtype)))
 
   def testIntMixedWithDimension(self):
     # Github issue: 11974
@@ -500,9 +493,12 @@ class TensorUtilTest(test.TestCase):
     self.assertEquals([b"foo"], a)
 
   def testStringWithImplicitRepeat(self):
-    t = tensor_util.make_tensor_proto("f", shape=[3, 4])
+    t = tensor_util.make_tensor_proto(["f", "g"], shape=[3, 4])
     a = tensor_util.MakeNdarray(t)
-    self.assertAllEqual(np.array([[b"f"] * 4] * 3, dtype=np.object), a)
+    self.assertAllEqual(
+        np.array([[b"f", b"g", b"g", b"g"], [b"g", b"g", b"g", b"g"],
+                  [b"g", b"g", b"g", b"g"]],
+                 dtype=np.object), a)
 
   def testStringN(self):
     t = tensor_util.make_tensor_proto([b"foo", b"bar", b"baz"], shape=[1, 3])
@@ -777,6 +773,16 @@ class TensorUtilTest(test.TestCase):
       self.assertAllClose(np.array([10, 20, 30], dtype=np.int64), a)
 
 
+class IsTensorTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testConstantTensor(self):
+    np_val = np.random.rand(3).astype(np.int32)
+    tf_val = constant_op.constant(np_val)
+    self.assertFalse(tensor_util.is_tensor(np_val))
+    self.assertTrue(tensor_util.is_tensor(tf_val))
+
+
 class ConstantValueTest(test.TestCase):
 
   def testConstant(self):
diff --git a/tensorflow/python/framework/test_ops.cc b/tensorflow/python/framework/test_ops.cc
index 99e184a8acd44012774917c4baaecd48bae6cbe3..5d1386c26d73816772936bac9fe57c575a399066 100644
--- a/tensorflow/python/framework/test_ops.cc
+++ b/tensorflow/python/framework/test_ops.cc
@@ -157,7 +157,7 @@ REGISTER_KERNEL_BUILDER(Name("Old").Device(DEVICE_CPU), OldOp);
 // Stubbed-out resource to test resource handle ops.
 class StubResource : public ResourceBase {
  public:
-  string DebugString() override { return ""; }
+  string DebugString() const override { return ""; }
 };
 
 REGISTER_RESOURCE_HANDLE_KERNEL(StubResource);
@@ -406,6 +406,10 @@ REGISTER_OP("FuncAttr")
     .Attr("f: func")
     .SetShapeFn(shape_inference::UnknownShape);
 
+REGISTER_OP("FuncListAttr")
+    .Attr("f: list(func)")
+    .SetShapeFn(shape_inference::UnknownShape);
+
 REGISTER_OP("Simple")
     .Input("a: int32")
     .Output("out: float")
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index df3cebd2e0c2f37711dc41cf60409c2660bf3e2c..1d267830ff9bfca395f63e49a53bf5b97a71887b 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -67,16 +67,14 @@ from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import versions
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import script_ops
-from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import server_lib
 from tensorflow.python.util import compat
 from tensorflow.python.util import deprecation
-from tensorflow.python.util import memory
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
@@ -84,6 +82,19 @@ from tensorflow.python.util.protobuf import compare
 from tensorflow.python.util.tf_export import tf_export
 
 
+# If the above import is made available through the BUILD rule, then this
+# function is overridden and will instead return True and cause Tensorflow
+# graphs to be compiled with XLA.
+def is_xla_enabled():
+  return False
+
+
+try:
+  from tensorflow.python.framework.is_xla_test_true import is_xla_enabled  # pylint: disable=g-import-not-at-top
+except:
+  pass
+
+
 @tf_export("test.gpu_device_name")
 def gpu_device_name():
   """Returns the name of a GPU device if available or the empty string."""
@@ -99,6 +110,7 @@ def assert_ops_in_graph(expected_ops, graph):
   Args:
     expected_ops: `dict<string, string>` of op name to op type.
     graph: Graph to check.
+
   Returns:
     `dict<string, node>` of node name to node.
 
@@ -120,7 +132,7 @@ def assert_ops_in_graph(expected_ops, graph):
 
 
 @tf_export("test.assert_equal_graph_def", v1=[])
-def assert_equal_graph_def_v2(actual, expected):
+def assert_equal_graph_def_v2(expected, actual):
   """Asserts that two `GraphDef`s are (mostly) the same.
 
   Compares two `GraphDef` protos for equality, ignoring versions and ordering of
@@ -129,8 +141,8 @@ def assert_equal_graph_def_v2(actual, expected):
   ignores randomized attribute values that may appear in V2 checkpoints.
 
   Args:
-    actual: The `GraphDef` we have.
     expected: The `GraphDef` we expected.
+    actual: The `GraphDef` we have.
 
   Raises:
     AssertionError: If the `GraphDef`s do not match.
@@ -151,7 +163,7 @@ def assert_equal_graph_def_v1(actual, expected, checkpoint_v2=False):
     actual: The `GraphDef` we have.
     expected: The `GraphDef` we expected.
     checkpoint_v2: boolean determining whether to ignore randomized attribute
-        values that appear in V2 checkpoints.
+      values that appear in V2 checkpoints.
 
   Raises:
     AssertionError: If the `GraphDef`s do not match.
@@ -362,7 +374,8 @@ def skip_if(condition):
 
   Args:
     condition: Either an expression that can be used in "if not condition"
-               statement, or a callable whose result should be a boolean.
+      statement, or a callable whose result should be a boolean.
+
   Returns:
     The wrapped function
   """
@@ -375,7 +388,7 @@ def skip_if(condition):
       else:
         skip = condition
       if not skip:
-        fn(*args, **kwargs)
+        return fn(*args, **kwargs)
 
     return wrapper
 
@@ -409,42 +422,12 @@ def enable_control_flow_v2(fn):
   """
 
   def wrapper(*args, **kwargs):
-    enable_cond_v2_old = control_flow_ops.ENABLE_COND_V2
-    enable_while_v2_old = control_flow_ops.ENABLE_WHILE_V2
-    enable_tensor_array_v2_old = tensor_array_ops.ENABLE_TENSOR_ARRAY_V2
-    control_flow_ops.ENABLE_COND_V2 = True
-    control_flow_ops.ENABLE_WHILE_V2 = True
-    tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 = True
-    try:
-      fn(*args, **kwargs)
-    finally:
-      control_flow_ops.ENABLE_COND_V2 = enable_cond_v2_old
-      control_flow_ops.ENABLE_WHILE_V2 = enable_while_v2_old
-      tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 = enable_tensor_array_v2_old
-
-  return wrapper
-
-
-def enable_tensor_array_v2(fn):
-  """Decorator for enabling _GraphTensorArrayV2 on a test.
-
-  Note this enables _GraphTensorArrayV2 after running the test class's
-  setup/teardown methods.
-
-  Args:
-    fn: the function to be wrapped
-
-  Returns:
-    The wrapped function
-  """
-
-  def wrapper(*args, **kwargs):
-    enable_tensor_array_v2_old = tensor_array_ops.ENABLE_TENSOR_ARRAY_V2
-    tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 = True
+    enable_control_flow_v2_old = control_flow_util.ENABLE_CONTROL_FLOW_V2
+    control_flow_util.ENABLE_CONTROL_FLOW_V2 = True
     try:
-      fn(*args, **kwargs)
+      return fn(*args, **kwargs)
     finally:
-      tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 = enable_tensor_array_v2_old
+      control_flow_util.ENABLE_CONTROL_FLOW_V2 = enable_control_flow_v2_old
 
   return wrapper
 
@@ -493,7 +476,7 @@ def with_control_flow_v2(cls):
   Returns:
     cls with new test methods added
   """
-  if control_flow_ops.ENABLE_WHILE_V2 and control_flow_ops.ENABLE_COND_V2:
+  if control_flow_util.ENABLE_CONTROL_FLOW_V2:
     return cls
 
   for name, value in cls.__dict__.copy().items():
@@ -515,9 +498,11 @@ def disable_control_flow_v2(unused_msg):
   Returns:
     The wrapped function with _disable_control_flow_v2 attr set to True.
   """
+
   def wrapper(func):
     func._disable_control_flow_v2 = True
     return func
+
   return wrapper
 
 
@@ -600,6 +585,7 @@ def assert_no_new_tensors(f):
 
   Args:
     f: The test case to run.
+
   Returns:
     The decorated test case.
   """
@@ -626,9 +612,9 @@ def assert_no_new_tensors(f):
       ops.get_default_graph()._graph_key = outside_graph_key
       if outside_executed_eagerly:
         with context.eager_mode():
-          f(self, **kwargs)
+          result = f(self, **kwargs)
       else:
-        f(self, **kwargs)
+        result = f(self, **kwargs)
     # Make an effort to clear caches, which would otherwise look like leaked
     # Tensors.
     context.context()._clear_caches()  # pylint: disable=protected-access
@@ -642,6 +628,7 @@ def assert_no_new_tensors(f):
           len(tensors_after),
           str(tensors_after),
       )))
+    return result
 
   return decorator
 
@@ -758,6 +745,7 @@ def assert_no_garbage_created(f):
 
   Args:
     f: The function to decorate.
+
   Returns:
     The decorated function.
   """
@@ -766,14 +754,14 @@ def assert_no_garbage_created(f):
     """Sets DEBUG_SAVEALL, runs the test, and checks for new garbage."""
     # Force-load `distribution_strategy_context` to prevent GC at
     # test time when using eager. Remove once b/117329403 is resolved.
-    tape.distribution_strategy_context.get_distribution_strategy()
+    tape.distribution_strategy_context.get_strategy()
 
     gc.disable()
     previous_debug_flags = gc.get_debug()
     gc.set_debug(gc.DEBUG_SAVEALL)
     gc.collect()
     previous_garbage = len(gc.garbage)
-    f(self, **kwargs)
+    result = f(self, **kwargs)
     gc.collect()
     new_garbage = len(gc.garbage)
     if new_garbage > previous_garbage:
@@ -818,6 +806,7 @@ def assert_no_garbage_created(f):
     # not hold on to every object in other tests.
     gc.set_debug(previous_debug_flags)
     gc.enable()
+    return result
 
   return decorator
 
@@ -829,8 +818,8 @@ def _combine_named_parameters(**kwargs):
   can be computed using `times()`.
 
   Args:
-    **kwargs: keyword arguments of form `option=[possibilities, ...]`
-         or `option=the_only_possibility`.
+    **kwargs: keyword arguments of form `option=[possibilities, ...]` or
+      `option=the_only_possibility`.
 
   Returns:
     a list of dictionaries for each combination. Keys in the dictionaries are
@@ -868,8 +857,8 @@ def generate_combinations_with_testcase_name(**kwargs):
   parameterized tests.
 
   Args:
-    **kwargs: keyword arguments of form `option=[possibilities, ...]`
-         or `option=the_only_possibility`.
+    **kwargs: keyword arguments of form `option=[possibilities, ...]` or
+      `option=the_only_possibility`.
 
   Returns:
     a list of dictionaries for each combination. Keys in the dictionaries are
@@ -897,10 +886,10 @@ def run_all_in_graph_and_eager_modes(cls):
   """Execute all test methods in the given class with and without eager."""
   base_decorator = run_in_graph_and_eager_modes
   for name, value in cls.__dict__.copy().items():
-    if (callable(value) and
-        name.startswith(unittest.TestLoader.testMethodPrefix) and
-        not (name.startswith("testSkipEager")
-             or name.startswith("test_skip_eager"))):
+    if callable(value) and name.startswith(
+        unittest.TestLoader.testMethodPrefix) and not (
+            name.startswith("testSkipEager") or
+            name.startswith("test_skip_eager") or name == "test_session"):
       setattr(cls, name, base_decorator(value))
   return cls
 
@@ -938,13 +927,17 @@ def run_in_graph_and_eager_modes(func=None,
   eager execution enabled as it does when constructing a TensorFlow graph and
   executing the `z` tensor in a session.
 
+  `deprecated_graph_mode_only`, `run_v1_only`, `run_v2_only`, and
+  `run_in_graph_and_eager_modes` are available decorators for different
+  v1/v2/eager/graph combinations.
+
 
   Args:
     func: function to be annotated. If `func` is None, this method returns a
       decorator the can be applied to a function. If `func` is not None this
       returns the decorator applied to `func`.
-    config: An optional config_pb2.ConfigProto to use to configure the
-      session when executing graphs.
+    config: An optional config_pb2.ConfigProto to use to configure the session
+      when executing graphs.
     use_gpu: If True, attempt to run as many operations as possible on GPU.
     reset_test: If True, tearDown and SetUp the test case between the two
       executions of the test (once with and once without eager execution).
@@ -958,6 +951,7 @@ def run_in_graph_and_eager_modes(func=None,
       collected elsewhere in the unit test file will not work). Additionally,
       checks that nothing still has a reference to Tensors that the test
       allocated.
+
   Returns:
     Returns a decorator that will run the decorated test method twice:
     once by constructing and executing a graph in a session and once with
@@ -1018,9 +1012,10 @@ def py_func_if_in_function(f):
     if not ops.get_default_graph()._building_function:
       return f(*args, **kwds)
 
-    tensor_args, tensor_indices = zip(
-        *[(x, i) for i, x in enumerate(args)
-          if isinstance(x, (ops.Tensor, variables.Variable))])
+    tensor_args, tensor_indices = zip(*[(x, i)
+                                        for i, x in enumerate(args)
+                                        if isinstance(x, (ops.Tensor,
+                                                          variables.Variable))])
 
     def inner_f(*inner_tensor_args):
       my_args = list(args)
@@ -1054,42 +1049,58 @@ def also_run_as_tf_function(f):
   """
 
   def decorated(*args, **kwds):
+    def bound_f():
+      f(*args, **kwds)
     with context.eager_mode():
       # Running in eager mode
-      f(*args, **kwds)
-
-      defun_f = def_function.function(f)
-      defun_f(*args, **kwds)
+      bound_f()
+      # Running as TF function
+      # TODO(b/121143941): Remove the autograph override.
+      def_function.function(bound_f, autograph=False)()
 
   return decorated
 
 
-def run_deprecated_v1(func=None):
+def deprecated_graph_mode_only(func=None):
   """Execute the decorated test in graph mode.
 
-  This function returns a decorator intended to be applied to tests that have
-  not been updated to a style that is compatible with both TensorFlow 1.x and
-  2.x. When this decorated is applied, the test body will be run in
-  an environment where API calls construct graphs instead of executing eagerly.
+  This function returns a decorator intended to be applied to tests that are not
+  compatible with eager mode. When this decorator is applied, the test body will
+  be run in an environment where API calls construct graphs instead of executing
+  eagerly.
+
+  `deprecated_graph_mode_only`, `run_v1_only`, `run_v2_only`, and
+  `run_in_graph_and_eager_modes` are available decorators for different
+  v1/v2/eager/graph combinations.
 
   Args:
     func: function to be annotated. If `func` is None, this method returns a
       decorator the can be applied to a function. If `func` is not None this
       returns the decorator applied to `func`.
+
   Returns:
     Returns a decorator that will run the decorated test method in graph mode.
   """
 
   def decorator(f):
     if tf_inspect.isclass(f):
-      raise ValueError("`run_deprecated_v1` only supports test methods.")
+      setup = f.__dict__.get("setUp")
+      if setup is not None:
+        setattr(f, "setUp", decorator(setup))
+
+      for name, value in f.__dict__.copy().items():
+        if (callable(value) and
+            name.startswith(unittest.TestLoader.testMethodPrefix)):
+          setattr(f, name, decorator(value))
+
+      return f
 
     def decorated(self, *args, **kwargs):
       if tf2.enabled():
         with context.graph_mode():
-          f(self, *args, **kwargs)
+          return f(self, *args, **kwargs)
       else:
-        f(self, *args, **kwargs)
+        return f(self, *args, **kwargs)
 
     return decorated
 
@@ -1099,12 +1110,19 @@ def run_deprecated_v1(func=None):
   return decorator
 
 
+run_deprecated_v1 = deprecated_graph_mode_only
+
+
 def run_v1_only(reason, func=None):
   """Execute the decorated test only if running in v1 mode.
 
   This function is intended to be applied to tests that exercise v1 only
   functionality. If the test is run in v2 mode it will simply be skipped.
 
+  `deprecated_graph_mode_only`, `run_v1_only`, `run_v2_only`, and
+  `run_in_graph_and_eager_modes` are available decorators for different
+  v1/v2/eager/graph combinations.
+
   Args:
     reason: string giving a reason for limiting the test to v1 only.
     func: function to be annotated. If `func` is None, this method returns a
@@ -1132,7 +1150,7 @@ def run_v1_only(reason, func=None):
       if tf2.enabled():
         self.skipTest(reason)
 
-      f(self, *args, **kwargs)
+      return f(self, *args, **kwargs)
 
     return decorated
 
@@ -1148,6 +1166,10 @@ def run_v2_only(func=None):
   This function is intended to be applied to tests that exercise v2 only
   functionality. If the test is run in v1 mode it will simply be skipped.
 
+  `deprecated_graph_mode_only`, `run_v1_only`, `run_v2_only`, and
+  `run_in_graph_and_eager_modes` are available decorators for different
+  v1/v2/eager/graph combinations.
+
   Args:
     func: function to be annotated. If `func` is None, this method returns a
       decorator the can be applied to a function. If `func` is not None this
@@ -1165,7 +1187,7 @@ def run_v2_only(func=None):
       if not tf2.enabled():
         self.skipTest("Test is only comptaible in v2")
 
-      f(self, *args, **kwargs)
+      return f(self, *args, **kwargs)
 
     return decorated
 
@@ -1198,7 +1220,7 @@ def run_gpu_only(func=None):
       if not is_gpu_available():
         self.skipTest("Test requires GPU")
 
-      f(self, *args, **kwargs)
+      return f(self, *args, **kwargs)
 
     return decorated
 
@@ -1231,7 +1253,7 @@ def run_cuda_only(func=None):
       if not is_gpu_available(cuda_only=True):
         self.skipTest("Test requires CUDA GPU")
 
-      f(self, *args, **kwargs)
+      return f(self, *args, **kwargs)
 
     return decorated
 
@@ -1390,8 +1412,7 @@ class FakeEagerSession(object):
 
 
 class ErrorLoggingSession(session.Session):
-  """Wrapper around a Session that logs errors in run().
-  """
+  """Wrapper around a Session that logs errors in run()."""
 
   def run(self, *args, **kwargs):
     try:
@@ -1405,13 +1426,68 @@ class ErrorLoggingSession(session.Session):
       raise
 
 
+# The description is just for documentation purposes.
+def disable_xla(description):
+
+  def disable_xla_impl(func):
+    """Execute the test method only if xla is not enabled."""
+
+    def decorator(func):
+
+      def decorated(self, *args, **kwargs):
+        if is_xla_enabled():
+          return
+        else:
+          return func(self, *args, **kwargs)
+
+      return decorated
+
+    if func is not None:
+      return decorator(func)
+
+    return decorator
+
+  return disable_xla_impl
+
+
+# The description is just for documentation purposes.
+def disable_all_xla(description):
+
+  def disable_all_impl(cls):
+    """Execute all test methods in this class only if xla is not enabled."""
+    base_decorator = disable_xla
+    for name in dir(cls):
+      value = getattr(cls, name)
+      if callable(value) and name.startswith(
+          "test") and not name == "test_session":
+        setattr(cls, name, base_decorator(description)(value))
+    return cls
+
+  return disable_all_impl
+
+
+class EagerSessionWarner(object):
+
+  def __getattr__(self, attr):
+    raise AttributeError(
+        "Trying to access properties or call methods on the result of "
+        "self.session(), self.cached_session(), etc while eager execution "
+        "is enabled. If you're porting this test case to TF 2.0, either "
+        "adapt the test to work with eager execution or insert a call to "
+        "tf.disable_eager_execution() in the main() function of this test "
+        "file.")
+
+
 @tf_export("test.TestCase")
 class TensorFlowTestCase(googletest.TestCase):
-  """Base class for tests that need to test TensorFlow.
-  """
+  """Base class for tests that need to test TensorFlow."""
 
   def __init__(self, methodName="runTest"):  # pylint: disable=invalid-name
     super(TensorFlowTestCase, self).__init__(methodName)
+    if is_xla_enabled():
+      os.putenv(
+          "TF_XLA_FLAGS", "--tf_xla_auto_jit=2 --tf_xla_min_cluster_size=1 "
+          "--tf_xla_enable_lazy_compilation=false")
     self._threads = []
     self._tempdir = None
     self._cached_session = None
@@ -1489,9 +1565,9 @@ class TensorFlowTestCase(googletest.TestCase):
     ```
 
     Args:
-      stream: The stream whose writes should be captured. This
-        stream must have a file descriptor, support writing via using that
-        file descriptor, and must have a `.flush()` method.
+      stream: The stream whose writes should be captured. This stream must have
+        a file descriptor, support writing via using that file descriptor, and
+        must have a `.flush()` method.
 
     Yields:
       A `CapturedWrites` object that contains all writes to the specified stream
@@ -1582,8 +1658,13 @@ class TensorFlowTestCase(googletest.TestCase):
     else:
       try:
         if sparse_tensor.is_sparse(tensor):
-          return sparse_tensor.SparseTensorValue(tensor.indices, tensor.values,
-                                                 tensor.dense_shape)
+          return sparse_tensor.SparseTensorValue(tensor.indices.numpy(),
+                                                 tensor.values.numpy(),
+                                                 tensor.dense_shape.numpy())
+        elif isinstance(tensor, ops.IndexedSlices):
+          return ops.IndexedSlicesValue(values=tensor.values.numpy(),
+                                        indices=tensor.indices.numpy(),
+                                        dense_shape=tensor.dense_shape.numpy())
         return tensor.numpy()
       except AttributeError as e:
         six.raise_from(ValueError("Unsupported type %s." % type(tensor)), e)
@@ -1650,7 +1731,7 @@ class TensorFlowTestCase(googletest.TestCase):
       the graph building and execution code in a test case.
     """
     if context.executing_eagerly():
-      yield None
+      yield EagerSessionWarner()
     else:
       with self._create_session(graph, config, force_gpu) as sess:
         with self._constrain_devices_and_set_default(sess, use_gpu, force_gpu):
@@ -1840,7 +1921,6 @@ class TensorFlowTestCase(googletest.TestCase):
     self._threads.append(ret)
     return ret
 
-
   # pylint: enable=invalid-name
   @py_func_if_in_function
   def assertNear(self, f1, f2, err, msg=None):
@@ -1857,9 +1937,8 @@ class TensorFlowTestCase(googletest.TestCase):
     """
     # f1 == f2 is needed here as we might have: f1, f2 = inf, inf
     self.assertTrue(
-        f1 == f2 or math.fabs(f1 - f2) <= err,
-        "%f != %f +/- %f%s" % (f1, f2, err, " (%s)" % msg
-                               if msg is not None else ""))
+        f1 == f2 or math.fabs(f1 - f2) <= err, "%f != %f +/- %f%s" %
+        (f1, f2, err, " (%s)" % msg if msg is not None else ""))
 
   @py_func_if_in_function
   def assertArrayNear(self, farray1, farray2, err, msg=None):
@@ -1897,7 +1976,7 @@ class TensorFlowTestCase(googletest.TestCase):
     # If a is a tensor then convert it to ndarray
     if isinstance(a, ops.Tensor):
       if isinstance(a, ops._EagerTensorBase):
-        return a.numpy()
+        a = a.numpy()
       else:
         a = self.evaluate(a)
     if not isinstance(a, np.ndarray):
@@ -2028,11 +2107,11 @@ class TensorFlowTestCase(googletest.TestCase):
 
     Args:
       a: The expected numpy `ndarray`, or anything that can be converted into a
-         numpy `ndarray` (including Tensor), or any arbitrarily nested of
-         structure of these.
+        numpy `ndarray` (including Tensor), or any arbitrarily nested of
+        structure of these.
       b: The actual numpy `ndarray`, or anything that can be converted into a
-         numpy `ndarray` (including Tensor), or any arbitrarily nested of
-         structure of these.
+        numpy `ndarray` (including Tensor), or any arbitrarily nested of
+        structure of these.
       rtol: relative tolerance.
       atol: absolute tolerance.
       msg: Optional message to report on failure.
@@ -2160,8 +2239,8 @@ class TensorFlowTestCase(googletest.TestCase):
     """Assert element values are all greater than a target value.
 
     Args:
-      a: The numpy `ndarray`, or anything that can be converted into a
-         numpy `ndarray` (including Tensor).
+      a: The numpy `ndarray`, or anything that can be converted into a numpy
+        `ndarray` (including Tensor).
       comparison_target: The target value of comparison.
     """
     a = self._GetNdArray(a)
@@ -2172,8 +2251,8 @@ class TensorFlowTestCase(googletest.TestCase):
     """Assert element values are all less than a target value.
 
     Args:
-      a: The numpy `ndarray`, or anything that can be converted into a
-         numpy `ndarray` (including Tensor).
+      a: The numpy `ndarray`, or anything that can be converted into a numpy
+        `ndarray` (including Tensor).
       comparison_target: The target value of comparison.
     """
     a = self._GetNdArray(a)
@@ -2184,8 +2263,8 @@ class TensorFlowTestCase(googletest.TestCase):
     """Assert element values are all greater than or equal to a target value.
 
     Args:
-      a: The numpy `ndarray`, or anything that can be converted into a
-         numpy `ndarray` (including Tensor).
+      a: The numpy `ndarray`, or anything that can be converted into a numpy
+        `ndarray` (including Tensor).
       comparison_target: The target value of comparison.
     """
     a = self._GetNdArray(a)
@@ -2196,8 +2275,8 @@ class TensorFlowTestCase(googletest.TestCase):
     """Assert element values are all less than or equal to a target value.
 
     Args:
-      a: The numpy `ndarray`, or anything that can be converted into a
-         numpy `ndarray` (including Tensor).
+      a: The numpy `ndarray`, or anything that can be converted into a numpy
+        `ndarray` (including Tensor).
       comparison_target: The target value of comparison.
     """
     a = self._GetNdArray(a)
@@ -2245,7 +2324,7 @@ class TensorFlowTestCase(googletest.TestCase):
 
     Args:
       target: The numpy `ndarray`, or anything that can be converted into a
-         numpy `ndarray` (including Tensor).
+        numpy `ndarray` (including Tensor).
       lower_bound: lower bound of the range
       upper_bound: upper bound of the range
       open_lower_bound: (`bool`) whether the lower bound is open (i.e., > rather
@@ -2279,8 +2358,8 @@ class TensorFlowTestCase(googletest.TestCase):
                  str(upper_bound) + (")" if open_upper_bound else "]"))
 
     violations = (
-        np.less_equal(target, lower_bound)
-        if open_lower_bound else np.less(target, lower_bound))
+        np.less_equal(target, lower_bound) if open_lower_bound else np.less(
+            target, lower_bound))
     violations = np.logical_or(
         violations,
         np.greater_equal(target, upper_bound)
@@ -2299,7 +2378,7 @@ class TensorFlowTestCase(googletest.TestCase):
 
     Args:
       target: The numpy `ndarray`, or anything that can be converted into a
-         numpy `ndarray` (including Tensor).
+        numpy `ndarray` (including Tensor).
       expected_set: (`list`, `tuple` or `set`) The closed set that the elements
         of the value of `target` are expected to fall into.
 
@@ -2321,7 +2400,7 @@ class TensorFlowTestCase(googletest.TestCase):
 
     Args:
       target: The numpy `ndarray`, or anything that can be converted into a
-         numpy `ndarray` (including Tensor).
+        numpy `ndarray` (including Tensor).
       expected_dtype: Expected data type.
     """
     target = self._GetNdArray(target)
@@ -2342,9 +2421,9 @@ class TensorFlowTestCase(googletest.TestCase):
     Args:
       exception_type: The expected type of exception that should be raised.
       expected_err_re_or_predicate: If this is callable, it should be a function
-        of one argument that inspects the passed-in exception and
-        returns True (success) or False (please fail the test). Otherwise, the
-        error message is expected to match this regular expression partially.
+        of one argument that inspects the passed-in exception and returns True
+        (success) or False (please fail the test). Otherwise, the error message
+        is expected to match this regular expression partially.
 
     Returns:
       A context manager to surround code that is expected to raise an
@@ -2445,6 +2524,7 @@ class TensorFlowTestCase(googletest.TestCase):
 
   def _create_session(self, graph, config, force_gpu):
     """See session() for details."""
+
     def prepare_config(config):
       """Returns a config for sessions.
 
@@ -2547,10 +2627,10 @@ def create_local_cluster(num_workers,
   Args:
     num_workers: Number of worker servers to start.
     num_ps: Number of PS servers to start.
-    protocol: Communication protocol.  Allowed values are documented in
-      the documentation of `tf.train.Server`.
-    worker_config: (optional) ConfigProto to initialize workers. Can be used
-      to instantiate multiple devices etc.
+    protocol: Communication protocol.  Allowed values are documented in the
+      documentation of `tf.train.Server`.
+    worker_config: (optional) ConfigProto to initialize workers. Can be used to
+      instantiate multiple devices etc.
     ps_config: (optional) ConfigProto to initialize PS servers.
 
   Returns:
@@ -2620,42 +2700,3 @@ def set_producer_version(graph, producer_version):
   with graph.as_default():
     importer.import_graph_def(graph_def)
   assert graph.graph_def_versions.producer, producer_version
-
-
-def dismantle_func_graph(func_graph):
-  """Removes reference cycles in `func_graph` FuncGraph.
-
-  Helpful for making sure the garbage collector doesn't need to run when
-  the FuncGraph goes out of scope, e.g. in tests using defun with
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True).
-
-  Args:
-    func_graph: A `FuncGraph` object to destroy. `func_graph` is unusable
-      after this function.
-  """
-  # TODO(b/115366440): Delete this method when a custom OrderedDict is added.
-  # Clearing captures using clear() leaves some cycles around.
-  while func_graph.captures:
-    func_graph.captures.popitem()
-  memory.dismantle_ordered_dict(func_graph.captures)
-  ops.dismantle_graph(func_graph)
-
-
-def dismantle_polymorphic_function(func):
-  """Removes reference cycles in PolymorphicFunction `func`.
-
-  Helpful for making sure the garbage collector doesn't need to run when
-  PolymorphicFunction goes out of scope, e.g. in tests using defun with
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True).
-
-  Args:
-    func: A `PolymorphicFunction` object to destroy. `func` is unusable
-      after this function.
-  """
-  # TODO(b/115366440): Delete this method when a custom OrderedDict is added
-  cache = func._function_cache  # pylint: disable=protected-access
-  for concrete_func in cache.values():
-    dismantle_func_graph(concrete_func.graph)
-  while cache:
-    cache.popitem()
-  memory.dismantle_ordered_dict(cache)
diff --git a/tensorflow/python/grappler/cluster.i b/tensorflow/python/grappler/cluster.i
index 87795ffcfb5d21c408d646e581e19fe23a37b945..af9276c508b1db1e57a0dc8690cd5d6dfd0574e5 100644
--- a/tensorflow/python/grappler/cluster.i
+++ b/tensorflow/python/grappler/cluster.i
@@ -132,7 +132,7 @@ struct GCluster {
 
 static GCluster TF_NewCluster(bool allow_soft_placement,
                    bool disable_detailed_stats, TF_Status* out_status) {
-    int num_cpu_cores = tensorflow::grappler::GetNumAvailableLogicalCPUCores();
+  int num_cpu_cores = tensorflow::grappler::GetNumAvailableLogicalCPUCores();
   int num_gpus = tensorflow::grappler::GetNumAvailableGPUs();
   int timeout_s = 60 * 10;
   tensorflow::grappler::Cluster* cluster_ =
@@ -176,13 +176,13 @@ tensorflow::Status _GetOpPerformanceDataAndRunTime(
   tensorflow::Status status = cost_measure->Initialize(item);
   if (!status.ok()) return status;
 
-  tensorflow::CostGraphDef cost_graph;
+  tensorflow::RunMetadata run_metadata;
   TF_RETURN_IF_ERROR(
-      cost_measure->PredictCosts(item.graph, &cost_graph, costs));
+      cost_measure->PredictCosts(item.graph, &run_metadata, costs));
 
   if (op_performance_data) {
     *op_performance_data = tensorflow::grappler::CostGraphToOpPerformanceData(
-        cost_graph, item.graph);
+        run_metadata.cost_graph(), item.graph);
   }
   return tensorflow::Status::OK();
 }
diff --git a/tensorflow/python/grappler/cluster.py b/tensorflow/python/grappler/cluster.py
index 079d07115b31da86600821a098aec08ec60bf436..428b52402cffc16bd692cac5839494a617815236 100644
--- a/tensorflow/python/grappler/cluster.py
+++ b/tensorflow/python/grappler/cluster.py
@@ -71,26 +71,21 @@ class Cluster(object):
     return self._tf_cluster
 
   def ListDevices(self):
-    """Returns the list of available hardware devices."""
-    devices = []
-    if self._tf_cluster is not None:
-      ret_from_swig = tf_cluster.TF_ListDevices(self._tf_cluster)
-      devices = []
-      for raw_dev in ret_from_swig:
-        devices.append(device_properties_pb2.NamedDevice.FromString(raw_dev))
-    return devices
+    """Returns a list of available hardware devices."""
+    if self._tf_cluster is None:
+      return []
+    return [device_properties_pb2.NamedDevice.FromString(device)
+            for device in tf_cluster.TF_ListDevices(self._tf_cluster)]
 
   def ListAvailableOps(self):
-    """Returns a list of all the available operations (sorted alphatically)."""
+    """Returns a list of all available operations (sorted alphabetically)."""
     return tf_cluster.TF_ListAvailableOps()
 
   def GetSupportedDevices(self, item):
     return tf_cluster.TF_GetSupportedDevices(self._tf_cluster, item.tf_item)
 
   def EstimatePerformance(self, device):
-    """Estimate the performance of the specified device."""
-    serialized = device.SerializeToString()
-    return tf_cluster.TF_EstimatePerformance(serialized)
+    return tf_cluster.TF_EstimatePerformance(device.SerializeToString())
 
   def MeasureCosts(self, item):
     """Returns the cost of running the specified item.
@@ -107,10 +102,8 @@ class Cluster(object):
       return None
 
     op_perf_bytes_list, run_time, step_stats_bytes = ret_from_swig
-    op_perfs = []
-    for op_perf_bytes in op_perf_bytes_list:
-      op_perfs.append(
-          op_performance_data_pb2.OpPerformance.FromString(op_perf_bytes))
+    op_perfs = [op_performance_data_pb2.OpPerformance.FromString(op_perf_bytes)
+                for op_perf_bytes in op_perf_bytes_list]
     return (op_perfs, run_time,
             step_stats_pb2.StepStats.FromString(step_stats_bytes))
 
@@ -122,11 +115,9 @@ class Cluster(object):
     Returns: A hashtable indexed by device name.
     """
     with errors.raise_exception_on_not_ok_status() as status:
-      ret_from_swig = tf_cluster.TF_DeterminePeakMemoryUsage(
+      return tf_cluster.TF_DeterminePeakMemoryUsage(
           item.tf_item, self._tf_cluster, status)
 
-    return ret_from_swig
-
 
 @contextlib.contextmanager
 def Provision(allow_soft_placement=True,
diff --git a/tensorflow/python/grappler/cost_analyzer.cc b/tensorflow/python/grappler/cost_analyzer.cc
index b474e19894957d01c7c8978282c547df81a9b2b3..9aa5fbca383d126ebb927a7e47fc714503fcefed 100644
--- a/tensorflow/python/grappler/cost_analyzer.cc
+++ b/tensorflow/python/grappler/cost_analyzer.cc
@@ -42,9 +42,13 @@ Status CostAnalyzer::GenerateReport(std::ostream& os, bool per_node_report,
 void CostAnalyzer::PredictCosts(CostEstimator* cost_estimator,
                                 CostGraphDef* cost_graph, int64* total_time) {
   TF_CHECK_OK(cost_estimator->Initialize(*item_));
+  RunMetadata run_metadata;
   Costs costs;
   const Status status =
-      cost_estimator->PredictCosts(item_->graph, cost_graph, &costs);
+      cost_estimator->PredictCosts(item_->graph, &run_metadata, &costs);
+  if (cost_graph) {
+    cost_graph->Swap(run_metadata.mutable_cost_graph());
+  }
   *total_time = costs.execution_time.count();
   if (!status.ok()) {
     LOG(ERROR) << "Could not estimate the cost for item " << item_->id << ": "
diff --git a/tensorflow/python/grappler/item.i b/tensorflow/python/grappler/item.i
index 593d38206d127978f1982a0f2cc22e17daee1a3d..0d4f7de9f55b2bd13cd1ab7988b7f6c43d3e018c 100644
--- a/tensorflow/python/grappler/item.i
+++ b/tensorflow/python/grappler/item.i
@@ -272,7 +272,6 @@ static PyObject* TF_GetColocationGroups(GItem item) {
     if (!s.ok()) {
       continue;
     }
-    int i = 0;
     for (const auto& arg : op_def->input_arg()) {
       if (!arg.is_ref()) {
         continue;
diff --git a/tensorflow/python/grappler/layout_optimizer_test.py b/tensorflow/python/grappler/layout_optimizer_test.py
index 98f2e6d71816a4b6d8cd3f7fc836b09e5cc058a4..b6aee941b05991fe97b0570f60748406e17dc332 100644
--- a/tensorflow/python/grappler/layout_optimizer_test.py
+++ b/tensorflow/python/grappler/layout_optimizer_test.py
@@ -34,10 +34,10 @@ from tensorflow.python.grappler import cluster as gcluster
 from tensorflow.python.grappler import tf_optimizer
 from tensorflow.python.layers import convolutional as conv_layers
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import gen_nn_ops
+from tensorflow.python.ops import map_fn
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import random_ops
@@ -120,7 +120,7 @@ def _loop():
   x3 = random_ops.truncated_normal([1, 784], seed=0)
   x4 = random_ops.truncated_normal([1, 784], seed=0)
   elems = (x1, x2, x3, x4)
-  outputs = functional_ops.map_fn(_two_layer_model, elems, dtype=dtypes.float32)
+  outputs = map_fn.map_fn(_two_layer_model, elems, dtype=dtypes.float32)
   return outputs
 
 
@@ -131,8 +131,7 @@ def _loop_with_branch():
   x3 = random_ops.truncated_normal([1, 784], seed=0)
   x4 = random_ops.truncated_normal([1, 784], seed=0)
   elems = (x1, x2, x3, x4)
-  outputs = functional_ops.map_fn(
-      _model_with_branch, elems, dtype=dtypes.float32)
+  outputs = map_fn.map_fn(_model_with_branch, elems, dtype=dtypes.float32)
   return outputs
 
 
@@ -143,8 +142,7 @@ def _loop_with_vec_and_4d():
   x3 = random_ops.truncated_normal([1, 784], seed=0)
   x4 = random_ops.truncated_normal([1, 784], seed=0)
   elems = (x1, x2, x3, x4)
-  outputs = functional_ops.map_fn(
-      _model_with_vec_and_4d, elems, dtype=dtypes.float32)
+  outputs = map_fn.map_fn(_model_with_vec_and_4d, elems, dtype=dtypes.float32)
   return outputs
 
 
diff --git a/tensorflow/python/grappler/memory_optimizer_test.py b/tensorflow/python/grappler/memory_optimizer_test.py
index e2864ebb4df646262456f2d04e4a24bdd06482b7..a5d70d994d9eebd9bf3988258a9d0f9f88faae28 100644
--- a/tensorflow/python/grappler/memory_optimizer_test.py
+++ b/tensorflow/python/grappler/memory_optimizer_test.py
@@ -65,41 +65,42 @@ class MemoryOptimizerSwapTest(test.TestCase):
   @test_util.run_v1_only('b/120545219')
   def testSimpleSwap(self):
     """Check that the swap annotations are followed."""
-    a = variables.VariableV1(10, name='a')
-    b = variables.VariableV1(20, name='b')
-    c = math_ops.add_n([a, b], name='c')
-    d = math_ops.add_n([b, c], name='d')
-    train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
-    train_op.append(d)
+    with ops.device('/gpu:0'):
+      a = variables.VariableV1(10, name='a')
+      b = variables.VariableV1(20, name='b')
+      c = math_ops.add_n([a, b], name='c')
+      d = math_ops.add_n([b, c], name='d')
+      train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+      train_op.append(d)
 
-    d.op._set_attr('_swap_to_host', attr_value_pb2.AttrValue(i=0))
+      d.op._set_attr('_swap_to_host', attr_value_pb2.AttrValue(i=0))
 
-    mg = meta_graph.create_meta_graph_def(graph=ops.get_default_graph())
-    graph_size = len(mg.graph_def.node)
+      mg = meta_graph.create_meta_graph_def(graph=ops.get_default_graph())
+      graph_size = len(mg.graph_def.node)
 
-    config = config_pb2.ConfigProto()
-    config.graph_options.rewrite_options.CopyFrom(
-        rewriter_config_pb2.RewriterConfig(
-            disable_model_pruning=True,
-            meta_optimizer_iterations=rewriter_config_pb2.RewriterConfig.ONE,
-            constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
-            memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL,
-            min_graph_nodes=-1))
-    graph = tf_optimizer.OptimizeGraph(config, mg)
+      config = config_pb2.ConfigProto()
+      config.graph_options.rewrite_options.CopyFrom(
+          rewriter_config_pb2.RewriterConfig(
+              disable_model_pruning=True,
+              meta_optimizer_iterations=rewriter_config_pb2.RewriterConfig.ONE,
+              constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
+              memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL,
+              min_graph_nodes=-1))
+      graph = tf_optimizer.OptimizeGraph(config, mg)
 
-    self.assertEqual(len(graph.node), graph_size + 2)
-    self.assertTrue(
-        set([node.name for node in graph.node]) > set(
-            ['a', 'b', 'c', 'd', 'swap_in_d_0', 'swap_out_d_0']))
-    for node in graph.node:
-      if node.name == 'swap_in_d_0':
-        self.assertEqual('swap_out_d_0', node.input[0])
-        self.assertEqual('^b/read', node.input[1])
-      elif node.name == 'swap_out_d_0':
-        self.assertEqual('b/read', node.input[0])
-      elif node.name == 'd':
-        self.assertEqual('swap_in_d_0', node.input[0])
-        self.assertEqual('c', node.input[1])
+      self.assertEqual(len(graph.node), graph_size + 2)
+      self.assertTrue(
+          set([node.name for node in graph.node]) > set(
+              ['a', 'b', 'c', 'd', 'swap_in_d_0', 'swap_out_d_0']))
+      for node in graph.node:
+        if node.name == 'swap_in_d_0':
+          self.assertEqual('swap_out_d_0', node.input[0])
+          self.assertEqual('^b/read', node.input[1])
+        elif node.name == 'swap_out_d_0':
+          self.assertEqual('b/read', node.input[0])
+        elif node.name == 'd':
+          self.assertEqual('swap_in_d_0', node.input[0])
+          self.assertEqual('c', node.input[1])
 
 
 class MemoryOptimizerRecomputeTest(test.TestCase):
diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
index 36fea36389dc15104cca8a0d421ba50906295e9a..8743dbaa13c2e1d6b6b2241652002852b3c66a3d 100755
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -1,5 +1,7 @@
 # Description:
 #   Contains the Keras API (internal TensorFlow version).
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 
 licenses(["notice"])  # Apache 2.0
 
@@ -7,9 +9,6 @@ package(default_visibility = ["//visibility:public"])
 
 exports_files(["LICENSE"])
 
-load("//tensorflow:tensorflow.bzl", "py_test")
-load("//tensorflow:tensorflow.bzl", "cuda_py_test")
-
 config_setting(
     name = "empty_condition",
     values = {"define": "UNUSED=unused"},
@@ -42,6 +41,7 @@ py_library(
         "datasets/reuters.py",
         "estimator/__init__.py",
         "keras_parameterized.py",
+        "ops.py",
         "preprocessing/__init__.py",
         "preprocessing/image.py",
         "preprocessing/sequence.py",
@@ -61,7 +61,9 @@ py_library(
         ":engine",
         ":layers",
         ":pil_for_keras",
+        ":saving",
         "//tensorflow/python:training",
+        "//tensorflow/python/keras/mixed_precision/experimental:autocast_variable",
         "//tensorflow/python/keras/optimizer_v2",
         "//tensorflow/python/saved_model",
         "@keras_applications_archive//:keras_applications",
@@ -81,6 +83,7 @@ py_library(
     srcs = ["backend.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":backend_config",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
@@ -89,6 +92,7 @@ py_library(
         "//tensorflow/python:constant_op",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:ctc_ops",
+        "//tensorflow/python:distribute",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework",
         "//tensorflow/python:framework_ops",
@@ -96,7 +100,9 @@ py_library(
         "//tensorflow/python:gradients",
         "//tensorflow/python:image_ops",
         "//tensorflow/python:init_ops",
+        "//tensorflow/python:init_ops_v2",
         "//tensorflow/python:logging_ops",
+        "//tensorflow/python:map_fn",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:metrics",
         "//tensorflow/python:nn",
@@ -110,17 +116,22 @@ py_library(
         "//tensorflow/python:tensor_array_grad",
         "//tensorflow/python:tensor_array_ops",
         "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:training",
         "//tensorflow/python:util",
         "//tensorflow/python:variables",
+        "//tensorflow/python/distribute:distribute_coordinator",
     ],
 )
 
+py_library(
+    name = "backend_config",
+    srcs = ["backend_config.py"],
+    srcs_version = "PY2AND3",
+)
+
 py_library(
     name = "engine",
     srcs = [
-        "activations.py",
-        "callbacks.py",
-        "constraints.py",
         "engine/__init__.py",
         "engine/base_layer.py",
         "engine/base_layer_utils.py",
@@ -128,6 +139,7 @@ py_library(
         "engine/input_layer.py",
         "engine/input_spec.py",
         "engine/network.py",
+        "engine/partial_batch_padding_handler.py",
         "engine/saving.py",
         "engine/sequential.py",
         "engine/training.py",
@@ -136,12 +148,159 @@ py_library(
         "engine/training_eager.py",
         "engine/training_generator.py",
         "engine/training_utils.py",
+        "metrics.py",  # Need base_layer
+        "models.py",
+        "utils/metrics_utils.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":activations",
+        ":backend",
+        ":callbacks",
+        ":callbacks_v1",
+        ":constraints",
+        ":engine_utils",
+        ":initializers",
+        ":losses",
+        ":mode_keys",
+        ":optimizers",
+        ":regularizers",
+        ":saving",
+        "//tensorflow/python/data",
+        "//tensorflow/python/distribute:distribute_coordinator",
+        "//tensorflow/python/distribute:distribute_lib",
+        "//tensorflow/python/distribute:input_lib",
+        "//tensorflow/python/distribute:reduce_util",
+        "//tensorflow/python/training/tracking:data_structures",
+        "//tensorflow/tools/docs:doc_controls",
+        "@six_archive//:six",
+    ],
+)
+
+py_library(
+    name = "saving",
+    srcs = [
+        "saving/__init__.py",
+        "saving/hdf5_format.py",
+        "saving/model_config.py",
+        "saving/saved_model.py",
+        "saving/saving_utils.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":backend",
+        ":engine_utils",
+        ":mode_keys",
+        ":optimizers",
+        "//tensorflow/python:lib",
+        "//tensorflow/python:saver",
+        "//tensorflow/python/saved_model",
+        "//tensorflow/python/saved_model/model_utils",
+    ],
+)
+
+py_library(
+    name = "activations",
+    srcs = [
+        "activations.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":backend",
+        ":engine_utils",
+    ],
+)
+
+py_library(
+    name = "callbacks",
+    srcs = [
+        "callbacks.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":backend",
+        ":engine_utils",
+        ":mode_keys",
+    ],
+)
+
+py_library(
+    name = "callbacks_v1",
+    srcs = [
+        "callbacks_v1.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":backend",
+        ":engine_utils",
+    ],
+)
+
+py_library(
+    name = "constraints",
+    srcs = [
+        "constraints.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":backend",
+        ":engine_utils",
+    ],
+)
+
+py_library(
+    name = "initializers",
+    srcs = [
         "initializers.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":backend",
+        ":engine_utils",
+    ],
+)
+
+py_library(
+    name = "losses",
+    srcs = [
         "losses.py",
-        "metrics.py",
-        "models.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":backend",
+        ":engine_utils",
+    ],
+)
+
+py_library(
+    name = "optimizers",
+    srcs = [
         "optimizers.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":backend",
+        ":engine_utils",
+        "//tensorflow/python/keras/optimizer_v2",
+    ],
+)
+
+py_library(
+    name = "regularizers",
+    srcs = [
         "regularizers.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":backend",
+        ":engine_utils",
+    ],
+)
+
+py_library(
+    name = "engine_utils",
+    srcs = [
+        "utils/conv_utils.py",
         "utils/data_utils.py",
         "utils/io_utils.py",
         "utils/losses_utils.py",
@@ -149,12 +308,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":backend",
-        "//tensorflow/python/data",
-        "//tensorflow/python/distribute:reduce_util",
-        "//tensorflow/python/keras/optimizer_v2",
-        "//tensorflow/python/training/checkpointable:data_structures",
-        "//tensorflow/tools/docs:doc_controls",
-        "@six_archive//:six",
+        "//tensorflow/python/distribute:distribute_lib",
     ],
 )
 
@@ -168,6 +322,7 @@ py_library(
         "layers/core.py",
         "layers/cudnn_recurrent.py",
         "layers/embeddings.py",
+        "layers/kernelized.py",
         "layers/local.py",
         "layers/merge.py",
         "layers/noise.py",
@@ -176,204 +331,280 @@ py_library(
         "layers/recurrent.py",
         "layers/serialization.py",
         "layers/wrappers.py",
-        "utils/conv_utils.py",
-        "utils/generic_utils.py",
+        "utils/kernelized_utils.py",
         "utils/layer_utils.py",
         "utils/tf_utils.py",
     ],
     srcs_version = "PY2AND3",
     deps = [
         ":engine",
+        ":generic_utils",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:cudnn_rnn_ops_gen",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:embedding_ops",
         "//tensorflow/python:framework_ops",
-        "//tensorflow/python:logging_ops",
+        "//tensorflow/python:init_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn",
-        "//tensorflow/python:random_ops",
-        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:nn_ops",
+        "//tensorflow/python:platform",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:standard_ops",
-        "//tensorflow/python:tensor_array_ops",
         "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:tensor_util",
         "//tensorflow/python:util",
+        "//tensorflow/python:variables",
         "//tensorflow/python/distribute:distribute_lib",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_library(
+    name = "generic_utils",
+    srcs = [
+        "utils/generic_utils.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:util",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_library(
+    name = "mode_keys",
+    srcs = [
+        "utils/mode_keys.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python/saved_model/model_utils:mode_keys",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "integration_test",
     size = "medium",
     srcs = ["integration_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:layers",
-        "//tensorflow/python:nn",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:nn_ops",
     ],
+    shard_count = 12,
+    tags = ["notsan"],
 )
 
-py_test(
+tf_py_test(
     name = "activations_test",
     size = "small",
     srcs = ["activations_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:nn_ops",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "constraints_test",
     size = "small",
     srcs = ["constraints_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "initializers_test",
     size = "small",
     srcs = ["initializers_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:init_ops",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "regularizers_test",
-    size = "small",
+    size = "medium",
     srcs = ["regularizers_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
         "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "optimizers_test",
     size = "medium",
     srcs = ["optimizers_test.py"],
-    shard_count = 2,
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
+    additional_deps = [
         ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:training",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
     ],
+    shard_count = 8,
+    tags = ["notsan"],
 )
 
-py_test(
+tf_py_test(
     name = "losses_test",
     size = "small",
     srcs = ["losses_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
+    ],
+)
+
+tf_py_test(
+    name = "metrics_functional_test",
+    size = "small",
+    srcs = ["metrics_functional_test.py"],
+    additional_deps = [
+        ":keras",
         "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "metrics_test",
     size = "medium",
     srcs = ["metrics_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "manual",
-        "no_oss",
-        "notap",
-    ],
-    deps = [
+    additional_deps = [
         ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
+    ],
+    shard_count = 4,
+)
+
+tf_py_test(
+    name = "metrics_confusion_matrix_test",
+    size = "medium",
+    srcs = ["metrics_confusion_matrix_test.py"],
+    additional_deps = [
+        ":keras",
+        "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
+    shard_count = 4,
+)
+
+tf_py_test(
+    name = "metrics_correctness_test",
+    size = "medium",
+    srcs = ["metrics_correctness_test.py"],
+    additional_deps = [
+        ":keras",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 4,
 )
 
-py_test(
+tf_py_test(
     name = "applications_test",
-    size = "enormous",
+    size = "medium",
     srcs = ["applications/applications_test.py"],
-    shard_count = 2,
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
         "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 11,
 )
 
-py_test(
+tf_py_test(
     name = "advanced_activations_test",
     size = "medium",
     srcs = ["layers/advanced_activations_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
+        "@absl_py//absl/testing:parameterized",
         "//tensorflow/python:client_testlib",
+    ],
+)
+
+tf_py_test(
+    name = "tensorflow_op_layer_test",
+    size = "medium",
+    srcs = ["layers/tensorflow_op_layer_test.py"],
+    additional_deps = [
+        ":keras",
         "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 3,
 )
 
-py_test(
+tf_py_test(
     name = "convolutional_recurrent_test",
     size = "large",
     srcs = ["layers/convolutional_recurrent_test.py"],
-    shard_count = 2,
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 2,
 )
 
-py_test(
+cuda_py_test(
     name = "convolutional_test",
-    size = "large",
+    size = "medium",
     srcs = ["layers/convolutional_test.py"],
-    shard_count = 4,
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
+        "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
+    shard_count = 8,
+    tags = [
+        "manual",  # b/124471597
+        "notap",  # b/124471597
+    ],
+    xla_enable_strict_auto_jit = True,
+)
+
+cuda_py_test(
+    name = "convolutional_transpose_test",
+    size = "medium",
+    srcs = ["layers/convolutional_transpose_test.py"],
+    additional_deps = [
+        ":keras",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
 cuda_py_test(
     name = "cudnn_recurrent_test",
-    size = "large",
+    size = "medium",
     srcs = ["layers/cudnn_recurrent_test.py"],
     additional_deps = [
         ":keras",
@@ -381,34 +612,36 @@ cuda_py_test(
         "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
     ],
-    shard_count = 2,
-    tags = ["no_windows_gpu"],
+    shard_count = 4,
+    tags = [
+        "no_rocm",
+        "no_windows_gpu",
+    ],
 )
 
-py_test(
+tf_py_test(
     name = "pooling_test",
-    size = "large",
+    size = "medium",
     srcs = ["layers/pooling_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
         "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 8,
 )
 
-py_test(
+tf_py_test(
     name = "core_test",
     size = "medium",
     srcs = ["layers/core_test.py"],
-    shard_count = 2,
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 3,
 )
 
 cuda_py_test(
@@ -422,116 +655,124 @@ cuda_py_test(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "local_test",
     size = "medium",
     srcs = ["layers/local_test.py"],
-    shard_count = 2,
-    srcs_version = "PY2AND3",
-    tags = ["no_windows"],
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 2,
+    tags = ["no_windows"],
 )
 
-py_test(
+tf_py_test(
     name = "merge_test",
     size = "small",
     srcs = ["layers/merge_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "noise_test",
     size = "small",
     srcs = ["layers/noise_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
         "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "normalization_test",
     size = "medium",
     srcs = ["layers/normalization_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 3,
+    tags = ["notsan"],
 )
 
-py_test(
+tf_py_test(
     name = "simplernn_test",
     size = "medium",
     srcs = ["layers/simplernn_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 4,
+    tags = ["notsan"],
 )
 
-py_test(
+tf_py_test(
     name = "gru_test",
-    size = "large",
+    size = "medium",
     srcs = ["layers/gru_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],  # http://b/62136390
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 4,
+    tags = ["notsan"],  # http://b/62136390
 )
 
-py_test(
+tf_py_test(
     name = "lstm_test",
     size = "medium",
     srcs = ["layers/lstm_test.py"],
+    additional_deps = [
+        ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
     shard_count = 4,
-    srcs_version = "PY2AND3",
     tags = [
         "noasan",  # times out b/63678675
         "notsan",  # http://b/62189182
     ],
-    deps = [
-        ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
-    ],
 )
 
-py_test(
+tf_py_test(
     name = "recurrent_test",
-    size = "large",
+    size = "medium",
     srcs = ["layers/recurrent_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
+        "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
+    shard_count = 4,
+)
+
+cuda_py_test(
+    name = "separable_convolutional_test",
+    size = "medium",
+    srcs = ["layers/separable_convolutional_test.py"],
+    additional_deps = [
+        ":keras",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
@@ -545,58 +786,102 @@ cuda_py_test(
         "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
     ],
-    shard_count = 4,
+    shard_count = 8,
+    tags = ["no_rocm"],
 )
 
-py_test(
+cuda_py_test(
+    name = "unified_gru_test",
+    size = "medium",
+    srcs = ["layers/unified_gru_test.py"],
+    additional_deps = [
+        ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
+    shard_count = 8,
+    tags = ["no_rocm"],
+)
+
+tf_py_test(
     name = "serialization_test",
     size = "small",
     srcs = ["layers/serialization_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
+        "@absl_py//absl/testing:parameterized",
         "//tensorflow/python:client_testlib",
+    ],
+)
+
+tf_py_test(
+    name = "kernelized_test",
+    size = "small",
+    srcs = ["layers/kernelized_test.py"],
+    additional_deps = [
+        ":backend",
+        ":initializers",
+        ":keras",
+        ":layers",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:random_seed",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python/eager:context",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "wrappers_test",
     size = "medium",
     srcs = ["layers/wrappers_test.py"],
+    additional_deps = [
+        ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
     shard_count = 4,
-    srcs_version = "PY2AND3",
     tags = [
         "noasan",  # http://b/78599823
         "notsan",
     ],
-    deps = [
-        ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
-    ],
 )
 
-py_test(
+tf_py_test(
     name = "scikit_learn_test",
     size = "small",
     srcs = ["wrappers/scikit_learn_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    tags = ["notsan"],
 )
 
-py_test(
+tf_py_test(
     name = "data_utils_test",
-    size = "large",
+    size = "medium",
     srcs = ["utils/data_utils_test.py"],
-    srcs_version = "PY2AND3",
+    additional_deps = [
+        ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
+    shard_count = 6,
     tags = [
         "no_oss",
         "no_windows",
@@ -604,64 +889,87 @@ py_test(
         "notsan",
         "optonly",  # times out
     ],
-    deps = [
-        ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
-    ],
 )
 
-py_test(
+tf_py_test(
     name = "generic_utils_test",
     size = "small",
     srcs = ["utils/generic_utils_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
         "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "tf_utils_test",
     size = "small",
     srcs = ["utils/tf_utils_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
+    name = "composite_tensor_support_test",
+    size = "medium",
+    srcs = ["utils/composite_tensor_support_test.py"],
+    additional_deps = [
+        ":engine",
+        ":layers",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python/ops/ragged:ragged_tensor",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:sparse_tensor",
+    ],
+)
+
+tf_py_test(
     name = "io_utils_test",
     size = "small",
     srcs = ["utils/io_utils_test.py"],
-    srcs_version = "PY2AND3",
+    additional_deps = [
+        ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
     tags = [
         "no_windows",  # TODO: needs investigation on Windows
         "notsan",
     ],
-    deps = [
-        ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
-    ],
 )
 
-py_test(
+tf_py_test(
     name = "np_utils_test",
     size = "small",
     srcs = ["utils/np_utils_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
+        "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+tf_py_test(
+    name = "kernelized_utils_test",
+    size = "small",
+    srcs = ["utils/kernelized_utils_test.py"],
+    additional_deps = [
+        ":layers",
         "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
     ],
 )
 
@@ -692,277 +1000,339 @@ cuda_py_test(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "conv_utils_test",
     size = "small",
     srcs = ["utils/conv_utils_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "image_test",
     size = "medium",
     srcs = ["preprocessing/image_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "sequence_test",
     size = "small",
     srcs = ["preprocessing/sequence_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "text_test",
     size = "small",
     srcs = ["preprocessing/text_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "callbacks_test",
     size = "medium",
     srcs = ["callbacks_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
+    additional_deps = [
         ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
+    ],
+    shard_count = 4,
+    tags = ["notsan"],
+)
+
+tf_py_test(
+    name = "callbacks_v1_test",
+    size = "medium",
+    srcs = ["callbacks_v1_test.py"],
+    additional_deps = [
+        ":keras",
+        "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
+    tags = ["notsan"],
+)
+
+tf_py_test(
+    name = "correctness_test",
+    size = "medium",
+    srcs = ["engine/correctness_test.py"],
+    additional_deps = [
+        ":keras",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 2,
+    tags = ["notsan"],
 )
 
-py_test(
+tf_py_test(
     name = "training_test",
     size = "medium",
     srcs = ["engine/training_test.py"],
-    shard_count = 16,
-    srcs_version = "PY2AND3",
-    tags = [
-        "manual",  # TODO(b/120560388)
-        "no_oss",  # TODO(b/120560388)
-        "notap",  # TODO(b/120560388)
-        "notsan",
-    ],
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 16,
+    tags = ["notsan"],
 )
 
-py_test(
+tf_py_test(
     name = "training_dataset_test",
     size = "medium",
     srcs = ["engine/training_dataset_test.py"],
-    shard_count = 4,
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
+        "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
+    shard_count = 4,
+)
+
+tf_py_test(
+    name = "training_arrays_test",
+    size = "small",
+    srcs = ["engine/training_arrays_test.py"],
+    additional_deps = [
+        ":keras",
+        ":layers",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "training_generator_test",
-    size = "enormous",
+    size = "medium",
     srcs = ["engine/training_generator_test.py"],
-    shard_count = 3,
-    srcs_version = "PY2AND3",
+    additional_deps = [
+        ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
+    shard_count = 6,
     tags = [
         "no_oss",
+        "notap",  #TODO(b/123544294): Re-enable this test.
         "notsan",
     ],
-    deps = [
-        ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
-    ],
 )
 
-py_test(
+tf_py_test(
     name = "feature_columns_integration_test",
     size = "small",
     srcs = ["engine/feature_columns_integration_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
+    additional_deps = [
         ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python/feature_column:feature_column_py",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
     ],
+    tags = ["notsan"],
 )
 
-py_test(
+tf_py_test(
     name = "training_eager_test",
     size = "medium",
     srcs = ["engine/training_eager_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    tags = ["notsan"],
 )
 
-py_test(
+tf_py_test(
     name = "training_utils_test",
     size = "medium",
     srcs = ["engine/training_utils_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    tags = ["notsan"],
 )
 
-py_test(
+tf_py_test(
     name = "model_subclassing_test",
     size = "medium",
     srcs = ["model_subclassing_test.py"],
-    shard_count = 2,
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 4,
+    tags = ["notsan"],
 )
 
-py_test(
+tf_py_test(
     name = "topology_test",
-    size = "small",
+    size = "medium",
     srcs = ["engine/topology_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no-internal-py3",
-    ],
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
+    tags = [
+        "no-internal-py3",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "base_layer_test",
     size = "small",
     srcs = ["engine/base_layer_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    tags = ["no_rocm"],
 )
 
-py_test(
-    name = "saving_test",
+tf_py_test(
+    name = "hdf5_format_test",
     size = "medium",
-    srcs = ["engine/saving_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    srcs = ["saving/hdf5_format_test.py"],
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    shard_count = 4,
 )
 
-py_test(
+tf_py_test(
     name = "sequential_test",
     size = "medium",
     srcs = ["engine/sequential_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
-        "//tensorflow/python:client_testlib",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "models_test",
     size = "medium",
     srcs = ["models_test.py"],
-    shard_count = 2,
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],  # b/67509773
-    deps = [
+    additional_deps = [
         ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:training",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
     ],
+    shard_count = 8,
+    tags = ["notsan"],  # b/67509773
 )
 
-py_test(
+tf_py_test(
     name = "backend_test",
     size = "medium",
     srcs = ["backend_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:util",
+    ],
+)
+
+tf_py_test(
+    name = "backend_config_test",
+    size = "medium",
+    srcs = ["backend_config_test.py"],
+    additional_deps = [
+        ":keras",
         "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:util",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "keras_parameterized_test",
     size = "small",
     srcs = ["keras_parameterized_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
+    additional_deps = [
         ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
+    ],
+    tags = ["notsan"],
+)
+
+tf_py_test(
+    name = "saved_model_test",
+    size = "medium",
+    srcs = ["saving/saved_model_test.py"],
+    additional_deps = [
+        ":keras",
+        "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
+    tags = [
+        "no_oss",  # TODO(b/119349471): Re-enable
+        "no_windows",
+    ],
+)
+
+tf_py_test(
+    name = "saving_utils_test",
+    size = "medium",
+    srcs = ["saving/saving_utils_test.py"],
+    additional_deps = [
+        ":keras",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
     ],
+    tags = ["notsan"],
 )
diff --git a/tensorflow/python/keras/__init__.py b/tensorflow/python/keras/__init__.py
index be46a894e1b9979ea682aa2b635dc68da35c6097..b7ec63837d92d11258a88b870e5af5be04c32e5e 100644
--- a/tensorflow/python/keras/__init__.py
+++ b/tensorflow/python/keras/__init__.py
@@ -25,6 +25,7 @@ from tensorflow.python.keras import activations
 from tensorflow.python.keras import applications
 from tensorflow.python.keras import backend
 from tensorflow.python.keras import callbacks
+from tensorflow.python.keras import callbacks_v1
 from tensorflow.python.keras import constraints
 from tensorflow.python.keras import datasets
 from tensorflow.python.keras import estimator
@@ -33,6 +34,7 @@ from tensorflow.python.keras import layers
 from tensorflow.python.keras import losses
 from tensorflow.python.keras import metrics
 from tensorflow.python.keras import models
+from tensorflow.python.keras import ops
 from tensorflow.python.keras import optimizers
 from tensorflow.python.keras import preprocessing
 from tensorflow.python.keras import regularizers
@@ -42,11 +44,11 @@ from tensorflow.python.keras.layers import Input
 from tensorflow.python.keras.models import Model
 from tensorflow.python.keras.models import Sequential
 
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 __version__ = '2.2.4-tf'
 
-tf_export('keras.__version__').export_constant(__name__, '__version__')
+keras_export('keras.__version__').export_constant(__name__, '__version__')
 
 del absolute_import
 del division
diff --git a/tensorflow/python/keras/activations.py b/tensorflow/python/keras/activations.py
index d69791ce8d6b328067610f70c91373da5288d7d6..a10629a5fcf0ae042e7c456af288b1b4141f67b8 100644
--- a/tensorflow/python/keras/activations.py
+++ b/tensorflow/python/keras/activations.py
@@ -24,10 +24,23 @@ from tensorflow.python.keras import backend as K
 from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
+# b/123041942
+# In TF 2.x, if the `tf.nn.softmax` is used as an activation function in Keras
+# layers, it gets serialized as 'softmax_v2' instead of 'softmax' as the
+# internal method name is returned in serialization. This results in errors in
+# model exporting and loading as Keras can't find any activation function with
+# the name of `softmax_v2`.
 
-@tf_export('keras.activations.softmax')
+# This dict maps the activation function name from its v2 version to its
+# canonical name.
+_TF_ACTIVATIONS_V2 = {
+    'softmax_v2': 'softmax',
+}
+
+
+@keras_export('keras.activations.softmax')
 def softmax(x, axis=-1):
   """Softmax activation function.
 
@@ -53,7 +66,7 @@ def softmax(x, axis=-1):
                      'Received input: %s' % (x,))
 
 
-@tf_export('keras.activations.elu')
+@keras_export('keras.activations.elu')
 def elu(x, alpha=1.0):
   """Exponential linear unit.
 
@@ -72,7 +85,7 @@ def elu(x, alpha=1.0):
   return K.elu(x, alpha)
 
 
-@tf_export('keras.activations.selu')
+@keras_export('keras.activations.selu')
 def selu(x):
   """Scaled Exponential Linear Unit (SELU).
 
@@ -101,7 +114,7 @@ def selu(x):
   return scale * K.elu(x, alpha)
 
 
-@tf_export('keras.activations.softplus')
+@keras_export('keras.activations.softplus')
 def softplus(x):
   """Softplus activation function.
 
@@ -114,7 +127,7 @@ def softplus(x):
   return nn.softplus(x)
 
 
-@tf_export('keras.activations.softsign')
+@keras_export('keras.activations.softsign')
 def softsign(x):
   """Softsign activation function.
 
@@ -127,7 +140,7 @@ def softsign(x):
   return nn.softsign(x)
 
 
-@tf_export('keras.activations.relu')
+@keras_export('keras.activations.relu')
 def relu(x, alpha=0., max_value=None, threshold=0):
   """Rectified Linear Unit.
 
@@ -150,22 +163,22 @@ def relu(x, alpha=0., max_value=None, threshold=0):
   return K.relu(x, alpha=alpha, max_value=max_value, threshold=threshold)
 
 
-@tf_export('keras.activations.tanh')
+@keras_export('keras.activations.tanh')
 def tanh(x):
   return nn.tanh(x)
 
 
-@tf_export('keras.activations.sigmoid')
+@keras_export('keras.activations.sigmoid')
 def sigmoid(x):
   return nn.sigmoid(x)
 
 
-@tf_export('keras.activations.exponential')
+@keras_export('keras.activations.exponential')
 def exponential(x):
   return math_ops.exp(x)
 
 
-@tf_export('keras.activations.hard_sigmoid')
+@keras_export('keras.activations.hard_sigmoid')
 def hard_sigmoid(x):
   """Hard sigmoid activation function.
 
@@ -183,17 +196,19 @@ def hard_sigmoid(x):
   return K.hard_sigmoid(x)
 
 
-@tf_export('keras.activations.linear')
+@keras_export('keras.activations.linear')
 def linear(x):
   return x
 
 
-@tf_export('keras.activations.serialize')
+@keras_export('keras.activations.serialize')
 def serialize(activation):
+  if activation.__name__ in _TF_ACTIVATIONS_V2:
+    return _TF_ACTIVATIONS_V2[activation.__name__]
   return activation.__name__
 
 
-@tf_export('keras.activations.deserialize')
+@keras_export('keras.activations.deserialize')
 def deserialize(name, custom_objects=None):
   return deserialize_keras_object(
       name,
@@ -202,7 +217,7 @@ def deserialize(name, custom_objects=None):
       printable_module_name='activation function')
 
 
-@tf_export('keras.activations.get')
+@keras_export('keras.activations.get')
 def get(identifier):
   if identifier is None:
     return linear
diff --git a/tensorflow/python/keras/activations_test.py b/tensorflow/python/keras/activations_test.py
index 6b7bfb698b8abef4a3e0ac115f2f247103b92abc..9d2195480dd105c6d349b7a372606812520955f9 100644
--- a/tensorflow/python/keras/activations_test.py
+++ b/tensorflow/python/keras/activations_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import nn_ops as nn
 from tensorflow.python.platform import test
 
 
@@ -31,6 +32,7 @@ def _ref_softmax(values):
   return e / np.sum(e)
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class KerasActivationsTest(test.TestCase):
 
   def test_serialization(self):
@@ -45,13 +47,20 @@ class KerasActivationsTest(test.TestCase):
       fn = keras.activations.deserialize(config)
       assert fn == ref_fn
 
+  def test_serialization_v2(self):
+    activation_map = {nn.softmax_v2: 'softmax'}
+    for fn_v2_key in activation_map:
+      fn_v2 = keras.activations.get(fn_v2_key)
+      config = keras.activations.serialize(fn_v2)
+      fn = keras.activations.deserialize(config)
+      assert fn.__name__ == activation_map[fn_v2_key]
+
   def test_softmax(self):
-    with self.cached_session():
-      x = keras.backend.placeholder(ndim=2)
-      f = keras.backend.function([x], [keras.activations.softmax(x)])
-      test_values = np.random.random((2, 5))
+    x = keras.backend.placeholder(ndim=2)
+    f = keras.backend.function([x], [keras.activations.softmax(x)])
+    test_values = np.random.random((2, 5))
 
-      result = f([test_values])[0]
+    result = f([test_values])[0]
     expected = _ref_softmax(test_values[0])
     self.assertAllClose(result[0], expected, rtol=1e-05)
 
@@ -60,40 +69,36 @@ class KerasActivationsTest(test.TestCase):
       keras.activations.softmax(x)
 
   def test_temporal_softmax(self):
-    with self.cached_session():
-      x = keras.backend.placeholder(shape=(2, 2, 3))
-      f = keras.backend.function([x], [keras.activations.softmax(x)])
-      test_values = np.random.random((2, 2, 3)) * 10
-      result = f([test_values])[0]
+    x = keras.backend.placeholder(shape=(2, 2, 3))
+    f = keras.backend.function([x], [keras.activations.softmax(x)])
+    test_values = np.random.random((2, 2, 3)) * 10
+    result = f([test_values])[0]
     expected = _ref_softmax(test_values[0, 0])
     self.assertAllClose(result[0, 0], expected, rtol=1e-05)
 
-  @test_util.run_deprecated_v1
   def test_selu(self):
     x = keras.backend.placeholder(ndim=2)
     f = keras.backend.function([x], [keras.activations.selu(x)])
     alpha = 1.6732632423543772848170429916717
     scale = 1.0507009873554804934193349852946
 
-    with self.cached_session():
-      positive_values = np.array([[1, 2]], dtype=keras.backend.floatx())
-      result = f([positive_values])[0]
-      self.assertAllClose(result, positive_values * scale, rtol=1e-05)
+    positive_values = np.array([[1, 2]], dtype=keras.backend.floatx())
+    result = f([positive_values])[0]
+    self.assertAllClose(result, positive_values * scale, rtol=1e-05)
 
-      negative_values = np.array([[-1, -2]], dtype=keras.backend.floatx())
-      result = f([negative_values])[0]
-      true_result = (np.exp(negative_values) - 1) * scale * alpha
-      self.assertAllClose(result, true_result)
+    negative_values = np.array([[-1, -2]], dtype=keras.backend.floatx())
+    result = f([negative_values])[0]
+    true_result = (np.exp(negative_values) - 1) * scale * alpha
+    self.assertAllClose(result, true_result)
 
   def test_softplus(self):
     def softplus(x):
       return np.log(np.ones_like(x) + np.exp(x))
 
-    with self.cached_session():
-      x = keras.backend.placeholder(ndim=2)
-      f = keras.backend.function([x], [keras.activations.softplus(x)])
-      test_values = np.random.random((2, 5))
-      result = f([test_values])[0]
+    x = keras.backend.placeholder(ndim=2)
+    f = keras.backend.function([x], [keras.activations.softplus(x)])
+    test_values = np.random.random((2, 5))
+    result = f([test_values])[0]
     expected = softplus(test_values)
     self.assertAllClose(result, expected, rtol=1e-05)
 
@@ -101,11 +106,10 @@ class KerasActivationsTest(test.TestCase):
     def softsign(x):
       return np.divide(x, np.ones_like(x) + np.absolute(x))
 
-    with self.cached_session():
-      x = keras.backend.placeholder(ndim=2)
-      f = keras.backend.function([x], [keras.activations.softsign(x)])
-      test_values = np.random.random((2, 5))
-      result = f([test_values])[0]
+    x = keras.backend.placeholder(ndim=2)
+    f = keras.backend.function([x], [keras.activations.softsign(x)])
+    test_values = np.random.random((2, 5))
+    result = f([test_values])[0]
     expected = softsign(test_values)
     self.assertAllClose(result, expected, rtol=1e-05)
 
@@ -118,68 +122,60 @@ class KerasActivationsTest(test.TestCase):
         return z / (1 + z)
     sigmoid = np.vectorize(ref_sigmoid)
 
-    with self.cached_session():
-      x = keras.backend.placeholder(ndim=2)
-      f = keras.backend.function([x], [keras.activations.sigmoid(x)])
-      test_values = np.random.random((2, 5))
-      result = f([test_values])[0]
+    x = keras.backend.placeholder(ndim=2)
+    f = keras.backend.function([x], [keras.activations.sigmoid(x)])
+    test_values = np.random.random((2, 5))
+    result = f([test_values])[0]
     expected = sigmoid(test_values)
     self.assertAllClose(result, expected, rtol=1e-05)
 
-  @test_util.run_deprecated_v1
   def test_hard_sigmoid(self):
     def ref_hard_sigmoid(x):
       x = (x * 0.2) + 0.5
       z = 0.0 if x <= 0 else (1.0 if x >= 1 else x)
       return z
     hard_sigmoid = np.vectorize(ref_hard_sigmoid)
-    with self.cached_session():
-      x = keras.backend.placeholder(ndim=2)
-      f = keras.backend.function([x], [keras.activations.hard_sigmoid(x)])
-      test_values = np.random.random((2, 5))
-      result = f([test_values])[0]
+    x = keras.backend.placeholder(ndim=2)
+    f = keras.backend.function([x], [keras.activations.hard_sigmoid(x)])
+    test_values = np.random.random((2, 5))
+    result = f([test_values])[0]
     expected = hard_sigmoid(test_values)
     self.assertAllClose(result, expected, rtol=1e-05)
 
   def test_relu(self):
-    with self.cached_session():
-      x = keras.backend.placeholder(ndim=2)
-      f = keras.backend.function([x], [keras.activations.relu(x)])
-      test_values = np.random.random((2, 5))
-      result = f([test_values])[0]
+    x = keras.backend.placeholder(ndim=2)
+    f = keras.backend.function([x], [keras.activations.relu(x)])
+    test_values = np.random.random((2, 5))
+    result = f([test_values])[0]
     # No negative values in test values...
     self.assertAllClose(result, test_values, rtol=1e-05)
 
-  @test_util.run_deprecated_v1
   def test_elu(self):
-    with self.cached_session():
-      x = keras.backend.placeholder(ndim=2)
-      f = keras.backend.function([x], [keras.activations.elu(x, 0.5)])
-      test_values = np.random.random((2, 5))
-      result = f([test_values])[0]
-      self.assertAllClose(result, test_values, rtol=1e-05)
-      negative_values = np.array([[-1, -2]], dtype=keras.backend.floatx())
-      result = f([negative_values])[0]
-      true_result = (np.exp(negative_values) - 1) / 2
+    x = keras.backend.placeholder(ndim=2)
+    f = keras.backend.function([x], [keras.activations.elu(x, 0.5)])
+    test_values = np.random.random((2, 5))
+    result = f([test_values])[0]
+    self.assertAllClose(result, test_values, rtol=1e-05)
+    negative_values = np.array([[-1, -2]], dtype=keras.backend.floatx())
+    result = f([negative_values])[0]
+    true_result = (np.exp(negative_values) - 1) / 2
     self.assertAllClose(result, true_result)
 
   def test_tanh(self):
-    with self.cached_session():
-      test_values = np.random.random((2, 5))
-      x = keras.backend.placeholder(ndim=2)
-      exp = keras.activations.tanh(x)
-      f = keras.backend.function([x], [exp])
-      result = f([test_values])[0]
+    test_values = np.random.random((2, 5))
+    x = keras.backend.placeholder(ndim=2)
+    exp = keras.activations.tanh(x)
+    f = keras.backend.function([x], [exp])
+    result = f([test_values])[0]
     expected = np.tanh(test_values)
     self.assertAllClose(result, expected, rtol=1e-05)
 
   def test_exponential(self):
-    with self.cached_session():
-      test_values = np.random.random((2, 5))
-      x = keras.backend.placeholder(ndim=2)
-      exp = keras.activations.exponential(x)
-      f = keras.backend.function([x], [exp])
-      result = f([test_values])[0]
+    test_values = np.random.random((2, 5))
+    x = keras.backend.placeholder(ndim=2)
+    exp = keras.activations.exponential(x)
+    f = keras.backend.function([x], [exp])
+    result = f([test_values])[0]
     expected = np.exp(test_values)
     self.assertAllClose(result, expected, rtol=1e-05)
 
diff --git a/tensorflow/python/keras/api/BUILD b/tensorflow/python/keras/api/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..259cadcb321908cbd5d678b2574d8ef12105edd7
--- /dev/null
+++ b/tensorflow/python/keras/api/BUILD
@@ -0,0 +1,60 @@
+# Description:
+# Package for TensorFlow.
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0 License
+
+load("//tensorflow/python/tools/api/generator:api_gen.bzl", "gen_api_init_files")
+load("//tensorflow/python/tools/api/generator:api_init_files.bzl", "KERAS_API_INIT_FILES")
+load("//tensorflow/python/tools/api/generator:api_init_files_v1.bzl", "KERAS_API_INIT_FILES_V1")
+
+gen_api_init_files(
+    name = "keras_python_api_gen",
+    api_name = "keras",
+    api_version = 1,
+    output_files = KERAS_API_INIT_FILES_V1,
+    output_package = "tensorflow.python.keras.api",
+    package_deps = [
+        "//tensorflow/python/keras",
+        "//tensorflow/python:no_contrib",
+    ],
+    packages = [
+        "tensorflow.python",
+        "tensorflow.python.keras",
+    ],
+)
+
+gen_api_init_files(
+    name = "keras_python_api_gen_compat_v1",
+    api_name = "keras",
+    api_version = 1,
+    output_dir = "_v1/",
+    output_files = KERAS_API_INIT_FILES_V1,
+    output_package = "tensorflow.python.keras.api._v1",
+    package_deps = [
+        "//tensorflow/python/keras",
+        "//tensorflow/python:no_contrib",
+    ],
+    packages = [
+        "tensorflow.python",
+        "tensorflow.python.keras",
+    ],
+)
+
+gen_api_init_files(
+    name = "keras_python_api_gen_compat_v2",
+    api_name = "keras",
+    api_version = 2,
+    output_dir = "_v2/",
+    output_files = KERAS_API_INIT_FILES,
+    output_package = "tensorflow.python.keras.api._v2",
+    package_deps = [
+        "//tensorflow/python/keras",
+        "//tensorflow/python:no_contrib",
+    ],
+    packages = [
+        "tensorflow.python",
+        "tensorflow.python.keras",
+    ],
+)
diff --git a/tensorflow/python/keras/applications/applications_test.py b/tensorflow/python/keras/applications/applications_test.py
index b15ca5990aef9bed088cccd0dea1be049386eaf2..ad6b58992a9195975fbee8d4b81d8f810ab6e3df 100644
--- a/tensorflow/python/keras/applications/applications_test.py
+++ b/tensorflow/python/keras/applications/applications_test.py
@@ -32,13 +32,11 @@ MODEL_LIST = [
     (applications.InceptionV3, 2048),
     (applications.InceptionResNetV2, 1536),
     (applications.MobileNet, 1024),
-    # TODO(fchollet): enable MobileNetV2 tests when a new TensorFlow test image
-    # is released with keras_applications upgraded to 1.0.5 or above.
+    (applications.MobileNetV2, 1280),
     (applications.DenseNet121, 1024),
     (applications.DenseNet169, 1664),
     (applications.DenseNet201, 1920),
     (applications.NASNetMobile, 1056),
-    (applications.NASNetLarge, 4032),
 ]
 
 
@@ -47,7 +45,8 @@ class ApplicationsTest(test.TestCase, parameterized.TestCase):
   @parameterized.parameters(*MODEL_LIST)
   def test_feature_extration_model(self, model_fn, output_dim):
     model = model_fn(include_top=False, weights=None)
-    self.assertEqual(model.output_shape, (None, None, None, output_dim))
+    self.assertLen(model.output_shape, 4)
+    self.assertEqual(model.output_shape[-1], output_dim)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/applications/densenet.py b/tensorflow/python/keras/applications/densenet.py
index 172848bbdbe0dec6457961d15bdad756453187c1..9404968c81026b26f8aaeb91d1da006eddfd0468 100644
--- a/tensorflow/python/keras/applications/densenet.py
+++ b/tensorflow/python/keras/applications/densenet.py
@@ -22,37 +22,37 @@ from __future__ import print_function
 from keras_applications import densenet
 
 from tensorflow.python.keras.applications import keras_modules_injection
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 
-@tf_export('keras.applications.densenet.DenseNet121',
-           'keras.applications.DenseNet121')
+@keras_export('keras.applications.densenet.DenseNet121',
+              'keras.applications.DenseNet121')
 @keras_modules_injection
 def DenseNet121(*args, **kwargs):
   return densenet.DenseNet121(*args, **kwargs)
 
 
-@tf_export('keras.applications.densenet.DenseNet169',
-           'keras.applications.DenseNet169')
+@keras_export('keras.applications.densenet.DenseNet169',
+              'keras.applications.DenseNet169')
 @keras_modules_injection
 def DenseNet169(*args, **kwargs):
   return densenet.DenseNet169(*args, **kwargs)
 
 
-@tf_export('keras.applications.densenet.DenseNet201',
-           'keras.applications.DenseNet201')
+@keras_export('keras.applications.densenet.DenseNet201',
+              'keras.applications.DenseNet201')
 @keras_modules_injection
 def DenseNet201(*args, **kwargs):
   return densenet.DenseNet201(*args, **kwargs)
 
 
-@tf_export('keras.applications.densenet.decode_predictions')
+@keras_export('keras.applications.densenet.decode_predictions')
 @keras_modules_injection
 def decode_predictions(*args, **kwargs):
   return densenet.decode_predictions(*args, **kwargs)
 
 
-@tf_export('keras.applications.densenet.preprocess_input')
+@keras_export('keras.applications.densenet.preprocess_input')
 @keras_modules_injection
 def preprocess_input(*args, **kwargs):
   return densenet.preprocess_input(*args, **kwargs)
diff --git a/tensorflow/python/keras/applications/imagenet_utils.py b/tensorflow/python/keras/applications/imagenet_utils.py
index c25b5c2bdd019b8816f6c83e64c1cb1cb106bff2..d60afe43bc3f5f0a2668c1b06936f1bf72c3d25f 100644
--- a/tensorflow/python/keras/applications/imagenet_utils.py
+++ b/tensorflow/python/keras/applications/imagenet_utils.py
@@ -21,16 +21,16 @@ from __future__ import print_function
 from keras_applications import imagenet_utils
 
 from tensorflow.python.keras.applications import keras_modules_injection
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 
-@tf_export('keras.applications.imagenet_utils.preprocess_input')
+@keras_export('keras.applications.imagenet_utils.preprocess_input')
 @keras_modules_injection
 def decode_predictions(*args, **kwargs):
   return imagenet_utils.decode_predictions(*args, **kwargs)
 
 
-@tf_export('keras.applications.imagenet_utils.preprocess_input')
+@keras_export('keras.applications.imagenet_utils.preprocess_input')
 @keras_modules_injection
 def preprocess_input(*args, **kwargs):
   return imagenet_utils.preprocess_input(*args, **kwargs)
diff --git a/tensorflow/python/keras/applications/inception_resnet_v2.py b/tensorflow/python/keras/applications/inception_resnet_v2.py
index 0b9ef371fa593381476a4f3c97f57366bef4cb30..0203cf6ad9022a5a41a8e66da37a7a794a7edf3a 100644
--- a/tensorflow/python/keras/applications/inception_resnet_v2.py
+++ b/tensorflow/python/keras/applications/inception_resnet_v2.py
@@ -22,23 +22,23 @@ from __future__ import print_function
 from keras_applications import inception_resnet_v2
 
 from tensorflow.python.keras.applications import keras_modules_injection
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 
-@tf_export('keras.applications.inception_resnet_v2.InceptionResNetV2',
-           'keras.applications.InceptionResNetV2')
+@keras_export('keras.applications.inception_resnet_v2.InceptionResNetV2',
+              'keras.applications.InceptionResNetV2')
 @keras_modules_injection
 def InceptionResNetV2(*args, **kwargs):
   return inception_resnet_v2.InceptionResNetV2(*args, **kwargs)
 
 
-@tf_export('keras.applications.inception_resnet_v2.decode_predictions')
+@keras_export('keras.applications.inception_resnet_v2.decode_predictions')
 @keras_modules_injection
 def decode_predictions(*args, **kwargs):
   return inception_resnet_v2.decode_predictions(*args, **kwargs)
 
 
-@tf_export('keras.applications.inception_resnet_v2.preprocess_input')
+@keras_export('keras.applications.inception_resnet_v2.preprocess_input')
 @keras_modules_injection
 def preprocess_input(*args, **kwargs):
   return inception_resnet_v2.preprocess_input(*args, **kwargs)
diff --git a/tensorflow/python/keras/applications/inception_v3.py b/tensorflow/python/keras/applications/inception_v3.py
index ab76826e17d2d4ec36433ba1a91de82e1dd17f63..08bf3f3f2b26862b424fac9ecb00680c8196695a 100644
--- a/tensorflow/python/keras/applications/inception_v3.py
+++ b/tensorflow/python/keras/applications/inception_v3.py
@@ -22,23 +22,23 @@ from __future__ import print_function
 from keras_applications import inception_v3
 
 from tensorflow.python.keras.applications import keras_modules_injection
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 
-@tf_export('keras.applications.inception_v3.InceptionV3',
-           'keras.applications.InceptionV3')
+@keras_export('keras.applications.inception_v3.InceptionV3',
+              'keras.applications.InceptionV3')
 @keras_modules_injection
 def InceptionV3(*args, **kwargs):
   return inception_v3.InceptionV3(*args, **kwargs)
 
 
-@tf_export('keras.applications.inception_v3.decode_predictions')
+@keras_export('keras.applications.inception_v3.decode_predictions')
 @keras_modules_injection
 def decode_predictions(*args, **kwargs):
   return inception_v3.decode_predictions(*args, **kwargs)
 
 
-@tf_export('keras.applications.inception_v3.preprocess_input')
+@keras_export('keras.applications.inception_v3.preprocess_input')
 @keras_modules_injection
 def preprocess_input(*args, **kwargs):
   return inception_v3.preprocess_input(*args, **kwargs)
diff --git a/tensorflow/python/keras/applications/mobilenet.py b/tensorflow/python/keras/applications/mobilenet.py
index 1f71a5ae993e841d1ee1f835b2dea2951011c558..d40e4a7614709e59348b3eb207a4a9a7aa9c4d4e 100644
--- a/tensorflow/python/keras/applications/mobilenet.py
+++ b/tensorflow/python/keras/applications/mobilenet.py
@@ -22,23 +22,23 @@ from __future__ import print_function
 from keras_applications import mobilenet
 
 from tensorflow.python.keras.applications import keras_modules_injection
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 
-@tf_export('keras.applications.mobilenet.MobileNet',
-           'keras.applications.MobileNet')
+@keras_export('keras.applications.mobilenet.MobileNet',
+              'keras.applications.MobileNet')
 @keras_modules_injection
 def MobileNet(*args, **kwargs):
   return mobilenet.MobileNet(*args, **kwargs)
 
 
-@tf_export('keras.applications.mobilenet.decode_predictions')
+@keras_export('keras.applications.mobilenet.decode_predictions')
 @keras_modules_injection
 def decode_predictions(*args, **kwargs):
   return mobilenet.decode_predictions(*args, **kwargs)
 
 
-@tf_export('keras.applications.mobilenet.preprocess_input')
+@keras_export('keras.applications.mobilenet.preprocess_input')
 @keras_modules_injection
 def preprocess_input(*args, **kwargs):
   return mobilenet.preprocess_input(*args, **kwargs)
diff --git a/tensorflow/python/keras/applications/mobilenet_v2.py b/tensorflow/python/keras/applications/mobilenet_v2.py
index 52ac5959adbce2a9d5b2c20f9eb265aa783a8ba5..696844067ef22865c0a9aa1ebbdb7e022c7970f2 100644
--- a/tensorflow/python/keras/applications/mobilenet_v2.py
+++ b/tensorflow/python/keras/applications/mobilenet_v2.py
@@ -22,23 +22,23 @@ from __future__ import print_function
 from keras_applications import mobilenet_v2
 
 from tensorflow.python.keras.applications import keras_modules_injection
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 
-@tf_export('keras.applications.mobilenet_v2.MobileNetV2',
-           'keras.applications.MobileNetV2')
+@keras_export('keras.applications.mobilenet_v2.MobileNetV2',
+              'keras.applications.MobileNetV2')
 @keras_modules_injection
 def MobileNetV2(*args, **kwargs):
   return mobilenet_v2.MobileNetV2(*args, **kwargs)
 
 
-@tf_export('keras.applications.mobilenet_v2.decode_predictions')
+@keras_export('keras.applications.mobilenet_v2.decode_predictions')
 @keras_modules_injection
 def decode_predictions(*args, **kwargs):
   return mobilenet_v2.decode_predictions(*args, **kwargs)
 
 
-@tf_export('keras.applications.mobilenet_v2.preprocess_input')
+@keras_export('keras.applications.mobilenet_v2.preprocess_input')
 @keras_modules_injection
 def preprocess_input(*args, **kwargs):
   return mobilenet_v2.preprocess_input(*args, **kwargs)
diff --git a/tensorflow/python/keras/applications/nasnet.py b/tensorflow/python/keras/applications/nasnet.py
index 44fc329d577bce5394dde0fe56beccf69e5e61a3..90c4fb23bb892b68e00408c643f817791460527f 100644
--- a/tensorflow/python/keras/applications/nasnet.py
+++ b/tensorflow/python/keras/applications/nasnet.py
@@ -22,30 +22,30 @@ from __future__ import print_function
 from keras_applications import nasnet
 
 from tensorflow.python.keras.applications import keras_modules_injection
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 
-@tf_export('keras.applications.nasnet.NASNetMobile',
-           'keras.applications.NASNetMobile')
+@keras_export('keras.applications.nasnet.NASNetMobile',
+              'keras.applications.NASNetMobile')
 @keras_modules_injection
 def NASNetMobile(*args, **kwargs):
   return nasnet.NASNetMobile(*args, **kwargs)
 
 
-@tf_export('keras.applications.nasnet.NASNetLarge',
-           'keras.applications.NASNetLarge')
+@keras_export('keras.applications.nasnet.NASNetLarge',
+              'keras.applications.NASNetLarge')
 @keras_modules_injection
 def NASNetLarge(*args, **kwargs):
   return nasnet.NASNetLarge(*args, **kwargs)
 
 
-@tf_export('keras.applications.nasnet.decode_predictions')
+@keras_export('keras.applications.nasnet.decode_predictions')
 @keras_modules_injection
 def decode_predictions(*args, **kwargs):
   return nasnet.decode_predictions(*args, **kwargs)
 
 
-@tf_export('keras.applications.nasnet.preprocess_input')
+@keras_export('keras.applications.nasnet.preprocess_input')
 @keras_modules_injection
 def preprocess_input(*args, **kwargs):
   return nasnet.preprocess_input(*args, **kwargs)
diff --git a/tensorflow/python/keras/applications/resnet50.py b/tensorflow/python/keras/applications/resnet50.py
index 80d3f9044f5f3814bb0d8afe8db3aee63c5cc41f..38f4d8a67a94c99b772babd2302953928d4c7e85 100644
--- a/tensorflow/python/keras/applications/resnet50.py
+++ b/tensorflow/python/keras/applications/resnet50.py
@@ -22,23 +22,23 @@ from __future__ import print_function
 from keras_applications import resnet50
 
 from tensorflow.python.keras.applications import keras_modules_injection
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 
-@tf_export('keras.applications.resnet50.ResNet50',
-           'keras.applications.ResNet50')
+@keras_export('keras.applications.resnet50.ResNet50',
+              'keras.applications.ResNet50')
 @keras_modules_injection
 def ResNet50(*args, **kwargs):
   return resnet50.ResNet50(*args, **kwargs)
 
 
-@tf_export('keras.applications.resnet50.decode_predictions')
+@keras_export('keras.applications.resnet50.decode_predictions')
 @keras_modules_injection
 def decode_predictions(*args, **kwargs):
   return resnet50.decode_predictions(*args, **kwargs)
 
 
-@tf_export('keras.applications.resnet50.preprocess_input')
+@keras_export('keras.applications.resnet50.preprocess_input')
 @keras_modules_injection
 def preprocess_input(*args, **kwargs):
   return resnet50.preprocess_input(*args, **kwargs)
diff --git a/tensorflow/python/keras/applications/vgg16.py b/tensorflow/python/keras/applications/vgg16.py
index 8557d26931f7a13ea1cdae5791dba0399cd151e0..e2a34258caa0a7d12effdc59518b81af870cc34f 100644
--- a/tensorflow/python/keras/applications/vgg16.py
+++ b/tensorflow/python/keras/applications/vgg16.py
@@ -22,23 +22,23 @@ from __future__ import print_function
 from keras_applications import vgg16
 
 from tensorflow.python.keras.applications import keras_modules_injection
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 
-@tf_export('keras.applications.vgg16.VGG16',
-           'keras.applications.VGG16')
+@keras_export('keras.applications.vgg16.VGG16',
+              'keras.applications.VGG16')
 @keras_modules_injection
 def VGG16(*args, **kwargs):
   return vgg16.VGG16(*args, **kwargs)
 
 
-@tf_export('keras.applications.vgg16.decode_predictions')
+@keras_export('keras.applications.vgg16.decode_predictions')
 @keras_modules_injection
 def decode_predictions(*args, **kwargs):
   return vgg16.decode_predictions(*args, **kwargs)
 
 
-@tf_export('keras.applications.vgg16.preprocess_input')
+@keras_export('keras.applications.vgg16.preprocess_input')
 @keras_modules_injection
 def preprocess_input(*args, **kwargs):
   return vgg16.preprocess_input(*args, **kwargs)
diff --git a/tensorflow/python/keras/applications/vgg19.py b/tensorflow/python/keras/applications/vgg19.py
index 8fc04413a0299156ffcb223577339c3470ea717e..ed362edfa8206ba4b86e1470bf197d9b83f9ce30 100644
--- a/tensorflow/python/keras/applications/vgg19.py
+++ b/tensorflow/python/keras/applications/vgg19.py
@@ -22,23 +22,23 @@ from __future__ import print_function
 from keras_applications import vgg19
 
 from tensorflow.python.keras.applications import keras_modules_injection
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 
-@tf_export('keras.applications.vgg19.VGG19',
-           'keras.applications.VGG19')
+@keras_export('keras.applications.vgg19.VGG19',
+              'keras.applications.VGG19')
 @keras_modules_injection
 def VGG19(*args, **kwargs):
   return vgg19.VGG19(*args, **kwargs)
 
 
-@tf_export('keras.applications.vgg19.decode_predictions')
+@keras_export('keras.applications.vgg19.decode_predictions')
 @keras_modules_injection
 def decode_predictions(*args, **kwargs):
   return vgg19.decode_predictions(*args, **kwargs)
 
 
-@tf_export('keras.applications.vgg19.preprocess_input')
+@keras_export('keras.applications.vgg19.preprocess_input')
 @keras_modules_injection
 def preprocess_input(*args, **kwargs):
   return vgg19.preprocess_input(*args, **kwargs)
diff --git a/tensorflow/python/keras/applications/xception.py b/tensorflow/python/keras/applications/xception.py
index 960e6dec6943fcf94d91e70c161b88fedf20ed76..4476213f6d4971a4edc0b98cbbd44ad54c2b89c7 100644
--- a/tensorflow/python/keras/applications/xception.py
+++ b/tensorflow/python/keras/applications/xception.py
@@ -22,23 +22,23 @@ from __future__ import print_function
 from keras_applications import xception
 
 from tensorflow.python.keras.applications import keras_modules_injection
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 
-@tf_export('keras.applications.xception.Xception',
-           'keras.applications.Xception')
+@keras_export('keras.applications.xception.Xception',
+              'keras.applications.Xception')
 @keras_modules_injection
 def Xception(*args, **kwargs):
   return xception.Xception(*args, **kwargs)
 
 
-@tf_export('keras.applications.xception.decode_predictions')
+@keras_export('keras.applications.xception.decode_predictions')
 @keras_modules_injection
 def decode_predictions(*args, **kwargs):
   return xception.decode_predictions(*args, **kwargs)
 
 
-@tf_export('keras.applications.xception.preprocess_input')
+@keras_export('keras.applications.xception.preprocess_input')
 @keras_modules_injection
 def preprocess_input(*args, **kwargs):
   return xception.preprocess_input(*args, **kwargs)
diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py
index 420c457a0ca2c74c5a0148a98e281b4663ab3226..0a8c5bb19f493d257ad453b8b265aaba2aac3a7a 100644
--- a/tensorflow/python/keras/backend.py
+++ b/tensorflow/python/keras/backend.py
@@ -32,14 +32,19 @@ import numpy as np
 
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session as session_module
+from tensorflow.python.distribute import distribute_coordinator as dc
+from tensorflow.python.distribute import distribute_coordinator_context as dc_context
+from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function as eager_function
+from tensorflow.python.eager import lift_to_graph
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes as dtypes_module
 from tensorflow.python.framework import func_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_util
+from tensorflow.python.keras import backend_config
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
@@ -50,6 +55,7 @@ from tensorflow.python.ops import image_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import logging_ops
+from tensorflow.python.ops import map_fn as map_fn_lib
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import random_ops
@@ -59,11 +65,11 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import tensor_array_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variables as variables_module
-
+from tensorflow.python.training import server_lib
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util import tf_inspect
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 py_all = all
 py_sum = sum
@@ -74,6 +80,9 @@ py_sum = sum
 # while executing eagerly (such as the functional API for model-building).
 _GRAPH = None
 
+# A graph which is used for constructing functions in eager mode.
+_CURRENT_SCRATCH_GRAPH = None
+
 # This is a thread local object that will hold the default internal TF session
 # used by Keras. It can be set manually via `set_session(sess)`.
 _SESSION = threading.local()
@@ -86,26 +95,14 @@ _GRAPH_LEARNING_PHASES = weakref.WeakKeyDictionary()
 
 # _DUMMY_EAGER_GRAPH is used as a key in _GRAPH_LEARNING_PHASES.
 # We keep a separate reference to it to make sure it does not get removed from
-# _GRAPH_LEARNING_PHASES. We use a dummy class instead of something like a
-# string because strings are not weakly-referencable.
-class _DummyEagerGraph(object):
-  pass
-_DUMMY_EAGER_GRAPH = _DummyEagerGraph()
+# _GRAPH_LEARNING_PHASES.
+_DUMMY_EAGER_GRAPH = threading.local()
 
 # This boolean flag can be set to True to leave variable initialization
 # up to the user.
 # Change its value via `manual_variable_initialization(value)`.
 _MANUAL_VAR_INIT = False
 
-# The type of float to use throughout a session.
-_FLOATX = 'float32'
-
-# Epsilon fuzz factor used throughout the codebase.
-_EPSILON = 1e-7
-
-# Default image data format, one of "channels_last", "channels_first".
-_IMAGE_DATA_FORMAT = 'channels_last'
-
 # This list holds the available devices.
 # It is populated when `_get_available_gpus()` is called for the first time.
 # We assume our devices don't change henceforth.
@@ -119,8 +116,16 @@ _GRAPH_VARIABLES = weakref.WeakKeyDictionary()
 # the graph.
 _GRAPH_TF_OPTIMIZERS = weakref.WeakKeyDictionary()
 
+# The below functions are kept accessible from backend for compatibility.
+epsilon = backend_config.epsilon
+floatx = backend_config.floatx
+image_data_format = backend_config.image_data_format
+set_epsilon = backend_config.set_epsilon
+set_floatx = backend_config.set_floatx
+set_image_data_format = backend_config.set_image_data_format
+
 
-@tf_export('keras.backend.backend')
+@keras_export('keras.backend.backend')
 def backend():
   """Publicly accessible method for determining the current backend.
 
@@ -132,88 +137,7 @@ def backend():
   return 'tensorflow'
 
 
-@tf_export('keras.backend.epsilon')
-def epsilon():
-  """Returns the value of the fuzz factor used in numeric expressions.
-
-  Returns:
-      A float.
-
-  Example:
-  ```python
-      >>> keras.backend.epsilon()
-      1e-07
-  ```
-  """
-  return _EPSILON
-
-
-@tf_export('keras.backend.set_epsilon')
-def set_epsilon(value):
-  """Sets the value of the fuzz factor used in numeric expressions.
-
-  Arguments:
-      value: float. New value of epsilon.
-
-  Example:
-  ```python
-      >>> from keras import backend as K
-      >>> K.epsilon()
-      1e-07
-      >>> K.set_epsilon(1e-05)
-      >>> K.epsilon()
-      1e-05
-  ```
-  """
-  global _EPSILON
-  _EPSILON = value
-
-
-@tf_export('keras.backend.floatx')
-def floatx():
-  """Returns the default float type, as a string.
-
-  E.g. 'float16', 'float32', 'float64'.
-
-  Returns:
-      String, the current default float type.
-
-  Example:
-  ```python
-      >>> keras.backend.floatx()
-      'float32'
-  ```
-  """
-  return _FLOATX
-
-
-@tf_export('keras.backend.set_floatx')
-def set_floatx(value):
-  """Sets the default float type.
-
-  Arguments:
-      value: String; 'float16', 'float32', or 'float64'.
-
-  Example:
-  ```python
-      >>> from keras import backend as K
-      >>> K.floatx()
-      'float32'
-      >>> K.set_floatx('float16')
-      >>> K.floatx()
-      'float16'
-  ```
-
-  Raises:
-      ValueError: In case of invalid value.
-  """
-  global _FLOATX
-  if value not in {'float16', 'float32', 'float64'}:
-    raise ValueError('Unknown floatx type: ' + str(value))
-  _FLOATX = str(value)
-
-
-@tf_export('keras.backend.cast_to_floatx')
+@keras_export('keras.backend.cast_to_floatx')
 def cast_to_floatx(x):
   """Cast a Numpy array to the default Keras float type.
 
@@ -238,49 +162,7 @@ def cast_to_floatx(x):
       dtype('float32')
   ```
   """
-  return np.asarray(x, dtype=_FLOATX)
-
-
-@tf_export('keras.backend.image_data_format')
-def image_data_format():
-  """Returns the default image data format convention.
-
-  Returns:
-      A string, either `'channels_first'` or `'channels_last'`
-
-  Example:
-  ```python
-      >>> keras.backend.image_data_format()
-      'channels_first'
-  ```
-  """
-  return _IMAGE_DATA_FORMAT
-
-
-@tf_export('keras.backend.set_image_data_format')
-def set_image_data_format(data_format):
-  """Sets the value of the image data format convention.
-
-  Arguments:
-      data_format: string. `'channels_first'` or `'channels_last'`.
-
-  Example:
-  ```python
-      >>> from keras import backend as K
-      >>> K.image_data_format()
-      'channels_first'
-      >>> K.set_image_data_format('channels_last')
-      >>> K.image_data_format()
-      'channels_last'
-  ```
-
-  Raises:
-      ValueError: In case of invalid `data_format` value.
-  """
-  global _IMAGE_DATA_FORMAT
-  if data_format not in {'channels_last', 'channels_first'}:
-    raise ValueError('Unknown data_format: ' + str(data_format))
-  _IMAGE_DATA_FORMAT = str(data_format)
+  return np.asarray(x, dtype=floatx())
 
 
 # A global dictionary mapping graph objects to an index of counters used
@@ -289,7 +171,7 @@ def set_image_data_format(data_format):
 PER_GRAPH_LAYER_NAME_UIDS = weakref.WeakKeyDictionary()
 
 
-@tf_export('keras.backend.get_uid')
+@keras_export('keras.backend.get_uid')
 def get_uid(prefix=''):
   """Associates a string prefix with an integer counter in a TensorFlow graph.
 
@@ -316,7 +198,7 @@ def get_uid(prefix=''):
   return layer_name_uids[prefix]
 
 
-@tf_export('keras.backend.reset_uids')
+@keras_export('keras.backend.reset_uids')
 def reset_uids():
   """Resets graph identifiers.
   """
@@ -326,7 +208,7 @@ def reset_uids():
     del per_graph_layer_name_uids[key]
 
 
-@tf_export('keras.backend.clear_session')
+@keras_export('keras.backend.clear_session')
 def clear_session():
   """Destroys the current TF graph and creates a new one.
 
@@ -341,15 +223,16 @@ def clear_session():
   _SESSION.session = None
   graph = get_graph()
   with graph.as_default():
-    phase = array_ops.placeholder_with_default(
-        False, shape=(), name='keras_learning_phase')
+    with ops.name_scope(''):
+      phase = array_ops.placeholder_with_default(
+          False, shape=(), name='keras_learning_phase')
     _GRAPH_LEARNING_PHASES = {}
     _GRAPH_LEARNING_PHASES[graph] = phase
     _GRAPH_VARIABLES.pop(graph, None)
     _GRAPH_TF_OPTIMIZERS.pop(graph, None)
 
 
-@tf_export('keras.backend.manual_variable_initialization')
+@keras_export('keras.backend.manual_variable_initialization')
 def manual_variable_initialization(value):
   """Sets the manual variable initialization flag.
 
@@ -366,7 +249,7 @@ def manual_variable_initialization(value):
   _MANUAL_VAR_INIT = value
 
 
-@tf_export('keras.backend.learning_phase')
+@keras_export('keras.backend.learning_phase')
 def learning_phase():
   """Returns the learning phase flag.
 
@@ -377,25 +260,38 @@ def learning_phase():
   Returns:
       Learning phase (scalar integer tensor or Python integer).
   """
-  if context.executing_eagerly():
-    if _DUMMY_EAGER_GRAPH not in _GRAPH_LEARNING_PHASES:
-      # Fallback to inference mode as default.
-      return 0
-    return _GRAPH_LEARNING_PHASES[_DUMMY_EAGER_GRAPH]
-  return symbolic_learning_phase()
+  if ops.get_default_graph() is _GRAPH:
+    # Don't enter an init_scope for the learning phase if eager execution
+    # is enabled but we're inside the Keras workspace graph.
+    return symbolic_learning_phase()
+  with ops.init_scope():
+    # We always check & set the learning phase inside the init_scope,
+    # otherwise the wrong default_graph will be used to look up the learning
+    # phase inside of functions & defuns.
+    #
+    # This is because functions & defuns (both in graph & in eager mode)
+    # will always execute non-eagerly using a function-specific default
+    # subgraph.
+    if context.executing_eagerly():
+      if _DUMMY_EAGER_GRAPH not in _GRAPH_LEARNING_PHASES:
+        # Fallback to inference mode as default.
+        return 0
+      return _GRAPH_LEARNING_PHASES[_DUMMY_EAGER_GRAPH]
+    return symbolic_learning_phase()
 
 
 def symbolic_learning_phase():
   graph = get_graph()
   with graph.as_default():
     if graph not in _GRAPH_LEARNING_PHASES:
-      phase = array_ops.placeholder_with_default(
-          False, shape=(), name='keras_learning_phase')
+      with ops.name_scope(''):
+        phase = array_ops.placeholder_with_default(
+            False, shape=(), name='keras_learning_phase')
       _GRAPH_LEARNING_PHASES[graph] = phase
     return _GRAPH_LEARNING_PHASES[graph]
 
 
-@tf_export('keras.backend.set_learning_phase')
+@keras_export('keras.backend.set_learning_phase')
 def set_learning_phase(value):
   """Sets the learning phase to a fixed value.
 
@@ -410,11 +306,25 @@ def set_learning_phase(value):
     raise ValueError('Expected learning phase to be 0 or 1.')
   with ops.init_scope():
     if context.executing_eagerly():
+      # In an eager context, the learning phase values applies to both the eager
+      # context and the internal Keras graph.
       _GRAPH_LEARNING_PHASES[_DUMMY_EAGER_GRAPH] = value
-    else:
-      _GRAPH_LEARNING_PHASES[get_graph()] = value
+    _GRAPH_LEARNING_PHASES[get_graph()] = value
 
 
+def set_eager_learning_phase(value):
+  """Internal utility that sets the learning phase in eager execution only.
+
+  Arguments:
+      value: Learning phase value, either 0 or 1 (integers).
+  """
+  global _GRAPH_LEARNING_PHASES  # pylint: disable=global-variable-not-assigned
+  assert value in {0, 1}
+  assert context.executing_eagerly()
+  _GRAPH_LEARNING_PHASES[_DUMMY_EAGER_GRAPH] = value
+
+
+@keras_export('keras.backend.learning_phase_scope')
 @tf_contextlib.contextmanager
 def learning_phase_scope(value):
   """Provides a scope within which the learning phase is equal to `value`.
@@ -425,47 +335,102 @@ def learning_phase_scope(value):
      value: Learning phase value, either 0 or 1 (integers).
 
   Yields:
-    The provided value.
+    None.
 
   Raises:
      ValueError: if `value` is neither `0` nor `1`.
   """
+  global _GRAPH_LEARNING_PHASES  # pylint: disable=global-variable-not-assigned
   if value not in {0, 1}:
     raise ValueError('Expected learning phase to be 0 or 1.')
-  previous_value = learning_phase()
+
+  with ops.init_scope():
+    if context.executing_eagerly():
+      previous_eager_value = _GRAPH_LEARNING_PHASES.get(
+          _DUMMY_EAGER_GRAPH, None)
+    previous_graph_value = _GRAPH_LEARNING_PHASES.get(get_graph(), None)
+
   try:
     set_learning_phase(value)
-    yield value
+    yield
   finally:
     # Restore learning phase to initial value.
     with ops.init_scope():
       if context.executing_eagerly():
-        _GRAPH_LEARNING_PHASES[_DUMMY_EAGER_GRAPH] = previous_value
-      else:
-        _GRAPH_LEARNING_PHASES[get_graph()] = previous_value
+        if previous_eager_value is not None:
+          _GRAPH_LEARNING_PHASES[_DUMMY_EAGER_GRAPH] = previous_eager_value
+        elif _DUMMY_EAGER_GRAPH in _GRAPH_LEARNING_PHASES:
+          del _GRAPH_LEARNING_PHASES[_DUMMY_EAGER_GRAPH]
+
+      graph = get_graph()
+      if previous_graph_value is not None:
+        _GRAPH_LEARNING_PHASES[graph] = previous_graph_value
+      elif graph in _GRAPH_LEARNING_PHASES:
+        del _GRAPH_LEARNING_PHASES[graph]
+
+@tf_contextlib.contextmanager
+def eager_learning_phase_scope(value):
+  """Internal scope that sets the learning phase in eager execution only.
+
+  Arguments:
+      value: Learning phase value, either 0 or 1 (integers).
+
+  Yields:
+    None.
+
+  Raises:
+     ValueError: if `value` is neither `0` nor `1`.
+  """
+  global _GRAPH_LEARNING_PHASES  # pylint: disable=global-variable-not-assigned
+  assert value in {0, 1}
+  assert context.executing_eagerly()
+  previous_value = learning_phase()
+  try:
+    _GRAPH_LEARNING_PHASES[_DUMMY_EAGER_GRAPH] = value
+    yield
+  finally:
+    # Restore learning phase to initial value.
+    _GRAPH_LEARNING_PHASES[_DUMMY_EAGER_GRAPH] = previous_value
 
 
-def _get_session():
+def _current_graph(op_input_list):
+  """Return the graph members of `op_input_list`, or the current graph."""
+  return ops._get_graph_from_inputs(op_input_list)
+
+
+def _get_session(op_input_list=()):
   """Returns the session object for the current thread."""
   global _SESSION
   default_session = ops.get_default_session()
   if default_session is not None:
     session = default_session
   else:
-    if getattr(_SESSION, 'session', None) is None:
-      _SESSION.session = session_module.Session(
-          config=get_default_session_config())
+    if ops.inside_function():
+      raise RuntimeError('Cannot get session inside Tensorflow graph function.')
+    # If we don't have a session, or that session does not match the current
+    # graph, create and cache a new session.
+    if (getattr(_SESSION, 'session', None) is None or
+        _SESSION.session.graph is not _current_graph(op_input_list)):
+      # If we are creating the Session inside a tf.distribute.Strategy scope,
+      # we ask the strategy for the right session options to use.
+      if distribution_strategy_context.has_strategy():
+        configure_and_create_distributed_session(
+            distribution_strategy_context.get_strategy())
+      else:
+        _SESSION.session = session_module.Session(
+            config=get_default_session_config())
     session = _SESSION.session
   return session
 
 
-@tf_export(v1=['keras.backend.get_session'])
-def get_session():
+@keras_export(v1=['keras.backend.get_session'])
+def get_session(op_input_list=()):
   """Returns the TF session to be used by the backend.
 
   If a default TensorFlow session is available, we will return it.
 
-  Else, we will return the global Keras session.
+  Else, we will return the global Keras session assuming it matches
+  the current graph.
 
   If no global Keras session exists at this point:
   we will create a new global session.
@@ -473,10 +438,15 @@ def get_session():
   Note that you can manually set the global session
   via `K.set_session(sess)`.
 
+  Arguments:
+      op_input_list: An option sequence of tensors or ops, which will be used
+        to determine the current graph. Otherwise the default graph will be
+        used.
+
   Returns:
       A TensorFlow session.
   """
-  session = _get_session()
+  session = _get_session(op_input_list)
   if not _MANUAL_VAR_INIT:
     with session.graph.as_default():
       _initialize_variables(session)
@@ -493,7 +463,41 @@ def get_graph():
     return ops.get_default_graph()
 
 
-@tf_export('keras.backend.set_session')
+@tf_contextlib.contextmanager
+def _scratch_graph(graph=None):
+  """Retrieve a shared and temporary func graph.
+
+  The eager execution path lifts a subgraph from the keras global graph into
+  a scratch graph in order to create a function. DistributionStrategies, in
+  turn, constructs multiple functions as well as a final combined function. In
+  order for that logic to work correctly, all of the functions need to be
+  created on the same scratch FuncGraph.
+
+  Args:
+    graph: A graph to be used as the current scratch graph. If not set then
+      a scratch graph will either be retrieved or created:
+
+  Yields:
+    The current scratch graph.
+  """
+  global _CURRENT_SCRATCH_GRAPH
+  if (_CURRENT_SCRATCH_GRAPH is not None and graph is not None and
+      _CURRENT_SCRATCH_GRAPH is not graph):
+    raise ValueError('Multiple scratch graphs specified.')
+
+  if _CURRENT_SCRATCH_GRAPH:
+    yield _CURRENT_SCRATCH_GRAPH
+    return
+
+  graph = graph or func_graph.FuncGraph('keras_scratch_graph')
+  try:
+    _CURRENT_SCRATCH_GRAPH = graph
+    yield graph
+  finally:
+    _CURRENT_SCRATCH_GRAPH = None
+
+
+@keras_export('keras.backend.set_session')
 def set_session(session):
   """Sets the global TensorFlow session.
 
@@ -568,6 +572,10 @@ def _get_available_gpus():
   Returns:
       A list of available GPU devices.
   """
+  if ops.executing_eagerly_outside_functions():
+    # Returns names of devices directly.
+    return [name for name in context.list_devices() if 'GPU' in name]
+
   global _LOCAL_DEVICES
   if _LOCAL_DEVICES is None:
     _LOCAL_DEVICES = get_session().list_devices()
@@ -606,7 +614,7 @@ def _to_tensor(x, dtype):
   return ops.convert_to_tensor(x, dtype=dtype)
 
 
-@tf_export('keras.backend.is_sparse')
+@keras_export('keras.backend.is_sparse')
 def is_sparse(tensor):
   """Returns whether a tensor is a sparse tensor.
 
@@ -630,7 +638,7 @@ def is_sparse(tensor):
   return isinstance(tensor, sparse_tensor.SparseTensor)
 
 
-@tf_export('keras.backend.to_dense')
+@keras_export('keras.backend.to_dense')
 def to_dense(tensor):
   """Converts a sparse tensor into a dense tensor and returns it.
 
@@ -660,7 +668,7 @@ def to_dense(tensor):
 name_scope = ops.name_scope
 
 
-@tf_export('keras.backend.variable')
+@keras_export('keras.backend.variable')
 def variable(value, dtype=None, name=None, constraint=None):
   """Instantiates a variable and returns it.
 
@@ -761,7 +769,7 @@ def _initialize_variables(session):
       session.run(variables_module.variables_initializer(uninitialized_vars))
 
 
-@tf_export('keras.backend.constant')
+@keras_export('keras.backend.constant')
 def constant(value, dtype=None, shape=None, name=None):
   """Creates a constant tensor.
 
@@ -832,7 +840,7 @@ def is_keras_tensor(x):
   return hasattr(x, '_keras_history')
 
 
-@tf_export('keras.backend.placeholder')
+@keras_export('keras.backend.placeholder')
 def placeholder(shape=None, ndim=None, dtype=None, sparse=False, name=None):
   """Instantiates a placeholder tensor and returns it.
 
@@ -888,7 +896,7 @@ def is_placeholder(x):
     return False
 
 
-@tf_export('keras.backend.shape')
+@keras_export('keras.backend.shape')
 def shape(x):
   """Returns the symbolic shape of a tensor or variable.
 
@@ -921,7 +929,7 @@ def shape(x):
   return array_ops.shape(x)
 
 
-@tf_export('keras.backend.int_shape')
+@keras_export('keras.backend.int_shape')
 def int_shape(x):
   """Returns the shape of tensor or variable as a tuple of int or None entries.
 
@@ -952,7 +960,7 @@ def int_shape(x):
     return None
 
 
-@tf_export('keras.backend.ndim')
+@keras_export('keras.backend.ndim')
 def ndim(x):
   """Returns the number of axes in a tensor, as an integer.
 
@@ -980,7 +988,7 @@ def ndim(x):
   return None
 
 
-@tf_export('keras.backend.dtype')
+@keras_export('keras.backend.dtype')
 def dtype(x):
   """Returns the dtype of a Keras tensor or variable, as a string.
 
@@ -1002,16 +1010,16 @@ def dtype(x):
       # Keras variable
       >>> kvar = K.variable(np.array([[1, 2], [3, 4]]))
       >>> K.dtype(kvar)
-      'float32_ref'
+      'float32'
       >>> kvar = K.variable(np.array([[1, 2], [3, 4]]), dtype='float32')
       >>> K.dtype(kvar)
-      'float32_ref'
+      'float32'
   ```
   """
   return x.dtype.base_dtype.name
 
 
-@tf_export('keras.backend.eval')
+@keras_export('keras.backend.eval')
 def eval(x):
   """Evaluates the value of a variable.
 
@@ -1033,7 +1041,7 @@ def eval(x):
   return get_value(to_dense(x))
 
 
-@tf_export('keras.backend.zeros')
+@keras_export('keras.backend.zeros')
 def zeros(shape, dtype=None, name=None):
   """Instantiates an all-zeros variable and returns it.
 
@@ -1068,7 +1076,7 @@ def zeros(shape, dtype=None, name=None):
     return v
 
 
-@tf_export('keras.backend.ones')
+@keras_export('keras.backend.ones')
 def ones(shape, dtype=None, name=None):
   """Instantiates an all-ones variable and returns it.
 
@@ -1103,7 +1111,7 @@ def ones(shape, dtype=None, name=None):
     return v
 
 
-@tf_export('keras.backend.eye')
+@keras_export('keras.backend.eye')
 def eye(size, dtype=None, name=None):
   """Instantiate an identity matrix and returns it.
 
@@ -1132,7 +1140,7 @@ def eye(size, dtype=None, name=None):
   return variable(linalg_ops.eye(size, dtype=tf_dtype), dtype, name)
 
 
-@tf_export('keras.backend.zeros_like')
+@keras_export('keras.backend.zeros_like')
 def zeros_like(x, dtype=None, name=None):
   """Instantiates an all-zeros variable of the same shape as another tensor.
 
@@ -1158,7 +1166,7 @@ def zeros_like(x, dtype=None, name=None):
   return array_ops.zeros_like(x, dtype=dtype, name=name)
 
 
-@tf_export('keras.backend.ones_like')
+@keras_export('keras.backend.ones_like')
 def ones_like(x, dtype=None, name=None):
   """Instantiates an all-ones variable of the same shape as another tensor.
 
@@ -1197,7 +1205,7 @@ def identity(x, name=None):
   return array_ops.identity(x, name=name)
 
 
-@tf_export('keras.backend.random_uniform_variable')
+@keras_export('keras.backend.random_uniform_variable')
 def random_uniform_variable(shape, low, high, dtype=None, name=None, seed=None):
   """Instantiates a variable with values drawn from a uniform distribution.
 
@@ -1234,7 +1242,7 @@ def random_uniform_variable(shape, low, high, dtype=None, name=None, seed=None):
   return variable(value, dtype=dtype, name=name)
 
 
-@tf_export('keras.backend.random_normal_variable')
+@keras_export('keras.backend.random_normal_variable')
 def random_normal_variable(shape, mean, scale, dtype=None, name=None,
                            seed=None):
   """Instantiates a variable with values drawn from a normal distribution.
@@ -1272,7 +1280,7 @@ def random_normal_variable(shape, mean, scale, dtype=None, name=None,
   return variable(value, dtype=dtype, name=name)
 
 
-@tf_export('keras.backend.count_params')
+@keras_export('keras.backend.count_params')
 def count_params(x):
   """Returns the static number of elements in a variable or tensor.
 
@@ -1295,7 +1303,7 @@ def count_params(x):
   return np.prod(x.shape.as_list())
 
 
-@tf_export('keras.backend.cast')
+@keras_export('keras.backend.cast')
 def cast(x, dtype):
   """Casts a tensor to a different dtype and returns it.
 
@@ -1331,12 +1339,12 @@ def cast(x, dtype):
 # UPDATES OPS
 
 
-@tf_export('keras.backend.update')
+@keras_export('keras.backend.update')
 def update(x, new_x):
   return state_ops.assign(x, new_x)
 
 
-@tf_export('keras.backend.update_add')
+@keras_export('keras.backend.update_add')
 def update_add(x, increment):
   """Update the value of `x` by adding `increment`.
 
@@ -1350,7 +1358,7 @@ def update_add(x, increment):
   return state_ops.assign_add(x, increment)
 
 
-@tf_export('keras.backend.update_sub')
+@keras_export('keras.backend.update_sub')
 def update_sub(x, decrement):
   """Update the value of `x` by subtracting `decrement`.
 
@@ -1364,7 +1372,7 @@ def update_sub(x, decrement):
   return state_ops.assign_sub(x, decrement)
 
 
-@tf_export('keras.backend.moving_average_update')
+@keras_export('keras.backend.moving_average_update')
 def moving_average_update(x, value, momentum):
   """Compute the moving average of a variable.
 
@@ -1388,7 +1396,7 @@ def moving_average_update(x, value, momentum):
 # LINEAR ALGEBRA
 
 
-@tf_export('keras.backend.dot')
+@keras_export('keras.backend.dot')
 def dot(x, y):
   """Multiplies 2 tensors (and/or variables) and returns a *tensor*.
 
@@ -1460,7 +1468,7 @@ def dot(x, y):
   return out
 
 
-@tf_export('keras.backend.batch_dot')
+@keras_export('keras.backend.batch_dot')
 def batch_dot(x, y, axes=None):
   """Batchwise dot product.
 
@@ -1552,7 +1560,7 @@ def batch_dot(x, y, axes=None):
   return out
 
 
-@tf_export('keras.backend.transpose')
+@keras_export('keras.backend.transpose')
 def transpose(x):
   """Transposes a tensor and returns it.
 
@@ -1588,7 +1596,7 @@ def transpose(x):
   return array_ops.transpose(x)
 
 
-@tf_export('keras.backend.gather')
+@keras_export('keras.backend.gather')
 def gather(reference, indices):
   """Retrieves the elements of indices `indices` in the tensor `reference`.
 
@@ -1605,7 +1613,7 @@ def gather(reference, indices):
 # ELEMENT-WISE OPERATIONS
 
 
-@tf_export('keras.backend.max')
+@keras_export('keras.backend.max')
 def max(x, axis=None, keepdims=False):
   """Maximum value in a tensor.
 
@@ -1623,7 +1631,7 @@ def max(x, axis=None, keepdims=False):
   return math_ops.reduce_max(x, axis, keepdims)
 
 
-@tf_export('keras.backend.min')
+@keras_export('keras.backend.min')
 def min(x, axis=None, keepdims=False):
   """Minimum value in a tensor.
 
@@ -1636,12 +1644,12 @@ def min(x, axis=None, keepdims=False):
           the reduced dimension is retained with length 1.
 
   Returns:
-      A tensor with miminum values of `x`.
+      A tensor with minimum values of `x`.
   """
   return math_ops.reduce_min(x, axis, keepdims)
 
 
-@tf_export('keras.backend.sum')
+@keras_export('keras.backend.sum')
 def sum(x, axis=None, keepdims=False):
   """Sum of the values in a tensor, alongside the specified axis.
 
@@ -1659,7 +1667,7 @@ def sum(x, axis=None, keepdims=False):
   return math_ops.reduce_sum(x, axis, keepdims)
 
 
-@tf_export('keras.backend.prod')
+@keras_export('keras.backend.prod')
 def prod(x, axis=None, keepdims=False):
   """Multiplies the values in a tensor, alongside the specified axis.
 
@@ -1677,6 +1685,7 @@ def prod(x, axis=None, keepdims=False):
   return math_ops.reduce_prod(x, axis, keepdims)
 
 
+@keras_export('keras.backend.cumsum')
 def cumsum(x, axis=0):
   """Cumulative sum of the values in a tensor, alongside the specified axis.
 
@@ -1690,6 +1699,7 @@ def cumsum(x, axis=0):
   return math_ops.cumsum(x, axis=axis)
 
 
+@keras_export('keras.backend.cumprod')
 def cumprod(x, axis=0):
   """Cumulative product of the values in a tensor, alongside the specified axis.
 
@@ -1703,7 +1713,7 @@ def cumprod(x, axis=0):
   return math_ops.cumprod(x, axis=axis)
 
 
-@tf_export('keras.backend.var')
+@keras_export('keras.backend.var')
 def var(x, axis=None, keepdims=False):
   """Variance of a tensor, alongside the specified axis.
 
@@ -1723,7 +1733,7 @@ def var(x, axis=None, keepdims=False):
   return math_ops.reduce_variance(x, axis=axis, keepdims=keepdims)
 
 
-@tf_export('keras.backend.std')
+@keras_export('keras.backend.std')
 def std(x, axis=None, keepdims=False):
   """Standard deviation of a tensor, alongside the specified axis.
 
@@ -1743,7 +1753,7 @@ def std(x, axis=None, keepdims=False):
   return math_ops.reduce_std(x, axis=axis, keepdims=keepdims)
 
 
-@tf_export('keras.backend.mean')
+@keras_export('keras.backend.mean')
 def mean(x, axis=None, keepdims=False):
   """Mean of a tensor, alongside the specified axis.
 
@@ -1763,7 +1773,7 @@ def mean(x, axis=None, keepdims=False):
   return math_ops.reduce_mean(x, axis, keepdims)
 
 
-@tf_export('keras.backend.any')
+@keras_export('keras.backend.any')
 def any(x, axis=None, keepdims=False):
   """Bitwise reduction (logical OR).
 
@@ -1779,7 +1789,7 @@ def any(x, axis=None, keepdims=False):
   return math_ops.reduce_any(x, axis, keepdims)
 
 
-@tf_export('keras.backend.all')
+@keras_export('keras.backend.all')
 def all(x, axis=None, keepdims=False):
   """Bitwise reduction (logical AND).
 
@@ -1795,7 +1805,7 @@ def all(x, axis=None, keepdims=False):
   return math_ops.reduce_all(x, axis, keepdims)
 
 
-@tf_export('keras.backend.argmax')
+@keras_export('keras.backend.argmax')
 def argmax(x, axis=-1):
   """Returns the index of the maximum value along an axis.
 
@@ -1809,7 +1819,7 @@ def argmax(x, axis=-1):
   return math_ops.argmax(x, axis)
 
 
-@tf_export('keras.backend.argmin')
+@keras_export('keras.backend.argmin')
 def argmin(x, axis=-1):
   """Returns the index of the minimum value along an axis.
 
@@ -1823,7 +1833,7 @@ def argmin(x, axis=-1):
   return math_ops.argmin(x, axis)
 
 
-@tf_export('keras.backend.square')
+@keras_export('keras.backend.square')
 def square(x):
   """Element-wise square.
 
@@ -1836,7 +1846,7 @@ def square(x):
   return math_ops.square(x)
 
 
-@tf_export('keras.backend.abs')
+@keras_export('keras.backend.abs')
 def abs(x):
   """Element-wise absolute value.
 
@@ -1849,7 +1859,7 @@ def abs(x):
   return math_ops.abs(x)
 
 
-@tf_export('keras.backend.sqrt')
+@keras_export('keras.backend.sqrt')
 def sqrt(x):
   """Element-wise square root.
 
@@ -1865,7 +1875,7 @@ def sqrt(x):
   return math_ops.sqrt(x)
 
 
-@tf_export('keras.backend.exp')
+@keras_export('keras.backend.exp')
 def exp(x):
   """Element-wise exponential.
 
@@ -1878,7 +1888,7 @@ def exp(x):
   return math_ops.exp(x)
 
 
-@tf_export('keras.backend.log')
+@keras_export('keras.backend.log')
 def log(x):
   """Element-wise log.
 
@@ -1912,7 +1922,7 @@ def logsumexp(x, axis=None, keepdims=False):
   return math_ops.reduce_logsumexp(x, axis, keepdims)
 
 
-@tf_export('keras.backend.round')
+@keras_export('keras.backend.round')
 def round(x):
   """Element-wise rounding to the closest integer.
 
@@ -1927,7 +1937,7 @@ def round(x):
   return math_ops.round(x)
 
 
-@tf_export('keras.backend.sign')
+@keras_export('keras.backend.sign')
 def sign(x):
   """Element-wise sign.
 
@@ -1940,7 +1950,7 @@ def sign(x):
   return math_ops.sign(x)
 
 
-@tf_export('keras.backend.pow')
+@keras_export('keras.backend.pow')
 def pow(x, a):
   """Element-wise exponentiation.
 
@@ -1954,7 +1964,7 @@ def pow(x, a):
   return math_ops.pow(x, a)
 
 
-@tf_export('keras.backend.clip')
+@keras_export('keras.backend.clip')
 def clip(x, min_value, max_value):
   """Element-wise value clipping.
 
@@ -1975,7 +1985,7 @@ def clip(x, min_value, max_value):
   return clip_ops.clip_by_value(x, min_value, max_value)
 
 
-@tf_export('keras.backend.equal')
+@keras_export('keras.backend.equal')
 def equal(x, y):
   """Element-wise equality between two tensors.
 
@@ -1989,7 +1999,7 @@ def equal(x, y):
   return math_ops.equal(x, y)
 
 
-@tf_export('keras.backend.not_equal')
+@keras_export('keras.backend.not_equal')
 def not_equal(x, y):
   """Element-wise inequality between two tensors.
 
@@ -2003,7 +2013,7 @@ def not_equal(x, y):
   return math_ops.not_equal(x, y)
 
 
-@tf_export('keras.backend.greater')
+@keras_export('keras.backend.greater')
 def greater(x, y):
   """Element-wise truth value of (x > y).
 
@@ -2017,7 +2027,7 @@ def greater(x, y):
   return math_ops.greater(x, y)
 
 
-@tf_export('keras.backend.greater_equal')
+@keras_export('keras.backend.greater_equal')
 def greater_equal(x, y):
   """Element-wise truth value of (x >= y).
 
@@ -2031,7 +2041,7 @@ def greater_equal(x, y):
   return math_ops.greater_equal(x, y)
 
 
-@tf_export('keras.backend.less')
+@keras_export('keras.backend.less')
 def less(x, y):
   """Element-wise truth value of (x < y).
 
@@ -2045,7 +2055,7 @@ def less(x, y):
   return math_ops.less(x, y)
 
 
-@tf_export('keras.backend.less_equal')
+@keras_export('keras.backend.less_equal')
 def less_equal(x, y):
   """Element-wise truth value of (x <= y).
 
@@ -2059,7 +2069,7 @@ def less_equal(x, y):
   return math_ops.less_equal(x, y)
 
 
-@tf_export('keras.backend.maximum')
+@keras_export('keras.backend.maximum')
 def maximum(x, y):
   """Element-wise maximum of two tensors.
 
@@ -2073,7 +2083,7 @@ def maximum(x, y):
   return math_ops.maximum(x, y)
 
 
-@tf_export('keras.backend.minimum')
+@keras_export('keras.backend.minimum')
 def minimum(x, y):
   """Element-wise minimum of two tensors.
 
@@ -2087,7 +2097,7 @@ def minimum(x, y):
   return math_ops.minimum(x, y)
 
 
-@tf_export('keras.backend.sin')
+@keras_export('keras.backend.sin')
 def sin(x):
   """Computes sin of x element-wise.
 
@@ -2100,7 +2110,7 @@ def sin(x):
   return math_ops.sin(x)
 
 
-@tf_export('keras.backend.cos')
+@keras_export('keras.backend.cos')
 def cos(x):
   """Computes cos of x element-wise.
 
@@ -2215,7 +2225,7 @@ def _fused_normalize_batch_in_training(x,
       x, gamma, beta, epsilon=epsilon, data_format=tf_data_format)
 
 
-@tf_export('keras.backend.normalize_batch_in_training')
+@keras_export('keras.backend.normalize_batch_in_training')
 def normalize_batch_in_training(x, gamma, beta, reduction_axes, epsilon=1e-3):
   """Computes mean and std for batch then apply batch_normalization on batch.
 
@@ -2245,7 +2255,7 @@ def normalize_batch_in_training(x, gamma, beta, reduction_axes, epsilon=1e-3):
           x, gamma, beta, reduction_axes, epsilon=epsilon)
 
 
-@tf_export('keras.backend.batch_normalization')
+@keras_export('keras.backend.batch_normalization')
 def batch_normalization(x, mean, var, beta, gamma, axis=-1, epsilon=1e-3):
   """Applies batch normalization on x given mean, var, beta and gamma.
 
@@ -2307,7 +2317,7 @@ def batch_normalization(x, mean, var, beta, gamma, axis=-1, epsilon=1e-3):
 # SHAPE OPERATIONS
 
 
-@tf_export('keras.backend.concatenate')
+@keras_export('keras.backend.concatenate')
 def concatenate(tensors, axis=-1):
   """Concatenates a list of tensors alongside the specified axis.
 
@@ -2331,7 +2341,7 @@ def concatenate(tensors, axis=-1):
     return array_ops.concat([to_dense(x) for x in tensors], axis)
 
 
-@tf_export('keras.backend.reshape')
+@keras_export('keras.backend.reshape')
 def reshape(x, shape):
   """Reshapes a tensor to the specified shape.
 
@@ -2345,7 +2355,7 @@ def reshape(x, shape):
   return array_ops.reshape(x, shape)
 
 
-@tf_export('keras.backend.permute_dimensions')
+@keras_export('keras.backend.permute_dimensions')
 def permute_dimensions(x, pattern):
   """Permutes axes in a tensor.
 
@@ -2360,7 +2370,7 @@ def permute_dimensions(x, pattern):
   return array_ops.transpose(x, perm=pattern)
 
 
-@tf_export('keras.backend.resize_images')
+@keras_export('keras.backend.resize_images')
 def resize_images(x, height_factor, width_factor, data_format,
                   interpolation='nearest'):
   """Resizes the images contained in a 4D tensor.
@@ -2421,7 +2431,7 @@ def resize_images(x, height_factor, width_factor, data_format,
   return x
 
 
-@tf_export('keras.backend.resize_volumes')
+@keras_export('keras.backend.resize_volumes')
 def resize_volumes(x, depth_factor, height_factor, width_factor, data_format):
   """Resizes the volume contained in a 5D tensor.
 
@@ -2453,7 +2463,7 @@ def resize_volumes(x, depth_factor, height_factor, width_factor, data_format):
     raise ValueError('Invalid data_format: ' + str(data_format))
 
 
-@tf_export('keras.backend.repeat_elements')
+@keras_export('keras.backend.repeat_elements')
 def repeat_elements(x, rep, axis):
   """Repeats the elements of a tensor along an axis, like `np.repeat`.
 
@@ -2506,7 +2516,7 @@ def repeat_elements(x, rep, axis):
   return x_rep
 
 
-@tf_export('keras.backend.repeat')
+@keras_export('keras.backend.repeat')
 def repeat(x, n):
   """Repeats a 2D tensor.
 
@@ -2526,7 +2536,7 @@ def repeat(x, n):
   return array_ops.tile(x, pattern)
 
 
-@tf_export('keras.backend.arange')
+@keras_export('keras.backend.arange')
 def arange(start, stop=None, step=1, dtype='int32'):
   """Creates a 1D tensor containing a sequence of integers.
 
@@ -2555,7 +2565,8 @@ def arange(start, stop=None, step=1, dtype='int32'):
     result = cast(result, dtype)
   return result
 
-@tf_export('keras.backend.tile')
+
+@keras_export('keras.backend.tile')
 def tile(x, n):
   """Creates a tensor by tiling `x` by `n`.
 
@@ -2572,7 +2583,7 @@ def tile(x, n):
   return array_ops.tile(x, n)
 
 
-@tf_export('keras.backend.flatten')
+@keras_export('keras.backend.flatten')
 def flatten(x):
   """Flatten a tensor.
 
@@ -2585,7 +2596,7 @@ def flatten(x):
   return array_ops.reshape(x, [-1])
 
 
-@tf_export('keras.backend.batch_flatten')
+@keras_export('keras.backend.batch_flatten')
 def batch_flatten(x):
   """Turn a nD tensor into a 2D tensor with same 0th dimension.
 
@@ -2601,7 +2612,7 @@ def batch_flatten(x):
   return x
 
 
-@tf_export('keras.backend.expand_dims')
+@keras_export('keras.backend.expand_dims')
 def expand_dims(x, axis=-1):
   """Adds a 1-sized dimension at index "axis".
 
@@ -2615,7 +2626,7 @@ def expand_dims(x, axis=-1):
   return array_ops.expand_dims(x, axis)
 
 
-@tf_export('keras.backend.squeeze')
+@keras_export('keras.backend.squeeze')
 def squeeze(x, axis):
   """Removes a 1-dimension from the tensor at index "axis".
 
@@ -2629,7 +2640,7 @@ def squeeze(x, axis):
   return array_ops.squeeze(x, [axis])
 
 
-@tf_export('keras.backend.temporal_padding')
+@keras_export('keras.backend.temporal_padding')
 def temporal_padding(x, padding=(1, 1)):
   """Pads the middle dimension of a 3D tensor.
 
@@ -2646,7 +2657,7 @@ def temporal_padding(x, padding=(1, 1)):
   return array_ops.pad(x, pattern)
 
 
-@tf_export('keras.backend.spatial_2d_padding')
+@keras_export('keras.backend.spatial_2d_padding')
 def spatial_2d_padding(x, padding=((1, 1), (1, 1)), data_format=None):
   """Pads the 2nd and 3rd dimensions of a 4D tensor.
 
@@ -2677,7 +2688,7 @@ def spatial_2d_padding(x, padding=((1, 1), (1, 1)), data_format=None):
   return array_ops.pad(x, pattern)
 
 
-@tf_export('keras.backend.spatial_3d_padding')
+@keras_export('keras.backend.spatial_3d_padding')
 def spatial_3d_padding(x, padding=((1, 1), (1, 1), (1, 1)), data_format=None):
   """Pads 5D tensor with zeros along the depth, height, width dimensions.
 
@@ -2721,7 +2732,7 @@ def spatial_3d_padding(x, padding=((1, 1), (1, 1), (1, 1)), data_format=None):
   return array_ops.pad(x, pattern)
 
 
-@tf_export('keras.backend.stack')
+@keras_export('keras.backend.stack')
 def stack(x, axis=0):
   """Stacks a list of rank `R` tensors into a rank `R+1` tensor.
 
@@ -2735,7 +2746,7 @@ def stack(x, axis=0):
   return array_ops.stack(x, axis=axis)
 
 
-@tf_export('keras.backend.one_hot')
+@keras_export('keras.backend.one_hot')
 def one_hot(indices, num_classes):
   """Computes the one-hot representation of an integer tensor.
 
@@ -2754,7 +2765,7 @@ def one_hot(indices, num_classes):
   return array_ops.one_hot(indices, depth=num_classes, axis=-1)
 
 
-@tf_export('keras.backend.reverse')
+@keras_export('keras.backend.reverse')
 def reverse(x, axes):
   """Reverse a tensor along the specified axes.
 
@@ -2774,7 +2785,7 @@ def reverse(x, axes):
 # VALUE MANIPULATION
 
 
-@tf_export('keras.backend.get_value')
+@keras_export('keras.backend.get_value')
 def get_value(x):
   """Returns the value of a variable.
 
@@ -2789,12 +2800,17 @@ def get_value(x):
   """
   if context.executing_eagerly():
     return x.numpy()
+  elif not getattr(x, '_in_graph_mode', True):
+    # This is a variable which was created in an eager context, but is being
+    # evaluated from a Graph.
+    with context.eager_mode():
+      return x.numpy()
   elif ops.inside_function():
     raise RuntimeError('Cannot get value inside Tensorflow graph function.')
-  return x.eval(session=get_session())
+  return x.eval(session=get_session((x,)))
 
 
-@tf_export('keras.backend.batch_get_value')
+@keras_export('keras.backend.batch_get_value')
 def batch_get_value(tensors):
   """Returns the value of more than one tensor variable.
 
@@ -2812,12 +2828,12 @@ def batch_get_value(tensors):
   elif ops.inside_function():  # pylint: disable=protected-access
     raise RuntimeError('Cannot get value inside Tensorflow graph function.')
   if tensors:
-    return get_session().run(tensors)
+    return get_session(tensors).run(tensors)
   else:
     return []
 
 
-@tf_export('keras.backend.set_value')
+@keras_export('keras.backend.set_value')
 def set_value(x, value):
   """Sets the value of a variable, from a Numpy array.
 
@@ -2843,7 +2859,7 @@ def set_value(x, value):
       get_session().run(assign_op, feed_dict={assign_placeholder: value})
 
 
-@tf_export('keras.backend.batch_set_value')
+@keras_export('keras.backend.batch_set_value')
 def batch_set_value(tuples):
   """Sets the values of many tensor variables at once.
 
@@ -2876,7 +2892,7 @@ def batch_set_value(tuples):
         get_session().run(assign_ops, feed_dict=feed_dict)
 
 
-@tf_export('keras.backend.print_tensor')
+@keras_export('keras.backend.print_tensor')
 def print_tensor(x, message=''):
   """Prints `message` and the tensor value when evaluated.
 
@@ -2926,17 +2942,12 @@ class GraphExecutionFunction(object):
   def __init__(self, inputs, outputs, updates=None, name=None,
                **session_kwargs):
     updates = updates or []
-    if not isinstance(inputs, (list, tuple)):
-      raise TypeError('`inputs` to a Keras backend function '
-                      'should be a list or tuple.')
-    if not isinstance(outputs, (list, tuple)):
-      raise TypeError('`outputs` of a Keras backend function '
-                      'should be a list or tuple.')
     if not isinstance(updates, (list, tuple)):
       raise TypeError('`updates` in a Keras backend function '
                       'should be a list or tuple.')
-    self.inputs = list(inputs)
-    self.outputs = list(outputs)
+    self.inputs = nest.flatten(inputs)
+    self._outputs_structure = outputs
+    self.outputs = cast_variables_to_tensor(nest.flatten(outputs))
     with ops.control_dependencies(self.outputs):
       updates_ops = []
       for update in updates:
@@ -3033,10 +3044,9 @@ class GraphExecutionFunction(object):
         self.fetch_callbacks[fetch](output)
 
   def __call__(self, inputs):
-    if not isinstance(inputs, (list, tuple)):
-      raise TypeError('`inputs` should be a list or tuple.')
+    inputs = nest.flatten(inputs)
 
-    session = get_session()
+    session = get_session(inputs)
     feed_arrays = []
     array_vals = []
     feed_symbols = []
@@ -3077,7 +3087,8 @@ class GraphExecutionFunction(object):
     fetched = self._callable_fn(*array_vals,
                                 run_metadata=self.run_metadata)
     self._call_fetch_callbacks(fetched[-len(self._fetches):])
-    return fetched[:len(self.outputs)]
+    return nest.pack_sequence_as(self._outputs_structure,
+                                 fetched[:len(self.outputs)])
 
 
 class EagerExecutionFunction(object):
@@ -3092,53 +3103,79 @@ class EagerExecutionFunction(object):
   """
 
   def __init__(self, inputs, outputs, updates=None, name=None):
+    self.name = name
+    self._outputs_structure = outputs
+    inputs = nest.flatten(inputs)
+    outputs = nest.flatten(outputs)
+
     updates = updates or []
-    if not isinstance(inputs, (list, tuple)):
-      raise TypeError('`inputs` to a Keras backend function '
-                      'should be a list or tuple.')
-    if not isinstance(outputs, (list, tuple)):
-      raise TypeError('`outputs` of a Keras backend function '
-                      'should be a list or tuple.')
     if not isinstance(updates, (list, tuple)):
       raise TypeError('`updates` in a Keras backend function '
                       'should be a list or tuple.')
-    self.inputs = list(inputs)
-    self.outputs = list(outputs)
-    self.name = name
 
-    graph = get_graph()
+    if updates and not outputs:
+      # Edge case; never happens in practice
+      raise ValueError('Cannot create a Keras backend function with updates'
+                       ' but no outputs during eager execution.')
+
+    graphs = {i.graph for i in nest.flatten([inputs, outputs, updates])
+              if hasattr(i, 'graph')}
+    if len(graphs) > 1:
+      raise ValueError('Cannot create an execution function which is comprised '
+                       'of elements from multiple graphs.')
+
+    source_graph = graphs.pop()
+    global_graph = get_graph()
+
+    updates_ops = []
+    legacy_update_ops = []
+    for update in updates:
+      # For legacy reasons it is allowed to pass an update as a tuple
+      # `(variable, new_value)` (this maps to an assign op). Otherwise it
+      # is assumed to already be an op -- we cannot control its execution
+      # order.
+      if isinstance(update, tuple):
+        legacy_update_ops.append(update)
+      else:
+        if hasattr(update, 'op'):
+          update = update.op
+        updates_ops.append(update)
+
+    with _scratch_graph() as exec_graph:
+      global_graph = get_graph()
+      if source_graph not in (exec_graph, global_graph):
+        raise ValueError('Unknown graph. Aborting.')
+
+      if source_graph is global_graph and exec_graph is not global_graph:
+        init_tensors = (
+            outputs + updates_ops + [p for [p, _] in legacy_update_ops] +
+            [p_new for [_, p_new] in legacy_update_ops
+             if isinstance(p_new, ops.Tensor)])
+        lifted_map = lift_to_graph.lift_to_graph(
+            init_tensors=init_tensors, graph=exec_graph, sources=inputs,
+            add_sources=True, handle_captures=True, base_graph=source_graph)
+
+        inputs = [lifted_map[i] for i in inputs]
+        outputs = [lifted_map[i] for i in outputs]
+        updates_ops = [lifted_map[i] for i in updates_ops]
+        legacy_update_ops = [(lifted_map[p], lifted_map.get(p_new, p_new))
+                             for p, p_new in legacy_update_ops]
+
     # Consolidate updates
-    with graph.as_default():
-      with ops.control_dependencies(self.outputs):
-        # In general, updates should be run after the outputs have been
-        # computed. However, we can only ensure this when we create
-        # the updates here (i.e. when updates are passed as tuples).
-        # We cannot modify the control dependencies of preexisting update ops.
-        updates_ops = []
-        for update in updates:
-          # For legacy reasons it is allowed to pass an update as a tuple
-          # `(variable, new_value)` (this maps to an assign op).
-          if isinstance(update, tuple):
-            p, new_p = update
-            updates_ops.append(state_ops.assign(p, new_p))
-          else:
-            # Assumed already an op -- we cannot control its execution order.
-            updates_ops.append(update)
-
-      # We set the update ops to run at the end by conditioning it on output[0]
-      if updates and not self.outputs:
-        # Edge case; never happens in practice
-        raise ValueError('Cannot create a Keras backend function with updates'
-                         ' but no outputs during eager execution.')
+    with exec_graph.as_default():
+      outputs = cast_variables_to_tensor(outputs)
+      with ops.control_dependencies(outputs):
+        for p, p_new in legacy_update_ops:
+          updates_ops.append(state_ops.assign(p, p_new))
+
+      self.inputs, self.outputs = inputs, outputs
       with ops.control_dependencies(updates_ops):
         self.outputs[0] = array_ops.identity(self.outputs[0])
 
-    # Prepare graph function
-    # TODO(fchollet): can we restrict `captures` to variables actually used in
-    # the relevant subgraph?
-    graph.inputs = self.inputs + list(graph.captures.values())
-    graph.outputs = self.outputs
-    graph_fn = eager_function.Function(graph)
+      exec_graph.inputs = self.inputs + list(exec_graph.captures.values())
+      exec_graph.outputs = self.outputs
+      graph_fn = eager_function.ConcreteFunction(exec_graph)
+
     graph_fn._num_positional_args = len(self.inputs)
     graph_fn._arg_keywords = []
     self._graph_fn = graph_fn
@@ -3146,13 +3183,14 @@ class EagerExecutionFunction(object):
     # Handle placeholders with default
     # (treated as required placeholder by graph functions)
     self._placeholder_default_values = {}
-    with graph.as_default():
+    with exec_graph.as_default():
       for x in self.inputs:
         if x.op.type == 'PlaceholderWithDefault':
           self._placeholder_default_values[x] = tensor_util.constant_value(
               x.op.inputs[0])
 
   def __call__(self, inputs):
+    inputs = nest.flatten(inputs)
     converted_inputs = []
     for tensor, value in zip(self.inputs, inputs):
       if value is None:
@@ -3169,10 +3207,11 @@ class EagerExecutionFunction(object):
         value = math_ops.cast(value, tensor.dtype)
       converted_inputs.append(value)
     outputs = self._graph_fn(*converted_inputs)
-    return [x.numpy() for x in outputs]
+    return nest.pack_sequence_as(self._outputs_structure,
+                                 [x.numpy() for x in outputs])
 
 
-@tf_export('keras.backend.function')
+@keras_export('keras.backend.function')
 def function(inputs, outputs, updates=None, name=None, **kwargs):
   """Instantiates a Keras function.
 
@@ -3205,7 +3244,7 @@ def function(inputs, outputs, updates=None, name=None, **kwargs):
   return GraphExecutionFunction(inputs, outputs, updates=updates, **kwargs)
 
 
-@tf_export('keras.backend.gradients')
+@keras_export('keras.backend.gradients')
 def gradients(loss, variables):
   """Returns the gradients of `loss` w.r.t. `variables`.
 
@@ -3220,7 +3259,7 @@ def gradients(loss, variables):
       loss, variables, colocate_gradients_with_ops=True)
 
 
-@tf_export('keras.backend.stop_gradient')
+@keras_export('keras.backend.stop_gradient')
 def stop_gradient(variables):
   """Returns `variables` but with zero gradient w.r.t. every other variable.
 
@@ -3241,7 +3280,7 @@ def stop_gradient(variables):
 # CONTROL FLOW
 
 
-@tf_export('keras.backend.rnn')
+@keras_export('keras.backend.rnn')
 def rnn(step_function,
         inputs,
         initial_states,
@@ -3599,7 +3638,7 @@ def rnn(step_function,
   return last_output, outputs, new_states
 
 
-@tf_export('keras.backend.switch')
+@keras_export('keras.backend.switch')
 def switch(condition, then_expression, else_expression):
   """Switches between two operations depending on a scalar value.
 
@@ -3663,7 +3702,7 @@ def switch(condition, then_expression, else_expression):
   return x
 
 
-@tf_export('keras.backend.in_train_phase')
+@keras_export('keras.backend.in_train_phase')
 def in_train_phase(x, alt, training=None):
   """Selects `x` in train phase, and `alt` otherwise.
 
@@ -3702,7 +3741,7 @@ def in_train_phase(x, alt, training=None):
   return x
 
 
-@tf_export('keras.backend.in_test_phase')
+@keras_export('keras.backend.in_test_phase')
 def in_test_phase(x, alt, training=None):
   """Selects `x` in test phase, and `alt` otherwise.
 
@@ -3726,7 +3765,7 @@ def in_test_phase(x, alt, training=None):
 # NN OPERATIONS
 
 
-@tf_export('keras.backend.relu')
+@keras_export('keras.backend.relu')
 def relu(x, alpha=0., max_value=None, threshold=0):
   """Rectified linear unit.
 
@@ -3779,7 +3818,7 @@ def relu(x, alpha=0., max_value=None, threshold=0):
   return x
 
 
-@tf_export('keras.backend.elu')
+@keras_export('keras.backend.elu')
 def elu(x, alpha=1.):
   """Exponential linear unit.
 
@@ -3797,7 +3836,7 @@ def elu(x, alpha=1.):
     return array_ops.where(x > 0, res, alpha * res)
 
 
-@tf_export('keras.backend.softmax')
+@keras_export('keras.backend.softmax')
 def softmax(x, axis=-1):
   """Softmax of a tensor.
 
@@ -3812,7 +3851,7 @@ def softmax(x, axis=-1):
   return nn.softmax(x, axis=axis)
 
 
-@tf_export('keras.backend.softplus')
+@keras_export('keras.backend.softplus')
 def softplus(x):
   """Softplus of a tensor.
 
@@ -3825,7 +3864,7 @@ def softplus(x):
   return nn.softplus(x)
 
 
-@tf_export('keras.backend.softsign')
+@keras_export('keras.backend.softsign')
 def softsign(x):
   """Softsign of a tensor.
 
@@ -3838,7 +3877,7 @@ def softsign(x):
   return nn.softsign(x)
 
 
-@tf_export('keras.backend.categorical_crossentropy')
+@keras_export('keras.backend.categorical_crossentropy')
 def categorical_crossentropy(target, output, from_logits=False, axis=-1):
   """Categorical crossentropy between an output tensor and a target tensor.
 
@@ -3859,22 +3898,27 @@ def categorical_crossentropy(target, output, from_logits=False, axis=-1):
   Raises:
       ValueError: if `axis` is neither -1 nor one of the axes of `output`.
   """
-  rank = len(output.shape)
-  axis = axis % rank
-  # Note: nn.softmax_cross_entropy_with_logits_v2
-  # expects logits, Keras expects probabilities.
   if not from_logits:
-    # scale preds so that the class probas of each sample sum to 1
-    output = output / math_ops.reduce_sum(output, axis, True)
-    # manual computation of crossentropy
-    epsilon_ = _to_tensor(epsilon(), output.dtype.base_dtype)
-    output = clip_ops.clip_by_value(output, epsilon_, 1. - epsilon_)
-    return -math_ops.reduce_sum(target * math_ops.log(output), axis)
-  else:
-    return nn.softmax_cross_entropy_with_logits_v2(labels=target, logits=output)
+    if context.executing_eagerly() or output.op.type != 'Softmax':
+      axis = axis % len(output.shape)
+      # scale preds so that the class probas of each sample sum to 1
+      output = output / math_ops.reduce_sum(output, axis, True)
+
+      # Compute cross entropy from probabilities.
+      epsilon_ = _to_tensor(epsilon(), output.dtype.base_dtype)
+      output = clip_ops.clip_by_value(output, epsilon_, 1. - epsilon_)
+      return -math_ops.reduce_sum(target * math_ops.log(output), axis)
+    else:
+      # When softmax activation function is used for output operation, we
+      # use logits from the softmax function directly to compute loss in order
+      # to prevent collapsing zero when training.
+      # See b/117284466
+      assert len(output.op.inputs) == 1
+      output = output.op.inputs[0]
+  return nn.softmax_cross_entropy_with_logits_v2(labels=target, logits=output)
 
 
-@tf_export('keras.backend.sparse_categorical_crossentropy')
+@keras_export('keras.backend.sparse_categorical_crossentropy')
 def sparse_categorical_crossentropy(target, output, from_logits=False, axis=-1):
   """Categorical crossentropy with integer targets.
 
@@ -3895,19 +3939,25 @@ def sparse_categorical_crossentropy(target, output, from_logits=False, axis=-1):
   Raises:
       ValueError: if `axis` is neither -1 nor one of the axes of `output`.
   """
+  if not from_logits:
+    if context.executing_eagerly() or output.op.type != 'Softmax':
+      epsilon_ = _to_tensor(epsilon(), output.dtype.base_dtype)
+      output = clip_ops.clip_by_value(output, epsilon_, 1 - epsilon_)
+      output = math_ops.log(output)
+    else:
+      # When softmax activation function is used for output operation, we
+      # use logits from the softmax function directly to compute loss in order
+      # to prevent collapsing zero when training.
+      # See b/117284466
+      assert len(output.op.inputs) == 1
+      output = output.op.inputs[0]
+
   rank = len(output.shape)
   axis = axis % rank
   if axis != rank - 1:
     permutation = list(range(axis)) + list(range(axis + 1, rank)) + [axis]
     output = array_ops.transpose(output, perm=permutation)
 
-  # Note: nn.sparse_softmax_cross_entropy_with_logits
-  # expects logits, Keras expects probabilities.
-  if not from_logits:
-    epsilon_ = _to_tensor(epsilon(), output.dtype.base_dtype)
-    output = clip_ops.clip_by_value(output, epsilon_, 1 - epsilon_)
-    output = math_ops.log(output)
-
   output_shape = output.shape
   targets = cast(flatten(target), 'int64')
   logits = array_ops.reshape(output, [-1, int(output_shape[-1])])
@@ -3920,7 +3970,7 @@ def sparse_categorical_crossentropy(target, output, from_logits=False, axis=-1):
     return res
 
 
-@tf_export('keras.backend.binary_crossentropy')
+@keras_export('keras.backend.binary_crossentropy')
 def binary_crossentropy(target, output, from_logits=False):
   """Binary crossentropy between an output tensor and a target tensor.
 
@@ -3934,17 +3984,25 @@ def binary_crossentropy(target, output, from_logits=False):
   Returns:
       A tensor.
   """
-  # Note: nn.sigmoid_cross_entropy_with_logits
-  # expects logits, Keras expects probabilities.
   if not from_logits:
-    # transform back to logits
-    epsilon_ = _to_tensor(epsilon(), output.dtype.base_dtype)
-    output = clip_ops.clip_by_value(output, epsilon_, 1 - epsilon_)
-    output = math_ops.log(output / (1 - output))
+    if context.executing_eagerly() or output.op.type != 'Sigmoid':
+      epsilon_ = _to_tensor(epsilon(), output.dtype.base_dtype)
+      output = clip_ops.clip_by_value(output, epsilon_, 1. - epsilon_)
+
+      # Compute cross entropy from probabilities.
+      bce = target * math_ops.log(output + epsilon())
+      bce += (1 - target) * math_ops.log(1 - output + epsilon())
+      return -bce
+    else:
+      # When sigmoid activation function is used for output operation, we
+      # use logits from the sigmoid function directly to compute loss in order
+      # to prevent collapsing zero when training.
+      assert len(output.op.inputs) == 1
+      output = output.op.inputs[0]
   return nn.sigmoid_cross_entropy_with_logits(labels=target, logits=output)
 
 
-@tf_export('keras.backend.sigmoid')
+@keras_export('keras.backend.sigmoid')
 def sigmoid(x):
   """Element-wise sigmoid.
 
@@ -3957,7 +4015,7 @@ def sigmoid(x):
   return nn.sigmoid(x)
 
 
-@tf_export('keras.backend.hard_sigmoid')
+@keras_export('keras.backend.hard_sigmoid')
 def hard_sigmoid(x):
   """Segment-wise linear approximation of sigmoid.
 
@@ -3978,7 +4036,7 @@ def hard_sigmoid(x):
   return x
 
 
-@tf_export('keras.backend.tanh')
+@keras_export('keras.backend.tanh')
 def tanh(x):
   """Element-wise tanh.
 
@@ -3991,7 +4049,7 @@ def tanh(x):
   return nn.tanh(x)
 
 
-@tf_export('keras.backend.dropout')
+@keras_export('keras.backend.dropout')
 def dropout(x, level, noise_shape=None, seed=None):
   """Sets entries in `x` to zero at random, while scaling the entire tensor.
 
@@ -4014,7 +4072,7 @@ def dropout(x, level, noise_shape=None, seed=None):
   return nn.dropout(x * 1., retain_prob, noise_shape, seed=seed)
 
 
-@tf_export('keras.backend.l2_normalize')
+@keras_export('keras.backend.l2_normalize')
 def l2_normalize(x, axis=None):
   """Normalizes a tensor wrt the L2 norm alongside the specified axis.
 
@@ -4028,7 +4086,7 @@ def l2_normalize(x, axis=None):
   return nn.l2_normalize(x, axis=axis)
 
 
-@tf_export('keras.backend.in_top_k')
+@keras_export('keras.backend.in_top_k')
 def in_top_k(predictions, targets, k):
   """Returns whether the `targets` are in the top `k` `predictions`.
 
@@ -4130,7 +4188,7 @@ def _preprocess_padding(padding):
   return padding
 
 
-@tf_export('keras.backend.conv1d')
+@keras_export('keras.backend.conv1d')
 def conv1d(x,
            kernel,
            strides=1,
@@ -4171,8 +4229,8 @@ def conv1d(x,
   x = nn.convolution(
       input=x,
       filter=kernel,
-      dilation_rate=(dilation_rate,),
-      strides=(strides,),
+      dilation_rate=dilation_rate,
+      strides=strides,
       padding=padding,
       data_format=tf_data_format)
   if data_format == 'channels_first' and tf_data_format == 'NWC':
@@ -4180,7 +4238,7 @@ def conv1d(x,
   return x
 
 
-@tf_export('keras.backend.conv2d')
+@keras_export('keras.backend.conv2d')
 def conv2d(x,
            kernel,
            strides=(1, 1),
@@ -4225,7 +4283,7 @@ def conv2d(x,
   return x
 
 
-@tf_export('keras.backend.conv2d_transpose')
+@keras_export('keras.backend.conv2d_transpose')
 def conv2d_transpose(x,
                      kernel,
                      output_shape,
@@ -4367,7 +4425,7 @@ def separable_conv1d(x,
   return x
 
 
-@tf_export('keras.backend.separable_conv2d')
+@keras_export('keras.backend.separable_conv2d')
 def separable_conv2d(x,
                      depthwise_kernel,
                      pointwise_kernel,
@@ -4393,6 +4451,7 @@ def separable_conv2d(x,
   Raises:
       ValueError: if `data_format` is neither `channels_last` or
       `channels_first`.
+      ValueError: if `strides` is not a tuple of 2 integers.
   """
   if data_format is None:
     data_format = image_data_format()
@@ -4471,7 +4530,7 @@ def depthwise_conv2d(x,
   return x
 
 
-@tf_export('keras.backend.conv3d')
+@keras_export('keras.backend.conv3d')
 def conv3d(x,
            kernel,
            strides=(1, 1, 1),
@@ -4577,7 +4636,7 @@ def conv3d_transpose(x,
   return x
 
 
-@tf_export('keras.backend.pool2d')
+@keras_export('keras.backend.pool2d')
 def pool2d(x,
            pool_size,
            strides=(1, 1),
@@ -4600,6 +4659,8 @@ def pool2d(x,
   Raises:
       ValueError: if `data_format` is neither `"channels_last"` or
       `"channels_first"`.
+      ValueError: if `pool_size` is not a tuple of 2 integers.
+      ValueError: if `strides` is not a tuple of 2 integers.
       ValueError: if `pool_mode` is neither `"max"` or `"avg"`.
   """
   if data_format is None:
@@ -4634,7 +4695,7 @@ def pool2d(x,
   return x
 
 
-@tf_export('keras.backend.pool3d')
+@keras_export('keras.backend.pool3d')
 def pool3d(x,
            pool_size,
            strides=(1, 1, 1),
@@ -4838,7 +4899,7 @@ def local_conv2d(inputs,
                     data_format)
 
 
-@tf_export('keras.backend.bias_add')
+@keras_export('keras.backend.bias_add')
 def bias_add(x, bias, data_format=None):
   """Adds a bias vector to a tensor.
 
@@ -4912,7 +4973,7 @@ def bias_add(x, bias, data_format=None):
 # RANDOMNESS
 
 
-@tf_export('keras.backend.random_normal')
+@keras_export('keras.backend.random_normal')
 def random_normal(shape, mean=0.0, stddev=1.0, dtype=None, seed=None):
   """Returns a tensor with normal distribution of values.
 
@@ -4935,7 +4996,7 @@ def random_normal(shape, mean=0.0, stddev=1.0, dtype=None, seed=None):
       shape, mean=mean, stddev=stddev, dtype=dtype, seed=seed)
 
 
-@tf_export('keras.backend.random_uniform')
+@keras_export('keras.backend.random_uniform')
 def random_uniform(shape, minval=0.0, maxval=1.0, dtype=None, seed=None):
   """Returns a tensor with uniform distribution of values.
 
@@ -4959,7 +5020,7 @@ def random_uniform(shape, minval=0.0, maxval=1.0, dtype=None, seed=None):
       shape, minval=minval, maxval=maxval, dtype=dtype, seed=seed)
 
 
-@tf_export('keras.backend.random_binomial')
+@keras_export('keras.backend.random_binomial')
 def random_binomial(shape, p=0.0, dtype=None, seed=None):
   """Returns a tensor with random binomial distribution of values.
 
@@ -4981,7 +5042,7 @@ def random_binomial(shape, p=0.0, dtype=None, seed=None):
       array_ops.ones(shape, dtype=dtype), array_ops.zeros(shape, dtype=dtype))
 
 
-@tf_export('keras.backend.truncated_normal')
+@keras_export('keras.backend.truncated_normal')
 def truncated_normal(shape, mean=0.0, stddev=1.0, dtype=None, seed=None):
   """Returns a tensor with truncated random normal distribution of values.
 
@@ -5015,7 +5076,7 @@ def truncated_normal(shape, mean=0.0, stddev=1.0, dtype=None, seed=None):
 # in TensorFlow's CTC implementation
 
 
-@tf_export('keras.backend.ctc_label_dense_to_sparse')
+@keras_export('keras.backend.ctc_label_dense_to_sparse')
 def ctc_label_dense_to_sparse(labels, label_lengths):
   """Converts CTC labels from dense to sparse.
 
@@ -5060,7 +5121,7 @@ def ctc_label_dense_to_sparse(labels, label_lengths):
       math_ops.to_int64(indices), vals_sparse, math_ops.to_int64(label_shape))
 
 
-@tf_export('keras.backend.ctc_batch_cost')
+@keras_export('keras.backend.ctc_batch_cost')
 def ctc_batch_cost(y_true, y_pred, input_length, label_length):
   """Runs CTC loss algorithm on each batch element.
 
@@ -5090,7 +5151,7 @@ def ctc_batch_cost(y_true, y_pred, input_length, label_length):
           inputs=y_pred, labels=sparse_labels, sequence_length=input_length), 1)
 
 
-@tf_export('keras.backend.ctc_decode')
+@keras_export('keras.backend.ctc_decode')
 def ctc_decode(y_pred, input_length, greedy=True, beam_width=100, top_paths=1):
   """Decodes the output of a softmax.
 
@@ -5142,7 +5203,7 @@ def ctc_decode(y_pred, input_length, greedy=True, beam_width=100, top_paths=1):
 # HIGH ORDER FUNCTIONS
 
 
-@tf_export('keras.backend.map_fn')
+@keras_export('keras.backend.map_fn')
 def map_fn(fn, elems, name=None, dtype=None):
   """Map the function fn over the elements elems and return the outputs.
 
@@ -5155,10 +5216,10 @@ def map_fn(fn, elems, name=None, dtype=None):
   Returns:
       Tensor with dtype `dtype`.
   """
-  return functional_ops.map_fn(fn, elems, name=name, dtype=dtype)
+  return map_fn_lib.map_fn(fn, elems, name=name, dtype=dtype)
 
 
-@tf_export('keras.backend.foldl')
+@keras_export('keras.backend.foldl')
 def foldl(fn, elems, initializer=None, name=None):
   """Reduce elems using fn to combine them from left to right.
 
@@ -5175,7 +5236,7 @@ def foldl(fn, elems, initializer=None, name=None):
   return functional_ops.foldl(fn, elems, initializer=initializer, name=name)
 
 
-@tf_export('keras.backend.foldr')
+@keras_export('keras.backend.foldr')
 def foldr(fn, elems, initializer=None, name=None):
   """Reduce elems using fn to combine them from right to left.
 
@@ -5237,3 +5298,70 @@ if not os.path.exists(_config_path):
   except IOError:
     # Except permission denied.
     pass
+
+
+def in_multi_worker_mode():
+  """Whether we are operating in a Multi-Worker setting."""
+  tf_config = json.loads(os.environ.get('TF_CONFIG', '{}'))
+  cluster_spec = server_lib.ClusterSpec(tf_config.get('cluster', {}))
+  return tf_config and 'master' not in cluster_spec.jobs
+
+
+def configure_and_create_distributed_session(distribution_strategy):
+  """Configure session config and create a session with it."""
+
+  def _create_session(distribution_strategy):
+    """Create the Distributed Strategy session."""
+    session_config = get_default_session_config()
+
+    # If a session already exists, merge in its config; in the case there is a
+    # conflict, take values of the existing config.
+    global _SESSION
+    if getattr(_SESSION, 'session', None) and _SESSION.session._config:
+      session_config.MergeFrom(_SESSION.session._config)
+
+    if is_tpu_strategy(distribution_strategy):
+      # TODO(priyag, yuefengz): Remove this workaround when Distribute
+      # Coordinator is integrated with keras and we can create a session from
+      # there.
+      distribution_strategy.configure(session_config)
+      master = distribution_strategy.extended._tpu_cluster_resolver.master()  # pylint: disable=protected-access
+      session = session_module.Session(config=session_config, target=master)
+    else:
+      worker_context = dc_context.get_current_worker_context()
+      if worker_context:
+        dc_session_config = worker_context.session_config
+        # Merge the default session config to the one from distribute
+        # coordinator, which is fine for now since they don't have
+        # conflicting configurations.
+        dc_session_config.MergeFrom(session_config)
+        session = session_module.Session(
+            config=dc_session_config, target=worker_context.master_target)
+      else:
+        distribution_strategy.configure(session_config)
+        session = session_module.Session(config=session_config)
+
+    set_session(session)
+
+  if in_multi_worker_mode():
+    dc.run_distribute_coordinator(
+        _create_session,
+        distribution_strategy,
+        mode=dc.CoordinatorMode.INDEPENDENT_WORKER)
+  else:
+    _create_session(distribution_strategy)
+
+
+def is_tpu_strategy(strategy):
+  """We're executing TPU Strategy."""
+  return strategy is not None and strategy.__class__.__name__ == 'TPUStrategy'
+
+
+def cast_variables_to_tensor(tensors):
+
+  def _cast_variables_to_tensor(tensor):
+    if isinstance(tensor, variables_module.Variable):
+      return array_ops.identity(tensor)
+    return tensor
+
+  return nest.map_structure(_cast_variables_to_tensor, tensors)
diff --git a/tensorflow/python/keras/backend_config.py b/tensorflow/python/keras/backend_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7c63ac2c72df1f64e8b6ee4eafbaf75e56c1314
--- /dev/null
+++ b/tensorflow/python/keras/backend_config.py
@@ -0,0 +1,126 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras backend config API."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.util.tf_export import keras_export
+
+# The type of float to use throughout a session.
+_FLOATX = 'float32'
+
+# Epsilon fuzz factor used throughout the codebase.
+_EPSILON = 1e-7
+
+# Default image data format, one of "channels_last", "channels_first".
+_IMAGE_DATA_FORMAT = 'channels_last'
+
+
+@keras_export('keras.backend.epsilon')
+def epsilon():
+  """Returns the value of the fuzz factor used in numeric expressions.
+
+  Returns:
+      A float.
+
+  Example:
+  ```python
+  keras.backend.epsilon() >>>1e-07
+  ```
+  """
+  return _EPSILON
+
+
+@keras_export('keras.backend.set_epsilon')
+def set_epsilon(value):
+  """Sets the value of the fuzz factor used in numeric expressions.
+
+  Arguments:
+      value: float. New value of epsilon.
+  Example: ```python from keras import backend as K K.epsilon() >>> 1e-07
+    K.set_epsilon(1e-05) K.epsilon() >>> 1e-05 ```
+  """
+  global _EPSILON
+  _EPSILON = value
+
+
+@keras_export('keras.backend.floatx')
+def floatx():
+  """Returns the default float type, as a string.
+
+  E.g. 'float16', 'float32', 'float64'.
+
+  Returns:
+      String, the current default float type.
+
+  Example:
+  ```python
+  keras.backend.floatx() >>> 'float32'
+  ```
+  """
+  return _FLOATX
+
+
+@keras_export('keras.backend.set_floatx')
+def set_floatx(value):
+  """Sets the default float type.
+
+  Arguments:
+      value: String; 'float16', 'float32', or 'float64'.
+  Example: ```python from keras import backend as K K.floatx() >>> 'float32'
+    K.set_floatx('float16') K.floatx() >>> 'float16' ```
+
+  Raises:
+      ValueError: In case of invalid value.
+  """
+  global _FLOATX
+  if value not in {'float16', 'float32', 'float64'}:
+    raise ValueError('Unknown floatx type: ' + str(value))
+  _FLOATX = str(value)
+
+
+@keras_export('keras.backend.image_data_format')
+def image_data_format():
+  """Returns the default image data format convention.
+
+  Returns:
+      A string, either `'channels_first'` or `'channels_last'`
+
+  Example:
+  ```python
+  keras.backend.image_data_format() >>> 'channels_first'
+  ```
+  """
+  return _IMAGE_DATA_FORMAT
+
+
+@keras_export('keras.backend.set_image_data_format')
+def set_image_data_format(data_format):
+  """Sets the value of the image data format convention.
+
+  Arguments:
+      data_format: string. `'channels_first'` or `'channels_last'`.
+  Example: ```python from keras import backend as K K.image_data_format() >>>
+    'channels_first' K.set_image_data_format('channels_last')
+    K.image_data_format() >>> 'channels_last' ```
+
+  Raises:
+      ValueError: In case of invalid `data_format` value.
+  """
+  global _IMAGE_DATA_FORMAT
+  if data_format not in {'channels_last', 'channels_first'}:
+    raise ValueError('Unknown data_format: ' + str(data_format))
+  _IMAGE_DATA_FORMAT = str(data_format)
diff --git a/tensorflow/python/keras/backend_config_test.py b/tensorflow/python/keras/backend_config_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..59e003d81e196868654313fe08e53fb261a2baa0
--- /dev/null
+++ b/tensorflow/python/keras/backend_config_test.py
@@ -0,0 +1,55 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for backend_config."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python import keras
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class BackendConfigTest(test.TestCase):
+
+  def test_backend(self):
+    self.assertEqual(keras.backend.backend(), 'tensorflow')
+
+  def test_espilon(self):
+    epsilon = 1e-2
+    keras.backend_config.set_epsilon(epsilon)
+    self.assertEqual(keras.backend_config.epsilon(), epsilon)
+    keras.backend_config.set_epsilon(1e-7)
+    self.assertEqual(keras.backend_config.epsilon(), 1e-7)
+
+  def test_floatx(self):
+    floatx = 'float64'
+    keras.backend_config.set_floatx(floatx)
+    self.assertEqual(keras.backend_config.floatx(), floatx)
+    keras.backend_config.set_floatx('float32')
+    self.assertEqual(keras.backend_config.floatx(), 'float32')
+
+  def test_image_data_format(self):
+    image_data_format = 'channels_first'
+    keras.backend_config.set_image_data_format(image_data_format)
+    self.assertEqual(keras.backend_config.image_data_format(),
+                     image_data_format)
+    keras.backend_config.set_image_data_format('channels_last')
+    self.assertEqual(keras.backend_config.image_data_format(), 'channels_last')
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/backend_test.py b/tensorflow/python/keras/backend_test.py
index af01b46fa9a4a45201de930cfb7827ac1d2bafbd..cd7821639b92613b716679c94ed3fd195663ba42 100644
--- a/tensorflow/python/keras/backend_test.py
+++ b/tensorflow/python/keras/backend_test.py
@@ -99,24 +99,6 @@ class BackendUtilsTest(test.TestCase):
   def test_backend(self):
     self.assertEqual(keras.backend.backend(), 'tensorflow')
 
-  def test_espilon(self):
-    epsilon = 1e-2
-    keras.backend.set_epsilon(epsilon)
-    self.assertEqual(keras.backend.epsilon(), epsilon)
-    keras.backend.set_epsilon(1e-7)
-
-  def test_floatx(self):
-    floatx = 'float64'
-    keras.backend.set_floatx(floatx)
-    self.assertEqual(keras.backend.floatx(), floatx)
-    keras.backend.set_floatx('float32')
-
-  def test_image_data_format(self):
-    image_data_format = 'channels_first'
-    keras.backend.set_image_data_format(image_data_format)
-    self.assertEqual(keras.backend.image_data_format(), image_data_format)
-    keras.backend.set_image_data_format('channels_last')
-
   def test_get_reset_uids(self):
     self.assertEqual(keras.backend.get_uid('foo'), 1)
     self.assertEqual(keras.backend.get_uid('foo'), 2)
@@ -126,34 +108,61 @@ class BackendUtilsTest(test.TestCase):
 
   def test_learning_phase(self):
     with self.cached_session() as sess:
-      keras.backend.set_learning_phase(1)
-      self.assertEqual(keras.backend.learning_phase(), 1)
       with self.assertRaises(ValueError):
         keras.backend.set_learning_phase(2)
 
       # Test running with a learning-phase-consuming layer
-      keras.backend.set_learning_phase(0)
-      x = keras.Input((3,))
-      y = keras.layers.BatchNormalization()(x)
-      if not context.executing_eagerly():
-        self.evaluate(variables.global_variables_initializer())
-        sess.run(y, feed_dict={x: np.random.random((2, 3))})
+      with keras.backend.learning_phase_scope(0):
+        x = keras.Input((3,))
+        y = keras.layers.BatchNormalization()(x)
+        if not context.executing_eagerly():
+          self.evaluate(variables.global_variables_initializer())
+          sess.run(y, feed_dict={x: np.random.random((2, 3))})
+
+  def test_learning_phase_name(self):
+    with ops.name_scope('test_scope'):
+      # Test that outer name scopes do not affect the learning phase's name.
+      lp = keras.backend.symbolic_learning_phase()
+    self.assertEqual(lp.name, 'keras_learning_phase:0')
 
   def test_learning_phase_scope(self):
-    with self.cached_session():
-      initial_learning_phase = keras.backend.learning_phase()
-      with keras.backend.learning_phase_scope(1) as lp:
-        self.assertEqual(lp, 1)
-        self.assertEqual(keras.backend.learning_phase(), 1)
-      self.assertEqual(keras.backend.learning_phase(), initial_learning_phase)
-      with keras.backend.learning_phase_scope(0) as lp:
-        self.assertEqual(lp, 0)
-        self.assertEqual(keras.backend.learning_phase(), 0)
-      self.assertEqual(keras.backend.learning_phase(), initial_learning_phase)
-      with self.assertRaises(ValueError):
-        with keras.backend.learning_phase_scope(None):
-          pass
-      self.assertEqual(keras.backend.learning_phase(), initial_learning_phase)
+    initial_learning_phase = keras.backend.learning_phase()
+    with keras.backend.learning_phase_scope(1):
+      self.assertEqual(keras.backend.learning_phase(), 1)
+    self.assertEqual(keras.backend.learning_phase(), initial_learning_phase)
+    with keras.backend.learning_phase_scope(0):
+      self.assertEqual(keras.backend.learning_phase(), 0)
+    self.assertEqual(keras.backend.learning_phase(), initial_learning_phase)
+    with self.assertRaises(ValueError):
+      with keras.backend.learning_phase_scope(None):
+        pass
+    self.assertEqual(keras.backend.learning_phase(), initial_learning_phase)
+
+    new_learning_phase = 0
+    keras.backend.set_learning_phase(new_learning_phase)
+    self.assertEqual(keras.backend.learning_phase(), new_learning_phase)
+    with keras.backend.learning_phase_scope(1):
+      self.assertEqual(keras.backend.learning_phase(), 1)
+    self.assertEqual(keras.backend.learning_phase(), new_learning_phase)
+
+  def test_learning_phase_scope_in_graph(self):
+    initial_learning_phase_outside_graph = keras.backend.learning_phase()
+    with keras.backend.get_graph().as_default():
+      initial_learning_phase_in_graph = keras.backend.learning_phase()
+
+    self.assertEqual(keras.backend.learning_phase(),
+                     initial_learning_phase_outside_graph)
+    with keras.backend.learning_phase_scope(1):
+      self.assertEqual(keras.backend.learning_phase(), 1)
+    self.assertEqual(keras.backend.learning_phase(),
+                     initial_learning_phase_outside_graph)
+
+    with keras.backend.get_graph().as_default():
+      self.assertEqual(keras.backend.learning_phase(),
+                       initial_learning_phase_in_graph)
+
+    self.assertEqual(keras.backend.learning_phase(),
+                     initial_learning_phase_outside_graph)
 
   def test_int_shape(self):
     x = keras.backend.ones(shape=(3, 4))
@@ -164,21 +173,20 @@ class BackendUtilsTest(test.TestCase):
       self.assertEqual(keras.backend.int_shape(x), (None, 4))
 
   def test_in_train_phase(self):
-    with self.cached_session():
-      y1 = keras.backend.variable(1)
-      y2 = keras.backend.variable(2)
-      if context.executing_eagerly():
-        with keras.backend.learning_phase_scope(0):
-          y_val_test = keras.backend.in_train_phase(y1, y2).numpy()
-        with keras.backend.learning_phase_scope(1):
-          y_val_train = keras.backend.in_train_phase(y1, y2).numpy()
-      else:
-        y = keras.backend.in_train_phase(y1, y2)
-        f = keras.backend.function([keras.backend.learning_phase()], [y])
-        y_val_test = f([0])[0]
-        y_val_train = f([1])[0]
-      self.assertAllClose(y_val_test, 2)
-      self.assertAllClose(y_val_train, 1)
+    y1 = keras.backend.variable(1)
+    y2 = keras.backend.variable(2)
+    if context.executing_eagerly():
+      with keras.backend.learning_phase_scope(0):
+        y_val_test = keras.backend.in_train_phase(y1, y2).numpy()
+      with keras.backend.learning_phase_scope(1):
+        y_val_train = keras.backend.in_train_phase(y1, y2).numpy()
+    else:
+      y = keras.backend.in_train_phase(y1, y2)
+      f = keras.backend.function([keras.backend.learning_phase()], [y])
+      y_val_test = f([0])[0]
+      y_val_train = f([1])[0]
+    self.assertAllClose(y_val_test, 2)
+    self.assertAllClose(y_val_train, 1)
 
   def test_is_keras_tensor(self):
     x = keras.backend.variable(1)
@@ -205,74 +213,63 @@ class BackendUtilsTest(test.TestCase):
 class BackendVariableTest(test.TestCase):
 
   def test_zeros(self):
-    with self.cached_session():
-      x = keras.backend.zeros((3, 4))
-      val = keras.backend.eval(x)
-      self.assertAllClose(val, np.zeros((3, 4)))
+    x = keras.backend.zeros((3, 4))
+    val = keras.backend.eval(x)
+    self.assertAllClose(val, np.zeros((3, 4)))
 
   def test_ones(self):
-    with self.cached_session():
-      x = keras.backend.ones((3, 4))
-      val = keras.backend.eval(x)
-      self.assertAllClose(val, np.ones((3, 4)))
+    x = keras.backend.ones((3, 4))
+    val = keras.backend.eval(x)
+    self.assertAllClose(val, np.ones((3, 4)))
 
   def test_eye(self):
-    with self.cached_session():
-      x = keras.backend.eye(4)
-      val = keras.backend.eval(x)
-      self.assertAllClose(val, np.eye(4))
+    x = keras.backend.eye(4)
+    val = keras.backend.eval(x)
+    self.assertAllClose(val, np.eye(4))
 
   def test_zeros_like(self):
-    with self.cached_session():
-      x = keras.backend.zeros((3, 4))
-      y = keras.backend.zeros_like(x)
-      val = keras.backend.eval(y)
-      self.assertAllClose(val, np.zeros((3, 4)))
+    x = keras.backend.zeros((3, 4))
+    y = keras.backend.zeros_like(x)
+    val = keras.backend.eval(y)
+    self.assertAllClose(val, np.zeros((3, 4)))
 
   def test_ones_like(self):
-    with self.cached_session():
-      x = keras.backend.zeros((3, 4))
-      y = keras.backend.ones_like(x)
-      val = keras.backend.eval(y)
-      self.assertAllClose(val, np.ones((3, 4)))
+    x = keras.backend.zeros((3, 4))
+    y = keras.backend.ones_like(x)
+    val = keras.backend.eval(y)
+    self.assertAllClose(val, np.ones((3, 4)))
 
   def test_random_uniform_variable(self):
-    with self.cached_session():
-      x = keras.backend.random_uniform_variable((30, 20), low=1, high=2, seed=0)
-      val = keras.backend.eval(x)
-      self.assertAllClose(val.mean(), 1.5, atol=1e-1)
-      self.assertAllClose(val.max(), 2., atol=1e-1)
-      self.assertAllClose(val.min(), 1., atol=1e-1)
+    x = keras.backend.random_uniform_variable((30, 20), low=1, high=2, seed=0)
+    val = keras.backend.eval(x)
+    self.assertAllClose(val.mean(), 1.5, atol=1e-1)
+    self.assertAllClose(val.max(), 2., atol=1e-1)
+    self.assertAllClose(val.min(), 1., atol=1e-1)
 
   def test_random_normal_variable(self):
-    with self.cached_session():
-      x = keras.backend.random_normal_variable((30, 20), 1., 0.5,
-                                               seed=0)
-      val = keras.backend.eval(x)
-      self.assertAllClose(val.mean(), 1., atol=1e-1)
-      self.assertAllClose(val.std(), 0.5, atol=1e-1)
+    x = keras.backend.random_normal_variable((30, 20), 1., 0.5, seed=0)
+    val = keras.backend.eval(x)
+    self.assertAllClose(val.mean(), 1., atol=1e-1)
+    self.assertAllClose(val.std(), 0.5, atol=1e-1)
 
   def test_count_params(self):
-    with self.cached_session():
-      x = keras.backend.zeros((4, 5))
-      val = keras.backend.count_params(x)
-      self.assertAllClose(val, 20)
+    x = keras.backend.zeros((4, 5))
+    val = keras.backend.count_params(x)
+    self.assertAllClose(val, 20)
 
   def test_constant(self):
-    with self.cached_session():
-      ref_val = np.random.random((3, 4)).astype('float32')
-      x = keras.backend.constant(ref_val)
-      val = keras.backend.eval(x)
-      self.assertAllClose(val, ref_val)
+    ref_val = np.random.random((3, 4)).astype('float32')
+    x = keras.backend.constant(ref_val)
+    val = keras.backend.eval(x)
+    self.assertAllClose(val, ref_val)
 
   def test_sparse_variable(self):
-    with self.cached_session():
-      val = scipy.sparse.eye(10)
-      x = keras.backend.variable(val)
-      self.assertTrue(isinstance(x, sparse_tensor.SparseTensor))
+    val = scipy.sparse.eye(10)
+    x = keras.backend.variable(val)
+    self.assertTrue(isinstance(x, sparse_tensor.SparseTensor))
 
-      y = keras.backend.to_dense(x)
-      self.assertFalse(keras.backend.is_sparse(y))
+    y = keras.backend.to_dense(x)
+    self.assertFalse(keras.backend.is_sparse(y))
 
 
 @test_util.run_all_in_graph_and_eager_modes
@@ -310,20 +307,19 @@ class BackendLinearAlgebraTest(test.TestCase):
         (keras.backend.argmax, np.argmax),
     ]
     for keras_op, np_op in ops_to_test:
-      with self.cached_session():
-        compare_single_input_op_to_numpy(keras_op, np_op, input_shape=(4, 7, 5),
-                                         keras_kwargs={'axis': 1},
-                                         np_kwargs={'axis': 1})
-        compare_single_input_op_to_numpy(keras_op, np_op, input_shape=(4, 7, 5),
-                                         keras_kwargs={'axis': -1},
-                                         np_kwargs={'axis': -1})
-        if 'keepdims' in tf_inspect.getargspec(keras_op).args:
-          compare_single_input_op_to_numpy(keras_op, np_op,
-                                           input_shape=(4, 7, 5),
-                                           keras_kwargs={'axis': 1,
-                                                         'keepdims': True},
-                                           np_kwargs={'axis': 1,
-                                                      'keepdims': True})
+      compare_single_input_op_to_numpy(keras_op, np_op, input_shape=(4, 7, 5),
+                                       keras_kwargs={'axis': 1},
+                                       np_kwargs={'axis': 1})
+      compare_single_input_op_to_numpy(keras_op, np_op, input_shape=(4, 7, 5),
+                                       keras_kwargs={'axis': -1},
+                                       np_kwargs={'axis': -1})
+      if 'keepdims' in tf_inspect.getargspec(keras_op).args:
+        compare_single_input_op_to_numpy(keras_op, np_op,
+                                         input_shape=(4, 7, 5),
+                                         keras_kwargs={'axis': 1,
+                                                       'keepdims': True},
+                                         np_kwargs={'axis': 1,
+                                                    'keepdims': True})
 
   def test_elementwise_ops(self):
     ops_to_test = [
@@ -336,32 +332,28 @@ class BackendLinearAlgebraTest(test.TestCase):
         (keras.backend.exp, np.exp),
     ]
     for keras_op, np_op in ops_to_test:
-      with self.cached_session():
-        compare_single_input_op_to_numpy(keras_op, np_op, input_shape=(4, 7))
+      compare_single_input_op_to_numpy(keras_op, np_op, input_shape=(4, 7))
 
     ops_to_test = [
         (keras.backend.sqrt, np.sqrt),
         (keras.backend.log, np.log),
     ]
     for keras_op, np_op in ops_to_test:
-      with self.cached_session():
-        compare_single_input_op_to_numpy(keras_op, np_op,
-                                         input_shape=(4, 7),
-                                         negative_values=False)
+      compare_single_input_op_to_numpy(keras_op, np_op,
+                                       input_shape=(4, 7),
+                                       negative_values=False)
 
-    with self.cached_session():
-      compare_single_input_op_to_numpy(
-          keras.backend.clip, np.clip,
-          input_shape=(6, 4),
-          keras_kwargs={'min_value': 0.1, 'max_value': 2.4},
-          np_kwargs={'a_min': 0.1, 'a_max': 1.4})
+    compare_single_input_op_to_numpy(
+        keras.backend.clip, np.clip,
+        input_shape=(6, 4),
+        keras_kwargs={'min_value': 0.1, 'max_value': 2.4},
+        np_kwargs={'a_min': 0.1, 'a_max': 1.4})
 
-    with self.cached_session():
-      compare_single_input_op_to_numpy(
-          keras.backend.pow, np.power,
-          input_shape=(6, 4),
-          keras_args=[3],
-          np_args=[3])
+    compare_single_input_op_to_numpy(
+        keras.backend.pow, np.power,
+        input_shape=(6, 4),
+        keras_args=[3],
+        np_args=[3])
 
   def test_two_tensor_ops(self):
     ops_to_test = [
@@ -375,84 +367,82 @@ class BackendLinearAlgebraTest(test.TestCase):
         (keras.backend.minimum, np.minimum),
     ]
     for keras_op, np_op in ops_to_test:
-      with self.cached_session():
-        compare_two_inputs_op_to_numpy(keras_op, np_op,
-                                       input_shape_a=(4, 7),
-                                       input_shape_b=(4, 7))
+      compare_two_inputs_op_to_numpy(keras_op, np_op,
+                                     input_shape_a=(4, 7),
+                                     input_shape_b=(4, 7))
 
   def test_relu(self):
     x = ops.convert_to_tensor([[-4, 0], [2, 7]], 'float32')
-    with self.cached_session():
-      # standard relu
-      relu_op = keras.backend.relu(x)
-      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 7]])
 
-      # alpha (leaky relu used)
-      relu_op = keras.backend.relu(x, alpha=0.5)
-      if not context.executing_eagerly():
-        self.assertTrue('LeakyRelu' in relu_op.name)
-      self.assertAllClose(keras.backend.eval(relu_op), [[-2, 0], [2, 7]])
+    # standard relu
+    relu_op = keras.backend.relu(x)
+    self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 7]])
 
-      # max_value < some elements
-      relu_op = keras.backend.relu(x, max_value=5)
-      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 5]])
+    # alpha (leaky relu used)
+    relu_op = keras.backend.relu(x, alpha=0.5)
+    if not context.executing_eagerly():
+      self.assertTrue('LeakyRelu' in relu_op.name)
+    self.assertAllClose(keras.backend.eval(relu_op), [[-2, 0], [2, 7]])
+
+    # max_value < some elements
+    relu_op = keras.backend.relu(x, max_value=5)
+    self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 5]])
 
-      # nn.relu6 used
-      relu_op = keras.backend.relu(x, max_value=6)
-      if not context.executing_eagerly():
-        self.assertTrue('Relu6' in relu_op.name)  # uses tf.nn.relu6
-      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 6]])
+    # nn.relu6 used
+    relu_op = keras.backend.relu(x, max_value=6)
+    if not context.executing_eagerly():
+      self.assertTrue('Relu6' in relu_op.name)  # uses tf.nn.relu6
+    self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 6]])
 
-      # max value > 6
-      relu_op = keras.backend.relu(x, max_value=10)
-      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 7]])
+    # max value > 6
+    relu_op = keras.backend.relu(x, max_value=10)
+    self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 7]])
 
-      # max value is float
-      relu_op = keras.backend.relu(x, max_value=4.3)
-      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 4.3]])
+    # max value is float
+    relu_op = keras.backend.relu(x, max_value=4.3)
+    self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 4.3]])
 
-      # max value == 0
-      relu_op = keras.backend.relu(x, max_value=0)
-      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [0, 0]])
+    # max value == 0
+    relu_op = keras.backend.relu(x, max_value=0)
+    self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [0, 0]])
 
-      # alpha and max_value
-      relu_op = keras.backend.relu(x, alpha=0.25, max_value=3)
-      self.assertAllClose(keras.backend.eval(relu_op), [[-1, 0], [2, 3]])
+    # alpha and max_value
+    relu_op = keras.backend.relu(x, alpha=0.25, max_value=3)
+    self.assertAllClose(keras.backend.eval(relu_op), [[-1, 0], [2, 3]])
 
-      # threshold
-      relu_op = keras.backend.relu(x, threshold=3)
-      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [0, 7]])
+    # threshold
+    relu_op = keras.backend.relu(x, threshold=3)
+    self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [0, 7]])
 
-      # threshold is float
-      relu_op = keras.backend.relu(x, threshold=1.5)
-      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 7]])
+    # threshold is float
+    relu_op = keras.backend.relu(x, threshold=1.5)
+    self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 7]])
 
-      # threshold is negative
-      relu_op = keras.backend.relu(x, threshold=-5)
-      self.assertAllClose(keras.backend.eval(relu_op), [[-4, 0], [2, 7]])
+    # threshold is negative
+    relu_op = keras.backend.relu(x, threshold=-5)
+    self.assertAllClose(keras.backend.eval(relu_op), [[-4, 0], [2, 7]])
 
-      # threshold and max_value
-      relu_op = keras.backend.relu(x, threshold=3, max_value=5)
-      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [0, 5]])
+    # threshold and max_value
+    relu_op = keras.backend.relu(x, threshold=3, max_value=5)
+    self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [0, 5]])
 
-      # threshold and alpha
-      relu_op = keras.backend.relu(x, alpha=0.25, threshold=4)
-      self.assertAllClose(keras.backend.eval(relu_op), [[-2, -1], [-0.5, 7]])
+    # threshold and alpha
+    relu_op = keras.backend.relu(x, alpha=0.25, threshold=4)
+    self.assertAllClose(keras.backend.eval(relu_op), [[-2, -1], [-0.5, 7]])
 
-      # threshold, alpha, and max_value
-      relu_op = keras.backend.relu(x, alpha=0.25, threshold=4, max_value=5)
-      self.assertAllClose(keras.backend.eval(relu_op), [[-2, -1], [-0.5, 5]])
+    # threshold, alpha, and max_value
+    relu_op = keras.backend.relu(x, alpha=0.25, threshold=4, max_value=5)
+    self.assertAllClose(keras.backend.eval(relu_op), [[-2, -1], [-0.5, 5]])
 
 
 @test_util.run_all_in_graph_and_eager_modes
 class BackendShapeOpsTest(test.TestCase):
 
   def test_reshape(self):
-    with self.cached_session():
-      compare_single_input_op_to_numpy(keras.backend.reshape, np.reshape,
-                                       input_shape=(4, 7),
-                                       keras_args=[(2, 14)],
-                                       np_args=[(2, 14)])
+    compare_single_input_op_to_numpy(keras.backend.reshape, np.reshape,
+                                     input_shape=(4, 7),
+                                     keras_args=[(2, 14)],
+                                     np_args=[(2, 14)])
 
   def test_concatenate(self):
     a = keras.backend.variable(np.ones((1, 2, 3)))
@@ -461,12 +451,11 @@ class BackendShapeOpsTest(test.TestCase):
     self.assertEqual(y.get_shape().as_list(), [1, 2, 5])
 
   def test_permute_dimensions(self):
-    with self.cached_session():
-      compare_single_input_op_to_numpy(keras.backend.permute_dimensions,
-                                       np.transpose,
-                                       input_shape=(4, 7),
-                                       keras_args=[(1, 0)],
-                                       np_args=[(1, 0)])
+    compare_single_input_op_to_numpy(keras.backend.permute_dimensions,
+                                     np.transpose,
+                                     input_shape=(4, 7),
+                                     keras_args=[(1, 0)],
+                                     np_args=[(1, 0)])
 
   def test_resize_images(self):
     height_factor = 2
@@ -541,18 +530,16 @@ class BackendShapeOpsTest(test.TestCase):
     self.assertEqual(y.get_shape().as_list(), [1, 2, 3])
 
   def test_flatten(self):
-    with self.cached_session():
-      compare_single_input_op_to_numpy(keras.backend.flatten,
-                                       np.reshape,
-                                       input_shape=(4, 7, 6),
-                                       np_args=[(4 * 7 * 6,)])
+    compare_single_input_op_to_numpy(keras.backend.flatten,
+                                     np.reshape,
+                                     input_shape=(4, 7, 6),
+                                     np_args=[(4 * 7 * 6,)])
 
   def test_batch_flatten(self):
-    with self.cached_session():
-      compare_single_input_op_to_numpy(keras.backend.batch_flatten,
-                                       np.reshape,
-                                       input_shape=(4, 7, 6),
-                                       np_args=[(4, 7 * 6)])
+    compare_single_input_op_to_numpy(keras.backend.batch_flatten,
+                                     np.reshape,
+                                     input_shape=(4, 7, 6),
+                                     np_args=[(4, 7 * 6)])
 
   def test_temporal_padding(self):
 
@@ -563,12 +550,11 @@ class BackendShapeOpsTest(test.TestCase):
       y[:, padding[0]:-padding[1], :] = x
       return y
 
-    with self.cached_session():
-      compare_single_input_op_to_numpy(keras.backend.temporal_padding,
-                                       ref_op,
-                                       input_shape=(4, 7, 6),
-                                       keras_args=[(2, 3)],
-                                       np_args=[(2, 3)])
+    compare_single_input_op_to_numpy(keras.backend.temporal_padding,
+                                     ref_op,
+                                     input_shape=(4, 7, 6),
+                                     keras_args=[(2, 3)],
+                                     np_args=[(2, 3)])
 
   def test_spatial_2d_padding(self):
 
@@ -586,23 +572,22 @@ class BackendShapeOpsTest(test.TestCase):
         y[:, :, padding[0][0]:-padding[0][1], padding[1][0]:-padding[1][1]] = x
       return y
 
-    with self.cached_session():
-      compare_single_input_op_to_numpy(
-          keras.backend.spatial_2d_padding,
-          ref_op,
-          input_shape=(2, 3, 2, 3),
-          keras_args=[((2, 3), (1, 2))],
-          keras_kwargs={'data_format': 'channels_last'},
-          np_args=[((2, 3), (1, 2))],
-          np_kwargs={'data_format': 'channels_last'})
-      compare_single_input_op_to_numpy(
-          keras.backend.spatial_2d_padding,
-          ref_op,
-          input_shape=(2, 3, 2, 3),
-          keras_args=[((2, 3), (1, 2))],
-          keras_kwargs={'data_format': 'channels_first'},
-          np_args=[((2, 3), (1, 2))],
-          np_kwargs={'data_format': 'channels_first'})
+    compare_single_input_op_to_numpy(
+        keras.backend.spatial_2d_padding,
+        ref_op,
+        input_shape=(2, 3, 2, 3),
+        keras_args=[((2, 3), (1, 2))],
+        keras_kwargs={'data_format': 'channels_last'},
+        np_args=[((2, 3), (1, 2))],
+        np_kwargs={'data_format': 'channels_last'})
+    compare_single_input_op_to_numpy(
+        keras.backend.spatial_2d_padding,
+        ref_op,
+        input_shape=(2, 3, 2, 3),
+        keras_args=[((2, 3), (1, 2))],
+        keras_kwargs={'data_format': 'channels_first'},
+        np_args=[((2, 3), (1, 2))],
+        np_kwargs={'data_format': 'channels_first'})
 
   def test_spatial_3d_padding(self):
 
@@ -629,73 +614,70 @@ class BackendShapeOpsTest(test.TestCase):
           padding[2][0]:-padding[2][1]] = x
       return y
 
-    with self.cached_session():
-      compare_single_input_op_to_numpy(
-          keras.backend.spatial_3d_padding,
-          ref_op,
-          input_shape=(2, 3, 2, 3, 2),
-          keras_args=[((2, 3), (1, 2), (2, 3))],
-          keras_kwargs={'data_format': 'channels_last'},
-          np_args=[((2, 3), (1, 2), (2, 3))],
-          np_kwargs={'data_format': 'channels_last'})
-      compare_single_input_op_to_numpy(
-          keras.backend.spatial_3d_padding,
-          ref_op,
-          input_shape=(2, 3, 2, 3, 2),
-          keras_args=[((2, 3), (1, 2), (2, 3))],
-          keras_kwargs={'data_format': 'channels_first'},
-          np_args=[((2, 3), (1, 2), (2, 3))],
-          np_kwargs={'data_format': 'channels_first'})
+    compare_single_input_op_to_numpy(
+        keras.backend.spatial_3d_padding,
+        ref_op,
+        input_shape=(2, 3, 2, 3, 2),
+        keras_args=[((2, 3), (1, 2), (2, 3))],
+        keras_kwargs={'data_format': 'channels_last'},
+        np_args=[((2, 3), (1, 2), (2, 3))],
+        np_kwargs={'data_format': 'channels_last'})
+    compare_single_input_op_to_numpy(
+        keras.backend.spatial_3d_padding,
+        ref_op,
+        input_shape=(2, 3, 2, 3, 2),
+        keras_args=[((2, 3), (1, 2), (2, 3))],
+        keras_kwargs={'data_format': 'channels_first'},
+        np_args=[((2, 3), (1, 2), (2, 3))],
+        np_kwargs={'data_format': 'channels_first'})
 
 
 @test_util.run_all_in_graph_and_eager_modes
 class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
 
   def test_bias_add(self):
-    with self.cached_session():
-      keras_op = keras.backend.bias_add
-      np_op = np.add
-      compare_two_inputs_op_to_numpy(keras_op, np_op,
-                                     input_shape_a=(4, 7),
-                                     input_shape_b=(7,))
-      compare_two_inputs_op_to_numpy(keras_op, np_op,
-                                     input_shape_a=(4, 3, 7),
-                                     input_shape_b=(7,))
-      compare_two_inputs_op_to_numpy(keras_op, np_op,
-                                     input_shape_a=(4, 3, 5, 7),
-                                     input_shape_b=(7,))
-      compare_two_inputs_op_to_numpy(keras_op, np_op,
-                                     input_shape_a=(4, 3, 5, 2, 7),
-                                     input_shape_b=(7,))
-
-      with self.assertRaises((ValueError, errors_impl.InvalidArgumentError)):
-        x = keras.backend.variable((3, 4))
-        b = keras.backend.variable((3, 4))
-        keras.backend.bias_add(x, b)
-      with self.assertRaises(ValueError):
-        x = keras.backend.variable((3, 4))
-        b = keras.backend.variable((4,))
-        keras.backend.bias_add(x, b, data_format='unknown')
+    keras_op = keras.backend.bias_add
+    np_op = np.add
+    compare_two_inputs_op_to_numpy(keras_op, np_op,
+                                   input_shape_a=(4, 7),
+                                   input_shape_b=(7,))
+    compare_two_inputs_op_to_numpy(keras_op, np_op,
+                                   input_shape_a=(4, 3, 7),
+                                   input_shape_b=(7,))
+    compare_two_inputs_op_to_numpy(keras_op, np_op,
+                                   input_shape_a=(4, 3, 5, 7),
+                                   input_shape_b=(7,))
+    compare_two_inputs_op_to_numpy(keras_op, np_op,
+                                   input_shape_a=(4, 3, 5, 2, 7),
+                                   input_shape_b=(7,))
+
+    with self.assertRaises((ValueError, errors_impl.InvalidArgumentError)):
+      x = keras.backend.variable((3, 4))
+      b = keras.backend.variable((3, 4))
+      keras.backend.bias_add(x, b)
+    with self.assertRaises(ValueError):
+      x = keras.backend.variable((3, 4))
+      b = keras.backend.variable((4,))
+      keras.backend.bias_add(x, b, data_format='unknown')
 
   def test_bias_add_channels_first(self):
-    with self.cached_session():
 
-      def keras_op(x, b):
-        return keras.backend.bias_add(x, b, data_format='channels_first')
+    def keras_op(x, b):
+      return keras.backend.bias_add(x, b, data_format='channels_first')
 
-      def np_op(x, b):
-        if x.ndim == 3:
-          b = b.reshape((1, b.shape[0], 1))
-        if x.ndim == 4:
-          b = b.reshape((1, b.shape[0], 1, 1))
-        return x + b
+    def np_op(x, b):
+      if x.ndim == 3:
+        b = b.reshape((1, b.shape[0], 1))
+      if x.ndim == 4:
+        b = b.reshape((1, b.shape[0], 1, 1))
+      return x + b
 
-      compare_two_inputs_op_to_numpy(keras_op, np_op,
-                                     input_shape_a=(4, 3, 7),
-                                     input_shape_b=(3,))
-      compare_two_inputs_op_to_numpy(keras_op, np_op,
-                                     input_shape_a=(4, 3, 5, 7),
-                                     input_shape_b=(3,))
+    compare_two_inputs_op_to_numpy(keras_op, np_op,
+                                   input_shape_a=(4, 3, 7),
+                                   input_shape_b=(3,))
+    compare_two_inputs_op_to_numpy(keras_op, np_op,
+                                   input_shape_a=(4, 3, 5, 7),
+                                   input_shape_b=(3,))
 
   def test_pool2d(self):
     val = np.random.random((10, 3, 10, 10))
@@ -855,9 +837,9 @@ class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
                                              strides,
                                              output_shape,
                                              'channels_last')
-          with self.cached_session():
-            conv_cf = keras.backend.eval(conv_cf)
-            conv_cl = keras.backend.eval(conv_cl)
+
+          conv_cf = keras.backend.eval(conv_cf)
+          conv_cl = keras.backend.eval(conv_cl)
 
           self.assertAllCloseAccordingToType(
               conv_cf,
@@ -905,9 +887,8 @@ class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
                                                   output_shape,
                                                   'channels_last')
 
-    with self.cached_session():
-      local_conv = keras.backend.eval(local_conv)
-      local_conv_dim = keras.backend.eval(local_conv_dim)
+    local_conv = keras.backend.eval(local_conv)
+    local_conv_dim = keras.backend.eval(local_conv_dim)
 
     self.assertAllCloseAccordingToType(local_conv, local_conv_dim)
 
@@ -1063,24 +1044,23 @@ class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
         {'go_backwards': False, 'mask': mask},
         {'go_backwards': False, 'mask': mask, 'unroll': True},
     ]
-    with self.cached_session():
-      for i, kwargs in enumerate(kwargs_list):
-        last_output, outputs, new_states = keras.backend.rnn(rnn_fn, inputs,
-                                                             initial_states,
-                                                             **kwargs)
-        # check static shape inference
-        self.assertEqual(last_output.get_shape().as_list(),
+    for i, kwargs in enumerate(kwargs_list):
+      last_output, outputs, new_states = keras.backend.rnn(rnn_fn, inputs,
+                                                           initial_states,
+                                                           **kwargs)
+      # check static shape inference
+      self.assertEqual(last_output.get_shape().as_list(),
+                       [num_samples, output_dim])
+      self.assertEqual(outputs.get_shape().as_list(),
+                       [num_samples, timesteps, output_dim])
+      for state in new_states:
+        self.assertEqual(state.get_shape().as_list(),
                          [num_samples, output_dim])
-        self.assertEqual(outputs.get_shape().as_list(),
-                         [num_samples, timesteps, output_dim])
-        for state in new_states:
-          self.assertEqual(state.get_shape().as_list(),
-                           [num_samples, output_dim])
 
-        last_output_list[i].append(keras.backend.eval(last_output))
-        outputs_list[i].append(keras.backend.eval(outputs))
-        self.assertEqual(len(new_states), 1)
-        state_list[i].append(keras.backend.eval(new_states[0]))
+      last_output_list[i].append(keras.backend.eval(last_output))
+      outputs_list[i].append(keras.backend.eval(outputs))
+      self.assertLen(new_states, 1)
+      state_list[i].append(keras.backend.eval(new_states[0]))
 
       def assert_list_pairwise(z_list, atol=1e-05):
         for (z1, z2) in zip(z_list[1:], z_list[:-1]):
@@ -1162,29 +1142,28 @@ class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
         {'go_backwards': False, 'mask': mask},
         {'go_backwards': False, 'mask': mask, 'unroll': True},
     ]
-    with self.cached_session():
-      for i, kwargs in enumerate(kwargs_list):
-        last_output, outputs, new_states = keras.backend.rnn(rnn_fn, inputs,
-                                                             initial_states,
-                                                             **kwargs)
-        # check static shape inference
-        self.assertEqual(last_output.get_shape().as_list(),
-                         [num_samples, output_dim])
-        self.assertEqual(outputs.get_shape().as_list(),
-                         [num_samples, timesteps, output_dim])
-        # for state in new_states:
-        #   self.assertEqual(state.get_shape().as_list(),
-        #                     [num_samples, output_dim])
-        self.assertEqual(new_states[0].get_shape().as_list(),
-                         [num_samples, output_dim])
-        self.assertEqual(new_states[1].get_shape().as_list(),
-                         [num_samples, 2 * output_dim])
-
-        last_output_list[i].append(keras.backend.eval(last_output))
-        outputs_list[i].append(keras.backend.eval(outputs))
-        self.assertEqual(len(new_states), 2)
-        state_list[i].append(keras.backend.eval(new_states[0]))
-        additional_state_list[i].append(keras.backend.eval(new_states[1]))
+    for i, kwargs in enumerate(kwargs_list):
+      last_output, outputs, new_states = keras.backend.rnn(rnn_fn, inputs,
+                                                           initial_states,
+                                                           **kwargs)
+      # check static shape inference
+      self.assertEqual(last_output.get_shape().as_list(),
+                       [num_samples, output_dim])
+      self.assertEqual(outputs.get_shape().as_list(),
+                       [num_samples, timesteps, output_dim])
+      # for state in new_states:
+      #   self.assertEqual(state.get_shape().as_list(),
+      #                     [num_samples, output_dim])
+      self.assertEqual(new_states[0].get_shape().as_list(),
+                       [num_samples, output_dim])
+      self.assertEqual(new_states[1].get_shape().as_list(),
+                       [num_samples, 2 * output_dim])
+
+      last_output_list[i].append(keras.backend.eval(last_output))
+      outputs_list[i].append(keras.backend.eval(outputs))
+      self.assertLen(new_states, 2)
+      state_list[i].append(keras.backend.eval(new_states[0]))
+      additional_state_list[i].append(keras.backend.eval(new_states[1]))
 
       def assert_list_pairwise(z_list, atol=1e-05):
         for (z1, z2) in zip(z_list[1:], z_list[:-1]):
@@ -1374,53 +1353,52 @@ class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
 class TestCTC(test.TestCase):
 
   def test_ctc_decode(self):
-    with self.cached_session():
-      depth = 6
-      seq_len_0 = 5
-      input_prob_matrix_0 = np.asarray(
-          [[0.30999, 0.309938, 0.0679938, 0.0673362, 0.0708352, 0.173908],
-           [0.215136, 0.439699, 0.0370931, 0.0393967, 0.0381581, 0.230517],
-           [0.199959, 0.489485, 0.0233221, 0.0251417, 0.0233289, 0.238763],
-           [0.279611, 0.452966, 0.0204795, 0.0209126, 0.0194803, 0.20655],
-           [0.51286, 0.288951, 0.0243026, 0.0220788, 0.0219297, 0.129878],
-           # Random entry added in at time=5
-           [0.155251, 0.164444, 0.173517, 0.176138, 0.169979, 0.160671]],
-          dtype=np.float32)
-
-      # len max_time_steps array of batch_size x depth matrices
-      inputs = ([input_prob_matrix_0[t, :][np.newaxis, :]
-                 for t in range(seq_len_0)] +  # Pad to max_time_steps = 8
-                2 * [np.zeros((1, depth), dtype=np.float32)])
-
-      inputs = keras.backend.variable(np.asarray(inputs).transpose((1, 0, 2)))
-
-      # batch_size length vector of sequence_lengths
-      input_length = keras.backend.variable(
-          np.array([seq_len_0], dtype=np.int32))
-      # batch_size length vector of negative log probabilities
-      log_prob_truth = np.array([
-          -3.5821197,  # output beam 0
-          -3.777835    # output beam 1
-      ], np.float32)[np.newaxis, :]
-
-      decode_truth = [np.array([1, 0]), np.array([0, 1, 0])]
-      beam_width = 2
-      top_paths = 2
-
-      decode_pred_tf, log_prob_pred_tf = keras.backend.ctc_decode(
-          inputs,
-          input_length,
-          greedy=False,
-          beam_width=beam_width,
-          top_paths=top_paths)
-
-      self.assertEqual(len(decode_pred_tf), top_paths)
-      log_prob_pred = keras.backend.eval(log_prob_pred_tf)
-      for i in range(top_paths):
-        self.assertTrue(
-            np.alltrue(
-                decode_truth[i] == keras.backend.eval(decode_pred_tf[i])))
-      self.assertAllClose(log_prob_truth, log_prob_pred)
+    depth = 6
+    seq_len_0 = 5
+    input_prob_matrix_0 = np.asarray(
+        [[0.30999, 0.309938, 0.0679938, 0.0673362, 0.0708352, 0.173908],
+         [0.215136, 0.439699, 0.0370931, 0.0393967, 0.0381581, 0.230517],
+         [0.199959, 0.489485, 0.0233221, 0.0251417, 0.0233289, 0.238763],
+         [0.279611, 0.452966, 0.0204795, 0.0209126, 0.0194803, 0.20655],
+         [0.51286, 0.288951, 0.0243026, 0.0220788, 0.0219297, 0.129878],
+         # Random entry added in at time=5
+         [0.155251, 0.164444, 0.173517, 0.176138, 0.169979, 0.160671]],
+        dtype=np.float32)
+
+    # len max_time_steps array of batch_size x depth matrices
+    inputs = ([input_prob_matrix_0[t, :][np.newaxis, :]
+               for t in range(seq_len_0)] +  # Pad to max_time_steps = 8
+              2 * [np.zeros((1, depth), dtype=np.float32)])
+
+    inputs = keras.backend.variable(np.asarray(inputs).transpose((1, 0, 2)))
+
+    # batch_size length vector of sequence_lengths
+    input_length = keras.backend.variable(
+        np.array([seq_len_0], dtype=np.int32))
+    # batch_size length vector of negative log probabilities
+    log_prob_truth = np.array([
+        -3.5821197,  # output beam 0
+        -3.777835    # output beam 1
+    ], np.float32)[np.newaxis, :]
+
+    decode_truth = [np.array([1, 0]), np.array([0, 1, 0])]
+    beam_width = 2
+    top_paths = 2
+
+    decode_pred_tf, log_prob_pred_tf = keras.backend.ctc_decode(
+        inputs,
+        input_length,
+        greedy=False,
+        beam_width=beam_width,
+        top_paths=top_paths)
+
+    self.assertEqual(len(decode_pred_tf), top_paths)
+    log_prob_pred = keras.backend.eval(log_prob_pred_tf)
+    for i in range(top_paths):
+      self.assertTrue(
+          np.alltrue(
+              decode_truth[i] == keras.backend.eval(decode_pred_tf[i])))
+    self.assertAllClose(log_prob_truth, log_prob_pred)
 
   @test_util.run_v1_only('b/120545219')
   def test_ctc_batch_cost(self):
@@ -1481,29 +1459,26 @@ class TestCTC(test.TestCase):
 class TestRandomOps(test.TestCase):
 
   def test_random_binomial(self):
-    with self.cached_session():
-      np.random.seed(123)
-      x = keras.backend.random_binomial((1000, 1000), p=0.5)
-      self.assertAllClose(np.mean(keras.backend.eval(x)), 0.5, atol=0.1)
+    np.random.seed(123)
+    x = keras.backend.random_binomial((1000, 1000), p=0.5)
+    self.assertAllClose(np.mean(keras.backend.eval(x)), 0.5, atol=0.1)
 
   def test_truncated_normal(self):
-    with self.cached_session():
-      np.random.seed(123)
-      x = keras.backend.truncated_normal((1000, 1000), mean=0.0, stddev=1.0)
-      y = keras.backend.eval(x)
-      self.assertAllClose(np.mean(y), 0., atol=0.1)
-      self.assertAllClose(np.std(y), 0.88, atol=0.1)
-      self.assertAllClose(np.max(y), 2., atol=0.1)
-      self.assertAllClose(np.min(y), -2., atol=0.1)
+    np.random.seed(123)
+    x = keras.backend.truncated_normal((1000, 1000), mean=0.0, stddev=1.0)
+    y = keras.backend.eval(x)
+    self.assertAllClose(np.mean(y), 0., atol=0.1)
+    self.assertAllClose(np.std(y), 0.88, atol=0.1)
+    self.assertAllClose(np.max(y), 2., atol=0.1)
+    self.assertAllClose(np.min(y), -2., atol=0.1)
 
   def test_string_input(self):
-    with self.cached_session():
-      seq = keras.Sequential([
-          keras.layers.InputLayer(input_shape=(1,), dtype=dtypes.string),
-          keras.layers.Lambda(lambda x: x[0])
-      ])
-      preds = seq.predict([['tensorflow eager']])
-      self.assertEqual(preds.shape, (1,))
+    seq = keras.Sequential([
+        keras.layers.InputLayer(input_shape=(1,), dtype=dtypes.string),
+        keras.layers.Lambda(lambda x: x[0])
+    ])
+    preds = seq.predict([['tensorflow eager']])
+    self.assertEqual(preds.shape, (1,))
 
 
 class BackendGraphTests(test.TestCase):
@@ -1695,6 +1670,39 @@ class BackendGraphTests(test.TestCase):
       self.assertEqual(callback.times_called, 1)
       self.assertEqual(callback.callback_result, 200)
 
+  @test_util.run_in_graph_and_eager_modes
+  def test_function_dict_outputs(self):
+    x_ph = keras.backend.placeholder(shape=(), name='x')
+    y_ph = keras.backend.placeholder(shape=(), name='y')
+    outputs = {'x*y': y_ph * x_ph, 'x*x': x_ph * x_ph}
+
+    f = keras.backend.function(inputs=[x_ph, y_ph], outputs=outputs)
+    x, y = 2., 5.
+    results = f([x, y])
+
+    self.assertEqual(results['x*y'], 10.)
+    self.assertEqual(results['x*x'], 4)
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_function_dict_inputs(self):
+    placeholders = {
+        'x': keras.backend.placeholder(shape=()),
+        'y': keras.backend.placeholder(shape=())
+    }
+    outputs = [placeholders['x'] * placeholders['y']]
+
+    f = keras.backend.function(inputs=placeholders, outputs=outputs)
+    results = f({'x': 2., 'y': 3.})
+    self.assertEqual(results[0], 6.)
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_function_single_input_output(self):
+    x_ph = keras.backend.placeholder(shape=(), name='x')
+    output = x_ph * x_ph
+    f = keras.backend.function(x_ph, output)
+    result = f(2.)
+    self.assertEqual(result, 4.)
+
   def test_placeholder(self):
     x = keras.backend.placeholder(shape=(3, 4))
     self.assertEqual(x.get_shape().as_list(), [3, 4])
@@ -1733,6 +1741,16 @@ class BackendGraphTests(test.TestCase):
         x, mean, var, beta, gamma, axis=1, epsilon=1e-3)
     self.assertEqual(normed.shape.as_list(), [10, 3, 5, 5])
 
+  def test_get_session_different_graphs(self):
+    with ops.Graph().as_default():
+      x = keras.backend.constant(1)
+      session = keras.backend.get_session()
+      self.assertIs(session, keras.backend.get_session((x,)))
+      self.assertIs(session, keras.backend.get_session())
+    with ops.Graph().as_default():
+      self.assertIs(session, keras.backend.get_session((x,)))
+      self.assertIsNot(session, keras.backend.get_session())
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/callbacks.py b/tensorflow/python/keras/callbacks.py
index 2d7d5a415d422cea300ab722ceacdb83803d3db8..817d47a1ad7091c50437c9379dfca84d29dd91f9 100644
--- a/tensorflow/python/keras/callbacks.py
+++ b/tensorflow/python/keras/callbacks.py
@@ -32,19 +32,15 @@ import six
 
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.eager import context
-from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras.utils.data_utils import Sequence
 from tensorflow.python.keras.utils.generic_utils import Progbar
+from tensorflow.python.keras.utils.mode_keys import ModeKeys
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import summary_ops_v2
-from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.summary import summary as tf_summary
-from tensorflow.python.training import saver
-from tensorflow.python.util.tf_export import tf_export
-
+from tensorflow.python.util.tf_export import keras_export
 
 try:
   import requests
@@ -52,7 +48,6 @@ except ImportError:
   requests = None
 
 
-# pylint: disable=protected-access
 def configure_callbacks(callbacks,
                         model,
                         do_validation=False,
@@ -62,7 +57,7 @@ def configure_callbacks(callbacks,
                         samples=None,
                         verbose=1,
                         count_mode='steps',
-                        mode='train'):
+                        mode=ModeKeys.TRAIN):
   """Configures callbacks for use in various training loops.
 
   Arguments:
@@ -75,8 +70,8 @@ def configure_callbacks(callbacks,
       samples: Number of training samples.
       verbose: int, 0 or 1. Keras logging verbosity to pass to ProgbarLogger.
       count_mode: One of 'steps' or 'samples'. Per-batch or per-sample count.
-      mode: String. One of 'train', 'test', or 'predict'. Which loop mode to
-        configure callbacks for.
+      mode: String. One of ModeKeys.TRAIN, ModeKeys.TEST, or ModeKeys.PREDICT.
+        Which loop mode to configure callbacks for.
 
   Returns:
       Instance of CallbackList used to control all Callbacks.
@@ -89,27 +84,64 @@ def configure_callbacks(callbacks,
     callbacks = []
 
   # Add additional callbacks during training.
-  if mode == 'train':
+  if mode == ModeKeys.TRAIN:
     model.history = History()
-    stateful_metric_names = None
-    if hasattr(model, 'metrics_names'):
-      stateful_metric_names = model.metrics_names[1:]  # Exclude `loss`
-    callbacks = [BaseLogger(stateful_metrics=stateful_metric_names)
-                ] + (callbacks or []) + [model.history]
+    callbacks = [BaseLogger()] + (callbacks or []) + [model.history]
     if verbose:
-      callbacks.append(
-          ProgbarLogger(count_mode, stateful_metrics=stateful_metric_names))
+      callbacks.append(ProgbarLogger(count_mode))
   callback_list = CallbackList(callbacks)
 
   # Set callback model
-  callback_model = model._get_callback_model()
+  callback_model = model._get_callback_model()  # pylint: disable=protected-access
   callback_list.set_model(callback_model)
 
+  set_callback_parameters(
+      callback_list,
+      model,
+      do_validation=do_validation,
+      batch_size=batch_size,
+      epochs=epochs,
+      steps_per_epoch=steps_per_epoch,
+      samples=samples,
+      verbose=verbose,
+      mode=mode)
+
+  callback_list.model.stop_training = False
+  return callback_list
+
+
+def set_callback_parameters(callback_list,
+                            model,
+                            do_validation=False,
+                            batch_size=None,
+                            epochs=None,
+                            steps_per_epoch=None,
+                            samples=None,
+                            verbose=1,
+                            mode=ModeKeys.TRAIN):
+  """Sets callback parameters.
+
+  Arguments:
+      callback_list: CallbackList instance.
+      model: Model being trained.
+      do_validation: Whether or not validation loop will be run.
+      batch_size: Number of samples per batch.
+      epochs: Number of epoch to train.
+      steps_per_epoch: Number of batches to run per training epoch.
+      samples: Number of training samples.
+      verbose: int, 0 or 1. Keras logging verbosity to pass to ProgbarLogger.
+      mode: String. One of ModeKeys.TRAIN, ModeKeys.TEST, or ModeKeys.PREDICT.
+        Which loop mode to configure callbacks for.
+  """
+  for cbk in callback_list:
+    if isinstance(cbk, (BaseLogger, ProgbarLogger)):
+      cbk.stateful_metrics = model.metrics_names[1:]  # Exclude `loss`
+
   # Set callback parameters
   callback_metrics = []
   # When we have deferred build scenario with iterator input, we will compile
   # when we standardize first batch of data.
-  if mode != 'predict' and hasattr(model, 'metrics_names'):
+  if mode != ModeKeys.PREDICT and hasattr(model, 'metrics_names'):
     callback_metrics = copy.copy(model.metrics_names)
     if do_validation:
       callback_metrics += ['val_' + n for n in model.metrics_names]
@@ -124,17 +156,6 @@ def configure_callbacks(callbacks,
   }
   callback_list.set_params(callback_params)
 
-  if (do_validation and not model._distribution_strategy and
-      not model.run_eagerly):
-    # Need to create the eval_function before start of the first epoch
-    # because TensorBoard callback on_epoch_begin adds summary to the
-    # list of fetches of the eval_function
-    callback_model._make_eval_function()
-
-  callback_list.model.stop_training = False
-  return callback_list
-# pylint: enable=protected-access
-
 
 def _is_generator_like(data):
   """Checks if data is a generator, Sequence, or Iterator."""
@@ -142,6 +163,17 @@ def _is_generator_like(data):
       data, (Sequence, iterator_ops.Iterator, iterator_ops.EagerIterator)))
 
 
+def make_logs(model, logs, outputs, mode, prefix=''):
+  """Computes logs for sending to `on_batch_end` methods."""
+  if mode in {ModeKeys.TRAIN, ModeKeys.TEST}:
+    if hasattr(model, 'metrics_names'):
+      for label, output in zip(model.metrics_names, outputs):
+        logs[prefix + label] = output
+  else:
+    logs['outputs'] = outputs
+  return logs
+
+
 class CallbackList(object):
   """Container abstracting a list of callbacks.
 
@@ -179,10 +211,8 @@ class CallbackList(object):
 
   def _call_batch_hook(self, mode, hook, batch, logs=None):
     """Helper function for all batch_{begin | end} methods."""
-    # TODO(omalleyt): add batch hooks for test/predict.
-    if mode != 'train':
+    if not self.callbacks:
       return
-
     hook_name = 'on_{mode}_batch_{hook}'.format(mode=mode, hook=hook)
     if hook == 'begin':
       self._t_enter_batch = time.time()
@@ -207,92 +237,180 @@ class CallbackList(object):
 
   def _call_begin_hook(self, mode):
     """Helper function for on_{train|test|predict}_begin methods."""
-    # TODO(omalleyt): add test/predict methods.
-    if mode == 'train':
+    if mode == ModeKeys.TRAIN:
       self.on_train_begin()
+    elif mode == ModeKeys.TEST:
+      self.on_test_begin()
+    else:
+      self.on_predict_begin()
 
   def _call_end_hook(self, mode):
     """Helper function for on_{train|test|predict}_end methods."""
-    # TODO(omalleyt): add test/predict methods.
-    if mode == 'train':
+    if mode == ModeKeys.TRAIN:
       self.on_train_end()
+    elif mode == ModeKeys.TEST:
+      self.on_test_end()
+    else:
+      self.on_predict_end()
 
   def on_batch_begin(self, batch, logs=None):
-    self._call_batch_hook('train', 'begin', batch, logs=logs)
+    self._call_batch_hook(ModeKeys.TRAIN, 'begin', batch, logs=logs)
 
   def on_batch_end(self, batch, logs=None):
-    self._call_batch_hook('train', 'end', batch, logs=logs)
+    self._call_batch_hook(ModeKeys.TRAIN, 'end', batch, logs=logs)
 
-  def on_epoch_begin(self, epoch, logs=None, mode='train'):
-    """Called at the start of an epoch.
+  def on_epoch_begin(self, epoch, logs=None):
+    """Calls the `on_epoch_begin` methods of its callbacks.
+
+    This function should only be called during TRAIN mode.
 
     Arguments:
         epoch: integer, index of epoch.
-        logs: dictionary of logs.
-        mode: One of 'train'/'test'/'predict'
+        logs: dict. Currently no data is passed to this argument for this method
+          but that may change in the future.
     """
-    if mode == 'train':
-      logs = logs or {}
-      for callback in self.callbacks:
-        callback.on_epoch_begin(epoch, logs)
+    logs = logs or {}
+    for callback in self.callbacks:
+      callback.on_epoch_begin(epoch, logs)
     self._reset_batch_timing()
 
-  def on_epoch_end(self, epoch, logs=None, mode='train'):
-    """Called at the end of an epoch.
+  def on_epoch_end(self, epoch, logs=None):
+    """Calls the `on_epoch_end` methods of its callbacks.
+
+    This function should only be called during TRAIN mode.
 
     Arguments:
         epoch: integer, index of epoch.
-        logs: dictionary of logs.
-        mode: One of 'train'/'test'/'predict'
+        logs: dict, metric results for this training epoch, and for the
+          validation epoch if validation is performed. Validation result keys
+          are prefixed with `val_`.
     """
-    if mode == 'train':
-      logs = logs or {}
-      for callback in self.callbacks:
-        callback.on_epoch_end(epoch, logs)
+    logs = logs or {}
+    for callback in self.callbacks:
+      callback.on_epoch_end(epoch, logs)
 
   def on_train_batch_begin(self, batch, logs=None):
-    """Called at the beginning of a training batch in `fit` methods.
+    """Calls the `on_train_batch_begin` methods of its callbacks.
 
     Arguments:
         batch: integer, index of batch within the current epoch.
-        logs: dictionary of logs.
+        logs: dict. Has keys `batch` and `size` representing the current batch
+          number and the size of the batch.
     """
-    self._call_batch_hook('train', 'begin', batch, logs=logs)
+    self._call_batch_hook(ModeKeys.TRAIN, 'begin', batch, logs=logs)
 
   def on_train_batch_end(self, batch, logs=None):
-    """Called at the end of a training batch in `fit` methods.
+    """Calls the `on_train_batch_end` methods of its callbacks.
+
+    Arguments:
+        batch: integer, index of batch within the current epoch.
+        logs: dict. Metric results for this batch.
+    """
+    self._call_batch_hook(ModeKeys.TRAIN, 'end', batch, logs=logs)
+
+  def on_test_batch_begin(self, batch, logs=None):
+    """Calls the `on_test_batch_begin` methods of its callbacks.
+
+    Arguments:
+        batch: integer, index of batch within the current epoch.
+        logs: dict. Has keys `batch` and `size` representing the current batch
+          number and the size of the batch.
+    """
+    self._call_batch_hook(ModeKeys.TEST, 'begin', batch, logs=logs)
+
+  def on_test_batch_end(self, batch, logs=None):
+    """Calls the `on_test_batch_end` methods of its callbacks.
+
+    Arguments:
+        batch: integer, index of batch within the current epoch.
+        logs: dict. Metric results for this batch.
+    """
+    self._call_batch_hook(ModeKeys.TEST, 'end', batch, logs=logs)
+
+  def on_predict_batch_begin(self, batch, logs=None):
+    """Calls the `on_predict_batch_begin` methods of its callbacks.
 
     Arguments:
         batch: integer, index of batch within the current epoch.
-        logs: dictionary of logs.
+        logs: dict. Has keys `batch` and `size` representing the current batch
+          number and the size of the batch.
     """
-    self._call_batch_hook('train', 'end', batch, logs=logs)
+    self._call_batch_hook(ModeKeys.PREDICT, 'begin', batch, logs=logs)
+
+  def on_predict_batch_end(self, batch, logs=None):
+    """Calls the `on_predict_batch_end` methods of its callbacks.
+
+    Arguments:
+        batch: integer, index of batch within the current epoch.
+        logs: dict. Metric results for this batch.
+    """
+    self._call_batch_hook(ModeKeys.PREDICT, 'end', batch, logs=logs)
 
   def on_train_begin(self, logs=None):
-    """Called at the beginning of training.
+    """Calls the `on_train_begin` methods of its callbacks.
 
     Arguments:
-        logs: dictionary of logs.
+        logs: dict. Currently no data is passed to this argument for this method
+          but that may change in the future.
     """
-    logs = logs or {}
     for callback in self.callbacks:
       callback.on_train_begin(logs)
 
   def on_train_end(self, logs=None):
-    """Called at the end of training.
+    """Calls the `on_train_end` methods of its callbacks.
 
     Arguments:
-        logs: dictionary of logs.
+        logs: dict. Currently no data is passed to this argument for this method
+          but that may change in the future.
     """
-    logs = logs or {}
     for callback in self.callbacks:
       callback.on_train_end(logs)
 
+  def on_test_begin(self, logs=None):
+    """Calls the `on_test_begin` methods of its callbacks.
+
+    Arguments:
+        logs: dict. Currently no data is passed to this argument for this method
+          but that may change in the future.
+    """
+    for callback in self.callbacks:
+      callback.on_test_begin(logs)
+
+  def on_test_end(self, logs=None):
+    """Calls the `on_test_end` methods of its callbacks.
+
+    Arguments:
+        logs: dict. Currently no data is passed to this argument for this method
+          but that may change in the future.
+    """
+    for callback in self.callbacks:
+      callback.on_test_end(logs)
+
+  def on_predict_begin(self, logs=None):
+    """Calls the 'on_predict_begin` methods of its callbacks.
+
+    Arguments:
+        logs: dict. Currently no data is passed to this argument for this method
+          but that may change in the future.
+    """
+    for callback in self.callbacks:
+      callback.on_predict_begin(logs)
+
+  def on_predict_end(self, logs=None):
+    """Calls the `on_predict_end` methods of its callbacks.
+
+    Arguments:
+        logs: dict. Currently no data is passed to this argument for this method
+          but that may change in the future.
+    """
+    for callback in self.callbacks:
+      callback.on_predict_end(logs)
+
   def __iter__(self):
     return iter(self.callbacks)
 
 
-@tf_export('keras.callbacks.Callback')
+@keras_export('keras.callbacks.Callback')
 class Callback(object):
   """Abstract base class used to build new callbacks.
 
@@ -306,7 +424,7 @@ class Callback(object):
   take as argument will contain keys for quantities relevant to
   the current batch or epoch.
 
-  Currently, the `.fit()` method of the `Sequential` model class
+  Currently, the `.fit()` method of the `Model` class
   will include the following quantities in the `logs` that
   it passes to its callbacks:
 
@@ -323,6 +441,10 @@ class Callback(object):
   def __init__(self):
     self.validation_data = None
     self.model = None
+    # Whether this Callback should only run on the chief worker in a
+    # Multi-Worker setting.
+    # TODO(omalleyt): Make this attr public once solution is stable.
+    self._chief_worker_only = None
 
   def set_params(self, params):
     self.params = params
@@ -330,34 +452,172 @@ class Callback(object):
   def set_model(self, model):
     self.model = model
 
+  def on_batch_begin(self, batch, logs=None):
+    """A backwards compatibility alias for `on_train_batch_begin`."""
+
+  def on_batch_end(self, batch, logs=None):
+    """A backwards compatibility alias for `on_train_batch_end`."""
+
   def on_epoch_begin(self, epoch, logs=None):
-    pass
+    """Called at the start of an epoch.
+
+    Subclasses should override for any actions to run. This function should only
+    be called during TRAIN mode.
+
+    Arguments:
+        epoch: integer, index of epoch.
+        logs: dict. Currently no data is passed to this argument for this method
+          but that may change in the future.
+    """
 
   def on_epoch_end(self, epoch, logs=None):
-    pass
+    """Called at the end of an epoch.
 
-  def on_batch_begin(self, batch, logs=None):
-    pass
+    Subclasses should override for any actions to run. This function should only
+    be called during TRAIN mode.
 
-  def on_batch_end(self, batch, logs=None):
-    pass
+    Arguments:
+        epoch: integer, index of epoch.
+        logs: dict, metric results for this training epoch, and for the
+          validation epoch if validation is performed. Validation result keys
+          are prefixed with `val_`.
+    """
 
   def on_train_batch_begin(self, batch, logs=None):
-    # For backwards compatibility
+    """Called at the beginning of a training batch in `fit` methods.
+
+    Subclasses should override for any actions to run.
+
+    Arguments:
+        batch: integer, index of batch within the current epoch.
+        logs: dict. Has keys `batch` and `size` representing the current batch
+          number and the size of the batch.
+    """
+    # For backwards compatibility.
     self.on_batch_begin(batch, logs=logs)
 
   def on_train_batch_end(self, batch, logs=None):
-    # For backwards compatibility
+    """Called at the end of a training batch in `fit` methods.
+
+    Subclasses should override for any actions to run.
+
+    Arguments:
+        batch: integer, index of batch within the current epoch.
+        logs: dict. Metric results for this batch.
+    """
+    # For backwards compatibility.
     self.on_batch_end(batch, logs=logs)
 
+  def on_test_batch_begin(self, batch, logs=None):
+    """Called at the beginning of a batch in `evaluate` methods.
+
+    Also called at the beginning of a validation batch in the `fit`
+    methods, if validation data is provided.
+
+    Subclasses should override for any actions to run.
+
+    Arguments:
+        batch: integer, index of batch within the current epoch.
+        logs: dict. Has keys `batch` and `size` representing the current batch
+          number and the size of the batch.
+    """
+
+  def on_test_batch_end(self, batch, logs=None):
+    """Called at the end of a batch in `evaluate` methods.
+
+    Also called at the end of a validation batch in the `fit`
+    methods, if validation data is provided.
+
+    Subclasses should override for any actions to run.
+
+    Arguments:
+        batch: integer, index of batch within the current epoch.
+        logs: dict. Metric results for this batch.
+    """
+
+  def on_predict_batch_begin(self, batch, logs=None):
+    """Called at the beginning of a batch in `predict` methods.
+
+    Subclasses should override for any actions to run.
+
+    Arguments:
+        batch: integer, index of batch within the current epoch.
+        logs: dict. Has keys `batch` and `size` representing the current batch
+          number and the size of the batch.
+    """
+
+  def on_predict_batch_end(self, batch, logs=None):
+    """Called at the end of a batch in `predict` methods.
+
+    Subclasses should override for any actions to run.
+
+    Arguments:
+        batch: integer, index of batch within the current epoch.
+        logs: dict. Metric results for this batch.
+    """
+
   def on_train_begin(self, logs=None):
-    pass
+    """Called at the beginning of training.
+
+    Subclasses should override for any actions to run.
+
+    Arguments:
+        logs: dict. Currently no data is passed to this argument for this method
+          but that may change in the future.
+    """
 
   def on_train_end(self, logs=None):
-    pass
+    """Called at the end of training.
+
+    Subclasses should override for any actions to run.
 
+    Arguments:
+        logs: dict. Currently no data is passed to this argument for this method
+          but that may change in the future.
+    """
+
+  def on_test_begin(self, logs=None):
+    """Called at the beginning of evaluation or validation.
+
+    Subclasses should override for any actions to run.
+
+    Arguments:
+        logs: dict. Currently no data is passed to this argument for this method
+          but that may change in the future.
+    """
 
-@tf_export('keras.callbacks.BaseLogger')
+  def on_test_end(self, logs=None):
+    """Called at the end of evaluation or validation.
+
+    Subclasses should override for any actions to run.
+
+    Arguments:
+        logs: dict. Currently no data is passed to this argument for this method
+          but that may change in the future.
+    """
+
+  def on_predict_begin(self, logs=None):
+    """Called at the beginning of prediction.
+
+    Subclasses should override for any actions to run.
+
+    Arguments:
+        logs: dict. Currently no data is passed to this argument for this method
+          but that may change in the future.
+    """
+
+  def on_predict_end(self, logs=None):
+    """Called at the end of prediction.
+
+    Subclasses should override for any actions to run.
+
+    Arguments:
+        logs: dict. Currently no data is passed to this argument for this method
+          but that may change in the future.
+    """
+
+
+@keras_export('keras.callbacks.BaseLogger')
 class BaseLogger(Callback):
   """Callback that accumulates epoch averages of metrics.
 
@@ -406,7 +666,7 @@ class BaseLogger(Callback):
             logs[k] = self.totals[k] / self.seen
 
 
-@tf_export('keras.callbacks.TerminateOnNaN')
+@keras_export('keras.callbacks.TerminateOnNaN')
 class TerminateOnNaN(Callback):
   """Callback that terminates training when a NaN loss is encountered.
   """
@@ -420,7 +680,7 @@ class TerminateOnNaN(Callback):
         self.model.stop_training = True
 
 
-@tf_export('keras.callbacks.ProgbarLogger')
+@keras_export('keras.callbacks.ProgbarLogger')
 class ProgbarLogger(Callback):
   """Callback that prints metrics to stdout.
 
@@ -461,15 +721,14 @@ class ProgbarLogger(Callback):
     if self.verbose:
       if self.epochs > 1:
         print('Epoch %d/%d' % (epoch + 1, self.epochs))
-      self.progbar = Progbar(
-          target=self.target,
-          verbose=self.verbose,
-          stateful_metrics=self.stateful_metrics,
-          unit_name='step' if self.use_steps else 'sample')
+    self.progbar = Progbar(
+        target=self.target,
+        verbose=self.verbose,
+        stateful_metrics=self.stateful_metrics,
+        unit_name='step' if self.use_steps else 'sample')
 
   def on_batch_begin(self, batch, logs=None):
-    if self.seen < self.target:
-      self.log_values = []
+    self.log_values = []
 
   def on_batch_end(self, batch, logs=None):
     logs = logs or {}
@@ -488,7 +747,7 @@ class ProgbarLogger(Callback):
 
     # Skip progbar update for the last batch;
     # will be handled by on_epoch_end.
-    if self.verbose and self.seen < self.target:
+    if self.verbose and (self.target is None or self.seen < self.target):
       self.progbar.update(self.seen, self.log_values)
 
   def on_epoch_end(self, epoch, logs=None):
@@ -500,7 +759,7 @@ class ProgbarLogger(Callback):
       self.progbar.update(self.seen, self.log_values)
 
 
-@tf_export('keras.callbacks.History')
+@keras_export('keras.callbacks.History')
 class History(Callback):
   """Callback that records events into a `History` object.
 
@@ -520,7 +779,7 @@ class History(Callback):
       self.history.setdefault(k, []).append(v)
 
 
-@tf_export('keras.callbacks.ModelCheckpoint')
+@keras_export('keras.callbacks.ModelCheckpoint')
 class ModelCheckpoint(Callback):
   """Save the model after every epoch.
 
@@ -589,6 +848,17 @@ class ModelCheckpoint(Callback):
         self.monitor_op = np.less
         self.best = np.Inf
 
+    # Only the chief worker writes model checkpoints.
+    self._chief_worker_only = True
+
+  def set_model(self, model):
+    self.model = model
+    # Use name matching rather than `isinstance` to avoid circular dependencies.
+    if (not self.save_weights_only and
+        not model._is_graph_network and  # pylint: disable=protected-access
+        model.__class__.__name__ != 'Sequential'):
+      self.save_weights_only = True
+
   def on_epoch_end(self, epoch, logs=None):
     logs = logs or {}
     self.epochs_since_last_save += 1
@@ -624,7 +894,7 @@ class ModelCheckpoint(Callback):
           self.model.save(filepath, overwrite=True)
 
 
-@tf_export('keras.callbacks.EarlyStopping')
+@keras_export('keras.callbacks.EarlyStopping')
 class EarlyStopping(Callback):
   """Stop training when a monitored quantity has stopped improving.
 
@@ -735,7 +1005,7 @@ class EarlyStopping(Callback):
     return monitor_value
 
 
-@tf_export('keras.callbacks.RemoteMonitor')
+@keras_export('keras.callbacks.RemoteMonitor')
 class RemoteMonitor(Callback):
   """Callback used to stream events to a server.
 
@@ -791,7 +1061,7 @@ class RemoteMonitor(Callback):
                       'root server at ' + str(self.root))
 
 
-@tf_export('keras.callbacks.LearningRateScheduler')
+@keras_export('keras.callbacks.LearningRateScheduler')
 class LearningRateScheduler(Callback):
   """Learning rate scheduler.
 
@@ -828,10 +1098,10 @@ class LearningRateScheduler(Callback):
     logs['lr'] = K.get_value(self.model.optimizer.lr)
 
 
-@tf_export('keras.callbacks.TensorBoard')
+@keras_export('keras.callbacks.TensorBoard', v1=[])
 class TensorBoard(Callback):
   # pylint: disable=line-too-long
-  """Tensorboard basic visualizations.
+  """TensorBoard basic visualizations.
 
   This callback writes a log for TensorBoard, which allows
   you to visualize dynamic graphs of your training and test
@@ -851,51 +1121,25 @@ class TensorBoard(Callback):
   [here](https://www.tensorflow.org/get_started/summaries_and_tensorboard).
 
   Arguments:
-      log_dir: the path of the directory where to save the log
-          files to be parsed by TensorBoard.
-      histogram_freq: frequency (in epochs) at which to compute activation
-          and weight histograms for the layers of the model. If set to 0,
-          histograms won't be computed. Validation data (or split) must be
-          specified for histogram visualizations.
-      write_graph: whether to visualize the graph in TensorBoard.
-          The log file can become quite large when
-          write_graph is set to True.
-      write_grads: whether to visualize gradient histograms in TensorBoard.
-          `histogram_freq` must be greater than 0.
-      batch_size: size of batch of inputs to feed to the network
-          for histograms computation.
-      write_images: whether to write model weights to visualize as
-          image in TensorBoard.
-      embeddings_freq: frequency (in epochs) at which selected embedding
-          layers will be saved. If set to 0, embeddings won't be computed.
-          Data to be visualized in TensorBoard's Embedding tab must be passed
-          as `embeddings_data`.
-      embeddings_layer_names: a list of names of layers to keep eye on. If
-          None or empty list all the embedding layer will be watched.
-      embeddings_metadata: a dictionary which maps layer name to a file name
-          in which metadata for this embedding layer is saved. See the
-          [details](https://www.tensorflow.org/how_tos/embedding_viz/#metadata_optional)
-          about metadata files format. In case if the same metadata file is
-          used for all embedding layers, string can be passed.
-      embeddings_data: data to be embedded at layers specified in
-          `embeddings_layer_names`. Numpy array (if the model has a single
-          input) or list of Numpy arrays (if the model has multiple inputs).
-          Learn [more about embeddings](https://www.tensorflow.org/programmers_guide/embedding)
+      log_dir: the path of the directory where to save the log files to be
+        parsed by TensorBoard.
+      histogram_freq: frequency (in epochs) at which to compute activation and
+        weight histograms for the layers of the model. If set to 0, histograms
+        won't be computed. Validation data (or split) must be specified for
+        histogram visualizations.
+      write_graph: whether to visualize the graph in TensorBoard. The log file
+        can become quite large when write_graph is set to True.
+      write_images: whether to write model weights to visualize as image in
+        TensorBoard.
       update_freq: `'batch'` or `'epoch'` or integer. When using `'batch'`,
-          writes the losses and metrics to TensorBoard after each batch.
-          The same applies for `'epoch'`. If using an integer, let's say `1000`,
-          the callback will write the metrics and losses to TensorBoard every
-          1000 samples. Note that writing too frequently to TensorBoard
-          can slow down your training.
+        writes the losses and metrics to TensorBoard after each batch. The same
+        applies for `'epoch'`. If using an integer, let's say `1000`, the
+        callback will write the metrics and losses to TensorBoard every 1000
+        samples. Note that writing too frequently to TensorBoard can slow down
+        your training.
 
   Raises:
       ValueError: If histogram_freq is set and no validation data is provided.
-
-  @compatibility(eager)
-  Using `Tensorboard` callback will work while eager execution is enabled,
-  however outputting histogram summaries of weights and gradients is not
-  supported, and thus `histogram_freq` will be ignored.
-  @end_compatibility
   """
 
   # pylint: enable=line-too-long
@@ -903,315 +1147,199 @@ class TensorBoard(Callback):
   def __init__(self,
                log_dir='./logs',
                histogram_freq=0,
-               batch_size=32,
                write_graph=True,
-               write_grads=False,
                write_images=False,
-               embeddings_freq=0,
-               embeddings_layer_names=None,
-               embeddings_metadata=None,
-               embeddings_data=None,
-               update_freq='epoch'):
+               update_freq='epoch',
+               **kwargs):
     super(TensorBoard, self).__init__()
+    self._validate_kwargs(kwargs)
+
     self.log_dir = log_dir
     self.histogram_freq = histogram_freq
-    if self.histogram_freq and context.executing_eagerly():
-      logging.warning(
-          UserWarning('Weight and gradient histograms not supported for eager'
-                      'execution, setting `histogram_freq` to `0`.'))
-      self.histogram_freq = 0
-    self.merged = None
     self.write_graph = write_graph
-    self.write_grads = write_grads
     self.write_images = write_images
-    self.batch_size = batch_size
-    self._current_batch = 0
-    self._total_batches_seen = 0
-    self._total_val_batches_seen = 0
-    self.embeddings_freq = embeddings_freq
-    self.embeddings_layer_names = embeddings_layer_names
-    self.embeddings_metadata = embeddings_metadata
-    self.embeddings_data = embeddings_data
     if update_freq == 'batch':
       self.update_freq = 1
     else:
       self.update_freq = update_freq
+
     self._samples_seen = 0
     self._samples_seen_at_last_write = 0
+    self._current_batch = 0
+    self._total_batches_seen = 0
+    self._total_val_batches_seen = 0
 
-  def _init_writer(self):
-    """Sets file writer."""
-    if context.executing_eagerly():
-      self.writer = summary_ops_v2.create_file_writer(self.log_dir)
-    elif self.write_graph:
-      self.writer = tf_summary.FileWriter(self.log_dir, K.get_session().graph)
-    else:
-      self.writer = tf_summary.FileWriter(self.log_dir)
-
-  def _make_histogram_ops(self, model):
-    """Defines histogram ops when histogram_freq > 0."""
-    # only make histogram summary op if it hasn't already been made
-    if self.histogram_freq and self.merged is None:
-      for layer in self.model.layers:
-        for weight in layer.weights:
-          mapped_weight_name = weight.name.replace(':', '_')
-          tf_summary.histogram(mapped_weight_name, weight)
-          if self.write_images:
-            w_img = array_ops.squeeze(weight)
-            shape = K.int_shape(w_img)
-            if len(shape) == 2:  # dense layer kernel case
-              if shape[0] > shape[1]:
-                w_img = array_ops.transpose(w_img)
-                shape = K.int_shape(w_img)
-              w_img = array_ops.reshape(w_img, [1, shape[0], shape[1], 1])
-            elif len(shape) == 3:  # convnet case
-              if K.image_data_format() == 'channels_last':
-                # switch to channels_first to display
-                # every kernel as a separate image
-                w_img = array_ops.transpose(w_img, perm=[2, 0, 1])
-                shape = K.int_shape(w_img)
-              w_img = array_ops.reshape(w_img,
-                                        [shape[0], shape[1], shape[2], 1])
-            elif len(shape) == 1:  # bias case
-              w_img = array_ops.reshape(w_img, [1, shape[0], 1, 1])
-            else:
-              # not possible to handle 3D convnets etc.
-              continue
-
-            shape = K.int_shape(w_img)
-            assert len(shape) == 4 and shape[-1] in [1, 3, 4]
-            tf_summary.image(mapped_weight_name, w_img)
-
-        if self.write_grads:
-          for weight in layer.trainable_weights:
-            mapped_weight_name = weight.name.replace(':', '_')
-            grads = model.optimizer.get_gradients(model.total_loss, weight)
-
-            def is_indexed_slices(grad):
-              return type(grad).__name__ == 'IndexedSlices'
-
-            grads = [
-                grad.values if is_indexed_slices(grad) else grad
-                for grad in grads
-            ]
-            tf_summary.histogram('{}_grad'.format(mapped_weight_name), grads)
-
-        if hasattr(layer, 'output'):
-          if isinstance(layer.output, list):
-            for i, output in enumerate(layer.output):
-              tf_summary.histogram('{}_out_{}'.format(layer.name, i), output)
-          else:
-            tf_summary.histogram('{}_out'.format(layer.name), layer.output)
+    self._writers = []  # file writers to be closed
+    self._train_writer = None  # set in `_initialize_writers`
+    self._validation_writer = None  # set in `_initialize_writers`
+
+    # TensorBoard should only write summaries on the chief when in a
+    # Multi-Worker setting.
+    self._chief_worker_only = True
+
+  def _validate_kwargs(self, kwargs):
+    """Handle arguments were supported in V1."""
+    if kwargs.get('write_grads', False):
+      logging.warning('`write_grads` will be ignored in TensorFlow 2.0 '
+                      'for the `TensorBoard` Callback.')
+    if kwargs.get('embeddings_freq', False):
+      logging.warning('Embeddings will be ignored in TensorFlow 2.0 '
+                      'for the `TensorBoard` Callback.')
+
+    unrecognized_kwargs = set(kwargs.keys()) - {
+        'write_grads', 'embeddings_freq', 'embeddings_layer_names',
+        'embeddings_metadata', 'embeddings_data'
+    }
+
+    # Only allow kwargs that were supported in V1.
+    if unrecognized_kwargs:
+      raise ValueError('Unrecognized arguments in `TensorBoard` '
+                       'Callback: ' + str(unrecognized_kwargs))
 
   def set_model(self, model):
-    """Sets Keras model and creates summary ops."""
-
+    """Sets Keras model and writes graph if specified."""
     self.model = model
-    self._init_writer()
-    # histogram summaries only enabled in graph mode
-    if not context.executing_eagerly():
-      self._make_histogram_ops(model)
-      self.merged = tf_summary.merge_all()
-
-    # If both embedding_freq and embeddings_data are available, we will
-    # visualize embeddings.
-    if self.embeddings_freq and self.embeddings_data is not None:
-      # Avoid circular dependency.
-      from tensorflow.python.keras.engine import training_utils  # pylint: disable=g-import-not-at-top
-      self.embeddings_data = training_utils.standardize_input_data(
-          self.embeddings_data, model.input_names)
-
-      # If embedding_layer_names are not provided, get all of the embedding
-      # layers from the model.
-      embeddings_layer_names = self.embeddings_layer_names
-      if not embeddings_layer_names:
-        embeddings_layer_names = [
-            layer.name
-            for layer in self.model.layers
-            if type(layer).__name__ == 'Embedding'
-        ]
-
-      self.assign_embeddings = []
-      embeddings_vars = {}
-
-      self.batch_id = batch_id = array_ops.placeholder(dtypes.int32)
-      self.step = step = array_ops.placeholder(dtypes.int32)
+    with context.eager_mode():
+      self._initialize_writers()
+      if self.write_graph:
+        if model.run_eagerly:
+          logging.warning('TensorBoard Callback will ignore `write_graph=True`'
+                          'when `Model.run_eagerly=True`.`')
+        else:
+          with self._train_writer.as_default():
+            with summary_ops_v2.always_record_summaries():
+              summary_ops_v2.graph(K.get_graph())
+              if self.model._is_graph_network:  # pylint: disable=protected-access
+                summary_ops_v2.keras_model('keras', self.model, step=0)
 
-      for layer in self.model.layers:
-        if layer.name in embeddings_layer_names:
-          embedding_input = self.model.get_layer(layer.name).output
-          embedding_size = np.prod(embedding_input.shape[1:])
-          embedding_input = array_ops.reshape(embedding_input,
-                                              (step, int(embedding_size)))
-          shape = (self.embeddings_data[0].shape[0], int(embedding_size))
-          embedding = variables.Variable(
-              array_ops.zeros(shape), name=layer.name + '_embedding')
-          embeddings_vars[layer.name] = embedding
-          batch = state_ops.assign(embedding[batch_id:batch_id + step],
-                                   embedding_input)
-          self.assign_embeddings.append(batch)
-
-      self.saver = saver.Saver(list(embeddings_vars.values()))
-
-      # Create embeddings_metadata dictionary
-      if isinstance(self.embeddings_metadata, str):
-        embeddings_metadata = {
-            layer_name: self.embeddings_metadata
-            for layer_name in embeddings_vars.keys()
-        }
-      else:
-        # If embedding_metadata is already a dictionary
-        embeddings_metadata = self.embeddings_metadata
-
-      try:
-        from tensorboard.plugins import projector
-      except ImportError:
-        raise ImportError('Failed to import TensorBoard. Please make sure that '
-                          'TensorBoard integration is complete."')
-
-      # TODO(psv): Add integration tests to test embedding visualization
-      # with TensorBoard callback. We are unable to write a unit test for this
-      # because TensorBoard dependency assumes TensorFlow package is installed.
-      config = projector.ProjectorConfig()
-      for layer_name, tensor in embeddings_vars.items():
-        embedding = config.embeddings.add()
-        embedding.tensor_name = tensor.name
-
-        if (embeddings_metadata is not None and
-            layer_name in embeddings_metadata):
-          embedding.metadata_path = embeddings_metadata[layer_name]
-
-      projector.visualize_embeddings(self.writer, config)
-
-  def _fetch_callback(self, summary):
-    self.writer.add_summary(summary, self._total_val_batches_seen)
-    self._total_val_batches_seen += 1
-
-  def _write_custom_summaries(self, step, logs=None):
-    """Writes metrics out as custom scalar summaries.
+  def _close_writers(self):
+    """Close all remaining open file writers owned by this callback.
 
-    Arguments:
-        step: the global step to use for Tensorboard.
-        logs: dict. Keys are scalar summary names, values are
-            NumPy scalars.
+    If there are no such file writers, this is a no-op.
+    """
+    with context.eager_mode():
+      for writer in self._writers:
+        writer.close()
+      del self._writers[:]
+
+  def _initialize_writers(self):
+    """Create all file writers needed and validation writers.
 
+    This updates `self._train_writer` and `self._validation_writer`, and
+    populates the `self._writers` list to be cleaned up by
+    `_close_writers`.
     """
-    logs = logs or {}
-    if context.executing_eagerly():
-      # use v2 summary ops
-      with self.writer.as_default(), summary_ops_v2.always_record_summaries():
-        for name, value in logs.items():
-          if isinstance(value, np.ndarray):
-            value = value.item()
-          summary_ops_v2.scalar(name, value, step=step)
-    else:
-      # use FileWriter from v1 summary
-      for name, value in logs.items():
-        if isinstance(value, np.ndarray):
-          value = value.item()
-        summary = tf_summary.Summary()
-        summary_value = summary.value.add()
-        summary_value.simple_value = value
-        summary_value.tag = name
-        self.writer.add_summary(summary, step)
-    self.writer.flush()
+    self._close_writers()
+
+    def create_writer(subdir):
+      path = os.path.join(self.log_dir, subdir)
+      return summary_ops_v2.create_file_writer(path)
+
+    self._train_writer = create_writer('train')
+    self._writers.append(self._train_writer)
+    self._validation_writer = create_writer('validation')
+    self._writers.append(self._validation_writer)
 
   def on_batch_end(self, batch, logs=None):
     """Writes scalar summaries for metrics on every training batch."""
-    # Don't output batch_size and batch number as Tensorboard summaries
+    # Don't output batch_size and batch number as TensorBoard summaries
     logs = logs or {}
     self._samples_seen += logs.get('size', 1)
     samples_seen_since = self._samples_seen - self._samples_seen_at_last_write
     if self.update_freq != 'epoch' and samples_seen_since >= self.update_freq:
-      batch_logs = {('batch_' + k): v
-                    for k, v in logs.items()
-                    if k not in ['batch', 'size', 'num_steps']}
-      self._write_custom_summaries(self._total_batches_seen, batch_logs)
+      self._log_metrics(logs, prefix='batch_', step=self._total_batches_seen)
       self._samples_seen_at_last_write = self._samples_seen
     self._total_batches_seen += 1
 
-  def on_epoch_begin(self, epoch, logs=None):
-    """Add histogram op to Model eval_function callbacks, reset batch count."""
-
-    # check if histogram summary should be run for this epoch
-    if self.histogram_freq and epoch % self.histogram_freq == 0:
-      self._epoch = epoch
-      # pylint: disable=protected-access
-      # add the histogram summary op if it should run this epoch
-      if self.merged not in self.model._eval_function.fetches:
-        self.model._eval_function.fetches.append(self.merged)
-        self.model._eval_function.fetch_callbacks[
-            self.merged] = self._fetch_callback
-      # pylint: enable=protected-access
-
   def on_epoch_end(self, epoch, logs=None):
-    """Checks if summary ops should run next epoch, logs scalar summaries."""
-
-    # don't output batch_size and
-    # batch number as Tensorboard summaries
-    logs = {('epoch_' + k): v
-            for k, v in logs.items()
-            if k not in ['batch', 'size', 'num_steps']}
-    if self.update_freq == 'epoch':
-      step = epoch
-    else:
-      step = self._samples_seen
-    self._write_custom_summaries(step, logs)
-
-    # pop the histogram summary op after each epoch
-    if self.histogram_freq:
-      # pylint: disable=protected-access
-      if self.merged in self.model._eval_function.fetches:
-        self.model._eval_function.fetches.remove(self.merged)
-      if self.merged in self.model._eval_function.fetch_callbacks:
-        self.model._eval_function.fetch_callbacks.pop(self.merged)
-      # pylint: enable=protected-access
-
-    if self.embeddings_data is None and self.embeddings_freq:
-      raise ValueError('To visualize embeddings, embeddings_data must '
-                       'be provided.')
-
-    if self.embeddings_freq and self.embeddings_data is not None:
-      if epoch % self.embeddings_freq == 0:
-        # We need a second forward-pass here because we're passing
-        # the `embeddings_data` explicitly. This design allows to pass
-        # arbitrary data as `embeddings_data` and results from the fact
-        # that we need to know the size of the `tf.Variable`s which
-        # hold the embeddings in `set_model`. At this point, however,
-        # the `validation_data` is not yet set.
-
-        embeddings_data = self.embeddings_data
-        n_samples = embeddings_data[0].shape[0]
-        i = 0
-        while i < n_samples:
-          step = min(self.batch_size, n_samples - i)
-          batch = slice(i, i + step)
-
-          if isinstance(self.model.input, list):
-            feed_dict = {
-                model_input: embeddings_data[idx][batch]
-                for idx, model_input in enumerate(self.model.input)
-            }
-          else:
-            feed_dict = {self.model.input: embeddings_data[0][batch]}
-
-          feed_dict.update({self.batch_id: i, self.step: step})
-
-          if not isinstance(K.learning_phase(), int):
-            feed_dict[K.learning_phase()] = False
+    """Runs metrics and histogram summaries at epoch end."""
+    step = epoch if self.update_freq == 'epoch' else self._samples_seen
+    self._log_metrics(logs, prefix='epoch_', step=step)
 
-          self.sess.run(self.assign_embeddings, feed_dict=feed_dict)
-          self.saver.save(self.sess,
-                          os.path.join(self.log_dir, 'keras_embedding.ckpt'),
-                          epoch)
-
-          i += self.batch_size
+    if self.histogram_freq and epoch % self.histogram_freq == 0:
+      self._log_weights(epoch)
 
   def on_train_end(self, logs=None):
-    self.writer.close()
+    self._close_writers()
 
+  def _log_metrics(self, logs, prefix, step):
+    """Writes metrics out as custom scalar summaries.
 
-@tf_export('keras.callbacks.ReduceLROnPlateau')
+    Arguments:
+        logs: Dict. Keys are scalar summary names, values are NumPy scalars.
+        prefix: String. The prefix to apply to the scalar summary names.
+        step: Int. The global step to use for TensorBoard.
+    """
+    if logs is None:
+      logs = {}
+
+    # Group metrics by their associated file writer. Values are lists of
+    # metrics, as (name, scalar_value) pairs.
+    logs_by_writer = {
+        self._train_writer: [],
+        self._validation_writer: [],
+    }
+    validation_prefix = 'val_'
+    for (name, value) in logs.items():
+      if name in ('batch', 'size', 'num_steps'):
+        # Scrub non-metric items.
+        continue
+      if name.startswith(validation_prefix):
+        name = name[len(validation_prefix):]
+        writer = self._validation_writer
+      else:
+        writer = self._train_writer
+      name = prefix + name  # assign batch or epoch prefix
+      logs_by_writer[writer].append((name, value))
+
+    with context.eager_mode():
+      with summary_ops_v2.always_record_summaries():
+        for writer in logs_by_writer:
+          with writer.as_default():
+            for (name, value) in logs_by_writer[writer]:
+              summary_ops_v2.scalar(name, value, step=step)
+
+  def _log_weights(self, epoch):
+    """Logs the weights of the Model to TensorBoard."""
+    with context.eager_mode(), \
+          self._train_writer.as_default(), \
+          summary_ops_v2.always_record_summaries():
+      for layer in self.model.layers:
+        for weight in layer.weights:
+          weight_name = weight.name.replace(':', '_')
+          with ops.init_scope():
+            weight = K.get_value(weight)
+          summary_ops_v2.histogram(weight_name, weight, step=epoch)
+          if self.write_images:
+            self._log_weight_as_image(weight, weight_name, epoch)
+      self._train_writer.flush()
+
+  def _log_weight_as_image(self, weight, weight_name, epoch):
+    """Logs a weight as a TensorBoard image."""
+    w_img = array_ops.squeeze(weight)
+    shape = K.int_shape(w_img)
+    if len(shape) == 1:  # Bias case
+      w_img = array_ops.reshape(w_img, [1, shape[0], 1, 1])
+    elif len(shape) == 2:  # Dense layer kernel case
+      if shape[0] > shape[1]:
+        w_img = array_ops.transpose(w_img)
+        shape = K.int_shape(w_img)
+      w_img = array_ops.reshape(w_img, [1, shape[0], shape[1], 1])
+    elif len(shape) == 3:  # ConvNet case
+      if K.image_data_format() == 'channels_last':
+        # Switch to channels_first to display every kernel as a separate
+        # image.
+        w_img = array_ops.transpose(w_img, perm=[2, 0, 1])
+        shape = K.int_shape(w_img)
+      w_img = array_ops.reshape(w_img, [shape[0], shape[1], shape[2], 1])
+
+    shape = K.int_shape(w_img)
+    # Not possible to handle 3D convnets etc.
+    if len(shape) == 4 and shape[-1] in [1, 3, 4]:
+      summary_ops_v2.image(weight_name, w_img, step=epoch)
+
+
+@keras_export('keras.callbacks.ReduceLROnPlateau')
 class ReduceLROnPlateau(Callback):
   """Reduce learning rate when a metric has stopped improving.
 
@@ -1336,7 +1464,7 @@ class ReduceLROnPlateau(Callback):
     return self.cooldown_counter > 0
 
 
-@tf_export('keras.callbacks.CSVLogger')
+@keras_export('keras.callbacks.CSVLogger')
 class CSVLogger(Callback):
   """Callback that streams epoch results to a csv file.
 
@@ -1429,7 +1557,7 @@ class CSVLogger(Callback):
     self.writer = None
 
 
-@tf_export('keras.callbacks.LambdaCallback')
+@keras_export('keras.callbacks.LambdaCallback')
 class LambdaCallback(Callback):
   r"""Callback for creating simple, custom callbacks on-the-fly.
 
diff --git a/tensorflow/python/keras/callbacks_test.py b/tensorflow/python/keras/callbacks_test.py
index 4a65ade33c7f9c6159ab5cb8f50a06124507dbdd..2a14debfdd539a797ba9a894fc96890e3e69f392 100644
--- a/tensorflow/python/keras/callbacks_test.py
+++ b/tensorflow/python/keras/callbacks_test.py
@@ -18,25 +18,30 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import csv
 import os
 import re
 import shutil
-import tempfile
+import sys
 import threading
 import unittest
 
+from absl.testing import parameterized
 import numpy as np
 
-from tensorflow.core.framework import summary_pb2
 from tensorflow.python import keras
-from tensorflow.python.framework import ops
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
 from tensorflow.python.framework import random_seed
-from tensorflow.python.framework import test_util
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import summary_ops_v2
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import adam
+from tensorflow.python.util import tf_contextlib
 
 try:
   import h5py  # pylint:disable=g-import-not-at-top
@@ -57,173 +62,378 @@ NUM_HIDDEN = 5
 BATCH_SIZE = 5
 
 
-class KerasCallbacksTest(test.TestCase):
+class Counter(keras.callbacks.Callback):
+  """Counts the number of times each callback method was run.
 
-  def test_ModelCheckpoint(self):
-    if h5py is None:
-      return  # Skip test if models cannot be saved.
+  Attributes:
+    method_counts: dict. Contains the counts of time  each callback method was
+      run.
+  """
 
-    with self.cached_session():
-      np.random.seed(1337)
+  def __init__(self):
+    self.method_counts = collections.defaultdict(int)
+    methods_to_count = [
+        'on_batch_begin', 'on_batch_end', 'on_epoch_begin', 'on_epoch_end',
+        'on_predict_batch_begin', 'on_predict_batch_end', 'on_predict_begin',
+        'on_predict_end', 'on_test_batch_begin', 'on_test_batch_end',
+        'on_test_begin', 'on_test_end', 'on_train_batch_begin',
+        'on_train_batch_end', 'on_train_begin', 'on_train_end'
+    ]
+    for method_name in methods_to_count:
+      setattr(self, method_name,
+              self.wrap_with_counts(method_name, getattr(self, method_name)))
 
-      temp_dir = self.get_temp_dir()
-      self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
+  def wrap_with_counts(self, method_name, method):
 
-      filepath = os.path.join(temp_dir, 'checkpoint.h5')
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=TRAIN_SAMPLES,
-          test_samples=TEST_SAMPLES,
-          input_shape=(INPUT_DIM,),
-          num_classes=NUM_CLASSES)
-      y_test = keras.utils.to_categorical(y_test)
-      y_train = keras.utils.to_categorical(y_train)
-      # case 1
-      monitor = 'val_loss'
-      save_best_only = False
-      mode = 'auto'
+    def _call_and_count(*args, **kwargs):
+      self.method_counts[method_name] += 1
+      return method(*args, **kwargs)
 
-      model = keras.models.Sequential()
-      model.add(
-          keras.layers.Dense(
-              NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'))
-      model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))
-      model.compile(
-          loss='categorical_crossentropy',
-          optimizer='rmsprop',
-          metrics=['accuracy'])
+    return _call_and_count
 
-      cbks = [
-          keras.callbacks.ModelCheckpoint(
-              filepath,
-              monitor=monitor,
-              save_best_only=save_best_only,
-              mode=mode)
-      ]
-      model.fit(
-          x_train,
-          y_train,
-          batch_size=BATCH_SIZE,
-          validation_data=(x_test, y_test),
-          callbacks=cbks,
-          epochs=1,
-          verbose=0)
-      assert os.path.exists(filepath)
-      os.remove(filepath)
 
-      # case 2
-      mode = 'min'
-      cbks = [
-          keras.callbacks.ModelCheckpoint(
-              filepath,
-              monitor=monitor,
-              save_best_only=save_best_only,
-              mode=mode)
-      ]
-      model.fit(
-          x_train,
-          y_train,
-          batch_size=BATCH_SIZE,
-          validation_data=(x_test, y_test),
-          callbacks=cbks,
-          epochs=1,
-          verbose=0)
-      assert os.path.exists(filepath)
-      os.remove(filepath)
+def _get_numpy():
+  return np.ones((10, 10)), np.ones((10, 1))
 
-      # case 3
-      mode = 'max'
-      monitor = 'val_acc'
-      cbks = [
-          keras.callbacks.ModelCheckpoint(
-              filepath,
-              monitor=monitor,
-              save_best_only=save_best_only,
-              mode=mode)
-      ]
-      model.fit(
-          x_train,
-          y_train,
-          batch_size=BATCH_SIZE,
-          validation_data=(x_test, y_test),
-          callbacks=cbks,
-          epochs=1,
-          verbose=0)
-      assert os.path.exists(filepath)
-      os.remove(filepath)
 
-      # case 4
-      save_best_only = True
-      cbks = [
-          keras.callbacks.ModelCheckpoint(
-              filepath,
-              monitor=monitor,
-              save_best_only=save_best_only,
-              mode=mode)
-      ]
-      model.fit(
-          x_train,
-          y_train,
-          batch_size=BATCH_SIZE,
-          validation_data=(x_test, y_test),
-          callbacks=cbks,
-          epochs=1,
-          verbose=0)
-      assert os.path.exists(filepath)
-      os.remove(filepath)
+def _get_sequence():
 
-      # Case: metric not available.
-      cbks = [
-          keras.callbacks.ModelCheckpoint(
-              filepath,
-              monitor='unknown',
-              save_best_only=True)
-      ]
-      model.fit(
-          x_train,
-          y_train,
-          batch_size=BATCH_SIZE,
-          validation_data=(x_test, y_test),
-          callbacks=cbks,
-          epochs=1,
-          verbose=0)
-      # File won't be written.
-      assert not os.path.exists(filepath)
+  class MySequence(keras.utils.data_utils.Sequence):
 
-      # case 5
-      save_best_only = False
-      period = 2
-      mode = 'auto'
+    def __getitem__(self, _):
+      return np.ones((2, 10)), np.ones((2, 1))
 
-      filepath = os.path.join(temp_dir, 'checkpoint.{epoch:02d}.h5')
-      cbks = [
-          keras.callbacks.ModelCheckpoint(
-              filepath,
-              monitor=monitor,
-              save_best_only=save_best_only,
-              mode=mode,
-              period=period)
-      ]
-      model.fit(
-          x_train,
-          y_train,
-          batch_size=BATCH_SIZE,
-          validation_data=(x_test, y_test),
-          callbacks=cbks,
-          epochs=4,
-          verbose=1)
-      assert os.path.exists(filepath.format(epoch=2))
-      assert os.path.exists(filepath.format(epoch=4))
-      os.remove(filepath.format(epoch=2))
-      os.remove(filepath.format(epoch=4))
-      assert not os.path.exists(filepath.format(epoch=1))
-      assert not os.path.exists(filepath.format(epoch=3))
-
-      # Invalid use: this will raise a warning but not an Exception.
-      keras.callbacks.ModelCheckpoint(
-          filepath,
-          monitor=monitor,
-          save_best_only=save_best_only,
-          mode='unknown')
+    def __len__(self):
+      return 5
+
+  return MySequence(), None
+
+
+@keras_parameterized.run_with_all_model_types
+@keras_parameterized.run_all_keras_modes
+class CallbackCountsTest(keras_parameterized.TestCase):
+
+  def _check_counts(self, counter, expected_counts):
+    """Checks that the counts registered by `counter` are those expected."""
+    for method_name, expected_count in expected_counts.items():
+      self.assertEqual(
+          counter.method_counts[method_name],
+          expected_count,
+          msg='For method {}: expected {}, got: {}'.format(
+              method_name, expected_count, counter.method_counts[method_name]))
+
+  def _get_model(self):
+    layers = [
+        keras.layers.Dense(10, activation='relu'),
+        keras.layers.Dense(1, activation='sigmoid')
+    ]
+    model = testing_utils.get_model_from_layers(layers, input_shape=(10,))
+    model.compile(
+        adam.AdamOptimizer(0.001),
+        'binary_crossentropy',
+        run_eagerly=testing_utils.should_run_eagerly())
+    return model
+
+  @parameterized.named_parameters(('with_numpy', _get_numpy()),
+                                  ('with_sequence', _get_sequence()))
+  def test_callback_hooks_are_called_in_fit(self, data):
+    x, y = data
+    val_x, val_y = np.ones((4, 10)), np.ones((4, 1))
+
+    model = self._get_model()
+    counter = Counter()
+    model.fit(
+        x,
+        y,
+        validation_data=(val_x, val_y),
+        batch_size=2,
+        epochs=5,
+        callbacks=[counter])
+
+    self._check_counts(
+        counter, {
+            'on_batch_begin': 25,
+            'on_batch_end': 25,
+            'on_epoch_begin': 5,
+            'on_epoch_end': 5,
+            'on_predict_batch_begin': 0,
+            'on_predict_batch_end': 0,
+            'on_predict_begin': 0,
+            'on_predict_end': 0,
+            'on_test_batch_begin': 10,
+            'on_test_batch_end': 10,
+            'on_test_begin': 5,
+            'on_test_end': 5,
+            'on_train_batch_begin': 25,
+            'on_train_batch_end': 25,
+            'on_train_begin': 1,
+            'on_train_end': 1
+        })
+
+  @parameterized.named_parameters(('with_numpy', _get_numpy()),
+                                  ('with_sequence', _get_sequence()))
+  def test_callback_hooks_are_called_in_evaluate(self, data):
+    x, y = data
+
+    model = self._get_model()
+    counter = Counter()
+    model.evaluate(x, y, batch_size=2, callbacks=[counter])
+    self._check_counts(
+        counter, {
+            'on_test_batch_begin': 5,
+            'on_test_batch_end': 5,
+            'on_test_begin': 1,
+            'on_test_end': 1
+        })
+
+  @parameterized.named_parameters(('with_numpy', _get_numpy()),
+                                  ('with_sequence', _get_sequence()))
+  def test_callback_hooks_are_called_in_predict(self, data):
+    x = data[0]
+
+    model = self._get_model()
+    counter = Counter()
+    model.predict(x, batch_size=2, callbacks=[counter])
+    self._check_counts(
+        counter, {
+            'on_predict_batch_begin': 5,
+            'on_predict_batch_end': 5,
+            'on_predict_begin': 1,
+            'on_predict_end': 1
+        })
+
+  def test_callback_list_methods(self):
+    counter = Counter()
+    callback_list = keras.callbacks.CallbackList([counter])
+
+    batch = 0
+    callback_list.on_test_batch_begin(batch)
+    callback_list.on_test_batch_end(batch)
+    callback_list.on_predict_batch_begin(batch)
+    callback_list.on_predict_batch_end(batch)
+
+    self._check_counts(
+        counter, {
+            'on_test_batch_begin': 1,
+            'on_test_batch_end': 1,
+            'on_predict_batch_begin': 1,
+            'on_predict_batch_end': 1
+        })
+
+
+class KerasCallbacksTest(keras_parameterized.TestCase):
+
+  def _get_model(self, input_shape=None):
+    layers = [
+        keras.layers.Dense(3, activation='relu'),
+        keras.layers.Dense(2, activation='softmax')
+    ]
+    model = testing_utils.get_model_from_layers(layers, input_shape=input_shape)
+    model.compile(
+        loss='mse',
+        optimizer='rmsprop',
+        metrics=[keras.metrics.CategoricalAccuracy(name='my_acc')],
+        run_eagerly=testing_utils.should_run_eagerly())
+    return model
+
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_progbar_logging(self):
+    model = self._get_model(input_shape=(3,))
+
+    x = array_ops.ones((50, 3))
+    y = array_ops.zeros((50, 2))
+    dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).batch(10)
+    expected_log = r'(.*- loss:.*- my_acc:.*)+'
+
+    with self.captureWritesToStream(sys.stdout) as printed:
+      model.fit(dataset, epochs=2, steps_per_epoch=10)
+      self.assertRegexpMatches(printed.contents(), expected_log)
+
+  @keras_parameterized.run_with_all_model_types(exclude_models='functional')
+  @keras_parameterized.run_all_keras_modes
+  def test_progbar_logging_deferred_model_build(self):
+    model = self._get_model()
+    self.assertFalse(model.built)
+
+    x = array_ops.ones((50, 3))
+    y = array_ops.zeros((50, 2))
+    dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).batch(10)
+    expected_log = r'(.*- loss:.*- my_acc:.*)+'
+
+    with self.captureWritesToStream(sys.stdout) as printed:
+      model.fit(dataset, epochs=2, steps_per_epoch=10)
+      self.assertRegexpMatches(printed.contents(), expected_log)
+
+  @keras_parameterized.run_with_all_model_types
+  def test_ModelCheckpoint(self):
+    if h5py is None:
+      return  # Skip test if models cannot be saved.
+
+    layers = [
+        keras.layers.Dense(NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'),
+        keras.layers.Dense(NUM_CLASSES, activation='softmax')
+    ]
+    model = testing_utils.get_model_from_layers(layers, input_shape=(10,))
+    model.compile(
+        loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc'])
+
+    temp_dir = self.get_temp_dir()
+    self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
+
+    filepath = os.path.join(temp_dir, 'checkpoint')
+    (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+        train_samples=TRAIN_SAMPLES,
+        test_samples=TEST_SAMPLES,
+        input_shape=(INPUT_DIM,),
+        num_classes=NUM_CLASSES)
+    y_test = keras.utils.to_categorical(y_test)
+    y_train = keras.utils.to_categorical(y_train)
+    # case 1
+    monitor = 'val_loss'
+    save_best_only = False
+    mode = 'auto'
+
+    model = keras.models.Sequential()
+    model.add(
+        keras.layers.Dense(
+            NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'))
+    model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))
+    model.compile(
+        loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc'])
+
+    cbks = [
+        keras.callbacks.ModelCheckpoint(
+            filepath,
+            monitor=monitor,
+            save_best_only=save_best_only,
+            mode=mode)
+    ]
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=BATCH_SIZE,
+        validation_data=(x_test, y_test),
+        callbacks=cbks,
+        epochs=1,
+        verbose=0)
+    assert os.path.exists(filepath)
+    os.remove(filepath)
+
+    # case 2
+    mode = 'min'
+    cbks = [
+        keras.callbacks.ModelCheckpoint(
+            filepath,
+            monitor=monitor,
+            save_best_only=save_best_only,
+            mode=mode)
+    ]
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=BATCH_SIZE,
+        validation_data=(x_test, y_test),
+        callbacks=cbks,
+        epochs=1,
+        verbose=0)
+    assert os.path.exists(filepath)
+    os.remove(filepath)
+
+    # case 3
+    mode = 'max'
+    monitor = 'val_acc'
+    cbks = [
+        keras.callbacks.ModelCheckpoint(
+            filepath,
+            monitor=monitor,
+            save_best_only=save_best_only,
+            mode=mode)
+    ]
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=BATCH_SIZE,
+        validation_data=(x_test, y_test),
+        callbacks=cbks,
+        epochs=1,
+        verbose=0)
+    assert os.path.exists(filepath)
+    os.remove(filepath)
+
+    # case 4
+    save_best_only = True
+    cbks = [
+        keras.callbacks.ModelCheckpoint(
+            filepath,
+            monitor=monitor,
+            save_best_only=save_best_only,
+            mode=mode)
+    ]
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=BATCH_SIZE,
+        validation_data=(x_test, y_test),
+        callbacks=cbks,
+        epochs=1,
+        verbose=0)
+    assert os.path.exists(filepath)
+    os.remove(filepath)
+
+    # Case: metric not available.
+    cbks = [
+        keras.callbacks.ModelCheckpoint(
+            filepath,
+            monitor='unknown',
+            save_best_only=True)
+    ]
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=BATCH_SIZE,
+        validation_data=(x_test, y_test),
+        callbacks=cbks,
+        epochs=1,
+        verbose=0)
+    # File won't be written.
+    assert not os.path.exists(filepath)
+
+    # case 5
+    save_best_only = False
+    period = 2
+    mode = 'auto'
+
+    filepath = os.path.join(temp_dir, 'checkpoint.{epoch:02d}.h5')
+    cbks = [
+        keras.callbacks.ModelCheckpoint(
+            filepath,
+            monitor=monitor,
+            save_best_only=save_best_only,
+            mode=mode,
+            period=period)
+    ]
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=BATCH_SIZE,
+        validation_data=(x_test, y_test),
+        callbacks=cbks,
+        epochs=4,
+        verbose=1)
+    assert os.path.exists(filepath.format(epoch=2))
+    assert os.path.exists(filepath.format(epoch=4))
+    os.remove(filepath.format(epoch=2))
+    os.remove(filepath.format(epoch=4))
+    assert not os.path.exists(filepath.format(epoch=1))
+    assert not os.path.exists(filepath.format(epoch=3))
+
+    # Invalid use: this will raise a warning but not an Exception.
+    keras.callbacks.ModelCheckpoint(
+        filepath,
+        monitor=monitor,
+        save_best_only=save_best_only,
+        mode='unknown')
 
   def test_EarlyStopping(self):
     with self.cached_session():
@@ -238,9 +448,7 @@ class KerasCallbacksTest(test.TestCase):
       model = testing_utils.get_small_sequential_mlp(
           num_hidden=NUM_HIDDEN, num_classes=NUM_CLASSES, input_dim=INPUT_DIM)
       model.compile(
-          loss='categorical_crossentropy',
-          optimizer='rmsprop',
-          metrics=['accuracy'])
+          loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc'])
 
       cases = [
           ('max', 'val_acc'),
@@ -298,7 +506,7 @@ class KerasCallbacksTest(test.TestCase):
       model = testing_utils.get_small_sequential_mlp(
           num_hidden=1, num_classes=1, input_dim=1)
       model.compile(
-          optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])
+          optimizer='sgd', loss='binary_crossentropy', metrics=['acc'])
 
       stopper = keras.callbacks.EarlyStopping(monitor='acc',
                                               baseline=baseline)
@@ -403,7 +611,6 @@ class KerasCallbacksTest(test.TestCase):
           float(keras.backend.get_value(
               model.optimizer.lr)) - 0.01 / 4) < keras.backend.epsilon()
 
-  @test_util.run_v1_only('b/120545219')
   def test_ReduceLROnPlateau(self):
     with self.cached_session():
       np.random.seed(1337)
@@ -425,13 +632,15 @@ class KerasCallbacksTest(test.TestCase):
             optimizer=keras.optimizers.SGD(lr=0.1))
         return model
 
+      # TODO(psv): Make sure the callback works correctly when min_delta is
+      # set as 0. Test fails when the order of this callback and assertion is
+      # interchanged.
       model = make_model()
-      # This should reduce the LR after the first epoch (due to high epsilon).
       cbks = [
           keras.callbacks.ReduceLROnPlateau(
               monitor='val_loss',
               factor=0.1,
-              min_delta=10,
+              min_delta=0,
               patience=1,
               cooldown=5)
       ]
@@ -444,16 +653,15 @@ class KerasCallbacksTest(test.TestCase):
           epochs=5,
           verbose=0)
       self.assertAllClose(
-          float(keras.backend.get_value(model.optimizer.lr)),
-          0.01,
-          atol=1e-4)
+          float(keras.backend.get_value(model.optimizer.lr)), 0.1, atol=1e-4)
 
       model = make_model()
+      # This should reduce the LR after the first epoch (due to high epsilon).
       cbks = [
           keras.callbacks.ReduceLROnPlateau(
               monitor='val_loss',
               factor=0.1,
-              min_delta=0,
+              min_delta=10,
               patience=1,
               cooldown=5)
       ]
@@ -466,7 +674,7 @@ class KerasCallbacksTest(test.TestCase):
           epochs=5,
           verbose=2)
       self.assertAllClose(
-          float(keras.backend.get_value(model.optimizer.lr)), 0.1, atol=1e-4)
+          float(keras.backend.get_value(model.optimizer.lr)), 0.01, atol=1e-4)
 
   def test_ReduceLROnPlateau_patience(self):
 
@@ -675,310 +883,6 @@ class KerasCallbacksTest(test.TestCase):
       self.assertEqual(len(loss), 1)
       self.assertEqual(loss[0], np.inf)
 
-  @test_util.run_v1_only('b/120545219')
-  def test_TensorBoard(self):
-    np.random.seed(1337)
-
-    temp_dir = self.get_temp_dir()
-    self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
-
-    (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-        train_samples=TRAIN_SAMPLES,
-        test_samples=TEST_SAMPLES,
-        input_shape=(INPUT_DIM,),
-        num_classes=NUM_CLASSES)
-    y_test = keras.utils.to_categorical(y_test)
-    y_train = keras.utils.to_categorical(y_train)
-
-    def data_generator(train):
-      if train:
-        max_batch_index = len(x_train) // BATCH_SIZE
-      else:
-        max_batch_index = len(x_test) // BATCH_SIZE
-      i = 0
-      while 1:
-        if train:
-          yield (x_train[i * BATCH_SIZE:(i + 1) * BATCH_SIZE],
-                 y_train[i * BATCH_SIZE:(i + 1) * BATCH_SIZE])
-        else:
-          yield (x_test[i * BATCH_SIZE:(i + 1) * BATCH_SIZE],
-                 y_test[i * BATCH_SIZE:(i + 1) * BATCH_SIZE])
-        i += 1
-        i %= max_batch_index
-
-    # case: Sequential
-    with self.cached_session():
-      model = keras.models.Sequential()
-      model.add(
-          keras.layers.Dense(
-              NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'))
-      # non_trainable_weights: moving_variance, moving_mean
-      model.add(keras.layers.BatchNormalization())
-      model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))
-      model.compile(
-          loss='categorical_crossentropy',
-          optimizer='sgd',
-          metrics=['accuracy'])
-      tsb = keras.callbacks.TensorBoard(
-          log_dir=temp_dir, histogram_freq=1, write_images=True,
-          write_grads=True, batch_size=5)
-      cbks = [tsb]
-
-      # fit with validation data
-      model.fit(
-          x_train,
-          y_train,
-          batch_size=BATCH_SIZE,
-          validation_data=(x_test, y_test),
-          callbacks=cbks,
-          epochs=3,
-          verbose=0)
-
-      # fit with validation data and accuracy
-      model.fit(
-          x_train,
-          y_train,
-          batch_size=BATCH_SIZE,
-          validation_data=(x_test, y_test),
-          callbacks=cbks,
-          epochs=2,
-          verbose=0)
-
-      # fit generator with validation data
-      model.fit_generator(
-          data_generator(True),
-          len(x_train),
-          epochs=2,
-          validation_data=(x_test, y_test),
-          callbacks=cbks,
-          verbose=0)
-
-      # fit generator without validation data
-      # histogram_freq must be zero
-      tsb.histogram_freq = 0
-      model.fit_generator(
-          data_generator(True),
-          len(x_train),
-          epochs=2,
-          callbacks=cbks,
-          verbose=0)
-
-      # fit generator with validation data and accuracy
-      tsb.histogram_freq = 1
-      model.fit_generator(
-          data_generator(True),
-          len(x_train),
-          epochs=2,
-          validation_data=(x_test, y_test),
-          callbacks=cbks,
-          verbose=0)
-
-      # fit generator without validation data and accuracy
-      tsb.histogram_freq = 0
-      model.fit_generator(
-          data_generator(True), len(x_train), epochs=2, callbacks=cbks)
-      assert os.path.exists(temp_dir)
-
-  @test_util.run_v1_only('b/120545219')
-  def test_TensorBoard_multi_input_output(self):
-    np.random.seed(1337)
-    tmpdir = self.get_temp_dir()
-    self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True)
-
-    with self.cached_session():
-      filepath = os.path.join(tmpdir, 'logs')
-
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=TRAIN_SAMPLES,
-          test_samples=TEST_SAMPLES,
-          input_shape=(INPUT_DIM,),
-          num_classes=NUM_CLASSES)
-      y_test = keras.utils.to_categorical(y_test)
-      y_train = keras.utils.to_categorical(y_train)
-
-      def data_generator(train):
-        if train:
-          max_batch_index = len(x_train) // BATCH_SIZE
-        else:
-          max_batch_index = len(x_test) // BATCH_SIZE
-        i = 0
-        while 1:
-          if train:
-            # simulate multi-input/output models
-            yield ([x_train[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]] * 2,
-                   [y_train[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]] * 2)
-          else:
-            yield ([x_test[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]] * 2,
-                   [y_test[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]] * 2)
-          i += 1
-          i %= max_batch_index
-
-      inp1 = keras.Input((INPUT_DIM,))
-      inp2 = keras.Input((INPUT_DIM,))
-      inp = keras.layers.add([inp1, inp2])
-      hidden = keras.layers.Dense(2, activation='relu')(inp)
-      hidden = keras.layers.Dropout(0.1)(hidden)
-      output1 = keras.layers.Dense(NUM_CLASSES, activation='softmax')(hidden)
-      output2 = keras.layers.Dense(NUM_CLASSES, activation='softmax')(hidden)
-      model = keras.models.Model([inp1, inp2], [output1, output2])
-      model.compile(loss='categorical_crossentropy',
-                    optimizer='sgd',
-                    metrics=['accuracy'])
-
-      # we must generate new callbacks for each test, as they aren't stateless
-      def callbacks_factory(histogram_freq):
-        return [keras.callbacks.TensorBoard(log_dir=filepath,
-                                            histogram_freq=histogram_freq,
-                                            write_images=True, write_grads=True,
-                                            batch_size=5)]
-
-      # fit without validation data
-      model.fit([x_train] * 2, [y_train] * 2, batch_size=BATCH_SIZE,
-                callbacks=callbacks_factory(histogram_freq=0), epochs=3)
-
-      # fit with validation data and accuracy
-      model.fit([x_train] * 2, [y_train] * 2, batch_size=BATCH_SIZE,
-                validation_data=([x_test] * 2, [y_test] * 2),
-                callbacks=callbacks_factory(histogram_freq=1), epochs=2)
-
-      # fit generator without validation data
-      model.fit_generator(data_generator(True), len(x_train), epochs=2,
-                          callbacks=callbacks_factory(histogram_freq=0))
-
-      # fit generator with validation data and accuracy
-      model.fit_generator(data_generator(True), len(x_train), epochs=2,
-                          validation_data=([x_test] * 2, [y_test] * 2),
-                          callbacks=callbacks_factory(histogram_freq=1))
-      assert os.path.isdir(filepath)
-
-  @test_util.run_v1_only('b/120545219')
-  def test_Tensorboard_histogram_summaries_in_test_function(self):
-
-    class FileWriterStub(object):
-
-      def __init__(self, logdir, graph=None):
-        self.logdir = logdir
-        self.graph = graph
-        self.steps_seen = []
-
-      def add_summary(self, summary, global_step):
-        summary_obj = summary_pb2.Summary()
-
-        # ensure a valid Summary proto is being sent
-        if isinstance(summary, bytes):
-          summary_obj.ParseFromString(summary)
-        else:
-          assert isinstance(summary, summary_pb2.Summary)
-          summary_obj = summary
-
-        # keep track of steps seen for the merged_summary op,
-        # which contains the histogram summaries
-        if len(summary_obj.value) > 1:
-          self.steps_seen.append(global_step)
-
-      def flush(self):
-        pass
-
-      def close(self):
-        pass
-
-    def _init_writer(obj):
-      obj.writer = FileWriterStub(obj.log_dir)
-
-    np.random.seed(1337)
-    tmpdir = self.get_temp_dir()
-    self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True)
-    (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-        train_samples=TRAIN_SAMPLES,
-        test_samples=TEST_SAMPLES,
-        input_shape=(INPUT_DIM,),
-        num_classes=NUM_CLASSES)
-    y_test = keras.utils.to_categorical(y_test)
-    y_train = keras.utils.to_categorical(y_train)
-
-    with self.cached_session():
-      model = keras.models.Sequential()
-      model.add(
-          keras.layers.Dense(
-              NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'))
-      # non_trainable_weights: moving_variance, moving_mean
-      model.add(keras.layers.BatchNormalization())
-      model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))
-      model.compile(
-          loss='categorical_crossentropy',
-          optimizer='sgd',
-          metrics=['accuracy'])
-      keras.callbacks.TensorBoard._init_writer = _init_writer
-      tsb = keras.callbacks.TensorBoard(
-          log_dir=tmpdir,
-          histogram_freq=1,
-          write_images=True,
-          write_grads=True,
-          batch_size=5)
-      cbks = [tsb]
-
-      # fit with validation data
-      model.fit(
-          x_train,
-          y_train,
-          batch_size=BATCH_SIZE,
-          validation_data=(x_test, y_test),
-          callbacks=cbks,
-          epochs=3,
-          verbose=0)
-
-      self.assertAllEqual(tsb.writer.steps_seen, [0, 1, 2, 3, 4, 5])
-
-  @test_util.run_v1_only('b/120545219')
-  def test_Tensorboard_histogram_summaries_with_generator(self):
-    np.random.seed(1337)
-    tmpdir = self.get_temp_dir()
-    self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True)
-
-    def generator():
-      x = np.random.randn(10, 100).astype(np.float32)
-      y = np.random.randn(10, 10).astype(np.float32)
-      while True:
-        yield x, y
-
-    with self.cached_session():
-      model = testing_utils.get_small_sequential_mlp(
-          num_hidden=10, num_classes=10, input_dim=100)
-      model.compile(
-          loss='categorical_crossentropy',
-          optimizer='sgd',
-          metrics=['accuracy'])
-      tsb = keras.callbacks.TensorBoard(
-          log_dir=tmpdir,
-          histogram_freq=1,
-          write_images=True,
-          write_grads=True,
-          batch_size=5)
-      cbks = [tsb]
-
-      # fit with validation generator
-      model.fit_generator(
-          generator(),
-          steps_per_epoch=2,
-          epochs=2,
-          validation_data=generator(),
-          validation_steps=2,
-          callbacks=cbks,
-          verbose=0)
-
-      with self.assertRaises(ValueError):
-        # fit with validation generator but no
-        # validation_steps
-        model.fit_generator(
-            generator(),
-            steps_per_epoch=2,
-            epochs=2,
-            validation_data=generator(),
-            callbacks=cbks,
-            verbose=0)
-
-      self.assertTrue(os.path.exists(tmpdir))
-
   @unittest.skipIf(
       os.name == 'nt',
       'use_multiprocessing=True does not work on windows properly.')
@@ -1026,209 +930,6 @@ class KerasCallbacksTest(test.TestCase):
       t.join()
       assert not t.is_alive()
 
-  def test_TensorBoard_with_ReduceLROnPlateau(self):
-    with self.cached_session():
-      temp_dir = self.get_temp_dir()
-      self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
-
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=TRAIN_SAMPLES,
-          test_samples=TEST_SAMPLES,
-          input_shape=(INPUT_DIM,),
-          num_classes=NUM_CLASSES)
-      y_test = keras.utils.to_categorical(y_test)
-      y_train = keras.utils.to_categorical(y_train)
-
-      model = testing_utils.get_small_sequential_mlp(
-          num_hidden=NUM_HIDDEN, num_classes=NUM_CLASSES, input_dim=INPUT_DIM)
-      model.compile(
-          loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])
-
-      cbks = [
-          keras.callbacks.ReduceLROnPlateau(
-              monitor='val_loss', factor=0.5, patience=4, verbose=1),
-          keras.callbacks.TensorBoard(log_dir=temp_dir)
-      ]
-
-      model.fit(
-          x_train,
-          y_train,
-          batch_size=BATCH_SIZE,
-          validation_data=(x_test, y_test),
-          callbacks=cbks,
-          epochs=2,
-          verbose=0)
-
-      assert os.path.exists(temp_dir)
-
-  @test_util.run_deprecated_v1
-  def test_Tensorboard_batch_logging(self):
-
-    class FileWriterStub(object):
-
-      def __init__(self, logdir, graph=None):
-        self.logdir = logdir
-        self.graph = graph
-        self.batches_logged = []
-        self.summary_values = []
-        self.summary_tags = []
-
-      def add_summary(self, summary, step):
-        self.summary_values.append(summary.value[0].simple_value)
-        self.summary_tags.append(summary.value[0].tag)
-        self.batches_logged.append(step)
-
-      def flush(self):
-        pass
-
-      def close(self):
-        pass
-
-    temp_dir = self.get_temp_dir()
-    self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
-
-    tb_cbk = keras.callbacks.TensorBoard(temp_dir, update_freq='batch')
-    tb_cbk.writer = FileWriterStub(temp_dir)
-
-    for batch in range(5):
-      tb_cbk.on_batch_end(batch, {'acc': batch})
-    self.assertEqual(tb_cbk.writer.batches_logged, [0, 1, 2, 3, 4])
-    self.assertEqual(tb_cbk.writer.summary_values, [0., 1., 2., 3., 4.])
-    self.assertEqual(tb_cbk.writer.summary_tags, ['batch_acc'] * 5)
-
-  @test_util.run_deprecated_v1
-  def test_Tensorboard_epoch_and_batch_logging(self):
-
-    class FileWriterStub(object):
-
-      def __init__(self, logdir, graph=None):
-        self.logdir = logdir
-        self.graph = graph
-
-      def add_summary(self, summary, step):
-        if 'batch_' in summary.value[0].tag:
-          self.batch_summary = (step, summary)
-        elif 'epoch_' in summary.value[0].tag:
-          self.epoch_summary = (step, summary)
-
-      def flush(self):
-        pass
-
-      def close(self):
-        pass
-
-    temp_dir = self.get_temp_dir()
-    self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
-
-    tb_cbk = keras.callbacks.TensorBoard(temp_dir, update_freq='batch')
-    tb_cbk.writer = FileWriterStub(temp_dir)
-
-    tb_cbk.on_batch_end(0, {'acc': 5.0})
-    batch_step, batch_summary = tb_cbk.writer.batch_summary
-    self.assertEqual(batch_step, 0)
-    self.assertEqual(batch_summary.value[0].simple_value, 5.0)
-
-    tb_cbk = keras.callbacks.TensorBoard(temp_dir, update_freq='epoch')
-    tb_cbk.writer = FileWriterStub(temp_dir)
-    tb_cbk.on_epoch_end(0, {'acc': 10.0})
-    epoch_step, epoch_summary = tb_cbk.writer.epoch_summary
-    self.assertEqual(epoch_step, 0)
-    self.assertEqual(epoch_summary.value[0].simple_value, 10.0)
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_Tensorboard_eager(self):
-    temp_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
-    self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
-
-    (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-        train_samples=TRAIN_SAMPLES,
-        test_samples=TEST_SAMPLES,
-        input_shape=(INPUT_DIM,),
-        num_classes=NUM_CLASSES)
-    y_test = keras.utils.to_categorical(y_test)
-    y_train = keras.utils.to_categorical(y_train)
-
-    model = testing_utils.get_small_sequential_mlp(
-        num_hidden=NUM_HIDDEN, num_classes=NUM_CLASSES, input_dim=INPUT_DIM)
-    model.compile(
-        loss='binary_crossentropy',
-        optimizer=adam.AdamOptimizer(0.01),
-        metrics=['accuracy'])
-
-    cbks = [keras.callbacks.TensorBoard(log_dir=temp_dir)]
-
-    model.fit(
-        x_train,
-        y_train,
-        batch_size=BATCH_SIZE,
-        validation_data=(x_test, y_test),
-        callbacks=cbks,
-        epochs=2,
-        verbose=0)
-
-    self.assertTrue(os.path.exists(temp_dir))
-
-  @test_util.run_deprecated_v1
-  def test_TensorBoard_update_freq(self):
-
-    class FileWriterStub(object):
-
-      def __init__(self, logdir, graph=None):
-        self.logdir = logdir
-        self.graph = graph
-        self.batch_summaries = []
-        self.epoch_summaries = []
-
-      def add_summary(self, summary, step):
-        if 'batch_' in summary.value[0].tag:
-          self.batch_summaries.append((step, summary))
-        elif 'epoch_' in summary.value[0].tag:
-          self.epoch_summaries.append((step, summary))
-
-      def flush(self):
-        pass
-
-      def close(self):
-        pass
-
-    temp_dir = self.get_temp_dir()
-    self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
-
-    # Epoch mode
-    tb_cbk = keras.callbacks.TensorBoard(temp_dir, update_freq='epoch')
-    tb_cbk.writer = FileWriterStub(temp_dir)
-
-    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 1})
-    self.assertEqual(tb_cbk.writer.batch_summaries, [])
-    tb_cbk.on_epoch_end(0, {'acc': 10.0, 'size': 1})
-    self.assertEqual(len(tb_cbk.writer.epoch_summaries), 1)
-
-    # Batch mode
-    tb_cbk = keras.callbacks.TensorBoard(temp_dir, update_freq='batch')
-    tb_cbk.writer = FileWriterStub(temp_dir)
-
-    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 1})
-    self.assertEqual(len(tb_cbk.writer.batch_summaries), 1)
-    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 1})
-    self.assertEqual(len(tb_cbk.writer.batch_summaries), 2)
-    self.assertFalse(tb_cbk.writer.epoch_summaries)
-
-    # Integer mode
-    tb_cbk = keras.callbacks.TensorBoard(temp_dir, update_freq=20)
-    tb_cbk.writer = FileWriterStub(temp_dir)
-
-    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 10})
-    self.assertFalse(tb_cbk.writer.batch_summaries)
-    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 10})
-    self.assertEqual(len(tb_cbk.writer.batch_summaries), 1)
-    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 10})
-    self.assertEqual(len(tb_cbk.writer.batch_summaries), 1)
-    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 10})
-    self.assertEqual(len(tb_cbk.writer.batch_summaries), 2)
-    tb_cbk.on_batch_end(0, {'acc': 10.0, 'size': 10})
-    self.assertEqual(len(tb_cbk.writer.batch_summaries), 2)
-    self.assertFalse(tb_cbk.writer.epoch_summaries)
-
   def test_RemoteMonitorWithJsonPayload(self):
     if requests is None:
       self.skipTest('`requests` required to run this test')
@@ -1260,47 +961,218 @@ class KerasCallbacksTest(test.TestCase):
             callbacks=cbks,
             epochs=1)
 
-  @test_util.run_deprecated_v1
-  def test_fit_generator_with_callback(self):
-
-    class TestCallback(keras.callbacks.Callback):
-
-      def set_model(self, model):
-        # Check the model operations for the optimizer operations that
-        # the _make_train_function adds under a named scope for the
-        # optimizer. This ensurs the full model is populated before the
-        # set_model callback is called.
-        optimizer_name_scope = 'training/' + model.optimizer.__class__.__name__
-        graph_def = ops.get_default_graph().as_graph_def()
-        for node in graph_def.node:
-          if node.name.startswith(optimizer_name_scope):
-            return
-        raise RuntimeError('The optimizer operations are not present in the '
-                           'model graph when the Callback.set_model function '
-                           'is called')
-    np.random.seed(1337)
 
-    def generator():
-      x = np.random.randn(10, 100).astype(np.float32)
-      y = np.random.randn(10, 10).astype(np.float32)
-      while True:
-        yield x, y
+# A summary that was emitted during a test. Fields:
+#   logdir: str. The logdir of the FileWriter to which the summary was
+#     written.
+#   tag: str. The name of the summary.
+_ObservedSummary = collections.namedtuple('_ObservedSummary', ('logdir', 'tag'))
+
+
+class _MockSummaryFile(object):
+  """Record summary tag names and the files to which they're written.
+
+  Fields `scalars`, `images`, and `histograms` are sets containing
+  `_ObservedSummary` values.
+  """
+
+  def __init__(self):
+    self.scalars = set()
+    self.images = set()
+    self.histograms = set()
+
+
+@tf_contextlib.contextmanager
+def _mock_summary_api():
+  summary_file = _MockSummaryFile()
+
+  # Keep track of the logdir associated with each created resource.
+  # (There doesn't seem to be an easy way to get this information after
+  # the fact.)
+  resource_logdirs = {}
+  real_create_file_writer = summary_ops_v2.create_file_writer
+
+  def mock_create_file_writer(logdir, *args, **kwargs):
+    writer = real_create_file_writer(logdir, *args, **kwargs)
+    resource = writer._resource
+    assert resource is not None
+    assert resource not in resource_logdirs, (resource, resource_logdirs)
+    resource_logdirs[resource] = logdir
+    return writer
+
+  def make_mock_summary(summary_set):
+
+    def mock_summary(tag, *args, **kwargs):
+      del args  # unused
+      del kwargs  # unused
+      resource = context.context().summary_writer_resource
+      logdir = resource_logdirs[resource]
+      summary_set.add(_ObservedSummary(logdir=logdir, tag=tag))
+
+    return mock_summary
+
+  with test.mock.patch.object(summary_ops_v2,
+                              'create_file_writer',
+                              mock_create_file_writer), \
+        test.mock.patch.object(summary_ops_v2,
+                               'scalar',
+                               make_mock_summary(summary_file.scalars)), \
+        test.mock.patch.object(summary_ops_v2,
+                               'histogram',
+                               make_mock_summary(summary_file.histograms)), \
+        test.mock.patch.object(summary_ops_v2,
+                               'image',
+                               make_mock_summary(summary_file.images)):
+    yield summary_file
+
+
+@keras_parameterized.run_with_all_model_types
+@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
+class TestTensorBoardV2(keras_parameterized.TestCase):
+
+  def setUp(self):
+    super(TestTensorBoardV2, self).setUp()
+    self.logdir = os.path.join(self.get_temp_dir(), 'tb')
+    self.train_dir = os.path.join(self.logdir, 'train')
+    self.validation_dir = os.path.join(self.logdir, 'validation')
+
+  def _get_model(self):
+    layers = [
+        keras.layers.Conv2D(8, (3, 3)),
+        keras.layers.Flatten(),
+        keras.layers.Dense(1)
+    ]
+    model = testing_utils.get_model_from_layers(layers, input_shape=(10, 10, 1))
+    model.compile('sgd', 'mse', run_eagerly=testing_utils.should_run_eagerly())
+    return model
+
+  def test_TensorBoard_basic(self):
+    model = self._get_model()
+    x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1))
+    tb_cbk = keras.callbacks.TensorBoard(self.logdir)
+
+    with _mock_summary_api() as summary_file:
+      model.fit(
+          x,
+          y,
+          batch_size=2,
+          epochs=2,
+          validation_data=(x, y),
+          callbacks=[tb_cbk])
+
+    self.assertEqual(
+        summary_file.scalars, {
+            _ObservedSummary(logdir=self.train_dir, tag='epoch_loss'),
+            _ObservedSummary(logdir=self.validation_dir, tag='epoch_loss'),
+        })
+
+  def test_TensorBoard_batch_metrics(self):
+    model = self._get_model()
+    x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1))
+    tb_cbk = keras.callbacks.TensorBoard(self.logdir, update_freq=1)
+
+    with _mock_summary_api() as summary_file:
+      model.fit(
+          x,
+          y,
+          batch_size=2,
+          epochs=2,
+          validation_data=(x, y),
+          callbacks=[tb_cbk])
+
+    self.assertEqual(
+        summary_file.scalars,
+        {
+            _ObservedSummary(logdir=self.train_dir, tag='batch_loss'),
+            _ObservedSummary(logdir=self.train_dir, tag='epoch_loss'),
+            _ObservedSummary(logdir=self.validation_dir, tag='epoch_loss'),
+        },
+    )
+
+  def test_TensorBoard_weight_histograms(self):
+    model = self._get_model()
+    x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1))
+    temp_dir = self.get_temp_dir() + '/tb'
+    tb_cbk = keras.callbacks.TensorBoard(temp_dir, histogram_freq=1)
+
+    with _mock_summary_api() as summary_file:
+      model.fit(
+          x,
+          y,
+          batch_size=2,
+          epochs=2,
+          validation_data=(x, y),
+          callbacks=[tb_cbk])
+
+    self.assertEqual(
+        summary_file.scalars,
+        {
+            _ObservedSummary(logdir=self.train_dir, tag='epoch_loss'),
+            _ObservedSummary(logdir=self.validation_dir, tag='epoch_loss'),
+        },
+    )
+    self.assertEqual(
+        self._strip_layer_names(summary_file.histograms),
+        {
+            _ObservedSummary(logdir=self.train_dir, tag='bias_0'),
+            _ObservedSummary(logdir=self.train_dir, tag='kernel_0'),
+        },
+    )
+
+  def test_TensorBoard_weight_images(self):
+    model = self._get_model()
+    x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1))
+    temp_dir = self.get_temp_dir() + '/tb'
+    tb_cbk = keras.callbacks.TensorBoard(
+        temp_dir, histogram_freq=1, write_images=True)
+
+    with _mock_summary_api() as summary_file:
+      model.fit(
+          x,
+          y,
+          batch_size=2,
+          epochs=2,
+          validation_data=(x, y),
+          callbacks=[tb_cbk])
+
+    self.assertEqual(
+        summary_file.scalars,
+        {
+            _ObservedSummary(logdir=self.train_dir, tag='epoch_loss'),
+            _ObservedSummary(logdir=self.validation_dir, tag='epoch_loss'),
+        },
+    )
+    self.assertEqual(
+        self._strip_layer_names(summary_file.histograms),
+        {
+            _ObservedSummary(logdir=self.train_dir, tag='bias_0'),
+            _ObservedSummary(logdir=self.train_dir, tag='kernel_0'),
+        },
+    )
+    self.assertEqual(
+        self._strip_layer_names(summary_file.images),
+        {
+            _ObservedSummary(logdir=self.train_dir, tag='bias_0'),
+            _ObservedSummary(logdir=self.train_dir, tag='kernel_0'),
+        },
+    )
+
+  def _strip_layer_names(self, summaries):
+    """Deduplicate summary names modulo layer suffix.
+
+    Args:
+      summaries: A `set` of `_ObservedSummary` values.
+
+    Returns:
+      A new `set` of `_ObservedSummary` values with layer suffixes
+      removed.
+    """
+    return {s._replace(tag=s.tag[s.tag.rfind('/') + 1:]) for s in summaries}
+
+  def test_TensorBoard_invalid_argument(self):
+    with self.assertRaisesRegexp(ValueError, 'Unrecognized arguments'):
+      keras.callbacks.TensorBoard(wwrite_images=True)
 
-    with self.cached_session():
-      model = testing_utils.get_small_sequential_mlp(
-          num_hidden=10, num_classes=10, input_dim=100)
-      model.compile(
-          loss='categorical_crossentropy',
-          optimizer='sgd',
-          metrics=['accuracy'])
-      model.fit_generator(
-          generator(),
-          steps_per_epoch=2,
-          epochs=1,
-          validation_data=generator(),
-          validation_steps=2,
-          callbacks=[TestCallback()],
-          verbose=0)
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/callbacks_v1.py b/tensorflow/python/keras/callbacks_v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..980eacd356d996c4d5ecbde1e7d3a9cd2edc7e6c
--- /dev/null
+++ b/tensorflow/python/keras/callbacks_v1.py
@@ -0,0 +1,424 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=g-import-not-at-top
+"""Callbacks: utilities called at certain points during model training.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+import numpy as np
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import callbacks
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import summary_ops_v2
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.summary import summary as tf_summary
+from tensorflow.python.training import saver
+from tensorflow.python.util.tf_export import keras_export
+
+
+@keras_export(v1=['keras.callbacks.TensorBoard'])
+class TensorBoard(callbacks.Callback):
+  # pylint: disable=line-too-long
+  """TensorBoard basic visualizations.
+
+  This callback writes a log for TensorBoard, which allows
+  you to visualize dynamic graphs of your training and test
+  metrics, as well as activation histograms for the different
+  layers in your model.
+
+  TensorBoard is a visualization tool provided with TensorFlow.
+
+  If you have installed TensorFlow with pip, you should be able
+  to launch TensorBoard from the command line:
+
+  ```sh
+  tensorboard --logdir=/full_path_to_your_logs
+  ```
+
+  You can find more information about TensorBoard
+  [here](https://www.tensorflow.org/get_started/summaries_and_tensorboard).
+
+  Arguments:
+      log_dir: the path of the directory where to save the log files to be
+        parsed by TensorBoard.
+      histogram_freq: frequency (in epochs) at which to compute activation and
+        weight histograms for the layers of the model. If set to 0, histograms
+        won't be computed. Validation data (or split) must be specified for
+        histogram visualizations.
+      write_graph: whether to visualize the graph in TensorBoard. The log file
+        can become quite large when write_graph is set to True.
+      write_grads: whether to visualize gradient histograms in TensorBoard.
+        `histogram_freq` must be greater than 0.
+      batch_size: size of batch of inputs to feed to the network for histograms
+        computation.
+      write_images: whether to write model weights to visualize as image in
+        TensorBoard.
+      embeddings_freq: frequency (in epochs) at which selected embedding layers
+        will be saved. If set to 0, embeddings won't be computed. Data to be
+        visualized in TensorBoard's Embedding tab must be passed as
+        `embeddings_data`.
+      embeddings_layer_names: a list of names of layers to keep eye on. If None
+        or empty list all the embedding layer will be watched.
+      embeddings_metadata: a dictionary which maps layer name to a file name in
+        which metadata for this embedding layer is saved. See the
+          [details](https://www.tensorflow.org/how_tos/embedding_viz/#metadata_optional)
+            about metadata files format. In case if the same metadata file is
+            used for all embedding layers, string can be passed.
+      embeddings_data: data to be embedded at layers specified in
+        `embeddings_layer_names`. Numpy array (if the model has a single input)
+        or list of Numpy arrays (if the model has multiple inputs). Learn [more
+        about
+            embeddings](https://www.tensorflow.org/programmers_guide/embedding)
+      update_freq: `'batch'` or `'epoch'` or integer. When using `'batch'`,
+        writes the losses and metrics to TensorBoard after each batch. The same
+        applies for `'epoch'`. If using an integer, let's say `1000`, the
+        callback will write the metrics and losses to TensorBoard every 1000
+        samples. Note that writing too frequently to TensorBoard can slow down
+        your training.
+
+  Raises:
+      ValueError: If histogram_freq is set and no validation data is provided.
+
+  @compatibility(eager)
+  Using the `TensorBoard` callback will work when eager execution is enabled,
+  with the restriction that outputting histogram summaries of weights and
+  gradients is not supported. Consequently, `histogram_freq` will be ignored.
+  @end_compatibility
+  """
+
+  # pylint: enable=line-too-long
+
+  def __init__(self,
+               log_dir='./logs',
+               histogram_freq=0,
+               batch_size=32,
+               write_graph=True,
+               write_grads=False,
+               write_images=False,
+               embeddings_freq=0,
+               embeddings_layer_names=None,
+               embeddings_metadata=None,
+               embeddings_data=None,
+               update_freq='epoch'):
+    super(TensorBoard, self).__init__()
+    self.log_dir = log_dir
+    self.histogram_freq = histogram_freq
+    if self.histogram_freq and context.executing_eagerly():
+      logging.warning(
+          UserWarning('Weight and gradient histograms not supported for eager'
+                      'execution, setting `histogram_freq` to `0`.'))
+      self.histogram_freq = 0
+    self.merged = None
+    self.write_graph = write_graph
+    self.write_grads = write_grads
+    self.write_images = write_images
+    self.batch_size = batch_size
+    self._current_batch = 0
+    self._total_batches_seen = 0
+    self._total_val_batches_seen = 0
+    self.embeddings_freq = embeddings_freq
+    self.embeddings_layer_names = embeddings_layer_names
+    self.embeddings_metadata = embeddings_metadata
+    self.embeddings_data = embeddings_data
+    if update_freq == 'batch':
+      self.update_freq = 1
+    else:
+      self.update_freq = update_freq
+    self._samples_seen = 0
+    self._samples_seen_at_last_write = 0
+
+  def _init_writer(self, model):
+    """Sets file writer."""
+    if context.executing_eagerly():
+      self.writer = summary_ops_v2.create_file_writer(self.log_dir)
+      if not model.run_eagerly and self.write_graph:
+        with self.writer.as_default():
+          summary_ops_v2.graph(K.get_graph())
+    elif self.write_graph:
+      self.writer = tf_summary.FileWriter(self.log_dir, K.get_graph())
+    else:
+      self.writer = tf_summary.FileWriter(self.log_dir)
+
+  def _make_histogram_ops(self, model):
+    """Defines histogram ops when histogram_freq > 0."""
+    # only make histogram summary op if it hasn't already been made
+    if self.histogram_freq and self.merged is None:
+      for layer in self.model.layers:
+        for weight in layer.weights:
+          mapped_weight_name = weight.name.replace(':', '_')
+          tf_summary.histogram(mapped_weight_name, weight)
+          if self.write_images:
+            w_img = array_ops.squeeze(weight)
+            shape = K.int_shape(w_img)
+            if len(shape) == 2:  # dense layer kernel case
+              if shape[0] > shape[1]:
+                w_img = array_ops.transpose(w_img)
+                shape = K.int_shape(w_img)
+              w_img = array_ops.reshape(w_img, [1, shape[0], shape[1], 1])
+            elif len(shape) == 3:  # convnet case
+              if K.image_data_format() == 'channels_last':
+                # switch to channels_first to display
+                # every kernel as a separate image
+                w_img = array_ops.transpose(w_img, perm=[2, 0, 1])
+                shape = K.int_shape(w_img)
+              w_img = array_ops.reshape(w_img,
+                                        [shape[0], shape[1], shape[2], 1])
+            elif len(shape) == 1:  # bias case
+              w_img = array_ops.reshape(w_img, [1, shape[0], 1, 1])
+            else:
+              # not possible to handle 3D convnets etc.
+              continue
+
+            shape = K.int_shape(w_img)
+            assert len(shape) == 4 and shape[-1] in [1, 3, 4]
+            tf_summary.image(mapped_weight_name, w_img)
+
+        if self.write_grads:
+          for weight in layer.trainable_weights:
+            mapped_weight_name = weight.name.replace(':', '_')
+            grads = model.optimizer.get_gradients(model.total_loss, weight)
+
+            def is_indexed_slices(grad):
+              return type(grad).__name__ == 'IndexedSlices'
+
+            grads = [
+                grad.values if is_indexed_slices(grad) else grad
+                for grad in grads
+            ]
+            tf_summary.histogram('{}_grad'.format(mapped_weight_name), grads)
+
+        if hasattr(layer, 'output'):
+          if isinstance(layer.output, list):
+            for i, output in enumerate(layer.output):
+              tf_summary.histogram('{}_out_{}'.format(layer.name, i), output)
+          else:
+            tf_summary.histogram('{}_out'.format(layer.name), layer.output)
+
+  def set_model(self, model):
+    """Sets Keras model and creates summary ops."""
+
+    self.model = model
+    self._init_writer(model)
+    # histogram summaries only enabled in graph mode
+    if not context.executing_eagerly():
+      self._make_histogram_ops(model)
+      self.merged = tf_summary.merge_all()
+
+    # If both embedding_freq and embeddings_data are available, we will
+    # visualize embeddings.
+    if self.embeddings_freq and self.embeddings_data is not None:
+      # Avoid circular dependency.
+      from tensorflow.python.keras.engine import training_utils  # pylint: disable=g-import-not-at-top
+      self.embeddings_data = training_utils.standardize_input_data(
+          self.embeddings_data, model.input_names)
+
+      # If embedding_layer_names are not provided, get all of the embedding
+      # layers from the model.
+      embeddings_layer_names = self.embeddings_layer_names
+      if not embeddings_layer_names:
+        embeddings_layer_names = [
+            layer.name
+            for layer in self.model.layers
+            if type(layer).__name__ == 'Embedding'
+        ]
+
+      self.assign_embeddings = []
+      embeddings_vars = {}
+
+      self.batch_id = batch_id = array_ops.placeholder(dtypes.int32)
+      self.step = step = array_ops.placeholder(dtypes.int32)
+
+      for layer in self.model.layers:
+        if layer.name in embeddings_layer_names:
+          embedding_input = self.model.get_layer(layer.name).output
+          embedding_size = np.prod(embedding_input.shape[1:])
+          embedding_input = array_ops.reshape(embedding_input,
+                                              (step, int(embedding_size)))
+          shape = (self.embeddings_data[0].shape[0], int(embedding_size))
+          embedding = variables.Variable(
+              array_ops.zeros(shape), name=layer.name + '_embedding')
+          embeddings_vars[layer.name] = embedding
+          batch = state_ops.assign(embedding[batch_id:batch_id + step],
+                                   embedding_input)
+          self.assign_embeddings.append(batch)
+
+      self.saver = saver.Saver(list(embeddings_vars.values()))
+
+      # Create embeddings_metadata dictionary
+      if isinstance(self.embeddings_metadata, str):
+        embeddings_metadata = {
+            layer_name: self.embeddings_metadata
+            for layer_name in embeddings_vars.keys()
+        }
+      else:
+        # If embedding_metadata is already a dictionary
+        embeddings_metadata = self.embeddings_metadata
+
+      try:
+        from tensorboard.plugins import projector
+      except ImportError:
+        raise ImportError('Failed to import TensorBoard. Please make sure that '
+                          'TensorBoard integration is complete."')
+
+      # TODO(psv): Add integration tests to test embedding visualization
+      # with TensorBoard callback. We are unable to write a unit test for this
+      # because TensorBoard dependency assumes TensorFlow package is installed.
+      config = projector.ProjectorConfig()
+      for layer_name, tensor in embeddings_vars.items():
+        embedding = config.embeddings.add()
+        embedding.tensor_name = tensor.name
+
+        if (embeddings_metadata is not None and
+            layer_name in embeddings_metadata):
+          embedding.metadata_path = embeddings_metadata[layer_name]
+
+      projector.visualize_embeddings(self.writer, config)
+
+  def _fetch_callback(self, summary):
+    self.writer.add_summary(summary, self._total_val_batches_seen)
+    self._total_val_batches_seen += 1
+
+  def _write_custom_summaries(self, step, logs=None):
+    """Writes metrics out as custom scalar summaries.
+
+    Arguments:
+        step: the global step to use for TensorBoard.
+        logs: dict. Keys are scalar summary names, values are
+            NumPy scalars.
+
+    """
+    logs = logs or {}
+    if context.executing_eagerly():
+      # use v2 summary ops
+      with self.writer.as_default(), summary_ops_v2.always_record_summaries():
+        for name, value in logs.items():
+          if isinstance(value, np.ndarray):
+            value = value.item()
+          summary_ops_v2.scalar(name, value, step=step)
+    else:
+      # use FileWriter from v1 summary
+      for name, value in logs.items():
+        if isinstance(value, np.ndarray):
+          value = value.item()
+        summary = tf_summary.Summary()
+        summary_value = summary.value.add()
+        summary_value.simple_value = value
+        summary_value.tag = name
+        self.writer.add_summary(summary, step)
+    self.writer.flush()
+
+  def on_batch_end(self, batch, logs=None):
+    """Writes scalar summaries for metrics on every training batch."""
+    # Don't output batch_size and batch number as TensorBoard summaries
+    logs = logs or {}
+    self._samples_seen += logs.get('size', 1)
+    samples_seen_since = self._samples_seen - self._samples_seen_at_last_write
+    if self.update_freq != 'epoch' and samples_seen_since >= self.update_freq:
+      batch_logs = {('batch_' + k): v
+                    for k, v in logs.items()
+                    if k not in ['batch', 'size', 'num_steps']}
+      self._write_custom_summaries(self._total_batches_seen, batch_logs)
+      self._samples_seen_at_last_write = self._samples_seen
+    self._total_batches_seen += 1
+
+  def on_epoch_begin(self, epoch, logs=None):
+    """Add histogram op to Model eval_function callbacks, reset batch count."""
+
+    # check if histogram summary should be run for this epoch
+    if self.histogram_freq and epoch % self.histogram_freq == 0:
+      self._epoch = epoch
+      # pylint: disable=protected-access
+      # add the histogram summary op if it should run this epoch
+      self.model._make_eval_function()
+      if self.merged not in self.model._eval_function.fetches:
+        self.model._eval_function.fetches.append(self.merged)
+        self.model._eval_function.fetch_callbacks[
+            self.merged] = self._fetch_callback
+      # pylint: enable=protected-access
+
+  def on_epoch_end(self, epoch, logs=None):
+    """Checks if summary ops should run next epoch, logs scalar summaries."""
+
+    # don't output batch_size and
+    # batch number as TensorBoard summaries
+    logs = {('epoch_' + k): v
+            for k, v in logs.items()
+            if k not in ['batch', 'size', 'num_steps']}
+    if self.update_freq == 'epoch':
+      step = epoch
+    else:
+      step = self._samples_seen
+    self._write_custom_summaries(step, logs)
+
+    # pop the histogram summary op after each epoch
+    if self.histogram_freq:
+      # pylint: disable=protected-access
+      if self.merged in self.model._eval_function.fetches:
+        self.model._eval_function.fetches.remove(self.merged)
+      if self.merged in self.model._eval_function.fetch_callbacks:
+        self.model._eval_function.fetch_callbacks.pop(self.merged)
+      # pylint: enable=protected-access
+
+    if self.embeddings_data is None and self.embeddings_freq:
+      raise ValueError('To visualize embeddings, embeddings_data must '
+                       'be provided.')
+
+    if self.embeddings_freq and self.embeddings_data is not None:
+      if epoch % self.embeddings_freq == 0:
+        # We need a second forward-pass here because we're passing
+        # the `embeddings_data` explicitly. This design allows to pass
+        # arbitrary data as `embeddings_data` and results from the fact
+        # that we need to know the size of the `tf.Variable`s which
+        # hold the embeddings in `set_model`. At this point, however,
+        # the `validation_data` is not yet set.
+
+        embeddings_data = self.embeddings_data
+        n_samples = embeddings_data[0].shape[0]
+        i = 0
+        while i < n_samples:
+          step = min(self.batch_size, n_samples - i)
+          batch = slice(i, i + step)
+
+          if isinstance(self.model.input, list):
+            feed_dict = {
+                model_input: embeddings_data[idx][batch]
+                for idx, model_input in enumerate(self.model.input)
+            }
+          else:
+            feed_dict = {self.model.input: embeddings_data[0][batch]}
+
+          feed_dict.update({self.batch_id: i, self.step: step})
+
+          if not isinstance(K.learning_phase(), int):
+            feed_dict[K.learning_phase()] = False
+
+          self.sess.run(self.assign_embeddings, feed_dict=feed_dict)
+          self.saver.save(self.sess,
+                          os.path.join(self.log_dir, 'keras_embedding.ckpt'),
+                          epoch)
+
+          i += self.batch_size
+
+  def on_train_end(self, logs=None):
+    self.writer.close()
diff --git a/tensorflow/python/keras/callbacks_v1_test.py b/tensorflow/python/keras/callbacks_v1_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..fdb6af938b8a5b24048a92a12f674e1b15bc6763
--- /dev/null
+++ b/tensorflow/python/keras/callbacks_v1_test.py
@@ -0,0 +1,562 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Keras callbacks."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import shutil
+import tempfile
+
+import numpy as np
+
+from tensorflow.core.framework import summary_pb2
+from tensorflow.python import keras
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras import callbacks_v1
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.platform import test
+from tensorflow.python.training import adam
+
+
+TRAIN_SAMPLES = 10
+TEST_SAMPLES = 10
+NUM_CLASSES = 2
+INPUT_DIM = 3
+NUM_HIDDEN = 5
+BATCH_SIZE = 5
+
+
+class TestTensorBoardV1(test.TestCase):
+
+  @test_util.run_deprecated_v1
+  def test_TensorBoard(self):
+    np.random.seed(1337)
+
+    temp_dir = self.get_temp_dir()
+    self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
+
+    (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+        train_samples=TRAIN_SAMPLES,
+        test_samples=TEST_SAMPLES,
+        input_shape=(INPUT_DIM,),
+        num_classes=NUM_CLASSES)
+    y_test = keras.utils.to_categorical(y_test)
+    y_train = keras.utils.to_categorical(y_train)
+
+    def data_generator(train):
+      if train:
+        max_batch_index = len(x_train) // BATCH_SIZE
+      else:
+        max_batch_index = len(x_test) // BATCH_SIZE
+      i = 0
+      while 1:
+        if train:
+          yield (x_train[i * BATCH_SIZE:(i + 1) * BATCH_SIZE],
+                 y_train[i * BATCH_SIZE:(i + 1) * BATCH_SIZE])
+        else:
+          yield (x_test[i * BATCH_SIZE:(i + 1) * BATCH_SIZE],
+                 y_test[i * BATCH_SIZE:(i + 1) * BATCH_SIZE])
+        i += 1
+        i %= max_batch_index
+
+    # case: Sequential
+    with self.cached_session():
+      model = keras.models.Sequential()
+      model.add(
+          keras.layers.Dense(
+              NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'))
+      # non_trainable_weights: moving_variance, moving_mean
+      model.add(keras.layers.BatchNormalization())
+      model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))
+      model.compile(
+          loss='categorical_crossentropy',
+          optimizer='sgd',
+          metrics=['accuracy'])
+      tsb = callbacks_v1.TensorBoard(
+          log_dir=temp_dir,
+          histogram_freq=1,
+          write_images=True,
+          write_grads=True,
+          batch_size=5)
+      cbks = [tsb]
+
+      # fit with validation data
+      model.fit(
+          x_train,
+          y_train,
+          batch_size=BATCH_SIZE,
+          validation_data=(x_test, y_test),
+          callbacks=cbks,
+          epochs=3,
+          verbose=0)
+
+      # fit with validation data and accuracy
+      model.fit(
+          x_train,
+          y_train,
+          batch_size=BATCH_SIZE,
+          validation_data=(x_test, y_test),
+          callbacks=cbks,
+          epochs=2,
+          verbose=0)
+
+      # fit generator with validation data
+      model.fit_generator(
+          data_generator(True),
+          len(x_train),
+          epochs=2,
+          validation_data=(x_test, y_test),
+          callbacks=cbks,
+          verbose=0)
+
+      # fit generator without validation data
+      # histogram_freq must be zero
+      tsb.histogram_freq = 0
+      model.fit_generator(
+          data_generator(True),
+          len(x_train),
+          epochs=2,
+          callbacks=cbks,
+          verbose=0)
+
+      # fit generator with validation data and accuracy
+      tsb.histogram_freq = 1
+      model.fit_generator(
+          data_generator(True),
+          len(x_train),
+          epochs=2,
+          validation_data=(x_test, y_test),
+          callbacks=cbks,
+          verbose=0)
+
+      # fit generator without validation data and accuracy
+      tsb.histogram_freq = 0
+      model.fit_generator(
+          data_generator(True), len(x_train), epochs=2, callbacks=cbks)
+      assert os.path.exists(temp_dir)
+
+  @test_util.run_deprecated_v1
+  def test_TensorBoard_multi_input_output(self):
+    np.random.seed(1337)
+    tmpdir = self.get_temp_dir()
+    self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True)
+
+    with self.cached_session():
+      filepath = os.path.join(tmpdir, 'logs')
+
+      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+          train_samples=TRAIN_SAMPLES,
+          test_samples=TEST_SAMPLES,
+          input_shape=(INPUT_DIM,),
+          num_classes=NUM_CLASSES)
+      y_test = keras.utils.to_categorical(y_test)
+      y_train = keras.utils.to_categorical(y_train)
+
+      def data_generator(train):
+        if train:
+          max_batch_index = len(x_train) // BATCH_SIZE
+        else:
+          max_batch_index = len(x_test) // BATCH_SIZE
+        i = 0
+        while 1:
+          if train:
+            # simulate multi-input/output models
+            yield ([x_train[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]] * 2,
+                   [y_train[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]] * 2)
+          else:
+            yield ([x_test[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]] * 2,
+                   [y_test[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]] * 2)
+          i += 1
+          i %= max_batch_index
+
+      inp1 = keras.Input((INPUT_DIM,))
+      inp2 = keras.Input((INPUT_DIM,))
+      inp = keras.layers.add([inp1, inp2])
+      hidden = keras.layers.Dense(2, activation='relu')(inp)
+      hidden = keras.layers.Dropout(0.1)(hidden)
+      output1 = keras.layers.Dense(NUM_CLASSES, activation='softmax')(hidden)
+      output2 = keras.layers.Dense(NUM_CLASSES, activation='softmax')(hidden)
+      model = keras.models.Model([inp1, inp2], [output1, output2])
+      model.compile(loss='categorical_crossentropy',
+                    optimizer='sgd',
+                    metrics=['accuracy'])
+
+      # we must generate new callbacks for each test, as they aren't stateless
+      def callbacks_factory(histogram_freq):
+        return [
+            callbacks_v1.TensorBoard(
+                log_dir=filepath,
+                histogram_freq=histogram_freq,
+                write_images=True,
+                write_grads=True,
+                batch_size=5)
+        ]
+
+      # fit without validation data
+      model.fit([x_train] * 2, [y_train] * 2, batch_size=BATCH_SIZE,
+                callbacks=callbacks_factory(histogram_freq=0), epochs=3)
+
+      # fit with validation data and accuracy
+      model.fit([x_train] * 2, [y_train] * 2, batch_size=BATCH_SIZE,
+                validation_data=([x_test] * 2, [y_test] * 2),
+                callbacks=callbacks_factory(histogram_freq=1), epochs=2)
+
+      # fit generator without validation data
+      model.fit_generator(data_generator(True), len(x_train), epochs=2,
+                          callbacks=callbacks_factory(histogram_freq=0))
+
+      # fit generator with validation data and accuracy
+      model.fit_generator(data_generator(True), len(x_train), epochs=2,
+                          validation_data=([x_test] * 2, [y_test] * 2),
+                          callbacks=callbacks_factory(histogram_freq=1))
+      assert os.path.isdir(filepath)
+
+  @test_util.run_deprecated_v1
+  def test_Tensorboard_histogram_summaries_in_test_function(self):
+
+    class FileWriterStub(object):
+
+      def __init__(self, logdir, graph=None):
+        self.logdir = logdir
+        self.graph = graph
+        self.steps_seen = []
+
+      def add_summary(self, summary, global_step):
+        summary_obj = summary_pb2.Summary()
+
+        # ensure a valid Summary proto is being sent
+        if isinstance(summary, bytes):
+          summary_obj.ParseFromString(summary)
+        else:
+          assert isinstance(summary, summary_pb2.Summary)
+          summary_obj = summary
+
+        # keep track of steps seen for the merged_summary op,
+        # which contains the histogram summaries
+        if len(summary_obj.value) > 1:
+          self.steps_seen.append(global_step)
+
+      def flush(self):
+        pass
+
+      def close(self):
+        pass
+
+    def _init_writer(obj, _):
+      obj.writer = FileWriterStub(obj.log_dir)
+
+    np.random.seed(1337)
+    tmpdir = self.get_temp_dir()
+    self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True)
+    (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+        train_samples=TRAIN_SAMPLES,
+        test_samples=TEST_SAMPLES,
+        input_shape=(INPUT_DIM,),
+        num_classes=NUM_CLASSES)
+    y_test = keras.utils.to_categorical(y_test)
+    y_train = keras.utils.to_categorical(y_train)
+
+    with self.cached_session():
+      model = keras.models.Sequential()
+      model.add(
+          keras.layers.Dense(
+              NUM_HIDDEN, input_dim=INPUT_DIM, activation='relu'))
+      # non_trainable_weights: moving_variance, moving_mean
+      model.add(keras.layers.BatchNormalization())
+      model.add(keras.layers.Dense(NUM_CLASSES, activation='softmax'))
+      model.compile(
+          loss='categorical_crossentropy',
+          optimizer='sgd',
+          metrics=['accuracy'])
+      callbacks_v1.TensorBoard._init_writer = _init_writer
+      tsb = callbacks_v1.TensorBoard(
+          log_dir=tmpdir,
+          histogram_freq=1,
+          write_images=True,
+          write_grads=True,
+          batch_size=5)
+      cbks = [tsb]
+
+      # fit with validation data
+      model.fit(
+          x_train,
+          y_train,
+          batch_size=BATCH_SIZE,
+          validation_data=(x_test, y_test),
+          callbacks=cbks,
+          epochs=3,
+          verbose=0)
+
+      self.assertAllEqual(tsb.writer.steps_seen, [0, 1, 2, 3, 4, 5])
+
+  @test_util.run_deprecated_v1
+  def test_Tensorboard_histogram_summaries_with_generator(self):
+    np.random.seed(1337)
+    tmpdir = self.get_temp_dir()
+    self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True)
+
+    def generator():
+      x = np.random.randn(10, 100).astype(np.float32)
+      y = np.random.randn(10, 10).astype(np.float32)
+      while True:
+        yield x, y
+
+    with self.cached_session():
+      model = testing_utils.get_small_sequential_mlp(
+          num_hidden=10, num_classes=10, input_dim=100)
+      model.compile(
+          loss='categorical_crossentropy',
+          optimizer='sgd',
+          metrics=['accuracy'])
+      tsb = callbacks_v1.TensorBoard(
+          log_dir=tmpdir,
+          histogram_freq=1,
+          write_images=True,
+          write_grads=True,
+          batch_size=5)
+      cbks = [tsb]
+
+      # fit with validation generator
+      model.fit_generator(
+          generator(),
+          steps_per_epoch=2,
+          epochs=2,
+          validation_data=generator(),
+          validation_steps=2,
+          callbacks=cbks,
+          verbose=0)
+
+      with self.assertRaises(ValueError):
+        # fit with validation generator but no
+        # validation_steps
+        model.fit_generator(
+            generator(),
+            steps_per_epoch=2,
+            epochs=2,
+            validation_data=generator(),
+            callbacks=cbks,
+            verbose=0)
+
+      self.assertTrue(os.path.exists(tmpdir))
+
+  def test_TensorBoard_with_ReduceLROnPlateau(self):
+    with self.cached_session():
+      temp_dir = self.get_temp_dir()
+      self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
+
+      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+          train_samples=TRAIN_SAMPLES,
+          test_samples=TEST_SAMPLES,
+          input_shape=(INPUT_DIM,),
+          num_classes=NUM_CLASSES)
+      y_test = keras.utils.to_categorical(y_test)
+      y_train = keras.utils.to_categorical(y_train)
+
+      model = testing_utils.get_small_sequential_mlp(
+          num_hidden=NUM_HIDDEN, num_classes=NUM_CLASSES, input_dim=INPUT_DIM)
+      model.compile(
+          loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])
+
+      cbks = [
+          keras.callbacks.ReduceLROnPlateau(
+              monitor='val_loss', factor=0.5, patience=4, verbose=1),
+          callbacks_v1.TensorBoard(log_dir=temp_dir)
+      ]
+
+      model.fit(
+          x_train,
+          y_train,
+          batch_size=BATCH_SIZE,
+          validation_data=(x_test, y_test),
+          callbacks=cbks,
+          epochs=2,
+          verbose=0)
+
+      assert os.path.exists(temp_dir)
+
+  @test_util.run_deprecated_v1
+  def test_Tensorboard_batch_logging(self):
+
+    class FileWriterStub(object):
+
+      def __init__(self, logdir, graph=None):
+        self.logdir = logdir
+        self.graph = graph
+        self.batches_logged = []
+        self.summary_values = []
+        self.summary_tags = []
+
+      def add_summary(self, summary, step):
+        self.summary_values.append(summary.value[0].simple_value)
+        self.summary_tags.append(summary.value[0].tag)
+        self.batches_logged.append(step)
+
+      def flush(self):
+        pass
+
+      def close(self):
+        pass
+
+    temp_dir = self.get_temp_dir()
+    self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
+
+    tb_cbk = callbacks_v1.TensorBoard(temp_dir, update_freq='batch')
+    tb_cbk.writer = FileWriterStub(temp_dir)
+
+    for batch in range(5):
+      tb_cbk.on_batch_end(batch, {'acc': batch})
+    self.assertEqual(tb_cbk.writer.batches_logged, [0, 1, 2, 3, 4])
+    self.assertEqual(tb_cbk.writer.summary_values, [0., 1., 2., 3., 4.])
+    self.assertEqual(tb_cbk.writer.summary_tags, ['batch_acc'] * 5)
+
+  @test_util.run_deprecated_v1
+  def test_Tensorboard_epoch_and_batch_logging(self):
+
+    class FileWriterStub(object):
+
+      def __init__(self, logdir, graph=None):
+        self.logdir = logdir
+        self.graph = graph
+
+      def add_summary(self, summary, step):
+        if 'batch_' in summary.value[0].tag:
+          self.batch_summary = (step, summary)
+        elif 'epoch_' in summary.value[0].tag:
+          self.epoch_summary = (step, summary)
+
+      def flush(self):
+        pass
+
+      def close(self):
+        pass
+
+    temp_dir = self.get_temp_dir()
+    self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
+
+    tb_cbk = callbacks_v1.TensorBoard(temp_dir, update_freq='batch')
+    tb_cbk.writer = FileWriterStub(temp_dir)
+
+    tb_cbk.on_batch_end(0, {'acc': 5.0})
+    batch_step, batch_summary = tb_cbk.writer.batch_summary
+    self.assertEqual(batch_step, 0)
+    self.assertEqual(batch_summary.value[0].simple_value, 5.0)
+
+    tb_cbk = callbacks_v1.TensorBoard(temp_dir, update_freq='epoch')
+    tb_cbk.writer = FileWriterStub(temp_dir)
+    tb_cbk.on_epoch_end(0, {'acc': 10.0})
+    epoch_step, epoch_summary = tb_cbk.writer.epoch_summary
+    self.assertEqual(epoch_step, 0)
+    self.assertEqual(epoch_summary.value[0].simple_value, 10.0)
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_Tensorboard_eager(self):
+    temp_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
+    self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
+
+    (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+        train_samples=TRAIN_SAMPLES,
+        test_samples=TEST_SAMPLES,
+        input_shape=(INPUT_DIM,),
+        num_classes=NUM_CLASSES)
+    y_test = keras.utils.to_categorical(y_test)
+    y_train = keras.utils.to_categorical(y_train)
+
+    model = testing_utils.get_small_sequential_mlp(
+        num_hidden=NUM_HIDDEN, num_classes=NUM_CLASSES, input_dim=INPUT_DIM)
+    model.compile(
+        loss='binary_crossentropy',
+        optimizer=adam.AdamOptimizer(0.01),
+        metrics=['accuracy'])
+
+    cbks = [callbacks_v1.TensorBoard(log_dir=temp_dir)]
+
+    model.fit(
+        x_train,
+        y_train,
+        batch_size=BATCH_SIZE,
+        validation_data=(x_test, y_test),
+        callbacks=cbks,
+        epochs=2,
+        verbose=0)
+
+    self.assertTrue(os.path.exists(temp_dir))
+
+  @test_util.run_deprecated_v1
+  def test_TensorBoard_update_freq(self):
+
+    class FileWriterStub(object):
+
+      def __init__(self, logdir, graph=None):
+        self.logdir = logdir
+        self.graph = graph
+        self.batch_summaries = []
+        self.epoch_summaries = []
+
+      def add_summary(self, summary, step):
+        if 'batch_' in summary.value[0].tag:
+          self.batch_summaries.append((step, summary))
+        elif 'epoch_' in summary.value[0].tag:
+          self.epoch_summaries.append((step, summary))
+
+      def flush(self):
+        pass
+
+      def close(self):
+        pass
+
+    temp_dir = self.get_temp_dir()
+    self.addCleanup(shutil.rmtree, temp_dir, ignore_errors=True)
+
+    # Epoch mode
+    tb_cbk = callbacks_v1.TensorBoard(temp_dir, update_freq='epoch')
+    tb_cbk.writer = FileWriterStub(temp_dir)
+
+    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 1})
+    self.assertEqual(tb_cbk.writer.batch_summaries, [])
+    tb_cbk.on_epoch_end(0, {'acc': 10.0, 'size': 1})
+    self.assertEqual(len(tb_cbk.writer.epoch_summaries), 1)
+
+    # Batch mode
+    tb_cbk = callbacks_v1.TensorBoard(temp_dir, update_freq='batch')
+    tb_cbk.writer = FileWriterStub(temp_dir)
+
+    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 1})
+    self.assertEqual(len(tb_cbk.writer.batch_summaries), 1)
+    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 1})
+    self.assertEqual(len(tb_cbk.writer.batch_summaries), 2)
+    self.assertFalse(tb_cbk.writer.epoch_summaries)
+
+    # Integer mode
+    tb_cbk = callbacks_v1.TensorBoard(temp_dir, update_freq=20)
+    tb_cbk.writer = FileWriterStub(temp_dir)
+
+    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 10})
+    self.assertFalse(tb_cbk.writer.batch_summaries)
+    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 10})
+    self.assertEqual(len(tb_cbk.writer.batch_summaries), 1)
+    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 10})
+    self.assertEqual(len(tb_cbk.writer.batch_summaries), 1)
+    tb_cbk.on_batch_end(0, {'acc': 5.0, 'size': 10})
+    self.assertEqual(len(tb_cbk.writer.batch_summaries), 2)
+    tb_cbk.on_batch_end(0, {'acc': 10.0, 'size': 10})
+    self.assertEqual(len(tb_cbk.writer.batch_summaries), 2)
+    self.assertFalse(tb_cbk.writer.epoch_summaries)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/constraints.py b/tensorflow/python/keras/constraints.py
index bf3a3a728aafc8071d8ddb7e3acf4f7282ed4c16..334d072d5a24a8ddac5b23ff7fe1a868e7741a23 100644
--- a/tensorflow/python/keras/constraints.py
+++ b/tensorflow/python/keras/constraints.py
@@ -25,10 +25,10 @@ from tensorflow.python.keras import backend as K
 from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
 from tensorflow.python.ops import math_ops
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 
-@tf_export('keras.constraints.Constraint')
+@keras_export('keras.constraints.Constraint')
 class Constraint(object):
 
   def __call__(self, w):
@@ -38,7 +38,7 @@ class Constraint(object):
     return {}
 
 
-@tf_export('keras.constraints.MaxNorm', 'keras.constraints.max_norm')
+@keras_export('keras.constraints.MaxNorm', 'keras.constraints.max_norm')
 class MaxNorm(Constraint):
   """MaxNorm weight constraint.
 
@@ -75,7 +75,7 @@ class MaxNorm(Constraint):
     return {'max_value': self.max_value, 'axis': self.axis}
 
 
-@tf_export('keras.constraints.NonNeg', 'keras.constraints.non_neg')
+@keras_export('keras.constraints.NonNeg', 'keras.constraints.non_neg')
 class NonNeg(Constraint):
   """Constrains the weights to be non-negative.
   """
@@ -84,7 +84,7 @@ class NonNeg(Constraint):
     return w * math_ops.cast(math_ops.greater_equal(w, 0.), K.floatx())
 
 
-@tf_export('keras.constraints.UnitNorm', 'keras.constraints.unit_norm')
+@keras_export('keras.constraints.UnitNorm', 'keras.constraints.unit_norm')
 class UnitNorm(Constraint):
   """Constrains the weights incident to each hidden unit to have unit norm.
 
@@ -115,7 +115,7 @@ class UnitNorm(Constraint):
     return {'axis': self.axis}
 
 
-@tf_export('keras.constraints.MinMaxNorm', 'keras.constraints.min_max_norm')
+@keras_export('keras.constraints.MinMaxNorm', 'keras.constraints.min_max_norm')
 class MinMaxNorm(Constraint):
   """MinMaxNorm weight constraint.
 
@@ -181,12 +181,12 @@ nonneg = non_neg
 unitnorm = unit_norm
 
 
-@tf_export('keras.constraints.serialize')
+@keras_export('keras.constraints.serialize')
 def serialize(constraint):
   return serialize_keras_object(constraint)
 
 
-@tf_export('keras.constraints.deserialize')
+@keras_export('keras.constraints.deserialize')
 def deserialize(config, custom_objects=None):
   return deserialize_keras_object(
       config,
@@ -195,7 +195,7 @@ def deserialize(config, custom_objects=None):
       printable_module_name='constraint')
 
 
-@tf_export('keras.constraints.get')
+@keras_export('keras.constraints.get')
 def get(identifier):
   if identifier is None:
     return None
diff --git a/tensorflow/python/keras/constraints_test.py b/tensorflow/python/keras/constraints_test.py
index 4f674ea7c5826f916f31f08d60d060e024931a9f..92bc4852cff849674457a6546340a7a2bdd9b79f 100644
--- a/tensorflow/python/keras/constraints_test.py
+++ b/tensorflow/python/keras/constraints_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
@@ -35,6 +36,7 @@ def get_example_array():
   return example_array
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class KerasConstraintsTest(test.TestCase):
 
   def test_serialization(self):
@@ -49,54 +51,47 @@ class KerasConstraintsTest(test.TestCase):
       assert fn.__class__ == ref_fn.__class__
 
   def test_max_norm(self):
-    with self.cached_session():
-      array = get_example_array()
-      for m in get_test_values():
-        norm_instance = keras.constraints.max_norm(m)
-        normed = norm_instance(keras.backend.variable(array))
-        assert np.all(keras.backend.eval(normed) < m)
-
-      # a more explicit example
-      norm_instance = keras.constraints.max_norm(2.0)
-      x = np.array([[0, 0, 0], [1.0, 0, 0], [3, 0, 0], [3, 3, 3]]).T
-      x_normed_target = np.array([[0, 0, 0], [1.0, 0, 0],
-                                  [2.0, 0, 0],
-                                  [2. / np.sqrt(3),
-                                   2. / np.sqrt(3),
-                                   2. / np.sqrt(3)]]).T
-      x_normed_actual = keras.backend.eval(
-          norm_instance(keras.backend.variable(x)))
-      self.assertAllClose(x_normed_actual, x_normed_target, rtol=1e-05)
+    array = get_example_array()
+    for m in get_test_values():
+      norm_instance = keras.constraints.max_norm(m)
+      normed = norm_instance(keras.backend.variable(array))
+      assert np.all(keras.backend.eval(normed) < m)
+
+    # a more explicit example
+    norm_instance = keras.constraints.max_norm(2.0)
+    x = np.array([[0, 0, 0], [1.0, 0, 0], [3, 0, 0], [3, 3, 3]]).T
+    x_normed_target = np.array(
+        [[0, 0, 0], [1.0, 0, 0], [2.0, 0, 0],
+         [2. / np.sqrt(3), 2. / np.sqrt(3), 2. / np.sqrt(3)]]).T
+    x_normed_actual = keras.backend.eval(
+        norm_instance(keras.backend.variable(x)))
+    self.assertAllClose(x_normed_actual, x_normed_target, rtol=1e-05)
 
   def test_non_neg(self):
-    with self.cached_session():
-      non_neg_instance = keras.constraints.non_neg()
-      normed = non_neg_instance(keras.backend.variable(get_example_array()))
-      assert np.all(np.min(keras.backend.eval(normed), axis=1) == 0.)
+    non_neg_instance = keras.constraints.non_neg()
+    normed = non_neg_instance(keras.backend.variable(get_example_array()))
+    assert np.all(np.min(keras.backend.eval(normed), axis=1) == 0.)
 
   def test_unit_norm(self):
-    with self.cached_session():
-      unit_norm_instance = keras.constraints.unit_norm()
-      normalized = unit_norm_instance(
-          keras.backend.variable(get_example_array()))
-      norm_of_normalized = np.sqrt(
-          np.sum(keras.backend.eval(normalized) ** 2, axis=0))
-      # In the unit norm constraint, it should be equal to 1.
-      difference = norm_of_normalized - 1.
-      largest_difference = np.max(np.abs(difference))
-      assert np.abs(largest_difference) < 10e-5
+    unit_norm_instance = keras.constraints.unit_norm()
+    normalized = unit_norm_instance(keras.backend.variable(get_example_array()))
+    norm_of_normalized = np.sqrt(
+        np.sum(keras.backend.eval(normalized)**2, axis=0))
+    # In the unit norm constraint, it should be equal to 1.
+    difference = norm_of_normalized - 1.
+    largest_difference = np.max(np.abs(difference))
+    assert np.abs(largest_difference) < 10e-5
 
   def test_min_max_norm(self):
-    with self.cached_session():
-      array = get_example_array()
-      for m in get_test_values():
-        norm_instance = keras.constraints.min_max_norm(min_value=m,
-                                                       max_value=m * 2)
-        normed = norm_instance(keras.backend.variable(array))
-        value = keras.backend.eval(normed)
-        l2 = np.sqrt(np.sum(np.square(value), axis=0))
-        assert not l2[l2 < m]
-        assert not l2[l2 > m * 2 + 1e-5]
+    array = get_example_array()
+    for m in get_test_values():
+      norm_instance = keras.constraints.min_max_norm(
+          min_value=m, max_value=m * 2)
+      normed = norm_instance(keras.backend.variable(array))
+      value = keras.backend.eval(normed)
+      l2 = np.sqrt(np.sum(np.square(value), axis=0))
+      assert not l2[l2 < m]
+      assert not l2[l2 > m * 2 + 1e-5]
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/datasets/boston_housing.py b/tensorflow/python/keras/datasets/boston_housing.py
index eeb7cbc44a72a5c624f8d1d1d9dbfab1fcd1b225..cf1a1097bb839dfd216a8e0b3541d2873d48784e 100644
--- a/tensorflow/python/keras/datasets/boston_housing.py
+++ b/tensorflow/python/keras/datasets/boston_housing.py
@@ -21,10 +21,10 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.keras.utils.data_utils import get_file
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 
-@tf_export('keras.datasets.boston_housing.load_data')
+@keras_export('keras.datasets.boston_housing.load_data')
 def load_data(path='boston_housing.npz', test_split=0.2, seed=113):
   """Loads the Boston Housing dataset.
 
diff --git a/tensorflow/python/keras/datasets/cifar10.py b/tensorflow/python/keras/datasets/cifar10.py
index d627160875c007971c695891d1dab34b8bf1ba39..36e1b83c10ab0d10f929ff520c9d9882803df97a 100644
--- a/tensorflow/python/keras/datasets/cifar10.py
+++ b/tensorflow/python/keras/datasets/cifar10.py
@@ -25,10 +25,10 @@ import numpy as np
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras.datasets.cifar import load_batch
 from tensorflow.python.keras.utils.data_utils import get_file
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 
-@tf_export('keras.datasets.cifar10.load_data')
+@keras_export('keras.datasets.cifar10.load_data')
 def load_data():
   """Loads CIFAR10 dataset.
 
diff --git a/tensorflow/python/keras/datasets/cifar100.py b/tensorflow/python/keras/datasets/cifar100.py
index e9a6d634a5308ab8c749e8861e0e4a33ac56d464..ee58d46228cfac13ee317dcae5deb7becec0d31d 100644
--- a/tensorflow/python/keras/datasets/cifar100.py
+++ b/tensorflow/python/keras/datasets/cifar100.py
@@ -25,10 +25,10 @@ import numpy as np
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras.datasets.cifar import load_batch
 from tensorflow.python.keras.utils.data_utils import get_file
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 
-@tf_export('keras.datasets.cifar100.load_data')
+@keras_export('keras.datasets.cifar100.load_data')
 def load_data(label_mode='fine'):
   """Loads CIFAR100 dataset.
 
diff --git a/tensorflow/python/keras/datasets/fashion_mnist.py b/tensorflow/python/keras/datasets/fashion_mnist.py
index 3f4c6c7413e01313fda051a5603f223f9f7c4d27..5e73635a3c129422585f95fab60b88b5c8a232f3 100644
--- a/tensorflow/python/keras/datasets/fashion_mnist.py
+++ b/tensorflow/python/keras/datasets/fashion_mnist.py
@@ -24,10 +24,10 @@ import os
 import numpy as np
 
 from tensorflow.python.keras.utils.data_utils import get_file
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 
-@tf_export('keras.datasets.fashion_mnist.load_data')
+@keras_export('keras.datasets.fashion_mnist.load_data')
 def load_data():
   """Loads the Fashion-MNIST dataset.
 
diff --git a/tensorflow/python/keras/datasets/imdb.py b/tensorflow/python/keras/datasets/imdb.py
index b73b024162ac3fde4c430c34ff4f0f7b1174abe6..022a9b7fc13e46fcafc6ada803f82cd8fe060f83 100644
--- a/tensorflow/python/keras/datasets/imdb.py
+++ b/tensorflow/python/keras/datasets/imdb.py
@@ -25,10 +25,10 @@ import numpy as np
 from tensorflow.python.keras.preprocessing.sequence import _remove_long_seq
 from tensorflow.python.keras.utils.data_utils import get_file
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 
-@tf_export('keras.datasets.imdb.load_data')
+@keras_export('keras.datasets.imdb.load_data')
 def load_data(path='imdb.npz',
               num_words=None,
               skip_top=0,
@@ -131,7 +131,7 @@ def load_data(path='imdb.npz',
   return (x_train, y_train), (x_test, y_test)
 
 
-@tf_export('keras.datasets.imdb.get_word_index')
+@keras_export('keras.datasets.imdb.get_word_index')
 def get_word_index(path='imdb_word_index.json'):
   """Retrieves the dictionary mapping word indices back to words.
 
diff --git a/tensorflow/python/keras/datasets/mnist.py b/tensorflow/python/keras/datasets/mnist.py
index a96b581960f3d5f60994fe92a1424e793d7e39c7..bad41a516422f624ea6f10c5d997bc17eb46777c 100644
--- a/tensorflow/python/keras/datasets/mnist.py
+++ b/tensorflow/python/keras/datasets/mnist.py
@@ -21,10 +21,10 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.keras.utils.data_utils import get_file
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 
-@tf_export('keras.datasets.mnist.load_data')
+@keras_export('keras.datasets.mnist.load_data')
 def load_data(path='mnist.npz'):
   """Loads the MNIST dataset.
 
diff --git a/tensorflow/python/keras/datasets/reuters.py b/tensorflow/python/keras/datasets/reuters.py
index cb796bb06cf09157cc510b55e3981d518fd8b433..0daa1c23060081af41dc1f31eb7f2e8e300d86ad 100644
--- a/tensorflow/python/keras/datasets/reuters.py
+++ b/tensorflow/python/keras/datasets/reuters.py
@@ -25,10 +25,10 @@ import numpy as np
 from tensorflow.python.keras.preprocessing.sequence import _remove_long_seq
 from tensorflow.python.keras.utils.data_utils import get_file
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 
-@tf_export('keras.datasets.reuters.load_data')
+@keras_export('keras.datasets.reuters.load_data')
 def load_data(path='reuters.npz',
               num_words=None,
               skip_top=0,
@@ -115,7 +115,7 @@ def load_data(path='reuters.npz',
   return (x_train, y_train), (x_test, y_test)
 
 
-@tf_export('keras.datasets.reuters.get_word_index')
+@keras_export('keras.datasets.reuters.get_word_index')
 def get_word_index(path='reuters_word_index.json'):
   """Retrieves the dictionary mapping word indices back to words.
 
diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py
index 858fa76472b3806f36b76f761043f011a260b66d..9730ecbef568eb6cfc4ecaf81be0b96c5425d300 100644
--- a/tensorflow/python/keras/engine/base_layer.py
+++ b/tensorflow/python/keras/engine/base_layer.py
@@ -20,15 +20,17 @@ from __future__ import print_function
 
 import functools
 import inspect  # Necessary supplement to tf_inspect to deal with variadic args.
+import itertools
 
 import numpy as np
 from six.moves import zip  # pylint: disable=redefined-builtin
 
+from tensorflow.core.framework import node_def_pb2
 from tensorflow.python.eager import context
+from tensorflow.python.eager import function
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import func_graph
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras import backend
 from tensorflow.python.keras import constraints
@@ -44,17 +46,19 @@ from tensorflow.python.keras.utils.tf_utils import is_tensor_or_tensor_list  # p
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables as tf_variables
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
+from tensorflow.python.training.tracking import data_structures
+from tensorflow.python.training.tracking import layer_utils as trackable_layer_utils
 from tensorflow.python.util import function_utils
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 from tensorflow.tools.docs import doc_controls
 
 
-@tf_export('keras.layers.Layer')
-class Layer(checkpointable.CheckpointableBase):
+@keras_export('keras.layers.Layer')
+class Layer(trackable.Trackable):
   """Base layer class.
 
   This is the class from which all layers inherit.
@@ -82,6 +86,12 @@ class Layer(checkpointable.CheckpointableBase):
     name: String name of the layer.
     dtype: Default dtype of the layer's weights (default of `None` means use the
       type of the first input).
+    dynamic: Set this to `True` if your layer should only be run eagerly, and
+      should not be used to generate a static computation graph.
+      This would be the case for a Tree-RNN or a recursive network,
+      for example, or generally for any layer that manipulates tensors
+      using Python control flow. If `False`, we assume that the layer can
+      safely be used to generate a static computation graph.
 
   Read-only properties:
     name: The name of the layer (string).
@@ -101,8 +111,9 @@ class Layer(checkpointable.CheckpointableBase):
       constraints on inputs that can be accepted by the layer.
   """
 
-  @checkpointable.no_automatic_dependency_tracking
-  def __init__(self, trainable=True, name=None, dtype=None, **kwargs):
+  @trackable.no_automatic_dependency_tracking
+  def __init__(self, trainable=True, name=None, dtype=None, dynamic=False,
+               **kwargs):
     # These properties should be set by the user via keyword arguments.
     # note that 'dtype', 'input_shape' and 'batch_input_shape'
     # are only applicable to input layers: do not pass these keywords
@@ -135,8 +146,10 @@ class Layer(checkpointable.CheckpointableBase):
 
     self._init_set_name(name)
     self._activity_regularizer = kwargs.pop('activity_regularizer', None)
-    self._trainable_weights = []
-    self._non_trainable_weights = []
+    if not hasattr(self, '_trainable_weights'):
+      self._trainable_weights = []
+    if not hasattr(self, '_non_trainable_weights'):
+      self._non_trainable_weights = []
     self._updates = []
     # A list of zero-argument lambdas which return Tensors, used for variable
     # regularizers.
@@ -164,6 +177,8 @@ class Layer(checkpointable.CheckpointableBase):
                                    hasattr(self, 'compute_mask'))
     self._call_convention = (base_layer_utils
                              .CallConvention.EXPLICIT_INPUTS_ARGUMENT)
+    if not hasattr(self, '_layers'):
+      self._layers = []  # Dependencies tracked via attribute assignment.
 
     # These lists will be filled via successive calls
     # to self._add_inbound_node().
@@ -177,7 +192,7 @@ class Layer(checkpointable.CheckpointableBase):
       self._expects_training_arg = False
 
     # Whether the `call` method can be used to build a TF graph without issues.
-    self._call_is_graph_friendly = True
+    self._dynamic = dynamic
 
     # Manage input shape information if passed.
     if 'input_shape' in kwargs or 'batch_input_shape' in kwargs:
@@ -230,8 +245,8 @@ class Layer(checkpointable.CheckpointableBase):
 
   @doc_controls.for_subclass_implementers
   def add_weight(self,
-                 name,
-                 shape,
+                 name=None,
+                 shape=None,
                  dtype=None,
                  initializer=None,
                  regularizer=None,
@@ -242,11 +257,11 @@ class Layer(checkpointable.CheckpointableBase):
                  synchronization=tf_variables.VariableSynchronization.AUTO,
                  aggregation=tf_variables.VariableAggregation.NONE,
                  **kwargs):
-    """Adds a new variable to the layer, or gets an existing one; returns it.
+    """Adds a new variable to the layer.
 
     Arguments:
-      name: variable name.
-      shape: variable shape.
+      name: Variable name.
+      shape: Variable shape. Defaults to scalar if unspecified.
       dtype: The type of the variable. Defaults to `self.dtype` or `float32`.
       initializer: initializer instance (callable).
       regularizer: regularizer instance (callable).
@@ -258,7 +273,7 @@ class Layer(checkpointable.CheckpointableBase):
         marked as non-trainable. `trainable` defaults to `True` unless
         `synchronization` is set to `ON_READ`.
       constraint: constraint instance (callable).
-      partitioner: Partitioner to be passed to the `Checkpointable` API.
+      partitioner: Partitioner to be passed to the `Trackable` API.
       use_resource: Whether to use `ResourceVariable`.
       synchronization: Indicates when a distributed a variable will be
         aggregated. Accepted values are constants defined in the class
@@ -283,6 +298,7 @@ class Layer(checkpointable.CheckpointableBase):
       ValueError: When giving unsupported dtype and no initializer or when
         trainable has been set to True with synchronization set as `ON_READ`.
     """
+    shape = shape or ()
     # Validate optional keyword arguments.
     for kwarg in kwargs:
       if kwarg not in ['getter', 'collections']:
@@ -293,6 +309,8 @@ class Layer(checkpointable.CheckpointableBase):
     if dtype is None:
       dtype = self.dtype or backend.floatx()
     dtype = dtypes.as_dtype(dtype)
+    if self._dtype is None:
+      self._dtype = dtype.base_dtype.name
     initializer = initializers.get(initializer)
     regularizer = regularizers.get(regularizer)
     constraint = constraints.get(constraint)
@@ -328,9 +346,9 @@ class Layer(checkpointable.CheckpointableBase):
         name=name,
         shape=shape,
         # TODO(allenl): a `make_variable` equivalent should be added as a
-        # `Checkpointable` method.
+        # `Trackable` method.
         getter=getter or base_layer_utils.make_variable,
-        # Manage errors in Layer rather than Checkpointable.
+        # Manage errors in Layer rather than Trackable.
         overwrite=True,
         initializer=initializer,
         dtype=dtype,
@@ -347,8 +365,10 @@ class Layer(checkpointable.CheckpointableBase):
       # TODO(fchollet): in the future, this should be handled at the
       # level of variable creation, and weight regularization losses
       # should be variable attributes.
-      self._handle_weight_regularization(name, variable, regularizer)
-
+      name_in_scope = variable.name[:variable.name.find(':')]
+      self._handle_weight_regularization(name_in_scope,
+                                         variable,
+                                         regularizer)
     if trainable:
       self._trainable_weights.append(variable)
     else:
@@ -457,15 +477,10 @@ class Layer(checkpointable.CheckpointableBase):
             one per output tensor of the layer).
     """
     if not self.supports_masking:
-      if mask is not None:
-        if isinstance(mask, list):
-          if any(m is not None for m in mask):
-            raise TypeError('Layer ' + self.name + ' does not support masking, '
-                            'but was passed an input_mask: ' + str(mask))
-        else:
-          raise TypeError('Layer ' + self.name + ' does not support masking, '
-                          'but was passed an input_mask: ' + str(mask))
-      # masking not explicitly supported: return None as mask
+      if any(m is not None for m in nest.flatten(mask)):
+        raise TypeError('Layer ' + self.name + ' does not support masking, '
+                        'but was passed an input_mask: ' + str(mask))
+      # masking not explicitly supported: return None as mask.
       return None
     # if masking is explicitly supported, by default
     # carry over the input mask
@@ -497,96 +512,115 @@ class Layer(checkpointable.CheckpointableBase):
       ValueError: if the layer's `call` method returns None (an invalid value).
     """
     input_list = nest.flatten(inputs)
+    # Accept NumPy inputs by converting to Tensors.
+    if any(isinstance(x, (np.ndarray, float, int)) for x in input_list):
+      # Don't call `ops.convert_to_tensor` on all `inputs` because
+      # `SparseTensors` can't be converted to `Tensor`.
+      def _convert_non_tensor(x):
+        if isinstance(x, (np.ndarray, float, int)):
+          return ops.convert_to_tensor(x)
+        return x
 
-    if context.executing_eagerly():
-      # Accept NumPy inputs by converting to Tensors when executing eagerly.
-      if all(isinstance(x, (np.ndarray, float, int)) for x in input_list):
-        inputs = nest.map_structure(ops.convert_to_tensor, inputs)
-        input_list = nest.flatten(inputs)
+      inputs = nest.map_structure(_convert_non_tensor, inputs)
+      input_list = nest.flatten(inputs)
 
     # We will attempt to build a TF graph if & only if all inputs are symbolic.
     # This is always the case in graph mode. It can also be the case in eager
     # mode when all inputs can be traced back to `keras.Input()` (when building
     # models using the functional API).
     build_graph = tf_utils.are_all_symbolic_tensors(input_list)
-    executing_eagerly = context.executing_eagerly()
+
+    if build_graph:
+      # Only create Keras history if at least one tensor originates from a
+      # `keras.Input`. Otherwise this Layer may be being used outside the Keras
+      # framework.
+      if base_layer_utils.needs_keras_history(inputs):
+        base_layer_utils.create_keras_history(inputs)
 
     # Handle Keras mask propagation from previous layer to current layer.
     previous_mask = None
-    if build_graph and (not hasattr(self, '_compute_previous_mask') or
-                        self._compute_previous_mask):
+    if (not hasattr(self, '_compute_previous_mask') or
+        self._compute_previous_mask):
       previous_mask = base_layer_utils.collect_previous_mask(inputs)
       if not hasattr(self, '_call_fn_args'):
-        self._call_fn_args = self._no_dependency(
-            function_utils.fn_args(self.call))
+        self._call_fn_args = function_utils.fn_args(self.call)
       if ('mask' in self._call_fn_args and 'mask' not in kwargs and
           not generic_utils.is_all_none(previous_mask)):
-        # The previous layer generated a mask, and mask was not explicitly pass
-        # to __call__, hence we set previous_mask as the default value.
+        # The previous layer generated a mask, and mask was not explicitly
+        # pass to __call__, hence we set previous_mask as the default value.
         kwargs['mask'] = previous_mask
 
-    input_shapes = None
-
-    with ops.name_scope(self._name_scope()):
-      if not self.built:
-        # Build layer if applicable (if the `build` method has been overridden).
-        self._maybe_build(inputs)
-        # We must set self.built since user defined build functions are not
-        # constrained to set self.built.
-        self.built = True
-
+    with base_layer_utils.call_context():
       # Check input assumptions set after layer building, e.g. input shape.
       if build_graph:
         # Symbolic execution on symbolic tensors. We will attempt to build
         # the corresponding TF subgraph inside `backend.get_graph()`
-        input_spec.assert_input_compatibility(
-            self.input_spec, inputs, self.name)
+        input_spec.assert_input_compatibility(self.input_spec, inputs,
+                                              self.name)
         graph = backend.get_graph()
-        with graph.as_default():
-          if not executing_eagerly:
-            # In graph mode, failure to build the layer's graph
-            # implies a user-side bug. We don't catch exceptions.
-            outputs = self.call(inputs, *args, **kwargs)
-          else:
+        with graph.as_default(), ops.name_scope(self._name_scope()):
+          # Build layer if applicable (if the `build` method has been
+          # overridden).
+          self._maybe_build(inputs)
+          # Explicitly pass the learning phase placeholder to `call` if
+          # the `training` argument was left unspecified by the user.
+          # This behavior is restricted to the managed Keras FuncGraph.
+          learning_phase_passed_by_framework = False
+          if (self._expects_training_arg and
+              not base_layer_utils.training_arg_passed_to_call(
+                  tf_inspect.getfullargspec(self.call), args, kwargs) and
+              getattr(graph, 'name', None) == 'keras_graph'):
+            learning_phase_passed_by_framework = True
+            kwargs['training'] = backend.learning_phase()
+          if not self.dynamic:
             try:
               outputs = self.call(inputs, *args, **kwargs)
-            except Exception:  # pylint: disable=broad-except
-              # Any issue during graph-building means we will later run the
-              # model in eager mode, whether the issue was related to
-              # graph mode or not. This provides a nice debugging experience.
-              self._call_is_graph_friendly = False
-              # We will use static shape inference to return symbolic tensors
-              # matching the specifications of the layer outputs.
-              # Since we have set `self._call_is_graph_friendly = False`,
-              # we will never attempt to run the underlying TF graph (which is
-              # disconnected).
-              # TODO(fchollet): consider py_func as an alternative, which
-              # would enable us to run the underlying graph if needed.
-              input_shapes = nest.map_structure(lambda x: x.shape, inputs)
-              output_shapes = self.compute_output_shape(input_shapes)
-              outputs = nest.map_structure(
-                  lambda shape: backend.placeholder(shape, dtype=self.dtype),
-                  output_shapes)
+            except TypeError as e:
+              messages = ('`tf.Tensor` as a Python `bool` is not allowed',
+                          'Tensor objects are only iterable when eager')
+              exception_str = str(e)
+              for msg in messages:
+                if msg in exception_str:
+                  raise TypeError('You are attempting to use Python control '
+                                  'flow in a layer that was not declared to be '
+                                  'dynamic. Pass `dynamic=True` to the class '
+                                  'constructor.\nEncountered error:\n"""\n' +
+                                  exception_str + '\n"""')
+              raise
+          else:
+            # We will use static shape inference to return symbolic tensors
+            # matching the specifications of the layer outputs.
+            # Since `self.dynamic` is True, we will never attempt to
+            # run the underlying TF graph (which is disconnected).
+            # TODO(fchollet): consider py_func as an alternative, which
+            # would enable us to run the underlying graph if needed.
+            outputs = self._symbolic_call(inputs)
 
           if outputs is None:
             raise ValueError('A layer\'s `call` method should return a '
                              'Tensor or a list of Tensors, not None '
                              '(layer: ' + self.name + ').')
-          self._handle_activity_regularization(inputs, outputs)
-          self._set_mask_metadata(inputs, outputs, previous_mask)
           if base_layer_utils.have_all_keras_metadata(inputs):
+            if learning_phase_passed_by_framework:
+              kwargs.pop('training')
             inputs, outputs = self._set_connectivity_metadata_(
                 inputs, outputs, args, kwargs)
+          self._handle_activity_regularization(inputs, outputs)
+          self._set_mask_metadata(inputs, outputs, previous_mask)
           if hasattr(self, '_set_inputs') and not self.inputs:
             # Subclassed network: explicitly set metadata normally set by
             # a call to self._set_inputs().
-            # This is not relevant in eager execution.
+            # TODO(b/120997007): This should be done in Eager as well, but
+            # causes garbage collection issues because of the placeholders
+            # created on the default Keras graph.
             self._set_inputs(inputs, outputs)
       else:
         # Eager execution on data tensors.
-        outputs = self.call(inputs, *args, **kwargs)
-        self._handle_activity_regularization(inputs, outputs)
-        return outputs
+        with ops.name_scope(self._name_scope()):
+          self._maybe_build(inputs)
+          outputs = self.call(inputs, *args, **kwargs)
+          self._handle_activity_regularization(inputs, outputs)
+          self._set_mask_metadata(inputs, outputs, previous_mask)
 
     if not context.executing_eagerly():
       # Optionally load weight values specified at layer instantiation.
@@ -605,6 +639,10 @@ class Layer(checkpointable.CheckpointableBase):
   def name(self):
     return self._name
 
+  @property
+  def dynamic(self):
+    return self._dynamic
+
   @property
   def activity_regularizer(self):
     """Optional regularizer function for the output of this layer."""
@@ -613,18 +651,24 @@ class Layer(checkpointable.CheckpointableBase):
   @activity_regularizer.setter
   def activity_regularizer(self, regularizer):
     """Optional regularizer function for the output of this layer."""
-    self._activity_regularizer = self._no_dependency(regularizer)
+    self._activity_regularizer = regularizer
 
   @property
   def trainable_weights(self):
-    return self._trainable_weights if self.trainable else []
+    if self.trainable:
+      nested = self._gather_children_attribute('trainable_weights')
+      return self._trainable_weights + nested
+    else:
+      return []
 
   @property
   def non_trainable_weights(self):
     if self.trainable:
-      return self._non_trainable_weights
+      nested = self._gather_children_attribute('non_trainable_weights')
+      return self._non_trainable_weights + nested
     else:
-      return self._trainable_weights + self._non_trainable_weights
+      nested = self._gather_children_attribute('weights')
+      return self._trainable_weights + self._non_trainable_weights + nested
 
   @property
   def weights(self):
@@ -639,7 +683,7 @@ class Layer(checkpointable.CheckpointableBase):
   def updates(self):
     if not self.trainable and not self.stateful:
       return []
-    return self._updates
+    return self._updates + self._gather_children_attribute('updates')
 
   @property
   def losses(self):
@@ -653,7 +697,12 @@ class Layer(checkpointable.CheckpointableBase):
       A list of tensors.
     """
     collected_losses = []
-    if context.executing_eagerly():
+
+    # If any eager losses are present, we assume the model to be part of an
+    # eager training loop (either a custom one or the one used when
+    # `run_eagerly=True`), and so we always return just the eager losses in that
+    # case.
+    if self._eager_losses:
       collected_losses.extend(self._eager_losses)
     else:
       collected_losses.extend(self._losses)
@@ -661,7 +710,7 @@ class Layer(checkpointable.CheckpointableBase):
       loss_tensor = regularizer()
       if loss_tensor is not None:
         collected_losses.append(loss_tensor)
-    return collected_losses
+    return collected_losses + self._gather_children_attribute('losses')
 
   @doc_controls.for_subclass_implementers
   def add_loss(self, losses, inputs=None):
@@ -684,6 +733,7 @@ class Layer(checkpointable.CheckpointableBase):
     Arguments:
       losses: Loss tensor, or list/tuple of tensors. Rather than tensors, losses
         may also be zero-argument callables which create a loss tensor.
+        Other types of input are ignored.
       inputs: Ignored when executing eagerly. If anything other than None is
         passed, it signals the losses are conditional on some of the layer's
         inputs, and thus they should only be run where these inputs are
@@ -709,10 +759,13 @@ class Layer(checkpointable.CheckpointableBase):
         self._callable_losses.append(
             functools.partial(_tag_unconditional, loss))
       else:
-        if context.executing_eagerly():
-          self._eager_losses.append(_tag_unconditional(loss))
-        else:
+        if not tensor_util.is_tensor(loss):
+          # Ignoring constant values as this does not affect the gradients.
+          return
+        if tf_utils.is_symbolic_tensor(loss):
           self._losses.append(_tag_unconditional(loss))
+        else:
+          self._eager_losses.append(_tag_unconditional(loss))
 
   @doc_controls.for_subclass_implementers
   def add_metric(self, value, aggregation=None, name=None):
@@ -725,7 +778,7 @@ class Layer(checkpointable.CheckpointableBase):
         already. eg, `model.add_metric(BinaryAccuracy(name='acc')(y_true,
         y_pred))`. If aggregation='mean', the given metric tensor will be
         sample-wise reduced using `mean` function. eg, `model.add_metric(
-        tf.reduce_mean(outputs), name='output_mean', aggregation='mean')`.
+        tf.reduce_sum(outputs), name='output_mean', aggregation='mean')`.
       name: String metric name.
 
     Raises:
@@ -736,8 +789,25 @@ class Layer(checkpointable.CheckpointableBase):
           'We currently support only `mean` sample-wise metric aggregation. '
           'You provided aggregation=`%s`' % aggregation)
 
-    if tf_utils.is_symbolic_tensor(value):
-      self._symbolic_add_metric(value, aggregation, name)
+    is_symbolic = tf_utils.is_symbolic_tensor(value)
+    if name is None and (not is_symbolic or not hasattr(value, '_metric_obj')):
+      # Eg. `self.add_metric(math_ops.reduce_sum(x), aggregation='mean')`
+      # In eager mode, we use metric name to lookup a metric. Without a name,
+      # a new Mean metric wrapper will be created on every model/layer call.
+      # So, we raise an error when no name is provided.
+      # We will do the same for symbolic mode for consistency although a name
+      # will be generated if no name is provided.
+
+      # We will not raise this error in the foll use case for the sake of
+      # consistency as name in provided in the metric constructor.
+      # model.add_metric(metrics.Mean(name='my_metric')(outputs))
+      raise ValueError('Please provide a name for your metric like '
+                       '`self.add_metric(tf.reduce_sum(inputs), '
+                       'name=\'mean_activation\', aggregation=\'mean\')`')
+
+    if is_symbolic:
+      with backend.get_graph().as_default():
+        self._symbolic_add_metric(value, aggregation, name)
     else:
       self._eager_add_metric(value, aggregation, name)
 
@@ -1109,14 +1179,7 @@ class Layer(checkpointable.CheckpointableBase):
     all_input_shapes = set(
         [str(node.input_shapes) for node in self._inbound_nodes])
     if len(all_input_shapes) == 1:
-      input_shapes = self._inbound_nodes[0].input_shapes
-      if len(input_shapes) == 1:
-        return tuple(tensor_shape.TensorShape(input_shapes[0]).as_list())
-      else:
-        return [
-            tuple(tensor_shape.TensorShape(shape).as_list())
-            for shape in input_shapes
-        ]
+      return self._inbound_nodes[0].input_shapes
     else:
       raise AttributeError('The layer "' + str(self.name) +
                            ' has multiple inbound nodes, '
@@ -1167,14 +1230,7 @@ class Layer(checkpointable.CheckpointableBase):
     all_output_shapes = set(
         [str(node.output_shapes) for node in self._inbound_nodes])
     if len(all_output_shapes) == 1:
-      output_shapes = self._inbound_nodes[0].output_shapes
-      if len(output_shapes) == 1:
-        return tuple(tensor_shape.TensorShape(output_shapes[0]).as_list())
-      else:
-        return [
-            tuple(tensor_shape.TensorShape(shape).as_list())
-            for shape in output_shapes
-        ]
+      return self._inbound_nodes[0].output_shapes
     else:
       raise AttributeError('The layer "%s"'
                            ' has multiple inbound nodes, '
@@ -1273,9 +1329,10 @@ class Layer(checkpointable.CheckpointableBase):
       match(value)  # Update the metric state.
       return
     else:
-      if aggregation is None:
-        raise ValueError('We do not support adding an aggregated metric tensor '
-                         'in `call` in eager execution.')
+      # Aggregation will always be set in this use case. If not we will raise
+      # error on model/layer call in graph function mode when model/layer is
+      # created.
+      assert aggregation is not None
       metric_obj, _ = base_layer_utils.create_mean_metric(value, name)
       self._metrics.append(metric_obj)
 
@@ -1294,10 +1351,20 @@ class Layer(checkpointable.CheckpointableBase):
         else:
           raise ValueError(
               'We currently do not support reusing a metric instance.')
-      else:
+      elif hasattr(value, '_metric_obj'):
         # We track the instance using the metadata on the result tensor.
         result_tensor = value
         metric_obj = result_tensor._metric_obj
+      else:
+        raise ValueError(
+            'We do not support adding an aggregated metric result tensor that '
+            'is not the output of a `tf.keras.metrics.Metric` metric instance. '
+            'Without having access to the metric instance we cannot reset the '
+            'state of a metric after every epoch during training. You can '
+            'create a `tf.keras.metrics.Metric` instance and pass the result '
+            'here or pass an un-aggregated result with `aggregation` parameter '
+            'set as `mean`. For example: `self.add_metric(tf.reduce_sum(inputs)'
+            ', name=\'mean_activation\', aggregation=\'mean\')`')
     else:
       # If a non-aggregated tensor is given as input (ie. `aggregation` is
       # explicitly set to `mean`), we wrap the tensor in `Mean` metric.
@@ -1337,27 +1404,35 @@ class Layer(checkpointable.CheckpointableBase):
           self.add_loss(mean_activity_loss, inputs=inputs)
 
   def _set_mask_metadata(self, inputs, outputs, previous_mask):
-    # In some cases the mask of the outputs has already been computed by
-    # inner layers and does not need to be recomputed by this layer.
-    mask_already_computed = all(
-        hasattr(x, '_keras_mask') for x in generic_utils.to_list(outputs))
-    if hasattr(self, 'compute_mask') and not mask_already_computed:
-      output_mask = self.compute_mask(inputs, previous_mask)
-    else:
-      output_mask = None
-    if isinstance(outputs, (list, tuple)):
-      if output_mask is None:
-        output_mask = [None for _ in range(len(outputs))]
-      for x, m in zip(outputs, output_mask):
+    flat_outputs = nest.flatten(outputs)
+    mask_already_computed = (
+        getattr(self, '_compute_output_and_mask_jointly', False) or
+        all(getattr(x, '_keras_mask', None) is not None for x in flat_outputs))
+
+    if not mask_already_computed:
+      if hasattr(self, 'compute_mask'):
+        output_masks = self.compute_mask(inputs, previous_mask)
+        # `compute_mask` can return a single `None` even when a Layer
+        # has multiple outputs.
+        if output_masks is None:
+          flat_masks = [None for _ in flat_outputs]
+        else:
+          flat_masks = nest.flatten(output_masks)
+      else:
+        flat_masks = [None for _ in flat_outputs]
+
+      for output, mask in zip(flat_outputs, flat_masks):
         try:
-          x._keras_mask = m  # pylint: disable=protected-access
+          output._keras_mask = mask
         except AttributeError:
-          pass  # C type such as dict. Masking not supported in this case.
-    else:
-      try:
-        outputs._keras_mask = output_mask  # pylint: disable=protected-access
-      except AttributeError:
-        pass  # C type such as dict. Masking not supported in this case.
+          # C Type such as np.ndarray.
+          pass
+
+    if tf_utils.are_all_symbolic_tensors(flat_outputs):
+      for output in flat_outputs:
+        if getattr(output, '_keras_mask', None) is not None:
+          # Do not track masks for `TensorFlowOpLayer` construction.
+          output._keras_mask._keras_history_checked = True
 
   def _set_connectivity_metadata_(self, inputs, outputs, args, kwargs):
     call_convention = getattr(
@@ -1385,16 +1460,14 @@ class Layer(checkpointable.CheckpointableBase):
     # If the layer returns tensors from its inputs, unmodified,
     # we copy them to avoid loss of tensor metadata.
     output_ls = nest.flatten(outputs)
+    inputs_ls = nest.flatten(inputs)
     output_ls_copy = []
     for x in output_ls:
-      if x in nest.flatten(inputs):
+      if x in inputs_ls:
         with ops.name_scope(self.name):
           x = array_ops.identity(x)
       output_ls_copy.append(x)
-    if len(output_ls_copy) == 1:
-      outputs = output_ls_copy[0]
-    else:
-      outputs = output_ls_copy
+    outputs = nest.pack_sequence_as(outputs, output_ls_copy)
 
     inputs, kwargs = self._inputs_from_call_args(
         call_args=(inputs,) + args, call_kwargs=kwargs)
@@ -1488,19 +1561,12 @@ class Layer(checkpointable.CheckpointableBase):
         arguments: dictionary of keyword arguments that were passed to the
             `call` method of the layer at the call that created the node.
     """
-    input_tensors = nest.flatten(input_tensors)
-    output_tensors = nest.flatten(output_tensors)
-
-    # Collect input tensor(s) coordinates.
-    inbound_layers = []
-    node_indices = []
-    tensor_indices = []
-    for x in input_tensors:
-      assert hasattr(x, '_keras_history')
-      inbound_layer, node_index, tensor_index = x._keras_history  # pylint: disable=protected-access
-      inbound_layers.append(inbound_layer)
-      node_indices.append(node_index)
-      tensor_indices.append(tensor_index)
+    inbound_layers = nest.map_structure(lambda t: t._keras_history[0],
+                                        input_tensors)
+    node_indices = nest.map_structure(lambda t: t._keras_history[1],
+                                      input_tensors)
+    tensor_indices = nest.map_structure(lambda t: t._keras_history[2],
+                                        input_tensors)
 
     # Create node, add it to inbound nodes.
     Node(
@@ -1513,13 +1579,15 @@ class Layer(checkpointable.CheckpointableBase):
         arguments=arguments)
 
     # Update tensor history metadata.
-    for i in range(len(output_tensors)):
-      # The metadata attribute consists of 1) a layer instance
-      # 2) a node index for the layer, 3) a tensor index for the node.
-      # The allows layer reuse (multiple nodes per layer) and multi-output
-      # or multi-input layers (e.g. a layer can return multiple tensors,
-      # and each can be sent to a different layer).
-      output_tensors[i]._keras_history = (self, len(self._inbound_nodes) - 1, i)  # pylint: disable=protected-access
+    # The metadata attribute consists of
+    # 1) a layer instance
+    # 2) a node index for the layer
+    # 3) a tensor index for the node.
+    # The allows layer reuse (multiple nodes per layer) and multi-output
+    # or multi-input layers (e.g. a layer can return multiple tensors,
+    # and each can be sent to a different layer).
+    for i, tensor in enumerate(nest.flatten(output_tensors)):
+      tensor._keras_history = (self, len(self._inbound_nodes) - 1, i)  # pylint: disable=protected-access
 
   def _get_node_attribute_at_index(self, node_index, attr, attr_name):
     """Private utility to retrieves an attribute (e.g. inputs) from a node.
@@ -1552,30 +1620,16 @@ class Layer(checkpointable.CheckpointableBase):
                        str(node_index) + ', but the layer has only ' +
                        str(len(self._inbound_nodes)) + ' inbound nodes.')
     values = getattr(self._inbound_nodes[node_index], attr)
-    if len(values) == 1:
+    if isinstance(values, list) and len(values) == 1:
       return values[0]
     else:
       return values
 
-  @property
-  def _static_graph_friendly(self):
-    """Whether the layer can be called to create a static graph.
-
-    Because of nesting, there are two components to being "graph-friendly":
-      1) all inner layers are graph-friendly
-      2) the way they are composed is graph-friendly.
-    We denote the latter as "_call_is_graph_friendly", and define
-    "_static_graph_friendly" as being the combination of
-    "_call_is_graph_friendly" and "all inner layers are _static_graph_friendly".
-    For atomic layers (no inner layers), this is just "_call_is_graph_friendly".
-
-    Returns:
-      Boolean.
-    """
-    return self._call_is_graph_friendly
-
   def _maybe_build(self, inputs):
     # Check input assumptions set before layer building, e.g. input rank.
+    if self.built:
+      return
+
     input_spec.assert_input_compatibility(
         self.input_spec, inputs, self.name)
     input_list = nest.flatten(inputs)
@@ -1590,6 +1644,79 @@ class Layer(checkpointable.CheckpointableBase):
     # Only call `build` if the user has manually overridden the build method.
     if not hasattr(self.build, '_is_default'):
       self.build(input_shapes)
+    # We must set self.built since user defined build functions are not
+    # constrained to set self.built.
+    self.built = True
+
+  def _symbolic_call(self, inputs):
+    input_shapes = nest.map_structure(lambda x: x.shape, inputs)
+    output_shapes = self.compute_output_shape(input_shapes)
+
+    def _make_placeholder_like(shape):
+      ph = backend.placeholder(shape, self.dtype)
+      ph._keras_mask = None
+      return ph
+
+    return nest.map_structure(_make_placeholder_like, output_shapes)
+
+  def __setattr__(self, name, value):
+    if (not getattr(self, '_setattr_tracking', True) or
+        getattr(self, '_is_graph_network', False)):
+      super(Layer, self).__setattr__(name, value)
+      return
+
+    # Keep track of trackable objects, for the needs of `Network.save_weights`.
+    value = data_structures.sticky_attribute_assignment(
+        trackable=self, value=value, name=name)
+
+    # Append value to self._layers if relevant
+    if (isinstance(value, Layer) or
+        trackable_layer_utils.has_weights(value)):
+      # Initialize `_layers` here in case `__init__` has not yet been called.
+      if not hasattr(self, '_layers'):
+        super(Layer, self).__setattr__('_layers', [])
+      # We need to check object identity to avoid de-duplicating empty
+      # container types which compare equal.
+      if not any((layer is value for layer in self._layers)):
+        self._layers.append(value)
+        if hasattr(value, '_use_resource_variables'):
+          # Legacy layers (V1 tf.layers) must always use
+          # resource variables.
+          value._use_resource_variables = True
+
+    # Append value to list of trainable / non-trainable weights if relevant
+    # TODO(b/125122625): This won't pick up on any variables added to a
+    # list/dict after creation.
+    for val in nest.flatten(value):
+      if isinstance(val, tf_variables.Variable):
+        # Users may add extra weights/variables
+        # simply by assigning them to attributes (invalid for graph networks)
+        if not hasattr(self, '_trainable_weights'):
+          super(Layer, self).__setattr__('_trainable_weights', [])
+        if not hasattr(self, '_non_trainable_weights'):
+          super(Layer, self).__setattr__('_non_trainable_weights', [])
+        if val not in self._trainable_weights + self._non_trainable_weights:
+          if val.trainable:
+            self._trainable_weights.append(val)
+          else:
+            self._non_trainable_weights.append(val)
+          backend.track_variable(val)
+
+    super(Layer, self).__setattr__(name, value)
+
+  def _gather_children_attribute(self, attribute):
+    assert attribute in {'weights', 'trainable_weights',
+                         'non_trainable_weights', 'updates', 'losses'}
+    if hasattr(self, '_layers'):
+      return list(itertools.chain.from_iterable(
+          getattr(layer, attribute) for layer in self._layers))
+    return []
+
+  # This is a hack so that the is_layer (within
+  # training/trackable/layer_utils.py) check doesn't get the weights attr.
+  # TODO(b/110718070): Remove when fixed.
+  def _is_layer(self):
+    return True
 
 
 class Node(object):
@@ -1639,12 +1766,13 @@ class Node(object):
                input_tensors,
                output_tensors,
                arguments=None):
-    # Layer instance (NOT a list).
-    if isinstance(outbound_layer, list):
-      raise ValueError(
-          '`outbound_layer` should be a layer instance, not a list.')
-    # this is the layer that takes a list of input tensors
-    # and turns them into a list of output tensors.
+    # Layer instance (NOT a sequence)
+    if isinstance(outbound_layer, (list, tuple, dict)):
+      raise ValueError('`outbound_layer` should be a layer instance, '
+                       'not a list, tuple, or, dict.')
+
+    # this is the layer that takes a nested structure of input tensors
+    # and turns them into a nested structure of output tensors.
     # the current node will be added to
     # the inbound_nodes of outbound_layer.
     self.outbound_layer = outbound_layer
@@ -1654,33 +1782,33 @@ class Node(object):
     # and for each layer, which node and which
     # tensor output of each node.
 
-    # List of layer instances.
+    # Nested structure of layer instances.
     self.inbound_layers = inbound_layers
-    # List of integers, 1:1 mapping with inbound_layers.
+    # Nested structure of integers, 1:1 mapping with inbound_layers.
     self.node_indices = node_indices
-    # List of integers, 1:1 mapping with inbound_layers.
+    # Nested of integers, 1:1 mapping with inbound_layers.
     self.tensor_indices = tensor_indices
 
     # Following 2 properties:
     # tensor inputs and outputs of outbound_layer.
 
-    # List of tensors. 1:1 mapping with inbound_layers.
+    # Nested structure of tensors. 1:1 mapping with inbound_layers.
     self.input_tensors = input_tensors
-    # List of tensors, created by outbound_layer.call().
+    # Nested structure of tensors, created by outbound_layer.call().
     self.output_tensors = output_tensors
 
     # Following 2 properties: input and output shapes.
 
-    # List of shape tuples, shapes of input_tensors.
-    self.input_shapes = [backend.int_shape(x) for x in input_tensors]
-    # List of shape tuples, shapes of output_tensors.
-    self.output_shapes = [backend.int_shape(x) for x in output_tensors]
+    # Nested structure of shape tuples, shapes of input_tensors.
+    self.input_shapes = nest.map_structure(backend.int_shape, input_tensors)
+    # Nested structure of shape tuples, shapes of output_tensors.
+    self.output_shapes = nest.map_structure(backend.int_shape, output_tensors)
 
     # Optional keyword arguments to layer's `call`.
     self.arguments = arguments
 
     # Add nodes to all layers involved.
-    for layer in inbound_layers:
+    for layer in nest.flatten(inbound_layers):
       if layer is not None:
         # For compatibility with external Keras, we use the deprecated
         # accessor here.
@@ -1689,13 +1817,19 @@ class Node(object):
     # accessor here.
     outbound_layer.inbound_nodes.append(self)
 
+  def iterate_inbound(self):
+    """Returns a list of tuples representing the inbound data.
+
+    Returns:
+      List of tuples like: (inbound_layer, node_index, tensor_index, tensor).
+    """
+    return zip(
+        nest.flatten(self.inbound_layers), nest.flatten(self.node_indices),
+        nest.flatten(self.tensor_indices), nest.flatten(self.input_tensors))
+
   def get_config(self):
-    inbound_names = []
-    for layer in self.inbound_layers:
-      if layer:
-        inbound_names.append(layer.name)
-      else:
-        inbound_names.append(None)
+    inbound_names = nest.map_structure(
+        lambda layer: layer.name if layer else None, self.inbound_layers)
     return {
         'outbound_layer': self.outbound_layer.name,
         'inbound_layers': inbound_names,
@@ -1704,6 +1838,90 @@ class Node(object):
     }
 
 
+class TensorFlowOpLayer(Layer):
+  """Wraps a TensorFlow Operation in a Layer.
+
+  This class is used internally by the Functional API. When a user
+  uses a raw TensorFlow Operation on symbolic tensors originating
+  from an `Input` Layer, the resultant operation will be wrapped
+  with this Layer object in order to make the operation compatible
+  with the Keras API.
+
+  This Layer will create a new, identical operation (except for inputs
+  and outputs) every time it is called. If `run_eagerly` is `True`,
+  the op creation and calculation will happen inside an Eager function.
+
+  Instances of this Layer are created when `autolambda` is called, which
+  is whenever a Layer's `__call__` encounters symbolic inputs that do
+  not have Keras metadata, or when a Network's `__init__` encounters
+  outputs that do not have Keras metadata.
+
+  Attributes:
+    node_def: String, the serialized NodeDef of the Op this layer will wrap.
+    constants: Dict of NumPy arrays, the values of any Tensors needed for this
+      Operation that do not originate from a Keras `Input` Layer. Since all
+      placeholders must come from Keras `Input` Layers, these Tensors must be
+      treated as constant in the Functional API.
+    name: String, the name of the Layer.
+    trainable: Bool, whether this Layer is trainable. Currently Variables are
+      not supported, and so this parameter has no effect.
+    dtype: The default dtype of this Layer. Inherited from `Layer` and has no
+      effect on this class, however is used in `get_config`.
+  """
+
+  def __init__(self,
+               node_def,
+               constants=None,
+               name=None,
+               trainable=True,
+               dtype=None):
+    super(TensorFlowOpLayer, self).__init__(
+        name=name, trainable=trainable, dtype=dtype)
+    self.node_def = node_def_pb2.NodeDef.FromString(node_def)
+    self.constants = constants or {}
+    # Layer uses original op unless it is called on new inputs.
+    # This means `built` is not set in `__call__`.
+    self.built = True
+
+  def call(self, inputs):
+    if context.executing_eagerly():
+      return self._defun_call(inputs)
+    return self._make_op(inputs)
+
+  def _make_op(self, inputs):
+    inputs = nest.flatten(inputs)
+    graph = inputs[0].graph
+    with graph.as_default():
+      for index, constant in self.constants.items():
+        constant = ops.convert_to_tensor(constant)
+        inputs.insert(index, constant)
+
+      self.node_def.name = graph.unique_name(self.node_def.name)
+      # Check for case where first input should be a list of Tensors.
+      if 'N' in self.node_def.attr:
+        num_tensors = self.node_def.attr['N'].i
+        inputs = [inputs[:num_tensors]] + inputs[num_tensors:]
+      c_op = ops._create_c_op(graph, self.node_def, inputs, control_inputs=[])
+      op = graph._create_op_from_tf_operation(c_op)
+
+      if len(op.outputs) == 1:
+        return op.outputs[0]
+      return op.outputs
+
+  @function.defun
+  def _defun_call(self, inputs):
+    """Wraps the op creation method in an Eager function for `run_eagerly`."""
+    return self._make_op(inputs)
+
+  def get_config(self):
+    config = super(TensorFlowOpLayer, self).get_config()
+    config.update({
+        'node_def': self.node_def.SerializeToString(),
+        'constants': self.constants
+    })
+    return config
+
+
 def default(method):
   """Decorates a method to detect overrides in subclasses."""
   method._is_default = True
diff --git a/tensorflow/python/keras/engine/base_layer_test.py b/tensorflow/python/keras/engine/base_layer_test.py
index 798775b6a5b29aa72a2c766584811aa469db2471..b4748d130113ed218dfe5c8375cbd39af96f6e23 100644
--- a/tensorflow/python/keras/engine/base_layer_test.py
+++ b/tensorflow/python/keras/engine/base_layer_test.py
@@ -18,20 +18,34 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import sys
+import traceback
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.engine import base_layer
+from tensorflow.python.keras.optimizer_v2 import rmsprop
+from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
-from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
 
 class DynamicLayer1(base_layer.Layer):
 
+  def __init__(self, dynamic=False, **kwargs):
+    super(DynamicLayer1, self).__init__(dynamic=dynamic, **kwargs)
+
   def call(self, inputs):
     if math_ops.reduce_sum(inputs) > 0:
       return math_ops.sqrt(inputs)
@@ -44,6 +58,9 @@ class DynamicLayer1(base_layer.Layer):
 
 class DynamicLayer2(base_layer.Layer):
 
+  def __init__(self, dynamic=False, **kwargs):
+    super(DynamicLayer2, self).__init__(dynamic=dynamic, **kwargs)
+
   def call(self, inputs):
     samples = []
     for sample in inputs:
@@ -59,67 +76,295 @@ class InvalidLayer(base_layer.Layer):
   def call(self, inputs):
     raise ValueError('You did something wrong!')
 
-  def compute_output_shape(self, input_shape):
-    return input_shape
-
 
-class BaseLayerTest(test.TestCase):
+class BaseLayerTest(keras_parameterized.TestCase):
 
-  def test_dynamic_layer_in_functional_model_in_graph_mode(self):
+  @parameterized.parameters(DynamicLayer1, DynamicLayer2)
+  def test_dynamic_layer_in_functional_model_in_graph_mode(self, layer_class):
     with context.graph_mode():
       inputs = keras.Input((3,))
+      # Works when `dynamic=True` is declared.
+      outputs = layer_class(dynamic=True)(inputs)
+      model = keras.Model(inputs, outputs)
+      self.assertEqual(model.dynamic, True)
+      # But then you cannot run the model since you're in a graph scope.
       with self.assertRaisesRegexp(
-          TypeError, 'Using a `tf.Tensor` as a Python `bool` is not allowed'):
-        _ = DynamicLayer1()(inputs)
+          ValueError, 'You must enable eager execution'):
+        model.compile(rmsprop.RMSprop(0.001), loss='mse')
 
-      inputs = keras.Input((3,))
+      # Fails when `dynamic=True` not declared.
       with self.assertRaisesRegexp(
-          TypeError, 'Tensor objects are only iterable when eager'):
-        _ = DynamicLayer2()(inputs)
+          TypeError, 'attempting to use Python control flow'):
+        _ = layer_class()(inputs)
 
-  def test_dynamic_layer_in_functional_model_in_eager_mode(self):
+  @parameterized.parameters(DynamicLayer1, DynamicLayer2)
+  def test_dynamic_layer_in_functional_model_in_eager_mode(self, layer_class):
     inputs = keras.Input((3,))
-    outputs = DynamicLayer1()(inputs)
-    model = keras.Model(inputs, outputs)
-    self.assertEqual(model._static_graph_friendly, False)
-    model.compile(RMSPropOptimizer(0.001), loss='mse')
-    model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3)))
-
-    inputs = keras.Input((3,))
-    outputs = DynamicLayer2()(inputs)
+    # Fails when `dynamic=True` not declared.
+    with self.assertRaisesRegexp(
+        TypeError, 'attempting to use Python control flow'):
+      _ = layer_class()(inputs)
+    # Works when `dynamic=True` is declared.
+    outputs = layer_class(dynamic=True)(inputs)
     model = keras.Model(inputs, outputs)
-    self.assertEqual(model._static_graph_friendly, False)
-    model.compile(RMSPropOptimizer(0.001), loss='mse')
+    self.assertEqual(model.dynamic, True)
+    model.compile(rmsprop.RMSprop(0.001), loss='mse')
+    self.assertEqual(model.run_eagerly, True)
     model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3)))
 
-  def nested_dynamic_layers_in_eager_mode(self):
+  def test_nested_dynamic_layers_in_eager_mode(self):
     inputs = keras.Input((3,))
-    outputs = DynamicLayer1()(inputs)
+    outputs = DynamicLayer1(dynamic=True)(inputs)
     inner_model = keras.Model(inputs, outputs)
+    self.assertEqual(inner_model.dynamic, True)
 
     inputs = keras.Input((3,))
-    x = DynamicLayer2()(inputs)
+    x = DynamicLayer2(dynamic=True)(inputs)
     outputs = inner_model(x)
 
     model = keras.Model(inputs, outputs)
-    self.assertEqual(model._static_graph_friendly, False)
-    model.compile(RMSPropOptimizer(0.001), loss='mse')
+    self.assertEqual(model.dynamic, True)
+    model.compile(rmsprop.RMSprop(0.001), loss='mse')
+    self.assertEqual(model.run_eagerly, True)
     model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3)))
 
-  def test_invalid_forward_pass_in_graph_mode(self):
-    with context.graph_mode():
-      inputs = keras.Input((3,))
-      with self.assertRaisesRegexp(ValueError, 'You did something wrong!'):
-        _ = InvalidLayer()(inputs)
+  def test_dynamic_layers_in_sequential_model(self):
+    # Without input_shape argument
+    model = keras.Sequential([DynamicLayer1(dynamic=True),
+                              keras.layers.Dense(3),
+                              DynamicLayer2(dynamic=True)])
+    self.assertEqual(model.dynamic, True)
+    model.compile(rmsprop.RMSprop(0.001), loss='mse')
+    self.assertEqual(model.run_eagerly, True)
+    model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3)))
+
+    # With input_shape argument
+    model = keras.Sequential([DynamicLayer1(dynamic=True, input_shape=(3,)),
+                              DynamicLayer2(dynamic=True)])
+    self.assertEqual(model.dynamic, True)
+    model.compile(rmsprop.RMSprop(0.001), loss='mse')
+    self.assertEqual(model.run_eagerly, True)
+    model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3)))
+
+  def test_dynamic_layers_in_subclassed_model(self):
+
+    class MyModel(keras.Model):
+
+      def __init__(self):
+        super(MyModel, self).__init__()
+        self.layer1 = DynamicLayer1(dynamic=True)
+
+      def call(self, inputs):
+        return self.layer1(inputs)
+
+    model = MyModel()
+    self.assertEqual(model.dynamic, True)
+    model.compile(rmsprop.RMSprop(0.001), loss='mse')
+    self.assertEqual(model.run_eagerly, True)
+    model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3)))
+
+  def test_dynamic_subclassed_model_no_shape_inference(self):
+
+    class MyModel(keras.Model):
+
+      def __init__(self):
+        super(MyModel, self).__init__(dynamic=True)
+        self.layer1 = keras.layers.Dense(3)
+        self.layer2 = keras.layers.Dense(3)
+
+      def call(self, inputs):
+        if math_ops.reduce_sum(inputs) > 0:
+          return self.layer1(inputs)
+        else:
+          return self.layer2(inputs)
+
+    model = MyModel()
+    self.assertEqual(model.dynamic, True)
+    model.compile(rmsprop.RMSprop(0.001), loss='mse')
+    self.assertEqual(model.run_eagerly, True)
+    model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3)))
+    self.assertEqual(model.outputs, [None])
+
+  def test_dynamic_subclassed_model_with_shape_inference(self):
 
-  def test_invalid_forward_pass_in_eager_mode(self):
+    class MyModel(keras.Model):
+
+      def __init__(self):
+        super(MyModel, self).__init__(dynamic=True)
+        self.layer1 = keras.layers.Dense(3)
+        self.layer2 = keras.layers.Dense(3)
+
+      def call(self, inputs):
+        if math_ops.reduce_sum(inputs) > 0:
+          return self.layer1(inputs)
+        else:
+          return self.layer2(inputs)
+
+      def compute_output_shape(self, input_shape):
+        return tensor_shape.TensorShape(
+            tuple(input_shape[:-1].as_list()) + (3,))
+
+    model = MyModel()
+    self.assertEqual(model.dynamic, True)
+    model.compile(rmsprop.RMSprop(0.001), loss='mse')
+    model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3)))
+    self.assertEqual(model.outputs[0].shape.as_list(), [None, 3])
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_invalid_forward_pass(self):
     inputs = keras.Input((3,))
-    outputs = InvalidLayer()(inputs)
-    model = keras.Model(inputs, outputs)
-    self.assertEqual(model._static_graph_friendly, False)
-    model.compile(RMSPropOptimizer(0.001), loss='mse')
     with self.assertRaisesRegexp(ValueError, 'You did something wrong!'):
-      model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3)))
+      _ = InvalidLayer()(inputs)
+
+  @keras_parameterized.run_with_all_model_types
+  @test_util.run_in_graph_and_eager_modes
+  def test_build_with_numpy_data(self):
+    model_layers = [
+        keras.layers.Dense(3, activation='relu', kernel_initializer='ones'),
+        keras.layers.Dense(1, activation='sigmoid', kernel_initializer='ones')
+    ]
+    model = testing_utils.get_model_from_layers(model_layers, input_shape=(4,))
+    model(np.zeros((2, 4), dtype='float32'))
+    self.assertTrue(model.built)
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_default_add_weight(self):
+
+    class TestLayer(keras.layers.Layer):
+
+      def __init__(self):
+        super(TestLayer, self).__init__()
+        self.default_weight = self.add_weight()
+        self.weight_without_name = self.add_weight(shape=(3, 4))
+        self.regularized_weight_without_name = self.add_weight(
+            shape=(3, 4), regularizer='l2')
+
+    layer = TestLayer()
+    self.assertEqual(layer.default_weight.shape.as_list(), [])
+    self.assertEqual(layer.weight_without_name.shape.as_list(), [3, 4])
+    self.assertEqual(layer.default_weight.dtype.name, 'float32')
+    self.assertEqual(layer.weight_without_name.dtype.name, 'float32')
+    self.assertEqual(len(layer.losses), 1)
+    if not context.executing_eagerly():
+      # Cannot access tensor.name in eager execution.
+      self.assertTrue('Variable_2/Regularizer' in layer.losses[0].name)
+
+  def test_learning_phase_freezing_for_layers(self):
+    # This test is only meant to run in graph functions mode (ambient eager).
+    # In forced eager, `model.predict` ignores the global learning phase
+    # and just uses training=False. TODO(fchollet): consider unifying the
+    # behaviors.
+
+    class LearningPhaseLayer(keras.layers.Layer):
+
+      def call(self, inputs):
+        return keras.backend.in_train_phase(
+            lambda: array_ops.ones_like(inputs),
+            lambda: array_ops.zeros_like(inputs))
+
+    def get_learning_phase_value():
+      model = keras.models.Sequential([LearningPhaseLayer(input_shape=(1,))])
+      return np.sum(model.predict(np.ones((1, 1))))
+
+    self.assertEqual(get_learning_phase_value(), 0)
+
+    # Test scope.
+    with keras.backend.learning_phase_scope(1):
+      self.assertEqual(get_learning_phase_value(), 1)
+
+    # The effects of the scope end after exiting it.
+    self.assertEqual(get_learning_phase_value(), 0)
+
+    # Test setting.
+    keras.backend.set_learning_phase(1)
+    self.assertEqual(get_learning_phase_value(), 1)
+    keras.backend.set_learning_phase(0)
+    self.assertEqual(get_learning_phase_value(), 0)
+
+  # Cannot be enabled with `run_eagerly=True`, see b/123904578
+  @test_util.run_all_in_graph_and_eager_modes
+  def test_layer_can_return_variable(self):
+
+    class ComputeSum(keras.layers.Layer):
+
+      def __init__(self):
+        super(ComputeSum, self).__init__()
+        self.total = variables.Variable(
+            initial_value=array_ops.zeros((1, 1)), trainable=False)
+        if not context.executing_eagerly():
+          keras.backend.get_session().run(self.total.initializer)
+
+      def call(self, inputs):
+        self.total.assign_add(inputs)
+        return self.total
+
+    inputs = keras.Input(shape=(1,))
+    model = keras.Model(inputs, ComputeSum()(inputs))
+    model.predict(np.ones((1, 1)))
+
+  def _get_layer_with_training_arg(self):
+
+    class TrainingLayer(keras.layers.Layer):
+      """A layer with a `training` argument in a defuned `call`."""
+
+      @def_function.function
+      def call(self, inputs, training=None):
+        if training is None:
+          training = keras.backend.learning_phase()
+        return tf_utils.smart_cond(training,
+                                   lambda: array_ops.ones_like(inputs),
+                                   lambda: array_ops.zeros_like(inputs))
+
+    return TrainingLayer()
+
+  @keras_parameterized.run_with_all_model_types
+  # b/124459427: can't test with `run_eagerly=True` for now.
+  @test_util.run_in_graph_and_eager_modes
+  def test_training_arg_in_defun(self):
+    layer = self._get_layer_with_training_arg()
+    model = testing_utils.get_model_from_layers([layer], input_shape=(1,))
+    model.compile(rmsprop.RMSprop(0.),
+                  loss='mae')
+    history = model.fit(np.zeros((1, 1)), np.zeros((1, 1)))
+    self.assertEqual(history.history['loss'][0], 1.)
+    loss = model.evaluate(np.zeros((1, 1)), np.zeros((1, 1)))
+    self.assertEqual(loss, 0.)
+
+    # Test that the argument injection performed in `call` is not active
+    # when the argument is passed explicitly.
+    layer = self._get_layer_with_training_arg()
+    inputs = keras.Input(shape=(1,))
+    # Pass `training` by name
+    outputs = layer(inputs, training=False)
+    model = keras.Model(inputs, outputs)
+    model.compile(rmsprop.RMSprop(0.),
+                  loss='mae')
+    history = model.fit(np.zeros((1, 1)), np.zeros((1, 1)))
+    self.assertEqual(history.history['loss'][0], 0.)
+
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_raw_variable_assignment(self):
+
+    class RawVariableLayer(keras.layers.Layer):
+
+      def __init__(self, **kwargs):
+        super(RawVariableLayer, self).__init__(**kwargs)
+        # Test variables in nested structure.
+        self.var_list = [variables.Variable(1.), {'a': variables.Variable(2.)}]
+
+      def call(self, inputs):
+        return inputs * self.var_list[0] * self.var_list[1]['a']
+
+    model = testing_utils.get_model_from_layers([RawVariableLayer()],
+                                                input_shape=(10,))
+    model.compile('sgd', 'mse', run_eagerly=testing_utils.should_run_eagerly())
+    x, y = np.ones((10, 10)), np.ones((10, 10))
+    # Checks that variables get initialized.
+    model.fit(x, y, batch_size=2, epochs=2)
+
+
+class SymbolicSupportTest(test.TestCase):
 
   def test_using_symbolic_tensors_with_tf_ops(self):
     # Single-input.
@@ -149,37 +394,184 @@ class BaseLayerTest(test.TestCase):
     with ops.Graph().as_default():
       x1 = array_ops.ones((3, 3))
     x2 = array_ops.ones((3, 3))
-    self.assertTrue(isinstance(x2, ops.EagerTensor))
-    with self.assertRaisesRegexp(TypeError,
-                                 'provided list of inputs contains '
-                                 'objects other than \'EagerTensor\''):
+    self.assertIsInstance(x2, ops.EagerTensor)
+    with self.assertRaisesRegexp(TypeError, 'Graph tensors'):
       math_ops.matmul(x1, x2)
 
   def test_mixing_numpy_arrays_and_graph_tensors(self):
     with ops.Graph().as_default():
       x1 = array_ops.ones((3, 3))
     x2 = np.ones((3, 3), dtype='float32')
-    with self.assertRaisesRegexp(TypeError,
-                                 'provided list of inputs contains '
-                                 'objects other than \'EagerTensor\''):
+    with self.assertRaisesRegexp(TypeError, 'Graph tensors'):
       math_ops.matmul(x1, x2)
 
+  @test_util.run_in_graph_and_eager_modes
   def test_mixing_keras_symbolic_tensors_and_eager_tensors(self):
     x1 = keras.Input((3,))
     x2 = array_ops.ones((3, 3))
-    with self.assertRaisesRegexp(
-        TypeError,
-        'mix computation of symbolic Tensors'):
-      math_ops.matmul(x1, x2)
-
+    y = math_ops.matmul(x1, x2)
+    self.assertEqual(y.graph, keras.backend.get_graph())
+    fn = keras.backend.function(inputs=[x1], outputs=[y])
+    x_val = np.random.random((3, 3))
+    y_val = np.ones((3, 3))
+    self.assertAllClose(fn([x_val])[0],
+                        np.matmul(x_val, y_val),
+                        atol=1e-5)
+
+  @test_util.run_in_graph_and_eager_modes
   def test_mixing_keras_symbolic_tensors_and_numpy_arrays(self):
-    # For the time being we treat Numpy arrays as EagerTensors when mixing both.
     x1 = keras.Input((3,))
     x2 = np.ones((3, 3), dtype='float32')
-    with self.assertRaisesRegexp(
-        TypeError,
-        'mix computation of symbolic Tensors'):
-      math_ops.matmul(x1, x2)
+    y = math_ops.matmul(x1, x2)
+    self.assertEqual(y.graph, keras.backend.get_graph())
+    fn = keras.backend.function(inputs=[x1], outputs=[y])
+    x_val = np.random.random((3, 3))
+    y_val = np.ones((3, 3))
+    self.assertAllClose(fn([x_val])[0],
+                        np.matmul(x_val, y_val),
+                        atol=1e-5)
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_reraising_exception(self):
+    # When layer is not dynamic, we have some pattern matching during exception
+    # handling to detect when the user is trying to use python control flow.
+    # When an exception is thrown but the pattern doesn't match, we want to
+    # preserve the originating stack trace. An early implementation of this
+    # logic lost the stack trace. We test the correct behavior here.
+
+    class TypeErrorLayer(base_layer.Layer):
+
+      def call(self, inputs):
+        def easily_identifiable_name():
+          raise TypeError('Non-matching TypeError message.')
+        easily_identifiable_name()
+
+    inputs = keras.Input((3,))
+
+    try:
+      _ = TypeErrorLayer()(inputs)
+    except TypeError:
+      tb = traceback.extract_tb(sys.exc_info()[2])
+      last_entry = tb[-1]
+      function_name = last_entry[2]
+      self.assertEqual(function_name, 'easily_identifiable_name')
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class NestedTrackingTest(test.TestCase):
+
+  def test_nested_layer_variable_tracking(self):
+    # Test that variables from nested sublayers are
+    # being tracked by subclassed layers.
+
+    class MyLayer(keras.layers.Layer):
+
+      def __init__(self):
+        super(MyLayer, self).__init__()
+        self.dense1 = keras.layers.Dense(1)
+        self.dense2 = keras.layers.BatchNormalization()
+
+      def build(self, input_shape):
+        self.v1 = self.add_weight('v1', shape=input_shape[1:].as_list())
+        self.v2 = variables.Variable(
+            name='v2',
+            initial_value=np.zeros(input_shape[1:].as_list(), dtype='float32'),
+            trainable=False)
+
+      def call(self, inputs):
+        x = self.dense1(inputs) + self.dense2(inputs)
+        return x + self.v1 + self.v2
+
+    layer = MyLayer()
+    inputs = keras.Input((1,))
+    _ = layer(inputs)
+
+    self.assertEqual(len(layer.weights), 8)
+    self.assertEqual(len(layer.trainable_weights), 5)
+    self.assertEqual(len(layer.non_trainable_weights), 3)
+
+    layer.dense1.trainable = False
+    self.assertEqual(len(layer.weights), 8)
+    self.assertEqual(len(layer.trainable_weights), 3)
+    self.assertEqual(len(layer.non_trainable_weights), 5)
+
+    layer.trainable = False
+    self.assertEqual(len(layer.weights), 8)
+    self.assertEqual(len(layer.trainable_weights), 0)
+    self.assertEqual(len(layer.non_trainable_weights), 8)
+    self.assertEqual(
+        set([layer.dense1, layer.dense2, layer.v1, layer.v2]),
+        set([obj for unused_name, obj in layer._checkpoint_dependencies]))
+
+  def test_nested_layer_updates_losses_tracking(self):
+    # Test that updates and losses from nested sublayers are
+    # being tracked by subclassed layers.
+
+    class UpdateAndLossLayer(keras.layers.Layer):
+
+      def build(self, _):
+        self.v1 = self.add_weight('v1', shape=())
+
+      def call(self, inputs):
+        self.add_loss(math_ops.reduce_sum(inputs))
+        self.add_update(state_ops.assign_add(self.v1, 1))
+        return inputs + 1
+
+    class MyLayer(keras.layers.Layer):
+
+      def build(self, _):
+        self.v1 = self.add_weight('v1', shape=())
+
+      def __init__(self):
+        super(MyLayer, self).__init__()
+        self.ul1 = UpdateAndLossLayer()
+        self.ul2 = UpdateAndLossLayer()
+
+      def call(self, inputs):
+        self.add_loss(math_ops.reduce_sum(inputs))
+        self.add_update(state_ops.assign_add(self.v1, 1))
+        x = self.ul1(inputs)
+        return self.ul2(x)
+
+    layer = MyLayer()
+
+    if context.executing_eagerly():
+      inputs = array_ops.ones((3, 1))
+      _ = layer(inputs)
+      self.assertEqual(len(layer.losses), 3)
+    else:
+      inputs = keras.Input((1,))
+      _ = layer(inputs)
+      self.assertEqual(len(layer.losses), 3)
+      self.assertEqual(len(layer.updates), 3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class NameScopingTest(keras_parameterized.TestCase):
+
+  def test_name_scope_layer(self):
+    x = keras.backend.placeholder(shape=(10, 10))
+    layer = keras.layers.Dense(10, name='MyName')
+    layer(x)
+    self.assertEqual(layer.bias.name, 'MyName/bias:0')
+    self.assertEqual(layer.kernel.name, 'MyName/kernel:0')
+
+  def test_name_scope_sublayer(self):
+    x = keras.backend.placeholder(shape=(10, 10))
+    layer = keras.layers.Dense(
+        10, activation=keras.layers.ReLU(name='MyAct'), name='MyName2')
+    y = layer(x)
+    self.assertEqual(layer.bias.name, 'MyName2/bias:0')
+    self.assertEqual(layer.kernel.name, 'MyName2/kernel:0')
+    self.assertEqual(y.name, 'MyName2/MyAct/Relu:0')
+
+  def test_name_scope_tf_tensor(self):
+    x = ops.convert_to_tensor(np.ones((10, 10)))
+    layer = keras.layers.Dense(
+        10, activation=keras.layers.ReLU(name='MyAct'), name='MyName3')
+    layer(x)
+    self.assertEqual(layer.bias.name, 'MyName3/bias:0')
+    self.assertEqual(layer.kernel.name, 'MyName3/kernel:0')
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/engine/base_layer_utils.py b/tensorflow/python/keras/engine/base_layer_utils.py
index d2f947f17723fbb01280d7ef09f327dd64fc938e..19143dbbdf0f463936c6cb6ecec7a1d769feac18 100644
--- a/tensorflow/python/keras/engine/base_layer_utils.py
+++ b/tensorflow/python/keras/engine/base_layer_utils.py
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 import collections as collections_lib
+import threading
 import enum
 
 from tensorflow.python.framework import dtypes
@@ -25,8 +26,12 @@ from tensorflow.python.framework import ops
 from tensorflow.python.keras import backend
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import init_ops_v2
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.util import nest
+from tensorflow.python.util import tf_contextlib
+
+_call_context = threading.local()
 
 
 class CallConvention(enum.Enum):
@@ -55,7 +60,6 @@ def make_variable(name,
                   shape=None,
                   dtype=dtypes.float32,
                   initializer=None,
-                  partition_info=None,
                   trainable=None,
                   caching_device=None,
                   validate_shape=True,
@@ -72,18 +76,16 @@ def make_variable(name,
   that has fewer constraints (`variable_scope.variable()`).
 
   In the longer term, it seems like a similar "default variable creator" method
-  should exist in `CheckpointableBase` instead. When this happens, we can get
+  should exist in `Trackable` instead. When this happens, we can get
   rid of this temporary solution.
 
   TODO(fchollet): remove this method when no longer needed.
-  TODO(fchollet): handle `partitioner` argument.
 
   Arguments:
     name: Variable name.
     shape: Variable shape.
     dtype: The type of the variable. Defaults to `self.dtype` or `float32`.
     initializer: Initializer instance (callable).
-    partition_info: Not handled at this time.
     trainable: Whether the variable should be part of the layer's
       "trainable_variables" (e.g. variables, biases)
       or "non_trainable_variables" (e.g. BatchNorm mean, stddev).
@@ -123,8 +125,9 @@ def make_variable(name,
       # Instantiate initializer if provided initializer is a type object.
       if isinstance(initializer, type(init_ops.Initializer)):
         initializer = initializer(dtype=dtype)
-      init_val = lambda: initializer(  # pylint: disable=g-long-lambda
-          shape, dtype=dtype, partition_info=partition_info)
+      elif isinstance(initializer, type(init_ops_v2.Initializer)):
+        initializer = initializer()
+      init_val = lambda: initializer(shape, dtype=dtype)
       variable_dtype = dtype.base_dtype
   if use_resource is None:
     use_resource = True
@@ -206,31 +209,184 @@ def collect_previous_mask(input_tensors):
   """Retrieves the output mask(s) of the previous node.
 
   Arguments:
-      input_tensors: A tensor or list of tensors.
+      input_tensors: An arbitrary structure of Tensors.
 
   Returns:
       A mask tensor or list of mask tensors.
   """
-  input_tensors = nest.flatten(input_tensors)
-  masks = []
-  for x in input_tensors:
-    if hasattr(x, '_keras_mask'):
-      mask = x._keras_mask  # pylint: disable=protected-access
-      masks.append(mask)
-    else:
-      masks.append(None)
-  if len(masks) == 1:
-    return masks[0]
-  return masks
+
+  def _collect_previous_mask(x):
+    return getattr(x, '_keras_mask', None)
+
+  return nest.map_structure(_collect_previous_mask, input_tensors)
 
 
-def have_all_keras_metadata(iterable_or_element):
-  if not isinstance(iterable_or_element, (list, tuple)):
-    iterable = [iterable_or_element]
-  else:
-    iterable = nest.flatten(iterable_or_element)
-  return all(hasattr(x, '_keras_history') for x in iterable)
+def have_all_keras_metadata(tensors):
+  return all(hasattr(x, '_keras_history') for x in nest.flatten(tensors))
 
 
 def generate_placeholders_from_shape(shape):
   return array_ops.placeholder(shape=shape, dtype=backend.floatx())
+
+
+def create_keras_history(tensors):
+  """Wraps TensorFlow Operations for compatibility with the Functional API.
+
+  This method checks to see if a Tensor in `tensors` is missing Keras metadata
+  and has its origin in a Keras `Input` Layer. If so, this method will replace
+  the raw TensorFlow Operations that created this tensor with
+  `TensorFlowOpLayer` instances that create identical operations.
+
+  Any Tensors not originating from a Keras `Input` Layer will be treated as
+  constants when constructing `TensorFlowOpLayer` instances.
+
+  Arguments:
+    tensors: A structure of Tensors, some of which come from raw TensorFlow
+      operations and need to have Keras metadata assigned to them.
+  """
+  _create_keras_history_helper(tensors, set())
+
+
+def _create_keras_history_helper(tensors, processed_ops=None):
+  """Helper method for `create_keras_history`.
+
+  Arguments:
+    tensors: A structure of Tensors for which to create Keras metadata.
+    processed_ops: Set. TensorFlow operations that have already been wrapped
+      in `TensorFlowOpLayer` instances.
+
+  Returns:
+    The updated set of TensorFlow Operations that have been wrapped
+    in `TensorFlowOpLayer` instances.
+  """
+  # Import of `base_layer` needed in order to create `TensorFlowOpLayer`.
+  # Cannot be imported at top because of circular dependencies.
+  # TODO(omalleyt): Resolve circular dependency.
+  from tensorflow.python.keras.engine import base_layer  # pylint: disable=g-import-not-at-top
+  tensor_list = nest.flatten(tensors)
+  for tensor in tensor_list:
+    if getattr(tensor, '_keras_history', None) is not None:
+      continue
+    op = tensor.op  # The Op that created this Tensor.
+    if op not in processed_ops:
+      # Recursively set `_keras_history`.
+      op_inputs = list(op.inputs)
+      constants = {}
+      layer_inputs = []
+      for i, op_input in enumerate(op_inputs):
+        if uses_keras_history(op_input):
+          layer_inputs.append(op_input)
+        else:
+          # Treat any value not originating from a `keras.Input` as
+          # a constant (Variables currently have `Placeholder` op type
+          # when originating from an eager context
+          # so can't be supported.
+          constants[i] = backend.function([], op_input)([])
+      processed_ops = _create_keras_history_helper(layer_inputs, processed_ops)
+      name = op.name
+      node_def = op.node_def.SerializeToString()
+      op_layer = base_layer.TensorFlowOpLayer(
+          node_def, constants=constants, name=name)
+      op_layer._add_inbound_node(  # pylint: disable=protected-access
+          layer_inputs, op.outputs)
+      processed_ops.update([op])
+  return processed_ops
+
+
+def needs_keras_history(tensors):
+  """Check if any Tensors need to be wrapped in TensorFlowOpLayers.
+
+  This will never return True inside a sublayer, because sublayers
+  do not need to create Keras History. Otherwise, this returns True
+  if one or more of `tensors` originates from a `keras.Input` and
+  does not have `_keras_history` set.
+
+  Arguments:
+    tensors: An arbitrary nested structure of Tensors.
+
+  Returns:
+    Bool, whether at least one Tensor needs to be wrapped.
+  """
+  input_tensors = nest.flatten(tensors)
+  if getattr(_call_context, 'in_call', False) or all(
+      getattr(tensor, '_keras_history', None) is not None
+      for tensor in input_tensors):
+    # KerasHistory already set.
+    return False
+  return uses_keras_history(tensors)
+
+
+def uses_keras_history(tensors):
+  """Check if at least one Tensor originates from a `keras.Input`.
+
+  This is `True` if at least one Tensor has its origin in a `keras.Input`.
+  Any Tensor that originates from a `keras.Input` will have a dependency
+  Tensor with a `_keras_history` attribute attached. Tensors that have
+  already been checked to not originate from a `keras.Input`
+  are marked as `_keras_history_checked`.
+
+  Arguments:
+    tensors: An arbitrary nested structure of Tensors.
+
+  Returns:
+    Bool, whether at least one Tensor originates from a `keras.Input`.
+  """
+  checked_tensors = set()
+  tensors_to_check = nest.flatten(tensors)
+
+  while tensors_to_check:
+    new_tensors_to_check = set()
+    for tensor in tensors_to_check:
+      if getattr(tensor, '_keras_history_checked', None) is not None:
+        continue
+      if getattr(tensor, '_keras_history', None) is not None:
+        return True
+
+      try:
+        new_tensors_to_check.update(tensor.op.inputs)
+      except AttributeError:
+        # In case `tensor` is a Variable created in an Eager context.
+        pass
+
+    checked_tensors.update(tensors_to_check)
+    tensors_to_check = list(new_tensors_to_check - checked_tensors)
+
+  # Mark that these Tensors have been checked once for `_keras_history`,
+  # and should not be checked again for performance reasons.
+  mark_checked(tensors)
+  return False
+
+
+def mark_checked(tensors):
+  """Marks that these Tensors should not be tracked.
+
+  This prevents Layers from attempting to create TensorFlowOpLayers
+  for these Tensors.
+
+  Arguments:
+    tensors: An arbitrary structure of Tensors.
+  """
+
+  def _mark_checked(tensor):
+    tensor._keras_history_checked = True  # pylint: disable=protected-access
+
+  nest.map_structure(_mark_checked, tensors)
+
+
+@tf_contextlib.contextmanager
+def call_context():
+  """Scope that marks when we are currently inside a Layer/Model's `call`."""
+  was_in_call = getattr(_call_context, 'in_call', False)
+  _call_context.in_call = True
+  try:
+    yield
+  finally:
+    _call_context.in_call = was_in_call
+
+
+def training_arg_passed_to_call(argspec, args, kwargs):
+  """Returns whether a user passed the `training` argument in `__call__`."""
+  # `argspec.args` starts with ['self', 'inputs']
+  full_args = dict(zip(argspec.args[2:], args))
+  full_args.update(kwargs)
+  return 'training' in full_args
diff --git a/tensorflow/python/keras/engine/correctness_test.py b/tensorflow/python/keras/engine/correctness_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..68634235d1b5731d4359ef0796eaa28eeb9ca002
--- /dev/null
+++ b/tensorflow/python/keras/engine/correctness_test.py
@@ -0,0 +1,153 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for numerical correctness."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python import keras
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.platform import test
+
+
+class Bias(keras.layers.Layer):
+  """Layer that add a bias to its inputs."""
+
+  def build(self, input_shape):
+    self.bias = self.add_variable('bias', (1,), initializer='zeros')
+
+  def call(self, inputs):
+    return inputs + self.bias
+
+
+class MultiInputSubclassed(keras.Model):
+  """Subclassed Model that adds its inputs and then adds a bias."""
+
+  def __init__(self):
+    super(MultiInputSubclassed, self).__init__()
+    self.add = keras.layers.Add()
+    self.bias = Bias()
+
+  def call(self, inputs):
+    added = self.add(inputs)
+    return self.bias(added)
+
+
+def multi_input_functional():
+  """Functional Model that adds its inputs and then adds a bias."""
+  input_1 = keras.Input(shape=(1,))
+  input_2 = keras.Input(shape=(1,))
+  input_3 = keras.Input(shape=(1,))
+  added = keras.layers.Add()([input_1, input_2, input_3])
+  output = Bias()(added)
+  return keras.Model([input_1, input_2, input_3], output)
+
+
+@keras_parameterized.run_with_all_model_types
+@keras_parameterized.run_all_keras_modes
+class SimpleBiasTest(keras_parameterized.TestCase):
+
+  def _get_simple_bias_model(self):
+    model = testing_utils.get_model_from_layers([Bias()], input_shape=(1,))
+    model.compile(
+        keras.optimizer_v2.gradient_descent.SGD(0.1),
+        'mae',
+        run_eagerly=testing_utils.should_run_eagerly())
+    return model
+
+  def test_simple_bias_fit(self):
+    x = np.array([[0.], [1.], [2.]])
+    y = np.array([[0.5], [2.], [3.5]])
+    model = self._get_simple_bias_model()
+
+    history = model.fit(x, y, batch_size=3, epochs=5)
+    self.assertAllClose(history.history['loss'], [1., 0.9, 0.8, 0.7, 0.6])
+
+  def test_simple_bias_evaluate(self):
+    x = np.array([[0.], [1.], [2.]])
+    y = np.array([[1.], [3.], [5.]])
+    model = self._get_simple_bias_model()
+
+    loss = model.evaluate(x, y, batch_size=1)
+    self.assertAlmostEqual(loss, 2.)
+
+  def test_simple_bias_predict(self):
+    x = np.array([[0.], [1.], [2.]])
+    model = self._get_simple_bias_model()
+
+    pred = model.predict(x, batch_size=1)
+    self.assertAllClose(x, pred)
+
+
+@keras_parameterized.run_all_keras_modes
+class MultipleInputTest(keras_parameterized.TestCase):
+
+  def _get_multiple_input_model(self, subclassed=True):
+    if subclassed:
+      model = MultiInputSubclassed()
+    else:
+      model = multi_input_functional()
+    model.compile(
+        keras.optimizer_v2.gradient_descent.SGD(0.1),
+        'mae',
+        run_eagerly=testing_utils.should_run_eagerly())
+    return model
+
+  @parameterized.named_parameters(('subclassed', True), ('functional', False))
+  def test_multiple_input_fit(self, subclassed):
+    x = [
+        np.array([[1.], [2.], [3.]]),
+        np.array([[4.], [5.], [6.]]),
+        np.array([[7.], [8.], [9.]])
+    ]
+    y = np.array([[12.5], [16.], [19.5]])
+
+    model = self._get_multiple_input_model(subclassed)
+    history = model.fit(x, y, batch_size=3, epochs=5)
+    self.assertAllClose(history.history['loss'], [1., 0.9, 0.8, 0.7, 0.6])
+
+  @parameterized.named_parameters(('subclassed', True), ('functional', False))
+  def test_multiple_input_evaluate(self, subclassed):
+    x = [
+        np.array([[1.], [2.], [3.]]),
+        np.array([[4.], [5.], [6.]]),
+        np.array([[7.], [8.], [9.]])
+    ]
+    y = np.array([[13.], [17.], [21.]])
+
+    model = self._get_multiple_input_model(subclassed)
+    loss = model.evaluate(x, y, batch_size=3)
+    self.assertAlmostEqual(loss, 2.)
+
+  @parameterized.named_parameters(('subclassed', True), ('functional', False))
+  def test_multiple_input_predict(self, subclassed):
+    x = [
+        np.array([[1.], [2.], [3.]]),
+        np.array([[4.], [5.], [6.]]),
+        np.array([[7.], [8.], [9.]])
+    ]
+
+    model = self._get_multiple_input_model(subclassed)
+    pred = model.predict(x, batch_size=1)
+    self.assertAllClose(pred, [[12.], [15.], [18.]])
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/engine/distributed_training_utils.py b/tensorflow/python/keras/engine/distributed_training_utils.py
index 32129afe64761048ed219a4e0caaae19292b9bc4..0c9fdbb54c00d0aa597d5d8979bec27ddb6eb0aa 100644
--- a/tensorflow/python/keras/engine/distributed_training_utils.py
+++ b/tensorflow/python/keras/engine/distributed_training_utils.py
@@ -13,27 +13,34 @@
 # limitations under the License.
 # ==============================================================================
 """Utilities related to distributed training."""
+# pylint:disable=protected-access
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.client import session as session_module
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.distribute import distribute_coordinator_context as dc_context
-from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import reduce_util
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import callbacks
+from tensorflow.python.keras import metrics as metrics_module
+from tensorflow.python.keras import optimizers
+from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.optimizer_v2 import optimizer_v2
-from tensorflow.python.ops import array_ops
+from tensorflow.python.keras.utils.mode_keys import ModeKeys
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
+from tensorflow.python.util import tf_contextlib
 
 
 def set_weights(distribution_strategy, dist_model, weights):
@@ -61,7 +68,7 @@ def set_weights(distribution_strategy, dist_model, weights):
     weights = weights[num_param:]
 
   if not ops.executing_eagerly_outside_functions():
-    K.get_session().run(assign_ops)
+    K.get_session(assign_ops).run(assign_ops)
 
 
 def unwrap_values(distribution_strategy, grouped_inputs, grouped_outputs,
@@ -98,7 +105,7 @@ def unwrap_values(distribution_strategy, grouped_inputs, grouped_outputs,
                                         grouped_inputs)
   if with_loss_tensor:
     # reduce loss tensor before adding it to the list of fetches
-    loss = distribution_strategy.reduce(distribute_lib.get_loss_reduction(),
+    loss = distribution_strategy.reduce(reduce_util.ReduceOp.SUM,
                                         grouped_outputs[0])
     all_outputs = flatten_perdevice_values(distribution_strategy,
                                            grouped_outputs[1:])
@@ -152,14 +159,12 @@ def flatten_perdevice_values(distribution_strategy, perdevice_values):
           for e in distribution_strategy.unwrap(flattened)]
 
 
-def validate_callbacks(input_callbacks, optimizer, current_strategy):
+def validate_callbacks(input_callbacks, optimizer):
   """Validate whether given callbacks are supported by DistributionStrategy.
 
   Args:
     input_callbacks: List of callbacks passed by the user to fit.
     optimizer: Optimizer instance used to train the model.
-    current_strategy: The DistributionStrategy used to distribute training
-      and validation.
 
   Raises:
     ValueError: If `LearningRateScheduler` or `ReduceLROnPlateau` is one of the
@@ -183,12 +188,6 @@ def validate_callbacks(input_callbacks, optimizer, current_strategy):
                         '`_grouped_model` attribute of your original model.')
       if isinstance(callback, (callbacks.LearningRateScheduler,
                                callbacks.ReduceLROnPlateau)):
-        strategy_name = current_strategy.__class__.__name__
-        # TODO(anjalisridhar): We might need to add a condition for multi
-        # worker strategy when we support it in Keras.
-        if is_tpu_strategy(current_strategy):
-          raise ValueError('%s callback is not supported with %s.' %
-                           (callback, strategy_name))
 
         if not isinstance(optimizer, optimizer_v2.OptimizerV2):
           raise ValueError('You must specify a Keras Optimizer V2 when using '
@@ -198,14 +197,14 @@ def validate_callbacks(input_callbacks, optimizer, current_strategy):
       # features of the callback that involve accessing model attributes and
       # running ops.
       if isinstance(callback, callbacks.TensorBoard):
-        if callback.__getattribute__('histogram_freq'):
+        if getattr(callback, 'histogram_freq', False):
           logging.warning(
               UserWarning(
                   '`histogram_freq` in the TensorBoard callback is not '
                   'supported when using DistributionStrategy. Setting '
                   '`histogram_freq` to `0`.'))
           callback.histogram_freq = 0
-        if callback.__getattribute__('write_grads'):
+        if getattr(callback, 'write_grads', False):
           logging.warning(
               UserWarning(
                   '`write_grads` in the TensorBoard callback is not supported '
@@ -346,41 +345,13 @@ def init_restore_or_wait_for_variables():
   session = K._get_session()  # pylint: disable=protected-access
   worker_context = dc_context.get_current_worker_context()
   if not worker_context or worker_context.experimental_should_init:
-    # TODO(yuefengz): if checkpoints exit, restore from checkpoint.
+    # TODO(yuefengz): if checkpoints exist, restore from checkpoint.
     K._initialize_variables(session)  # pylint: disable=protected-access
   else:
     _wait_for_variable_initialization(session)
 
 
-def configure_and_create_session(distribution_strategy):
-  """Configure session config and create a session with it."""
-  # TODO(priyag): Throw error if a session already exists.
-  session_config = K.get_default_session_config()
-
-  if is_tpu_strategy(distribution_strategy):
-    # TODO(priyag, yuefengz): Remove this workaround when Distribute
-    # Coordinator is integrated with keras and we can create a session from
-    # there.
-    distribution_strategy.configure(session_config)
-    master = distribution_strategy.extended._tpu_cluster_resolver.master()  # pylint: disable=protected-access
-    session = session_module.Session(config=session_config, target=master)
-  else:
-    worker_context = dc_context.get_current_worker_context()
-    if worker_context:
-      dc_session_config = worker_context.session_config
-      # Merge the default session config to the one from distribute coordinator,
-      # which is fine for now since they don't have conflicting configurations.
-      dc_session_config.MergeFrom(session_config)
-      session = session_module.Session(
-          config=dc_session_config, target=worker_context.master_target)
-    else:
-      distribution_strategy.configure(session_config)
-      session = session_module.Session(config=session_config)
-
-  K.set_session(session)
-
-
-def validate_inputs(x, y, distribution_strategy):
+def validate_inputs(x, y, distribution_strategy, allow_partial_batch=False):
   """Validate inputs when using DistributionStrategy.
 
   Args:
@@ -388,16 +359,13 @@ def validate_inputs(x, y, distribution_strategy):
     y: Model Targets.
     distribution_strategy: The DistributionStrategy with which the model is
       compiled.
+    allow_partial_batch: Boolean. If false, datasets must have fully
+      defined shapes.
 
   Raises:
     ValueError: if input is not a Dataset or a numpy array(when we use
       MirroredStrategy).
   """
-  if isinstance(x, dict) or isinstance(y, dict):
-    raise ValueError('`DistributionStrategy` does not support inputs of type '
-                     'dict. You must pass a `tf.data.Dataset` object or a '
-                     'numpy array as input.')
-
   if (isinstance(x, iterator_ops.Iterator) or
       isinstance(y, iterator_ops.Iterator)):
     raise ValueError('`DistributionStrategy` does not support inputs of type '
@@ -406,18 +374,13 @@ def validate_inputs(x, y, distribution_strategy):
 
   if is_tpu_strategy(distribution_strategy):
     for i in [x, y]:
-      if isinstance(i, dataset_ops.DatasetV2):
-        shapes = nest.flatten(i.output_shapes)
-        try:
-          s = next(s for s in shapes if not s.is_fully_defined())
-        except StopIteration:
-          continue
-        else:
+      if (isinstance(i, dataset_ops.DatasetV2) and not allow_partial_batch):
+        if not is_dataset_shape_fully_defined(i):
           raise ValueError(
               'Using TPUs currently requires fully defined shapes. Either use '
               'set_shape() on the input tensors or use '
               'dataset.batch(..., drop_remainder=True).'
-              'Found unknown shape {} in input {}.'.format(s, i))
+              'Found unknown shape in input {}.'.format(i))
 
 
 # TODO(b/118776054): Currently we support global batch size for TPUStrategy and
@@ -433,8 +396,15 @@ def is_tpu_strategy(strategy):
   return strategy is not None and strategy.__class__.__name__ == 'TPUStrategy'
 
 
+def is_dataset_shape_fully_defined(dataset):
+  """Returns whether a dataset contains a final partial batch."""
+  shapes = nest.flatten(dataset.output_shapes)
+  unknown_shapes = [s for s in shapes if not s.is_fully_defined()]
+  return not unknown_shapes
+
+
 def get_input_params(distribution_strategy, first_x_value, steps, batch_size,
-                     is_training=False):
+                     mode=None):
   """Calculate the number of batches and steps/steps_per_epoch.
 
   Args:
@@ -443,8 +413,10 @@ def get_input_params(distribution_strategy, first_x_value, steps, batch_size,
       model input.
     steps:  The specified number of steps.
     batch_size: The specified batch_size.
-    is_training: Boolean to relax the constraints on consuming all the training
-      samples to keep compatibility till we support partial batches.
+    mode: ModeKey representing whether input will be used for training,
+      evaluation, or prediction. This is used to relax the constraints on
+      consuming all the training samples to keep compatibility till we
+      support partial batches. If none, then partial batches are not allowed.
 
   Returns:
     steps: The steps or steps_per_epoch argument depending on if a user is
@@ -462,6 +434,14 @@ def get_input_params(distribution_strategy, first_x_value, steps, batch_size,
   use_per_replica_batch = not global_batch_size_supported(
       distribution_strategy)
 
+  # Partial batches are allowed for training as we repeat the
+  # dataset when converting numpy arrays into a dataset.
+  # For other modes uneven batch sizes are not allowed except
+  # for `predict()` on TPUStrategy.
+  allow_partial_batch = (mode == ModeKeys.TRAIN or
+                         (mode == ModeKeys.PREDICT
+                          and is_tpu_strategy(distribution_strategy)))
+
   if steps is None:
     if batch_size is None:
       # If neither the batch size or number of steps are set. We choose the
@@ -474,7 +454,7 @@ def get_input_params(distribution_strategy, first_x_value, steps, batch_size,
       global_batch_size = batch_size
       if use_per_replica_batch:
         global_batch_size *= distribution_strategy.num_replicas_in_sync
-    if not is_training and num_samples % global_batch_size:
+    if not allow_partial_batch and num_samples % global_batch_size:
       raise ValueError('The number of samples %s is not divisible by '
                        'batch size %s.' % (num_samples, global_batch_size))
     steps = num_samples // global_batch_size
@@ -494,7 +474,11 @@ def get_input_params(distribution_strategy, first_x_value, steps, batch_size,
       if use_per_replica_batch:
         global_batch_size *= distribution_strategy.num_replicas_in_sync
 
-      if num_samples < (global_batch_size * steps):
+      min_num_samples = global_batch_size * steps
+      if allow_partial_batch:
+        min_num_samples = global_batch_size * (steps-1) + 1 if steps > 1 else 0
+
+      if num_samples < min_num_samples:
         raise ValueError('Number of samples %s is less than samples required '
                          'for specified batch_size %s and steps %s' % (
                              num_samples, global_batch_size, steps))
@@ -521,97 +505,436 @@ def get_batch_dimension(iterator):
   return dims[0] if dims else None
 
 
-def get_cpu_device(distribution_strategy):
-  """Returns the CPU device of the TPU host or the default CPU device string.
+def list_to_tuple(maybe_list):
+  """Datasets treat lists specially, so switch them to tuples."""
+  if isinstance(maybe_list, list):
+    return tuple(maybe_list)
+  return maybe_list
 
-  Args:
-    distribution_strategy: The DistributionStrategy used to compile the model.
 
-  Returns:
-    A device string which is the TPU host's CPU device in case of
-    TPUDistributionStrategy or the default CPU device string in all other
-    cases.
+def get_iterator(dataset, distribution_strategy):
+  with distribution_strategy.scope():
+    iterator = distribution_strategy.make_dataset_iterator(dataset)
+  initialize_iterator(iterator, distribution_strategy)
+  return iterator
 
-  Raises:
-    NotImplementedError: We currently don't support copying numpy data to
-    multiple hosts in the case of Cloud TPU pods.
-  """
-  if is_tpu_strategy(distribution_strategy):
-    if distribution_strategy.extended.num_hosts > 1:
-      raise NotImplementedError('TPUDistributionStrategy does not '
-                                'support numpy inputs when running on Cloud'
-                                'TPU pods.')
-    return distribution_strategy.extended.get_host_cpu_device(0)
+
+def initialize_iterator(iterator, distribution_strategy):
+  with distribution_strategy.scope():
+    init_op = control_flow_ops.group(iterator.initialize())
+    if not context.executing_eagerly():
+      K.get_session((init_op,)).run(init_op)
+
+
+def _get_input_from_iterator(iterator, model):
+  """Get elements from the iterator and verify the input shape and type."""
+  next_element = iterator.get_next()
+
+  if len(nest.flatten(next_element)) == len(model.inputs):
+    x = next_element
+    y = None
+    sample_weights = None
+  elif len(nest.flatten(next_element)) == (len(model.inputs) +
+                                           len(model.outputs)):
+    x, y = next_element
+    sample_weights = None
   else:
-    # For all strategies except TPUDistributionStrategy
-    # TODO(anjalisridhar): We may need to modify this when we add support for
-    # multi-worker strategy.
-    return '/CPU:0'
+    x, y, sample_weights = next_element
+
+  # Validate that all the elements in x and y are of the same type and shape.
+  validate_distributed_dataset_inputs(
+      model._distribution_strategy, x, y, sample_weights)
+  return x, y, sample_weights
+
 
+def _prepare_feed_values(model, inputs, targets, sample_weights, mode):
+  """Prepare feed values to the model execution function.
 
-def get_var_for_numpy(distribution_strategy, x):
-  if isinstance(x, list):
-    var_x = tuple([_get_var_for_numpy(distribution_strategy, single_input)
-                   for single_input in x])
+  Arguments:
+    model: Model to prepare feed values for.
+    inputs: List or dict of model inputs.
+    targets: Optional list of model targets.
+    sample_weights: Optional list of sample weight arrays.
+    mode: One of ModeKeys.TRAIN/ModeKeys.TEST/ModeKeys.PREDICT.
+
+  Returns:
+    Feed values for the model in the given mode.
+  """
+  strategy = model._distribution_strategy
+  inputs, targets, sample_weights = _get_input_from_iterator(inputs, model)
+  inputs = flatten_perdevice_values(strategy, inputs)
+  targets = flatten_perdevice_values(strategy, targets)
+  # Expand 1-dimensional inputs.
+  # TODO(b/124535720): Remove once this standarize data logic is shared with
+  # main flow.
+  inputs, targets = nest.map_structure(training_utils.standardize_single_array,
+                                       (inputs, targets))
+  if mode == ModeKeys.PREDICT:
+    sample_weights = []
+    targets = []
   else:
-    var_x = _get_var_for_numpy(distribution_strategy, x)
-  return var_x
+    sample_weights = [
+        None for _ in range(len(model.outputs) * strategy.num_replicas_in_sync)
+    ]
+  ins = inputs + targets + sample_weights
+  if mode == ModeKeys.TRAIN and not isinstance(K.symbolic_learning_phase(),
+                                               int):
+    ins += [True]
+  return ins
+
+
+def _custom_compile_for_predict(model):
+  """Custom compile for TPU predict mode."""
+  if not model.built:
+    # Model is not compilable because it does not know its number of inputs
+    # and outputs, nor their shapes and names. We will compile after the first
+    # time the model gets called on training data.
+    return
+  model._is_compiled = True
+  model.total_loss = None
+  model._fit_function = None
+  model._eval_function = None
+  model.train_function = None
+  model.test_function = None
+  model.predict_function = None
+
+
+def _build_network_on_replica(model, mode, inputs=None, targets=None):
+  """Build an updated model on replicas.
+
+  We create a new Keras model while sharing the variables from the old graph.
+  Building a new sub-graph is required since the original keras model creates
+  placeholders for the input and the output that are not accessible till we
+  call iterator.get_next() inside the step_fn for `fit`/`evaluate`/`predict`.
 
+  The sharing of weights and layers between the old and the new model gaurantee
+  that we're using Strategy variables and any updates on either model are
+  reflected correctly in callbacks and loop iterations.
 
-def _get_var_for_numpy(distribution_strategy, input_array):
-  """Creates a variable and assigns the value of the numpy array to it.
+  We need to make sure we share the optimizers between the old and the new model
+  as well so that optimizer state is not lost if the user is running fit
+  multiple times.
 
   Args:
-    distribution_strategy: The DistributionStrategy used to compile the model.
-    input_array: The input numpy array whose value will be assigned to the
-      variable we create.
+    model: Model to be replicated across Replicas
+    mode: Which of fit/eval/predict is building the distributed network
+    inputs: Input variables to be passed to the model
+    targets: Target tensor to be passed to model.compile
 
   Returns:
-    The variable to which we will copy the value of the input numpy array.
+    A new model with shared layers with the old model.
+  """
+  # Need to do imports here since we run into a circular dependency error.
+  from tensorflow.python.keras import models  # pylint: disable=g-import-not-at-top
+  from tensorflow.python.keras.engine import sequential  # pylint: disable=g-import-not-at-top
+
+  # We rely on the internal methods to avoid having share_weights weights in the
+  # public API.
+  if isinstance(model, sequential.Sequential):
+    updated_model = models._clone_sequential_model(model, input_tensors=inputs,
+                                                   share_weights=True)
+  else:
+    updated_model = models._clone_functional_model(model, input_tensors=inputs,
+                                                   share_weights=True)
+
+  # Recast all low precision outputs back to float32 since we only casted
+  # the inputs to bfloat16 and not targets. This is done so that we can preserve
+  # precision when calculating the loss value.
+  def _upcast_low_precision_outputs(output):
+    if output.dtype == dtypes.bfloat16:
+      return math_ops.cast(output, dtypes.float32)
+    else:
+      return output
+  updated_model.outputs = [_upcast_low_precision_outputs(o)
+                           for o in updated_model.outputs]
+
+  if isinstance(targets, tuple):
+    targets = nest.flatten(targets)
 
+  if mode == ModeKeys.PREDICT and inputs is not None:  # TPU predict case
+    _custom_compile_for_predict(updated_model)
+  else:
+    updated_model.compile(
+        model.optimizer,
+        model.loss,
+        metrics=metrics_module.clone_metrics(model._compile_metrics),
+        loss_weights=model.loss_weights,
+        sample_weight_mode=model.sample_weight_mode,
+        weighted_metrics=metrics_module.clone_metrics(
+            model._compile_weighted_metrics),
+        target_tensors=targets)
+  return updated_model
+
+
+def _build_distributed_network(model, strategy, mode, inputs=None,
+                               targets=None):
+  """Create a cloned model on each replica."""
+  with K.get_graph().as_default(), strategy.scope():
+    distributed_model = strategy.extended.call_for_each_replica(
+        _build_network_on_replica,
+        args=(model, mode, inputs, targets))
+    set_distributed_model(model, mode, distributed_model)
+
+
+def _clone_and_build_model(model, mode, inputs=None, targets=None):
+  """Clone and build the given keras_model."""
+  # We need to set the import here since we run into a circular dependency
+  # error.
+  from tensorflow.python.keras import models  # pylint: disable=g-import-not-at-top
+  cloned_model = models.clone_model(model, input_tensors=inputs)
+
+  # Compile and build model.
+  if isinstance(model.optimizer, optimizers.TFOptimizer):
+    optimizer = model.optimizer
+  else:
+    optimizer_config = model.optimizer.get_config()
+    optimizer = model.optimizer.__class__.from_config(optimizer_config)
+
+  # Recast all low precision outputs back to float32 since we only casted
+  # the inputs to bfloat16 and not targets. This is done so that we can preserve
+  # precision when calculating the loss value.
+  def _upcast_low_precision_outputs(output):
+    if output.dtype == dtypes.bfloat16:
+      return math_ops.cast(output, dtypes.float32)
+    else:
+      return output
+  cloned_model.outputs = [_upcast_low_precision_outputs(o)
+                          for o in cloned_model.outputs]
+
+  if isinstance(targets, tuple):
+    targets = nest.flatten(targets)
+  if mode == ModeKeys.PREDICT and inputs is not None:  # TPU predict case
+    _custom_compile_for_predict(cloned_model)
+  else:
+    cloned_model.compile(
+        optimizer,
+        model.loss,
+        metrics=metrics_module.clone_metrics(model._compile_metrics),
+        loss_weights=model.loss_weights,
+        sample_weight_mode=model.sample_weight_mode,
+        weighted_metrics=metrics_module.clone_metrics(
+            model._compile_weighted_metrics),
+        target_tensors=targets)
+  return cloned_model
+
+
+def clone_model_on_replicas(model, strategy, mode, inputs=None, targets=None):
+  """Create a cloned model on each replica."""
+  with K.get_graph().as_default(), strategy.scope():
+    distributed_model = strategy.extended.call_for_each_replica(
+        _clone_and_build_model, args=(model, mode, inputs, targets))
+    set_distributed_model(model, mode, distributed_model)
+  if mode == ModeKeys.TRAIN:
+    model._make_callback_model(distributed_model)
+
+
+def _make_execution_function(model, mode):
+  """Makes or reuses function to run one step of distributed model execution."""
+  strategy = model._distribution_strategy
+
+  distributed_model = get_distributed_model(model, mode)
+  # If distributed model for a particular `mode` is already built, use the
+  # `_distribution_function` on that distributed model.
+  if distributed_model:
+    return distributed_model._distributed_function
+
+  # If distributed_model is not built, create one for `mode`.
+  if model._compile_distribution:
+    clone_model_on_replicas(model, strategy, mode)
+  else:
+    _build_distributed_network(model, strategy, mode)
+
+  # We've just created the distributed model. So `distributed_model` should be
+  # not None.
+  distributed_model = get_distributed_model(model, mode)
+  assert distributed_model
+
+  # Also create an execution fuction on that distributed model.
+  if context.executing_eagerly():
+    distributed_function = _make_eager_execution_function(model, mode)
+  else:
+    distributed_function = _make_graph_execution_function(model, mode)
+
+  # We cache the distributed execution function on the model since creating
+  # distributed models and exection functions are expensive.
+  distributed_model._distributed_function = distributed_function
+  return distributed_function
+
+
+def _make_graph_execution_function(model, mode):
+  """Makes function to run one step of distributed model in graph mode."""
+
+  def _per_device_function(model):
+    f = model._make_execution_function(mode)
+    return (f.inputs, f.outputs, f.updates_op, f.session_kwargs)
+
+  strategy = model._distribution_strategy
+  with strategy.scope():
+    # Create train ops on each of the devices when we call
+    # `_per_device_fit_function`.
+    (grouped_inputs, grouped_outputs, grouped_updates,
+     grouped_session_args) = strategy.extended.call_for_each_replica(
+         _per_device_function, args=(get_distributed_model(model, mode),))
+
+    # Initialize the variables in the replicated model. This is necessary for
+    # multi-worker training because on some workers, initialization is not
+    # needed. This method does initialization or waiting for initialization
+    # according to the context object of distribute coordinator.
+    init_restore_or_wait_for_variables()
+
+    # Unwrap all the per device values returned from `call_for_each_replica`.
+    # Unwrapping per device values gives you a list of values that can be
+    # used to construct a new train function that is composed of update ops on
+    # all the devices over which the model is distributed.
+    (all_inputs, all_outputs, all_updates, all_session_args) = unwrap_values(
+        strategy,
+        grouped_inputs,
+        grouped_outputs,
+        grouped_updates,
+        grouped_session_args,
+        with_loss_tensor=(mode != ModeKeys.PREDICT))
+
+    return K.function(
+        all_inputs,
+        all_outputs,
+        updates=all_updates,
+        name='distributed_{}_function'.format(mode),
+        **all_session_args)
+
+
+def _make_eager_execution_function(model, mode):
+  """Makes function to run one step of distributed model eager execution."""
+  def _per_device_function(model):
+    f = model._make_execution_function(mode)
+    return (f.inputs, f.outputs)
+
+  # NOTE(priyag): Try creating a new FuncGraph within DS scope instead of using
+  # the global one.
+  strategy = model._distribution_strategy
+  global_graph = K.get_graph()
+
+  with global_graph.as_default(), strategy.scope():
+    # First we gather the relevant portions of the model across all replicas.
+    # `K._scratch_graph(global_graph)` signals to Keras that it should not
+    # lift to a separate graph when creating the per-replica functions.
+    with K._scratch_graph(global_graph):
+      # Create train ops on each of the devices when we call
+      # `_per_device_fit_function`.
+      grouped = strategy.extended.call_for_each_replica(
+          _per_device_function, args=(get_distributed_model(model, mode),))
+      grouped_inputs, grouped_outputs = grouped
+
+      # Unwrap all the per device values returned from `call_for_each_replica`.
+      # Unwrapping per device values gives you a list of values that can be
+      # used to construct a new train function that is composed of
+      # inputs/outputs on all the devices over which the model is distributed.
+      (all_inputs, all_outputs, _, _) = unwrap_values(
+          strategy,
+          grouped_inputs,
+          grouped_outputs,
+          with_loss_tensor=(mode != ModeKeys.PREDICT))
+
+    # Finally, a joint Keras function is created; this one will be created in
+    # a separate FuncGraph.
+    return K.function(
+        all_inputs,
+        all_outputs,
+        name='eager_distributed_{}_function'.format(mode))
+
+
+def _copy_weights_to_distributed_model(original_model, mode):
+  """Copies weights from original model to distributed models."""
+  strategy = original_model._distribution_strategy
+  distributed_model = get_distributed_model(original_model, mode)
+  if strategy:
+    # Copy the weights from the original model to each of the replicated
+    # models.
+    orig_model_weights = original_model.get_weights()
+    first_model = strategy.unwrap(distributed_model)[0]
+    set_weights(strategy, first_model, orig_model_weights)
+
+
+def _copy_weights_to_original_model(model, mode):
+  """Copies weights from first distributed model back to original model."""
+  if model._distribution_strategy and mode == ModeKeys.TRAIN:
+    distributed_model = get_distributed_model(model, mode)
+    updated_weights = model._distribution_strategy.unwrap(
+        distributed_model)[0].get_weights()
+    model.set_weights(updated_weights)
+
+
+def _per_device_aggregate_batch(batch_outs, model, mode):
+  """Aggregates the per-device batch-level outputs from a distributed step."""
+  if model._distribution_strategy is not None and mode == ModeKeys.PREDICT:
+    total_batch_outs = []
+    for i in range(len(model.outputs)):
+      num_replicas = model._distribution_strategy.num_replicas_in_sync
+      nested_outs = batch_outs[i * num_replicas:i * num_replicas + num_replicas]
+      total_batch_outs.append(np.concatenate(nest.flatten(nested_outs)))
+    return total_batch_outs
+  return batch_outs
+
+
+def _reset_metrics(model):
+  if model._distribution_strategy:
+    for mode in [ModeKeys.TRAIN, ModeKeys.TEST, ModeKeys.PREDICT]:
+      distributed_model = get_distributed_model(model, mode)
+      if distributed_model:
+        first_model = model._distribution_strategy.unwrap(distributed_model)[0]
+        first_model.reset_metrics()
+
+
+def get_distributed_model(model, mode):
+  key = _generate_cache_key(mode)
+  return model._distributed_model_cache.get(key, None)
+
+
+def set_distributed_model(model, mode, distributed_model):
+  key = _generate_cache_key(mode)
+  model._distributed_model_cache[key] = distributed_model
+
+
+def _generate_cache_key(mode):
+  key = hash(mode)
+  return key
+
+
+@tf_contextlib.contextmanager
+def distributed_scope(strategy, learning_phase):
+  with strategy.scope(), K.learning_phase_scope(learning_phase):
+    yield
+
+
+def filter_distributed_callbacks(callbacks_list):
+  """Filter Callbacks based on the worker context when running multi-worker.
+
+  Arguments:
+    callbacks_list: A list of `Callback` instances.
+
+  Returns:
+    The list of `Callback` instances that should be run on this worker.
   """
-  with ops.device(get_cpu_device(distribution_strategy)):
-    # Create and initialize a variable on the CPU device. This is the CPU
-    # device of the host in the case of TPUDistributionStrategy.
-    input_var = variables.VariableV1(array_ops.zeros(input_array.shape,
-                                                     input_array.dtype),
-                                     trainable=False, use_resource=True)
-  K.get_session().run(input_var.initializer)
-
-  # Create a placeholder for the numpy array input slices. We copy the value
-  # of the input numpy array to the variable in slices of size 64 MB to avoid
-  # running into memory issues or RPC message limits.
-  start_placeholder = array_ops.placeholder(dtypes.int64, ())
-  end_placeholder = array_ops.placeholder(dtypes.int64, ())
-  slice_placeholder = array_ops.placeholder(input_var.dtype)
-  assign_slice_op = input_var[start_placeholder:end_placeholder].assign(
-      slice_placeholder)
-
-  # If each batch element is > 64 MB, then we copy each batch element
-  # individually. Otherwise, the slices will be < 128 MB. There might be padding
-  # which might mean that the slices are 128 MB even if the size of the
-  # tensor allocated is less than 128 MB.
-  # This formula gives slices with size:
-  # ceil(64 MB / byte size per batch element) bytes.
-  # Using ceil() guarantees we get a number >= 1.
-
-  # Calculate the size of each batch element.
-  byte_size_per_batch_element = np.prod(input_array.shape[1:]) * \
-                                input_var.dtype.size
-
-  # Calculate number of elements we want to copy per slice.
-  batch_size_per_slice = int(np.ceil((64 << 20) / byte_size_per_batch_element))
-
-  # Copy slices of the above size starting at 0, except the last slice will be
-  # smaller.
-  start = 0
-  limit = input_array.shape[0]
-  while start < limit:
-    end = min(start + batch_size_per_slice, limit)
-    K.get_session().run(assign_slice_op, feed_dict={
-        start_placeholder: start,
-        end_placeholder: end,
-        slice_placeholder: input_array[start:end]})
-    start = end
-
-  return input_var
+
+  if not K.in_multi_worker_mode():
+    raise ValueError(
+        'filter_distributed_callbacks() should only be called when Keras '
+        'is in multi worker mode.')
+
+  worker_context = dc_context.get_current_worker_context()
+  callbacks_list = callbacks_list or []
+  if not [
+      c for c in callbacks_list if isinstance(c, callbacks.ModelCheckpoint)
+  ]:
+    # TODO(rchao): Consider providing a ModelCheckpoint here if the user
+    # fails to.
+    logging.warning('ModelCheckpoint callback is not provided. '
+                    'Workers will need to restart training if any fails.')
+  # TODO(rchao): Add similar warning for restoring callback (to be designed).
+
+  if callbacks_list is None or worker_context.is_chief:
+    return callbacks_list
+
+  # Some Callbacks should only run on the chief worker.
+  return [
+      callback for callback in callbacks_list if not callback._chief_worker_only
+  ]  # pylint: disable=protected-access
diff --git a/tensorflow/python/keras/engine/feature_columns_integration_test.py b/tensorflow/python/keras/engine/feature_columns_integration_test.py
index b3f8cfe72585188d631c072b690729054d5db775..3bed40b08462f6907f7dbf41a90dd1503baf6a81 100644
--- a/tensorflow/python/keras/engine/feature_columns_integration_test.py
+++ b/tensorflow/python/keras/engine/feature_columns_integration_test.py
@@ -27,7 +27,6 @@ from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
-from tensorflow.python.training import rmsprop
 
 
 class TestDNNModel(keras.models.Model):
@@ -57,7 +56,7 @@ class FeatureColumnsIntegrationTest(keras_parameterized.TestCase):
         keras.layers.Dense(20, activation='softmax')
     ])
     model.compile(
-        optimizer=rmsprop.RMSPropOptimizer(1e-3),
+        optimizer='rmsprop',
         loss='categorical_crossentropy',
         metrics=['accuracy'],
         run_eagerly=testing_utils.should_run_eagerly())
@@ -79,7 +78,7 @@ class FeatureColumnsIntegrationTest(keras_parameterized.TestCase):
         keras.layers.Dense(20, activation='softmax')
     ])
     model.compile(
-        optimizer=rmsprop.RMSPropOptimizer(1e-3),
+        optimizer='rmsprop',
         loss='categorical_crossentropy',
         metrics=['accuracy'],
         run_eagerly=testing_utils.should_run_eagerly())
@@ -103,7 +102,7 @@ class FeatureColumnsIntegrationTest(keras_parameterized.TestCase):
     dnn_model = TestDNNModel([col_a, col_b], 20)
 
     dnn_model.compile(
-        optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
+        optimizer='rmsprop',
         loss='categorical_crossentropy',
         metrics=['accuracy'],
         run_eagerly=testing_utils.should_run_eagerly())
@@ -124,7 +123,7 @@ class FeatureColumnsIntegrationTest(keras_parameterized.TestCase):
     dnn_model = TestDNNModel([col_a, col_b], 20)
 
     dnn_model.compile(
-        optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
+        optimizer='rmsprop',
         loss='categorical_crossentropy',
         metrics=['accuracy'],
         run_eagerly=testing_utils.should_run_eagerly())
@@ -155,7 +154,7 @@ class FeatureColumnsIntegrationTest(keras_parameterized.TestCase):
 
     model = keras.models.Model([feature_layer], [output])
 
-    optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
+    optimizer = 'rmsprop'
     loss = 'mse'
     loss_weights = [1., 0.5]
     model.compile(
@@ -184,7 +183,7 @@ class FeatureColumnsIntegrationTest(keras_parameterized.TestCase):
 
     model = keras.models.Model([fc1, fc2], [output])
 
-    optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
+    optimizer = 'rmsprop'
     loss = 'mse'
     loss_weights = [1., 0.5]
     model.compile(
diff --git a/tensorflow/python/keras/engine/input_layer.py b/tensorflow/python/keras/engine/input_layer.py
index 9874efe2bccd5e2db370ed54089424063afe88b5..32fbbea8a162aaac592519739d3d8bb3ecbce57f 100644
--- a/tensorflow/python/keras/engine/input_layer.py
+++ b/tensorflow/python/keras/engine/input_layer.py
@@ -23,10 +23,10 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras import backend
 from tensorflow.python.keras.engine import base_layer
 from tensorflow.python.keras.utils import tf_utils
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 
-@tf_export('keras.layers.InputLayer')
+@keras_export('keras.layers.InputLayer')
 class InputLayer(base_layer.Layer):
   """Layer to be used as an entry point into a Network (a graph of layers).
 
@@ -77,6 +77,9 @@ class InputLayer(base_layer.Layer):
         dtype = backend.floatx()
       else:
         dtype = backend.dtype(input_tensor)
+    elif input_tensor is not None and input_tensor.dtype != dtype:
+      raise ValueError('`input_tensor.dtype` differs from `dtype`: %s vs. %s' %
+                       (input_tensor.dtype, dtype))
     super(InputLayer, self).__init__(dtype=dtype, name=name)
     self.built = True
     self.sparse = sparse
@@ -120,6 +123,7 @@ class InputLayer(base_layer.Layer):
     # Create an input node to add to self.outbound_node
     # and set output_tensors' _keras_history.
     input_tensor._keras_history = (self, 0, 0)  # pylint: disable=protected-access
+    input_tensor._keras_mask = None
     base_layer.Node(
         self,
         inbound_layers=[],
@@ -138,7 +142,7 @@ class InputLayer(base_layer.Layer):
     return config
 
 
-@tf_export('keras.layers.Input', 'keras.Input')
+@keras_export('keras.layers.Input', 'keras.Input')
 def Input(  # pylint: disable=invalid-name
     shape=None,
     batch_size=None,
@@ -215,8 +219,6 @@ def Input(  # pylint: disable=invalid-name
   if kwargs:
     raise ValueError('Unrecognized keyword arguments:', kwargs.keys())
 
-  if dtype is None:
-    dtype = backend.floatx()
   if shape is None and tensor is None:
     raise ValueError('Please provide to Input either a `shape`'
                      ' or a `tensor` argument. Note that '
diff --git a/tensorflow/python/keras/engine/input_spec.py b/tensorflow/python/keras/engine/input_spec.py
index 7277c16fe51197af3bf0e045814ccc29f7feaf7c..b0d11573640b559b174b4ddab74ed7486c9b9d38 100644
--- a/tensorflow/python/keras/engine/input_spec.py
+++ b/tensorflow/python/keras/engine/input_spec.py
@@ -21,11 +21,12 @@ from __future__ import print_function
 from six.moves import zip  # pylint: disable=redefined-builtin
 
 from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import keras_export
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('keras.layers.InputSpec',
-           v1=['keras.layers.InputSpec', 'layers.InputSpec'])
+@keras_export('keras.layers.InputSpec', v1=['keras.layers.InputSpec'])
+@tf_export(v1=['layers.InputSpec'])
 class InputSpec(object):
   """Specifies the ndim, dtype and shape of every input to a layer.
 
diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/network.py
index 7e6cc7bfeef97f9ad567aed82757a0a18e8c06be..a6fdfad833d17ef2f49c707aab4b35215919647b 100644
--- a/tensorflow/python/keras/engine/network.py
+++ b/tensorflow/python/keras/engine/network.py
@@ -22,7 +22,6 @@ from __future__ import print_function
 import copy
 import json
 import os
-import weakref
 
 import numpy as np
 from six.moves import zip  # pylint: disable=redefined-builtin
@@ -37,19 +36,19 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras import backend
 from tensorflow.python.keras.engine import base_layer
 from tensorflow.python.keras.engine import base_layer_utils
-from tensorflow.python.keras.engine import saving
 from tensorflow.python.keras.engine import training_utils
+from tensorflow.python.keras.saving import hdf5_format
 from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.keras.utils import layer_utils
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.keras.utils.io_utils import ask_to_proceed_with_overwrite
-from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_management
-from tensorflow.python.training.checkpointable import base as checkpointable
-from tensorflow.python.training.checkpointable import data_structures
-from tensorflow.python.training.checkpointable import layer_utils as checkpointable_layer_utils
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import base as trackable
+from tensorflow.python.training.tracking import data_structures
+from tensorflow.python.training.tracking import layer_utils as trackable_layer_utils
+from tensorflow.python.training.tracking import util as trackable_utils
+from tensorflow.python.util import nest
 from tensorflow.python.util import tf_inspect
 
 
@@ -69,8 +68,64 @@ except ImportError:
 class Network(base_layer.Layer):
   """A `Network` is a composition of layers.
 
-  It is the topological form of a "model". A `Model`
+  `Network` is the topological form of a "model". A `Model`
   is simply a `Network` with added training routines.
+
+  Two types of `Networks` exist: Graph Networks and Subclass Networks. Graph
+  networks are used in the Keras Functional and Sequential APIs. Subclassed
+  networks are used when a user subclasses the `Model` class. In general,
+  more Keras features are supported with Graph Networks than with Subclassed
+  Networks, specifically:
+
+  - Model cloning (`keras.models.clone`)
+  - Serialization (`model.get_config()/from_config`, `model.to_json()/to_yaml()`
+  - Whole-model saving (`model.save()`)
+
+  A Graph Network can be instantiated by passing two arguments to `__init__`.
+  The first argument is the `keras.Input` Tensors that represent the inputs
+  to the Network. The second argument specifies the output Tensors that
+  represent the outputs of this Network. Both arguments can be a nested
+  structure of Tensors.
+
+  Example:
+
+  ```
+  inputs = {'x1': keras.Input(shape=(10,)), 'x2': keras.Input(shape=(1,))}
+  t = keras.layers.Dense(1, activation='relu')(inputs['x1'])
+  outputs = keras.layers.Add()([t, inputs['x2'])
+  network = Network(inputs, outputs)
+  ```
+
+  A Graph Network constructed using the Functional API can also include raw
+  TensorFlow functions, with the exception of functions that create Variables
+  or assign ops.
+
+  Example:
+
+  ```
+  inputs = keras.Input(shape=(10,))
+  x = keras.layers.Dense(1)(inputs)
+  outputs = tf.nn.relu(x)
+  network = Network(inputs, outputs)
+  ```
+
+  Subclassed Networks can be instantiated via `name` and (optional) `dynamic`
+  keyword arguments. Subclassed Networks keep track of their Layers, and their
+  `call` method can be overridden. Subclassed Networks are typically created
+  indirectly, by subclassing the `Model` class.
+
+  Example:
+
+  ```
+  class MyModel(keras.Model):
+    def __init__(self):
+      super(MyModel, self).__init__(name='my_model', dynamic=False)
+
+      self.layer1 = keras.layers.Dense(10, activation='relu')
+
+    def call(self, inputs):
+      return self.layer1(inputs)
+  ```
   """
 
   def __init__(self, *args, **kwargs):  # pylint: disable=super-init-not-called
@@ -97,7 +152,7 @@ class Network(base_layer.Layer):
   # empty lists shouldn't cause issues; adding or removing them will not break
   # checkpoints, but may cause "all Python objects matched" assertions to fail
   # (in which case less strict assertions may be substituted if necessary).
-  @checkpointable.no_automatic_dependency_tracking
+  @trackable.no_automatic_dependency_tracking
   def _base_init(self, name=None):
     # The following are implemented as property functions:
     # self.trainable_weights
@@ -114,15 +169,8 @@ class Network(base_layer.Layer):
     self.trainable = True
     self._is_compiled = False
     self._expects_training_arg = False
-    # In many internal cases one needs to compute both the model's output
-    # and its output mask without relying on `__call__` (which would do both and
-    # set mask metadata), but for models, computing the mask requires to
-    # recompute the output.
-    # Hence the pattern `output = model.call(); mask = model.compute_mask()`
-    # would be redundant, and internal logic
-    # (susceptible to use `call` directly) should prefer using the
-    # internal method `output, mask = _call_and_compute_mask()`.
-    # This is True for Sequential networks and graph networks.
+
+    # This is True for Sequential networks and Functional networks.
     self._compute_output_and_mask_jointly = False
 
     self.supports_masking = False
@@ -143,7 +191,6 @@ class Network(base_layer.Layer):
     self._metrics_tensors = {}
     self._scope = None  # Never used.
     self._reuse = None  # Never used.
-    self._call_is_graph_friendly = True
     if context.executing_eagerly():
       self._graph = None
     else:
@@ -159,25 +206,29 @@ class Network(base_layer.Layer):
     self._outbound_nodes = []
     self._inbound_nodes = []
 
-    self._checkpointable_saver = checkpointable_utils.CheckpointableSaver(
-        weakref.ref(self))
+    self._trackable_saver = (
+        trackable_utils.saver_with_op_caching(self))
 
-  @checkpointable.no_automatic_dependency_tracking
+  @trackable.no_automatic_dependency_tracking
   def _init_graph_network(self, inputs, outputs, name=None):
     self._call_convention = (base_layer_utils
                              .CallConvention.EXPLICIT_INPUTS_ARGUMENT)
     # Normalize and set self.inputs, self.outputs.
-    if isinstance(inputs, (list, tuple)):
-      self.inputs = list(inputs)  # Tensor or list of tensors.
-    else:
-      self.inputs = [inputs]
-    if isinstance(outputs, (list, tuple)):
-      self.outputs = list(outputs)
-    else:
-      self.outputs = [outputs]
-    self._validate_graph_inputs_and_outputs()
+    if isinstance(inputs, list) and len(nest.flatten(inputs)) == 1:
+      inputs = inputs[0]
+    if isinstance(outputs, list) and len(nest.flatten(outputs)) == 1:
+      outputs = outputs[0]
+    self._nested_outputs = outputs
+    self._nested_inputs = inputs
+    self.inputs = nest.flatten(inputs)
+    self.outputs = nest.flatten(outputs)
+
+    if any(not hasattr(tensor, '_keras_history') for tensor in self.outputs):
+      base_layer_utils.create_keras_history(self._nested_outputs)
 
     self._base_init(name=name)
+    self._validate_graph_inputs_and_outputs()
+
     self._compute_previous_mask = (
         'mask' in tf_inspect.getfullargspec(self.call).args or
         hasattr(self, 'compute_mask'))
@@ -186,6 +237,10 @@ class Network(base_layer.Layer):
     self.built = True
     self._compute_output_and_mask_jointly = True
     self._is_graph_network = True
+    self._dynamic = False
+    # `_expects_training_arg` is True since the `training` argument is always
+    # present in the signature of the `call` method of a graph network.
+    self._expects_training_arg = True
 
     self._input_layers = []
     self._output_layers = []
@@ -224,6 +279,9 @@ class Network(base_layer.Layer):
     self._nodes_by_depth = nodes_by_depth
     self._layers = layers
     self._layers_by_depth = layers_by_depth
+    self._layer_call_argspecs = {}
+    for layer in self._layers:
+      self._layer_call_argspecs[layer] = tf_inspect.getfullargspec(layer.call)
 
     self._track_layers(layers)
 
@@ -233,8 +291,8 @@ class Network(base_layer.Layer):
         inbound_layers=[],
         node_indices=[],
         tensor_indices=[],
-        input_tensors=self.inputs,
-        output_tensors=self.outputs)
+        input_tensors=self._nested_inputs,
+        output_tensors=self._nested_outputs)
 
     # Build self.input_names and self.output_names.
     self.input_names = []
@@ -251,10 +309,11 @@ class Network(base_layer.Layer):
     for layer in self._output_layers:
       self.output_names.append(layer.name)
 
-  @checkpointable.no_automatic_dependency_tracking
-  def _init_subclassed_network(self, name=None):
+  @trackable.no_automatic_dependency_tracking
+  def _init_subclassed_network(self, name=None, dynamic=False):
     self._base_init(name=name)
     self._is_graph_network = False
+    self._dynamic = dynamic
     call_argspec = tf_inspect.getfullargspec(self.call)
     if 'training' in call_argspec.args:
       self._expects_training_arg = True
@@ -266,10 +325,10 @@ class Network(base_layer.Layer):
     self.built = False
 
   @property
-  def _static_graph_friendly(self):
+  def dynamic(self):
     if self._is_graph_network:
-      return all(layer._static_graph_friendly for layer in self.layers)
-    return self._call_is_graph_friendly
+      return any(layer.dynamic for layer in self.layers)
+    return self._dynamic or any(layer.dynamic for layer in self.layers)
 
   def _determine_call_convention(self, call_argspec):
     """Decides how `self.call()` is invoked. See `CallConvention`."""
@@ -311,87 +370,46 @@ class Network(base_layer.Layer):
       return base_layer_utils.CallConvention.POSITIONAL_ARGUMENTS_ARE_INPUTS
 
   def _track_layers(self, layers):
-    """Add Checkpointable dependencies on a list of Layers."""
+    """Add Trackable dependencies on a list of Layers."""
     weight_layer_index = 0
     for layer_index, layer in enumerate(layers):
       if layer.weights:
         # Keep a separate index for layers which have weights. This allows users
         # to insert Layers without weights anywhere in the network without
         # breaking checkpoints.
-        self._track_checkpointable(
+        self._track_trackable(
             layer, name='layer_with_weights-%d' % weight_layer_index,
             overwrite=True)
         weight_layer_index += 1
       # Even if it doesn't have weights, we should still track everything in
-      # case it has/will have Checkpointable dependencies.
-      self._track_checkpointable(
+      # case it has/will have Trackable dependencies.
+      self._track_trackable(
           layer, name='layer-%d' % layer_index, overwrite=True)
 
-  def _no_dependency(self, value):
-    """Override to allow `Layer` to disable dependency tracking.
-
-    `CheckpointableBase` defines this method, whose semantics are "if a subclass
-    does dependency tracking, this method exempts `value`." Layer uses
-    `_no_dependency` to exempt some of its attribute assignments (conditional on
-    attribute assignment causing tracking in the subclass).
-
-    Args:
-      value: An object which will be assigned to an object attribute, whose
-        value should not be tracked.
-
-    Returns:
-      A wrapped object which, when assigned to an attribute, will not be
-      tracked (`value` will be stored in the attribute).
-    """
-    return data_structures.NoDependency(value)
-
   def __setattr__(self, name, value):
     if not getattr(self, '_setattr_tracking', True):
       super(Network, self).__setattr__(name, value)
       return
-    no_dependency = isinstance(value, data_structures.NoDependency)
-    value = data_structures.sticky_attribute_assignment(
-        checkpointable=self, value=value, name=name)
-    if (isinstance(value, (base_layer.Layer,
-                           Network,
-                           data_structures.CheckpointableDataStructure))
-        or checkpointable_layer_utils.has_weights(value)):
+
+    if all(
+        isinstance(v, (base_layer.Layer,
+                       data_structures.TrackableDataStructure)) or
+        trackable_layer_utils.has_weights(v) for v in nest.flatten(value)):
       try:
-        is_graph_network = self._is_graph_network
+        self._is_graph_network
       except AttributeError:
         raise RuntimeError('It looks like you are subclassing `Model` and you '
                            'forgot to call `super(YourClass, self).__init__()`.'
                            ' Always start with this line.')
-      if not is_graph_network:
-        # We need to check object identity to avoid de-duplicating empty
-        # container types which compare equal.
-        if not any((layer is value for layer in self._layers)):
-          self._layers.append(value)
-          if hasattr(value, '_use_resource_variables'):
-            # In subclassed models, legacy layers (tf.layers) must always use
-            # resource variables.
-            value._use_resource_variables = True
-    if (not no_dependency
-        and isinstance(value, checkpointable.CheckpointableBase)):
-      if (  # For subclassed models only, users may add extra weights/variables
-            # simply by assigning them to attributes.
-          not self._is_graph_network
-          and isinstance(value, variables.Variable)):
-        if value.trainable:
-          # Could already be added via `add_weight`.
-          if value not in self._trainable_weights:
-            self._trainable_weights.append(value)
-        else:
-          if value not in self._non_trainable_weights:
-            self._non_trainable_weights.append(value)
 
-    # Keeping track of metric instance created in subclassed model/layer.
+    super(Network, self).__setattr__(name, value)
+
+    # Keep track of metric instance created in subclassed model/layer.
     # We do this so that we can maintain the correct order of metrics by adding
     # the instance to the `metrics` list as soon as it is created.
     from tensorflow.python.keras import metrics as metrics_module  # pylint: disable=g-import-not-at-top
     if isinstance(value, metrics_module.Metric):
       self._metrics.append(value)
-    super(Network, self).__setattr__(name, value)
 
   @property
   def stateful(self):
@@ -452,18 +470,15 @@ class Network(base_layer.Layer):
     if not self._is_graph_network:
       return None
 
-    inputs = generic_utils.to_list(inputs)
-    if mask is None:
-      masks = [None for _ in range(len(inputs))]
-    else:
-      masks = generic_utils.to_list(mask)
-
-    _, output_masks = self._run_internal_graph(inputs, mask=masks)
-    return output_masks
+    # TODO(omalleyt): b/123540974 This function is not really safe to call
+    # by itself because it will duplicate any updates and losses in graph
+    # mode by `call`ing the Layers again.
+    output_tensors = self._run_internal_graph(inputs, mask=mask)
+    return nest.map_structure(lambda t: t._keras_mask, output_tensors)
 
   @property
   def layers(self):
-    return checkpointable_layer_utils.filter_empty_layer_containers(
+    return trackable_layer_utils.filter_empty_layer_containers(
         self._layers)
 
   def get_layer(self, name=None, index=None):
@@ -513,7 +528,12 @@ class Network(base_layer.Layer):
   @property
   def _unfiltered_losses(self):
     losses = []
-    if context.executing_eagerly():
+
+    # If any eager losses are present, we assume the model to be part of an
+    # eager training loop (either a custom one or the one used when
+    # `run_eagerly=True`), and so we always return just the eager losses in that
+    # case.
+    if self._eager_losses:
       losses.extend(self._eager_losses)
     else:
       losses.extend(self._losses)
@@ -524,7 +544,7 @@ class Network(base_layer.Layer):
         losses += layer.losses
     return losses
 
-  @checkpointable.no_automatic_dependency_tracking
+  @trackable.no_automatic_dependency_tracking
   def _clear_losses(self):
     """Used every step in eager to reset losses."""
     self._eager_losses = []
@@ -664,14 +684,14 @@ class Network(base_layer.Layer):
 
   @property
   def trainable_weights(self):
-    return checkpointable_layer_utils.gather_trainable_weights(
+    return trackable_layer_utils.gather_trainable_weights(
         trainable=self.trainable,
         sub_layers=self._layers,
         extra_variables=self._trainable_weights)
 
   @property
   def non_trainable_weights(self):
-    return checkpointable_layer_utils.gather_non_trainable_weights(
+    return trackable_layer_utils.gather_non_trainable_weights(
         trainable=self.trainable,
         sub_layers=self._layers,
         extra_variables=self._non_trainable_weights + self._trainable_weights)
@@ -708,7 +728,7 @@ class Network(base_layer.Layer):
         A list of `InputSpec` instances (one per input to the model)
             or a single instance if the model has only one input.
     """
-    # If not a graph network, can't assume anything.
+    # If subclassed model, can't assume anything.
     if not self._is_graph_network:
       return None
 
@@ -846,122 +866,79 @@ class Network(base_layer.Layer):
       raise NotImplementedError('When subclassing the `Model` class, you should'
                                 ' implement a `call` method.')
 
-    inputs = generic_utils.to_list(inputs)
-    if mask is None:
-      masks = [None for _ in range(len(inputs))]
-    else:
-      masks = generic_utils.to_list(mask)
-    outputs, _ = self._run_internal_graph(inputs,
-                                          training=training,
-                                          mask=masks)
-    return outputs
-
-  def _call_and_compute_mask(self, inputs, training=None, mask=None):
-    inputs = generic_utils.to_list(inputs)
-    if mask is None:
-      masks = [None for _ in range(len(inputs))]
-    else:
-      masks = generic_utils.to_list(mask)
-    return self._run_internal_graph(inputs,
-                                    training=training,
-                                    mask=masks)
+    return self._run_internal_graph(inputs, training=training, mask=mask)
 
   def compute_output_shape(self, input_shape):
     if not self._is_graph_network:
       return super(Network, self).compute_output_shape(input_shape)
 
-    if isinstance(input_shape, list):
-      input_shapes = []
-      for shape in input_shape:
-        if shape is not None:
-          input_shapes.append(tuple(tensor_shape.TensorShape(shape).as_list()))
-        else:
-          input_shapes.append(None)
-    else:
-      if input_shape is not None:
-        input_shapes = [tuple(tensor_shape.TensorShape(input_shape).as_list())]
-      else:
-        input_shapes = [None]
+    # Convert any shapes in tuple format to TensorShapes.
+    input_shape = tf_utils.convert_shapes(input_shape, to_tuples=False)
 
-    if len(input_shapes) != len(self._input_layers):
+    if len(nest.flatten(input_shape)) != len(nest.flatten(self._input_layers)):
       raise ValueError('Invalid input_shape argument ' + str(input_shape) +
                        ': model has ' + str(len(self._input_layers)) +
                        ' tensor inputs.')
 
-    cache_key = generic_utils.object_list_uid(input_shapes)
+    cache_key = generic_utils.object_list_uid(input_shape)
     if cache_key in self._output_shape_cache:
-      # Cache hit.
-      output_shapes = self._output_shape_cache[cache_key]
-    else:
-      layers_to_output_shapes = {}
-      for i in range(len(input_shapes)):
-        layer = self._input_layers[i]
-        input_shape = input_shapes[i]
-        # It's an input layer: then `compute_output_shape` is identity,
-        # and there is only one node and one tensor output.
-        shape_key = layer.name + '_0_0'
-        layers_to_output_shapes[shape_key] = input_shape
-
-      depth_keys = list(self._nodes_by_depth.keys())
-      depth_keys.sort(reverse=True)
-      # Iterate over nodes, by depth level.
-      if len(depth_keys) > 1:
-        for depth in depth_keys:
-          nodes = self._nodes_by_depth[depth]
-          for node in nodes:
-            # This is always a single layer, never a list.
-            layer = node.outbound_layer
-            if layer in self._input_layers:
-              # We've already covered the input layers
-              # a few lines above.
-              continue
-            # Potentially redundant list,
-            # same size as node.input_tensors.
-            input_shapes = []
-            for j in range(len(node.inbound_layers)):
-              inbound_layer = node.inbound_layers[j]
-              node_index = node.node_indices[j]
-              tensor_index = node.tensor_indices[j]
-              shape_key = inbound_layer.name + '_%s_%s' % (node_index,
-                                                           tensor_index)
-              input_shape = layers_to_output_shapes[shape_key]
-              input_shapes.append(input_shape)
-
-            if len(input_shapes) == 1:
-              output_shape = layer.compute_output_shape(input_shapes[0])
-            else:
-              output_shape = layer.compute_output_shape(input_shapes)
-            if isinstance(output_shape, list):
-              output_shapes = [
-                  tuple(tensor_shape.TensorShape(shape).as_list())
-                  for shape in output_shape
-              ]
-            else:
-              output_shapes = [
-                  tuple(tensor_shape.TensorShape(output_shape).as_list())
-              ]
-
-            node_index = layer._inbound_nodes.index(node)  # pylint: disable=protected-access
-            for j in range(len(output_shapes)):
-              shape_key = layer.name + '_%s_%s' % (node_index, j)
-              layers_to_output_shapes[shape_key] = output_shapes[j]
-
-        # Read final output shapes from layers_to_output_shapes.
-        output_shapes = []
-        for i in range(len(self._output_layers)):
-          layer, node_index, tensor_index = self._output_coordinates[i]
-          shape_key = layer.name + '_%s_%s' % (node_index, tensor_index)
-          output_shapes.append(layers_to_output_shapes[shape_key])
-        # Store in cache.
-        self._output_shape_cache[cache_key] = output_shapes
-
-    if isinstance(output_shapes, list):
-      if len(output_shapes) == 1:
-        return tensor_shape.TensorShape(output_shapes[0])
-      else:
-        return [tensor_shape.TensorShape(shape) for shape in output_shapes]
-    else:
-      return tensor_shape.TensorShape(output_shapes)
+      # Cache hit. Return shapes as TensorShapes.
+      return self._output_shape_cache[cache_key]
+
+    layers_to_output_shapes = {}
+    for layer, shape in zip(self._input_layers, nest.flatten(input_shape)):
+      # It's an input layer: then `compute_output_shape` is identity,
+      # and there is only one node and one tensor..
+      shape_key = layer.name + '_0_0'
+      layers_to_output_shapes[shape_key] = shape
+
+    depth_keys = list(self._nodes_by_depth.keys())
+    depth_keys.sort(reverse=True)
+    # Iterate over nodes, by depth level.
+    if len(depth_keys) > 1:
+      for depth in depth_keys:
+        nodes = self._nodes_by_depth[depth]
+        for node in nodes:
+          # This is always a single layer, never a list.
+          layer = node.outbound_layer
+          if layer in self._input_layers:
+            # We've already covered the input layers
+            # a few lines above.
+            continue
+          # Potentially redundant list,
+          # same size as node.input_tensors.
+          layer_input_shapes = []
+          for inbound_layer, node_id, tensor_id, _ in node.iterate_inbound():
+            input_layer_key = inbound_layer.name + '_%s_%s' % (node_id,
+                                                               tensor_id)
+            layer_input_shapes.append(layers_to_output_shapes[input_layer_key])
+          layer_input_shapes = nest.pack_sequence_as(node.inbound_layers,
+                                                     layer_input_shapes)
+          # Layers expect shapes to be tuples for `compute_output_shape`.
+          layer_input_shapes = tf_utils.convert_shapes(
+              layer_input_shapes, to_tuples=True)
+          layer_output_shapes = layer.compute_output_shape(layer_input_shapes)
+          # Convert back to TensorShapes.
+          layer_output_shapes = tf_utils.convert_shapes(
+              layer_output_shapes, to_tuples=False)
+
+          node_index = layer._inbound_nodes.index(node)  # pylint: disable=protected-access
+          for j, shape in enumerate(nest.flatten(layer_output_shapes)):
+            shape_key = layer.name + '_%s_%s' % (node_index, j)
+            layers_to_output_shapes[shape_key] = shape
+
+      # Read final output shapes from layers_to_output_shapes.
+      output_shapes = []
+      for i in range(len(self._output_layers)):
+        layer, node_index, tensor_index = self._output_coordinates[i]
+        shape_key = layer.name + '_%s_%s' % (node_index, tensor_index)
+        output_shapes.append(layers_to_output_shapes[shape_key])
+      output_shapes = nest.pack_sequence_as(self._nested_outputs, output_shapes)
+      # Store in cache.
+      self._output_shape_cache[cache_key] = output_shapes
+
+    # Return shapes as TensorShapes.
+    return output_shapes
 
   def _run_internal_graph(self, inputs, training=None, mask=None):
     """Computes output tensors for new inputs.
@@ -971,9 +948,9 @@ class Network(base_layer.Layer):
         - Can be run on non-Keras tensors.
 
     Arguments:
-        inputs: List of tensors
+        inputs: Tensor or nested structure of Tensors.
         training: Boolean learning phase.
-        mask: List of masks (tensors or None).
+        mask: (Optional) Tensor or nested structure of Tensors.
 
     Returns:
         Two lists: output_tensors, output_masks
@@ -985,138 +962,74 @@ class Network(base_layer.Layer):
     # the future and 2) Keras is a major user of Network.  If you don't
     # use masking, it does not interfere with regular behavior at all and you
     # can ignore it.
+    inputs = nest.flatten(inputs)
     if mask is None:
       masks = [None for _ in range(len(inputs))]
     else:
-      masks = mask
+      masks = nest.flatten(mask)
+
+    for input_t, mask in zip(inputs, masks):
+      input_t._keras_mask = mask
+
+    # Dictionary mapping reference tensors to computed tensors.
+    tensor_dict = {}
 
-    # Dictionary mapping reference tensors to tuples
-    # (computed tensor, compute mask)
-    # we assume a 1:1 mapping from tensor to mask
-    tensor_map = {}
     for x, y, mask in zip(self.inputs, inputs, masks):
-      tensor_map[str(id(x))] = (y, mask)
+      tensor_dict[str(id(x))] = y
 
     depth_keys = list(self._nodes_by_depth.keys())
     depth_keys.sort(reverse=True)
+    # Ignore the InputLayers when computing the graph.
+    depth_keys = depth_keys[1:]
+
     for depth in depth_keys:
       nodes = self._nodes_by_depth[depth]
       for node in nodes:
         # This is always a single layer, never a list.
         layer = node.outbound_layer
-        reference_input_tensors = node.input_tensors
-        reference_output_tensors = node.output_tensors
 
-        # If all previous input tensors are available in tensor_map,
-        # then call node.inbound_layer on them.
-        computed_data = []  # List of tuples (input, mask).
-        for x in reference_input_tensors:
-          if str(id(x)) in tensor_map:
-            computed_data.append(tensor_map[str(id(x))])
+        if all(
+            str(id(tensor)) in tensor_dict
+            for tensor in nest.flatten(node.input_tensors)):
 
-        if len(computed_data) == len(reference_input_tensors):
           # Call layer (reapplying ops to new inputs).
-          with ops.name_scope(layer.name):
-            if node.arguments:
-              kwargs = node.arguments
-            else:
-              kwargs = {}
-            # Ensure `training` arg propagation if applicable.
-            if 'training' in tf_inspect.getfullargspec(layer.call).args:
-              kwargs.setdefault('training', training)
-
-            if len(computed_data) == 1:
-              computed_tensor, computed_mask = computed_data[0]
-              # Ensure mask propagation if applicable.
-              if 'mask' in tf_inspect.getfullargspec(layer.call).args:
-                kwargs.setdefault('mask', computed_mask)
-
-              # Compute outputs and masks.
-              if (isinstance(layer, Network) and
-                  layer._compute_output_and_mask_jointly):
-                output_tensors, output_masks = layer._call_and_compute_mask(
-                    computed_tensor, **kwargs)
-              else:
-                if context.executing_eagerly():
-                  output_tensors = layer(computed_tensor, **kwargs)
-                else:
-                  output_tensors = layer.call(computed_tensor, **kwargs)
-                if hasattr(layer, 'compute_mask'):
-                  output_masks = layer.compute_mask(computed_tensor,
-                                                    computed_mask)
-                else:
-                  output_masks = [None for _ in output_tensors]
-              computed_tensors = [computed_tensor]
-
-            else:
-              computed_tensors = [x[0] for x in computed_data]
-              computed_masks = [x[1] for x in computed_data]
-              # Ensure mask propagation if applicable.
-              if 'mask' in tf_inspect.getfullargspec(layer.call).args:
-                kwargs.setdefault('mask', computed_masks)
-
-              # Compute outputs and masks.
-              if (isinstance(layer, Network) and
-                  layer._compute_output_and_mask_jointly):
-                output_tensors, output_masks = layer._call_and_compute_mask(
-                    computed_tensors, **kwargs)
-              else:
-                if context.executing_eagerly():
-                  output_tensors = layer(computed_tensors, **kwargs)
-                else:
-                  output_tensors = layer.call(computed_tensors, **kwargs)
-                if hasattr(layer, 'compute_mask'):
-                  output_masks = layer.compute_mask(computed_tensors,
-                                                    computed_masks)
-                else:
-                  output_masks = [None for _ in output_tensors]
-
-            output_tensors = generic_utils.to_list(output_tensors)
-            if output_masks is None:
-              output_masks = [None for _ in output_tensors]
-            else:
-              output_masks = generic_utils.to_list(output_masks)
-
-            if not context.executing_eagerly():
-              # Set mask metadata.
-              for x, m in zip(output_tensors, output_masks):
-                try:
-                  x._keras_mask = m
-                except AttributeError:
-                  pass
-
-              # Apply activity regularizer if any.
-              layer._handle_activity_regularization(computed_tensors,
-                                                    output_tensors)
-
-          # Update tensor_map.
-          for x, y, mask in zip(reference_output_tensors, output_tensors,
-                                output_masks):
-            tensor_map[str(id(x))] = (y, mask)
+          computed_tensors = nest.map_structure(
+              lambda t: tensor_dict[str(id(t))], node.input_tensors)
+
+          # Ensure `training` and `mask` arg propagation if applicable.
+          kwargs = node.arguments or {}
+          argspec = self._layer_call_argspecs[layer].args
+          if 'training' in argspec:
+            kwargs.setdefault('training', training)
+          if 'mask' in argspec:
+            computed_masks = nest.map_structure(lambda t: t._keras_mask,
+                                                computed_tensors)
+            kwargs.setdefault('mask', computed_masks)
+
+          # Compute outputs.
+          output_tensors = layer(computed_tensors, **kwargs)
+
+          # Update tensor_dict.
+          for x, y in zip(
+              nest.flatten(node.output_tensors), nest.flatten(output_tensors)):
+            tensor_dict[str(id(x))] = y
 
     output_tensors = []
-    output_masks = []
     output_shapes = []
     for x in self.outputs:
-      assert str(id(x)) in tensor_map, 'Could not compute output ' + str(x)
-      tensor, mask = tensor_map[str(id(x))]
-      output_shapes.append(backend.int_shape(x))
+      assert str(id(x)) in tensor_dict, 'Could not compute output ' + str(x)
+      tensor = tensor_dict[str(id(x))]
+      output_shapes.append(x.shape)
       output_tensors.append(tensor)
-      output_masks.append(mask)
-
-    if len(output_tensors) == 1:
-      output_tensors = output_tensors[0]
-      if output_shapes is not None:
-        output_shapes = output_shapes[0]
-      if output_masks is not None:
-        output_masks = output_masks[0]
 
     if output_shapes is not None:
-      input_shapes = [backend.int_shape(x) for x in inputs]
+      input_shapes = [x.shape for x in inputs]
       cache_key = generic_utils.object_list_uid(input_shapes)
-      self._output_shape_cache[cache_key] = output_shapes
+      self._output_shape_cache[cache_key] = nest.pack_sequence_as(
+          self._nested_outputs, output_shapes)
 
-    return output_tensors, output_masks
+    output_tensors = nest.pack_sequence_as(self._nested_outputs, output_tensors)
+    return output_tensors
 
   def get_config(self):
     if not self._is_graph_network:
@@ -1164,14 +1077,15 @@ class Network(base_layer.Layer):
             kwargs = {}
           if node.inbound_layers:
             node_data = []
-            for i in range(len(node.inbound_layers)):
-              inbound_layer = node.inbound_layers[i]
-              node_index = node.node_indices[i]
-              tensor_index = node.tensor_indices[i]
-              node_key = _make_node_key(inbound_layer.name, node_index)
+            for inbound_layer, node_id, tensor_id, _ in node.iterate_inbound():
+              node_key = _make_node_key(inbound_layer.name, node_id)
               new_node_index = node_conversion_map.get(node_key, 0)
               node_data.append(
-                  [inbound_layer.name, new_node_index, tensor_index, kwargs])
+                  tf_utils.ListWrapper(
+                      [inbound_layer.name, new_node_index, tensor_id, kwargs]))
+            node_data = nest.pack_sequence_as(node.input_tensors, node_data)
+            # Convert ListWrapper to list for backwards compatible configs.
+            node_data = tf_utils.convert_inner_node_data(node_data)
             filtered_inbound_nodes.append(node_data)
       layer_configs.append({
           'name': layer.name,
@@ -1189,8 +1103,12 @@ class Network(base_layer.Layer):
       if node_key not in self._network_nodes:
         continue
       new_node_index = node_conversion_map[node_key]
-      model_inputs.append([layer.name, new_node_index, tensor_index])
+      model_inputs.append(
+          tf_utils.ListWrapper([layer.name, new_node_index, tensor_index]))
+    model_inputs = nest.pack_sequence_as(self._nested_inputs, model_inputs)
+    model_inputs = tf_utils.convert_inner_node_data(model_inputs)
     config['input_layers'] = model_inputs
+
     model_outputs = []
     for i in range(len(self._output_layers)):
       layer, node_index, tensor_index = self._output_coordinates[i]
@@ -1198,7 +1116,10 @@ class Network(base_layer.Layer):
       if node_key not in self._network_nodes:
         continue
       new_node_index = node_conversion_map[node_key]
-      model_outputs.append([layer.name, new_node_index, tensor_index])
+      model_outputs.append(
+          tf_utils.ListWrapper([layer.name, new_node_index, tensor_index]))
+    model_outputs = nest.pack_sequence_as(self._nested_outputs, model_outputs)
+    model_outputs = tf_utils.convert_inner_node_data(model_outputs)
     config['output_layers'] = model_outputs
     return copy.deepcopy(config)
 
@@ -1240,13 +1161,14 @@ class Network(base_layer.Layer):
 
       Arguments:
           layer: layer instance.
-          node_data: node config dict.
+          node_data: Nested structure of `ListWrapper`.
 
       Raises:
-          ValueError: In case of improperly formatted `node_data` dict.
+          ValueError: In case of improperly formatted `node_data`.
       """
       input_tensors = []
-      for input_data in node_data:
+      for input_data in nest.flatten(node_data):
+        input_data = input_data.as_list()
         inbound_layer_name = input_data[0]
         inbound_node_index = input_data[1]
         inbound_tensor_index = input_data[2]
@@ -1256,20 +1178,22 @@ class Network(base_layer.Layer):
           kwargs = input_data[3]
         else:
           raise ValueError('Improperly formatted model config.')
-        if inbound_layer_name not in created_layers:
-          add_unprocessed_node(layer, node_data)
-          return
+
         inbound_layer = created_layers[inbound_layer_name]
         if len(inbound_layer._inbound_nodes) <= inbound_node_index:
           add_unprocessed_node(layer, node_data)
           return
         inbound_node = inbound_layer._inbound_nodes[inbound_node_index]
-        input_tensors.append(inbound_node.output_tensors[inbound_tensor_index])
+        input_tensors.append(
+            nest.flatten(inbound_node.output_tensors)[inbound_tensor_index])
+      input_tensors = nest.pack_sequence_as(node_data, input_tensors)
       # Call layer on its inputs, thus creating the node
       # and building the layer if needed.
-      if input_tensors:
-        if len(input_tensors) == 1:
-          layer(input_tensors[0], **kwargs)
+      if input_tensors is not None:
+        # Preserve compatibility with older configs.
+        flat_input_tensors = nest.flatten(input_tensors)
+        if len(flat_input_tensors) == 1:
+          layer(flat_input_tensors[0], **kwargs)
         else:
           layer(input_tensors, **kwargs)
 
@@ -1290,8 +1214,10 @@ class Network(base_layer.Layer):
       layer = deserialize_layer(layer_data, custom_objects=custom_objects)
       created_layers[layer_name] = layer
 
-      # Gather layer inputs.
+      # Gather layer inputs and convert to `ListWrapper` objects.
       inbound_nodes_data = layer_data['inbound_nodes']
+      inbound_nodes_data = tf_utils.convert_inner_node_data(
+          inbound_nodes_data, wrap=True)
       for node_data in inbound_nodes_data:
         # We don't process nodes (i.e. make layer calls)
         # on the fly because the inbound node may not yet exist,
@@ -1316,18 +1242,27 @@ class Network(base_layer.Layer):
     name = config.get('name')
     input_tensors = []
     output_tensors = []
-    for layer_data in config['input_layers']:
-      layer_name, node_index, tensor_index = layer_data
+
+    input_layers = tf_utils.convert_inner_node_data(
+        config['input_layers'], wrap=True)
+    for layer_data in nest.flatten(input_layers):
+      layer_name, node_index, tensor_index = layer_data.as_list()
       assert layer_name in created_layers
       layer = created_layers[layer_name]
       layer_output_tensors = layer._inbound_nodes[node_index].output_tensors
-      input_tensors.append(layer_output_tensors[tensor_index])
-    for layer_data in config['output_layers']:
-      layer_name, node_index, tensor_index = layer_data
+      input_tensors.append(nest.flatten(layer_output_tensors)[tensor_index])
+
+    output_layers = tf_utils.convert_inner_node_data(
+        config['output_layers'], wrap=True)
+    for layer_data in nest.flatten(output_layers):
+      layer_name, node_index, tensor_index = layer_data.as_list()
       assert layer_name in created_layers
       layer = created_layers[layer_name]
       layer_output_tensors = layer._inbound_nodes[node_index].output_tensors
-      output_tensors.append(layer_output_tensors[tensor_index])
+      output_tensors.append(nest.flatten(layer_output_tensors)[tensor_index])
+
+    input_tensors = nest.pack_sequence_as(input_layers, input_tensors)
+    output_tensors = nest.pack_sequence_as(output_layers, output_tensors)
     return cls(inputs=input_tensors, outputs=output_tensors, name=name)
 
   def save(self, filepath, overwrite=True, include_optimizer=True):
@@ -1368,7 +1303,10 @@ class Network(base_layer.Layer):
     """
     if not self._is_graph_network:
       raise NotImplementedError(
-          'Currently `save` requires model to be a graph network. Consider '
+          'The `save` method requires the model to be a Functional model or a '
+          'Sequential model. It does not work for subclassed models, '
+          'because such models are defined via the body of a Python method, '
+          'which isn\'t safely serializable. Consider '
           'using `save_weights`, in order to save the weights of the model.')
 
     from tensorflow.python.keras.models import save_model  # pylint: disable=g-import-not-at-top
@@ -1453,7 +1391,7 @@ class Network(base_layer.Layer):
         return
     if save_format == 'h5':
       with h5py.File(filepath, 'w') as f:
-        saving.save_weights_to_hdf5_group(f, self.layers)
+        hdf5_format.save_weights_to_hdf5_group(f, self.layers)
     else:
       if context.executing_eagerly():
         session = None
@@ -1461,7 +1399,7 @@ class Network(base_layer.Layer):
         session = backend.get_session()
       optimizer = getattr(self, 'optimizer', None)
       if (optimizer
-          and not isinstance(optimizer, checkpointable.CheckpointableBase)):
+          and not isinstance(optimizer, trackable.Trackable)):
         logging.warning(
             ('This model was compiled with a Keras optimizer (%s) but is being '
              'saved in TensorFlow format with `save_weights`. The model\'s '
@@ -1469,11 +1407,12 @@ class Network(base_layer.Layer):
              'the TensorFlow format the optimizer\'s state will not be '
              'saved.\n\nConsider using a TensorFlow optimizer from `tf.train`.')
             % (optimizer,))
-      self._checkpointable_saver.save(filepath, session=session)
+      self._trackable_saver.save(filepath, session=session)
       # Record this checkpoint so it's visible from tf.train.latest_checkpoint.
-      checkpoint_management.update_checkpoint_state(
+      checkpoint_management.update_checkpoint_state_internal(
           save_dir=os.path.dirname(filepath),
           model_checkpoint_path=filepath,
+          save_relative_paths=True,
           all_model_checkpoint_paths=[filepath])
 
   def load_weights(self, filepath, by_name=False):
@@ -1527,7 +1466,7 @@ class Network(base_layer.Layer):
         # The checkpoint is not readable in TensorFlow format. Try HDF5.
         save_format = 'h5'
     if save_format == 'tf':
-      status = self._checkpointable_saver.restore(filepath)
+      status = self._trackable_saver.restore(filepath)
       if by_name:
         raise NotImplementedError(
             'Weights may only be loaded based on topology into Models when '
@@ -1537,7 +1476,7 @@ class Network(base_layer.Layer):
         session = backend.get_session()
         # Restore existing variables (if any) immediately, and set up a
         # streaming restore for any variables created in the future.
-        checkpointable_utils.streaming_restore(status=status, session=session)
+        trackable_utils.streaming_restore(status=status, session=session)
       status.assert_nontrivial_match()
       return status
     if h5py is None:
@@ -1552,9 +1491,9 @@ class Network(base_layer.Layer):
       if 'layer_names' not in f.attrs and 'model_weights' in f:
         f = f['model_weights']
       if by_name:
-        saving.load_weights_from_hdf5_group_by_name(f, self.layers)
+        hdf5_format.load_weights_from_hdf5_group_by_name(f, self.layers)
       else:
-        saving.load_weights_from_hdf5_group(f, self.layers)
+        hdf5_format.load_weights_from_hdf5_group(f, self.layers)
 
   def _updated_config(self):
     """Util shared between different serialization methods.
@@ -1795,13 +1734,9 @@ def _map_graph_network(inputs, outputs):
     nodes_in_progress.add(node)
 
     # Propagate to all previous tensors connected to this node.
-    for i in range(len(node.inbound_layers)):
-      x = node.input_tensors[i]
-      layer = node.inbound_layers[i]
-      node_index = node.node_indices[i]
-      tensor_index = node.tensor_indices[i]
-      build_map(x, finished_nodes, nodes_in_progress, layer,
-                node_index, tensor_index)
+    for layer, node_index, tensor_index, tensor in node.iterate_inbound():
+      build_map(tensor, finished_nodes, nodes_in_progress, layer, node_index,
+                tensor_index)
 
     finished_nodes.add(node)
     nodes_in_progress.remove(node)
@@ -1833,9 +1768,7 @@ def _map_graph_network(inputs, outputs):
     # Update the depth of inbound nodes.
     # The "depth" of a node is the max of the depths
     # of all layers it is connected to.
-    for i in range(len(node.inbound_layers)):
-      inbound_layer = node.inbound_layers[i]
-      node_index = node.node_indices[i]
+    for inbound_layer, node_index, _, _ in node.iterate_inbound():
       inbound_node = inbound_layer._inbound_nodes[node_index]  # pylint: disable=protected-access
       previous_depth = nodes_depths.get(inbound_node, 0)
       nodes_depths[inbound_node] = max(depth + 1, previous_depth)
@@ -1883,7 +1816,7 @@ def _map_graph_network(inputs, outputs):
     for node in nodes_by_depth[depth]:
       layer = node.outbound_layer
       if layer:
-        for x in node.input_tensors:
+        for x in nest.flatten(node.input_tensors):
           if x not in computable_tensors:
             raise ValueError('Graph disconnected: '
                              'cannot obtain value for tensor ' + str(x) +
@@ -1891,7 +1824,7 @@ def _map_graph_network(inputs, outputs):
                              'The following previous layers '
                              'were accessed without issue: ' +
                              str(layers_with_complete_input))
-        for x in node.output_tensors:
+        for x in nest.flatten(node.output_tensors):
           computable_tensors.append(x)
         layers_with_complete_input.append(layer.name)
 
@@ -1904,3 +1837,4 @@ def _map_graph_network(inputs, outputs):
                        str(all_names.count(name)) + ' times in the model. '
                        'All layer names should be unique.')
   return network_nodes, nodes_by_depth, layers, layers_by_depth
+
diff --git a/tensorflow/python/keras/engine/partial_batch_padding_handler.py b/tensorflow/python/keras/engine/partial_batch_padding_handler.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3951ead6e1d75473d3847ca52a895e7f50aed3a
--- /dev/null
+++ b/tensorflow/python/keras/engine/partial_batch_padding_handler.py
@@ -0,0 +1,111 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utility object to handler partial batches for TPUStrategy."""
+# pylint: disable=protected-access
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import six
+
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.keras import backend as K
+from tensorflow.python.ops import array_ops
+from tensorflow.python.util import nest
+
+
+class PartialBatchPaddingHandler(object):
+  """A container that holds info about partial batches for `predict()`."""
+
+  def __init__(self, output_shape):
+    self.padded_batch_size = 0
+    self.padding_mask = array_ops.zeros(0)
+    self.output_shape = output_shape
+
+  def get_real_batch_size(self, dataset_batch):
+    """Returns the number of elements in a potentially partial batch."""
+    if isinstance(dataset_batch, (tuple, list)):
+      dataset_batch = dataset_batch[0]
+
+    assert nest.flatten(dataset_batch)
+
+    def _find_any_tensor(batch_features):
+      tensors = [
+          x for x in nest.flatten(batch_features) if tensor_util.is_tensor(x)
+      ]
+      if not tensors:
+        raise ValueError('Cannot find any Tensor in features dict.')
+      return tensors[0]
+
+    return K.cast(K.shape(_find_any_tensor(dataset_batch))[0],
+                  dtype='int64')
+
+  def update_mask(self, padding_mask, dataset_batch):
+    """Calculate and cache the amount of padding required for a batch."""
+    original_batch_size = self.get_real_batch_size(dataset_batch)
+    missing_count = self.padded_batch_size - original_batch_size
+    mask = K.concatenate([array_ops.ones(original_batch_size),
+                          array_ops.zeros(missing_count)], axis=0)
+    return K.concatenate([padding_mask, mask], axis=0)
+
+  def pad_batch(self, *dataset_batch_elements):
+    """Pads out the batch dimension of a tensor to the complete batch size."""
+    def _pad(batch):
+      """Helper function to pad nested data within each batch elements."""
+      padded_dict_batch = {}
+      if isinstance(batch, dict):
+        for key, value in six.iteritems(batch):
+          padded_dict_batch[key] = _pad(value)
+        return padded_dict_batch
+
+      rank = len(batch.shape)
+      assert rank > 0
+      missing_count = (self.padded_batch_size -
+                       self.get_real_batch_size(batch))
+      padding = K.stack([[0, missing_count]] + [[0, 0]] * (rank - 1))
+      return array_ops.pad(batch, padding, 'constant')
+
+    if len(dataset_batch_elements) == 1:
+      return _pad(dataset_batch_elements[0])
+
+    batch_elements = []
+    for batch_element in dataset_batch_elements:
+      batch_elements.append(_pad(batch_element))
+    return tuple(batch_elements)
+
+  def apply_mask(self, prediction_result):
+    """Removes prediction output that corresponds to padded input."""
+    padding_mask = K.get_value(self.padding_mask)
+    assert len(padding_mask.shape) == 1
+
+    if len(self.output_shape) == 1:
+      prediction = np.take(prediction_result,
+                           np.nonzero(
+                               padding_mask[:len(prediction_result)]),
+                           axis=0)
+      if prediction.shape[0] == 1:
+        prediction = np.squeeze(prediction, axis=0)
+      return prediction
+
+    else:
+      predictions = []
+      for i in range(len(self.output_shape)):
+        prediction = prediction_result[i]
+        prediction = np.take(prediction, np.nonzero(
+            padding_mask[:len(prediction)]), axis=0)
+        predictions.append(np.squeeze(prediction))
+
+      return predictions
diff --git a/tensorflow/python/keras/engine/saving.py b/tensorflow/python/keras/engine/saving.py
index 54d9e32fb258343dfd9b75351015959952893c1a..b4da86d98483c85c22d2beb0d285720fac58407d 100644
--- a/tensorflow/python/keras/engine/saving.py
+++ b/tensorflow/python/keras/engine/saving.py
@@ -14,941 +14,11 @@
 # ==============================================================================
 # pylint: disable=protected-access
 """Model saving utilities.
+
+Everything has been moved to keras/saving/. This file will be deleted soon.
 """
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import json
-import os
-
-import numpy as np
-from six.moves import zip  # pylint: disable=redefined-builtin
-
-from tensorflow.python.keras import backend as K
-from tensorflow.python.keras import optimizers
-from tensorflow.python.keras.utils import conv_utils
-from tensorflow.python.keras.utils.io_utils import ask_to_proceed_with_overwrite
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.util import serialization
-from tensorflow.python.util.tf_export import tf_export
-
-# pylint: disable=g-import-not-at-top
-try:
-  import h5py
-  HDF5_OBJECT_HEADER_LIMIT = 64512
-except ImportError:
-  h5py = None
-
-try:
-  import yaml
-except ImportError:
-  yaml = None
-# pylint: enable=g-import-not-at-top
-
-
-@tf_export('keras.models.save_model')
-def save_model(model, filepath, overwrite=True, include_optimizer=True):
-  """Saves a model to a HDF5 file.
-
-  The saved model contains:
-      - the model's configuration (topology)
-      - the model's weights
-      - the model's optimizer's state (if any)
-
-  Thus the saved model can be reinstantiated in
-  the exact same state, without any of the code
-  used for model definition or training.
-
-  Arguments:
-      model: Keras model instance to be saved.
-      filepath: One of the following:
-          - String, path where to save the model
-          - `h5py.File` object where to save the model
-      overwrite: Whether we should overwrite any existing
-          model at the target location, or instead
-          ask the user with a manual prompt.
-      include_optimizer: If True, save optimizer's state together.
-
-  Raises:
-      ImportError: if h5py is not available.
-  """
-
-  if h5py is None:
-    raise ImportError('`save_model` requires h5py.')
-
-  from tensorflow.python.keras import __version__ as keras_version  # pylint: disable=g-import-not-at-top
-
-  # TODO(psv) Add warning when we save models that contain non-serializable
-  # entities like metrics added using `add_metric` and losses added using
-  # `add_loss.`
-
-  if not isinstance(filepath, h5py.File):
-    # If file exists and should not be overwritten.
-    if not overwrite and os.path.isfile(filepath):
-      proceed = ask_to_proceed_with_overwrite(filepath)
-      if not proceed:
-        return
-
-    f = h5py.File(filepath, mode='w')
-    opened_new_file = True
-  else:
-    f = filepath
-    opened_new_file = False
-
-  try:
-    f.attrs['keras_version'] = str(keras_version).encode('utf8')
-    f.attrs['backend'] = K.backend().encode('utf8')
-    f.attrs['model_config'] = json.dumps(
-        {
-            'class_name': model.__class__.__name__,
-            'config': model.get_config()
-        },
-        default=serialization.get_json_type).encode('utf8')
-
-    model_weights_group = f.create_group('model_weights')
-    model_layers = model.layers
-    save_weights_to_hdf5_group(model_weights_group, model_layers)
-
-    if include_optimizer and model.optimizer:
-      if isinstance(model.optimizer, optimizers.TFOptimizer):
-        logging.warning(
-            'TensorFlow optimizers do not '
-            'make it possible to access '
-            'optimizer attributes or optimizer state '
-            'after instantiation. '
-            'As a result, we cannot save the optimizer '
-            'as part of the model save file.'
-            'You will have to compile your model again after loading it. '
-            'Prefer using a Keras optimizer instead '
-            '(see keras.io/optimizers).')
-      else:
-        f.attrs['training_config'] = json.dumps(
-            {
-                'optimizer_config': {
-                    'class_name': model.optimizer.__class__.__name__,
-                    'config': model.optimizer.get_config()
-                },
-                'loss': model.loss,
-                'metrics': model._compile_metrics,
-                'weighted_metrics': model._compile_weighted_metrics,
-                'sample_weight_mode': model.sample_weight_mode,
-                'loss_weights': model.loss_weights,
-            },
-            default=serialization.get_json_type).encode('utf8')
-
-        # Save optimizer weights.
-        symbolic_weights = getattr(model.optimizer, 'weights')
-        if symbolic_weights:
-          optimizer_weights_group = f.create_group('optimizer_weights')
-          weight_values = K.batch_get_value(symbolic_weights)
-          weight_names = []
-          for w, val in zip(symbolic_weights, weight_values):
-            name = str(w.name)
-            weight_names.append(name.encode('utf8'))
-          optimizer_weights_group.attrs['weight_names'] = weight_names
-          for name, val in zip(weight_names, weight_values):
-            param_dset = optimizer_weights_group.create_dataset(
-                name, val.shape, dtype=val.dtype)
-            if not val.shape:
-              # scalar
-              param_dset[()] = val
-            else:
-              param_dset[:] = val
-    f.flush()
-  finally:
-    if opened_new_file:
-      f.close()
-
-
-@tf_export('keras.models.load_model')
-def load_model(filepath, custom_objects=None, compile=True):  # pylint: disable=redefined-builtin
-  """Loads a model saved via `save_model`.
-
-  Arguments:
-      filepath: One of the following:
-          - String, path to the saved model
-          - `h5py.File` object from which to load the model
-      custom_objects: Optional dictionary mapping names
-          (strings) to custom classes or functions to be
-          considered during deserialization.
-      compile: Boolean, whether to compile the model
-          after loading.
-
-  Returns:
-      A Keras model instance. If an optimizer was found
-      as part of the saved model, the model is already
-      compiled. Otherwise, the model is uncompiled and
-      a warning will be displayed. When `compile` is set
-      to False, the compilation is omitted without any
-      warning.
-
-  Raises:
-      ImportError: if h5py is not available.
-      ValueError: In case of an invalid savefile.
-  """
-  if h5py is None:
-    raise ImportError('`load_model` requires h5py.')
-
-  if not custom_objects:
-    custom_objects = {}
-
-  def convert_custom_objects(obj):
-    """Handles custom object lookup.
-
-    Arguments:
-        obj: object, dict, or list.
-
-    Returns:
-        The same structure, where occurrences
-            of a custom object name have been replaced
-            with the custom object.
-    """
-    if isinstance(obj, list):
-      deserialized = []
-      for value in obj:
-        deserialized.append(convert_custom_objects(value))
-      return deserialized
-    if isinstance(obj, dict):
-      deserialized = {}
-      for key, value in obj.items():
-        deserialized[key] = convert_custom_objects(value)
-      return deserialized
-    if obj in custom_objects:
-      return custom_objects[obj]
-    return obj
-
-  opened_new_file = not isinstance(filepath, h5py.File)
-  if opened_new_file:
-    f = h5py.File(filepath, mode='r')
-  else:
-    f = filepath
-
-  model = None
-  try:
-    # instantiate model
-    model_config = f.attrs.get('model_config')
-    if model_config is None:
-      raise ValueError('No model found in config file.')
-    model_config = json.loads(model_config.decode('utf-8'))
-    model = model_from_config(model_config, custom_objects=custom_objects)
-
-    # set weights
-    load_weights_from_hdf5_group(f['model_weights'], model.layers)
-
-    if compile:
-      # instantiate optimizer
-      training_config = f.attrs.get('training_config')
-      if training_config is None:
-        logging.warning('No training configuration found in save file: '
-                        'the model was *not* compiled. Compile it manually.')
-        return model
-      training_config = json.loads(training_config.decode('utf-8'))
-      optimizer_config = training_config['optimizer_config']
-      optimizer = optimizers.deserialize(
-          optimizer_config, custom_objects=custom_objects)
-
-      # Recover loss functions and metrics.
-      loss = convert_custom_objects(training_config['loss'])
-      metrics = convert_custom_objects(training_config['metrics'])
-      weighted_metrics = convert_custom_objects(
-          training_config.get('weighted_metrics', None))
-      sample_weight_mode = training_config['sample_weight_mode']
-      loss_weights = training_config['loss_weights']
-
-      # Compile model.
-      model.compile(
-          optimizer=optimizer,
-          loss=loss,
-          metrics=metrics,
-          weighted_metrics=weighted_metrics,
-          loss_weights=loss_weights,
-          sample_weight_mode=sample_weight_mode)
-
-      # Set optimizer weights.
-      if 'optimizer_weights' in f:
-        # Build train function (to get weight updates).
-        # Models that aren't graph networks must wait until they are called
-        # with data to _make_train_function() and so can't load optimizer
-        # weights.
-        if model._is_graph_network:  # pylint: disable=protected-access
-          model._make_train_function()
-          optimizer_weights_group = f['optimizer_weights']
-          optimizer_weight_names = [
-              n.decode('utf8')
-              for n in optimizer_weights_group.attrs['weight_names']
-          ]
-          optimizer_weight_values = [
-              optimizer_weights_group[n] for n in optimizer_weight_names
-          ]
-          try:
-            model.optimizer.set_weights(optimizer_weight_values)
-          except ValueError:
-            logging.warning('Error in loading the saved optimizer '
-                            'state. As a result, your model is '
-                            'starting with a freshly initialized '
-                            'optimizer.')
-        else:
-          logging.warning('Sequential models without an `input_shape` '
-                          'passed to the first layer cannot reload their '
-                          'optimizer state. As a result, your model is'
-                          'starting with a freshly initialized optimizer.')
-
-  finally:
-    if opened_new_file:
-      f.close()
-  return model
-
-
-@tf_export('keras.models.model_from_config')
-def model_from_config(config, custom_objects=None):
-  """Instantiates a Keras model from its config.
-
-  Arguments:
-      config: Configuration dictionary.
-      custom_objects: Optional dictionary mapping names
-          (strings) to custom classes or functions to be
-          considered during deserialization.
-
-  Returns:
-      A Keras model instance (uncompiled).
-
-  Raises:
-      TypeError: if `config` is not a dictionary.
-  """
-  if isinstance(config, list):
-    raise TypeError('`model_from_config` expects a dictionary, not a list. '
-                    'Maybe you meant to use '
-                    '`Sequential.from_config(config)`?')
-  from tensorflow.python.keras.layers import deserialize  # pylint: disable=g-import-not-at-top
-  return deserialize(config, custom_objects=custom_objects)
-
-
-@tf_export('keras.models.model_from_yaml')
-def model_from_yaml(yaml_string, custom_objects=None):
-  """Parses a yaml model configuration file and returns a model instance.
-
-  Arguments:
-      yaml_string: YAML string encoding a model configuration.
-      custom_objects: Optional dictionary mapping names
-          (strings) to custom classes or functions to be
-          considered during deserialization.
-
-  Returns:
-      A Keras model instance (uncompiled).
-
-  Raises:
-      ImportError: if yaml module is not found.
-  """
-  if yaml is None:
-    raise ImportError('Requires yaml module installed (`pip install pyyaml`).')
-  config = yaml.load(yaml_string)
-  from tensorflow.python.keras.layers import deserialize  # pylint: disable=g-import-not-at-top
-  return deserialize(config, custom_objects=custom_objects)
-
-
-@tf_export('keras.models.model_from_json')
-def model_from_json(json_string, custom_objects=None):
-  """Parses a JSON model configuration file and returns a model instance.
-
-  Arguments:
-      json_string: JSON string encoding a model configuration.
-      custom_objects: Optional dictionary mapping names
-          (strings) to custom classes or functions to be
-          considered during deserialization.
-
-  Returns:
-      A Keras model instance (uncompiled).
-  """
-  config = json.loads(json_string)
-  from tensorflow.python.keras.layers import deserialize  # pylint: disable=g-import-not-at-top
-  return deserialize(config, custom_objects=custom_objects)
-
-
-def preprocess_weights_for_loading(layer,
-                                   weights,
-                                   original_keras_version=None,
-                                   original_backend=None):
-  """Preprocess layer weights between different Keras formats.
-
-  Converts layers weights from Keras 1 format to Keras 2 and also weights of
-  CuDNN layers in Keras 2.
-
-  Arguments:
-      layer: Layer instance.
-      weights: List of weights values (Numpy arrays).
-      original_keras_version: Keras version for the weights, as a string.
-      original_backend: Keras backend the weights were trained with,
-          as a string.
-
-  Returns:
-      A list of weights values (Numpy arrays).
-  """
-  def convert_nested_bidirectional(weights):
-    """Converts layers nested in `Bidirectional` wrapper.
-
-    This function uses `preprocess_weights_for_loading()` for converting
-    layers.
-
-    Arguments:
-        weights: List of weights values (Numpy arrays).
-
-    Returns:
-        A list of weights values (Numpy arrays).
-    """
-    num_weights_per_layer = len(weights) // 2
-    forward_weights = preprocess_weights_for_loading(
-        layer.forward_layer, weights[:num_weights_per_layer],
-        original_keras_version, original_backend)
-    backward_weights = preprocess_weights_for_loading(
-        layer.backward_layer, weights[num_weights_per_layer:],
-        original_keras_version, original_backend)
-    return forward_weights + backward_weights
-
-  def convert_nested_time_distributed(weights):
-    """Converts layers nested in `TimeDistributed` wrapper.
-
-    This function uses `preprocess_weights_for_loading()` for converting nested
-    layers.
-
-    Arguments:
-        weights: List of weights values (Numpy arrays).
-
-    Returns:
-        A list of weights values (Numpy arrays).
-    """
-    return preprocess_weights_for_loading(
-        layer.layer, weights, original_keras_version, original_backend)
-
-  def convert_nested_model(weights):
-    """Converts layers nested in `Model` or `Sequential`.
-
-    This function uses `preprocess_weights_for_loading()` for converting nested
-    layers.
-
-    Arguments:
-        weights: List of weights values (Numpy arrays).
-
-    Returns:
-        A list of weights values (Numpy arrays).
-    """
-    new_weights = []
-    # trainable weights
-    for sublayer in layer.layers:
-      num_weights = len(sublayer.trainable_weights)
-      if num_weights > 0:
-        new_weights.extend(preprocess_weights_for_loading(
-            layer=sublayer,
-            weights=weights[:num_weights],
-            original_keras_version=original_keras_version,
-            original_backend=original_backend))
-        weights = weights[num_weights:]
-
-    # non-trainable weights
-    for sublayer in layer.layers:
-      num_weights = len([l for l in sublayer.weights
-                         if l not in sublayer.trainable_weights])
-      if num_weights > 0:
-        new_weights.extend(preprocess_weights_for_loading(
-            layer=sublayer,
-            weights=weights[:num_weights],
-            original_keras_version=original_keras_version,
-            original_backend=original_backend))
-        weights = weights[num_weights:]
-    return new_weights
-
-  # Convert layers nested in Bidirectional/Model/Sequential.
-  # Both transformation should be ran for both Keras 1->2 conversion
-  # and for conversion of CuDNN layers.
-  if layer.__class__.__name__ == 'Bidirectional':
-    weights = convert_nested_bidirectional(weights)
-  if layer.__class__.__name__ == 'TimeDistributed':
-    weights = convert_nested_time_distributed(weights)
-  elif layer.__class__.__name__ in ['Model', 'Sequential']:
-    weights = convert_nested_model(weights)
-
-  if original_keras_version == '1':
-    if layer.__class__.__name__ == 'TimeDistributed':
-      weights = preprocess_weights_for_loading(
-          layer.layer, weights, original_keras_version, original_backend)
-
-    if layer.__class__.__name__ == 'Conv1D':
-      shape = weights[0].shape
-      # Handle Keras 1.1 format
-      if shape[:2] != (layer.kernel_size[0], 1) or shape[3] != layer.filters:
-        # Legacy shape:
-        # (filters, input_dim, filter_length, 1)
-        assert shape[0] == layer.filters and shape[2:] == (layer.kernel_size[0],
-                                                           1)
-        weights[0] = np.transpose(weights[0], (2, 3, 1, 0))
-      weights[0] = weights[0][:, 0, :, :]
-
-    if layer.__class__.__name__ == 'Conv2D':
-      if layer.data_format == 'channels_first':
-        # old: (filters, stack_size, kernel_rows, kernel_cols)
-        # new: (kernel_rows, kernel_cols, stack_size, filters)
-        weights[0] = np.transpose(weights[0], (2, 3, 1, 0))
-
-    if layer.__class__.__name__ == 'Conv2DTranspose':
-      if layer.data_format == 'channels_last':
-        # old: (kernel_rows, kernel_cols, stack_size, filters)
-        # new: (kernel_rows, kernel_cols, filters, stack_size)
-        weights[0] = np.transpose(weights[0], (0, 1, 3, 2))
-      if layer.data_format == 'channels_first':
-        # old: (filters, stack_size, kernel_rows, kernel_cols)
-        # new: (kernel_rows, kernel_cols, filters, stack_size)
-        weights[0] = np.transpose(weights[0], (2, 3, 0, 1))
-
-    if layer.__class__.__name__ == 'Conv3D':
-      if layer.data_format == 'channels_first':
-        # old: (filters, stack_size, ...)
-        # new: (..., stack_size, filters)
-        weights[0] = np.transpose(weights[0], (2, 3, 4, 1, 0))
-
-    if layer.__class__.__name__ == 'GRU':
-      if len(weights) == 9:
-        kernel = np.concatenate([weights[0], weights[3], weights[6]], axis=-1)
-        recurrent_kernel = np.concatenate(
-            [weights[1], weights[4], weights[7]], axis=-1)
-        bias = np.concatenate([weights[2], weights[5], weights[8]], axis=-1)
-        weights = [kernel, recurrent_kernel, bias]
-
-    if layer.__class__.__name__ == 'LSTM':
-      if len(weights) == 12:
-        # old: i, c, f, o
-        # new: i, f, c, o
-        kernel = np.concatenate(
-            [weights[0], weights[6], weights[3], weights[9]], axis=-1)
-        recurrent_kernel = np.concatenate(
-            [weights[1], weights[7], weights[4], weights[10]], axis=-1)
-        bias = np.concatenate(
-            [weights[2], weights[8], weights[5], weights[11]], axis=-1)
-        weights = [kernel, recurrent_kernel, bias]
-
-    if layer.__class__.__name__ == 'ConvLSTM2D':
-      if len(weights) == 12:
-        kernel = np.concatenate(
-            [weights[0], weights[6], weights[3], weights[9]], axis=-1)
-        recurrent_kernel = np.concatenate(
-            [weights[1], weights[7], weights[4], weights[10]], axis=-1)
-        bias = np.concatenate(
-            [weights[2], weights[8], weights[5], weights[11]], axis=-1)
-        if layer.data_format == 'channels_first':
-          # old: (filters, stack_size, kernel_rows, kernel_cols)
-          # new: (kernel_rows, kernel_cols, stack_size, filters)
-          kernel = np.transpose(kernel, (2, 3, 1, 0))
-          recurrent_kernel = np.transpose(recurrent_kernel, (2, 3, 1, 0))
-        weights = [kernel, recurrent_kernel, bias]
-
-  conv_layers = ['Conv1D', 'Conv2D', 'Conv3D', 'Conv2DTranspose', 'ConvLSTM2D']
-  if layer.__class__.__name__ in conv_layers:
-    if original_backend == 'theano':
-      weights[0] = conv_utils.convert_kernel(weights[0])
-      if layer.__class__.__name__ == 'ConvLSTM2D':
-        weights[1] = conv_utils.convert_kernel(weights[1])
-    if K.int_shape(layer.weights[0]) != weights[0].shape:
-      weights[0] = np.transpose(weights[0], (3, 2, 0, 1))
-      if layer.__class__.__name__ == 'ConvLSTM2D':
-        weights[1] = np.transpose(weights[1], (3, 2, 0, 1))
-
-  # convert CuDNN layers
-  return _convert_rnn_weights(layer, weights)
-
-
-def _convert_rnn_weights(layer, weights):
-  """Converts weights for RNN layers between native and CuDNN format.
-
-  Input kernels for each gate are transposed and converted between Fortran
-  and C layout, recurrent kernels are transposed. For LSTM biases are summed/
-  split in half, for GRU biases are reshaped.
-
-  Weights can be converted in both directions between `LSTM` and`CuDNNSLTM`
-  and between `CuDNNGRU` and `GRU(reset_after=True)`. Default `GRU` is not
-  compatible with `CuDNNGRU`.
-
-  For missing biases in `LSTM`/`GRU` (`use_bias=False`) no conversion is made.
-
-  Arguments:
-      layer: Target layer instance.
-      weights: List of source weights values (input kernels, recurrent
-          kernels, [biases]) (Numpy arrays).
-
-  Returns:
-      A list of converted weights values (Numpy arrays).
-
-  Raises:
-      ValueError: for incompatible GRU layer/weights or incompatible biases
-  """
-
-  def transform_kernels(kernels, func, n_gates):
-    """Transforms kernel for each gate separately using given function.
-
-    Arguments:
-        kernels: Stacked array of kernels for individual gates.
-        func: Function applied to kernel of each gate.
-        n_gates: Number of gates (4 for LSTM, 3 for GRU).
-
-    Returns:
-        Stacked array of transformed kernels.
-    """
-    return np.hstack([func(k) for k in np.hsplit(kernels, n_gates)])
-
-  def transpose_input(from_cudnn):
-    """Makes a function that transforms input kernels from/to CuDNN format.
-
-    It keeps the shape, but changes between the layout (Fortran/C). Eg.:
-
-    ```
-    Keras                 CuDNN
-    [[0, 1, 2],  <--->  [[0, 2, 4],
-     [3, 4, 5]]          [1, 3, 5]]
-    ```
-
-    It can be passed to `transform_kernels()`.
-
-    Arguments:
-        from_cudnn: `True` if source weights are in CuDNN format, `False`
-            if they're in plain Keras format.
-
-    Returns:
-        Function that converts input kernel to the other format.
-    """
-    order = 'F' if from_cudnn else 'C'
-
-    def transform(kernel):
-      return kernel.T.reshape(kernel.shape, order=order)
-
-    return transform
-
-  target_class = layer.__class__.__name__
-
-  # convert the weights between CuDNNLSTM and LSTM
-  if target_class in ['LSTM', 'CuDNNLSTM'] and len(weights) == 3:
-    # determine if we're loading a CuDNNLSTM layer
-    # from the number of bias weights:
-    # CuDNNLSTM has (units * 8) weights; while LSTM has (units * 4)
-    # if there's no bias weight in the file, skip this conversion
-    units = weights[1].shape[0]
-    bias_shape = weights[2].shape
-    n_gates = 4
-
-    if bias_shape == (2 * units * n_gates,):
-      source = 'CuDNNLSTM'
-    elif bias_shape == (units * n_gates,):
-      source = 'LSTM'
-    else:
-      raise ValueError('Invalid bias shape: ' + str(bias_shape))
-
-    def convert_lstm_weights(weights, from_cudnn=True):
-      """Converts the weights between CuDNNLSTM and LSTM.
-
-      Arguments:
-        weights: Original weights.
-        from_cudnn: Indicates whether original weights are from CuDNN layer.
-
-      Returns:
-        Updated weights compatible with LSTM.
-      """
-
-      # Transpose (and reshape) input and recurrent kernels
-      kernels = transform_kernels(weights[0], transpose_input(from_cudnn),
-                                  n_gates)
-      recurrent_kernels = transform_kernels(weights[1], lambda k: k.T, n_gates)
-      if from_cudnn:
-        # merge input and recurrent biases into a single set
-        biases = np.sum(np.split(weights[2], 2, axis=0), axis=0)
-      else:
-        # Split single set of biases evenly to two sets. The way of
-        # splitting doesn't matter as long as the two sets sum is kept.
-        biases = np.tile(0.5 * weights[2], 2)
-      return [kernels, recurrent_kernels, biases]
-
-    if source != target_class:
-      weights = convert_lstm_weights(weights, from_cudnn=source == 'CuDNNLSTM')
-
-  # convert the weights between CuDNNGRU and GRU(reset_after=True)
-  if target_class in ['GRU', 'CuDNNGRU'] and len(weights) == 3:
-    # We can determine the source of the weights from the shape of the bias.
-    # If there is no bias we skip the conversion since
-    # CuDNNGRU always has biases.
-
-    units = weights[1].shape[0]
-    bias_shape = weights[2].shape
-    n_gates = 3
-
-    def convert_gru_weights(weights, from_cudnn=True):
-      """Converts the weights between CuDNNGRU and GRU.
-
-      Arguments:
-        weights: Original weights.
-        from_cudnn: Indicates whether original weights are from CuDNN layer.
-
-      Returns:
-        Updated weights compatible with GRU.
-      """
-
-      kernels = transform_kernels(weights[0], transpose_input(from_cudnn),
-                                  n_gates)
-      recurrent_kernels = transform_kernels(weights[1], lambda k: k.T, n_gates)
-      biases = np.array(weights[2]).reshape((2, -1) if from_cudnn else -1)
-      return [kernels, recurrent_kernels, biases]
-
-    if bias_shape == (2 * units * n_gates,):
-      source = 'CuDNNGRU'
-    elif bias_shape == (2, units * n_gates):
-      source = 'GRU(reset_after=True)'
-    elif bias_shape == (units * n_gates,):
-      source = 'GRU(reset_after=False)'
-    else:
-      raise ValueError('Invalid bias shape: ' + str(bias_shape))
-
-    if target_class == 'CuDNNGRU':
-      target = 'CuDNNGRU'
-    elif layer.reset_after:
-      target = 'GRU(reset_after=True)'
-    else:
-      target = 'GRU(reset_after=False)'
-
-    # only convert between different types
-    if source != target:
-      types = (source, target)
-      if 'GRU(reset_after=False)' in types:
-        raise ValueError('%s is not compatible with %s' % types)
-      if source == 'CuDNNGRU':
-        weights = convert_gru_weights(weights, from_cudnn=True)
-      elif source == 'GRU(reset_after=True)':
-        weights = convert_gru_weights(weights, from_cudnn=False)
-
-  return weights
-
-
-def save_weights_to_hdf5_group(f, layers):
-  """Saves the weights of a list of layers to a HDF5 group.
-
-  Arguments:
-      f: HDF5 group.
-      layers: List of layer instances.
-  """
-  from tensorflow.python.keras import __version__ as keras_version  # pylint: disable=g-import-not-at-top
-
-  save_attributes_to_hdf5_group(
-      f, 'layer_names', [layer.name.encode('utf8') for layer in layers])
-  f.attrs['backend'] = K.backend().encode('utf8')
-  f.attrs['keras_version'] = str(keras_version).encode('utf8')
-
-  for layer in layers:
-    g = f.create_group(layer.name)
-    symbolic_weights = layer.weights
-    weight_values = K.batch_get_value(symbolic_weights)
-    weight_names = []
-    for i, (w, val) in enumerate(zip(symbolic_weights, weight_values)):
-      if hasattr(w, 'name') and w.name:
-        name = str(w.name)
-      else:
-        name = 'param_' + str(i)
-      weight_names.append(name.encode('utf8'))
-    save_attributes_to_hdf5_group(g, 'weight_names', weight_names)
-    for name, val in zip(weight_names, weight_values):
-      param_dset = g.create_dataset(name, val.shape, dtype=val.dtype)
-      if not val.shape:
-        # scalar
-        param_dset[()] = val
-      else:
-        param_dset[:] = val
-
-
-def load_weights_from_hdf5_group(f, layers):
-  """Implements topological (order-based) weight loading.
-
-  Arguments:
-      f: A pointer to a HDF5 group.
-      layers: a list of target layers.
-
-  Raises:
-      ValueError: in case of mismatch between provided layers
-          and weights file.
-  """
-  if 'keras_version' in f.attrs:
-    original_keras_version = f.attrs['keras_version'].decode('utf8')
-  else:
-    original_keras_version = '1'
-  if 'backend' in f.attrs:
-    original_backend = f.attrs['backend'].decode('utf8')
-  else:
-    original_backend = None
-
-  filtered_layers = []
-  for layer in layers:
-    weights = layer.weights
-    if weights:
-      filtered_layers.append(layer)
-
-  layer_names = load_attributes_from_hdf5_group(f, 'layer_names')
-  filtered_layer_names = []
-  for name in layer_names:
-    g = f[name]
-    weight_names = load_attributes_from_hdf5_group(g, 'weight_names')
-    if weight_names:
-      filtered_layer_names.append(name)
-  layer_names = filtered_layer_names
-  if len(layer_names) != len(filtered_layers):
-    raise ValueError('You are trying to load a weight file '
-                     'containing ' + str(len(layer_names)) +
-                     ' layers into a model with ' + str(len(filtered_layers)) +
-                     ' layers.')
-
-  # We batch weight value assignments in a single backend call
-  # which provides a speedup in TensorFlow.
-  weight_value_tuples = []
-  for k, name in enumerate(layer_names):
-    g = f[name]
-    weight_names = load_attributes_from_hdf5_group(g, 'weight_names')
-    weight_values = [np.asarray(g[weight_name]) for weight_name in weight_names]
-    layer = filtered_layers[k]
-    symbolic_weights = layer.weights
-    weight_values = preprocess_weights_for_loading(
-        layer, weight_values, original_keras_version, original_backend)
-    if len(weight_values) != len(symbolic_weights):
-      raise ValueError('Layer #' + str(k) + ' (named "' + layer.name +
-                       '" in the current model) was found to '
-                       'correspond to layer ' + name + ' in the save file. '
-                       'However the new layer ' + layer.name + ' expects ' +
-                       str(len(symbolic_weights)) +
-                       ' weights, but the saved weights have ' +
-                       str(len(weight_values)) + ' elements.')
-    weight_value_tuples += zip(symbolic_weights, weight_values)
-  K.batch_set_value(weight_value_tuples)
-
-
-def load_weights_from_hdf5_group_by_name(f, layers):
-  """Implements name-based weight loading.
-
-  (instead of topological weight loading).
-
-  Layers that have no matching name are skipped.
-
-  Arguments:
-      f: A pointer to a HDF5 group.
-      layers: a list of target layers.
-
-  Raises:
-      ValueError: in case of mismatch between provided layers
-          and weights file.
-  """
-  if 'keras_version' in f.attrs:
-    original_keras_version = f.attrs['keras_version'].decode('utf8')
-  else:
-    original_keras_version = '1'
-  if 'backend' in f.attrs:
-    original_backend = f.attrs['backend'].decode('utf8')
-  else:
-    original_backend = None
-
-  # New file format.
-  layer_names = load_attributes_from_hdf5_group(f, 'layer_names')
-
-  # Reverse index of layer name to list of layers with name.
-  index = {}
-  for layer in layers:
-    if layer.name:
-      index.setdefault(layer.name, []).append(layer)
-
-  # We batch weight value assignments in a single backend call
-  # which provides a speedup in TensorFlow.
-  weight_value_tuples = []
-  for k, name in enumerate(layer_names):
-    g = f[name]
-    weight_names = load_attributes_from_hdf5_group(g, 'weight_names')
-    weight_values = [np.asarray(g[weight_name]) for weight_name in weight_names]
-
-    for layer in index.get(name, []):
-      symbolic_weights = layer.weights
-      weight_values = preprocess_weights_for_loading(
-          layer, weight_values, original_keras_version, original_backend)
-      if len(weight_values) != len(symbolic_weights):
-        raise ValueError('Layer #' + str(k) + ' (named "' + layer.name +
-                         '") expects ' + str(len(symbolic_weights)) +
-                         ' weight(s), but the saved weights' + ' have ' +
-                         str(len(weight_values)) + ' element(s).')
-      # Set values.
-      for i in range(len(weight_values)):
-        if K.int_shape(symbolic_weights[i]) != weight_values[i].shape:
-          raise ValueError('Layer #' + str(k) +' (named "' + layer.name +
-                           '"), weight ' + str(symbolic_weights[i]) +
-                           ' has shape {}'.format(K.int_shape(
-                               symbolic_weights[i])) +
-                           ', but the saved weight has shape ' +
-                           str(weight_values[i].shape) + '.')
-
-        else:
-          weight_value_tuples.append((symbolic_weights[i], weight_values[i]))
-  K.batch_set_value(weight_value_tuples)
-
-
-def save_attributes_to_hdf5_group(group, name, data):
-  """Saves attributes (data) of the specified name into the HDF5 group.
-
-  This method deals with an inherent problem of HDF5 file which is not
-  able to store data larger than HDF5_OBJECT_HEADER_LIMIT bytes.
-
-  Arguments:
-      group: A pointer to a HDF5 group.
-      name: A name of the attributes to save.
-      data: Attributes data to store.
-
-  Raises:
-    RuntimeError: If any single attribute is too large to be saved.
-  """
-  # Check that no item in `data` is larger than `HDF5_OBJECT_HEADER_LIMIT`
-  # because in that case even chunking the array would not make the saving
-  # possible.
-  bad_attributes = [x for x in data if len(x) > HDF5_OBJECT_HEADER_LIMIT]
-
-  # Expecting this to never be true.
-  if bad_attributes:
-    raise RuntimeError('The following attributes cannot be saved to HDF5 '
-                       'file because they are larger than %d bytes: %s' %
-                       (HDF5_OBJECT_HEADER_LIMIT,
-                        ', '.join([x for x in bad_attributes])))
-
-  data_npy = np.asarray(data)
-
-  num_chunks = 1
-  chunked_data = np.array_split(data_npy, num_chunks)
-
-  # This will never loop forever thanks to the test above.
-  while any(x.nbytes > HDF5_OBJECT_HEADER_LIMIT for x in chunked_data):
-    num_chunks += 1
-    chunked_data = np.array_split(data_npy, num_chunks)
-
-  if num_chunks > 1:
-    for chunk_id, chunk_data in enumerate(chunked_data):
-      group.attrs['%s%d' % (name, chunk_id)] = chunk_data
-  else:
-    group.attrs[name] = data
-
-
-def load_attributes_from_hdf5_group(group, name):
-  """Loads attributes of the specified name from the HDF5 group.
-
-  This method deals with an inherent problem
-  of HDF5 file which is not able to store
-  data larger than HDF5_OBJECT_HEADER_LIMIT bytes.
-
-  Arguments:
-      group: A pointer to a HDF5 group.
-      name: A name of the attributes to load.
-
-  Returns:
-      data: Attributes data.
-  """
-  if name in group.attrs:
-    data = [n.decode('utf8') for n in group.attrs[name]]
-  else:
-    data = []
-    chunk_id = 0
-    while '%s%d' % (name, chunk_id) in group.attrs:
-      data.extend(
-          [n.decode('utf8') for n in group.attrs['%s%d' % (name, chunk_id)]])
-      chunk_id += 1
-  return data
+from tensorflow.python.keras.saving import *  # pylint: disable=wildcard-import
diff --git a/tensorflow/python/keras/engine/sequential.py b/tensorflow/python/keras/engine/sequential.py
index 3255613f6af07988e874339b96002355e39e6d14..6c8f5c2f3984f4a445e43dc12f5817b25a3d63a4 100644
--- a/tensorflow/python/keras/engine/sequential.py
+++ b/tensorflow/python/keras/engine/sequential.py
@@ -21,24 +21,21 @@ from __future__ import print_function
 
 import copy
 
-from tensorflow.python.eager import context
-from tensorflow.python.framework import ops
 from tensorflow.python.keras import layers as layer_module
 from tensorflow.python.keras.engine import base_layer
+from tensorflow.python.keras.engine import input_layer
+from tensorflow.python.keras.engine import training
 from tensorflow.python.keras.engine import training_utils
-from tensorflow.python.keras.engine.input_layer import Input
-from tensorflow.python.keras.engine.input_layer import InputLayer
-from tensorflow.python.keras.engine.network import Network
-from tensorflow.python.keras.engine.training import Model
 from tensorflow.python.keras.utils import layer_utils
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
+from tensorflow.python.util import nest
 from tensorflow.python.util import tf_inspect
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 
-@tf_export('keras.models.Sequential', 'keras.Sequential')
-class Sequential(Model):
+@keras_export('keras.models.Sequential', 'keras.Sequential')
+class Sequential(training.Model):
   """Linear stack of layers.
 
   Arguments:
@@ -86,8 +83,8 @@ class Sequential(Model):
   model.add(Dense(32))
   model.weights  # returns list of length 4
 
-  When using the delayed-build pattern (no input shape specified), you can
-  choose to manually build your model by calling `build(batch_input_shape)`:
+  # When using the delayed-build pattern (no input shape specified), you can
+  # choose to manually build your model by calling `build(batch_input_shape)`:
   model = Sequential()
   model.add(Dense(32))
   model.add(Dense(32))
@@ -96,13 +93,15 @@ class Sequential(Model):
   ```
   """
 
-  @checkpointable.no_automatic_dependency_tracking
+  @trackable.no_automatic_dependency_tracking
   def __init__(self, layers=None, name=None):
     super(Sequential, self).__init__(name=name)
     self.supports_masking = True
     self._build_input_shape = None
     self._compute_output_and_mask_jointly = True
 
+    self._layer_call_argspecs = {}
+
     # Add to the model any layers passed to the constructor.
     if layers:
       for layer in layers:
@@ -113,18 +112,18 @@ class Sequential(Model):
     # Historically, `sequential.layers` only returns layers that were added
     # via `add`, and omits the auto-generated `InputLayer` that comes at the
     # bottom of the stack.
-    # `CheckpointableBase` manages the `_layers` attributes and does filtering
+    # `Trackable` manages the `_layers` attributes and does filtering
     # over it.
     layers = super(Sequential, self).layers
-    if layers and isinstance(layers[0], InputLayer):
+    if layers and isinstance(layers[0], input_layer.InputLayer):
       return layers[1:]
     return layers[:]
 
   @property
-  def _static_graph_friendly(self):
-    return all(layer._static_graph_friendly for layer in self.layers)
+  def dynamic(self):
+    return any(layer.dynamic for layer in self.layers)
 
-  @checkpointable.no_automatic_dependency_tracking
+  @trackable.no_automatic_dependency_tracking
   def add(self, layer):
     """Adds a layer instance on top of the layer stack.
 
@@ -139,6 +138,14 @@ class Sequential(Model):
             multiple output tensors, or is already connected
             somewhere else (forbidden in `Sequential` models).
     """
+    # If we are passed a Keras tensor created by keras.Input(), we can extract
+    # the input layer from its keras history and use that without any loss of
+    # generality.
+    if hasattr(layer, '_keras_history'):
+      origin_layer = layer._keras_history[0]
+      if isinstance(origin_layer, input_layer.InputLayer):
+        layer = origin_layer
+
     if not isinstance(layer, base_layer.Layer):
       raise TypeError('The added layer must be '
                       'an instance of class Layer. '
@@ -146,18 +153,16 @@ class Sequential(Model):
     self.built = False
     set_inputs = False
     if not self._layers:
-      if isinstance(layer, InputLayer):
+      if isinstance(layer, input_layer.InputLayer):
         # Corner case where the user passes an InputLayer layer via `add`.
-        assert len(layer._inbound_nodes[-1].output_tensors) == 1
+        assert len(nest.flatten(layer._inbound_nodes[-1].output_tensors)) == 1
         set_inputs = True
       else:
         batch_shape, dtype = training_utils.get_input_shape_and_dtype(layer)
         if batch_shape:
           # Instantiate an input layer.
-          x = Input(
-              batch_shape=batch_shape,
-              dtype=dtype,
-              name=layer.name + '_input')
+          x = input_layer.Input(
+              batch_shape=batch_shape, dtype=dtype, name=layer.name + '_input')
           # This will build the current layer
           # and create the node connecting the current layer
           # to the input layer we just created.
@@ -166,12 +171,14 @@ class Sequential(Model):
 
       if set_inputs:
         # If an input layer (placeholder) is available.
-        if len(layer._inbound_nodes[-1].output_tensors) != 1:
+        if len(nest.flatten(layer._inbound_nodes[-1].output_tensors)) != 1:
           raise ValueError('All layers in a Sequential model '
                            'should have a single output tensor. '
                            'For multi-output layers, '
                            'use the functional API.')
-        self.outputs = [layer._inbound_nodes[-1].output_tensors[0]]
+        self.outputs = [
+            nest.flatten(layer._inbound_nodes[-1].output_tensors)[0]
+        ]
         self.inputs = layer_utils.get_source_inputs(self.outputs[0])
 
     elif self.outputs:
@@ -192,7 +199,9 @@ class Sequential(Model):
     if self._layers:
       self._track_layers(self._layers)
 
-  @checkpointable.no_automatic_dependency_tracking
+    self._layer_call_argspecs[layer] = tf_inspect.getfullargspec(layer.call)
+
+  @trackable.no_automatic_dependency_tracking
   def pop(self):
     """Removes the last layer in the model.
 
@@ -202,7 +211,8 @@ class Sequential(Model):
     if not self.layers:
       raise TypeError('There are no layers in the model.')
 
-    self._layers.pop()
+    layer = self._layers.pop()
+    self._layer_call_argspecs.pop(layer)
     if not self.layers:
       self.outputs = None
       self.inputs = None
@@ -225,42 +235,31 @@ class Sequential(Model):
       super(Sequential, self).build(input_shape)
     self.built = True
 
-  def call(self, inputs, training=None, mask=None):
+  def call(self, inputs, training=None, mask=None):  # pylint: disable=redefined-outer-name
     if self._is_graph_network:
+      if not self.built:
+        self._init_graph_network(self.inputs, self.outputs, name=self.name)
       return super(Sequential, self).call(inputs, training=training, mask=mask)
 
-    outputs, _ = self._call_and_compute_mask(
-        inputs, training=training, mask=mask)
-    return outputs
-
-  def _call_and_compute_mask(self, inputs, training=None, mask=None):
-    if not self.built and self._is_graph_network:
-      self._init_graph_network(self.inputs, self.outputs, name=self.name)
-
-    x = inputs
+    outputs = inputs  # handle the corner case where self.layers is empty
     for layer in self.layers:
+      # During each iteration, `inputs` are the inputs to `layer`, and `outputs`
+      # are the outputs of `layer` applied to `inputs`. At the end of each
+      # iteration `inputs` is set to `outputs` to prepare for the next layer.
       kwargs = {}
-      if 'mask' in tf_inspect.getfullargspec(layer.call).args:
+      argspec = self._layer_call_argspecs[layer].args
+      if 'mask' in argspec:
         kwargs['mask'] = mask
-      if 'training' in tf_inspect.getfullargspec(layer.call).args:
+      if 'training' in argspec:
         kwargs['training'] = training
 
-      if isinstance(layer, Network) and layer._compute_output_and_mask_jointly:
-        x, mask = layer._call_and_compute_mask(x, **kwargs)
-      else:
-        if not layer.built:
-          # Build layer if applicable.
-          with ops.name_scope(layer._name_scope()):
-            layer._maybe_build(x)
-          layer.built = True
-        x = layer.call(x, **kwargs)
-        if layer.supports_masking:
-          mask = layer.compute_mask(x, mask)
-        else:
-          mask = None
-      if not context.executing_eagerly():
-        x._keras_mask = mask
-    return x, mask
+      outputs = layer(inputs, **kwargs)
+
+      # `outputs` will be the inputs to the next layer.
+      inputs = outputs
+      mask = outputs._keras_mask
+
+    return outputs
 
   def compute_output_shape(self, input_shape):
     shape = input_shape
@@ -269,8 +268,11 @@ class Sequential(Model):
     return shape
 
   def compute_mask(self, inputs, mask):
-    _, mask = self._call_and_compute_mask(inputs, mask=mask)
-    return mask
+    # TODO(omalleyt): b/123540974 This function is not really safe to call
+    # by itself because it will duplicate any updates and losses in graph
+    # mode by `call`ing the Layers again.
+    outputs = self.call(inputs, mask=mask)
+    return outputs._keras_mask
 
   def predict_proba(self, x, batch_size=32, verbose=0):
     """Generates class probability predictions for the input samples.
diff --git a/tensorflow/python/keras/engine/sequential_test.py b/tensorflow/python/keras/engine/sequential_test.py
index 10f69da061c336cd1727ce4d34f1637e21329f3a..afd7d230f9a8e69ed45e374de90216580de5a367 100644
--- a/tensorflow/python/keras/engine/sequential_test.py
+++ b/tensorflow/python/keras/engine/sequential_test.py
@@ -30,7 +30,6 @@ from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
-from tensorflow.python.training import rmsprop
 
 
 class TestSequential(keras_parameterized.TestCase):
@@ -48,6 +47,18 @@ class TestSequential(keras_parameterized.TestCase):
     self.assertEqual(len(model.weights), 2 * 2)
     self.assertEqual(model.get_layer(name='dp').name, 'dp')
 
+  @keras_parameterized.run_all_keras_modes
+  def test_input_defined_first_layer(self):
+    model = keras.models.Sequential()
+    model.add(keras.Input(shape=(2,), name='input_layer'))
+    model.add(keras.layers.Dense(1))
+    model.add(keras.layers.Dropout(0.3, name='dp'))
+    model.add(keras.layers.Dense(2, kernel_regularizer='l2',
+                                 kernel_constraint='max_norm'))
+    self.assertLen(model.layers, 3)
+    self.assertLen(model.weights, 2 * 2)
+    self.assertEqual(model.get_layer(name='dp').name, 'dp')
+
   @keras_parameterized.run_all_keras_modes
   def test_sequential_pop(self):
     num_hidden = 5
@@ -57,16 +68,20 @@ class TestSequential(keras_parameterized.TestCase):
 
     model = testing_utils.get_small_sequential_mlp(
         num_hidden, num_classes, input_dim)
-    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3),
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        loss='mse',
+        optimizer='rmsprop',
+        run_eagerly=testing_utils.should_run_eagerly())
     x = np.random.random((batch_size, input_dim))
     y = np.random.random((batch_size, num_classes))
     model.fit(x, y, epochs=1)
     model.pop()
     self.assertEqual(len(model.layers), 1)
     self.assertEqual(model.output_shape, (None, num_hidden))
-    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3),
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        loss='mse',
+        optimizer='rmsprop',
+        run_eagerly=testing_utils.should_run_eagerly())
     y = np.random.random((batch_size, num_hidden))
     model.fit(x, y, epochs=1)
 
@@ -92,7 +107,7 @@ class TestSequential(keras_parameterized.TestCase):
     model = testing_utils.get_small_sequential_mlp(num_hidden, num_classes)
     model.compile(
         loss='mse',
-        optimizer=rmsprop.RMSPropOptimizer(1e-3),
+        optimizer='rmsprop',
         metrics=[keras.metrics.CategoricalAccuracy()],
         run_eagerly=testing_utils.should_run_eagerly())
     self.assertEqual(len(model.layers), 2)
@@ -117,7 +132,7 @@ class TestSequential(keras_parameterized.TestCase):
     model = testing_utils.get_small_sequential_mlp(num_hidden, num_classes)
     model.compile(
         loss='mse',
-        optimizer=rmsprop.RMSPropOptimizer(1e-3),
+        optimizer='rmsprop',
         metrics=[keras.metrics.CategoricalAccuracy()],
         run_eagerly=testing_utils.should_run_eagerly())
     self.assertEqual(len(model.layers), 2)
@@ -148,7 +163,7 @@ class TestSequential(keras_parameterized.TestCase):
         else:
           model = testing_utils.get_small_sequential_mlp(10, 4, input_dim=3)
         model.compile(
-            optimizer=rmsprop.RMSPropOptimizer(1e-3),
+            optimizer='rmsprop',
             loss='categorical_crossentropy',
             metrics=['accuracy'])
         return model
@@ -226,7 +241,6 @@ class TestSequential(keras_parameterized.TestCase):
     inner_model.trainable = True
     self.assertEqual(len(model.trainable_weights), 4)
 
-  @tf_test_util.run_v1_only('b/120545219')
   def test_sequential_update_disabling(self):
     val_a = np.random.random((10, 4))
     val_out = np.random.random((10, 4))
@@ -265,7 +279,7 @@ class TestSequential(keras_parameterized.TestCase):
     model = testing_utils.get_small_sequential_mlp(num_hidden, num_classes)
     model.compile(
         loss='mse',
-        optimizer=rmsprop.RMSPropOptimizer(1e-3),
+        optimizer='rmsprop',
         metrics=[keras.metrics.CategoricalAccuracy()],
         run_eagerly=testing_utils.should_run_eagerly())
     self.assertFalse(model.built)
@@ -305,14 +319,30 @@ class TestSequential(keras_parameterized.TestCase):
     self.assertTrue(model.built)
     self.assertEqual(len(model.weights), 8)
 
+  @keras_parameterized.run_all_keras_modes(always_skip_v1=True)
+  def test_sequential_deferred_manual_build(self):
+    model = testing_utils.get_small_sequential_mlp(4, 5)
+    self.assertFalse(model.built)
+    model(array_ops.zeros([1, 2]))
+    self.assertTrue(model.built)
+    self.assertEqual(len(model.outputs), 0)
+    model.compile('rmsprop',
+                  loss='mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
+    self.assertEqual(len(model.outputs), 0)
+    model.train_on_batch(np.zeros((1, 2)), np.zeros((1, 5)))
+    self.assertEqual(len(model.outputs), 1)
+
   @keras_parameterized.run_all_keras_modes
   def test_sequential_nesting(self):
     model = testing_utils.get_small_sequential_mlp(4, 3)
     inner_model = testing_utils.get_small_sequential_mlp(4, 5)
     model.add(inner_model)
 
-    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3),
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        loss='mse',
+        optimizer='rmsprop',
+        run_eagerly=testing_utils.should_run_eagerly())
     x = np.random.random((2, 6))
     y = np.random.random((2, 5))
     model.fit(x, y, epochs=1)
@@ -353,8 +383,10 @@ class TestSequentialEagerIntegration(keras_parameterized.TestCase):
     model.add(keras.layers.Dense(4, activation='relu'))
     model.add(keras.layers.Dense(5, activation='softmax'))
 
-    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3),
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        loss='mse',
+        optimizer='rmsprop',
+        run_eagerly=testing_utils.should_run_eagerly())
 
     x = np.random.random((2, 6))
     y = np.random.random((2, 5))
@@ -364,8 +396,10 @@ class TestSequentialEagerIntegration(keras_parameterized.TestCase):
   def test_build_before_fit(self):
     # Fix for b/112433577
     model = testing_utils.get_small_sequential_mlp(4, 5)
-    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3),
-                  run_eagerly=testing_utils.should_run_eagerly())
+    model.compile(
+        loss='mse',
+        optimizer='rmsprop',
+        run_eagerly=testing_utils.should_run_eagerly())
 
     model.build((None, 6))
 
@@ -379,7 +413,7 @@ class TestSequentialEagerIntegration(keras_parameterized.TestCase):
     model = testing_utils.get_small_sequential_mlp(
         num_hidden=10, num_classes=num_classes)
     model.compile(
-        rmsprop.RMSPropOptimizer(learning_rate=0.001),
+        'rmsprop',
         metrics=['acc'],
         weighted_metrics=['mae'],
         loss='categorical_crossentropy',
diff --git a/tensorflow/python/keras/engine/topology_test.py b/tensorflow/python/keras/engine/topology_test.py
index 4071e2c091eede29af9418105e63c157ce2dc101..951988d852fe361a6b50b558b64169150bba6f53 100644
--- a/tensorflow/python/keras/engine/topology_test.py
+++ b/tensorflow/python/keras/engine/topology_test.py
@@ -26,13 +26,14 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.engine import input_layer as input_layer_lib
 from tensorflow.python.keras.engine import network as network_lib
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import test
-from tensorflow.python.training import rmsprop
 
 try:
   import yaml  # pylint:disable=g-import-not-at-top
@@ -40,7 +41,7 @@ except ImportError:
   yaml = None
 
 
-class TopologyConstructionTest(test.TestCase):
+class TopologyConstructionTest(keras_parameterized.TestCase):
 
   @test_util.run_deprecated_v1
   def test_get_updates(self):
@@ -107,7 +108,7 @@ class TopologyConstructionTest(test.TestCase):
     self.assertEqual(len(network.updates), 5)
     self.assertEqual(len(network.get_updates_for(x4)), 2)
 
-  @test_util.run_v1_only('b/120545219')
+  @test_util.run_in_graph_and_eager_modes()
   def test_get_updates_bn(self):
     x1 = input_layer_lib.Input(shape=(1,))
     layer = keras.layers.BatchNormalization()
@@ -180,6 +181,7 @@ class TopologyConstructionTest(test.TestCase):
     self.assertEqual(len(network.losses), 5)
     self.assertEqual(len(network.get_losses_for(x4)), 2)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testTopologicalAttributes(self):
     # test layer attributes / methods related to cross-layer connectivity.
     a = input_layer_lib.Input(shape=(32,), name='input_a')
@@ -237,6 +239,7 @@ class TopologyConstructionTest(test.TestCase):
       b_2 = dense(b)
       _ = new_dense.output_shape
 
+  @test_util.run_in_graph_and_eager_modes()
   def testTopologicalAttributesMultiOutputLayer(self):
 
     class PowersLayer(keras.layers.Layer):
@@ -253,6 +256,7 @@ class TopologyConstructionTest(test.TestCase):
     self.assertEqual(test_layer.input_shape, (None, 32))
     self.assertEqual(test_layer.output_shape, [(None, 32), (None, 32)])
 
+  @test_util.run_in_graph_and_eager_modes()
   def testTopologicalAttributesMultiInputLayer(self):
 
     class AddLayer(keras.layers.Layer):
@@ -304,6 +308,7 @@ class TopologyConstructionTest(test.TestCase):
     self.assertEqual(network.non_trainable_weights,
                      dense.trainable_weights + dense.non_trainable_weights)
 
+  @test_util.run_in_graph_and_eager_modes
   def test_trainable_weights(self):
     a = keras.layers.Input(shape=(2,))
     b = keras.layers.Dense(1)(a)
@@ -353,17 +358,17 @@ class TopologyConstructionTest(test.TestCase):
     x = keras.layers.Dropout(0.5)(x, training=True)
     model = keras.models.Model(inp, x)
     # Would be `dropout/cond/Merge` by default
-    self.assertTrue(model.output.op.name.endswith('dropout/mul'))
+    self.assertTrue(model.output.op.name.endswith('dropout/mul_1'))
 
     # Test that argument is kept when applying the model
     inp2 = keras.layers.Input(shape=(2,))
     out2 = model(inp2)
-    self.assertTrue(out2.op.name.endswith('dropout/mul'))
+    self.assertTrue(out2.op.name.endswith('dropout/mul_1'))
 
     # Test that argument is kept after loading a model
     config = model.get_config()
     model = keras.models.Model.from_config(config)
-    self.assertTrue(model.output.op.name.endswith('dropout/mul'))
+    self.assertTrue(model.output.op.name.endswith('dropout/mul_1'))
 
   def test_node_construction(self):
     # test basics
@@ -395,12 +400,12 @@ class TopologyConstructionTest(test.TestCase):
 
     self.assertEqual(len(dense._inbound_nodes), 2)
     self.assertEqual(len(dense._outbound_nodes), 0)
-    self.assertListEqual(dense._inbound_nodes[0].inbound_layers, [a_layer])
+    self.assertEqual(dense._inbound_nodes[0].inbound_layers, a_layer)
     self.assertEqual(dense._inbound_nodes[0].outbound_layer, dense)
-    self.assertListEqual(dense._inbound_nodes[1].inbound_layers, [b_layer])
+    self.assertEqual(dense._inbound_nodes[1].inbound_layers, b_layer)
     self.assertEqual(dense._inbound_nodes[1].outbound_layer, dense)
-    self.assertListEqual(dense._inbound_nodes[0].input_tensors, [a])
-    self.assertListEqual(dense._inbound_nodes[1].input_tensors, [b])
+    self.assertEqual(dense._inbound_nodes[0].input_tensors, a)
+    self.assertEqual(dense._inbound_nodes[1].input_tensors, b)
 
     # test layer properties
     test_layer = keras.layers.Dense(16, name='test_layer')
@@ -424,6 +429,7 @@ class TopologyConstructionTest(test.TestCase):
     self.assertEqual(dense.get_output_mask_at(0), None)
     self.assertEqual(dense.get_output_mask_at(1), None)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_multi_input_layer(self):
     with self.cached_session():
       # test multi-input layer
@@ -558,6 +564,7 @@ class TopologyConstructionTest(test.TestCase):
       fn_outputs = fn([input_a_np, input_b_np])
       self.assertListEqual([x.shape for x in fn_outputs], [(10, 7), (10, 64)])
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_multi_input_multi_output_recursion(self):
     with self.cached_session():
       # test multi-input multi-output
@@ -631,6 +638,7 @@ class TopologyConstructionTest(test.TestCase):
         yaml_str = model.to_yaml()
         keras.models.model_from_yaml(yaml_str)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_invalid_graphs(self):
     a = keras.layers.Input(shape=(32,), name='input_a')
     b = keras.layers.Input(shape=(32,), name='input_b')
@@ -720,6 +728,7 @@ class TopologyConstructionTest(test.TestCase):
     x = keras.layers.Input(tensor=x)
     keras.layers.Dense(2)(x)
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_basic_masking(self):
     a = keras.layers.Input(shape=(10, 32), name='input_a')
     b = keras.layers.Masking()(a)
@@ -786,121 +795,138 @@ class TopologyConstructionTest(test.TestCase):
     loss = model_b.evaluate(x)
     self.assertEqual(loss, 4.)
 
+  @keras_parameterized.run_all_keras_modes
   def test_layer_sharing_at_heterogenous_depth(self):
-    with self.cached_session():
-      x_val = np.random.random((10, 5))
+    x_val = np.random.random((10, 5))
 
-      x = input_layer_lib.Input(shape=(5,))
-      a = keras.layers.Dense(5, name='A')
-      b = keras.layers.Dense(5, name='B')
-      output = a(b(a(b(x))))
-      m = keras.models.Model(x, output)
+    x = input_layer_lib.Input(shape=(5,))
+    a = keras.layers.Dense(5, name='A')
+    b = keras.layers.Dense(5, name='B')
+    output = a(b(a(b(x))))
+    m = keras.models.Model(x, output)
+    m.run_eagerly = testing_utils.should_run_eagerly()
 
-      output_val = m.predict(x_val)
+    output_val = m.predict(x_val)
 
-      config = m.get_config()
-      weights = m.get_weights()
+    config = m.get_config()
+    weights = m.get_weights()
 
-      m2 = keras.models.Model.from_config(config)
-      m2.set_weights(weights)
+    m2 = keras.models.Model.from_config(config)
+    m2.set_weights(weights)
 
-      output_val_2 = m2.predict(x_val)
-      self.assertAllClose(output_val, output_val_2, atol=1e-6)
+    output_val_2 = m2.predict(x_val)
+    self.assertAllClose(output_val, output_val_2, atol=1e-6)
 
+  @keras_parameterized.run_all_keras_modes
   def test_layer_sharing_at_heterogenous_depth_with_concat(self):
-    with self.cached_session():
-      input_shape = (16, 9, 3)
-      input_layer = input_layer_lib.Input(shape=input_shape)
+    input_shape = (16, 9, 3)
+    input_layer = input_layer_lib.Input(shape=input_shape)
 
-      a = keras.layers.Dense(3, name='dense_A')
-      b = keras.layers.Dense(3, name='dense_B')
-      c = keras.layers.Dense(3, name='dense_C')
+    a = keras.layers.Dense(3, name='dense_A')
+    b = keras.layers.Dense(3, name='dense_B')
+    c = keras.layers.Dense(3, name='dense_C')
 
-      x1 = b(a(input_layer))
-      x2 = a(c(input_layer))
-      output = keras.layers.concatenate([x1, x2])
+    x1 = b(a(input_layer))
+    x2 = a(c(input_layer))
+    output = keras.layers.concatenate([x1, x2])
 
-      m = keras.models.Model(inputs=input_layer, outputs=output)
+    m = keras.models.Model(inputs=input_layer, outputs=output)
+    m.run_eagerly = testing_utils.should_run_eagerly()
 
-      x_val = np.random.random((10, 16, 9, 3))
-      output_val = m.predict(x_val)
+    x_val = np.random.random((10, 16, 9, 3))
+    output_val = m.predict(x_val)
 
-      config = m.get_config()
-      weights = m.get_weights()
+    config = m.get_config()
+    weights = m.get_weights()
 
-      m2 = keras.models.Model.from_config(config)
-      m2.set_weights(weights)
+    m2 = keras.models.Model.from_config(config)
+    m2.set_weights(weights)
 
-      output_val_2 = m2.predict(x_val)
-      self.assertAllClose(output_val, output_val_2, atol=1e-6)
+    output_val_2 = m2.predict(x_val)
+    self.assertAllClose(output_val, output_val_2, atol=1e-6)
 
-  @test_util.run_v1_only('b/120545219')
+  @keras_parameterized.run_all_keras_modes
   def test_explicit_training_argument(self):
-    with self.cached_session():
-      a = keras.layers.Input(shape=(2,))
-      b = keras.layers.Dropout(0.5)(a)
-      base_model = keras.models.Model(a, b)
-
-      a = keras.layers.Input(shape=(2,))
-      b = base_model(a, training=False)
-      model = keras.models.Model(a, b)
-
-      x = np.ones((100, 2))
-      y = np.ones((100, 2))
-      model.compile(optimizer='sgd', loss='mse')
-      loss = model.train_on_batch(x, y)
-      self.assertEqual(loss, 0)  # In inference mode, output is equal to input.
-
-      a = keras.layers.Input(shape=(2,))
-      b = base_model(a, training=True)
-      model = keras.models.Model(a, b)
-      preds = model.predict(x)
-      self.assertEqual(np.min(preds), 0.)  # At least one unit was dropped.
+    a = keras.layers.Input(shape=(2,))
+    b = keras.layers.Dropout(0.5)(a)
+    base_model = keras.models.Model(a, b)
 
-  def test_multi_output_model_with_none_masking(self):
+    a = keras.layers.Input(shape=(2,))
+    b = base_model(a, training=False)
+    model = keras.models.Model(a, b)
 
-    with self.cached_session():
+    x = np.ones((100, 2))
+    y = np.ones((100, 2))
+    model.compile(
+        optimizer='sgd',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
+    loss = model.train_on_batch(x, y)
+    self.assertEqual(loss, 0)  # In inference mode, output is equal to input.
+
+    a = keras.layers.Input(shape=(2,))
+    b = base_model(a, training=True)
+    model = keras.models.Model(a, b)
+    preds = model.predict(x)
+    self.assertEqual(np.min(preds), 0.)  # At least one unit was dropped.
 
-      def func(x):
-        return [x * 0.2, x * 0.3]
+  @keras_parameterized.run_all_keras_modes
+  def test_multi_output_model_with_none_masking(self):
+    def func(x):
+      return [x * 0.2, x * 0.3]
 
-      def output_shape(input_shape):
-        return [input_shape, input_shape]
+    def output_shape(input_shape):
+      return [input_shape, input_shape]
 
-      i = keras.layers.Input(shape=(3, 2, 1))
-      o = keras.layers.Lambda(function=func, output_shape=output_shape)(i)
+    i = keras.layers.Input(shape=(3, 2, 1))
+    o = keras.layers.Lambda(function=func, output_shape=output_shape)(i)
 
-      self.assertEqual(keras.backend.int_shape(o[0]), (None, 3, 2, 1))
-      self.assertEqual(keras.backend.int_shape(o[1]), (None, 3, 2, 1))
+    self.assertEqual(keras.backend.int_shape(o[0]), (None, 3, 2, 1))
+    self.assertEqual(keras.backend.int_shape(o[1]), (None, 3, 2, 1))
 
-      o = keras.layers.add(o)
-      model = keras.Model(i, o)
+    o = keras.layers.add(o)
+    model = keras.Model(i, o)
+    model.run_eagerly = testing_utils.should_run_eagerly()
 
-      i2 = keras.layers.Input(shape=(3, 2, 1))
-      o2 = model(i2)
-      model2 = keras.Model(i2, o2)
+    i2 = keras.layers.Input(shape=(3, 2, 1))
+    o2 = model(i2)
+    model2 = keras.Model(i2, o2)
+    model2.run_eagerly = testing_utils.should_run_eagerly()
 
-      x = np.random.random((4, 3, 2, 1))
-      out = model2.predict(x)
-      assert out.shape == (4, 3, 2, 1)
-      self.assertAllClose(out, x * 0.2 + x * 0.3, atol=1e-4)
+    x = np.random.random((4, 3, 2, 1))
+    out = model2.predict(x)
+    assert out.shape == (4, 3, 2, 1)
+    self.assertAllClose(out, x * 0.2 + x * 0.3, atol=1e-4)
 
+  @keras_parameterized.run_all_keras_modes
   def test_constant_initializer_with_numpy(self):
+    initializer = keras.initializers.Constant(np.ones((3, 2)))
+    model = keras.models.Sequential()
+    model.add(
+        keras.layers.Dense(2, input_shape=(3,), kernel_initializer=initializer))
+    model.add(keras.layers.Dense(3))
+    model.compile(
+        loss='mse',
+        optimizer='sgd',
+        metrics=['acc'],
+        run_eagerly=testing_utils.should_run_eagerly())
 
-    with self.cached_session():
-      initializer = keras.initializers.Constant(np.ones((3, 2)))
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(2, input_shape=(3,),
-                                   kernel_initializer=initializer))
-      model.add(keras.layers.Dense(3))
-      model.compile(loss='mse', optimizer='sgd', metrics=['acc'])
+    json_str = model.to_json()
+    keras.models.model_from_json(json_str)
 
-      json_str = model.to_json()
-      keras.models.model_from_json(json_str)
+    if yaml is not None:
+      yaml_str = model.to_yaml()
+      keras.models.model_from_yaml(yaml_str)
 
-      if yaml is not None:
-        yaml_str = model.to_yaml()
-        keras.models.model_from_yaml(yaml_str)
+  def test_subclassed_error_if_init_not_called(self):
+
+    class MyNetwork(network_lib.Network):
+
+      def __init__(self):
+        self._foo = [keras.layers.Dense(10), keras.layers.Dense(10)]
+
+    with self.assertRaisesRegexp(RuntimeError, 'forgot to call'):
+      MyNetwork()
 
 
 class DeferredModeTest(test.TestCase):
@@ -929,7 +955,7 @@ class DeferredModeTest(test.TestCase):
       self.assertEqual(outputs.shape.as_list(), [10, 4])
 
   @test_util.run_in_graph_and_eager_modes()
-  def testMultiIONetworkbuilding(self):
+  def testMultiIONetworkBuilding(self):
     input_a = input_layer_lib.Input(shape=(32,))
     input_b = input_layer_lib.Input(shape=(16,))
     a = keras.layers.Dense(16)(input_a)
@@ -954,7 +980,7 @@ class DeferredModeTest(test.TestCase):
       self.assertEqual(outputs[1].shape.as_list(), [10, 2])
 
 
-class DefaultShapeInferenceBehaviorTest(test.TestCase):
+class DefaultShapeInferenceBehaviorTest(keras_parameterized.TestCase):
 
   def _testShapeInference(self, model, input_shape, expected_output_shape):
     input_value = np.random.random(input_shape)
@@ -1122,7 +1148,7 @@ class DefaultShapeInferenceBehaviorTest(test.TestCase):
     output = model(sample_input)
     self.assertEqual(output.shape, (1, 3))
 
-  @test_util.run_in_graph_and_eager_modes()
+  @keras_parameterized.run_all_keras_modes
   def test_sequential_as_downstream_of_masking_layer(self):
     inputs = keras.layers.Input(shape=(3, 4))
     x = keras.layers.Masking(mask_value=0., input_shape=(3, 4))(inputs)
@@ -1132,7 +1158,10 @@ class DefaultShapeInferenceBehaviorTest(test.TestCase):
 
     x = keras.layers.wrappers.TimeDistributed(s)(x)
     model = keras.Model(inputs=inputs, outputs=x)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(1e-3), loss='mse')
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
 
     model_input = np.random.randint(
         low=1, high=5, size=(10, 3, 4)).astype('float32')
@@ -1182,5 +1211,80 @@ class GraphUtilsTest(test.TestCase):
           {x_3, x_5, x_5.op})
 
 
+@test_util.run_all_in_graph_and_eager_modes
+class NestedNetworkTest(test.TestCase):
+
+  def test_nested_inputs_network(self):
+    inputs = {'x1': keras.Input(shape=(1,)), 'x2': keras.Input(shape=(1,))}
+    outputs = keras.layers.Add()([inputs['x1'], inputs['x2']])
+    network = keras.engine.network.Network(inputs, outputs)
+
+    network = keras.engine.network.Network.from_config(network.get_config())
+
+    result_tensor = network({
+        'x': array_ops.ones((1, 1), 'float32'),
+        'y': array_ops.ones((1, 1), 'float32')
+    })
+    result = self.evaluate(result_tensor)
+    self.assertAllEqual(result, [[2.]])
+
+    # TODO(b/122726584): Investigate why concrete batch is flaky in some builds.
+    output_shape = network.compute_output_shape({
+        'x1': (None, 1),
+        'x2': (None, 1)
+    })
+    self.assertListEqual(output_shape.as_list(), [None, 1])
+
+  def test_nested_outputs_network(self):
+    inputs = keras.Input(shape=(1,))
+    outputs = {
+        'x+x': keras.layers.Add()([inputs, inputs]),
+        'x*x': keras.layers.Multiply()([inputs, inputs])
+    }
+
+    network = keras.engine.network.Network(inputs, outputs)
+
+    network = keras.engine.network.Network.from_config(network.get_config())
+
+    result_tensor = network(array_ops.ones((1, 1), 'float32'))
+    result = self.evaluate(result_tensor)
+    self.assertAllEqual(result['x+x'], [[2.]])
+    self.assertAllEqual(result['x*x'], [[1.]])
+
+    output_shape = network.compute_output_shape((None, 1))
+    self.assertListEqual(output_shape['x+x'].as_list(), [None, 1])
+    self.assertListEqual(output_shape['x*x'].as_list(), [None, 1])
+
+  def test_nested_network_inside_network(self):
+    inner_inputs = {
+        'x1': keras.Input(shape=(1,)),
+        'x2': keras.Input(shape=(1,))
+    }
+    inner_outputs = {
+        'x1+x2':
+            keras.layers.Add()([inner_inputs['x1'], inner_inputs['x2']]),
+        'x1*x2':
+            keras.layers.Multiply()([inner_inputs['x1'], inner_inputs['x2']])
+    }
+    inner_network = keras.engine.network.Network(inner_inputs, inner_outputs)
+
+    inputs = [keras.Input(shape=(1,)), keras.Input(shape=(1,))]
+    middle = inner_network({'x1': inputs[0], 'x2': inputs[1]})
+    outputs = keras.layers.Add()([middle['x1+x2'], middle['x1*x2']])
+    network = keras.engine.network.Network(inputs, outputs)
+
+    network = keras.engine.network.Network.from_config(network.get_config())
+
+    # Computes: `(x1+x2) + (x1*x2)`
+    result_tensor = network(
+        [array_ops.ones((1, 1), 'float32'),
+         array_ops.ones((1, 1), 'float32')])
+    result = self.evaluate(result_tensor)
+    self.assertAllEqual(result, [[3.]])
+
+    output_shape = network.compute_output_shape([(None, 1), (None, 1)])
+    self.assertListEqual(output_shape.as_list(), [None, 1])
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py
index 462694fda690fbaa2d1474b9b1ddba558a84e201..ee5baf4553ea9657c68e99c03a86703b255b425a 100644
--- a/tensorflow/python/keras/engine/training.py
+++ b/tensorflow/python/keras/engine/training.py
@@ -19,13 +19,14 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
-import weakref
 import numpy as np
 
+from tensorflow.python import tf2
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.distribute import distribute_coordinator as dc
+from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.eager import context
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
@@ -40,18 +41,19 @@ from tensorflow.python.keras.engine import training_eager
 from tensorflow.python.keras.engine import training_generator
 from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.engine.network import Network
+from tensorflow.python.keras.saving import saving_utils
 from tensorflow.python.keras.utils import data_utils
+from tensorflow.python.keras.utils import losses_utils
 from tensorflow.python.keras.utils.generic_utils import slice_arrays
-from tensorflow.python.keras.utils.losses_utils import squeeze_or_expand_dimensions
+from tensorflow.python.keras.utils.mode_keys import ModeKeys
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import optimizer as tf_optimizer_module
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import nest
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 
-@tf_export('keras.models.Model', 'keras.Model')
+@keras_export('keras.models.Model', 'keras.Model')
 class Model(Network):
   """`Model` groups layers into an object with training and inference features.
 
@@ -118,539 +120,208 @@ class Model(Network):
 
   def __init__(self, *args, **kwargs):
     super(Model, self).__init__(*args, **kwargs)
-    # Create a cache for iterator get_next op.
-    self._iterator_get_next = weakref.WeakKeyDictionary()
-    # Create a cache for dataset - uninitialized iterators
-    self._dataset_iterator_cache = weakref.WeakKeyDictionary()
     # initializing _distribution_strategy here since it is possible to call
     # predict on a model without compiling it.
     self._distribution_strategy = None
+    # This flag is used to track if the user is using the deprecated path of
+    # passing distribution strategy to compile rather than creating the model
+    # under distribution strategy scope.
+    self._compile_distribution = False
 
     self.run_eagerly = None
 
-  def _set_sample_weight_attributes(self, sample_weight_mode,
-                                    skip_target_weighing_indices):
-    """Sets sample weight related attributes on the model."""
-    sample_weights, sample_weight_modes = training_utils.prepare_sample_weights(
-        self.output_names, sample_weight_mode, skip_target_weighing_indices)
-    self.sample_weights = sample_weights
-    self.sample_weight_modes = sample_weight_modes
-    self._feed_sample_weight_modes = [
-        sample_weight_modes[i]
-        for i in range(len(self.outputs))
-        if i not in skip_target_weighing_indices
-    ]
-    self._feed_sample_weights = [
-        sample_weights[i]
-        for i in range(len(sample_weights))
-        if i not in skip_target_weighing_indices
-    ]
-
-  def _cache_output_metric_attributes(self, metrics, weighted_metrics):
-    """Caches metric name and function attributes for every model output."""
-    output_shapes = [
-        None if output is None else output.get_shape().as_list()
-        for output in self.outputs
-    ]
-    self._per_output_metrics = training_utils.collect_per_output_metric_info(
-        metrics, self.output_names, output_shapes, self.loss_functions)
-    self._per_output_weighted_metrics = \
-        training_utils.collect_per_output_metric_info(
-            weighted_metrics, self.output_names, output_shapes,
-            self.loss_functions, self.sample_weights)
+  def get_weights(self):
+    """Retrieves the weights of the model.
 
-  def _add_unique_metric_name(self, metric_name, output_index):
-    """Makes the metric name unique and adds it to the model's metric name list.
+    Returns:
+        A flat list of Numpy arrays.
+    """
+    if self._distribution_strategy:
+      with self._distribution_strategy.scope():
+        return super(Model, self).get_weights()
+    return super(Model, self).get_weights()
 
-      If there are multiple outputs for which the metrics are calculated, the
-      metric names have to be made unique by appending an integer.
+  @trackable.no_automatic_dependency_tracking
+  def compile(self,
+              optimizer,
+              loss=None,
+              metrics=None,
+              loss_weights=None,
+              sample_weight_mode=None,
+              weighted_metrics=None,
+              target_tensors=None,
+              distribute=None,
+              **kwargs):
+    """Configures the model for training.
 
     Arguments:
-      metric_name: Metric name that corresponds to the metric specified by the
-          user. For example: 'acc'.
-      output_index: The index of the model output for which the metric name is
-        being added.
+        optimizer: String (name of optimizer) or optimizer instance.
+            See `tf.keras.optimizers`.
+        loss: String (name of objective function), objective function or
+            `tf.losses.Loss` instance. See `tf.losses`. If the model has
+            multiple outputs, you can use a different loss on each output by
+            passing a dictionary or a list of losses. The loss value that will
+            be minimized by the model will then be the sum of all individual
+            losses.
+        metrics: List of metrics to be evaluated by the model during training
+            and testing. Typically you will use `metrics=['accuracy']`.
+            To specify different metrics for different outputs of a
+            multi-output model, you could also pass a dictionary, such as
+            `metrics={'output_a': 'accuracy', 'output_b': ['accuracy', 'mse']}`.
+            You can also pass a list (len = len(outputs)) of lists of metrics
+            such as `metrics=[['accuracy'], ['accuracy', 'mse']]` or
+            `metrics=['accuracy', ['accuracy', 'mse']]`.
+        loss_weights: Optional list or dictionary specifying scalar
+            coefficients (Python floats) to weight the loss contributions
+            of different model outputs.
+            The loss value that will be minimized by the model
+            will then be the *weighted sum* of all individual losses,
+            weighted by the `loss_weights` coefficients.
+            If a list, it is expected to have a 1:1 mapping
+            to the model's outputs. If a tensor, it is expected to map
+            output names (strings) to scalar coefficients.
+        sample_weight_mode: If you need to do timestep-wise
+            sample weighting (2D weights), set this to `"temporal"`.
+            `None` defaults to sample-wise weights (1D).
+            If the model has multiple outputs, you can use a different
+            `sample_weight_mode` on each output by passing a
+            dictionary or a list of modes.
+        weighted_metrics: List of metrics to be evaluated and weighted
+            by sample_weight or class_weight during training and testing.
+        target_tensors: By default, Keras will create placeholders for the
+            model's target, which will be fed with the target data during
+            training. If instead you would like to use your own
+            target tensors (in turn, Keras will not expect external
+            Numpy data for these targets at training time), you
+            can specify them via the `target_tensors` argument. It can be
+            a single tensor (for a single-output model), a list of tensors,
+            or a dict mapping output names to target tensors.
+        distribute: NOT SUPPORTED IN TF 2.0, please create and compile the
+            model under distribution strategy scope instead of passing it to
+            compile.
+        **kwargs: Any additional arguments.
 
-    Returns:
-      string, name of the model's unique metric name
+    Raises:
+        ValueError: In case of invalid arguments for
+            `optimizer`, `loss`, `metrics` or `sample_weight_mode`.
     """
-    if len(self.output_names) > 1:
-      metric_name = '%s_%s' % (self.output_names[output_index], metric_name)
-    j = 1
-    base_metric_name = metric_name
-    while metric_name in self._compile_metrics_names:
-      metric_name = '%s_%d' % (base_metric_name, j)
-      j += 1
-
-    return metric_name
-
-  @property
-  def metrics(self):
-    """Returns the model's metrics added using `compile`, `add_metric` APIs."""
-    metrics = []
-    if self._is_compiled:
-      metrics += self._compile_stateful_metric_functions
-    return metrics + super(Model, self).metrics
+    run_eagerly = kwargs.pop('run_eagerly', None)
+    self._run_eagerly = run_eagerly
+    optimizer = optimizers.get(optimizer)
 
-  @property
-  def metrics_names(self):
-    """Returns the model's display labels for all outputs."""
-    metrics_names = []
-    if self._is_compiled:
-      metrics_names += self._compile_metrics_names  # Includes names of losses.
+    if distribute is not None:
+      if tf2.enabled():
+        raise ValueError(
+            'Distribute argument in compile is not available in TF 2.0 please '
+            'create the model under the distribution strategy scope.')
+      logging.warning('Distribute argument in compile is deprecated please '
+                      'create the model under the distribution strategy scope.')
+      self._distribution_strategy = distribute
+      self._compile_distribution = True
+    else:
+      if distribution_strategy_context.has_strategy():
+        # When the user builds the model in the DS scope and cross replica
+        # context we want distribution strategy to be set but when building the
+        # replica copies of the models internally we should not be compiling
+        # with distribution strategy and use the default compilation path.
+        if distribution_strategy_context.in_cross_replica_context():
+          self._distribution_strategy = (
+              distribution_strategy_context.get_strategy())
 
-    # Add metric names from layers.
-    for layer in self.layers:
-      metrics_names += [m.name for m in layer._metrics]  # pylint: disable=protected-access
-    metrics_names += [m.name for m in self._metrics]
-    return metrics_names
+    # Validate that arguments passed by the user to `compile` are supported by
+    # DistributionStrategy.
+    if self._distribution_strategy:
+      if sample_weight_mode:
+        raise NotImplementedError('sample_weight_mode is not supported with '
+                                  'DistributionStrategy.')
+      if weighted_metrics:
+        raise NotImplementedError('weighted_metrics is not supported with '
+                                  'DistributionStrategy.')
+      if target_tensors:
+        raise ValueError('target_tensors is not supported with '
+                         'DistributionStrategy.')
 
-  @property
-  def _all_metrics_tensors(self):
-    """Returns the network's symbolic metric tensors."""
-    metrics_tensors = {}
-    if self._is_compiled:
-      metrics_tensors.update(self._compile_metrics_tensors)
-    metrics_tensors.update(super(Model, self)._all_metrics_tensors)
-    return metrics_tensors
+    loss = loss or {}
 
-  @property
-  def _all_stateful_metrics_tensors(self):
-    """Returns the network's symbolic metric tensors."""
-    metrics_tensors = {}
-    if self._is_compiled:
-      metrics_tensors.update(self._compile_stateful_metrics_tensors)
-    metrics_tensors.update(super(Model, self)._all_metrics_tensors)
-    return metrics_tensors
+    self.optimizer = optimizer
+    # We've disabled automatic dependency tracking for this method, but do want
+    # to add a checkpoint dependency on the optimizer if it's trackable.
+    if isinstance(self.optimizer, trackable.Trackable):
+      self._track_trackable(
+          self.optimizer, name='optimizer', overwrite=True)
+    self.loss = loss
+    self._compile_metrics = metrics or []
+    self.loss_weights = loss_weights
+    self.sample_weight_mode = sample_weight_mode
+    self._compile_weighted_metrics = weighted_metrics
+    if self.run_eagerly and target_tensors is not None:
+      raise ValueError(
+          'target_tensors argument is not supported when '
+          'running a model eagerly.')
+    self.target_tensors = target_tensors
 
-  def _init_metric_attributes(self):
-    """Initialized model metric attributes."""
-    # List of all metric names in the model.
-    self._compile_metrics_names = ['loss']
-    # List of stateful metric functions. Used for resetting metric state during
-    # training/eval.
-    # This includes loss functions when there are multiple outputs.
-    self._compile_stateful_metric_functions = []
-    # Dict of all aggregated metric result tensors. This includes aggregated
-    # loss result tensors when there are multiple outputs.
-    self._compile_stateful_metrics_tensors = {}
-    # Dict of all metric result tensors (aggregated or not - based on the
-    # values given in compile.). This includes aggregated loss result tensors
-    # when there are multiple outputs.
-    self._compile_metrics_tensors = {}
+    # Set DistributionStrategy specific parameters.
+    self._distributed_model_cache = {}
 
-  def _set_per_output_metric_attributes(self, metrics_dict, output_index):
-    """Sets the metric attributes on the model for the given output.
+    if self._distribution_strategy is not None:
+      # Ensures a Session is created and configured correctly for Distribution
+      # Strategy.
+      K.configure_and_create_distributed_session(self._distribution_strategy)
+    # Initialize model metric attributes.
+    self._init_metric_attributes()
+    if not self.built or not self.inputs or not self.outputs:
+      # Model is not compilable because it does not know its number of inputs
+      # and outputs, nor their shapes and names. We will compile after the first
+      # time the model gets called on training data.
+      return
+    self._is_compiled = True
 
-    Arguments:
-      metrics_dict: A dict with metric names as keys and metric fns as values.
-      output_index: The index of the model output for which the metric
-        attributes are added.
+    # Prepare list of loss functions, same size of model outputs.
+    self.loss_functions = training_utils.prepare_loss_functions(
+        loss, self.output_names)
 
-    Returns:
-      Metrics dict updated with unique metric names as keys.
-    """
-    updated_metrics_dict = collections.OrderedDict()
-    for metric_name, (metric_fn, stateful_metric_fn) in metrics_dict.items():
-      metric_name = self._add_unique_metric_name(metric_name, output_index)
-      updated_metrics_dict[metric_name] = (metric_fn, stateful_metric_fn)
-      # Keep track of metric name, function and stateful function.
-      self._compile_metrics_names.append(metric_name)
-      self._compile_stateful_metric_functions.append(stateful_metric_fn)
-    return updated_metrics_dict
+    self._feed_outputs = []
+    self._feed_output_names = []
+    self._feed_output_shapes = []
+    self._feed_loss_fns = []
+    # if loss function is None, then this output will be skipped during total
+    # loss calculation and feed targets preparation.
+    skip_target_indices = []
+    skip_target_weighing_indices = []
+    for i, loss_function in enumerate(self.loss_functions):
+      if loss_function is None:
+        skip_target_indices.append(i)
+        skip_target_weighing_indices.append(i)
 
-  def _set_metric_attributes(self, outputs, skip_target_indices=None):
-    """Sets the metric attributes on the model for all the model outputs."""
-    skip_target_indices = skip_target_indices or []
-    updated_per_output_metrics = []
-    updated_per_output_weighted_metrics = []
-    for i in range(len(outputs)):
-      if i in skip_target_indices:
-        updated_per_output_metrics.append(self._per_output_metrics[i])
-        updated_per_output_weighted_metrics.append(
-            self._per_output_weighted_metrics[i])
-        continue
-      updated_per_output_metrics.append(
-          self._set_per_output_metric_attributes(self._per_output_metrics[i],
-                                                 i))
-      updated_per_output_weighted_metrics.append(
-          self._set_per_output_metric_attributes(
-              self._per_output_weighted_metrics[i], i))
+    # Prepare output masks.
+    if not self.run_eagerly:
+      masks = [getattr(x, '_keras_mask', None) for x in self.outputs]
 
-    self._per_output_metrics = updated_per_output_metrics
-    self._per_output_weighted_metrics = updated_per_output_weighted_metrics
+    # Prepare list loss weights, same size of model outputs.
+    self.loss_weights_list = training_utils.prepare_loss_weights(
+        self.output_names, loss_weights)
 
-  def _handle_per_output_metrics(self,
-                                 metrics_dict,
-                                 y_true,
-                                 y_pred,
-                                 mask,
-                                 weights=None,
-                                 return_stateful_result=True):
-    """Calls metric functions for a single output.
+    # Initialization for Eager mode execution.
+    if self.run_eagerly:
+      # Prepare sample weights.
+      self._set_sample_weight_attributes(sample_weight_mode,
+                                         skip_target_weighing_indices)
+      # Save all metric attributes per output of the model.
+      self._cache_output_metric_attributes(metrics, weighted_metrics)
 
-    Arguments:
-      metrics_dict: A dict with metric names as keys and metric fns as values.
-      y_true: Target output.
-      y_pred: Predicted output.
-      mask: Computed mask value for the current output.
-      weights: Weights to be applied on the current output.
-      return_stateful_result: Boolean, indicates whether the stateful
-        (aggregated)/stateless metric result should be returned.
+      if target_tensors is not None:
+        raise ValueError('target_tensors are not currently supported in Eager '
+                         'mode.')
+      self.total_loss = None
 
-    Returns:
-      A list of metric result tensors.
-    """
-    metric_results = []
-    for metric_name, (metric_fn, stateful_fn) in metrics_dict.items():
-      with K.name_scope(metric_name):
+      # Set metric attributes on model.
+      self._set_metric_attributes(skip_target_indices=skip_target_indices)
 
-        def _call_stateful_fn(fn):
-          return training_utils.call_metric_function(
-              fn, y_true, y_pred, weights=weights, mask=mask)
-
-        def _call_stateless_fn(fn):
-          weighted_metric_fn = training_utils.weighted_masked_objective(fn)
-          return weighted_metric_fn(y_true, y_pred, weights=weights, mask=mask)
-
-        def _track_metric_tensors(name, stateless_result, stateful_result):
-          self._compile_metrics_tensors[name] = stateless_result
-          self._compile_stateful_metrics_tensors[name] = stateful_result
-
-        if isinstance(metric_fn, metrics_module.Metric):
-          # If the given metric fn is stateful, call the fn and return result.
-          metric_result = _call_stateful_fn(metric_fn)
-          metric_results.append(metric_result)
-          if not self.run_eagerly:
-            _track_metric_tensors(metric_name, metric_result, metric_result)
-        elif self.run_eagerly:
-          # In eager mode, if the given metric fn is not stateful, we invoke the
-          # given fn or its stateful version based on the given flag.
-          if return_stateful_result:
-            metric_result = _call_stateful_fn(stateful_fn)
-          else:
-            metric_result = _call_stateless_fn(metric_fn)
-          metric_results.append(metric_result)
-        else:
-          # In graph mode, we build the sub-graph for both the stateful and the
-          # stateless fns.
-          stateful_metric_result = _call_stateful_fn(stateful_fn)
-          metric_result = _call_stateless_fn(metric_fn)
-          _track_metric_tensors(metric_name, metric_result,
-                                stateful_metric_result)
-
-    return metric_results
-
-  def _handle_metrics(self,
-                      outputs,
-                      skip_target_indices=None,
-                      targets=None,
-                      sample_weights=None,
-                      masks=None,
-                      return_stateful_result=True):
-    """Handles calling metric functions.
-
-    Arguments:
-      outputs: List of outputs (predictions).
-      skip_target_indices: Optional. List of target ids to skip.
-      targets: List of targets.
-      sample_weights: Optional list of sample weight arrays.
-      masks: List of computed output mask values.
-      return_stateful_result: Boolean, indicates whether the stateful
-        (aggregated)/stateless metric result should be returned.
-
-    Returns:
-      A list of metric result tensors.
-    """
-    skip_target_indices = skip_target_indices or []
-    metric_results = []
-    with K.name_scope('metrics'):
-      # Invoke all metrics added using `compile`.
-      for i in range(len(outputs)):
-        if i in skip_target_indices:
-          continue
-        output = outputs[i] if outputs else None
-        target = targets[i] if targets else None
-        output_mask = masks[i] if masks else None
-        metric_results.extend(
-            self._handle_per_output_metrics(
-                self._per_output_metrics[i],
-                target,
-                output,
-                output_mask,
-                return_stateful_result=return_stateful_result))
-        metric_results.extend(
-            self._handle_per_output_metrics(
-                self._per_output_weighted_metrics[i],
-                target,
-                output,
-                output_mask,
-                weights=sample_weights[i],
-                return_stateful_result=return_stateful_result))
-
-    # Add metric results from the `add_metric` metrics in eager mode.
-    if context.executing_eagerly():
-      for m in self.metrics:
-        if m not in self._compile_stateful_metric_functions:
-          metric_results.append(m.result())
-    return metric_results
-
-  @property
-  def run_eagerly(self):
-    """Settable attribute indicating whether the model should run eagerly.
-
-    Running eagerly means that your model will be run step by step,
-    like Python code. Your model might run slower, but it should become easier
-    for you to debug it by stepping into individual layer calls.
-
-    By default, we will attempt to compile your model to a static graph to
-    deliver the best execution performance.
-
-    Returns:
-      Boolean, whether the model should run eagerly.
-    """
-    if self._run_eagerly is True and not context.executing_eagerly():
-      raise ValueError('You can only set `run_eagerly=True` if eager execution '
-                       'is enabled.')
-    if self._static_graph_friendly:
-      if self._run_eagerly is None:
-        return False
-      else:
-        return self._run_eagerly
-    else:
-      if self._run_eagerly is False:
-        # TODO(fchollet): consider using py_func to enable this.
-        raise ValueError('Your model contains layers that can only be '
-                         'successfully run in eager execution. '
-                         'You cannot set `run_eagerly=False`.')
-      return context.executing_eagerly()
-
-  @run_eagerly.setter
-  def run_eagerly(self, value):
-    self._run_eagerly = value
-
-  @checkpointable.no_automatic_dependency_tracking
-  def compile(self,
-              optimizer,
-              loss=None,
-              metrics=None,
-              loss_weights=None,
-              sample_weight_mode=None,
-              weighted_metrics=None,
-              target_tensors=None,
-              distribute=None,
-              **kwargs):
-    """Configures the model for training.
-
-    Arguments:
-        optimizer: String (name of optimizer) or optimizer instance.
-            See [optimizers](/api_docs/python/tf/keras/optimizers).
-        loss: String (name of objective function) or objective function.
-            See [losses](/api_docs/python/tf/losses).
-            If the model has multiple outputs, you can use a different loss
-            on each output by passing a dictionary or a list of losses.
-            The loss value that will be minimized by the model
-            will then be the sum of all individual losses.
-        metrics: List of metrics to be evaluated by the model
-            during training and testing.
-            Typically you will use `metrics=['accuracy']`.
-            To specify different metrics for different outputs of a
-            multi-output model, you could also pass a dictionary,
-            such as `metrics={'output_a': 'accuracy'}`.
-        loss_weights: Optional list or dictionary specifying scalar
-            coefficients (Python floats) to weight the loss contributions
-            of different model outputs.
-            The loss value that will be minimized by the model
-            will then be the *weighted sum* of all individual losses,
-            weighted by the `loss_weights` coefficients.
-            If a list, it is expected to have a 1:1 mapping
-            to the model's outputs. If a tensor, it is expected to map
-            output names (strings) to scalar coefficients.
-        sample_weight_mode: If you need to do timestep-wise
-            sample weighting (2D weights), set this to `"temporal"`.
-            `None` defaults to sample-wise weights (1D).
-            If the model has multiple outputs, you can use a different
-            `sample_weight_mode` on each output by passing a
-            dictionary or a list of modes.
-        weighted_metrics: List of metrics to be evaluated and weighted
-            by sample_weight or class_weight during training and testing.
-        target_tensors: By default, Keras will create placeholders for the
-            model's target, which will be fed with the target data during
-            training. If instead you would like to use your own
-            target tensors (in turn, Keras will not expect external
-            Numpy data for these targets at training time), you
-            can specify them via the `target_tensors` argument. It can be
-            a single tensor (for a single-output model), a list of tensors,
-            or a dict mapping output names to target tensors.
-        distribute: The DistributionStrategy instance that we want to use to
-            distribute the training of the model.
-        **kwargs: These arguments are passed to `tf.Session.run`.
-
-    Raises:
-        ValueError: In case of invalid arguments for
-            `optimizer`, `loss`, `metrics` or `sample_weight_mode`.
-    """
-    run_eagerly = kwargs.pop('run_eagerly', None)
-    self._run_eagerly = run_eagerly
-
-    # Validate that arguments passed by the user to `compile` are supported by
-    # DistributionStrategy.
-    if distribute:
-      if not isinstance(
-          optimizer, (tf_optimizer_module.Optimizer, optimizers.TFOptimizer)):
-        raise NotImplementedError(
-            'optimizer must be an instance of '
-            'tf.train.Optimizer, not a %s' % type(optimizer))
-      if sample_weight_mode:
-        raise NotImplementedError('sample_weight_mode is not supported with '
-                                  'DistributionStrategy.')
-      if weighted_metrics:
-        raise NotImplementedError('weighted_metrics is not supported with '
-                                  'DistributionStrategy.')
-      if target_tensors:
-        raise ValueError('target_tensors is not supported with '
-                         'DistributionStrategy.')
-
-    loss = loss or {}
-    if self.run_eagerly and not isinstance(
-        optimizer, (tf_optimizer_module.Optimizer, optimizers.TFOptimizer)):
-      raise ValueError(
-          'When running a model in eager execution, the optimizer must be an '
-          'instance of tf.train.Optimizer. Received: '
-          '%s' % optimizer)
-
-    self.optimizer = optimizers.get(optimizer)
-    # We've disabled automatic dependency tracking for this method, but do want
-    # to add a checkpoint dependency on the optimizer if it's checkpointable.
-    if isinstance(self.optimizer, checkpointable.CheckpointableBase):
-      self._track_checkpointable(
-          self.optimizer, name='optimizer', overwrite=True)
-    self.loss = loss
-    self._compile_metrics = metrics or []
-    self.loss_weights = loss_weights
-    self.sample_weight_mode = sample_weight_mode
-    self._compile_weighted_metrics = weighted_metrics
-    if self.run_eagerly and target_tensors is not None:
-      raise ValueError(
-          'target_tensors argument is not supported when '
-          'running a model eagerly.')
-    self.target_tensors = target_tensors
-
-    # Set DistributionStrategy specific parameters.
-    self._distribution_strategy = distribute
-    # Reset the value of grouped_model
-    self._grouped_model = None
-    if self._distribution_strategy is not None:
-      distributed_training_utils.configure_and_create_session(
-          self._distribution_strategy)
-    # Initialize model metric attributes.
-    self._init_metric_attributes()
-    if not self.built:
-      # Model is not compilable because it does not know its number of inputs
-      # and outputs, nor their shapes and names. We will compile after the first
-      # time the model gets called on training data.
-      return
-    self._is_compiled = True
-
-    # Prepare loss functions.
-    if isinstance(loss, dict):
-      for name in loss:
-        if name not in self.output_names:
-          raise ValueError(
-              'Unknown entry in loss '
-              'dictionary: "' + name + '". '
-              'Only expected the following keys: ' + str(self.output_names))
-      loss_functions = []
-      for name in self.output_names:
-        if name not in loss:
-          logging.warning(
-              'Output "' + name +
-              '" missing from loss dictionary. We assume '
-              'this was done on purpose. The fit and evaluate APIs will not be '
-              'expecting any data to be passed to "' + name + '".')
-        loss_functions.append(training_utils.get_loss_function(loss.get(name)))
-    elif isinstance(loss, list):
-      if len(loss) != len(self.outputs):
-        raise ValueError('When passing a list as loss, '
-                         'it should have one entry per model outputs. '
-                         'The model has ' + str(len(self.outputs)) +
-                         ' outputs, but you passed loss=' + str(loss))
-      loss_functions = [training_utils.get_loss_function(l) for l in loss]
-    else:
-      loss_function = training_utils.get_loss_function(loss)
-      loss_functions = [loss_function for _ in range(len(self.outputs))]
-    self.loss_functions = loss_functions
-
-    skip_target_indices = []
-    skip_target_weighing_indices = []
-    self._feed_outputs = []
-    self._feed_output_names = []
-    self._feed_output_shapes = []
-    self._feed_loss_fns = []
-    for i in range(len(loss_functions)):
-      if loss_functions[i] is None:
-        skip_target_indices.append(i)
-        skip_target_weighing_indices.append(i)
-
-    # Prepare output masks.
-    if not self.run_eagerly:
-      masks = [getattr(x, '_keras_mask', None) for x in self.outputs]
-      if not isinstance(masks, list):
-        masks = [masks]
-
-    # Prepare loss weights.
-    if loss_weights is None:
-      loss_weights_list = [1. for _ in range(len(self.outputs))]
-    elif isinstance(loss_weights, dict):
-      for name in loss_weights:
-        if name not in self.output_names:
-          raise ValueError(
-              'Unknown entry in loss_weights '
-              'dictionary: "' + name + '". '
-              'Only expected the following keys: ' + str(self.output_names))
-      loss_weights_list = []
-      for name in self.output_names:
-        loss_weights_list.append(loss_weights.get(name, 1.))
-    elif isinstance(loss_weights, list):
-      if len(loss_weights) != len(self.outputs):
-        raise ValueError(
-            'When passing a list as loss_weights, '
-            'it should have one entry per model output. '
-            'The model has ' + str(len(self.outputs)) +
-            ' outputs, but you passed loss_weights=' + str(loss_weights))
-      loss_weights_list = loss_weights
-    else:
-      raise TypeError('Could not interpret loss_weights argument: ' +
-                      str(loss_weights) + ' - expected a list of dicts.')
-    self.loss_weights_list = loss_weights_list
-
-    # Initialization for Eager mode execution.
-    if self.run_eagerly:
-      # Prepare sample weights.
-      self._set_sample_weight_attributes(sample_weight_mode,
-                                         skip_target_weighing_indices)
-      # Save all metric attributes per output of the model.
-      self._cache_output_metric_attributes(metrics, weighted_metrics)
-
-      if target_tensors is not None:
-        raise ValueError('target_tensors are not currently supported in Eager '
-                         'mode.')
-      self.total_loss = None
-      for i in range(len(self.outputs)):
-        if len(self.outputs) > 1:
-          self._compile_metrics_names.append(self.output_names[i] + '_loss')
-
-      # Set metric attributes on model.
-      self._set_metric_attributes(
-          self.outputs,
-          skip_target_indices=skip_target_indices,
-      )
-
-      self.targets = []
-      for i in range(len(self.outputs)):
-        self._feed_output_names.append(self.output_names[i])
-      self._collected_trainable_weights = self.trainable_weights
-      return
+      self.targets = []
+      for i in range(len(self.outputs)):
+        self._feed_output_names.append(self.output_names[i])
+      self._collected_trainable_weights = self.trainable_weights
+      return
 
     with K.get_graph().as_default():
       # Prepare targets of model.
@@ -717,77 +388,9 @@ class Model(Network):
       # Save all metric attributes per output of the model.
       self._cache_output_metric_attributes(metrics, weighted_metrics)
 
-      # Compute total loss.
-      total_loss = None
-      with K.name_scope('loss'):
-        for i in range(len(self.outputs)):
-          if i in skip_target_indices:
-            continue
-          y_true = self.targets[i]
-          y_pred = self.outputs[i]
-          loss_fn = loss_functions[i]
-          sample_weight = self.sample_weights[i]
-          mask = masks[i]
-          loss_weight = loss_weights_list[i]
-          with K.name_scope(self.output_names[i] + '_loss'):
-            if isinstance(loss_fn, losses.Loss):
-              if mask is not None:
-                mask = math_ops.cast(mask, y_pred.dtype)
-                # Update weights with mask.
-                if sample_weight is None:
-                  sample_weight = mask
-                else:
-                  # Update dimensions of weights to match with mask if possible.
-                  mask, _, sample_weight = squeeze_or_expand_dimensions(
-                      mask, None, sample_weight)
-                  sample_weight *= mask
-              output_loss = loss_fn(y_true, y_pred, sample_weight=sample_weight)
-            else:
-              weighted_loss = training_utils.weighted_masked_objective(loss_fn)
-              output_loss = weighted_loss(y_true, y_pred, sample_weight, mask)
-
-          if len(self.outputs) > 1:
-            # Keep track of the un-aggregated loss result tensor.
-            self._compile_metrics_tensors[self.output_names[i] +
-                                          '_loss'] = output_loss
-
-            # Keep track of stateful result tensor and function for the loss.
-            loss_name = loss_fn.name if isinstance(
-                loss_fn, losses.Loss) else loss_fn.__name__
-            mean_wrapped_loss = metrics_module.MeanMetricWrapper(
-                loss_fn, name=loss_name)
-            result_tensor = training_utils.call_metric_function(
-                mean_wrapped_loss,
-                y_true,
-                y_pred,
-                weights=sample_weight,
-                mask=mask)
-            self._compile_stateful_metrics_tensors[self.output_names[i] +
-                                                   '_loss'] = result_tensor
-            self._compile_stateful_metric_functions.append(mean_wrapped_loss)
-
-            self._compile_metrics_names.append(self.output_names[i] + '_loss')
-          if total_loss is None:
-            total_loss = loss_weight * output_loss
-          else:
-            total_loss += loss_weight * output_loss
-        if total_loss is None:
-          if not self.losses:
-            raise ValueError('The model cannot be compiled '
-                             'because it has no loss to optimize.')
-          else:
-            total_loss = 0.
-
-        # Add regularization penalties
-        # and other layer-specific losses.
-        for loss_tensor in self.losses:
-          total_loss += loss_tensor
-
       # Set metric attributes on model.
-      self._set_metric_attributes(
-          self.outputs,
-          skip_target_indices=skip_target_indices,
-      )
+      self._set_metric_attributes(skip_target_indices=skip_target_indices)
+
       # Invoke metric functions for all the outputs.
       self._handle_metrics(
           self.outputs,
@@ -796,8 +399,12 @@ class Model(Network):
           skip_target_indices=skip_target_indices,
           sample_weights=self.sample_weights)
 
-      # Prepare gradient updates and state updates.
-      self.total_loss = total_loss
+      # Compute total loss.
+      # Used to keep track of the total loss value (stateless).
+      # eg., total_loss = loss_weight_1 * output_1_loss_fn(...) +
+      #                   loss_weight_2 * output_2_loss_fn(...) +
+      #                   layer losses.
+      self.total_loss = self._prepare_total_loss(skip_target_indices, masks)
 
       # Functions for train, test and predict will
       # be compiled lazily when required.
@@ -806,6 +413,7 @@ class Model(Network):
 
       self._fit_function = None
       self._eval_function = None
+      self._predict_function = None
       self.train_function = None
       self.test_function = None
       self.predict_function = None
@@ -814,1771 +422,2337 @@ class Model(Network):
       trainable_weights = self.trainable_weights
       self._collected_trainable_weights = trainable_weights
 
-  def _check_trainable_weights_consistency(self):
-    """Check trainable weights count consistency.
-
-    This will raise a warning if `trainable_weights` and
-    `_collected_trainable_weights` are inconsistent (i.e. have different
-    number of parameters).
-    Inconsistency will typically arise when one modifies `model.trainable`
-    without calling `model.compile` again.
-    """
-    if not hasattr(self, '_collected_trainable_weights'):
-      return
-
-    if len(self.trainable_weights) != len(self._collected_trainable_weights):
-      logging.log_first_n(
-          logging.WARN, 'Discrepancy between trainable weights and collected'
-          ' trainable weights, did you set `model.trainable`'
-          ' without calling `model.compile` after ?', 1)
-
-  def _make_train_function_helper(self, fn_name, outputs, metric_updates=None):
-    if not hasattr(self, fn_name):
-      raise RuntimeError('You must compile your model before using it.')
-    self._check_trainable_weights_consistency()
-    if getattr(self, fn_name) is None:
-      inputs = (self._feed_inputs +
-                self._feed_targets +
-                self._feed_sample_weights)
-      if not isinstance(K.symbolic_learning_phase(), int):
-        inputs += [K.symbolic_learning_phase()]
-
-      with K.get_graph().as_default():
-        with K.name_scope('training'):
-          with K.name_scope(self.optimizer.__class__.__name__):
-            # Training updates
-            updates = self.optimizer.get_updates(
-                params=self._collected_trainable_weights, loss=self.total_loss)
-      # Unconditional updates
-      updates += self.get_updates_for(None)
-      # Conditional updates relevant to this model
-      updates += self.get_updates_for(self.inputs)
-      # Add stateful metrics updates.
-      if metric_updates is not None:
-        updates += metric_updates
-
-      with K.name_scope('training'):
-        # Gets loss and metrics. Updates weights at each call.
-        fn = K.function(
-            inputs,
-            outputs,
-            updates=updates,
-            name='train_function',
-            **self._function_kwargs)
-        setattr(self, fn_name, fn)
-
-  def _make_train_function(self):
-    metrics_tensors = [
-        self._all_metrics_tensors[m] for m in self.metrics_names[1:]
-    ]
-    self._make_train_function_helper('train_function',
-                                     [self.total_loss] + metrics_tensors)
-
-  def _make_fit_function(self):
-    metrics_tensors = [
-        self._all_stateful_metrics_tensors[m] for m in self.metrics_names[1:]
-    ]
-    self._make_train_function_helper(
-        '_fit_function', [self.total_loss] + metrics_tensors)
-
-  def _make_test_function_helper(self, fn_name, outputs, metric_updates=None):
-    if not hasattr(self, fn_name):
-      raise RuntimeError('You must compile your model before using it.')
-    if getattr(self, fn_name) is None:
-      inputs = (self._feed_inputs +
-                self._feed_targets +
-                self._feed_sample_weights)
-
-      with K.name_scope('evaluation'):
-        updates = self.state_updates
-        # Add stateful metrics updates.
-        if metric_updates is not None:
-          updates += metric_updates
-        # Return loss and metrics, no gradient updates.
-        # Does update the network states.
-        fn = K.function(
-            inputs,
-            outputs,
-            updates=updates,
-            name='test_function',
-            **self._function_kwargs)
-        setattr(self, fn_name, fn)
-
-  def _make_test_function(self):
-    metrics_tensors = [
-        self._all_metrics_tensors[m] for m in self.metrics_names[1:]
-    ]
-    self._make_test_function_helper('test_function',
-                                    [self.total_loss] + metrics_tensors)
-
-  def _make_eval_function(self):
-    metrics_tensors = [
-        self._all_stateful_metrics_tensors[m] for m in self.metrics_names[1:]
-    ]
-    self._make_test_function_helper(
-        '_eval_function', [self.total_loss] + metrics_tensors)
+      # Validate all variables were correctly created in distribution scope.
+      if self._distribution_strategy and not self._compile_distribution:
+        for v in self.variables:
+          strategy = self._distribution_strategy
+          if not strategy.extended.variable_created_in_scope(v):
+            raise ValueError(
+                'Variable (%s) was not created in the distribution strategy '
+                'scope of (%s). It is most likely due to not all layers or '
+                'the model or optimizer being created outside the distribution '
+                'strategy scope. Try to make sure your code looks similar '
+                'to the following.\n'
+                'with strategy.scope():\n'
+                '  model=_create_model()\n'
+                '  model.compile(...)'% (v, strategy))
 
-  def _make_predict_function(self):
-    if not hasattr(self, 'predict_function'):
-      self.predict_function = None
-    if self.predict_function is None:
-      inputs = self._feed_inputs
-      # Gets network outputs. Does not update weights.
-      # Does update the network states.
-      kwargs = getattr(self, '_function_kwargs', {})
-      with K.name_scope('predict'):
-        self.predict_function = K.function(
-            inputs,
-            self.outputs,
-            updates=self.state_updates,
-            name='predict_function',
-            **kwargs)
+  @property
+  def metrics(self):
+    """Returns the model's metrics added using `compile`, `add_metric` APIs."""
+    metrics = []
+    if self._is_compiled:
+      metrics += self._compile_stateful_metric_functions
+    return metrics + super(Model, self).metrics
 
-  def _make_execution_function(self, mode):
-    if mode == 'train':
-      self._make_fit_function()
-      return self._fit_function
-    if mode == 'test':
-      self._make_eval_function()
-      return self._eval_function
-    if mode == 'predict':
-      self._make_predict_function()
-      return self.predict_function
+  @property
+  def metrics_names(self):
+    """Returns the model's display labels for all outputs."""
+    metrics_names = []
+    if self._is_compiled:
+      metrics_names += self._compile_metrics_names  # Includes names of losses.
 
-  def _get_iterator_get_next_tensors(self, iterator):
-    get_next_op = self._iterator_get_next.get(iterator, None)
-    if get_next_op is None:
-      get_next_op = iterator.get_next()
-      self._iterator_get_next[iterator] = get_next_op
-    return get_next_op
+    # Add metric names from layers.
+    for layer in self.layers:
+      metrics_names += [m.name for m in layer._metrics]  # pylint: disable=protected-access
+    metrics_names += [m.name for m in self._metrics]
+    return metrics_names
 
-  def _distribution_standardize_user_data(self,
-                                          x,
-                                          y=None,
-                                          sample_weight=None,
-                                          class_weight=None,
-                                          batch_size=None,
-                                          check_steps=False,
-                                          steps_name='steps',
-                                          steps=None,
-                                          validation_split=0,
-                                          shuffle=False):
-    """Runs validation checks on input and target data passed by the user.
+  @property
+  def run_eagerly(self):
+    """Settable attribute indicating whether the model should run eagerly.
 
-    This is called when using DistributionStrategy to train, evaluate or serve
-    the model.
+    Running eagerly means that your model will be run step by step,
+    like Python code. Your model might run slower, but it should become easier
+    for you to debug it by stepping into individual layer calls.
 
-    Args:
-      x: Input data. A numpy array or `tf.data` dataset.
-      y: Target data. A numpy array or None if x is a `tf.data` dataset.
-      sample_weight: An optional sample-weight array passed by the user to
-        weight the importance of each sample in `x`.
-      class_weight: An optional class-weight array by the user to
-        weight the importance of samples in `x` based on the class they belong
-        to, as conveyed by `y`.
-      batch_size: Integer batch size. If provided, it is used to run additional
-        validation checks on stateful models.
-      check_steps: boolean, True if we want to check for validity of `steps` and
-        False, otherwise.
-      steps_name: The public API's parameter name for `steps`.
-      steps: Integer or `None`. Total number of steps (batches of samples) to
-        execute.
-      validation_split: Float between 0 and 1.
-        Fraction of the training data to be used as validation data.
-      shuffle: Boolean whether to shuffle the training data before each epoch.
+    By default, we will attempt to compile your model to a static graph to
+    deliver the best execution performance.
 
     Returns:
-      Iterator for reading the dataset `x`.
-
-    Raises:
-      ValueError: In case of invalid user-provided data.
-      RuntimeError: If the model was never compiled.
+      Boolean, whether the model should run eagerly.
     """
-    if class_weight:
-      raise NotImplementedError('`class_weight` is currently not supported '
-                                'when using DistributionStrategy.')
-
-    if (sample_weight is not None and sample_weight.all() and
-        distributed_training_utils.is_tpu_strategy(
-            self._distribution_strategy)):
-      raise NotImplementedError('`sample_weight` is currently not supported '
-                                'when using TPUStrategy.')
-
-    # Validates `steps` argument right at the beginning since we use it to
-    # construct the dataset object.
-    # TODO(anjalisridhar): Remove this check once we refactor the
-    # _standardize_user_data code path. This check is already present elsewhere
-    # in the codebase.
-    if check_steps and isinstance(x, dataset_ops.DatasetV2) and steps is None:
-      raise ValueError('When using Datasets as input, '
-                       'you should specify the `{steps_name}` argument.'
-                       .format(steps_name=steps_name))
-
-    first_x_value = nest.flatten(x)[0]
-    if isinstance(first_x_value, np.ndarray):
-      # We need to use the drop_remainder argument to allow for a static
-      # input shape which is required for TPUs.
-      drop_remainder = self._distribution_strategy.require_static_shapes
-      if y is not None:
-        var_x = distributed_training_utils.get_var_for_numpy(
-            self._distribution_strategy, x)
-        var_y = distributed_training_utils.get_var_for_numpy(
-            self._distribution_strategy, y)
-        if sample_weight is not None:
-          var_sample_weights = distributed_training_utils.get_var_for_numpy(
-              self._distribution_strategy, sample_weight)
-
-          x = dataset_ops.Dataset.from_tensor_slices((var_x, var_y,
-                                                      var_sample_weights))
-        else:
-          x = dataset_ops.Dataset.from_tensor_slices((var_x, var_y))
-
-        x = dataset_ops.Dataset.from_tensor_slices((var_x, var_y))
-        if shuffle:
-          # 1024 is a good buffer size since it is much larger than the average
-          # batch size provided by the user and provides sufficient randomness.
-          # One thing to keep in mind is the memory usage based on the size of
-          # each sample.
-          x = x.shuffle(1024)
-        x = x.repeat()
-        x = x.batch(batch_size, drop_remainder=drop_remainder)
-        y = None
-        sample_weight = None
+    if self._run_eagerly is True and not context.executing_eagerly():
+      raise ValueError('You can only set `run_eagerly=True` if eager execution '
+                       'is enabled.')
+    if not self.dynamic:
+      if self._run_eagerly is None:
+        return False
       else:
-        # This case is for the predict call where the dataset only contains
-        # inputs and no targets, i.e. it does not return a tuple
-        var_x = distributed_training_utils.get_var_for_numpy(
-            self._distribution_strategy, x)
-        x = dataset_ops.Dataset.from_tensor_slices(var_x)
-        x = x.batch(batch_size, drop_remainder=drop_remainder)
-
-    assert isinstance(x, dataset_ops.DatasetV2)
-
-    with self._distribution_strategy.scope():
-      iterator = self._distribution_strategy.make_dataset_iterator(x)
-      init_op = iterator.initialize()
+        return self._run_eagerly
+    else:
       if not context.executing_eagerly():
-        K.get_session().run(init_op)
-
-    training_utils.validate_iterator_input(x, y, sample_weight,
-                                           validation_split)
-    return iterator
-
-  def _standardize_user_data(self,
-                             x,
-                             y=None,
-                             sample_weight=None,
-                             class_weight=None,
-                             batch_size=None,
-                             check_steps=False,
-                             steps_name='steps',
-                             steps=None,
-                             validation_split=0,
-                             shuffle=False):
-    """Runs validation checks on input and target data passed by the user.
-
-    Also standardizes the data to lists of arrays, in order.
+        raise ValueError('Your model contains layers that can only be '
+                         'successfully run in eager execution (layers '
+                         'constructed with `dynamic=True`). '
+                         'You must enable eager execution with '
+                         '`tf.enable_eager_execution()`.')
+      if self._run_eagerly is False:
+        # TODO(fchollet): consider using py_func to enable this.
+        raise ValueError('Your model contains layers that can only be '
+                         'successfully run in eager execution (layers '
+                         'constructed with `dynamic=True`). '
+                         'You cannot set `run_eagerly=False`.')
+      return context.executing_eagerly()
 
-    Also builds and compiles the model on the fly if it is a subclassed model
-    that has never been called before (and thus has no inputs/outputs).
+  @run_eagerly.setter
+  def run_eagerly(self, value):
+    self._run_eagerly = value
 
-    This is a purely internal method, subject to refactoring at any time.
+  def fit(self,
+          x=None,
+          y=None,
+          batch_size=None,
+          epochs=1,
+          verbose=1,
+          callbacks=None,
+          validation_split=0.,
+          validation_data=None,
+          shuffle=True,
+          class_weight=None,
+          sample_weight=None,
+          initial_epoch=0,
+          steps_per_epoch=None,
+          validation_steps=None,
+          validation_freq=1,
+          max_queue_size=10,
+          workers=1,
+          use_multiprocessing=False,
+          **kwargs):
+    """Trains the model for a fixed number of epochs (iterations on a dataset).
 
-    Args:
-      x: Input data. It could be:
-        - A Numpy array (or array-like), or a list of arrays
-          (in case the model has multiple inputs).
-        - A TensorFlow tensor, or a list of tensors
-          (in case the model has multiple inputs).
-        - A dict mapping input names to the corresponding array/tensors,
-          if the model has named inputs.
-        - A `tf.data` dataset or a dataset iterator.
-      y: Target data. Like the input data `x`,
-        it could be either Numpy array(s) or TensorFlow tensor(s).
-        It should be consistent with `x` (you cannot have Numpy inputs and
-        tensor targets, or inversely). If `x` is a dataset or a
-        dataset iterator, `y` should not be specified
-        (since targets will be obtained from the iterator).
-      sample_weight: An optional sample-weight array passed by the user to
-        weight the importance of each sample in `x`.
-      class_weight: An optional class-weight array by the user to
-        weight the importance of samples in `x` based on the class they belong
-        to, as conveyed by `y`.
-      batch_size: Integer batch size. If provided, it is used to run additional
-        validation checks on stateful models.
-      check_steps: boolean, True if we want to check for validity of `steps` and
-        False, otherwise. For example, when we are standardizing one batch of
-        data for train_on_batch/predict_on_batch/test_on_batch APIs, `steps`
-        value is not required and we should not check for its validity in these
-        cases.
-      steps_name: The public API's parameter name for `steps`.
-      steps: Integer or `None`. Total number of steps (batches of samples) to
-        execute.
-      validation_split: Float between 0 and 1.
-        Fraction of the training data to be used as validation data.
-      shuffle: Boolean whether to shuffle the training data before each epoch.
+    Arguments:
+        x: Input data. It could be:
+          - A Numpy array (or array-like), or a list of arrays
+            (in case the model has multiple inputs).
+          - A TensorFlow tensor, or a list of tensors
+            (in case the model has multiple inputs).
+          - A dict mapping input names to the corresponding array/tensors,
+            if the model has named inputs.
+          - A `tf.data` dataset or a dataset iterator. Should return a tuple
+            of either `(inputs, targets)` or
+            `(inputs, targets, sample_weights)`.
+          - A generator or `keras.utils.Sequence` returning `(inputs, targets)`
+            or `(inputs, targets, sample weights)`.
+        y: Target data. Like the input data `x`,
+          it could be either Numpy array(s) or TensorFlow tensor(s).
+          It should be consistent with `x` (you cannot have Numpy inputs and
+          tensor targets, or inversely). If `x` is a dataset, dataset
+          iterator, generator, or `keras.utils.Sequence` instance, `y` should
+          not be specified (since targets will be obtained from `x`).
+        batch_size: Integer or `None`.
+            Number of samples per gradient update.
+            If unspecified, `batch_size` will default to 32.
+            Do not specify the `batch_size` if your data is in the
+            form of symbolic tensors, dataset, dataset iterators,
+            generators, or `keras.utils.Sequence` instances (since they generate
+            batches).
+        epochs: Integer. Number of epochs to train the model.
+            An epoch is an iteration over the entire `x` and `y`
+            data provided.
+            Note that in conjunction with `initial_epoch`,
+            `epochs` is to be understood as "final epoch".
+            The model is not trained for a number of iterations
+            given by `epochs`, but merely until the epoch
+            of index `epochs` is reached.
+        verbose: Integer. 0, 1, or 2. Verbosity mode.
+            0 = silent, 1 = progress bar, 2 = one line per epoch.
+        callbacks: List of `keras.callbacks.Callback` instances.
+            List of callbacks to apply during training.
+            See `tf.keras.callbacks`.
+        validation_split: Float between 0 and 1.
+            Fraction of the training data to be used as validation data.
+            The model will set apart this fraction of the training data,
+            will not train on it, and will evaluate
+            the loss and any model metrics
+            on this data at the end of each epoch.
+            The validation data is selected from the last samples
+            in the `x` and `y` data provided, before shuffling. This argument is
+            not supported when `x` is a dataset, dataset iterator, generator or
+           `keras.utils.Sequence` instance.
+        validation_data: Data on which to evaluate
+            the loss and any model metrics at the end of each epoch.
+            The model will not be trained on this data.
+            `validation_data` will override `validation_split`.
+            `validation_data` could be:
+              - tuple `(x_val, y_val)` of Numpy arrays or tensors
+              - tuple `(x_val, y_val, val_sample_weights)` of Numpy arrays
+              - dataset or a dataset iterator
+            For the first two cases, `batch_size` must be provided.
+            For the last case, `validation_steps` must be provided.
+        shuffle: Boolean (whether to shuffle the training data
+            before each epoch) or str (for 'batch').
+            'batch' is a special option for dealing with the
+            limitations of HDF5 data; it shuffles in batch-sized chunks.
+            Has no effect when `steps_per_epoch` is not `None`.
+        class_weight: Optional dictionary mapping class indices (integers)
+            to a weight (float) value, used for weighting the loss function
+            (during training only).
+            This can be useful to tell the model to
+            "pay more attention" to samples from
+            an under-represented class.
+        sample_weight: Optional Numpy array of weights for
+            the training samples, used for weighting the loss function
+            (during training only). You can either pass a flat (1D)
+            Numpy array with the same length as the input samples
+            (1:1 mapping between weights and samples),
+            or in the case of temporal data,
+            you can pass a 2D array with shape
+            `(samples, sequence_length)`,
+            to apply a different weight to every timestep of every sample.
+            In this case you should make sure to specify
+            `sample_weight_mode="temporal"` in `compile()`. This argument is not
+            supported when `x` is a dataset, dataset iterator, generator, or
+           `keras.utils.Sequence` instance, instead provide the sample_weights
+            as the third element of `x`.
+        initial_epoch: Integer.
+            Epoch at which to start training
+            (useful for resuming a previous training run).
+        steps_per_epoch: Integer or `None`.
+            Total number of steps (batches of samples)
+            before declaring one epoch finished and starting the
+            next epoch. When training with input tensors such as
+            TensorFlow data tensors, the default `None` is equal to
+            the number of samples in your dataset divided by
+            the batch size, or 1 if that cannot be determined.
+        validation_steps: Only relevant if `validation_data` is provided and
+            is a dataset or dataset iterator. Total number of steps (batches of
+            samples) to draw before stopping when performing validation
+            at the end of every epoch.
+        validation_freq: Only relevant if validation data is provided. Integer
+            or `collections.Container` instance (e.g. list, tuple, etc.). If an
+            integer, specifies how many training epochs to run before a new
+            validation run is performed, e.g. `validation_freq=2` runs
+            validation every 2 epochs. If a Container, specifies the epochs on
+            which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
+            validation at the end of the 1st, 2nd, and 10th epochs.
+        max_queue_size: Integer. Used for generator or `keras.utils.Sequence`
+            input only. Maximum size for the generator queue.
+            If unspecified, `max_queue_size` will default to 10.
+        workers: Integer. Used for generator or `keras.utils.Sequence` input
+            only. Maximum number of processes to spin up
+            when using process-based threading. If unspecified, `workers`
+            will default to 1. If 0, will execute the generator on the main
+            thread.
+        use_multiprocessing: Boolean. Used for generator or
+            `keras.utils.Sequence` input only. If `True`, use process-based
+            threading. If unspecified, `use_multiprocessing` will default to
+            `False`. Note that because this implementation relies on
+            multiprocessing, you should not pass non-picklable arguments to
+            the generator as they can't be passed easily to children processes.
+        **kwargs: Used for backwards compatibility.
 
     Returns:
-      A tuple of 3: inputs (arrays or dicts, depending on whether `x` was a dict
-      or not), target arrays, sample-weight arrays.
-      If the model's input and targets are symbolic, these lists are empty
-      (since the model takes no user-provided data, instead the data comes
-      from the symbolic inputs/targets).
+        A `History` object. Its `History.history` attribute is
+        a record of training loss values and metrics values
+        at successive epochs, as well as validation loss values
+        and validation metrics values (if applicable).
 
     Raises:
-      ValueError: In case of invalid user-provided data.
-      RuntimeError: If the model was never compiled.
+        RuntimeError: If the model was never compiled.
+        ValueError: In case of mismatch between the provided input data
+            and what the model expects.
     """
+    # Legacy support
+    if 'nb_epoch' in kwargs:
+      logging.warning(
+          'The `nb_epoch` argument in `fit` '
+          'has been renamed `epochs`.')
+      epochs = kwargs.pop('nb_epoch')
+    if kwargs:
+      raise TypeError('Unrecognized keyword arguments: ' + str(kwargs))
+
+    # When the model expects dictionary inputs (i.e. FeatureColumn-based
+    # models), set run_eagerly to True as there's no support for graph
+    # functions.
+    training_utils.set_run_eagerly_for_dict_structure(self, x)
+
+    # Case 1: distribution strategy.
     if self._distribution_strategy:
-      iterator = self._distribution_standardize_user_data(
+      if K.in_multi_worker_mode():
+        # Multi-Worker mode runs the Keras training loop on multiple
+        # servers via the Distribute Coordinator.
+        def _worker_fn(_):
+          """Run training inside the distributed coordinator."""
+          filtered_callbacks = distributed_training_utils \
+              .filter_distributed_callbacks(callbacks)
+          return training_distributed.fit_distributed(
+              self,
+              x=x,
+              y=y,
+              batch_size=batch_size,
+              epochs=epochs,
+              verbose=verbose,
+              callbacks=filtered_callbacks,
+              validation_split=validation_split,
+              validation_data=validation_data,
+              shuffle=shuffle,
+              class_weight=class_weight,
+              sample_weight=sample_weight,
+              initial_epoch=initial_epoch,
+              steps_per_epoch=steps_per_epoch,
+              validation_steps=validation_steps,
+              validation_freq=validation_freq)
+
+        # Independent worker only for now.
+        return dc.run_distribute_coordinator(
+            _worker_fn,
+            self._distribution_strategy,
+            mode=dc.CoordinatorMode.INDEPENDENT_WORKER)
+      else:
+        return training_distributed.fit_distributed(
+            self,
+            x=x,
+            y=y,
+            batch_size=batch_size,
+            epochs=epochs,
+            verbose=verbose,
+            callbacks=callbacks,
+            validation_split=validation_split,
+            validation_data=validation_data,
+            shuffle=shuffle,
+            class_weight=class_weight,
+            sample_weight=sample_weight,
+            initial_epoch=initial_epoch,
+            steps_per_epoch=steps_per_epoch,
+            validation_steps=validation_steps,
+            validation_freq=validation_freq)
+
+    batch_size = self._validate_or_infer_batch_size(
+        batch_size, steps_per_epoch, x)
+
+    # Case 2: generator-like. Input is Python generator, or Sequence object,
+    # or a non-distributed Dataset or iterator in eager execution.
+    if data_utils.is_generator_or_sequence(x):
+      training_utils.check_generator_arguments(
+          y, sample_weight, validation_split=validation_split)
+      return self.fit_generator(
           x,
-          y,
-          sample_weight=sample_weight,
+          steps_per_epoch=steps_per_epoch,
+          epochs=epochs,
+          verbose=verbose,
+          callbacks=callbacks,
+          validation_data=validation_data,
+          validation_steps=validation_steps,
+          validation_freq=validation_freq,
           class_weight=class_weight,
-          batch_size=batch_size,
-          check_steps=check_steps,
-          steps_name=steps_name,
-          steps=steps,
-          validation_split=validation_split,
-          shuffle=shuffle)
-      return iterator, None, None
+          max_queue_size=max_queue_size,
+          workers=workers,
+          use_multiprocessing=use_multiprocessing,
+          shuffle=shuffle,
+          initial_epoch=initial_epoch)
+    if training_utils.is_eager_dataset_or_iterator(x):
+      # Make sure that y, sample_weights, validation_split are not passed.
+      training_utils.validate_dataset_input(x, y, sample_weight,
+                                            validation_split)
+      if (isinstance(x, (dataset_ops.DatasetV1, dataset_ops.DatasetV2))
+          and shuffle):
+        training_utils.verify_dataset_shuffled(x)
 
-    if isinstance(x, dataset_ops.DatasetV2):
-      if context.executing_eagerly():
-        x = iter(x)
-      else:
-        if x in self._dataset_iterator_cache:
-          x = self._dataset_iterator_cache[x]
-        else:
-          iterator = dataset_ops.make_initializable_iterator(x)
-          self._dataset_iterator_cache[x] = iterator
-          x = iterator
-        K.get_session().run(x.initializer)
+      return self.fit_generator(
+          x,
+          steps_per_epoch=steps_per_epoch,
+          epochs=epochs,
+          verbose=verbose,
+          callbacks=callbacks,
+          validation_data=validation_data,
+          validation_steps=validation_steps,
+          validation_freq=validation_freq,
+          class_weight=class_weight,
+          workers=0,
+          shuffle=shuffle,
+          initial_epoch=initial_epoch)
 
-    # Validates `steps` argument based on x's type.
-    if check_steps:
-      training_utils.check_steps_argument(x, steps, steps_name)
+    # Case 3: Symbolic tensors or Numpy array-like.
+    # This includes Datasets and iterators in graph mode (since they
+    # generate symbolic tensors).
+    x, y, sample_weights = self._standardize_user_data(
+        x,
+        y,
+        sample_weight=sample_weight,
+        class_weight=class_weight,
+        batch_size=batch_size,
+        check_steps=True,
+        steps_name='steps_per_epoch',
+        steps=steps_per_epoch,
+        validation_split=validation_split,
+        shuffle=shuffle)
 
-    is_x_eager_iterator = isinstance(x, iterator_ops.EagerIterator)
-    is_x_iterator = isinstance(x, iterator_ops.Iterator)
-
-    # Validate user inputs when data is given as a dataset or dataset iterator.
-    if is_x_iterator or is_x_eager_iterator:
-      training_utils.validate_iterator_input(x, y, sample_weight,
-                                             validation_split)
-
-    # For eager iterators, when we have to process multiple batches of samples,
-    # we will standardize the data when we actually loop over iterator and get
-    # the batches. For now, we just return the iterator as is.
-    if is_x_eager_iterator:
-      return x, y, sample_weight
-
-    # If input data is a dataset iterator in graph mode or if it is an eager
-    # iterator and only one batch of samples is required, we fetch the data
-    # tensors from the iterator and then standardize them.
-    if is_x_iterator or is_x_eager_iterator:
-      try:
-        if is_x_iterator:
-          next_element = self._get_iterator_get_next_tensors(x)
-        else:
-          next_element = x.get_next()
-      except errors.OutOfRangeError:
-        raise RuntimeError('Your dataset iterator ran out of data; '
-                           'Make sure that your dataset can generate '
-                           'required number of samples.')
-
-      if isinstance(next_element, (list, tuple)):
-        if len(next_element) not in [2, 3]:
-          raise ValueError(
-              'Please provide model inputs as a list or tuple of 2  or 3'
-              'elements: (input, target) or (input, target, sample_weights)'
-              'Received %s' % next_element)
-        if len(next_element) == 2:
-          x, y = next_element
-        else:
-          x, y, sample_weight = next_element
+    # Prepare validation data.
+    if validation_data:
+      val_x, val_y, val_sample_weights = self._unpack_validation_data(
+          validation_data)
+      val_x, val_y, val_sample_weights = self._standardize_user_data(
+          val_x,
+          val_y,
+          sample_weight=val_sample_weights,
+          batch_size=batch_size,
+          steps=validation_steps,
+          steps_name='validation_steps')
+    elif validation_split and 0. < validation_split < 1.:
+      if training_utils.has_symbolic_tensors(x):
+        raise ValueError('If your data is in the form of symbolic tensors, '
+                         'you cannot use `validation_split`.')
+      if hasattr(x[0], 'shape'):
+        split_at = int(x[0].shape[0] * (1. - validation_split))
       else:
-        x = next_element
-    x, y, sample_weights = self._standardize_weights(
-        x, y, sample_weight, class_weight, batch_size, is_x_iterator)
-    return x, y, sample_weights
-
-  def _standardize_weights(self,
-                           x,
-                           y,
-                           sample_weight=None,
-                           class_weight=None,
-                           batch_size=None,
-                           from_iterator=False):
-    """Standardize input data, target data, and weight values.
-
-    This method reformats all data passed to the model to an ordered list of
-    array/tensors, matching the order expected by the model. This also validates
-    the input and target data shapes.
-
-    Args:
-      x: Input data. It could be:
-        - A Numpy array (or array-like), or a list of arrays
-          (in case the model has multiple inputs).
-        - A TensorFlow tensor, or a list of tensors
-          (in case the model has multiple inputs).
-        - A dict mapping input names to the corresponding array/tensors,
-          if the model has named inputs.
-        x cannot not be an iterator.
-      y: Target data. Like the input data `x`,
-        it could be either Numpy array(s) or TensorFlow tensor(s).
-        It should be consistent with `x` (you cannot have Numpy inputs and
-        tensor targets, or inversely).
-      sample_weight: An optional sample-weight array passed by the user to
-        weight the importance of each sample in `x`.
-      class_weight: An optional class-weight array by the user to
-        weight the importance of samples in `x` based on the class they belong
-        to, as conveyed by `y`.
-      batch_size: Integer batch size. If provided, it is used to run additional
-        validation checks on stateful models.
-      from_iterator: Whether x and y were obtained from an iterator.
-
-    Returns:
-      Tuple of standardized data that will be fed to the model:
-        (input data, target data, sample weights)
+        split_at = int(len(x[0]) * (1. - validation_split))
+      x, val_x = (slice_arrays(x, 0, split_at), slice_arrays(x, split_at))
+      y, val_y = (slice_arrays(y, 0, split_at), slice_arrays(y, split_at))
+      sample_weights, val_sample_weights = (slice_arrays(
+          sample_weights, 0, split_at), slice_arrays(sample_weights, split_at))
+    elif validation_steps:
+      val_x = []
+      val_y = []
+      val_sample_weights = []
+    else:
+      val_x = None
+      val_y = None
+      val_sample_weights = None
 
-    Raises:
-      RuntimeError: If target data is provided, but the model has not yet been
-        compiled.
-      ValueError: If the input data, target data, and batch size have invalid
-        shapes or formats (e.g. the model expects input to be a list of three
-        tensors, but x is a list with two tensors). Error is also raised if the
-        input and target data are not both arrays or tensors.
-    """
-    # TODO(sourabhbajaj): Split input validation from weight standardization.
-    if sample_weight is not None and class_weight is not None:
-      logging.warning(
-          'Received both a `sample_weight` and `class_weight` argument. '
-          'The `class_weight` argument will be ignored.')
-    # First, we build/compile the model on the fly if necessary.
-    all_inputs = []
-    is_build_called = False
-    is_compile_called = False
-    # Whether this is a subclassed model that expects dictionary inputs
-    # rather than list inputs (e.g. FeatureColumn-based models).
-    dict_inputs = False
-    if not self.inputs:
-      # We need to use `x` to set the model inputs.
-      # We type-check that `x` and `y` are either single arrays
-      # or lists of arrays.
-      if isinstance(x, (list, tuple)):
-        if not all(isinstance(v, np.ndarray) or
-                   tensor_util.is_tensor(v) for v in x):
-          raise ValueError('Please provide as model inputs either a single '
-                           'array or a list of arrays. You passed: x=' + str(x))
-        all_inputs += list(x)
-      elif isinstance(x, dict):
-        dict_inputs = True
-        keys = sorted(x.keys())
-        all_inputs = [x[k] for k in keys]
-      else:
-        if not isinstance(x, np.ndarray) and not tensor_util.is_tensor(x):
-          raise ValueError('Please provide as model inputs either a single '
-                           'array or a list of arrays. You passed: x=' + str(x))
-        all_inputs.append(x)
-
-      # Build the model using the retrieved inputs (value or symbolic).
-      # If values or generated from a dataset, then in symbolic-mode
-      # placeholders will be created to match the value shapes.
-      if not self.inputs:
-        is_build_called = True
-        if from_iterator:
-          cast_inputs = nest.map_structure(lambda v: v.shape, x)
-        elif training_utils.has_tensors(x):
-          cast_inputs = training_utils.cast_if_floating_dtype(x)
-        else:
-          cast_inputs = x
-        self._set_inputs(cast_inputs)
+    if self.run_eagerly:
+      return training_generator.fit_generator(
+          self, (x, y, sample_weights),
+          steps_per_epoch=steps_per_epoch,
+          batch_size=batch_size,
+          epochs=epochs,
+          verbose=verbose,
+          callbacks=callbacks,
+          validation_data=validation_data,
+          validation_steps=validation_steps,
+          validation_freq=validation_freq,
+          workers=0,
+          shuffle=shuffle,
+          initial_epoch=initial_epoch,
+          steps_name='steps_per_epoch')
     else:
-      dict_inputs = isinstance(self.inputs, dict)
-    if dict_inputs and context.executing_eagerly():
-      # No support for graph functions when the model expects dictionary inputs
-      # (i.e. FeatureColumn-based models).
-      self.run_eagerly = True
+      return training_arrays.fit_loop(
+          self,
+          x,
+          y,
+          sample_weights=sample_weights,
+          batch_size=batch_size,
+          epochs=epochs,
+          verbose=verbose,
+          callbacks=callbacks,
+          val_inputs=val_x,
+          val_targets=val_y,
+          val_sample_weights=val_sample_weights,
+          shuffle=shuffle,
+          initial_epoch=initial_epoch,
+          steps_per_epoch=steps_per_epoch,
+          validation_steps=validation_steps,
+          validation_freq=validation_freq,
+          steps_name='steps_per_epoch')
 
-    if y is not None:
-      if not self.optimizer:
-        raise RuntimeError('You must compile a model before '
-                           'training/testing. '
-                           'Use `model.compile(optimizer, loss)`.')
-      if not self._is_compiled:
-        # On-the-fly compilation of the model.
-        # We need to use `y` to set the model targets.
-        if training_utils.has_tensors(y):
-          y = training_utils.cast_if_floating_dtype(y)
-        if isinstance(y, (list, tuple)):
-          if not all(isinstance(v, np.ndarray) or
-                     tensor_util.is_tensor(v) for v in y):
-            raise ValueError('Please provide as model targets either a single '
-                             'array or a list of arrays. '
-                             'You passed: y=' + str(y))
-          all_inputs += list(y)
-        elif isinstance(y, dict):
-          raise ValueError('Please do not pass a dictionary as model targets.')
-        else:
-          if not isinstance(y, np.ndarray) and not tensor_util.is_tensor(y):
-            raise ValueError('Please provide as model targets either a single '
-                             'array or a list of arrays. '
-                             'You passed: y=' + str(y))
-          all_inputs.append(y)
+  def evaluate(self,
+               x=None,
+               y=None,
+               batch_size=None,
+               verbose=1,
+               sample_weight=None,
+               steps=None,
+               callbacks=None,
+               max_queue_size=10,
+               workers=1,
+               use_multiprocessing=False):
+    """Returns the loss value & metrics values for the model in test mode.
 
-        # Typecheck that all inputs are *either* value *or* symbolic.
-        # TODO(fchollet): this check could be removed in Eager mode?
-        if any(tensor_util.is_tensor(v) for v in all_inputs):
-          if not all(tensor_util.is_tensor(v) for v in all_inputs):
-            raise ValueError('Do not pass inputs that mix Numpy arrays and '
-                             'TensorFlow tensors. '
-                             'You passed: x=' + str(x) + '; y=' + str(y))
+    Computation is done in batches.
 
-        if self.run_eagerly or from_iterator:
-          target_tensors = None
-        else:
-          # Handle target tensors if any passed.
-          if not isinstance(y, (list, tuple)):
-            y = [y]
-          target_tensors = [v for v in y if _is_symbolic_tensor(v)]
-        is_compile_called = True
-        self.compile(
-            optimizer=self.optimizer,
-            loss=self.loss,
-            metrics=self._compile_metrics,
-            weighted_metrics=self._compile_weighted_metrics,
-            loss_weights=self.loss_weights,
-            target_tensors=target_tensors,
-            run_eagerly=self.run_eagerly)
+    Arguments:
+        x: Input data. It could be:
+          - A Numpy array (or array-like), or a list of arrays
+            (in case the model has multiple inputs).
+          - A TensorFlow tensor, or a list of tensors
+            (in case the model has multiple inputs).
+          - A dict mapping input names to the corresponding array/tensors,
+            if the model has named inputs.
+          - A `tf.data` dataset or a dataset iterator.
+          - A generator or `keras.utils.Sequence` instance.
+        y: Target data. Like the input data `x`,
+          it could be either Numpy array(s) or TensorFlow tensor(s).
+          It should be consistent with `x` (you cannot have Numpy inputs and
+          tensor targets, or inversely).
+          If `x` is a dataset, dataset iterator, generator or
+          `keras.utils.Sequence` instance, `y` should not be specified (since
+          targets will be obtained from the iterator/dataset).
+        batch_size: Integer or `None`.
+            Number of samples per gradient update.
+            If unspecified, `batch_size` will default to 32.
+            Do not specify the `batch_size` is your data is in the
+            form of symbolic tensors, dataset, dataset iterators,
+            generators, or `keras.utils.Sequence` instances (since they generate
+            batches).
+        verbose: 0 or 1. Verbosity mode.
+            0 = silent, 1 = progress bar.
+        sample_weight: Optional Numpy array of weights for
+            the test samples, used for weighting the loss function.
+            You can either pass a flat (1D)
+            Numpy array with the same length as the input samples
+            (1:1 mapping between weights and samples),
+            or in the case of temporal data,
+            you can pass a 2D array with shape
+            `(samples, sequence_length)`,
+            to apply a different weight to every timestep of every sample.
+            In this case you should make sure to specify
+            `sample_weight_mode="temporal"` in `compile()`. This argument is not
+            supported when `x` is a dataset or a dataset iterator, instead pass
+            sample weights as the third element of `x`.
+        steps: Integer or `None`.
+            Total number of steps (batches of samples)
+            before declaring the evaluation round finished.
+            Ignored with the default value of `None`.
+        callbacks: List of `keras.callbacks.Callback` instances.
+            List of callbacks to apply during evaluation.
+            See [callbacks](/api_docs/python/tf/keras/callbacks).
+        max_queue_size: Integer. Used for generator or `keras.utils.Sequence`
+            input only. Maximum size for the generator queue.
+            If unspecified, `max_queue_size` will default to 10.
+        workers: Integer. Used for generator or `keras.utils.Sequence` input
+            only. Maximum number of processes to spin up when using
+            process-based threading. If unspecified, `workers` will default
+            to 1. If 0, will execute the generator on the main thread.
+        use_multiprocessing: Boolean. Used for generator or
+            `keras.utils.Sequence` input only. If `True`, use process-based
+            threading. If unspecified, `use_multiprocessing` will default to
+            `False`. Note that because this implementation relies on
+            multiprocessing, you should not pass non-picklable arguments to
+            the generator as they can't be passed easily to children processes.
 
-    # In graph mode, if we had just set inputs and targets as symbolic tensors
-    # by invoking build and compile on the model respectively, we do not have to
-    # feed anything to the model. Model already has input and target data as
-    # part of the graph.
-    # Note: in this case, `any` and `all` are equivalent since we disallow
-    # mixed symbolic/value inputs.
-    if (not self.run_eagerly and is_build_called and is_compile_called and
-        not from_iterator and any(_is_symbolic_tensor(v) for v in all_inputs)):
-      return [], [], []
+    Returns:
+        Scalar test loss (if the model has a single output and no metrics)
+        or list of scalars (if the model has multiple outputs
+        and/or metrics). The attribute `model.metrics_names` will give you
+        the display labels for the scalar outputs.
 
-    # What follows is input validation and standardization to list format,
-    # in the case where all inputs are value arrays.
+    Raises:
+        ValueError: in case of invalid arguments.
+    """
+    # Case 1: distribution strategy.
+    if self._distribution_strategy:
+      if K.in_multi_worker_mode():
+        # Multi-Worker mode runs the Keras evaluation loop on multiple
+        # servers via the Distribute Coordinator.
+        def _worker_fn(_):
+          """Run evaluation inside the distributed coordinator."""
+          filtered_callbacks = distributed_training_utils \
+              .filter_distributed_callbacks(callbacks)
+          return training_distributed.evaluate_distributed(
+              self,
+              x=x,
+              y=y,
+              batch_size=batch_size,
+              verbose=verbose,
+              sample_weight=sample_weight,
+              steps=steps,
+              callbacks=filtered_callbacks)
+
+        # Independent worker only for now.
+        return dc.run_distribute_coordinator(
+            _worker_fn,
+            self._distribution_strategy,
+            mode=dc.CoordinatorMode.INDEPENDENT_WORKER)
+      else:
+        return training_distributed.evaluate_distributed(
+            self,
+            x=x,
+            y=y,
+            batch_size=batch_size,
+            verbose=verbose,
+            sample_weight=sample_weight,
+            steps=steps,
+            callbacks=callbacks)
 
-    if self.run_eagerly:
-      # In eager mode, do not do shape validation
-      # since the network has no input nodes (placeholders) to be fed.
-      feed_input_names = self.input_names
-      feed_input_shapes = None
-    elif not self._is_graph_network:
-      # Case: symbolic-mode subclassed network. Do not do shape validation.
-      feed_input_names = self._feed_input_names
-      feed_input_shapes = None
-    else:
-      # Case: symbolic-mode graph network.
-      # In this case, we run extensive shape validation checks.
-      feed_input_names = self._feed_input_names
-      feed_input_shapes = self._feed_input_shapes
+    batch_size = self._validate_or_infer_batch_size(batch_size, steps, x)
 
-    # Standardize the inputs.
-    x = training_utils.standardize_input_data(
+    # Case 2: generator-like. Input is Python generator, or Sequence object,
+    # or a non-distributed Dataset or iterator in eager execution.
+    if data_utils.is_generator_or_sequence(x):
+      training_utils.check_generator_arguments(y, sample_weight)
+      return self.evaluate_generator(
+          x,
+          steps=steps,
+          verbose=verbose,
+          callbacks=callbacks,
+          max_queue_size=max_queue_size,
+          workers=workers,
+          use_multiprocessing=use_multiprocessing)
+    if training_utils.is_eager_dataset_or_iterator(x):
+      # Make sure that y, sample_weights are not passed.
+      training_utils.validate_dataset_input(x, y, sample_weight)
+      return training_generator.evaluate_generator(
+          self, x,
+          steps=steps,
+          batch_size=batch_size,
+          verbose=verbose,
+          workers=0,
+          callbacks=callbacks)
+
+    # Case 3: Symbolic tensors or Numpy array-like.
+    # This includes Datasets and iterators in graph mode (since they
+    # generate symbolic tensors).
+    x, y, sample_weights = self._standardize_user_data(
         x,
-        feed_input_names,
-        feed_input_shapes,
-        check_batch_axis=False,  # Don't enforce the batch size.
-        exception_prefix='input')
+        y,
+        sample_weight=sample_weight,
+        batch_size=batch_size,
+        check_steps=True,
+        steps_name='steps',
+        steps=steps)
 
-    if y is not None:
-      if not self._is_graph_network:
-        feed_output_names = self._feed_output_names
-        feed_output_shapes = None
-        # Sample weighting not supported in this case.
-        # TODO(fchollet): consider supporting it.
-        feed_sample_weight_modes = [None for _ in self.outputs]
-      else:
-        feed_output_names = self._feed_output_names
-        feed_sample_weight_modes = self._feed_sample_weight_modes
-        feed_output_shapes = []
-        for output_shape, loss_fn in zip(self._feed_output_shapes,
-                                         self._feed_loss_fns):
-          if loss_fn is losses.sparse_categorical_crossentropy:
-            if K.image_data_format() == 'channels_first':
-              feed_output_shapes.append(
-                  (output_shape[0], 1) + output_shape[2:])
-            else:
-              feed_output_shapes.append(output_shape[:-1] + (1,))
-          elif (not hasattr(loss_fn, '__name__') or
-                getattr(losses, loss_fn.__name__, None) is None):
-            # If `loss_fn` is not a function (e.g. callable class)
-            # or if it not in the `losses` module, then
-            # it is a user-defined loss and we make no assumptions
-            # about it.
-            feed_output_shapes.append(None)
-          else:
-            feed_output_shapes.append(output_shape)
+    if self.run_eagerly:
+      return training_generator.evaluate_generator(
+          self, (x, y, sample_weights),
+          steps=steps,
+          batch_size=batch_size,
+          verbose=verbose,
+          workers=0,
+          callbacks=callbacks)
+    else:
+      return training_arrays.test_loop(
+          self,
+          inputs=x,
+          targets=y,
+          sample_weights=sample_weights,
+          batch_size=batch_size,
+          verbose=verbose,
+          steps=steps,
+          callbacks=callbacks)
 
-      # Standardize the outputs.
-      y = training_utils.standardize_input_data(
-          y,
-          feed_output_names,
-          # Don't enforce target shapes to match output shapes.
-          # Precise checks will be run in `check_loss_and_target_compatibility`.
-          shapes=None,
-          check_batch_axis=False,  # Don't enforce the batch size.
-          exception_prefix='target')
-
-      # Generate sample-wise weight values given the `sample_weight` and
-      # `class_weight` arguments.
-      sample_weights = training_utils.standardize_sample_weights(
-          sample_weight, feed_output_names)
-      class_weights = training_utils.standardize_class_weights(
-          class_weight, feed_output_names)
-      sample_weights = [
-          training_utils.standardize_weights(ref, sw, cw, mode)
-          for (ref, sw, cw, mode) in zip(y, sample_weights, class_weights,
-                                         feed_sample_weight_modes)
-      ]
-      # Check that all arrays have the same length.
-      if not self._distribution_strategy:
-        training_utils.check_array_lengths(x, y, sample_weights)
-        if self._is_graph_network and not self.run_eagerly:
-          # Additional checks to avoid users mistakenly using improper loss fns.
-          training_utils.check_loss_and_target_compatibility(
-              y, self._feed_loss_fns, feed_output_shapes)
-    else:
-      y = []
-      sample_weights = []
-
-    if self.stateful and batch_size:
-      # Check that for stateful networks, number of samples is a multiple
-      # of the static batch size.
-      if x[0].shape[0] % batch_size != 0:
-        raise ValueError('In a stateful network, '
-                         'you should only pass inputs with '
-                         'a number of samples that can be '
-                         'divided by the batch size. Found: ' +
-                         str(x[0].shape[0]) + ' samples')
-
-    # If dictionary inputs were provided, we return a dictionary as well.
-    if dict_inputs:
-      x = dict(zip(feed_input_names, x))
-    return x, y, sample_weights
-
-  @checkpointable.no_automatic_dependency_tracking
-  def _set_inputs(self, inputs, outputs=None, training=None):
-    """Set model's input and output specs based on the input data received.
-
-    This is to be used for Model subclasses, which do not know at instantiation
-    time what their inputs look like.
-
-    Args:
-      inputs: Single array, or list of arrays. The arrays could be placeholders,
-        Numpy arrays, data tensors, or TensorShapes.
-        - if placeholders: the model is built on top of these placeholders,
-          and we expect Numpy data to be fed for them when calling `fit`/etc.
-        - if Numpy data or TensorShapes: we create placeholders matching the
-          TensorShapes or shapes of the Numpy arrays. We expect Numpy data to be
-          fed for these placeholders when calling `fit`/etc.
-        - if data tensors: the model is built on top of these tensors.
-          We do not expect any Numpy data to be provided when calling `fit`/etc.
-      outputs: None, a data tensor, or a list of tensors. If None, the
-        outputs will be determined by invoking `self.call()`, otherwise the
-        provided value will be used.
-      training: Boolean or None. Only relevant in symbolic mode. Specifies
-        whether to build the model's graph in inference mode (False), training
-        mode (True), or using the Keras learning phase (None).
-    Raises:
-      ValueError: If dict inputs are passed to a Sequential Model where the
-        first layer isn't FeatureLayer.
-    """
-    if self.inputs:
-      raise ValueError('Model inputs are already set.')
-
-    if self.__class__.__name__ == 'Sequential' and not self.built:
-      if tensor_util.is_tensor(inputs):
-        input_shape = (None,) + tuple(inputs.shape.as_list()[1:])
-      elif isinstance(inputs, tensor_shape.TensorShape):
-        input_shape = (None,) + tuple(inputs.as_list()[1:])
-      elif isinstance(inputs, dict):
-        # We assert that the first layer is a FeatureLayer.
-        if not training_utils.is_feature_layer(self.layers[0]):
-          raise ValueError('Passing a dictionary input to a Sequential Model '
-                           'which doesn\'t have FeatureLayer as the first layer'
-                           ' is an error.')
-        input_shape = (None,)
-      else:
-        input_shape = (None,) + tuple(inputs.shape[1:])
-      self._build_input_shape = input_shape
-
-    # On-the-fly setting of symbolic model inputs (either by using the tensor
-    # provided, or by creating a placeholder if Numpy data was provided).
-    model_inputs = training_utils.ModelInputs(inputs)
-    inputs = model_inputs.get_symbolic_inputs()
-    self.inputs = model_inputs.get_symbolic_inputs(return_single_as_list=True)
-    self.input_names = model_inputs.get_input_names()
-
-    self._feed_inputs = []
-    self._feed_input_names = []
-    self._feed_input_shapes = []
-
-    for k, v in model_inputs.as_dict():
-      if K.is_placeholder(v):
-        self._feed_inputs.append(v)
-        self._feed_input_names.append(k)
-        self._feed_input_shapes.append(K.int_shape(v))
-
-    # TODO(fchollet): consider calling `_maybe_build` before calling the model.
-
-    if outputs is None:
-      # Obtain symbolic outputs by calling the model.
-      with K.get_graph().as_default():
-        if self._expects_training_arg:
-          outputs = self.call(inputs, training=training)
-        else:
-          outputs = self.call(inputs)
-
-    outputs = nest.flatten(outputs)
-    self.outputs = outputs
-    self.output_names = [
-        'output_%d' % (i + 1) for i in range(len(self.outputs))]
-    self.built = True
+  def predict(self,
+              x,
+              batch_size=None,
+              verbose=0,
+              steps=None,
+              callbacks=None,
+              max_queue_size=10,
+              workers=1,
+              use_multiprocessing=False):
+    """Generates output predictions for the input samples.
 
-  def fit(self,
-          x=None,
-          y=None,
-          batch_size=None,
-          epochs=1,
-          verbose=1,
-          callbacks=None,
-          validation_split=0.,
-          validation_data=None,
-          shuffle=True,
-          class_weight=None,
-          sample_weight=None,
-          initial_epoch=0,
-          steps_per_epoch=None,
-          validation_steps=None,
-          max_queue_size=10,
-          workers=1,
-          use_multiprocessing=False,
-          **kwargs):
-    """Trains the model for a fixed number of epochs (iterations on a dataset).
+    Computation is done in batches.
 
     Arguments:
-        x: Input data. It could be:
+         x: Input samples. It could be:
           - A Numpy array (or array-like), or a list of arrays
             (in case the model has multiple inputs).
           - A TensorFlow tensor, or a list of tensors
             (in case the model has multiple inputs).
-          - A dict mapping input names to the corresponding array/tensors,
-            if the model has named inputs.
-          - A `tf.data` dataset or a dataset iterator. Should return a tuple
-            of either `(inputs, targets)` or
-            `(inputs, targets, sample_weights)`.
-          - A generator or `keras.utils.Sequence` returning `(inputs, targets)`
-            or `(inputs, targets, sample weights)`.
-        y: Target data. Like the input data `x`,
-          it could be either Numpy array(s) or TensorFlow tensor(s).
-          It should be consistent with `x` (you cannot have Numpy inputs and
-          tensor targets, or inversely). If `x` is a dataset, dataset
-          iterator, generator, or `keras.utils.Sequence` instance, `y` should
-          not be specified (since targets will be obtained from `x`).
+          - A `tf.data` dataset or a dataset iterator.
+          - A generator or `keras.utils.Sequence` instance.
         batch_size: Integer or `None`.
             Number of samples per gradient update.
             If unspecified, `batch_size` will default to 32.
-            Do not specify the `batch_size` if your data is in the
+            Do not specify the `batch_size` is your data is in the
             form of symbolic tensors, dataset, dataset iterators,
             generators, or `keras.utils.Sequence` instances (since they generate
             batches).
-        epochs: Integer. Number of epochs to train the model.
-            An epoch is an iteration over the entire `x` and `y`
-            data provided.
-            Note that in conjunction with `initial_epoch`,
-            `epochs` is to be understood as "final epoch".
-            The model is not trained for a number of iterations
-            given by `epochs`, but merely until the epoch
-            of index `epochs` is reached.
-        verbose: Integer. 0, 1, or 2. Verbosity mode.
-            0 = silent, 1 = progress bar, 2 = one line per epoch.
+        verbose: Verbosity mode, 0 or 1.
+        steps: Total number of steps (batches of samples)
+            before declaring the prediction round finished.
+            Ignored with the default value of `None`.
         callbacks: List of `keras.callbacks.Callback` instances.
-            List of callbacks to apply during training.
+            List of callbacks to apply during prediction.
             See [callbacks](/api_docs/python/tf/keras/callbacks).
-        validation_split: Float between 0 and 1.
-            Fraction of the training data to be used as validation data.
-            The model will set apart this fraction of the training data,
-            will not train on it, and will evaluate
-            the loss and any model metrics
-            on this data at the end of each epoch.
-            The validation data is selected from the last samples
-            in the `x` and `y` data provided, before shuffling. This argument is
-            not supported when `x` is a dataset, dataset iterator, generator or
-           `keras.utils.Sequence` instance.
-        validation_data: Data on which to evaluate
-            the loss and any model metrics at the end of each epoch.
-            The model will not be trained on this data.
-            `validation_data` will override `validation_split`.
-            `validation_data` could be:
-              - tuple `(x_val, y_val)` of Numpy arrays or tensors
-              - tuple `(x_val, y_val, val_sample_weights)` of Numpy arrays
-              - dataset or a dataset iterator
-            For the first two cases, `batch_size` must be provided.
-            For the last case, `validation_steps` must be provided.
-        shuffle: Boolean (whether to shuffle the training data
-            before each epoch) or str (for 'batch').
-            'batch' is a special option for dealing with the
-            limitations of HDF5 data; it shuffles in batch-sized chunks.
-            Has no effect when `steps_per_epoch` is not `None`.
-        class_weight: Optional dictionary mapping class indices (integers)
-            to a weight (float) value, used for weighting the loss function
-            (during training only).
-            This can be useful to tell the model to
-            "pay more attention" to samples from
-            an under-represented class.
-        sample_weight: Optional Numpy array of weights for
-            the training samples, used for weighting the loss function
-            (during training only). You can either pass a flat (1D)
-            Numpy array with the same length as the input samples
-            (1:1 mapping between weights and samples),
-            or in the case of temporal data,
-            you can pass a 2D array with shape
-            `(samples, sequence_length)`,
-            to apply a different weight to every timestep of every sample.
-            In this case you should make sure to specify
-            `sample_weight_mode="temporal"` in `compile()`. This argument is not
-            supported when `x` is a dataset, dataset iterator, generator, or
-           `keras.utils.Sequence` instance, instead provide the sample_weights
-            as the third element of `x`.
-        initial_epoch: Integer.
-            Epoch at which to start training
-            (useful for resuming a previous training run).
-        steps_per_epoch: Integer or `None`.
-            Total number of steps (batches of samples)
-            before declaring one epoch finished and starting the
-            next epoch. When training with input tensors such as
-            TensorFlow data tensors, the default `None` is equal to
-            the number of samples in your dataset divided by
-            the batch size, or 1 if that cannot be determined.
-        validation_steps: Only relevant if `validation_data` is provided and
-            is a dataset or dataset iterator. Total number of steps (batches of
-            samples) to draw before stopping when performing validation
-            at the end of every epoch.
         max_queue_size: Integer. Used for generator or `keras.utils.Sequence`
             input only. Maximum size for the generator queue.
             If unspecified, `max_queue_size` will default to 10.
         workers: Integer. Used for generator or `keras.utils.Sequence` input
-            only. Maximum number of processes to spin up
-            when using process-based threading. If unspecified, `workers`
-            will default to 1. If 0, will execute the generator on the main
-            thread.
+            only. Maximum number of processes to spin up when using
+            process-based threading. If unspecified, `workers` will default
+            to 1. If 0, will execute the generator on the main thread.
         use_multiprocessing: Boolean. Used for generator or
             `keras.utils.Sequence` input only. If `True`, use process-based
             threading. If unspecified, `use_multiprocessing` will default to
             `False`. Note that because this implementation relies on
             multiprocessing, you should not pass non-picklable arguments to
             the generator as they can't be passed easily to children processes.
-        **kwargs: Used for backwards compatibility.
+
+
+    Returns:
+        Numpy array(s) of predictions.
+
+    Raises:
+        ValueError: In case of mismatch between the provided
+            input data and the model's expectations,
+            or in case a stateful model receives a number of samples
+            that is not a multiple of the batch size.
+    """
+    # Case 1: distribution strategy.
+    if self._distribution_strategy:
+      return training_distributed.predict_distributed(self,
+                                                      x=x,
+                                                      batch_size=batch_size,
+                                                      verbose=verbose,
+                                                      steps=steps,
+                                                      callbacks=callbacks)
+
+    batch_size = self._validate_or_infer_batch_size(batch_size, steps, x)
+
+    # Case 2: generator-like. Input is Python generator, or Sequence object,
+    # or a non-distributed Dataset or iterator in eager execution.
+    if data_utils.is_generator_or_sequence(x):
+      return self.predict_generator(
+          x,
+          steps=steps,
+          verbose=verbose,
+          callbacks=callbacks,
+          max_queue_size=max_queue_size,
+          workers=workers,
+          use_multiprocessing=use_multiprocessing)
+    if training_utils.is_eager_dataset_or_iterator(x):
+      return training_generator.predict_generator(
+          self,
+          x,
+          steps=steps,
+          batch_size=batch_size,
+          verbose=verbose,
+          workers=0,
+          callbacks=callbacks)
+
+    # Case 3: Symbolic tensors or Numpy array-like.
+    # This includes Datasets and iterators in graph mode (since they
+    # generate symbolic tensors).
+    x, _, _ = self._standardize_user_data(
+        x, check_steps=True, steps_name='steps', steps=steps)
+
+    if self.run_eagerly:
+      return training_generator.predict_generator(
+          self,
+          x,
+          steps=steps,
+          batch_size=batch_size,
+          verbose=verbose,
+          workers=0,
+          callbacks=callbacks)
+    else:
+      return training_arrays.predict_loop(
+          self,
+          x,
+          batch_size=batch_size,
+          verbose=verbose,
+          steps=steps,
+          callbacks=callbacks)
+
+  def reset_metrics(self):
+    """Resets the state of metrics."""
+    if hasattr(self, 'metrics'):
+      for m in self.metrics:
+        m.reset_states()
+
+    # Reset the state of loss metric wrappers.
+    if hasattr(
+        self, '_output_loss_metrics') and self._output_loss_metrics is not None:
+      for m in self._output_loss_metrics:
+        m.reset_states()
+
+    # Reset metrics on all the distributed (cloned) models.
+    if self._distribution_strategy:
+      distributed_training_utils._reset_metrics(self)  # pylint: disable=protected-access
+
+  def train_on_batch(self,
+                     x,
+                     y=None,
+                     sample_weight=None,
+                     class_weight=None,
+                     reset_metrics=True):
+    """Runs a single gradient update on a single batch of data.
+
+    Arguments:
+        x: Input data. It could be:
+          - A Numpy array (or array-like), or a list of arrays
+              (in case the model has multiple inputs).
+          - A TensorFlow tensor, or a list of tensors
+              (in case the model has multiple inputs).
+          - A dict mapping input names to the corresponding array/tensors,
+              if the model has named inputs.
+          - A `tf.data` dataset or a dataset iterator.
+        y: Target data. Like the input data `x`, it could be either Numpy
+          array(s) or TensorFlow tensor(s). It should be consistent with `x`
+          (you cannot have Numpy inputs and tensor targets, or inversely). If
+          `x` is a dataset or a dataset iterator, `y` should not be specified
+          (since targets will be obtained from the iterator).
+        sample_weight: Optional array of the same length as x, containing
+          weights to apply to the model's loss for each sample. In the case of
+          temporal data, you can pass a 2D array with shape (samples,
+          sequence_length), to apply a different weight to every timestep of
+          every sample. In this case you should make sure to specify
+          sample_weight_mode="temporal" in compile(). This argument is not
+          supported when `x` is a dataset or a dataset iterator.
+        class_weight: Optional dictionary mapping class indices (integers) to a
+          weight (float) to apply to the model's loss for the samples from this
+          class during training. This can be useful to tell the model to "pay
+          more attention" to samples from an under-represented class.
+        reset_metrics: If `True`, the metrics returned will be only for this
+          batch. If `False`, the metrics will be statefully accumulated across
+          batches.
+
+    Returns:
+        Scalar training loss
+        (if the model has a single output and no metrics)
+        or list of scalars (if the model has multiple outputs
+        and/or metrics). The attribute `model.metrics_names` will give you
+        the display labels for the scalar outputs.
+
+    Raises:
+      ValueError: In case of invalid user-provided arguments.
+    """
+    if self._distribution_strategy:
+      raise NotImplementedError('`train_on_batch` is not supported for models '
+                                'compiled with DistributionStrategy.')
+    # Validate and standardize user data.
+    x, y, sample_weights = self._standardize_user_data(
+        x, y, sample_weight=sample_weight, class_weight=class_weight,
+        extract_tensors_from_dataset=True)
+
+    if self.run_eagerly:
+      outputs = training_eager.train_on_batch(
+          self,
+          x,
+          y,
+          sample_weights=sample_weights,
+          reset_metrics=reset_metrics,
+          output_loss_metrics=self._output_loss_metrics)
+    else:
+      if not isinstance(K.symbolic_learning_phase(), int):
+        ins = x + y + sample_weights + [True]
+      else:
+        ins = x + y + sample_weights
+
+      if reset_metrics:
+        self._make_train_function()
+        outputs = self.train_function(ins)  # pylint: disable=not-callable
+      else:
+        self._make_fit_function()
+        outputs = self._fit_function(ins)  # pylint: disable=not-callable
+
+    if reset_metrics:
+      self.reset_metrics()
+
+    if len(outputs) == 1:
+      return outputs[0]
+    return outputs
+
+  def test_on_batch(self, x, y=None, sample_weight=None, reset_metrics=True):
+    """Test the model on a single batch of samples.
+
+    Arguments:
+        x: Input data. It could be:
+          - A Numpy array (or array-like), or a list of arrays
+            (in case the model has multiple inputs).
+          - A TensorFlow tensor, or a list of tensors
+            (in case the model has multiple inputs).
+          - A dict mapping input names to the corresponding array/tensors,
+            if the model has named inputs.
+          - A `tf.data` dataset or a dataset iterator.
+        y: Target data. Like the input data `x`,
+          it could be either Numpy array(s) or TensorFlow tensor(s).
+          It should be consistent with `x` (you cannot have Numpy inputs and
+          tensor targets, or inversely). If `x` is a dataset or a
+          dataset iterator, `y` should not be specified
+          (since targets will be obtained from the iterator).
+        sample_weight: Optional array of the same length as x, containing
+            weights to apply to the model's loss for each sample.
+            In the case of temporal data, you can pass a 2D array
+            with shape (samples, sequence_length),
+            to apply a different weight to every timestep of every sample.
+            In this case you should make sure to specify
+            sample_weight_mode="temporal" in compile(). This argument is not
+            supported when `x` is a dataset or a dataset iterator.
+        reset_metrics: If `True`, the metrics returned will be only for this
+          batch. If `False`, the metrics will be statefully accumulated across
+          batches.
+
+    Returns:
+        Scalar test loss (if the model has a single output and no metrics)
+        or list of scalars (if the model has multiple outputs
+        and/or metrics). The attribute `model.metrics_names` will give you
+        the display labels for the scalar outputs.
+
+    Raises:
+        ValueError: In case of invalid user-provided arguments.
+    """
+    if self._distribution_strategy:
+      raise NotImplementedError('`test_on_batch` is not supported for models '
+                                'compiled with DistributionStrategy.')
+    # Validate and standardize user data.
+    x, y, sample_weights = self._standardize_user_data(
+        x, y, sample_weight=sample_weight, extract_tensors_from_dataset=True)
+
+    if self.run_eagerly:
+      outputs = training_eager.test_on_batch(
+          self,
+          x,
+          y,
+          sample_weights=sample_weights,
+          reset_metrics=reset_metrics,
+          output_loss_metrics=self._output_loss_metrics)
+    else:
+      inputs = x + y + sample_weights
+      if reset_metrics:
+        self._make_test_function()
+        outputs = self.test_function(inputs)  # pylint: disable=not-callable
+      else:
+        self._make_eval_function()
+        outputs = self._eval_function(inputs)  # pylint: disable=not-callable
+
+    if reset_metrics:
+      self.reset_metrics()
+
+    if len(outputs) == 1:
+      return outputs[0]
+    return outputs
+
+  def predict_on_batch(self, x):
+    """Returns predictions for a single batch of samples.
+
+    Arguments:
+        x: Input data. It could be:
+          - A Numpy array (or array-like), or a list of arrays
+            (in case the model has multiple inputs).
+          - A TensorFlow tensor, or a list of tensors
+            (in case the model has multiple inputs).
+          - A `tf.data` dataset or a dataset iterator.
+
+    Returns:
+        Numpy array(s) of predictions.
+
+    Raises:
+        ValueError: In case of mismatch between given number of inputs and
+          expectations of the model.
+    """
+    if self._distribution_strategy:
+      raise NotImplementedError('`predict_on_batch` is not supported for '
+                                'models compiled with DistributionStrategy.')
+    # Validate and standardize user data.
+    inputs, _, _ = self._standardize_user_data(
+        x, extract_tensors_from_dataset=True)
+    if self.run_eagerly:
+      if (isinstance(inputs, iterator_ops.EagerIterator) or
+          (isinstance(inputs, dataset_ops.DatasetV2))):
+        inputs = training_utils.cast_if_floating_dtype(inputs)
+      elif isinstance(inputs, collections.Sequence):
+        inputs = [
+            ops.convert_to_tensor(val, dtype=K.floatx()) for val in inputs]
+
+        # Unwrap lists with only one input, as we do when training on batch
+        if len(inputs) == 1:
+          inputs = inputs[0]
+
+      return self(inputs)  # pylint: disable=not-callable
+
+    self._make_predict_function()
+    outputs = self.predict_function(inputs)
+
+    if len(outputs) == 1:
+      return outputs[0]
+    return outputs
+
+  def fit_generator(self,
+                    generator,
+                    steps_per_epoch=None,
+                    epochs=1,
+                    verbose=1,
+                    callbacks=None,
+                    validation_data=None,
+                    validation_steps=None,
+                    validation_freq=1,
+                    class_weight=None,
+                    max_queue_size=10,
+                    workers=1,
+                    use_multiprocessing=False,
+                    shuffle=True,
+                    initial_epoch=0):
+    """Fits the model on data yielded batch-by-batch by a Python generator.
+
+    The generator is run in parallel to the model, for efficiency.
+    For instance, this allows you to do real-time data augmentation
+    on images on CPU in parallel to training your model on GPU.
+
+    The use of `keras.utils.Sequence` guarantees the ordering
+    and guarantees the single use of every input per epoch when
+    using `use_multiprocessing=True`.
+
+    Arguments:
+        generator: A generator or an instance of `Sequence`
+          (`keras.utils.Sequence`)
+            object in order to avoid duplicate data
+            when using multiprocessing.
+            The output of the generator must be either
+            - a tuple `(inputs, targets)`
+            - a tuple `(inputs, targets, sample_weights)`.
+            This tuple (a single output of the generator) makes a single batch.
+            Therefore, all arrays in this tuple must have the same length (equal
+            to the size of this batch). Different batches may have different
+              sizes.
+            For example, the last batch of the epoch is commonly smaller than
+              the
+            others, if the size of the dataset is not divisible by the batch
+              size.
+            The generator is expected to loop over its data
+            indefinitely. An epoch finishes when `steps_per_epoch`
+            batches have been seen by the model.
+        steps_per_epoch: Total number of steps (batches of samples)
+            to yield from `generator` before declaring one epoch
+            finished and starting the next epoch. It should typically
+            be equal to the number of samples of your dataset
+            divided by the batch size.
+            Optional for `Sequence`: if unspecified, will use
+            the `len(generator)` as a number of steps.
+        epochs: Integer, total number of iterations on the data.
+        verbose: Verbosity mode, 0, 1, or 2.
+        callbacks: List of callbacks to be called during training.
+        validation_data: This can be either
+            - a generator for the validation data
+            - a tuple (inputs, targets)
+            - a tuple (inputs, targets, sample_weights).
+        validation_steps: Only relevant if `validation_data`
+            is a generator. Total number of steps (batches of samples)
+            to yield from `generator` before stopping.
+            Optional for `Sequence`: if unspecified, will use
+            the `len(validation_data)` as a number of steps.
+        validation_freq: Only relevant if validation data is provided. Integer
+            or `collections.Container` instance (e.g. list, tuple, etc.). If an
+            integer, specifies how many training epochs to run before a new
+            validation run is performed, e.g. `validation_freq=2` runs
+            validation every 2 epochs. If a Container, specifies the epochs on
+            which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
+            validation at the end of the 1st, 2nd, and 10th epochs.
+        class_weight: Dictionary mapping class indices to a weight
+            for the class.
+        max_queue_size: Integer. Maximum size for the generator queue.
+            If unspecified, `max_queue_size` will default to 10.
+        workers: Integer. Maximum number of processes to spin up
+            when using process-based threading.
+            If unspecified, `workers` will default to 1. If 0, will
+            execute the generator on the main thread.
+        use_multiprocessing: Boolean.
+            If `True`, use process-based threading.
+            If unspecified, `use_multiprocessing` will default to `False`.
+            Note that because this implementation relies on multiprocessing,
+            you should not pass non-picklable arguments to the generator
+            as they can't be passed easily to children processes.
+        shuffle: Boolean. Whether to shuffle the order of the batches at
+            the beginning of each epoch. Only used with instances
+            of `Sequence` (`keras.utils.Sequence`).
+            Has no effect when `steps_per_epoch` is not `None`.
+        initial_epoch: Epoch at which to start training
+            (useful for resuming a previous training run)
+
+    Returns:
+        A `History` object.
+
+    Example:
+
+    ```python
+        def generate_arrays_from_file(path):
+            while 1:
+                f = open(path)
+                for line in f:
+                    # create numpy arrays of input data
+                    # and labels, from each line in the file
+                    x1, x2, y = process_line(line)
+                    yield ({'input_1': x1, 'input_2': x2}, {'output': y})
+                f.close()
+
+        model.fit_generator(generate_arrays_from_file('/my_file.txt'),
+                            steps_per_epoch=10000, epochs=10)
+    ```
+    Raises:
+        ValueError: In case the generator yields data in an invalid format.
+    """
+    if self._distribution_strategy:
+      raise NotImplementedError('`fit_generator` is not supported for '
+                                'models compiled with DistributionStrategy.')
+    return training_generator.fit_generator(
+        self,
+        generator,
+        steps_per_epoch=steps_per_epoch,
+        epochs=epochs,
+        verbose=verbose,
+        callbacks=callbacks,
+        validation_data=validation_data,
+        validation_steps=validation_steps,
+        validation_freq=validation_freq,
+        class_weight=class_weight,
+        max_queue_size=max_queue_size,
+        workers=workers,
+        use_multiprocessing=use_multiprocessing,
+        shuffle=shuffle,
+        initial_epoch=initial_epoch,
+        steps_name='steps_per_epoch')
+
+  def evaluate_generator(self,
+                         generator,
+                         steps=None,
+                         callbacks=None,
+                         max_queue_size=10,
+                         workers=1,
+                         use_multiprocessing=False,
+                         verbose=0):
+    """Evaluates the model on a data generator.
+
+    The generator should return the same kind of data
+    as accepted by `test_on_batch`.
+
+    Arguments:
+        generator: Generator yielding tuples (inputs, targets)
+            or (inputs, targets, sample_weights)
+            or an instance of `keras.utils.Sequence`
+            object in order to avoid duplicate data
+            when using multiprocessing.
+        steps: Total number of steps (batches of samples)
+            to yield from `generator` before stopping.
+            Optional for `Sequence`: if unspecified, will use
+            the `len(generator)` as a number of steps.
+        callbacks: List of `keras.callbacks.Callback` instances.
+            List of callbacks to apply during evaluation.
+            See [callbacks](/api_docs/python/tf/keras/callbacks).
+        max_queue_size: maximum size for the generator queue
+        workers: Integer. Maximum number of processes to spin up
+            when using process-based threading.
+            If unspecified, `workers` will default to 1. If 0, will
+            execute the generator on the main thread.
+        use_multiprocessing: Boolean.
+            If `True`, use process-based threading.
+            If unspecified, `use_multiprocessing` will default to `False`.
+            Note that because this implementation relies on multiprocessing,
+            you should not pass non-picklable arguments to the generator
+            as they can't be passed easily to children processes.
+        verbose: Verbosity mode, 0 or 1.
 
     Returns:
-        A `History` object. Its `History.history` attribute is
-        a record of training loss values and metrics values
-        at successive epochs, as well as validation loss values
-        and validation metrics values (if applicable).
+        Scalar test loss (if the model has a single output and no metrics)
+        or list of scalars (if the model has multiple outputs
+        and/or metrics). The attribute `model.metrics_names` will give you
+        the display labels for the scalar outputs.
 
     Raises:
-        RuntimeError: If the model was never compiled.
-        ValueError: In case of mismatch between the provided input data
-            and what the model expects.
+        ValueError: in case of invalid arguments.
+
+    Raises:
+        ValueError: In case the generator yields data in an invalid format.
     """
-    # TODO(fchollet): this method may be creating reference cycles, which would
-    # lead to accumulating garbage in memory when called in a loop. Investigate.
-    if data_utils.is_generator_or_sequence(x):
-      training_utils.check_generator_arguments(y, sample_weight)
-      return self.fit_generator(
-          x,
-          steps_per_epoch=steps_per_epoch,
-          epochs=epochs,
-          verbose=verbose,
-          callbacks=callbacks,
-          validation_data=validation_data,
-          validation_steps=validation_steps,
-          class_weight=class_weight,
-          max_queue_size=max_queue_size,
-          workers=workers,
-          use_multiprocessing=use_multiprocessing,
-          shuffle=shuffle,
-          initial_epoch=initial_epoch)
+    if self._distribution_strategy:
+      raise NotImplementedError('`evaluate_generator` is not supported for '
+                                'models compiled with DistributionStrategy.')
+    return training_generator.evaluate_generator(
+        self,
+        generator,
+        steps=steps,
+        max_queue_size=max_queue_size,
+        workers=workers,
+        use_multiprocessing=use_multiprocessing,
+        verbose=verbose,
+        callbacks=callbacks)
 
-    # Legacy support
-    if 'nb_epoch' in kwargs:
-      logging.warning(
-          'The `nb_epoch` argument in `fit` '
-          'has been renamed `epochs`.')
-      epochs = kwargs.pop('nb_epoch')
-    if kwargs:
-      raise TypeError('Unrecognized keyword arguments: ' + str(kwargs))
+  def predict_generator(self,
+                        generator,
+                        steps=None,
+                        callbacks=None,
+                        max_queue_size=10,
+                        workers=1,
+                        use_multiprocessing=False,
+                        verbose=0):
+    """Generates predictions for the input samples from a data generator.
 
-    # Validate and standardize user data.
+    The generator should return the same kind of data as accepted by
+    `predict_on_batch`.
+
+    Arguments:
+        generator: Generator yielding batches of input samples
+            or an instance of `keras.utils.Sequence` object in order to
+            avoid duplicate data when using multiprocessing.
+        steps: Total number of steps (batches of samples)
+            to yield from `generator` before stopping.
+            Optional for `Sequence`: if unspecified, will use
+            the `len(generator)` as a number of steps.
+        callbacks: List of `keras.callbacks.Callback` instances.
+            List of callbacks to apply during prediction.
+            See [callbacks](/api_docs/python/tf/keras/callbacks).
+        max_queue_size: Maximum size for the generator queue.
+        workers: Integer. Maximum number of processes to spin up
+            when using process-based threading.
+            If unspecified, `workers` will default to 1. If 0, will
+            execute the generator on the main thread.
+        use_multiprocessing: Boolean.
+            If `True`, use process-based threading.
+            If unspecified, `use_multiprocessing` will default to `False`.
+            Note that because this implementation relies on multiprocessing,
+            you should not pass non-picklable arguments to the generator
+            as they can't be passed easily to children processes.
+        verbose: verbosity mode, 0 or 1.
+
+    Returns:
+        Numpy array(s) of predictions.
+
+    Raises:
+        ValueError: In case the generator yields data in an invalid format.
+    """
     if self._distribution_strategy:
-      distributed_training_utils.validate_callbacks(callbacks, self.optimizer,
-                                                    self._distribution_strategy)
+      raise NotImplementedError('`predict_generator` is not supported for '
+                                'models compiled with DistributionStrategy.')
+    return training_generator.predict_generator(
+        self,
+        generator,
+        steps=steps,
+        max_queue_size=max_queue_size,
+        workers=workers,
+        use_multiprocessing=use_multiprocessing,
+        verbose=verbose,
+        callbacks=callbacks)
 
-      distributed_training_utils.validate_inputs(
-          x, y, self._distribution_strategy)
+  def _prepare_total_loss(self, skip_target_indices=None, masks=None):
+    """Computes total loss from loss functions.
 
-      first_x_value = nest.flatten(x)[0]
-      if isinstance(first_x_value, np.ndarray):
-        steps_per_epoch, batch_size = (
-            distributed_training_utils.get_input_params(
-                self._distribution_strategy, first_x_value, steps_per_epoch,
-                batch_size, is_training=True))
+    Arguments:
+        skip_target_indices: A list of indices of model outputs where loss
+          function is None.
+        masks: List of mask values corresponding to each model output.
 
-    batch_size = self._validate_or_infer_batch_size(batch_size, steps_per_epoch,
-                                                    x)
+    Returns:
+        A list of loss weights of python floats.
 
-    x, y, sample_weights = self._standardize_user_data(
-        x,
-        y,
-        sample_weight=sample_weight,
-        class_weight=class_weight,
-        batch_size=batch_size,
-        check_steps=True,
-        steps_name='steps_per_epoch',
-        steps=steps_per_epoch,
-        validation_split=validation_split,
-        shuffle=shuffle)
+    Raises:
+        TypeError: If model run_eagerly is True.
+    """
+    if self.run_eagerly:
+      raise TypeError('total loss can not be computed when compiled with '
+                      'run_eagerly = True.')
+    skip_target_indices = skip_target_indices or []
+    total_loss = None
+    with K.name_scope('loss'):
+      zipped_inputs = zip(self.targets, self.outputs, self.loss_functions,
+                          self.sample_weights, masks, self.loss_weights_list)
+      for i, (y_true, y_pred, loss_fn, sample_weight, mask,
+              loss_weight) in enumerate(zipped_inputs):
+        if i in skip_target_indices:
+          continue
+        loss_name = self.output_names[i] + '_loss'
+        with K.name_scope(loss_name):
+          if mask is not None:
+            mask = math_ops.cast(mask, y_pred.dtype)
+            # Update weights with mask.
+            if sample_weight is None:
+              sample_weight = mask
+            else:
+              # Update dimensions of weights to match with mask if possible.
+              mask, _, sample_weight = (
+                  losses_utils.squeeze_or_expand_dimensions(
+                      mask, None, sample_weight))
+              sample_weight *= mask
+
+          # Reset reduction on the loss so that we can get the per sample loss
+          # value. We use this to get both the stateless and stateful loss
+          # values without having to compute the underlying loss function
+          # twice.
+          weighted_losses = None
+          if hasattr(loss_fn, 'reduction'):
+            current_loss_reduction = loss_fn.reduction
+            loss_fn.reduction = losses_utils.ReductionV2.NONE
+            weighted_losses = loss_fn(
+                y_true, y_pred, sample_weight=sample_weight)
+            loss_fn.reduction = current_loss_reduction
+
+            # Compute the stateless loss value.
+            output_loss = losses_utils.reduce_weighted_loss(
+                weighted_losses, reduction=current_loss_reduction)
+          else:
+            # Compute the stateless loss value for a custom loss class.
+            # Here we assume that the class takes care of loss reduction
+            # because if this class returns a vector value we cannot
+            # differentiate between use case where a custom optimizer
+            # expects a vector loss value vs unreduced per-sample loss value.
+            output_loss = loss_fn(y_true, y_pred, sample_weight=sample_weight)
 
-    # Prepare validation data.
-    if validation_data:
-      if (isinstance(validation_data, iterator_ops.Iterator) or
-          isinstance(validation_data, iterator_ops.EagerIterator) or
-          isinstance(validation_data, dataset_ops.DatasetV2)):
-        val_x = validation_data
-        val_y = None
-        val_sample_weight = None
-      elif len(validation_data) == 2:
-        val_x, val_y = validation_data  # pylint: disable=unpacking-non-sequence
-        val_sample_weight = None
-      elif len(validation_data) == 3:
-        val_x, val_y, val_sample_weight = validation_data  # pylint: disable=unpacking-non-sequence
-      else:
-        raise ValueError(
-            'When passing a `validation_data` argument, '
-            'it must contain either 2 items (x_val, y_val), '
-            'or 3 items (x_val, y_val, val_sample_weights), '
-            'or alternatively it could be a dataset or a '
-            'dataset or a dataset iterator. '
-            'However we received `validation_data=%s`' % validation_data)
-
-      # Validate and standardize validation data.
-      if self._distribution_strategy:
-        distributed_training_utils.validate_inputs(
-            val_x, val_y, self._distribution_strategy)
-        first_valx_value = nest.flatten(val_x)[0]
-        if isinstance(first_valx_value, np.ndarray):
-          validation_steps, _ = distributed_training_utils.get_input_params(
-              self._distribution_strategy, first_valx_value, validation_steps,
-              batch_size)
+        if len(self.outputs) > 1:
+          # Keep track of the un-aggregated loss result tensor.
+          self._compile_metrics_tensors[loss_name] = output_loss
+
+          # Keep track of stateful result tensor and function for the loss.
+          # Compute the stateful loss value.
+          if weighted_losses is not None:
+            # TODO(b/120571621): Directly call metric when the bug is fixed.
+            aggregated_output_loss = self._call_fn_for_each_replica(
+                self._output_loss_metrics[i], weighted_losses)
+          else:
+            # Custom loss class.
+            aggregated_output_loss = self._call_metric_fn(
+                self._output_loss_metrics[i], y_true, y_pred, sample_weight)
+          self._compile_stateful_metrics_tensors[
+              loss_name] = aggregated_output_loss
+          self._compile_stateful_metric_functions.append(
+              self._output_loss_metrics[i])
 
-      val_x, val_y, val_sample_weights = self._standardize_user_data(
-          val_x,
-          val_y,
-          sample_weight=val_sample_weight,
-          batch_size=batch_size,
-          steps=validation_steps)
+        if total_loss is None:
+          total_loss = loss_weight * output_loss
+        else:
+          total_loss += loss_weight * output_loss
+      if total_loss is None:
+        if not self.losses:
+          raise ValueError('The model cannot be compiled '
+                           'because it has no loss to optimize.')
+        else:
+          total_loss = 0.
 
-    elif validation_split and 0. < validation_split < 1.:
-      if training_utils.has_symbolic_tensors(x):
-        raise ValueError('If your data is in the form of symbolic tensors, '
-                         'you cannot use `validation_split`.')
-      if hasattr(x[0], 'shape'):
-        split_at = int(x[0].shape[0] * (1. - validation_split))
-      else:
-        split_at = int(len(x[0]) * (1. - validation_split))
-      x, val_x = (slice_arrays(x, 0, split_at), slice_arrays(x, split_at))
-      y, val_y = (slice_arrays(y, 0, split_at), slice_arrays(y, split_at))
-      sample_weights, val_sample_weights = (slice_arrays(
-          sample_weights, 0, split_at), slice_arrays(sample_weights, split_at))
-    elif validation_steps:
-      val_x = []
-      val_y = []
-      val_sample_weights = []
-    else:
-      val_x = None
-      val_y = None
-      val_sample_weights = None
+      # Add regularization penalties and other layer-specific losses.
+      if self.losses:
+        total_loss += losses_utils.scale_loss_for_distribution(
+            math_ops.add_n(self.losses))
+    return total_loss
 
-    if (self.run_eagerly or (isinstance(x, iterator_ops.EagerIterator) and
-                             not self._distribution_strategy)):
-      return training_generator.fit_generator(
-          self, (x, y, sample_weights),
-          steps_per_epoch=steps_per_epoch,
-          batch_size=batch_size,
-          epochs=epochs,
-          shuffle=shuffle,
-          verbose=verbose,
-          callbacks=callbacks,
-          validation_data=validation_data,
-          validation_steps=validation_steps,
-          workers=0,
-          initial_epoch=initial_epoch)
-    elif distributed_training_utils.is_tpu_strategy(
-        self._distribution_strategy):
-      return training_distributed.experimental_fit_loop(
-          self,
-          x,
-          epochs=epochs,
-          verbose=verbose,
-          callbacks=callbacks,
-          val_iterator=val_x,
-          initial_epoch=initial_epoch,
-          steps_per_epoch=steps_per_epoch,
-          validation_steps=validation_steps)
-    else:
-      return training_arrays.fit_loop(
-          self,
-          x,
-          y,
-          sample_weights=sample_weights,
-          batch_size=batch_size,
-          epochs=epochs,
-          verbose=verbose,
-          callbacks=callbacks,
-          val_inputs=val_x,
-          val_targets=val_y,
-          val_sample_weights=val_sample_weights,
-          shuffle=shuffle,
-          initial_epoch=initial_epoch,
-          steps_per_epoch=steps_per_epoch,
-          validation_steps=validation_steps)
+  def _get_callback_model(self):
+    """Returns the Callback Model for this Model."""
 
-  def evaluate(self,
-               x=None,
-               y=None,
-               batch_size=None,
-               verbose=1,
-               sample_weight=None,
-               steps=None,
-               max_queue_size=10,
-               workers=1,
-               use_multiprocessing=False):
-    """Returns the loss value & metrics values for the model in test mode.
+    if hasattr(self, '_replicated_model') and self._replicated_model:
+      # When using training_distributed, we set the callback model
+      # to an instance of the `DistributedModel` that we create in
+      # the `compile` call. The `DistributedModel` is initialized
+      # with the first replicated model. We need to set the callback
+      # model to a DistributedModel to allow us to override saving
+      # and loading weights when we checkpoint the model during training.
+      return self._replicated_model
+    if hasattr(self, 'callback_model') and self.callback_model:
+      return self.callback_model
+    return self
+
+  def _make_callback_model(self, grouped_model):
+    first_replicated_model = self._distribution_strategy.unwrap(
+        grouped_model)[0]
+    # We initialize the callback model with the first replicated model.
+    self._replicated_model = DistributedCallbackModel(first_replicated_model)
+    self._replicated_model.set_original_model(self)
 
-    Computation is done in batches.
+  def _validate_or_infer_batch_size(self, batch_size, steps, x):
+    """Validates that the `batch_size` provided is consistent with InputLayer.
+
+    It's possible that the user specified a static batch size in their
+    InputLayer. If so, this method checks the provided `batch_size` and `x`
+    arguments are consistent with this static batch size. Also, if
+    `batch_size` is `None`, this method will attempt to infer the batch size
+    from the static batch size of the InputLayer. Lastly, ValueError will be
+    raised if `x` is a tf.data.Dataset and `batch_size` is specified as we
+    expect users to provide batched datasets.
 
     Arguments:
-        x: Input data. It could be:
-          - A Numpy array (or array-like), or a list of arrays
-            (in case the model has multiple inputs).
-          - A TensorFlow tensor, or a list of tensors
-            (in case the model has multiple inputs).
-          - A dict mapping input names to the corresponding array/tensors,
-            if the model has named inputs.
-          - A `tf.data` dataset or a dataset iterator.
-          - A generator or `keras.utils.Sequence` instance.
-        y: Target data. Like the input data `x`,
-          it could be either Numpy array(s) or TensorFlow tensor(s).
-          It should be consistent with `x` (you cannot have Numpy inputs and
-          tensor targets, or inversely).
-          If `x` is a dataset, dataset iterator, generator or
-          `keras.utils.Sequence` instance, `y` should not be specified (since
-          targets will be obtained from the iterator/dataset).
-        batch_size: Integer or `None`.
-            Number of samples per gradient update.
-            If unspecified, `batch_size` will default to 32.
-            Do not specify the `batch_size` is your data is in the
-            form of symbolic tensors, dataset, dataset iterators,
-            generators, or `keras.utils.Sequence` instances (since they generate
-            batches).
-        verbose: 0 or 1. Verbosity mode.
-            0 = silent, 1 = progress bar.
-        sample_weight: Optional Numpy array of weights for
-            the test samples, used for weighting the loss function.
-            You can either pass a flat (1D)
-            Numpy array with the same length as the input samples
-            (1:1 mapping between weights and samples),
-            or in the case of temporal data,
-            you can pass a 2D array with shape
-            `(samples, sequence_length)`,
-            to apply a different weight to every timestep of every sample.
-            In this case you should make sure to specify
-            `sample_weight_mode="temporal"` in `compile()`. This argument is not
-            supported when `x` is a dataset or a dataset iterator, instead pass
-            sample weights as the third element of `x`.
-        steps: Integer or `None`.
-            Total number of steps (batches of samples)
-            before declaring the evaluation round finished.
-            Ignored with the default value of `None`.
-        max_queue_size: Integer. Used for generator or `keras.utils.Sequence`
-            input only. Maximum size for the generator queue.
-            If unspecified, `max_queue_size` will default to 10.
-        workers: Integer. Used for generator or `keras.utils.Sequence` input
-            only. Maximum number of processes to spin up when using
-            process-based threading. If unspecified, `workers` will default
-            to 1. If 0, will execute the generator on the main thread.
-        use_multiprocessing: Boolean. Used for generator or
-            `keras.utils.Sequence` input only. If `True`, use process-based
-            threading. If unspecified, `use_multiprocessing` will default to
-            `False`. Note that because this implementation relies on
-            multiprocessing, you should not pass non-picklable arguments to
-            the generator as they can't be passed easily to children processes.
+      batch_size: The batch_size provided as an argument to
+        fit/evaluate/predict.
+      steps: The steps provided as an argument to fit/evaluate/predict.
+      x: The data passed as `x` to fit/evaluate/predict.
 
     Returns:
-        Scalar test loss (if the model has a single output and no metrics)
-        or list of scalars (if the model has multiple outputs
-        and/or metrics). The attribute `model.metrics_names` will give you
-        the display labels for the scalar outputs.
-
-    Raises:
-        ValueError: in case of invalid arguments.
+      The validated batch_size, auto-inferred from the first layer if not
+      provided.
     """
-    if data_utils.is_generator_or_sequence(x):
-      training_utils.check_generator_arguments(y, sample_weight)
-      return self.evaluate_generator(
-          x,
-          steps=steps,
-          verbose=verbose,
-          max_queue_size=max_queue_size,
-          workers=workers,
-          use_multiprocessing=use_multiprocessing)
-    # Validate and standardize user data.
-    if self._distribution_strategy:
-      distributed_training_utils.validate_inputs(
-          x, y, self._distribution_strategy)
-      first_x_value = nest.flatten(x)[0]
-      if isinstance(first_x_value, np.ndarray):
-        steps, batch_size = distributed_training_utils.get_input_params(
-            self._distribution_strategy, first_x_value, steps, batch_size)
+    if batch_size is not None and isinstance(x, dataset_ops.DatasetV2):
+      raise ValueError('The `batch_size` argument must not be specified when'
+                       ' using dataset as an input.')
 
-    batch_size = self._validate_or_infer_batch_size(batch_size, steps, x)
+    layers = super(Model, self).layers  # Avoids the override in Sequential.
+    if layers:
+      first_layer = layers[0]
+      static_batch_size = training_utils.get_static_batch_size(first_layer)
+      if static_batch_size is not None:
 
-    x, y, sample_weights = self._standardize_user_data(
-        x,
-        y,
-        sample_weight=sample_weight,
-        batch_size=batch_size,
-        check_steps=True,
-        steps_name='steps',
-        steps=steps)
+        # Check `batch_size` argument is consistent with InputLayer.
+        if batch_size is not None and batch_size != static_batch_size:
+          raise ValueError('The `batch_size` argument value {} is incompatible '
+                           'with the specified batch size of your Input Layer: '
+                           '{}'.format(batch_size, static_batch_size))
 
-    if (self.run_eagerly or (isinstance(x, iterator_ops.EagerIterator) and
-                             not self._distribution_strategy)):
-      return training_generator.evaluate_generator(
-          self, (x, y, sample_weights),
-          steps=steps,
-          batch_size=batch_size,
-          verbose=verbose,
-          workers=0)
-    elif distributed_training_utils.is_tpu_strategy(
-        self._distribution_strategy):
-      return training_distributed.experimental_test_loop(
-          self, iterator=x, verbose=verbose, steps=steps)
-    else:
-      return training_arrays.test_loop(
-          self,
-          inputs=x,
-          targets=y,
-          sample_weights=sample_weights,
-          batch_size=batch_size,
-          verbose=verbose,
-          steps=steps)
+        # Check Dataset/Iterator batch size is consistent with InputLayer.
+        if isinstance(x, (dataset_ops.DatasetV2, iterator_ops.Iterator,
+                          iterator_ops.EagerIterator)):
+          ds_batch_size = tensor_shape.as_dimension(
+              nest.flatten(x.output_shapes)[0][0]).value
+          if ds_batch_size is not None and ds_batch_size != static_batch_size:
+            raise ValueError('The batch output shape of your `Dataset` is {}, '
+                             'which is incompatible with the specified batch '
+                             'size of your Input Layer: {}'.format(
+                                 ds_batch_size, static_batch_size))
 
-  def predict(self,
-              x,
-              batch_size=None,
-              verbose=0,
-              steps=None,
-              max_queue_size=10,
-              workers=1,
-              use_multiprocessing=False):
-    """Generates output predictions for the input samples.
+        # Set inferred batch size from the InputLayer.
+        if steps is None:
+          batch_size = static_batch_size
 
-    Computation is done in batches.
+    if batch_size is None and steps is None:
+      # Backwards compatibility
+      batch_size = 32
+    return batch_size
+
+  def _list_functions_for_serialization(self):
+    return {
+        '_default_save_signature': saving_utils.trace_model_call(self)
+    }
+
+  def _set_sample_weight_attributes(self, sample_weight_mode,
+                                    skip_target_weighing_indices):
+    """Sets sample weight related attributes on the model."""
+    sample_weights, sample_weight_modes = training_utils.prepare_sample_weights(
+        self.output_names, sample_weight_mode, skip_target_weighing_indices)
+    self.sample_weights = sample_weights
+    self.sample_weight_modes = sample_weight_modes
+    self._feed_sample_weight_modes = [
+        sample_weight_modes[i]
+        for i in range(len(self.outputs))
+        if i not in skip_target_weighing_indices
+    ]
+    self._feed_sample_weights = [
+        sample_weights[i]
+        for i in range(len(sample_weights))
+        if i not in skip_target_weighing_indices
+    ]
+
+  def _cache_output_metric_attributes(self, metrics, weighted_metrics):
+    """Caches metric name and function attributes for every model output."""
+    output_shapes = []
+    for output in self.outputs:
+      if output is None or output.shape.rank is None:
+        output_shapes.append(None)
+      else:
+        output_shapes.append(output.shape.as_list())
+    self._per_output_metrics = training_utils.collect_per_output_metric_info(
+        metrics, self.output_names, output_shapes, self.loss_functions)
+    self._per_output_weighted_metrics = (
+        training_utils.collect_per_output_metric_info(
+            weighted_metrics,
+            self.output_names,
+            output_shapes,
+            self.loss_functions,
+            is_weighted=True))
+
+  def _add_unique_metric_name(self, metric_name, output_index):
+    """Makes the metric name unique and adds it to the model's metric name list.
+
+      If there are multiple outputs for which the metrics are calculated, the
+      metric names have to be made unique by appending an integer.
 
     Arguments:
-         x: Input samples. It could be:
-          - A Numpy array (or array-like), or a list of arrays
-            (in case the model has multiple inputs).
-          - A TensorFlow tensor, or a list of tensors
-            (in case the model has multiple inputs).
-          - A `tf.data` dataset or a dataset iterator.
-          - A generator or `keras.utils.Sequence` instance.
-        batch_size: Integer or `None`.
-            Number of samples per gradient update.
-            If unspecified, `batch_size` will default to 32.
-            Do not specify the `batch_size` is your data is in the
-            form of symbolic tensors, dataset, dataset iterators,
-            generators, or `keras.utils.Sequence` instances (since they generate
-            batches).
-        verbose: Verbosity mode, 0 or 1.
-        steps: Total number of steps (batches of samples)
-            before declaring the prediction round finished.
-            Ignored with the default value of `None`.
-        max_queue_size: Integer. Used for generator or `keras.utils.Sequence`
-            input only. Maximum size for the generator queue.
-            If unspecified, `max_queue_size` will default to 10.
-        workers: Integer. Used for generator or `keras.utils.Sequence` input
-            only. Maximum number of processes to spin up when using
-            process-based threading. If unspecified, `workers` will default
-            to 1. If 0, will execute the generator on the main thread.
-        use_multiprocessing: Boolean. Used for generator or
-            `keras.utils.Sequence` input only. If `True`, use process-based
-            threading. If unspecified, `use_multiprocessing` will default to
-            `False`. Note that because this implementation relies on
-            multiprocessing, you should not pass non-picklable arguments to
-            the generator as they can't be passed easily to children processes.
+      metric_name: Metric name that corresponds to the metric specified by the
+          user. For example: 'acc'.
+      output_index: The index of the model output for which the metric name is
+        being added.
+
+    Returns:
+      string, name of the model's unique metric name
+    """
+    if len(self.output_names) > 1:
+      metric_name = '%s_%s' % (self.output_names[output_index], metric_name)
+    j = 1
+    base_metric_name = metric_name
+    while metric_name in self._compile_metrics_names:
+      metric_name = '%s_%d' % (base_metric_name, j)
+      j += 1
+
+    return metric_name
+
+  @property
+  def _all_metrics_tensors(self):
+    """Returns the network's symbolic metric tensors."""
+    metrics_tensors = {}
+    if self._is_compiled:
+      metrics_tensors.update(self._compile_metrics_tensors)
+    metrics_tensors.update(super(Model, self)._all_metrics_tensors)
+    return metrics_tensors
+
+  @property
+  def _all_stateful_metrics_tensors(self):
+    """Returns the network's symbolic metric tensors."""
+    metrics_tensors = {}
+    if self._is_compiled:
+      metrics_tensors.update(self._compile_stateful_metrics_tensors)
+    metrics_tensors.update(super(Model, self)._all_metrics_tensors)
+    return metrics_tensors
+
+  def _init_metric_attributes(self):
+    """Initialized model metric attributes."""
+    # List of all metric names in the model. This includes loss metrics.
+    self._compile_metrics_names = ['loss']
+    # List of stateful metric functions. Used for resetting metric state during
+    # training/eval. This includes loss metric functions.
+    self._compile_stateful_metric_functions = []
+    # Dict of all aggregated metric result tensors. This includes aggregated
+    # loss result tensors.
+    self._compile_stateful_metrics_tensors = {}
+    # Dict of all metric result tensors (aggregated or not - based on the
+    # values given in compile.). This includes aggregated loss result tensors.
+    self._compile_metrics_tensors = {}
+    # List of metric wrappers on output losses.
+    self._output_loss_metrics = None
 
+  def _set_per_output_metric_attributes(self, metrics_dict, output_index):
+    """Sets the metric attributes on the model for the given output.
 
-    Returns:
-        Numpy array(s) of predictions.
+    Arguments:
+      metrics_dict: A dict with metric names as keys and metric fns as values.
+      output_index: The index of the model output for which the metric
+        attributes are added.
 
-    Raises:
-        ValueError: In case of mismatch between the provided
-            input data and the model's expectations,
-            or in case a stateful model receives a number of samples
-            that is not a multiple of the batch size.
+    Returns:
+      Metrics dict updated with unique metric names as keys.
     """
-    if data_utils.is_generator_or_sequence(x):
-      return self.predict_generator(
-          x,
-          steps=steps,
-          verbose=verbose,
-          max_queue_size=max_queue_size,
-          workers=workers,
-          use_multiprocessing=use_multiprocessing)
-    if self._distribution_strategy:
-      distributed_training_utils.validate_inputs(
-          x, None, self._distribution_strategy)
-      first_x_value = nest.flatten(x)[0]
-      if isinstance(first_x_value, np.ndarray):
-        steps, batch_size = distributed_training_utils.get_input_params(
-            self._distribution_strategy, first_x_value, steps, batch_size)
+    updated_metrics_dict = collections.OrderedDict()
+    for metric_name, (metric_fn, stateful_metric_fn) in metrics_dict.items():
+      metric_name = self._add_unique_metric_name(metric_name, output_index)
 
-    batch_size = self._validate_or_infer_batch_size(batch_size, steps, x)
+      # Update the name on the metric class to be the unique generated name.
+      stateful_metric_fn._name = metric_name  # pylint: disable=protected-access
+      updated_metrics_dict[metric_name] = (metric_fn, stateful_metric_fn)
+      # Keep track of metric name, function and stateful function.
+      self._compile_metrics_names.append(metric_name)
+      self._compile_stateful_metric_functions.append(stateful_metric_fn)
+    return updated_metrics_dict
 
-    # Validate and standardize user data.
-    if self._distribution_strategy:
-      x, _, _ = self._standardize_user_data(
-          x, check_steps=True, steps_name='steps', steps=steps,
-          batch_size=batch_size)
-    else:
-      # TODO(anjalisridhar): We don't pass batch_size here for some reason. This
-      # means we need to special case distribution strategy which needs the
-      # batch size.
-      x, _, _ = self._standardize_user_data(
-          x, check_steps=True, steps_name='steps', steps=steps)
-
-    if (self.run_eagerly or (isinstance(x, iterator_ops.EagerIterator) and
-                             not self._distribution_strategy)):
-      return training_generator.predict_generator(
-          self,
-          x,
-          steps=steps,
-          batch_size=batch_size,
-          verbose=verbose,
-          workers=0)
-    elif distributed_training_utils.is_tpu_strategy(
-        self._distribution_strategy):
-      return training_distributed.experimental_predict_loop(
-          self, x, verbose=verbose, steps=steps)
-    else:
-      return training_arrays.predict_loop(
-          self, x, batch_size=batch_size, verbose=verbose, steps=steps)
+  def _set_metric_attributes(self, skip_target_indices=None):
+    """Sets the metric attributes on the model for all the model outputs."""
+    # Add loss metric names to the model metric names list.
+    if len(self.outputs) > 1:
+      output_names = [
+          self.output_names[i] + '_loss'
+          for i in range(len(self.outputs))
+          if i not in skip_target_indices
+      ]
+      self._compile_metrics_names.extend(output_names)
 
-  def reset_metrics(self):
-    """Resets the state of metrics."""
-    if hasattr(self, 'metrics'):
-      for m in self.metrics:
-        m.reset_states()
-      if self._distribution_strategy:
-        training_distributed._reset_metrics(self)  # pylint: disable=protected-access
+    skip_target_indices = skip_target_indices or []
+    updated_per_output_metrics = []
+    updated_per_output_weighted_metrics = []
+    for i in range(len(self.outputs)):
+      if i in skip_target_indices:
+        updated_per_output_metrics.append(self._per_output_metrics[i])
+        updated_per_output_weighted_metrics.append(
+            self._per_output_weighted_metrics[i])
+        continue
+      updated_per_output_metrics.append(
+          self._set_per_output_metric_attributes(self._per_output_metrics[i],
+                                                 i))
+      updated_per_output_weighted_metrics.append(
+          self._set_per_output_metric_attributes(
+              self._per_output_weighted_metrics[i], i))
 
-  def train_on_batch(self,
-                     x,
-                     y=None,
-                     sample_weight=None,
-                     class_weight=None,
-                     reset_metrics=True):
-    """Runs a single gradient update on a single batch of data.
+    # Create a metric wrapper for each output loss.
+    if len(self.outputs) > 1:
+      self._output_loss_metrics = [
+          metrics_module.SumOverBatchSize() if hasattr(loss_fn, 'reduction')
+          else metrics_module.SumOverBatchSizeMetricWrapper(loss_fn)
+          for loss_fn in self.loss_functions
+      ]
+
+    self._per_output_metrics = updated_per_output_metrics
+    self._per_output_weighted_metrics = updated_per_output_weighted_metrics
+
+  def _call_metric_fn(self, metric_fn, y_true, y_pred, weights, mask=None):
+    # TODO(b/120571621): Remove this function when the bug is fixed.
+    """Helper function to call metric function with distribution strategy."""
+    return self._call_fn_for_each_replica(
+        training_utils.call_metric_function,
+        metric_fn,
+        y_true,
+        y_pred,
+        weights=weights,
+        mask=mask)
+
+  def _call_fn_for_each_replica(self, fn, *args, **kwargs):
+    # TODO(b/120571621): We want to avoid metric reductions here since
+    # since TPUStrategy does not implement replica local variables.
+    # Remove this hack once we support TPUReplicaLocalVariables.
+    is_tpu = distributed_training_utils.is_tpu_strategy(
+        self._distribution_strategy)
+    if ((not is_tpu) and self._distribution_strategy and
+        distribution_strategy_context.in_cross_replica_context()):
+      with self._distribution_strategy.scope():
+        return self._distribution_strategy.extended.call_for_each_replica(
+            fn, args, kwargs)
+    return fn(*args, **kwargs)
+
+  def _handle_per_output_metrics(self,
+                                 metrics_dict,
+                                 y_true,
+                                 y_pred,
+                                 mask,
+                                 weights=None,
+                                 return_stateful_result=True):
+    """Calls metric functions for a single output.
 
     Arguments:
-        x: Input data. It could be:
-          - A Numpy array (or array-like), or a list of arrays
-              (in case the model has multiple inputs).
-          - A TensorFlow tensor, or a list of tensors
-              (in case the model has multiple inputs).
-          - A dict mapping input names to the corresponding array/tensors,
-              if the model has named inputs.
-          - A `tf.data` dataset or a dataset iterator.
-        y: Target data. Like the input data `x`, it could be either Numpy
-          array(s) or TensorFlow tensor(s). It should be consistent with `x`
-          (you cannot have Numpy inputs and tensor targets, or inversely). If
-          `x` is a dataset or a dataset iterator, `y` should not be specified
-          (since targets will be obtained from the iterator).
-        sample_weight: Optional array of the same length as x, containing
-          weights to apply to the model's loss for each sample. In the case of
-          temporal data, you can pass a 2D array with shape (samples,
-          sequence_length), to apply a different weight to every timestep of
-          every sample. In this case you should make sure to specify
-          sample_weight_mode="temporal" in compile(). This argument is not
-          supported when `x` is a dataset or a dataset iterator.
-        class_weight: Optional dictionary mapping class indices (integers) to a
-          weight (float) to apply to the model's loss for the samples from this
-          class during training. This can be useful to tell the model to "pay
-          more attention" to samples from an under-represented class.
-        reset_metrics: If `True`, the metrics returned will be only for this
-          batch. If `False`, the metrics will be statefully accumulated across
-          batches.
+      metrics_dict: A dict with metric names as keys and metric fns as values.
+      y_true: Target output.
+      y_pred: Predicted output.
+      mask: Computed mask value for the current output.
+      weights: Weights to be applied on the current output.
+      return_stateful_result: Boolean, indicates whether the stateful
+        (aggregated)/stateless metric result should be returned.
 
     Returns:
-        Scalar training loss
-        (if the model has a single output and no metrics)
-        or list of scalars (if the model has multiple outputs
-        and/or metrics). The attribute `model.metrics_names` will give you
-        the display labels for the scalar outputs.
-
-    Raises:
-      ValueError: In case of invalid user-provided arguments.
+      A list of metric result tensors.
     """
-    if self._distribution_strategy:
-      raise NotImplementedError('`train_on_batch` is not supported for models '
-                                'compiled with DistributionStrategy.')
-    # Validate and standardize user data.
-    x, y, sample_weights = self._standardize_user_data(
-        x, y, sample_weight=sample_weight, class_weight=class_weight)
+    metric_results = []
+    for metric_name, (metric_fn, stateful_fn) in metrics_dict.items():
+      with K.name_scope(metric_name):
 
-    if self.run_eagerly:
-      outputs = training_eager.train_on_batch(
-          self, x, y, sample_weights=sample_weights)
-    else:
-      if not isinstance(K.symbolic_learning_phase(), int):
-        ins = x + y + sample_weights + [True]
-      else:
-        ins = x + y + sample_weights
+        def _call_stateful_fn(fn):
+          """Create stateful metrics correctly."""
+          return self._call_metric_fn(fn, y_true, y_pred, weights, mask)
 
-      if reset_metrics:
-        self._make_train_function()
-        outputs = self.train_function(ins)  # pylint: disable=not-callable
-      else:
-        self._make_fit_function()
-        outputs = self._fit_function(ins)  # pylint: disable=not-callable
+        def _call_stateless_fn(fn):
+          weighted_metric_fn = training_utils.weighted_masked_objective(fn)
+          return weighted_metric_fn(y_true, y_pred, weights=weights, mask=mask)
 
-    if reset_metrics:
-      self.reset_metrics()
+        def _track_metric_tensors(name, stateless_result, stateful_result):
+          self._compile_metrics_tensors[name] = stateless_result
+          self._compile_stateful_metrics_tensors[name] = stateful_result
 
-    if len(outputs) == 1:
-      return outputs[0]
-    return outputs
+        if isinstance(metric_fn, metrics_module.Metric):
+          # If the given metric fn is stateful, call the fn and return result.
+          metric_result = _call_stateful_fn(metric_fn)
+          metric_results.append(metric_result)
+          if not self.run_eagerly:
+            _track_metric_tensors(metric_name, metric_result, metric_result)
+        elif self.run_eagerly:
+          # In eager mode, if the given metric fn is not stateful, we invoke the
+          # given fn or its stateful version based on the given flag.
+          if return_stateful_result:
+            metric_result = _call_stateful_fn(stateful_fn)
+          else:
+            metric_result = _call_stateless_fn(metric_fn)
+          metric_results.append(metric_result)
+        else:
+          # In graph mode, we build the sub-graph for both the stateful and the
+          # stateless fns.
+          stateful_metric_result = _call_stateful_fn(stateful_fn)
+          metric_result = _call_stateless_fn(metric_fn)
+          _track_metric_tensors(metric_name, metric_result,
+                                stateful_metric_result)
 
-  def test_on_batch(self, x, y=None, sample_weight=None, reset_metrics=True):
-    """Test the model on a single batch of samples.
+    return metric_results
+
+  def _handle_metrics(self,
+                      outputs,
+                      skip_target_indices=None,
+                      targets=None,
+                      sample_weights=None,
+                      masks=None,
+                      return_stateful_result=True):
+    """Handles calling metric functions.
 
     Arguments:
-        x: Input data. It could be:
-          - A Numpy array (or array-like), or a list of arrays
-            (in case the model has multiple inputs).
-          - A TensorFlow tensor, or a list of tensors
-            (in case the model has multiple inputs).
-          - A dict mapping input names to the corresponding array/tensors,
-            if the model has named inputs.
-          - A `tf.data` dataset or a dataset iterator.
-        y: Target data. Like the input data `x`,
-          it could be either Numpy array(s) or TensorFlow tensor(s).
-          It should be consistent with `x` (you cannot have Numpy inputs and
-          tensor targets, or inversely). If `x` is a dataset or a
-          dataset iterator, `y` should not be specified
-          (since targets will be obtained from the iterator).
-        sample_weight: Optional array of the same length as x, containing
-            weights to apply to the model's loss for each sample.
-            In the case of temporal data, you can pass a 2D array
-            with shape (samples, sequence_length),
-            to apply a different weight to every timestep of every sample.
-            In this case you should make sure to specify
-            sample_weight_mode="temporal" in compile(). This argument is not
-            supported when `x` is a dataset or a dataset iterator.
-        reset_metrics: If `True`, the metrics returned will be only for this
-          batch. If `False`, the metrics will be statefully accumulated across
-          batches.
+      outputs: List of outputs (predictions).
+      skip_target_indices: Optional. List of target ids to skip.
+      targets: List of targets.
+      sample_weights: Optional list of sample weight arrays.
+      masks: List of computed output mask values.
+      return_stateful_result: Boolean, indicates whether the stateful
+        (aggregated)/stateless metric result should be returned.
+
+    Returns:
+      A list of metric result tensors.
+    """
+    skip_target_indices = skip_target_indices or []
+    metric_results = []
+    with K.name_scope('metrics'):
+      # Invoke all metrics added using `compile`.
+      for i in range(len(outputs)):
+        if i in skip_target_indices:
+          continue
+        output = outputs[i] if outputs else None
+        target = targets[i] if targets else None
+        output_mask = masks[i] if masks else None
+        metric_results.extend(
+            self._handle_per_output_metrics(
+                self._per_output_metrics[i],
+                target,
+                output,
+                output_mask,
+                return_stateful_result=return_stateful_result))
+        metric_results.extend(
+            self._handle_per_output_metrics(
+                self._per_output_weighted_metrics[i],
+                target,
+                output,
+                output_mask,
+                weights=sample_weights[i],
+                return_stateful_result=return_stateful_result))
 
-    Returns:
-        Scalar test loss (if the model has a single output and no metrics)
-        or list of scalars (if the model has multiple outputs
-        and/or metrics). The attribute `model.metrics_names` will give you
-        the display labels for the scalar outputs.
+    # Add metric results from the `add_metric` metrics in eager mode.
+    if context.executing_eagerly():
+      for m in self.metrics:
+        if m not in self._compile_stateful_metric_functions:
+          metric_results.append(m.result())
+    return metric_results
 
-    Raises:
-        ValueError: In case of invalid user-provided arguments.
+  def _check_trainable_weights_consistency(self):
+    """Check trainable weights count consistency.
+
+    This will raise a warning if `trainable_weights` and
+    `_collected_trainable_weights` are inconsistent (i.e. have different
+    number of parameters).
+    Inconsistency will typically arise when one modifies `model.trainable`
+    without calling `model.compile` again.
     """
-    if self._distribution_strategy:
-      raise NotImplementedError('`test_on_batch` is not supported for models '
-                                'compiled with DistributionStrategy.')
-    # Validate and standardize user data.
-    x, y, sample_weights = self._standardize_user_data(
-        x, y, sample_weight=sample_weight)
+    if not hasattr(self, '_collected_trainable_weights'):
+      return
 
-    if self.run_eagerly:
-      outputs = training_eager.test_on_batch(
-          self, x, y, sample_weights=sample_weights)
-    else:
-      inputs = x + y + sample_weights
-      if reset_metrics:
-        self._make_test_function()
-        outputs = self.test_function(inputs)  # pylint: disable=not-callable
-      else:
-        self._make_eval_function()
-        outputs = self._eval_function(inputs)  # pylint: disable=not-callable
+    if len(self.trainable_weights) != len(self._collected_trainable_weights):
+      logging.log_first_n(
+          logging.WARN, 'Discrepancy between trainable weights and collected'
+          ' trainable weights, did you set `model.trainable`'
+          ' without calling `model.compile` after ?', 1)
 
-    if reset_metrics:
-      self.reset_metrics()
+  def _make_train_function_helper(self, fn_name, outputs):
+    if not self._is_compiled:
+      raise RuntimeError('You must compile your model before using it.')
+    self._check_trainable_weights_consistency()
+    if getattr(self, fn_name) is None:
+      inputs = (self._feed_inputs +
+                self._feed_targets +
+                self._feed_sample_weights)
+      if not isinstance(K.symbolic_learning_phase(), int):
+        inputs += [K.symbolic_learning_phase()]
 
-    if len(outputs) == 1:
-      return outputs[0]
-    return outputs
+      with K.get_graph().as_default():
+        with K.name_scope('training'):
+          with K.name_scope(self.optimizer.__class__.__name__):
+            # Training updates
+            updates = self.optimizer.get_updates(
+                params=self._collected_trainable_weights, loss=self.total_loss)
+      # Unconditional updates
+      updates += self.get_updates_for(None)
+      # Conditional updates relevant to this model
+      updates += self.get_updates_for(self.inputs)
 
-  def predict_on_batch(self, x):
-    """Returns predictions for a single batch of samples.
+      with K.name_scope('training'):
+        # Gets loss and metrics. Updates weights at each call.
+        fn = K.function(
+            inputs,
+            outputs,
+            updates=updates,
+            name='train_function',
+            **self._function_kwargs)
+        setattr(self, fn_name, fn)
 
-    Arguments:
-        x: Input data. It could be:
-          - A Numpy array (or array-like), or a list of arrays
-            (in case the model has multiple inputs).
-          - A TensorFlow tensor, or a list of tensors
-            (in case the model has multiple inputs).
-          - A `tf.data` dataset or a dataset iterator.
+  def _make_train_function(self):
+    metrics_tensors = [
+        self._all_metrics_tensors[m] for m in self.metrics_names[1:]
+    ]
+    self._make_train_function_helper('train_function',
+                                     [self.total_loss] + metrics_tensors)
 
-    Returns:
-        Numpy array(s) of predictions.
+  def _make_fit_function(self):
+    metrics_tensors = [
+        self._all_stateful_metrics_tensors[m] for m in self.metrics_names[1:]
+    ]
+    self._make_train_function_helper(
+        '_fit_function', [self.total_loss] + metrics_tensors)
 
-    Raises:
-        ValueError: In case of mismatch between given number of inputs and
-          expectations of the model.
-    """
-    if self._distribution_strategy:
-      raise NotImplementedError('`predict_on_batch` is not supported for '
-                                'models compiled with DistributionStrategy.')
-    # Validate and standardize user data.
-    inputs, _, _ = self._standardize_user_data(x)
-    if self.run_eagerly:
-      if (isinstance(inputs, iterator_ops.EagerIterator) or
-          (isinstance(inputs, dataset_ops.DatasetV2))):
-        inputs = training_utils.cast_if_floating_dtype(inputs)
-      elif isinstance(inputs, collections.Sequence):
-        inputs = [
-            ops.convert_to_tensor(val, dtype=K.floatx()) for val in inputs]
-      return self(inputs)  # pylint: disable=not-callable
+  def _make_test_function_helper(self, fn_name, outputs):
+    if not self._is_compiled:
+      raise RuntimeError('You must compile your model before using it.')
+    if getattr(self, fn_name) is None:
+      inputs = (self._feed_inputs +
+                self._feed_targets +
+                self._feed_sample_weights)
 
-    self._make_predict_function()
-    outputs = self.predict_function(inputs)
+      with K.name_scope('evaluation'):
+        updates = self.state_updates
+        # Return loss and metrics, no gradient updates.
+        # Does update the network states.
+        fn = K.function(
+            inputs,
+            outputs,
+            updates=updates,
+            name='test_function',
+            **self._function_kwargs)
+        setattr(self, fn_name, fn)
 
-    if len(outputs) == 1:
-      return outputs[0]
-    return outputs
+  def _make_test_function(self):
+    metrics_tensors = [
+        self._all_metrics_tensors[m] for m in self.metrics_names[1:]
+    ]
+    self._make_test_function_helper('test_function',
+                                    [self.total_loss] + metrics_tensors)
 
-  def fit_generator(self,
-                    generator,
-                    steps_per_epoch=None,
-                    epochs=1,
-                    verbose=1,
-                    callbacks=None,
-                    validation_data=None,
-                    validation_steps=None,
-                    class_weight=None,
-                    max_queue_size=10,
-                    workers=1,
-                    use_multiprocessing=False,
-                    shuffle=True,
-                    initial_epoch=0):
-    """Fits the model on data yielded batch-by-batch by a Python generator.
+  def _make_eval_function(self):
+    metrics_tensors = [
+        self._all_stateful_metrics_tensors[m] for m in self.metrics_names[1:]
+    ]
+    self._make_test_function_helper(
+        '_eval_function', [self.total_loss] + metrics_tensors)
 
-    The generator is run in parallel to the model, for efficiency.
-    For instance, this allows you to do real-time data augmentation
-    on images on CPU in parallel to training your model on GPU.
+  def _make_predict_function(self):
+    if not hasattr(self, 'predict_function'):
+      self.predict_function = None
+    if self.predict_function is None:
+      inputs = self._feed_inputs
+      # Gets network outputs. Does not update weights.
+      # Does update the network states.
+      kwargs = getattr(self, '_function_kwargs', {})
+      with K.name_scope(ModeKeys.PREDICT):
+        self.predict_function = K.function(
+            inputs,
+            self.outputs,
+            updates=self.state_updates,
+            name='predict_function',
+            **kwargs)
 
-    The use of `keras.utils.Sequence` guarantees the ordering
-    and guarantees the single use of every input per epoch when
-    using `use_multiprocessing=True`.
+  def _make_execution_function(self, mode):
+    if mode == ModeKeys.TRAIN:
+      self._make_fit_function()
+      return self._fit_function
+    if mode == ModeKeys.TEST:
+      self._make_eval_function()
+      return self._eval_function
+    if mode == ModeKeys.PREDICT:
+      self._make_predict_function()
+      return self.predict_function
 
-    Arguments:
-        generator: A generator or an instance of `Sequence`
-          (`keras.utils.Sequence`)
-            object in order to avoid duplicate data
-            when using multiprocessing.
-            The output of the generator must be either
-            - a tuple `(inputs, targets)`
-            - a tuple `(inputs, targets, sample_weights)`.
-            This tuple (a single output of the generator) makes a single batch.
-            Therefore, all arrays in this tuple must have the same length (equal
-            to the size of this batch). Different batches may have different
-              sizes.
-            For example, the last batch of the epoch is commonly smaller than
-              the
-            others, if the size of the dataset is not divisible by the batch
-              size.
-            The generator is expected to loop over its data
-            indefinitely. An epoch finishes when `steps_per_epoch`
-            batches have been seen by the model.
-        steps_per_epoch: Total number of steps (batches of samples)
-            to yield from `generator` before declaring one epoch
-            finished and starting the next epoch. It should typically
-            be equal to the number of samples of your dataset
-            divided by the batch size.
-            Optional for `Sequence`: if unspecified, will use
-            the `len(generator)` as a number of steps.
-        epochs: Integer, total number of iterations on the data.
-        verbose: Verbosity mode, 0, 1, or 2.
-        callbacks: List of callbacks to be called during training.
-        validation_data: This can be either
-            - a generator for the validation data
-            - a tuple (inputs, targets)
-            - a tuple (inputs, targets, sample_weights).
-        validation_steps: Only relevant if `validation_data`
-            is a generator. Total number of steps (batches of samples)
-            to yield from `generator` before stopping.
-            Optional for `Sequence`: if unspecified, will use
-            the `len(validation_data)` as a number of steps.
-        class_weight: Dictionary mapping class indices to a weight
-            for the class.
-        max_queue_size: Integer. Maximum size for the generator queue.
-            If unspecified, `max_queue_size` will default to 10.
-        workers: Integer. Maximum number of processes to spin up
-            when using process-based threading.
-            If unspecified, `workers` will default to 1. If 0, will
-            execute the generator on the main thread.
-        use_multiprocessing: Boolean.
-            If `True`, use process-based threading.
-            If unspecified, `use_multiprocessing` will default to `False`.
-            Note that because this implementation relies on multiprocessing,
-            you should not pass non-picklable arguments to the generator
-            as they can't be passed easily to children processes.
-        shuffle: Boolean. Whether to shuffle the order of the batches at
-            the beginning of each epoch. Only used with instances
-            of `Sequence` (`keras.utils.Sequence`).
-            Has no effect when `steps_per_epoch` is not `None`.
-        initial_epoch: Epoch at which to start training
-            (useful for resuming a previous training run)
+  def _distribution_standardize_user_data(self,
+                                          x,
+                                          y=None,
+                                          sample_weight=None,
+                                          class_weight=None,
+                                          batch_size=None,
+                                          validation_split=0,
+                                          shuffle=False,
+                                          repeat=True,
+                                          allow_partial_batch=False):
+    """Runs validation checks on input and target data passed by the user.
 
-    Returns:
-        A `History` object.
+    This is called when using DistributionStrategy to train, evaluate or serve
+    the model.
 
-    Example:
+    Args:
+      x: Input data. A numpy array or `tf.data` dataset.
+      y: Target data. A numpy array or None if x is a `tf.data` dataset.
+      sample_weight: An optional sample-weight array passed by the user to
+        weight the importance of each sample in `x`.
+      class_weight: An optional class-weight array by the user to
+        weight the importance of samples in `x` based on the class they belong
+        to, as conveyed by `y`.
+      batch_size: Integer batch size. If provided, it is used to run additional
+        validation checks on stateful models.
+      validation_split: Float between 0 and 1.
+        Fraction of the training data to be used as validation data.
+      shuffle: Boolean whether to shuffle the training data before each epoch.
+      repeat: Boolean whether to repeat the numpy training data when converting
+        to training dataset.
+      allow_partial_batch: Boolean whether to enforce that all batches have the
+        same size.
 
-    ```python
-        def generate_arrays_from_file(path):
-            while 1:
-                f = open(path)
-                for line in f:
-                    # create numpy arrays of input data
-                    # and labels, from each line in the file
-                    x1, x2, y = process_line(line)
-                    yield ({'input_1': x1, 'input_2': x2}, {'output': y})
-                f.close()
+    Returns:
+      Dataset instance.
 
-        model.fit_generator(generate_arrays_from_file('/my_file.txt'),
-                            steps_per_epoch=10000, epochs=10)
-    ```
     Raises:
-        ValueError: In case the generator yields data in an invalid format.
+      ValueError: In case of invalid user-provided data.
+      RuntimeError: If the model was never compiled.
     """
-    if self._distribution_strategy:
-      raise NotImplementedError('`fit_generator` is not supported for '
-                                'models compiled with DistributionStrategy.')
-    return training_generator.fit_generator(
-        self,
-        generator,
-        steps_per_epoch=steps_per_epoch,
-        epochs=epochs,
-        verbose=verbose,
-        callbacks=callbacks,
-        validation_data=validation_data,
-        validation_steps=validation_steps,
-        class_weight=class_weight,
-        max_queue_size=max_queue_size,
-        workers=workers,
-        use_multiprocessing=use_multiprocessing,
-        shuffle=shuffle,
-        initial_epoch=initial_epoch)
+    if class_weight:
+      raise NotImplementedError('`class_weight` is currently not supported '
+                                'when using DistributionStrategy.')
 
-  def evaluate_generator(self,
-                         generator,
-                         steps=None,
-                         max_queue_size=10,
-                         workers=1,
-                         use_multiprocessing=False,
-                         verbose=0):
-    """Evaluates the model on a data generator.
+    if (sample_weight is not None and sample_weight.all() and
+        distributed_training_utils.is_tpu_strategy(
+            self._distribution_strategy)):
+      raise NotImplementedError('`sample_weight` is currently not supported '
+                                'when using TPUStrategy.')
 
-    The generator should return the same kind of data
-    as accepted by `test_on_batch`.
+    if (self.stateful and distributed_training_utils.is_tpu_strategy(
+        self._distribution_strategy) and self._distribution_strategy.
+        num_replicas_in_sync != 1):
+      raise ValueError('Single core must be used for computation on '
+                       'stateful models. Consider adding `device_assignment` '
+                       'parameter to TPUStrategy using\n'
+                       'topology = tf.contrib.distribute.'
+                       'initialize_tpu_system()\n'
+                       'device_assignment = tf.contrib.tpu.DeviceAssignment('
+                       'topology, core_assignment=tf.contrib.tpu.'
+                       'SINGLE_CORE_ASSIGNMENT)\n'
+                       'tpu_strategy = tf.contrib.distribute.TPUStrategy('
+                       'device_assignment=device_assignment)')
+
+    # Validates `steps` and `shuffle` arguments right at the beginning
+    # since we use it to construct the dataset object.
+    # TODO(anjalisridhar): Remove this check once we refactor the
+    # _standardize_user_data code path. This check is already present elsewhere
+    # in the codebase.
+    if isinstance(x, dataset_ops.DatasetV2):
+      if shuffle:
+        training_utils.verify_dataset_shuffled(x)
+
+    strategy = self._distribution_strategy
+    with strategy.scope():
+      # We should be sure to call get_session() inside the strategy.scope()
+      # so the strategy can affect the session options.
+      if ops.executing_eagerly_outside_functions():
+        session = None
+      else:
+        session = K.get_session()
 
-    Arguments:
-        generator: Generator yielding tuples (inputs, targets)
-            or (inputs, targets, sample_weights)
-            or an instance of `keras.utils.Sequence`
-            object in order to avoid duplicate data
-            when using multiprocessing.
-        steps: Total number of steps (batches of samples)
-            to yield from `generator` before stopping.
-            Optional for `Sequence`: if unspecified, will use
-            the `len(generator)` as a number of steps.
-        max_queue_size: maximum size for the generator queue
-        workers: Integer. Maximum number of processes to spin up
-            when using process-based threading.
-            If unspecified, `workers` will default to 1. If 0, will
-            execute the generator on the main thread.
-        use_multiprocessing: Boolean.
-            If `True`, use process-based threading.
-            If unspecified, `use_multiprocessing` will default to `False`.
-            Note that because this implementation relies on multiprocessing,
-            you should not pass non-picklable arguments to the generator
-            as they can't be passed easily to children processes.
-        verbose: Verbosity mode, 0 or 1.
+      first_x_value = nest.flatten(x)[0]
+      if isinstance(first_x_value, np.ndarray):
+        x = distributed_training_utils.list_to_tuple(x)
+        if y is not None:
+          y = distributed_training_utils.list_to_tuple(y)
+          if sample_weight is not None:
+            sample_weight = distributed_training_utils.list_to_tuple(
+                sample_weight)
+            in_tuple = (x, y, sample_weight)
+          else:
+            in_tuple = (x, y)
+        else:
+          in_tuple = x
 
-    Returns:
-        Scalar test loss (if the model has a single output and no metrics)
-        or list of scalars (if the model has multiple outputs
-        and/or metrics). The attribute `model.metrics_names` will give you
-        the display labels for the scalar outputs.
+        ds = strategy.extended.experimental_make_numpy_dataset(in_tuple,
+                                                               session=session)
+        if shuffle:
+          # We want a buffer size that is larger than the batch size provided by
+          # the user and provides sufficient randomness. Note that larger
+          # numbers introduce more memory usage based on the size of each
+          # sample.
+          ds = ds.shuffle(max(1024, batch_size * 8))
+        if repeat:
+          ds = ds.repeat()
+
+        # We need to use the drop_remainder argument to get a known static
+        # input shape which is required for TPUs.
+        drop_remainder = (not allow_partial_batch and
+                          strategy.extended.experimental_require_static_shapes)
+        x = ds.batch(batch_size, drop_remainder=drop_remainder)
+      else:
+        assert isinstance(x, dataset_ops.DatasetV2)
+        training_utils.validate_dataset_input(x, y, sample_weight,
+                                              validation_split)
+    return x
 
-    Raises:
-        ValueError: in case of invalid arguments.
+  def _standardize_user_data(self,
+                             x,
+                             y=None,
+                             sample_weight=None,
+                             class_weight=None,
+                             batch_size=None,
+                             check_steps=False,
+                             steps_name='steps',
+                             steps=None,
+                             validation_split=0,
+                             shuffle=False,
+                             extract_tensors_from_dataset=False):
+    """Runs validation checks on input and target data passed by the user.
+
+    Also standardizes the data to lists of arrays, in order.
+
+    Also builds and compiles the model on the fly if it is a subclassed model
+    that has never been called before (and thus has no inputs/outputs).
+
+    This is a purely internal method, subject to refactoring at any time.
+
+    Args:
+      x: Input data. It could be:
+        - A Numpy array (or array-like), or a list of arrays
+          (in case the model has multiple inputs).
+        - A TensorFlow tensor, or a list of tensors
+          (in case the model has multiple inputs).
+        - A dict mapping input names to the corresponding array/tensors,
+          if the model has named inputs.
+        - A `tf.data` dataset or a dataset iterator.
+      y: Target data. Like the input data `x`,
+        it could be either Numpy array(s) or TensorFlow tensor(s).
+        It should be consistent with `x` (you cannot have Numpy inputs and
+        tensor targets, or inversely). If `x` is a dataset or a
+        dataset iterator, `y` should not be specified
+        (since targets will be obtained from the iterator).
+      sample_weight: An optional sample-weight array passed by the user to
+        weight the importance of each sample in `x`.
+      class_weight: An optional class-weight array by the user to
+        weight the importance of samples in `x` based on the class they belong
+        to, as conveyed by `y`. If both `sample_weight` and `class_weight` are
+        provided, the weights are multiplied.
+      batch_size: Integer batch size. If provided, it is used to run additional
+        validation checks on stateful models.
+      check_steps: boolean, True if we want to check for validity of `steps` and
+        False, otherwise. For example, when we are standardizing one batch of
+        data for train_on_batch/predict_on_batch/test_on_batch APIs, `steps`
+        value is not required and we should not check for its validity in these
+        cases.
+      steps_name: The public API's parameter name for `steps`.
+      steps: Integer or `None`. Total number of steps (batches of samples) to
+        execute.
+      validation_split: Float between 0 and 1.
+        Fraction of the training data to be used as validation data.
+      shuffle: Boolean whether to shuffle the training data before each epoch.
+      extract_tensors_from_dataset: Boolean. When `x` is a dataset instance,
+        this indicates whether to extract actual tensors from the dataset or
+        instead output the dataset instance itself.
+        Set to True when calling from `train_on_batch`/etc.
+
+    Returns:
+      A tuple of 3: inputs (arrays or dicts, depending on whether `x` was a dict
+      or not), target arrays, sample-weight arrays.
+      If the model's input and targets are symbolic, these lists are empty
+      (since the model takes no user-provided data, instead the data comes
+      from the symbolic inputs/targets).
 
     Raises:
-        ValueError: In case the generator yields data in an invalid format.
+      ValueError: In case of invalid user-provided data.
+      RuntimeError: If the model was never compiled.
     """
-    if self._distribution_strategy:
-      raise NotImplementedError('`evaluate_generator` is not supported for '
-                                'models compiled with DistributionStrategy.')
-    return training_generator.evaluate_generator(
-        self,
-        generator,
-        steps=steps,
-        max_queue_size=max_queue_size,
-        workers=workers,
-        use_multiprocessing=use_multiprocessing,
-        verbose=verbose)
+    if isinstance(x, (dataset_ops.DatasetV1, dataset_ops.DatasetV2)):
+      # Graph mode dataset. We'll pass the dataset as-is (unless
+      # `extract_tensors_from_dataset` is True, in which case we extract
+      # the tensors from the dataset and we output them.
+      training_utils.validate_dataset_input(x, y, sample_weight,
+                                            validation_split)
+      if shuffle:
+        training_utils.verify_dataset_shuffled(x)
+
+      is_dataset = True
+      if extract_tensors_from_dataset:
+        # We do this for `train_on_batch`/etc.
+        x, y, sample_weight = training_utils.extract_tensors_from_dataset(x)
+    elif isinstance(x, iterator_ops.Iterator):
+      # Graph mode iterator. We extract the symbolic tensors.
+      training_utils.validate_dataset_input(x, y, sample_weight,
+                                            validation_split)
+      iterator = x
+      x, y, sample_weight = training_utils.unpack_iterator_input(iterator)
+      is_dataset = True
+    else:
+      is_dataset = False
 
-  def predict_generator(self,
-                        generator,
-                        steps=None,
-                        max_queue_size=10,
-                        workers=1,
-                        use_multiprocessing=False,
-                        verbose=0):
-    """Generates predictions for the input samples from a data generator.
+    # Validates `steps` argument based on x's type.
+    if check_steps:
+      training_utils.check_steps_argument(x, steps, steps_name)
+
+    # First, we build/compile the model on the fly if necessary.
+    all_inputs = []
+    is_build_called = False
+    is_compile_called = False
+    # Whether this is a subclassed model that expects dictionary inputs
+    # rather than list inputs (e.g. FeatureColumn-based models).
+    dict_inputs = False
+    if not self.inputs:
+      # We need to use `x_input` to set the model inputs.
+
+      # If input data is a dataset iterator in graph mode or if it is an eager
+      # iterator and only one batch of samples is required, we fetch the data
+      # tensors from the iterator and then standardize them.
+      if isinstance(x, (dataset_ops.DatasetV1, dataset_ops.DatasetV2)):
+        x_input, y_input, _ = training_utils.extract_tensors_from_dataset(x)
+      else:
+        x_input = x
+        y_input = y
+      # We type-check that `x_input` and `y_input` are either single arrays
+      # or lists of arrays.
+      if isinstance(x_input, (list, tuple)):
+        if not all(isinstance(v, np.ndarray) or
+                   tensor_util.is_tensor(v) for v in x_input):
+          raise ValueError('Please provide as model inputs either a single '
+                           'array or a list of arrays. You passed: x=' + str(x))
+        all_inputs += list(x_input)
+      elif isinstance(x_input, dict):
+        dict_inputs = True
+        keys = sorted(x_input.keys())
+        all_inputs = [x_input[k] for k in keys]
+      else:
+        if (not isinstance(x_input, np.ndarray) and
+            not tensor_util.is_tensor(x_input)):
+          raise ValueError('Please provide as model inputs either a single '
+                           'array or a list of arrays. You passed: x=' + str(x))
+        all_inputs.append(x_input)
+
+      # Build the model using the retrieved inputs (value or symbolic).
+      # If values or generated from a dataset, then in symbolic-mode
+      # placeholders will be created to match the value shapes.
+      is_build_called = True
+      if is_dataset:
+        cast_inputs = nest.map_structure(lambda v: v.shape, x_input)
+      elif training_utils.has_tensors(x_input):
+        cast_inputs = training_utils.cast_if_floating_dtype(x_input)
+      else:
+        cast_inputs = x_input
+      self._set_inputs(cast_inputs)
+    else:
+      y_input = y
+      dict_inputs = isinstance(self.inputs, dict)
+
+    if y_input is not None:
+      if not self.optimizer:
+        raise RuntimeError('You must compile a model before '
+                           'training/testing. '
+                           'Use `model.compile(optimizer, loss)`.')
+      if not self._is_compiled:
+        # On-the-fly compilation of the model.
+        # We need to use `y` to set the model targets.
+        if training_utils.has_tensors(y_input):
+          y_input = training_utils.cast_if_floating_dtype(y_input)
+        if isinstance(y_input, (list, tuple)):
+          if not all(isinstance(v, np.ndarray) or
+                     tensor_util.is_tensor(v) for v in y_input):
+            raise ValueError('Please provide as model targets either a single '
+                             'array or a list of arrays. '
+                             'You passed: y=' + str(y))
+          all_inputs += list(y_input)
+        elif isinstance(y_input, dict):
+          raise ValueError('You cannot pass a dictionary as model targets.')
+        else:
+          if (not isinstance(y_input, np.ndarray) and
+              not tensor_util.is_tensor(y_input)):
+            raise ValueError('Please provide as model targets either a single '
+                             'array or a list of arrays. '
+                             'You passed: y=' + str(y))
+          all_inputs.append(y_input)
+
+        # Typecheck that all inputs are *either* value *or* symbolic.
+        # TODO(fchollet): this check could be removed in Eager mode?
+        if any(tensor_util.is_tensor(v) for v in all_inputs):
+          if not all(tensor_util.is_tensor(v) for v in all_inputs):
+            raise ValueError('Do not pass inputs that mix Numpy arrays and '
+                             'TensorFlow tensors. '
+                             'You passed: x=' + str(x) + '; y=' + str(y))
+
+        if is_dataset or context.executing_eagerly():
+          target_tensors = None
+        else:
+          # Handle target tensors if any passed.
+          if not isinstance(y_input, (list, tuple)):
+            y_input = [y_input]
+          target_tensors = [v for v in y_input if _is_symbolic_tensor(v)]
+        is_compile_called = True
+        self.compile(
+            optimizer=self.optimizer,
+            loss=self.loss,
+            metrics=self._compile_metrics,
+            weighted_metrics=self._compile_weighted_metrics,
+            loss_weights=self.loss_weights,
+            target_tensors=target_tensors,
+            run_eagerly=self.run_eagerly)
 
-    The generator should return the same kind of data as accepted by
-    `predict_on_batch`.
+    # In graph mode, if we had just set inputs and targets as symbolic tensors
+    # by invoking build and compile on the model respectively, we do not have to
+    # feed anything to the model. Model already has input and target data as
+    # part of the graph.
+    # Note: in this case, `any` and `all` are equivalent since we disallow
+    # mixed symbolic/value inputs.
+    if (not self.run_eagerly and is_build_called and is_compile_called and
+        not is_dataset  and any(_is_symbolic_tensor(v) for v in all_inputs)):
+      return [], [], []
 
-    Arguments:
-        generator: Generator yielding batches of input samples
-            or an instance of `keras.utils.Sequence` object in order to
-            avoid duplicate data when using multiprocessing.
-        steps: Total number of steps (batches of samples)
-            to yield from `generator` before stopping.
-            Optional for `Sequence`: if unspecified, will use
-            the `len(generator)` as a number of steps.
-        max_queue_size: Maximum size for the generator queue.
-        workers: Integer. Maximum number of processes to spin up
-            when using process-based threading.
-            If unspecified, `workers` will default to 1. If 0, will
-            execute the generator on the main thread.
-        use_multiprocessing: Boolean.
-            If `True`, use process-based threading.
-            If unspecified, `use_multiprocessing` will default to `False`.
-            Note that because this implementation relies on multiprocessing,
-            you should not pass non-picklable arguments to the generator
-            as they can't be passed easily to children processes.
-        verbose: verbosity mode, 0 or 1.
+    # What follows is input validation and standardization to list format,
+    # in the case where all inputs are value arrays.
 
-    Returns:
-        Numpy array(s) of predictions.
+    if self.run_eagerly:
+      # In eager mode, do not do shape validation
+      # since the network has no input nodes (placeholders) to be fed.
+      feed_input_names = self.input_names
+      feed_input_shapes = None
+    elif not self._is_graph_network:
+      # Case: symbolic-mode subclassed network. Do not do shape validation.
+      feed_input_names = self._feed_input_names
+      feed_input_shapes = None
+    else:
+      # Case: symbolic-mode graph network.
+      # In this case, we run extensive shape validation checks.
+      feed_input_names = self._feed_input_names
+      feed_input_shapes = self._feed_input_shapes
 
-    Raises:
-        ValueError: In case the generator yields data in an invalid format.
-    """
-    if self._distribution_strategy:
-      raise NotImplementedError('`predict_generator` is not supported for '
-                                'models compiled with DistributionStrategy.')
-    return training_generator.predict_generator(
-        self,
-        generator,
-        steps=steps,
-        max_queue_size=max_queue_size,
-        workers=workers,
-        use_multiprocessing=use_multiprocessing,
-        verbose=verbose)
+    # Standardize the inputs.
+    if not isinstance(x, (dataset_ops.DatasetV1, dataset_ops.DatasetV2)):
+      # TODO(fchollet): run static checks with dataset output shape(s).
+      x = training_utils.standardize_input_data(
+          x,
+          feed_input_names,
+          feed_input_shapes,
+          check_batch_axis=False,  # Don't enforce the batch size.
+          exception_prefix='input')
 
-  def _get_callback_model(self):
-    """Returns the Callback Model for this Model."""
+    if y is not None:
+      if not self._is_graph_network:
+        feed_output_names = self._feed_output_names
+        feed_output_shapes = None
+        # Sample weighting not supported in this case.
+        # TODO(fchollet): consider supporting it.
+        feed_sample_weight_modes = [None for _ in self.outputs]
+      else:
+        feed_output_names = self._feed_output_names
+        feed_sample_weight_modes = self._feed_sample_weight_modes
+        feed_output_shapes = []
+        for output_shape, loss_fn in zip(self._feed_output_shapes,
+                                         self._feed_loss_fns):
+          if ((isinstance(loss_fn, losses.LossFunctionWrapper) and
+               loss_fn.fn == losses.sparse_categorical_crossentropy)) or (
+                   isinstance(loss_fn, losses.SparseCategoricalCrossentropy)):
+            if K.image_data_format() == 'channels_first':
+              feed_output_shapes.append(
+                  (output_shape[0], 1) + output_shape[2:])
+            else:
+              feed_output_shapes.append(output_shape[:-1] + (1,))
+          elif (not isinstance(loss_fn, losses.Loss) or
+                (isinstance(loss_fn, losses.LossFunctionWrapper) and
+                 (getattr(losses, loss_fn.fn.__name__, None) is None))):
+            # If the given loss is not an instance of the `Loss` class (custom
+            # class) or if the loss function that is wrapped is not in the
+            # `losses` module, then it is a user-defined loss and we make no
+            # assumptions about it.
+            feed_output_shapes.append(None)
+          else:
+            feed_output_shapes.append(output_shape)
 
-    if hasattr(self, '_replicated_model') and self._replicated_model:
-      # When using training_distributed, we set the callback model
-      # to an instance of the `DistributedModel` that we create in
-      # the `compile` call. The `DistributedModel` is initialized
-      # with the first replicated model. We need to set the callback
-      # model to a DistributedModel to allow us to override saving
-      # and loading weights when we checkpoint the model during training.
-      return self._replicated_model
-    if hasattr(self, 'callback_model') and self.callback_model:
-      return self.callback_model
-    return self
+      # Standardize the outputs.
+      y = training_utils.standardize_input_data(
+          y,
+          feed_output_names,
+          # Don't enforce target shapes to match output shapes.
+          # Precise checks will be run in `check_loss_and_target_compatibility`.
+          shapes=None,
+          check_batch_axis=False,  # Don't enforce the batch size.
+          exception_prefix='target')
 
-  def _make_callback_model(self, grouped_model):
-    first_replicated_model = self._distribution_strategy.unwrap(
-        grouped_model)[0]
-    # We initialize the callback model with the first replicated model.
-    self._replicated_model = DistributedCallbackModel(first_replicated_model)
-    self._replicated_model.set_original_model(self)
+      # Generate sample-wise weight values given the `sample_weight` and
+      # `class_weight` arguments.
+      sample_weights = training_utils.standardize_sample_weights(
+          sample_weight, feed_output_names)
+      class_weights = training_utils.standardize_class_weights(
+          class_weight, feed_output_names)
+      sample_weights = [
+          training_utils.standardize_weights(ref, sw, cw, mode)
+          for (ref, sw, cw, mode) in zip(y, sample_weights, class_weights,
+                                         feed_sample_weight_modes)
+      ]
+      # Check that all arrays have the same length.
+      if not self._distribution_strategy:
+        training_utils.check_array_lengths(x, y, sample_weights)
+        if self._is_graph_network and not self.run_eagerly:
+          # Additional checks to avoid users mistakenly using improper loss fns.
+          training_utils.check_loss_and_target_compatibility(
+              y, self._feed_loss_fns, feed_output_shapes)
+    else:
+      y = []
+      sample_weights = []
 
-  def _validate_or_infer_batch_size(self, batch_size, steps, x):
-    """Validates that the `batch_size` provided is consistent with InputLayer.
+    if self.stateful and batch_size:
+      # Check that for stateful networks, number of samples is a multiple
+      # of the static batch size.
+      if x[0].shape[0] % batch_size != 0:
+        raise ValueError('In a stateful network, '
+                         'you should only pass inputs with '
+                         'a number of samples that can be '
+                         'divided by the batch size. Found: ' +
+                         str(x[0].shape[0]) + ' samples')
 
-    It's possible that the user specified a static batch size in their
-    InputLayer. If so, this method checks the provided `batch_size` and `x`
-    arguments are consistent with this static batch size. Also, if
-    `batch_size` is `None`, this method will attempt to infer the batch size
-    from the static batch size of the InputLayer.
+    # If dictionary inputs were provided, we return a dictionary as well.
+    if dict_inputs and not isinstance(x, (dataset_ops.DatasetV1,
+                                          dataset_ops.DatasetV2)):
+      x = dict(zip(feed_input_names, x))
+    return x, y, sample_weights
 
-    Arguments:
-      batch_size: The batch_size provided as an argument to
-        fit/evaluate/predict.
-      steps: The steps provided as an argument to fit/evaluate/predict.
-      x: The data passed as `x` to fit/evaluate/predict.
+  def _unpack_validation_data(self, validation_data):
+    if (isinstance(validation_data, (iterator_ops.Iterator,
+                                     iterator_ops.EagerIterator,
+                                     dataset_ops.DatasetV2))):
+      val_x = validation_data
+      val_y = None
+      val_sample_weight = None
+    elif len(validation_data) == 2:
+      val_x, val_y = validation_data  # pylint: disable=unpacking-non-sequence
+      val_sample_weight = None
+    elif len(validation_data) == 3:
+      val_x, val_y, val_sample_weight = validation_data  # pylint: disable=unpacking-non-sequence
+    else:
+      raise ValueError(
+          'When passing a `validation_data` argument, '
+          'it must contain either 2 items (x_val, y_val), '
+          'or 3 items (x_val, y_val, val_sample_weights), '
+          'or alternatively it could be a dataset or a '
+          'dataset or a dataset iterator. '
+          'However we received `validation_data=%s`' % validation_data)
+    return val_x, val_y, val_sample_weight
+
+  @trackable.no_automatic_dependency_tracking
+  def _set_inputs(self, inputs, outputs=None, training=None):
+    """Set model's input and output specs based on the input data received.
 
-    Returns:
-      The validated batch_size, auto-inferred from the first layer if not
-      provided.
+    This is to be used for Model subclasses, which do not know at instantiation
+    time what their inputs look like.
+
+    Args:
+      inputs: Single array, or list of arrays. The arrays could be placeholders,
+        Numpy arrays, data tensors, or TensorShapes.
+        - if placeholders: the model is built on top of these placeholders,
+          and we expect Numpy data to be fed for them when calling `fit`/etc.
+        - if Numpy data or TensorShapes: we create placeholders matching the
+          TensorShapes or shapes of the Numpy arrays. We expect Numpy data to be
+          fed for these placeholders when calling `fit`/etc.
+        - if data tensors: the model is built on top of these tensors.
+          We do not expect any Numpy data to be provided when calling `fit`/etc.
+      outputs: None, a data tensor, or a list of tensors. If None, the
+        outputs will be determined by invoking `self.call()`, otherwise the
+        provided value will be used.
+      training: Boolean or None. Only relevant in symbolic mode. Specifies
+        whether to build the model's graph in inference mode (False), training
+        mode (True), or using the Keras learning phase (None).
+    Raises:
+      ValueError: If dict inputs are passed to a Sequential Model where the
+        first layer isn't FeatureLayer.
     """
-    layers = super(Model, self).layers  # Avoids the override in Sequential.
-    if layers:
-      first_layer = layers[0]
-      static_batch_size = training_utils.get_static_batch_size(first_layer)
-      if static_batch_size is not None:
+    if self.inputs:
+      raise ValueError('Model inputs are already set.')
 
-        # Check `batch_size` argument is consistent with InputLayer.
-        if batch_size is not None and batch_size != static_batch_size:
-          raise ValueError('The `batch_size` argument value {} is incompatible '
-                           'with the specified batch size of your Input Layer: '
-                           '{}'.format(batch_size, static_batch_size))
+    if self.__class__.__name__ == 'Sequential' and not self.built:
+      if tensor_util.is_tensor(inputs):
+        input_shape = (None,) + tuple(inputs.shape.as_list()[1:])
+      elif isinstance(inputs, tensor_shape.TensorShape):
+        input_shape = (None,) + tuple(inputs.as_list()[1:])
+      elif isinstance(inputs, dict):
+        # We assert that the first layer is a FeatureLayer.
+        if not training_utils.is_feature_layer(self.layers[0]):
+          raise ValueError('Passing a dictionary input to a Sequential Model '
+                           'which doesn\'t have FeatureLayer as the first layer'
+                           ' is an error.')
+        input_shape = (None,)
+      else:
+        input_shape = (None,) + tuple(inputs.shape[1:])
+      self._build_input_shape = input_shape
 
-        # Check Dataset/Iterator batch size is consistent with InputLayer.
-        if isinstance(x, (dataset_ops.DatasetV2, iterator_ops.Iterator,
-                          iterator_ops.EagerIterator)):
-          ds_batch_size = tensor_shape.as_dimension(
-              nest.flatten(x.output_shapes)[0][0]).value
-          if ds_batch_size is not None and ds_batch_size != static_batch_size:
-            raise ValueError('The batch output shape of your `Dataset` is {}, '
-                             'which is incompatible with the specified batch '
-                             'size of your Input Layer: {}'.format(
-                                 ds_batch_size, static_batch_size))
+    # On-the-fly setting of symbolic model inputs (either by using the tensor
+    # provided, or by creating a placeholder if Numpy data was provided).
+    model_inputs = training_utils.ModelInputs(inputs)
+    inputs = model_inputs.get_symbolic_inputs()
+    self.inputs = model_inputs.get_symbolic_inputs(return_single_as_list=True)
+    self.input_names = model_inputs.get_input_names()
 
-        # Set inferred batch size from the InputLayer.
-        if steps is None:
-          batch_size = static_batch_size
+    self._feed_inputs = []
+    self._feed_input_names = []
+    self._feed_input_shapes = []
 
-    if batch_size is None and steps is None:
-      # Backwards compatibility
-      batch_size = 32
-    return batch_size
+    for k, v in model_inputs.as_dict():
+      if K.is_placeholder(v):
+        self._feed_input_names.append(k)
+        self._feed_inputs.append(v)
+        self._feed_input_shapes.append(K.int_shape(v))
+
+    # TODO(fchollet): consider calling `_maybe_build` before calling the model.
+    if outputs is None:
+      if not self._dynamic:
+        # The network may include dynamic layers but its `call`
+        # itself isn't dynamic.
+        # Obtain symbolic outputs by calling the model.
+        with K.get_graph().as_default():
+          if self._expects_training_arg:
+            outputs = self.call(inputs, training=training)
+          else:
+            outputs = self.call(inputs)
+      else:
+        # Case: network's `call` is dynamic.
+        try:
+          outputs = self._symbolic_call(inputs)
+        except NotImplementedError:
+          # Static shape inference was not implemented for this dynamic net.
+          # Do not specify symbolic outputs.
+          outputs = None
+
+    outputs = nest.flatten(outputs)
+    self.outputs = outputs
+    self.output_names = training_utils.generic_output_names(outputs)
+    self.built = True
 
 
 class DistributedCallbackModel(Model):
diff --git a/tensorflow/python/keras/engine/training_arrays.py b/tensorflow/python/keras/engine/training_arrays.py
index 196d48faec23acd42bca33414b4862a5084d18f5..b6fe5f68e57dbbe2ec789f62011b6d117526a453 100644
--- a/tensorflow/python/keras/engine/training_arrays.py
+++ b/tensorflow/python/keras/engine/training_arrays.py
@@ -23,14 +23,17 @@ import functools
 
 import numpy as np
 
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.eager import context
 from tensorflow.python.framework import errors
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import callbacks as cbks
-from tensorflow.python.keras.engine import training_distributed
+from tensorflow.python.keras.engine import distributed_training_utils
 from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.utils.generic_utils import make_batches
 from tensorflow.python.keras.utils.generic_utils import slice_arrays
+from tensorflow.python.keras.utils.mode_keys import ModeKeys
 from tensorflow.python.platform import tf_logging as logging
 
 try:
@@ -39,89 +42,6 @@ except ImportError:
   issparse = None
 
 
-def _get_model_feed(model, mode):
-  if mode == 'predict':
-    feed = model._feed_inputs
-  else:
-    feed = (
-        model._feed_inputs + model._feed_targets + model._feed_sample_weights)
-  return feed
-
-
-def _validate_arguments(steps_per_epoch, validation_steps, kwargs):
-  for k in kwargs:
-    if k != 'steps':
-      raise ValueError('Invalid argument passed: {}'.format(k))
-
-  # Validate inputs when in training mode.
-  if validation_steps and steps_per_epoch is None:
-    raise ValueError('Can only use `validation_steps` '
-                     'when doing step-wise '
-                     'training, i.e. `steps_per_epoch` '
-                     'must be set.')
-
-
-def _print_train_info(inputs, val_inputs, steps_per_epoch, verbose):
-  if (val_inputs and steps_per_epoch is None and verbose and inputs and
-      hasattr(inputs[0], 'shape') and hasattr(val_inputs[0], 'shape')):
-    print('Train on %d samples, validate on %d samples' %
-          (inputs[0].shape[0], val_inputs[0].shape[0]))
-
-
-def _get_num_samples_or_steps(ins, batch_size, steps_per_epoch):
-  """Returns total number of samples (when training in batch mode) or steps."""
-  if steps_per_epoch:
-    return steps_per_epoch
-  return training_utils.check_num_samples(ins, batch_size, steps_per_epoch,
-                                          'steps_per_epoch')
-
-
-def _prepare_feed_values(model, inputs, targets, sample_weights, mode):
-  """Prepare feed values to the model execution function.
-
-  Arguments:
-    model: Model to prepare feed values for.
-    inputs: List or dict of model inputs.
-    targets: Optional list of model targets.
-    sample_weights: Optional list of sample weight arrays.
-    mode: One of 'train'/'test'/'predict'.
-
-  Returns:
-    Feed values for the model in the given mode.
-  """
-  if model._distribution_strategy:
-    def get_distributed_inputs():
-      return training_distributed._prepare_feed_values(
-          model, inputs, targets, sample_weights, mode)
-
-    # In the eager case, we want to call the input method per step, so return
-    # a lambda from here that can be called. Note that this is applicable only
-    # in Distribution Strategy case as it follows the same code path for both
-    # eager and graph modes.
-    # TODO(priyag,omalleyt): Either we should move the training DS with
-    # EagerIterator to use training_generator code path, or figure out how to
-    # set a symbolic Iterator out of a Dataset when in eager mode.
-    if context.executing_eagerly():
-      return get_distributed_inputs
-    else:
-      return get_distributed_inputs()
-
-  inputs = training_utils.ModelInputs(inputs).as_list()
-  targets = targets or []
-  sample_weights = sample_weights or []
-  ins = inputs + targets + sample_weights
-  if mode == 'train' and not isinstance(K.symbolic_learning_phase(), int):
-    ins += [True]
-  return ins
-
-
-def _make_execution_function(model, mode):
-  """Makes function to run one step of model execution."""
-  if model._distribution_strategy:
-    return training_distributed._make_execution_function(model, mode)
-  return model._make_execution_function(mode)
-
-
 def model_iteration(model,
                     inputs,
                     targets=None,
@@ -137,22 +57,25 @@ def model_iteration(model,
                     initial_epoch=0,
                     steps_per_epoch=None,
                     validation_steps=None,
-                    mode='train',
+                    validation_freq=1,
+                    mode=ModeKeys.TRAIN,
                     validation_in_fit=False,
+                    prepared_feed_values_from_dataset=False,
+                    steps_name='steps',
                     **kwargs):
-  """Loop function for arrays of data with modes 'train'/'test'/'predict'.
+  """Loop function for arrays of data with modes TRAIN/TEST/PREDICT.
 
   Arguments:
       model: Keras Model instance.
-      inputs: Either a list of arrays or a dictionary.
-      targets: List of target arrays.
+      inputs: Either a list or dictionary of arrays, or a dataset instance.
+      targets: List/dictionary of input arrays.
       sample_weights: Optional list of sample weight arrays.
       batch_size: Integer batch size or None if unknown.
       epochs: Number of times to iterate over the data
       verbose: Verbosity mode, 0, 1 or 2
       callbacks: List of callbacks to be called during training
-      val_inputs: List of input arrays.
-      val_targets: List of target arrays.
+      val_inputs: Either a list or dictionary of arrays, or a dataset instance.
+      val_targets: List/dictionary of target arrays.
       val_sample_weights: Optional list of sample weight arrays.
       shuffle: Whether to shuffle the data at the beginning of each epoch
         concatenation of list the display names of the outputs of `f` and the
@@ -164,42 +87,108 @@ def model_iteration(model,
         the default value of `None`.
       validation_steps: Number of steps to run validation for (only if doing
         validation from data tensors). Ignored with the default value of `None`.
-      mode: One of 'train'/'test'/'predict'.
+      validation_freq: Only relevant if validation data is provided. Integer or
+        `collections.Container` instance (e.g. list, tuple, etc.). If an
+        integer, specifies how many training epochs to run before a new
+        validation run is performed, e.g. `validation_freq=2` runs
+        validation every 2 epochs. If a Container, specifies the epochs on
+        which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
+        validation at the end of the 1st, 2nd, and 10th epochs.
+      mode: One of ModeKeys.TRAIN/ModeKeys.TEST/ModeKeys.PREDICT.
       validation_in_fit: if true, then this method is invoked from within
-        training iteration (for validation). In this case, do not copy weights
-        when using a tf.distribute.Strategy.
+        training iteration (for validation). In the case where `val_inputs` is a
+        dataset, this flag indicates that its iterator and feed values are
+        already created so should properly reuse resources.
+      prepared_feed_values_from_dataset: if True, `inputs` is a list of feed
+        tensors returned from `_prepare_feed_values` call on the validation
+        dataset, so do not call it again on `inputs`. Should only be used for
+        inline validation (i.e., only if `validation_in_fit` is also True).
+      steps_name: The string name of the steps argument, either `steps`,
+        `validation_steps`, or `steps_per_epoch`. Only used for error message
+        formatting.
       **kwargs: Additional arguments for backwards compatibility.
 
   Returns:
-      - In 'train' mode: `History` object.
-      - In 'test' mode: Evaluation metrics.
-      - In 'predict' mode: Outputs of the Model called on inputs.
+      - In TRAIN mode: `History` object.
+      - In TEST mode: Evaluation metrics.
+      - In PREDICT mode: Outputs of the Model called on inputs.
 
   Raises:
       ValueError: in case of invalid arguments.
   """
   # Backwards compatibility.
   if 'steps' in kwargs:
-    steps_per_epoch = kwargs['steps']
-
-  _validate_arguments(steps_per_epoch, validation_steps, kwargs)
-  if mode == 'train':
+    steps_per_epoch = kwargs.pop('steps')
+  if kwargs:
+    raise TypeError('Unknown arguments: %s' % (kwargs,))
+
+  # In case we were passed a dataset, we extract symbolic tensors from it.
+  reset_dataset_after_each_epoch = False
+  input_iterator = None
+  is_dataset = isinstance(inputs,
+                          (dataset_ops.DatasetV1, dataset_ops.DatasetV2))
+  # TODO(fchollet): consider moving `steps_per_epoch` inference to
+  # _standardize_user_data and set reset_dataset_after_each_epoch as an
+  # attribute on the dataset instance.
+  if is_dataset:
+    if steps_per_epoch is None:
+      reset_dataset_after_each_epoch = True
+      steps_per_epoch = training_utils.infer_steps_for_dataset(
+          inputs, steps_per_epoch, epochs=epochs, steps_name=steps_name)
+    input_iterator = _get_iterator(inputs, model._distribution_strategy)
+
+  if mode == ModeKeys.TRAIN:
     _print_train_info(inputs, val_inputs, steps_per_epoch, verbose)
 
   # Enter DistributionStrategy scope.
   if model._distribution_strategy:
-    scope = model._distribution_strategy.scope()
+    scope = distributed_training_utils.distributed_scope(
+        strategy=model._distribution_strategy,
+        learning_phase=(1 if mode == ModeKeys.TRAIN else 0))
     scope.__enter__()
 
   # Get step function and loop type.
   f = _make_execution_function(model, mode)
-  use_steps = steps_per_epoch is not None
+  use_steps = is_dataset or steps_per_epoch is not None
   do_validation = val_inputs is not None
 
+  # Convert Eager Tensors to NumPy arrays to support batching/shuffling.
+  inputs, targets, sample_weights = training_utils. \
+      convert_eager_tensors_to_numpy((inputs, targets, sample_weights))
+
   # Prepare input data.
-  ins = _prepare_feed_values(model, inputs, targets, sample_weights, mode)
-  num_samples_or_steps = _get_num_samples_or_steps(ins, batch_size,
-                                                   steps_per_epoch)
+  inputs = input_iterator or inputs
+  if validation_in_fit and prepared_feed_values_from_dataset:
+    # When invoking validation in training loop, avoid creating iterator and
+    # list of feed values for the same validation dataset multiple times (which
+    # essentially would call `iterator.get_next()` that slows down execution and
+    # leads to OOM errors eventually.
+    ins = inputs
+  else:
+    ins = _prepare_feed_values(model, inputs, targets, sample_weights, mode)
+  if not is_dataset:
+    num_samples_or_steps = _get_num_samples_or_steps(ins, batch_size,
+                                                     steps_per_epoch)
+  else:
+    num_samples_or_steps = steps_per_epoch
+
+  # Prepare validation data. Hold references to the iterator and the input list
+  # to properly reinitialize and reuse in multiple validation passes.
+  val_iterator = None
+  if isinstance(val_inputs, (dataset_ops.DatasetV1, dataset_ops.DatasetV2)):
+    if validation_steps is None:
+      # Because we pass an iterator feed instead of a Dataset to the eval
+      # model_iteration() call, it will not trigger the dataset-input path
+      # that determines the number of steps required. To avoid this issue,
+      # set validation_steps here if validation_steps is None.
+      validation_steps = training_utils.infer_steps_for_dataset(
+          val_inputs,
+          validation_steps,
+          epochs=epochs,
+          steps_name='validation_steps')
+    val_iterator = _get_iterator(val_inputs, model._distribution_strategy)
+    val_inputs = _prepare_feed_values(
+        model, val_iterator, val_targets, val_sample_weights, ModeKeys.TEST)
 
   # Configure callbacks.
   count_mode = 'steps' if use_steps else 'samples'
@@ -227,16 +216,15 @@ def model_iteration(model,
         indices_for_conversion_to_dense.append(i)
 
   # Select aggregation method.
-  if mode == 'predict':
+  if mode == ModeKeys.PREDICT:
     aggregator = training_utils.OutputsAggregator(use_steps,
                                                   num_samples_or_steps)
   else:
     aggregator = training_utils.MetricsAggregator(use_steps,
                                                   num_samples_or_steps)
 
-  if model._distribution_strategy and not validation_in_fit:
-    training_distributed._copy_weights_to_distributed_model(
-        model, model._grouped_model)
+  if model._compile_distribution:
+    distributed_training_utils._copy_weights_to_distributed_model(model, mode)
 
   callbacks.model.stop_training = False
   callbacks._call_begin_hook(mode)
@@ -249,12 +237,21 @@ def model_iteration(model,
     # Setup work for each epoch
     epoch_logs = {}
     model.reset_metrics()
-    callbacks.on_epoch_begin(epoch, epoch_logs, mode=mode)
+    if mode == ModeKeys.TRAIN:
+      callbacks.on_epoch_begin(epoch, epoch_logs)
     progbar.on_epoch_begin(epoch, epoch_logs)
 
     if use_steps:
       # Step-wise loop.
-      for step in range(steps_per_epoch):
+      if steps_per_epoch is None:
+        # Loop over dataset until `OutOfRangeError` is raised.
+        target_steps = np.inf
+      else:
+        # Loop over dataset for the specified number of steps.
+        target_steps = steps_per_epoch
+
+      step = 0
+      while step < target_steps:
         batch_logs = {'batch': step, 'size': 1}
         callbacks._call_batch_hook(mode, 'begin', step, batch_logs)
         progbar.on_batch_begin(step, batch_logs)
@@ -265,18 +262,42 @@ def model_iteration(model,
           actual_inputs = ins() if callable(ins) else ins
           batch_outs = f(actual_inputs)
         except errors.OutOfRangeError:
-          logging.warning('Your dataset iterator ran out of data; '
-                          'interrupting training. Make sure that your dataset '
-                          'can generate at least `steps_per_epoch * epochs` '
-                          'batches (in this case, %d batches). You may need to'
-                          'use the repeat() function when building your '
-                          'dataset.' % steps_per_epoch * epochs)
+          if is_dataset:
+            # The dataset passed by the user ran out of batches.
+            # Now we know the cardinality of the dataset.
+            # If steps_per_epoch was specified, then running out of data is
+            # unexpected, so we stop training and inform the user.
+            if steps_per_epoch:
+              callbacks.model.stop_training = True
+              logging.warning(
+                  'Your dataset ran out of data; interrupting training. '
+                  'Make sure that your dataset can generate at least '
+                  '`%s * epochs` batches (in this case, %d batches). '
+                  'You may need to use the repeat() function when '
+                  'building your dataset.'
+                  % (steps_name, steps_per_epoch * epochs))
+            elif step > 0:
+              steps_per_epoch = step
+              aggregator.num_samples_or_steps = steps_per_epoch
+              progbar.params['steps'] = steps_per_epoch
+              progbar.progbar.target = steps_per_epoch
+          else:
+            # We ran out of batches while the user passed an iterator (legacy).
+            callbacks.model.stop_training = True
+            logging.warning(
+                'Your dataset iterator ran out of data; '
+                'interrupting training. Make sure that your iterator '
+                'can generate at least `%s * epochs` '
+                'batches (in this case, %d batches). You may need to'
+                'use the repeat() function when building your '
+                'dataset.' % (steps_name, steps_per_epoch * epochs))
           break
+
         if not isinstance(batch_outs, list):
           batch_outs = [batch_outs]
 
         if model._distribution_strategy:
-          batch_outs = training_distributed._per_device_aggregate_batch(
+          batch_outs = distributed_training_utils._per_device_aggregate_batch(
               batch_outs, model, mode)
 
         # Aggregate results.
@@ -285,9 +306,10 @@ def model_iteration(model,
         aggregator.aggregate(batch_outs)
 
         # Callbacks batch end.
-        batch_logs.update(training_utils.make_logs(model, batch_outs, mode))
+        batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode)
         callbacks._call_batch_hook(mode, 'end', step, batch_logs)
         progbar.on_batch_end(step, batch_logs)
+        step += 1
 
         if callbacks.model.stop_training:
           break
@@ -336,7 +358,7 @@ def model_iteration(model,
         aggregator.aggregate(batch_outs, batch_start, batch_end)
 
         # Callbacks batch end.
-        batch_logs.update(training_utils.make_logs(model, batch_outs, mode))
+        batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode)
         callbacks._call_batch_hook(mode, 'end', batch_index, batch_logs)
         progbar.on_batch_end(batch_index, batch_logs)
 
@@ -345,12 +367,21 @@ def model_iteration(model,
 
     aggregator.finalize()
     results = aggregator.results
-    epoch_logs.update(training_utils.make_logs(model, results, mode))
+    epoch_logs = cbks.make_logs(model, epoch_logs, results, mode)
     if len(results) == 1:
       results = results[0]
 
-    # Run the test loop every epoch during training.
-    if do_validation and not callbacks.model.stop_training:
+    # Run the test loop every `validation_freq` epochs during training.
+    if (do_validation and
+        training_utils.should_run_validation(validation_freq, epoch) and
+        not callbacks.model.stop_training):
+
+      if model._compile_distribution:
+        # Since we create a new clone from the original model we need to copy
+        # the weights back to the original model before we can run validation.
+        distributed_training_utils._copy_weights_to_original_model(
+            model, ModeKeys.TRAIN)
+
       val_results = model_iteration(
           model,
           val_inputs,
@@ -360,31 +391,138 @@ def model_iteration(model,
           steps_per_epoch=validation_steps,
           callbacks=callbacks,
           verbose=0,
-          mode='test',
-          validation_in_fit=True)
+          mode=ModeKeys.TEST,
+          validation_in_fit=True,
+          prepared_feed_values_from_dataset=(val_iterator is not None),
+          steps_name='validation_steps')
       if not isinstance(val_results, list):
         val_results = [val_results]
-      epoch_logs.update(
-          training_utils.make_logs(model, val_results, mode, prefix='val_'))
-
-    callbacks.on_epoch_end(epoch, epoch_logs, mode=mode)
+      epoch_logs = cbks.make_logs(
+          model, epoch_logs, val_results, mode, prefix='val_')
+      if val_iterator and epoch < epochs - 1:
+        _reinitialize_iterator(val_iterator, model._distribution_strategy)
+
+    if mode == ModeKeys.TRAIN:
+      # Epochs only apply to `fit`.
+      callbacks.on_epoch_end(epoch, epoch_logs)
     progbar.on_epoch_end(epoch, epoch_logs)
+
+    # Reinitialize dataset iterator for the next epoch.
+    if reset_dataset_after_each_epoch and epoch < epochs - 1:
+      _reinitialize_iterator(input_iterator, model._distribution_strategy)
+
   callbacks._call_end_hook(mode)
 
   if model._distribution_strategy:
-    # TODO(priyag, psv): Copy back metrics to the original model as well?
-    if not validation_in_fit:
-      training_distributed._copy_weights_to_original_model(
-          model, model._grouped_model, mode)
-
+    if model._compile_distribution:
+      # TODO(priyag, psv): Copy back metrics to the original model as well?
+      distributed_training_utils._copy_weights_to_original_model(model, mode)
     scope.__exit__(None, None, None)
 
-  if mode == 'train':
+  if mode == ModeKeys.TRAIN:
     return model.history
   return results
 
 
+def _get_model_feed(model, mode):
+  if mode == ModeKeys.PREDICT:
+    feed = model._feed_inputs
+  else:
+    feed = (
+        model._feed_inputs + model._feed_targets + model._feed_sample_weights)
+  return feed
+
+
+def _print_train_info(inputs, val_inputs, steps_per_epoch, verbose):
+  if (val_inputs and steps_per_epoch is None and verbose and inputs and
+      hasattr(inputs[0], 'shape') and hasattr(val_inputs[0], 'shape')):
+    print('Train on %d samples, validate on %d samples' %
+          (inputs[0].shape[0], val_inputs[0].shape[0]))
+
+
+def _get_num_samples_or_steps(ins, batch_size, steps_per_epoch):
+  """Returns total number of samples (when training in batch mode) or steps."""
+  if steps_per_epoch:
+    return steps_per_epoch
+  return training_utils.check_num_samples(ins, batch_size, steps_per_epoch,
+                                          'steps_per_epoch')
+
+
+def _prepare_feed_values(model, inputs, targets, sample_weights, mode):
+  """Prepare feed values to the model execution function.
+
+  Arguments:
+    model: Model to prepare feed values for.
+    inputs: List or dict of model inputs.
+    targets: Optional list of model targets.
+    sample_weights: Optional list of sample weight arrays.
+    mode: One of ModeKeys.TRAIN/ModeKeys.TEST/ModeKeys.PREDICT.
+
+  Returns:
+    Feed values for the model in the given mode.
+  """
+  if model._distribution_strategy:
+    if isinstance(inputs, (dataset_ops.DatasetV1, dataset_ops.DatasetV2)):
+      inputs = distributed_training_utils.get_iterator(
+          inputs, model._distribution_strategy)
+
+    def get_distributed_inputs():
+      return distributed_training_utils._prepare_feed_values(
+          model, inputs, targets, sample_weights, mode)
+
+    # In the eager case, we want to call the input method per step, so return
+    # a lambda from here that can be called. Note that this is applicable only
+    # in Distribution Strategy case as it follows the same code path for both
+    # eager and graph modes.
+    # TODO(priyag,omalleyt): Either we should move the training DS with
+    # EagerIterator to use training_generator code path, or figure out how to
+    # set a symbolic Iterator out of a Dataset when in eager mode.
+    if context.executing_eagerly():
+      return get_distributed_inputs
+    else:
+      return get_distributed_inputs()
+
+  if isinstance(inputs, (dataset_ops.DatasetV1, dataset_ops.DatasetV2,
+                         iterator_ops.Iterator)):
+    inputs, targets, sample_weights = model._standardize_user_data(
+        inputs,
+        extract_tensors_from_dataset=True)
+
+  inputs = training_utils.ModelInputs(inputs).as_list()
+  targets = targets or []
+  sample_weights = sample_weights or []
+  ins = inputs + targets + sample_weights
+  if mode == ModeKeys.TRAIN and not isinstance(K.symbolic_learning_phase(),
+                                               int):
+    ins += [True]  # Add learning phase value.
+  return ins
+
+
+def _get_iterator(inputs, distribution_strategy=None):
+  if distribution_strategy:
+    return distributed_training_utils.get_iterator(
+        inputs, distribution_strategy)
+  return training_utils.get_iterator(inputs)
+
+
+def _reinitialize_iterator(iterator, distribution_strategy=None):
+  if distribution_strategy:
+    distributed_training_utils.initialize_iterator(
+        iterator, distribution_strategy)
+  else:
+    training_utils.initialize_iterator(iterator)
+
+
+def _make_execution_function(model, mode):
+  """Makes function to run one step of model execution."""
+  if model._distribution_strategy:
+    return distributed_training_utils._make_execution_function(model, mode)
+  return model._make_execution_function(mode)
+
+
 # For backwards compatibility for internal users of these loops.
-fit_loop = functools.partial(model_iteration, mode='train')
-test_loop = functools.partial(model_iteration, mode='test', shuffle=False)
-predict_loop = functools.partial(model_iteration, mode='predict', shuffle=False)
+fit_loop = functools.partial(model_iteration, mode=ModeKeys.TRAIN)
+test_loop = functools.partial(
+    model_iteration, mode=ModeKeys.TEST, shuffle=False)
+predict_loop = functools.partial(
+    model_iteration, mode=ModeKeys.PREDICT, shuffle=False)
diff --git a/tensorflow/python/keras/engine/training_arrays_test.py b/tensorflow/python/keras/engine/training_arrays_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..daa4735c838ed428e9e6d36eece6859b8bf47dea
--- /dev/null
+++ b/tensorflow/python/keras/engine/training_arrays_test.py
@@ -0,0 +1,61 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for model.fit calls with a Dataset object passed as validation_data."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.keras.layers import core
+from tensorflow.python.platform import test
+
+
+@keras_parameterized.run_with_all_model_types
+@keras_parameterized.run_all_keras_modes
+class ValidationDatasetNoLimitTest(keras_parameterized.TestCase):
+
+  def create_dataset(self, num_samples, batch_size):
+    input_data = np.random.rand(num_samples, 1)
+    expected_data = input_data * 3
+    dataset = dataset_ops.Dataset.from_tensor_slices((input_data,
+                                                      expected_data))
+    return dataset.shuffle(10 * batch_size).batch(batch_size)
+
+  def test_validation_dataset_with_no_step_arg(self):
+    # Create a model that learns y=Mx.
+    layers = [core.Dense(1)]
+    model = testing_utils.get_model_from_layers(layers, input_shape=(1,))
+    model.compile(loss="mse", optimizer="adam", metrics=["mean_absolute_error"])
+
+    train_dataset = self.create_dataset(num_samples=200, batch_size=10)
+    eval_dataset = self.create_dataset(num_samples=50, batch_size=25)
+
+    history = model.fit(x=train_dataset, validation_data=eval_dataset, epochs=2)
+    evaluation = model.evaluate(x=eval_dataset)
+
+    # If the fit call used the entire dataset, then the final val MAE error
+    # from the fit history should be equal to the final element in the output
+    # of evaluating the model on the same eval dataset.
+    self.assertAlmostEqual(history.history["val_mean_absolute_error"][-1],
+                           evaluation[-1])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/keras/engine/training_dataset_test.py b/tensorflow/python/keras/engine/training_dataset_test.py
index d6cc93d1ef77b14142851e6267158d61edcbc13b..bc37b082a84a473c3801645d53654f65e7616a60 100644
--- a/tensorflow/python/keras/engine/training_dataset_test.py
+++ b/tensorflow/python/keras/engine/training_dataset_test.py
@@ -23,17 +23,25 @@ import logging
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python.data.experimental.ops import cardinality
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
-from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.keras import callbacks
 from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras import testing_utils
-from tensorflow.python.ops.losses import losses_impl
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training.rmsprop import RMSPropOptimizer
+
+
+class BatchCounterCallback(callbacks.Callback):
+
+  def __init__(self):
+    self.batch_count = 0
+
+  def on_batch_end(self, *args, **kwargs):
+    self.batch_count += 1
 
 
 class TestTrainingWithDatasetIterators(keras_parameterized.TestCase):
@@ -42,7 +50,7 @@ class TestTrainingWithDatasetIterators(keras_parameterized.TestCase):
   @keras_parameterized.run_all_keras_modes
   def test_training_and_eval_methods_on_iterators_single_io(self):
     model = testing_utils.get_small_mlp(1, 4, input_dim=3)
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    optimizer = 'rmsprop'
     loss = 'mse'
     metrics = ['mae', metrics_module.CategoricalAccuracy()]
     model.compile(optimizer, loss, metrics=metrics,
@@ -90,43 +98,20 @@ class TestTrainingWithDatasetIterators(keras_parameterized.TestCase):
                 epochs=1, steps_per_epoch=2, verbose=0)
 
     with self.assertRaisesRegexp(
-        ValueError, 'you should specify the `steps_per_epoch` argument'):
+        ValueError, 'the `steps_per_epoch` argument'):
       model.fit(iterator, epochs=1, verbose=0)
     with self.assertRaisesRegexp(ValueError,
-                                 'you should specify the `steps` argument'):
+                                 'the `steps` argument'):
       model.evaluate(iterator, verbose=0)
     with self.assertRaisesRegexp(ValueError,
-                                 'you should specify the `steps` argument'):
+                                 'the `steps` argument'):
       model.predict(iterator, verbose=0)
 
-  @keras_parameterized.run_with_all_model_types
-  @keras_parameterized.run_all_keras_modes
-  def test_get_next_op_created_once(self):
-    model = testing_utils.get_small_mlp(1, 4, input_dim=3)
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
-    loss = 'mse'
-    metrics = ['mae']
-    model.compile(optimizer, loss, metrics=metrics,
-                  run_eagerly=testing_utils.should_run_eagerly())
-
-    inputs = np.zeros((10, 3), np.float32)
-    targets = np.zeros((10, 4), np.float32)
-    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-    dataset = dataset.repeat(100)
-    dataset = dataset.batch(10)
-    iterator = dataset_ops.make_one_shot_iterator(dataset)
-
-    model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=1)
-    # Finalize graph to make sure we are not appending another iterator
-    # get_next op in the graph.
-    ops.get_default_graph().finalize()
-    model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=1)
-
   @keras_parameterized.run_with_all_model_types
   @keras_parameterized.run_all_keras_modes
   def test_iterators_running_out_of_data(self):
     model = testing_utils.get_small_mlp(1, 4, input_dim=3)
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    optimizer = 'rmsprop'
     loss = 'mse'
     metrics = ['mae']
     model.compile(optimizer, loss, metrics=metrics,
@@ -157,7 +142,7 @@ class TestTrainingWithDataset(keras_parameterized.TestCase):
       self.skipTest('b/120673224')
 
     model = testing_utils.get_small_mlp(1, 4, input_dim=3)
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    optimizer = 'rmsprop'
     loss = 'mse'
     metrics = ['mae']
     model.compile(optimizer, loss, metrics=metrics,
@@ -172,9 +157,6 @@ class TestTrainingWithDataset(keras_parameterized.TestCase):
     # Call fit with validation data
     model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
               validation_data=dataset, validation_steps=2)
-    # Finalize the graph to make sure new ops aren't added when calling on the
-    # same dataset
-    ops.get_default_graph().finalize()
     model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
               validation_data=dataset, validation_steps=2)
 
@@ -182,7 +164,7 @@ class TestTrainingWithDataset(keras_parameterized.TestCase):
   @keras_parameterized.run_all_keras_modes
   def test_training_and_eval_methods_on_dataset(self):
     model = testing_utils.get_small_mlp(1, 4, input_dim=3)
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    optimizer = 'rmsprop'
     loss = 'mse'
     metrics = ['mae', metrics_module.CategoricalAccuracy()]
     model.compile(optimizer, loss, metrics=metrics,
@@ -191,7 +173,7 @@ class TestTrainingWithDataset(keras_parameterized.TestCase):
     inputs = np.zeros((10, 3), np.float32)
     targets = np.zeros((10, 4), np.float32)
     dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-    dataset = dataset.repeat(100)
+    dataset = dataset.repeat()  # Infinite dataset.
     dataset = dataset.batch(10)
 
     model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
@@ -223,26 +205,101 @@ class TestTrainingWithDataset(keras_parameterized.TestCase):
           sample_weight=sample_weight)
 
     # Test invalid usage
+    with self.assertRaisesRegexp(ValueError, 'The `batch_size` argument'
+                                 ' must not be specified when using dataset'
+                                 ' as an input.'):
+      model.fit(dataset, batch_size=10, epochs=1, steps_per_epoch=2,
+                verbose=0)
+    with self.assertRaisesRegexp(ValueError, 'The `batch_size` argument'
+                                 ' must not be specified when using dataset'
+                                 ' as an input.'):
+      model.predict(dataset, batch_size=10, steps=2, verbose=0)
+    with self.assertRaisesRegexp(ValueError, 'The `batch_size` argument'
+                                 ' must not be specified when using dataset'
+                                 ' as an input.'):
+      model.evaluate(dataset, batch_size=10, steps=2, verbose=0)
+
     with self.assertRaisesRegexp(ValueError,
                                  'you should not specify a target'):
       model.fit(dataset, dataset,
                 epochs=1, steps_per_epoch=2, verbose=0)
 
+    # With an infinite dataset, `steps_per_epoch`/`steps` argument is required.
     with self.assertRaisesRegexp(
-        ValueError, 'you should specify the `steps_per_epoch` argument'):
+        ValueError, 'the `steps_per_epoch` argument'):
       model.fit(dataset, epochs=1, verbose=0)
     with self.assertRaisesRegexp(ValueError,
-                                 'you should specify the `steps` argument'):
+                                 'the `steps` argument'):
       model.evaluate(dataset, verbose=0)
     with self.assertRaisesRegexp(ValueError,
-                                 'you should specify the `steps` argument'):
+                                 'the `steps` argument'):
       model.predict(dataset, verbose=0)
 
+  # TODO(b/123531973): Include tests using dataset_v1.
+  @keras_parameterized.run_with_all_model_types(exclude_models='sequential')
+  @keras_parameterized.run_all_keras_modes(always_skip_v1=True)
+  def test_training_and_eval_methods_on_multi_input_output_dataset(self):
+    input_a = keras.layers.Input(shape=(3,), name='input_1')
+    input_b = keras.layers.Input(shape=(3,), name='input_2')
+    dense = keras.layers.Dense(4, name='dense')
+    dropout = keras.layers.Dropout(0.5, name='dropout')
+    branch_a = [input_a, dense]
+    branch_b = [input_b, dense, dropout]
+
+    model = testing_utils.get_multi_io_model(branch_a, branch_b)
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
+
+    input_a_np = np.random.random((10, 3)).astype(dtype=np.float32)
+    input_b_np = np.random.random((10, 3)).astype(dtype=np.float32)
+    output_d_np = np.random.random((10, 4)).astype(dtype=np.float32)
+    output_e_np = np.random.random((10, 4)).astype(dtype=np.float32)
+
+    # Test with tuples
+    dataset_tuple = dataset_ops.Dataset.from_tensor_slices((
+        (input_a_np, input_b_np), (output_d_np, output_e_np)))
+    dataset_tuple = dataset_tuple.repeat(100)
+    dataset_tuple = dataset_tuple.batch(10)
+
+    model.fit(dataset_tuple, epochs=1, steps_per_epoch=2, verbose=1)
+    model.evaluate(dataset_tuple, steps=2, verbose=1)
+
+    predict_dataset_tuple = dataset_ops.Dataset.from_tensor_slices(
+        (input_a_np, input_b_np))
+    # TODO(b/123360757): Remove below assertion once predict() supports
+    # muti-input datasets.
+    with self.assertRaisesRegexp(ValueError,
+                                 'Error when checking model input'):
+      model.predict(predict_dataset_tuple, steps=1)
+
+    # Test with dict
+    input_dict = {'input_1': input_a_np, 'input_2': input_b_np}
+    if testing_utils.get_model_type() == 'subclass':
+      output_dict = {'output_1': output_d_np, 'output_2': output_e_np}
+    else:
+      output_dict = {'dense': output_d_np, 'dropout': output_e_np}
+
+    dataset_dict = dataset_ops.Dataset.from_tensor_slices((
+        input_dict, output_dict))
+    dataset_dict = dataset_dict.repeat(100)
+    dataset_dict = dataset_dict.batch(10)
+
+    model.fit(dataset_dict, epochs=1, steps_per_epoch=2, verbose=1)
+    model.evaluate(dataset_dict, steps=2, verbose=1)
+
+    predict_dataset_dict = dataset_ops.Dataset.from_tensor_slices(
+        input_dict)
+    predict_dataset_dict = predict_dataset_dict.repeat(100)
+    predict_dataset_dict = dataset_dict.batch(10)
+    model.predict(predict_dataset_dict, steps=1)
+
   @keras_parameterized.run_with_all_model_types
   @keras_parameterized.run_all_keras_modes
   def test_dataset_with_sample_weights(self):
     model = testing_utils.get_small_mlp(1, 4, input_dim=3)
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    optimizer = 'rmsprop'
     loss = 'mse'
     metrics = ['mae', metrics_module.CategoricalAccuracy()]
     model.compile(optimizer, loss, metrics=metrics,
@@ -264,25 +321,52 @@ class TestTrainingWithDataset(keras_parameterized.TestCase):
   @keras_parameterized.run_all_keras_modes
   def test_dataset_with_sparse_labels(self):
     model = testing_utils.get_small_mlp(1, 4, input_dim=3)
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
-    for loss in ['sparse_categorical_crossentropy',
-                 losses_impl.sparse_softmax_cross_entropy]:
-      model.compile(optimizer, loss,
-                    run_eagerly=testing_utils.should_run_eagerly())
-
-      inputs = np.zeros((10, 3), dtype=np.float32)
-      targets = np.random.randint(0, 4, size=10, dtype=np.int32)
-      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-      dataset = dataset.repeat(100)
-      dataset = dataset.batch(10)
+    optimizer = 'rmsprop'
+    model.compile(
+        optimizer,
+        loss='sparse_categorical_crossentropy',
+        run_eagerly=testing_utils.should_run_eagerly())
 
-      model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
+    inputs = np.zeros((10, 3), dtype=np.float32)
+    targets = np.random.randint(0, 4, size=10, dtype=np.int32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+    dataset = dataset.repeat(100)
+    dataset = dataset.batch(10)
+
+    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
+
+  @keras_parameterized.run_all_keras_modes
+  def test_dataset_fit_correctness(self):
+
+    class SumLayer(keras.layers.Layer):
+
+      def build(self, _):
+        self.w = self.add_weight('w', ())
+
+      def call(self, inputs):
+        return keras.backend.sum(inputs) + self.w * 0
+
+    model = keras.Sequential([SumLayer(input_shape=(2,))])
+    model.compile(
+        'rmsprop', loss='mae', run_eagerly=testing_utils.should_run_eagerly())
+
+    inputs = np.zeros((40, 2), dtype=np.float32)
+    inputs[10:20, :] = 2
+    inputs[20:30, :] = 1
+    inputs[30:, :] = 4
+    targets = np.zeros((40, 1), dtype=np.float32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+    dataset = dataset.batch(10)
+    history = model.fit(dataset,
+                        epochs=2, steps_per_epoch=2, verbose=1, shuffle=False)
+    self.assertListEqual(history.history['loss'],
+                         [inputs[:20].sum() / 2, inputs[20:].sum() / 2])
 
   @tf_test_util.run_deprecated_v1
   def test_dataset_input_shape_validation(self):
     with self.cached_session():
       model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
-      model.compile(optimizer=RMSPropOptimizer(learning_rate=0.001), loss='mse')
+      model.compile(optimizer='rmsprop', loss='mse')
 
       # User forgets to batch the dataset
       inputs = np.zeros((10, 3))
@@ -307,6 +391,88 @@ class TestTrainingWithDataset(keras_parameterized.TestCase):
                                    r'expected (.*?) to have shape \(3,\)'):
         model.train_on_batch(dataset)
 
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_finite_dataset_known_cardinality_no_steps_arg(self):
+    model = testing_utils.get_small_mlp(1, 4, input_dim=3)
+    model.compile('rmsprop', 'mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
+
+    inputs = np.zeros((100, 3), dtype=np.float32)
+    targets = np.random.randint(0, 4, size=100, dtype=np.int32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+    dataset = dataset.batch(10)
+
+    batch_counter = BatchCounterCallback()
+    history = model.fit(dataset, epochs=2, verbose=1, callbacks=[batch_counter])
+
+    self.assertLen(history.history['loss'], 2)
+    self.assertEqual(batch_counter.batch_count, 20)
+    model.evaluate(dataset)
+    out = model.predict(dataset)
+    self.assertEqual(out.shape[0], 100)
+
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_finite_dataset_unknown_cardinality_no_steps_arg(self):
+    model = testing_utils.get_small_mlp(1, 4, input_dim=3)
+    model.compile('rmsprop', 'mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
+
+    inputs = np.zeros((100, 3), dtype=np.float32)
+    targets = np.random.randint(0, 4, size=100, dtype=np.int32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+    dataset = dataset.filter(lambda x, y: True).batch(10)
+    self.assertEqual(keras.backend.get_value(cardinality.cardinality(dataset)),
+                     cardinality.UNKNOWN)
+
+    batch_counter = BatchCounterCallback()
+    history = model.fit(dataset, epochs=2, verbose=1, callbacks=[batch_counter])
+
+    self.assertLen(history.history['loss'], 2)
+    self.assertEqual(batch_counter.batch_count, 20)
+    model.evaluate(dataset)
+    out = model.predict(dataset)
+    self.assertEqual(out.shape[0], 100)
+
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_finite_dataset_unknown_cardinality_out_of_data(self):
+    model = testing_utils.get_small_mlp(1, 4, input_dim=3)
+    model.compile('rmsprop', 'mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
+
+    inputs = np.zeros((100, 3), dtype=np.float32)
+    targets = np.random.randint(0, 4, size=100, dtype=np.int32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+    dataset = dataset.filter(lambda x, y: True).batch(10)
+    self.assertEqual(
+        keras.backend.get_value(cardinality.cardinality(dataset)),
+        cardinality.UNKNOWN)
+
+    batch_counter = BatchCounterCallback()
+    with test.mock.patch.object(logging, 'warning') as mock_log:
+      # steps_per_epoch (200) is greater than the dataset size (100). As this is
+      # unexpected, training will stop and not make it to the second epoch.
+      history = model.fit(
+          dataset,
+          epochs=2,
+          verbose=1,
+          callbacks=[batch_counter],
+          steps_per_epoch=200)
+      self.assertIn(
+          'Your dataset ran out of data; interrupting training. '
+          'Make sure that your dataset can generate at least '
+          '`steps_per_epoch * epochs` batches (in this case, 400 batches). '
+          'You may need to use the repeat() function when '
+          'building your dataset.', str(mock_log.call_args))
+
+    self.assertLen(history.history['loss'], 1)
+    self.assertEqual(batch_counter.batch_count, 10)
+    model.evaluate(dataset)
+    out = model.predict(dataset)
+    self.assertEqual(out.shape[0], 100)
+
 
 class TestMetricsWithDatasetIterators(keras_parameterized.TestCase):
 
@@ -324,7 +490,7 @@ class TestMetricsWithDatasetIterators(keras_parameterized.TestCase):
     model.compile(
         loss='binary_crossentropy',
         metrics=['accuracy', metrics_module.BinaryAccuracy()],
-        optimizer=RMSPropOptimizer(learning_rate=0.001),
+        optimizer='rmsprop',
         run_eagerly=testing_utils.should_run_eagerly())
 
     np.random.seed(123)
diff --git a/tensorflow/python/keras/engine/training_distributed.py b/tensorflow/python/keras/engine/training_distributed.py
index d20d092d8e61499e4a005f7d6770a3c0a0ee60fc..3df58af0a25566ec78c5c8c1d2aaa1a06129fb58 100644
--- a/tensorflow/python/keras/engine/training_distributed.py
+++ b/tensorflow/python/keras/engine/training_distributed.py
@@ -19,50 +19,199 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import enum  # pylint: disable=g-bad-import-order
 import numpy as np
 
-from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.distribute import input_lib
 from tensorflow.python.distribute import reduce_util as ds_reduce_util
-from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import callbacks as cbks
-from tensorflow.python.keras import metrics as metrics_module
-from tensorflow.python.keras import optimizers
 from tensorflow.python.keras.engine import distributed_training_utils
+from tensorflow.python.keras.engine import partial_batch_padding_handler as padding_util
+from tensorflow.python.keras.engine import training_arrays
+from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.utils.generic_utils import Progbar
+from tensorflow.python.keras.utils.mode_keys import ModeKeys
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
 
 
-# TODO(sourabhbajaj): Check if we can merge the test and prediction graphs
-class _Mode(enum.Enum):
-  TRAIN = 'train'
-  TEST = 'test'
-  PREDICT = 'predict'
-# TODO(priyag, sourabhbajaj): Refactor this file to address code duplication.
-
-
-def experimental_fit_loop(model,
-                          iterator,
-                          epochs=100,
-                          verbose=1,
-                          callbacks=None,
-                          initial_epoch=0,
-                          steps_per_epoch=None,
-                          val_iterator=None,
-                          validation_steps=None):
+def fit_distributed(model,
+                    x=None,
+                    y=None,
+                    batch_size=None,
+                    epochs=1,
+                    verbose=1,
+                    callbacks=None,
+                    validation_split=0.,
+                    validation_data=None,
+                    shuffle=True,
+                    class_weight=None,
+                    sample_weight=None,
+                    initial_epoch=0,
+                    steps_per_epoch=None,
+                    validation_steps=None,
+                    validation_freq=1):
+  """Fit loop for Distribution Strategies."""
+  distributed_training_utils.validate_callbacks(callbacks, model.optimizer)
+  distributed_training_utils.validate_inputs(
+      x, y, model._distribution_strategy)
+
+  first_x_value = nest.flatten(x)[0]
+  if isinstance(first_x_value, np.ndarray):
+    # Until support for partial batch is implemented across all
+    # functions and distribution strategy, we pass `mode` to selectively
+    # relax the costraint to consume all the training samples.
+    steps_per_epoch, batch_size = (
+        distributed_training_utils.get_input_params(
+            model._distribution_strategy, first_x_value, steps_per_epoch,
+            batch_size, mode=ModeKeys.TRAIN))
+  batch_size = model._validate_or_infer_batch_size(
+      batch_size, steps_per_epoch, x)
+  dataset = model._distribution_standardize_user_data(
+      x, y,
+      sample_weight=sample_weight,
+      class_weight=class_weight,
+      batch_size=batch_size,
+      validation_split=validation_split,
+      shuffle=shuffle)
+
+  val_dataset = None
+  if validation_data:
+    val_x, val_y, val_sample_weights = model._unpack_validation_data(
+        validation_data)
+    distributed_training_utils.validate_inputs(
+        val_x, val_y, model._distribution_strategy)
+    first_valx_value = nest.flatten(val_x)[0]
+    if isinstance(first_valx_value, np.ndarray):
+      validation_steps, _ = distributed_training_utils.get_input_params(
+          model._distribution_strategy, first_valx_value, validation_steps,
+          batch_size)
+    val_dataset = model._distribution_standardize_user_data(
+        val_x, val_y,
+        sample_weight=val_sample_weights,
+        class_weight=None,
+        batch_size=batch_size,
+        validation_split=validation_split,
+        shuffle=shuffle)
+  elif validation_split:
+    raise ValueError('validation_split argument is not supported with '
+                     'distribution strategies.')
+
+  if distributed_training_utils.is_tpu_strategy(model._distribution_strategy):
+    return experimental_tpu_fit_loop(
+        model,
+        dataset,
+        epochs=epochs,
+        verbose=verbose,
+        callbacks=callbacks,
+        val_dataset=val_dataset,
+        initial_epoch=initial_epoch,
+        steps_per_epoch=steps_per_epoch,
+        validation_steps=validation_steps,
+        validation_freq=validation_freq)
+  else:
+    return training_arrays.fit_loop(
+        model,
+        dataset,
+        batch_size=batch_size,
+        epochs=epochs,
+        verbose=verbose,
+        callbacks=callbacks,
+        val_inputs=val_dataset,
+        shuffle=shuffle,
+        initial_epoch=initial_epoch,
+        steps_per_epoch=steps_per_epoch,
+        validation_steps=validation_steps,
+        validation_freq=validation_freq,
+        steps_name='steps_per_epoch')
+
+
+def evaluate_distributed(model,
+                         x=None,
+                         y=None,
+                         batch_size=None,
+                         verbose=1,
+                         sample_weight=None,
+                         steps=None,
+                         callbacks=None):
+  """Evaluate loop for Distribution Strategies."""
+  distributed_training_utils.validate_inputs(x, y, model._distribution_strategy)
+  first_x_value = nest.flatten(x)[0]
+  if isinstance(first_x_value, np.ndarray):
+    steps, batch_size = distributed_training_utils.get_input_params(
+        model._distribution_strategy, first_x_value, steps, batch_size)
+  batch_size = model._validate_or_infer_batch_size(batch_size, steps, x)
+  dataset = model._distribution_standardize_user_data(
+      x, y,
+      sample_weight=sample_weight,
+      batch_size=batch_size)
+
+  if distributed_training_utils.is_tpu_strategy(model._distribution_strategy):
+    return experimental_tpu_test_loop(
+        model, dataset, verbose=verbose, steps=steps, callbacks=callbacks)
+  else:
+    return training_arrays.test_loop(
+        model,
+        inputs=dataset,
+        batch_size=batch_size,
+        verbose=verbose,
+        steps=steps,
+        callbacks=callbacks)
+
+
+def predict_distributed(model,
+                        x=None,
+                        batch_size=None,
+                        verbose=0,
+                        steps=None,
+                        callbacks=None):
+  """Predict loop for Distribution Strategies."""
+  distributed_training_utils.validate_inputs(
+      x, None, model._distribution_strategy, allow_partial_batch=True)
+  first_x_value = nest.flatten(x)[0]
+  if isinstance(first_x_value, np.ndarray):
+    steps, batch_size = distributed_training_utils.get_input_params(
+        model._distribution_strategy, first_x_value, steps,
+        batch_size, mode=ModeKeys.PREDICT)
+  batch_size = model._validate_or_infer_batch_size(batch_size, steps, x)
+  dataset = model._distribution_standardize_user_data(
+      x,
+      batch_size=batch_size,
+      repeat=False,
+      allow_partial_batch=True)
+  if distributed_training_utils.is_tpu_strategy(model._distribution_strategy):
+    return experimental_tpu_predict_loop(
+        model, dataset, verbose=verbose, steps=steps, callbacks=callbacks)
+  else:
+    return training_arrays.predict_loop(
+        model,
+        dataset,
+        batch_size=batch_size,
+        verbose=verbose,
+        steps=steps,
+        callbacks=callbacks)
+
+
+def experimental_tpu_fit_loop(model,
+                              dataset,
+                              epochs=100,
+                              verbose=1,
+                              callbacks=None,
+                              initial_epoch=0,
+                              steps_per_epoch=None,
+                              val_dataset=None,
+                              validation_steps=None,
+                              validation_freq=1):
   """Fit loop for training with TPU DistributionStrategy.
 
   Arguments:
       model: Keras Model instance.
-      iterator: Iterator that returns inputs and targets
+      dataset: Dataset that returns inputs and targets
       epochs: Number of times to iterate over the data
       verbose: Integer, Verbosity mode, 0, 1 or 2
       callbacks: List of callbacks to be called during training
@@ -71,10 +220,17 @@ def experimental_fit_loop(model,
       steps_per_epoch: Total number of steps (batches of samples)
           before declaring one epoch finished and starting the
           next epoch. Ignored with the default value of `None`.
-      val_iterator: Iterator for validation data.
+      val_dataset: Dataset for validation data.
       validation_steps: Number of steps to run validation for
           (only if doing validation from data tensors).
           Ignored with the default value of `None`.
+      validation_freq: Only relevant if validation data is provided. Integer or
+          `collections.Container` instance (e.g. list, tuple, etc.). If an
+          integer, specifies how many training epochs to run before a new
+          validation run is performed, e.g. `validation_freq=2` runs
+          validation every 2 epochs. If a Container, specifies the epochs on
+          which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
+          validation at the end of the 1st, 2nd, and 10th epochs.
 
   Returns:
       Returns `None`.
@@ -82,36 +238,44 @@ def experimental_fit_loop(model,
   Raises:
       ValueError: in case of invalid arguments.
   """
+  mode = ModeKeys.TRAIN
+  # TODO(fchollet): add support for `steps_per_epoch=None` in TPU loops.
   current_strategy = model._distribution_strategy
+  iterator = distributed_training_utils.get_iterator(dataset, current_strategy)
+  steps_per_epoch = training_utils.infer_steps_for_dataset(
+      dataset, steps_per_epoch, epochs, steps_name='steps_per_epoch')
+  if (current_strategy.extended.steps_per_run != 1 and
+      steps_per_epoch is None):
+    raise ValueError('`steps_per_epoch` should be specified when calling '
+                     '`fit` on the model with TPUStrategy when '
+                     '`steps_per_run` != 1 .')
 
-  K.get_session().run(current_strategy.initialize())
+  scope = distributed_training_utils.distributed_scope(
+      strategy=current_strategy, learning_phase=1)
+  scope.__enter__()
 
   def _per_device_fit_function(model):
     model._make_fit_function()
     return (model._fit_function.inputs, model._fit_function.outputs,
             model._fit_function.updates_op, model._fit_function.session_kwargs)
 
-  # TODO(priyag, sourabhbajaj): This should likely not be hardcoded here.
-  K.set_learning_phase(1)
   out_labels = model.metrics_names or []
 
   def step_fn(ctx, inputs):
     """Clones the model and calls make_fit_function."""
-    # TODO(priyag, sourabhbajaj): The model gets cloned every time
-    # fit/test/predict is called. We should look into caching this keyed on
-    # input shapes.
     inputs, targets = inputs
-    clone_model_on_replicas(
-        model,
-        current_strategy,
-        make_callback_model=True,
-        inputs=inputs,
-        targets=targets,
-        mode=_Mode.TRAIN)
+    if model._compile_distribution:
+      distributed_training_utils.clone_model_on_replicas(
+          model, current_strategy, mode, inputs=inputs, targets=targets)
+    else:
+      distributed_training_utils._build_distributed_network(
+          model, current_strategy, mode, inputs, targets)
 
     (grouped_inputs, grouped_outputs, grouped_updates,
      grouped_session_args) = current_strategy.extended.call_for_each_replica(
-         _per_device_fit_function, args=(model._grouped_model_train,))
+         _per_device_fit_function,
+         args=(distributed_training_utils.get_distributed_model(
+             model, ModeKeys.TRAIN),))
     (all_inputs, all_outputs, all_updates,
      all_session_args) = distributed_training_utils.unwrap_values(
          current_strategy, grouped_inputs, grouped_outputs,
@@ -125,7 +289,7 @@ def experimental_fit_loop(model,
 
     for label, output in zip(out_labels, combined_fn.outputs):
       if label == 'loss':
-        reduce_op = distribute_lib.get_loss_reduction()
+        reduce_op = ds_reduce_util.ReduceOp.SUM
       else:
         # We reduce all other metrics using mean for now. This is temporary
         # workaround until new metrics are in place.
@@ -144,27 +308,27 @@ def experimental_fit_loop(model,
     tensor = model._all_stateful_metrics_tensors[name]
     initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)
 
-  if steps_per_epoch is None:
-    raise ValueError('`steps_per_epoch` should be specified when calling '
-                     '`fit` on the model.')
+  use_steps = steps_per_epoch is not None
+  if use_steps:
+    iteration_value = min(steps_per_epoch,
+                          current_strategy.extended.steps_per_run)
+  else:
+    iteration_value = current_strategy.extended.steps_per_run
+
   steps_per_run = K.variable(
-      value=min(steps_per_epoch, current_strategy.extended.steps_per_run),
+      value=iteration_value,
       dtype='int32',
       name='steps_per_run')
-
-  with current_strategy.scope():
-    ctx = current_strategy.extended.experimental_run_steps_on_iterator(
-        step_fn, iterator, iterations=steps_per_run,
-        initial_loop_values=initial_loop_values)
-
+  ctx = current_strategy.extended.experimental_run_steps_on_iterator(
+      step_fn, iterator, iterations=steps_per_run,
+      initial_loop_values=initial_loop_values)
   train_op = ctx.run_op
   output_tensors = ctx.last_step_outputs
 
   do_validation = bool(validation_steps)
 
-  # Copy the weights from the original model to each of the replicated models.
-  with current_strategy.scope():
-    _copy_weights_to_distributed_model(model, model._grouped_model_train)
+  if model._compile_distribution:
+    distributed_training_utils._copy_weights_to_distributed_model(model, mode)
 
   callbacks = cbks.configure_callbacks(
       callbacks,
@@ -172,60 +336,78 @@ def experimental_fit_loop(model,
       do_validation=do_validation,
       epochs=epochs,
       steps_per_epoch=steps_per_epoch,
-      verbose=verbose)
+      verbose=verbose,
+      count_mode='steps',
+      mode=mode)
 
   # Calculate the steps each time on the device.
-  steps_to_run = [current_strategy.extended.steps_per_run] * (
-      steps_per_epoch // current_strategy.extended.steps_per_run)
-  if steps_per_epoch % current_strategy.extended.steps_per_run:
-    steps_to_run.append(
-        steps_per_epoch % current_strategy.extended.steps_per_run)
+  if use_steps:
+    steps_to_run = ([current_strategy.extended.steps_per_run] *
+                    (steps_per_epoch //
+                     current_strategy.extended.steps_per_run))
+    if steps_per_epoch % current_strategy.extended.steps_per_run:
+      steps_to_run.append(
+          steps_per_epoch % current_strategy.extended.steps_per_run)
+    target_steps = len(steps_to_run)
+  else:
+    target_steps = np.inf
 
-  callbacks.on_train_begin()
+  callbacks._call_begin_hook(mode)
   for epoch in range(initial_epoch, epochs):
-    with current_strategy.scope():
-      _reset_metrics(model, model._grouped_model_train)
+    distributed_training_utils._reset_metrics(model)
     callbacks.on_epoch_begin(epoch)
     epoch_logs = {}
     step_index = 0
     prev_step_count = None
-    for step_count in steps_to_run:
+    current_step = 0
+    while current_step < target_steps:
+      step_count = steps_to_run[current_step] if use_steps else 1
       batch_logs = {'batch': step_index, 'size': 1, 'num_steps': step_count}
-      callbacks.on_batch_begin(step_index, batch_logs)
+      callbacks._call_batch_hook(mode, 'begin', step_index, batch_logs)
       if prev_step_count is None or step_count != prev_step_count:
         steps_per_run.load(step_count, K.get_session())
         prev_step_count = step_count
       try:
-        _, outputs = K.get_session().run([train_op, output_tensors])
+        _, outputs = K.batch_get_value([train_op, output_tensors])
       except errors.OutOfRangeError:
-        logging.warning('Your dataset iterator ran out of data; '
-                        'interrupting training. Make sure that your dataset '
-                        'can generate at least `steps_per_epoch * epochs` '
-                        'batches (in this case, %d batches).' %
-                        steps_per_epoch * epochs)
+        if use_steps:
+          logging.warning('Your dataset iterator ran out of data; '
+                          'interrupting training. Make sure that your dataset '
+                          'can generate at least `steps_per_epoch * epochs` '
+                          'batches (in this case, %d batches).' %
+                          steps_per_epoch * epochs)
+        else:
+          target_steps = current_step
+          logging.info('Dataset iterator ran out of data. Inferring the '
+                       'value of `steps_per_epoch` as %s  .' % target_steps)
+          distributed_training_utils.initialize_iterator(iterator,
+                                                         current_strategy)
         break
 
       batch_logs.update(outputs)
-      callbacks.on_batch_end(step_index, batch_logs)
+      callbacks._call_batch_hook(mode, 'end', step_index, batch_logs)
       step_index = step_index + step_count
+      current_step += 1
+
       if callbacks.model.stop_training:
         break
 
-    if do_validation:
+    if (do_validation and
+        training_utils.should_run_validation(validation_freq, epoch)):
       logging.info('Running validation at fit epoch: %s', epoch)
 
-      # Since we create a new clone from the original model we need to copy
-      # the weights back to the original model before we can run validation.
-      with current_strategy.scope():
-        _copy_weights_to_original_model(model, model._grouped_model_train,
-                                        'train')
+      if model._compile_distribution:
+        # Since we create a new clone from the original model we need to copy
+        # the weights back to the original model before we can run validation.
+        distributed_training_utils._copy_weights_to_original_model(
+            model, ModeKeys.TRAIN)
 
-      val_outs = experimental_test_loop(  # pylint: disable=undefined-variable
+      val_outs = experimental_tpu_test_loop(  # pylint: disable=undefined-variable
           model,
-          val_iterator,
+          val_dataset,
           steps=validation_steps,
           verbose=verbose,
-          initialize_finalize_strategy=False)
+          callbacks=callbacks)
       if not isinstance(val_outs, list):
         val_outs = [val_outs]
       # Same labels assumed.
@@ -235,32 +417,31 @@ def experimental_fit_loop(model,
     callbacks.on_epoch_end(epoch, epoch_logs)
     if callbacks.model.stop_training:
       break
-  callbacks.on_train_end()
-
-  # Copy the weights back from the replicated model to the original model.
-  with current_strategy.scope():
-    _copy_weights_to_original_model(model, model._grouped_model_train, 'train')
+  callbacks._call_end_hook(mode)
 
-  K.get_session().run(current_strategy.finalize())
+  if model._compile_distribution:
+    # Copy the weights back from the replicated model to the original model.
+    distributed_training_utils._copy_weights_to_original_model(
+        model, ModeKeys.TRAIN)
+  scope.__exit__(None, None, None)
   return model.history
 
 
-def experimental_test_loop(model,
-                           iterator,
-                           verbose=0,
-                           steps=None,
-                           initialize_finalize_strategy=True):
+def experimental_tpu_test_loop(model,
+                               dataset,
+                               verbose=0,
+                               steps=None,
+                               callbacks=None):
   """Test loop for evaluating with TPU DistributionStrategy.
 
   Arguments:
       model: Keras Model instance.
-      iterator: Iterator for input data.
+      dataset: Dataset for input data.
       verbose: Integer, Verbosity mode 0 or 1.
       steps: Total number of steps (batches of samples)
           before declaring predictions finished.
           Ignored with the default value of `None`.
-      initialize_finalize_strategy: Should the strategy initialize and finalize
-          functions be called.
+      callbacks: List of callbacks to be called during training
 
   Returns:
       Scalar loss (if the model has a single output and no metrics)
@@ -268,9 +449,16 @@ def experimental_test_loop(model,
       and/or metrics). The attribute `model.metrics_names` will give you
       the display labels for the outputs.
   """
+  mode = ModeKeys.TEST
   current_strategy = model._distribution_strategy
-  if initialize_finalize_strategy:
-    K.get_session().run(current_strategy.initialize())
+  iterator = distributed_training_utils.get_iterator(dataset,
+                                                     current_strategy)
+  steps = training_utils.infer_steps_for_dataset(dataset, steps,
+                                                 steps_name='steps')
+
+  scope = distributed_training_utils.distributed_scope(
+      strategy=current_strategy, learning_phase=0)
+  scope.__enter__()
 
   def _per_device_eval_function(model):
     model._make_eval_function()
@@ -278,26 +466,21 @@ def experimental_test_loop(model,
             model._eval_function.updates_op,
             model._eval_function.session_kwargs)
 
-  # TODO(priyag, sourabhbajaj): This should likely not be hardcoded here.
-  K.set_learning_phase(0)
-
   def step_fn(ctx, inputs):
     """Clones the model and calls make_eval_function."""
-    # TODO(priyag, sourabhbajaj): The model gets cloned every time
-    # fit/test/predict is called. We should look into caching this keyed on
-    # input shapes.
     inputs, targets = inputs
-    clone_model_on_replicas(
-        model,
-        current_strategy,
-        make_callback_model=False,
-        inputs=inputs,
-        targets=targets,
-        mode=_Mode.TEST)
+    if model._compile_distribution:
+      distributed_training_utils.clone_model_on_replicas(
+          model, current_strategy, mode=mode, inputs=inputs, targets=targets)
+    else:
+      distributed_training_utils._build_distributed_network(
+          model, current_strategy, mode, inputs, targets)
 
     (grouped_inputs, grouped_outputs, grouped_updates,
      grouped_session_args) = current_strategy.extended.call_for_each_replica(
-         _per_device_eval_function, args=(model._grouped_model_test,))
+         _per_device_eval_function,
+         args=(distributed_training_utils.get_distributed_model(
+             model, ModeKeys.TEST),))
 
     (all_inputs, all_outputs, all_updates,
      all_session_args) = distributed_training_utils.unwrap_values(
@@ -312,7 +495,7 @@ def experimental_test_loop(model,
 
     for label, output in zip(model.metrics_names, combined_fn.outputs):
       if label == 'loss':
-        reduce_op = distribute_lib.get_loss_reduction()
+        reduce_op = ds_reduce_util.ReduceOp.SUM
       else:
         # We reduce all other metrics using mean for now. This is temporary
         # workaround until new metrics are in place.
@@ -328,12 +511,11 @@ def experimental_test_loop(model,
     tensor = model._all_stateful_metrics_tensors[name]
     initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)
 
-  with current_strategy.scope():
-    # TODO(priyag): Use steps_per_run when we use new metrics as they will
-    # allow handling metric computation at each step using variables.
-    ctx = current_strategy.extended.experimental_run_steps_on_iterator(
-        step_fn, iterator, iterations=1,
-        initial_loop_values=initial_loop_values)
+  # TODO(priyag): Use steps_per_run when we use new metrics as they will
+  # allow handling metric computation at each step using variables.
+  ctx = current_strategy.extended.experimental_run_steps_on_iterator(
+      step_fn, iterator, iterations=1,
+      initial_loop_values=initial_loop_values)
 
   test_op = ctx.run_op
   output_tensors = ctx.last_step_outputs
@@ -341,14 +523,45 @@ def experimental_test_loop(model,
   if verbose == 1:
     progbar = Progbar(target=steps)
 
-  # Copy the weights from the original model to each of the replicated models.
-  with current_strategy.scope():
-    _copy_weights_to_distributed_model(model, model._grouped_model_test)
-    _reset_metrics(model, model._grouped_model_test)
-  assert steps is not None
+  if model._compile_distribution:
+    distributed_training_utils._copy_weights_to_distributed_model(model, mode)
+
+  distributed_training_utils._reset_metrics(model)
+
+  callbacks = cbks.configure_callbacks(
+      callbacks,
+      model,
+      do_validation=False,
+      epochs=1,
+      steps_per_epoch=steps,
+      verbose=verbose,
+      count_mode='steps',
+      mode=ModeKeys.TEST)
+  callbacks._call_begin_hook(mode)
+
   outs = [0.] * len(model.metrics_names)
-  for step in range(steps):
-    _, batch_outs = K.get_session().run([test_op, output_tensors])
+  if steps is not None:
+    target_steps = steps
+  else:
+    target_steps = np.inf
+
+  current_step = 0
+  while current_step < target_steps:
+    batch_logs = {'batch': current_step, 'size': 1}
+    callbacks._call_batch_hook(mode, 'begin', current_step, batch_logs)
+    try:
+      _, batch_outs = K.batch_get_value([test_op, output_tensors])
+    except errors.OutOfRangeError:
+      if steps is not None:
+        warning_msg = 'Make sure that your dataset can generate at least '
+        '`steps` batches (in this case, {} batches).'.format(steps)
+      else:
+        warning_msg = 'Number of steps ran: {} steps'.format(current_step)
+
+      logging.warning('Your dataset iterator ran out of data; '
+                      'interrupting evaluation. ' + warning_msg)
+      target_steps = current_step
+      break
     for i, label in enumerate(model.metrics_names):
       if i == 0:
         # Loss is stateless metrics.
@@ -357,41 +570,77 @@ def experimental_test_loop(model,
         # For all stateful metrics, the aggregation is handled by mirrored vars.
         outs[i] = batch_outs[label]
 
+    batch_logs = cbks.make_logs(model, batch_logs, outs, mode)
+    callbacks._call_batch_hook(mode, 'end', current_step, batch_logs)
     if verbose >= 1:
-      progbar.update(step + 1)
+      progbar.update(current_step + 1)
+    current_step += 1
 
-  if len(outs) >= 0:
-    outs[0] /= (steps)
+  callbacks._call_end_hook(mode)
 
-  if initialize_finalize_strategy:
-    K.get_session().run(current_strategy.finalize())
+  scope.__exit__(None, None, None)
+  if len(outs) >= 0:
+    outs[0] /= (target_steps)
 
   if len(outs) == 1:
     return outs[0]
   return outs
 
 
-def experimental_predict_loop(model, iterator, verbose=0, steps=None):
+def experimental_tpu_predict_loop(model,
+                                  dataset,
+                                  verbose=0,
+                                  steps=None,
+                                  callbacks=None):
   """Predict loop for predicting with TPU DistributionStrategy.
 
   Arguments:
       model: Keras Model instance.
-      iterator: Iterator for input data.
+      dataset: Dataset for input data.
       verbose: Integer, Verbosity mode 0 or 1.
       steps: Total number of steps (batches of samples)
           before declaring `_predict_loop` finished.
           Ignored with the default value of `None`.
+      callbacks: List of callbacks to be called during training
 
   Returns:
       Array of predictions (if the model has a single output)
       or list of arrays of predictions
       (if the model has multiple outputs).
   """
+  mode = ModeKeys.PREDICT
+  steps = training_utils.infer_steps_for_dataset(dataset, steps,
+                                                 steps_name='steps')
+  dataset_fully_shaped = (distributed_training_utils.
+                          is_dataset_shape_fully_defined(dataset))
+  padding_handler = None
+  if not dataset_fully_shaped:
+    # TODO(hongjunchoi): Investigate whether operations from
+    # PartialBatchPaddingHandler are unnecessarily pruned out
+    # during graph optimization.
+    padding_handler = padding_util.PartialBatchPaddingHandler(
+        model._feed_output_shapes)
+    batch_size, _, prefetch_buffer = input_lib._get_dataset_attributes(dataset)
+    padding_handler.padded_batch_size = batch_size
+    padding_handler.padding_mask = dataset.reduce(padding_handler.padding_mask,
+                                                  padding_handler.update_mask)
+
+    dataset = dataset.map(padding_handler.pad_batch)
+    dataset = dataset.apply(batching.unbatch())
+    # Upon this point, it is guaranteed that the dataset does not
+    # have partial batches. Thus, we set `drop_remainder=True` to
+    # get static shape information about the elements in the dataset.
+    dataset = dataset.batch(batch_size, drop_remainder=True)
+
+    if prefetch_buffer is not None:
+      dataset = dataset.prefetch(prefetch_buffer)
+
   current_strategy = model._distribution_strategy
-  K.get_session().run(current_strategy.initialize())
+  iterator = distributed_training_utils.get_iterator(dataset, current_strategy)
 
-  # TODO(priyag, sourabhbajaj): This should likely not be hardcoded here.
-  K.set_learning_phase(0)
+  scope = distributed_training_utils.distributed_scope(
+      strategy=current_strategy, learning_phase=0)
+  scope.__enter__()
 
   def _per_device_predict_function(model):
     model._make_predict_function()
@@ -402,20 +651,18 @@ def experimental_predict_loop(model, iterator, verbose=0, steps=None):
 
   def step_fn(ctx, inputs):
     """Clones the model and calls make_predict_function."""
-
-    # TODO(priyag, sourabhbajaj): The model gets cloned every time
-    # fit/test/predict is called. We should look into caching this keyed on
-    # input shapes.
-    clone_model_on_replicas(
-        model,
-        current_strategy,
-        make_callback_model=False,
-        inputs=inputs,
-        mode=_Mode.PREDICT)
+    if model._compile_distribution:
+      distributed_training_utils.clone_model_on_replicas(
+          model, current_strategy, mode, inputs=inputs)
+    else:
+      distributed_training_utils._build_distributed_network(
+          model, current_strategy, mode, inputs)
 
     (grouped_inputs, grouped_outputs, grouped_updates,
      grouped_session_args) = current_strategy.extended.call_for_each_replica(
-         _per_device_predict_function, args=(model._grouped_model_predict,))
+         _per_device_predict_function,
+         args=(distributed_training_utils.get_distributed_model(
+             model, ModeKeys.PREDICT),))
 
     (all_inputs, all_outputs, all_updates,
      all_session_args) = distributed_training_utils.unwrap_values(
@@ -443,11 +690,10 @@ def experimental_predict_loop(model, iterator, verbose=0, steps=None):
     shape.dims = [batch_dimension] + shape.dims[1:]
     initial_loop_values[name] = array_ops.zeros(shape, tensor.dtype)
 
-  with current_strategy.scope():
-    # TODO(priyag, sourabhbajaj): Support steps_per_run if/when we add outfeed.
-    ctx = current_strategy.extended.experimental_run_steps_on_iterator(
-        step_fn, iterator, iterations=1,
-        initial_loop_values=initial_loop_values)
+  # TODO(priyag, sourabhbajaj): Support steps_per_run if/when we add outfeed.
+  ctx = current_strategy.extended.experimental_run_steps_on_iterator(
+      step_fn, iterator, iterations=1,
+      initial_loop_values=initial_loop_values)
 
   predict_op = ctx.run_op
   output_tensors = ctx.last_step_outputs
@@ -455,279 +701,70 @@ def experimental_predict_loop(model, iterator, verbose=0, steps=None):
   if verbose == 1:
     progbar = Progbar(target=steps)
 
-  # Copy the weights from the original model to each of the replicated models.
-  with current_strategy.scope():
-    _copy_weights_to_distributed_model(model, model._grouped_model_predict)
-    _reset_metrics(model, model._grouped_model_predict)
-  assert steps is not None
+  if model._compile_distribution:
+    distributed_training_utils._copy_weights_to_distributed_model(model, mode)
+
+  distributed_training_utils._reset_metrics(model)
+
+  callbacks = cbks.configure_callbacks(
+      callbacks,
+      model,
+      do_validation=False,
+      epochs=1,
+      steps_per_epoch=steps,
+      verbose=verbose,
+      count_mode='steps',
+      mode=mode)
+  callbacks._call_begin_hook(mode)
+
   # Since we do not know how many samples we will see, we cannot pre-allocate
   # the returned Numpy arrays. Instead, we store one array per batch seen
   # and concatenate them upon returning.
   unconcatenated_outs = [[] for _ in model.outputs]
-  for step in range(steps):
-    _, batch_outs = K.get_session().run([predict_op, output_tensors])
+  if steps is not None:
+    target_steps = steps
+  else:
+    target_steps = np.inf
+
+  current_step = 0
+  while current_step < target_steps:
+    batch_logs = {'batch': current_step, 'size': 1}
+    callbacks._call_batch_hook(mode, 'begin', current_step, batch_logs)
+    try:
+      _, batch_outs = K.batch_get_value([predict_op, output_tensors])
+    except errors.OutOfRangeError:
+      if steps is not None:
+        warning_msg = 'Make sure that your dataset can generate at least '
+        '`steps` batches (in this case, {} batches).'.format(steps)
+      else:
+        warning_msg = 'Number of steps ran: {} steps'.format(current_step)
+
+      logging.warning('Your dataset iterator ran out of data; '
+                      'interrupting evaluation. ' + warning_msg)
+      break
+
     # TODO(priyag): maybe need to unwrap the outputs first for MirroredStrategy.
     for i, label in enumerate(model.output_names):
       unconcatenated_outs[i].extend(batch_outs[label])
+    batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode)
+    callbacks._call_batch_hook(mode, 'end', current_step, batch_logs)
     if verbose >= 1:
-      progbar.update(step + 1)
+      progbar.update(current_step + 1)
+    current_step += 1
+
+  callbacks._call_end_hook(mode)
 
-  K.get_session().run(current_strategy.finalize())
+  scope.__exit__(None, None, None)
 
   if len(unconcatenated_outs) == 1:
-    return np.concatenate(unconcatenated_outs[0], axis=0)
-  return [
-      np.concatenate(unconcatenated_outs[i], axis=0)
-      for i in range(len(unconcatenated_outs))
-  ]
-
-
-def _custom_compile_for_predict(model):
-  """Custom compile for TPU predict mode."""
-  model.total_loss = None
-  model._fit_function = None
-  model._eval_function = None
-  model.train_function = None
-  model.test_function = None
-  model.predict_function = None
-
-
-def _clone_and_build_model(model, inputs=None, targets=None, mode=None):
-  """Clone and build the given keras_model."""
-  # We need to set the import here since we run into a circular dependency
-  # error.
-  from tensorflow.python.keras import models  # pylint: disable=g-import-not-at-top
-  cloned_model = models.clone_model(model, input_tensors=inputs)
-
-  # Compile and build model.
-  if isinstance(model.optimizer, optimizers.TFOptimizer):
-    optimizer = model.optimizer
-  else:
-    optimizer_config = model.optimizer.get_config()
-    optimizer = model.optimizer.__class__.from_config(optimizer_config)
-
-  # Recast all low precision outputs back to float32 since we only casted
-  # the inputs to bfloat16 and not targets. This is done so that we can preserve
-  # precision when calculating the loss value.
-  def _upcast_low_precision_outputs(output):
-    if output.dtype == dtypes.bfloat16:
-      return math_ops.cast(output, dtypes.float32)
-    else:
-      return output
-  cloned_model.outputs = [_upcast_low_precision_outputs(o)
-                          for o in cloned_model.outputs]
-
-  if isinstance(targets, tuple):
-    targets = nest.flatten(targets)
-  if mode == _Mode.PREDICT:
-    _custom_compile_for_predict(cloned_model)
-  else:
-    cloned_model.compile(
-        optimizer,
-        model.loss,
-        metrics=metrics_module.clone_metrics(model._compile_metrics),
-        loss_weights=model.loss_weights,
-        sample_weight_mode=model.sample_weight_mode,
-        weighted_metrics=metrics_module.clone_metrics(
-            model._compile_weighted_metrics),
-        target_tensors=targets)
-  return cloned_model
-
-
-def clone_model_on_replicas(model, strategy, make_callback_model=False,
-                            inputs=None, targets=None, mode=None):
-  """Create a cloned model on each replica."""
-  with K.get_graph().as_default(), strategy.scope():
-    grouped_model = strategy.extended.call_for_each_replica(
-        _clone_and_build_model, args=(model, inputs, targets, mode))
-    if mode is _Mode.TRAIN:
-      model._grouped_model_train = grouped_model
-    elif mode is _Mode.TEST:
-      model._grouped_model_test = grouped_model
-    elif mode is _Mode.PREDICT:
-      model._grouped_model_predict = grouped_model
-    else:
-      model._grouped_model = grouped_model
-  if make_callback_model:
-    model._make_callback_model(grouped_model)
-
-
-def _get_input_from_iterator(iterator, model):
-  """Get elements from the iterator and verify the input shape and type."""
-  next_element = iterator.get_next()
-
-  if len(nest.flatten(next_element)) == len(model.inputs):
-    x = next_element
-    y = None
-    sample_weights = None
-  elif len(nest.flatten(next_element)) == (len(model.inputs) +
-                                           len(model.outputs)):
-    x, y = next_element
-    sample_weights = None
+    prediction_result = np.concatenate(unconcatenated_outs[0], axis=0)
   else:
-    x, y, sample_weights = next_element
-
-  # Validate that all the elements in x and y are of the same type and shape.
-  # We can then pass the first element of x and y to `_standardize_weights`
-  # below and be confident of the output.
-  x_values, y_values, sample_weights_values = distributed_training_utils.\
-    validate_distributed_dataset_inputs(model._distribution_strategy, x, y,
-                                        sample_weights)
-  model._standardize_weights(x_values, y_values,
-                             sample_weight=sample_weights_values)
-  return x, y, sample_weights
-
-
-def _make_execution_function(model, mode):
-  """Makes function to run one step of distributed model execution."""
-  if context.executing_eagerly():
-    return _make_eager_execution_function(model, mode)
-
-  strategy = model._distribution_strategy
-  if not model._grouped_model:
-    clone_model_on_replicas(
-        model, strategy, make_callback_model=(mode == 'train'))
-
-  def _per_device_function(model):
-    f = model._make_execution_function(mode)
-    return (f.inputs, f.outputs, f.updates_op, f.session_kwargs)
-
-  with strategy.scope():
-    # Create train ops on each of the devices when we call
-    # `_per_device_fit_function`.
-    (grouped_inputs, grouped_outputs, grouped_updates,
-     grouped_session_args) = strategy.extended.call_for_each_replica(
-         _per_device_function, args=(model._grouped_model,))
-
-    if mode == 'train':
-      # Initialize the variables in the replicated model. This is necessary for
-      # multi-worker training because on some workers, initialization is not
-      # needed. This method does initialization or waiting for initialization
-      # according to the context object of distribute coordinator.
-      distributed_training_utils.init_restore_or_wait_for_variables()
-
-    # Unwrap all the per device values returned from `call_for_each_replica`.
-    # Unwrapping per device values gives you a list of values that can be
-    # used to construct a new train function that is composed of update ops on
-    # all the devices over which the model is distributed.
-    (all_inputs, all_outputs, all_updates,
-     all_session_args) = distributed_training_utils.unwrap_values(
-         strategy,
-         grouped_inputs,
-         grouped_outputs,
-         grouped_updates,
-         grouped_session_args,
-         with_loss_tensor=(mode != 'predict'))
-
-    return K.function(
-        all_inputs,
-        all_outputs,
-        updates=all_updates,
-        name='distributed_{}_function'.format(mode),
-        **all_session_args)
-
-
-def _make_eager_execution_function(model, mode):
-  """Makes function to run one step of distributed model eager execution."""
-  strategy = model._distribution_strategy
-  if not model._grouped_model:
-    clone_model_on_replicas(
-        model, strategy, make_callback_model=(mode == 'train'))
-
-  def _per_device_function(model):
-    f = model._make_execution_function(mode)
-    return (f.inputs, f.outputs)
-
-  # NOTE(priyag): Try creating a new FuncGraph within DS scope instead of using
-  # the global one.
-  with K.get_graph().as_default(), strategy.scope():
-    # Create train ops on each of the devices when we call
-    # `_per_device_fit_function`.
-    (grouped_inputs, grouped_outputs) = strategy.call_for_each_replica(
-        _per_device_function, args=(model._grouped_model,))
-
-    # Unwrap all the per device values returned from `call_for_each_replica`.
-    # Unwrapping per device values gives you a list of values that can be
-    # used to construct a new train function that is composed of inptus/outputs
-    # on all the devices over which the model is distributed.
-    (all_inputs, all_outputs, _, _) = distributed_training_utils.unwrap_values(
-        strategy,
-        grouped_inputs,
-        grouped_outputs,
-        with_loss_tensor=(mode != 'predict'))
-
-    return K.function(
-        all_inputs,
-        all_outputs,
-        name='eager_distributed_{}_function'.format(mode))
-
-
-def _prepare_feed_values(model, inputs, targets, sample_weights, mode):
-  """Prepare feed values to the model execution function.
+    prediction_result = [
+        np.concatenate(unconcatenated_outs[i], axis=0)
+        for i in range(len(unconcatenated_outs))
+    ]
 
-  Arguments:
-    model: Model to prepare feed values for.
-    inputs: List or dict of model inputs.
-    targets: Optional list of model targets.
-    sample_weights: Optional list of sample weight arrays.
-    mode: One of 'train'/'test'/'predict'.
+  if padding_handler:
+    prediction_result = padding_handler.apply_mask(prediction_result)
 
-  Returns:
-    Feed values for the model in the given mode.
-  """
-  strategy = model._distribution_strategy
-  inputs, targets, sample_weights = _get_input_from_iterator(inputs, model)
-  inputs = distributed_training_utils.flatten_perdevice_values(strategy, inputs)
-  targets = distributed_training_utils.flatten_perdevice_values(
-      strategy, targets)
-  if mode == 'predict':
-    sample_weights = []
-    targets = []
-  else:
-    sample_weights = [
-        None for _ in range(len(model.outputs) * strategy.num_replicas_in_sync)
-    ]
-  ins = inputs + targets + sample_weights
-  if mode == 'train' and not isinstance(K.symbolic_learning_phase(), int):
-    ins += [True]
-  return ins
-
-
-def _copy_weights_to_distributed_model(original_model, grouped_model):
-  """Copies weights from original model to distributed models."""
-  strategy = original_model._distribution_strategy
-  if strategy:
-    # Copy the weights from the original model to each of the replicated
-    # models.
-    orig_model_weights = original_model.get_weights()
-    distributed_model = strategy.unwrap(grouped_model)[0]
-    distributed_training_utils.set_weights(strategy, distributed_model,
-                                           orig_model_weights)
-
-
-def _copy_weights_to_original_model(model, grouped_model, mode):
-  """Copies weights from first distributed model back to original model."""
-  if model._distribution_strategy and mode == 'train':
-    updated_weights = model._distribution_strategy.unwrap(
-        grouped_model)[0].get_weights()
-    model.set_weights(updated_weights)
-
-
-def _per_device_aggregate_batch(batch_outs, model, mode):
-  """Aggregates the per-device batch-level outputs from a distributed step."""
-  if model._distribution_strategy is not None and mode == 'predict':
-    total_batch_outs = []
-    for i in range(len(model.outputs)):
-      num_replicas = model._distribution_strategy.num_replicas_in_sync
-      nested_outs = batch_outs[i * num_replicas:i * num_replicas + num_replicas]
-      total_batch_outs.append(np.concatenate(nest.flatten(nested_outs)))
-    return total_batch_outs
-  return batch_outs
-
-
-def _reset_metrics(model, distributed_model=None):
-  if model._distribution_strategy:
-    distributed_model = (
-        distributed_model or
-        model._distribution_strategy.unwrap(model._grouped_model)[0])
-    distributed_model.reset_metrics()
+  return prediction_result
diff --git a/tensorflow/python/keras/engine/training_eager.py b/tensorflow/python/keras/engine/training_eager.py
index 895db5bc633669641b0493b8bfb918094f312513..6fdb19c523dd1cd2dd13c1ae7549e406fcf95856 100644
--- a/tensorflow/python/keras/engine/training_eager.py
+++ b/tensorflow/python/keras/engine/training_eager.py
@@ -21,16 +21,17 @@ from __future__ import print_function
 
 import collections
 
+import numpy as np
+
 from tensorflow.python.eager.backprop import GradientTape
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras import backend
-from tensorflow.python.keras import losses as losses_module
 from tensorflow.python.keras.engine import training_utils
-from tensorflow.python.keras.utils import generic_utils
-from tensorflow.python.keras.utils.losses_utils import squeeze_or_expand_dimensions
+from tensorflow.python.keras.utils import losses_utils
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import nest
 
 
 def _eager_loss_fn(outputs, targets, loss_fn, output_name):
@@ -59,8 +60,8 @@ def _eager_metrics_fn(model,
   Returns:
       Returns the metric results for each output of the model.
   """
-  outputs = generic_utils.to_list(outputs)
-  targets = generic_utils.to_list(targets)
+  outputs = nest.flatten(outputs)
+  targets = nest.flatten(targets)
   # TODO(psv): Consider supporting skip target indices in eager mode?
   metric_results = model._handle_metrics(
       outputs,
@@ -95,6 +96,10 @@ def _model_loss(model,
      regularization losses and applies masking and sample weighting
      to the loss value.
   """
+  # Used to keep track of the total loss value (stateless).
+  # eg., total_loss = loss_weight_1 * output_1_loss_fn(...) +
+  #                   loss_weight_2 * output_2_loss_fn(...) +
+  #                   layer losses.
   total_loss = 0
   kwargs = {}
   if model._expects_training_arg:
@@ -102,61 +107,77 @@ def _model_loss(model,
   if len(inputs) == 1 and not isinstance(inputs, dict):
     inputs = inputs[0]
 
-  if model._compute_output_and_mask_jointly:
-    outs, masks = model._call_and_compute_mask(inputs, **kwargs)
-    masks = generic_utils.to_list(masks)
-  else:
-    outs = model.call(inputs, **kwargs)
-    masks = None
+  # Allow mixed `NumPy` and `EagerTensor` input here.
+  if any(
+      isinstance(input_t, (np.ndarray, float, int))
+      for input_t in nest.flatten(inputs)):
+    inputs = nest.map_structure(ops.convert_to_tensor, inputs)
+
+  outs = model(inputs, **kwargs)
+
+  outs = nest.flatten(outs)
+  # `None` by default for `EagerTensors`.
+  masks = [t._keras_mask for t in outs]
+  targets = nest.flatten(targets)
 
-  outs = generic_utils.to_list(outs)
-  if masks is None:
-    masks = [None for _ in outs]
-  targets = generic_utils.to_list(targets)
+  # Used to keep track of individual output losses (stateless).
+  output_losses = []
+  # Used to keep track of individual output losses (stateful).
+  aggregated_output_losses = []
 
-  loss_metrics = []
-  aggregated_loss_metrics = []
   with backend.name_scope('loss'):
     for i, loss_fn in enumerate(model.loss_functions):
-      if sample_weights:
-        weights = sample_weights[i]
-      else:
-        weights = None
+      weights = sample_weights[i] if sample_weights else None
       mask = masks[i]
       with backend.name_scope(model.output_names[i] + '_loss'):
-        if isinstance(loss_fn, losses_module.Loss):
-          if mask is not None:
-            mask = math_ops.cast(mask, outs[i].dtype)
-            # Update weights with mask.
-            if weights is None:
-              weights = mask
-            else:
-              # Update dimensions of weights to match with mask if possible.
-              mask, _, weights = squeeze_or_expand_dimensions(
-                  mask, None, weights)
-              weights *= mask
-          output_loss = loss_fn(targets[i], outs[i], sample_weight=weights)
+        if mask is not None:
+          mask = math_ops.cast(mask, outs[i].dtype)
+          # Update weights with mask.
+          if weights is None:
+            weights = mask
+          else:
+            # Update dimensions of weights to match with mask if possible.
+            mask, _, weights = (
+                losses_utils.squeeze_or_expand_dimensions(mask, None, weights))
+            weights *= mask
+
+        # Reset reduction on the loss so that we can get the per sample loss
+        # value. We use this to get both the stateless and stateful loss
+        # values without having to compute the underlying loss function
+        # twice.
+        weighted_losses = None
+        if hasattr(loss_fn, 'reduction'):
+          current_loss_reduction = loss_fn.reduction
+          loss_fn.reduction = losses_utils.ReductionV2.NONE
+          weighted_losses = loss_fn(targets[i], outs[i], sample_weight=weights)
+          loss_fn.reduction = current_loss_reduction
+
+          # Compute the stateless loss value.
+          output_loss = losses_utils.reduce_weighted_loss(weighted_losses)
         else:
-          weighted_masked_fn = training_utils.weighted_masked_objective(loss_fn)
-          output_loss = weighted_masked_fn(
-              targets[i], outs[i], weights, mask=mask)
+          # Compute the stateless loss value for a custom loss class.
+          # Here we assume that the class takes care of loss reduction
+          # because if this class returns a vector value we cannot
+          # differentiate between use case where a custom optimizer
+          # expects a vector loss value vs unreduced per-sample loss value.
+          output_loss = loss_fn(targets[i], outs[i], sample_weight=weights)
 
       # If the number of outputs is 1 then we don't append the loss metric
       # associated with each model output. When there are multiple outputs
       # associated with a model, each output's loss is calculated and returned
       # as part of the loss_metrics.
       if len(model.outputs) > 1:
-        loss_metrics.append(backend.mean(output_loss))
-
+        output_losses.append(backend.mean(output_loss))
         if output_loss_metrics is not None:
-          # Keep track of the stateful loss result.
-          aggregated_loss_metrics.append(
-              training_utils.call_metric_function(
-                  output_loss_metrics[i],
-                  targets[i],
-                  outs[i],
-                  weights=weights,
-                  mask=mask))
+          # Compute the stateful loss value.
+          if weighted_losses is not None:
+            aggregated_output_loss = output_loss_metrics[i](weighted_losses)
+          else:
+            # Custom loss class.
+            aggregated_output_loss = training_utils.call_metric_function(
+                output_loss_metrics[i], targets[i], outs[i], weights=weights)
+          # Keep track of the stateful output loss result.
+          aggregated_output_losses.append(aggregated_output_loss)
 
       loss_weight = model.loss_weights_list[i]
       if total_loss is None:
@@ -168,10 +189,11 @@ def _model_loss(model,
     # Add regularization losses
     custom_losses = model.losses
     if custom_losses:
-      total_loss += math_ops.add_n(custom_losses)
+      total_loss += losses_utils.scale_loss_for_distribution(
+          math_ops.add_n(custom_losses))
     model._clear_losses()
 
-  return outs, total_loss, loss_metrics, aggregated_loss_metrics, masks
+  return outs, total_loss, output_losses, aggregated_output_losses, masks
 
 
 def _process_single_batch(model,
@@ -202,17 +224,17 @@ def _process_single_batch(model,
   Raises:
       ValueError: If the model has no loss to optimize.
   """
-  with backend.learning_phase_scope(1 if training else 0):
+  with backend.eager_learning_phase_scope(1 if training else 0):
     with GradientTape() as tape:
-      outs, loss, loss_metrics, aggregated_loss_metrics, masks\
-        = _model_loss(
-            model,
-            inputs,
-            targets,
-            output_loss_metrics=output_loss_metrics,
-            sample_weights=sample_weights,
-            training=training)
-      if loss is None:
+      outs, total_loss, output_losses, aggregated_output_losses, masks = (
+          _model_loss(
+              model,
+              inputs,
+              targets,
+              output_loss_metrics=output_loss_metrics,
+              sample_weights=sample_weights,
+              training=training))
+      if total_loss is None:
         raise ValueError('The model cannot be run '
                          'because it has no loss to optimize.')
     if training:
@@ -221,13 +243,18 @@ def _process_single_batch(model,
                         ' you are not setting model.trainable to False before '
                         'compiling the model.')
       else:
-        grads = tape.gradient(loss, model._collected_trainable_weights)
+        grads = tape.gradient(total_loss, model._collected_trainable_weights)
         model.optimizer.apply_gradients(zip(grads,
                                             model._collected_trainable_weights))
-    return outs, loss, loss_metrics, aggregated_loss_metrics, masks
+    return outs, total_loss, output_losses, aggregated_output_losses, masks
 
 
-def train_on_batch(model, inputs, targets, sample_weights=None):
+def train_on_batch(model,
+                   inputs,
+                   targets,
+                   sample_weights=None,
+                   reset_metrics=True,
+                   output_loss_metrics=None):
   """Calculates the loss and gradient updates for one input batch.
 
   Arguments:
@@ -235,6 +262,11 @@ def train_on_batch(model, inputs, targets, sample_weights=None):
       inputs: Input batch data.
       targets: Target batch data.
       sample_weights: Sample weight batch data.
+      reset_metrics: If `True`, the metrics returned will be only for this
+        batch. If `False`, the metrics will be statefully accumulated across
+        batches.
+      output_loss_metrics: List of metrics that are used to aggregated output
+        loss values.
 
   Returns:
       total loss and the loss associated with each output.
@@ -244,20 +276,24 @@ def train_on_batch(model, inputs, targets, sample_weights=None):
       inputs = training_utils.cast_if_floating_dtype(inputs)
       targets = training_utils.cast_if_floating_dtype(targets)
     else:
-      inputs = training_utils.cast_if_floating_dtype([
-          ops.convert_to_tensor(val) for val in inputs
-      ])
-      targets = training_utils.cast_if_floating_dtype([
-          ops.convert_to_tensor(val) for val in targets
-      ])
+      inputs = training_utils.cast_if_floating_dtype(
+          [ops.convert_to_tensor(val) for val in inputs])
+      targets = training_utils.cast_if_floating_dtype(
+          [ops.convert_to_tensor(val) for val in targets])
   if sample_weights:
     sample_weights = [
         training_utils.cast_if_floating_dtype(ops.convert_to_tensor(val))
         if val is not None else None for val in sample_weights
     ]
 
-  outs, loss, loss_metrics, _, masks = _process_single_batch(
-      model, inputs, targets, sample_weights=sample_weights, training=True)
+  outs, total_loss, output_losses, aggregated_output_losses, masks = (
+      _process_single_batch(
+          model,
+          inputs,
+          targets,
+          sample_weights=sample_weights,
+          training=True,
+          output_loss_metrics=output_loss_metrics))
   if not isinstance(outs, list):
     outs = [outs]
   metrics_results = _eager_metrics_fn(
@@ -266,16 +302,23 @@ def train_on_batch(model, inputs, targets, sample_weights=None):
       targets,
       sample_weights=sample_weights,
       masks=masks,
-      return_stateful_result=True)
-  loss = generic_utils.to_list(loss)
+      return_stateful_result=not reset_metrics)
+  total_loss = nest.flatten(total_loss)
+  if reset_metrics:
+    final_output_losses = output_losses
+  else:
+    final_output_losses = aggregated_output_losses
+  results = total_loss + final_output_losses + metrics_results
 
-  return [
-      tensor_util.constant_value(v)
-      for v in loss + loss_metrics + metrics_results
-  ]
+  return [tensor_util.constant_value(v) for v in results]
 
 
-def test_on_batch(model, inputs, targets, sample_weights=None):
+def test_on_batch(model,
+                  inputs,
+                  targets,
+                  sample_weights=None,
+                  reset_metrics=True,
+                  output_loss_metrics=None):
   """Calculates the loss for one input batch.
 
   Arguments:
@@ -283,6 +326,11 @@ def test_on_batch(model, inputs, targets, sample_weights=None):
       inputs: Input batch data.
       targets: Target batch data.
       sample_weights: Sample weight batch data.
+      reset_metrics: If `True`, the metrics returned will be only for this
+        batch. If `False`, the metrics will be statefully accumulated across
+        batches.
+      output_loss_metrics: List of metrics that are used to aggregated output
+        loss values.
 
   Returns:
       total loss, loss and metrics associated with each output.
@@ -292,19 +340,23 @@ def test_on_batch(model, inputs, targets, sample_weights=None):
       inputs = training_utils.cast_if_floating_dtype(inputs)
       targets = training_utils.cast_if_floating_dtype(targets)
     else:
-      inputs = training_utils.cast_if_floating_dtype([
-          ops.convert_to_tensor(val) for val in inputs
-      ])
-      targets = training_utils.cast_if_floating_dtype([
-          ops.convert_to_tensor(val) for val in targets
-      ])
+      inputs = training_utils.cast_if_floating_dtype(
+          [ops.convert_to_tensor(val) for val in inputs])
+      targets = training_utils.cast_if_floating_dtype(
+          [ops.convert_to_tensor(val) for val in targets])
   if sample_weights:
     sample_weights = [
         training_utils.cast_if_floating_dtype(ops.convert_to_tensor(val))
         if val is not None else None for val in sample_weights
     ]
-  outs, loss, loss_metrics, _, masks = _model_loss(
-      model, inputs, targets, sample_weights=sample_weights, training=False)
+  outs, total_loss, output_losses, aggregated_output_losses, masks = (
+      _model_loss(
+          model,
+          inputs,
+          targets,
+          sample_weights=sample_weights,
+          training=False,
+          output_loss_metrics=output_loss_metrics))
   if not isinstance(outs, list):
     outs = [outs]
   metrics_results = _eager_metrics_fn(
@@ -313,10 +365,12 @@ def test_on_batch(model, inputs, targets, sample_weights=None):
       targets,
       sample_weights=sample_weights,
       masks=masks,
-      return_stateful_result=True)
-  loss = generic_utils.to_list(loss)
+      return_stateful_result=not reset_metrics)
+  total_loss = nest.flatten(total_loss)
+  if reset_metrics:
+    final_output_losses = output_losses
+  else:
+    final_output_losses = aggregated_output_losses
+  results = total_loss + final_output_losses + metrics_results
 
-  return [
-      tensor_util.constant_value(v)
-      for v in loss + loss_metrics + metrics_results
-  ]
+  return [tensor_util.constant_value(v) for v in results]
diff --git a/tensorflow/python/keras/engine/training_eager_test.py b/tensorflow/python/keras/engine/training_eager_test.py
index 3fabbb17edc05138c57bf61c16a94c6647813963..84f1fa0efcba08c227cc6eb4e3e2ad4623c7adc9 100644
--- a/tensorflow/python/keras/engine/training_eager_test.py
+++ b/tensorflow/python/keras/engine/training_eager_test.py
@@ -24,25 +24,34 @@ from tensorflow.python import keras
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import metrics as metrics_module
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.keras.optimizer_v2 import rmsprop
+from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
-from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
 
-class TrainingTest(test.TestCase):
+class TrainingTest(keras_parameterized.TestCase):
 
+  @keras_parameterized.run_with_all_model_types(exclude_models='sequential')
+  @keras_parameterized.run_all_keras_modes
   def test_model_methods_with_eager_tensors_multi_io(self):
-    a = keras.layers.Input(shape=(3,), name='input_a')
-    b = keras.layers.Input(shape=(3,), name='input_b')
+    if not context.executing_eagerly():
+      # Only test V2 Function and V2 Eager modes, as V1 Graph mode with
+      # symbolic tensors has different requirements.
+      return
+
+    input_a = keras.layers.Input(shape=(3,), name='input_a')
+    input_b = keras.layers.Input(shape=(3,), name='input_b')
 
     dense = keras.layers.Dense(4, name='dense')
-    c = dense(a)
-    d = dense(b)
-    e = keras.layers.Dropout(0.5, name='dropout')(c)
+    dropout = keras.layers.Dropout(0.5, name='dropout')
 
-    model = keras.models.Model([a, b], [d, e])
+    model = testing_utils.get_multi_io_model(
+        [input_a, dense], [input_b, dense, dropout])
 
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    optimizer = rmsprop.RMSprop(learning_rate=0.001)
     loss = 'mse'
     loss_weights = [1., 0.5]
     metrics = ['mae', metrics_module.CategoricalAccuracy()]
@@ -51,71 +60,80 @@ class TrainingTest(test.TestCase):
         loss,
         metrics=metrics,
         loss_weights=loss_weights,
-        run_eagerly=True,
+        run_eagerly=testing_utils.should_run_eagerly(),
         sample_weight_mode=None)
 
-    input_a = keras.backend.zeros(shape=(10, 3))
-    input_b = keras.backend.zeros(shape=(10, 3))
-    target_d = keras.backend.zeros(shape=(10, 4))
-    target_e = keras.backend.zeros(shape=(10, 4))
+    input_a = array_ops.zeros(shape=(10, 3))
+    input_b = array_ops.zeros(shape=(10, 3))
+    target_a = array_ops.zeros(shape=(10, 4))
+    target_b = array_ops.zeros(shape=(10, 4))
 
     model.fit(
-        [input_a, input_b], [target_d, target_e],
+        [input_a, input_b], [target_a, target_b],
         epochs=1,
         batch_size=5,
         verbose=0)
     # Test: no shuffle.
     model.fit(
-        [input_a, input_b], [target_d, target_e],
+        [input_a, input_b], [target_a, target_b],
         epochs=1,
         batch_size=5,
         verbose=0,
         shuffle=False)
     # Test: validation data.
-    model.fit([input_a, input_b], [target_d, target_e],
+    model.fit([input_a, input_b], [target_a, target_b],
               epochs=1, batch_size=2, verbose=0,
-              validation_data=([input_a, input_b], [target_d, target_e]))
-    model.train_on_batch([input_a, input_b], [target_d, target_e])
+              validation_data=([input_a, input_b], [target_a, target_b]))
+    model.train_on_batch([input_a, input_b], [target_a, target_b])
     model.predict([input_a, input_b], batch_size=5)
-    model.evaluate([input_a, input_b], [target_d, target_e],
+    model.evaluate([input_a, input_b], [target_a, target_b],
                    batch_size=2, verbose=0)
-    model.test_on_batch([input_a, input_b], [target_d, target_e])
+    model.test_on_batch([input_a, input_b], [target_a, target_b])
 
     # Test: mix np and tensors.
     input_b = np.zeros(shape=(10, 3)).astype('float32')
-    target_e = np.zeros(shape=(10, 4)).astype('float32')
+    target_b = np.zeros(shape=(10, 4)).astype('float32')
     model.fit(
-        [input_a, input_b], [target_d, target_e],
+        [input_a, input_b], [target_a, target_b],
         epochs=1,
         batch_size=5,
         verbose=0)
-    model.fit([input_a, input_b], [target_d, target_e],
+    model.fit([input_a, input_b], [target_a, target_b],
               epochs=1, batch_size=2, verbose=0,
-              validation_data=([input_a, input_b], [target_d, target_e]))
+              validation_data=([input_a, input_b], [target_a, target_b]))
     model.fit(
-        [input_a, input_b], [target_d, target_e],
+        [input_a, input_b], [target_a, target_b],
         epochs=1,
         batch_size=5,
         verbose=0,
         shuffle=False)
-    model.train_on_batch([input_a, input_b], [target_d, target_e])
+    model.train_on_batch([input_a, input_b], [target_a, target_b])
     model.predict([input_a, input_b], batch_size=5)
-    model.evaluate([input_a, input_b], [target_d, target_e],
+    model.evaluate([input_a, input_b], [target_a, target_b],
                    batch_size=2, verbose=0)
-    model.test_on_batch([input_a, input_b], [target_d, target_e])
+    model.test_on_batch([input_a, input_b], [target_a, target_b])
 
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
   def test_model_methods_with_eager_tensors_single_io(self):
-    x = keras.layers.Input(shape=(3,), name='input')
-    y = keras.layers.Dense(4, name='dense')(x)
-    model = keras.Model(x, y)
+    if not context.executing_eagerly():
+      # Only test V2 Function and V2 Eager modes, as V1 Graph mode with
+      # symbolic tensors has different requirements.
+      return
 
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    model = testing_utils.get_small_mlp(10, 4, 3)
+
+    optimizer = rmsprop.RMSprop(learning_rate=0.001)
     loss = 'mse'
     metrics = ['mae', metrics_module.CategoricalAccuracy()]
-    model.compile(optimizer, loss, metrics=metrics, run_eagerly=True)
+    model.compile(
+        optimizer,
+        loss,
+        metrics=metrics,
+        run_eagerly=testing_utils.should_run_eagerly())
 
-    inputs = keras.backend.zeros(shape=(10, 3))
-    targets = keras.backend.zeros(shape=(10, 4))
+    inputs = array_ops.zeros(shape=(10, 3))
+    targets = array_ops.zeros(shape=(10, 4))
 
     model.fit(inputs, targets, epochs=1, batch_size=2, verbose=0)
     model.fit(inputs, targets, epochs=1, batch_size=3, verbose=0, shuffle=False)
@@ -126,32 +144,32 @@ class TrainingTest(test.TestCase):
     model.train_on_batch(inputs, targets)
     model.test_on_batch(inputs, targets)
 
+  @keras_parameterized.run_with_all_model_types
   def test_model_fit_and_validation_with_missing_arg_errors(self):
-    x = keras.layers.Input(shape=(3,), name='input')
-    y = keras.layers.Dense(4, name='dense')(x)
-    model = keras.Model(x, y)
-    model.compile(optimizer=RMSPropOptimizer(learning_rate=0.001),
+    model = testing_utils.get_small_mlp(10, 4, 3)
+    model.compile(optimizer=rmsprop.RMSprop(learning_rate=0.001),
                   loss='mse',
                   run_eagerly=True)
 
-    x = keras.backend.zeros(shape=(10, 3))
-    y = keras.backend.zeros(shape=(10, 4))
+    x = array_ops.zeros(shape=(10, 3))
+    y = array_ops.zeros(shape=(10, 4))
     dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).repeat(10).batch(5)
     iterator = dataset_ops.make_one_shot_iterator(dataset)
     validation_dataset = dataset_ops.Dataset.from_tensor_slices(
-        (x, y)).repeat(10).batch(5)
+        (x, y)).repeat().batch(5)  # Infinite dataset.
     validation_iterator = dataset_ops.make_one_shot_iterator(validation_dataset)
 
     with self.assertRaisesRegexp(
         ValueError, r'specify .* `steps_per_epoch`'):
       model.fit(iterator, epochs=1, verbose=0)
     if not context.executing_eagerly():
-      # In eager execution, `keras.backend.zeros` returns value tensors
+      # In eager execution, `array_ops.zeros` returns value tensors
       # which can be used for validation without a `validation_steps` argument.
       with self.assertRaisesRegexp(
           ValueError, r'provide either `batch_size` or `validation_steps`'):
         model.fit(iterator, steps_per_epoch=2, epochs=1, verbose=0,
                   validation_data=(x, y))
+    # Step argument is required for infinite datasets.
     with self.assertRaisesRegexp(ValueError,
                                  'specify the `validation_steps` argument.'):
       model.fit(iterator, steps_per_epoch=2, epochs=1, verbose=0,
@@ -161,10 +179,12 @@ class TrainingTest(test.TestCase):
       model.fit(iterator, steps_per_epoch=2, epochs=1, verbose=0,
                 validation_data=validation_iterator)
 
+  # TODO(b/120931266): Enable test on subclassed models after bug causing an
+  # extra dimension to be added to predict outputs is fixed.
+  @keras_parameterized.run_with_all_model_types(exclude_models='subclass')
   def test_generator_methods(self):
-    model = keras.Sequential()
-    model.add(keras.layers.Dense(4, input_shape=(3,)))
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    model = testing_utils.get_small_mlp(10, 4, 3)
+    optimizer = rmsprop.RMSprop(learning_rate=0.001)
     model.compile(
         optimizer,
         loss='mse',
@@ -189,41 +209,41 @@ class TrainingTest(test.TestCase):
     self.assertEqual(out.shape, (30, 4))
 
 
-class CorrectnessTest(test.TestCase):
+class CorrectnessTest(keras_parameterized.TestCase):
 
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
   def test_loss_correctness(self):
     # Test that training loss is the same in eager and graph
     # (by comparing it to a reference value in a deterministic case)
-    model = keras.Sequential()
-    model.add(keras.layers.Dense(3,
-                                 activation='relu',
-                                 input_dim=4,
-                                 kernel_initializer='ones'))
-    model.add(keras.layers.Dense(2,
-                                 activation='softmax',
-                                 kernel_initializer='ones'))
+    layers = [
+        keras.layers.Dense(3, activation='relu',
+                           kernel_initializer='ones'),
+        keras.layers.Dense(2, activation='softmax', kernel_initializer='ones')]
+    model = testing_utils.get_model_from_layers(layers, input_shape=(4,))
     model.compile(loss='sparse_categorical_crossentropy',
-                  optimizer=RMSPropOptimizer(learning_rate=0.001),
-                  run_eagerly=False)
+                  optimizer=rmsprop.RMSprop(learning_rate=0.001),
+                  run_eagerly=testing_utils.should_run_eagerly())
     x = np.ones((100, 4))
     np.random.seed(123)
     y = np.random.randint(0, 1, size=(100, 1))
     history = model.fit(x, y, epochs=1, batch_size=10)
-    self.assertAlmostEqual(history.history['loss'][-1], 0.6173, 4)
+    self.assertAlmostEqual(history.history['loss'][-1], 0.5836, 4)
 
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
   def test_loss_correctness_with_iterator(self):
     # Test that training loss is the same in eager and graph
     # (by comparing it to a reference value in a deterministic case)
-    model = keras.Sequential()
-    model.add(
-        keras.layers.Dense(
-            3, activation='relu', input_dim=4, kernel_initializer='ones'))
-    model.add(
-        keras.layers.Dense(2, activation='softmax', kernel_initializer='ones'))
+    layers = [
+        keras.layers.Dense(3, activation='relu',
+                           kernel_initializer='ones'),
+        keras.layers.Dense(2, activation='softmax', kernel_initializer='ones')]
+    model = testing_utils.get_model_from_layers(layers, input_shape=(4,))
     model.compile(
         loss='sparse_categorical_crossentropy',
-        optimizer=RMSPropOptimizer(learning_rate=0.001),
-        run_eagerly=True)
+        optimizer=rmsprop.RMSprop(learning_rate=0.001),
+        run_eagerly=testing_utils.should_run_eagerly())
     x = np.ones((100, 4), dtype=np.float32)
     np.random.seed(123)
     y = np.random.randint(0, 1, size=(100, 1))
@@ -232,7 +252,7 @@ class CorrectnessTest(test.TestCase):
     dataset = dataset.batch(10)
     iterator = dataset_ops.make_one_shot_iterator(dataset)
     history = model.fit(iterator, epochs=1, steps_per_epoch=10)
-    self.assertAlmostEqual(history.history['loss'][-1], 0.6173, 4)
+    self.assertAlmostEqual(history.history['loss'][-1], 0.5836, 4)
 
   def test_loss_in_call(self):
 
diff --git a/tensorflow/python/keras/engine/training_generator.py b/tensorflow/python/keras/engine/training_generator.py
index 0abf0b8270915a37f1d59803cacd11bdf9abe132..a9fdb0721fe497abf86d957063fba5d4de986db9 100644
--- a/tensorflow/python/keras/engine/training_generator.py
+++ b/tensorflow/python/keras/engine/training_generator.py
@@ -33,6 +33,7 @@ from tensorflow.python.keras import callbacks as cbks
 from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.utils import data_utils
 from tensorflow.python.keras.utils import generic_utils
+from tensorflow.python.keras.utils.mode_keys import ModeKeys
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
 
@@ -45,16 +46,18 @@ def model_iteration(model,
                     callbacks=None,
                     validation_data=None,
                     validation_steps=None,
+                    validation_freq=1,
                     class_weight=None,
                     max_queue_size=10,
                     workers=1,
                     use_multiprocessing=False,
-                    shuffle=True,
+                    shuffle=False,
                     initial_epoch=0,
-                    mode='train',
+                    mode=ModeKeys.TRAIN,
                     batch_size=None,
+                    steps_name='steps',
                     **kwargs):
-  """Loop function for arrays of data with modes 'train'/'test'/'predict'.
+  """Loop function for arrays of data with modes TRAIN/TEST/PREDICT.
 
   Arguments:
       model: Keras Model instance.
@@ -72,6 +75,13 @@ def model_iteration(model,
         `keras.utils.data_utils.Sequence` object or Eager Iterator or Dataset.
       validation_steps: Total number of steps (batches of samples) before
         declaring validation finished.
+      validation_freq: Only relevant if validation data is provided. Integer or
+        `collections.Container` instance (e.g. list, tuple, etc.). If an
+        integer, specifies how many training epochs to run before a new
+        validation run is performed, e.g. `validation_freq=2` runs
+        validation every 2 epochs. If a Container, specifies the epochs on
+        which to run validation, e.g. `validation_freq=[1, 2, 10]` runs
+        validation at the end of the 1st, 2nd, and 10th epochs.
       class_weight: Dictionary mapping class indices to a weight for the class.
       max_queue_size: Integer. Maximum size for the generator queue. If
         unspecified, `max_queue_size` will default to 10.
@@ -89,16 +99,19 @@ def model_iteration(model,
         `None`.
       initial_epoch: Epoch at which to start training (useful for resuming a
         previous training run).
-      mode: One of 'train'/'test'/'predict'.
+      mode: One of ModeKeys.TRAIN/ModeKeys.TEST/ModeKeys.PREDICT.
       batch_size: Integer batch size or None if unknown. Will only be used if
         `data` is in NumPy/Tensor format.
+      steps_name: The string name of the steps argument, either `steps`,
+        `validation_steps`, or `steps_per_epoch`. Only used for error message
+        formatting.
       **kwargs: Additional arguments for backwards compatibility. `steps` is
         accepted as an alias for `steps_per_epoch`.
 
   Returns:
-      - In 'train' mode: `History` object.
-      - In 'test' mode: Evaluation metrics.
-      - In 'predict' mode: Outputs of the Model called on inputs.
+      - In TRAIN mode: `History` object.
+      - In TEST mode: Evaluation metrics.
+      - In PREDICT mode: Outputs of the Model called on inputs.
 
   Raises:
       ValueError: in case of invalid arguments.
@@ -106,6 +119,18 @@ def model_iteration(model,
   if 'steps' in kwargs:
     steps_per_epoch = kwargs['steps']
 
+  # Determine the number of steps per epoch and whether we should reset the
+  # dataset at the end of each epoch.
+  reset_dataset_after_each_epoch = False
+  original_dataset = None
+  is_dataset = isinstance(data, (dataset_ops.DatasetV2, dataset_ops.DatasetV1))
+  if is_dataset:
+    original_dataset = data
+    if steps_per_epoch is None:
+      reset_dataset_after_each_epoch = True
+      steps_per_epoch = training_utils.infer_steps_for_dataset(
+          data, steps_per_epoch, epochs=epochs, steps_name=steps_name)
+
   # Convert to a format that supports `next(generator)`.
   generator, steps_per_epoch = convert_to_generator_like(
       data,
@@ -115,9 +140,8 @@ def model_iteration(model,
       shuffle=shuffle)
 
   do_validation = validation_data is not None
-  should_set_learning_phase = context.executing_eagerly() and model.run_eagerly
   is_sequence = isinstance(generator, data_utils.Sequence)
-  _validate_arguments(is_sequence, use_multiprocessing, workers,
+  _validate_arguments(is_sequence, is_dataset, use_multiprocessing, workers,
                       steps_per_epoch, validation_data, validation_steps, mode,
                       kwargs)
 
@@ -125,12 +149,14 @@ def model_iteration(model,
       model, mode, class_weight=class_weight)
 
   # Create the queue for the generator.
-  output_generator, enqueuer = _make_enqueued_generator(
-      generator,
-      workers=workers,
-      use_multiprocessing=use_multiprocessing,
-      max_queue_size=max_queue_size,
-      shuffle=shuffle)
+  enqueuer = None
+  if not is_dataset:
+    generator, enqueuer = _make_enqueued_generator(
+        generator,
+        workers=workers,
+        use_multiprocessing=use_multiprocessing,
+        max_queue_size=max_queue_size,
+        shuffle=shuffle)
 
   num_samples_or_steps, use_steps = _get_num_samples_or_steps(
       data, steps_per_epoch)
@@ -151,14 +177,15 @@ def model_iteration(model,
   progbar.params = callbacks.params
   progbar.params['verbose'] = verbose
 
-  if mode == 'predict':
+  if mode == ModeKeys.PREDICT:
     aggregator = training_utils.OutputsAggregator(True, steps_per_epoch)
   else:
     aggregator = training_utils.MetricsAggregator(True, steps_per_epoch)
 
+  should_set_learning_phase = context.executing_eagerly() and model.run_eagerly
   if should_set_learning_phase:
     old_learning_phase = backend.learning_phase()
-    backend.set_learning_phase(1 if mode == 'train' else 0)
+    backend.set_eager_learning_phase(1 if mode == ModeKeys.TRAIN else 0)
 
   callbacks.model.stop_training = False
   callbacks._call_begin_hook(mode)
@@ -170,13 +197,50 @@ def model_iteration(model,
     # Setup work for each epoch.
     model.reset_metrics()
     epoch_logs = {}
-    callbacks.on_epoch_begin(epoch, epoch_logs, mode=mode)
+    if mode == ModeKeys.TRAIN:
+      callbacks.on_epoch_begin(epoch, epoch_logs)
     progbar.on_epoch_begin(epoch, epoch_logs)
 
-    for step in range(steps_per_epoch):
-      batch_data = _get_next_batch(output_generator, mode)
+    if steps_per_epoch is None:
+      # Loop over dataset until `OutOfRangeError` is raised.
+      target_steps = np.inf
+    else:
+      # Loop over dataset for the specified number of steps.
+      target_steps = steps_per_epoch
+
+    step = 0
+    while step < target_steps:
+      batch_data = _get_next_batch(generator, mode)
       if batch_data is None:
-        callbacks.model.stop_training = True
+        if is_dataset:
+          # The dataset passed by the user ran out of batches.
+          # Now we know the cardinality of the dataset.
+          # If steps_per_epoch was specified, then running out of data is
+          # unexpected, so we stop training and inform the user.
+          if steps_per_epoch:
+            callbacks.model.stop_training = True
+            logging.warning(
+                'Your dataset ran out of data; interrupting training. '
+                'Make sure that your dataset can generate at least '
+                '`%s * epochs` batches (in this case, %d batches). '
+                'You may need to use the repeat() function when '
+                'building your dataset.'
+                % (steps_name, steps_per_epoch * epochs))
+          elif step > 0:
+            steps_per_epoch = step
+            aggregator.num_samples_or_steps = steps_per_epoch
+            progbar.params['steps'] = steps_per_epoch
+            progbar.progbar.target = steps_per_epoch
+        else:
+          # We ran out of batches while the user passed an iterator (legacy).
+          callbacks.model.stop_training = True
+          logging.warning(
+              'Your dataset iterator ran out of data; '
+              'interrupting training. Make sure that your iterator '
+              'can generate at least `%s * epochs` '
+              'batches (in this case, %d batches). You may need to'
+              'use the repeat() function when building your '
+              'dataset.' % (steps_name, steps_per_epoch * epochs))
         break
 
       # `batch_size` used for validation data if validation
@@ -188,31 +252,53 @@ def model_iteration(model,
       callbacks._call_batch_hook(mode, 'begin', step, batch_logs)
       progbar.on_batch_begin(step, batch_logs)
 
+      is_deferred = not model._is_compiled
       batch_outs = batch_function(*batch_data)
       if not isinstance(batch_outs, list):
         batch_outs = [batch_outs]
 
-      # Aggregate results.
       if step == 0:
         aggregator.create(batch_outs)
+
+        if is_deferred:
+          # Set callbacks params. We do this here when model is compiled only
+          # in the first iteration of this loop (deferred build scenario).
+          cbks.set_callback_parameters(
+              callbacks,
+              model,
+              do_validation=do_validation,
+              batch_size=batch_size,
+              epochs=epochs,
+              steps_per_epoch=steps_per_epoch,
+              samples=num_samples_or_steps,
+              verbose=verbose,
+              mode=mode)
+
+          progbar.params = callbacks.params
+          progbar.params['verbose'] = verbose
+
+      # Aggregate results.
       aggregator.aggregate(batch_outs)
 
       # Callbacks batch end.
-      batch_logs.update(training_utils.make_logs(model, batch_outs, mode))
+      batch_logs = cbks.make_logs(model, batch_logs, batch_outs, mode)
       callbacks._call_batch_hook(mode, 'end', step, batch_logs)
       progbar.on_batch_end(step, batch_logs)
+      step += 1
 
       if callbacks.model.stop_training:
         break
 
     aggregator.finalize()
     results = aggregator.results
-    epoch_logs.update(training_utils.make_logs(model, results, mode))
+    epoch_logs = cbks.make_logs(model, epoch_logs, results, mode)
     if len(results) == 1:
       results = results[0]
 
     # Run the test loop every epoch during training.
-    if do_validation and not callbacks.model.stop_training:
+    if (do_validation and
+        training_utils.should_run_validation(validation_freq, epoch) and
+        not callbacks.model.stop_training):
       val_results = model_iteration(
           model,
           validation_data,
@@ -222,44 +308,54 @@ def model_iteration(model,
           workers=workers,
           use_multiprocessing=use_multiprocessing,
           max_queue_size=max_queue_size,
-          mode='test')
+          callbacks=callbacks,
+          verbose=0,
+          mode=ModeKeys.TEST,
+          steps_name='validation_steps')
 
       if not isinstance(val_results, list):
         val_results = [val_results]
-      epoch_logs.update(
-          training_utils.make_logs(model, val_results, mode, prefix='val_'))
+      epoch_logs = cbks.make_logs(
+          model, epoch_logs, val_results, mode, prefix='val_')
 
-    callbacks.on_epoch_end(epoch, epoch_logs, mode=mode)
+    if mode == ModeKeys.TRAIN:
+      # Epochs only apply to `fit`.
+      callbacks.on_epoch_end(epoch, epoch_logs)
     progbar.on_epoch_end(epoch, epoch_logs)
+
+    # Recreate dataset iterator for the next epoch.
+    if reset_dataset_after_each_epoch and epoch < epochs - 1:
+      generator = dataset_ops.make_one_shot_iterator(original_dataset)
+
   callbacks._call_end_hook(mode)
 
   if enqueuer is not None:
     enqueuer.stop()
 
   if should_set_learning_phase:
-    backend.set_learning_phase(old_learning_phase)
+    backend.set_eager_learning_phase(old_learning_phase)
 
-  if mode == 'train':
+  if mode == ModeKeys.TRAIN:
     return model.history
   return results
 
 
 # Maintain compatibility with the existing names.
-fit_generator = functools.partial(model_iteration, mode='train')
-evaluate_generator = functools.partial(model_iteration, mode='test')
-predict_generator = functools.partial(model_iteration, mode='predict')
+fit_generator = functools.partial(model_iteration, mode=ModeKeys.TRAIN)
+evaluate_generator = functools.partial(
+    model_iteration, mode=ModeKeys.TEST, shuffle=False)
+predict_generator = functools.partial(
+    model_iteration, mode=ModeKeys.PREDICT, shuffle=False)
 
 
-def _get_next_batch(output_generator, mode):
+def _get_next_batch(generator, mode):
   """Retrieves the next batch of input data."""
   try:
-    generator_output = next(output_generator)
-  except (errors.OutOfRangeError, StopIteration):
-    # Returning `None` will trigger looping to stop.
-    logging.warning('Your dataset iterator ran out of data.')
+    generator_output = next(generator)
+  except (StopIteration, errors.OutOfRangeError):
     return None
   if not isinstance(generator_output, tuple):
-    if mode == 'predict':
+    if mode == ModeKeys.PREDICT:
       # Always wrap in a tuple.
       return (generator_output,)
     else:
@@ -274,7 +370,7 @@ def _get_next_batch(output_generator, mode):
   return generator_output
 
 
-def _validate_arguments(is_sequence, use_multiprocessing, workers,
+def _validate_arguments(is_sequence, is_dataset, use_multiprocessing, workers,
                         steps_per_epoch, validation_data, validation_steps,
                         mode, kwargs):
   """Raises errors if arguments are invalid.
@@ -282,6 +378,7 @@ def _validate_arguments(is_sequence, use_multiprocessing, workers,
   Arguments:
     is_sequence: Boolean, whether data is a `keras.utils.data_utils.Sequence`
       instance.
+    is_dataset: Boolean, whether data is a dataset instance.
     use_multiprocessing: Boolean. If `True`, use process-based threading. If
       unspecified, `use_multiprocessing` will default to `False`. Note that
       because this implementation relies on multiprocessing, you should not pass
@@ -298,7 +395,7 @@ def _validate_arguments(is_sequence, use_multiprocessing, workers,
       `keras.utils.data_utils.Sequence` object or Eager Iterator or Dataset.
     validation_steps: Total number of steps (batches of samples) before
       declaring validation finished.
-    mode: One of 'train'/'test'/'predict'.
+    mode: One of ModeKeys.TRAIN/ModeKeys.TEST/ModeKeys.PREDICT.
     kwargs: Additional arguments for backwards compatibility.
 
   Raises:
@@ -313,15 +410,14 @@ def _validate_arguments(is_sequence, use_multiprocessing, workers,
                     ' Please consider using the `keras.utils.Sequence`'
                     ' class.'))
 
-  if steps_per_epoch is None:
-    arg_name = 'steps_per_epoch' if mode == 'train' else 'steps'
+  if steps_per_epoch is None and not is_dataset:
+    arg_name = 'steps_per_epoch' if mode == ModeKeys.TRAIN else 'steps'
     raise ValueError('Please specify the number of steps via the '
                      '`{}` argument.'.format(arg_name))
 
   val_gen = (
       data_utils.is_generator_or_sequence(validation_data) or
-      isinstance(validation_data, iterator_ops.EagerIterator) or
-      isinstance(validation_data, dataset_ops.DatasetV2))
+      isinstance(validation_data, iterator_ops.EagerIterator))
   if (val_gen and not isinstance(validation_data, data_utils.Sequence) and
       not validation_steps):
     raise ValueError('Please specify the `validation_steps` argument.')
@@ -345,7 +441,9 @@ def convert_to_generator_like(data,
       and may be `None` or `[None]`.
     batch_size: Used when creating a generator out of tuples of NumPy arrays or
       EagerTensors.
-    steps_per_epoch: Steps of the generator to run each epoch.
+    steps_per_epoch: Steps of the generator to run each epoch. If `None` the
+      number of steps will be read from the data (for
+      `keras.utils.data_utils.Sequence` types).
     epochs: Total number of epochs to run.
     shuffle: Whether the data should be shuffled.
 
@@ -366,7 +464,8 @@ def convert_to_generator_like(data,
   if data_utils.is_generator_or_sequence(data) or isinstance(
       data, iterator_ops.EagerIterator):
     if isinstance(data, data_utils.Sequence):
-      steps_per_epoch = len(data)
+      if steps_per_epoch is None:
+        steps_per_epoch = len(data)
     return data, steps_per_epoch
   if isinstance(data, dataset_ops.DatasetV2):
     return dataset_ops.make_one_shot_iterator(data), steps_per_epoch
@@ -420,13 +519,9 @@ def _make_enqueued_generator(generator,
 
 def _make_execution_function(model, mode, class_weight=None):
   """Makes function to run one step of model execution."""
-  if mode == 'train':
-    if not context.executing_eagerly():
-      model._make_fit_function()
+  if mode == ModeKeys.TRAIN:
     f = functools.partial(model.train_on_batch, class_weight=class_weight)
-  elif mode == 'test':
-    if not context.executing_eagerly():
-      model._make_eval_function()
+  elif mode == ModeKeys.TEST:
     f = model.test_on_batch
   else:
     # Match signature of other modes to allow
@@ -437,7 +532,7 @@ def _make_execution_function(model, mode, class_weight=None):
     f = predict_on_batch
 
   # Maintain stateful metrics across batch-level calls.
-  if mode != 'predict':
+  if mode != ModeKeys.PREDICT:
     f = functools.partial(f, reset_metrics=False)
 
   return f
diff --git a/tensorflow/python/keras/engine/training_generator_test.py b/tensorflow/python/keras/engine/training_generator_test.py
index 8941428e43ac5d7b4b439d86795e93a70fd270f0..6b754c18b3d45a66fd704a64e01b425d854d3329 100644
--- a/tensorflow/python/keras/engine/training_generator_test.py
+++ b/tensorflow/python/keras/engine/training_generator_test.py
@@ -29,11 +29,12 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.eager import context
 from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.engine import training_generator
+from tensorflow.python.keras.optimizer_v2 import rmsprop
 from tensorflow.python.platform import test
-from tensorflow.python.training.rmsprop import RMSPropOptimizer
 from tensorflow.python.util import nest
 
 
@@ -60,23 +61,19 @@ def custom_generator(mode=2):
       yield x, y, w
 
 
-@tf_test_util.run_all_in_graph_and_eager_modes
-class TestGeneratorMethods(test.TestCase, parameterized.TestCase):
+class TestGeneratorMethods(keras_parameterized.TestCase):
 
   @unittest.skipIf(
       os.name == 'nt',
       'use_multiprocessing=True does not work on windows properly.')
-  @parameterized.parameters('sequential', 'functional')
-  def test_fit_generator_method(self, model_type):
-    if model_type == 'sequential':
-      model = testing_utils.get_small_sequential_mlp(
-          num_hidden=3, num_classes=4, input_dim=2)
-    else:
-      model = testing_utils.get_small_functional_mlp(
-          num_hidden=3, num_classes=4, input_dim=2)
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_fit_generator_method(self):
+    model = testing_utils.get_small_mlp(
+        num_hidden=3, num_classes=4, input_dim=2)
     model.compile(
         loss='mse',
-        optimizer='sgd',
+        optimizer=rmsprop.RMSprop(1e-3),
         metrics=['mae', metrics_module.CategoricalAccuracy()])
 
     model.fit_generator(custom_generator(),
@@ -109,19 +106,16 @@ class TestGeneratorMethods(test.TestCase, parameterized.TestCase):
   @unittest.skipIf(
       os.name == 'nt',
       'use_multiprocessing=True does not work on windows properly.')
-  @parameterized.parameters('sequential', 'functional')
-  def test_evaluate_generator_method(self, model_type):
-    if model_type == 'sequential':
-      model = testing_utils.get_small_sequential_mlp(
-          num_hidden=3, num_classes=4, input_dim=2)
-    else:
-      model = testing_utils.get_small_functional_mlp(
-          num_hidden=3, num_classes=4, input_dim=2)
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_evaluate_generator_method(self):
+    model = testing_utils.get_small_mlp(
+        num_hidden=3, num_classes=4, input_dim=2)
     model.compile(
         loss='mse',
-        optimizer='sgd',
-        metrics=['mae', metrics_module.CategoricalAccuracy()])
-    model.summary()
+        optimizer=rmsprop.RMSprop(1e-3),
+        metrics=['mae', metrics_module.CategoricalAccuracy()],
+        run_eagerly=testing_utils.should_run_eagerly())
 
     model.evaluate_generator(custom_generator(),
                              steps=5,
@@ -142,18 +136,12 @@ class TestGeneratorMethods(test.TestCase, parameterized.TestCase):
   @unittest.skipIf(
       os.name == 'nt',
       'use_multiprocessing=True does not work on windows properly.')
-  @parameterized.parameters('sequential', 'functional')
-  def test_predict_generator_method(self, model_type):
-    if model_type == 'sequential':
-      model = testing_utils.get_small_sequential_mlp(
-          num_hidden=3, num_classes=4, input_dim=2)
-    else:
-      model = testing_utils.get_small_functional_mlp(
-          num_hidden=3, num_classes=4, input_dim=2)
-    model.compile(
-        loss='mse',
-        optimizer='sgd',
-        metrics=['mae', metrics_module.CategoricalAccuracy()])
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_predict_generator_method(self):
+    model = testing_utils.get_small_mlp(
+        num_hidden=3, num_classes=4, input_dim=2)
+    model.run_eagerly = testing_utils.should_run_eagerly()
 
     model.predict_generator(custom_generator(),
                             steps=5,
@@ -183,13 +171,16 @@ class TestGeneratorMethods(test.TestCase, parameterized.TestCase):
                             max_queue_size=10,
                             workers=0)
 
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
   def test_generator_methods_with_sample_weights(self):
-    model = keras.models.Sequential()
-    model.add(keras.layers.Dense(4, input_shape=(2,)))
+    model = testing_utils.get_small_mlp(
+        num_hidden=3, num_classes=4, input_dim=2)
     model.compile(
         loss='mse',
-        optimizer='sgd',
-        metrics=['mae', metrics_module.CategoricalAccuracy()])
+        optimizer=rmsprop.RMSprop(1e-3),
+        metrics=['mae', metrics_module.CategoricalAccuracy()],
+        run_eagerly=testing_utils.should_run_eagerly())
 
     model.fit_generator(custom_generator(mode=3),
                         steps_per_epoch=5,
@@ -214,15 +205,18 @@ class TestGeneratorMethods(test.TestCase, parameterized.TestCase):
                              max_queue_size=10,
                              use_multiprocessing=False)
 
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
   def test_generator_methods_invalid_use_case(self):
 
     def invalid_generator():
       while 1:
         yield 0
 
-    model = keras.models.Sequential()
-    model.add(keras.layers.Dense(4, input_shape=(2,)))
-    model.compile(loss='mse', optimizer='sgd')
+    model = testing_utils.get_small_mlp(
+        num_hidden=3, num_classes=4, input_dim=2)
+    model.compile(loss='mse', optimizer=rmsprop.RMSprop(1e-3),
+                  run_eagerly=testing_utils.should_run_eagerly())
 
     with self.assertRaises(ValueError):
       model.fit_generator(invalid_generator(),
@@ -251,6 +245,8 @@ class TestGeneratorMethods(test.TestCase, parameterized.TestCase):
                                max_queue_size=10,
                                use_multiprocessing=False)
 
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
   def test_generator_input_to_fit_eval_predict(self):
     val_data = np.ones([10, 10], np.float32), np.ones([10, 1], np.float32)
 
@@ -258,12 +254,11 @@ class TestGeneratorMethods(test.TestCase, parameterized.TestCase):
       while True:
         yield np.ones([10, 10], np.float32), np.ones([10, 1], np.float32)
 
-    inputs = keras.layers.Input(shape=(10,))
-    x = keras.layers.Dense(10, activation='relu')(inputs)
-    outputs = keras.layers.Dense(1, activation='sigmoid')(x)
-    model = keras.Model(inputs, outputs)
+    model = testing_utils.get_small_mlp(
+        num_hidden=10, num_classes=1, input_dim=10)
 
-    model.compile(RMSPropOptimizer(0.001), 'binary_crossentropy')
+    model.compile(rmsprop.RMSprop(0.001), 'binary_crossentropy',
+                  run_eagerly=testing_utils.should_run_eagerly())
     model.fit(
         ones_generator(),
         steps_per_epoch=2,
@@ -273,9 +268,10 @@ class TestGeneratorMethods(test.TestCase, parameterized.TestCase):
     model.predict(ones_generator(), steps=2)
 
 
-@tf_test_util.run_all_in_graph_and_eager_modes
-class TestGeneratorMethodsWithSequences(test.TestCase):
+class TestGeneratorMethodsWithSequences(keras_parameterized.TestCase):
 
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
   def test_training_with_sequences(self):
 
     class DummySequence(keras.utils.Sequence):
@@ -286,9 +282,9 @@ class TestGeneratorMethodsWithSequences(test.TestCase):
       def __len__(self):
         return 10
 
-    model = keras.models.Sequential()
-    model.add(keras.layers.Dense(4, input_shape=(2,)))
-    model.compile(loss='mse', optimizer='sgd')
+    model = testing_utils.get_small_mlp(
+        num_hidden=3, num_classes=4, input_dim=2)
+    model.compile(loss='mse', optimizer=rmsprop.RMSprop(1e-3))
 
     model.fit_generator(DummySequence(),
                         steps_per_epoch=10,
@@ -305,6 +301,8 @@ class TestGeneratorMethodsWithSequences(test.TestCase):
                         workers=0,
                         use_multiprocessing=False)
 
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
   def test_sequence_input_to_fit_eval_predict(self):
     val_data = np.ones([10, 10], np.float32), np.ones([10, 1], np.float32)
 
@@ -316,12 +314,10 @@ class TestGeneratorMethodsWithSequences(test.TestCase):
       def __len__(self):
         return 2
 
-    inputs = keras.layers.Input(shape=(10,))
-    x = keras.layers.Dense(10, activation='relu')(inputs)
-    outputs = keras.layers.Dense(1, activation='sigmoid')(x)
-    model = keras.Model(inputs, outputs)
+    model = testing_utils.get_small_mlp(
+        num_hidden=10, num_classes=1, input_dim=10)
 
-    model.compile(RMSPropOptimizer(0.001), 'binary_crossentropy')
+    model.compile(rmsprop.RMSprop(0.001), 'binary_crossentropy')
     model.fit(CustomSequence(), validation_data=val_data, epochs=2)
     model.evaluate(CustomSequence())
     model.predict(CustomSequence())
diff --git a/tensorflow/python/keras/engine/training_gpu_test.py b/tensorflow/python/keras/engine/training_gpu_test.py
index 45dcfe43995b280072395b11a573e20d57bcadc7..ddc947339dd8f68a7c85eefb48860f9f65b1fad2 100644
--- a/tensorflow/python/keras/engine/training_gpu_test.py
+++ b/tensorflow/python/keras/engine/training_gpu_test.py
@@ -25,7 +25,6 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras.layers.convolutional import Conv2D
 from tensorflow.python.platform import test
-from tensorflow.python.training import rmsprop
 
 
 class TrainingGPUTest(test.TestCase):
@@ -65,7 +64,7 @@ class TrainingGPUTest(test.TestCase):
                            bias_initializer='ones')(input_tensor)
       simple_model = keras.models.Model(inputs=input_tensor,
                                         outputs=predictions)
-      simple_model.compile(optimizer=rmsprop.RMSPropOptimizer(1e-3), loss=loss)
+      simple_model.compile(optimizer='rmsprop', loss=loss)
       return simple_model
 
     if test.is_gpu_available(cuda_only=True):
diff --git a/tensorflow/python/keras/engine/training_test.py b/tensorflow/python/keras/engine/training_test.py
index 91a0c7cc2f2dc5cf3e76eafdaaa79cfe6bc10336..c83e669e9a568b3ef32cd2742ed4e9bb3399b959 100644
--- a/tensorflow/python/keras/engine/training_test.py
+++ b/tensorflow/python/keras/engine/training_test.py
@@ -22,10 +22,12 @@ import io
 import logging
 import sys
 
+from absl.testing import parameterized
 import numpy as np
 import six
 
 from tensorflow.python import keras
+from tensorflow.python import tf2
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function
@@ -33,9 +35,11 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import losses
 from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.callbacks import Callback
+from tensorflow.python.keras.engine.training_utils import set_run_eagerly_for_dict_structure
 from tensorflow.python.keras.engine.training_utils import weighted_masked_objective
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
@@ -43,6 +47,7 @@ from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training.adam import AdamOptimizer
 from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
 try:
@@ -51,6 +56,129 @@ except ImportError:
   scipy_sparse = None
 
 
+class CompileTest(keras_parameterized.TestCase):
+
+  def _get_multi_output_model(self):
+    input_a = keras.layers.Input(shape=(3,), name='input_a')
+    output_a = keras.layers.Dense(1, name='dense_1')(input_a)
+    output_b = keras.layers.Dense(1, name='dense_2')(input_a)
+    return keras.models.Model(input_a, [output_a, output_b])
+
+  def _do_test_compile_with_model_and_single_loss(self, model, loss):
+    model.compile(optimizer='adam', loss=loss)
+    self.assertEqual(model.loss, loss)
+
+    loss = losses.get(loss)
+    if not isinstance(loss, list):
+      loss_list = [loss] * len(model.outputs)
+
+    self.assertEqual(len(model.loss_functions), len(loss_list))
+    for i in range(len(loss_list)):
+      self.assertIsInstance(model.loss_functions[i], losses.LossFunctionWrapper)
+      if not isinstance(loss_list[i], losses.LossFunctionWrapper):
+        self.assertEqual(model.loss_functions[i].fn, loss_list[i])
+    self.assertAllEqual(model.loss_weights_list, [1.] * len(loss_list))
+
+  @keras_parameterized.run_all_keras_modes
+  @parameterized.named_parameters(('loss_string', 'mse'),
+                                  ('loss_function', losses.mean_squared_error),
+                                  ('loss_instance', losses.MeanSquaredError()))
+  def test_compile_with_single_output(self, loss):
+    model = testing_utils.get_small_sequential_mlp(
+        num_hidden=10, num_classes=2, input_dim=3)
+    self._do_test_compile_with_model_and_single_loss(model, loss)
+
+  @keras_parameterized.run_all_keras_modes
+  @parameterized.named_parameters(('loss_string', 'mse'),
+                                  ('loss_function', losses.mean_squared_error),
+                                  ('loss_instance', losses.MeanSquaredError()))
+  def test_compile_with_multi_output(self, loss):
+    model = self._get_multi_output_model()
+    self._do_test_compile_with_model_and_single_loss(model, loss)
+
+  @keras_parameterized.run_all_keras_modes
+  def test_compile_with_multi_output_and_multi_loss(self):
+    model = self._get_multi_output_model()
+    # Test loss is a list.
+    loss = ['mse', 'mae']
+    model.compile(optimizer='adam', loss=loss)
+    self.assertEqual(model.loss_functions[0].fn, losses.mean_squared_error)
+    self.assertEqual(model.loss_functions[1].fn, losses.mean_absolute_error)
+    self.assertAllEqual(model.loss_weights_list, [1., 1.])
+
+    # Test loss is a dict.
+    loss = {'dense_1': 'mae', 'dense_2': 'mse'}
+    model.compile(optimizer='adam', loss=loss)
+    self.assertEqual(model.loss_functions[0].fn, losses.mean_absolute_error)
+    self.assertEqual(model.loss_functions[1].fn, losses.mean_squared_error)
+    self.assertAllEqual(model.loss_weights_list, [1., 1.])
+
+  @keras_parameterized.run_all_keras_modes
+  def test_compile_with_multi_output_and_loss_weights_list(self):
+    model = self._get_multi_output_model()
+    loss_weights = [1., 2.]
+    model.compile(optimizer='adam', loss='mse', loss_weights=loss_weights)
+    self.assertAllEqual(model.loss_weights_list, [1., 2.])
+
+  def test_compile_with_multi_output_and_loss_weights_dict(self):
+    with context.graph_mode():
+      model = self._get_multi_output_model()
+      loss_weights = {'dense_1': 1., 'dense_2': 2.}
+      model.compile(optimizer='adam', loss='mse', loss_weights=loss_weights)
+      self.assertAllEqual(model.loss_weights_list, [1., 2.])
+
+      input_np = np.random.random((10, 3))
+      output_a_np = np.random.random((10, 1))
+      output_b_np = np.random.random((10, 1))
+
+      with self.cached_session() as sess:
+        sess.run(variables_lib.global_variables_initializer())
+        total_loss, y_preds = sess.run(
+            [model.total_loss, model.outputs],
+            feed_dict={
+                'input_a:0': input_np,
+                'dense_1_target:0': output_a_np,
+                'dense_2_target:0': output_b_np
+            })
+        self.assertAllClose(
+            total_loss,
+            np.mean(
+                np.add((output_a_np - y_preds[0])**2,
+                       2 * (output_b_np - y_preds[1])**2)))
+
+  @keras_parameterized.run_all_keras_modes
+  def test_compile_with_incorrect_loss_size(self):
+    model = testing_utils.get_small_sequential_mlp(
+        num_hidden=10, num_classes=2, input_dim=3)
+    with self.assertRaisesRegexp(ValueError, 'The model has 1 outputs'):
+      model.compile(optimizer='adam', loss=['mse', 'mae'])
+
+  @keras_parameterized.run_all_keras_modes
+  def test_compile_with_incorrect_loss_key(self):
+    model = testing_utils.get_small_sequential_mlp(
+        num_hidden=10, num_classes=2, input_dim=3)
+    with self.assertRaisesRegexp(
+        ValueError, 'Unknown entry in loss dictionary: unknown_output'):
+      model.compile(optimizer='adam', loss={'unknown_output': 'mse'})
+
+  @keras_parameterized.run_all_keras_modes
+  def test_compile_with_incorrect_loss_weights_size(self):
+    model = testing_utils.get_small_sequential_mlp(
+        num_hidden=10, num_classes=2, input_dim=3)
+    with self.assertRaisesRegexp(ValueError,
+                                 'it should have one entry per model output'):
+      model.compile(optimizer='adam', loss='mse', loss_weights=[1., 2.])
+
+  @keras_parameterized.run_all_keras_modes
+  def test_compile_with_incorrect_loss_weights_key(self):
+    model = testing_utils.get_small_sequential_mlp(
+        num_hidden=10, num_classes=2, input_dim=3)
+    with self.assertRaisesRegexp(
+        ValueError, 'Unknown entry in loss_weights dictionary: unknown_output'):
+      model.compile(
+          optimizer='adam', loss='mse', loss_weights={'unknown_output': 1.})
+
+
 class TrainingTest(keras_parameterized.TestCase):
 
   @keras_parameterized.run_with_all_model_types(exclude_models='sequential')
@@ -250,8 +378,10 @@ class TrainingTest(keras_parameterized.TestCase):
                   run_eagerly=testing_utils.should_run_eagerly())
     # This will work
     model.fit([input_a_np], output_d_np, epochs=1)
-    with self.assertRaises(ValueError):
-      model.fit([input_a_np, input_a_np], output_d_np, epochs=1)
+    # TODO(gsundeep) Test only works in eager, file ticket
+    if testing_utils.should_run_eagerly() and context.executing_eagerly():
+      with self.assertRaises(ValueError):
+        model.fit([input_a_np, input_a_np], output_d_np, epochs=1)
 
     # Test model on a list of floats
     input_a_np = np.random.random((10, 3))
@@ -346,16 +476,21 @@ class TrainingTest(keras_parameterized.TestCase):
     self.assertEqual(len(out), 2)
 
   @keras_parameterized.run_all_keras_modes
+  @keras_parameterized.run_with_all_model_types
   def test_activity_regularizer_fit(self):
     loss = {}
     for reg in [None, 'l2']:
-      inputs = keras.layers.Input(shape=(10,))
-      x = keras.layers.Dense(
-          10, activation='relu', activity_regularizer=reg,
-          kernel_initializer='ones', use_bias=False)(inputs)
-      outputs = keras.layers.Dense(1, activation='sigmoid',
-                                   kernel_initializer='ones', use_bias=False)(x)
-      model = keras.Model(inputs, outputs)
+      layers = [
+          keras.layers.Dense(
+              10, activation='relu', activity_regularizer=reg,
+              kernel_initializer='ones', use_bias=False),
+          keras.layers.Dense(
+              1, activation='sigmoid', kernel_initializer='ones',
+              use_bias=False),
+      ]
+
+      model = testing_utils.get_model_from_layers(
+          layers, input_shape=(10,))
 
       x = np.ones((10, 10), 'float32')
       y = np.ones((10, 1), 'float32')
@@ -368,15 +503,14 @@ class TrainingTest(keras_parameterized.TestCase):
     self.assertLess(loss[None], loss['l2'])
 
   @keras_parameterized.run_all_keras_modes
+  @keras_parameterized.run_with_all_model_types
   def test_activity_regularizer_loss_value(self):
-    inputs = keras.layers.Input(shape=(10,))
-    outputs = keras.layers.Dense(
-        1,
-        kernel_initializer=keras.initializers.zeros(),
-        bias_initializer=keras.initializers.ones(),
-        activity_regularizer='l2')(
-            inputs)
-    model = keras.Model(inputs, outputs)
+    layer = keras.layers.Dense(
+        1, kernel_initializer=keras.initializers.zeros(),
+        bias_initializer=keras.initializers.ones(), activity_regularizer='l2')
+
+    model = testing_utils.get_model_from_layers([layer], input_shape=(10,))
+
     x = np.ones((10, 10), 'float32')
     y = np.ones((10, 1), 'float32')
     optimizer = RMSPropOptimizer(learning_rate=0.001)
@@ -471,7 +605,6 @@ class TrainingTest(keras_parameterized.TestCase):
         metrics=['accuracy'],
         run_eagerly=testing_utils.should_run_eagerly())
 
-  @tf_test_util.run_v1_only('b/120545219')
   def test_that_trainable_disables_updates(self):
     val_a = np.random.random((10, 4))
     val_out = np.random.random((10, 4))
@@ -558,14 +691,17 @@ class TrainingTest(keras_parameterized.TestCase):
           validation_data=(x_train, y_train))
       self.assertEqual(test_callback.batch_end_call_count, 10)
       self.assertEqual(test_callback.epoch_end_call_count, 2)
+
+      weighted_metric = ('mae'
+                         if tf2.enabled() else 'weighted_mean_absolute_error')
       self.assertSetEqual(
           set(test_callback.batch_end_logs.keys()),
-          set(['batch', 'size', 'acc', 'loss', 'weighted_mean_absolute_error']))
+          set(['batch', 'size', 'acc', 'loss', weighted_metric]))
       self.assertSetEqual(
           set(test_callback.epoch_end_logs.keys()),
           set([
-              'acc', 'loss', 'weighted_mean_absolute_error', 'val_acc',
-              'val_loss', 'val_weighted_mean_absolute_error'
+              'acc', 'loss', weighted_metric, 'val_acc', 'val_loss',
+              'val_' + weighted_metric
           ]))
 
   @keras_parameterized.run_all_keras_modes
@@ -731,6 +867,137 @@ class TrainingTest(keras_parameterized.TestCase):
     self.assertAllEqual([[6], [8], [10], [12]],
                         model.predict(dataset_two, steps=2))
 
+  def test_training_on_sparse_categorical_crossentropy_loss_with_softmax(self):
+    with context.eager_mode():
+      np.random.seed(1337)
+      train_x = np.ones((100, 4))
+      train_y = np.random.randint(0, 1, size=(100, 1))
+
+      reference_model = testing_utils.get_small_sequential_mlp(16, 2,
+                                                               input_dim=4)
+      reference_model.compile(loss='sparse_categorical_crossentropy',
+                              optimizer=RMSPropOptimizer(learning_rate=0.001),
+                              run_eagerly=True)
+      fixed_weights = reference_model.get_weights()
+      reference_model_loss = reference_model.train_on_batch(train_x, train_y)
+
+      test_model = testing_utils.get_small_sequential_mlp(16, 2, input_dim=4)
+      test_model.compile(loss='sparse_categorical_crossentropy',
+                         optimizer=RMSPropOptimizer(learning_rate=0.001),
+                         run_eagerly=False)
+      test_model.set_weights(fixed_weights)
+      test_model_loss = test_model.train_on_batch(train_x, train_y)
+      self.assertAlmostEqual(test_model_loss, reference_model_loss, places=4)
+
+  def test_training_on_categorical_crossentropy_loss_with_softmax(self):
+    with context.eager_mode():
+      np.random.seed(1337)
+      train_x = np.ones((100, 4))
+      train_y = keras.utils.to_categorical(np.random.randint(0, 1,
+                                                             size=(100, 1)), 2)
+
+      reference_model = testing_utils.get_small_sequential_mlp(16, 2,
+                                                               input_dim=4)
+      reference_model.compile(loss='categorical_crossentropy',
+                              optimizer=RMSPropOptimizer(learning_rate=0.001),
+                              run_eagerly=True)
+      fixed_weights = reference_model.get_weights()
+      reference_model_loss = reference_model.train_on_batch(train_x, train_y)
+
+      test_model = testing_utils.get_small_sequential_mlp(16, 2, input_dim=4)
+      test_model.compile(loss='categorical_crossentropy',
+                         optimizer=RMSPropOptimizer(learning_rate=0.001),
+                         run_eagerly=False)
+      test_model.set_weights(fixed_weights)
+      test_model_loss = test_model.train_on_batch(train_x, train_y)
+      self.assertAlmostEqual(test_model_loss, reference_model_loss, places=4)
+
+  def test_training_on_binary_crossentropy_loss(self):
+    with context.eager_mode():
+      train_x = np.ones((100, 4), dtype=np.float32)
+      train_y = np.ones((100, 1), dtype=np.float32)
+      reference_model = testing_utils.get_small_sequential_mlp(16, 1,
+                                                               input_dim=4)
+      reference_model.compile(loss='binary_crossentropy',
+                              optimizer=RMSPropOptimizer(learning_rate=0.001),
+                              run_eagerly=True)
+      fixed_weights = reference_model.get_weights()
+      reference_model_loss = reference_model.train_on_batch(train_x, train_y)
+
+      test_model = testing_utils.get_small_sequential_mlp(16, 1, input_dim=4)
+      test_model.compile(loss='binary_crossentropy',
+                         optimizer=RMSPropOptimizer(learning_rate=0.001),
+                         run_eagerly=False)
+      test_model.set_weights(fixed_weights)
+      test_model_loss = test_model.train_on_batch(train_x, train_y)
+      self.assertAlmostEqual(test_model_loss, reference_model_loss, places=4)
+
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  @parameterized.named_parameters(
+      ('default', 1, 4), ('integer_two', 2, 2), ('integer_four', 4, 1),
+      ('simple_list', [1, 3, 4], 3), ('duplicated_list', [4, 2, 2], 2))
+  def test_validation_freq(self, validation_freq, expected_runs):
+    x, y = np.ones((10, 10)), np.ones((10, 1))
+    model = testing_utils.get_small_mlp(2, 1, 10)
+    model.compile('sgd', 'mse')
+
+    class ValCounter(keras.callbacks.Callback):
+
+      def __init__(self):
+        self.val_runs = 0
+
+      def on_test_begin(self, logs=None):
+        self.val_runs += 1
+
+    val_counter = ValCounter()
+    model.fit(
+        x,
+        y,
+        epochs=4,
+        validation_data=(x, y),
+        validation_freq=validation_freq,
+        callbacks=[val_counter])
+    self.assertEqual(val_counter.val_runs, expected_runs)
+
+  @keras_parameterized.run_all_keras_modes
+  def test_add_loss_correctness(self):
+    if testing_utils.should_run_eagerly():
+      self.skipTest('b/124303407')
+
+    class Bias(keras.layers.Layer):
+
+      def build(self, input_shape):
+        self.bias = self.add_variable('bias', (1,), initializer='zeros')
+
+      def call(self, inputs):
+        return inputs + self.bias
+
+    inputs = keras.Input(shape=(1,))
+    outputs = Bias()(inputs)
+    model = keras.Model(inputs, outputs)
+    targets = keras.Input(shape=(1,))
+
+    model.add_loss(
+        math_ops.reduce_mean(
+            keras.losses.mean_absolute_error(targets, outputs)))
+
+    # If we want to use the loss class instance as shown below, we will need to
+    # add graph scope as the reduction logic involves some eager mode checks.
+    with keras.backend.get_graph().as_default():
+      model.add_loss(keras.losses.MeanAbsoluteError()(targets, outputs))
+
+    model.compile(
+        keras.optimizer_v2.gradient_descent.SGD(0.033333),
+        loss=keras.losses.MeanAbsoluteError(),
+        target_tensors=[targets],
+        run_eagerly=testing_utils.should_run_eagerly())
+
+    x = np.array([[0.], [1.], [2.]])
+    y = np.array([[0.5], [2.], [3.5]])
+    history = model.fit(x, y, batch_size=3, epochs=5)
+    self.assertAllClose(history.history['loss'], [3., 2.7, 2.4, 2.1, 1.8], 1e-3)
+
 
 class TestExceptionsAndWarnings(keras_parameterized.TestCase):
 
@@ -784,21 +1051,21 @@ class TestExceptionsAndWarnings(keras_parameterized.TestCase):
                 'dense_1': metrics_module.CategoricalAccuracy(),
             },
             run_eagerly=testing_utils.should_run_eagerly())
-        msg = ('Output "dense_1" missing from loss dictionary. We assume this '
+        msg = ('Output dense_1 missing from loss dictionary. We assume this '
                'was done on purpose. The fit and evaluate APIs will not be '
-               'expecting any data to be passed to "dense_1".')
+               'expecting any data to be passed to dense_1.')
         self.assertRegexpMatches(str(mock_log.call_args), msg)
 
 
 class LossWeightingTest(keras_parameterized.TestCase):
 
   @keras_parameterized.run_all_keras_modes
-  # TODO(b/120562577): Test failing with assertion error.
-  def DISABLED_test_class_weights(self):
+  def test_class_weights(self):
     num_classes = 5
     batch_size = 5
-    epochs = 5
+    epochs = 10
     weighted_class = 3
+    weight = 10.
     train_samples = 1000
     test_samples = 1000
     input_dim = 5
@@ -827,10 +1094,7 @@ class LossWeightingTest(keras_parameterized.TestCase):
     test_ids = np.where(int_y_test == np.array(weighted_class))[0]
 
     class_weight = dict([(i, 1.) for i in range(num_classes)])
-    class_weight[weighted_class] = 2.
-
-    sample_weight = np.ones((y_train.shape[0]))
-    sample_weight[int_y_train == weighted_class] = 2.
+    class_weight[weighted_class] = weight
 
     model.fit(
         x_train,
@@ -839,7 +1103,7 @@ class LossWeightingTest(keras_parameterized.TestCase):
         epochs=epochs // 3,
         verbose=0,
         class_weight=class_weight,
-        validation_data=(x_train, y_train, sample_weight))
+        validation_data=(x_train, y_train))
     model.fit(
         x_train,
         y_train,
@@ -864,12 +1128,12 @@ class LossWeightingTest(keras_parameterized.TestCase):
     self.assertLess(score[0], ref_score[0])
 
   @keras_parameterized.run_all_keras_modes
-  @tf_test_util.run_v1_only('b/120545219')
   def test_sample_weights(self):
     num_classes = 5
     batch_size = 5
-    epochs = 5
+    epochs = 10
     weighted_class = 3
+    weight = 10.
     train_samples = 1000
     test_samples = 1000
     input_dim = 5
@@ -898,7 +1162,7 @@ class LossWeightingTest(keras_parameterized.TestCase):
     test_ids = np.where(int_y_test == np.array(weighted_class))[0]
 
     sample_weight = np.ones((y_train.shape[0]))
-    sample_weight[int_y_train == weighted_class] = 2.
+    sample_weight[int_y_train == weighted_class] = weight
 
     model.fit(
         x_train,
@@ -931,44 +1195,12 @@ class LossWeightingTest(keras_parameterized.TestCase):
       self.assertLess(score[0], ref_score[0])
 
   @keras_parameterized.run_all_keras_modes
-  def test_warning_for_concurrent_sample_and_class_weights(self):
-
-    model = keras.models.Sequential()
-    model.add(keras.layers.Dense(10, input_shape=(3,)))
-    model.compile(
-        loss='mse',
-        optimizer=RMSPropOptimizer(learning_rate=0.01),
-        run_eagerly=testing_utils.should_run_eagerly())
-    x_train = np.random.random((10, 3))
-    y_train = np.random.random((10, 10))
-    sample_weight = np.ones((y_train.shape[0]))
-    class_weight = {0: 1., 1: 1.}
-
-    with test.mock.patch.object(logging, 'warning') as mock_log:
-      model.fit(
-          x_train,
-          y_train,
-          epochs=1,
-          verbose=0,
-          sample_weight=sample_weight,
-          class_weight=class_weight)
-      msg = 'The `class_weight` argument will be ignored.'
-
-      msg_found = False
-      for call_args in mock_log.call_args_list:
-        if msg in str(call_args):
-          msg_found = True
-
-      self.assertTrue(msg_found)
-
-  @keras_parameterized.run_all_keras_modes
-  @tf_test_util.run_v1_only('b/120545219')
-  # TODO(b/120562577): Test failing with assertion error.
-  def DISABLED_test_temporal_sample_weights(self):
+  def test_temporal_sample_weights(self):
     num_classes = 5
     batch_size = 5
-    epochs = 5
+    epochs = 10
     weighted_class = 3
+    weight = 10.
     train_samples = 1000
     test_samples = 1000
     input_dim = 5
@@ -997,7 +1229,7 @@ class LossWeightingTest(keras_parameterized.TestCase):
       test_ids = np.where(int_y_test == np.array(weighted_class))[0]
 
       sample_weight = np.ones((y_train.shape[0]))
-      sample_weight[int_y_train == weighted_class] = 2.
+      sample_weight[int_y_train == weighted_class] = weight
 
       temporal_x_train = np.reshape(x_train, (len(x_train), 1,
                                               x_train.shape[1]))
@@ -1018,7 +1250,7 @@ class LossWeightingTest(keras_parameterized.TestCase):
 
       model.compile(
           RMSPropOptimizer(learning_rate=learning_rate),
-          loss='binary_crossentropy',
+          loss='categorical_crossentropy',
           metrics=['acc', metrics_module.CategoricalAccuracy()],
           weighted_metrics=['mae', metrics_module.CategoricalAccuracy()],
           sample_weight_mode='temporal',
@@ -1285,7 +1517,6 @@ class LossMaskingTest(keras_parameterized.TestCase):
 
 class TestDynamicTrainability(keras_parameterized.TestCase):
 
-  @tf_test_util.run_v1_only('b/120545219')
   def test_trainable_warning(self):
     with self.cached_session():
       x = np.random.random((5, 3))
@@ -1299,7 +1530,6 @@ class TestDynamicTrainability(keras_parameterized.TestCase):
       model.train_on_batch(x, y)
       self.assertRaises(Warning)
 
-  @tf_test_util.run_v1_only('b/120545219')
   def test_trainable_argument(self):
     with self.cached_session():
       x = np.random.random((5, 3))
@@ -2031,9 +2261,11 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase):
     metrics = ['mse', metrics_module.BinaryAccuracy()]
     model.compile(optimizer, loss='mae', metrics=metrics,
                   run_eagerly=testing_utils.should_run_eagerly())
+
+    mse_metric = 'mse' if tf2.enabled() else 'mean_squared_error'
     reference_metric_names = [
-        'loss', 'dense_loss', 'dropout_loss', 'dense_mean_squared_error',
-        'dense_binary_accuracy', 'dropout_mean_squared_error',
+        'loss', 'dense_loss', 'dropout_loss', 'dense_' + mse_metric,
+        'dense_binary_accuracy', 'dropout_' + mse_metric,
         'dropout_binary_accuracy'
     ]
     self.assertEqual(reference_metric_names, model.metrics_names)
@@ -2050,69 +2282,6 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase):
               batch_size=5)
     self.assertEqual(reference_metric_names, model.metrics_names)
 
-  @keras_parameterized.run_all_keras_modes
-  def test_metrics_correctness(self):
-    model = keras.Sequential()
-    model.add(
-        keras.layers.Dense(
-            3, activation='relu', input_dim=4, kernel_initializer='ones'))
-    model.add(
-        keras.layers.Dense(
-            1, activation='sigmoid', kernel_initializer='ones'))
-    model.compile(
-        loss='mae',
-        metrics=['accuracy', metrics_module.BinaryAccuracy()],
-        optimizer=RMSPropOptimizer(learning_rate=0.001),
-        run_eagerly=testing_utils.should_run_eagerly())
-
-    # verify correctness of stateful and stateless metrics.
-    x = np.ones((100, 4))
-    y = np.ones((100, 1))
-    outs = model.evaluate(x, y)
-    self.assertEqual(outs[1], 1.)
-    self.assertEqual(outs[2], 1.)
-
-    y = np.zeros((100, 1))
-    outs = model.evaluate(x, y)
-    self.assertEqual(outs[1], 0.)
-    self.assertEqual(outs[2], 0.)
-
-  @keras_parameterized.run_all_keras_modes
-  def test_metrics_correctness_with_weighted_metrics(self):
-    np.random.seed(1337)
-    x = np.array([[[1.], [1.]], [[0.], [0.]]])
-    model = keras.models.Sequential()
-    model.add(
-        keras.layers.TimeDistributed(
-            keras.layers.Dense(1, kernel_initializer='ones'),
-            input_shape=(2, 1)))
-    model.compile(
-        RMSPropOptimizer(learning_rate=0.001),
-        loss='mse',
-        sample_weight_mode='temporal',
-        weighted_metrics=['accuracy', 'mse'],
-        run_eagerly=testing_utils.should_run_eagerly())
-    y = np.array([[[1.], [1.]], [[1.], [1.]]])
-
-    outs = model.evaluate(x, y)
-    self.assertEqual(outs, [0.5, 0.5, 0.5])
-
-    w = np.array([[0., 0.], [0., 0.]])
-    outs = model.evaluate(x, y, sample_weight=w)
-    self.assertEqual(outs, [0., 0., 0.])
-
-    w = np.array([[3., 4.], [1., 2.]])
-    outs = model.evaluate(x, y, sample_weight=w)
-    self.assertArrayNear(outs, [0.75, 0.7, 0.3], .001)
-
-    # Verify that metric value is same with arbitrary weights and batch size.
-    x = np.random.random((50, 2, 1))
-    y = np.random.random((50, 2, 1))
-    w = np.random.random((50, 2))
-    mse1 = model.evaluate(x, y, sample_weight=w, batch_size=5)[2]
-    mse2 = model.evaluate(x, y, sample_weight=w, batch_size=10)[2]
-    self.assertNear(mse1, mse2, err=1e-7)
-
   @keras_parameterized.run_all_keras_modes
   def test_metric_state_reset_between_fit_and_evaluate(self):
     model = keras.Sequential()
@@ -2135,6 +2304,67 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase):
     model.evaluate(x_test, y_test, batch_size=5)
     self.assertEqual(self.evaluate(acc_obj.count), 10)
 
+  @keras_parameterized.run_with_all_model_types(exclude_models=['sequential'])
+  @keras_parameterized.run_all_keras_modes
+  def test_metrics_valid_compile_input_formats(self):
+    inp_1 = keras.layers.Input(shape=(1,), name='input_1')
+    inp_2 = keras.layers.Input(shape=(1,), name='input_2')
+    x = keras.layers.Dense(3, kernel_initializer='ones', trainable=False)
+    out_1 = keras.layers.Dense(
+        1, kernel_initializer='ones', name='output_1', trainable=False)
+    out_2 = keras.layers.Dense(
+        1, kernel_initializer='ones', name='output_2', trainable=False)
+
+    branch_a = [inp_1, x, out_1]
+    branch_b = [inp_2, x, out_2]
+    model = testing_utils.get_multi_io_model(branch_a, branch_b)
+
+    # list of metrics.
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        metrics=[keras.metrics.MeanSquaredError()],
+        weighted_metrics=[keras.metrics.MeanSquaredError()],
+        run_eagerly=testing_utils.should_run_eagerly())
+
+    # list of list of metrics.
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        metrics=[
+            keras.metrics.MeanSquaredError(),
+            [keras.metrics.MeanSquaredError(),
+             keras.metrics.Accuracy()]
+        ],
+        weighted_metrics=[
+            keras.metrics.MeanSquaredError(),
+            [keras.metrics.MeanSquaredError(),
+             keras.metrics.Accuracy()]
+        ],
+        run_eagerly=testing_utils.should_run_eagerly())
+
+    # dict of metrics.
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        metrics={
+            'output_1':
+                keras.metrics.MeanSquaredError(),
+            'output_2': [
+                keras.metrics.MeanSquaredError(),
+                keras.metrics.Accuracy()
+            ],
+        },
+        weighted_metrics={
+            'output_1':
+                keras.metrics.MeanSquaredError(),
+            'output_2': [
+                keras.metrics.MeanSquaredError(),
+                keras.metrics.Accuracy()
+            ],
+        },
+        run_eagerly=testing_utils.should_run_eagerly())
+
   @keras_parameterized.run_all_keras_modes
   def test_invalid_metrics(self):
     num_classes = 5
@@ -2152,6 +2382,17 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase):
           metrics=metrics_module.CategoricalAccuracy(),
           run_eagerly=testing_utils.should_run_eagerly())
 
+    inp = keras.layers.Input(shape=(1,))
+    x = keras.layers.Dense(3, activation='relu')(inp)
+    out_1 = keras.layers.Dense(1, activation='sigmoid', name='output_1')(x)
+    out_2 = keras.layers.Dense(1, activation='sigmoid', name='output_2')(x)
+    model = keras.models.Model(inp, [out_1, out_2])
+    with self.assertRaisesRegex(
+        ValueError, 'When passing a list of lists as `metrics`, '
+        'it should have one entry per model output. '
+        'The model has 2 outputs, but you passed metrics='):
+      model.compile('rmsprop', loss='mse', metrics=[['mse']])
+
   @keras_parameterized.run_all_keras_modes
   def test_metrics_masking(self):
     if testing_utils.should_run_eagerly():
@@ -2180,40 +2421,44 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase):
       scores = model.train_on_batch(x, y, sample_weight=w)
       self.assertArrayNear(scores, [0.3328, 0.8], 0.001)
 
-  @tf_test_util.run_deprecated_v1
-  def test_add_metric_with_tensor_on_model_in_graph_mode(self):
-    with self.cached_session():
-      x = keras.layers.Input(shape=(1,))
-      y = keras.layers.Dense(1, kernel_initializer='ones')(x)
-      model = keras.models.Model(x, y)
-      model.add_metric(
-          math_ops.reduce_sum(y), name='metric_1', aggregation='mean')
+  @keras_parameterized.run_all_keras_modes
+  def test_add_metric_with_tensor_on_model(self):
+    if testing_utils.should_run_eagerly():
+      self.skipTest('b/124303407')
 
-      # test with a metric which does not have the standard signature:
-      # (y_true, y_pred, sample_Weight)
+    x = keras.layers.Input(shape=(1,))
+    y = keras.layers.Dense(1, kernel_initializer='ones')(x)
+    model = keras.models.Model(x, y)
+    model.add_metric(
+        math_ops.reduce_sum(y), name='metric_1', aggregation='mean')
+
+    # test with a metric which does not have the standard signature:
+    # (y_true, y_pred, sample_Weight)
+    with keras.backend.get_graph().as_default():
       model.add_metric(metrics_module.Mean(name='metric_2')(y))
-      model.compile('sgd', loss='mse')
+    model.compile(
+        'sgd', loss='mse', run_eagerly=testing_utils.should_run_eagerly())
 
-      inputs = np.ones(shape=(10, 1))
-      targets = np.ones(shape=(10, 1))
-      history = model.fit(
-          inputs,
-          targets,
-          epochs=2,
-          batch_size=5,
-          validation_data=(inputs, targets))
-      self.assertEqual(history.history['metric_1'][-1], 5)
-      self.assertEqual(history.history['metric_2'][-1], 1)
-      self.assertEqual(history.history['val_metric_1'][-1], 5)
-      self.assertEqual(history.history['val_metric_2'][-1], 1)
+    inputs = np.ones(shape=(10, 1))
+    targets = np.ones(shape=(10, 1))
+    history = model.fit(
+        inputs,
+        targets,
+        epochs=2,
+        batch_size=5,
+        validation_data=(inputs, targets))
+    self.assertEqual(history.history['metric_1'][-1], 5)
+    self.assertEqual(history.history['metric_2'][-1], 1)
+    self.assertEqual(history.history['val_metric_1'][-1], 5)
+    self.assertEqual(history.history['val_metric_2'][-1], 1)
 
-      eval_results = model.evaluate(inputs, targets, batch_size=5)
-      self.assertEqual(eval_results[-1], 1)
-      self.assertEqual(eval_results[-2], 5)
+    eval_results = model.evaluate(inputs, targets, batch_size=5)
+    self.assertEqual(eval_results[-1], 1)
+    self.assertEqual(eval_results[-2], 5)
 
-      model.predict(inputs, batch_size=5)
-      model.train_on_batch(inputs, targets)
-      model.test_on_batch(inputs, targets)
+    model.predict(inputs, batch_size=5)
+    model.train_on_batch(inputs, targets)
+    model.test_on_batch(inputs, targets)
 
   @keras_parameterized.run_all_keras_modes
   def test_add_metric_in_model_call(self):
@@ -2253,6 +2498,7 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase):
     model.train_on_batch(x, y)
     model.test_on_batch(x, y)
 
+  @keras_parameterized.run_with_all_model_types
   @keras_parameterized.run_all_keras_modes
   def test_add_metric_in_layer_call(self):
 
@@ -2268,9 +2514,11 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase):
             math_ops.reduce_sum(inputs), name='metric_1', aggregation='mean')
         return inputs + 1
 
-    model = keras.Sequential()
-    model.add(TestLayer(input_shape=(1,)))
-    model.add(keras.layers.Dense(2, kernel_initializer='ones'))
+    layers = [
+        TestLayer(input_shape=(1,)),
+        keras.layers.Dense(2, kernel_initializer='ones')
+    ]
+    model = testing_utils.get_model_from_layers(layers, input_shape=(1,))
     model.compile(loss='mse', optimizer=RMSPropOptimizer(0.01),
                   run_eagerly=testing_utils.should_run_eagerly())
 
@@ -2280,60 +2528,53 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase):
     self.assertEqual(history.history['metric_1'][-1], 5)
     self.assertAlmostEqual(history.history['val_metric_1'][-1], 5, 0)
 
-  @tf_test_util.run_deprecated_v1
+  @keras_parameterized.run_all_keras_modes
   def test_model_metrics_list(self):
-    with self.cached_session():
-      x = keras.layers.Input(shape=(1,))
-      y = keras.layers.Dense(1, kernel_initializer='ones')(x)
-      model = keras.models.Model(x, y)
-      model.add_metric(
-          math_ops.reduce_sum(y), name='metric_1', aggregation='mean')
+    x = keras.layers.Input(shape=(1,))
+    y = keras.layers.Dense(1, kernel_initializer='ones')(x)
+    model = keras.models.Model(x, y)
+    model.add_metric(
+        math_ops.reduce_sum(y), name='metric_1', aggregation='mean')
+    with keras.backend.get_graph().as_default():
       model.add_metric(metrics_module.Mean(name='metric_2')(y))
-      model.compile('sgd', loss='mse', metrics=['acc'])
-
-      # Verify that the metrics added using `compile` and `add_metric` API are
-      # included
-      self.assertEqual(model._compile_metrics, ['acc'])
-      names = []
-      for m in model.metrics:
-        if isinstance(m, metrics_module.Metric):
-          names.append(m.name)
-        else:
-          names.append(m.__name__)
-      self.assertEqual(names, ['binary_accuracy', 'metric_1', 'metric_2'])
-
-  def test_model_eager_metrics_list(self):
-    with context.eager_mode():
+    model.compile(
+        'sgd',
+        loss='mse',
+        metrics=[metrics_module.Accuracy('acc')],
+        run_eagerly=testing_utils.should_run_eagerly())
 
-      class TestModel(keras.Model):
+    # Verify that the metrics added using `compile` and `add_metric` API are
+    # included
+    self.assertEqual([m.name for m in model._compile_metrics], ['acc'])
+    self.assertEqual([m.name for m in model.metrics],
+                     ['acc', 'metric_1', 'metric_2'])
 
-        def __init__(self):
-          super(TestModel, self).__init__(name='test_model')
-          self.dense1 = keras.layers.Dense(2, kernel_initializer='ones')
+  @keras_parameterized.run_all_keras_modes
+  def test_model_metrics_list_in_call(self):
 
-        def call(self, x):
-          self.add_metric(
-              math_ops.reduce_sum(x), name='metric_1', aggregation='mean')
-          return self.dense1(x)
+    class TestModel(keras.Model):
 
-      model = TestModel()
-      model.compile(
-          loss='mse',
-          optimizer=RMSPropOptimizer(0.01),
-          metrics=['acc'],
-          run_eagerly=True)
-      x = np.ones(shape=(10, 1))
-      y = np.ones(shape=(10, 2))
-      model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
+      def __init__(self):
+        super(TestModel, self).__init__(name='test_model')
+        self.dense1 = keras.layers.Dense(2, kernel_initializer='ones')
+
+      def call(self, x):
+        self.add_metric(
+            math_ops.reduce_sum(x), name='metric_1', aggregation='mean')
+        return self.dense1(x)
+
+    model = TestModel()
+    model.compile(
+        loss='mse',
+        optimizer=RMSPropOptimizer(0.01),
+        metrics=[metrics_module.Accuracy('acc')],
+        run_eagerly=testing_utils.should_run_eagerly())
+    x = np.ones(shape=(10, 1))
+    y = np.ones(shape=(10, 2))
+    model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
 
-      self.assertEqual(model._compile_metrics, ['acc'])
-      names = []
-      for m in model.metrics:
-        if isinstance(m, metrics_module.Metric):
-          names.append(m.name)
-        else:
-          names.append(m.__name__)
-      self.assertEqual(names, ['categorical_accuracy', 'metric_1'])
+    self.assertEqual([m.name for m in model._compile_metrics], ['acc'])
+    self.assertEqual([m.name for m in model.metrics], ['acc', 'metric_1'])
 
   @keras_parameterized.run_all_keras_modes
   def test_multiple_add_metric_calls(self):
@@ -2371,28 +2612,34 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase):
     model.train_on_batch(x, y)
     model.test_on_batch(x, y)
 
-  def test_invalid_metric_tensor_in_call(self):
-    with context.eager_mode():
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_invalid_metric_tensor(self):
 
-      class TestLayer(keras.layers.Layer):
+    class TestLayer(keras.layers.Layer):
 
-        def call(self, inputs):
-          self.add_metric(metrics_module.Mean(name='metric_1')(inputs))
-          return inputs + 1
+      def build(self, input_shape):
+        self.built = True
 
-      model = keras.Sequential()
-      model.add(TestLayer(input_shape=(1,)))
-      model.add(keras.layers.Dense(2, kernel_initializer='ones'))
-      model.compile(
-          loss='mse', optimizer=RMSPropOptimizer(0.01), run_eagerly=True)
+      def call(self, inputs):
+        self.add_metric(math_ops.reduce_mean(inputs), name='metric_1')
+        return inputs + 1
 
-      x = np.ones(shape=(10, 1))
-      y = np.ones(shape=(10, 2))
-      with self.assertRaisesRegexp(
-          ValueError,
-          'We do not support adding an aggregated metric tensor in `call` in '
-          'eager execution.'):
-        model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
+    layers = [TestLayer(input_shape=(1,))]
+    layers.append(keras.layers.Dense(2, kernel_initializer='ones'))
+    x = np.ones(shape=(10, 1))
+    y = np.ones(shape=(10, 2))
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        'We do not support adding an aggregated metric result tensor that is '
+        'not the output of a `tf.keras.metrics.Metric` metric instance.'):
+      model = testing_utils.get_model_from_layers(layers, input_shape=(1,))
+      model.compile(
+          loss='mse',
+          optimizer=RMSPropOptimizer(0.01),
+          run_eagerly=testing_utils.should_run_eagerly())
+      model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
 
   @keras_parameterized.run_all_keras_modes
   def test_duplicate_metric_name_in_add_metric(self):
@@ -2422,10 +2669,7 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase):
       model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
 
   @keras_parameterized.run_all_keras_modes
-  def test_multiple_no_name_input_to_add_metric(self):
-    # TODO(kaftan) Test seems to not work, file ticket
-    if testing_utils.should_run_eagerly() and context.executing_eagerly():
-      self.skipTest('Skipping running model eagerly.')
+  def test_add_metric_without_name(self):
 
     class TestModel(keras.Model):
 
@@ -2434,7 +2678,6 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase):
         self.dense1 = keras.layers.Dense(2, kernel_initializer='ones')
 
       def call(self, x):
-        self.add_metric(math_ops.reduce_sum(x), aggregation='mean')
         self.add_metric(math_ops.reduce_sum(x), aggregation='mean')
         return self.dense1(x)
 
@@ -2443,8 +2686,131 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase):
                   run_eagerly=testing_utils.should_run_eagerly())
     x = np.ones(shape=(10, 1))
     y = np.ones(shape=(10, 2))
-    model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
-    self.assertEqual([m.name for m in model.metrics], ['mean', 'mean_1'])
+
+    with self.assertRaisesRegex(ValueError,
+                                'Please provide a name for your metric like'):
+      model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
+
+  @keras_parameterized.run_all_keras_modes
+  def test_add_metric_correctness(self):
+    if testing_utils.should_run_eagerly():
+      self.skipTest('b/124303407')
+
+    inputs = keras.Input(shape=(1,))
+    targets = keras.Input(shape=(1,))
+
+    class Bias(keras.layers.Layer):
+
+      def build(self, input_shape):
+        self.bias = self.add_variable('bias', (1,), initializer='zeros')
+        self.mae = metrics_module.MeanAbsoluteError(name='mae_1')
+
+      def call(self, inputs):
+        outputs = inputs + self.bias
+        self.add_metric(self.mae(targets, outputs), name='mae_1')
+        return outputs
+
+    outputs = Bias()(inputs)
+    model = keras.Model(inputs, outputs)
+
+    model.add_metric(
+        metrics_module.mean_absolute_error(targets, outputs),
+        name='mae_2',
+        aggregation='mean')
+
+    # If we want to use the metric class instance as shown below, we will need
+    # to add graph scope as the reduction logic involves some eager mode checks.
+    with keras.backend.get_graph().as_default():
+      model.add_metric(
+          metrics_module.MeanAbsoluteError(name='mae_3')(targets, outputs))
+
+    model.compile(
+        loss='mae',
+        optimizer=keras.optimizer_v2.gradient_descent.SGD(0.1),
+        metrics=[metrics_module.MeanAbsoluteError(name='mae_4')],
+        target_tensors=[targets],
+        run_eagerly=testing_utils.should_run_eagerly())
+
+    x = np.array([[0.], [1.], [2.]])
+    y = np.array([[0.5], [2.], [3.5]])
+    history = model.fit(x, y, batch_size=3, epochs=5)
+
+    expected_val = [1., 0.9, 0.8, 0.7, 0.6]
+    for key in ['loss', 'mae_1', 'mae_2', 'mae_3', 'mae_4']:
+      self.assertAllClose(history.history[key], expected_val, 1e-3)
+
+  @keras_parameterized.run_all_keras_modes(always_skip_v1=True)
+  def test_a1_total_loss_available_with_dict_dataset(self):
+
+    class TestModel(keras.models.Model):
+
+      def call(self, inputs, training=None, mask=None):
+        return math_ops.to_float(inputs['id'])
+
+    model = TestModel()
+    model.compile(
+        optimizer=AdamOptimizer(), loss='mean_squared_error', metrics=['mse'],
+        run_eagerly=testing_utils.should_run_eagerly())
+    dataset = dataset_ops.Dataset.from_tensor_slices(({
+        'id': [[6], [3], [1]]
+    }, [[0.7], [0.4], [0.2]]))
+    val_dataset = dataset_ops.Dataset.from_tensor_slices(({
+        'id': [[8], [5]]
+    }, [[0.9], [0.6]]))
+    history = model.fit(
+        dataset,
+        steps_per_epoch=2,
+        validation_data=val_dataset,
+        validation_steps=2)
+    self.assertAlmostEqual(history.history['val_loss'][0], 34.885, 2)
+    model.evaluate(dataset, steps=30)
+    model.predict([7])
+
+  @keras_parameterized.run_all_keras_modes(always_skip_v1=True)
+  def test_total_loss_available_with_dict_array(self):
+
+    class TestModel(keras.models.Model):
+
+      def call(self, inputs, training=None, mask=None):
+        return math_ops.to_float(inputs['id'])
+
+    model = TestModel()
+    model.compile(
+        optimizer=AdamOptimizer(), loss='mean_squared_error', metrics=['mse'],
+        run_eagerly=testing_utils.should_run_eagerly())
+    x = {'id': np.array([[3], [1]])}
+    y = np.array([[4], [2]])
+    val_dataset = (x, y)
+    history = model.fit(
+        x,
+        y,
+        batch_size=32,
+        steps_per_epoch=2,
+        validation_data=val_dataset,
+        validation_steps=2)
+    self.assertAlmostEqual(history.history['val_loss'][0], 1.0, 2)
+    model.evaluate(x, y)
+    model.predict([7])
+
+  @keras_parameterized.run_all_keras_modes(always_skip_v1=True)
+  def test_set_run_eagerly_for_dict_structure(self):
+    test_model = keras.models.Model()
+    self.assertFalse(test_model.run_eagerly)
+    set_run_eagerly_for_dict_structure(
+        test_model,
+        {'a': 2})
+    self.assertTrue(test_model.run_eagerly)
+
+  @keras_parameterized.run_all_keras_modes(always_skip_v1=True)
+  def test_set_run_eagerly_for_dict_dataset(self):
+    test_model = keras.models.Model()
+    self.assertFalse(test_model.run_eagerly)
+    set_run_eagerly_for_dict_structure(
+        test_model,
+        dataset_ops.Dataset.from_tensor_slices(({
+            'id': [[3], [1]]
+        }, [[0.5], [0.2]])))
+    self.assertTrue(test_model.run_eagerly)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/engine/training_utils.py b/tensorflow/python/keras/engine/training_utils.py
index 01a09eb031eef20538d587e3f17a31ecbb5e5f9a..8c9b49cfcd3ff5bb940ba2879a5acdd47f2573f9 100644
--- a/tensorflow/python/keras/engine/training_utils.py
+++ b/tensorflow/python/keras/engine/training_utils.py
@@ -12,22 +12,27 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Training-related utilities.
-"""
+"""Training-related utilities."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 import abc
+import collections
 from collections import OrderedDict
-import copy
 
 import numpy as np
 import six
 
+from tensorflow.python import tf2
+from tensorflow.python.data.experimental.ops import cardinality
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.data.ops import readers
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
@@ -41,6 +46,7 @@ from tensorflow.python.keras.utils.losses_utils import squeeze_or_expand_dimensi
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import weights_broadcast_ops
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
 
 
@@ -66,7 +72,7 @@ class Aggregator(object):
     Arguments:
       batch_outs: A list of batch-level outputs.
     """
-    NotImplementedError('Must be implemented in subclasses.')
+    raise NotImplementedError('Must be implemented in subclasses.')
 
   @abc.abstractmethod
   def aggregate(self, batch_outs, batch_start=None, batch_end=None):
@@ -79,12 +85,12 @@ class Aggregator(object):
       batch_end: The end index of this batch. Always `None` if `use_steps` is
         `True`.
     """
-    NotImplementedError('Must be implemented in subclasses.')
+    raise NotImplementedError('Must be implemented in subclasses.')
 
   @abc.abstractmethod
   def finalize(self):
     """Prepares the total results to be returned."""
-    NotImplementedError('Must be implemented in subclasses.')
+    raise NotImplementedError('Must be implemented in subclasses.')
 
 
 class MetricsAggregator(Aggregator):
@@ -103,6 +109,8 @@ class MetricsAggregator(Aggregator):
     self.results[1:] = batch_outs[1:]
 
   def finalize(self):
+    if not self.results:
+      raise ValueError('Empty training data.')
     self.results[0] /= self.num_samples_or_steps
 
 
@@ -134,18 +142,6 @@ class OutputsAggregator(Aggregator):
       self.results = [np.concatenate(result, axis=0) for result in self.results]
 
 
-def make_logs(model, outputs, mode, prefix=''):
-  """Computes logs for sending to `on_batch_end` methods."""
-  logs = {}
-  # TODO(omalleyt): handle outputs in prediction when Callback
-  # hooks are ready.
-  if mode in ['train', 'test']:
-    if hasattr(model, 'metrics_names'):
-      for label, output in zip(model.metrics_names, outputs):
-        logs[prefix + label] = output
-  return logs
-
-
 def get_progbar(model, count_mode):
   """Get Progbar."""
   stateful_metric_names = None
@@ -190,10 +186,7 @@ def slice_arrays(arrays, indices, contiguous=True):
   return slices
 
 
-def check_num_samples(ins,
-                      batch_size=None,
-                      steps=None,
-                      steps_name='steps'):
+def check_num_samples(ins, batch_size=None, steps=None, steps_name='steps'):
   """Determine the number of samples provided for training and evaluation.
 
   The number of samples is not defined when running with `steps`,
@@ -202,9 +195,8 @@ def check_num_samples(ins,
   Arguments:
       ins: List of tensors to be fed to the Keras function.
       batch_size: Integer batch size or `None` if not defined.
-      steps: Total number of steps (batches of samples)
-          before declaring `_predict_loop` finished.
-          Ignored with the default value of `None`.
+      steps: Total number of steps (batches of samples) before declaring
+        `_predict_loop` finished. Ignored with the default value of `None`.
       steps_name: The public API's parameter name for `steps`.
 
   Raises:
@@ -218,13 +210,10 @@ def check_num_samples(ins,
       processed based on the size of the first dimension of the
       first input numpy array. When steps is not `None` and
       `batch_size` is `None`, returns `None`.
-
-  Raises:
-      ValueError: In case of invalid arguments.
   """
   if steps is not None and batch_size is not None:
-    raise ValueError(
-        'If ' + steps_name + ' is set, the `batch_size` must be None.')
+    raise ValueError('If ' + steps_name +
+                     ' is set, the `batch_size` must be None.')
   if check_steps_argument(ins, steps, steps_name):
     return None
   if hasattr(ins[0], 'shape'):
@@ -237,9 +226,8 @@ def standardize_single_array(x, expected_shape=None):
   if x is None:
     return None
 
-  if (x.shape is not None
-      and len(x.shape) == 1
-      and (expected_shape is None or len(expected_shape) != 1)):
+  if (x.shape is not None and len(x.shape) == 1 and
+      (expected_shape is None or len(expected_shape) != 1)):
     if tensor_util.is_tensor(x):
       x = array_ops.expand_dims(x, axis=1)
     else:
@@ -263,9 +251,8 @@ def standardize_input_data(data,
       data: User-provided input data (polymorphic).
       names: List of expected array names.
       shapes: Optional list of expected array shapes.
-      check_batch_axis: Boolean; whether to check that
-          the batch axis of the arrays matches the expected
-          value found in `shapes`.
+      check_batch_axis: Boolean; whether to check that the batch axis of the
+        arrays matches the expected value found in `shapes`.
       exception_prefix: String prefix used for exception formatting.
 
   Returns:
@@ -277,8 +264,9 @@ def standardize_input_data(data,
   if not names:
     if (data is not None and hasattr(data, '__len__') and len(data) and
         not isinstance(data, dict)):
-      raise ValueError('Error when checking model ' + exception_prefix + ': '
-                       'expected no data, but got:', data)
+      raise ValueError(
+          'Error when checking model ' + exception_prefix + ': '
+          'expected no data, but got:', data)
     return []
   if data is None:
     return [None for _ in range(len(names))]
@@ -306,8 +294,9 @@ def standardize_input_data(data,
     data = data.values if data.__class__.__name__ == 'DataFrame' else data
     data = [data]
   if shapes is not None:
-    data = [standardize_single_array(x, shape)
-            for (x, shape) in zip(data, shapes)]
+    data = [
+        standardize_single_array(x, shape) for (x, shape) in zip(data, shapes)
+    ]
   else:
     data = [standardize_single_array(x) for x in data]
 
@@ -320,11 +309,11 @@ def standardize_input_data(data,
                        'but instead got the following list of ' +
                        str(len(data)) + ' arrays: ' + str(data)[:200] + '...')
     elif len(names) > 1:
-      raise ValueError(
-          'Error when checking model ' + exception_prefix +
-          ': you are passing a list as input to your model, '
-          'but the model expects a list of ' + str(len(names)) +
-          ' Numpy arrays instead. The list you passed was: ' + str(data)[:200])
+      raise ValueError('Error when checking model ' + exception_prefix +
+                       ': you are passing a list as input to your model, '
+                       'but the model expects a list of ' + str(len(names)) +
+                       ' Numpy arrays instead. The list you passed was: ' +
+                       str(data)[:200])
     elif len(data) == 1 and not hasattr(data[0], 'shape'):
       raise TypeError('Error when checking model ' + exception_prefix +
                       ': data should be a Numpy array, or list/dict of '
@@ -354,10 +343,10 @@ def standardize_input_data(data,
           shape = shape[1:]
         for dim, ref_dim in zip(data_shape, shape):
           if ref_dim != dim and ref_dim is not None and dim is not None:
-            raise ValueError(
-                'Error when checking ' + exception_prefix + ': expected ' +
-                names[i] + ' to have shape ' + str(shape) +
-                ' but got array with shape ' + str(data_shape))
+            raise ValueError('Error when checking ' + exception_prefix +
+                             ': expected ' + names[i] + ' to have shape ' +
+                             str(shape) + ' but got array with shape ' +
+                             str(data_shape))
   return data
 
 
@@ -399,10 +388,10 @@ def standardize_sample_or_class_weights(x_weight, output_names, weight_type):
       x_weights.append(x_weight.get(name))
     return x_weights
   else:
-    raise TypeError(
-        'The model has multiple outputs, so `' + weight_type + '` '
-        'should be either a list or a dict. '
-        'Provided `' + weight_type + '` type not understood: ' + str(x_weight))
+    raise TypeError('The model has multiple outputs, so `' + weight_type + '` '
+                    'should be either a list or a dict. '
+                    'Provided `' + weight_type + '` type not understood: ' +
+                    str(x_weight))
 
 
 def standardize_class_weights(class_weight, output_names):
@@ -433,8 +422,11 @@ def check_array_lengths(inputs, targets, weights=None):
     if x is None:
       return {}
     else:
-      return set([y.shape[0] for y in x
-                  if y is not None and not tensor_util.is_tensor(y)])
+      return set([
+          y.shape[0]
+          for y in x
+          if y is not None and not tensor_util.is_tensor(y)
+      ])
 
   set_x = set_of_lengths(inputs)
   set_y = set_of_lengths(targets)
@@ -478,17 +470,20 @@ def check_loss_and_target_compatibility(targets, loss_fns, output_shapes):
       ValueError: if a loss function or target array
           is incompatible with an output.
   """
-  key_losses = {
+  key_loss_fns = {
       losses.mean_squared_error, losses.binary_crossentropy,
       losses.categorical_crossentropy
   }
+  key_loss_classes = (losses.MeanSquaredError, losses.BinaryCrossentropy,
+                      losses.CategoricalCrossentropy)
   for y, loss, shape in zip(targets, loss_fns, output_shapes):
     if y is None or loss is None or tensor_util.is_tensor(y):
       continue
-    if loss is losses.categorical_crossentropy:
+    if losses.is_categorical_crossentropy(loss):
       if y.shape[-1] == 1:
-        raise ValueError('You are passing a target array of shape ' + str(
-            y.shape) + ' while using as loss `categorical_crossentropy`. '
+        raise ValueError('You are passing a target array of shape ' +
+                         str(y.shape) +
+                         ' while using as loss `categorical_crossentropy`. '
                          '`categorical_crossentropy` expects '
                          'targets to be binary matrices (1s and 0s) '
                          'of shape (samples, classes). '
@@ -502,14 +497,20 @@ def check_loss_and_target_compatibility(targets, loss_fns, output_shapes):
                          'Alternatively, you can use the loss function '
                          '`sparse_categorical_crossentropy` instead, '
                          'which does expect integer targets.')
-    if loss in key_losses:
+
+    is_loss_wrapper = isinstance(loss, losses.LossFunctionWrapper)
+    if (isinstance(loss, key_loss_classes) or (is_loss_wrapper and
+                                               (loss.fn in key_loss_fns))):
       for target_dim, out_dim in zip(y.shape[1:], shape[1:]):
         if out_dim is not None and target_dim != out_dim:
+          loss_name = loss.name
+          if loss_name is None:
+            loss_type = loss.fn if is_loss_wrapper else type(loss)
+            loss_name = loss_type.__name__
           raise ValueError('A target array with shape ' + str(y.shape) +
                            ' was passed for an output of shape ' + str(shape) +
-                           ' while using as loss `' + loss.__name__ + '`. '
-                           'This loss expects '
-                           'targets to have the same shape '
+                           ' while using as loss `' + loss_name + '`. '
+                           'This loss expects targets to have the same shape '
                            'as the output.')
 
 
@@ -517,15 +518,15 @@ def collect_per_output_metric_info(metrics,
                                    output_names,
                                    output_shapes,
                                    loss_fns,
-                                   sample_weights=None):
+                                   is_weighted=False):
   """Maps metric names and functions to model outputs.
 
   Arguments:
-      metrics: a list or dict of metric functions.
+      metrics: a list or a list of lists or a dict of metric functions.
       output_names: a list of the names (strings) of model outputs.
       output_shapes: a list of the shapes (strings) of model outputs.
       loss_fns: a list of the loss functions corresponding to the model outputs.
-      sample_weights: a list of weights to be applied on the model outputs.
+      is_weighted: Boolean indicating whether the given metrics are weighted.
 
   Returns:
       A list (one entry per model output) of dicts.
@@ -547,15 +548,30 @@ def collect_per_output_metric_info(metrics,
   """
   if not metrics:
     return [{} for _ in output_names]
+
   if isinstance(metrics, list):
-    # we then apply all metrics to all outputs.
-    nested_metrics = [copy.copy(metrics) for _ in output_names]
+    any_sub_list = any(isinstance(m, list) for m in metrics)
+    if any_sub_list:
+      if len(metrics) != len(output_names):
+        raise ValueError('When passing a list of lists as `metrics`, '
+                         'it should have one entry per model output. '
+                         'The model has ' + str(len(output_names)) +
+                         ' outputs, but you passed metrics=' + str(metrics))
+      # User has provided a list of len = len(outputs).
+      nested_metrics = [generic_utils.to_list(m) for m in metrics]
+    else:
+      # If it is a single list we then apply all metrics to all outputs.
+      if len(output_names) > 1:
+        nested_metrics = []
+        for _ in output_names:
+          nested_metrics.append(
+              [metrics_module.clone_metric(m) for m in metrics])
+      else:
+        nested_metrics = [metrics]
   elif isinstance(metrics, dict):
     nested_metrics = []
     for name in output_names:
-      output_metrics = metrics.get(name, [])
-      if not isinstance(output_metrics, list):
-        output_metrics = [output_metrics]
+      output_metrics = generic_utils.to_list(metrics.get(name, []))
       nested_metrics.append(output_metrics)
   else:
     raise TypeError('Type of `metrics` argument not understood. '
@@ -565,9 +581,7 @@ def collect_per_output_metric_info(metrics,
   for i, metrics in enumerate(nested_metrics):
     metrics_dict = OrderedDict()
     for metric in metrics:
-      weighted = False if (sample_weights is None) else (
-          sample_weights[i] is not None)
-      metric_name = get_metric_name(metric, weighted)
+      metric_name = get_metric_name(metric, is_weighted)
       metric_fn = get_metric_function(
           metric, output_shape=output_shapes[i], loss_fn=loss_fns[i])
 
@@ -580,7 +594,7 @@ def collect_per_output_metric_info(metrics,
       stateful_fn = metric_fn
       if not is_stateful:
         stateful_fn = metrics_module.MeanMetricWrapper(
-            metric_fn, name=metric_fn.__name__)
+            metric_fn, name=metric_name)
 
       metrics_dict[metric_name] = (metric_fn, stateful_fn)
     per_output_metrics.append(metrics_dict)
@@ -620,8 +634,7 @@ def weighted_masked_objective(fn):
   `fn(y_true, y_pred, weights, mask)`.
 
   Arguments:
-      fn: The objective function to wrap,
-          with signature `fn(y_true, y_pred)`.
+      fn: The objective function to wrap, with signature `fn(y_true, y_pred)`.
 
   Returns:
       A function with signature `fn(y_true, y_pred, weights, mask)`.
@@ -684,16 +697,17 @@ def standardize_weights(y,
   """Performs sample weight validation and standardization.
 
   Everything gets normalized to a single sample-wise (or timestep-wise)
-  weight array.
+  weight array. If both `sample_weight` and `class_weight` are provided,
+  the weights are multiplied.
 
   Arguments:
       y: Numpy array of model targets to be weighted.
       sample_weight: User-provided `sample_weight` argument.
       class_weight: User-provided `class_weight` argument.
-      sample_weight_mode: One of `None` or `"temporal"`.
-          `"temporal"` indicated that we expect 2D weight data
-          that will be applied to the last 2 dimensions of
-          the targets (i.e. we are weighting timesteps, not samples).
+      sample_weight_mode: One of `None` or `"temporal"`. `"temporal"` indicated
+        that we expect 2D weight data that will be applied to the last 2
+        dimensions of the targets (i.e. we are weighting timesteps, not
+        samples).
 
   Returns:
       A numpy array of target weights, one entry per sample to weight.
@@ -734,43 +748,53 @@ def standardize_weights(y,
 
   if sample_weight is not None:
     if len(sample_weight.shape) > len(y.shape):
-      raise ValueError(
-          'Found a sample_weight with shape' + str(sample_weight.shape) + '.'
-          'Expected sample_weight with rank '
-          'less than or equal to ' + str(len(y.shape)))
+      raise ValueError('Found a sample_weight with shape' +
+                       str(sample_weight.shape) + '.'
+                       'Expected sample_weight with rank '
+                       'less than or equal to ' + str(len(y.shape)))
 
     if (not tensor_util.is_tensor(sample_weight) and
         y.shape[:sample_weight.ndim] != sample_weight.shape):
-      raise ValueError(
-          'Found a sample_weight array with shape ' + str(sample_weight.shape) +
-          ' for an input with shape ' + str(y.shape) + '. '
-          'sample_weight cannot be broadcast.')
-    return sample_weight
-  elif isinstance(class_weight, dict):
+      raise ValueError('Found a sample_weight array with shape ' +
+                       str(sample_weight.shape) + ' for an input with shape ' +
+                       str(y.shape) + '. '
+                       'sample_weight cannot be broadcast.')
+
+  # Class weights applied per-sample.
+  class_sample_weight = None
+  if isinstance(class_weight, dict):
     if len(y.shape) > 2:
       raise ValueError('`class_weight` not supported for '
                        '3+ dimensional targets.')
-    if y.shape[1] > 1:
-      y_classes = np.argmax(y, axis=1)
-    elif y.shape[1] == 1:
-      y_classes = np.reshape(y, y.shape[0])
+
+    if len(y.shape) == 2:
+      if y.shape[1] > 1:
+        y_classes = np.argmax(y, axis=1)
+      elif y.shape[1] == 1:
+        y_classes = np.reshape(y, y.shape[0])
     else:
       y_classes = y
 
-    weights = np.asarray(
+    class_sample_weight = np.asarray(
         [class_weight[cls] for cls in y_classes if cls in class_weight])
 
-    if len(weights) != len(y_classes):
+    if len(class_sample_weight) != len(y_classes):
       # subtract the sets to pick all missing classes
       existing_classes = set(y_classes)
       existing_class_weight = set(class_weight.keys())
-      raise ValueError('`class_weight` must contain all classes in the data.'
-                       ' The classes %s exist in the data but not in '
-                       '`class_weight`.' %
-                       (existing_classes - existing_class_weight))
-    return weights
-  else:
-    return None
+      raise ValueError(
+          '`class_weight` must contain all classes in the data.'
+          ' The classes %s exist in the data but not in '
+          '`class_weight`.' % (existing_classes - existing_class_weight))
+
+  if class_sample_weight is not None and sample_weight is not None:
+    # Multiply weights if both are provided.
+    return class_sample_weight * sample_weight
+  if sample_weight is not None:
+    return sample_weight
+  if class_sample_weight is not None:
+    return class_sample_weight
+  return None
 
 
 def has_symbolic_tensors(ls):
@@ -797,21 +821,29 @@ def get_metric_name(metric, weighted=False):
   Returns:
       The metric name.
   """
-  metric_name_prefix = 'weighted_' if weighted else ''
-  if metric in ('accuracy', 'acc', 'crossentropy', 'ce'):
-    if metric in ('accuracy', 'acc'):
-      suffix = 'acc'
-    elif metric in ('crossentropy', 'ce'):
-      suffix = 'ce'
+  if tf2.enabled():
+    # We keep the string that the user has set in compile as the metric name.
+    if isinstance(metric, six.string_types):
+      return metric
+
+    metric = metrics_module.get(metric)
+    return metric.name if hasattr(metric, 'name') else metric.__name__
   else:
-    metric_fn = metrics_module.get(metric)
-    # Get metric name as string
-    if hasattr(metric_fn, 'name'):
-      suffix = metric_fn.name
+    metric_name_prefix = 'weighted_' if weighted else ''
+    if metric in ('accuracy', 'acc', 'crossentropy', 'ce'):
+      if metric in ('accuracy', 'acc'):
+        suffix = 'acc'
+      elif metric in ('crossentropy', 'ce'):
+        suffix = 'ce'
     else:
-      suffix = metric_fn.__name__
-  metric_name = metric_name_prefix + suffix
-  return metric_name
+      metric_fn = metrics_module.get(metric)
+      # Get metric name as string
+      if hasattr(metric_fn, 'name'):
+        suffix = metric_fn.name
+      else:
+        suffix = metric_fn.__name__
+    metric_name = metric_name_prefix + suffix
+    return metric_name
 
 
 def get_metric_function(metric, output_shape=None, loss_fn=None):
@@ -819,29 +851,41 @@ def get_metric_function(metric, output_shape=None, loss_fn=None):
 
   Arguments:
       metric: Metric function name or reference.
-      output_shape: The shape of the output that this metric
-          will be calculated for.
+      output_shape: The shape of the output that this metric will be calculated
+        for.
       loss_fn: The loss function used.
 
   Returns:
       The metric function.
   """
+  if metric not in ['accuracy', 'acc', 'crossentropy', 'ce']:
+    return metrics_module.get(metric)
+
+  is_sparse_categorical_crossentropy = (
+      isinstance(loss_fn, losses.SparseCategoricalCrossentropy) or
+      (isinstance(loss_fn, losses.LossFunctionWrapper) and
+       loss_fn.fn == losses.sparse_categorical_crossentropy))
+
+  is_binary_crossentropy = (
+      isinstance(loss_fn, losses.BinaryCrossentropy) or
+      (isinstance(loss_fn, losses.LossFunctionWrapper) and
+       loss_fn.fn == losses.binary_crossentropy))
+
   if metric in ['accuracy', 'acc']:
-    if output_shape[-1] == 1 or loss_fn == losses.binary_crossentropy:
-      return metrics_module.binary_accuracy  # case: binary accuracy
-    elif loss_fn == losses.sparse_categorical_crossentropy:
-      # case: categorical accuracy with sparse targets
+    if output_shape[-1] == 1 or is_binary_crossentropy:
+      return metrics_module.binary_accuracy
+    elif is_sparse_categorical_crossentropy:
       return metrics_module.sparse_categorical_accuracy
-    return metrics_module.categorical_accuracy  # case: categorical accuracy
-  elif metric in ['crossentropy', 'ce']:
-    if output_shape[-1] == 1 or loss_fn == losses.binary_crossentropy:
-      return metrics_module.binary_crossentropy  # case: binary cross-entropy
-    elif loss_fn == losses.sparse_categorical_crossentropy:
-      # case: categorical cross-entropy with sparse targets
+    # If the output_shape[-1] is not 1, then we know output is `categorical`.
+    # We assume it is sparse categorical only if loss is explicitly given
+    # as sparse categorical crossentropy loss.
+    return metrics_module.categorical_accuracy
+  else:
+    if output_shape[-1] == 1 or is_binary_crossentropy:
+      return metrics_module.binary_crossentropy
+    elif is_sparse_categorical_crossentropy:
       return metrics_module.sparse_categorical_crossentropy
-    # case: categorical cross-entropy
     return metrics_module.categorical_crossentropy
-  return metrics_module.get(metric)
 
 
 def call_metric_function(metric_fn, y_true, y_pred, weights=None, mask=None):
@@ -865,25 +909,33 @@ def get_loss_function(loss):
   if loss is None or isinstance(loss, losses.Loss):
     return loss
 
-  # TODO(psv): After we have added all V2 losses, update this function.
-  if loss in ['mse', 'MSE', 'mean_squared_error']:
-    return losses.MeanSquaredError()
-  return losses.get(loss)
+  # Deserialize loss configuration, if needed.
+  if isinstance(loss, collections.Mapping):
+    loss = losses.get(loss)
+
+  # Custom callable class.
+  if callable(loss) and not hasattr(loss, '__name__'):
+    return loss
+
+  # Wrap loss function with signature `(y_true, y_pred, **kwargs)`
+  # in `LossFunctionWrapper` class.
+  loss_fn = losses.get(loss)
+  return losses.LossFunctionWrapper(loss_fn, name=loss_fn.__name__)
 
 
-def validate_iterator_input(x, y, sample_weight, validation_split=None):
+def validate_dataset_input(x, y, sample_weight, validation_split=None):
   """Validates user input arguments when a dataset iterator is passed.
 
   Arguments:
-    x: Input data. A `tf.data` dataset iterator.
+    x: Input data. A `tf.data` dataset or iterator.
     y: Target data. It could be either Numpy array(s) or TensorFlow tensor(s).
-        Expected to be `None` when `x` is a dataset iterator.
-    sample_weight: An optional sample-weight array passed by the user to
-        weight the importance of each sample in `x`. Expected to be `None` when
-        `x` is a dataset iterator
-    validation_split: Float between 0 and 1. Fraction of the training data to
-        be used as validation data. Expected to be `None` when `x` is a dataset
-        iterator.
+      Expected to be `None` when `x` is a dataset iterator.
+    sample_weight: An optional sample-weight array passed by the user to weight
+      the importance of each sample in `x`. Expected to be `None` when `x` is a
+      dataset iterator
+    validation_split: Float between 0 and 1. Fraction of the training data to be
+      used as validation data. Expected to be `None` when `x` is a dataset
+      iterator.
 
   Raises:
     ValueError: if argument `y` or `sample_weight` or `validation_split` are
@@ -909,7 +961,8 @@ def validate_iterator_input(x, y, sample_weight, validation_split=None):
         'Received: x=%s, validation_split=%f' % (x, validation_split))
 
 
-def check_generator_arguments(y=None, sample_weight=None):
+def check_generator_arguments(y=None, sample_weight=None,
+                              validation_split=None):
   """Validates arguments passed when using a generator."""
   if y is not None:
     raise ValueError('`y` argument is not supported when data is'
@@ -919,6 +972,9 @@ def check_generator_arguments(y=None, sample_weight=None):
     raise ValueError('`sample_weight` argument is not supported when data is'
                      'a generator or Sequence instance. Instead pass sample'
                      ' weights as the third element of the generator.')
+  if validation_split:
+    raise ValueError('If your data is in the form of a Python generator, '
+                     'you cannot use `validation_split`.')
 
 
 def check_steps_argument(input_data, steps, steps_name):
@@ -944,15 +1000,13 @@ def check_steps_argument(input_data, steps, steps_name):
       ValueError: if `steps` argument is required for given input data type
         but not provided.
   """
-
-  is_x_iterator = (
-      isinstance(input_data, iterator_ops.Iterator) or
-      isinstance(input_data, iterator_ops.EagerIterator))
-
+  # TODO(fchollet): allow datasets with steps=None if cardinality is known.
+  is_x_iterator = isinstance(
+      input_data, (iterator_ops.Iterator, iterator_ops.EagerIterator))
   if (input_data is None or is_x_iterator or has_symbolic_tensors(input_data) or
       (isinstance(input_data, list) and not input_data)):
     if steps is None:
-      input_type_str = 'iterators' if is_x_iterator else 'data tensors'
+      input_type_str = 'a Dataset iterator' if is_x_iterator else 'data tensors'
       raise ValueError('When using {input_type} as input to a model, you should'
                        ' specify the `{steps_name}` argument.'.format(
                            input_type=input_type_str, steps_name=steps_name))
@@ -1066,6 +1120,95 @@ def prepare_sample_weights(output_names, sample_weight_mode,
   return sample_weights, sample_weight_modes
 
 
+def prepare_loss_functions(loss, output_names):
+  """Converts loss to a list of loss functions.
+
+  Arguments:
+      loss: String (name of objective function), objective function or
+        `tf.losses.Loss` instance. See `tf.losses`. If the model has multiple
+        outputs, you can use a different loss on each output by passing a
+        dictionary or a list of losses. The loss value that will be minimized by
+        the model will then be the sum of all individual losses.
+      output_names: List of model output names.
+
+  Returns:
+      A list of loss objective functions.
+
+  Raises:
+      ValueError: If loss is a dict with keys not in model output names,
+          or if loss is a list with len not equal to model outputs.
+  """
+  if isinstance(loss, collections.Mapping):
+    for name in loss:
+      if name not in output_names:
+        raise ValueError('Unknown entry in loss dictionary: {}. Only expected '
+                         'following keys: {}'.format(name, output_names))
+    loss_functions = []
+    for name in output_names:
+      if name not in loss:
+        logging.warning(
+            'Output {0} missing from loss dictionary. We assume '
+            'this was done on purpose. The fit and evaluate APIs will not be '
+            'expecting any data to be passed to {0}.'.format(name))
+      loss_functions.append(get_loss_function(loss.get(name, None)))
+  elif isinstance(loss, six.string_types):
+    loss_functions = [get_loss_function(loss) for _ in output_names]
+  elif isinstance(loss, collections.Sequence):
+    if len(loss) != len(output_names):
+      raise ValueError('When passing a list as loss, it should have one entry '
+                       'per model outputs. The model has {} outputs, but you '
+                       'passed loss={}'.format(len(output_names), loss))
+    loss_functions = nest.map_structure(get_loss_function, loss)
+  else:
+    loss_functions = [get_loss_function(loss) for _ in range(len(output_names))]
+
+  return loss_functions
+
+
+def prepare_loss_weights(output_names, loss_weights=None):
+  """Converts loss weights to a list of loss weights.
+
+  Arguments:
+      output_names: List of model output names.
+      loss_weights: Optional list or dictionary specifying scalar coefficients
+        (Python floats) to weight the loss contributions of different model
+        outputs. The loss value that will be minimized by the model will then be
+        the *weighted sum* of all individual losses, weighted by the
+          `loss_weights` coefficients. If a list, it is expected to have a 1:1
+            mapping to the model's outputs. If a dict, it is expected to map
+            output names (strings) to scalar coefficients.
+
+  Returns:
+      A list of loss weights of python floats.
+
+  Raises:
+      ValueError: If loss weight is a dict with key not in model output names,
+          or if loss is a list with len not equal to model outputs.
+  """
+  if loss_weights is None:
+    weights_list = [1.] * len(output_names)
+  elif isinstance(loss_weights, dict):
+    for name in loss_weights:
+      if name not in output_names:
+        raise ValueError('Unknown entry in loss_weights dictionary: {}. '
+                         'Only expected the following keys: {}'.format(
+                             name, output_names))
+    weights_list = [loss_weights.get(name, 1.) for name in output_names]
+  elif isinstance(loss_weights, list):
+    if len(loss_weights) != len(output_names):
+      raise ValueError('When passing a list as loss_weights, '
+                       'it should have one entry per model output. '
+                       'The model has ' + str(len(output_names)) +
+                       ' outputs, but you passed loss_weights=' +
+                       str(loss_weights))
+    weights_list = loss_weights
+  else:
+    raise TypeError('Could not interpret loss_weights argument: ' +
+                    str(loss_weights) + ' - expected a list of dicts.')
+
+  return weights_list
+
+
 # TODO(rohanj): This is a hack to get around not depending on feature_column and
 # create a cyclical dependency. Figure out a cleaner solution
 def is_feature_layer(layer):
@@ -1073,6 +1216,255 @@ def is_feature_layer(layer):
   return getattr(layer, '_is_feature_layer', False)
 
 
+def is_eager_dataset_or_iterator(data):
+  return context.executing_eagerly() and isinstance(
+      data, (dataset_ops.DatasetV1, dataset_ops.DatasetV2,
+             iterator_ops.EagerIterator))
+
+
+# pylint: disable=protected-access
+def assert_not_batched(dataset):
+  """Asserts that `dataset` is not batched.
+
+  The algorithm used by this method is sound but not complete. In other words,
+  if the method fails to establish the assertion, it does not mean the dataset
+  is batched.
+
+  Example usage:
+  ```python
+  try:
+    assert_not_batched(dataset)
+    # safe to assume `dataset` it not batched here
+  expect ValueError:
+    # make no assumptions about `dataset`
+  ```
+
+  Args:
+    dataset: The dataset to analyze.
+
+  Raises:
+    ValueError: If the method cannot establish the assertion.
+  """
+  if isinstance(dataset, dataset_ops.DatasetV1Adapter):
+    return assert_not_batched(dataset._dataset)
+  else:
+    whitelisted_types = [
+        dataset_ops._OptionsDataset,
+        dataset_ops.ConcatenateDataset,
+        dataset_ops.CacheDataset,
+        dataset_ops.FilterDataset,
+        dataset_ops.MapDataset,
+        dataset_ops.ParallelMapDataset,
+        dataset_ops.PrefetchDataset,
+        dataset_ops.RangeDataset,
+        dataset_ops.RepeatDataset,
+        dataset_ops.ShuffleDataset,
+        dataset_ops.SkipDataset,
+        dataset_ops.SparseTensorSliceDataset,
+        dataset_ops.TakeDataset,
+        dataset_ops.TensorDataset,
+        dataset_ops.TensorSliceDataset,
+        dataset_ops.ZipDataset,
+        readers.FixedLengthRecordDatasetV2,
+        readers.TextLineDatasetV2,
+        readers.TFRecordDatasetV2,
+    ]
+    for ty in whitelisted_types:
+      if isinstance(dataset, ty):
+        for input_dataset in dataset._inputs():
+          assert_not_batched(input_dataset)
+        return
+    raise ValueError('Could not assert that dataset is not batched.')
+
+
+# pylint: disable=protected-access
+def assert_not_shuffled(dataset):
+  """Asserts that `dataset` is not shuffled.
+
+  The algorithm used by this method is sound but not complete. In other words,
+  if the method fails to establish the assertion, it does not mean the dataset
+  is shuffled.
+
+  Example usage:
+  ```python
+  try:
+    assert_not_shuffled(dataset)
+    # safe to assume `dataset` it not shuffled here
+  expect ValueError:
+    # make no assumptions about `dataset`
+  ```
+
+  Args:
+    dataset: The dataset to analyze.
+
+  Raises:
+    ValueError: If the method cannot establish the assertion.
+  """
+  if isinstance(dataset, dataset_ops.DatasetV1Adapter):
+    return assert_not_shuffled(dataset._dataset)
+  else:
+    whitelisted_types = [
+        dataset_ops._OptionsDataset,
+        dataset_ops.BatchDataset,
+        dataset_ops.ConcatenateDataset,
+        dataset_ops.CacheDataset,
+        dataset_ops.FilterDataset,
+        dataset_ops.MapDataset,
+        dataset_ops.PaddedBatchDataset,
+        dataset_ops.ParallelMapDataset,
+        dataset_ops.PrefetchDataset,
+        dataset_ops.RangeDataset,
+        dataset_ops.RepeatDataset,
+        dataset_ops.SkipDataset,
+        dataset_ops.SparseTensorSliceDataset,
+        dataset_ops.TakeDataset,
+        dataset_ops.TensorDataset,
+        dataset_ops.TensorSliceDataset,
+        dataset_ops.WindowDataset,
+        dataset_ops.ZipDataset,
+        readers.FixedLengthRecordDatasetV2,
+        readers.TextLineDatasetV2,
+        readers.TFRecordDatasetV2,
+    ]
+    for ty in whitelisted_types:
+      if isinstance(dataset, ty):
+        for input_dataset in dataset._inputs():
+          assert_not_shuffled(input_dataset)
+        return
+    raise ValueError('Could not assert that dataset is not shuffled.')
+
+
+def verify_dataset_shuffled(x):
+  """Verifies that the dataset is shuffled.
+
+  Args:
+    x: Dataset passed as an input to the model.
+
+  Raises:
+    ValueError: if the dataset is not already shuffled.
+  """
+  assert isinstance(x, dataset_ops.DatasetV2)
+  try:
+    assert_not_shuffled(x)
+  except ValueError:
+    # Dataset may or may not be shuffled.
+    return
+  else:
+    logging.warning('Expected a shuffled dataset but input dataset `x` is '
+                    'not shuffled. Please invoke `shuffle()` on input dataset.')
+
+
+def is_dataset_or_iterator(data):
+  return isinstance(data, (dataset_ops.DatasetV1, dataset_ops.DatasetV2,
+                           iterator_ops.EagerIterator, iterator_ops.Iterator))
+
+
+def get_iterator(dataset):
+  """Create and initialize an iterator from a dataset."""
+  iterator = dataset_ops.make_initializable_iterator(dataset)
+  initialize_iterator(iterator)
+  return iterator
+
+
+def initialize_iterator(iterator):
+  init_op = iterator.initializer
+  if not context.executing_eagerly():
+    K.get_session((init_op,)).run(init_op)
+
+
+def extract_tensors_from_dataset(dataset):
+  """Extract a tuple of tensors `inputs, targets, sample_weight` from a dataset.
+
+  Arguments:
+    dataset: Dataset instance.
+
+  Returns:
+    Tuple of tensors `x, y, weights`. `y` and `weights` entry may be None.
+  """
+  iterator = get_iterator(dataset)
+  inputs, targets, sample_weight = unpack_iterator_input(iterator)
+  return inputs, targets, sample_weight
+
+
+def unpack_iterator_input(iterator):
+  """Convert a dataset iterator to a tuple of tensors `x, y, sample_weights`.
+
+  Arguments:
+    iterator: Instance of a dataset iterator.
+
+  Returns:
+    Tuple of tensors `x, y, weights`. `y` and `weights` entry may be None.
+  """
+  try:
+    next_element = iterator.get_next()
+  except errors.OutOfRangeError:
+    raise RuntimeError('Your dataset iterator ran out of data; '
+                       'Make sure that your dataset can generate '
+                       'required number of samples.')
+
+  if isinstance(next_element, (list, tuple)):
+    if len(next_element) not in [2, 3]:
+      raise ValueError(
+          'Please provide model inputs as a list or tuple of 2 or 3 '
+          'elements: (input, target) or (input, target, sample_weights) '
+          'Received %s' % next_element)
+    if len(next_element) == 2:
+      x, y = next_element
+      weights = None
+    else:
+      x, y, weights = next_element
+  else:
+    x = next_element
+    y = None
+    weights = None
+  return x, y, weights
+
+
+def infer_steps_for_dataset(dataset, steps, epochs=1, steps_name='steps'):
+  """Infers steps_per_epoch needed to loop through a dataset.
+
+  Arguments:
+      dataset: Input data of type tf.data.Dataset.
+      steps: Number of steps to draw from the dataset (may be None if unknown).
+      epochs: Number of times to iterate over the dataset.
+      steps_name: The string name of the steps argument, either `steps`,
+        `validation_steps`, or `steps_per_epoch`. Only used for error message
+        formatting.
+
+  Returns:
+    Integer or `None`. Inferred number of steps to loop through the dataset.
+    `None` is returned if the size of the dataset is unknown and `steps` was
+    not specified.
+
+  Raises:
+    ValueError: In case of invalid argument values.
+  """
+  assert isinstance(dataset, dataset_ops.DatasetV2)
+  size = K.get_value(cardinality.cardinality(dataset))
+  if size == cardinality.INFINITE and steps is None:
+    raise ValueError('When passing an infinitely repeating dataset, you '
+                     'must specify the `%s` argument.' % (steps_name,))
+  if size >= 0:
+    if steps is not None and steps * epochs > size:
+      if epochs > 1:
+        raise ValueError('The dataset you passed contains %s batches, but you '
+                         'passed `epochs=%s` and `%s=%s`, which is a total of '
+                         '%s steps. We cannot draw that many steps from this '
+                         'dataset. We suggest to set `%s=%s`.' %
+                         (size, epochs, steps_name, steps, steps * epochs,
+                          steps_name, size // epochs))
+      else:
+        raise ValueError('The dataset you passed contains %s batches, but you '
+                         'passed `%s=%s`. We cannot draw that many steps from '
+                         'this dataset. We suggest to set `%s=%s`.' %
+                         (size, steps_name, steps, steps_name, size))
+  if steps is None:
+    if size >= 0:
+      return size
+    return None
+  return steps
+
+
 class ModelInputs(object):
   """Encapsulates model inputs.
 
@@ -1107,6 +1499,9 @@ class ModelInputs(object):
 
   def get_symbolic_inputs(self, return_single_as_list=False):
     """Returns inputs to be set as self.inputs for a model."""
+    # TODO(karmel): There is a side-effect here where what you get
+    # with as_list and as_dict depends on whether you have called this
+    # method first, since it modifies in place.
     for i in range(len(self._flattened_inputs)):
       k = self._input_names[i]
       v = self._flattened_inputs[i]
@@ -1114,16 +1509,21 @@ class ModelInputs(object):
         v = np.asarray(v)
         if v.ndim == 1:
           v = np.expand_dims(v, 1)
+
       if isinstance(v, (np.ndarray, ops.EagerTensor)):
         # We fix the placeholder shape except the batch size.
         # This is suboptimal, but it is the best we can do with the info
         # we have. The user should call `model._set_inputs(placeholders)`
         # to specify custom placeholders if the need arises.
         shape = (None,) + tuple(v.shape[1:])
-        v = K.placeholder(shape=shape, name=k)
+        dtype = dtypes.as_dtype(v.dtype)
+        if dtype.is_floating:
+          dtype = K.floatx()
+        v = K.placeholder(shape=shape, name=k, dtype=dtype)
       elif isinstance(v, tensor_shape.TensorShape):
         shape = (None,) + tuple(v.as_list()[1:])
         v = K.placeholder(shape=shape, name=k)
+
       self._flattened_inputs[i] = v
 
     if self._is_dict:
@@ -1155,7 +1555,7 @@ def get_input_shape_and_dtype(layer):
       does not have a defined input shape.
 
   Raises:
-    ValueError: in case an empty Sequential or Graph Network is passed.
+    ValueError: in case an empty Sequential or Functional model is passed.
   """
 
   def _is_graph_model(layer):
@@ -1191,3 +1591,78 @@ def get_static_batch_size(layer):
   if batch_input_shape is not None:
     return tensor_shape.as_dimension(batch_input_shape[0]).value
   return None
+
+
+def generic_output_names(outputs_list):
+  return ['output_%d' % (i + 1) for i in range(len(outputs_list))]
+
+
+def set_run_eagerly_for_dict_structure(model, x):
+  """Set model.run_eagerly to true if x is dict structure.
+
+  Set model.run_eagerly to true if x is dict or
+  Iterator/EagerIterator/Dataset of dict.
+
+  Args:
+    model: A Keras model.
+    x: Input data.
+  """
+  if not context.executing_eagerly():
+    return
+  if isinstance(x, dict):
+    model.run_eagerly = True
+  if (isinstance(x, (iterator_ops.Iterator, iterator_ops.EagerIterator,
+                     dataset_ops.DatasetV2))):
+    for item in x.output_shapes:
+      if isinstance(item, dict):
+        model.run_eagerly = True
+        return
+
+
+def convert_eager_tensors_to_numpy(structure):
+  """Convert every EagerTensor in `structure` to NumPy.
+
+  Arguments:
+    structure: An arbitrary structure of elements to be converted to NumPy
+      arrays.
+
+  Returns:
+    An identical structure with EagerTensors converted to NumPy arrays.
+  """
+
+  def _convert(element):
+    if isinstance(element, ops.EagerTensor):
+      return element.numpy()
+    return element
+
+  return nest.map_structure(_convert, structure)
+
+
+def should_run_validation(validation_freq, epoch):
+  """Checks if validation should be run this epoch.
+
+  Arguments:
+    validation_freq: Integer or list. If an integer, specifies how many training
+      epochs to run before a new validation run is performed. If a list,
+      specifies the epochs on which to run validation.
+    epoch: Integer, the number of the training epoch just completed.
+
+  Returns:
+    Bool, True if validation should be run.
+
+  Raises:
+    ValueError: if `validation_freq` is an Integer and less than 1, or if
+    it is neither an Integer nor a Sequence.
+  """
+  # `epoch` is 0-indexed internally but 1-indexed in the public API.
+  one_indexed_epoch = epoch + 1
+
+  if isinstance(validation_freq, int):
+    if validation_freq < 1:
+      raise ValueError('`validation_freq` can not be less than 1.')
+    return one_indexed_epoch % validation_freq == 0
+
+  if not isinstance(validation_freq, collections.Container):
+    raise ValueError('`validation_freq` must be an Integer or '
+                     '`collections.Container` (e.g. list, tuple, etc.)')
+  return one_indexed_epoch in validation_freq
diff --git a/tensorflow/python/keras/engine/training_utils_test.py b/tensorflow/python/keras/engine/training_utils_test.py
index 44ea23998fe6f3b614fb09b9667add179cf3fd85..30e3d1f3e4029ce837647352f19fd70486d7bf1a 100644
--- a/tensorflow/python/keras/engine/training_utils_test.py
+++ b/tensorflow/python/keras/engine/training_utils_test.py
@@ -18,13 +18,22 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
 import numpy as np
 
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import readers
 from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_util
+from tensorflow.python.keras import backend
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging as logging
 
 
 class ModelInputsTest(test.TestCase):
@@ -38,10 +47,11 @@ class ModelInputsTest(test.TestCase):
     vals = model_inputs.get_symbolic_inputs(return_single_as_list=True)
     self.assertEqual(1, len(vals))
     self.assertTrue(tensor_util.is_tensor(vals[0]))
+    self.assertEqual(backend.floatx(), vals[0].dtype)
 
   def test_single_thing_eager(self):
     with context.eager_mode():
-      a = np.ones(10)
+      a = np.ones(10, dtype=np.int32)
       model_inputs = training_utils.ModelInputs(a)
       self.assertEqual(['input_1'], model_inputs.get_input_names())
       val = model_inputs.get_symbolic_inputs()
@@ -49,6 +59,7 @@ class ModelInputsTest(test.TestCase):
       vals = model_inputs.get_symbolic_inputs(return_single_as_list=True)
       self.assertEqual(1, len(vals))
       self.assertTrue(tf_utils.is_symbolic_tensor(vals[0]))
+      self.assertEqual(dtypes.int32, vals[0].dtype)
 
   def test_list(self):
     a = [np.ones(10), np.ones(20)]
@@ -85,5 +96,159 @@ class ModelInputsTest(test.TestCase):
       self.assertTrue(tf_utils.is_symbolic_tensor(vals['b']))
 
 
+class DatasetUtilsTest(test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      # pylint: disable=g-long-lambda
+      ('Batch', lambda: dataset_ops.Dataset.range(5).batch(2), ValueError),
+      ('Cache', lambda: dataset_ops.Dataset.range(5).cache()),
+      ('Concatenate', lambda: dataset_ops.Dataset.range(5).concatenate(
+          dataset_ops.Dataset.range(5))),
+      ('FlatMap', lambda: dataset_ops.Dataset.range(5).flat_map(
+          lambda _: dataset_ops.Dataset.from_tensors(0)), ValueError),
+      ('Filter', lambda: dataset_ops.Dataset.range(5).filter(lambda _: True)),
+      ('FixedLengthRecordDatasetV2',
+       lambda: readers.FixedLengthRecordDatasetV2([], 42)),
+      ('FromTensors', lambda: dataset_ops.Dataset.from_tensors(0)),
+      ('FromTensorSlices',
+       lambda: dataset_ops.Dataset.from_tensor_slices([0, 0, 0])),
+      ('Interleave', lambda: dataset_ops.Dataset.range(5).interleave(
+          lambda _: dataset_ops.Dataset.from_tensors(0), cycle_length=1),
+       ValueError),
+      ('ParallelInterleave', lambda: dataset_ops.Dataset.range(5).interleave(
+          lambda _: dataset_ops.Dataset.from_tensors(0),
+          cycle_length=1,
+          num_parallel_calls=1), ValueError),
+      ('Map', lambda: dataset_ops.Dataset.range(5).map(lambda x: x)),
+      ('Options',
+       lambda: dataset_ops.Dataset.range(5).with_options(dataset_ops.Options())
+      ),
+      ('PaddedBatch', lambda: dataset_ops.Dataset.range(5).padded_batch(2, []),
+       ValueError),
+      ('ParallelMap', lambda: dataset_ops.Dataset.range(5).map(
+          lambda x: x, num_parallel_calls=1)),
+      ('Prefetch', lambda: dataset_ops.Dataset.range(5).prefetch(1)),
+      ('Range', lambda: dataset_ops.Dataset.range(0)),
+      ('Repeat', lambda: dataset_ops.Dataset.range(0).repeat(0)),
+      ('Shuffle', lambda: dataset_ops.Dataset.range(5).shuffle(1)),
+      ('Skip', lambda: dataset_ops.Dataset.range(5).skip(2)),
+      ('Take', lambda: dataset_ops.Dataset.range(5).take(2)),
+      ('TextLineDataset', lambda: readers.TextLineDatasetV2([])),
+      ('TFRecordDataset', lambda: readers.TFRecordDatasetV2([])),
+      ('Window', lambda: dataset_ops.Dataset.range(5).window(2), ValueError),
+      ('Zip', lambda: dataset_ops.Dataset.zip(dataset_ops.Dataset.range(5))),
+      # pylint: enable=g-long-lambda
+  )
+  def test_assert_not_batched(self, dataset_fn, expected_error=None):
+    if expected_error is None:
+      training_utils.assert_not_batched(dataset_fn())
+    else:
+      with self.assertRaises(expected_error):
+        training_utils.assert_not_batched(dataset_fn())
+
+  @parameterized.named_parameters(
+      # pylint: disable=g-long-lambda
+      ('Batch', lambda: dataset_ops.Dataset.range(5).batch(2)),
+      ('Cache', lambda: dataset_ops.Dataset.range(5).cache()),
+      ('Concatenate', lambda: dataset_ops.Dataset.range(5).concatenate(
+          dataset_ops.Dataset.range(5))),
+      ('FlatMap', lambda: dataset_ops.Dataset.range(5).flat_map(
+          lambda _: dataset_ops.Dataset.from_tensors(0)), ValueError),
+      ('Filter', lambda: dataset_ops.Dataset.range(5).filter(lambda _: True)),
+      ('FixedLengthRecordDatasetV2',
+       lambda: readers.FixedLengthRecordDatasetV2([], 42)),
+      ('FromTensors', lambda: dataset_ops.Dataset.from_tensors(0)),
+      ('FromTensorSlices',
+       lambda: dataset_ops.Dataset.from_tensor_slices([0, 0, 0])),
+      ('Interleave', lambda: dataset_ops.Dataset.range(5).interleave(
+          lambda _: dataset_ops.Dataset.from_tensors(0), cycle_length=1),
+       ValueError),
+      ('Map', lambda: dataset_ops.Dataset.range(5).map(lambda x: x)),
+      ('Options',
+       lambda: dataset_ops.Dataset.range(5).with_options(dataset_ops.Options())
+      ),
+      ('PaddedBatch', lambda: dataset_ops.Dataset.range(5).padded_batch(2, [])),
+      ('ParallelInterleave', lambda: dataset_ops.Dataset.range(5).interleave(
+          lambda _: dataset_ops.Dataset.from_tensors(0),
+          cycle_length=1,
+          num_parallel_calls=1), ValueError),
+      ('ParallelMap', lambda: dataset_ops.Dataset.range(5).map(
+          lambda x: x, num_parallel_calls=1)),
+      ('Prefetch', lambda: dataset_ops.Dataset.range(5).prefetch(1)),
+      ('Range', lambda: dataset_ops.Dataset.range(0)),
+      ('Repeat', lambda: dataset_ops.Dataset.range(0).repeat(0)),
+      ('Shuffle', lambda: dataset_ops.Dataset.range(5).shuffle(1), ValueError),
+      ('Skip', lambda: dataset_ops.Dataset.range(5).skip(2)),
+      ('Take', lambda: dataset_ops.Dataset.range(5).take(2)),
+      ('TextLineDataset', lambda: readers.TextLineDatasetV2([])),
+      ('TFRecordDataset', lambda: readers.TFRecordDatasetV2([])),
+      ('Window', lambda: dataset_ops.Dataset.range(5).window(2)),
+      ('Zip', lambda: dataset_ops.Dataset.zip(dataset_ops.Dataset.range(5))),
+      # pylint: enable=g-long-lambda
+  )
+  def test_assert_not_shuffled(self, dataset_fn, expected_error=None):
+    if expected_error is None:
+      training_utils.assert_not_shuffled(dataset_fn())
+    else:
+      with self.assertRaises(expected_error):
+        training_utils.assert_not_shuffled(dataset_fn())
+
+  def test_verify_dataset_shuffled(self):
+    dataset = dataset_ops.Dataset.range(5)
+    training_utils.assert_not_shuffled(dataset)
+
+    with test.mock.patch.object(logging, 'warning') as mock_log:
+      training_utils.verify_dataset_shuffled(dataset)
+      self.assertRegexpMatches(
+          str(mock_log.call_args),
+          'input dataset `x` is not shuffled.')
+
+    shuffled_dataset = dataset.shuffle(10)
+    training_utils.verify_dataset_shuffled(shuffled_dataset)
+
+
+class StandardizeWeightsTest(keras_parameterized.TestCase):
+
+  def test_sample_weights(self):
+    y = np.array([0, 1, 0, 0, 2])
+    sample_weights = np.array([0.5, 1., 1., 0., 2.])
+    weights = training_utils.standardize_weights(y, sample_weights)
+    self.assertAllClose(weights, sample_weights)
+
+  def test_class_weights(self):
+    y = np.array([0, 1, 0, 0, 2])
+    class_weights = {0: 0.5, 1: 1., 2: 1.5}
+    weights = training_utils.standardize_weights(y, class_weight=class_weights)
+    self.assertAllClose(weights, np.array([0.5, 1., 0.5, 0.5, 1.5]))
+
+  def test_sample_weights_and_class_weights(self):
+    y = np.array([0, 1, 0, 0, 2])
+    sample_weights = np.array([0.5, 1., 1., 0., 2.])
+    class_weights = {0: 0.5, 1: 1., 2: 1.5}
+    weights = training_utils.standardize_weights(y, sample_weights,
+                                                 class_weights)
+    expected = sample_weights * np.array([0.5, 1., 0.5, 0.5, 1.5])
+    self.assertAllClose(weights, expected)
+
+  def test_dataset_with_class_weight(self):
+    model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
+    model.compile('rmsprop', 'mse')
+
+    inputs = np.zeros((10, 3), np.float32)
+    targets = np.zeros((10, 4), np.float32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+    dataset = dataset.repeat(100)
+    dataset = dataset.batch(10)
+    class_weight_np = np.array([0.25, 0.25, 0.25, 0.25])
+    class_weight = dict(enumerate(class_weight_np))
+
+    model.fit(
+        dataset,
+        epochs=1,
+        steps_per_epoch=2,
+        verbose=1,
+        class_weight=class_weight)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/estimator/__init__.py b/tensorflow/python/keras/estimator/__init__.py
index dcd0600897005f1905b5f6b65cdc0f225172fa1b..f0e04c066d6bf072be7face0d28f03552c519c3a 100644
--- a/tensorflow/python/keras/estimator/__init__.py
+++ b/tensorflow/python/keras/estimator/__init__.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 # Keras has undeclared dependency on tensorflow/estimator:estimator_py.
 # As long as you depend //third_party/py/tensorflow:tensorflow target
@@ -26,7 +26,7 @@ from tensorflow.python.util.tf_export import tf_export
 
 
 # LINT.IfChange
-@tf_export('keras.estimator.model_to_estimator')
+@keras_export('keras.estimator.model_to_estimator')
 def model_to_estimator(
     keras_model=None,
     keras_model_path=None,
@@ -72,6 +72,6 @@ def model_to_estimator(
       model_dir=model_dir,
       config=config)
 
-# LINT.ThenChange(//third_party/tensorflow_estimator/python/estimator/keras.py)
+# LINT.ThenChange(//tensorflow_estimator/python/estimator/keras.py)
 
 
diff --git a/tensorflow/python/keras/initializers.py b/tensorflow/python/keras/initializers.py
index cac78c44ca4503810a2bbbca27d38b7cde30affe..ac55ff965e693905407a534f083c8fab3f679c21 100644
--- a/tensorflow/python/keras/initializers.py
+++ b/tensorflow/python/keras/initializers.py
@@ -42,12 +42,30 @@ from tensorflow.python.ops.init_ops import RandomUniform as TFRandomUniform
 from tensorflow.python.ops.init_ops import TruncatedNormal as TFTruncatedNormal
 from tensorflow.python.ops.init_ops import VarianceScaling  # pylint: disable=unused-import
 from tensorflow.python.ops.init_ops import Zeros
-
-from tensorflow.python.util.tf_export import tf_export
-
-
-@tf_export('keras.initializers.TruncatedNormal',
-           'keras.initializers.truncated_normal')
+# pylint: disable=unused-import, disable=line-too-long
+from tensorflow.python.ops.init_ops_v2 import Constant as ConstantV2
+from tensorflow.python.ops.init_ops_v2 import GlorotNormal as GlorotNormalV2
+from tensorflow.python.ops.init_ops_v2 import GlorotUniform as GlorotUniformV2
+from tensorflow.python.ops.init_ops_v2 import he_normal as he_normalV2
+from tensorflow.python.ops.init_ops_v2 import he_uniform as he_uniformV2
+from tensorflow.python.ops.init_ops_v2 import Identity as IdentityV2
+from tensorflow.python.ops.init_ops_v2 import Initializer as InitializerV2
+from tensorflow.python.ops.init_ops_v2 import lecun_normal as lecun_normalV2
+from tensorflow.python.ops.init_ops_v2 import lecun_uniform  as lecun_uniformV2
+from tensorflow.python.ops.init_ops_v2 import Ones as OnesV2
+from tensorflow.python.ops.init_ops_v2 import Orthogonal as OrthogonalV2
+from tensorflow.python.ops.init_ops_v2 import RandomNormal as RandomNormalV2
+from tensorflow.python.ops.init_ops_v2 import RandomUniform as RandomUniformV2
+from tensorflow.python.ops.init_ops_v2 import TruncatedNormal as TruncatedNormalV2
+from tensorflow.python.ops.init_ops_v2 import VarianceScaling as VarianceScalingV2
+from tensorflow.python.ops.init_ops_v2 import Zeros as ZerosV2
+# pylint: enable=unused-import, enable=line-too-long
+
+from tensorflow.python.util.tf_export import keras_export
+
+
+@keras_export(v1=['keras.initializers.TruncatedNormal',
+                  'keras.initializers.truncated_normal'])
 class TruncatedNormal(TFTruncatedNormal):
   """Initializer that generates a truncated normal distribution.
 
@@ -71,8 +89,9 @@ class TruncatedNormal(TFTruncatedNormal):
         mean=mean, stddev=stddev, seed=seed, dtype=dtype)
 
 
-@tf_export('keras.initializers.RandomUniform', 'keras.initializers.uniform',
-           'keras.initializers.random_uniform')
+@keras_export(v1=['keras.initializers.RandomUniform',
+                  'keras.initializers.uniform',
+                  'keras.initializers.random_uniform'])
 class RandomUniform(TFRandomUniform):
   """Initializer that generates tensors with a uniform distribution.
 
@@ -92,8 +111,9 @@ class RandomUniform(TFRandomUniform):
         minval=minval, maxval=maxval, seed=seed, dtype=dtype)
 
 
-@tf_export('keras.initializers.RandomNormal', 'keras.initializers.normal',
-           'keras.initializers.random_normal')
+@keras_export(v1=['keras.initializers.RandomNormal',
+                  'keras.initializers.normal',
+                  'keras.initializers.random_normal'])
 class RandomNormal(TFRandomNormal):
   """Initializer that generates tensors with a normal distribution.
 
@@ -133,12 +153,12 @@ glorot_uniform = GlorotUniform
 # Utility functions
 
 
-@tf_export('keras.initializers.serialize')
+@keras_export('keras.initializers.serialize')
 def serialize(initializer):
   return serialize_keras_object(initializer)
 
 
-@tf_export('keras.initializers.deserialize')
+@keras_export('keras.initializers.deserialize')
 def deserialize(config, custom_objects=None):
   return deserialize_keras_object(
       config,
@@ -147,7 +167,7 @@ def deserialize(config, custom_objects=None):
       printable_module_name='initializer')
 
 
-@tf_export('keras.initializers.get')
+@keras_export('keras.initializers.get')
 def get(identifier):
   if identifier is None:
     return None
diff --git a/tensorflow/python/keras/initializers_test.py b/tensorflow/python/keras/initializers_test.py
index 4f91bea1e331f0b52a4f34fc848b3d51509e1360..36f2d405326f4bb96027d8022545c585072dcc98 100644
--- a/tensorflow/python/keras/initializers_test.py
+++ b/tensorflow/python/keras/initializers_test.py
@@ -21,8 +21,8 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python import keras
-from tensorflow.python.ops import init_ops
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import init_ops
 from tensorflow.python.platform import test
 
 
@@ -67,6 +67,7 @@ class KerasInitializersTest(test.TestCase):
                    tensor_shape,
                    target_mean=0., target_max=2, target_min=-2)
 
+  @test_util.run_deprecated_v1
   def test_constant(self):
     tensor_shape = (5, 6, 4)
     with self.cached_session():
@@ -134,6 +135,7 @@ class KerasInitializersTest(test.TestCase):
       self._runner(keras.initializers.orthogonal(seed=123), tensor_shape,
                    target_mean=0.)
 
+  @test_util.run_deprecated_v1
   def test_identity(self):
     with self.cached_session():
       tensor_shape = (3, 4, 5)
@@ -145,28 +147,33 @@ class KerasInitializersTest(test.TestCase):
       self._runner(keras.initializers.identity(), tensor_shape,
                    target_mean=1. / tensor_shape[0], target_max=1.)
 
+  @test_util.run_deprecated_v1
   def test_zero(self):
     tensor_shape = (4, 5)
     with self.cached_session():
       self._runner(keras.initializers.zeros(), tensor_shape,
                    target_mean=0., target_max=0.)
 
+  @test_util.run_deprecated_v1
   def test_one(self):
     tensor_shape = (4, 5)
     with self.cached_session():
       self._runner(keras.initializers.ones(), tensor_shape,
                    target_mean=1., target_max=1.)
 
+  @test_util.run_deprecated_v1
   def test_default_random_uniform(self):
     ru = keras.initializers.get('uniform')
     self.assertEqual(ru.minval, -0.05)
     self.assertEqual(ru.maxval, 0.05)
 
+  @test_util.run_deprecated_v1
   def test_default_random_normal(self):
     rn = keras.initializers.get('normal')
     self.assertEqual(rn.mean, 0.0)
     self.assertEqual(rn.stddev, 0.05)
 
+  @test_util.run_deprecated_v1
   def test_default_truncated_normal(self):
     tn = keras.initializers.get('truncated_normal')
     self.assertEqual(tn.mean, 0.0)
diff --git a/tensorflow/python/keras/integration_test.py b/tensorflow/python/keras/integration_test.py
index c516514f63270a9507101209680c1be221ba3f99..7250db2f99bcc68ca562564ce798c9f9f7020c35 100644
--- a/tensorflow/python/keras/integration_test.py
+++ b/tensorflow/python/keras/integration_test.py
@@ -18,309 +18,234 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+
 import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import test_util
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
-from tensorflow.python.layers import core as tf_core_layers
-from tensorflow.python.ops import nn
+from tensorflow.python.ops import nn_ops as nn
 from tensorflow.python.ops import rnn_cell
 from tensorflow.python.platform import test
 
 
-class KerasIntegrationTest(test.TestCase):
-
-  def test_version(self):
-    self.assertTrue(keras.__version__.endswith('-tf'))
-
-  @test_util.run_v1_only('b/120545219')
-  def test_vector_classification_sequential(self):
-    with self.cached_session():
-      np.random.seed(1337)
-      (x_train, y_train), _ = testing_utils.get_test_data(
-          train_samples=100,
-          test_samples=0,
-          input_shape=(10,),
-          num_classes=2)
-      y_train = keras.utils.to_categorical(y_train)
-
-      model = keras.models.Sequential([
-          keras.layers.Dense(16,
-                             activation='relu',
-                             input_shape=x_train.shape[1:]),
-          keras.layers.Dropout(0.1),
-          keras.layers.Dense(y_train.shape[-1], activation='softmax')
-      ])
-      model.compile(loss='categorical_crossentropy',
-                    optimizer=keras.optimizers.Adam(lr=0.1),
-                    metrics=['accuracy'])
-      history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_train, y_train),
-                          verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.7)
-
-  @test_util.run_deprecated_v1
-  def test_vector_classification_functional(self):
-    with self.cached_session():
-      np.random.seed(1337)
-      (x_train, y_train), _ = testing_utils.get_test_data(
-          train_samples=100,
-          test_samples=0,
-          input_shape=(20,),
-          num_classes=2)
-      y_train = keras.utils.to_categorical(y_train)
-
-      inputs = keras.layers.Input(shape=x_train.shape[1:])
-      x = keras.layers.Dense(16, activation='relu')(inputs)
-      x = keras.layers.Dropout(0.1)(x)
-      outputs = keras.layers.Dense(y_train.shape[-1], activation='softmax')(x)
-
-      model = keras.models.Model(inputs, outputs)
-      model.compile(loss='categorical_crossentropy',
-                    optimizer=keras.optimizers.Adam(lr=0.1),
-                    metrics=['accuracy'])
-      history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_train, y_train),
-                          verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.7)
-
-  @test_util.run_deprecated_v1
-  def test_temporal_classification_sequential(self):
-    with self.cached_session():
-      np.random.seed(1337)
-      (x_train, y_train), _ = testing_utils.get_test_data(
-          train_samples=100,
-          test_samples=0,
-          input_shape=(4, 10),
-          num_classes=2)
-      y_train = keras.utils.to_categorical(y_train)
-
-      model = keras.models.Sequential()
-      model.add(keras.layers.LSTM(5, return_sequences=True,
-                                  input_shape=x_train.shape[1:]))
-      model.add(keras.layers.GRU(y_train.shape[-1], activation='softmax'))
-      model.compile(loss='categorical_crossentropy',
-                    optimizer=keras.optimizers.Adam(lr=0.1),
-                    metrics=['accuracy'])
-      history = model.fit(x_train, y_train, epochs=15, batch_size=16,
-                          validation_data=(x_train, y_train),
-                          verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.7)
-
-  @test_util.run_deprecated_v1
-  def test_temporal_classification_sequential_tf_rnn(self):
-    with self.cached_session():
-      np.random.seed(1337)
-      (x_train, y_train), _ = testing_utils.get_test_data(
-          train_samples=100,
-          test_samples=0,
-          input_shape=(4, 10),
-          num_classes=2)
-      y_train = keras.utils.to_categorical(y_train)
-
-      model = keras.models.Sequential()
-      model.add(keras.layers.RNN(rnn_cell.LSTMCell(5), return_sequences=True,
-                                 input_shape=x_train.shape[1:]))
-      model.add(keras.layers.RNN(rnn_cell.GRUCell(y_train.shape[-1],
-                                                  activation='softmax',
-                                                  dtype=dtypes.float32)))
-      model.compile(loss='categorical_crossentropy',
-                    optimizer=keras.optimizers.Adam(lr=0.1),
-                    metrics=['accuracy'])
-      history = model.fit(x_train, y_train, epochs=15, batch_size=16,
-                          validation_data=(x_train, y_train),
-                          verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.7)
-
-  @test_util.run_v1_only('b/120545219')
-  def test_image_classification_sequential(self):
-    with self.cached_session():
-      np.random.seed(1337)
-      (x_train, y_train), _ = testing_utils.get_test_data(
-          train_samples=100,
-          test_samples=0,
-          input_shape=(12, 12, 3),
-          num_classes=2)
-      y_train = keras.utils.to_categorical(y_train)
-
-      model = keras.models.Sequential()
-      model.add(keras.layers.Conv2D(
-          4, 3,
-          padding='same',
-          activation='relu',
-          input_shape=x_train.shape[1:]))
-      model.add(keras.layers.Conv2D(
-          8, 3,
-          padding='same',
-          activation='relu'))
-      model.add(keras.layers.Conv2D(
-          16, 3,
-          padding='same',
-          activation='relu'))
-      model.add(keras.layers.Flatten())
-      model.add(keras.layers.Dense(y_train.shape[-1], activation='softmax'))
-      model.compile(loss='categorical_crossentropy',
-                    optimizer=keras.optimizers.SGD(lr=0.01, momentum=0.8),
-                    metrics=['accuracy'])
-      history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_train, y_train),
-                          verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.7)
-
-  @test_util.run_v1_only('b/120545219')
-  def test_video_classification_functional(self):
-    with self.cached_session():
-      np.random.seed(1337)
-      (x_train, y_train), _ = testing_utils.get_test_data(
-          train_samples=100,
-          test_samples=0,
-          input_shape=(4, 8, 8, 3),
-          num_classes=3)
-      y_train = keras.utils.to_categorical(y_train)
-
-      inputs = keras.layers.Input(shape=x_train.shape[1:])
-      x = keras.layers.TimeDistributed(
-          keras.layers.Conv2D(4, 3, activation='relu'))(inputs)
-      x = keras.layers.BatchNormalization()(x)
-      x = keras.layers.TimeDistributed(keras.layers.GlobalMaxPooling2D())(x)
-      x = keras.layers.Conv1D(8, 3, activation='relu')(x)
-      x = keras.layers.Flatten()(x)
-      outputs = keras.layers.Dense(y_train.shape[-1], activation='softmax')(x)
-
-      model = keras.models.Model(inputs, outputs)
-      model.compile(loss='categorical_crossentropy',
-                    optimizer=keras.optimizers.SGD(lr=0.01, momentum=0.8),
-                    metrics=['accuracy'])
-      history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_train, y_train),
-                          verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.7)
+@keras_parameterized.run_with_all_model_types
+@keras_parameterized.run_all_keras_modes
+class VectorClassificationIntegrationTest(keras_parameterized.TestCase):
+
+  def test_vector_classification(self):
+    np.random.seed(1337)
+    (x_train, y_train), _ = testing_utils.get_test_data(
+        train_samples=100,
+        test_samples=0,
+        input_shape=(10,),
+        num_classes=2)
+    y_train = keras.utils.to_categorical(y_train)
+
+    model = testing_utils.get_model_from_layers(
+        [keras.layers.Dense(16, activation='relu'),
+         keras.layers.Dropout(0.1),
+         keras.layers.Dense(y_train.shape[-1], activation='softmax')],
+        input_shape=x_train.shape[1:])
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=keras.optimizer_v2.adam.Adam(0.005),
+        metrics=['acc'],
+        run_eagerly=testing_utils.should_run_eagerly())
+    history = model.fit(x_train, y_train, epochs=10, batch_size=10,
+                        validation_data=(x_train, y_train),
+                        verbose=2)
+    self.assertGreater(history.history['val_acc'][-1], 0.7)
+    _, val_acc = model.evaluate(x_train, y_train)
+    self.assertAlmostEqual(history.history['val_acc'][-1], val_acc)
+    predictions = model.predict(x_train)
+    self.assertEqual(predictions.shape, (x_train.shape[0], 2))
 
-  @test_util.run_v1_only('b/120545219')
-  def test_vector_classification_shared_sequential(self):
+  def test_vector_classification_shared_model(self):
     # Test that Sequential models that feature internal updates
     # and internal losses can be shared.
-    with self.cached_session():
-      np.random.seed(1337)
-      (x_train, y_train), _ = testing_utils.get_test_data(
-          train_samples=100,
-          test_samples=0,
-          input_shape=(10,),
-          num_classes=2)
-      y_train = keras.utils.to_categorical(y_train)
-
-      base_model = keras.models.Sequential([
-          keras.layers.Dense(16,
-                             activation='relu',
-                             kernel_regularizer=keras.regularizers.l2(1e-5),
-                             bias_regularizer=keras.regularizers.l2(1e-5),
-                             input_shape=x_train.shape[1:]),
-          keras.layers.BatchNormalization(),
-      ])
-      x = keras.layers.Input(x_train.shape[1:])
-      y = base_model(x)
-      y = keras.layers.Dense(y_train.shape[-1], activation='softmax')(y)
-      model = keras.models.Model(x, y)
-      model.compile(loss='categorical_crossentropy',
-                    optimizer=keras.optimizers.Adam(lr=0.1),
-                    metrics=['accuracy'])
+    np.random.seed(1337)
+    (x_train, y_train), _ = testing_utils.get_test_data(
+        train_samples=100,
+        test_samples=0,
+        input_shape=(10,),
+        num_classes=2)
+    y_train = keras.utils.to_categorical(y_train)
+
+    base_model = testing_utils.get_model_from_layers(
+        [keras.layers.Dense(16,
+                            activation='relu',
+                            kernel_regularizer=keras.regularizers.l2(1e-5),
+                            bias_regularizer=keras.regularizers.l2(1e-5)),
+         keras.layers.BatchNormalization()],
+        input_shape=x_train.shape[1:])
+    x = keras.layers.Input(x_train.shape[1:])
+    y = base_model(x)
+    y = keras.layers.Dense(y_train.shape[-1], activation='softmax')(y)
+    model = keras.models.Model(x, y)
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=keras.optimizer_v2.adam.Adam(0.005),
+        metrics=['acc'],
+        run_eagerly=testing_utils.should_run_eagerly())
+    if not testing_utils.should_run_eagerly():
       self.assertEqual(len(model.losses), 2)
       self.assertEqual(len(model.updates), 2)
-      history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_train, y_train),
-                          verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.7)
-
-  @test_util.run_v1_only('b/120545219')
-  def test_vector_classification_shared_model(self):
-    # Test that functional models that feature internal updates
-    # and internal losses can be shared.
-    with self.cached_session():
-      np.random.seed(1337)
-      (x_train, y_train), _ = testing_utils.get_test_data(
-          train_samples=100,
-          test_samples=0,
-          input_shape=(10,),
-          num_classes=2)
-      y_train = keras.utils.to_categorical(y_train)
-
-      inputs = keras.layers.Input(x_train.shape[1:])
-      x = keras.layers.Dense(16,
-                             activation='relu',
-                             kernel_regularizer=keras.regularizers.l2(1e-5),
-                             bias_regularizer=keras.regularizers.l2(1e-5),
-                             input_shape=x_train.shape[1:])(inputs)
-      x = keras.layers.BatchNormalization()(x)
-      base_model = keras.models.Model(inputs, x)
-
-      x = keras.layers.Input(x_train.shape[1:])
-      y = base_model(x)
-      y = keras.layers.Dense(y_train.shape[-1], activation='softmax')(y)
-      model = keras.models.Model(x, y)
-      model.compile(loss='categorical_crossentropy',
-                    optimizer=keras.optimizers.Adam(lr=0.1),
-                    metrics=['accuracy'])
-      history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_train, y_train),
-                          verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.7)
-
-  def test_embedding_with_clipnorm(self):
-    with self.cached_session():
-      model = keras.models.Sequential()
-      model.add(keras.layers.Embedding(input_dim=1, output_dim=1))
-      model.compile(optimizer=keras.optimizers.SGD(clipnorm=0.1), loss='mse')
-      model.fit(np.array([[0]]), np.array([[[0.5]]]), epochs=1)
-
-  def test_using_tf_layers_in_keras_sequential_model(self):
-    with self.cached_session():
-      np.random.seed(1337)
-      (x_train, y_train), _ = testing_utils.get_test_data(
-          train_samples=100,
-          test_samples=0,
-          input_shape=(10,),
-          num_classes=2)
-
-      model = keras.models.Sequential()
-      model.add(tf_core_layers.Dense(32, activation=nn.relu, input_shape=(10,)))
-      model.add(tf_core_layers.Dense(2, activation=nn.softmax))
-      model.summary()
-
-      y_train = keras.utils.to_categorical(y_train)
-      model.compile(loss='categorical_crossentropy',
-                    optimizer=keras.optimizers.Adam(lr=0.1),
-                    metrics=['accuracy'])
-      history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_train, y_train),
-                          verbose=0)
-      self.assertGreater(history.history['val_acc'][-1], 0.7)
-
-  def test_using_tf_layers_in_keras_functional_model(self):
-    with self.cached_session():
-      np.random.seed(1337)
-      (x_train, y_train), _ = testing_utils.get_test_data(
-          train_samples=100,
-          test_samples=0,
-          input_shape=(10,),
-          num_classes=2)
-      y_train = keras.utils.to_categorical(y_train)
-
-      inputs = keras.Input(shape=(10,))
-      x = tf_core_layers.Dense(32, activation=nn.relu)(inputs)
-      outputs = tf_core_layers.Dense(2, activation=nn.softmax)(x)
-      model = keras.Model(inputs, outputs)
-      model.summary()
-
-      model.compile(loss='categorical_crossentropy',
-                    optimizer=keras.optimizers.Adam(lr=0.1),
-                    metrics=['accuracy'])
-      history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_train, y_train),
-                          verbose=0)
-      self.assertGreater(history.history['val_acc'][-1], 0.7)
-
+    history = model.fit(x_train, y_train, epochs=10, batch_size=10,
+                        validation_data=(x_train, y_train),
+                        verbose=2)
+    self.assertGreater(history.history['val_acc'][-1], 0.7)
+    _, val_acc = model.evaluate(x_train, y_train)
+    self.assertAlmostEqual(history.history['val_acc'][-1], val_acc)
+    predictions = model.predict(x_train)
+    self.assertEqual(predictions.shape, (x_train.shape[0], 2))
+
+
+# See b/122473407
+@keras_parameterized.run_all_keras_modes(always_skip_v1=True)
+class TimeseriesClassificationIntegrationTest(keras_parameterized.TestCase):
+
+  @keras_parameterized.run_with_all_model_types
+  def test_timeseries_classification(self):
+    np.random.seed(1337)
+    (x_train, y_train), _ = testing_utils.get_test_data(
+        train_samples=100,
+        test_samples=0,
+        input_shape=(4, 10),
+        num_classes=2)
+    y_train = keras.utils.to_categorical(y_train)
+
+    layers = [
+        keras.layers.LSTM(5, return_sequences=True),
+        keras.layers.GRU(y_train.shape[-1], activation='softmax')
+    ]
+    model = testing_utils.get_model_from_layers(
+        layers, input_shape=x_train.shape[1:])
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=keras.optimizer_v2.adam.Adam(0.005),
+        metrics=['acc'],
+        run_eagerly=testing_utils.should_run_eagerly())
+    history = model.fit(x_train, y_train, epochs=15, batch_size=10,
+                        validation_data=(x_train, y_train),
+                        verbose=2)
+    self.assertGreater(history.history['val_acc'][-1], 0.7)
+    _, val_acc = model.evaluate(x_train, y_train)
+    self.assertAlmostEqual(history.history['val_acc'][-1], val_acc)
+    predictions = model.predict(x_train)
+    self.assertEqual(predictions.shape, (x_train.shape[0], 2))
+
+  def test_timeseries_classification_sequential_tf_rnn(self):
+    np.random.seed(1337)
+    (x_train, y_train), _ = testing_utils.get_test_data(
+        train_samples=100,
+        test_samples=0,
+        input_shape=(4, 10),
+        num_classes=2)
+    y_train = keras.utils.to_categorical(y_train)
+
+    model = keras.models.Sequential()
+    model.add(keras.layers.RNN(rnn_cell.LSTMCell(5), return_sequences=True,
+                               input_shape=x_train.shape[1:]))
+    model.add(keras.layers.RNN(rnn_cell.GRUCell(y_train.shape[-1],
+                                                activation='softmax',
+                                                dtype=dtypes.float32)))
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=keras.optimizer_v2.adam.Adam(0.005),
+        metrics=['acc'],
+        run_eagerly=testing_utils.should_run_eagerly())
+    history = model.fit(x_train, y_train, epochs=15, batch_size=10,
+                        validation_data=(x_train, y_train),
+                        verbose=2)
+    self.assertGreater(history.history['val_acc'][-1], 0.7)
+    _, val_acc = model.evaluate(x_train, y_train)
+    self.assertAlmostEqual(history.history['val_acc'][-1], val_acc)
+    predictions = model.predict(x_train)
+    self.assertEqual(predictions.shape, (x_train.shape[0], 2))
+
+
+@keras_parameterized.run_with_all_model_types
+@keras_parameterized.run_all_keras_modes
+class ImageClassificationIntegrationTest(keras_parameterized.TestCase):
+
+  def test_image_classification(self):
+    np.random.seed(1337)
+    (x_train, y_train), _ = testing_utils.get_test_data(
+        train_samples=100,
+        test_samples=0,
+        input_shape=(10, 10, 3),
+        num_classes=2)
+    y_train = keras.utils.to_categorical(y_train)
+
+    layers = [
+        keras.layers.Conv2D(4, 3, padding='same', activation='relu'),
+        keras.layers.Conv2D(8, 3, padding='same'),
+        keras.layers.BatchNormalization(),
+        keras.layers.Conv2D(8, 3, padding='same'),
+        keras.layers.Flatten(),
+        keras.layers.Dense(y_train.shape[-1], activation='softmax')
+    ]
+    model = testing_utils.get_model_from_layers(
+        layers, input_shape=x_train.shape[1:])
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=keras.optimizer_v2.adam.Adam(0.005),
+        metrics=['acc'],
+        run_eagerly=testing_utils.should_run_eagerly())
+    history = model.fit(x_train, y_train, epochs=10, batch_size=10,
+                        validation_data=(x_train, y_train),
+                        verbose=2)
+    self.assertGreater(history.history['val_acc'][-1], 0.7)
+    _, val_acc = model.evaluate(x_train, y_train)
+    self.assertAlmostEqual(history.history['val_acc'][-1], val_acc)
+    predictions = model.predict(x_train)
+    self.assertEqual(predictions.shape, (x_train.shape[0], 2))
+
+
+@keras_parameterized.run_all_keras_modes
+class ActivationV2IntegrationTest(keras_parameterized.TestCase):
+  """Tests activation function V2 in model exporting and loading.
+
+  This test is to verify in TF 2.x, when 'tf.nn.softmax' is used as an
+  activition function, its model exporting and loading work as expected.
+  Check b/123041942 for details.
+  """
+
+  def test_serialization_v2_model(self):
+    np.random.seed(1337)
+    (x_train, y_train), _ = testing_utils.get_test_data(
+        train_samples=100,
+        test_samples=0,
+        input_shape=(10,),
+        num_classes=2)
+    y_train = keras.utils.to_categorical(y_train)
+
+    model = keras.Sequential([
+        keras.layers.Flatten(input_shape=x_train.shape[1:]),
+        keras.layers.Dense(10, activation=nn.relu),
+        # To mimic 'tf.nn.softmax' used in TF 2.x.
+        keras.layers.Dense(y_train.shape[-1], activation=nn.softmax_v2),
+    ])
+
+    # Check if 'softmax' is in model.get_config().
+    last_layer_activation = model.get_layer(index=2).get_config()['activation']
+    self.assertEqual(last_layer_activation, 'softmax')
+
+    model.compile(loss='categorical_crossentropy',
+                  optimizer=keras.optimizer_v2.adam.Adam(0.005),
+                  metrics=['accuracy'],
+                  run_eagerly=testing_utils.should_run_eagerly())
+    model.fit(x_train, y_train, epochs=2, batch_size=10,
+              validation_data=(x_train, y_train),
+              verbose=2)
+
+    output_path = os.path.join(self.get_temp_dir(), 'tf_keras_saved_model')
+    keras.saving.saved_model.export_saved_model(model, output_path)
+    loaded_model = keras.saving.saved_model.load_from_saved_model(output_path)
+    self.assertEqual(model.summary(), loaded_model.summary())
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/keras_parameterized.py b/tensorflow/python/keras/keras_parameterized.py
index d76bbadeb3613a8e71b1a6fc313fb7e68630de93..f505ced038b6a28ced6db0a6b94b849a8091e965 100644
--- a/tensorflow/python/keras/keras_parameterized.py
+++ b/tensorflow/python/keras/keras_parameterized.py
@@ -145,14 +145,34 @@ def run_with_all_model_types(
     @functools.wraps(f)
     def decorated(self, model_type, *args, **kwargs):
       """A run of a single test case w/ the specified model type."""
-      with testing_utils.model_type_scope(model_type):
-        f(self, *args, **kwargs)
-
+      if model_type == 'functional':
+        _test_functional_model_type(f, self, *args, **kwargs)
+      elif model_type == 'subclass':
+        _test_subclass_model_type(f, self, *args, **kwargs)
+      elif model_type == 'sequential':
+        _test_sequential_model_type(f, self, *args, **kwargs)
+      else:
+        raise ValueError('Unknown model type: %s' % (model_type,))
     return decorated
 
   return _test_or_class_decorator(test_or_class, single_method_decorator)
 
 
+def _test_functional_model_type(f, test_or_class, *args, **kwargs):
+  with testing_utils.model_type_scope('functional'):
+    f(test_or_class, *args, **kwargs)
+
+
+def _test_subclass_model_type(f, test_or_class, *args, **kwargs):
+  with testing_utils.model_type_scope('subclass'):
+    f(test_or_class, *args, **kwargs)
+
+
+def _test_sequential_model_type(f, test_or_class, *args, **kwargs):
+  with testing_utils.model_type_scope('sequential'):
+    f(test_or_class, *args, **kwargs)
+
+
 def run_all_keras_modes(
     test_or_class=None,
     config=None,
@@ -233,17 +253,11 @@ def run_all_keras_modes(
     def decorated(self, run_mode, *args, **kwargs):
       """A run of a single test case w/ specified run mode."""
       if run_mode == 'v1_graph':
-        with context.graph_mode(), testing_utils.run_eagerly_scope(False):
-          with self.test_session(use_gpu=True, config=config):
-            f(self, *args, **kwargs)
+        _v1_graph_test(f, self, config, *args, **kwargs)
       elif run_mode == 'v2_function':
-        with context.eager_mode():
-          with testing_utils.run_eagerly_scope(False):
-            f(self, *args, **kwargs)
+        _v2_graph_functions_test(f, self, *args, **kwargs)
       elif run_mode == 'v2_eager':
-        with context.eager_mode():
-          with testing_utils.run_eagerly_scope(True):
-            f(self, *args, **kwargs)
+        _v2_eager_test(f, self, *args, **kwargs)
       else:
         return ValueError('Unknown run mode %s' % run_mode)
 
@@ -252,6 +266,24 @@ def run_all_keras_modes(
   return _test_or_class_decorator(test_or_class, single_method_decorator)
 
 
+def _v1_graph_test(f, test_or_class, config, *args, **kwargs):
+  with context.graph_mode(), testing_utils.run_eagerly_scope(False):
+    with test_or_class.test_session(use_gpu=True, config=config):
+      f(test_or_class, *args, **kwargs)
+
+
+def _v2_graph_functions_test(f, test_or_class, *args, **kwargs):
+  with context.eager_mode():
+    with testing_utils.run_eagerly_scope(False):
+      f(test_or_class, *args, **kwargs)
+
+
+def _v2_eager_test(f, test_or_class, *args, **kwargs):
+  with context.eager_mode():
+    with testing_utils.run_eagerly_scope(True):
+      f(test_or_class, *args, **kwargs)
+
+
 def _test_or_class_decorator(test_or_class, single_method_decorator):
   """Decorate a test or class with a decorator intended for one method.
 
diff --git a/tensorflow/python/keras/layers/__init__.py b/tensorflow/python/keras/layers/__init__.py
index df7571e5d5fc862c29016fc0e12d1d33059405ad..88fbaca3eacfc074bc567fa066e59d8f010c7ea2 100644
--- a/tensorflow/python/keras/layers/__init__.py
+++ b/tensorflow/python/keras/layers/__init__.py
@@ -110,6 +110,10 @@ from tensorflow.python.keras.layers.noise import GaussianDropout
 
 # Normalization layers.
 from tensorflow.python.keras.layers.normalization import BatchNormalization
+from tensorflow.python.keras.layers.normalization import LayerNormalization
+
+# Kernelized layers.
+from tensorflow.python.keras.layers.kernelized import RandomFourierFeatures
 
 # Pooling layers.
 from tensorflow.python.keras.layers.pooling import MaxPooling1D
@@ -149,6 +153,7 @@ from tensorflow.python.keras.layers.recurrent import PeepholeLSTMCell
 from tensorflow.python.keras.layers.recurrent import SimpleRNN
 from tensorflow.python.keras.layers.recurrent import GRU
 from tensorflow.python.keras.layers.recurrent import LSTM
+from tensorflow.python.keras.layers.recurrent import UnifiedGRU
 from tensorflow.python.keras.layers.recurrent import UnifiedLSTM
 
 # Convolutional-recurrent layers.
diff --git a/tensorflow/python/keras/layers/advanced_activations.py b/tensorflow/python/keras/layers/advanced_activations.py
index 35ac7830b2e2f37ffc270227d44450d730a9149c..5095287430735b4d370b0545c3971da14a4c0b6d 100644
--- a/tensorflow/python/keras/layers/advanced_activations.py
+++ b/tensorflow/python/keras/layers/advanced_activations.py
@@ -26,10 +26,10 @@ from tensorflow.python.keras.engine.base_layer import Layer
 from tensorflow.python.keras.engine.input_spec import InputSpec
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import math_ops
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 
-@tf_export('keras.layers.LeakyReLU')
+@keras_export('keras.layers.LeakyReLU')
 class LeakyReLU(Layer):
   """Leaky version of a Rectified Linear Unit.
 
@@ -68,7 +68,7 @@ class LeakyReLU(Layer):
     return input_shape
 
 
-@tf_export('keras.layers.PReLU')
+@keras_export('keras.layers.PReLU')
 class PReLU(Layer):
   """Parametric Rectified Linear Unit.
 
@@ -121,11 +121,9 @@ class PReLU(Layer):
   @tf_utils.shape_type_conversion
   def build(self, input_shape):
     param_shape = list(input_shape[1:])
-    self.param_broadcast = [False] * len(param_shape)
     if self.shared_axes is not None:
       for i in self.shared_axes:
         param_shape[i - 1] = 1
-        self.param_broadcast[i - 1] = True
     self.alpha = self.add_weight(
         shape=param_shape,
         name='alpha',
@@ -143,12 +141,7 @@ class PReLU(Layer):
 
   def call(self, inputs, mask=None):
     pos = K.relu(inputs)
-    if K.backend() == 'theano':
-      neg = (
-          K.pattern_broadcast(self.alpha, self.param_broadcast) *
-          (inputs - math_ops.abs(inputs)) * 0.5)
-    else:
-      neg = -self.alpha * K.relu(-inputs)
+    neg = -self.alpha * K.relu(-inputs)
     return pos + neg
 
   def get_config(self):
@@ -166,7 +159,7 @@ class PReLU(Layer):
     return input_shape
 
 
-@tf_export('keras.layers.ELU')
+@keras_export('keras.layers.ELU')
 class ELU(Layer):
   """Exponential Linear Unit.
 
@@ -205,7 +198,7 @@ class ELU(Layer):
     return input_shape
 
 
-@tf_export('keras.layers.ThresholdedReLU')
+@keras_export('keras.layers.ThresholdedReLU')
 class ThresholdedReLU(Layer):
   """Thresholded Rectified Linear Unit.
 
@@ -245,7 +238,7 @@ class ThresholdedReLU(Layer):
     return input_shape
 
 
-@tf_export('keras.layers.Softmax')
+@keras_export('keras.layers.Softmax')
 class Softmax(Layer):
   """Softmax activation function.
 
@@ -279,7 +272,7 @@ class Softmax(Layer):
     return input_shape
 
 
-@tf_export('keras.layers.ReLU')
+@keras_export('keras.layers.ReLU')
 class ReLU(Layer):
   """Rectified Linear Unit activation function.
 
diff --git a/tensorflow/python/keras/layers/advanced_activations_test.py b/tensorflow/python/keras/layers/advanced_activations_test.py
index f32bb457c825d9769c6dccf625d9318c07843237..f04185417effae2b705a610edddd97a2ccf2ad74 100644
--- a/tensorflow/python/keras/layers/advanced_activations_test.py
+++ b/tensorflow/python/keras/layers/advanced_activations_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.python import keras
 from tensorflow.python.eager import context
 from tensorflow.python.keras import keras_parameterized
@@ -88,6 +90,13 @@ class AdvancedActivationsTest(keras_parameterized.TestCase):
             kwargs={'negative_slope': -2},
             input_shape=(2, 3, 4))
 
+  @keras_parameterized.run_with_all_model_types
+  def test_layer_as_activation(self):
+    layer = keras.layers.Dense(1, activation=keras.layers.ReLU())
+    model = testing_utils.get_model_from_layers([layer], input_shape=(10,))
+    model.compile('sgd', 'mse', run_eagerly=testing_utils.should_run_eagerly())
+    model.fit(np.ones((10, 10)), np.ones((10, 1)), batch_size=2)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/layers/convolutional.py b/tensorflow/python/keras/layers/convolutional.py
index 6564d6e8fdba6d6f8b384b06125032d16f34e28a..8d80eb85472416dddc16a2ad9db052faee72f2e0 100644
--- a/tensorflow/python/keras/layers/convolutional.py
+++ b/tensorflow/python/keras/layers/convolutional.py
@@ -42,7 +42,7 @@ from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import nn_ops
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 
 class Conv(Layer):
@@ -180,12 +180,14 @@ class Conv(Layer):
       op_padding = 'valid'
     else:
       op_padding = self.padding
+    if not isinstance(op_padding, (list, tuple)):
+      op_padding = op_padding.upper()
     self._convolution_op = nn_ops.Convolution(
         input_shape,
         filter_shape=self.kernel.get_shape(),
         dilation_rate=self.dilation_rate,
         strides=self.strides,
-        padding=op_padding.upper(),
+        padding=op_padding,
         data_format=conv_utils.convert_data_format(self.data_format,
                                                    self.rank + 2))
     self.built = True
@@ -199,21 +201,8 @@ class Conv(Layer):
           # nn.bias_add does not accept a 1D input tensor.
           bias = array_ops.reshape(self.bias, (1, self.filters, 1))
           outputs += bias
-        if self.rank == 2:
+        else:
           outputs = nn.bias_add(outputs, self.bias, data_format='NCHW')
-        if self.rank == 3:
-          # As of Mar 2017, direct addition is significantly slower than
-          # bias_add when computing gradients. To use bias_add, we collapse Z
-          # and Y into a single dimension to obtain a 4D input tensor.
-          outputs_shape = outputs.shape.as_list()
-          if outputs_shape[0] is None:
-            outputs_shape[0] = -1
-          outputs_4d = array_ops.reshape(outputs,
-                                         [outputs_shape[0], outputs_shape[1],
-                                          outputs_shape[2] * outputs_shape[3],
-                                          outputs_shape[4]])
-          outputs_4d = nn.bias_add(outputs_4d, self.bias, data_format='NCHW')
-          outputs = array_ops.reshape(outputs_4d, outputs_shape)
       else:
         outputs = nn.bias_add(outputs, self.bias, data_format='NHWC')
 
@@ -282,7 +271,7 @@ class Conv(Layer):
     return causal_padding
 
 
-@tf_export('keras.layers.Conv1D', 'keras.layers.Convolution1D')
+@keras_export('keras.layers.Conv1D', 'keras.layers.Convolution1D')
 class Conv1D(Conv):
   """1D convolution layer (e.g. temporal convolution).
 
@@ -384,7 +373,7 @@ class Conv1D(Conv):
     return super(Conv1D, self).call(inputs)
 
 
-@tf_export('keras.layers.Conv2D', 'keras.layers.Convolution2D')
+@keras_export('keras.layers.Conv2D', 'keras.layers.Convolution2D')
 class Conv2D(Conv):
   """2D convolution layer (e.g. spatial convolution over images).
 
@@ -495,7 +484,7 @@ class Conv2D(Conv):
         **kwargs)
 
 
-@tf_export('keras.layers.Conv3D', 'keras.layers.Convolution3D')
+@keras_export('keras.layers.Conv3D', 'keras.layers.Convolution3D')
 class Conv3D(Conv):
   """3D convolution layer (e.g. spatial convolution over volumes).
 
@@ -613,8 +602,8 @@ class Conv3D(Conv):
         **kwargs)
 
 
-@tf_export('keras.layers.Conv2DTranspose',
-           'keras.layers.Convolution2DTranspose')
+@keras_export('keras.layers.Conv2DTranspose',
+              'keras.layers.Convolution2DTranspose')
 class Conv2DTranspose(Conv2D):
   """Transposed convolution layer (sometimes called Deconvolution).
 
@@ -700,7 +689,7 @@ class Conv2DTranspose(Conv2D):
       - [A guide to convolution arithmetic for deep
         learning](https://arxiv.org/abs/1603.07285v1)
       - [Deconvolutional
-        Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf)
+        Networks](https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf)
   """
 
   def __init__(self,
@@ -885,8 +874,8 @@ class Conv2DTranspose(Conv2D):
     return config
 
 
-@tf_export('keras.layers.Conv3DTranspose',
-           'keras.layers.Convolution3DTranspose')
+@keras_export('keras.layers.Conv3DTranspose',
+              'keras.layers.Convolution3DTranspose')
 class Conv3DTranspose(Conv3D):
   """Transposed convolution layer (sometimes called Deconvolution).
 
@@ -983,7 +972,7 @@ class Conv3DTranspose(Conv3D):
       - [A guide to convolution arithmetic for deep
         learning](https://arxiv.org/abs/1603.07285v1)
       - [Deconvolutional
-        Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf)
+        Networks](https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf)
   """
 
   def __init__(self,
@@ -1127,24 +1116,10 @@ class Conv3DTranspose(Conv3D):
       outputs.set_shape(out_shape)
 
     if self.use_bias:
-      outputs_shape = outputs.shape.as_list()
-      if outputs_shape[0] is None:
-        outputs_shape[0] = -1
-      if self.data_format == 'channels_first':
-        outputs_4d = array_ops.reshape(outputs, [
-            outputs_shape[0], outputs_shape[1],
-            outputs_shape[2] * outputs_shape[3], outputs_shape[4]
-        ])
-      else:
-        outputs_4d = array_ops.reshape(outputs, [
-            outputs_shape[0], outputs_shape[1] * outputs_shape[2],
-            outputs_shape[3], outputs_shape[4]
-        ])
-      outputs_4d = nn.bias_add(
-          outputs_4d,
+      outputs = nn.bias_add(
+          outputs,
           self.bias,
           data_format=conv_utils.convert_data_format(self.data_format, ndim=4))
-      outputs = array_ops.reshape(outputs_4d, outputs_shape)
 
     if self.activation is not None:
       return self.activation(outputs)
@@ -1401,8 +1376,8 @@ class SeparableConv(Conv):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.layers.SeparableConv1D',
-           'keras.layers.SeparableConvolution1D')
+@keras_export('keras.layers.SeparableConv1D',
+              'keras.layers.SeparableConvolution1D')
 class SeparableConv1D(SeparableConv):
   """Depthwise separable 1D convolution.
 
@@ -1549,8 +1524,8 @@ class SeparableConv1D(SeparableConv):
     return outputs
 
 
-@tf_export('keras.layers.SeparableConv2D',
-           'keras.layers.SeparableConvolution2D')
+@keras_export('keras.layers.SeparableConv2D',
+              'keras.layers.SeparableConvolution2D')
 class SeparableConv2D(SeparableConv):
   """Depthwise separable 2D convolution.
 
@@ -1701,7 +1676,7 @@ class SeparableConv2D(SeparableConv):
     return outputs
 
 
-@tf_export('keras.layers.DepthwiseConv2D')
+@keras_export('keras.layers.DepthwiseConv2D')
 class DepthwiseConv2D(Conv2D):
   """Depthwise separable 2D convolution.
 
@@ -1895,7 +1870,7 @@ class DepthwiseConv2D(Conv2D):
     return config
 
 
-@tf_export('keras.layers.UpSampling1D')
+@keras_export('keras.layers.UpSampling1D')
 class UpSampling1D(Layer):
   """Upsampling layer for 1D inputs.
 
@@ -1931,7 +1906,7 @@ class UpSampling1D(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.layers.UpSampling2D')
+@keras_export('keras.layers.UpSampling2D')
 class UpSampling2D(Layer):
   """Upsampling layer for 2D inputs.
 
@@ -2010,7 +1985,7 @@ class UpSampling2D(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.layers.UpSampling3D')
+@keras_export('keras.layers.UpSampling3D')
 class UpSampling3D(Layer):
   """Upsampling layer for 3D inputs.
 
@@ -2083,7 +2058,7 @@ class UpSampling3D(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.layers.ZeroPadding1D')
+@keras_export('keras.layers.ZeroPadding1D')
 class ZeroPadding1D(Layer):
   """Zero-padding layer for 1D input (e.g. temporal sequence).
 
@@ -2124,7 +2099,7 @@ class ZeroPadding1D(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.layers.ZeroPadding2D')
+@keras_export('keras.layers.ZeroPadding2D')
 class ZeroPadding2D(Layer):
   """Zero-padding layer for 2D input (e.g. picture).
 
@@ -2226,7 +2201,7 @@ class ZeroPadding2D(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.layers.ZeroPadding3D')
+@keras_export('keras.layers.ZeroPadding3D')
 class ZeroPadding3D(Layer):
   """Zero-padding layer for 3D data (spatial or spatio-temporal).
 
@@ -2344,7 +2319,7 @@ class ZeroPadding3D(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.layers.Cropping1D')
+@keras_export('keras.layers.Cropping1D')
 class Cropping1D(Layer):
   """Cropping layer for 1D input (e.g. temporal sequence).
 
@@ -2389,7 +2364,7 @@ class Cropping1D(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.layers.Cropping2D')
+@keras_export('keras.layers.Cropping2D')
 class Cropping2D(Layer):
   """Cropping layer for 2D input (e.g. picture).
 
@@ -2521,7 +2496,7 @@ class Cropping2D(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.layers.Cropping3D')
+@keras_export('keras.layers.Cropping3D')
 class Cropping3D(Layer):
   """Cropping layer for 3D data (e.g.
 
diff --git a/tensorflow/python/keras/layers/convolutional_recurrent.py b/tensorflow/python/keras/layers/convolutional_recurrent.py
index cf3861da21858d0ef0ab4e7567795edbf41635b8..c0479e71a24dc4b8c7ed1e660f18d610784448e1 100644
--- a/tensorflow/python/keras/layers/convolutional_recurrent.py
+++ b/tensorflow/python/keras/layers/convolutional_recurrent.py
@@ -34,7 +34,7 @@ from tensorflow.python.keras.layers.recurrent import RNN
 from tensorflow.python.keras.utils import conv_utils
 from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.keras.utils import tf_utils
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 
 class ConvRNN2D(RNN):
@@ -770,7 +770,7 @@ class ConvLSTM2DCell(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.layers.ConvLSTM2D')
+@keras_export('keras.layers.ConvLSTM2D')
 class ConvLSTM2D(ConvRNN2D):
   """Convolutional LSTM.
 
diff --git a/tensorflow/python/keras/layers/convolutional_test.py b/tensorflow/python/keras/layers/convolutional_test.py
index d3339a8413095cae2b74e19d768fcda0e1b4e4fb..24b61feec27e1af5791fdfb1228bf01aca91a126 100644
--- a/tensorflow/python/keras/layers/convolutional_test.py
+++ b/tensorflow/python/keras/layers/convolutional_test.py
@@ -18,51 +18,44 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import copy
-
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.eager import context
-from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 
 
-@tf_test_util.run_all_in_graph_and_eager_modes
-class Convolution1DTest(test.TestCase):
+@keras_parameterized.run_all_keras_modes
+class Conv1DTest(keras_parameterized.TestCase):
 
-  def _run_test(self, kwargs, arg, values):
+  def _run_test(self, kwargs):
     num_samples = 2
     stack_size = 3
     length = 7
 
-    test_kwargs = copy.copy(kwargs)
-    for value in values:
-      test_kwargs[arg] = value
-      with self.cached_session(use_gpu=True):
-        testing_utils.layer_test(
-            keras.layers.Conv1D,
-            kwargs=test_kwargs,
-            input_shape=(num_samples, length, stack_size))
-
-  def test_conv1d(self):
-    kwargs = {
-        'filters': 2,
-        'kernel_size': 3,
-    }
-
-    self._run_test(kwargs, 'padding', ['valid', 'same', 'causal'])
-    self._run_test(kwargs, 'strides', [2])
-    self._run_test(kwargs, 'dilation_rate', [2])
-
-    kwargs = {
-        'filters': 2,
-        'kernel_size': 3,
-        'padding': 'same',
-    }
-    self._run_test(kwargs, 'dilation_rate', [2])
-    self._run_test(kwargs, 'dilation_rate', [3])
+    with self.cached_session(use_gpu=True):
+      testing_utils.layer_test(
+          keras.layers.Conv1D,
+          kwargs=kwargs,
+          input_shape=(num_samples, length, stack_size))
+
+  @parameterized.named_parameters(
+      ('padding_valid', {'padding': 'valid'}),
+      ('padding_same', {'padding': 'same'}),
+      ('padding_same_dilation_2', {'padding': 'same', 'dilation_rate': 2}),
+      ('padding_same_dilation_3', {'padding': 'same', 'dilation_rate': 3}),
+      ('padding_causal', {'padding': 'causal'}),
+      ('strides', {'strides': 2}),
+      ('dilation_rate', {'dilation_rate': 2}),
+  )
+  def test_conv1d(self, kwargs):
+    kwargs['filters'] = 2
+    kwargs['kernel_size'] = 3
+    self._run_test(kwargs)
 
   def test_conv1d_regularizers(self):
     kwargs = {
@@ -74,7 +67,7 @@ class Convolution1DTest(test.TestCase):
         'activity_regularizer': 'l2',
         'strides': 1
     }
-    with self.session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       layer = keras.layers.Conv1D(**kwargs)
       layer.build((None, 5, 2))
       self.assertEqual(len(layer.losses), 2)
@@ -93,51 +86,43 @@ class Convolution1DTest(test.TestCase):
         'bias_constraint': b_constraint,
         'strides': 1
     }
-    with self.session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       layer = keras.layers.Conv1D(**kwargs)
       layer.build((None, 5, 2))
       self.assertEqual(layer.kernel.constraint, k_constraint)
       self.assertEqual(layer.bias.constraint, b_constraint)
 
 
-@tf_test_util.run_all_in_graph_and_eager_modes
-class Conv2DTest(test.TestCase):
+@keras_parameterized.run_all_keras_modes
+class Conv2DTest(keras_parameterized.TestCase):
 
-  def _run_test(self, kwargs, arg, values):
+  def _run_test(self, kwargs):
     num_samples = 2
     stack_size = 3
     num_row = 7
     num_col = 6
 
-    test_kwargs = copy.copy(kwargs)
-    for value in values:
-      test_kwargs[arg] = value
-      with self.cached_session(use_gpu=True):
-        testing_utils.layer_test(
-            keras.layers.Conv2D,
-            kwargs=test_kwargs,
-            input_shape=(num_samples, num_row, num_col, stack_size))
-
-  def test_conv2d(self):
-    kwargs = {
-        'filters': 2,
-        'kernel_size': (3, 3),
-    }
-
-    self._run_test(kwargs, 'padding', ['valid', 'same'])
-    self._run_test(kwargs, 'strides', [(2, 2)])
-    if test.is_gpu_available(cuda_only=True):
+    with self.cached_session(use_gpu=True):
+      testing_utils.layer_test(
+          keras.layers.Conv2D,
+          kwargs=kwargs,
+          input_shape=(num_samples, num_row, num_col, stack_size))
+
+  @parameterized.named_parameters(
+      ('padding_valid', {'padding': 'valid'}),
+      ('padding_same', {'padding': 'same'}),
+      ('padding_same_dilation_2', {'padding': 'same', 'dilation_rate': 2}),
+      ('strides', {'strides': (2, 2)}),
+      ('dilation_rate', {'dilation_rate': (2, 2)}),
       # Only runs on GPU with CUDA, channels_first is not supported on CPU.
       # TODO(b/62340061): Support channels_first on CPU.
-      self._run_test(kwargs, 'data_format', ['channels_first'])
-    self._run_test(kwargs, 'dilation_rate', [(2, 2)])
-
-    kwargs = {
-        'filters': 2,
-        'kernel_size': 3,
-        'padding': 'same',
-    }
-    self._run_test(kwargs, 'dilation_rate', [2])
+      ('data_format', {'data_format': 'channels_first'}),
+  )
+  def test_conv2d(self, kwargs):
+    kwargs['filters'] = 2
+    kwargs['kernel_size'] = (3, 3)
+    if 'data_format' not in kwargs or test.is_gpu_available(cuda_only=True):
+      self._run_test(kwargs)
 
   def test_conv2d_regularizers(self):
     kwargs = {
@@ -149,7 +134,7 @@ class Conv2DTest(test.TestCase):
         'activity_regularizer': 'l2',
         'strides': 1
     }
-    with self.session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       layer = keras.layers.Conv2D(**kwargs)
       layer.build((None, 5, 5, 2))
       self.assertEqual(len(layer.losses), 2)
@@ -168,357 +153,43 @@ class Conv2DTest(test.TestCase):
         'bias_constraint': b_constraint,
         'strides': 1
     }
-    with self.session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       layer = keras.layers.Conv2D(**kwargs)
       layer.build((None, 5, 5, 2))
       self.assertEqual(layer.kernel.constraint, k_constraint)
       self.assertEqual(layer.bias.constraint, b_constraint)
 
 
-@tf_test_util.run_all_in_graph_and_eager_modes
-class Conv2DTransposeTest(test.TestCase):
-
-  def _run_test(self, kwargs, arg, values):
-    num_samples = 2
-    stack_size = 3
-    num_row = 7
-    num_col = 6
-
-    test_kwargs = copy.copy(kwargs)
-    for value in values:
-      test_kwargs[arg] = value
-      with self.cached_session(use_gpu=True):
-        testing_utils.layer_test(
-            keras.layers.Conv2DTranspose,
-            kwargs=test_kwargs,
-            input_shape=(num_samples, num_row, num_col, stack_size))
-
-  def test_conv2dtranspose(self):
-    kwargs = {
-        'filters': 2,
-        'kernel_size': (3, 3),
-    }
-
-    self._run_test(kwargs, 'padding', ['valid', 'same'])
-    self._run_test(kwargs, 'strides', [(2, 2)])
-    if test.is_gpu_available(cuda_only=True):
-      self._run_test(kwargs, 'data_format', ['channels_first'])
-
-    kwargs['strides'] = (2, 2)
-    self._run_test(kwargs, 'output_padding', [(1, 1)])
-
-  def test_conv2dtranspose_regularizers(self):
-    kwargs = {
-        'filters': 3,
-        'kernel_size': 3,
-        'padding': 'valid',
-        'kernel_regularizer': 'l2',
-        'bias_regularizer': 'l2',
-        'activity_regularizer': 'l2',
-        'strides': 1
-    }
-    with self.session(use_gpu=True):
-      layer = keras.layers.Conv2DTranspose(**kwargs)
-      layer.build((None, 5, 5, 2))
-      self.assertEqual(len(layer.losses), 2)
-      layer(keras.backend.variable(np.ones((1, 5, 5, 2))))
-      self.assertEqual(len(layer.losses), 3)
-
-  def test_conv2dtranspose_constraints(self):
-    k_constraint = lambda x: x
-    b_constraint = lambda x: x
-
-    kwargs = {
-        'filters': 3,
-        'kernel_size': 3,
-        'padding': 'valid',
-        'kernel_constraint': k_constraint,
-        'bias_constraint': b_constraint,
-        'strides': 1
-    }
-    with self.session(use_gpu=True):
-      layer = keras.layers.Conv2DTranspose(**kwargs)
-      layer.build((None, 5, 5, 2))
-      self.assertEqual(layer.kernel.constraint, k_constraint)
-      self.assertEqual(layer.bias.constraint, b_constraint)
-
-  def test_conv2d_transpose_dilation(self):
-    testing_utils.layer_test(keras.layers.Conv2DTranspose,
-                             kwargs={'filters': 2,
-                                     'kernel_size': 3,
-                                     'padding': 'same',
-                                     'data_format': 'channels_last',
-                                     'dilation_rate': (2, 2)},
-                             input_shape=(2, 5, 6, 3))
-
-    input_data = np.arange(48).reshape((1, 4, 4, 3)).astype(np.float32)
-    expected_output = np.float32([[192, 228, 192, 228],
-                                  [336, 372, 336, 372],
-                                  [192, 228, 192, 228],
-                                  [336, 372, 336, 372]]).reshape((1, 4, 4, 1))
-    testing_utils.layer_test(keras.layers.Conv2DTranspose,
-                             input_data=input_data,
-                             kwargs={'filters': 1,
-                                     'kernel_size': 3,
-                                     'padding': 'same',
-                                     'data_format': 'channels_last',
-                                     'dilation_rate': (2, 2),
-                                     'kernel_initializer': 'ones'},
-                             expected_output=expected_output)
-
-
-@tf_test_util.run_all_in_graph_and_eager_modes
-class Conv3DTransposeTest(test.TestCase):
-
-  def _run_test(self, kwargs, arg, values):
-    num_samples = 2
-    stack_size = 3
-    num_row = 7
-    num_col = 6
-    depth = 5
-
-    test_kwargs = copy.copy(kwargs)
-    for value in values:
-      test_kwargs[arg] = value
-      with self.cached_session(use_gpu=True):
-        testing_utils.layer_test(
-            keras.layers.Conv3DTranspose,
-            kwargs=test_kwargs,
-            input_shape=(num_samples, depth, num_row, num_col, stack_size))
-
-  def test_conv3dtranspose(self):
-    kwargs = {
-        'filters': 2,
-        'kernel_size': (3, 3, 3),
-    }
-
-    self._run_test(kwargs, 'padding', ['valid', 'same'])
-    self._run_test(kwargs, 'strides', [(2, 2, 2)])
-    if test.is_gpu_available(cuda_only=True):
-      self._run_test(kwargs, 'data_format', ['channels_first'])
-
-    kwargs['strides'] = (2, 2, 2)
-    self._run_test(kwargs, 'output_padding', [(1, 1, 1)])
-
-  def test_conv3dtranspose_regularizers(self):
-    kwargs = {
-        'filters': 3,
-        'kernel_size': 3,
-        'padding': 'valid',
-        'kernel_regularizer': 'l2',
-        'bias_regularizer': 'l2',
-        'activity_regularizer': 'l2',
-        'strides': 1
-    }
-    with self.session(use_gpu=True):
-      layer = keras.layers.Conv3DTranspose(**kwargs)
-      layer.build((None, 5, 5, 5, 2))
-      self.assertEqual(len(layer.losses), 2)
-      layer(keras.backend.variable(np.ones((1, 5, 5, 5, 2))))
-      self.assertEqual(len(layer.losses), 3)
-
-  def test_conv3dtranspose_constraints(self):
-    k_constraint = lambda x: x
-    b_constraint = lambda x: x
-
-    kwargs = {
-        'filters': 3,
-        'kernel_size': 3,
-        'padding': 'valid',
-        'kernel_constraint': k_constraint,
-        'bias_constraint': b_constraint,
-        'strides': 1
-    }
-    with self.session(use_gpu=True):
-      layer = keras.layers.Conv3DTranspose(**kwargs)
-      layer.build((None, 5, 5, 5, 2))
-      self.assertEqual(layer.kernel.constraint, k_constraint)
-      self.assertEqual(layer.bias.constraint, b_constraint)
-
-
-@tf_test_util.run_all_in_graph_and_eager_modes
-class SeparableConv1DTest(test.TestCase):
-
-  def _run_test(self, kwargs, arg, values):
-    num_samples = 2
-    stack_size = 3
-    length = 7
-
-    test_kwargs = copy.copy(kwargs)
-    for value in values:
-      test_kwargs[arg] = value
-      with self.cached_session(use_gpu=True):
-        testing_utils.layer_test(
-            keras.layers.SeparableConv1D,
-            kwargs=test_kwargs,
-            input_shape=(num_samples, length, stack_size))
-
-  def test_separable_conv1d(self):
-    kwargs = {
-        'filters': 2,
-        'kernel_size': 3,
-    }
-
-    self._run_test(kwargs, 'padding', ['valid', 'same', 'causal'])
-    self._run_test(kwargs, 'strides', [2])
-    self._run_test(kwargs, 'dilation_rate', [2])
-    self._run_test(kwargs, 'depth_multiplier', [2])
-
-    kwargs = {
-        'filters': 2,
-        'kernel_size': 3,
-        'padding': 'same',
-    }
-    self._run_test(kwargs, 'dilation_rate', [2])
-
-  def test_separable_conv1d_regularizers(self):
-    kwargs = {
-        'filters': 3,
-        'kernel_size': 3,
-        'padding': 'valid',
-        'depthwise_regularizer': 'l2',
-        'pointwise_regularizer': 'l2',
-        'bias_regularizer': 'l2',
-        'activity_regularizer': 'l2',
-        'strides': 1
-    }
-    with self.session(use_gpu=True):
-      layer = keras.layers.SeparableConv1D(**kwargs)
-      layer.build((None, 5, 2))
-      self.assertEqual(len(layer.losses), 3)
-      layer(keras.backend.variable(np.ones((1, 5, 2))))
-      self.assertEqual(len(layer.losses), 4)
-
-  def test_separable_conv1d_constraints(self):
-    d_constraint = lambda x: x
-    p_constraint = lambda x: x
-    b_constraint = lambda x: x
-
-    kwargs = {
-        'filters': 3,
-        'kernel_size': 3,
-        'padding': 'valid',
-        'pointwise_constraint': p_constraint,
-        'depthwise_constraint': d_constraint,
-        'bias_constraint': b_constraint,
-        'strides': 1
-    }
-    with self.session(use_gpu=True):
-      layer = keras.layers.SeparableConv1D(**kwargs)
-      layer.build((None, 5, 2))
-      self.assertEqual(layer.depthwise_kernel.constraint, d_constraint)
-      self.assertEqual(layer.pointwise_kernel.constraint, p_constraint)
-      self.assertEqual(layer.bias.constraint, b_constraint)
-
-
-@tf_test_util.run_all_in_graph_and_eager_modes
-class SeparableConv2DTest(test.TestCase):
-
-  def _run_test(self, kwargs, arg, values):
-    num_samples = 2
-    stack_size = 3
-    num_row = 7
-    num_col = 6
-
-    test_kwargs = copy.copy(kwargs)
-    for value in values:
-      test_kwargs[arg] = value
-      with self.cached_session(use_gpu=True):
-        testing_utils.layer_test(
-            keras.layers.SeparableConv2D,
-            kwargs=test_kwargs,
-            input_shape=(num_samples, num_row, num_col, stack_size))
-
-  def test_separable_conv2d(self):
-    kwargs = {
-        'filters': 2,
-        'kernel_size': 3,
-    }
-
-    self._run_test(kwargs, 'padding', ['valid', 'same'])
-    self._run_test(kwargs, 'strides', [2])
-    if test.is_gpu_available(cuda_only=True):
-      self._run_test(kwargs, 'data_format', ['channels_first'])
-    self._run_test(kwargs, 'dilation_rate', [2])
-    self._run_test(kwargs, 'depth_multiplier', [2])
-
-    kwargs = {
-        'filters': 2,
-        'kernel_size': 3,
-        'padding': 'same',
-    }
-    self._run_test(kwargs, 'dilation_rate', [2])
-
-  def test_separable_conv2d_regularizers(self):
-    kwargs = {
-        'filters': 3,
-        'kernel_size': 3,
-        'padding': 'valid',
-        'depthwise_regularizer': 'l2',
-        'pointwise_regularizer': 'l2',
-        'bias_regularizer': 'l2',
-        'activity_regularizer': 'l2',
-        'strides': 1
-    }
-    with self.session(use_gpu=True):
-      layer = keras.layers.SeparableConv2D(**kwargs)
-      layer.build((None, 5, 5, 2))
-      self.assertEqual(len(layer.losses), 3)
-      layer(keras.backend.variable(np.ones((1, 5, 5, 2))))
-      self.assertEqual(len(layer.losses), 4)
-
-  def test_separable_conv2d_constraints(self):
-    d_constraint = lambda x: x
-    p_constraint = lambda x: x
-    b_constraint = lambda x: x
-
-    kwargs = {
-        'filters': 3,
-        'kernel_size': 3,
-        'padding': 'valid',
-        'pointwise_constraint': p_constraint,
-        'depthwise_constraint': d_constraint,
-        'bias_constraint': b_constraint,
-        'strides': 1
-    }
-    with self.session(use_gpu=True):
-      layer = keras.layers.SeparableConv2D(**kwargs)
-      layer.build((None, 5, 5, 2))
-      self.assertEqual(layer.depthwise_kernel.constraint, d_constraint)
-      self.assertEqual(layer.pointwise_kernel.constraint, p_constraint)
-      self.assertEqual(layer.bias.constraint, b_constraint)
-
-
-@tf_test_util.run_all_in_graph_and_eager_modes
-class Conv3DTest(test.TestCase):
+@keras_parameterized.run_all_keras_modes
+class Conv3DTest(keras_parameterized.TestCase):
 
-  def _run_test(self, kwargs, arg, values):
+  def _run_test(self, kwargs):
     num_samples = 2
     stack_size = 3
     num_row = 7
     num_col = 6
     depth = 5
 
-    test_kwargs = copy.copy(kwargs)
-    for value in values:
-      test_kwargs[arg] = value
-      with self.cached_session(use_gpu=True):
-        testing_utils.layer_test(
-            keras.layers.Conv3D,
-            kwargs=test_kwargs,
-            input_shape=(num_samples, depth, num_row, num_col, stack_size))
-
-  def test_conv3d(self):
-    kwargs = {
-        'filters': 2,
-        'kernel_size': (3, 3, 3),
-    }
-
-    self._run_test(kwargs, 'padding', ['valid', 'same'])
-    self._run_test(kwargs, 'strides', [(2, 2, 2)])
-    self._run_test(kwargs, 'dilation_rate', [(2, 2, 2)])
-    if test.is_gpu_available(cuda_only=True):
-      self._run_test(kwargs, 'data_format', ['channels_first'])
+    with self.cached_session(use_gpu=True):
+      testing_utils.layer_test(
+          keras.layers.Conv3D,
+          kwargs=kwargs,
+          input_shape=(num_samples, depth, num_row, num_col, stack_size))
+
+  @parameterized.named_parameters(
+      ('padding_valid', {'padding': 'valid'}),
+      ('padding_same', {'padding': 'same'}),
+      ('strides', {'strides': (2, 2, 2)}),
+      ('dilation_rate', {'dilation_rate': (2, 2, 2)}),
+      # Only runs on GPU with CUDA, channels_first is not supported on CPU.
+      # TODO(b/62340061): Support channels_first on CPU.
+      ('data_format', {'data_format': 'channels_first'}),
+  )
+  def test_conv3d(self, kwargs):
+    kwargs['filters'] = 2
+    kwargs['kernel_size'] = (3, 3, 3)
+    if 'data_format' not in kwargs or test.is_gpu_available(cuda_only=True):
+      self._run_test(kwargs)
 
   def test_conv3d_regularizers(self):
     kwargs = {
@@ -530,7 +201,7 @@ class Conv3DTest(test.TestCase):
         'activity_regularizer': 'l2',
         'strides': 1
     }
-    with self.session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       layer = keras.layers.Conv3D(**kwargs)
       layer.build((None, 5, 5, 5, 2))
       self.assertEqual(len(layer.losses), 2)
@@ -550,15 +221,39 @@ class Conv3DTest(test.TestCase):
         'bias_constraint': b_constraint,
         'strides': 1
     }
-    with self.session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       layer = keras.layers.Conv3D(**kwargs)
       layer.build((None, 5, 5, 5, 2))
       self.assertEqual(layer.kernel.constraint, k_constraint)
       self.assertEqual(layer.bias.constraint, b_constraint)
 
+  def test_conv3d_dynamic_shape(self):
+    input_data = np.random.random((1, 3, 3, 3, 3)).astype(np.float32)
+    with self.cached_session(use_gpu=True):
+      # Won't raise error here.
+      testing_utils.layer_test(
+          keras.layers.Conv3D,
+          kwargs={
+              'data_format': 'channels_last',
+              'filters': 3,
+              'kernel_size': 3
+          },
+          input_shape=(None, None, None, None, 3),
+          input_data=input_data)
+      if test.is_gpu_available(cuda_only=True):
+        testing_utils.layer_test(
+            keras.layers.Conv3D,
+            kwargs={
+                'data_format': 'channels_first',
+                'filters': 3,
+                'kernel_size': 3
+            },
+            input_shape=(None, 3, None, None, None),
+            input_data=input_data)
+
 
-@tf_test_util.run_all_in_graph_and_eager_modes
-class ZeroPaddingTest(test.TestCase):
+@keras_parameterized.run_all_keras_modes
+class ZeroPaddingTest(keras_parameterized.TestCase):
 
   def test_zero_padding_1d(self):
     num_samples = 2
@@ -567,7 +262,7 @@ class ZeroPaddingTest(test.TestCase):
     shape = (num_samples, num_steps, input_dim)
     inputs = np.ones(shape)
 
-    with self.session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       # basic test
       testing_utils.layer_test(
           keras.layers.ZeroPadding1D,
@@ -698,7 +393,7 @@ class ZeroPaddingTest(test.TestCase):
     inputs = np.ones((num_samples, input_len_dim1, input_len_dim2,
                       input_len_dim3, stack_size))
 
-    with self.session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       # basic test
       testing_utils.layer_test(
           keras.layers.ZeroPadding3D,
@@ -726,11 +421,12 @@ class ZeroPaddingTest(test.TestCase):
       keras.layers.ZeroPadding3D(padding=None)
 
 
-@tf_test_util.run_all_in_graph_and_eager_modes
-class UpSamplingTest(test.TestCase):
+@test_util.disable_all_xla('b/124289666')  # align_corners=False unimplemented
+@keras_parameterized.run_all_keras_modes
+class UpSamplingTest(keras_parameterized.TestCase):
 
   def test_upsampling_1d(self):
-    with self.session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       testing_utils.layer_test(
           keras.layers.UpSampling1D, kwargs={'size': 2}, input_shape=(3, 5, 4))
 
@@ -875,8 +571,8 @@ class UpSamplingTest(test.TestCase):
               np.testing.assert_allclose(np_output, expected_out)
 
 
-@tf_test_util.run_all_in_graph_and_eager_modes
-class CroppingTest(test.TestCase):
+@keras_parameterized.run_all_keras_modes
+class CroppingTest(keras_parameterized.TestCase):
 
   def test_cropping_1d(self):
     num_samples = 2
@@ -884,7 +580,7 @@ class CroppingTest(test.TestCase):
     input_len_dim1 = 2
     inputs = np.random.rand(num_samples, time_length, input_len_dim1)
 
-    with self.session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       testing_utils.layer_test(
           keras.layers.Cropping1D,
           kwargs={'cropping': (2, 2)},
@@ -1017,45 +713,51 @@ class CroppingTest(test.TestCase):
       keras.layers.Cropping3D(cropping=None)
 
 
-@tf_test_util.run_all_in_graph_and_eager_modes
-class DepthwiseConv2DTest(test.TestCase):
+@keras_parameterized.run_all_keras_modes
+class DepthwiseConv2DTest(keras_parameterized.TestCase):
 
-  def _run_test(self, kwargs, arg, values):
+  def _run_test(self, kwargs):
     num_samples = 2
     stack_size = 3
     num_row = 7
     num_col = 6
 
-    test_kwargs = copy.copy(kwargs)
-    for value in values:
-      test_kwargs[arg] = value
-      with self.cached_session(use_gpu=True):
-        testing_utils.layer_test(
-            keras.layers.DepthwiseConv2D,
-            kwargs=test_kwargs,
-            input_shape=(num_samples, num_row, num_col, stack_size))
-
-  def test_depthwise_conv2d(self):
-    kwargs = {'kernel_size': (3, 3)}
-
-    self._run_test(kwargs, 'padding', ['valid', 'same'])
-    self._run_test(kwargs, 'strides', [(2, 2)])
-    if test.is_gpu_available(cuda_only=True):
-      self._run_test(kwargs, 'data_format', ['channels_first'])
-    self._run_test(kwargs, 'depth_multiplier', [1, 2])
-
-    kwargs = {'kernel_size': 3,
-              'padding': 'valid',
-              'data_format': 'channels_first',
-              'activation': None,
-              'depthwise_regularizer': 'l2',
-              'bias_regularizer': 'l2',
-              'activity_regularizer': 'l2',
-              'depthwise_constraint': 'unit_norm',
-              'use_bias': True,
-              'strides': (2, 2),
-             }
-    self._run_test(kwargs, 'depth_multiplier', [1])
+    with self.cached_session(use_gpu=True):
+      testing_utils.layer_test(
+          keras.layers.DepthwiseConv2D,
+          kwargs=kwargs,
+          input_shape=(num_samples, num_row, num_col, stack_size))
+
+  @parameterized.named_parameters(
+      ('padding_valid', {'padding': 'valid'}),
+      ('padding_same', {'padding': 'same'}),
+      ('strides', {'strides': (2, 2)}),
+      # Only runs on GPU with CUDA, channels_first is not supported on CPU.
+      # TODO(b/62340061): Support channels_first on CPU.
+      ('data_format', {'data_format': 'channels_first'}),
+      ('depth_multiplier_1', {'depth_multiplier': 1}),
+      ('depth_multiplier_2', {'depth_multiplier': 2}),
+  )
+  def test_depthwise_conv2d(self, kwargs):
+    kwargs['kernel_size'] = (3, 3)
+    if 'data_format' not in kwargs or test.is_gpu_available(cuda_only=True):
+      self._run_test(kwargs)
+
+  def test_depthwise_conv2d_full(self):
+    kwargs = {
+        'kernel_size': 3,
+        'padding': 'valid',
+        'data_format': 'channels_last',
+        'activation': None,
+        'depthwise_regularizer': 'l2',
+        'bias_regularizer': 'l2',
+        'activity_regularizer': 'l2',
+        'depthwise_constraint': 'unit_norm',
+        'use_bias': True,
+        'strides': (2, 2),
+        'depth_multiplier': 1,
+    }
+    self._run_test(kwargs)
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/layers/convolutional_transpose_test.py b/tensorflow/python/keras/layers/convolutional_transpose_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd73d22d51b014c6dd00e946ad1cf7f0cd7332f8
--- /dev/null
+++ b/tensorflow/python/keras/layers/convolutional_transpose_test.py
@@ -0,0 +1,209 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for convolutional transpose layers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python import keras
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.platform import test
+
+
+@keras_parameterized.run_all_keras_modes
+class Conv2DTransposeTest(keras_parameterized.TestCase):
+
+  def _run_test(self, kwargs):
+    num_samples = 2
+    stack_size = 3
+    num_row = 7
+    num_col = 6
+
+    with self.cached_session(use_gpu=True):
+      testing_utils.layer_test(
+          keras.layers.Conv2DTranspose,
+          kwargs=kwargs,
+          input_shape=(num_samples, num_row, num_col, stack_size))
+
+  @parameterized.named_parameters(
+      ('padding_valid', {'padding': 'valid'}),
+      ('padding_same', {'padding': 'same'}),
+      ('strides', {'strides': (2, 2)}),
+      # Only runs on GPU with CUDA, channels_first is not supported on CPU.
+      # TODO(b/62340061): Support channels_first on CPU.
+      ('data_format', {'data_format': 'channels_first'}),
+      ('strides_output_padding', {'strides': (2, 2), 'output_padding': (1, 1)}),
+  )
+  def test_conv2d_transpose(self, kwargs):
+    kwargs['filters'] = 2
+    kwargs['kernel_size'] = (3, 3)
+    if 'data_format' not in kwargs or test.is_gpu_available(cuda_only=True):
+      self._run_test(kwargs)
+
+  def test_conv2d_transpose_regularizers(self):
+    kwargs = {
+        'filters': 3,
+        'kernel_size': 3,
+        'padding': 'valid',
+        'kernel_regularizer': 'l2',
+        'bias_regularizer': 'l2',
+        'activity_regularizer': 'l2',
+        'strides': 1
+    }
+    with self.cached_session(use_gpu=True):
+      layer = keras.layers.Conv2DTranspose(**kwargs)
+      layer.build((None, 5, 5, 2))
+      self.assertEqual(len(layer.losses), 2)
+      layer(keras.backend.variable(np.ones((1, 5, 5, 2))))
+      self.assertEqual(len(layer.losses), 3)
+
+  def test_conv2d_transpose_constraints(self):
+    k_constraint = lambda x: x
+    b_constraint = lambda x: x
+
+    kwargs = {
+        'filters': 3,
+        'kernel_size': 3,
+        'padding': 'valid',
+        'kernel_constraint': k_constraint,
+        'bias_constraint': b_constraint,
+        'strides': 1
+    }
+    with self.cached_session(use_gpu=True):
+      layer = keras.layers.Conv2DTranspose(**kwargs)
+      layer.build((None, 5, 5, 2))
+      self.assertEqual(layer.kernel.constraint, k_constraint)
+      self.assertEqual(layer.bias.constraint, b_constraint)
+
+  def test_conv2d_transpose_dilation(self):
+    testing_utils.layer_test(keras.layers.Conv2DTranspose,
+                             kwargs={'filters': 2,
+                                     'kernel_size': 3,
+                                     'padding': 'same',
+                                     'data_format': 'channels_last',
+                                     'dilation_rate': (2, 2)},
+                             input_shape=(2, 5, 6, 3))
+
+    input_data = np.arange(48).reshape((1, 4, 4, 3)).astype(np.float32)
+    expected_output = np.float32([[192, 228, 192, 228],
+                                  [336, 372, 336, 372],
+                                  [192, 228, 192, 228],
+                                  [336, 372, 336, 372]]).reshape((1, 4, 4, 1))
+    testing_utils.layer_test(keras.layers.Conv2DTranspose,
+                             input_data=input_data,
+                             kwargs={'filters': 1,
+                                     'kernel_size': 3,
+                                     'padding': 'same',
+                                     'data_format': 'channels_last',
+                                     'dilation_rate': (2, 2),
+                                     'kernel_initializer': 'ones'},
+                             expected_output=expected_output)
+
+
+@keras_parameterized.run_all_keras_modes
+class Conv3DTransposeTest(keras_parameterized.TestCase):
+
+  def _run_test(self, kwargs):
+    num_samples = 2
+    stack_size = 3
+    num_row = 7
+    num_col = 6
+    depth = 5
+
+    with self.cached_session(use_gpu=True):
+      testing_utils.layer_test(
+          keras.layers.Conv3DTranspose,
+          kwargs=kwargs,
+          input_shape=(num_samples, depth, num_row, num_col, stack_size))
+
+  @parameterized.named_parameters(
+      ('padding_valid', {'padding': 'valid'}),
+      ('padding_same', {'padding': 'same'}),
+      ('strides', {'strides': (2, 2, 2)}),
+      # Only runs on GPU with CUDA, channels_first is not supported on CPU.
+      # TODO(b/62340061): Support channels_first on CPU.
+      ('data_format', {'data_format': 'channels_first'}),
+      ('strides_output_padding', {'strides': (2, 2, 2),
+                                  'output_padding': (1, 1, 1)}),
+  )
+  def test_conv3d_transpose(self, kwargs):
+    kwargs['filters'] = 2
+    kwargs['kernel_size'] = (3, 3, 3)
+    if 'data_format' not in kwargs or test.is_gpu_available(cuda_only=True):
+      self._run_test(kwargs)
+
+  def test_conv3d_transpose_regularizers(self):
+    kwargs = {
+        'filters': 3,
+        'kernel_size': 3,
+        'padding': 'valid',
+        'kernel_regularizer': 'l2',
+        'bias_regularizer': 'l2',
+        'activity_regularizer': 'l2',
+        'strides': 1
+    }
+    with self.cached_session(use_gpu=True):
+      layer = keras.layers.Conv3DTranspose(**kwargs)
+      layer.build((None, 5, 5, 5, 2))
+      self.assertEqual(len(layer.losses), 2)
+      layer(keras.backend.variable(np.ones((1, 5, 5, 5, 2))))
+      self.assertEqual(len(layer.losses), 3)
+
+  def test_conv3d_transpose_constraints(self):
+    k_constraint = lambda x: x
+    b_constraint = lambda x: x
+
+    kwargs = {
+        'filters': 3,
+        'kernel_size': 3,
+        'padding': 'valid',
+        'kernel_constraint': k_constraint,
+        'bias_constraint': b_constraint,
+        'strides': 1
+    }
+    with self.cached_session(use_gpu=True):
+      layer = keras.layers.Conv3DTranspose(**kwargs)
+      layer.build((None, 5, 5, 5, 2))
+      self.assertEqual(layer.kernel.constraint, k_constraint)
+      self.assertEqual(layer.bias.constraint, b_constraint)
+
+  def test_conv3d_transpose_dynamic_shape(self):
+    input_data = np.random.random((1, 3, 3, 3, 3)).astype(np.float32)
+    with self.cached_session(use_gpu=True):
+      # Won't raise error here.
+      testing_utils.layer_test(
+          keras.layers.Conv3DTranspose,
+          kwargs={
+              'data_format': 'channels_last',
+              'filters': 3,
+              'kernel_size': 3
+          },
+          input_shape=(None, None, None, None, 3),
+          input_data=input_data)
+      if test.is_gpu_available(cuda_only=True):
+        testing_utils.layer_test(
+            keras.layers.Conv3DTranspose,
+            kwargs={
+                'data_format': 'channels_first',
+                'filters': 3,
+                'kernel_size': 3
+            },
+            input_shape=(None, 3, None, None, None),
+            input_data=input_data)
diff --git a/tensorflow/python/keras/layers/core.py b/tensorflow/python/keras/layers/core.py
index 854774c569e3c86d1665f39fcdec74960df2928b..b33c328c4480a63e87a107624365d1819151f4be 100644
--- a/tensorflow/python/keras/layers/core.py
+++ b/tensorflow/python/keras/layers/core.py
@@ -27,6 +27,7 @@ import numpy as np
 
 from tensorflow.python.eager import context
 from tensorflow.python.framework import common_shapes
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras import activations
@@ -45,10 +46,10 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import standard_ops
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 
-@tf_export('keras.layers.Masking')
+@keras_export('keras.layers.Masking')
 class Masking(Layer):
   """Masks a sequence by using a mask value to skip timesteps.
 
@@ -81,6 +82,7 @@ class Masking(Layer):
     super(Masking, self).__init__(**kwargs)
     self.supports_masking = True
     self.mask_value = mask_value
+    self._compute_output_and_mask_jointly = True
 
   def compute_mask(self, inputs, mask=None):
     return K.any(math_ops.not_equal(inputs, self.mask_value), axis=-1)
@@ -88,7 +90,10 @@ class Masking(Layer):
   def call(self, inputs):
     boolean_mask = K.any(
         math_ops.not_equal(inputs, self.mask_value), axis=-1, keepdims=True)
-    return inputs * math_ops.cast(boolean_mask, inputs.dtype)
+    outputs = inputs * math_ops.cast(boolean_mask, inputs.dtype)
+    # Compute the mask and outputs simultaneously.
+    outputs._keras_mask = array_ops.squeeze(boolean_mask, axis=-1)  # pylint: disable=protected-access
+    return outputs
 
   def compute_output_shape(self, input_shape):
     return input_shape
@@ -99,7 +104,7 @@ class Masking(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.layers.Dropout')
+@keras_export('keras.layers.Dropout')
 class Dropout(Layer):
   """Applies Dropout to the input.
 
@@ -138,9 +143,12 @@ class Dropout(Layer):
       training = K.learning_phase()
 
     def dropped_inputs():
-      return nn.dropout(inputs, 1  - self.rate,
-                        noise_shape=self._get_noise_shape(inputs),
-                        seed=self.seed)
+      return nn.dropout(
+          inputs,
+          noise_shape=self._get_noise_shape(inputs),
+          seed=self.seed,
+          rate=self.rate)
+
     output = tf_utils.smart_cond(training,
                                  dropped_inputs,
                                  lambda: array_ops.identity(inputs))
@@ -159,7 +167,7 @@ class Dropout(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.layers.SpatialDropout1D')
+@keras_export('keras.layers.SpatialDropout1D')
 class SpatialDropout1D(Dropout):
   """Spatial 1D version of Dropout.
 
@@ -196,7 +204,7 @@ class SpatialDropout1D(Dropout):
     return noise_shape
 
 
-@tf_export('keras.layers.SpatialDropout2D')
+@keras_export('keras.layers.SpatialDropout2D')
 class SpatialDropout2D(Dropout):
   """Spatial 2D version of Dropout.
 
@@ -250,7 +258,7 @@ class SpatialDropout2D(Dropout):
       return (input_shape[0], 1, 1, input_shape[3])
 
 
-@tf_export('keras.layers.SpatialDropout3D')
+@keras_export('keras.layers.SpatialDropout3D')
 class SpatialDropout3D(Dropout):
   """Spatial 3D version of Dropout.
 
@@ -303,7 +311,7 @@ class SpatialDropout3D(Dropout):
       return (input_shape[0], 1, 1, 1, input_shape[4])
 
 
-@tf_export('keras.layers.Activation')
+@keras_export('keras.layers.Activation')
 class Activation(Layer):
   """Applies an activation function to an output.
 
@@ -337,7 +345,7 @@ class Activation(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.layers.Reshape')
+@keras_export('keras.layers.Reshape')
 class Reshape(Layer):
   """Reshapes an output to a certain shape.
 
@@ -444,7 +452,7 @@ class Reshape(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.layers.Permute')
+@keras_export('keras.layers.Permute')
 class Permute(Layer):
   """Permutes the dimensions of the input according to a given pattern.
 
@@ -502,7 +510,7 @@ class Permute(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.layers.Flatten')
+@keras_export('keras.layers.Flatten')
 class Flatten(Layer):
   """Flattens the input. Does not affect the batch size.
 
@@ -549,7 +557,8 @@ class Flatten(Layer):
       inputs = array_ops.transpose(inputs, perm=permutation)
 
     outputs = array_ops.reshape(
-        inputs, (inputs.shape[0].value or array_ops.shape(inputs)[0], -1))
+        inputs, (tensor_shape.dimension_value(inputs.shape[0]) or
+                 array_ops.shape(inputs)[0], -1))
     if not context.executing_eagerly():
       outputs.set_shape(self.compute_output_shape(inputs.get_shape()))
     return outputs
@@ -571,7 +580,7 @@ class Flatten(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.layers.RepeatVector')
+@keras_export('keras.layers.RepeatVector')
 class RepeatVector(Layer):
   """Repeats the input n times.
 
@@ -615,7 +624,7 @@ class RepeatVector(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.layers.Lambda')
+@keras_export('keras.layers.Lambda')
 class Lambda(Layer):
   """Wraps arbitrary expression as a `Layer` object.
 
@@ -844,7 +853,7 @@ class Lambda(Layer):
     return cls(**config)
 
 
-@tf_export('keras.layers.Dense')
+@keras_export('keras.layers.Dense')
 class Dense(Layer):
   """Just your regular densely-connected NN layer.
 
@@ -931,6 +940,10 @@ class Dense(Layer):
     self.input_spec = InputSpec(min_ndim=2)
 
   def build(self, input_shape):
+    dtype = dtypes.as_dtype(self.dtype or K.floatx())
+    if not (dtype.is_floating or dtype.is_complex):
+      raise TypeError('Unable to build `Dense` layer with non-floating point '
+                      'dtype %s' % (dtype,))
     input_shape = tensor_shape.TensorShape(input_shape)
     if tensor_shape.dimension_value(input_shape[-1]) is None:
       raise ValueError('The last dimension of the inputs to `Dense` '
@@ -971,6 +984,7 @@ class Dense(Layer):
         output_shape = shape[:-1] + [self.units]
         outputs.set_shape(output_shape)
     else:
+      inputs = math_ops.cast(inputs, self.dtype)
       outputs = gen_math_ops.mat_mul(inputs, self.kernel)
     if self.use_bias:
       outputs = nn.bias_add(outputs, self.bias)
@@ -1005,7 +1019,7 @@ class Dense(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.layers.ActivityRegularization')
+@keras_export('keras.layers.ActivityRegularization')
 class ActivityRegularization(Layer):
   """Layer that applies an update to the cost function based input activity.
 
diff --git a/tensorflow/python/keras/layers/core_test.py b/tensorflow/python/keras/layers/core_test.py
index f138adf76026b116b2a4d771e8ae90194e065bef..6ba59b2ff33f754f079b96d959af20071bc24d03 100644
--- a/tensorflow/python/keras/layers/core_test.py
+++ b/tensorflow/python/keras/layers/core_test.py
@@ -22,43 +22,37 @@ import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.eager import context
-from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.framework import ops
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
-class CoreLayersTest(test.TestCase):
-
-  def test_masking(self):
-    with self.cached_session():
-      testing_utils.layer_test(
-          keras.layers.Masking, kwargs={}, input_shape=(3, 2, 3))
+@keras_parameterized.run_all_keras_modes
+class DropoutLayersTest(keras_parameterized.TestCase):
 
   def test_dropout(self):
-    with self.cached_session():
-      testing_utils.layer_test(
-          keras.layers.Dropout, kwargs={'rate': 0.5}, input_shape=(3, 2))
+    testing_utils.layer_test(
+        keras.layers.Dropout, kwargs={'rate': 0.5}, input_shape=(3, 2))
 
-    with self.cached_session():
-      testing_utils.layer_test(
-          keras.layers.Dropout,
-          kwargs={'rate': 0.5,
-                  'noise_shape': [3, 1]},
-          input_shape=(3, 2))
-
-    # https://github.com/tensorflow/tensorflow/issues/14819
-    with self.cached_session():
-      dropout = keras.layers.Dropout(0.5)
-      self.assertEqual(True, dropout.supports_masking)
-
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_spatial_dropout(self):
+    testing_utils.layer_test(
+        keras.layers.Dropout,
+        kwargs={'rate': 0.5,
+                'noise_shape': [3, 1]},
+        input_shape=(3, 2))
+
+  def test_dropout_supports_masking(self):
+    dropout = keras.layers.Dropout(0.5)
+    self.assertEqual(True, dropout.supports_masking)
+
+  def test_spatial_dropout_1d(self):
     testing_utils.layer_test(
         keras.layers.SpatialDropout1D,
         kwargs={'rate': 0.5},
         input_shape=(2, 3, 4))
 
+  def test_spatial_dropout_2d(self):
     testing_utils.layer_test(
         keras.layers.SpatialDropout2D,
         kwargs={'rate': 0.5},
@@ -69,6 +63,7 @@ class CoreLayersTest(test.TestCase):
         kwargs={'rate': 0.5, 'data_format': 'channels_first'},
         input_shape=(2, 3, 4, 5))
 
+  def test_spatial_dropout_3d(self):
     testing_utils.layer_test(
         keras.layers.SpatialDropout3D,
         kwargs={'rate': 0.5},
@@ -79,7 +74,129 @@ class CoreLayersTest(test.TestCase):
         kwargs={'rate': 0.5, 'data_format': 'channels_first'},
         input_shape=(2, 3, 4, 4, 5))
 
-  @tf_test_util.run_in_graph_and_eager_modes
+
+@keras_parameterized.run_all_keras_modes
+class LambdaLayerTest(keras_parameterized.TestCase):
+
+  def test_lambda(self):
+    testing_utils.layer_test(
+        keras.layers.Lambda,
+        kwargs={'function': lambda x: x + 1},
+        input_shape=(3, 2))
+
+    testing_utils.layer_test(
+        keras.layers.Lambda,
+        kwargs={
+            'function': lambda x, a, b: x * a + b,
+            'arguments': {
+                'a': 0.6,
+                'b': 0.4
+            }
+        },
+        input_shape=(3, 2))
+
+    # test serialization with function
+    def f(x):
+      return x + 1
+
+    ld = keras.layers.Lambda(f)
+    config = ld.get_config()
+    ld = keras.layers.deserialize({
+        'class_name': 'Lambda',
+        'config': config
+    })
+
+    # test with lambda
+    ld = keras.layers.Lambda(
+        lambda x: keras.backend.concatenate([math_ops.square(x), x]))
+    config = ld.get_config()
+    ld = keras.layers.Lambda.from_config(config)
+
+  def test_lambda_multiple_inputs(self):
+    ld = keras.layers.Lambda(lambda x: x[0], output_shape=lambda x: x[0])
+    x1 = np.ones([3, 2], np.float32)
+    x2 = np.ones([3, 5], np.float32)
+    out = ld([x1, x2])
+    self.assertAllEqual(out.shape, [3, 2])
+
+  def test_lambda_output_shape(self):
+    l = keras.layers.Lambda(lambda x: x + 1, output_shape=(1, 1))
+    l(keras.backend.variable(np.ones((1, 1))))
+    self.assertEqual((1, 1), l.get_config()['output_shape'])
+
+  def test_lambda_output_shape_function(self):
+    def get_output_shape(input_shape):
+      return 1 * input_shape
+
+    l = keras.layers.Lambda(lambda x: x + 1, output_shape=get_output_shape)
+    l(keras.backend.variable(np.ones((1, 1))))
+    self.assertEqual('lambda', l.get_config()['output_shape_type'])
+
+  def test_lambda_output_shape_autocalculate_multiple_inputs(self):
+
+    def lambda_fn(x):
+      return math_ops.matmul(x[0], x[1])
+
+    l = keras.layers.Lambda(lambda_fn)
+    output_shape = l.compute_output_shape([(10, 10), (10, 20)])
+    self.assertAllEqual((10, 20), output_shape)
+
+  def test_lambda_output_shape_list_multiple_outputs(self):
+
+    def lambda_fn(x):
+      return x
+
+    l = keras.layers.Lambda(lambda_fn, output_shape=[(10,), (20,)])
+    output_shape = l.compute_output_shape([(10, 10), (10, 20)])
+    self.assertAllEqual([(10, 10), (10, 20)], output_shape)
+
+  def test_lambda_output_shape_tuple_with_none(self):
+
+    def lambda_fn(x):
+      return x
+
+    l = keras.layers.Lambda(lambda_fn, output_shape=(None, 10))
+    output_shape = l.compute_output_shape((5, 10, 20))
+    self.assertAllEqual([5, None, 10], output_shape.as_list())
+
+  def test_lambda_output_shape_function_multiple_outputs(self):
+
+    def lambda_fn(x):
+      return x
+
+    def output_shape_fn(input_shape):
+      return input_shape
+
+    l = keras.layers.Lambda(lambda_fn, output_shape=output_shape_fn)
+    output_shape = l.compute_output_shape([(10, 10), (10, 20)])
+    self.assertAllEqual([(10, 10), (10, 20)], output_shape)
+
+  def test_lambda_config_serialization(self):
+    # Test serialization with output_shape and output_shape_type
+    layer = keras.layers.Lambda(lambda x: x + 1, output_shape=(1, 1))
+    layer(keras.backend.variable(np.ones((1, 1))))
+    config = layer.get_config()
+    layer = keras.layers.deserialize({
+        'class_name': 'Lambda',
+        'config': config
+    })
+    layer = keras.layers.Lambda.from_config(config)
+
+
+@keras_parameterized.run_all_keras_modes
+class CoreLayersTest(keras_parameterized.TestCase):
+
+  def test_masking(self):
+    testing_utils.layer_test(
+        keras.layers.Masking, kwargs={}, input_shape=(3, 2, 3))
+
+  def test_keras_mask(self):
+    x = np.ones((10, 10))
+    y = keras.layers.Masking(1.)(x)
+    self.assertTrue(hasattr(y, '_keras_mask'))
+    self.assertTrue(y._keras_mask is not None)
+    self.assertAllClose(self.evaluate(y._keras_mask), np.zeros((10,)))
+
   def test_activation(self):
     # with string argument
     testing_utils.layer_test(
@@ -93,7 +210,6 @@ class CoreLayersTest(test.TestCase):
         kwargs={'activation': keras.backend.relu},
         input_shape=(3, 2))
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_reshape(self):
     testing_utils.layer_test(
         keras.layers.Reshape,
@@ -115,26 +231,22 @@ class CoreLayersTest(test.TestCase):
         kwargs={'target_shape': (-1, 1)},
         input_shape=(None, None, 2))
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_permute(self):
     testing_utils.layer_test(
         keras.layers.Permute, kwargs={'dims': (2, 1)}, input_shape=(3, 2, 4))
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_permute_errors_on_invalid_starting_dims_index(self):
     with self.assertRaisesRegexp(ValueError, r'Invalid permutation .*dims.*'):
       testing_utils.layer_test(
           keras.layers.Permute,
           kwargs={'dims': (0, 1, 2)}, input_shape=(3, 2, 4))
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_permute_errors_on_invalid_set_of_dims_indices(self):
     with self.assertRaisesRegexp(ValueError, r'Invalid permutation .*dims.*'):
       testing_utils.layer_test(
           keras.layers.Permute,
           kwargs={'dims': (1, 4, 2)}, input_shape=(3, 2, 4))
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_flatten(self):
     testing_utils.layer_test(
         keras.layers.Flatten, kwargs={}, input_shape=(3, 2, 4))
@@ -149,7 +261,6 @@ class CoreLayersTest(test.TestCase):
         np.transpose(inputs, (0, 2, 3, 1)), (-1, 5 * 5 * 3))
     self.assertAllClose(outputs, target_outputs)
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_flatten_scalar_channels(self):
     testing_utils.layer_test(
         keras.layers.Flatten, kwargs={}, input_shape=(3,))
@@ -163,54 +274,10 @@ class CoreLayersTest(test.TestCase):
     target_outputs = np.expand_dims(inputs, -1)
     self.assertAllClose(outputs, target_outputs)
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_repeat_vector(self):
     testing_utils.layer_test(
         keras.layers.RepeatVector, kwargs={'n': 3}, input_shape=(3, 2))
 
-  def test_lambda(self):
-    testing_utils.layer_test(
-        keras.layers.Lambda,
-        kwargs={'function': lambda x: x + 1},
-        input_shape=(3, 2))
-
-    testing_utils.layer_test(
-        keras.layers.Lambda,
-        kwargs={
-            'function': lambda x, a, b: x * a + b,
-            'arguments': {
-                'a': 0.6,
-                'b': 0.4
-            }
-        },
-        input_shape=(3, 2))
-
-    # test serialization with function
-    def f(x):
-      return x + 1
-
-    ld = keras.layers.Lambda(f)
-    config = ld.get_config()
-    ld = keras.layers.deserialize({
-        'class_name': 'Lambda',
-        'config': config
-    })
-
-    # test with lambda
-    ld = keras.layers.Lambda(
-        lambda x: keras.backend.concatenate([math_ops.square(x), x]))
-    config = ld.get_config()
-    ld = keras.layers.Lambda.from_config(config)
-
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_lambda_multiple_inputs(self):
-    ld = keras.layers.Lambda(lambda x: x[0], output_shape=lambda x: x[0])
-    x1 = np.ones([3, 2], np.float32)
-    x2 = np.ones([3, 5], np.float32)
-    out = ld([x1, x2])
-    self.assertAllEqual(out.shape, [3, 2])
-
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_dense(self):
     testing_utils.layer_test(
         keras.layers.Dense, kwargs={'units': 3}, input_shape=(3, 2))
@@ -224,106 +291,39 @@ class CoreLayersTest(test.TestCase):
     testing_utils.layer_test(
         keras.layers.Dense, kwargs={'units': 3}, input_shape=(3, 4, 5, 2))
 
+  def test_dense_dtype(self):
+    inputs = ops.convert_to_tensor(
+        np.random.randint(low=0, high=7, size=(2, 2)))
+    layer = keras.layers.Dense(5, dtype='float32')
+    outputs = layer(inputs)
+    self.assertEqual(outputs.dtype, 'float32')
+
   def test_dense_regularization(self):
-    with self.cached_session():
-      layer = keras.layers.Dense(
-          3,
-          kernel_regularizer=keras.regularizers.l1(0.01),
-          bias_regularizer='l1',
-          activity_regularizer='l2',
-          name='dense_reg')
-      layer(keras.backend.variable(np.ones((2, 4))))
-      self.assertEqual(3, len(layer.losses))
+    layer = keras.layers.Dense(
+        3,
+        kernel_regularizer=keras.regularizers.l1(0.01),
+        bias_regularizer='l1',
+        activity_regularizer='l2',
+        name='dense_reg')
+    layer(keras.backend.variable(np.ones((2, 4))))
+    self.assertEqual(3, len(layer.losses))
 
   def test_dense_constraints(self):
-    with self.cached_session():
-      k_constraint = keras.constraints.max_norm(0.01)
-      b_constraint = keras.constraints.max_norm(0.01)
-      layer = keras.layers.Dense(
-          3, kernel_constraint=k_constraint, bias_constraint=b_constraint)
-      layer(keras.backend.variable(np.ones((2, 4))))
-      self.assertEqual(layer.kernel.constraint, k_constraint)
-      self.assertEqual(layer.bias.constraint, b_constraint)
+    k_constraint = keras.constraints.max_norm(0.01)
+    b_constraint = keras.constraints.max_norm(0.01)
+    layer = keras.layers.Dense(
+        3, kernel_constraint=k_constraint, bias_constraint=b_constraint)
+    layer(keras.backend.variable(np.ones((2, 4))))
+    self.assertEqual(layer.kernel.constraint, k_constraint)
+    self.assertEqual(layer.bias.constraint, b_constraint)
 
   def test_activity_regularization(self):
-    with self.cached_session():
-      layer = keras.layers.ActivityRegularization(l1=0.1)
-      layer(keras.backend.variable(np.ones((2, 4))))
-      self.assertEqual(1, len(layer.losses))
-      _ = layer.get_config()
-
-  def test_lambda_output_shape(self):
-    with self.cached_session():
-      l = keras.layers.Lambda(lambda x: x + 1, output_shape=(1, 1))
-      l(keras.backend.variable(np.ones((1, 1))))
-      self.assertEqual((1, 1), l.get_config()['output_shape'])
+    layer = keras.layers.ActivityRegularization(l1=0.1)
+    layer(keras.backend.variable(np.ones((2, 4))))
+    self.assertEqual(1, len(layer.losses))
+    config = layer.get_config()
+    self.assertEqual(config.pop('l1'), 0.1)
 
-  def test_lambda_output_shape_function(self):
-    def get_output_shape(input_shape):
-      return 1 * input_shape
-
-    with self.cached_session():
-      l = keras.layers.Lambda(lambda x: x + 1, output_shape=get_output_shape)
-      l(keras.backend.variable(np.ones((1, 1))))
-      self.assertEqual('lambda', l.get_config()['output_shape_type'])
-
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_lambda_output_shape_autocalculate_multiple_inputs(self):
-
-    def lambda_fn(x):
-      return math_ops.matmul(x[0], x[1])
-
-    l = keras.layers.Lambda(lambda_fn)
-    output_shape = l.compute_output_shape([(10, 10), (10, 20)])
-    self.assertAllEqual((10, 20), output_shape)
-
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_lambda_output_shape_list_multiple_outputs(self):
-
-    def lambda_fn(x):
-      return x
-
-    l = keras.layers.Lambda(lambda_fn, output_shape=[(10,), (20,)])
-    output_shape = l.compute_output_shape([(10, 10), (10, 20)])
-    self.assertAllEqual([(10, 10), (10, 20)], output_shape)
-
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_lambda_output_shape_tuple_with_none(self):
-
-    def lambda_fn(x):
-      return x
-
-    l = keras.layers.Lambda(lambda_fn, output_shape=(None, 10))
-    output_shape = l.compute_output_shape((5, 10, 20))
-    self.assertAllEqual([5, None, 10], output_shape.as_list())
-
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_lambda_output_shape_function_multiple_outputs(self):
-
-    def lambda_fn(x):
-      return x
-
-    def output_shape_fn(input_shape):
-      return input_shape
-
-    l = keras.layers.Lambda(lambda_fn, output_shape=output_shape_fn)
-    output_shape = l.compute_output_shape([(10, 10), (10, 20)])
-    self.assertAllEqual([(10, 10), (10, 20)], output_shape)
-
-  def test_lambda_config_serialization(self):
-    with self.cached_session():
-      # test serialization with output_shape and output_shape_type
-      layer = keras.layers.Lambda(lambda x: x + 1, output_shape=(1, 1))
-      layer(keras.backend.variable(np.ones((1, 1))))
-      config = layer.get_config()
-      layer = keras.layers.deserialize({
-          'class_name': 'Lambda',
-          'config': config
-      })
-
-      layer = keras.layers.Lambda.from_config(config)
-
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_numpy_inputs(self):
     if context.executing_eagerly():
       layer = keras.layers.RepeatVector(2)
diff --git a/tensorflow/python/keras/layers/cudnn_recurrent.py b/tensorflow/python/keras/layers/cudnn_recurrent.py
index 16692753afbc83d55349f5b3843952f1b8c8d2bf..a74308f69cd6cbfccec1eb044c208149de214450 100644
--- a/tensorflow/python/keras/layers/cudnn_recurrent.py
+++ b/tensorflow/python/keras/layers/cudnn_recurrent.py
@@ -31,7 +31,7 @@ from tensorflow.python.keras.layers.recurrent import RNN
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_cudnn_rnn_ops
 from tensorflow.python.ops import state_ops
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 
 class _CuDNNRNN(RNN):
@@ -158,7 +158,7 @@ class _CuDNNRNN(RNN):
         RNN, self).get_losses_for(inputs=inputs)
 
 
-@tf_export('keras.layers.CuDNNGRU')
+@keras_export(v1=['keras.layers.CuDNNGRU'])
 class CuDNNGRU(_CuDNNRNN):
   """Fast GRU implementation backed by cuDNN.
 
@@ -335,7 +335,7 @@ class CuDNNGRU(_CuDNNRNN):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.layers.CuDNNLSTM')
+@keras_export(v1=['keras.layers.CuDNNLSTM'])
 class CuDNNLSTM(_CuDNNRNN):
   """Fast LSTM implementation backed by cuDNN.
 
diff --git a/tensorflow/python/keras/layers/cudnn_recurrent_test.py b/tensorflow/python/keras/layers/cudnn_recurrent_test.py
index cc93364aaec5dd0e09cb0e3f31a163f49c3f73c3..c7d8d82ee2b178ba2b9ab43c6f4a19d1cd4bddcb 100644
--- a/tensorflow/python/keras/layers/cudnn_recurrent_test.py
+++ b/tensorflow/python/keras/layers/cudnn_recurrent_test.py
@@ -25,278 +25,293 @@ import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.framework import test_util
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
+from tensorflow.python.keras.optimizer_v2.rmsprop import RMSprop
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
-from tensorflow.python.training.rmsprop import RMSPropOptimizer
-
-
-class CuDNNTest(test.TestCase, parameterized.TestCase):
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_cudnn_rnn_basics(self):
-    if test.is_gpu_available(cuda_only=True):
-      with self.session(use_gpu=True):
-        input_size = 10
-        timesteps = 6
-        units = 2
-        num_samples = 32
-        for layer_class in [keras.layers.CuDNNGRU, keras.layers.CuDNNLSTM]:
-          for return_sequences in [True, False]:
-            with keras.utils.CustomObjectScope(
-                {'keras.layers.CuDNNGRU': keras.layers.CuDNNGRU,
-                 'keras.layers.CuDNNLSTM': keras.layers.CuDNNLSTM}):
-              testing_utils.layer_test(
-                  layer_class,
-                  kwargs={'units': units,
-                          'return_sequences': return_sequences},
-                  input_shape=(num_samples, timesteps, input_size))
-          for go_backwards in [True, False]:
-            with keras.utils.CustomObjectScope(
-                {'keras.layers.CuDNNGRU': keras.layers.CuDNNGRU,
-                 'keras.layers.CuDNNLSTM': keras.layers.CuDNNLSTM}):
-              testing_utils.layer_test(
-                  layer_class,
-                  kwargs={'units': units,
-                          'go_backwards': go_backwards},
-                  input_shape=(num_samples, timesteps, input_size))
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_trainability(self):
-    if test.is_gpu_available(cuda_only=True):
-      with self.session(use_gpu=True):
-        input_size = 10
-        units = 2
-        for layer_class in [keras.layers.CuDNNGRU, keras.layers.CuDNNLSTM]:
-          layer = layer_class(units)
-          layer.build((None, None, input_size))
-          self.assertEqual(len(layer.weights), 3)
-          self.assertEqual(len(layer.trainable_weights), 3)
-          self.assertEqual(len(layer.non_trainable_weights), 0)
-          layer.trainable = False
-          self.assertEqual(len(layer.weights), 3)
-          self.assertEqual(len(layer.non_trainable_weights), 3)
-          self.assertEqual(len(layer.trainable_weights), 0)
-          layer.trainable = True
-          self.assertEqual(len(layer.weights), 3)
-          self.assertEqual(len(layer.trainable_weights), 3)
-          self.assertEqual(len(layer.non_trainable_weights), 0)
+from tensorflow.python.training import gradient_descent
+
+
+@keras_parameterized.run_all_keras_modes
+class CuDNNTest(keras_parameterized.TestCase):
 
   @parameterized.named_parameters(
-      ('cudnngru', keras.layers.CuDNNGRU),
-      ('cudnnlstm', keras.layers.CuDNNLSTM),
-  )
-  def test_regularizer(self, layer_class):
-    if test.is_gpu_available(cuda_only=True):
-      with self.session(use_gpu=True):
-        input_size = 10
-        timesteps = 6
-        units = 2
-        num_samples = 32
-        layer = layer_class(
-            units,
-            return_sequences=False,
-            input_shape=(timesteps, input_size),
-            kernel_regularizer=keras.regularizers.l1(0.01),
-            recurrent_regularizer=keras.regularizers.l1(0.01),
-            bias_regularizer='l2')
-        layer.build((None, None, input_size))
-        self.assertEqual(len(layer.losses), 3)
-
-        layer = layer_class(
-            units,
-            return_sequences=False,
-            input_shape=(timesteps, input_size),
-            activity_regularizer='l2')
-        self.assertTrue(layer.activity_regularizer)
-        x = keras.backend.variable(
-            np.ones((num_samples, timesteps, input_size)))
-        layer(x)
-        self.assertEqual(len(layer.get_losses_for(x)), 1)
+      *test_util.generate_combinations_with_testcase_name(
+          layer_class=[keras.layers.CuDNNGRU, keras.layers.CuDNNLSTM],
+          return_sequences=[True, False]))
+  @test_util.run_gpu_only
+  def test_cudnn_rnn_return_sequence(self, layer_class, return_sequences):
+    input_size = 10
+    timesteps = 6
+    units = 2
+    num_samples = 32
+    testing_utils.layer_test(
+        layer_class,
+        kwargs={'units': units,
+                'return_sequences': return_sequences},
+        input_shape=(num_samples, timesteps, input_size))
+
+  @parameterized.named_parameters(
+      *test_util.generate_combinations_with_testcase_name(
+          layer_class=[keras.layers.CuDNNGRU, keras.layers.CuDNNLSTM],
+          go_backwards=[True, False]))
+  @test_util.run_gpu_only
+  def test_cudnn_rnn_go_backward(self, layer_class, go_backwards):
+    input_size = 10
+    timesteps = 6
+    units = 2
+    num_samples = 32
+    testing_utils.layer_test(
+        layer_class,
+        kwargs={'units': units,
+                'go_backwards': go_backwards},
+        input_shape=(num_samples, timesteps, input_size))
 
   @parameterized.named_parameters(
       ('cudnngru', keras.layers.CuDNNGRU),
       ('cudnnlstm', keras.layers.CuDNNLSTM),
   )
+  @test_util.run_gpu_only
   def test_return_state(self, layer_class):
-    if test.is_gpu_available(cuda_only=True):
-      with self.session(use_gpu=True):
-        input_size = 10
-        timesteps = 6
-        units = 2
-        num_samples = 32
-        num_states = 2 if layer_class is keras.layers.CuDNNLSTM else 1
-
-        inputs = keras.Input(batch_shape=(num_samples, timesteps, input_size))
-        layer = layer_class(units, return_state=True, stateful=True)
-        outputs = layer(inputs)
-        _, state = outputs[0], outputs[1:]
-        self.assertEqual(len(state), num_states)
-        model = keras.models.Model(inputs, state[0])
-
-        inputs = np.random.random((num_samples, timesteps, input_size))
-        state = model.predict(inputs)
-        np.testing.assert_allclose(
-            keras.backend.eval(layer.states[0]), state, atol=1e-4)
+    input_size = 10
+    timesteps = 6
+    units = 2
+    num_samples = 32
+    num_states = 2 if layer_class is keras.layers.CuDNNLSTM else 1
+
+    inputs = keras.Input(batch_shape=(num_samples, timesteps, input_size))
+    layer = layer_class(units, return_state=True, stateful=True)
+    outputs = layer(inputs)
+    _, state = outputs[0], outputs[1:]
+    self.assertEqual(len(state), num_states)
+    model = keras.models.Model(inputs, state[0])
+    model.run_eagerly = testing_utils.should_run_eagerly()
+
+    inputs = np.random.random((num_samples, timesteps, input_size))
+    state = model.predict(inputs)
+    np.testing.assert_allclose(
+        keras.backend.eval(layer.states[0]), state, atol=1e-4)
 
   @parameterized.named_parameters(
       ('cudnngru', keras.layers.CuDNNGRU),
       ('cudnnlstm', keras.layers.CuDNNLSTM),
   )
+  @test_util.run_gpu_only
   def test_time_major_input(self, layer_class):
-    if test.is_gpu_available(cuda_only=True):
-      with self.test_session(use_gpu=True):
-        input_size = 10
-        timesteps = 6
-        units = 2
-        num_samples = 32
-
-        model = keras.models.Sequential()
-        model.add(
-            keras.layers.Lambda(lambda t: array_ops.transpose(t, [1, 0, 2])))
-        layer = layer_class(units, time_major=True, return_sequences=True)
-        model.add(layer)
-        model.add(
-            keras.layers.Lambda(lambda t: array_ops.transpose(t, [1, 0, 2])))
-        model.compile(loss='categorical_crossentropy', optimizer='adam')
-        model.fit(
-            np.ones((num_samples, timesteps, input_size)),
-            np.ones((num_samples, timesteps, units)))
-        out = model.predict(np.ones((num_samples, timesteps, input_size)))
-        self.assertEqual(out.shape, (num_samples, timesteps, units))
+    input_size = 10
+    timesteps = 6
+    units = 2
+    num_samples = 32
+
+    model = keras.models.Sequential()
+    model.add(
+        keras.layers.Lambda(lambda t: array_ops.transpose(t, [1, 0, 2])))
+    layer = layer_class(units, time_major=True, return_sequences=True)
+    model.add(layer)
+    model.add(
+        keras.layers.Lambda(lambda t: array_ops.transpose(t, [1, 0, 2])))
+    model.compile(loss='categorical_crossentropy',
+                  optimizer=RMSprop(learning_rate=0.001))
+    model.fit(
+        np.ones((num_samples, timesteps, input_size)),
+        np.ones((num_samples, timesteps, units)))
+    out = model.predict(np.ones((num_samples, timesteps, input_size)))
+    self.assertEqual(out.shape, (num_samples, timesteps, units))
 
   @parameterized.named_parameters(
       ('cudnngru', keras.layers.CuDNNGRU),
       ('cudnnlstm', keras.layers.CuDNNLSTM),
   )
+  @test_util.run_gpu_only
   def test_specify_initial_state_keras_tensor(self, layer_class):
-    if test.is_gpu_available(cuda_only=True):
-      with self.session(use_gpu=True):
-        input_size = 10
-        timesteps = 6
-        units = 2
-        num_samples = 32
-        num_states = 2 if layer_class is keras.layers.CuDNNLSTM else 1
-
-        inputs = keras.Input((timesteps, input_size))
-        initial_state = [keras.Input((units,)) for _ in range(num_states)]
-        layer = layer_class(units)
-        if len(initial_state) == 1:
-          output = layer(inputs, initial_state=initial_state[0])
-        else:
-          output = layer(inputs, initial_state=initial_state)
-        self.assertIn(initial_state[0], layer._inbound_nodes[0].input_tensors)
-
-        model = keras.models.Model([inputs] + initial_state, output)
-        model.compile(loss='categorical_crossentropy', optimizer='adam')
-
-        inputs = np.random.random((num_samples, timesteps, input_size))
-        initial_state = [
-            np.random.random((num_samples, units)) for _ in range(num_states)
-        ]
-        targets = np.random.random((num_samples, units))
-        model.fit([inputs] + initial_state, targets)
+    input_size = 10
+    timesteps = 6
+    units = 2
+    num_samples = 32
+    num_states = 2 if layer_class is keras.layers.CuDNNLSTM else 1
+
+    inputs = keras.Input((timesteps, input_size))
+    initial_state = [keras.Input((units,)) for _ in range(num_states)]
+    layer = layer_class(units)
+    if len(initial_state) == 1:
+      output = layer(inputs, initial_state=initial_state[0])
+    else:
+      output = layer(inputs, initial_state=initial_state)
+    self.assertIn(initial_state[0], layer._inbound_nodes[0].input_tensors)
+
+    model = keras.models.Model([inputs] + initial_state, output)
+    model.compile(loss='categorical_crossentropy',
+                  optimizer=RMSprop(learning_rate=0.001),
+                  run_eagerly=testing_utils.should_run_eagerly())
+
+    inputs = np.random.random((num_samples, timesteps, input_size))
+    initial_state = [
+        np.random.random((num_samples, units)) for _ in range(num_states)
+    ]
+    targets = np.random.random((num_samples, units))
+    model.fit([inputs] + initial_state, targets)
+
+
+class CuDNNGraphOnlyTest(keras_parameterized.TestCase):
 
   @parameterized.named_parameters(
       ('cudnngru', keras.layers.CuDNNGRU),
       ('cudnnlstm', keras.layers.CuDNNLSTM),
   )
+  @test_util.run_deprecated_v1
+  @test_util.run_gpu_only
+  def test_regularizer(self, layer_class):
+    input_size = 10
+    timesteps = 6
+    units = 2
+    num_samples = 32
+    layer = layer_class(
+        units,
+        return_sequences=False,
+        input_shape=(timesteps, input_size),
+        kernel_regularizer=keras.regularizers.l1(0.01),
+        recurrent_regularizer=keras.regularizers.l1(0.01),
+        bias_regularizer='l2')
+    layer.build((None, None, input_size))
+    self.assertEqual(len(layer.losses), 3)
+
+    layer = layer_class(
+        units,
+        return_sequences=False,
+        input_shape=(timesteps, input_size),
+        activity_regularizer='l2')
+    self.assertTrue(layer.activity_regularizer)
+    x = keras.backend.variable(
+        np.ones((num_samples, timesteps, input_size)))
+    layer(x)
+    self.assertEqual(len(layer.get_losses_for(x)), 1)
+
+  @parameterized.named_parameters(
+      ('cudnngru', keras.layers.CuDNNGRU),
+      ('cudnnlstm', keras.layers.CuDNNLSTM),
+  )
+  @test_util.run_gpu_only
+  @test_util.run_v1_only('b/120941292')
   def test_statefulness(self, layer_class):
-    if test.is_gpu_available(cuda_only=True):
-      with self.session(use_gpu=True):
-        input_size = 10
-        timesteps = 6
-        units = 2
-        num_samples = 32
-
-        model = keras.models.Sequential()
-        model.add(
-            keras.layers.Embedding(
-                10,
-                input_size,
-                input_length=timesteps,
-                batch_input_shape=(num_samples, timesteps)))
-        layer = layer_class(
-            units, return_sequences=False, stateful=True, weights=None)
-        model.add(layer)
-        model.compile(optimizer='sgd', loss='mse')
-        out1 = model.predict(np.ones((num_samples, timesteps)))
-        self.assertEqual(out1.shape, (num_samples, units))
-
-        # train once so that the states change
-        model.train_on_batch(
-            np.ones((num_samples, timesteps)), np.ones((num_samples, units)))
-        out2 = model.predict(np.ones((num_samples, timesteps)))
-
-        # if the state is not reset, output should be different
-        self.assertNotEqual(out1.max(), out2.max())
-
-        # check that output changes after states are reset
-        # (even though the model itself didn't change)
-        layer.reset_states()
-        out3 = model.predict(np.ones((num_samples, timesteps)))
-        self.assertNotEqual(out2.max(), out3.max())
-
-        # check that container-level reset_states() works
-        model.reset_states()
-        out4 = model.predict(np.ones((num_samples, timesteps)))
-        self.assertAllClose(out3, out4, atol=1e-5)
-
-        # check that the call to `predict` updated the states
-        out5 = model.predict(np.ones((num_samples, timesteps)))
-        self.assertNotEqual(out4.max(), out5.max())
+    input_size = 10
+    timesteps = 6
+    units = 2
+    num_samples = 32
+
+    with self.cached_session(use_gpu=True):
+      model = keras.models.Sequential()
+      model.add(
+          keras.layers.Embedding(
+              10,
+              input_size,
+              input_length=timesteps,
+              batch_input_shape=(num_samples, timesteps)))
+      layer = layer_class(
+          units, return_sequences=False, stateful=True, weights=None)
+      model.add(layer)
+      model.compile(optimizer=gradient_descent.GradientDescentOptimizer(0.01),
+                    loss='mse')
+      out1 = model.predict(np.ones((num_samples, timesteps)))
+      self.assertEqual(out1.shape, (num_samples, units))
+
+      # train once so that the states change
+      model.train_on_batch(
+          np.ones((num_samples, timesteps)), np.ones((num_samples, units)))
+      out2 = model.predict(np.ones((num_samples, timesteps)))
+
+      # if the state is not reset, output should be different
+      self.assertNotEqual(out1.max(), out2.max())
+
+      # check that output changes after states are reset
+      # (even though the model itself didn't change)
+      layer.reset_states()
+      out3 = model.predict(np.ones((num_samples, timesteps)))
+      self.assertNotEqual(out2.max(), out3.max())
+
+      # check that container-level reset_states() works
+      model.reset_states()
+      out4 = model.predict(np.ones((num_samples, timesteps)))
+      self.assertAllClose(out3, out4, atol=1e-5)
+
+      # check that the call to `predict` updated the states
+      out5 = model.predict(np.ones((num_samples, timesteps)))
+      self.assertNotEqual(out4.max(), out5.max())
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class CuDNNV1OnlyTest(keras_parameterized.TestCase):
+
+  @test_util.run_gpu_only
+  def test_trainability(self):
+    input_size = 10
+    units = 2
+    for layer_class in [keras.layers.CuDNNGRU, keras.layers.CuDNNLSTM]:
+      layer = layer_class(units)
+      layer.build((None, None, input_size))
+      self.assertEqual(len(layer.weights), 3)
+      self.assertEqual(len(layer.trainable_weights), 3)
+      self.assertEqual(len(layer.non_trainable_weights), 0)
+      layer.trainable = False
+      self.assertEqual(len(layer.weights), 3)
+      self.assertEqual(len(layer.non_trainable_weights), 3)
+      self.assertEqual(len(layer.trainable_weights), 0)
+      layer.trainable = True
+      self.assertEqual(len(layer.weights), 3)
+      self.assertEqual(len(layer.trainable_weights), 3)
+      self.assertEqual(len(layer.non_trainable_weights), 0)
 
   @parameterized.named_parameters(
       *test_util.generate_combinations_with_testcase_name(
           rnn_type=['LSTM', 'GRU'], to_cudnn=[True, False],
           bidirectional=[True, False], implementation=[1, 2],
           model_nest_level=[1, 2], model_type=['seq', 'func']))
+  @test_util.run_v1_only('b/120911602, b/112083752')
+  @test_util.run_gpu_only
   def test_load_weights_between_noncudnn_rnn(self, rnn_type, to_cudnn,
                                              bidirectional, implementation,
                                              model_nest_level, model_type):
-    if test.is_gpu_available(cuda_only=True):
-      with self.session(use_gpu=True):
-        input_size = 10
-        timesteps = 6
-        input_shape = (timesteps, input_size)
-        units = 2
-        num_samples = 32
-        inputs = np.random.random((num_samples, timesteps, input_size))
-
-        rnn_layer_kwargs = {
-            'recurrent_activation': 'sigmoid',
-            # ensure biases are non-zero and properly converted
-            'bias_initializer': 'random_uniform',
-            'implementation': implementation
-        }
-        if rnn_type == 'LSTM':
-          rnn_layer_class = keras.layers.LSTM
-          cudnn_rnn_layer_class = keras.layers.CuDNNLSTM
-        else:
-          rnn_layer_class = keras.layers.GRU
-          cudnn_rnn_layer_class = keras.layers.CuDNNGRU
-          rnn_layer_kwargs['reset_after'] = True
-
-        layer = rnn_layer_class(units, **rnn_layer_kwargs)
-        if bidirectional:
-          layer = keras.layers.Bidirectional(layer)
-
-        cudnn_layer = cudnn_rnn_layer_class(units)
-        if bidirectional:
-          cudnn_layer = keras.layers.Bidirectional(cudnn_layer)
-
-        model = self._make_nested_model(input_shape, layer, model_nest_level,
-                                        model_type)
-        cudnn_model = self._make_nested_model(input_shape, cudnn_layer,
-                                              model_nest_level, model_type)
-
-        if to_cudnn:
-          self._convert_model_weights(model, cudnn_model)
-        else:
-          self._convert_model_weights(cudnn_model, model)
-
-        self.assertAllClose(model.predict(inputs), cudnn_model.predict(inputs),
-                            atol=1e-4)
+    input_size = 10
+    timesteps = 6
+    input_shape = (timesteps, input_size)
+    units = 2
+    num_samples = 32
+    inputs = np.random.random((num_samples, timesteps, input_size))
+
+    rnn_layer_kwargs = {
+        'recurrent_activation': 'sigmoid',
+        # ensure biases are non-zero and properly converted
+        'bias_initializer': 'random_uniform',
+        'implementation': implementation
+    }
+    if rnn_type == 'LSTM':
+      rnn_layer_class = keras.layers.LSTM
+      cudnn_rnn_layer_class = keras.layers.CuDNNLSTM
+    else:
+      rnn_layer_class = keras.layers.GRU
+      cudnn_rnn_layer_class = keras.layers.CuDNNGRU
+      rnn_layer_kwargs['reset_after'] = True
+
+    layer = rnn_layer_class(units, **rnn_layer_kwargs)
+    if bidirectional:
+      layer = keras.layers.Bidirectional(layer)
+
+    cudnn_layer = cudnn_rnn_layer_class(units)
+    if bidirectional:
+      cudnn_layer = keras.layers.Bidirectional(cudnn_layer)
+
+    model = self._make_nested_model(input_shape, layer, model_nest_level,
+                                    model_type)
+    cudnn_model = self._make_nested_model(input_shape, cudnn_layer,
+                                          model_nest_level, model_type)
+
+    if to_cudnn:
+      self._convert_model_weights(model, cudnn_model)
+    else:
+      self._convert_model_weights(cudnn_model, model)
+
+    self.assertAllClose(model.predict(inputs), cudnn_model.predict(inputs),
+                        atol=1e-4)
 
   def _make_nested_model(self, input_shape, layer, level=1, model_type='func'):
     # example: make_nested_seq_model((1,), Dense(10), level=2).summary()
@@ -330,153 +345,146 @@ class CuDNNTest(test.TestCase, parameterized.TestCase):
   @parameterized.named_parameters(
       *test_util.generate_combinations_with_testcase_name(
           rnn_type=['LSTM', 'GRU'], to_cudnn=[True, False]))
+  @test_util.run_v1_only('b/120911602')
+  @test_util.run_gpu_only
   def test_load_weights_between_noncudnn_rnn_time_distributed(self, rnn_type,
                                                               to_cudnn):
     # Similar test as test_load_weights_between_noncudnn_rnn() but has different
     # rank of input due to usage of TimeDistributed. Issue: #10356.
-    if test.is_gpu_available(cuda_only=True):
-      with self.session(use_gpu=True):
-        input_size = 10
-        steps = 6
-        timesteps = 6
-        input_shape = (timesteps, steps, input_size)
-        units = 2
-        num_samples = 32
-        inputs = np.random.random((num_samples, timesteps, steps, input_size))
-
-        rnn_layer_kwargs = {
-            'recurrent_activation': 'sigmoid',
-            # ensure biases are non-zero and properly converted
-            'bias_initializer': 'random_uniform',
-        }
-        if rnn_type == 'LSTM':
-          rnn_layer_class = keras.layers.LSTM
-          cudnn_rnn_layer_class = keras.layers.CuDNNLSTM
-        else:
-          rnn_layer_class = keras.layers.GRU
-          cudnn_rnn_layer_class = keras.layers.CuDNNGRU
-          rnn_layer_kwargs['reset_after'] = True
-
-        layer = rnn_layer_class(units, **rnn_layer_kwargs)
-        layer = keras.layers.TimeDistributed(layer)
-
-        cudnn_layer = cudnn_rnn_layer_class(units)
-        cudnn_layer = keras.layers.TimeDistributed(cudnn_layer)
-
-        model = self._make_nested_model(input_shape, layer)
-        cudnn_model = self._make_nested_model(input_shape, cudnn_layer)
-
-        if to_cudnn:
-          self._convert_model_weights(model, cudnn_model)
-        else:
-          self._convert_model_weights(cudnn_model, model)
-
-        self.assertAllClose(model.predict(inputs), cudnn_model.predict(inputs),
-                            atol=1e-4)
-
-  @test_util.run_in_graph_and_eager_modes
+    input_size = 10
+    steps = 6
+    timesteps = 6
+    input_shape = (timesteps, steps, input_size)
+    units = 2
+    num_samples = 32
+    inputs = np.random.random((num_samples, timesteps, steps, input_size))
+
+    rnn_layer_kwargs = {
+        'recurrent_activation': 'sigmoid',
+        # ensure biases are non-zero and properly converted
+        'bias_initializer': 'random_uniform',
+    }
+    if rnn_type == 'LSTM':
+      rnn_layer_class = keras.layers.LSTM
+      cudnn_rnn_layer_class = keras.layers.CuDNNLSTM
+    else:
+      rnn_layer_class = keras.layers.GRU
+      cudnn_rnn_layer_class = keras.layers.CuDNNGRU
+      rnn_layer_kwargs['reset_after'] = True
+
+    layer = rnn_layer_class(units, **rnn_layer_kwargs)
+    layer = keras.layers.TimeDistributed(layer)
+
+    cudnn_layer = cudnn_rnn_layer_class(units)
+    cudnn_layer = keras.layers.TimeDistributed(cudnn_layer)
+
+    model = self._make_nested_model(input_shape, layer)
+    cudnn_model = self._make_nested_model(input_shape, cudnn_layer)
+
+    if to_cudnn:
+      self._convert_model_weights(model, cudnn_model)
+    else:
+      self._convert_model_weights(cudnn_model, model)
+
+    self.assertAllClose(model.predict(inputs), cudnn_model.predict(inputs),
+                        atol=1e-4)
+
+  @test_util.run_gpu_only
   def test_cudnnrnn_bidirectional(self):
-    if test.is_gpu_available(cuda_only=True):
-      with self.session(use_gpu=True):
-        rnn = keras.layers.CuDNNGRU
-        samples = 2
-        dim = 2
-        timesteps = 2
-        output_dim = 2
-        mode = 'concat'
-
-        x = np.random.random((samples, timesteps, dim))
-        target_dim = 2 * output_dim if mode == 'concat' else output_dim
-        y = np.random.random((samples, target_dim))
-
-        # test with Sequential model
-        model = keras.Sequential()
-        model.add(
-            keras.layers.Bidirectional(
-                rnn(output_dim), merge_mode=mode, input_shape=(None, dim)))
-        model.compile(
-            loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
-        model.fit(x, y, epochs=1, batch_size=1)
-
-        # test config
-        model.get_config()
-        model = keras.models.model_from_json(model.to_json())
-        model.summary()
-
-        # test stacked bidirectional layers
-        model = keras.Sequential()
-        model.add(
-            keras.layers.Bidirectional(
-                rnn(output_dim, return_sequences=True),
-                merge_mode=mode,
-                input_shape=(None, dim)))
-        model.add(keras.layers.Bidirectional(rnn(output_dim), merge_mode=mode))
-        model.compile(
-            loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
-        model.fit(x, y, epochs=1, batch_size=1)
-
-        # test with functional API
-        inputs = keras.Input((timesteps, dim))
-        outputs = keras.layers.Bidirectional(
-            rnn(output_dim), merge_mode=mode)(
-                inputs)
-        model = keras.Model(inputs, outputs)
-        model.compile(
-            loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
-        model.fit(x, y, epochs=1, batch_size=1)
-
-        # Bidirectional and stateful
-        inputs = keras.Input(batch_shape=(1, timesteps, dim))
-        outputs = keras.layers.Bidirectional(
-            rnn(output_dim, stateful=True), merge_mode=mode)(
-                inputs)
-        model = keras.Model(inputs, outputs)
-        model.compile(
-            loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
-        model.fit(x, y, epochs=1, batch_size=1)
-
+    rnn = keras.layers.CuDNNGRU
+    samples = 2
+    dim = 2
+    timesteps = 2
+    output_dim = 2
+    mode = 'concat'
+
+    x = np.random.random((samples, timesteps, dim))
+    target_dim = 2 * output_dim if mode == 'concat' else output_dim
+    y = np.random.random((samples, target_dim))
+
+    # test with Sequential model
+    model = keras.Sequential()
+    model.add(
+        keras.layers.Bidirectional(
+            rnn(output_dim), merge_mode=mode, input_shape=(None, dim)))
+    model.compile(loss='mse', optimizer='rmsprop')
+    model.fit(x, y, epochs=1, batch_size=1)
+
+    # test config
+    model.get_config()
+    model = keras.models.model_from_json(model.to_json())
+    model.summary()
+
+    # test stacked bidirectional layers
+    model = keras.Sequential()
+    model.add(
+        keras.layers.Bidirectional(
+            rnn(output_dim, return_sequences=True),
+            merge_mode=mode,
+            input_shape=(None, dim)))
+    model.add(keras.layers.Bidirectional(rnn(output_dim), merge_mode=mode))
+    model.compile(loss='mse', optimizer=R'rmsprop')
+    model.fit(x, y, epochs=1, batch_size=1)
+
+    # test with functional API
+    inputs = keras.Input((timesteps, dim))
+    outputs = keras.layers.Bidirectional(
+        rnn(output_dim), merge_mode=mode)(
+            inputs)
+    model = keras.Model(inputs, outputs)
+    model.compile(loss='mse', optimizer=R'rmsprop')
+    model.fit(x, y, epochs=1, batch_size=1)
+
+    # Bidirectional and stateful
+    inputs = keras.Input(batch_shape=(1, timesteps, dim))
+    outputs = keras.layers.Bidirectional(
+        rnn(output_dim, stateful=True), merge_mode=mode)(
+            inputs)
+    model = keras.Model(inputs, outputs)
+    model.compile(loss='mse', optimizer='rmsprop')
+    model.fit(x, y, epochs=1, batch_size=1)
+
+  @test_util.run_gpu_only
   def test_preprocess_weights_for_loading_gru_incompatible(self):
     """Test loading weights between incompatible layers.
 
     Should fail fast with an exception.
     """
-    if test.is_gpu_available(cuda_only=True):
-      with self.session(use_gpu=True):
-        input_shape = (3, 5)
-
-        def gru(cudnn=False, **kwargs):
-          layer_class = keras.layers.CuDNNGRU if cudnn else keras.layers.GRU
-          return layer_class(2, input_shape=input_shape, **kwargs)
-
-        def get_layer_weights(layer):
-          layer.build(input_shape=input_shape)
-          return layer.get_weights()
-
-        def assert_not_compatible(src, dest, message):
-          with self.assertRaises(ValueError) as ex:
-            keras.engine.saving.preprocess_weights_for_loading(
-                dest,
-                get_layer_weights(src))
-          self.assertIn(message, str(ex.exception))
-
-        assert_not_compatible(
-            gru(),
-            gru(cudnn=True),
-            'GRU(reset_after=False) is not compatible with CuDNNGRU')
-        assert_not_compatible(
-            gru(cudnn=True),
-            gru(),
-            'CuDNNGRU is not compatible with GRU(reset_after=False)')
-        assert_not_compatible(
-            gru(),
-            gru(reset_after=True),
-            'GRU(reset_after=False) is not compatible with '
-            'GRU(reset_after=True)')
-        assert_not_compatible(
-            gru(reset_after=True),
-            gru(),
-            'GRU(reset_after=True) is not compatible with '
-            'GRU(reset_after=False)')
+    input_shape = (3, 5)
+
+    def gru(cudnn=False, **kwargs):
+      layer_class = keras.layers.CuDNNGRU if cudnn else keras.layers.GRU
+      return layer_class(2, input_shape=input_shape, **kwargs)
+
+    def get_layer_weights(layer):
+      layer.build(input_shape=input_shape)
+      return layer.get_weights()
+
+    def assert_not_compatible(src, dest, message):
+      with self.assertRaises(ValueError) as ex:
+        keras.saving.preprocess_weights_for_loading(
+            dest,
+            get_layer_weights(src))
+      self.assertIn(message, str(ex.exception))
+
+    assert_not_compatible(
+        gru(),
+        gru(cudnn=True),
+        'GRU(reset_after=False) is not compatible with CuDNNGRU')
+    assert_not_compatible(
+        gru(cudnn=True),
+        gru(),
+        'CuDNNGRU is not compatible with GRU(reset_after=False)')
+    assert_not_compatible(
+        gru(),
+        gru(reset_after=True),
+        'GRU(reset_after=False) is not compatible with '
+        'GRU(reset_after=True)')
+    assert_not_compatible(
+        gru(reset_after=True),
+        gru(),
+        'GRU(reset_after=True) is not compatible with '
+        'GRU(reset_after=False)')
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/layers/embeddings.py b/tensorflow/python/keras/layers/embeddings.py
index e8a8575705ab5c412ae4a793faaa89ef8918130c..df5e82c2459b4c1beb1c5b74a7048be022144535 100644
--- a/tensorflow/python/keras/layers/embeddings.py
+++ b/tensorflow/python/keras/layers/embeddings.py
@@ -28,10 +28,10 @@ from tensorflow.python.keras.engine.base_layer import Layer
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 
-@tf_export('keras.layers.Embedding')
+@keras_export('keras.layers.Embedding')
 class Embedding(Layer):
   """Turns positive integers (indexes) into dense vectors of fixed size.
 
diff --git a/tensorflow/python/keras/layers/embeddings_test.py b/tensorflow/python/keras/layers/embeddings_test.py
index aaa17b7e96078dea9b84e0f0e62a4bdcbe071fa0..ac3acad7accb2a9d9d8858af973b61023dcfbc22 100644
--- a/tensorflow/python/keras/layers/embeddings_test.py
+++ b/tensorflow/python/keras/layers/embeddings_test.py
@@ -23,15 +23,19 @@ import numpy as np
 from tensorflow.python import keras
 from tensorflow.python.eager import backprop
 from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 from tensorflow.python.training import adagrad
 
 
-class EmbeddingTest(test.TestCase):
+class EmbeddingTest(keras_parameterized.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes(use_gpu=False)
+  @keras_parameterized.run_all_keras_modes
   def test_embedding(self):
+    if tf_test_util.is_gpu_available():
+      self.skipTest('Only test embedding on CPU.')
+
     testing_utils.layer_test(
         keras.layers.Embedding,
         kwargs={'output_dim': 4,
@@ -69,18 +73,17 @@ class EmbeddingTest(test.TestCase):
         input_dtype='int32',
         expected_output_dtype='float32')
 
-  @tf_test_util.run_in_graph_and_eager_modes()
+  @keras_parameterized.run_all_keras_modes
   def test_embedding_correctness(self):
     layer = keras.layers.Embedding(output_dim=2, input_dim=2)
-    layer.build((None, 2))
-    matrix = np.array([[1, 1], [2, 2]])
-    layer.set_weights([matrix])
+    model = keras.models.Sequential([layer])
 
-    inputs = keras.backend.constant([[0, 1, 0]], dtype='int32')
-    outputs = keras.backend.eval(layer(inputs))
+    layer.set_weights([np.array([[1, 1], [2, 2]])])
+    model.run_eagerly = testing_utils.should_run_eagerly()
+    outputs = model.predict(np.array([[0, 1, 0]], dtype='int32'))
     self.assertAllClose(outputs, [[[1, 1], [2, 2], [1, 1]]])
 
-  @tf_test_util.run_in_graph_and_eager_modes()
+  @tf_test_util.run_in_graph_and_eager_modes
   def test_eager_gpu_cpu(self):
     l = keras.layers.Embedding(output_dim=2, input_dim=2)
     l.build((None, 2))
diff --git a/tensorflow/python/keras/layers/gru_test.py b/tensorflow/python/keras/layers/gru_test.py
index 9988c9fae5808a5cad47464addbb3f5e33953e66..91183a4d732fb87e9e5868c9996c74a5ed5b0932 100644
--- a/tensorflow/python/keras/layers/gru_test.py
+++ b/tensorflow/python/keras/layers/gru_test.py
@@ -18,18 +18,20 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python.eager import context
 from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
-from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
 
-class GRULayerTest(test.TestCase):
+@keras_parameterized.run_all_keras_modes
+class GRULayerTest(keras_parameterized.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_return_sequences_GRU(self):
     num_samples = 2
     timesteps = 3
@@ -41,7 +43,6 @@ class GRULayerTest(test.TestCase):
                 'return_sequences': True},
         input_shape=(num_samples, timesteps, embedding_dim))
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_dynamic_behavior_GRU(self):
     num_samples = 2
     timesteps = 3
@@ -50,12 +51,12 @@ class GRULayerTest(test.TestCase):
     layer = keras.layers.GRU(units, input_shape=(None, embedding_dim))
     model = keras.models.Sequential()
     model.add(layer)
-    model.compile(RMSPropOptimizer(0.01), 'mse')
+    model.compile(
+        'rmsprop', 'mse', run_eagerly=testing_utils.should_run_eagerly())
     x = np.random.random((num_samples, timesteps, embedding_dim))
     y = np.random.random((num_samples, units))
     model.train_on_batch(x, y)
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_dropout_GRU(self):
     num_samples = 2
     timesteps = 3
@@ -68,134 +69,140 @@ class GRULayerTest(test.TestCase):
                 'recurrent_dropout': 0.1},
         input_shape=(num_samples, timesteps, embedding_dim))
 
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_implementation_mode_GRU(self):
+  @parameterized.parameters([0, 1, 2])
+  def test_implementation_mode_GRU(self, implementation_mode):
     num_samples = 2
     timesteps = 3
     embedding_dim = 4
     units = 2
-    for mode in [0, 1, 2]:
-      testing_utils.layer_test(
-          keras.layers.GRU,
-          kwargs={'units': units,
-                  'implementation': mode},
-          input_shape=(num_samples, timesteps, embedding_dim))
+    testing_utils.layer_test(
+        keras.layers.GRU,
+        kwargs={'units': units,
+                'implementation': implementation_mode},
+        input_shape=(num_samples, timesteps, embedding_dim))
 
-  def test_statefulness_GRU(self):
+  def test_reset_after_GRU(self):
     num_samples = 2
     timesteps = 3
     embedding_dim = 4
     units = 2
+
+    (x_train, y_train), _ = testing_utils.get_test_data(
+        train_samples=num_samples,
+        test_samples=0,
+        input_shape=(timesteps, embedding_dim),
+        num_classes=units)
+    y_train = keras.utils.to_categorical(y_train, units)
+
+    inputs = keras.layers.Input(shape=[timesteps, embedding_dim])
+    gru_layer = keras.layers.GRU(units,
+                                 reset_after=True)
+    output = gru_layer(inputs)
+    gru_model = keras.models.Model(inputs, output)
+    gru_model.compile(
+        'rmsprop', 'mse', run_eagerly=testing_utils.should_run_eagerly())
+    gru_model.fit(x_train, y_train)
+    gru_model.predict(x_train)
+
+  def test_with_masking_layer_GRU(self):
     layer_class = keras.layers.GRU
-    with self.cached_session():
-      model = keras.models.Sequential()
-      model.add(
-          keras.layers.Embedding(
-              4,
-              embedding_dim,
-              mask_zero=True,
-              input_length=timesteps,
-              batch_input_shape=(num_samples, timesteps)))
-      layer = layer_class(
-          units, return_sequences=False, stateful=True, weights=None)
-      model.add(layer)
-      model.compile(optimizer='sgd', loss='mse')
-      out1 = model.predict(np.ones((num_samples, timesteps)))
-      self.assertEqual(out1.shape, (num_samples, units))
-
-      # train once so that the states change
-      model.train_on_batch(
-          np.ones((num_samples, timesteps)), np.ones((num_samples, units)))
-      out2 = model.predict(np.ones((num_samples, timesteps)))
-
-      # if the state is not reset, output should be different
-      self.assertNotEqual(out1.max(), out2.max())
-
-      # check that output changes after states are reset
-      # (even though the model itself didn't change)
-      layer.reset_states()
-      out3 = model.predict(np.ones((num_samples, timesteps)))
-      self.assertNotEqual(out2.max(), out3.max())
-
-      # check that container-level reset_states() works
-      model.reset_states()
-      out4 = model.predict(np.ones((num_samples, timesteps)))
-      np.testing.assert_allclose(out3, out4, atol=1e-5)
-
-      # check that the call to `predict` updated the states
-      out5 = model.predict(np.ones((num_samples, timesteps)))
-      self.assertNotEqual(out4.max(), out5.max())
-
-      # Check masking
-      layer.reset_states()
-
-      left_padded_input = np.ones((num_samples, timesteps))
-      left_padded_input[0, :1] = 0
-      left_padded_input[1, :2] = 0
-      out6 = model.predict(left_padded_input)
-
-      layer.reset_states()
-
-      right_padded_input = np.ones((num_samples, timesteps))
-      right_padded_input[0, -1:] = 0
-      right_padded_input[1, -2:] = 0
-      out7 = model.predict(right_padded_input)
-
-      np.testing.assert_allclose(out7, out6, atol=1e-5)
+    inputs = np.random.random((2, 3, 4))
+    targets = np.abs(np.random.random((2, 3, 5)))
+    targets /= targets.sum(axis=-1, keepdims=True)
+    model = keras.models.Sequential()
+    model.add(keras.layers.Masking(input_shape=(3, 4)))
+    model.add(layer_class(units=5, return_sequences=True, unroll=False))
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer='rmsprop',
+        run_eagerly=testing_utils.should_run_eagerly())
+    model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
 
-  def test_regularizers_GRU(self):
+  def test_statefulness_GRU(self):
+    num_samples = 2
+    timesteps = 3
     embedding_dim = 4
+    units = 2
     layer_class = keras.layers.GRU
-    with self.cached_session():
-      layer = layer_class(
-          5,
-          return_sequences=False,
-          weights=None,
-          input_shape=(None, embedding_dim),
-          kernel_regularizer=keras.regularizers.l1(0.01),
-          recurrent_regularizer=keras.regularizers.l1(0.01),
-          bias_regularizer='l2',
-          activity_regularizer='l1')
-      layer.build((None, None, 2))
-      self.assertEqual(len(layer.losses), 3)
-
-      x = keras.backend.variable(np.ones((2, 3, 2)))
-      layer(x)
-      self.assertEqual(len(layer.get_losses_for(x)), 1)
+
+    model = keras.models.Sequential()
+    model.add(
+        keras.layers.Embedding(
+            4,
+            embedding_dim,
+            mask_zero=True,
+            input_length=timesteps,
+            batch_input_shape=(num_samples, timesteps)))
+    layer = layer_class(
+        units, return_sequences=False, stateful=True, weights=None)
+    model.add(layer)
+    model.compile(optimizer='sgd', loss='mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
+    out1 = model.predict(np.ones((num_samples, timesteps)))
+    self.assertEqual(out1.shape, (num_samples, units))
+
+    # train once so that the states change
+    model.train_on_batch(
+        np.ones((num_samples, timesteps)), np.ones((num_samples, units)))
+    out2 = model.predict(np.ones((num_samples, timesteps)))
+
+    # if the state is not reset, output should be different
+    self.assertNotEqual(out1.max(), out2.max())
+
+    # check that output changes after states are reset
+    # (even though the model itself didn't change)
+    layer.reset_states()
+    out3 = model.predict(np.ones((num_samples, timesteps)))
+    self.assertNotEqual(out2.max(), out3.max())
+
+    # check that container-level reset_states() works
+    model.reset_states()
+    out4 = model.predict(np.ones((num_samples, timesteps)))
+    np.testing.assert_allclose(out3, out4, atol=1e-5)
+
+    # check that the call to `predict` updated the states
+    out5 = model.predict(np.ones((num_samples, timesteps)))
+    self.assertNotEqual(out4.max(), out5.max())
+
+    # Check masking
+    layer.reset_states()
+
+    left_padded_input = np.ones((num_samples, timesteps))
+    left_padded_input[0, :1] = 0
+    left_padded_input[1, :2] = 0
+    out6 = model.predict(left_padded_input)
+
+    layer.reset_states()
+
+    right_padded_input = np.ones((num_samples, timesteps))
+    right_padded_input[0, -1:] = 0
+    right_padded_input[1, -2:] = 0
+    out7 = model.predict(right_padded_input)
+
+    np.testing.assert_allclose(out7, out6, atol=1e-5)
+
+
+@tf_test_util.run_all_in_graph_and_eager_modes
+class GRULayerGenericTest(test.TestCase):
 
   def test_constraints_GRU(self):
     embedding_dim = 4
     layer_class = keras.layers.GRU
-    with self.cached_session():
-      k_constraint = keras.constraints.max_norm(0.01)
-      r_constraint = keras.constraints.max_norm(0.01)
-      b_constraint = keras.constraints.max_norm(0.01)
-      layer = layer_class(
-          5,
-          return_sequences=False,
-          weights=None,
-          input_shape=(None, embedding_dim),
-          kernel_constraint=k_constraint,
-          recurrent_constraint=r_constraint,
-          bias_constraint=b_constraint)
-      layer.build((None, None, embedding_dim))
-      self.assertEqual(layer.cell.kernel.constraint, k_constraint)
-      self.assertEqual(layer.cell.recurrent_kernel.constraint, r_constraint)
-      self.assertEqual(layer.cell.bias.constraint, b_constraint)
-
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_with_masking_layer_GRU(self):
-    layer_class = keras.layers.GRU
-    with self.cached_session():
-      inputs = np.random.random((2, 3, 4))
-      targets = np.abs(np.random.random((2, 3, 5)))
-      targets /= targets.sum(axis=-1, keepdims=True)
-      model = keras.models.Sequential()
-      model.add(keras.layers.Masking(input_shape=(3, 4)))
-      model.add(layer_class(units=5, return_sequences=True, unroll=False))
-      model.compile(loss='categorical_crossentropy',
-                    optimizer=RMSPropOptimizer(0.01))
-      model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
+    k_constraint = keras.constraints.max_norm(0.01)
+    r_constraint = keras.constraints.max_norm(0.01)
+    b_constraint = keras.constraints.max_norm(0.01)
+    layer = layer_class(
+        5,
+        return_sequences=False,
+        weights=None,
+        input_shape=(None, embedding_dim),
+        kernel_constraint=k_constraint,
+        recurrent_constraint=r_constraint,
+        bias_constraint=b_constraint)
+    layer.build((None, None, embedding_dim))
+    self.assertEqual(layer.cell.kernel.constraint, k_constraint)
+    self.assertEqual(layer.cell.recurrent_kernel.constraint, r_constraint)
+    self.assertEqual(layer.cell.bias.constraint, b_constraint)
 
   def test_from_config_GRU(self):
     layer_class = keras.layers.GRU
@@ -204,6 +211,28 @@ class GRULayerTest(test.TestCase):
       l2 = layer_class.from_config(l1.get_config())
       assert l1.get_config() == l2.get_config()
 
+  def test_regularizers_GRU(self):
+    embedding_dim = 4
+    layer_class = keras.layers.GRU
+    layer = layer_class(
+        5,
+        return_sequences=False,
+        weights=None,
+        input_shape=(None, embedding_dim),
+        kernel_regularizer=keras.regularizers.l1(0.01),
+        recurrent_regularizer=keras.regularizers.l1(0.01),
+        bias_regularizer='l2',
+        activity_regularizer='l1')
+    layer.build((None, None, 2))
+    self.assertEqual(len(layer.losses), 3)
+
+    x = keras.backend.variable(np.ones((2, 3, 2)))
+    layer(x)
+    if context.executing_eagerly():
+      self.assertEqual(len(layer.losses), 4)
+    else:
+      self.assertEqual(len(layer.get_losses_for(x)), 1)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/layers/kernelized.py b/tensorflow/python/keras/layers/kernelized.py
new file mode 100644
index 0000000000000000000000000000000000000000..9753fc66de9ad98b831b225974db180e6f5737d1
--- /dev/null
+++ b/tensorflow/python/keras/layers/kernelized.py
@@ -0,0 +1,258 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras layers that implement explicit (approximate) kernel feature maps."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import six
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras import initializers
+from tensorflow.python.keras.engine import base_layer
+from tensorflow.python.keras.engine import input_spec
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import nn
+
+_SUPPORTED_RBF_KERNEL_TYPES = ['gaussian', 'laplacian']
+
+
+class RandomFourierFeatures(base_layer.Layer):
+  r"""Layer that maps its inputs using random Fourier features.
+
+  This layer implements a feature map \\(\phi: \mathbb{R}^d \rightarrow
+  \mathbb{R}^D\\) which approximates shift-invariant kernels. A kernel function
+  K(x, y) defined over \\(\mathbb{R}^d x \mathbb{R}^d\\) is shift-invariant if
+  K(x, y) = k(x-y) for some function defined over \\(\mathbb{R}^d\\). Many
+  popular Radial Basis Functions (in short RBF), including gaussian and
+  laplacian kernels are shift-invariant.
+
+  The layer approximates a (shift invariant) kernel K in the following sense:
+    up to a scaling factor, for all inputs \\(x, y \in \mathbb{R}^d\\)
+        \\(\phi(x)^T \cdot \phi(y) \approx K(x, y)\\)
+
+  The implementation of this layer is based on the following paper:
+  "Random Features for Large-Scale Kernel Machines" by Ali Rahimi and Ben Recht.
+  (link: https://people.eecs.berkeley.edu/~brecht/papers/07.rah.rec.nips.pdf)
+
+  The distribution from which the parameters of the random features map (layer)
+  are sampled, determines which shift-invariant kernel the layer approximates
+  (see paper for more details). The users can use the distribution of their
+  choice. Due to their popularity, the layer supports the out-of-the-box
+  approximation of the following RBF kernels:
+  - Gaussian: \\(K(x, y) = e^{-\frac{\|x-y\|_2^2}{2 \cdot scale^2}}\\)
+  - Laplacian: \\(K(x, y) = e^{-\frac{\|x-y\|_1}{scale}}\\)
+
+  NOTE: Unlike the map described in the paper and the scikit-learn
+  implementation, the output of this layer does not apply the sqrt(2/D)
+  normalization factor.
+
+  Usage for ML: Typically, this layer is used to "kernelize" linear models by
+  applying a non-linear transformation (this layer) to the input features and
+  then training a linear model on top of the transformed features. Depending on
+  the loss function of the linear model, the composition of this layer and the
+  linear model results to models that are equivalent (up to approximation) to
+  kernel SVMs (for hinge loss), kernel logistic regression (for logistic loss),
+  kernel linear regression (for squared loss) etc.
+
+  Example of building a kernel multinomial logistic regression model with
+  Gaussian kernel in keras:
+  ```python
+  random_features_layer = RandomFourierFeatures(
+      output_dim=500,
+      kernel_initializer='gaussian',
+      scale=5.0,
+      ...)
+
+  model = tf.keras.models.Sequential()
+  model.add(random_features_layer)
+  model.add(tf.keras.layers.Dense(units=num_classes, activation='softmax')
+
+  model.compile(elif isinstance(identifier, six.string_types):
+    loss=tf.keras.losses.categorical_crossentropy, optimizer=..., metrics=...)
+  ```
+
+  To use another kernel, replace the layer creation command with:
+  ```python
+  random_features_layer = RandomFourierFeatures(
+      output_dim=500,
+      kernel_initializer=<my_initializer>,
+      scale=...,
+      ...)
+  ```
+
+  Arguments:
+    output_dim: Positive integer, the dimension of the layer's output, i.e., the
+      number of random features used to approximate the kernel.
+    kernel_initializer: Determines the distribution of the parameters of the
+      random features map (and therefore the kernel approximated by the layer).
+      It can be either a string or an instance of TensorFlow's Initializer
+      class. Currently only 'gaussian' and 'laplacian' are supported as string
+      initializers (case insensitive). Note that these parameters are not
+      trainable.
+    scale: For gaussian and laplacian kernels, this corresponds to a scaling
+      factor of the corresponding kernel approximated by the layer (see concrete
+      definitions above). When provided, it should be a positive float. If None,
+      the implementation chooses a default value (1.0 typically). Both the
+      approximation error of the kernel and the classification quality are
+      sensitive to this parameter. If trainable is set to True, this paramater
+      is learned end-to-end during training and the provided value serves as an
+      initialization value.
+      NOTE: When this layer is used to map the initial features and then the
+        transformed features are fed to a linear model, by making `scale`
+        trainable, the resulting optimization problem is no longer convex (even
+        if the loss function used by the linear model is convex).
+    trainable: Whether the scaling parameter of th layer is trainable. Defaults
+      to False.
+    name: name for the RandomFourierFeatures layer.
+
+  Raises:
+    ValueError: if output_dim or stddev are not positive or if the provided
+      kernel_initializer is not supported.
+  """
+
+  def __init__(self,
+               output_dim,
+               kernel_initializer='gaussian',
+               scale=None,
+               trainable=False,
+               name=None,
+               **kwargs):
+    if output_dim <= 0:
+      raise ValueError(
+          '`output_dim` should be a positive integer. Given: {}.'.format(
+              output_dim))
+    if isinstance(kernel_initializer, six.string_types):
+      if kernel_initializer.lower() not in _SUPPORTED_RBF_KERNEL_TYPES:
+        raise ValueError(
+            'Unsupported kernel type: \'{}\'. Supported kernel types: {}.'
+            .format(kernel_initializer, _SUPPORTED_RBF_KERNEL_TYPES))
+    if scale is not None and scale <= 0.0:
+      raise ValueError('When provided, `scale` should be a positive float. '
+                       'Given: {}.'.format(scale))
+    super(RandomFourierFeatures, self).__init__(
+        trainable=trainable, name=name, **kwargs)
+    self.output_dim = output_dim
+    self.kernel_initializer = kernel_initializer
+    self.scale = scale
+
+  def build(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    # TODO(sibyl-vie3Poto): Allow higher dimension inputs. Currently the input is expected
+    # to have shape [batch_size, dimension].
+    if input_shape.rank != 2:
+      raise ValueError(
+          'The rank of the input tensor should be 2. Got {} instead.'.format(
+              input_shape.ndims))
+    if input_shape.dims[1].value is None:
+      raise ValueError(
+          'The last dimension of the inputs to `RandomFourierFeatures` '
+          'should be defined. Found `None`.')
+    self.input_spec = input_spec.InputSpec(
+        ndim=2, axes={1: input_shape.dims[1].value})
+    input_dim = input_shape.dims[1].value
+
+    kernel_initializer = _get_random_features_initializer(
+        self.kernel_initializer, shape=(input_dim, self.output_dim))
+
+    unscaled_kernel = self.add_weight(
+        name='unscaled_random_features',
+        shape=(input_dim, self.output_dim),
+        dtype=dtypes.float32,
+        initializer=kernel_initializer,
+        trainable=False)
+
+    self.bias = self.add_weight(
+        name='random_features_bias',
+        shape=(self.output_dim,),
+        dtype=dtypes.float32,
+        initializer=init_ops.random_uniform_initializer(
+            minval=0.0, maxval=2 * np.pi, dtype=dtypes.float32),
+        trainable=False)
+
+    if self.scale is None:
+      self.scale = _get_default_scale(self.kernel_initializer, input_dim)
+    scale = self.add_weight(
+        name='random_features_scale',
+        shape=(1,),
+        dtype=dtypes.float32,
+        initializer=init_ops.constant_initializer(self.scale),
+        trainable=True,
+        constraint='NonNeg')
+    self.kernel = (1.0 / scale) * unscaled_kernel
+    super(RandomFourierFeatures, self).build(input_shape)
+
+  def call(self, inputs):
+    inputs = ops.convert_to_tensor(inputs, dtype=self.dtype)
+    inputs = gen_math_ops.cast(inputs, dtypes.float32)
+    outputs = gen_math_ops.mat_mul(inputs, self.kernel)
+    outputs = nn.bias_add(outputs, self.bias)
+    return gen_math_ops.cos(outputs)
+
+  def compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    input_shape = input_shape.with_rank(2)
+    if input_shape.dims[-1].value is None:
+      raise ValueError(
+          'The innermost dimension of input shape must be defined. Given: %s' %
+          input_shape)
+    return input_shape[:-1].concatenate(self.output_dim)
+
+  def get_config(self):
+    kernel_initializer = self.kernel_initializer
+    if isinstance(self.kernel_initializer, init_ops.Initializer):
+      kernel_initializer = initializers.serialize(self.kernel_initializer)
+    config = {
+        'output_dim': self.output_dim,
+        'kernel_initializer': kernel_initializer,
+        'scale': self.scale,
+    }
+    base_config = super(RandomFourierFeatures, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+def _get_random_features_initializer(initializer, shape):
+  """Returns Initializer object for random features."""
+
+  def _get_cauchy_samples(loc, scale, shape):
+    probs = np.random.uniform(low=0., high=1., size=shape)
+    return loc + scale * np.tan(np.pi * (probs - 0.5))
+
+  random_features_initializer = initializer
+  if isinstance(initializer, six.string_types):
+    if initializer.lower() == 'gaussian':
+      random_features_initializer = init_ops.random_normal_initializer(
+          stddev=1.0)
+    elif initializer.lower() == 'laplacian':
+      random_features_initializer = init_ops.constant_initializer(
+          _get_cauchy_samples(loc=0.0, scale=1.0, shape=shape))
+
+    else:
+      raise ValueError(
+          'Unsupported kernel type: \'{}\'. Supported kernel types: {}.'.format(
+              random_features_initializer, _SUPPORTED_RBF_KERNEL_TYPES))
+  return random_features_initializer
+
+
+def _get_default_scale(initializer, input_dim):
+  if (isinstance(initializer, six.string_types) and
+      initializer.lower() == 'gaussian'):
+    return np.sqrt(input_dim / 2.0)
+  return 1.0
diff --git a/tensorflow/python/keras/layers/kernelized_test.py b/tensorflow/python/keras/layers/kernelized_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0bc6e8b144a6ab79ff6b0d0fe936683a5478b9e3
--- /dev/null
+++ b/tensorflow/python/keras/layers/kernelized_test.py
@@ -0,0 +1,391 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for kernelized.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import math
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras import backend as keras_backend
+from tensorflow.python.keras import initializers
+from tensorflow.python.keras.layers import kernelized as kernel_layers
+from tensorflow.python.keras.utils import kernelized_utils
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.platform import test
+
+
+def _exact_gaussian(stddev):
+  return functools.partial(
+      kernelized_utils.exact_gaussian_kernel, stddev=stddev)
+
+
+def _exact_laplacian(stddev):
+  return functools.partial(
+      kernelized_utils.exact_laplacian_kernel, stddev=stddev)
+
+
+class RandomFourierFeaturesTest(test.TestCase, parameterized.TestCase):
+
+  def _assert_all_close(self, expected, actual, atol=0.001):
+    if not context.executing_eagerly():
+      with self.cached_session() as sess:
+        keras_backend._initialize_variables(sess)
+        self.assertAllClose(expected, actual, atol=atol)
+    else:
+      self.assertAllClose(expected, actual, atol=atol)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_invalid_output_dim(self):
+    with self.assertRaisesRegexp(
+        ValueError, r'`output_dim` should be a positive integer. Given: -3.'):
+      _ = kernel_layers.RandomFourierFeatures(output_dim=-3, scale=2.0)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_unsupported_kernel_type(self):
+    with self.assertRaisesRegexp(
+        ValueError, r'Unsupported kernel type: \'unsupported_kernel\'.'):
+      _ = kernel_layers.RandomFourierFeatures(
+          3, 'unsupported_kernel', stddev=2.0)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_invalid_scale(self):
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'When provided, `scale` should be a positive float. Given: 0.0.'):
+      _ = kernel_layers.RandomFourierFeatures(output_dim=10, scale=0.0)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_invalid_input_shape(self):
+    inputs = random_ops.random_uniform((3, 2, 4), seed=1)
+    rff_layer = kernel_layers.RandomFourierFeatures(output_dim=10, scale=3.0)
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'The rank of the input tensor should be 2. Got 3 instead.'):
+      _ = rff_layer.apply(inputs)
+
+  @parameterized.named_parameters(
+      ('gaussian', 'gaussian', 10.0, False),
+      ('random', init_ops.random_uniform_initializer, 1.0, True))
+  @test_util.run_in_graph_and_eager_modes()
+  def test_random_features_properties(self, initializer, scale, trainable):
+    rff_layer = kernel_layers.RandomFourierFeatures(
+        output_dim=10,
+        kernel_initializer=initializer,
+        scale=scale,
+        trainable=trainable)
+    self.assertEqual(rff_layer.output_dim, 10)
+    self.assertEqual(rff_layer.kernel_initializer, initializer)
+    self.assertEqual(rff_layer.scale, scale)
+    self.assertEqual(rff_layer.trainable, trainable)
+
+  @parameterized.named_parameters(('gaussian', 'gaussian', False),
+                                  ('laplacian', 'laplacian', True),
+                                  ('other', init_ops.ones_initializer, True))
+  @test_util.run_in_graph_and_eager_modes()
+  def test_call(self, initializer, trainable):
+    rff_layer = kernel_layers.RandomFourierFeatures(
+        output_dim=10,
+        kernel_initializer=initializer,
+        scale=1.0,
+        trainable=trainable,
+        name='random_fourier_features')
+    inputs = random_ops.random_uniform((3, 2), seed=1)
+    outputs = rff_layer(inputs)
+    self.assertListEqual([3, 10], outputs.get_shape().as_list())
+    num_trainable_vars = 1 if trainable else 0
+    self.assertLen(rff_layer.non_trainable_variables, 3 - num_trainable_vars)
+    if not context.executing_eagerly():
+      self.assertLen(
+          ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES),
+          num_trainable_vars)
+
+  @test_util.assert_no_new_pyobjects_executing_eagerly
+  def test_no_eager_Leak(self):
+    # Tests that repeatedly constructing and building a Layer does not leak
+    # Python objects.
+    inputs = random_ops.random_uniform((5, 4), seed=1)
+    kernel_layers.RandomFourierFeatures(output_dim=4, name='rff')(inputs)
+    kernel_layers.RandomFourierFeatures(output_dim=10, scale=2.0)(inputs)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_output_shape(self):
+    inputs = random_ops.random_uniform((3, 2), seed=1)
+    rff_layer = kernel_layers.RandomFourierFeatures(
+        output_dim=7, name='random_fourier_features', trainable=True)
+    outputs = rff_layer(inputs)
+    self.assertEqual([3, 7], outputs.get_shape().as_list())
+
+  @parameterized.named_parameters(
+      ('gaussian', 'gaussian'), ('laplacian', 'laplacian'),
+      ('other', init_ops.random_uniform_initializer))
+  @test_util.run_deprecated_v1
+  def test_call_on_placeholder(self, initializer):
+    inputs = array_ops.placeholder(dtype=dtypes.float32, shape=[None, None])
+    rff_layer = kernel_layers.RandomFourierFeatures(
+        output_dim=5,
+        kernel_initializer=initializer,
+        name='random_fourier_features')
+    with self.assertRaisesRegexp(
+        ValueError, r'The last dimension of the inputs to '
+        '`RandomFourierFeatures` should be defined. Found `None`.'):
+      rff_layer(inputs)
+
+    inputs = array_ops.placeholder(dtype=dtypes.float32, shape=[2, None])
+    rff_layer = kernel_layers.RandomFourierFeatures(
+        output_dim=5,
+        kernel_initializer=initializer,
+        name='random_fourier_features')
+    with self.assertRaisesRegexp(
+        ValueError, r'The last dimension of the inputs to '
+        '`RandomFourierFeatures` should be defined. Found `None`.'):
+      rff_layer(inputs)
+
+    inputs = array_ops.placeholder(dtype=dtypes.float32, shape=[None, 3])
+    rff_layer = kernel_layers.RandomFourierFeatures(
+        output_dim=5, name='random_fourier_features')
+    rff_layer(inputs)
+
+  @parameterized.named_parameters(('gaussian', 10, 'gaussian', 2.0),
+                                  ('laplacian', 5, 'laplacian', None),
+                                  ('other', 10, init_ops.ones_initializer, 1.0))
+  @test_util.run_in_graph_and_eager_modes()
+  def test_compute_output_shape(self, output_dim, initializer, scale):
+    rff_layer = kernel_layers.RandomFourierFeatures(
+        output_dim, initializer, scale=scale, name='rff')
+    with self.assertRaises(ValueError):
+      rff_layer.compute_output_shape(tensor_shape.TensorShape(None))
+    with self.assertRaises(ValueError):
+      rff_layer.compute_output_shape(tensor_shape.TensorShape([]))
+    with self.assertRaises(ValueError):
+      rff_layer.compute_output_shape(tensor_shape.TensorShape([3]))
+    with self.assertRaises(ValueError):
+      rff_layer.compute_output_shape(tensor_shape.TensorShape([3, 2, 3]))
+
+    with self.assertRaisesRegexp(
+        ValueError, r'The innermost dimension of input shape must be defined.'):
+      rff_layer.compute_output_shape(tensor_shape.TensorShape([3, None]))
+
+    self.assertEqual([None, output_dim],
+                     rff_layer.compute_output_shape((None, 3)).as_list())
+    self.assertEqual([None, output_dim],
+                     rff_layer.compute_output_shape(
+                         tensor_shape.TensorShape([None, 2])).as_list())
+    self.assertEqual([4, output_dim],
+                     rff_layer.compute_output_shape((4, 1)).as_list())
+
+  @parameterized.named_parameters(
+      ('gaussian', 10, 'gaussian', 3.0, False),
+      ('laplacian', 5, 'laplacian', 5.5, True),
+      ('other', 7, init_ops.random_uniform_initializer(), None, True))
+  @test_util.run_in_graph_and_eager_modes()
+  def test_get_config(self, output_dim, initializer, scale, trainable):
+    rff_layer = kernel_layers.RandomFourierFeatures(
+        output_dim,
+        initializer,
+        scale=scale,
+        trainable=trainable,
+        name='random_fourier_features',
+    )
+    expected_initializer = initializer
+    if isinstance(initializer, init_ops.Initializer):
+      expected_initializer = initializers.serialize(initializer)
+
+    expected_config = {
+        'output_dim': output_dim,
+        'kernel_initializer': expected_initializer,
+        'scale': scale,
+        'name': 'random_fourier_features',
+        'trainable': trainable,
+        'dtype': None,
+    }
+    self.assertLen(expected_config, len(rff_layer.get_config()))
+    self.assertSameElements(
+        list(expected_config.items()), list(rff_layer.get_config().items()))
+
+  @parameterized.named_parameters(
+      ('gaussian', 5, 'gaussian', None, True),
+      ('laplacian', 5, 'laplacian', 5.5, False),
+      ('other', 7, init_ops.ones_initializer(), 2.0, True))
+  @test_util.run_in_graph_and_eager_modes()
+  def test_from_config(self, output_dim, initializer, scale, trainable):
+    model_config = {
+        'output_dim': output_dim,
+        'kernel_initializer': initializer,
+        'scale': scale,
+        'trainable': trainable,
+        'name': 'random_fourier_features',
+    }
+    rff_layer = kernel_layers.RandomFourierFeatures.from_config(model_config)
+    self.assertEqual(rff_layer.output_dim, output_dim)
+    self.assertEqual(rff_layer.kernel_initializer, initializer)
+    self.assertEqual(rff_layer.scale, scale)
+    self.assertEqual(rff_layer.trainable, trainable)
+
+    inputs = random_ops.random_uniform((3, 2), seed=1)
+    outputs = rff_layer(inputs)
+    self.assertListEqual([3, output_dim], outputs.get_shape().as_list())
+    num_trainable_vars = 1 if trainable else 0
+    self.assertLen(rff_layer.trainable_variables, num_trainable_vars)
+    if trainable:
+      self.assertEqual('random_fourier_features/random_features_scale:0',
+                       rff_layer.trainable_variables[0].name)
+    self.assertLen(rff_layer.non_trainable_variables, 3 - num_trainable_vars)
+    if not context.executing_eagerly():
+      self.assertLen(
+          ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES),
+          num_trainable_vars)
+
+  @parameterized.named_parameters(
+      ('gaussian', 10, 'gaussian', 3.0, True),
+      ('laplacian', 5, 'laplacian', 5.5, False),
+      ('other', 10, init_ops.random_uniform_initializer(), None, True))
+  @test_util.run_in_graph_and_eager_modes()
+  def test_same_random_features_params_reused(self, output_dim, initializer,
+                                              scale, trainable):
+    """Applying the layer on the same input twice gives the same output."""
+    rff_layer = kernel_layers.RandomFourierFeatures(
+        output_dim=output_dim,
+        kernel_initializer=initializer,
+        scale=scale,
+        trainable=trainable,
+        name='random_fourier_features')
+    inputs = constant_op.constant(
+        np.random.uniform(low=-1.0, high=1.0, size=(2, 4)))
+    output1 = rff_layer.apply(inputs)
+    output2 = rff_layer.apply(inputs)
+    self._assert_all_close(output1, output2)
+
+  @parameterized.named_parameters(
+      ('gaussian', 'gaussian', 5.0), ('laplacian', 'laplacian', 3.0),
+      ('other', init_ops.random_uniform_initializer(), 5.0))
+  @test_util.run_in_graph_and_eager_modes()
+  def test_different_params_similar_approximation(self, initializer, scale):
+    random_seed.set_random_seed(12345)
+    rff_layer1 = kernel_layers.RandomFourierFeatures(
+        output_dim=3000,
+        kernel_initializer=initializer,
+        scale=scale,
+        name='rff1')
+    rff_layer2 = kernel_layers.RandomFourierFeatures(
+        output_dim=2000,
+        kernel_initializer=initializer,
+        scale=scale,
+        name='rff2')
+    # Two distinct inputs.
+    x = constant_op.constant([[1.0, -1.0, 0.5]])
+    y = constant_op.constant([[-1.0, 1.0, 1.0]])
+
+    # Apply both layers to both inputs.
+    output_x1 = math.sqrt(2.0 / 3000.0) * rff_layer1.apply(x)
+    output_y1 = math.sqrt(2.0 / 3000.0) * rff_layer1.apply(y)
+    output_x2 = math.sqrt(2.0 / 2000.0) * rff_layer2.apply(x)
+    output_y2 = math.sqrt(2.0 / 2000.0) * rff_layer2.apply(y)
+
+    # Compute the inner products of the outputs (on inputs x and y) for both
+    # layers. For any fixed random features layer rff_layer, and inputs x, y,
+    # rff_layer(x)^T * rff_layer(y) ~= K(x,y) up to a normalization factor.
+    approx_kernel1 = kernelized_utils.inner_product(output_x1, output_y1)
+    approx_kernel2 = kernelized_utils.inner_product(output_x2, output_y2)
+    self._assert_all_close(approx_kernel1, approx_kernel2, atol=0.08)
+
+  @parameterized.named_parameters(
+      ('gaussian', 'gaussian', 5.0, _exact_gaussian(stddev=5.0)),
+      ('laplacian', 'laplacian', 20.0, _exact_laplacian(stddev=20.0)))
+  @test_util.run_in_graph_and_eager_modes()
+  def test_bad_kernel_approximation(self, initializer, scale, exact_kernel_fn):
+    """Approximation is bad when output dimension is small."""
+    # Two distinct inputs.
+    x = constant_op.constant([[1.0, -1.0, 0.5]])
+    y = constant_op.constant([[-1.0, 1.0, 1.0]])
+
+    small_output_dim = 10
+    random_seed.set_random_seed(1234)
+    # Initialize layer.
+    rff_layer = kernel_layers.RandomFourierFeatures(
+        output_dim=small_output_dim,
+        kernel_initializer=initializer,
+        scale=scale,
+        name='random_fourier_features')
+
+    # Apply layer to both inputs.
+    output_x = math.sqrt(2.0 / small_output_dim) * rff_layer.apply(x)
+    output_y = math.sqrt(2.0 / small_output_dim) * rff_layer.apply(y)
+
+    # The inner products of the outputs (on inputs x and y) approximates the
+    # real value of the RBF kernel but poorly since the output dimension of the
+    # layer is small.
+    exact_kernel_value = exact_kernel_fn(x, y)
+    approx_kernel_value = kernelized_utils.inner_product(output_x, output_y)
+    abs_error = math_ops.abs(exact_kernel_value - approx_kernel_value)
+    if not context.executing_eagerly():
+      with self.cached_session() as sess:
+        keras_backend._initialize_variables(sess)
+        abs_error_eval = sess.run([abs_error])
+        self.assertGreater(abs_error_eval[0][0], 0.05)
+        self.assertLess(abs_error_eval[0][0], 0.5)
+    else:
+      self.assertGreater(abs_error, 0.05)
+      self.assertLess(abs_error, 0.5)
+
+  @parameterized.named_parameters(
+      ('gaussian', 'gaussian', 10.0, _exact_gaussian(stddev=10.0)),
+      ('laplacian', 'laplacian', 50.0, _exact_laplacian(stddev=50.0)))
+  @test_util.run_in_graph_and_eager_modes()
+  def test_good_kernel_approximation_multiple_inputs(self, initializer, scale,
+                                                     exact_kernel_fn):
+    # Parameters.
+    input_dim = 5
+    output_dim = 5000
+    x_rows = 20
+    y_rows = 30
+
+    random_seed.set_random_seed(1234)
+    x = random_ops.random_uniform(shape=(x_rows, input_dim), maxval=1.0)
+    y = random_ops.random_uniform(shape=(y_rows, input_dim), maxval=1.0)
+
+    rff_layer = kernel_layers.RandomFourierFeatures(
+        output_dim=output_dim,
+        kernel_initializer=initializer,
+        scale=scale,
+        name='random_fourier_features')
+
+    # The shapes of output_x and output_y are (x_rows, output_dim) and
+    # (y_rows, output_dim) respectively.
+    output_x = math.sqrt(2.0 / output_dim) * rff_layer.apply(x)
+    output_y = math.sqrt(2.0 / output_dim) * rff_layer.apply(y)
+
+    approx_kernel_matrix = kernelized_utils.inner_product(output_x, output_y)
+    exact_kernel_matrix = exact_kernel_fn(x, y)
+    self._assert_all_close(approx_kernel_matrix, exact_kernel_matrix, atol=0.1)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/layers/local.py b/tensorflow/python/keras/layers/local.py
index d2c4aaa125e7f1415c4e33224056c18418670769..2c66608f8628977f2529c6cd7c47851053900540 100644
--- a/tensorflow/python/keras/layers/local.py
+++ b/tensorflow/python/keras/layers/local.py
@@ -27,10 +27,10 @@ from tensorflow.python.keras.engine.base_layer import Layer
 from tensorflow.python.keras.engine.input_spec import InputSpec
 from tensorflow.python.keras.utils import conv_utils
 from tensorflow.python.keras.utils import tf_utils
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 
-@tf_export('keras.layers.LocallyConnected1D')
+@keras_export('keras.layers.LocallyConnected1D')
 class LocallyConnected1D(Layer):
   """Locally-connected layer for 1D inputs.
 
@@ -293,7 +293,7 @@ class LocallyConnected1D(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.layers.LocallyConnected2D')
+@keras_export('keras.layers.LocallyConnected2D')
 class LocallyConnected2D(Layer):
   """Locally-connected layer for 2D inputs.
 
diff --git a/tensorflow/python/keras/layers/lstm_test.py b/tensorflow/python/keras/layers/lstm_test.py
index aea426150260cf4c7b849b18319789eaf4f5da5a..38e165653e80c4ed82f55ac0482ae8ed5a5d5b4f 100644
--- a/tensorflow/python/keras/layers/lstm_test.py
+++ b/tensorflow/python/keras/layers/lstm_test.py
@@ -22,16 +22,16 @@ from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python import keras
-from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.eager import context
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 from tensorflow.python.training import adam
 from tensorflow.python.training import gradient_descent
-from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
 
-@tf_test_util.run_all_in_graph_and_eager_modes
-class LSTMLayerTest(test.TestCase, parameterized.TestCase):
+@keras_parameterized.run_all_keras_modes
+class LSTMLayerTest(keras_parameterized.TestCase):
 
   def test_return_sequences_LSTM(self):
     num_samples = 2
@@ -67,7 +67,9 @@ class LSTMLayerTest(test.TestCase, parameterized.TestCase):
     layer = keras.layers.LSTM(units, input_shape=(None, embedding_dim))
     model = keras.models.Sequential()
     model.add(layer)
-    model.compile(RMSPropOptimizer(0.001), 'mse')
+    model.compile(
+        'rmsprop', 'mse', run_eagerly=testing_utils.should_run_eagerly())
+
     x = np.random.random((num_samples, timesteps, embedding_dim))
     y = np.random.random((num_samples, units))
     model.train_on_batch(x, y)
@@ -115,7 +117,6 @@ class LSTMLayerTest(test.TestCase, parameterized.TestCase):
     self.assertEqual(layer.cell.recurrent_kernel.constraint, r_constraint)
     self.assertEqual(layer.cell.bias.constraint, b_constraint)
 
-  @tf_test_util.run_v1_only('b/120545219')
   def test_with_masking_layer_LSTM(self):
     layer_class = keras.layers.LSTM
     inputs = np.random.random((2, 3, 4))
@@ -124,11 +125,12 @@ class LSTMLayerTest(test.TestCase, parameterized.TestCase):
     model = keras.models.Sequential()
     model.add(keras.layers.Masking(input_shape=(3, 4)))
     model.add(layer_class(units=5, return_sequences=True, unroll=False))
-    model.compile(loss='categorical_crossentropy',
-                  optimizer=RMSPropOptimizer(0.01))
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer='rmsprop',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
 
-  @tf_test_util.run_v1_only('b/120545219')
   def test_masking_with_stacking_LSTM(self):
     inputs = np.random.random((2, 3, 4))
     targets = np.abs(np.random.random((2, 3, 5)))
@@ -137,8 +139,10 @@ class LSTMLayerTest(test.TestCase, parameterized.TestCase):
     model.add(keras.layers.Masking(input_shape=(3, 4)))
     lstm_cells = [keras.layers.LSTMCell(10), keras.layers.LSTMCell(5)]
     model.add(keras.layers.RNN(lstm_cells, return_sequences=True, unroll=False))
-    model.compile(loss='categorical_crossentropy',
-                  optimizer=RMSPropOptimizer(0.01))
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer='rmsprop',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
 
   def test_from_config_LSTM(self):
@@ -167,7 +171,8 @@ class LSTMLayerTest(test.TestCase, parameterized.TestCase):
 
     model = keras.models.Model([inputs] + initial_state, output)
     model.compile(loss='categorical_crossentropy',
-                  optimizer=adam.AdamOptimizer())
+                  optimizer=adam.AdamOptimizer(),
+                  run_eagerly=testing_utils.should_run_eagerly())
 
     inputs = np.random.random((num_samples, timesteps, embedding_dim))
     initial_state = [np.random.random((num_samples, units))
@@ -192,7 +197,8 @@ class LSTMLayerTest(test.TestCase, parameterized.TestCase):
 
     model = keras.models.Model(inputs, output)
     model.compile(loss='categorical_crossentropy',
-                  optimizer=adam.AdamOptimizer())
+                  optimizer=adam.AdamOptimizer(),
+                  run_eagerly=testing_utils.should_run_eagerly())
 
     inputs = np.random.random((num_samples, timesteps, embedding_dim))
     targets = np.random.random((num_samples, units))
@@ -241,8 +247,10 @@ class LSTMLayerTest(test.TestCase, parameterized.TestCase):
     output = keras.layers.LSTM(units)(inputs, initial_state=initial_state)
 
     model = keras.models.Model([inputs] + initial_state, output)
-    model.compile(loss='categorical_crossentropy',
-                  optimizer=RMSPropOptimizer(0.01))
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer='rmsprop',
+        run_eagerly=testing_utils.should_run_eagerly())
 
     inputs = np.random.random((num_samples, timesteps, embedding_dim))
     initial_state = [np.random.random((num_samples, units))
@@ -303,7 +311,8 @@ class LSTMLayerTest(test.TestCase, parameterized.TestCase):
 
     model = keras.models.Model(inputs, output)
     model.compile(loss='categorical_crossentropy',
-                  optimizer=adam.AdamOptimizer())
+                  optimizer=adam.AdamOptimizer(),
+                  run_eagerly=testing_utils.should_run_eagerly())
 
     main_inputs = np.random.random((num_samples, timesteps, embedding_dim))
     initial_state = [np.random.random((num_samples, units))
@@ -311,92 +320,89 @@ class LSTMLayerTest(test.TestCase, parameterized.TestCase):
     targets = np.random.random((num_samples, units))
     model.train_on_batch([main_inputs] + initial_state, targets)
 
+  def test_regularizers_LSTM(self):
+    embedding_dim = 4
+    layer_class = keras.layers.LSTM
+    layer = layer_class(
+        5,
+        return_sequences=False,
+        weights=None,
+        input_shape=(None, embedding_dim),
+        kernel_regularizer=keras.regularizers.l1(0.01),
+        recurrent_regularizer=keras.regularizers.l1(0.01),
+        bias_regularizer='l2',
+        activity_regularizer='l1')
+    layer.build((None, None, 2))
+    self.assertEqual(len(layer.losses), 3)
+    x = keras.backend.variable(np.ones((2, 3, 2)))
+    layer(x)
+    if context.executing_eagerly():
+      self.assertEqual(len(layer.losses), 4)
+    else:
+      self.assertEqual(len(layer.get_losses_for(x)), 1)
 
-class LSTMLayerGraphOnlyTest(test.TestCase):
-
-  @tf_test_util.run_v1_only('b/120545219')
   def test_statefulness_LSTM(self):
     num_samples = 2
     timesteps = 3
     embedding_dim = 4
     units = 2
     layer_class = keras.layers.LSTM
-    with self.cached_session():
-      model = keras.models.Sequential()
-      model.add(
-          keras.layers.Embedding(
-              4,
-              embedding_dim,
-              mask_zero=True,
-              input_length=timesteps,
-              batch_input_shape=(num_samples, timesteps)))
-      layer = layer_class(
-          units, return_sequences=False, stateful=True, weights=None)
-      model.add(layer)
-      model.compile(optimizer=gradient_descent.GradientDescentOptimizer(0.01),
-                    loss='mse')
-      out1 = model.predict(np.ones((num_samples, timesteps)))
-      self.assertEqual(out1.shape, (num_samples, units))
-
-      # train once so that the states change
-      model.train_on_batch(
-          np.ones((num_samples, timesteps)), np.ones((num_samples, units)))
-      out2 = model.predict(np.ones((num_samples, timesteps)))
-
-      # if the state is not reset, output should be different
-      self.assertNotEqual(out1.max(), out2.max())
-
-      # check that output changes after states are reset
-      # (even though the model itself didn't change)
-      layer.reset_states()
-      out3 = model.predict(np.ones((num_samples, timesteps)))
-      self.assertNotEqual(out2.max(), out3.max())
-
-      # check that container-level reset_states() works
-      model.reset_states()
-      out4 = model.predict(np.ones((num_samples, timesteps)))
-      self.assertAllClose(out3, out4, atol=1e-5)
-
-      # check that the call to `predict` updated the states
-      out5 = model.predict(np.ones((num_samples, timesteps)))
-      self.assertNotEqual(out4.max(), out5.max())
-
-      # Check masking
-      layer.reset_states()
-
-      left_padded_input = np.ones((num_samples, timesteps))
-      left_padded_input[0, :1] = 0
-      left_padded_input[1, :2] = 0
-      out6 = model.predict(left_padded_input)
-
-      layer.reset_states()
-
-      right_padded_input = np.ones((num_samples, timesteps))
-      right_padded_input[0, -1:] = 0
-      right_padded_input[1, -2:] = 0
-      out7 = model.predict(right_padded_input)
-
-      self.assertAllClose(out7, out6, atol=1e-5)
-
-  @tf_test_util.run_deprecated_v1
-  def test_regularizers_LSTM(self):
-    embedding_dim = 4
-    layer_class = keras.layers.LSTM
-    with self.cached_session():
-      layer = layer_class(
-          5,
-          return_sequences=False,
-          weights=None,
-          input_shape=(None, embedding_dim),
-          kernel_regularizer=keras.regularizers.l1(0.01),
-          recurrent_regularizer=keras.regularizers.l1(0.01),
-          bias_regularizer='l2',
-          activity_regularizer='l1')
-      layer.build((None, None, 2))
-      self.assertEqual(len(layer.losses), 3)
-      x = keras.backend.variable(np.ones((2, 3, 2)))
-      layer(x)
-      self.assertEqual(len(layer.get_losses_for(x)), 1)
+    model = keras.models.Sequential()
+    model.add(
+        keras.layers.Embedding(
+            4,
+            embedding_dim,
+            mask_zero=True,
+            input_length=timesteps,
+            batch_input_shape=(num_samples, timesteps)))
+    layer = layer_class(
+        units, return_sequences=False, stateful=True, weights=None)
+    model.add(layer)
+    model.compile(optimizer=gradient_descent.GradientDescentOptimizer(0.01),
+                  loss='mse', run_eagerly=testing_utils.should_run_eagerly())
+    out1 = model.predict(np.ones((num_samples, timesteps)))
+    self.assertEqual(out1.shape, (num_samples, units))
+
+    # train once so that the states change
+    model.train_on_batch(
+        np.ones((num_samples, timesteps)), np.ones((num_samples, units)))
+    out2 = model.predict(np.ones((num_samples, timesteps)))
+
+    # if the state is not reset, output should be different
+    self.assertNotEqual(out1.max(), out2.max())
+
+    # check that output changes after states are reset
+    # (even though the model itself didn't change)
+    layer.reset_states()
+    out3 = model.predict(np.ones((num_samples, timesteps)))
+    self.assertNotEqual(out2.max(), out3.max())
+
+    # check that container-level reset_states() works
+    model.reset_states()
+    out4 = model.predict(np.ones((num_samples, timesteps)))
+    self.assertAllClose(out3, out4, atol=1e-5)
+
+    # check that the call to `predict` updated the states
+    out5 = model.predict(np.ones((num_samples, timesteps)))
+    self.assertNotEqual(out4.max(), out5.max())
+
+    # Check masking
+    layer.reset_states()
+
+    left_padded_input = np.ones((num_samples, timesteps))
+    left_padded_input[0, :1] = 0
+    left_padded_input[1, :2] = 0
+    out6 = model.predict(left_padded_input)
+
+    layer.reset_states()
+
+    right_padded_input = np.ones((num_samples, timesteps))
+    right_padded_input[0, -1:] = 0
+    right_padded_input[1, -2:] = 0
+    out7 = model.predict(right_padded_input)
+
+    self.assertAllClose(out7, out6, atol=1e-5)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/layers/merge.py b/tensorflow/python/keras/layers/merge.py
index 45e705c69606c4dd839429597aa9903a9442234a..b497bf48cda1947f6be820d2ce4337287f70e491 100644
--- a/tensorflow/python/keras/layers/merge.py
+++ b/tensorflow/python/keras/layers/merge.py
@@ -26,7 +26,7 @@ from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 
 class _Merge(Layer):
@@ -87,7 +87,7 @@ class _Merge(Layer):
   def build(self, input_shape):
     # Used purely for shape validation.
     if not isinstance(input_shape, list):
-      raise ValueError('A merge layer should be called ' 'on a list of inputs.')
+      raise ValueError('A merge layer should be called on a list of inputs.')
     if len(input_shape) < 2:
       raise ValueError('A merge layer should be called '
                        'on a list of at least 2 inputs. '
@@ -118,7 +118,7 @@ class _Merge(Layer):
 
   def call(self, inputs):
     if not isinstance(inputs, list):
-      raise ValueError('A merge layer should be called ' 'on a list of inputs.')
+      raise ValueError('A merge layer should be called on a list of inputs.')
     if self._reshape_required:
       reshaped_inputs = []
       input_ndims = list(map(K.ndim, inputs))
@@ -218,7 +218,7 @@ class _Merge(Layer):
     return K.all(K.concatenate(masks, axis=0), axis=0, keepdims=False)
 
 
-@tf_export('keras.layers.Add')
+@keras_export('keras.layers.Add')
 class Add(_Merge):
   """Layer that adds a list of inputs.
 
@@ -250,7 +250,7 @@ class Add(_Merge):
     return output
 
 
-@tf_export('keras.layers.Subtract')
+@keras_export('keras.layers.Subtract')
 class Subtract(_Merge):
   """Layer that subtracts two inputs.
 
@@ -289,7 +289,7 @@ class Subtract(_Merge):
     return inputs[0] - inputs[1]
 
 
-@tf_export('keras.layers.Multiply')
+@keras_export('keras.layers.Multiply')
 class Multiply(_Merge):
   """Layer that multiplies (element-wise) a list of inputs.
 
@@ -305,7 +305,7 @@ class Multiply(_Merge):
     return output
 
 
-@tf_export('keras.layers.Average')
+@keras_export('keras.layers.Average')
 class Average(_Merge):
   """Layer that averages a list of inputs.
 
@@ -321,7 +321,7 @@ class Average(_Merge):
     return output / len(inputs)
 
 
-@tf_export('keras.layers.Maximum')
+@keras_export('keras.layers.Maximum')
 class Maximum(_Merge):
   """Layer that computes the maximum (element-wise) a list of inputs.
 
@@ -337,7 +337,7 @@ class Maximum(_Merge):
     return output
 
 
-@tf_export('keras.layers.Minimum')
+@keras_export('keras.layers.Minimum')
 class Minimum(_Merge):
   """Layer that computes the minimum (element-wise) a list of inputs.
 
@@ -353,7 +353,7 @@ class Minimum(_Merge):
     return output
 
 
-@tf_export('keras.layers.Concatenate')
+@keras_export('keras.layers.Concatenate')
 class Concatenate(_Merge):
   """Layer that concatenates a list of inputs.
 
@@ -444,7 +444,7 @@ class Concatenate(_Merge):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.layers.Dot')
+@keras_export('keras.layers.Dot')
 class Dot(_Merge):
   """Layer that computes a dot product between samples in two tensors.
 
@@ -504,7 +504,7 @@ class Dot(_Merge):
 
   def _merge_function(self, inputs):
     if len(inputs) != 2:
-      raise ValueError('A `Dot` layer should be called ' 'on exactly 2 inputs')
+      raise ValueError('A `Dot` layer should be called on exactly 2 inputs')
     x1 = inputs[0]
     x2 = inputs[1]
     if isinstance(self.axes, int):
@@ -559,7 +559,7 @@ class Dot(_Merge):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.layers.add')
+@keras_export('keras.layers.add')
 def add(inputs, **kwargs):
   """Functional interface to the `Add` layer.
 
@@ -588,7 +588,7 @@ def add(inputs, **kwargs):
   return Add(**kwargs)(inputs)
 
 
-@tf_export('keras.layers.subtract')
+@keras_export('keras.layers.subtract')
 def subtract(inputs, **kwargs):
   """Functional interface to the `Subtract` layer.
 
@@ -617,7 +617,7 @@ def subtract(inputs, **kwargs):
   return Subtract(**kwargs)(inputs)
 
 
-@tf_export('keras.layers.multiply')
+@keras_export('keras.layers.multiply')
 def multiply(inputs, **kwargs):
   """Functional interface to the `Multiply` layer.
 
@@ -631,7 +631,7 @@ def multiply(inputs, **kwargs):
   return Multiply(**kwargs)(inputs)
 
 
-@tf_export('keras.layers.average')
+@keras_export('keras.layers.average')
 def average(inputs, **kwargs):
   """Functional interface to the `Average` layer.
 
@@ -645,7 +645,7 @@ def average(inputs, **kwargs):
   return Average(**kwargs)(inputs)
 
 
-@tf_export('keras.layers.maximum')
+@keras_export('keras.layers.maximum')
 def maximum(inputs, **kwargs):
   """Functional interface to the `Maximum` layer.
 
@@ -659,7 +659,7 @@ def maximum(inputs, **kwargs):
   return Maximum(**kwargs)(inputs)
 
 
-@tf_export('keras.layers.minimum')
+@keras_export('keras.layers.minimum')
 def minimum(inputs, **kwargs):
   """Functional interface to the `Minimum` layer.
 
@@ -673,7 +673,7 @@ def minimum(inputs, **kwargs):
   return Minimum(**kwargs)(inputs)
 
 
-@tf_export('keras.layers.concatenate')
+@keras_export('keras.layers.concatenate')
 def concatenate(inputs, axis=-1, **kwargs):
   """Functional interface to the `Concatenate` layer.
 
@@ -688,7 +688,7 @@ def concatenate(inputs, axis=-1, **kwargs):
   return Concatenate(axis=axis, **kwargs)(inputs)
 
 
-@tf_export('keras.layers.dot')
+@keras_export('keras.layers.dot')
 def dot(inputs, axes, normalize=False, **kwargs):
   """Functional interface to the `Dot` layer.
 
diff --git a/tensorflow/python/keras/layers/merge_test.py b/tensorflow/python/keras/layers/merge_test.py
index fcb161ae20a4caeaa9514477529c2885d6e5bd41..7432ad4af886f2cbe20574d2c264d81681b210a6 100644
--- a/tensorflow/python/keras/layers/merge_test.py
+++ b/tensorflow/python/keras/layers/merge_test.py
@@ -21,22 +21,26 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python.keras import backend as K
 from tensorflow.python.framework import test_util as tf_test_util
-from tensorflow.python.ops import array_ops
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 
 
-@tf_test_util.run_all_in_graph_and_eager_modes
-class MergeLayersTest(test.TestCase):
+@keras_parameterized.run_all_keras_modes
+class MergeLayersTest(keras_parameterized.TestCase):
 
   def test_merge_add(self):
     i1 = keras.layers.Input(shape=(4, 5))
     i2 = keras.layers.Input(shape=(4, 5))
     i3 = keras.layers.Input(shape=(4, 5))
 
-    o = keras.layers.add([i1, i2, i3])
-    self.assertListEqual(o.get_shape().as_list(), [None, 4, 5])
+    add_layer = keras.layers.Add()
+    o = add_layer([i1, i2, i3])
+    self.assertListEqual(o.shape.as_list(), [None, 4, 5])
     model = keras.models.Model([i1, i2, i3], o)
+    model.run_eagerly = testing_utils.should_run_eagerly()
 
     x1 = np.random.random((2, 4, 5))
     x2 = np.random.random((2, 4, 5))
@@ -45,25 +49,64 @@ class MergeLayersTest(test.TestCase):
     self.assertEqual(out.shape, (2, 4, 5))
     self.assertAllClose(out, x1 + x2 + x3, atol=1e-4)
 
-  def test_merge_elementwise_errors(self):
+    self.assertEqual(
+        add_layer.compute_mask([i1, i2, i3], [None, None, None]), None)
+    self.assertTrue(
+        np.all(
+            K.eval(
+                add_layer.compute_mask(
+                    [i1, i2], [K.variable(x1), K.variable(x2)]))))
+
+    with self.assertRaisesRegexp(ValueError, "`mask` should be a list."):
+      add_layer.compute_mask([i1, i2, i3], x1)
+    with self.assertRaisesRegexp(ValueError, "`inputs` should be a list."):
+      add_layer.compute_mask(i1, [None, None, None])
+    with self.assertRaisesRegexp(ValueError, " should have the same length."):
+      add_layer.compute_mask([i1, i2, i3], [None, None])
+
+  def test_merge_subtract(self):
     i1 = keras.layers.Input(shape=(4, 5))
-    i2 = keras.layers.Input(shape=(4, 6))
-    with self.assertRaises(ValueError):
-      keras.layers.add([i1, i2])
-    with self.assertRaises(ValueError):
-      keras.layers.add([i1])
-    with self.assertRaises(ValueError):
-      keras.layers.add(i1)
-    with self.assertRaises(ValueError):
-      keras.layers.add([i1])
+    i2 = keras.layers.Input(shape=(4, 5))
+    i3 = keras.layers.Input(shape=(4, 5))
+
+    subtract_layer = keras.layers.Subtract()
+    o = subtract_layer([i1, i2])
+    self.assertListEqual(o.shape.as_list(), [None, 4, 5])
+    model = keras.models.Model([i1, i2], o)
+    model.run_eagerly = testing_utils.should_run_eagerly()
+
+    x1 = np.random.random((2, 4, 5))
+    x2 = np.random.random((2, 4, 5))
+    out = model.predict([x1, x2])
+    self.assertEqual(out.shape, (2, 4, 5))
+    self.assertAllClose(out, x1 - x2, atol=1e-4)
+
+    self.assertEqual(subtract_layer.compute_mask([i1, i2], [None, None]), None)
+    self.assertTrue(
+        np.all(
+            K.eval(
+                subtract_layer.compute_mask(
+                    [i1, i2], [K.variable(x1), K.variable(x2)]))))
+
+    with self.assertRaisesRegexp(ValueError, "`mask` should be a list."):
+      subtract_layer.compute_mask([i1, i2], x1)
+    with self.assertRaisesRegexp(ValueError, "`inputs` should be a list."):
+      subtract_layer.compute_mask(i1, [None, None])
+    with self.assertRaisesRegexp(ValueError,
+                                 "layer should be called on exactly 2 inputs"):
+      subtract_layer([i1, i2, i3])
+    with self.assertRaisesRegexp(ValueError,
+                                 "layer should be called on exactly 2 inputs"):
+      subtract_layer([i1])
 
   def test_merge_multiply(self):
     i1 = keras.layers.Input(shape=(4, 5))
     i2 = keras.layers.Input(shape=(4, 5))
     i3 = keras.layers.Input(shape=(4, 5))
     o = keras.layers.multiply([i1, i2, i3])
-    self.assertListEqual(o.get_shape().as_list(), [None, 4, 5])
+    self.assertListEqual(o.shape.as_list(), [None, 4, 5])
     model = keras.models.Model([i1, i2, i3], o)
+    model.run_eagerly = testing_utils.should_run_eagerly()
 
     x1 = np.random.random((2, 4, 5))
     x2 = np.random.random((2, 4, 5))
@@ -76,8 +119,9 @@ class MergeLayersTest(test.TestCase):
     i1 = keras.layers.Input(shape=(4, 5))
     i2 = keras.layers.Input(shape=(4, 5))
     o = keras.layers.average([i1, i2])
-    self.assertListEqual(o.get_shape().as_list(), [None, 4, 5])
+    self.assertListEqual(o.shape.as_list(), [None, 4, 5])
     model = keras.models.Model([i1, i2], o)
+    model.run_eagerly = testing_utils.should_run_eagerly()
 
     x1 = np.random.random((2, 4, 5))
     x2 = np.random.random((2, 4, 5))
@@ -89,8 +133,9 @@ class MergeLayersTest(test.TestCase):
     i1 = keras.layers.Input(shape=(4, 5))
     i2 = keras.layers.Input(shape=(4, 5))
     o = keras.layers.maximum([i1, i2])
-    self.assertListEqual(o.get_shape().as_list(), [None, 4, 5])
+    self.assertListEqual(o.shape.as_list(), [None, 4, 5])
     model = keras.models.Model([i1, i2], o)
+    model.run_eagerly = testing_utils.should_run_eagerly()
 
     x1 = np.random.random((2, 4, 5))
     x2 = np.random.random((2, 4, 5))
@@ -102,8 +147,9 @@ class MergeLayersTest(test.TestCase):
     i1 = keras.layers.Input(shape=(4, 5))
     i2 = keras.layers.Input(shape=(4, 5))
     o = keras.layers.minimum([i1, i2])
-    self.assertListEqual(o.get_shape().as_list(), [None, 4, 5])
+    self.assertListEqual(o.shape.as_list(), [None, 4, 5])
     model = keras.models.Model([i1, i2], o)
+    model.run_eagerly = testing_utils.should_run_eagerly()
 
     x1 = np.random.random((2, 4, 5))
     x2 = np.random.random((2, 4, 5))
@@ -114,9 +160,11 @@ class MergeLayersTest(test.TestCase):
   def test_merge_concatenate(self):
     i1 = keras.layers.Input(shape=(4, 5))
     i2 = keras.layers.Input(shape=(4, 5))
-    o = keras.layers.concatenate([i1, i2], axis=1)
-    self.assertListEqual(o.get_shape().as_list(), [None, 8, 5])
+    concat_layer = keras.layers.Concatenate(axis=1)
+    o = concat_layer([i1, i2])
+    self.assertListEqual(o.shape.as_list(), [None, 8, 5])
     model = keras.models.Model([i1, i2], o)
+    model.run_eagerly = testing_utils.should_run_eagerly()
 
     x1 = np.random.random((2, 4, 5))
     x2 = np.random.random((2, 4, 5))
@@ -124,22 +172,30 @@ class MergeLayersTest(test.TestCase):
     self.assertEqual(out.shape, (2, 8, 5))
     self.assertAllClose(out, np.concatenate([x1, x2], axis=1), atol=1e-4)
 
-  def test_concatenate_errors(self):
-    i1 = keras.layers.Input(shape=(4, 5))
-    i2 = keras.layers.Input(shape=(3, 5))
-    with self.assertRaisesRegexp(ValueError, 'inputs with matching shapes'):
-      keras.layers.concatenate([i1, i2], axis=-1)
-    with self.assertRaisesRegexp(ValueError, 'called on a list'):
-      keras.layers.concatenate(i1, axis=-1)
-    with self.assertRaisesRegexp(ValueError, 'called on a list'):
-      keras.layers.concatenate([i1], axis=-1)
+    self.assertEqual(concat_layer.compute_mask([i1, i2], [None, None]), None)
+    self.assertTrue(
+        np.all(
+            K.eval(
+                concat_layer.compute_mask(
+                    [i1, i2], [K.variable(x1), K.variable(x2)]))))
+
+    with self.assertRaisesRegexp(ValueError, "`mask` should be a list."):
+      concat_layer.compute_mask([i1, i2], x1)
+    with self.assertRaisesRegexp(ValueError, "`inputs` should be a list."):
+      concat_layer.compute_mask(i1, [None, None])
+    with self.assertRaisesRegexp(ValueError, "should have the same length"):
+      concat_layer.compute_mask([i1, i2], [None])
+    with self.assertRaisesRegexp(ValueError,
+                                 "layer should be called on a list of inputs"):
+      concat_layer(i1)
 
   def test_merge_dot(self):
     i1 = keras.layers.Input(shape=(4,))
     i2 = keras.layers.Input(shape=(4,))
     o = keras.layers.dot([i1, i2], axes=1)
-    self.assertListEqual(o.get_shape().as_list(), [None, 1])
+    self.assertListEqual(o.shape.as_list(), [None, 1])
     model = keras.models.Model([i1, i2], o)
+    model.run_eagerly = testing_utils.should_run_eagerly()
     _ = keras.layers.Dot(axes=1).get_config()
 
     x1 = np.random.random((2, 4))
@@ -153,8 +209,9 @@ class MergeLayersTest(test.TestCase):
 
     # Test with negative tuple of axes.
     o = keras.layers.dot([i1, i2], axes=(-1, -1))
-    self.assertListEqual(o.get_shape().as_list(), [None, 1])
+    self.assertListEqual(o.shape.as_list(), [None, 1])
     model = keras.models.Model([i1, i2], o)
+    model.run_eagerly = testing_utils.should_run_eagerly()
     out = model.predict([x1, x2])
     self.assertEqual(out.shape, (2, 1))
     self.assertAllClose(out, expected, atol=1e-4)
@@ -163,6 +220,32 @@ class MergeLayersTest(test.TestCase):
     layer = keras.layers.Dot(axes=-1)
     self.assertEqual(layer.compute_output_shape([(4, 5), (4, 5)]), (4, 1))
 
+
+@tf_test_util.run_all_in_graph_and_eager_modes
+class MergeLayersTestNoExecution(test.TestCase):
+
+  def test_merge_elementwise_errors(self):
+    i1 = keras.layers.Input(shape=(4, 5))
+    i2 = keras.layers.Input(shape=(4, 6))
+    with self.assertRaises(ValueError):
+      keras.layers.add([i1, i2])
+    with self.assertRaises(ValueError):
+      keras.layers.add([i1])
+    with self.assertRaises(ValueError):
+      keras.layers.add(i1)
+    with self.assertRaises(ValueError):
+      keras.layers.add([i1])
+
+  def test_concatenate_errors(self):
+    i1 = keras.layers.Input(shape=(4, 5))
+    i2 = keras.layers.Input(shape=(3, 5))
+    with self.assertRaisesRegexp(ValueError, 'inputs with matching shapes'):
+      keras.layers.concatenate([i1, i2], axis=-1)
+    with self.assertRaisesRegexp(ValueError, 'called on a list'):
+      keras.layers.concatenate(i1, axis=-1)
+    with self.assertRaisesRegexp(ValueError, 'called on a list'):
+      keras.layers.concatenate([i1], axis=-1)
+
   def test_dot_errors(self):
     i1 = keras.layers.Input(shape=(4, 5))
     i2 = keras.layers.Input(shape=(4, 6))
@@ -183,7 +266,7 @@ class MergeLayersTest(test.TestCase):
     i1 = keras.layers.Input(shape=(4, 5))
     i2 = keras.layers.Input(shape=(4, 5))
     y = keras.layers.subtract([i1, i2])
-    self.assertEqual(y.get_shape().as_list(), [None, 4, 5])
+    self.assertEqual(y.shape.as_list(), [None, 4, 5])
 
     # Test invalid use cases
     i1 = keras.layers.Input(shape=(4, 5))
@@ -193,39 +276,32 @@ class MergeLayersTest(test.TestCase):
     with self.assertRaises(ValueError):
       keras.layers.subtract([i1, i1, i1])
 
-
-class MergeLayersGraphOnlyTest(test.TestCase):
-
   def test_merge_add_masking(self):
-    with self.cached_session():
-      i1 = keras.layers.Input(shape=(4, 5))
-      i2 = keras.layers.Input(shape=(4, 5))
-      m1 = keras.layers.Masking()(i1)
-      layer = keras.layers.Add()
-      o = layer([m1, i2])
-      self.assertListEqual(o.get_shape().as_list(), [None, 4, 5])
-      mask = layer.output_mask
-      self.assertListEqual(mask.get_shape().as_list(), [None, 4])
-
-  @tf_test_util.run_deprecated_v1
+    i1 = keras.layers.Input(shape=(4, 5))
+    i2 = keras.layers.Input(shape=(4, 5))
+    m1 = keras.layers.Masking()(i1)
+    layer = keras.layers.Add()
+    o = layer([m1, i2])
+    self.assertListEqual(o.shape.as_list(), [None, 4, 5])
+    mask = layer.output_mask
+    self.assertListEqual(mask.shape.as_list(), [None, 4])
+
   def test_merge_add_dynamic_shape(self):
-    with self.cached_session():
-      i1 = array_ops.placeholder(shape=(4, None), dtype='float32')
-      i2 = array_ops.placeholder(shape=(4, 5), dtype='float32')
-      layer = keras.layers.Add()
-      o = layer([i1, i2])
-      self.assertListEqual(o.get_shape().as_list(), [4, 5])
+    i1 = keras.Input(batch_shape=(4, None), dtype='float32')
+    i2 = keras.Input(batch_shape=(4, 5), dtype='float32')
+    layer = keras.layers.Add()
+    o = layer([i1, i2])
+    self.assertListEqual(o.shape.as_list(), [4, 5])
 
   def test_merge_concatenate_masking(self):
-    with self.cached_session():
-      i1 = keras.layers.Input(shape=(4, 5))
-      i2 = keras.layers.Input(shape=(4, 5))
-      m1 = keras.layers.Masking()(i1)
-      layer = keras.layers.Concatenate()
-      o = layer([m1, i2])
-      self.assertListEqual(o.get_shape().as_list(), [None, 4, 10])
-      mask = layer.output_mask
-      self.assertListEqual(mask.get_shape().as_list(), [None, 4])
+    i1 = keras.layers.Input(shape=(4, 5))
+    i2 = keras.layers.Input(shape=(4, 5))
+    m1 = keras.layers.Masking()(i1)
+    layer = keras.layers.Concatenate()
+    o = layer([m1, i2])
+    self.assertListEqual(o.shape.as_list(), [None, 4, 10])
+    mask = layer.output_mask
+    self.assertListEqual(mask.shape.as_list(), [None, 4])
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/layers/noise.py b/tensorflow/python/keras/layers/noise.py
index cb7cee3ebc3ebd2413836b876f2aaf21985f1d9c..958ab7c0f616a94bd7b35b0575ac8bee91fa037b 100644
--- a/tensorflow/python/keras/layers/noise.py
+++ b/tensorflow/python/keras/layers/noise.py
@@ -25,10 +25,10 @@ from tensorflow.python.keras.engine.base_layer import Layer
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 
-@tf_export('keras.layers.GaussianNoise')
+@keras_export('keras.layers.GaussianNoise')
 class GaussianNoise(Layer):
   """Apply additive zero-centered Gaussian noise.
 
@@ -74,7 +74,7 @@ class GaussianNoise(Layer):
     return input_shape
 
 
-@tf_export('keras.layers.GaussianDropout')
+@keras_export('keras.layers.GaussianDropout')
 class GaussianDropout(Layer):
   """Apply multiplicative 1-centered Gaussian noise.
 
@@ -121,7 +121,7 @@ class GaussianDropout(Layer):
     return input_shape
 
 
-@tf_export('keras.layers.AlphaDropout')
+@keras_export('keras.layers.AlphaDropout')
 class AlphaDropout(Layer):
   """Applies Alpha Dropout to the input.
 
diff --git a/tensorflow/python/keras/layers/noise_test.py b/tensorflow/python/keras/layers/noise_test.py
index 325dd933b21bd4182fcd8c20493acba70834383f..f1537a6919f6a13c4e1c5bd793f01f63fb7dc834 100644
--- a/tensorflow/python/keras/layers/noise_test.py
+++ b/tensorflow/python/keras/layers/noise_test.py
@@ -19,13 +19,13 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python import keras
-from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 
 
-@tf_test_util.run_all_in_graph_and_eager_modes
-class NoiseLayersTest(test.TestCase):
+@keras_parameterized.run_all_keras_modes
+class NoiseLayersTest(keras_parameterized.TestCase):
 
   def test_GaussianNoise(self):
     testing_utils.layer_test(
diff --git a/tensorflow/python/keras/layers/normalization.py b/tensorflow/python/keras/layers/normalization.py
index 75b10222edd19ea59361d1312ead727e02431cac..5008bd77147420822c96bab872b878d7f3fa6781 100644
--- a/tensorflow/python/keras/layers/normalization.py
+++ b/tensorflow/python/keras/layers/normalization.py
@@ -18,8 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import contextlib
-
 from tensorflow.python import tf2
 from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.eager import context
@@ -40,10 +38,11 @@ from tensorflow.python.ops import nn
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import keras_export
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('keras.layers.BatchNormalization', v1=[])
+@keras_export('keras.layers.BatchNormalization', v1=[])
 class BatchNormalizationV2(Layer):
   """Batch normalization layer (Ioffe and Szegedy, 2014).
 
@@ -91,8 +90,7 @@ class BatchNormalizationV2(Layer):
       if the fused implementation cannot be used. If `None`, use the faster
       implementation if possible. If False, do not used the fused
       implementation.
-    trainable: Boolean, if `True` also add variables to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
+    trainable: Boolean, if `True` the variables will be marked as trainable.
     virtual_batch_size: An `int`. By default, `virtual_batch_size` is `None`,
       which means batch normalization is performed across the whole batch. When
       `virtual_batch_size` is not `None`, instead perform "Ghost Batch
@@ -393,16 +391,16 @@ class BatchNormalizationV2(Layer):
               aggregation=tf_variables.VariableAggregation.MEAN)
           return var
 
-        with distribution_strategy_context.get_distribution_strategy(
-        ).colocate_vars_with(self.moving_mean):
+        with distribution_strategy_context.get_strategy(
+        ).extended.colocate_vars_with(self.moving_mean):
           self.renorm_mean = _renorm_variable('renorm_mean', param_shape)
           self.renorm_mean_weight = _renorm_variable('renorm_mean_weight', ())
         # We initialize renorm_stddev to 0, and maintain the (0-initialized)
         # renorm_stddev_weight. This allows us to (1) mix the average
         # stddev with the minibatch stddev early in training, and (2) compute
         # the unbiased average stddev by dividing renorm_stddev by the weight.
-        with distribution_strategy_context.get_distribution_strategy(
-        ).colocate_vars_with(self.moving_variance):
+        with distribution_strategy_context.get_strategy(
+        ).extended.colocate_vars_with(self.moving_variance):
           self.renorm_stddev = _renorm_variable('renorm_stddev', param_shape)
           self.renorm_stddev_weight = _renorm_variable('renorm_stddev_weight',
                                                        ())
@@ -414,14 +412,7 @@ class BatchNormalizationV2(Layer):
   def _assign_moving_average(self, variable, value, momentum):
     with ops.name_scope(None, 'AssignMovingAvg',
                         [variable, value, momentum]) as scope:
-      # TODO(apassos,srbs,skyewm): the colocation constraints here are disabled
-      # because of a bug which leads cond_v2 to skip rewriting them creating
-      # conflicts.
-      if tf2.enabled():
-        cm = contextlib.contextmanager(lambda: (yield))
-      else:
-        cm = ops.colocate_with(variable)
-      with cm:
+      with ops.colocate_with(variable):
         decay = ops.convert_to_tensor(1.0 - momentum, name='decay')
         if decay.dtype != variable.dtype.base_dtype:
           decay = math_ops.cast(decay, variable.dtype.base_dtype)
@@ -472,10 +463,19 @@ class BatchNormalizationV2(Layer):
     else:
       momentum = ops.convert_to_tensor(self.momentum)
     if training_value or training_value is None:
-      mean_update = self._assign_moving_average(self.moving_mean, mean,
-                                                momentum)
-      variance_update = self._assign_moving_average(self.moving_variance,
-                                                    variance, momentum)
+      if distribution_strategy_context.in_cross_replica_context():
+        strategy = distribution_strategy_context.get_strategy()
+        mean_update = strategy.extended.update(
+            self.moving_mean, self._assign_moving_average,
+            (mean, self.momentum))
+        variance_update = strategy.extended.update(
+            self.moving_variance, self._assign_moving_average,
+            (variance, self.momentum))
+      else:
+        mean_update = self._assign_moving_average(self.moving_mean, mean,
+                                                  momentum)
+        variance_update = self._assign_moving_average(self.moving_variance,
+                                                      variance, momentum)
       self.add_update(mean_update, inputs=True)
       self.add_update(variance_update, inputs=True)
 
@@ -655,20 +655,41 @@ class BatchNormalizationV2(Layer):
         d = _broadcast(array_ops.stop_gradient(d, name='renorm_d'))
         scale, offset = _compose_transforms(r, d, scale, offset)
 
-      def _do_update(var, value):
-        if in_eager_mode and not self.trainable:
-          return
-
-        return self._assign_moving_average(var, value, self.momentum)
-
-      mean_update = tf_utils.smart_cond(
-          training,
-          lambda: _do_update(self.moving_mean, new_mean),
-          lambda: self.moving_mean)
-      variance_update = tf_utils.smart_cond(
-          training,
-          lambda: _do_update(self.moving_variance, new_variance),
-          lambda: self.moving_variance)
+      if distribution_strategy_context.in_cross_replica_context():
+        strategy = distribution_strategy_context.get_strategy()
+
+        def _do_update(var, value):
+          """Compute the updates for mean and variance."""
+          if in_eager_mode and not self.trainable:
+            return
+          return strategy.extended.update(
+              var, self._assign_moving_average, (value, self.momentum),
+              group=False)
+        # We need to unwrap the moving_mean or moving_variance in the case of
+        # training being false to match the output of true_fn and false_fn
+        # in the smart cond.
+        mean_update = tf_utils.smart_cond(
+            training,
+            lambda: _do_update(self.moving_mean, new_mean),
+            lambda: strategy.unwrap(self.moving_mean))
+        variance_update = tf_utils.smart_cond(
+            training,
+            lambda: _do_update(self.moving_variance, new_variance),
+            lambda: strategy.unwrap(self.moving_variance))
+      else:
+        def _do_update(var, value):
+          """Compute the updates for mean and variance."""
+          if in_eager_mode and not self.trainable:
+            return
+          return self._assign_moving_average(var, value, self.momentum)
+        mean_update = tf_utils.smart_cond(
+            training,
+            lambda: _do_update(self.moving_mean, new_mean),
+            lambda: self.moving_mean)
+        variance_update = tf_utils.smart_cond(
+            training,
+            lambda: _do_update(self.moving_variance, new_variance),
+            lambda: self.moving_variance)
       if not context.executing_eagerly():
         self.add_update(mean_update, inputs=True)
         self.add_update(variance_update, inputs=True)
@@ -740,7 +761,7 @@ def _replace_in_v2_docstring(old, new):
   return string.replace(old, new)
 
 
-@tf_export(v1=['keras.layers.BatchNormalization'])  # pylint: disable=missing-docstring
+@keras_export(v1=['keras.layers.BatchNormalization'])  # pylint: disable=missing-docstring
 class BatchNormalizationV1(BatchNormalizationV2):
 
   __doc__ = _replace_in_v2_docstring(
@@ -757,7 +778,243 @@ class BatchNormalizationV1(BatchNormalizationV2):
   _USE_V2_BEHAVIOR = False
 
 
-if tf2.enabled():
+BatchNormalization = None  # pylint: disable=invalid-name
+
+
+@tf_export(v1=['enable_v2_batch_normalization'])
+def enable_v2_batch_normalization():
+  global BatchNormalization  # pylint: disable=invalid-name
   BatchNormalization = BatchNormalizationV2
-else:
+
+
+@tf_export(v1=['disable_v2_batch_normalization'])
+def disable_v2_batch_normalization():
+  global BatchNormalization  # pylint: disable=invalid-name
   BatchNormalization = BatchNormalizationV1
+
+
+if tf2.enabled():
+  enable_v2_batch_normalization()
+else:
+  disable_v2_batch_normalization()
+
+
+@keras_export('keras.layers.experimental.LayerNormalization')
+class LayerNormalization(Layer):
+  """Layer normalization layer (Ba et al., 2016).
+
+  Normalize the activations of the previous layer for each given example in a
+  batch independently, rather than across a batch like Batch Normalization.
+  i.e. applies a transformation that maintains the mean activation within each
+  example close to 0 and the activation standard deviation close to 1.
+
+  Given a tensor `inputs` of rank `R`, moments are calculated and normalization
+  is performed over all axes in norm_axis.  Scaling and centering,
+  if requested, is performed over all axes in params_axis.
+
+  By default, normalization is performed over all but the first axis
+  (the `HWC` if `inputs` is `NHWC`), while the `beta` and `gamma` trainable
+  parameters are calculated for the rightmost axis (the `C` if `inputs` is
+  `NHWC`).  Scaling and recentering is performed via broadcast of the
+  `beta` and `gamma` parameters with the normalized tensor.
+
+  The shapes of `beta` and `gamma` are
+  `[inputs.shape[i] for i in (param axes)]`,
+  and this part of the inputs' shape must be fully defined.
+
+  Arguments:
+    norm_axis: Integer or List. normalization will be
+      performed along these dimensions. If unspecified (None), it will default
+      to the dimensions `begin_norm_axis : rank(inputs)`
+    params_axis: Integer or List. The (beta, gamma) dimensions: scale
+      and centering parameters will have take their shapes from these axes and
+      will be broadcast with the normalized inputs accordingly. If unspecified
+      (None), it will default to the last dimension
+    epsilon: Small float added to variance to avoid dividing by zero.
+    center: If True, add offset of `beta` to normalized tensor.
+        If False, `beta` is ignored.
+    scale: If True, multiply by `gamma`.
+        If False, `gamma` is not used.
+        When the next layer is linear (also e.g. `nn.relu`),
+        this can be disabled since the scaling
+        will be done by the next layer.
+    beta_initializer: Initializer for the beta weight.
+    gamma_initializer: Initializer for the gamma weight.
+    beta_regularizer: Optional regularizer for the beta weight.
+    gamma_regularizer: Optional regularizer for the gamma weight.
+    beta_constraint: Optional constraint for the beta weight.
+    gamma_constraint: Optional constraint for the gamma weight.
+    trainable: Boolean, if `True` the variables will be marked as trainable.
+
+  Input shape:
+      Arbitrary. Use the keyword argument `input_shape`
+      (tuple of integers, does not include the samples axis)
+      when using this layer as the first layer in a model.
+
+  Output shape:
+      Same shape as input.
+
+  References:
+      - [Layer Normalization](https://arxiv.org/abs/1607.06450)
+  """
+
+  def __init__(self,
+               norm_axis=None,
+               params_axis=-1,
+               epsilon=1e-12,
+               center=True,
+               scale=True,
+               beta_initializer='zeros',
+               gamma_initializer='ones',
+               beta_regularizer=None,
+               gamma_regularizer=None,
+               beta_constraint=None,
+               gamma_constraint=None,
+               trainable=True,
+               name=None,
+               **kwargs):
+    super(LayerNormalization, self).__init__(
+        name=name, trainable=trainable, **kwargs)
+    if isinstance(norm_axis, list):
+      self.norm_axis = norm_axis[:]
+    elif isinstance(norm_axis, int):
+      self.norm_axis = norm_axis
+    elif norm_axis is None:
+      self.norm_axis = None
+    else:
+      raise TypeError('norm_axis must be int or list or None, type given: %s'
+                      % type(norm_axis))
+
+    if isinstance(params_axis, list):
+      self.params_axis = params_axis[:]
+    elif isinstance(params_axis, int):
+      self.params_axis = params_axis
+    else:
+      raise TypeError('params_axis must be int or list, type given: %s'
+                      % type(params_axis))
+
+    self.epsilon = epsilon
+    self.center = center
+    self.scale = scale
+    self.beta_initializer = initializers.get(beta_initializer)
+    self.gamma_initializer = initializers.get(gamma_initializer)
+    self.beta_regularizer = regularizers.get(beta_regularizer)
+    self.gamma_regularizer = regularizers.get(gamma_regularizer)
+    self.beta_constraint = constraints.get(beta_constraint)
+    self.gamma_constraint = constraints.get(gamma_constraint)
+
+    self.supports_masking = True
+
+  def build(self, input_shape):
+    ndims = len(input_shape)
+    if ndims is None:
+      raise ValueError('Input shape %s has undefined rank.' % input_shape)
+
+    # Handle an unspecified norm_axis
+    if self.norm_axis is None:
+      self.norm_axis = list(range(1, ndims))
+
+    # Convert axes to lists and resolve negatives
+    if isinstance(self.norm_axis, int):
+      self.norm_axis = [self.norm_axis]
+    for idx, x in enumerate(self.norm_axis):
+      if x < 0:
+        self.norm_axis[idx] = ndims + x
+
+    if isinstance(self.params_axis, int):
+      self.params_axis = [self.params_axis]
+    for idx, x in enumerate(self.params_axis):
+      if x < 0:
+        self.params_axis[idx] = ndims + x
+
+    # Validate axes
+    for x in self.norm_axis:
+      if x < 0 or x >= ndims:
+        raise ValueError('Invalid axis: %d' % x)
+    if len(self.norm_axis) != len(set(self.norm_axis)):
+      raise ValueError('Duplicate axis: %s' % self.norm_axis)
+
+    for x in self.params_axis:
+      if x < 0 or x >= ndims:
+        raise ValueError('Invalid axis: %d' % x)
+    if len(self.params_axis) != len(set(self.params_axis)):
+      raise ValueError('Duplicate axis: %s' % self.params_axis)
+
+    param_shape = [input_shape[dim] for dim in self.params_axis]
+
+    if self.scale:
+      self.gamma = self.add_weight(
+          name='gamma',
+          shape=param_shape,
+          initializer=self.gamma_initializer,
+          regularizer=self.gamma_regularizer,
+          constraint=self.gamma_constraint,
+          trainable=True)
+    else:
+      self.gamma = None
+
+    if self.center:
+      self.beta = self.add_weight(
+          name='beta',
+          shape=param_shape,
+          initializer=self.beta_initializer,
+          regularizer=self.beta_regularizer,
+          constraint=self.beta_constraint,
+          trainable=True)
+    else:
+      self.beta = None
+
+  def call(self, inputs):
+    # Compute the axes along which to reduce the mean / variance
+    input_shape = inputs.get_shape()
+    ndims = len(input_shape)
+
+    # Calculate the moments on the last axis (layer activations).
+    mean, variance = nn.moments(inputs, self.norm_axis, keep_dims=True)
+
+    # Broadcasting only necessary for norm where the params axes aren't just
+    # the last dimension
+    broadcast_shape = [1] * ndims
+    for dim in self.params_axis:
+      broadcast_shape[dim] = input_shape.dims[dim].value
+    def _broadcast(v):
+      if (v is not None and
+          len(v.get_shape()) != ndims and
+          self.params_axis != [ndims - 1]):
+        return array_ops.reshape(v, broadcast_shape)
+      return v
+    scale, offset = _broadcast(self.gamma), _broadcast(self.beta)
+
+    # Compute layer normalization using the batch_normalization function.
+    outputs = nn.batch_normalization(
+        inputs,
+        mean,
+        variance,
+        offset=offset,
+        scale=scale,
+        variance_epsilon=self.epsilon)
+
+    # If some components of the shape got lost due to adjustments, fix that.
+    outputs.set_shape(input_shape)
+
+    return outputs
+
+  def compute_output_shape(self, input_shape):
+    return input_shape
+
+  def get_config(self):
+    config = {
+        'norm_axis': self.norm_axis,
+        'params_axis': self.params_axis,
+        'epsilon': self.epsilon,
+        'center': self.center,
+        'scale': self.scale,
+        'beta_initializer': initializers.serialize(self.beta_initializer),
+        'gamma_initializer': initializers.serialize(self.gamma_initializer),
+        'beta_regularizer': regularizers.serialize(self.beta_regularizer),
+        'gamma_regularizer': regularizers.serialize(self.gamma_regularizer),
+        'beta_constraint': constraints.serialize(self.beta_constraint),
+        'gamma_constraint': constraints.serialize(self.gamma_constraint)
+    }
+    base_config = super(LayerNormalization, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
diff --git a/tensorflow/python/keras/layers/normalization_test.py b/tensorflow/python/keras/layers/normalization_test.py
index c1acc2eb3a3a463f4f71d5a010a3388029cb82f4..3815d1e673db7c97eb540d7ac4899fa2d86e26f5 100644
--- a/tensorflow/python/keras/layers/normalization_test.py
+++ b/tensorflow/python/keras/layers/normalization_test.py
@@ -21,17 +21,18 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.layers import normalization
 from tensorflow.python.platform import test
 from tensorflow.python.training import gradient_descent
 
 
-@tf_test_util.run_all_in_graph_and_eager_modes
-@tf_test_util.run_v1_only('b/120545219')
-class NormalizationLayersTest(test.TestCase):
+class BatchNormalizationTest(keras_parameterized.TestCase):
 
+  @keras_parameterized.run_all_keras_modes
   def test_basic_batchnorm(self):
     testing_utils.layer_test(
         keras.layers.BatchNormalization,
@@ -56,15 +57,8 @@ class NormalizationLayersTest(test.TestCase):
         kwargs={'scale': False,
                 'center': False},
         input_shape=(3, 3))
-    testing_utils.layer_test(
-        normalization.BatchNormalizationV2,
-        kwargs={'fused': True},
-        input_shape=(3, 3, 3, 3))
-    testing_utils.layer_test(
-        normalization.BatchNormalizationV2,
-        kwargs={'fused': None},
-        input_shape=(3, 3, 3))
 
+  @tf_test_util.run_in_graph_and_eager_modes
   def test_batchnorm_weights(self):
     layer = keras.layers.BatchNormalization(scale=False, center=False)
     layer.build((None, 3, 4))
@@ -76,6 +70,7 @@ class NormalizationLayersTest(test.TestCase):
     self.assertEqual(len(layer.trainable_weights), 2)
     self.assertEqual(len(layer.weights), 4)
 
+  @tf_test_util.run_in_graph_and_eager_modes
   def test_batchnorm_regularization(self):
     layer = keras.layers.BatchNormalization(
         gamma_regularizer='l1', beta_regularizer='l1')
@@ -88,36 +83,7 @@ class NormalizationLayersTest(test.TestCase):
     self.assertEqual(layer.gamma.constraint, max_norm)
     self.assertEqual(layer.beta.constraint, max_norm)
 
-  def _test_batchnorm_correctness(self, dtype, use_v2=True, fused=False):
-    model = keras.models.Sequential()
-    layer_ctor = (normalization.BatchNormalizationV2 if use_v2
-                  else normalization.BatchNormalizationV1)
-    norm = layer_ctor(input_shape=(2, 2, 2), momentum=0.8, fused=fused)
-    model.add(norm)
-    model.compile(loss='mse',
-                  optimizer=gradient_descent.GradientDescentOptimizer(0.01))
-
-    # centered on 5.0, variance 10.0
-    x = (np.random.normal(loc=5.0, scale=10.0, size=(1000, 2, 2, 2))
-         .astype(dtype))
-    model.fit(x, x, epochs=4, verbose=0)
-    out = model.predict(x)
-    out -= keras.backend.eval(norm.beta)
-    out /= keras.backend.eval(norm.gamma)
-
-    np.testing.assert_allclose(out.mean(), 0.0, atol=1e-1)
-    np.testing.assert_allclose(out.std(), 1.0, atol=1e-1)
-
-  def test_batchnorm_correctness(self):
-    self._test_batchnorm_correctness(np.float32)
-    self._test_batchnorm_correctness(np.float32, fused=True)
-    self._test_batchnorm_correctness(np.float32, use_v2=False)
-
-  def test_batchnorm_mixed_precision(self):
-    self._test_batchnorm_correctness(np.float16)
-    self._test_batchnorm_correctness(np.float16, fused=True)
-    self._test_batchnorm_correctness(np.float16, use_v2=False)
-
+  @keras_parameterized.run_all_keras_modes
   def test_batchnorm_convnet(self):
     if test.is_gpu_available(cuda_only=True):
       with self.session(use_gpu=True):
@@ -126,7 +92,8 @@ class NormalizationLayersTest(test.TestCase):
             axis=1, input_shape=(3, 4, 4), momentum=0.8)
         model.add(norm)
         model.compile(loss='mse',
-                      optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+                      optimizer=gradient_descent.GradientDescentOptimizer(0.01),
+                      run_eagerly=testing_utils.should_run_eagerly())
 
         # centered on 5.0, variance 10.0
         x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 3, 4, 4))
@@ -138,13 +105,15 @@ class NormalizationLayersTest(test.TestCase):
         np.testing.assert_allclose(np.mean(out, axis=(0, 2, 3)), 0.0, atol=1e-1)
         np.testing.assert_allclose(np.std(out, axis=(0, 2, 3)), 1.0, atol=1e-1)
 
+  @keras_parameterized.run_all_keras_modes
   def test_batchnorm_convnet_channel_last(self):
     model = keras.models.Sequential()
     norm = keras.layers.BatchNormalization(
         axis=-1, input_shape=(4, 4, 3), momentum=0.8)
     model.add(norm)
     model.compile(loss='mse',
-                  optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+                  optimizer=gradient_descent.GradientDescentOptimizer(0.01),
+                  run_eagerly=testing_utils.should_run_eagerly())
 
     # centered on 5.0, variance 10.0
     x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 4, 4, 3))
@@ -156,6 +125,28 @@ class NormalizationLayersTest(test.TestCase):
     np.testing.assert_allclose(np.mean(out, axis=(0, 1, 2)), 0.0, atol=1e-1)
     np.testing.assert_allclose(np.std(out, axis=(0, 1, 2)), 1.0, atol=1e-1)
 
+  @keras_parameterized.run_all_keras_modes
+  def test_batchnorm_correctness(self):
+    _run_batchnorm_correctness_test(
+        normalization.BatchNormalization, dtype='float32')
+    _run_batchnorm_correctness_test(
+        normalization.BatchNormalization, dtype='float32', fused=True)
+    _run_batchnorm_correctness_test(
+        normalization.BatchNormalization, dtype='float32', fused=False)
+
+  @keras_parameterized.run_all_keras_modes
+  def test_batchnorm_mixed_precision(self):
+    _run_batchnorm_correctness_test(
+        normalization.BatchNormalization, dtype='float16')
+    _run_batchnorm_correctness_test(
+        normalization.BatchNormalization, dtype='float16', fused=True)
+    _run_batchnorm_correctness_test(
+        normalization.BatchNormalization, dtype='float16', fused=False)
+
+
+class BatchNormalizationV1Test(test.TestCase):
+
+  @tf_test_util.run_in_graph_and_eager_modes
   def test_v1_fused_attribute(self):
     norm = normalization.BatchNormalizationV1()
     inp = keras.layers.Input((4, 4, 4))
@@ -174,6 +165,21 @@ class NormalizationLayersTest(test.TestCase):
     norm(inp)
     self.assertEqual(norm.fused, False)
 
+
+class BatchNormalizationV2Test(keras_parameterized.TestCase):
+
+  @keras_parameterized.run_all_keras_modes
+  def test_basic_batchnorm_v2(self):
+    testing_utils.layer_test(
+        normalization.BatchNormalizationV2,
+        kwargs={'fused': True},
+        input_shape=(3, 3, 3, 3))
+    testing_utils.layer_test(
+        normalization.BatchNormalizationV2,
+        kwargs={'fused': None},
+        input_shape=(3, 3, 3))
+
+  @tf_test_util.run_in_graph_and_eager_modes
   def test_v2_fused_attribute(self):
     norm = normalization.BatchNormalizationV2()
     self.assertEqual(norm.fused, None)
@@ -228,7 +234,26 @@ class NormalizationLayersTest(test.TestCase):
       norm(inp)
 
 
-@tf_test_util.run_v1_only('b/120545219')
+def _run_batchnorm_correctness_test(layer, dtype='float32', fused=False):
+  model = keras.models.Sequential()
+  norm = layer(input_shape=(2, 2, 2), momentum=0.8, fused=fused)
+  model.add(norm)
+  model.compile(loss='mse',
+                optimizer=gradient_descent.GradientDescentOptimizer(0.01),
+                run_eagerly=testing_utils.should_run_eagerly())
+
+  # centered on 5.0, variance 10.0
+  x = (np.random.normal(loc=5.0, scale=10.0, size=(1000, 2, 2, 2))
+       .astype(dtype))
+  model.fit(x, x, epochs=4, verbose=0)
+  out = model.predict(x)
+  out -= keras.backend.eval(norm.beta)
+  out /= keras.backend.eval(norm.gamma)
+
+  np.testing.assert_allclose(out.mean(), 0.0, atol=1e-1)
+  np.testing.assert_allclose(out.std(), 1.0, atol=1e-1)
+
+
 class NormalizationLayersGraphModeOnlyTest(test.TestCase):
 
   def test_shared_batchnorm(self):
@@ -303,12 +328,15 @@ class NormalizationLayersGraphModeOnlyTest(test.TestCase):
       x2 = model.predict(val_a)
       self.assertAllClose(x1, x2, atol=1e-7)
 
+  @tf_test_util.run_deprecated_v1
   def test_batchnorm_trainable(self):
     """Tests that batchnorm layer is trainable when learning phase is enabled.
 
     Computes mean and std for current inputs then
     applies batch normalization using them.
     """
+    # TODO(fchollet): enable in all execution modes when issue with
+    # learning phase setting is resolved.
     with self.cached_session():
       bn_mean = 0.5
       bn_std = 10.
@@ -328,12 +356,241 @@ class NormalizationLayersGraphModeOnlyTest(test.TestCase):
 
       # Simulates training-mode with trainable layer.
       # Should use mini-batch statistics.
-      keras.backend.set_learning_phase(1)
-      model = get_model(bn_mean, bn_std)
-      model.compile(loss='mse', optimizer='rmsprop')
-      out = model.predict(val_a)
-      self.assertAllClose(
-          (val_a - np.mean(val_a)) / np.std(val_a), out, atol=1e-3)
+      with keras.backend.learning_phase_scope(1):
+        model = get_model(bn_mean, bn_std)
+        model.compile(loss='mse', optimizer='rmsprop')
+        out = model.predict(val_a)
+        self.assertAllClose(
+            (val_a - np.mean(val_a)) / np.std(val_a), out, atol=1e-3)
+
+
+def _run_layernorm_correctness_test(layer, dtype='float32'):
+  model = keras.models.Sequential()
+  norm = layer(input_shape=(2, 2, 2))
+  model.add(norm)
+  model.compile(loss='mse',
+                optimizer=gradient_descent.GradientDescentOptimizer(0.01),
+                run_eagerly=testing_utils.should_run_eagerly())
+
+  # centered on 5.0, variance 10.0
+  x = (np.random.normal(loc=5.0, scale=10.0, size=(1000, 2, 2, 2))
+       .astype(dtype))
+  model.fit(x, x, epochs=4, verbose=0)
+  out = model.predict(x)
+  out -= keras.backend.eval(norm.beta)
+  out /= keras.backend.eval(norm.gamma)
+
+  np.testing.assert_allclose(out.mean(), 0.0, atol=1e-1)
+  np.testing.assert_allclose(out.std(), 1.0, atol=1e-1)
+
+
+class LayerNormalizationTest(keras_parameterized.TestCase):
+
+  @keras_parameterized.run_all_keras_modes
+  def test_basic_layernorm(self):
+    testing_utils.layer_test(
+        keras.layers.LayerNormalization,
+        kwargs={
+            'gamma_regularizer': keras.regularizers.l2(0.01),
+            'beta_regularizer': keras.regularizers.l2(0.01)
+        },
+        input_shape=(3, 4, 2))
+    testing_utils.layer_test(
+        keras.layers.LayerNormalization,
+        kwargs={
+            'gamma_initializer': 'ones',
+            'beta_initializer': 'ones',
+        },
+        input_shape=(3, 4, 2))
+    testing_utils.layer_test(
+        keras.layers.LayerNormalization,
+        kwargs={'scale': False,
+                'center': False},
+        input_shape=(3, 3))
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_layernorm_weights(self):
+    layer = keras.layers.LayerNormalization(scale=False, center=False)
+    layer.build((None, 3, 4))
+    self.assertEqual(len(layer.trainable_weights), 0)
+    self.assertEqual(len(layer.weights), 0)
+
+    layer = keras.layers.LayerNormalization()
+    layer.build((None, 3, 4))
+    self.assertEqual(len(layer.trainable_weights), 2)
+    self.assertEqual(len(layer.weights), 2)
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_layernorm_regularization(self):
+    layer = keras.layers.LayerNormalization(
+        gamma_regularizer='l1', beta_regularizer='l1')
+    layer.build((None, 3, 4))
+    self.assertEqual(len(layer.losses), 2)
+    max_norm = keras.constraints.max_norm
+    layer = keras.layers.LayerNormalization(
+        gamma_constraint=max_norm, beta_constraint=max_norm)
+    layer.build((None, 3, 4))
+    self.assertEqual(layer.gamma.constraint, max_norm)
+    self.assertEqual(layer.beta.constraint, max_norm)
+
+  @keras_parameterized.run_all_keras_modes
+  def test_layernorm_convnet(self):
+    if test.is_gpu_available(cuda_only=True):
+      with self.session(use_gpu=True):
+        model = keras.models.Sequential()
+        norm = keras.layers.LayerNormalization(
+            input_shape=(3, 4, 4), params_axis=1)
+        model.add(norm)
+        model.compile(loss='mse',
+                      optimizer=gradient_descent.GradientDescentOptimizer(0.01),
+                      run_eagerly=testing_utils.should_run_eagerly())
+
+        # centered on 5.0, variance 10.0
+        x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 3, 4, 4))
+        model.fit(x, x, epochs=4, verbose=0)
+        out = model.predict(x)
+        out -= np.reshape(keras.backend.eval(norm.beta), (1, 3, 1, 1))
+        out /= np.reshape(keras.backend.eval(norm.gamma), (1, 3, 1, 1))
+
+        np.testing.assert_allclose(np.mean(out, axis=(0, 2, 3)), 0.0, atol=1e-1)
+        np.testing.assert_allclose(np.std(out, axis=(0, 2, 3)), 1.0, atol=1e-1)
+
+  @keras_parameterized.run_all_keras_modes
+  def test_layernorm_convnet_channel_last(self):
+    model = keras.models.Sequential()
+    norm = keras.layers.LayerNormalization(input_shape=(4, 4, 3))
+    model.add(norm)
+    model.compile(loss='mse',
+                  optimizer=gradient_descent.GradientDescentOptimizer(0.01),
+                  run_eagerly=testing_utils.should_run_eagerly())
+
+    # centered on 5.0, variance 10.0
+    x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 4, 4, 3))
+    model.fit(x, x, epochs=4, verbose=0)
+    out = model.predict(x)
+    out -= np.reshape(keras.backend.eval(norm.beta), (1, 1, 1, 3))
+    out /= np.reshape(keras.backend.eval(norm.gamma), (1, 1, 1, 3))
+
+    np.testing.assert_allclose(np.mean(out, axis=(0, 1, 2)), 0.0, atol=1e-1)
+    np.testing.assert_allclose(np.std(out, axis=(0, 1, 2)), 1.0, atol=1e-1)
+
+  @keras_parameterized.run_all_keras_modes
+  def test_layernorm_correctness(self):
+    _run_layernorm_correctness_test(
+        normalization.LayerNormalization, dtype='float32')
+
+  @keras_parameterized.run_all_keras_modes
+  def test_layernorm_mixed_precision(self):
+    _run_layernorm_correctness_test(
+        normalization.LayerNormalization, dtype='float16')
+
+  def doOutputTest(self,
+                   input_shape,
+                   tol=1e-5,
+                   norm_axis=None,
+                   params_axis=-1,
+                   dtype=None):
+    ndim = len(input_shape)
+    if norm_axis is None:
+      moments_axis = range(1, ndim)
+    elif isinstance(norm_axis, int):
+      if norm_axis < 0:
+        moments_axis = [norm_axis + ndim]
+      else:
+        moments_axis = [norm_axis]
+    else:
+      moments_axis = []
+      for dim in norm_axis:
+        if dim < 0:
+          dim = dim + ndim
+        moments_axis.append(dim)
+
+    moments_axis = tuple(moments_axis)
+    expected_shape = []
+    for i in range(ndim):
+      if i not in moments_axis:
+        expected_shape.append(input_shape[i])
+
+    expected_mean = np.zeros(expected_shape)
+    expected_var = np.ones(expected_shape)
+    for mu in [0.0, 1e2]:
+      for sigma in [1.0, 0.1]:
+        inputs = np.random.randn(*input_shape) * sigma + mu
+        inputs_t = constant_op.constant(inputs, shape=input_shape)
+        layer = normalization.LayerNormalization(
+            norm_axis=norm_axis, params_axis=params_axis, dtype=dtype)
+        outputs = layer(inputs_t)
+        beta = layer.beta
+        gamma = layer.gamma
+        for weight in layer.weights:
+          self.evaluate(weight.initializer)
+        outputs = self.evaluate(outputs)
+        beta = self.evaluate(beta)
+        gamma = self.evaluate(gamma)
+
+        # The mean and variance of the output should be close to 0 and 1
+        # respectively.
+
+        # Make sure that there are no NaNs
+        self.assertFalse(np.isnan(outputs).any())
+        mean = np.mean(outputs, axis=moments_axis)
+        var = np.var(outputs, axis=moments_axis)
+        # Layer-norm implemented in numpy
+        eps = 1e-12
+        expected_out = (
+            (gamma * (inputs - np.mean(
+                inputs, axis=moments_axis, keepdims=True)) /
+             np.sqrt(eps + np.var(
+                 inputs, axis=moments_axis, keepdims=True))) + beta)
+        self.assertAllClose(expected_mean, mean, atol=tol, rtol=tol)
+        self.assertAllClose(expected_var, var, atol=tol)
+        # The full computation gets a bigger tolerance
+        self.assertAllClose(expected_out, outputs, atol=5 * tol)
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def testOutput2DInput(self):
+    self.doOutputTest((10, 300))
+    self.doOutputTest((10, 300), norm_axis=[0])
+    self.doOutputTest((10, 300), params_axis=[0, 1])
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def testOutput2DInputDegenerateNormAxis(self):
+    with self.assertRaisesRegexp(ValueError, r'Invalid axis: 2'):
+      self.doOutputTest((10, 300), norm_axis=2)
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def testOutput4DInput(self):
+    self.doOutputTest((100, 10, 10, 3))
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def testOutput4DInputNormOnInnermostAxis(self):
+    # Equivalent tests
+    shape = (100, 10, 10, 3)
+    self.doOutputTest(
+        shape, norm_axis=list(range(3, len(shape))), tol=1e-4, dtype='float64')
+    self.doOutputTest(shape, norm_axis=-1, tol=1e-4, dtype='float64')
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def testOutputSmallInput(self):
+    self.doOutputTest((10, 10, 10, 30))
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def testOutputSmallInputNormOnInnermostAxis(self):
+    self.doOutputTest((10, 10, 10, 30), norm_axis=3)
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def testOutputSmallInputNormOnMixedAxes(self):
+    self.doOutputTest((10, 10, 10, 30), norm_axis=[0, 3])
+    self.doOutputTest((10, 10, 10, 30), params_axis=[-2, -1])
+    self.doOutputTest((10, 10, 10, 30), norm_axis=[0, 3],
+                      params_axis=[-3, -2, -1])
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def testOutputBigInput(self):
+    self.doOutputTest((1, 100, 100, 1))
+    self.doOutputTest((1, 100, 100, 1), norm_axis=[1, 2])
+    self.doOutputTest((1, 100, 100, 1), norm_axis=[1, 2],
+                      params_axis=[-2, -1])
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/layers/pooling.py b/tensorflow/python/keras/layers/pooling.py
index a0744cddad682fdcae18f571413b668d7767cb2f..6d76f962166fe123e6c46f5524a59ed742d7d0dc 100644
--- a/tensorflow/python/keras/layers/pooling.py
+++ b/tensorflow/python/keras/layers/pooling.py
@@ -28,7 +28,7 @@ from tensorflow.python.keras.utils import conv_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 
 class Pooling1D(Layer):
@@ -108,7 +108,7 @@ class Pooling1D(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.layers.MaxPool1D', 'keras.layers.MaxPooling1D')
+@keras_export('keras.layers.MaxPool1D', 'keras.layers.MaxPooling1D')
 class MaxPooling1D(Pooling1D):
   """Max pooling operation for temporal data.
 
@@ -155,7 +155,7 @@ class MaxPooling1D(Pooling1D):
         **kwargs)
 
 
-@tf_export('keras.layers.AveragePooling1D', 'keras.layers.AvgPool1D')
+@keras_export('keras.layers.AveragePooling1D', 'keras.layers.AvgPool1D')
 class AveragePooling1D(Pooling1D):
   """Average pooling for temporal data.
 
@@ -286,7 +286,7 @@ class Pooling2D(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.layers.MaxPool2D', 'keras.layers.MaxPooling2D')
+@keras_export('keras.layers.MaxPool2D', 'keras.layers.MaxPooling2D')
 class MaxPooling2D(Pooling2D):
   """Max pooling operation for spatial data.
 
@@ -340,7 +340,7 @@ class MaxPooling2D(Pooling2D):
         padding=padding, data_format=data_format, **kwargs)
 
 
-@tf_export('keras.layers.AveragePooling2D', 'keras.layers.AvgPool2D')
+@keras_export('keras.layers.AveragePooling2D', 'keras.layers.AvgPool2D')
 class AveragePooling2D(Pooling2D):
   """Average pooling operation for spatial data.
 
@@ -490,7 +490,7 @@ class Pooling3D(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.layers.MaxPool3D', 'keras.layers.MaxPooling3D')
+@keras_export('keras.layers.MaxPool3D', 'keras.layers.MaxPooling3D')
 class MaxPooling3D(Pooling3D):
   """Max pooling operation for 3D data (spatial or spatio-temporal).
 
@@ -540,7 +540,7 @@ class MaxPooling3D(Pooling3D):
         padding=padding, data_format=data_format, **kwargs)
 
 
-@tf_export('keras.layers.AveragePooling3D', 'keras.layers.AvgPool3D')
+@keras_export('keras.layers.AveragePooling3D', 'keras.layers.AvgPool3D')
 class AveragePooling3D(Pooling3D):
   """Average pooling operation for 3D data (spatial or spatio-temporal).
 
@@ -615,8 +615,8 @@ class GlobalPooling1D(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.layers.GlobalAveragePooling1D',
-           'keras.layers.GlobalAvgPool1D')
+@keras_export('keras.layers.GlobalAveragePooling1D',
+              'keras.layers.GlobalAvgPool1D')
 class GlobalAveragePooling1D(GlobalPooling1D):
   """Global average pooling operation for temporal data.
 
@@ -664,7 +664,7 @@ class GlobalAveragePooling1D(GlobalPooling1D):
     return None
 
 
-@tf_export('keras.layers.GlobalMaxPool1D', 'keras.layers.GlobalMaxPooling1D')
+@keras_export('keras.layers.GlobalMaxPool1D', 'keras.layers.GlobalMaxPooling1D')
 class GlobalMaxPooling1D(GlobalPooling1D):
   """Global max pooling operation for temporal data.
 
@@ -720,8 +720,8 @@ class GlobalPooling2D(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.layers.GlobalAveragePooling2D',
-           'keras.layers.GlobalAvgPool2D')
+@keras_export('keras.layers.GlobalAveragePooling2D',
+              'keras.layers.GlobalAvgPool2D')
 class GlobalAveragePooling2D(GlobalPooling2D):
   """Global average pooling operation for spatial data.
 
@@ -757,7 +757,7 @@ class GlobalAveragePooling2D(GlobalPooling2D):
       return backend.mean(inputs, axis=[2, 3])
 
 
-@tf_export('keras.layers.GlobalMaxPool2D', 'keras.layers.GlobalMaxPooling2D')
+@keras_export('keras.layers.GlobalMaxPool2D', 'keras.layers.GlobalMaxPooling2D')
 class GlobalMaxPooling2D(GlobalPooling2D):
   """Global max pooling operation for spatial data.
 
@@ -818,8 +818,8 @@ class GlobalPooling3D(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.layers.GlobalAveragePooling3D',
-           'keras.layers.GlobalAvgPool3D')
+@keras_export('keras.layers.GlobalAveragePooling3D',
+              'keras.layers.GlobalAvgPool3D')
 class GlobalAveragePooling3D(GlobalPooling3D):
   """Global Average pooling operation for 3D data.
 
@@ -855,7 +855,7 @@ class GlobalAveragePooling3D(GlobalPooling3D):
       return backend.mean(inputs, axis=[2, 3, 4])
 
 
-@tf_export('keras.layers.GlobalMaxPool3D', 'keras.layers.GlobalMaxPooling3D')
+@keras_export('keras.layers.GlobalMaxPool3D', 'keras.layers.GlobalMaxPooling3D')
 class GlobalMaxPooling3D(GlobalPooling3D):
   """Global Max pooling operation for 3D data.
 
diff --git a/tensorflow/python/keras/layers/pooling_test.py b/tensorflow/python/keras/layers/pooling_test.py
index 936e73ecf9dab86cb12a9e45499bf0e7599a0dc4..67df4d7a256c03b2c476c9b5d6ca1622870a6553 100644
--- a/tensorflow/python/keras/layers/pooling_test.py
+++ b/tensorflow/python/keras/layers/pooling_test.py
@@ -25,7 +25,6 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
-from tensorflow.python.training import rmsprop
 
 
 class GlobalPoolingTest(test.TestCase):
@@ -48,7 +47,7 @@ class GlobalPoolingTest(test.TestCase):
     model = keras.Sequential()
     model.add(keras.layers.Masking(mask_value=0., input_shape=(3, 4)))
     model.add(keras.layers.GlobalAveragePooling1D())
-    model.compile(loss='mae', optimizer=rmsprop.RMSPropOptimizer(0.001))
+    model.compile(loss='mae', optimizer='rmsprop')
 
     model_input = np.random.random((2, 3, 4))
     model_input[0, 1:, :] = 0
diff --git a/tensorflow/python/keras/layers/recurrent.py b/tensorflow/python/keras/layers/recurrent.py
index 86a69e45d900bfd037a9d39076c22d9bd2d11c43..ab8469a61167759bc512a4c0fa8bb4847639ec23 100644
--- a/tensorflow/python/keras/layers/recurrent.py
+++ b/tensorflow/python/keras/layers/recurrent.py
@@ -42,12 +42,20 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_cudnn_rnn_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import nest
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 
-@tf_export('keras.layers.StackedRNNCells')
+# The following string constants are used by Defun approach for unified backend
+# of LSTM and GRU.
+_DEFUN_API_NAME_ATTRIBUTE = 'api_implements'
+_DEFUN_DEVICE_ATTRIBUTE = 'api_preferred_device'
+_CPU_DEVICE_NAME = 'CPU'
+_GPU_DEVICE_NAME = 'GPU'
+
+
+@keras_export('keras.layers.StackedRNNCells')
 class StackedRNNCells(Layer):
   """Wrapper allowing a stack of RNN cells to behave as a single cell.
 
@@ -250,7 +258,7 @@ class StackedRNNCells(Layer):
     return updates + self._updates
 
 
-@tf_export('keras.layers.RNN')
+@keras_export('keras.layers.RNN')
 class RNN(Layer):
   """Base class for recurrent layers.
 
@@ -434,6 +442,8 @@ class RNN(Layer):
   ```
   """
 
+  _setattr_tracking = False
+
   def __init__(self,
                cell,
                return_sequences=False,
@@ -458,8 +468,8 @@ class RNN(Layer):
     self.zero_output_for_mask = kwargs.pop('zero_output_for_mask', False)
     super(RNN, self).__init__(**kwargs)
     self.cell = cell
-    if isinstance(cell, checkpointable.CheckpointableBase):
-      self._track_checkpointable(self.cell, name='cell')
+    if isinstance(cell, trackable.Trackable):
+      self._track_trackable(self.cell, name='cell')
     self.return_sequences = return_sequences
     self.return_state = return_state
     self.go_backwards = go_backwards
@@ -540,8 +550,12 @@ class RNN(Layer):
       return output_shape
 
   def compute_mask(self, inputs, mask):
-    if isinstance(mask, list):
-      mask = mask[0]
+    # Time step masks must be the same for each input.
+    # This is because the mask for an RNN is of size [batch, time_steps, 1],
+    # and specifies which time steps should be skipped, and a time step
+    # must be skipped for all inputs.
+    # TODO(scottzhu): Should we accept multiple different masks?
+    mask = nest.flatten(mask)[0]
     output_mask = mask if self.return_sequences else None
     if self.return_state:
       state_mask = [None for _ in self.states]
@@ -756,8 +770,10 @@ class RNN(Layer):
     inputs, initial_state, constants = self._process_inputs(
         inputs, initial_state, constants)
 
-    if isinstance(mask, list):
-      mask = mask[0]
+    if mask is not None:
+      # Time step masks must be the same for each input.
+      # TODO(scottzhu): Should we accept multiple different masks?
+      mask = nest.flatten(mask)[0]
 
     if nest.is_sequence(inputs):
       # In the case of nested input, use the first element for shape check.
@@ -765,9 +781,9 @@ class RNN(Layer):
     else:
       input_shape = K.int_shape(inputs)
     timesteps = input_shape[0] if self.time_major else input_shape[1]
-    if self.unroll and timesteps in [None, 1]:
+    if self.unroll and timesteps is None:
       raise ValueError('Cannot unroll a RNN if the '
-                       'time dimension is undefined or equal to 1. \n'
+                       'time dimension is undefined. \n'
                        '- If using a Sequential model, '
                        'specify the time dimension by passing '
                        'an `input_shape` or `batch_input_shape` '
@@ -990,7 +1006,7 @@ class RNN(Layer):
     return updates + self._updates
 
 
-@tf_export('keras.layers.SimpleRNNCell')
+@keras_export('keras.layers.SimpleRNNCell')
 class SimpleRNNCell(Layer):
   """Cell class for SimpleRNN.
 
@@ -1160,7 +1176,7 @@ class SimpleRNNCell(Layer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.layers.SimpleRNN')
+@keras_export('keras.layers.SimpleRNN')
 class SimpleRNN(RNN):
   """Fully-connected RNN where the output is to be fed back to input.
 
@@ -1372,7 +1388,7 @@ class SimpleRNN(RNN):
     return cls(**config)
 
 
-@tf_export('keras.layers.GRUCell')
+@keras_export('keras.layers.GRUCell')
 class GRUCell(Layer):
   """Cell class for the GRU layer.
 
@@ -1497,12 +1513,6 @@ class GRUCell(Layer):
                                   initializer=self.bias_initializer,
                                   regularizer=self.bias_regularizer,
                                   constraint=self.bias_constraint)
-      if not self.reset_after:
-        self.input_bias, self.recurrent_bias = self.bias, None
-      else:
-        self.input_bias = K.flatten(self.bias[0])
-        self.recurrent_bias = K.flatten(self.bias[1])
-
     else:
       self.bias = None
     self.built = True
@@ -1529,6 +1539,12 @@ class GRUCell(Layer):
     # dropout matrices for recurrent units
     rec_dp_mask = self._recurrent_dropout_mask
 
+    if self.use_bias:
+      if not self.reset_after:
+        input_bias, recurrent_bias = self.bias, None
+      else:
+        input_bias, recurrent_bias = array_ops.unstack(self.bias)
+
     if self.implementation == 1:
       if 0. < self.dropout < 1.:
         inputs_z = inputs * dp_mask[0]
@@ -1544,9 +1560,9 @@ class GRUCell(Layer):
       x_h = K.dot(inputs_h, self.kernel[:, self.units * 2:])
 
       if self.use_bias:
-        x_z = K.bias_add(x_z, self.input_bias[:self.units])
-        x_r = K.bias_add(x_r, self.input_bias[self.units: self.units * 2])
-        x_h = K.bias_add(x_h, self.input_bias[self.units * 2:])
+        x_z = K.bias_add(x_z, input_bias[:self.units])
+        x_r = K.bias_add(x_r, input_bias[self.units: self.units * 2])
+        x_h = K.bias_add(x_h, input_bias[self.units * 2:])
 
       if 0. < self.recurrent_dropout < 1.:
         h_tm1_z = h_tm1 * rec_dp_mask[0]
@@ -1561,10 +1577,9 @@ class GRUCell(Layer):
       recurrent_r = K.dot(h_tm1_r,
                           self.recurrent_kernel[:, self.units:self.units * 2])
       if self.reset_after and self.use_bias:
-        recurrent_z = K.bias_add(recurrent_z, self.recurrent_bias[:self.units])
+        recurrent_z = K.bias_add(recurrent_z, recurrent_bias[:self.units])
         recurrent_r = K.bias_add(recurrent_r,
-                                 self.recurrent_bias[self.units:
-                                                     self.units * 2])
+                                 recurrent_bias[self.units:self.units * 2])
 
       z = self.recurrent_activation(x_z + recurrent_z)
       r = self.recurrent_activation(x_r + recurrent_r)
@@ -1573,8 +1588,7 @@ class GRUCell(Layer):
       if self.reset_after:
         recurrent_h = K.dot(h_tm1_h, self.recurrent_kernel[:, self.units * 2:])
         if self.use_bias:
-          recurrent_h = K.bias_add(recurrent_h,
-                                   self.recurrent_bias[self.units * 2:])
+          recurrent_h = K.bias_add(recurrent_h, recurrent_bias[self.units * 2:])
         recurrent_h = r * recurrent_h
       else:
         recurrent_h = K.dot(r * h_tm1_h,
@@ -1589,7 +1603,7 @@ class GRUCell(Layer):
       matrix_x = K.dot(inputs, self.kernel)
       if self.use_bias:
         # biases: bias_z_i, bias_r_i, bias_h_i
-        matrix_x = K.bias_add(matrix_x, self.input_bias)
+        matrix_x = K.bias_add(matrix_x, input_bias)
 
       x_z = matrix_x[:, :self.units]
       x_r = matrix_x[:, self.units: 2 * self.units]
@@ -1602,7 +1616,7 @@ class GRUCell(Layer):
         # hidden state projected by all gate matrices at once
         matrix_inner = K.dot(h_tm1, self.recurrent_kernel)
         if self.use_bias:
-          matrix_inner = K.bias_add(matrix_inner, self.recurrent_bias)
+          matrix_inner = K.bias_add(matrix_inner, recurrent_bias)
       else:
         # hidden state projected separately for update/reset and new
         matrix_inner = K.dot(h_tm1, self.recurrent_kernel[:, :2 * self.units])
@@ -1655,7 +1669,7 @@ class GRUCell(Layer):
     return _generate_zero_filled_state_for_cell(self, inputs, batch_size, dtype)
 
 
-@tf_export('keras.layers.GRU')
+@keras_export(v1=['keras.layers.GRU'])
 class GRU(RNN):
   """Gated Recurrent Unit - Cho et al. 2014.
 
@@ -1914,7 +1928,391 @@ class GRU(RNN):
     return cls(**config)
 
 
-@tf_export('keras.layers.LSTMCell')
+@keras_export('keras.layers.GRU', v1=[])
+class UnifiedGRU(GRU):
+  """Gated Recurrent Unit - Cho et al. 2014.
+
+  `UnifiedGRU` unifies the implementations between standard `GRU` layer and
+  `CuDNNGRU` layer. Based on available runtime hardware and constraints,
+  `UnifiedGRU` will choose different implementations to maximize the
+  performance. For instance, if GPU is available and all the parameters meet the
+  requirement of CuDNN kernel, `UnifiedGRU` will use CuDNN kernel for the
+  calculation. The requirements to use CuDNN kernel are:
+
+    1. `activation` == 'tanh'
+    2. `recurrent_activation` == 'sigmoid'
+    3. `recurrent_dropout` == 0
+    4. `unroll` is False
+    5. `use_bias` is True
+    6. `reset_after` is True
+    7. Use masking in previous layers.
+
+  There are two variants. The default one is based on
+  [v3](https://arxiv.org/abs/1406.1078v3) and has reset gate applied to hidden
+  state before matrix multiplication. The other one is based on
+  [original](https://arxiv.org/abs/1406.1078v1) and has the order reversed.
+
+  The second variant is compatible with CuDNNGRU (GPU-only) and allows
+  inference on CPU. Thus it has separate biases for `kernel` and
+  `recurrent_kernel`. Use `'reset_after'=True` and
+  `recurrent_activation='sigmoid'`.
+
+  Arguments:
+      units: Positive integer, dimensionality of the output space.
+      activation: Activation function to use.
+          Default: hyperbolic tangent (`tanh`).
+          If you pass `None`, no activation is applied
+          (ie. "linear" activation: `a(x) = x`).
+      recurrent_activation: Activation function to use
+          for the recurrent step.
+          Default: sigmoid (`sigmoid`).
+          If you pass `None`, no activation is applied
+          (ie. "linear" activation: `a(x) = x`).
+      use_bias: Boolean, whether the layer uses a bias vector.
+      kernel_initializer: Initializer for the `kernel` weights matrix,
+          used for the linear transformation of the inputs.
+      recurrent_initializer: Initializer for the `recurrent_kernel`
+          weights matrix,
+          used for the linear transformation of the recurrent state.
+      bias_initializer: Initializer for the bias vector.
+      kernel_regularizer: Regularizer function applied to
+          the `kernel` weights matrix.
+      recurrent_regularizer: Regularizer function applied to
+          the `recurrent_kernel` weights matrix.
+      bias_regularizer: Regularizer function applied to the bias vector.
+      activity_regularizer: Regularizer function applied to
+          the output of the layer (its "activation")..
+      kernel_constraint: Constraint function applied to
+          the `kernel` weights matrix.
+      recurrent_constraint: Constraint function applied to
+          the `recurrent_kernel` weights matrix.
+      bias_constraint: Constraint function applied to the bias vector.
+      dropout: Float between 0 and 1.
+          Fraction of the units to drop for
+          the linear transformation of the inputs.
+      recurrent_dropout: Float between 0 and 1.
+          Fraction of the units to drop for
+          the linear transformation of the recurrent state.
+      implementation: Implementation mode, either 1 or 2.
+          Mode 1 will structure its operations as a larger number of
+          smaller dot products and additions, whereas mode 2 will
+          batch them into fewer, larger operations. These modes will
+          have different performance profiles on different hardware and
+          for different applications.
+      return_sequences: Boolean. Whether to return the last output
+          in the output sequence, or the full sequence.
+      return_state: Boolean. Whether to return the last state
+          in addition to the output.
+      go_backwards: Boolean (default False).
+          If True, process the input sequence backwards and return the
+          reversed sequence.
+      stateful: Boolean (default False). If True, the last state
+          for each sample at index i in a batch will be used as initial
+          state for the sample of index i in the following batch.
+      unroll: Boolean (default False).
+          If True, the network will be unrolled,
+          else a symbolic loop will be used.
+          Unrolling can speed-up a RNN,
+          although it tends to be more memory-intensive.
+          Unrolling is only suitable for short sequences.
+      reset_after: GRU convention (whether to apply reset gate after or
+          before matrix multiplication). False = "before",
+          True = "after" (default and CuDNN compatible).
+  """
+
+  _setattr_tracking = False  # TODO(allenl): Figure out why this is needed.
+
+  def __init__(self,
+               units,
+               activation='tanh',
+               recurrent_activation='sigmoid',
+               use_bias=True,
+               kernel_initializer='glorot_uniform',
+               recurrent_initializer='orthogonal',
+               bias_initializer='zeros',
+               kernel_regularizer=None,
+               recurrent_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               kernel_constraint=None,
+               recurrent_constraint=None,
+               bias_constraint=None,
+               dropout=0.,
+               recurrent_dropout=0.,
+               implementation=1,
+               return_sequences=False,
+               return_state=False,
+               go_backwards=False,
+               stateful=False,
+               unroll=False,
+               time_major=False,
+               reset_after=True,
+               **kwargs):
+    # return_runtime is a flag for testing, which shows the real backend
+    # implementation chosen by grappler in graph mode.
+    self._return_runtime = kwargs.pop('return_runtime', False)
+
+    super(UnifiedGRU, self).__init__(
+        units,
+        activation=activation,
+        recurrent_activation=recurrent_activation,
+        use_bias=use_bias,
+        kernel_initializer=kernel_initializer,
+        recurrent_initializer=recurrent_initializer,
+        bias_initializer=bias_initializer,
+        kernel_regularizer=kernel_regularizer,
+        recurrent_regularizer=recurrent_regularizer,
+        bias_regularizer=bias_regularizer,
+        activity_regularizer=activity_regularizer,
+        kernel_constraint=kernel_constraint,
+        recurrent_constraint=recurrent_constraint,
+        bias_constraint=bias_constraint,
+        dropout=dropout,
+        recurrent_dropout=recurrent_dropout,
+        implementation=implementation,
+        return_sequences=return_sequences,
+        return_state=return_state,
+        go_backwards=go_backwards,
+        stateful=stateful,
+        unroll=unroll,
+        time_major=time_major,
+        reset_after=reset_after,
+        **kwargs)
+    self._dropout_mask = None
+    # CuDNN uses following setting by default and not configurable.
+    self.could_use_cudnn = (
+        activation == 'tanh' and recurrent_activation == 'sigmoid' and
+        recurrent_dropout == 0 and not unroll and use_bias and
+        reset_after is True)
+
+  def call(self, inputs, mask=None, training=None, initial_state=None):
+    # GRU does not support constants. Ignore it during process.
+    inputs, initial_state, _ = self._process_inputs(inputs, initial_state, None)
+
+    if isinstance(mask, list):
+      mask = mask[0]
+
+    input_shape = K.int_shape(inputs)
+    timesteps = input_shape[0] if self.time_major else input_shape[1]
+
+    if mask is not None or not self.could_use_cudnn:
+      # CuDNN does not support masking, fall back to use the normal GRU.
+      kwargs = {'training': training}
+      self.cell._dropout_mask = None
+      self.cell._recurrent_dropout_mask = None
+
+      def step(cell_inputs, cell_states):
+        return self.cell.call(cell_inputs, cell_states, **kwargs)
+
+      last_output, outputs, states = K.rnn(
+          step,
+          inputs,
+          initial_state,
+          constants=None,
+          go_backwards=self.go_backwards,
+          mask=mask,
+          unroll=self.unroll,
+          input_length=timesteps,
+          time_major=self.time_major,
+          zero_output_for_mask=self.zero_output_for_mask)
+      # This is a dummy tensor for testing purpose.
+      runtime = _runtime('unknown')
+    else:
+      last_output, outputs, runtime, states = self._defun_gru_call(
+          inputs, initial_state, training)
+
+    if self.stateful:
+      updates = [state_ops.assign(self.states[0], states[0])]
+      self.add_update(updates, inputs)
+
+    if self.return_sequences:
+      output = outputs
+    else:
+      output = last_output
+
+    if self.return_state:
+      return [output] + states
+    elif self._return_runtime:
+      return output, runtime
+    else:
+      return output
+
+  def _defun_gru_call(self, inputs, initial_state, training):
+    # Use the new defun approach for backend implementation swap.
+    # Note that different implementations need to have same function
+    # signature, eg, the tensor parameters need to have same shape and dtypes.
+    if self.go_backwards:
+      # Reverse time axis.
+      inputs = K.reverse(inputs, 0 if self.time_major else 1)
+    if 0 < self.dropout < 1:
+      if self._dropout_mask is None:
+        self._dropout_mask = _generate_dropout_mask(
+            array_ops.ones_like(inputs),
+            self.dropout,
+            training=training,
+            count=3)
+
+      inputs *= self._dropout_mask[0]
+    if ops.executing_eagerly_outside_functions():
+      # Under eager context, the device placement is already known. Prefer the
+      # GPU implementation when GPU is available.
+      if context.num_gpus() > 0:
+        last_output, outputs, new_h, runtime = cudnn_gru(
+            inputs=inputs,
+            init_h=initial_state[0],
+            kernel=self.cell.kernel,
+            recurrent_kernel=self.cell.recurrent_kernel,
+            bias=self.cell.bias,
+            time_major=self.time_major)
+      else:
+        last_output, outputs, new_h, runtime = standard_gru(
+            inputs=inputs,
+            init_h=initial_state[0],
+            kernel=self.cell.kernel,
+            recurrent_kernel=self.cell.recurrent_kernel,
+            bias=self.cell.bias,
+            activation=self.activation,
+            recurrent_activation=self.recurrent_activation,
+            time_major=self.time_major)
+    else:
+      api_name = 'gru_' + str(uuid.uuid4())
+      defun_standard_gru = _generate_defun_backend(
+          api_name, _CPU_DEVICE_NAME, standard_gru)
+      defun_cudnn_gru = _generate_defun_backend(
+          api_name, _GPU_DEVICE_NAME, cudnn_gru)
+      # Call the normal GRU impl and register the CuDNN impl function. The
+      # grappler will kick in during session execution to optimize the graph.
+      last_output, outputs, new_h, runtime = defun_standard_gru(
+          inputs=inputs,
+          init_h=initial_state[0],
+          kernel=self.cell.kernel,
+          recurrent_kernel=self.cell.recurrent_kernel,
+          bias=self.cell.bias,
+          activation=self.activation,
+          recurrent_activation=self.recurrent_activation,
+          time_major=self.time_major)
+
+      function.register(defun_cudnn_gru, inputs, initial_state[0],
+                        self.cell.kernel, self.cell.recurrent_kernel,
+                        self.cell.bias, self.time_major)
+    states = [new_h]
+    return last_output, outputs, runtime, states
+
+
+def standard_gru(inputs, init_h, kernel, recurrent_kernel, bias, activation,
+                 recurrent_activation, time_major):
+  """GRU with standard kernel implementation.
+
+  This implementation can be run on all types of hardware.
+
+  This implementation lifts out all the layer weights and make them function
+  parameters. It has same number of tensor input params as the CuDNN
+  counterpart. The RNN step logic has been simplified, eg dropout and mask is
+  removed since CuDNN implementation does not support that.
+
+  Args:
+    inputs: input tensor of GRU layer.
+    init_h: initial state tensor for the cell output.
+    kernel: weights for cell kernel.
+    recurrent_kernel: weights for cell recurrent kernel.
+    bias: weights for cell kernel bias and recurrent bias. The bias contains the
+      combined input_bias and recurrent_bias.
+    activation: Activation function to use for output.
+    recurrent_activation: Activation function to use for hidden recurrent state.
+    time_major: boolean, whether the inputs are in the format of
+      [time, batch, feature] or [batch, time, feature].
+
+  Returns:
+    last_output: output tensor for the last timestep, which has shape
+      [batch, units].
+    outputs: output tensor for all timesteps, which has shape
+      [batch, time, units].
+    state_0: the cell output, which has same shape as init_h.
+    runtime: constant string tensor which indicate real runtime hardware. This
+      value is for testing purpose and should be used by user.
+  """
+  input_shape = K.int_shape(inputs)
+  timesteps = input_shape[0] if time_major else input_shape[1]
+
+  input_bias, recurrent_bias = array_ops.unstack(bias)
+
+  def step(cell_inputs, cell_states):
+    """Step function that will be used by Keras RNN backend."""
+    h_tm1 = cell_states[0]
+
+    # inputs projected by all gate matrices at once
+    matrix_x = K.dot(cell_inputs, kernel)
+    matrix_x = K.bias_add(matrix_x, input_bias)
+
+    x_z, x_r, x_h = array_ops.split(matrix_x, 3, axis=1)
+
+    # hidden state projected by all gate matrices at once
+    matrix_inner = K.dot(h_tm1, recurrent_kernel)
+    matrix_inner = K.bias_add(matrix_inner, recurrent_bias)
+
+    recurrent_z, recurrent_r, recurrent_h = array_ops.split(matrix_inner, 3,
+                                                            axis=1)
+    z = recurrent_activation(x_z + recurrent_z)
+    r = recurrent_activation(x_r + recurrent_r)
+    hh = activation(x_h + r * recurrent_h)
+
+    # previous and candidate state mixed by update gate
+    h = z * h_tm1 + (1 - z) * hh
+    return h, [h]
+
+  last_output, outputs, new_states = K.rnn(
+      step,
+      inputs, [init_h],
+      constants=None,
+      unroll=False,
+      time_major=time_major,
+      input_length=timesteps)
+  return last_output, outputs, new_states[0], _runtime('cpu')
+
+
+def cudnn_gru(inputs, init_h, kernel, recurrent_kernel, bias, time_major):
+  """GRU with CuDNN implementation which is only available for GPU."""
+  if not time_major:
+    inputs = array_ops.transpose(inputs, perm=(1, 0, 2))
+  init_h = array_ops.expand_dims(init_h, axis=0)
+
+  weights = array_ops.split(kernel, 3, axis=1)
+  weights += array_ops.split(recurrent_kernel, 3, axis=1)
+  # Note that the bias was initialized as shape (2, 3 * units), flat it into
+  # (6 * units)
+  bias = array_ops.split(K.flatten(bias), 6)
+  # Note that the gate order for CuDNN is different from the canonical format.
+  # canonical format is [z, r, h], whereas CuDNN is [r, z, h]. The swap need to
+  # be done for kernel, recurrent_kernel, input_bias, recurrent_bias.
+  # z is update gate weights.
+  # r is reset gate weights.
+  # h is output gate weights.
+  weights[0], weights[1] = weights[1], weights[0]
+  weights[3], weights[4] = weights[4], weights[3]
+  bias[0], bias[1] = bias[1], bias[0]
+  bias[3], bias[4] = bias[4], bias[3]
+
+  params = _canonical_to_params(
+      weights=weights,
+      biases=bias,
+      shape=constant_op.constant([-1]),
+      transpose_weights=True)
+
+  outputs, h, _, _ = gen_cudnn_rnn_ops.cudnn_rnn(
+      inputs,
+      input_h=init_h,
+      input_c=0,
+      params=params,
+      is_training=True,
+      rnn_mode='gru')
+  last_output = outputs[-1]
+  if not time_major:
+    outputs = array_ops.transpose(outputs, perm=[1, 0, 2])
+  h = h[0]
+  return last_output, outputs, h, _runtime('cudnn')
+
+
+@keras_export('keras.layers.LSTMCell')
 class LSTMCell(Layer):
   """Cell class for the LSTM layer.
 
@@ -2194,7 +2592,7 @@ class LSTMCell(Layer):
         self, inputs, batch_size, dtype))
 
 
-@tf_export('keras.experimental.PeepholeLSTMCell')
+@keras_export('keras.experimental.PeepholeLSTMCell')
 class PeepholeLSTMCell(LSTMCell):
   """Equivalent to LSTMCell class but adds peephole connections.
 
@@ -2274,7 +2672,7 @@ class PeepholeLSTMCell(LSTMCell):
     return c, o
 
 
-@tf_export('keras.layers.LSTM')
+@keras_export(v1=['keras.layers.LSTM'])
 class LSTM(RNN):
   """Long Short-Term Memory layer - Hochreiter 1997.
 
@@ -2533,6 +2931,7 @@ class LSTM(RNN):
     return cls(**config)
 
 
+@keras_export('keras.layers.LSTM', v1=[])
 class UnifiedLSTM(LSTM):
   """Long Short-Term Memory layer - Hochreiter 1997.
 
@@ -2546,13 +2945,11 @@ class UnifiedLSTM(LSTM):
   Arguments:
     units: Positive integer, dimensionality of the output space.
     activation: Activation function to use.
-        Default: hyperbolic tangent (`tanh`). If you pass `None`, no activation
-          is applied
-        (ie. "linear" activation: `a(x) = x`).
+      Default: hyperbolic tangent (`tanh`). If you pass `None`, no activation
+      is applied (ie. "linear" activation: `a(x) = x`).
     recurrent_activation: Activation function to use for the recurrent step.
-        Default: hard sigmoid (`hard_sigmoid`). If you pass `None`, no
-          activation is applied
-        (ie. "linear" activation: `a(x) = x`).
+      Default: sigmoid (`sigmoid`). If you pass `None`, no activation is
+      applied (ie. "linear" activation: `a(x) = x`).
     use_bias: Boolean, whether the layer uses a bias vector.
     kernel_initializer: Initializer for the `kernel` weights matrix, used for
       the linear transformation of the inputs..
@@ -2602,7 +2999,7 @@ class UnifiedLSTM(LSTM):
   def __init__(self,
                units,
                activation='tanh',
-               recurrent_activation='hard_sigmoid',
+               recurrent_activation='sigmoid',
                use_bias=True,
                kernel_initializer='glorot_uniform',
                recurrent_initializer='orthogonal',
@@ -2659,12 +3056,10 @@ class UnifiedLSTM(LSTM):
     self.state_spec = [
         InputSpec(shape=(None, dim)) for dim in (self.units, self.units)
     ]
-    self._num_constants = None
-    self._num_inputs = None
     self._dropout_mask = None
     self.could_use_cudnn = (
-        activation == 'tanh' and recurrent_dropout == 0 and
-        not unroll and use_bias and bias_regularizer is None)
+        activation == 'tanh' and recurrent_activation == 'sigmoid' and
+        recurrent_dropout == 0 and not unroll and use_bias)
 
   def call(self, inputs, mask=None, training=None, initial_state=None):
     # LSTM does not support constants. Ignore it during process.
@@ -2694,8 +3089,7 @@ class UnifiedLSTM(LSTM):
           input_length=timesteps,
           time_major=self.time_major,
           zero_output_for_mask=self.zero_output_for_mask)
-      runtime = constant_op.constant(
-          'unknown', dtype=dtypes.string, name='runtime')
+      runtime = _runtime('unknown')
     else:
       # Use the new defun approach for backend implementation swap.
       # Note that different implementations need to have same function
@@ -2716,37 +3110,29 @@ class UnifiedLSTM(LSTM):
 
         inputs *= self._dropout_mask[0]
 
-      # Each time a defun function is called, we will give a unique identifiable
-      # API name, so that the grappler won't get confused when it sees multiple
-      # LSTM layer added into same graph, and it will be able to pair up the
-      # different implementations across them.
-      experimental_api_name = 'lstm_' + str(uuid.uuid4())
-      standard_lstm_attributes = {
-          'experimental_api_implements': experimental_api_name,
-          'experimental_api_preferred_device': 'CPU',
-      }
-      cudnn_lstm_attributes = {
-          'experimental_api_implements': experimental_api_name,
-          'experimental_api_preferred_device': 'GPU',
-      }
-      defun_standard_lstm = function.defun_with_attributes(
-          standard_lstm, attributes=standard_lstm_attributes)
-      defun_cudnn_lstm = function.defun_with_attributes(
-          cudnn_lstm, attributes=cudnn_lstm_attributes)
-
       if ops.executing_eagerly_outside_functions():
         # Under eager context, the device placement is already known. Prefer the
         # GPU implementation here.
         if context.num_gpus() > 0:
-          last_output, outputs, new_h, new_c, runtime = defun_cudnn_lstm(
+          last_output, outputs, new_h, new_c, runtime = cudnn_lstm(
               inputs, initial_state[0], initial_state[1], self.cell.kernel,
               self.cell.recurrent_kernel, self.cell.bias, self.time_major)
         else:
-          last_output, outputs, new_h, new_c, runtime = defun_standard_lstm(
+          last_output, outputs, new_h, new_c, runtime = standard_lstm(
               inputs, initial_state[0], initial_state[1], self.cell.kernel,
               self.cell.recurrent_kernel, self.cell.bias, self.activation,
               self.recurrent_activation, self.time_major)
       else:
+        # Each time a `tf.function` is called, we will give it a unique
+        # identifiable API name, so that Grappler won't get confused when it
+        # sees multiple LSTM layers added into same graph, and it will be able
+        # to pair up the different implementations across them.
+        api_name = 'lstm_' + str(uuid.uuid4())
+        defun_standard_lstm = _generate_defun_backend(
+            api_name, _CPU_DEVICE_NAME, standard_lstm)
+        defun_cudnn_lstm = _generate_defun_backend(
+            api_name, _GPU_DEVICE_NAME, cudnn_lstm)
+
         # Call the normal LSTM impl and register the CuDNN impl function. The
         # grappler will kick in during session execution to optimize the graph.
         last_output, outputs, new_h, new_c, runtime = defun_standard_lstm(
@@ -2778,46 +3164,6 @@ class UnifiedLSTM(LSTM):
     else:
       return output
 
-  @property
-  def trainable_weights(self):
-    if self.trainable:
-      weights = []
-      weights += self.cell.trainable_weights
-      return weights
-    return []
-
-  @property
-  def non_trainable_weights(self):
-    if not self.trainable:
-      weights = []
-      weights += self.cell.non_trainable_weights
-      return weights
-    return []
-
-  @property
-  def losses(self):
-    losses = []
-    losses += self.cell.losses
-    return losses + self._losses
-
-  @property
-  def updates(self):
-    updates = []
-    updates += self.cell.updates
-    return updates + self._updates
-
-  def get_weights(self):
-    weights = []
-    weights += self.cell.weights
-    return K.batch_get_value(weights)
-
-  def set_weights(self, weights):
-    tuples = []
-    cell_weights = weights[:len(self.cell.weights)]
-    if cell_weights:
-      tuples.append((self.cell.weights, cell_weights))
-    K.batch_set_value(tuples)
-
 
 def _canonical_to_params(weights, biases, shape, transpose_weights=False):
   """Utility function convert variable to CuDNN compatible parameter.
@@ -2918,8 +3264,7 @@ def standard_lstm(inputs, init_h, init_c, kernel, recurrent_kernel, bias,
       unroll=False,
       time_major=time_major,
       input_length=timesteps)
-  return last_output, outputs, new_states[0], new_states[
-      1], constant_op.constant('cpu', dtype=dtypes.string, name='runtime')
+  return last_output, outputs, new_states[0], new_states[1], _runtime('cpu')
 
 
 def cudnn_lstm(inputs, input_h, input_c, kernel, recurrent_kernel, bias,
@@ -2950,8 +3295,7 @@ def cudnn_lstm(inputs, input_h, input_c, kernel, recurrent_kernel, bias,
   h = h[0]
   c = c[0]
 
-  return last_output, outputs, h, c, constant_op.constant(
-      'cudnn', dtype=dtypes.string, name='runtime')
+  return last_output, outputs, h, c, _runtime('cudnn')
 
 
 def _generate_dropout_mask(ones, rate, training=None, count=1):
@@ -3004,6 +3348,7 @@ def _standardize_args(
     # For either case, we will use num_inputs to split the input list, and
     # restructure the real input into tuple.
     assert initial_state is None and constants is None
+    inputs = nest.flatten(inputs)
     if num_constants is not None:
       constants = inputs[-num_constants:]
       inputs = inputs[:-num_constants]
@@ -3060,3 +3405,18 @@ def _generate_zero_filled_state(batch_size_tensor, state_size, dtype):
     return nest.map_structure(create_zeros, state_size)
   else:
     return create_zeros(state_size)
+
+
+def _generate_defun_backend(unique_api_name, preferred_device, func):
+  function_attributes = {
+      _DEFUN_API_NAME_ATTRIBUTE: unique_api_name,
+      _DEFUN_DEVICE_ATTRIBUTE: preferred_device,
+  }
+  return function.defun_with_attributes(func=func,
+                                        attributes=function_attributes)
+
+
+def _runtime(runtime_name):
+  with ops.device('/cpu:0'):
+    return constant_op.constant(
+        runtime_name, dtype=dtypes.string, name='runtime')
diff --git a/tensorflow/python/keras/layers/recurrent_test.py b/tensorflow/python/keras/layers/recurrent_test.py
index b1449069e3279e27b08ecc383e72aed63525e521..4b7aeb9f22f7c215159a629e3483186f2a96a5e1 100644
--- a/tensorflow/python/keras/layers/recurrent_test.py
+++ b/tensorflow/python/keras/layers/recurrent_test.py
@@ -30,7 +30,8 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.framework import test_util
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
@@ -39,8 +40,8 @@ from tensorflow.python.ops import special_math_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import test
-from tensorflow.python.training import rmsprop
-from tensorflow.python.training.checkpointable import util as checkpointable_util
+from tensorflow.python.training.tracking import object_identity
+from tensorflow.python.training.tracking import util as trackable_util
 from tensorflow.python.util import nest
 
 # Used for nested input/output/state RNN test.
@@ -48,8 +49,8 @@ NestedInput = collections.namedtuple('NestedInput', ['t1', 't2'])
 NestedState = collections.namedtuple('NestedState', ['s1', 's2'])
 
 
-@test_util.run_all_in_graph_and_eager_modes
-class RNNTest(test.TestCase):
+@keras_parameterized.run_all_keras_modes
+class RNNTest(keras_parameterized.TestCase):
 
   def test_minimal_rnn_cell_non_layer(self):
 
@@ -72,8 +73,10 @@ class RNNTest(test.TestCase):
     layer = keras.layers.RNN(cell)
     y = layer(x)
     model = keras.models.Model(x, y)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
 
     # Test stacking.
@@ -83,8 +86,10 @@ class RNNTest(test.TestCase):
     layer = keras.layers.RNN(cells)
     y = layer(x)
     model = keras.models.Model(x, y)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
 
   def test_minimal_rnn_cell_non_layer_multiple_states(self):
@@ -111,8 +116,10 @@ class RNNTest(test.TestCase):
     layer = keras.layers.RNN(cell)
     y = layer(x)
     model = keras.models.Model(x, y)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
 
     # Test stacking.
@@ -124,8 +131,10 @@ class RNNTest(test.TestCase):
     self.assertEqual(layer.cell.output_size, 32)
     y = layer(x)
     model = keras.models.Model(x, y)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
 
   def test_minimal_rnn_cell_layer(self):
@@ -164,8 +173,10 @@ class RNNTest(test.TestCase):
     layer = keras.layers.RNN(cell)
     y = layer(x)
     model = keras.models.Model(x, y)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
 
     # Test basic case serialization.
@@ -188,8 +199,10 @@ class RNNTest(test.TestCase):
     layer = keras.layers.RNN(cells)
     y = layer(x)
     model = keras.models.Model(x, y)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32)))
 
     # Test stacked RNN serialization.
@@ -227,8 +240,10 @@ class RNNTest(test.TestCase):
     y = keras.layers.Lambda(lambda t: array_ops.transpose(t, [1, 0, 2]))(y)
 
     model = keras.models.Model(x, y)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         np.zeros((batch, time_step, embedding_dim)),
         np.zeros((batch, time_step, units)))
@@ -245,8 +260,10 @@ class RNNTest(test.TestCase):
 
     y = keras.layers.Lambda(lambda t: array_ops.transpose(t, [1, 0, 2]))(y)
     model = keras.models.Model(x, y)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         np.zeros((batch, time_step, embedding_dim)),
         np.zeros((batch, time_step, cell_units[-1])))
@@ -260,8 +277,10 @@ class RNNTest(test.TestCase):
         units, time_major=True, return_sequences=True)(mask)
     y = keras.layers.Lambda(lambda t: array_ops.transpose(t, [1, 0, 2]))(rnn)
     model = keras.models.Model(x, y)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         np.zeros((batch, time_step, embedding_dim)),
         np.zeros((batch, time_step, units)))
@@ -272,8 +291,10 @@ class RNNTest(test.TestCase):
     y = rnn_1(x)
 
     model = keras.models.Model(x, y)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         np.zeros((batch, time_step, embedding_dim)),
         np.zeros((batch, time_step, units)))
@@ -346,8 +367,10 @@ class RNNTest(test.TestCase):
     y = layer(x, constants=c)
 
     model = keras.models.Model([x, c], y)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         [np.zeros((6, 5, 5)), np.zeros((6, 3))],
         np.zeros((6, 32))
@@ -384,8 +407,10 @@ class RNNTest(test.TestCase):
     layer = keras.layers.recurrent.RNN(cells)
     y = layer(x, constants=c)
     model = keras.models.Model([x, c], y)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         [np.zeros((6, 5, 5)), np.zeros((6, 3))],
         np.zeros((6, 32))
@@ -398,8 +423,10 @@ class RNNTest(test.TestCase):
     layer = keras.layers.recurrent.RNN(cells)
     y = layer(x, constants=c)
     model = keras.models.Model([x, c], y)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         [np.zeros((6, 5, 5)), np.zeros((6, 3))],
         np.zeros((6, 32))
@@ -470,8 +497,10 @@ class RNNTest(test.TestCase):
     layer = keras.layers.RNN(cell)
     y = layer(x, initial_state=s, constants=c)
     model = keras.models.Model([x, s, c], y)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         [np.zeros((6, 5, 5)), np.zeros((6, 32)), np.zeros((6, 3))],
         np.zeros((6, 32))
@@ -600,8 +629,10 @@ class RNNTest(test.TestCase):
       layer = keras.layers.RNN(cell)
       y = layer(x)
       model = keras.models.Model(x, y)
-      model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                    loss='mse')
+      model.compile(
+          optimizer='rmsprop',
+          loss='mse',
+          run_eagerly=testing_utils.should_run_eagerly())
 
       # Test basic case serialization.
       x_np = np.random.random((6, 5, 5))
@@ -622,8 +653,10 @@ class RNNTest(test.TestCase):
       layer = keras.layers.RNN(cells)
       y = layer(x)
       model = keras.models.Model(x, y)
-      model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                    loss='mse')
+      model.compile(
+          optimizer='rmsprop',
+          loss='mse',
+          run_eagerly=testing_utils.should_run_eagerly())
 
       # Test stacked RNN serialization.
       x_np = np.random.random((6, 5, 5))
@@ -647,7 +680,7 @@ class RNNTest(test.TestCase):
     x = keras.Input((None, 5))
     y = layer(x)
     model = keras.models.Model(x, y)
-    model.compile('sgd', 'mse')
+    model.compile('sgd', 'mse', run_eagerly=testing_utils.should_run_eagerly())
     x_np = np.random.random((6, 5, 5))
     y_np = np.random.random((6, 3))
     model.train_on_batch(x_np, y_np)
@@ -683,19 +716,22 @@ class RNNTest(test.TestCase):
         [tuple(o.as_list()) for o in output_shape],
         expected_output_shape)
 
-  def test_checkpointable_dependencies(self):
+  def test_trackable_dependencies(self):
     rnn = keras.layers.SimpleRNN
     x = np.random.random((2, 2, 2))
     y = np.random.random((2, 2))
     model = keras.models.Sequential()
     model.add(rnn(2))
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.fit(x, y, epochs=1, batch_size=1)
 
     # check whether the model variables are present in the
-    # checkpointable list of objects
-    checkpointed_objects = set(checkpointable_util.list_objects(model))
+    # trackable list of objects
+    checkpointed_objects = object_identity.ObjectIdentitySet(
+        trackable_util.list_objects(model))
     for v in model.variables:
       self.assertIn(v, checkpointed_objects)
 
@@ -722,8 +758,10 @@ class RNNTest(test.TestCase):
                        [None, unit_a, unit_b])
 
     model = keras.models.Model(x, y)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         np.zeros((batch, time_step, input_a, input_b)),
         np.zeros((batch, unit_a, unit_b)))
@@ -738,8 +776,10 @@ class RNNTest(test.TestCase):
     layer = keras.layers.RNN(cells)
     y = layer(x)
     model = keras.models.Model(x, y)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         np.zeros((batch, time_step, input_a, input_b)),
         np.zeros((batch, unit_a * 4, unit_b * 4)))
@@ -761,8 +801,10 @@ class RNNTest(test.TestCase):
     y = layer(x, initial_state=s)
 
     model = keras.models.Model([x, s], y)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch([
         np.zeros((batch, time_step, input_a, input_b)),
         np.zeros((batch, unit_a, unit_b))
@@ -798,8 +840,10 @@ class RNNTest(test.TestCase):
                        [None, state_size])
 
     model = keras.models.Model(x, y)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         np.zeros((batch, time_step, input_size)),
         np.zeros((batch, input_size)))
@@ -853,8 +897,10 @@ class RNNTest(test.TestCase):
     self.assertEqual(outputs[1].shape.as_list(), [None, o2, o3])
 
     model = keras.models.Model((input_1, input_2), outputs)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         [np.zeros((batch, t, i1)), np.zeros((batch, t, i2, i3))],
         [np.zeros((batch, o1)), np.zeros((batch, o2, o3))])
@@ -874,8 +920,10 @@ class RNNTest(test.TestCase):
     self.assertEqual(outputs[1].shape.as_list(), [None, o2, o3])
 
     model = keras.models.Model([input_1, input_2], outputs)
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         [np.zeros((batch, t, i1)),
          np.zeros((batch, t, i2, i3))],
@@ -902,8 +950,10 @@ class RNNTest(test.TestCase):
     self.assertEqual(s2.shape.as_list(), [None, o2, o3])
 
     model = keras.models.Model([input_1, input_2], [output1, output2])
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         [np.zeros((batch, t, i1)),
          np.zeros((batch, t, i2, i3))],
@@ -926,8 +976,10 @@ class RNNTest(test.TestCase):
     self.assertEqual(s2.shape.as_list(), [None, o2, o3])
 
     model = keras.models.Model([input_1, input_2], [output1, output2])
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         [np.zeros((batch, t, i1)),
          np.zeros((batch, t, i2, i3))],
@@ -959,8 +1011,10 @@ class RNNTest(test.TestCase):
 
     model = keras.models.Model([input_1, input_2, init_s1, init_s2],
                                [output1, output2])
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         [np.zeros((batch, t, i1)),
          np.zeros((batch, t, i2, i3)),
@@ -990,8 +1044,10 @@ class RNNTest(test.TestCase):
 
     model = keras.models.Model([input_1, input_2, init_s1, init_s2],
                                [output1, output2])
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch(
         [np.zeros((batch, t, i1)),
          np.zeros((batch, t, i2, i3)),
@@ -1004,18 +1060,17 @@ class RNNTest(test.TestCase):
   def test_peephole_lstm_cell(self):
 
     def _run_cell(cell_fn, **kwargs):
-      with self.cached_session() as sess:
-        inputs = array_ops.one_hot([1, 2, 3, 4], 4)
-        cell = cell_fn(5, **kwargs)
-        cell.build(inputs.shape)
-        initial_state = cell.get_initial_state(
-            inputs=inputs, batch_size=4, dtype=dtypes.float32)
-        inputs, _ = cell(inputs, initial_state)
-        output = inputs
-        if not context.executing_eagerly():
-          self.evaluate(variables_lib.global_variables_initializer())
-          output = self.evaluate(output)
-        return output
+      inputs = array_ops.one_hot([1, 2, 3, 4], 4)
+      cell = cell_fn(5, **kwargs)
+      cell.build(inputs.shape)
+      initial_state = cell.get_initial_state(
+          inputs=inputs, batch_size=4, dtype=dtypes.float32)
+      inputs, _ = cell(inputs, initial_state)
+      output = inputs
+      if not context.executing_eagerly():
+        self.evaluate(variables_lib.global_variables_initializer())
+        output = self.evaluate(output)
+      return output
 
     random_seed.set_random_seed(12345)
     # `recurrent_activation` kwarg is set to sigmoid as that is hardcoded into
@@ -1066,8 +1121,10 @@ class RNNTest(test.TestCase):
     y, s = keras.layers.RNN(
         Cell(), return_state=True)(x_masked, initial_state=s_0)
     model = keras.models.Model([x, s_0], [y, s])
-    model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                  loss='mse')
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
 
     # last time step masked
     x_np = np.array([[[1.], [2.], [0.]]])
@@ -1090,8 +1147,10 @@ class RNNTest(test.TestCase):
       masked_input = mask(x)
       y = layer(masked_input)
       model = keras.models.Model(x, y)
-      model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
-                    loss='mse')
+      model.compile(
+          optimizer='rmsprop',
+          loss='mse',
+          run_eagerly=testing_utils.should_run_eagerly())
 
       np_x = np.ones((6, 5, 5))
       result_1 = model.predict(np_x)
@@ -1105,6 +1164,30 @@ class RNNTest(test.TestCase):
       result_1[5, 3:] = 0
       self.assertAllClose(result_1, result_2)
 
+  def test_unroll_single_step(self):
+    """Even if the time dimension is only one, we should be able to unroll."""
+    cell = keras.layers.SimpleRNNCell(5)
+    x = keras.Input((1, 5))
+    layer = keras.layers.RNN(cell, return_sequences=True, unroll=True)
+    y = layer(x)
+    model = keras.models.Model(x, y)
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
+
+    np_x = np.ones((6, 1, 5))
+    result = model.predict(np_x)
+    self.assertEqual((6, 1, 5), result.shape)
+
+  def test_unroll_zero_step(self):
+    """If the time dimension is None, we should fail to unroll."""
+    cell = keras.layers.SimpleRNNCell(5)
+    x = keras.Input((None, 5))
+    layer = keras.layers.RNN(cell, return_sequences=True, unroll=True)
+    with self.assertRaisesRegexp(ValueError, 'Cannot unroll a RNN.*'):
+      layer(x)
+
 
 class Minimal2DRNNCell(keras.layers.Layer):
   """The minimal 2D RNN cell is a simple combination of 2 1-D RNN cell.
@@ -1212,3 +1295,4 @@ class NestedCell(keras.layers.Layer):
 
 if __name__ == '__main__':
   test.main()
+
diff --git a/tensorflow/python/keras/layers/separable_convolutional_test.py b/tensorflow/python/keras/layers/separable_convolutional_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..8234bfe704d84e0de6e7f60e33df31de5a800cc5
--- /dev/null
+++ b/tensorflow/python/keras/layers/separable_convolutional_test.py
@@ -0,0 +1,167 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for separable convolutional layers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python import keras
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.platform import test
+
+
+@keras_parameterized.run_all_keras_modes
+class SeparableConv1DTest(keras_parameterized.TestCase):
+
+  def _run_test(self, kwargs):
+    num_samples = 2
+    stack_size = 3
+    length = 7
+
+    with self.cached_session(use_gpu=True):
+      testing_utils.layer_test(
+          keras.layers.SeparableConv1D,
+          kwargs=kwargs,
+          input_shape=(num_samples, length, stack_size))
+
+  @parameterized.named_parameters(
+      ('padding_valid', {'padding': 'valid'}),
+      ('padding_same', {'padding': 'same'}),
+      ('padding_same_dilation_2', {'padding': 'same', 'dilation_rate': 2}),
+      ('padding_causal', {'padding': 'causal'}),
+      ('strides', {'strides': 2}),
+      ('dilation_rate', {'dilation_rate': 2}),
+      ('depth_multiplier', {'depth_multiplier': 2}),
+  )
+  def test_separable_conv1d(self, kwargs):
+    kwargs['filters'] = 2
+    kwargs['kernel_size'] = 3
+    self._run_test(kwargs)
+
+  def test_separable_conv1d_regularizers(self):
+    kwargs = {
+        'filters': 3,
+        'kernel_size': 3,
+        'padding': 'valid',
+        'depthwise_regularizer': 'l2',
+        'pointwise_regularizer': 'l2',
+        'bias_regularizer': 'l2',
+        'activity_regularizer': 'l2',
+        'strides': 1
+    }
+    with self.cached_session(use_gpu=True):
+      layer = keras.layers.SeparableConv1D(**kwargs)
+      layer.build((None, 5, 2))
+      self.assertEqual(len(layer.losses), 3)
+      layer(keras.backend.variable(np.ones((1, 5, 2))))
+      self.assertEqual(len(layer.losses), 4)
+
+  def test_separable_conv1d_constraints(self):
+    d_constraint = lambda x: x
+    p_constraint = lambda x: x
+    b_constraint = lambda x: x
+
+    kwargs = {
+        'filters': 3,
+        'kernel_size': 3,
+        'padding': 'valid',
+        'pointwise_constraint': p_constraint,
+        'depthwise_constraint': d_constraint,
+        'bias_constraint': b_constraint,
+        'strides': 1
+    }
+    with self.cached_session(use_gpu=True):
+      layer = keras.layers.SeparableConv1D(**kwargs)
+      layer.build((None, 5, 2))
+      self.assertEqual(layer.depthwise_kernel.constraint, d_constraint)
+      self.assertEqual(layer.pointwise_kernel.constraint, p_constraint)
+      self.assertEqual(layer.bias.constraint, b_constraint)
+
+
+@keras_parameterized.run_all_keras_modes
+class SeparableConv2DTest(keras_parameterized.TestCase):
+
+  def _run_test(self, kwargs):
+    num_samples = 2
+    stack_size = 3
+    num_row = 7
+    num_col = 6
+
+    with self.cached_session(use_gpu=True):
+      testing_utils.layer_test(
+          keras.layers.SeparableConv2D,
+          kwargs=kwargs,
+          input_shape=(num_samples, num_row, num_col, stack_size))
+
+  @parameterized.named_parameters(
+      ('padding_valid', {'padding': 'valid'}),
+      ('padding_same', {'padding': 'same'}),
+      ('padding_same_dilation_2', {'padding': 'same', 'dilation_rate': 2}),
+      ('strides', {'strides': 2}),
+      # Only runs on GPU with CUDA, channels_first is not supported on CPU.
+      # TODO(b/62340061): Support channels_first on CPU.
+      ('data_format', {'data_format': 'channels_first'}),
+      ('dilation_rate', {'dilation_rate': 2}),
+      ('depth_multiplier', {'depth_multiplier': 2}),
+  )
+  def test_separable_conv2d(self, kwargs):
+    kwargs['filters'] = 2
+    kwargs['kernel_size'] = 3
+    if 'data_format' not in kwargs or test.is_gpu_available(cuda_only=True):
+      self._run_test(kwargs)
+
+  def test_separable_conv2d_regularizers(self):
+    kwargs = {
+        'filters': 3,
+        'kernel_size': 3,
+        'padding': 'valid',
+        'depthwise_regularizer': 'l2',
+        'pointwise_regularizer': 'l2',
+        'bias_regularizer': 'l2',
+        'activity_regularizer': 'l2',
+        'strides': 1
+    }
+    with self.cached_session(use_gpu=True):
+      layer = keras.layers.SeparableConv2D(**kwargs)
+      layer.build((None, 5, 5, 2))
+      self.assertEqual(len(layer.losses), 3)
+      layer(keras.backend.variable(np.ones((1, 5, 5, 2))))
+      self.assertEqual(len(layer.losses), 4)
+
+  def test_separable_conv2d_constraints(self):
+    d_constraint = lambda x: x
+    p_constraint = lambda x: x
+    b_constraint = lambda x: x
+
+    kwargs = {
+        'filters': 3,
+        'kernel_size': 3,
+        'padding': 'valid',
+        'pointwise_constraint': p_constraint,
+        'depthwise_constraint': d_constraint,
+        'bias_constraint': b_constraint,
+        'strides': 1
+    }
+    with self.cached_session(use_gpu=True):
+      layer = keras.layers.SeparableConv2D(**kwargs)
+      layer.build((None, 5, 5, 2))
+      self.assertEqual(layer.depthwise_kernel.constraint, d_constraint)
+      self.assertEqual(layer.pointwise_kernel.constraint, p_constraint)
+      self.assertEqual(layer.bias.constraint, b_constraint)
diff --git a/tensorflow/python/keras/layers/serialization.py b/tensorflow/python/keras/layers/serialization.py
index 7c45e08b5c48084cc57569a4d1102a0a7c5b29e1..95adf78aff957bb58b9379972e39aef077f22d4f 100644
--- a/tensorflow/python/keras/layers/serialization.py
+++ b/tensorflow/python/keras/layers/serialization.py
@@ -20,6 +20,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.keras.engine.base_layer import TensorFlowOpLayer
 from tensorflow.python.keras.engine.input_layer import Input
 from tensorflow.python.keras.engine.input_layer import InputLayer
 from tensorflow.python.keras.layers.advanced_activations import *
@@ -36,12 +37,25 @@ from tensorflow.python.keras.layers.pooling import *
 from tensorflow.python.keras.layers.recurrent import *
 from tensorflow.python.keras.layers.wrappers import *
 from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
+from tensorflow.python.util.tf_export import keras_export
 
+# TODO(b/124791387): replace mapping with layer attribute.
+_V2_CONVERSION_TABLE = {
+    # BatchNormalization Layer
+    'BatchNormalizationV1': 'BatchNormalization',
+    'BatchNormalizationV2': 'BatchNormalization',
+}
 
+
+@keras_export('keras.layers.serialize')
 def serialize(layer):
-  return {'class_name': layer.__class__.__name__, 'config': layer.get_config()}
+  layer_class_name = layer.__class__.__name__
+  if layer_class_name in _V2_CONVERSION_TABLE:
+    layer_class_name = _V2_CONVERSION_TABLE[layer_class_name]
+  return {'class_name': layer_class_name, 'config': layer.get_config()}
 
 
+@keras_export('keras.layers.deserialize')
 def deserialize(config, custom_objects=None):
   """Instantiates a layer from a config dictionary.
 
@@ -51,10 +65,11 @@ def deserialize(config, custom_objects=None):
           of custom (non-Keras) objects to class/functions
 
   Returns:
-      Layer instance (may be Model, Sequential, Layer...)
+      Layer instance (may be Model, Sequential, Network, Layer...)
   """
   from tensorflow.python.keras import models  # pylint: disable=g-import-not-at-top
   globs = globals()  # All layers.
+  globs['Network'] = models.Network
   globs['Model'] = models.Model
   globs['Sequential'] = models.Sequential
   return deserialize_keras_object(
diff --git a/tensorflow/python/keras/layers/serialization_test.py b/tensorflow/python/keras/layers/serialization_test.py
index 548c3ec1ac760a33d6eb998e7d601c843bd87779..d322e6afb5413e3b4164ee158f7768be472b4507 100644
--- a/tensorflow/python/keras/layers/serialization_test.py
+++ b/tensorflow/python/keras/layers/serialization_test.py
@@ -38,6 +38,17 @@ class LayerSerializationTest(test.TestCase):
                      keras.initializers.Ones)
     self.assertEqual(new_layer.units, 3)
 
+  def test_serialize_deserialize_batchnorm(self):
+    layer = keras.layers.BatchNormalization(
+        momentum=0.9, beta_initializer='zeros', gamma_regularizer='l2')
+    config = keras.layers.serialize(layer)
+    self.assertEqual(config['class_name'], 'BatchNormalization')
+    new_layer = keras.layers.deserialize(config)
+    self.assertEqual(new_layer.momentum, 0.9)
+    self.assertEqual(new_layer.beta_initializer.__class__,
+                     keras.initializers.Zeros)
+    self.assertEqual(new_layer.gamma_regularizer.__class__,
+                     keras.regularizers.L1L2)
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/layers/simplernn_test.py b/tensorflow/python/keras/layers/simplernn_test.py
index bb3fea26926959c15e76556b836a120c02905c6f..0ee074d19c5eb35bbdaea68de0f69676f3282ee5 100644
--- a/tensorflow/python/keras/layers/simplernn_test.py
+++ b/tensorflow/python/keras/layers/simplernn_test.py
@@ -21,15 +21,15 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python import keras
-from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.eager import context
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 from tensorflow.python.training import gradient_descent
-from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
 
-@tf_test_util.run_all_in_graph_and_eager_modes
-class SimpleRNNLayerTest(test.TestCase):
+@keras_parameterized.run_all_keras_modes
+class SimpleRNNLayerTest(keras_parameterized.TestCase):
 
   def test_return_sequences_SimpleRNN(self):
     num_samples = 2
@@ -50,7 +50,7 @@ class SimpleRNNLayerTest(test.TestCase):
     layer = keras.layers.SimpleRNN(units, input_shape=(None, embedding_dim))
     model = keras.models.Sequential()
     model.add(layer)
-    model.compile(RMSPropOptimizer(0.01), 'mse')
+    model.compile('rmsprop', 'mse')
     x = np.random.random((num_samples, timesteps, embedding_dim))
     y = np.random.random((num_samples, units))
     model.train_on_batch(x, y)
@@ -98,7 +98,6 @@ class SimpleRNNLayerTest(test.TestCase):
     self.assertEqual(layer.cell.recurrent_kernel.constraint, r_constraint)
     self.assertEqual(layer.cell.bias.constraint, b_constraint)
 
-  @tf_test_util.run_v1_only('b/120545219')
   def test_with_masking_layer_SimpleRNN(self):
     layer_class = keras.layers.SimpleRNN
     inputs = np.random.random((2, 3, 4))
@@ -107,8 +106,7 @@ class SimpleRNNLayerTest(test.TestCase):
     model = keras.models.Sequential()
     model.add(keras.layers.Masking(input_shape=(3, 4)))
     model.add(layer_class(units=5, return_sequences=True, unroll=False))
-    model.compile(loss='categorical_crossentropy',
-                  optimizer=RMSPropOptimizer(0.01))
+    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
     model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
 
   def test_from_config_SimpleRNN(self):
@@ -118,93 +116,89 @@ class SimpleRNNLayerTest(test.TestCase):
       l2 = layer_class.from_config(l1.get_config())
       assert l1.get_config() == l2.get_config()
 
+  def test_regularizers_SimpleRNN(self):
+    embedding_dim = 4
+    layer_class = keras.layers.SimpleRNN
+    layer = layer_class(
+        5,
+        return_sequences=False,
+        weights=None,
+        input_shape=(None, embedding_dim),
+        kernel_regularizer=keras.regularizers.l1(0.01),
+        recurrent_regularizer=keras.regularizers.l1(0.01),
+        bias_regularizer='l2',
+        activity_regularizer='l1')
+    layer.build((None, None, 2))
+    self.assertEqual(len(layer.losses), 3)
+
+    x = keras.backend.variable(np.ones((2, 3, 2)))
+    layer(x)
+    if context.executing_eagerly():
+      self.assertEqual(len(layer.losses), 4)
+    else:
+      self.assertEqual(len(layer.get_losses_for(x)), 1)
 
-class SimpleRNNLayerGraphOnlyTest(test.TestCase):
-
-  @tf_test_util.run_v1_only('b/120545219')
   def test_statefulness_SimpleRNN(self):
     num_samples = 2
     timesteps = 3
     embedding_dim = 4
     units = 2
     layer_class = keras.layers.SimpleRNN
-    with self.cached_session():
-      model = keras.models.Sequential()
-      model.add(
-          keras.layers.Embedding(
-              4,
-              embedding_dim,
-              mask_zero=True,
-              input_length=timesteps,
-              batch_input_shape=(num_samples, timesteps)))
-      layer = layer_class(
-          units, return_sequences=False, stateful=True, weights=None)
-      model.add(layer)
-      model.compile(optimizer=gradient_descent.GradientDescentOptimizer(0.01),
-                    loss='mse')
-      out1 = model.predict(np.ones((num_samples, timesteps)))
-      self.assertEqual(out1.shape, (num_samples, units))
-
-      # train once so that the states change
-      model.train_on_batch(
-          np.ones((num_samples, timesteps)), np.ones((num_samples, units)))
-      out2 = model.predict(np.ones((num_samples, timesteps)))
-
-      # if the state is not reset, output should be different
-      self.assertNotEqual(out1.max(), out2.max())
-
-      # check that output changes after states are reset
-      # (even though the model itself didn't change)
-      layer.reset_states()
-      out3 = model.predict(np.ones((num_samples, timesteps)))
-      self.assertNotEqual(out2.max(), out3.max())
-
-      # check that container-level reset_states() works
-      model.reset_states()
-      out4 = model.predict(np.ones((num_samples, timesteps)))
-      np.testing.assert_allclose(out3, out4, atol=1e-5)
-
-      # check that the call to `predict` updated the states
-      out5 = model.predict(np.ones((num_samples, timesteps)))
-      self.assertNotEqual(out4.max(), out5.max())
-
-      # Check masking
-      layer.reset_states()
-
-      left_padded_input = np.ones((num_samples, timesteps))
-      left_padded_input[0, :1] = 0
-      left_padded_input[1, :2] = 0
-      out6 = model.predict(left_padded_input)
-
-      layer.reset_states()
-
-      right_padded_input = np.ones((num_samples, timesteps))
-      right_padded_input[0, -1:] = 0
-      right_padded_input[1, -2:] = 0
-      out7 = model.predict(right_padded_input)
-
-      np.testing.assert_allclose(out7, out6, atol=1e-5)
-
-  @tf_test_util.run_deprecated_v1
-  def test_regularizers_SimpleRNN(self):
-    embedding_dim = 4
-    layer_class = keras.layers.SimpleRNN
-    with self.cached_session():
-      layer = layer_class(
-          5,
-          return_sequences=False,
-          weights=None,
-          input_shape=(None, embedding_dim),
-          kernel_regularizer=keras.regularizers.l1(0.01),
-          recurrent_regularizer=keras.regularizers.l1(0.01),
-          bias_regularizer='l2',
-          activity_regularizer='l1')
-      layer.build((None, None, 2))
-      self.assertEqual(len(layer.losses), 3)
-
-      x = keras.backend.variable(np.ones((2, 3, 2)))
-      layer(x)
-      self.assertEqual(len(layer.get_losses_for(x)), 1)
+    model = keras.models.Sequential()
+    model.add(
+        keras.layers.Embedding(
+            4,
+            embedding_dim,
+            mask_zero=True,
+            input_length=timesteps,
+            batch_input_shape=(num_samples, timesteps)))
+    layer = layer_class(
+        units, return_sequences=False, stateful=True, weights=None)
+    model.add(layer)
+    model.compile(optimizer=gradient_descent.GradientDescentOptimizer(0.01),
+                  loss='mse', run_eagerly=testing_utils.should_run_eagerly())
+    out1 = model.predict(np.ones((num_samples, timesteps)))
+    self.assertEqual(out1.shape, (num_samples, units))
+
+    # train once so that the states change
+    model.train_on_batch(
+        np.ones((num_samples, timesteps)), np.ones((num_samples, units)))
+    out2 = model.predict(np.ones((num_samples, timesteps)))
+
+    # if the state is not reset, output should be different
+    self.assertNotEqual(out1.max(), out2.max())
+
+    # check that output changes after states are reset
+    # (even though the model itself didn't change)
+    layer.reset_states()
+    out3 = model.predict(np.ones((num_samples, timesteps)))
+    self.assertNotEqual(out2.max(), out3.max())
+
+    # check that container-level reset_states() works
+    model.reset_states()
+    out4 = model.predict(np.ones((num_samples, timesteps)))
+    np.testing.assert_allclose(out3, out4, atol=1e-5)
+
+    # check that the call to `predict` updated the states
+    out5 = model.predict(np.ones((num_samples, timesteps)))
+    self.assertNotEqual(out4.max(), out5.max())
+
+    # Check masking
+    layer.reset_states()
+
+    left_padded_input = np.ones((num_samples, timesteps))
+    left_padded_input[0, :1] = 0
+    left_padded_input[1, :2] = 0
+    out6 = model.predict(left_padded_input)
+
+    layer.reset_states()
+
+    right_padded_input = np.ones((num_samples, timesteps))
+    right_padded_input[0, -1:] = 0
+    right_padded_input[1, -2:] = 0
+    out7 = model.predict(right_padded_input)
+
+    np.testing.assert_allclose(out7, out6, atol=1e-5)
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/layers/tensorflow_op_layer_test.py b/tensorflow/python/keras/layers/tensorflow_op_layer_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..993f5a9afd39c4a7131e1f6d42b9a4c6da808ce7
--- /dev/null
+++ b/tensorflow/python/keras/layers/tensorflow_op_layer_test.py
@@ -0,0 +1,244 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test for allowing TF ops to work with Keras Functional API."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python import keras
+from tensorflow.python.framework import ops
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.keras.optimizer_v2 import adam
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_nn_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+from tensorflow.python.util import nest
+
+
+def _single_op_at_end():
+  inputs = keras.Input(shape=(10,))
+  x = keras.layers.Dense(10)(inputs)
+  outputs = gen_nn_ops.relu(x)
+  return inputs, outputs
+
+
+def _single_identity_op_at_end():
+  inputs = keras.Input(shape=(10,))
+  x = keras.layers.Dense(10)(inputs)
+  outputs = array_ops.identity(x)
+  assert 'Identity' in outputs.name
+  return inputs, outputs
+
+
+def _multiple_ops_at_end():
+  inputs = keras.Input(shape=(10,))
+  x = keras.layers.Dense(10)(inputs)
+  x = gen_nn_ops.relu(x)
+  outputs = gen_nn_ops.relu(x)
+  return inputs, outputs
+
+
+def _single_op_in_middle():
+  inputs = keras.Input(shape=(10,))
+  x = keras.layers.Dense(10)(inputs)
+  x = gen_nn_ops.relu(x)
+  outputs = keras.layers.Dense(10)(x)
+  return inputs, outputs
+
+
+def _multiple_ops_in_middle():
+  inputs = keras.Input(shape=(10,))
+  x = keras.layers.Dense(10)(inputs)
+  x = gen_nn_ops.relu(x)
+  x = gen_nn_ops.relu(x)
+  outputs = keras.layers.Dense(10)(x)
+  return inputs, outputs
+
+
+def _single_standalone_branch():
+  inputs = keras.Input(shape=(10,))
+  x = keras.layers.Dense(10)(inputs)
+  outputs = x * 2
+  return inputs, outputs
+
+
+def _single_op_with_attrs():
+  inputs = keras.Input(shape=(10,))
+  x = math_ops.reduce_mean(inputs, axis=1, keepdims=True)
+  outputs = keras.layers.Dense(10)(x)
+  return inputs, outputs
+
+
+def _multiple_uses():
+  inputs = keras.Input(shape=(10,))
+  x = math_ops.reduce_mean(inputs, axis=1, keepdims=True)
+  x1 = keras.layers.Dense(10)(x)
+  x2 = keras.layers.Dense(10)(x)
+  outputs = x1 + x2
+  return inputs, outputs
+
+
+def _op_with_tensor_list():
+  inputs = keras.Input(shape=(10,))
+  x = array_ops.concat([inputs, inputs], axis=1)
+  outputs = keras.layers.Dense(10)(x)
+  return inputs, outputs
+
+
+def _add_n():
+  inputs = keras.Input(shape=(10,))
+  outputs = math_ops.add_n([inputs, inputs, inputs])
+  return inputs, outputs
+
+
+def _reuse_op():
+  inputs = keras.Input(shape=(10,))
+  # This op needs to be checked multiple times.
+  x = gen_nn_ops.relu(inputs)
+  y = keras.layers.Dense(10)(x)
+  x2 = x * 2
+  y2 = keras.layers.Dense(10)(x2)
+  outputs = y + y2
+  return inputs, outputs
+
+
+class LayerWithLayer(keras.layers.Layer):
+
+  def build(self, input_shape):
+    self.bias = self.add_weight(name='bias', dtype='float32')
+    self.layer = keras.layers.Dense(10)
+
+  def call(self, inputs):
+    inputs = inputs * self.bias
+    # Would throw an error if Keras History was created here.
+    return self.layer(inputs)
+
+
+def _inner_layer():
+  inputs = keras.Input(shape=(10,))
+  outputs = LayerWithLayer()(inputs)
+  return inputs, outputs
+
+
+@keras_parameterized.run_all_keras_modes
+class AutoLambdaTest(keras_parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      ('single_op_at_end', _single_op_at_end),
+      ('single_identity_op_at_end', _single_identity_op_at_end),
+      ('multiple_ops_at_end', _multiple_ops_at_end),
+      ('single_op_in_middle', _single_op_in_middle),
+      ('multiple_ops_in_middle', _multiple_ops_in_middle),
+      ('single_standalone_branch', _single_standalone_branch),
+      ('single_op_with_attrs', _single_op_with_attrs),
+      ('multiple_uses', _multiple_uses),
+      ('op_with_tensor_list', _op_with_tensor_list), ('add_n', _add_n),
+      ('_reuse_op', _reuse_op), ('_inner_layer', _inner_layer))
+  def test_autolambda(self, model_fn):
+    inputs, outputs = model_fn()
+    model = keras.Model(inputs, outputs)
+    model.compile(
+        adam.Adam(0.001), 'mse', run_eagerly=testing_utils.should_run_eagerly())
+
+    np_inputs = nest.map_structure(lambda x: np.ones((10, 10), 'float32'),
+                                   inputs)
+    np_outputs = nest.map_structure(lambda x: np.ones((10, 10), 'float32'),
+                                    outputs)
+    model.fit(np_inputs, np_outputs, batch_size=2)
+    model(np_inputs)  # Test calling the model directly on inputs.
+
+    new_model = keras.Model.from_config(
+        model.get_config(), custom_objects={'LayerWithLayer': LayerWithLayer})
+    new_model.compile(
+        adam.Adam(0.001), 'mse', run_eagerly=testing_utils.should_run_eagerly())
+    new_model.fit(np_inputs, np_outputs, batch_size=2)
+    new_model(np_inputs)  # Test calling the new model directly on inputs.
+
+  def test_numerical_correctness_simple(self):
+    x = ops.convert_to_tensor([[-1., 0., -2., 1.]])
+    inputs = keras.Input(shape=(4,))
+    outputs = gen_nn_ops.relu(inputs)
+    model = keras.Model(inputs, outputs)
+    y = self.evaluate(model(x))
+    self.assertAllClose(y, [[0., 0., 0., 1.]])
+
+  def test_numerical_correctness_with_attrs(self):
+    x = ops.convert_to_tensor([[1.5, 1.5], [2.5, 3.5]])
+    inputs = keras.Input(shape=(10,))
+    outputs = math_ops.reduce_mean(inputs, axis=1)
+    model = keras.Model(inputs, outputs)
+    y = self.evaluate(model(x))
+    self.assertAllClose(y, [1.5, 3.])
+
+  def test_numerical_correctness_serialization(self):
+    x = ops.convert_to_tensor([-1., 0., -2., 1.])
+    inputs = keras.Input(shape=(4,))
+    outputs = gen_nn_ops.relu(inputs)
+    model1 = keras.Model(inputs, outputs)
+    y1 = self.evaluate(model1(x))
+    model2 = model1.from_config(model1.get_config())
+    y2 = self.evaluate(model2(x))
+    self.assertAllClose(y1, y2)
+
+  def test_no_tracking(self):
+    x = keras.backend.placeholder((10, 10))
+    keras.layers.Dense(1)(x)
+    self.assertTrue(x._keras_history_checked)
+
+  def test_timing_scales_linearly(self):
+
+    def _construct_graph_of_size(size):
+      start = time.time()
+      x = keras.backend.placeholder(shape=(10, 4))
+
+      for _ in range(size):
+        x = keras.layers.Dense(4)(x)
+        x = gen_nn_ops.relu(x)
+
+      end = time.time()
+      return end - start
+
+    size_50 = _construct_graph_of_size(50)
+    size_500 = _construct_graph_of_size(500)
+
+    # Check construction time grows approx. linearly with size.
+    e = 2  # Fudge factor to prevent flakiness.
+    self.assertLess(size_500, (10 * e) * size_50)
+
+  def test_no_mask_tracking(self):
+    x = keras.backend.placeholder((10, 10))
+    y = keras.layers.Masking(0.)(x)
+    self.assertTrue(y._keras_mask._keras_history_checked)
+
+  def test_built(self):
+    inputs = keras.Input(shape=(10,))
+    outputs = gen_nn_ops.relu(inputs)
+    model = keras.Model(inputs, outputs)
+    model.compile('sgd', 'mse')
+    for layer in model.layers:
+      self.assertTrue(layer.built)
+    # Test something that requires Layers to be built.
+    model.summary()
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/layers/unified_gru_test.py b/tensorflow/python/keras/layers/unified_gru_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..db8610423803354d67824cf6fef7df2db94e62bd
--- /dev/null
+++ b/tensorflow/python/keras/layers/unified_gru_test.py
@@ -0,0 +1,626 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for UnifiedGRU layer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import shutil
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.python import keras
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.ops.losses import losses
+from tensorflow.python.platform import test
+from tensorflow.python.training import gradient_descent
+
+
+# Global config for grappler setting that is used for graph mode test.
+_rewrites = rewriter_config_pb2.RewriterConfig()
+_rewrites.implementation_selector = rewriter_config_pb2.RewriterConfig.ON
+_rewrites.min_graph_nodes = -1
+_graph_options = config_pb2.GraphOptions(rewrite_options=_rewrites)
+_config = config_pb2.ConfigProto(graph_options=_graph_options)
+
+
+@keras_parameterized.run_all_keras_modes(config=_config)
+class UnifiedGRUTest(keras_parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      ('non_tan_activation', 'relu', 'sigmoid', 0, False, True, True),
+      ('non_sigmoid_recur_activation', 'tanh', 'relu', 0, False, True, True),
+      ('use_recurrent_dropout', 'tanh', 'sigmoid', 0.1, False, True, True),
+      ('unroll', 'tanh', 'sigmoid', 0, True, True, True),
+      ('not_use_bias', 'tanh', 'sigmoid', 0, False, False, True),
+      ('not_reset_after', 'tanh', 'sigmoid', 0, False, True, False)
+  )
+  def test_could_use_defun_backend(self, activation, recurrent_activation,
+                                   recurrent_dropout, unroll, use_bias,
+                                   reset_after):
+    layer = keras.layers.UnifiedGRU(1,
+                                    activation=activation,
+                                    recurrent_activation=recurrent_activation,
+                                    recurrent_dropout=recurrent_dropout,
+                                    unroll=unroll,
+                                    use_bias=use_bias,
+                                    reset_after=reset_after)
+    self.assertFalse(layer.could_use_cudnn)
+
+  def test_keras_model_with_gru(self):
+    input_shape = 10
+    rnn_state_size = 8
+    output_shape = 8
+    timestep = 4
+    batch = 100
+    epoch = 10
+
+    (x_train, y_train), _ = testing_utils.get_test_data(
+        train_samples=batch,
+        test_samples=0,
+        input_shape=(timestep, input_shape),
+        num_classes=output_shape)
+    y_train = keras.utils.to_categorical(y_train, output_shape)
+
+    layer = keras.layers.UnifiedGRU(rnn_state_size)
+
+    inputs = keras.layers.Input(
+        shape=[timestep, input_shape], dtype=dtypes.float32)
+
+    outputs = layer(inputs)
+    model = keras.models.Model(inputs, outputs)
+    model.compile('rmsprop', loss='mse')
+    model.fit(x_train, y_train, epochs=epoch)
+    model.evaluate(x_train, y_train)
+    model.predict(x_train)
+
+  def test_dynamic_behavior_GRU(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    layer = keras.layers.UnifiedGRU(units, input_shape=(None, embedding_dim))
+    model = keras.models.Sequential()
+    model.add(layer)
+    model.compile(gradient_descent.GradientDescentOptimizer(0.001), 'mse')
+    x = np.random.random((num_samples, timesteps, embedding_dim))
+    y = np.random.random((num_samples, units))
+    model.train_on_batch(x, y)
+
+  def test_stacking_GRU(self):
+    inputs = np.random.random((2, 3, 4))
+    targets = np.abs(np.random.random((2, 3, 5)))
+    targets /= targets.sum(axis=-1, keepdims=True)
+    model = keras.models.Sequential()
+    model.add(keras.layers.UnifiedGRU(10, return_sequences=True, unroll=False))
+    model.add(keras.layers.UnifiedGRU(5, return_sequences=True, unroll=False))
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+    model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
+
+  def test_from_config_GRU(self):
+    layer_class = keras.layers.UnifiedGRU
+    for stateful in (False, True):
+      l1 = layer_class(units=1, stateful=stateful)
+      l2 = layer_class.from_config(l1.get_config())
+      assert l1.get_config() == l2.get_config()
+
+  def test_unified_gru_feature_parity_with_canonical_gru(self):
+    with context.eager_mode():
+      # Run this test under eager only due to b/120160788 for model.set_weights.
+      input_shape = 10
+      rnn_state_size = 8
+      timestep = 4
+      batch = 20
+
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=batch,
+          test_samples=0,
+          input_shape=(timestep, input_shape),
+          num_classes=rnn_state_size)
+      y_train = keras.utils.to_categorical(y_train, rnn_state_size)
+
+      inputs = keras.layers.Input(
+          shape=[timestep, input_shape], dtype=dtypes.float32)
+      gru_layer = keras.layers.GRU(rnn_state_size,
+                                   recurrent_activation='sigmoid',
+                                   reset_after=True)
+      output = gru_layer(inputs)
+      gru_model = keras.models.Model(inputs, output)
+      weights = gru_model.get_weights()
+      y_1 = gru_model.predict(x_train)
+      gru_model.compile('rmsprop', 'mse')
+      gru_model.fit(x_train, y_train)
+      y_2 = gru_model.predict(x_train)
+
+      with test_util.device(use_gpu=True):
+        cudnn_layer = keras.layers.UnifiedGRU(rnn_state_size,
+                                              recurrent_activation='sigmoid',
+                                              reset_after=True)
+        cudnn_model = keras.models.Model(inputs, cudnn_layer(inputs))
+      cudnn_model.set_weights(weights)
+      y_3 = cudnn_model.predict(x_train)
+      cudnn_model.compile('rmsprop', 'mse')
+      cudnn_model.fit(x_train, y_train)
+      y_4 = cudnn_model.predict(x_train)
+
+      self.assertAllClose(y_1, y_3, rtol=2e-5, atol=2e-5)
+      self.assertAllClose(y_2, y_4, rtol=2e-5, atol=2e-5)
+
+  @parameterized.named_parameters(
+      # test_name, use_bias, bias_initializer, activation
+      ('normal', True, 'zeros'),
+      ('no_bias', False, 'zeros'),
+      ('random_bias', True, 'random_uniform'),
+  )
+  def test_unified_gru_model_save_load(self, use_bias, bias_initializer):
+    temp_dir = self.get_temp_dir()
+    self.addCleanup(shutil.rmtree, temp_dir)
+    h5_path = os.path.join(temp_dir, 'test.h5')
+
+    batch = 10
+    timestep = 3
+    input_dim = 5
+    units = 2
+
+    x = np.random.random((batch, timestep, input_dim))
+
+    def build_model():
+      inputs = keras.layers.Input(
+          shape=[timestep, input_dim], dtype=dtypes.float32)
+      layer = keras.layers.UnifiedGRU(
+          units,
+          use_bias=use_bias,
+          bias_initializer=bias_initializer)
+      output = layer(inputs)
+      return keras.models.Model(inputs, output), layer
+
+    model, layer = build_model()
+    y_ref = model.predict(x)
+    model.save_weights(h5_path)
+
+    cloned_model, new_layer = build_model()
+    cloned_model.load_weights(h5_path)
+    y = cloned_model.predict(x)
+
+    self.assertAllClose(y, y_ref)
+    self.assertAllClose(layer.get_weights(), new_layer.get_weights())
+
+  def test_unified_gru_output_on_multiple_kernel(self):
+    input_shape = 10
+    rnn_state_size = 8
+    timestep = 4
+    batch = 100
+
+    x_train = np.random.random((batch, timestep, input_shape))
+
+    inputs = keras.layers.Input(
+        shape=[timestep, input_shape], dtype=dtypes.float32)
+    with test_util.device(use_gpu=False):
+      layer = keras.layers.UnifiedGRU(rnn_state_size)
+      output = layer(inputs)
+      cpu_model = keras.models.Model(inputs, output)
+      weights = cpu_model.get_weights()
+      y_1 = cpu_model.predict(x_train)
+
+    with test_util.device(use_gpu=True):
+      layer = keras.layers.UnifiedGRU(rnn_state_size)
+      output = layer(inputs)
+      gpu_model = keras.models.Model(inputs, output)
+      gpu_model.set_weights(weights)
+      y_2 = gpu_model.predict(x_train)
+
+    # Note that CuDNN uses 'sigmoid' as activation, so the unified GRU uses
+    # 'sigmoid' as default. Construct the canonical GRU with sigmoid to achieve
+    # the same output.
+    with test_util.device(use_gpu=True):
+      layer = keras.layers.GRU(rnn_state_size,
+                               recurrent_activation='sigmoid',
+                               reset_after=True)
+      output = layer(inputs)
+      canonical_model = keras.models.Model(inputs, output)
+      canonical_model.set_weights(weights)
+      y_3 = canonical_model.predict(x_train)
+
+    self.assertAllClose(y_1, y_2)
+    self.assertAllClose(y_2, y_3)
+
+  @parameterized.named_parameters(
+      # test_name, time_major, go_backwards
+      ('normal', False, False),
+      ('time_major', True, False),
+      ('go_backwards', False, True),
+      ('both', True, True),
+  )
+  def test_time_major_and_go_backward(self, time_major, go_backwards):
+    input_shape = 10
+    rnn_state_size = 8
+    timestep = 4
+    batch = 100
+
+    x_train = np.random.random((batch, timestep, input_shape))
+
+    def build_model(layer_cls):
+      inputs = keras.layers.Input(
+          shape=[timestep, input_shape], dtype=dtypes.float32)
+      layer = layer_cls(rnn_state_size,
+                        recurrent_activation='sigmoid',
+                        time_major=time_major,
+                        return_sequences=True,
+                        go_backwards=go_backwards,
+                        reset_after=True)
+      if time_major:
+        converted_input = keras.layers.Lambda(
+            lambda t: array_ops.transpose(t, [1, 0, 2]))(inputs)
+        outputs = layer(converted_input)
+        outputs = keras.layers.Lambda(
+            lambda t: array_ops.transpose(t, [1, 0, 2]))(outputs)
+      else:
+        outputs = layer(inputs)
+      return keras.models.Model(inputs, outputs)
+
+    gru_model = build_model(keras.layers.GRU)
+    y_ref = gru_model.predict(x_train)
+    weights = gru_model.get_weights()
+
+    unified_gru_model = build_model(keras.layers.UnifiedGRU)
+    unified_gru_model.set_weights(weights)
+    y = unified_gru_model.predict(x_train)
+
+    self.assertAllClose(y, y_ref)
+
+  def test_with_masking_layer_GRU(self):
+    layer_class = keras.layers.UnifiedGRU
+    inputs = np.random.random((2, 3, 4))
+    targets = np.abs(np.random.random((2, 3, 5)))
+    targets /= targets.sum(axis=-1, keepdims=True)
+    model = keras.models.Sequential()
+    model.add(keras.layers.Masking(input_shape=(3, 4)))
+    model.add(layer_class(units=5, return_sequences=True, unroll=False))
+    model.compile(loss='categorical_crossentropy',
+                  optimizer=gradient_descent.GradientDescentOptimizer(0.001))
+    model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
+
+  def test_masking_with_stacking_GRU(self):
+    inputs = np.random.random((2, 3, 4))
+    targets = np.abs(np.random.random((2, 3, 5)))
+    targets /= targets.sum(axis=-1, keepdims=True)
+    model = keras.models.Sequential()
+    model.add(keras.layers.Masking(input_shape=(3, 4)))
+    model.add(keras.layers.UnifiedGRU(10, return_sequences=True, unroll=False))
+    model.add(keras.layers.UnifiedGRU(5, return_sequences=True, unroll=False))
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+    model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
+
+  def test_return_sequences_GRU(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    testing_utils.layer_test(
+        keras.layers.UnifiedGRU,
+        kwargs={'units': units,
+                'return_sequences': True},
+        input_shape=(num_samples, timesteps, embedding_dim))
+
+  def test_dropout_GRU(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    testing_utils.layer_test(
+        keras.layers.UnifiedGRU,
+        kwargs={'units': units,
+                'dropout': 0.1,
+                'recurrent_dropout': 0.1},
+        input_shape=(num_samples, timesteps, embedding_dim))
+
+  def test_constraints_GRU(self):
+    embedding_dim = 4
+    layer_class = keras.layers.UnifiedGRU
+    k_constraint = keras.constraints.max_norm(0.01)
+    r_constraint = keras.constraints.max_norm(0.01)
+    b_constraint = keras.constraints.max_norm(0.01)
+    layer = layer_class(
+        5,
+        return_sequences=False,
+        weights=None,
+        input_shape=(None, embedding_dim),
+        kernel_constraint=k_constraint,
+        recurrent_constraint=r_constraint,
+        bias_constraint=b_constraint)
+    layer.build((None, None, embedding_dim))
+    self.assertEqual(layer.cell.kernel.constraint, k_constraint)
+    self.assertEqual(layer.cell.recurrent_kernel.constraint, r_constraint)
+    self.assertEqual(layer.cell.bias.constraint, b_constraint)
+
+  @parameterized.parameters([0, 1, 2])
+  def test_implementation_mode_GRU(self, implementation_mode):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    testing_utils.layer_test(
+        keras.layers.UnifiedGRU,
+        kwargs={'units': units,
+                'implementation': implementation_mode},
+        input_shape=(num_samples, timesteps, embedding_dim))
+
+  def test_regularizers_GRU(self):
+    embedding_dim = 4
+    layer_class = keras.layers.UnifiedGRU
+    layer = layer_class(
+        5,
+        return_sequences=False,
+        weights=None,
+        input_shape=(None, embedding_dim),
+        kernel_regularizer=keras.regularizers.l1(0.01),
+        recurrent_regularizer=keras.regularizers.l1(0.01),
+        bias_regularizer='l2',
+        activity_regularizer='l1')
+    layer.build((None, None, 2))
+    self.assertEqual(len(layer.losses), 3)
+
+    x = keras.backend.variable(np.ones((2, 3, 2)))
+    layer(x)
+    if context.executing_eagerly():
+      self.assertEqual(len(layer.losses), 4)
+    else:
+      self.assertEqual(len(layer.get_losses_for(x)), 1)
+
+  def test_statefulness_GRU(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    layer_class = keras.layers.UnifiedGRU
+    model = keras.models.Sequential()
+    model.add(
+        keras.layers.Embedding(
+            4,
+            embedding_dim,
+            mask_zero=True,
+            input_length=timesteps,
+            batch_input_shape=(num_samples, timesteps)))
+    layer = layer_class(
+        units, return_sequences=False, stateful=True, weights=None)
+    model.add(layer)
+    model.compile(optimizer=gradient_descent.GradientDescentOptimizer(0.01),
+                  loss='mse', run_eagerly=testing_utils.should_run_eagerly())
+    out1 = model.predict(np.ones((num_samples, timesteps)))
+    self.assertEqual(out1.shape, (num_samples, units))
+
+    # train once so that the states change
+    model.train_on_batch(
+        np.ones((num_samples, timesteps)), np.ones((num_samples, units)))
+    out2 = model.predict(np.ones((num_samples, timesteps)))
+
+    # if the state is not reset, output should be different
+    self.assertNotEqual(out1.max(), out2.max())
+
+    # check that output changes after states are reset
+    # (even though the model itself didn't change)
+    layer.reset_states()
+    out3 = model.predict(np.ones((num_samples, timesteps)))
+    self.assertNotEqual(out2.max(), out3.max())
+
+    # check that container-level reset_states() works
+    model.reset_states()
+    out4 = model.predict(np.ones((num_samples, timesteps)))
+    np.testing.assert_allclose(out3, out4, atol=1e-5)
+
+    # check that the call to `predict` updated the states
+    out5 = model.predict(np.ones((num_samples, timesteps)))
+    self.assertNotEqual(out4.max(), out5.max())
+
+    # Check masking
+    layer.reset_states()
+
+    left_padded_input = np.ones((num_samples, timesteps))
+    left_padded_input[0, :1] = 0
+    left_padded_input[1, :2] = 0
+    out6 = model.predict(left_padded_input)
+
+    layer.reset_states()
+
+    right_padded_input = np.ones((num_samples, timesteps))
+    right_padded_input[0, -1:] = 0
+    right_padded_input[1, -2:] = 0
+    out7 = model.predict(right_padded_input)
+
+    np.testing.assert_allclose(out7, out6, atol=1e-5)
+
+  def test_stateful_GRU_training(self):
+    # See b/123587692 for more context.
+    vocab_size = 20
+    embedding_dim = 10
+    batch_size = 8
+    timestep = 12
+    units = 5
+    x = np.random.randint(0, vocab_size, size=(batch_size, timestep))
+    y = np.random.randint(0, vocab_size, size=(batch_size, timestep))
+
+    model = keras.Sequential([
+        keras.layers.Embedding(vocab_size, embedding_dim,
+                               batch_input_shape=[batch_size, timestep]),
+        keras.layers.UnifiedGRU(units,
+                                return_sequences=True,
+                                stateful=True),
+        keras.layers.Dense(vocab_size)
+    ])
+    model.compile(optimizer='adam',
+                  loss='sparse_categorical_crossentropy',
+                  run_eagerly=testing_utils.should_run_eagerly())
+    model.fit(x, y, epochs=1, shuffle=False)
+
+
+class GRULayerGradientTapeTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_in_tape(self):
+    if not context.executing_eagerly():
+      self.skipTest('bloo')
+    time_steps = 10
+    embedding_size = 11
+    gru_unit_size = 12
+
+    gru = keras.layers.UnifiedGRU(gru_unit_size,
+                                  return_sequences=True,
+                                  return_state=True,
+                                  recurrent_activation='sigmoid',
+                                  recurrent_initializer='glorot_uniform')
+
+    x = random_ops.random_uniform([1, time_steps, embedding_size])
+    y = random_ops.random_uniform([1, gru_unit_size])
+
+    with backprop.GradientTape() as tape:
+      hidden_state = array_ops.zeros([1, gru_unit_size], dtype=dtypes.float32)
+      _, state = gru(x, initial_state=hidden_state)
+
+      loss = math_ops.reduce_mean(math_ops.square(state - y))
+
+    tape.gradient(loss, gru.variables)
+
+
+class GRULayerGraphOnlyTest(test.TestCase):
+
+  # Need session for test
+  @test_util.run_deprecated_v1
+  def test_unifiedGRU(self):
+    input_shape = 10
+    rnn_state_size = 8
+    output_shape = 8
+    timestep = 4
+    batch = 100
+    epoch = 1
+
+    with self.cached_session(config=_config, use_gpu=True) as sess:
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=batch,
+          test_samples=0,
+          input_shape=(timestep, input_shape),
+          num_classes=output_shape)
+      y_train = keras.utils.to_categorical(y_train, output_shape)
+
+      layer = keras.layers.UnifiedGRU(rnn_state_size, return_runtime=True)
+
+      inputs = array_ops.placeholder(
+          dtypes.float32, shape=(None, timestep, input_shape), name='inputs')
+      predict = array_ops.placeholder(
+          dtypes.float32, shape=(None, output_shape), name='predict')
+
+      outputs, runtime = layer(inputs)
+      loss = losses.softmax_cross_entropy(predict, outputs)
+      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+      train_op = optimizer.minimize(loss)
+
+      sess.run([variables.global_variables_initializer()])
+      existing_loss = 0
+      for _ in range(epoch):
+        loss_value, _, runtime_value = sess.run([loss, train_op, runtime], {
+            inputs: x_train,
+            predict: y_train
+        })
+        if test.is_gpu_available():
+          self.assertEqual(runtime_value, b'cudnn')
+        else:
+          self.assertEqual(runtime_value, b'cpu')
+        # Make sure the loss is updated for every epoch
+        # (layer weights properly updated).
+        self.assertNotEqual(existing_loss, loss_value)
+        existing_loss = loss_value
+
+  # Need session for test
+  @test_util.run_deprecated_v1
+  def test_UnifiedGRU_with_cond(self):
+    # This test is to demonstrate the graph rewrite of grappler plugin under
+    # the condition that the function returns different number of internal
+    # states.
+    input_shape = 10
+    rnn_state_size = 8
+    output_shape = 8
+    timestep = 4
+    batch = 100
+    epoch = 1
+
+    with self.cached_session(config=_config, use_gpu=True) as sess:
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=batch,
+          test_samples=0,
+          input_shape=(timestep, input_shape),
+          num_classes=output_shape)
+      y_train = keras.utils.to_categorical(y_train, output_shape)
+
+      layer = keras.layers.UnifiedGRU(rnn_state_size, return_runtime=True)
+
+      inputs = array_ops.placeholder(
+          dtypes.float32, shape=(None, timestep, input_shape), name='inputs')
+      predict = array_ops.placeholder(
+          dtypes.float32, shape=(None, output_shape), name='predict')
+
+      zeros = array_ops.zeros([batch, output_shape])
+      dummy_runtime = constant_op.constant(
+          'unknown', dtype=dtypes.string, name='runtime')
+      a = constant_op.constant(0)
+      b = constant_op.constant(1)
+      # Will always run the GRU layer.
+      outputs, runtime = control_flow_ops.cond(
+          gen_math_ops.less(a, b),
+          lambda: layer(inputs),
+          lambda: (zeros, dummy_runtime))
+      loss = losses.softmax_cross_entropy(predict, outputs)
+      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+      train_op = optimizer.minimize(loss)
+
+      sess.run([variables.global_variables_initializer()])
+      existing_loss = 0
+
+      for _ in range(epoch):
+        loss_value, _, runtime_value = sess.run([loss, train_op, runtime], {
+            inputs: x_train,
+            predict: y_train
+        })
+        if test.is_gpu_available():
+          self.assertEqual(runtime_value, b'cudnn')
+        else:
+          self.assertEqual(runtime_value, b'cpu')
+        # Make sure the loss is updated for every epoch
+        # (layer weights properly updated).
+        self.assertNotEqual(existing_loss, loss_value)
+        existing_loss = loss_value
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/layers/unified_lstm_test.py b/tensorflow/python/keras/layers/unified_lstm_test.py
index 932b2d331dcb60c6ff3a70ec418d47424d4b8575..938c87c6b1aa2e493e2b053630381031c628d210 100644
--- a/tensorflow/python/keras/layers/unified_lstm_test.py
+++ b/tensorflow/python/keras/layers/unified_lstm_test.py
@@ -33,9 +33,8 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
-from tensorflow.python.keras.layers.cudnn_recurrent import CuDNNLSTM
-from tensorflow.python.keras.layers.recurrent import UnifiedLSTM
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_math_ops
@@ -48,135 +47,252 @@ from tensorflow.python.training import gradient_descent
 
 # Global config for grappler setting that is used for graph mode test.
 _rewrites = rewriter_config_pb2.RewriterConfig()
-_rewrites.function_optimization = rewriter_config_pb2.RewriterConfig.OFF
-_customer_optimizer = _rewrites.custom_optimizers.add()
-_customer_optimizer.name = 'ExperimentalImplementationSelector'
+_rewrites.implementation_selector = rewriter_config_pb2.RewriterConfig.ON
 _rewrites.min_graph_nodes = -1
 _graph_options = config_pb2.GraphOptions(rewrite_options=_rewrites)
 _config = config_pb2.ConfigProto(graph_options=_graph_options)
 
 
-@test_util.run_v1_only('b/120545219')
-class UnifiedLSTMTest(test.TestCase, parameterized.TestCase):
+@keras_parameterized.run_all_keras_modes(config=_config)
+class UnifiedLSTMTest(keras_parameterized.TestCase):
 
-  def test_unifiedLSTM(self):
-    input_shape = 10
-    rnn_state_size = 8
-    output_shape = 8
-    timestep = 4
-    batch = 100
-    epoch = 1
+  @parameterized.named_parameters(
+      ('non_tan_activation', 'relu', 'sigmoid', 0, False, True),
+      ('non_sigmoid_recur_activation', 'tanh', 'relu', 0, False, True),
+      ('use_recurrent_dropout', 'tanh', 'sigmoid', 0.1, False, True),
+      ('unroll', 'tanh', 'sigmoid', 0, True, True),
+      ('not_use_bias', 'tanh', 'sigmoid', 0, False, False),
+  )
+  def test_could_use_defun_backend(self, activation, recurrent_activation,
+                                   recurrent_dropout, unroll, use_bias):
+    layer = keras.layers.UnifiedLSTM(
+        1,
+        activation=activation,
+        recurrent_activation=recurrent_activation,
+        recurrent_dropout=recurrent_dropout,
+        unroll=unroll,
+        use_bias=use_bias)
+    self.assertFalse(layer.could_use_cudnn)
 
-    with self.cached_session(config=_config, use_gpu=True) as sess:
-      (x_train, y_train), _ = testing_utils.get_test_data(
-          train_samples=batch,
-          test_samples=0,
-          input_shape=(timestep, input_shape),
-          num_classes=output_shape)
-      y_train = keras.utils.to_categorical(y_train, output_shape)
+  def test_static_shape_inference_LSTM(self):
+    # Github issue: 15165
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
 
-      layer = UnifiedLSTM(rnn_state_size, return_runtime=True)
+    model = keras.models.Sequential()
+    inputs = keras.layers.Dense(
+        embedding_dim, input_shape=(timesteps, embedding_dim))
+    model.add(inputs)
+    layer = keras.layers.UnifiedLSTM(units, return_sequences=True)
+    model.add(layer)
+    outputs = model.layers[-1].output
+    self.assertEqual(outputs.get_shape().as_list(), [None, timesteps, units])
 
-      inputs = array_ops.placeholder(
-          dtypes.float32, shape=(None, timestep, input_shape), name='inputs')
-      predict = array_ops.placeholder(
-          dtypes.float32, shape=(None, output_shape), name='predict')
+  def test_dynamic_behavior_LSTM(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    layer = keras.layers.UnifiedLSTM(units, input_shape=(None, embedding_dim))
+    model = keras.models.Sequential()
+    model.add(layer)
+    model.compile(gradient_descent.GradientDescentOptimizer(0.001), 'mse')
+    x = np.random.random((num_samples, timesteps, embedding_dim))
+    y = np.random.random((num_samples, units))
+    model.train_on_batch(x, y)
 
-      outputs, runtime = layer(inputs)
-      loss = losses.softmax_cross_entropy(predict, outputs)
-      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
-      train_op = optimizer.minimize(loss)
+  def test_stacking_LSTM(self):
+    inputs = np.random.random((2, 3, 4))
+    targets = np.abs(np.random.random((2, 3, 5)))
+    targets /= targets.sum(axis=-1, keepdims=True)
+    model = keras.models.Sequential()
+    model.add(keras.layers.UnifiedLSTM(10, return_sequences=True, unroll=False))
+    model.add(keras.layers.UnifiedLSTM(5, return_sequences=True, unroll=False))
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+    model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
 
-      sess.run([variables.global_variables_initializer()])
-      existing_loss = 0
-      for _ in range(epoch):
-        loss_value, _, runtime_value = sess.run([loss, train_op, runtime], {
-            inputs: x_train,
-            predict: y_train
-        })
-        if test.is_gpu_available():
-          self.assertEqual(runtime_value, b'cudnn')
-        else:
-          self.assertEqual(runtime_value, b'cpu')
-        # Make sure the loss is updated for every epoch
-        # (layer weights properly updated).
-        self.assertNotEqual(existing_loss, loss_value)
-        existing_loss = loss_value
+  def test_from_config_LSTM(self):
+    layer_class = keras.layers.UnifiedLSTM
+    for stateful in (False, True):
+      l1 = layer_class(units=1, stateful=stateful)
+      l2 = layer_class.from_config(l1.get_config())
+      assert l1.get_config() == l2.get_config()
 
-  def test_unifiedLSTM_with_cond(self):
-    # This test is to demonstrate the graph rewrite of grappler plugin under
-    # the condition that the function returns different number of internal
-    # states.
-    input_shape = 10
-    rnn_state_size = 8
-    output_shape = 8
-    timestep = 4
-    batch = 100
-    epoch = 1
+  def test_specify_initial_state_keras_tensor(self):
+    num_states = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 3
+    num_samples = 2
 
-    with self.cached_session(config=_config, use_gpu=True) as sess:
-      (x_train, y_train), _ = testing_utils.get_test_data(
-          train_samples=batch,
-          test_samples=0,
-          input_shape=(timestep, input_shape),
-          num_classes=output_shape)
-      y_train = keras.utils.to_categorical(y_train, output_shape)
+    # Test with Keras tensor
+    inputs = keras.Input((timesteps, embedding_dim))
+    initial_state = [keras.Input((units,)) for _ in range(num_states)]
+    layer = keras.layers.UnifiedLSTM(units)
+    if len(initial_state) == 1:
+      output = layer(inputs, initial_state=initial_state[0])
+    else:
+      output = layer(inputs, initial_state=initial_state)
+    assert initial_state[0] in layer._inbound_nodes[0].input_tensors
 
-      layer = UnifiedLSTM(rnn_state_size, return_runtime=True)
+    model = keras.models.Model([inputs] + initial_state, output)
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=gradient_descent.GradientDescentOptimizer(0.01))
 
-      inputs = array_ops.placeholder(
-          dtypes.float32, shape=(None, timestep, input_shape), name='inputs')
-      predict = array_ops.placeholder(
-          dtypes.float32, shape=(None, output_shape), name='predict')
+    inputs = np.random.random((num_samples, timesteps, embedding_dim))
+    initial_state = [
+        np.random.random((num_samples, units)) for _ in range(num_states)
+    ]
+    targets = np.random.random((num_samples, units))
+    model.train_on_batch([inputs] + initial_state, targets)
 
-      zeros = array_ops.zeros([batch, output_shape])
-      dummy_runtime = constant_op.constant(
-          'unknown', dtype=dtypes.string, name='runtime')
-      a = constant_op.constant(0)
-      b = constant_op.constant(1)
-      # Will always run the lstm layer.
-      outputs, runtime = control_flow_ops.cond(
-          gen_math_ops.less(a, b),
-          lambda: layer(inputs),
-          lambda: (zeros, dummy_runtime))
-      loss = losses.softmax_cross_entropy(predict, outputs)
-      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
-      train_op = optimizer.minimize(loss)
+  def DISABLED_test_specify_initial_state_non_keras_tensor(self):
+    num_states = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 3
+    num_samples = 2
 
-      sess.run([variables.global_variables_initializer()])
-      existing_loss = 0
+    # Test with non-Keras tensor
+    inputs = keras.Input((timesteps, embedding_dim))
+    initial_state = [
+        keras.backend.random_normal_variable((num_samples, units), 0, 1)
+        for _ in range(num_states)
+    ]
+    layer = keras.layers.UnifiedLSTM(units)
+    output = layer(inputs, initial_state=initial_state)
 
-      for _ in range(epoch):
-        loss_value, _, runtime_value = sess.run([loss, train_op, runtime], {
-            inputs: x_train,
-            predict: y_train
-        })
-        if test.is_gpu_available():
-          self.assertEqual(runtime_value, b'cudnn')
-        else:
-          self.assertEqual(runtime_value, b'cpu')
-        # Make sure the loss is updated for every epoch
-        # (layer weights properly updated).
-        self.assertNotEqual(existing_loss, loss_value)
-        existing_loss = loss_value
+    model = keras.models.Model(inputs, output)
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=gradient_descent.GradientDescentOptimizer(0.01))
 
-  @parameterized.named_parameters(
-      ('_non_tan_activation', 'relu', 0, False, True, None),
-      ('_use_recurrent_dropout', 'tanh', 0.1, False, True, None),
-      ('_unroll', 'tanh', 0, True, True, None),
-      ('_not_use_bias', 'tanh', 0, False, False, None),
-      ('_use_bias_regularizer', 'tanh', 0, False, True, 'l2')
-  )
-  @test_util.run_in_graph_and_eager_modes(config=_config)
-  def test_could_use_defun_backend(self, activation, recurrent_dropout,
-                                   unroll, use_bias, bias_regularizer):
-    layer = UnifiedLSTM(1,
-                        activation=activation,
-                        recurrent_dropout=recurrent_dropout,
-                        unroll=unroll,
-                        use_bias=use_bias,
-                        bias_regularizer=bias_regularizer)
-    self.assertFalse(layer.could_use_cudnn)
+    inputs = np.random.random((num_samples, timesteps, embedding_dim))
+    targets = np.random.random((num_samples, units))
+    model.train_on_batch(inputs, targets)
+
+  def test_reset_states_with_values(self):
+    num_states = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 3
+    num_samples = 2
+
+    layer = keras.layers.UnifiedLSTM(units, stateful=True)
+    layer.build((num_samples, timesteps, embedding_dim))
+    layer.reset_states()
+    assert len(layer.states) == num_states
+    assert layer.states[0] is not None
+    self.assertAllClose(
+        keras.backend.eval(layer.states[0]),
+        np.zeros(keras.backend.int_shape(layer.states[0])),
+        atol=1e-4)
+    state_shapes = [keras.backend.int_shape(state) for state in layer.states]
+    values = [np.ones(shape) for shape in state_shapes]
+    if len(values) == 1:
+      values = values[0]
+    layer.reset_states(values)
+    self.assertAllClose(
+        keras.backend.eval(layer.states[0]),
+        np.ones(keras.backend.int_shape(layer.states[0])),
+        atol=1e-4)
+
+    # Test with invalid data
+    with self.assertRaises(ValueError):
+      layer.reset_states([1] * (len(layer.states) + 1))
+
+  def test_specify_state_with_masking(self):
+    num_states = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 3
+    num_samples = 2
+
+    inputs = keras.Input((timesteps, embedding_dim))
+    _ = keras.layers.Masking()(inputs)
+    initial_state = [keras.Input((units,)) for _ in range(num_states)]
+    output = keras.layers.UnifiedLSTM(units)(
+        inputs, initial_state=initial_state)
+
+    model = keras.models.Model([inputs] + initial_state, output)
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+
+    inputs = np.random.random((num_samples, timesteps, embedding_dim))
+    initial_state = [
+        np.random.random((num_samples, units)) for _ in range(num_states)
+    ]
+    targets = np.random.random((num_samples, units))
+    model.train_on_batch([inputs] + initial_state, targets)
+
+  def test_return_state(self):
+    num_states = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 3
+    num_samples = 2
+
+    inputs = keras.Input(batch_shape=(num_samples, timesteps, embedding_dim))
+    layer = keras.layers.UnifiedLSTM(units, return_state=True, stateful=True)
+    outputs = layer(inputs)
+    state = outputs[1:]
+    assert len(state) == num_states
+    model = keras.models.Model(inputs, state[0])
+
+    inputs = np.random.random((num_samples, timesteps, embedding_dim))
+    state = model.predict(inputs)
+    self.assertAllClose(keras.backend.eval(layer.states[0]), state, atol=1e-4)
+
+  def test_state_reuse(self):
+    timesteps = 3
+    embedding_dim = 4
+    units = 3
+    num_samples = 2
+
+    inputs = keras.Input(batch_shape=(num_samples, timesteps, embedding_dim))
+    layer = keras.layers.UnifiedLSTM(
+        units, return_state=True, return_sequences=True)
+    outputs = layer(inputs)
+    output, state = outputs[0], outputs[1:]
+    output = keras.layers.UnifiedLSTM(units)(output, initial_state=state)
+    model = keras.models.Model(inputs, output)
+
+    inputs = np.random.random((num_samples, timesteps, embedding_dim))
+    model.predict(inputs)
+
+  def test_initial_states_as_other_inputs(self):
+    timesteps = 3
+    embedding_dim = 4
+    units = 3
+    num_samples = 2
+    num_states = 2
+    layer_class = keras.layers.UnifiedLSTM
+
+    # Test with Keras tensor
+    main_inputs = keras.Input((timesteps, embedding_dim))
+    initial_state = [keras.Input((units,)) for _ in range(num_states)]
+    inputs = [main_inputs] + initial_state
+
+    layer = layer_class(units)
+    output = layer(inputs)
+    assert initial_state[0] in layer._inbound_nodes[0].input_tensors
+
+    model = keras.models.Model(inputs, output)
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+
+    main_inputs = np.random.random((num_samples, timesteps, embedding_dim))
+    initial_state = [
+        np.random.random((num_samples, units)) for _ in range(num_states)
+    ]
+    targets = np.random.random((num_samples, units))
+    model.train_on_batch([main_inputs] + initial_state, targets)
 
   def test_unified_lstm_feature_parity_with_canonical_lstm(self):
     with context.eager_mode():
@@ -201,101 +317,77 @@ class UnifiedLSTMTest(test.TestCase, parameterized.TestCase):
       lstm_model = keras.models.Model(inputs, output)
       weights = lstm_model.get_weights()
       y_1 = lstm_model.predict(x_train)
-      lstm_model.compile('rmsprop', 'mse')
-      lstm_model.fit(x_train, y_train)
-      y_2 = lstm_model.predict(x_train)
-
-      with test_util.device(use_gpu=True):
-        cudnn_layer = keras.layers.UnifiedLSTM(rnn_state_size,
-                                               recurrent_activation='sigmoid')
-        cudnn_model = keras.models.Model(inputs, cudnn_layer(inputs))
-      cudnn_model.set_weights(weights)
-      y_3 = cudnn_model.predict(x_train)
-      cudnn_model.compile('rmsprop', 'mse')
-      cudnn_model.fit(x_train, y_train)
-      y_4 = cudnn_model.predict(x_train)
-
-      self.assertAllClose(y_1, y_3)
-      self.assertAllClose(y_2, y_4)
-
-  @parameterized.named_parameters(
-      # test_name, use_bias, bias_initializer, activation
-      ('normal', True, 'zeros'),
-      ('no_bias', False, 'zeros'),
-      ('random_bias', True, 'random_uniform'),
-  )
-  @test_util.run_in_graph_and_eager_modes(config=_config)
-  def test_unified_lstm_model_save_load(self, use_bias, bias_initializer):
-    temp_dir = self.get_temp_dir()
-    self.addCleanup(shutil.rmtree, temp_dir)
-    h5_path = os.path.join(temp_dir, 'test.h5')
-
-    batch = 10
-    timestep = 3
-    input_dim = 5
-    units = 2
-
-    x = np.random.random((batch, timestep, input_dim))
-
-    def build_model():
-      inputs = keras.layers.Input(
-          shape=[timestep, input_dim], dtype=dtypes.float32)
-      layer = keras.layers.UnifiedLSTM(
-          units,
-          use_bias=use_bias,
-          bias_initializer=bias_initializer)
-      output = layer(inputs)
-      return keras.models.Model(inputs, output), layer
-
-    model, layer = build_model()
-    y_ref = model.predict(x)
-    model.save_weights(h5_path)
-
-    cloned_model, new_layer = build_model()
-    cloned_model.load_weights(h5_path)
-    y = cloned_model.predict(x)
-
-    self.assertAllClose(y, y_ref)
-    self.assertAllClose(layer.get_weights(), new_layer.get_weights())
+      lstm_model.compile('rmsprop', 'mse')
+      lstm_model.fit(x_train, y_train)
+      y_2 = lstm_model.predict(x_train)
 
-  @test_util.run_in_graph_and_eager_modes(config=_config)
-  def test_unified_lstm_output_on_multiple_kernel(self):
-    input_shape = 10
-    rnn_state_size = 8
-    timestep = 4
-    batch = 100
+      with test_util.device(use_gpu=True):
+        cudnn_layer = keras.layers.UnifiedLSTM(rnn_state_size)
+        cudnn_model = keras.models.Model(inputs, cudnn_layer(inputs))
+      cudnn_model.set_weights(weights)
+      y_3 = cudnn_model.predict(x_train)
+      cudnn_model.compile('rmsprop', 'mse')
+      cudnn_model.fit(x_train, y_train)
+      y_4 = cudnn_model.predict(x_train)
 
-    x_train = np.random.random((batch, timestep, input_shape))
+      self.assertAllClose(y_1, y_3, rtol=1e-5, atol=1e-5)
+      self.assertAllClose(y_2, y_4, rtol=1e-5, atol=1e-5)
 
-    inputs = keras.layers.Input(
-        shape=[timestep, input_shape], dtype=dtypes.float32)
-    with test_util.device(use_gpu=False):
-      # Note that CuDNN use 'sigmoid' as activation. Force the CPU
-      # implementation to use 'sigmoid' so that it will generate same output as
-      # CuDNN implementation.
-      layer = UnifiedLSTM(rnn_state_size, recurrent_activation='sigmoid')
-      output = layer(inputs)
-      cpu_model = keras.models.Model(inputs, output)
-      weights = cpu_model.get_weights()
-      y_1 = cpu_model.predict(x_train)
+  @parameterized.named_parameters(('v0', 0), ('v1', 1), ('v2', 2))
+  def test_implementation_mode_LSTM(self, implementation_mode):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    testing_utils.layer_test(
+        keras.layers.UnifiedLSTM,
+        kwargs={
+            'units': units,
+            'implementation': implementation_mode
+        },
+        input_shape=(num_samples, timesteps, embedding_dim))
 
-    with test_util.device(use_gpu=True):
-      layer = UnifiedLSTM(rnn_state_size, recurrent_activation='sigmoid')
-      output = layer(inputs)
-      gpu_model = keras.models.Model(inputs, output)
-      gpu_model.set_weights(weights)
-      y_2 = gpu_model.predict(x_train)
+    layer_class = keras.layers.UnifiedLSTM
+    k_constraint = keras.constraints.max_norm(0.01)
+    r_constraint = keras.constraints.max_norm(0.01)
+    b_constraint = keras.constraints.max_norm(0.01)
+    layer = layer_class(
+        5,
+        return_sequences=False,
+        weights=None,
+        input_shape=(None, embedding_dim),
+        kernel_constraint=k_constraint,
+        recurrent_constraint=r_constraint,
+        bias_constraint=b_constraint)
+    layer.build((None, None, embedding_dim))
+    self.assertEqual(layer.cell.kernel.constraint, k_constraint)
+    self.assertEqual(layer.cell.recurrent_kernel.constraint, r_constraint)
+    self.assertEqual(layer.cell.bias.constraint, b_constraint)
 
-    with test_util.device(use_gpu=True):
-      layer = keras.layers.LSTM(rnn_state_size, recurrent_activation='sigmoid')
-      output = layer(inputs)
-      canonical_model = keras.models.Model(inputs, output)
-      # Remove the extra cudnn bias since canonical lstm will not use it.
-      canonical_model.set_weights(weights[:3])
-      y_3 = canonical_model.predict(x_train)
+    layer_class = keras.layers.UnifiedLSTM
+    inputs = np.random.random((2, 3, 4))
+    targets = np.abs(np.random.random((2, 3, 5)))
+    targets /= targets.sum(axis=-1, keepdims=True)
+    model = keras.models.Sequential()
+    model.add(keras.layers.Masking(input_shape=(3, 4)))
+    model.add(layer_class(units=5, return_sequences=True, unroll=False))
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+    model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
 
-    self.assertAllClose(y_1, y_2)
-    self.assertAllClose(y_2, y_3)
+  def test_masking_with_stacking_LSTM(self):
+    inputs = np.random.random((2, 3, 4))
+    targets = np.abs(np.random.random((2, 3, 5)))
+    targets /= targets.sum(axis=-1, keepdims=True)
+    model = keras.models.Sequential()
+    model.add(keras.layers.Masking(input_shape=(3, 4)))
+    model.add(keras.layers.UnifiedLSTM(10, return_sequences=True, unroll=False))
+    model.add(keras.layers.UnifiedLSTM(5, return_sequences=True, unroll=False))
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+    model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
 
   @parameterized.named_parameters(
       # test_name, time_major, go_backwards
@@ -304,7 +396,6 @@ class UnifiedLSTMTest(test.TestCase, parameterized.TestCase):
       ('go_backwards', False, True),
       ('both', True, True),
   )
-  @test_util.run_in_graph_and_eager_modes(config=_config)
   def test_time_major_and_go_backward(self, time_major, go_backwards):
     input_shape = 10
     rnn_state_size = 8
@@ -341,8 +432,6 @@ class UnifiedLSTMTest(test.TestCase, parameterized.TestCase):
 
     self.assertAllClose(y, y_ref)
 
-  @test_util.run_in_graph_and_eager_modes(config=_config)
-  def test_keras_model_with_lstm(self):
     input_shape = 10
     rnn_state_size = 8
     output_shape = 8
@@ -357,7 +446,7 @@ class UnifiedLSTMTest(test.TestCase, parameterized.TestCase):
         num_classes=output_shape)
     y_train = keras.utils.to_categorical(y_train, output_shape)
 
-    layer = UnifiedLSTM(rnn_state_size)
+    layer = keras.layers.UnifiedLSTM(rnn_state_size)
 
     inputs = keras.layers.Input(
         shape=[timestep, input_shape], dtype=dtypes.float32)
@@ -369,412 +458,327 @@ class UnifiedLSTMTest(test.TestCase, parameterized.TestCase):
     model.evaluate(x_train, y_train)
     model.predict(x_train)
 
-  @test_util.run_in_graph_and_eager_modes(config=_config)
-  def test_return_sequences_LSTM(self):
-    num_samples = 2
-    timesteps = 3
-    embedding_dim = 4
-    units = 2
-    testing_utils.layer_test(
-        UnifiedLSTM,
-        kwargs={
-            'units': units,
-            'return_sequences': True
-        },
-        input_shape=(num_samples, timesteps, embedding_dim))
+  @parameterized.named_parameters(
+      # test_name, use_bias, bias_initializer, activation
+      ('normal', True, 'zeros'),
+      ('no_bias', False, 'zeros'),
+      ('random_bias', True, 'random_uniform'),
+  )
+  def test_unified_lstm_model_save_load(self, use_bias, bias_initializer):
+    temp_dir = self.get_temp_dir()
+    self.addCleanup(shutil.rmtree, temp_dir)
+    h5_path = os.path.join(temp_dir, 'test.h5')
 
-  @test_util.run_in_graph_and_eager_modes(config=_config)
-  def test_static_shape_inference_LSTM(self):
-    # Github issue: 15165
-    timesteps = 3
-    embedding_dim = 4
+    batch = 10
+    timestep = 3
+    input_dim = 5
     units = 2
 
-    model = keras.models.Sequential()
-    inputs = keras.layers.Dense(
-        embedding_dim, input_shape=(timesteps, embedding_dim))
-    model.add(inputs)
-    layer = UnifiedLSTM(units, return_sequences=True)
-    model.add(layer)
-    outputs = model.layers[-1].output
-    self.assertEqual(outputs.get_shape().as_list(), [None, timesteps, units])
+    x = np.random.random((batch, timestep, input_dim))
 
-  @test_util.run_in_graph_and_eager_modes(config=_config)
-  def test_dynamic_behavior_LSTM(self):
-    num_samples = 2
-    timesteps = 3
-    embedding_dim = 4
-    units = 2
-    layer = UnifiedLSTM(units, input_shape=(None, embedding_dim))
-    model = keras.models.Sequential()
-    model.add(layer)
-    model.compile(gradient_descent.GradientDescentOptimizer(0.001), 'mse')
-    x = np.random.random((num_samples, timesteps, embedding_dim))
-    y = np.random.random((num_samples, units))
-    model.train_on_batch(x, y)
+    def build_model():
+      inputs = keras.layers.Input(
+          shape=[timestep, input_dim], dtype=dtypes.float32)
+      layer = keras.layers.UnifiedLSTM(
+          units,
+          use_bias=use_bias,
+          bias_initializer=bias_initializer)
+      output = layer(inputs)
+      return keras.models.Model(inputs, output), layer
 
-  @test_util.run_in_graph_and_eager_modes(config=_config)
-  def test_dropout_LSTM(self):
-    num_samples = 2
-    timesteps = 3
-    embedding_dim = 4
-    units = 2
-    testing_utils.layer_test(
-        UnifiedLSTM,
-        kwargs={
-            'units': units,
-            'dropout': 0.1,
-            'recurrent_dropout': 0.1
-        },
-        input_shape=(num_samples, timesteps, embedding_dim))
+    model, layer = build_model()
+    y_ref = model.predict(x)
+    model.save_weights(h5_path)
 
-  @parameterized.parameters([0, 1, 2])
-  @test_util.run_in_graph_and_eager_modes(config=_config)
-  def test_implementation_mode_LSTM(self, implementation_mode):
+    cloned_model, new_layer = build_model()
+    cloned_model.load_weights(h5_path)
+    y = cloned_model.predict(x)
+
+    self.assertAllClose(y, y_ref)
+    self.assertAllClose(layer.get_weights(), new_layer.get_weights())
+
+  def test_unified_lstm_output_on_multiple_kernel(self):
+    input_shape = 10
+    rnn_state_size = 8
+    timestep = 4
+    batch = 100
+
+    x_train = np.random.random((batch, timestep, input_shape))
+
+    inputs = keras.layers.Input(
+        shape=[timestep, input_shape], dtype=dtypes.float32)
+    with test_util.device(use_gpu=False):
+      layer = keras.layers.UnifiedLSTM(rnn_state_size)
+      output = layer(inputs)
+      cpu_model = keras.models.Model(inputs, output)
+      weights = cpu_model.get_weights()
+    y_1 = cpu_model.predict(x_train)
+
+    with test_util.device(use_gpu=True):
+      layer = keras.layers.UnifiedLSTM(rnn_state_size)
+      output = layer(inputs)
+      gpu_model = keras.models.Model(inputs, output)
+      gpu_model.set_weights(weights)
+    y_2 = gpu_model.predict(x_train)
+
+    # Note that CuDNN uses 'sigmoid' as activation, so the unified LSTM uses
+    # 'sigmoid' as default. Construct the canonical LSTM with sigmoid to achieve
+    # the same output.
+    with test_util.device(use_gpu=True):
+      layer = keras.layers.LSTM(rnn_state_size, recurrent_activation='sigmoid')
+      output = layer(inputs)
+      canonical_model = keras.models.Model(inputs, output)
+      # Remove the extra cudnn bias since canonical lstm will not use it.
+      canonical_model.set_weights(weights[:3])
+    y_3 = canonical_model.predict(x_train)
+
+    self.assertAllClose(y_1, y_2)
+    self.assertAllClose(y_2, y_3)
+
+  def test_return_sequences_LSTM(self):
     num_samples = 2
     timesteps = 3
     embedding_dim = 4
     units = 2
     testing_utils.layer_test(
-        UnifiedLSTM,
+        keras.layers.UnifiedLSTM,
         kwargs={
             'units': units,
-            'implementation': implementation_mode
+            'return_sequences': True
         },
         input_shape=(num_samples, timesteps, embedding_dim))
 
-  @test_util.run_in_graph_and_eager_modes(config=_config)
-  def test_constraints_LSTM(self):
+  def test_regularizers_LSTM(self):
     embedding_dim = 4
-    layer_class = UnifiedLSTM
-    k_constraint = keras.constraints.max_norm(0.01)
-    r_constraint = keras.constraints.max_norm(0.01)
-    b_constraint = keras.constraints.max_norm(0.01)
+    layer_class = keras.layers.UnifiedLSTM
     layer = layer_class(
         5,
         return_sequences=False,
-        weights=None,
-        input_shape=(None, embedding_dim),
-        kernel_constraint=k_constraint,
-        recurrent_constraint=r_constraint,
-        bias_constraint=b_constraint)
-    layer.build((None, None, embedding_dim))
-    self.assertEqual(layer.cell.kernel.constraint, k_constraint)
-    self.assertEqual(layer.cell.recurrent_kernel.constraint, r_constraint)
-    self.assertEqual(layer.cell.bias.constraint, b_constraint)
-
-  @test_util.run_in_graph_and_eager_modes(config=_config)
-  def test_with_masking_layer_LSTM(self):
-    layer_class = UnifiedLSTM
-    inputs = np.random.random((2, 3, 4))
-    targets = np.abs(np.random.random((2, 3, 5)))
-    targets /= targets.sum(axis=-1, keepdims=True)
-    model = keras.models.Sequential()
-    model.add(keras.layers.Masking(input_shape=(3, 4)))
-    model.add(layer_class(units=5, return_sequences=True, unroll=False))
-    model.compile(
-        loss='categorical_crossentropy',
-        optimizer=gradient_descent.GradientDescentOptimizer(0.01))
-    model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
-
-  @test_util.run_in_graph_and_eager_modes(config=_config)
-  def test_stacking_LSTM(self):
-    inputs = np.random.random((2, 3, 4))
-    targets = np.abs(np.random.random((2, 3, 5)))
-    targets /= targets.sum(axis=-1, keepdims=True)
-    model = keras.models.Sequential()
-    model.add(UnifiedLSTM(10, return_sequences=True, unroll=False))
-    model.add(UnifiedLSTM(5, return_sequences=True, unroll=False))
-    model.compile(
-        loss='categorical_crossentropy',
-        optimizer=gradient_descent.GradientDescentOptimizer(0.01))
-    model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
-
-  @test_util.run_in_graph_and_eager_modes(config=_config)
-  def test_masking_with_stacking_LSTM(self):
-    inputs = np.random.random((2, 3, 4))
-    targets = np.abs(np.random.random((2, 3, 5)))
-    targets /= targets.sum(axis=-1, keepdims=True)
-    model = keras.models.Sequential()
-    model.add(keras.layers.Masking(input_shape=(3, 4)))
-    model.add(UnifiedLSTM(10, return_sequences=True, unroll=False))
-    model.add(UnifiedLSTM(5, return_sequences=True, unroll=False))
-    model.compile(
-        loss='categorical_crossentropy',
-        optimizer=gradient_descent.GradientDescentOptimizer(0.01))
-    model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
-
-  @test_util.run_in_graph_and_eager_modes(config=_config)
-  def test_from_config_LSTM(self):
-    layer_class = UnifiedLSTM
-    for stateful in (False, True):
-      l1 = layer_class(units=1, stateful=stateful)
-      l2 = layer_class.from_config(l1.get_config())
-      assert l1.get_config() == l2.get_config()
+        weights=None,
+        input_shape=(None, embedding_dim),
+        kernel_regularizer=keras.regularizers.l1(0.01),
+        recurrent_regularizer=keras.regularizers.l1(0.01),
+        bias_regularizer='l2',
+        activity_regularizer='l1')
+    layer.build((None, None, 2))
+    self.assertEqual(len(layer.losses), 3)
+    x = keras.backend.variable(np.ones((2, 3, 2)))
+    layer(x)
+    if context.executing_eagerly():
+      self.assertEqual(len(layer.losses), 4)
+    else:
+      self.assertEqual(len(layer.get_losses_for(x)), 1)
 
-  @test_util.run_in_graph_and_eager_modes(config=_config)
-  def test_specify_initial_state_keras_tensor(self):
-    num_states = 2
+  def test_statefulness_LSTM(self):
+    num_samples = 2
     timesteps = 3
     embedding_dim = 4
-    units = 3
-    num_samples = 2
-
-    # Test with Keras tensor
-    inputs = keras.Input((timesteps, embedding_dim))
-    initial_state = [keras.Input((units,)) for _ in range(num_states)]
-    layer = UnifiedLSTM(units)
-    if len(initial_state) == 1:
-      output = layer(inputs, initial_state=initial_state[0])
-    else:
-      output = layer(inputs, initial_state=initial_state)
-    assert initial_state[0] in layer._inbound_nodes[0].input_tensors
+    units = 2
+    layer_class = keras.layers.UnifiedLSTM
+    model = keras.models.Sequential()
+    model.add(
+        keras.layers.Embedding(
+            4,
+            embedding_dim,
+            mask_zero=True,
+            input_length=timesteps,
+            batch_input_shape=(num_samples, timesteps)))
+    layer = layer_class(
+        units, return_sequences=False, stateful=True, weights=None)
+    model.add(layer)
+    model.compile(optimizer=gradient_descent.GradientDescentOptimizer(0.01),
+                  loss='mse', run_eagerly=testing_utils.should_run_eagerly())
+    out1 = model.predict(np.ones((num_samples, timesteps)))
+    self.assertEqual(out1.shape, (num_samples, units))
 
-    model = keras.models.Model([inputs] + initial_state, output)
-    model.compile(
-        loss='categorical_crossentropy',
-        optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+    # train once so that the states change
+    model.train_on_batch(
+        np.ones((num_samples, timesteps)), np.ones((num_samples, units)))
+    out2 = model.predict(np.ones((num_samples, timesteps)))
 
-    inputs = np.random.random((num_samples, timesteps, embedding_dim))
-    initial_state = [
-        np.random.random((num_samples, units)) for _ in range(num_states)
-    ]
-    targets = np.random.random((num_samples, units))
-    model.train_on_batch([inputs] + initial_state, targets)
+    # if the state is not reset, output should be different
+    self.assertNotEqual(out1.max(), out2.max())
 
-  @test_util.run_in_graph_and_eager_modes(config=_config)
-  def DISABLED_test_specify_initial_state_non_keras_tensor(self):
-    num_states = 2
-    timesteps = 3
-    embedding_dim = 4
-    units = 3
-    num_samples = 2
+    # check that output changes after states are reset
+    # (even though the model itself didn't change)
+    layer.reset_states()
+    out3 = model.predict(np.ones((num_samples, timesteps)))
+    self.assertNotEqual(out2.max(), out3.max())
 
-    # Test with non-Keras tensor
-    inputs = keras.Input((timesteps, embedding_dim))
-    initial_state = [
-        keras.backend.random_normal_variable((num_samples, units), 0, 1)
-        for _ in range(num_states)
-    ]
-    layer = UnifiedLSTM(units)
-    output = layer(inputs, initial_state=initial_state)
+    # check that container-level reset_states() works
+    model.reset_states()
+    out4 = model.predict(np.ones((num_samples, timesteps)))
+    self.assertAllClose(out3, out4, atol=1e-5)
 
-    model = keras.models.Model(inputs, output)
-    model.compile(
-        loss='categorical_crossentropy',
-        optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+    # check that the call to `predict` updated the states
+    out5 = model.predict(np.ones((num_samples, timesteps)))
+    self.assertNotEqual(out4.max(), out5.max())
 
-    inputs = np.random.random((num_samples, timesteps, embedding_dim))
-    targets = np.random.random((num_samples, units))
-    model.train_on_batch(inputs, targets)
+    # Check masking
+    layer.reset_states()
 
-  @test_util.run_in_graph_and_eager_modes(config=_config)
-  def test_reset_states_with_values(self):
-    num_states = 2
-    timesteps = 3
-    embedding_dim = 4
-    units = 3
-    num_samples = 2
+    left_padded_input = np.ones((num_samples, timesteps))
+    left_padded_input[0, :1] = 0
+    left_padded_input[1, :2] = 0
+    out6 = model.predict(left_padded_input)
 
-    layer = UnifiedLSTM(units, stateful=True)
-    layer.build((num_samples, timesteps, embedding_dim))
     layer.reset_states()
-    assert len(layer.states) == num_states
-    assert layer.states[0] is not None
-    self.assertAllClose(
-        keras.backend.eval(layer.states[0]),
-        np.zeros(keras.backend.int_shape(layer.states[0])),
-        atol=1e-4)
-    state_shapes = [keras.backend.int_shape(state) for state in layer.states]
-    values = [np.ones(shape) for shape in state_shapes]
-    if len(values) == 1:
-      values = values[0]
-    layer.reset_states(values)
-    self.assertAllClose(
-        keras.backend.eval(layer.states[0]),
-        np.ones(keras.backend.int_shape(layer.states[0])),
-        atol=1e-4)
 
-    # Test with invalid data
-    with self.assertRaises(ValueError):
-      layer.reset_states([1] * (len(layer.states) + 1))
+    right_padded_input = np.ones((num_samples, timesteps))
+    right_padded_input[0, -1:] = 0
+    right_padded_input[1, -2:] = 0
+    out7 = model.predict(right_padded_input)
+
+    self.assertAllClose(out7, out6, atol=1e-5)
+
+  def test_stateful_LSTM_training(self):
+    # See b/123587692 for more context.
+    vocab_size = 20
+    embedding_dim = 10
+    batch_size = 8
+    timestep = 12
+    units = 5
+    x = np.random.randint(0, vocab_size, size=(batch_size, timestep))
+    y = np.random.randint(0, vocab_size, size=(batch_size, timestep))
+
+    model = keras.Sequential([
+        keras.layers.Embedding(vocab_size, embedding_dim,
+                               batch_input_shape=[batch_size, timestep]),
+        keras.layers.UnifiedLSTM(units,
+                                 return_sequences=True,
+                                 stateful=True),
+        keras.layers.Dense(vocab_size)
+    ])
+    model.compile(optimizer='adam',
+                  loss='sparse_categorical_crossentropy',
+                  run_eagerly=testing_utils.should_run_eagerly())
+    model.fit(x, y, epochs=1, shuffle=False)
 
-  @test_util.run_in_graph_and_eager_modes(config=_config)
-  def test_specify_state_with_masking(self):
-    num_states = 2
-    timesteps = 3
-    embedding_dim = 4
-    units = 3
-    num_samples = 2
 
-    inputs = keras.Input((timesteps, embedding_dim))
-    _ = keras.layers.Masking()(inputs)
-    initial_state = [keras.Input((units,)) for _ in range(num_states)]
-    output = UnifiedLSTM(units)(inputs, initial_state=initial_state)
+class LSTMLayerGraphOnlyTest(test.TestCase):
 
-    model = keras.models.Model([inputs] + initial_state, output)
-    model.compile(
-        loss='categorical_crossentropy',
-        optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+  # Need session for test
+  @test_util.run_deprecated_v1
+  def test_unifiedLSTM(self):
+    input_shape = 10
+    rnn_state_size = 8
+    output_shape = 8
+    timestep = 4
+    batch = 100
+    epoch = 1
 
-    inputs = np.random.random((num_samples, timesteps, embedding_dim))
-    initial_state = [
-        np.random.random((num_samples, units)) for _ in range(num_states)
-    ]
-    targets = np.random.random((num_samples, units))
-    model.train_on_batch([inputs] + initial_state, targets)
+    with self.cached_session(config=_config, use_gpu=True) as sess:
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=batch,
+          test_samples=0,
+          input_shape=(timestep, input_shape),
+          num_classes=output_shape)
+      y_train = keras.utils.to_categorical(y_train, output_shape)
 
-  @test_util.run_in_graph_and_eager_modes(config=_config)
-  def test_return_state(self):
-    num_states = 2
-    timesteps = 3
-    embedding_dim = 4
-    units = 3
-    num_samples = 2
+      layer = keras.layers.UnifiedLSTM(rnn_state_size, return_runtime=True)
 
-    inputs = keras.Input(batch_shape=(num_samples, timesteps, embedding_dim))
-    layer = UnifiedLSTM(units, return_state=True, stateful=True)
-    outputs = layer(inputs)
-    state = outputs[1:]
-    assert len(state) == num_states
-    model = keras.models.Model(inputs, state[0])
+      inputs = array_ops.placeholder(
+          dtypes.float32, shape=(None, timestep, input_shape), name='inputs')
+      predict = array_ops.placeholder(
+          dtypes.float32, shape=(None, output_shape), name='predict')
 
-    inputs = np.random.random((num_samples, timesteps, embedding_dim))
-    state = model.predict(inputs)
-    self.assertAllClose(keras.backend.eval(layer.states[0]), state, atol=1e-4)
+      outputs, runtime = layer(inputs)
+      loss = losses.softmax_cross_entropy(predict, outputs)
+      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+      train_op = optimizer.minimize(loss)
 
-  @test_util.run_in_graph_and_eager_modes(config=_config)
-  def test_state_reuse(self):
-    timesteps = 3
-    embedding_dim = 4
-    units = 3
-    num_samples = 2
+      sess.run([variables.global_variables_initializer()])
+      existing_loss = 0
+      for _ in range(epoch):
+        loss_value, _, runtime_value = sess.run([loss, train_op, runtime], {
+            inputs: x_train,
+            predict: y_train
+        })
+        if test.is_gpu_available():
+          self.assertEqual(runtime_value, b'cudnn')
+        else:
+          self.assertEqual(runtime_value, b'cpu')
+        # Make sure the loss is updated for every epoch
+        # (layer weights properly updated).
+        self.assertNotEqual(existing_loss, loss_value)
+        existing_loss = loss_value
 
-    inputs = keras.Input(batch_shape=(num_samples, timesteps, embedding_dim))
-    layer = UnifiedLSTM(units, return_state=True, return_sequences=True)
-    outputs = layer(inputs)
-    output, state = outputs[0], outputs[1:]
-    output = UnifiedLSTM(units)(output, initial_state=state)
-    model = keras.models.Model(inputs, output)
+  # Need session for test
+  @test_util.run_deprecated_v1
+  def test_unifiedLSTM_with_cond(self):
+    # This test is to demonstrate the graph rewrite of grappler plugin under
+    # the condition that the function returns different number of internal
+    # states.
+    input_shape = 10
+    rnn_state_size = 8
+    output_shape = 8
+    timestep = 4
+    batch = 100
+    epoch = 1
 
-    inputs = np.random.random((num_samples, timesteps, embedding_dim))
-    model.predict(inputs)
+    with self.cached_session(config=_config, use_gpu=True) as sess:
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=batch,
+          test_samples=0,
+          input_shape=(timestep, input_shape),
+          num_classes=output_shape)
+      y_train = keras.utils.to_categorical(y_train, output_shape)
 
-  @test_util.run_in_graph_and_eager_modes(config=_config)
-  def test_initial_states_as_other_inputs(self):
-    timesteps = 3
-    embedding_dim = 4
-    units = 3
-    num_samples = 2
-    num_states = 2
-    layer_class = UnifiedLSTM
+      layer = keras.layers.UnifiedLSTM(rnn_state_size, return_runtime=True)
 
-    # Test with Keras tensor
-    main_inputs = keras.Input((timesteps, embedding_dim))
-    initial_state = [keras.Input((units,)) for _ in range(num_states)]
-    inputs = [main_inputs] + initial_state
+      inputs = array_ops.placeholder(
+          dtypes.float32, shape=(None, timestep, input_shape), name='inputs')
+      predict = array_ops.placeholder(
+          dtypes.float32, shape=(None, output_shape), name='predict')
 
-    layer = layer_class(units)
-    output = layer(inputs)
-    assert initial_state[0] in layer._inbound_nodes[0].input_tensors
+      zeros = array_ops.zeros([batch, output_shape])
+      dummy_runtime = constant_op.constant(
+          'unknown', dtype=dtypes.string, name='runtime')
+      a = constant_op.constant(0)
+      b = constant_op.constant(1)
+      # Will always run the lstm layer.
+      outputs, runtime = control_flow_ops.cond(
+          gen_math_ops.less(a, b),
+          lambda: layer(inputs),
+          lambda: (zeros, dummy_runtime))
+      loss = losses.softmax_cross_entropy(predict, outputs)
+      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+      train_op = optimizer.minimize(loss)
 
-    model = keras.models.Model(inputs, output)
-    model.compile(
-        loss='categorical_crossentropy',
-        optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+      sess.run([variables.global_variables_initializer()])
+      existing_loss = 0
 
-    main_inputs = np.random.random((num_samples, timesteps, embedding_dim))
-    initial_state = [
-        np.random.random((num_samples, units)) for _ in range(num_states)
-    ]
-    targets = np.random.random((num_samples, units))
-    model.train_on_batch([main_inputs] + initial_state, targets)
+      for _ in range(epoch):
+        loss_value, _, runtime_value = sess.run([loss, train_op, runtime], {
+            inputs: x_train,
+            predict: y_train
+        })
+        if test.is_gpu_available():
+          self.assertEqual(runtime_value, b'cudnn')
+        else:
+          self.assertEqual(runtime_value, b'cpu')
+        # Make sure the loss is updated for every epoch
+        # (layer weights properly updated).
+        self.assertNotEqual(existing_loss, loss_value)
+        existing_loss = loss_value
 
 
-@test_util.run_v1_only('b/120545219')
-class LSTMLayerGraphOnlyTest(test.TestCase):
+class LSTMLayerV1OnlyTest(test.TestCase, parameterized.TestCase):
 
-  def test_statefulness_LSTM(self):
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_dropout_LSTM(self):
     num_samples = 2
     timesteps = 3
     embedding_dim = 4
     units = 2
-    layer_class = UnifiedLSTM
-    with self.cached_session(config=_config):
-      model = keras.models.Sequential()
-      model.add(
-          keras.layers.Embedding(
-              4,
-              embedding_dim,
-              mask_zero=True,
-              input_length=timesteps,
-              batch_input_shape=(num_samples, timesteps)))
-      layer = layer_class(
-          units, return_sequences=False, stateful=True, weights=None)
-      model.add(layer)
-      model.compile(
-          optimizer=gradient_descent.GradientDescentOptimizer(0.01), loss='mse')
-      out1 = model.predict(np.ones((num_samples, timesteps)))
-      self.assertEqual(out1.shape, (num_samples, units))
-
-      # train once so that the states change
-      model.train_on_batch(
-          np.ones((num_samples, timesteps)), np.ones((num_samples, units)))
-      out2 = model.predict(np.ones((num_samples, timesteps)))
-
-      # if the state is not reset, output should be different
-      self.assertNotEqual(out1.max(), out2.max())
-
-      # check that output changes after states are reset
-      # (even though the model itself didn't change)
-      layer.reset_states()
-      out3 = model.predict(np.ones((num_samples, timesteps)))
-      self.assertNotEqual(out2.max(), out3.max())
-
-      # check that container-level reset_states() works
-      model.reset_states()
-      out4 = model.predict(np.ones((num_samples, timesteps)))
-      self.assertAllClose(out3, out4, atol=1e-5)
-
-      # check that the call to `predict` updated the states
-      out5 = model.predict(np.ones((num_samples, timesteps)))
-      self.assertNotEqual(out4.max(), out5.max())
-
-      # Check masking
-      layer.reset_states()
-
-      left_padded_input = np.ones((num_samples, timesteps))
-      left_padded_input[0, :1] = 0
-      left_padded_input[1, :2] = 0
-      out6 = model.predict(left_padded_input)
-
-      layer.reset_states()
-
-      right_padded_input = np.ones((num_samples, timesteps))
-      right_padded_input[0, -1:] = 0
-      right_padded_input[1, -2:] = 0
-      out7 = model.predict(right_padded_input)
-
-      self.assertAllClose(out7, out6, atol=1e-5)
-
-  def test_regularizers_LSTM(self):
-    embedding_dim = 4
-    layer_class = UnifiedLSTM
-    with self.cached_session(config=_config):
-      layer = layer_class(
-          5,
-          return_sequences=False,
-          weights=None,
-          input_shape=(None, embedding_dim),
-          kernel_regularizer=keras.regularizers.l1(0.01),
-          recurrent_regularizer=keras.regularizers.l1(0.01),
-          bias_regularizer='l2',
-          activity_regularizer='l1')
-      layer.build((None, None, 2))
-      self.assertEqual(len(layer.losses), 3)
-      x = keras.backend.variable(np.ones((2, 3, 2)))
-      layer(x)
-      self.assertEqual(len(layer.get_losses_for(x)), 1)
+    testing_utils.layer_test(
+        keras.layers.UnifiedLSTM,
+        kwargs={
+            'units': units,
+            'dropout': 0.1,
+            'recurrent_dropout': 0.1
+        },
+        input_shape=(num_samples, timesteps, embedding_dim))
 
 
 class UnifiedLSTMPerformanceTest(test.Benchmark):
@@ -797,7 +801,7 @@ class UnifiedLSTMPerformanceTest(test.Benchmark):
     rnn_state_size = test_config['rnn_state_size']
     timestep = test_config['timestep']
 
-    cudnn_lstm_layer = CuDNNLSTM(rnn_state_size)
+    cudnn_lstm_layer = keras.layers.CuDNNLSTM(rnn_state_size)
     inputs = keras.layers.Input(
         shape=[timestep, input_shape], dtype=dtypes.float32)
 
@@ -818,7 +822,7 @@ class UnifiedLSTMPerformanceTest(test.Benchmark):
     rnn_state_size = test_config['rnn_state_size']
     timestep = test_config['timestep']
 
-    layer = UnifiedLSTM(rnn_state_size)
+    layer = keras.layers.UnifiedLSTM(rnn_state_size)
     inputs = keras.layers.Input(
         shape=[timestep, input_shape], dtype=dtypes.float32)
 
diff --git a/tensorflow/python/keras/layers/wrappers.py b/tensorflow/python/keras/layers/wrappers.py
index 67b154141efc036b5fa7920c8179b35f5eb38cc1..182d5e3c5df973475add2cf0e97f925ac47c518a 100644
--- a/tensorflow/python/keras/layers/wrappers.py
+++ b/tensorflow/python/keras/layers/wrappers.py
@@ -29,11 +29,12 @@ from tensorflow.python.keras.layers.recurrent import _standardize_args
 from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import nest
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 
-@tf_export('keras.layers.Wrapper')
+@keras_export('keras.layers.Wrapper')
 class Wrapper(Layer):
   """Abstract wrapper base class.
 
@@ -45,6 +46,7 @@ class Wrapper(Layer):
       layer: The layer to be wrapped.
   """
 
+  @trackable.no_automatic_dependency_tracking
   def __init__(self, layer, **kwargs):
     assert isinstance(layer, Layer)
     self.layer = layer
@@ -112,7 +114,7 @@ class Wrapper(Layer):
     return cls(layer, **config)
 
 
-@tf_export('keras.layers.TimeDistributed')
+@keras_export('keras.layers.TimeDistributed')
 class TimeDistributed(Wrapper):
   """This wrapper allows to apply a layer to every temporal slice of an input.
 
@@ -168,7 +170,7 @@ class TimeDistributed(Wrapper):
           '`Layer` instance. You passed: {input}'.format(input=layer))
     super(TimeDistributed, self).__init__(layer, **kwargs)
     self.supports_masking = True
-    self._track_checkpointable(layer, name='layer')
+    self._track_trackable(layer, name='layer')
 
   def _get_shape_tuple(self, init_tuple, tensor, start_idx, int_shape=None):
     """Finds non-specific dimensions in the static shapes.
@@ -204,8 +206,12 @@ class TimeDistributed(Wrapper):
 
   def build(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    assert len(input_shape) >= 3
-    self.input_spec = InputSpec(shape=input_shape)
+    if len(input_shape) < 3:
+      raise ValueError(
+          '`TimeDistributed` Layer should be passed an `input_shape ` '
+          'with at least 3 dimensions, received: ' + str(input_shape))
+    # Don't enforce the batch or time dimension.
+    self.input_spec = InputSpec(shape=[None, None] + input_shape[2:])
     child_input_shape = [input_shape[0]] + input_shape[2:]
     if not self.layer.built:
       # The base layer class calls a conversion function on the input shape to
@@ -351,7 +357,7 @@ class TimeDistributed(Wrapper):
     return output_mask
 
 
-@tf_export('keras.layers.Bidirectional')
+@keras_export('keras.layers.Bidirectional')
 class Bidirectional(Wrapper):
   """Bidirectional wrapper for RNNs.
 
@@ -380,6 +386,7 @@ class Bidirectional(Wrapper):
   ```
   """
 
+  @trackable.no_automatic_dependency_tracking
   def __init__(self, layer, merge_mode='concat', weights=None, **kwargs):
     if not isinstance(layer, Layer):
       raise ValueError(
@@ -412,8 +419,8 @@ class Bidirectional(Wrapper):
     self._num_constants = None
     super(Bidirectional, self).__init__(layer, **kwargs)
     self.input_spec = layer.input_spec
-    self._track_checkpointable(self.forward_layer, name='forward_layer')
-    self._track_checkpointable(self.backward_layer, name='backward_layer')
+    self._track_trackable(self.forward_layer, name='forward_layer')
+    self._track_trackable(self.backward_layer, name='backward_layer')
 
   @property
   def trainable(self):
diff --git a/tensorflow/python/keras/layers/wrappers_test.py b/tensorflow/python/keras/layers/wrappers_test.py
index 727f33dadc8abf113e9af76ef63e3e016de319ce..8fa0e7bdacc7169847d2c15ebe564d60089983a8 100644
--- a/tensorflow/python/keras/layers/wrappers_test.py
+++ b/tensorflow/python/keras/layers/wrappers_test.py
@@ -23,11 +23,12 @@ import copy
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.platform import test
-from tensorflow.python.training.checkpointable import util as checkpointable_util
-from tensorflow.python.training.rmsprop import RMSPropOptimizer
+from tensorflow.python.training.tracking import object_identity
+from tensorflow.python.training.tracking import util as trackable_util
 
 
 class _RNNCellWithConstants(keras.layers.Layer):
@@ -77,7 +78,7 @@ class TimeDistributedTest(test.TestCase):
     model.add(
         keras.layers.TimeDistributed(
             keras.layers.Dense(2), input_shape=(3, 4)))
-    model.compile(optimizer=RMSPropOptimizer(0.01), loss='mse')
+    model.compile(optimizer='rmsprop', loss='mse')
     model.fit(
         np.random.random((10, 3, 4)),
         np.random.random((10, 3, 2)),
@@ -88,8 +89,8 @@ class TimeDistributedTest(test.TestCase):
     model.get_config()
 
     # check whether the model variables are present in the
-    # checkpointable list of objects
-    checkpointed_objects = set(checkpointable_util.list_objects(model))
+    # trackable list of objects
+    checkpointed_objects = set(trackable_util.list_objects(model))
     for v in model.variables:
       self.assertIn(v, checkpointed_objects)
 
@@ -98,7 +99,7 @@ class TimeDistributedTest(test.TestCase):
     model.add(
         keras.layers.TimeDistributed(
             keras.layers.Dense(2), input_shape=(3, 4), batch_size=10))
-    model.compile(optimizer=RMSPropOptimizer(0.01), loss='mse')
+    model.compile(optimizer='rmsprop', loss='mse')
     model.fit(
         np.random.random((10, 3, 4)),
         np.random.random((10, 3, 2)),
@@ -159,13 +160,12 @@ class TimeDistributedTest(test.TestCase):
       # test layers that need learning_phase to be set
       np.random.seed(1234)
       x = keras.layers.Input(shape=(3, 2))
-      y = keras.layers.TimeDistributed(
-          keras.layers.Dropout(.999))(x, training=True)
+      y = keras.layers.TimeDistributed(keras.layers.Dropout(.999))(
+          x, training=True)
       model = keras.models.Model(x, y)
       y = model.predict(np.random.random((10, 3, 2)))
       self.assertAllClose(np.mean(y), 0., atol=1e-1, rtol=1e-1)
 
-  @tf_test_util.run_v1_only('b/120545219')
   def test_TimeDistributed_batchnorm(self):
     with self.cached_session():
       # test that wrapped BN updates still work.
@@ -188,7 +188,6 @@ class TimeDistributedTest(test.TestCase):
       # Verify input_map has one mapping from inputs to reshaped inputs.
       self.assertEqual(len(td._input_map.keys()), 1)
 
-  @tf_test_util.run_v1_only('b/120545219')
   def test_TimeDistributed_trainable(self):
     # test layers that need learning_phase to be set
     x = keras.layers.Input(shape=(3, 2))
@@ -203,7 +202,6 @@ class TimeDistributedTest(test.TestCase):
     assert len(layer.updates) == 2
     assert len(layer.trainable_weights) == 2
 
-  @tf_test_util.run_v1_only('b/120545219')
   def test_TimeDistributed_with_masked_embedding_and_unspecified_shape(self):
     with self.cached_session():
       # test with unspecified shape and Embeddings with mask_zero
@@ -236,7 +234,6 @@ class TimeDistributedTest(test.TestCase):
         self.assertAllEqual(mask_outputs_val[i], ref_mask_val[i])
       self.assertIs(mask_outputs[-1], None)  # final layer
 
-  @tf_test_util.run_v1_only('b/120545219')
   def test_TimeDistributed_with_masking_layer(self):
     with self.cached_session():
       # test with Masking layer
@@ -261,6 +258,28 @@ class TimeDistributedTest(test.TestCase):
       self.assertEqual((mask_outputs_val[1]).all(),
                        model_input.all())
 
+  def test_TimeDistributed_with_different_time_shapes(self):
+    time_dist = keras.layers.TimeDistributed(keras.layers.Dense(5))
+    ph_1 = keras.backend.placeholder(shape=(None, 10, 13))
+    out_1 = time_dist(ph_1)
+    self.assertEqual(out_1.shape.as_list(), [None, 10, 5])
+
+    ph_2 = keras.backend.placeholder(shape=(None, 1, 13))
+    out_2 = time_dist(ph_2)
+    self.assertEqual(out_2.shape.as_list(), [None, 1, 5])
+
+    ph_3 = keras.backend.placeholder(shape=(None, 1, 18))
+    with self.assertRaisesRegexp(ValueError, 'is incompatible with layer'):
+      time_dist(ph_3)
+
+  def test_TimeDistributed_with_invalid_dimensions(self):
+    time_dist = keras.layers.TimeDistributed(keras.layers.Dense(5))
+    ph = keras.backend.placeholder(shape=(None, 10))
+    with self.assertRaisesRegexp(
+        ValueError,
+        '`TimeDistributed` Layer should be passed an `input_shape `'):
+      time_dist(ph)
+
 
 class BidirectionalTest(test.TestCase):
 
@@ -281,12 +300,13 @@ class BidirectionalTest(test.TestCase):
         model.add(
             keras.layers.Bidirectional(
                 rnn(output_dim), merge_mode=mode, input_shape=(timesteps, dim)))
-        model.compile(optimizer=RMSPropOptimizer(0.01), loss='mse')
+        model.compile(optimizer='rmsprop', loss='mse')
         model.fit(x, y, epochs=1, batch_size=1)
 
         # check whether the model variables are present in the
-        # checkpointable list of objects
-        checkpointed_objects = set(checkpointable_util.list_objects(model))
+        # trackable list of objects
+        checkpointed_objects = object_identity.ObjectIdentitySet(
+            trackable_util.list_objects(model))
         for v in model.variables:
           self.assertIn(v, checkpointed_objects)
 
@@ -379,7 +399,6 @@ class BidirectionalTest(test.TestCase):
       model.compile(loss='mse', optimizer='sgd')
       model.fit(x, y, epochs=1, batch_size=1)
 
-  @tf_test_util.run_v1_only('b/120545219')
   def test_Bidirectional_merged_value(self):
     rnn = keras.layers.LSTM
     samples = 2
@@ -407,10 +426,10 @@ class BidirectionalTest(test.TestCase):
             rnn(units, return_sequences=True), merge_mode=merge_mode)
         f_merged = keras.backend.function([inputs], _to_list(layer(inputs)))
         f_forward = keras.backend.function([inputs],
-                                           [layer.forward_layer.call(inputs)])
+                                           [layer.forward_layer(inputs)])
         f_backward = keras.backend.function(
             [inputs],
-            [keras.backend.reverse(layer.backward_layer.call(inputs), 1)])
+            [keras.backend.reverse(layer.backward_layer(inputs), 1)])
 
         y_merged = f_merged(x)
         y_expected = _to_list(merge_func(f_forward(x)[0], f_backward(x)[0]))
@@ -424,9 +443,9 @@ class BidirectionalTest(test.TestCase):
             rnn(units, return_state=True), merge_mode=merge_mode)
         f_merged = keras.backend.function([inputs], layer(inputs))
         f_forward = keras.backend.function([inputs],
-                                           layer.forward_layer.call(inputs))
+                                           layer.forward_layer(inputs))
         f_backward = keras.backend.function([inputs],
-                                            layer.backward_layer.call(inputs))
+                                            layer.backward_layer(inputs))
         n_states = len(layer.layer.states)
 
         y_merged = f_merged(x)
@@ -510,8 +529,10 @@ class BidirectionalTest(test.TestCase):
       layer.trainable = True
       assert len(layer.trainable_weights) == 6
 
-  @tf_test_util.run_v1_only('b/120545219')
   def test_Bidirectional_updates(self):
+    if context.executing_eagerly():
+      self.skipTest('layer.updates is only available in graph mode.')
+
     with self.cached_session():
       x = keras.layers.Input(shape=(3, 2))
       x_reachable_update = x * x
@@ -539,10 +560,15 @@ class BidirectionalTest(test.TestCase):
       assert len(layer.losses) == 4
       assert len(layer.get_losses_for(None)) == 4
       assert not layer.get_losses_for(x)
+
+      # Create a random tensor that is not conditional on the inputs.
+      with keras.backend.get_graph().as_default():
+        const_tensor = constant_op.constant(1)
+
       layer.forward_layer.add_loss(x_reachable_loss, inputs=x)
-      layer.forward_layer.add_loss(1, inputs=None)
+      layer.forward_layer.add_loss(const_tensor, inputs=None)
       layer.backward_layer.add_loss(x_reachable_loss, inputs=x)
-      layer.backward_layer.add_loss(1, inputs=None)
+      layer.backward_layer.add_loss(const_tensor, inputs=None)
       assert len(layer.losses) == 8
       assert len(layer.get_losses_for(None)) == 6
       assert len(layer.get_losses_for(x)) == 2
@@ -679,3 +705,4 @@ def _to_list(ls):
 
 if __name__ == '__main__':
   test.main()
+
diff --git a/tensorflow/python/keras/losses.py b/tensorflow/python/keras/losses.py
index 4c584d0ff059ba8eabd3de06ebb06b2703400a73..2d8358cab43674bba0dd5352bc1ccfa070e6721c 100644
--- a/tensorflow/python/keras/losses.py
+++ b/tensorflow/python/keras/losses.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-# pylint: disable=unused-import
 """Built-in loss functions.
 """
 from __future__ import absolute_import
@@ -24,17 +23,21 @@ import abc
 import six
 
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import smart_cond
 from tensorflow.python.keras import backend as K
+from tensorflow.python.keras.utils import losses_utils
 from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
-from tensorflow.python.keras.utils.losses_utils import compute_weighted_loss
+from tensorflow.python.keras.utils.tf_utils import is_tensor_or_variable
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops.losses import losses_impl
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
+from tensorflow.tools.docs import doc_controls
 
 
+@keras_export('keras.losses.Loss')
 class Loss(object):
   """Loss base class.
 
@@ -51,13 +54,13 @@ class Loss(object):
   ```
 
   Args:
-    reduction: Type of `tf.losses.Reduction` to apply to loss. Default value is
-      `SUM_OVER_BATCH_SIZE`.
+    reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to loss.
+      Default value is `SUM_OVER_BATCH_SIZE`.
     name: Optional name for the op.
   """
 
   def __init__(self,
-               reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
                name=None):
     self.reduction = reduction
     self.name = name
@@ -85,10 +88,13 @@ class Loss(object):
     Raises:
       ValueError: If the shape of `sample_weight` is invalid.
     """
-    with ops.name_scope(self.name, format(self.__class__.__name__),
+    # If we are wrapping a lambda function strip '<>' from the name as it is not
+    # accepted in scope name.
+    scope_name = 'lambda' if self.name == '<lambda>' else self.name
+    with ops.name_scope(scope_name, format(self.__class__.__name__),
                         (y_pred, y_true, sample_weight)):
       losses = self.call(y_true, y_pred)
-      return compute_weighted_loss(
+      return losses_utils.compute_weighted_loss(
           losses, sample_weight, reduction=self.reduction)
 
   @classmethod
@@ -107,6 +113,7 @@ class Loss(object):
     return {'reduction': self.reduction, 'name': self.name}
 
   @abc.abstractmethod
+  @doc_controls.for_subclass_implementers
   def call(self, y_true, y_pred):
     """Invokes the `Loss` instance.
 
@@ -117,8 +124,49 @@ class Loss(object):
     NotImplementedError('Must be implemented in subclasses.')
 
 
-@tf_export('keras.losses.MeanSquaredError')
-class MeanSquaredError(Loss):
+class LossFunctionWrapper(Loss):
+  """Wraps a loss function in the `Loss` class.
+
+  Args:
+    fn: The loss function to wrap, with signature `fn(y_true, y_pred,
+      **kwargs)`.
+    reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to loss.
+      Default value is `SUM_OVER_BATCH_SIZE`.
+    name: (Optional) name for the loss.
+    **kwargs: The keyword arguments that are passed on to `fn`.
+  """
+
+  def __init__(self,
+               fn,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name=None,
+               **kwargs):
+    super(LossFunctionWrapper, self).__init__(reduction=reduction, name=name)
+    self.fn = fn
+    self._fn_kwargs = kwargs
+
+  def call(self, y_true, y_pred):
+    """Invokes the `LossFunctionWrapper` instance.
+
+    Args:
+      y_true: Ground truth values.
+      y_pred: The predicted values.
+
+    Returns:
+      Loss values per sample.
+    """
+    return self.fn(y_true, y_pred, **self._fn_kwargs)
+
+  def get_config(self):
+    config = {}
+    for k, v in six.iteritems(self._fn_kwargs):
+      config[k] = K.eval(v) if is_tensor_or_variable(v) else v
+    base_config = super(LossFunctionWrapper, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+@keras_export('keras.losses.MeanSquaredError')
+class MeanSquaredError(LossFunctionWrapper):
   """Computes the mean of squares of errors between labels and predictions.
 
   For example, if `y_true` is [0., 0., 1., 1.] and `y_pred` is [1., 1., 1., 0.]
@@ -135,28 +183,20 @@ class MeanSquaredError(Loss):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss=tf.keras.losses.MeanSquaredError())
   ```
   """
 
-  def call(self, y_true, y_pred):
-    """Invokes the `MeanSquaredError` instance.
-
-    Args:
-      y_true: Ground truth values.
-      y_pred: The predicted values.
-
-    Returns:
-      Mean squared error losses.
-    """
-    y_pred = ops.convert_to_tensor(y_pred)
-    y_true = math_ops.cast(y_true, y_pred.dtype)
-    return mean_squared_error(y_true, y_pred)
+  def __init__(self,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name='mean_squared_error'):
+    super(MeanSquaredError, self).__init__(
+        mean_squared_error, name=name, reduction=reduction)
 
 
-@tf_export('keras.losses.MeanAbsoluteError')
-class MeanAbsoluteError(Loss):
+@keras_export('keras.losses.MeanAbsoluteError')
+class MeanAbsoluteError(LossFunctionWrapper):
   """Computes the mean of absolute difference between labels and predictions.
 
   For example, if `y_true` is [0., 0., 1., 1.] and `y_pred` is [1., 1., 1., 0.]
@@ -173,28 +213,20 @@ class MeanAbsoluteError(Loss):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss=tf.keras.losses.MeanAbsoluteError())
   ```
   """
 
-  def call(self, y_true, y_pred):
-    """Invokes the `MeanAbsoluteError` instance.
-
-    Args:
-      y_true: Ground truth values.
-      y_pred: The predicted values.
-
-    Returns:
-      Mean absolute error losses.
-    """
-    y_pred = ops.convert_to_tensor(y_pred)
-    y_true = math_ops.cast(y_true, y_pred.dtype)
-    return mean_absolute_error(y_true, y_pred)
+  def __init__(self,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name='mean_absolute_error'):
+    super(MeanAbsoluteError, self).__init__(
+        mean_absolute_error, name=name, reduction=reduction)
 
 
-@tf_export('keras.losses.MeanAbsolutePercentageError')
-class MeanAbsolutePercentageError(Loss):
+@keras_export('keras.losses.MeanAbsolutePercentageError')
+class MeanAbsolutePercentageError(LossFunctionWrapper):
   """Computes the mean absolute percentage error between `y_true` and `y_pred`.
 
   For example, if `y_true` is [0., 0., 1., 1.] and `y_pred` is [1., 1., 1., 0.]
@@ -211,28 +243,20 @@ class MeanAbsolutePercentageError(Loss):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss=tf.keras.losses.MeanAbsolutePercentageError())
   ```
   """
 
-  def call(self, y_true, y_pred):
-    """Invokes the `MeanAbsolutePercentageError` instance.
-
-    Args:
-      y_true: Ground truth values.
-      y_pred: The predicted values.
-
-    Returns:
-      Mean absolute percentage error losses.
-    """
-    y_pred = ops.convert_to_tensor(y_pred)
-    y_true = math_ops.cast(y_true, y_pred.dtype)
-    return mean_absolute_percentage_error(y_true, y_pred)
+  def __init__(self,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name='mean_absolute_percentage_error'):
+    super(MeanAbsolutePercentageError, self).__init__(
+        mean_absolute_percentage_error, name=name, reduction=reduction)
 
 
-@tf_export('keras.losses.MeanSquaredLogarithmicError')
-class MeanSquaredLogarithmicError(Loss):
+@keras_export('keras.losses.MeanSquaredLogarithmicError')
+class MeanSquaredLogarithmicError(LossFunctionWrapper):
   """Computes the mean squared logarithmic error between `y_true` and `y_pred`.
 
   For example, if `y_true` is [0., 0., 1., 1.] and `y_pred` is [1., 1., 1., 0.]
@@ -249,29 +273,28 @@ class MeanSquaredLogarithmicError(Loss):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss=tf.keras.losses.MeanSquaredLogarithmicError())
   ```
   """
 
-  def call(self, y_true, y_pred):
-    """Invokes the `MeanSquaredLogarithmicError` instance.
+  def __init__(self,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name='mean_squared_logarithmic_error'):
+    super(MeanSquaredLogarithmicError, self).__init__(
+        mean_squared_logarithmic_error, name=name, reduction=reduction)
 
-    Args:
-      y_true: Ground truth values.
-      y_pred: The predicted values.
 
-    Returns:
-      Mean squared logarithmic error losses.
-    """
-    y_pred = ops.convert_to_tensor(y_pred)
-    y_true = math_ops.cast(y_true, y_pred.dtype)
-    return mean_squared_logarithmic_error(y_true, y_pred)
+@keras_export('keras.losses.BinaryCrossentropy')
+class BinaryCrossentropy(LossFunctionWrapper):
+  """Computes the crossentropy loss between the labels and predictions.
 
+  Use this crossentropy loss function when there are only two label classes
+  (assumed to be 0 and 1). There should be a single floating point value per
+  feature.
 
-@tf_export('keras.losses.BinaryCrossentropy')
-class BinaryCrossentropy(Loss):
-  """Computes the binary cross entropy loss between the labels and predictions.
+  In the snippet below, there is a single floating pointing value per example,
+  and the shape of both `y_pred` and `y_true` are `[batch_size]`.
 
   Usage:
 
@@ -284,50 +307,45 @@ class BinaryCrossentropy(Loss):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss=tf.keras.losses.BinaryCrossentropy())
-  ````
+  ```
 
   Args:
-    from_logits: Whether `output` is expected to be a logits tensor. By default,
-      we consider that `output` encodes a probability distribution.
-    label_smoothing: If greater than `0` then smooth the labels.
-    reduction: Type of `tf.losses.Reduction` to apply to loss. Default value is
-      `SUM_OVER_BATCH_SIZE`.
+    from_logits: Whether `y_pred` is expected to be a logits tensor. By default,
+      we assume that `y_pred` encodes a probability distribution.
+    label_smoothing: Float in [0, 1]. If > `0` then smooth the labels.
+    reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to loss.
+      Default value is `SUM_OVER_BATCH_SIZE`.
     name: Optional name for the op.
   """
 
   def __init__(self,
                from_logits=False,
                label_smoothing=0,
-               reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
-               name=None):
-    super(BinaryCrossentropy, self).__init__(reduction=reduction, name=name)
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name='binary_crossentropy'):
+    super(BinaryCrossentropy, self).__init__(
+        binary_crossentropy,
+        name=name,
+        reduction=reduction,
+        from_logits=from_logits,
+        label_smoothing=label_smoothing)
     self.from_logits = from_logits
-    self.label_smoothing = label_smoothing
-
-  def call(self, y_true, y_pred):
-    """Invokes the `BinaryCrossentropy` instance.
-
-    Args:
-      y_true: Ground truth values.
-      y_pred: The predicted values.
-
-    Returns:
-      Binary cross entropy losses.
-    """
-    y_pred = ops.convert_to_tensor(y_pred)
-    y_true = math_ops.cast(y_true, y_pred.dtype)
 
-    if self.label_smoothing > 0:
-      y_true = y_true * (1 - self.label_smoothing) + 0.5 * self.label_smoothing
 
-    return binary_crossentropy(y_true, y_pred, from_logits=self.from_logits)
+@keras_export('keras.losses.CategoricalCrossentropy')
+class CategoricalCrossentropy(LossFunctionWrapper):
+  """Computes the crossentropy loss between the labels and predictions.
 
+  Use this crossentropy loss function when there are two or more label classes.
+  We expect labels to be provided in a `one_hot` representation. If you want to
+  provide labels as integers, please use `SparseCategoricalCrossentropy` loss.
+  There should be `# classes` floating point values per feature.
 
-@tf_export('keras.losses.CategoricalCrossentropy')
-class CategoricalCrossentropy(Loss):
-  """Computes categorical cross entropy loss between the `y_true` and `y_pred`.
+  In the snippet below, there is `# classes` floating pointing values per
+  example. The shape of both `y_pred` and `y_true` are
+  `[batch_size, num_classes]`.
 
   Usage:
 
@@ -342,122 +360,490 @@ class CategoricalCrossentropy(Loss):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss=tf.keras.losses.CategoricalCrossentropy())
-  ````
+  ```
 
   Args:
-    from_logits: Whether `output` is expected to be a logits tensor. By default,
-      we consider that `output` encodes a probability distribution.
-    label_smoothing: If greater than `0` then smooth the labels. This option is
-      currently not supported when `y_pred` is a sparse input (not one-hot).
-    reduction: Type of `tf.losses.Reduction` to apply to loss. Default value is
-      `SUM_OVER_BATCH_SIZE`.
+    from_logits: Whether `y_pred` is expected to be a logits tensor. By default,
+      we assume that `y_pred` encodes a probability distribution.
+    label_smoothing: Float in [0, 1]. When > 0, label values are smoothed,
+      meaning the confidence on label values are relaxed. e.g.
+      `label_smoothing=0.2` means that we will use a value of `0.1` for label
+      `0` and `0.9` for label `1`"
+    reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to loss.
+      Default value is `SUM_OVER_BATCH_SIZE`.
     name: Optional name for the op.
   """
 
   def __init__(self,
                from_logits=False,
                label_smoothing=0,
-               reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
-               name=None):
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name='categorical_crossentropy'):
     super(CategoricalCrossentropy, self).__init__(
-        reduction=reduction, name=name)
-    self.from_logits = from_logits
-    self.label_smoothing = label_smoothing
+        categorical_crossentropy,
+        name=name,
+        reduction=reduction,
+        from_logits=from_logits,
+        label_smoothing=label_smoothing)
 
-  def call(self, y_true, y_pred):
-    """Invokes the `CategoricalCrossentropy` instance.
 
-    Args:
-      y_true: Ground truth values.
-      y_pred: The predicted values.
+@keras_export('keras.losses.SparseCategoricalCrossentropy')
+class SparseCategoricalCrossentropy(LossFunctionWrapper):
+  """Computes the crossentropy loss between the labels and predictions.
 
-    Returns:
-      Categorical cross entropy losses.
-    """
-    y_pred = ops.convert_to_tensor(y_pred)
-    y_true = ops.convert_to_tensor(y_true)
-    is_sparse = y_pred.shape != y_true.shape
-
-    if is_sparse:
-      return sparse_categorical_crossentropy(
-          y_true, y_pred, from_logits=self.from_logits)
-    else:
-      y_true = math_ops.cast(y_true, y_pred.dtype)
-      if self.label_smoothing > 0:
-        num_classes = math_ops.cast(array_ops.shape(y_true)[1], y_pred.dtype)
-        smooth_positives = 1.0 - self.label_smoothing
-        smooth_negatives = self.label_smoothing / num_classes
-        y_true = y_true * smooth_positives + smooth_negatives
-
-      return categorical_crossentropy(
-          y_true, y_pred, from_logits=self.from_logits)
-
-
-@tf_export('keras.metrics.mean_squared_error',
-           'keras.metrics.mse',
-           'keras.metrics.MSE',
-           'keras.losses.mean_squared_error',
-           'keras.losses.mse',
-           'keras.losses.MSE')
+  Use this crossentropy loss function when there are two or more label classes.
+  We expect labels to be provided as integers. If you want to provide labels
+  using `one-hot` representation, please use `CategoricalCrossentropy` loss.
+  There should be `# classes` floating point values per feature for `y_pred`
+  and a single floating point value per feature for `y_true`.
+
+  In the snippet below, there is a single floating point value per example for
+  `y_true` and `# classes` floating pointing values per example for `y_pred`.
+  The shape of `y_true` is `[batch_size]` and the shape of `y_pred` is
+  `[batch_size, num_classes]`.
+
+  Usage:
+
+  ```python
+  cce = tf.keras.losses.SparseCategoricalCrossentropy()
+  loss = cce(
+    [0, 1, 2],
+    [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]])
+  print('Loss: ', loss.numpy())  # Loss: 0.3239
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.SparseCategoricalCrossentropy())
+  ````
+
+  Args:
+    from_logits: Whether `y_pred` is expected to be a logits tensor. By default,
+      we assume that `y_pred` encodes a probability distribution.
+    reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to loss.
+      Default value is `SUM_OVER_BATCH_SIZE`.
+    name: Optional name for the op.
+  """
+
+  def __init__(self,
+               from_logits=False,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name=None):
+    super(SparseCategoricalCrossentropy, self).__init__(
+        sparse_categorical_crossentropy,
+        name=name,
+        reduction=reduction,
+        from_logits=from_logits)
+
+
+@keras_export('keras.losses.Hinge')
+class Hinge(LossFunctionWrapper):
+  """Computes the hinge loss between `y_true` and `y_pred`.
+
+  `y_true` values are expected to be -1 or 1. If binary (0 or 1) labels are
+  provided we will convert them to -1 or 1.
+
+  Usage:
+
+  ```python
+  h = tf.keras.losses.Hinge()
+  loss = h([-1., 1., 1.], [0.6, -0.7, -0.5])
+
+  # loss = max(0, 1 - y_true * y_pred) = [1.6 + 1.7 + 1.5] / 3
+
+  print('Loss: ', loss.numpy())  # Loss: 1.6
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.Hinge())
+  ```
+  """
+
+  def __init__(self,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name=None):
+    super(Hinge, self).__init__(hinge, name=name, reduction=reduction)
+
+
+@keras_export('keras.losses.SquaredHinge')
+class SquaredHinge(LossFunctionWrapper):
+  """Computes the squared hinge loss between `y_true` and `y_pred`.
+
+  `y_true` values are expected to be -1 or 1. If binary (0 or 1) labels are
+  provided we will convert them to -1 or 1.
+
+  Usage:
+
+  ```python
+  sh = tf.keras.losses.SquaredHinge()
+  loss = sh([-1., 1., 1.], [0.6, -0.7, -0.5])
+
+  # loss = (max(0, 1 - y_true * y_pred))^2 = [1.6^2 + 1.7^2 + 1.5^2] / 3
+
+  print('Loss: ', loss.numpy())  # Loss: 2.566666
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.SquaredHinge())
+  ```
+  """
+
+  def __init__(self,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name='squared_hinge'):
+    super(SquaredHinge, self).__init__(
+        squared_hinge, name=name, reduction=reduction)
+
+
+@keras_export('keras.losses.CategoricalHinge')
+class CategoricalHinge(LossFunctionWrapper):
+  """Computes the categorical hinge loss between `y_true` and `y_pred`.
+
+  Usage:
+
+  ```python
+  ch = tf.keras.losses.CategoricalHinge()
+  loss = ch([0., 1., 1.], [1., 0., 1.])
+  print('Loss: ', loss.numpy())  # Loss: 1.0
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.CategoricalHinge())
+  ```
+  """
+
+  def __init__(self,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name='categorical_hinge'):
+    super(CategoricalHinge, self).__init__(
+        categorical_hinge, name=name, reduction=reduction)
+
+
+@keras_export('keras.losses.LogLoss')
+class LogLoss(LossFunctionWrapper):
+  """Computes the log loss between `y_true` and `y_pred`.
+
+  `logloss = - y_true * log(y_pred) - (1 - y_true) * log(1 - y_pred)`
+
+  Usage:
+
+  ```python
+  l = tf.keras.losses.LogLoss()
+  loss = l([0., 1., 1.], [1., 0., 1.])
+  print('Loss: ', loss.numpy())  # Loss: 10.745
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.LogLoss())
+  ```
+  """
+
+  def __init__(self,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name='logloss'):
+    super(LogLoss, self).__init__(logloss, name=name, reduction=reduction)
+
+
+@keras_export('keras.losses.Poisson')
+class Poisson(LossFunctionWrapper):
+  """Computes the Poisson loss between `y_true` and `y_pred`.
+
+  `loss = y_pred - y_true * log(y_pred)`
+
+  Usage:
+
+  ```python
+  p = tf.keras.losses.Poisson()
+  loss = p([1, 9, 2], [4, 8, 12])
+  print('Loss: ', loss.numpy())  # Loss: -4.63
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.Poisson())
+  ```
+  """
+
+  def __init__(self,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name='poisson'):
+    super(Poisson, self).__init__(poisson, name=name, reduction=reduction)
+
+
+@keras_export('keras.losses.LogCosh')
+class LogCosh(LossFunctionWrapper):
+  """Computes the logarithm of the hyperbolic cosine of the prediction error.
+
+  `logcosh = log((exp(x) + exp(-x))/2)`, where x is the error (y_pred - y_true)
+
+  Usage:
+
+  ```python
+  l = tf.keras.losses.LogCosh()
+  loss = l([0., 1., 1.], [1., 0., 1.])
+  print('Loss: ', loss.numpy())  # Loss: 0.289
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.LogCosh())
+  ```
+  """
+
+  def __init__(self,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name='logcosh'):
+    super(LogCosh, self).__init__(logcosh, name=name, reduction=reduction)
+
+
+@keras_export('keras.losses.KLDivergence')
+class KLDivergence(LossFunctionWrapper):
+  """Computes Kullback Leibler divergence loss between `y_true` and `y_pred`.
+
+  `loss = y_true * log(y_true / y_pred)`
+
+  Usage:
+
+  ```python
+  k = tf.keras.losses.KLDivergence()
+  loss = k([.4, .9, .2], [.5, .8, .12])
+  print('Loss: ', loss.numpy())  # Loss: -0.043
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.KLDivergence())
+  ```
+  """
+
+  def __init__(self,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name='kullback_leibler_divergence'):
+    super(KLDivergence, self).__init__(
+        kullback_leibler_divergence, name=name, reduction=reduction)
+
+
+@keras_export('keras.losses.Huber')
+class Huber(LossFunctionWrapper):
+  """Computes the Huber loss between `y_true` and `y_pred`.
+
+  For each value x in `error=y_true-y_pred`, the following is calculated:
+
+  ```
+  0.5 * x^2                  if |x| <= d
+  0.5 * d^2 + d * (|x| - d)  if |x| > d
+  ```
+  where d is `delta`. See: https://en.wikipedia.org/wiki/Huber_loss
+
+  Usage:
+
+  ```python
+  l = tf.keras.losses.Huber()
+  loss = l([0., 1., 1.], [1., 0., 1.])
+  print('Loss: ', loss.numpy())  # Loss: 0.333
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.Huber())
+  ```
+
+  Args:
+    delta: A float, the point where the Huber loss function changes from a
+      quadratic to linear.
+    reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to loss.
+      Default value is `SUM_OVER_BATCH_SIZE`.
+    name: Optional name for the op.
+  """
+
+  def __init__(self,
+               delta=1.0,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name='huber_loss'):
+    super(Huber, self).__init__(
+        huber_loss, name=name, reduction=reduction, delta=delta)
+
+
+@keras_export('keras.metrics.mean_squared_error',
+              'keras.metrics.mse',
+              'keras.metrics.MSE',
+              'keras.losses.mean_squared_error',
+              'keras.losses.mse',
+              'keras.losses.MSE')
 def mean_squared_error(y_true, y_pred):
-  return K.mean(math_ops.square(y_pred - y_true), axis=-1)
+  y_pred = ops.convert_to_tensor(y_pred)
+  y_true = math_ops.cast(y_true, y_pred.dtype)
+  return K.mean(math_ops.squared_difference(y_pred, y_true), axis=-1)
 
 
-@tf_export('keras.metrics.mean_absolute_error',
-           'keras.metrics.mae',
-           'keras.metrics.MAE',
-           'keras.losses.mean_absolute_error',
-           'keras.losses.mae',
-           'keras.losses.MAE')
+@keras_export('keras.metrics.mean_absolute_error',
+              'keras.metrics.mae',
+              'keras.metrics.MAE',
+              'keras.losses.mean_absolute_error',
+              'keras.losses.mae',
+              'keras.losses.MAE')
 def mean_absolute_error(y_true, y_pred):
+  y_pred = ops.convert_to_tensor(y_pred)
+  y_true = math_ops.cast(y_true, y_pred.dtype)
   return K.mean(math_ops.abs(y_pred - y_true), axis=-1)
 
 
-@tf_export('keras.metrics.mean_absolute_percentage_error',
-           'keras.metrics.mape',
-           'keras.metrics.MAPE',
-           'keras.losses.mean_absolute_percentage_error',
-           'keras.losses.mape',
-           'keras.losses.MAPE')
-def mean_absolute_percentage_error(y_true, y_pred):
+@keras_export('keras.metrics.mean_absolute_percentage_error',
+              'keras.metrics.mape',
+              'keras.metrics.MAPE',
+              'keras.losses.mean_absolute_percentage_error',
+              'keras.losses.mape',
+              'keras.losses.MAPE')
+def mean_absolute_percentage_error(y_true, y_pred):  # pylint: disable=missing-docstring
+  y_pred = ops.convert_to_tensor(y_pred)
+  y_true = math_ops.cast(y_true, y_pred.dtype)
   diff = math_ops.abs(
       (y_true - y_pred) / K.clip(math_ops.abs(y_true), K.epsilon(), None))
   return 100. * K.mean(diff, axis=-1)
 
 
-@tf_export('keras.metrics.mean_squared_logarithmic_error',
-           'keras.metrics.msle',
-           'keras.metrics.MSLE',
-           'keras.losses.mean_squared_logarithmic_error',
-           'keras.losses.msle',
-           'keras.losses.MSLE')
-def mean_squared_logarithmic_error(y_true, y_pred):
+@keras_export('keras.metrics.mean_squared_logarithmic_error',
+              'keras.metrics.msle',
+              'keras.metrics.MSLE',
+              'keras.losses.mean_squared_logarithmic_error',
+              'keras.losses.msle',
+              'keras.losses.MSLE')
+def mean_squared_logarithmic_error(y_true, y_pred):  # pylint: disable=missing-docstring
+  y_pred = ops.convert_to_tensor(y_pred)
+  y_true = math_ops.cast(y_true, y_pred.dtype)
   first_log = math_ops.log(K.clip(y_pred, K.epsilon(), None) + 1.)
   second_log = math_ops.log(K.clip(y_true, K.epsilon(), None) + 1.)
-  return K.mean(math_ops.square(first_log - second_log), axis=-1)
+  return K.mean(math_ops.squared_difference(first_log, second_log), axis=-1)
+
+
+def _maybe_convert_labels(y_true):
+  """Converts binary labels into -1/1."""
+  are_zeros = math_ops.equal(y_true, 0)
+  are_ones = math_ops.equal(y_true, 1)
+  is_binary = math_ops.reduce_all(math_ops.logical_or(are_zeros, are_ones))
 
+  def _convert_binary_labels():
+    # Convert the binary labels to -1 or 1.
+    return 2. * y_true - 1.
 
-@tf_export('keras.metrics.squared_hinge', 'keras.losses.squared_hinge')
+  updated_y_true = smart_cond.smart_cond(is_binary,
+                                         _convert_binary_labels, lambda: y_true)
+  return updated_y_true
+
+
+@keras_export('keras.metrics.squared_hinge', 'keras.losses.squared_hinge')
 def squared_hinge(y_true, y_pred):
+  """Computes the squared hinge loss between `y_true` and `y_pred`.
+
+  Args:
+    y_true: The ground truth values. `y_true` values are expected to be -1 or 1.
+      If binary (0 or 1) labels are provided we will convert them to -1 or 1.
+    y_pred: The predicted values.
+
+  Returns:
+    Tensor with one scalar loss entry per sample.
+  """
+  y_pred = ops.convert_to_tensor(y_pred)
+  y_true = math_ops.cast(y_true, y_pred.dtype)
+  y_true = _maybe_convert_labels(y_true)
   return K.mean(
       math_ops.square(math_ops.maximum(1. - y_true * y_pred, 0.)), axis=-1)
 
 
-@tf_export('keras.metrics.hinge', 'keras.losses.hinge')
+@keras_export('keras.metrics.hinge', 'keras.losses.hinge')
 def hinge(y_true, y_pred):
+  """Computes the hinge loss between `y_true` and `y_pred`.
+
+  Args:
+    y_true: The ground truth values. `y_true` values are expected to be -1 or 1.
+      If binary (0 or 1) labels are provided we will convert them to -1 or 1.
+    y_pred: The predicted values.
+
+  Returns:
+    Tensor with one scalar loss entry per sample.
+  """
+  y_pred = ops.convert_to_tensor(y_pred)
+  y_true = math_ops.cast(y_true, y_pred.dtype)
+  y_true = _maybe_convert_labels(y_true)
   return K.mean(math_ops.maximum(1. - y_true * y_pred, 0.), axis=-1)
 
 
-@tf_export('keras.losses.categorical_hinge')
+@keras_export('keras.losses.categorical_hinge')
 def categorical_hinge(y_true, y_pred):
+  y_pred = ops.convert_to_tensor(y_pred)
+  y_true = math_ops.cast(y_true, y_pred.dtype)
   pos = math_ops.reduce_sum(y_true * y_pred, axis=-1)
   neg = math_ops.reduce_max((1. - y_true) * y_pred, axis=-1)
   return math_ops.maximum(0., neg - pos + 1.)
 
 
-@tf_export('keras.losses.logcosh')
+def logloss(y_true, y_pred):
+  y_pred = ops.convert_to_tensor(y_pred)
+  y_true = math_ops.cast(y_true, y_pred.dtype)
+  losses = math_ops.multiply(y_true, math_ops.log(y_pred + K.epsilon()))
+  losses += math_ops.multiply((1 - y_true),
+                              math_ops.log(1 - y_pred + K.epsilon()))
+  return K.mean(-losses, axis=-1)
+
+
+def huber_loss(y_true, y_pred, delta=1.0):
+  """Computes Huber loss value.
+
+  For each value x in `error=y_true-y_pred`, the following is calculated:
+
+  ```
+  0.5 * x^2                  if |x| <= d
+  0.5 * d^2 + d * (|x| - d)  if |x| > d
+  ```
+  where d is `delta`. See: https://en.wikipedia.org/wiki/Huber_loss
+
+  Args:
+    y_true: tensor of true targets.
+    y_pred: tensor of predicted targets.
+    delta: A float, the point where the Huber loss function changes from a
+      quadratic to linear.
+
+  Returns:
+    Tensor with one scalar loss entry per sample.
+  """
+  y_pred = math_ops.cast(y_pred, dtype=K.floatx())
+  y_true = math_ops.cast(y_true, dtype=K.floatx())
+  error = math_ops.subtract(y_pred, y_true)
+  abs_error = math_ops.abs(error)
+  quadratic = math_ops.minimum(abs_error, delta)
+  linear = math_ops.subtract(abs_error, quadratic)
+  return math_ops.add(
+      math_ops.multiply(
+          ops.convert_to_tensor(0.5, dtype=quadratic.dtype),
+          math_ops.multiply(quadratic, quadratic)),
+      math_ops.multiply(delta, linear))
+
+
+@keras_export('keras.losses.logcosh')
 def logcosh(y_true, y_pred):
   """Logarithm of the hyperbolic cosine of the prediction error.
 
@@ -473,6 +859,8 @@ def logcosh(y_true, y_pred):
   Returns:
       Tensor with one scalar loss entry per sample.
   """
+  y_pred = ops.convert_to_tensor(y_pred)
+  y_true = math_ops.cast(y_true, y_pred.dtype)
 
   def _logcosh(x):
     return x + nn.softplus(-2. * x) - math_ops.log(2.)
@@ -480,85 +868,139 @@ def logcosh(y_true, y_pred):
   return K.mean(_logcosh(y_pred - y_true), axis=-1)
 
 
-@tf_export('keras.metrics.categorical_crossentropy',
-           'keras.losses.categorical_crossentropy')
-def categorical_crossentropy(y_true, y_pred, from_logits=False):
+@keras_export('keras.metrics.categorical_crossentropy',
+              'keras.losses.categorical_crossentropy')
+def categorical_crossentropy(y_true,
+                             y_pred,
+                             from_logits=False,
+                             label_smoothing=0):
+  """Computes the categorical crossentropy loss.
+
+  Args:
+    y_true: tensor of true targets.
+    y_pred: tensor of predicted targets.
+    from_logits: Whether `y_pred` is expected to be a logits tensor. By default,
+      we assume that `y_pred` encodes a probability distribution.
+    label_smoothing: Float in [0, 1]. If > `0` then smooth the labels.
+
+  Returns:
+    Categorical crossentropy loss value.
+  """
+  y_pred = ops.convert_to_tensor(y_pred)
+  y_true = math_ops.cast(y_true, y_pred.dtype)
+  label_smoothing = ops.convert_to_tensor(label_smoothing, dtype=K.floatx())
+
+  def _smooth_labels():
+    num_classes = math_ops.cast(array_ops.shape(y_true)[1], y_pred.dtype)
+    return y_true * (1.0 - label_smoothing) + (label_smoothing / num_classes)
+
+  y_true = smart_cond.smart_cond(label_smoothing,
+                                 _smooth_labels, lambda: y_true)
   return K.categorical_crossentropy(y_true, y_pred, from_logits=from_logits)
 
 
-@tf_export('keras.metrics.sparse_categorical_crossentropy',
-           'keras.losses.sparse_categorical_crossentropy')
-def sparse_categorical_crossentropy(y_true, y_pred, from_logits=False):
+@keras_export('keras.metrics.sparse_categorical_crossentropy',
+              'keras.losses.sparse_categorical_crossentropy')
+def sparse_categorical_crossentropy(y_true, y_pred, from_logits=False, axis=-1):
   return K.sparse_categorical_crossentropy(
-      y_true, y_pred, from_logits=from_logits)
+      y_true, y_pred, from_logits=from_logits, axis=axis)
+
 
+@keras_export('keras.metrics.binary_crossentropy',
+              'keras.losses.binary_crossentropy')
+def binary_crossentropy(y_true, y_pred, from_logits=False, label_smoothing=0):  # pylint: disable=missing-docstring
+  y_pred = ops.convert_to_tensor(y_pred)
+  y_true = math_ops.cast(y_true, y_pred.dtype)
+  label_smoothing = ops.convert_to_tensor(label_smoothing, dtype=K.floatx())
 
-@tf_export('keras.metrics.binary_crossentropy',
-           'keras.losses.binary_crossentropy')
-def binary_crossentropy(y_true, y_pred, from_logits=False):
+  def _smooth_labels():
+    return y_true * (1.0 - label_smoothing) + 0.5 * label_smoothing
+
+  y_true = smart_cond.smart_cond(label_smoothing,
+                                 _smooth_labels, lambda: y_true)
   return K.mean(
       K.binary_crossentropy(y_true, y_pred, from_logits=from_logits), axis=-1)
 
 
-@tf_export('keras.metrics.kullback_leibler_divergence',
-           'keras.metrics.kld',
-           'keras.metrics.KLD',
-           'keras.losses.kullback_leibler_divergence',
-           'keras.losses.kld',
-           'keras.losses.KLD')
-def kullback_leibler_divergence(y_true, y_pred):
+@keras_export('keras.metrics.kullback_leibler_divergence',
+              'keras.metrics.kld',
+              'keras.metrics.KLD',
+              'keras.losses.kullback_leibler_divergence',
+              'keras.losses.kld',
+              'keras.losses.KLD')
+def kullback_leibler_divergence(y_true, y_pred):  # pylint: disable=missing-docstring
+  y_pred = ops.convert_to_tensor(y_pred)
+  y_true = math_ops.cast(y_true, y_pred.dtype)
   y_true = K.clip(y_true, K.epsilon(), 1)
   y_pred = K.clip(y_pred, K.epsilon(), 1)
   return math_ops.reduce_sum(y_true * math_ops.log(y_true / y_pred), axis=-1)
 
 
-@tf_export('keras.metrics.poisson', 'keras.losses.poisson')
+@keras_export('keras.metrics.poisson', 'keras.losses.poisson')
 def poisson(y_true, y_pred):
+  y_pred = ops.convert_to_tensor(y_pred)
+  y_true = math_ops.cast(y_true, y_pred.dtype)
   return K.mean(y_pred - y_true * math_ops.log(y_pred + K.epsilon()), axis=-1)
 
 
-@tf_export('keras.metrics.cosine_proximity',
-           'keras.metrics.cosine',
-           'keras.losses.cosine_proximity',
-           'keras.losses.cosine')
-def cosine_proximity(y_true, y_pred):
-  y_true = nn.l2_normalize(y_true, axis=-1)
-  y_pred = nn.l2_normalize(y_pred, axis=-1)
-  return -math_ops.reduce_sum(y_true * y_pred, axis=-1)
-
-
-class CosineProximity(Loss):
-  """Computes the cosine distance between `y_true` and `y_pred`.
+# Retaining the legacy namespaces: 'cosine_proximity' and 'cosine'.
+# TODO(psv): Change name of this function to `cosine_similarity` after fixing
+# estimator test.
+@keras_export(
+    'keras.losses.cosine_similarity',
+    v1=[
+        'keras.metrics.cosine_proximity',
+        'keras.metrics.cosine',
+        'keras.losses.cosine_proximity',
+        'keras.losses.cosine',
+        'keras.losses.cosine_similarity',
+    ])
+def cosine_proximity(y_true, y_pred, axis=-1):
+  """Computes the cosine similarity between labels and predictions."""
+  y_true = nn.l2_normalize(y_true, axis=axis)
+  y_pred = nn.l2_normalize(y_pred, axis=axis)
+  return math_ops.reduce_sum(y_true * y_pred, axis=axis)
+
+
+@keras_export('keras.losses.CosineSimilarity')
+class CosineSimilarity(LossFunctionWrapper):
+  """Computes the cosine similarity between `y_true` and `y_pred`.
 
   Usage:
 
   ```python
-  cosine_loss = tf.losses.CosineProximity()
-  loss = cosine_loss([0., 1., 1.], [1., 0., 1.])
-  print('Loss: ', loss.numpy())  # Loss: -0.5
+  cosine_loss = tf.keras.losses.CosineSimilarity(axis=1)
+  loss = cosine_loss([[0., 1.], [1., 1.]], [[1., 0.], [1., 1.]])
+  # l2_norm(y_true) = [[0., 1.], [1./1.414], 1./1.414]]]
+  # l2_norm(y_pred) = [[1., 0.], [1./1.414], 1./1.414]]]
+  # l2_norm(y_true) . l2_norm(y_pred) = [[0., 0.], [0.5, 0.5]]
+  # loss = mean(sum(l2_norm(y_true) . l2_norm(y_pred), axis=1))
+         = ((0. + 0.) +  (0.5 + 0.5)) / 2
+
+  print('Loss: ', loss.numpy())  # Loss: 0.5
   ```
 
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
-  model.compile('sgd', loss=tf.losses.CosineProximity())
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.CosineSimilarity(axis=1))
   ```
-  """
 
-  def call(self, y_true, y_pred):
-    """Calculates the cosine proximity loss.
-
-    Args:
-      y_true: Ground truth values.
-      y_pred: The predicted values.
+  Args:
+    axis: (Optional) Defaults to -1. The dimension along which the cosine
+      similarity is computed.
+    reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to loss.
+      Default value is `SUM_OVER_BATCH_SIZE`.
+    name: Optional name for the op.
+  """
 
-    Returns:
-      Cosine distance loss.
-    """
-    y_pred = ops.convert_to_tensor(y_pred)
-    y_true = math_ops.cast(y_true, y_pred.dtype)
-    return cosine_proximity(y_true, y_pred)
+  def __init__(self,
+               axis=-1,
+               reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name='cosine_similarity'):
+    super(CosineSimilarity, self).__init__(
+        cosine_similarity, reduction=reduction, name=name, axis=axis)
 
 
 # Aliases.
@@ -568,15 +1010,25 @@ mae = MAE = mean_absolute_error
 mape = MAPE = mean_absolute_percentage_error
 msle = MSLE = mean_squared_logarithmic_error
 kld = KLD = kullback_leibler_divergence
-cosine = cosine_proximity
+cosine_similarity = cosine_proximity
+
+
+def is_categorical_crossentropy(loss):
+  result = ((isinstance(loss, CategoricalCrossentropy) or
+             (isinstance(loss, LossFunctionWrapper) and
+              loss.fn == categorical_crossentropy) or
+             (hasattr(loss, '__name__') and
+              loss.__name__ == 'categorical_crossentropy') or
+             (loss == 'categorical_crossentropy')))
+  return result
 
 
-@tf_export('keras.losses.serialize')
+@keras_export('keras.losses.serialize')
 def serialize(loss):
   return serialize_keras_object(loss)
 
 
-@tf_export('keras.losses.deserialize')
+@keras_export('keras.losses.deserialize')
 def deserialize(name, custom_objects=None):
   return deserialize_keras_object(
       name,
@@ -585,7 +1037,7 @@ def deserialize(name, custom_objects=None):
       printable_module_name='loss function')
 
 
-@tf_export('keras.losses.get')
+@keras_export('keras.losses.get')
 def get(identifier):
   if identifier is None:
     return None
diff --git a/tensorflow/python/keras/losses_test.py b/tensorflow/python/keras/losses_test.py
index d2791cdcd3bdac799c92112174f9edf2dbdf87ee..6fe68335e856accd6f60f693d6b97e6328fc8e59 100644
--- a/tensorflow/python/keras/losses_test.py
+++ b/tensorflow/python/keras/losses_test.py
@@ -27,8 +27,7 @@ from tensorflow.python import keras
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops.losses import losses_impl
+from tensorflow.python.keras.utils import losses_utils
 from tensorflow.python.platform import test
 
 try:
@@ -46,7 +45,7 @@ ALL_LOSSES = [keras.losses.mean_squared_error,
               keras.losses.binary_crossentropy,
               keras.losses.kullback_leibler_divergence,
               keras.losses.poisson,
-              keras.losses.cosine_proximity,
+              keras.losses.cosine_similarity,
               keras.losses.logcosh,
               keras.losses.categorical_hinge]
 
@@ -57,7 +56,7 @@ class _MSEMAELoss(object):
   def __init__(self, mse_fraction):
     self.mse_fraction = mse_fraction
 
-  def __call__(self, y_true, y_pred):
+  def __call__(self, y_true, y_pred, sample_weight=None):
     return (self.mse_fraction * keras.losses.mse(y_true, y_pred) +
             (1 - self.mse_fraction) * keras.losses.mae(y_true, y_pred))
 
@@ -95,6 +94,45 @@ class KerasLossesTest(test.TestCase):
       objective_output = keras.losses.sparse_categorical_crossentropy(y_a, y_b)
       assert keras.backend.eval(objective_output).shape == (6,)
 
+  @test_util.run_in_graph_and_eager_modes
+  def test_categorical_crossentropy_loss(self):
+    target = keras.backend.variable(np.random.randint(0, 1, (5, 1)))
+    logits = keras.backend.variable(np.random.random((5, 1)))
+    softmax_output = keras.backend.softmax(logits)
+    output_from_logit = keras.losses.categorical_crossentropy(
+        target, logits, from_logits=True)
+    output_from_softmax = keras.losses.categorical_crossentropy(
+        target, softmax_output)
+    np.testing.assert_allclose(
+        keras.backend.eval(output_from_logit),
+        keras.backend.eval(output_from_softmax), atol=1e-5)
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_sparse_categorical_crossentropy_loss(self):
+    target = keras.backend.variable(np.random.randint(0, 1, (5, 1)))
+    logits = keras.backend.variable(np.random.random((5, 1)))
+    softmax_output = keras.backend.softmax(logits)
+    output_from_logit = keras.losses.sparse_categorical_crossentropy(
+        target, logits, from_logits=True)
+    output_from_softmax = keras.losses.sparse_categorical_crossentropy(
+        target, softmax_output)
+    np.testing.assert_allclose(
+        keras.backend.eval(output_from_logit),
+        keras.backend.eval(output_from_softmax), atol=1e-5)
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_binary_crossentropy_loss(self):
+    target = keras.backend.variable(np.random.randint(0, 1, (5, 1)))
+    logits = keras.backend.variable(np.random.random((5, 1)))
+    sigmoid_output = keras.backend.sigmoid(logits)
+    output_from_logit = keras.losses.binary_crossentropy(
+        target, logits, from_logits=True)
+    output_from_sigmoid = keras.losses.binary_crossentropy(
+        target, sigmoid_output)
+    np.testing.assert_allclose(
+        keras.backend.eval(output_from_logit),
+        keras.backend.eval(output_from_sigmoid), atol=1e-5)
+
   def test_serialization(self):
     fn = keras.losses.get('mse')
     config = keras.losses.serialize(fn)
@@ -142,15 +180,34 @@ class KerasLossesTest(test.TestCase):
         loaded_model = keras.models.load_model(model_filename)
         loaded_model.predict(np.random.rand(128, 2))
 
+  def test_loss_wrapper(self):
+    loss_fn = keras.losses.get('mse')
+    mse_obj = keras.losses.LossFunctionWrapper(loss_fn, name=loss_fn.__name__)
+
+    self.assertEqual(mse_obj.name, 'mean_squared_error')
+    self.assertEqual(mse_obj.reduction,
+                     losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE)
+
+    y_true = constant_op.constant([[1., 9.], [2., 5.]])
+    y_pred = constant_op.constant([[4., 8.], [12., 3.]])
+    sample_weight = constant_op.constant([1.2, 0.5])
+    loss = mse_obj(y_true, y_pred, sample_weight=sample_weight)
+
+    # mse = [((4 - 1)^2 + (8 - 9)^2) / 2, ((12 - 2)^2 + (3 - 5)^2) / 2]
+    # mse = [5, 52]
+    # weighted_mse = [5 * 1.2, 52 * 0.5] = [6, 26]
+    # reduced_weighted_mse = (6 + 26) / 2 =
+    self.assertAllClose(self.evaluate(loss), 16, 1e-2)
+
 
 @test_util.run_all_in_graph_and_eager_modes
 class MeanSquaredErrorTest(test.TestCase):
 
   def test_config(self):
     mse_obj = keras.losses.MeanSquaredError(
-        reduction=losses_impl.ReductionV2.SUM, name='mse_1')
+        reduction=losses_utils.ReductionV2.SUM, name='mse_1')
     self.assertEqual(mse_obj.name, 'mse_1')
-    self.assertEqual(mse_obj.reduction, losses_impl.ReductionV2.SUM)
+    self.assertEqual(mse_obj.reduction, losses_utils.ReductionV2.SUM)
 
   def test_all_correct_unweighted(self):
     mse_obj = keras.losses.MeanSquaredError()
@@ -216,7 +273,7 @@ class MeanSquaredErrorTest(test.TestCase):
 
   def test_no_reduction(self):
     mse_obj = keras.losses.MeanSquaredError(
-        reduction=losses_impl.ReductionV2.NONE)
+        reduction=losses_utils.ReductionV2.NONE)
     y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
     y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
                                   shape=(2, 3),
@@ -227,7 +284,7 @@ class MeanSquaredErrorTest(test.TestCase):
 
   def test_sum_reduction(self):
     mse_obj = keras.losses.MeanSquaredError(
-        reduction=losses_impl.ReductionV2.SUM)
+        reduction=losses_utils.ReductionV2.SUM)
     y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
     y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
                                   shape=(2, 3),
@@ -241,9 +298,9 @@ class MeanAbsoluteErrorTest(test.TestCase):
 
   def test_config(self):
     mae_obj = keras.losses.MeanAbsoluteError(
-        reduction=losses_impl.ReductionV2.SUM, name='mae_1')
+        reduction=losses_utils.ReductionV2.SUM, name='mae_1')
     self.assertEqual(mae_obj.name, 'mae_1')
-    self.assertEqual(mae_obj.reduction, losses_impl.ReductionV2.SUM)
+    self.assertEqual(mae_obj.reduction, losses_utils.ReductionV2.SUM)
 
   def test_all_correct_unweighted(self):
     mae_obj = keras.losses.MeanAbsoluteError()
@@ -309,7 +366,7 @@ class MeanAbsoluteErrorTest(test.TestCase):
 
   def test_no_reduction(self):
     mae_obj = keras.losses.MeanAbsoluteError(
-        reduction=losses_impl.ReductionV2.NONE)
+        reduction=losses_utils.ReductionV2.NONE)
     y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
     y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
                                   shape=(2, 3),
@@ -320,7 +377,7 @@ class MeanAbsoluteErrorTest(test.TestCase):
 
   def test_sum_reduction(self):
     mae_obj = keras.losses.MeanAbsoluteError(
-        reduction=losses_impl.ReductionV2.SUM)
+        reduction=losses_utils.ReductionV2.SUM)
     y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
     y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
                                   shape=(2, 3),
@@ -334,9 +391,9 @@ class MeanAbsolutePercentageErrorTest(test.TestCase):
 
   def test_config(self):
     mape_obj = keras.losses.MeanAbsolutePercentageError(
-        reduction=losses_impl.ReductionV2.SUM, name='mape_1')
+        reduction=losses_utils.ReductionV2.SUM, name='mape_1')
     self.assertEqual(mape_obj.name, 'mape_1')
-    self.assertEqual(mape_obj.reduction, losses_impl.ReductionV2.SUM)
+    self.assertEqual(mape_obj.reduction, losses_utils.ReductionV2.SUM)
 
   def test_unweighted(self):
     mape_obj = keras.losses.MeanAbsolutePercentageError()
@@ -391,9 +448,9 @@ class MeanSquaredLogarithmicErrorTest(test.TestCase):
 
   def test_config(self):
     msle_obj = keras.losses.MeanSquaredLogarithmicError(
-        reduction=losses_impl.ReductionV2.SUM, name='mape_1')
+        reduction=losses_utils.ReductionV2.SUM, name='mape_1')
     self.assertEqual(msle_obj.name, 'mape_1')
-    self.assertEqual(msle_obj.reduction, losses_impl.ReductionV2.SUM)
+    self.assertEqual(msle_obj.reduction, losses_utils.ReductionV2.SUM)
 
   def test_unweighted(self):
     msle_obj = keras.losses.MeanSquaredLogarithmicError()
@@ -444,70 +501,98 @@ class MeanSquaredLogarithmicErrorTest(test.TestCase):
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class CosineProximityTest(test.TestCase):
+class CosineSimilarityTest(test.TestCase):
+
+  def l2_norm(self, x, axis):
+    epsilon = 1e-12
+    square_sum = np.sum(np.square(x), axis=axis, keepdims=True)
+    x_inv_norm = 1 / np.sqrt(np.maximum(square_sum, epsilon))
+    return np.multiply(x, x_inv_norm)
+
+  def setup(self, axis=1):
+    self.np_y_true = np.asarray([[1, 9, 2], [-5, -2, 6]], dtype=np.float32)
+    self.np_y_pred = np.asarray([[4, 8, 12], [8, 1, 3]], dtype=np.float32)
+
+    y_true = self.l2_norm(self.np_y_true, axis)
+    y_pred = self.l2_norm(self.np_y_pred, axis)
+    self.expected_loss = np.sum(np.multiply(y_true, y_pred), axis=(axis,))
+
+    self.y_true = constant_op.constant(self.np_y_true)
+    self.y_pred = constant_op.constant(self.np_y_pred)
 
   def test_config(self):
-    cosine_obj = keras.losses.CosineProximity(
-        reduction=losses_impl.ReductionV2.SUM, name='cosine_loss')
+    cosine_obj = keras.losses.CosineSimilarity(
+        axis=2, reduction=losses_utils.ReductionV2.SUM, name='cosine_loss')
     self.assertEqual(cosine_obj.name, 'cosine_loss')
-    self.assertEqual(cosine_obj.reduction, losses_impl.ReductionV2.SUM)
+    self.assertEqual(cosine_obj.reduction, losses_utils.ReductionV2.SUM)
 
   def test_unweighted(self):
-    cosine_obj = keras.losses.CosineProximity()
-    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
-    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
-                                  shape=(2, 3),
-                                  dtype=dtypes.float32)
-    loss = cosine_obj(y_true, y_pred)
-    self.assertAlmostEqual(self.evaluate(loss), -0.18722, 3)
+    self.setup()
+    cosine_obj = keras.losses.CosineSimilarity()
+    loss = cosine_obj(self.y_true, self.y_pred)
+    expected_loss = np.mean(self.expected_loss)
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
 
   def test_scalar_weighted(self):
-    cosine_obj = keras.losses.CosineProximity()
-    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
-    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
-                                  shape=(2, 3),
-                                  dtype=dtypes.float32)
-    loss = cosine_obj(y_true, y_pred, sample_weight=2.3)
-    self.assertAlmostEqual(self.evaluate(loss), -0.43060, 3)
+    self.setup()
+    cosine_obj = keras.losses.CosineSimilarity()
+    sample_weight = 2.3
+    loss = cosine_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    expected_loss = np.mean(self.expected_loss * sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
 
   def test_sample_weighted(self):
-    cosine_obj = keras.losses.CosineProximity()
-    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
-    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
-                                  shape=(2, 3),
-                                  dtype=dtypes.float32)
-    sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
-    loss = cosine_obj(y_true, y_pred, sample_weight=sample_weight)
-    self.assertAlmostEqual(self.evaluate(loss), 0.15599, 3)
+    self.setup()
+    cosine_obj = keras.losses.CosineSimilarity()
+    sample_weight = np.asarray([1.2, 3.4])
+    loss = cosine_obj(
+        self.y_true,
+        self.y_pred,
+        sample_weight=constant_op.constant(sample_weight))
+    expected_loss = np.mean(self.expected_loss * sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
 
   def test_timestep_weighted(self):
-    cosine_obj = keras.losses.CosineProximity()
-    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3, 1))
-    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
-                                  shape=(2, 3, 1),
-                                  dtype=dtypes.float32)
-    sample_weight = constant_op.constant([3, 6, 5, 0, 4, 2], shape=(2, 3))
-    loss = cosine_obj(y_true, y_pred, sample_weight=sample_weight)
-    self.assertAlmostEqual(self.evaluate(loss), -2.0000, 3)
+    self.setup()
+    cosine_obj = keras.losses.CosineSimilarity()
+    np_y_true = self.np_y_true.reshape((2, 3, 1))
+    np_y_pred = self.np_y_pred.reshape((2, 3, 1))
+    sample_weight = np.asarray([3, 6, 5, 0, 4, 2]).reshape((2, 3))
+
+    y_true = self.l2_norm(np_y_true, 2)
+    y_pred = self.l2_norm(np_y_pred, 2)
+    expected_loss = np.sum(np.multiply(y_true, y_pred), axis=(2,))
+
+    y_true = constant_op.constant(np_y_true)
+    y_pred = constant_op.constant(np_y_pred)
+    loss = cosine_obj(
+        y_true, y_pred, sample_weight=constant_op.constant(sample_weight))
+
+    expected_loss = np.mean(expected_loss * sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
 
   def test_zero_weighted(self):
-    cosine_obj = keras.losses.CosineProximity()
-    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
-    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
-                                  shape=(2, 3),
-                                  dtype=dtypes.float32)
-    loss = cosine_obj(y_true, y_pred, sample_weight=0)
+    self.setup()
+    cosine_obj = keras.losses.CosineSimilarity()
+    loss = cosine_obj(self.y_true, self.y_pred, sample_weight=0)
     self.assertAlmostEqual(self.evaluate(loss), 0., 3)
 
+  def test_axis(self):
+    self.setup(axis=1)
+    cosine_obj = keras.losses.CosineSimilarity(axis=1)
+    loss = cosine_obj(self.y_true, self.y_pred)
+    expected_loss = np.mean(self.expected_loss)
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
 
 @test_util.run_all_in_graph_and_eager_modes
 class BinaryCrossentropyTest(test.TestCase):
 
   def test_config(self):
     bce_obj = keras.losses.BinaryCrossentropy(
-        reduction=losses_impl.ReductionV2.SUM, name='bce_1')
+        reduction=losses_utils.ReductionV2.SUM, name='bce_1')
     self.assertEqual(bce_obj.name, 'bce_1')
-    self.assertEqual(bce_obj.reduction, losses_impl.ReductionV2.SUM)
+    self.assertEqual(bce_obj.reduction, losses_utils.ReductionV2.SUM)
 
   def test_all_correct_unweighted(self):
     y_true = constant_op.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]],
@@ -525,74 +610,132 @@ class BinaryCrossentropyTest(test.TestCase):
     self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
 
   def test_unweighted(self):
+    y_true = np.asarray([1, 0, 1, 0]).reshape([2, 2])
+    y_pred = np.asarray([1, 1, 1, 0], dtype=np.float32).reshape([2, 2])
     bce_obj = keras.losses.BinaryCrossentropy()
-    y_true = constant_op.constant([1, 0, 1, 0, 0, 1], shape=(2, 3))
-    y_pred = constant_op.constant([1, 1, 1, 0, 1, 0],
-                                  shape=(2, 3),
-                                  dtype=dtypes.float32)
     loss = bce_obj(y_true, y_pred)
-    self.assertAlmostEqual(self.evaluate(loss), 8.0004, 3)
+
+    # EPSILON = 1e-7, y = y_true, y` = y_pred, Y_MAX = 0.9999999
+    # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON)
+    # y` = [Y_MAX, Y_MAX, Y_MAX, EPSILON]
+
+    # Loss = -(y log(y` + EPSILON) + (1 - y) log(1 - y` + EPSILON))
+    #      = [-log(Y_MAX + EPSILON), -log(1 - Y_MAX + EPSILON),
+    #         -log(Y_MAX + EPSILON), -log(1)]
+    #      = [0, 15.33, 0, 0]
+    # Reduced loss = 15.33 / 4
+
+    self.assertAlmostEqual(self.evaluate(loss), 3.833, 3)
 
     # Test with logits.
-    logits = constant_op.constant([10., 10., 10., -10., 10, -10],
-                                  shape=(2, 3),
-                                  dtype=dtypes.float32)
+    y_true = constant_op.constant([[1, 0, 1], [0, 1, 1]])
+    logits = constant_op.constant([[100.0, -100.0, 100.0],
+                                   [100.0, 100.0, -100.0]])
     bce_obj = keras.losses.BinaryCrossentropy(from_logits=True)
     loss = bce_obj(y_true, logits)
-    self.assertAlmostEqual(self.evaluate(loss), 5., 3)
+
+    # Loss = max(x, 0) - x * z + log(1 + exp(-abs(x)))
+    #            (where x = logits and z = y_true)
+    #      = [((100 - 100 * 1 + log(1 + exp(-100))) +
+    #          (0 + 100 * 0 + log(1 + exp(-100))) +
+    #          (100 - 100 * 1 + log(1 + exp(-100))),
+    #         ((100 - 100 * 0 + log(1 + exp(-100))) +
+    #          (100 - 100 * 1 + log(1 + exp(-100))) +
+    #          (0 + 100 * 1 + log(1 + exp(-100))))]
+    #      = [(0 + 0 + 0) / 3, 200 / 3]
+    # Reduced loss = (0 + 66.666) / 2
+
+    self.assertAlmostEqual(self.evaluate(loss), 33.333, 3)
 
   def test_scalar_weighted(self):
     bce_obj = keras.losses.BinaryCrossentropy()
-    y_true = constant_op.constant([1, 0, 1, 0, 0, 1], shape=(2, 3))
-    y_pred = constant_op.constant([1, 1, 1, 0, 1, 0],
-                                  shape=(2, 3),
-                                  dtype=dtypes.float32)
+    y_true = np.asarray([1, 0, 1, 0]).reshape([2, 2])
+    y_pred = np.asarray([1, 1, 1, 0], dtype=np.float32).reshape([2, 2])
     loss = bce_obj(y_true, y_pred, sample_weight=2.3)
-    self.assertAlmostEqual(self.evaluate(loss), 18.4010, 3)
+
+    # EPSILON = 1e-7, y = y_true, y` = y_pred, Y_MAX = 0.9999999
+    # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON)
+    # y` = [Y_MAX, Y_MAX, Y_MAX, EPSILON]
+
+    # Loss = -(y log(y` + EPSILON) + (1 - y) log(1 - y` + EPSILON))
+    #      = [-log(Y_MAX + EPSILON), -log(1 - Y_MAX + EPSILON),
+    #         -log(Y_MAX + EPSILON), -log(1)]
+    #      = [0, 15.33, 0, 0]
+    # Weighted loss = [0, 15.33 * 2.3, 0, 0]
+    # Reduced loss = 15.33 * 2.3 / 4
+
+    self.assertAlmostEqual(self.evaluate(loss), 8.817, 3)
 
     # Test with logits.
-    y_true = array_ops.ones((32, 1))
-    logits = array_ops.ones((32, 1), dtype=dtypes.float32)
+    y_true = constant_op.constant([[1, 0, 1], [0, 1, 1]])
+    logits = constant_op.constant([[100.0, -100.0, 100.0],
+                                   [100.0, 100.0, -100.0]])
     bce_obj = keras.losses.BinaryCrossentropy(from_logits=True)
     loss = bce_obj(y_true, logits, sample_weight=2.3)
-    self.assertAlmostEqual(self.evaluate(loss), 0.7205, 3)
+
+    # Loss = max(x, 0) - x * z + log(1 + exp(-abs(x)))
+    #            (where x = logits and z = y_true)
+    # Loss = [(0 + 0 + 0) / 3, 200 / 3]
+    # Weighted loss = [0 * 2.3, 66.666 * 2.3]
+    # Reduced loss = (0 + 66.666 * 2.3) / 2
+
+    self.assertAlmostEqual(self.evaluate(loss), 76.667, 3)
 
   def test_sample_weighted(self):
     bce_obj = keras.losses.BinaryCrossentropy()
-    y_true = constant_op.constant([1, 0, 1, 0, 0, 1], shape=(2, 3))
-    y_pred = constant_op.constant([1, 1, 1, 0, 1, 0],
-                                  shape=(2, 3),
-                                  dtype=dtypes.float64)
+    y_true = np.asarray([1, 0, 1, 0]).reshape([2, 2])
+    y_pred = np.asarray([1, 1, 1, 0], dtype=np.float32).reshape([2, 2])
     sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
     loss = bce_obj(y_true, y_pred, sample_weight=sample_weight)
-    self.assertAlmostEqual(self.evaluate(loss), 21.4907, 3)
+
+    # EPSILON = 1e-7, y = y_true, y` = y_pred, Y_MAX = 0.9999999
+    # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON)
+    # y` = [Y_MAX, Y_MAX, Y_MAX, EPSILON]
+
+    # Loss = -(y log(y` + EPSILON) + (1 - y) log(1 - y` + EPSILON))
+    #      = [-log(Y_MAX + EPSILON), -log(1 - Y_MAX + EPSILON),
+    #         -log(Y_MAX + EPSILON), -log(1)]
+    #      = [0, 15.33, 0, 0]
+    # Reduced loss = 15.33 * 1.2 / 4
+
+    self.assertAlmostEqual(self.evaluate(loss), 4.6, 3)
 
     # Test with logits.
-    y_true = constant_op.constant([[0, 0, 1], [1, 0, 0], [0, 1, 0]])
-    logits = constant_op.constant(
-        [[100.0, -100.0, -100.0], [-100.0, 100.0, -100.0],
-         [-100.0, -100.0, 100.0]],
-        dtype=dtypes.float64)
-    weights = constant_op.constant([3, 2, 8])
+    y_true = constant_op.constant([[1, 0, 1], [0, 1, 1]])
+    logits = constant_op.constant([[100.0, -100.0, 100.0],
+                                   [100.0, 100.0, -100.0]])
+    weights = constant_op.constant([4, 3])
     bce_obj = keras.losses.BinaryCrossentropy(from_logits=True)
     loss = bce_obj(y_true, logits, sample_weight=weights)
-    self.assertAlmostEqual(self.evaluate(loss), 288.8888, 3)
+
+    # Loss = max(x, 0) - x * z + log(1 + exp(-abs(x)))
+    #            (where x = logits and z = y_true)
+    # Loss = [(0 + 0 + 0)/3, 200 / 3]
+    # Weighted loss = [0 * 4, 66.666 * 3]
+    # Reduced loss = (0 + 66.666 * 3) / 2
+
+    self.assertAlmostEqual(self.evaluate(loss), 100, 3)
 
   def test_no_reduction(self):
-    y_true = constant_op.constant(((1, 0, 1), (1, 1, 0), (0, 1, 1)))
-    logits = constant_op.constant(((100.0, -100.0, 100.0),
-                                   (100.0, -100.0, 100.0),
-                                   (100.0, 100.0, -100.0)))
+    y_true = constant_op.constant([[1, 0, 1], [0, 1, 1]])
+    logits = constant_op.constant([[100.0, -100.0, 100.0],
+                                   [100.0, 100.0, -100.0]])
     bce_obj = keras.losses.BinaryCrossentropy(
-        from_logits=True, reduction=losses_impl.ReductionV2.NONE)
+        from_logits=True, reduction=losses_utils.ReductionV2.NONE)
     loss = bce_obj(y_true, logits)
-    self.assertAllClose((0., 66.6666, 66.6666), self.evaluate(loss), 3)
+
+    # Loss = max(x, 0) - x * z + log(1 + exp(-abs(x)))
+    #            (where x = logits and z = y_true)
+    # Loss = [(0 + 0 + 0)/3, (200)/3]
+
+    self.assertAllClose((0., 66.6666), self.evaluate(loss), 3)
 
   def test_label_smoothing(self):
     logits = constant_op.constant([[100.0, -100.0, -100.0]])
     y_true = constant_op.constant([[1, 0, 1]])
     label_smoothing = 0.1
     # Loss: max(x, 0) - x * z + log(1 + exp(-abs(x)))
+    #            (where x = logits and z = y_true)
     # Label smoothing: z' = z * (1 - L) + 0.5L
     #                  1  = 1 - 0.5L
     #                  0  = 0.5L
@@ -613,9 +756,9 @@ class CategoricalCrossentropyTest(test.TestCase):
 
   def test_config(self):
     cce_obj = keras.losses.CategoricalCrossentropy(
-        reduction=losses_impl.ReductionV2.SUM, name='bce_1')
+        reduction=losses_utils.ReductionV2.SUM, name='bce_1')
     self.assertEqual(cce_obj.name, 'bce_1')
-    self.assertEqual(cce_obj.reduction, losses_impl.ReductionV2.SUM)
+    self.assertEqual(cce_obj.reduction, losses_utils.ReductionV2.SUM)
 
   def test_all_correct_unweighted(self):
     y_true = constant_op.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]],
@@ -679,7 +822,7 @@ class CategoricalCrossentropyTest(test.TestCase):
     y_true = constant_op.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
     logits = constant_op.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]])
     cce_obj = keras.losses.CategoricalCrossentropy(
-        from_logits=True, reduction=losses_impl.ReductionV2.NONE)
+        from_logits=True, reduction=losses_utils.ReductionV2.NONE)
     loss = cce_obj(y_true, logits)
     self.assertAllClose((0.001822, 0.000459, 0.169846), self.evaluate(loss), 3)
 
@@ -705,22 +848,26 @@ class CategoricalCrossentropyTest(test.TestCase):
     expected_value = 400.0 * label_smoothing / 3.0
     self.assertAlmostEqual(self.evaluate(loss), expected_value, 3)
 
-  def test_all_correct_unweighted_sparse(self):
+
+@test_util.run_all_in_graph_and_eager_modes
+class SparseCategoricalCrossentropyTest(test.TestCase):
+
+  def test_all_correct_unweighted(self):
     y_true = constant_op.constant([[0], [1], [2]], dtype=dtypes.int64)
     y_pred = constant_op.constant([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]],
                                   dtype=dtypes.float32)
-    cce_obj = keras.losses.CategoricalCrossentropy()
+    cce_obj = keras.losses.SparseCategoricalCrossentropy()
     loss = cce_obj(y_true, y_pred)
     self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
 
     # Test with logits.
     logits = constant_op.constant([[10., 0., 0.], [0., 10., 0.], [0., 0., 10.]])
-    cce_obj = keras.losses.CategoricalCrossentropy(from_logits=True)
+    cce_obj = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
     loss = cce_obj(y_true, logits)
     self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
 
-  def test_unweighted_sparse(self):
-    cce_obj = keras.losses.CategoricalCrossentropy()
+  def test_unweighted(self):
+    cce_obj = keras.losses.SparseCategoricalCrossentropy()
     y_true = constant_op.constant([0, 1, 2])
     y_pred = constant_op.constant(
         [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]], dtype=dtypes.float32)
@@ -729,12 +876,12 @@ class CategoricalCrossentropyTest(test.TestCase):
 
     # Test with logits.
     logits = constant_op.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]])
-    cce_obj = keras.losses.CategoricalCrossentropy(from_logits=True)
+    cce_obj = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
     loss = cce_obj(y_true, logits)
     self.assertAlmostEqual(self.evaluate(loss), .0573, 3)
 
-  def test_scalar_weighted_sparse(self):
-    cce_obj = keras.losses.CategoricalCrossentropy()
+  def test_scalar_weighted(self):
+    cce_obj = keras.losses.SparseCategoricalCrossentropy()
     y_true = constant_op.constant([[0], [1], [2]])
     y_pred = constant_op.constant(
         [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]], dtype=dtypes.float32)
@@ -743,12 +890,12 @@ class CategoricalCrossentropyTest(test.TestCase):
 
     # Test with logits.
     logits = constant_op.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]])
-    cce_obj = keras.losses.CategoricalCrossentropy(from_logits=True)
+    cce_obj = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
     loss = cce_obj(y_true, logits, sample_weight=2.3)
     self.assertAlmostEqual(self.evaluate(loss), .1317, 3)
 
-  def test_sample_weighted_sparse(self):
-    cce_obj = keras.losses.CategoricalCrossentropy()
+  def test_sample_weighted(self):
+    cce_obj = keras.losses.SparseCategoricalCrossentropy()
     y_true = constant_op.constant([[0], [1], [2]])
     y_pred = constant_op.constant(
         [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]], dtype=dtypes.float32)
@@ -758,18 +905,727 @@ class CategoricalCrossentropyTest(test.TestCase):
 
     # Test with logits.
     logits = constant_op.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]])
-    cce_obj = keras.losses.CategoricalCrossentropy(from_logits=True)
+    cce_obj = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
     loss = cce_obj(y_true, logits, sample_weight=sample_weight)
     self.assertAlmostEqual(self.evaluate(loss), 0.31829, 3)
 
-  def test_no_reduction_sparse(self):
+  def test_no_reduction(self):
     y_true = constant_op.constant([[0], [1], [2]])
     logits = constant_op.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]])
-    cce_obj = keras.losses.CategoricalCrossentropy(
-        from_logits=True, reduction=losses_impl.ReductionV2.NONE)
+    cce_obj = keras.losses.SparseCategoricalCrossentropy(
+        from_logits=True, reduction=losses_utils.ReductionV2.NONE)
     loss = cce_obj(y_true, logits)
     self.assertAllClose((0.001822, 0.000459, 0.169846), self.evaluate(loss), 3)
 
 
+@test_util.run_all_in_graph_and_eager_modes
+class HingeTest(test.TestCase):
+
+  def test_config(self):
+    hinge_obj = keras.losses.Hinge(
+        reduction=losses_utils.ReductionV2.SUM, name='hinge_loss')
+    self.assertEqual(hinge_obj.name, 'hinge_loss')
+    self.assertEqual(hinge_obj.reduction, losses_utils.ReductionV2.SUM)
+
+  def test_unweighted(self):
+    hinge_obj = keras.losses.Hinge()
+    y_true = constant_op.constant([[0, 1, 0, 1], [0, 0, 1, 1]])
+    y_pred = constant_op.constant([[-0.3, 0.2, -0.1, 1.6],
+                                   [-0.25, -1., 0.5, 0.6]])
+
+    # loss = max(0, 1-y_true * y_pred), where y_true is -1/1
+
+    # y_true = [[-1, 1, -1, 1], [-1, -1, 1, 1]]
+    # y_true * y_pred = [[0.3, 0.2, 0.1, 1.6], [0.25, 1, 0.5, 0.6]]
+    # 1 - y_true * y_pred = [[0.7, 0.8, 0.9, -0.6], [0.75, 0, 0.5, 0.4]]
+    # loss = [(0.7 + 0.8 + 0.9 + 0) / 4, (0.75 + 0 + 0.5 + 0.4) / 4]
+    #      = [0.6, 0.4125]
+    # reduced loss = (0.6 + 0.4125) / 2
+
+    loss = hinge_obj(y_true, y_pred)
+    self.assertAllClose(0.506, self.evaluate(loss), atol=1e-3)
+
+  def test_scalar_weighted(self):
+    hinge_obj = keras.losses.Hinge()
+    y_true = constant_op.constant([[0, 1, 0, 1], [0, 0, 1, 1]])
+    y_pred = constant_op.constant([[-0.3, 0.2, -0.1, 1.6],
+                                   [-0.25, -1., 0.5, 0.6]])
+
+    # loss = max(0, 1-y_true * y_pred), where y_true is -1/1
+
+    # y_true = [[-1, 1, -1, 1], [-1, -1, 1, 1]]
+    # y_true * y_pred = [[0.3, 0.2, 0.1, 1.6], [0.25, 1, 0.5, 0.6]]
+    # 1 - y_true * y_pred = [[0.7, 0.8, 0.9, -0.6], [0.75, 0, 0.5, 0.4]]
+    # loss = [(0.7 + 0.8 + 0.9 + 0) / 4, (0.75 + 0 + 0.5 + 0.4) / 4]
+    #      = [0.6, 0.4125]
+    # weighted_loss = [0.6 * 2.3, 0.4125 * 2.3]
+    # reduced loss = (0.6 + 0.4125) * 2.3 / 2
+
+    loss = hinge_obj(y_true, y_pred, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), 1.164, 3)
+
+    # Verify we get the same output when the same input is given
+    loss_2 = hinge_obj(y_true, y_pred, sample_weight=2.3)
+    self.assertAllClose(self.evaluate(loss), self.evaluate(loss_2), 1e-3)
+
+  def test_sample_weighted(self):
+    hinge_obj = keras.losses.Hinge()
+    y_true = constant_op.constant([[0, 1, 0, 1], [0, 0, 1, 1]])
+    y_pred = constant_op.constant([[-0.3, 0.2, -0.1, 1.6],
+                                   [-0.25, -1., 0.5, 0.6]])
+
+    # loss = max(0, 1-y_true * y_pred), where y_true is -1/1
+
+    # y_true = [[-1, 1, -1, 1], [-1, -1, 1, 1]]
+    # y_true * y_pred = [[0.3, 0.2, 0.1, 1.6], [0.25, 1, 0.5, 0.6]]
+    # 1 - y_true * y_pred = [[0.7, 0.8, 0.9, -0.6], [0.75, 0, 0.5, 0.4]]
+    # loss = [(0.7 + 0.8 + 0.9 + 0) / 4, (0.75 + 0 + 0.5 + 0.4) / 4]
+    #      = [0.6, 0.4125]
+    # weighted loss = [0.6 * 1.2, 0.4125 * 3.4]
+    # reduced loss = (0.6 * 1.2 + 0.4125 * 3.4) / 2
+
+    sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
+    loss = hinge_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose(self.evaluate(loss), 1.061, 1e-3)
+
+  def test_timestep_weighted(self):
+    hinge_obj = keras.losses.Hinge()
+    y_true = constant_op.constant([[0, 1, 0, 1], [0, 0, 1, 1]], shape=(2, 4, 1))
+    y_pred = constant_op.constant(
+        [[-0.3, 0.2, -0.1, 1.6], [-0.25, -1., 0.5, 0.6]], shape=(2, 4, 1))
+    sample_weight = constant_op.constant([3, 6, 5, 0, 4, 2, 1, 3], shape=(2, 4))
+
+    # loss = max(0, 1-y_true * y_pred), where y_true is -1/1
+
+    # y_true = [[[-1], [1], [-1], [1]], [[-1], [-1], [1], [1]]]
+    # y_true * y_pred = [[[0.3], [0.2], [0.1], [1.6]],
+    #                    [[0.25], [1], [0.5], [0.6]]]
+    # 1 - y_true * y_pred = [[[0.7], [0.8], [0.9], [-0.6]],
+    #                        [[0.75], [0], [0.5], [0.4]]]
+    # loss = [[0.7, 0.8, 0.9, 0], [0.75, 0, 0.5, 0.4]]
+    # weighted loss    = [[2.1, 4.8, 4.5, 0], [3, 0, 0.5, 1.2]]
+    # reduced loss = (2.1 + 4.8 + 4.5 + 0 + 3 + 0 + 0.5 + 1.2) / 8
+
+    loss = hinge_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose(self.evaluate(loss), 2.012, 1e-3)
+
+  def test_zero_weighted(self):
+    hinge_obj = keras.losses.Hinge()
+    y_true = constant_op.constant([[0, 1, 0, 1], [0, 0, 1, 1]])
+    y_pred = constant_op.constant([[-0.3, 0.2, -0.1, 1.6],
+                                   [-0.25, -1., 0.5, 0.6]])
+    loss = hinge_obj(y_true, y_pred, sample_weight=0)
+    self.assertAllClose(self.evaluate(loss), 0., 1e-3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class SquaredHingeTest(test.TestCase):
+
+  def test_config(self):
+    sq_hinge_obj = keras.losses.SquaredHinge(
+        reduction=losses_utils.ReductionV2.SUM, name='sq_hinge_loss')
+    self.assertEqual(sq_hinge_obj.name, 'sq_hinge_loss')
+    self.assertEqual(sq_hinge_obj.reduction, losses_utils.ReductionV2.SUM)
+
+  def test_unweighted(self):
+    sq_hinge_obj = keras.losses.SquaredHinge()
+    y_true = constant_op.constant([[0, 1, 0, 1], [0, 0, 1, 1]])
+    y_pred = constant_op.constant([[-0.3, 0.2, -0.1, 1.6],
+                                   [-0.25, -1., 0.5, 0.6]])
+
+    # loss = max(0, 1-y_true * y_pred), where y_true is -1/1
+
+    # y_true = [[-1, 1, -1, 1], [-1, -1, 1, 1]]
+    # y_true * y_pred = [[0.3, 0.2, 0.1, 1.6], [0.25, 1, 0.5, 0.6]]
+    # 1 - y_true * y_pred = [[0.7, 0.8, 0.9, -0.6], [0.75, 0, 0.5, 0.4]]
+    # max(0, 1 - y_true * y_pred) = [[0.7, 0.8, 0.9, 0], [0.75, 0, 0.5, 0.4]]
+    # squared(max(0, 1 - y_true * y_pred)) = [[0.49, 0.64, 0.81, 0],
+    #                                         [0.5625, 0, 0.25, 0.16]]
+    # loss = [(0.49 + 0.64 + 0.81 + 0) / 4, (0.5625 + 0 + 0.25 + 0.16) / 4]
+    #      = [0.485, 0.2431]
+    # reduced loss = (0.485 + 0.2431) / 2
+
+    loss = sq_hinge_obj(y_true, y_pred)
+    self.assertAllClose(self.evaluate(loss), 0.364, 1e-3)
+
+  def test_scalar_weighted(self):
+    sq_hinge_obj = keras.losses.SquaredHinge()
+    y_true = constant_op.constant([[0, 1, 0, 1], [0, 0, 1, 1]])
+    y_pred = constant_op.constant([[-0.3, 0.2, -0.1, 1.6],
+                                   [-0.25, -1., 0.5, 0.6]])
+
+    # loss = max(0, 1-y_true * y_pred), where y_true is -1/1
+
+    # y_true = [[-1, 1, -1, 1], [-1, -1, 1, 1]]
+    # y_true * y_pred = [[0.3, 0.2, 0.1, 1.6], [0.25, 1, 0.5, 0.6]]
+    # 1 - y_true * y_pred = [[0.7, 0.8, 0.9, -0.6], [0.75, 0, 0.5, 0.4]]
+    # max(0, 1 - y_true * y_pred) = [[0.7, 0.8, 0.9, 0], [0.75, 0, 0.5, 0.4]]
+    # squared(max(0, 1 - y_true * y_pred)) = [[0.49, 0.64, 0.81, 0],
+    #                                         [0.5625, 0, 0.25, 0.16]]
+    # loss = [(0.49 + 0.64 + 0.81 + 0) / 4, (0.5625 + 0 + 0.25 + 0.16) / 4]
+    #      = [0.485, 0.2431]
+    # weighted loss = [0.485 * 2.3, 0.2431 * 2.3]
+    # reduced loss = (0.485 + 0.2431) * 2.3 / 2
+
+    loss = sq_hinge_obj(y_true, y_pred, sample_weight=2.3)
+    self.assertAllClose(self.evaluate(loss), 0.837, 1e-3)
+
+    # Verify we get the same output when the same input is given
+    loss_2 = sq_hinge_obj(y_true, y_pred, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), self.evaluate(loss_2), 3)
+
+  def test_sample_weighted(self):
+    sq_hinge_obj = keras.losses.SquaredHinge()
+    y_true = constant_op.constant([[0, 1, 0, 1], [0, 0, 1, 1]])
+    y_pred = constant_op.constant([[-0.3, 0.2, -0.1, 1.6],
+                                   [-0.25, -1., 0.5, 0.6]])
+
+    # loss = max(0, 1-y_true * y_pred), where y_true is -1/1
+
+    # y_true = [[-1, 1, -1, 1], [-1, -1, 1, 1]]
+    # y_true * y_pred = [[0.3, 0.2, 0.1, 1.6], [0.25, 1, 0.5, 0.6]]
+    # 1 - y_true * y_pred = [[0.7, 0.8, 0.9, -0.6], [0.75, 0, 0.5, 0.4]]
+    # max(0, 1 - y_true * y_pred) = [[0.7, 0.8, 0.9, 0], [0.75, 0, 0.5, 0.4]]
+    # squared(max(0, 1 - y_true * y_pred)) = [[0.49, 0.64, 0.81, 0],
+    #                                         [0.5625, 0, 0.25, 0.16]]
+    # loss = [(0.49 + 0.64 + 0.81 + 0) / 4, (0.5625 + 0 + 0.25 + 0.16) / 4]
+    #      = [0.485, 0.2431]
+    # weighted loss = [0.485 * 1.2, 0.2431 * 3.4]
+    # reduced loss = (0.485 * 1.2 + 0.2431 * 3.4) / 2
+
+    sample_weight = constant_op.constant([1.2, 3.4])
+    loss = sq_hinge_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose(self.evaluate(loss), 0.704, 1e-3)
+
+  def test_timestep_weighted(self):
+    sq_hinge_obj = keras.losses.SquaredHinge()
+    y_true = constant_op.constant([[0, 1, 0, 1], [0, 0, 1, 1]], shape=(2, 4, 1))
+    y_pred = constant_op.constant(
+        [[-0.3, 0.2, -0.1, 1.6], [-0.25, -1., 0.5, 0.6]], shape=(2, 4, 1))
+    sample_weight = constant_op.constant([3, 6, 5, 0, 4, 2, 1, 3], shape=(2, 4))
+
+    # loss = max(0, 1-y_true * y_pred), where y_true is -1/1
+
+    # y_true = [[[-1], [1], [-1], [1]], [[-1], [-1], [1], [1]]]
+    # y_true * y_pred = [[[0.3], [0.2], [0.1], [1.6]],
+    #                    [[0.25], [1], [0.5], [0.6]]]
+    # 1 - y_true * y_pred = [[[0.7], [0.8], [0.9], [-0.6]],
+    #                        [[0.75], [0], [0.5], [0.4]]]
+    # loss = [[0.49, 0.64, 0.81, 0], [0.5625, 0, 0.25, 0.16]]
+    # weighted loss    = [[1.47, 3.84, 4.05, 0], [2.25, 0, 0.25, 0.48]]
+    # reduced loss = (1.47 + 3.84 + 4.05 + 0 + 2.25 + 0 + 0.25 + 0.48) / 8
+
+    loss = sq_hinge_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose(self.evaluate(loss), 1.542, 1e-3)
+
+  def test_zero_weighted(self):
+    sq_hinge_obj = keras.losses.SquaredHinge()
+    y_true = constant_op.constant([[0, 1, 0, 1], [0, 0, 1, 1]])
+    y_pred = constant_op.constant([[-0.3, 0.2, -0.1, 1.6],
+                                   [-0.25, -1., 0.5, 0.6]])
+    loss = sq_hinge_obj(y_true, y_pred, sample_weight=0)
+    self.assertAllClose(self.evaluate(loss), 0., 1e-3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class CategoricalHingeTest(test.TestCase):
+
+  def test_config(self):
+    cat_hinge_obj = keras.losses.CategoricalHinge(
+        reduction=losses_utils.ReductionV2.SUM, name='cat_hinge_loss')
+    self.assertEqual(cat_hinge_obj.name, 'cat_hinge_loss')
+    self.assertEqual(cat_hinge_obj.reduction, losses_utils.ReductionV2.SUM)
+
+  def test_unweighted(self):
+    cat_hinge_obj = keras.losses.CategoricalHinge()
+    y_true = constant_op.constant([1, 9, 2, -5], shape=(2, 2))
+    y_pred = constant_op.constant([4, 8, 12, 8],
+                                  shape=(2, 2),
+                                  dtype=dtypes.float32)
+    loss = cat_hinge_obj(y_true, y_pred)
+
+    # pos = reduce_sum(y_true * y_pred) = [1*4+8*9, 12*2+8*-5] = [76, -16]
+    # neg = reduce_max((1. - y_true) * y_pred) = [[0, -64], [-12, 48]] = [0, 48]
+    # cat_hinge = max(0., neg - pos + 1.) = [0, 65]
+    # reduced_loss = (0 + 65)/2 = 32.5
+    self.assertAlmostEqual(self.evaluate(loss), 32.5, 3)
+
+  def test_scalar_weighted(self):
+    cat_hinge_obj = keras.losses.CategoricalHinge()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = cat_hinge_obj(y_true, y_pred, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), 83.95, 3)
+
+    # Verify we get the same output when the same input is given
+    loss_2 = cat_hinge_obj(y_true, y_pred, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), self.evaluate(loss_2), 3)
+
+  def test_sample_weighted(self):
+    cat_hinge_obj = keras.losses.CategoricalHinge()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
+    loss = cat_hinge_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 124.1, 3)
+
+  def test_timestep_weighted(self):
+    cat_hinge_obj = keras.losses.CategoricalHinge()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3, 1))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3, 1),
+                                  dtype=dtypes.float32)
+    sample_weight = constant_op.constant([3, 6, 5, 0, 4, 2], shape=(2, 3))
+    loss = cat_hinge_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 4.0, 3)
+
+  def test_zero_weighted(self):
+    cat_hinge_obj = keras.losses.CategoricalHinge()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = cat_hinge_obj(y_true, y_pred, sample_weight=0)
+    self.assertAlmostEqual(self.evaluate(loss), 0., 3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class LogLossTest(test.TestCase):
+
+  def setup(self):
+    # TODO(psv): Change to setUp() after b/122319309 is fixed.
+    y_pred = np.asarray([.9, .2, .2, .8, .4, .6]).reshape((2, 3))
+    y_true = np.asarray([1., 0., 1., 1., 0., 0.]).reshape((2, 3))
+    epsilon = 1e-7  # to avoid log 0
+
+    self.batch_size = 6
+    self.expected_losses = np.multiply(y_true, np.log(y_pred + epsilon))
+    self.expected_losses += np.multiply(1 - y_true,
+                                        np.log(1 - y_pred + epsilon))
+    self.expected_losses = -self.expected_losses
+
+    self.y_pred = constant_op.constant(y_pred)
+    self.y_true = constant_op.constant(y_true)
+
+  def test_config(self):
+    log_loss_obj = keras.losses.LogLoss(
+        reduction=losses_utils.ReductionV2.SUM, name='log')
+    self.assertEqual(log_loss_obj.name, 'log')
+    self.assertEqual(log_loss_obj.reduction, losses_utils.ReductionV2.SUM)
+
+  def test_all_correct(self):
+    self.setup()
+    log_loss_obj = keras.losses.LogLoss()
+    loss = log_loss_obj(self.y_true, self.y_true)
+    self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
+
+  def test_unweighted(self):
+    self.setup()
+    log_loss_obj = keras.losses.LogLoss()
+    loss = log_loss_obj(self.y_true, self.y_pred)
+    actual_loss = np.sum(self.expected_losses) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3)
+
+  def test_scalar_weighted(self):
+    self.setup()
+    log_loss_obj = keras.losses.LogLoss()
+    sample_weight = 2.3
+    loss = log_loss_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    actual_loss = sample_weight * np.sum(self.expected_losses) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3)
+
+    # Verify we get the same output when the same input is given
+    loss_2 = log_loss_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), self.evaluate(loss_2), 3)
+
+  def test_sample_weighted(self):
+    self.setup()
+    log_loss_obj = keras.losses.LogLoss()
+    sample_weight = constant_op.constant((1.2, 3.4), shape=(2, 1))
+
+    loss = log_loss_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    actual_loss = np.multiply(
+        self.expected_losses,
+        np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape((2, 3)))
+    actual_loss = np.sum(actual_loss) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3)
+
+  def test_timestep_weighted(self):
+    log_loss_obj = keras.losses.LogLoss()
+
+    y_pred = np.asarray([.9, .2, .2, .8, .4, .6]).reshape((2, 3, 1))
+    y_true = np.asarray([1., 0., 1., 1., 0., 0.]).reshape((2, 3, 1))
+    epsilon = 1e-7  # to avoid log 0
+    batch_size = 6
+
+    expected_losses = np.multiply(y_true, np.log(y_pred + epsilon))
+    expected_losses += np.multiply(1 - y_true, np.log(1 - y_pred + epsilon))
+
+    y_pred = constant_op.constant(y_pred)
+    y_true = constant_op.constant(y_true)
+    sample_weight = np.array([3, 6, 5, 0, 4, 2]).reshape((2, 3, 1))
+    loss = log_loss_obj(
+        y_true,
+        y_pred,
+        sample_weight=constant_op.constant(sample_weight, shape=(2, 3)))
+    actual_loss = np.multiply(-expected_losses, sample_weight)
+    actual_loss = np.sum(actual_loss) / batch_size
+    self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3)
+
+  def test_zero_weighted(self):
+    self.setup()
+    log_loss_obj = keras.losses.LogLoss()
+    sample_weight = 0
+    loss = log_loss_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 0., 3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class LogCoshTest(test.TestCase):
+
+  def setup(self):
+    y_pred = np.asarray([1, 9, 2, -5, -2, 6]).reshape((2, 3))
+    y_true = np.asarray([4, 8, 12, 8, 1, 3]).reshape((2, 3))
+
+    self.batch_size = 6
+    error = y_pred - y_true
+    self.expected_losses = np.log((np.exp(error) + np.exp(-error)) / 2)
+
+    self.y_pred = constant_op.constant(y_pred, dtype=dtypes.float32)
+    self.y_true = constant_op.constant(y_true)
+
+  def test_config(self):
+    logcosh_obj = keras.losses.LogCosh(
+        reduction=losses_utils.ReductionV2.SUM, name='logcosh_loss')
+    self.assertEqual(logcosh_obj.name, 'logcosh_loss')
+    self.assertEqual(logcosh_obj.reduction, losses_utils.ReductionV2.SUM)
+
+  def test_unweighted(self):
+    self.setup()
+    logcosh_obj = keras.losses.LogCosh()
+
+    loss = logcosh_obj(self.y_true, self.y_pred)
+    expected_loss = np.sum(self.expected_losses) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+  def test_scalar_weighted(self):
+    self.setup()
+    logcosh_obj = keras.losses.LogCosh()
+    sample_weight = 2.3
+
+    loss = logcosh_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    expected_loss = sample_weight * np.sum(
+        self.expected_losses) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+    # Verify we get the same output when the same input is given
+    loss_2 = logcosh_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), self.evaluate(loss_2), 3)
+
+  def test_sample_weighted(self):
+    self.setup()
+    logcosh_obj = keras.losses.LogCosh()
+
+    sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
+    loss = logcosh_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+
+    expected_loss = np.multiply(
+        self.expected_losses,
+        np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape((2, 3)))
+    expected_loss = np.sum(expected_loss) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+  def test_timestep_weighted(self):
+    self.setup()
+    logcosh_obj = keras.losses.LogCosh()
+    y_true = np.asarray([1, 9, 2, -5, -2, 6]).reshape(2, 3, 1)
+    y_pred = np.asarray([4, 8, 12, 8, 1, 3]).reshape(2, 3, 1)
+    error = y_pred - y_true
+    expected_losses = np.log((np.exp(error) + np.exp(-error)) / 2)
+    sample_weight = np.array([3, 6, 5, 0, 4, 2]).reshape((2, 3, 1))
+
+    y_pred = constant_op.constant(y_pred, dtype=dtypes.float32)
+    y_true = constant_op.constant(y_true)
+    loss = logcosh_obj(
+        y_true,
+        y_pred,
+        sample_weight=constant_op.constant(sample_weight, shape=(2, 3)))
+    expected_loss = np.sum(expected_losses * sample_weight) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+  def test_zero_weighted(self):
+    self.setup()
+    logcosh_obj = keras.losses.LogCosh()
+    sample_weight = 0
+    loss = logcosh_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 0., 3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class PoissonTest(test.TestCase):
+
+  def setup(self):
+    self.np_y_pred = np.asarray([1, 9, 2, 5, 2, 6]).reshape((2, 3))
+    self.np_y_true = np.asarray([4, 8, 12, 8, 1, 3]).reshape((2, 3))
+
+    self.batch_size = 6
+    self.expected_losses = self.np_y_pred - np.multiply(self.np_y_true,
+                                                        np.log(self.np_y_pred))
+
+    self.y_pred = constant_op.constant(self.np_y_pred, dtype=dtypes.float32)
+    self.y_true = constant_op.constant(self.np_y_true)
+
+  def test_config(self):
+    poisson_obj = keras.losses.Poisson(
+        reduction=losses_utils.ReductionV2.SUM, name='poisson')
+    self.assertEqual(poisson_obj.name, 'poisson')
+    self.assertEqual(poisson_obj.reduction, losses_utils.ReductionV2.SUM)
+
+  def test_unweighted(self):
+    self.setup()
+    poisson_obj = keras.losses.Poisson()
+
+    loss = poisson_obj(self.y_true, self.y_pred)
+    expected_loss = np.sum(self.expected_losses) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+  def test_scalar_weighted(self):
+    self.setup()
+    poisson_obj = keras.losses.Poisson()
+    sample_weight = 2.3
+    loss = poisson_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+
+    expected_loss = sample_weight * np.sum(
+        self.expected_losses) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+    # Verify we get the same output when the same input is given
+    loss_2 = poisson_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), self.evaluate(loss_2), 3)
+
+  def test_sample_weighted(self):
+    self.setup()
+    poisson_obj = keras.losses.Poisson()
+
+    sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
+    loss = poisson_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+
+    expected_loss = np.multiply(
+        self.expected_losses,
+        np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape((2, 3)))
+    expected_loss = np.sum(expected_loss) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+  def test_timestep_weighted(self):
+    self.setup()
+    poisson_obj = keras.losses.Poisson()
+    y_true = self.np_y_true.reshape(2, 3, 1)
+    y_pred = self.np_y_pred.reshape(2, 3, 1)
+    sample_weight = np.asarray([3, 6, 5, 0, 4, 2]).reshape(2, 3, 1)
+    expected_losses = y_pred - np.multiply(y_true, np.log(y_pred))
+
+    y_pred = constant_op.constant(y_pred, dtype=dtypes.float32)
+    y_true = constant_op.constant(y_true)
+
+    loss = poisson_obj(
+        y_true,
+        y_pred,
+        sample_weight=constant_op.constant(sample_weight, shape=(2, 3)))
+    expected_loss = np.sum(expected_losses * sample_weight) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+  def test_zero_weighted(self):
+    self.setup()
+    poisson_obj = keras.losses.Poisson()
+    loss = poisson_obj(self.y_true, self.y_pred, sample_weight=0)
+    self.assertAlmostEqual(self.evaluate(loss), 0., 3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class KLDivergenceTest(test.TestCase):
+
+  def setup(self):
+    self.np_y_pred = np.asarray([.4, .9, .12, .36, .3, .4]).reshape((2, 3))
+    self.np_y_true = np.asarray([.5, .8, .12, .7, .43, .8]).reshape((2, 3))
+
+    self.batch_size = 2
+    self.expected_losses = np.multiply(self.np_y_true,
+                                       np.log(self.np_y_true / self.np_y_pred))
+
+    self.y_pred = constant_op.constant(self.np_y_pred, dtype=dtypes.float32)
+    self.y_true = constant_op.constant(self.np_y_true)
+
+  def test_config(self):
+    k_obj = keras.losses.KLDivergence(
+        reduction=losses_utils.ReductionV2.SUM, name='kld')
+    self.assertEqual(k_obj.name, 'kld')
+    self.assertEqual(k_obj.reduction, losses_utils.ReductionV2.SUM)
+
+  def test_unweighted(self):
+    self.setup()
+    k_obj = keras.losses.KLDivergence()
+
+    loss = k_obj(self.y_true, self.y_pred)
+    expected_loss = np.sum(self.expected_losses) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+  def test_scalar_weighted(self):
+    self.setup()
+    k_obj = keras.losses.KLDivergence()
+    sample_weight = 2.3
+
+    loss = k_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    expected_loss = sample_weight * np.sum(
+        self.expected_losses) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+    # Verify we get the same output when the same input is given
+    loss_2 = k_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), self.evaluate(loss_2), 3)
+
+  def test_sample_weighted(self):
+    self.setup()
+    k_obj = keras.losses.KLDivergence()
+    sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
+    loss = k_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+
+    expected_loss = np.multiply(
+        self.expected_losses,
+        np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape(2, 3))
+    expected_loss = np.sum(expected_loss) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+  def test_timestep_weighted(self):
+    self.setup()
+    k_obj = keras.losses.KLDivergence()
+    y_true = self.np_y_true.reshape(2, 3, 1)
+    y_pred = self.np_y_pred.reshape(2, 3, 1)
+    sample_weight = np.asarray([3, 6, 5, 0, 4, 2]).reshape(2, 3)
+    expected_losses = np.sum(
+        np.multiply(y_true, np.log(y_true / y_pred)), axis=-1)
+
+    y_pred = constant_op.constant(y_pred, dtype=dtypes.float32)
+    y_true = constant_op.constant(y_true)
+    loss = k_obj(
+        y_true, y_pred, sample_weight=constant_op.constant(sample_weight))
+
+    num_timesteps = 3
+    expected_loss = np.sum(expected_losses * sample_weight) / (
+        self.batch_size * num_timesteps)
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+  def test_zero_weighted(self):
+    self.setup()
+    k_obj = keras.losses.KLDivergence()
+    loss = k_obj(self.y_true, self.y_pred, sample_weight=0)
+    self.assertAlmostEqual(self.evaluate(loss), 0., 3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class HuberLossTest(test.TestCase):
+
+  def huber_loss(self, y_true, y_pred, delta=1.0):
+    error = y_pred - y_true
+    abs_error = np.abs(error)
+
+    quadratic = np.minimum(abs_error, delta)
+    linear = np.subtract(abs_error, quadratic)
+    return np.add(
+        np.multiply(0.5, np.multiply(quadratic, quadratic)),
+        np.multiply(delta, linear))
+
+  def setup(self, delta=1.0):
+    self.np_y_pred = np.asarray([.9, .2, .2, .8, .4, .6]).reshape((2, 3))
+    self.np_y_true = np.asarray([1., 0., 1., 1., 0., 0.]).reshape((2, 3))
+
+    self.batch_size = 6
+    self.expected_losses = self.huber_loss(self.np_y_true, self.np_y_pred,
+                                           delta)
+
+    self.y_pred = constant_op.constant(self.np_y_pred)
+    self.y_true = constant_op.constant(self.np_y_true)
+
+  def test_config(self):
+    h_obj = keras.losses.Huber(
+        reduction=losses_utils.ReductionV2.SUM, name='huber')
+    self.assertEqual(h_obj.name, 'huber')
+    self.assertEqual(h_obj.reduction, losses_utils.ReductionV2.SUM)
+
+  def test_all_correct(self):
+    self.setup()
+    h_obj = keras.losses.Huber()
+    loss = h_obj(self.y_true, self.y_true)
+    self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
+
+  def test_unweighted(self):
+    self.setup()
+    h_obj = keras.losses.Huber()
+    loss = h_obj(self.y_true, self.y_pred)
+    actual_loss = np.sum(self.expected_losses) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3)
+
+  def test_scalar_weighted(self):
+    self.setup()
+    h_obj = keras.losses.Huber()
+    sample_weight = 2.3
+    loss = h_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    actual_loss = sample_weight * np.sum(self.expected_losses) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3)
+
+    # Verify we get the same output when the same input is given
+    loss_2 = h_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), self.evaluate(loss_2), 3)
+
+  def test_sample_weighted(self):
+    self.setup()
+    h_obj = keras.losses.Huber()
+    sample_weight = constant_op.constant((1.2, 3.4), shape=(2, 1))
+
+    loss = h_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    actual_loss = np.multiply(
+        self.expected_losses,
+        np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape((2, 3)))
+    actual_loss = np.sum(actual_loss) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3)
+
+  def test_timestep_weighted(self):
+    self.setup()
+    h_obj = keras.losses.Huber()
+    y_pred = self.np_y_pred.reshape((2, 3, 1))
+    y_true = self.np_y_true.reshape((2, 3, 1))
+    expected_losses = self.huber_loss(y_true, y_pred)
+
+    y_pred = constant_op.constant(y_pred)
+    y_true = constant_op.constant(y_true)
+    sample_weight = np.array([3, 6, 5, 0, 4, 2]).reshape((2, 3, 1))
+    loss = h_obj(
+        y_true,
+        y_pred,
+        sample_weight=constant_op.constant(sample_weight, shape=(2, 3)))
+    actual_loss = np.multiply(expected_losses, sample_weight)
+    actual_loss = np.sum(actual_loss) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3)
+
+  def test_zero_weighted(self):
+    self.setup()
+    h_obj = keras.losses.Huber()
+    sample_weight = 0
+    loss = h_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 0., 3)
+
+  def test_non_default_delta(self):
+    self.setup(delta=0.8)
+    h_obj = keras.losses.Huber(delta=0.8)
+    sample_weight = 2.3
+    loss = h_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    actual_loss = sample_weight * np.sum(self.expected_losses) / self.batch_size
+    self.assertAlmostEqual(self.evaluate(loss), actual_loss, 3)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/metrics.py b/tensorflow/python/keras/metrics.py
index 331a8636d1c93ce9c8ee03a8d6c0f486617bf6dd..b0b4a8dfb9d9811ffdc63e3a29c5b29030809d35 100644
--- a/tensorflow/python/keras/metrics.py
+++ b/tensorflow/python/keras/metrics.py
@@ -20,24 +20,21 @@ from __future__ import division
 from __future__ import print_function
 
 import abc
-import functools
 import sys
 import types
-import weakref
-from enum import Enum
 import numpy as np
 import six
 
-from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.eager import context
-from tensorflow.python.eager import function
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras.engine.base_layer import Layer
 from tensorflow.python.keras.losses import binary_crossentropy
 from tensorflow.python.keras.losses import categorical_crossentropy
-from tensorflow.python.keras.losses import cosine_proximity
+from tensorflow.python.keras.losses import categorical_hinge
+from tensorflow.python.keras.losses import cosine_similarity
 from tensorflow.python.keras.losses import hinge
 from tensorflow.python.keras.losses import kullback_leibler_divergence
 from tensorflow.python.keras.losses import logcosh
@@ -48,279 +45,24 @@ from tensorflow.python.keras.losses import mean_squared_logarithmic_error
 from tensorflow.python.keras.losses import poisson
 from tensorflow.python.keras.losses import sparse_categorical_crossentropy
 from tensorflow.python.keras.losses import squared_hinge
+from tensorflow.python.keras.utils import metrics_utils
 from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
 from tensorflow.python.keras.utils.generic_utils import to_list
 from tensorflow.python.keras.utils.losses_utils import squeeze_or_expand_dimensions
+from tensorflow.python.keras.utils.tf_utils import is_tensor_or_variable
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import confusion_matrix
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
-from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.ops import weights_broadcast_ops
-from tensorflow.python.util import tf_decorator
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 from tensorflow.tools.docs import doc_controls
 
 
-def clone_metric(metric):
-  """Returns a clone of the metric if stateful, otherwise returns it as is."""
-  if isinstance(metric, Metric):
-    return metric.__class__.from_config(metric.get_config())
-  return metric
-
-
-def clone_metrics(metrics):
-  """Clones the given metric list/dict."""
-  if metrics is None:
-    return None
-  if isinstance(metrics, dict):
-    return {key: clone_metric(value) for key, value in metrics.items()}
-  return [clone_metric(metric) for metric in metrics]
-
-
-def update_state_wrapper(update_state_fn):
-  """Decorator to wrap metric `update_state()` with `add_update()`.
-
-  Args:
-    update_state_fn: function that accumulates metric statistics.
-
-  Returns:
-    Decorated function that wraps `update_state_fn()` with `add_update()`.
-  """
-
-  def decorated(metric_obj, *args, **kwargs):
-    """Decorated function with `add_update()`."""
-
-    update_op = update_state_fn(*args, **kwargs)
-    if update_op is not None:  # update_op will be None in eager execution.
-      metric_obj.add_update(update_op, inputs=True)
-    return update_op
-
-  return tf_decorator.make_decorator(update_state_fn, decorated)
-
-
-def result_wrapper(result_fn):
-  """Decorator to wrap metric `result()` function in `merge_call()`.
-
-  Result computation is an idempotent operation that simply calculates the
-  metric value using the state variables.
-
-  If metric state variables are distributed across replicas/devices and
-  `result()` is requested from the context of one device - This function wraps
-  `result()` in a distribution strategy `merge_call()`. With this,
-  the metric state variables will be aggregated across devices.
-
-  Args:
-    result_fn: function that computes the metric result.
-
-  Returns:
-    Decorated function that wraps `result_fn()` in distribution strategy
-    `merge_call()`.
-  """
-
-  def decorated(_, *args):
-    """Decorated function with merge_call."""
-    replica_context = distribution_strategy_context.get_replica_context()
-    if replica_context is None:  # if in cross replica context already
-      result_t = result_fn(*args)
-    else:
-      # TODO(psv): Test distribution of metrics using different distribution
-      # strategies.
-
-      # Creating a wrapper for merge_fn. merge_call invokes the given merge_fn
-      # with distribution object as the first parameter. We create a wrapper
-      # here so that the result function need not have that parameter.
-      def merge_fn_wrapper(distribution, merge_fn, *args):
-        # We will get `PerDevice` merge function. Taking the first one as all
-        # are identical copies of the function that we had passed below.
-        return distribution.unwrap(merge_fn)[0](*args)
-
-      # Wrapping result in merge_call. merge_call is used when we want to leave
-      # replica mode and compute a value in cross replica mode.
-      result_t = replica_context.merge_call(
-          merge_fn_wrapper, args=(result_fn,) + args)
-    return result_t
-
-  return tf_decorator.make_decorator(result_fn, decorated)
-
-
-def weakmethod(method):
-  """Creates a weak reference to the bound method."""
-
-  cls = method.im_class
-  func = method.im_func
-  instance_ref = weakref.ref(method.im_self)
-
-  @functools.wraps(method)
-  def inner(*args, **kwargs):
-    return func.__get__(instance_ref(), cls)(*args, **kwargs)
-
-  del method
-  return inner
-
-
-class _ConfusionMatrix(Enum):
-  TRUE_POSITIVES = 'tp'
-  FALSE_POSITIVES = 'fp'
-  TRUE_NEGATIVES = 'tn'
-  FALSE_NEGATIVES = 'fn'
-
-
-def _assert_thresholds_range(thresholds):
-  invalid_thresholds = [t for t in thresholds if t < 0 or t > 1]
-  if any(invalid_thresholds):
-    raise ValueError('Threshold values must be in [0, 1]. Invalid values: {}'
-                     .format(invalid_thresholds))
-
-
-def _update_confusion_matrix_variables(variables_to_update,
-                                       y_true,
-                                       y_pred,
-                                       thresholds,
-                                       sample_weight=None):
-  """Returns op to update the given confusion matrix variables.
-
-  For every pair of values in y_true and y_pred:
-
-  true_positive: y_true == True and y_pred > thresholds
-  false_negatives: y_true == True and y_pred <= thresholds
-  true_negatives: y_true == False and y_pred <= thresholds
-  false_positive: y_true == False and y_pred > thresholds
-
-  The results will be weighted and added together. When multiple thresholds are
-  provided, we will repeat the same for every threshold.
-
-  For estimation of these metrics over a stream of data, the function creates an
-  `update_op` operation that updates the given variables.
-
-  If `sample_weight` is `None`, weights default to 1.
-  Use weights of 0 to mask values.
-
-  Args:
-    variables_to_update: Dictionary with 'tp', 'fn', 'tn', 'fp' as valid keys
-      and corresponding variables to update as values.
-    y_true: A `Tensor` whose shape matches `y_pred`. Will be cast to `bool`.
-    y_pred: A floating point `Tensor` of arbitrary shape and whose values are in
-      the range `[0, 1]`.
-    thresholds: A float value or a python list or tuple of float thresholds in
-      `[0, 1]`.
-    sample_weight: Optional `Tensor` whose rank is either 0, or the same rank as
-      `y_true`, and must be broadcastable to `y_true` (i.e., all dimensions must
-      be either `1`, or the same as the corresponding `y_true` dimension).
-
-  Returns:
-    Update op.
-
-  Raises:
-    ValueError: If `y_pred` and `y_true` have mismatched shapes, or if
-      `sample_weight` is not `None` and its shape doesn't match `y_pred`, or if
-      `variables_to_update` contains invalid keys.
-  """
-  if variables_to_update is None:
-    return
-  y_true = ops.convert_to_tensor(y_true)
-  y_pred = ops.convert_to_tensor(y_pred)
-  y_pred.shape.assert_is_compatible_with(y_true.shape)
-
-  if not any(
-      key for key in variables_to_update if key in list(_ConfusionMatrix)):
-    raise ValueError(
-        'Please provide at least one valid confusion matrix '
-        'variable to update. Valid variable key options are: "{}". '
-        'Received: "{}"'.format(
-            list(_ConfusionMatrix), variables_to_update.keys()))
-
-  invalid_keys = [
-      key for key in variables_to_update if key not in list(_ConfusionMatrix)
-  ]
-  if invalid_keys:
-    raise ValueError(
-        'Invalid keys: {}. Valid variable key options are: "{}"'.format(
-            invalid_keys, list(_ConfusionMatrix)))
-
-  with ops.control_dependencies([
-      check_ops.assert_greater_equal(
-          y_pred,
-          math_ops.cast(0.0, dtype=y_pred.dtype),
-          message='predictions must be >= 0'),
-      check_ops.assert_less_equal(
-          y_pred,
-          math_ops.cast(1.0, dtype=y_pred.dtype),
-          message='predictions must be <= 1')
-  ]):
-    y_pred, y_true, sample_weight = squeeze_or_expand_dimensions(
-        math_ops.cast(y_pred, dtype=dtypes.float32),
-        math_ops.cast(y_true, dtype=dtypes.bool), sample_weight)
-
-  thresholds = to_list(thresholds)
-  num_thresholds = len(thresholds)
-  num_predictions = array_ops.size(y_pred)
-
-  # Reshape predictions and labels.
-  predictions_2d = array_ops.reshape(y_pred, [1, -1])
-  labels_2d = array_ops.reshape(
-      math_ops.cast(y_true, dtype=dtypes.bool), [1, -1])
-
-  # Tile the thresholds for every prediction.
-  thresh_tiled = array_ops.tile(
-      array_ops.expand_dims(array_ops.constant(thresholds), 1),
-      array_ops.stack([1, num_predictions]))
-
-  # Tile the predictions for every threshold.
-  preds_tiled = array_ops.tile(predictions_2d, [num_thresholds, 1])
-
-  # Compare predictions and threshold.
-  pred_is_pos = math_ops.greater(preds_tiled, thresh_tiled)
-
-  # Tile labels by number of thresholds
-  label_is_pos = array_ops.tile(labels_2d, [num_thresholds, 1])
-
-  if sample_weight is not None:
-    weights = weights_broadcast_ops.broadcast_weights(
-        math_ops.cast(sample_weight, dtype=dtypes.float32), y_pred)
-    weights_tiled = array_ops.tile(
-        array_ops.reshape(weights, [1, -1]), [num_thresholds, 1])
-  else:
-    weights_tiled = None
-
-  update_ops = []
-
-  def weighted_assign_add(label, pred, weights, var):
-    label_and_pred = math_ops.cast(
-        math_ops.logical_and(label, pred), dtype=dtypes.float32)
-    if weights is not None:
-      label_and_pred *= weights
-    return state_ops.assign_add(var, math_ops.reduce_sum(label_and_pred, 1))
-
-  loop_vars = {
-      _ConfusionMatrix.TRUE_POSITIVES: (label_is_pos, pred_is_pos),
-  }
-  update_tn = _ConfusionMatrix.TRUE_NEGATIVES in variables_to_update
-  update_fp = _ConfusionMatrix.FALSE_POSITIVES in variables_to_update
-  update_fn = _ConfusionMatrix.FALSE_NEGATIVES in variables_to_update
-
-  if update_fn or update_tn:
-    pred_is_neg = math_ops.logical_not(pred_is_pos)
-    loop_vars[_ConfusionMatrix.FALSE_NEGATIVES] = (label_is_pos, pred_is_neg)
-
-  if update_fp or update_tn:
-    label_is_neg = math_ops.logical_not(label_is_pos)
-    loop_vars[_ConfusionMatrix.FALSE_POSITIVES] = (label_is_neg, pred_is_pos)
-    if update_tn:
-      loop_vars[_ConfusionMatrix.TRUE_NEGATIVES] = (label_is_neg, pred_is_neg)
-
-  for matrix_cond, (label, pred) in loop_vars.items():
-    if matrix_cond in variables_to_update:
-      update_ops.append(
-          weighted_assign_add(label, pred, weights_tiled,
-                              variables_to_update[matrix_cond]))
-  return control_flow_ops.group(update_ops)
-
-
+@keras_export('keras.metrics.Metric')
 @six.add_metaclass(abc.ABCMeta)
 class Metric(Layer):
   """Encapsulates metric logic and state.
@@ -367,33 +109,30 @@ class Metric(Layer):
   Example subclass implementation:
 
   ```
-  class BinaryTruePositives(Metric):
-    def __init__(self, name='binary_true_positives', dtype=None):
-      super(BinaryTruePositives, self).__init__(name=name, dtype=dtype)
-      self.true_positives = self.add_weight(
-          'true_positives', initializer=init_ops.zeros_initializer)
+  class BinaryTruePositives(tf.keras.metrics.Metric):
+
+    def __init__(self, name='binary_true_positives'):
+      super(BinaryTruePositives, self).__init__(name=name)
+      self.true_positives = self.add_weight(name='tp', initializer='zeros')
 
     def update_state(self, y_true, y_pred, sample_weight=None):
-      y_true = math_ops.cast(y_true, dtypes.bool)
-      y_pred = math_ops.cast(y_pred, dtypes.bool)
-      y_pred, y_true, sample_weight = squeeze_or_expand_dimensions(
-          y_pred, y_true, sample_weight)
-
-      values = math_ops.logical_and(
-          math_ops.equal(y_true, True), math_ops.equal(y_pred, True))
-      values = math_ops.cast(values, self._dtype)
+      y_true = tf.cast(y_true, tf.bool)
+      y_pred = tf.cast(y_pred, tf.bool)
+
+      values = tf.logical_and(tf.equal(y_true, True), tf.equal(y_pred, True))
+      values = tf.cast(values, self.dtype)
       if sample_weight is not None:
-        sample_weight = math_ops.cast(sample_weight, self._dtype)
-        values = math_ops.multiply(values, sample_weight)
-      state_ops.assign_add(self.true_positives, math_ops.reduce_sum(values))
+        sample_weight = tf.cast(sample_weight, self.dtype)
+        values = tf.multiply(values, sample_weight)
+      return self.true_positives.assign_add(tf.reduce_sum(values))
 
     def result(self):
-      return array_ops.identity(self.true_positives)
+      return tf.identity(self.true_positives)
   ```
   """
 
-  def __init__(self, name=None, dtype=None):
-    super(Metric, self).__init__(name=name, dtype=dtype)
+  def __init__(self, name=None, dtype=None, **kwargs):
+    super(Metric, self).__init__(name=name, dtype=dtype, **kwargs)
     self.stateful = True  # All metric layers are stateful.
     self.built = True
     self._dtype = K.floatx() if dtype is None else dtypes.as_dtype(dtype).name
@@ -406,15 +145,18 @@ class Metric(Layer):
       # weak reference. This is to remove reference cycle that is created here.
       # This is not an issue in python versions > 3.
       if context.executing_eagerly():
-        obj.update_state = weakmethod(obj.update_state)
-      obj.update_state = weakmethod(
-          types.MethodType(update_state_wrapper(obj.update_state), obj))
-      result = weakmethod(obj.result)
-      obj.result = weakmethod(types.MethodType(result_wrapper(result), obj))
+        obj.update_state = metrics_utils.weakmethod(obj.update_state)
+      obj.update_state = metrics_utils.weakmethod(
+          types.MethodType(
+              metrics_utils.update_state_wrapper(obj.update_state), obj))
+      result = metrics_utils.weakmethod(obj.result)
+      obj.result = metrics_utils.weakmethod(
+          types.MethodType(metrics_utils.result_wrapper(result), obj))
     else:
       obj.update_state = types.MethodType(
-          update_state_wrapper(obj.update_state), obj)
-      obj.result = types.MethodType(result_wrapper(obj.result), obj)
+          metrics_utils.update_state_wrapper(obj.update_state), obj)
+      obj.result = types.MethodType(
+          metrics_utils.result_wrapper(obj.result), obj)
 
     return obj
 
@@ -444,14 +186,21 @@ class Metric(Layer):
         result_t._metric_obj = self  # pylint: disable=protected-access
       return result_t
 
+  @property
+  def dtype(self):
+    return self._dtype
+
+  def get_config(self):
+    """Returns the serializable config of the metric."""
+    return {'name': self.name, 'dtype': self.dtype}
+
   def reset_states(self):
     """Resets all of the metric state variables.
 
     This function is called between epochs/steps,
     when a metric is evaluated during training.
     """
-    for v in self.variables:
-      K.set_value(v, 0)
+    K.batch_set_value([(v, 0) for v in self.variables])
 
   @abc.abstractmethod
   def update_state(self, *args, **kwargs):
@@ -466,7 +215,6 @@ class Metric(Layer):
          All update ops added to the graph by this function will be executed.
       As a result, code should generally work the same way with graph or
       eager execution.
-    and adds the update op to the metric layer.
 
     Args:
       *args:
@@ -483,12 +231,6 @@ class Metric(Layer):
     """
     NotImplementedError('Must be implemented in subclasses.')
 
-  @classmethod
-  def from_config(cls, config):
-    if 'trainable' in config:
-      config.pop('trainable')
-    return cls(**config)
-
   ### For use by subclasses ###
   @doc_controls.for_subclass_implementers
   def add_weight(self,
@@ -496,12 +238,13 @@ class Metric(Layer):
                  shape=(),
                  aggregation=tf_variables.VariableAggregation.SUM,
                  synchronization=tf_variables.VariableSynchronization.ON_READ,
-                 initializer=None):
+                 initializer=None,
+                 dtype=None):
     """Adds state variable. Only for use by subclasses."""
     return super(Metric, self).add_weight(
         name=name,
         shape=shape,
-        dtype=self._dtype,
+        dtype=self._dtype if dtype is None else dtype,
         trainable=False,
         initializer=initializer,
         collections=[],
@@ -511,8 +254,141 @@ class Metric(Layer):
   ### End: For use by subclasses ###
 
 
-@tf_export('keras.metrics.Mean')
-class Mean(Metric):
+class Reduce(Metric):
+  """Encapsulates metrics that perform a reduce operation on the values."""
+
+  def __init__(self, reduction, name, dtype=None):
+    """Creates a `Reduce` instance.
+
+    Args:
+      reduction: a `tf.keras.metrics.Reduction` enum value.
+      name: string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(Reduce, self).__init__(name=name, dtype=dtype)
+    self.reduction = reduction
+    self.total = self.add_weight(
+        'total', initializer=init_ops.zeros_initializer)
+    if reduction in [metrics_utils.Reduction.SUM_OVER_BATCH_SIZE,
+                     metrics_utils.Reduction.WEIGHTED_MEAN]:
+      self.count = self.add_weight(
+          'count', initializer=init_ops.zeros_initializer)
+
+  def update_state(self, values, sample_weight=None):
+    """Accumulates statistics for computing the reduction metric.
+
+    For example, if `values` is [1, 3, 5, 7] and reduction=SUM_OVER_BATCH_SIZE,
+    then the value of `result()` is 4. If the `sample_weight` is specified as
+    [1, 1, 0, 0] then value of `result()` would be 2.
+
+    Args:
+      values: Per-example value.
+      sample_weight: Optional weighting of each example. Defaults to 1.
+
+    Returns:
+      Update op.
+    """
+    values = math_ops.cast(values, self._dtype)
+    if sample_weight is not None:
+      sample_weight = math_ops.cast(sample_weight, self._dtype)
+      # Update dimensions of weights to match with values if possible.
+      values, _, sample_weight = squeeze_or_expand_dimensions(
+          values, None, sample_weight)
+      try:
+        # Broadcast weights if possible.
+        sample_weight = weights_broadcast_ops.broadcast_weights(
+            sample_weight, values)
+      except ValueError:
+        # Reduce values to same ndim as weight array
+        ndim = K.ndim(values)
+        weight_ndim = K.ndim(sample_weight)
+        if self.reduction == metrics_utils.Reduction.SUM:
+          values = math_ops.reduce_sum(
+              values, axis=list(range(weight_ndim, ndim)))
+        else:
+          values = math_ops.reduce_mean(
+              values, axis=list(range(weight_ndim, ndim)))
+      values = math_ops.multiply(values, sample_weight)
+
+    value_sum = math_ops.reduce_sum(values)
+    with ops.control_dependencies([value_sum]):
+      update_total_op = self.total.assign_add(value_sum)
+
+    # Exit early if the reduction doesn't have a denominator.
+    if self.reduction == metrics_utils.Reduction.SUM:
+      return update_total_op
+
+    # Update `count` for reductions that require a denominator.
+    if self.reduction == metrics_utils.Reduction.SUM_OVER_BATCH_SIZE:
+      num_values = math_ops.cast(array_ops.size(values), self._dtype)
+    elif self.reduction == metrics_utils.Reduction.WEIGHTED_MEAN:
+      if sample_weight is None:
+        num_values = math_ops.cast(array_ops.size(values), self._dtype)
+      else:
+        num_values = math_ops.reduce_sum(sample_weight)
+    else:
+      raise NotImplementedError(
+          'reduction [%s] not implemented' % self.reduction)
+
+    with ops.control_dependencies([update_total_op]):
+      return self.count.assign_add(num_values)
+
+  def result(self):
+    if self.reduction == metrics_utils.Reduction.SUM:
+      return array_ops.identity(self.total)
+    elif self.reduction in [
+        metrics_utils.Reduction.WEIGHTED_MEAN,
+        metrics_utils.Reduction.SUM_OVER_BATCH_SIZE
+    ]:
+      return math_ops.div_no_nan(self.total, self.count)
+    else:
+      raise NotImplementedError(
+          'reduction [%s] not implemented' % self.reduction)
+
+
+@keras_export('keras.metrics.Sum')
+class Sum(Reduce):
+  """Computes the (weighted) sum of the given values.
+
+  For example, if values is [1, 3, 5, 7] then the sum is 16.
+  If the weights were specified as [1, 1, 0, 0] then the sum would be 4.
+
+  This metric creates one variable, `total`, that is used to compute the sum of
+  `values`. This is ultimately returned as `sum`.
+
+  If `sample_weight` is `None`, weights default to 1.  Use `sample_weight` of 0
+  to mask values.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.Sum()
+  m.update_state([1, 3, 5, 7])
+  print('Final result: ', m.result().numpy())  # Final result: 16.0
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = tf.keras.Model(inputs, outputs)
+  model.add_metric(tf.keras.metrics.Sum(name='sum_1')(outputs))
+  model.compile('sgd', loss='mse')
+  ```
+  """
+
+  def __init__(self, name='sum', dtype=None):
+    """Creates a `Sum` instance.
+
+    Args:
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(Sum, self).__init__(reduction=metrics_utils.Reduction.SUM,
+                              name=name, dtype=dtype)
+
+
+@keras_export('keras.metrics.Mean')
+class Mean(Reduce):
   """Computes the (weighted) mean of the given values.
 
   For example, if values is [1, 3, 5, 7] then the mean is 4.
@@ -536,7 +412,7 @@ class Mean(Metric):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.add_metric(tf.keras.metrics.Mean(name='mean_1')(outputs))
   model.compile('sgd', loss='mse')
   ```
@@ -549,58 +425,89 @@ class Mean(Metric):
       name: (Optional) string name of the metric instance.
       dtype: (Optional) data type of the metric result.
     """
-    super(Mean, self).__init__(name=name, dtype=dtype)
-    # Create new state variables
-    self.total = self.add_weight(
-        'total', initializer=init_ops.zeros_initializer)
-    self.count = self.add_weight(
-        'count', initializer=init_ops.zeros_initializer)
+    super(Mean, self).__init__(
+        reduction=metrics_utils.Reduction.WEIGHTED_MEAN, name=name, dtype=dtype)
 
-  def update_state(self, values, sample_weight=None):
-    """Accumulates statistics for computing the mean.
 
-    For example, if `values` is [1, 3, 5, 7] then the mean is 4. If
-    the `sample_weight` is specified as [1, 1, 0, 0] then the mean would be 2.
+@keras_export('keras.metrics.MeanRelativeError')
+class MeanRelativeError(Mean):
+  """Computes the mean relative error by normalizing with the given values.
+
+  This metric creates two local variables, `total` and `count` that are used to
+  compute the mean relative absolute error. This average is weighted by
+  `sample_weight`, and it is ultimately returned as `mean_relative_error`:
+  an idempotent operation that simply divides `total` by `count`.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use `sample_weight` of 0 to mask values.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.MeanRelativeError(normalizer=[1, 3, 2, 3])
+  m.update_state([1, 3, 2, 3], [2, 4, 6, 8])
+
+  # metric = mean(|y_pred - y_true| / normalizer)
+  #        = mean([1, 1, 4, 5] / [1, 3, 2, 3]) = mean([1, 1/3, 2, 5/3])
+  #        = 5/4 = 1.25
+  print('Final result: ', m.result().numpy())  # Final result: 1.25
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = tf.keras.Model(inputs, outputs)
+  model.compile(
+    'sgd',
+    loss='mse',
+    metrics=[tf.keras.metrics.MeanRelativeError(normalizer=[1, 3])])
+  ```
+  """
+
+  def __init__(self, normalizer, name=None, dtype=None):
+    """Creates a `MeanRelativeError` instance.
 
     Args:
-      values: Per-example value.
-      sample_weight: Optional weighting of each example. Defaults to 1.
+      normalizer: The normalizer values with same shape as predictions.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(MeanRelativeError, self).__init__(name=name, dtype=dtype)
+    normalizer = math_ops.cast(normalizer, self._dtype)
+    self.normalizer = normalizer
+
+  def update_state(self, y_true, y_pred, sample_weight=None):
+    """Accumulates metric statistics.
+
+    Args:
+      y_true: The ground truth values.
+      y_pred: The predicted values.
+      sample_weight: Optional weighting of each example. Defaults to 1. Can be a
+        `Tensor` whose rank is either 0, or the same rank as `y_true`, and must
+        be broadcastable to `y_true`.
 
     Returns:
       Update op.
     """
-    values = math_ops.cast(values, self._dtype)
-    if sample_weight is None:
-      num_values = math_ops.cast(array_ops.size(values), self._dtype)
-    else:
-      sample_weight = math_ops.cast(sample_weight, self._dtype)
-
-      # Update dimensions of weights to match with values if possible.
-      values, _, sample_weight = squeeze_or_expand_dimensions(
-          values, None, sample_weight)
-      try:
-        # Broadcast weights if possible.
-        sample_weight = weights_broadcast_ops.broadcast_weights(
-            sample_weight, values)
-      except ValueError:
-        # Reduce values to same ndim as weight array
-        ndim = K.ndim(values)
-        weight_ndim = K.ndim(sample_weight)
-        values = math_ops.reduce_mean(
-            values, axis=list(range(weight_ndim, ndim)))
+    y_true = math_ops.cast(y_true, self._dtype)
+    y_pred = math_ops.cast(y_pred, self._dtype)
+    y_pred, y_true, sample_weight = squeeze_or_expand_dimensions(
+        y_pred, y_true, sample_weight)
 
-      num_values = math_ops.reduce_sum(sample_weight)
-      values = math_ops.multiply(values, sample_weight)
-    values = math_ops.reduce_sum(values)
+    y_pred, self.normalizer = confusion_matrix.remove_squeezable_dimensions(
+        y_pred, self.normalizer)
+    y_pred.shape.assert_is_compatible_with(y_pred.shape)
+    relative_errors = math_ops.div_no_nan(
+        math_ops.abs(y_true - y_pred), self.normalizer)
 
-    # Update state variables. Count should be updated only when total is
-    # updated.
-    update_total_op = state_ops.assign_add(self.total, values)
-    with ops.control_dependencies([update_total_op]):
-      return state_ops.assign_add(self.count, num_values)
+    return super(MeanRelativeError, self).update_state(
+        relative_errors, sample_weight=sample_weight)
 
-  def result(self):
-    return math_ops.div_no_nan(self.total, self.count)
+  def get_config(self):
+    n = self.normalizer
+    config = {'normalizer': K.eval(n) if is_tensor_or_variable(n) else n}
+    base_config = super(MeanRelativeError, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
 
 
 class MeanMetricWrapper(Mean):
@@ -645,13 +552,14 @@ class MeanMetricWrapper(Mean):
         matches, sample_weight=sample_weight)
 
   def get_config(self):
-    config = {'fn': self._fn}
-    config.update(self._fn_kwargs)
+    config = {}
+    for k, v in six.iteritems(self._fn_kwargs):
+      config[k] = K.eval(v) if is_tensor_or_variable(v) else v
     base_config = super(MeanMetricWrapper, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.metrics.Accuracy')
+@keras_export('keras.metrics.Accuracy')
 class Accuracy(MeanMetricWrapper):
   """Calculates how often predictions matches labels.
 
@@ -678,7 +586,7 @@ class Accuracy(MeanMetricWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.Accuracy()])
   ```
   """
@@ -686,14 +594,8 @@ class Accuracy(MeanMetricWrapper):
   def __init__(self, name='accuracy', dtype=None):
     super(Accuracy, self).__init__(accuracy, name, dtype=dtype)
 
-  @classmethod
-  def from_config(cls, config):
-    if 'fn' in config:
-      config.pop('fn')
-    return super(Accuracy, cls).from_config(config)
-
 
-@tf_export('keras.metrics.BinaryAccuracy')
+@keras_export('keras.metrics.BinaryAccuracy')
 class BinaryAccuracy(MeanMetricWrapper):
   """Calculates how often predictions matches labels.
 
@@ -720,7 +622,7 @@ class BinaryAccuracy(MeanMetricWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.BinaryAccuracy()])
   ```
   """
@@ -737,21 +639,16 @@ class BinaryAccuracy(MeanMetricWrapper):
     super(BinaryAccuracy, self).__init__(
         binary_accuracy, name, dtype=dtype, threshold=threshold)
 
-  @classmethod
-  def from_config(cls, config):
-    if 'fn' in config:
-      config.pop('fn')
-    return super(BinaryAccuracy, cls).from_config(config)
 
-
-@tf_export('keras.metrics.CategoricalAccuracy')
+@keras_export('keras.metrics.CategoricalAccuracy')
 class CategoricalAccuracy(MeanMetricWrapper):
   """Calculates how often predictions matches labels.
 
   For example, if `y_true` is [[0, 0, 1], [0, 1, 0]] and `y_pred` is
   [[0.1, 0.9, 0.8], [0.05, 0.95, 0]] then the categorical accuracy is 1/2 or .5.
   If the weights were specified as [0.7, 0.3] then the categorical accuracy
-  would be .3.
+  would be .3. You can provide logits of classes as `y_pred`, since argmax of
+  logits and probabilities are same.
 
   This metric creates two local variables, `total` and `count` that are used to
   compute the frequency with which `y_pred` matches `y_true`. This frequency is
@@ -775,7 +672,7 @@ class CategoricalAccuracy(MeanMetricWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile(
     'sgd',
     loss='mse',
@@ -793,21 +690,16 @@ class CategoricalAccuracy(MeanMetricWrapper):
     super(CategoricalAccuracy, self).__init__(
         categorical_accuracy, name, dtype=dtype)
 
-  @classmethod
-  def from_config(cls, config):
-    if 'fn' in config:
-      config.pop('fn')
-    return super(CategoricalAccuracy, cls).from_config(config)
-
 
-@tf_export('keras.metrics.SparseCategoricalAccuracy')
+@keras_export('keras.metrics.SparseCategoricalAccuracy')
 class SparseCategoricalAccuracy(MeanMetricWrapper):
   """Calculates how often predictions matches integer labels.
 
   For example, if `y_true` is [[2], [1]] and `y_pred` is
   [[0.1, 0.9, 0.8], [0.05, 0.95, 0]] then the categorical accuracy is 1/2 or .5.
   If the weights were specified as [0.7, 0.3] then the categorical accuracy
-  would be .3.
+  would be .3. You can provide logits of classes as `y_pred`, since argmax of
+  logits and probabilities are same.
 
   This metric creates two local variables, `total` and `count` that are used to
   compute the frequency with which `y_pred` matches `y_true`. This frequency is
@@ -828,7 +720,7 @@ class SparseCategoricalAccuracy(MeanMetricWrapper):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile(
       'sgd',
       loss='mse',
@@ -840,11 +732,73 @@ class SparseCategoricalAccuracy(MeanMetricWrapper):
     super(SparseCategoricalAccuracy, self).__init__(
         sparse_categorical_accuracy, name, dtype=dtype)
 
-  @classmethod
-  def from_config(cls, config):
-    if 'fn' in config:
-      config.pop('fn')
-    return super(SparseCategoricalAccuracy, cls).from_config(config)
+
+@keras_export('keras.metrics.TopKCategoricalAccuracy')
+class TopKCategoricalAccuracy(MeanMetricWrapper):
+  """Computes how often targets are in the top `K` predictions.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.TopKCategoricalAccuracy()
+  m.update_state([[0, 0, 1], [0, 1, 0]], [[0.1, 0.9, 0.8], [0.05, 0.95, 0]])
+  print('Final result: ', m.result().numpy())  # Final result: 1.0
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', metrics=[tf.keras.metrics.TopKCategoricalAccuracy()])
+  ```
+  """
+
+  def __init__(self, k=5, name='top_k_categorical_accuracy', dtype=None):
+    """Creates a `TopKCategoricalAccuracy` instance.
+
+    Args:
+      k: (Optional) Number of top elements to look at for computing accuracy.
+        Defaults to 5.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(TopKCategoricalAccuracy, self).__init__(
+        top_k_categorical_accuracy, name, dtype=dtype, k=k)
+
+
+@keras_export('keras.metrics.SparseTopKCategoricalAccuracy')
+class SparseTopKCategoricalAccuracy(MeanMetricWrapper):
+  """Computes how often integer targets are in the top `K` predictions.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.SparseTopKCategoricalAccuracy()
+  m.update_state([2, 1], [[0.1, 0.9, 0.8], [0.05, 0.95, 0]])
+  print('Final result: ', m.result().numpy())  # Final result: 1.0
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = tf.keras.Model(inputs, outputs)
+  model.compile(
+    'sgd',
+    metrics=[tf.keras.metrics.SparseTopKCategoricalAccuracy()])
+  ```
+  """
+
+  def __init__(self, k=5, name='sparse_top_k_categorical_accuracy', dtype=None):
+    """Creates a `SparseTopKCategoricalAccuracy` instance.
+
+    Args:
+      k: (Optional) Number of top elements to look at for computing accuracy.
+        Defaults to 5.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(SparseTopKCategoricalAccuracy, self).__init__(
+        sparse_top_k_categorical_accuracy, name, dtype=dtype, k=k)
 
 
 class _ConfusionMatrixConditionCount(Metric):
@@ -858,7 +812,7 @@ class _ConfusionMatrixConditionCount(Metric):
     """Creates a `_ConfusionMatrixConditionCount` instance.
 
     Args:
-      confusion_matrix_cond: One of `_ConfusionMatrix` conditions.
+      confusion_matrix_cond: One of `metrics_utils.ConfusionMatrix` conditions.
       thresholds: (Optional) Defaults to 0.5. A float value or a python
         list/tuple of float threshold values in [0, 1]. A threshold is compared
         with prediction values to determine the truth value of predictions
@@ -869,12 +823,12 @@ class _ConfusionMatrixConditionCount(Metric):
     """
     super(_ConfusionMatrixConditionCount, self).__init__(name=name, dtype=dtype)
     self._confusion_matrix_cond = confusion_matrix_cond
-    self.thresholds = 0.5 if thresholds is None else thresholds
-    thresholds = to_list(thresholds)
-    _assert_thresholds_range(thresholds)
+    self.init_thresholds = thresholds
+    self.thresholds = metrics_utils.parse_init_thresholds(
+        thresholds, default_threshold=0.5)
     self.accumulator = self.add_weight(
         'accumulator',
-        shape=(len(thresholds),),
+        shape=(len(self.thresholds),),
         initializer=init_ops.zeros_initializer)
 
   def update_state(self, y_true, y_pred, sample_weight=None):
@@ -890,24 +844,32 @@ class _ConfusionMatrixConditionCount(Metric):
     Returns:
       Update op.
     """
-    return _update_confusion_matrix_variables({
-        self._confusion_matrix_cond: self.accumulator
-    }, y_true, y_pred, self.thresholds, sample_weight)
+    return metrics_utils.update_confusion_matrix_variables(
+        {self._confusion_matrix_cond: self.accumulator},
+        y_true,
+        y_pred,
+        thresholds=self.thresholds,
+        sample_weight=sample_weight)
 
   def result(self):
-    if isinstance(self.thresholds, (list, tuple)):
-      result = self.accumulator
-    else:
+    if len(self.thresholds) == 1:
       result = self.accumulator[0]
+    else:
+      result = self.accumulator
     return ops.convert_to_tensor(result)
 
   def reset_states(self):
     num_thresholds = len(to_list(self.thresholds))
-    for v in self.variables:
-      K.set_value(v, np.zeros((num_thresholds,)))
+    K.batch_set_value(
+        [(v, np.zeros((num_thresholds,))) for v in self.variables])
+
+  def get_config(self):
+    config = {'thresholds': self.init_thresholds}
+    base_config = super(_ConfusionMatrixConditionCount, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.metrics.FalsePositives')
+@keras_export('keras.metrics.FalsePositives')
 class FalsePositives(_ConfusionMatrixConditionCount):
   """Calculates the number of false positives.
 
@@ -933,7 +895,7 @@ class FalsePositives(_ConfusionMatrixConditionCount):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.FalsePositives()])
   ```
   """
@@ -951,13 +913,13 @@ class FalsePositives(_ConfusionMatrixConditionCount):
       dtype: (Optional) data type of the metric result.
     """
     super(FalsePositives, self).__init__(
-        confusion_matrix_cond=_ConfusionMatrix.FALSE_POSITIVES,
+        confusion_matrix_cond=metrics_utils.ConfusionMatrix.FALSE_POSITIVES,
         thresholds=thresholds,
         name=name,
         dtype=dtype)
 
 
-@tf_export('keras.metrics.FalseNegatives')
+@keras_export('keras.metrics.FalseNegatives')
 class FalseNegatives(_ConfusionMatrixConditionCount):
   """Calculates the number of false negatives.
 
@@ -983,7 +945,7 @@ class FalseNegatives(_ConfusionMatrixConditionCount):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.FalseNegatives()])
   ```
   """
@@ -1001,13 +963,13 @@ class FalseNegatives(_ConfusionMatrixConditionCount):
       dtype: (Optional) data type of the metric result.
     """
     super(FalseNegatives, self).__init__(
-        confusion_matrix_cond=_ConfusionMatrix.FALSE_NEGATIVES,
+        confusion_matrix_cond=metrics_utils.ConfusionMatrix.FALSE_NEGATIVES,
         thresholds=thresholds,
         name=name,
         dtype=dtype)
 
 
-@tf_export('keras.metrics.TrueNegatives')
+@keras_export('keras.metrics.TrueNegatives')
 class TrueNegatives(_ConfusionMatrixConditionCount):
   """Calculates the number of true negatives.
 
@@ -1033,7 +995,7 @@ class TrueNegatives(_ConfusionMatrixConditionCount):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.TrueNegatives()])
   ```
   """
@@ -1051,13 +1013,13 @@ class TrueNegatives(_ConfusionMatrixConditionCount):
       dtype: (Optional) data type of the metric result.
     """
     super(TrueNegatives, self).__init__(
-        confusion_matrix_cond=_ConfusionMatrix.TRUE_NEGATIVES,
+        confusion_matrix_cond=metrics_utils.ConfusionMatrix.TRUE_NEGATIVES,
         thresholds=thresholds,
         name=name,
         dtype=dtype)
 
 
-@tf_export('keras.metrics.TruePositives')
+@keras_export('keras.metrics.TruePositives')
 class TruePositives(_ConfusionMatrixConditionCount):
   """Calculates the number of true positives.
 
@@ -1083,7 +1045,7 @@ class TruePositives(_ConfusionMatrixConditionCount):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.TruePositives()])
   ```
   """
@@ -1101,13 +1063,13 @@ class TruePositives(_ConfusionMatrixConditionCount):
       dtype: (Optional) data type of the metric result.
     """
     super(TruePositives, self).__init__(
-        confusion_matrix_cond=_ConfusionMatrix.TRUE_POSITIVES,
+        confusion_matrix_cond=metrics_utils.ConfusionMatrix.TRUE_POSITIVES,
         thresholds=thresholds,
         name=name,
         dtype=dtype)
 
 
-@tf_export('keras.metrics.Precision')
+@keras_export('keras.metrics.Precision')
 class Precision(Metric):
   """Computes the precision of the predictions with respect to the labels.
 
@@ -1123,6 +1085,15 @@ class Precision(Metric):
   If `sample_weight` is `None`, weights default to 1.
   Use `sample_weight` of 0 to mask values.
 
+  If `top_k` is set, we'll calculate precision as how often on average a class
+  among the top-k classes with the highest predicted values of a batch entry is
+  correct and can be found in the label for that entry.
+
+  If `class_id` is specified, we calculate precision by considering only the
+  entries in the batch for which `class_id` is above the threshold and/or in the
+  top-k highest predictions, and computing the fraction of them for which
+  `class_id` is indeed a correct label.
+
   Usage:
 
   ```python
@@ -1134,42 +1105,58 @@ class Precision(Metric):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.Precision()])
   ```
   """
 
-  def __init__(self, thresholds=None, name=None, dtype=None):
+  def __init__(self,
+               thresholds=None,
+               top_k=None,
+               class_id=None,
+               name=None,
+               dtype=None):
     """Creates a `Precision` instance.
 
     Args:
-      thresholds: (Optional) Defaults to 0.5. A float value or a python
-        list/tuple of float threshold values in [0, 1]. A threshold is compared
-        with prediction values to determine the truth value of predictions
-        (i.e., above the threshold is `true`, below is `false`). One metric
-        value is generated for each threshold value.
+      thresholds: (Optional) A float value or a python list/tuple of float
+        threshold values in [0, 1]. A threshold is compared with prediction
+        values to determine the truth value of predictions (i.e., above the
+        threshold is `true`, below is `false`). One metric value is generated
+        for each threshold value. If neither thresholds nor top_k are set, the
+        default is to calculate precision with `thresholds=0.5`.
+      top_k: (Optional) Unset by default. An int value specifying the top-k
+        predictions to consider when calculating precision.
+      class_id: (Optional) Integer class ID for which we want binary metrics.
+        This must be in the half-open interval `[0, num_classes)`, where
+        `num_classes` is the last dimension of predictions.
       name: (Optional) string name of the metric instance.
       dtype: (Optional) data type of the metric result.
     """
     super(Precision, self).__init__(name=name, dtype=dtype)
-    self.thresholds = 0.5 if thresholds is None else thresholds
-    thresholds = to_list(thresholds)
-    _assert_thresholds_range(thresholds)
-    self.tp = self.add_weight(
+    self.init_thresholds = thresholds
+    self.top_k = top_k
+    self.class_id = class_id
+
+    default_threshold = 0.5 if top_k is None else metrics_utils.NEG_INF
+    self.thresholds = metrics_utils.parse_init_thresholds(
+        thresholds, default_threshold=default_threshold)
+    self.true_positives = self.add_weight(
         'true_positives',
-        shape=(len(thresholds),),
+        shape=(len(self.thresholds),),
         initializer=init_ops.zeros_initializer)
-    self.fp = self.add_weight(
+    self.false_positives = self.add_weight(
         'false_positives',
-        shape=(len(thresholds),),
+        shape=(len(self.thresholds),),
         initializer=init_ops.zeros_initializer)
 
   def update_state(self, y_true, y_pred, sample_weight=None):
     """Accumulates true positive and false positive statistics.
 
     Args:
-      y_true: The ground truth values.
-      y_pred: The predicted values.
+      y_true: The ground truth values, with the same dimensions as `y_pred`.
+        Will be cast to `bool`.
+      y_pred: The predicted values. Each element must be in the range `[0, 1]`.
       sample_weight: Optional weighting of each example. Defaults to 1. Can be a
         `Tensor` whose rank is either 0, or the same rank as `y_true`, and must
         be broadcastable to `y_true`.
@@ -1177,22 +1164,39 @@ class Precision(Metric):
     Returns:
       Update op.
     """
-    return _update_confusion_matrix_variables({
-        _ConfusionMatrix.TRUE_POSITIVES: self.tp,
-        _ConfusionMatrix.FALSE_POSITIVES: self.fp
-    }, y_true, y_pred, self.thresholds, sample_weight)
+    return metrics_utils.update_confusion_matrix_variables(
+        {
+            metrics_utils.ConfusionMatrix.TRUE_POSITIVES: self.true_positives,
+            metrics_utils.ConfusionMatrix.FALSE_POSITIVES: self.false_positives
+        },
+        y_true,
+        y_pred,
+        thresholds=self.thresholds,
+        top_k=self.top_k,
+        class_id=self.class_id,
+        sample_weight=sample_weight)
 
   def result(self):
-    result = math_ops.div_no_nan(self.tp, self.tp + self.fp)
-    return result if isinstance(self.thresholds, (list, tuple)) else result[0]
+    result = math_ops.div_no_nan(self.true_positives,
+                                 self.true_positives + self.false_positives)
+    return result[0] if len(self.thresholds) == 1 else result
 
   def reset_states(self):
     num_thresholds = len(to_list(self.thresholds))
-    for v in self.variables:
-      K.set_value(v, np.zeros((num_thresholds,)))
+    K.batch_set_value(
+        [(v, np.zeros((num_thresholds,))) for v in self.variables])
+
+  def get_config(self):
+    config = {
+        'thresholds': self.init_thresholds,
+        'top_k': self.top_k,
+        'class_id': self.class_id
+    }
+    base_config = super(Precision, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.metrics.Recall')
+@keras_export('keras.metrics.Recall')
 class Recall(Metric):
   """Computes the recall of the predictions with respect to the labels.
 
@@ -1208,6 +1212,14 @@ class Recall(Metric):
   If `sample_weight` is `None`, weights default to 1.
   Use `sample_weight` of 0 to mask values.
 
+  If `top_k` is set, recall will be computed as how often on average a class
+  among the labels of a batch entry is in the top-k predictions.
+
+  If `class_id` is specified, we calculate recall by considering only the
+  entries in the batch for which `class_id` is in the label, and computing the
+  fraction of them for which `class_id` is above the threshold and/or in the
+  top-k predictions.
+
   Usage:
 
   ```python
@@ -1219,42 +1231,58 @@ class Recall(Metric):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.Recall()])
   ```
   """
 
-  def __init__(self, thresholds=None, name=None, dtype=None):
+  def __init__(self,
+               thresholds=None,
+               top_k=None,
+               class_id=None,
+               name=None,
+               dtype=None):
     """Creates a `Recall` instance.
 
     Args:
-      thresholds: (Optional) Defaults to 0.5. A float value or a python
-        list/tuple of float threshold values in [0, 1]. A threshold is compared
-        with prediction values to determine the truth value of predictions
-        (i.e., above the threshold is `true`, below is `false`). One metric
-        value is generated for each threshold value.
+      thresholds: (Optional) A float value or a python list/tuple of float
+        threshold values in [0, 1]. A threshold is compared with prediction
+        values to determine the truth value of predictions (i.e., above the
+        threshold is `true`, below is `false`). One metric value is generated
+        for each threshold value. If neither thresholds nor top_k are set, the
+        default is to calculate recall with `thresholds=0.5`.
+      top_k: (Optional) Unset by default. An int value specifying the top-k
+        predictions to consider when calculating recall.
+      class_id: (Optional) Integer class ID for which we want binary metrics.
+        This must be in the half-open interval `[0, num_classes)`, where
+        `num_classes` is the last dimension of predictions.
       name: (Optional) string name of the metric instance.
       dtype: (Optional) data type of the metric result.
     """
     super(Recall, self).__init__(name=name, dtype=dtype)
-    self.thresholds = 0.5 if thresholds is None else thresholds
-    thresholds = to_list(thresholds)
-    _assert_thresholds_range(thresholds)
-    self.tp = self.add_weight(
+    self.init_thresholds = thresholds
+    self.top_k = top_k
+    self.class_id = class_id
+
+    default_threshold = 0.5 if top_k is None else metrics_utils.NEG_INF
+    self.thresholds = metrics_utils.parse_init_thresholds(
+        thresholds, default_threshold=default_threshold)
+    self.true_positives = self.add_weight(
         'true_positives',
-        shape=(len(thresholds),),
+        shape=(len(self.thresholds),),
         initializer=init_ops.zeros_initializer)
-    self.fn = self.add_weight(
+    self.false_negatives = self.add_weight(
         'false_negatives',
-        shape=(len(thresholds),),
+        shape=(len(self.thresholds),),
         initializer=init_ops.zeros_initializer)
 
   def update_state(self, y_true, y_pred, sample_weight=None):
     """Accumulates true positive and false negative statistics.
 
     Args:
-      y_true: The ground truth values.
-      y_pred: The predicted values.
+      y_true: The ground truth values, with the same dimensions as `y_pred`.
+        Will be cast to `bool`.
+      y_pred: The predicted values. Each element must be in the range `[0, 1]`.
       sample_weight: Optional weighting of each example. Defaults to 1. Can be a
         `Tensor` whose rank is either 0, or the same rank as `y_true`, and must
         be broadcastable to `y_true`.
@@ -1262,19 +1290,36 @@ class Recall(Metric):
     Returns:
       Update op.
     """
-    return _update_confusion_matrix_variables({
-        _ConfusionMatrix.TRUE_POSITIVES: self.tp,
-        _ConfusionMatrix.FALSE_NEGATIVES: self.fn
-    }, y_true, y_pred, self.thresholds, sample_weight)
+    return metrics_utils.update_confusion_matrix_variables(
+        {
+            metrics_utils.ConfusionMatrix.TRUE_POSITIVES: self.true_positives,
+            metrics_utils.ConfusionMatrix.FALSE_NEGATIVES: self.false_negatives
+        },
+        y_true,
+        y_pred,
+        thresholds=self.thresholds,
+        top_k=self.top_k,
+        class_id=self.class_id,
+        sample_weight=sample_weight)
 
   def result(self):
-    result = math_ops.div_no_nan(self.tp, self.tp + self.fn)
-    return result if isinstance(self.thresholds, (list, tuple)) else result[0]
+    result = math_ops.div_no_nan(self.true_positives,
+                                 self.true_positives + self.false_negatives)
+    return result[0] if len(self.thresholds) == 1 else result
 
   def reset_states(self):
     num_thresholds = len(to_list(self.thresholds))
-    for v in self.variables:
-      K.set_value(v, np.zeros((num_thresholds,)))
+    K.batch_set_value(
+        [(v, np.zeros((num_thresholds,))) for v in self.variables])
+
+  def get_config(self):
+    config = {
+        'thresholds': self.init_thresholds,
+        'top_k': self.top_k,
+        'class_id': self.class_id
+    }
+    base_config = super(Recall, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
 
 
 @six.add_metaclass(abc.ABCMeta)
@@ -1290,19 +1335,19 @@ class SensitivitySpecificityBase(Metric):
     if num_thresholds <= 0:
       raise ValueError('`num_thresholds` must be > 0.')
     self.value = value
-    self.tp = self.add_weight(
+    self.true_positives = self.add_weight(
         'true_positives',
         shape=(num_thresholds,),
         initializer=init_ops.zeros_initializer)
-    self.tn = self.add_weight(
+    self.true_negatives = self.add_weight(
         'true_negatives',
         shape=(num_thresholds,),
         initializer=init_ops.zeros_initializer)
-    self.fp = self.add_weight(
+    self.false_positives = self.add_weight(
         'false_positives',
         shape=(num_thresholds,),
         initializer=init_ops.zeros_initializer)
-    self.fn = self.add_weight(
+    self.false_negatives = self.add_weight(
         'false_negatives',
         shape=(num_thresholds,),
         initializer=init_ops.zeros_initializer)
@@ -1328,20 +1373,25 @@ class SensitivitySpecificityBase(Metric):
     Returns:
       Update op.
     """
-    return _update_confusion_matrix_variables({
-        _ConfusionMatrix.TRUE_POSITIVES: self.tp,
-        _ConfusionMatrix.TRUE_NEGATIVES: self.tn,
-        _ConfusionMatrix.FALSE_POSITIVES: self.fp,
-        _ConfusionMatrix.FALSE_NEGATIVES: self.fn,
-    }, y_true, y_pred, self.thresholds, sample_weight)
+    return metrics_utils.update_confusion_matrix_variables(
+        {
+            metrics_utils.ConfusionMatrix.TRUE_POSITIVES: self.true_positives,
+            metrics_utils.ConfusionMatrix.TRUE_NEGATIVES: self.true_negatives,
+            metrics_utils.ConfusionMatrix.FALSE_POSITIVES: self.false_positives,
+            metrics_utils.ConfusionMatrix.FALSE_NEGATIVES: self.false_negatives,
+        },
+        y_true,
+        y_pred,
+        thresholds=self.thresholds,
+        sample_weight=sample_weight)
 
   def reset_states(self):
     num_thresholds = len(self.thresholds)
-    for v in self.variables:
-      K.set_value(v, np.zeros((num_thresholds,)))
+    K.batch_set_value(
+        [(v, np.zeros((num_thresholds,))) for v in self.variables])
 
 
-@tf_export('keras.metrics.SensitivityAtSpecificity')
+@keras_export('keras.metrics.SensitivityAtSpecificity')
 class SensitivityAtSpecificity(SensitivitySpecificityBase):
   """Computes the sensitivity at a given specificity.
 
@@ -1372,7 +1422,7 @@ class SensitivityAtSpecificity(SensitivitySpecificityBase):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile(
       'sgd',
       loss='mse',
@@ -1392,12 +1442,15 @@ class SensitivityAtSpecificity(SensitivitySpecificityBase):
     """
     if specificity < 0 or specificity > 1:
       raise ValueError('`specificity` must be in the range [0, 1].')
+    self.specificity = specificity
+    self.num_thresholds = num_thresholds
     super(SensitivityAtSpecificity, self).__init__(
         specificity, num_thresholds=num_thresholds, name=name, dtype=dtype)
 
   def result(self):
     # Calculate specificities at all the thresholds.
-    specificities = math_ops.div_no_nan(self.tn, self.tn + self.fp)
+    specificities = math_ops.div_no_nan(
+        self.true_negatives, self.true_negatives + self.false_positives)
 
     # Find the index of the threshold where the specificity is closest to the
     # given specificity.
@@ -1406,11 +1459,20 @@ class SensitivityAtSpecificity(SensitivitySpecificityBase):
     min_index = math_ops.cast(min_index, dtypes.int32)
 
     # Compute sensitivity at that index.
-    return math_ops.div_no_nan(self.tp[min_index],
-                               self.tp[min_index] + self.fn[min_index])
+    return math_ops.div_no_nan(
+        self.true_positives[min_index],
+        self.true_positives[min_index] + self.false_negatives[min_index])
+
+  def get_config(self):
+    config = {
+        'num_thresholds': self.num_thresholds,
+        'specificity': self.specificity
+    }
+    base_config = super(SensitivityAtSpecificity, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.metrics.SpecificityAtSensitivity')
+@keras_export('keras.metrics.SpecificityAtSensitivity')
 class SpecificityAtSensitivity(SensitivitySpecificityBase):
   """Computes the specificity at a given sensitivity.
 
@@ -1441,7 +1503,7 @@ class SpecificityAtSensitivity(SensitivitySpecificityBase):
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile(
       'sgd',
       loss='mse',
@@ -1461,12 +1523,15 @@ class SpecificityAtSensitivity(SensitivitySpecificityBase):
     """
     if sensitivity < 0 or sensitivity > 1:
       raise ValueError('`sensitivity` must be in the range [0, 1].')
+    self.sensitivity = sensitivity
+    self.num_thresholds = num_thresholds
     super(SpecificityAtSensitivity, self).__init__(
         sensitivity, num_thresholds=num_thresholds, name=name, dtype=dtype)
 
   def result(self):
     # Calculate sensitivities at all the thresholds.
-    sensitivities = math_ops.div_no_nan(self.tp, self.tp + self.fn)
+    sensitivities = math_ops.div_no_nan(
+        self.true_positives, self.true_positives + self.false_negatives)
 
     # Find the index of the threshold where the sensitivity is closest to the
     # given specificity.
@@ -1475,113 +1540,1245 @@ class SpecificityAtSensitivity(SensitivitySpecificityBase):
     min_index = math_ops.cast(min_index, dtypes.int32)
 
     # Compute specificity at that index.
-    return math_ops.div_no_nan(self.tn[min_index],
-                               self.tn[min_index] + self.fp[min_index])
+    return math_ops.div_no_nan(
+        self.true_negatives[min_index],
+        self.true_negatives[min_index] + self.false_positives[min_index])
+
+  def get_config(self):
+    config = {
+        'num_thresholds': self.num_thresholds,
+        'sensitivity': self.sensitivity
+    }
+    base_config = super(SpecificityAtSensitivity, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+@keras_export('keras.metrics.AUC')
+class AUC(Metric):
+  """Computes the approximate AUC (Area under the curve) via a Riemann sum.
+
+  This metric creates four local variables, `true_positives`, `true_negatives`,
+  `false_positives` and `false_negatives` that are used to compute the AUC.
+  To discretize the AUC curve, a linearly spaced set of thresholds is used to
+  compute pairs of recall and precision values. The area under the ROC-curve is
+  therefore computed using the height of the recall values by the false positive
+  rate, while the area under the PR-curve is the computed using the height of
+  the precision values by the recall.
+
+  This value is ultimately returned as `auc`, an idempotent operation that
+  computes the area under a discretized curve of precision versus recall values
+  (computed using the aforementioned variables). The `num_thresholds` variable
+  controls the degree of discretization with larger numbers of thresholds more
+  closely approximating the true AUC. The quality of the approximation may vary
+  dramatically depending on `num_thresholds`.
+
+  For best results, `predictions` should be distributed approximately uniformly
+  in the range [0, 1] and not peaked around 0 or 1. The quality of the AUC
+  approximation may be poor if this is not the case. Setting `summation_method`
+  to 'minoring' or 'majoring' can help quantify the error in the approximation
+  by providing lower or upper bound estimate of the AUC.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use `sample_weight` of 0 to mask values.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.AUC(num_thresholds=3)
+  m.update_state([0, 0, 1, 1], [0, 0.5, 0.3, 0.9])
+
+  # threshold values are [0 - 1e-7, 0.5, 1 + 1e-7]
+  # tp = [2, 1, 0], fp = [2, 0, 0], fn = [0, 1, 2], tn = [0, 2, 2]
+  # recall = [1, 0.5, 0], fp_rate = [1, 0, 0]
+  # auc = ((((1+0.5)/2)*(1-0))+ (((0.5+0)/2)*(0-0))) = 0.75
+
+  print('Final result: ', m.result().numpy())  # Final result: 0.75
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.AUC()])
+  ```
+  """
+
+  def __init__(self,
+               num_thresholds=200,
+               curve='ROC',
+               summation_method='interpolation',
+               name=None,
+               dtype=None):
+    """Creates an `AUC` instance.
+
+    Args:
+      num_thresholds: (Optional) Defaults to 200. The number of thresholds to
+        use when discretizing the roc curve. Values must be > 1.
+      curve: (Optional) Specifies the name of the curve to be computed, 'ROC'
+        [default] or 'PR' for the Precision-Recall-curve.
+      summation_method: (Optional) Specifies the Riemann summation method used
+        (https://en.wikipedia.org/wiki/Riemann_sum): 'interpolation' [default],
+          applies mid-point summation scheme for `ROC`. For PR-AUC, interpolates
+          (true/false) positives but not the ratio that is precision (see Davis
+          & Goadrich 2006 for details); 'minoring' that applies left summation
+          for increasing intervals and right summation for decreasing intervals;
+          'majoring' that does the opposite.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    # Validate configurations.
+    if num_thresholds <= 1:
+      raise ValueError('`num_thresholds` must be > 1.')
+    if isinstance(curve, metrics_utils.AUCCurve) and curve not in list(
+        metrics_utils.AUCCurve):
+      raise ValueError('Invalid curve: "{}". Valid options are: "{}"'.format(
+          curve, list(metrics_utils.AUCCurve)))
+    if isinstance(
+        summation_method,
+        metrics_utils.AUCSummationMethod) and summation_method not in list(
+            metrics_utils.AUCSummationMethod):
+      raise ValueError(
+          'Invalid summation method: "{}". Valid options are: "{}"'.format(
+              summation_method, list(metrics_utils.AUCSummationMethod)))
+
+    # Update properties.
+    self.num_thresholds = num_thresholds
+    if isinstance(curve, metrics_utils.AUCCurve):
+      self.curve = curve
+    else:
+      self.curve = metrics_utils.AUCCurve.from_str(curve)
+    if isinstance(summation_method, metrics_utils.AUCSummationMethod):
+      self.summation_method = summation_method
+    else:
+      self.summation_method = metrics_utils.AUCSummationMethod.from_str(
+          summation_method)
+    super(AUC, self).__init__(name=name, dtype=dtype)
+
+    # Create metric variables
+    self.true_positives = self.add_weight(
+        'true_positives',
+        shape=(num_thresholds,),
+        initializer=init_ops.zeros_initializer)
+    self.true_negatives = self.add_weight(
+        'true_negatives',
+        shape=(num_thresholds,),
+        initializer=init_ops.zeros_initializer)
+    self.false_positives = self.add_weight(
+        'false_positives',
+        shape=(num_thresholds,),
+        initializer=init_ops.zeros_initializer)
+    self.false_negatives = self.add_weight(
+        'false_negatives',
+        shape=(num_thresholds,),
+        initializer=init_ops.zeros_initializer)
+
+    # Compute `num_thresholds` thresholds in [0, 1]
+    thresholds = [
+        (i + 1) * 1.0 / (num_thresholds - 1) for i in range(num_thresholds - 2)
+    ]
+    self.thresholds = [0.0 - K.epsilon()] + thresholds + [1.0 + K.epsilon()]
+    # epsilon - to account for floating point imprecisions.
+
+  def update_state(self, y_true, y_pred, sample_weight=None):
+    """Accumulates confusion matrix statistics.
+
+    Args:
+      y_true: The ground truth values.
+      y_pred: The predicted values.
+      sample_weight: Optional weighting of each example. Defaults to 1. Can be a
+        `Tensor` whose rank is either 0, or the same rank as `y_true`, and must
+        be broadcastable to `y_true`.
+
+    Returns:
+      Update op.
+    """
+    return metrics_utils.update_confusion_matrix_variables({
+        metrics_utils.ConfusionMatrix.TRUE_POSITIVES: self.true_positives,
+        metrics_utils.ConfusionMatrix.TRUE_NEGATIVES: self.true_negatives,
+        metrics_utils.ConfusionMatrix.FALSE_POSITIVES: self.false_positives,
+        metrics_utils.ConfusionMatrix.FALSE_NEGATIVES: self.false_negatives,
+    }, y_true, y_pred, self.thresholds, sample_weight=sample_weight)
+
+  def interpolate_pr_auc(self):
+    """Interpolation formula inspired by section 4 of Davis & Goadrich 2006.
+
+    https://www.biostat.wisc.edu/~page/rocpr.pdf
+
+    Note here we derive & use a closed formula not present in the paper
+    as follows:
+
+      Precision = TP / (TP + FP) = TP / P
+
+    Modeling all of TP (true positive), FP (false positive) and their sum
+    P = TP + FP (predicted positive) as varying linearly within each interval
+    [A, B] between successive thresholds, we get
+
+      Precision slope = dTP / dP
+                      = (TP_B - TP_A) / (P_B - P_A)
+                      = (TP - TP_A) / (P - P_A)
+      Precision = (TP_A + slope * (P - P_A)) / P
+
+    The area within the interval is (slope / total_pos_weight) times
+
+      int_A^B{Precision.dP} = int_A^B{(TP_A + slope * (P - P_A)) * dP / P}
+      int_A^B{Precision.dP} = int_A^B{slope * dP + intercept * dP / P}
+
+    where intercept = TP_A - slope * P_A = TP_B - slope * P_B, resulting in
+
+      int_A^B{Precision.dP} = TP_B - TP_A + intercept * log(P_B / P_A)
+
+    Bringing back the factor (slope / total_pos_weight) we'd put aside, we get
+
+      slope * [dTP + intercept *  log(P_B / P_A)] / total_pos_weight
+
+    where dTP == TP_B - TP_A.
+
+    Note that when P_A == 0 the above calculation simplifies into
+
+      int_A^B{Precision.dTP} = int_A^B{slope * dTP} = slope * (TP_B - TP_A)
+
+    which is really equivalent to imputing constant precision throughout the
+    first bucket having >0 true positives.
+
+    Returns:
+      pr_auc: an approximation of the area under the P-R curve.
+    """
+    dtp = self.true_positives[:self.num_thresholds -
+                              1] - self.true_positives[1:]
+    p = self.true_positives + self.false_positives
+    dp = p[:self.num_thresholds - 1] - p[1:]
+
+    prec_slope = math_ops.div_no_nan(
+        dtp, math_ops.maximum(dp, 0), name='prec_slope')
+    intercept = self.true_positives[1:] - math_ops.multiply(prec_slope, p[1:])
+
+    safe_p_ratio = array_ops.where(
+        math_ops.logical_and(p[:self.num_thresholds - 1] > 0, p[1:] > 0),
+        math_ops.div_no_nan(
+            p[:self.num_thresholds - 1],
+            math_ops.maximum(p[1:], 0),
+            name='recall_relative_ratio'),
+        array_ops.ones_like(p[1:]))
+
+    return math_ops.reduce_sum(
+        math_ops.div_no_nan(
+            prec_slope * (dtp + intercept * math_ops.log(safe_p_ratio)),
+            math_ops.maximum(self.true_positives[1:] + self.false_negatives[1:],
+                             0),
+            name='pr_auc_increment'),
+        name='interpolate_pr_auc')
+
+  def result(self):
+    if (self.curve == metrics_utils.AUCCurve.PR and
+        self.summation_method == metrics_utils.AUCSummationMethod.INTERPOLATION
+       ):
+      # This use case is different and is handled separately.
+      return self.interpolate_pr_auc()
+
+    # Set `x` and `y` values for the curves based on `curve` config.
+    recall = math_ops.div_no_nan(self.true_positives,
+                                 self.true_positives + self.false_negatives)
+    if self.curve == metrics_utils.AUCCurve.ROC:
+      fp_rate = math_ops.div_no_nan(self.false_positives,
+                                    self.false_positives + self.true_negatives)
+      x = fp_rate
+      y = recall
+    else:  # curve == 'PR'.
+      precision = math_ops.div_no_nan(
+          self.true_positives, self.true_positives + self.false_positives)
+      x = recall
+      y = precision
+
+    # Find the rectangle heights based on `summation_method`.
+    if self.summation_method == metrics_utils.AUCSummationMethod.INTERPOLATION:
+      # Note: the case ('PR', 'interpolation') has been handled above.
+      heights = (y[:self.num_thresholds - 1] + y[1:]) / 2.
+    elif self.summation_method == metrics_utils.AUCSummationMethod.MINORING:
+      heights = math_ops.minimum(y[:self.num_thresholds - 1], y[1:])
+    else:  # self.summation_method = metrics_utils.AUCSummationMethod.MAJORING:
+      heights = math_ops.maximum(y[:self.num_thresholds - 1], y[1:])
+
+    # Sum up the areas of all the rectangles.
+    return math_ops.reduce_sum(
+        math_ops.multiply(x[:self.num_thresholds - 1] - x[1:], heights),
+        name=self.name)
+
+  def reset_states(self):
+    num_thresholds = len(self.thresholds)
+    K.batch_set_value(
+        [(v, np.zeros((num_thresholds,))) for v in self.variables])
+
+  def get_config(self):
+    config = {
+        'num_thresholds': self.num_thresholds,
+        'curve': self.curve.value,
+        'summation_method': self.summation_method.value,
+    }
+    base_config = super(AUC, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
 
+@keras_export('keras.metrics.CosineSimilarity')
+class CosineSimilarity(MeanMetricWrapper):
+  """Computes the cosine similarity between the labels and predictions.
 
-class CosineProximity(MeanMetricWrapper):
-  """Computes the cosine distance between the labels and predictions.
+  cosine similarity = (a . b) / ||a|| ||b||
+  (https://en.wikipedia.org/wiki/Cosine_similarity)
 
   For example, if `y_true` is [0, 1, 1], and `y_pred` is [1, 0, 1], the cosine
-  proximity is -0.5.
+  similarity is 0.5.
 
-  This metric keeps the average cosine distance between `predictions` and
+  This metric keeps the average cosine similarity between `predictions` and
   `labels` over a stream of data.
 
   Usage:
   ```python
-  m = tf.metrics.CosineProximity()
-  m.update_state([0, 1, 1], [1, 0, 1])
-  print('Final result: ', m.result().numpy())  # Final result: -0.5
+  m = tf.keras.metrics.CosineSimilarity(axis=1)
+  m.update_state([[0., 1.], [1., 1.]], [[1., 0.], [1., 1.]])
+  # l2_norm(y_true) = [[0., 1.], [1./1.414], 1./1.414]]]
+  # l2_norm(y_pred) = [[1., 0.], [1./1.414], 1./1.414]]]
+  # l2_norm(y_true) . l2_norm(y_pred) = [[0., 0.], [0.5, 0.5]]
+  # result = mean(sum(l2_norm(y_true) . l2_norm(y_pred), axis=1))
+         = ((0. + 0.) +  (0.5 + 0.5)) / 2
+
+  print('Final result: ', m.result().numpy())  # Final result: 0.5
   ```
 
   Usage with tf.keras API:
 
   ```python
-  model = keras.models.Model(inputs, outputs)
+  model = tf.keras.Model(inputs, outputs)
   model.compile(
       'sgd',
       loss='mse',
-      metrics=[tf.metrics.CosineProximity()])
+      metrics=[tf.keras.metrics.CosineSimilarity(axis=1)])
   ```
   """
 
-  def __init__(self, name='cosine_proximity', dtype=None):
-    super(CosineProximity, self).__init__(cosine, name, dtype=dtype)
+  def __init__(self, name='cosine_similarity', dtype=None, axis=-1):
+    """Creates a `CosineSimilarity` instance.
 
-  @classmethod
-  def from_config(cls, config):
-    if 'fn' in config:
-      config.pop('fn')
-    return super(CosineProximity, cls).from_config(config)
+    Args:
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+      axis: (Optional) Defaults to -1. The dimension along which the cosine
+        similarity is computed.
+    """
+    super(CosineSimilarity, self).__init__(
+        cosine_similarity, name, dtype=dtype, axis=axis)
 
 
-def accuracy(y_true, y_pred):
-  y_pred.get_shape().assert_is_compatible_with(y_true.get_shape())
-  if y_true.dtype != y_pred.dtype:
-    y_pred = math_ops.cast(y_pred, y_true.dtype)
-  return math_ops.cast(math_ops.equal(y_true, y_pred), K.floatx())
+@keras_export('keras.metrics.MeanAbsoluteError')
+class MeanAbsoluteError(MeanMetricWrapper):
+  """Computes the mean absolute error between the labels and predictions.
 
+  For example, if `y_true` is [0., 0., 1., 1.], and `y_pred` is [1., 1., 1., 0.]
+  the mean absolute error is 3/4 (0.75).
 
-@tf_export('keras.metrics.binary_accuracy')
-def binary_accuracy(y_true, y_pred, threshold=0.5):
-  threshold = math_ops.cast(threshold, y_pred.dtype)
-  y_pred = math_ops.cast(y_pred > threshold, y_pred.dtype)
-  return K.mean(math_ops.equal(y_true, y_pred), axis=-1)
+  Usage:
+  ```python
+  m = tf.keras.metrics.MeanAbsoluteError()
+  m.update_state([0., 0., 1., 1.], [1., 1., 1., 0.])
+  print('Final result: ', m.result().numpy())  # Final result: 0.75
+  ```
 
+  Usage with tf.keras API:
 
-@tf_export('keras.metrics.categorical_accuracy')
-def categorical_accuracy(y_true, y_pred):
-  return math_ops.cast(
-      math_ops.equal(
-          math_ops.argmax(y_true, axis=-1), math_ops.argmax(y_pred, axis=-1)),
-      K.floatx())
+  ```python
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', metrics=[tf.keras.metrics.MeanAbsoluteError()])
+  ```
+  """
 
+  def __init__(self, name='mean_absolute_error', dtype=None):
+    super(MeanAbsoluteError, self).__init__(
+        mean_absolute_error, name, dtype=dtype)
 
-@tf_export('keras.metrics.sparse_categorical_accuracy')
-def sparse_categorical_accuracy(y_true, y_pred):
-  # If the shape of y_true is (num_samples, 1), squeeze to (num_samples,)
-  if (len(K.int_shape(y_true)) == len(K.int_shape(y_pred))):
-    y_true = array_ops.squeeze(y_true, [-1])
-  y_pred = math_ops.argmax(y_pred, axis=-1)
 
-  # If the predicted output and actual output types don't match, force cast them
-  # to match.
-  if K.dtype(y_pred) != K.dtype(y_true):
-    y_pred = math_ops.cast(y_pred, K.dtype(y_true))
+@keras_export('keras.metrics.MeanAbsolutePercentageError')
+class MeanAbsolutePercentageError(MeanMetricWrapper):
+  """Computes the mean absolute percentage error between `y_true` and `y_pred`.
 
-  return math_ops.cast(math_ops.equal(y_true, y_pred), K.floatx())
+  For example, if `y_true` is [0., 0., 1., 1.], and `y_pred` is [1., 1., 1., 0.]
+  the mean absolute percentage error is 5e+08.
 
+  Usage:
 
-@tf_export('keras.metrics.top_k_categorical_accuracy')
-def top_k_categorical_accuracy(y_true, y_pred, k=5):
-  return K.mean(
-      nn.in_top_k(y_pred, math_ops.argmax(y_true, axis=-1), k), axis=-1)
+  ```python
+  m = tf.keras.metrics.MeanAbsolutePercentageError()
+  m.update_state([0., 0., 1., 1.], [1., 1., 1., 0.])
+  print('Final result: ', m.result().numpy())  # Final result: 5e+08
+  ```
 
+  Usage with tf.keras API:
 
-@tf_export('keras.metrics.sparse_top_k_categorical_accuracy')
-def sparse_top_k_categorical_accuracy(y_true, y_pred, k=5):
-  # If the shape of y_true is (num_samples, 1), squeeze to (num_samples,)
-  if (len(K.int_shape(y_true)) == len(K.int_shape(y_pred))):
-    y_true = array_ops.squeeze(y_true, [-1])
+  ```python
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', metrics=[tf.keras.metrics.MeanAbsolutePercentageError()])
+  ```
+  """
 
-  return K.mean(nn.in_top_k(y_pred, math_ops.cast(y_true, 'int32'), k), axis=-1)
+  def __init__(self, name='mean_absolute_percentage_error', dtype=None):
+    super(MeanAbsolutePercentageError, self).__init__(
+        mean_absolute_percentage_error, name, dtype=dtype)
 
-# Aliases
 
-mse = MSE = mean_squared_error
-mae = MAE = mean_absolute_error
-mape = MAPE = mean_absolute_percentage_error
-msle = MSLE = mean_squared_logarithmic_error
-cosine = cosine_proximity
+@keras_export('keras.metrics.MeanSquaredError')
+class MeanSquaredError(MeanMetricWrapper):
+  """Computes the mean squared error between `y_true` and `y_pred`.
+
+  For example, if `y_true` is [0., 0., 1., 1.], and `y_pred` is [1., 1., 1., 0.]
+  the mean squared error is 3/4 (0.75).
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.MeanSquaredError()
+  m.update_state([0., 0., 1., 1.], [1., 1., 1., 0.])
+  print('Final result: ', m.result().numpy())  # Final result: 0.75
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', metrics=[tf.keras.metrics.MeanSquaredError()])
+  ```
+  """
+
+  def __init__(self, name='mean_squared_error', dtype=None):
+    super(MeanSquaredError, self).__init__(
+        mean_squared_error, name, dtype=dtype)
+
+
+@keras_export('keras.metrics.MeanSquaredLogarithmicError')
+class MeanSquaredLogarithmicError(MeanMetricWrapper):
+  """Computes the mean squared logarithmic error between `y_true` and `y_pred`.
+
+  For example, if `y_true` is [0., 0., 1., 1.], and `y_pred` is [1., 1., 1., 0.]
+  the mean squared logarithmic error is 0.36034.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.MeanSquaredLogarithmicError()
+  m.update_state([0., 0., 1., 1.], [1., 1., 1., 0.])
+  print('Final result: ', m.result().numpy())  # Final result: 0.36034
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', metrics=[tf.keras.metrics.MeanSquaredLogarithmicError()])
+  ```
+  """
+
+  def __init__(self, name='mean_squared_logarithmic_error', dtype=None):
+    super(MeanSquaredLogarithmicError, self).__init__(
+        mean_squared_logarithmic_error, name, dtype=dtype)
+
+
+@keras_export('keras.metrics.Hinge')
+class Hinge(MeanMetricWrapper):
+  """Computes the hinge metric between `y_true` and `y_pred`.
+
+  `y_true` values are expected to be -1 or 1. If binary (0 or 1) labels are
+  provided we will convert them to -1 or 1.
+
+  For example, if `y_true` is [-1., 1., 1.], and `y_pred` is [0.6, -0.7, -0.5]
+  the hinge metric value is 1.6.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.Hinge()
+  m.update_state([-1., 1., 1.], [0.6, -0.7, -0.5])
+
+  # result = max(0, 1-y_true * y_pred) = [1.6 + 1.7 + 1.5] / 3
+
+  print('Final result: ', m.result().numpy())  # Final result: 1.6
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', metrics=[tf.keras.metrics.Hinge()])
+  ```
+  """
+
+  def __init__(self, name='hinge', dtype=None):
+    super(Hinge, self).__init__(hinge, name, dtype=dtype)
+
+
+@keras_export('keras.metrics.SquaredHinge')
+class SquaredHinge(MeanMetricWrapper):
+  """Computes the squared hinge metric between `y_true` and `y_pred`.
+
+  `y_true` values are expected to be -1 or 1. If binary (0 or 1) labels are
+  provided we will convert them to -1 or 1.
+
+  For example, if `y_true` is [-1., 1., 1.], and `y_pred` is [0.6, -0.7, -0.5]
+  the squared hinge metric value is 2.6.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.SquaredHinge()
+  m.update_state([-1., 1., 1.], [0.6, -0.7, -0.5])
+
+  # result = max(0, 1-y_true * y_pred) = [1.6^2 + 1.7^2 + 1.5^2] / 3
+
+  print('Final result: ', m.result().numpy())  # Final result: 2.6
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', metrics=[tf.keras.metrics.SquaredHinge()])
+  ```
+  """
+
+  def __init__(self, name='squared_hinge', dtype=None):
+    super(SquaredHinge, self).__init__(squared_hinge, name, dtype=dtype)
+
+
+@keras_export('keras.metrics.CategoricalHinge')
+class CategoricalHinge(MeanMetricWrapper):
+  """Computes the categorical hinge metric between `y_true` and `y_pred`.
+
+  For example, if `y_true` is [0., 1., 1.], and `y_pred` is [1., 0., 1.]
+  the categorical hinge metric value is 1.0.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.CategoricalHinge()
+  m.update_state([0., 1., 1.], [1., 0., 1.])
+  print('Final result: ', m.result().numpy())  # Final result: 1.0
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', metrics=[tf.keras.metrics.CategoricalHinge()])
+  ```
+  """
+
+  def __init__(self, name='categorical_hinge', dtype=None):
+    super(CategoricalHinge, self).__init__(categorical_hinge, name, dtype=dtype)
+
+
+@keras_export('keras.metrics.RootMeanSquaredError')
+class RootMeanSquaredError(Mean):
+  """Computes root mean squared error metric between `y_true` and `y_pred`.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.RootMeanSquaredError()
+  m.update_state([2., 4., 6.], [1., 3., 2.])
+  print('Final result: ', m.result().numpy())  # Final result: 2.449
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', metrics=[tf.keras.metrics.RootMeanSquaredError()])
+  ```
+  """
+
+  def __init__(self, name='root_mean_squared_error', dtype=None):
+    super(RootMeanSquaredError, self).__init__(name, dtype=dtype)
+
+  def update_state(self, y_true, y_pred, sample_weight=None):
+    """Accumulates root mean squared error statistics.
+
+    Args:
+      y_true: The ground truth values.
+      y_pred: The predicted values.
+      sample_weight: Optional weighting of each example. Defaults to 1. Can be a
+        `Tensor` whose rank is either 0, or the same rank as `y_true`, and must
+        be broadcastable to `y_true`.
+
+    Returns:
+      Update op.
+    """
+    y_true = math_ops.cast(y_true, self._dtype)
+    y_pred = math_ops.cast(y_pred, self._dtype)
+    y_pred, y_true, sample_weight = squeeze_or_expand_dimensions(
+        y_pred, y_true, sample_weight)
+    error_sq = math_ops.squared_difference(y_pred, y_true)
+    return super(RootMeanSquaredError, self).update_state(
+        error_sq, sample_weight=sample_weight)
+
+  def result(self):
+    return math_ops.sqrt(math_ops.div_no_nan(self.total, self.count))
+
+
+@keras_export('keras.metrics.LogCoshError')
+class LogCoshError(MeanMetricWrapper):
+  """Computes the logarithm of the hyperbolic cosine of the prediction error.
+
+  `logcosh = log((exp(x) + exp(-x))/2)`, where x is the error (y_pred - y_true)
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.LogCoshError()
+  m.update_state([0., 1., 1.], [1., 0., 1.])
+  print('Final result: ', m.result().numpy())  # Final result: 0.289
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', metrics=[tf.keras.metrics.LogCoshError()])
+  ```
+  """
+
+  def __init__(self, name='logcosh', dtype=None):
+    super(LogCoshError, self).__init__(logcosh, name, dtype=dtype)
+
+
+@keras_export('keras.metrics.Poisson')
+class Poisson(MeanMetricWrapper):
+  """Computes the Poisson metric between `y_true` and `y_pred`.
+
+  `metric = y_pred - y_true * log(y_pred)`
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.Poisson()
+  m.update_state([1, 9, 2], [4, 8, 12])
+  print('Final result: ', m.result().numpy())  # Final result: -4.63
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', metrics=[tf.keras.metrics.Poisson()])
+  ```
+  """
+
+  def __init__(self, name='poisson', dtype=None):
+    super(Poisson, self).__init__(poisson, name, dtype=dtype)
+
+
+@keras_export('keras.metrics.KLDivergence')
+class KLDivergence(MeanMetricWrapper):
+  """Computes Kullback Leibler divergence metric between `y_true` and `y_pred`.
+
+  `metric = y_true * log(y_true / y_pred)`
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.KLDivergence()
+  m.update_state([.4, .9, .2], [.5, .8, .12])
+  print('Final result: ', m.result().numpy())  # Final result: -0.043
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = tf.keras.Model(inputs, outputs)
+  model.compile('sgd', metrics=[tf.keras.metrics.KLDivergence()])
+  ```
+  """
+
+  def __init__(self, name='kullback_leibler_divergence', dtype=None):
+    super(KLDivergence, self).__init__(
+        kullback_leibler_divergence, name, dtype=dtype)
+
+
+@keras_export('keras.metrics.MeanIoU')
+class MeanIoU(Metric):
+  """Computes the mean Intersection-Over-Union metric.
+
+  Mean Intersection-Over-Union is a common evaluation metric for semantic image
+  segmentation, which first computes the IOU for each semantic class and then
+  computes the average over classes. IOU is defined as follows:
+    IOU = true_positive / (true_positive + false_positive + false_negative).
+  The predictions are accumulated in a confusion matrix, weighted by
+  `sample_weight` and the metric is then calculated from it.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use `sample_weight` of 0 to mask values.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.MeanIoU(num_classes=2)
+  m.update_state([0, 0, 1, 1], [0, 1, 0, 1])
+
+    # cm = [[1, 1],
+            [1, 1]]
+    # sum_row = [2, 2], sum_col = [2, 2], true_positives = [1, 1]
+    # iou = true_positives / (sum_row + sum_col - true_positives))
+    # result = (1 / (2 + 2 - 1) + 1 / (2 + 2 - 1)) / 2 = 0.33
+  print('Final result: ', m.result().numpy())  # Final result: 0.33
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = tf.keras.Model(inputs, outputs)
+  model.compile(
+    'sgd',
+    loss='mse',
+    metrics=[tf.keras.metrics.MeanIoU(num_classes=2)])
+  ```
+  """
+
+  def __init__(self, num_classes, name=None, dtype=None):
+    """Creates a `MeanIoU` instance.
+
+    Args:
+      num_classes: The possible number of labels the prediction task can have.
+        This value must be provided, since a confusion matrix of dimension =
+        [num_classes, num_classes] will be allocated.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(MeanIoU, self).__init__(name=name, dtype=dtype)
+    self.num_classes = num_classes
+
+    # Variable to accumulate the predictions in the confusion matrix. Setting
+    # the type to be `float64` as required by confusion_matrix_ops.
+    self.total_cm = self.add_weight(
+        'total_confusion_matrix',
+        shape=(num_classes, num_classes),
+        initializer=init_ops.zeros_initializer,
+        dtype=dtypes.float64)
+
+  def update_state(self, y_true, y_pred, sample_weight=None):
+    """Accumulates the confusion matrix statistics.
+
+    Args:
+      y_true: The ground truth values.
+      y_pred: The predicted values.
+      sample_weight: Optional weighting of each example. Defaults to 1. Can be a
+        `Tensor` whose rank is either 0, or the same rank as `y_true`, and must
+        be broadcastable to `y_true`.
+
+    Returns:
+      Update op.
+    """
+    # Flatten the input if its rank > 1.
+    if y_pred.shape.ndims > 1:
+      y_pred = array_ops.reshape(y_pred, [-1])
+
+    if y_true.shape.ndims > 1:
+      y_true = array_ops.reshape(y_true, [-1])
+
+    if sample_weight is not None and sample_weight.shape.ndims > 1:
+      sample_weight = array_ops.reshape(sample_weight, [-1])
+
+    # Accumulate the prediction to current confusion matrix.
+    current_cm = confusion_matrix.confusion_matrix(
+        y_true,
+        y_pred,
+        self.num_classes,
+        weights=sample_weight,
+        dtype=dtypes.float64)
+    return self.total_cm.assign_add(current_cm)
+
+  def result(self):
+    """Compute the mean intersection-over-union via the confusion matrix."""
+    sum_over_row = math_ops.cast(
+        math_ops.reduce_sum(self.total_cm, axis=0), dtype=self._dtype)
+    sum_over_col = math_ops.cast(
+        math_ops.reduce_sum(self.total_cm, axis=1), dtype=self._dtype)
+    true_positives = math_ops.cast(
+        array_ops.diag_part(self.total_cm), dtype=self._dtype)
+
+    # sum_over_row + sum_over_col =
+    #     2 * true_positives + false_positives + false_negatives.
+    denominator = sum_over_row + sum_over_col - true_positives
+
+    # The mean is only computed over classes that appear in the
+    # label or prediction tensor. If the denominator is 0, we need to
+    # ignore the class.
+    num_valid_entries = math_ops.reduce_sum(
+        math_ops.cast(math_ops.not_equal(denominator, 0), dtype=self._dtype))
+
+    iou = math_ops.div_no_nan(true_positives, denominator)
+
+    return math_ops.div_no_nan(
+        math_ops.reduce_sum(iou, name='mean_iou'), num_valid_entries)
+
+  def reset_states(self):
+    K.set_value(self.total_cm, np.zeros((self.num_classes, self.num_classes)))
+
+  def get_config(self):
+    config = {'num_classes': self.num_classes}
+    base_config = super(MeanIoU, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+@keras_export('keras.metrics.MeanTensor')
+class MeanTensor(Metric):
+  """Computes the element-wise (weighted) mean of the given tensors.
+
+  `MeanTensor` returns a tensor with the same shape of the input tensors. The
+  mean value is updated by keeping local variables `total` and `count`. The
+  `total` tracks the sum of the weighted values, and `count` stores the sum of
+  the weighted counts.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.MeanTensor()
+  m.update_state([0, 1, 2, 3])
+  m.update_state([4, 5, 6, 7])
+  print('Result: ', m.result().numpy())  # Result: [2, 3, 4, 5]
+  m.update_state([12, 10, 8, 6], sample_weights= [0, 0.2, 0.5, 1])
+  print('Result: ', m.result().numpy())  # Result: [2, 3.636, 4.8, 5.333]
+  ```
+  """
+
+  def __init__(self, name='mean_tensor', dtype=None):
+    """Creates a `MeanTensor` instance.
+
+    Args:
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(MeanTensor, self).__init__(name=name, dtype=dtype)
+    self._shape = None
+    self._total = None
+    self._count = None
+    self._built = False
+
+  def _build(self, shape):
+    self._shape = tensor_shape.TensorShape(shape)
+    # Create new state variables
+    self._total = self.add_weight(
+        'total', shape=shape, initializer=init_ops.zeros_initializer)
+    self._count = self.add_weight(
+        'count', shape=shape, initializer=init_ops.zeros_initializer)
+    with ops.init_scope():
+      if not context.executing_eagerly():
+        K._initialize_variables(K._get_session())  # pylint: disable=protected-access
+    self._built = True
+
+  @property
+  def total(self):
+    return self._total if self._built else None
+
+  @property
+  def count(self):
+    return self._count if self._built else None
+
+  def update_state(self, values, sample_weight=None):
+    """Accumulates statistics for computing the element-wise mean.
+
+    Args:
+      values: Per-example value.
+      sample_weight: Optional weighting of each example. Defaults to 1.
+
+    Returns:
+      Update op.
+    """
+    values = math_ops.cast(values, self._dtype)
+    if not self._built:
+      self._build(values.shape)
+    elif values.shape != self._shape:
+      raise ValueError('MeanTensor input values must always have the same '
+                       'shape. Expected shape (set during the first call): {}. '
+                       'Got: {}'.format(self._shape, values.get_shape()))
+
+    num_values = array_ops.ones_like(values)
+    if sample_weight is not None:
+      sample_weight = math_ops.cast(sample_weight, self._dtype)
+
+      # Update dimensions of weights to match with values if possible.
+      values, _, sample_weight = squeeze_or_expand_dimensions(
+          values, None, sample_weight)
+      try:
+        # Broadcast weights if possible.
+        sample_weight = weights_broadcast_ops.broadcast_weights(
+            sample_weight, values)
+      except ValueError:
+        # Reduce values to same ndim as weight array
+        ndim = K.ndim(values)
+        weight_ndim = K.ndim(sample_weight)
+        values = math_ops.reduce_mean(
+            values, axis=list(range(weight_ndim, ndim)))
+
+      num_values = math_ops.multiply(num_values, sample_weight)
+      values = math_ops.multiply(values, sample_weight)
+
+    update_total_op = self._total.assign_add(values)
+    with ops.control_dependencies([update_total_op]):
+      return self._count.assign_add(num_values)
+
+  def result(self):
+    if not self._built:
+      raise ValueError(
+          'MeanTensor does not have any result yet. Please call the MeanTensor '
+          'instance or use `.update_state(value)` before retrieving the result.'
+          )
+    return math_ops.div_no_nan(self.total, self.count)
+
+  def reset_states(self):
+    if self._built:
+      K.batch_set_value(
+          [(v, np.zeros(self._shape.as_list())) for v in self.variables])
+
+
+@keras_export('keras.metrics.BinaryCrossentropy')
+class BinaryCrossentropy(MeanMetricWrapper):
+  """Computes the crossentropy metric between the labels and predictions.
+
+  This is the crossentropy metric class to be used when there are only two
+  label classes (0 and 1).
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.BinaryCrossentropy()
+  m.update_state([1., 0., 1., 0.], [1., 1., 1., 0.])
+
+  # EPSILON = 1e-7, y = y_true, y` = y_pred, Y_MAX = 0.9999999
+  # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON)
+  # y` = [Y_MAX, Y_MAX, Y_MAX, EPSILON]
+
+  # Metric = -(y log(y` + EPSILON) + (1 - y) log(1 - y` + EPSILON))
+  #        = [-log(Y_MAX + EPSILON), -log(1 - Y_MAX + EPSILON),
+  #           -log(Y_MAX + EPSILON), -log(1)]
+  #        = [(0 + 15.33) / 2, (0 + 0) / 2]
+  # Reduced metric = 7.665 / 2
+
+  print('Final result: ', m.result().numpy())  # Final result: 3.833
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = tf.keras.Model(inputs, outputs)
+  model.compile(
+      'sgd',
+      loss='mse',
+      metrics=[tf.keras.metrics.BinaryCrossentropy()])
+  ```
+  """
+
+  def __init__(self,
+               name='binary_crossentropy',
+               dtype=None,
+               from_logits=False,
+               label_smoothing=0):
+    """Creates a `BinaryCrossentropy` instance.
+
+    Args:
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+      from_logits: (Optional )Whether output is expected to be a logits tensor.
+        By default, we consider that output encodes a probability distribution.
+      label_smoothing: (Optional) Float in [0, 1]. When > 0, label values are
+        smoothed, meaning the confidence on label values are relaxed.
+        e.g. `label_smoothing=0.2` means that we will use a value of `0.1` for
+        label `0` and `0.9` for label `1`"
+    """
+    label_smoothing = ops.convert_to_tensor(label_smoothing, dtype=K.floatx())
+
+    super(BinaryCrossentropy, self).__init__(
+        binary_crossentropy,
+        name,
+        dtype=dtype,
+        from_logits=from_logits,
+        label_smoothing=label_smoothing)
+
+
+@keras_export('keras.metrics.CategoricalCrossentropy')
+class CategoricalCrossentropy(MeanMetricWrapper):
+  """Computes the crossentropy metric between the labels and predictions.
+
+  This is the crossentropy metric class to be used when there are multiple
+  label classes (2 or more). Here we assume that labels are given as a `one_hot`
+  representation. eg., When labels values are [2, 0, 1],
+   `y_true` = [[0, 0, 1], [1, 0, 0], [0, 1, 0]].
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.CategoricalCrossentropy()
+  m.update_state([[0, 1, 0], [0, 0, 1]],
+                 [[0.05, 0.95, 0], [0.1, 0.8, 0.1]])
+
+  # EPSILON = 1e-7, y = y_true, y` = y_pred
+  # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON)
+  # y` = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]]
+
+  # xent = -sum(y * log(y'), axis = -1)
+  #      = -((log 0.95), (log 0.1))
+  #      = [0.051, 2.302]
+  # Reduced xent = (0.051 + 2.302) / 2
+
+  print('Final result: ', m.result().numpy())  # Final result: 1.176
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = tf.keras.Model(inputs, outputs)
+  model.compile(
+    'sgd',
+    loss='mse',
+    metrics=[tf.keras.metrics.CategoricalCrossentropy()])
+  ```
+
+  Args:
+    name: (Optional) string name of the metric instance.
+    dtype: (Optional) data type of the metric result.
+    from_logits: (Optional ) Whether `y_pred` is expected to be a logits tensor.
+      By default, we assume that `y_pred` encodes a probability distribution.
+    label_smoothing: Float in [0, 1]. When > 0, label values are smoothed,
+      meaning the confidence on label values are relaxed. e.g.
+      `label_smoothing=0.2` means that we will use a value of `0.1` for label
+      `0` and `0.9` for label `1`"
+  """
+
+  def __init__(self,
+               name='categorical_crossentropy',
+               dtype=None,
+               from_logits=False,
+               label_smoothing=0):
+    label_smoothing = ops.convert_to_tensor(label_smoothing, dtype=K.floatx())
+
+    super(CategoricalCrossentropy, self).__init__(
+        categorical_crossentropy,
+        name,
+        dtype=dtype,
+        from_logits=from_logits,
+        label_smoothing=label_smoothing)
+
+
+@keras_export('keras.metrics.SparseCategoricalCrossentropy')
+class SparseCategoricalCrossentropy(MeanMetricWrapper):
+  """Computes the crossentropy metric between the labels and predictions.
+
+  Use this crossentropy metric when there are two or more label classes.
+  We expect labels to be provided as integers. If you want to provide labels
+  using `one-hot` representation, please use `CategoricalCrossentropy` metric.
+  There should be `# classes` floating point values per feature for `y_pred`
+  and a single floating point value per feature for `y_true`.
+
+  In the snippet below, there is a single floating point value per example for
+  `y_true` and `# classes` floating pointing values per example for `y_pred`.
+  The shape of `y_true` is `[batch_size]` and the shape of `y_pred` is
+  `[batch_size, num_classes]`.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.SparseCategoricalCrossentropy()
+  m.update_state(
+    [1, 2],
+    [[0.05, 0.95, 0], [0.1, 0.8, 0.1]])
+
+  # y_true = one_hot(y_true) = [[0, 1, 0], [0, 0, 1]]
+  # logits = log(y_pred)
+  # softmax = exp(logits) / sum(exp(logits), axis=-1)
+  # softmax = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]]
+
+  # xent = -sum(y * log(softmax), 1)
+  # log(softmax) = [[-2.9957, -0.0513, -16.1181], [-2.3026, -0.2231, -2.3026]]
+  # y_true * log(softmax) = [[0, -0.0513, 0], [0, 0, -2.3026]]
+
+  # xent = [0.0513, 2.3026]
+  # Reduced xent = (0.0513 + 2.3026) / 2
+
+  print('Final result: ', m.result().numpy())  # Final result: 1.176
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = tf.keras.Model(inputs, outputs)
+  model.compile(
+    'sgd',
+    loss='mse',
+    metrics=[tf.keras.metrics.SparseCategoricalCrossentropy()])
+  ```
+
+  Args:
+    name: (Optional) string name of the metric instance.
+    dtype: (Optional) data type of the metric result.
+    from_logits: (Optional ) Whether `y_pred` is expected to be a logits tensor.
+      By default, we assume that `y_pred` encodes a probability distribution.
+    axis: (Optional) Defaults to -1. The dimension along which the metric is
+      computed.
+  """
+
+  def __init__(self,
+               name='sparse_categorical_crossentropy',
+               dtype=None,
+               from_logits=False,
+               axis=-1):
+
+    super(SparseCategoricalCrossentropy, self).__init__(
+        sparse_categorical_crossentropy,
+        name,
+        dtype=dtype,
+        from_logits=from_logits,
+        axis=axis)
+
+
+class SumOverBatchSize(Reduce):
+  """Computes the weighted sum over batch size of the given values.
+
+  For example, if values is [1, 3, 5, 7] then the metric value is 4.
+  If the weights were specified as [1, 1, 0, 0] then the value would be 1.
+
+  This metric creates two variables, `total` and `count` that are used to
+  compute the average of `values`. This average is ultimately returned as sum
+  over batch size which is an idempotent operation that simply divides `total`
+  by `count`.
+
+  If `sample_weight` is `None`, weights default to 1.  Use `sample_weight` of 0
+  to mask values.
+  """
+
+  def __init__(self, name='sum_over_batch_size', dtype=None):
+    super(SumOverBatchSize, self).__init__(
+        reduction=metrics_utils.Reduction.SUM_OVER_BATCH_SIZE,
+        name=name,
+        dtype=dtype)
+
+
+class SumOverBatchSizeMetricWrapper(SumOverBatchSize):
+  """Wraps a function with the `SumOverBatchSizeMetricWrapper` metric."""
+
+  def __init__(self, fn, name=None, dtype=None, **kwargs):
+    """Creates a `SumOverBatchSizeMetricWrapper` instance.
+
+    Args:
+      fn: The metric function to wrap, with signature `fn(y_true, y_pred,
+        **kwargs)`.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+      **kwargs: The keyword arguments that are passed on to `fn`.
+    """
+    super(SumOverBatchSizeMetricWrapper, self).__init__(name=name, dtype=dtype)
+    self._fn = fn
+    self._fn_kwargs = kwargs
+
+  def update_state(self, y_true, y_pred, sample_weight=None):
+    y_true = math_ops.cast(y_true, self._dtype)
+    y_pred = math_ops.cast(y_pred, self._dtype)
+    y_pred, y_true, sample_weight = squeeze_or_expand_dimensions(
+        y_pred, y_true, sample_weight)
+
+    matches = self._fn(y_true, y_pred, **self._fn_kwargs)
+    return super(SumOverBatchSizeMetricWrapper, self).update_state(
+        matches, sample_weight=sample_weight)
+
+  def get_config(self):
+    config = {}
+    for k, v in six.iteritems(self._fn_kwargs):
+      config[k] = K.eval(v) if is_tensor_or_variable(v) else v
+    base_config = super(SumOverBatchSizeMetricWrapper, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+def accuracy(y_true, y_pred):
+  y_pred.get_shape().assert_is_compatible_with(y_true.get_shape())
+  if y_true.dtype != y_pred.dtype:
+    y_pred = math_ops.cast(y_pred, y_true.dtype)
+  return math_ops.cast(math_ops.equal(y_true, y_pred), K.floatx())
+
+
+@keras_export('keras.metrics.binary_accuracy')
+def binary_accuracy(y_true, y_pred, threshold=0.5):
+  threshold = math_ops.cast(threshold, y_pred.dtype)
+  y_pred = math_ops.cast(y_pred > threshold, y_pred.dtype)
+  return K.mean(math_ops.equal(y_true, y_pred), axis=-1)
+
+
+@keras_export('keras.metrics.categorical_accuracy')
+def categorical_accuracy(y_true, y_pred):
+  return math_ops.cast(
+      math_ops.equal(
+          math_ops.argmax(y_true, axis=-1), math_ops.argmax(y_pred, axis=-1)),
+      K.floatx())
+
+
+@keras_export('keras.metrics.sparse_categorical_accuracy')
+def sparse_categorical_accuracy(y_true, y_pred):
+  y_pred_rank = ops.convert_to_tensor(y_pred).get_shape().ndims
+  y_true_rank = ops.convert_to_tensor(y_true).get_shape().ndims
+  # If the shape of y_true is (num_samples, 1), squeeze to (num_samples,)
+  if (y_true_rank is not None) and (y_pred_rank is not None) and (len(
+      K.int_shape(y_true)) == len(K.int_shape(y_pred))):
+    y_true = array_ops.squeeze(y_true, [-1])
+  y_pred = math_ops.argmax(y_pred, axis=-1)
+
+  # If the predicted output and actual output types don't match, force cast them
+  # to match.
+  if K.dtype(y_pred) != K.dtype(y_true):
+    y_pred = math_ops.cast(y_pred, K.dtype(y_true))
+
+  return math_ops.cast(math_ops.equal(y_true, y_pred), K.floatx())
+
+
+@keras_export('keras.metrics.top_k_categorical_accuracy')
+def top_k_categorical_accuracy(y_true, y_pred, k=5):
+  return K.mean(
+      nn.in_top_k(y_pred, math_ops.argmax(y_true, axis=-1), k), axis=-1)
+
+
+@keras_export('keras.metrics.sparse_top_k_categorical_accuracy')
+def sparse_top_k_categorical_accuracy(y_true, y_pred, k=5):
+  y_pred_rank = ops.convert_to_tensor(y_pred).get_shape().ndims
+  y_true_rank = ops.convert_to_tensor(y_true).get_shape().ndims
+  # If the shape of y_true is (num_samples, 1), squeeze to (num_samples,)
+  if (y_true_rank is not None) and (y_pred_rank is not None) and (len(
+      K.int_shape(y_true)) == len(K.int_shape(y_pred))):
+    y_true = array_ops.squeeze(y_true, [-1])
+
+  return K.mean(nn.in_top_k(y_pred, math_ops.cast(y_true, 'int32'), k), axis=-1)
+
+# Aliases
+
+mse = MSE = mean_squared_error
+mae = MAE = mean_absolute_error
+mape = MAPE = mean_absolute_percentage_error
+msle = MSLE = mean_squared_logarithmic_error
+cosine_proximity = cosine_similarity
+
+
+def clone_metric(metric):
+  """Returns a clone of the metric if stateful, otherwise returns it as is."""
+  if isinstance(metric, Metric):
+    return metric.__class__.from_config(metric.get_config())
+  return metric
+
+
+def clone_metrics(metrics):
+  """Clones the given metric list/dict."""
+  if metrics is None:
+    return None
+  if isinstance(metrics, dict):
+    return {key: clone_metric(value) for key, value in metrics.items()}
+  return [clone_metric(metric) for metric in metrics]
 
 
-@tf_export('keras.metrics.serialize')
+@keras_export('keras.metrics.serialize')
 def serialize(metric):
   return serialize_keras_object(metric)
 
 
-@tf_export('keras.metrics.deserialize')
+@keras_export('keras.metrics.deserialize')
 def deserialize(config, custom_objects=None):
   return deserialize_keras_object(
       config,
@@ -1590,7 +2787,7 @@ def deserialize(config, custom_objects=None):
       printable_module_name='metric function')
 
 
-@tf_export('keras.metrics.get')
+@keras_export('keras.metrics.get')
 def get(identifier):
   if isinstance(identifier, dict):
     return deserialize(identifier)
diff --git a/tensorflow/python/keras/metrics_confusion_matrix_test.py b/tensorflow/python/keras/metrics_confusion_matrix_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..972f7b6de7bd6a8b856737a57bf79ae58746e758
--- /dev/null
+++ b/tensorflow/python/keras/metrics_confusion_matrix_test.py
@@ -0,0 +1,1131 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Keras metrics functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import json
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras import metrics
+from tensorflow.python.keras.utils import metrics_utils
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class FalsePositivesTest(test.TestCase):
+
+  def test_config(self):
+    fp_obj = metrics.FalsePositives(name='my_fp', thresholds=[0.4, 0.9])
+    self.assertEqual(fp_obj.name, 'my_fp')
+    self.assertEqual(len(fp_obj.variables), 1)
+    self.assertEqual(fp_obj.thresholds, [0.4, 0.9])
+
+    # Check save and restore config
+    fp_obj2 = metrics.FalsePositives.from_config(fp_obj.get_config())
+    self.assertEqual(fp_obj2.name, 'my_fp')
+    self.assertEqual(len(fp_obj2.variables), 1)
+    self.assertEqual(fp_obj2.thresholds, [0.4, 0.9])
+
+  def test_unweighted(self):
+    fp_obj = metrics.FalsePositives()
+    self.evaluate(variables.variables_initializer(fp_obj.variables))
+
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+
+    update_op = fp_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = fp_obj.result()
+    self.assertAllClose(7., result)
+
+  def test_weighted(self):
+    fp_obj = metrics.FalsePositives()
+    self.evaluate(variables.variables_initializer(fp_obj.variables))
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+    sample_weight = constant_op.constant((1., 1.5, 2., 2.5))
+    result = fp_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose(14., self.evaluate(result))
+
+  def test_unweighted_with_thresholds(self):
+    fp_obj = metrics.FalsePositives(thresholds=[0.15, 0.5, 0.85])
+    self.evaluate(variables.variables_initializer(fp_obj.variables))
+
+    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
+                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
+    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
+                                   (1, 1, 1, 1)))
+
+    update_op = fp_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = fp_obj.result()
+    self.assertAllClose([7., 4., 2.], result)
+
+  def test_weighted_with_thresholds(self):
+    fp_obj = metrics.FalsePositives(thresholds=[0.15, 0.5, 0.85])
+    self.evaluate(variables.variables_initializer(fp_obj.variables))
+
+    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
+                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
+    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
+                                   (1, 1, 1, 1)))
+    sample_weight = ((1.0, 2.0, 3.0, 5.0), (7.0, 11.0, 13.0, 17.0),
+                     (19.0, 23.0, 29.0, 31.0), (5.0, 15.0, 10.0, 0))
+
+    result = fp_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose([125., 42., 12.], self.evaluate(result))
+
+  def test_threshold_limit(self):
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'Threshold values must be in \[0, 1\]. Invalid values: \[-1, 2\]'):
+      metrics.FalsePositives(thresholds=[-1, 0.5, 2])
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'Threshold values must be in \[0, 1\]. Invalid values: \[None\]'):
+      metrics.FalsePositives(thresholds=[None])
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class FalseNegativesTest(test.TestCase):
+
+  def test_config(self):
+    fn_obj = metrics.FalseNegatives(name='my_fn', thresholds=[0.4, 0.9])
+    self.assertEqual(fn_obj.name, 'my_fn')
+    self.assertEqual(len(fn_obj.variables), 1)
+    self.assertEqual(fn_obj.thresholds, [0.4, 0.9])
+
+    # Check save and restore config
+    fn_obj2 = metrics.FalseNegatives.from_config(fn_obj.get_config())
+    self.assertEqual(fn_obj2.name, 'my_fn')
+    self.assertEqual(len(fn_obj2.variables), 1)
+    self.assertEqual(fn_obj2.thresholds, [0.4, 0.9])
+
+  def test_unweighted(self):
+    fn_obj = metrics.FalseNegatives()
+    self.evaluate(variables.variables_initializer(fn_obj.variables))
+
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+
+    update_op = fn_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = fn_obj.result()
+    self.assertAllClose(3., result)
+
+  def test_weighted(self):
+    fn_obj = metrics.FalseNegatives()
+    self.evaluate(variables.variables_initializer(fn_obj.variables))
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+    sample_weight = constant_op.constant((1., 1.5, 2., 2.5))
+    result = fn_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose(5., self.evaluate(result))
+
+  def test_unweighted_with_thresholds(self):
+    fn_obj = metrics.FalseNegatives(thresholds=[0.15, 0.5, 0.85])
+    self.evaluate(variables.variables_initializer(fn_obj.variables))
+
+    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
+                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
+    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
+                                   (1, 1, 1, 1)))
+
+    update_op = fn_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = fn_obj.result()
+    self.assertAllClose([1., 4., 6.], result)
+
+  def test_weighted_with_thresholds(self):
+    fn_obj = metrics.FalseNegatives(thresholds=[0.15, 0.5, 0.85])
+    self.evaluate(variables.variables_initializer(fn_obj.variables))
+
+    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
+                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
+    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
+                                   (1, 1, 1, 1)))
+    sample_weight = ((3.0,), (5.0,), (7.0,), (4.0,))
+
+    result = fn_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose([4., 16., 23.], self.evaluate(result))
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class TrueNegativesTest(test.TestCase):
+
+  def test_config(self):
+    tn_obj = metrics.TrueNegatives(name='my_tn', thresholds=[0.4, 0.9])
+    self.assertEqual(tn_obj.name, 'my_tn')
+    self.assertEqual(len(tn_obj.variables), 1)
+    self.assertEqual(tn_obj.thresholds, [0.4, 0.9])
+
+    # Check save and restore config
+    tn_obj2 = metrics.TrueNegatives.from_config(tn_obj.get_config())
+    self.assertEqual(tn_obj2.name, 'my_tn')
+    self.assertEqual(len(tn_obj2.variables), 1)
+    self.assertEqual(tn_obj2.thresholds, [0.4, 0.9])
+
+  def test_unweighted(self):
+    tn_obj = metrics.TrueNegatives()
+    self.evaluate(variables.variables_initializer(tn_obj.variables))
+
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+
+    update_op = tn_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = tn_obj.result()
+    self.assertAllClose(3., result)
+
+  def test_weighted(self):
+    tn_obj = metrics.TrueNegatives()
+    self.evaluate(variables.variables_initializer(tn_obj.variables))
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+    sample_weight = constant_op.constant((1., 1.5, 2., 2.5))
+    result = tn_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose(4., self.evaluate(result))
+
+  def test_unweighted_with_thresholds(self):
+    tn_obj = metrics.TrueNegatives(thresholds=[0.15, 0.5, 0.85])
+    self.evaluate(variables.variables_initializer(tn_obj.variables))
+
+    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
+                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
+    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
+                                   (1, 1, 1, 1)))
+
+    update_op = tn_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = tn_obj.result()
+    self.assertAllClose([2., 5., 7.], result)
+
+  def test_weighted_with_thresholds(self):
+    tn_obj = metrics.TrueNegatives(thresholds=[0.15, 0.5, 0.85])
+    self.evaluate(variables.variables_initializer(tn_obj.variables))
+
+    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
+                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
+    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
+                                   (1, 1, 1, 1)))
+    sample_weight = ((0.0, 2.0, 3.0, 5.0),)
+
+    result = tn_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose([5., 15., 23.], self.evaluate(result))
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class TruePositivesTest(test.TestCase):
+
+  def test_config(self):
+    tp_obj = metrics.TruePositives(name='my_tp', thresholds=[0.4, 0.9])
+    self.assertEqual(tp_obj.name, 'my_tp')
+    self.assertEqual(len(tp_obj.variables), 1)
+    self.assertEqual(tp_obj.thresholds, [0.4, 0.9])
+
+    # Check save and restore config
+    tp_obj2 = metrics.TruePositives.from_config(tp_obj.get_config())
+    self.assertEqual(tp_obj2.name, 'my_tp')
+    self.assertEqual(len(tp_obj2.variables), 1)
+    self.assertEqual(tp_obj2.thresholds, [0.4, 0.9])
+
+  def test_unweighted(self):
+    tp_obj = metrics.TruePositives()
+    self.evaluate(variables.variables_initializer(tp_obj.variables))
+
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+
+    update_op = tp_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = tp_obj.result()
+    self.assertAllClose(7., result)
+
+  def test_weighted(self):
+    tp_obj = metrics.TruePositives()
+    self.evaluate(variables.variables_initializer(tp_obj.variables))
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+    sample_weight = constant_op.constant((1., 1.5, 2., 2.5))
+    result = tp_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose(12., self.evaluate(result))
+
+  def test_unweighted_with_thresholds(self):
+    tp_obj = metrics.TruePositives(thresholds=[0.15, 0.5, 0.85])
+    self.evaluate(variables.variables_initializer(tp_obj.variables))
+
+    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
+                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
+    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
+                                   (1, 1, 1, 1)))
+
+    update_op = tp_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = tp_obj.result()
+    self.assertAllClose([6., 3., 1.], result)
+
+  def test_weighted_with_thresholds(self):
+    tp_obj = metrics.TruePositives(thresholds=[0.15, 0.5, 0.85])
+    self.evaluate(variables.variables_initializer(tp_obj.variables))
+
+    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
+                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
+    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
+                                   (1, 1, 1, 1)))
+
+    result = tp_obj(y_true, y_pred, sample_weight=37.)
+    self.assertAllClose([222., 111., 37.], self.evaluate(result))
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class PrecisionTest(test.TestCase):
+
+  def test_config(self):
+    p_obj = metrics.Precision(
+        name='my_precision', thresholds=[0.4, 0.9], top_k=15, class_id=12)
+    self.assertEqual(p_obj.name, 'my_precision')
+    self.assertEqual(len(p_obj.variables), 2)
+    self.assertEqual([v.name for v in p_obj.variables],
+                     ['true_positives:0', 'false_positives:0'])
+    self.assertEqual(p_obj.thresholds, [0.4, 0.9])
+    self.assertEqual(p_obj.top_k, 15)
+    self.assertEqual(p_obj.class_id, 12)
+
+    # Check save and restore config
+    p_obj2 = metrics.Precision.from_config(p_obj.get_config())
+    self.assertEqual(p_obj2.name, 'my_precision')
+    self.assertEqual(len(p_obj2.variables), 2)
+    self.assertEqual(p_obj2.thresholds, [0.4, 0.9])
+    self.assertEqual(p_obj2.top_k, 15)
+    self.assertEqual(p_obj2.class_id, 12)
+
+  def test_value_is_idempotent(self):
+    p_obj = metrics.Precision(thresholds=[0.3, 0.72])
+    y_pred = random_ops.random_uniform(shape=(10, 3))
+    y_true = random_ops.random_uniform(shape=(10, 3))
+    update_op = p_obj.update_state(y_true, y_pred)
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+
+    # Run several updates.
+    for _ in range(10):
+      self.evaluate(update_op)
+
+    # Then verify idempotency.
+    initial_precision = self.evaluate(p_obj.result())
+    for _ in range(10):
+      self.assertArrayNear(initial_precision, self.evaluate(p_obj.result()),
+                           1e-3)
+
+  def test_unweighted(self):
+    p_obj = metrics.Precision()
+    y_pred = constant_op.constant([1, 0, 1, 0], shape=(1, 4))
+    y_true = constant_op.constant([0, 1, 1, 0], shape=(1, 4))
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+    result = p_obj(y_true, y_pred)
+    self.assertAlmostEqual(0.5, self.evaluate(result))
+
+  def test_unweighted_all_incorrect(self):
+    p_obj = metrics.Precision(thresholds=[0.5])
+    inputs = np.random.randint(0, 2, size=(100, 1))
+    y_pred = constant_op.constant(inputs)
+    y_true = constant_op.constant(1 - inputs)
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+    result = p_obj(y_true, y_pred)
+    self.assertAlmostEqual(0, self.evaluate(result))
+
+  def test_weighted(self):
+    p_obj = metrics.Precision()
+    y_pred = constant_op.constant([[1, 0, 1, 0], [1, 0, 1, 0]])
+    y_true = constant_op.constant([[0, 1, 1, 0], [1, 0, 0, 1]])
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+    result = p_obj(
+        y_true,
+        y_pred,
+        sample_weight=constant_op.constant([[1, 2, 3, 4], [4, 3, 2, 1]]))
+    weighted_tp = 3.0 + 4.0
+    weighted_positives = (1.0 + 3.0) + (4.0 + 2.0)
+    expected_precision = weighted_tp / weighted_positives
+    self.assertAlmostEqual(expected_precision, self.evaluate(result))
+
+  def test_div_by_zero(self):
+    p_obj = metrics.Precision()
+    y_pred = constant_op.constant([0, 0, 0, 0])
+    y_true = constant_op.constant([0, 0, 0, 0])
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+    result = p_obj(y_true, y_pred)
+    self.assertEqual(0, self.evaluate(result))
+
+  def test_unweighted_with_threshold(self):
+    p_obj = metrics.Precision(thresholds=[0.5, 0.7])
+    y_pred = constant_op.constant([1, 0, 0.6, 0], shape=(1, 4))
+    y_true = constant_op.constant([0, 1, 1, 0], shape=(1, 4))
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+    result = p_obj(y_true, y_pred)
+    self.assertArrayNear([0.5, 0.], self.evaluate(result), 0)
+
+  def test_weighted_with_threshold(self):
+    p_obj = metrics.Precision(thresholds=[0.5, 1.])
+    y_true = constant_op.constant([[0, 1], [1, 0]], shape=(2, 2))
+    y_pred = constant_op.constant([[1, 0], [0.6, 0]],
+                                  shape=(2, 2),
+                                  dtype=dtypes.float32)
+    weights = constant_op.constant([[4, 0], [3, 1]],
+                                   shape=(2, 2),
+                                   dtype=dtypes.float32)
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+    result = p_obj(y_true, y_pred, sample_weight=weights)
+    weighted_tp = 0 + 3.
+    weighted_positives = (0 + 3.) + (4. + 0.)
+    expected_precision = weighted_tp / weighted_positives
+    self.assertArrayNear([expected_precision, 0], self.evaluate(result), 1e-3)
+
+  def test_multiple_updates(self):
+    p_obj = metrics.Precision(thresholds=[0.5, 1.])
+    y_true = constant_op.constant([[0, 1], [1, 0]], shape=(2, 2))
+    y_pred = constant_op.constant([[1, 0], [0.6, 0]],
+                                  shape=(2, 2),
+                                  dtype=dtypes.float32)
+    weights = constant_op.constant([[4, 0], [3, 1]],
+                                   shape=(2, 2),
+                                   dtype=dtypes.float32)
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+    update_op = p_obj.update_state(y_true, y_pred, sample_weight=weights)
+    for _ in range(2):
+      self.evaluate(update_op)
+
+    weighted_tp = (0 + 3.) + (0 + 3.)
+    weighted_positives = ((0 + 3.) + (4. + 0.)) + ((0 + 3.) + (4. + 0.))
+    expected_precision = weighted_tp / weighted_positives
+    self.assertArrayNear([expected_precision, 0], self.evaluate(p_obj.result()),
+                         1e-3)
+
+  def test_unweighted_top_k(self):
+    p_obj = metrics.Precision(top_k=3)
+    y_pred = constant_op.constant([0.2, 0.1, 0.5, 0, 0.2], shape=(1, 5))
+    y_true = constant_op.constant([0, 1, 1, 0, 0], shape=(1, 5))
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+    result = p_obj(y_true, y_pred)
+    self.assertAlmostEqual(1. / 3, self.evaluate(result))
+
+  def test_weighted_top_k(self):
+    p_obj = metrics.Precision(top_k=3)
+    y_pred1 = constant_op.constant([0.2, 0.1, 0.4, 0, 0.2], shape=(1, 5))
+    y_true1 = constant_op.constant([0, 1, 1, 0, 1], shape=(1, 5))
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+    self.evaluate(
+        p_obj(
+            y_true1,
+            y_pred1,
+            sample_weight=constant_op.constant([[1, 4, 2, 3, 5]])))
+
+    y_pred2 = constant_op.constant([0.2, 0.6, 0.4, 0.2, 0.2], shape=(1, 5))
+    y_true2 = constant_op.constant([1, 0, 1, 1, 1], shape=(1, 5))
+    result = p_obj(y_true2, y_pred2, sample_weight=constant_op.constant(3))
+
+    tp = (2 + 5) + (3 + 3)
+    predicted_positives = (1 + 2 + 5) + (3 + 3 + 3)
+    expected_precision = tp / predicted_positives
+    self.assertAlmostEqual(expected_precision, self.evaluate(result))
+
+  def test_unweighted_class_id(self):
+    p_obj = metrics.Precision(class_id=2)
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+
+    y_pred = constant_op.constant([0.2, 0.1, 0.6, 0, 0.2], shape=(1, 5))
+    y_true = constant_op.constant([0, 1, 1, 0, 0], shape=(1, 5))
+    result = p_obj(y_true, y_pred)
+    self.assertAlmostEqual(1, self.evaluate(result))
+    self.assertAlmostEqual(1, self.evaluate(p_obj.true_positives))
+    self.assertAlmostEqual(0, self.evaluate(p_obj.false_positives))
+
+    y_pred = constant_op.constant([0.2, 0.1, 0, 0, 0.2], shape=(1, 5))
+    y_true = constant_op.constant([0, 1, 1, 0, 0], shape=(1, 5))
+    result = p_obj(y_true, y_pred)
+    self.assertAlmostEqual(1, self.evaluate(result))
+    self.assertAlmostEqual(1, self.evaluate(p_obj.true_positives))
+    self.assertAlmostEqual(0, self.evaluate(p_obj.false_positives))
+
+    y_pred = constant_op.constant([0.2, 0.1, 0.6, 0, 0.2], shape=(1, 5))
+    y_true = constant_op.constant([0, 1, 0, 0, 0], shape=(1, 5))
+    result = p_obj(y_true, y_pred)
+    self.assertAlmostEqual(0.5, self.evaluate(result))
+    self.assertAlmostEqual(1, self.evaluate(p_obj.true_positives))
+    self.assertAlmostEqual(1, self.evaluate(p_obj.false_positives))
+
+  def test_unweighted_top_k_and_class_id(self):
+    p_obj = metrics.Precision(class_id=2, top_k=2)
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+
+    y_pred = constant_op.constant([0.2, 0.6, 0.3, 0, 0.2], shape=(1, 5))
+    y_true = constant_op.constant([0, 1, 1, 0, 0], shape=(1, 5))
+    result = p_obj(y_true, y_pred)
+    self.assertAlmostEqual(1, self.evaluate(result))
+    self.assertAlmostEqual(1, self.evaluate(p_obj.true_positives))
+    self.assertAlmostEqual(0, self.evaluate(p_obj.false_positives))
+
+    y_pred = constant_op.constant([1, 1, 0.9, 1, 1], shape=(1, 5))
+    y_true = constant_op.constant([0, 1, 1, 0, 0], shape=(1, 5))
+    result = p_obj(y_true, y_pred)
+    self.assertAlmostEqual(1, self.evaluate(result))
+    self.assertAlmostEqual(1, self.evaluate(p_obj.true_positives))
+    self.assertAlmostEqual(0, self.evaluate(p_obj.false_positives))
+
+  def test_unweighted_top_k_and_threshold(self):
+    p_obj = metrics.Precision(thresholds=.7, top_k=2)
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+
+    y_pred = constant_op.constant([0.2, 0.8, 0.6, 0, 0.2], shape=(1, 5))
+    y_true = constant_op.constant([0, 1, 1, 0, 1], shape=(1, 5))
+    result = p_obj(y_true, y_pred)
+    self.assertAlmostEqual(1, self.evaluate(result))
+    self.assertAlmostEqual(1, self.evaluate(p_obj.true_positives))
+    self.assertAlmostEqual(0, self.evaluate(p_obj.false_positives))
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RecallTest(test.TestCase):
+
+  def test_config(self):
+    r_obj = metrics.Recall(
+        name='my_recall', thresholds=[0.4, 0.9], top_k=15, class_id=12)
+    self.assertEqual(r_obj.name, 'my_recall')
+    self.assertEqual(len(r_obj.variables), 2)
+    self.assertEqual([v.name for v in r_obj.variables],
+                     ['true_positives:0', 'false_negatives:0'])
+    self.assertEqual(r_obj.thresholds, [0.4, 0.9])
+    self.assertEqual(r_obj.top_k, 15)
+    self.assertEqual(r_obj.class_id, 12)
+
+    # Check save and restore config
+    r_obj2 = metrics.Recall.from_config(r_obj.get_config())
+    self.assertEqual(r_obj2.name, 'my_recall')
+    self.assertEqual(len(r_obj2.variables), 2)
+    self.assertEqual(r_obj2.thresholds, [0.4, 0.9])
+    self.assertEqual(r_obj2.top_k, 15)
+    self.assertEqual(r_obj2.class_id, 12)
+
+  def test_value_is_idempotent(self):
+    r_obj = metrics.Recall(thresholds=[0.3, 0.72])
+    y_pred = random_ops.random_uniform(shape=(10, 3))
+    y_true = random_ops.random_uniform(shape=(10, 3))
+    update_op = r_obj.update_state(y_true, y_pred)
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+
+    # Run several updates.
+    for _ in range(10):
+      self.evaluate(update_op)
+
+    # Then verify idempotency.
+    initial_recall = self.evaluate(r_obj.result())
+    for _ in range(10):
+      self.assertArrayNear(initial_recall, self.evaluate(r_obj.result()), 1e-3)
+
+  def test_unweighted(self):
+    r_obj = metrics.Recall()
+    y_pred = constant_op.constant([1, 0, 1, 0], shape=(1, 4))
+    y_true = constant_op.constant([0, 1, 1, 0], shape=(1, 4))
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+    result = r_obj(y_true, y_pred)
+    self.assertAlmostEqual(0.5, self.evaluate(result))
+
+  def test_unweighted_all_incorrect(self):
+    r_obj = metrics.Recall(thresholds=[0.5])
+    inputs = np.random.randint(0, 2, size=(100, 1))
+    y_pred = constant_op.constant(inputs)
+    y_true = constant_op.constant(1 - inputs)
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+    result = r_obj(y_true, y_pred)
+    self.assertAlmostEqual(0, self.evaluate(result))
+
+  def test_weighted(self):
+    r_obj = metrics.Recall()
+    y_pred = constant_op.constant([[1, 0, 1, 0], [0, 1, 0, 1]])
+    y_true = constant_op.constant([[0, 1, 1, 0], [1, 0, 0, 1]])
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+    result = r_obj(
+        y_true,
+        y_pred,
+        sample_weight=constant_op.constant([[1, 2, 3, 4], [4, 3, 2, 1]]))
+    weighted_tp = 3.0 + 1.0
+    weighted_t = (2.0 + 3.0) + (4.0 + 1.0)
+    expected_recall = weighted_tp / weighted_t
+    self.assertAlmostEqual(expected_recall, self.evaluate(result))
+
+  def test_div_by_zero(self):
+    r_obj = metrics.Recall()
+    y_pred = constant_op.constant([0, 0, 0, 0])
+    y_true = constant_op.constant([0, 0, 0, 0])
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+    result = r_obj(y_true, y_pred)
+    self.assertEqual(0, self.evaluate(result))
+
+  def test_unweighted_with_threshold(self):
+    r_obj = metrics.Recall(thresholds=[0.5, 0.7])
+    y_pred = constant_op.constant([1, 0, 0.6, 0], shape=(1, 4))
+    y_true = constant_op.constant([0, 1, 1, 0], shape=(1, 4))
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+    result = r_obj(y_true, y_pred)
+    self.assertArrayNear([0.5, 0.], self.evaluate(result), 0)
+
+  def test_weighted_with_threshold(self):
+    r_obj = metrics.Recall(thresholds=[0.5, 1.])
+    y_true = constant_op.constant([[0, 1], [1, 0]], shape=(2, 2))
+    y_pred = constant_op.constant([[1, 0], [0.6, 0]],
+                                  shape=(2, 2),
+                                  dtype=dtypes.float32)
+    weights = constant_op.constant([[1, 4], [3, 2]],
+                                   shape=(2, 2),
+                                   dtype=dtypes.float32)
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+    result = r_obj(y_true, y_pred, sample_weight=weights)
+    weighted_tp = 0 + 3.
+    weighted_positives = (0 + 3.) + (4. + 0.)
+    expected_recall = weighted_tp / weighted_positives
+    self.assertArrayNear([expected_recall, 0], self.evaluate(result), 1e-3)
+
+  def test_multiple_updates(self):
+    r_obj = metrics.Recall(thresholds=[0.5, 1.])
+    y_true = constant_op.constant([[0, 1], [1, 0]], shape=(2, 2))
+    y_pred = constant_op.constant([[1, 0], [0.6, 0]],
+                                  shape=(2, 2),
+                                  dtype=dtypes.float32)
+    weights = constant_op.constant([[1, 4], [3, 2]],
+                                   shape=(2, 2),
+                                   dtype=dtypes.float32)
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+    update_op = r_obj.update_state(y_true, y_pred, sample_weight=weights)
+    for _ in range(2):
+      self.evaluate(update_op)
+
+    weighted_tp = (0 + 3.) + (0 + 3.)
+    weighted_positives = ((0 + 3.) + (4. + 0.)) + ((0 + 3.) + (4. + 0.))
+    expected_recall = weighted_tp / weighted_positives
+    self.assertArrayNear([expected_recall, 0], self.evaluate(r_obj.result()),
+                         1e-3)
+
+  def test_unweighted_top_k(self):
+    r_obj = metrics.Recall(top_k=3)
+    y_pred = constant_op.constant([0.2, 0.1, 0.5, 0, 0.2], shape=(1, 5))
+    y_true = constant_op.constant([0, 1, 1, 0, 0], shape=(1, 5))
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+    result = r_obj(y_true, y_pred)
+    self.assertAlmostEqual(0.5, self.evaluate(result))
+
+  def test_weighted_top_k(self):
+    r_obj = metrics.Recall(top_k=3)
+    y_pred1 = constant_op.constant([0.2, 0.1, 0.4, 0, 0.2], shape=(1, 5))
+    y_true1 = constant_op.constant([0, 1, 1, 0, 1], shape=(1, 5))
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+    self.evaluate(
+        r_obj(
+            y_true1,
+            y_pred1,
+            sample_weight=constant_op.constant([[1, 4, 2, 3, 5]])))
+
+    y_pred2 = constant_op.constant([0.2, 0.6, 0.4, 0.2, 0.2], shape=(1, 5))
+    y_true2 = constant_op.constant([1, 0, 1, 1, 1], shape=(1, 5))
+    result = r_obj(y_true2, y_pred2, sample_weight=constant_op.constant(3))
+
+    tp = (2 + 5) + (3 + 3)
+    positives = (4 + 2 + 5) + (3 + 3 + 3 + 3)
+    expected_recall = tp / positives
+    self.assertAlmostEqual(expected_recall, self.evaluate(result))
+
+  def test_unweighted_class_id(self):
+    r_obj = metrics.Recall(class_id=2)
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+
+    y_pred = constant_op.constant([0.2, 0.1, 0.6, 0, 0.2], shape=(1, 5))
+    y_true = constant_op.constant([0, 1, 1, 0, 0], shape=(1, 5))
+    result = r_obj(y_true, y_pred)
+    self.assertAlmostEqual(1, self.evaluate(result))
+    self.assertAlmostEqual(1, self.evaluate(r_obj.true_positives))
+    self.assertAlmostEqual(0, self.evaluate(r_obj.false_negatives))
+
+    y_pred = constant_op.constant([0.2, 0.1, 0, 0, 0.2], shape=(1, 5))
+    y_true = constant_op.constant([0, 1, 1, 0, 0], shape=(1, 5))
+    result = r_obj(y_true, y_pred)
+    self.assertAlmostEqual(0.5, self.evaluate(result))
+    self.assertAlmostEqual(1, self.evaluate(r_obj.true_positives))
+    self.assertAlmostEqual(1, self.evaluate(r_obj.false_negatives))
+
+    y_pred = constant_op.constant([0.2, 0.1, 0.6, 0, 0.2], shape=(1, 5))
+    y_true = constant_op.constant([0, 1, 0, 0, 0], shape=(1, 5))
+    result = r_obj(y_true, y_pred)
+    self.assertAlmostEqual(0.5, self.evaluate(result))
+    self.assertAlmostEqual(1, self.evaluate(r_obj.true_positives))
+    self.assertAlmostEqual(1, self.evaluate(r_obj.false_negatives))
+
+  def test_unweighted_top_k_and_class_id(self):
+    r_obj = metrics.Recall(class_id=2, top_k=2)
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+
+    y_pred = constant_op.constant([0.2, 0.6, 0.3, 0, 0.2], shape=(1, 5))
+    y_true = constant_op.constant([0, 1, 1, 0, 0], shape=(1, 5))
+    result = r_obj(y_true, y_pred)
+    self.assertAlmostEqual(1, self.evaluate(result))
+    self.assertAlmostEqual(1, self.evaluate(r_obj.true_positives))
+    self.assertAlmostEqual(0, self.evaluate(r_obj.false_negatives))
+
+    y_pred = constant_op.constant([1, 1, 0.9, 1, 1], shape=(1, 5))
+    y_true = constant_op.constant([0, 1, 1, 0, 0], shape=(1, 5))
+    result = r_obj(y_true, y_pred)
+    self.assertAlmostEqual(0.5, self.evaluate(result))
+    self.assertAlmostEqual(1, self.evaluate(r_obj.true_positives))
+    self.assertAlmostEqual(1, self.evaluate(r_obj.false_negatives))
+
+  def test_unweighted_top_k_and_threshold(self):
+    r_obj = metrics.Recall(thresholds=.7, top_k=2)
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+
+    y_pred = constant_op.constant([0.2, 0.8, 0.6, 0, 0.2], shape=(1, 5))
+    y_true = constant_op.constant([1, 1, 1, 0, 1], shape=(1, 5))
+    result = r_obj(y_true, y_pred)
+    self.assertAlmostEqual(0.25, self.evaluate(result))
+    self.assertAlmostEqual(1, self.evaluate(r_obj.true_positives))
+    self.assertAlmostEqual(3, self.evaluate(r_obj.false_negatives))
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class SensitivityAtSpecificityTest(test.TestCase, parameterized.TestCase):
+
+  def test_config(self):
+    s_obj = metrics.SensitivityAtSpecificity(
+        0.4, num_thresholds=100, name='sensitivity_at_specificity_1')
+    self.assertEqual(s_obj.name, 'sensitivity_at_specificity_1')
+    self.assertLen(s_obj.variables, 4)
+    self.assertEqual(s_obj.specificity, 0.4)
+    self.assertEqual(s_obj.num_thresholds, 100)
+
+    # Check save and restore config
+    s_obj2 = metrics.SensitivityAtSpecificity.from_config(s_obj.get_config())
+    self.assertEqual(s_obj2.name, 'sensitivity_at_specificity_1')
+    self.assertLen(s_obj2.variables, 4)
+    self.assertEqual(s_obj2.specificity, 0.4)
+    self.assertEqual(s_obj2.num_thresholds, 100)
+
+  def test_value_is_idempotent(self):
+    s_obj = metrics.SensitivityAtSpecificity(0.7)
+    y_pred = random_ops.random_uniform((10, 3),
+                                       maxval=1,
+                                       dtype=dtypes.float32,
+                                       seed=1)
+    y_true = random_ops.random_uniform((10, 3),
+                                       maxval=2,
+                                       dtype=dtypes.int64,
+                                       seed=1)
+    update_op = s_obj.update_state(y_true, y_pred)
+    self.evaluate(variables.variables_initializer(s_obj.variables))
+
+    # Run several updates.
+    for _ in range(10):
+      self.evaluate(update_op)
+
+    # Then verify idempotency.
+    initial_sensitivity = self.evaluate(s_obj.result())
+    for _ in range(10):
+      self.assertAlmostEqual(initial_sensitivity, self.evaluate(s_obj.result()),
+                             1e-3)
+
+  def test_unweighted_all_correct(self):
+    s_obj = metrics.SensitivityAtSpecificity(0.7)
+    inputs = np.random.randint(0, 2, size=(100, 1))
+    y_pred = constant_op.constant(inputs, dtype=dtypes.float32)
+    y_true = constant_op.constant(inputs)
+    self.evaluate(variables.variables_initializer(s_obj.variables))
+    result = s_obj(y_true, y_pred)
+    self.assertAlmostEqual(1, self.evaluate(result))
+
+  def test_unweighted_high_specificity(self):
+    s_obj = metrics.SensitivityAtSpecificity(0.8)
+    pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.1, 0.45, 0.5, 0.8, 0.9]
+    label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+
+    y_pred = constant_op.constant(pred_values, dtype=dtypes.float32)
+    y_true = constant_op.constant(label_values)
+    self.evaluate(variables.variables_initializer(s_obj.variables))
+    result = s_obj(y_true, y_pred)
+    self.assertAlmostEqual(0.8, self.evaluate(result))
+
+  def test_unweighted_low_specificity(self):
+    s_obj = metrics.SensitivityAtSpecificity(0.4)
+    pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26]
+    label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+
+    y_pred = constant_op.constant(pred_values, dtype=dtypes.float32)
+    y_true = constant_op.constant(label_values)
+    self.evaluate(variables.variables_initializer(s_obj.variables))
+    result = s_obj(y_true, y_pred)
+    self.assertAlmostEqual(0.6, self.evaluate(result))
+
+  @parameterized.parameters([dtypes.bool, dtypes.int32, dtypes.float32])
+  def test_weighted(self, label_dtype):
+    s_obj = metrics.SensitivityAtSpecificity(0.4)
+    pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26]
+    label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+    weight_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+    y_pred = constant_op.constant(pred_values, dtype=dtypes.float32)
+    y_true = math_ops.cast(label_values, dtype=label_dtype)
+    weights = constant_op.constant(weight_values)
+    self.evaluate(variables.variables_initializer(s_obj.variables))
+    result = s_obj(y_true, y_pred, sample_weight=weights)
+    self.assertAlmostEqual(0.675, self.evaluate(result))
+
+  def test_invalid_specificity(self):
+    with self.assertRaisesRegexp(
+        ValueError, r'`specificity` must be in the range \[0, 1\].'):
+      metrics.SensitivityAtSpecificity(-1)
+
+  def test_invalid_num_thresholds(self):
+    with self.assertRaisesRegexp(ValueError, '`num_thresholds` must be > 0.'):
+      metrics.SensitivityAtSpecificity(0.4, num_thresholds=-1)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class SpecificityAtSensitivityTest(test.TestCase, parameterized.TestCase):
+
+  def test_config(self):
+    s_obj = metrics.SpecificityAtSensitivity(
+        0.4, num_thresholds=100, name='specificity_at_sensitivity_1')
+    self.assertEqual(s_obj.name, 'specificity_at_sensitivity_1')
+    self.assertLen(s_obj.variables, 4)
+    self.assertEqual(s_obj.sensitivity, 0.4)
+    self.assertEqual(s_obj.num_thresholds, 100)
+
+    # Check save and restore config
+    s_obj2 = metrics.SpecificityAtSensitivity.from_config(s_obj.get_config())
+    self.assertEqual(s_obj2.name, 'specificity_at_sensitivity_1')
+    self.assertLen(s_obj2.variables, 4)
+    self.assertEqual(s_obj2.sensitivity, 0.4)
+    self.assertEqual(s_obj2.num_thresholds, 100)
+
+  def test_value_is_idempotent(self):
+    s_obj = metrics.SpecificityAtSensitivity(0.7)
+    y_pred = random_ops.random_uniform((10, 3),
+                                       maxval=1,
+                                       dtype=dtypes.float32,
+                                       seed=1)
+    y_true = random_ops.random_uniform((10, 3),
+                                       maxval=2,
+                                       dtype=dtypes.int64,
+                                       seed=1)
+    update_op = s_obj.update_state(y_true, y_pred)
+    self.evaluate(variables.variables_initializer(s_obj.variables))
+
+    # Run several updates.
+    for _ in range(10):
+      self.evaluate(update_op)
+
+    # Then verify idempotency.
+    initial_specificity = self.evaluate(s_obj.result())
+    for _ in range(10):
+      self.assertAlmostEqual(initial_specificity, self.evaluate(s_obj.result()),
+                             1e-3)
+
+  def test_unweighted_all_correct(self):
+    s_obj = metrics.SpecificityAtSensitivity(0.7)
+    inputs = np.random.randint(0, 2, size=(100, 1))
+    y_pred = constant_op.constant(inputs, dtype=dtypes.float32)
+    y_true = constant_op.constant(inputs)
+    self.evaluate(variables.variables_initializer(s_obj.variables))
+    result = s_obj(y_true, y_pred)
+    self.assertAlmostEqual(1, self.evaluate(result))
+
+  def test_unweighted_high_sensitivity(self):
+    s_obj = metrics.SpecificityAtSensitivity(0.8)
+    pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.1, 0.45, 0.5, 0.8, 0.9]
+    label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+
+    y_pred = constant_op.constant(pred_values, dtype=dtypes.float32)
+    y_true = constant_op.constant(label_values)
+    self.evaluate(variables.variables_initializer(s_obj.variables))
+    result = s_obj(y_true, y_pred)
+    self.assertAlmostEqual(0.4, self.evaluate(result))
+
+  def test_unweighted_low_sensitivity(self):
+    s_obj = metrics.SpecificityAtSensitivity(0.4)
+    pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26]
+    label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+
+    y_pred = constant_op.constant(pred_values, dtype=dtypes.float32)
+    y_true = constant_op.constant(label_values)
+    self.evaluate(variables.variables_initializer(s_obj.variables))
+    result = s_obj(y_true, y_pred)
+    self.assertAlmostEqual(0.6, self.evaluate(result))
+
+  @parameterized.parameters([dtypes.bool, dtypes.int32, dtypes.float32])
+  def test_weighted(self, label_dtype):
+    s_obj = metrics.SpecificityAtSensitivity(0.4)
+    pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26]
+    label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+    weight_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+    y_pred = constant_op.constant(pred_values, dtype=dtypes.float32)
+    y_true = math_ops.cast(label_values, dtype=label_dtype)
+    weights = constant_op.constant(weight_values)
+    self.evaluate(variables.variables_initializer(s_obj.variables))
+    result = s_obj(y_true, y_pred, sample_weight=weights)
+    self.assertAlmostEqual(0.4, self.evaluate(result))
+
+  def test_invalid_sensitivity(self):
+    with self.assertRaisesRegexp(
+        ValueError, r'`sensitivity` must be in the range \[0, 1\].'):
+      metrics.SpecificityAtSensitivity(-1)
+
+  def test_invalid_num_thresholds(self):
+    with self.assertRaisesRegexp(ValueError, '`num_thresholds` must be > 0.'):
+      metrics.SpecificityAtSensitivity(0.4, num_thresholds=-1)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class AUCTest(test.TestCase):
+
+  def setup(self):
+    self.num_thresholds = 3
+    self.y_pred = constant_op.constant([0, 0.5, 0.3, 0.9], dtype=dtypes.float32)
+    self.y_true = constant_op.constant([0, 0, 1, 1])
+    self.sample_weight = [1, 2, 3, 4]
+
+    # threshold values are [0 - 1e-7, 0.5, 1 + 1e-7]
+    # y_pred when threshold = 0 - 1e-7  : [1, 1, 1, 1]
+    # y_pred when threshold = 0.5       : [0, 0, 0, 1]
+    # y_pred when threshold = 1 + 1e-7  : [0, 0, 0, 0]
+
+    # without sample_weight:
+    # tp = np.sum([[0, 0, 1, 1], [0, 0, 0, 1], [0, 0, 0, 0]], axis=1)
+    # fp = np.sum([[1, 1, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], axis=1)
+    # fn = np.sum([[0, 0, 0, 0], [0, 0, 1, 0], [0, 0, 1, 1]], axis=1)
+    # tn = np.sum([[0, 0, 0, 0], [1, 1, 0, 0], [1, 1, 0, 0]], axis=1)
+
+    # tp = [2, 1, 0], fp = [2, 0, 0], fn = [0, 1, 2], tn = [0, 2, 2]
+
+    # with sample_weight:
+    # tp = np.sum([[0, 0, 3, 4], [0, 0, 0, 4], [0, 0, 0, 0]], axis=1)
+    # fp = np.sum([[1, 2, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], axis=1)
+    # fn = np.sum([[0, 0, 0, 0], [0, 0, 3, 0], [0, 0, 3, 4]], axis=1)
+    # tn = np.sum([[0, 0, 0, 0], [1, 2, 0, 0], [1, 2, 0, 0]], axis=1)
+
+    # tp = [7, 4, 0], fp = [3, 0, 0], fn = [0, 3, 7], tn = [0, 3, 3]
+
+  def test_config(self):
+    auc_obj = metrics.AUC(
+        num_thresholds=100,
+        curve='PR',
+        summation_method='majoring',
+        name='auc_1')
+    self.assertEqual(auc_obj.name, 'auc_1')
+    self.assertEqual(len(auc_obj.variables), 4)
+    self.assertEqual(auc_obj.num_thresholds, 100)
+    self.assertEqual(auc_obj.curve, metrics_utils.AUCCurve.PR)
+    self.assertEqual(auc_obj.summation_method,
+                     metrics_utils.AUCSummationMethod.MAJORING)
+    old_config = auc_obj.get_config()
+    self.assertDictEqual(old_config, json.loads(json.dumps(old_config)))
+
+    # Check save and restore config
+    auc_obj2 = metrics.AUC.from_config(auc_obj.get_config())
+    self.assertEqual(auc_obj2.name, 'auc_1')
+    self.assertEqual(len(auc_obj2.variables), 4)
+    self.assertEqual(auc_obj2.num_thresholds, 100)
+    self.assertEqual(auc_obj2.curve, metrics_utils.AUCCurve.PR)
+    self.assertEqual(auc_obj2.summation_method,
+                     metrics_utils.AUCSummationMethod.MAJORING)
+    new_config = auc_obj2.get_config()
+    self.assertDictEqual(old_config, new_config)
+
+  def test_value_is_idempotent(self):
+    self.setup()
+    auc_obj = metrics.AUC(num_thresholds=3)
+    self.evaluate(variables.variables_initializer(auc_obj.variables))
+
+    # Run several updates.
+    update_op = auc_obj.update_state(self.y_true, self.y_pred)
+    for _ in range(10):
+      self.evaluate(update_op)
+
+    # Then verify idempotency.
+    initial_auc = self.evaluate(auc_obj.result())
+    for _ in range(10):
+      self.assertAllClose(initial_auc, self.evaluate(auc_obj.result()), 1e-3)
+
+  def test_unweighted_all_correct(self):
+    self.setup()
+    auc_obj = metrics.AUC()
+    self.evaluate(variables.variables_initializer(auc_obj.variables))
+    result = auc_obj(self.y_true, self.y_true)
+    self.assertEqual(self.evaluate(result), 1)
+
+  def test_unweighted(self):
+    self.setup()
+    auc_obj = metrics.AUC(num_thresholds=self.num_thresholds)
+    self.evaluate(variables.variables_initializer(auc_obj.variables))
+    result = auc_obj(self.y_true, self.y_pred)
+
+    # tp = [2, 1, 0], fp = [2, 0, 0], fn = [0, 1, 2], tn = [0, 2, 2]
+    # recall = [2/2, 1/(1+1), 0] = [1, 0.5, 0]
+    # fp_rate = [2/2, 0, 0] = [1, 0, 0]
+    # heights = [(1 + 0.5)/2, (0.5 + 0)/2] = [0.75, 0.25]
+    # widths = [(1 - 0), (0 - 0)] = [1, 0]
+    expected_result = (0.75 * 1 + 0.25 * 0)
+    self.assertAllClose(self.evaluate(result), expected_result, 1e-3)
+
+  def test_weighted_roc_interpolation(self):
+    self.setup()
+    auc_obj = metrics.AUC(num_thresholds=self.num_thresholds)
+    self.evaluate(variables.variables_initializer(auc_obj.variables))
+    result = auc_obj(self.y_true, self.y_pred, sample_weight=self.sample_weight)
+
+    # tp = [7, 4, 0], fp = [3, 0, 0], fn = [0, 3, 7], tn = [0, 3, 3]
+    # recall = [7/7, 4/(4+3), 0] = [1, 0.571, 0]
+    # fp_rate = [3/3, 0, 0] = [1, 0, 0]
+    # heights = [(1 + 0.571)/2, (0.571 + 0)/2] = [0.7855, 0.2855]
+    # widths = [(1 - 0), (0 - 0)] = [1, 0]
+    expected_result = (0.7855 * 1 + 0.2855 * 0)
+    self.assertAllClose(self.evaluate(result), expected_result, 1e-3)
+
+  def test_weighted_roc_majoring(self):
+    self.setup()
+    auc_obj = metrics.AUC(
+        num_thresholds=self.num_thresholds, summation_method='majoring')
+    self.evaluate(variables.variables_initializer(auc_obj.variables))
+    result = auc_obj(self.y_true, self.y_pred, sample_weight=self.sample_weight)
+
+    # tp = [7, 4, 0], fp = [3, 0, 0], fn = [0, 3, 7], tn = [0, 3, 3]
+    # recall = [7/7, 4/(4+3), 0] = [1, 0.571, 0]
+    # fp_rate = [3/3, 0, 0] = [1, 0, 0]
+    # heights = [max(1, 0.571), max(0.571, 0)] = [1, 0.571]
+    # widths = [(1 - 0), (0 - 0)] = [1, 0]
+    expected_result = (1 * 1 + 0.571 * 0)
+    self.assertAllClose(self.evaluate(result), expected_result, 1e-3)
+
+  def test_weighted_roc_minoring(self):
+    self.setup()
+    auc_obj = metrics.AUC(
+        num_thresholds=self.num_thresholds, summation_method='minoring')
+    self.evaluate(variables.variables_initializer(auc_obj.variables))
+    result = auc_obj(self.y_true, self.y_pred, sample_weight=self.sample_weight)
+
+    # tp = [7, 4, 0], fp = [3, 0, 0], fn = [0, 3, 7], tn = [0, 3, 3]
+    # recall = [7/7, 4/(4+3), 0] = [1, 0.571, 0]
+    # fp_rate = [3/3, 0, 0] = [1, 0, 0]
+    # heights = [min(1, 0.571), min(0.571, 0)] = [0.571, 0]
+    # widths = [(1 - 0), (0 - 0)] = [1, 0]
+    expected_result = (0.571 * 1 + 0 * 0)
+    self.assertAllClose(self.evaluate(result), expected_result, 1e-3)
+
+  def test_weighted_pr_majoring(self):
+    self.setup()
+    auc_obj = metrics.AUC(
+        num_thresholds=self.num_thresholds,
+        curve='PR',
+        summation_method='majoring')
+    self.evaluate(variables.variables_initializer(auc_obj.variables))
+    result = auc_obj(self.y_true, self.y_pred, sample_weight=self.sample_weight)
+
+    # tp = [7, 4, 0], fp = [3, 0, 0], fn = [0, 3, 7], tn = [0, 3, 3]
+    # precision = [7/(7+3), 4/4, 0] = [0.7, 1, 0]
+    # recall = [7/7, 4/(4+3), 0] = [1, 0.571, 0]
+    # heights = [max(0.7, 1), max(1, 0)] = [1, 1]
+    # widths = [(1 - 0.571), (0.571 - 0)] = [0.429, 0.571]
+    expected_result = (1 * 0.429 + 1 * 0.571)
+    self.assertAllClose(self.evaluate(result), expected_result, 1e-3)
+
+  def test_weighted_pr_minoring(self):
+    self.setup()
+    auc_obj = metrics.AUC(
+        num_thresholds=self.num_thresholds,
+        curve='PR',
+        summation_method='minoring')
+    self.evaluate(variables.variables_initializer(auc_obj.variables))
+    result = auc_obj(self.y_true, self.y_pred, sample_weight=self.sample_weight)
+
+    # tp = [7, 4, 0], fp = [3, 0, 0], fn = [0, 3, 7], tn = [0, 3, 3]
+    # precision = [7/(7+3), 4/4, 0] = [0.7, 1, 0]
+    # recall = [7/7, 4/(4+3), 0] = [1, 0.571, 0]
+    # heights = [min(0.7, 1), min(1, 0)] = [0.7, 0]
+    # widths = [(1 - 0.571), (0.571 - 0)] = [0.429, 0.571]
+    expected_result = (0.7 * 0.429 + 0 * 0.571)
+    self.assertAllClose(self.evaluate(result), expected_result, 1e-3)
+
+  def test_weighted_pr_interpolation(self):
+    self.setup()
+    auc_obj = metrics.AUC(num_thresholds=self.num_thresholds, curve='PR')
+    self.evaluate(variables.variables_initializer(auc_obj.variables))
+    result = auc_obj(self.y_true, self.y_pred, sample_weight=self.sample_weight)
+
+    # auc = (slope / Total Pos) * [dTP - intercept * log(Pb/Pa)]
+
+    # tp = [7, 4, 0], fp = [3, 0, 0], fn = [0, 3, 7], tn = [0, 3, 3]
+    # P = tp + fp = [10, 4, 0]
+    # dTP = [7-4, 4-0] = [3, 4]
+    # dP = [10-4, 4-0] = [6, 4]
+    # slope = dTP/dP = [0.5, 1]
+    # intercept = (TPa+(slope*Pa) = [(4 - 0.5*4), (0 - 1*0)] = [2, 0]
+    # (Pb/Pa) = (Pb/Pa) if Pb > 0 AND Pa > 0 else 1 = [10/4, 4/0] = [2.5, 1]
+    # auc * TotalPos = [(0.5 * (3 + 2 * log(2.5))), (1 * (4 + 0))]
+    #                = [2.416, 4]
+    # auc = [2.416, 4]/(tp[1:]+fn[1:])
+    expected_result = (2.416/7 + 4/7)
+    self.assertAllClose(self.evaluate(result), expected_result, 1e-3)
+
+  def test_invalid_num_thresholds(self):
+    with self.assertRaisesRegexp(ValueError, '`num_thresholds` must be > 1.'):
+      metrics.AUC(num_thresholds=-1)
+
+    with self.assertRaisesRegexp(ValueError, '`num_thresholds` must be > 1.'):
+      metrics.AUC(num_thresholds=1)
+
+  def test_invalid_curve(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 'Invalid AUC curve value "Invalid".'):
+      metrics.AUC(curve='Invalid')
+
+  def test_invalid_summation_method(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Invalid AUC summation method value "Invalid".'):
+      metrics.AUC(summation_method='Invalid')
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/metrics_correctness_test.py b/tensorflow/python/keras/metrics_correctness_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..abef3c4d3f1bd78ae70ea9662c3d49f473c0561c
--- /dev/null
+++ b/tensorflow/python/keras/metrics_correctness_test.py
@@ -0,0 +1,322 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests metrics correctness using Keras model."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python import tf2
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import layers
+from tensorflow.python.keras import metrics
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.platform import test
+
+
+@keras_parameterized.run_with_all_model_types(exclude_models=['sequential'])
+@keras_parameterized.run_all_keras_modes
+class TestMetricsCorrectnessMultiIO(keras_parameterized.TestCase):
+
+  def _get_multi_io_model(self):
+    inp_1 = layers.Input(shape=(1,), name='input_1')
+    inp_2 = layers.Input(shape=(1,), name='input_2')
+    x = layers.Dense(3, kernel_initializer='ones', trainable=False)
+    out_1 = layers.Dense(
+        1, kernel_initializer='ones', name='output_1', trainable=False)
+    out_2 = layers.Dense(
+        1, kernel_initializer='ones', name='output_2', trainable=False)
+
+    branch_a = [inp_1, x, out_1]
+    branch_b = [inp_2, x, out_2]
+    model = testing_utils.get_multi_io_model(branch_a, branch_b)
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        metrics=[metrics.MeanSquaredError(name='mean_squared_error')],
+        weighted_metrics=[
+            metrics.MeanSquaredError(name='mean_squared_error_2')
+        ],
+        run_eagerly=testing_utils.should_run_eagerly())
+    return model
+
+  def _custom_generator(self):
+    batch_size = 2
+    num_samples = 4
+    inputs = np.asarray([[1.], [2.], [3.], [4.]])
+    targets = np.asarray([[2.], [4.], [6.], [8.]])
+    w1 = np.asarray([2., 3., 4., 5.])
+    w2 = np.asarray([3.5, 2.5, 1.5, 0.5])
+    i = 0
+    while True:
+      batch_index = i * batch_size % num_samples
+      i += 1
+      start = batch_index
+      end = start + batch_size
+      x = [inputs[start:end], inputs[start:end]]
+      y = [targets[start:end], targets[start:end]]
+      w = [w1[start:end], w2[start:end]]
+      yield x, y, w
+
+  def setUp(self):
+    super(TestMetricsCorrectnessMultiIO, self).setUp()
+    self.x = np.asarray([[1.], [2.], [3.], [4.]])
+    self.y = np.asarray([[2.], [4.], [6.], [8.]])
+    self.weights_1 = np.asarray([2., 3., 4., 5.])
+    self.weights_2 = np.asarray([3.5, 2.5, 1.5, 0.5])
+
+    # y_true = [[2.], [4.], [6.], [8.]], y_pred = [[3.], [6.], [9.], [12.]]
+
+    # Metric `output_1`, `output_2`:
+    #   Total = ((3 - 2)^2 + (6 - 4)^2) + ((9 - 6)^2 + (12 - 8)^2) = 30,
+    #   Count = 2 + 2
+    #   Result = 7.5
+
+    # Weighted metric `output_1`:
+    #   Total = ((3 - 2)^2 * 2  + (6 - 4)^2 * 3) +
+    #           ((9 - 6)^2 * 4 + (12 - 8)^2 * 5)
+    #         = 130
+    #   Count = (2 + 3) + (4 + 5)
+    #   Result = 9.2857141
+
+    # Weighted metric `output_2`:
+    #   Total = ((3 - 2)^2 * 3.5 + (6 - 4)^2 * 2.5) +
+    #           ((9 - 6)^2 * 1.5 + (12 - 8)^2 * 0.5)
+    #         = 35
+    #   Count = (3.5 + 2.5) + (1.5 + 0.5)
+    #   Result = 4.375
+
+    # Loss `output_1`:
+    #   Total = ((3 - 2)^2 * 2  + (6 - 4)^2 * 3) +
+    #           ((9 - 6)^2 * 4 + (12 - 8)^2 * 5)
+    #         = 130
+    #   Count = 2 + 2
+    #   Result = 32.5
+
+    # Loss `output_2`:
+    #   Total = ((3 - 2)^2 * 3.5 + (6 - 4)^2 * 2.5) +
+    #           ((9 - 6)^2 * 1.5 + (12 - 8)^2 * 0.5)
+    #         = 35
+    #   Count = 2 + 2
+    #   Result = 8.75
+
+    # Total loss = 32.5 + 8.75 = 41.25
+
+    wmse = 'mean_squared_error_2'
+    if not tf2.enabled():
+      wmse = 'weighted_' + wmse
+    self.expected_fit_result = {
+        'output_1_mean_squared_error': [7.5, 7.5],
+        'output_2_mean_squared_error': [7.5, 7.5],
+        'output_1_' + wmse: [9.286, 9.286],
+        'output_2_' + wmse: [4.375, 4.375],
+        'loss': [41.25, 41.25],
+        'output_1_loss': [32.5, 32.5],
+        'output_2_loss': [8.75, 8.75],
+    }
+
+    # In the order: 'loss', 'output_1_loss', 'output_2_loss',
+    # 'output_1_mean_squared_error', 'output_1_mean_squared_error_2',
+    # 'output_2_mean_squared_error', 'output_2_mean_squared_error_2'
+    self.expected_batch_result = [41.25, 32.5, 8.75, 7.5, 9.286, 7.5, 4.375]
+
+  def test_fit(self):
+    model = self._get_multi_io_model()
+    history = model.fit([self.x, self.x], [self.y, self.y],
+                        sample_weight={
+                            'output_1': self.weights_1,
+                            'output_2': self.weights_2,
+                        },
+                        batch_size=2,
+                        epochs=2,
+                        shuffle=False)
+    for key, value in self.expected_fit_result.items():
+      self.assertAllClose(history.history[key], value, 1e-3)
+
+  def test_eval(self):
+    model = self._get_multi_io_model()
+    eval_result = model.evaluate([self.x, self.x], [self.y, self.y],
+                                 batch_size=2,
+                                 sample_weight={
+                                     'output_1': self.weights_1,
+                                     'output_2': self.weights_2,
+                                 })
+    self.assertAllClose(eval_result, self.expected_batch_result, 1e-3)
+
+    # Verify that metric value is same with arbitrary weights and batch size.
+    x = np.random.random((50, 1))
+    y = np.random.random((50, 1))
+    w = np.random.random((50,))
+    mse1 = model.evaluate([x, x], [y, y], sample_weight=[w, w], batch_size=5)[3]
+    mse2 = model.evaluate([x, x], [y, y], sample_weight=[w, w],
+                          batch_size=10)[3]
+    self.assertAllClose(mse1, mse2, 1e-3)
+
+  def test_train_on_batch(self):
+    model = self._get_multi_io_model()
+    result = model.train_on_batch([self.x, self.x], [self.y, self.y],
+                                  sample_weight={
+                                      'output_1': self.weights_1,
+                                      'output_2': self.weights_2,
+                                  })
+    self.assertAllClose(result, self.expected_batch_result, 1e-3)
+
+  def test_test_on_batch(self):
+    model = self._get_multi_io_model()
+    result = model.test_on_batch([self.x, self.x], [self.y, self.y],
+                                 sample_weight={
+                                     'output_1': self.weights_1,
+                                     'output_2': self.weights_2,
+                                 })
+    self.assertAllClose(result, self.expected_batch_result, 1e-3)
+
+  def test_fit_generator(self):
+    model = self._get_multi_io_model()
+    history = model.fit_generator(
+        self._custom_generator(), steps_per_epoch=2, epochs=2)
+    for key, value in self.expected_fit_result.items():
+      self.assertAllClose(history.history[key], value, 1e-3)
+
+  def test_eval_generator(self):
+    model = self._get_multi_io_model()
+    eval_result = model.evaluate_generator(self._custom_generator(), steps=2)
+    self.assertAllClose(eval_result, self.expected_batch_result, 1e-3)
+
+
+@keras_parameterized.run_with_all_model_types
+@keras_parameterized.run_all_keras_modes
+class TestMetricsCorrectnessSingleIO(keras_parameterized.TestCase):
+
+  def _get_model(self):
+    x = layers.Dense(3, kernel_initializer='ones', trainable=False)
+    out = layers.Dense(
+        1, kernel_initializer='ones', name='output', trainable=False)
+    model = testing_utils.get_model_from_layers([x, out], input_shape=(1,))
+    model.compile(
+        optimizer='rmsprop',
+        loss='mse',
+        metrics=[metrics.MeanSquaredError(name='mean_squared_error')],
+        weighted_metrics=[
+            metrics.MeanSquaredError(name='mean_squared_error_2')
+        ],
+        run_eagerly=testing_utils.should_run_eagerly())
+    return model
+
+  def _custom_generator(self):
+    batch_size = 2
+    num_samples = 4
+    x = np.asarray([[1.], [2.], [3.], [4.]])
+    y = np.asarray([[2.], [4.], [6.], [8.]])
+    w = np.asarray([2., 3., 4., 5.])
+    i = 0
+    while True:
+      batch_index = i * batch_size % num_samples
+      i += 1
+      start = batch_index
+      end = start + batch_size
+      yield x[start:end], y[start:end], w[start:end]
+
+  def setUp(self):
+    super(TestMetricsCorrectnessSingleIO, self).setUp()
+    self.x = np.asarray([[1.], [2.], [3.], [4.]])
+    self.y = np.asarray([[2.], [4.], [6.], [8.]])
+    self.weights = np.asarray([2., 3., 4., 5.])
+
+    # y_true = [[2.], [4.], [6.], [8.]], y_pred = [[3.], [6.], [9.], [12.]]
+
+    # Metric:
+    #   Total = ((3 - 2)^2 + (6 - 4)^2) + ((9 - 6)^2 + (12 - 8)^2) = 30,
+    #   Count = 2 + 2
+    #   Result = 7.5
+
+    # Weighted metric:
+    #   Total = ((3 - 2)^2 * 2  + (6 - 4)^2 * 3) +
+    #           ((9 - 6)^2 * 4 + (12 - 8)^2 * 5)
+    #         = 130
+    #   Count = (2 + 3) + (4 + 5)
+    #   Result = 9.2857141
+
+    # Total loss:
+    #   Total = ((3 - 2)^2 * 2  + (6 - 4)^2 * 3) +
+    #           ((9 - 6)^2 * 4 + (12 - 8)^2 * 5)
+    #         = 130,
+    #   Count = 2 + 2
+    #   Result = 32.5
+
+    wmse = 'mean_squared_error_2'
+    if not tf2.enabled():
+      wmse = 'weighted_' + wmse
+    self.expected_fit_result = {
+        'mean_squared_error': [7.5, 7.5],
+        wmse: [9.286, 9.286],
+        'loss': [32.5, 32.5]
+    }
+
+    # In the order: 'loss', 'mean_squared_error', 'mean_squared_error_2'
+    self.expected_batch_result = [32.5, 7.5, 9.286]
+
+  def test_fit(self):
+    model = self._get_model()
+    history = model.fit(
+        self.x,
+        self.y,
+        sample_weight=self.weights,
+        batch_size=2,
+        epochs=2,
+        shuffle=False)
+    for key, value in self.expected_fit_result.items():
+      self.assertAllClose(history.history[key], value, 1e-3)
+
+  def test_eval(self):
+    model = self._get_model()
+    eval_result = model.evaluate(
+        self.x, self.y, batch_size=2, sample_weight=self.weights)
+    self.assertAllClose(eval_result, self.expected_batch_result, 1e-3)
+
+    # Verify that metric value is same with arbitrary weights and batch size.
+    x = np.random.random((50, 1))
+    y = np.random.random((50, 1))
+    w = np.random.random((50,))
+    mse1 = model.evaluate(x, y, sample_weight=w, batch_size=5)[1]
+    mse2 = model.evaluate(x, y, sample_weight=w, batch_size=10)[1]
+    self.assertAllClose(mse1, mse2, 1e-3)
+
+  def test_train_on_batch(self):
+    model = self._get_model()
+    result = model.train_on_batch(self.x, self.y, sample_weight=self.weights)
+    self.assertAllClose(result, self.expected_batch_result, 1e-3)
+
+  def test_test_on_batch(self):
+    model = self._get_model()
+    result = model.test_on_batch(self.x, self.y, sample_weight=self.weights)
+    self.assertAllClose(result, self.expected_batch_result, 1e-3)
+
+  def test_fit_generator(self):
+    model = self._get_model()
+    history = model.fit_generator(
+        self._custom_generator(), steps_per_epoch=2, epochs=2)
+    for key, value in self.expected_fit_result.items():
+      self.assertAllClose(history.history[key], value, 1e-3)
+
+  def test_eval_generator(self):
+    model = self._get_model()
+    eval_result = model.evaluate_generator(self._custom_generator(), steps=2)
+    self.assertAllClose(eval_result, self.expected_batch_result, 1e-3)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/metrics_functional_test.py b/tensorflow/python/keras/metrics_functional_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..513daaf9fcc01cc6741df1b698190ade1e848492
--- /dev/null
+++ b/tensorflow/python/keras/metrics_functional_test.py
@@ -0,0 +1,122 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Keras metrics functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.eager import context
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import metrics
+from tensorflow.python.platform import test
+
+
+class KerasFunctionalMetricsTest(test.TestCase):
+
+  def test_metrics(self):
+    with self.cached_session():
+      y_a = K.variable(np.random.random((6, 7)))
+      y_b = K.variable(np.random.random((6, 7)))
+      for metric in [metrics.binary_accuracy, metrics.categorical_accuracy]:
+        output = metric(y_a, y_b)
+        self.assertEqual(K.eval(output).shape, (6,))
+
+  def test_sparse_categorical_accuracy_int(self):
+    with self.cached_session():
+      metric = metrics.sparse_categorical_accuracy
+      y_true = K.variable(np.random.randint(0, 7, (6,)))
+      y_pred = K.variable(np.random.random((6, 7)))
+      self.assertEqual(K.eval(metric(y_true, y_pred)).shape, (6,))
+
+      # Test correctness if the shape of y_true is (num_samples,)
+      y_true = K.variable([1., 0., 0., 0.])
+      y_pred = K.variable([[0.8, 0.2], [0.6, 0.4], [0.7, 0.3], [0.9, 0.1]])
+      print(K.eval(metric(y_true, y_pred)))
+      self.assertAllEqual(K.eval(metric(y_true, y_pred)), [0., 1., 1., 1.])
+
+      # Test correctness if the shape of y_true is (num_samples, 1)
+      y_true = K.variable([[1.], [0.], [0.], [0.]])
+      y_pred = K.variable([[0.8, 0.2], [0.6, 0.4], [0.7, 0.3], [0.9, 0.1]])
+      print(K.eval(metric(y_true, y_pred)))
+      self.assertAllEqual(K.eval(metric(y_true, y_pred)), [0., 1., 1., 1.])
+
+  def test_sparse_categorical_accuracy_float(self):
+    with self.cached_session():
+      metric = metrics.sparse_categorical_accuracy
+      y_true = K.variable(np.random.random((6,)))
+      y_pred = K.variable(np.random.random((6, 7)))
+      self.assertEqual(K.eval(metric(y_true, y_pred)).shape, (6,))
+
+  def test_sparse_categorical_accuracy_eager(self):
+    """Tests that ints passed in via Eager return results. See b/113504761."""
+    with context.eager_mode():
+      metric = metrics.sparse_categorical_accuracy
+      y_true = np.arange(6).reshape([6, 1])
+      y_pred = np.arange(36).reshape([6, 6])
+      self.assertAllEqual(metric(y_true, y_pred), [0., 0., 0., 0., 0., 1.])
+
+  def test_sparse_categorical_accuracy_float_eager(self):
+    """Tests that floats passed in via Eager return results. See b/113504761."""
+    with context.eager_mode():
+      metric = metrics.sparse_categorical_accuracy
+      y_true = np.arange(6, dtype=np.float32).reshape([6, 1])
+      y_pred = np.arange(36).reshape([6, 6])
+      self.assertAllEqual(metric(y_true, y_pred), [0., 0., 0., 0., 0., 1.])
+
+  def test_sparse_top_k_categorical_accuracy(self):
+    with self.cached_session():
+      # Test correctness if the shape of y_true is (num_samples, 1)
+      y_pred = K.variable(np.array([[0.3, 0.2, 0.1], [0.1, 0.2, 0.7]]))
+      y_true = K.variable(np.array([[1], [0]]))
+      result = K.eval(
+          metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=3))
+      self.assertEqual(result, 1)
+      result = K.eval(
+          metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=2))
+      self.assertEqual(result, 0.5)
+      result = K.eval(
+          metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=1))
+      self.assertEqual(result, 0.)
+
+      # Test correctness if the shape of y_true is (num_samples,)
+      y_pred = K.variable(np.array([[0.3, 0.2, 0.1], [0.1, 0.2, 0.7]]))
+      y_true = K.variable(np.array([1, 0]))
+      result = K.eval(
+          metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=3))
+      self.assertEqual(result, 1)
+      result = K.eval(
+          metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=2))
+      self.assertEqual(result, 0.5)
+      result = K.eval(
+          metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=1))
+      self.assertEqual(result, 0.)
+
+  def test_top_k_categorical_accuracy(self):
+    with self.cached_session():
+      y_pred = K.variable(np.array([[0.3, 0.2, 0.1], [0.1, 0.2, 0.7]]))
+      y_true = K.variable(np.array([[0, 1, 0], [1, 0, 0]]))
+      result = K.eval(metrics.top_k_categorical_accuracy(y_true, y_pred, k=3))
+      self.assertEqual(result, 1)
+      result = K.eval(metrics.top_k_categorical_accuracy(y_true, y_pred, k=2))
+      self.assertEqual(result, 0.5)
+      result = K.eval(metrics.top_k_categorical_accuracy(y_true, y_pred, k=1))
+      self.assertEqual(result, 0.)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/metrics_test.py b/tensorflow/python/keras/metrics_test.py
index 92398acd8e6dc683e37cf759c667c4665961b356..ffe093532a4305e5ec79d17fa3b52f0f70b52757 100644
--- a/tensorflow/python/keras/metrics_test.py
+++ b/tensorflow/python/keras/metrics_test.py
@@ -18,120 +18,150 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import json
+import math
 import os
-from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python.eager import context
+from tensorflow.python.eager import function as eager_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
-from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import layers
 from tensorflow.python.keras import metrics
-from tensorflow.python.keras.models import Sequential
+from tensorflow.python.keras import Model
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
-from tensorflow.python.training.rmsprop import RMSPropOptimizer
-
-
-class KerasMetricsTest(test.TestCase):
-
-  def test_metrics(self):
-    with self.cached_session():
-      y_a = K.variable(np.random.random((6, 7)))
-      y_b = K.variable(np.random.random((6, 7)))
-      for metric in [metrics.binary_accuracy, metrics.categorical_accuracy]:
-        output = metric(y_a, y_b)
-        self.assertEqual(K.eval(output).shape, (6,))
-
-  def test_sparse_categorical_accuracy_int(self):
-    with self.cached_session():
-      metric = metrics.sparse_categorical_accuracy
-      y_true = K.variable(np.random.randint(0, 7, (6,)))
-      y_pred = K.variable(np.random.random((6, 7)))
-      self.assertEqual(K.eval(metric(y_true, y_pred)).shape, (6,))
-
-      # Test correctness if the shape of y_true is (num_samples,)
-      y_true = K.variable([1., 0., 0., 0.])
-      y_pred = K.variable([[0.8, 0.2], [0.6, 0.4], [0.7, 0.3], [0.9, 0.1]])
-      print(K.eval(metric(y_true, y_pred)))
-      self.assertAllEqual(K.eval(metric(y_true, y_pred)), [0., 1., 1., 1.])
-
-      # Test correctness if the shape of y_true is (num_samples, 1)
-      y_true = K.variable([[1.], [0.], [0.], [0.]])
-      y_pred = K.variable([[0.8, 0.2], [0.6, 0.4], [0.7, 0.3], [0.9, 0.1]])
-      print(K.eval(metric(y_true, y_pred)))
-      self.assertAllEqual(K.eval(metric(y_true, y_pred)), [0., 1., 1., 1.])
-
-  def test_sparse_categorical_accuracy_float(self):
-    with self.cached_session():
-      metric = metrics.sparse_categorical_accuracy
-      y_true = K.variable(np.random.random((6,)))
-      y_pred = K.variable(np.random.random((6, 7)))
-      self.assertEqual(K.eval(metric(y_true, y_pred)).shape, (6,))
-
-  def test_sparse_categorical_accuracy_eager(self):
-    """Tests that ints passed in via Eager return results. See b/113504761."""
-    with context.eager_mode():
-      metric = metrics.sparse_categorical_accuracy
-      y_true = np.arange(6).reshape([6, 1])
-      y_pred = np.arange(36).reshape([6, 6])
-      self.assertAllEqual(metric(y_true, y_pred), [0., 0., 0., 0., 0., 1.])
+from tensorflow.python.training.tracking import util as trackable_utils
 
-  def test_sparse_categorical_accuracy_float_eager(self):
-    """Tests that floats passed in via Eager return results. See b/113504761."""
-    with context.eager_mode():
-      metric = metrics.sparse_categorical_accuracy
-      y_true = np.arange(6, dtype=np.float32).reshape([6, 1])
-      y_pred = np.arange(36).reshape([6, 6])
-      self.assertAllEqual(metric(y_true, y_pred), [0., 0., 0., 0., 0., 1.])
-
-  def test_sparse_top_k_categorical_accuracy(self):
-    with self.cached_session():
-      # Test correctness if the shape of y_true is (num_samples, 1)
-      y_pred = K.variable(np.array([[0.3, 0.2, 0.1], [0.1, 0.2, 0.7]]))
-      y_true = K.variable(np.array([[1], [0]]))
-      result = K.eval(
-          metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=3))
-      self.assertEqual(result, 1)
-      result = K.eval(
-          metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=2))
-      self.assertEqual(result, 0.5)
-      result = K.eval(
-          metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=1))
-      self.assertEqual(result, 0.)
-
-      # Test correctness if the shape of y_true is (num_samples,)
-      y_pred = K.variable(np.array([[0.3, 0.2, 0.1], [0.1, 0.2, 0.7]]))
-      y_true = K.variable(np.array([1, 0]))
-      result = K.eval(
-          metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=3))
-      self.assertEqual(result, 1)
-      result = K.eval(
-          metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=2))
-      self.assertEqual(result, 0.5)
-      result = K.eval(
-          metrics.sparse_top_k_categorical_accuracy(y_true, y_pred, k=1))
-      self.assertEqual(result, 0.)
-
-  def test_top_k_categorical_accuracy(self):
-    with self.cached_session():
-      y_pred = K.variable(np.array([[0.3, 0.2, 0.1], [0.1, 0.2, 0.7]]))
-      y_true = K.variable(np.array([[0, 1, 0], [1, 0, 0]]))
-      result = K.eval(metrics.top_k_categorical_accuracy(y_true, y_pred, k=3))
-      self.assertEqual(result, 1)
-      result = K.eval(metrics.top_k_categorical_accuracy(y_true, y_pred, k=2))
-      self.assertEqual(result, 0.5)
-      result = K.eval(metrics.top_k_categorical_accuracy(y_true, y_pred, k=1))
-      self.assertEqual(result, 0.)
-
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+
+@test_util.run_all_in_graph_and_eager_modes
+class KerasSumTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_sum(self):
+    m = metrics.Sum(name='my_sum')
+
+    # check config
+    self.assertEqual(m.name, 'my_sum')
+    self.assertTrue(m.stateful)
+    self.assertEqual(m.dtype, dtypes.float32)
+    self.assertEqual(len(m.variables), 1)
+    self.evaluate(variables.variables_initializer(m.variables))
+
+    # check initial state
+    self.assertEqual(self.evaluate(m.total), 0)
+
+    # check __call__()
+    self.assertEqual(self.evaluate(m(100)), 100)
+    self.assertEqual(self.evaluate(m.total), 100)
+
+    # check update_state() and result() + state accumulation + tensor input
+    update_op = m.update_state(ops.convert_n_to_tensor([1, 5]))
+    self.evaluate(update_op)
+    self.assertAlmostEqual(self.evaluate(m.result()), 106)
+    self.assertEqual(self.evaluate(m.total), 106)  # 100 + 1 + 5
+
+    # check reset_states()
+    m.reset_states()
+    self.assertEqual(self.evaluate(m.total), 0)
+
+  def test_sum_with_sample_weight(self):
+    m = metrics.Sum(dtype=dtypes.float64)
+    self.assertEqual(m.dtype, dtypes.float64)
+    self.evaluate(variables.variables_initializer(m.variables))
+
+    # check scalar weight
+    result_t = m(100, sample_weight=0.5)
+    self.assertEqual(self.evaluate(result_t), 50)
+    self.assertEqual(self.evaluate(m.total), 50)
+
+    # check weights not scalar and weights rank matches values rank
+    result_t = m([1, 5], sample_weight=[1, 0.2])
+    result = self.evaluate(result_t)
+    self.assertAlmostEqual(result, 52., 4)  # 50 + 1 + 5 * 0.2
+    self.assertAlmostEqual(self.evaluate(m.total), 52., 4)
+
+    # check weights broadcast
+    result_t = m([1, 2], sample_weight=0.5)
+    self.assertAlmostEqual(self.evaluate(result_t), 53.5, 1)  # 52 + 0.5 + 1
+    self.assertAlmostEqual(self.evaluate(m.total), 53.5, 1)
+
+    # check weights squeeze
+    result_t = m([1, 5], sample_weight=[[1], [0.2]])
+    self.assertAlmostEqual(self.evaluate(result_t), 55.5, 1)  # 53.5 + 1 + 1
+    self.assertAlmostEqual(self.evaluate(m.total), 55.5, 1)
+
+    # check weights expand
+    result_t = m([[1], [5]], sample_weight=[1, 0.2])
+    self.assertAlmostEqual(self.evaluate(result_t), 57.5, 2)  # 55.5 + 1 + 1
+    self.assertAlmostEqual(self.evaluate(m.total), 57.5, 1)
+
+    # check values reduced to the dimensions of weight
+    result_t = m([[[1., 2.], [3., 2.], [0.5, 4.]]], sample_weight=[0.5])
+    result = np.round(self.evaluate(result_t), decimals=2)
+    # result = (prev: 57.5) + 0.5 + 1 + 1.5 + 1 + 0.25 + 2
+    self.assertAlmostEqual(result, 63.75, 2)
+    self.assertAlmostEqual(self.evaluate(m.total), 63.75, 2)
+
+  def test_sum_graph_with_placeholder(self):
+    with context.graph_mode(), self.cached_session() as sess:
+      m = metrics.Sum()
+      v = array_ops.placeholder(dtypes.float32)
+      w = array_ops.placeholder(dtypes.float32)
+      self.evaluate(variables.variables_initializer(m.variables))
+
+      # check __call__()
+      result_t = m(v, sample_weight=w)
+      result = sess.run(result_t, feed_dict=({v: 100, w: 0.5}))
+      self.assertEqual(result, 50)
+      self.assertEqual(self.evaluate(m.total), 50)
+
+      # check update_state() and result()
+      result = sess.run(result_t, feed_dict=({v: [1, 5], w: [1, 0.2]}))
+      self.assertAlmostEqual(result, 52., 2)  # 50 + 1 + 5 * 0.2
+      self.assertAlmostEqual(self.evaluate(m.total), 52., 2)
+
+  def test_save_restore(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, 'ckpt')
+    m = metrics.Sum()
+    checkpoint = trackable_utils.Checkpoint(sum=m)
+    self.evaluate(variables.variables_initializer(m.variables))
+
+    # update state
+    self.evaluate(m(100.))
+    self.evaluate(m(200.))
+
+    # save checkpoint and then add an update
+    save_path = checkpoint.save(checkpoint_prefix)
+    self.evaluate(m(1000.))
+
+    # restore to the same checkpoint sum object (= 300)
+    checkpoint.restore(save_path).assert_consumed().run_restore_ops()
+    self.evaluate(m(300.))
+    self.assertEqual(600., self.evaluate(m.result()))
+
+    # restore to a different checkpoint sum object
+    restore_sum = metrics.Sum()
+    restore_checkpoint = trackable_utils.Checkpoint(sum=restore_sum)
+    status = restore_checkpoint.restore(save_path)
+    restore_update = restore_sum(300.)
+    status.assert_consumed().run_restore_ops()
+    self.evaluate(restore_update)
+    self.assertEqual(600., self.evaluate(restore_sum.result()))
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class KerasMeanTest(test.TestCase):
+
+  # TODO(b/120949004): Re-enable garbage collection check
+  # @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def test_mean(self):
     m = metrics.Mean(name='my_mean')
 
@@ -163,7 +193,13 @@ class KerasMetricsTest(test.TestCase):
     self.assertEqual(self.evaluate(m.total), 0)
     self.assertEqual(self.evaluate(m.count), 0)
 
-  @test_util.run_in_graph_and_eager_modes
+    # Check save and restore config
+    m2 = metrics.Mean.from_config(m.get_config())
+    self.assertEqual(m2.name, 'my_mean')
+    self.assertTrue(m2.stateful)
+    self.assertEqual(m2.dtype, dtypes.float32)
+    self.assertEqual(len(m2.variables), 2)
+
   def test_mean_with_sample_weight(self):
     m = metrics.Mean(dtype=dtypes.float64)
     self.assertEqual(m.dtype, dtypes.float64)
@@ -227,12 +263,11 @@ class KerasMetricsTest(test.TestCase):
       self.assertAlmostEqual(self.evaluate(m.count), 1.7, 2)  # 0.5 + 1.2
       self.assertAlmostEqual(result, 52 / 1.7, 2)
 
-  @test_util.run_in_graph_and_eager_modes
   def test_save_restore(self):
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, 'ckpt')
     m = metrics.Mean()
-    checkpoint = checkpointable_utils.Checkpoint(mean=m)
+    checkpoint = trackable_utils.Checkpoint(mean=m)
     self.evaluate(variables.variables_initializer(m.variables))
 
     # update state
@@ -250,7 +285,7 @@ class KerasMetricsTest(test.TestCase):
 
     # restore to a different checkpoint mean object
     restore_mean = metrics.Mean()
-    restore_checkpoint = checkpointable_utils.Checkpoint(mean=restore_mean)
+    restore_checkpoint = trackable_utils.Checkpoint(mean=restore_mean)
     status = restore_checkpoint.restore(save_path)
     restore_update = restore_mean(300.)
     status.assert_consumed().run_restore_ops()
@@ -258,7 +293,10 @@ class KerasMetricsTest(test.TestCase):
     self.assertEqual(200., self.evaluate(restore_mean.result()))
     self.assertEqual(3, self.evaluate(restore_mean.count))
 
-  @test_util.run_in_graph_and_eager_modes
+
+@test_util.run_all_in_graph_and_eager_modes
+class KerasAccuracyTest(test.TestCase):
+
   def test_accuracy(self):
     acc_obj = metrics.Accuracy(name='my acc')
 
@@ -275,12 +313,18 @@ class KerasMetricsTest(test.TestCase):
     result = self.evaluate(acc_obj.result())
     self.assertEqual(result, 1)  # 2/2
 
+    # Check save and restore config
+    a2 = metrics.Accuracy.from_config(acc_obj.get_config())
+    self.assertEqual(a2.name, 'my acc')
+    self.assertTrue(a2.stateful)
+    self.assertEqual(len(a2.variables), 2)
+    self.assertEqual(a2.dtype, dtypes.float32)
+
     # check with sample_weight
     result_t = acc_obj([[2], [1]], [[2], [0]], sample_weight=[[0.5], [0.2]])
     result = self.evaluate(result_t)
     self.assertAlmostEqual(result, 0.96, 2)  # 4.5/4.7
 
-  @test_util.run_in_graph_and_eager_modes
   def test_binary_accuracy(self):
     acc_obj = metrics.BinaryAccuracy(name='my acc')
 
@@ -313,7 +357,6 @@ class KerasMetricsTest(test.TestCase):
     result = self.evaluate(result_t)
     self.assertAlmostEqual(result, 0.67, 2)  # 4.5/6.7
 
-  @test_util.run_in_graph_and_eager_modes
   def test_binary_accuracy_threshold(self):
     acc_obj = metrics.BinaryAccuracy(threshold=0.7)
     self.evaluate(variables.variables_initializer(acc_obj.variables))
@@ -321,7 +364,6 @@ class KerasMetricsTest(test.TestCase):
     result = self.evaluate(result_t)
     self.assertAlmostEqual(result, 0.5, 2)
 
-  @test_util.run_in_graph_and_eager_modes
   def test_categorical_accuracy(self):
     acc_obj = metrics.CategoricalAccuracy(name='my acc')
 
@@ -345,7 +387,6 @@ class KerasMetricsTest(test.TestCase):
     result = self.evaluate(result_t)
     self.assertAlmostEqual(result, 0.93, 2)  # 2.5/2.7
 
-  @test_util.run_in_graph_and_eager_modes
   def test_sparse_categorical_accuracy(self):
     acc_obj = metrics.SparseCategoricalAccuracy(name='my acc')
 
@@ -369,807 +410,1491 @@ class KerasMetricsTest(test.TestCase):
     result = self.evaluate(result_t)
     self.assertAlmostEqual(result, 0.93, 2)  # 2.5/2.7
 
+  def test_sparse_categorical_accuracy_mismatched_dims(self):
+    acc_obj = metrics.SparseCategoricalAccuracy(name='my acc')
 
-def _get_simple_sequential_model(compile_metrics):
-  model = Sequential()
-  model.add(
-      layers.Dense(
-          3, activation='relu', input_dim=4, kernel_initializer='ones'))
-  model.add(layers.Dense(1, activation='sigmoid', kernel_initializer='ones'))
-  model.compile(
-      loss='mae',
-      metrics=compile_metrics,
-      optimizer=RMSPropOptimizer(learning_rate=0.001))
-  return model
+    # check config
+    self.assertEqual(acc_obj.name, 'my acc')
+    self.assertTrue(acc_obj.stateful)
+    self.assertEqual(len(acc_obj.variables), 2)
+    self.assertEqual(acc_obj.dtype, dtypes.float32)
+    self.evaluate(variables.variables_initializer(acc_obj.variables))
+
+    # verify that correct value is returned
+    update_op = acc_obj.update_state([2, 1], [[0.1, 0.1, 0.8], [0.05, 0.95, 0]])
+    self.evaluate(update_op)
+    result = self.evaluate(acc_obj.result())
+    self.assertEqual(result, 1)  # 2/2
+
+    # check with sample_weight
+    result_t = acc_obj([2, 1], [[0.1, 0.1, 0.8], [0.05, 0, 0.95]],
+                       [[0.5], [0.2]])
+    result = self.evaluate(result_t)
+    self.assertAlmostEqual(result, 0.93, 2)  # 2.5/2.7
+
+  def test_sparse_categorical_accuracy_mismatched_dims_dynamic(self):
+    with context.graph_mode(), self.cached_session() as sess:
+      acc_obj = metrics.SparseCategoricalAccuracy(name='my acc')
+      self.evaluate(variables.variables_initializer(acc_obj.variables))
+
+      t = array_ops.placeholder(dtypes.float32)
+      p = array_ops.placeholder(dtypes.float32)
+      w = array_ops.placeholder(dtypes.float32)
+
+      result_t = acc_obj(t, p, w)
+      result = sess.run(
+          result_t,
+          feed_dict=({
+              t: [2, 1],
+              p: [[0.1, 0.1, 0.8], [0.05, 0, 0.95]],
+              w: [[0.5], [0.2]]
+          }))
+      self.assertAlmostEqual(result, 0.71, 2)  # 2.5/2.7
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class FalsePositivesTest(test.TestCase):
+class CosineSimilarityTest(test.TestCase):
+
+  def l2_norm(self, x, axis):
+    epsilon = 1e-12
+    square_sum = np.sum(np.square(x), axis=axis, keepdims=True)
+    x_inv_norm = 1 / np.sqrt(np.maximum(square_sum, epsilon))
+    return np.multiply(x, x_inv_norm)
+
+  def setup(self, axis=1):
+    self.np_y_true = np.asarray([[1, 9, 2], [-5, -2, 6]], dtype=np.float32)
+    self.np_y_pred = np.asarray([[4, 8, 12], [8, 1, 3]], dtype=np.float32)
+
+    y_true = self.l2_norm(self.np_y_true, axis)
+    y_pred = self.l2_norm(self.np_y_pred, axis)
+    self.expected_loss = np.sum(np.multiply(y_true, y_pred), axis=(axis,))
+
+    self.y_true = constant_op.constant(self.np_y_true)
+    self.y_pred = constant_op.constant(self.np_y_pred)
 
   def test_config(self):
-    fp_obj = metrics.FalsePositives(name='my_fp', thresholds=[0.4, 0.9])
-    self.assertEqual(fp_obj.name, 'my_fp')
-    self.assertEqual(len(fp_obj.variables), 1)
-    self.assertEqual(fp_obj.thresholds, [0.4, 0.9])
+    cosine_obj = metrics.CosineSimilarity(
+        axis=2, name='my_cos', dtype=dtypes.int32)
+    self.assertEqual(cosine_obj.name, 'my_cos')
+    self.assertEqual(cosine_obj._dtype, dtypes.int32)
+
+    # Check save and restore config
+    cosine_obj2 = metrics.CosineSimilarity.from_config(cosine_obj.get_config())
+    self.assertEqual(cosine_obj2.name, 'my_cos')
+    self.assertEqual(cosine_obj2._dtype, dtypes.int32)
 
   def test_unweighted(self):
-    fp_obj = metrics.FalsePositives()
-    self.evaluate(variables.variables_initializer(fp_obj.variables))
+    self.setup()
+    cosine_obj = metrics.CosineSimilarity()
+    self.evaluate(variables.variables_initializer(cosine_obj.variables))
+    loss = cosine_obj(self.y_true, self.y_pred)
+    expected_loss = np.mean(self.expected_loss)
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+  def test_weighted(self):
+    self.setup()
+    cosine_obj = metrics.CosineSimilarity()
+    self.evaluate(variables.variables_initializer(cosine_obj.variables))
+    sample_weight = np.asarray([1.2, 3.4])
+    loss = cosine_obj(
+        self.y_true,
+        self.y_pred,
+        sample_weight=constant_op.constant(sample_weight))
+    expected_loss = np.sum(
+        self.expected_loss * sample_weight) / np.sum(sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+  def test_axis(self):
+    self.setup(axis=1)
+    cosine_obj = metrics.CosineSimilarity(axis=1)
+    self.evaluate(variables.variables_initializer(cosine_obj.variables))
+    loss = cosine_obj(self.y_true, self.y_pred)
+    expected_loss = np.mean(self.expected_loss)
+    self.assertAlmostEqual(self.evaluate(loss), expected_loss, 3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class MeanAbsoluteErrorTest(test.TestCase):
+
+  def test_config(self):
+    mae_obj = metrics.MeanAbsoluteError(name='my_mae', dtype=dtypes.int32)
+    self.assertEqual(mae_obj.name, 'my_mae')
+    self.assertEqual(mae_obj._dtype, dtypes.int32)
+
+    # Check save and restore config
+    mae_obj2 = metrics.MeanAbsoluteError.from_config(mae_obj.get_config())
+    self.assertEqual(mae_obj2.name, 'my_mae')
+    self.assertEqual(mae_obj2._dtype, dtypes.int32)
 
+  def test_unweighted(self):
+    mae_obj = metrics.MeanAbsoluteError()
+    self.evaluate(variables.variables_initializer(mae_obj.variables))
     y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
                                    (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
     y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
                                    (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
 
-    update_op = fp_obj.update_state(y_true, y_pred)
+    update_op = mae_obj.update_state(y_true, y_pred)
     self.evaluate(update_op)
-    result = fp_obj.result()
-    self.assertAllClose(7., result)
+    result = mae_obj.result()
+    self.assertAllClose(0.5, result, atol=1e-5)
 
   def test_weighted(self):
-    fp_obj = metrics.FalsePositives()
-    self.evaluate(variables.variables_initializer(fp_obj.variables))
+    mae_obj = metrics.MeanAbsoluteError()
+    self.evaluate(variables.variables_initializer(mae_obj.variables))
     y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
                                    (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
     y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
                                    (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
     sample_weight = constant_op.constant((1., 1.5, 2., 2.5))
-    result = fp_obj(y_true, y_pred, sample_weight=sample_weight)
-    self.assertAllClose(14., self.evaluate(result))
-
-  def test_unweighted_with_thresholds(self):
-    fp_obj = metrics.FalsePositives(thresholds=[0.15, 0.5, 0.85])
-    self.evaluate(variables.variables_initializer(fp_obj.variables))
-
-    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
-                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
-    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
-                                   (1, 1, 1, 1)))
-
-    update_op = fp_obj.update_state(y_true, y_pred)
-    self.evaluate(update_op)
-    result = fp_obj.result()
-    self.assertAllClose([7., 4., 2.], result)
-
-  def test_weighted_with_thresholds(self):
-    fp_obj = metrics.FalsePositives(thresholds=[0.15, 0.5, 0.85])
-    self.evaluate(variables.variables_initializer(fp_obj.variables))
-
-    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
-                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
-    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
-                                   (1, 1, 1, 1)))
-    sample_weight = ((1.0, 2.0, 3.0, 5.0), (7.0, 11.0, 13.0, 17.0),
-                     (19.0, 23.0, 29.0, 31.0), (5.0, 15.0, 10.0, 0))
-
-    result = fp_obj(y_true, y_pred, sample_weight=sample_weight)
-    self.assertAllClose([125., 42., 12.], self.evaluate(result))
-
-  def test_threshold_limit(self):
-    with self.assertRaisesRegexp(
-        ValueError,
-        r'Threshold values must be in \[0, 1\]. Invalid values: \[-1, 2\]'):
-      metrics.FalsePositives(thresholds=[-1, 0.5, 2])
-
-  def test_reset_states(self):
-    fp_obj = metrics.FalsePositives()
-    model = _get_simple_sequential_model([fp_obj])
-    x = np.ones((100, 4))
-    y = np.zeros((100, 1))
-    model.evaluate(x, y)
-    self.assertEqual(self.evaluate(fp_obj.accumulator), 100.)
-    model.evaluate(x, y)
-    self.assertEqual(self.evaluate(fp_obj.accumulator), 100.)
+    result = mae_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose(0.54285, self.evaluate(result), atol=1e-5)
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class FalseNegativesTest(test.TestCase):
+class MeanAbsolutePercentageErrorTest(test.TestCase):
 
   def test_config(self):
-    fn_obj = metrics.FalseNegatives(name='my_fn', thresholds=[0.4, 0.9])
-    self.assertEqual(fn_obj.name, 'my_fn')
-    self.assertEqual(len(fn_obj.variables), 1)
-    self.assertEqual(fn_obj.thresholds, [0.4, 0.9])
+    mape_obj = metrics.MeanAbsolutePercentageError(
+        name='my_mape', dtype=dtypes.int32)
+    self.assertEqual(mape_obj.name, 'my_mape')
+    self.assertEqual(mape_obj._dtype, dtypes.int32)
 
-  def test_unweighted(self):
-    fn_obj = metrics.FalseNegatives()
-    self.evaluate(variables.variables_initializer(fn_obj.variables))
+    # Check save and restore config
+    mape_obj2 = metrics.MeanAbsolutePercentageError.from_config(
+        mape_obj.get_config())
+    self.assertEqual(mape_obj2.name, 'my_mape')
+    self.assertEqual(mape_obj2._dtype, dtypes.int32)
 
+  def test_unweighted(self):
+    mape_obj = metrics.MeanAbsolutePercentageError()
+    self.evaluate(variables.variables_initializer(mape_obj.variables))
     y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
                                    (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
     y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
                                    (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
 
-    update_op = fn_obj.update_state(y_true, y_pred)
+    update_op = mape_obj.update_state(y_true, y_pred)
     self.evaluate(update_op)
-    result = fn_obj.result()
-    self.assertAllClose(3., result)
+    result = mape_obj.result()
+    self.assertAllClose(35e7, result, atol=1e-5)
 
   def test_weighted(self):
-    fn_obj = metrics.FalseNegatives()
-    self.evaluate(variables.variables_initializer(fn_obj.variables))
+    mape_obj = metrics.MeanAbsolutePercentageError()
+    self.evaluate(variables.variables_initializer(mape_obj.variables))
     y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
                                    (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
     y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
                                    (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
     sample_weight = constant_op.constant((1., 1.5, 2., 2.5))
-    result = fn_obj(y_true, y_pred, sample_weight=sample_weight)
-    self.assertAllClose(5., self.evaluate(result))
+    result = mape_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose(40e7, self.evaluate(result), atol=1e-5)
 
-  def test_unweighted_with_thresholds(self):
-    fn_obj = metrics.FalseNegatives(thresholds=[0.15, 0.5, 0.85])
-    self.evaluate(variables.variables_initializer(fn_obj.variables))
 
-    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
-                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
-    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
-                                   (1, 1, 1, 1)))
+@test_util.run_all_in_graph_and_eager_modes
+class MeanSquaredErrorTest(test.TestCase):
 
-    update_op = fn_obj.update_state(y_true, y_pred)
-    self.evaluate(update_op)
-    result = fn_obj.result()
-    self.assertAllClose([1., 4., 6.], result)
+  def test_config(self):
+    mse_obj = metrics.MeanSquaredError(name='my_mse', dtype=dtypes.int32)
+    self.assertEqual(mse_obj.name, 'my_mse')
+    self.assertEqual(mse_obj._dtype, dtypes.int32)
 
-  def test_weighted_with_thresholds(self):
-    fn_obj = metrics.FalseNegatives(thresholds=[0.15, 0.5, 0.85])
-    self.evaluate(variables.variables_initializer(fn_obj.variables))
+    # Check save and restore config
+    mse_obj2 = metrics.MeanSquaredError.from_config(mse_obj.get_config())
+    self.assertEqual(mse_obj2.name, 'my_mse')
+    self.assertEqual(mse_obj2._dtype, dtypes.int32)
 
-    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
-                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
-    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
-                                   (1, 1, 1, 1)))
-    sample_weight = ((3.0,), (5.0,), (7.0,), (4.0,))
+  def test_unweighted(self):
+    mse_obj = metrics.MeanSquaredError()
+    self.evaluate(variables.variables_initializer(mse_obj.variables))
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
 
-    result = fn_obj(y_true, y_pred, sample_weight=sample_weight)
-    self.assertAllClose([4., 16., 23.], self.evaluate(result))
+    update_op = mse_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = mse_obj.result()
+    self.assertAllClose(0.5, result, atol=1e-5)
 
-  def test_reset_states(self):
-    fn_obj = metrics.FalseNegatives()
-    model = _get_simple_sequential_model([fn_obj])
-    x = np.zeros((100, 4))
-    y = np.ones((100, 1))
-    model.evaluate(x, y)
-    self.assertEqual(self.evaluate(fn_obj.accumulator), 100.)
-    model.evaluate(x, y)
-    self.assertEqual(self.evaluate(fn_obj.accumulator), 100.)
+  def test_weighted(self):
+    mse_obj = metrics.MeanSquaredError()
+    self.evaluate(variables.variables_initializer(mse_obj.variables))
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+    sample_weight = constant_op.constant((1., 1.5, 2., 2.5))
+    result = mse_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose(0.54285, self.evaluate(result), atol=1e-5)
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class TrueNegativesTest(test.TestCase):
+class MeanSquaredLogarithmicErrorTest(test.TestCase):
 
   def test_config(self):
-    tn_obj = metrics.TrueNegatives(name='my_tn', thresholds=[0.4, 0.9])
-    self.assertEqual(tn_obj.name, 'my_tn')
-    self.assertEqual(len(tn_obj.variables), 1)
-    self.assertEqual(tn_obj.thresholds, [0.4, 0.9])
+    msle_obj = metrics.MeanSquaredLogarithmicError(
+        name='my_msle', dtype=dtypes.int32)
+    self.assertEqual(msle_obj.name, 'my_msle')
+    self.assertEqual(msle_obj._dtype, dtypes.int32)
 
-  def test_unweighted(self):
-    tn_obj = metrics.TrueNegatives()
-    self.evaluate(variables.variables_initializer(tn_obj.variables))
+    # Check save and restore config
+    msle_obj2 = metrics.MeanSquaredLogarithmicError.from_config(
+        msle_obj.get_config())
+    self.assertEqual(msle_obj2.name, 'my_msle')
+    self.assertEqual(msle_obj2._dtype, dtypes.int32)
 
+  def test_unweighted(self):
+    msle_obj = metrics.MeanSquaredLogarithmicError()
+    self.evaluate(variables.variables_initializer(msle_obj.variables))
     y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
                                    (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
     y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
                                    (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
 
-    update_op = tn_obj.update_state(y_true, y_pred)
+    update_op = msle_obj.update_state(y_true, y_pred)
     self.evaluate(update_op)
-    result = tn_obj.result()
-    self.assertAllClose(3., result)
+    result = msle_obj.result()
+    self.assertAllClose(0.24022, result, atol=1e-5)
 
   def test_weighted(self):
-    tn_obj = metrics.TrueNegatives()
-    self.evaluate(variables.variables_initializer(tn_obj.variables))
+    msle_obj = metrics.MeanSquaredLogarithmicError()
+    self.evaluate(variables.variables_initializer(msle_obj.variables))
     y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
                                    (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
     y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
                                    (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
     sample_weight = constant_op.constant((1., 1.5, 2., 2.5))
-    result = tn_obj(y_true, y_pred, sample_weight=sample_weight)
-    self.assertAllClose(4., self.evaluate(result))
+    result = msle_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose(0.26082, self.evaluate(result), atol=1e-5)
 
-  def test_unweighted_with_thresholds(self):
-    tn_obj = metrics.TrueNegatives(thresholds=[0.15, 0.5, 0.85])
-    self.evaluate(variables.variables_initializer(tn_obj.variables))
 
-    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
-                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
-    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
-                                   (1, 1, 1, 1)))
+@test_util.run_all_in_graph_and_eager_modes
+class HingeTest(test.TestCase):
+
+  def test_config(self):
+    hinge_obj = metrics.Hinge(name='hinge', dtype=dtypes.int32)
+    self.assertEqual(hinge_obj.name, 'hinge')
+    self.assertEqual(hinge_obj._dtype, dtypes.int32)
+
+    # Check save and restore config
+    hinge_obj2 = metrics.Hinge.from_config(hinge_obj.get_config())
+    self.assertEqual(hinge_obj2.name, 'hinge')
+    self.assertEqual(hinge_obj2._dtype, dtypes.int32)
 
-    update_op = tn_obj.update_state(y_true, y_pred)
+  def test_unweighted(self):
+    hinge_obj = metrics.Hinge()
+    self.evaluate(variables.variables_initializer(hinge_obj.variables))
+    y_true = constant_op.constant([[0, 1, 0, 1], [0, 0, 1, 1]])
+    y_pred = constant_op.constant([[-0.3, 0.2, -0.1, 1.6],
+                                   [-0.25, -1., 0.5, 0.6]])
+
+    # metric = max(0, 1-y_true * y_pred), where y_true is -1/1
+
+    # y_true = [[-1, 1, -1, 1], [-1, -1, 1, 1]]
+    # y_true * y_pred = [[0.3, 0.2, 0.1, 1.6], [0.25, 1, 0.5, 0.6]]
+    # 1 - y_true * y_pred = [[0.7, 0.8, 0.9, -0.6], [0.75, 0, 0.5, 0.4]]
+    # metric = [(0.7 + 0.8 + 0.9 + 0) / 4, (0.75 + 0 + 0.5 + 0.4) / 4]
+    #        = [0.6, 0.4125]
+    # reduced metric = (0.6 + 0.4125) / 2
+
+    update_op = hinge_obj.update_state(y_true, y_pred)
     self.evaluate(update_op)
-    result = tn_obj.result()
-    self.assertAllClose([2., 5., 7.], result)
+    result = hinge_obj.result()
+    self.assertAllClose(0.506, result, atol=1e-3)
 
-  def test_weighted_with_thresholds(self):
-    tn_obj = metrics.TrueNegatives(thresholds=[0.15, 0.5, 0.85])
-    self.evaluate(variables.variables_initializer(tn_obj.variables))
+  def test_weighted(self):
+    hinge_obj = metrics.Hinge()
+    self.evaluate(variables.variables_initializer(hinge_obj.variables))
+    y_true = constant_op.constant([[-1, 1, -1, 1], [-1, -1, 1, 1]])
+    y_pred = constant_op.constant([[-0.3, 0.2, -0.1, 1.6],
+                                   [-0.25, -1., 0.5, 0.6]])
+    sample_weight = constant_op.constant([1.5, 2.])
 
-    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
-                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
-    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
-                                   (1, 1, 1, 1)))
-    sample_weight = ((0.0, 2.0, 3.0, 5.0),)
+    # metric = max(0, 1-y_true * y_pred), where y_true is -1/1
 
-    result = tn_obj(y_true, y_pred, sample_weight=sample_weight)
-    self.assertAllClose([5., 15., 23.], self.evaluate(result))
+    # y_true * y_pred = [[0.3, 0.2, 0.1, 1.6], [0.25, 1, 0.5, 0.6]]
+    # 1 - y_true * y_pred = [[0.7, 0.8, 0.9, -0.6], [0.75, 0, 0.5, 0.4]]
+    # metric = [(0.7 + 0.8 + 0.9 + 0) / 4, (0.75 + 0 + 0.5 + 0.4) / 4]
+    #        = [0.6, 0.4125]
+    # weighted metric = [0.6 * 1.5, 0.4125 * 2]
+    # reduced metric = (0.6 * 1.5 + 0.4125 * 2) / (1.5 + 2)
 
-  def test_reset_states(self):
-    tn_obj = metrics.TrueNegatives()
-    model = _get_simple_sequential_model([tn_obj])
-    x = np.zeros((100, 4))
-    y = np.zeros((100, 1))
-    model.evaluate(x, y)
-    self.assertEqual(self.evaluate(tn_obj.accumulator), 100.)
-    model.evaluate(x, y)
-    self.assertEqual(self.evaluate(tn_obj.accumulator), 100.)
+    result = hinge_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose(0.493, self.evaluate(result), atol=1e-3)
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class TruePositivesTest(test.TestCase):
+class SquaredHingeTest(test.TestCase):
 
   def test_config(self):
-    tp_obj = metrics.TruePositives(name='my_tp', thresholds=[0.4, 0.9])
-    self.assertEqual(tp_obj.name, 'my_tp')
-    self.assertEqual(len(tp_obj.variables), 1)
-    self.assertEqual(tp_obj.thresholds, [0.4, 0.9])
+    sq_hinge_obj = metrics.SquaredHinge(name='sq_hinge', dtype=dtypes.int32)
+    self.assertEqual(sq_hinge_obj.name, 'sq_hinge')
+    self.assertEqual(sq_hinge_obj._dtype, dtypes.int32)
+
+    # Check save and restore config
+    sq_hinge_obj2 = metrics.SquaredHinge.from_config(sq_hinge_obj.get_config())
+    self.assertEqual(sq_hinge_obj2.name, 'sq_hinge')
+    self.assertEqual(sq_hinge_obj2._dtype, dtypes.int32)
 
   def test_unweighted(self):
-    tp_obj = metrics.TruePositives()
-    self.evaluate(variables.variables_initializer(tp_obj.variables))
+    sq_hinge_obj = metrics.SquaredHinge()
+    self.evaluate(variables.variables_initializer(sq_hinge_obj.variables))
+    y_true = constant_op.constant([[0, 1, 0, 1], [0, 0, 1, 1]])
+    y_pred = constant_op.constant([[-0.3, 0.2, -0.1, 1.6],
+                                   [-0.25, -1., 0.5, 0.6]])
+
+    # metric = max(0, 1-y_true * y_pred), where y_true is -1/1
+
+    # y_true = [[-1, 1, -1, 1], [-1, -1, 1, 1]]
+    # y_true * y_pred = [[0.3, 0.2, 0.1, 1.6], [0.25, 1, 0.5, 0.6]]
+    # 1 - y_true * y_pred = [[0.7, 0.8, 0.9, -0.6], [0.75, 0, 0.5, 0.4]]
+    # max(0, 1 - y_true * y_pred) = [[0.7, 0.8, 0.9, 0], [0.75, 0, 0.5, 0.4]]
+    # squared(max(0, 1 - y_true * y_pred)) = [[0.49, 0.64, 0.81, 0],
+    #                                         [0.5625, 0, 0.25, 0.16]]
+    # metric = [(0.49 + 0.64 + 0.81 + 0) / 4, (0.5625 + 0 + 0.25 + 0.16) / 4]
+    #        = [0.485, 0.2431]
+    # reduced metric = (0.485 + 0.2431) / 2
+
+    update_op = sq_hinge_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = sq_hinge_obj.result()
+    self.assertAllClose(0.364, result, atol=1e-3)
+
+  def test_weighted(self):
+    sq_hinge_obj = metrics.SquaredHinge()
+    self.evaluate(variables.variables_initializer(sq_hinge_obj.variables))
+    y_true = constant_op.constant([[-1, 1, -1, 1], [-1, -1, 1, 1]])
+    y_pred = constant_op.constant([[-0.3, 0.2, -0.1, 1.6],
+                                   [-0.25, -1., 0.5, 0.6]])
+    sample_weight = constant_op.constant([1.5, 2.])
+
+    # metric = max(0, 1-y_true * y_pred), where y_true is -1/1
+
+    # y_true * y_pred = [[0.3, 0.2, 0.1, 1.6], [0.25, 1, 0.5, 0.6]]
+    # 1 - y_true * y_pred = [[0.7, 0.8, 0.9, -0.6], [0.75, 0, 0.5, 0.4]]
+    # max(0, 1 - y_true * y_pred) = [[0.7, 0.8, 0.9, 0], [0.75, 0, 0.5, 0.4]]
+    # squared(max(0, 1 - y_true * y_pred)) = [[0.49, 0.64, 0.81, 0],
+    #                                         [0.5625, 0, 0.25, 0.16]]
+    # metric = [(0.49 + 0.64 + 0.81 + 0) / 4, (0.5625 + 0 + 0.25 + 0.16) / 4]
+    #        = [0.485, 0.2431]
+    # weighted metric = [0.485 * 1.5, 0.2431 * 2]
+    # reduced metric = (0.485 * 1.5 + 0.2431 * 2) / (1.5 + 2)
+
+    result = sq_hinge_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose(0.347, self.evaluate(result), atol=1e-3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class CategoricalHingeTest(test.TestCase):
+
+  def test_config(self):
+    cat_hinge_obj = metrics.CategoricalHinge(
+        name='cat_hinge', dtype=dtypes.int32)
+    self.assertEqual(cat_hinge_obj.name, 'cat_hinge')
+    self.assertEqual(cat_hinge_obj._dtype, dtypes.int32)
+
+    # Check save and restore config
+    cat_hinge_obj2 = metrics.CategoricalHinge.from_config(
+        cat_hinge_obj.get_config())
+    self.assertEqual(cat_hinge_obj2.name, 'cat_hinge')
+    self.assertEqual(cat_hinge_obj2._dtype, dtypes.int32)
 
+  def test_unweighted(self):
+    cat_hinge_obj = metrics.CategoricalHinge()
+    self.evaluate(variables.variables_initializer(cat_hinge_obj.variables))
     y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
                                    (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
     y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
                                    (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
 
-    update_op = tp_obj.update_state(y_true, y_pred)
+    update_op = cat_hinge_obj.update_state(y_true, y_pred)
     self.evaluate(update_op)
-    result = tp_obj.result()
-    self.assertAllClose(7., result)
+    result = cat_hinge_obj.result()
+    self.assertAllClose(0.5, result, atol=1e-5)
 
   def test_weighted(self):
-    tp_obj = metrics.TruePositives()
-    self.evaluate(variables.variables_initializer(tp_obj.variables))
+    cat_hinge_obj = metrics.CategoricalHinge()
+    self.evaluate(variables.variables_initializer(cat_hinge_obj.variables))
     y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
                                    (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
     y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
                                    (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
     sample_weight = constant_op.constant((1., 1.5, 2., 2.5))
-    result = tp_obj(y_true, y_pred, sample_weight=sample_weight)
-    self.assertAllClose(12., self.evaluate(result))
+    result = cat_hinge_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose(0.5, self.evaluate(result), atol=1e-5)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RootMeanSquaredErrorTest(test.TestCase):
+
+  def test_config(self):
+    rmse_obj = metrics.RootMeanSquaredError(name='rmse', dtype=dtypes.int32)
+    self.assertEqual(rmse_obj.name, 'rmse')
+    self.assertEqual(rmse_obj._dtype, dtypes.int32)
 
-  def test_unweighted_with_thresholds(self):
-    tp_obj = metrics.TruePositives(thresholds=[0.15, 0.5, 0.85])
-    self.evaluate(variables.variables_initializer(tp_obj.variables))
+    rmse_obj2 = metrics.RootMeanSquaredError.from_config(rmse_obj.get_config())
+    self.assertEqual(rmse_obj2.name, 'rmse')
+    self.assertEqual(rmse_obj2._dtype, dtypes.int32)
 
-    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
-                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
-    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
-                                   (1, 1, 1, 1)))
+  def test_unweighted(self):
+    rmse_obj = metrics.RootMeanSquaredError()
+    self.evaluate(variables.variables_initializer(rmse_obj.variables))
+    y_true = constant_op.constant((2, 4, 6))
+    y_pred = constant_op.constant((1, 3, 2))
 
-    update_op = tp_obj.update_state(y_true, y_pred)
+    update_op = rmse_obj.update_state(y_true, y_pred)
     self.evaluate(update_op)
-    result = tp_obj.result()
-    self.assertAllClose([6., 3., 1.], result)
+    result = rmse_obj.result()
+    # error = [-1, -1, -4], square(error) = [1, 1, 16], mean = 18/3 = 6
+    self.assertAllClose(math.sqrt(6), result, atol=1e-3)
 
-  def test_weighted_with_thresholds(self):
-    tp_obj = metrics.TruePositives(thresholds=[0.15, 0.5, 0.85])
-    self.evaluate(variables.variables_initializer(tp_obj.variables))
+  def test_weighted(self):
+    rmse_obj = metrics.RootMeanSquaredError()
+    self.evaluate(variables.variables_initializer(rmse_obj.variables))
+    y_true = constant_op.constant((2, 4, 6, 8))
+    y_pred = constant_op.constant((1, 3, 2, 3))
+    sample_weight = constant_op.constant((0, 1, 0, 1))
+    result = rmse_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose(math.sqrt(13), self.evaluate(result), atol=1e-3)
 
-    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
-                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
-    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
-                                   (1, 1, 1, 1)))
 
-    result = tp_obj(y_true, y_pred, sample_weight=37.)
-    self.assertAllClose([222., 111., 37.], self.evaluate(result))
+@test_util.run_all_in_graph_and_eager_modes
+class TopKCategoricalAccuracyTest(test.TestCase):
 
-  def test_reset_states(self):
-    tp_obj = metrics.TruePositives()
-    model = _get_simple_sequential_model([tp_obj])
-    x = np.ones((100, 4))
-    y = np.ones((100, 1))
-    model.evaluate(x, y)
-    self.assertEqual(self.evaluate(tp_obj.accumulator), 100.)
-    model.evaluate(x, y)
-    self.assertEqual(self.evaluate(tp_obj.accumulator), 100.)
+  def test_config(self):
+    a_obj = metrics.TopKCategoricalAccuracy(name='topkca', dtype=dtypes.int32)
+    self.assertEqual(a_obj.name, 'topkca')
+    self.assertEqual(a_obj._dtype, dtypes.int32)
+
+    a_obj2 = metrics.TopKCategoricalAccuracy.from_config(a_obj.get_config())
+    self.assertEqual(a_obj2.name, 'topkca')
+    self.assertEqual(a_obj2._dtype, dtypes.int32)
+
+  def test_correctness(self):
+    a_obj = metrics.TopKCategoricalAccuracy()
+    self.evaluate(variables.variables_initializer(a_obj.variables))
+    y_true = constant_op.constant([[0, 0, 1], [0, 1, 0]])
+    y_pred = constant_op.constant([[0.1, 0.9, 0.8], [0.05, 0.95, 0]])
+
+    result = a_obj(y_true, y_pred)
+    self.assertEqual(1, self.evaluate(result))  # both the samples match
+
+    # With `k` < 5.
+    a_obj = metrics.TopKCategoricalAccuracy(k=1)
+    self.evaluate(variables.variables_initializer(a_obj.variables))
+    result = a_obj(y_true, y_pred)
+    self.assertEqual(0.5, self.evaluate(result))  # only sample #2 matches
+
+    # With `k` > 5.
+    y_true = constant_op.constant([[0, 0, 1, 0, 0, 0, 0],
+                                   [0, 1, 0, 0, 0, 0, 0]])
+    y_pred = constant_op.constant([[0.5, 0.9, 0.1, 0.7, 0.6, 0.5, 0.4],
+                                   [0.05, 0.95, 0, 0, 0, 0, 0]])
+    a_obj = metrics.TopKCategoricalAccuracy(k=6)
+    self.evaluate(variables.variables_initializer(a_obj.variables))
+    result = a_obj(y_true, y_pred)
+    self.assertEqual(0.5, self.evaluate(result))  # only 1 sample matches.
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class PrecisionTest(test.TestCase):
+class SparseTopKCategoricalAccuracyTest(test.TestCase):
 
   def test_config(self):
-    p_obj = metrics.Precision(name='my_precision', thresholds=[0.4, 0.9])
-    self.assertEqual(p_obj.name, 'my_precision')
-    self.assertLen(p_obj.variables, 2)
-    self.assertEqual([v.name for v in p_obj.variables],
-                     ['true_positives:0', 'false_positives:0'])
-    self.assertEqual(p_obj.thresholds, [0.4, 0.9])
-
-  def test_value_is_idempotent(self):
-    p_obj = metrics.Precision(thresholds=[0.3, 0.72])
-    y_pred = random_ops.random_uniform(shape=(10, 3))
-    y_true = random_ops.random_uniform(shape=(10, 3))
-    update_op = p_obj.update_state(y_true, y_pred)
-    self.evaluate(variables.variables_initializer(p_obj.variables))
-
-    # Run several updates.
-    for _ in range(10):
-      self.evaluate(update_op)
-
-    # Then verify idempotency.
-    initial_precision = self.evaluate(p_obj.result())
-    for _ in range(10):
-      self.assertArrayNear(initial_precision, self.evaluate(p_obj.result()),
-                           1e-3)
+    a_obj = metrics.SparseTopKCategoricalAccuracy(
+        name='stopkca', dtype=dtypes.int32)
+    self.assertEqual(a_obj.name, 'stopkca')
+    self.assertEqual(a_obj._dtype, dtypes.int32)
+
+    a_obj2 = metrics.SparseTopKCategoricalAccuracy.from_config(
+        a_obj.get_config())
+    self.assertEqual(a_obj2.name, 'stopkca')
+    self.assertEqual(a_obj2._dtype, dtypes.int32)
+
+  def test_correctness(self):
+    a_obj = metrics.SparseTopKCategoricalAccuracy()
+    self.evaluate(variables.variables_initializer(a_obj.variables))
+    y_true = constant_op.constant([2, 1])
+    y_pred = constant_op.constant([[0.1, 0.9, 0.8], [0.05, 0.95, 0]])
+
+    result = a_obj(y_true, y_pred)
+    self.assertEqual(1, self.evaluate(result))  # both the samples match
+
+    # With `k` < 5.
+    a_obj = metrics.SparseTopKCategoricalAccuracy(k=1)
+    self.evaluate(variables.variables_initializer(a_obj.variables))
+    result = a_obj(y_true, y_pred)
+    self.assertEqual(0.5, self.evaluate(result))  # only sample #2 matches
+
+    # With `k` > 5.
+    y_pred = constant_op.constant([[0.5, 0.9, 0.1, 0.7, 0.6, 0.5, 0.4],
+                                   [0.05, 0.95, 0, 0, 0, 0, 0]])
+    a_obj = metrics.SparseTopKCategoricalAccuracy(k=6)
+    self.evaluate(variables.variables_initializer(a_obj.variables))
+    result = a_obj(y_true, y_pred)
+    self.assertEqual(0.5, self.evaluate(result))  # only 1 sample matches.
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class LogCoshErrorTest(test.TestCase):
+
+  def setup(self):
+    y_pred = np.asarray([1, 9, 2, -5, -2, 6]).reshape((2, 3))
+    y_true = np.asarray([4, 8, 12, 8, 1, 3]).reshape((2, 3))
+
+    self.batch_size = 6
+    error = y_pred - y_true
+    self.expected_results = np.log((np.exp(error) + np.exp(-error)) / 2)
+
+    self.y_pred = constant_op.constant(y_pred, dtype=dtypes.float32)
+    self.y_true = constant_op.constant(y_true)
+
+  def test_config(self):
+    logcosh_obj = metrics.LogCoshError(name='logcosh', dtype=dtypes.int32)
+    self.assertEqual(logcosh_obj.name, 'logcosh')
+    self.assertEqual(logcosh_obj._dtype, dtypes.int32)
 
   def test_unweighted(self):
-    p_obj = metrics.Precision()
-    y_pred = constant_op.constant([1, 0, 1, 0], shape=(1, 4))
-    y_true = constant_op.constant([0, 1, 1, 0], shape=(1, 4))
-    self.evaluate(variables.variables_initializer(p_obj.variables))
-    result = p_obj(y_true, y_pred)
-    self.assertAlmostEqual(0.5, self.evaluate(result))
-
-  def test_unweighted_all_incorrect(self):
-    p_obj = metrics.Precision(thresholds=[0.5])
-    inputs = np.random.randint(0, 2, size=(100, 1))
-    y_pred = constant_op.constant(inputs)
-    y_true = constant_op.constant(1 - inputs)
-    self.evaluate(variables.variables_initializer(p_obj.variables))
-    result = p_obj(y_true, y_pred)
-    self.assertAlmostEqual(0, self.evaluate(result))
+    self.setup()
+    logcosh_obj = metrics.LogCoshError()
+    self.evaluate(variables.variables_initializer(logcosh_obj.variables))
+
+    update_op = logcosh_obj.update_state(self.y_true, self.y_pred)
+    self.evaluate(update_op)
+    result = logcosh_obj.result()
+    expected_result = np.sum(self.expected_results) / self.batch_size
+    self.assertAllClose(result, expected_result, atol=1e-3)
 
   def test_weighted(self):
-    p_obj = metrics.Precision()
-    y_pred = constant_op.constant([[1, 0, 1, 0], [1, 0, 1, 0]])
-    y_true = constant_op.constant([[0, 1, 1, 0], [1, 0, 0, 1]])
-    self.evaluate(variables.variables_initializer(p_obj.variables))
-    result = p_obj(
-        y_true,
-        y_pred,
-        sample_weight=constant_op.constant([[1, 2, 3, 4], [4, 3, 2, 1]]))
-    weighted_tp = 3.0 + 4.0
-    weighted_positives = (1.0 + 3.0) + (4.0 + 2.0)
-    expected_precision = weighted_tp / weighted_positives
-    self.assertAlmostEqual(expected_precision, self.evaluate(result))
-
-  def test_div_by_zero(self):
-    p_obj = metrics.Precision()
-    y_pred = constant_op.constant([0, 0, 0, 0])
-    y_true = constant_op.constant([0, 0, 0, 0])
-    self.evaluate(variables.variables_initializer(p_obj.variables))
-    result = p_obj(y_true, y_pred)
-    self.assertEqual(0, self.evaluate(result))
-
-  def test_unweighted_with_threshold(self):
-    p_obj = metrics.Precision(thresholds=[0.5, 0.7])
-    y_pred = constant_op.constant([1, 0, 0.6, 0], shape=(1, 4))
-    y_true = constant_op.constant([0, 1, 1, 0], shape=(1, 4))
-    self.evaluate(variables.variables_initializer(p_obj.variables))
-    result = p_obj(y_true, y_pred)
-    self.assertArrayNear([0.5, 0.], self.evaluate(result), 0)
-
-  def test_weighted_with_threshold(self):
-    p_obj = metrics.Precision(thresholds=[0.5, 1.])
-    y_true = constant_op.constant([[0, 1], [1, 0]], shape=(2, 2))
-    y_pred = constant_op.constant([[1, 0], [0.6, 0]],
-                                  shape=(2, 2),
-                                  dtype=dtypes.float32)
-    weights = constant_op.constant([[4, 0], [3, 1]],
-                                   shape=(2, 2),
-                                   dtype=dtypes.float32)
-    self.evaluate(variables.variables_initializer(p_obj.variables))
-    result = p_obj(y_true, y_pred, sample_weight=weights)
-    weighted_tp = 0 + 3.
-    weighted_positives = (0 + 3.) + (4. + 0.)
-    expected_precision = weighted_tp / weighted_positives
-    self.assertArrayNear([expected_precision, 0], self.evaluate(result), 1e-3)
-
-  def test_multiple_updates(self):
-    p_obj = metrics.Precision(thresholds=[0.5, 1.])
-    y_true = constant_op.constant([[0, 1], [1, 0]], shape=(2, 2))
-    y_pred = constant_op.constant([[1, 0], [0.6, 0]],
-                                  shape=(2, 2),
-                                  dtype=dtypes.float32)
-    weights = constant_op.constant([[4, 0], [3, 1]],
-                                   shape=(2, 2),
-                                   dtype=dtypes.float32)
-    self.evaluate(variables.variables_initializer(p_obj.variables))
-    update_op = p_obj.update_state(y_true, y_pred, sample_weight=weights)
-    for _ in range(2):
-      self.evaluate(update_op)
+    self.setup()
+    logcosh_obj = metrics.LogCoshError()
+    self.evaluate(variables.variables_initializer(logcosh_obj.variables))
+    sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
+    result = logcosh_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
 
-    weighted_tp = (0 + 3.) + (0 + 3.)
-    weighted_positives = ((0 + 3.) + (4. + 0.)) + ((0 + 3.) + (4. + 0.))
-    expected_precision = weighted_tp / weighted_positives
-    self.assertArrayNear([expected_precision, 0], self.evaluate(p_obj.result()),
-                         1e-3)
+    sample_weight = np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape((2, 3))
+    expected_result = np.multiply(self.expected_results, sample_weight)
+    expected_result = np.sum(expected_result) / np.sum(sample_weight)
+    self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3)
 
-  def test_reset_states(self):
-    p_obj = metrics.Precision()
-    model = _get_simple_sequential_model([p_obj])
-    x = np.concatenate((np.ones((50, 4)), np.ones((50, 4))))
-    y = np.concatenate((np.ones((50, 1)), np.zeros((50, 1))))
-    model.evaluate(x, y)
-    self.assertEqual(self.evaluate(p_obj.tp), 50.)
-    self.assertEqual(self.evaluate(p_obj.fp), 50.)
-    model.evaluate(x, y)
-    self.assertEqual(self.evaluate(p_obj.tp), 50.)
-    self.assertEqual(self.evaluate(p_obj.fp), 50.)
+
+@test_util.run_all_in_graph_and_eager_modes
+class PoissonTest(test.TestCase):
+
+  def setup(self):
+    y_pred = np.asarray([1, 9, 2, 5, 2, 6]).reshape((2, 3))
+    y_true = np.asarray([4, 8, 12, 8, 1, 3]).reshape((2, 3))
+
+    self.batch_size = 6
+    self.expected_results = y_pred - np.multiply(y_true, np.log(y_pred))
+
+    self.y_pred = constant_op.constant(y_pred, dtype=dtypes.float32)
+    self.y_true = constant_op.constant(y_true)
+
+  def test_config(self):
+    poisson_obj = metrics.Poisson(name='poisson', dtype=dtypes.int32)
+    self.assertEqual(poisson_obj.name, 'poisson')
+    self.assertEqual(poisson_obj._dtype, dtypes.int32)
+
+    poisson_obj2 = metrics.Poisson.from_config(poisson_obj.get_config())
+    self.assertEqual(poisson_obj2.name, 'poisson')
+    self.assertEqual(poisson_obj2._dtype, dtypes.int32)
+
+  def test_unweighted(self):
+    self.setup()
+    poisson_obj = metrics.Poisson()
+    self.evaluate(variables.variables_initializer(poisson_obj.variables))
+
+    update_op = poisson_obj.update_state(self.y_true, self.y_pred)
+    self.evaluate(update_op)
+    result = poisson_obj.result()
+    expected_result = np.sum(self.expected_results) / self.batch_size
+    self.assertAllClose(result, expected_result, atol=1e-3)
+
+  def test_weighted(self):
+    self.setup()
+    poisson_obj = metrics.Poisson()
+    self.evaluate(variables.variables_initializer(poisson_obj.variables))
+    sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
+
+    result = poisson_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
+    sample_weight = np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape((2, 3))
+    expected_result = np.multiply(self.expected_results, sample_weight)
+    expected_result = np.sum(expected_result) / np.sum(sample_weight)
+    self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3)
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class RecallTest(test.TestCase):
+class KLDivergenceTest(test.TestCase):
+
+  def setup(self):
+    y_pred = np.asarray([.4, .9, .12, .36, .3, .4]).reshape((2, 3))
+    y_true = np.asarray([.5, .8, .12, .7, .43, .8]).reshape((2, 3))
+
+    self.batch_size = 2
+    self.expected_results = np.multiply(y_true, np.log(y_true / y_pred))
+
+    self.y_pred = constant_op.constant(y_pred, dtype=dtypes.float32)
+    self.y_true = constant_op.constant(y_true)
 
   def test_config(self):
-    r_obj = metrics.Recall(name='my_recall', thresholds=[0.4, 0.9])
-    self.assertEqual(r_obj.name, 'my_recall')
-    self.assertLen(r_obj.variables, 2)
-    self.assertEqual([v.name for v in r_obj.variables],
-                     ['true_positives:0', 'false_negatives:0'])
-    self.assertEqual(r_obj.thresholds, [0.4, 0.9])
-
-  def test_value_is_idempotent(self):
-    r_obj = metrics.Recall(thresholds=[0.3, 0.72])
-    y_pred = random_ops.random_uniform(shape=(10, 3))
-    y_true = random_ops.random_uniform(shape=(10, 3))
-    update_op = r_obj.update_state(y_true, y_pred)
-    self.evaluate(variables.variables_initializer(r_obj.variables))
-
-    # Run several updates.
-    for _ in range(10):
-      self.evaluate(update_op)
-
-    # Then verify idempotency.
-    initial_recall = self.evaluate(r_obj.result())
-    for _ in range(10):
-      self.assertArrayNear(initial_recall, self.evaluate(r_obj.result()), 1e-3)
+    k_obj = metrics.KLDivergence(name='kld', dtype=dtypes.int32)
+    self.assertEqual(k_obj.name, 'kld')
+    self.assertEqual(k_obj._dtype, dtypes.int32)
+
+    k_obj2 = metrics.KLDivergence.from_config(k_obj.get_config())
+    self.assertEqual(k_obj2.name, 'kld')
+    self.assertEqual(k_obj2._dtype, dtypes.int32)
 
   def test_unweighted(self):
-    r_obj = metrics.Recall()
-    y_pred = constant_op.constant([1, 0, 1, 0], shape=(1, 4))
-    y_true = constant_op.constant([0, 1, 1, 0], shape=(1, 4))
-    self.evaluate(variables.variables_initializer(r_obj.variables))
-    result = r_obj(y_true, y_pred)
-    self.assertAlmostEqual(0.5, self.evaluate(result))
-
-  def test_unweighted_all_incorrect(self):
-    r_obj = metrics.Recall(thresholds=[0.5])
-    inputs = np.random.randint(0, 2, size=(100, 1))
-    y_pred = constant_op.constant(inputs)
-    y_true = constant_op.constant(1 - inputs)
-    self.evaluate(variables.variables_initializer(r_obj.variables))
-    result = r_obj(y_true, y_pred)
-    self.assertAlmostEqual(0, self.evaluate(result))
+    self.setup()
+    k_obj = metrics.KLDivergence()
+    self.evaluate(variables.variables_initializer(k_obj.variables))
+
+    update_op = k_obj.update_state(self.y_true, self.y_pred)
+    self.evaluate(update_op)
+    result = k_obj.result()
+    expected_result = np.sum(self.expected_results) / self.batch_size
+    self.assertAllClose(result, expected_result, atol=1e-3)
 
   def test_weighted(self):
-    r_obj = metrics.Recall()
-    y_pred = constant_op.constant([[1, 0, 1, 0], [0, 1, 0, 1]])
-    y_true = constant_op.constant([[0, 1, 1, 0], [1, 0, 0, 1]])
-    self.evaluate(variables.variables_initializer(r_obj.variables))
-    result = r_obj(
-        y_true,
-        y_pred,
-        sample_weight=constant_op.constant([[1, 2, 3, 4], [4, 3, 2, 1]]))
-    weighted_tp = 3.0 + 1.0
-    weighted_t = (2.0 + 3.0) + (4.0 + 1.0)
-    expected_recall = weighted_tp / weighted_t
-    self.assertAlmostEqual(expected_recall, self.evaluate(result))
-
-  def test_div_by_zero(self):
-    r_obj = metrics.Recall()
-    y_pred = constant_op.constant([0, 0, 0, 0])
-    y_true = constant_op.constant([0, 0, 0, 0])
-    self.evaluate(variables.variables_initializer(r_obj.variables))
-    result = r_obj(y_true, y_pred)
-    self.assertEqual(0, self.evaluate(result))
-
-  def test_unweighted_with_threshold(self):
-    r_obj = metrics.Recall(thresholds=[0.5, 0.7])
-    y_pred = constant_op.constant([1, 0, 0.6, 0], shape=(1, 4))
-    y_true = constant_op.constant([0, 1, 1, 0], shape=(1, 4))
-    self.evaluate(variables.variables_initializer(r_obj.variables))
-    result = r_obj(y_true, y_pred)
-    self.assertArrayNear([0.5, 0.], self.evaluate(result), 0)
-
-  def test_weighted_with_threshold(self):
-    r_obj = metrics.Recall(thresholds=[0.5, 1.])
-    y_true = constant_op.constant([[0, 1], [1, 0]], shape=(2, 2))
-    y_pred = constant_op.constant([[1, 0], [0.6, 0]],
-                                  shape=(2, 2),
-                                  dtype=dtypes.float32)
-    weights = constant_op.constant([[1, 4], [3, 2]],
-                                   shape=(2, 2),
-                                   dtype=dtypes.float32)
-    self.evaluate(variables.variables_initializer(r_obj.variables))
-    result = r_obj(y_true, y_pred, sample_weight=weights)
-    weighted_tp = 0 + 3.
-    weighted_positives = (0 + 3.) + (4. + 0.)
-    expected_recall = weighted_tp / weighted_positives
-    self.assertArrayNear([expected_recall, 0], self.evaluate(result), 1e-3)
-
-  def test_multiple_updates(self):
-    r_obj = metrics.Recall(thresholds=[0.5, 1.])
-    y_true = constant_op.constant([[0, 1], [1, 0]], shape=(2, 2))
-    y_pred = constant_op.constant([[1, 0], [0.6, 0]],
-                                  shape=(2, 2),
-                                  dtype=dtypes.float32)
-    weights = constant_op.constant([[1, 4], [3, 2]],
-                                   shape=(2, 2),
-                                   dtype=dtypes.float32)
-    self.evaluate(variables.variables_initializer(r_obj.variables))
-    update_op = r_obj.update_state(y_true, y_pred, sample_weight=weights)
-    for _ in range(2):
-      self.evaluate(update_op)
+    self.setup()
+    k_obj = metrics.KLDivergence()
+    self.evaluate(variables.variables_initializer(k_obj.variables))
 
-    weighted_tp = (0 + 3.) + (0 + 3.)
-    weighted_positives = ((0 + 3.) + (4. + 0.)) + ((0 + 3.) + (4. + 0.))
-    expected_recall = weighted_tp / weighted_positives
-    self.assertArrayNear([expected_recall, 0], self.evaluate(r_obj.result()),
-                         1e-3)
+    sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
+    result = k_obj(self.y_true, self.y_pred, sample_weight=sample_weight)
 
-  def test_reset_states(self):
-    r_obj = metrics.Recall()
-    model = _get_simple_sequential_model([r_obj])
-    x = np.concatenate((np.ones((50, 4)), np.zeros((50, 4))))
-    y = np.concatenate((np.ones((50, 1)), np.ones((50, 1))))
-    model.evaluate(x, y)
-    self.assertEqual(self.evaluate(r_obj.tp), 50.)
-    self.assertEqual(self.evaluate(r_obj.fn), 50.)
-    model.evaluate(x, y)
-    self.assertEqual(self.evaluate(r_obj.tp), 50.)
-    self.assertEqual(self.evaluate(r_obj.fn), 50.)
+    sample_weight = np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape((2, 3))
+    expected_result = np.multiply(self.expected_results, sample_weight)
+    expected_result = np.sum(expected_result) / (1.2 + 3.4)
+    self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3)
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class SensitivityAtSpecificityTest(test.TestCase, parameterized.TestCase):
+class MeanRelativeErrorTest(test.TestCase):
 
   def test_config(self):
-    s_obj = metrics.SensitivityAtSpecificity(
-        0.4, num_thresholds=100, name='sensitivity_at_specificity_1')
-    self.assertEqual(s_obj.name, 'sensitivity_at_specificity_1')
-    self.assertLen(s_obj.variables, 4)
-    self.assertEqual(s_obj.value, 0.4)
-    self.assertLen(s_obj.thresholds, 100)
-
-  def test_value_is_idempotent(self):
-    s_obj = metrics.SensitivityAtSpecificity(0.7)
-    y_pred = random_ops.random_uniform((10, 3),
-                                       maxval=1,
-                                       dtype=dtypes.float32,
-                                       seed=1)
-    y_true = random_ops.random_uniform((10, 3),
-                                       maxval=2,
-                                       dtype=dtypes.int64,
-                                       seed=1)
-    update_op = s_obj.update_state(y_true, y_pred)
-    self.evaluate(variables.variables_initializer(s_obj.variables))
-
-    # Run several updates.
-    for _ in range(10):
-      self.evaluate(update_op)
-
-    # Then verify idempotency.
-    initial_sensitivity = self.evaluate(s_obj.result())
-    for _ in range(10):
-      self.assertAlmostEqual(initial_sensitivity, self.evaluate(s_obj.result()),
-                             1e-3)
-
-  def test_unweighted_all_correct(self):
-    s_obj = metrics.SensitivityAtSpecificity(0.7)
-    inputs = np.random.randint(0, 2, size=(100, 1))
-    y_pred = constant_op.constant(inputs, dtype=dtypes.float32)
-    y_true = constant_op.constant(inputs)
-    self.evaluate(variables.variables_initializer(s_obj.variables))
-    result = s_obj(y_true, y_pred)
-    self.assertAlmostEqual(1, self.evaluate(result))
-
-  def test_unweighted_high_specificity(self):
-    s_obj = metrics.SensitivityAtSpecificity(0.8)
-    pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.1, 0.45, 0.5, 0.8, 0.9]
-    label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
-
-    y_pred = constant_op.constant(pred_values, dtype=dtypes.float32)
-    y_true = constant_op.constant(label_values)
-    self.evaluate(variables.variables_initializer(s_obj.variables))
-    result = s_obj(y_true, y_pred)
-    self.assertAlmostEqual(0.8, self.evaluate(result))
-
-  def test_unweighted_low_specificity(self):
-    s_obj = metrics.SensitivityAtSpecificity(0.4)
-    pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26]
-    label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
-
-    y_pred = constant_op.constant(pred_values, dtype=dtypes.float32)
-    y_true = constant_op.constant(label_values)
-    self.evaluate(variables.variables_initializer(s_obj.variables))
-    result = s_obj(y_true, y_pred)
-    self.assertAlmostEqual(0.6, self.evaluate(result))
-
-  @parameterized.parameters([dtypes.bool, dtypes.int32, dtypes.float32])
-  def test_weighted(self, label_dtype):
-    s_obj = metrics.SensitivityAtSpecificity(0.4)
-    pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26]
-    label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
-    weight_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-
-    y_pred = constant_op.constant(pred_values, dtype=dtypes.float32)
-    y_true = math_ops.cast(label_values, dtype=label_dtype)
-    weights = constant_op.constant(weight_values)
-    self.evaluate(variables.variables_initializer(s_obj.variables))
-    result = s_obj(y_true, y_pred, sample_weight=weights)
-    self.assertAlmostEqual(0.675, self.evaluate(result))
-
-  def test_invalid_specificity(self):
-    with self.assertRaisesRegexp(
-        ValueError, r'`specificity` must be in the range \[0, 1\].'):
-      metrics.SensitivityAtSpecificity(-1)
+    normalizer = constant_op.constant([1, 3], dtype=dtypes.float32)
+    mre_obj = metrics.MeanRelativeError(normalizer=normalizer, name='mre')
+    self.assertEqual(mre_obj.name, 'mre')
+    self.assertArrayNear(self.evaluate(mre_obj.normalizer), [1, 3], 1e-1)
 
-  def test_invalid_num_thresholds(self):
-    with self.assertRaisesRegexp(ValueError, '`num_thresholds` must be > 0.'):
-      metrics.SensitivityAtSpecificity(0.4, num_thresholds=-1)
+    mre_obj2 = metrics.MeanRelativeError.from_config(mre_obj.get_config())
+    self.assertEqual(mre_obj2.name, 'mre')
+    self.assertArrayNear(self.evaluate(mre_obj2.normalizer), [1, 3], 1e-1)
 
-  def test_reset_states(self):
-    s_obj = metrics.SensitivityAtSpecificity(0.5, num_thresholds=1)
-    model = _get_simple_sequential_model([s_obj])
-    x = np.concatenate((np.ones((25, 4)), np.zeros((25, 4)), np.zeros((25, 4)),
-                        np.ones((25, 4))))
-    y = np.concatenate((np.ones((25, 1)), np.zeros((25, 1)), np.ones((25, 1)),
-                        np.zeros((25, 1))))
-    model.evaluate(x, y)
-    self.assertEqual(self.evaluate(s_obj.tp), 25.)
-    self.assertEqual(self.evaluate(s_obj.fp), 25.)
-    self.assertEqual(self.evaluate(s_obj.fn), 25.)
-    self.assertEqual(self.evaluate(s_obj.tn), 25.)
-    model.evaluate(x, y)
-    self.assertEqual(self.evaluate(s_obj.tp), 25.)
-    self.assertEqual(self.evaluate(s_obj.fp), 25.)
-    self.assertEqual(self.evaluate(s_obj.fn), 25.)
-    self.assertEqual(self.evaluate(s_obj.tn), 25.)
+  def test_unweighted(self):
+    np_y_pred = np.asarray([2, 4, 6, 8], dtype=np.float32)
+    np_y_true = np.asarray([1, 3, 2, 3], dtype=np.float32)
+    expected_error = np.mean(
+        np.divide(np.absolute(np_y_pred - np_y_true), np_y_true))
+
+    y_pred = constant_op.constant(np_y_pred, shape=(1, 4), dtype=dtypes.float32)
+    y_true = constant_op.constant(np_y_true, shape=(1, 4))
+
+    mre_obj = metrics.MeanRelativeError(normalizer=y_true)
+    self.evaluate(variables.variables_initializer(mre_obj.variables))
+
+    result = mre_obj(y_true, y_pred)
+    self.assertAllClose(self.evaluate(result), expected_error, atol=1e-3)
+
+  def test_weighted(self):
+    np_y_pred = np.asarray([2, 4, 6, 8], dtype=np.float32)
+    np_y_true = np.asarray([1, 3, 2, 3], dtype=np.float32)
+    sample_weight = np.asarray([0.2, 0.3, 0.5, 0], dtype=np.float32)
+    rel_errors = np.divide(np.absolute(np_y_pred - np_y_true), np_y_true)
+    expected_error = np.sum(rel_errors * sample_weight)
+
+    y_pred = constant_op.constant(np_y_pred, dtype=dtypes.float32)
+    y_true = constant_op.constant(np_y_true)
+
+    mre_obj = metrics.MeanRelativeError(normalizer=y_true)
+    self.evaluate(variables.variables_initializer(mre_obj.variables))
+
+    result = mre_obj(
+        y_true, y_pred, sample_weight=constant_op.constant(sample_weight))
+    self.assertAllClose(self.evaluate(result), expected_error, atol=1e-3)
+
+  def test_zero_normalizer(self):
+    y_pred = constant_op.constant([2, 4], dtype=dtypes.float32)
+    y_true = constant_op.constant([1, 3])
+
+    mre_obj = metrics.MeanRelativeError(normalizer=array_ops.zeros_like(y_true))
+    self.evaluate(variables.variables_initializer(mre_obj.variables))
+
+    result = mre_obj(y_true, y_pred)
+    self.assertEqual(self.evaluate(result), 0)
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class SpecificityAtSensitivityTest(test.TestCase, parameterized.TestCase):
+class MeanIoUTest(test.TestCase):
 
   def test_config(self):
-    s_obj = metrics.SpecificityAtSensitivity(
-        0.4, num_thresholds=100, name='specificity_at_sensitivity_1')
-    self.assertEqual(s_obj.name, 'specificity_at_sensitivity_1')
-    self.assertLen(s_obj.variables, 4)
-    self.assertEqual(s_obj.value, 0.4)
-    self.assertLen(s_obj.thresholds, 100)
-
-  def test_value_is_idempotent(self):
-    s_obj = metrics.SpecificityAtSensitivity(0.7)
-    y_pred = random_ops.random_uniform((10, 3),
-                                       maxval=1,
-                                       dtype=dtypes.float32,
-                                       seed=1)
-    y_true = random_ops.random_uniform((10, 3),
-                                       maxval=2,
-                                       dtype=dtypes.int64,
-                                       seed=1)
-    update_op = s_obj.update_state(y_true, y_pred)
-    self.evaluate(variables.variables_initializer(s_obj.variables))
-
-    # Run several updates.
-    for _ in range(10):
-      self.evaluate(update_op)
-
-    # Then verify idempotency.
-    initial_specificity = self.evaluate(s_obj.result())
-    for _ in range(10):
-      self.assertAlmostEqual(initial_specificity, self.evaluate(s_obj.result()),
-                             1e-3)
-
-  def test_unweighted_all_correct(self):
-    s_obj = metrics.SpecificityAtSensitivity(0.7)
-    inputs = np.random.randint(0, 2, size=(100, 1))
-    y_pred = constant_op.constant(inputs, dtype=dtypes.float32)
-    y_true = constant_op.constant(inputs)
-    self.evaluate(variables.variables_initializer(s_obj.variables))
-    result = s_obj(y_true, y_pred)
-    self.assertAlmostEqual(1, self.evaluate(result))
-
-  def test_unweighted_high_sensitivity(self):
-    s_obj = metrics.SpecificityAtSensitivity(0.8)
-    pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.1, 0.45, 0.5, 0.8, 0.9]
-    label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
-
-    y_pred = constant_op.constant(pred_values, dtype=dtypes.float32)
-    y_true = constant_op.constant(label_values)
-    self.evaluate(variables.variables_initializer(s_obj.variables))
-    result = s_obj(y_true, y_pred)
-    self.assertAlmostEqual(0.4, self.evaluate(result))
-
-  def test_unweighted_low_sensitivity(self):
-    s_obj = metrics.SpecificityAtSensitivity(0.4)
-    pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26]
-    label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
-
-    y_pred = constant_op.constant(pred_values, dtype=dtypes.float32)
-    y_true = constant_op.constant(label_values)
-    self.evaluate(variables.variables_initializer(s_obj.variables))
-    result = s_obj(y_true, y_pred)
-    self.assertAlmostEqual(0.6, self.evaluate(result))
-
-  @parameterized.parameters([dtypes.bool, dtypes.int32, dtypes.float32])
-  def test_weighted(self, label_dtype):
-    s_obj = metrics.SpecificityAtSensitivity(0.4)
-    pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26]
-    label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
-    weight_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-
-    y_pred = constant_op.constant(pred_values, dtype=dtypes.float32)
-    y_true = math_ops.cast(label_values, dtype=label_dtype)
-    weights = constant_op.constant(weight_values)
-    self.evaluate(variables.variables_initializer(s_obj.variables))
-    result = s_obj(y_true, y_pred, sample_weight=weights)
-    self.assertAlmostEqual(0.4, self.evaluate(result))
-
-  def test_invalid_sensitivity(self):
+    m_obj = metrics.MeanIoU(num_classes=2, name='mean_iou')
+    self.assertEqual(m_obj.name, 'mean_iou')
+    self.assertEqual(m_obj.num_classes, 2)
+
+    m_obj2 = metrics.MeanIoU.from_config(m_obj.get_config())
+    self.assertEqual(m_obj2.name, 'mean_iou')
+    self.assertEqual(m_obj2.num_classes, 2)
+
+  def test_unweighted(self):
+    y_pred = constant_op.constant([0, 1, 0, 1], dtype=dtypes.float32)
+    y_true = constant_op.constant([0, 0, 1, 1])
+
+    m_obj = metrics.MeanIoU(num_classes=2)
+    self.evaluate(variables.variables_initializer(m_obj.variables))
+
+    result = m_obj(y_true, y_pred)
+
+    # cm = [[1, 1],
+    #       [1, 1]]
+    # sum_row = [2, 2], sum_col = [2, 2], true_positives = [1, 1]
+    # iou = true_positives / (sum_row + sum_col - true_positives))
+    expected_result = (1 / (2 + 2 - 1) + 1 / (2 + 2 - 1)) / 2
+    self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3)
+
+  def test_weighted(self):
+    y_pred = constant_op.constant([0, 1, 0, 1], dtype=dtypes.float32)
+    y_true = constant_op.constant([0, 0, 1, 1])
+    sample_weight = constant_op.constant([0.2, 0.3, 0.4, 0.1])
+
+    m_obj = metrics.MeanIoU(num_classes=2)
+    self.evaluate(variables.variables_initializer(m_obj.variables))
+
+    result = m_obj(y_true, y_pred, sample_weight=sample_weight)
+
+    # cm = [[0.2, 0.3],
+    #       [0.4, 0.1]]
+    # sum_row = [0.6, 0.4], sum_col = [0.5, 0.5], true_positives = [0.2, 0.1]
+    # iou = true_positives / (sum_row + sum_col - true_positives))
+    expected_result = (0.2 / (0.6 + 0.5 - 0.2) + 0.1 / (0.4 + 0.5 - 0.1)) / 2
+    self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3)
+
+  def test_multi_dim_input(self):
+    y_pred = constant_op.constant([[0, 1], [0, 1]], dtype=dtypes.float32)
+    y_true = constant_op.constant([[0, 0], [1, 1]])
+    sample_weight = constant_op.constant([[0.2, 0.3], [0.4, 0.1]])
+
+    m_obj = metrics.MeanIoU(num_classes=2)
+    self.evaluate(variables.variables_initializer(m_obj.variables))
+
+    result = m_obj(y_true, y_pred, sample_weight=sample_weight)
+
+    # cm = [[0.2, 0.3],
+    #       [0.4, 0.1]]
+    # sum_row = [0.6, 0.4], sum_col = [0.5, 0.5], true_positives = [0.2, 0.1]
+    # iou = true_positives / (sum_row + sum_col - true_positives))
+    expected_result = (0.2 / (0.6 + 0.5 - 0.2) + 0.1 / (0.4 + 0.5 - 0.1)) / 2
+    self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3)
+
+  def test_zero_valid_entries(self):
+    m_obj = metrics.MeanIoU(num_classes=2)
+    self.evaluate(variables.variables_initializer(m_obj.variables))
+    self.assertAllClose(self.evaluate(m_obj.result()), 0, atol=1e-3)
+
+  def test_zero_and_non_zero_entries(self):
+    y_pred = constant_op.constant([1], dtype=dtypes.float32)
+    y_true = constant_op.constant([1])
+
+    m_obj = metrics.MeanIoU(num_classes=2)
+    self.evaluate(variables.variables_initializer(m_obj.variables))
+    result = m_obj(y_true, y_pred)
+
+    # cm = [[0, 0],
+    #       [0, 1]]
+    # sum_row = [0, 1], sum_col = [0, 1], true_positives = [0, 1]
+    # iou = true_positives / (sum_row + sum_col - true_positives))
+    expected_result = (0 + 1 / (1 + 1 - 1)) / 1
+    self.assertAllClose(self.evaluate(result), expected_result, atol=1e-3)
+
+
+class MeanTensorTest(keras_parameterized.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_config(self):
+    m = metrics.MeanTensor(name='mean_by_element')
+
+    # check config
+    self.assertEqual(m.name, 'mean_by_element')
+    self.assertTrue(m.stateful)
+    self.assertEqual(m.dtype, dtypes.float32)
+    self.assertEqual(len(m.variables), 0)
+
+    with self.assertRaisesRegexp(ValueError, 'does not have any result yet'):
+      m.result()
+
+    self.evaluate(m([[3], [5], [3]]))
+    self.assertAllEqual(m._shape, [3, 1])
+
+    m2 = metrics.MeanTensor.from_config(m.get_config())
+    self.assertEqual(m2.name, 'mean_by_element')
+    self.assertTrue(m2.stateful)
+    self.assertEqual(m2.dtype, dtypes.float32)
+    self.assertEqual(len(m2.variables), 0)
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_unweighted(self):
+    m = metrics.MeanTensor(dtype=dtypes.float64)
+
+    # check __call__()
+    self.assertAllClose(self.evaluate(m([100, 40])), [100, 40])
+    self.assertAllClose(self.evaluate(m.total), [100, 40])
+    self.assertAllClose(self.evaluate(m.count), [1, 1])
+
+    # check update_state() and result() + state accumulation + tensor input
+    update_op = m.update_state(ops.convert_n_to_tensor([1, 5]))
+    self.evaluate(update_op)
+    self.assertAllClose(self.evaluate(m.result()), [50.5, 22.5])
+    self.assertAllClose(self.evaluate(m.total), [101, 45])
+    self.assertAllClose(self.evaluate(m.count), [2, 2])
+
+    # check reset_states()
+    m.reset_states()
+    self.assertAllClose(self.evaluate(m.total), [0, 0])
+    self.assertAllClose(self.evaluate(m.count), [0, 0])
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_weighted(self):
+    m = metrics.MeanTensor(dtype=dtypes.float64)
+    self.assertEqual(m.dtype, dtypes.float64)
+
+    # check scalar weight
+    result_t = m([100, 30], sample_weight=0.5)
+    self.assertAllClose(self.evaluate(result_t), [100, 30])
+    self.assertAllClose(self.evaluate(m.total), [50, 15])
+    self.assertAllClose(self.evaluate(m.count), [0.5, 0.5])
+
+    # check weights not scalar and weights rank matches values rank
+    result_t = m([1, 5], sample_weight=[1, 0.2])
+    result = self.evaluate(result_t)
+    self.assertAllClose(result, [51 / 1.5, 16 / 0.7], 2)
+    self.assertAllClose(self.evaluate(m.total), [51, 16])
+    self.assertAllClose(self.evaluate(m.count), [1.5, 0.7])
+
+    # check weights broadcast
+    result_t = m([1, 2], sample_weight=0.5)
+    self.assertAllClose(self.evaluate(result_t), [51.5 / 2, 17 / 1.2])
+    self.assertAllClose(self.evaluate(m.total), [51.5, 17])
+    self.assertAllClose(self.evaluate(m.count), [2, 1.2])
+
+    # check weights squeeze
+    result_t = m([1, 5], sample_weight=[[1], [0.2]])
+    self.assertAllClose(self.evaluate(result_t), [52.5 / 3, 18 / 1.4])
+    self.assertAllClose(self.evaluate(m.total), [52.5, 18])
+    self.assertAllClose(self.evaluate(m.count), [3, 1.4])
+
+    # check weights expand
+    m = metrics.MeanTensor((2, 1), dtype=dtypes.float64)
+    self.evaluate(variables.variables_initializer(m.variables))
+    result_t = m([[1], [5]], sample_weight=[1, 0.2])
+    self.assertAllClose(self.evaluate(result_t), [[1], [5]])
+    self.assertAllClose(self.evaluate(m.total), [[1], [1]])
+    self.assertAllClose(self.evaluate(m.count), [[1], [0.2]])
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_invalid_value_shape(self):
+    m = metrics.MeanTensor(dtype=dtypes.float64)
+    m([1])
     with self.assertRaisesRegexp(
-        ValueError, r'`sensitivity` must be in the range \[0, 1\].'):
-      metrics.SpecificityAtSensitivity(-1)
+        ValueError, 'MeanTensor input values must always have the same shape'):
+      m([1, 5])
 
-  def test_invalid_num_thresholds(self):
-    with self.assertRaisesRegexp(ValueError, '`num_thresholds` must be > 0.'):
-      metrics.SpecificityAtSensitivity(0.4, num_thresholds=-1)
+  @test_util.run_in_graph_and_eager_modes
+  def test_build_in_tf_function(self):
+    """Ensure that variables are created correctly in a tf function."""
+    m = metrics.MeanTensor(dtype=dtypes.float64)
 
-  def test_reset_states(self):
-    s_obj = metrics.SpecificityAtSensitivity(0.5, num_thresholds=1)
-    model = _get_simple_sequential_model([s_obj])
-    x = np.concatenate((np.ones((25, 4)), np.zeros((25, 4)), np.zeros((25, 4)),
-                        np.ones((25, 4))))
-    y = np.concatenate((np.ones((25, 1)), np.zeros((25, 1)), np.ones((25, 1)),
-                        np.zeros((25, 1))))
-    model.evaluate(x, y)
-    self.assertEqual(self.evaluate(s_obj.tp), 25.)
-    self.assertEqual(self.evaluate(s_obj.fp), 25.)
-    self.assertEqual(self.evaluate(s_obj.fn), 25.)
-    self.assertEqual(self.evaluate(s_obj.tn), 25.)
-    model.evaluate(x, y)
-    self.assertEqual(self.evaluate(s_obj.tp), 25.)
-    self.assertEqual(self.evaluate(s_obj.fp), 25.)
-    self.assertEqual(self.evaluate(s_obj.fn), 25.)
-    self.assertEqual(self.evaluate(s_obj.tn), 25.)
+    @eager_function.defun
+    def call_metric(x):
+      return m(x)
+
+    self.assertAllClose(self.evaluate(call_metric([100, 40])), [100, 40])
+    self.assertAllClose(self.evaluate(m.total), [100, 40])
+    self.assertAllClose(self.evaluate(m.count), [1, 1])
+    self.assertAllClose(self.evaluate(call_metric([20, 2])), [60, 21])
+
+  def test_in_keras_model(self):
+    with context.eager_mode():
+      class ModelWithMetric(Model):
+
+        def __init__(self):
+          super(ModelWithMetric, self).__init__()
+          self.dense1 = layers.Dense(
+              3, activation='relu', kernel_initializer='ones')
+          self.dense2 = layers.Dense(
+              1, activation='sigmoid', kernel_initializer='ones')
+          self.mean_tensor = metrics.MeanTensor()
+
+        def call(self, x):
+          x = self.dense1(x)
+          x = self.dense2(x)
+          self.mean_tensor(self.dense1.kernel)
+          return x
+
+      model = ModelWithMetric()
+      model.compile(
+          loss='mae',
+          optimizer='rmsprop',
+          run_eagerly=True)
+
+      x = np.ones((100, 4))
+      y = np.zeros((100, 1))
+      model.evaluate(x, y, batch_size=50)
+      self.assertAllClose(self.evaluate(model.mean_tensor.result()),
+                          np.ones((4, 3)))
+      self.assertAllClose(self.evaluate(model.mean_tensor.total),
+                          np.full((4, 3), 2))
+      self.assertAllClose(self.evaluate(model.mean_tensor.count),
+                          np.full((4, 3), 2))
+
+      model.evaluate(x, y, batch_size=25)
+      self.assertAllClose(self.evaluate(model.mean_tensor.result()),
+                          np.ones((4, 3)))
+      self.assertAllClose(self.evaluate(model.mean_tensor.total),
+                          np.full((4, 3), 4))
+      self.assertAllClose(self.evaluate(model.mean_tensor.count),
+                          np.full((4, 3), 4))
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class CosineProximityTest(test.TestCase):
+class BinaryCrossentropyTest(test.TestCase):
 
   def test_config(self):
-    cosine_obj = metrics.CosineProximity(name='my_cos', dtype=dtypes.int32)
-    self.assertEqual(cosine_obj.name, 'my_cos')
-    self.assertEqual(cosine_obj._dtype, dtypes.int32)
+    bce_obj = metrics.BinaryCrossentropy(
+        name='bce', dtype=dtypes.int32, label_smoothing=0.2)
+    self.assertEqual(bce_obj.name, 'bce')
+    self.assertEqual(bce_obj._dtype, dtypes.int32)
+
+    old_config = bce_obj.get_config()
+    self.assertAllClose(old_config['label_smoothing'], 0.2, 1e-3)
+
+    # Check save and restore config
+    bce_obj2 = metrics.BinaryCrossentropy.from_config(old_config)
+    self.assertEqual(bce_obj2.name, 'bce')
+    self.assertEqual(bce_obj2._dtype, dtypes.int32)
+    new_config = bce_obj2.get_config()
+    self.assertDictEqual(old_config, new_config)
 
   def test_unweighted(self):
-    cosine_obj = metrics.CosineProximity()
-    self.evaluate(variables.variables_initializer(cosine_obj.variables))
+    bce_obj = metrics.BinaryCrossentropy()
+    self.evaluate(variables.variables_initializer(bce_obj.variables))
+    y_true = np.asarray([1, 0, 1, 0]).reshape([2, 2])
+    y_pred = np.asarray([1, 1, 1, 0], dtype=np.float32).reshape([2, 2])
+    result = bce_obj(y_true, y_pred)
+
+    # EPSILON = 1e-7, y = y_true, y` = y_pred, Y_MAX = 0.9999999
+    # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON)
+    # y` = [Y_MAX, Y_MAX, Y_MAX, EPSILON]
+
+    # Metric = -(y log(y` + EPSILON) + (1 - y) log(1 - y` + EPSILON))
+    #        = [-log(Y_MAX + EPSILON), -log(1 - Y_MAX + EPSILON),
+    #           -log(Y_MAX + EPSILON), -log(1)]
+    #        = [(0 + 15.33) / 2, (0 + 0) / 2]
+    # Reduced metric = 7.665 / 2
+
+    self.assertAllClose(self.evaluate(result), 3.833, atol=1e-3)
+
+  def test_unweighted_with_logits(self):
+    bce_obj = metrics.BinaryCrossentropy(from_logits=True)
+    self.evaluate(variables.variables_initializer(bce_obj.variables))
+
+    y_true = constant_op.constant([[1, 0, 1], [0, 1, 1]])
+    y_pred = constant_op.constant([[100.0, -100.0, 100.0],
+                                   [100.0, 100.0, -100.0]])
+    result = bce_obj(y_true, y_pred)
+
+    # Metric = max(x, 0) - x * z + log(1 + exp(-abs(x)))
+    #              (where x = logits and z = y_true)
+    #        = [((100 - 100 * 1 + log(1 + exp(-100))) +
+    #            (0 + 100 * 0 + log(1 + exp(-100))) +
+    #            (100 - 100 * 1 + log(1 + exp(-100))),
+    #           ((100 - 100 * 0 + log(1 + exp(-100))) +
+    #            (100 - 100 * 1 + log(1 + exp(-100))) +
+    #            (0 + 100 * 1 + log(1 + exp(-100))))]
+    #        = [(0 + 0 + 0) / 3, 200 / 3]
+    # Reduced metric = (0 + 66.666) / 2
+
+    self.assertAllClose(self.evaluate(result), 33.333, atol=1e-3)
 
-    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
-                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
-    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
-                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+  def test_weighted(self):
+    bce_obj = metrics.BinaryCrossentropy()
+    self.evaluate(variables.variables_initializer(bce_obj.variables))
+    y_true = np.asarray([1, 0, 1, 0]).reshape([2, 2])
+    y_pred = np.asarray([1, 1, 1, 0], dtype=np.float32).reshape([2, 2])
+    sample_weight = constant_op.constant([1.5, 2.])
+    result = bce_obj(y_true, y_pred, sample_weight=sample_weight)
+
+    # EPSILON = 1e-7, y = y_true, y` = y_pred, Y_MAX = 0.9999999
+    # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON)
+    # y` = [Y_MAX, Y_MAX, Y_MAX, EPSILON]
+
+    # Metric = -(y log(y` + EPSILON) + (1 - y) log(1 - y` + EPSILON))
+    #        = [-log(Y_MAX + EPSILON), -log(1 - Y_MAX + EPSILON),
+    #           -log(Y_MAX + EPSILON), -log(1)]
+    #        = [(0 + 15.33) / 2, (0 + 0) / 2]
+    # Weighted metric = [7.665 * 1.5, 0]
+    # Reduced metric = 7.665 * 1.5 / (1.5 + 2)
+
+    self.assertAllClose(self.evaluate(result), 3.285, atol=1e-3)
+
+  def test_weighted_from_logits(self):
+    bce_obj = metrics.BinaryCrossentropy(from_logits=True)
+    self.evaluate(variables.variables_initializer(bce_obj.variables))
+    y_true = constant_op.constant([[1, 0, 1], [0, 1, 1]])
+    y_pred = constant_op.constant([[100.0, -100.0, 100.0],
+                                   [100.0, 100.0, -100.0]])
+    sample_weight = constant_op.constant([2., 2.5])
+    result = bce_obj(y_true, y_pred, sample_weight=sample_weight)
+
+    # Metric = max(x, 0) - x * z + log(1 + exp(-abs(x)))
+    #              (where x = logits and z = y_true)
+    #        = [(0 + 0 + 0) / 3, 200 / 3]
+    # Weighted metric = [0, 66.666 * 2.5]
+    # Reduced metric = 66.666 * 2.5 / (2 + 2.5)
+
+    self.assertAllClose(self.evaluate(result), 37.037, atol=1e-3)
+
+  def test_label_smoothing(self):
+    logits = constant_op.constant(((100., -100., -100.)))
+    y_true = constant_op.constant(((1, 0, 1)))
+    label_smoothing = 0.1
+    # Metric: max(x, 0) - x * z + log(1 + exp(-abs(x)))
+    #             (where x = logits and z = y_true)
+    # Label smoothing: z' = z * (1 - L) + 0.5L
+    # After label smoothing, label 1 becomes 1 - 0.5L
+    #                        label 0 becomes 0.5L
+    # Applying the above two fns to the given input:
+    # (100 - 100 * (1 - 0.5 L)  + 0 +
+    #  0   + 100 * (0.5 L)      + 0 +
+    #  0   + 100 * (1 - 0.5 L)  + 0) * (1/3)
+    #  = (100 + 50L) * 1/3
+    bce_obj = metrics.BinaryCrossentropy(
+        from_logits=True, label_smoothing=label_smoothing)
+    self.evaluate(variables.variables_initializer(bce_obj.variables))
+    result = bce_obj(y_true, logits)
+    expected_value = (100.0 + 50.0 * label_smoothing) / 3.0
+    self.assertAllClose(expected_value, self.evaluate(result), atol=1e-3)
 
-    update_op = cosine_obj.update_state(y_true, y_pred)
-    self.evaluate(update_op)
-    result = cosine_obj.result()
-    self.assertAllClose(-0.60723, result, atol=1e-5)
+
+@test_util.run_all_in_graph_and_eager_modes
+class CategoricalCrossentropyTest(test.TestCase):
+
+  def test_config(self):
+    cce_obj = metrics.CategoricalCrossentropy(
+        name='cce', dtype=dtypes.int32, label_smoothing=0.2)
+    self.assertEqual(cce_obj.name, 'cce')
+    self.assertEqual(cce_obj._dtype, dtypes.int32)
+
+    old_config = cce_obj.get_config()
+    self.assertAllClose(old_config['label_smoothing'], 0.2, 1e-3)
+
+    # Check save and restore config
+    cce_obj2 = metrics.CategoricalCrossentropy.from_config(old_config)
+    self.assertEqual(cce_obj2.name, 'cce')
+    self.assertEqual(cce_obj2._dtype, dtypes.int32)
+    new_config = cce_obj2.get_config()
+    self.assertDictEqual(old_config, new_config)
+
+  def test_unweighted(self):
+    cce_obj = metrics.CategoricalCrossentropy()
+    self.evaluate(variables.variables_initializer(cce_obj.variables))
+
+    y_true = np.asarray([[0, 1, 0], [0, 0, 1]])
+    y_pred = np.asarray([[0.05, 0.95, 0], [0.1, 0.8, 0.1]])
+    result = cce_obj(y_true, y_pred)
+
+    # EPSILON = 1e-7, y = y_true, y` = y_pred
+    # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON)
+    # y` = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]]
+
+    # Metric = -sum(y * log(y'), axis = -1)
+    #        = -((log 0.95), (log 0.1))
+    #        = [0.051, 2.302]
+    # Reduced metric = (0.051 + 2.302) / 2
+
+    self.assertAllClose(self.evaluate(result), 1.176, atol=1e-3)
+
+  def test_unweighted_from_logits(self):
+    cce_obj = metrics.CategoricalCrossentropy(from_logits=True)
+    self.evaluate(variables.variables_initializer(cce_obj.variables))
+
+    y_true = np.asarray([[0, 1, 0], [0, 0, 1]])
+    logits = np.asarray([[1, 9, 0], [1, 8, 1]], dtype=np.float32)
+    result = cce_obj(y_true, logits)
+
+    # softmax = exp(logits) / sum(exp(logits), axis=-1)
+    # xent = -sum(labels * log(softmax), 1)
+
+    # exp(logits) = [[2.718, 8103.084, 1], [2.718, 2980.958, 2.718]]
+    # sum(exp(logits), axis=-1) = [8106.802, 2986.394]
+    # softmax = [[0.00033, 0.99954, 0.00012], [0.00091, 0.99817, 0.00091]]
+    # log(softmax) = [[-8.00045, -0.00045, -9.00045],
+    #                 [-7.00182, -0.00182, -7.00182]]
+    # labels * log(softmax) = [[0, -0.00045, 0], [0, 0, -7.00182]]
+    # xent = [0.00045, 7.00182]
+    # Reduced xent = (0.00045 + 7.00182) / 2
+
+    self.assertAllClose(self.evaluate(result), 3.5011, atol=1e-3)
 
   def test_weighted(self):
-    cosine_obj = metrics.CosineProximity()
-    self.evaluate(variables.variables_initializer(cosine_obj.variables))
-    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
-                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
-    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
-                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
-    sample_weight = constant_op.constant((1., 1.5, 2., 2.5))
-    result = cosine_obj(y_true, y_pred, sample_weight=sample_weight)
-    self.assertAllClose(-0.59916, self.evaluate(result), atol=1e-5)
+    cce_obj = metrics.CategoricalCrossentropy()
+    self.evaluate(variables.variables_initializer(cce_obj.variables))
+
+    y_true = np.asarray([[0, 1, 0], [0, 0, 1]])
+    y_pred = np.asarray([[0.05, 0.95, 0], [0.1, 0.8, 0.1]])
+    sample_weight = constant_op.constant([1.5, 2.])
+    result = cce_obj(y_true, y_pred, sample_weight=sample_weight)
+
+    # EPSILON = 1e-7, y = y_true, y` = y_pred
+    # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON)
+    # y` = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]]
+
+    # Metric = -sum(y * log(y'), axis = -1)
+    #        = -((log 0.95), (log 0.1))
+    #        = [0.051, 2.302]
+    # Weighted metric = [0.051 * 1.5, 2.302 * 2.]
+    # Reduced metric = (0.051 * 1.5 + 2.302 * 2.) / 3.5
+
+    self.assertAllClose(self.evaluate(result), 1.338, atol=1e-3)
+
+  def test_weighted_from_logits(self):
+    cce_obj = metrics.CategoricalCrossentropy(from_logits=True)
+    self.evaluate(variables.variables_initializer(cce_obj.variables))
+
+    y_true = np.asarray([[0, 1, 0], [0, 0, 1]])
+    logits = np.asarray([[1, 9, 0], [1, 8, 1]], dtype=np.float32)
+    sample_weight = constant_op.constant([1.5, 2.])
+    result = cce_obj(y_true, logits, sample_weight=sample_weight)
+
+    # softmax = exp(logits) / sum(exp(logits), axis=-1)
+    # xent = -sum(labels * log(softmax), 1)
+    # xent = [0.00045, 7.00182]
+    # weighted xent = [0.000675, 14.00364]
+    # Reduced xent = (0.000675 + 14.00364) / (1.5 + 2)
+
+    self.assertAllClose(self.evaluate(result), 4.0012, atol=1e-3)
+
+  def test_label_smoothing(self):
+    y_true = np.asarray([[0, 1, 0], [0, 0, 1]])
+    logits = np.asarray([[1, 9, 0], [1, 8, 1]], dtype=np.float32)
+    label_smoothing = 0.1
+
+    # Label smoothing: z' = z * (1 - L) + L/n,
+    #     where L = label smoothing value and n = num classes
+    # Label value 1 becomes: 1 - L + L/n
+    # Label value 0 becomes: L/n
+    # y_true with label_smoothing = [[0.0333, 0.9333, 0.0333],
+    #                               [0.0333, 0.0333, 0.9333]]
+
+    # softmax = exp(logits) / sum(exp(logits), axis=-1)
+    # xent = -sum(labels * log(softmax), 1)
+    # log(softmax) = [[-8.00045, -0.00045, -9.00045],
+    #                 [-7.00182, -0.00182, -7.00182]]
+    # labels * log(softmax) = [[-0.26641, -0.00042, -0.29971],
+    #                          [-0.23316, -0.00006, -6.53479]]
+    # xent = [0.56654, 6.76801]
+    # Reduced xent = (0.56654 + 6.76801) / 2
+
+    cce_obj = metrics.CategoricalCrossentropy(
+        from_logits=True, label_smoothing=label_smoothing)
+    self.evaluate(variables.variables_initializer(cce_obj.variables))
+    loss = cce_obj(y_true, logits)
+    self.assertAllClose(self.evaluate(loss), 3.667, atol=1e-3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class SparseCategoricalCrossentropyTest(test.TestCase):
+
+  def test_config(self):
+    scce_obj = metrics.SparseCategoricalCrossentropy(
+        name='scce', dtype=dtypes.int32)
+    self.assertEqual(scce_obj.name, 'scce')
+    self.assertEqual(scce_obj.dtype, dtypes.int32)
+    old_config = scce_obj.get_config()
+    self.assertDictEqual(old_config, json.loads(json.dumps(old_config)))
+
+    # Check save and restore config
+    scce_obj2 = metrics.SparseCategoricalCrossentropy.from_config(old_config)
+    self.assertEqual(scce_obj2.name, 'scce')
+    self.assertEqual(scce_obj2.dtype, dtypes.int32)
+    new_config = scce_obj2.get_config()
+    self.assertDictEqual(old_config, new_config)
+
+  def test_unweighted(self):
+    scce_obj = metrics.SparseCategoricalCrossentropy()
+    self.evaluate(variables.variables_initializer(scce_obj.variables))
+
+    y_true = np.asarray([1, 2])
+    y_pred = np.asarray([[0.05, 0.95, 0], [0.1, 0.8, 0.1]])
+    result = scce_obj(y_true, y_pred)
+
+    # EPSILON = 1e-7, y = y_true, y` = y_pred
+    # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON)
+    # y` = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]]
+    # logits = log(y`) =  [[-2.9957, -0.0513, -16.1181],
+    #                      [-2.3026, -0.2231, -2.3026]]
+
+    # softmax = exp(logits) / sum(exp(logits), axis=-1)
+    # y = one_hot(y) = [[0, 1, 0], [0, 0, 1]]
+    # xent = -sum(y * log(softmax), 1)
+
+    # exp(logits) = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]]
+    # sum(exp(logits), axis=-1) = [1, 1]
+    # softmax = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]]
+    # log(softmax) = [[-2.9957, -0.0513, -16.1181],
+    #                 [-2.3026, -0.2231, -2.3026]]
+    # y * log(softmax) = [[0, -0.0513, 0], [0, 0, -2.3026]]
+    # xent = [0.0513, 2.3026]
+    # Reduced xent = (0.0513 + 2.3026) / 2
+
+    self.assertAllClose(self.evaluate(result), 1.176, atol=1e-3)
+
+  def test_unweighted_from_logits(self):
+    scce_obj = metrics.SparseCategoricalCrossentropy(from_logits=True)
+    self.evaluate(variables.variables_initializer(scce_obj.variables))
+
+    y_true = np.asarray([1, 2])
+    logits = np.asarray([[1, 9, 0], [1, 8, 1]], dtype=np.float32)
+    result = scce_obj(y_true, logits)
+
+    # softmax = exp(logits) / sum(exp(logits), axis=-1)
+    # y_true = one_hot(y_true) = [[0, 1, 0], [0, 0, 1]]
+    # xent = -sum(y_true * log(softmax), 1)
+
+    # exp(logits) = [[2.718, 8103.084, 1], [2.718, 2980.958, 2.718]]
+    # sum(exp(logits), axis=-1) = [8106.802, 2986.394]
+    # softmax = [[0.00033, 0.99954, 0.00012], [0.00091, 0.99817, 0.00091]]
+    # log(softmax) = [[-8.00045, -0.00045, -9.00045],
+    #                 [-7.00182, -0.00182, -7.00182]]
+    # y_true * log(softmax) = [[0, -0.00045, 0], [0, 0, -7.00182]]
+    # xent = [0.00045, 7.00182]
+    # Reduced xent = (0.00045 + 7.00182) / 2
+
+    self.assertAllClose(self.evaluate(result), 3.5011, atol=1e-3)
+
+  def test_weighted(self):
+    scce_obj = metrics.SparseCategoricalCrossentropy()
+    self.evaluate(variables.variables_initializer(scce_obj.variables))
+
+    y_true = np.asarray([1, 2])
+    y_pred = np.asarray([[0.05, 0.95, 0], [0.1, 0.8, 0.1]])
+    sample_weight = constant_op.constant([1.5, 2.])
+    result = scce_obj(y_true, y_pred, sample_weight=sample_weight)
+
+    # EPSILON = 1e-7, y = y_true, y` = y_pred
+    # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON)
+    # y` = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]]
+    # logits = log(y`) =  [[-2.9957, -0.0513, -16.1181],
+    #                      [-2.3026, -0.2231, -2.3026]]
+
+    # softmax = exp(logits) / sum(exp(logits), axis=-1)
+    # y = one_hot(y) = [[0, 1, 0], [0, 0, 1]]
+    # xent = -sum(y * log(softmax), 1)
+
+    # exp(logits) = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]]
+    # sum(exp(logits), axis=-1) = [1, 1]
+    # softmax = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]]
+    # log(softmax) = [[-2.9957, -0.0513, -16.1181],
+    #                 [-2.3026, -0.2231, -2.3026]]
+    # y * log(softmax) = [[0, -0.0513, 0], [0, 0, -2.3026]]
+    # xent = [0.0513, 2.3026]
+    # Weighted xent = [0.051 * 1.5, 2.302 * 2.]
+    # Reduced xent = (0.051 * 1.5 + 2.302 * 2.) / 3.5
+
+    self.assertAllClose(self.evaluate(result), 1.338, atol=1e-3)
+
+  def test_weighted_from_logits(self):
+    scce_obj = metrics.SparseCategoricalCrossentropy(from_logits=True)
+    self.evaluate(variables.variables_initializer(scce_obj.variables))
+
+    y_true = np.asarray([1, 2])
+    logits = np.asarray([[1, 9, 0], [1, 8, 1]], dtype=np.float32)
+    sample_weight = constant_op.constant([1.5, 2.])
+    result = scce_obj(y_true, logits, sample_weight=sample_weight)
+
+    # softmax = exp(logits) / sum(exp(logits), axis=-1)
+    # y_true = one_hot(y_true) = [[0, 1, 0], [0, 0, 1]]
+    # xent = -sum(y_true * log(softmax), 1)
+    # xent = [0.00045, 7.00182]
+    # weighted xent = [0.000675, 14.00364]
+    # Reduced xent = (0.000675 + 14.00364) / (1.5 + 2)
+
+    self.assertAllClose(self.evaluate(result), 4.0012, atol=1e-3)
+
+  def test_axis(self):
+    scce_obj = metrics.SparseCategoricalCrossentropy(axis=0)
+    self.evaluate(variables.variables_initializer(scce_obj.variables))
+
+    y_true = np.asarray([1, 2])
+    y_pred = np.asarray([[0.05, 0.1], [0.95, 0.8], [0, 0.1]])
+    result = scce_obj(y_true, y_pred)
+
+    # EPSILON = 1e-7, y = y_true, y` = y_pred
+    # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON)
+    # y` = [[0.05, 0.1], [0.95, 0.8], [EPSILON, 0.1]]
+    # logits = log(y`) =  [[-2.9957, -2.3026],
+    #                      [-0.0513, -0.2231],
+    #                      [-16.1181, -2.3026]]
+
+    # softmax = exp(logits) / sum(exp(logits), axis=-1)
+    # y = one_hot(y) = [[0, 0], [1, 0], [0, 1]]
+    # xent = -sum(y * log(softmax), 1)
+
+    # exp(logits) = [[0.05, 0.1], [0.95, 0.8], [EPSILON, 0.1]]
+    # sum(exp(logits)) = [1, 1]
+    # softmax = [[0.05, 0.1], [0.95, 0.8], [EPSILON, 0.1]]
+    # log(softmax) = [[-2.9957, -2.3026],
+    #                 [-0.0513, -0.2231],
+    #                 [-16.1181, -2.3026]]
+    # y * log(softmax) = [[0, 0], [-0.0513, 0], [0, -2.3026]]
+    # xent = [0.0513, 2.3026]
+    # Reduced xent = (0.0513 + 2.3026) / 2
+
+    self.assertAllClose(self.evaluate(result), 1.176, atol=1e-3)
+
+
+def _get_model(compile_metrics):
+  model_layers = [
+      layers.Dense(3, activation='relu', kernel_initializer='ones'),
+      layers.Dense(1, activation='sigmoid', kernel_initializer='ones')]
+
+  model = testing_utils.get_model_from_layers(model_layers, input_shape=(4,))
+  model.compile(
+      loss='mae',
+      metrics=compile_metrics,
+      optimizer='rmsprop',
+      run_eagerly=testing_utils.should_run_eagerly())
+  return model
+
+
+@keras_parameterized.run_with_all_model_types
+@keras_parameterized.run_all_keras_modes
+class ResetStatesTest(keras_parameterized.TestCase):
+
+  def test_reset_states_false_positives(self):
+    fp_obj = metrics.FalsePositives()
+    model = _get_model([fp_obj])
+    x = np.ones((100, 4))
+    y = np.zeros((100, 1))
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(fp_obj.accumulator), 100.)
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(fp_obj.accumulator), 100.)
+
+  def test_reset_states_false_negatives(self):
+    fn_obj = metrics.FalseNegatives()
+    model = _get_model([fn_obj])
+    x = np.zeros((100, 4))
+    y = np.ones((100, 1))
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(fn_obj.accumulator), 100.)
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(fn_obj.accumulator), 100.)
+
+  def test_reset_states_true_negatives(self):
+    tn_obj = metrics.TrueNegatives()
+    model = _get_model([tn_obj])
+    x = np.zeros((100, 4))
+    y = np.zeros((100, 1))
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(tn_obj.accumulator), 100.)
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(tn_obj.accumulator), 100.)
+
+  def test_reset_states_true_positives(self):
+    tp_obj = metrics.TruePositives()
+    model = _get_model([tp_obj])
+    x = np.ones((100, 4))
+    y = np.ones((100, 1))
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(tp_obj.accumulator), 100.)
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(tp_obj.accumulator), 100.)
+
+  def test_reset_states_precision(self):
+    p_obj = metrics.Precision()
+    model = _get_model([p_obj])
+    x = np.concatenate((np.ones((50, 4)), np.ones((50, 4))))
+    y = np.concatenate((np.ones((50, 1)), np.zeros((50, 1))))
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(p_obj.true_positives), 50.)
+    self.assertEqual(self.evaluate(p_obj.false_positives), 50.)
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(p_obj.true_positives), 50.)
+    self.assertEqual(self.evaluate(p_obj.false_positives), 50.)
+
+  def test_reset_states_recall(self):
+    r_obj = metrics.Recall()
+    model = _get_model([r_obj])
+    x = np.concatenate((np.ones((50, 4)), np.zeros((50, 4))))
+    y = np.concatenate((np.ones((50, 1)), np.ones((50, 1))))
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(r_obj.true_positives), 50.)
+    self.assertEqual(self.evaluate(r_obj.false_negatives), 50.)
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(r_obj.true_positives), 50.)
+    self.assertEqual(self.evaluate(r_obj.false_negatives), 50.)
+
+  def test_reset_states_sensitivity_at_specificity(self):
+    s_obj = metrics.SensitivityAtSpecificity(0.5, num_thresholds=1)
+    model = _get_model([s_obj])
+    x = np.concatenate((np.ones((25, 4)), np.zeros((25, 4)), np.zeros((25, 4)),
+                        np.ones((25, 4))))
+    y = np.concatenate((np.ones((25, 1)), np.zeros((25, 1)), np.ones((25, 1)),
+                        np.zeros((25, 1))))
+
+    for _ in range(2):
+      model.evaluate(x, y)
+      self.assertEqual(self.evaluate(s_obj.true_positives), 25.)
+      self.assertEqual(self.evaluate(s_obj.false_positives), 25.)
+      self.assertEqual(self.evaluate(s_obj.false_negatives), 25.)
+      self.assertEqual(self.evaluate(s_obj.true_negatives), 25.)
+
+  def test_reset_states_specificity_at_sensitivity(self):
+    s_obj = metrics.SpecificityAtSensitivity(0.5, num_thresholds=1)
+    model = _get_model([s_obj])
+    x = np.concatenate((np.ones((25, 4)), np.zeros((25, 4)), np.zeros((25, 4)),
+                        np.ones((25, 4))))
+    y = np.concatenate((np.ones((25, 1)), np.zeros((25, 1)), np.ones((25, 1)),
+                        np.zeros((25, 1))))
+
+    for _ in range(2):
+      model.evaluate(x, y)
+      self.assertEqual(self.evaluate(s_obj.true_positives), 25.)
+      self.assertEqual(self.evaluate(s_obj.false_positives), 25.)
+      self.assertEqual(self.evaluate(s_obj.false_negatives), 25.)
+      self.assertEqual(self.evaluate(s_obj.true_negatives), 25.)
+
+  def test_reset_states_auc(self):
+    auc_obj = metrics.AUC(num_thresholds=3)
+    model = _get_model([auc_obj])
+    x = np.concatenate((np.ones((25, 4)), np.zeros((25, 4)), np.zeros((25, 4)),
+                        np.ones((25, 4))))
+    y = np.concatenate((np.ones((25, 1)), np.zeros((25, 1)), np.ones((25, 1)),
+                        np.zeros((25, 1))))
+
+    for _ in range(2):
+      model.evaluate(x, y)
+      self.assertEqual(self.evaluate(auc_obj.true_positives[1]), 25.)
+      self.assertEqual(self.evaluate(auc_obj.false_positives[1]), 25.)
+      self.assertEqual(self.evaluate(auc_obj.false_negatives[1]), 25.)
+      self.assertEqual(self.evaluate(auc_obj.true_negatives[1]), 25.)
+
+  def test_reset_states_mean_iou(self):
+    m_obj = metrics.MeanIoU(num_classes=2)
+    model = _get_model([m_obj])
+    x = np.asarray([[0, 0, 0, 0], [1, 1, 1, 1], [1, 0, 1, 0], [0, 1, 0, 1]],
+                   dtype=np.float32)
+    y = np.asarray([[0], [1], [1], [1]], dtype=np.float32)
+    model.evaluate(x, y)
+    self.assertArrayNear(self.evaluate(m_obj.total_cm)[0], [1, 0], 1e-1)
+    self.assertArrayNear(self.evaluate(m_obj.total_cm)[1], [3, 0], 1e-1)
+    model.evaluate(x, y)
+    self.assertArrayNear(self.evaluate(m_obj.total_cm)[0], [1, 0], 1e-1)
+    self.assertArrayNear(self.evaluate(m_obj.total_cm)[1], [3, 0], 1e-1)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/mixed_precision/experimental/BUILD b/tensorflow/python/keras/mixed_precision/experimental/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..f994ab9c70a7491c3252ff781ac48055fe59bb9d
--- /dev/null
+++ b/tensorflow/python/keras/mixed_precision/experimental/BUILD
@@ -0,0 +1,54 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Description:
+#   Contains the Keras Mixed Precision API (TensorFlow version).
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+py_library(
+    name = "autocast_variable",
+    srcs = [
+        "autocast_variable.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python/distribute:values",
+    ],
+)
+
+py_test(
+    name = "autocast_variable_test",
+    size = "medium",
+    srcs = ["autocast_variable_test.py"],
+    deps = [
+        ":autocast_variable",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python/distribute:mirrored_strategy",
+        "//tensorflow/python/eager:context",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
diff --git a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py
new file mode 100644
index 0000000000000000000000000000000000000000..a64b5178316009354c6adecc8213bf7681504e6f
--- /dev/null
+++ b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py
@@ -0,0 +1,178 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains AutoCastVariable, a variable which automatically casts itself."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.distribute import values as distribute_values
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+
+
+# TODO(reedwm): Make checkpointable?
+class AutoCastVariable(object):
+  """Variable that will cast itself to a different dtype in applicable contexts.
+
+  This class wraps a floating-point tf.Variable. It emulates the variable
+  interface and delegates to the wrapped variable, but it additionally will cast
+  the wrapped variable under a `Graph._enable_variable_auto_cast(dtype)` context
+  manager.
+
+  For example:
+
+  ```
+  v = tf.Variable(1.0, dtype=tf.float32)
+  v = AutoCastVariable(v)
+  print(tf.identity(v).dtype)  # tf.float32
+  with ops.get_default_graph()._enable_variable_auto_cast(tf.float16):
+    print(tf.identity(v).dtype)  # tf.float16, as v will cast itself to float16
+    print(v.dtype)  # tf.float16, as v.dtype also changes under the ctx manager.
+  ```
+
+  The purpose of this class is to allow Keras layers to create variables in
+  float32, and automatically cast them to float16 or bfloat16 when the layer is
+  called.
+  """
+
+  def __init__(self, variable):
+    """Creates an AutoCastVariable instance.
+
+    Args:
+      variable: A floating-point resource variable to wrap.
+
+    Raises:
+      ValueError: If `variable` is not a floating-point resource variable
+    """
+    if not resource_variable_ops.is_resource_variable(variable):
+      raise ValueError('variable must be of type tf.ResourceVariable, but got: '
+                       '%s' % variable)
+    if not variable.dtype.is_floating:
+      raise ValueError('variable must be a floating point variable but has '
+                       'type: %s' % variable.dtype.name)
+    self._variable = variable
+
+  @property
+  def name(self):
+    return self._variable.name
+
+  def _should_cast(self):
+    """Returns True if this variable should be casted when accessed."""
+    g = ops.get_default_graph()
+    # pylint:disable=protected-access
+    return (g._auto_cast_variable_read_dtype is not None and
+            self.true_dtype != g._auto_cast_variable_read_dtype)
+    # pylint:enable=protected-access
+
+  @property
+  def dtype(self):
+    """The dtype this variable will be casted to when read."""
+    if self._should_cast():
+      return ops.get_default_graph()._auto_cast_variable_read_dtype  # pylint:disable=protected-access
+    else:
+      return self._variable.dtype
+
+  @property
+  def true_dtype(self):
+    """The dtype of the underlying variable, before any casts are done."""
+    return self._variable.dtype
+
+  def value(self):
+    val = self._variable.value()
+    if not self._should_cast():
+      return val
+    # We colocate_with(None) to ignore the existing device constraints, so that
+    # the cast is always done on the variable's device
+    with ops.colocate_with(None, ignore_existing=True):
+      with ops.device(val.device):
+        return math_ops.cast(val, self.dtype)
+
+  def read_value(self):
+    val = self._variable.read_value()
+    if not self._should_cast():
+      return val
+    return math_ops.cast(val, self.dtype)
+
+  def sparse_read(self, indices, name=None):
+    """Reads the value of this variable sparsely, using `gather`."""
+    val = self._variable.sparse_read(indices, name=name)
+    if not self._should_cast():
+      return val
+    return math_ops.cast(val, self.dtype)
+
+  def assign(self, value, use_locking=None, name=None, read_value=True):
+    return self._variable.assign(
+        value, use_locking=use_locking, name=name, read_value=read_value)
+
+  def assign_add(self, delta, use_locking=None, name=None, read_value=True):
+    return self._variable.assign_add(
+        delta, use_locking=use_locking, name=name, read_value=read_value)
+
+  def assign_sub(self, delta, use_locking=None, name=None, read_value=True):
+    return self._variable.assign_sub(
+        delta, use_locking=use_locking, name=name, read_value=read_value)
+
+  # TODO(reedwm): Support assigning variables with tf.assign(), var.scatter_add,
+  # etc.
+
+  def __getattr__(self, name):
+    return getattr(self._variable, name)
+
+  def _dense_var_to_tensor(self, dtype=None, name=None, as_ref=False):
+    """Converts this variable to a tensor."""
+    if not self._should_cast():
+      return ops.internal_convert_to_tensor(self._variable, dtype, name,
+                                            as_ref)
+    # TODO(reedwm): Support as_ref?
+    assert not as_ref
+    if dtype is not None and not dtype.is_compatible_with(self.dtype):
+      raise ValueError(
+          'Incompatible type conversion requested to type {!r} for variable '
+          'of type {!r}'.format(dtype.name, self.dtype.name))
+    val = ops.internal_convert_to_tensor(self._variable,
+                                         self._variable.dtype, name,
+                                         as_ref=False)
+    with ops.colocate_with(None, ignore_existing=True):
+      with ops.device(val.device):
+        return math_ops.cast(val, self.dtype)
+
+  def _should_act_as_resource_variable(self):
+    """Pass resource_variable_ops.is_resource_variable check."""
+    pass
+
+  # TODO(reedwm): Define operator overloads.
+
+
+ops.register_tensor_conversion_function(
+    AutoCastVariable, AutoCastVariable._dense_var_to_tensor)  # pylint:disable=protected-access
+ops.register_dense_tensor_like_type(AutoCastVariable)
+
+
+# We have DistributedVariable subclass to pass
+# isinstance(..., DistributedVariable) checks when wrapping a
+# DistributedVariable.
+# TODO(reedwm): We should not wrap DistributedVariable, but instead have
+# DistributedVariable wrap AutoCastVariable. Subclassing DistributedVariable is
+# messy, because we do not fully implement the interface of DistributedVariable.
+class AutoCastDistributedVariable(AutoCastVariable,
+                                  distribute_values.DistributedVariable):
+  """Version of AutoCastVariable that subclasses DistributedVariable."""
+
+  def __init__(self, variable):
+    if not isinstance(variable, distribute_values.DistributedValues):
+      raise ValueError('variable must be of type DistributedValues, '
+                       'but got: %s' % variable)
+    super(AutoCastDistributedVariable, self).__init__(variable)
diff --git a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1caec6a738709768b35aeab9bd18fe67e90982a9
--- /dev/null
+++ b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py
@@ -0,0 +1,245 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for AutoCastVariable."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.distribute import mirrored_strategy
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.mixed_precision.experimental import autocast_variable
+
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+TESTCASES = ({
+    'testcase_name': 'base',
+    'distribute': False
+}, {
+    'testcase_name': 'distribute',
+    'distribute': True
+})
+
+
+def get_distribute_scope(distribute):
+
+  class DummyContextManager(object):
+
+    def __enter__(self):
+      pass
+
+    def __exit__(self, *args):
+      pass
+
+  if distribute:
+    return mirrored_strategy.MirroredStrategy(['cpu:0']).scope()
+  else:
+    return DummyContextManager()
+
+
+def get_autocast_var(var, distribute):
+  if distribute:
+    return autocast_variable.AutoCastDistributedVariable(var)
+  else:
+    return autocast_variable.AutoCastVariable(var)
+
+
+def get_var(val, dtype):
+  return variables.VariableV1(val, use_resource=True, dtype=dtype)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class AutoCastVariableTest(test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(*TESTCASES)
+  def test_read(self, distribute):
+    with get_distribute_scope(distribute):
+      x = get_var(1., dtypes.float32)
+      x = get_autocast_var(x, distribute)
+      self.evaluate(x.initializer)
+
+      # outside of auto cast scope.
+      self.assertEqual(x.dtype, dtypes.float32)
+      self.assertEqual(x.value().dtype, dtypes.float32)
+      self.assertEqual(x.read_value().dtype, dtypes.float32)
+      self.assertEqual(array_ops.identity(x).dtype, dtypes.float32)
+
+      # within auto cast scope of different dtype
+      with ops.get_default_graph()._enable_auto_casting_variables(
+          dtypes.float16):
+        self.assertEqual(x.dtype, dtypes.float16)
+        self.assertEqual(x.value().dtype, dtypes.float16)
+        self.assertEqual(x.read_value().dtype, dtypes.float16)
+        self.assertEqual(array_ops.identity(x).dtype, dtypes.float16)
+
+      # within auto cast scope of same dtype
+      with ops.get_default_graph()._enable_auto_casting_variables(
+          dtypes.float32):
+        self.assertEqual(x.dtype, dtypes.float32)
+        self.assertEqual(x.value().dtype, dtypes.float32)
+        self.assertEqual(x.read_value().dtype, dtypes.float32)
+        self.assertEqual(array_ops.identity(x).dtype, dtypes.float32)
+
+  @parameterized.named_parameters(*TESTCASES)
+  def test_read_nested_scopes(self, distribute):
+    with get_distribute_scope(distribute):
+      x = get_var(1., dtypes.float32)
+      x = get_autocast_var(x, distribute)
+      self.evaluate(x.initializer)
+
+      with ops.get_default_graph()._enable_auto_casting_variables(
+          dtypes.float16):
+        self.assertEqual(x.dtype, dtypes.float16)
+        self.assertEqual(x.read_value().dtype, dtypes.float16)
+
+        with ops.get_default_graph()._enable_auto_casting_variables(
+            dtypes.float32):
+          self.assertEqual(x.dtype, dtypes.float32)
+          self.assertEqual(x.read_value().dtype, dtypes.float32)
+
+        self.assertEqual(x.dtype, dtypes.float16)
+        self.assertEqual(x.read_value().dtype, dtypes.float16)
+
+  @parameterized.named_parameters(*TESTCASES)
+  def test_operator_overloads(self, distribute):
+    with get_distribute_scope(distribute):
+      x = get_var(1., dtypes.float32)
+      x = get_autocast_var(x, distribute)
+      self.evaluate(x.initializer)
+
+    v1 = constant_op.constant(2., dtype=dtypes.float32)
+    v2 = constant_op.constant(2., dtype=dtypes.float16)
+
+    # Because autocast variables do not yet define operator overloads, the
+    # operator is defined by the non-variable tensor
+
+    # Test variable as the LHS. Currently, this is not supported with
+    # distributed autocast variables
+    if not distribute:
+      self.assertEqual(self.evaluate(x + v1), 3.)
+
+      with ops.get_default_graph()._enable_auto_casting_variables(
+          dtypes.float16):
+        self.assertEqual(self.evaluate(x + v2), 3.)
+
+    # Test variable as the RHS
+    self.assertEqual(self.evaluate(v1 + x), 3.)
+
+    with ops.get_default_graph()._enable_auto_casting_variables(
+        dtypes.float16):
+      self.assertEqual(self.evaluate(v2 + x), 3.)
+
+  @parameterized.named_parameters(*TESTCASES)
+  def test_assign(self, distribute):
+    with get_distribute_scope(distribute):
+      x = get_var(0., dtypes.float32)
+      x = get_autocast_var(x, distribute)
+      self.evaluate(x.initializer)
+
+      # outside of auto cast scope.
+      v1 = constant_op.constant(3.14, dtype=dtypes.float32)
+      v2 = constant_op.constant(3.14, dtype=dtypes.float16)
+
+      def run_and_check():
+        # Assign float32 values
+        self.assertAllClose(3.14, self.evaluate(x.assign(v1)))
+        self.assertAllClose(3.14 * 2, self.evaluate(x.assign_add(v1)))
+        self.assertAllClose(3.14, self.evaluate(x.assign_sub(v1)))
+
+        # Attempt to assign float16 values
+        with self.assertRaisesRegexp(
+            ValueError,
+            'conversion requested dtype float32 for Tensor with dtype float16'):
+          self.evaluate(x.assign(v2))
+        with self.assertRaisesRegexp(
+            ValueError,
+            'conversion requested dtype float32 for Tensor with dtype float16'):
+          self.evaluate(x.assign_add(v2))
+        with self.assertRaisesRegexp(
+            ValueError,
+            'conversion requested dtype float32 for Tensor with dtype float16'):
+          self.evaluate(x.assign_sub(v2))
+
+        # Assign Python floats
+        self.assertAllClose(3.14, self.evaluate(x.assign(3.14)))
+        self.assertAllClose(3.14 * 2, self.evaluate(x.assign_add(3.14)))
+        self.assertAllClose(3.14, self.evaluate(x.assign_sub(3.14)))
+
+      run_and_check()
+      # reset x
+      self.evaluate(x.assign(0.))
+      # within auto cast scope.
+      with ops.get_default_graph()._enable_auto_casting_variables(
+          dtypes.float16):
+        # assign still expect float32 value even if in float16 scope
+        run_and_check()
+
+  @parameterized.named_parameters(*TESTCASES)
+  def test_assign_stays_in_true_dtype(self, distribute):
+    with get_distribute_scope(distribute):
+      x = get_var(1., dtypes.float32)
+      x = get_autocast_var(x, distribute)
+      self.evaluate(x.initializer)
+      # small_val is a value such that 1.0 + small_val == 1.0 in fp16, but not
+      # in fp32
+      small_val = np.finfo('float16').eps / 2
+      small_tensor = constant_op.constant(small_val, dtype=dtypes.float32)
+      with ops.get_default_graph()._enable_auto_casting_variables(
+          dtypes.float16):
+        # Variable should be increased, despite it appearing to be the same
+        # float16 value.
+        self.assertEqual(1. + small_val,
+                         self.evaluate(x.assign(1. + small_tensor)))
+        self.assertEqual(1., self.evaluate(x.value()))
+      self.assertEqual(1. + small_val, self.evaluate(x.value()))
+
+      self.evaluate(x.assign(1.))
+      with ops.get_default_graph()._enable_auto_casting_variables(
+          dtypes.float16):
+        self.assertEqual(1. + small_val,
+                         self.evaluate(x.assign_add(small_tensor)))
+        self.assertEqual(1., self.evaluate(x.value()))
+      self.assertEqual(1. + small_val, self.evaluate(x.value()))
+
+  @parameterized.named_parameters(*TESTCASES)
+  def test_invalid_wrapped_variable(self, distribute):
+    with get_distribute_scope(distribute):
+      # Wrap a non-variable
+      with self.assertRaisesRegexp(ValueError, 'variable must be of type'):
+        x = constant_op.constant([1.], dtype=dtypes.float32)
+        get_autocast_var(x, distribute)
+
+      # Wrap a non-floating point variable
+      with self.assertRaisesRegexp(ValueError,
+                                   'variable must be a floating point'):
+        x = get_var(1, dtypes.int32)
+        get_autocast_var(x, distribute)
+
+    if distribute:
+      # Wrap a non-distributed variable with AutoCastDistributedVariable
+      with self.assertRaisesRegexp(ValueError, 'variable must be of type'):
+        x = get_var(1., dtypes.float32)
+        get_autocast_var(x, distribute)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/model_subclassing_test.py b/tensorflow/python/keras/model_subclassing_test.py
index 553c7fb00969fd8c1e042b84ffff37bc82981d02..5220f4e28f4244773fb4a6597fd2939fdd799662 100644
--- a/tensorflow/python/keras/model_subclassing_test.py
+++ b/tensorflow/python/keras/model_subclassing_test.py
@@ -28,13 +28,14 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import test
-from tensorflow.python.training.checkpointable import data_structures
-from tensorflow.python.training.rmsprop import RMSPropOptimizer
+from tensorflow.python.training.tracking import data_structures
 
 try:
   import h5py  # pylint:disable=g-import-not-at-top
@@ -187,7 +188,6 @@ def get_nested_model_3(input_dim, num_classes):
 
 
 @test_util.run_all_in_graph_and_eager_modes
-@test_util.run_v1_only('b/120545219')
 class ModelSubclassingTest(test.TestCase):
 
   def test_custom_build(self):
@@ -409,6 +409,158 @@ class ModelSubclassingTest(test.TestCase):
     x2 = array_ops.ones((num_samples, input_dim))
     model([x1, x2])
 
+  def test_summary(self):
+
+    class ToString(object):
+
+      def __init__(self):
+        self.contents = ''
+
+      def __call__(self, msg):
+        self.contents += msg + '\n'
+
+    # Single-io
+    model = SimpleTestModel(num_classes=4, use_bn=True, use_dp=True)
+    model._set_inputs(np.ones((3, 4)))  # need to build model first
+    print_fn = ToString()
+    model.summary(print_fn=print_fn)
+    self.assertTrue('Trainable params: 356' in print_fn.contents)
+
+    # Multi-io
+    model = MultiIOTestModel(num_classes=(5, 6), use_bn=True, use_dp=True)
+    model._set_inputs([np.ones((3, 4)),
+                       np.ones((3, 4))])  # need to build model first
+    print_fn = ToString()
+    model.summary(print_fn=print_fn)
+    self.assertTrue('Trainable params: 587' in print_fn.contents)
+
+  def test_no_dependency(self):
+    class Foo(keras.Model):
+
+      def __init__(self):
+        super(Foo, self).__init__()
+        self.isdep = keras.layers.Dense(1)
+        self.notdep = data_structures.NoDependency(keras.layers.Dense(2))
+        self.notdep_var = data_structures.NoDependency(
+            resource_variable_ops.ResourceVariable(1., name='notdep_var'))
+
+    m = Foo()
+    self.assertEqual([m.isdep, m.notdep], m.layers)
+    self.assertEqual(1, len(m._checkpoint_dependencies))
+    self.assertIs(m.isdep, m._checkpoint_dependencies[0].ref)
+    self.assertEqual('notdep_var:0', m.notdep_var.name)
+
+  def test_extra_variable(self):
+
+    class ExtraVar(keras.Model):
+
+      def __init__(self):
+        super(ExtraVar, self).__init__()
+        self.dense = keras.layers.Dense(1)
+        self.var = resource_variable_ops.ResourceVariable(1.)
+        self.not_trainable_var = resource_variable_ops.ResourceVariable(
+            2., trainable=False)
+
+      def call(self, inputs):
+        return self.dense(inputs + self.var)
+
+    m = ExtraVar()
+    self.assertTrue(m.trainable)
+    self.assertEqual([m.dense], m.layers)
+    self.assertEqual([m.var, m.not_trainable_var], m.variables)
+    self.assertEqual([m.var], m.trainable_variables)
+    self.assertEqual([m.not_trainable_var], m.non_trainable_variables)
+    m.trainable = False
+    self.assertEqual([m.var, m.not_trainable_var], m.variables)
+    self.assertEqual([], m.trainable_variables)
+    self.assertEqual([m.var, m.not_trainable_var], m.non_trainable_variables)
+    m.trainable = True
+
+    m(array_ops.ones([1, 1]))
+
+    self.assertEqual([m.dense.kernel, m.dense.bias], m.dense.variables)
+    self.assertEqual([m.dense.kernel, m.dense.bias], m.dense.weights)
+
+    self.assertEqual([m.dense.kernel, m.dense.bias, m.var, m.not_trainable_var],
+                     m.variables)
+    self.assertEqual([m.dense.kernel, m.dense.bias, m.var],
+                     m.trainable_variables)
+    self.assertEqual([m.not_trainable_var], m.non_trainable_variables)
+
+    m.dense.trainable = False
+    self.assertEqual(
+        [m.var, m.dense.kernel, m.dense.bias, m.not_trainable_var],
+        m.variables)
+    self.assertEqual([m.var], m.trainable_variables)
+    self.assertEqual([m.dense.kernel, m.dense.bias, m.not_trainable_var],
+                     m.non_trainable_variables)
+
+  def test_add_weight_in_model(self):
+
+    class MyModel(keras.Model):
+
+      def __init__(self):
+        super(MyModel, self).__init__()
+        self.b = self.add_weight('bias', (10,))
+        self.c = self.add_weight('bias2', (10,), trainable=False)
+
+      def call(self, inputs):
+        return inputs + self.b + self.c
+
+    x = ops.convert_to_tensor(np.ones((10, 10), 'float32'))
+    model = MyModel()
+    model(x)
+    self.assertEqual(1, len(model.trainable_weights))
+    self.assertEqual(1, len(model.non_trainable_weights))
+    self.assertEqual(2, len(model.weights))
+
+    class MyModelCustomBuild(keras.Model):
+
+      def build(self, input_shape):
+        self.b = self.add_weight('bias', (10,))
+        self.c = self.add_weight('bias2', (10,), trainable=False)
+
+      def call(self, inputs):
+        return inputs + self.b + self.c
+
+    x = ops.convert_to_tensor(np.ones((10, 10), 'float32'))
+    model = MyModelCustomBuild()
+    model(x)
+    self.assertEqual(1, len(model.trainable_weights))
+    self.assertEqual(1, len(model.non_trainable_weights))
+    self.assertEqual(2, len(model.weights))
+
+  def test_add_update_in_model(self):
+
+    class MyModel(keras.Model):
+
+      def __init__(self):
+        super(MyModel, self).__init__()
+        self.b = self.add_weight('bias', (10,))
+        self.c = self.add_weight('bias2', (10,))
+
+      def call(self, inputs):
+        # Unconditional
+        self.add_update(self.b.assign(self.b * 2))
+        # Conditional
+        self.add_update(self.c.assign(inputs[1, :]), inputs)
+        return inputs + self.b + self.c
+
+    x = ops.convert_to_tensor(np.ones((10, 10), 'float32'))
+    model = MyModel()
+    model(x)
+
+    if context.executing_eagerly():
+      self.assertEqual(0, len(model.updates))
+    else:
+      self.assertEqual(2, len(model.updates))
+      self.assertEqual(1, len(model.get_updates_for(None)))
+      self.assertEqual(1, len(model.get_updates_for(x)))
+
+
+@keras_parameterized.run_all_keras_modes
+class ModelSubclassCompiledTest(keras_parameterized.TestCase):
+
   def test_single_io_workflow_with_np_arrays(self):
     num_classes = 2
     num_samples = 100
@@ -419,8 +571,9 @@ class ModelSubclassingTest(test.TestCase):
                             use_bn=True)
     model.compile(
         loss='mse',
-        optimizer=RMSPropOptimizer(learning_rate=0.001),
-        metrics=['acc', keras.metrics.CategoricalAccuracy()])
+        optimizer='rmsprop',
+        metrics=['acc', keras.metrics.CategoricalAccuracy()],
+        run_eagerly=testing_utils.should_run_eagerly())
 
     x = np.ones((num_samples, input_dim))
     y = np.zeros((num_samples, num_classes))
@@ -436,9 +589,11 @@ class ModelSubclassingTest(test.TestCase):
     model = MultiIOTestModel(num_classes=num_classes,
                              use_dp=True,
                              use_bn=True)
-    model.compile(loss='mse',
-                  optimizer=RMSPropOptimizer(learning_rate=0.001),
-                  metrics=['acc'])
+    model.compile(
+        loss='mse',
+        optimizer='rmsprop',
+        metrics=['acc'],
+        run_eagerly=testing_utils.should_run_eagerly())
 
     x1 = np.ones((num_samples, input_dim))
     x2 = np.ones((num_samples, input_dim))
@@ -455,7 +610,10 @@ class ModelSubclassingTest(test.TestCase):
 
     with self.cached_session():
       model = SimpleTestModel(num_classes=num_classes, use_dp=True, use_bn=True)
-      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+      model.compile(
+          loss='mse',
+          optimizer='rmsprop',
+          run_eagerly=testing_utils.should_run_eagerly())
 
       x = np.ones((num_samples, input_dim), dtype=np.float32)
       y = np.zeros((num_samples, num_classes), dtype=np.float32)
@@ -485,7 +643,10 @@ class ModelSubclassingTest(test.TestCase):
     self.assertEqual(model.built, False)
     self.assertEqual(len(model.weights), 0)
 
-    model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+    model.compile(
+        loss='mse',
+        optimizer='rmsprop',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch([x1, x2], [y1, y2])
 
     self.assertEqual(model.built, True)
@@ -515,7 +676,10 @@ class ModelSubclassingTest(test.TestCase):
     y = np.ones((num_samples, input_dim))
 
     model = BNNet()
-    model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+    model.compile(
+        loss='mse',
+        optimizer='rmsprop',
+        run_eagerly=testing_utils.should_run_eagerly())
     y_ref = model.predict(x)
 
     model.train_on_batch(x, y)
@@ -545,7 +709,10 @@ class ModelSubclassingTest(test.TestCase):
     x = np.ones((num_samples, input_dim))
     y = model.predict(x)
     self.assertEqual(np.sum(y), np.sum(x))
-    model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+    model.compile(
+        loss='mse',
+        optimizer='rmsprop',
+        run_eagerly=testing_utils.should_run_eagerly())
     loss = model.train_on_batch(x, y)
     self.assertGreater(loss, 0.1)
 
@@ -563,7 +730,10 @@ class ModelSubclassingTest(test.TestCase):
     y2 = np.zeros((num_samples, num_classes[1]))
 
     model = MultiIOTestModel(num_classes=num_classes, use_bn=True)
-    model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+    model.compile(
+        loss='mse',
+        optimizer='rmsprop',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.fit([x1, x2], [y1, y2], epochs=2, batch_size=32, verbose=0)
     model.fit({'input_1': x1, 'input_2': x2},
               {'output_1': y1, 'output_2': y2},
@@ -572,7 +742,10 @@ class ModelSubclassingTest(test.TestCase):
               validation_data=([x1, x2], [y1, y2]))
 
     model = MultiIOTestModel(num_classes=num_classes, use_bn=True)
-    model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+    model.compile(
+        loss='mse',
+        optimizer='rmsprop',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.train_on_batch([x1, x2], [y1, y2])
     model.train_on_batch({'input_1': x1, 'input_2': x2},
                          {'output_1': y1, 'output_2': y2})
@@ -590,7 +763,10 @@ class ModelSubclassingTest(test.TestCase):
     y2 = np.zeros((num_samples, num_classes[1]))
 
     model = MultiIOTestModel(num_classes=num_classes, use_bn=True)
-    model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+    model.compile(
+        loss='mse',
+        optimizer='rmsprop',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.evaluate([x1, x2], [y1, y2])
     model.test_on_batch([x1, x2], [y1, y2])
 
@@ -612,7 +788,10 @@ class ModelSubclassingTest(test.TestCase):
     y2 = np.zeros((num_samples, num_classes[1]))
 
     model = MultiIOTestModel(num_classes=num_classes, use_bn=True)
-    model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+    model.compile(
+        loss='mse',
+        optimizer='rmsprop',
+        run_eagerly=testing_utils.should_run_eagerly())
     model.fit([x1, x2], [y1, y2], epochs=2, batch_size=32, verbose=0)
     y_ref_1, y_ref_2 = model.predict([x1, x2])
 
@@ -641,40 +820,17 @@ class ModelSubclassingTest(test.TestCase):
       self.assertAllClose(y_ref_1, y1, atol=1e-5)
       self.assertAllClose(y_ref_2, y2, atol=1e-5)
 
-  def test_summary(self):
-
-    class ToString(object):
-
-      def __init__(self):
-        self.contents = ''
-
-      def __call__(self, msg):
-        self.contents += msg + '\n'
-
-    # Single-io
-    model = SimpleTestModel(num_classes=4, use_bn=True, use_dp=True)
-    model._set_inputs(np.ones((3, 4)))  # need to build model first
-    print_fn = ToString()
-    model.summary(print_fn=print_fn)
-    self.assertTrue('Trainable params: 356' in print_fn.contents)
-
-    # Multi-io
-    model = MultiIOTestModel(num_classes=(5, 6), use_bn=True, use_dp=True)
-    model._set_inputs([np.ones((3, 4)),
-                       np.ones((3, 4))])  # need to build model first
-    print_fn = ToString()
-    model.summary(print_fn=print_fn)
-    self.assertTrue('Trainable params: 587' in print_fn.contents)
-
   def test_subclass_nested_in_subclass(self):
     num_classes = 2
     num_samples = 100
     input_dim = 50
 
     model = NestedTestModel1(num_classes=num_classes)
-    model.compile(loss='mse',
-                  optimizer=RMSPropOptimizer(learning_rate=0.001),
-                  metrics=['acc'])
+    model.compile(
+        loss='mse',
+        optimizer='rmsprop',
+        metrics=['acc'],
+        run_eagerly=testing_utils.should_run_eagerly())
 
     x = np.ones((num_samples, input_dim))
     y = np.zeros((num_samples, num_classes))
@@ -694,9 +850,11 @@ class ModelSubclassingTest(test.TestCase):
     input_dim = 50
 
     model = NestedTestModel2(num_classes=num_classes)
-    model.compile(loss='mse',
-                  optimizer=RMSPropOptimizer(learning_rate=0.001),
-                  metrics=['acc'])
+    model.compile(
+        loss='mse',
+        optimizer='rmsprop',
+        metrics=['acc'],
+        run_eagerly=testing_utils.should_run_eagerly())
 
     x = np.ones((num_samples, input_dim))
     y = np.zeros((num_samples, num_classes))
@@ -716,9 +874,11 @@ class ModelSubclassingTest(test.TestCase):
     input_dim = 50
 
     model = get_nested_model_3(input_dim=input_dim, num_classes=num_classes)
-    model.compile(loss='mse',
-                  optimizer=RMSPropOptimizer(learning_rate=0.001),
-                  metrics=['acc'])
+    model.compile(
+        loss='mse',
+        optimizer='rmsprop',
+        metrics=['acc'],
+        run_eagerly=testing_utils.should_run_eagerly())
 
     x = np.ones((num_samples, input_dim))
     y = np.zeros((num_samples, num_classes))
@@ -749,9 +909,11 @@ class ModelSubclassingTest(test.TestCase):
         return self.bn(x)
 
     model = keras.Sequential([Inner()])
-    model.compile(loss='mse',
-                  optimizer=RMSPropOptimizer(learning_rate=0.001),
-                  metrics=['acc'])
+    model.compile(
+        loss='mse',
+        optimizer='rmsprop',
+        metrics=['acc'],
+        run_eagerly=testing_utils.should_run_eagerly())
 
     x = np.ones((num_samples, input_dim))
     y = np.zeros((num_samples, num_classes))
@@ -787,136 +949,14 @@ class ModelSubclassingTest(test.TestCase):
     x = np.ones((10, 10))
     y = model.predict(x)
     self.assertEqual(np.sum(y), np.sum(x))
-    model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+    model.compile(
+        loss='mse',
+        optimizer='rmsprop',
+        run_eagerly=testing_utils.should_run_eagerly())
     loss = model.train_on_batch(x, y)
     self.assertGreater(loss, 0.1)
 
-  def test_no_dependency(self):
-    class Foo(keras.Model):
-
-      def __init__(self):
-        super(Foo, self).__init__()
-        self.isdep = keras.layers.Dense(1)
-        self.notdep = data_structures.NoDependency(keras.layers.Dense(2))
-        self.notdep_var = data_structures.NoDependency(
-            resource_variable_ops.ResourceVariable(1., name='notdep_var'))
-
-    m = Foo()
-    self.assertEqual([m.isdep, m.notdep], m.layers)
-    self.assertEqual(1, len(m._checkpoint_dependencies))
-    self.assertIs(m.isdep, m._checkpoint_dependencies[0].ref)
-    self.assertEqual('notdep_var:0', m.notdep_var.name)
-
-  def test_extra_variable(self):
-
-    class ExtraVar(keras.Model):
-
-      def __init__(self):
-        super(ExtraVar, self).__init__()
-        self.dense = keras.layers.Dense(1)
-        self.var = resource_variable_ops.ResourceVariable(1.)
-        self.not_trainable_var = resource_variable_ops.ResourceVariable(
-            2., trainable=False)
-
-      def call(self, inputs):
-        return self.dense(inputs + self.var)
-
-    m = ExtraVar()
-    self.assertTrue(m.trainable)
-    self.assertEqual([m.dense], m.layers)
-    self.assertEqual([m.var, m.not_trainable_var], m.variables)
-    self.assertEqual([m.var], m.trainable_variables)
-    self.assertEqual([m.not_trainable_var], m.non_trainable_variables)
-    m.trainable = False
-    self.assertEqual([m.var, m.not_trainable_var], m.variables)
-    self.assertEqual([], m.trainable_variables)
-    self.assertEqual([m.var, m.not_trainable_var], m.non_trainable_variables)
-    m.trainable = True
-
-    m(array_ops.ones([1, 1]))
-
-    self.assertEqual([m.dense.kernel, m.dense.bias], m.dense.variables)
-    self.assertEqual([m.dense.kernel, m.dense.bias], m.dense.weights)
-
-    self.assertEqual([m.dense.kernel, m.dense.bias, m.var, m.not_trainable_var],
-                     m.variables)
-    self.assertEqual([m.dense.kernel, m.dense.bias, m.var],
-                     m.trainable_variables)
-    self.assertEqual([m.not_trainable_var], m.non_trainable_variables)
-
-    m.dense.trainable = False
-    self.assertEqual(
-        [m.var, m.dense.kernel, m.dense.bias, m.not_trainable_var],
-        m.variables)
-    self.assertEqual([m.var], m.trainable_variables)
-    self.assertEqual([m.dense.kernel, m.dense.bias, m.not_trainable_var],
-                     m.non_trainable_variables)
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_add_weight_in_model(self):
-
-    class MyModel(keras.Model):
-
-      def __init__(self):
-        super(MyModel, self).__init__()
-        self.b = self.add_weight('bias', (10,))
-        self.c = self.add_weight('bias2', (10,), trainable=False)
-
-      def call(self, inputs):
-        return inputs + self.b + self.c
-
-    x = ops.convert_to_tensor(np.ones((10, 10), 'float32'))
-    model = MyModel()
-    model(x)
-    self.assertEqual(1, len(model.trainable_weights))
-    self.assertEqual(1, len(model.non_trainable_weights))
-    self.assertEqual(2, len(model.weights))
-
-    class MyModelCustomBuild(keras.Model):
 
-      def build(self, input_shape):
-        self.b = self.add_weight('bias', (10,))
-        self.c = self.add_weight('bias2', (10,), trainable=False)
-
-      def call(self, inputs):
-        return inputs + self.b + self.c
-
-    x = ops.convert_to_tensor(np.ones((10, 10), 'float32'))
-    model = MyModelCustomBuild()
-    model(x)
-    self.assertEqual(1, len(model.trainable_weights))
-    self.assertEqual(1, len(model.non_trainable_weights))
-    self.assertEqual(2, len(model.weights))
-
-  def test_add_update_in_model(self):
-
-    class MyModel(keras.Model):
-
-      def __init__(self):
-        super(MyModel, self).__init__()
-        self.b = self.add_weight('bias', (10,))
-        self.c = self.add_weight('bias2', (10,))
-
-      def call(self, inputs):
-        # Unconditional
-        self.add_update(self.b.assign(self.b * 2))
-        # Conditional
-        self.add_update(self.c.assign(inputs[1, :]), inputs)
-        return inputs + self.b + self.c
-
-    x = ops.convert_to_tensor(np.ones((10, 10), 'float32'))
-    model = MyModel()
-    model(x)
-
-    if context.executing_eagerly():
-      self.assertEqual(0, len(model.updates))
-    else:
-      self.assertEqual(2, len(model.updates))
-      self.assertEqual(1, len(model.get_updates_for(None)))
-      self.assertEqual(1, len(model.get_updates_for(x)))
-
-
-@test_util.run_v1_only('b/120545219')
 class GraphSpecificModelSubclassingTests(test.TestCase):
 
   @test_util.run_deprecated_v1
@@ -929,7 +969,7 @@ class GraphSpecificModelSubclassingTests(test.TestCase):
       model = SimpleTestModel(num_classes=num_classes,
                               use_dp=True,
                               use_bn=True)
-      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+      model.compile(loss='mse', optimizer='rmsprop')
 
       x = array_ops.ones((num_samples, input_dim))
       y = array_ops.zeros((num_samples, num_classes))
@@ -947,7 +987,7 @@ class GraphSpecificModelSubclassingTests(test.TestCase):
       model = MultiIOTestModel(num_classes=num_classes,
                                use_dp=True,
                                use_bn=True)
-      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+      model.compile(loss='mse', optimizer='rmsprop')
 
       x1 = array_ops.ones((num_samples, input_dim))
       x2 = array_ops.ones((num_samples, input_dim))
@@ -1035,7 +1075,7 @@ class GraphSpecificModelSubclassingTests(test.TestCase):
       model = MultiIOTestModel(num_classes=num_classes,
                                use_dp=True,
                                use_bn=True)
-      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+      model.compile(loss='mse', optimizer='rmsprop')
 
       x1 = np.ones((num_samples, input_dim))
       x2 = np.ones((num_samples, input_dim))
@@ -1085,9 +1125,9 @@ class TrainingMaskingModel(keras.Model):
     return self.dense1(x)
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class CustomCallSignatureTests(test.TestCase):
 
-  @test_util.run_in_graph_and_eager_modes
   def test_no_inputs_in_signature(self):
     model = CustomCallModel()
     first = array_ops.ones([2, 3])
@@ -1101,7 +1141,6 @@ class CustomCallSignatureTests(test.TestCase):
     output = model(first, second=second, training=False)
     self.assertAllClose(expected_output, self.evaluate(output))
 
-  @test_util.run_in_graph_and_eager_modes
   def test_training_args_call_build(self):
     input_dim = 2
 
@@ -1114,7 +1153,6 @@ class CustomCallSignatureTests(test.TestCase):
                                     'has been properly built.'))
     self.assertTrue(model.built, 'Model should be built after calling `build`.')
 
-  @test_util.run_in_graph_and_eager_modes
   def test_training_and_mask_args_call_build(self):
     input_dim = 2
 
@@ -1127,7 +1165,6 @@ class CustomCallSignatureTests(test.TestCase):
                                     'has been properly built.'))
     self.assertTrue(model.built, 'Model should be built after calling `build`.')
 
-  @test_util.run_in_graph_and_eager_modes
   def test_custom_call_kwargs_and_build(self):
     first_input_shape = (2, 3)
     second_input_shape = (2, 5)
@@ -1140,7 +1177,6 @@ class CustomCallSignatureTests(test.TestCase):
         ValueError, 'cannot build your model if it has positional'):
       model.build(input_shape=[first_input_shape, second_input_shape])
 
-  @test_util.run_in_graph_and_eager_modes
   def test_inputs_in_signature(self):
 
     class HasInputsAndOtherPositional(keras.Model):
@@ -1157,7 +1193,6 @@ class CustomCallSignatureTests(test.TestCase):
       x1, x2 = keras.Input((1, 1)), keras.Input((1, 1))
       model(x1, x2)
 
-  @test_util.run_in_graph_and_eager_modes
   def test_kwargs_in_signature(self):
 
     class HasKwargs(keras.Model):
@@ -1166,12 +1201,11 @@ class CustomCallSignatureTests(test.TestCase):
         return x
 
     model = HasKwargs()
-    arg = array_ops.ones([])
+    arg = array_ops.ones([1])
     model(arg, a=3)
     if not context.executing_eagerly():
       self.assertEqual(len(model.inputs), 1)
 
-  @test_util.run_in_graph_and_eager_modes
   def test_args_in_signature(self):
 
     class HasArgs(keras.Model):
@@ -1191,23 +1225,26 @@ class CustomCallSignatureTests(test.TestCase):
 
     class HasArgs(keras.Model):
 
-      def call(self, x, training=True, *args, **kwargs):
+      def call(self, x, training=True, *args, **kwargs):  # pylint:disable=keyword-arg-before-vararg
         return x
 
-    with context.graph_mode():
-      model = HasArgs()
-      x1, x2, x3 = keras.Input((1, 1)), keras.Input((1, 1)), keras.Input((1, 1))
-      with self.assertRaisesRegexp(
-          TypeError, 'may not accept both positional arguments and '):
-        model(x1, x2, x3, a=3)
+    model = HasArgs()
+    x1, x2, x3 = keras.Input((1, 1)), keras.Input((1, 1)), keras.Input((1, 1))
+    with self.assertRaisesRegexp(
+        TypeError, 'may not accept both positional arguments and '):
+      model(x1, x2, x3, a=3)
 
+  @test_util.assert_no_new_tensors
+  @test_util.assert_no_garbage_created
   def test_training_no_default(self):
+    if context.executing_eagerly():
+      self.skipTest('b/120997007')
 
-    with context.graph_mode():
-      model = TrainingNoDefaultModel()
-      arg = array_ops.ones([1, 1])
-      model(arg, True)
-      self.assertEqual(len(model.inputs), 1)
+    model = TrainingNoDefaultModel()
+
+    arg = array_ops.ones([1, 1])
+    model(arg, True)
+    self.assertEqual(len(model.inputs), 1)
 
   def test_training_no_default_with_positional(self):
 
@@ -1216,11 +1253,10 @@ class CustomCallSignatureTests(test.TestCase):
       def call(self, x, training, positional):
         return x
 
-    with context.graph_mode():
-      model = TrainingNoDefaultWithPositional()
-      x1, x2, x3 = keras.Input((1, 1)), keras.Input((1, 1)), keras.Input((1, 1))
-      with self.assertRaisesRegexp(TypeError, 'after a non-input'):
-        model(x1, x2, x3)
+    model = TrainingNoDefaultWithPositional()
+    x1, x2, x3 = keras.Input((1, 1)), keras.Input((1, 1)), keras.Input((1, 1))
+    with self.assertRaisesRegexp(TypeError, 'after a non-input'):
+      model(x1, x2, x3)
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/models.py b/tensorflow/python/keras/models.py
index 2637191bb75b357341376a703b2620243bd925bf..487259864c40abab644272a6c2a341d8335fc9be 100644
--- a/tensorflow/python/keras/models.py
+++ b/tensorflow/python/keras/models.py
@@ -22,30 +22,35 @@ from __future__ import print_function
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras import optimizers
-from tensorflow.python.keras.engine import saving
 from tensorflow.python.keras.engine import sequential
 from tensorflow.python.keras.engine import training
 from tensorflow.python.keras.engine.base_layer import Layer
 from tensorflow.python.keras.engine.input_layer import Input
 from tensorflow.python.keras.engine.input_layer import InputLayer
 from tensorflow.python.keras.engine.network import Network
+from tensorflow.python.keras.saving import hdf5_format
+from tensorflow.python.keras.saving import model_config
 from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.keras.utils.generic_utils import CustomObjectScope
-from tensorflow.python.training.checkpointable import base as checkpointable
-from tensorflow.python.training.checkpointable import data_structures
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import keras_export
+
 
 # API entries importable from `keras.models`:
 Model = training.Model  # pylint: disable=invalid-name
 Sequential = sequential.Sequential  # pylint: disable=invalid-name
-save_model = saving.save_model
-load_model = saving.load_model
-model_from_config = saving.model_from_config
-model_from_yaml = saving.model_from_yaml
-model_from_json = saving.model_from_json
+save_model = hdf5_format.save_model
+load_model = hdf5_format.load_model
+model_from_config = model_config.model_from_config
+model_from_yaml = model_config.model_from_yaml
+model_from_json = model_config.model_from_json
+
+
+def _clone_layer(layer):
+  return layer.__class__.from_config(layer.get_config())
 
 
-def _clone_functional_model(model, input_tensors=None):
+def _clone_functional_model(model, input_tensors=None, share_weights=False):
   """Clone a functional `Model` instance.
 
   Model cloning is similar to calling a model on new inputs,
@@ -57,6 +62,11 @@ def _clone_functional_model(model, input_tensors=None):
       input_tensors: optional list of input tensors
           to build the model upon. If not provided,
           placeholders will be created.
+      share_weights: flag to enable sharing of non-input layers between the
+          cloned and original model. Note this still clones the input layers.
+          This is required when we create a per-replica copy of the model with
+          distribution strategy; we want the weights to be shared but still
+          feed inputs separately so we create new input layers.
 
   Returns:
       An instance of `Model` reproducing the behavior
@@ -90,15 +100,14 @@ def _clone_functional_model(model, input_tensors=None):
       # Cache newly created input layer.
       newly_created_input_layer = input_tensor._keras_history[0]
       layer_map[layer] = newly_created_input_layer
+
     for original_input_layer, cloned_input_layer in zip(model._input_layers,
                                                         input_layers):
       layer_map[original_input_layer] = cloned_input_layer
   else:
     # Make sure that all input tensors come from a Keras layer.
     # If tensor comes from an input layer: cache the input layer.
-    if isinstance(input_tensors, tuple):
-      input_tensors = list(input_tensors)
-    input_tensors = generic_utils.to_list(input_tensors)
+    input_tensors = nest.flatten(input_tensors)
     input_tensors_ = []
     for i in range(len(input_tensors)):
       input_tensor = input_tensors[i]
@@ -107,6 +116,7 @@ def _clone_functional_model(model, input_tensors=None):
         name = original_input_layer.name
         input_tensor = Input(tensor=input_tensor,
                              name='input_wrapper_for_' + name)
+
         input_tensors_.append(input_tensor)
         # Cache newly created input layer.
         newly_created_input_layer = input_tensor._keras_history[0]
@@ -129,10 +139,11 @@ def _clone_functional_model(model, input_tensors=None):
 
       # Get or create layer.
       if layer not in layer_map:
-        # Clone layer.
-        new_layer = layer.__class__.from_config(layer.get_config())
-        layer_map[layer] = new_layer
-        layer = new_layer
+        if not share_weights:
+          # Clone layer.
+          new_layer = _clone_layer(layer)
+          layer_map[layer] = new_layer
+          layer = new_layer
       else:
         # Reuse previously cloned layer.
         layer = layer_map[layer]
@@ -140,34 +151,18 @@ def _clone_functional_model(model, input_tensors=None):
         if isinstance(layer, InputLayer):
           continue
 
-      # Gather inputs to call the new layer.
-      reference_input_tensors = node.input_tensors
-      reference_output_tensors = node.output_tensors
-
       # If all previous input tensors are available in tensor_map,
       # then call node.inbound_layer on them.
-      computed_tensors = []
-      for x in reference_input_tensors:
-        if x in tensor_map:
-          computed_tensors.append(tensor_map[x])
-
-      if len(computed_tensors) == len(reference_input_tensors):
+      if all(
+          tensor in tensor_map for tensor in nest.flatten(node.input_tensors)):
+        computed_tensors = nest.map_structure(lambda t: tensor_map[t],
+                                              node.input_tensors)
         # Call layer.
-        if node.arguments:
-          kwargs = node.arguments
-        else:
-          kwargs = {}
-        if len(computed_tensors) == 1:
-          computed_tensor = computed_tensors[0]
-          output_tensors = generic_utils.to_list(layer(computed_tensor,
-                                                       **kwargs))
-          computed_tensors = [computed_tensor]
-        else:
-          computed_tensors = computed_tensors
-          output_tensors = generic_utils.to_list(layer(computed_tensors,
-                                                       **kwargs))
+        kwargs = node.arguments or {}
+        output_tensors = layer(computed_tensors, **kwargs)
 
-        for x, y in zip(reference_output_tensors, output_tensors):
+        for x, y in zip(
+            nest.flatten(node.output_tensors), nest.flatten(output_tensors)):
           tensor_map[x] = y
 
   # Check that we did compute the model outputs,
@@ -176,10 +171,13 @@ def _clone_functional_model(model, input_tensors=None):
   for x in model.outputs:
     assert x in tensor_map, 'Could not compute output ' + str(x)
     output_tensors.append(tensor_map[x])
+
+  input_tensors = nest.pack_sequence_as(model._nested_inputs, input_tensors)
+  output_tensors = nest.pack_sequence_as(model._nested_outputs, output_tensors)
   return Model(input_tensors, output_tensors, name=model.name)
 
 
-def _clone_sequential_model(model, input_tensors=None):
+def _clone_sequential_model(model, input_tensors=None, share_weights=False):
   """Clone a `Sequential` model instance.
 
   Model cloning is similar to calling a model on new inputs,
@@ -191,6 +189,11 @@ def _clone_sequential_model(model, input_tensors=None):
       input_tensors: optional list of input tensors
           to build the model upon. If not provided,
           placeholders will be created.
+      share_weights: flag to enable sharing of non-input layers between the
+          cloned and original model. Note this still clones the input layers.
+          This is required when we create a per-replica copy of the model with
+          distribution strategy; we want the weights to be shared but still
+          feed inputs separately so we create new input layers.
 
   Returns:
       An instance of `Sequential` reproducing the behavior
@@ -205,23 +208,28 @@ def _clone_sequential_model(model, input_tensors=None):
                      'to be a `Sequential` model instance, '
                      'but got:', model)
 
-  def clone(layer):
-    return layer.__class__.from_config(layer.get_config())
-
   # Use model._layers to ensure that all layers are cloned. The model's layers
   # property will exclude the initial InputLayer (if it exists) in the model,
   # resulting in a different Sequential model structure.
   if input_tensors is None:
-    layers = [clone(layer) for layer in model._layers]
+    if share_weights:
+      # In preserve weights case we still want the input layers to be cloned.
+      layers = []
+      for layer in model._layers:
+        if isinstance(layer, InputLayer):
+          layers.append(_clone_layer(layer))
+        else:
+          layers.append(layer)
+    else:
+      layers = [_clone_layer(layer) for layer in model._layers]
     return Sequential(layers=layers, name=model.name)
   else:
     # If input tensors are provided, the original model's InputLayer is
     # overwritten with a different InputLayer.
     layers = [
-        clone(layer)
-        for layer in model._layers
-        if not isinstance(layer, InputLayer)
-    ]
+        layer for layer in model._layers if not isinstance(layer, InputLayer)]
+    if not share_weights:
+      layers = [_clone_layer(layer) for layer in layers]
     if len(generic_utils.to_list(input_tensors)) != 1:
       raise ValueError('To clone a `Sequential` model, we expect '
                        ' at most one tensor '
@@ -244,7 +252,7 @@ def _clone_sequential_model(model, input_tensors=None):
     return Sequential(layers=[input_layer] + layers, name=model.name)
 
 
-@tf_export('keras.models.clone_model')
+@keras_export('keras.models.clone_model')
 def clone_model(model, input_tensors=None):
   """Clone any `Model` instance.
 
@@ -274,8 +282,6 @@ def clone_model(model, input_tensors=None):
 
 
 # "Clone" a subclassed model by reseting all of the attributes.
-
-
 def _in_place_subclassed_model_reset(model):
   """Substitute for model cloning that works for subclassed models.
 
@@ -307,11 +313,16 @@ def _in_place_subclassed_model_reset(model):
       continue
     if isinstance(value, Layer):
       attributes_cache[name] = value
-      assert value in model._layers
+      assert value in model.layers
+      if hasattr(value, 'layers') and value.layers:
+        raise ValueError('We do not support the use of nested layers '
+                         'in `model_to_estimator` at this time. Found nested '
+                         'layer: %s' % value)
     elif isinstance(
         value,
         (list, tuple)) and name not in ('layers', '_layers', 'metrics',
-                                        '_compile_stateful_metric_functions'):
+                                        '_compile_stateful_metric_functions',
+                                        '_output_loss_metrics'):
       # Handle case: list/tuple of layers (also tracked by the Network API).
       if value and all(isinstance(val, Layer) for val in value):
         raise ValueError('We do not support the use of list-of-layers '
@@ -322,7 +333,9 @@ def _in_place_subclassed_model_reset(model):
   # Replace layers on the model with fresh layers
   layers_to_names = {value: key for key, value in attributes_cache.items()}
   original_layers = model._layers[:]
-  model._layers = data_structures.NoDependency([])
+  setattr_tracking = model._setattr_tracking
+  model._setattr_tracking = False
+  model._layers = []
   for layer in original_layers:  # We preserve layer order.
     config = layer.get_config()
     # This will not work for nested subclassed models used as layers.
@@ -335,6 +348,7 @@ def _in_place_subclassed_model_reset(model):
     fresh_layer = layer.__class__.from_config(config)
     name = layers_to_names[layer]
     setattr(model, name, fresh_layer)
+    model._layers.append(fresh_layer)
 
   # Cache original model build attributes (in addition to layers)
   if (not hasattr(model, '_original_attributes_cache') or
@@ -367,12 +381,31 @@ def _in_place_subclassed_model_reset(model):
       ]
       for name in attributes_to_cache:
         attributes_cache[name] = getattr(model, name)
-  model._original_attributes_cache = data_structures.NoDependency(
-      attributes_cache)
-  # Reset built state
+  model._original_attributes_cache = attributes_cache
+  _reset_build_compile_trackers(model)
+  model._setattr_tracking = setattr_tracking
+
+
+def _reset_build_compile_trackers(model):
+  """Reset state trackers for model.
+
+  Note that we do not actually zero out attributes such as optimizer,
+  but instead rely on the expectation that all of the attrs will be
+  over-written on calling build/compile/etc. This is somewhat fragile,
+  insofar as we check elsewhere for the presence of these attributes as
+  evidence of having been built/compiled/etc. Pending a better way to do this,
+  we reset key attributes here to allow building and compiling.
+
+  Args:
+    model: the model that is being reset
+  """
+  # Reset build state
   model.built = False
   model.inputs = None
   model.outputs = None
+  # Reset compile state
+  model._is_compiled = False  # pylint:disable=protected-access
+  model.optimizer = None
 
 
 def in_place_subclassed_model_state_restoration(model):
@@ -393,20 +426,18 @@ def in_place_subclassed_model_state_restoration(model):
     # back the previous attributes and track Layers by their original names
     # without adding dependencies on "utility" attributes which Models exempt
     # when they're constructed.
-    model._layers = data_structures.NoDependency([])
+    setattr_tracking = model._setattr_tracking
+    model._setattr_tracking = False
+    model._layers = []
     for name, value in model._original_attributes_cache.items():
-      if not isinstance(value, checkpointable.CheckpointableBase):
-        # If this value is not already checkpointable, it's probably that way
-        # for a reason; we don't want to start tracking data structures that the
-        # original Model didn't.
-        value = data_structures.NoDependency(value)
       setattr(model, name, value)
+      if isinstance(value, Layer):
+        model._layers.append(value)
     model._original_attributes_cache = None
+    model._setattr_tracking = setattr_tracking
   else:
     # Restore to the state of a never-called model.
-    model.built = False
-    model.inputs = None
-    model.outputs = None
+    _reset_build_compile_trackers(model)
 
 
 def clone_and_build_model(
@@ -431,7 +462,7 @@ def clone_and_build_model(
       or functions.
     compile_clone: Boolean, whether to compile model clone (default `True`).
     in_place_reset: Boolean, whether to reset the model in place. Only used if
-      the model is not a graph network. If the model is a subclassed model, then
+      the model is a subclassed model. In the case of a subclassed model,
       this argument must be set to `True` (default `False`). To restore the
       original model, use the function
       `in_place_subclassed_model_state_restoration(model)`.
@@ -448,7 +479,10 @@ def clone_and_build_model(
       - cloning a subclassed model with `in_place_reset` set to False.
       - compiling the clone when the original model has not been compiled.
   """
-  if compile_clone and not model.optimizer:
+  # Grab optimizer now, as we reset-in-place for subclassed models, but
+  # want to maintain access to the original optimizer.
+  orig_optimizer = model.optimizer
+  if compile_clone and not orig_optimizer:
     raise ValueError(
         'Error when cloning model: compile_clone was set to True, but the '
         'original model has not been compiled.')
@@ -471,8 +505,8 @@ def clone_and_build_model(
   else:
     if not in_place_reset:
       raise ValueError(
-          'Model is not a graph network (usually means that it is a subclassed '
-          'model). The model cannot be cloned, but there is a workaround where '
+          'This model is a subclassed model. '
+          'Such a model cannot be cloned, but there is a workaround where '
           'the model is reset in-place. To use this, please set the argument '
           '`in_place_reset` to `True`. This will reset the attributes in the '
           'original model. To restore the attributes, call '
@@ -484,14 +518,14 @@ def clone_and_build_model(
         input_tensors = input_tensors[0]
       clone._set_inputs(input_tensors)
 
-  if compile_clone and model.optimizer:
-    if isinstance(model.optimizer, optimizers.TFOptimizer):
+  if compile_clone:
+    if isinstance(orig_optimizer, optimizers.TFOptimizer):
       optimizer = optimizers.TFOptimizer(
-          model.optimizer.optimizer, optimizer_iterations)
+          orig_optimizer.optimizer, optimizer_iterations)
       K.track_tf_optimizer(optimizer)
     else:
-      optimizer_config = model.optimizer.get_config()
-      optimizer = model.optimizer.__class__.from_config(optimizer_config)
+      optimizer_config = orig_optimizer.get_config()
+      optimizer = orig_optimizer.__class__.from_config(optimizer_config)
       if optimizer_iterations is not None:
         optimizer.iterations = optimizer_iterations
 
diff --git a/tensorflow/python/keras/models_test.py b/tensorflow/python/keras/models_test.py
index c466d94fed8f34e0ca9e25425f88d6028c806131..f429aba498d90b3afc9d18925543c88b48c5ffd9 100644
--- a/tensorflow/python/keras/models_test.py
+++ b/tensorflow/python/keras/models_test.py
@@ -19,18 +19,21 @@ from __future__ import division
 from __future__ import print_function
 
 import copy
+import functools
 import os
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
 from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import metrics
 from tensorflow.python.keras import models
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -52,158 +55,181 @@ class TestModel(keras.Model):
     return self.layer1(x)
 
 
-def sequential_model(add_input_layer, include_input_shape=True):
-  model = keras.models.Sequential()
+def _get_layers(input_shape=(4,), add_input_layer=False):
   if add_input_layer:
-    model.add(keras.layers.InputLayer(input_shape=(4,)))
-    model.add(keras.layers.Dense(4))
-  elif include_input_shape:
-    model.add(keras.layers.Dense(4, input_shape=(4,)))
+    model_layers = [keras.layers.InputLayer(input_shape=input_shape),
+                    keras.layers.Dense(4)]
+  elif input_shape:
+    model_layers = [keras.layers.Dense(4, input_shape=input_shape)]
   else:
-    model.add(keras.layers.Dense(4))
-  model.add(keras.layers.BatchNormalization())
-  model.add(keras.layers.Dropout(0.5))
-  model.add(keras.layers.Dense(4))
-  return model
-
-
-class TestModelCloning(test.TestCase):
-
-  @test_util.run_v1_only('b/120545219')
-  def test_clone_sequential_model(self):
-    with self.cached_session():
-      val_a = np.random.random((10, 4))
-      val_out = np.random.random((10, 4))
-
-      model = sequential_model(False)
-
-    # Everything should work in a new session.
-    keras.backend.clear_session()
-
-    with self.cached_session():
-      # With placeholder creation
-      new_model = keras.models.clone_model(model)
+    model_layers = [keras.layers.Dense(4)]
+
+  model_layers += [
+      keras.layers.BatchNormalization(),
+      keras.layers.Dropout(0.5),
+      keras.layers.Dense(4)]
+
+  return model_layers
+
+
+def _get_model(input_shape=(4,)):
+  model_layers = _get_layers(input_shape=None, add_input_layer=False)
+  return testing_utils.get_model_from_layers(
+      model_layers, input_shape=input_shape)
+
+
+class TestModelCloning(keras_parameterized.TestCase):
+
+  @keras_parameterized.run_all_keras_modes
+  @parameterized.named_parameters([
+      {'testcase_name': 'has_input_layer',
+       'input_shape': (4,),
+       'add_input_layer': True,
+       'share_weights': False},
+      {'testcase_name': 'no_input_layer',
+       'input_shape': None,
+       'add_input_layer': False,
+       'share_weights': False},
+      {'testcase_name': 'has_input_layer_share_weights',
+       'input_shape': (4,),
+       'add_input_layer': True,
+       'share_weights': True},
+      {'testcase_name': 'no_input_layer_share_weights',
+       'input_shape': None,
+       'add_input_layer': False,
+       'share_weights': True},
+  ])
+  def test_clone_sequential_model(
+      self, input_shape, add_input_layer, share_weights):
+
+    if share_weights:
+      clone_fn = functools.partial(
+          keras.models._clone_sequential_model, share_weights=True)
+    else:
+      clone_fn = keras.models.clone_model
+
+    val_a = np.random.random((10, 4))
+    model = models.Sequential(_get_layers(input_shape, add_input_layer))
+    # Sanity check
+    self.assertEqual(
+        isinstance(model._layers[0], keras.layers.InputLayer),
+        add_input_layer)
+    self.assertEqual(model._is_graph_network, add_input_layer)
+
+    # With placeholder creation -- clone model should have an InputLayer
+    # if the original model has one.
+    new_model = clone_fn(model)
+    self.assertEqual(
+        isinstance(new_model._layers[0], keras.layers.InputLayer),
+        add_input_layer)
+    self.assertEqual(new_model._is_graph_network, model._is_graph_network)
+    if input_shape:
       # update ops from batch norm needs to be included
       self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
-      new_model.compile('rmsprop', 'mse')
-      new_model.train_on_batch(val_a, val_out)
 
-      # On top of new tensor
-      input_a = keras.Input(shape=(4,))
-      new_model = keras.models.clone_model(model, input_tensors=input_a)
-      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
-      new_model.compile('rmsprop', 'mse')
-      new_model.train_on_batch(val_a, val_out)
-
-      # On top of new, non-Keras tensor
+    # On top of new tensor  -- clone model should always have an InputLayer.
+    input_a = keras.Input(shape=(4,))
+    new_model = clone_fn(model, input_tensors=input_a)
+    self.assertIsInstance(new_model._layers[0], keras.layers.InputLayer)
+    self.assertTrue(new_model._is_graph_network)
+
+    # On top of new, non-Keras tensor  -- clone model should always have an
+    # InputLayer.
+    if not context.executing_eagerly():
+      # TODO(b/121277734):Skip Eager contexts, as Input() layers raise an error
+      # saying they should not be used with EagerTensors
       input_a = keras.backend.variable(val_a)
-      new_model = keras.models.clone_model(model, input_tensors=input_a)
-      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
-      new_model.compile('rmsprop', 'mse')
-      new_model.train_on_batch(None, val_out)
-
-  @test_util.run_v1_only('b/120545219')
-  def test_clone_sequential_model_input_layer(self):
-
-    def test_input_layer(include_inputs):
-      with self.cached_session():
-        val_a = np.random.random((10, 4))
-        model = sequential_model(include_inputs, include_inputs)
-        # Sanity check
-        self.assertEqual(
-            isinstance(model._layers[0], keras.layers.InputLayer),
-            include_inputs)
-        self.assertEqual(model._is_graph_network, include_inputs)
-
-      keras.backend.clear_session()
-      with self.cached_session():
-        # With placeholder creation -- clone model should have an InputLayer
-        # if the original model has one.
-        new_model = keras.models.clone_model(model)
-        self.assertEqual(
-            isinstance(new_model._layers[0], keras.layers.InputLayer),
-            include_inputs)
-        self.assertEqual(new_model._is_graph_network, model._is_graph_network)
-
-        # On top of new tensor  -- clone model should always have an InputLayer.
-        input_a = keras.Input(shape=(4,))
-        new_model = keras.models.clone_model(model, input_tensors=input_a)
-        self.assertIsInstance(new_model._layers[0], keras.layers.InputLayer)
-        self.assertTrue(new_model._is_graph_network)
-
-        # On top of new, non-Keras tensor  -- clone model should always have an
-        # InputLayer.
-        input_a = keras.backend.variable(val_a)
-        new_model = keras.models.clone_model(model, input_tensors=input_a)
-        self.assertIsInstance(new_model._layers[0], keras.layers.InputLayer)
-        self.assertTrue(new_model._is_graph_network)
-
-    test_input_layer(True)
-    test_input_layer(False)
-
-  @test_util.run_v1_only('b/120545219')
-  def test_clone_functional_model(self):
-    with self.cached_session():
-      val_a = np.random.random((10, 4))
-      val_b = np.random.random((10, 4))
-      val_out = np.random.random((10, 4))
-
-      input_a = keras.Input(shape=(4,))
-      input_b = keras.Input(shape=(4,))
-      dense_1 = keras.layers.Dense(4,)
-      dense_2 = keras.layers.Dense(4,)
-
-      x_a = dense_1(input_a)
-      x_a = keras.layers.Dropout(0.5)(x_a)
-      x_a = keras.layers.BatchNormalization()(x_a)
-      x_b = dense_1(input_b)
-      x_a = dense_2(x_a)
-      outputs = keras.layers.add([x_a, x_b])
-      model = keras.models.Model([input_a, input_b], outputs)
-
-    # Everything should work in a new session.
-    keras.backend.clear_session()
-
-    with self.cached_session():
-      # With placeholder creation
-      new_model = keras.models.clone_model(model)
-      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
-      new_model.compile('rmsprop', 'mse')
-      new_model.train_on_batch([val_a, val_b], val_out)
-
-      # On top of new tensors
-      input_a = keras.Input(shape=(4,), name='a')
-      input_b = keras.Input(shape=(4,), name='b')
-      new_model = keras.models.clone_model(
-          model, input_tensors=[input_a, input_b])
-      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
-      new_model.compile('rmsprop', 'mse')
-      new_model.train_on_batch([val_a, val_b], val_out)
-
-      # On top of new, non-Keras tensors
+      new_model = clone_fn(model, input_tensors=input_a)
+      self.assertIsInstance(new_model._layers[0], keras.layers.InputLayer)
+      self.assertTrue(new_model._is_graph_network)
+
+  @keras_parameterized.run_all_keras_modes
+  @parameterized.named_parameters([
+      {'testcase_name': 'clone_weights', 'share_weights': False},
+      {'testcase_name': 'share_weights', 'share_weights': True},
+  ])
+  def test_clone_functional_model(self, share_weights):
+    if share_weights:
+      clone_fn = functools.partial(
+          keras.models._clone_functional_model, share_weights=True)
+    else:
+      clone_fn = keras.models.clone_model
+
+    val_a = np.random.random((10, 4))
+    val_b = np.random.random((10, 4))
+    val_out = np.random.random((10, 4))
+
+    input_a = keras.Input(shape=(4,))
+    input_b = keras.Input(shape=(4,))
+    dense_1 = keras.layers.Dense(4,)
+    dense_2 = keras.layers.Dense(4,)
+
+    x_a = dense_1(input_a)
+    x_a = keras.layers.Dropout(0.5)(x_a)
+    x_a = keras.layers.BatchNormalization()(x_a)
+    x_b = dense_1(input_b)
+    x_a = dense_2(x_a)
+    outputs = keras.layers.add([x_a, x_b])
+    model = keras.models.Model([input_a, input_b], outputs)
+
+    # With placeholder creation
+    new_model = clone_fn(model)
+    self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
+    new_model.compile(
+        testing_utils.get_v2_optimizer('rmsprop'), 'mse',
+        run_eagerly=testing_utils.should_run_eagerly())
+    new_model.train_on_batch([val_a, val_b], val_out)
+
+    # On top of new tensors
+    input_a = keras.Input(shape=(4,), name='a')
+    input_b = keras.Input(shape=(4,), name='b')
+    new_model = keras.models.clone_model(
+        model, input_tensors=[input_a, input_b])
+    self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
+    new_model.compile(
+        testing_utils.get_v2_optimizer('rmsprop'), 'mse',
+        run_eagerly=testing_utils.should_run_eagerly())
+    new_model.train_on_batch([val_a, val_b], val_out)
+
+    # On top of new, non-Keras tensors
+    if not context.executing_eagerly():
+      # TODO(b/121277734):Skip Eager contexts, as Input() layers raise an error
+      # saying they should not be used with EagerTensors
       input_a = keras.backend.variable(val_a)
       input_b = keras.backend.variable(val_b)
-      new_model = keras.models.clone_model(
-          model, input_tensors=[input_a, input_b])
+      new_model = clone_fn(model, input_tensors=[input_a, input_b])
       self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
-      new_model.compile('rmsprop', 'mse')
+      new_model.compile(
+          testing_utils.get_v2_optimizer('rmsprop'), 'mse',
+          run_eagerly=testing_utils.should_run_eagerly())
       new_model.train_on_batch(None, val_out)
 
-  @test_util.run_in_graph_and_eager_modes
-  def test_clone_functional_model_with_masking(self):
-    with self.cached_session():
-      x = np.array([[[1], [1]], [[0], [0]]])
-      inputs = keras.Input((2, 1))
-      outputs = keras.layers.Masking(mask_value=0)(inputs)
-      outputs = keras.layers.TimeDistributed(
-          keras.layers.Dense(1, kernel_initializer='one'))(outputs)
-      model = keras.Model(inputs, outputs)
-
-      model = keras.models.clone_model(model)
-      model.compile(loss='mse', optimizer=adam.AdamOptimizer(0.01))
-      y = np.array([[[1], [1]], [[1], [1]]])
-      loss = model.train_on_batch(x, y)
-      self.assertEqual(float(loss), 0.)
+  @keras_parameterized.run_all_keras_modes
+  @parameterized.named_parameters([
+      {'testcase_name': 'clone_weights', 'share_weights': False},
+      {'testcase_name': 'share_weights', 'share_weights': True},
+  ])
+  def test_clone_functional_with_masking(self, share_weights):
+    if share_weights:
+      clone_fn = functools.partial(
+          keras.models._clone_functional_model, share_weights=True)
+    else:
+      clone_fn = keras.models.clone_model
+
+    x = np.array([[[1.], [1.]], [[0.], [0.]]])
+    inputs = keras.Input((2, 1))
+    outputs = keras.layers.Masking(mask_value=0)(inputs)
+    outputs = keras.layers.TimeDistributed(
+        keras.layers.Dense(1, kernel_initializer='one'))(outputs)
+    model = keras.Model(inputs, outputs)
+
+    model = clone_fn(model)
+    model.compile(
+        loss='mse', optimizer=testing_utils.get_v2_optimizer('adam'),
+        run_eagerly=testing_utils.should_run_eagerly())
+    y = np.array([[[1], [1]], [[1], [1]]])
+    loss = model.train_on_batch(x, y)
+    self.assertEqual(float(loss), 0.)
 
   def test_model_cloning_invalid_use_cases(self):
     seq_model = keras.models.Sequential()
@@ -254,15 +280,21 @@ def _has_placeholder(graph):
   return any('Placeholder' in s for s in ops_types)
 
 
-class CheckpointingTests(test.TestCase):
+@keras_parameterized.run_with_all_model_types
+@keras_parameterized.run_all_keras_modes
+class CheckpointingTests(keras_parameterized.TestCase):
 
-  @test_util.run_in_graph_and_eager_modes
   def test_optimizer_dependency(self):
-    model = keras.models.Sequential()
-    model.add(keras.layers.Dense(1, input_shape=(4,)))
-    opt = adam.AdamOptimizer(0.01)
-    model.compile(optimizer=opt, loss='mse')
-    model.fit(x=np.array([[1., 2., 3., 4.]]), y=[1.], epochs=2)
+    model = _get_model()
+    opt = adam.AdamOptimizer(.01)
+    model.compile(
+        optimizer=opt, loss='mse',
+        run_eagerly=testing_utils.should_run_eagerly())
+
+    model.fit(
+        x=np.array([[1., 2., 3., 4.]]),
+        y=np.array([[1., 1., 1., 1.]]),
+        epochs=2)
     save_prefix = os.path.join(self.get_temp_dir(), 'ckpt')
     beta1_power, _ = opt._get_beta_accumulators()
     self.evaluate(beta1_power.assign(12.))
@@ -272,7 +304,8 @@ class CheckpointingTests(test.TestCase):
     self.assertEqual(12., self.evaluate(beta1_power))
 
 
-class TestModelBackend(test.TestCase):
+@keras_parameterized.run_all_keras_modes
+class TestModelBackend(keras_parameterized.TestCase):
 
   def test_model_backend_float64_use_cases(self):
     # Test case for GitHub issue 19318
@@ -282,7 +315,9 @@ class TestModelBackend(test.TestCase):
     x = keras.Input((5,))
     y = keras.layers.Dense(1)(x)
     model = keras.models.Model(x, y)
-    model.compile('rmsprop', 'mse')
+    model.compile(
+        testing_utils.get_v2_optimizer('rmsprop'), 'mse',
+        run_eagerly=testing_utils.should_run_eagerly())
 
     keras.backend.set_floatx(floatx)
 
@@ -317,48 +352,46 @@ class TestModelDeepCopy(test.TestCase):
                       model_copy.get_weights()[0]))
 
 
-@test_util.run_v1_only('b/120545219')
-class TestCloneAndBuildModel(test.TestCase):
+@keras_parameterized.run_all_keras_modes
+class TestCloneAndBuildModel(keras_parameterized.TestCase):
 
+  @keras_parameterized.run_with_all_model_types
   def test_clone_and_build_non_compiled_model(self):
-    with self.cached_session():
-      inp = np.random.random((10, 4))
-      out = np.random.random((10, 4))
+    inp = np.random.random((10, 4))
+    out = np.random.random((10, 4))
 
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(4, input_shape=(4,)))
-      model.add(keras.layers.BatchNormalization())
-      model.add(keras.layers.Dropout(0.5))
-      model.add(keras.layers.Dense(4))
-
-    # Everything should work in a new session.
-    keras.backend.clear_session()
-
-    with self.cached_session():
-      with self.assertRaisesRegexp(ValueError, 'has not been compiled'):
-        models.clone_and_build_model(model, compile_clone=True)
-
-      # With placeholder creation
-      new_model = models.clone_and_build_model(model, compile_clone=False)
-      with self.assertRaisesRegexp(RuntimeError, 'must compile'):
-        new_model.evaluate(inp, out)
-      with self.assertRaisesRegexp(RuntimeError, 'must compile'):
-        new_model.train_on_batch(inp, out)
-      new_model.compile('rmsprop', 'mse')
-      new_model.train_on_batch(inp, out)
+    model = _get_model()
 
-      # Create new tensors for inputs and targets
-      input_a = keras.Input(shape=(4,))
-      target_a = keras.Input(shape=(4,))
-      new_model = models.clone_and_build_model(model, input_tensors=input_a,
-                                               target_tensors=[target_a],
-                                               compile_clone=False)
-      with self.assertRaisesRegexp(RuntimeError, 'must compile'):
-        new_model.evaluate(inp, out)
-      with self.assertRaisesRegexp(RuntimeError, 'must compile'):
-        new_model.train_on_batch(inp, out)
-      new_model.compile('rmsprop', 'mse')
+    with self.assertRaisesRegexp(ValueError, 'has not been compiled'):
+      models.clone_and_build_model(model, compile_clone=True)
+
+    is_subclassed = (testing_utils.get_model_type() == 'subclass')
+    # With placeholder creation
+    new_model = models.clone_and_build_model(
+        model, compile_clone=False, in_place_reset=is_subclassed)
+    with self.assertRaisesRegexp(RuntimeError, 'must compile'):
+      new_model.evaluate(inp, out)
+    with self.assertRaisesRegexp(RuntimeError, 'must compile'):
       new_model.train_on_batch(inp, out)
+    new_model.compile(
+        testing_utils.get_v2_optimizer('rmsprop'), 'mse',
+        run_eagerly=testing_utils.should_run_eagerly())
+    new_model.train_on_batch(inp, out)
+
+    # Create new tensors for inputs and targets
+    input_a = keras.Input(shape=(4,))
+    target_a = keras.Input(shape=(4,))
+    new_model = models.clone_and_build_model(
+        model, input_tensors=input_a, target_tensors=[target_a],
+        compile_clone=False, in_place_reset=is_subclassed)
+    with self.assertRaisesRegexp(RuntimeError, 'must compile'):
+      new_model.evaluate(inp, out)
+    with self.assertRaisesRegexp(RuntimeError, 'must compile'):
+      new_model.train_on_batch(inp, out)
+    new_model.compile(
+        testing_utils.get_v2_optimizer('rmsprop'), 'mse',
+        run_eagerly=testing_utils.should_run_eagerly())
+    new_model.train_on_batch(inp, out)
 
   def _assert_same_compile_params(self, model):
     """Assert that two models have the same compile parameters."""
@@ -371,130 +404,88 @@ class TestCloneAndBuildModel(test.TestCase):
     self.assertEqual(['acc', metrics.categorical_accuracy],
                      model._compile_metrics)
 
-  def _clone_and_build_test_helper(self, model, is_subclassed=False):
+  def _clone_and_build_test_helper(self, model, model_type):
     inp = np.random.random((10, 4))
     out = np.random.random((10, 4))
 
-    # Everything should work in a new session.
-    keras.backend.clear_session()
-
-    with self.cached_session():
-      # With placeholder creation
-      new_model = models.clone_and_build_model(
-          model, compile_clone=True, in_place_reset=is_subclassed)
-
-      self._assert_same_compile_params(new_model)
-      new_model.train_on_batch(inp, out)
-      new_model.evaluate(inp, out)
+    is_subclassed = (model_type == 'subclass')
+
+    # With placeholder creation
+    new_model = models.clone_and_build_model(
+        model, compile_clone=True, in_place_reset=is_subclassed)
+
+    self._assert_same_compile_params(new_model)
+    new_model.train_on_batch(inp, out)
+    new_model.evaluate(inp, out)
+
+    # Create new tensors for inputs and targets
+    input_a = keras.Input(shape=(4,), name='a')
+    new_model = models.clone_and_build_model(
+        model, input_tensors=input_a, compile_clone=True,
+        in_place_reset=is_subclassed)
+    self._assert_same_compile_params(new_model)
+    new_model.train_on_batch(inp, out)
+    new_model.evaluate(inp, out)
+
+    target_a = keras.Input(shape=(4,), name='b')
+    new_model = models.clone_and_build_model(
+        model, input_tensors=input_a, target_tensors=[target_a],
+        compile_clone=True, in_place_reset=is_subclassed)
+    self._assert_same_compile_params(new_model)
+    new_model.train_on_batch(inp, out)
+    new_model.evaluate(inp, out)
+
+  @keras_parameterized.run_with_all_model_types
+  def test_clone_and_build_compiled(self):
+    model = _get_model()
+    model.compile(
+        testing_utils.get_v2_optimizer('rmsprop'), 'mse',
+        metrics=['acc', metrics.categorical_accuracy],
+        run_eagerly=testing_utils.should_run_eagerly())
+
+    self._clone_and_build_test_helper(model, testing_utils.get_model_type())
+
+  def test_clone_and_build_sequential_without_inputs_defined(self):
+    model = models.Sequential(_get_layers(input_shape=None))
+    model.compile(
+        testing_utils.get_v2_optimizer('rmsprop'),
+        'mse', metrics=['acc', metrics.categorical_accuracy],
+        run_eagerly=testing_utils.should_run_eagerly())
+    self._clone_and_build_test_helper(model, 'sequential')
 
-      # Create new tensors for inputs and targets
-      input_a = keras.Input(shape=(4,), name='a')
-      new_model = models.clone_and_build_model(
-          model, input_tensors=input_a, compile_clone=True,
-          in_place_reset=is_subclassed)
-      self._assert_same_compile_params(new_model)
-      new_model.train_on_batch(inp, out)
-      new_model.evaluate(inp, out)
-
-      target_a = keras.Input(shape=(4,), name='b')
-      new_model = models.clone_and_build_model(
-          model, input_tensors=input_a, target_tensors=[target_a],
-          compile_clone=True, in_place_reset=is_subclassed)
-      self._assert_same_compile_params(new_model)
-      new_model.train_on_batch(inp, out)
-      new_model.evaluate(inp, out)
-
-  def test_clone_and_build_compiled_sequential_model(self):
-    with self.cached_session():
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(4, input_shape=(4,)))
-      model.add(keras.layers.BatchNormalization())
-      model.add(keras.layers.Dropout(0.5))
-      model.add(keras.layers.Dense(4))
-      model.compile('rmsprop', 'mse',
-                    metrics=['acc', metrics.categorical_accuracy])
-
-    self._clone_and_build_test_helper(model)
-
-  def test_clone_and_build_functional_model(self):
-    with self.cached_session():
-      input_a = keras.Input(shape=(4,))
-      dense_1 = keras.layers.Dense(4,)
-      dense_2 = keras.layers.Dense(4,)
-
-      x_a = dense_1(input_a)
-      x_a = keras.layers.Dropout(0.5)(x_a)
-      x_a = keras.layers.BatchNormalization()(x_a)
-      x_a = dense_2(x_a)
-      model = keras.models.Model(input_a, x_a)
-      model.compile('rmsprop', 'mse',
-                    metrics=['acc', metrics.categorical_accuracy])
-
-    self._clone_and_build_test_helper(model)
-
-  def test_clone_and_build_subclassed_model(self):
-    class SubclassedModel(keras.Model):
-
-      def __init__(self):
-        super(SubclassedModel, self).__init__()
-        self.layer1 = keras.layers.Dense(4)
-        self.layer2 = keras.layers.Dense(4)
-
-      def call(self, inp):
-        out = self.layer1(inp)
-        out = keras.layers.BatchNormalization()(out)
-        out = keras.layers.Dropout(0.5)(out)
-        out = self.layer2(out)
-        return out
-
-    with self.cached_session():
-      model = SubclassedModel()
-      model.compile('rmsprop', 'mse',
-                    metrics=['acc', metrics.categorical_accuracy])
-    self._clone_and_build_test_helper(model, True)
+    inp = np.random.random((10, 4))
+    out = np.random.random((10, 4))
+    model.train_on_batch(inp, out)
+    self._clone_and_build_test_helper(model, 'sequential')
 
   def assert_optimizer_iterations_increases(self, optimizer):
-    with self.cached_session():
-      input_a = keras.Input(shape=(4,))
-      dense_1 = keras.layers.Dense(4,)
-      dense_2 = keras.layers.Dense(4,)
+    model = _get_model()
+    model.compile(
+        optimizer, 'mse', metrics=['acc', metrics.categorical_accuracy],
+        run_eagerly=testing_utils.should_run_eagerly())
 
-      x_a = dense_1(input_a)
-      x_a = keras.layers.Dropout(0.5)(x_a)
-      x_a = keras.layers.BatchNormalization()(x_a)
-      x_a = dense_2(x_a)
-      model = keras.models.Model(input_a, x_a)
-      model.compile(optimizer, 'mse',
-                    metrics=['acc', metrics.categorical_accuracy])
+    global_step = keras.backend.variable(123, dtype=dtypes.int64)
+    clone_model = models.clone_and_build_model(
+        model, compile_clone=True, optimizer_iterations=global_step,
+        in_place_reset=(testing_utils.get_model_type() == 'subclass'))
 
-      global_step = keras.backend.variable(123, dtype=dtypes.int64)
-      clone_model = models.clone_and_build_model(
-          model, compile_clone=True, optimizer_iterations=global_step)
-
-      inp = np.random.random((10, 4))
-      out = np.random.random((10, 4))
-      clone_model.train_on_batch(inp, out)
+    inp = np.random.random((10, 4))
+    out = np.random.random((10, 4))
+    clone_model.train_on_batch(inp, out)
 
-      self.assertEqual(K.eval(global_step), 124)
+    self.assertEqual(K.eval(global_step), 124)
 
+  @keras_parameterized.run_with_all_model_types
   def test_replace_tf_optimizer_iterations_variable(self):
     self.assert_optimizer_iterations_increases(adam.AdamOptimizer(0.01))
 
+  @keras_parameterized.run_with_all_model_types
   def test_replace_keras_optimizer_iterations_variable(self):
-    self.assert_optimizer_iterations_increases('adam')
+    if testing_utils.should_run_eagerly():
+      # This needs to be updated to run with v2 optimizers.
+      self.skipTest('b/120991591')
 
-  def test_clone_and_build_sequential_model_without_inputs_defined(self):
-    with self.cached_session():
-      model = sequential_model(False, False)
-      model.compile('rmsprop', 'mse',
-                    metrics=['acc', metrics.categorical_accuracy])
-    self._clone_and_build_test_helper(model, False)
-
-    with self.cached_session():
-      inp = np.random.random((10, 4))
-      out = np.random.random((10, 4))
-      model.train_on_batch(inp, out)
-    self._clone_and_build_test_helper(model, False)
+    self.assert_optimizer_iterations_increases('adam')
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/ops.py b/tensorflow/python/keras/ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2d852054448b4887fe9f9f28ad4f99e12ce7680
--- /dev/null
+++ b/tensorflow/python/keras/ops.py
@@ -0,0 +1,102 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Module for exporting TensorFlow ops under tf.keras.*."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import init_ops_v2
+from tensorflow.python.util.tf_export import keras_export
+
+
+# pylint: disable=bad-continuation
+keras_export(v1=["keras.initializers.Initializer"])(
+    init_ops.Initializer)
+keras_export(v1=["keras.initializers.Zeros", "keras.initializers.zeros"])(
+    init_ops.Zeros)
+keras_export(v1=["keras.initializers.Ones", "keras.initializers.ones"])(
+    init_ops.Ones)
+keras_export(v1=["keras.initializers.Constant", "keras.initializers.constant"])(
+    init_ops.Constant)
+keras_export(v1=["keras.initializers.VarianceScaling"])(
+    init_ops.VarianceScaling)
+keras_export(v1=["keras.initializers.Orthogonal",
+                 "keras.initializers.orthogonal"])(
+    init_ops.Orthogonal)
+keras_export(v1=["keras.initializers.Identity",
+                 "keras.initializers.identity"])(
+    init_ops.Identity)
+keras_export(v1=["keras.initializers.glorot_uniform"])(
+    init_ops.GlorotUniform)
+keras_export(v1=["keras.initializers.glorot_normal"])(
+    init_ops.GlorotNormal)
+keras_export(v1=["keras.initializers.lecun_normal"])(
+    init_ops.lecun_normal)
+keras_export(v1=["keras.initializers.lecun_uniform"])(
+    init_ops.lecun_uniform)
+keras_export(v1=["keras.initializers.he_normal"])(
+    init_ops.he_normal)
+keras_export(v1=["keras.initializers.he_uniform"])(
+    init_ops.he_uniform)
+
+keras_export("keras.initializers.Initializer", v1=[])(
+    init_ops_v2.Initializer)
+keras_export(
+    "keras.initializers.Zeros", "keras.initializers.zeros", v1=[])(
+        init_ops_v2.Zeros)
+keras_export(
+    "keras.initializers.Ones", "keras.initializers.ones", v1=[])(
+        init_ops_v2.Ones)
+keras_export(
+    "keras.initializers.Constant", "keras.initializers.constant", v1=[])(
+        init_ops_v2.Constant)
+keras_export("keras.initializers.VarianceScaling", v1=[])(
+    init_ops_v2.VarianceScaling)
+keras_export(
+    "keras.initializers.Orthogonal", "keras.initializers.orthogonal", v1=[])(
+        init_ops_v2.Orthogonal)
+keras_export(
+    "keras.initializers.Identity", "keras.initializers.identity", v1=[])(
+        init_ops_v2.Identity)
+keras_export(
+    "keras.initializers.GlorotUniform",
+    "keras.initializers.glorot_uniform",
+    v1=[])(
+        init_ops_v2.GlorotUniform)
+keras_export(
+    "keras.initializers.GlorotNormal",
+    "keras.initializers.glorot_normal",
+    v1=[])(
+        init_ops_v2.GlorotNormal)
+keras_export("keras.initializers.lecun_normal", v1=[])(
+    init_ops_v2.lecun_normal)
+keras_export("keras.initializers.lecun_uniform", v1=[])(
+    init_ops_v2.lecun_uniform)
+keras_export("keras.initializers.he_normal", v1=[])(
+    init_ops_v2.he_normal)
+keras_export("keras.initializers.he_uniform", v1=[])(
+    init_ops_v2.he_uniform)
+keras_export("keras.initializers.RandomNormal", v1=[])(
+    init_ops_v2.RandomNormal)
+keras_export("keras.initializers.RandomUniform", v1=[])(
+    init_ops_v2.RandomUniform)
+keras_export("keras.initializers.TruncatedNormal", v1=[])(
+    init_ops_v2.TruncatedNormal)
+# pylint: enable=bad-continuation
+
+
+keras_export("keras.backend.name_scope")(ops.name_scope)
diff --git a/tensorflow/python/keras/optimizer_v2/BUILD b/tensorflow/python/keras/optimizer_v2/BUILD
index b8f01249419c595a735442310c735bc10648cba6..88f2521d5e8dbc68f7f80d6cb0921c1ae75c4221 100644
--- a/tensorflow/python/keras/optimizer_v2/BUILD
+++ b/tensorflow/python/keras/optimizer_v2/BUILD
@@ -25,6 +25,7 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":learning_rate_schedule",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:distribute",
         "//tensorflow/python:framework",
@@ -34,6 +35,23 @@ py_library(
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
         "//tensorflow/python/distribute:reduce_util",
+        "//tensorflow/python/distribute:values",
+        "//tensorflow/python/keras:backend_config",
+    ],
+)
+
+py_library(
+    name = "learning_rate_schedule",
+    srcs = [
+        "learning_rate_schedule.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python/keras:generic_utils",
     ],
 )
 
@@ -172,9 +190,9 @@ cuda_py_test(
 
 py_test(
     name = "optimizer_v2_test",
-    size = "large",
+    size = "medium",
     srcs = ["optimizer_v2_test.py"],
-    shard_count = 4,
+    shard_count = 8,
     tags = [
         "no_windows",
     ],
@@ -195,6 +213,19 @@ py_test(
     ],
 )
 
+py_test(
+    name = "learning_rate_schedule_test",
+    size = "small",
+    srcs = ["learning_rate_schedule_test.py"],
+    deps = [
+        ":optimizer_v2",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python/keras",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
 cuda_py_test(
     name = "rmsprop_test",
     size = "medium",
@@ -212,4 +243,5 @@ cuda_py_test(
         "//tensorflow/python:state_ops",
         "//tensorflow/python:variables",
     ],
+    shard_count = 2,
 )
diff --git a/tensorflow/python/keras/optimizer_v2/adadelta.py b/tensorflow/python/keras/optimizer_v2/adadelta.py
index 55b4eba1051287420b8ab1adeea1598eb4647c36..a3d5538ea86a0e0ed86e5ee70df69248ec76ba48 100644
--- a/tensorflow/python/keras/optimizer_v2/adadelta.py
+++ b/tensorflow/python/keras/optimizer_v2/adadelta.py
@@ -20,10 +20,13 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.keras import backend_config
 from tensorflow.python.keras.optimizer_v2 import optimizer_v2
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import keras_export
 
 
+@keras_export('keras.optimizers.Adadelta')
 class Adadelta(optimizer_v2.OptimizerV2):
   r"""Optimizer that implements the Adadelta algorithm.
 
@@ -75,7 +78,11 @@ class Adadelta(optimizer_v2.OptimizerV2):
                to better conditioning the grad update.
       name: Optional name prefix for the operations created when applying
         gradients.  Defaults to "Adadelta".
-      **kwargs: keyword arguments. Allowed to be {`decay`}
+      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
+        `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
+        gradients by value, `decay` is included for backward compatibility to
+        allow time inverse decay of learning rate. `lr` is included for backward
+        compatibility, recommended to use `learning_rate` instead.
 
     @compatibility(eager)
     When eager execution is enabled, `learning_rate`, `rho`, and `epsilon` can
@@ -84,8 +91,10 @@ class Adadelta(optimizer_v2.OptimizerV2):
     invocations of optimizer functions.
     @end_compatibility
     """
+    if epsilon is None:
+      epsilon = backend_config.epsilon()
     super(Adadelta, self).__init__(name, **kwargs)
-    self._set_hyper('learning_rate', learning_rate)
+    self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
     self._set_hyper('decay', self._initial_decay)
     self._set_hyper('rho', rho)
     self._set_hyper('epsilon', epsilon)
diff --git a/tensorflow/python/keras/optimizer_v2/adadelta_test.py b/tensorflow/python/keras/optimizer_v2/adadelta_test.py
index 0fb67d0cd1675fa0d02db7b78f6d90d86b64888f..06ff975212d9e405ff9bc4c6283e2e115ce4c1d2 100644
--- a/tensorflow/python/keras/optimizer_v2/adadelta_test.py
+++ b/tensorflow/python/keras/optimizer_v2/adadelta_test.py
@@ -153,8 +153,11 @@ class AdadeltaOptimizerTest(test.TestCase):
       with self.cached_session():
         var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
         x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
-        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
-        loss = pred * pred
+
+        def loss():
+          pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)  # pylint: disable=cell-var-from-loop
+          return pred * pred
+
         sgd_op = adadelta.Adadelta(1.0, 1.0, 1.0).minimize(
             loss, var_list=[var0])
         variables.global_variables_initializer().run()
@@ -165,6 +168,28 @@ class AdadeltaOptimizerTest(test.TestCase):
         # Validate updated params
         self.assertAllCloseAccordingToType([[-111, -138]], self.evaluate(var0))
 
+  def testConstructAdadeltaWithLR(self):
+    opt = adadelta.Adadelta(lr=1.0, rho=0.9, epsilon=1.)
+    opt_2 = adadelta.Adadelta(learning_rate=0.1, rho=0.9, epsilon=1., lr=1.0)
+    opt_3 = adadelta.Adadelta(learning_rate=0.1, rho=0.9, epsilon=1.)
+    self.assertIsInstance(opt.lr, variables.Variable)
+    self.assertIsInstance(opt_2.lr, variables.Variable)
+    self.assertIsInstance(opt_3.lr, variables.Variable)
+
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllClose(self.evaluate(opt.lr), (1.0))
+    self.assertAllClose(self.evaluate(opt_2.lr), (1.0))
+    self.assertAllClose(self.evaluate(opt_3.lr), (0.1))
+
+  def testConstructAdadeltaWithEpsilonValues(self):
+    opt = adadelta.Adadelta(epsilon=None)
+    config = opt.get_config()
+    self.assertEqual(config["epsilon"], 1e-7)
+
+    opt = adadelta.Adadelta(epsilon=1e-8)
+    config = opt.get_config()
+    self.assertEqual(config["epsilon"], 1e-8)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/keras/optimizer_v2/adagrad.py b/tensorflow/python/keras/optimizer_v2/adagrad.py
index 670cad70e63354650aeb47ed2324e2c1756e12c1..0840aa6fae5be0b698de69827f483ec55b9ea37a 100644
--- a/tensorflow/python/keras/optimizer_v2/adagrad.py
+++ b/tensorflow/python/keras/optimizer_v2/adagrad.py
@@ -21,14 +21,17 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import ops
+from tensorflow.python.keras import backend_config
 from tensorflow.python.keras.optimizer_v2 import optimizer_v2
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
+from tensorflow.python.util.tf_export import keras_export
 
 
+@keras_export('keras.optimizers.Adagrad')
 class Adagrad(optimizer_v2.OptimizerV2):
   r"""Optimizer that implements the Adagrad algorithm.
 
@@ -68,7 +71,11 @@ class Adagrad(optimizer_v2.OptimizerV2):
         Starting value for the accumulators, must be positive.
       name: Optional name prefix for the operations created when applying
         gradients.  Defaults to "Adagrad".
-      **kwargs: keyword arguments. Allowed to be {`decay`}
+      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
+        `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
+        gradients by value, `decay` is included for backward compatibility to
+        allow time inverse decay of learning rate. `lr` is included for backward
+        compatibility, recommended to use `learning_rate` instead.
 
     Raises:
       ValueError: If the `initial_accumulator_value` or `epsilon` is invalid.
@@ -83,10 +90,12 @@ class Adagrad(optimizer_v2.OptimizerV2):
     if initial_accumulator_value < 0.0:
       raise ValueError('initial_accumulator_value must be non-negative: %s' %
                        initial_accumulator_value)
+    if epsilon is None:
+      epsilon = backend_config.epsilon()
     if epsilon < 1e-7:
       raise ValueError('epsilon must be larger than 1e-7: %s' % epsilon)
     super(Adagrad, self).__init__(name, **kwargs)
-    self._set_hyper('learning_rate', learning_rate)
+    self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
     self._set_hyper('decay', self._initial_decay)
     self._initial_accumulator_value = initial_accumulator_value
     self._set_hyper('epsilon', epsilon)
diff --git a/tensorflow/python/keras/optimizer_v2/adagrad_test.py b/tensorflow/python/keras/optimizer_v2/adagrad_test.py
index b2c290178fe8a62d1c7240df1d6c04f7b62456e1..9c8d3ff8a4ef89a34fc1217f2a27d13d3e172d68 100644
--- a/tensorflow/python/keras/optimizer_v2/adagrad_test.py
+++ b/tensorflow/python/keras/optimizer_v2/adagrad_test.py
@@ -28,6 +28,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras.optimizer_v2 import adagrad
+from tensorflow.python.keras.optimizer_v2 import learning_rate_schedule
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -160,6 +161,52 @@ class AdagradOptimizerTest(test.TestCase):
           self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
           self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
+  def testBasicWithLearningRateInverseTimeDecay(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+        var0 = resource_variable_ops.ResourceVariable(var0_np)
+        var1 = resource_variable_ops.ResourceVariable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        learning_rate = 3.0
+        decay = 0.5
+        lr_schedule = learning_rate_schedule.InverseTimeDecay(
+            learning_rate, decay_steps=1.0, decay_rate=decay)
+
+        ada_opt = adagrad.Adagrad(lr_schedule)
+
+        accum0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        accum1_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+
+        if not context.executing_eagerly():
+          ada_update = ada_opt.apply_gradients(
+              zip([grads0, grads1], [var0, var1]))
+          self.evaluate(variables.global_variables_initializer())
+
+        # Fetch params to validate initial values
+        v0_val, v1_val = self.evaluate([var0, var1])
+        self.assertAllClose([1.0, 2.0], v0_val)
+        self.assertAllClose([3.0, 4.0], v1_val)
+
+        # Run 3 steps of adagrad
+        for t in range(3):
+          if not context.executing_eagerly():
+            self.evaluate(ada_update)
+          else:
+            ada_opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+          lr_np = learning_rate / (1 + decay * t)
+          var0_np, accum0_np = adagrad_update_numpy(var0_np, accum0_np,
+                                                    grads0_np, lr_np)
+          var1_np, accum1_np = adagrad_update_numpy(var1_np, accum1_np,
+                                                    grads1_np, lr_np)
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
   @test_util.run_deprecated_v1
   def testMinimizeSparseResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
@@ -167,8 +214,11 @@ class AdagradOptimizerTest(test.TestCase):
         var0 = resource_variable_ops.ResourceVariable(
             [[1.0, 2.0], [3.0, 4.0]], dtype=dtype)
         x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
-        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
-        loss = pred * pred
+
+        def loss():
+          pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)  # pylint: disable=cell-var-from-loop
+          return pred * pred
+
         sgd_op = adagrad.Adagrad(1.0).minimize(loss, var_list=[var0])
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
@@ -297,12 +347,12 @@ class AdagradOptimizerTest(test.TestCase):
       with self.cached_session():
         var_repeated = resource_variable_ops.ResourceVariable(
             [1.0, 2.0], dtype=dtype)
-        loss_repeated = math_ops.reduce_sum(
-            embedding_ops.embedding_lookup(var_repeated, [0, 0]))
+        loss_repeated = lambda: math_ops.reduce_sum(  # pylint: disable=g-long-lambda
+            embedding_ops.embedding_lookup(var_repeated, [0, 0]))  # pylint: disable=cell-var-from-loop
         var_aggregated = resource_variable_ops.ResourceVariable(
             [1.0, 2.0], dtype=dtype)
-        loss_aggregated = 2 * math_ops.reduce_sum(
-            embedding_ops.embedding_lookup(var_aggregated, [0]))
+        loss_aggregated = lambda: 2 * math_ops.reduce_sum(  # pylint: disable=g-long-lambda
+            embedding_ops.embedding_lookup(var_aggregated, [0]))  # pylint: disable=cell-var-from-loop
         update_op_repeated = adagrad.Adagrad(2.0).minimize(
             loss_repeated, var_list=[var_repeated])
         update_op_aggregated = adagrad.Adagrad(2.0).minimize(
@@ -395,6 +445,32 @@ class AdagradOptimizerTest(test.TestCase):
         self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
         self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
+  def testConstructAdagradWithLR(self):
+    opt = adagrad.Adagrad(lr=1.0)
+    opt_2 = adagrad.Adagrad(learning_rate=0.1, lr=1.0)
+    opt_3 = adagrad.Adagrad(learning_rate=0.1)
+    self.assertIsInstance(opt.lr, variables.Variable)
+    self.assertIsInstance(opt_2.lr, variables.Variable)
+    self.assertIsInstance(opt_3.lr, variables.Variable)
+
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllClose(self.evaluate(opt.lr), (1.0))
+    self.assertAllClose(self.evaluate(opt_2.lr), (1.0))
+    self.assertAllClose(self.evaluate(opt_3.lr), (0.1))
+
+  def testConstructAdagradWithEpsilonValues(self):
+    opt = adagrad.Adagrad(epsilon=None)
+    config = opt.get_config()
+    self.assertEqual(config["epsilon"], 1e-7)
+
+    opt = adagrad.Adagrad(epsilon=1e-6)
+    config = opt.get_config()
+    self.assertEqual(config["epsilon"], 1e-6)
+
+    with self.assertRaisesRegexp(ValueError,
+                                 "epsilon must be larger than 1e-7"):
+      opt = adagrad.Adagrad(epsilon=1e-8)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/keras/optimizer_v2/adam.py b/tensorflow/python/keras/optimizer_v2/adam.py
index ef3d783f8910e791cf8591e0604935102c2b52cf..965ae8669bbca53ec5bb7e6666a7a7ba7fba1575 100644
--- a/tensorflow/python/keras/optimizer_v2/adam.py
+++ b/tensorflow/python/keras/optimizer_v2/adam.py
@@ -18,14 +18,16 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import ops
+from tensorflow.python.keras import backend_config
 from tensorflow.python.keras.optimizer_v2 import optimizer_v2
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import keras_export
 
 
+@keras_export('keras.optimizers.Adam')
 class Adam(optimizer_v2.OptimizerV2):
   """Optimizer that implements the Adam algorithm.
 
@@ -62,7 +64,7 @@ class Adam(optimizer_v2.OptimizerV2):
       $$t := 0 \text{(Initialize timestep)}$$
 
       The update rule for `variable` with gradient `g` uses an optimization
-      described at the end of section2 of the paper:
+      described at the end of section 2 of the paper:
 
       $$t := t + 1$$
       $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
@@ -80,7 +82,7 @@ class Adam(optimizer_v2.OptimizerV2):
       $$t := 0 \text{(Initialize timestep)}$$
 
       The update rule for `variable` with gradient `g` uses an optimization
-      described at the end of section2 of the paper:
+      described at the end of section 2 of the paper:
 
       $$t := t + 1$$
       $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
@@ -123,16 +125,22 @@ class Adam(optimizer_v2.OptimizerV2):
         a callable that takes no arguments and returns the actual value to use.
         This can be useful for changing these values across different
         invocations of optimizer functions. @end_compatibility
-      **kwargs: keyword arguments. Allowed to be {`decay`}
+      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
+        `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
+        gradients by value, `decay` is included for backward compatibility to
+        allow time inverse decay of learning rate. `lr` is included for backward
+        compatibility, recommended to use `learning_rate` instead.
     """
 
+    if epsilon is None:
+      epsilon = backend_config.epsilon()
     super(Adam, self).__init__(name, **kwargs)
-    self._set_hyper('learning_rate', learning_rate)
+    self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
     self._set_hyper('decay', self._initial_decay)
     self._set_hyper('beta_1', beta_1)
     self._set_hyper('beta_2', beta_2)
     self._set_hyper('epsilon', epsilon)
-    self._amsgrad = amsgrad
+    self.amsgrad = amsgrad
 
   def _create_slots(self, var_list):
     # Create slots for the first and second moments.
@@ -141,7 +149,7 @@ class Adam(optimizer_v2.OptimizerV2):
       self.add_slot(var, 'm')
     for var in var_list:
       self.add_slot(var, 'v')
-    if self._amsgrad:
+    if self.amsgrad:
       for var in var_list:
         self.add_slot(var, 'vhat')
 
@@ -166,7 +174,7 @@ class Adam(optimizer_v2.OptimizerV2):
     local_step = math_ops.cast(self.iterations + 1, var_dtype)
     beta_1_power = math_ops.pow(beta_1_t, local_step)
     beta_2_power = math_ops.pow(beta_2_t, local_step)
-    if not self._amsgrad:
+    if not self.amsgrad:
       return training_ops.resource_apply_adam(
           var.handle,
           m.handle,
@@ -220,7 +228,7 @@ class Adam(optimizer_v2.OptimizerV2):
     with ops.control_dependencies([v_t]):
       v_t = self._resource_scatter_add(v, indices, v_scaled_g_values)
 
-    if not self._amsgrad:
+    if not self.amsgrad:
       v_sqrt = math_ops.sqrt(v_t)
       var_update = state_ops.assign_sub(
           var, lr * m_t / (v_sqrt + epsilon_t), use_locking=self._use_locking)
@@ -238,11 +246,6 @@ class Adam(optimizer_v2.OptimizerV2):
           use_locking=self._use_locking)
       return control_flow_ops.group(*[var_update, m_t, v_t, v_hat_t])
 
-  def _resource_scatter_add(self, x, i, v):
-    with ops.control_dependencies(
-        [resource_variable_ops.resource_scatter_add(x.handle, i, v)]):
-      return x.value()
-
   def get_config(self):
     config = super(Adam, self).get_config()
     config.update({
@@ -251,6 +254,6 @@ class Adam(optimizer_v2.OptimizerV2):
         'beta_1': self._serialize_hyperparameter('beta_1'),
         'beta_2': self._serialize_hyperparameter('beta_2'),
         'epsilon': self._serialize_hyperparameter('epsilon'),
-        'amsgrad': self._amsgrad,
+        'amsgrad': self.amsgrad,
     })
     return config
diff --git a/tensorflow/python/keras/optimizer_v2/adam_test.py b/tensorflow/python/keras/optimizer_v2/adam_test.py
index 3bbafe12f8e27df9bcc158ae6b50cba2fb086914..761b6a0854d761c22e1ea236bb29184992f892a9 100644
--- a/tensorflow/python/keras/optimizer_v2/adam_test.py
+++ b/tensorflow/python/keras/optimizer_v2/adam_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras import optimizers
 from tensorflow.python.keras.optimizer_v2 import adam
+from tensorflow.python.keras.optimizer_v2 import learning_rate_schedule
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -162,9 +163,9 @@ class AdamOptimizerTest(test.TestCase):
         # it (i.e. they have GPU kernels).
         var = variables.Variable([[1.0], [2.0]])
         indices = constant_op.constant([0, 1], dtype=index_dtype)
-        gathered_sum = math_ops.reduce_sum(array_ops.gather(var, indices))
+        g_sum = lambda: math_ops.reduce_sum(array_ops.gather(var, indices))  # pylint: disable=cell-var-from-loop
         optimizer = adam.Adam(3.0)
-        minimize_op = optimizer.minimize(gathered_sum, var_list=[var])
+        minimize_op = optimizer.minimize(g_sum, var_list=[var])
         variables.global_variables_initializer().run()
         minimize_op.run()
 
@@ -399,6 +400,55 @@ class AdamOptimizerTest(test.TestCase):
           self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
           self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
+  @test_util.run_deprecated_v1
+  def testBasicWithLearningRateInverseTimeDecay(self):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      with self.session(graph=ops.Graph()):
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = resource_variable_ops.ResourceVariable(
+            var0_np, name="var0_%d" % i)
+        var1 = resource_variable_ops.ResourceVariable(
+            var1_np, name="var1_%d" % i)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        learning_rate = 0.001
+        decay = 0.5
+        lr_schedule = learning_rate_schedule.InverseTimeDecay(
+            learning_rate, decay_steps=1.0, decay_rate=decay)
+        beta_1 = 0.9
+        beta_2 = 0.999
+        epsilon = 1e-7
+
+        opt = adam.Adam(
+            learning_rate=lr_schedule,
+            beta_1=beta_1,
+            beta_2=beta_2,
+            epsilon=epsilon)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+
+        self.evaluate(variables.global_variables_initializer())
+        # Run 3 steps of Adam
+        for t in range(3):
+          self.evaluate(update)
+
+          lr_np = learning_rate / (1 + decay * t)
+
+          var0_np, m0, v0 = adam_update_numpy(
+              var0_np, grads0_np, t, m0, v0, lr=lr_np)
+          var1_np, m1, v1 = adam_update_numpy(
+              var1_np, grads1_np, t, m1, v1, lr=lr_np)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
   @test_util.run_deprecated_v1
   def testTensorLearningRate(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
@@ -503,6 +553,28 @@ class AdamOptimizerTest(test.TestCase):
     self.assertEqual(
         self.evaluate(keras_v1_iteration), self.evaluate(keras_v2_iteration))
 
+  def testConstructAdamWithLR(self):
+    opt = adam.Adam(lr=1.0)
+    opt_2 = adam.Adam(learning_rate=0.1, lr=1.0)
+    opt_3 = adam.Adam(learning_rate=0.1)
+    self.assertIsInstance(opt.lr, variables.Variable)
+    self.assertIsInstance(opt_2.lr, variables.Variable)
+    self.assertIsInstance(opt_3.lr, variables.Variable)
+
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllClose(self.evaluate(opt.lr), (1.0))
+    self.assertAllClose(self.evaluate(opt_2.lr), (1.0))
+    self.assertAllClose(self.evaluate(opt_3.lr), (0.1))
+
+  def testConstructAdamWithEpsilonValues(self):
+    opt = adam.Adam(epsilon=None)
+    config = opt.get_config()
+    self.assertEqual(config["epsilon"], 1e-7)
+
+    opt = adam.Adam(epsilon=1e-8)
+    config = opt.get_config()
+    self.assertEqual(config["epsilon"], 1e-8)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/keras/optimizer_v2/adamax.py b/tensorflow/python/keras/optimizer_v2/adamax.py
index ddd78584f852f24f9da6277888d1883bb44db327..3102e28cffcc846e12f72c8f2dd03662a99e2ed3 100644
--- a/tensorflow/python/keras/optimizer_v2/adamax.py
+++ b/tensorflow/python/keras/optimizer_v2/adamax.py
@@ -19,15 +19,17 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import ops
-from tensorflow.python.keras.optimizer_v2 import adam
+from tensorflow.python.keras import backend_config
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import keras_export
 
 
-class Adamax(adam.Adam):
+@keras_export('keras.optimizers.Adamax')
+class Adamax(optimizer_v2.OptimizerV2):
   """Optimizer that implements the Adamax algorithm.
 
   It is a variant of Adam based on the infinity norm.
@@ -88,18 +90,27 @@ class Adamax(adam.Adam):
       epsilon: A small constant for numerical stability.
       name: Optional name for the operations created when applying gradients.
         Defaults to "Adamax".
-      **kwargs: keyword arguments. Allowed to be {`decay`}
+      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
+        `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
+        gradients by value, `decay` is included for backward compatibility to
+        allow time inverse decay of learning rate. `lr` is included for backward
+        compatibility, recommended to use `learning_rate` instead.
     """
-    # pylint: disable=useless-super-delegation
-    super(Adamax, self).__init__(
-        learning_rate=learning_rate,
-        beta_1=beta_1,
-        beta_2=beta_2,
-        epsilon=epsilon,
-        amsgrad=False,
-        name=name,
-        **kwargs)
-    # pylint: enable=useless-super-delegation
+    if epsilon is None:
+      epsilon = backend_config.epsilon()
+    super(Adamax, self).__init__(name, **kwargs)
+    self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
+    self._set_hyper('decay', self._initial_decay)
+    self._set_hyper('beta_1', beta_1)
+    self._set_hyper('beta_2', beta_2)
+    self._set_hyper('epsilon', epsilon)
+
+  def _create_slots(self, var_list):
+    # Separate for-loops to respect the ordering of slot variables from v1.
+    for var in var_list:
+      self.add_slot(var, 'm')  # Create slots for the first moments.
+    for var in var_list:
+      self.add_slot(var, 'v')  # Create slots for the second moments.
 
   def _resource_apply_dense(self, grad, var):
     var_dtype = var.dtype.base_dtype
@@ -152,8 +163,13 @@ class Adamax(adam.Adam):
       var_update = self._resource_scatter_add(var, indices, var_slice)
     return control_flow_ops.group(*[var_update, m_t, v_t])
 
-  def _resource_scatter_update(self, x, i, v):
-    with ops.control_dependencies(
-        [resource_variable_ops.resource_scatter_update(
-            x.handle, i, v)]):
-      return x.value()
+  def get_config(self):
+    config = super(Adamax, self).get_config()
+    config.update({
+        'learning_rate': self._serialize_hyperparameter('learning_rate'),
+        'decay': self._serialize_hyperparameter('decay'),
+        'beta_1': self._serialize_hyperparameter('beta_1'),
+        'beta_2': self._serialize_hyperparameter('beta_2'),
+        'epsilon': self._serialize_hyperparameter('epsilon'),
+    })
+    return config
diff --git a/tensorflow/python/keras/optimizer_v2/adamax_test.py b/tensorflow/python/keras/optimizer_v2/adamax_test.py
index baf131fbb0ce5bd4ab6c7d9b8c49e0519290dcef..6934f1590eb32dc2626efa65fcdfb56d4dace4bb 100644
--- a/tensorflow/python/keras/optimizer_v2/adamax_test.py
+++ b/tensorflow/python/keras/optimizer_v2/adamax_test.py
@@ -136,9 +136,9 @@ class AdamaxOptimizerTest(test.TestCase):
         # it (i.e. they have GPU kernels).
         var = variables.Variable([[1.0], [2.0]])
         indices = constant_op.constant([0, 1], dtype=index_dtype)
-        gathered_sum = math_ops.reduce_sum(array_ops.gather(var, indices))
+        g_sum = lambda: math_ops.reduce_sum(array_ops.gather(var, indices))  # pylint: disable=cell-var-from-loop
         optimizer = adamax.Adamax(3.0)
-        minimize_op = optimizer.minimize(gathered_sum, var_list=[var])
+        minimize_op = optimizer.minimize(g_sum, var_list=[var])
         variables.global_variables_initializer().run()
         minimize_op.run()
 
@@ -362,6 +362,28 @@ class AdamaxOptimizerTest(test.TestCase):
       # There should be iteration, and two unique slot variables for v1 and v2.
       self.assertEqual(5, len(set(opt.variables())))
 
+  def testConstructAdamaxWithLR(self):
+    opt = adamax.Adamax(lr=1.0)
+    opt_2 = adamax.Adamax(learning_rate=0.1, lr=1.0)
+    opt_3 = adamax.Adamax(learning_rate=0.1)
+    self.assertIsInstance(opt.lr, variables.Variable)
+    self.assertIsInstance(opt_2.lr, variables.Variable)
+    self.assertIsInstance(opt_3.lr, variables.Variable)
+
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllClose(self.evaluate(opt.lr), (1.0))
+    self.assertAllClose(self.evaluate(opt_2.lr), (1.0))
+    self.assertAllClose(self.evaluate(opt_3.lr), (0.1))
+
+  def testConstructAdamaxWithEpsilonValues(self):
+    opt = adamax.Adamax(epsilon=None)
+    config = opt.get_config()
+    self.assertEqual(config["epsilon"], 1e-7)
+
+    opt = adamax.Adamax(epsilon=1e-8)
+    config = opt.get_config()
+    self.assertEqual(config["epsilon"], 1e-8)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/keras/optimizer_v2/ftrl.py b/tensorflow/python/keras/optimizer_v2/ftrl.py
index e278e352f551a12718f6b400b16f9d7e05d0c02e..a86fd8d89dbc824cc35a4a6585c85e1794a6aa5c 100644
--- a/tensorflow/python/keras/optimizer_v2/ftrl.py
+++ b/tensorflow/python/keras/optimizer_v2/ftrl.py
@@ -21,8 +21,10 @@ from tensorflow.python.keras.optimizer_v2 import optimizer_v2
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import keras_export
 
 
+@keras_export('keras.optimizers.Ftrl')
 class Ftrl(optimizer_v2.OptimizerV2):
   """Optimizer that implements the FTRL algorithm.
 
@@ -70,7 +72,11 @@ class Ftrl(optimizer_v2.OptimizerV2):
                   2*L2_shrinkage*lr_t / (1 + 2*L2*lr_t) * w_t
         where lr_t is the learning rate at t.
         When input is sparse shrinkage will only happen on the active weights.\
-      **kwargs: keyword arguments. Allowed to be {`decay`}
+      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
+        `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
+        gradients by value, `decay` is included for backward compatibility to
+        allow time inverse decay of learning rate. `lr` is included for backward
+        compatibility, recommended to use `learning_rate` instead.
 
     Raises:
       ValueError: If one of the arguments is invalid.
diff --git a/tensorflow/python/keras/optimizer_v2/ftrl_test.py b/tensorflow/python/keras/optimizer_v2/ftrl_test.py
index bec400e8cbba2654decaf520a24800095e4d16f5..f0f07e9d03f6db31f5e83efbbe6428688d944093 100644
--- a/tensorflow/python/keras/optimizer_v2/ftrl_test.py
+++ b/tensorflow/python/keras/optimizer_v2/ftrl_test.py
@@ -113,8 +113,11 @@ class FtrlOptimizerTest(test.TestCase):
       with self.cached_session():
         var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
         x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
-        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
-        loss = pred * pred
+
+        def loss():
+          pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)  # pylint: disable=cell-var-from-loop
+          return pred * pred
+
         sgd_op = ftrl.Ftrl(1.0).minimize(loss, var_list=[var0])
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
diff --git a/tensorflow/python/keras/optimizer_v2/gradient_descent.py b/tensorflow/python/keras/optimizer_v2/gradient_descent.py
index 2b82b5e78dedce5ff68b860d143b1ecadd18e0bd..c444f969f6492dd3de646f8bab80ba65d3da625d 100644
--- a/tensorflow/python/keras/optimizer_v2/gradient_descent.py
+++ b/tensorflow/python/keras/optimizer_v2/gradient_descent.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -21,8 +21,10 @@ from tensorflow.python.framework import ops
 from tensorflow.python.keras.optimizer_v2 import optimizer_v2
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import keras_export
 
 
+@keras_export("keras.optimizers.SGD")
 class SGD(optimizer_v2.OptimizerV2):
   """Stochastic gradient descent and momentum optimizer.
 
@@ -32,7 +34,7 @@ class SGD(optimizer_v2.OptimizerV2):
   gradient is evaluated at theta(t).
   ```
 
-  or Computes (if `use_nesterov = False`):
+  or Computes (if `nesterov = False`):
   ```
   v(t+1) = momentum * v(t) - learning_rate * gradient
   theta(t+1) = theta(t) + v(t+1)
@@ -72,10 +74,14 @@ class SGD(optimizer_v2.OptimizerV2):
       nesterov: boolean. Whether to apply Nesterov momentum.
       name: Optional name prefix for the operations created when applying
         gradients.  Defaults to 'SGD'.
-      **kwargs: keyword arguments. Allowed to be {`decay`}
+      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
+        `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
+        gradients by value, `decay` is included for backward compatibility to
+        allow time inverse decay of learning rate. `lr` is included for backward
+        compatibility, recommended to use `learning_rate` instead.
     """
     super(SGD, self).__init__(name, **kwargs)
-    self._set_hyper("learning_rate", learning_rate)
+    self._set_hyper("learning_rate", kwargs.get("lr", learning_rate))
     self._set_hyper("decay", self._initial_decay)
 
     self._momentum = False
@@ -85,7 +91,7 @@ class SGD(optimizer_v2.OptimizerV2):
       raise ValueError("`momentum` must be between [0, 1].")
     self._set_hyper("momentum", momentum)
 
-    self._nesterov = nesterov
+    self.nesterov = nesterov
 
   def _create_slots(self, var_list):
     if self._momentum:
@@ -104,7 +110,7 @@ class SGD(optimizer_v2.OptimizerV2):
           grad,
           self._get_hyper("momentum", var_dtype),
           use_locking=self._use_locking,
-          use_nesterov=self._nesterov)
+          use_nesterov=self.nesterov)
     else:
       return training_ops.resource_apply_gradient_descent(
           var.handle, lr_t, grad, use_locking=self._use_locking)
@@ -132,7 +138,7 @@ class SGD(optimizer_v2.OptimizerV2):
         indices,
         self._get_hyper("momentum", var_dtype),
         use_locking=self._use_locking,
-        use_nesterov=self._nesterov)
+        use_nesterov=self.nesterov)
 
   def get_config(self):
     config = super(SGD, self).get_config()
@@ -140,6 +146,6 @@ class SGD(optimizer_v2.OptimizerV2):
         "learning_rate": self._serialize_hyperparameter("learning_rate"),
         "decay": self._serialize_hyperparameter("decay"),
         "momentum": self._serialize_hyperparameter("momentum"),
-        "nesterov": self._nesterov,
+        "nesterov": self.nesterov,
     })
     return config
diff --git a/tensorflow/python/keras/optimizer_v2/gradient_descent_test.py b/tensorflow/python/keras/optimizer_v2/gradient_descent_test.py
index 0c64202da81c36e4140be7ca7719e9d426c549cc..6bd56372b9a08645d96dd7b0d3e991bfe22fa1d4 100644
--- a/tensorflow/python/keras/optimizer_v2/gradient_descent_test.py
+++ b/tensorflow/python/keras/optimizer_v2/gradient_descent_test.py
@@ -28,6 +28,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras.optimizer_v2 import gradient_descent
+from tensorflow.python.keras.optimizer_v2 import learning_rate_schedule
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import math_ops
@@ -57,42 +58,61 @@ class GradientDescentOptimizerTest(test.TestCase):
         self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
                                            self.evaluate(var1))
 
+  def _test_basic_sgd_with_learning_rate_decay(self, sgd, dtype):
+    var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+    var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+    grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+    grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+    if not context.executing_eagerly():
+      sgd_op = sgd.apply_gradients(zip([grads0, grads1], [var0, var1]))
+    self.evaluate(variables.global_variables_initializer())
+    # Run 2 steps of sgd
+    if not context.executing_eagerly():
+      self.evaluate(sgd_op)
+    else:
+      sgd.apply_gradients(zip([grads0, grads1], [var0, var1]))
+    # Validate updated params
+    self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
+                                       self.evaluate(var0))
+    self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
+                                       self.evaluate(var1))
+
+    if not context.executing_eagerly():
+      self.evaluate(sgd_op)
+    else:
+      sgd.apply_gradients(zip([grads0, grads1], [var0, var1]))
+    # Validate updated params
+    self.assertAllCloseAccordingToType(
+        [1.0 - 3.0 * 0.1 - 2.0 * 0.1, 2.0 - 3.0 * 0.1 - 2.0 * 0.1],
+        self.evaluate(var0))
+    self.assertAllCloseAccordingToType(
+        [3.0 - 3.0 * 0.01 - 2.0 * 0.01, 4.0 - 3.0 * 0.01 - 2.0 * 0.01],
+        self.evaluate(var1))
+
   @test_util.run_in_graph_and_eager_modes
   def testBasicWithLearningRateDecay(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.cached_session():
-        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
-        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
-        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
-        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
-        learning_rate = 3.0
-        decay = 0.5
-        sgd = gradient_descent.SGD(learning_rate=learning_rate, decay=decay)
-        if not context.executing_eagerly():
-          sgd_op = sgd.apply_gradients(zip([grads0, grads1], [var0, var1]))
-        self.evaluate(variables.global_variables_initializer())
-        # Run 2 steps of sgd
-        if not context.executing_eagerly():
-          self.evaluate(sgd_op)
-        else:
-          sgd.apply_gradients(zip([grads0, grads1], [var0, var1]))
-        # Validate updated params
-        self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
-                                           self.evaluate(var0))
-        self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
-                                           self.evaluate(var1))
+      learning_rate = 3.0
+      decay = 0.5
+      sgd = gradient_descent.SGD(learning_rate=learning_rate, decay=decay)
+      self._test_basic_sgd_with_learning_rate_decay(sgd, dtype)
 
-        if not context.executing_eagerly():
-          self.evaluate(sgd_op)
-        else:
-          sgd.apply_gradients(zip([grads0, grads1], [var0, var1]))
-        # Validate updated params
-        self.assertAllCloseAccordingToType(
-            [1.0 - 3.0 * 0.1 - 2.0 * 0.1, 2.0 - 3.0 * 0.1 - 2.0 * 0.1],
-            self.evaluate(var0))
-        self.assertAllCloseAccordingToType(
-            [3.0 - 3.0 * 0.01 - 2.0 * 0.01, 4.0 - 3.0 * 0.01 - 2.0 * 0.01],
-            self.evaluate(var1))
+  @test_util.run_in_graph_and_eager_modes
+  def testBasicWithLearningRateInverseTimeDecay(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      learning_rate = learning_rate_schedule.InverseTimeDecay(
+          3.0, decay_steps=1.0, decay_rate=0.5)
+      sgd = gradient_descent.SGD(learning_rate=learning_rate)
+      self._test_basic_sgd_with_learning_rate_decay(sgd, dtype)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testBasicWithLearningRateInverseTimeDecaySerializeAndDeserialize(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      learning_rate = learning_rate_schedule.InverseTimeDecay(
+          3.0, decay_steps=1.0, decay_rate=0.5)
+      sgd = gradient_descent.SGD(learning_rate=learning_rate)
+      sgd = gradient_descent.SGD.from_config(sgd.get_config())
+      self._test_basic_sgd_with_learning_rate_decay(sgd, dtype)
 
   @test_util.run_in_graph_and_eager_modes
   def testBasicCallableParams(self):
@@ -122,8 +142,6 @@ class GradientDescentOptimizerTest(test.TestCase):
         var1 = resource_variable_ops.ResourceVariable([3.0], dtype=dtype)
         x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
         loss = lambda: math_ops.matmul(var0, x) + var1  # pylint: disable=cell-var-from-loop
-        if not context.executing_eagerly():
-          loss = loss()
         sgd = gradient_descent.SGD(1.0)
         sgd_op = sgd.minimize(loss, [var0, var1])
         self.evaluate(variables.global_variables_initializer())
@@ -141,9 +159,12 @@ class GradientDescentOptimizerTest(test.TestCase):
         var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
         var1 = resource_variable_ops.ResourceVariable([3.0], dtype=dtype)
         x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
-        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
-        pred += var1
-        loss = pred * pred
+
+        def loss():
+          pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)  # pylint: disable=cell-var-from-loop
+          pred += var1  # pylint: disable=cell-var-from-loop
+          return pred * pred
+
         sgd_op = gradient_descent.SGD(1.0).minimize(loss, [var0, var1])
         self.evaluate(variables.global_variables_initializer())
         # Run 1 step of sgd
@@ -181,7 +202,8 @@ class GradientDescentOptimizerTest(test.TestCase):
         opt = gradient_descent.SGD(3.0)
         values = [1.0, 3.0]
         vars_ = [variables.Variable([v], dtype=dtype) for v in values]
-        grads_and_vars = opt.compute_gradients(vars_[0] + vars_[1], vars_)
+        loss = lambda: vars_[0] + vars_[1]  # pylint: disable=cell-var-from-loop
+        grads_and_vars = opt._compute_gradients(loss, vars_)
         self.evaluate(variables.global_variables_initializer())
         for grad, _ in grads_and_vars:
           self.assertAllCloseAccordingToType([1.0], self.evaluate(grad))
@@ -259,6 +281,19 @@ class GradientDescentOptimizerTest(test.TestCase):
       # be an EagerTensor once again, not a graph Tensor.
       self.assertEqual(float(step()), -1.0)
 
+  def testConstructSGDWithLR(self):
+    opt = gradient_descent.SGD(lr=1.0)
+    opt_2 = gradient_descent.SGD(learning_rate=0.1, lr=1.0)
+    opt_3 = gradient_descent.SGD(learning_rate=0.1)
+    self.assertIsInstance(opt.lr, variables.Variable)
+    self.assertIsInstance(opt_2.lr, variables.Variable)
+    self.assertIsInstance(opt_3.lr, variables.Variable)
+
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllClose(self.evaluate(opt.lr), (1.0))
+    self.assertAllClose(self.evaluate(opt_2.lr), (1.0))
+    self.assertAllClose(self.evaluate(opt_3.lr), (0.1))
+
 
 class MomentumOptimizerTest(test.TestCase):
 
@@ -346,7 +381,7 @@ class MomentumOptimizerTest(test.TestCase):
         var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
         accum0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
         accum1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
-        loss = 5 * var0 * var0 + 3 * var1
+        loss = lambda: 5 * var0 * var0 + 3 * var1  # pylint: disable=cell-var-from-loop
         mom_op = gradient_descent.SGD(
             learning_rate=2.0, momentum=0.9, nesterov=True)
         opt_op = mom_op.minimize(loss, [var0, var1])
@@ -657,11 +692,16 @@ class MomentumOptimizerTest(test.TestCase):
       opt = gradient_descent.SGD(learning_rate=1.0, momentum=0.9, nesterov=True)
       config = opt.get_config()
       opt2 = gradient_descent.SGD.from_config(config)
-      # assert both are equal float values.
-      self.assertEqual(
-          opt._get_hyper("learning_rate"), opt2._get_hyper("learning_rate"))
-      self.assertEqual(opt._get_hyper("momentum"), opt2._get_hyper("momentum"))
-      # self.assertEqual(opt._get_hyper("decay"), opt2._get_hyper("decay"))
+      lr = opt.lr
+      lr2 = opt2.lr
+      self.evaluate(variables.global_variables_initializer())
+      self.assertAllClose(self.evaluate(lr), self.evaluate(lr2))
+      self.assertAllClose(
+          self.evaluate(opt._get_hyper("momentum")),
+          self.evaluate(opt2._get_hyper("momentum")))
+      self.assertAllClose(
+          self.evaluate(opt._get_hyper("decay")),
+          self.evaluate(opt2._get_hyper("decay")))
       var0 = variables.Variable([[1.0], [2.0]], dtype=dtypes.float32)
       loss = lambda: 3 * var0
       # learning rate variable created when calling minimize.
@@ -669,20 +709,34 @@ class MomentumOptimizerTest(test.TestCase):
       self.evaluate(variables.global_variables_initializer())
       config = opt.get_config()
       opt3 = gradient_descent.SGD.from_config(config)
-      self.assertEqual(
-          self.evaluate(opt._get_hyper("learning_rate")),
-          opt3._get_hyper("learning_rate"))
-      self.assertEqual(
+      lr3 = opt3.lr
+      self.evaluate(variables.global_variables_initializer())
+      self.assertAllClose(self.evaluate(lr), self.evaluate(lr3))
+      self.assertAllClose(
           self.evaluate(opt._get_hyper("momentum")),
-          opt3._get_hyper("momentum"))
-      # self.assertEqual(
-      #     self.evaluate(opt._get_hyper("decay")), opt3._get_hyper("decay"))
-      self.assertTrue(opt3._nesterov)
+          self.evaluate(opt3._get_hyper("momentum")))
+      self.assertAllClose(
+          self.evaluate(opt._get_hyper("decay")),
+          self.evaluate(opt3._get_hyper("decay")))
+      self.assertTrue(opt3.nesterov)
 
   def testNesterovWithoutMomentum(self):
     with self.assertRaisesRegexp(ValueError, "must be between"):
       gradient_descent.SGD(learning_rate=1.0, momentum=2.0)
 
+  def testConstructMomentumWithLR(self):
+    opt = gradient_descent.SGD(lr=1.0, momentum=0.9)
+    opt_2 = gradient_descent.SGD(learning_rate=0.1, momentum=0.9, lr=1.0)
+    opt_3 = gradient_descent.SGD(learning_rate=0.1, momentum=0.9)
+    self.assertIsInstance(opt.lr, variables.Variable)
+    self.assertIsInstance(opt_2.lr, variables.Variable)
+    self.assertIsInstance(opt_3.lr, variables.Variable)
+
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllClose(self.evaluate(opt.lr), (1.0))
+    self.assertAllClose(self.evaluate(opt_2.lr), (1.0))
+    self.assertAllClose(self.evaluate(opt_3.lr), (0.1))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/keras/optimizer_v2/learning_rate_schedule.py b/tensorflow/python/keras/optimizer_v2/learning_rate_schedule.py
new file mode 100644
index 0000000000000000000000000000000000000000..c44263bdcf2237ae998f7d796bd4086361a4146c
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/learning_rate_schedule.py
@@ -0,0 +1,1028 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Various learning rate decay functions."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+import math
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.keras.utils import generic_utils
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.util.tf_export import keras_export
+
+
+@keras_export("keras.optimizers.schedules.LearningRateSchedule")
+class LearningRateSchedule(object):
+  """A serializable learning rate decay schedule.
+
+  `LearningRateSchedule`s can be passed in as the learning rate of optimizers in
+  `tf.keras.optimizers`. They can be serialized and deserialized using
+  `tf.keras.optimizers.schedules.serialize` and
+  `tf.keras.optimizers.schedules.deserialize`.
+  """
+
+  @abc.abstractmethod
+  def __call__(self, step):
+    raise NotImplementedError("Learning rate schedule must override __call__")
+
+  @abc.abstractmethod
+  def get_config(self):
+    raise NotImplementedError("Learning rate schedule must override get_config")
+
+  @classmethod
+  def from_config(cls, config):
+    """Instantiates a `LearningRateSchedule` from its config.
+
+    Args:
+        config: Output of `get_config()`.
+
+    Returns:
+        A `LearningRateSchedule` instance.
+    """
+    return cls(**config)
+
+
+@keras_export("keras.optimizers.schedules.ExponentialDecay")
+class ExponentialDecay(LearningRateSchedule):
+  """A LearningRateSchedule that uses an exponential decay schedule."""
+
+  def __init__(
+      self,
+      initial_learning_rate,
+      decay_steps,
+      decay_rate,
+      staircase=False,
+      name=None):
+    """Applies exponential decay to the learning rate.
+
+    When training a model, it is often recommended to lower the learning rate as
+    the training progresses. This schedule applies an exponential decay function
+    to an optimizer step, given a provided initial learning rate.
+
+    The schedule a 1-arg callable that produces a decayed learning
+    rate when passed the current optimizer step. This can be useful for changing
+    the learning rate value across different invocations of optimizer functions.
+    It is computed as:
+
+    ```python
+    def decayed_learning_rate(step):
+      return initial_learning_rate * decay_rate ^ (step / decay_steps)
+    ```
+
+    If the argument `staircase` is `True`, then `step / decay_steps` is
+    an integer division and the decayed learning rate follows a
+    staircase function.
+
+    You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
+    as the learning rate.
+    Example: When fitting a Keras model, decay every 100000 steps with a base
+    of 0.96:
+
+    ```python
+    initial_learning_rate = 0.1
+    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
+        initial_learning_rate,
+        decay_steps=100000,
+        decay_rate=0.96,
+        staircase=True)
+
+    model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=lr_schedule),
+                  loss='sparse_categorical_crossentropy',
+                  metrics=['accuracy'])
+
+    model.fit(data, labels, epochs=5)
+    ```
+
+    The learning rate schedule is also serializable and deserializable using
+    `tf.keras.optimizers.schedules.serialize` and
+    `tf.keras.optimizers.schedules.deserialize`.
+
+    Args:
+      initial_learning_rate: A scalar `float32` or `float64` `Tensor` or a
+        Python number.  The initial learning rate.
+      decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
+        Must be positive.  See the decay computation above.
+      decay_rate: A scalar `float32` or `float64` `Tensor` or a
+        Python number.  The decay rate.
+      staircase: Boolean.  If `True` decay the learning rate at discrete
+        intervals
+      name: String.  Optional name of the operation.  Defaults to
+        'ExponentialDecay'.
+
+    Returns:
+      A 1-arg callable learning rate schedule that takes the current optimizer
+      step and outputs the decayed learning rate, a scalar `Tensor` of the same
+      type as `initial_learning_rate`.
+    """
+    super(ExponentialDecay, self).__init__()
+    self.initial_learning_rate = initial_learning_rate
+    self.decay_steps = decay_steps
+    self.decay_rate = decay_rate
+    self.staircase = staircase
+    self.name = name
+
+  def __call__(self, step):
+    with ops.name_scope(
+        self.name, "ExponentialDecay",
+        [self.initial_learning_rate, step, self.decay_steps, self.decay_rate]
+    ) as name:
+      initial_learning_rate = ops.convert_to_tensor(
+          self.initial_learning_rate, name="initial_learning_rate")
+      dtype = initial_learning_rate.dtype
+      decay_steps = math_ops.cast(self.decay_steps, dtype)
+      decay_rate = math_ops.cast(self.decay_rate, dtype)
+
+      global_step_recomp = math_ops.cast(step, dtype)
+      p = global_step_recomp / decay_steps
+      if self.staircase:
+        p = math_ops.floor(p)
+      return math_ops.multiply(
+          initial_learning_rate, math_ops.pow(decay_rate, p), name=name)
+
+  def get_config(self):
+    return {
+        "initial_learning_rate": self.initial_learning_rate,
+        "decay_steps": self.decay_steps,
+        "decay_rate": self.decay_rate,
+        "staircase": self.staircase,
+        "name": self.name
+    }
+
+
+@keras_export("keras.optimizers.schedules.PiecewiseConstantDecay")
+class PiecewiseConstantDecay(LearningRateSchedule):
+  """A LearningRateSchedule that uses a piecewise constant decay schedule."""
+
+  def __init__(
+      self,
+      boundaries,
+      values,
+      name=None):
+    """Piecewise constant from boundaries and interval values.
+
+    The function returns a 1-arg callable to compute the piecewise constant
+    when passed the current optimizer step. This can be useful for changing the
+    learning rate value across different invocations of optimizer functions.
+
+    Example: use a learning rate that's 1.0 for the first 100001 steps, 0.5
+      for the next 10000 steps, and 0.1 for any additional steps.
+
+    ```python
+    step = tf.Variable(0, trainable=False)
+    boundaries = [100000, 110000]
+    values = [1.0, 0.5, 0.1]
+    learning_rate_fn = keras.optimizers.schedules.PiecewiseConstantDecay(
+        boundaries, values)
+
+    # Later, whenever we perform an optimization step, we pass in the step.
+    learning_rate = learning_rate_fn(step)
+    ```
+
+    You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
+    as the learning rate. The learning rate schedule is also serializable and
+    deserializable using `tf.keras.optimizers.schedules.serialize` and
+    `tf.keras.optimizers.schedules.deserialize`.
+
+    Args:
+      boundaries: A list of `Tensor`s or `int`s or `float`s with strictly
+        increasing entries, and with all elements having the same type as the
+        optimizer step.
+      values: A list of `Tensor`s or `float`s or `int`s that specifies the
+        values for the intervals defined by `boundaries`. It should have one
+        more element than `boundaries`, and all elements should have the same
+        type.
+      name: A string. Optional name of the operation. Defaults to
+        'PiecewiseConstant'.
+
+    Returns:
+      A 1-arg callable learning rate schedule that takes the current optimizer
+      step and outputs the decayed learning rate, a scalar `Tensor` of the same
+      type as the boundary tensors.
+
+      The output of the 1-arg function that takes the `step`
+      is `values[0]` when `step <= boundaries[0]`,
+      `values[1]` when `step > boundaries[0]` and `step <= boundaries[1]`, ...,
+      and values[-1] when `step > boundaries[-1]`.
+
+    Raises:
+      ValueError: if types of all `values` do not match or
+          the number of elements in the lists does not match.
+    """
+    super(PiecewiseConstantDecay, self).__init__()
+
+    if len(boundaries) != len(values) - 1:
+      raise ValueError(
+          "The length of boundaries should be 1 less than the length of values")
+
+    self.boundaries = boundaries
+    self.values = values
+    self.name = name
+
+  def __call__(self, step):
+    with ops.name_scope(self.name, "PiecewiseConstant",
+                        [step, self.boundaries, self.values, self.name]):
+      boundaries = ops.convert_n_to_tensor(self.boundaries)
+      values = ops.convert_n_to_tensor(self.values)
+      x_recomp = ops.convert_to_tensor(step)
+      # Avoid explicit conversion to x's dtype. This could result in faulty
+      # comparisons, for example if floats are converted to integers.
+      for i, b in enumerate(boundaries):
+        if b.dtype.base_dtype != x_recomp.dtype.base_dtype:
+          # We can promote int32 boundaries to int64 without loss of precision.
+          # This covers the most common case where the user passes in boundaries
+          # as an array of Python integers.
+          if (b.dtype.base_dtype == dtypes.int32 and
+              x_recomp.dtype.base_dtype == dtypes.int64):
+            b = math_ops.cast(b, x_recomp.dtype.base_dtype)
+            boundaries[i] = b
+          else:
+            raise ValueError(
+                "Boundaries (%s) must have the same dtype as x (%s)." %
+                (b.dtype.base_dtype, x_recomp.dtype.base_dtype))
+      # TODO(rdipietro): Ensure that boundaries' elements strictly increases.
+      for v in values[1:]:
+        if v.dtype.base_dtype != values[0].dtype.base_dtype:
+          raise ValueError(
+              "Values must have elements all with the same dtype (%s vs %s)." %
+              (values[0].dtype.base_dtype, v.dtype.base_dtype))
+      pred_fn_pairs = []
+      pred_fn_pairs.append((x_recomp <= boundaries[0], lambda: values[0]))
+      pred_fn_pairs.append((x_recomp > boundaries[-1], lambda: values[-1]))
+      for low, high, v in zip(boundaries[:-1], boundaries[1:], values[1:-1]):
+        # Need to bind v here; can do this with lambda v=v: ...
+        pred = (x_recomp > low) & (x_recomp <= high)
+        pred_fn_pairs.append((pred, lambda v=v: v))
+
+      # The default isn't needed here because our conditions are mutually
+      # exclusive and exhaustive, but tf.case requires it.
+      default = lambda: values[0]
+      return control_flow_ops.case(pred_fn_pairs, default, exclusive=True)
+
+  def get_config(self):
+    return {
+        "boundaries": self.boundaries,
+        "values": self.values,
+        "name": self.name
+    }
+
+
+@keras_export("keras.optimizers.schedules.PolynomialDecay")
+class PolynomialDecay(LearningRateSchedule):
+  """A LearningRateSchedule that uses a polynomial decay schedule."""
+
+  def __init__(
+      self,
+      initial_learning_rate,
+      decay_steps,
+      end_learning_rate=0.0001,
+      power=1.0,
+      cycle=False,
+      name=None):
+    """Applies a polynomial decay to the learning rate.
+
+    It is commonly observed that a monotonically decreasing learning rate, whose
+    degree of change is carefully chosen, results in a better performing model.
+    This schedule applies a polynomial decay function to an optimizer step,
+    given a provided `initial_learning_rate`, to reach an `end_learning_rate`
+    in the given `decay_steps`.
+
+    It requires a `step` value to compute the decayed learning rate. You
+    can just pass a TensorFlow variable that you increment at each training
+    step.
+
+    The schedule is a 1-arg callable that produces a decayed learning rate
+    when passed the current optimizer step. This can be useful for changing the
+    learning rate value across different invocations of optimizer functions.
+    It is computed as:
+
+    ```python
+    def decayed_learning_rate(step):
+      step = min(step, decay_steps)
+      return ((initial_learning_rate - end_learning_rate) *
+              (1 - step / decay_steps) ^ (power)
+             ) + end_learning_rate
+    ```
+
+    If `cycle` is True then a multiple of `decay_steps` is used, the first one
+    that is bigger than `step`.
+
+    ```python
+    def decayed_learning_rate(step):
+      decay_steps = decay_steps * ceil(step / decay_steps)
+      return ((initial_learning_rate - end_learning_rate) *
+              (1 - step / decay_steps) ^ (power)
+             ) + end_learning_rate
+    ```
+
+    You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
+    as the learning rate.
+    Example: Fit a model while decaying from 0.1 to 0.01 in 10000 steps using
+    sqrt (i.e. power=0.5):
+
+    ```python
+    ...
+    starter_learning_rate = 0.1
+    end_learning_rate = 0.01
+    decay_steps = 10000
+    learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
+        starter_learning_rate,
+        decay_steps,
+        end_learning_rate,
+        power=0.5)
+
+    model.compile(optimizer=tf.keras.optimizers.SGD(
+                      learning_rate=learning_rate_fn),
+                  loss='sparse_categorical_crossentropy',
+                  metrics=['accuracy'])
+
+    model.fit(data, labels, epochs=5)
+    ```
+
+    The learning rate schedule is also serializable and deserializable using
+    `tf.keras.optimizers.schedules.serialize` and
+    `tf.keras.optimizers.schedules.deserialize`.
+
+    Args:
+      initial_learning_rate: A scalar `float32` or `float64` `Tensor` or a
+        Python number.  The initial learning rate.
+      decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
+        Must be positive.  See the decay computation above.
+      end_learning_rate: A scalar `float32` or `float64` `Tensor` or a
+        Python number.  The minimal end learning rate.
+      power: A scalar `float32` or `float64` `Tensor` or a
+        Python number.  The power of the polynomial. Defaults to linear, 1.0.
+      cycle: A boolean, whether or not it should cycle beyond decay_steps.
+      name: String.  Optional name of the operation. Defaults to
+        'PolynomialDecay'.
+
+    Returns:
+      A 1-arg callable learning rate schedule that takes the current optimizer
+      step and outputs the decayed learning rate, a scalar `Tensor` of the same
+      type as `initial_learning_rate`.
+    """
+    super(PolynomialDecay, self).__init__()
+
+    self.initial_learning_rate = initial_learning_rate
+    self.decay_steps = decay_steps
+    self.end_learning_rate = end_learning_rate
+    self.power = power
+    self.cycle = cycle
+    self.name = name
+
+  def __call__(self, step):
+    with ops.name_scope(
+        self.name, "PolynomialDecay",
+        [self.initial_learning_rate, step, self.decay_steps,
+         self.end_learning_rate, self.power]
+    ) as name:
+      initial_learning_rate = ops.convert_to_tensor(
+          self.initial_learning_rate, name="initial_learning_rate")
+      dtype = initial_learning_rate.dtype
+      end_learning_rate = math_ops.cast(self.end_learning_rate, dtype)
+      power = math_ops.cast(self.power, dtype)
+
+      global_step_recomp = math_ops.cast(step, dtype)
+      decay_steps_recomp = math_ops.cast(self.decay_steps, dtype)
+      if self.cycle:
+        # Find the first multiple of decay_steps that is bigger than
+        # global_step. If global_step is zero set the multiplier to 1
+        multiplier = control_flow_ops.cond(
+            math_ops.equal(global_step_recomp, 0), lambda: 1.0,
+            lambda: math_ops.ceil(global_step_recomp / self.decay_steps))
+        decay_steps_recomp = math_ops.multiply(decay_steps_recomp, multiplier)
+      else:
+        # Make sure that the global_step used is not bigger than decay_steps.
+        global_step_recomp = math_ops.minimum(global_step_recomp,
+                                              self.decay_steps)
+
+      p = math_ops.div(global_step_recomp, decay_steps_recomp)
+      return math_ops.add(
+          math_ops.multiply(initial_learning_rate - end_learning_rate,
+                            math_ops.pow(1 - p, power)),
+          end_learning_rate,
+          name=name)
+
+  def get_config(self):
+    return {
+        "initial_learning_rate": self.initial_learning_rate,
+        "decay_steps": self.decay_steps,
+        "end_learning_rate": self.end_learning_rate,
+        "power": self.power,
+        "cycle": self.cycle,
+        "name": self.name
+    }
+
+
+@keras_export("keras.optimizers.schedules.InverseTimeDecay")
+class InverseTimeDecay(LearningRateSchedule):
+  """A LearningRateSchedule that uses an inverse time decay schedule."""
+
+  def __init__(
+      self,
+      initial_learning_rate,
+      decay_steps,
+      decay_rate,
+      staircase=False,
+      name=None):
+    """Applies inverse time decay to the initial learning rate.
+
+    When training a model, it is often recommended to lower the learning rate as
+    the training progresses. This schedule applies the inverse decay function
+    to an optimizer step, given a provided initial learning rate.
+    It requires a `step` value to compute the decayed learning rate. You can
+    just pass a TensorFlow variable that you increment at each training step.
+
+    The schedule a 1-arg callable that produces a decayed learning
+    rate when passed the current optimizer step. This can be useful for changing
+    the learning rate value across different invocations of optimizer functions.
+    It is computed as:
+
+    ```python
+    def decayed_learning_rate(step):
+      return initial_learning_rate / (1 + decay_rate * step / decay_step)
+    ```
+
+    or, if `staircase` is `True`, as:
+
+    ```python
+    def decayed_learning_rate(step):
+      return initial_learning_rate / (1 + decay_rate * floor(step / decay_step))
+    ```
+
+    You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
+    as the learning rate.
+    Example: Fit a Keras model when decaying 1/t with a rate of 0.5:
+
+    ```python
+    ...
+    initial_learning_rate = 0.1
+    decay_steps = 1.0
+    decay_rate = 0.5
+    learning_rate_fn = keras.optimizers.schedules.InverseTimeDecay(
+      initial_learning_rate, global_step, decay_steps, decay_rate)
+
+    model.compile(optimizer=tf.keras.optimizers.SGD(
+                      learning_rate=learning_rate_fn),
+                  loss='sparse_categorical_crossentropy',
+                  metrics=['accuracy'])
+
+    model.fit(data, labels, epochs=5)
+    ```
+
+    Args:
+      initial_learning_rate: A scalar `float32` or `float64` `Tensor` or a
+        Python number.  The initial learning rate.
+      decay_steps: How often to apply decay.
+      decay_rate: A Python number.  The decay rate.
+      staircase: Whether to apply decay in a discrete staircase, as opposed to
+        continuous, fashion.
+      name: String.  Optional name of the operation.  Defaults to
+        'InverseTimeDecay'.
+
+    Returns:
+      A 1-arg callable learning rate schedule that takes the current optimizer
+      step and outputs the decayed learning rate, a scalar `Tensor` of the same
+      type as `initial_learning_rate`.
+    """
+    super(InverseTimeDecay, self).__init__()
+
+    self.initial_learning_rate = initial_learning_rate
+    self.decay_steps = decay_steps
+    self.decay_rate = decay_rate
+    self.staircase = staircase
+    self.name = name
+
+  def __call__(self, step):
+    with ops.name_scope(self.name, "InverseTimeDecay",
+                        [self.initial_learning_rate, step, self.decay_rate]
+                       ) as name:
+      initial_learning_rate = ops.convert_to_tensor(
+          self.initial_learning_rate, name="initial_learning_rate")
+      dtype = initial_learning_rate.dtype
+      decay_steps = math_ops.cast(self.decay_steps, dtype)
+      decay_rate = math_ops.cast(self.decay_rate, dtype)
+
+      global_step_recomp = math_ops.cast(step, dtype)
+      p = global_step_recomp / decay_steps
+      if self.staircase:
+        p = math_ops.floor(p)
+      const = math_ops.cast(constant_op.constant(1), dtype)
+      denom = math_ops.add(const, math_ops.multiply(decay_rate, p))
+      return math_ops.div(initial_learning_rate, denom, name=name)
+
+  def get_config(self):
+    return {
+        "initial_learning_rate": self.initial_learning_rate,
+        "decay_steps": self.decay_steps,
+        "decay_rate": self.decay_rate,
+        "staircase": self.staircase,
+        "name": self.name
+    }
+
+
+@keras_export("keras.experimental.CosineDecay")
+class CosineDecay(LearningRateSchedule):
+  """A LearningRateSchedule that uses a cosine decay schedule."""
+
+  def __init__(
+      self,
+      initial_learning_rate,
+      decay_steps,
+      alpha=0.0,
+      name=None):
+    """Applies cosine decay to the learning rate.
+
+    See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent
+    with Warm Restarts. https://arxiv.org/abs/1608.03983
+
+    When training a model, it is often recommended to lower the learning rate as
+    the training progresses. This schedule applies a cosine decay function
+    to an optimizer step, given a provided initial learning rate.
+    It requires a `step` value to compute the decayed learning rate. You can
+    just pass a TensorFlow variable that you increment at each training step.
+
+    The schedule a 1-arg callable that produces a decayed learning
+    rate when passed the current optimizer step. This can be useful for changing
+    the learning rate value across different invocations of optimizer functions.
+    It is computed as:
+
+    ```python
+    def decayed_learning_rate(step):
+      step = min(step, decay_steps)
+      cosine_decay = 0.5 * (1 + cos(pi * step / decay_steps))
+      decayed = (1 - alpha) * cosine_decay + alpha
+      return initial_learning_rate * decayed
+    ```
+
+    Example usage:
+    ```python
+    decay_steps = 1000
+    lr_decayed_fn = tf.keras.experimental.CosineDecay(
+        initial_learning_rate, global_step, decay_steps)
+    ```
+
+    You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
+    as the learning rate. The learning rate schedule is also serializable and
+    deserializable using `tf.keras.optimizers.schedules.serialize` and
+    `tf.keras.optimizers.schedules.deserialize`.
+
+    Args:
+      initial_learning_rate: A scalar `float32` or `float64` Tensor or a
+        Python number. The initial learning rate.
+      decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
+        Number of steps to decay over.
+      alpha: A scalar `float32` or `float64` Tensor or a Python number.
+        Minimum learning rate value as a fraction of initial_learning_rate.
+      name: String. Optional name of the operation.  Defaults to 'CosineDecay'.
+    Returns:
+      A 1-arg callable learning rate schedule that takes the current optimizer
+      step and outputs the decayed learning rate, a scalar `Tensor` of the same
+      type as `initial_learning_rate`.
+    """
+    super(CosineDecay, self).__init__()
+
+    self.initial_learning_rate = initial_learning_rate
+    self.decay_steps = decay_steps
+    self.alpha = alpha
+    self.name = name
+
+  def __call__(self, step):
+    with ops.name_scope(self.name, "CosineDecay",
+                        [self.initial_learning_rate, step]):
+      initial_learning_rate = ops.convert_to_tensor(
+          self.initial_learning_rate, name="initial_learning_rate")
+      dtype = initial_learning_rate.dtype
+      decay_steps = math_ops.cast(self.decay_steps, dtype)
+
+      global_step_recomp = math_ops.cast(step, dtype)
+      global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)
+      completed_fraction = global_step_recomp / decay_steps
+      cosine_decayed = 0.5 * (1.0 + math_ops.cos(
+          constant_op.constant(math.pi) * completed_fraction))
+
+      decayed = (1 - self.alpha) * cosine_decayed + self.alpha
+      return math_ops.multiply(initial_learning_rate, decayed)
+
+  def get_config(self):
+    return {
+        "initial_learning_rate": self.initial_learning_rate,
+        "decay_steps": self.decay_steps,
+        "alpha": self.alpha,
+        "name": self.name
+    }
+
+
+@keras_export("keras.experimental.CosineDecayRestarts")
+class CosineDecayRestarts(LearningRateSchedule):
+  """A LearningRateSchedule that uses a cosine decay schedule with restarts."""
+
+  def __init__(
+      self,
+      initial_learning_rate,
+      first_decay_steps,
+      t_mul=2.0,
+      m_mul=1.0,
+      alpha=0.0,
+      name=None):
+    """Applies cosine decay with restarts to the learning rate.
+
+    See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent
+    with Warm Restarts. https://arxiv.org/abs/1608.03983
+
+    When training a model, it is often recommended to lower the learning rate as
+    the training progresses. This schedule applies a cosine decay function with
+    restarts to an optimizer step, given a provided initial learning rate.
+    It requires a `step` value to compute the decayed learning rate. You can
+    just pass a TensorFlow variable that you increment at each training step.
+
+    The schedule a 1-arg callable that produces a decayed learning
+    rate when passed the current optimizer step. This can be useful for changing
+    the learning rate value across different invocations of optimizer functions.
+
+    The learning rate multiplier first decays
+    from 1 to `alpha` for `first_decay_steps` steps. Then, a warm
+    restart is performed. Each new warm restart runs for `t_mul` times more
+    steps and with `m_mul` times smaller initial learning rate.
+
+    Example usage:
+    ```python
+    first_decay_steps = 1000
+    lr_decayed_fn = (
+      tf.keras.experimental.CosineDecayRestarts(
+          initial_learning_rate,
+          global_step,
+          first_decay_steps))
+    ```
+
+    You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
+    as the learning rate. The learning rate schedule is also serializable and
+    deserializable using `tf.keras.optimizers.schedules.serialize` and
+    `tf.keras.optimizers.schedules.deserialize`.
+
+    Args:
+      initial_learning_rate: A scalar `float32` or `float64` Tensor or a Python
+        number. The initial learning rate.
+      first_decay_steps: A scalar `int32` or `int64` `Tensor` or a Python
+        number. Number of steps to decay over.
+      t_mul: A scalar `float32` or `float64` `Tensor` or a Python number.
+        Used to derive the number of iterations in the i-th period
+      m_mul: A scalar `float32` or `float64` `Tensor` or a Python number.
+        Used to derive the initial learning rate of the i-th period:
+      alpha: A scalar `float32` or `float64` Tensor or a Python number.
+        Minimum learning rate value as a fraction of the initial_learning_rate.
+      name: String. Optional name of the operation.  Defaults to 'SGDRDecay'.
+    Returns:
+      A 1-arg callable learning rate schedule that takes the current optimizer
+      step and outputs the decayed learning rate, a scalar `Tensor` of the same
+      type as `initial_learning_rate`.
+    Raises:
+      ValueError: if `global_step` is not supplied.
+    """
+    super(CosineDecayRestarts, self).__init__()
+
+    self.initial_learning_rate = initial_learning_rate
+    self.first_decay_steps = first_decay_steps
+    self._t_mul = t_mul
+    self._m_mul = m_mul
+    self.alpha = alpha
+    self.name = name
+
+  def __call__(self, step):
+    with ops.name_scope(self.name, "SGDRDecay",
+                        [self.initial_learning_rate, step]
+                       ) as name:
+      initial_learning_rate = ops.convert_to_tensor(
+          self.initial_learning_rate, name="initial_learning_rate")
+      dtype = initial_learning_rate.dtype
+      first_decay_steps = math_ops.cast(self.first_decay_steps, dtype)
+      alpha = math_ops.cast(self.alpha, dtype)
+      t_mul = math_ops.cast(self._t_mul, dtype)
+      m_mul = math_ops.cast(self._m_mul, dtype)
+
+      global_step_recomp = math_ops.cast(step, dtype)
+      completed_fraction = global_step_recomp / first_decay_steps
+
+      def compute_step(completed_fraction, geometric=False):
+        """Helper for `cond` operation."""
+        if geometric:
+          i_restart = math_ops.floor(
+              math_ops.log(1.0 - completed_fraction * (1.0 - t_mul)) /
+              math_ops.log(t_mul))
+
+          sum_r = (1.0 - t_mul**i_restart) / (1.0 - t_mul)
+          completed_fraction = (completed_fraction - sum_r) / t_mul**i_restart
+
+        else:
+          i_restart = math_ops.floor(completed_fraction)
+          completed_fraction -= i_restart
+
+        return i_restart, completed_fraction
+
+      i_restart, completed_fraction = control_flow_ops.cond(
+          math_ops.equal(t_mul, 1.0),
+          lambda: compute_step(completed_fraction, geometric=False),
+          lambda: compute_step(completed_fraction, geometric=True))
+
+      m_fac = m_mul**i_restart
+      cosine_decayed = 0.5 * m_fac * (1.0 + math_ops.cos(
+          constant_op.constant(math.pi) * completed_fraction))
+      decayed = (1 - alpha) * cosine_decayed + alpha
+
+      return math_ops.multiply(initial_learning_rate, decayed, name=name)
+
+  def get_config(self):
+    return {
+        "initial_learning_rate": self.initial_learning_rate,
+        "first_decay_steps": self.first_decay_steps,
+        "t_mul": self._t_mul,
+        "m_mul": self._m_mul,
+        "alpha": self.alpha,
+        "name": self.name
+    }
+
+
+@keras_export("keras.experimental.LinearCosineDecay")
+class LinearCosineDecay(LearningRateSchedule):
+  """A LearningRateSchedule that uses a linear cosine decay schedule."""
+
+  def __init__(
+      self,
+      initial_learning_rate,
+      decay_steps,
+      num_periods=0.5,
+      alpha=0.0,
+      beta=0.001,
+      name=None):
+    """Applies linear cosine decay to the learning rate.
+
+    See [Bello et al., ICML2017] Neural Optimizer Search with RL.
+    https://arxiv.org/abs/1709.07417
+
+    For the idea of warm starts here controlled by `num_periods`,
+    see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent
+    with Warm Restarts. https://arxiv.org/abs/1608.03983
+
+    Note that linear cosine decay is more aggressive than cosine decay and
+    larger initial learning rates can typically be used.
+
+    When training a model, it is often recommended to lower the learning rate as
+    the training progresses. This schedule applies a linear cosine decay
+    function to an optimizer step, given a provided initial learning rate.
+    It requires a `step` value to compute the decayed learning rate. You can
+    just pass a TensorFlow variable that you increment at each training step.
+
+    The schedule a 1-arg callable that produces a decayed learning
+    rate when passed the current optimizer step. This can be useful for changing
+    the learning rate value across different invocations of optimizer functions.
+    It is computed as:
+
+    ```python
+    def decayed_learning_rate(step):
+      step = min(step, decay_steps)
+      linear_decay = (decay_steps - step) / decay_steps
+      cosine_decay = 0.5 * (
+          1 + cos(pi * 2 * num_periods * step / decay_steps))
+      decayed = (alpha + linear_decay) * cosine_decay + beta
+      return initial_learning_rate * decayed
+    ```
+
+    Example usage:
+    ```python
+    decay_steps = 1000
+    lr_decayed_fn = (
+      tf.keras.experimental.LinearCosineDecay(
+        initial_learning_rate, global_step, decay_steps))
+    ```
+
+    You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
+    as the learning rate. The learning rate schedule is also serializable and
+    deserializable using `tf.keras.optimizers.schedules.serialize` and
+    `tf.keras.optimizers.schedules.deserialize`.
+
+    Args:
+      initial_learning_rate: A scalar `float32` or `float64` Tensor or a Python
+        number. The initial learning rate.
+      decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
+        Number of steps to decay over.
+      num_periods: Number of periods in the cosine part of the decay.
+        See computation above.
+      alpha: See computation above.
+      beta: See computation above.
+      name: String.  Optional name of the operation.  Defaults to
+        'LinearCosineDecay'.
+    Returns:
+      A 1-arg callable learning rate schedule that takes the current optimizer
+      step and outputs the decayed learning rate, a scalar `Tensor` of the same
+      type as `initial_learning_rate`.
+    """
+    super(LinearCosineDecay, self).__init__()
+
+    self.initial_learning_rate = initial_learning_rate
+    self.decay_steps = decay_steps
+    self.num_periods = num_periods
+    self.alpha = alpha
+    self.beta = beta
+    self.name = name
+
+  def __call__(self, step):
+    with ops.name_scope(self.name, "LinearCosineDecay",
+                        [self.initial_learning_rate, step]) as name:
+      initial_learning_rate = ops.convert_to_tensor(
+          self.initial_learning_rate, name="initial_learning_rate")
+      dtype = initial_learning_rate.dtype
+      decay_steps = math_ops.cast(self.decay_steps, dtype)
+      num_periods = math_ops.cast(self.num_periods, dtype)
+      alpha = math_ops.cast(self.alpha, dtype)
+      beta = math_ops.cast(self.beta, dtype)
+
+      global_step_recomp = math_ops.cast(step, dtype)
+      global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)
+      linear_decayed = (decay_steps - global_step_recomp) / decay_steps
+      completed_fraction = global_step_recomp / decay_steps
+      fraction = 2.0 * num_periods * completed_fraction
+      cosine_decayed = 0.5 * (
+          1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))
+
+      linear_cosine_decayed = (alpha + linear_decayed) * cosine_decayed + beta
+      return math_ops.multiply(initial_learning_rate, linear_cosine_decayed,
+                               name=name)
+
+  def get_config(self):
+    return {
+        "initial_learning_rate": self.initial_learning_rate,
+        "decay_steps": self.decay_steps,
+        "num_periods": self.num_periods,
+        "alpha": self.alpha,
+        "beta": self.beta,
+        "name": self.name
+    }
+
+
+@keras_export("keras.experimental.NoisyLinearCosineDecay")
+class NoisyLinearCosineDecay(LearningRateSchedule):
+  """A LearningRateSchedule that uses a noisy linear cosine decay schedule."""
+
+  def __init__(
+      self,
+      initial_learning_rate,
+      decay_steps,
+      initial_variance=1.0,
+      variance_decay=0.55,
+      num_periods=0.5,
+      alpha=0.0,
+      beta=0.001,
+      name=None):
+    """Applies noisy linear cosine decay to the learning rate.
+
+    See [Bello et al., ICML2017] Neural Optimizer Search with RL.
+    https://arxiv.org/abs/1709.07417
+
+    For the idea of warm starts here controlled by `num_periods`,
+    see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent
+    with Warm Restarts. https://arxiv.org/abs/1608.03983
+
+    Note that linear cosine decay is more aggressive than cosine decay and
+    larger initial learning rates can typically be used.
+
+    When training a model, it is often recommended to lower the learning rate as
+    the training progresses. This schedule applies a noisy linear cosine decay
+    function to an optimizer step, given a provided initial learning rate.
+    It requires a `step` value to compute the decayed learning rate. You can
+    just pass a TensorFlow variable that you increment at each training step.
+
+    The schedule a 1-arg callable that produces a decayed learning
+    rate when passed the current optimizer step. This can be useful for changing
+    the learning rate value across different invocations of optimizer functions.
+    It is computed as:
+
+    ```python
+    def decayed_learning_rate(step):
+      step = min(step, decay_steps)
+      linear_decay = (decay_steps - step) / decay_steps)
+      cosine_decay = 0.5 * (
+          1 + cos(pi * 2 * num_periods * step / decay_steps))
+      decayed = (alpha + linear_decay + eps_t) * cosine_decay + beta
+      return initial_learning_rate * decayed
+    ```
+    where eps_t is 0-centered gaussian noise with variance
+    initial_variance / (1 + global_step) ** variance_decay
+
+    Example usage:
+    ```python
+    decay_steps = 1000
+    lr_decayed_fn = (
+      tf.keras.experimental.NoisyLinearCosineDecay(
+        initial_learning_rate, global_step, decay_steps))
+    ```
+
+    You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
+    as the learning rate. The learning rate schedule is also serializable and
+    deserializable using `tf.keras.optimizers.schedules.serialize` and
+    `tf.keras.optimizers.schedules.deserialize`.
+
+    Args:
+      initial_learning_rate: A scalar `float32` or `float64` Tensor or a Python
+        number. The initial learning rate.
+      decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
+        Number of steps to decay over.
+      initial_variance: initial variance for the noise. See computation above.
+      variance_decay: decay for the noise's variance. See computation above.
+      num_periods: Number of periods in the cosine part of the decay.
+        See computation above.
+      alpha: See computation above.
+      beta: See computation above.
+      name: String.  Optional name of the operation.  Defaults to
+        'NoisyLinearCosineDecay'.
+    Returns:
+      A 1-arg callable learning rate schedule that takes the current optimizer
+      step and outputs the decayed learning rate, a scalar `Tensor` of the same
+      type as `initial_learning_rate`.
+    """
+    super(NoisyLinearCosineDecay, self).__init__()
+
+    self.initial_learning_rate = initial_learning_rate
+    self.decay_steps = decay_steps
+    self.initial_variance = initial_variance
+    self.variance_decay = variance_decay
+    self.num_periods = num_periods
+    self.alpha = alpha
+    self.beta = beta
+    self.name = name
+
+  def __call__(self, step):
+    with ops.name_scope(self.name, "NoisyLinearCosineDecay",
+                        [self.initial_learning_rate, step]) as name:
+      initial_learning_rate = ops.convert_to_tensor(
+          self.initial_learning_rate, name="initial_learning_rate")
+      dtype = initial_learning_rate.dtype
+      decay_steps = math_ops.cast(self.decay_steps, dtype)
+      initial_variance = math_ops.cast(self.initial_variance, dtype)
+      variance_decay = math_ops.cast(self.variance_decay, dtype)
+      num_periods = math_ops.cast(self.num_periods, dtype)
+      alpha = math_ops.cast(self.alpha, dtype)
+      beta = math_ops.cast(self.beta, dtype)
+
+      global_step_recomp = math_ops.cast(step, dtype)
+      global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)
+      linear_decayed = (decay_steps - global_step_recomp) / decay_steps
+      variance = initial_variance / (
+          math_ops.pow(1.0 + global_step_recomp, variance_decay))
+      std = math_ops.sqrt(variance)
+      noisy_linear_decayed = (
+          linear_decayed + random_ops.random_normal(
+              linear_decayed.shape, stddev=std))
+
+      completed_fraction = global_step_recomp / decay_steps
+      fraction = 2.0 * num_periods * completed_fraction
+      cosine_decayed = 0.5 * (
+          1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))
+      noisy_linear_cosine_decayed = (
+          (alpha + noisy_linear_decayed) * cosine_decayed + beta)
+
+      return math_ops.multiply(
+          initial_learning_rate, noisy_linear_cosine_decayed, name=name)
+
+  def get_config(self):
+    return {
+        "initial_learning_rate": self.initial_learning_rate,
+        "decay_steps": self.decay_steps,
+        "initial_variance": self.initial_variance,
+        "variance_decay": self.variance_decay,
+        "num_periods": self.num_periods,
+        "alpha": self.alpha,
+        "beta": self.beta,
+        "name": self.name
+    }
+
+
+@keras_export("keras.optimizers.schedules.serialize")
+def serialize(learning_rate_schedule):
+  return generic_utils.serialize_keras_object(learning_rate_schedule)
+
+
+@keras_export("keras.optimizers.schedules.deserialize")
+def deserialize(config, custom_objects=None):
+  return generic_utils.deserialize_keras_object(
+      config,
+      module_objects=globals(),
+      custom_objects=custom_objects,
+      printable_module_name="decay")
diff --git a/tensorflow/python/keras/optimizer_v2/learning_rate_schedule_test.py b/tensorflow/python/keras/optimizer_v2/learning_rate_schedule_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..87b97fa76ca39850d111db75aeed3f991e46ddc6
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/learning_rate_schedule_test.py
@@ -0,0 +1,527 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Functional test for learning rate decay."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+from absl.testing import parameterized
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.optimizer_v2 import learning_rate_schedule
+# Import resource_variable_ops for the variables-to-tensor implicit conversion.
+from tensorflow.python.ops import resource_variable_ops  # pylint: disable=unused-import
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import googletest
+
+
+def _maybe_serialized(lr_decay, serialize_and_deserialize):
+  if serialize_and_deserialize:
+    serialized = learning_rate_schedule.serialize(lr_decay)
+    return learning_rate_schedule.deserialize(serialized)
+  else:
+    return lr_decay
+
+
+@parameterized.named_parameters(
+    ("NotSerialized", False),
+    ("Serialized", True))
+class LRDecayTestV2(test_util.TensorFlowTestCase, parameterized.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testContinuous(self, serialize):
+    self.evaluate(variables.global_variables_initializer())
+    step = 5
+    decayed_lr = learning_rate_schedule.ExponentialDecay(0.05, 10, 0.96)
+    decayed_lr = _maybe_serialized(decayed_lr, serialize)
+    expected = .05 * 0.96**(5.0 / 10.0)
+    self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testStaircase(self, serialize):
+    if context.executing_eagerly():
+      step = resource_variable_ops.ResourceVariable(0)
+      self.evaluate(variables.global_variables_initializer())
+      decayed_lr = learning_rate_schedule.ExponentialDecay(
+          .1, 3, 0.96, staircase=True)
+      decayed_lr = _maybe_serialized(decayed_lr, serialize)
+
+      # No change to learning rate due to staircase
+      expected = .1
+      self.evaluate(step.assign(1))
+      self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+      expected = .1
+      self.evaluate(step.assign(2))
+      self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+      # Decayed learning rate
+      expected = .1 * 0.96 ** (100 // 3)
+      self.evaluate(step.assign(100))
+      self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+  @test_util.run_deprecated_v1
+  def testVariables(self, serialize):
+    step = variables.Variable(1)
+    assign_1 = step.assign(1)
+    assign_2 = step.assign(2)
+    assign_100 = step.assign(100)
+    decayed_lr = learning_rate_schedule.ExponentialDecay(
+        .1, 3, 0.96, staircase=True)
+    decayed_lr = _maybe_serialized(decayed_lr, serialize)
+
+    self.evaluate(variables.global_variables_initializer())
+    # No change to learning rate
+    self.evaluate(assign_1.op)
+    self.assertAllClose(self.evaluate(decayed_lr(step)), .1, 1e-6)
+    self.evaluate(assign_2.op)
+    self.assertAllClose(self.evaluate(decayed_lr(step)), .1, 1e-6)
+    # Decayed learning rate
+    self.evaluate(assign_100.op)
+    expected = .1 * 0.96**(100 // 3)
+    self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testPiecewiseConstant(self, serialize):
+    x = resource_variable_ops.ResourceVariable(-999)
+    decayed_lr = learning_rate_schedule.PiecewiseConstantDecay(
+        [100, 110, 120], [1.0, 0.1, 0.01, 0.001])
+    decayed_lr = _maybe_serialized(decayed_lr, serialize)
+
+    self.evaluate(variables.global_variables_initializer())
+
+    self.assertAllClose(self.evaluate(decayed_lr(x)), 1.0, 1e-6)
+    self.evaluate(x.assign(100))
+    self.assertAllClose(self.evaluate(decayed_lr(x)), 1.0, 1e-6)
+    self.evaluate(x.assign(105))
+    self.assertAllClose(self.evaluate(decayed_lr(x)), 0.1, 1e-6)
+    self.evaluate(x.assign(110))
+    self.assertAllClose(self.evaluate(decayed_lr(x)), 0.1, 1e-6)
+    self.evaluate(x.assign(120))
+    self.assertAllClose(self.evaluate(decayed_lr(x)), 0.01, 1e-6)
+    self.evaluate(x.assign(999))
+    self.assertAllClose(self.evaluate(decayed_lr(x)), 0.001, 1e-6)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testPiecewiseConstantEdgeCases(self, serialize):
+    x_int = resource_variable_ops.ResourceVariable(
+        0, dtype=variables.dtypes.int32)
+    boundaries, values = [-1.0, 1.0], [1, 2, 3]
+    with self.assertRaises(ValueError):
+      decayed_lr = learning_rate_schedule.PiecewiseConstantDecay(
+          boundaries, values)
+      decayed_lr = _maybe_serialized(decayed_lr, serialize)
+      decayed_lr(x_int)
+
+    x = resource_variable_ops.ResourceVariable(0.0)
+    boundaries, values = [-1.0, 1.0], [1.0, 2, 3]
+    with self.assertRaises(ValueError):
+      decayed_lr = learning_rate_schedule.PiecewiseConstantDecay(
+          boundaries, values)
+      decayed_lr = _maybe_serialized(decayed_lr, serialize)
+      decayed_lr(x)
+
+    # Test casting boundaries from int32 to int64.
+    x_int64 = resource_variable_ops.ResourceVariable(
+        0, dtype=variables.dtypes.int64)
+    boundaries, values = [1, 2, 3], [0.4, 0.5, 0.6, 0.7]
+    decayed_lr = learning_rate_schedule.PiecewiseConstantDecay(
+        boundaries, values)
+    decayed_lr = _maybe_serialized(decayed_lr, serialize)
+
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllClose(self.evaluate(decayed_lr(x_int64)), 0.4, 1e-6)
+    self.evaluate(x_int64.assign(1))
+    self.assertAllClose(self.evaluate(decayed_lr(x_int64)), 0.4, 1e-6)
+    self.evaluate(x_int64.assign(2))
+    self.assertAllClose(self.evaluate(decayed_lr(x_int64)), 0.5, 1e-6)
+    self.evaluate(x_int64.assign(3))
+    self.assertAllClose(self.evaluate(decayed_lr(x_int64)), 0.6, 1e-6)
+    self.evaluate(x_int64.assign(4))
+    self.assertAllClose(self.evaluate(decayed_lr(x_int64)), 0.7, 1e-6)
+
+
+@parameterized.named_parameters(
+    ("NotSerialized", False),
+    ("Serialized", True))
+class LinearDecayTestV2(test_util.TensorFlowTestCase, parameterized.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testHalfWay(self, serialize):
+    step = 5
+    lr = 0.05
+    end_lr = 0.0
+    decayed_lr = learning_rate_schedule.PolynomialDecay(lr, 10, end_lr)
+    decayed_lr = _maybe_serialized(decayed_lr, serialize)
+    expected = lr * 0.5
+    self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testEnd(self, serialize):
+    step = 10
+    lr = 0.05
+    end_lr = 0.001
+    decayed_lr = learning_rate_schedule.PolynomialDecay(lr, 10, end_lr)
+    decayed_lr = _maybe_serialized(decayed_lr, serialize)
+    expected = end_lr
+    self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testHalfWayWithEnd(self, serialize):
+    step = 5
+    lr = 0.05
+    end_lr = 0.001
+    decayed_lr = learning_rate_schedule.PolynomialDecay(lr, 10, end_lr)
+    decayed_lr = _maybe_serialized(decayed_lr, serialize)
+    expected = (lr + end_lr) * 0.5
+    self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testBeyondEnd(self, serialize):
+    step = 15
+    lr = 0.05
+    end_lr = 0.001
+    decayed_lr = learning_rate_schedule.PolynomialDecay(lr, 10, end_lr)
+    decayed_lr = _maybe_serialized(decayed_lr, serialize)
+    expected = end_lr
+    self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testBeyondEndWithCycle(self, serialize):
+    step = 15
+    lr = 0.05
+    end_lr = 0.001
+    decayed_lr = learning_rate_schedule.PolynomialDecay(
+        lr, 10, end_lr, cycle=True)
+    decayed_lr = _maybe_serialized(decayed_lr, serialize)
+    expected = (lr - end_lr) * 0.25 + end_lr
+    self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+
+@parameterized.named_parameters(
+    ("NotSerialized", False),
+    ("Serialized", True))
+class SqrtDecayTestV2(test_util.TensorFlowTestCase,
+                      parameterized.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testHalfWay(self, serialize):
+    step = 5
+    lr = 0.05
+    end_lr = 0.0
+    power = 0.5
+    decayed_lr = learning_rate_schedule.PolynomialDecay(
+        lr, 10, end_lr, power=power)
+    decayed_lr = _maybe_serialized(decayed_lr, serialize)
+    expected = lr * 0.5**power
+    self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testEnd(self, serialize):
+    step = 10
+    lr = 0.05
+    end_lr = 0.001
+    power = 0.5
+    decayed_lr = learning_rate_schedule.PolynomialDecay(
+        lr, 10, end_lr, power=power)
+    decayed_lr = _maybe_serialized(decayed_lr, serialize)
+    expected = end_lr
+    self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testHalfWayWithEnd(self, serialize):
+    step = 5
+    lr = 0.05
+    end_lr = 0.001
+    power = 0.5
+    decayed_lr = learning_rate_schedule.PolynomialDecay(
+        lr, 10, end_lr, power=power)
+    decayed_lr = _maybe_serialized(decayed_lr, serialize)
+    expected = (lr - end_lr) * 0.5**power + end_lr
+    self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testBeyondEnd(self, serialize):
+    step = 15
+    lr = 0.05
+    end_lr = 0.001
+    power = 0.5
+    decayed_lr = learning_rate_schedule.PolynomialDecay(
+        lr, 10, end_lr, power=power)
+    decayed_lr = _maybe_serialized(decayed_lr, serialize)
+    expected = end_lr
+    self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testBeyondEndWithCycle(self, serialize):
+    step = 15
+    lr = 0.05
+    end_lr = 0.001
+    power = 0.5
+    decayed_lr = learning_rate_schedule.PolynomialDecay(
+        lr, 10, end_lr, power=power, cycle=True)
+    decayed_lr = _maybe_serialized(decayed_lr, serialize)
+    expected = (lr - end_lr) * 0.25**power + end_lr
+    self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+
+@parameterized.named_parameters(
+    ("NotSerialized", False),
+    ("Serialized", True))
+class PolynomialDecayTestV2(test_util.TensorFlowTestCase,
+                            parameterized.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testBeginWithCycle(self, serialize):
+    lr = 0.001
+    decay_steps = 10
+    step = 0
+    decayed_lr = learning_rate_schedule.PolynomialDecay(
+        lr, decay_steps, cycle=True)
+    decayed_lr = _maybe_serialized(decayed_lr, serialize)
+    expected = lr
+    self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+
+@parameterized.named_parameters(
+    ("NotSerialized", False),
+    ("Serialized", True))
+class InverseDecayTestV2(test_util.TensorFlowTestCase, parameterized.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDecay(self, serialize):
+    initial_lr = 0.1
+    k = 10
+    decay_rate = 0.96
+    step = resource_variable_ops.ResourceVariable(0)
+    decayed_lr = learning_rate_schedule.InverseTimeDecay(initial_lr, k,
+                                                         decay_rate)
+    decayed_lr = _maybe_serialized(decayed_lr, serialize)
+
+    self.evaluate(variables.global_variables_initializer())
+    for i in range(k + 1):
+      expected = initial_lr / (1 + i / k * decay_rate)
+      self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+      self.evaluate(step.assign_add(1))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testStaircase(self, serialize):
+    initial_lr = 0.1
+    k = 10
+    decay_rate = 0.96
+    step = resource_variable_ops.ResourceVariable(0)
+    decayed_lr = learning_rate_schedule.InverseTimeDecay(
+        initial_lr, k, decay_rate, staircase=True)
+    decayed_lr = _maybe_serialized(decayed_lr, serialize)
+
+    self.evaluate(variables.global_variables_initializer())
+    for i in range(k + 1):
+      expected = initial_lr / (1 + decay_rate * (i // k))
+      self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+      self.evaluate(step.assign_add(1))
+
+
+@parameterized.named_parameters(
+    ("NotSerialized", False),
+    ("Serialized", True))
+class CosineDecayTestV2(test_util.TensorFlowTestCase, parameterized.TestCase):
+
+  def np_cosine_decay(self, step, decay_steps, alpha=0.0):
+    step = min(step, decay_steps)
+    completed_fraction = step / decay_steps
+    decay = 0.5 * (1.0 + math.cos(math.pi * completed_fraction))
+    return (1.0 - alpha) * decay + alpha
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDecay(self, serialize):
+    num_training_steps = 1000
+    initial_lr = 1.0
+    for step in range(0, 1500, 250):
+      decayed_lr = learning_rate_schedule.CosineDecay(initial_lr,
+                                                      num_training_steps)
+      decayed_lr = _maybe_serialized(decayed_lr, serialize)
+      expected = self.np_cosine_decay(step, num_training_steps)
+      self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testAlpha(self, serialize):
+    num_training_steps = 1000
+    initial_lr = 1.0
+    alpha = 0.1
+    for step in range(0, 1500, 250):
+      decayed_lr = learning_rate_schedule.CosineDecay(initial_lr,
+                                                      num_training_steps,
+                                                      alpha)
+      decayed_lr = _maybe_serialized(decayed_lr, serialize)
+      expected = self.np_cosine_decay(step, num_training_steps, alpha)
+      self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+
+@parameterized.named_parameters(
+    ("NotSerialized", False),
+    ("Serialized", True))
+class CosineDecayRestartsTestV2(test_util.TensorFlowTestCase,
+                                parameterized.TestCase):
+
+  def np_cosine_decay_restarts(self, step, decay_steps, t_mul=2.0, m_mul=1.0,
+                               alpha=0.0):
+    fac = 1.0
+    while step >= decay_steps:
+      step -= decay_steps
+      decay_steps *= t_mul
+      fac *= m_mul
+
+    completed_fraction = step / decay_steps
+    decay = fac * 0.5 * (1.0 + math.cos(math.pi * completed_fraction))
+    return (1.0 - alpha) * decay + alpha
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDecay(self, serialize):
+    num_training_steps = 1000
+    initial_lr = 1.0
+    for step in range(0, 1500, 250):
+      decayed_lr = learning_rate_schedule.CosineDecayRestarts(
+          initial_lr, num_training_steps)
+      decayed_lr = _maybe_serialized(decayed_lr, serialize)
+      expected = self.np_cosine_decay_restarts(step, num_training_steps)
+      self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testAlpha(self, serialize):
+    num_training_steps = 1000
+    initial_lr = 1.0
+    alpha = 0.1
+    for step in range(0, 1500, 250):
+      decayed_lr = learning_rate_schedule.CosineDecayRestarts(
+          initial_lr, num_training_steps, alpha=alpha)
+      decayed_lr = _maybe_serialized(decayed_lr, serialize)
+      expected = self.np_cosine_decay_restarts(
+          step, num_training_steps, alpha=alpha)
+      self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testMMul(self, serialize):
+    num_training_steps = 1000
+    initial_lr = 1.0
+    m_mul = 0.9
+    for step in range(0, 1500, 250):
+      decayed_lr = learning_rate_schedule.CosineDecayRestarts(
+          initial_lr, num_training_steps, m_mul=m_mul)
+      decayed_lr = _maybe_serialized(decayed_lr, serialize)
+      expected = self.np_cosine_decay_restarts(
+          step, num_training_steps, m_mul=m_mul)
+      self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testTMul(self, serialize):
+    num_training_steps = 1000
+    initial_lr = 1.0
+    t_mul = 1.0
+    for step in range(0, 1500, 250):
+      decayed_lr = learning_rate_schedule.CosineDecayRestarts(
+          initial_lr, num_training_steps, t_mul=t_mul)
+      decayed_lr = _maybe_serialized(decayed_lr, serialize)
+      expected = self.np_cosine_decay_restarts(
+          step, num_training_steps, t_mul=t_mul)
+      self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+
+@parameterized.named_parameters(
+    ("NotSerialized", False),
+    ("Serialized", True))
+class LinearCosineDecayTestV2(test_util.TensorFlowTestCase,
+                              parameterized.TestCase):
+
+  def np_linear_cosine_decay(self,
+                             step,
+                             decay_steps,
+                             alpha=0.0,
+                             beta=0.001,
+                             num_periods=0.5):
+    step = min(step, decay_steps)
+    linear_decayed = float(decay_steps - step) / decay_steps
+    fraction = 2.0 * num_periods * step / float(decay_steps)
+    cosine_decayed = 0.5 * (1.0 + math.cos(math.pi * fraction))
+    return (alpha + linear_decayed) * cosine_decayed + beta
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDefaultDecay(self, serialize):
+    num_training_steps = 1000
+    initial_lr = 1.0
+    for step in range(0, 1500, 250):
+      decayed_lr = learning_rate_schedule.LinearCosineDecay(
+          initial_lr, num_training_steps)
+      decayed_lr = _maybe_serialized(decayed_lr, serialize)
+      expected = self.np_linear_cosine_decay(step, num_training_steps)
+      self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testNonDefaultDecay(self, serialize):
+    num_training_steps = 1000
+    initial_lr = 1.0
+    for step in range(0, 1500, 250):
+      decayed_lr = learning_rate_schedule.LinearCosineDecay(
+          initial_lr,
+          num_training_steps,
+          alpha=0.1,
+          beta=1e-4,
+          num_periods=5)
+      decayed_lr = _maybe_serialized(decayed_lr, serialize)
+      expected = self.np_linear_cosine_decay(
+          step, num_training_steps, alpha=0.1, beta=1e-4, num_periods=5)
+      self.assertAllClose(self.evaluate(decayed_lr(step)), expected, 1e-6)
+
+
+@parameterized.named_parameters(
+    ("NotSerialized", False),
+    ("Serialized", True))
+class NoisyLinearCosineDecayTestV2(test_util.TensorFlowTestCase,
+                                   parameterized.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDefaultNoisyLinearCosine(self, serialize):
+    num_training_steps = 1000
+    initial_lr = 1.0
+    for step in range(0, 1500, 250):
+      # No numerical check because of noise
+      decayed_lr = learning_rate_schedule.NoisyLinearCosineDecay(
+          initial_lr, num_training_steps)
+      decayed_lr = _maybe_serialized(decayed_lr, serialize)
+      # Cannot be deterministically tested
+      self.evaluate(decayed_lr(step))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testNonDefaultNoisyLinearCosine(self, serialize):
+    num_training_steps = 1000
+    initial_lr = 1.0
+    for step in range(0, 1500, 250):
+      # No numerical check because of noise
+      decayed_lr = learning_rate_schedule.NoisyLinearCosineDecay(
+          initial_lr,
+          num_training_steps,
+          initial_variance=0.5,
+          variance_decay=0.1,
+          alpha=0.1,
+          beta=1e-4,
+          num_periods=5)
+      decayed_lr = _maybe_serialized(decayed_lr, serialize)
+      # Cannot be deterministically tested
+      self.evaluate(decayed_lr(step))
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/python/keras/optimizer_v2/nadam.py b/tensorflow/python/keras/optimizer_v2/nadam.py
index 00b095e0dc950c7e68414c1657847b891652a5ba..77a897124be9620414a6c11b11d6b0b7636f6983 100644
--- a/tensorflow/python/keras/optimizer_v2/nadam.py
+++ b/tensorflow/python/keras/optimizer_v2/nadam.py
@@ -18,15 +18,18 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import ops
-from tensorflow.python.keras.optimizer_v2 import adam
+from tensorflow.python.keras import backend_config
+from tensorflow.python.keras.optimizer_v2 import learning_rate_schedule
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
-from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import keras_export
 
 
-class Nadam(adam.Adam):
+@keras_export('keras.optimizers.Nadam')
+class Nadam(optimizer_v2.OptimizerV2):
   r"""Optimizer that implements the NAdam algorithm.
 
   Much like Adam is essentially RMSprop with momentum, Nadam is Adam with
@@ -34,17 +37,21 @@ class Nadam(adam.Adam):
 
   Initialization:
 
-  $$m_0 := 0 \text{(Initialize initial 1st moment vector)}$$
-  $$v_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
+  $$m_0 := 0 \text{(Initialize 1st moment vector)}$$
+  $$v_0 := 0 \text{(Initialize 2nd moment vector)}$$
+  $$mu_0 := 1$$
   $$t := 0 \text{(Initialize timestep)}$$
 
   Computes:
   $$t := t + 1$$
-  $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
-  $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
-  $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
-  $$m_bar_t := beta_1 * v_t + (1 - beta_1) * g$$
-  $$theta_t := theta_{t-1} - lr_t * m_bar_t / (\sqrt{v_t} + \epsilon)$$
+  $$\mu_t := \beta_1 * (1 - 0.5 * 0.96^{0.004 * t})$$
+  $$g' := g / (1 - \prod_{i=1}^{t}{\mu_i})$$
+  $$m_t := \beta_1 * m_{t-1} + (1 - \beta_1) * g$$
+  $$m' := m_t / (1 - \prod_{i=1}^{t+1}{\mu_i})$$
+  $$v_t := \beta_2 * v_{t-1} + (1 - \beta_2) * g * g$$
+  $$v' := v_t / (1 - \beta_2^t)$$
+  $$\bar{m} := (1 - \mu_t) * g' + \mu_{t+1} * m'$$
+  $$\theta_t := \theta_{t-1} - lr * \bar{m} / (\sqrt{v'} + \epsilon)$$
 
   gradient is evaluated at theta(t) + momentum * v(t), and the variables always
   store theta + beta_1 * m / sqrt(v) instead of theta.
@@ -71,54 +78,95 @@ class Nadam(adam.Adam):
       epsilon: A small constant for numerical stability.
       name: Optional name for the operations created when applying gradients.
         Defaults to "Adamax".
-      **kwargs: keyword arguments. Allowed to be {`decay`}
+      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
+        `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
+        gradients by value, `decay` is included for backward compatibility to
+        allow time inverse decay of learning rate. `lr` is included for backward
+        compatibility, recommended to use `learning_rate` instead.
     """
 
-    # pylint: disable=useless-super-delegation
-    super(Nadam, self).__init__(
-        learning_rate=learning_rate,
-        beta_1=beta_1,
-        beta_2=beta_2,
-        epsilon=epsilon,
-        amsgrad=False,
-        name=name,
-        **kwargs)
-    # pylint: enable=useless-super-delegation
+    # Backwards compatiblity with keras NAdam optimizer.
+    kwargs['decay'] = kwargs.pop('schedule_decay', 0.004)
+    learning_rate = kwargs.get('lr', learning_rate)
+    if isinstance(learning_rate, learning_rate_schedule.LearningRateSchedule):
+      raise ValueError('The Nadam optimizer does not support '
+                       'tf.keras.optimizers.LearningRateSchedules as the '
+                       'learning rate.')
+
+    if epsilon is None:
+      epsilon = backend_config.epsilon()
+    super(Nadam, self).__init__(name, **kwargs)
+    self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
+    self._set_hyper('decay', self._initial_decay)
+    self._set_hyper('beta_1', beta_1)
+    self._set_hyper('beta_2', beta_2)
+    self._set_hyper('epsilon', epsilon)
+    self._m_cache = None
+
+  def _create_slots(self, var_list):
+    var_dtype = var_list[0].dtype.base_dtype
+    if self._m_cache is None:
+      self._m_cache = self.add_weight(
+          'momentum_cache',
+          shape=[],
+          dtype=var_dtype,
+          initializer='ones',
+          trainable=False)
+      self._weights.append(self._m_cache)
+    # Separate for-loops to respect the ordering of slot variables from v1.
+    for var in var_list:
+      # Create slots for the first moments.
+      self.add_slot(var, 'm')
+    for var in var_list:
+      # Create slots for the second moments.
+      self.add_slot(var, 'v')
+
+  def _prepare(self, var_list):
+    var_dtype = var_list[0].dtype.base_dtype
+    beta_1_t = self._get_hyper('beta_1', var_dtype)
+    local_step = math_ops.cast(self.iterations + 1, var_dtype)
+    decay_base = math_ops.cast(0.96, var_dtype)
+    self.m_cache_t = beta_1_t * (
+        1. - 0.5 * (math_ops.pow(decay_base, self._initial_decay * local_step)))
+    self.m_cache_t_1 = beta_1_t * (
+        1. - 0.5 *
+        (math_ops.pow(decay_base, self._initial_decay * (local_step + 1))))
+    m_schedule_new = self._m_cache * self.m_cache_t
+    self.m_schedule_new = state_ops.assign(
+        self._m_cache, m_schedule_new, use_locking=self._use_locking)
+    self.m_schedule_next = self.m_schedule_new * self.m_cache_t_1
 
   def _resource_apply_dense(self, grad, var):
     var_dtype = var.dtype.base_dtype
-    lr_t = self._decayed_lr(var_dtype)
+    lr_t = self._get_hyper('learning_rate', var_dtype)
+    epsilon_t = self._get_hyper('epsilon', var_dtype)
     m = self.get_slot(var, 'm')
     v = self.get_slot(var, 'v')
     beta_1_t = self._get_hyper('beta_1', var_dtype)
     beta_2_t = self._get_hyper('beta_2', var_dtype)
     local_step = math_ops.cast(self.iterations + 1, var_dtype)
-    beta_1_power = math_ops.pow(beta_1_t, local_step)
-    beta_2_power = math_ops.pow(beta_2_t, local_step)
-    return training_ops.resource_apply_adam(
-        var.handle,
-        m.handle,
-        v.handle,
-        beta_1_power,
-        beta_2_power,
-        lr_t,
-        beta_1_t,
-        beta_2_t,
-        self._get_hyper('epsilon', var_dtype),
-        grad,
-        use_locking=self._use_locking,
-        use_nesterov=True)
+
+    g_prime = grad / (1. - self.m_schedule_new)
+    m_t = beta_1_t * m + (1 - beta_1_t) * grad
+    m_t = state_ops.assign(m, m_t, use_locking=self._use_locking)
+    m_t_prime = m_t / (1. - self.m_schedule_next)
+    v_t = beta_2_t * v + (1 - beta_2_t) * math_ops.square(grad)
+    v_t = state_ops.assign(v, v_t, use_locking=self._use_locking)
+    v_t_prime = v_t / (1. - math_ops.pow(beta_2_t, local_step))
+    m_t_bar = (1. - self.m_cache_t) * g_prime + self.m_cache_t_1 * m_t_prime
+    var_t = var - lr_t * m_t_bar / (math_ops.sqrt(v_t_prime) + epsilon_t)
+    return state_ops.assign(var, var_t, use_locking=self._use_locking).op
 
   def _resource_apply_sparse(self, grad, var, indices):
     var_dtype = var.dtype.base_dtype
-    lr_t = self._decayed_lr(var_dtype)
+    lr_t = self._get_hyper('learning_rate', var_dtype)
+    epsilon_t = self._get_hyper('epsilon', var_dtype)
+    v = self.get_slot(var, 'v')
     beta_1_t = self._get_hyper('beta_1', var_dtype)
     beta_2_t = self._get_hyper('beta_2', var_dtype)
     local_step = math_ops.cast(self.iterations + 1, var_dtype)
-    beta_1_power = math_ops.pow(beta_1_t, local_step)
-    beta_2_power = math_ops.pow(beta_2_t, local_step)
-    epsilon_t = self._get_hyper('epsilon', var_dtype)
-    lr = (lr_t * math_ops.sqrt(1 - beta_2_power) / (1 - beta_1_power))
+
+    g_prime = grad / (1. - self.m_schedule_new)
 
     # m_t = beta1 * m + (1 - beta1) * g_t
     m = self.get_slot(var, 'm')
@@ -126,8 +174,10 @@ class Nadam(adam.Adam):
     m_t = state_ops.assign(m, m * beta_1_t, use_locking=self._use_locking)
     with ops.control_dependencies([m_t]):
       m_t = self._resource_scatter_add(m, indices, m_scaled_g_values)
-      # m_bar = (1 - beta1) * g_t + beta1 * m_t
-      m_bar = m_scaled_g_values + beta_1_t * array_ops.gather(m_t, indices)
+      m_t_slice = array_ops.gather(m_t, indices)
+
+    m_t_prime = m_t_slice / (1. - self.m_schedule_next)
+    m_t_bar = (1. - self.m_cache_t) * g_prime + self.m_cache_t_1 * m_t_prime
 
     # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
     v = self.get_slot(var, 'v')
@@ -135,9 +185,22 @@ class Nadam(adam.Adam):
     v_t = state_ops.assign(v, v * beta_2_t, use_locking=self._use_locking)
     with ops.control_dependencies([v_t]):
       v_t = self._resource_scatter_add(v, indices, v_scaled_g_values)
-
-    v_t_slice = array_ops.gather(v_t, indices)
-    v_sqrt = math_ops.sqrt(v_t_slice)
-    var_update = self._resource_scatter_add(var, indices,
-                                            -lr * m_bar / (v_sqrt + epsilon_t))
-    return control_flow_ops.group(*[var_update, m_bar, v_t])
+      v_t_slice = array_ops.gather(v_t, indices)
+
+    v_t_prime = v_t_slice / (1. - math_ops.pow(beta_2_t, local_step))
+    v_prime_sqrt = math_ops.sqrt(v_t_prime)
+
+    var_update = self._resource_scatter_add(
+        var, indices, -lr_t * m_t_bar / (v_prime_sqrt + epsilon_t))
+    return control_flow_ops.group(*[var_update, m_t_bar, v_t])
+
+  def get_config(self):
+    config = super(Nadam, self).get_config()
+    config.update({
+        'learning_rate': self._serialize_hyperparameter('learning_rate'),
+        'decay': self._serialize_hyperparameter('decay'),
+        'beta_1': self._serialize_hyperparameter('beta_1'),
+        'beta_2': self._serialize_hyperparameter('beta_2'),
+        'epsilon': self._serialize_hyperparameter('epsilon'),
+    })
+    return config
diff --git a/tensorflow/python/keras/optimizer_v2/nadam_test.py b/tensorflow/python/keras/optimizer_v2/nadam_test.py
index d991e3117cad4530ffb1f3a4315b49dc46d26bfc..8dd61956f6f1efcbf11c8e8379ac0c5eac2cc5ef 100644
--- a/tensorflow/python/keras/optimizer_v2/nadam_test.py
+++ b/tensorflow/python/keras/optimizer_v2/nadam_test.py
@@ -40,45 +40,54 @@ def get_beta_accumulators(opt, dtype):
   return (beta_1_power, beta_2_power)
 
 
+def update_m_cache(m_cache, t, beta1=0.9):
+  mu_t = beta1 * (1 - 0.5 * 0.96**(0.004 * (t + 1)))
+  m_cache_t = m_cache * mu_t
+  return m_cache_t
+
+
 def nadam_update_numpy(param,
                        g_t,
                        t,
                        m,
                        v,
+                       m_cache,
                        alpha=0.001,
                        beta1=0.9,
                        beta2=0.999,
                        epsilon=1e-8):
-  alpha_t = alpha * np.sqrt(1 - beta2**(t + 1)) / (1 - beta1**(t + 1))
 
+  mu_t = beta1 * (1 - 0.5 * 0.96**(0.004 * (t + 1)))
+  mu_t_1 = beta1 * (1 - 0.5 * 0.96**(0.004 * (t + 2)))
+  m_cache_t_1 = m_cache * mu_t_1
+  g_prime_t = g_t / (1 - m_cache)
   m_t = beta1 * m + (1 - beta1) * g_t
   v_t = beta2 * v + (1 - beta2) * g_t * g_t
 
-  m_bar = (1 - beta1) * g_t + beta1 * m_t
+  m_prime_t = m_t / (1 - m_cache_t_1)
+  v_prime_t = v_t / (1 - beta2**(t + 1))
+  m_bar_t = (1 - mu_t) * g_prime_t + mu_t_1 * m_prime_t
 
-  param_t = param - alpha_t * m_bar / (np.sqrt(v_t) + epsilon)
+  param_t = param - alpha * m_bar_t / (np.sqrt(v_prime_t) + epsilon)
   return param_t, m_t, v_t
 
 
 class NadamOptimizerTest(test.TestCase):
 
-  def doTestSparse(self, use_resource=False):
+  @test_util.run_deprecated_v1
+  def testSparse(self):
     sparse_epsilon = 1e-7
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
         # Initialize variables for numpy implementation.
-        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        m0, v0, m1, v1, mcache = 0.0, 0.0, 0.0, 0.0, 1.0
         var0_np = np.array([1.0, 1.0, 2.0], dtype=dtype.as_numpy_dtype)
         grads0_np = np.array([0.1, 0, 0.1], dtype=dtype.as_numpy_dtype)
         var1_np = np.array([3.0, 3.0, 4.0], dtype=dtype.as_numpy_dtype)
         grads1_np = np.array([0.01, 0, 0.01], dtype=dtype.as_numpy_dtype)
 
-        if use_resource:
-          var0 = resource_variable_ops.ResourceVariable(var0_np)
-          var1 = resource_variable_ops.ResourceVariable(var1_np)
-        else:
-          var0 = variables.Variable(var0_np)
-          var1 = variables.Variable(var1_np)
+        var0 = resource_variable_ops.ResourceVariable(var0_np)
+        var1 = resource_variable_ops.ResourceVariable(var1_np)
         grads0_np_indices = np.array([0, 2], dtype=np.int32)
         grads0 = ops.IndexedSlices(
             constant_op.constant(grads0_np[grads0_np_indices]),
@@ -103,74 +112,22 @@ class NadamOptimizerTest(test.TestCase):
           self.assertAllCloseAccordingToType(0.999**(t + 1), beta2_power.eval())
           update.run()
 
+          mcache = update_m_cache(mcache, t)
           var0_np, m0, v0 = nadam_update_numpy(
-              var0_np, grads0_np, t, m0, v0, epsilon=sparse_epsilon)
+              var0_np, grads0_np, t, m0, v0, mcache, epsilon=sparse_epsilon)
           var1_np, m1, v1 = nadam_update_numpy(
-              var1_np, grads1_np, t, m1, v1, epsilon=sparse_epsilon)
-
-          # Validate updated params
-          self.assertAllCloseAccordingToType(var0_np, var0.eval())
-          self.assertAllCloseAccordingToType(var1_np, var1.eval())
-
-  @test_util.run_deprecated_v1
-  def testSparse(self):
-    self.doTestSparse(use_resource=False)
-
-  @test_util.run_deprecated_v1
-  def testResourceSparse(self):
-    self.doTestSparse(use_resource=True)
-
-  def doTestBasic(self, use_resource=False):
-    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.cached_session():
-        # Initialize variables for numpy implementation.
-        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
-        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
-        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
-        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
-        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
-
-        if use_resource:
-          var0 = resource_variable_ops.ResourceVariable(var0_np)
-          var1 = resource_variable_ops.ResourceVariable(var1_np)
-        else:
-          var0 = variables.Variable(var0_np)
-          var1 = variables.Variable(var1_np)
-        grads0 = constant_op.constant(grads0_np)
-        grads1 = constant_op.constant(grads1_np)
-        opt = nadam.Nadam()
-        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
-        variables.global_variables_initializer().run()
-
-        # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
-
-        beta1_power, beta2_power = get_beta_accumulators(opt, dtype)
-
-        # Run 3 steps of Nadam
-        for t in range(3):
-          self.assertAllCloseAccordingToType(0.9**(t + 1), beta1_power.eval())
-          self.assertAllCloseAccordingToType(0.999**(t + 1), beta2_power.eval())
-          update.run()
-
-          var0_np, m0, v0 = nadam_update_numpy(var0_np, grads0_np, t, m0, v0)
-          var1_np, m1, v1 = nadam_update_numpy(var1_np, grads1_np, t, m1, v1)
+              var1_np, grads1_np, t, m1, v1, mcache, epsilon=sparse_epsilon)
 
           # Validate updated params
           self.assertAllCloseAccordingToType(var0_np, var0.eval())
           self.assertAllCloseAccordingToType(var1_np, var1.eval())
 
   @test_util.run_deprecated_v1
-  def testResourceBasic(self):
-    self.doTestBasic(use_resource=True)
-
-  @test_util.run_deprecated_v1
-  def testBasicWithLearningRateDecay(self):
+  def testBasic(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
         # Initialize variables for numpy implementation.
-        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        m0, v0, m1, v1, mcache = 0.0, 0.0, 0.0, 0.0, 1.0
         var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
         grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
         var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
@@ -180,9 +137,7 @@ class NadamOptimizerTest(test.TestCase):
         var1 = resource_variable_ops.ResourceVariable(var1_np)
         grads0 = constant_op.constant(grads0_np)
         grads1 = constant_op.constant(grads1_np)
-        learning_rate = 0.001
-        decay = 0.5
-        opt = nadam.Nadam(learning_rate=learning_rate, decay=decay)
+        opt = nadam.Nadam()
         update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
 
@@ -190,24 +145,48 @@ class NadamOptimizerTest(test.TestCase):
         self.assertAllClose([1.0, 2.0], var0.eval())
         self.assertAllClose([3.0, 4.0], var1.eval())
 
-        beta1_power, beta2_power = get_beta_accumulators(opt, dtype)
-
         # Run 3 steps of Nadam
         for t in range(3):
-          self.assertAllCloseAccordingToType(0.9**(t + 1), beta1_power.eval())
-          self.assertAllCloseAccordingToType(0.999**(t + 1), beta2_power.eval())
           update.run()
 
-          lr = learning_rate / (1 + decay * t)
-          var0_np, m0, v0 = nadam_update_numpy(
-              var0_np, grads0_np, t, m0, v0, alpha=lr)
-          var1_np, m1, v1 = nadam_update_numpy(
-              var1_np, grads1_np, t, m1, v1, alpha=lr)
+          mcache = update_m_cache(mcache, t)
+          var0_np, m0, v0 = nadam_update_numpy(var0_np, grads0_np, t, m0, v0,
+                                               mcache)
+          var1_np, m1, v1 = nadam_update_numpy(var1_np, grads1_np, t, m1, v1,
+                                               mcache)
 
           # Validate updated params
           self.assertAllCloseAccordingToType(var0_np, var0.eval())
           self.assertAllCloseAccordingToType(var1_np, var1.eval())
 
+  def testConstructNAdamWithLR(self):
+    opt = nadam.Nadam(lr=1.0)
+    opt_2 = nadam.Nadam(learning_rate=0.1, lr=1.0)
+    opt_3 = nadam.Nadam(learning_rate=0.1)
+    self.assertIsInstance(opt.lr, variables.Variable)
+    self.assertIsInstance(opt_2.lr, variables.Variable)
+    self.assertIsInstance(opt_3.lr, variables.Variable)
+
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllClose(self.evaluate(opt.lr), (1.0))
+    self.assertAllClose(self.evaluate(opt_2.lr), (1.0))
+    self.assertAllClose(self.evaluate(opt_3.lr), (0.1))
+
+  def testConstructNAdamWithScheduleDecay(self):
+    opt = nadam.Nadam(schedule_decay=0.2)
+    self.assertIsInstance(opt.decay, variables.Variable)
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllClose(self.evaluate(opt.decay), (0.2))
+
+  def testConstructNAdamWithEpsilonValues(self):
+    opt = nadam.Nadam(epsilon=None)
+    config = opt.get_config()
+    self.assertEqual(config["epsilon"], 1e-7)
+
+    opt = nadam.Nadam(epsilon=1e-8)
+    config = opt.get_config()
+    self.assertEqual(config["epsilon"], 1e-8)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/keras/optimizer_v2/optimizer_v2.py b/tensorflow/python/keras/optimizer_v2/optimizer_v2.py
index 15f3009a4af4270f2f845f6c5bf945f330efe545..26369e976462806c39e3f6a3f3c2f7e15c6b10f2 100644
--- a/tensorflow/python/keras/optimizer_v2/optimizer_v2.py
+++ b/tensorflow/python/keras/optimizer_v2/optimizer_v2.py
@@ -21,12 +21,13 @@ from __future__ import division
 from __future__ import print_function
 
 import abc
+import functools
 
 import six
 
-from tensorflow.python.distribute import distribute_lib
 from tensorflow.python.distribute import distribution_strategy_context as distribute_ctx
 from tensorflow.python.distribute import reduce_util as ds_reduce_util
+from tensorflow.python.distribute import values as distributed_values
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
@@ -34,38 +35,62 @@ from tensorflow.python.framework import ops
 from tensorflow.python.keras import backend
 from tensorflow.python.keras import initializers
 from tensorflow.python.keras.engine import base_layer_utils
+from tensorflow.python.keras.optimizer_v2 import learning_rate_schedule
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import gradients
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import optimizer as optimizer_v1
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import keras_export
+
+
+def _deduplicate_indexed_slices(values, indices):
+  """Sums `values` associated with any non-unique `indices`.
+
+  Args:
+    values: A `Tensor` with rank >= 1.
+    indices: A one-dimensional integer `Tensor`, indexing into the first
+      dimension of `values` (as in an IndexedSlices object).
+
+  Returns:
+    A tuple of (`summed_values`, `unique_indices`) where `unique_indices` is a
+    de-duplicated version of `indices` and `summed_values` contains the sum of
+    `values` slices associated with each unique index.
+  """
+  unique_indices, new_index_positions = array_ops.unique(indices)
+  summed_values = math_ops.unsorted_segment_sum(
+      values, new_index_positions,
+      array_ops.shape(unique_indices)[0])
+  return (summed_values, unique_indices)
 
 
 @six.add_metaclass(abc.ABCMeta)
-class OptimizerV2(optimizer_v1.Optimizer):
+@keras_export("keras.optimizers.Optimizer")
+class OptimizerV2(trackable.Trackable):
   """Updated base class for optimizers.
 
   This class defines the API to add Ops to train a model.  You never use this
   class directly, but instead instantiate one of its subclasses such as
-  `GradientDescentOptimizer`, `AdagradOptimizer`, or `MomentumOptimizer`.
+  `tf.keras.optimizers.SGD`, `tf.keras.optimizers.Adam`.
 
   ### Usage
 
   ```python
   # Create an optimizer with the desired parameters.
-  opt = GradientDescentOptimizer(learning_rate=0.1)
-  # Add Ops to the graph to minimize a cost by updating a list of variables.
-  # "cost" is a Tensor, and the list of variables contains tf.Variable
-  # objects.
-  opt_op = opt.minimize(cost, var_list=<list of variables>)
-  ```
-
-  In the training program you will just have to run the returned Op.
-
-  ```python
-  # Execute opt_op to do one step of training:
+  opt = tf.keras.optimizers.SGD(learning_rate=0.1)
+  # `loss` is a callable that takes no argument and returns the value
+  # to minimize.
+  loss = lambda: 3 * var1 * var1 + 2 * var2 * var2
+  # In graph mode, returns op that minimizes the loss by updating the listed
+  # variables.
+  opt_op = opt.minimize(loss, var_list=[var1, var2])
   opt_op.run()
+  # In eager mode, simply call minimize to update the list of variables.
+  opt.minimize(loss, var_list=[var1, var2])
   ```
 
   ### Processing gradients before applying them.
@@ -74,7 +99,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
   applying them to the variables.  If you want to process the gradients
   before applying them you can instead use the optimizer in three steps:
 
-  1.  Compute the gradients with `compute_gradients()`.
+  1.  Compute the gradients with `tf.GradientTape`.
   2.  Process the gradients as you wish.
   3.  Apply the processed gradients with `apply_gradients()`.
 
@@ -82,10 +107,15 @@ class OptimizerV2(optimizer_v1.Optimizer):
 
   ```python
   # Create an optimizer.
-  opt = GradientDescentOptimizer(learning_rate=0.1)
+  opt = tf.keras.optimizers.SGD(learning_rate=0.1)
 
   # Compute the gradients for a list of variables.
-  grads_and_vars = opt.compute_gradients(loss, <list of variables>)
+  with tf.GradientTape() as tape:
+    loss = <call_loss_function>
+  vars = <list_of_variables>
+  grads = tape.gradient(loss, vars)
+  processed_grads = [process_gradient(g) for g in grads]
+  grads_and_vars = zip(processed_grads, var_list)
 
   # grads_and_vars is a list of tuples (gradient, variable).  Do whatever you
   # need to the 'gradient' part, for example cap them, etc.
@@ -95,13 +125,43 @@ class OptimizerV2(optimizer_v1.Optimizer):
   opt.apply_gradients(capped_grads_and_vars)
   ```
 
+  ### Use with `tf.distribute.Strategy`.
+
+  This optimizer class is `tf.distribute.Strategy` aware, which means it
+  automatically sums gradients across all replicas. To average gradients,
+  you divide your loss by the global batch size, which is done automatically
+  if you use a member of `tf.keras.losses` or `tf.losses`. See the
+  `reduction` argument of your loss which should be set to
+  `tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE` for averaging or
+  `tf.keras.losses.Reduction.SUM` for not.
+
+  If you are not using these and you want to average gradients, you should use
+  `tf.math.reduce_sum` to add up your per-example losses and then divide by the
+  global batch size. Note that when using `tf.distribute.Strategy`, the first
+  component of a tensor's shape is the *replica-local* batch size, which is off
+  by a factor equal to the number of replicas being used to compute a single
+  step. As a result, using `tf.math.reduce_mean` will give the wrong answer,
+  resulting in gradients that can be many times too big.
+
+  ### Variable Constraint
+
+  All Keras optimizers respect variable constraints. If constraint function is
+  passed to any variable, the constraint will be applied to the variable after
+  the gradient has been applied to the variable.
+  Important: If gradient is sparse tensor, variable constraint is not supported.
+
+  ### Thread Compatibility
+
+  The entire optimizer is currently thread compatible, not thread-safe. The user
+  needs to perform synchronization if necessary.
+
   ### Slots
 
-  Some optimizer subclasses, such as `MomentumOptimizer` and `AdagradOptimizer`
-  allocate and manage additional variables associated with the variables to
-  train.  These are called <i>Slots</i>.  Slots have names and you can ask the
-  optimizer for the names of the slots that it uses.  Once you have a slot name
-  you can ask the optimizer for the variable it created to hold the slot value.
+  Many optimizer subclasses, such as `Adam` and `Adagrad` allocate and manage
+  additional variables associated with the variables to train.  These are called
+  <i>Slots</i>.  Slots have names and you can ask the optimizer for the names of
+  the slots that it uses.  Once you have a slot name you can ask the optimizer
+  for the variable it created to hold the slot value.
 
   This can be useful if you want to log debug a training algorithm, report stats
   about the slots, etc.
@@ -114,6 +174,31 @@ class OptimizerV2(optimizer_v1.Optimizer):
   callables. If they are callable, the callable will be called during
   `apply_gradients()` to get the value for the hyper parameter.
 
+  Hyper parameters can be overwritten through user code:
+
+  Example:
+
+  ```python
+  # Create an optimizer with the desired parameters.
+  opt = tf.keras.optimizers.SGD(learning_rate=0.1)
+  # `loss` is a callable that takes no argument and returns the value
+  # to minimize.
+  loss = lambda: 3 * var1 + 2 * var2
+  # In eager mode, simply call minimize to update the list of variables.
+  opt.minimize(loss, var_list=[var1, var2])
+  # update learning rate
+  opt.learning_rate = 0.05
+  opt.minimize(loss, var_list=[var1, var2])
+  ```
+
+  ### Write a customized optimizer.
+  If you intend to create your own optimization algorithm, simply inherit from
+  this class and override the following methods:
+
+    - resource_apply_dense (update variable given gradient tensor is dense)
+    - resource_apply_sparse (update variable given gradient tensor is sparse)
+    - create_slots (if your optimizer algorithm requires additional variables)
+    - get_config (serialization of the optimizer, include all hyper parameters)
   """
 
   def __init__(self, name, **kwargs):
@@ -130,51 +215,68 @@ class OptimizerV2(optimizer_v1.Optimizer):
     Args:
       name: A non-empty string.  The name to use for accumulators created
         for the optimizer.
-      **kwargs: keyword arguments. Allowed to be {`decay`}
+      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
+        `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
+        gradients by value, `decay` is included for backward compatibility to
+        allow time inverse decay of learning rate. `lr` is included for backward
+        compatibility, recommended to use `learning_rate` instead.
 
     Raises:
       ValueError: If name is malformed.
       RuntimeError: If _create_slots has been overridden instead of
           _create_vars.
     """
+    allowed_kwargs = {"clipnorm", "clipvalue", "lr", "decay"}
+    for k in kwargs:
+      if k not in allowed_kwargs:
+        raise TypeError("Unexpected keyword argument "
+                        "passed to optimizer: " + str(k))
+      # checks that all keyword arguments are non-negative.
+      if kwargs[k] < 0:
+        raise ValueError("Expected {} >= 0, received: {}".format(k, kwargs[k]))
+
     self._use_locking = True
-    super(OptimizerV2, self).__init__(self._use_locking, name)
+    self._name = name
     self._hyper = {}
     # dict: {variable name : {slot name : variable}}
     self._slots = {}
+    self._slot_names = []
     self._weights = []
+    self._iterations = None
+
+    # For implementing Trackable. Stores information about how to restore
+    # slot variables which have not yet been created
+    # (trackable._CheckpointPosition objects).
+    #  {slot_name :
+    #      {_var_key(variable_to_train): [checkpoint_position, ... ], ... },
+    #   ... }
+    self._deferred_slot_restorations = {}
 
     decay = kwargs.pop("decay", 0.0)
     if decay < 0.:
       raise ValueError("decay cannot be less than 0: {}".format(decay))
     self._initial_decay = decay
+    if "clipnorm" in kwargs:
+      self.clipnorm = kwargs.pop("clipnorm")
+    if "clipvalue" in kwargs:
+      self.clipvalue = kwargs.pop("clipvalue")
 
-    self._prepared = False
+    self._hypers_created = False
 
-  def minimize(self,
-               loss,
-               var_list,
-               aggregation_method=None,
-               colocate_gradients_with_ops=False,
-               name=None,
-               grad_loss=None):
+  def minimize(self, loss, var_list, grad_loss=None, name=None):
     """Add operations to minimize `loss` by updating `var_list`.
 
-    This method simply combines calls `compute_gradients()` and
+    This method simply computes gradient using `tf.GradientTape` and calls
     `apply_gradients()`. If you want to process the gradient before applying
-    them call `compute_gradients()` and `apply_gradients()` explicitly instead
+    then call `tf.GradientTape` and `apply_gradients()` explicitly instead
     of using this function.
 
     Args:
-      loss: A `Tensor` containing the value to minimize.
+      loss: A callable taking no arguments which returns the value to minimize.
       var_list: list or tuple of `Variable` objects to update to minimize
         `loss`.
-      aggregation_method: Specifies the method used to combine gradient terms.
-        Valid values are defined in the class `AggregationMethod`.
-      colocate_gradients_with_ops: If True, try colocating gradients with the
-        corresponding op.
-      name: Optional name for the returned operation.
       grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`.
+      name: Optional name for the returned operation.
 
     Returns:
       An Operation that updates the variables in `var_list`.  If `global_step`
@@ -186,29 +288,16 @@ class OptimizerV2(optimizer_v1.Optimizer):
     @compatibility(eager)
     When eager execution is enabled, `loss` should be a Python function that
     takes no arguments and computes the value to be minimized. Minimization (and
-    gradient computation) is done with respect to the elements of `var_list` if
-    not None, else with respect to any trainable variables created during the
-    execution of the `loss` function. `gate_gradients`, `aggregation_method`,
-    `colocate_gradients_with_ops` and `grad_loss` are ignored when eager
-    execution is enabled.
+    gradient computation) is done with respect to the elements of `var_list`.
+    `grad_loss` is ignored when eager execution is enabled.
     @end_compatibility
     """
-    grads_and_vars = self.compute_gradients(
-        loss,
-        var_list=var_list,
-        aggregation_method=aggregation_method,
-        colocate_gradients_with_ops=colocate_gradients_with_ops,
-        grad_loss=grad_loss)
+    grads_and_vars = self._compute_gradients(
+        loss, var_list=var_list, grad_loss=grad_loss)
 
     return self.apply_gradients(grads_and_vars, name=name)
 
-  def compute_gradients(self,
-                        loss,
-                        var_list,
-                        aggregation_method=None,
-                        colocate_gradients_with_ops=False,
-                        grad_loss=None,
-                        stop_gradients=None):
+  def _compute_gradients(self, loss, var_list, grad_loss=None):
     """Compute gradients of `loss` for the variables in `var_list`.
 
     This is the first part of `minimize()`.  It returns a list
@@ -218,19 +307,11 @@ class OptimizerV2(optimizer_v1.Optimizer):
     given variable.
 
     Args:
-      loss: A Tensor containing the value to minimize or a callable taking no
-        arguments which returns the value to minimize. When eager execution is
-        enabled it must be a callable.
-      var_list: Optional list or tuple of `tf.Variable` to update to minimize
+      loss: A callable taking no arguments which returns the value to minimize.
+      var_list: List or tuple of `tf.Variable` to update to minimize
         `loss`.  Defaults to the list of variables collected in the graph under
         the key `GraphKeys.TRAINABLE_VARIABLES`.
-      aggregation_method: Specifies the method used to combine gradient terms.
-        Valid values are defined in the class `AggregationMethod`.
-      colocate_gradients_with_ops: If True, try colocating gradients with the
-        corresponding op.
       grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`.
-      stop_gradients: Optional. A Tensor or list of tensors not to differentiate
-        through.
 
     Returns:
       A list of (gradient, variable) pairs. Variable is always present, but
@@ -239,38 +320,21 @@ class OptimizerV2(optimizer_v1.Optimizer):
     Raises:
       TypeError: If `var_list` contains anything else than `Variable` objects.
       ValueError: If some arguments are invalid, or var_list is None.
-      RuntimeError: If called with eager execution enabled and `loss` is
-        not callable.
-
-    @compatibility(eager)
-    When eager execution is enabled, `aggregation_method`, and
-    `colocate_gradients_with_ops` are ignored.
-    @end_compatibility
     """
     var_list = nest.flatten(var_list)
     # TODO(josh11b): Test that we handle weight decay in a reasonable way.
-    if callable(loss):
-      with backprop.GradientTape() as tape:
-        tape.watch(var_list)
-        loss_value = loss()
-        loss_value = self._scale_loss(loss_value)
-      grads = tape.gradient(loss_value, var_list, grad_loss)
-    else:
-      if context.executing_eagerly():
-        raise RuntimeError("`loss` passed to Optimizer.compute_gradients "
-                           "should be a function when eager execution is "
-                           "enabled.")
-      loss = self._scale_loss(loss)
-      self._assert_valid_dtypes([loss])
-      if grad_loss is not None:
-        self._assert_valid_dtypes([grad_loss])
-      grads = gradients.gradients(
-          loss,
-          var_list,
-          grad_ys=grad_loss,
-          aggregation_method=aggregation_method,
-          colocate_gradients_with_ops=colocate_gradients_with_ops,
-          stop_gradients=stop_gradients)
+    with backprop.GradientTape() as tape:
+      tape.watch(var_list)
+      loss_value = loss()
+    grads = tape.gradient(loss_value, var_list, grad_loss)
+
+    if hasattr(self, "clipnorm"):
+      grads = [clip_ops.clip_by_norm(g, self.clipnorm) for g in grads]
+    if hasattr(self, "clipvalue"):
+      grads = [
+          clip_ops.clip_by_value(g, -self.clipvalue, self.clipvalue)
+          for g in grads
+      ]
 
     grads_and_vars = list(zip(grads, var_list))
     self._assert_valid_dtypes([
@@ -280,14 +344,35 @@ class OptimizerV2(optimizer_v1.Optimizer):
 
     return grads_and_vars
 
-  @staticmethod
-  def _scale_loss(loss_value):
-    if distribute_lib.get_loss_reduction() == ds_reduce_util.ReduceOp.MEAN:
-      num_replicas = \
-        distribute_ctx.get_distribution_strategy().num_replicas_in_sync
-      if num_replicas > 1:
-        loss_value *= (1. / num_replicas)
-    return loss_value
+  def get_gradients(self, loss, params):
+    """Returns gradients of `loss` with respect to `params`.
+
+    Arguments:
+      loss: Loss tensor.
+      params: List of variables.
+
+    Returns:
+      List of gradient tensors.
+
+    Raises:
+      ValueError: In case any gradient cannot be computed (e.g. if gradient
+        function not implemented).
+    """
+    grads = gradients.gradients(loss, params)
+    if None in grads:
+      raise ValueError("An operation has `None` for gradient. "
+                       "Please make sure that all of your ops have a "
+                       "gradient defined (i.e. are differentiable). "
+                       "Common ops without gradient: "
+                       "K.argmax, K.round, K.eval.")
+    if hasattr(self, "clipnorm"):
+      grads = [clip_ops.clip_by_norm(g, self.clipnorm) for g in grads]
+    if hasattr(self, "clipvalue"):
+      grads = [
+          clip_ops.clip_by_value(g, -self.clipvalue, self.clipvalue)
+          for g in grads
+      ]
+    return grads
 
   def apply_gradients(self, grads_and_vars, name=None):
     """Apply gradients to variables.
@@ -296,8 +381,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
     applies gradients.
 
     Args:
-      grads_and_vars: List of (gradient, variable) pairs as returned by
-        `compute_gradients()`.
+      grads_and_vars: List of (gradient, variable) pairs.
       name: Optional name for the returned operation.  Default to the name
         passed to the `Optimizer` constructor.
 
@@ -311,16 +395,24 @@ class OptimizerV2(optimizer_v1.Optimizer):
     """
     grads_and_vars = _filter_grads(grads_and_vars)
     var_list = [v for (_, v) in grads_and_vars]
-    if distribute_ctx.has_distribution_strategy():
-      reduced_grads = merge_grads(grads_and_vars)
-      grads_and_vars = zip(reduced_grads, var_list)
 
+    self._create_hypers()
     with ops.init_scope():
-      self._prepare()
       self._create_slots(var_list)
-    update_ops = []
 
-    def update_grad_to_var(grad, var):
+    self._prepare(var_list)
+
+    return distribute_ctx.get_replica_context().merge_call(
+        self._distributed_apply, args=(grads_and_vars,), kwargs={"name": name})
+
+  def _distributed_apply(self, distribution, grads_and_vars, name):
+    """`apply_gradients` using a `DistributionStrategy`."""
+    reduced_grads = distribution.extended.batch_reduce_to(
+        ds_reduce_util.ReduceOp.SUM, grads_and_vars)
+    var_list = [v for _, v in grads_and_vars]
+    grads_and_vars = zip(reduced_grads, var_list)
+
+    def apply_grad_to_update_var(var, grad):
       """Apply gradient to variable."""
       if isinstance(var, ops.Tensor):
         raise NotImplementedError("Trying to update a Tensor ", var)
@@ -337,21 +429,29 @@ class OptimizerV2(optimizer_v1.Optimizer):
       else:
         return update_op
 
+    update_ops = []
     with ops.name_scope(name, self._name) as name:
       for grad, var in grads_and_vars:
         scope_name = ("" if ops.executing_eagerly_outside_functions() else
                       "_" + var.op.name)
         with ops.name_scope("update" + scope_name):
-          update_ops.append(update_grad_to_var(grad, var))
-      # control dependencies does not work in per replica mode, please change
-      # this once b/118841692 is fixed.
-      # with ops.control_dependencies(update_ops):
-      #   apply_updates = self._iterations.assign_add(1).op
-      apply_updates = merge_update_step(update_ops, self.iterations)
+          update_ops.extend(
+              distribution.extended.update(
+                  var, apply_grad_to_update_var, args=(grad,), group=False))
+      with ops.control_dependencies(update_ops):
+        apply_updates = self._iterations.assign_add(1)
+      if not context.executing_eagerly():
+        apply_updates = apply_updates.op
       return apply_updates
 
   def get_updates(self, loss, params):
-    return [self.minimize(loss, params)]
+    grads = self.get_gradients(loss, params)
+    grads_and_vars = list(zip(grads, params))
+    self._assert_valid_dtypes([
+        v for g, v in grads_and_vars
+        if g is not None and v.dtype != dtypes.resource
+    ])
+    return [self.apply_gradients(grads_and_vars)]
 
   def _set_hyper(self, name, value):
     """set hyper `name` to value. value can be callable, tensor, numeric."""
@@ -359,14 +459,21 @@ class OptimizerV2(optimizer_v1.Optimizer):
       self._hyper[name] = value
     else:
       prev_value = self._hyper[name]
-      if callable(prev_value) or isinstance(prev_value,
-                                            (ops.Tensor, int, float)):
+      if (callable(prev_value)
+          or isinstance(prev_value,
+                        (ops.Tensor, int, float,
+                         learning_rate_schedule.LearningRateSchedule))
+          or isinstance(value, learning_rate_schedule.LearningRateSchedule)):
         self._hyper[name] = value
       else:
         backend.set_value(self._hyper[name], value)
 
   def _get_hyper(self, name, dtype=None):
+    if not self._hypers_created:
+      self._create_hypers()
     value = self._hyper[name]
+    if isinstance(value, learning_rate_schedule.LearningRateSchedule):
+      return value
     if callable(value):
       value = value()
     if dtype:
@@ -386,7 +493,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
       if name == "lr":
         name = "learning_rate"
       if name in self._hyper:
-        return self._hyper[name]
+        return self._get_hyper(name)
       raise e
 
   def __setattr__(self, name, value):
@@ -399,38 +506,60 @@ class OptimizerV2(optimizer_v1.Optimizer):
     else:
       super(OptimizerV2, self).__setattr__(name, value)
 
+  def get_slot_names(self):
+    """A list of names for this optimizer's slots."""
+    return self._slot_names
+
   def add_slot(self, var, slot_name, initializer="zeros"):
+    """Add a new slot variable for `var`."""
+    if slot_name not in self._slot_names:
+      self._slot_names.append(slot_name)
     var_key = _var_key(var)
     slot_dict = self._slots.setdefault(var_key, {})
-    if slot_name not in slot_dict:
-      slot_key = _get_slot_key_from_var(var, slot_name)
-      weight = self.add_weight(
-          name=slot_key,
-          shape=var.shape,
+    weight = slot_dict.get(slot_name, None)
+    if weight is None:
+      if isinstance(initializer, six.string_types) or callable(initializer):
+        initializer = initializers.get(initializer)
+        initial_value = functools.partial(
+            initializer, shape=var.shape, dtype=var.dtype)
+      else:
+        initial_value = initializer
+      weight = tf_variables.Variable(
+          name="%s/%s" % (var._shared_name, slot_name),  # pylint: disable=protected-access
           dtype=var.dtype,
-          initializer=initializer)
+          trainable=False,
+          initial_value=initial_value)
+      backend.track_variable(weight)
       slot_dict[slot_name] = weight
+      self._restore_slot_variable(
+          slot_name=slot_name, variable=var,
+          slot_variable=weight)
       self._weights.append(weight)
+    return weight
 
   def get_slot(self, var, slot_name):
     var_key = _var_key(var)
     slot_dict = self._slots[var_key]
     return slot_dict[slot_name]
 
-  def _prepare(self):
-    if self._prepared:
+  def _prepare(self, var_list):
+    pass
+
+  def _create_hypers(self):
+    if self._hypers_created:
       return
-    with ops.device("cpu:0"):
-      self._iterations = self.add_weight(
-          "iter",
-          shape=[],
-          dtype=dtypes.int64,
-          trainable=False,
-          aggregation=tf_variables.VariableAggregation.ONLY_FIRST_REPLICA)
-      self._weights.append(self._iterations)
+    if self._iterations is None:
+      with ops.device("cpu:0"):
+        self._iterations = self.add_weight(
+            "iter",
+            shape=[],
+            dtype=dtypes.int64,
+            trainable=False,
+            aggregation=tf_variables.VariableAggregation.ONLY_FIRST_REPLICA)
+        self._weights.append(self._iterations)
     for name, value in self._hyper.items():
       if isinstance(value, ops.Tensor) or callable(value):
-        pass
+        continue
       else:
         self._hyper[name] = self.add_weight(
             name,
@@ -438,17 +567,29 @@ class OptimizerV2(optimizer_v1.Optimizer):
             trainable=False,
             initializer=value,
             aggregation=tf_variables.VariableAggregation.ONLY_FIRST_REPLICA)
-    self._prepared = True
+    self._hypers_created = True
 
   @property
   def iterations(self):
-    if not self._prepared:
-      self._prepare()
+    """Variable. The number of training steps this Optimizer has run."""
+    if not self._hypers_created:
+      self._create_hypers()
     return self._iterations
 
+  @iterations.setter
+  def iterations(self, variable):
+    if self._hypers_created:
+      raise RuntimeError("Cannot set `iterations` to a new Variable after"
+                         "the Optimizer weights have been created")
+    self._iterations = variable
+    self._weights.append(self._iterations)
+
   def _decayed_lr(self, var_dtype):
     """Get decayed learning rate as a Tensor with dtype=var_dtype."""
     lr_t = self._get_hyper("learning_rate", var_dtype)
+    if isinstance(lr_t, learning_rate_schedule.LearningRateSchedule):
+      local_step = math_ops.cast(self.iterations, var_dtype)
+      lr_t = math_ops.cast(lr_t(local_step), var_dtype)
     if self._initial_decay > 0.:
       local_step = math_ops.cast(self.iterations, var_dtype)
       decay_t = self._get_hyper("decay", var_dtype)
@@ -467,7 +608,12 @@ class OptimizerV2(optimizer_v1.Optimizer):
     Returns:
         Python dictionary.
     """
-    return {"name": self._name}
+    config = {"name": self._name}
+    if hasattr(self, "clipnorm"):
+      config["clipnorm"] = self.clipnorm
+    if hasattr(self, "clipvalue"):
+      config["clipvalue"] = self.clipvalue
+    return config
 
   @classmethod
   def from_config(cls, config, custom_objects=None):
@@ -488,14 +634,22 @@ class OptimizerV2(optimizer_v1.Optimizer):
     """
     if "lr" in config:
       config["learning_rate"] = config.pop("lr")
+    if "learning_rate" in config:
+      if isinstance(config["learning_rate"], dict):
+        config["learning_rate"] = learning_rate_schedule.deserialize(
+            config["learning_rate"])
     return cls(**config)
 
   def _serialize_hyperparameter(self, hyperparameter_name):
     """Serialize a hyperparameter that can be a float, callable, or Tensor."""
-    value = self._get_hyper(hyperparameter_name)
+    value = self._hyper[hyperparameter_name]
+    if isinstance(value, learning_rate_schedule.LearningRateSchedule):
+      return learning_rate_schedule.serialize(value)
     if callable(value):
       return value()
-    if isinstance(value, (ops.Tensor, tf_variables.Variable)):
+    if isinstance(value, (ops.Tensor, tf_variables.Variable,
+                          distributed_values.TPUMirroredVariable,
+                          distributed_values.DistributedVariable)):
       return backend.get_value(value)
     return value
 
@@ -575,12 +729,188 @@ class OptimizerV2(optimizer_v1.Optimizer):
 
     return variable
 
+  def _assert_valid_dtypes(self, tensors):
+    """Asserts tensors are all valid types (see `_valid_dtypes`).
+
+    Args:
+      tensors: Tensors to check.
+
+    Raises:
+      ValueError: If any tensor is not a valid type.
+    """
+    valid_dtypes = self._valid_dtypes()
+    for t in tensors:
+      dtype = t.dtype.base_dtype
+      if dtype not in valid_dtypes:
+        raise ValueError("Invalid type %r for %s, expected: %s." %
+                         (dtype, t.name, [v for v in valid_dtypes]))
+
+  def _valid_dtypes(self):
+    """Valid types for loss, variables and gradients.
+
+    Subclasses should override to allow other float types.
+
+    Returns:
+      Valid types for loss, variables and gradients.
+    """
+    return set(
+        [dtypes.float16, dtypes.bfloat16, dtypes.float32, dtypes.float64])
+
+  def _call_if_callable(self, param):
+    """Call the function if param is callable."""
+    return param() if callable(param) else param
+
+  def _resource_apply_dense(self, grad, handle):
+    """Add ops to apply dense gradients to the variable `handle`.
+
+    Args:
+      grad: a `Tensor` representing the gradient.
+      handle: a `Tensor` of dtype `resource` which points to the variable to be
+        updated.
+
+    Returns:
+      An `Operation` which updates the value of the variable.
+    """
+    raise NotImplementedError()
+
+  def _resource_apply_sparse_duplicate_indices(self, grad, handle, indices):
+    """Add ops to apply sparse gradients to `handle`, with repeated indices.
+
+    Optimizers which override this method must deal with repeated indices. See
+    the docstring of `_apply_sparse_duplicate_indices` for details. By default
+    the correct behavior, to sum non-unique indices and their associated
+    gradients, is enforced by first pre-processing `grad` and `indices` and
+    passing them on to `_resource_apply_sparse`. Optimizers which deal correctly
+    with duplicate indices may instead override this method to avoid the
+    overhead of summing.
+
+    Args:
+      grad: a `Tensor` representing the gradient for the affected indices.
+      handle: a `Tensor` of dtype `resource` which points to the variable to be
+        updated.
+      indices: a `Tensor` of integral type representing the indices for which
+        the gradient is nonzero. Indices may be repeated.
+
+    Returns:
+      An `Operation` which updates the value of the variable.
+    """
+    summed_grad, unique_indices = _deduplicate_indexed_slices(
+        values=grad, indices=indices)
+    return self._resource_apply_sparse(summed_grad, handle, unique_indices)
+
+  def _resource_apply_sparse(self, grad, handle, indices):
+    """Add ops to apply sparse gradients to the variable `handle`.
+
+    Similar to `_apply_sparse`, the `indices` argument to this method has been
+    de-duplicated. Optimizers which deal correctly with non-unique indices may
+    instead override `_resource_apply_sparse_duplicate_indices` to avoid this
+    overhead.
+
+    Args:
+      grad: a `Tensor` representing the gradient for the affected indices.
+      handle: a `Tensor` of dtype `resource` which points to the variable to be
+        updated.
+      indices: a `Tensor` of integral type representing the indices for which
+        the gradient is nonzero. Indices are unique.
+
+    Returns:
+      An `Operation` which updates the value of the variable.
+    """
+    raise NotImplementedError()
+
+  def _resource_scatter_add(self, x, i, v):
+    with ops.control_dependencies(
+        [resource_variable_ops.resource_scatter_add(x.handle, i, v)]):
+      return x.value()
+
+  def _resource_scatter_update(self, x, i, v):
+    with ops.control_dependencies(
+        [resource_variable_ops.resource_scatter_update(x.handle, i, v)]):
+      return x.value()
+
+  # ---------------
+  # For implementing the trackable interface
+  # ---------------
+
+  def _restore_slot_variable(self, slot_name, variable, slot_variable):
+    """Restore a newly created slot variable's value."""
+    variable_key = _var_key(variable)
+    deferred_restorations = self._deferred_slot_restorations.get(
+        slot_name, {}).pop(variable_key, [])
+    # Iterate over restores, highest restore UID first to minimize the number
+    # of assignments.
+    deferred_restorations.sort(key=lambda position: position.restore_uid,
+                               reverse=True)
+    for checkpoint_position in deferred_restorations:
+      checkpoint_position.restore(slot_variable)
+
+  def _create_or_restore_slot_variable(
+      self, slot_variable_position, slot_name, variable):
+    """Restore a slot variable's value, possibly creating it.
+
+    Called when a variable which has an associated slot variable is created or
+    restored. When executing eagerly, we create the slot variable with a
+    restoring initializer.
+
+    No new variables are created when graph building. Instead,
+    _restore_slot_variable catches these after normal creation and adds restore
+    ops to the graph. This method is nonetheless important when graph building
+    for the case when a slot variable has already been created but `variable`
+    has just been added to a dependency graph (causing us to realize that the
+    slot variable needs to be restored).
+
+    Args:
+      slot_variable_position: A `trackable._CheckpointPosition` object
+        indicating the slot variable `Trackable` object to be restored.
+      slot_name: The name of this `Optimizer`'s slot to restore into.
+      variable: The variable object this slot is being created for.
+    """
+    variable_key = _var_key(variable)
+    slot_dict = self._slots.get(variable_key, {})
+    slot_variable = slot_dict.get(slot_name, None)
+    if (slot_variable is None and context.executing_eagerly() and
+        slot_variable_position.is_simple_variable()
+        # Defer slot variable creation if there is an active variable creator
+        # scope. Generally we'd like to eagerly create/restore slot variables
+        # when possible, but this may mean that scopes intended to catch
+        # `variable` also catch its eagerly created slot variable
+        # unintentionally (specifically make_template would add a dependency on
+        # a slot variable if not for this case). Deferring is mostly harmless
+        # (aside from double initialization), and makes variable creator scopes
+        # behave the same way they do when graph building.
+        and not ops.get_default_graph()._variable_creator_stack):  # pylint: disable=protected-access
+      initializer = trackable.CheckpointInitialValue(
+          checkpoint_position=slot_variable_position)
+      slot_variable = self.add_slot(
+          var=variable,
+          initializer=initializer,
+          slot_name=slot_name)
+      # Slot variables are not owned by any one object (because we don't want to
+      # save the slot variable if the optimizer is saved without the non-slot
+      # variable, or if the non-slot variable is saved without the optimizer;
+      # it's a dependency hypergraph with edges of the form (optimizer, non-slot
+      # variable, variable)). So we don't _track_ slot variables anywhere, and
+      # instead special-case this dependency and otherwise pretend it's a normal
+      # graph.
+    if slot_variable is not None:
+      # If we've either made this slot variable, or if we've pulled out an
+      # existing slot variable, we should restore it.
+      slot_variable_position.restore(slot_variable)
+    else:
+      # We didn't make the slot variable. Defer restoring until it gets created
+      # normally. We keep a list rather than the one with the highest restore
+      # UID in case slot variables have their own dependencies, in which case
+      # those could differ between restores.
+      self._deferred_slot_restorations.setdefault(
+          slot_name, {}).setdefault(variable_key, []).append(
+              slot_variable_position)
+
 
 def _filter_grads(grads_and_vars):
   """Filter out iterable with grad equal to None."""
   grads_and_vars = tuple(grads_and_vars)
   if not grads_and_vars:
-    raise ValueError("No variables provided.")
+    return grads_and_vars
   filtered = []
   vars_with_empty_grads = []
   for grad, var in grads_and_vars:
@@ -599,33 +929,6 @@ def _filter_grads(grads_and_vars):
   return filtered
 
 
-def merge_update_step(update_ops, local_step):
-  """Merge local step counter update from different replicas."""
-
-  def merge_update_step_fn(strategy, update_ops, local_step):
-    merged_ops = []
-    for update_op in update_ops:
-      merged_ops.append(strategy.group(update_op))
-    with ops.control_dependencies(merged_ops):
-      incre_op = local_step.assign_add(1).op
-    return incre_op
-
-  return distribute_ctx.get_replica_context().merge_call(
-      merge_update_step_fn, args=(update_ops, local_step))
-
-
-def merge_grads(grads_and_vars):
-  """Merge gradients from different replicas."""
-
-  def merge_grad_fn(strategy, grads_and_vars):
-    reduced_grads = strategy.batch_reduce(ds_reduce_util.ReduceOp.SUM,
-                                          grads_and_vars)
-    return reduced_grads
-
-  return distribute_ctx.get_replica_context().merge_call(
-      merge_grad_fn, args=(grads_and_vars,))
-
-
 def _var_key(var):
   """Key for representing a primary variable, for looking up slots.
 
@@ -641,10 +944,10 @@ def _var_key(var):
   """
 
   # pylint: disable=protected-access
-  if distribute_ctx.has_distribution_strategy() and hasattr(
-      var, "_primary_var"):
-    var = var._primary_var
-  if hasattr(var, "op"):
+  # Get the distributed variable if it exists.
+  if getattr(var, "_distributed_container", None) is not None:
+    var = var._distributed_container()
+  if var._in_graph_mode:
     return var._shared_name
   return var._unique_id
 
diff --git a/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py b/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py
index 158577fe64afefaff28ee644caf084cb40d429ea..2d4c1827167b02f64075a88f34aeb3a015576581 100644
--- a/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py
+++ b/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py
@@ -18,41 +18,41 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
-import tempfile
-
-from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.eager import context
 from tensorflow.python.eager import def_function
-from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras import backend
 from tensorflow.python.keras import callbacks
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import optimizers
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.engine import input_layer
-from tensorflow.python.keras.engine import saving
 from tensorflow.python.keras.engine import sequential
 from tensorflow.python.keras.engine import training
 from tensorflow.python.keras.layers import core
+from tensorflow.python.keras.optimizer_v2 import adadelta
+from tensorflow.python.keras.optimizer_v2 import adagrad
 from tensorflow.python.keras.optimizer_v2 import adam
+from tensorflow.python.keras.optimizer_v2 import adamax
 from tensorflow.python.keras.optimizer_v2 import gradient_descent
+from tensorflow.python.keras.optimizer_v2 import learning_rate_schedule
+from tensorflow.python.keras.optimizer_v2 import nadam
 from tensorflow.python.keras.optimizer_v2 import optimizer_v2
+from tensorflow.python.keras.optimizer_v2 import rmsprop
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
-from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
-from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.training import momentum
+from tensorflow.python.training import training_util
 
 
 class OptimizerTest(test.TestCase):
@@ -64,8 +64,6 @@ class OptimizerTest(test.TestCase):
         var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
         var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
         loss = lambda: 5 * var0 + 3 * var1  # pylint: disable=cell-var-from-loop
-        if not context.executing_eagerly():
-          loss = loss()
         sgd = gradient_descent.SGD(3.0)
 
         self.evaluate(variables.global_variables_initializer())
@@ -116,32 +114,12 @@ class OptimizerTest(test.TestCase):
       # var1 = [0., 1.] - 0.5 * [3, 3]
       self.assertAllClose([-1.5, -0.5], self.evaluate(var1))
 
-  @test_util.run_in_graph_and_eager_modes
-  def testAggregationMethod(self):
-    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.cached_session():
-        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
-        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
-        loss = lambda: 5 * var0 + 3 * var1  # pylint: disable=cell-var-from-loop
-        if not context.executing_eagerly():
-          loss = loss()
-        sgd = gradient_descent.SGD(3.0)
-
-        self.evaluate(variables.global_variables_initializer())
-        # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
-        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
-        # Run 1 step of sgd through optimizer
-        opt_op = sgd.minimize(
-            loss,
-            var_list=[var0, var1],
-            aggregation_method=gradients_impl.AggregationMethod
-            .EXPERIMENTAL_ACCUMULATE_N)
-        self.evaluate(variables.global_variables_initializer())
+      sgd.learning_rate = learning_rate_schedule.InverseTimeDecay(
+          0.5, decay_steps=1.0, decay_rate=0.5)
+      if context.executing_eagerly():
+        sgd.minimize(loss, [var0, var1])
+      else:
         self.evaluate(opt_op)
-        # Validate updated params
-        self.assertAllClose([-14., -13.], self.evaluate(var0))
-        self.assertAllClose([-6., -5.], self.evaluate(var1))
 
   @test_util.run_in_graph_and_eager_modes
   def testPrecomputedGradient(self):
@@ -150,8 +128,6 @@ class OptimizerTest(test.TestCase):
         var0 = variables.Variable([1.0, 2.0], dtype=dtype)
         var1 = variables.Variable([3.0, 4.0], dtype=dtype)
         loss = lambda: 5 * var0 + 3 * var1  # pylint: disable=cell-var-from-loop
-        if not context.executing_eagerly():
-          loss = loss()
         grad_loss = constant_op.constant([42, -42], dtype=dtype)
         sgd = gradient_descent.SGD(3.0)
 
@@ -176,8 +152,6 @@ class OptimizerTest(test.TestCase):
         var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
         var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
         loss = lambda: 5 * var0  # pylint: disable=cell-var-from-loop
-        if not context.executing_eagerly():
-          loss = loss()
         sgd_op = gradient_descent.SGD(3.0)
         with self.assertRaisesRegexp(ValueError, 'No gradients'):
           # var1 has no gradient
@@ -190,8 +164,6 @@ class OptimizerTest(test.TestCase):
         var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
         var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
         loss = lambda: constant_op.constant(5.0)
-        if not context.executing_eagerly():
-          loss = loss()
 
         sgd_op = gradient_descent.SGD(3.0)
         with self.assertRaisesRegexp(ValueError,
@@ -216,11 +188,9 @@ class OptimizerTest(test.TestCase):
         var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
         var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
         loss = lambda: 5 * var0 + 3 * var1  # pylint: disable=cell-var-from-loop
-        if not context.executing_eagerly():
-          loss = loss()
 
         sgd = gradient_descent.SGD(3.0)
-        grads_and_vars = sgd.compute_gradients(loss, [var0, var1])
+        grads_and_vars = sgd._compute_gradients(loss, [var0, var1])
         # Convert gradients to tf.Variables
         converted_grads = [
             resource_variable_ops.ResourceVariable(
@@ -259,7 +229,7 @@ class OptimizerTest(test.TestCase):
         return x * x
 
       sgd = gradient_descent.SGD(3.0)
-      grads_and_vars = sgd.compute_gradients(f, [x])
+      grads_and_vars = sgd._compute_gradients(f, [x])
       self.assertEqual(1, len(grads_and_vars))
       grad, x_as_var = grads_and_vars[0]
       self.assertIs(x, x_as_var)
@@ -278,8 +248,6 @@ class OptimizerTest(test.TestCase):
       var1 = variables.Variable([3.0, 4.0],
                                 constraint=constraint_0)
       loss = lambda: 5 * var0 + 3 * var1
-      if not context.executing_eagerly():  # pylint: disable=cell-var-from-loop
-        loss = loss()
       sgd = gradient_descent.SGD(3.0)
 
       self.evaluate(variables.global_variables_initializer())
@@ -302,31 +270,41 @@ class OptimizerTest(test.TestCase):
       self.assertEqual(0, self.evaluate(sgd.iterations))
 
   @test_util.run_in_graph_and_eager_modes
-  def testSerializationWithinDefun(self):
+  def testConfig(self):
     with self.cached_session():
-      sgd = gradient_descent.SGD(3.0)
-      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0],
-                                                    dtype=dtypes.float32)
+      opt = gradient_descent.SGD(learning_rate=1.0)
+      config = opt.get_config()
+      opt2 = gradient_descent.SGD.from_config(config)
+      lr = opt._get_hyper('learning_rate')
+      lr2 = opt2._get_hyper('learning_rate')
+      self.evaluate(variables.global_variables_initializer())
+      # assert both are equal float values.
+      self.assertEqual(self.evaluate(lr), self.evaluate(lr2))
+      var0 = variables.Variable([[1.0], [2.0]], dtype=dtypes.float32)
       loss = lambda: 3 * var0
-      sgd.minimize(loss, [var0])
-
-      def serialize():
-        config = sgd.get_config()
-        gradient_descent.SGD.from_config(config)
-
-      compiled_serialize = function.defun(serialize)
-      with self.assertRaisesRegexp(RuntimeError, 'inside Tensorflow graph'):
-        compiled_serialize()
+      # learning rate variable created when calling minimize.
+      opt.minimize(loss, [var0])
+      opt3 = gradient_descent.SGD.from_config(config)
+      lr3 = opt3._get_hyper('learning_rate')
+      self.evaluate(variables.global_variables_initializer())
+      self.assertEqual(self.evaluate(lr), self.evaluate(lr3))
 
   @test_util.run_in_graph_and_eager_modes
-  def testConfig(self):
+  def testConfigWithLearningRateDecay(self):
     with self.cached_session():
-      opt = gradient_descent.SGD(learning_rate=1.0)
+      decay_schedule = learning_rate_schedule.InverseTimeDecay(
+          0.5, decay_steps=1.0, decay_rate=0.1)
+      step = 10
+      opt = gradient_descent.SGD(decay_schedule)
       config = opt.get_config()
       opt2 = gradient_descent.SGD.from_config(config)
       # assert both are equal float values.
-      self.assertEqual(
-          opt._get_hyper('learning_rate'), opt2._get_hyper('learning_rate'))
+      self.assertAllEqual(
+          decay_schedule(step),
+          opt._get_hyper('learning_rate')(step))
+      self.assertAllEqual(
+          decay_schedule(step),
+          opt2._get_hyper('learning_rate')(step))
       var0 = variables.Variable([[1.0], [2.0]], dtype=dtypes.float32)
       loss = lambda: 3 * var0
       # learning rate variable created when calling minimize.
@@ -334,9 +312,41 @@ class OptimizerTest(test.TestCase):
       self.evaluate(variables.global_variables_initializer())
       config = opt.get_config()
       opt3 = gradient_descent.SGD.from_config(config)
-      self.assertEqual(
-          self.evaluate(opt._get_hyper('learning_rate')),
-          opt3._get_hyper('learning_rate'))
+      self.assertAllEqual(
+          self.evaluate(opt._get_hyper('learning_rate')(step)),
+          opt3._get_hyper('learning_rate')(step))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testGradClipValue(self):
+    with self.cached_session():
+      var = resource_variable_ops.ResourceVariable([1.0, 2.0])
+      loss = lambda: 3 * var
+      opt = gradient_descent.SGD(learning_rate=1.0, clipvalue=1.0)
+      opt_op = opt.minimize(loss, [var])
+      self.evaluate(variables.global_variables_initializer())
+      self.evaluate(opt_op)
+      self.assertAllClose([0., 1.], self.evaluate(var))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testGradClipNorm(self):
+    with self.cached_session():
+      var = resource_variable_ops.ResourceVariable([1.0])
+      loss = lambda: 3 * var
+      opt = gradient_descent.SGD(learning_rate=1.0, clipnorm=1.0)
+      opt_op = opt.minimize(loss, [var])
+      self.evaluate(variables.global_variables_initializer())
+      self.evaluate(opt_op)
+      self.assertAllClose([0.], self.evaluate(var))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testInvalidClipNorm(self):
+    with self.assertRaisesRegexp(ValueError, '>= 0'):
+      gradient_descent.SGD(learning_rate=1.0, clipnorm=-1.0)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testInvalidKwargs(self):
+    with self.assertRaisesRegexp(TypeError, 'Unexpected keyword argument'):
+      gradient_descent.SGD(learning_rate=1.0, invalidkwargs=1.0)
 
   @test_util.run_in_graph_and_eager_modes
   def testWeights(self):
@@ -413,6 +423,31 @@ class OptimizerTest(test.TestCase):
     with self.assertRaises(AttributeError):
       opt.not_an_attr += 3
 
+  @test_util.run_in_graph_and_eager_modes
+  def testGettingHyperParametersWithLrInConstructor(self):
+    opt = gradient_descent.SGD(lr=3.0)
+    var = resource_variable_ops.ResourceVariable([1.0, 2.0],
+                                                 dtype=dtypes.float32)
+    loss = lambda: 3 * var
+    opt_op = opt.minimize(loss, [var])
+    self.evaluate(variables.global_variables_initializer())
+    self.evaluate(opt_op)
+
+    self.assertTrue(isinstance(opt.lr, resource_variable_ops.ResourceVariable))
+    self.assertTrue(
+        isinstance(opt.learning_rate, resource_variable_ops.ResourceVariable))
+
+    lr = self.evaluate(opt.lr)
+    self.assertEqual(3.0, lr)
+
+    opt.lr = 2.0
+    lr = self.evaluate(opt.lr)
+    self.assertEqual(2.0, lr)
+
+    self.evaluate(opt.lr.assign(4.0))
+    lr = self.evaluate(opt.lr)
+    self.assertEqual(4.0, lr)
+
   @test_util.run_in_graph_and_eager_modes
   def testOptimizerWithKerasModel(self):
     a = input_layer.Input(shape=(3,), name='input_a')
@@ -486,17 +521,37 @@ class OptimizerTest(test.TestCase):
     self.assertAllClose(
         float(backend.get_value(model.optimizer.lr)), 0.01, atol=1e-4)
 
+  def testOptimizerSetIterations(self):
+    global_step = training_util.get_or_create_global_step()
+    opt = adam.Adam(learning_rate=1.0)
+    opt.iterations = global_step
+    var = resource_variable_ops.ResourceVariable([1.0, 2.0],
+                                                 dtype=dtypes.float32)
+    self.evaluate(variables.global_variables_initializer())
+    init_step_value = self.evaluate(global_step)
+    loss = lambda: 3 * var
+    opt_op = opt.minimize(loss, [var])
+    self.evaluate(variables.global_variables_initializer())
+    self.evaluate(opt_op)
+    new_step_value = self.evaluate(global_step)
+    self.assertEqual(new_step_value, init_step_value + 1)
+
+  def testVarKey(self):
+    with context.graph_mode():
+      a = variables.Variable([1., 2.], name='var')
+      b = variables.Variable([1.], name='var')
+      self.assertTrue(a._in_graph_mode)
+      self.assertTrue(b._in_graph_mode)
+      var_key = optimizer_v2._var_key(a)
+      self.assertEqual('var', var_key)
+      var_key = optimizer_v2._var_key(b)
+      self.assertEqual('var_1', var_key)
+
 
-class OptimizersCompatibilityTest(test.TestCase, parameterized.TestCase):
+@keras_parameterized.run_with_all_model_types
+class OptimizersCompatibilityTest(keras_parameterized.TestCase):
 
-  # TODO(tanzheny): remove test_numeric after algorithm for Momentum, Adam and
-  # NAdam has been unified: currently these three algorithms behave differently.
-  @parameterized.named_parameters(
-      ('adadelta', 'adadelta', True, True), ('adagrad', 'adagrad', True, True),
-      ('adam', 'adam', True, True), ('adamax', 'adamax', True, True),
-      ('nadam', 'nadam', True, False), ('momentum', 'momentum', True, True),
-      ('sgd', 'sgd', False, True))
-  def testOptimizersCompatibility(self, opt_str, test_weights, test_numeric):
+  def _testOptimizersCompatibility(self, opt_v1, opt_v2, test_weights=True):
     np.random.seed(1331)
     with self.cached_session():
       train_samples = 20
@@ -510,43 +565,65 @@ class OptimizersCompatibilityTest(test.TestCase, parameterized.TestCase):
       y = keras.utils.to_categorical(y)
 
       num_hidden = 5
-      model = testing_utils.get_small_sequential_mlp(
+      model_v1 = testing_utils.get_small_sequential_mlp(
           num_hidden=num_hidden, num_classes=num_classes, input_dim=input_dim)
+      model_v1.compile(opt_v1, loss='categorical_crossentropy', metrics=[])
+      model_v1.fit(x, y, batch_size=5, epochs=1)
 
-      old_mode = os.environ.get('TF2_BEHAVIOR', None)
-      # Disable tf2 to create V1 optimizer.
-      disable_tf2()
-      if opt_str == 'momentum':
-        opt_v1 = optimizers.SGD(momentum=0.9)
-      else:
-        opt_v1 = optimizers.get(opt_str)
-
-      # Test compile and fit with v1 optimizer.
-      model.compile(opt_v1, loss='categorical_crossentropy', metrics=[])
-      model.fit(x, y, batch_size=5, epochs=1)
-      model_dir = tempfile.mkdtemp()
-      gfile.MakeDirs(model_dir)
-      file_name = os.path.join(model_dir, 'model.h5')
-      model.save(file_name)
-
-      enable_tf2()
-      # Test load and fit with v2 optimizer.
-      model_2 = saving.load_model(file_name)
-      opt_v2 = model_2.optimizer
-      self.assertIsInstance(opt_v2, optimizer_v2.OptimizerV2)
-      # set_weights is called inside load_model but exception is swallowed,
-      # this call checks the weights can be set correctly.
+      model_v2 = testing_utils.get_small_sequential_mlp(
+          num_hidden=num_hidden, num_classes=num_classes, input_dim=input_dim)
+      model_v2.set_weights(model_v1.get_weights())
+      model_v2.compile(opt_v2, loss='categorical_crossentropy', metrics=[])
+      model_v2._make_train_function()
       if test_weights:
         opt_v2.set_weights(opt_v1.get_weights())
-      if test_numeric:
-        hist_1 = model.fit(x, y, batch_size=5, epochs=1, shuffle=False)
-        hist_2 = model_2.fit(x, y, batch_size=5, epochs=1, shuffle=False)
-        self.assertAllClose(model.get_weights(), model_2.get_weights())
-        self.assertAllClose(model.get_weights(), model_2.get_weights())
-        self.assertAllClose(hist_1.history['loss'], hist_2.history['loss'])
 
-      if old_mode is not None:
-        os.environ['TF2_BEHAVIOR'] = old_mode
+      hist_1 = model_v1.fit(x, y, batch_size=5, epochs=1, shuffle=False)
+      hist_2 = model_v2.fit(x, y, batch_size=5, epochs=1, shuffle=False)
+      self.assertAllClose(model_v1.get_weights(), model_v2.get_weights(),
+                          rtol=1e-5, atol=1e-5)
+      self.assertAllClose(hist_1.history['loss'], hist_2.history['loss'],
+                          rtol=1e-5, atol=1e-5)
+
+  def testAdadeltaCompatibility(self):
+    opt_v1 = optimizers.Adadelta(lr=0.01)
+    opt_v2 = adadelta.Adadelta(learning_rate=0.01)
+    self._testOptimizersCompatibility(opt_v1, opt_v2)
+
+  def testAdagradCompatibility(self):
+    opt_v1 = optimizers.Adagrad(lr=0.01)
+    opt_v2 = adagrad.Adagrad(learning_rate=0.01)
+    self._testOptimizersCompatibility(opt_v1, opt_v2)
+
+  def testAdamCompatibility(self):
+    opt_v1 = optimizers.Adam()
+    opt_v2 = adam.Adam()
+    self._testOptimizersCompatibility(opt_v1, opt_v2)
+
+  def testAdamaxCompatibility(self):
+    opt_v1 = optimizers.Adamax(lr=0.01)
+    opt_v2 = adamax.Adamax(learning_rate=0.01)
+    self._testOptimizersCompatibility(opt_v1, opt_v2)
+
+  def testNadamCompatibility(self):
+    opt_v1 = optimizers.Nadam(lr=0.001)
+    opt_v2 = nadam.Nadam(learning_rate=0.001)
+    self._testOptimizersCompatibility(opt_v1, opt_v2)
+
+  def testMomentumCompatibility(self):
+    opt_v1 = optimizers.SGD(lr=0.01, momentum=0.9)
+    opt_v2 = gradient_descent.SGD(learning_rate=0.01, momentum=0.9)
+    self._testOptimizersCompatibility(opt_v1, opt_v2)
+
+  def testRMSpropCompatibility(self):
+    opt_v1 = optimizers.RMSprop()
+    opt_v2 = rmsprop.RMSprop()
+    self._testOptimizersCompatibility(opt_v1, opt_v2)
+
+  def testSGDCompatibility(self):
+    opt_v1 = optimizers.SGD(lr=0.01)
+    opt_v2 = gradient_descent.SGD(learning_rate=0.01)
+    self._testOptimizersCompatibility(opt_v1, opt_v2, False)
 
   def testNumericEquivalenceForNesterovMomentum(self):
     np.random.seed(1331)
@@ -624,15 +701,6 @@ class OptimizersCompatibilityTest(test.TestCase, parameterized.TestCase):
       self.assertAllClose(hist_k_v1.history['loss'], hist_k_v2.history['loss'])
 
 
-def disable_tf2():
-  if 'TF2_BEHAVIOR' in os.environ:
-    del os.environ['TF2_BEHAVIOR']
-
-
-def enable_tf2():
-  os.environ['TF2_BEHAVIOR'] = 'enabled'
-
-
 # Note: These tests are kept in a separate class to avoid bugs in some
 # distributions of Python that break AutoGraph which is used by tf.function.
 class OptimizerWithFunctionTest(test.TestCase):
@@ -652,6 +720,23 @@ class OptimizerWithFunctionTest(test.TestCase):
       self.assertAllClose([0., 1.], fn(), atol=1e-4)
       self.assertAllClose([-1, 0.], fn(), atol=1e-4)
 
+  def testVarKeyWithVarCreatedInEager(self):
+    with context.eager_mode():
+      a = variables.Variable([1., 2.], name='var')
+      b = variables.Variable([1.], name='var')
+
+      @test_util.also_run_as_tf_function
+      def var_key_test():
+        self.assertFalse(a._in_graph_mode)
+        self.assertFalse(b._in_graph_mode)
+        var_key_a = optimizer_v2._var_key(a)
+        self.assertStartsWith(var_key_a, 'var_')
+        var_key_b = optimizer_v2._var_key(b)
+        self.assertStartsWith(var_key_b, 'var_')
+        self.assertNotEquals(var_key_a, var_key_b)
+
+      var_key_test()
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/optimizer_v2/rmsprop.py b/tensorflow/python/keras/optimizer_v2/rmsprop.py
index 6a5b334fc46f6ae76f48cce29bc119cdc8f0eaf2..e55e6375a3ea8e89cb377f1f9ac9291c2d098142 100644
--- a/tensorflow/python/keras/optimizer_v2/rmsprop.py
+++ b/tensorflow/python/keras/optimizer_v2/rmsprop.py
@@ -17,11 +17,20 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.python.framework import ops
+from tensorflow.python.keras import backend_config
 from tensorflow.python.keras.optimizer_v2 import optimizer_v2
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.training import training_ops
+from tensorflow.python.util.tf_export import keras_export
 
 
+@keras_export("keras.optimizers.RMSprop")
 class RMSprop(optimizer_v2.OptimizerV2):
   r"""Optimizer that implements the RMSprop algorithm.
 
@@ -88,10 +97,16 @@ class RMSprop(optimizer_v2.OptimizerV2):
         `epsilon` can each be a callable that takes no arguments and returns the
         actual value to use. This can be useful for changing these values across
         different invocations of optimizer functions. @end_compatibility
-      **kwargs: keyword arguments. Allowed to be {`decay`}
+      **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
+        `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip
+        gradients by value, `decay` is included for backward compatibility to
+        allow time inverse decay of learning rate. `lr` is included for backward
+        compatibility, recommended to use `learning_rate` instead.
     """
+    if epsilon is None:
+      epsilon = backend_config.epsilon()
     super(RMSprop, self).__init__(name, **kwargs)
-    self._set_hyper("learning_rate", learning_rate)
+    self._set_hyper("learning_rate", kwargs.get("lr", learning_rate))
     self._set_hyper("decay", self._initial_decay)
     self._set_hyper("rho", rho)
 
@@ -103,82 +118,127 @@ class RMSprop(optimizer_v2.OptimizerV2):
     self._set_hyper("momentum", momentum)
 
     self._set_hyper("epsilon", epsilon)
-    self._centered = centered
+    self.centered = centered
 
   def _create_slots(self, var_list):
     for var in var_list:
       self.add_slot(var, "rms")
-      self.add_slot(var, "momentum")
-      if self._centered:
+    if self._momentum:
+      for var in var_list:
+        self.add_slot(var, "momentum")
+    if self.centered:
+      for var in var_list:
         self.add_slot(var, "mg")
 
   def _resource_apply_dense(self, grad, var):
     var_dtype = var.dtype.base_dtype
     lr_t = self._decayed_lr(var_dtype)
     rms = self.get_slot(var, "rms")
-    mom = self.get_slot(var, "momentum")
     rho = self._get_hyper("rho", var_dtype)
     momentum = self._get_hyper("momentum", var_dtype)
     epsilon = self._get_hyper("epsilon", var_dtype)
-    if self._centered:
-      mg = self.get_slot(var, "mg")
-      return training_ops.resource_apply_centered_rms_prop(
-          var.handle,
-          mg.handle,
-          rms.handle,
-          mom.handle,
-          lr_t,
-          rho,
-          momentum,
-          epsilon,
-          grad,
-          use_locking=self._use_locking)
+    if self._momentum:
+      mom = self.get_slot(var, "momentum")
+      if self.centered:
+        mg = self.get_slot(var, "mg")
+        return training_ops.resource_apply_centered_rms_prop(
+            var.handle,
+            mg.handle,
+            rms.handle,
+            mom.handle,
+            lr_t,
+            rho,
+            momentum,
+            epsilon,
+            grad,
+            use_locking=self._use_locking)
+      else:
+        return training_ops.resource_apply_rms_prop(
+            var.handle,
+            rms.handle,
+            mom.handle,
+            lr_t,
+            rho,
+            momentum,
+            epsilon,
+            grad,
+            use_locking=self._use_locking)
     else:
-      return training_ops.resource_apply_rms_prop(
-          var.handle,
-          rms.handle,
-          mom.handle,
-          lr_t,
-          rho,
-          momentum,
-          epsilon,
-          grad,
-          use_locking=self._use_locking)
+      rms_t = rho * rms + (1. - rho) * math_ops.square(grad)
+      rms_t = state_ops.assign(rms, rms_t, use_locking=self._use_locking)
+      denom_t = rms_t
+      if self.centered:
+        mg = self.get_slot(var, "mg")
+        mg_t = rho * mg + (1. - rho) * grad
+        mg_t = state_ops.assign(mg, mg_t, use_locking=self._use_locking)
+        denom_t = rms_t - math_ops.square(mg_t)
+      var_t = var - lr_t * grad / (math_ops.sqrt(denom_t) + epsilon)
+      return state_ops.assign(var, var_t, use_locking=self._use_locking).op
 
   def _resource_apply_sparse(self, grad, var, indices):
     var_dtype = var.dtype.base_dtype
     lr_t = self._decayed_lr(var_dtype)
     rms = self.get_slot(var, "rms")
-    mom = self.get_slot(var, "momentum")
     rho = self._get_hyper("rho", var_dtype)
     momentum = self._get_hyper("momentum", var_dtype)
     epsilon = self._get_hyper("epsilon", var_dtype)
-    if self._centered:
-      mg = self.get_slot(var, "mg")
-      return training_ops.resource_sparse_apply_centered_rms_prop(
-          var.handle,
-          mg.handle,
-          rms.handle,
-          mom.handle,
-          lr_t,
-          rho,
-          momentum,
-          epsilon,
-          grad,
-          indices,
-          use_locking=self._use_locking)
+    if self._momentum:
+      mom = self.get_slot(var, "momentum")
+      if self.centered:
+        mg = self.get_slot(var, "mg")
+        return training_ops.resource_sparse_apply_centered_rms_prop(
+            var.handle,
+            mg.handle,
+            rms.handle,
+            mom.handle,
+            lr_t,
+            rho,
+            momentum,
+            epsilon,
+            grad,
+            indices,
+            use_locking=self._use_locking)
+      else:
+        return training_ops.resource_sparse_apply_rms_prop(
+            var.handle,
+            rms.handle,
+            mom.handle,
+            lr_t,
+            rho,
+            momentum,
+            epsilon,
+            grad,
+            indices,
+            use_locking=self._use_locking)
     else:
-      return training_ops.resource_sparse_apply_rms_prop(
-          var.handle,
-          rms.handle,
-          mom.handle,
-          lr_t,
-          rho,
-          momentum,
-          epsilon,
-          grad,
-          indices,
-          use_locking=self._use_locking)
+      rms_scaled_g_values = (grad * grad) * (1. - rho)
+      rms_t = state_ops.assign(rms, rms * rho, use_locking=self._use_locking)
+      with ops.control_dependencies([rms_t]):
+        rms_t = self._resource_scatter_add(rms, indices, rms_scaled_g_values)
+        rms_slice = array_ops.gather(rms_t, indices)
+      denom_slice = rms_slice
+      if self.centered:
+        mg = self.get_slot(var, "mg")
+        mg_scaled_g_values = grad * (1. - rho)
+        mg_t = state_ops.assign(mg, mg * rho, use_locking=self._use_locking)
+        with ops.control_dependencies([mg_t]):
+          mg_t = self._resource_scatter_add(mg, indices, mg_scaled_g_values)
+          mg_slice = array_ops.gather(mg_t, indices)
+          denom_slice = rms_slice - math_ops.square(mg_slice)
+      var_update = self._resource_scatter_add(
+          var, indices, -lr_t * grad / (math_ops.sqrt(denom_slice) + epsilon))
+      if self.centered:
+        return control_flow_ops.group(*[var_update, rms_t, mg_t])
+      return control_flow_ops.group(*[var_update, rms_t])
+
+  def set_weights(self, weights):
+    params = self.weights
+    # Override set_weights for backward compatibility of Keras V1 optimizer
+    # since it does not include iteration at head of the weight list. Set
+    # iteration to 0.
+    if len(params) == len(weights) + 1:
+      weights = [np.array(0)] + weights
+    super(RMSprop, self).set_weights(weights)
 
   def get_config(self):
     config = super(RMSprop, self).get_config()
@@ -188,7 +248,7 @@ class RMSprop(optimizer_v2.OptimizerV2):
         "rho": self._serialize_hyperparameter("rho"),
         "momentum": self._serialize_hyperparameter("momentum"),
         "epsilon": self._serialize_hyperparameter("epsilon"),
-        "centered": self._centered,
+        "centered": self.centered,
     })
     return config
 
diff --git a/tensorflow/python/keras/optimizer_v2/rmsprop_test.py b/tensorflow/python/keras/optimizer_v2/rmsprop_test.py
index a8658a8550760a04c6031e26721038b88fad0ebd..ab8052667623b7e43d9c8b1ab6b8954e5df2e13b 100644
--- a/tensorflow/python/keras/optimizer_v2/rmsprop_test.py
+++ b/tensorflow/python/keras/optimizer_v2/rmsprop_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
+from tensorflow.python.keras.optimizer_v2 import learning_rate_schedule
 from tensorflow.python.keras.optimizer_v2 import rmsprop
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import math_ops
@@ -58,14 +59,18 @@ class RMSpropOptimizerTest(test.TestCase):
   def _rmsprop_update_numpy(self, var, g, mg, rms, mom, lr, rho, momentum,
                             epsilon, centered):
     rms_t = rms * rho + (1 - rho) * g * g
-    denom_t = rms_t + epsilon
     if centered:
       mg_t = mg * rho + (1 - rho) * g
-      denom_t -= mg_t * mg_t
+      denom_t = rms_t - mg_t * mg_t
     else:
       mg_t = mg
-    mom_t = momentum * mom + lr * g / np.sqrt(denom_t, dtype=denom_t.dtype)
-    var_t = var - mom_t
+      denom_t = rms_t
+    if momentum > 0.:
+      mom_t = momentum * mom + lr * g / (np.sqrt(denom_t + epsilon))
+      var_t = var - mom_t
+    else:
+      mom_t = mom
+      var_t = var - lr * g / (np.sqrt(denom_t) + epsilon)
     return var_t, mg_t, rms_t, mom_t
 
   def _sparse_rmsprop_update_numpy(self, var, gindexs, gvalues, mg, rms, mom,
@@ -78,12 +83,18 @@ class RMSpropOptimizerTest(test.TestCase):
       gindex = gindexs[i]
       gvalue = gvalues[i]
       rms_t[gindex] = rms[gindex] * rho + (1 - rho) * gvalue * gvalue
-      denom_t = rms_t[gindex] + epsilon
       if centered:
         mg_t[gindex] = mg_t[gindex] * rho + (1 - rho) * gvalue
-        denom_t -= mg_t[gindex] * mg_t[gindex]
-      mom_t[gindex] = momentum * mom[gindex] + lr * gvalue / np.sqrt(denom_t)
-      var_t[gindex] = var[gindex] - mom_t[gindex]
+        denom_t = rms_t[gindex] - mg_t[gindex] * mg_t[gindex]
+      else:
+        denom_t = rms_t[gindex]
+      if momentum > 0.:
+        mom_t[gindex] = momentum * mom[gindex] + lr * gvalue / np.sqrt(denom_t +
+                                                                       epsilon)
+        var_t[gindex] = var[gindex] - mom_t[gindex]
+      else:
+        mom_t[gindex] = mom[gindex]
+        var_t[gindex] = var[gindex] - lr * gvalue / (np.sqrt(denom_t) + epsilon)
     return var_t, mg_t, rms_t, mom_t
 
   @test_util.run_deprecated_v1
@@ -117,14 +128,17 @@ class RMSpropOptimizerTest(test.TestCase):
           mg0 = None
           mg1 = None
 
+        if momentum > 0.:
+          mom0 = opt.get_slot(var0, "momentum")
+          mom1 = opt.get_slot(var1, "momentum")
+        else:
+          mom0 = None
+          mom1 = None
+
         rms0 = opt.get_slot(var0, "rms")
         self.assertTrue(rms0 is not None)
         rms1 = opt.get_slot(var1, "rms")
         self.assertTrue(rms1 is not None)
-        mom0 = opt.get_slot(var0, "momentum")
-        self.assertTrue(mom0 is not None)
-        mom1 = opt.get_slot(var1, "momentum")
-        self.assertTrue(mom1 is not None)
 
         mg0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
         mg1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
@@ -137,8 +151,8 @@ class RMSpropOptimizerTest(test.TestCase):
         self.assertAllClose([1.0, 2.0], self.evaluate(var0))
         self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
-        # Run 4 steps of RMSprop
-        for _ in range(1, 5):
+        # Run 3 steps of RMSprop
+        for _ in range(1, 4):
           self.evaluate(update)
 
           var0_np, mg0_np, rms0_np, mom0_np = self._rmsprop_update_numpy(
@@ -152,10 +166,11 @@ class RMSpropOptimizerTest(test.TestCase):
           if centered:
             self.assertAllCloseAccordingToType(mg0_np, self.evaluate(mg0))
             self.assertAllCloseAccordingToType(mg1_np, self.evaluate(mg1))
+          if momentum > 0.:
+            self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0))
+            self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1))
           self.assertAllCloseAccordingToType(rms0_np, self.evaluate(rms0))
           self.assertAllCloseAccordingToType(rms1_np, self.evaluate(rms1))
-          self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0))
-          self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1))
           self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
           self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
@@ -191,10 +206,12 @@ class RMSpropOptimizerTest(test.TestCase):
     self.assertTrue(rms0 is not None)
     rms1 = opt.get_slot(var1, "rms")
     self.assertTrue(rms1 is not None)
-    mom0 = opt.get_slot(var0, "momentum")
-    self.assertTrue(mom0 is not None)
-    mom1 = opt.get_slot(var1, "momentum")
-    self.assertTrue(mom1 is not None)
+    if momentum > 0.:
+      mom0 = opt.get_slot(var0, "momentum")
+      mom1 = opt.get_slot(var1, "momentum")
+    else:
+      mom0 = None
+      mom1 = None
 
     mg0_np = np.array([0.0, 0.0])
     mg1_np = np.array([0.0, 0.0])
@@ -222,8 +239,81 @@ class RMSpropOptimizerTest(test.TestCase):
       # Validate updated params
       self.assertAllCloseAccordingToType(rms0_np, self.evaluate(rms0))
       self.assertAllCloseAccordingToType(rms1_np, self.evaluate(rms1))
-      self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0))
-      self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1))
+      if momentum > 0.:
+        self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0))
+        self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1))
+      self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+      self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  @test_util.run_deprecated_v1
+  def testDenseWithLearningRateInverseTimeDecay(self):
+    var0_np = np.array([1.0, 2.0])
+    grads0_np = np.array([0.1, 0.2])
+    var1_np = np.array([3.0, 4.0])
+    grads1_np = np.array([0.01, 0.2])
+
+    var0 = resource_variable_ops.ResourceVariable(var0_np)
+    var1 = resource_variable_ops.ResourceVariable(var1_np)
+    grads0 = constant_op.constant(grads0_np)
+    grads1 = constant_op.constant(grads1_np)
+    learning_rate = 0.01
+    rho = 0.9
+    momentum = 0.0
+    epsilon = 1e-7
+    centered = False
+    decay = 0.5
+    lr_schedule = learning_rate_schedule.InverseTimeDecay(
+        learning_rate, decay_steps=1.0, decay_rate=decay)
+    opt = rmsprop.RMSprop(
+        learning_rate=lr_schedule,
+        rho=rho,
+        momentum=momentum,
+        epsilon=epsilon,
+        centered=centered)
+
+    update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+    self.evaluate(variables.global_variables_initializer())
+
+    rms0 = opt.get_slot(var0, "rms")
+    self.assertTrue(rms0 is not None)
+    rms1 = opt.get_slot(var1, "rms")
+    self.assertTrue(rms1 is not None)
+    if momentum > 0.:
+      mom0 = opt.get_slot(var0, "momentum")
+      mom1 = opt.get_slot(var1, "momentum")
+    else:
+      mom0 = None
+      mom1 = None
+
+    mg0_np = np.array([0.0, 0.0])
+    mg1_np = np.array([0.0, 0.0])
+    rms0_np = np.array([0.0, 0.0])
+    rms1_np = np.array([0.0, 0.0])
+    mom0_np = np.array([0.0, 0.0])
+    mom1_np = np.array([0.0, 0.0])
+
+    # Fetch params to validate initial values
+    self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+    self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+    # Run 4 steps of RMSprop
+    for t in range(2):
+      self.evaluate(update)
+
+      lr = learning_rate / (1 + decay * t)
+      var0_np, mg0_np, rms0_np, mom0_np = self._rmsprop_update_numpy(
+          var0_np, grads0_np, mg0_np, rms0_np, mom0_np, lr, rho, momentum,
+          epsilon, centered)
+      var1_np, mg1_np, rms1_np, mom1_np = self._rmsprop_update_numpy(
+          var1_np, grads1_np, mg1_np, rms1_np, mom1_np, lr, rho, momentum,
+          epsilon, centered)
+
+      # Validate updated params
+      self.assertAllCloseAccordingToType(rms0_np, self.evaluate(rms0))
+      self.assertAllCloseAccordingToType(rms1_np, self.evaluate(rms1))
+      if momentum > 0.:
+        self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0))
+        self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1))
       self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
       self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
@@ -233,8 +323,11 @@ class RMSpropOptimizerTest(test.TestCase):
       with self.cached_session():
         var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
         x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
-        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
-        loss = pred * pred
+
+        def loss():
+          pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)  # pylint: disable=cell-var-from-loop
+          return pred * pred
+
         sgd_op = rmsprop.RMSprop(
             learning_rate=1.0,
             rho=0.0,
@@ -258,8 +351,12 @@ class RMSpropOptimizerTest(test.TestCase):
       with self.cached_session():
         var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
         x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
-        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
-        loss = pred * pred
+
+        def loss():
+          pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)  # pylint: disable=cell-var-from-loop
+          return pred * pred
+
+        # loss = lambda: pred * pred  # pylint: disable=cell-var-from-loop
         sgd_op = rmsprop.RMSprop(
             learning_rate=1.0,
             rho=0.0,
@@ -318,10 +415,12 @@ class RMSpropOptimizerTest(test.TestCase):
         self.assertTrue(rms0 is not None)
         rms1 = opt.get_slot(var1, "rms")
         self.assertTrue(rms1 is not None)
-        mom0 = opt.get_slot(var0, "momentum")
-        self.assertTrue(mom0 is not None)
-        mom1 = opt.get_slot(var1, "momentum")
-        self.assertTrue(mom1 is not None)
+        if momentum > 0.:
+          mom0 = opt.get_slot(var0, "momentum")
+          mom1 = opt.get_slot(var1, "momentum")
+        else:
+          mom0 = None
+          mom1 = None
 
         mg0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
         mg1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
@@ -334,8 +433,8 @@ class RMSpropOptimizerTest(test.TestCase):
         self.assertAllClose([1.0, 2.0], self.evaluate(var0))
         self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
-        # Run 4 steps of RMSprop
-        for _ in range(1, 5):
+        # Run 3 steps of RMSprop
+        for _ in range(1, 4):
           self.evaluate(update)
 
           var0_np, mg0_np, rms0_np, mom0_np = self._sparse_rmsprop_update_numpy(
@@ -351,8 +450,9 @@ class RMSpropOptimizerTest(test.TestCase):
             self.assertAllCloseAccordingToType(mg1_np, self.evaluate(mg1))
           self.assertAllCloseAccordingToType(rms0_np, self.evaluate(rms0))
           self.assertAllCloseAccordingToType(rms1_np, self.evaluate(rms1))
-          self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0))
-          self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1))
+          if momentum > 0.:
+            self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0))
+            self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1))
           self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
           self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
@@ -405,6 +505,54 @@ class RMSpropOptimizerTest(test.TestCase):
                 (0.01 * 2.0 / math.sqrt(0.00001 * 0.9 + 1e-5 + 1.0))
             ]), self.evaluate(var1))
 
+  def testConstructRMSpropWithLR(self):
+    opt = rmsprop.RMSprop(lr=1.0)
+    opt_2 = rmsprop.RMSprop(learning_rate=0.1, lr=1.0)
+    opt_3 = rmsprop.RMSprop(learning_rate=0.1)
+    self.assertIsInstance(opt.lr, variables.Variable)
+    self.assertIsInstance(opt_2.lr, variables.Variable)
+    self.assertIsInstance(opt_3.lr, variables.Variable)
+
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllClose(self.evaluate(opt.lr), (1.0))
+    self.assertAllClose(self.evaluate(opt_2.lr), (1.0))
+    self.assertAllClose(self.evaluate(opt_3.lr), (0.1))
+
+  def testSlotsUniqueEager(self):
+    with context.eager_mode():
+      v1 = variables.Variable(1.)
+      v2 = variables.Variable(1.)
+
+      opt = rmsprop.RMSprop(1., momentum=0., centered=False)
+      opt.minimize(lambda: v1 + v2, var_list=[v1, v2])
+      # There should be iteration, and one unique slot variable for v1 and v2.
+      self.assertEqual(3, len(set(opt.variables())))
+      self.assertEqual(
+          self.evaluate(opt.variables()[0]), self.evaluate(opt.iterations))
+
+      opt = rmsprop.RMSprop(learning_rate=1., momentum=0.2, centered=False)
+      opt.minimize(lambda: v1 + v2, var_list=[v1, v2])
+      # There should be iteration, and two unique slot variables for v1 and v2.
+      self.assertEqual(5, len(set(opt.variables())))
+      self.assertEqual(
+          self.evaluate(opt.variables()[0]), self.evaluate(opt.iterations))
+
+      opt = rmsprop.RMSprop(learning_rate=1., momentum=0.2, centered=True)
+      opt.minimize(lambda: v1 + v2, var_list=[v1, v2])
+      # There should be iteration, and three unique slot variables for v1 and v2
+      self.assertEqual(7, len(set(opt.variables())))
+      self.assertEqual(
+          self.evaluate(opt.variables()[0]), self.evaluate(opt.iterations))
+
+  def testConstructRMSpropWithEpsilonValues(self):
+    opt = rmsprop.RMSprop(epsilon=None)
+    config = opt.get_config()
+    self.assertEqual(config["epsilon"], 1e-7)
+
+    opt = rmsprop.RMSprop(epsilon=1e-8)
+    config = opt.get_config()
+    self.assertEqual(config["epsilon"], 1e-8)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/keras/optimizers.py b/tensorflow/python/keras/optimizers.py
index ee6dbba5ad62ee4b35101d1496a77ae91412fd64..1fb8f8d2802d5e067a6c8ed79915147e086cf172 100644
--- a/tensorflow/python/keras/optimizers.py
+++ b/tensorflow/python/keras/optimizers.py
@@ -22,7 +22,6 @@ from __future__ import print_function
 import six
 from six.moves import zip  # pylint: disable=redefined-builtin
 
-from tensorflow.python import tf2
 from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.framework import ops
 from tensorflow.python.keras import backend as K
@@ -30,6 +29,7 @@ from tensorflow.python.keras.optimizer_v2 import adadelta as adadelta_v2
 from tensorflow.python.keras.optimizer_v2 import adagrad as adagrad_v2
 from tensorflow.python.keras.optimizer_v2 import adam as adam_v2
 from tensorflow.python.keras.optimizer_v2 import adamax as adamax_v2
+from tensorflow.python.keras.optimizer_v2 import ftrl
 from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_descent_v2
 from tensorflow.python.keras.optimizer_v2 import nadam as nadam_v2
 from tensorflow.python.keras.optimizer_v2 import optimizer_v2
@@ -41,11 +41,10 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.training import optimizer as tf_optimizer_module
 from tensorflow.python.training import training_util
-from tensorflow.python.training.checkpointable import base as checkpointable
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.training.tracking import base as trackable
+from tensorflow.python.util.tf_export import keras_export
 
 
-@tf_export('keras.optimizers.Optimizer')
 class Optimizer(object):
   """Abstract optimizer base class.
 
@@ -159,7 +158,6 @@ class Optimizer(object):
     return cls(**config)
 
 
-@tf_export('keras.optimizers.SGD')
 class SGD(Optimizer):
   """Stochastic gradient descent optimizer.
 
@@ -224,7 +222,6 @@ class SGD(Optimizer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.optimizers.RMSprop')
 class RMSprop(Optimizer):
   """RMSProp optimizer.
 
@@ -291,7 +288,6 @@ class RMSprop(Optimizer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.optimizers.Adagrad')
 class Adagrad(Optimizer):
   """Adagrad optimizer.
 
@@ -358,7 +354,6 @@ class Adagrad(Optimizer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.optimizers.Adadelta')
 class Adadelta(Optimizer):
   """Adadelta optimizer.
 
@@ -442,7 +437,6 @@ class Adadelta(Optimizer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.optimizers.Adam')
 class Adam(Optimizer):
   """Adam optimizer.
 
@@ -539,7 +533,6 @@ class Adam(Optimizer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.optimizers.Adamax')
 class Adamax(Optimizer):
   """Adamax optimizer from Adam paper's Section 7.
 
@@ -575,7 +568,7 @@ class Adamax(Optimizer):
 
   def get_updates(self, loss, params):
     grads = self.get_gradients(loss, params)
-    self.updates = [state_ops.assign_add(self.iterations, 1)]
+    self.updates = []
 
     lr = self.lr
     if self.initial_decay > 0:
@@ -583,7 +576,8 @@ class Adamax(Optimizer):
           1. / (1. + self.decay * math_ops.cast(self.iterations,
                                                 K.dtype(self.decay))))
 
-    t = math_ops.cast(self.iterations, K.floatx()) + 1
+    with ops.control_dependencies([state_ops.assign_add(self.iterations, 1)]):
+      t = math_ops.cast(self.iterations, K.floatx())
     lr_t = lr / (1. - math_ops.pow(self.beta_1, t))
 
     shapes = [K.int_shape(p) for p in params]
@@ -622,7 +616,6 @@ class Adamax(Optimizer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-@tf_export('keras.optimizers.Nadam')
 class Nadam(Optimizer):
   """Nesterov Adam optimizer.
 
@@ -661,9 +654,10 @@ class Nadam(Optimizer):
 
   def get_updates(self, loss, params):
     grads = self.get_gradients(loss, params)
-    self.updates = [state_ops.assign_add(self.iterations, 1)]
+    self.updates = []
 
-    t = math_ops.cast(self.iterations, K.floatx()) + 1
+    with ops.control_dependencies([state_ops.assign_add(self.iterations, 1)]):
+      t = math_ops.cast(self.iterations, K.floatx())
 
     # Due to the recommendations in [2], i.e. warming momentum schedule
     momentum_cache_t = self.beta_1 * (
@@ -680,7 +674,7 @@ class Nadam(Optimizer):
     ms = [K.zeros(shape) for shape in shapes]
     vs = [K.zeros(shape) for shape in shapes]
 
-    self.weights = [self.iterations] + ms + vs
+    self.weights = [self.iterations, self.m_schedule] + ms + vs
 
     for p, g, m, v in zip(params, grads, ms, vs):
       # the following equations given in [1]
@@ -717,19 +711,19 @@ class Nadam(Optimizer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-class TFOptimizer(Optimizer, checkpointable.CheckpointableBase):
+class TFOptimizer(Optimizer, trackable.Trackable):
   """Wrapper class for native TensorFlow optimizers.
   """
 
   def __init__(self, optimizer, iterations=None):  # pylint: disable=super-init-not-called
     self.optimizer = optimizer
-    self._track_checkpointable(optimizer, name='optimizer')
+    self._track_trackable(optimizer, name='optimizer')
     if iterations is None:
       with K.name_scope(self.__class__.__name__):
         self.iterations = K.variable(0, dtype='int64', name='iterations')
     else:
       self.iterations = iterations
-    self._track_checkpointable(self.iterations, name='global_step')
+    self._track_trackable(self.iterations, name='global_step')
 
   def apply_gradients(self, grads):
     self.optimizer.apply_gradients(grads, global_step=self.iterations)
@@ -738,7 +732,7 @@ class TFOptimizer(Optimizer, checkpointable.CheckpointableBase):
     return self.optimizer.compute_gradients(loss, params)
 
   def get_updates(self, loss, params):
-    if distribution_strategy_context.has_distribution_strategy():
+    if distribution_strategy_context.has_strategy():
       self.updates = []
 
       if not params:
@@ -787,12 +781,12 @@ adamax = Adamax
 nadam = Nadam
 
 
-@tf_export('keras.optimizers.serialize')
+@keras_export('keras.optimizers.serialize')
 def serialize(optimizer):
   return serialize_keras_object(optimizer)
 
 
-@tf_export('keras.optimizers.deserialize')
+@keras_export('keras.optimizers.deserialize')
 def deserialize(config, custom_objects=None):
   """Inverse of the `serialize` function.
 
@@ -806,27 +800,17 @@ def deserialize(config, custom_objects=None):
   Returns:
       A Keras Optimizer instance.
   """
-  if tf2.enabled():
-    all_classes = {
-        'adadelta': adadelta_v2.Adadelta,
-        'adagrad': adagrad_v2.Adagrad,
-        'adam': adam_v2.Adam,
-        'adamax': adamax_v2.Adamax,
-        'nadam': nadam_v2.Nadam,
-        'rmsprop': rmsprop_v2.RMSprop,
-        'sgd': gradient_descent_v2.SGD
-    }
-  else:
-    all_classes = {
-        'adadelta': Adadelta,
-        'adagrad': Adagrad,
-        'adam': Adam,
-        'adamax': Adamax,
-        'nadam': Nadam,
-        'rmsprop': RMSprop,
-        'sgd': SGD,
-        'tfoptimizer': TFOptimizer
-    }
+  all_classes = {
+      'adadelta': adadelta_v2.Adadelta,
+      'adagrad': adagrad_v2.Adagrad,
+      'adam': adam_v2.Adam,
+      'adamax': adamax_v2.Adamax,
+      'nadam': nadam_v2.Nadam,
+      'rmsprop': rmsprop_v2.RMSprop,
+      'sgd': gradient_descent_v2.SGD,
+      'ftrl': ftrl.Ftrl
+  }
+
   # Make deserialization case-insensitive for built-in optimizers.
   if config['class_name'].lower() in all_classes:
     config['class_name'] = config['class_name'].lower()
@@ -837,7 +821,7 @@ def deserialize(config, custom_objects=None):
       printable_module_name='optimizer')
 
 
-@tf_export('keras.optimizers.get')
+@keras_export('keras.optimizers.get')
 def get(identifier):
   """Retrieves a Keras Optimizer instance.
 
diff --git a/tensorflow/python/keras/optimizers_test.py b/tensorflow/python/keras/optimizers_test.py
index 77104a5d4d526792dde209b3c7cce2262a138dce..e3ae442cedbe3d1eaee1cf759e434cc6475e6a5a 100644
--- a/tensorflow/python/keras/optimizers_test.py
+++ b/tensorflow/python/keras/optimizers_test.py
@@ -44,114 +44,117 @@ def _get_model(input_dim, num_hidden, output_dim):
   return model
 
 
-def _test_optimizer(optimizer, target=0.75):
-  np.random.seed(1337)
-  (x_train, y_train), _ = testing_utils.get_test_data(train_samples=1000,
-                                                      test_samples=200,
-                                                      input_shape=(10,),
-                                                      num_classes=2)
-  y_train = keras.utils.to_categorical(y_train)
-  model = _get_model(x_train.shape[1], 20, y_train.shape[1])
-  model.compile(loss='categorical_crossentropy',
-                optimizer=optimizer,
-                metrics=['accuracy'])
-  np.testing.assert_equal(keras.backend.get_value(model.optimizer.iterations),
-                          0)
-  history = model.fit(x_train, y_train, epochs=2, batch_size=16, verbose=0)
-  np.testing.assert_equal(keras.backend.get_value(model.optimizer.iterations),
-                          126)  # 63 steps per epoch
-  assert history.history['acc'][-1] >= target
-  config = keras.optimizers.serialize(optimizer)
-  optim = keras.optimizers.deserialize(config)
-  new_config = keras.optimizers.serialize(optim)
-  new_config['class_name'] = new_config['class_name'].lower()
-  assert config == new_config
-
-  # Test constraints.
-  model = keras.models.Sequential()
-  dense = keras.layers.Dense(10,
-                             input_shape=(x_train.shape[1],),
-                             kernel_constraint=lambda x: 0. * x + 1.,
-                             bias_constraint=lambda x: 0. * x + 2.,
-                             activation='relu')
-  model.add(dense)
-  model.add(keras.layers.Dense(y_train.shape[1], activation='softmax'))
-  model.compile(loss='categorical_crossentropy',
-                optimizer=optimizer,
-                metrics=['accuracy'])
-  np.testing.assert_equal(keras.backend.get_value(model.optimizer.iterations),
-                          126)  # Using same optimizer from before
-  model.train_on_batch(x_train[:10], y_train[:10])
-  np.testing.assert_equal(keras.backend.get_value(model.optimizer.iterations),
-                          127)
-  kernel, bias = dense.get_weights()
-  np.testing.assert_allclose(kernel, 1., atol=1e-3)
-  np.testing.assert_allclose(bias, 2., atol=1e-3)
-
-
 class KerasOptimizersTest(test.TestCase):
 
-  @test_util.run_v1_only('b/120545219')
+  def _test_optimizer(self, optimizer, target=0.75):
+    np.random.seed(1337)
+    (x_train, y_train), _ = testing_utils.get_test_data(
+        train_samples=1000, test_samples=200, input_shape=(10,), num_classes=2)
+    y_train = keras.utils.to_categorical(y_train)
+    model = _get_model(x_train.shape[1], 20, y_train.shape[1])
+    model.compile(
+        loss='categorical_crossentropy', optimizer=optimizer, metrics=['acc'])
+    np.testing.assert_equal(
+        keras.backend.get_value(model.optimizer.iterations), 0)
+    history = model.fit(x_train, y_train, epochs=2, batch_size=16, verbose=0)
+    np.testing.assert_equal(
+        keras.backend.get_value(model.optimizer.iterations),
+        126)  # 63 steps per epoch
+    self.assertGreaterEqual(history.history['acc'][-1], target)
+    config = keras.optimizers.serialize(optimizer)
+    optim = keras.optimizers.deserialize(config)
+    new_config = keras.optimizers.serialize(optim)
+    new_config['class_name'] = new_config['class_name'].lower()
+    new_config['config'].pop('name', None)
+    if 'amsgrad' not in config['config']:
+      new_config['config'].pop('amsgrad', None)
+    if 'decay' in new_config['config'] and 'schedule_decay' in config['config']:
+      new_config['config']['schedule_decay'] = new_config['config'].pop('decay')
+    if 'momentum' not in config['config']:
+      new_config['config'].pop('momentum', None)
+    if 'centered' not in config['config']:
+      new_config['config'].pop('centered', None)
+    self.assertDictEqual(config, new_config)
+
+    # Test constraints.
+    model = keras.models.Sequential()
+    dense = keras.layers.Dense(
+        10,
+        input_shape=(x_train.shape[1],),
+        kernel_constraint=lambda x: 0. * x + 1.,
+        bias_constraint=lambda x: 0. * x + 2.,
+        activation='relu')
+    model.add(dense)
+    model.add(keras.layers.Dense(y_train.shape[1], activation='softmax'))
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=optimizer,
+        metrics=['accuracy'])
+    np.testing.assert_equal(
+        keras.backend.get_value(model.optimizer.iterations),
+        126)  # Using same optimizer from before
+    model.train_on_batch(x_train[:10], y_train[:10])
+    np.testing.assert_equal(
+        keras.backend.get_value(model.optimizer.iterations), 127)
+    kernel, bias = dense.get_weights()
+    np.testing.assert_allclose(kernel, 1., atol=1e-3)
+    np.testing.assert_allclose(bias, 2., atol=1e-3)
+
   def test_sgd(self):
     with self.cached_session():
-      _test_optimizer(keras.optimizers.SGD(lr=0.01,
-                                           momentum=0.9,
-                                           nesterov=True))
+      self._test_optimizer(keras.optimizers.SGD())
+
+  def test_momentum(self):
+    with self.cached_session():
+      self._test_optimizer(
+          keras.optimizers.SGD(lr=0.01, momentum=0.9, nesterov=True))
 
-  @test_util.run_v1_only('b/120545219')
   def test_rmsprop(self):
     with self.cached_session():
-      _test_optimizer(keras.optimizers.RMSprop())
-      _test_optimizer(keras.optimizers.RMSprop(decay=1e-3))
+      self._test_optimizer(keras.optimizers.RMSprop())
+      self._test_optimizer(keras.optimizers.RMSprop(decay=1e-3))
 
-  @test_util.run_v1_only('b/120545219')
   def test_adagrad(self):
     with self.cached_session():
-      _test_optimizer(keras.optimizers.Adagrad())
-      _test_optimizer(keras.optimizers.Adagrad(decay=1e-3))
+      self._test_optimizer(keras.optimizers.Adagrad())
+      self._test_optimizer(keras.optimizers.Adagrad(decay=1e-3))
 
-  @test_util.run_v1_only('b/120545219')
   def test_adadelta(self):
     with self.cached_session():
-      _test_optimizer(keras.optimizers.Adadelta(), target=0.6)
+      self._test_optimizer(keras.optimizers.Adadelta(), target=0.6)
       # Accuracy seems dependent on the initialization. Even adding tf.Print
       # nodes in the graph seemed to affect the initialization seed, and hence
       # the accuracy.
-      _test_optimizer(keras.optimizers.Adadelta(decay=1e-3), target=0.4)
+      self._test_optimizer(keras.optimizers.Adadelta(decay=1e-3), target=0.4)
 
-  @test_util.run_v1_only('b/120545219')
   def test_adam(self):
     with self.cached_session():
-      _test_optimizer(keras.optimizers.Adam())
-      _test_optimizer(keras.optimizers.Adam(decay=1e-3))
-      _test_optimizer(keras.optimizers.Adam(amsgrad=True))
+      self._test_optimizer(keras.optimizers.Adam())
+      # Accuracy seems dependent on the seed initialization.
+      # TODO(b/121051441): fix test flakiness.
+      self._test_optimizer(keras.optimizers.Adam(decay=1e-3), target=0.73)
+      self._test_optimizer(keras.optimizers.Adam(amsgrad=True))
 
-  @test_util.run_v1_only('b/120545219')
   def test_adamax(self):
     with self.cached_session():
-      _test_optimizer(keras.optimizers.Adamax())
-      _test_optimizer(keras.optimizers.Adamax(decay=1e-3))
+      self._test_optimizer(keras.optimizers.Adamax())
+      self._test_optimizer(keras.optimizers.Adamax(decay=1e-3))
 
-  @test_util.run_v1_only('b/120545219')
   def test_nadam(self):
     with self.cached_session():
-      _test_optimizer(keras.optimizers.Nadam())
+      self._test_optimizer(keras.optimizers.Nadam())
 
-  @test_util.run_v1_only('b/120545219')
   def test_clipnorm(self):
     with self.cached_session():
-      _test_optimizer(keras.optimizers.SGD(lr=0.01,
-                                           momentum=0.9,
-                                           clipnorm=0.5))
+      self._test_optimizer(
+          keras.optimizers.SGD(lr=0.01, momentum=0.9, clipnorm=0.5))
 
-  @test_util.run_v1_only('b/120545219')
   def test_clipvalue(self):
     with self.cached_session():
-      _test_optimizer(keras.optimizers.SGD(lr=0.01,
-                                           momentum=0.9,
-                                           clipvalue=0.5))
+      self._test_optimizer(
+          keras.optimizers.SGD(lr=0.01, momentum=0.9, clipvalue=0.5))
 
-  def test_tfoptimizer(self):
+  def test_tf_optimizer(self):
     optimizer = keras.optimizers.TFOptimizer(AdamOptimizer(0.01))
     model = keras.models.Sequential()
     model.add(keras.layers.Dense(
@@ -185,8 +188,7 @@ class KerasOptimizersTest(test.TestCase):
     self.assertIs(graph_weak(), None)
     self.assertIs(optimizer_weak(), None)
 
-  @test_util.run_in_graph_and_eager_modes
-  def test_tfoptimizer_iterations(self):
+  def test_tf_optimizer_iterations(self):
     with self.cached_session():
       optimizer = keras.optimizers.TFOptimizer(AdamOptimizer(0.01))
       model = keras.models.Sequential()
diff --git a/tensorflow/python/keras/preprocessing/image.py b/tensorflow/python/keras/preprocessing/image.py
index e33993950d12f259cb6158b3496edbcfa6be5400..f2fefffb0a8fa62475216db9658288a4b9290ff7 100644
--- a/tensorflow/python/keras/preprocessing/image.py
+++ b/tensorflow/python/keras/preprocessing/image.py
@@ -30,7 +30,7 @@ except ImportError:
 from tensorflow.python.keras import backend
 from tensorflow.python.keras import utils
 from tensorflow.python.util import tf_inspect
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 random_rotation = image.random_rotation
 random_shift = image.random_shift
@@ -44,7 +44,7 @@ apply_affine_transform = image.apply_affine_transform
 load_img = image.load_img
 
 
-@tf_export('keras.preprocessing.image.array_to_img')
+@keras_export('keras.preprocessing.image.array_to_img')
 def array_to_img(x, data_format=None, scale=True, dtype=None):
   """Converts a 3D Numpy array to a PIL Image instance.
 
@@ -74,7 +74,7 @@ def array_to_img(x, data_format=None, scale=True, dtype=None):
   return image.array_to_img(x, data_format=data_format, scale=scale, **kwargs)
 
 
-@tf_export('keras.preprocessing.image.img_to_array')
+@keras_export('keras.preprocessing.image.img_to_array')
 def img_to_array(img, data_format=None, dtype=None):
   """Converts a PIL Image instance to a Numpy array.
 
@@ -101,7 +101,7 @@ def img_to_array(img, data_format=None, dtype=None):
   return image.img_to_array(img, data_format=data_format, **kwargs)
 
 
-@tf_export('keras.preprocessing.image.save_img')
+@keras_export('keras.preprocessing.image.save_img')
 def save_img(path,
              x,
              data_format=None,
@@ -131,12 +131,12 @@ def save_img(path,
                  scale=scale, **kwargs)
 
 
-@tf_export('keras.preprocessing.image.Iterator')
+@keras_export('keras.preprocessing.image.Iterator')
 class Iterator(image.Iterator, utils.Sequence):
   pass
 
 
-@tf_export('keras.preprocessing.image.DirectoryIterator')
+@keras_export('keras.preprocessing.image.DirectoryIterator')
 class DirectoryIterator(image.DirectoryIterator, Iterator):
   """Iterator capable of reading images from a directory on disk.
 
@@ -227,7 +227,7 @@ class DirectoryIterator(image.DirectoryIterator, Iterator):
         **kwargs)
 
 
-@tf_export('keras.preprocessing.image.NumpyArrayIterator')
+@keras_export('keras.preprocessing.image.NumpyArrayIterator')
 class NumpyArrayIterator(image.NumpyArrayIterator, Iterator):
   """Iterator yielding data from a Numpy array.
 
@@ -291,7 +291,7 @@ class NumpyArrayIterator(image.NumpyArrayIterator, Iterator):
         **kwargs)
 
 
-@tf_export('keras.preprocessing.image.ImageDataGenerator')
+@keras_export('keras.preprocessing.image.ImageDataGenerator')
 class ImageDataGenerator(image.ImageDataGenerator):
   """Generate batches of tensor image data with real-time data augmentation.
 
@@ -518,16 +518,17 @@ class ImageDataGenerator(image.ImageDataGenerator):
         validation_split=validation_split,
         **kwargs)
 
-tf_export('keras.preprocessing.image.random_rotation')(random_rotation)
-tf_export('keras.preprocessing.image.random_shift')(random_shift)
-tf_export('keras.preprocessing.image.random_shear')(random_shear)
-tf_export('keras.preprocessing.image.random_zoom')(random_zoom)
-tf_export('keras.preprocessing.image.apply_channel_shift')(apply_channel_shift)
-tf_export(
+keras_export('keras.preprocessing.image.random_rotation')(random_rotation)
+keras_export('keras.preprocessing.image.random_shift')(random_shift)
+keras_export('keras.preprocessing.image.random_shear')(random_shear)
+keras_export('keras.preprocessing.image.random_zoom')(random_zoom)
+keras_export(
+    'keras.preprocessing.image.apply_channel_shift')(apply_channel_shift)
+keras_export(
     'keras.preprocessing.image.random_channel_shift')(random_channel_shift)
-tf_export(
+keras_export(
     'keras.preprocessing.image.apply_brightness_shift')(apply_brightness_shift)
-tf_export('keras.preprocessing.image.random_brightness')(random_brightness)
-tf_export(
+keras_export('keras.preprocessing.image.random_brightness')(random_brightness)
+keras_export(
     'keras.preprocessing.image.apply_affine_transform')(apply_affine_transform)
-tf_export('keras.preprocessing.image.load_img')(load_img)
+keras_export('keras.preprocessing.image.load_img')(load_img)
diff --git a/tensorflow/python/keras/preprocessing/image_test.py b/tensorflow/python/keras/preprocessing/image_test.py
index 4abaadfcd305f493b163ad710d11c977b3d1adac..f7cbb589dc9de63e4426a0a0338a67f78d7f07d3 100644
--- a/tensorflow/python/keras/preprocessing/image_test.py
+++ b/tensorflow/python/keras/preprocessing/image_test.py
@@ -386,6 +386,8 @@ class TestImage(test.TestCase):
     _ = keras.preprocessing.image.random_shift(x, 0.2, 0.2)
     _ = keras.preprocessing.image.random_shear(x, 2.)
     _ = keras.preprocessing.image.random_zoom(x, (0.5, 0.5))
+    _ = keras.preprocessing.image.apply_channel_shift(x, 2, 2)
+    _ = keras.preprocessing.image.apply_affine_transform(x, 2)
     with self.assertRaises(ValueError):
       keras.preprocessing.image.random_zoom(x, (0, 0, 0))
     _ = keras.preprocessing.image.random_channel_shift(x, 2.)
diff --git a/tensorflow/python/keras/preprocessing/sequence.py b/tensorflow/python/keras/preprocessing/sequence.py
index f014668909bf333af0d78ab89e3e1493efde8236..1d73a1e4da57cc4b7545d286ab1a2000618a8c5b 100644
--- a/tensorflow/python/keras/preprocessing/sequence.py
+++ b/tensorflow/python/keras/preprocessing/sequence.py
@@ -22,7 +22,7 @@ from __future__ import print_function
 from keras_preprocessing import sequence
 
 from tensorflow.python.keras import utils
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 pad_sequences = sequence.pad_sequences
 make_sampling_table = sequence.make_sampling_table
@@ -31,7 +31,7 @@ skipgrams = sequence.skipgrams
 _remove_long_seq = sequence._remove_long_seq  # pylint: disable=protected-access
 
 
-@tf_export('keras.preprocessing.sequence.TimeseriesGenerator')
+@keras_export('keras.preprocessing.sequence.TimeseriesGenerator')
 class TimeseriesGenerator(sequence.TimeseriesGenerator, utils.Sequence):
   """Utility class for generating batches of temporal data.
   This class takes in a sequence of data-points gathered at
@@ -89,7 +89,7 @@ class TimeseriesGenerator(sequence.TimeseriesGenerator, utils.Sequence):
   pass
 
 
-tf_export('keras.preprocessing.sequence.pad_sequences')(pad_sequences)
-tf_export(
+keras_export('keras.preprocessing.sequence.pad_sequences')(pad_sequences)
+keras_export(
     'keras.preprocessing.sequence.make_sampling_table')(make_sampling_table)
-tf_export('keras.preprocessing.sequence.skipgrams')(skipgrams)
+keras_export('keras.preprocessing.sequence.skipgrams')(skipgrams)
diff --git a/tensorflow/python/keras/preprocessing/text.py b/tensorflow/python/keras/preprocessing/text.py
index 57e5d00e0486694f8034453d56247029164f9849..f10a768c31fb862f029cda45ebbdc7c9a4038f9a 100644
--- a/tensorflow/python/keras/preprocessing/text.py
+++ b/tensorflow/python/keras/preprocessing/text.py
@@ -21,15 +21,15 @@ from __future__ import print_function
 
 from keras_preprocessing import text
 
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 text_to_word_sequence = text.text_to_word_sequence
 one_hot = text.one_hot
 hashing_trick = text.hashing_trick
 Tokenizer = text.Tokenizer
 
-tf_export(
+keras_export(
     'keras.preprocessing.text.text_to_word_sequence')(text_to_word_sequence)
-tf_export('keras.preprocessing.text.one_hot')(one_hot)
-tf_export('keras.preprocessing.text.hashing_trick')(hashing_trick)
-tf_export('keras.preprocessing.text.Tokenizer')(Tokenizer)
+keras_export('keras.preprocessing.text.one_hot')(one_hot)
+keras_export('keras.preprocessing.text.hashing_trick')(hashing_trick)
+keras_export('keras.preprocessing.text.Tokenizer')(Tokenizer)
diff --git a/tensorflow/python/keras/regularizers.py b/tensorflow/python/keras/regularizers.py
index 28b6ad4c65a2919323b81c89de6e5a3d4b5d3ff3..2dabe504b0c5fcdb95223ace15a60c33bfa058e2 100644
--- a/tensorflow/python/keras/regularizers.py
+++ b/tensorflow/python/keras/regularizers.py
@@ -24,10 +24,10 @@ from tensorflow.python.keras import backend as K
 from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
 from tensorflow.python.ops import math_ops
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 
-@tf_export('keras.regularizers.Regularizer')
+@keras_export('keras.regularizers.Regularizer')
 class Regularizer(object):
   """Regularizer base class.
   """
@@ -40,7 +40,7 @@ class Regularizer(object):
     return cls(**config)
 
 
-@tf_export('keras.regularizers.L1L2')
+@keras_export('keras.regularizers.L1L2')
 class L1L2(Regularizer):
   """Regularizer for L1 and L2 regularization.
 
@@ -54,6 +54,8 @@ class L1L2(Regularizer):
     self.l2 = K.cast_to_floatx(l2)
 
   def __call__(self, x):
+    if not self.l1 and not self.l2:
+      return K.constant(0.)
     regularization = 0.
     if self.l1:
       regularization += math_ops.reduce_sum(self.l1 * math_ops.abs(x))
@@ -68,27 +70,27 @@ class L1L2(Regularizer):
 # Aliases.
 
 
-@tf_export('keras.regularizers.l1')
+@keras_export('keras.regularizers.l1')
 def l1(l=0.01):
   return L1L2(l1=l)
 
 
-@tf_export('keras.regularizers.l2')
+@keras_export('keras.regularizers.l2')
 def l2(l=0.01):
   return L1L2(l2=l)
 
 
-@tf_export('keras.regularizers.l1_l2')
+@keras_export('keras.regularizers.l1_l2')
 def l1_l2(l1=0.01, l2=0.01):  # pylint: disable=redefined-outer-name
   return L1L2(l1=l1, l2=l2)
 
 
-@tf_export('keras.regularizers.serialize')
+@keras_export('keras.regularizers.serialize')
 def serialize(regularizer):
   return serialize_keras_object(regularizer)
 
 
-@tf_export('keras.regularizers.deserialize')
+@keras_export('keras.regularizers.deserialize')
 def deserialize(config, custom_objects=None):
   return deserialize_keras_object(
       config,
@@ -97,7 +99,7 @@ def deserialize(config, custom_objects=None):
       printable_module_name='regularizer')
 
 
-@tf_export('keras.regularizers.get')
+@keras_export('keras.regularizers.get')
 def get(identifier):
   if identifier is None:
     return None
diff --git a/tensorflow/python/keras/regularizers_test.py b/tensorflow/python/keras/regularizers_test.py
index 3d6b259d87de8b6533d008a839f0df2226d71ed4..fb2439395bd94e781b9c4b7576c9b249ee44286f 100644
--- a/tensorflow/python/keras/regularizers_test.py
+++ b/tensorflow/python/keras/regularizers_test.py
@@ -18,9 +18,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
+
 from tensorflow.python import keras
-from tensorflow.python.keras import testing_utils
 from tensorflow.python.framework import test_util
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 
 
@@ -28,50 +30,54 @@ DATA_DIM = 5
 NUM_CLASSES = 2
 
 
-def get_data():
-  (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-      train_samples=10,
-      test_samples=10,
-      input_shape=(DATA_DIM,),
-      num_classes=NUM_CLASSES)
-  y_train = keras.utils.to_categorical(y_train, NUM_CLASSES)
-  y_test = keras.utils.to_categorical(y_test, NUM_CLASSES)
-  return (x_train, y_train), (x_test, y_test)
-
-
-def create_model(kernel_regularizer=None, activity_regularizer=None):
-  model = keras.models.Sequential()
-  model.add(keras.layers.Dense(NUM_CLASSES,
-                               kernel_regularizer=kernel_regularizer,
-                               activity_regularizer=activity_regularizer,
-                               input_shape=(DATA_DIM,)))
-  return model
+class KerasRegularizersTest(test.TestCase, parameterized.TestCase):
 
+  def create_model(self, kernel_regularizer=None, activity_regularizer=None):
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(NUM_CLASSES,
+                                 kernel_regularizer=kernel_regularizer,
+                                 activity_regularizer=activity_regularizer,
+                                 input_shape=(DATA_DIM,)))
+    return model
 
-class KerasRegularizersTest(test.TestCase):
+  def get_data(self):
+    (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
+        train_samples=10,
+        test_samples=10,
+        input_shape=(DATA_DIM,),
+        num_classes=NUM_CLASSES)
+    y_train = keras.utils.to_categorical(y_train, NUM_CLASSES)
+    y_test = keras.utils.to_categorical(y_test, NUM_CLASSES)
+    return (x_train, y_train), (x_test, y_test)
 
-  def test_kernel_regularization(self):
+  @parameterized.named_parameters([
+      ('l1', keras.regularizers.l1()),
+      ('l2', keras.regularizers.l2()),
+      ('l1_l2', keras.regularizers.l1_l2()),
+  ])
+  def test_kernel_regularization(self, regularizer):
     with self.cached_session():
-      (x_train, y_train), _ = get_data()
-      for reg in [keras.regularizers.l1(),
-                  keras.regularizers.l2(),
-                  keras.regularizers.l1_l2()]:
-        model = create_model(kernel_regularizer=reg)
-        model.compile(loss='categorical_crossentropy', optimizer='sgd')
-        assert len(model.losses) == 1
-        model.fit(x_train, y_train, batch_size=10,
-                  epochs=1, verbose=0)
+      (x_train, y_train), _ = self.get_data()
+      model = self.create_model(kernel_regularizer=regularizer)
+      model.compile(loss='categorical_crossentropy', optimizer='sgd')
+      assert len(model.losses) == 1
+      model.fit(x_train, y_train, batch_size=10,
+                epochs=1, verbose=0)
 
-  @test_util.run_deprecated_v1
-  def test_activity_regularization(self):
+  @parameterized.named_parameters([
+      ('l1', keras.regularizers.l1()),
+      ('l2', keras.regularizers.l2()),
+      ('l2_zero', keras.regularizers.l2(0.)),
+  ])
+  @test_util.deprecated_graph_mode_only
+  def test_activity_regularization(self, regularizer):
     with self.cached_session():
-      (x_train, y_train), _ = get_data()
-      for reg in [keras.regularizers.l1(), keras.regularizers.l2()]:
-        model = create_model(activity_regularizer=reg)
-        model.compile(loss='categorical_crossentropy', optimizer='sgd')
-        assert len(model.losses) == 1
-        model.fit(x_train, y_train, batch_size=10,
-                  epochs=1, verbose=0)
+      (x_train, y_train), _ = self.get_data()
+      model = self.create_model(activity_regularizer=regularizer)
+      model.compile(loss='categorical_crossentropy', optimizer='sgd')
+      assert len(model.losses) == 1
+      model.fit(x_train, y_train, batch_size=10,
+                epochs=1, verbose=0)
 
   def test_zero_regularization(self):
     inputs = keras.backend.ones(shape=(10, 10))
diff --git a/tensorflow/python/keras/saving/__init__.py b/tensorflow/python/keras/saving/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b32ae4041c6d5c6111de42c5401095607972281f
--- /dev/null
+++ b/tensorflow/python/keras/saving/__init__.py
@@ -0,0 +1,35 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utils for saving and loading Keras Models."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.keras.saving.hdf5_format import load_attributes_from_hdf5_group
+from tensorflow.python.keras.saving.hdf5_format import load_model
+from tensorflow.python.keras.saving.hdf5_format import load_weights_from_hdf5_group
+from tensorflow.python.keras.saving.hdf5_format import load_weights_from_hdf5_group_by_name
+from tensorflow.python.keras.saving.hdf5_format import preprocess_weights_for_loading
+from tensorflow.python.keras.saving.hdf5_format import save_attributes_to_hdf5_group
+from tensorflow.python.keras.saving.hdf5_format import save_model
+from tensorflow.python.keras.saving.hdf5_format import save_weights_to_hdf5_group
+from tensorflow.python.keras.saving.model_config import model_from_config
+from tensorflow.python.keras.saving.model_config import model_from_json
+from tensorflow.python.keras.saving.model_config import model_from_yaml
+from tensorflow.python.keras.saving.saved_model import export_saved_model
+from tensorflow.python.keras.saving.saved_model import load_from_saved_model
+from tensorflow.python.keras.saving.saving_utils import trace_model_call
+
+
diff --git a/tensorflow/python/keras/saving/hdf5_format.py b/tensorflow/python/keras/saving/hdf5_format.py
new file mode 100644
index 0000000000000000000000000000000000000000..973cdcac8a079bb7c19f4e004a6315e2da94ff86
--- /dev/null
+++ b/tensorflow/python/keras/saving/hdf5_format.py
@@ -0,0 +1,911 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=protected-access
+"""Functions for saving and loading a Keras Model from HDF5 format.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import json
+import os
+
+import numpy as np
+from six.moves import zip  # pylint: disable=redefined-builtin
+
+from tensorflow.python.framework import ops
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import optimizers
+from tensorflow.python.keras.saving import model_config as model_config_lib
+from tensorflow.python.keras.utils import conv_utils
+from tensorflow.python.keras.utils.io_utils import ask_to_proceed_with_overwrite
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import serialization
+from tensorflow.python.util.tf_export import keras_export
+
+# pylint: disable=g-import-not-at-top
+try:
+  import h5py
+  HDF5_OBJECT_HEADER_LIMIT = 64512
+except ImportError:
+  h5py = None
+# pylint: enable=g-import-not-at-top
+
+
+@keras_export('keras.models.save_model')
+def save_model(model, filepath, overwrite=True, include_optimizer=True):
+  """Saves a model to a HDF5 file.
+
+  The saved model contains:
+      - the model's configuration (topology)
+      - the model's weights
+      - the model's optimizer's state (if any)
+
+  Thus the saved model can be reinstantiated in
+  the exact same state, without any of the code
+  used for model definition or training.
+
+  Arguments:
+      model: Keras model instance to be saved.
+      filepath: One of the following:
+          - String, path where to save the model
+          - `h5py.File` object where to save the model
+      overwrite: Whether we should overwrite any existing
+          model at the target location, or instead
+          ask the user with a manual prompt.
+      include_optimizer: If True, save optimizer's state together.
+
+  Raises:
+      ImportError: if h5py is not available.
+  """
+
+  if h5py is None:
+    raise ImportError('`save_model` requires h5py.')
+
+  from tensorflow.python.keras import __version__ as keras_version  # pylint: disable=g-import-not-at-top
+
+  # TODO(psv) Add warning when we save models that contain non-serializable
+  # entities like metrics added using `add_metric` and losses added using
+  # `add_loss.`
+
+  if not isinstance(filepath, h5py.File):
+    # If file exists and should not be overwritten.
+    if not overwrite and os.path.isfile(filepath):
+      proceed = ask_to_proceed_with_overwrite(filepath)
+      if not proceed:
+        return
+
+    f = h5py.File(filepath, mode='w')
+    opened_new_file = True
+  else:
+    f = filepath
+    opened_new_file = False
+
+  try:
+    f.attrs['keras_version'] = str(keras_version).encode('utf8')
+    f.attrs['backend'] = K.backend().encode('utf8')
+    f.attrs['model_config'] = json.dumps(
+        {
+            'class_name': model.__class__.__name__,
+            'config': model.get_config()
+        },
+        default=serialization.get_json_type).encode('utf8')
+
+    model_weights_group = f.create_group('model_weights')
+    model_layers = model.layers
+    save_weights_to_hdf5_group(model_weights_group, model_layers)
+
+    if include_optimizer and model.optimizer:
+      if isinstance(model.optimizer, optimizers.TFOptimizer):
+        logging.warning(
+            'TensorFlow optimizers do not '
+            'make it possible to access '
+            'optimizer attributes or optimizer state '
+            'after instantiation. '
+            'As a result, we cannot save the optimizer '
+            'as part of the model save file. '
+            'You will have to compile your model again after loading it. '
+            'Prefer using a Keras optimizer instead '
+            '(see keras.io/optimizers).')
+      else:
+        f.attrs['training_config'] = json.dumps(
+            {
+                'optimizer_config': {
+                    'class_name': model.optimizer.__class__.__name__,
+                    'config': model.optimizer.get_config()
+                },
+                'loss': model.loss,
+                'metrics': model._compile_metrics,
+                'weighted_metrics': model._compile_weighted_metrics,
+                'sample_weight_mode': model.sample_weight_mode,
+                'loss_weights': model.loss_weights,
+            },
+            default=serialization.get_json_type).encode('utf8')
+
+        # Save optimizer weights.
+        save_optimizer_weights_to_hdf5_group(f, model.optimizer)
+    f.flush()
+  finally:
+    if opened_new_file:
+      f.close()
+
+
+@keras_export('keras.models.load_model')
+def load_model(filepath, custom_objects=None, compile=True):  # pylint: disable=redefined-builtin
+  """Loads a model saved via `save_model`.
+
+  Arguments:
+      filepath: One of the following:
+          - String, path to the saved model
+          - `h5py.File` object from which to load the model
+      custom_objects: Optional dictionary mapping names
+          (strings) to custom classes or functions to be
+          considered during deserialization.
+      compile: Boolean, whether to compile the model
+          after loading.
+
+  Returns:
+      A Keras model instance. If an optimizer was found
+      as part of the saved model, the model is already
+      compiled. Otherwise, the model is uncompiled and
+      a warning will be displayed. When `compile` is set
+      to False, the compilation is omitted without any
+      warning.
+
+  Raises:
+      ImportError: if h5py is not available.
+      ValueError: In case of an invalid savefile.
+  """
+  if h5py is None:
+    raise ImportError('`load_model` requires h5py.')
+
+  if not custom_objects:
+    custom_objects = {}
+
+  def convert_custom_objects(obj):
+    """Handles custom object lookup.
+
+    Arguments:
+        obj: object, dict, or list.
+
+    Returns:
+        The same structure, where occurrences
+            of a custom object name have been replaced
+            with the custom object.
+    """
+    if isinstance(obj, list):
+      deserialized = []
+      for value in obj:
+        deserialized.append(convert_custom_objects(value))
+      return deserialized
+    if isinstance(obj, dict):
+      deserialized = {}
+      for key, value in obj.items():
+        deserialized[key] = convert_custom_objects(value)
+      return deserialized
+    if obj in custom_objects:
+      return custom_objects[obj]
+    return obj
+
+  opened_new_file = not isinstance(filepath, h5py.File)
+  if opened_new_file:
+    f = h5py.File(filepath, mode='r')
+  else:
+    f = filepath
+
+  model = None
+  try:
+    # instantiate model
+    model_config = f.attrs.get('model_config')
+    if model_config is None:
+      raise ValueError('No model found in config file.')
+    model_config = json.loads(model_config.decode('utf-8'))
+    model = model_config_lib.model_from_config(model_config,
+                                               custom_objects=custom_objects)
+
+    # set weights
+    load_weights_from_hdf5_group(f['model_weights'], model.layers)
+
+    if compile:
+      # instantiate optimizer
+      training_config = f.attrs.get('training_config')
+      if training_config is None:
+        logging.warning('No training configuration found in save file: '
+                        'the model was *not* compiled. Compile it manually.')
+        return model
+      training_config = json.loads(training_config.decode('utf-8'))
+      optimizer_config = training_config['optimizer_config']
+      optimizer = optimizers.deserialize(
+          optimizer_config, custom_objects=custom_objects)
+
+      # Recover loss functions and metrics.
+      loss = convert_custom_objects(training_config['loss'])
+      metrics = convert_custom_objects(training_config['metrics'])
+      weighted_metrics = convert_custom_objects(
+          training_config.get('weighted_metrics', None))
+      sample_weight_mode = training_config['sample_weight_mode']
+      loss_weights = training_config['loss_weights']
+
+      # Compile model.
+      model.compile(
+          optimizer=optimizer,
+          loss=loss,
+          metrics=metrics,
+          weighted_metrics=weighted_metrics,
+          loss_weights=loss_weights,
+          sample_weight_mode=sample_weight_mode)
+
+      # Set optimizer weights.
+      if 'optimizer_weights' in f:
+        # Build train function (to get weight updates).
+        # Models that aren't graph networks must wait until they are called
+        # with data to _make_train_function() and so can't load optimizer
+        # weights.
+        if model._is_graph_network:  # pylint: disable=protected-access
+          model._make_train_function()
+          optimizer_weight_values = load_optimizer_weights_from_hdf5_group(f)
+          try:
+            model.optimizer.set_weights(optimizer_weight_values)
+          except ValueError:
+            logging.warning('Error in loading the saved optimizer '
+                            'state. As a result, your model is '
+                            'starting with a freshly initialized '
+                            'optimizer.')
+        else:
+          logging.warning('Sequential models without an `input_shape` '
+                          'passed to the first layer cannot reload their '
+                          'optimizer state. As a result, your model is'
+                          'starting with a freshly initialized optimizer.')
+
+  finally:
+    if opened_new_file:
+      f.close()
+  return model
+
+
+def preprocess_weights_for_loading(layer,
+                                   weights,
+                                   original_keras_version=None,
+                                   original_backend=None):
+  """Preprocess layer weights between different Keras formats.
+
+  Converts layers weights from Keras 1 format to Keras 2 and also weights of
+  CuDNN layers in Keras 2.
+
+  Arguments:
+      layer: Layer instance.
+      weights: List of weights values (Numpy arrays).
+      original_keras_version: Keras version for the weights, as a string.
+      original_backend: Keras backend the weights were trained with,
+          as a string.
+
+  Returns:
+      A list of weights values (Numpy arrays).
+  """
+  def convert_nested_bidirectional(weights):
+    """Converts layers nested in `Bidirectional` wrapper.
+
+    This function uses `preprocess_weights_for_loading()` for converting
+    layers.
+
+    Arguments:
+        weights: List of weights values (Numpy arrays).
+
+    Returns:
+        A list of weights values (Numpy arrays).
+    """
+    num_weights_per_layer = len(weights) // 2
+    forward_weights = preprocess_weights_for_loading(
+        layer.forward_layer, weights[:num_weights_per_layer],
+        original_keras_version, original_backend)
+    backward_weights = preprocess_weights_for_loading(
+        layer.backward_layer, weights[num_weights_per_layer:],
+        original_keras_version, original_backend)
+    return forward_weights + backward_weights
+
+  def convert_nested_time_distributed(weights):
+    """Converts layers nested in `TimeDistributed` wrapper.
+
+    This function uses `preprocess_weights_for_loading()` for converting nested
+    layers.
+
+    Arguments:
+        weights: List of weights values (Numpy arrays).
+
+    Returns:
+        A list of weights values (Numpy arrays).
+    """
+    return preprocess_weights_for_loading(
+        layer.layer, weights, original_keras_version, original_backend)
+
+  def convert_nested_model(weights):
+    """Converts layers nested in `Model` or `Sequential`.
+
+    This function uses `preprocess_weights_for_loading()` for converting nested
+    layers.
+
+    Arguments:
+        weights: List of weights values (Numpy arrays).
+
+    Returns:
+        A list of weights values (Numpy arrays).
+    """
+    new_weights = []
+    # trainable weights
+    for sublayer in layer.layers:
+      num_weights = len(sublayer.trainable_weights)
+      if num_weights > 0:
+        new_weights.extend(preprocess_weights_for_loading(
+            layer=sublayer,
+            weights=weights[:num_weights],
+            original_keras_version=original_keras_version,
+            original_backend=original_backend))
+        weights = weights[num_weights:]
+
+    # non-trainable weights
+    for sublayer in layer.layers:
+      num_weights = len([l for l in sublayer.weights
+                         if l not in sublayer.trainable_weights])
+      if num_weights > 0:
+        new_weights.extend(preprocess_weights_for_loading(
+            layer=sublayer,
+            weights=weights[:num_weights],
+            original_keras_version=original_keras_version,
+            original_backend=original_backend))
+        weights = weights[num_weights:]
+    return new_weights
+
+  # Convert layers nested in Bidirectional/Model/Sequential.
+  # Both transformation should be ran for both Keras 1->2 conversion
+  # and for conversion of CuDNN layers.
+  if layer.__class__.__name__ == 'Bidirectional':
+    weights = convert_nested_bidirectional(weights)
+  if layer.__class__.__name__ == 'TimeDistributed':
+    weights = convert_nested_time_distributed(weights)
+  elif layer.__class__.__name__ in ['Model', 'Sequential']:
+    weights = convert_nested_model(weights)
+
+  if original_keras_version == '1':
+    if layer.__class__.__name__ == 'TimeDistributed':
+      weights = preprocess_weights_for_loading(
+          layer.layer, weights, original_keras_version, original_backend)
+
+    if layer.__class__.__name__ == 'Conv1D':
+      shape = weights[0].shape
+      # Handle Keras 1.1 format
+      if shape[:2] != (layer.kernel_size[0], 1) or shape[3] != layer.filters:
+        # Legacy shape:
+        # (filters, input_dim, filter_length, 1)
+        assert shape[0] == layer.filters and shape[2:] == (layer.kernel_size[0],
+                                                           1)
+        weights[0] = np.transpose(weights[0], (2, 3, 1, 0))
+      weights[0] = weights[0][:, 0, :, :]
+
+    if layer.__class__.__name__ == 'Conv2D':
+      if layer.data_format == 'channels_first':
+        # old: (filters, stack_size, kernel_rows, kernel_cols)
+        # new: (kernel_rows, kernel_cols, stack_size, filters)
+        weights[0] = np.transpose(weights[0], (2, 3, 1, 0))
+
+    if layer.__class__.__name__ == 'Conv2DTranspose':
+      if layer.data_format == 'channels_last':
+        # old: (kernel_rows, kernel_cols, stack_size, filters)
+        # new: (kernel_rows, kernel_cols, filters, stack_size)
+        weights[0] = np.transpose(weights[0], (0, 1, 3, 2))
+      if layer.data_format == 'channels_first':
+        # old: (filters, stack_size, kernel_rows, kernel_cols)
+        # new: (kernel_rows, kernel_cols, filters, stack_size)
+        weights[0] = np.transpose(weights[0], (2, 3, 0, 1))
+
+    if layer.__class__.__name__ == 'Conv3D':
+      if layer.data_format == 'channels_first':
+        # old: (filters, stack_size, ...)
+        # new: (..., stack_size, filters)
+        weights[0] = np.transpose(weights[0], (2, 3, 4, 1, 0))
+
+    if layer.__class__.__name__ == 'GRU':
+      if len(weights) == 9:
+        kernel = np.concatenate([weights[0], weights[3], weights[6]], axis=-1)
+        recurrent_kernel = np.concatenate(
+            [weights[1], weights[4], weights[7]], axis=-1)
+        bias = np.concatenate([weights[2], weights[5], weights[8]], axis=-1)
+        weights = [kernel, recurrent_kernel, bias]
+
+    if layer.__class__.__name__ == 'LSTM':
+      if len(weights) == 12:
+        # old: i, c, f, o
+        # new: i, f, c, o
+        kernel = np.concatenate(
+            [weights[0], weights[6], weights[3], weights[9]], axis=-1)
+        recurrent_kernel = np.concatenate(
+            [weights[1], weights[7], weights[4], weights[10]], axis=-1)
+        bias = np.concatenate(
+            [weights[2], weights[8], weights[5], weights[11]], axis=-1)
+        weights = [kernel, recurrent_kernel, bias]
+
+    if layer.__class__.__name__ == 'ConvLSTM2D':
+      if len(weights) == 12:
+        kernel = np.concatenate(
+            [weights[0], weights[6], weights[3], weights[9]], axis=-1)
+        recurrent_kernel = np.concatenate(
+            [weights[1], weights[7], weights[4], weights[10]], axis=-1)
+        bias = np.concatenate(
+            [weights[2], weights[8], weights[5], weights[11]], axis=-1)
+        if layer.data_format == 'channels_first':
+          # old: (filters, stack_size, kernel_rows, kernel_cols)
+          # new: (kernel_rows, kernel_cols, stack_size, filters)
+          kernel = np.transpose(kernel, (2, 3, 1, 0))
+          recurrent_kernel = np.transpose(recurrent_kernel, (2, 3, 1, 0))
+        weights = [kernel, recurrent_kernel, bias]
+
+  conv_layers = ['Conv1D', 'Conv2D', 'Conv3D', 'Conv2DTranspose', 'ConvLSTM2D']
+  if layer.__class__.__name__ in conv_layers:
+    if original_backend == 'theano':
+      weights[0] = conv_utils.convert_kernel(weights[0])
+      if layer.__class__.__name__ == 'ConvLSTM2D':
+        weights[1] = conv_utils.convert_kernel(weights[1])
+    if K.int_shape(layer.weights[0]) != weights[0].shape:
+      weights[0] = np.transpose(weights[0], (3, 2, 0, 1))
+      if layer.__class__.__name__ == 'ConvLSTM2D':
+        weights[1] = np.transpose(weights[1], (3, 2, 0, 1))
+
+  # convert CuDNN layers
+  return _convert_rnn_weights(layer, weights)
+
+
+def _convert_rnn_weights(layer, weights):
+  """Converts weights for RNN layers between native and CuDNN format.
+
+  Input kernels for each gate are transposed and converted between Fortran
+  and C layout, recurrent kernels are transposed. For LSTM biases are summed/
+  split in half, for GRU biases are reshaped.
+
+  Weights can be converted in both directions between `LSTM` and`CuDNNSLTM`
+  and between `CuDNNGRU` and `GRU(reset_after=True)`. Default `GRU` is not
+  compatible with `CuDNNGRU`.
+
+  For missing biases in `LSTM`/`GRU` (`use_bias=False`) no conversion is made.
+
+  Arguments:
+      layer: Target layer instance.
+      weights: List of source weights values (input kernels, recurrent
+          kernels, [biases]) (Numpy arrays).
+
+  Returns:
+      A list of converted weights values (Numpy arrays).
+
+  Raises:
+      ValueError: for incompatible GRU layer/weights or incompatible biases
+  """
+
+  def transform_kernels(kernels, func, n_gates):
+    """Transforms kernel for each gate separately using given function.
+
+    Arguments:
+        kernels: Stacked array of kernels for individual gates.
+        func: Function applied to kernel of each gate.
+        n_gates: Number of gates (4 for LSTM, 3 for GRU).
+
+    Returns:
+        Stacked array of transformed kernels.
+    """
+    return np.hstack([func(k) for k in np.hsplit(kernels, n_gates)])
+
+  def transpose_input(from_cudnn):
+    """Makes a function that transforms input kernels from/to CuDNN format.
+
+    It keeps the shape, but changes between the layout (Fortran/C). Eg.:
+
+    ```
+    Keras                 CuDNN
+    [[0, 1, 2],  <--->  [[0, 2, 4],
+     [3, 4, 5]]          [1, 3, 5]]
+    ```
+
+    It can be passed to `transform_kernels()`.
+
+    Arguments:
+        from_cudnn: `True` if source weights are in CuDNN format, `False`
+            if they're in plain Keras format.
+
+    Returns:
+        Function that converts input kernel to the other format.
+    """
+    order = 'F' if from_cudnn else 'C'
+
+    def transform(kernel):
+      return kernel.T.reshape(kernel.shape, order=order)
+
+    return transform
+
+  target_class = layer.__class__.__name__
+
+  # convert the weights between CuDNNLSTM and LSTM
+  if target_class in ['LSTM', 'CuDNNLSTM'] and len(weights) == 3:
+    # determine if we're loading a CuDNNLSTM layer
+    # from the number of bias weights:
+    # CuDNNLSTM has (units * 8) weights; while LSTM has (units * 4)
+    # if there's no bias weight in the file, skip this conversion
+    units = weights[1].shape[0]
+    bias_shape = weights[2].shape
+    n_gates = 4
+
+    if bias_shape == (2 * units * n_gates,):
+      source = 'CuDNNLSTM'
+    elif bias_shape == (units * n_gates,):
+      source = 'LSTM'
+    else:
+      raise ValueError('Invalid bias shape: ' + str(bias_shape))
+
+    def convert_lstm_weights(weights, from_cudnn=True):
+      """Converts the weights between CuDNNLSTM and LSTM.
+
+      Arguments:
+        weights: Original weights.
+        from_cudnn: Indicates whether original weights are from CuDNN layer.
+
+      Returns:
+        Updated weights compatible with LSTM.
+      """
+
+      # Transpose (and reshape) input and recurrent kernels
+      kernels = transform_kernels(weights[0], transpose_input(from_cudnn),
+                                  n_gates)
+      recurrent_kernels = transform_kernels(weights[1], lambda k: k.T, n_gates)
+      if from_cudnn:
+        # merge input and recurrent biases into a single set
+        biases = np.sum(np.split(weights[2], 2, axis=0), axis=0)
+      else:
+        # Split single set of biases evenly to two sets. The way of
+        # splitting doesn't matter as long as the two sets sum is kept.
+        biases = np.tile(0.5 * weights[2], 2)
+      return [kernels, recurrent_kernels, biases]
+
+    if source != target_class:
+      weights = convert_lstm_weights(weights, from_cudnn=source == 'CuDNNLSTM')
+
+  # convert the weights between CuDNNGRU and GRU(reset_after=True)
+  if target_class in ['GRU', 'CuDNNGRU'] and len(weights) == 3:
+    # We can determine the source of the weights from the shape of the bias.
+    # If there is no bias we skip the conversion since
+    # CuDNNGRU always has biases.
+
+    units = weights[1].shape[0]
+    bias_shape = weights[2].shape
+    n_gates = 3
+
+    def convert_gru_weights(weights, from_cudnn=True):
+      """Converts the weights between CuDNNGRU and GRU.
+
+      Arguments:
+        weights: Original weights.
+        from_cudnn: Indicates whether original weights are from CuDNN layer.
+
+      Returns:
+        Updated weights compatible with GRU.
+      """
+
+      kernels = transform_kernels(weights[0], transpose_input(from_cudnn),
+                                  n_gates)
+      recurrent_kernels = transform_kernels(weights[1], lambda k: k.T, n_gates)
+      biases = np.array(weights[2]).reshape((2, -1) if from_cudnn else -1)
+      return [kernels, recurrent_kernels, biases]
+
+    if bias_shape == (2 * units * n_gates,):
+      source = 'CuDNNGRU'
+    elif bias_shape == (2, units * n_gates):
+      source = 'GRU(reset_after=True)'
+    elif bias_shape == (units * n_gates,):
+      source = 'GRU(reset_after=False)'
+    else:
+      raise ValueError('Invalid bias shape: ' + str(bias_shape))
+
+    if target_class == 'CuDNNGRU':
+      target = 'CuDNNGRU'
+    elif layer.reset_after:
+      target = 'GRU(reset_after=True)'
+    else:
+      target = 'GRU(reset_after=False)'
+
+    # only convert between different types
+    if source != target:
+      types = (source, target)
+      if 'GRU(reset_after=False)' in types:
+        raise ValueError('%s is not compatible with %s' % types)
+      if source == 'CuDNNGRU':
+        weights = convert_gru_weights(weights, from_cudnn=True)
+      elif source == 'GRU(reset_after=True)':
+        weights = convert_gru_weights(weights, from_cudnn=False)
+
+  return weights
+
+
+def save_optimizer_weights_to_hdf5_group(hdf5_group, optimizer):
+  """Saves optimizer weights of a optimizer to a HDF5 group.
+
+  Arguments:
+      hdf5_group: HDF5 group.
+      optimizer: optimizer instance.
+  """
+
+  symbolic_weights = getattr(optimizer, 'weights')
+  if symbolic_weights:
+    weights_group = hdf5_group.create_group('optimizer_weights')
+    weight_names = [str(w.name).encode('utf8') for w in symbolic_weights]
+    save_attributes_to_hdf5_group(weights_group, 'weight_names', weight_names)
+    weight_values = K.batch_get_value(symbolic_weights)
+    for name, val in zip(weight_names, weight_values):
+      param_dset = weights_group.create_dataset(
+          name, val.shape, dtype=val.dtype)
+      if not val.shape:
+        # scalar
+        param_dset[()] = val
+      else:
+        param_dset[:] = val
+
+
+def load_optimizer_weights_from_hdf5_group(hdf5_group):
+  """Load optimizer weights from a HDF5 group.
+
+  Arguments:
+      hdf5_group: A pointer to a HDF5 group.
+
+  Returns:
+      data: List of optimizer weight names.
+  """
+  weights_group = hdf5_group['optimizer_weights']
+  optimizer_weight_names = load_attributes_from_hdf5_group(
+      weights_group, 'weight_names')
+  return [weights_group[weight_name] for weight_name in optimizer_weight_names]
+
+
+def save_weights_to_hdf5_group(f, layers):
+  """Saves the weights of a list of layers to a HDF5 group.
+
+  Arguments:
+      f: HDF5 group.
+      layers: List of layer instances.
+  """
+  from tensorflow.python.keras import __version__ as keras_version  # pylint: disable=g-import-not-at-top
+
+  save_attributes_to_hdf5_group(
+      f, 'layer_names', [layer.name.encode('utf8') for layer in layers])
+  f.attrs['backend'] = K.backend().encode('utf8')
+  f.attrs['keras_version'] = str(keras_version).encode('utf8')
+
+  # On TPUs, modifying the graph between session.runs() triggers some expensive
+  # recompilation overhead. To avoid this, we build up the full set of tensors
+  # to save before fetching weights, thus only modifying the graph once.
+  layer_weights_dict = {}
+  for layer in layers:
+    layer_weights_dict[layer.name] = [ops.convert_to_tensor(w)
+                                      for w in layer.weights]
+
+  for layer in layers:
+    g = f.create_group(layer.name)
+    symbolic_weights = layer_weights_dict[layer.name]
+    weight_values = K.batch_get_value(symbolic_weights)
+    weight_names = []
+    for i, (w, val) in enumerate(zip(symbolic_weights, weight_values)):
+      if hasattr(w, 'name') and w.name:
+        name = str(w.name)
+      else:
+        name = 'param_' + str(i)
+      weight_names.append(name.encode('utf8'))
+    save_attributes_to_hdf5_group(g, 'weight_names', weight_names)
+    for name, val in zip(weight_names, weight_values):
+      param_dset = g.create_dataset(name, val.shape, dtype=val.dtype)
+      if not val.shape:
+        # scalar
+        param_dset[()] = val
+      else:
+        param_dset[:] = val
+
+
+def load_weights_from_hdf5_group(f, layers):
+  """Implements topological (order-based) weight loading.
+
+  Arguments:
+      f: A pointer to a HDF5 group.
+      layers: a list of target layers.
+
+  Raises:
+      ValueError: in case of mismatch between provided layers
+          and weights file.
+  """
+  if 'keras_version' in f.attrs:
+    original_keras_version = f.attrs['keras_version'].decode('utf8')
+  else:
+    original_keras_version = '1'
+  if 'backend' in f.attrs:
+    original_backend = f.attrs['backend'].decode('utf8')
+  else:
+    original_backend = None
+
+  filtered_layers = []
+  for layer in layers:
+    weights = layer.weights
+    if weights:
+      filtered_layers.append(layer)
+
+  layer_names = load_attributes_from_hdf5_group(f, 'layer_names')
+  filtered_layer_names = []
+  for name in layer_names:
+    g = f[name]
+    weight_names = load_attributes_from_hdf5_group(g, 'weight_names')
+    if weight_names:
+      filtered_layer_names.append(name)
+  layer_names = filtered_layer_names
+  if len(layer_names) != len(filtered_layers):
+    raise ValueError('You are trying to load a weight file '
+                     'containing ' + str(len(layer_names)) +
+                     ' layers into a model with ' + str(len(filtered_layers)) +
+                     ' layers.')
+
+  # We batch weight value assignments in a single backend call
+  # which provides a speedup in TensorFlow.
+  weight_value_tuples = []
+  for k, name in enumerate(layer_names):
+    g = f[name]
+    weight_names = load_attributes_from_hdf5_group(g, 'weight_names')
+    weight_values = [np.asarray(g[weight_name]) for weight_name in weight_names]
+    layer = filtered_layers[k]
+    symbolic_weights = layer.weights
+    weight_values = preprocess_weights_for_loading(
+        layer, weight_values, original_keras_version, original_backend)
+    if len(weight_values) != len(symbolic_weights):
+      raise ValueError('Layer #' + str(k) + ' (named "' + layer.name +
+                       '" in the current model) was found to '
+                       'correspond to layer ' + name + ' in the save file. '
+                       'However the new layer ' + layer.name + ' expects ' +
+                       str(len(symbolic_weights)) +
+                       ' weights, but the saved weights have ' +
+                       str(len(weight_values)) + ' elements.')
+    weight_value_tuples += zip(symbolic_weights, weight_values)
+  K.batch_set_value(weight_value_tuples)
+
+
+def load_weights_from_hdf5_group_by_name(f, layers):
+  """Implements name-based weight loading.
+
+  (instead of topological weight loading).
+
+  Layers that have no matching name are skipped.
+
+  Arguments:
+      f: A pointer to a HDF5 group.
+      layers: a list of target layers.
+
+  Raises:
+      ValueError: in case of mismatch between provided layers
+          and weights file.
+  """
+  if 'keras_version' in f.attrs:
+    original_keras_version = f.attrs['keras_version'].decode('utf8')
+  else:
+    original_keras_version = '1'
+  if 'backend' in f.attrs:
+    original_backend = f.attrs['backend'].decode('utf8')
+  else:
+    original_backend = None
+
+  # New file format.
+  layer_names = load_attributes_from_hdf5_group(f, 'layer_names')
+
+  # Reverse index of layer name to list of layers with name.
+  index = {}
+  for layer in layers:
+    if layer.name:
+      index.setdefault(layer.name, []).append(layer)
+
+  # We batch weight value assignments in a single backend call
+  # which provides a speedup in TensorFlow.
+  weight_value_tuples = []
+  for k, name in enumerate(layer_names):
+    g = f[name]
+    weight_names = load_attributes_from_hdf5_group(g, 'weight_names')
+    weight_values = [np.asarray(g[weight_name]) for weight_name in weight_names]
+
+    for layer in index.get(name, []):
+      symbolic_weights = layer.weights
+      weight_values = preprocess_weights_for_loading(
+          layer, weight_values, original_keras_version, original_backend)
+      if len(weight_values) != len(symbolic_weights):
+        raise ValueError('Layer #' + str(k) + ' (named "' + layer.name +
+                         '") expects ' + str(len(symbolic_weights)) +
+                         ' weight(s), but the saved weights' + ' have ' +
+                         str(len(weight_values)) + ' element(s).')
+      # Set values.
+      for i in range(len(weight_values)):
+        if K.int_shape(symbolic_weights[i]) != weight_values[i].shape:
+          raise ValueError('Layer #' + str(k) +' (named "' + layer.name +
+                           '"), weight ' + str(symbolic_weights[i]) +
+                           ' has shape {}'.format(K.int_shape(
+                               symbolic_weights[i])) +
+                           ', but the saved weight has shape ' +
+                           str(weight_values[i].shape) + '.')
+
+        else:
+          weight_value_tuples.append((symbolic_weights[i], weight_values[i]))
+  K.batch_set_value(weight_value_tuples)
+
+
+def save_attributes_to_hdf5_group(group, name, data):
+  """Saves attributes (data) of the specified name into the HDF5 group.
+
+  This method deals with an inherent problem of HDF5 file which is not
+  able to store data larger than HDF5_OBJECT_HEADER_LIMIT bytes.
+
+  Arguments:
+      group: A pointer to a HDF5 group.
+      name: A name of the attributes to save.
+      data: Attributes data to store.
+
+  Raises:
+    RuntimeError: If any single attribute is too large to be saved.
+  """
+  # Check that no item in `data` is larger than `HDF5_OBJECT_HEADER_LIMIT`
+  # because in that case even chunking the array would not make the saving
+  # possible.
+  bad_attributes = [x for x in data if len(x) > HDF5_OBJECT_HEADER_LIMIT]
+
+  # Expecting this to never be true.
+  if bad_attributes:
+    raise RuntimeError('The following attributes cannot be saved to HDF5 '
+                       'file because they are larger than %d bytes: %s' %
+                       (HDF5_OBJECT_HEADER_LIMIT,
+                        ', '.join([x for x in bad_attributes])))
+
+  data_npy = np.asarray(data)
+
+  num_chunks = 1
+  chunked_data = np.array_split(data_npy, num_chunks)
+
+  # This will never loop forever thanks to the test above.
+  while any(x.nbytes > HDF5_OBJECT_HEADER_LIMIT for x in chunked_data):
+    num_chunks += 1
+    chunked_data = np.array_split(data_npy, num_chunks)
+
+  if num_chunks > 1:
+    for chunk_id, chunk_data in enumerate(chunked_data):
+      group.attrs['%s%d' % (name, chunk_id)] = chunk_data
+  else:
+    group.attrs[name] = data
+
+
+def load_attributes_from_hdf5_group(group, name):
+  """Loads attributes of the specified name from the HDF5 group.
+
+  This method deals with an inherent problem
+  of HDF5 file which is not able to store
+  data larger than HDF5_OBJECT_HEADER_LIMIT bytes.
+
+  Arguments:
+      group: A pointer to a HDF5 group.
+      name: A name of the attributes to load.
+
+  Returns:
+      data: Attributes data.
+  """
+  if name in group.attrs:
+    data = [n.decode('utf8') for n in group.attrs[name]]
+  else:
+    data = []
+    chunk_id = 0
+    while '%s%d' % (name, chunk_id) in group.attrs:
+      data.extend(
+          [n.decode('utf8') for n in group.attrs['%s%d' % (name, chunk_id)]])
+      chunk_id += 1
+  return data
diff --git a/tensorflow/python/keras/engine/saving_test.py b/tensorflow/python/keras/saving/hdf5_format_test.py
similarity index 96%
rename from tensorflow/python/keras/engine/saving_test.py
rename to tensorflow/python/keras/saving/hdf5_format_test.py
index bc33a3ea7f3ef38e9f94854043fe7bdc7a9bfe46..534c78da1756d17d11dbac8cbc47d6c708afb605 100644
--- a/tensorflow/python/keras/engine/saving_test.py
+++ b/tensorflow/python/keras/saving/hdf5_format_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #,============================================================================
-"""Tests for model saving."""
+"""Tests for model saving in the HDF5 format."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -30,8 +30,9 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
-from tensorflow.python.keras.engine import saving
+from tensorflow.python.keras import optimizers
 from tensorflow.python.keras.engine import training
+from tensorflow.python.keras.saving import hdf5_format
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
@@ -39,7 +40,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_management
 from tensorflow.python.training import training as training_module
-from tensorflow.python.training.checkpointable import util as checkpointable
+from tensorflow.python.training.tracking import util as trackable
 
 try:
   import h5py  # pylint:disable=g-import-not-at-top
@@ -173,17 +174,17 @@ class TestWeightSavingAndLoading(test.TestCase, parameterized.TestCase):
     ]
     for layer, weights, input_shape in cases:
       layer.build(input_shape)
-      _ = keras.engine.saving.preprocess_weights_for_loading(
+      _ = hdf5_format.preprocess_weights_for_loading(
           layer, weights, original_keras_version='1')
 
     model = keras.models.Sequential([keras.layers.Dense(2, input_dim=2)])
-    _ = keras.engine.saving.preprocess_weights_for_loading(
+    _ = hdf5_format.preprocess_weights_for_loading(
         model, model.weights, original_keras_version='1')
 
     x = keras.Input((2,))
     y = keras.layers.Dense(2)(x)
     model = keras.models.Model(x, y)
-    _ = keras.engine.saving.preprocess_weights_for_loading(
+    _ = hdf5_format.preprocess_weights_for_loading(
         model, model.weights, original_keras_version='1')
 
   @parameterized.named_parameters(
@@ -214,7 +215,7 @@ class TestWeightSavingAndLoading(test.TestCase, parameterized.TestCase):
       layer = layer_class(**layer_args)
       layer.build(input_shape=layer_args.get('input_shape'))
       weights1 = layer.get_weights()
-      weights2 = keras.engine.saving.preprocess_weights_for_loading(
+      weights2 = hdf5_format.preprocess_weights_for_loading(
           layer, weights1)
       _ = [
           self.assertAllClose(x, y, rtol=1e-05)
@@ -273,7 +274,7 @@ class TestWeightSavingAndLoading(test.TestCase, parameterized.TestCase):
                         metrics=[keras.metrics.categorical_accuracy])
 
       f_ref_model = h5py.File(h5_path, 'w')
-      saving.save_weights_to_hdf5_group(f_ref_model, ref_model.layers)
+      hdf5_format.save_weights_to_hdf5_group(f_ref_model, ref_model.layers)
 
       f_model = h5py.File(h5_path, 'r')
       model = keras.models.Sequential()
@@ -287,7 +288,7 @@ class TestWeightSavingAndLoading(test.TestCase, parameterized.TestCase):
                                  r'Layer #0 \(named \"d1\"\) expects 1 '
                                  r'weight\(s\), but the saved weights have 2 '
                                  r'element\(s\)\.'):
-      saving.load_weights_from_hdf5_group_by_name(f_model, model.layers)
+      hdf5_format.load_weights_from_hdf5_group_by_name(f_model, model.layers)
 
   @test_util.run_deprecated_v1
   def test_sequential_weight_loading_group_name_with_incorrect_shape(self):
@@ -311,7 +312,7 @@ class TestWeightSavingAndLoading(test.TestCase, parameterized.TestCase):
                         metrics=[keras.metrics.categorical_accuracy])
 
       f_ref_model = h5py.File(h5_path, 'w')
-      saving.save_weights_to_hdf5_group(f_ref_model, ref_model.layers)
+      hdf5_format.save_weights_to_hdf5_group(f_ref_model, ref_model.layers)
 
       f_model = h5py.File(h5_path, 'r')
       model = keras.models.Sequential()
@@ -327,12 +328,12 @@ class TestWeightSavingAndLoading(test.TestCase, parameterized.TestCase):
                                    r'shape=\(3, 10\) dtype=float32> has '
                                    r'shape \(3, 10\), but the saved weight has '
                                    r'shape \(3, 5\)\.'):
-        saving.load_weights_from_hdf5_group_by_name(f_model, model.layers)
+        hdf5_format.load_weights_from_hdf5_group_by_name(f_model, model.layers)
 
 
 class TestWholeModelSaving(test.TestCase):
 
-  @test_util.run_v1_only('b/120545219')
+  @test_util.run_v1_only('b/120994067')
   def test_sequential_model_saving(self):
     if h5py is None:
       self.skipTest('h5py required to run this test')
@@ -347,13 +348,16 @@ class TestWholeModelSaving(test.TestCase):
           optimizer=keras.optimizers.RMSprop(lr=0.0001),
           metrics=[
               keras.metrics.categorical_accuracy,
-              keras.metrics.CategoricalAccuracy()
+              keras.metrics.CategoricalCrossentropy(
+                  name='cce', label_smoothing=constant_op.constant(0.2)),
           ],
           weighted_metrics=[
-              keras.metrics.categorical_accuracy,
-              keras.metrics.CategoricalAccuracy()
+              keras.metrics.categorical_crossentropy,
+              keras.metrics.CategoricalCrossentropy(
+                  name='cce', label_smoothing=constant_op.constant(0.2)),
           ],
           sample_weight_mode='temporal')
+
       x = np.random.random((1, 3))
       y = np.random.random((1, 3, 3))
       model.train_on_batch(x, y)
@@ -383,7 +387,10 @@ class TestWholeModelSaving(test.TestCase):
 
       out = model.predict(x)
       out2 = new_model.predict(x)
-      self.assertAllClose(out, out2, atol=1e-05)
+
+      # TODO(b/120930751) This tolerance should be 1e-05,
+      # very concerning that its not.
+      self.assertAllClose(out, out2, atol=1e-03)
 
   @test_util.run_deprecated_v1
   def test_sequential_model_saving_without_input_shape(self):
@@ -635,7 +642,6 @@ class TestWholeModelSaving(test.TestCase):
       os.close(fd)
       os.remove(fname)
 
-  @test_util.run_v1_only('b/120545219')
   def test_saving_model_with_long_weights_names(self):
     if h5py is None:
       self.skipTest('h5py required to run this test')
@@ -756,14 +762,13 @@ class SubclassedModel(training.Model):
 
 class TestWeightSavingAndLoadingTFFormat(test.TestCase):
 
-  @test_util.run_v1_only('b/120545219')
   def test_keras_optimizer_warning(self):
     graph = ops.Graph()
     with graph.as_default(), self.session(graph):
       model = keras.models.Sequential()
       model.add(keras.layers.Dense(2, input_shape=(3,)))
       model.add(keras.layers.Dense(3))
-      model.compile(loss='mse', optimizer='adam', metrics=['acc'])
+      model.compile(loss='mse', optimizer=optimizers.Adam(), metrics=['acc'])
       model._make_train_function()
       temp_dir = self.get_temp_dir()
       prefix = os.path.join(temp_dir, 'ckpt')
@@ -989,7 +994,7 @@ class TestWeightSavingAndLoadingTFFormat(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
   def test_incompatible_checkpoint(self):
-    save_path = checkpointable.Checkpoint().save(
+    save_path = trackable.Checkpoint().save(
         os.path.join(self.get_temp_dir(), 'ckpt'))
     m = keras.Model()
     with self.assertRaisesRegexp(AssertionError, 'Nothing to load'):
diff --git a/tensorflow/python/keras/saving/model_config.py b/tensorflow/python/keras/saving/model_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f59ecd7df53d794abf9db0dee15f410f4453951
--- /dev/null
+++ b/tensorflow/python/keras/saving/model_config.py
@@ -0,0 +1,96 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=protected-access
+"""Functions that save the model's config into different formats.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import json
+
+from tensorflow.python.util.tf_export import keras_export
+
+# pylint: disable=g-import-not-at-top
+try:
+  import yaml
+except ImportError:
+  yaml = None
+# pylint: enable=g-import-not-at-top
+
+
+@keras_export('keras.models.model_from_config')
+def model_from_config(config, custom_objects=None):
+  """Instantiates a Keras model from its config.
+
+  Arguments:
+      config: Configuration dictionary.
+      custom_objects: Optional dictionary mapping names
+          (strings) to custom classes or functions to be
+          considered during deserialization.
+
+  Returns:
+      A Keras model instance (uncompiled).
+
+  Raises:
+      TypeError: if `config` is not a dictionary.
+  """
+  if isinstance(config, list):
+    raise TypeError('`model_from_config` expects a dictionary, not a list. '
+                    'Maybe you meant to use '
+                    '`Sequential.from_config(config)`?')
+  from tensorflow.python.keras.layers import deserialize  # pylint: disable=g-import-not-at-top
+  return deserialize(config, custom_objects=custom_objects)
+
+
+@keras_export('keras.models.model_from_yaml')
+def model_from_yaml(yaml_string, custom_objects=None):
+  """Parses a yaml model configuration file and returns a model instance.
+
+  Arguments:
+      yaml_string: YAML string encoding a model configuration.
+      custom_objects: Optional dictionary mapping names
+          (strings) to custom classes or functions to be
+          considered during deserialization.
+
+  Returns:
+      A Keras model instance (uncompiled).
+
+  Raises:
+      ImportError: if yaml module is not found.
+  """
+  if yaml is None:
+    raise ImportError('Requires yaml module installed (`pip install pyyaml`).')
+  config = yaml.load(yaml_string)
+  from tensorflow.python.keras.layers import deserialize  # pylint: disable=g-import-not-at-top
+  return deserialize(config, custom_objects=custom_objects)
+
+
+@keras_export('keras.models.model_from_json')
+def model_from_json(json_string, custom_objects=None):
+  """Parses a JSON model configuration file and returns a model instance.
+
+  Arguments:
+      json_string: JSON string encoding a model configuration.
+      custom_objects: Optional dictionary mapping names
+          (strings) to custom classes or functions to be
+          considered during deserialization.
+
+  Returns:
+      A Keras model instance (uncompiled).
+  """
+  config = json.loads(json_string)
+  from tensorflow.python.keras.layers import deserialize  # pylint: disable=g-import-not-at-top
+  return deserialize(config, custom_objects=custom_objects)
diff --git a/tensorflow/python/keras/saving/saved_model.py b/tensorflow/python/keras/saving/saved_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..ffaf02be7d992e1a126d9e131efe317acee5cd10
--- /dev/null
+++ b/tensorflow/python/keras/saving/saved_model.py
@@ -0,0 +1,401 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=protected-access
+"""Utility functions to save/load keras Model to/from SavedModel."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import six
+
+from tensorflow.python.client import session
+from tensorflow.python.framework import ops
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import optimizers
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
+from tensorflow.python.keras.saving import model_from_json
+from tensorflow.python.keras.saving import saving_utils
+from tensorflow.python.keras.utils import mode_keys
+from tensorflow.python.lib.io import file_io
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.saved_model import builder as saved_model_builder
+from tensorflow.python.saved_model import constants
+from tensorflow.python.saved_model import model_utils
+from tensorflow.python.saved_model import save as save_lib
+from tensorflow.python.saved_model import utils_impl as saved_model_utils
+from tensorflow.python.training import saver as saver_lib
+from tensorflow.python.training.tracking import graph_view
+from tensorflow.python.util import compat
+from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import keras_export
+
+
+@keras_export('keras.experimental.export_saved_model')
+def export_saved_model(model,
+                       saved_model_path,
+                       custom_objects=None,
+                       as_text=False,
+                       input_signature=None,
+                       serving_only=False):
+  """Exports a `tf.keras.Model` as a Tensorflow SavedModel.
+
+  Note that at this time, subclassed models can only be saved using
+  `serving_only=True`.
+
+  The exported `SavedModel` is a standalone serialization of Tensorflow objects,
+  and is supported by TF language APIs and the Tensorflow Serving system.
+  To load the model, use the function
+  `tf.keras.experimental.load_from_saved_model`.
+
+  The `SavedModel` contains:
+
+  1. a checkpoint containing the model weights.
+  2. a `SavedModel` proto containing the Tensorflow backend graph. Separate
+     graphs are saved for prediction (serving), train, and evaluation. If
+     the model has not been compiled, then only the graph computing predictions
+     will be exported.
+  3. the model's json config. If the model is subclassed, this will only be
+     included if the model's `get_config()` method is overwritten.
+
+  Example:
+
+  ```python
+  import tensorflow as tf
+
+  # Create a tf.keras model.
+  model = tf.keras.Sequential()
+  model.add(tf.keras.layers.Dense(1, input_shape=[10]))
+  model.summary()
+
+  # Save the tf.keras model in the SavedModel format.
+  path = '/tmp/simple_keras_model'
+  tf.keras.experimental.export_saved_model(model, path)
+
+  # Load the saved keras model back.
+  new_model = tf.keras.experimental.load_from_saved_model(path)
+  new_model.summary()
+  ```
+
+  Args:
+    model: A `tf.keras.Model` to be saved. If the model is subclassed, the flag
+      `serving_only` must be set to True.
+    saved_model_path: a string specifying the path to the SavedModel directory.
+    custom_objects: Optional dictionary mapping string names to custom classes
+      or functions (e.g. custom loss functions).
+    as_text: bool, `False` by default. Whether to write the `SavedModel` proto
+      in text format. Currently unavailable in serving-only mode.
+    input_signature: A possibly nested sequence of `tf.TensorSpec` objects, used
+      to specify the expected model inputs. See `tf.function` for more details.
+    serving_only: bool, `False` by default. When this is true, only the
+      prediction graph is saved.
+
+  Raises:
+    NotImplementedError: If the model is a subclassed model, and serving_only is
+      False.
+    ValueError: If the input signature cannot be inferred from the model.
+    AssertionError: If the SavedModel directory already exists and isn't empty.
+  """
+  if serving_only:
+    save_lib.save(
+        model,
+        saved_model_path,
+        signatures=saving_utils.trace_model_call(model, input_signature))
+  else:
+    _save_v1_format(model, saved_model_path, custom_objects, as_text,
+                    input_signature)
+
+  try:
+    _export_model_json(model, saved_model_path)
+  except NotImplementedError:
+    logging.warning('Skipped saving model JSON, subclassed model does not have '
+                    'get_config() defined.')
+
+
+def _export_model_json(model, saved_model_path):
+  """Saves model configuration as a json string under assets folder."""
+  model_json = model.to_json()
+  model_json_filepath = os.path.join(
+      saved_model_utils.get_or_create_assets_dir(saved_model_path),
+      compat.as_text(constants.SAVED_MODEL_FILENAME_JSON))
+  file_io.write_string_to_file(model_json_filepath, model_json)
+
+
+def _export_model_variables(model, saved_model_path):
+  """Saves model weights in checkpoint format under variables folder."""
+  saved_model_utils.get_or_create_variables_dir(saved_model_path)
+  checkpoint_prefix = saved_model_utils.get_variables_path(saved_model_path)
+  model.save_weights(checkpoint_prefix, save_format='tf', overwrite=True)
+  return checkpoint_prefix
+
+
+def _save_v1_format(model, path, custom_objects, as_text, input_signature):
+  """Exports model to v1 SavedModel format."""
+  from tensorflow.python.keras.engine import sequential  # pylint: disable=g-import-not-at-top
+
+  if not model._is_graph_network:
+    if isinstance(model, sequential.Sequential):
+      # If input shape is not directly set in the model, the exported model
+      # will infer the expected shapes of the input from the model.
+      if not model.built and input_signature is None:
+        raise ValueError(
+            'Sequential model\'s input shape is unknown. Please build the '
+            'model, or use the input_signature argument to specify the '
+            'model inputs.')
+    else:
+      raise NotImplementedError(
+          'Subclassed models can only be exported for serving. Please set '
+          'argument serving_only=True.')
+
+  builder = saved_model_builder._SavedModelBuilder(path)
+
+  # Manually save variables to export them in an object-based checkpoint. This
+  # skips the `builder.add_meta_graph_and_variables()` step, which saves a
+  # named-based checkpoint.
+  # TODO(b/113134168): Add fn to Builder to save with object-based saver.
+  # TODO(b/113178242): This should only export the model json structure. Only
+  # one save is needed once the weights can be copied from the model to clone.
+  checkpoint_path = _export_model_variables(model, path)
+
+  # Export each mode. Use ModeKeys enums defined for `Estimator` to ensure that
+  # Keras models and `Estimator`s are exported with the same format.
+  # Every time a mode is exported, the code checks to see if new variables have
+  # been created (e.g. optimizer slot variables). If that is the case, the
+  # checkpoint is re-saved to include the new variables.
+  export_args = {'builder': builder,
+                 'model': model,
+                 'custom_objects': custom_objects,
+                 'checkpoint_path': checkpoint_path,
+                 'input_signature': input_signature}
+
+  has_saved_vars = False
+  if model.optimizer:
+    if isinstance(model.optimizer, (optimizers.TFOptimizer,
+                                    optimizer_v2.OptimizerV2)):
+      _export_mode(mode_keys.ModeKeys.TRAIN, has_saved_vars, **export_args)
+      has_saved_vars = True
+      _export_mode(mode_keys.ModeKeys.TEST, has_saved_vars, **export_args)
+    else:
+      logging.warning(
+          'Model was compiled with an optimizer, but the optimizer is not from '
+          '`tf.train` (e.g. `tf.train.AdagradOptimizer`). Only the serving '
+          'graph was exported. The train and evaluate graphs were not added to '
+          'the SavedModel.')
+  _export_mode(mode_keys.ModeKeys.PREDICT, has_saved_vars, **export_args)
+
+  builder.save(as_text)
+
+
+def _get_var_list(model):
+  """Returns list of all checkpointed saveable objects in the model."""
+  var_list, _, _ = graph_view.ObjectGraphView(model).serialize_object_graph()
+  return var_list
+
+
+def create_placeholder(spec):
+  return K.placeholder(shape=spec.shape, dtype=spec.dtype, name=spec.name)
+
+
+def _export_mode(
+    mode, has_saved_vars, builder, model, custom_objects, checkpoint_path,
+    input_signature):
+  """Exports a model, and optionally saves new vars from the clone model.
+
+  Args:
+    mode: A `tf.estimator.ModeKeys` string.
+    has_saved_vars: A `boolean` indicating whether the SavedModel has already
+      exported variables.
+    builder: A `SavedModelBuilder` object.
+    model: A `tf.keras.Model` object.
+    custom_objects: A dictionary mapping string names to custom classes
+      or functions.
+    checkpoint_path: String path to checkpoint.
+    input_signature: Nested TensorSpec containing the expected inputs. Can be
+      `None`, in which case the signature will be inferred from the model.
+
+  Raises:
+    ValueError: If the train/eval mode is being exported, but the model does
+      not have an optimizer.
+  """
+  from tensorflow.python.keras import models as models_lib  # pylint: disable=g-import-not-at-top
+  compile_clone = (mode != mode_keys.ModeKeys.PREDICT)
+  if compile_clone and not model.optimizer:
+    raise ValueError(
+        'Model does not have an optimizer. Cannot export mode %s' % mode)
+
+  model_graph = ops.get_default_graph()
+  with ops.Graph().as_default() as g, K.learning_phase_scope(
+      mode == mode_keys.ModeKeys.TRAIN):
+
+    if input_signature is None:
+      input_tensors = None
+    else:
+      input_tensors = nest.map_structure(create_placeholder, input_signature)
+
+    # Clone the model into blank graph. This will create placeholders for inputs
+    # and targets.
+    clone = models_lib.clone_and_build_model(
+        model, input_tensors=input_tensors, custom_objects=custom_objects,
+        compile_clone=compile_clone)
+
+    # Make sure that iterations variable is added to the global step collection,
+    # to ensure that, when the SavedModel graph is loaded, the iterations
+    # variable is returned by `tf.train.get_global_step()`. This is required for
+    # compatibility with the SavedModelEstimator.
+    if compile_clone:
+      g.add_to_collection(ops.GraphKeys.GLOBAL_STEP, clone.optimizer.iterations)
+
+    # Extract update and train ops from train/test/predict functions.
+    train_op = None
+    if mode == mode_keys.ModeKeys.TRAIN:
+      clone._make_train_function()
+      train_op = clone.train_function.updates_op
+    elif mode == mode_keys.ModeKeys.TEST:
+      clone._make_test_function()
+    else:
+      clone._make_predict_function()
+    g.get_collection_ref(ops.GraphKeys.UPDATE_OPS).extend(clone.state_updates)
+
+    with session.Session().as_default():
+      clone_var_list = _get_var_list(clone)
+      if has_saved_vars:
+        # Confirm all variables in the clone have an entry in the checkpoint.
+        status = clone.load_weights(checkpoint_path)
+        status.assert_existing_objects_matched()
+      else:
+        # Confirm that variables between the clone and model match up exactly,
+        # not counting optimizer objects. Optimizer objects are ignored because
+        # if the model has not trained, the slot variables will not have been
+        # created yet.
+        # TODO(b/113179535): Replace with trackable equivalence.
+        _assert_same_non_optimizer_objects(model, model_graph, clone, g)
+
+        # TODO(b/113178242): Use value transfer for trackable objects.
+        clone.load_weights(checkpoint_path)
+
+        # Add graph and variables to SavedModel.
+        # TODO(b/113134168): Switch to add_meta_graph_and_variables.
+        clone.save_weights(checkpoint_path, save_format='tf', overwrite=True)
+        builder._has_saved_variables = True
+
+      # Add graph to the SavedModel builder.
+      builder.add_meta_graph(
+          model_utils.EXPORT_TAG_MAP[mode],
+          signature_def_map=_create_signature_def_map(clone, mode),
+          saver=saver_lib.Saver(clone_var_list),
+          init_op=variables.local_variables_initializer(),
+          train_op=train_op)
+    return None
+
+
+def _create_signature_def_map(model, mode):
+  """Creates a SignatureDef map from a Keras model."""
+  inputs_dict = {name: x for name, x in zip(model.input_names, model.inputs)}
+  if model.optimizer:
+    targets_dict = {x.name.split(':')[0]: x
+                    for x in model.targets if x is not None}
+    inputs_dict.update(targets_dict)
+  outputs_dict = {name: x
+                  for name, x in zip(model.output_names, model.outputs)}
+  metrics = saving_utils.extract_model_metrics(model)
+
+  # Add metric variables to the `LOCAL_VARIABLES` collection. Metric variables
+  # are by default not added to any collections. We are doing this here, so
+  # that metric variables get initialized.
+  local_vars = set(ops.get_collection(ops.GraphKeys.LOCAL_VARIABLES))
+  vars_to_add = set()
+  if metrics is not None:
+    from tensorflow.python.keras.metrics import Metric  # pylint: disable=g-import-not-at-top
+    for key, value in six.iteritems(metrics):
+      if isinstance(value, Metric):
+        vars_to_add.update(value.variables)
+        # Convert Metric instances to (value_tensor, update_op) tuple.
+        metrics[key] = (value.result(), value.updates[0])
+  # Remove variables that are in the local variables collection already.
+  vars_to_add = vars_to_add.difference(local_vars)
+  for v in vars_to_add:
+    ops.add_to_collection(ops.GraphKeys.LOCAL_VARIABLES, v)
+
+  export_outputs = model_utils.export_outputs_for_mode(
+      mode,
+      predictions=outputs_dict,
+      loss=model.total_loss if model.optimizer else None,
+      metrics=metrics)
+  return model_utils.build_all_signature_defs(
+      inputs_dict,
+      export_outputs=export_outputs,
+      serving_only=(mode == mode_keys.ModeKeys.PREDICT))
+
+
+def _assert_same_non_optimizer_objects(model, model_graph, clone, clone_graph):  # pylint: disable=unused-argument
+  """Asserts model and clone contain the same trackable objects."""
+
+  # TODO(fchollet, kathywu): make sure this works in eager mode.
+  return True
+
+
+@keras_export('keras.experimental.load_from_saved_model')
+def load_from_saved_model(saved_model_path, custom_objects=None):
+  """Loads a keras Model from a SavedModel created by `export_saved_model()`.
+
+  This function reinstantiates model state by:
+  1) loading model topology from json (this will eventually come
+     from metagraph).
+  2) loading model weights from checkpoint.
+
+  Example:
+
+  ```python
+  import tensorflow as tf
+
+  # Create a tf.keras model.
+  model = tf.keras.Sequential()
+  model.add(tf.keras.layers.Dense(1, input_shape=[10]))
+  model.summary()
+
+  # Save the tf.keras model in the SavedModel format.
+  path = '/tmp/simple_keras_model'
+  tf.keras.experimental.export_saved_model(model, path)
+
+  # Load the saved keras model back.
+  new_model = tf.keras.experimental.load_from_saved_model(path)
+  new_model.summary()
+  ```
+
+  Args:
+    saved_model_path: a string specifying the path to an existing SavedModel.
+    custom_objects: Optional dictionary mapping names
+        (strings) to custom classes or functions to be
+        considered during deserialization.
+
+  Returns:
+    a keras.Model instance.
+  """
+  # restore model topology from json string
+  model_json_filepath = os.path.join(
+      compat.as_bytes(saved_model_path),
+      compat.as_bytes(constants.ASSETS_DIRECTORY),
+      compat.as_bytes(constants.SAVED_MODEL_FILENAME_JSON))
+  model_json = file_io.read_file_to_string(model_json_filepath)
+  model = model_from_json(model_json, custom_objects=custom_objects)
+
+  # restore model weights
+  checkpoint_prefix = os.path.join(
+      compat.as_text(saved_model_path),
+      compat.as_text(constants.VARIABLES_DIRECTORY),
+      compat.as_text(constants.VARIABLES_FILENAME))
+  model.load_weights(checkpoint_prefix)
+  return model
diff --git a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py b/tensorflow/python/keras/saving/saved_model_test.py
similarity index 62%
rename from tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py
rename to tensorflow/python/keras/saving/saved_model_test.py
index 93d73e1b484ed810fb347b13e95022dfca3584c2..50ddf1f24c7c3360702ea4d9222b2661886dffff 100644
--- a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py
+++ b/tensorflow/python/keras/saving/saved_model_test.py
@@ -24,19 +24,23 @@ import shutil
 from absl.testing import parameterized
 import numpy as np
 
-from tensorflow.contrib.saved_model.python.saved_model import keras_saved_model
 from tensorflow.python import keras
+from tensorflow.python import tf2
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
-from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras.engine import training
+from tensorflow.python.keras.optimizer_v2 import adadelta
+from tensorflow.python.keras.saving import saved_model as keras_saved_model
+from tensorflow.python.keras.utils import mode_keys
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 from tensorflow.python.saved_model import loader_impl
-from tensorflow.python.saved_model import signature_constants
+from tensorflow.python.saved_model import model_utils
 from tensorflow.python.training import training as training_module
 
 
@@ -64,10 +68,10 @@ class TestModelSavingandLoading(test.TestCase):
 
       ref_y = model.predict(x)
 
-      temp_saved_model = self._save_model_dir()
-      output_path = keras_saved_model.save_keras_model(model, temp_saved_model)
+      saved_model_dir = self._save_model_dir()
+      keras_saved_model.export_saved_model(model, saved_model_dir)
 
-      loaded_model = keras_saved_model.load_keras_model(output_path)
+      loaded_model = keras_saved_model.load_from_saved_model(saved_model_dir)
       y = loaded_model.predict(x)
       self.assertAllClose(ref_y, y, atol=1e-05)
 
@@ -82,9 +86,9 @@ class TestModelSavingandLoading(test.TestCase):
       x = np.random.random((1, 3))
       ref_y = model.predict(x)
 
-      temp_saved_model = self._save_model_dir()
-      output_path = keras_saved_model.save_keras_model(model, temp_saved_model)
-      loaded_model = keras_saved_model.load_keras_model(output_path)
+      saved_model_dir = self._save_model_dir()
+      keras_saved_model.export_saved_model(model, saved_model_dir)
+      loaded_model = keras_saved_model.load_from_saved_model(saved_model_dir)
 
       y = loaded_model.predict(x)
       self.assertAllClose(ref_y, y, atol=1e-05)
@@ -106,9 +110,9 @@ class TestModelSavingandLoading(test.TestCase):
 
       ref_y = model.predict(x)
 
-      temp_saved_model = self._save_model_dir()
-      output_path = keras_saved_model.save_keras_model(model, temp_saved_model)
-      loaded_model = keras_saved_model.load_keras_model(output_path)
+      saved_model_dir = self._save_model_dir()
+      keras_saved_model.export_saved_model(model, saved_model_dir)
+      loaded_model = keras_saved_model.load_from_saved_model(saved_model_dir)
 
       y = loaded_model.predict(x)
       self.assertAllClose(ref_y, y, atol=1e-05)
@@ -127,58 +131,56 @@ class TestModelSavingandLoading(test.TestCase):
 
       ref_y = model.predict(x)
 
-      temp_saved_model = self._save_model_dir()
-      output_path = keras_saved_model.save_keras_model(model, temp_saved_model)
-      loaded_model = keras_saved_model.load_keras_model(output_path)
+      saved_model_dir = self._save_model_dir()
+      keras_saved_model.export_saved_model(model, saved_model_dir)
+      loaded_model = keras_saved_model.load_from_saved_model(saved_model_dir)
 
       y = loaded_model.predict(x)
       self.assertAllClose(ref_y, y, atol=1e-05)
 
   @test_util.run_in_graph_and_eager_modes
   def test_saving_with_tf_optimizer(self):
-    with self.cached_session():
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(2, input_shape=(3,)))
-      model.add(keras.layers.Dense(3))
-      model.compile(
-          loss='mse',
-          optimizer=training_module.RMSPropOptimizer(0.1),
-          metrics=['acc'])
-
-      x = np.random.random((1, 3))
-      y = np.random.random((1, 3))
-      model.train_on_batch(x, y)
-      ref_y = model.predict(x)
-
-      temp_saved_model = self._save_model_dir()
-      output_path = keras_saved_model.save_keras_model(model, temp_saved_model)
-      loaded_model = keras_saved_model.load_keras_model(output_path)
-      loaded_model.compile(
-          loss='mse',
-          optimizer=training_module.RMSPropOptimizer(0.1),
-          metrics=['acc'])
-      y = loaded_model.predict(x)
-      self.assertAllClose(ref_y, y, atol=1e-05)
-
-      # test that new updates are the same with both models
-      x = np.random.random((1, 3))
-      y = np.random.random((1, 3))
-
-      ref_loss = model.train_on_batch(x, y)
-      loss = loaded_model.train_on_batch(x, y)
-      self.assertAllClose(ref_loss, loss, atol=1e-05)
-
-      ref_y = model.predict(x)
-      y = loaded_model.predict(x)
-      self.assertAllClose(ref_y, y, atol=1e-05)
-
-      # test saving/loading again
-      temp_saved_model2 = self._save_model_dir('saved_model_2')
-      output_path2 = keras_saved_model.save_keras_model(
-          loaded_model, temp_saved_model2)
-      loaded_model = keras_saved_model.load_keras_model(output_path2)
-      y = loaded_model.predict(x)
-      self.assertAllClose(ref_y, y, atol=1e-05)
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(2, input_shape=(3,)))
+    model.add(keras.layers.Dense(3))
+    model.compile(
+        loss='mse',
+        optimizer=training_module.RMSPropOptimizer(0.1),
+        metrics=['acc'])
+
+    x = np.random.random((1, 3))
+    y = np.random.random((1, 3))
+    model.train_on_batch(x, y)
+    ref_y = model.predict(x)
+
+    saved_model_dir = self._save_model_dir()
+    keras_saved_model.export_saved_model(model, saved_model_dir)
+    loaded_model = keras_saved_model.load_from_saved_model(saved_model_dir)
+    loaded_model.compile(
+        loss='mse',
+        optimizer=training_module.RMSPropOptimizer(0.1),
+        metrics=['acc'])
+    y = loaded_model.predict(x)
+    self.assertAllClose(ref_y, y, atol=1e-05)
+
+    # test that new updates are the same with both models
+    x = np.random.random((1, 3))
+    y = np.random.random((1, 3))
+
+    ref_loss = model.train_on_batch(x, y)
+    loss = loaded_model.train_on_batch(x, y)
+    self.assertAllClose(ref_loss, loss, atol=1e-05)
+
+    ref_y = model.predict(x)
+    y = loaded_model.predict(x)
+    self.assertAllClose(ref_y, y, atol=1e-05)
+
+    # test saving/loading again
+    saved_model_dir2 = self._save_model_dir('saved_model_2')
+    keras_saved_model.export_saved_model(loaded_model, saved_model_dir2)
+    loaded_model = keras_saved_model.load_from_saved_model(saved_model_dir2)
+    y = loaded_model.predict(x)
+    self.assertAllClose(ref_y, y, atol=1e-05)
 
   def test_saving_subclassed_model_raise_error(self):
     # For now, saving subclassed model should raise an error. It should be
@@ -196,9 +198,9 @@ class TestModelSavingandLoading(test.TestCase):
 
     model = SubclassedModel()
 
-    temp_saved_model = self._save_model_dir()
+    saved_model_dir = self._save_model_dir()
     with self.assertRaises(NotImplementedError):
-      keras_saved_model.save_keras_model(model, temp_saved_model)
+      keras_saved_model.export_saved_model(model, saved_model_dir)
 
 
 class LayerWithLearningPhase(keras.engine.base_layer.Layer):
@@ -215,7 +217,7 @@ class LayerWithLearningPhase(keras.engine.base_layer.Layer):
     return input_shape
 
 
-def functional_model(uses_learning_phase):
+def functional_model(uses_learning_phase=True):
   inputs = keras.layers.Input(shape=(3,))
   x = keras.layers.Dense(2)(inputs)
   x = keras.layers.Dense(3)(x)
@@ -224,7 +226,7 @@ def functional_model(uses_learning_phase):
   return keras.models.Model(inputs, x)
 
 
-def sequential_model(uses_learning_phase):
+def sequential_model(uses_learning_phase=True):
   model = keras.models.Sequential()
   model.add(keras.layers.Dense(2, input_shape=(3,)))
   model.add(keras.layers.Dense(3))
@@ -233,7 +235,7 @@ def sequential_model(uses_learning_phase):
   return model
 
 
-def sequential_model_without_input_shape(uses_learning_phase):
+def sequential_model_without_input_shape(uses_learning_phase=True):
   model = keras.models.Sequential()
   model.add(keras.layers.Dense(2))
   model.add(keras.layers.Dense(3))
@@ -242,10 +244,27 @@ def sequential_model_without_input_shape(uses_learning_phase):
   return model
 
 
+class Subclassed(keras.models.Model):
+
+  def __init__(self):
+    super(Subclassed, self).__init__()
+    self.dense1 = keras.layers.Dense(2)
+    self.dense2 = keras.layers.Dense(3)
+
+  def call(self, inputs):
+    x = self.dense1(inputs)
+    x = self.dense2(x)
+    return x
+
+
+def subclassed_model():
+  return Subclassed()
+
+
 def load_model(sess, path, mode):
-  tags = model_fn_lib.EXPORT_TAG_MAP[mode]
-  sig_def_key = (signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
-                 if mode == model_fn_lib.ModeKeys.PREDICT else mode)
+  tags = model_utils.EXPORT_TAG_MAP[mode]
+  sig_def_key = model_utils.SIGNATURE_KEY_MAP[mode]
+
   meta_graph_def = loader_impl.load(sess, tags, path)
   inputs = {
       k: sess.graph.get_tensor_by_name(v.name)
@@ -268,61 +287,64 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase):
       {
           'model_builder': functional_model,
           'uses_learning_phase': True,
-          'optimizer': training_module.AdadeltaOptimizer(),
+          'optimizer_cls': adadelta.Adadelta,
           'train_before_export': True},
       {
           'model_builder': functional_model,
           'uses_learning_phase': True,
-          'optimizer': training_module.AdadeltaOptimizer(),
+          'optimizer_cls': training_module.AdadeltaOptimizer,
           'train_before_export': False},
       {
           'model_builder': functional_model,
           'uses_learning_phase': False,
-          'optimizer': None,
+          'optimizer_cls': None,
           'train_before_export': False},
       {
           'model_builder': sequential_model,
           'uses_learning_phase': True,
-          'optimizer': training_module.AdadeltaOptimizer(),
+          'optimizer_cls': training_module.AdadeltaOptimizer,
           'train_before_export': True},
       {
           'model_builder': sequential_model,
           'uses_learning_phase': True,
-          'optimizer': training_module.AdadeltaOptimizer(),
+          'optimizer_cls': adadelta.Adadelta,
           'train_before_export': False},
       {
           'model_builder': sequential_model,
           'uses_learning_phase': False,
-          'optimizer': None,
+          'optimizer_cls': None,
           'train_before_export': False},
       {
           'model_builder': sequential_model_without_input_shape,
           'uses_learning_phase': True,
-          'optimizer': training_module.AdadeltaOptimizer(),
+          'optimizer_cls': training_module.AdadeltaOptimizer,
           'train_before_export': False})
   def testSaveAndLoadSavedModelExport(
-      self, model_builder, uses_learning_phase, optimizer, train_before_export):
-    saved_model_path = self._save_model_dir()
-    with self.session(graph=ops.Graph()):
-      np.random.seed(130)
-      input_arr = np.random.random((1, 3))
-      target_arr = np.random.random((1, 3))
-
-      model = model_builder(uses_learning_phase)
-      if optimizer is not None:
-        model.compile(
-            loss='mse',
-            optimizer=optimizer,
-            metrics=['mae'])
-        if train_before_export:
-          model.train_on_batch(input_arr, target_arr)
-
-        ref_loss, ref_mae = model.evaluate(input_arr, target_arr)
+      self, model_builder, uses_learning_phase, optimizer_cls,
+      train_before_export):
+    optimizer = None if optimizer_cls is None else optimizer_cls()
 
-      ref_predict = model.predict(input_arr)
+    saved_model_dir = self._save_model_dir()
+
+    np.random.seed(130)
+    input_arr = np.random.random((1, 3))
+    target_arr = np.random.random((1, 3))
+
+    model = model_builder(uses_learning_phase)
+    if optimizer is not None:
+      model.compile(
+          loss='mse',
+          optimizer=optimizer,
+          metrics=['mae'])
+      if train_before_export:
+        model.train_on_batch(input_arr, target_arr)
+
+      ref_loss, ref_mae = model.evaluate(input_arr, target_arr)
+
+    ref_predict = model.predict(input_arr)
 
-      # Export SavedModel
-      output_path = keras_saved_model.save_keras_model(model, saved_model_path)
+    # Export SavedModel
+    keras_saved_model.export_saved_model(model, saved_model_dir)
 
     input_name = model.input_names[0]
     output_name = model.output_names[0]
@@ -330,8 +352,8 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase):
 
     # Load predict graph, and test predictions
     with session.Session(graph=ops.Graph()) as sess:
-      inputs, outputs, _ = load_model(sess, output_path,
-                                      model_fn_lib.ModeKeys.PREDICT)
+      inputs, outputs, _ = load_model(sess, saved_model_dir,
+                                      mode_keys.ModeKeys.PREDICT)
 
       predictions = sess.run(outputs[output_name],
                              {inputs[input_name]: input_arr})
@@ -340,21 +362,25 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase):
     if optimizer:
       # Load eval graph, and test predictions, loss and metric values
       with session.Session(graph=ops.Graph()) as sess:
-        inputs, outputs, _ = load_model(sess, output_path,
-                                        model_fn_lib.ModeKeys.EVAL)
+        inputs, outputs, _ = load_model(sess, saved_model_dir,
+                                        mode_keys.ModeKeys.TEST)
 
         # First obtain the loss and predictions, and run the metric update op by
         # feeding in the inputs and targets.
+        metrics_name = 'mae' if tf2.enabled() else 'mean_absolute_error'
+        metrics_update_op_key = 'metrics/' + metrics_name + '/update_op'
+        metrics_value_op_key = 'metrics/' + metrics_name + '/value'
+
         loss, predictions, _ = sess.run(
             (outputs['loss'], outputs['predictions/' + output_name],
-             outputs['metrics/mean_absolute_error/update_op']), {
+             outputs[metrics_update_op_key]), {
                  inputs[input_name]: input_arr,
                  inputs[target_name]: target_arr
              })
 
         # The metric value should be run after the update op, to ensure that it
         # reflects the correct value.
-        metric_value = sess.run(outputs['metrics/mean_absolute_error/value'])
+        metric_value = sess.run(outputs[metrics_value_op_key])
 
         self.assertEqual(int(train_before_export),
                          sess.run(training_module.get_global_step()))
@@ -365,12 +391,12 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase):
       # Load train graph, and check for the train op, and prediction values
       with session.Session(graph=ops.Graph()) as sess:
         inputs, outputs, meta_graph_def = load_model(
-            sess, output_path, model_fn_lib.ModeKeys.TRAIN)
+            sess, saved_model_dir, mode_keys.ModeKeys.TRAIN)
         self.assertEqual(int(train_before_export),
                          sess.run(training_module.get_global_step()))
         self.assertIn('loss', outputs)
-        self.assertIn('metrics/mean_absolute_error/update_op', outputs)
-        self.assertIn('metrics/mean_absolute_error/value', outputs)
+        self.assertIn(metrics_update_op_key, outputs)
+        self.assertIn(metrics_value_op_key, outputs)
         self.assertIn('predictions/' + output_name, outputs)
 
         # Train for a step
@@ -391,18 +417,18 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase):
               atol=1e-05)
 
   def testSaveAndLoadSavedModelWithCustomObject(self):
-    saved_model_path = self._save_model_dir()
+    saved_model_dir = self._save_model_dir()
     with session.Session(graph=ops.Graph()) as sess:
       def relu6(x):
         return keras.backend.relu(x, max_value=6)
       inputs = keras.layers.Input(shape=(1,))
       outputs = keras.layers.Activation(relu6)(inputs)
       model = keras.models.Model(inputs, outputs)
-      output_path = keras_saved_model.save_keras_model(
-          model, saved_model_path, custom_objects={'relu6': relu6})
+      keras_saved_model.export_saved_model(
+          model, saved_model_dir, custom_objects={'relu6': relu6})
     with session.Session(graph=ops.Graph()) as sess:
-      inputs, outputs, _ = load_model(sess, output_path,
-                                      model_fn_lib.ModeKeys.PREDICT)
+      inputs, outputs, _ = load_model(sess, saved_model_dir,
+                                      mode_keys.ModeKeys.PREDICT)
       input_name = model.input_names[0]
       output_name = model.output_names[0]
       predictions = sess.run(
@@ -463,12 +489,56 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase):
       clone.compile(loss='mse', optimizer=keras.optimizers.RMSprop(lr=0.0001))
       clone.train_on_batch(input_arr, target_arr)
 
-  def testSaveSeqModelWithoutInputShapesRaisesError(self):
-    """A Sequential model that hasn't been built should raise an error."""
+  def testSaveSequentialModelWithoutInputShapes(self):
     model = sequential_model_without_input_shape(True)
-    with self.assertRaisesRegexp(
-        ValueError, 'must be built'):
-      keras_saved_model.save_keras_model(model, '')
+    # A Sequential model that hasn't been built should raise an error.
+    with self.assertRaisesRegexp(ValueError, 'Please build the model'):
+      keras_saved_model.export_saved_model(model, '')
+
+    saved_model_dir = self._save_model_dir()
+    keras_saved_model.export_saved_model(
+        model,
+        saved_model_dir,
+        input_signature=tensor_spec.TensorSpec(
+            shape=(10, 11, 12, 13, 14), dtype=dtypes.float32,
+            name='spec_input'))
+
+    with session.Session(graph=ops.Graph()) as sess:
+      inputs, outputs, _ = load_model(sess, saved_model_dir,
+                                      mode_keys.ModeKeys.PREDICT)
+      self.assertEqual(5, inputs[next(iter(inputs.keys()))].shape.ndims)
+      self.assertEqual(5, outputs[next(iter(outputs.keys()))].shape.ndims)
+      self.assertEqual(3, outputs[next(iter(outputs.keys()))].shape[-1])
+
+  @parameterized.parameters(
+      {
+          'model_builder': sequential_model_without_input_shape,
+          'input_signature': [tensor_spec.TensorSpec(shape=[None, 3],
+                                                     dtype=dtypes.float32)]},
+      {
+          'model_builder': subclassed_model,
+          'input_signature': [tensor_spec.TensorSpec(shape=[None, 3],
+                                                     dtype=dtypes.float32)]})
+  def testServingOnly(self, model_builder, input_signature):
+    if context.executing_eagerly():
+      saved_model_dir = self._save_model_dir()
+      input_arr = np.random.random((5, 3)).astype(np.float32)
+      model = model_builder()
+      ref_predict = model.predict(input_arr)
+
+      keras_saved_model.export_saved_model(
+          model,
+          saved_model_dir,
+          serving_only=True,
+          input_signature=input_signature)
+
+      # Load predict graph, and test predictions
+      with session.Session(graph=ops.Graph()) as sess:
+        inputs, outputs, _ = load_model(sess, saved_model_dir,
+                                        mode_keys.ModeKeys.PREDICT)
+        predictions = sess.run(outputs[next(iter(outputs.keys()))],
+                               {inputs[next(iter(inputs.keys()))]: input_arr})
+        self.assertAllClose(ref_predict, predictions, atol=1e-05)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/saving/saving_utils.py b/tensorflow/python/keras/saving/saving_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..95da169e82367c7e6ee7ef17fcb22295f8b0242b
--- /dev/null
+++ b/tensorflow/python/keras/saving/saving_utils.py
@@ -0,0 +1,103 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=protected-access
+"""Utils related to keras model saving."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import def_function
+from tensorflow.python.framework import tensor_spec
+from tensorflow.python.util import nest
+
+
+def extract_model_metrics(model):
+  """Convert metrics from a Keras model to (value, update) ops.
+
+  This is used for converting Keras models to Estimators and SavedModels.
+
+  Args:
+    model: A `tf.keras.Model` object.
+
+  Returns:
+    Dictionary mapping metric names to tuples of (value, update) ops. May return
+    `None` if the model does not contain any metrics.
+  """
+  from tensorflow.python.keras import metrics  # pylint: disable=g-import-not-at-top
+  if not getattr(model, '_compile_metrics', None):
+    return None
+
+  # TODO(psv/kathywu): use this implementation in model to estimator flow.
+  eval_metric_ops = {}
+  for metric_name in model.metrics_names[1:]:  # Index 0 is `loss`.
+    m = metrics.Mean()
+    m(model._compile_metrics_tensors[metric_name])
+    eval_metric_ops[metric_name] = m
+  return eval_metric_ops
+
+
+def trace_model_call(model, input_signature=None):
+  """Trace the model call to create a tf.function for exporting a Keras model.
+
+  Args:
+    model: A Keras model.
+    input_signature: optional, a list of tf.TensorSpec objects specifying the
+      inputs to the model.
+
+  Returns:
+    A tf.function wrapping the model's call function with input signatures set.
+
+  Raises:
+    ValueError: if input signature cannot be inferred from the model.
+  """
+  if input_signature is None:
+    if isinstance(model.call, def_function.Function):
+      input_signature = model.call.input_signature
+
+  if input_signature is None:
+    try:
+      inputs = model.inputs
+      input_names = model.input_names
+    except AttributeError:
+      raise ValueError(
+          'Model {} cannot be saved because the input shapes have not been '
+          'set. Usually, input shapes are automatically determined from calling'
+          ' .fit() or .predict(). To manually set the shapes, call '
+          'model._set_inputs(inputs).'.format(model))
+    input_specs = []
+    for input_tensor, input_name in zip(inputs, input_names):
+      input_specs.append(tensor_spec.TensorSpec(
+          shape=input_tensor.shape, dtype=input_tensor.dtype,
+          name=input_name))
+    # The input signature of the call function is a list with one element, since
+    # all tensor inputs must be passed in as the first argument.
+    input_signature = [input_specs] if len(input_specs) > 1 else input_specs
+
+  # TODO(mdan): Should the model's call be autographed by default?
+  @def_function.function(input_signature=input_signature, autograph=False)
+  def _wrapped_model(*args):
+    """A concrete tf.function that wraps the model's call function."""
+    # When given a single input, Keras models will call the model on the tensor
+    # rather than a list consisting of the single tensor.
+    inputs = args[0] if len(input_signature) == 1 else list(args)
+    outputs_list = nest.flatten(model(inputs=inputs))
+    try:
+      output_names = model.output_names
+    except AttributeError:
+      from tensorflow.python.keras.engine import training_utils  # pylint: disable=g-import-not-at-top
+      output_names = training_utils.generic_output_names(outputs_list)
+    return {name: output for name, output in zip(output_names, outputs_list)}
+
+  return _wrapped_model
diff --git a/tensorflow/python/keras/saving/saving_utils_test.py b/tensorflow/python/keras/saving/saving_utils_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbd14c085298861c091b0da0a15ba2743471117f
--- /dev/null
+++ b/tensorflow/python/keras/saving/saving_utils_test.py
@@ -0,0 +1,241 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for saving utility functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+import numpy as np
+
+
+from tensorflow.python import keras
+from tensorflow.python.client import session as session_lib
+from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_spec
+from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.keras.saving import saving_utils
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+from tensorflow.python.saved_model import loader
+from tensorflow.python.saved_model import save as save_lib
+from tensorflow.python.saved_model import signature_constants
+from tensorflow.python.saved_model import tag_constants
+from tensorflow.python.training import rmsprop
+
+
+class TraceModelCallTest(keras_parameterized.TestCase):
+
+  def _assert_all_close(self, expected, actual):
+    if not context.executing_eagerly():
+      with self.cached_session() as sess:
+        K._initialize_variables(sess)
+        self.assertAllClose(expected, actual)
+    else:
+      self.assertAllClose(expected, actual)
+
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_trace_model_outputs(self):
+    input_dim = 5 if testing_utils.get_model_type() == 'functional' else None
+    model = testing_utils.get_small_mlp(10, 3, input_dim)
+    inputs = array_ops.ones((8, 5))
+
+    if input_dim is None:
+      with self.assertRaisesRegexp(ValueError,
+                                   'input shapes have not been set'):
+        saving_utils.trace_model_call(model)
+      model._set_inputs(inputs)
+
+    fn = saving_utils.trace_model_call(model)
+    signature_outputs = fn(inputs)
+    expected_outputs = {model.output_names[0]: model(inputs)}
+
+    self._assert_all_close(expected_outputs, signature_outputs)
+
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_trace_model_outputs_after_fitting(self):
+    input_dim = 5 if testing_utils.get_model_type() == 'functional' else None
+    model = testing_utils.get_small_mlp(10, 3, input_dim)
+    model.compile(optimizer='sgd', loss='mse')
+    model.fit(x=np.random.random((8, 5)),
+              y=np.random.random((8, 3)), epochs=2)
+
+    inputs = array_ops.ones((8, 5))
+
+    fn = saving_utils.trace_model_call(model)
+    signature_outputs = fn(inputs)
+    expected_outputs = {model.output_names[0]: model(inputs)}
+
+    self._assert_all_close(expected_outputs, signature_outputs)
+
+  @keras_parameterized.run_with_all_model_types(exclude_models='sequential')
+  @keras_parameterized.run_all_keras_modes
+  def test_trace_multi_io_model_outputs(self):
+    input_dim = 5
+    num_classes = 3
+    num_classes_b = 4
+    input_a = keras.layers.Input(shape=(input_dim,), name='input_a')
+    input_b = keras.layers.Input(shape=(input_dim,), name='input_b')
+
+    dense = keras.layers.Dense(num_classes, name='dense')
+    dense2 = keras.layers.Dense(num_classes_b, name='dense2')
+    dropout = keras.layers.Dropout(0.5, name='dropout')
+    branch_a = [input_a, dense]
+    branch_b = [input_b, dense, dense2, dropout]
+
+    model = testing_utils.get_multi_io_model(branch_a, branch_b)
+
+    input_a_np = np.random.random((10, input_dim)).astype(np.float32)
+    input_b_np = np.random.random((10, input_dim)).astype(np.float32)
+
+    if testing_utils.get_model_type() == 'subclass':
+      with self.assertRaisesRegexp(ValueError,
+                                   'input shapes have not been set'):
+        saving_utils.trace_model_call(model)
+
+    model.compile(optimizer='sgd', loss='mse')
+    model.fit(x=[np.random.random((8, input_dim)).astype(np.float32),
+                 np.random.random((8, input_dim)).astype(np.float32)],
+              y=[np.random.random((8, num_classes)).astype(np.float32),
+                 np.random.random((8, num_classes_b)).astype(np.float32)],
+              epochs=2)
+
+    fn = saving_utils.trace_model_call(model)
+    signature_outputs = fn([input_a_np, input_b_np])
+    outputs = model([input_a_np, input_b_np])
+    expected_outputs = {model.output_names[0]: outputs[0],
+                        model.output_names[1]: outputs[1]}
+
+    self._assert_all_close(expected_outputs, signature_outputs)
+
+  @keras_parameterized.run_all_keras_modes
+  def test_specify_input_signature(self):
+    model = testing_utils.get_small_sequential_mlp(10, 3, None)
+    inputs = array_ops.ones((8, 5))
+
+    with self.assertRaisesRegexp(ValueError, 'input shapes have not been set'):
+      saving_utils.trace_model_call(model)
+
+    fn = saving_utils.trace_model_call(
+        model, [tensor_spec.TensorSpec(shape=[None, 5], dtype=dtypes.float32)])
+    signature_outputs = fn(inputs)
+    expected_outputs = {model.output_names[0]: model(inputs)}
+    self._assert_all_close(expected_outputs, signature_outputs)
+
+  @keras_parameterized.run_all_keras_modes
+  def test_subclassed_model_with_input_signature(self):
+
+    class Model(keras.Model):
+
+      def __init__(self):
+        super(Model, self).__init__()
+        self.dense = keras.layers.Dense(3, name='dense')
+
+      @def_function.function(
+          input_signature=[[tensor_spec.TensorSpec([None, 5], dtypes.float32),
+                            tensor_spec.TensorSpec([None], dtypes.float32)]],)
+      def call(self, inputs, *args):
+        x, y = inputs
+        return self.dense(x) + y
+
+    model = Model()
+    fn = saving_utils.trace_model_call(model)
+    x = array_ops.ones((8, 5), dtype=dtypes.float32)
+    y = array_ops.ones((3,), dtype=dtypes.float32)
+    expected_outputs = {'output_1': model([x, y])}
+    signature_outputs = fn([x, y])
+    self._assert_all_close(expected_outputs, signature_outputs)
+
+
+def _import_and_infer(save_dir, inputs):
+  """Import a SavedModel into a TF 1.x-style graph and run `signature_key`."""
+  graph = ops.Graph()
+  with graph.as_default(), session_lib.Session() as session:
+    model = loader.load(session, [tag_constants.SERVING], save_dir)
+    signature = model.signature_def[
+        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
+    assert set(inputs.keys()) == set(signature.inputs.keys())
+    feed_dict = {}
+    for arg_name in inputs.keys():
+      feed_dict[graph.get_tensor_by_name(signature.inputs[arg_name].name)] = (
+          inputs[arg_name])
+    output_dict = {}
+    for output_name, output_tensor_info in signature.outputs.items():
+      output_dict[output_name] = graph.get_tensor_by_name(
+          output_tensor_info.name)
+    return session.run(output_dict, feed_dict=feed_dict)
+
+
+class ModelSaveTest(keras_parameterized.TestCase):
+
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes(always_skip_v1=True)
+  def test_model_save(self):
+    input_dim = 5
+    model = testing_utils.get_small_mlp(10, 3, input_dim)
+    inputs = array_ops.ones((8, 5))
+
+    if testing_utils.get_model_type() == 'subclass':
+      model._set_inputs(inputs)
+
+    save_dir = os.path.join(self.get_temp_dir(), 'saved_model')
+    save_lib.save(model, save_dir)
+
+    self.assertAllClose(
+        {model.output_names[0]: model.predict_on_batch(inputs)},
+        _import_and_infer(save_dir, {model.input_names[0]: np.ones((8, 5))}))
+
+
+class ExtractModelMetricsTest(test.TestCase):
+
+  def test_extract_model_metrics(self):
+    a = keras.layers.Input(shape=(3,), name='input_a')
+    b = keras.layers.Input(shape=(3,), name='input_b')
+
+    dense = keras.layers.Dense(4, name='dense')
+    c = dense(a)
+    d = dense(b)
+    e = keras.layers.Dropout(0.5, name='dropout')(c)
+
+    model = keras.models.Model([a, b], [d, e])
+    extract_metrics = saving_utils.extract_model_metrics(model)
+    self.assertEqual(None, extract_metrics)
+
+    extract_metric_names = [
+        'dense_loss', 'dropout_loss', 'dense_binary_accuracy',
+        'dropout_binary_accuracy'
+    ]
+    model_metric_names = ['loss'] + extract_metric_names
+    model.compile(
+        loss='mae',
+        metrics=[keras.metrics.BinaryAccuracy()],
+        optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.01),
+        run_eagerly=None)
+    extract_metrics = saving_utils.extract_model_metrics(model)
+    self.assertEqual(set(model_metric_names), set(model.metrics_names))
+    self.assertEqual(set(extract_metric_names), set(extract_metrics.keys()))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/testing_utils.py b/tensorflow/python/keras/testing_utils.py
index fd062b0ab337aa6fa62a7603a36749cde315c3da..fdc01d1dcd842025c3152e8884c7201d497f9576 100644
--- a/tensorflow/python/keras/testing_utils.py
+++ b/tensorflow/python/keras/testing_utils.py
@@ -25,7 +25,13 @@ import numpy as np
 from tensorflow.python import keras
 from tensorflow.python.eager import context
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.training.rmsprop import RMSPropOptimizer
+from tensorflow.python.keras.optimizer_v2 import adadelta as adadelta_v2
+from tensorflow.python.keras.optimizer_v2 import adagrad as adagrad_v2
+from tensorflow.python.keras.optimizer_v2 import adam as adam_v2
+from tensorflow.python.keras.optimizer_v2 import adamax as adamax_v2
+from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_descent_v2
+from tensorflow.python.keras.optimizer_v2 import nadam as nadam_v2
+from tensorflow.python.keras.optimizer_v2 import rmsprop as rmsprop_v2
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util import tf_inspect
 
@@ -154,7 +160,7 @@ def layer_test(layer_cls, kwargs=None, input_shape=None, input_dtype=None,
     weights = model.get_weights()
     recovered_model.set_weights(weights)
     output = recovered_model.predict(input_data)
-    np.testing.assert_allclose(output, actual_output, rtol=1e-3)
+    np.testing.assert_allclose(output, actual_output, rtol=2e-3)
 
   # test training mode (e.g. useful for dropout tests)
   # Rebuild the model to avoid the graph being reused between predict() and
@@ -162,10 +168,13 @@ def layer_test(layer_cls, kwargs=None, input_shape=None, input_dtype=None,
   # See b/120160788 for more details. This should be mitigated after 2.0.
   model = keras.models.Model(x, layer(x))
   if _thread_local_data.run_eagerly is not None:
-    model.compile(RMSPropOptimizer(0.01), 'mse', weighted_metrics=['acc'],
-                  run_eagerly=should_run_eagerly())
+    model.compile(
+        'rmsprop',
+        'mse',
+        weighted_metrics=['acc'],
+        run_eagerly=should_run_eagerly())
   else:
-    model.compile(RMSPropOptimizer(0.01), 'mse', weighted_metrics=['acc'])
+    model.compile('rmsprop', 'mse', weighted_metrics=['acc'])
   model.train_on_batch(input_data, actual_output)
 
   # test as first layer in Sequential API
@@ -200,7 +209,7 @@ def layer_test(layer_cls, kwargs=None, input_shape=None, input_dtype=None,
     weights = model.get_weights()
     recovered_model.set_weights(weights)
     output = recovered_model.predict(input_data)
-    np.testing.assert_allclose(output, actual_output, rtol=1e-3)
+    np.testing.assert_allclose(output, actual_output, rtol=2e-3)
 
   # for further checks in the caller function
   return actual_output
@@ -355,11 +364,20 @@ class _SubclassModel(keras.Model):
 
   def __init__(self, layers):
     super(_SubclassModel, self).__init__()
-    self.all_layers = layers
+    # Note that clone and build doesn't support lists of layers in subclassed
+    # models. Adding each layer directly here.
+    for i, layer in enumerate(layers):
+      setattr(self, self._layer_name_for_i(i), layer)
+
+    self.num_layers = len(layers)
+
+  def _layer_name_for_i(self, i):
+    return 'layer{}'.format(i)
 
   def call(self, inputs, **kwargs):
     x = inputs
-    for layer in self.all_layers:
+    for i in range(self.num_layers):
+      layer = getattr(self, self._layer_name_for_i(i))
       x = layer(x)
     return x
 
@@ -626,3 +644,39 @@ def get_multi_io_model(
     return keras.Model(inputs, outputs)
 
   raise ValueError('Unknown model type {}'.format(model_type))
+
+
+_V2_OPTIMIZER_MAP = {
+    'adadelta': adadelta_v2.Adadelta,
+    'adagrad': adagrad_v2.Adagrad,
+    'adam': adam_v2.Adam,
+    'adamax': adamax_v2.Adamax,
+    'nadam': nadam_v2.Nadam,
+    'rmsprop': rmsprop_v2.RMSprop,
+    'sgd': gradient_descent_v2.SGD
+}
+
+
+def get_v2_optimizer(name, **kwargs):
+  """Get the v2 optimizer requested.
+
+  This is only necessary until v2 are the default, as we are testing in Eager,
+  and Eager + v1 optimizers fail tests. When we are in v2, the strings alone
+  should be sufficient, and this mapping can theoretically be removed.
+
+  Args:
+    name: string name of Keras v2 optimizer.
+    **kwargs: any kwargs to pass to the optimizer constructor.
+
+  Returns:
+    Initialized Keras v2 optimizer.
+
+  Raises:
+    ValueError: if an unknown name was passed.
+  """
+  try:
+    return _V2_OPTIMIZER_MAP[name](**kwargs)
+  except KeyError:
+    raise ValueError(
+        'Could not find requested v2 optimizer: {}\nValid choices: {}'.format(
+            name, list(_V2_OPTIMIZER_MAP.keys())))
diff --git a/tensorflow/python/keras/utils/__init__.py b/tensorflow/python/keras/utils/__init__.py
index 61940ad789c4009fca5462079014482fb8bfec1b..66d9817a6aecd28aafcf01896d089a342401fca7 100644
--- a/tensorflow/python/keras/utils/__init__.py
+++ b/tensorflow/python/keras/utils/__init__.py
@@ -34,10 +34,12 @@ from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
 from tensorflow.python.keras.utils.io_utils import HDF5Matrix
 from tensorflow.python.keras.utils.layer_utils import convert_all_kernels_in_model
 from tensorflow.python.keras.utils.layer_utils import get_source_inputs
+from tensorflow.python.keras.utils.layer_utils import print_summary
 from tensorflow.python.keras.utils.losses_utils import squeeze_or_expand_dimensions
 from tensorflow.python.keras.utils.multi_gpu_utils import multi_gpu_model
 from tensorflow.python.keras.utils.np_utils import normalize
 from tensorflow.python.keras.utils.np_utils import to_categorical
+from tensorflow.python.keras.utils.vis_utils import model_to_dot
 from tensorflow.python.keras.utils.vis_utils import plot_model
 
 del absolute_import
diff --git a/tensorflow/python/keras/utils/composite_tensor_support_test.py b/tensorflow/python/keras/utils/composite_tensor_support_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..67e7711ebd61079b625890b14520ef18b724ddbb
--- /dev/null
+++ b/tensorflow/python/keras/utils/composite_tensor_support_test.py
@@ -0,0 +1,133 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Keras composite tensor support."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.keras.layers import core
+from tensorflow.python.keras.layers import Layer
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import sparse_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.platform import test
+
+
+# Define test-only Layer classes to validate passing Sparse and Ragged tensors
+# between layers.
+class ToDense(Layer):
+  """Create a dense (standard) tensor from the given input tensor."""
+
+  def __init__(self, default_value, **kwargs):
+    super(ToDense, self).__init__(**kwargs)
+    self._default_value = default_value
+
+  def call(self, inputs):
+    if isinstance(inputs, ragged_tensor.RaggedTensor):
+      return inputs.to_tensor(default_value=self._default_value)
+    elif isinstance(inputs, sparse_tensor.SparseTensor):
+      return sparse_ops.sparse_tensor_to_dense(
+          inputs, default_value=self._default_value)
+    elif isinstance(inputs, ops.Tensor):
+      return inputs
+    else:
+      raise TypeError("Unexpected tensor type %s" % type(inputs).__name__)
+
+
+class ToRagged(Layer):
+  """Create a ragged tensor based on a given dense tensor."""
+
+  def __init__(self, padding, ragged_rank=1, **kwargs):
+    super(ToRagged, self).__init__(**kwargs)
+    self._padding = padding
+    self._ragged_rank = ragged_rank
+
+  def call(self, inputs):
+    return ragged_tensor.RaggedTensor.from_tensor(
+        inputs, padding=self._padding, ragged_rank=self._ragged_rank)
+
+
+class ToSparse(Layer):
+  """Create a sparse tensor based on a given dense tensor."""
+
+  def call(self, inputs):
+    indices = array_ops.where(math_ops.not_equal(inputs, 0))
+    values = array_ops.gather_nd(inputs, indices)
+    shape = array_ops.shape(inputs, out_type=dtypes.int64)
+    return sparse_tensor.SparseTensor(indices, values, dense_shape=shape)
+
+
+@keras_parameterized.run_with_all_model_types
+@keras_parameterized.run_all_keras_modes
+class InternalCompositeTest(keras_parameterized.TestCase):
+
+  def test_model_with_internal_ragged_tensors(self):
+    # Create a model that accepts an input, converts it to Ragged, and
+    # converts the ragged tensor back to a dense tensor.
+    layers = [ToRagged(padding=0), ToDense(default_value=-1)]
+    model = testing_utils.get_model_from_layers(layers, input_shape=(None,))
+
+    # Define some training data with additional padding.
+    input_data = np.array([[1, 0, 0], [2, 3, 0]])
+    expected_output = np.array([[1, -1], [2, 3]])
+    output = model.predict(input_data)
+    self.assertAllEqual(expected_output, output)
+
+  def test_model_with_internal_sparse_tensors(self):
+    # Create a model that accepts an input, converts it to Sparse, and
+    # converts the sparse tensor back to a dense tensor.
+    layers = [ToSparse(), ToDense(default_value=-1)]
+    model = testing_utils.get_model_from_layers(layers, input_shape=(None,))
+
+    # Define some training data with additional padding.
+    input_data = np.array([[1, 0, 0], [2, 3, 0]])
+    expected_output = np.array([[1, -1, -1], [2, 3, -1]])
+    output = model.predict(input_data)
+    self.assertAllEqual(expected_output, output)
+
+  def test_training_model_with_internal_ragged_tensors(self):
+
+    # Create a model that implements y=Mx. This is easy to learn and will
+    # demonstrate appropriate gradient passing. (We have to use RaggedTensors
+    # for this test, as ToSparse() doesn't support gradient propagation through
+    # the layer.) TODO(b/124796939): Investigate this.
+    layers = [core.Dense(2), ToRagged(padding=0), ToDense(default_value=-1)]
+    model = testing_utils.get_model_from_layers(layers, input_shape=(1,))
+
+    input_data = np.random.rand(1024, 1)
+    expected_data = np.concatenate((input_data * 3, input_data * .5), axis=-1)
+
+    model.compile(
+        loss="mse",
+        optimizer="adam",
+        run_eagerly=testing_utils.should_run_eagerly())
+    history = model.fit(input_data, expected_data, epochs=10, verbose=0)
+
+    # If the model trained, the loss stored at history[0] should be different
+    # than the one stored at history[-1].
+    self.assertNotEqual(history.history["loss"][-1], history.history["loss"][0])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/keras/utils/conv_utils.py b/tensorflow/python/keras/utils/conv_utils.py
index f486e631e50e5beb8da606879f23cd67131389f5..ea7427f61a8cc234f69df28d111d26b87b326a48 100644
--- a/tensorflow/python/keras/utils/conv_utils.py
+++ b/tensorflow/python/keras/utils/conv_utils.py
@@ -194,9 +194,11 @@ def normalize_data_format(value):
 
 
 def normalize_padding(value):
+  if isinstance(value, (list, tuple)):
+    return value
   padding = value.lower()
   if padding not in {'valid', 'same', 'causal'}:
-    raise ValueError('The `padding` argument must be one of '
+    raise ValueError('The `padding` argument must be a list/tuple or one of '
                      '"valid", "same" (or "causal", only for `Conv1D). '
                      'Received: ' + str(padding))
   return padding
diff --git a/tensorflow/python/keras/utils/conv_utils_test.py b/tensorflow/python/keras/utils/conv_utils_test.py
index eb2a360bfdaf04d695a599b477c0d154bac062cd..ef7ad1b8c53edbc313d95382b248b159c6c2da1d 100644
--- a/tensorflow/python/keras/utils/conv_utils_test.py
+++ b/tensorflow/python/keras/utils/conv_utils_test.py
@@ -52,6 +52,114 @@ input_shapes = [
 ]
 
 
+class TestBasicConvUtilsTest(test.TestCase):
+
+  def test_convert_data_format(self):
+    self.assertEqual('NCDHW', conv_utils.convert_data_format(
+        'channels_first', 5))
+    self.assertEqual('NCHW', conv_utils.convert_data_format(
+        'channels_first', 4))
+    self.assertEqual('NCW', conv_utils.convert_data_format('channels_first', 3))
+    self.assertEqual('NHWC', conv_utils.convert_data_format('channels_last', 4))
+    self.assertEqual('NWC', conv_utils.convert_data_format('channels_last', 3))
+    self.assertEqual('NDHWC', conv_utils.convert_data_format(
+        'channels_last', 5))
+
+    with self.assertRaises(ValueError):
+      conv_utils.convert_data_format('invalid', 2)
+
+  def test_normalize_tuple(self):
+    self.assertEqual((2, 2, 2),
+                     conv_utils.normalize_tuple(2, n=3, name='strides'))
+    self.assertEqual((2, 1, 2),
+                     conv_utils.normalize_tuple((2, 1, 2), n=3, name='strides'))
+
+    with self.assertRaises(ValueError):
+      conv_utils.normalize_tuple((2, 1), n=3, name='strides')
+
+    with self.assertRaises(ValueError):
+      conv_utils.normalize_tuple(None, n=3, name='strides')
+
+  def test_normalize_data_format(self):
+    self.assertEqual('channels_last',
+                     conv_utils.normalize_data_format('Channels_Last'))
+    self.assertEqual('channels_first',
+                     conv_utils.normalize_data_format('CHANNELS_FIRST'))
+
+    with self.assertRaises(ValueError):
+      conv_utils.normalize_data_format('invalid')
+
+  def test_normalize_padding(self):
+    self.assertEqual('same', conv_utils.normalize_padding('SAME'))
+    self.assertEqual('valid', conv_utils.normalize_padding('VALID'))
+
+    with self.assertRaises(ValueError):
+      conv_utils.normalize_padding('invalid')
+
+  def test_conv_output_length(self):
+    self.assertEqual(4, conv_utils.conv_output_length(4, 2, 'same', 1, 1))
+    self.assertEqual(2, conv_utils.conv_output_length(4, 2, 'same', 2, 1))
+    self.assertEqual(3, conv_utils.conv_output_length(4, 2, 'valid', 1, 1))
+    self.assertEqual(2, conv_utils.conv_output_length(4, 2, 'valid', 2, 1))
+    self.assertEqual(5, conv_utils.conv_output_length(4, 2, 'full', 1, 1))
+    self.assertEqual(3, conv_utils.conv_output_length(4, 2, 'full', 2, 1))
+    self.assertEqual(2, conv_utils.conv_output_length(5, 2, 'valid', 2, 2))
+
+  def test_conv_input_length(self):
+    self.assertEqual(3, conv_utils.conv_input_length(4, 2, 'same', 1))
+    self.assertEqual(2, conv_utils.conv_input_length(2, 2, 'same', 2))
+    self.assertEqual(4, conv_utils.conv_input_length(3, 2, 'valid', 1))
+    self.assertEqual(4, conv_utils.conv_input_length(2, 2, 'valid', 2))
+    self.assertEqual(3, conv_utils.conv_input_length(4, 2, 'full', 1))
+    self.assertEqual(4, conv_utils.conv_input_length(3, 2, 'full', 2))
+
+  def test_deconv_output_length(self):
+    self.assertEqual(4, conv_utils.deconv_output_length(4, 2, 'same', stride=1))
+    self.assertEqual(8, conv_utils.deconv_output_length(4, 2, 'same', stride=2))
+    self.assertEqual(5, conv_utils.deconv_output_length(
+        4, 2, 'valid', stride=1))
+    self.assertEqual(8, conv_utils.deconv_output_length(
+        4, 2, 'valid', stride=2))
+    self.assertEqual(3, conv_utils.deconv_output_length(4, 2, 'full', stride=1))
+    self.assertEqual(6, conv_utils.deconv_output_length(4, 2, 'full', stride=2))
+    self.assertEqual(
+        5,
+        conv_utils.deconv_output_length(
+            4, 2, 'same', output_padding=2, stride=1))
+    self.assertEqual(
+        7,
+        conv_utils.deconv_output_length(
+            4, 2, 'same', output_padding=1, stride=2))
+    self.assertEqual(
+        7,
+        conv_utils.deconv_output_length(
+            4, 2, 'valid', output_padding=2, stride=1))
+    self.assertEqual(
+        9,
+        conv_utils.deconv_output_length(
+            4, 2, 'valid', output_padding=1, stride=2))
+    self.assertEqual(
+        5,
+        conv_utils.deconv_output_length(
+            4, 2, 'full', output_padding=2, stride=1))
+    self.assertEqual(
+        7,
+        conv_utils.deconv_output_length(
+            4, 2, 'full', output_padding=1, stride=2))
+    self.assertEqual(
+        5,
+        conv_utils.deconv_output_length(
+            4, 2, 'same', output_padding=1, stride=1, dilation=2))
+    self.assertEqual(
+        12,
+        conv_utils.deconv_output_length(
+            4, 2, 'valid', output_padding=2, stride=2, dilation=3))
+    self.assertEqual(
+        6,
+        conv_utils.deconv_output_length(
+            4, 2, 'full', output_padding=2, stride=2, dilation=3))
+
+
 @parameterized.parameters(input_shapes)
 class TestConvUtils(test.TestCase, parameterized.TestCase):
 
diff --git a/tensorflow/python/keras/utils/data_utils.py b/tensorflow/python/keras/utils/data_utils.py
index 01a9d61a84c8ceb5a251a80c9440c0ba6469e64f..0f6e89b4d273ba37174cfa2f5c20a473ab6087ea 100644
--- a/tensorflow/python/keras/utils/data_utils.py
+++ b/tensorflow/python/keras/utils/data_utils.py
@@ -40,7 +40,7 @@ from six.moves.urllib.request import urlopen
 
 from tensorflow.python.keras.utils.generic_utils import Progbar
 from tensorflow.python.util import tf_inspect
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 
 try:
@@ -144,7 +144,7 @@ def _extract_archive(file_path, path='.', archive_format='auto'):
   return False
 
 
-@tf_export('keras.utils.get_file')
+@keras_export('keras.utils.get_file')
 def get_file(fname,
              origin,
              untar=False,
@@ -246,10 +246,10 @@ def get_file(fname,
     try:
       try:
         urlretrieve(origin, fpath, dl_progress)
-      except URLError as e:
-        raise Exception(error_msg.format(origin, e.errno, e.reason))
       except HTTPError as e:
         raise Exception(error_msg.format(origin, e.code, e.msg))
+      except URLError as e:
+        raise Exception(error_msg.format(origin, e.errno, e.reason))
     except (Exception, KeyboardInterrupt) as e:
       if os.path.exists(fpath):
         os.remove(fpath)
@@ -324,7 +324,7 @@ def validate_file(fpath, file_hash, algorithm='auto', chunk_size=65535):
     return False
 
 
-@tf_export('keras.utils.Sequence')
+@keras_export('keras.utils.Sequence')
 class Sequence(object):
   """Base object for fitting to a sequence of data, such as a dataset.
 
@@ -445,7 +445,7 @@ def get_index(uid, i):
   return _SHARED_SEQUENCES[uid][i]
 
 
-@tf_export('keras.utils.SequenceEnqueuer')
+@keras_export('keras.utils.SequenceEnqueuer')
 class SequenceEnqueuer(object):
   """Base class to enqueue inputs.
 
@@ -570,7 +570,7 @@ class SequenceEnqueuer(object):
     raise NotImplementedError
 
 
-@tf_export('keras.utils.OrderedEnqueuer')
+@keras_export('keras.utils.OrderedEnqueuer')
 class OrderedEnqueuer(SequenceEnqueuer):
   """Builds a Enqueuer from a Sequence.
 
@@ -596,9 +596,9 @@ class OrderedEnqueuer(SequenceEnqueuer):
         Function, a Function to initialize the pool
     """
     def pool_fn(seqs):
-      return multiprocessing.Pool(workers,
-                                  initializer=init_pool_generator,
-                                  initargs=(seqs, self.random_seed))
+      return multiprocessing.Pool(
+          workers, initializer=init_pool_generator, initargs=(seqs, None))
+
     return pool_fn
 
   def _wait_queue(self):
@@ -680,7 +680,7 @@ def next_sample(uid):
   return six.next(_SHARED_SEQUENCES[uid])
 
 
-@tf_export('keras.utils.GeneratorEnqueuer')
+@keras_export('keras.utils.GeneratorEnqueuer')
 class GeneratorEnqueuer(SequenceEnqueuer):
   """Builds a queue out of a data generator.
 
diff --git a/tensorflow/python/keras/utils/generic_utils.py b/tensorflow/python/keras/utils/generic_utils.py
index c331ce430bd761ca4beb2d6f8ab2e314e2e3178c..85457720dc1445e9a1818e59170fcbb0485772ea 100644
--- a/tensorflow/python/keras/utils/generic_utils.py
+++ b/tensorflow/python/keras/utils/generic_utils.py
@@ -32,12 +32,12 @@ import six
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 _GLOBAL_CUSTOM_OBJECTS = {}
 
 
-@tf_export('keras.utils.CustomObjectScope')
+@keras_export('keras.utils.CustomObjectScope')
 class CustomObjectScope(object):
   """Provides a scope that changes to `_GLOBAL_CUSTOM_OBJECTS` cannot escape.
 
@@ -73,7 +73,7 @@ class CustomObjectScope(object):
     _GLOBAL_CUSTOM_OBJECTS.update(self.backup)
 
 
-@tf_export('keras.utils.custom_object_scope')
+@keras_export('keras.utils.custom_object_scope')
 def custom_object_scope(*args):
   """Provides a scope that changes to `_GLOBAL_CUSTOM_OBJECTS` cannot escape.
 
@@ -104,7 +104,7 @@ def custom_object_scope(*args):
   return CustomObjectScope(*args)
 
 
-@tf_export('keras.utils.get_custom_objects')
+@keras_export('keras.utils.get_custom_objects')
 def get_custom_objects():
   """Retrieves a live reference to the global dictionary of custom objects.
 
@@ -130,7 +130,7 @@ def serialize_keras_class_and_config(cls_name, cls_config):
   return {'class_name': cls_name, 'config': cls_config}
 
 
-@tf_export('keras.utils.serialize_keras_object')
+@keras_export('keras.utils.serialize_keras_object')
 def serialize_keras_object(instance):
   _, instance = tf_decorator.unwrap(instance)
   if instance is None:
@@ -167,7 +167,7 @@ def class_and_config_for_serialized_keras_object(
   return (cls, config['config'])
 
 
-@tf_export('keras.utils.deserialize_keras_object')
+@keras_export('keras.utils.deserialize_keras_object')
 def deserialize_keras_object(identifier,
                              module_objects=None,
                              custom_objects=None,
@@ -306,7 +306,7 @@ def has_arg(fn, name, accept_all=False):
   return name in arg_spec.args
 
 
-@tf_export('keras.utils.Progbar')
+@keras_export('keras.utils.Progbar')
 class Progbar(object):
   """Displays a progress bar.
 
@@ -391,9 +391,8 @@ class Progbar(object):
         sys.stdout.write('\n')
 
       if self.target is not None:
-        numdigits = int(np.floor(np.log10(self.target))) + 1
-        barstr = '%%%dd/%d [' % (numdigits, self.target)
-        bar = barstr % current
+        numdigits = int(np.log10(self.target)) + 1
+        bar = ('%' + str(numdigits) + 'd/%d') % (current, self.target)
         prog = float(current) / self.target
         prog_width = int(self.width * prog)
         if prog_width > 0:
@@ -456,7 +455,10 @@ class Progbar(object):
       sys.stdout.flush()
 
     elif self.verbose == 2:
-      if self.target is None or current >= self.target:
+      if self.target is not None and current >= self.target:
+        numdigits = int(np.log10(self.target)) + 1
+        count = ('%' + str(numdigits) + 'd/%d') % (current, self.target)
+        info = count + info
         for k in self._values_order:
           info += ' - %s:' % k
           avg = np.mean(self._values[k][0] / max(1, self._values[k][1]))
@@ -570,11 +572,8 @@ def to_snake_case(name):
   return 'private' + insecure
 
 
-def is_all_none(iterable_or_element):
-  if not isinstance(iterable_or_element, (list, tuple)):
-    iterable = [iterable_or_element]
-  else:
-    iterable = iterable_or_element
+def is_all_none(structure):
+  iterable = nest.flatten(structure)
   # We cannot use Python's `any` because the iterable may return Tensors.
   for element in iterable:
     if element is not None:
diff --git a/tensorflow/python/keras/utils/io_utils.py b/tensorflow/python/keras/utils/io_utils.py
index 62674a9c77fc410a551d2ac79c22ecf959b16fc3..5bb9a93ccade10221cb1f1594693b415e2061c72 100644
--- a/tensorflow/python/keras/utils/io_utils.py
+++ b/tensorflow/python/keras/utils/io_utils.py
@@ -22,7 +22,7 @@ from collections import defaultdict
 
 import numpy as np
 import six
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 
 try:
@@ -31,7 +31,7 @@ except ImportError:
   h5py = None
 
 
-@tf_export('keras.utils.HDF5Matrix')
+@keras_export('keras.utils.HDF5Matrix')
 class HDF5Matrix(object):
   """Representation of HDF5 dataset to be used instead of a Numpy array.
 
diff --git a/tensorflow/python/keras/utils/kernelized_utils.py b/tensorflow/python/keras/utils/kernelized_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e73cb2d4c63df2f1098802deffbcc899039d0cb
--- /dev/null
+++ b/tensorflow/python/keras/utils/kernelized_utils.py
@@ -0,0 +1,117 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utility methods related to kernelized layers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+
+
+def _to_matrix(u):
+  """If input tensor is a vector (i.e., has rank 1), converts it to matrix."""
+  u_rank = len(u.shape)
+  if u_rank not in [1, 2]:
+    raise ValueError('The input tensor should have rank 1 or 2. Given rank: {}'
+                     .format(u_rank))
+  if u_rank == 1:
+    return array_ops.expand_dims(u, 0)
+  return u
+
+
+def _align_matrices(x, y):
+  """Aligns x and y tensors to allow computations over pairs of their rows."""
+  x_matrix = _to_matrix(x)
+  y_matrix = _to_matrix(y)
+  x_shape = x_matrix.shape
+  y_shape = y_matrix.shape
+  if y_shape[1] != x_shape[1]:  # dimensions do not match.
+    raise ValueError(
+        'The outermost dimensions of the input tensors should match. Given: {} '
+        'vs {}.'.format(y_shape[1], x_shape[1]))
+
+  x_tile = array_ops.tile(
+      array_ops.expand_dims(x_matrix, 1), [1, y_shape[0], 1])
+  y_tile = array_ops.tile(
+      array_ops.expand_dims(y_matrix, 0), [x_shape[0], 1, 1])
+  return x_tile, y_tile
+
+
+def inner_product(u, v):
+  u = _to_matrix(u)
+  v = _to_matrix(v)
+  return math_ops.matmul(u, v, transpose_b=True)
+
+
+def exact_gaussian_kernel(x, y, stddev):
+  """Computes exact Gaussian kernel value(s) for tensors x and y and stddev.
+
+  The Gaussian kernel for vectors u, v is defined as follows:
+       K(u, v) = exp(-||u-v||^2 / (2* stddev^2))
+  where the norm is the l2-norm. x, y can be either vectors or matrices. If they
+  are vectors, they must have the same dimension. If they are matrices, they
+  must have the same number of columns. In the latter case, the method returns
+  (as a matrix) K(u, v) values for all pairs (u, v) where u is a row from x and
+  v is a row from y.
+
+  Args:
+    x: a tensor of rank 1 or 2. It's shape should be either [dim] or [m, dim].
+    y: a tensor of rank 1 or 2. It's shape should be either [dim] or [n, dim].
+    stddev: The width of the Gaussian kernel.
+
+  Returns:
+    A single value (scalar) with shape (1, 1) (if x, y are vectors) or a matrix
+      of shape (m, n) with entries K(u, v) (where K is the Gaussian kernel) for
+      all (u,v) pairs where u, v are rows from x and y respectively.
+
+  Raises:
+    InvalidShapeError: if the shapes of x, y are not compatible.
+  """
+  x_aligned, y_aligned = _align_matrices(x, y)
+  diff_squared_l2_norm = math_ops.reduce_sum(
+      math_ops.squared_difference(x_aligned, y_aligned), 2)
+  return math_ops.exp(-diff_squared_l2_norm / (2 * stddev * stddev))
+
+
+def exact_laplacian_kernel(x, y, stddev):
+  """Computes exact Laplacian kernel value(s) for tensors x and y using stddev.
+
+  The Laplacian kernel for vectors u, v is defined as follows:
+       K(u, v) = exp(-||u-v|| / stddev)
+  where the norm is the l1-norm. x, y can be either vectors or matrices. If they
+  are vectors, they must have the same dimension. If they are matrices, they
+  must have the same number of columns. In the latter case, the method returns
+  (as a matrix) K(u, v) values for all pairs (u, v) where u is a row from x and
+  v is a row from y.
+
+  Args:
+    x: a tensor of rank 1 or 2. It's shape should be either [dim] or [m, dim].
+    y: a tensor of rank 1 or 2. It's shape should be either [dim] or [n, dim].
+    stddev: The width of the Gaussian kernel.
+
+  Returns:
+    A single value (scalar) with shape (1, 1)  if x, y are vectors or a matrix
+    of shape (m, n) with entries K(u, v) (where K is the Laplacian kernel) for
+    all (u,v) pairs where u, v are rows from x and y respectively.
+
+  Raises:
+    InvalidShapeError: if the shapes of x, y are not compatible.
+  """
+  x_aligned, y_aligned = _align_matrices(x, y)
+  diff_l1_norm = math_ops.reduce_sum(
+      math_ops.abs(math_ops.subtract(x_aligned, y_aligned)), 2)
+  return math_ops.exp(-diff_l1_norm / stddev)
diff --git a/tensorflow/python/keras/utils/kernelized_utils_test.py b/tensorflow/python/keras/utils/kernelized_utils_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e9a72493ddee5cf1d0f310c06d0fa1860b2a61f
--- /dev/null
+++ b/tensorflow/python/keras/utils/kernelized_utils_test.py
@@ -0,0 +1,116 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for kernelized_utils.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+
+from absl.testing import parameterized
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.keras.utils import kernelized_utils
+from tensorflow.python.platform import test
+
+
+def _exact_gaussian(stddev):
+  return functools.partial(
+      kernelized_utils.exact_gaussian_kernel, stddev=stddev)
+
+
+def _exact_laplacian(stddev):
+  return functools.partial(
+      kernelized_utils.exact_laplacian_kernel, stddev=stddev)
+
+
+class KernelizedUtilsTest(test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      ('gaussian', _exact_gaussian(stddev=10.0), [[1.0]]),
+      ('laplacian', _exact_laplacian(stddev=50.0), [[1.0]]))
+  def test_equal_vectors(self, exact_kernel_fn, expected_values):
+    """Identical vectors give exactly the identity kernel value."""
+    x = constant_op.constant([0.5, -0.5, -0.5, 0.5])
+    y = constant_op.constant([0.5, -0.5, -0.5, 0.5])
+    exact_kernel = exact_kernel_fn(x, y)
+    shape = exact_kernel.get_shape().as_list()
+    self.assertLen(shape, 2)
+    # x and y are identical and therefore K(x, y) will be precisely equal to
+    # the identity value of the kernel.
+    self.assertAllClose(expected_values, exact_kernel, atol=1e-6)
+
+  @parameterized.named_parameters(
+      ('gaussian', _exact_gaussian(stddev=10.0), [[1.0]]),
+      ('laplacian', _exact_laplacian(stddev=50.0), [[1.0]]))
+  def test_almost_identical_vectors(self, exact_kernel_fn, expected_values):
+    """Almost identical vectors give the identity kernel value."""
+    x = constant_op.constant([1.0, 0.4, -2.1, -1.1])
+    y = constant_op.constant([1.01, 0.39, -2.099, -1.101])
+    exact_kernel = exact_kernel_fn(x, y)
+    shape = exact_kernel.get_shape().as_list()
+    self.assertLen(shape, 2)
+    # x and y are almost identical and therefore K(x, y) will be almost equal to
+    # the identity value of the kernel.
+    self.assertAllClose(expected_values, exact_kernel, atol=1e-3)
+
+  @parameterized.named_parameters(
+      ('gaussian', _exact_gaussian(stddev=1.0), [[0.99], [0.977]]),
+      ('laplacian', _exact_laplacian(stddev=5.0), [[0.96], [0.94]]))
+  def test_similar_matrices(self, exact_kernel_fn, expected_values):
+    """Pairwise "close" vectors give high kernel values (similarity scores)."""
+    x = constant_op.constant([1.0, 3.4, -2.1, 0.9, 3.3, -2.0], shape=[2, 3])
+    y = constant_op.constant([1.1, 3.35, -2.05])
+    exact_kernel = exact_kernel_fn(x, y)
+    shape = exact_kernel.get_shape().as_list()
+    self.assertLen(shape, 2)
+    # The 2 rows of x are close to y. The pairwise kernel values (similarity
+    # scores) are somewhat close to the identity value of the kernel.
+    self.assertAllClose(expected_values, exact_kernel, atol=1e-2)
+
+  @parameterized.named_parameters(
+      ('gaussian', _exact_gaussian(stddev=2.0), [[.997, .279], [.251, 1.],
+                                                 [.164, 0.019]]),
+      ('laplacian', _exact_laplacian(stddev=2.0), [[.904, .128], [.116, 1.],
+                                                   [.07, 0.027]]))
+  def test_matrices_varying_similarity(self, exact_kernel_fn, expected_values):
+    """Test matrices with row vectors of varying pairwise similarity."""
+    x = constant_op.constant([1.0, 2., -2., 0.9, 3.3, -1.0], shape=[3, 2])
+    y = constant_op.constant([1.1, 2.1, -2., 0.9], shape=[2, 2])
+    exact_kernel = exact_kernel_fn(x, y)
+
+    shape = exact_kernel.get_shape().as_list()
+    self.assertLen(shape, 2)
+    self.assertAllClose(expected_values, exact_kernel, atol=1e-2)
+
+  @parameterized.named_parameters(
+      ('gaussian', _exact_gaussian(stddev=1.0), [[0.0]]),
+      ('laplacian', _exact_laplacian(stddev=1.0), [[0.0]]))
+  def test_completely_dissimilar_vectors(self, exact_kernel_fn,
+                                         expected_values):
+    """Very dissimilar vectors give very low similarity scores."""
+    x = constant_op.constant([1.0, 3.4, -2.1, -5.1])
+    y = constant_op.constant([0.5, 2.1, 1.0, 3.0])
+    exact_kernel = exact_kernel_fn(x, y)
+    shape = exact_kernel.get_shape().as_list()
+    self.assertLen(shape, 2)
+    # x and y are very "far" from each other and so the corresponding kernel
+    # value will be very low.
+    self.assertAllClose(expected_values, exact_kernel, atol=1e-2)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/utils/layer_utils.py b/tensorflow/python/keras/utils/layer_utils.py
index 60677be73512c921f9fbbc96911655f28de29638..640462d5c63f459f59bb09d24edc1f78f7016c35 100644
--- a/tensorflow/python/keras/utils/layer_utils.py
+++ b/tensorflow/python/keras/utils/layer_utils.py
@@ -23,10 +23,11 @@ import numpy as np
 
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras.utils.conv_utils import convert_kernel
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import keras_export
 
 
-@tf_export('keras.utils.get_source_inputs')
+@keras_export('keras.utils.get_source_inputs')
 def get_source_inputs(tensor, layer=None, node_index=None):
   """Returns the list of input tensors necessary to compute `tensor`.
 
@@ -53,14 +54,11 @@ def get_source_inputs(tensor, layer=None, node_index=None):
     node = layer._inbound_nodes[node_index]
     if not node.inbound_layers:
       # Reached an Input layer, stop recursion.
-      return node.input_tensors
+      return nest.flatten(node.input_tensors)
     else:
       source_tensors = []
-      for i in range(len(node.inbound_layers)):
-        x = node.input_tensors[i]
-        layer = node.inbound_layers[i]
-        node_index = node.node_indices[i]
-        previous_sources = get_source_inputs(x, layer, node_index)
+      for layer, node_index, _, tensor in node.iterate_inbound():
+        previous_sources = get_source_inputs(tensor, layer, node_index)
         # Avoid input redundancy.
         for x in previous_sources:
           if x not in source_tensors:
@@ -110,7 +108,8 @@ def print_summary(model, line_length=None, positions=None, print_fn=None):
     nodes_by_depth = model._nodes_by_depth.values()
     nodes = []
     for v in nodes_by_depth:
-      if (len(v) > 1) or (len(v) == 1 and len(v[0].inbound_layers) > 1):
+      if (len(v) > 1) or (len(v) == 1 and
+                          len(nest.flatten(v[0].inbound_layers)) > 1):
         # if the model has multiple nodes
         # or if the nodes have multiple inbound_layers
         # the model is no longer sequential
@@ -159,6 +158,7 @@ def print_summary(model, line_length=None, positions=None, print_fn=None):
       line += ' ' * (positions[i] - len(line))
     print_fn(line)
 
+  print_fn('Model: "{}"'.format(model.name))
   print_fn('_' * line_length)
   print_row(to_display, positions)
   print_fn('=' * line_length)
@@ -195,12 +195,10 @@ def print_summary(model, line_length=None, positions=None, print_fn=None):
       if relevant_nodes and node not in relevant_nodes:
         # node is not part of the current network
         continue
-      for i in range(len(node.inbound_layers)):
-        inbound_layer = node.inbound_layers[i].name
-        inbound_node_index = node.node_indices[i]
-        inbound_tensor_index = node.tensor_indices[i]
-        connections.append(inbound_layer + '[' + str(inbound_node_index) +
-                           '][' + str(inbound_tensor_index) + ']')
+
+      for inbound_layer, node_index, tensor_index, _ in node.iterate_inbound():
+        connections.append('{}[{}][{}]'.format(inbound_layer.name, node_index,
+                                               tensor_index))
 
     name = layer.name
     cls_name = layer.__class__.__name__
@@ -298,7 +296,7 @@ def gather_non_trainable_weights(trainable, sub_layers, extra_variables):
   return weights + non_trainable_extra_variables
 
 
-@tf_export('keras.utils.convert_all_kernels_in_model')
+@keras_export('keras.utils.convert_all_kernels_in_model')
 def convert_all_kernels_in_model(model):
   """Converts all convolution kernels in a model from Theano to TensorFlow.
 
diff --git a/tensorflow/python/keras/utils/losses_utils.py b/tensorflow/python/keras/utils/losses_utils.py
index fc4b4ac7dfd0966af5f4c21d4b78ba8ecd6bf46a..4b37c741d1c53febed28252c4bb12b77c8c75722 100644
--- a/tensorflow/python/keras/utils/losses_utils.py
+++ b/tensorflow/python/keras/utils/losses_utils.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.distribute import distribution_strategy_context
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.keras import backend as K
 from tensorflow.python.ops import array_ops
@@ -25,7 +27,34 @@ from tensorflow.python.ops import confusion_matrix
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import weights_broadcast_ops
-from tensorflow.python.ops.losses import losses_impl
+from tensorflow.python.util.tf_export import keras_export
+
+
+@keras_export('keras.losses.Reduction', v1=[])
+class ReductionV2(object):
+  """Types of loss reduction.
+
+  Contains the following values:
+
+  * `NONE`: Un-reduced weighted losses with the same shape as input.
+  * `SUM`: Scalar sum of weighted losses.
+  * `SUM_OVER_BATCH_SIZE`: Scalar `SUM` divided by number of elements in losses.
+     Note that when using `tf.distribute.Strategy`, this is the global batch
+     size across all the replicas that are contributing to a single step.
+  """
+
+  NONE = 'none'
+  SUM = 'sum'
+  SUM_OVER_BATCH_SIZE = 'sum_over_batch_size'
+
+  @classmethod
+  def all(cls):
+    return (cls.NONE, cls.SUM, cls.SUM_OVER_BATCH_SIZE)
+
+  @classmethod
+  def validate(cls, key):
+    if key not in cls.all():
+      raise ValueError('Invalid Reduction Key %s.' % key)
 
 
 def squeeze_or_expand_dimensions(y_pred, y_true, sample_weight):
@@ -51,10 +80,31 @@ def squeeze_or_expand_dimensions(y_pred, y_true, sample_weight):
     the last dimension squeezed,
     `sample_weight` could be extended by one dimension.
   """
+  y_pred_shape = y_pred.get_shape()
+  y_pred_rank = y_pred_shape.ndims
   if y_true is not None:
-    # squeeze last dim of `y_pred` or `y_true` if their rank differs by 1
-    y_true, y_pred = confusion_matrix.remove_squeezable_dimensions(
-        y_true, y_pred)
+
+    # If sparse matrix is provided as `y_true`, the last dimension in `y_pred`
+    # may be > 1. Eg: y_true = [0, 1, 2] (shape=(3,)),
+    # y_pred = [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]] (shape=(3, 3))
+    # In this case, we should not try to remove squeezable dimension.
+    y_true_shape = y_true.get_shape()
+    y_true_rank = y_true_shape.ndims
+    if (y_true_rank is not None) and (y_pred_rank is not None):
+      # Use static rank for `y_true` and `y_pred`.
+      if (y_pred_rank - y_true_rank != 1) or y_pred_shape[-1] == 1:
+        y_true, y_pred = confusion_matrix.remove_squeezable_dimensions(
+            y_true, y_pred)
+    else:
+      # Use dynamic rank.
+      rank_diff = array_ops.rank(y_pred) - array_ops.rank(y_true)
+      squeeze_dims = lambda: confusion_matrix.remove_squeezable_dimensions(  # pylint: disable=g-long-lambda
+          y_true, y_pred)
+      is_last_dim_1 = math_ops.equal(1, array_ops.shape(y_pred)[-1])
+      maybe_squeeze_dims = lambda: control_flow_ops.cond(  # pylint: disable=g-long-lambda
+          is_last_dim_1, squeeze_dims, lambda: (y_true, y_pred))
+      y_true, y_pred = control_flow_ops.cond(
+          math_ops.equal(1, rank_diff), maybe_squeeze_dims, squeeze_dims)
 
   if sample_weight is None:
     return y_pred, y_true, None
@@ -65,8 +115,6 @@ def squeeze_or_expand_dimensions(y_pred, y_true, sample_weight):
   if weights_rank == 0:  # If weights is scalar, do nothing.
     return y_pred, y_true, sample_weight
 
-  y_pred_shape = y_pred.get_shape()
-  y_pred_rank = y_pred_shape.ndims
   if (y_pred_rank is not None) and (weights_rank is not None):
     # Use static rank.
     if weights_rank - y_pred_rank == 1:
@@ -120,21 +168,23 @@ def _num_elements(losses):
     return math_ops.cast(array_ops.size(losses, name=scope), dtype=losses.dtype)
 
 
-def _reduce_weighted_loss(
-    weighted_losses, reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE):
+def reduce_weighted_loss(weighted_losses,
+                         reduction=ReductionV2.SUM_OVER_BATCH_SIZE):
   """Reduces the individual weighted loss measurements."""
-  if reduction == losses_impl.ReductionV2.NONE:
+  if reduction == ReductionV2.NONE:
     loss = weighted_losses
   else:
     loss = math_ops.reduce_sum(weighted_losses)
-    if reduction == losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE:
-      loss = _safe_mean(loss, _num_elements(weighted_losses))
+    if reduction == ReductionV2.SUM_OVER_BATCH_SIZE:
+      num_replicas = (  # Used to convert from local to global batch size.
+          distribution_strategy_context.get_strategy().num_replicas_in_sync)
+      loss = _safe_mean(loss, num_replicas * _num_elements(weighted_losses))
   return loss
 
 
 def compute_weighted_loss(losses,
                           sample_weight=None,
-                          reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
+                          reduction=ReductionV2.SUM_OVER_BATCH_SIZE,
                           name=None):
   """Computes the weighted loss.
 
@@ -142,8 +192,8 @@ def compute_weighted_loss(losses,
     losses: `Tensor` of shape `[batch_size, d1, ... dN]`.
     sample_weight: Optional `Tensor` whose rank is either 0, or the same rank as
       `losses`, or be broadcastable to `losses`.
-    reduction: Type of `tf.losses.Reduction` to apply to loss. Default value is
-      `SUM_OVER_BATCH_SIZE`.
+    reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to loss.
+      Default value is `SUM_OVER_BATCH_SIZE`.
     name: Optional name for the op.
 
   Raises:
@@ -153,22 +203,17 @@ def compute_weighted_loss(losses,
     Weighted loss `Tensor` of the same type as `losses`. If `reduction` is
     `NONE`, this has the same shape as `losses`; otherwise, it is scalar.
   """
-  losses_impl.ReductionV2.validate(reduction)
+  ReductionV2.validate(reduction)
   if sample_weight is None:
     sample_weight = 1.0
   with ops.name_scope(name, 'weighted_loss', (losses, sample_weight)):
-    # Save the `reduction` argument for loss normalization when distributing
-    # to multiple replicas.
-    # TODO(josh11b): Associate it with the returned op for more precision.
-    ops.get_default_graph()._last_loss_reduction = reduction  # pylint: disable=protected-access
-
     # Update dimensions of `sample_weight` to match with `losses` if possible.
     losses, _, sample_weight = squeeze_or_expand_dimensions(
         losses, None, sample_weight)
     losses = ops.convert_to_tensor(losses)
     input_dtype = losses.dtype
-    losses = math_ops.to_float(losses)
-    sample_weight = math_ops.to_float(sample_weight)
+    losses = math_ops.cast(losses, dtypes.float32)
+    sample_weight = math_ops.cast(sample_weight, dtypes.float32)
 
     try:
       # Broadcast weights if possible.
@@ -183,7 +228,16 @@ def compute_weighted_loss(losses,
     sample_weight.get_shape().assert_is_compatible_with(losses.get_shape())
     weighted_losses = math_ops.multiply(losses, sample_weight)
     # Apply reduction function to the individual weighted losses.
-    loss = _reduce_weighted_loss(weighted_losses, reduction)
+    loss = reduce_weighted_loss(weighted_losses, reduction)
     # Convert the result back to the input type.
     loss = math_ops.cast(loss, input_dtype)
     return loss
+
+
+def scale_loss_for_distribution(loss_value):
+  """Scales and returns the given loss value by the number of replicas."""
+  num_replicas = (
+      distribution_strategy_context.get_strategy().num_replicas_in_sync)
+  if num_replicas > 1:
+    loss_value *= (1. / num_replicas)
+  return loss_value
diff --git a/tensorflow/python/keras/utils/metrics_utils.py b/tensorflow/python/keras/utils/metrics_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..329cb0b278076c6ea2d25d9f78607ef20d1cf3b2
--- /dev/null
+++ b/tensorflow/python/keras/utils/metrics_utils.py
@@ -0,0 +1,380 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=protected-access
+"""Utils related to keras metrics.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import weakref
+
+from enum import Enum
+
+from tensorflow.python.distribute import distribution_strategy_context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.keras.utils.generic_utils import to_list
+from tensorflow.python.keras.utils.losses_utils import squeeze_or_expand_dimensions
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import weights_broadcast_ops
+from tensorflow.python.util import tf_decorator
+
+NEG_INF = -1e10
+
+
+class Reduction(Enum):
+  """Types of metrics reduction.
+
+  Contains the following values:
+
+  * `SUM`: Scalar sum of weighted values.
+  * `SUM_OVER_BATCH_SIZE`: Scalar sum of weighted values divided by
+        number of elements.
+  * `WEIGHTED_MEAN`: Scalar sum of weighted values divided by sum of weights.
+  """
+  SUM = 'sum'
+  SUM_OVER_BATCH_SIZE = 'sum_over_batch_size'
+  WEIGHTED_MEAN = 'weighted_mean'
+
+
+def update_state_wrapper(update_state_fn):
+  """Decorator to wrap metric `update_state()` with `add_update()`.
+
+  Args:
+    update_state_fn: function that accumulates metric statistics.
+
+  Returns:
+    Decorated function that wraps `update_state_fn()` with `add_update()`.
+  """
+
+  def decorated(metric_obj, *args, **kwargs):
+    """Decorated function with `add_update()`."""
+
+    update_op = update_state_fn(*args, **kwargs)
+    if update_op is not None:  # update_op will be None in eager execution.
+      metric_obj.add_update(update_op, inputs=True)
+    return update_op
+
+  return tf_decorator.make_decorator(update_state_fn, decorated)
+
+
+def result_wrapper(result_fn):
+  """Decorator to wrap metric `result()` function in `merge_call()`.
+
+  Result computation is an idempotent operation that simply calculates the
+  metric value using the state variables.
+
+  If metric state variables are distributed across replicas/devices and
+  `result()` is requested from the context of one device - This function wraps
+  `result()` in a distribution strategy `merge_call()`. With this,
+  the metric state variables will be aggregated across devices.
+
+  Args:
+    result_fn: function that computes the metric result.
+
+  Returns:
+    Decorated function that wraps `result_fn()` in distribution strategy
+    `merge_call()`.
+  """
+
+  def decorated(_, *args):
+    """Decorated function with merge_call."""
+    replica_context = distribution_strategy_context.get_replica_context()
+    if replica_context is None:  # if in cross replica context already
+      result_t = result_fn(*args)
+    else:
+      # TODO(psv): Test distribution of metrics using different distribution
+      # strategies.
+
+      # Creating a wrapper for merge_fn. merge_call invokes the given merge_fn
+      # with distribution object as the first parameter. We create a wrapper
+      # here so that the result function need not have that parameter.
+      def merge_fn_wrapper(distribution, merge_fn, *args):
+        # We will get `PerDevice` merge function. Taking the first one as all
+        # are identical copies of the function that we had passed below.
+        return distribution.unwrap(merge_fn)[0](*args)
+
+      # Wrapping result in merge_call. merge_call is used when we want to leave
+      # replica mode and compute a value in cross replica mode.
+      result_t = replica_context.merge_call(
+          merge_fn_wrapper, args=(result_fn,) + args)
+    return result_t
+
+  return tf_decorator.make_decorator(result_fn, decorated)
+
+
+def weakmethod(method):
+  """Creates a weak reference to the bound method."""
+
+  cls = method.im_class
+  func = method.im_func
+  instance_ref = weakref.ref(method.im_self)
+
+  @functools.wraps(method)
+  def inner(*args, **kwargs):
+    return func.__get__(instance_ref(), cls)(*args, **kwargs)
+
+  del method
+  return inner
+
+
+def assert_thresholds_range(thresholds):
+  if thresholds is not None:
+    invalid_thresholds = [t for t in thresholds if t is None or t < 0 or t > 1]
+    if invalid_thresholds:
+      raise ValueError(
+          'Threshold values must be in [0, 1]. Invalid values: {}'.format(
+              invalid_thresholds))
+
+
+def parse_init_thresholds(thresholds, default_threshold=0.5):
+  if thresholds is not None:
+    assert_thresholds_range(to_list(thresholds))
+  thresholds = to_list(default_threshold if thresholds is None else thresholds)
+  return thresholds
+
+
+class ConfusionMatrix(Enum):
+  TRUE_POSITIVES = 'tp'
+  FALSE_POSITIVES = 'fp'
+  TRUE_NEGATIVES = 'tn'
+  FALSE_NEGATIVES = 'fn'
+
+
+class AUCCurve(Enum):
+  """Type of AUC Curve (ROC or PR)."""
+  ROC = 'ROC'
+  PR = 'PR'
+
+  @staticmethod
+  def from_str(key):
+    if key in ('pr', 'PR'):
+      return AUCCurve.PR
+    elif key in ('roc', 'ROC'):
+      return AUCCurve.ROC
+    else:
+      raise ValueError('Invalid AUC curve value "%s".' % key)
+
+
+class AUCSummationMethod(Enum):
+  """Type of AUC summation method.
+
+  https://en.wikipedia.org/wiki/Riemann_sum)
+
+  Contains the following values:
+  * 'interpolation': Applies mid-point summation scheme for `ROC` curve. For
+    `PR` curve, interpolates (true/false) positives but not the ratio that is
+    precision (see Davis & Goadrich 2006 for details).
+  * 'minoring': Applies left summation for increasing intervals and right
+    summation for decreasing intervals.
+  * 'majoring': Applies right summation for increasing intervals and left
+    summation for decreasing intervals.
+  """
+  INTERPOLATION = 'interpolation'
+  MAJORING = 'majoring'
+  MINORING = 'minoring'
+
+  @staticmethod
+  def from_str(key):
+    if key in ('interpolation', 'Interpolation'):
+      return AUCSummationMethod.INTERPOLATION
+    elif key in ('majoring', 'Majoring'):
+      return AUCSummationMethod.MAJORING
+    elif key in ('minoring', 'Minoring'):
+      return AUCSummationMethod.MINORING
+    else:
+      raise ValueError('Invalid AUC summation method value "%s".' % key)
+
+
+def update_confusion_matrix_variables(variables_to_update,
+                                      y_true,
+                                      y_pred,
+                                      thresholds,
+                                      top_k=None,
+                                      class_id=None,
+                                      sample_weight=None):
+  """Returns op to update the given confusion matrix variables.
+
+  For every pair of values in y_true and y_pred:
+
+  true_positive: y_true == True and y_pred > thresholds
+  false_negatives: y_true == True and y_pred <= thresholds
+  true_negatives: y_true == False and y_pred <= thresholds
+  false_positive: y_true == False and y_pred > thresholds
+
+  The results will be weighted and added together. When multiple thresholds are
+  provided, we will repeat the same for every threshold.
+
+  For estimation of these metrics over a stream of data, the function creates an
+  `update_op` operation that updates the given variables.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use weights of 0 to mask values.
+
+  Args:
+    variables_to_update: Dictionary with 'tp', 'fn', 'tn', 'fp' as valid keys
+      and corresponding variables to update as values.
+    y_true: A `Tensor` whose shape matches `y_pred`. Will be cast to `bool`.
+    y_pred: A floating point `Tensor` of arbitrary shape and whose values are in
+      the range `[0, 1]`.
+    thresholds: A float value or a python list or tuple of float thresholds in
+      `[0, 1]`, or NEG_INF (used when top_k is set).
+    top_k: Optional int, indicates that the positive labels should be limited to
+      the top k predictions.
+    class_id: Optional int, limits the prediction and labels to the class
+      specified by this argument.
+    sample_weight: Optional `Tensor` whose rank is either 0, or the same rank as
+      `y_true`, and must be broadcastable to `y_true` (i.e., all dimensions must
+      be either `1`, or the same as the corresponding `y_true` dimension).
+
+  Returns:
+    Update op.
+
+  Raises:
+    ValueError: If `y_pred` and `y_true` have mismatched shapes, or if
+      `sample_weight` is not `None` and its shape doesn't match `y_pred`, or if
+      `variables_to_update` contains invalid keys.
+  """
+  if variables_to_update is None:
+    return
+  y_true = ops.convert_to_tensor(y_true)
+  y_pred = ops.convert_to_tensor(y_pred)
+  y_pred.shape.assert_is_compatible_with(y_true.shape)
+
+  if not any(
+      key for key in variables_to_update if key in list(ConfusionMatrix)):
+    raise ValueError(
+        'Please provide at least one valid confusion matrix '
+        'variable to update. Valid variable key options are: "{}". '
+        'Received: "{}"'.format(
+            list(ConfusionMatrix), variables_to_update.keys()))
+
+  invalid_keys = [
+      key for key in variables_to_update if key not in list(ConfusionMatrix)
+  ]
+  if invalid_keys:
+    raise ValueError(
+        'Invalid keys: {}. Valid variable key options are: "{}"'.format(
+            invalid_keys, list(ConfusionMatrix)))
+
+  with ops.control_dependencies([
+      check_ops.assert_greater_equal(
+          y_pred,
+          math_ops.cast(0.0, dtype=y_pred.dtype),
+          message='predictions must be >= 0'),
+      check_ops.assert_less_equal(
+          y_pred,
+          math_ops.cast(1.0, dtype=y_pred.dtype),
+          message='predictions must be <= 1')
+  ]):
+    y_pred, y_true, sample_weight = squeeze_or_expand_dimensions(
+        math_ops.cast(y_pred, dtype=dtypes.float32),
+        math_ops.cast(y_true, dtype=dtypes.bool), sample_weight)
+
+  if top_k is not None:
+    y_pred = _filter_top_k(y_pred, top_k)
+  if class_id is not None:
+    y_true = y_true[..., class_id]
+    y_pred = y_pred[..., class_id]
+
+  thresholds = to_list(thresholds)
+  num_thresholds = len(thresholds)
+  num_predictions = array_ops.size(y_pred)
+
+  # Reshape predictions and labels.
+  predictions_2d = array_ops.reshape(y_pred, [1, -1])
+  labels_2d = array_ops.reshape(
+      math_ops.cast(y_true, dtype=dtypes.bool), [1, -1])
+
+  # Tile the thresholds for every prediction.
+  thresh_tiled = array_ops.tile(
+      array_ops.expand_dims(array_ops.constant(thresholds), 1),
+      array_ops.stack([1, num_predictions]))
+
+  # Tile the predictions for every threshold.
+  preds_tiled = array_ops.tile(predictions_2d, [num_thresholds, 1])
+
+  # Compare predictions and threshold.
+  pred_is_pos = math_ops.greater(preds_tiled, thresh_tiled)
+
+  # Tile labels by number of thresholds
+  label_is_pos = array_ops.tile(labels_2d, [num_thresholds, 1])
+
+  if sample_weight is not None:
+    weights = weights_broadcast_ops.broadcast_weights(
+        math_ops.cast(sample_weight, dtype=dtypes.float32), y_pred)
+    weights_tiled = array_ops.tile(
+        array_ops.reshape(weights, [1, -1]), [num_thresholds, 1])
+  else:
+    weights_tiled = None
+
+  update_ops = []
+
+  def weighted_assign_add(label, pred, weights, var):
+    label_and_pred = math_ops.cast(
+        math_ops.logical_and(label, pred), dtype=dtypes.float32)
+    if weights is not None:
+      label_and_pred *= weights
+    return state_ops.assign_add(var, math_ops.reduce_sum(label_and_pred, 1))
+
+  loop_vars = {
+      ConfusionMatrix.TRUE_POSITIVES: (label_is_pos, pred_is_pos),
+  }
+  update_tn = ConfusionMatrix.TRUE_NEGATIVES in variables_to_update
+  update_fp = ConfusionMatrix.FALSE_POSITIVES in variables_to_update
+  update_fn = ConfusionMatrix.FALSE_NEGATIVES in variables_to_update
+
+  if update_fn or update_tn:
+    pred_is_neg = math_ops.logical_not(pred_is_pos)
+    loop_vars[ConfusionMatrix.FALSE_NEGATIVES] = (label_is_pos, pred_is_neg)
+
+  if update_fp or update_tn:
+    label_is_neg = math_ops.logical_not(label_is_pos)
+    loop_vars[ConfusionMatrix.FALSE_POSITIVES] = (label_is_neg, pred_is_pos)
+    if update_tn:
+      loop_vars[ConfusionMatrix.TRUE_NEGATIVES] = (label_is_neg, pred_is_neg)
+
+  for matrix_cond, (label, pred) in loop_vars.items():
+    if matrix_cond in variables_to_update:
+      update_ops.append(
+          weighted_assign_add(label, pred, weights_tiled,
+                              variables_to_update[matrix_cond]))
+  return control_flow_ops.group(update_ops)
+
+
+def _filter_top_k(x, k):
+  """Filters top-k values in the last dim of x and set the rest to NEG_INF.
+
+  Used for computing top-k prediction values in dense labels (which has the same
+  shape as predictions) for recall and precision top-k metrics.
+
+  Args:
+    x: tensor with any dimensions.
+    k: the number of values to keep.
+
+  Returns:
+    tensor with same shape and dtype as x.
+  """
+  _, top_k_idx = nn_ops.top_k(x, k, sorted=False)
+  top_k_mask = math_ops.reduce_sum(
+      array_ops.one_hot(top_k_idx, x.shape[-1], axis=-1), axis=-2)
+  return x * top_k_mask + NEG_INF * (1 - top_k_mask)
diff --git a/tensorflow/python/keras/utils/mode_keys.py b/tensorflow/python/keras/utils/mode_keys.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb6fc3eef7e8967b8a87707569770f7ec1495022
--- /dev/null
+++ b/tensorflow/python/keras/utils/mode_keys.py
@@ -0,0 +1,22 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras model mode constants."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=unused-import
+from tensorflow.python.saved_model.model_utils.mode_keys import KerasModeKeys as ModeKeys
+# pylint: enable=unused-import
diff --git a/tensorflow/python/keras/utils/multi_gpu_utils.py b/tensorflow/python/keras/utils/multi_gpu_utils.py
index 04b2ea8fe314afaf935bc81bfa62e0c0f1424aa7..9c97e554b0755eed6442e1a11da218655896d7db 100644
--- a/tensorflow/python/keras/utils/multi_gpu_utils.py
+++ b/tensorflow/python/keras/utils/multi_gpu_utils.py
@@ -21,7 +21,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras.engine.training import Model
 from tensorflow.python.ops import array_ops
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 
 def _get_available_devices():
@@ -33,7 +33,7 @@ def _normalize_device_name(name):
   return name
 
 
-@tf_export('keras.utils.multi_gpu_model')
+@keras_export('keras.utils.multi_gpu_model')
 def multi_gpu_model(model, gpus, cpu_merge=True, cpu_relocation=False):
   """Replicates a model on different GPUs.
 
diff --git a/tensorflow/python/keras/utils/multi_gpu_utils_test.py b/tensorflow/python/keras/utils/multi_gpu_utils_test.py
index 8c1abd632484273a01fd99cbd72ee73b66e46f27..9c711bd2a28395279c1e8cd726084d6b9ab4e188 100644
--- a/tensorflow/python/keras/utils/multi_gpu_utils_test.py
+++ b/tensorflow/python/keras/utils/multi_gpu_utils_test.py
@@ -148,7 +148,6 @@ class TestMultiGPUModel(test.TestCase):
       input_shape = (num_samples,) + shape
       x_train = np.random.randint(0, 255, input_shape)
       y_train = np.random.randint(0, num_classes, (input_shape[0],))
-      keras.backend.set_learning_phase(True)
 
       y_train = keras.utils.to_categorical(y_train, num_classes)
 
diff --git a/tensorflow/python/keras/utils/np_utils.py b/tensorflow/python/keras/utils/np_utils.py
index 3763999bff4f6c920e1fadeb98e964fe62f8412c..5227a472a395509162fdeea2ad5961a11775f4c2 100644
--- a/tensorflow/python/keras/utils/np_utils.py
+++ b/tensorflow/python/keras/utils/np_utils.py
@@ -18,10 +18,10 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 
-@tf_export('keras.utils.to_categorical')
+@keras_export('keras.utils.to_categorical')
 def to_categorical(y, num_classes=None, dtype='float32'):
   """Converts a class vector (integers) to binary class matrix.
 
@@ -52,7 +52,7 @@ def to_categorical(y, num_classes=None, dtype='float32'):
   return categorical
 
 
-@tf_export('keras.utils.normalize')
+@keras_export('keras.utils.normalize')
 def normalize(x, axis=-1, order=2):
   """Normalizes a Numpy array.
 
diff --git a/tensorflow/python/keras/utils/tf_utils.py b/tensorflow/python/keras/utils/tf_utils.py
index 7b4c9e7239e2f097e0351b160bd7520ee587a8b3..1c1d30ba4b5c12b36d4e439f3f85037c3f954f16 100644
--- a/tensorflow/python/keras/utils/tf_utils.py
+++ b/tensorflow/python/keras/utils/tf_utils.py
@@ -17,10 +17,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import six
+
 from tensorflow.python.eager import context
+from tensorflow.python.framework import composite_tensor
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import smart_cond as smart_module
-from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import control_flow_ops
@@ -102,6 +104,7 @@ def get_reachable_from_inputs(inputs, targets=None):
   Returns:
     A set of tensors reachable from the inputs (includes the inputs themselves).
   """
+  inputs = nest.flatten(inputs)
   reachable = set(inputs)
   if targets:
     targets = set(targets)
@@ -129,6 +132,140 @@ def get_reachable_from_inputs(inputs, targets=None):
   return reachable
 
 
+# This function needs access to private functions of `nest`.
+#  pylint: disable=protected-access
+def map_structure_with_atomic(is_atomic_fn, map_fn, nested):
+  """Maps the atomic elements of a nested structure.
+
+  Arguments:
+    is_atomic_fn: A function that determines if an element of `nested` is
+      atomic.
+    map_fn: The function to apply to atomic elements of `nested`.
+    nested: A nested structure.
+
+  Returns:
+    The nested structure, with atomic elements mapped according to `map_fn`.
+
+  Raises:
+    ValueError: If an element that is neither atomic nor a sequence is
+      encountered.
+  """
+  if is_atomic_fn(nested):
+    return map_fn(nested)
+
+  # Recursively convert.
+  if not nest.is_sequence(nested):
+    raise ValueError(
+        'Received non-atomic and non-sequence element: {}'.format(nested))
+  if nest._is_mapping(nested):
+    values = [nested[k] for k in nest._sorted(nested)]
+  else:
+    values = nested
+  mapped_values = [
+      map_structure_with_atomic(is_atomic_fn, map_fn, ele) for ele in values
+  ]
+  return nest._sequence_like(nested, mapped_values)
+
+
+#  pylint: enable=protected-access
+
+
+def convert_shapes(input_shape, to_tuples=True):
+  """Converts nested shape representations  to desired format.
+
+  Performs:
+
+  TensorShapes -> tuples if `to_tuples=True`.
+  tuples of int or None -> TensorShapes if `to_tuples=False`.
+
+  Valid objects to be converted are:
+  - TensorShapes
+  - tuples with elements of type int or None.
+  - ints
+  - None
+
+  Arguments:
+    input_shape: A nested structure of objects to be converted to TensorShapes.
+    to_tuples: If `True`, converts all TensorShape to tuples. Otherwise converts
+      all tuples representing shapes to TensorShapes.
+
+  Returns:
+    Nested structure of shapes in desired format.
+  """
+
+  def _is_shape_component(element):
+    value = tensor_shape.as_dimension(element).value
+    return value is None or isinstance(value, int)
+
+  def _is_atomic_shape(input_shape):
+    # Ex: TensorShape or (None, 10, 32) or 5 or `None`
+    if input_shape is None or isinstance(input_shape, int):
+      return True
+    if isinstance(input_shape, tensor_shape.TensorShape):
+      return True
+    if (isinstance(input_shape, tuple) and
+        all(_is_shape_component(ele) for ele in input_shape)):
+      return True
+    return False
+
+  def _convert_shape(input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    if to_tuples:
+      input_shape = tuple(input_shape.as_list())
+    return input_shape
+
+  return map_structure_with_atomic(_is_atomic_shape, _convert_shape,
+                                   input_shape)
+
+
+class ListWrapper(object):
+  """A wrapper for lists to be treated as elements for `nest`."""
+
+  def __init__(self, list_to_wrap):
+    self._list = list_to_wrap
+
+  def as_list(self):
+    return self._list
+
+
+def convert_inner_node_data(nested, wrap=False):
+  """Either wraps or unwraps innermost node data lists in `ListWrapper` objects.
+
+  Arguments:
+    nested: A nested data structure.
+    wrap: If `True`, wrap innermost lists in `ListWrapper` objects. If `False`,
+      unwraps `ListWrapper` objects into lists.
+
+  Returns:
+    Strucutre of same type as nested, with lists wrapped/unwrapped.
+  """
+
+  def _is_atomic_nested(nested):
+    """Returns `True` if `nested` is a list representing node data."""
+    if isinstance(nested, ListWrapper):
+      return True
+    # Node data can be of form `[layer_name, node_id, tensor_id]` or
+    # `[layer_name, node_id, tensor_id, kwargs]`.
+    if (isinstance(nested, list) and (len(nested) in [3, 4]) and
+        isinstance(nested[0], six.string_types)):
+      return True
+    return False
+
+  def _convert_object_or_list(nested):
+    """Convert b/t `ListWrapper` object and list representations."""
+    if wrap:
+      if isinstance(nested, ListWrapper):
+        return nested
+      return ListWrapper(nested)
+    else:
+      if isinstance(nested, ListWrapper):
+        return nested.as_list()
+      return nested
+
+  return map_structure_with_atomic(_is_atomic_nested, _convert_object_or_list,
+                                   nested)
+
+
 def shape_type_conversion(fn):
   """Decorator that handles tuple/TensorShape conversion.
 
@@ -142,17 +279,15 @@ def shape_type_conversion(fn):
   """
 
   def wrapper(instance, input_shape):
+    # Pass shapes as tuples to `fn`
+    # This preserves compatibility with external Keras.
     if input_shape is not None:
-      if isinstance(input_shape, list):
-        input_shape = [
-            tuple(tensor_shape.TensorShape(x).as_list()) for x in input_shape]
-      else:
-        input_shape = tuple(tensor_shape.TensorShape(input_shape).as_list())
+      input_shape = convert_shapes(input_shape, to_tuples=True)
     output_shape = fn(instance, input_shape)
+    # Return shapes from `fn` as TensorShapes.
     if output_shape is not None:
-      if isinstance(output_shape, list):
-        return [tensor_shape.TensorShape(x) for x in output_shape]
-      return tensor_shape.TensorShape(output_shape)
+      output_shape = convert_shapes(output_shape, to_tuples=False)
+    return output_shape
 
   return wrapper
 
@@ -178,7 +313,9 @@ def is_symbolic_tensor(tensor):
   """
   if isinstance(tensor, variables.Variable):
     return not context.executing_eagerly()
-  if isinstance(tensor, (ops.Tensor, sparse_tensor.SparseTensor)):
+  if isinstance(tensor, composite_tensor.CompositeTensor):
+    return tensor._is_graph_tensor  # pylint: disable=protected-access
+  if isinstance(tensor, ops.Tensor):
     return hasattr(tensor, 'graph')
   if isinstance(tensor, tuple(_user_convertible_tensor_types)):
     return hasattr(ops.convert_to_tensor(tensor), 'graph')
@@ -216,3 +353,7 @@ def register_symbolic_tensor_type(cls):
   """
   global _user_convertible_tensor_types
   _user_convertible_tensor_types.add(cls)
+
+
+def is_tensor_or_variable(x):
+  return tensor_util.is_tensor(x) or isinstance(x, variables.Variable)
diff --git a/tensorflow/python/keras/utils/tf_utils_test.py b/tensorflow/python/keras/utils/tf_utils_test.py
index 9833a492993feb3a989d09160919fbf85c3a21e7..902ecf91670d52ff6839f42d345944b9be009f85 100644
--- a/tensorflow/python/keras/utils/tf_utils_test.py
+++ b/tensorflow/python/keras/utils/tf_utils_test.py
@@ -89,6 +89,10 @@ class TestIsSymbolicTensor(test.TestCase):
         self._input = input_
         self.value = ops.convert_to_tensor(42.)
 
+      @property
+      def dtype(self):
+        return self.value.dtype
+
     ops.register_tensor_conversion_function(
         Foo, lambda x, *args, **kwargs: x.value)
     tf_utils.register_symbolic_tensor_type(Foo)
@@ -128,6 +132,28 @@ class TestIsSymbolicTensor(test.TestCase):
     # `Tensor`.
     y = model(ops.convert_to_tensor(7.))
     self.assertIsInstance(y, Foo)
+    # Confirm that (custom) loss sees `Foo` instance, not Tensor.
+    obtained_prediction_box = [None]
+    def custom_loss(y_obs, y_pred):
+      del y_obs
+      obtained_prediction_box[0] = y_pred
+      return y_pred
+    # Apparently `compile` calls the loss function enough to trigger the
+    # side-effect.
+    model.compile('SGD', loss=custom_loss)
+    self.assertIsInstance(obtained_prediction_box[0], Foo)
+
+
+class ConvertInnerNodeDataTest(test.TestCase):
+
+  def test_convert_inner_node_data(self):
+    data = tf_utils.convert_inner_node_data((tf_utils.ListWrapper(['l', 2, 3]),
+                                             tf_utils.ListWrapper(['l', 5, 6])))
+    self.assertEqual(data, (['l', 2, 3], ['l', 5, 6]))
+
+    data = tf_utils.convert_inner_node_data(((['l', 2, 3], ['l', 5, 6])),
+                                            wrap=True)
+    self.assertTrue(all(isinstance(ele, tf_utils.ListWrapper) for ele in data))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/utils/vis_utils.py b/tensorflow/python/keras/utils/vis_utils.py
index 7a454ac8314acdfa3c3e61c080acdd9efdf3acdc..d396851a629b80114496b4a978768d73730aecd2 100644
--- a/tensorflow/python/keras/utils/vis_utils.py
+++ b/tensorflow/python/keras/utils/vis_utils.py
@@ -20,7 +20,7 @@ from __future__ import division
 from __future__ import print_function
 
 import os
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 
 try:
@@ -67,6 +67,7 @@ def model_to_dot(model, show_shapes=False, show_layer_names=True, rankdir='TB'):
   """
   from tensorflow.python.keras.layers.wrappers import Wrapper
   from tensorflow.python.keras.models import Sequential
+  from tensorflow.python.util import nest
 
   _check_pydot()
   dot = pydot.Dot()
@@ -77,7 +78,7 @@ def model_to_dot(model, show_shapes=False, show_layer_names=True, rankdir='TB'):
   if isinstance(model, Sequential):
     if not model.built:
       model.build()
-  layers = model.layers
+  layers = model._layers
 
   # Create graph nodes.
   for layer in layers:
@@ -120,14 +121,14 @@ def model_to_dot(model, show_shapes=False, show_layer_names=True, rankdir='TB'):
     for i, node in enumerate(layer._inbound_nodes):
       node_key = layer.name + '_ib-' + str(i)
       if node_key in model._network_nodes:  # pylint: disable=protected-access
-        for inbound_layer in node.inbound_layers:
+        for inbound_layer in nest.flatten(node.inbound_layers):
           inbound_layer_id = str(id(inbound_layer))
           layer_id = str(id(layer))
           dot.add_edge(pydot.Edge(inbound_layer_id, layer_id))
   return dot
 
 
-@tf_export('keras.utils.plot_model')
+@keras_export('keras.utils.plot_model')
 def plot_model(model,
                to_file='model.png',
                show_shapes=False,
@@ -144,6 +145,10 @@ def plot_model(model,
           a string specifying the format of the plot:
           'TB' creates a vertical plot;
           'LR' creates a horizontal plot.
+
+  Returns:
+      A Jupyter notebook Image object if Jupyter is installed.
+      This enables in-line display of the model plots in notebooks.
   """
   dot = model_to_dot(model, show_shapes, show_layer_names, rankdir)
   _, extension = os.path.splitext(to_file)
@@ -151,4 +156,13 @@ def plot_model(model,
     extension = 'png'
   else:
     extension = extension[1:]
+  # Save image to disk.
   dot.write(to_file, format=extension)
+  # Return the image as a Jupyter Image object, to be displayed in-line.
+  # Note that we cannot easily detect whether the code is running in a
+  # notebook, and thus we always return the Image if Jupyter is available.
+  try:
+    from IPython import display
+    return display.Image(filename=to_file)
+  except ImportError:
+    pass
diff --git a/tensorflow/python/keras/wrappers/scikit_learn.py b/tensorflow/python/keras/wrappers/scikit_learn.py
index 4462d94ecdb10c6f7306de1f552151e209394bac..149ad06f57c23990777a854836c7c8beb352799f 100644
--- a/tensorflow/python/keras/wrappers/scikit_learn.py
+++ b/tensorflow/python/keras/wrappers/scikit_learn.py
@@ -23,10 +23,11 @@ import types
 
 import numpy as np
 
+from tensorflow.python.keras import losses
 from tensorflow.python.keras.models import Sequential
 from tensorflow.python.keras.utils.generic_utils import has_arg
 from tensorflow.python.keras.utils.np_utils import to_categorical
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import keras_export
 
 
 class BaseWrapper(object):
@@ -155,10 +156,8 @@ class BaseWrapper(object):
     else:
       self.model = self.build_fn(**self.filter_sk_params(self.build_fn))
 
-    loss_name = self.model.loss
-    if hasattr(loss_name, '__name__'):
-      loss_name = loss_name.__name__
-    if loss_name == 'categorical_crossentropy' and len(y.shape) != 2:
+    if (losses.is_categorical_crossentropy(self.model.loss) and
+        len(y.shape) != 2):
       y = to_categorical(y)
 
     fit_args = copy.deepcopy(self.filter_sk_params(Sequential.fit))
@@ -188,7 +187,7 @@ class BaseWrapper(object):
     return res
 
 
-@tf_export('keras.wrappers.scikit_learn.KerasClassifier')
+@keras_export('keras.wrappers.scikit_learn.KerasClassifier')
 class KerasClassifier(BaseWrapper):
   """Implementation of the scikit-learn classifier API for Keras.
   """
@@ -304,14 +303,14 @@ class KerasClassifier(BaseWrapper):
     if not isinstance(outputs, list):
       outputs = [outputs]
     for name, output in zip(self.model.metrics_names, outputs):
-      if name == 'acc':
+      if name in ['accuracy', 'acc']:
         return output
     raise ValueError('The model is not configured to compute accuracy. '
                      'You should pass `metrics=["accuracy"]` to '
                      'the `model.compile()` method.')
 
 
-@tf_export('keras.wrappers.scikit_learn.KerasRegressor')
+@keras_export('keras.wrappers.scikit_learn.KerasRegressor')
 class KerasRegressor(BaseWrapper):
   """Implementation of the scikit-learn regressor API for Keras.
   """
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index df8c14970a0af7e2b1bd19162b344ff4329d385f..6ed8f69afad20941349d0093d60730aa8ba07c46 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -131,6 +131,8 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
     ],
     grpc_enabled = True,
+    tags = ["no_rocm"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -145,6 +147,7 @@ cuda_py_test(
         "//tensorflow/python:platform_benchmark",
     ],
     tags = ["no_windows"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -161,6 +164,7 @@ cuda_py_test(
         "//tensorflow/python:platform",
         "//tensorflow/python:platform_benchmark",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 tf_py_test(
@@ -226,6 +230,7 @@ cuda_py_test(
     ],
     shard_count = 5,
     tags = ["no_windows_gpu"],
+    xla_enable_strict_auto_jit = True,
 )
 
 tf_py_test(
@@ -282,6 +287,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:gradients",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 tf_py_test(
@@ -387,6 +393,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:linalg_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 tf_py_test(
@@ -515,6 +522,7 @@ tf_py_test(
         "//tensorflow/python:io_ops",
         "//tensorflow/python:util",
     ],
+    tags = ["notap"],  #TODO(b/123583863): Re-enable this test.
 )
 
 tf_py_test(
@@ -645,6 +653,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
     ],
     tags = ["optonly"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -659,6 +668,7 @@ cuda_py_test(
         "//tensorflow/python:linalg_ops",
         "//tensorflow/python:math_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -671,6 +681,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:linalg_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -684,6 +695,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:linalg_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -695,6 +707,7 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:linalg_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -713,6 +726,7 @@ cuda_py_test(
         "//tensorflow/python:platform",
         "//tensorflow/python:random_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 tf_py_test(
@@ -795,6 +809,7 @@ cuda_py_test(
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:variables",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 tf_py_test(
@@ -853,6 +868,7 @@ cuda_py_test(
         "//tensorflow/python:resource_variable_ops",
     ],
     tags = ["noasan"],  # http://b/32635055
+    xla_enable_strict_auto_jit = True,
 )
 
 tf_py_test(
@@ -1068,6 +1084,31 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "summary_ops_test",
+    size = "small",
+    srcs = ["summary_ops_test.py"],
+    additional_deps = [
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:summary_ops_v2",
+        "//tensorflow/python:tensor_spec",
+        "//tensorflow/python:tensor_util",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:function",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/keras:engine",
+        "//tensorflow/python/keras:layers",
+    ],
+)
+
 tf_py_test(
     name = "summary_v1_ops_test",
     size = "small",
@@ -1115,6 +1156,19 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "tridiagonal_solve_op_test",
+    size = "medium",
+    srcs = ["tridiagonal_solve_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:linalg_ops",
+    ],
+    shard_count = 5,
+)
+
 tf_py_test(
     name = "unicode_script_op_test",
     size = "small",
@@ -1140,6 +1194,7 @@ cuda_py_test(
         "//tensorflow/python:nn_grad",
         "//tensorflow/python:nn_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1155,6 +1210,7 @@ cuda_py_test(
         "//tensorflow/python:nn_grad",
         "//tensorflow/python:nn_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 tf_py_test(
@@ -1190,8 +1246,15 @@ tf_py_test(
     srcs = ["unicode_decode_op_test.py"],
     additional_deps = [
         "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python/ops/ragged:ragged_factory_ops",
+        "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
         "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python/ops/ragged:ragged",
+        "//tensorflow/python/ops/ragged:ragged_test_util",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:string_ops",
     ],
 )
@@ -1264,6 +1327,7 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1285,6 +1349,7 @@ cuda_py_test(
         "noguitar",
         "notap",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1300,6 +1365,7 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     tags = ["notsan"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1316,6 +1382,7 @@ cuda_py_test(
     ],
     shard_count = 2,
     tags = ["no_windows_gpu"],
+    xla_enable_strict_auto_jit = True,
 )
 
 tf_py_test(
@@ -1348,6 +1415,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:math_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1359,6 +1427,7 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:math_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1387,6 +1456,7 @@ cuda_py_test(
         "noasan",  # times out
         "optonly",  # times out
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1399,6 +1469,7 @@ cuda_py_test(
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1415,6 +1486,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
     ],
     shard_count = 10,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1429,6 +1501,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
     ],
     shard_count = 20,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1442,6 +1515,7 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1456,11 +1530,12 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
     name = "bias_op_test",
-    size = "small",
+    size = "medium",
     srcs = ["bias_op_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -1471,6 +1546,7 @@ cuda_py_test(
         "//tensorflow/python:nn_grad",
         "//tensorflow/python:nn_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1483,6 +1559,7 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1501,6 +1578,7 @@ cuda_py_test(
         "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1516,6 +1594,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:util",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1534,6 +1613,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:util",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1574,6 +1654,7 @@ cuda_py_test(
     ],
     shard_count = 16,
     tags = ["no_gpu"],  # TODO(b/117928656)
+    xla_enable_strict_auto_jit = True,
 )
 
 tf_py_test(
@@ -1614,6 +1695,22 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:nn_ops",
     ],
+    tags = ["no_rocm"],
+    xla_enable_strict_auto_jit = True,
+)
+
+cuda_py_test(
+    name = "conv1d_transpose_test",
+    size = "small",
+    srcs = ["conv1d_transpose_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:client",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:nn_ops",
+    ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1628,6 +1725,7 @@ cuda_py_test(
         "//tensorflow/python:nn_grad",
         "//tensorflow/python:nn_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1642,6 +1740,7 @@ cuda_py_test(
         "//tensorflow/python:nn_grad",
         "//tensorflow/python:nn_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1653,6 +1752,7 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:math_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1666,6 +1766,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:platform",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1681,6 +1782,7 @@ cuda_py_test(
         "//tensorflow/python:state_ops",
         "//tensorflow/python:variables",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1695,6 +1797,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
     ],
     tags = ["no_windows_gpu"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1707,6 +1810,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
     tags = ["manual"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1722,6 +1826,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:gradients",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1736,6 +1841,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:gradients",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1748,6 +1854,7 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1760,11 +1867,12 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
     name = "functional_ops_test",
-    size = "small",
+    size = "medium",
     srcs = ["functional_ops_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -1786,6 +1894,7 @@ cuda_py_test(
     grpc_enabled = True,
     shard_count = 2,
     tags = ["no_windows"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1801,6 +1910,7 @@ cuda_py_test(
         "//tensorflow/python:gradients",
         "//tensorflow/python:variables",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1808,12 +1918,14 @@ cuda_py_test(
     size = "medium",
     srcs = ["gather_op_test.py"],
     additional_deps = [
+        "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:gradients",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1827,6 +1939,7 @@ cuda_py_test(
         "//tensorflow/python:gradients",
         "//tensorflow/python:math_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1854,6 +1967,7 @@ cuda_py_test(
         "noasan",
         "notap",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1871,6 +1985,7 @@ cuda_py_test(
         "//tensorflow/python/ops/linalg",
     ],
     tags = ["no_windows_gpu"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1886,6 +2001,7 @@ cuda_py_test(
         "//tensorflow/python:nn",
         "//tensorflow/python:nn_grad",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1902,6 +2018,7 @@ cuda_py_test(
         "//tensorflow/python:platform",
         "//tensorflow/python/ops/linalg",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1915,6 +2032,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
     tags = ["no_windows_gpu"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1932,6 +2050,7 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     shard_count = 20,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1945,6 +2064,7 @@ cuda_py_test(
         "//tensorflow/python:nn_grad",
         "//tensorflow/python:nn_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1960,6 +2080,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:numerics",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1973,6 +2094,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
     tags = ["no_windows_gpu"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1987,6 +2109,30 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:variables",
     ],
+    xla_enable_strict_auto_jit = True,
+)
+
+cuda_py_test(
+    name = "map_fn_test",
+    size = "small",
+    srcs = ["map_fn_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:map_fn",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:tensor_array_grad",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+    ],
+    grpc_enabled = True,
+    shard_count = 2,
+    tags = ["no_windows"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -1999,6 +2145,7 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2013,6 +2160,7 @@ cuda_py_test(
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2031,6 +2179,7 @@ cuda_py_test(
         "//tensorflow/python/eager:function",
     ],
     tags = ["no_windows"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2044,6 +2193,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:string_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2057,8 +2207,9 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:math_ops",
     ],
-    shard_count = 4,
+    shard_count = 6,
     tags = ["no_windows_gpu"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2072,13 +2223,14 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:math_ops",
     ],
+    shard_count = 3,
     tags = [
         "manual",
         "no_gpu",
         "nogpu",
         "noguitar",
-        "notap",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2097,6 +2249,7 @@ cuda_py_test(
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python:tf2",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2109,6 +2262,7 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2121,6 +2275,7 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2133,6 +2288,7 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2151,6 +2307,7 @@ cuda_py_test(
         "//tensorflow/python:random_ops",
         "//tensorflow/python:sparse_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2164,6 +2321,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:math_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2176,6 +2334,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:session_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2192,6 +2351,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:gradients",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2206,6 +2366,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:nn_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2219,6 +2380,7 @@ cuda_py_test(
         "//tensorflow/python:nn_grad",
         "//tensorflow/python:nn_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2232,6 +2394,7 @@ cuda_py_test(
         "//tensorflow/python:nn_grad",
         "//tensorflow/python:nn_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2247,6 +2410,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:math_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2261,6 +2425,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
     ],
     tags = ["no_windows_gpu"],
+    xla_enable_strict_auto_jit = True,
 )
 
 tf_py_test(
@@ -2305,6 +2470,7 @@ cuda_py_test(
         "//tensorflow/python:sparse_grad",
         "//tensorflow/python:sparse_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2329,6 +2495,7 @@ cuda_py_test(
         "//tensorflow/python:random_ops",
         "//tensorflow/python:variables",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2343,6 +2510,7 @@ cuda_py_test(
         "//tensorflow/python:gradients",
         "//tensorflow/python:math_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2358,6 +2526,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:math_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2370,6 +2539,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:string_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2382,6 +2552,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:parsing_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2395,6 +2566,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:summary",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2410,6 +2582,7 @@ cuda_py_test(
         "//tensorflow/python:nn_grad",
         "//tensorflow/python:summary",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2440,6 +2613,8 @@ cuda_py_test(
         "//tensorflow/python/eager:context",
     ],
     flaky = 1,  # create_local_cluster sometimes times out.
+    tags = ["no_rocm"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2452,6 +2627,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
     ],
     tags = ["no_windows_gpu"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2470,6 +2646,7 @@ cuda_py_test(
         "no_oss",
         "optonly",  # times out
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2482,6 +2659,7 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2499,6 +2677,7 @@ cuda_py_test(
         "//tensorflow/python:state_ops_gen",
         "//tensorflow/python:variables",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2515,6 +2694,7 @@ cuda_py_test(
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:nn_ops_gen",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2526,6 +2706,7 @@ cuda_py_test(
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2545,6 +2726,7 @@ cuda_py_test(
     tags = [
         "no_gpu",  #  Flaky: b/80127739
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2560,6 +2742,7 @@ cuda_py_test(
         "//tensorflow/python:nn_ops",
     ],
     tags = ["manual"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2573,6 +2756,7 @@ cuda_py_test(
         "//tensorflow/python:nn_grad",
         "//tensorflow/python:nn_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2589,8 +2773,10 @@ cuda_py_test(
     ],
     shard_count = 2,
     tags = [
+        "no_rocm",
         "optonly",  # flaky timeouts unless optimized
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2604,6 +2790,7 @@ cuda_py_test(
         "//tensorflow/python:nn_grad",
         "//tensorflow/python:nn_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2628,10 +2815,10 @@ cuda_py_test(
     ],
     shard_count = 4,
     tags = [
-        # TODO(b/118887316): Re-enable this test in Kokoro.
-        "no_oss",
+        "no_rocm",
         "optonly",  # times out
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2649,6 +2836,7 @@ cuda_py_test(
     ],
     # TODO(b/118842098): Re-enable this test in Kokoro.
     tags = ["no_oss"],
+    xla_enable_strict_auto_jit = True,
 )
 
 tf_py_test(
@@ -2677,6 +2865,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
     tags = ["manual"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2690,6 +2879,8 @@ cuda_py_test(
         "//tensorflow/python:nn_grad",
         "//tensorflow/python:nn_ops",
     ],
+    tags = ["no_rocm"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2708,6 +2899,8 @@ cuda_py_test(
         "//tensorflow/python:nn_ops_gen",
     ],
     shard_count = 4,
+    tags = ["no_rocm"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2737,6 +2930,35 @@ cuda_py_test(
         "//tensorflow/python/eager:context",
     ],
     shard_count = 10,
+    xla_enable_strict_auto_jit = True,
+)
+
+cuda_py_test(
+    name = "rnn_cell_test",
+    size = "medium",
+    srcs = ["rnn_cell_test.py"],
+    additional_deps = [
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:rnn",
+        "//tensorflow/python:rnn_cell",
+        "//tensorflow/python:tensor_array_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+    ],
+    shard_count = 10,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2752,6 +2974,7 @@ cuda_py_test(
     ],
     shard_count = 2,
     tags = ["optonly"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2766,6 +2989,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:gradients",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2782,6 +3006,7 @@ cuda_py_test(
     tags = [
         "no_oss",  # Requires 4GB+ RAM
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2795,6 +3020,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
     ],
     tags = ["no_windows"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2818,6 +3044,7 @@ cuda_py_test(
         "noasan",
         "optonly",  # b/77589990
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2837,6 +3064,7 @@ cuda_py_test(
         "//tensorflow/python:platform",
         "//tensorflow/python:sparse_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 # TODO(gpapan): Revisit the gradient of extract_image_patches_op to resolve
@@ -2853,6 +3081,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
     tags = ["notap"],  # http://b/31080670
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2867,6 +3096,7 @@ cuda_py_test(
         "//tensorflow/python:util",
         "//tensorflow/python:data_flow_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2881,6 +3111,8 @@ cuda_py_test(
         "//tensorflow/python:util",
         "//tensorflow/python:data_flow_ops",
     ],
+    tags = ["no_oss"],  # b/124474135
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2898,6 +3130,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:variables",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2913,6 +3146,7 @@ cuda_py_test(
         "nomsan",
         "notsan",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2926,6 +3160,8 @@ cuda_py_test(
         "//tensorflow/python:nn_ops",
     ],
     shard_count = 30,
+    tags = ["no_rocm"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2946,6 +3182,7 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     shard_count = 50,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2966,6 +3203,7 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     shard_count = 50,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -2986,6 +3224,7 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     shard_count = 50,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3011,6 +3250,7 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     shard_count = 20,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3028,6 +3268,7 @@ cuda_py_test(
         "//tensorflow/python/ops/linalg",
     ],
     shard_count = 20,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3041,6 +3282,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
     shard_count = 20,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3057,7 +3299,11 @@ cuda_py_test(
     ],
     data = ["//tensorflow/python/kernel_tests/testdata:self_adjoint_eig_op_test_files"],
     shard_count = 20,
-    tags = ["no_windows"],
+    tags = [
+        "no_rocm",  # flaky test
+        "no_windows",
+    ],
+    # TODO(kuny): Add xla_enable_strict_auto_jit = True after b/124377352 is fixed.
 )
 
 cuda_py_test(
@@ -3073,6 +3319,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
     ],
     shard_count = 20,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3092,6 +3339,7 @@ cuda_py_test(
         "no_oss",  # b/117185141.
         "nomsan",  # TODO(b/117236102): Re-enable in msan build.
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3111,6 +3359,7 @@ cuda_py_test(
         "no_windows_gpu",
         "nomsan",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3126,6 +3375,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
     ],
     shard_count = 20,
+    xla_enable_strict_auto_jit = True,
 )
 
 sycl_py_test(
@@ -3372,6 +3622,7 @@ cuda_py_test(
         "//tensorflow/python:while_v2",
     ],
     grpc_enabled = True,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -3382,6 +3633,7 @@ cuda_py_test(
         "@absl_py//absl/testing:parameterized",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python/eager:def_function",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:control_flow_ops",
@@ -3397,4 +3649,26 @@ cuda_py_test(
         "//tensorflow/python:while_v2",
     ],
     grpc_enabled = True,
+    xla_enable_strict_auto_jit = True,
+)
+
+cuda_py_test(
+    name = "critical_section_test",
+    size = "medium",
+    srcs = ["critical_section_test.py"],
+    additional_deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python/data/experimental/ops:prefetching_ops",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:critical_section_ops",
+        "//tensorflow/python:tensor_array_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/eager:context",
+    ],
 )
diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py
index f4c442b7b1932c3ddab0d255f57c3fac5a23954a..0bc533a7736f8a21fc25811d1fca7a6b3eb7d7c4 100644
--- a/tensorflow/python/kernel_tests/array_ops_test.py
+++ b/tensorflow/python/kernel_tests/array_ops_test.py
@@ -1050,10 +1050,12 @@ class SliceAssignTest(test_util.TensorFlowTestCase):
     checker2[None] = [6]  # new axis
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("b/123559667")
   def testSliceAssign(self):
     self.doTestSliceAssign(use_resource=False)
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("b/123559667")
   def testSliceAssignResource(self):
     self.doTestSliceAssign(use_resource=True)
 
diff --git a/tensorflow/python/kernel_tests/atrous_conv2d_test.py b/tensorflow/python/kernel_tests/atrous_conv2d_test.py
index a13e325835cfd343eda61037b8392e83bed0f1c2..b84e76472399943279c1f9b680332f69f8ed48d8 100644
--- a/tensorflow/python/kernel_tests/atrous_conv2d_test.py
+++ b/tensorflow/python/kernel_tests/atrous_conv2d_test.py
@@ -160,7 +160,7 @@ class AtrousConv2DTest(test.TestCase):
                                                       [x_shape, f_shape],
                                                       output, y_shape)
         print("atrous_conv2d gradient err = %g " % err)
-        err_tolerance = 1e-3
+        err_tolerance = 4e-3 if test_util.is_xla_enabled() else 1e-3
         self.assertLess(err, err_tolerance)
 
 
diff --git a/tensorflow/python/kernel_tests/attention_ops_test.py b/tensorflow/python/kernel_tests/attention_ops_test.py
index 00dba9996dd909786301d56da41fa037328ba3e5..0b557bda2e3436846df9a4a64c915c33b0d72c68 100644
--- a/tensorflow/python/kernel_tests/attention_ops_test.py
+++ b/tensorflow/python/kernel_tests/attention_ops_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import image_ops
 from tensorflow.python.platform import test
@@ -195,5 +196,46 @@ class ExtractGlimpseTest(test.TestCase):
         expected_rows=[None, None, None, 1, 2, 3, 4],
         expected_cols=[56, 57, 58, 59, 60])
 
+  def testGlimpseNoiseZero(self):
+    # Image:
+    # [  0.   1.   2.   3.   4.]
+    # [  5.   6.   7.   8.   9.]
+    # [ 10.  11.  12.  13.  14.]
+    # [ 15.  16.  17.  18.  19.]
+    # [ 20.  21.  22.  23.  24.]
+    img = constant_op.constant(
+        np.arange(25).reshape((1, 5, 5, 1)), dtype=dtypes.float32)
+    with self.test_session():
+      # Result 1:
+      # [ 0.  0.  0.]
+      # [ 0.  0.  0.]
+      # [ 0.  0.  0.]
+      result1 = image_ops.extract_glimpse_v2(
+          img, [3, 3], [[-2, 2]],
+          centered=False,
+          normalized=False,
+          noise='zero')
+      self.assertAllEqual(
+          np.asarray([[0, 0, 0], [0, 0, 0], [0, 0, 0]]),
+          self.evaluate(result1)[0, :, :, 0])
+
+      # Result 2:
+      # [  0.   0.   0.   0.   0.   0.   0.]
+      # [  0.   0.   1.   2.   3.   4.   0.]
+      # [  0.   5.   6.   7.   8.   9.   0.]
+      # [  0.  10.  11.  12.  13.  14.   0.]
+      # [  0.  15.  16.  17.  18.  19.   0.]
+      # [  0.  20.  21.  22.  23.  24.   0.]
+      # [  0.   0.   0.   0.   0.   0.   0.]]
+      result2 = image_ops.extract_glimpse_v2(
+          img, [7, 7], [[0, 0]], normalized=False, noise='zero')
+      self.assertAllEqual(
+          np.asarray([[0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 2, 3, 4, 0],
+                      [0, 5, 6, 7, 8, 9, 0], [0, 10, 11, 12, 13, 14, 0],
+                      [0, 15, 16, 17, 18, 19, 0], [0, 20, 21, 22, 23, 24, 0],
+                      [0, 0, 0, 0, 0, 0, 0]]),
+          self.evaluate(result2)[0, :, :, 0])
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/kernel_tests/base64_ops_test.py b/tensorflow/python/kernel_tests/base64_ops_test.py
index 381f190b8df6d65afaa80654e3d98377a69b9ae3..d5a5dc8c01352fca8ff9b65d8621c48865b79a83 100644
--- a/tensorflow/python/kernel_tests/base64_ops_test.py
+++ b/tensorflow/python/kernel_tests/base64_ops_test.py
@@ -31,7 +31,7 @@ from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
 
 
-@test_util.run_v1_only("b/120545219")
+@test_util.run_deprecated_v1
 class Base64OpsTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
diff --git a/tensorflow/python/kernel_tests/benchmark_test.py b/tensorflow/python/kernel_tests/benchmark_test.py
index bffa5e6e8f4d9125f5021eb531319f67fd6e77bb..3fa2054847db635a96caedf4d596020ec2137003 100644
--- a/tensorflow/python/kernel_tests/benchmark_test.py
+++ b/tensorflow/python/kernel_tests/benchmark_test.py
@@ -26,6 +26,7 @@ import numpy as np
 from tensorflow.core.util import test_log_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import benchmark
 from tensorflow.python.platform import gfile
@@ -125,6 +126,7 @@ class BenchmarkTest(test.TestCase):
     self.assertFalse(_ran_somebenchmark_2[0])
     self.assertFalse(_ran_somebenchmark_but_shouldnt[0])
 
+  @test_util.disable_xla("b/123744455")  # GPU memory is incorrect
   def testReportingBenchmark(self):
     tempdir = test.get_temp_dir()
     try:
diff --git a/tensorflow/python/kernel_tests/bias_op_test.py b/tensorflow/python/kernel_tests/bias_op_test.py
index 66f442dbddb5f609e7525ba0db9809dc3943ee25..94e20d93017b07f8c3b5343744537cd7ce08896d 100644
--- a/tensorflow/python/kernel_tests/bias_op_test.py
+++ b/tensorflow/python/kernel_tests/bias_op_test.py
@@ -35,8 +35,6 @@ class BiasAddTest(test.TestCase):
 
   def _npBias(self, inputs, bias):
     assert len(bias.shape) == 1
-    print(inputs.shape)
-    print(bias.shape)
     assert inputs.shape[-1] == bias.shape[0]
     return inputs + bias.reshape(([1] * (len(inputs.shape) - 1)) +
                                  [bias.shape[0]])
@@ -196,9 +194,7 @@ class BiasAddTest(test.TestCase):
       self.assertAllClose(grad_jacob_t, grad_jacob_n, threshold, threshold)
 
   @test_util.run_deprecated_v1
-  def testGradientTensor(self):
-    # TODO(yongtang): BiasAddGrad with NCHW only works 4D. Reenable once
-    # all dimensions are supported.
+  def testGradientTensor2D(self):
     for (data_format, use_gpu) in ("NHWC", False), ("NHWC", True):
       for dtype in (dtypes.float16, dtypes.float32, dtypes.float64):
         np_input = np.array(
@@ -207,9 +203,18 @@ class BiasAddTest(test.TestCase):
         bias = np.array([1.3, 2.4], dtype=dtype.as_numpy_dtype)
         self._testGradient(np_input, bias, dtype, data_format, use_gpu)
 
+  @test_util.run_deprecated_v1
+  def testGradientTensor3D(self):
+    for (data_format, use_gpu) in [("NHWC", False), ("NHWC", True),
+                                   ("NCHW", False), ("NCHW", True)]:
+      for dtype in (dtypes.float16, dtypes.float32, dtypes.float64):
+        np_input = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
+                            dtype=dtype.as_numpy_dtype).reshape(1, 3, 2)
+        bias = np.array([1.3, 2.4], dtype=dtype.as_numpy_dtype)
+        self._testGradient(np_input, bias, dtype, data_format, use_gpu)
+
   @test_util.run_deprecated_v1
   def testGradientTensor4D(self):
-    # BiasAddGrad with NCHW support 4D so all are enabled.
     for (data_format, use_gpu) in [("NHWC", False), ("NHWC", True),
                                    ("NCHW", False), ("NCHW", True)]:
       for dtype in (dtypes.float16, dtypes.float32, dtypes.float64):
@@ -219,6 +224,17 @@ class BiasAddTest(test.TestCase):
         bias = np.array([1.3, 2.4], dtype=dtype.as_numpy_dtype)
         self._testGradient(np_input, bias, dtype, data_format, use_gpu)
 
+  @test_util.run_deprecated_v1
+  def testGradientTensor5D(self):
+    for (data_format, use_gpu) in [("NHWC", False), ("NHWC", True),
+                                   ("NCHW", False), ("NCHW", True)]:
+      for dtype in (dtypes.float16, dtypes.float32, dtypes.float64):
+        np_input = np.arange(
+            1.0, 49.0, dtype=dtype.as_numpy_dtype).reshape(
+                [1, 2, 3, 4, 2]).astype(np.float32)
+        bias = np.array([1.3, 2.4], dtype=dtype.as_numpy_dtype)
+        self._testGradient(np_input, bias, dtype, data_format, use_gpu)
+
   @test_util.run_deprecated_v1
   def testEmpty(self):
     np.random.seed(7)
@@ -227,10 +243,15 @@ class BiasAddTest(test.TestCase):
 
   @test_util.run_deprecated_v1
   def testEmptyGradient(self):
-    # TODO(yongtang): BiasAddGrad with NCHW only works 4D. Reenable once
-    # all dimensions are supported.
     for (data_format, use_gpu) in ("NHWC", False), ("NHWC", True):
-      for shape in (0, 0), (2, 0), (0, 2), (4, 3, 0), (4, 0, 3), (0, 4, 3):
+      for shape in (0, 0), (2, 0), (0, 2):
+        self._testGradient(
+            np.random.randn(*shape), np.random.randn(shape[-1]), dtypes.float64,
+            data_format, use_gpu)
+
+    for (data_format, use_gpu) in [("NHWC", False), ("NHWC", True),
+                                   ("NCHW", False), ("NCHW", True)]:
+      for shape in (4, 3, 0), (4, 0, 3), (0, 4, 3):
         self._testGradient(
             np.random.randn(*shape),
             np.random.randn(shape[-1]), dtypes.float64, data_format, use_gpu)
diff --git a/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py
index 6b04e8abf40dc6fc396581e82b59bc6c4dec2a41..e74193049b1df732e5d986340c16329a207cf2fe 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py
+++ b/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py
@@ -896,12 +896,37 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       logits = session.run(predict_op)
       self.assertAllClose(expected_logits, logits)
 
+  @test_util.run_deprecated_v1
+  def testPredictionOnEmptyEnsembleMultiClass(self):
+    """Tests that prediction on empty ensemble does not fail for multiclass."""
+    with self.cached_session() as session:
+      # Create an empty ensemble.
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto='')
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      feature_0_values = [36, 32]
+      feature_1_values = [11, 27]
+      logits_dimension = 2
+      expected_logits = [[0.0, 0.0], [0.0, 0.0]]
+
+      # Prediction should work fine.
+      predict_op = boosted_trees_ops.predict(
+          tree_ensemble_handle,
+          bucketized_features=[feature_0_values, feature_1_values],
+          logits_dimension=logits_dimension)
+
+      logits = session.run(predict_op)
+      self.assertAllClose(expected_logits, logits)
+
   @test_util.run_deprecated_v1
   def testPredictionMultipleTree(self):
     """Tests the predictions work when we have multiple trees."""
     with self.cached_session() as session:
       tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
-      text_format.Merge("""
+      text_format.Merge(
+          """
         trees {
           nodes {
             bucketized_split {
@@ -1007,6 +1032,158 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       logits = session.run(predict_op)
       self.assertAllClose(expected_logits, logits)
 
+  @test_util.run_deprecated_v1
+  def testPredictionMultipleTreeMultiClass(self):
+    """Tests the predictions work when we have multiple trees."""
+    with self.cached_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge(
+          """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 28
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              vector: {
+                value: 0.51
+              }
+              vector: {
+                value: 1.14
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector: {
+                value: 1.29
+              }
+              vector: {
+                value: 8.79
+              }
+            }
+          }
+        }
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 26
+              left_id: 1
+              right_id: 2
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 50
+              left_id: 3
+              right_id: 4
+            }
+          }
+          nodes {
+            leaf {
+              vector: {
+                value: -4.33
+              }
+              vector: {
+                value: 7.0
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector: {
+                value: 0.2
+              }
+              vector: {
+                value: 5.0
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector: {
+                value: -4.1
+              }
+              vector: {
+                value: 6.0
+              }
+            }
+          }
+        }
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 34
+              left_id: 1
+              right_id: 2
+            }
+          }
+          nodes {
+            leaf {
+              vector: {
+                value: 2.0
+              }
+              vector: {
+                value: -7.0
+              }
+            }
+          }
+          nodes {
+            leaf {
+              vector: {
+                value: 6.3
+              }
+              vector: {
+                value: 5.0
+              }
+            }
+          }
+        }
+        tree_weights: 0.1
+        tree_weights: 0.2
+        tree_weights: 1.0
+      """, tree_ensemble_config)
+
+      # Create existing ensemble with one root split
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      feature_0_values = [36, 32]
+      feature_1_values = [11, 27]
+
+      # Example 1: tree 0: (0.51, 1.14), tree 1: (0.2, 5.0), tree 2: (6.3, 5.0)
+      #
+      #            logits = (0.1*0.51+0.2*0.2+1*6.3,
+      #                      0.1*1.14+0.2*5.0+1*5)
+      # Example 2: tree 0: (0.51, 1.14), tree 1: (-4.33, 7.0), tree 2: (2.0, -7)
+      #
+      #            logits = (0.1*0.51+0.2*-4.33+1*2.0,
+      #                      0.1*1.14+0.2*7.0+1*-7)
+      logits_dimension = 2
+      expected_logits = [[6.391, 6.114], [1.185, -5.486]]
+
+      # Prediction should work fine.
+      predict_op = boosted_trees_ops.predict(
+          tree_ensemble_handle,
+          bucketized_features=[feature_0_values, feature_1_values],
+          logits_dimension=logits_dimension)
+
+      logits = session.run(predict_op)
+      self.assertAllClose(expected_logits, logits)
+
   @test_util.run_deprecated_v1
   def testCategoricalSplits(self):
     """Tests the predictions work for categorical splits."""
diff --git a/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py
index 2b9863fb89bac80f6a2f012a3f25c23f993d03ad..0315456447dec43264e48d918b74ba3bf0e119c5 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py
+++ b/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py
@@ -35,6 +35,7 @@ from tensorflow.python.platform import googletest
 from tensorflow.python.training import saver
 
 
+@test_util.run_deprecated_v1
 class QuantileOpsTest(test_util.TensorFlowTestCase):
 
   def create_resource(self, name, eps, max_elements, num_streams=1):
@@ -82,7 +83,6 @@ class QuantileOpsTest(test_util.TensorFlowTestCase):
     self.max_elements = 1 << 16
     self.num_quantiles = constant_op.constant(3, dtype=dtypes.int64)
 
-  @test_util.run_v1_only("b/120545219")
   def testBasicQuantileBucketsSingleResource(self):
     with self.cached_session() as sess:
       quantile_accumulator_handle = self.create_resource("floats", self.eps,
@@ -107,7 +107,6 @@ class QuantileOpsTest(test_util.TensorFlowTestCase):
       self.assertAllClose(self._feature_0_quantiles, quantiles[0].eval())
       self.assertAllClose(self._feature_1_quantiles, quantiles[1].eval())
 
-  @test_util.run_v1_only("b/120545219")
   def testBasicQuantileBucketsMultipleResources(self):
     with self.cached_session() as sess:
       quantile_accumulator_handle_0 = self.create_resource("float_0", self.eps,
@@ -142,12 +141,11 @@ class QuantileOpsTest(test_util.TensorFlowTestCase):
       self.assertAllClose(self._feature_0_quantiles, quantiles[0].eval())
       self.assertAllClose(self._feature_1_quantiles, quantiles[1].eval())
 
-  @test_util.run_v1_only("b/120545219")
   def testSaveRestoreAfterFlush(self):
     save_dir = os.path.join(self.get_temp_dir(), "save_restore")
     save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       accumulator = boosted_trees_ops.QuantileAccumulator(
           num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0")
 
@@ -166,7 +164,7 @@ class QuantileOpsTest(test_util.TensorFlowTestCase):
       self.assertAllClose(self._feature_1_boundaries, buckets[1].eval())
       save.save(sess, save_path)
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       accumulator = boosted_trees_ops.QuantileAccumulator(
           num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0")
       save = saver.Saver()
@@ -175,12 +173,11 @@ class QuantileOpsTest(test_util.TensorFlowTestCase):
       self.assertAllClose(self._feature_0_boundaries, buckets[0].eval())
       self.assertAllClose(self._feature_1_boundaries, buckets[1].eval())
 
-  @test_util.run_v1_only("b/120545219")
   def testSaveRestoreBeforeFlush(self):
     save_dir = os.path.join(self.get_temp_dir(), "save_restore")
     save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
 
-    with self.test_session() as sess:
+    with self.cached_session() as sess:
       accumulator = boosted_trees_ops.QuantileAccumulator(
           num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0")
 
@@ -198,7 +195,7 @@ class QuantileOpsTest(test_util.TensorFlowTestCase):
       self.assertAllClose(self._feature_0_boundaries, buckets[0].eval())
       self.assertAllClose(self._feature_1_boundaries, buckets[1].eval())
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       accumulator = boosted_trees_ops.QuantileAccumulator(
           num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0")
       save = saver.Saver()
diff --git a/tensorflow/python/kernel_tests/broadcast_to_ops_test.py b/tensorflow/python/kernel_tests/broadcast_to_ops_test.py
index b9eb2391b490f659bd20e26a2c5b290ab4bfea1b..3c5433cb8990539d28bac70df2e8d589ffd9bb7a 100644
--- a/tensorflow/python/kernel_tests/broadcast_to_ops_test.py
+++ b/tensorflow/python/kernel_tests/broadcast_to_ops_test.py
@@ -66,6 +66,36 @@ class BroadcastToTest(test_util.TensorFlowTestCase):
           v_np = np.broadcast_to(x, output_shape)
           self.assertAllEqual(v_tf.eval(), v_np)
 
+  @test_util.run_deprecated_v1
+  def testBroadcastToShapeInnerDim(self):
+    input_shape = [2, 1, 3]
+    output_shape = [2, 5, 3]
+    with self.cached_session(use_gpu=True):
+      x = np.array(np.random.randint(5, size=input_shape), dtype=np.int32)
+      v_tf = array_ops.broadcast_to(constant_op.constant(x), output_shape)
+      v_np = np.broadcast_to(x, output_shape)
+      self.assertAllEqual(v_tf.eval(), v_np)
+
+  @test_util.run_deprecated_v1
+  def testBroadcastToShapeLargerDim(self):
+    input_shape = [2, 1, 3, 2, 2, 2]
+    output_shape = [1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 15, 3, 2, 2, 2]
+    with self.cached_session(use_gpu=True):
+      x = np.array(np.random.randint(5, size=input_shape), dtype=np.int32)
+      v_tf = array_ops.broadcast_to(constant_op.constant(x), output_shape)
+      v_np = np.broadcast_to(x, output_shape)
+      self.assertAllEqual(v_tf.eval(), v_np)
+
+  @test_util.run_deprecated_v1
+  def testBroadcastToShapeLargerDim2(self):
+    input_shape = [2, 1, 3, 2, 2, 2, 1, 1, 1]
+    output_shape = [1, 1, 1, 2, 5, 3, 2, 2, 2, 3, 3, 3]
+    with self.cached_session(use_gpu=True):
+      x = np.array(np.random.randint(5, size=input_shape), dtype=np.int32)
+      v_tf = array_ops.broadcast_to(constant_op.constant(x), output_shape)
+      v_np = np.broadcast_to(x, output_shape)
+      self.assertAllEqual(v_tf.eval(), v_np)
+
   @test_util.run_deprecated_v1
   def testBroadcastToScalar(self):
     with self.session(use_gpu=True):
@@ -78,8 +108,9 @@ class BroadcastToTest(test_util.TensorFlowTestCase):
   def testBroadcastScalarToNonScalar(self):
     with self.session(use_gpu=True):
       x = np.array(1.0, dtype=np.float)
-      v_tf = array_ops.broadcast_to(constant_op.constant(1.0), [2, 3, 4])
-      v_np = np.broadcast_to(x, [2, 3, 4])
+      v_tf = array_ops.broadcast_to(constant_op.constant(1.0), [2, 3, 4,
+                                                                1, 1, 1])
+      v_np = np.broadcast_to(x, [2, 3, 4, 1, 1, 1])
       self.assertAllEqual(v_tf.eval(), v_np)
 
   @test_util.run_deprecated_v1
@@ -130,14 +161,26 @@ class BroadcastToTest(test_util.TensorFlowTestCase):
 
   @test_util.run_deprecated_v1
   def testGradientWithBroadcastAllDimensions(self):
-    x = constant_op.constant([[1, 2, 3], [4, 5, 6]], dtype=dtypes.float32)
-    v = array_ops.broadcast_to(x, [5, 4, 6])
+    x = constant_op.constant([1], dtype=dtypes.float32)
+    v = array_ops.broadcast_to(x, [5, 2, 3])
     out = 2 * v
     with self.cached_session():
       err = gradient_checker.compute_gradient_error(x, x.get_shape(),
                                                     out, out.get_shape())
     self.assertLess(err, 1e-4)
 
+  @test_util.run_deprecated_v1
+  def testGradientWithLargeDim(self):
+    input_shape = [2, 1, 3, 2, 2, 2, 1, 1, 1]
+    output_shape = [1, 1, 1, 2, 5, 3, 2, 2, 2, 3, 3, 3]
+    x = constant_op.constant(np.array(np.random.randn(*input_shape),
+                                      dtype=np.float32))
+    v = array_ops.broadcast_to(x, output_shape)
+    out = 2 * v
+    with self.cached_session():
+      err = gradient_checker.compute_gradient_error(x, x.get_shape(),
+                                                    out, out.get_shape())
+    self.assertLess(err, 1e-4)
 
 if __name__ == "__main__":
   test_lib.main()
diff --git a/tensorflow/python/kernel_tests/cast_op_test.py b/tensorflow/python/kernel_tests/cast_op_test.py
index b3187e1637193a8b34f7f3668220d94d783b6170..e9be8e7d5f73c9ea6f7a0fe15d84ecba7201156b 100644
--- a/tensorflow/python/kernel_tests/cast_op_test.py
+++ b/tensorflow/python/kernel_tests/cast_op_test.py
@@ -157,7 +157,7 @@ class CastOpTest(test.TestCase):
       # np.float64("np.inf").astype(np.int32) is negative on x86 but positive on ppc64le
       # Numpy link to relevant discussion - https://github.com/numpy/numpy/issues/9040
       # Tensorflow link to relevant discussion - https://github.com/tensorflow/tensorflow/issues/9360
-      if platform.machine() == "ppc64le":
+      if platform.machine() == "ppc64le" or platform.machine() == "aarch64":
         self._compare(-np.inf, np.int32, i4.min, False)
         self._compare(-np.inf, np.int64, i8.min, False)
       else:
@@ -169,8 +169,13 @@ class CastOpTest(test.TestCase):
     self._compare(-np.inf, np.int64, i8.min, False)
     self.assertAllEqual(np.isnan(self._cast(np.nan, np.float32, False)), True)
     self.assertAllEqual(np.isnan(self._cast(np.nan, np.float64, False)), True)
-    self._compare(np.nan, np.int32, i4.min, False)
-    self._compare(np.nan, np.int64, i8.min, False)
+    # np.float64(np.nan).astype(np.int32) is 0 on ARM
+    if platform.machine() == "aarch64":
+      self._compare(np.nan, np.int32, 0, False)
+      self._compare(np.nan, np.int64, 0, False)
+    else:
+      self._compare(np.nan, np.int32, i4.min, False)
+      self._compare(np.nan, np.int64, i8.min, False)
 
     self._compare(np.inf, np.float32, np.inf, True)
     self._compare(np.inf, np.float64, np.inf, True)
diff --git a/tensorflow/python/kernel_tests/check_ops_test.py b/tensorflow/python/kernel_tests/check_ops_test.py
index 95bac85027bd1709420dcfc7f96f92195f8f2472..7d00919cc8a9927c3e8d05b1c92aa89c8fb54ad9 100644
--- a/tensorflow/python/kernel_tests/check_ops_test.py
+++ b/tensorflow/python/kernel_tests/check_ops_test.py
@@ -889,6 +889,8 @@ class EnsureShapeTest(test.TestCase):
 
   # Dynamic shape check
   @test_util.run_deprecated_v1
+  @test_util.disable_xla(
+      "b/123337890")  # Dynamic shapes not supported now with XLA
   def testEnsuresDynamicShape_RaisesError(self):
     placeholder = array_ops.placeholder(dtypes.int32)
     derived = math_ops.divide(placeholder, 3, name="MyDivide")
@@ -902,6 +904,8 @@ class EnsureShapeTest(test.TestCase):
         sess.run(derived, feed_dict={placeholder: feed_val})
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla(
+      "b/123337890")  # Dynamic shapes not supported now with XLA
   def testEnsuresDynamicShape_RaisesErrorDimUnknown(self):
     placeholder = array_ops.placeholder(dtypes.int32)
     derived = placeholder / 3
diff --git a/tensorflow/python/kernel_tests/checkpoint_ops_test.py b/tensorflow/python/kernel_tests/checkpoint_ops_test.py
index 6e289bf9b780ae2ba16f400cc001ddce59f547b3..a67461856808b064ff0de485d1fe28e79430c7fb 100644
--- a/tensorflow/python/kernel_tests/checkpoint_ops_test.py
+++ b/tensorflow/python/kernel_tests/checkpoint_ops_test.py
@@ -105,7 +105,6 @@ class GenerateVocabRemappingTest(test.TestCase):
       self.assertAllEqual(expected_num_present, self.evaluate(num_present))
 
 
-@test_util.run_v1_only('b/120545219')
 class LoadAndRemapMatrixTest(test.TestCase):
   """Tests for the load_and_remap_matrix() op."""
 
@@ -126,7 +125,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
 
     save = saver.Saver([matrix])
     with self.cached_session() as sess:
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.bundle_file = os.path.join(test.get_temp_dir(), 'bundle_checkpoint')
       save.save(sess, self.bundle_file)
 
@@ -231,6 +230,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
           np.reshape(initializing_values, (num_rows, num_cols)),
           self.evaluate(remapped_matrix))
 
+  @test_util.run_deprecated_v1
   def test_load_and_remap_invalid_remapping(self):
     """Tests that errors are raised when an ID maps to multiple new IDs.
 
@@ -262,6 +262,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
     with self.cached_session(), self.assertRaises(errors.UnimplementedError):
       self.evaluate(remapped_matrix)
 
+  @test_util.run_deprecated_v1
   def test_load_and_remap_incorrect_initializing_values(self):
     """Tests that errors are raised with incorrect number of init values."""
     remapped_matrix = gen_checkpoint_ops.load_and_remap_matrix(
@@ -313,7 +314,7 @@ class LoadAndRemapMatrixWithMaxRowsTest(test.TestCase):
     with self.cached_session() as sess:
       ckpt_path = os.path.join(test.get_temp_dir(), 'temp_ckpt')
       save = saver.Saver([matrix])
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       save.save(sess, ckpt_path)
       num_rows, num_cols = np_value.shape
 
diff --git a/tensorflow/python/kernel_tests/cholesky_op_test.py b/tensorflow/python/kernel_tests/cholesky_op_test.py
index a08cfe960d005451ab5a02aff02e90a0fbcb92a0..2305c0b568ee6220dab8dd9be8b7bda339b9f082 100644
--- a/tensorflow/python/kernel_tests/cholesky_op_test.py
+++ b/tensorflow/python/kernel_tests/cholesky_op_test.py
@@ -163,6 +163,9 @@ class CholeskyOpTest(test.TestCase):
     with self.assertRaises(ValueError):
       linalg_ops.cholesky(tensor3)
 
+  # The below invalid Cholesky call returns an error with TF Classic and just
+  # returns NaNs with XLA.
+  @test_util.disable_xla("b/123337890")
   def testNotInvertibleCPU(self):
     # The input should be invertible.
     with self.session(use_gpu=True):
diff --git a/tensorflow/python/kernel_tests/concat_op_test.py b/tensorflow/python/kernel_tests/concat_op_test.py
index 474760a93ff84be698388a7784f66445c21cd8ca..7e37785344391364b2e5d8ea54170e68659335dc 100644
--- a/tensorflow/python/kernel_tests/concat_op_test.py
+++ b/tensorflow/python/kernel_tests/concat_op_test.py
@@ -640,6 +640,7 @@ class ConcatOpTest(test.TestCase):
         output = self.evaluate(c)
         self.assertAllEqual([[1, 2, 3, 7, 8, 9], [4, 5, 6, 10, 11, 12]], output)
 
+
 class ConcatOffsetTest(test.TestCase):
 
   def testBasic(self):
@@ -683,6 +684,7 @@ class ConcatOffsetTest(test.TestCase):
       self.evaluate(off)
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("b/123337890")  # Error messages differ
   def testSizeMismatch(self):
     cdim = constant_op.constant(1, dtypes.int32)
     s0 = constant_op.constant([2, 3, 5], dtypes.int32)
diff --git a/tensorflow/python/kernel_tests/cond_v2_test.py b/tensorflow/python/kernel_tests/cond_v2_test.py
index 8fe3ba41e27aa101fd4f2e3b41b0a0b226471047..244b0bdd7fd48d8e0b4b7fb5a778123dede5fef2 100644
--- a/tensorflow/python/kernel_tests/cond_v2_test.py
+++ b/tensorflow/python/kernel_tests/cond_v2_test.py
@@ -20,7 +20,9 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -145,6 +147,22 @@ class CondV2Test(test.TestCase):
     self.assertEqual(cond_op.type, "If")
     return output, cond_op
 
+  def _createNestedCond(self, name):
+    """Like _createCond but creates a nested cond_v2 call as well."""
+    pred = constant_op.constant(True, name="pred")
+    x = constant_op.constant(1.0, name="x")
+
+    def true_fn():
+      return cond_v2.cond_v2(pred, lambda: x, lambda: x + 1)
+
+    def false_fn():
+      return x + 2
+
+    output = cond_v2.cond_v2(pred, true_fn, false_fn, name=name)
+    cond_op = output.op.inputs[0].op
+    self.assertEqual(cond_op.type, "If")
+    return output, cond_op
+
   def testDefaultName(self):
     with ops.Graph().as_default():
       _, cond_op = self._createCond(None)
@@ -612,6 +630,26 @@ class CondV2Test(test.TestCase):
         # d2[x]/dx2 = 0
         self.assertEqual(false_val, [0.0])
 
+  def testGradientTapeOfCondWithResourceVariableInFunction(self):
+    with context.eager_mode():
+      v = variables.Variable(2.)
+
+      @def_function.function
+      def fnWithCond():  # pylint: disable=invalid-name
+        with backprop.GradientTape() as tape:
+          pred = constant_op.constant(True, dtype=dtypes.bool)
+
+          def true_fn():
+            return math_ops.pow(v, 3)
+
+          def false_fn():
+            return v
+
+          cond = cond_v2.cond_v2(pred, true_fn, false_fn, name="cond")
+        return tape.gradient(cond, v)
+
+      self.assertAllEqual(fnWithCond(), 12.0)
+
   def testLowering(self):
     with ops.Graph().as_default() as g:
       with self.session(graph=g) as sess:
@@ -645,9 +683,14 @@ class CondV2Test(test.TestCase):
       # Build the cond_v2 in an XLA context
       xla_context = control_flow_ops.XLAControlFlowContext()
       xla_context.Enter()
-      cond_output, _ = self._createCond("cond")
+      cond_output, cond_op = self._createCond("cond")
       xla_context.Exit()
 
+      # Check lowering attr is not set.
+      with self.assertRaises(ValueError):
+        cond_op.get_attr("_lower_using_switch_merge")
+
+      # Check the actual graph that is run.
       run_options = config_pb2.RunOptions(output_partition_graphs=True)
       run_metadata = config_pb2.RunMetadata()
       sess.run(cond_output, options=run_options, run_metadata=run_metadata)
@@ -672,6 +715,29 @@ class CondV2Test(test.TestCase):
           if_found,
           "An `If` op was not found, but the graph should not be lowered.")
 
+  @test_util.run_deprecated_v1
+  def testNestedLoweringDisabledInXLA(self):
+    # Build the cond_v2 in an XLA context
+    xla_context = control_flow_ops.XLAControlFlowContext()
+    xla_context.Enter()
+    _, cond_op = self._createNestedCond("cond")
+    xla_context.Exit()
+
+    # Check lowering attr is not set for either If node.
+    with self.assertRaises(ValueError):
+      cond_op.get_attr("_lower_using_switch_merge")
+
+    nested_if_ops = []
+    for func in ops.get_default_graph()._functions.values():
+      nested_if_ops.extend(op for op in func._graph.get_operations()
+                           if op.type == "If")
+    self.assertEqual(len(nested_if_ops), 1)
+    with self.assertRaises(ValueError):
+      nested_if_ops[0].get_attr("_lower_using_switch_merge")
+
+    # TODO(skyewm): check the actual graphs that are run once we have a way to
+    # programmatically access those graphs.
+
   @test_util.run_deprecated_v1
   def testLoweringDisabledWithSingleThreadedExecutorContext(self):
     with self.session(graph=ops.Graph()) as sess:
@@ -719,8 +785,8 @@ class CondV2Test(test.TestCase):
       return ((x,), y * 3.0)
 
     with self.assertRaisesRegexp(
-        ValueError, "Outputs of true_fn and false_fn must"
-        " have the same structure"):
+        TypeError, "true_fn and false_fn arguments to tf.cond must have the "
+        "same number, type, and overall structure of return values."):
       control_flow_ops.cond(constant_op.constant(False), true_fn, false_fn)
 
   @test_util.enable_control_flow_v2
@@ -1040,7 +1106,7 @@ class CondV2ColocationGroupAndDeviceTest(test.TestCase):
                 self.evaluate(cond_v2.cond_v2(constant_op.constant(True),
                                               fn2, fn2)))
         else:
-          self.skipTest("Test requrires a GPU to check GPU device placement.")
+          self.skipTest("Test requires a GPU to check GPU device placement.")
 
   def testDeviceInAndOutOfCond(self):
     with ops.Graph().as_default() as g:
diff --git a/tensorflow/python/kernel_tests/conditional_accumulator_test.py b/tensorflow/python/kernel_tests/conditional_accumulator_test.py
index ce34201706492ca488afbec95cddf436f38c820d..32a20587508b7b5b4f0eeda248f6bb0e55f34c1f 100644
--- a/tensorflow/python/kernel_tests/conditional_accumulator_test.py
+++ b/tensorflow/python/kernel_tests/conditional_accumulator_test.py
@@ -408,7 +408,7 @@ class ConditionalAccumulatorTest(test.TestCase):
 
       set_global_step_op = q.set_global_step(new_global_step)
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       for _ in range(3):
         set_global_step_op.run()
         self.evaluate(inc_global_step)
diff --git a/tensorflow/python/kernel_tests/confusion_matrix_test.py b/tensorflow/python/kernel_tests/confusion_matrix_test.py
index ae13c8e32e5ed5c8f3e6b670835db66d1e7dad0f..0ea5b1f5d8c35a1d5f7e883872475fdeb97688c6 100644
--- a/tensorflow/python/kernel_tests/confusion_matrix_test.py
+++ b/tensorflow/python/kernel_tests/confusion_matrix_test.py
@@ -470,9 +470,8 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
           labels_placeholder: label_values,
           predictions_placeholder: prediction_values
       }
-      with self.assertRaisesRegexp(
-          errors_impl.InvalidArgumentError,
-          "Can not squeeze dim\[2\]"):
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   r"Can not squeeze dim\[2\]"):
         dynamic_labels.eval(feed_dict=feed_dict)
       self.assertAllEqual(
           prediction_values, dynamic_predictions.eval(feed_dict=feed_dict))
@@ -498,9 +497,8 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
       }
       self.assertAllEqual(
           label_values, dynamic_labels.eval(feed_dict=feed_dict))
-      with self.assertRaisesRegexp(
-          errors_impl.InvalidArgumentError,
-          "Can not squeeze dim\[2\]"):
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   r"Can not squeeze dim\[2\]"):
         dynamic_predictions.eval(feed_dict=feed_dict)
 
 
diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
index 0fd293ebba3044097453c18fb625fc0dee19b19f..982ead7e94557d41b9c4415c3fc604b829ad7390 100644
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@@ -32,7 +32,9 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import device_lib
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.eager import function as eager_function
+from tensorflow.python.eager import wrap_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
@@ -43,6 +45,7 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gen_array_ops
@@ -50,21 +53,27 @@ from tensorflow.python.ops import gen_control_flow_ops
 from tensorflow.python.ops import gen_data_flow_ops
 from tensorflow.python.ops import gen_logging_ops
 from tensorflow.python.ops import gen_state_ops
+from tensorflow.python.ops import gradient_checker_v2
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import logging_ops
+from tensorflow.python.ops import map_fn
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import script_ops
+from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import tensor_array_grad  # pylint: disable=unused-import
+from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.ops import while_v2  # pylint: disable=unused-import
 # pylint: disable=unused-import
+from tensorflow.python.ops.ragged import ragged_factory_ops
 import tensorflow.python.ops.tensor_array_grad
 # pylint: enable=unused-import
 from tensorflow.python.platform import test
@@ -139,7 +148,7 @@ class ControlFlowTest(test.TestCase):
       v2 = control_flow_ops.with_dependencies([op], v)
 
       self.assertTrue(isinstance(v2, ops.Tensor))
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertEqual(9, self.evaluate(v2))
 
   @test_util.run_v1_only("b/120545219")
@@ -153,7 +162,7 @@ class ControlFlowTest(test.TestCase):
       op = state_ops.assign(enter_v, enter_nine)
       v2 = control_flow_ops.with_dependencies([op], enter_v)
       v3 = control_flow_ops.exit(v2)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertEqual(9, self.evaluate(v3))
 
   @test_util.run_v1_only("b/120545219")
@@ -164,7 +173,7 @@ class ControlFlowTest(test.TestCase):
       p = constant_op.constant(True)
       v1 = control_flow_ops._SwitchRefOrTensor(v._ref(), p)  # pylint: disable=protected-access
       v2 = state_ops.assign(v1[1], 9)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertEqual(9, self.evaluate(v2))
 
   def testEnterMulExit(self):
@@ -204,8 +213,8 @@ class ControlFlowTest(test.TestCase):
       switch_op = control_flow_ops.switch(data, pred)
       merge_op = control_flow_ops.merge(switch_op)[0]
 
-      val = merge_op.values.eval()
-      ind = merge_op.indices.eval()
+      val = merge_op.values
+      ind = merge_op.indices
     self.assertAllEqual(np.arange(1, 7), val)
     self.assertAllEqual(np.arange(0, 12, 2), ind)
 
@@ -417,35 +426,61 @@ class ControlFlowTest(test.TestCase):
       fn2 = lambda: ops.IndexedSlices(math_ops.subtract(x.values, 1), indices)
       r = control_flow_ops.cond(pred, fn1, fn2)
 
-      val = r.values.eval()
-      ind = r.indices.eval()
+      val = r.values
+      ind = r.indices
     self.assertAllEqual(11, val)
     self.assertAllEqual(0, ind)
 
-  @test_util.run_v1_only("b/120545219")
+  def testCondMismatchedIndexedSlices(self):
+    @def_function.function
+    def foo():
+      values = constant_op.constant(10)
+      indices = constant_op.constant(0)
+      x = ops.IndexedSlices(values, indices)
+      v1_msg = "The two structures don't have the same nested structure"
+      v2_msg = ("true_fn and false_fn arguments to tf.cond must have the same "
+                "number, type, and overall structure of return values.")
+      with self.assertRaisesRegexp(
+          TypeError,
+          v2_msg if control_flow_util.ENABLE_CONTROL_FLOW_V2 else v1_msg):
+        control_flow_ops.cond(
+            constant_op.constant(True),
+            lambda: ops.IndexedSlices(math_ops.add(x.values, 1), indices),
+            lambda: math_ops.add(x.values, 1), indices)
+    foo()
+
   def testCondSparseTensor(self):
-    with self.cached_session():
-      values = constant_op.constant([2.0, 4.0], name="values")
-      indices = constant_op.constant(
-          [[0], [3]], dtype=dtypes.int64, name="indices")
-      shape = constant_op.constant([10], dtype=dtypes.int64, name="dense_shape")
-      x = sparse_tensor.SparseTensor(indices, values, dense_shape=shape)
-      pred = math_ops.less(1, 2)
-      fn1 = lambda: sparse_tensor.SparseTensor(
-          indices + 1, x.values + 1, dense_shape=shape)
-      fn2 = lambda: sparse_tensor.SparseTensor(
-          indices, x.values - 1, dense_shape=shape)
-      r = control_flow_ops.cond(pred, fn1, fn2)
-      self.assertAllEqual([3.0, 5.0], r.values.eval())
-      self.assertAllEqual([[1], [4]], r.indices.eval())
-      self.assertAllEqual(r.values.get_shape(), (2,))
+    values = constant_op.constant([2.0, 4.0], name="values")
+    indices = constant_op.constant([[0], [3]],
+                                   dtype=dtypes.int64,
+                                   name="indices")
+    shape = constant_op.constant([10], dtype=dtypes.int64, name="dense_shape")
+    x = sparse_tensor.SparseTensor(indices, values, dense_shape=shape)
+    pred = math_ops.less(1, 2)
+    fn1 = lambda: sparse_tensor.SparseTensor(
+        indices + 1, x.values + 1, dense_shape=shape)
+    fn2 = lambda: sparse_tensor.SparseTensor(
+        indices, x.values - 1, dense_shape=shape)
+    r = control_flow_ops.cond(pred, fn1, fn2)
+    self.assertAllEqual([3.0, 5.0], r.values)
+    self.assertAllEqual([[1], [4]], r.indices)
+    self.assertAllEqual(r.values.get_shape(), (2,))
+
+  def testCondRaggedTensor(self):
+    rt = ragged_factory_ops.constant([[1, 2], [3], [4, 5, 6]])
+    pred = math_ops.less(1, 2)
+    fn1 = lambda: array_ops.concat([rt + 2, [[100]]], axis=0)
+    fn2 = lambda: rt[:2] - 2
+    result = control_flow_ops.cond(pred, fn1, fn2)
+    self.assertAllEqual([3, 4, 5, 6, 7, 8, 100], result.values)
+    self.assertAllEqual([0, 2, 3, 6, 7], result.row_splits)
 
   @test_util.run_v1_only("b/120545219")
   def testCondResource(self):
 
     with self.cached_session():
       rv = resource_variable_ops.ResourceVariable(True)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       t = ops.convert_to_tensor(1.0)
 
       def case():
@@ -453,7 +488,8 @@ class ControlFlowTest(test.TestCase):
         with ops.control_dependencies([assign]):
           return array_ops.identity(t)
 
-      self.assertEqual(1.0, control_flow_ops.cond(rv, case, lambda: t).eval())
+      self.assertEqual(
+          1.0, self.evaluate(control_flow_ops.cond(rv, case, lambda: t)))
 
   @test_util.run_v1_only("b/120545219")
   def testCondWithTensorArrayGrad(self):
@@ -462,7 +498,7 @@ class ControlFlowTest(test.TestCase):
         pred = array_ops.placeholder(dtypes.bool, [])
         x = constant_op.constant([1.0, 2.0, 3.0])
         y = control_flow_ops.cond(
-            pred, lambda: functional_ops.map_fn(lambda z: z * 2.0, x),
+            pred, lambda: map_fn.map_fn(lambda z: z * 2.0, x),
             lambda: constant_op.constant([1.0, 1.0, 1.0]))
         g = gradients_impl.gradients(y, x)[0]
 
@@ -482,8 +518,8 @@ class ControlFlowTest(test.TestCase):
       fn2 = lambda: ops.IndexedSlices(math_ops.subtract(x.values, 1), i_64)
       r = control_flow_ops.cond(pred, fn1, fn2)
 
-      val = r.values.eval()
-      ind = r.indices.eval()
+      val = r.values
+      ind = r.indices
     self.assertAllEqual(11, val)
     self.assertAllEqual(0, ind)
     self.assertTrue(ind.dtype == np.int64)
@@ -564,8 +600,8 @@ class ControlFlowTest(test.TestCase):
 
     if not context.executing_eagerly():
       with self.cached_session():
-        variables.global_variables_initializer().run()
-        result = f().eval()
+        self.evaluate(variables.global_variables_initializer())
+        result = self.evaluate(f())
         self.assertEqual(True, result)
         # Only second cond result was fetched, so v1 assign shouldn't run.
         self.assertEqual(7, self.evaluate(v1))
@@ -604,7 +640,7 @@ class ControlFlowTest(test.TestCase):
       fn2 = lambda: v1
       r = control_flow_ops.cond(pred, fn1, fn2)
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       result = self.evaluate(r)
       self.assertAllEqual(np.array([7]), result)
 
@@ -618,6 +654,27 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(pred, fn1, fn2)
       self.assertAllEqual([11, 12], self.evaluate(r))
 
+  @test_util.run_gpu_only
+  @test_util.run_deprecated_v1
+  def testCond_Device(self):
+    x = constant_op.constant(-10)
+
+    # True branch function defined outside of device scope
+    def true_fn():
+      return math_ops.exp(x)
+
+    with ops.device("CPU:0"):
+      r = control_flow_ops.cond(
+          constant_op.constant(True), true_fn, lambda: 0.)
+      self.assertIn("cpu", r.device.lower())
+
+    with session.Session() as sess:
+      options = config_pb2.RunOptions(output_partition_graphs=True)
+      run_metadata = config_pb2.RunMetadata()
+      sess.run(r, options=options, run_metadata=run_metadata)
+      # We expect that everything runs on CPU, even if GPU is available.
+      self.assertEqual(len(run_metadata.partition_graphs), 1)
+
   def testCondListOutput(self):
     with self.cached_session() as sess:
       x = constant_op.constant(10)
@@ -698,11 +755,12 @@ class ControlFlowTest(test.TestCase):
       fn1 = lambda: {"a": math_ops.add(x, y), "b": math_ops.add(x, y)}
       fn2 = lambda: {"c": y, "d": y}
       v1_msg = "The two structures don't have the same nested structure"
-      v2_msg = "Outputs of true_fn and false_fn must have the same structure"
+      v2_msg = ("true_fn and false_fn arguments to tf.cond must have the same "
+                "number, type, and overall structure of return values.")
       with self.assertRaisesRegexp(
-          ValueError, v2_msg if control_flow_ops.ENABLE_COND_V2 else v1_msg):
-        r = control_flow_ops.cond(pred, fn1, fn2)
-        self.evaluate(r)
+          TypeError if control_flow_util.ENABLE_CONTROL_FLOW_V2 else ValueError,
+          v2_msg if control_flow_util.ENABLE_CONTROL_FLOW_V2 else v1_msg):
+        control_flow_ops.cond(pred, fn1, fn2)
 
   @test_util.run_deprecated_v1
   def testCondRef(self):
@@ -859,7 +917,7 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(sess.run(grad, {pred: False, x: 1.0, y: 2.0}), 0.0)
 
       # v1 control flow gets None second derivative for some reason.
-      if not control_flow_ops.ENABLE_COND_V2:
+      if not control_flow_util.ENABLE_CONTROL_FLOW_V2:
         self.assertIsNone(grad_grad)
         return
 
@@ -893,7 +951,7 @@ class ControlFlowTest(test.TestCase):
       fn2 = lambda: array_ops.gather(v1, [1, 1])
       r = control_flow_ops.cond(pred, fn1, fn2)
       grad = gradients_impl.gradients(r, [v1])[0]
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       # Should just be [1, 1], but possibly a sparse representation
       gv, gi = sess.run([grad.values, grad.indices], feed_dict={c: 1})
       dense_gv = [
@@ -907,6 +965,68 @@ class ControlFlowTest(test.TestCase):
       ]
       self.assertAllEqual(dense_gv, [0.0, 2.0])
 
+  @test_util.run_deprecated_v1
+  def testCondGrad_ResourceVarSparseRead(self):
+    # NOTE(skyewm): this test is interesting because the
+    # ResourceVariable.sparse_read gradient function returns IndexedSlices.
+    var = resource_variable_ops.ResourceVariable(
+        np.ones((4, 2), dtype=np.float32))
+    x = constant_op.constant(1.0)
+    r = control_flow_ops.cond(
+        constant_op.constant(True),
+        lambda: x * math_ops.reduce_sum(var.sparse_read([1, 2])),
+        lambda: constant_op.constant(np.zeros((2, 3)),
+                                     dtype=dtypes.float32))
+    grad = gradients_impl.gradients(r, var)[0]
+
+    self.evaluate(variables.global_variables_initializer())
+    grad_val = self.evaluate(grad)
+    self.assertIsInstance(grad_val, ops.IndexedSlicesValue)
+    self.assertAllEqual(gradient_checker_v2._to_numpy(grad_val), [[0., 0.],
+                                                                  [1., 1.],
+                                                                  [1., 1.],
+                                                                  [0., 0.]])
+
+  def testCondGrad_MultiGather(self):
+    # NOTE(skyewm): this test is interesting because the array_ops.gather and
+    # ResourceVariable.sparse_read gradient functions returns IndexedSlices.
+    var = resource_variable_ops.ResourceVariable(
+        np.ones((4, 2), dtype=np.float32))
+    x1 = constant_op.constant(np.ones((3, 3), dtype=np.float32))
+    x2 = constant_op.constant(2.0)
+
+    def true_fn():
+      y1 = var.sparse_read([1, 2])
+      y2 = array_ops.gather(x1, [2]) * x2
+      y3 = x2 * [1., 1., 1.]
+      return y1, y2, y3
+
+    def false_fn():
+      y1 = np.zeros((2, 2), dtype=np.float32)
+      y2 = array_ops.gather(x1, [2]) * x2
+      y3 = array_ops.gather(x1, [2])
+      return y1, y2, y3
+
+    @def_function.function
+    def foo():
+      r = control_flow_ops.cond(constant_op.constant(True), true_fn, false_fn)
+      return gradients_impl.gradients(r, [var, x1, x2])
+
+    grad = foo()
+    self.evaluate(variables.global_variables_initializer())
+    var_grad, x1_grad, x2_grad = self.evaluate(grad)
+    self.assertIsInstance(var_grad, ops.IndexedSlicesValue)
+    self.assertAllEqual(gradient_checker_v2._to_numpy(var_grad), [[0., 0.],
+                                                                  [1., 1.],
+                                                                  [1., 1.],
+                                                                  [0., 0]])
+    self.assertIsInstance(x1_grad, ops.IndexedSlicesValue)
+    self.assertAllEqual(gradient_checker_v2._to_numpy(x1_grad), [[0., 0., 0.],
+                                                                 [0., 0., 0.],
+                                                                 [2., 2., 2.]])
+    self.assertIsInstance(x1_grad, ops.IndexedSlicesValue)
+    self.assertEqual(gradient_checker_v2._to_numpy(x2_grad), 6.)
+
   @test_util.run_v1_only("b/120545219")
   def testCondPredicateTensor(self):
     """Regression test for lowering predicate from non-first output of an op."""
@@ -918,9 +1038,33 @@ class ControlFlowTest(test.TestCase):
     r = control_flow_ops.cond(foo()[1], lambda: 1.0, lambda: 2.0)
     self.assertEqual(self.evaluate(r), 1.0)
 
-  # TODO(b/117945658): reenable
+  @test_util.run_v1_only("Tests Session.run() pruning logic.")
+  def testCondFeedConstantPredicate(self):
+    with self.cached_session() as sess:
+      value = constant_op.constant(37.0)
+      predicate = constant_op.constant(True)
+      cond_output = control_flow_ops.cond(
+          predicate, lambda: constant_op.constant(0.0), lambda: value)
+      result = array_ops.identity(cond_output)
+      self.assertEqual(37.0, sess.run(result, feed_dict={predicate: False}))
+      self.assertEqual(0.0, sess.run(result, feed_dict={predicate: True}))
+      self.assertEqual(0.0, sess.run(result))
+
+  @test_util.run_v1_only("Tests Session.run() pruning logic.")
+  def testCondFeedPlaceholderWithDefaultPredicate(self):
+    with self.cached_session() as sess:
+      value = constant_op.constant(37.0)
+      predicate = array_ops.placeholder_with_default(
+          constant_op.constant(True), [])
+      cond_output = control_flow_ops.cond(
+          predicate, lambda: constant_op.constant(0.0), lambda: value)
+      result = array_ops.identity(cond_output)
+      self.assertAllEqual(37.0, sess.run(result, feed_dict={predicate: False}))
+      self.assertAllEqual(0.0, sess.run(result, feed_dict={predicate: True}))
+      self.assertAllEqual(0.0, sess.run(result))
+
   @test_util.run_in_graph_and_eager_modes
-  def DISABLED_testCondAutoControlDeps(self):
+  def testCondAutoControlDeps(self):
 
     def branch_fn():
       logging_ops.print_v2("A")
@@ -940,16 +1084,16 @@ class ControlFlowTest(test.TestCase):
     if not context.executing_eagerly():
       with self.cached_session():
         with self.captureWritesToStream(sys.stderr) as printed:
-          self.assertEqual(build_cond().eval(), 10)
+          self.assertEqual(self.evaluate(build_cond()), 10)
         self.assertEqual(printed.contents(), "C\n")
 
         with self.captureWritesToStream(sys.stderr) as printed:
-          self.assertEqual(build_nested_cond().eval(), 10)
+          self.assertEqual(self.evaluate(build_nested_cond()), 10)
         self.assertEqual(printed.contents(), "C\n")
 
     # In defuns, all prints should execute in program order.
     # This doesn't work with legacy control flow.
-    if control_flow_ops.ENABLE_COND_V2:
+    if control_flow_util.ENABLE_CONTROL_FLOW_V2:
 
       @eager_function.defun
       def cond():
@@ -957,7 +1101,8 @@ class ControlFlowTest(test.TestCase):
 
       with self.captureWritesToStream(sys.stderr) as printed:
         self.assertEqual(self.evaluate(cond()), 10)
-      self.assertEqual(printed.contents(), "A\nB\nC\n")
+      self.assertTrue(printed.contents().endswith("A\nB\nC\n"),
+                      printed.contents())
 
       @eager_function.defun
       def nested_cond():
@@ -965,11 +1110,31 @@ class ControlFlowTest(test.TestCase):
 
       with self.captureWritesToStream(sys.stderr) as printed:
         self.assertEqual(self.evaluate(nested_cond()), 10)
-      self.assertEqual(printed.contents(), "A\nB\nC\n")
+      self.assertTrue(printed.contents().endswith("A\nB\nC\n"),
+                      printed.contents())
+
+    # wrap_function should prune.
+    def pruned_cond():
+      return build_cond()
+    pruned_cond = wrap_function.wrap_function(pruned_cond, [])
+
+    with self.captureWritesToStream(sys.stderr) as printed:
+      self.assertEqual(self.evaluate(pruned_cond()), 10)
+    self.assertEqual(printed.contents(), "C\n")
+
+    def pruned_nested_cond():
+      return build_nested_cond()
+    pruned_nested_cond = wrap_function.wrap_function(pruned_nested_cond, [])
+
+    with self.captureWritesToStream(sys.stderr) as printed:
+      self.assertEqual(self.evaluate(pruned_nested_cond()), 10)
+    self.assertEqual(printed.contents(), "C\n")
 
-  # TODO(b/117945658): reenable
   @test_util.run_in_graph_and_eager_modes
-  def DISABLED_testWhileAutoControlDeps(self):
+  def testWhileAutoControlDeps(self):
+    # Legacy while_loop fails this test because it produces deprecation notices
+    # in stderr.
+    if not control_flow_util.ENABLE_CONTROL_FLOW_V2: return
 
     def cond(i, unused_x):
       logging_ops.print_v2("A")
@@ -988,43 +1153,63 @@ class ControlFlowTest(test.TestCase):
 
     def build_nested_while():
       return control_flow_ops.cond(
-          constant_op.constant(True), build_while, lambda: (0, 0))
+          constant_op.constant(True), build_while, lambda: [0, 0])
 
     # In v1 graph mode, pruning should make only "D" print.
     if not context.executing_eagerly():
       with self.cached_session():
         with self.captureWritesToStream(sys.stderr) as printed:
-          self.assertEqual(build_while()[0].eval(), 2)
-        self.assertEqual(printed.contents(), "D\nD\n")
+          self.assertEqual(self.evaluate(build_while()[0]), 2)
+        self.assertTrue(printed.contents().endswith("D\nD\n"),
+                        printed.contents())
 
         with self.captureWritesToStream(sys.stderr) as printed:
-          self.assertEqual(build_nested_while()[0].eval(), 2)
-        self.assertEqual(printed.contents(), "D\nD\n")
+          self.assertEqual(self.evaluate(build_nested_while()[0]), 2)
+        self.assertTrue(printed.contents().endswith("D\nD\n"),
+                        printed.contents())
 
     # In defuns, all prints should execute in program order.
-    # This doesn't work with legacy control flow.
-    if control_flow_ops.ENABLE_WHILE_V2:
+    @eager_function.defun
+    def while_loop():
+      return build_while()[0]
 
-      @eager_function.defun
-      def while_loop():
-        return build_while()[0]
+    with self.captureWritesToStream(sys.stderr) as printed:
+      self.assertEqual(self.evaluate(while_loop()), 2)
+    self.assertTrue(printed.contents().endswith("A\nB\nC\nD\nA\nB\nC\nD\nA\n"),
+                    printed.contents())
+
+    @eager_function.defun
+    def nested_while_loop():
+      return build_nested_while()[0]
 
+    # TODO(b/117840611): calling nested_while_loop fails in eager
+    if not context.executing_eagerly():
       with self.captureWritesToStream(sys.stderr) as printed:
-        self.assertEqual(self.evaluate(while_loop()), 2)
-      self.assertEqual(printed.contents(), "A\nB\nC\nD\nA\nB\nC\nD\nA\n")
+        self.assertEqual(self.evaluate(nested_while_loop()), 2)
+      self.assertTrue(
+          printed.contents().endswith("A\nB\nC\nD\nA\nB\nC\nD\nA\n"),
+          printed.contents())
 
-      @eager_function.defun
-      def nested_while_loop():
-        return build_nested_while()[0]
+    # wrap_function should prune.
+    def pruned_while():
+      return build_while()[0]
+    pruned_while = wrap_function.wrap_function(pruned_while, [])
 
-      # TODO(b/117840611): calling nested_while_loop fails in eager
-      if not context.executing_eagerly():
-        with self.captureWritesToStream(sys.stderr) as printed:
-          self.assertEqual(self.evaluate(nested_while_loop()), 2)
-        self.assertEqual(printed.contents(), "A\nB\nC\nD\nA\nB\nC\nD\nA\n")
+    with self.captureWritesToStream(sys.stderr) as printed:
+      self.assertEqual(self.evaluate(pruned_while()), 2)
+    self.assertTrue(printed.contents().endswith("D\nD\n"), printed.contents())
+
+    def pruned_nested_while():
+      return build_nested_while()[0]
+    pruned_nested_while = wrap_function.wrap_function(pruned_nested_while, [])
+
+    # TODO(b/117840611): calling nested_while_loop fails in eager
+    if not context.executing_eagerly():
+      with self.captureWritesToStream(sys.stderr) as printed:
+        self.assertEqual(self.evaluate(pruned_nested_while()), 2)
+      self.assertTrue(printed.contents().endswith("D\nD\n"), printed.contents())
 
   # Microbenchmark: 256,000 iterations/s.
-  @test_util.disable_control_flow_v2("b/116630618 (Times out)")
   def testWhile_1(self):
     with self.cached_session():
       n = constant_op.constant(0)
@@ -1039,7 +1224,7 @@ class ControlFlowTest(test.TestCase):
     with self.cached_session():
       v = variables.Variable(0.0)
       v.initializer.run()
-      increment = v.assign_add(1.0)
+      increment = v.assign_add(1.0).read_value()
 
       def body_fn(i):
         with ops.control_dependencies([increment]):
@@ -1047,8 +1232,8 @@ class ControlFlowTest(test.TestCase):
 
       result = control_flow_ops.while_loop(cond=lambda i: i < 2,
                                            body=body_fn, loop_vars=[1])
-      self.assertAllEqual(result.eval(), 2)
-      self.assertAllEqual(v.eval(), 1.0)
+      self.assertAllEqual(result, 2)
+      self.assertAllEqual(v.read_value(), 1.0)
 
   @test_util.disable_control_flow_v2("b/79881896 (control deps)")
   @test_util.run_v1_only("b/120545219")
@@ -1056,7 +1241,8 @@ class ControlFlowTest(test.TestCase):
     with self.cached_session():
       v = variables.Variable(0.0)
       v.initializer.run()
-      increment = v.assign_add(1.0)
+      # TODO(apassos): figure out why the reading is necessary here.
+      increment = v.assign_add(1.0).read_value()
 
       def body_fn(unused_i):
         with ops.control_dependencies([increment]):
@@ -1065,7 +1251,7 @@ class ControlFlowTest(test.TestCase):
       result = control_flow_ops.while_loop(cond=lambda i: i < 5,
                                            body=body_fn, loop_vars=[0])
       self.evaluate(result)
-      self.assertAllEqual(v.eval(), 1.0)
+      self.assertAllEqual(self.evaluate(v), 1.0)
 
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
   @test_util.run_v1_only("b/120545219")
@@ -1083,7 +1269,7 @@ class ControlFlowTest(test.TestCase):
 
       r = control_flow_ops.while_loop(c, b, [i, x], parallel_iterations=5)
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       self.assertEqual(r[0].dtype, dtypes.int32)
       self.assertEqual(r[1].dtype, dtypes.int32_ref)
@@ -1141,6 +1327,8 @@ class ControlFlowTest(test.TestCase):
 
   @test_util.run_v1_only("b/120545219")
   def testInvalidMaximumIterationsWhileLoopGradientInXLAContext(self):
+    if control_flow_util.ENABLE_CONTROL_FLOW_V2:
+      self.skipTest("WhileV2 does lazy evaluation of maximum_iterations")
     v = constant_op.constant(1.0)
 
     def inner_body(i, x):
@@ -1161,44 +1349,27 @@ class ControlFlowTest(test.TestCase):
     gs = gradients_impl.gradients(loop_no_xla, v)
     self.evaluate(gs)  # This should execute without error.
 
-    if control_flow_ops.ENABLE_WHILE_V2:
-      xla_context = control_flow_ops.XLAControlFlowContext()
-      xla_context.Enter()
-      with self.assertRaisesRegexp(
-          ValueError,
-          r"maximum_iterations is None. It is required and must be statically "
-          r"known \(e.g. a constant value or known shape dimension\) when "
-          r"building while_loop in XLA context."):
-        loop_no_maxiter = create_while_loop()
-      with self.assertRaisesRegexp(
-          ValueError,
-          r"maximum_iterations must be statically "
-          r"known \(e.g. a constant value or known shape dimension\) when "
-          r"building while_loop in XLA context."):
-        loop_with_maxiter = create_while_loop(maximum_iterations=2)
-      xla_context.Exit()
-    else:
-      xla_context = control_flow_ops.XLAControlFlowContext()
-      xla_context.Enter()
-      loop_no_maxiter = create_while_loop()
-      loop_with_maxiter = create_while_loop(maximum_iterations=2)
-      xla_context.Exit()
+    xla_context = control_flow_ops.XLAControlFlowContext()
+    xla_context.Enter()
+    loop_no_maxiter = create_while_loop()
+    loop_with_maxiter = create_while_loop(maximum_iterations=2)
+    xla_context.Exit()
 
-      with self.assertRaisesRegexp(
-          ValueError,
-          r"Cannot create a gradient accumulator for tensor '.+' inside "
-          r"XLA while_loop because maximum_iterations was not passed to "
-          r"the tf.while_loop call \('.+'\)."):
-        _ = gradients_impl.gradients(loop_no_maxiter, v)
+    with self.assertRaisesRegexp(
+        ValueError,
+        r"Cannot create a gradient accumulator for tensor '.+' inside "
+        r"XLA while_loop because maximum_iterations was not passed to "
+        r"the tf.while_loop call \('.+'\)."):
+      _ = gradients_impl.gradients(loop_no_maxiter, v)
 
-      with self.assertRaisesRegexp(
-          ValueError,
-          r"Cannot create a gradient accumulator for tensor '.+' inside XLA "
-          r"while_loop. maximum_iterations tensor '.+' for while_loop context "
-          r"'.+' must be statically known \(e.g. a constant value or known "
-          r"shape dimension\), or be defined at or outside the while loop "
-          r"context '.*' \(currently defined in '.*'\)"):
-        _ = gradients_impl.gradients(loop_with_maxiter, v)
+    with self.assertRaisesRegexp(
+        ValueError,
+        r"Cannot create a gradient accumulator for tensor '.+' inside XLA "
+        r"while_loop. maximum_iterations tensor '.+' for while_loop context "
+        r"'.+' must be statically known \(e.g. a constant value or known "
+        r"shape dimension\), or be defined at or outside the while loop "
+        r"context '.*' \(currently defined in '.*'\)"):
+      _ = gradients_impl.gradients(loop_with_maxiter, v)
 
   @test_util.run_v1_only("b/120545219")
   def testInvalidMaximumIterationsFromSiblingContextWhileLoopInXLAContext(self):
@@ -1219,14 +1390,11 @@ class ControlFlowTest(test.TestCase):
           lambda i, x: (i + 1, v * x), (0, 1.0),
           maximum_iterations=max_iter_holder[0])
 
-    if control_flow_ops.ENABLE_WHILE_V2:
+    if control_flow_util.ENABLE_CONTROL_FLOW_V2:
       xla_context = control_flow_ops.XLAControlFlowContext()
       xla_context.Enter()
       with self.assertRaisesRegexp(
-          ValueError,
-          r"maximum_iterations must be statically known \(e.g. a constant value"
-          r" or known shape dimension\) when building while_loop in XLA "
-          r"context."):
+          ValueError, r"Tensor.*Placeholder:0.* must be from the same graph.*"):
         loop = create_while_loop()
       xla_context.Exit()
     else:
@@ -1243,7 +1411,7 @@ class ControlFlowTest(test.TestCase):
           r"while loop context '' \(currently defined in 'cond/.+'\)"):
         _ = gradients_impl.gradients(loop, v)
 
-  @test_util.disable_control_flow_v2("b/118457764")
+  @test_util.disable_control_flow_v2("b/123601232")
   @test_util.run_v1_only("b/120545219")
   def testNestedWhileLoopWithMaxItersFromOuterContextInXLAContext(self):
     v = constant_op.constant(1.0)
@@ -1331,7 +1499,7 @@ class ControlFlowTest(test.TestCase):
       d = ops.convert_to_tensor(100)
       r = control_flow_ops.while_loop(lambda i, m, c, o: math_ops.less(i, d),
                                       compute, [i, m, c, o])
-      result = r[3].eval()
+      result = r[3]
     self.assertAllEqual(10100, result)
 
   @test_util.run_deprecated_v1
@@ -1353,7 +1521,7 @@ class ControlFlowTest(test.TestCase):
       s = array_ops.size(x)
       r = control_flow_ops.while_loop(lambda i, m, c, o: math_ops.less(i, s),
                                       compute, [i, m, c, o])
-      result = r[3].eval()
+      result = r[3]
     self.assertAllEqual(42, result)
 
   @test_util.run_v1_only("b/120545219")
@@ -1378,9 +1546,29 @@ class ControlFlowTest(test.TestCase):
                                           tensor_shape.unknown_shape(),
                                           tensor_shape.unknown_shape()
                                       ])
-      result = r[2].eval()
+      result = r[2]
     self.assertAllEqual(np.array([0, 1, 2, 3, 4, 5, 6]), result)
 
+  @test_util.run_gpu_only
+  @test_util.run_deprecated_v1
+  def testWhile_Device(self):
+
+    # Body function defined outside of device scope
+    def body(x):
+      return math_ops.exp(x)
+
+    with ops.device("CPU:0"):
+      r = control_flow_ops.while_loop(
+          lambda x: x < 10, body, [constant_op.constant(-10.)])
+      self.assertIn("cpu", r.device.lower())
+
+    with session.Session() as sess:
+      options = config_pb2.RunOptions(output_partition_graphs=True)
+      run_metadata = config_pb2.RunMetadata()
+      sess.run(r, options=options, run_metadata=run_metadata)
+      # We expect that everything runs on CPU, even if GPU is available.
+      self.assertEqual(len(run_metadata.partition_graphs), 1)
+
   @test_util.disable_control_flow_v2("b/116338794 (buffer_reuse)")
   @test_util.run_v1_only("b/120545219")
   def testBufferForwarding(self):
@@ -1496,35 +1684,95 @@ class ControlFlowTest(test.TestCase):
   @test_util.disable_control_flow_v2("b/116328420 (SparseTensor)")
   @test_util.run_v1_only("b/120545219")
   def testWhileShapeInferenceSparseTensor(self):
-    with self.cached_session():
-      values = constant_op.constant([2.0, 4.0], name="values")
-      indices = constant_op.constant(
-          [[0], [3]], dtype=dtypes.int64, name="indices")
-      shape = constant_op.constant([10], dtype=dtypes.int64, name="dense_shape")
-      i = constant_op.constant(0)
-      x = sparse_tensor.SparseTensor(indices, values, dense_shape=shape)
-
-      def c(i, _):
-        return i < 10
+    values = constant_op.constant([2.0, 4.0], name="values")
+    indices = constant_op.constant([[0], [3]],
+                                   dtype=dtypes.int64,
+                                   name="indices")
+    shape = constant_op.constant([10], dtype=dtypes.int64, name="dense_shape")
+    i = constant_op.constant(0)
+    x = sparse_tensor.SparseTensor(indices, values, dense_shape=shape)
+
+    def c(i, _):
+      return i < 10
+
+    def b1(i, x):  # modifies values.  (shape of components is not changed.)
+      return [
+          i + 1,
+          sparse_tensor.SparseTensor(x.indices, x.values * 2.0, x.dense_shape)
+      ]
 
-      def b(i, x):
-        return [
-            i + 1,
-            sparse_tensor.SparseTensor(x.indices, x.values * 2.0, x.dense_shape)
-        ]
+    def b2(i, x):  # adds new values.  (shape of components is changed.)
+      return [
+          i + 1,
+          sparse_ops.sparse_add(
+              x,
+              sparse_tensor.SparseTensor(
+                  indices=math_ops.cast(
+                      array_ops.fill([1, 1], i), dtypes.int64),
+                  values=array_ops.fill([1], 1.0),
+                  dense_shape=x.dense_shape))
+      ]
 
-      _, r = control_flow_ops.while_loop(c, b, [i, x])
-      self.assertEqual(r.dense_shape.get_shape()[0].value, 1)
+    def b3(i, x):  # modifies rank.  (shape of all components is changed.)
+      return [
+          i + 1,
+          sparse_tensor.SparseTensor(
+              array_ops.concat([x.indices, [[i], [i]]], axis=1), x.values * 2.0,
+              array_ops.concat([x.dense_shape, [10]], axis=0))
+      ]
 
+    # Default shape invariant; b1 only modifies values.
+    _, r = control_flow_ops.while_loop(c, b1, [i, x])
+    self.assertEqual(r.indices.get_shape().as_list(), [None, 1])
+    self.assertEqual(r.values.get_shape().as_list(), [None])
+    self.assertEqual(r.dense_shape.get_shape().as_list(), [1])
+
+    # Default shape invariant; b2 adds new values
+    _, r = control_flow_ops.while_loop(c, b2, [i, x])
+    self.assertEqual(r.indices.get_shape().as_list(), [None, 1])
+    self.assertEqual(r.values.get_shape().as_list(), [None])
+    self.assertEqual(r.dense_shape.get_shape().as_list(), [1])
+
+    # Default shape invariant; b3 modifies rank (which is not allowed).
+    with self.assertRaises(ValueError):
+      _, r = control_flow_ops.while_loop(c, b3, [i, x])
+
+    # Explicit shape invariant, allowing any rank; b1 only modifies values.
+    _, r = control_flow_ops.while_loop(
+        c, b1, [i, x],
+        [i.get_shape(), tensor_shape.TensorShape([None])])
+    self.assertEqual(r.indices.get_shape().as_list(), [None, None])
+    self.assertEqual(r.values.get_shape().as_list(), [None])
+    self.assertEqual(r.dense_shape.get_shape().as_list(), [None])
+
+    # Explicit shape invariant, allowing any rank; b3 modifies rank.
+    _, r = control_flow_ops.while_loop(
+        c, b3, [i, x],
+        [i.get_shape(), tensor_shape.TensorShape([None])])
+    self.assertEqual(r.indices.get_shape().as_list(), [None, None])
+    self.assertEqual(r.values.get_shape().as_list(), [None])
+    self.assertEqual(r.dense_shape.get_shape().as_list(), [None])
+
+    # Shape invariant with ndims=None.  Technically, this isn't supported
+    # according to the docs, but we support it for backwards compatibility.
+    _, r = control_flow_ops.while_loop(
+        c, b1, [i, x],
+        [i.get_shape(), tensor_shape.TensorShape(None)])
+    self.assertEqual(r.indices.get_shape().as_list(), [None, None])
+    self.assertEqual(r.values.get_shape().as_list(), [None])
+    self.assertEqual(r.dense_shape.get_shape().as_list(), [None])
+    _, r = control_flow_ops.while_loop(
+        c, b3, [i, x],
+        [i.get_shape(), tensor_shape.TensorShape(None)])
+    self.assertEqual(r.indices.get_shape().as_list(), [None, None])
+    self.assertEqual(r.values.get_shape().as_list(), [None])
+    self.assertEqual(r.dense_shape.get_shape().as_list(), [None])
+
+    # Explicit shape invariant, with a specific (incompatible) rank.
+    with self.assertRaisesRegexp(ValueError, "is not compatible with"):
       _, r = control_flow_ops.while_loop(
-          c, b, [i, x],
-          [i.get_shape(), tensor_shape.TensorShape([None])])
-      self.assertTrue(r.dense_shape.get_shape()[0].value is None)
-
-      with self.assertRaisesRegexp(ValueError, "is not compatible with"):
-        _, r = control_flow_ops.while_loop(
-            c, b, [i, x],
-            [i.get_shape(), tensor_shape.TensorShape([5])])
+          c, b1, [i, x],
+          [i.get_shape(), tensor_shape.TensorShape([5])])
 
   @test_util.disable_control_flow_v2("b/116282023 (IndexedSlices)")
   @test_util.run_v1_only("b/120545219")
@@ -1546,21 +1794,83 @@ class ControlFlowTest(test.TestCase):
         ]
 
       _, r = control_flow_ops.while_loop(c, b, [i, x])
-      self.assertEqual(r.dense_shape.get_shape()[0].value, 2)
+      self.assertEqual(r.dense_shape.get_shape()[0], 2)
       self.assertEqual(r.values.get_shape(), tensor_shape.TensorShape([2, 2]))
 
       _, r = control_flow_ops.while_loop(
           c, b, [i, x],
           [i.get_shape(), tensor_shape.TensorShape([None, 2])])
-      self.assertEqual(r.dense_shape.get_shape()[0].value, 2)
-      self.assertTrue(r.values.get_shape()[0].value is None)
-      self.assertEqual(r.values.get_shape()[1].value, 2)
+      self.assertEqual(r.dense_shape.get_shape()[0], 2)
+      self.assertEqual(r.values.get_shape().as_list(), [None, 2])
 
       with self.assertRaisesRegexp(ValueError, "is not compatible with"):
         _, r = control_flow_ops.while_loop(
             c, b, [i, x],
             [i.get_shape(), tensor_shape.TensorShape([None, 5])])
 
+  @test_util.disable_control_flow_v2("b/116328420 (RaggedTensor)")
+  def testWhileShapeInferenceRaggedTensor(self):
+    if context.executing_eagerly():
+      self.skipTest("b/116328420")
+    i = constant_op.constant(0)
+    x = ragged_factory_ops.constant([[1, 2], [3], [4, 5, 6]])
+    c = lambda i, _: i < 10
+
+    def b1(i, x):  # Adds new values to rows (but doesn't create new rows)
+      return [
+          i + 1,
+          array_ops.concat([x, x], axis=1)
+      ]
+
+    def b2(i, x):  # Adds new rows.
+      return [
+          i + 1,
+          array_ops.concat([x, x], axis=0)
+      ]
+
+    # Default shape invariant; b1 adds new values to rows.
+    _, r = control_flow_ops.while_loop(c, b1, [i, x])
+    self.assertEqual(r.row_splits.shape.as_list(), [4])
+
+    self.assertTrue(r.values.shape.as_list() in ([6 * 2**10], [None]))
+
+    # Default shape invariant; b2 adds new rows (not allowed).
+    if not context.executing_eagerly():
+      with self.assertRaises(ValueError):
+        _, r = control_flow_ops.while_loop(c, b2, [i, x])
+
+    # Explicit shape invariant; b1 adds new values to rows.
+    _, r = control_flow_ops.while_loop(
+        c, b1, [i, x],
+        [i.get_shape(), tensor_shape.TensorShape([None, None])])
+    self.assertTrue(r.row_splits.shape.as_list() in ([4], [None]))
+    self.assertTrue(r.values.shape.as_list() in ([6 * 2**10], [None]))
+
+    # Explicit shape invariant; b2 adds new rows.
+    _, r = control_flow_ops.while_loop(
+        c, b2, [i, x],
+        [i.get_shape(), tensor_shape.TensorShape([None, None])])
+    self.assertTrue(r.row_splits.shape.as_list() in ([3 * 2**10 + 1], [None]))
+    self.assertTrue(r.values.shape.as_list() in ([6 * 2**10], [None]))
+
+  @test_util.disable_control_flow_v2("b/116328420 (RaggedTensor)")
+  def testWhileShapeInferenceRaggedTensorRaggedRank2(self):
+    if context.executing_eagerly():
+      self.skipTest("b/116328420")
+    i = constant_op.constant(0)
+    x = ragged_factory_ops.constant([[[1, 2], [3], [4, 5, 6]],
+                                     [[], [8, 9, 10]]])
+    c = lambda i, _: i < 10
+    def b(i, x):
+      return [
+          i + 1,
+          array_ops.concat([x, x[..., i:i+1]], axis=-1)
+      ]
+    _, r = control_flow_ops.while_loop(c, b, [i, x])
+    self.assertEqual(r.row_splits.shape.as_list(), [3])
+    self.assertTrue(r.values.row_splits.shape.as_list() in ([6], [None]))
+    self.assertTrue(r.values.values.shape.as_list() in ([49], [None]))
+
   def _testNestedWhile_1(self, use_gpu):
     with self.cached_session(use_gpu=use_gpu):
       n = constant_op.constant(0)
@@ -1633,7 +1943,7 @@ class ControlFlowTest(test.TestCase):
 
       res = control_flow_ops.while_loop(
           condition, body, [n, r], parallel_iterations=1)
-      self.assertAllEqual(12, res[1].eval())
+      self.assertAllEqual(12, res[1])
 
   @test_util.run_deprecated_v1
   def testWhileWithControl_2(self):
@@ -1720,7 +2030,7 @@ class ControlFlowTest(test.TestCase):
             return i + 1
 
       r = control_flow_ops.while_loop(loop_condition, loop_body, (i0,))
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertEqual(4, self.evaluate(r))
       self.assertAllClose(65536.0, self.evaluate(v))
 
@@ -1746,7 +2056,7 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(
           constant_op.constant(False), lambda: constant_op.constant(1.0),
           false_branch)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertEqual(6.0, self.evaluate(r))
       self.assertEqual(99, self.evaluate(v))
 
@@ -1794,7 +2104,6 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(0.0, sess.run(r, {p: False}))
       self.assertEqual([2.0], sess.run(r1, {p: False}))
 
-  @test_util.disable_control_flow_v2("b/116743589")
   @test_util.run_deprecated_v1
   def testCondWhile_3(self):
     self._testCondWhile_3(use_gpu=False)
@@ -1863,7 +2172,7 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(sess.run(grad, {pred: True}), 8.0)
       self.assertEqual(sess.run(grad, {pred: False}), 0.0)
 
-      if not control_flow_ops.ENABLE_WHILE_V2:
+      if not control_flow_util.ENABLE_CONTROL_FLOW_V2:
         return
 
       self.assertEqual(sess.run(grad_grad, {pred: True}), 0.0)
@@ -1889,7 +2198,7 @@ class ControlFlowTest(test.TestCase):
 
       r = control_flow_ops.while_loop(
           loop_iterator, loop_body, [n], parallel_iterations=1)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertEqual(3, self.evaluate(r))
       result = self.evaluate(select)
       self.assertAllClose(np.array([10.0, 10.0, 10.0]), result)
@@ -1915,7 +2224,7 @@ class ControlFlowTest(test.TestCase):
 
       r = control_flow_ops.while_loop(
           loop_iterator, loop_body, [n], parallel_iterations=1)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertEqual(3, self.evaluate(r))
       result1 = self.evaluate(select1)
       self.assertAllClose(np.array([10.0, 10.0, 10.0]), result1)
@@ -1923,7 +2232,7 @@ class ControlFlowTest(test.TestCase):
       self.assertAllClose(np.array([10.0, 10.0, 10.0]), result2)
 
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("b/120545219")
   def testWhileUpdateVariable_3(self):
     with self.cached_session():
       select = variables.Variable([3.0, 4.0, 5.0])
@@ -1941,8 +2250,8 @@ class ControlFlowTest(test.TestCase):
           loop_iterator,
           loop_body, [n, array_ops.identity(select)],
           parallel_iterations=1)
-      variables.global_variables_initializer().run()
-      result = r[1].eval()
+      self.evaluate(variables.global_variables_initializer())
+      result = r[1]
     self.assertAllClose(np.array([10.0, 10.0, 10.0]), result)
 
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
@@ -1951,7 +2260,7 @@ class ControlFlowTest(test.TestCase):
     with self.cached_session():
       var_a = variables.Variable(0, name="a")
       var_b = variables.Variable(0, name="b")
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       c = constant_op.constant(0, name="c")
       asn1 = state_ops.assign_add(var_a, 1, name="a_add")
@@ -1981,7 +2290,7 @@ class ControlFlowTest(test.TestCase):
       # Create some variables.
       var_a = variables.Variable(0, name="a")
       var_b = variables.Variable(0, name="b")
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       # Change condition to check var_b
       def pred(_):
@@ -2013,7 +2322,7 @@ class ControlFlowTest(test.TestCase):
       var_a = variables.Variable(0, name="a")
       var_b = variables.Variable(0, name="b")
       c = constant_op.constant(0)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       # Loop condition
       def pred(i):
@@ -2053,7 +2362,7 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(c, b, [i], parallel_iterations=1)
       self.assertEqual([10], self.evaluate(r))
       for i in xrange(10):
-        self.assertEqual([i], q.dequeue().eval())
+        self.assertEqual([i], self.evaluate(q.dequeue()))
 
   @test_util.run_v1_only("b/120545219")
   def testWhileTimeOut(self):
@@ -2222,7 +2531,6 @@ class ControlFlowTest(test.TestCase):
       self.assertAllClose(216.0, grad_a_val)
       self.assertAllClose(81.0, grad_v_val)
 
-  @test_util.disable_control_flow_v2("b/116630618 (parallel_iters: times out)")
   @test_util.run_deprecated_v1
   def testWhileGrad_Mul(self):
     self._testWhileGrad_Mul(use_gpu=False, p_iters=1)
@@ -2271,8 +2579,8 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(c, b, [v], parallel_iterations=1)
 
       r = gradients_impl.gradients(r, a)
-      variables.global_variables_initializer().run()
-      self.assertAllClose(216.0, r[0].eval())
+      self.evaluate(variables.global_variables_initializer())
+      self.assertAllClose(216.0, r[0])
 
   @test_util.run_deprecated_v1
   def testWhileGrad_ResourceVariable(self):
@@ -2284,8 +2592,180 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(c, b, [v], parallel_iterations=1)
 
       g = gradients_impl.gradients(r, a)
-      variables.global_variables_initializer().run()
-      self.assertAllClose(216.0, g[0].eval())
+      self.evaluate(variables.global_variables_initializer())
+      self.assertAllClose(216.0, g[0])
+
+  def testWhileGrad_ResourceVarInFunctionCall(self):
+
+    @def_function.function
+    def foo(x, var):
+      return x + math_ops.reduce_sum(var.sparse_read([1, 3]))
+
+    @def_function.function
+    def bar(var):
+      r = control_flow_ops.while_loop(
+          lambda i, _: i < 2,
+          lambda i, x: (i + 1, foo(x, var)),
+          [0, 0.0])[1]
+      return gradients_impl.gradients(r, var)[0]
+
+    var = resource_variable_ops.ResourceVariable([1., 2., 3., 4.])
+    self.evaluate(variables.global_variables_initializer())
+    grad = self.evaluate(bar(var))
+    self.assertIsInstance(grad, ops.IndexedSlicesValue)
+    self.assertAllEqual(gradient_checker_v2._to_numpy(grad), [0., 2., 0., 2.])
+
+  def testWhileGrad_ResourceVarInNestedFunctionCall(self):
+
+    @def_function.function
+    def foo(x, var):
+      return x + math_ops.reduce_sum(var.sparse_read([1, 3]))
+
+    @def_function.function
+    def foo2(x, var):
+      return foo(x, var)
+
+    @def_function.function
+    def bar(var):
+      r = control_flow_ops.while_loop(
+          lambda i, _: i < 2,
+          lambda i, x: (i + 1, foo2(x, var)),
+          [0, 0.0])[1]
+      return gradients_impl.gradients(r, var)[0]
+
+    var = resource_variable_ops.ResourceVariable([1., 1., 1., 1.])
+    self.evaluate(variables.global_variables_initializer())
+    grad = self.evaluate(bar(var))
+    self.assertIsInstance(grad, ops.IndexedSlicesValue)
+    self.assertAllEqual(gradient_checker_v2._to_numpy(grad), [0., 2., 0., 2.])
+
+  def testWhileGrad_ResourceVarInLoopInFunctionCall(self):
+
+    @def_function.function
+    def foo(x, var):
+      return control_flow_ops.while_loop(
+          lambda j, _: j < 3,
+          lambda j, y: (j + 1,
+                        y + math_ops.reduce_sum(var.sparse_read([1, 2]))),
+          [0, x])[1]
+
+    @def_function.function
+    def bar(var):
+      r = control_flow_ops.while_loop(
+          lambda i, _: i < 2,
+          lambda i, x: (i + 1, foo(x, var)),
+          [0, 0.0])[1]
+      return gradients_impl.gradients(r, var)[0]
+
+    var = resource_variable_ops.ResourceVariable([1., 1., 1., 1.])
+    self.evaluate(variables.global_variables_initializer())
+    grad = self.evaluate(bar(var))
+    self.assertIsInstance(grad, ops.IndexedSlicesValue)
+    self.assertAllEqual(gradient_checker_v2._to_numpy(grad), [0., 6., 6., 0.])
+
+  def testWhileCondGrad_ResourceVarInFunctionCall(self):
+
+    @def_function.function
+    def foo(x, var):
+      return x + var.sparse_read([1])[0]
+
+    def body(i, x):
+      return (i + 1, control_flow_ops.cond(
+          math_ops.equal(i % 2, 0),
+          lambda: foo(x, var1),
+          lambda: foo(x, var2)))
+
+    @def_function.function
+    def bar(var1, var2):
+      r = control_flow_ops.while_loop(
+          lambda i, _: i < 4, body, [0, 0.0])
+      return gradients_impl.gradients(r, [var1, var2])
+
+    var1 = resource_variable_ops.ResourceVariable([1., 2., 3.])
+    var2 = resource_variable_ops.ResourceVariable([4., 5.])
+    self.evaluate(variables.global_variables_initializer())
+    grads = self.evaluate(bar(var1, var2))
+    self.assertAllEqual(gradient_checker_v2._to_numpy(grads[0]), [0., 2., 0.])
+    self.assertAllEqual(gradient_checker_v2._to_numpy(grads[1]), [0., 2.])
+
+  @test_util.run_deprecated_v1
+  def testWhileGrad_ResourceVarSparseRead(self):
+    # NOTE(skyewm): this test is interesting because the
+    # ResourceVariable.sparse_read gradient function returns an IndexedSlices.
+    var = resource_variable_ops.ResourceVariable(np.ones(5),
+                                                 dtype=dtypes.float32)
+    r = control_flow_ops.while_loop(
+        lambda i, _: i < 3,
+        lambda i, x: (i + 1, x * math_ops.reduce_sum(var.sparse_read([1, 3]))),
+        [0, constant_op.constant(1.0)])[1]
+    grad = gradients_impl.gradients(r, var)[0]
+
+    self.evaluate(variables.global_variables_initializer())
+    grad_val = self.evaluate(grad)
+    self.assertIsInstance(grad_val, ops.IndexedSlicesValue)
+    arr = gradient_checker_v2._to_numpy(grad_val)
+    self.assertAllEqual(arr, [0., 12., 0., 12., 0.])
+
+  @test_util.run_deprecated_v1
+  def testWhileGrad_MultiResourceVarSparseRead(self):
+    # NOTE(skyewm): this test is interesting because the
+    # ResourceVariable.sparse_read gradient function returns an IndexedSlices.
+    var1 = resource_variable_ops.ResourceVariable(np.ones(5),
+                                                  dtype=dtypes.float32)
+    var2 = resource_variable_ops.ResourceVariable(np.ones(3),
+                                                  dtype=dtypes.float32)
+    x1_init = constant_op.constant([0., 0.])
+    x2_init = constant_op.constant(1.)
+    x3_init = constant_op.constant(1.)
+
+    def body(i, unused_x1, x2, x3):
+      y1 = var1.sparse_read([1, 3])
+      y2 = x2 * 2
+      y3 = x3 * math_ops.reduce_sum(var2.sparse_read([0]))
+      return i + 1, y1, y2, y3
+
+    r = control_flow_ops.while_loop(
+        lambda i, x1, x2, x3: i < 3, body,
+        [0, x1_init, x2_init, x3_init])[1:]
+    var1_grad, var2_grad = gradients_impl.gradients(r, [var1, var2])
+
+    self.evaluate(variables.global_variables_initializer())
+    var1_grad_val = self.evaluate(var1_grad)
+    var2_grad_val = self.evaluate(var2_grad)
+    self.assertIsInstance(var1_grad_val, ops.IndexedSlicesValue)
+    self.assertIsInstance(var2_grad_val, ops.IndexedSlicesValue)
+    self.assertAllEqual(gradient_checker_v2._to_numpy(var1_grad_val),
+                        [0., 1., 0., 1., 0.])
+    self.assertAllEqual(gradient_checker_v2._to_numpy(var2_grad_val),
+                        [3., 0., 0.])
+
+  @test_util.run_deprecated_v1
+  def testWhileGrad_Gather(self):
+    # NOTE(skyewm): this test is interesting because the gather gradient
+    # function returns an IndexedSlices.
+    x = constant_op.constant([1., 1., 1., 1., 1.])
+    y = control_flow_ops.while_loop(
+        lambda i, _: i < 3,
+        lambda i, x: (i + 1, x + array_ops.gather(x, [0])),
+        [0, x[:1]])[1]
+    z = y * 3.0
+    grad = gradients_impl.gradients(z, x)[0]
+    self.assertEqual(self.evaluate(y), 8.)
+    self.assertAllEqual(self.evaluate(grad), [24., 0., 0., 0., 0.])
+
+  @test_util.run_deprecated_v1
+  def testWhileGrad_GatherNoFanOut(self):
+    # NOTE(skyewm): this test is interesting because the gather gradient
+    # function returns an IndexedSlices.
+    x = constant_op.constant([1., 1., 1., 1., 1.])
+    y = control_flow_ops.while_loop(
+        lambda i, _: i < 3,
+        lambda i, x: (i + 1, array_ops.gather(x, [0])),
+        [0, x[:1]])[1]
+    z = y * 3.0
+    grad = gradients_impl.gradients(z, x)[0]
+    self.assertEqual(self.evaluate(y), 1.)
+    self.assertAllEqual(self.evaluate(grad), [3., 0., 0., 0., 0.])
 
   @test_util.run_v1_only("b/120545219")
   def testWhileGradInCond(self):
@@ -2359,6 +2839,24 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(i_val, 3)
       self.assertAllClose(x_val, 1.0)
 
+  @test_util.run_gpu_only
+  def testGpuResourceAccess(self):
+    with ops.device(test.gpu_device_name()):
+      var = resource_variable_ops.ResourceVariable(constant_op.constant(3.0))
+
+    @def_function.function
+    def foo():
+      return control_flow_ops.while_loop(
+          lambda i, _: i < 3,
+          lambda i, x: (i + 1, control_flow_ops.cond(
+              constant_op.constant(True),
+              lambda: x + var,
+              lambda: x)),
+          [0, 0.0])[1]
+
+    self.evaluate(variables.global_variables_initializer())
+    self.assertEqual(self.evaluate(foo()), 9.0)
+
   def testNestedResourceAccess(self):
     var = resource_variable_ops.ResourceVariable(constant_op.constant(3.0))
 
@@ -2399,7 +2897,7 @@ class ControlFlowTest(test.TestCase):
     #   outer_loop(x) = g(g(x)) = 4x + 81
     #   outer_loop'(x) = 4
     # Note that v1 control flow gets 4.0 as well if the cond is removed.
-    if control_flow_ops.ENABLE_WHILE_V2 and control_flow_ops.ENABLE_COND_V2:
+    if control_flow_util.ENABLE_CONTROL_FLOW_V2:
       self.assertEqual(grad, 4.0)
 
   def testWhile_NestedInput(self):
@@ -2462,13 +2960,13 @@ class ControlFlowTest(test.TestCase):
       rx, ry = control_flow_ops.while_loop(c, b, [x, y], parallel_iterations=1)
 
       r = gradients_impl.gradients([rx, ry], x)
-      self.assertAllClose(304.0, r[0].eval())
+      self.assertAllClose(304.0, r[0])
       r = gradients_impl.gradients([rx, ry], y)
-      self.assertAllClose(124.0, r[0].eval())
+      self.assertAllClose(124.0, r[0])
       r = gradients_impl.gradients([rx], x)
-      self.assertAllClose(295.0, r[0].eval())
+      self.assertAllClose(295.0, r[0])
       r = gradients_impl.gradients([rx], y)
-      self.assertAllClose(120.0, r[0].eval())
+      self.assertAllClose(120.0, r[0])
 
   @test_util.run_deprecated_v1
   def testWhileGrad_Dependency(self):
@@ -2486,9 +2984,9 @@ class ControlFlowTest(test.TestCase):
       ri, rx = control_flow_ops.while_loop(c, b, [i, x], parallel_iterations=1)
 
       r = gradients_impl.gradients([ri, rx], x)
-      self.assertAllClose(1024.0, r[0].eval())
+      self.assertAllClose(1024.0, r[0])
       r = gradients_impl.gradients([rx], x)
-      self.assertAllClose(1024.0, r[0].eval())
+      self.assertAllClose(1024.0, r[0])
 
   @test_util.disable_control_flow_v2("b/116355153 (back_prop flag)")
   @test_util.run_v1_only("b/120545219")
@@ -2500,7 +2998,7 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(c, b, [v], back_prop=False)
       r = math_ops.add(r, v)
       r = gradients_impl.gradients(r, v)
-      self.assertAllClose(1.0, r[0].eval())
+      self.assertAllClose(1.0, r[0])
 
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
   @test_util.run_v1_only("b/120545219")
@@ -2521,7 +3019,7 @@ class ControlFlowTest(test.TestCase):
           cond=cond, body=body, loop_vars=loop_vars)
       cost = math_ops.reduce_sum(tensors[2])
       grad = gradients_impl.gradients(cost, [variable])
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllClose(np.ones([2, 3]), sess.run(grad[0]))
 
   @test_util.run_deprecated_v1
@@ -2561,7 +3059,7 @@ class ControlFlowTest(test.TestCase):
       _, rx = control_flow_ops.while_loop(c, b, [i, rx], parallel_iterations=1)
 
       r = gradients_impl.gradients([rx], x)
-      self.assertAllClose(1024.0, r[0].eval())
+      self.assertAllClose(1024.0, r[0])
 
   @test_util.run_v1_only("b/120545219")
   def testWhileGrad_ParallelTwoLoops(self):
@@ -2581,7 +3079,7 @@ class ControlFlowTest(test.TestCase):
       rx = math_ops.add(r1, r2)
 
       r = gradients_impl.gradients([rx], x)
-      self.assertAllClose(64.0, r[0].eval())
+      self.assertAllClose(64.0, r[0])
 
   @test_util.run_v1_only("b/120545219")
   def testWhileGrad_OneOutputWithControlDependencyOnSecond(self):
@@ -2683,10 +3181,10 @@ class ControlFlowTest(test.TestCase):
 
       def inner_loop(t):
         fn = lambda n: n + math_ops.square(var)
-        return functional_ops.map_fn(fn=fn, elems=t, parallel_iterations=10)
+        return map_fn.map_fn(fn=fn, elems=t, parallel_iterations=10)
 
       def outer_loop(inp):
-        return functional_ops.map_fn(
+        return map_fn.map_fn(
             fn=inner_loop, elems=inp, parallel_iterations=10)
 
       var = variables.Variable(constant_op.constant(3.0))
@@ -2696,7 +3194,7 @@ class ControlFlowTest(test.TestCase):
       train_op = optimizer.minimize(math_ops.reduce_mean(math_ops.square(res)))
       self.evaluate(variables.global_variables_initializer())
       self.evaluate(train_op)
-      self.assertAllClose(2.999, self.evaluate(var))
+      self.assertAllClose(2.999, var.read_value())
 
   def _testWhileCondGrad_Simple(self, use_gpu):
     with self.cached_session(use_gpu=use_gpu):
@@ -2714,13 +3212,11 @@ class ControlFlowTest(test.TestCase):
       r = gradients_impl.gradients(r, v)[0]
       self.assertAllClose(1024.0, self.evaluate(r))
 
-  @test_util.disable_control_flow_v2("b/117519152")
   @test_util.run_deprecated_v1
   def testWhileCondGrad_Simple(self):
     self._testWhileCondGrad_Simple(use_gpu=False)
     self._testWhileCondGrad_Simple(use_gpu=True)
 
-  @test_util.disable_control_flow_v2("b/117276490")
   @test_util.run_deprecated_v1
   def testWhileCondGrad_UnknownShape(self):
     with self.cached_session() as sess:
@@ -2783,7 +3279,7 @@ class ControlFlowTest(test.TestCase):
       grad_ys = [variables.VariableV1(73)._ref()]  # pylint: disable=protected-access
       grad = gradients_impl.gradients([r[1]], [x], grad_ys=grad_ys)
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       self.assertEqual(r[0].dtype, dtypes.int32)
       self.assertEqual(r[1].dtype, dtypes.float32_ref)
@@ -2841,7 +3337,6 @@ class ControlFlowTest(test.TestCase):
       r = gradients_impl.gradients(r.values, values)[0]
       self.assertAllClose(np.array([1024.0, 1024.0]), self.evaluate(r))
 
-  @test_util.disable_control_flow_v2("b/115920078 (gradients)")
   @test_util.run_v1_only("b/120545219")
   def testCallGradInLoop(self):
     with self.cached_session() as sess:
@@ -2875,13 +3370,44 @@ class ControlFlowTest(test.TestCase):
       def b(i, y):
         return [
             i + 1,
-            functional_ops.map_fn(lambda x: math_ops.multiply(x, param), y)
+            map_fn.map_fn(lambda x: math_ops.multiply(x, param), y)
         ]
 
       r = control_flow_ops.while_loop(c, b, [n0, y0], parallel_iterations=1)
       r = gradients_impl.gradients(r, param)[0]
       self.assertAllClose(107520.0, self.evaluate(r))
 
+  @test_util.run_deprecated_v1
+  def testNestedWhileAndTensorArray(self):
+    n = constant_op.constant(3.0)
+
+    def Body(row, ta):
+
+      def InnerBody(row, col, ta):
+        # Note: row and col are 1-based.
+        ta = ta.write(
+            math_ops.cast(n * (row - 1.) + col - 1., dtypes.int32), row * col)
+        return row, col + 1., ta
+
+      ta = control_flow_ops.while_loop(
+          lambda _, col, _1: col <= n,
+          InnerBody, [row, constant_op.constant(1.), ta],
+          return_same_structure=False)[2]
+      return row + 1., ta
+
+    ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=9)
+    ta = control_flow_ops.while_loop(
+        lambda row, _: row <= n,
+        Body, [constant_op.constant(1.), ta],
+        return_same_structure=False)[1]
+
+    output = array_ops.reshape(ta.stack(), [3, 3])
+    self.assertAllEqual(
+        self.evaluate(output), [[1., 2., 3.], [2., 4., 6.], [3., 6., 9.]])
+    # TODO(b/117675481): This does not work with current TA. Enable with new TA.
+    # grad = gradients_impl.gradients(output, [n])
+    # self.assertEqual(self.evaluate(grad), 3.5)
+
   @test_util.run_deprecated_v1
   def testWhileGrad_StopGrad(self):
     with self.cached_session():
@@ -2972,7 +3498,6 @@ class ControlFlowTest(test.TestCase):
       all_ops = x.graph.get_operations()
       self.assertFalse(any(name in op.name for op in all_ops))
 
-  @test_util.disable_control_flow_v2("b/117954949")
   @test_util.run_deprecated_v1
   def testWhileGradGradFail(self):
     theta = variables.Variable(initial_value=1.)
@@ -2982,7 +3507,7 @@ class ControlFlowTest(test.TestCase):
 
     result = functional_ops.scan(fn, np.array([1., 2., 3.], dtype=np.float32))
     grad_theta = gradients_impl.gradients(result, theta)
-    if not control_flow_ops.ENABLE_WHILE_V2:
+    if not control_flow_util.ENABLE_CONTROL_FLOW_V2:
       with self.assertRaisesRegexp(TypeError, "Second-order gradient"):
         gradients_impl.gradients(grad_theta, theta)
     grad_theta_stopped = array_ops.stop_gradient(grad_theta)
@@ -3043,7 +3568,6 @@ class ControlFlowTest(test.TestCase):
       self.evaluate(q.initializer)
       self.assertAllClose([1., 1.], self.evaluate(dy_dq))
 
-  @test_util.disable_control_flow_v2("b/115920078 (gradients)")
   @test_util.run_v1_only("b/120545219")
   def testIssue16504(self):
     c = constant_op.constant(np.arange(100), dtype=dtypes.float32)
@@ -3092,7 +3616,7 @@ class ControlFlowTest(test.TestCase):
       grads = linalg_ops.norm(gradients_impl.gradients(r, vars_)[0])
       z = math_ops.add(r, array_ops.stop_gradient(math_ops.reduce_sum(grads)))
       result = gradients_impl.gradients(z, vars_)[0]
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertEqual(5.0, self.evaluate(result))
 
   @test_util.run_v1_only("b/120545219")
@@ -3144,14 +3668,14 @@ class ControlFlowTest(test.TestCase):
               x < y: f1,
               x > z: f2
           }, default=f3, exclusive=True)
-      self.assertAllEqual(r1.eval(), 17)
+      self.assertAllEqual(r1, 17)
 
       r2 = control_flow_ops.case([(y > z, f1), (y > x, f2)], default=f3)
-      self.assertAllEqual(r2.eval(), 23)
+      self.assertAllEqual(r2, 23)
 
       # Duplicate events can happen, first one is selected
       r3 = control_flow_ops.case([(x < y, f1), (x < y, f2)], default=f3)
-      self.assertAllEqual(r3.eval(), 17)
+      self.assertAllEqual(r3, 17)
 
       # Duplicate events cause an error if exclusive = True
       r4 = control_flow_ops.case(
@@ -3161,7 +3685,7 @@ class ControlFlowTest(test.TestCase):
 
       # Check that the default is called if none of the others are
       r5 = control_flow_ops.case({x > y: f1}, default=f3)
-      self.assertAllEqual(r5.eval(), -1)
+      self.assertAllEqual(r5, -1)
 
       ran_once = [False, False, False]
 
@@ -3180,7 +3704,7 @@ class ControlFlowTest(test.TestCase):
           [(x < y, break_run_twice(0)), (x > y, break_run_twice(1))],
           default=lambda: constant_op.constant(2))
 
-      self.assertAllEqual(r6.eval(), 0)
+      self.assertAllEqual(r6, 0)
 
   @test_util.run_v1_only("b/120545219")
   def testCaseSideEffects(self):
@@ -3203,17 +3727,17 @@ class ControlFlowTest(test.TestCase):
       r2 = control_flow_ops.case(
           ((x > y, a), (x > y, b)), default=c, exclusive=True)
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllEqual(self.evaluate([v0, v1, v2]), [-1] * 3)
       self.assertEqual(2, self.evaluate(r2))
       self.assertAllEqual(self.evaluate([v0, v1, v2]), [-1, -1, 2])
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllEqual(self.evaluate([v0, v1, v2]), [-1] * 3)
       self.assertEqual(1, self.evaluate(r1))
       self.assertAllEqual(self.evaluate([v0, v1, v2]), [-1, 1, -1])
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllEqual(self.evaluate([v0, v1, v2]), [-1] * 3)
       self.assertEqual(0, self.evaluate(r0))
       self.assertAllEqual(self.evaluate([v0, v1, v2]), [0, -1, -1])
@@ -3236,7 +3760,7 @@ class ControlFlowTest(test.TestCase):
 
       i = control_flow_ops.cond(p, a, b)
       self.assertTrue(isinstance(i, ops.Tensor))
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       self.assertEqual(0, self.evaluate(v))
 
@@ -3494,7 +4018,7 @@ class ControlFlowTest(test.TestCase):
           lambda i, v: [i + 1, script_ops.py_func(func, [v], [dtypes.float32])[0]],
           [constant_op.constant(0), constant_op.constant(2.0, dtypes.float32)],
           [tensor_shape.unknown_shape(), tensor_shape.unknown_shape()])
-      self.assertEqual(r[1].eval(), 65536.0)
+      self.assertEqual(self.evaluate(r[1]), 65536.0)
 
   @test_util.run_v1_only("b/120545219")
   def testWhileFuncBasic(self):
@@ -3511,10 +4035,10 @@ class ControlFlowTest(test.TestCase):
           [tensor_shape.unknown_shape(),
            tensor_shape.unknown_shape()])
       grad = gradients_impl.gradients(r, x)[0]
-      self.assertEqual(r[1].eval(), 65536.0)
-      self.assertEqual(grad.eval(), 524288.0)
+      self.assertEqual(self.evaluate(r[1]), 65536.0)
+      self.assertEqual(self.evaluate(grad), 524288.0)
       # while_v2 does not have stacks.
-      if not control_flow_ops.ENABLE_WHILE_V2:
+      if not control_flow_util.ENABLE_CONTROL_FLOW_V2:
         self.assertEqual(
             len([op for op in x.graph.get_operations() if op.type == "StackV2"
                 ]), 1)
@@ -3564,6 +4088,21 @@ class ControlFlowTest(test.TestCase):
       result = func(qint)
       self.evaluate(result)
 
+  def testSparseIdentity(self):
+    st1 = sparse_tensor.SparseTensor([[0, 5]], ['x'], [10, 10])
+    st2 = control_flow_ops._Identity(st1)
+    self.assertAllEqual(st1.indices, st2.indices)
+    self.assertAllEqual(st1.values, st2.values)
+    self.assertAllEqual(st1.dense_shape, st2.dense_shape)
+
+  def testSparseEnterExit(self):
+    st1 = sparse_tensor.SparseTensor([[0, 5]], ['x'], [10, 10])
+    st2 = control_flow_ops._Enter(st1, "foo_1")
+    st3 = control_flow_ops.exit(st2)
+    self.assertAllEqual(st1.indices, st3.indices)
+    self.assertAllEqual(st1.values, st3.values)
+    self.assertAllEqual(st1.dense_shape, st3.dense_shape)
+
 
 class ControlFlowContextCheckTest(test.TestCase):
 
@@ -3620,14 +4159,14 @@ class ControlFlowContextCheckTest(test.TestCase):
     while_tensor = self._getWhileTensor()
     with self.assertRaisesRegexp(
         ValueError,
-        "Cannot use 'while_1/Add' as input to 'while/Const_1' because they are "
+        "Cannot use 'while/Const_1' as input to 'while_1/Add' because they are "
         "in different while loops. See info log for more details."):
       control_flow_ops.while_loop(lambda i: i < 10,
                                   lambda x: math_ops.add(1, while_tensor), [0])
 
     with self.assertRaisesRegexp(
         ValueError,
-        "Cannot use 'while_2/NextIteration' as input to 'while/Const_1' "
+        "Cannot use 'while/Const_1' as input to 'while_2/NextIteration' "
         "because they are in different while loops. See info log for more "
         "details."):
       control_flow_ops.while_loop(lambda i: i < 10, lambda i: while_tensor, [0])
@@ -3684,7 +4223,7 @@ class ControlFlowContextCheckTest(test.TestCase):
 
     with self.assertRaisesRegexp(
         ValueError,
-        "Cannot use 'cond/while_1/add' as input to 'cond/while/Const_1' because"
+        "Cannot use 'cond/while/Const_1' as input to 'cond/while_1/add' because"
         " they are in different while loops. See info log for more details."):
       control_flow_ops.cond(
           math_ops.less(1, 2), true_fn, lambda: constant_op.constant(0))
@@ -3876,7 +4415,7 @@ class WhileOpBenchmark(test.Benchmark):
     with session.Session() as sess, ops.device(default_device):
       # Get the initial id i, input x, and kernel.
       i, x, kernel = self._getInitVariables()
-      self.evaluate(variables.global_variables_initializer())
+      variables.global_variables_initializer().run()
 
       if static_unroll:
         for _ in xrange(steps):
diff --git a/tensorflow/python/kernel_tests/control_flow_util_v2_test.py b/tensorflow/python/kernel_tests/control_flow_util_v2_test.py
index d0374a77005db4597ddbce76c1d2a3b9ac0e792d..08d3214e288bf873515f0b5a45ddf1e50ee1b281 100644
--- a/tensorflow/python/kernel_tests/control_flow_util_v2_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_util_v2_test.py
@@ -23,6 +23,7 @@ from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import control_flow_util_v2
 from tensorflow.python.platform import test
 
@@ -30,14 +31,11 @@ from tensorflow.python.platform import test
 class ControlFlowUtilV2Test(test.TestCase):
 
   def setUp(self):
-    self._enable_cond_v2_old = control_flow_ops.ENABLE_COND_V2
-    self._enable_while_v2_old = control_flow_ops.ENABLE_WHILE_V2
-    control_flow_ops.ENABLE_COND_V2 = True
-    control_flow_ops.ENABLE_WHILE_V2 = True
+    self._enable_control_flow_v2_old = control_flow_util.ENABLE_CONTROL_FLOW_V2
+    control_flow_util.ENABLE_CONTROL_FLOW_V2 = True
 
   def tearDown(self):
-    control_flow_ops.ENABLE_COND_V2 = self._enable_cond_v2_old
-    control_flow_ops.ENABLE_WHILE_V2 = self._enable_while_v2_old
+    control_flow_util.ENABLE_CONTROL_FLOW_V2 = self._enable_control_flow_v2_old
 
   def _create_control_flow(self, expect_in_defun):
     """Helper method for testInDefun."""
diff --git a/tensorflow/python/kernel_tests/conv1d_test.py b/tensorflow/python/kernel_tests/conv1d_test.py
index e8463323df90bd37d927f88bd41b09bef45de541..4b44bb6c913533b3025692b0eb06d7e2b77bfb9e 100644
--- a/tensorflow/python/kernel_tests/conv1d_test.py
+++ b/tensorflow/python/kernel_tests/conv1d_test.py
@@ -68,7 +68,7 @@ class Conv1DTest(test.TestCase):
       f = constant_op.constant(
           1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
       output = nn_ops.conv1d_transpose(
-          x, f, y_shape, stride=stride, padding="VALID")
+          x, f, y_shape, strides=stride, padding="VALID")
       value = self.evaluate(output)
 
       cache_values = np.zeros(y_shape, dtype=np.float32)
diff --git a/tensorflow/python/kernel_tests/conv1d_transpose_test.py b/tensorflow/python/kernel_tests/conv1d_transpose_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..02ac5af7aae80277d7a93ef0585c1ccb41286bae
--- /dev/null
+++ b/tensorflow/python/kernel_tests/conv1d_transpose_test.py
@@ -0,0 +1,260 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for convolution related functionality in tensorflow.ops.nn."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import gradient_checker
+from tensorflow.python.ops import nn_ops
+import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
+from tensorflow.python.platform import test
+
+
+class Conv1DTransposeTest(test.TestCase):
+
+  def testConv1DTransposeSingleStride(self):
+    with self.cached_session():
+      strides = [1, 1, 1]
+
+      # Input, output: [batch, width, depth]
+      x_shape = [2, 6, 3]
+      y_shape = [2, 6, 2]
+
+      # Filter: [kernel_width, output_depth, input_depth]
+      f_shape = [3, 2, 3]
+
+      x = constant_op.constant(
+          1.0, shape=x_shape, name="x", dtype=dtypes.float32)
+      f = constant_op.constant(
+          1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
+      output = nn_ops.conv1d_transpose(
+          x, f, y_shape, strides=strides, padding="SAME")
+      value = self.evaluate(output)
+
+      for n in xrange(y_shape[0]):
+        for w in xrange(y_shape[1]):
+          for c in xrange(y_shape[2]):
+            target = 2 * 3.0
+            w_in = w > 0 and w < y_shape[1] - 1
+            if w_in:
+              target += 3.0
+            self.assertAllClose(target, value[n, w, c])
+
+  def testConv1DTransposeSame(self):
+    with self.cached_session():
+      strides = [1, 2, 1]
+
+      # Input, output: [batch, width, depth]
+      x_shape = [2, 4, 3]
+      y_shape = [2, 8, 2]
+
+      # Filter: [kernel_width, output_depth, input_depth]
+      f_shape = [3, 2, 3]
+
+      x = constant_op.constant(
+          1.0, shape=x_shape, name="x", dtype=dtypes.float32)
+      f = constant_op.constant(
+          1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
+      output = nn_ops.conv1d_transpose(
+          x, f, y_shape, strides=strides, padding="SAME")
+      value = self.evaluate(output)
+
+      for n in xrange(x_shape[0]):
+        for k in xrange(f_shape[1]):
+          for w in xrange(y_shape[1]):
+            target = 3.0
+            # We add a case for locations divisible by the stride.
+            w_in = w % strides[1] == 0 and w > 0 and w < y_shape[1] - 1
+            if w_in:
+              target += 3.0
+            self.assertAllClose(target, value[n, w, k])
+
+  def testConv1DTransposeValid(self):
+    with self.cached_session():
+      strides = [1, 2, 1]
+
+      # Input, output: [batch, width, depth]
+      x_shape = [2, 4, 3]
+      y_shape = [2, 9, 2]
+
+      # Filter: [kernel_width, output_depth, input_depth]
+      f_shape = [3, 2, 3]
+
+      x = constant_op.constant(
+          1.0, shape=x_shape, name="x", dtype=dtypes.float32)
+      f = constant_op.constant(
+          1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
+      output = nn_ops.conv1d_transpose(
+          x, f, y_shape, strides=strides, padding="VALID")
+      value = self.evaluate(output)
+
+      cache_values = np.zeros(y_shape, dtype=np.float32)
+
+      # The amount of padding added
+      pad = 1
+
+      for n in xrange(x_shape[0]):
+        for k in xrange(f_shape[1]):
+          for w in xrange(pad, y_shape[1] - pad):
+            target = 3.0
+            # We add a case for locations divisible by the stride.
+            w_in = w % strides[1] == 0 and w > pad and w < y_shape[1] - 1 - pad
+            if w_in:
+              target += 3.0
+            cache_values[n, w, k] = target
+
+          # copy values in the border
+          cache_values[n, 0, k] = cache_values[n, 1, k]
+          cache_values[n, -1, k] = cache_values[n, -2, k]
+          cache_values[n, :, k] = cache_values[n, :, k]
+
+    self.assertAllClose(cache_values, value)
+
+  @test_util.run_deprecated_v1
+  def testGradient(self):
+    x_shape = [2, 4, 3]
+    f_shape = [3, 2, 3]
+    y_shape = [2, 8, 2]
+    strides = [1, 2, 1]
+    np.random.seed(1)  # Make it reproducible.
+    x_val = np.random.random_sample(x_shape).astype(np.float64)
+    f_val = np.random.random_sample(f_shape).astype(np.float64)
+    with self.cached_session():
+      x = constant_op.constant(x_val, name="x", dtype=dtypes.float32)
+      f = constant_op.constant(f_val, name="f", dtype=dtypes.float32)
+      output = nn_ops.conv1d_transpose(
+          x, f, y_shape, strides=strides, padding="SAME")
+      err = gradient_checker.compute_gradient_error([x, f], [x_shape, f_shape],
+                                                    output, y_shape)
+    print("conv1d_transpose gradient err = %g " % err)
+    err_tolerance = 0.0005
+    self.assertLess(err, err_tolerance)
+
+  def testConv1DTransposeSingleStrideNCW(self):
+    # `NCW` data format is only supported for CUDA device.
+    if test.is_gpu_available(cuda_only=True):
+      with self.session(use_gpu=True):
+        strides = [1, 1, 1]
+
+        # Input, output: [batch, depth, width]
+        x_shape = [2, 3, 4]
+        y_shape = [2, 2, 4]
+
+        # Filter: [kernel_width, output_depth, input_depth]
+        f_shape = [3, 2, 3]
+
+        x = constant_op.constant(
+            1.0, shape=x_shape, name="x", dtype=dtypes.float32)
+        f = constant_op.constant(
+            1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
+
+        output = nn_ops.conv1d_transpose(
+            x, f, y_shape, strides=strides, padding="SAME", data_format="NCW")
+
+        value = self.evaluate(output)
+        for n in xrange(x_shape[0]):
+          for k in xrange(f_shape[1]):
+            for w in xrange(y_shape[2]):
+              target = 2 * 3.0
+              w_in = w > 0 and w < y_shape[2] - 1
+              if w_in:
+                target += 3.0
+              self.assertAllClose(target, value[n, k, w])
+
+  def testConv1DTransposeSameNCW(self):
+    # `NCW` data format is only supported for CUDA device.
+    if test.is_gpu_available(cuda_only=True):
+      with self.session(use_gpu=True):
+        strides = [1, 1, 2]
+
+        # Input, output: [batch, depth, width]
+        x_shape = [2, 3, 4]
+        y_shape = [2, 2, 8]
+
+        # Filter: [kernel_width, output_depth, input_depth]
+        f_shape = [3, 2, 3]
+
+        x = constant_op.constant(
+            1.0, shape=x_shape, name="x", dtype=dtypes.float32)
+        f = constant_op.constant(
+            1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
+
+        output = nn_ops.conv1d_transpose(
+            x, f, y_shape, strides=strides, padding="SAME", data_format="NCW")
+
+        value = self.evaluate(output)
+        for n in xrange(x_shape[0]):
+          for k in xrange(f_shape[1]):
+            for w in xrange(y_shape[2]):
+              target = 3.0
+              # We add a case for locations divisible by the stride.
+              w_in = w % strides[2] == 0 and w > 0 and w < y_shape[2] - 1
+              if w_in:
+                target += 3.0
+              self.assertAllClose(target, value[n, k, w])
+
+  def testConv1DTransposeValidNCW(self):
+    # `NCW` data format is only supported for CUDA device.
+    if test.is_gpu_available(cuda_only=True):
+      with self.session(use_gpu=True):
+        strides = [1, 1, 2]
+
+        # Input, output: [batch, depth, width]
+        x_shape = [2, 3, 4]
+        y_shape = [2, 2, 9]
+
+        # Filter: [kernel_width, output_depth, input_depth]
+        f_shape = [3, 2, 3]
+
+        x = constant_op.constant(
+            1.0, shape=x_shape, name="x", dtype=dtypes.float32)
+        f = constant_op.constant(
+            1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
+        output = nn_ops.conv1d_transpose(
+            x, f, y_shape, strides=strides, padding="VALID", data_format="NCW")
+
+        value = self.evaluate(output)
+        cache_values = np.zeros(y_shape, dtype=np.float32)
+        # The amount of padding added
+        pad = 1
+        for n in xrange(x_shape[0]):
+          for k in xrange(f_shape[1]):
+            for w in xrange(pad, y_shape[2] - pad):
+              target = 3.0
+              # We add a case for locations divisible by the stride.
+              w_in = w % strides[2] == 0 and w > pad and \
+                     w < y_shape[2] - 1 - pad
+              if w_in:
+                target += 3.0
+              cache_values[n, k, w] = target
+
+            # copy values in the border
+            cache_values[n, k, 0] = cache_values[n, k, 1]
+            cache_values[n, k, -1] = cache_values[n, k, -2]
+            cache_values[n, k, :] = cache_values[n, k, :]
+
+        self.assertAllClose(cache_values, value)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/conv_ops_test.py b/tensorflow/python/kernel_tests/conv_ops_test.py
index 2f6f3bb383b381de1dac78cc72882fe5fe4291c9..732d870c3ae11f368b76a62be4ef68fe525b7d19 100644
--- a/tensorflow/python/kernel_tests/conv_ops_test.py
+++ b/tensorflow/python/kernel_tests/conv_ops_test.py
@@ -26,13 +26,18 @@ import numpy as np
 
 from six.moves import xrange  # pylint: disable=redefined-builtin
 from tensorflow.contrib import layers
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session as session_lib
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import nn_impl
@@ -165,6 +170,12 @@ class Conv2DTest(test.TestCase):
       # as we will be using its gradients as reference for fp16 gradients.
       return [dtypes.float32, dtypes.float16, dtypes.float64]
 
+  def _CreateNumpyTensor(self, shape):
+    total_size = 1
+    for s in shape:
+      total_size *= s
+    return np.arange(1, total_size + 1, dtype=np.float32).reshape(shape)
+
   def _SetupValuesForDevice(self, tensor_in_sizes, filter_in_sizes, dilations,
                             strides, padding, data_format, dtype, use_gpu):
     """Verifies the output values of the convolution function.
@@ -183,26 +194,22 @@ class Conv2DTest(test.TestCase):
     Returns:
       Symbolic tensor value that can be used to execute the computation
     """
-    total_size_1 = 1
-    total_size_2 = 1
-    for s in tensor_in_sizes:
-      total_size_1 *= s
-    for s in filter_in_sizes:
-      total_size_2 *= s
-    # Initializes the input tensor with array containing incrementing
-    # numbers from 1.
-    x1 = [f * 1.0 for f in range(1, total_size_1 + 1)]
-    x2 = [f * 1.0 for f in range(1, total_size_2 + 1)]
+    x1 = self._CreateNumpyTensor(tensor_in_sizes)
+    x2 = self._CreateNumpyTensor(filter_in_sizes)
 
     with test_util.device(use_gpu):
       t1 = constant_op.constant(x1, shape=tensor_in_sizes, dtype=dtype)
       t2 = constant_op.constant(x2, shape=filter_in_sizes, dtype=dtype)
       strides = [1] + strides + [1]
       dilations = [1] + dilations + [1]
+      if isinstance(padding, (list, tuple)):
+        padding = [(0, 0)] + padding + [(0, 0)]
       if data_format == "NCHW":
         t1 = test_util.NHWCToNCHW(t1)
         strides = test_util.NHWCToNCHW(strides)
         dilations = test_util.NHWCToNCHW(dilations)
+        if isinstance(padding, (list, tuple)):
+          padding = test_util.NHWCToNCHW(padding)
       conv = nn_ops.conv2d(
           t1,
           t2,
@@ -249,22 +256,13 @@ class Conv2DTest(test.TestCase):
       tensors.append(_SetupVal(data_format, use_gpu))
     values = self.evaluate(tensors)
     for i in range(1, len(values)):
-      self.assertAllClose(values[0], values[i], rtol=1e-5, atol=1e-5)
+      self.assertAllClose(values[0], values[i], rtol=1e-3, atol=1e-3)
 
   def _ComputeReferenceDilatedConv(self, tensor_in_sizes, filter_in_sizes,
                                    stride, dilation, padding, data_format,
                                    use_gpu):
-    total_size_1 = 1
-    total_size_2 = 1
-    for s in tensor_in_sizes:
-      total_size_1 *= s
-    for s in filter_in_sizes:
-      total_size_2 *= s
-
-    # Initializes the input tensor with array containing incrementing
-    # numbers from 1.
-    x1 = [f * 1.0 for f in range(1, total_size_1 + 1)]
-    x2 = [f * 1.0 for f in range(1, total_size_2 + 1)]
+    x1 = self._CreateNumpyTensor(tensor_in_sizes)
+    x2 = self._CreateNumpyTensor(filter_in_sizes)
     with test_util.device(use_gpu):
       t1 = constant_op.constant(x1, shape=tensor_in_sizes)
       t2 = constant_op.constant(x2, shape=filter_in_sizes)
@@ -299,7 +297,7 @@ class Conv2DTest(test.TestCase):
     return expected, computed
 
   def _VerifyDilatedConvValues(self, tensor_in_sizes, filter_in_sizes, strides,
-                               padding, dilations):
+                               padding, dilations, rtol=1e-4):
     expected_results = []
     computed_results = []
     for data_format, use_gpu in GetTestConfigs():
@@ -312,16 +310,29 @@ class Conv2DTest(test.TestCase):
       expected_values = self.evaluate(expected_results)
       computed_values = self.evaluate(computed_results)
       for e_value, c_value in zip(expected_values, computed_values):
-        tf_logging.info("expected = ", e_value)
-        tf_logging.info("actual = ", c_value)
+        tf_logging.debug("expected = %s", e_value)
+        tf_logging.debug("actual = %s", c_value)
         self.assertAllClose(
-            e_value.flatten(), c_value.flatten(), atol=tolerance, rtol=1e-4)
+            e_value.flatten(), c_value.flatten(), atol=tolerance, rtol=rtol)
 
-  def _VerifyValues(self, tensor_in_sizes, filter_in_sizes, strides, padding,
-                    expected):
+  def _VerifyValues(self,
+                    tensor_in_sizes,
+                    filter_in_sizes,
+                    strides,
+                    padding,
+                    expected,
+                    dilations=(1, 1),
+                    gpu_only=False,
+                    test_grappler_layout_optimizer=False,
+                    tol=1e-5,
+                    fp16_tol=1e-3):
+    if gpu_only and not test.is_gpu_available(cuda_only=True):
+      return
     tensors = []
-    dilations = [1, 1]
+    dilations = list(dilations)
     for (data_format, use_gpu) in GetTestConfigs():
+      if gpu_only and not use_gpu:
+        continue
       for dtype in self._DtypesToTest(use_gpu):
         result = self._SetupValuesForDevice(
             tensor_in_sizes,
@@ -332,19 +343,71 @@ class Conv2DTest(test.TestCase):
             data_format,
             dtype,
             use_gpu=use_gpu)
+        if test_grappler_layout_optimizer and data_format == "NHWC" and use_gpu:
+          # Grappler's layout optimizer will not optimize a fetch node, so
+          # this identity allows Grappler to optimize the Conv2D node.
+          result = array_ops.identity(result)
         tensors.append(result)
       values = self.evaluate(tensors)
       for i in range(len(tensors)):
         conv = tensors[i]
         value = values[i]
-        tf_logging.info("expected = ", expected)
-        tf_logging.info("actual = ", value)
-        tol = 1e-5
-        if value.dtype == np.float16:
-          tol = 1e-3
-        self.assertAllClose(expected, np.ravel(value), atol=tol, rtol=tol)
+        tf_logging.debug("expected = %s", expected)
+        tf_logging.debug("actual = %s", value)
+        tol_to_use = fp16_tol if value.dtype == np.float16 else tol
+        self.assertAllClose(expected, np.ravel(value), atol=tol_to_use,
+                            rtol=tol_to_use)
         self.assertShapeEqual(value, conv)
 
+  def _VerifyExplicitPaddings(self,
+                              tensor_in_sizes,
+                              filter_in_sizes,
+                              strides,
+                              padding,
+                              dilations=(1, 1),
+                              test_grappler_layout_optimizer=False,
+                              tol=1e-5,
+                              fp16_tol=1e-3):
+    """Verifies Conv2D with explicit padding generates correct values.
+
+    It does this by comparing with Conv2D without explicit padding. This
+    function assumes Conv2D without explicit padding works correctly.
+
+    Args:
+      tensor_in_sizes: Input tensor dimensions in [batch, input_rows,
+        input_cols, input_depth].
+      filter_in_sizes: Filter tensor dimensions in [kernel_rows, kernel_cols,
+        input_depth, output_depth].
+      strides: [row_stride, col_stride] for the convolution;
+      padding: Explicit padding amounts.
+      dilations: Dilation values
+      test_grappler_layout_optimizer: If True, allow the Grappler layout
+        optimizer to run, which turns NHWC Conv2Ds on the GPU to NCHW Conv2Ds.
+      tol: The absolute and relative tolerance for non-fp16 dtypes.
+      fp16_tol: The absolute and relative tolerance for fp16.
+    """
+    input_tensor = self._CreateNumpyTensor(tensor_in_sizes)
+    filter_tensor = self._CreateNumpyTensor(filter_in_sizes)
+    input_tensor = array_ops.pad(input_tensor, [(0, 0)] + padding + [(0, 0)])
+    dilations = list(dilations)
+    conv2d_result = nn_ops.conv2d(
+        input_tensor,
+        filter_tensor, [1] + list(strides) + [1],
+        "VALID",
+        dilations=[1] + dilations + [1])
+    expected = list(self.evaluate(array_ops.reshape(conv2d_result, [-1])))
+    self._VerifyValues(
+        tensor_in_sizes,
+        filter_in_sizes,
+        strides,
+        padding,
+        expected,
+        dilations,
+        gpu_only=True,
+        test_grappler_layout_optimizer=test_grappler_layout_optimizer,
+        tol=tol,
+        fp16_tol=fp16_tol)
+
   @test_util.run_in_graph_and_eager_modes
   def testConv2D1x1Filter(self):
     expected_output = [
@@ -510,6 +573,126 @@ class Conv2DTest(test.TestCase):
         dilations=[2, 2],
         padding="VALID")
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2D0x0Padding(self):
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 2, 3, 3],
+        filter_in_sizes=[2, 2, 3, 3],
+        strides=[1, 1],
+        padding=[[0, 0], [0, 0]])
+
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[3, 4, 3, 2],
+        filter_in_sizes=[1, 1, 2, 1],
+        strides=[2, 2],
+        padding=[[0, 0], [0, 0]])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2D1x1Padding(self):
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 2, 3, 2],
+        filter_in_sizes=[2, 2, 2, 2],
+        strides=[1, 1],
+        padding=[[1, 1], [1, 1]])
+
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 2, 2, 1],
+        filter_in_sizes=[1, 1, 1, 2],
+        strides=[1, 1],
+        padding=[[1, 1], [1, 1]])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2D2x2Padding(self):
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 2, 1, 2],
+        filter_in_sizes=[2, 1, 2, 1],
+        strides=[1, 1],
+        padding=[[2, 2], [2, 2]])
+
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 2, 1, 2],
+        filter_in_sizes=[1, 1, 2, 1],
+        strides=[2, 1],
+        padding=[[2, 2], [2, 2]])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2DOnlyBottomPadding(self):
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 2, 3, 3],
+        filter_in_sizes=[2, 2, 3, 2],
+        strides=[1, 1],
+        padding=[[0, 3], [0, 0]], tol=2e-5)
+
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[2, 2, 4, 3],
+        filter_in_sizes=[1, 2, 3, 2],
+        strides=[2, 2],
+        padding=[[0, 3], [0, 0]])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2DOnlyTopRightPadding(self):
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 2, 3, 3],
+        filter_in_sizes=[2, 2, 3, 2],
+        strides=[1, 1],
+        padding=[[1, 0], [0, 2]],
+        tol=5e-5)
+
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 2, 4, 2],
+        filter_in_sizes=[2, 2, 2, 2],
+        strides=[1, 3],
+        padding=[[1, 0], [0, 2]])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2DLotsPadding(self):
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 1, 1, 3],
+        filter_in_sizes=[2, 2, 3, 3],
+        strides=[1, 1],
+        padding=[[3, 4], [4, 2]])
+
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 2, 1, 1],
+        filter_in_sizes=[2, 2, 1, 3],
+        strides=[2, 1],
+        padding=[[3, 4], [4, 2]])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2DExplicitPaddingWithDilations(self):
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 3, 2, 1],
+        filter_in_sizes=[1, 2, 1, 2],
+        strides=[1, 1],
+        padding=[[1, 0], [0, 1]],
+        dilations=[2, 1])
+
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 2, 3, 2],
+        filter_in_sizes=[3, 2, 2, 1],
+        strides=[1, 1],
+        padding=[[2, 1], [1, 2]],
+        dilations=[2, 3])
+
+  def testConv2DExplicitPaddingWithLayoutOptimizer(self):
+    # Test with Grappler's layout optimizer, to ensure the layout optimizer
+    # handles explicit padding correctly.
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 3, 2, 1],
+        filter_in_sizes=[1, 2, 1, 2],
+        strides=[1, 1],
+        padding=[[1, 0], [0, 1]],
+        dilations=[2, 1],
+        test_grappler_layout_optimizer=True)
+
+    self._VerifyExplicitPaddings(
+        tensor_in_sizes=[1, 2, 3, 2],
+        filter_in_sizes=[3, 2, 2, 1],
+        strides=[1, 1],
+        padding=[[2, 1], [1, 2]],
+        dilations=[2, 3],
+        test_grappler_layout_optimizer=True)
+
   # TODO(yzhwang): this currently fails.
   # self._VerifyValues(tensor_in_sizes=[1, 8, 8, 1],
   #                   filter_in_sizes=[2, 2, 1, 1],
@@ -517,19 +700,22 @@ class Conv2DTest(test.TestCase):
   #                   expected=[72, 112, 392, 432])
 
   # Testing for backprops
-  def _RunAndVerifyBackpropInput(self, input_sizes, filter_sizes, output_sizes,
-                                 strides, padding, expected, data_format,
-                                 use_gpu, err):
-    total_output_size = 1
-    total_filter_size = 1
-    for s in output_sizes:
-      total_output_size *= s
-    for s in filter_sizes:
-      total_filter_size *= s
-    # Initializes the input tensor with array containing incrementing
-    # numbers from 1.
-    x1 = [f * 1.0 for f in range(1, total_filter_size + 1)]
-    x2 = [f * 1.0 for f in range(1, total_output_size + 1)]
+  def _RunAndVerifyBackpropInput(self,
+                                 input_sizes,
+                                 filter_sizes,
+                                 output_sizes,
+                                 strides,
+                                 padding,
+                                 expected,
+                                 data_format,
+                                 use_gpu,
+                                 err,
+                                 dilations=(1, 1)):
+    if use_gpu and not test.is_gpu_available(cuda_only=True):
+      return
+    x1 = self._CreateNumpyTensor(filter_sizes)
+    x2 = self._CreateNumpyTensor(output_sizes)
+    dilations = list(dilations)
     with test_util.device(use_gpu):
       if data_format == "NCHW":
         input_sizes = test_util.NHWCToNCHW(input_sizes)
@@ -537,18 +723,30 @@ class Conv2DTest(test.TestCase):
       t1 = constant_op.constant(x1, shape=filter_sizes)
       t2 = constant_op.constant(x2, shape=output_sizes)
       strides = [1] + strides + [1]
+      dilations = [1] + dilations + [1]
+      if isinstance(padding, (list, tuple)):
+        padding = [(0, 0)] + padding + [(0, 0)]
       if data_format == "NCHW":
         t2 = test_util.NHWCToNCHW(t2)
         strides = test_util.NHWCToNCHW(strides)
+        dilations = test_util.NHWCToNCHW(dilations)
+        if isinstance(padding, (list, tuple)):
+          padding = test_util.NHWCToNCHW((padding))
       conv = nn_ops.conv2d_backprop_input(
-          t0, t1, t2, strides=strides, padding=padding, data_format=data_format)
+          t0,
+          t1,
+          t2,
+          strides=strides,
+          padding=padding,
+          data_format=data_format,
+          dilations=dilations)
       if data_format == "NCHW":
         conv = test_util.NCHWToNHWC(conv)
       # "values" consists of two tensors for two backprops
       value = self.evaluate(conv)
       self.assertShapeEqual(value, conv)
-    tf_logging.info("expected = ", expected)
-    tf_logging.info("actual = ", value)
+    tf_logging.debug("expected = %s", expected)
+    tf_logging.debug("actual = %s", value)
     self.assertArrayNear(expected, value.flatten(), err)
 
   def _CompareBackpropInput(self, input_sizes, filter_sizes, output_sizes,
@@ -691,41 +889,51 @@ class Conv2DTest(test.TestCase):
           err=1e-5)
 
   # Testing for backprops
-  def _RunAndVerifyBackpropFilter(self, input_sizes, filter_sizes, output_sizes,
-                                  strides, padding, expected, data_format,
-                                  use_gpu):
-    total_input_size = 1
-    total_output_size = 1
-    for s in input_sizes:
-      total_input_size *= s
-    for s in output_sizes:
-      total_output_size *= s
-    # Initializes the input tensor with array containing incrementing
-    # numbers from 1.
-    x0 = [f * 1.0 for f in range(1, total_input_size + 1)]
-    x2 = [f * 1.0 for f in range(1, total_output_size + 1)]
+  def _RunAndVerifyBackpropFilter(self,
+                                  input_sizes,
+                                  filter_sizes,
+                                  output_sizes,
+                                  strides,
+                                  padding,
+                                  expected,
+                                  data_format,
+                                  use_gpu,
+                                  dilations=(1, 1),
+                                  err=1e-5):
+    x0 = self._CreateNumpyTensor(input_sizes)
+    x2 = self._CreateNumpyTensor(output_sizes)
+    dilations = list(dilations)
+    explicit_strides = [1] + strides + [1]
+    new_padding = padding
+    new_dilations = [1] + dilations + [1]
+    if isinstance(new_padding, (list, tuple)):
+      new_padding = [(0, 0)] + new_padding + [(0, 0)]
+    if data_format == "NCHW":
+      explicit_strides = test_util.NHWCToNCHW(explicit_strides)
+      new_dilations = test_util.NHWCToNCHW(new_dilations)
+      if isinstance(padding, (list, tuple)):
+        new_padding = test_util.NHWCToNCHW(new_padding)
     for dtype in self._DtypesToTest(use_gpu=use_gpu):
       with test_util.device(use_gpu):
         t0 = constant_op.constant(x0, shape=input_sizes, dtype=dtype)
         t1 = constant_op.constant(filter_sizes, shape=[len(filter_sizes)])
         t2 = constant_op.constant(x2, shape=output_sizes, dtype=dtype)
-        explicit_strides = [1] + strides + [1]
         if data_format == "NCHW":
           t0 = test_util.NHWCToNCHW(t0)
           t2 = test_util.NHWCToNCHW(t2)
-          explicit_strides = test_util.NHWCToNCHW(explicit_strides)
         conv = nn_ops.conv2d_backprop_filter(
             t0,
             t1,
             t2,
             strides=explicit_strides,
-            padding=padding,
+            padding=new_padding,
+            dilations=new_dilations,
             data_format=data_format)
         value = self.evaluate(conv)
         self.assertShapeEqual(value, conv)
-      tf_logging.info("expected = ", expected)
-      tf_logging.info("actual = ", value)
-      self.assertArrayNear(expected, value.flatten(), 1e-5)
+      tf_logging.debug("expected = %s", expected)
+      tf_logging.debug("actual = %s", value)
+      self.assertArrayNear(expected, value.flatten(), err)
 
   def _CompareBackFilter(self, input_sizes, filter_sizes, output_sizes,
                          conv_strides, padding):
@@ -866,16 +1074,8 @@ class Conv2DTest(test.TestCase):
   def _RunAndVerifyBackpropInputDilation(self, input_sizes, filter_sizes,
                                          output_sizes, strides, dilations,
                                          padding, data_format, use_gpu, err):
-    total_input_size = 1
-    total_filter_size = 1
-    for s in input_sizes:
-      total_input_size *= s
-    for s in filter_sizes:
-      total_filter_size *= s
-    # Initializes the input tensor with array containing incrementing
-    # numbers from 1.
-    x1 = [f * 1.0 for f in range(1, total_input_size + 1)]
-    x2 = [f * 1.0 for f in range(1, total_filter_size + 1)]
+    x1 = self._CreateNumpyTensor(input_sizes)
+    x2 = self._CreateNumpyTensor(filter_sizes)
     default_dilations = (dilations[0] == 1 and dilations[1] == 1)
     if default_dilations or use_gpu:
       with self.cached_session(use_gpu=use_gpu) as sess:
@@ -912,24 +1112,16 @@ class Conv2DTest(test.TestCase):
         value_2 = self.evaluate(conv_2)
         self.assertShapeEqual(value, conv)
         self.assertShapeEqual(value_2, conv_2)
-      tf_logging.info("expected = ", value_2)
-      tf_logging.info("actual = ", value)
+      tf_logging.debug("expected = %s", value_2)
+      tf_logging.debug("actual = %s", value)
       self.assertArrayNear(value_2.flatten(), value.flatten(), err)
 
   # Testing for backprops
   def _RunAndVerifyBackpropFilterDilation(self, input_sizes, filter_sizes,
                                           output_sizes, strides, dilations,
                                           padding, data_format, use_gpu, err):
-    total_input_size = 1
-    total_filter_size = 1
-    for s in input_sizes:
-      total_input_size *= s
-    for s in filter_sizes:
-      total_filter_size *= s
-    # Initializes the input tensor with array containing incrementing
-    # numbers from 1.
-    x1 = [f * 1.0 for f in range(1, total_input_size + 1)]
-    x2 = [f * 1.0 for f in range(1, total_filter_size + 1)]
+    x1 = self._CreateNumpyTensor(input_sizes)
+    x2 = self._CreateNumpyTensor(filter_sizes)
     default_dilations = (dilations[0] == 1 and dilations[1] == 1)
     if default_dilations or use_gpu:
       with self.cached_session(use_gpu=use_gpu) as sess:
@@ -965,8 +1157,8 @@ class Conv2DTest(test.TestCase):
         value_2 = self.evaluate(conv_2)
         self.assertShapeEqual(value, conv)
         self.assertShapeEqual(value_2, conv_2)
-      tf_logging.info("expected = ", value_2)
-      tf_logging.info("actual = ", value)
+      tf_logging.debug("expected = %s", value_2)
+      tf_logging.debug("actual = %s", value)
       self.assertArrayNear(value_2.flatten(), value.flatten(), err)
 
   def testConv2D2x2Depth3ValidBackpropFilterStride1x1Dilation2x1(self):
@@ -1111,20 +1303,347 @@ class Conv2DTest(test.TestCase):
             use_gpu=use_gpu,
             err=1e-5)
 
+  def _RunAndVerifyBackpropInputExplicitPadding(self,
+                                                input_sizes,
+                                                filter_sizes,
+                                                output_sizes,
+                                                strides,
+                                                padding,
+                                                data_format,
+                                                dilations=(1, 1),
+                                                err=2e-5):
+    x1 = self._CreateNumpyTensor(filter_sizes)
+    x2 = self._CreateNumpyTensor(output_sizes)
+    dilations = list(dilations)
+    padded_input_sizes = input_sizes[:]
+    padded_input_sizes[1] += padding[0][0] + padding[0][1]
+    padded_input_sizes[2] += padding[1][0] + padding[1][1]
+    c = nn_ops.conv2d_backprop_input(
+        padded_input_sizes,
+        x1,
+        x2,
+        strides=[1] + strides + [1],
+        padding="VALID",
+        dilations=[1] + dilations + [1])
+    c = c[:, padding[0][0]:(c.shape[1] - padding[0][1]), padding[1][0]:(
+        c.shape[2] - padding[1][1]), :]
+    expected = list(self.evaluate(array_ops.reshape(c, [-1])))
+    self._RunAndVerifyBackpropInput(
+        input_sizes,
+        filter_sizes,
+        output_sizes,
+        strides,
+        padding,
+        expected,
+        data_format,
+        use_gpu=True,
+        err=err,
+        dilations=dilations)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2D2x2Depth1Padding0x0BackpropInput(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self._RunAndVerifyBackpropInputExplicitPadding(
+            input_sizes=[1, 2, 3, 1],
+            filter_sizes=[2, 2, 1, 1],
+            output_sizes=[1, 1, 2, 1],
+            strides=[1, 1],
+            padding=[[0, 0], [0, 0]],
+            data_format=data_format)
+
+        self._RunAndVerifyBackpropInputExplicitPadding(
+            input_sizes=[1, 3, 4, 2],
+            filter_sizes=[2, 2, 2, 3],
+            output_sizes=[1, 1, 2, 3],
+            strides=[2, 2],
+            padding=[[0, 0], [0, 0]],
+            data_format=data_format)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2D2x2Depth1Padding1x1BackpropInput(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self._RunAndVerifyBackpropInputExplicitPadding(
+            input_sizes=[1, 2, 3, 1],
+            filter_sizes=[2, 2, 1, 2],
+            output_sizes=[1, 3, 4, 2],
+            strides=[1, 1],
+            padding=[[1, 1], [1, 1]],
+            data_format=data_format, err=1e-4)
+
+        self._RunAndVerifyBackpropInputExplicitPadding(
+            input_sizes=[1, 2, 3, 2],
+            filter_sizes=[1, 1, 2, 1],
+            output_sizes=[1, 4, 3, 1],
+            strides=[1, 2],
+            padding=[[1, 1], [1, 1]],
+            data_format=data_format)
+
+        self._RunAndVerifyBackpropInputExplicitPadding(
+            input_sizes=[1, 4, 3, 1],
+            filter_sizes=[2, 2, 1, 1],
+            output_sizes=[1, 4, 2, 1],
+            strides=[1, 2],
+            padding=[[1, 1], [1, 1]],
+            data_format=data_format,
+            dilations=[2, 2])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2D2x2Depth1Padding2x2BackpropInput(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self._RunAndVerifyBackpropInputExplicitPadding(
+            input_sizes=[2, 3, 1, 1],
+            filter_sizes=[2, 1, 1, 1],
+            output_sizes=[2, 2, 5, 1],
+            strides=[3, 1],
+            padding=[[2, 2], [2, 2]],
+            data_format=data_format)
+
+        self._RunAndVerifyBackpropInputExplicitPadding(
+            input_sizes=[1, 3, 6, 1],
+            filter_sizes=[3, 2, 1, 1],
+            output_sizes=[1, 3, 4, 1],
+            strides=[1, 2],
+            padding=[[2, 2], [2, 2]],
+            data_format=data_format,
+            dilations=[2, 3])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2D2x2Depth1Padding_1_8_4_1_BackpropInput(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self._RunAndVerifyBackpropInputExplicitPadding(
+            input_sizes=[1, 2, 3, 1],
+            filter_sizes=[2, 2, 1, 1],
+            output_sizes=[1, 10, 8, 1],
+            strides=[1, 1],
+            padding=[[1, 8], [4, 2]],
+            data_format=data_format, err=5e-5)
+
+        self._RunAndVerifyBackpropInputExplicitPadding(
+            input_sizes=[1, 5, 3, 1],
+            filter_sizes=[3, 2, 1, 1],
+            output_sizes=[1, 4, 8, 1],
+            strides=[3, 1],
+            padding=[[1, 8], [4, 2]],
+            data_format=data_format)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2D2x2Depth1Padding_5_0_2_2_BackpropInput(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self._RunAndVerifyBackpropInputExplicitPadding(
+            input_sizes=[1, 3, 3, 1],
+            filter_sizes=[2, 1, 1, 1],
+            output_sizes=[1, 7, 7, 1],
+            strides=[1, 1],
+            padding=[[5, 0], [2, 2]],
+            data_format=data_format,
+            err=5e-5)
+
+        self._RunAndVerifyBackpropInputExplicitPadding(
+            input_sizes=[1, 4, 2, 1],
+            filter_sizes=[3, 3, 1, 1],
+            output_sizes=[1, 5, 2, 1],
+            strides=[1, 2],
+            padding=[[5, 0], [2, 2]],
+            data_format=data_format,
+            dilations=[2, 1])
+
+  def _RunAndVerifyBackpropFilterExplicitPadding(self,
+                                                 input_sizes,
+                                                 filter_sizes,
+                                                 output_sizes,
+                                                 strides,
+                                                 padding,
+                                                 data_format,
+                                                 dilations=(1, 1),
+                                                 err=1e-5):
+    x0 = self._CreateNumpyTensor(input_sizes)
+    x2 = self._CreateNumpyTensor(output_sizes)
+    dilations = list(dilations)
+
+    x0 = np.pad(x0, [(0, 0)] + padding + [(0, 0)], "constant")
+    c = nn_ops.conv2d_backprop_filter(
+        x0,
+        filter_sizes,
+        x2,
+        strides=[1] + strides + [1],
+        padding="VALID",
+        dilations=[1] + dilations + [1])
+    expected = list(self.evaluate(array_ops.reshape(c, [-1])))
+    self._RunAndVerifyBackpropFilter(
+        input_sizes,
+        filter_sizes,
+        output_sizes,
+        strides,
+        padding,
+        expected,
+        data_format,
+        use_gpu=True,
+        dilations=dilations,
+        err=err)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2D2x2Depth1Padding0x0BackpropFilter(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self._RunAndVerifyBackpropFilterExplicitPadding(
+            input_sizes=[1, 2, 3, 1],
+            filter_sizes=[2, 2, 1, 1],
+            output_sizes=[1, 1, 2, 1],
+            strides=[1, 1],
+            padding=[[0, 0], [0, 0]],
+            data_format=data_format)
+
+        self._RunAndVerifyBackpropFilterExplicitPadding(
+            input_sizes=[1, 3, 4, 2],
+            filter_sizes=[2, 2, 2, 3],
+            output_sizes=[1, 1, 2, 3],
+            strides=[2, 2],
+            padding=[[0, 0], [0, 0]],
+            data_format=data_format)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2D2x2Depth1Padding1x1BackpropFilter(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self._RunAndVerifyBackpropFilterExplicitPadding(
+            input_sizes=[1, 2, 3, 1],
+            filter_sizes=[2, 2, 1, 2],
+            output_sizes=[1, 3, 4, 2],
+            strides=[1, 1],
+            padding=[[1, 1], [1, 1]],
+            data_format=data_format,
+            err=5e-5)
+
+        self._RunAndVerifyBackpropFilterExplicitPadding(
+            input_sizes=[1, 2, 3, 2],
+            filter_sizes=[1, 1, 2, 1],
+            output_sizes=[1, 4, 3, 1],
+            strides=[1, 2],
+            padding=[[1, 1], [1, 1]],
+            data_format=data_format)
+
+        self._RunAndVerifyBackpropFilterExplicitPadding(
+            input_sizes=[1, 4, 3, 1],
+            filter_sizes=[2, 2, 1, 1],
+            output_sizes=[1, 4, 2, 1],
+            strides=[1, 2],
+            padding=[[1, 1], [1, 1]],
+            data_format=data_format,
+            dilations=[2, 2])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2D2x2Depth1Padding2x2BackpropFilter(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self._RunAndVerifyBackpropFilterExplicitPadding(
+            input_sizes=[2, 3, 1, 1],
+            filter_sizes=[2, 1, 1, 1],
+            output_sizes=[2, 2, 5, 1],
+            strides=[3, 1],
+            padding=[[2, 2], [2, 2]],
+            data_format=data_format)
+
+        self._RunAndVerifyBackpropFilterExplicitPadding(
+            input_sizes=[1, 3, 6, 1],
+            filter_sizes=[3, 2, 1, 1],
+            output_sizes=[1, 3, 4, 1],
+            strides=[1, 2],
+            padding=[[2, 2], [2, 2]],
+            data_format=data_format,
+            dilations=[2, 3])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2D2x2Depth1Padding_1_8_4_1_BackpropFilter(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self._RunAndVerifyBackpropFilterExplicitPadding(
+            input_sizes=[1, 2, 3, 1],
+            filter_sizes=[2, 2, 1, 1],
+            output_sizes=[1, 10, 8, 1],
+            strides=[1, 1],
+            padding=[[1, 8], [4, 2]],
+            data_format=data_format,
+            err=1e-4)
+
+        self._RunAndVerifyBackpropFilterExplicitPadding(
+            input_sizes=[1, 5, 3, 1],
+            filter_sizes=[3, 2, 1, 1],
+            output_sizes=[1, 4, 8, 1],
+            strides=[3, 1],
+            padding=[[1, 8], [4, 2]],
+            data_format=data_format)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testConv2D2x2Depth1Padding_5_0_2_2_BackpropFilter(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self._RunAndVerifyBackpropFilterExplicitPadding(
+            input_sizes=[1, 3, 3, 1],
+            filter_sizes=[2, 1, 1, 1],
+            output_sizes=[1, 7, 7, 1],
+            strides=[1, 1],
+            padding=[[5, 0], [2, 2]],
+            data_format=data_format,
+            err=1e-4)
+
+        self._RunAndVerifyBackpropFilterExplicitPadding(
+            input_sizes=[1, 4, 2, 1],
+            filter_sizes=[3, 3, 1, 1],
+            output_sizes=[1, 5, 2, 1],
+            strides=[1, 2],
+            padding=[[5, 0], [2, 2]],
+            data_format=data_format,
+            dilations=[2, 1])
+
   # Gradient checkers
   def ConstructAndTestGradient(self, batch, input_rows, input_cols, filter_rows,
                                filter_cols, in_depth, out_depth, stride_rows,
                                stride_cols, padding, test_input, data_format,
-                               use_gpu):
+                               use_gpu, max_err=0.002):
     input_shape = [batch, input_rows, input_cols, in_depth]
     filter_shape = [filter_rows, filter_cols, in_depth, out_depth]
     # TODO(yangke): re-factor the computation of output shape.
     if padding == "VALID":
       output_rows = (input_rows - filter_rows + stride_rows) // stride_rows
       output_cols = (input_cols - filter_cols + stride_cols) // stride_cols
-    else:
+    elif padding == "SAME":
       output_rows = (input_rows + stride_rows - 1) // stride_rows
       output_cols = (input_cols + stride_cols - 1) // stride_cols
+    else:
+      self.assertIsInstance(padding, (list, tuple))
+      output_rows = (input_rows + padding[1][0] + padding[1][1] - filter_rows +
+                     stride_rows) // stride_rows
+      output_cols = (input_cols + padding[2][0] + padding[2][1] - filter_cols +
+                     stride_cols) // stride_cols
     output_shape = [batch, output_rows, output_cols, out_depth]
     input_size = 1
     for x in input_shape:
@@ -1145,16 +1664,19 @@ class Conv2DTest(test.TestCase):
         filter_tensor = constant_op.constant(
             filter_data, shape=filter_shape, dtype=dtype, name="filter")
         strides = [1, stride_rows, stride_cols, 1]
+        new_padding = padding
         if data_format == "NCHW":
           new_input_tensor = test_util.NHWCToNCHW(input_tensor)
           strides = test_util.NHWCToNCHW(strides)
+          if isinstance(padding, (list, tuple)):
+            new_padding = test_util.NHWCToNCHW(padding)
         else:
           new_input_tensor = input_tensor
         conv = nn_ops.conv2d(
             new_input_tensor,
             filter_tensor,
             strides,
-            padding,
+            new_padding,
             data_format=data_format,
             name="conv")
         if data_format == "NCHW":
@@ -1178,8 +1700,8 @@ class Conv2DTest(test.TestCase):
           # since fp16 numerical gradients are too imprecise.
           err = np.fabs(jacob_t - reference_jacob_t).max()
 
-        tf_logging.info("conv_2d gradient error = ", err)
-        self.assertLess(err, 0.002)
+        tf_logging.debug("conv_2d gradient error = %s", err)
+        self.assertLess(err, max_err)
 
   def testInputGradientValidPaddingStrideOne(self):
     for (data_format, use_gpu) in GetTestConfigs():
@@ -1436,6 +1958,248 @@ class Conv2DTest(test.TestCase):
           data_format=data_format,
           use_gpu=use_gpu)
 
+  def testInputGradient1x1PaddingStrideOne(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self.ConstructAndTestGradient(
+            batch=2,
+            input_rows=5,
+            input_cols=4,
+            filter_rows=3,
+            filter_cols=3,
+            in_depth=2,
+            out_depth=3,
+            stride_rows=1,
+            stride_cols=1,
+            padding=[[0, 0], [1, 1], [1, 1], [0, 0]],
+            test_input=True,
+            data_format=data_format,
+            use_gpu=use_gpu,
+            max_err=0.0025)
+
+  def testFilterGradient1x1PaddingStrideOne(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self.ConstructAndTestGradient(
+            batch=2,
+            input_rows=5,
+            input_cols=4,
+            filter_rows=3,
+            filter_cols=3,
+            in_depth=2,
+            out_depth=3,
+            stride_rows=1,
+            stride_cols=1,
+            padding=[[0, 0], [1, 1], [1, 1], [0, 0]],
+            test_input=False,
+            data_format=data_format,
+            use_gpu=use_gpu)
+
+  def testInputGradient1x1PaddingStrideTwo(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self.ConstructAndTestGradient(
+            batch=2,
+            input_rows=4,
+            input_cols=5,
+            filter_rows=3,
+            filter_cols=3,
+            in_depth=2,
+            out_depth=3,
+            stride_rows=2,
+            stride_cols=2,
+            padding=[[0, 0], [1, 1], [1, 1], [0, 0]],
+            test_input=True,
+            data_format=data_format,
+            use_gpu=use_gpu)
+
+  def testFilterGradient1x1PaddingStrideTwo(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self.ConstructAndTestGradient(
+            batch=2,
+            input_rows=4,
+            input_cols=5,
+            filter_rows=3,
+            filter_cols=3,
+            in_depth=2,
+            out_depth=3,
+            stride_rows=2,
+            stride_cols=2,
+            padding=[[0, 0], [1, 1], [1, 1], [0, 0]],
+            test_input=False,
+            data_format=data_format,
+            use_gpu=use_gpu)
+
+  def testInputGradient2x2PaddingStrideOne(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self.ConstructAndTestGradient(
+            batch=2,
+            input_rows=5,
+            input_cols=4,
+            filter_rows=3,
+            filter_cols=3,
+            in_depth=2,
+            out_depth=3,
+            stride_rows=1,
+            stride_cols=1,
+            padding=[[0, 0], [2, 2], [2, 2], [0, 0]],
+            test_input=True,
+            data_format=data_format,
+            use_gpu=use_gpu)
+
+  def testFilterGradient2x2PaddingStrideOne(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self.ConstructAndTestGradient(
+            batch=2,
+            input_rows=5,
+            input_cols=4,
+            filter_rows=3,
+            filter_cols=3,
+            in_depth=2,
+            out_depth=3,
+            stride_rows=1,
+            stride_cols=1,
+            padding=[[0, 0], [2, 2], [2, 2], [0, 0]],
+            test_input=False,
+            data_format=data_format,
+            use_gpu=use_gpu,
+            max_err=0.003)
+
+  def testInputGradient1_2_3_4PaddingStride3x2(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self.ConstructAndTestGradient(
+            batch=2,
+            input_rows=8,
+            input_cols=5,
+            filter_rows=4,
+            filter_cols=2,
+            in_depth=3,
+            out_depth=2,
+            stride_rows=3,
+            stride_cols=2,
+            padding=[[0, 0], [1, 2], [3, 4], [0, 0]],
+            test_input=True,
+            data_format=data_format,
+            use_gpu=use_gpu)
+
+  def testFilterGradient1_2_3_4PaddingStride3x2(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self.ConstructAndTestGradient(
+            batch=2,
+            input_rows=8,
+            input_cols=5,
+            filter_rows=4,
+            filter_cols=2,
+            in_depth=3,
+            out_depth=2,
+            stride_rows=3,
+            stride_cols=2,
+            padding=[[0, 0], [1, 2], [3, 4], [0, 0]],
+            test_input=False,
+            data_format=data_format,
+            use_gpu=use_gpu)
+
+  def testInputGradient4_3_2_1PaddingStride2x1(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self.ConstructAndTestGradient(
+            batch=3,
+            input_rows=5,
+            input_cols=7,
+            filter_rows=3,
+            filter_cols=2,
+            in_depth=1,
+            out_depth=2,
+            stride_rows=2,
+            stride_cols=1,
+            padding=[[0, 0], [4, 3], [2, 1], [0, 0]],
+            test_input=True,
+            data_format=data_format,
+            use_gpu=use_gpu)
+
+  def testFilterGradient4_3_2_1PaddingStride2x1(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self.ConstructAndTestGradient(
+            batch=3,
+            input_rows=5,
+            input_cols=7,
+            filter_rows=3,
+            filter_cols=2,
+            in_depth=1,
+            out_depth=2,
+            stride_rows=2,
+            stride_cols=1,
+            padding=[[0, 0], [4, 3], [2, 1], [0, 0]],
+            test_input=False,
+            data_format=data_format,
+            use_gpu=use_gpu)
+
+  def testInputGradient0_0_0_5PaddingStride1x2(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self.ConstructAndTestGradient(
+            batch=2,
+            input_rows=6,
+            input_cols=7,
+            filter_rows=3,
+            filter_cols=4,
+            in_depth=3,
+            out_depth=2,
+            stride_rows=1,
+            stride_cols=2,
+            padding=[[0, 0], [0, 0], [0, 5], [0, 0]],
+            test_input=True,
+            data_format=data_format,
+            use_gpu=use_gpu)
+
+  def testFilterGradient0_0_0_5PaddingStride1x2(self):
+    if not test.is_gpu_available(cuda_only=True):
+      return
+    for (data_format, use_gpu) in GetTestConfigs():
+      if use_gpu:
+        self.ConstructAndTestGradient(
+            batch=2,
+            input_rows=6,
+            input_cols=7,
+            filter_rows=3,
+            filter_cols=4,
+            in_depth=3,
+            out_depth=2,
+            stride_rows=1,
+            stride_cols=2,
+            padding=[[0, 0], [0, 0], [0, 5], [0, 0]],
+            test_input=False,
+            data_format=data_format,
+            use_gpu=use_gpu)
+
   def testShapeFunctionEdgeCases(self):
     # All shapes unknown.
     c1 = nn_ops.conv2d(
@@ -1473,6 +2237,56 @@ class Conv2DTest(test.TestCase):
           strides=[1, 1, 1, 1],
           padding="SAME")
 
+    # Negative padding.
+    with self.assertRaises(ValueError):
+      nn_ops.conv2d(
+          array_ops.placeholder(dtypes.float32),
+          array_ops.placeholder(dtypes.float32),
+          strides=[1, 1, 1, 1],
+          padding=[[0, 0], [0, -1], [1, 2], [0, 0]])
+
+    # Nonzero padding in nonspatial dimension.
+    with self.assertRaises(ValueError):
+      nn_ops.conv2d(
+          array_ops.placeholder(dtypes.float32),
+          array_ops.placeholder(dtypes.float32),
+          strides=[1, 1, 1, 1],
+          padding=[[1, 0], [0, 0], [0, 0], [0, 0]])
+
+    # Nonzero NCHW padding in nonspatial dimension.
+    with self.assertRaises(ValueError):
+      nn_ops.conv2d(
+          array_ops.placeholder(dtypes.float32),
+          array_ops.placeholder(dtypes.float32),
+          strides=[1, 1, 1, 1],
+          padding=[[0, 0], [0, 1], [0, 0], [0, 0]],
+          data_format="NCHW")
+
+    # Wrong amount of padding
+    with self.assertRaises(ValueError):
+      nn_ops.conv2d(
+          array_ops.placeholder(dtypes.float32),
+          array_ops.placeholder(dtypes.float32),
+          strides=[1, 1, 1, 1],
+          padding=[[0, 0], [0, 0], [0, 0]])
+
+    # Only specify one padding amount per dimension
+    with self.assertRaises(ValueError):
+      nn_ops.conv2d(
+          array_ops.placeholder(dtypes.float32),
+          array_ops.placeholder(dtypes.float32),
+          strides=[1, 1, 1, 1],
+          padding=[[0], [0], [0], [0]])
+
+    # Explicit padding elements are not lists
+    with self.assertRaises(ValueError):
+      nn_ops.conv2d(
+          array_ops.placeholder(dtypes.float32),
+          array_ops.placeholder(dtypes.float32),
+          strides=[1, 1, 1, 1],
+          padding=[0, 0, 0, 0])
+
+  @test_util.disable_xla("b/123337890")  # Error messages differ
   def testOpEdgeCases(self):
     with self.cached_session() as sess:
       # Illegal strides.
@@ -1513,6 +2327,41 @@ class Conv2DTest(test.TestCase):
                 strides=[1, 1, 1, 1],
                 padding="VALID"))
 
+      # Filter larger than input + padding.
+      with self.assertRaisesRegexp(ValueError, "Negative dimension size"):
+        sess.run(
+            nn_ops.conv2d(
+                array_ops.placeholder(dtypes.float32, shape=[32, 20, 20, 3]),
+                array_ops.placeholder(dtypes.float32, shape=[24, 25, 3, 2]),
+                strides=[1, 1, 1, 1],
+                padding=[[0, 0], [2, 2], [2, 2], [0, 0]]))
+
+    if test.is_gpu_available(cuda_only=True):
+      with self.test_session(use_gpu=True):
+        # Negative padding during backprop.
+        with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                     "nonnegative"):
+          sess.run(
+              nn_ops.conv2d_backprop_input([32, 20, 20, 3],
+                                           array_ops.placeholder(
+                                               dtypes.float32,
+                                               shape=[18, 18, 3, 2]),
+                                           array_ops.placeholder(
+                                               dtypes.float32,
+                                               shape=[32, 3, 2, 2]),
+                                           strides=[1, 1, 1, 1],
+                                           padding=[[0, 0], [-1, 0], [0, 0],
+                                                    [0, 0]]))
+        with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                     "nonnegative"):
+          sess.run(
+              nn_ops.conv2d_backprop_filter(
+                  array_ops.placeholder(dtypes.float32, shape=[32, 20, 20, 3]),
+                  [18, 18, 3, 2],
+                  array_ops.placeholder(dtypes.float32, shape=[32, 3, 2, 2]),
+                  strides=[1, 1, 1, 1],
+                  padding=[[0, 0], [-1, 0], [0, 0], [0, 0]]))
+
 
 class DepthwiseConv2DTest(test.TestCase):
 
@@ -1546,7 +2395,7 @@ class DepthwiseConv2DTest(test.TestCase):
       conv = nn_impl.depthwise_conv2d(
           t1, t2, strides=[1, stride, stride, 1], padding=padding)
       value = self.evaluate(conv)
-    tf_logging.info("value = ", value)
+    tf_logging.debug("value = %s", value)
     self.assertArrayNear(expected, np.ravel(value), 1e-5)
     self.assertShapeEqual(value, conv)
 
@@ -1668,7 +2517,7 @@ class SeparableConv2DTest(test.TestCase):
         conv = array_ops.transpose(conv, [0, 2, 3, 1])
 
       value = self.evaluate(conv)
-    tf_logging.info("value = ", value)
+    tf_logging.debug("value = %s", value)
     self.assertArrayNear(expected, np.ravel(value), 1e-3)
     self.assertShapeEqual(value, conv)
 
@@ -1828,6 +2677,194 @@ class Conv2DBenchmark(test.Benchmark):
             name="conv_stack_iter_%d" % iter_index, wall_time=wall_time)
         tf_logging.info("conv_stack_iter_%d: %.4f" % (iter_index, wall_time))
 
+  def _bench_op(self, name, op, burn_iters, num_iters):
+    config = config_pb2.ConfigProto()
+    # Prevent Grappler from optimizing away the entire graph.
+    config.graph_options.rewrite_options.dependency_optimization = (
+        rewriter_config_pb2.RewriterConfig.OFF)
+    with session_lib.Session(config=config) as session:
+      variables.global_variables_initializer().run()
+      self.run_op_benchmark(
+          session, op, burn_iters=burn_iters, min_iters=num_iters, name=name)
+
+  def benchmarkExplicitVsManualPadding(self):
+    """Compare performance of EXPLICIT padding and calling tf.pad.
+
+    A Conv2D op with EXPLICIT padding is benchmarked, and a tf.pad with the same
+    padding followed by an equivalent Conv2D op is benchmarked.
+    """
+    if not test.is_gpu_available():
+      return
+
+    with ops.Graph().as_default():
+      burn_iters = 15
+      num_iters = 300
+      batch_size = 64
+      # The input and filter correspond to the first layer of Resnet50.
+      input = variables.Variable(  # pylint: disable=redefined-builtin
+          random_ops.random_uniform([
+              batch_size,
+              3,
+              224,
+              224
+          ]))
+      filter = variables.Variable(random_ops.random_uniform([7, 7, 3, 64]))  # pylint: disable=redefined-builtin
+      strides = [1, 1, 2, 2]
+      padding = [(0, 0), (0, 0), (3, 3), (3, 3)]
+      output_explicit_pad = nn_ops.conv2d(
+          input, filter, strides, padding=padding, data_format="NCHW")
+      input_padded = array_ops.pad(input, padding)
+      output_manual_pad = nn_ops.conv2d(
+          input_padded, filter, strides, padding="VALID", data_format="NCHW")
+      # Benchmark just the forward pass.
+      self._bench_op("explicit_pad_forward", output_explicit_pad.op, burn_iters,
+                     num_iters)
+      self._bench_op("manual_pad_forward", output_manual_pad.op, burn_iters,
+                     num_iters)
+
+      # Benchmark both the forward and backwards passes.
+      input_grad_explicit_pad, filter_grad_explicit_pad = (
+          gradients_impl.gradients(output_explicit_pad, [input, filter]))
+      self._bench_op(
+          "explicit_pad_backward",
+          control_flow_ops.group(input_grad_explicit_pad,
+                                 filter_grad_explicit_pad), burn_iters,
+          num_iters)
+      input_grad_manual_pad, filter_grad_manual_pad = gradients_impl.gradients(
+          output_manual_pad, [input, filter])
+      self._bench_op(
+          "manual_pad_backward",
+          control_flow_ops.group(input_grad_manual_pad, filter_grad_manual_pad),
+          burn_iters, num_iters)
+
+  def benchmarkExplicitVsSamePaddingGraph(self):
+    """Compare performance of EXPLICIT and SAME padding in graph mode.
+
+    A Conv2D op with SAME padding is benchmarked, and an equivalent Conv2D op
+    with explicit padding is benchmarked, where the padding is the same as in
+    the SAME case. The purpose is to ensure EXPLICIT padding is just as
+    efficient as the SAME case
+    """
+    if not test.is_gpu_available():
+      return
+
+    with ops.Graph().as_default():
+      burn_iters = 15
+      num_convs = 20
+      num_iters = 50
+      batch_size = 64
+      # The input and filter correspond to a middle layer of Resnet50.
+      input = variables.Variable(  # pylint: disable=redefined-builtin
+          random_ops.random_uniform([
+              batch_size,
+              256,
+              14,
+              14
+          ]))
+      filter = variables.Variable(random_ops.random_uniform([3, 3, 256, 256]))  # pylint: disable=redefined-builtin
+      strides = [1, 1, 1, 1]
+      padding = [(0, 0), (0, 0), (1, 1), (1, 1)]
+      output_explicit_pad = input
+      output_same_pad = input
+
+      for _ in range(num_convs):
+        output_explicit_pad = nn_ops.conv2d(
+            output_explicit_pad,
+            filter,
+            strides,
+            padding=padding,
+            data_format="NCHW")
+        output_same_pad = nn_ops.conv2d(
+            output_same_pad,
+            filter,
+            strides,
+            padding="SAME",
+            data_format="NCHW")
+      grad_explicit_pad, = gradients_impl.gradients(output_explicit_pad, filter)
+      grad_same_pad, = gradients_impl.gradients(output_same_pad, filter)
+      self._bench_op("graph_explicit_pad", grad_explicit_pad.op, burn_iters,
+                     num_iters)
+      self._bench_op("graph_same_pad", grad_same_pad.op, burn_iters, num_iters)
+
+  def benchmarkExplicitVsSamePaddingEager(self):
+    """Compare performance of EXPLICIT and SAME padding in eager mode.
+
+    A Conv2D op with SAME padding is benchmarked, and an equivalent Conv2D op
+    with explicit padding is benchmarked, where the padding is the same as in
+    the SAME case. Currently, EXPLICIT padding is slightly slower, due to the
+    fact the Python padding list must be checked and processed before the Conv2D
+    op can run.
+    """
+    # TODO(reedwm): Make EXPLICIT padding as fast as SAME padding.
+    if not test.is_gpu_available():
+      return
+
+    with context.eager_mode():
+      burn_iters = 15
+      num_convs = 20
+      num_iters = 50
+      batch_size = 64
+      # The input and filter correspond to a middle layer of Resnet50.
+      input = variables.Variable(  # pylint: disable=redefined-builtin
+          random_ops.random_uniform([
+              batch_size,
+              256,
+              14,
+              14
+          ]))
+      filter = variables.Variable(random_ops.random_uniform([3, 3, 256, 256]))  # pylint: disable=redefined-builtin
+      strides = [1, 1, 1, 1]
+      padding = [(0, 0), (0, 0), (1, 1), (1, 1)]
+      output_explicit_pad = input
+      output_same_pad = input
+      for _ in range(burn_iters):
+        output_explicit_pad = nn_ops.conv2d(
+            output_explicit_pad,
+            filter,
+            strides,
+            padding=padding,
+            data_format="NCHW")
+        output_same_pad = nn_ops.conv2d(
+            output_same_pad,
+            filter,
+            strides,
+            padding="SAME",
+            data_format="NCHW")
+
+      start = time.time()
+      for _ in range(num_iters):
+        with backprop.GradientTape() as tape:
+          for _ in range(num_convs):
+            output_explicit_pad = nn_ops.conv2d(
+                output_explicit_pad,
+                filter,
+                strides,
+                padding=padding,
+                data_format="NCHW")
+          tape.gradient(output_explicit_pad, filter)
+      end = time.time()
+      self.report_benchmark(
+          name="eager_explicit_pad",
+          wall_time=(end - start) / num_iters,
+          iters=num_iters)
+
+      start = time.time()
+      for _ in range(num_iters):
+        with backprop.GradientTape() as tape:
+          for _ in range(num_convs):
+            output_same_pad = nn_ops.conv2d(
+                output_same_pad,
+                filter,
+                strides,
+                padding="SAME",
+                data_format="NCHW")
+          tape.gradient(output_same_pad, filter)
+      end = time.time()
+      self.report_benchmark(
+          name="eager_same_pad",
+          wall_time=(end - start) / num_iters,
+          iters=num_iters)
+
 
 def GetInceptionFwdTest(input_size, filter_size, stride, padding,
                         gpu_only=False):
@@ -1855,7 +2892,8 @@ def GetInceptionFwdDilatedConvTest(input_size, filter_size, stride, padding):
           filter_in_sizes=filter_size,
           strides=[stride, stride],
           dilations=[2, 2],
-          padding=padding)
+          padding=padding,
+          rtol=5e-4)
 
   return Test
 
diff --git a/tensorflow/contrib/framework/python/ops/critical_section_test.py b/tensorflow/python/kernel_tests/critical_section_test.py
similarity index 84%
rename from tensorflow/contrib/framework/python/ops/critical_section_test.py
rename to tensorflow/python/kernel_tests/critical_section_test.py
index 34fd5018af125335845540dedfdffc984ba02313..7b1519c5e3c77d4676e5084ab06ed49b1a3c42f9 100644
--- a/tensorflow/contrib/framework/python/ops/critical_section_test.py
+++ b/tensorflow/python/kernel_tests/critical_section_test.py
@@ -18,13 +18,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.framework.python.ops import critical_section_ops
+from tensorflow.python.data.experimental.ops import prefetching_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import critical_section_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
@@ -47,7 +49,7 @@ class CriticalSectionTest(test.TestCase):
           return array_ops.identity(c)
 
     num_concurrent = 100
-    r = [cs.execute(fn, 1.0, 2.0) for _ in range(num_concurrent)]
+    r = [cs.execute(lambda: fn(1.0, 2.0)) for _ in range(num_concurrent)]
     self.evaluate(v.initializer)
     r_value = self.evaluate(r)
     self.assertAllClose([2.0 * i for i in range(num_concurrent)],
@@ -73,7 +75,7 @@ class CriticalSectionTest(test.TestCase):
               array_ops.identity(inner_cond), true_fn, lambda: c)
 
         def execute():
-          return cs.execute(fn, 1.0, 2.0)
+          return cs.execute(lambda: fn(1.0, 2.0))
 
         r = [
             control_flow_ops.cond(array_ops.identity(outer_cond),
@@ -91,6 +93,7 @@ class CriticalSectionTest(test.TestCase):
         else:
           self.assertAllClose([0] * num_concurrent, r_value)
 
+  @test_util.run_v1_only("b/123990562 Sees CancelledError on some calls")
   def testCriticalSectionInParallelDoesntDeadlockOnError(self):
     # No eager mode execution of this test because eager does not
     # run fn() in parallel, which is where the deadlock could
@@ -102,12 +105,23 @@ class CriticalSectionTest(test.TestCase):
       error = control_flow_ops.Assert((i % 2) == 1, ["Error"])
       with ops.control_dependencies([error]):
         return v.read_value()
+
     num_concurrent = 2
-    r = [cs.execute(fn, i) for i in range(num_concurrent)]
+
+    @def_function.function(autograph=False)
+    def run_concurrently():
+      return [cs.execute(lambda: fn(i)) for i in range(num_concurrent)]
+
+    if not context.executing_eagerly():
+      run_concurrently = run_concurrently()
+
     self.evaluate(v.initializer)
     for _ in range(100):
       with self.assertRaisesOpError("Error"):
-        self.evaluate(r)
+        if context.executing_eagerly():
+          run_concurrently()
+        else:
+          self.evaluate(run_concurrently)
 
   @test_util.run_in_graph_and_eager_modes
   def testCreateCriticalSectionFnReturnsOp(self):
@@ -122,17 +136,20 @@ class CriticalSectionTest(test.TestCase):
           return control_flow_ops.no_op()
 
     num_concurrent = 100
-    r = [cs.execute(fn_return_op, 1.0, 2.0) for _ in range(num_concurrent)]
+    r = [cs.execute(lambda: fn_return_op(1.0, 2.0))
+         for _ in range(num_concurrent)]
     self.evaluate(v.initializer)
     self.evaluate(r)
     final_v = self.evaluate(v)
     self.assertAllClose(2.0 * num_concurrent, final_v)
 
+  @test_util.run_v1_only("Collections don't exist in TF2")
   def testCollection(self):
     cs = critical_section_ops.CriticalSection(shared_name="cs")
     self.assertIn(
         cs, ops.get_collection(critical_section_ops.CRITICAL_SECTIONS))
-    execute = cs.execute(lambda x: x + 1, 1.0, name="my_execute")
+    add = lambda x: x + 1
+    execute = cs.execute(lambda: add(1.0), name="my_execute")
     execute_op = [
         x for x in execute.graph.get_operations()
         if "my_execute" in x.name and "MutexLock" in x.type
@@ -142,18 +159,21 @@ class CriticalSectionTest(test.TestCase):
         [signature.op for signature in
          ops.get_collection(critical_section_ops.CRITICAL_SECTION_EXECUTIONS)])
 
+  @test_util.run_v1_only("b/123955885 Can't identify deadlocks in eager mode")
   def testRecursiveCriticalSectionAccessIsIllegal(self):
     # This does not work properly in eager mode.  Eager users will
     # just hit a deadlock if they do this.  But at least it'll be easier
     # to debug.
     cs = critical_section_ops.CriticalSection()
+    add = lambda y: y + 1
     def fn(x):
-      return cs.execute(lambda y: y + 1, x)
+      return cs.execute(lambda: add(x))
+
     with self.assertRaisesRegexp(
         ValueError,
         r"attempts to directly access the CriticalSection in which it "
         r"would be running"):
-      cs.execute(fn, 1.0)
+      cs.execute(lambda: fn(1.0))
 
   def testRecursiveCriticalSectionAccessViaCapturedTensorIsProtected(self):
     # This one is subtle; and we're being overly cautious here.  The
@@ -173,24 +193,24 @@ class CriticalSectionTest(test.TestCase):
     # operations are finished before anything runs within the critical section.
     cs = critical_section_ops.CriticalSection(shared_name="cs")
     fn = array_ops.identity
-    to_capture = cs.execute(fn, 1.0)
+    to_capture = cs.execute(lambda: fn(1.0))
     fn_captures = lambda x: x + to_capture
     to_capture_too = array_ops.identity(to_capture)
 
-    ex_0 = cs.execute(fn_captures, 1.0)
+    ex_0 = cs.execute(lambda: fn_captures(1.0))
 
     with ops.control_dependencies([to_capture]):
       # This is OK because to_capture will execute before this next call
-      ex_1 = cs.execute(fn_captures, 1.0)
+      ex_1 = cs.execute(lambda: fn_captures(1.0))
 
     dependency = array_ops.identity(to_capture)
 
     fn_captures_dependency = lambda x: x + dependency
 
-    ex_2 = cs.execute(fn_captures_dependency, 1.0)
+    ex_2 = cs.execute(lambda: fn_captures_dependency(1.0))
 
     with ops.control_dependencies([to_capture_too]):
-      ex_3 = cs.execute(fn_captures_dependency, 1.0)
+      ex_3 = cs.execute(lambda: fn_captures_dependency(1.0))
 
     # Ensure there's no actual deadlock on to_execute.
     self.assertEquals(2.0, self.evaluate(ex_0))
@@ -216,6 +236,8 @@ class CriticalSectionTest(test.TestCase):
         body_implicit_capture,
         [0, 0],
         parallel_iterations=25)
+    # For consistency between eager and graph mode.
+    i_n = array_ops.identity(i_n)
     logging.warn(
         "\n==============\nRunning "
         "'testRecursiveCriticalSectionAccessWithinLoopDoesNotDeadlock "
@@ -241,6 +263,8 @@ class CriticalSectionTest(test.TestCase):
         body_implicit_capture_protected,
         [0, 0],
         parallel_iterations=25)
+    # For consistency between eager and graph mode.
+    i_n = array_ops.identity(i_n)
     logging.warn(
         "\n==============\nRunning "
         "'testRecursiveCriticalSectionAccessWithinLoopDoesNotDeadlock "
@@ -257,13 +281,15 @@ class CriticalSectionTest(test.TestCase):
       # This version is ok because j is an argument to fn and we can
       # ensure there's a control dependency on j.
       fn = lambda x: x + 1
-      return (i + 1, cs.execute(fn, j))
+      return (i + 1, cs.execute(lambda: fn(j)))
 
     (i_n, j_n) = control_flow_ops.while_loop(
         lambda i, _: i < 1000,
         body_args_capture,
         [0, 0],
         parallel_iterations=25)
+    # For consistency between eager and graph mode.
+    i_n = array_ops.identity(i_n)
     logging.warn(
         "\n==============\nRunning "
         "'testRecursiveCriticalSectionAccessWithinLoopDoesNotDeadlock "
@@ -276,20 +302,23 @@ class CriticalSectionTest(test.TestCase):
         "body_args_capture'\n"
         "==============\n")
 
+  @test_util.run_v1_only("b/123955885 Can't identify deadlocks in eager mode")
   def testRecursiveCriticalSectionAccessIsIllegalSameSharedName(self):
     # This does not work properly in eager mode.  Eager users will
     # just hit a deadlock if they do this.  But at least it'll be easier
     # to debug.
     cs = critical_section_ops.CriticalSection(shared_name="cs")
     cs_same = critical_section_ops.CriticalSection(shared_name="cs")
+    add = lambda x: x + 1
     def fn(x):
-      return cs_same.execute(lambda x: x+1, x)
+      return cs_same.execute(lambda: add(x))
     with self.assertRaisesRegexp(
         ValueError,
         r"attempts to directly access the CriticalSection in which it "
         r"would be running"):
-      cs.execute(fn, 1.0)
+      cs.execute(lambda: fn(1.0))
 
+  @test_util.run_v1_only("b/123955885 Can't identify deadlocks in eager mode")
   def testMultipleCSExecutionsRequestSameResource(self):
     cs0 = critical_section_ops.CriticalSection()
     cs1 = critical_section_ops.CriticalSection()
@@ -327,20 +356,32 @@ class CriticalSectionTest(test.TestCase):
     # Note, here v must be a resource variable (or something similar),
     # otherwise it gets hoisted into the while_loop by the time we add
     # control dependencies to the lock_op.
+    def body(i):
+      add_j = lambda j: v + j + 1
+      return cs.execute(lambda: add_j(i))
     out = control_flow_ops.while_loop(
-        lambda i: i < 10, lambda i: cs.execute(lambda j: v + j + 1, i), [0])
+        lambda i: i < 10, body, [0])
     self.evaluate(v.initializer)
     self.assertEqual(10, self.evaluate(out))
 
   @test_util.run_in_graph_and_eager_modes
   def testInsideFunction(self):
+    if test_util.is_gpu_available():
+      self.skipTest(
+          "b/123899495: Colocation errors for critical sections in map on GPU")
     cs = critical_section_ops.CriticalSection()
-    v = resource_variable_ops.ResourceVariable(1)
+    with ops.device("/gpu:0" if test_util.is_gpu_available() else "/cpu:0"):
+      v = resource_variable_ops.ResourceVariable(1)
     def fn():
       return v.read_value()
 
     # map() creates a TensorFlow function.
-    ds = dataset_ops.Dataset.range(1).map(lambda _: cs.execute(fn))
+    ds = dataset_ops.Dataset.range(1)
+    if test_util.is_gpu_available():
+      ds = (ds.apply(prefetching_ops.copy_to_device("/gpu:0"))
+            .apply(prefetching_ops.map_on_gpu(lambda _: cs.execute(fn))))
+    else:
+      ds = ds.map(lambda _: cs.execute(fn))
 
     def get_first():
       if context.executing_eagerly():
diff --git a/tensorflow/python/kernel_tests/ctc_loss_op_test.py b/tensorflow/python/kernel_tests/ctc_loss_op_test.py
index e24f304c1b80787f43885055cad1de8cf43bb4db..352dedea4abc885d3f7765533b345e09ecec6dc9 100644
--- a/tensorflow/python/kernel_tests/ctc_loss_op_test.py
+++ b/tensorflow/python/kernel_tests/ctc_loss_op_test.py
@@ -242,7 +242,6 @@ class CTCLossTest(test.TestCase):
 
     self._testCTCLoss(inputs, seq_lens, labels, loss_truth, grad_truth)
 
-  @test_util.run_v1_only("b/120545219")
   def test_time_major(self):
     """Testing time_major param.
 
@@ -565,7 +564,6 @@ class CTCLossTestV2(test.TestCase):
               rtol=2e-06,
               atol=2e-06)
 
-  @test_util.run_v1_only("b/120545219")
   def testCollapseRepeated(self):
     collapsed, new_seq_lengths = ctc_ops.collapse_repeated(
         labels=[[1, 3, 3, 3, 0],
@@ -579,7 +577,6 @@ class CTCLossTestV2(test.TestCase):
          [1, 4, 0, 0],
          [4, 2, 9, 4]])
 
-  @test_util.run_v1_only("b/120545219")
   def testCollapseRepeatedPreservesDtypes(self):
     collapsed, new_seq_lengths = ctc_ops.collapse_repeated(
         labels=constant_op.constant(
@@ -597,7 +594,6 @@ class CTCLossTestV2(test.TestCase):
          [1, 4, 0, 0],
          [4, 2, 9, 4]])
 
-  @test_util.run_v1_only("b/120545219")
   def testCollapseRepeatedExtraPadding(self):
     collapsed, new_seq_lengths = ctc_ops.collapse_repeated(
         labels=[[1, 3, 3, 3, 0, 0, 0],
@@ -611,7 +607,6 @@ class CTCLossTestV2(test.TestCase):
          [1, 4, 0, 0],
          [4, 2, 9, 4]])
 
-  @test_util.run_v1_only("b/120545219")
   def testCollapseRepeatedFrontRepeats(self):
     collapsed, new_seq_lengths = ctc_ops.collapse_repeated(
         labels=[[1, 1, 1, 2, 2],
@@ -625,7 +620,6 @@ class CTCLossTestV2(test.TestCase):
          [1, 2],
          [1, 0]])
 
-  @test_util.run_v1_only("b/120545219")
   def testCollapseRepeatedAllLabelsTheSame(self):
     collapsed, new_seq_lengths = ctc_ops.collapse_repeated(
         labels=[[1, 1, 1, 1, 1],
@@ -658,7 +652,6 @@ class CTCLossTestV2(test.TestCase):
 
     self.assertAllEqual(padded_dense, new_dense)
 
-  @test_util.run_v1_only("b/120545219")
   def testUnique(self):
     labels = [
         [3, 4, 4, 3],
@@ -674,7 +667,6 @@ class CTCLossTestV2(test.TestCase):
         [0, 0, 0, 1],
     ], idx)
 
-  @test_util.run_v1_only("b/120545219")
   def testSumStates(self):
     idx = [
         [0, 1, 0, 1],
@@ -694,7 +686,6 @@ class CTCLossTestV2(test.TestCase):
          [1.8, 0.8, 0.0, 0.0]]
     ], sum_of_states)
 
-  @test_util.run_v1_only("b/120545219")
   def testStateToOlabel(self):
     labels = [
         [3, 4, 3, 4],
@@ -733,7 +724,6 @@ class CTCLossTestV2(test.TestCase):
          [22.0 + 23.0 + 24.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]],
     ])
 
-  @test_util.run_v1_only("b/120545219")
   def testStateToOlabelUnique(self):
     labels = [
         [3, 4, 3, 4],
@@ -811,7 +801,7 @@ class CTCLossTestV2(test.TestCase):
       x = random_ops.random_uniform([])
       fn = lambda accum, elem: accum + x * elem
       out = ctc_ops._scan(fn, constant_op.constant([0.0, 1.0, 2.0]), 23.0)
-      self.assertAllEqual(*sess.run([
+      self.assertAllClose(*sess.run([
           [23.0 + x * 0.0, 23.0 + x * 1.0, 23.0 + x * 3.0], out
       ]))
 
diff --git a/tensorflow/python/kernel_tests/cwise_ops_test.py b/tensorflow/python/kernel_tests/cwise_ops_test.py
index 9bb7d8b8b12baafe15fe9150e58c4e03749e7261..29d335d68cfe2167ff88c7b3dffee3d14d7af90d 100644
--- a/tensorflow/python/kernel_tests/cwise_ops_test.py
+++ b/tensorflow/python/kernel_tests/cwise_ops_test.py
@@ -595,7 +595,7 @@ class MinMaxOpTest(test.TestCase):
 
   def testScalar(self):
     x = np.random.rand(1, 3, 2) * 100.
-    y = np.asscalar(np.random.rand(1) * 100.)  # should broadcast
+    y = np.random.rand(1).item() * 100.  # should broadcast
     # dropped np.float64, int64 because TF automatically converts to 32 bit
     for t in [np.float32, np.int32]:
       self._compare(x.astype(t), t(y), use_gpu=False)
@@ -887,7 +887,7 @@ class ComplexMakeRealImagTest(test.TestCase):
       tf_angle = math_ops.angle(inx)
       tf_angle_val = self.evaluate(tf_angle)
 
-    self.assertAllEqual(np_angle, tf_angle_val)
+    self.assertAllClose(np_angle, tf_angle_val)
     self.assertShapeEqual(np_angle, tf_angle)
 
   def testAngle64(self):
@@ -895,18 +895,14 @@ class ComplexMakeRealImagTest(test.TestCase):
     imag = (np.arange(-3, 3) / 5.).reshape([1, 3, 2]).astype(np.float32)
     cplx = real + 1j * imag
     self._compareAngle(cplx, use_gpu=False)
-    # TODO: Enable GPU tests for angle op after resolving
-    # build failures on GPU (See #10643 for context).
-    # self._compareAngle(cplx, use_gpu=True)
+    self._compareAngle(cplx, use_gpu=True)
 
   def testAngle(self):
     real = (np.arange(-3, 3) / 4.).reshape([1, 3, 2]).astype(np.float64)
     imag = (np.arange(-3, 3) / 5.).reshape([1, 3, 2]).astype(np.float64)
     cplx = real + 1j * imag
     self._compareAngle(cplx, use_gpu=False)
-    # TODO: Enable GPU tests for angle op after resolving
-    # build failures on GPU (See #10643 for context).
-    # self._compareAngle(cplx, use_gpu=True)
+    self._compareAngle(cplx, use_gpu=True)
 
   @test_util.run_deprecated_v1
   def testRealReal(self):
diff --git a/tensorflow/python/kernel_tests/decode_raw_op_test.py b/tensorflow/python/kernel_tests/decode_raw_op_test.py
index 008e59ba3e64915d8642243d335701e8adea19c0..bb8d2cf6a051867a28f984378d0db4779b06c0e0 100644
--- a/tensorflow/python/kernel_tests/decode_raw_op_test.py
+++ b/tensorflow/python/kernel_tests/decode_raw_op_test.py
@@ -89,6 +89,32 @@ class DecodeRawOpTest(test.TestCase):
 
       self.assertAllEqual(expected_result, result)
 
+  @test_util.run_deprecated_v1
+  def testToComplex64(self):
+    with self.cached_session():
+      in_bytes = array_ops.placeholder(dtypes.string, shape=[None])
+      decode = parsing_ops.decode_raw(in_bytes, out_type=dtypes.complex64)
+      self.assertEqual([None, None], decode.get_shape().as_list())
+
+      expected_result = np.matrix([[1 + 1j, 2 - 2j, -3 + 3j, -4 - 4j]],
+                                  dtype="<c8")
+      result = decode.eval(feed_dict={in_bytes: [expected_result.tostring()]})
+
+      self.assertAllEqual(expected_result, result)
+
+  @test_util.run_deprecated_v1
+  def testToComplex128(self):
+    with self.cached_session():
+      in_bytes = array_ops.placeholder(dtypes.string, shape=[None])
+      decode = parsing_ops.decode_raw(in_bytes, out_type=dtypes.complex128)
+      self.assertEqual([None, None], decode.get_shape().as_list())
+
+      expected_result = np.matrix([[1 + 1j, 2 - 2j, -3 + 3j, -4 - 4j]],
+                                  dtype="<c16")
+      result = decode.eval(feed_dict={in_bytes: [expected_result.tostring()]})
+
+      self.assertAllEqual(expected_result, result)
+
   @test_util.run_deprecated_v1
   def testEmptyStringInput(self):
     with self.cached_session():
diff --git a/tensorflow/python/kernel_tests/dense_update_ops_no_tsan_test.py b/tensorflow/python/kernel_tests/dense_update_ops_no_tsan_test.py
index 4e3da068b8927c324bf9b17fb8e19e1038470777..a778bf231bb80eefd6f4d602662fe50f67817a4f 100644
--- a/tensorflow/python/kernel_tests/dense_update_ops_no_tsan_test.py
+++ b/tensorflow/python/kernel_tests/dense_update_ops_no_tsan_test.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
@@ -33,7 +32,6 @@ class AssignOpTest(test.TestCase):
   # NOTE(mrry): We exclude thess tests from the TSAN TAP target, because they
   #   contain benign and deliberate data races when multiple threads update
   #   the same parameters without a lock.
-  @test_util.run_v1_only("b/120545219")
   def testParallelUpdateWithoutLocking(self):
     with self.cached_session() as sess:
       ones_t = array_ops.fill([1024, 1024], 1.0)
@@ -42,7 +40,7 @@ class AssignOpTest(test.TestCase):
           state_ops.assign_add(
               p, ones_t, use_locking=False) for _ in range(20)
       ]
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       def run_add(add_op):
         self.evaluate(add_op)
@@ -61,7 +59,6 @@ class AssignOpTest(test.TestCase):
       self.assertTrue((vals >= ones).all())
       self.assertTrue((vals <= ones * 20).all())
 
-  @test_util.run_v1_only("b/120545219")
   def testParallelAssignWithoutLocking(self):
     with self.cached_session() as sess:
       ones_t = array_ops.fill([1024, 1024], float(1))
@@ -70,7 +67,7 @@ class AssignOpTest(test.TestCase):
           state_ops.assign(p, math_ops.multiply(ones_t, float(i)), False)
           for i in range(1, 21)
       ]
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       def run_assign(assign_op):
         self.evaluate(assign_op)
@@ -94,7 +91,6 @@ class AssignOpTest(test.TestCase):
   # contain non-benign but known data races between the variable assignment and
   # returning the output tensors. This issue will be resolved with the new
   # resource variables.
-  @test_util.run_v1_only("b/120545219")
   def testParallelUpdateWithLocking(self):
     with self.cached_session() as sess:
       zeros_t = array_ops.fill([1024, 1024], 0.0)
@@ -104,7 +100,7 @@ class AssignOpTest(test.TestCase):
           state_ops.assign_add(
               p, ones_t, use_locking=True) for _ in range(20)
       ]
-      p.initializer.run()
+      self.evaluate(p.initializer)
 
       def run_add(add_op):
         self.evaluate(add_op)
@@ -122,7 +118,6 @@ class AssignOpTest(test.TestCase):
       ones = np.ones((1024, 1024)).astype(np.float32)
       self.assertAllEqual(vals, ones * 20)
 
-  @test_util.run_v1_only("b/120545219")
   def testParallelAssignWithLocking(self):
     with self.cached_session() as sess:
       zeros_t = array_ops.fill([1024, 1024], 0.0)
@@ -133,7 +128,7 @@ class AssignOpTest(test.TestCase):
               p, math_ops.multiply(ones_t, float(i)), use_locking=True)
           for i in range(1, 21)
       ]
-      p.initializer.run()
+      self.evaluate(p.initializer)
 
       def run_assign(assign_op):
         self.evaluate(assign_op)
diff --git a/tensorflow/python/kernel_tests/depthtospace_op_test.py b/tensorflow/python/kernel_tests/depthtospace_op_test.py
index 96c9b5258e2a4a103a3d981a3340f67a01bbec94..b7a865cf13e90be8ad29fe491bd03c230eaa3c27 100644
--- a/tensorflow/python/kernel_tests/depthtospace_op_test.py
+++ b/tensorflow/python/kernel_tests/depthtospace_op_test.py
@@ -295,6 +295,7 @@ class DepthToSpaceTest(test.TestCase):
       actual_vals, expected_vals = self.evaluate([actual, expected])
       self.assertTrue(np.array_equal(actual_vals, expected_vals))
 
+  @test_util.disable_xla("b/123553551")  # Unsupported data format
   def testAgainstTranspose(self):
     self.compareToTranspose(3, 2, 3, 1, 2, "NHWC", False)
     self.compareToTranspose(3, 2, 3, 2, 2, "NHWC", False)
diff --git a/tensorflow/python/kernel_tests/diag_op_test.py b/tensorflow/python/kernel_tests/diag_op_test.py
index ed2a9e8e47e961549dbaa99a78624e22af146937..0bf48fd228fda5640e203f74b4717a2cfffd2ba3 100644
--- a/tensorflow/python/kernel_tests/diag_op_test.py
+++ b/tensorflow/python/kernel_tests/diag_op_test.py
@@ -65,6 +65,7 @@ class MatrixDiagTest(test.TestCase):
       array_ops.matrix_diag(0)
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("b/123337890")  # Error messages differ
   def testInvalidShapeAtEval(self):
     with self.session(use_gpu=True):
       v = array_ops.placeholder(dtype=dtypes_lib.float32)
@@ -269,6 +270,7 @@ class MatrixDiagPartTest(test.TestCase):
       array_ops.matrix_diag_part(0)
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("b/123337890")  # Error messages differ
   def testInvalidShapeAtEval(self):
     with self.session(use_gpu=True):
       v = array_ops.placeholder(dtype=dtypes_lib.float32)
diff --git a/tensorflow/python/kernel_tests/distributions/BUILD b/tensorflow/python/kernel_tests/distributions/BUILD
index 14532965d8c2c62139b3cd922acb9f90c0691d53..22c98201dd1847586af6a30eed8004757a21b335 100644
--- a/tensorflow/python/kernel_tests/distributions/BUILD
+++ b/tensorflow/python/kernel_tests/distributions/BUILD
@@ -23,6 +23,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -42,6 +43,7 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
     ],
     shard_count = 3,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -54,6 +56,7 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:platform_test",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -71,6 +74,7 @@ cuda_py_test(
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:platform_test",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -86,6 +90,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -105,6 +110,7 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
         "//tensorflow/python:random_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -119,6 +125,7 @@ cuda_py_test(
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -139,6 +146,7 @@ cuda_py_test(
         "noguitar",  # b/110489471
         "notap",  # b/110489471
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -154,6 +162,7 @@ cuda_py_test(
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:platform_test",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -169,6 +178,7 @@ cuda_py_test(
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:platform_test",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -184,6 +194,7 @@ cuda_py_test(
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:platform_test",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -200,6 +211,7 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
     ],
     tags = ["manual"],  # b/69001419
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -217,7 +229,15 @@ cuda_py_test(
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:platform_test",
     ],
-    tags = ["nomsan"],  # disable to avoid false positives from scipy.
+    tags = [
+        # TODO(b/121223043): Re-enable this test after fixing "mean not defined"
+        # errors.
+        "no_mac",
+        "no_oss",
+        # disable to avoid false positives from scipy.
+        "nomsan",
+    ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -234,6 +254,7 @@ cuda_py_test(
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:math_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -252,6 +273,7 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
         "//tensorflow/python:variables",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -268,6 +290,7 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
         "//tensorflow/python:variables",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -285,4 +308,5 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
+    xla_enable_strict_auto_jit = True,
 )
diff --git a/tensorflow/python/kernel_tests/fifo_queue_test.py b/tensorflow/python/kernel_tests/fifo_queue_test.py
index 0579dddb70264199a53c140ab60ad2ddf9b00bb9..b88b43ff50781d49746abaa0e92c456907b8eb32 100644
--- a/tensorflow/python/kernel_tests/fifo_queue_test.py
+++ b/tensorflow/python/kernel_tests/fifo_queue_test.py
@@ -39,7 +39,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.util import compat
 
 
-@test_util.run_v1_only("b/120545219")
+@test_util.run_v1_only("FIFOQueue removed from v2")
 class FIFOQueueTest(test.TestCase):
 
   def testConstructor(self):
@@ -1424,7 +1424,7 @@ class FIFOQueueTest(test.TestCase):
         session.run([a, c])
 
 
-@test_util.run_v1_only("b/120545219")
+@test_util.run_v1_only("FIFOQueue removed from v2")
 class FIFOQueueDictTest(test.TestCase):
 
   def testConstructor(self):
@@ -1585,7 +1585,7 @@ class FIFOQueueDictTest(test.TestCase):
       self.assertTrue([compat.as_bytes("dd"), compat.as_bytes("ee")], list(s))
 
 
-@test_util.run_v1_only("b/120545219")
+@test_util.run_v1_only("FIFOQueue removed from v2")
 class FIFOQueueWithTimeoutTest(test.TestCase):
 
   def testDequeueWithTimeout(self):
@@ -1620,7 +1620,7 @@ class FIFOQueueWithTimeoutTest(test.TestCase):
       self.assertEqual(37, self.evaluate(dequeued_t))
 
 
-@test_util.run_v1_only("b/120545219")
+@test_util.run_v1_only("FIFOQueue removed from v2")
 class QueueContainerTest(test.TestCase):
 
   def testContainer(self):
@@ -1631,6 +1631,7 @@ class QueueContainerTest(test.TestCase):
         compat.as_bytes("test"), q.queue_ref.op.get_attr("container"))
 
 
+@test_util.run_v1_only("FIFOQueue removed from v2")
 class FIFOQueueBenchmark(test.Benchmark):
   """Benchmark FIFOQueue operations."""
 
diff --git a/tensorflow/python/kernel_tests/functional_ops_test.py b/tensorflow/python/kernel_tests/functional_ops_test.py
index 95ee454614e6edb633b981e9173b2035550259c3..0ab2b4bdfb52665e7ebc9323d73ab41adac0ba4d 100644
--- a/tensorflow/python/kernel_tests/functional_ops_test.py
+++ b/tensorflow/python/kernel_tests/functional_ops_test.py
@@ -23,16 +23,17 @@ import numpy as np
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
+from tensorflow.python.eager import function as eager_function
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import gen_functional_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
@@ -191,133 +192,6 @@ class FunctionalOpsTest(test.TestCase):
       self.assertAllEqual(720.0, self.evaluate(r))
   # pylint: enable=unnecessary-lambda
 
-  @test_util.run_in_graph_and_eager_modes
-  def testMap_Simple(self):
-    nums = [1, 2, 3, 4, 5, 6]
-    elems = constant_op.constant(nums, name="data")
-    r = functional_ops.map_fn(
-        lambda x: math_ops.multiply(math_ops.add(x, 3), 2), elems)
-    self.assertAllEqual(
-        np.array([(x + 3) * 2 for x in nums]), self.evaluate(r))
-
-  def testMapSparseTensor(self):
-    with self.cached_session():
-      with self.assertRaises(TypeError):
-        functional_ops.map_fn(
-            lambda x: x,
-            sparse_tensor.SparseTensor(
-                indices=[[0, 0], [0, 1], [1, 0]],
-                values=constant_op.constant([0, 1, 2]),
-                dense_shape=[2, 2]))
-
-  @test_util.run_in_graph_and_eager_modes
-  def testMapOverScalarErrors(self):
-    with self.assertRaisesRegexp(ValueError, "not scalars"):
-      functional_ops.map_fn(lambda x: x, [1, 2])
-    with self.assertRaisesRegexp(ValueError, "not a scalar"):
-      functional_ops.map_fn(lambda x: x, 1)
-
-  @test_util.run_deprecated_v1
-  def testMap_Scoped(self):
-    with self.cached_session() as sess:
-
-      def double_scoped(x):
-        """2x with a dummy 2 that is scoped."""
-        with variable_scope.variable_scope("body"):
-          # Dummy variable, just to check that scoping works as intended.
-          two = variable_scope.get_variable(
-              "two", [],
-              dtype=dtypes.int32,
-              initializer=init_ops.constant_initializer(2))
-          return math_ops.multiply(x, two)
-
-      with variable_scope.variable_scope("root") as varscope:
-        elems = constant_op.constant([1, 2, 3, 4, 5, 6], name="data")
-        doubles = np.array([2 * x for x in [1, 2, 3, 4, 5, 6]])
-
-        r = functional_ops.map_fn(double_scoped, elems)
-        # Check that we have the one variable we asked for here.
-        self.assertEqual(len(variables.trainable_variables()), 1)
-        self.assertEqual(variables.trainable_variables()[0].name,
-                         "root/body/two:0")
-        sess.run([variables.global_variables_initializer()])
-        self.assertAllEqual(doubles, self.evaluate(r))
-
-        # Now let's reuse our single variable.
-        varscope.reuse_variables()
-        r = functional_ops.map_fn(double_scoped, elems)
-        self.assertEqual(len(variables.trainable_variables()), 1)
-        self.assertAllEqual(doubles, self.evaluate(r))
-
-  @test_util.run_deprecated_v1
-  def testMap_Grad(self):
-    with self.cached_session():
-      param = constant_op.constant(2.0)
-      elems = constant_op.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="elems")
-      y = functional_ops.map_fn(
-          lambda x: math_ops.multiply(math_ops.square(x), param), elems)
-      r = gradients_impl.gradients(y, param)[0]
-      self.assertAllEqual(91.0, self.evaluate(r))
-      r = gradients_impl.gradients(y, elems)[0]
-      self.assertAllEqual([4.0, 8.0, 12.0, 16.0, 20.0, 24.0], self.evaluate(r))
-
-  @test_util.run_in_graph_and_eager_modes
-  def testMap_SimpleNotTensor(self):
-    nums = np.array([1, 2, 3, 4, 5, 6])
-    r = functional_ops.map_fn(
-        lambda x: math_ops.multiply(math_ops.add(x, 3), 2), nums)
-    self.assertAllEqual(
-        np.array([(x + 3) * 2 for x in nums]), self.evaluate(r))
-
-  @test_util.run_in_graph_and_eager_modes
-  def testMap_SingleInputMultiOutput(self):
-    nums = np.array([1, 2, 3, 4, 5, 6])
-    r = functional_ops.map_fn(
-        lambda x: ((x + 3) * 2, -(x + 3) * 2),
-        nums,
-        dtype=(dtypes.int64, dtypes.int64))
-    self.assertEqual(2, len(r))
-    self.assertEqual((6,), r[0].get_shape())
-    self.assertEqual((6,), r[1].get_shape())
-    received = self.evaluate(r)
-    self.assertAllEqual((nums + 3) * 2, received[0])
-    self.assertAllEqual(-(nums + 3) * 2, received[1])
-
-  @test_util.run_in_graph_and_eager_modes
-  def testMap_MultiOutputMismatchedDtype(self):
-    nums = np.array([1, 2, 3, 4, 5, 6])
-    with self.assertRaisesRegexp(
-        TypeError, r"two structures don't have the same nested structure"):
-      # lambda emits tuple, but dtype is a list
-      functional_ops.map_fn(
-          lambda x: ((x + 3) * 2, -(x + 3) * 2),
-          nums,
-          dtype=[dtypes.int64, dtypes.int64])
-
-  @test_util.run_in_graph_and_eager_modes
-  def testMap_MultiInputSingleOutput(self):
-    nums = np.array([1, 2, 3, 4, 5, 6])
-    r = functional_ops.map_fn(
-        lambda x: x[0] * x[1][0] + x[1][1], (nums, (nums, -nums)),
-        dtype=dtypes.int64)
-    self.assertEqual((6,), r.get_shape())
-    received = self.evaluate(r)
-    self.assertAllEqual(nums * nums + (-nums), received)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testMap_MultiInputSameStructureOutput(self):
-    nums = np.array([1, 2, 3, 4, 5, 6])
-    r = functional_ops.map_fn(lambda x: (x[1][0], (x[1][1], x[0])),
-                              (nums, (2 * nums, -nums)))
-    r = [r[0], r[1][0], r[1][1]]
-    self.assertEqual((6,), r[0].get_shape())
-    self.assertEqual((6,), r[1].get_shape())
-    self.assertEqual((6,), r[2].get_shape())
-    received = self.evaluate(r)
-    self.assertAllEqual(2 * nums, received[0])
-    self.assertAllEqual(-nums, received[1])
-    self.assertAllEqual(nums, received[2])
-
   @test_util.run_in_graph_and_eager_modes
   def testScan_Simple(self):
     elems = constant_op.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="data")
@@ -466,7 +340,7 @@ class FunctionalOpsTest(test.TestCase):
     loss = l0 + array_ops.stop_gradient(l1)
     grad = gradients_impl.gradients(ys=[loss], xs=[a, b])
     with self.test_session(use_gpu=True) as sess:
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.evaluate(grad)
 
   @test_util.run_in_graph_and_eager_modes
@@ -480,37 +354,6 @@ class FunctionalOpsTest(test.TestCase):
     y = functional_ops.foldl(fn, x, initializer=initializer)
     self.assertAllEqual(y.get_shape(), self.evaluate(y).shape)
 
-  @test_util.run_in_graph_and_eager_modes
-  def testMapShape(self):
-    x = constant_op.constant([[1, 2, 3], [4, 5, 6]])
-    y = functional_ops.map_fn(lambda e: e, x)
-    self.assertAllEqual(y.get_shape(), self.evaluate(y).shape)
-
-  @test_util.run_deprecated_v1
-  def testMapUnknownShape(self):
-    x = array_ops.placeholder(dtypes.float32)
-    y = functional_ops.map_fn(lambda e: e, x)
-    self.assertIs(None, y.get_shape().dims)
-
-  @test_util.disable_control_flow_v2("b/119323354")
-  @test_util.run_in_graph_and_eager_modes
-  @test_util.run_v1_only("b/120545219")
-  def testMapEmptyScalar(self):
-    map_return = functional_ops.map_fn(lambda x: 1, constant_op.constant([]))
-    self.assertAllEqual([0], map_return.get_shape().dims)
-    self.assertAllEqual([0], self.evaluate(map_return).shape)
-
-  # TODO(akshayka): this test fails in eager: the iterable is of length 0 so
-  # so the body of the while loop never executes
-  @test_util.disable_control_flow_v2("b/119323354")
-  @test_util.run_v1_only("b/120545219")
-  def testMapEmptyTensor(self):
-    with self.cached_session():
-      map_return = functional_ops.map_fn(lambda x: array_ops.zeros([3, 2]),
-                                         constant_op.constant([]))
-      self.assertAllEqual([0, 3, 2], map_return.get_shape().dims)
-      self.assertAllEqual([0, 3, 2], self.evaluate(map_return).shape)
-
   @test_util.run_in_graph_and_eager_modes
   def testScanShape(self):
     x = constant_op.constant([[1, 2, 3], [4, 5, 6]])
@@ -762,6 +605,26 @@ class FunctionalOpsTest(test.TestCase):
           self.assertAllEqual(Run(sess, 20.), 210.)
           self.assertAllEqual(Run(sess, 100.), 5050.)
 
+  # Like above, but using int32 in order to ensure that int32 tensors don't get
+  # copied to the GPU during the application of the while.
+  def testWhileInt32(self):
+    with ops.Graph().as_default() as g:
+
+      @function.Defun(*[dtypes.int32] * 2)
+      def Cond(n, unused_x):
+        return n > 0
+
+      @function.Defun(*[dtypes.int32] * 2)
+      def Body(n, x):
+        return n - 1, x + n
+
+      def Run(sess, n):
+        return sess.run(functional_ops.While([n, 0], Cond, Body))[1]
+
+      with self.session(graph=g, use_gpu=True) as sess:
+        self.assertAllEqual(Run(sess, 20), 210)
+        self.assertAllEqual(Run(sess, 100), 5050)
+
   @test_util.run_deprecated_v1
   def testWhileLowering(self):
 
@@ -798,6 +661,7 @@ class FunctionalOpsTest(test.TestCase):
     self.assertAllEqual(Run(100., True), 5050.)
 
   @test_util.run_v1_only("b/120545219")
+  @test_util.disable_xla("b/123337890")  # Different error message
   def testWhileError(self):
     for use_gpu in (True, False):
       with ops.Graph().as_default() as g:
@@ -1248,6 +1112,37 @@ class PartitionedCallTest(test.TestCase):
       self.evaluate(op)
 
 
+@test_util.run_all_in_graph_and_eager_modes
+@test_util.with_control_flow_v2
+class FunctionalOpsCaseTest(test.TestCase):
+
+  def testCase(self):
+    @eager_function.defun
+    def two(x):
+      return x * 2
+
+    @eager_function.defun
+    def three(x):
+      return x * 3
+
+    @eager_function.defun
+    def four(x):
+      return x * 4
+
+    def f(branch, x):
+      tmpl = array_ops.zeros_like(x)
+      return array_ops.identity(gen_functional_ops.case(
+          branch, input=[x], Tout=[dtypes.float32],
+          branches=[f.get_concrete_function(tmpl)
+                    for f in (two, three, four)])[0])
+    one = array_ops.ones([])
+    self.assertAllEqual(np.float32(2), self.evaluate(f(0, one)))
+    self.assertAllEqual(np.float32(3), self.evaluate(f(1, one)))
+    self.assertAllEqual(np.float32(4), self.evaluate(f(2, one)))
+    self.assertAllEqual(np.float32(4), self.evaluate(f(-1, one)))  # <0 default
+    self.assertAllEqual(np.float32(4), self.evaluate(f(6, one)))  # >=N default
+
+
 if __name__ == "__main__":
   test.main()
 
diff --git a/tensorflow/python/kernel_tests/gather_nd_op_test.py b/tensorflow/python/kernel_tests/gather_nd_op_test.py
index 320ffc9674bd2e0ce601084ab8fc375c4cbdc3e2..ad8376b48c8f05809b310a432a12a92786aba989 100644
--- a/tensorflow/python/kernel_tests/gather_nd_op_test.py
+++ b/tensorflow/python/kernel_tests/gather_nd_op_test.py
@@ -56,6 +56,7 @@ class GatherNdTest(test.TestCase):
     self._testSimpleDtype("|S")  # byte strings in python2 + 3
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("b/123337890")  # Error messages differ
   def testEmptyIndicesAndParamsOKButJustEmptyParamsFails(self):
     with self.session(use_gpu=True):
       params = np.ones((3, 3), dtype=np.float32)
diff --git a/tensorflow/python/kernel_tests/gather_op_test.py b/tensorflow/python/kernel_tests/gather_op_test.py
index fc86068c3fc08d1ad01ba8dfa9bb4c5bc6c429f2..65043d9f4f0e7624fc42beb0ab299baa67d61f44 100644
--- a/tensorflow/python/kernel_tests/gather_op_test.py
+++ b/tensorflow/python/kernel_tests/gather_op_test.py
@@ -18,8 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
 import numpy as np
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -32,7 +34,7 @@ _TEST_TYPES = (dtypes.int64, dtypes.float32,
                dtypes.complex64, dtypes.complex128)
 
 
-class GatherTest(test.TestCase):
+class GatherTest(test.TestCase, parameterized.TestCase):
 
   def _buildParams(self, data, dtype):
     data = data.astype(dtype.as_numpy_dtype)
@@ -248,6 +250,244 @@ class GatherTest(test.TestCase):
           gather = array_ops.gather(params, indices, axis=2)
           self.assertAllEqual(gather.eval(), np.zeros((0, 0, 2)))
 
+  @parameterized.parameters([
+      # batch_dims=0 (equivalent to tf.gather)
+      dict(  # 2D indices
+          batch_dims=0,
+          params=[6, 7, 8, 9],
+          indices=[[2, 1], [0, 3]],
+          expected=[[8, 7], [6, 9]]),
+      dict(  # 3D indices
+          batch_dims=0,
+          params=[6, 7, 8, 9],
+          indices=[[[3, 1], [2, 0]], [[0, 3], [2, 2]]],
+          expected=[[[9, 7], [8, 6]], [[6, 9], [8, 8]]]),
+      dict(  # 4D indices
+          batch_dims=0,
+          params=[8, 9],
+          indices=[[[[0, 1], [1, 0]], [[0, 0], [1, 1]]],
+                   [[[1, 1], [0, 0]], [[0, 1], [1, 0]]]],
+          expected=[[[[8, 9], [9, 8]], [[8, 8], [9, 9]]],
+                    [[[9, 9], [8, 8]], [[8, 9], [9, 8]]]]),
+
+      # batch_dims=indices.shape.ndims - 1 (equivalent to tf.batch_gather)
+      dict(  # 2D indices (1 batch dim)
+          batch_dims=1,
+          params=[[10, 11, 12, 13], [20, 21, 22, 23]],
+          indices=[[2, 1], [0, 3]],
+          expected=[[12, 11], [20, 23]]),
+      dict(  # 3D indices (2 batch dims)
+          batch_dims=2,
+          params=[[[100, 101], [110, 111]], [[200, 201], [210, 211]]],
+          indices=[[[0, 1], [1, 0]], [[0, 0], [1, 1]]],
+          expected=[[[100, 101], [111, 110]], [[200, 200], [211, 211]]]),
+      dict(  # 2D indices (1 batch dim)
+          batch_dims=-1,
+          params=[[10, 11, 12, 13], [20, 21, 22, 23]],
+          indices=[[2, 1], [0, 3]],
+          expected=[[12, 11], [20, 23]]),
+      dict(  # 3D indices (2 batch dims)
+          batch_dims=-1,
+          params=[[[100, 101], [110, 111]], [[200, 201], [210, 211]]],
+          indices=[[[0, 1], [1, 0]], [[0, 0], [1, 1]]],
+          expected=[[[100, 101], [111, 110]], [[200, 200], [211, 211]]]),
+
+      # 0 < batch_dims < indices.shape.ndims - 1
+      dict(  # 3D indices (1 batch dim)
+          batch_dims=1,
+          params=[[10, 11, 12, 13], [20, 21, 22, 23]],
+          indices=[[[3, 1], [2, 0]], [[0, 3], [2, 2]]],
+          expected=[[[13, 11], [12, 10]], [[20, 23], [22, 22]]]),
+      dict(  # 4D indices (1 batch dim)
+          batch_dims=1,
+          params=[[6, 7], [8, 9]],
+          indices=[[[[0, 1], [1, 0]], [[0, 0], [1, 1]]],
+                   [[[1, 1], [0, 0]], [[0, 1], [1, 0]]]],
+          expected=[[[[6, 7], [7, 6]], [[6, 6], [7, 7]]],
+                    [[[9, 9], [8, 8]], [[8, 9], [9, 8]]]]),
+      dict(  # 4D indices (2 batch dims)
+          batch_dims=2,
+          params=[[[2, 3], [4, 5]], [[6, 7], [8, 9]]],
+          indices=[[[[0, 1], [1, 0]], [[0, 0], [1, 1]]],
+                   [[[1, 1], [0, 0]], [[0, 1], [1, 0]]]],
+          expected=[[[[2, 3], [3, 2]], [[4, 4], [5, 5]]],
+                    [[[7, 7], [6, 6]], [[8, 9], [9, 8]]]]),
+
+      # axis > 0
+      dict(  # 3D indices, batch_dims=1, axis=2
+          # params.shape  = [I1, J1, J2] = [2, 2, 3]
+          # indices.shape = [I1, K1, K2] = [2, 1, 5]
+          # result.shape  = [I1, J1, K1, K2] = [2, 2, 1, 5]
+          batch_dims=1,
+          axis=2,
+          params=[[[10, 11, 12], [13, 14, 15]], [[20, 21, 22], [23, 24, 25]]],
+          indices=[[[0, 1, 2, 1, 0]], [[0, 1, 2, 1, 0]]],
+          expected=[[[[10, 11, 12, 11, 10]], [[13, 14, 15, 14, 13]]],
+                    [[[20, 21, 22, 21, 20]], [[23, 24, 25, 24, 23]]]]),
+      dict(  # 3D indices, batch_dims=None, axis=1
+          batch_dims=None,
+          axis=1,
+          params=[[10, 11, 12], [13, 14, 15]],
+          indices=[1, 0],
+          expected=[[11, 10], [14, 13]]),
+  ])
+  @test_util.run_in_graph_and_eager_modes
+  def testBatchDims(self, params, indices, batch_dims, expected=None,
+                    axis=None):
+    result = array_ops.gather(params, indices, axis=axis, batch_dims=batch_dims)
+    self.assertAllEqual(expected, result)
+
+  @parameterized.parameters([
+      dict(
+          params_shape=[2, 3, 4, 5, 6, 7],
+          indices_shape=[2, 3, 8, 9, 10],
+          batch_dims=2,
+          axis=2,
+          output_shape=[2, 3, 8, 9, 10, 5, 6, 7]
+          # = params.shape[:2] + indices.shape[2:] + params.shape[3:]
+          ),
+      dict(
+          params_shape=[2, 3, 4, 5, 6, 7],
+          indices_shape=[2, 3, 8, 9, 10],
+          batch_dims=2,
+          axis=3,
+          output_shape=[2, 3, 4, 8, 9, 10, 6, 7]
+          # = params.shape[:3] + indices.shape[2:] + params.shape[4:]
+          ),
+      dict(
+          params_shape=[2, 3, 4, 5, 6, 7],
+          indices_shape=[2, 3, 8, 9, 10],
+          batch_dims=2,
+          axis=4,
+          output_shape=[2, 3, 4, 5, 8, 9, 10, 7]
+          # = params.shape[:4] + indices.shape[2:] + params.shape[5:]
+          ),
+      dict(
+          params_shape=[2, 3, 4, 5, 6, 7],
+          indices_shape=[2, 3, 8, 9, 10],
+          batch_dims=2,
+          axis=5,
+          output_shape=[2, 3, 4, 5, 6, 8, 9, 10]
+          # = params.shape[:5] + indices.shape[2:] + params.shape[6:]
+          ),
+      dict(
+          params_shape=[2, 3, 4, 5, 6, 7],
+          indices_shape=[2, 3, 8, 9, 10],
+          batch_dims=2,
+          axis=-4,
+          output_shape=[2, 3, 8, 9, 10, 5, 6, 7]
+          # = params.shape[:2] + indices.shape[2:] + params.shape[3:]
+          ),
+      dict(
+          params_shape=[2, 3, 4, 5, 6, 7],
+          indices_shape=[2, 3, 8, 9, 10],
+          batch_dims=2,
+          axis=-3,
+          output_shape=[2, 3, 4, 8, 9, 10, 6, 7]
+          # = params.shape[:3] + indices.shape[2:] + params.shape[4:]
+          ),
+      dict(
+          params_shape=[2, 3, 4, 5, 6, 7],
+          indices_shape=[2, 3, 8, 9, 10],
+          batch_dims=2,
+          axis=-2,
+          output_shape=[2, 3, 4, 5, 8, 9, 10, 7]
+          # = params.shape[:4] + indices.shape[2:] + params.shape[5:]
+          ),
+      dict(
+          params_shape=[2, 3, 4, 5, 6, 7],
+          indices_shape=[2, 3, 8, 9, 10],
+          batch_dims=2,
+          axis=-1,
+          output_shape=[2, 3, 4, 5, 6, 8, 9, 10]
+          # = params.shape[:5] + indices.shape[2:] + params.shape[6:]
+          ),
+  ])
+  @test_util.run_in_graph_and_eager_modes
+  def testBatchDimsMatchesPythonBatching(self, params_shape, indices_shape,
+                                         batch_dims, axis, output_shape):
+    """Checks that batch_dims matches multiple calls to tf.gather()."""
+    # Generate a `params` tensor with the indicated shape.
+    params_size = np.prod(params_shape)
+    params = np.reshape(np.arange(params_size), params_shape)
+
+    # Generate an `indices` tensor with the indicated shape, where each index
+    # is within the appropriate range.
+    indices_size = np.prod(indices_shape)
+    indices = np.reshape(np.arange(indices_size), indices_shape)
+    indices = indices % params_shape[axis]
+
+    # Perform repeated (batched) gather operations with numpy, to find the
+    # expected result.
+    expected = self._batchNumpyGather(params, indices, axis, batch_dims)
+
+    # On Windows, we get an exception if we pass in the transformed numpy
+    # arrays ("Failed to convert numpy ndarray to a Tensor (Unsupported
+    # feed type)."); so convert them back to lists before calling tf.gather.
+    params = params.tolist()
+    indices = indices.tolist()
+
+    result = array_ops.gather(params, indices, axis=axis, batch_dims=batch_dims)
+    self.assertAllEqual(output_shape, result.shape.as_list())
+    self.assertAllEqual(expected, result)
+
+  def _batchNumpyGather(self, params, indices, axis, batch_dims):
+    """Performs a batch gather by making recursive calls to np.take().
+
+    This is used by testBatchDims() to construct the expected value.
+
+    Args:
+      params: A numpy array
+      indices: A numpy array
+      axis: An integer
+      batch_dims: An integer
+    Returns:
+      A numpy array
+    """
+    if batch_dims == 0:
+      return np.take(params, indices, axis=axis)
+    self.assertEqual(params.shape[0], indices.shape[0])
+    if axis > 0:
+      axis -= 1
+    return np.stack([
+        self._batchNumpyGather(params[i], indices[i], axis, batch_dims - 1)
+        for i in range(params.shape[0])
+    ])
+
+  def testSkipEagerErrors(self):
+    if context.executing_eagerly():
+      return
+    with self.assertRaisesRegexp(ValueError, r"tf\.gather does not allow.*"):
+      array_ops.gather(
+          params=[1, 2],
+          batch_dims=1,
+          indices=array_ops.placeholder(dtypes.int32))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testErrors(self):
+
+    with self.assertRaisesRegexp(
+        ValueError, r"batch_dims = 2 must be less than rank\(indices\) = 2"):
+      array_ops.gather(
+          params=[[1, 2], [3, 4]], indices=[[1, 2], [3, 4]], batch_dims=2)
+
+    with self.assertRaisesRegexp(
+        ValueError, r"batch_dims = 1 must be less than rank\(params\) = 1"):
+      array_ops.gather(
+          params=[1, 2, 3, 4], indices=[[1, 2], [3, 4]], batch_dims=1)
+
+    with self.assertRaisesRegexp(
+        ValueError, r"batch_dims = 1 must be less than or equal to axis = 0"):
+      array_ops.gather(
+          params=[[1, 2], [3, 4]],
+          indices=[[1, 2], [3, 4]],
+          batch_dims=1,
+          axis=0)
+
+    one = array_ops.ones((), dtypes.int32)
+    with self.assertRaisesRegexp(TypeError, "batch_dims must be an int"):
+      array_ops.gather(params=[[1]], indices=[[1]], batch_dims=one)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/init_ops_test.py b/tensorflow/python/kernel_tests/init_ops_test.py
index 09b9944baa1d92bfbcd484f5dba45cea28e6eafe..4b9681afd2cac5660107ca8072770f66944ec2a4 100644
--- a/tensorflow/python/kernel_tests/init_ops_test.py
+++ b/tensorflow/python/kernel_tests/init_ops_test.py
@@ -592,6 +592,22 @@ class LinSpaceTest(test.TestCase):
       self.assertArrayNear(self._LinSpace(5., 5., 3), np.array([5.] * 3), 1e-5)
       self.assertArrayNear(self._LinSpace(5., 5., 4), np.array([5.] * 4), 1e-5)
 
+  def testEndpointsAreExact(self):
+    for self.force_gpu in self._gpu_modes():
+      # Test some cases that produce last values not equal to "stop" when
+      # computed via start + (num - 1) * ((stop - start) / (num - 1)), since
+      # float arithmetic will introduce error through precision loss.
+      self.assertAllEqual(
+          self._LinSpace(0., 1., 42)[[0, -1]], np.array([0., 1.], np.float32))
+      self.assertAllEqual(
+          self._LinSpace(-1., 0., 42)[[0, -1]], np.array([-1., 0.], np.float32))
+      self.assertAllEqual(
+          self._LinSpace(.1, .2, 4)[[0, -1]], np.array([.1, .2], np.float32))
+      # Check a case for float64 error too.
+      self.assertAllEqual(
+          self._LinSpace(np.array(0., np.float64), .1, 12)[[0, -1]],
+          np.array([0., .1], np.float64))
+
 
 class DeviceTest(test.TestCase):
 
diff --git a/tensorflow/python/kernel_tests/linalg/BUILD b/tensorflow/python/kernel_tests/linalg/BUILD
index ba9e64979a48ccce82a283e74a1a024c4bcceda8..53815858e4c8fc9c9dad0246f9ff9933a47459bc 100644
--- a/tensorflow/python/kernel_tests/linalg/BUILD
+++ b/tensorflow/python/kernel_tests/linalg/BUILD
@@ -22,6 +22,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -38,6 +39,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -60,6 +62,7 @@ cuda_py_test(
         "noasan",  # times out, b/63678675
         "optonly",  # times out
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -76,6 +79,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -98,6 +102,7 @@ cuda_py_test(
         "noasan",
         "optonly",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -120,6 +125,7 @@ cuda_py_test(
         "noasan",  # times out, b/63678675
         "optonly",  # times out
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -144,6 +150,7 @@ cuda_py_test(
         "noasan",  # times out, b/63678675
         "optonly",  # times out, b/79171797
     ],
+    xla_enable_strict_auto_jit = False,
 )
 
 cuda_py_test(
@@ -166,6 +173,7 @@ cuda_py_test(
         "noasan",
         "optonly",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -187,6 +195,7 @@ cuda_py_test(
         "noasan",
         "optonly",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -209,6 +218,7 @@ cuda_py_test(
         "noasan",  # times out, b/63678675
         "optonly",  # times out
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -229,6 +239,7 @@ cuda_py_test(
         "noasan",
         "optonly",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -251,6 +262,7 @@ cuda_py_test(
         "noasan",
         "optonly",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -270,6 +282,7 @@ cuda_py_test(
         "noasan",
         "optonly",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -290,6 +303,7 @@ cuda_py_test(
         "noasan",  # times out
         "optonly",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -310,6 +324,7 @@ cuda_py_test(
         "noasan",
         "optonly",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -328,4 +343,5 @@ cuda_py_test(
     ],
     shard_count = 5,
     tags = ["optonly"],  # Test is flaky without optimization.
+    xla_enable_strict_auto_jit = True,
 )
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_adjoint_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_adjoint_test.py
index 1bed4b5268e8d27a25ab735f7e3e1a6c9e4d5d95..f70d8c4e1cd557c34f07a90a39b102830d82dd0f 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_adjoint_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_adjoint_test.py
@@ -114,5 +114,29 @@ class LinearOperatorAdjointTest(
     self.assertEqual("my_operator_adjoint", operator.name)
 
 
+class LinearOperatorAdjointNonSquareTest(
+    linear_operator_test_util.NonSquareLinearOperatorDerivedClassTest):
+  """Tests done in the base class NonSquareLinearOperatorDerivedClassTest."""
+
+  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+    shape_before_adjoint = list(build_info.shape)
+    # We need to swap the last two dimensions because we are taking the adjoint
+    # of this operator
+    shape_before_adjoint[-1], shape_before_adjoint[-2] = (
+        shape_before_adjoint[-2], shape_before_adjoint[-1])
+    matrix = linear_operator_test_util.random_normal(
+        shape_before_adjoint, dtype=dtype)
+
+    lin_op_matrix = matrix
+
+    if use_placeholder:
+      lin_op_matrix = array_ops.placeholder_with_default(matrix, shape=None)
+
+    operator = LinearOperatorAdjoint(
+        linalg.LinearOperatorFullMatrix(lin_op_matrix))
+
+    return operator, linalg.adjoint(matrix)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_algebra_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_algebra_test.py
index 8e296c026c09b36afd39b891befb767a222f5f19..12da8659caca2dcbd8e981dd7124b52737bff970 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_algebra_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_algebra_test.py
@@ -26,13 +26,59 @@ from tensorflow.python.ops.linalg import matmul_registrations  # pylint: disable
 from tensorflow.python.platform import test
 
 # pylint: disable=protected-access
+_ADJOINTS = linear_operator_algebra._ADJOINTS
+_registered_adjoint = linear_operator_algebra._registered_adjoint
 _CHOLESKY_DECOMPS = linear_operator_algebra._CHOLESKY_DECOMPS
-_MATMUL = linear_operator_algebra._MATMUL
 _registered_cholesky = linear_operator_algebra._registered_cholesky
+_INVERSES = linear_operator_algebra._INVERSES
+_registered_inverse = linear_operator_algebra._registered_inverse
+_MATMUL = linear_operator_algebra._MATMUL
 _registered_matmul = linear_operator_algebra._registered_matmul
 # pylint: enable=protected-access
 
 
+class AdjointTest(test.TestCase):
+
+  def testRegistration(self):
+
+    class CustomLinOp(linear_operator.LinearOperator):
+
+      def _matmul(self, a):
+        pass
+
+      def _shape(self):
+        return tensor_shape.TensorShape([1, 1])
+
+      def _shape_tensor(self):
+        pass
+
+    # Register Adjoint to a lambda that spits out the name parameter
+    @linear_operator_algebra.RegisterAdjoint(CustomLinOp)
+    def _adjoint(a):  # pylint: disable=unused-argument,unused-variable
+      return "OK"
+
+    self.assertEqual("OK", CustomLinOp(dtype=None).adjoint())
+
+  def testRegistrationFailures(self):
+
+    class CustomLinOp(linear_operator.LinearOperator):
+      pass
+
+    with self.assertRaisesRegexp(TypeError, "must be callable"):
+      linear_operator_algebra.RegisterAdjoint(CustomLinOp)("blah")
+
+    # First registration is OK
+    linear_operator_algebra.RegisterAdjoint(CustomLinOp)(lambda a: None)
+
+    # Second registration fails
+    with self.assertRaisesRegexp(ValueError, "has already been registered"):
+      linear_operator_algebra.RegisterAdjoint(CustomLinOp)(lambda a: None)
+
+  def testExactAdjointRegistrationsAllMatch(self):
+    for (k, v) in _ADJOINTS.items():
+      self.assertEqual(v, _registered_adjoint(k[0]))
+
+
 class CholeskyTest(test.TestCase):
 
   def testRegistration(self):
@@ -129,5 +175,51 @@ class MatmulTest(test.TestCase):
       self.assertEqual(v, _registered_matmul(k[0], k[1]))
 
 
+class InverseTest(test.TestCase):
+
+  def testRegistration(self):
+
+    class CustomLinOp(linear_operator.LinearOperator):
+
+      def _matmul(self, a):
+        pass
+
+      def _shape(self):
+        return tensor_shape.TensorShape([1, 1])
+
+      def _shape_tensor(self):
+        pass
+
+    # Register Inverse to a lambda that spits out the name parameter
+    @linear_operator_algebra.RegisterInverse(CustomLinOp)
+    def _inverse(a):  # pylint: disable=unused-argument,unused-variable
+      return "OK"
+
+    with self.assertRaisesRegexp(ValueError, "singular"):
+      CustomLinOp(dtype=None, is_non_singular=False).inverse()
+
+    self.assertEqual("OK", CustomLinOp(
+        dtype=None, is_non_singular=True).inverse())
+
+  def testRegistrationFailures(self):
+
+    class CustomLinOp(linear_operator.LinearOperator):
+      pass
+
+    with self.assertRaisesRegexp(TypeError, "must be callable"):
+      linear_operator_algebra.RegisterInverse(CustomLinOp)("blah")
+
+    # First registration is OK
+    linear_operator_algebra.RegisterInverse(CustomLinOp)(lambda a: None)
+
+    # Second registration fails
+    with self.assertRaisesRegexp(ValueError, "has already been registered"):
+      linear_operator_algebra.RegisterInverse(CustomLinOp)(lambda a: None)
+
+  def testExactRegistrationsAllMatch(self):
+    for (k, v) in _INVERSES.items():
+      self.assertEqual(v, _registered_inverse(k[0]))
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_block_diag_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_block_diag_test.py
index f0cc5d709f9bfec2e3dcfadecc8f949bb6ce6e6d..28f8d20f61515328261771684d2571b80b686c64 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_block_diag_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_block_diag_test.py
@@ -136,6 +136,27 @@ class SquareLinearOperatorBlockDiagTest(
     self.assertTrue(operator.is_non_singular)
     self.assertFalse(operator.is_self_adjoint)
 
+  def test_block_diag_adjoint_type(self):
+    matrix = [[1., 0.], [0., 1.]]
+    operator = block_diag.LinearOperatorBlockDiag(
+        [
+            linalg.LinearOperatorFullMatrix(
+                matrix,
+                is_non_singular=True,
+            ),
+            linalg.LinearOperatorFullMatrix(
+                matrix,
+                is_non_singular=True,
+            ),
+        ],
+        is_non_singular=True,
+    )
+    adjoint = operator.adjoint()
+    self.assertIsInstance(
+        adjoint,
+        block_diag.LinearOperatorBlockDiag)
+    self.assertEqual(2, len(adjoint.operators))
+
   def test_block_diag_cholesky_type(self):
     matrix = [[1., 0.], [0., 1.]]
     operator = block_diag.LinearOperatorBlockDiag(
@@ -155,20 +176,38 @@ class SquareLinearOperatorBlockDiagTest(
         is_self_adjoint=True,
     )
     cholesky_factor = operator.cholesky()
-    self.assertTrue(isinstance(
+    self.assertIsInstance(
         cholesky_factor,
-        block_diag.LinearOperatorBlockDiag))
+        block_diag.LinearOperatorBlockDiag)
     self.assertEqual(2, len(cholesky_factor.operators))
-    self.assertTrue(
-        isinstance(
-            cholesky_factor.operators[0],
-            lower_triangular.LinearOperatorLowerTriangular)
+    self.assertIsInstance(
+        cholesky_factor.operators[0],
+        lower_triangular.LinearOperatorLowerTriangular)
+    self.assertIsInstance(
+        cholesky_factor.operators[1],
+        lower_triangular.LinearOperatorLowerTriangular
     )
-    self.assertTrue(
-        isinstance(
-            cholesky_factor.operators[1],
-            lower_triangular.LinearOperatorLowerTriangular)
+
+  def test_block_diag_inverse_type(self):
+    matrix = [[1., 0.], [0., 1.]]
+    operator = block_diag.LinearOperatorBlockDiag(
+        [
+            linalg.LinearOperatorFullMatrix(
+                matrix,
+                is_non_singular=True,
+            ),
+            linalg.LinearOperatorFullMatrix(
+                matrix,
+                is_non_singular=True,
+            ),
+        ],
+        is_non_singular=True,
     )
+    inverse = operator.inverse()
+    self.assertIsInstance(
+        inverse,
+        block_diag.LinearOperatorBlockDiag)
+    self.assertEqual(2, len(inverse.operators))
 
   def test_is_non_singular_auto_set(self):
     # Matrix with two positive eigenvalues, 11 and 8.
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
index dcbc0dd7c97184df150fc7094a28441fcfaa1257..5c3220e60f49e872bbf2b4f2f1bb63a2271f7b1d 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
@@ -187,6 +187,11 @@ class LinearOperatorDiagTest(
         linalg_lib.LinearOperatorDiag))
     self.assertAllClose([6., 9.], self.evaluate(operator_matmul.diag))
 
+  def test_diag_adjoint_type(self):
+    diag = [1., 3., 5., 8.]
+    operator = linalg.LinearOperatorDiag(diag, is_non_singular=True)
+    self.assertIsInstance(operator.adjoint(), linalg.LinearOperatorDiag)
+
   def test_diag_cholesky_type(self):
     diag = [1., 3., 5., 8.]
     operator = linalg.LinearOperatorDiag(
@@ -194,9 +199,12 @@ class LinearOperatorDiagTest(
         is_positive_definite=True,
         is_self_adjoint=True,
     )
-    self.assertTrue(isinstance(
-        operator.cholesky(),
-        linalg.LinearOperatorDiag))
+    self.assertIsInstance(operator.cholesky(), linalg.LinearOperatorDiag)
+
+  def test_diag_inverse_type(self):
+    diag = [1., 3., 5., 8.]
+    operator = linalg.LinearOperatorDiag(diag, is_non_singular=True)
+    self.assertIsInstance(operator.inverse(), linalg.LinearOperatorDiag)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py
index 2da5e712d77b88ca6bb20a5f0920335f00c7b594..55eff59e03e83d12e7019922758ef065fe2e2812 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py
@@ -259,15 +259,26 @@ class LinearOperatorIdentityTest(
           is_non_singular=None,
       )
 
+  def test_identity_adjoint_type(self):
+    operator = linalg_lib.LinearOperatorIdentity(
+        num_rows=2, is_non_singular=True)
+    self.assertIsInstance(
+        operator.adjoint(), linalg_lib.LinearOperatorIdentity)
+
   def test_identity_cholesky_type(self):
     operator = linalg_lib.LinearOperatorIdentity(
         num_rows=2,
         is_positive_definite=True,
         is_self_adjoint=True,
     )
-    self.assertTrue(isinstance(
-        operator.cholesky(),
-        linalg_lib.LinearOperatorIdentity))
+    self.assertIsInstance(
+        operator.cholesky(), linalg_lib.LinearOperatorIdentity)
+
+  def test_identity_inverse_type(self):
+    operator = linalg_lib.LinearOperatorIdentity(
+        num_rows=2, is_non_singular=True)
+    self.assertIsInstance(
+        operator.inverse(), linalg_lib.LinearOperatorIdentity)
 
 
 class LinearOperatorScaledIdentityTest(
@@ -458,7 +469,7 @@ class LinearOperatorScaledIdentityTest(
         is_positive_definite=False, is_non_singular=True)
     self.assertFalse(operator.is_positive_definite)
     self.assertTrue(operator.is_non_singular)
-    self.assertTrue(operator.is_self_adjoint is None)
+    self.assertTrue(operator.is_self_adjoint)  # Auto-set due to real multiplier
 
   def test_identity_matmul(self):
     operator1 = linalg_lib.LinearOperatorIdentity(num_rows=2)
@@ -491,9 +502,19 @@ class LinearOperatorScaledIdentityTest(
         is_positive_definite=True,
         is_self_adjoint=True,
     )
-    self.assertTrue(isinstance(
+    self.assertIsInstance(
         operator.cholesky(),
-        linalg_lib.LinearOperatorScaledIdentity))
+        linalg_lib.LinearOperatorScaledIdentity)
+
+  def test_scaled_identity_inverse_type(self):
+    operator = linalg_lib.LinearOperatorScaledIdentity(
+        num_rows=2,
+        multiplier=3.,
+        is_non_singular=True,
+    )
+    self.assertIsInstance(
+        operator.inverse(),
+        linalg_lib.LinearOperatorScaledIdentity)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py
index 513b246803233f1117b48f1a3d413be42f15238a..166188f6cecac1c472d0855069c61fe0e2937b02 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py
@@ -100,7 +100,7 @@ class SquareLinearOperatorKroneckerTest(
 
   @property
   def _tests_to_skip(self):
-    return ["det", "solve", "solve_with_broadcast"]
+    return ["det", "inverse", "solve", "solve_with_broadcast"]
 
   def _operator_and_matrix(
       self, build_info, dtype, use_placeholder,
@@ -192,6 +192,23 @@ class SquareLinearOperatorKroneckerTest(
     with self.assertRaisesRegexp(ValueError, ">=1 operators"):
       kronecker.LinearOperatorKronecker([])
 
+  def test_kronecker_adjoint_type(self):
+    matrix = [[1., 0.], [0., 1.]]
+    operator = kronecker.LinearOperatorKronecker(
+        [
+            linalg.LinearOperatorFullMatrix(
+                matrix, is_non_singular=True),
+            linalg.LinearOperatorFullMatrix(
+                matrix, is_non_singular=True),
+        ],
+        is_non_singular=True,
+    )
+    adjoint = operator.adjoint()
+    self.assertIsInstance(
+        adjoint,
+        kronecker.LinearOperatorKronecker)
+    self.assertEqual(2, len(adjoint.operators))
+
   def test_kronecker_cholesky_type(self):
     matrix = [[1., 0.], [0., 1.]]
     operator = kronecker.LinearOperatorKronecker(
@@ -211,20 +228,33 @@ class SquareLinearOperatorKroneckerTest(
         is_self_adjoint=True,
     )
     cholesky_factor = operator.cholesky()
-    self.assertTrue(isinstance(
+    self.assertIsInstance(
         cholesky_factor,
-        kronecker.LinearOperatorKronecker))
+        kronecker.LinearOperatorKronecker)
     self.assertEqual(2, len(cholesky_factor.operators))
-    self.assertTrue(
-        isinstance(
-            cholesky_factor.operators[0],
-            lower_triangular.LinearOperatorLowerTriangular)
-    )
-    self.assertTrue(
-        isinstance(
-            cholesky_factor.operators[1],
-            lower_triangular.LinearOperatorLowerTriangular)
+    self.assertIsInstance(
+        cholesky_factor.operators[0],
+        lower_triangular.LinearOperatorLowerTriangular)
+    self.assertIsInstance(
+        cholesky_factor.operators[1],
+        lower_triangular.LinearOperatorLowerTriangular)
+
+  def test_kronecker_inverse_type(self):
+    matrix = [[1., 0.], [0., 1.]]
+    operator = kronecker.LinearOperatorKronecker(
+        [
+            linalg.LinearOperatorFullMatrix(
+                matrix, is_non_singular=True),
+            linalg.LinearOperatorFullMatrix(
+                matrix, is_non_singular=True),
+        ],
+        is_non_singular=True,
     )
+    inverse = operator.inverse()
+    self.assertIsInstance(
+        inverse,
+        kronecker.LinearOperatorKronecker)
+    self.assertEqual(2, len(inverse.operators))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_test.py
index 18e13a76a097f72887cacc5d3de40b8d6babcb52..8f8b15e8ed8190b28cc7ae60d8411d74389a9be1 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_test.py
@@ -214,7 +214,7 @@ class LinearOperatorTest(test.TestCase):
     operator = LinearOperatorMatmulSolve(matrix, is_square=True)
     self.assertTrue(operator.is_square)
 
-  @test_util.run_v1_only("b/120545219")
+  @test_util.run_deprecated_v1
   def test_linear_operator_matmul_hints_closed(self):
     matrix = array_ops.placeholder(dtypes.float32)
     operator1 = LinearOperatorMatmulSolve(matrix)
@@ -241,7 +241,7 @@ class LinearOperatorTest(test.TestCase):
     self.assertTrue(operator_matmul.is_self_adjoint)
     self.assertEqual(None, operator_matmul.is_positive_definite)
 
-  @test_util.run_v1_only("b/120545219")
+  @test_util.run_deprecated_v1
   def test_linear_operator_matmul_hints_false(self):
     matrix = array_ops.placeholder(dtypes.float32)
     operator1 = LinearOperatorMatmulSolve(
@@ -274,7 +274,7 @@ class LinearOperatorTest(test.TestCase):
     self.assertEqual(None, operator_matmul.is_self_adjoint)
     self.assertEqual(None, operator_matmul.is_positive_definite)
 
-  @test_util.run_v1_only("b/120545219")
+  @test_util.run_deprecated_v1
   def test_linear_operator_matmul_hint_infer_square(self):
     matrix1 = array_ops.placeholder(shape=[2, 3], dtype=dtypes.float32)
     matrix2 = array_ops.placeholder(shape=[3, 2], dtype=dtypes.float32)
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py
index eb0b8ef127749e9e5709861d14b143877790bffd..10651d3c8afa0e29766d20c3dc8177af94678336 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py
@@ -36,7 +36,8 @@ class LinearOperatorZerosTest(
 
   @property
   def _tests_to_skip(self):
-    return ["cholesky", "log_abs_det", "solve", "solve_with_broadcast"]
+    return [
+        "cholesky", "log_abs_det", "inverse", "solve", "solve_with_broadcast"]
 
   @property
   def _operator_build_infos(self):
diff --git a/tensorflow/python/kernel_tests/linalg_grad_test.py b/tensorflow/python/kernel_tests/linalg_grad_test.py
index ff84221611813cf37537b843087faa70ae1d3e8e..1494329f806eb13a0170c56f248f49e502038556 100644
--- a/tensorflow/python/kernel_tests/linalg_grad_test.py
+++ b/tensorflow/python/kernel_tests/linalg_grad_test.py
@@ -216,6 +216,7 @@ if __name__ == '__main__':
           shape = (rows, cols)
           name = '%s_%s_%s' % (dtype.__name__, '_'.join(map(str, shape)),
                                l2_regularization)
+          float32_tol_fudge = 5.1 if l2_regularization == 1e-6 else 4.0
           _AddTest(
               MatrixBinaryFunctorGradientTest,
               'MatrixSolveLsGradient',
@@ -226,6 +227,6 @@ if __name__ == '__main__':
                    linalg_ops.matrix_solve_ls(a, b, l)),
                   dtype,
                   shape,
-                  float32_tol_fudge=4.0))
+                  float32_tol_fudge))
 
   test_lib.main()
diff --git a/tensorflow/python/kernel_tests/list_ops_test.py b/tensorflow/python/kernel_tests/list_ops_test.py
index 489f6c9b00471e6c10a8a04830613e9c5b99661a..e3501294cab41a5c48e563607f163b424f594113 100644
--- a/tensorflow/python/kernel_tests/list_ops_test.py
+++ b/tensorflow/python/kernel_tests/list_ops_test.py
@@ -33,6 +33,7 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_list_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import list_ops
 from tensorflow.python.ops import math_ops
@@ -89,6 +90,58 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
       l = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
       self.evaluate(l)
 
+  def testPopUninitializedTensorUseListElementShape(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=[2, 3], num_elements=3)
+    _, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(e, np.zeros((2, 3)))
+
+  def testPopUninitializedTensorUseSpecifiedElementShape(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=[None, 3], num_elements=3)
+    _, e = gen_list_ops.tensor_list_pop_back(
+        l, element_dtype=dtypes.float32, element_shape=[4, 3])
+    self.assertAllEqual(e, np.zeros((4, 3)))
+
+  def testPopUninitializedTensorWithInvalidElementShapeFails(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=None, num_elements=3)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "Trying to read an uninitialized tensor but "
+        "element_shape is not fully defined"):
+      _, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
+      self.evaluate(e)
+
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=[None, 2], num_elements=3)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        r"Incompatible shapes during merge: \[1,3\] vs. \[\?,2\]"):
+      _, e = gen_list_ops.tensor_list_pop_back(
+          l, element_dtype=dtypes.float32, element_shape=[1, 3])
+      self.evaluate(e)
+
+  def testPushGetGrad(self):
+    with backprop.GradientTape() as tape:
+      l = list_ops.empty_tensor_list(
+          element_dtype=dtypes.float32, element_shape=None)
+      c0 = constant_op.constant(5.0)
+      c1 = constant_op.constant([10.0, 20.0])
+      tape.watch(c0)
+      tape.watch(c1)
+      l = list_ops.tensor_list_push_back(l, c0)
+      l = list_ops.tensor_list_push_back(l, c1)
+      t1 = list_ops.tensor_list_get_item(l, 1, element_dtype=dtypes.float32)
+      self.assertAllEqual(self.evaluate(t1), [10.0, 20.0])
+      # t1 == c1 so the gradient should be [0., [1., 1.]]
+      # This tests that the gradient of push_back correctly converts DT_INVALID
+      # tensors to zeros. The list returned by the gradient of GetItem will
+      # have only have tensor at index 1 set and others set to DT_INVALID.
+      dt0, dt1 = tape.gradient(t1, [c0, c1])
+      self.assertAllEqual(self.evaluate(dt1), [1.0, 1.0])
+      self.assertEqual(self.evaluate(dt0), 0.0)
+
   def _testStack(self, max_num_elements):
     l = list_ops.empty_tensor_list(
         element_dtype=dtypes.float32,
@@ -130,7 +183,8 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
     # Should raise an error when the element tensors do not all have the same
     # shape.
-    with self.assertRaisesRegexp(errors.InvalidArgumentError, "unequal shapes"):
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 "Incompatible ranks during merge: 0 vs. 1"):
       l = list_ops.tensor_list_push_back(l, constant_op.constant([3.0, 4.0]))
       t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
       self.evaluate(t)
@@ -151,7 +205,9 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
     # Should raise an error when the element tensors do not all have the same
     # shape.
-    with self.assertRaisesRegexp(errors.InvalidArgumentError, "unequal shapes"):
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        r"Incompatible shapes during merge: \[1\] vs. \[2\]"):
       l = list_ops.tensor_list_push_back(l, constant_op.constant([2.0, 3.0]))
       t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
       self.evaluate(t)
@@ -189,6 +245,54 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
       t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
       self.evaluate(t)
 
+  def _testStackWithUninitializedTensors(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=[], num_elements=3)
+    t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(t, [0., 0., 0.])
+
+  def testStackWithUninitializedTensors(self):
+    self._testStackWithUninitializedTensors()
+
+  def testStackWithUninitializedTensorsGpu(self):
+    if not context.num_gpus():
+      return
+    with context.device("gpu:0"):
+      self._testStackWithUninitializedTensors()
+
+  def _testStackWithUninitializedTensorsInferShape(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=None, num_elements=3)
+    l = list_ops.tensor_list_set_item(l, 1, [1., 2.])
+    t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(t, [[0., 0.], [1., 2.], [0., 0.]])
+
+  def testStackWithUninitializedTensorsInferShape(self):
+    self._testStackWithUninitializedTensorsInferShape()
+
+  def testStackWithUninitializedTensorsInferShapeGpu(self):
+    if not context.num_gpus():
+      return
+    with context.device("gpu:0"):
+      self._testStackWithUninitializedTensorsInferShape()
+
+  def testStackReservedListWithNoElementsAndPartialElementShapeFails(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=None, num_elements=3)
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 "Tried to stack list which only contains "
+                                 "uninitialized tensors and has a "
+                                 "non-fully-defined element_shape: <unknown>"):
+      t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+      self.evaluate(t)
+
+  def testStackUsingSpecifiedElementShape(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=None, num_elements=3)
+    t = gen_list_ops.tensor_list_stack(
+        l, element_dtype=dtypes.float32, element_shape=[])
+    self.assertAllEqual(self.evaluate(t), np.zeros((3,)))
+
   @parameterized.named_parameters(("NoMaxNumElements", None),
                                   ("WithMaxNumElements", 2))
   def testGatherGrad(self, max_num_elements):
@@ -227,7 +331,8 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
     # Should raise an error when the requested tensors do not all have the same
     # shape.
-    with self.assertRaisesRegexp(errors.InvalidArgumentError, "unequal shapes"):
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 "Incompatible ranks during merge: 0 vs. 1"):
       t = list_ops.tensor_list_gather(l, [0, 2], element_dtype=dtypes.float32)
       self.evaluate(t)
 
@@ -251,7 +356,9 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
     # Should raise an error when the requested tensors do not all have the same
     # shape.
-    with self.assertRaisesRegexp(errors.InvalidArgumentError, "unequal shapes"):
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        r"Incompatible shapes during merge: \[1\] vs. \[2\]"):
       t = list_ops.tensor_list_gather(l, [0, 2], element_dtype=dtypes.float32)
       self.evaluate(t)
 
@@ -290,12 +397,132 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
       t = list_ops.tensor_list_gather(l, [], element_dtype=dtypes.float32)
       self.evaluate(t)
 
+  def testGatherGradWithNonContiguousIndices(self):
+    with backprop.GradientTape(persistent=True) as tape:
+      t = constant_op.constant([1.0, 2.0, 3.0])
+      l = list_ops.tensor_list_from_tensor(t, element_shape=[])
+      c = constant_op.constant(5.0)
+      tape.watch(c)
+      l = list_ops.tensor_list_set_item(l, 1, c)
+      t = list_ops.tensor_list_gather(l, [1], element_dtype=dtypes.float32)
+      self.assertAllEqual(self.evaluate(t), [5.0])
+      s = t[0] * t[0]
+    dt = tape.gradient(s, c)
+    self.assertAllEqual(self.evaluate(dt), 10.0)
+    dl = tape.gradient(t, l)
+    dl_length = list_ops.tensor_list_length(dl)
+    self.assertAllEqual(self.evaluate(dl_length), 3)
+
+  def _testGatherWithUninitializedTensors(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=[], num_elements=3)
+    t = list_ops.tensor_list_gather(l, [0, 2], element_dtype=dtypes.float32)
+    self.assertAllEqual(self.evaluate(t), [0., 0.])
+
+  def testGatherWithUninitializedTensors(self):
+    self._testGatherWithUninitializedTensors()
+
+  def testGatherWithUninitializedTensorsGpu(self):
+    if not context.num_gpus():
+      return
+    with context.device("gpu:0"):
+      self._testGatherWithUninitializedTensors()
+
+  def _testGatherWithUninitializedTensorsInferShape(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=None, num_elements=3)
+    l = list_ops.tensor_list_set_item(l, 1, [1., 2.])
+    t = list_ops.tensor_list_gather(l, [1, 2], element_dtype=dtypes.float32)
+    self.assertAllEqual(self.evaluate(t), [[1., 2.], [0., 0.]])
+
+  def testGatherWithUninitializedTensorsInferShape(self):
+    self._testGatherWithUninitializedTensorsInferShape()
+
+  def testGatherWithUninitializedTensorsInferShapeGpu(self):
+    if not context.num_gpus():
+      return
+    with context.device("gpu:0"):
+      self._testGatherWithUninitializedTensorsInferShape()
+
+  def testGatherReservedListWithNoElementsAndPartialElementShapeFails(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=None, num_elements=3)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "Tried to gather uninitialized tensors from a"
+        " list with non-fully-defined element_shape"):
+      t = list_ops.tensor_list_gather(l, [0], element_dtype=dtypes.float32)
+      self.evaluate(t)
+
+  def testGatherUsingSpecifiedElementShape(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=None, num_elements=3)
+    t = gen_list_ops.tensor_list_gather(
+        l, [0, 1, 2], element_dtype=dtypes.float32, element_shape=[])
+    self.assertAllEqual(self.evaluate(t), np.zeros((3,)))
+
+  def testScatterOutputListSize(self):
+    c0 = constant_op.constant([1.0, 2.0])
+    l = list_ops.tensor_list_scatter(c0, [1, 3], [])
+    # TensorListScatter should return a list with size largest index + 1.
+    self.assertAllEqual(list_ops.tensor_list_length(l), 4)
+
+  def testScatterOutputListSizeWithNumElementsSpecified(self):
+    c0 = constant_op.constant([1.0, 2.0])
+    l = gen_list_ops.tensor_list_scatter_v2(
+        c0, [1, 3], list_ops._build_element_shape([]), num_elements=5)
+    # TensorListScatter should return a list with size num_elements.
+    self.assertAllEqual(list_ops.tensor_list_length(l), 5)
+
+  def testScatterFailsWhenIndexLargerThanNumElements(self):
+    c0 = constant_op.constant([1.0, 2.0])
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "TensorListScatter: Trying to scatter at index 3 in list with size 3"):
+      l = gen_list_ops.tensor_list_scatter_v2(
+          c0, [1, 3], list_ops._build_element_shape([]), num_elements=3)
+      self.evaluate(l)
+
+  def testScatterFailsWithInvalidNumElements(self):
+    c0 = constant_op.constant([1.0, 2.0])
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "TensorListScatter expects num_elements >= -1, found: -2"):
+      l = gen_list_ops.tensor_list_scatter_v2(
+          c0, [1, 3], list_ops._build_element_shape([]), num_elements=-2)
+      self.evaluate(l)
+
+  def testScatterWithInvalidRowsInInputTensorFails(self):
+    c0 = constant_op.constant([1.0, 2.0])
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "Invalid number of rows in input tensor. Expected: 3 Actual: 2"):
+      l = list_ops.tensor_list_scatter(c0, [1, 0, 2], [])
+      self.evaluate(l)
+
+  def testScatterWithNegativeIndicesFails(self):
+    c0 = constant_op.constant([1.0, 2.0])
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "Indices in TensorListScatter must all be non-negative."):
+      l = list_ops.tensor_list_scatter(c0, [-1, -2], element_shape=[])
+      self.evaluate(l)
+
+  def testScatterIntoExistingList(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=[], num_elements=3)
+    l = list_ops.tensor_list_scatter(tensor=[1.], indices=[0], element_shape=[])
+    l = list_ops.tensor_list_scatter(
+        tensor=[2., 3.], indices=[1, 2], element_shape=[], input_handle=l)
+    self.assertAllEqual(
+        list_ops.tensor_list_stack(l, element_dtype=dtypes.float32),
+        [1., 2., 3.])
+
   def testScatterGrad(self):
     with backprop.GradientTape() as tape:
       c0 = constant_op.constant([1.0, 2.0])
       tape.watch(c0)
-      l = list_ops.tensor_list_scatter(
-          c0, [1, 0], ops.convert_to_tensor([], dtype=dtypes.int32))
+      l = list_ops.tensor_list_scatter(c0, [1, 0], element_shape=[])
       t0 = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
       t1 = list_ops.tensor_list_get_item(l, 1, element_dtype=dtypes.float32)
       self.assertAllEqual(self.evaluate(t0), 2.0)
@@ -304,14 +531,27 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     dt = tape.gradient(loss, c0)
     self.assertAllEqual(self.evaluate(dt), [2., 4.])
 
+  def testScatterWithPartialReadGrad(self):
+    with backprop.GradientTape() as tape:
+      c0 = constant_op.constant([1.0, 2.0])
+      tape.watch(c0)
+      l = list_ops.tensor_list_scatter(c0, [1, 0], element_shape=[])
+      t0 = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
+      self.assertAllEqual(self.evaluate(t0), 2.0)
+      loss = t0 * t0
+    dt = tape.gradient(loss, c0)
+    self.assertAllEqual(self.evaluate(dt), [0., 4.])
+
   def testTensorListFromTensor(self):
     t = constant_op.constant([1.0, 2.0])
     l = list_ops.tensor_list_from_tensor(t, element_shape=[])
+    e = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
+    self.assertAllEqual(e, 1.0)
     l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
-    self.assertAllEqual(self.evaluate(e), 2.0)
+    self.assertAllEqual(e, 2.0)
     l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
-    self.assertAllEqual(self.evaluate(e), 1.0)
-    self.assertAllEqual(self.evaluate(list_ops.tensor_list_length(l)), 0)
+    self.assertAllEqual(e, 1.0)
+    self.assertAllEqual(list_ops.tensor_list_length(l), 0)
 
   def testFromTensorGPU(self):
     if not context.num_gpus():
@@ -319,7 +559,7 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     with context.device("gpu:0"):
       self.testTensorListFromTensor()
 
-  def testGetSetItem(self):
+  def testGetSet(self):
     t = constant_op.constant([1.0, 2.0])
     l = list_ops.tensor_list_from_tensor(t, element_shape=[])
     e0 = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
@@ -332,7 +572,22 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     if not context.num_gpus():
       return
     with context.device("gpu:0"):
-      self.testGetSetItem()
+      self.testGetSet()
+
+  def testGetSetReserved(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=[], num_elements=2)
+    e0 = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
+    self.assertAllEqual(e0, 0.0)
+    l = list_ops.tensor_list_set_item(l, 0, 3.0)
+    t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(t, [3.0, 0.0])
+
+  def testGetSetReservedGPU(self):
+    if not context.num_gpus():
+      return
+    with context.device("gpu:0"):
+      self.testGetSetReserved()
 
   def testSetGetGrad(self):
     with backprop.GradientTape() as tape:
@@ -345,6 +600,64 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
       self.assertAllEqual(self.evaluate(e), 10.0)
     self.assertAllEqual(self.evaluate(tape.gradient(e, t)), 2.0)
 
+  def testGetUninitializedTensorUseListElementShape(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=[], num_elements=3)
+    l = list_ops.tensor_list_set_item(l, 0, 5.)
+    e1 = list_ops.tensor_list_get_item(l, 1, element_dtype=dtypes.float32)
+    e2 = list_ops.tensor_list_get_item(l, 2, element_dtype=dtypes.float32)
+    self.assertEqual(self.evaluate(e1), 0.)
+    self.assertEqual(self.evaluate(e2), 0.)
+
+  def testGetUninitializedTensorUseSpecifiedElementShape(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=None, num_elements=3)
+    e0 = gen_list_ops.tensor_list_get_item(
+        l, 0, element_shape=[], element_dtype=dtypes.float32)
+    e1 = gen_list_ops.tensor_list_get_item(
+        l, 1, element_shape=[2, 3], element_dtype=dtypes.float32)
+    self.assertEqual(self.evaluate(e0), 0.)
+    self.assertAllEqual(self.evaluate(e1), np.zeros((2, 3)))
+
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=[None, 3], num_elements=3)
+    e1 = gen_list_ops.tensor_list_get_item(
+        l, 1, element_shape=[2, 3], element_dtype=dtypes.float32)
+    self.assertAllEqual(self.evaluate(e1), np.zeros((2, 3)))
+
+  def testGetUninitializedTensorWithInvalidElementShapeFails(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=None, num_elements=3)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "Trying to read an uninitialized tensor but "
+        "element_shape is not fully defined"):
+      e0 = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
+      self.evaluate(e0)
+
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=[None, 2], num_elements=3)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        r"Incompatible shapes during merge: \[1,3\] vs. \[\?,2\]"):
+      e0 = gen_list_ops.tensor_list_get_item(
+          l, 0, element_dtype=dtypes.float32, element_shape=[1, 3])
+      self.evaluate(e0)
+
+  @test_util.run_deprecated_v1
+  @test_util.enable_control_flow_v2
+  def testSkipEagerSetItemIndexOutOfBounds(self):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32, element_shape=[])
+    e0 = constant_op.constant(5.)
+    l = list_ops.tensor_list_set_item(
+        l, 0, 2. * e0, resize_if_index_out_of_bounds=True)
+    l = list_ops.tensor_list_set_item(
+        l, 1, 1., resize_if_index_out_of_bounds=True)
+    t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+    grad = gradients_impl.gradients(t, e0)[0]
+    self.assertAllEqual(self.evaluate(grad), 2.)
+
   @test_util.run_deprecated_v1
   def testSetOnEmptyListWithMaxNumElementsFails(self):
     l = list_ops.empty_tensor_list(
@@ -666,16 +979,25 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
               list_ops.empty_tensor_list([], dtypes.float32),
               element_dtype=dtypes.float32))
 
-    with self.assertRaisesRegexp(errors.InvalidArgumentError,
-                                 "element shapes are not identical at index 0"):
+    if context.executing_eagerly():
+      expected_error = (
+          errors.InvalidArgumentError,
+          "element shapes are not identical at index 0")
+    else:
+      expected_error = (ValueError, "Shapes must be equal rank")
+    with self.assertRaisesRegexp(*expected_error):
       l_batch_of_vec_tls = array_ops.stack(
           [list_ops.tensor_list_from_tensor([[1.0]], element_shape=[1])] * 2)
       self.evaluate(
           list_ops.tensor_list_concat_lists(l_batch_0, l_batch_of_vec_tls,
                                             element_dtype=dtypes.float32))
 
-    with self.assertRaisesRegexp(errors.InvalidArgumentError,
-                                 r"input_b\[0\].dtype != element_dtype."):
+    if context.executing_eagerly():
+      expected_error = (errors.InvalidArgumentError,
+                        r"input_b\[0\].dtype != element_dtype.")
+    else:
+      expected_error = (ValueError, "input_b.type != element_dtype")
+    with self.assertRaisesRegexp(*expected_error):
       l_batch_of_int_tls = array_ops.stack(
           [list_ops.tensor_list_from_tensor([1], element_shape=[])] * 2)
       self.evaluate(
@@ -720,8 +1042,11 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
       self.evaluate(
           list_ops.tensor_list_push_back_batch(l_batch, [[3.0], [4.0]]))
 
-    with self.assertRaisesRegexp(errors.InvalidArgumentError,
-                                 "Invalid data type at index 0"):
+    if context.executing_eagerly():
+      expected_error = (errors.InvalidArgumentError, "Invalid data type")
+    else:
+      expected_error = (ValueError, "wrong element dtype")
+    with self.assertRaisesRegexp(*expected_error):
       self.evaluate(list_ops.tensor_list_push_back_batch(l_batch, [3, 4]))
 
   def testZerosLike(self):
@@ -917,9 +1242,8 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     l = list_ops.tensor_list_push_back(l, [[0., 1.]])
     l = list_ops.tensor_list_push_back(l, [[2.], [4.]])
     with self.assertRaisesRegexp(
-        errors.InvalidArgumentError,
-        r"Tried to concat tensors with unequal shapes: "
-        r"\[2\] vs \[1\]"):
+        errors.InvalidArgumentError, r"Incompatible shapes during merge: "
+        r"\[2\] vs. \[1\]"):
       t = list_ops.tensor_list_concat(l, element_dtype=dtypes.float32)
       self.evaluate(t)
 
@@ -980,6 +1304,65 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
       t = list_ops.tensor_list_concat(l1, element_dtype=dtypes.float32)
       self.evaluate(t)
 
+  def testConcatWithUninitializedTensorsUseListElementShape(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=[2, 3], num_elements=3)
+    t = list_ops.tensor_list_concat(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(np.zeros((6, 3)), t)
+
+  def testConcatWithUninitializedTensorsUseProvidedElementShape(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=None, num_elements=3)
+    t = list_ops.tensor_list_concat(
+        l, element_dtype=dtypes.float32, element_shape=(2, 3))
+    self.assertAllEqual(np.zeros((6, 3)), t)
+
+  def testConcatWithUninitializedTensorsUseProvidedElementShapeAndLengths(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=None, num_elements=3)
+    t, _ = gen_list_ops.tensor_list_concat_v2(
+        l,
+        element_dtype=dtypes.float32,
+        element_shape=list_ops._build_element_shape((None, 3)),
+        leading_dims=[2, 3, 5])
+    self.assertAllEqual(np.zeros((10, 3)), t)
+    l = list_ops.tensor_list_set_item(l, 1, [[2., 3.], [4., 5.], [6., 7.]])
+    t, _ = gen_list_ops.tensor_list_concat_v2(
+        l,
+        element_dtype=dtypes.float32,
+        element_shape=list_ops._build_element_shape((None, 2)),
+        leading_dims=[2, 3, 4])
+    self.assertAllEqual([[0., 0.], [0., 0.], [2., 3.], [4., 5.], [6., 7.],
+                         [0., 0.], [0., 0.], [0., 0.], [0., 0.]], t)
+
+  def testConcatWithUninitializedTensorsInferShapeFromElements(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=None, num_elements=3)
+    l = list_ops.tensor_list_set_item(l, 1, [[2., 3.], [4., 5.], [6., 7.]])
+    t = list_ops.tensor_list_concat(l, element_dtype=dtypes.float32)
+    self.assertAllEqual([[0., 0.], [0., 0.], [0., 0.], [2., 3.], [4., 5.],
+                         [6., 7.], [0., 0.], [0., 0.], [0., 0.]], t)
+
+  def testConcatWithUninitializedTensorsFailsIfNoElementShape(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=None, num_elements=3)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        r"Trying to concat list with only uninitialized tensors "
+        r"but element_shape_except_first_dim_ is not fully defined"):
+      t = list_ops.tensor_list_concat(l, element_dtype=dtypes.float32)
+      self.evaluate(t)
+
+  def testConcatWithUninitializedTensorsFailsIfNoInputLengths(self):
+    l = list_ops.tensor_list_reserve(
+        element_dtype=dtypes.float32, element_shape=[None, 3], num_elements=3)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        r"List contains uninitialized tensor at index 0"
+        r" but leading_dims has only 0 elements."):
+      t = list_ops.tensor_list_concat(l, element_dtype=dtypes.float32)
+      self.evaluate(t)
+
   def testEvenSplit(self):
 
     def RunTest(input_tensor, lengths, expected_stacked_output):
@@ -1096,6 +1479,47 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
                                    element_shape=[1],
                                    lengths=[1, 1])
 
+  def testResizeGrow(self):
+    l = list_ops.tensor_list_from_tensor([1., 2.], element_shape=[])
+    l = list_ops.tensor_list_resize(l, 4)
+    self.assertEqual(self.evaluate(list_ops.tensor_list_length(l)), 4)
+    self.assertEqual(
+        self.evaluate(
+            list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)),
+        1.)
+    self.assertEqual(
+        self.evaluate(
+            list_ops.tensor_list_get_item(l, 1, element_dtype=dtypes.float32)),
+        2.)
+
+  def testResizeShrink(self):
+    l = list_ops.tensor_list_from_tensor([1., 2., 3.], element_shape=[])
+    l = list_ops.tensor_list_resize(l, 2)
+    self.assertEqual(self.evaluate(list_ops.tensor_list_length(l)), 2)
+    self.assertAllEqual(
+        self.evaluate(
+            list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)),
+        [1., 2.])
+
+  def testResizeWithInvalidSizeFails(self):
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "TensorListSlice expects size to be non-negative"):
+      l = list_ops.tensor_list_from_tensor([1., 2., 3.], element_shape=[])
+      l = list_ops.tensor_list_resize(l, -1)
+      self.evaluate(l)
+
+  @test_util.run_deprecated_v1
+  @test_util.enable_control_flow_v2
+  def testSkipEagerResizeGrad(self):
+    t = constant_op.constant([1., 2., 3.])
+    l = list_ops.tensor_list_from_tensor(t, element_shape=[])
+    l = list_ops.tensor_list_set_item(
+        l, 3, 4., resize_if_index_out_of_bounds=True)
+    t1 = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+    grad = gradients_impl.gradients(t1, t)[0]
+    self.assertAllEqual(self.evaluate(grad), [1., 1., 1.])
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/logging_ops_test.py b/tensorflow/python/kernel_tests/logging_ops_test.py
index 85035e5f7d308c323786bc9fd9017fda89dbec13..3896b138c9462250475c77ccec300a122e3b0a8c 100644
--- a/tensorflow/python/kernel_tests/logging_ops_test.py
+++ b/tensorflow/python/kernel_tests/logging_ops_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import os
+import string
 import sys
 import tempfile
 
@@ -37,6 +38,7 @@ from tensorflow.python.ops import string_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
+
 class LoggingOpsTest(test.TestCase):
 
   @test_util.run_deprecated_v1
@@ -80,6 +82,17 @@ class PrintV2Test(test.TestCase):
       expected = "[0 1 2 ... 7 8 9]"
       self.assertTrue((expected + "\n") in printed.contents())
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testPrintOneStringTensor(self):
+    with self.cached_session():
+      tensor = ops.convert_to_tensor([char for char in string.ascii_lowercase])
+      with self.captureWritesToStream(sys.stderr) as printed:
+        print_op = logging_ops.print_v2(tensor)
+        self.evaluate(print_op)
+
+      expected = "[\"a\" \"b\" \"c\" ... \"x\" \"y\" \"z\"]"
+      self.assertIn((expected + "\n"), printed.contents())
+
   @test_util.run_in_graph_and_eager_modes()
   def testPrintOneTensorVarySummarize(self):
     with self.cached_session():
diff --git a/tensorflow/python/kernel_tests/lookup_ops_test.py b/tensorflow/python/kernel_tests/lookup_ops_test.py
index ad81e0be649f17fe97691b1c5739dbe0bf4a63d2..baa1550fb4f4e54d039d9a5a87bf029fb8efc5e5 100644
--- a/tensorflow/python/kernel_tests/lookup_ops_test.py
+++ b/tensorflow/python/kernel_tests/lookup_ops_test.py
@@ -18,9 +18,13 @@ from __future__ import division
 from __future__ import print_function
 
 import os
+import tempfile
 import numpy as np
+import six
 
 from tensorflow.python.client import session
+from tensorflow.python.data.experimental.ops import counter
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -32,12 +36,13 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
+from tensorflow.python.training import saver
 from tensorflow.python.training import server_lib
+from tensorflow.python.training.tracking import util as trackable
 
 
-class HashTableOpTest(test.TestCase):
+class HashTableTest(test.TestCase):
 
-  @test_util.run_deprecated_v1
   def testHashTable(self):
     with self.cached_session():
       default_val = -1
@@ -45,9 +50,9 @@ class HashTableOpTest(test.TestCase):
       values = constant_op.constant([0, 1, 2], dtypes.int64)
       table = lookup_ops.HashTable(
           lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
-      table.initializer.run()
+      self.evaluate(table.initializer)
 
-      self.assertAllEqual(3, table.size().eval())
+      self.assertAllEqual(3, self.evaluate(table.size()))
 
       input_string = constant_op.constant(["brain", "salad", "tank"])
       output = table.lookup(input_string)
@@ -62,7 +67,6 @@ class HashTableOpTest(test.TestCase):
                             self.evaluate(exported_keys_tensor))
       self.assertItemsEqual([0, 1, 2], self.evaluate(exported_values_tensor))
 
-  @test_util.run_deprecated_v1
   def testHashTableFindHighRank(self):
     with self.cached_session():
       default_val = -1
@@ -70,18 +74,17 @@ class HashTableOpTest(test.TestCase):
       values = constant_op.constant([0, 1, 2], dtypes.int64)
       table = lookup_ops.HashTable(
           lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
-      table.initializer.run()
+      self.evaluate(table.initializer)
 
-      self.assertAllEqual(3, table.size().eval())
+      self.assertAllEqual(3, self.evaluate(table.size()))
 
-      input_string = constant_op.constant(
-          [["brain", "salad"], ["tank", "tarkus"]])
+      input_string = constant_op.constant([["brain", "salad"],
+                                           ["tank", "tarkus"]])
       output = table.lookup(input_string)
 
       result = self.evaluate(output)
       self.assertAllEqual([[0, 1], [-1, -1]], result)
 
-  @test_util.run_deprecated_v1
   def testHashTableInitWithPythonArrays(self):
     with self.cached_session():
       default_val = -1
@@ -90,9 +93,9 @@ class HashTableOpTest(test.TestCase):
       table = lookup_ops.HashTable(
           lookup_ops.KeyValueTensorInitializer(
               keys, values, value_dtype=dtypes.int64), default_val)
-      table.initializer.run()
+      self.evaluate(table.initializer)
 
-      self.assertAllEqual(3, table.size().eval())
+      self.assertAllEqual(3, self.evaluate(table.size()))
 
       input_string = constant_op.constant(["brain", "salad", "tank"])
       output = table.lookup(input_string)
@@ -100,7 +103,6 @@ class HashTableOpTest(test.TestCase):
       result = self.evaluate(output)
       self.assertAllEqual([0, 1, -1], result)
 
-  @test_util.run_deprecated_v1
   def testHashTableInitWithNumPyArrays(self):
     with self.cached_session():
       default_val = -1
@@ -108,9 +110,9 @@ class HashTableOpTest(test.TestCase):
       values = np.array([0, 1, 2], dtype=np.int64)
       table = lookup_ops.HashTable(
           lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
-      table.initializer.run()
+      self.evaluate(table.initializer)
 
-      self.assertAllEqual(3, table.size().eval())
+      self.assertAllEqual(3, self.evaluate(table.size()))
 
       input_string = constant_op.constant(["brain", "salad", "tank"])
       output = table.lookup(input_string)
@@ -118,9 +120,9 @@ class HashTableOpTest(test.TestCase):
       result = self.evaluate(output)
       self.assertAllEqual([0, 1, -1], result)
 
-  @test_util.run_deprecated_v1
   def testMultipleHashTables(self):
-    with self.cached_session() as sess:
+    with self.cached_session():
+
       default_val = -1
       keys = constant_op.constant(["brain", "salad", "surgery"])
       values = constant_op.constant([0, 1, 2], dtypes.int64)
@@ -132,10 +134,12 @@ class HashTableOpTest(test.TestCase):
       table3 = lookup_ops.HashTable(
           lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
 
-      lookup_ops.tables_initializer().run()
-      self.assertAllEqual(3, table1.size().eval())
-      self.assertAllEqual(3, table2.size().eval())
-      self.assertAllEqual(3, table3.size().eval())
+      self.evaluate(table1.initializer)
+      self.evaluate(table2.initializer)
+      self.evaluate(table3.initializer)
+      self.assertAllEqual(3, self.evaluate(table1.size()))
+      self.assertAllEqual(3, self.evaluate(table2.size()))
+      self.assertAllEqual(3, self.evaluate(table3.size()))
 
       input_string = constant_op.constant(["brain", "salad", "tank"])
       output1 = table1.lookup(input_string)
@@ -147,7 +151,6 @@ class HashTableOpTest(test.TestCase):
       self.assertAllEqual([0, 1, -1], out2)
       self.assertAllEqual([0, 1, -1], out3)
 
-  @test_util.run_deprecated_v1
   def testHashTableWithTensorDefault(self):
     with self.cached_session():
       default_val = constant_op.constant(-1, dtypes.int64)
@@ -155,7 +158,7 @@ class HashTableOpTest(test.TestCase):
       values = constant_op.constant([0, 1, 2], dtypes.int64)
       table = lookup_ops.HashTable(
           lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
-      table.initializer.run()
+      self.evaluate(table.initializer)
 
       input_string = constant_op.constant(["brain", "salad", "tank"])
       output = table.lookup(input_string)
@@ -163,15 +166,14 @@ class HashTableOpTest(test.TestCase):
       result = self.evaluate(output)
       self.assertAllEqual([0, 1, -1], result)
 
-  @test_util.run_deprecated_v1
   def testHashTableWithSparseTensorInput(self):
-    with self.cached_session() as sess:
+    with self.cached_session():
       default_val = constant_op.constant(-1, dtypes.int64)
       keys = constant_op.constant(["brain", "salad", "surgery"])
       values = constant_op.constant([0, 1, 2], dtypes.int64)
       table = lookup_ops.HashTable(
           lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
-      table.initializer.run()
+      self.evaluate(table.initializer)
 
       sp_indices = [[0, 0], [0, 1], [1, 0]]
       sp_shape = [2, 2]
@@ -187,7 +189,6 @@ class HashTableOpTest(test.TestCase):
       self.assertAllEqual(sp_indices, out_indices)
       self.assertAllEqual(sp_shape, out_shape)
 
-  @test_util.run_deprecated_v1
   def testSignatureMismatch(self):
     with self.cached_session():
       default_val = -1
@@ -195,12 +196,12 @@ class HashTableOpTest(test.TestCase):
       values = constant_op.constant([0, 1, 2], dtypes.int64)
       table = lookup_ops.HashTable(
           lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
-      table.initializer.run()
+      self.evaluate(table.initializer)
 
       # Ref types do not produce a lookup signature mismatch.
       input_string_ref = variables.Variable("brain")
-      variables.global_variables_initializer().run()
-      self.assertEqual(0, table.lookup(input_string_ref).eval())
+      self.evaluate(input_string_ref.initializer)
+      self.assertEqual(0, self.evaluate(table.lookup(input_string_ref)))
 
       input_string = constant_op.constant([1, 2, 3], dtypes.int64)
       with self.assertRaises(TypeError):
@@ -223,8 +224,9 @@ class HashTableOpTest(test.TestCase):
     with self.cached_session():
       default_val = -1
       table = lookup_ops.HashTable(
-          lookup_ops.KeyValueTensorInitializer(
-              ["a"], [1], value_dtype=dtypes.int64), default_val)
+          lookup_ops.KeyValueTensorInitializer(["a"], [1],
+                                               value_dtype=dtypes.int64),
+          default_val)
 
       input_string = constant_op.constant(["brain", "salad", "surgery"])
       output = table.lookup(input_string)
@@ -240,10 +242,10 @@ class HashTableOpTest(test.TestCase):
       values = constant_op.constant([0, 1, 2], dtypes.int64)
       table = lookup_ops.HashTable(
           lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
-      table.initializer.run()
+      self.evaluate(table.initializer)
 
       with self.assertRaisesOpError("Table already initialized"):
-        table.initializer.run()
+        self.evaluate(table.initializer)
 
   @test_util.run_deprecated_v1
   def testInitializationWithInvalidDimensions(self):
@@ -259,10 +261,9 @@ class HashTableOpTest(test.TestCase):
   @test_util.run_deprecated_v1
   def testMultipleSessions(self):
     # Start a server
-    server = server_lib.Server(
-        {
-            "local0": ["localhost:0"]
-        }, protocol="grpc", start=True)
+    server = server_lib.Server({"local0": ["localhost:0"]},
+                               protocol="grpc",
+                               start=True)
     # Create two sessions sharing the same state
     session1 = session.Session(server.target)
     session2 = session.Session(server.target)
@@ -277,16 +278,15 @@ class HashTableOpTest(test.TestCase):
 
     # Init the table in the first session.
     with session1:
-      table.initializer.run()
-      self.assertAllEqual(3, table.size().eval())
+      self.evaluate(table.initializer)
+      self.assertAllEqual(3, self.evaluate(table.size()))
 
     # Init the table in the second session and verify that we do not get a
     # "Table already initialized" error.
     with session2:
       table.initializer.run()
-      self.assertAllEqual(3, table.size().eval())
+      self.assertAllEqual(3, self.evaluate(table.size()))
 
-  @test_util.run_deprecated_v1
   def testHashTableInt32String(self):
     with self.cached_session():
       default_val = "n/a"
@@ -294,7 +294,7 @@ class HashTableOpTest(test.TestCase):
       values = constant_op.constant(["brain", "salad", "surgery"])
       table = lookup_ops.HashTable(
           lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
-      table.initializer.run()
+      self.evaluate(table.initializer)
 
       input_tensor = constant_op.constant([0, 1, -1])
       output = table.lookup(input_tensor)
@@ -311,7 +311,6 @@ class IndexTableFromFile(test.TestCase):
       f.write("\n".join(values) + "\n")
     return vocabulary_file
 
-  @test_util.run_deprecated_v1
   def test_string_index_table_from_file(self):
     vocabulary_file = self._createVocabFile("f2i_vocab1.txt")
     with self.cached_session():
@@ -319,12 +318,12 @@ class IndexTableFromFile(test.TestCase):
           vocabulary_file=vocabulary_file, num_oov_buckets=1)
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(ids)
-      lookup_ops.tables_initializer().run()
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(ids)
+      self.evaluate(lookup_ops.tables_initializer())
       self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
-  @test_util.run_deprecated_v1
   def test_string_index_table_from_multicolumn_file(self):
     vocabulary_file = self._createVocabFile(
         "f2i_vocab1.txt", values=("brain\t300", "salad\t20", "surgery\t1"))
@@ -336,12 +335,12 @@ class IndexTableFromFile(test.TestCase):
           value_column_index=lookup_ops.TextFileIndex.LINE_NUMBER)
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(ids)
-      lookup_ops.tables_initializer().run()
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(ids)
+      self.evaluate(lookup_ops.tables_initializer())
       self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
-  @test_util.run_deprecated_v1
   def test_string_index_table_from_multicolumn_file_custom_delimiter(self):
     vocabulary_file = self._createVocabFile(
         "f2i_vocab1.txt", values=("brain 300", "salad 20", "surgery 1"))
@@ -354,12 +353,12 @@ class IndexTableFromFile(test.TestCase):
           delimiter=" ")
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(ids)
-      lookup_ops.tables_initializer().run()
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(ids)
+      self.evaluate(lookup_ops.tables_initializer())
       self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
-  @test_util.run_deprecated_v1
   def test_string_index_table_from_file_tensor_filename(self):
     vocabulary_file = self._createVocabFile("f2i_vocab1.txt")
     with self.cached_session():
@@ -368,14 +367,16 @@ class IndexTableFromFile(test.TestCase):
           vocabulary_file=vocabulary_file, num_oov_buckets=1)
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(ids)
-      lookup_ops.tables_initializer().run()
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(ids)
+      self.evaluate(lookup_ops.tables_initializer())
       self.assertAllEqual((1, 2, 3), self.evaluate(ids))
-      self.assertEqual(1,
-                       len(ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS)))
+      if not context.executing_eagerly():
+        self.assertEqual(1,
+                         len(ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS)))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("placeholder usage")
   def test_string_index_table_from_file_placeholder_filename(self):
     vocabulary_file = self._createVocabFile("f2i_vocab1.txt")
     with self.cached_session():
@@ -393,7 +394,6 @@ class IndexTableFromFile(test.TestCase):
       self.assertEqual(0,
                        len(ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS)))
 
-  @test_util.run_deprecated_v1
   def test_int32_index_table_from_file(self):
     vocabulary_file = self._createVocabFile(
         "f2i_vocab2.txt", values=("42", "1", "-1000"))
@@ -405,12 +405,12 @@ class IndexTableFromFile(test.TestCase):
       ids = table.lookup(
           constant_op.constant((1, -1000, 11), dtype=dtypes.int32))
 
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(ids)
-      lookup_ops.tables_initializer().run()
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(ids)
+      self.evaluate(lookup_ops.tables_initializer())
       self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
-  @test_util.run_deprecated_v1
   def test_int64_index_table_from_file(self):
     vocabulary_file = self._createVocabFile(
         "f2i_vocab3.txt", values=("42", "1", "-1000"))
@@ -422,12 +422,12 @@ class IndexTableFromFile(test.TestCase):
       ids = table.lookup(
           constant_op.constant((1, -1000, 11), dtype=dtypes.int64))
 
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(ids)
-      lookup_ops.tables_initializer().run()
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(ids)
+      self.evaluate(lookup_ops.tables_initializer())
       self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
-  @test_util.run_deprecated_v1
   def test_index_table_from_file_with_default_value(self):
     default_value = -42
     vocabulary_file = self._createVocabFile("f2i_vocab4.txt")
@@ -436,12 +436,12 @@ class IndexTableFromFile(test.TestCase):
           vocabulary_file=vocabulary_file, default_value=default_value)
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(ids)
-      lookup_ops.tables_initializer().run()
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(ids)
+      self.evaluate(lookup_ops.tables_initializer())
       self.assertAllEqual((1, 2, default_value), self.evaluate(ids))
 
-  @test_util.run_deprecated_v1
   def test_index_table_from_file_with_oov_buckets(self):
     vocabulary_file = self._createVocabFile("f2i_vocab5.txt")
     with self.cached_session():
@@ -450,9 +450,10 @@ class IndexTableFromFile(test.TestCase):
       ids = table.lookup(
           constant_op.constant(["salad", "surgery", "tarkus", "toccata"]))
 
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(ids)
-      lookup_ops.tables_initializer().run()
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(ids)
+      self.evaluate(lookup_ops.tables_initializer())
       self.assertAllEqual(
           (
               1,  # From vocabulary file.
@@ -490,7 +491,6 @@ class IndexTableFromFile(test.TestCase):
         vocabulary_file=vocabulary_file,
         vocab_size=0)
 
-  @test_util.run_deprecated_v1
   def test_index_table_from_file_with_vocab_size_too_small(self):
     vocabulary_file = self._createVocabFile("f2i_vocab6.txt")
     with self.cached_session():
@@ -498,22 +498,22 @@ class IndexTableFromFile(test.TestCase):
           vocabulary_file=vocabulary_file, vocab_size=2)
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(ids)
-      lookup_ops.tables_initializer().run()
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(ids)
+      self.evaluate(lookup_ops.tables_initializer())
       self.assertAllEqual((1, -1, -1), self.evaluate(ids))
-      self.assertEqual(2, table.size().eval())
+      self.assertEqual(2, self.evaluate(table.size()))
 
-  @test_util.run_deprecated_v1
   def test_index_table_from_file_with_vocab_size_too_large(self):
     vocabulary_file = self._createVocabFile("f2i_vocab7.txt")
     with self.cached_session():
-      table = lookup_ops.index_table_from_file(
-          vocabulary_file=vocabulary_file, vocab_size=4)
-      self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                              "Invalid vocab_size", table.initializer.run)
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "Invalid vocab_size"):
+        table = lookup_ops.index_table_from_file(
+            vocabulary_file=vocabulary_file, vocab_size=4)
+        self.evaluate(table.initializer)
 
-  @test_util.run_deprecated_v1
   def test_index_table_from_file_with_vocab_size(self):
     vocabulary_file = self._createVocabFile("f2i_vocab8.txt")
 
@@ -528,11 +528,12 @@ class IndexTableFromFile(test.TestCase):
           vocabulary_file=vocabulary_file, vocab_size=3)
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(ids)
-      lookup_ops.tables_initializer().run()
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(ids)
+      self.evaluate(lookup_ops.tables_initializer())
       self.assertAllEqual((1, 2, -1), self.evaluate(ids))
-      self.assertEqual(3, table.size().eval())
+      self.assertEqual(3, self.evaluate(table.size()))
 
   def test_index_table_from_file_with_invalid_hashers(self):
     vocabulary_file = self._createVocabFile("invalid_hasher.txt")
@@ -574,6 +575,7 @@ class KeyValueTensorInitializerTest(test.TestCase):
     with ops.Graph().as_default(), self.cached_session():
       init = lookup_ops.KeyValueTensorInitializer(
           ("brain", "salad", "surgery"), (0, 1, 2), dtypes.string, dtypes.int64)
+      self.assertEqual("", init._shared_name)
       table = lookup_ops.HashTable(init, default_value=-1)
       table.initializer.run()
 
@@ -583,40 +585,42 @@ class KeyValueTensorInitializerTest(test.TestCase):
         init1 = lookup_ops.KeyValueTensorInitializer(
             ("brain", "salad", "surgery"), (0, 1, 2), dtypes.string,
             dtypes.int64)
+        self.assertEqual("", init1._shared_name)
         table1 = lookup_ops.HashTable(init1, default_value=-1)
-        self.assertEquals("hash_table", table1.name)
-        self.assertEquals("table_scope/hash_table",
-                          table1.resource_handle.op.name)
+        self.assertEqual("hash_table", table1.name)
+        self.assertEqual("table_scope/hash_table",
+                         table1.resource_handle.op.name)
         init2 = lookup_ops.KeyValueTensorInitializer(
             ("brain", "salad", "surgery"), (0, 1, 2), dtypes.string,
             dtypes.int64)
+        self.assertEqual("", init2._shared_name)
         table2 = lookup_ops.HashTable(init2, default_value=-1)
-        self.assertEquals("hash_table_1", table2.name)
-        self.assertEquals("table_scope/hash_table_1",
-                          table2.resource_handle.op.name)
+        self.assertEqual("hash_table_1", table2.name)
+        self.assertEqual("table_scope/hash_table_1",
+                         table2.resource_handle.op.name)
 
   def test_int64(self):
     with ops.Graph().as_default(), self.cached_session():
       init = lookup_ops.KeyValueTensorInitializer((42, 1, -1000), (0, 1, 2),
                                                   dtypes.int64, dtypes.int64)
+      self.assertEqual("", init._shared_name)
       table = lookup_ops.HashTable(init, default_value=-1)
       table.initializer.run()
 
-  @test_util.run_deprecated_v1
   def test_int32(self):
     with ops.Graph().as_default(), self.cached_session():
       init = lookup_ops.KeyValueTensorInitializer((42, 1, -1000), (0, 1, 2),
                                                   dtypes.int32, dtypes.int64)
+      self.assertEqual("", init._shared_name)
       table = lookup_ops.HashTable(init, default_value=-1)
-      with self.assertRaisesRegexp(
-          errors_impl.OpError, "No OpKernel was registered"):
+      with self.assertRaisesRegexp(errors_impl.OpError,
+                                   "No OpKernel was registered"):
         table.initializer.run()
 
 
 class IndexTableFromTensor(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
-  @test_util.run_deprecated_v1
   def test_index_table_from_tensor_with_tensor_init(self):
     table = lookup_ops.index_table_from_tensor(
         vocabulary_list=("brain", "salad", "surgery"), num_oov_buckets=1)
@@ -633,7 +637,6 @@ class IndexTableFromTensor(test.TestCase):
     ids = table.lookup(constant_op.constant(("salad", "surgery", "tarkus")))
     self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
-  @test_util.run_deprecated_v1
   def test_int32_index_table_from_tensor_with_tensor_init(self):
     with self.cached_session():
       table = lookup_ops.index_table_from_tensor(
@@ -641,12 +644,12 @@ class IndexTableFromTensor(test.TestCase):
       ids = table.lookup(
           constant_op.constant((1, -1000, 11), dtype=dtypes.int32))
 
-      with self.assertRaises(errors_impl.FailedPreconditionError):
-        self.evaluate(ids)
-      lookup_ops.tables_initializer().run()
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.FailedPreconditionError):
+          self.evaluate(ids)
+      self.evaluate(lookup_ops.tables_initializer())
       self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
-  @test_util.run_deprecated_v1
   def test_int64_index_table_from_tensor_with_tensor_init(self):
     with self.cached_session():
       table = lookup_ops.index_table_from_tensor(
@@ -654,12 +657,12 @@ class IndexTableFromTensor(test.TestCase):
       ids = table.lookup(
           constant_op.constant((1, -1000, 11), dtype=dtypes.int64))
 
-      with self.assertRaises(errors_impl.FailedPreconditionError):
-        self.evaluate(ids)
-      lookup_ops.tables_initializer().run()
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.FailedPreconditionError):
+          self.evaluate(ids)
+      self.evaluate(lookup_ops.tables_initializer())
       self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
-  @test_util.run_deprecated_v1
   def test_index_table_from_tensor_with_default_value(self):
     default_value = -42
     with self.cached_session():
@@ -668,9 +671,10 @@ class IndexTableFromTensor(test.TestCase):
           default_value=default_value)
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
-      with self.assertRaises(errors_impl.FailedPreconditionError):
-        self.evaluate(ids)
-      lookup_ops.tables_initializer().run()
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.FailedPreconditionError):
+          self.evaluate(ids)
+      self.evaluate(lookup_ops.tables_initializer())
       self.assertAllEqual((1, 2, default_value), self.evaluate(ids))
 
   def test_index_table_from_tensor_missing_vocabulary_list(self):
@@ -680,17 +684,13 @@ class IndexTableFromTensor(test.TestCase):
         lookup_ops.index_table_from_tensor(
             vocabulary_list=None, num_oov_buckets=1)
 
-  @test_util.run_deprecated_v1
   def test_index_table_from_tensor_empty_vocabulary_list(self):
     with self.cached_session():
-      table = lookup_ops.index_table_from_tensor(
-          vocabulary_list=np.array([], dtype=np.str_), num_oov_buckets=1)
-      ids = table.lookup(constant_op.constant(["salad", "surgery", "brain"]))
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(ids)
       with self.assertRaisesRegexp(
           errors_impl.OpError, "keys and values cannot be empty"):
-        lookup_ops.tables_initializer().run()
+        _ = lookup_ops.index_table_from_tensor(
+            vocabulary_list=np.array([], dtype=np.str_), num_oov_buckets=1)
+        self.evaluate(lookup_ops.tables_initializer())
 
   def test_index_table_from_tensor_with_invalid_hashers(self):
     with self.cached_session():
@@ -717,7 +717,6 @@ class IndexToStringTableFromFileTest(test.TestCase):
       f.write("\n".join(values) + "\n")
     return vocabulary_file
 
-  @test_util.run_deprecated_v1
   def test_index_to_string_table(self):
     vocabulary_path = self._createVocabFile("i2f_vocab1.txt")
     # vocabulary_file supports string and tensor
@@ -729,13 +728,13 @@ class IndexToStringTableFromFileTest(test.TestCase):
             vocabulary_file=vocabulary_file)
         features = table.lookup(
             constant_op.constant([0, 1, 2, 3], dtypes.int64))
-        with self.assertRaises(errors_impl.OpError):
-          self.evaluate(features)
-        lookup_ops.tables_initializer().run()
+        if not context.executing_eagerly():
+          with self.assertRaises(errors_impl.OpError):
+            self.evaluate(features)
+        self.evaluate(lookup_ops.tables_initializer())
         self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
                             self.evaluate(features))
 
-  @test_util.run_deprecated_v1
   def test_index_to_string_table_from_multicolumn_file(self):
     vocabulary_file = self._createVocabFile(
         "f2i_vocab1.txt", values=("brain\t300", "salad\t20", "surgery\t1"))
@@ -745,13 +744,13 @@ class IndexToStringTableFromFileTest(test.TestCase):
           key_column_index=lookup_ops.TextFileIndex.LINE_NUMBER,
           value_column_index=0)
       features = table.lookup(constant_op.constant([0, 1, 2, 3], dtypes.int64))
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(features)
-      lookup_ops.tables_initializer().run()
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(features)
+      self.evaluate(lookup_ops.tables_initializer())
       self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
                           self.evaluate(features))
 
-  @test_util.run_deprecated_v1
   def test_index_to_string_table_from_multicolumn_file_custom_delimiter(self):
     vocabulary_file = self._createVocabFile(
         "f2i_vocab1.txt", values=("brain 300", "salad 20", "surgery 1"))
@@ -762,13 +761,13 @@ class IndexToStringTableFromFileTest(test.TestCase):
           value_column_index=0,
           delimiter=" ")
       features = table.lookup(constant_op.constant([0, 1, 2, 3], dtypes.int64))
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(features)
-      lookup_ops.tables_initializer().run()
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(features)
+      self.evaluate(lookup_ops.tables_initializer())
       self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
                           self.evaluate(features))
 
-  @test_util.run_deprecated_v1
   def test_index_to_string_table_with_default_value(self):
     default_value = b"NONE"
     vocabulary_file = self._createVocabFile("f2i_vocab2.txt")
@@ -776,13 +775,13 @@ class IndexToStringTableFromFileTest(test.TestCase):
       table = lookup_ops.index_to_string_table_from_file(
           vocabulary_file=vocabulary_file, default_value=default_value)
       features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(features)
-      lookup_ops.tables_initializer().run()
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(features)
+      self.evaluate(lookup_ops.tables_initializer())
       self.assertAllEqual((b"salad", b"surgery", default_value),
                           self.evaluate(features))
 
-  @test_util.run_deprecated_v1
   def test_index_to_string_table_with_vocab_size_too_small(self):
     default_value = b"NONE"
     vocabulary_file = self._createVocabFile("f2i_vocab2.txt")
@@ -792,27 +791,22 @@ class IndexToStringTableFromFileTest(test.TestCase):
           vocab_size=2,
           default_value=default_value)
       features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(features)
-      lookup_ops.tables_initializer().run()
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(features)
+      self.evaluate(lookup_ops.tables_initializer())
       self.assertAllEqual((b"salad", default_value, default_value),
                           self.evaluate(features))
 
-  @test_util.run_deprecated_v1
   def test_index_to_string_table_with_vocab_size_too_large(self):
     vocabulary_file = self._createVocabFile("f2i_vocab6.txt")
     with self.cached_session():
-      table = lookup_ops.index_to_string_table_from_file(
-          vocabulary_file=vocabulary_file, vocab_size=4)
-      features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
-
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(features)
-      init = lookup_ops.tables_initializer()
-      self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                              "Invalid vocab_size", init.run)
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "Invalid vocab_size"):
+        _ = lookup_ops.index_to_string_table_from_file(
+            vocabulary_file=vocabulary_file, vocab_size=4)
+        self.evaluate(lookup_ops.tables_initializer())
 
-  @test_util.run_deprecated_v1
   def test_index_to_string_table_with_vocab_size(self):
     vocabulary_file = self._createVocabFile("f2i_vocab7.txt")
     with self.cached_session():
@@ -820,16 +814,16 @@ class IndexToStringTableFromFileTest(test.TestCase):
           vocabulary_file=vocabulary_file, vocab_size=3)
       features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
 
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(features)
-      lookup_ops.tables_initializer().run()
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(features)
+      self.evaluate(lookup_ops.tables_initializer())
       self.assertAllEqual((b"salad", b"surgery", b"UNK"),
                           self.evaluate(features))
 
 
 class IndexToStringTableFromTensorTest(test.TestCase):
 
-  @test_util.run_deprecated_v1
   def test_index_to_string_table_from_tensor(self):
     with self.cached_session():
       vocabulary_list = constant_op.constant(["brain", "salad", "surgery"])
@@ -838,14 +832,14 @@ class IndexToStringTableFromTensorTest(test.TestCase):
 
       indices = constant_op.constant([0, 1, 2, 3], dtypes.int64)
       features = table.lookup(indices)
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(features)
-      lookup_ops.tables_initializer().run()
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(features)
+      self.evaluate(lookup_ops.tables_initializer())
 
       self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
                           self.evaluate(features))
 
-  @test_util.run_deprecated_v1
   def test_duplicate_entries(self):
     with self.cached_session():
       vocabulary_list = constant_op.constant(["hello", "hello"])
@@ -853,10 +847,9 @@ class IndexToStringTableFromTensorTest(test.TestCase):
           vocabulary_list=vocabulary_list)
       indices = constant_op.constant([0, 1, 4], dtypes.int64)
       features = table.lookup(indices)
-      lookup_ops.tables_initializer().run()
+      self.evaluate(lookup_ops.tables_initializer())
       self.assertAllEqual((b"hello", b"hello", b"UNK"), self.evaluate(features))
 
-  @test_util.run_deprecated_v1
   def test_index_to_string_with_default_value(self):
     default_value = b"NONE"
     with self.cached_session():
@@ -865,10 +858,10 @@ class IndexToStringTableFromTensorTest(test.TestCase):
           vocabulary_list=vocabulary_list, default_value=default_value)
       indices = constant_op.constant([1, 2, 4], dtypes.int64)
       features = table.lookup(indices)
-      with self.assertRaises(errors_impl.OpError):
-        self.evaluate(features)
-
-      lookup_ops.tables_initializer().run()
+      if not context.executing_eagerly():
+        with self.assertRaises(errors_impl.OpError):
+          self.evaluate(features)
+      self.evaluate(lookup_ops.tables_initializer())
       self.assertAllEqual((b"salad", b"surgery", default_value),
                           self.evaluate(features))
 
@@ -885,10 +878,11 @@ class InitializeTableFromFileOpTest(test.TestCase):
   def testInitializeStringTable(self):
     vocabulary_file = self._createVocabFile("one_column_1.txt")
     default_value = -1
-    table = lookup_ops.HashTable(
-        lookup_ops.TextFileInitializer(
-            vocabulary_file, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
-            dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER), default_value)
+    init = lookup_ops.TextFileInitializer(
+        vocabulary_file, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
+        dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER)
+    self.assertTrue("one_column_1.txt_-2_-1" in init._shared_name)
+    table = lookup_ops.HashTable(init, default_value)
     self.evaluate(table.initializer)
 
     output = table.lookup(constant_op.constant(["brain", "salad", "tank"]))
@@ -896,19 +890,18 @@ class InitializeTableFromFileOpTest(test.TestCase):
     result = self.evaluate(output)
     self.assertAllEqual([0, 1, -1], result)
 
-  @test_util.run_deprecated_v1
   def testInitializeInt64Table(self):
     vocabulary_file = self._createVocabFile(
         "one_column_int64.txt", values=("42", "1", "-1000"))
 
     with self.cached_session():
       default_value = -1
-      table = lookup_ops.HashTable(
-          lookup_ops.TextFileInitializer(
-              vocabulary_file, dtypes.int64,
-              lookup_ops.TextFileIndex.WHOLE_LINE, dtypes.int64,
-              lookup_ops.TextFileIndex.LINE_NUMBER), default_value)
-      table.initializer.run()
+      init = lookup_ops.TextFileInitializer(
+          vocabulary_file, dtypes.int64, lookup_ops.TextFileIndex.WHOLE_LINE,
+          dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER)
+      self.assertTrue("one_column_int64.txt_-2_-1" in init._shared_name)
+      table = lookup_ops.HashTable(init, default_value)
+      self.evaluate(table.initializer)
 
       output = table.lookup(
           constant_op.constant((42, 1, 11), dtype=dtypes.int64))
@@ -916,7 +909,6 @@ class InitializeTableFromFileOpTest(test.TestCase):
       result = self.evaluate(output)
       self.assertAllEqual([0, 1, -1], result)
 
-  @test_util.run_deprecated_v1
   def testInitializeIndexTable(self):
     vocabulary_file = self._createVocabFile("one_column_2.txt")
 
@@ -924,11 +916,11 @@ class InitializeTableFromFileOpTest(test.TestCase):
       default_value = "UNK"
       key_index = lookup_ops.TextFileIndex.LINE_NUMBER
       value_index = lookup_ops.TextFileIndex.WHOLE_LINE
-      table = lookup_ops.HashTable(
-          lookup_ops.TextFileInitializer(vocabulary_file, dtypes.int64,
-                                         key_index, dtypes.string, value_index),
-          default_value)
-      table.initializer.run()
+      init = lookup_ops.TextFileInitializer(
+          vocabulary_file, dtypes.int64, key_index, dtypes.string, value_index)
+      self.assertTrue("one_column_2.txt_-1_-2" in init._shared_name)
+      table = lookup_ops.HashTable(init, default_value)
+      self.evaluate(table.initializer)
 
       input_values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
       output = table.lookup(input_values)
@@ -936,7 +928,6 @@ class InitializeTableFromFileOpTest(test.TestCase):
       result = self.evaluate(output)
       self.assertAllEqual([b"brain", b"salad", b"surgery", b"UNK"], result)
 
-  @test_util.run_deprecated_v1
   def testMultiColumn(self):
     vocabulary_file = os.path.join(self.get_temp_dir(), "three_columns.txt")
     with open(vocabulary_file, "w") as f:
@@ -947,11 +938,11 @@ class InitializeTableFromFileOpTest(test.TestCase):
       key_index = 1
       value_index = 2
 
-      table = lookup_ops.HashTable(
-          lookup_ops.TextFileInitializer(vocabulary_file, dtypes.string,
-                                         key_index, dtypes.int64, value_index),
-          default_value)
-      table.initializer.run()
+      init = lookup_ops.TextFileInitializer(
+          vocabulary_file, dtypes.string, key_index, dtypes.int64, value_index)
+      self.assertTrue("three_columns.txt_1_2" in init._shared_name)
+      table = lookup_ops.HashTable(init, default_value)
+      self.evaluate(table.initializer)
 
       input_string = constant_op.constant(["brain", "salad", "surgery"])
       output = table.lookup(input_string)
@@ -959,7 +950,6 @@ class InitializeTableFromFileOpTest(test.TestCase):
       result = self.evaluate(output)
       self.assertAllEqual([1, 5, 6], result)
 
-  @test_util.run_deprecated_v1
   def testInvalidDataTypeInMultiColumn(self):
     vocabulary_file = os.path.join(self.get_temp_dir(), "three_columns.txt")
     with open(vocabulary_file, "w") as f:
@@ -969,12 +959,12 @@ class InitializeTableFromFileOpTest(test.TestCase):
       default_value = -1
       key_index = 2
       value_index = 1
-      table = lookup_ops.HashTable(
-          lookup_ops.TextFileInitializer(vocabulary_file, dtypes.string,
-                                         key_index, dtypes.int64, value_index),
-          default_value)
+      init = lookup_ops.TextFileInitializer(
+          vocabulary_file, dtypes.string, key_index, dtypes.int64, value_index)
+      self.assertTrue("three_columns.txt_2_1" in init._shared_name)
       with self.assertRaisesOpError("is not a valid"):
-        table.initializer.run()
+        table = lookup_ops.HashTable(init, default_value)
+        self.evaluate(table.initializer)
 
   def testInvalidDataType(self):
     vocabulary_file = self._createVocabFile("one_column_3.txt")
@@ -985,56 +975,48 @@ class InitializeTableFromFileOpTest(test.TestCase):
       value_index = lookup_ops.TextFileIndex.LINE_NUMBER
 
       with self.assertRaises(ValueError):
-        lookup_ops.HashTable(
-            lookup_ops.TextFileInitializer(vocabulary_file, dtypes.int64,
-                                           key_index, dtypes.string,
-                                           value_index), default_value)
+        init = lookup_ops.TextFileInitializer(vocabulary_file, dtypes.int64,
+                                              key_index, dtypes.string,
+                                              value_index)
+        self.assertTrue("one_column_3.txt_-2_-1" in init._shared_name)
+        lookup_ops.HashTable(init, default_value)
 
-  @test_util.run_deprecated_v1
   def testInvalidIndex(self):
     vocabulary_file = self._createVocabFile("one_column_4.txt")
     with self.cached_session():
       default_value = -1
       key_index = 1  # second column of the line
       value_index = lookup_ops.TextFileIndex.LINE_NUMBER
-      table = lookup_ops.HashTable(
-          lookup_ops.TextFileInitializer(vocabulary_file, dtypes.string,
-                                         key_index, dtypes.int64, value_index),
-          default_value)
+      init = lookup_ops.TextFileInitializer(
+          vocabulary_file, dtypes.string, key_index, dtypes.int64, value_index)
+      self.assertTrue("one_column_4.txt_1_-1" in init._shared_name)
 
       with self.assertRaisesOpError("Invalid number of columns"):
-        table.initializer.run()
+        table = lookup_ops.HashTable(init, default_value)
+        self.evaluate(table.initializer)
 
-  @test_util.run_deprecated_v1
   def testInitializeSameTableWithMultipleNodes(self):
     vocabulary_file = self._createVocabFile("one_column_5.txt")
 
-    with self.cached_session() as sess:
-      shared_name = "shared-one-columm"
+    with self.cached_session():
       default_value = -1
-      table1 = lookup_ops.HashTable(
-          lookup_ops.TextFileInitializer(vocabulary_file, dtypes.string,
-                                         lookup_ops.TextFileIndex.WHOLE_LINE,
-                                         dtypes.int64,
-                                         lookup_ops.TextFileIndex.LINE_NUMBER),
-          default_value,
-          shared_name=shared_name)
-      table2 = lookup_ops.HashTable(
-          lookup_ops.TextFileInitializer(vocabulary_file, dtypes.string,
-                                         lookup_ops.TextFileIndex.WHOLE_LINE,
-                                         dtypes.int64,
-                                         lookup_ops.TextFileIndex.LINE_NUMBER),
-          default_value,
-          shared_name=shared_name)
-      table3 = lookup_ops.HashTable(
-          lookup_ops.TextFileInitializer(vocabulary_file, dtypes.string,
-                                         lookup_ops.TextFileIndex.WHOLE_LINE,
-                                         dtypes.int64,
-                                         lookup_ops.TextFileIndex.LINE_NUMBER),
-          default_value,
-          shared_name=shared_name)
-
-      lookup_ops.tables_initializer().run()
+      init1 = lookup_ops.TextFileInitializer(
+          vocabulary_file, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
+          dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER)
+      self.assertTrue("one_column_5.txt_-2_-1" in init1._shared_name)
+      table1 = lookup_ops.HashTable(init1, default_value)
+      init2 = lookup_ops.TextFileInitializer(
+          vocabulary_file, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
+          dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER)
+      self.assertTrue("one_column_5.txt_-2_-1" in init2._shared_name)
+      table2 = lookup_ops.HashTable(init2, default_value)
+      init3 = lookup_ops.TextFileInitializer(
+          vocabulary_file, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
+          dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER)
+      self.assertTrue("one_column_5.txt_-2_-1" in init3._shared_name)
+      table3 = lookup_ops.HashTable(init3, default_value)
+
+      self.evaluate(lookup_ops.tables_initializer())
 
       input_string = constant_op.constant(["brain", "salad", "tank"])
 
@@ -1057,64 +1039,66 @@ class InitializeTableFromFileOpTest(test.TestCase):
                 dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER),
             default_value)
 
-  @test_util.run_deprecated_v1
   def testInitializeWithVocabSize(self):
     with self.cached_session():
       default_value = -1
       vocab_size = 3
       vocabulary_file1 = self._createVocabFile("one_column6.txt")
-      table1 = lookup_ops.HashTable(
-          lookup_ops.TextFileInitializer(
-              vocabulary_file1,
-              dtypes.string,
-              lookup_ops.TextFileIndex.WHOLE_LINE,
-              dtypes.int64,
-              lookup_ops.TextFileIndex.LINE_NUMBER,
-              vocab_size=vocab_size), default_value)
+      init1 = lookup_ops.TextFileInitializer(
+          vocabulary_file1,
+          dtypes.string,
+          lookup_ops.TextFileIndex.WHOLE_LINE,
+          dtypes.int64,
+          lookup_ops.TextFileIndex.LINE_NUMBER,
+          vocab_size=vocab_size)
+      self.assertTrue("one_column6.txt_3_-2_-1" in init1._shared_name)
+      table1 = lookup_ops.HashTable(init1, default_value)
 
       # Initialize from file.
-      table1.initializer.run()
-      self.assertEquals(vocab_size, table1.size().eval())
+      self.evaluate(table1.initializer)
+      self.assertEqual(vocab_size, self.evaluate(table1.size()))
 
       vocabulary_file2 = self._createVocabFile("one_column7.txt")
       vocab_size = 5
-      table2 = lookup_ops.HashTable(
-          lookup_ops.TextFileInitializer(
-              vocabulary_file2,
-              dtypes.string,
-              lookup_ops.TextFileIndex.WHOLE_LINE,
-              dtypes.int64,
-              lookup_ops.TextFileIndex.LINE_NUMBER,
-              vocab_size=vocab_size), default_value)
+      init2 = lookup_ops.TextFileInitializer(
+          vocabulary_file2,
+          dtypes.string,
+          lookup_ops.TextFileIndex.WHOLE_LINE,
+          dtypes.int64,
+          lookup_ops.TextFileIndex.LINE_NUMBER,
+          vocab_size=vocab_size)
+      self.assertTrue("one_column7.txt_5_-2_-1" in init2._shared_name)
       with self.assertRaisesOpError("Invalid vocab_size"):
-        table2.initializer.run()
+        table2 = lookup_ops.HashTable(init2, default_value)
+        self.evaluate(table2.initializer)
 
       vocab_size = 1
       vocabulary_file3 = self._createVocabFile("one_column3.txt")
-      table3 = lookup_ops.HashTable(
-          lookup_ops.TextFileInitializer(
-              vocabulary_file3,
-              dtypes.string,
-              lookup_ops.TextFileIndex.WHOLE_LINE,
-              dtypes.int64,
-              lookup_ops.TextFileIndex.LINE_NUMBER,
-              vocab_size=vocab_size), default_value)
+      init3 = lookup_ops.TextFileInitializer(
+          vocabulary_file3,
+          dtypes.string,
+          lookup_ops.TextFileIndex.WHOLE_LINE,
+          dtypes.int64,
+          lookup_ops.TextFileIndex.LINE_NUMBER,
+          vocab_size=vocab_size)
+      self.assertTrue("one_column3.txt_1_-2_-1" in init3._shared_name)
+      table3 = lookup_ops.HashTable(init3, default_value)
 
       # Smaller vocab size reads only vocab_size records.
-      table3.initializer.run()
-      self.assertEquals(vocab_size, table3.size().eval())
+      self.evaluate(table3.initializer)
+      self.assertEqual(vocab_size, self.evaluate(table3.size()))
 
-  @test_util.run_deprecated_v1
+  @test_util.run_v1_only("placeholder usage")
   def testFeedVocabularyName(self):
     vocabulary_file = self._createVocabFile("feed_vocabulary.txt")
 
     with self.cached_session():
       default_value = -1
-      table = lookup_ops.HashTable(
-          lookup_ops.TextFileInitializer(
-              "old_file.txt", dtypes.string,
-              lookup_ops.TextFileIndex.WHOLE_LINE, dtypes.int64,
-              lookup_ops.TextFileIndex.LINE_NUMBER), default_value)
+      init = lookup_ops.TextFileInitializer(
+          "old_file.txt", dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
+          dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER)
+      self.assertTrue("old_file.txt_-2_-1" in init._shared_name)
+      table = lookup_ops.HashTable(init, default_value)
 
       # Initialize with non existing file (old_file.txt) should fail.
       # TODO(yleon): Update message, which might change per FileSystem.
@@ -1131,7 +1115,6 @@ class InitializeTableFromFileOpTest(test.TestCase):
       result = self.evaluate(output)
       self.assertAllEqual([0, 1, -1], result)
 
-  @test_util.run_deprecated_v1
   def testInvalidFilenames(self):
     vocabulary_file = self._createVocabFile("filename_shape.txt")
 
@@ -1140,75 +1123,84 @@ class InitializeTableFromFileOpTest(test.TestCase):
 
       # Invalid data type
       other_type = constant_op.constant(1)
-      with self.assertRaises(ValueError):
+      with self.assertRaises(Exception) as cm:
         lookup_ops.HashTable(
             lookup_ops.TextFileInitializer(
                 other_type, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
                 dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER),
             default_value)
+      self.assertTrue(isinstance(cm.exception, (ValueError, TypeError)))
 
       # Non-scalar filename
       filenames = constant_op.constant([vocabulary_file, vocabulary_file])
-      with self.assertRaises(ValueError):
-        lookup_ops.HashTable(
-            lookup_ops.TextFileInitializer(
-                filenames, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
-                dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER),
-            default_value)
+      if not context.executing_eagerly():
+        with self.assertRaises(Exception) as cm:
+          lookup_ops.HashTable(
+              lookup_ops.TextFileInitializer(
+                  filenames, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
+                  dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER),
+              default_value)
+        self.assertTrue(isinstance(cm.exception, (ValueError, TypeError)))
+      else:
+        with self.assertRaises(errors_impl.InvalidArgumentError):
+          lookup_ops.HashTable(
+              lookup_ops.TextFileInitializer(
+                  filenames, dtypes.string, lookup_ops.TextFileIndex.WHOLE_LINE,
+                  dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER),
+              default_value)
 
-  @test_util.run_deprecated_v1
   def testIdToStringTable(self):
     vocab_file = self._createVocabFile("feat_to_id_1.txt")
     with self.cached_session():
       default_value = "UNK"
       vocab_size = 3
-      table = lookup_ops.HashTable(
-          lookup_ops.TextFileStringTableInitializer(
-              vocab_file, vocab_size=vocab_size), default_value)
+      init = lookup_ops.TextFileStringTableInitializer(
+          vocab_file, vocab_size=vocab_size)
+      self.assertTrue("feat_to_id_1.txt_3_-1_-2", init._shared_name)
+      table = lookup_ops.HashTable(init, default_value)
 
-      table.initializer.run()
+      self.evaluate(table.initializer)
 
       input_values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
 
       out = table.lookup(input_values)
       self.assertAllEqual([b"brain", b"salad", b"surgery", b"UNK"],
                           self.evaluate(out))
-      self.assertEquals(vocab_size, table.size().eval())
+      self.assertEqual(vocab_size, self.evaluate(table.size()))
 
-  @test_util.run_deprecated_v1
   def testStringToIdTable(self):
     vocab_file = self._createVocabFile("feat_to_id_2.txt")
     with self.cached_session():
       default_value = -1
       vocab_size = 3
-      table = lookup_ops.HashTable(
-          lookup_ops.TextFileIdTableInitializer(
-              vocab_file, vocab_size=vocab_size), default_value)
-      table.initializer.run()
+      init = lookup_ops.TextFileIdTableInitializer(
+          vocab_file, vocab_size=vocab_size)
+      self.assertTrue("feat_to_id_2.txt_3_-1_-2", init._shared_name)
+      table = lookup_ops.HashTable(init, default_value)
+      self.evaluate(table.initializer)
 
       input_string = constant_op.constant(["brain", "salad", "surgery", "UNK"])
 
       out = table.lookup(input_string)
       self.assertAllEqual([0, 1, 2, -1], self.evaluate(out))
-      self.assertEquals(vocab_size, table.size().eval())
+      self.assertEqual(vocab_size, self.evaluate(table.size()))
 
-  @test_util.run_deprecated_v1
   def testInt64ToIdTable(self):
     vocab_file = self._createVocabFile(
         "feat_to_id_3.txt", values=("42", "1", "-1000"))
     with self.cached_session():
       default_value = -1
       vocab_size = 3
-      table = lookup_ops.HashTable(
-          lookup_ops.TextFileIdTableInitializer(
-              vocab_file, vocab_size=vocab_size, key_dtype=dtypes.int64),
-          default_value)
-      table.initializer.run()
+      init = lookup_ops.TextFileIdTableInitializer(
+          vocab_file, vocab_size=vocab_size, key_dtype=dtypes.int64)
+      self.assertTrue("feat_to_id_3.txt_3_-1_-2", init._shared_name)
+      table = lookup_ops.HashTable(init, default_value)
+      self.evaluate(table.initializer)
 
       out = table.lookup(
           constant_op.constant((42, 1, -1000, 11), dtype=dtypes.int64))
       self.assertAllEqual((0, 1, 2, -1), self.evaluate(out))
-      self.assertEquals(vocab_size, table.size().eval())
+      self.assertEqual(vocab_size, self.evaluate(table.size()))
 
 
 class IdTableWithHashBucketsTest(test.TestCase):
@@ -1238,7 +1230,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
 
       out = table.lookup(input_string)
       self.assertAllEqual([0, 1, 2, 3], self.evaluate(out))
-      self.assertEquals(vocab_size + oov_buckets, table.size().eval())
+      self.assertEqual(vocab_size + oov_buckets, table.size().eval())
 
   @test_util.run_deprecated_v1
   def testInt32IdTableWithHashBuckets(self):
@@ -1261,7 +1253,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
 
       out = table.lookup(values)
       self.assertAllEqual([0, 1, 2, 3], self.evaluate(out))
-      self.assertEquals(vocab_size + oov_buckets, table.size().eval())
+      self.assertEqual(vocab_size + oov_buckets, table.size().eval())
 
   @test_util.run_deprecated_v1
   def testInt64IdTableWithHashBuckets(self):
@@ -1282,7 +1274,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
 
       out = table.lookup(values)
       self.assertAllEqual([0, 1, 2, 3], self.evaluate(out))
-      self.assertEquals(vocab_size + oov_buckets, table.size().eval())
+      self.assertEqual(vocab_size + oov_buckets, table.size().eval())
 
   @test_util.run_deprecated_v1
   def testStringIdTableWithOnlyHashBucket(self):
@@ -1304,7 +1296,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
               4  # fingerprint("surgery") mod 5
           ],
           self.evaluate(out))
-      self.assertEquals(oov_buckets, table.size().eval())
+      self.assertEqual(oov_buckets, table.size().eval())
 
   @test_util.run_deprecated_v1
   def testInt32IdTableWithOnlyHashBucket(self):
@@ -1327,7 +1319,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
               2  # fingerprint("-1000") mod 5
           ],
           self.evaluate(out))
-      self.assertEquals(oov_buckets, table.size().eval())
+      self.assertEqual(oov_buckets, table.size().eval())
 
   def testFloat64IdTableWithOnlyHashBucket(self):
     with self.cached_session():
@@ -1375,8 +1367,8 @@ class IdTableWithHashBucketsTest(test.TestCase):
       out1, out2 = self.evaluate([out1, out2])
       self.assertAllEqual([5, 0, 1, 2, 5], out1)
       self.assertAllEqual([5, 0, 1, 2, 3], out2)
-      self.assertEquals(vocab_size + oov_buckets, table1.size().eval())
-      self.assertEquals(vocab_size + oov_buckets, table2.size().eval())
+      self.assertEqual(vocab_size + oov_buckets, table1.size().eval())
+      self.assertEqual(vocab_size + oov_buckets, table2.size().eval())
       test_util.assert_ops_in_graph({
           "table1_Lookup/hash_bucket": "StringToHashBucketFast",
           "table2_Lookup/hash_bucket": "StringToHashBucketStrong",
@@ -1385,7 +1377,6 @@ class IdTableWithHashBucketsTest(test.TestCase):
   @test_util.run_deprecated_v1
   def testIdTableWithHashBucketsInitializationAcrossSessions(self):
     vocab_file = self._createVocabFile("feat_to_id_5.txt")
-    shared_name = "across-sessions"
     with self.cached_session():
       default_value = -1
       vocab_size = 3
@@ -1393,9 +1384,8 @@ class IdTableWithHashBucketsTest(test.TestCase):
       table1 = lookup_ops.IdTableWithHashBuckets(
           lookup_ops.HashTable(
               lookup_ops.TextFileIdTableInitializer(
-                  vocab_file, vocab_size=vocab_size),
-              default_value,
-              shared_name=shared_name), oov_buckets)
+                  vocab_file, vocab_size=vocab_size), default_value),
+          oov_buckets)
 
       table1.initializer.run()
 
@@ -1405,7 +1395,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
       out1 = table1.lookup(input_string_1)
 
       self.assertAllEqual([0, 1, 2, 3], self.evaluate(out1))
-      self.assertEquals(vocab_size + oov_buckets, table1.size().eval())
+      self.assertEqual(vocab_size + oov_buckets, table1.size().eval())
 
     with self.cached_session():
       default_value = -1
@@ -1417,16 +1407,15 @@ class IdTableWithHashBucketsTest(test.TestCase):
       table2 = lookup_ops.IdTableWithHashBuckets(
           lookup_ops.HashTable(
               lookup_ops.TextFileIdTableInitializer(
-                  vocab_file, vocab_size=vocab_size),
-              default_value,
-              shared_name=shared_name), oov_buckets)
+                  vocab_file, vocab_size=vocab_size), default_value),
+          oov_buckets)
 
       input_string_2 = constant_op.constant(["fruit", "salad", "UNK"])
 
       out2 = table2.lookup(input_string_2)
 
       self.assertAllEqual([3, 1, 3], self.evaluate(out2))
-      self.assertEquals(vocab_size + oov_buckets, table2.size().eval())
+      self.assertEqual(vocab_size + oov_buckets, table2.size().eval())
 
   @test_util.run_deprecated_v1
   def testIdTableWithHashBucketsWithMultipleInitializersDifferentDefault(self):
@@ -1460,8 +1449,8 @@ class IdTableWithHashBucketsTest(test.TestCase):
       out1, out2 = self.evaluate([out1, out2])
       self.assertAllEqual([0, 1, 2, -1], out1)
       self.assertAllEqual([-2, 1, -2], out2)
-      self.assertEquals(vocab_size + oov_buckets, table1.size().eval())
-      self.assertEquals(vocab_size + oov_buckets, table2.size().eval())
+      self.assertEqual(vocab_size + oov_buckets, table1.size().eval())
+      self.assertEqual(vocab_size + oov_buckets, table2.size().eval())
 
   @test_util.run_deprecated_v1
   def testSparseTensor(self):
@@ -1598,5 +1587,1535 @@ class IdTableWithHashBucketsTest(test.TestCase):
       self.assertIsNone(table.resource_handle)
 
 
+class MutableHashTableOpTest(test.TestCase):
+
+  def testMutableHashTable(self):
+    with self.cached_session():
+      default_val = -1
+      keys = constant_op.constant(["brain", "salad", "surgery", "tarkus"])
+      values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
+      table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64,
+                                          default_val)
+      self.assertAllEqual(0, self.evaluate(table.size()))
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(4, self.evaluate(table.size()))
+
+      remove_string = constant_op.constant(["tarkus", "tank"])
+      self.evaluate(table.remove(remove_string))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      input_string = constant_op.constant(["brain", "salad", "tank"])
+      output = table.lookup(input_string)
+      self.assertAllEqual([3], output.get_shape())
+
+      result = self.evaluate(output)
+      self.assertAllEqual([0, 1, -1], result)
+
+      exported_keys, exported_values = table.export()
+
+      # exported data is in the order of the internal map, i.e. undefined
+      sorted_keys = np.sort(self.evaluate(exported_keys))
+      sorted_values = np.sort(self.evaluate(exported_values))
+      self.assertAllEqual([b"brain", b"salad", b"surgery"], sorted_keys)
+      self.assertAllEqual([0, 1, 2], sorted_values)
+
+  @test_util.run_v1_only("SaverV1")
+  def testSaveRestore(self):
+    save_dir = os.path.join(self.get_temp_dir(), "save_restore")
+    save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
+
+    with self.session(graph=ops.Graph()) as sess:
+      v0 = variables.Variable(10.0, name="v0")
+      v1 = variables.Variable(20.0, name="v1")
+
+      default_val = -1
+      keys = constant_op.constant(["b", "c", "d"], dtypes.string)
+      values = constant_op.constant([0, 1, 2], dtypes.int64)
+      table = lookup_ops.MutableHashTable(
+          dtypes.string, dtypes.int64, default_val, name="t1", checkpoint=True)
+
+      save = saver.Saver()
+      self.evaluate(variables.global_variables_initializer())
+
+      # Check that the parameter nodes have been initialized.
+      self.assertEqual(10.0, self.evaluate(v0))
+      self.assertEqual(20.0, self.evaluate(v1))
+
+      self.assertAllEqual(0, self.evaluate(table.size()))
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      val = save.save(sess, save_path)
+      self.assertTrue(isinstance(val, six.string_types))
+      self.assertEqual(save_path, val)
+
+    with self.session(graph=ops.Graph()) as sess:
+      v0 = variables.Variable(-1.0, name="v0")
+      v1 = variables.Variable(-1.0, name="v1")
+      default_val = -1
+      table = lookup_ops.MutableHashTable(
+          dtypes.string, dtypes.int64, default_val, name="t1", checkpoint=True)
+      self.evaluate(
+          table.insert(
+              constant_op.constant(["a", "c"], dtypes.string),
+              constant_op.constant([12, 24], dtypes.int64)))
+      self.assertAllEqual(2, self.evaluate(table.size()))
+
+      save = saver.Saver()
+
+      # Restore the saved values in the parameter nodes.
+      save.restore(sess, save_path)
+      # Check that the parameter nodes have been restored.
+      self.assertEqual(10.0, self.evaluate(v0))
+      self.assertEqual(20.0, self.evaluate(v1))
+
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      input_string = constant_op.constant(["a", "b", "c", "d", "e"],
+                                          dtypes.string)
+      output = table.lookup(input_string)
+      self.assertAllEqual([-1, 0, 1, 2, -1], self.evaluate(output))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testObjectSaveRestore(self):
+    save_dir = os.path.join(self.get_temp_dir(), "save_restore")
+    save_prefix = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
+
+    v0 = variables.Variable(10.0, name="v0")
+    v1 = variables.Variable(20.0, name="v1")
+
+    default_val = -1
+    keys = constant_op.constant(["b", "c", "d"], dtypes.string)
+    values = constant_op.constant([0, 1, 2], dtypes.int64)
+    table = lookup_ops.MutableHashTable(
+        dtypes.string, dtypes.int64, default_val, name="t1", checkpoint=True)
+
+    checkpoint = trackable.Checkpoint(table=table, v0=v0, v1=v1)
+    self.evaluate([v0.initializer, v1.initializer])
+
+    # Check that the parameter nodes have been initialized.
+    self.assertEqual(10.0, self.evaluate(v0))
+    self.assertEqual(20.0, self.evaluate(v1))
+
+    self.assertAllEqual(0, self.evaluate(table.size()))
+    self.evaluate(table.insert(keys, values))
+    self.assertAllEqual(3, self.evaluate(table.size()))
+
+    save_path = checkpoint.save(save_prefix)
+    del table, checkpoint, v0, v1
+
+    v0 = variables.Variable(-1.0, name="v0")
+    v1 = variables.Variable(-1.0, name="v1")
+    default_val = -1
+    table = lookup_ops.MutableHashTable(
+        dtypes.string, dtypes.int64, default_val, name="t1", checkpoint=True)
+    self.evaluate(
+        table.insert(
+            constant_op.constant(["a", "c"], dtypes.string),
+            constant_op.constant([12, 24], dtypes.int64)))
+    self.assertAllEqual(2, self.evaluate(table.size()))
+
+    checkpoint = trackable.Checkpoint(table=table, v0=v0, v1=v1)
+
+    # Restore the saved values in the parameter nodes.
+    checkpoint.restore(save_path).run_restore_ops()
+    # Check that the parameter nodes have been restored.
+    self.assertEqual(10.0, self.evaluate(v0))
+    self.assertEqual(20.0, self.evaluate(v1))
+
+    self.assertAllEqual(3, self.evaluate(table.size()))
+
+    input_string = constant_op.constant(["a", "b", "c", "d", "e"],
+                                        dtypes.string)
+    output = table.lookup(input_string)
+    self.assertAllEqual([-1, 0, 1, 2, -1], self.evaluate(output))
+
+  @test_util.run_v1_only("Multiple sessions")
+  def testSharing(self):
+    # Start a server to store the table state
+    server = server_lib.Server({"local0": ["localhost:0"]},
+                               protocol="grpc",
+                               start=True)
+    # Create two sessions sharing the same state
+    session1 = session.Session(server.target)
+    session2 = session.Session(server.target)
+
+    table = lookup_ops.MutableHashTable(
+        dtypes.int64, dtypes.string, "-", name="t1")
+
+    # Populate the table in the first session
+    with session1:
+      self.assertAllEqual(0, table.size().eval())
+
+      keys = constant_op.constant([11, 12], dtypes.int64)
+      values = constant_op.constant(["a", "b"])
+      table.insert(keys, values).run()
+      self.assertAllEqual(2, table.size().eval())
+
+      output = table.lookup(constant_op.constant([11, 12, 13], dtypes.int64))
+      self.assertAllEqual([b"a", b"b", b"-"], output.eval())
+
+    # Verify that we can access the shared data from the second session
+    with session2:
+      self.assertAllEqual(2, table.size().eval())
+
+      output = table.lookup(constant_op.constant([10, 11, 12], dtypes.int64))
+      self.assertAllEqual([b"-", b"a", b"b"], output.eval())
+
+  def testMutableHashTableOfTensors(self):
+    with self.cached_session():
+      default_val = constant_op.constant([-1, -1], dtypes.int64)
+      keys = constant_op.constant(["brain", "salad", "surgery", "tarkus"])
+      values = constant_op.constant([[0, 1], [2, 3], [4, 5], [6, 7]],
+                                    dtypes.int64)
+      table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64,
+                                          default_val)
+      self.assertAllEqual(0, self.evaluate(table.size()))
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(4, self.evaluate(table.size()))
+
+      remove_string = constant_op.constant(["tarkus", "tank"])
+      self.evaluate(table.remove(remove_string))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      input_string = constant_op.constant(["brain", "salad", "tank"])
+      output = table.lookup(input_string)
+      self.assertAllEqual([3, 2], output.get_shape())
+
+      result = self.evaluate(output)
+      self.assertAllEqual([[0, 1], [2, 3], [-1, -1]], result)
+
+      exported_keys, exported_values = table.export()
+      # exported data is in the order of the internal map, i.e. undefined
+      sorted_keys = np.sort(self.evaluate(exported_keys))
+      sorted_values = np.sort(self.evaluate(exported_values), axis=0)
+      self.assertAllEqual([b"brain", b"salad", b"surgery"], sorted_keys)
+      sorted_expected_values = np.sort([[4, 5], [2, 3], [0, 1]], axis=0)
+      self.assertAllEqual(sorted_expected_values, sorted_values)
+
+  def testMutableHashTableExportInsert(self):
+    with self.cached_session():
+      default_val = constant_op.constant([-1, -1], dtypes.int64)
+      keys = constant_op.constant(["brain", "salad", "surgery"])
+      values = constant_op.constant([[0, 1], [2, 3], [4, 5]], dtypes.int64)
+      table1 = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64,
+                                           default_val)
+      self.assertAllEqual(0, self.evaluate(table1.size()))
+      self.evaluate(table1.insert(keys, values))
+      self.assertAllEqual(3, self.evaluate(table1.size()))
+
+      input_string = constant_op.constant(["brain", "salad", "tank"])
+      expected_output = [[0, 1], [2, 3], [-1, -1]]
+      output1 = table1.lookup(input_string)
+      self.assertAllEqual(expected_output, self.evaluate(output1))
+
+      exported_keys, exported_values = table1.export()
+      self.assertAllEqual(3, self.evaluate(exported_keys).size)
+      self.assertAllEqual(6, self.evaluate(exported_values).size)
+
+      # Populate a second table from the exported data
+      table2 = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64,
+                                           default_val)
+      self.assertAllEqual(0, self.evaluate(table2.size()))
+      self.evaluate(table2.insert(exported_keys, exported_values))
+      self.assertAllEqual(3, self.evaluate(table2.size()))
+
+      # Verify lookup result is still the same
+      output2 = table2.lookup(input_string)
+      self.assertAllEqual(expected_output, self.evaluate(output2))
+
+  def testMutableHashTableOfTensorsInvalidShape(self):
+    with self.cached_session():
+      default_val = constant_op.constant([-1, -1], dtypes.int64)
+      keys = constant_op.constant(["brain", "salad", "surgery"])
+      table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64,
+                                          default_val)
+
+      # Shape [6] instead of [3, 2]
+      values = constant_op.constant([0, 1, 2, 3, 4, 5], dtypes.int64)
+      with self.assertRaisesOpError("Expected shape"):
+        self.evaluate(table.insert(keys, values))
+
+      # Shape [2,3] instead of [3, 2]
+      values = constant_op.constant([[0, 1, 2], [3, 4, 5]], dtypes.int64)
+      with self.assertRaisesOpError("Expected shape"):
+        self.evaluate(table.insert(keys, values))
+
+      # Shape [2, 2] instead of [3, 2]
+      values = constant_op.constant([[0, 1], [2, 3]], dtypes.int64)
+      with self.assertRaisesOpError("Expected shape"):
+        self.evaluate(table.insert(keys, values))
+
+      # Shape [3, 1] instead of [3, 2]
+      values = constant_op.constant([[0], [2], [4]], dtypes.int64)
+      with self.assertRaisesOpError("Expected shape"):
+        self.evaluate(table.insert(keys, values))
+
+      # Valid Insert
+      values = constant_op.constant([[0, 1], [2, 3], [4, 5]], dtypes.int64)
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+  def testMutableHashTableInvalidDefaultValue(self):
+    with self.cached_session():
+      default_val = constant_op.constant([[-1, -1]], dtypes.int64)
+      with self.assertRaisesOpError("Default value must be a vector"):
+        table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64,
+                                            default_val)
+        self.assertAllEqual(0, self.evaluate(table.size()))
+
+  def testMutableHashTableDuplicateInsert(self):
+    with self.cached_session():
+      default_val = -1
+      keys = constant_op.constant(["brain", "salad", "surgery", "brain"])
+      values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
+      table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64,
+                                          default_val)
+      self.assertAllEqual(0, self.evaluate(table.size()))
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      input_string = constant_op.constant(["brain", "salad", "tank"])
+      output = table.lookup(input_string)
+
+      result = self.evaluate(output)
+      self.assertAllEqual([3, 1, -1], result)
+
+  def testMutableHashTableFindHighRank(self):
+    with self.cached_session():
+      default_val = -1
+      keys = constant_op.constant(["brain", "salad", "surgery"])
+      values = constant_op.constant([0, 1, 2], dtypes.int64)
+      table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64,
+                                          default_val)
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      input_string = constant_op.constant([["brain", "salad"],
+                                           ["tank", "tarkus"]])
+      output = table.lookup(input_string)
+      self.assertAllEqual([2, 2], output.get_shape())
+
+      result = self.evaluate(output)
+      self.assertAllEqual([[0, 1], [-1, -1]], result)
+
+  def testMutableHashTableInsertHighRank(self):
+    with self.cached_session():
+      default_val = -1
+      keys = constant_op.constant([["brain", "salad"], ["surgery", "tank"]])
+      values = constant_op.constant([[0, 1], [2, 3]], dtypes.int64)
+      table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64,
+                                          default_val)
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(4, self.evaluate(table.size()))
+
+      input_string = constant_op.constant(["brain", "salad", "tank", "tarkus"])
+      output = table.lookup(input_string)
+
+      result = self.evaluate(output)
+      self.assertAllEqual([0, 1, 3, -1], result)
+
+  def testMutableHashTableRemoveHighRank(self):
+    with self.test_session():
+      default_val = -1
+      keys = constant_op.constant([["brain", "salad"], ["surgery", "tank"]])
+      values = constant_op.constant([[0, 1], [2, 3]], dtypes.int64)
+      table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64,
+                                          default_val)
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(4, self.evaluate(table.size()))
+
+      remove_string = constant_op.constant(["salad", "tarkus"])
+      self.evaluate(table.remove(remove_string))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      input_string = constant_op.constant(["brain", "salad", "tank", "tarkus"])
+      output = table.lookup(input_string)
+
+      result = self.evaluate(output)
+      self.assertAllEqual([0, -1, 3, -1], result)
+
+  def testMutableHashTableOfTensorsFindHighRank(self):
+    with self.cached_session():
+      default_val = constant_op.constant([-1, -1, -1], dtypes.int64)
+      keys = constant_op.constant(["brain", "salad", "surgery"])
+      values = constant_op.constant([[0, 1, 2], [2, 3, 4], [4, 5, 6]],
+                                    dtypes.int64)
+      table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64,
+                                          default_val)
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      input_string = constant_op.constant([["brain", "salad"],
+                                           ["tank", "tarkus"]])
+      output = table.lookup(input_string)
+      self.assertAllEqual([2, 2, 3], output.get_shape())
+
+      result = self.evaluate(output)
+      self.assertAllEqual(
+          [[[0, 1, 2], [2, 3, 4]], [[-1, -1, -1], [-1, -1, -1]]], result)
+
+  def testMutableHashTableOfTensorsRemoveHighRank(self):
+    with self.test_session():
+      default_val = constant_op.constant([-1, -1, -1], dtypes.int64)
+      keys = constant_op.constant(["brain", "salad", "surgery"])
+      values = constant_op.constant([[0, 1, 2], [2, 3, 4], [4, 5, 6]],
+                                    dtypes.int64)
+      table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64,
+                                          default_val)
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      remove_string = constant_op.constant([["brain", "tank"]])
+      self.evaluate(table.remove(remove_string))
+      self.assertAllEqual(2, self.evaluate(table.size()))
+
+      input_string = constant_op.constant([["brain", "salad"],
+                                           ["surgery", "tank"]])
+      output = table.lookup(input_string)
+      self.assertAllEqual([2, 2, 3], output.get_shape())
+
+      result = self.evaluate(output)
+      self.assertAllEqual(
+          [[[-1, -1, -1], [2, 3, 4]], [[4, 5, 6], [-1, -1, -1]]], result)
+
+  def testMultipleMutableHashTables(self):
+    with self.cached_session():
+      default_val = -1
+      keys = constant_op.constant(["brain", "salad", "surgery"])
+      values = constant_op.constant([0, 1, 2], dtypes.int64)
+
+      table1 = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64,
+                                           default_val)
+      table2 = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64,
+                                           default_val)
+      table3 = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64,
+                                           default_val)
+      self.evaluate(table1.insert(keys, values))
+      self.evaluate(table2.insert(keys, values))
+      self.evaluate(table3.insert(keys, values))
+
+      self.assertAllEqual(3, self.evaluate(table1.size()))
+      self.assertAllEqual(3, self.evaluate(table2.size()))
+      self.assertAllEqual(3, self.evaluate(table3.size()))
+
+      input_string = constant_op.constant(["brain", "salad", "tank"])
+      output1 = table1.lookup(input_string)
+      output2 = table2.lookup(input_string)
+      output3 = table3.lookup(input_string)
+
+      out1, out2, out3 = self.evaluate([output1, output2, output3])
+      self.assertAllEqual([0, 1, -1], out1)
+      self.assertAllEqual([0, 1, -1], out2)
+      self.assertAllEqual([0, 1, -1], out3)
+
+  def testMutableHashTableWithTensorDefault(self):
+    with self.cached_session():
+      default_val = constant_op.constant(-1, dtypes.int64)
+      keys = constant_op.constant(["brain", "salad", "surgery"])
+      values = constant_op.constant([0, 1, 2], dtypes.int64)
+      table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64,
+                                          default_val)
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      input_string = constant_op.constant(["brain", "salad", "tank"])
+      output = table.lookup(input_string)
+
+      result = self.evaluate(output)
+      self.assertAllEqual([0, 1, -1], result)
+
+  def testSignatureMismatch(self):
+    with self.cached_session():
+      default_val = -1
+      keys = constant_op.constant(["brain", "salad", "surgery"])
+      values = constant_op.constant([0, 1, 2], dtypes.int64)
+      table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64,
+                                          default_val)
+
+      # insert with keys of the wrong type
+      with self.assertRaises(ValueError):
+        self.evaluate(table.insert(constant_op.constant([4, 5, 6]), values))
+
+      # insert with values of the wrong type
+      with self.assertRaises(ValueError):
+        self.evaluate(table.insert(keys, constant_op.constant(["a", "b", "c"])))
+
+      self.assertAllEqual(0, self.evaluate(table.size()))
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      input_string_ref = variables.Variable("brain")
+      input_int64_ref = variables.Variable(-1, dtype=dtypes.int64)
+      self.evaluate(variables.global_variables_initializer())
+
+      # Ref types do not produce an insert signature mismatch.
+      self.evaluate(table.insert(input_string_ref, input_int64_ref))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      # Ref types do not produce a lookup signature mismatch.
+      self.assertEqual(-1, self.evaluate(table.lookup(input_string_ref)))
+
+      # lookup with keys of the wrong type
+      input_string = constant_op.constant([1, 2, 3], dtypes.int64)
+      with self.assertRaises(ValueError):
+        self.evaluate(table.lookup(input_string))
+
+      # default value of the wrong type
+      with self.assertRaises(TypeError):
+        lookup_ops.MutableHashTable(dtypes.string, dtypes.int64, "UNK")
+
+  def testMutableHashTableStringFloat(self):
+    with self.cached_session():
+      default_val = -1.5
+      keys = constant_op.constant(["brain", "salad", "surgery"])
+      values = constant_op.constant([0, 1.1, 2.2], dtypes.float32)
+      table = lookup_ops.MutableHashTable(dtypes.string, dtypes.float32,
+                                          default_val)
+      self.assertAllEqual(0, self.evaluate(table.size()))
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      input_string = constant_op.constant(["brain", "salad", "tank"])
+      output = table.lookup(input_string)
+
+      result = self.evaluate(output)
+      self.assertAllClose([0, 1.1, default_val], result)
+
+  def testMutableHashTableIntFloat(self):
+    with self.cached_session():
+      default_val = -1.0
+      keys = constant_op.constant([3, 7, 0], dtypes.int64)
+      values = constant_op.constant([7.5, -1.2, 9.9], dtypes.float32)
+      table = lookup_ops.MutableHashTable(dtypes.int64, dtypes.float32,
+                                          default_val)
+      self.assertAllEqual(0, self.evaluate(table.size()))
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      input_string = constant_op.constant([7, 0, 11], dtypes.int64)
+      output = table.lookup(input_string)
+
+      result = self.evaluate(output)
+      self.assertAllClose([-1.2, 9.9, default_val], result)
+
+  def testMutableHashTableInt64String(self):
+    with self.cached_session():
+      default_val = "n/a"
+      keys = constant_op.constant([0, 1, 2], dtypes.int64)
+      values = constant_op.constant(["brain", "salad", "surgery"])
+      table = lookup_ops.MutableHashTable(dtypes.int64, dtypes.string,
+                                          default_val)
+      self.assertAllEqual(0, self.evaluate(table.size()))
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      input_string = constant_op.constant([0, 1, 3], dtypes.int64)
+      output = table.lookup(input_string)
+
+      result = self.evaluate(output)
+      self.assertAllEqual((b"brain", b"salad", b"n/a"), result)
+
+
+class MutableDenseHashTableOpTest(test.TestCase):
+
+  def testBasic(self):
+    with self.cached_session():
+
+      keys = constant_op.constant([11, 12, 13, 14], dtypes.int64)
+      values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
+      table = lookup_ops.MutableDenseHashTable(
+          dtypes.int64,
+          dtypes.int64,
+          default_value=-1,
+          empty_key=0,
+          deleted_key=-1)
+      self.assertAllEqual(0, self.evaluate(table.size()))
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(4, self.evaluate(table.size()))
+
+      remove_string = constant_op.constant([12, 15], dtypes.int64)
+      self.evaluate(table.remove(remove_string))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      input_string = constant_op.constant([11, 12, 15], dtypes.int64)
+      output = table.lookup(input_string)
+      self.assertAllEqual([3], output.get_shape())
+
+      result = self.evaluate(output)
+      self.assertAllEqual([0, -1, -1], result)
+
+  def testBasicBool(self):
+    with self.cached_session():
+
+      keys = constant_op.constant([11, 12, 13, 14], dtypes.int64)
+      values = constant_op.constant([True, True, True, True], dtypes.bool)
+      table = lookup_ops.MutableDenseHashTable(
+          dtypes.int64,
+          dtypes.bool,
+          default_value=False,
+          empty_key=0,
+          deleted_key=-1)
+      self.assertAllEqual(0, self.evaluate(table.size()))
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(4, self.evaluate(table.size()))
+
+      remove_string = constant_op.constant([11, 15], dtypes.int64)
+      self.evaluate(table.remove(remove_string))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      input_string = constant_op.constant([11, 12, 15], dtypes.int64)
+      output = table.lookup(input_string)
+      self.assertAllEqual([3], output.get_shape())
+
+      result = self.evaluate(output)
+      self.assertAllEqual([False, True, False], result)
+
+  def testSameEmptyAndDeletedKey(self):
+    with self.cached_session():
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "Empty and deleted keys"):
+        table = lookup_ops.MutableDenseHashTable(
+            dtypes.int64,
+            dtypes.int64,
+            default_value=-1,
+            empty_key=42,
+            deleted_key=42)
+        self.assertAllEqual(0, self.evaluate(table.size()))
+
+  @test_util.run_v1_only("uses placeholders")
+  def testLookupUnknownShape(self):
+    with self.cached_session():
+      keys = constant_op.constant([11, 12, 13], dtypes.int64)
+      values = constant_op.constant([0, 1, 2], dtypes.int64)
+      table = lookup_ops.MutableDenseHashTable(
+          dtypes.int64,
+          dtypes.int64,
+          default_value=-1,
+          empty_key=0,
+          deleted_key=-1)
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      placeholder_keys = array_ops.placeholder(dtypes.int64)
+      output = table.lookup(placeholder_keys)
+      self.assertAllEqual(None, output.get_shape())
+      result = output.eval({placeholder_keys: [11, 12, 15]})
+      self.assertAllEqual([0, 1, -1], result)
+
+  def testMapStringToFloat(self):
+    with self.cached_session():
+
+      keys = constant_op.constant(["a", "b", "c", "d"], dtypes.string)
+      values = constant_op.constant([0.0, 1.1, 2.2, 3.3], dtypes.float32)
+      default_value = constant_op.constant(-1.5, dtypes.float32)
+      table = lookup_ops.MutableDenseHashTable(
+          dtypes.string,
+          dtypes.float32,
+          default_value=default_value,
+          empty_key="",
+          deleted_key="$")
+      self.assertAllEqual(0, self.evaluate(table.size()))
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(4, self.evaluate(table.size()))
+
+      remove_string = constant_op.constant(["b", "e"])
+      self.evaluate(table.remove(remove_string))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      input_string = constant_op.constant(["a", "b", "d", "e"], dtypes.string)
+      output = table.lookup(input_string)
+      self.assertAllEqual([4], output.get_shape())
+
+      result = self.evaluate(output)
+      self.assertAllClose([0, -1.5, 3.3, -1.5], result)
+
+  def testMapInt64ToFloat(self):
+    for float_dtype in [dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+
+        keys = constant_op.constant([11, 12, 13, 14], dtypes.int64)
+        values = constant_op.constant([0.0, 1.1, 2.2, 3.3], float_dtype)
+        default_value = constant_op.constant(-1.5, float_dtype)
+        table = lookup_ops.MutableDenseHashTable(
+            dtypes.int64,
+            float_dtype,
+            default_value=default_value,
+            empty_key=0,
+            deleted_key=-1)
+        self.assertAllEqual(0, self.evaluate(table.size()))
+
+        self.evaluate(table.insert(keys, values))
+        self.assertAllEqual(4, self.evaluate(table.size()))
+
+        remove_string = constant_op.constant([12, 15], dtypes.int64)
+        self.evaluate(table.remove(remove_string))
+        self.assertAllEqual(3, self.evaluate(table.size()))
+
+        input_string = constant_op.constant([11, 12, 14, 15], dtypes.int64)
+        output = table.lookup(input_string)
+        self.assertAllEqual([4], output.get_shape())
+
+        result = self.evaluate(output)
+        self.assertAllClose([0, -1.5, 3.3, -1.5], result)
+
+  def testVectorValues(self):
+    with self.cached_session():
+      keys = constant_op.constant([11, 12, 13], dtypes.int64)
+      values = constant_op.constant([[0, 1, 2, 3], [3, 4, 5, 6], [6, 7, 8, 9]],
+                                    dtypes.int64)
+      default_value = constant_op.constant([-1, -2, -3, -4], dtypes.int64)
+      table = lookup_ops.MutableDenseHashTable(
+          dtypes.int64,
+          dtypes.int64,
+          default_value=default_value,
+          empty_key=0,
+          deleted_key=-1,
+          initial_num_buckets=4)
+      self.assertAllEqual(0, self.evaluate(table.size()))
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+      self.assertAllEqual(4, len(self.evaluate(table.export()[0])))
+
+      self.evaluate(
+          table.insert(
+              constant_op.constant([14], dtypes.int64),
+              constant_op.constant([[2, 3, 4, 5]], dtypes.int64)))
+      self.assertAllEqual(4, self.evaluate(table.size()))
+      self.assertAllEqual(8, len(self.evaluate(table.export()[0])))
+
+      remove_string = constant_op.constant([12, 16], dtypes.int64)
+      self.evaluate(table.remove(remove_string))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+      self.assertAllEqual(8, len(self.evaluate(table.export()[0])))
+
+      input_string = constant_op.constant([11, 12, 14, 15], dtypes.int64)
+      output = table.lookup(input_string)
+      self.assertAllEqual([4, 4],
+                          output.shape,
+                          msg="Saw shape: %s" % output.shape)
+
+      result = self.evaluate(output)
+      self.assertAllEqual(
+          [[0, 1, 2, 3], [-1, -2, -3, -4], [2, 3, 4, 5], [-1, -2, -3, -4]],
+          result)
+
+  def testVectorKeys(self):
+    with self.cached_session():
+      keys = constant_op.constant([[0, 1], [1, 2], [1, 3]], dtypes.int64)
+      values = constant_op.constant([10, 11, 12], dtypes.int64)
+      empty_key = constant_op.constant([0, 3], dtypes.int64)
+      deleted_key = constant_op.constant([-1, -1], dtypes.int64)
+      default_value = constant_op.constant(-1, dtypes.int64)
+      table = lookup_ops.MutableDenseHashTable(
+          dtypes.int64,
+          dtypes.int64,
+          default_value=default_value,
+          empty_key=empty_key,
+          deleted_key=deleted_key,
+          initial_num_buckets=8)
+      self.assertAllEqual(0, self.evaluate(table.size()))
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      self.evaluate(
+          table.insert(
+              constant_op.constant([[0, 0]], dtypes.int64),
+              constant_op.constant([13], dtypes.int64)))
+      self.assertAllEqual(4, self.evaluate(table.size()))
+      self.assertAllEqual(8, len(self.evaluate(table.export()[0])))
+
+      remove_string = constant_op.constant([[1, 2], [7, 8]], dtypes.int64)
+      self.evaluate(table.remove(remove_string))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+      self.assertAllEqual(8, len(self.evaluate(table.export()[0])))
+
+      input_string = constant_op.constant([[0, 1], [1, 2], [1, 3], [0, 2]],
+                                          dtypes.int64)
+      output = table.lookup(input_string)
+      self.assertAllEqual([4], output.get_shape())
+
+      result = self.evaluate(output)
+      self.assertAllEqual([10, -1, 12, -1], result)
+
+  def testResize(self):
+    with self.cached_session():
+      keys = constant_op.constant([11, 12, 13], dtypes.int64)
+      values = constant_op.constant([0, 1, 2], dtypes.int64)
+      table = lookup_ops.MutableDenseHashTable(
+          dtypes.int64,
+          dtypes.int64,
+          default_value=-1,
+          empty_key=0,
+          deleted_key=-1,
+          initial_num_buckets=4)
+      self.assertAllEqual(0, self.evaluate(table.size()))
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+      self.assertAllEqual(4, len(self.evaluate(table.export()[0])))
+
+      keys2 = constant_op.constant([12, 99], dtypes.int64)
+      self.evaluate(table.remove(keys2))
+      self.assertAllEqual(2, self.evaluate(table.size()))
+      self.assertAllEqual(4, len(self.evaluate(table.export()[0])))
+
+      keys3 = constant_op.constant([13, 14, 15, 16, 17], dtypes.int64)
+      values3 = constant_op.constant([3, 4, 5, 6, 7], dtypes.int64)
+
+      self.evaluate(table.insert(keys3, values3))
+      self.assertAllEqual(6, self.evaluate(table.size()))
+      self.assertAllEqual(16, len(self.evaluate(table.export()[0])))
+
+      keys4 = constant_op.constant([10, 11, 12, 13, 14, 15, 16, 17, 18],
+                                   dtypes.int64)
+      output = table.lookup(keys4)
+      self.assertAllEqual([-1, 0, -1, 3, 4, 5, 6, 7, -1], self.evaluate(output))
+
+  def testExport(self):
+    with self.cached_session():
+
+      keys = constant_op.constant([11, 12, 13, 14], dtypes.int64)
+      values = constant_op.constant([1, 2, 3, 4], dtypes.int64)
+      table = lookup_ops.MutableDenseHashTable(
+          dtypes.int64,
+          dtypes.int64,
+          default_value=-1,
+          empty_key=100,
+          deleted_key=200,
+          initial_num_buckets=8)
+      self.assertAllEqual(0, self.evaluate(table.size()))
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(4, self.evaluate(table.size()))
+
+      keys2 = constant_op.constant([12, 15], dtypes.int64)
+      self.evaluate(table.remove(keys2))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      exported_keys, exported_values = table.export()
+
+      np_keys = self.evaluate(exported_keys)
+      np_values = self.evaluate(exported_values)
+
+      self.assertAllEqual(8, len(np_keys))
+      self.assertAllEqual(8, len(np_values))
+
+      # pair up keys and values, drop extra added dimension
+      pairs = np.dstack((np_keys.flatten(), np_values.flatten()))[0]
+      # sort by key
+      pairs = pairs[pairs[:, 0].argsort()]
+      self.assertAllEqual([[11, 1], [13, 3], [14, 4], [100, 0], [100, 0],
+                           [100, 0], [100, 0], [200, 2]], pairs)
+
+  @test_util.run_v1_only("Saver V1 only")
+  def testSaveRestore(self):
+    save_dir = os.path.join(self.get_temp_dir(), "save_restore")
+    save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
+
+    with self.session(graph=ops.Graph()) as sess:
+      default_value = -1
+      empty_key = 0
+      deleted_key = -1
+      keys = constant_op.constant([11, 12, 13, 14], dtypes.int64)
+      values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
+      table = lookup_ops.MutableDenseHashTable(
+          dtypes.int64,
+          dtypes.int64,
+          default_value=default_value,
+          empty_key=empty_key,
+          deleted_key=deleted_key,
+          name="t1",
+          checkpoint=True,
+          initial_num_buckets=32)
+
+      save = saver.Saver()
+
+      self.assertAllEqual(0, table.size().eval())
+      table.insert(keys, values).run()
+      self.assertAllEqual(4, table.size().eval())
+      self.assertAllEqual(32, len(table.export()[0].eval()))
+
+      keys2 = constant_op.constant([12, 15], dtypes.int64)
+      table.remove(keys2).run()
+      self.assertAllEqual(3, table.size().eval())
+      self.assertAllEqual(32, len(table.export()[0].eval()))
+
+      val = save.save(sess, save_path)
+      self.assertTrue(isinstance(val, six.string_types))
+      self.assertEqual(save_path, val)
+
+    with self.session(graph=ops.Graph()) as sess:
+      table = lookup_ops.MutableDenseHashTable(
+          dtypes.int64,
+          dtypes.int64,
+          default_value=default_value,
+          empty_key=empty_key,
+          deleted_key=deleted_key,
+          name="t1",
+          checkpoint=True,
+          initial_num_buckets=64)
+      table.insert(
+          constant_op.constant([11, 14], dtypes.int64),
+          constant_op.constant([12, 24], dtypes.int64)).run()
+      self.assertAllEqual(2, table.size().eval())
+      self.assertAllEqual(64, len(table.export()[0].eval()))
+
+      save = saver.Saver()
+
+      # Restore the saved values in the parameter nodes.
+      save.restore(sess, save_path)
+
+      self.assertAllEqual(3, table.size().eval())
+      self.assertAllEqual(32, len(table.export()[0].eval()))
+
+      input_string = constant_op.constant([10, 11, 12, 13, 14], dtypes.int64)
+      output = table.lookup(input_string)
+      self.assertAllEqual([-1, 0, -1, 2, 3], output.eval())
+
+  @test_util.run_in_graph_and_eager_modes
+  def testObjectSaveRestore(self):
+    save_dir = os.path.join(self.get_temp_dir(), "save_restore")
+    save_prefix = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
+
+    default_value = -1
+    empty_key = 0
+    deleted_key = -1
+    keys = constant_op.constant([11, 12, 13], dtypes.int64)
+    values = constant_op.constant([0, 1, 2], dtypes.int64)
+    save_table = lookup_ops.MutableDenseHashTable(
+        dtypes.int64,
+        dtypes.int64,
+        default_value=default_value,
+        empty_key=empty_key,
+        deleted_key=deleted_key,
+        name="t1",
+        checkpoint=True,
+        initial_num_buckets=32)
+
+    save_checkpoint = trackable.Checkpoint(table=save_table)
+
+    self.assertAllEqual(0, self.evaluate(save_table.size()))
+    self.evaluate(save_table.insert(keys, values))
+    self.assertAllEqual(3, self.evaluate(save_table.size()))
+    self.assertAllEqual(32, len(self.evaluate(save_table.export()[0])))
+
+    save_path = save_checkpoint.save(save_prefix)
+    del save_table, save_checkpoint
+
+    load_table = lookup_ops.MutableDenseHashTable(
+        dtypes.int64,
+        dtypes.int64,
+        default_value=default_value,
+        empty_key=empty_key,
+        deleted_key=deleted_key,
+        name="t1",
+        checkpoint=True,
+        initial_num_buckets=64)
+    self.evaluate(
+        load_table.insert(
+            constant_op.constant([11, 14], dtypes.int64),
+            constant_op.constant([12, 24], dtypes.int64)))
+    self.assertAllEqual(2, self.evaluate(load_table.size()))
+    self.assertAllEqual(64, len(self.evaluate(load_table.export()[0])))
+
+    restore_checkpoint = trackable.Checkpoint(table=load_table)
+
+    # Restore the saved values in the parameter nodes.
+    restore_checkpoint.restore(save_path).run_restore_ops()
+
+    self.assertAllEqual(3, self.evaluate(load_table.size()))
+    self.assertAllEqual(32, len(self.evaluate(load_table.export()[0])))
+
+    input_string = constant_op.constant([10, 11, 12, 13, 14], dtypes.int64)
+    output = load_table.lookup(input_string)
+    self.assertAllEqual([-1, 0, 1, 2, -1], self.evaluate(output))
+
+  @test_util.run_v1_only("Saver V1 only")
+  def testVectorSaveRestore(self):
+    save_dir = os.path.join(self.get_temp_dir(), "vector_save_restore")
+    save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
+
+    with self.session(graph=ops.Graph()) as sess:
+      empty_key = constant_op.constant([11, 13], dtypes.int64)
+      deleted_key = constant_op.constant([-2, -3], dtypes.int64)
+      default_value = constant_op.constant([-1, -2], dtypes.int64)
+      keys = constant_op.constant([[11, 12], [11, 14], [12, 13], [13, 14]],
+                                  dtypes.int64)
+      values = constant_op.constant([[0, 1], [2, 3], [2, 4], [4, 5]],
+                                    dtypes.int64)
+      table = lookup_ops.MutableDenseHashTable(
+          dtypes.int64,
+          dtypes.int64,
+          default_value=default_value,
+          empty_key=empty_key,
+          deleted_key=deleted_key,
+          name="t1",
+          checkpoint=True,
+          initial_num_buckets=32)
+
+      save = saver.Saver()
+
+      self.assertAllEqual(0, table.size().eval())
+      table.insert(keys, values).run()
+      self.assertAllEqual(4, table.size().eval())
+      self.assertAllEqual(32, len(table.export()[0].eval()))
+
+      keys2 = constant_op.constant([[12, 13], [16, 17]], dtypes.int64)
+      table.remove(keys2).run()
+      self.assertAllEqual(3, table.size().eval())
+      self.assertAllEqual(32, len(table.export()[0].eval()))
+
+      val = save.save(sess, save_path)
+      self.assertTrue(isinstance(val, six.string_types))
+      self.assertEqual(save_path, val)
+
+    with self.session(graph=ops.Graph()) as sess:
+      empty_key = constant_op.constant([11, 13], dtypes.int64)
+      deleted_key = constant_op.constant([-2, -3], dtypes.int64)
+      default_value = constant_op.constant([-1, -2], dtypes.int64)
+      table = lookup_ops.MutableDenseHashTable(
+          dtypes.int64,
+          dtypes.int64,
+          default_value=default_value,
+          empty_key=empty_key,
+          deleted_key=deleted_key,
+          name="t1",
+          checkpoint=True,
+          initial_num_buckets=64)
+      table.insert(
+          constant_op.constant([[11, 12], [13, 15]], dtypes.int64),
+          constant_op.constant([[21, 22], [23, 24]], dtypes.int64)).run()
+      self.assertAllEqual(2, table.size().eval())
+      self.assertAllEqual(64, len(table.export()[0].eval()))
+
+      save = saver.Saver()
+
+      # Restore the saved values in the parameter nodes.
+      save.restore(sess, save_path)
+
+      self.assertAllEqual(3, table.size().eval())
+      self.assertAllEqual(32, len(table.export()[0].eval()))
+
+      input_string = constant_op.constant(
+          [[11, 12], [11, 14], [11, 15], [13, 14], [13, 15]], dtypes.int64)
+      output = table.lookup(input_string)
+      self.assertAllEqual([[0, 1], [2, 3], [-1, -2], [4, 5], [-1, -2]],
+                          output.eval())
+
+  @test_util.run_v1_only("Saver V1 only")
+  def testVectorScalarSaveRestore(self):
+    save_dir = os.path.join(self.get_temp_dir(), "vector_scalar_save_restore")
+    save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
+
+    with self.session(graph=ops.Graph()) as sess:
+      empty_key = constant_op.constant([11, 13], dtypes.int64)
+      deleted_key = constant_op.constant([-1, -1], dtypes.int64)
+      default_value = constant_op.constant(-1, dtypes.int64)
+      keys = constant_op.constant([[11, 12], [11, 14], [12, 13], [13, 14]],
+                                  dtypes.int64)
+      values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
+      table = lookup_ops.MutableDenseHashTable(
+          dtypes.int64,
+          dtypes.int64,
+          default_value=default_value,
+          empty_key=empty_key,
+          deleted_key=deleted_key,
+          name="t2",
+          checkpoint=True,
+          initial_num_buckets=32)
+
+      save = saver.Saver()
+
+      self.assertAllEqual(0, table.size().eval())
+      table.insert(keys, values).run()
+      self.assertAllEqual(4, table.size().eval())
+      self.assertAllEqual(32, len(table.export()[0].eval()))
+
+      keys2 = constant_op.constant([[12, 13], [15, 16]], dtypes.int64)
+      table.remove(keys2).run()
+      self.assertAllEqual(3, table.size().eval())
+      self.assertAllEqual(32, len(table.export()[0].eval()))
+
+      val = save.save(sess, save_path)
+      self.assertTrue(isinstance(val, six.string_types))
+      self.assertEqual(save_path, val)
+
+    with self.session(graph=ops.Graph()) as sess:
+      empty_key = constant_op.constant([11, 13], dtypes.int64)
+      deleted_key = constant_op.constant([-1, -1], dtypes.int64)
+      default_value = constant_op.constant(-1, dtypes.int64)
+      table = lookup_ops.MutableDenseHashTable(
+          dtypes.int64,
+          dtypes.int64,
+          default_value=default_value,
+          empty_key=empty_key,
+          deleted_key=deleted_key,
+          name="t2",
+          checkpoint=True,
+          initial_num_buckets=64)
+      table.insert(
+          constant_op.constant([[11, 12], [13, 15]], dtypes.int64),
+          constant_op.constant([3, 4], dtypes.int64)).run()
+      self.assertAllEqual(2, table.size().eval())
+      self.assertAllEqual(64, len(table.export()[0].eval()))
+
+      save = saver.Saver()
+
+      # Restore the saved values in the parameter nodes.
+      save.restore(sess, save_path)
+
+      self.assertAllEqual(3, table.size().eval())
+      self.assertAllEqual(32, len(table.export()[0].eval()))
+
+      input_string = constant_op.constant(
+          [[11, 12], [11, 14], [11, 15], [13, 14], [13, 15]], dtypes.int64)
+      output = table.lookup(input_string)
+      self.assertAllEqual([0, 1, -1, 3, -1], output.eval())
+
+  def testReprobe(self):
+    with self.cached_session():
+      # Insert 6 keys into a table with 8 buckets.
+      # The values are chosen to make sure collisions occur when using GCC STL
+      keys = constant_op.constant([11, 12, 13, 19, 20, 21], dtypes.int64)
+      values = constant_op.constant([51, 52, 53, 54, 55, 56], dtypes.int64)
+      table = lookup_ops.MutableDenseHashTable(
+          dtypes.int64,
+          dtypes.int64,
+          default_value=-1,
+          empty_key=0,
+          deleted_key=-1,
+          initial_num_buckets=8)
+      self.assertAllEqual(0, self.evaluate(table.size()))
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(6, self.evaluate(table.size()))
+
+      input_string = constant_op.constant([10, 11, 12, 13, 14, 19, 20, 21, 22],
+                                          dtypes.int64)
+      output = table.lookup(input_string)
+      self.assertAllEqual([9], output.get_shape())
+
+      result = self.evaluate(output)
+      self.assertAllEqual([-1, 51, 52, 53, -1, 54, 55, 56, -1], result)
+
+  def testCustomEmptyKey(self):
+    with self.cached_session():
+      keys = constant_op.constant([11, 0, 13], dtypes.int64)
+      values = constant_op.constant([0, 1, 2], dtypes.int64)
+      table = lookup_ops.MutableDenseHashTable(
+          dtypes.int64,
+          dtypes.int64,
+          default_value=-1,
+          empty_key=12,
+          deleted_key=-1)
+      self.assertAllEqual(0, self.evaluate(table.size()))
+
+      self.evaluate(table.insert(keys, values))
+      self.assertAllEqual(3, self.evaluate(table.size()))
+
+      input_string = constant_op.constant([11, 0, 15], dtypes.int64)
+      output = table.lookup(input_string)
+      self.assertAllEqual([3], output.get_shape())
+
+      result = self.evaluate(output)
+      self.assertAllEqual([0, 1, -1], result)
+
+  def testErrors(self):
+    with self.cached_session():
+      table = lookup_ops.MutableDenseHashTable(
+          dtypes.int64,
+          dtypes.int64,
+          default_value=-1,
+          empty_key=0,
+          deleted_key=-1)
+
+      # Inserting the empty key returns an error
+      keys1 = constant_op.constant([11, 0], dtypes.int64)
+      values1 = constant_op.constant([0, 1], dtypes.int64)
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "empty_key"):
+        self.evaluate(table.insert(keys1, values1))
+
+      # Looking up the empty key returns an error
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "empty_key"):
+        self.evaluate(table.lookup(keys1))
+
+      # Inserting the deleted key returns an error
+      keys2 = constant_op.constant([11, -1], dtypes.int64)
+      values2 = constant_op.constant([0, 1], dtypes.int64)
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "deleted_key"):
+        self.evaluate(table.insert(keys2, values2))
+
+      # Looking up the empty key returns an error
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "deleted_key"):
+        self.evaluate(table.lookup(keys2))
+
+      # Arbitrary tensors of keys are not supported
+      keys = constant_op.constant([[11, 0], [12, 1]], dtypes.int64)
+      values = constant_op.constant([[11, 0], [12, 1]], dtypes.int64)
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "Expected key shape"):
+        self.evaluate(table.lookup(keys))
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "Expected key shape"):
+        self.evaluate(table.insert(keys, values))
+
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "Number of buckets must be"):
+        table2 = lookup_ops.MutableDenseHashTable(
+            dtypes.int64,
+            dtypes.int64,
+            default_value=-1,
+            empty_key=17,
+            deleted_key=-1,
+            initial_num_buckets=12)
+        self.assertAllEqual(0, self.evaluate(table2.size()))
+
+      with self.assertRaisesRegexp(
+          errors_impl.InvalidArgumentError,
+          "Empty and deleted keys must have same shape"):
+        table3 = lookup_ops.MutableDenseHashTable(
+            dtypes.int64,
+            dtypes.int64,
+            default_value=-1,
+            empty_key=42,
+            deleted_key=[1, 2])
+        self.assertAllEqual(0, self.evaluate(table3.size()))
+
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "Empty and deleted keys cannot be equal"):
+        table4 = lookup_ops.MutableDenseHashTable(
+            dtypes.int64,
+            dtypes.int64,
+            default_value=-1,
+            empty_key=42,
+            deleted_key=42)
+        self.assertAllEqual(0, self.evaluate(table4.size()))
+
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "Empty and deleted keys cannot be equal"):
+        table5 = lookup_ops.MutableDenseHashTable(
+            dtypes.int64,
+            dtypes.int64,
+            default_value=-1,
+            empty_key=[1, 2, 3],
+            deleted_key=[1, 2, 3])
+        self.assertAllEqual(0, self.evaluate(table5.size()))
+
+
+class MutableHashTableBenchmark(test.Benchmark):
+
+  def _create_table(self):
+    return lookup_ops.MutableHashTable(dtypes.int64, dtypes.float32, 0.0)
+
+  def benchmark_single_repeated_scalar_insert_scalar(self):
+    table = self._create_table()
+    value = variables.Variable(1.0)
+    insert = table.insert(0, value)
+    size = table.size()
+    with session.Session() as sess:
+      sess.run(value.initializer)
+      self.run_op_benchmark(sess, insert, burn_iters=10, min_iters=10000)
+      assert sess.run(size) == 1
+
+  def benchmark_many_repeated_scalar_insert_scalar(self):
+    table = self._create_table()
+    c = dataset_ops.make_one_shot_iterator(counter.Counter()).get_next()
+    value = variables.Variable(1.0)
+    insert = table.insert(c, value)
+    size = table.size()
+    with session.Session() as sess:
+      sess.run(value.initializer)
+      self.run_op_benchmark(sess, insert, burn_iters=10, min_iters=10000)
+      assert sess.run(size) >= 10000
+
+  def benchmark_single_repeated_batch_32_insert_scalar(self):
+    table = self._create_table()
+    value = variables.Variable([1.0] * 32)
+    insert = table.insert(list(range(32)), value)
+    size = table.size()
+    with session.Session() as sess:
+      sess.run(value.initializer)
+      self.run_op_benchmark(sess, insert, burn_iters=10, min_iters=1000)
+      assert sess.run(size) == 32
+
+  def benchmark_many_repeated_batch_32_insert_scalar(self):
+    table = self._create_table()
+    c = dataset_ops.make_one_shot_iterator(counter.Counter()).get_next()
+    value = variables.Variable([1.0] * 32)
+    insert = table.insert(32 * c + list(range(32)), value)
+    size = table.size()
+    with session.Session() as sess:
+      sess.run(value.initializer)
+      self.run_op_benchmark(sess, insert, burn_iters=10, min_iters=1000)
+      assert sess.run(size) >= 1000 * 32
+
+
+class MutableDenseHashTableBenchmark(MutableHashTableBenchmark):
+
+  def _create_table(self):
+    return lookup_ops.MutableDenseHashTable(
+        dtypes.int64,
+        dtypes.float32,
+        default_value=0.0,
+        empty_key=-1,
+        deleted_key=-2)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class StaticVocabularyTableTest(test.TestCase):
+
+  def _createVocabFile(self, basename, values=("brain", "salad", "surgery")):
+    vocabulary_file = os.path.join(self.get_temp_dir(), basename)
+    with open(vocabulary_file, "w") as f:
+      f.write("\n".join(values) + "\n")
+    return vocabulary_file
+
+  def testStringStaticVocabularyTable(self):
+    vocab_file = self._createVocabFile("feat_to_id_1.txt")
+    with self.cached_session():
+      vocab_size = 3
+      oov_buckets = 1
+      table = lookup_ops.StaticVocabularyTable(
+          lookup_ops.TextFileIdTableInitializer(
+              vocab_file, vocab_size=vocab_size), oov_buckets)
+
+      self.evaluate(table.initializer)
+
+      input_string = constant_op.constant(["brain", "salad", "surgery", "UNK"])
+
+      out = table.lookup(input_string)
+      self.assertAllEqual([0, 1, 2, 3], self.evaluate(out))
+      self.assertEqual(vocab_size + oov_buckets, self.evaluate(table.size()))
+
+  def testInt32StaticVocabularyTable(self):
+    vocab_file = self._createVocabFile("feat_to_id_2.txt", ("42", "1", "-1000"))
+    with self.cached_session():
+      vocab_size = 3
+      oov_buckets = 1
+      table = lookup_ops.StaticVocabularyTable(
+          lookup_ops.TextFileIdTableInitializer(
+              vocab_file, vocab_size=vocab_size, key_dtype=dtypes.int64),
+          oov_buckets,
+          lookup_key_dtype=dtypes.int32)
+
+      self.evaluate(table.initializer)
+
+      values = constant_op.constant((42, 1, -1000, 11), dtype=dtypes.int32)
+
+      out = table.lookup(values)
+      self.assertAllEqual([0, 1, 2, 3], self.evaluate(out))
+      self.assertEqual(vocab_size + oov_buckets, self.evaluate(table.size()))
+
+  def testInt64StaticVocabularyTable(self):
+    vocab_file = self._createVocabFile("feat_to_id_3.txt", ("42", "1", "-1000"))
+    with self.cached_session():
+      vocab_size = 3
+      oov_buckets = 1
+      table = lookup_ops.StaticVocabularyTable(
+          lookup_ops.TextFileIdTableInitializer(
+              vocab_file, vocab_size=vocab_size, key_dtype=dtypes.int64),
+          oov_buckets)
+
+      self.evaluate(table.initializer)
+
+      values = constant_op.constant((42, 1, -1000, 11), dtype=dtypes.int64)
+
+      out = table.lookup(values)
+      self.assertAllEqual([0, 1, 2, 3], self.evaluate(out))
+      self.assertEqual(vocab_size + oov_buckets, self.evaluate(table.size()))
+
+  def testStringStaticVocabularyTableNoInitializer(self):
+    with self.cached_session():
+      oov_buckets = 5
+
+      # Set a table that only uses hash buckets, for each input value returns
+      # an id calculated by fingerprint("input") mod oov_buckets.
+      table = lookup_ops.StaticVocabularyTable(None, oov_buckets)
+      self.evaluate(table.initializer)
+
+      values = constant_op.constant(("brain", "salad", "surgery"))
+
+      out = table.lookup(values)
+      self.assertAllEqual(
+          [
+              3,  # fingerprint("brain") mod 5.
+              1,  # fingerprint("salad") mod 5.
+              4  # fingerprint("surgery") mod 5
+          ],
+          self.evaluate(out))
+      self.assertEqual(oov_buckets, self.evaluate(table.size()))
+
+  def testStaticVocabularyTableWithMultipleInitializers(self):
+    vocab_file = self._createVocabFile("feat_to_id_4.txt")
+    with self.cached_session():
+      vocab_size = 3
+      oov_buckets = 3
+
+      init = lookup_ops.TextFileIdTableInitializer(
+          vocab_file, vocab_size=vocab_size)
+      table1 = lookup_ops.StaticVocabularyTable(
+          init, oov_buckets, name="table1")
+
+      table2 = lookup_ops.StaticVocabularyTable(
+          init, oov_buckets, name="table2")
+
+      self.evaluate(lookup_ops.tables_initializer())
+
+      input_string = constant_op.constant(
+          ["fruit", "brain", "salad", "surgery", "UNK"])
+
+      out1 = table1.lookup(input_string)
+      out2 = table2.lookup(input_string)
+
+      out1, out2 = self.evaluate([out1, out2])
+      self.assertAllEqual([5, 0, 1, 2, 5], out1)
+      self.assertAllEqual([5, 0, 1, 2, 5], out2)
+      self.assertEqual(vocab_size + oov_buckets, self.evaluate(table1.size()))
+      self.assertEqual(vocab_size + oov_buckets, self.evaluate(table2.size()))
+
+  def testStaticVocabularyTableInitializationAcrossSessions(self):
+    vocab_file = self._createVocabFile("feat_to_id_5.txt")
+    with self.cached_session():
+      vocab_size = 3
+      oov_buckets = 1
+      table1 = lookup_ops.StaticVocabularyTable(
+          lookup_ops.TextFileIdTableInitializer(
+              vocab_file, vocab_size=vocab_size), oov_buckets)
+
+      self.evaluate(table1.initializer)
+
+      input_string_1 = constant_op.constant(
+          ["brain", "salad", "surgery", "UNK"])
+
+      out1 = table1.lookup(input_string_1)
+
+      self.assertAllEqual([0, 1, 2, 3], self.evaluate(out1))
+      self.assertEqual(vocab_size + oov_buckets, self.evaluate(table1.size()))
+
+    with self.cached_session():
+      vocab_size = 3
+      oov_buckets = 1
+
+      # Underlying lookup table already initialized in previous session.
+      # No need to initialize table2
+      table2 = lookup_ops.StaticVocabularyTable(
+          lookup_ops.TextFileIdTableInitializer(
+              vocab_file, vocab_size=vocab_size), oov_buckets)
+
+      input_string_2 = constant_op.constant(["fruit", "salad", "UNK"])
+
+      out2 = table2.lookup(input_string_2)
+
+      self.assertAllEqual([3, 1, 3], self.evaluate(out2))
+      self.assertEqual(vocab_size + oov_buckets, self.evaluate(table2.size()))
+
+  def testSparseTensor(self):
+    vocab_file = self._createVocabFile("feat_to_id_7.txt")
+    input_indices = [[0, 0], [0, 1], [2, 0], [2, 2], [3, 0]]
+    input_shape = [4, 4]
+    with self.cached_session() as sess:
+      sp_features = sparse_tensor.SparseTensor(
+          constant_op.constant(input_indices, dtypes.int64),
+          constant_op.constant(["brain", "salad", "brain", "surgery", "tarkus"],
+                               dtypes.string),
+          constant_op.constant(input_shape, dtypes.int64))
+
+      table = lookup_ops.StaticVocabularyTable(
+          lookup_ops.TextFileIdTableInitializer(vocab_file, vocab_size=3), 1)
+      self.evaluate(table.initializer)
+
+      sp_ids = table.lookup(sp_features)
+
+      self.assertAllEqual([5], sp_ids.values._shape_as_list())
+
+      sp_ids_ind, sp_ids_val, sp_ids_shape = sess.run(
+          [sp_ids.indices, sp_ids.values, sp_ids.dense_shape])
+
+      self.assertAllEqual(input_indices, sp_ids_ind)
+      self.assertAllEqual([0, 1, 0, 2, 3], sp_ids_val)
+      self.assertAllEqual(input_shape, sp_ids_shape)
+
+  def testInt32SparseTensor(self):
+    input_indices = [[0, 0], [0, 1], [2, 0], [2, 2], [3, 0]]
+    input_shape = [4, 4]
+    with self.cached_session() as sess:
+      sp_features = sparse_tensor.SparseTensor(
+          constant_op.constant(input_indices, dtypes.int64),
+          constant_op.constant([42, 1, 42, -1000, 11], dtypes.int32),
+          constant_op.constant(input_shape, dtypes.int64))
+
+      table = lookup_ops.StaticVocabularyTable(
+          lookup_ops.KeyValueTensorInitializer((42, 1, -1000), (0, 1, 2),
+                                               dtypes.int64, dtypes.int64),
+          1,
+          lookup_key_dtype=dtypes.int32)
+      self.evaluate(table.initializer)
+
+      sp_ids = table.lookup(sp_features)
+
+      self.assertAllEqual([5], sp_ids.values._shape_as_list())
+
+      sp_ids_ind, sp_ids_val, sp_ids_shape = sess.run(
+          [sp_ids.indices, sp_ids.values, sp_ids.dense_shape])
+
+      self.assertAllEqual(input_indices, sp_ids_ind)
+      self.assertAllEqual([0, 1, 0, 2, 3], sp_ids_val)
+      self.assertAllEqual(input_shape, sp_ids_shape)
+
+  def testInt64SparseTensor(self):
+    input_indices = [[0, 0], [0, 1], [2, 0], [2, 2], [3, 0]]
+    input_shape = [4, 4]
+    with self.cached_session() as sess:
+      sp_features = sparse_tensor.SparseTensor(
+          constant_op.constant(input_indices, dtypes.int64),
+          constant_op.constant([42, 1, 42, -1000, 11], dtypes.int64),
+          constant_op.constant(input_shape, dtypes.int64))
+
+      table = lookup_ops.StaticVocabularyTable(
+          lookup_ops.KeyValueTensorInitializer((42, 1, -1000), (0, 1, 2),
+                                               dtypes.int64, dtypes.int64), 1)
+      self.evaluate(table.initializer)
+
+      sp_ids = table.lookup(sp_features)
+
+      self.assertAllEqual([5], sp_ids.values._shape_as_list())
+
+      sp_ids_ind, sp_ids_val, sp_ids_shape = sess.run(
+          [sp_ids.indices, sp_ids.values, sp_ids.dense_shape])
+
+      self.assertAllEqual(input_indices, sp_ids_ind)
+      self.assertAllEqual([0, 1, 0, 2, 3], sp_ids_val)
+      self.assertAllEqual(input_shape, sp_ids_shape)
+
+  def testStaticVocabularyTableNoInnerTable(self):
+    with self.cached_session():
+      table = lookup_ops.StaticVocabularyTable(None, num_oov_buckets=1)
+      self.assertIsNone(table.resource_handle)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/losses_test.py b/tensorflow/python/kernel_tests/losses_test.py
index 4584a27e6227bf53e4de5f74730cc9b737214cd5..89788936dbe6df6b2da8cfd3cf23a88ed1bc3bfb 100644
--- a/tensorflow/python/kernel_tests/losses_test.py
+++ b/tensorflow/python/kernel_tests/losses_test.py
@@ -38,6 +38,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.training import momentum as momentum_lib
 
 
+@test_util.run_deprecated_v1
 class AbsoluteDifferenceLossTest(test.TestCase):
 
   def setUp(self):
@@ -51,26 +52,22 @@ class AbsoluteDifferenceLossTest(test.TestCase):
         losses.absolute_difference(
             self._predictions, self._predictions, weights=None)
 
-  @test_util.run_v1_only("b/120545219")
   def testAllCorrectNoLossWeight(self):
     loss = losses.absolute_difference(self._predictions, self._predictions)
     with self.cached_session():
       self.assertAlmostEqual(0.0, self.evaluate(loss), 3)
 
-  @test_util.run_v1_only("b/120545219")
   def testNonZeroLoss(self):
     loss = losses.absolute_difference(self._labels, self._predictions)
     with self.cached_session():
       self.assertAlmostEqual(5.5, self.evaluate(loss), 3)
 
-  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithPythonScalarWeight(self):
     weights = 2.3
     loss = losses.absolute_difference(self._labels, self._predictions, weights)
     with self.cached_session():
       self.assertAlmostEqual(5.5 * weights, self.evaluate(loss), 3)
 
-  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithScalarTensorWeight(self):
     weights = 2.3
     loss = losses.absolute_difference(self._labels, self._predictions,
@@ -148,7 +145,7 @@ class SoftmaxCrossEntropyLossTest(test.TestCase):
       self.assertEquals(loss.op.name, 'softmax_cross_entropy_loss/value')
       self.assertAlmostEqual(loss.eval(), 10.0, 3)
 
-  @test_util.run_v1_only("b/120545219")
+  @test_util.run_deprecated_v1
   def testNonZeroLossWithPythonScalarWeight(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
                                    [0.0, 0.0, 10.0]])
@@ -158,7 +155,7 @@ class SoftmaxCrossEntropyLossTest(test.TestCase):
       loss = losses.softmax_cross_entropy(labels, logits, weights)
       self.assertAlmostEqual(weights * 10.0, self.evaluate(loss), 3)
 
-  @test_util.run_v1_only("b/120545219")
+  @test_util.run_deprecated_v1
   def testNonZeroLossWithScalarTensorWeight(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
                                    [0.0, 0.0, 10.0]])
@@ -311,7 +308,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
       self.assertEquals(loss.op.name, 'sparse_softmax_cross_entropy_loss/value')
       self.assertAlmostEqual(loss.eval(), 10.0, 3)
 
-  @test_util.run_v1_only("b/120545219")
+  @test_util.run_deprecated_v1
   def testNonZeroLossWithPythonScalarWeight(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
                                    [0.0, 0.0, 10.0]])
@@ -321,7 +318,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
       loss = losses.sparse_softmax_cross_entropy(labels, logits, weights)
       self.assertAlmostEqual(weights * 10.0, self.evaluate(loss), 3)
 
-  @test_util.run_v1_only("b/120545219")
+  @test_util.run_deprecated_v1
   def testNonZeroLossWithScalarTensorWeight(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
                                    [0.0, 0.0, 10.0]])
@@ -654,6 +651,7 @@ class SigmoidCrossEntropyLossTest(test.TestCase):
                              3)
 
 
+@test_util.run_deprecated_v1
 class LogLossTest(test.TestCase):
 
   def setUp(self):
@@ -677,13 +675,11 @@ class LogLossTest(test.TestCase):
       with self.assertRaises(ValueError):
         losses.log_loss(self._labels, self._labels, weights=None)
 
-  @test_util.run_v1_only("b/120545219")
   def testAllCorrectNoLossWeight(self):
     loss = losses.log_loss(self._labels, self._labels)
     with self.cached_session():
       self.assertAlmostEqual(0.0, self.evaluate(loss), 3)
 
-  @test_util.run_v1_only("b/120545219")
   def testAllCorrectNoLossWeightWithPlaceholder(self):
     tf_predictions = array_ops.placeholder(
         dtypes.float32, shape=self._np_labels.shape)
@@ -692,14 +688,12 @@ class LogLossTest(test.TestCase):
       self.assertAlmostEqual(
           0.0, loss.eval(feed_dict={tf_predictions: self._np_labels}), 3)
 
-  @test_util.run_v1_only("b/120545219")
   def testNonZeroLoss(self):
     loss = losses.log_loss(self._labels, self._predictions)
     with self.cached_session():
       self.assertAlmostEqual(-np.sum(self._expected_losses) / 6.0,
                              self.evaluate(loss), 3)
 
-  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithPythonScalarWeight(self):
     weights = 2.3
     loss = losses.log_loss(self._labels, self._predictions, weights)
@@ -707,7 +701,6 @@ class LogLossTest(test.TestCase):
       self.assertAlmostEqual(weights * -np.sum(self._expected_losses) / 6.0,
                              self.evaluate(loss), 3)
 
-  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithScalarTensorWeight(self):
     weights = 2.3
     loss = losses.log_loss(self._labels, self._predictions,
@@ -716,7 +709,6 @@ class LogLossTest(test.TestCase):
       self.assertAlmostEqual(weights * -np.sum(self._expected_losses) / 6.0,
                              self.evaluate(loss), 3)
 
-  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithScalarTensorWeightAndPlaceholder(self):
     tf_predictions = array_ops.placeholder(
         dtypes.float32, shape=self._np_predictions.shape)
@@ -728,7 +720,6 @@ class LogLossTest(test.TestCase):
       self.assertAlmostEqual(weights * -np.sum(self._expected_losses) / 6.0,
                              loss, 3)
 
-  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithScalarTensorWeightAndPlaceholderWithRankOnly(self):
     tf_predictions = array_ops.placeholder(dtypes.float32, shape=[None, None])
     weights = 2.3
@@ -788,7 +779,6 @@ class LogLossTest(test.TestCase):
       self.assertAlmostEqual(-np.sum(expected_losses) / 5.0,
                              self.evaluate(loss), 3)
 
-  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithMeasurementSpecificWeightsWithPlaceholder(self):
     weights = np.array([3, 6, 5, 0, 4, 2]).reshape((2, 3))
     expected_losses = np.multiply(self._expected_losses, weights)
@@ -816,7 +806,6 @@ class LogLossTest(test.TestCase):
     with self.cached_session():
       self.assertAlmostEqual(-np.sum(expected_losses), self.evaluate(loss), 3)
 
-  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithSampleSpecificWeightsMostZeroWithPlaceholder(self):
     weights = np.array([0, 0, 0, 0, 0, 2]).reshape((2, 3))
     expected_losses = np.multiply(self._expected_losses, weights)
@@ -934,6 +923,7 @@ class HuberLossTest(test.TestCase):
       self.assertAllClose(expected, self.evaluate(loss), atol=1e-5)
 
 
+@test_util.run_deprecated_v1
 class MeanSquaredErrorTest(test.TestCase):
 
   def setUp(self):
@@ -955,26 +945,26 @@ class MeanSquaredErrorTest(test.TestCase):
           losses.mean_squared_error(predictions=constant_op.constant(0),
                                     labels=constant_op.constant(0)).eval())
 
-  @test_util.run_v1_only("b/120545219")
+  @test_util.run_deprecated_v1
   def testAllCorrectNoLossWeight(self):
     loss = losses.mean_squared_error(self._predictions, self._predictions)
     with self.cached_session():
       self.assertAlmostEqual(0.0, self.evaluate(loss), 3)
 
-  @test_util.run_v1_only("b/120545219")
+  @test_util.run_deprecated_v1
   def testNonZeroLoss(self):
     loss = losses.mean_squared_error(self._labels, self._predictions)
     with self.cached_session():
       self.assertAlmostEqual(49.5, self.evaluate(loss), 3)
 
-  @test_util.run_v1_only("b/120545219")
+  @test_util.run_deprecated_v1
   def testNonZeroLossWithPythonScalarWeight(self):
     weights = 2.3
     loss = losses.mean_squared_error(self._labels, self._predictions, weights)
     with self.cached_session():
       self.assertAlmostEqual(49.5 * weights, self.evaluate(loss), 3)
 
-  @test_util.run_v1_only("b/120545219")
+  @test_util.run_deprecated_v1
   def testNonZeroLossWithScalarTensorWeight(self):
     weights = 2.3
     loss = losses.mean_squared_error(self._labels, self._predictions,
@@ -1013,6 +1003,7 @@ class MeanSquaredErrorTest(test.TestCase):
       self.assertAlmostEqual(0.0, self.evaluate(loss), 3)
 
 
+@test_util.run_deprecated_v1
 class MeanPairwiseSquaredErrorTest(test.TestCase):
 
   def setUp(self):
@@ -1068,12 +1059,10 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
       self.assertAlmostEqual(
           expected_loss, dynamic_inputs_op.eval(feed_dict=feed_dict), places=3)
 
-  @test_util.run_v1_only("b/120545219")
   def testAllCorrectNoLossWeight(self):
     self._test_valid_weights(
         self._labels, self._labels, expected_loss=0.0)
 
-  @test_util.run_v1_only("b/120545219")
   def testNonZeroLoss(self):
     self._test_valid_weights(
         self._labels, self._predictions,
@@ -1104,7 +1093,6 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
           np_grad = self.evaluate(grad)
           self.assertFalse(np.isnan(np_grad).any())
 
-  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithPythonScalarWeight(self):
     weight = 2.3
     self._test_valid_weights(
@@ -1112,7 +1100,6 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
         expected_loss=weight * np.sum(self._expected_losses),
         weights=weight)
 
-  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithScalarTensorWeight(self):
     weights = 2.3
     loss = losses.mean_pairwise_squared_error(
@@ -1123,12 +1110,10 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
       self.assertAlmostEqual(weights * np.sum(self._expected_losses),
                              self.evaluate(loss), 3)
 
-  @test_util.run_deprecated_v1
   def testNonZeroLossWithScalarZeroWeight(self):
     self._test_valid_weights(
         self._labels, self._predictions, expected_loss=0.0, weights=0.0)
 
-  @test_util.run_deprecated_v1
   def test3d(self):
     labels = np.array([
         [[1, 9, 2], [12, 11, 10], [9, 8, 7]],
@@ -1140,7 +1125,6 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
     ])
     self._test_valid_weights(labels, predictions, expected_loss=137.5)
 
-  @test_util.run_deprecated_v1
   def test3dWeightedScalar(self):
     labels = np.array([
         [[1, 9, 2], [12, 11, 10], [9, 8, 7]],
@@ -1179,7 +1163,6 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
             weights_placeholder: weights,
         })
 
-  @test_util.run_v1_only("b/120545219")
   def testInvalid3dWeighted2x0(self):
     labels = np.array([
         [[1, 9, 2], [12, 11, 10], [9, 8, 7]],
@@ -1192,7 +1175,6 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
     self._test_invalid_weights(
         labels, predictions, weights=np.asarray((1.2, 3.4)))
 
-  @test_util.run_deprecated_v1
   def test3dWeighted2x3x3(self):
     labels = np.array([
         [[1, 9, 2], [12, 11, 10], [9, 8, 7]],
@@ -1209,7 +1191,6 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
         expected_loss=9 * 137.5,
         weights=np.ones((2, 3, 3)))
 
-  @test_util.run_deprecated_v1
   def testLossWithAllZeroBatchSpecificWeights(self):
     self._test_valid_weights(
         self._labels, self._predictions, expected_loss=0.0,
@@ -1251,6 +1232,7 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
         self.assertAlmostEqual(loss0 + loss1, loss0_1, 5)
 
 
+@test_util.run_deprecated_v1
 class CosineDistanceLossTest(test.TestCase):
 
   def setUp(self):
@@ -1329,7 +1311,6 @@ class CosineDistanceLossTest(test.TestCase):
     with self.cached_session():
       self.assertEqual(3.0 / 4.0, self.evaluate(loss))
 
-  @test_util.run_deprecated_v1
   def testMeasurementSpecificWeightsWithPlaceholderWithShape(self):
     tf_predictions = array_ops.placeholder(
         dtypes.float32, shape=self._labels.shape)
diff --git a/tensorflow/python/kernel_tests/lu_op_test.py b/tensorflow/python/kernel_tests/lu_op_test.py
index 06deb0e1c82175c33b028e017a5f54cc2549253b..951af020fe7b90dc1708eda1cdb7c896a4a546ea 100644
--- a/tensorflow/python/kernel_tests/lu_op_test.py
+++ b/tensorflow/python/kernel_tests/lu_op_test.py
@@ -27,8 +27,8 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import map_fn
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables
@@ -104,8 +104,8 @@ class LuOpTest(test.TestCase):
         verification_reshaped = array_ops.reshape(verification,
                                                   [-1, num_rows, num_cols])
         # Invert the permutation in each batch.
-        inv_perm_reshaped = functional_ops.map_fn(array_ops.invert_permutation,
-                                                  perm_reshaped)
+        inv_perm_reshaped = map_fn.map_fn(array_ops.invert_permutation,
+                                          perm_reshaped)
         batch_size = perm_reshaped.shape.as_list()[0]
         # Prepare the batch indices with the same shape as the permutation.
         # The corresponding batch index is paired with each of the `num_rows`
diff --git a/tensorflow/python/kernel_tests/map_fn_test.py b/tensorflow/python/kernel_tests/map_fn_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2b1d433c780a520fbb5a0168053f6708e74b95a
--- /dev/null
+++ b/tensorflow/python/kernel_tests/map_fn_test.py
@@ -0,0 +1,223 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.kernels.functional_ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import map_fn
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+import tensorflow.python.ops.tensor_array_grad  # pylint: disable=unused-import
+from tensorflow.python.platform import test
+
+
+# pylint: disable=invalid-name
+def simple_scoped_fn(a, x):
+  """Simple function: (a, x) -> 2(x+a), but with "2" as a variable in scope."""
+  with variable_scope.variable_scope("body"):
+    # Dummy variable, just to check that scoping works as intended.
+    two = variable_scope.get_variable(
+        "two", [],
+        dtype=dtypes.int32,
+        initializer=init_ops.constant_initializer(2))
+    return math_ops.multiply(math_ops.add(a, x), two)
+
+
+@test_util.with_control_flow_v2
+class MapFnTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testMap_Simple(self):
+    nums = [1, 2, 3, 4, 5, 6]
+    elems = constant_op.constant(nums, name="data")
+    r = map_fn.map_fn(
+        lambda x: math_ops.multiply(math_ops.add(x, 3), 2), elems)
+    self.assertAllEqual(
+        np.array([(x + 3) * 2 for x in nums]), self.evaluate(r))
+
+  def testMapDtypeEager(self):
+    with context.eager_mode():
+      dtype = map_fn.map_fn(lambda x: constant_op.constant(""),
+                            constant_op.constant([]),
+                            dtype=dtypes.string).dtype
+      self.assertEqual(dtype, dtypes.string)
+
+  def testMapSparseTensor(self):
+    with self.cached_session():
+      with self.assertRaises(TypeError):
+        map_fn.map_fn(
+            lambda x: x,
+            sparse_tensor.SparseTensor(
+                indices=[[0, 0], [0, 1], [1, 0]],
+                values=constant_op.constant([0, 1, 2]),
+                dense_shape=[2, 2]))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testMapOverScalarErrors(self):
+    with self.assertRaisesRegexp(ValueError, "not scalars"):
+      map_fn.map_fn(lambda x: x, [1, 2])
+    with self.assertRaisesRegexp(ValueError, "not a scalar"):
+      map_fn.map_fn(lambda x: x, 1)
+
+  @test_util.run_deprecated_v1
+  def testMap_Scoped(self):
+    with self.cached_session() as sess:
+
+      def double_scoped(x):
+        """2x with a dummy 2 that is scoped."""
+        with variable_scope.variable_scope("body"):
+          # Dummy variable, just to check that scoping works as intended.
+          two = variable_scope.get_variable(
+              "two", [],
+              dtype=dtypes.int32,
+              initializer=init_ops.constant_initializer(2))
+          return math_ops.multiply(x, two)
+
+      with variable_scope.variable_scope("root") as varscope:
+        elems = constant_op.constant([1, 2, 3, 4, 5, 6], name="data")
+        doubles = np.array([2 * x for x in [1, 2, 3, 4, 5, 6]])
+
+        r = map_fn.map_fn(double_scoped, elems)
+        # Check that we have the one variable we asked for here.
+        self.assertEqual(len(variables.trainable_variables()), 1)
+        self.assertEqual(variables.trainable_variables()[0].name,
+                         "root/body/two:0")
+        sess.run([variables.global_variables_initializer()])
+        self.assertAllEqual(doubles, self.evaluate(r))
+
+        # Now let's reuse our single variable.
+        varscope.reuse_variables()
+        r = map_fn.map_fn(double_scoped, elems)
+        self.assertEqual(len(variables.trainable_variables()), 1)
+        self.assertAllEqual(doubles, self.evaluate(r))
+
+  @test_util.run_deprecated_v1
+  def testMap_Grad(self):
+    with self.cached_session():
+      param = constant_op.constant(2.0)
+      elems = constant_op.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="elems")
+      y = map_fn.map_fn(
+          lambda x: math_ops.multiply(math_ops.square(x), param), elems)
+      r = gradients_impl.gradients(y, param)[0]
+      self.assertAllEqual(91.0, self.evaluate(r))
+      r = gradients_impl.gradients(y, elems)[0]
+      self.assertAllEqual([4.0, 8.0, 12.0, 16.0, 20.0, 24.0], self.evaluate(r))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testMap_SimpleNotTensor(self):
+    nums = np.array([1, 2, 3, 4, 5, 6])
+    r = map_fn.map_fn(
+        lambda x: math_ops.multiply(math_ops.add(x, 3), 2), nums)
+    self.assertAllEqual(
+        np.array([(x + 3) * 2 for x in nums]), self.evaluate(r))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testMap_SingleInputMultiOutput(self):
+    nums = np.array([1, 2, 3, 4, 5, 6])
+    r = map_fn.map_fn(
+        lambda x: ((x + 3) * 2, -(x + 3) * 2),
+        nums,
+        dtype=(dtypes.int64, dtypes.int64))
+    self.assertEqual(2, len(r))
+    self.assertEqual((6,), r[0].get_shape())
+    self.assertEqual((6,), r[1].get_shape())
+    received = self.evaluate(r)
+    self.assertAllEqual((nums + 3) * 2, received[0])
+    self.assertAllEqual(-(nums + 3) * 2, received[1])
+
+  @test_util.run_in_graph_and_eager_modes
+  def testMap_MultiOutputMismatchedDtype(self):
+    nums = np.array([1, 2, 3, 4, 5, 6])
+    with self.assertRaisesRegexp(
+        TypeError, r"two structures don't have the same nested structure"):
+      # lambda emits tuple, but dtype is a list
+      map_fn.map_fn(
+          lambda x: ((x + 3) * 2, -(x + 3) * 2),
+          nums,
+          dtype=[dtypes.int64, dtypes.int64])
+
+  @test_util.run_in_graph_and_eager_modes
+  def testMap_MultiInputSingleOutput(self):
+    nums = np.array([1, 2, 3, 4, 5, 6])
+    r = map_fn.map_fn(
+        lambda x: x[0] * x[1][0] + x[1][1], (nums, (nums, -nums)),
+        dtype=dtypes.int64)
+    self.assertEqual((6,), r.get_shape())
+    received = self.evaluate(r)
+    self.assertAllEqual(nums * nums + (-nums), received)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testMap_MultiInputSameStructureOutput(self):
+    nums = np.array([1, 2, 3, 4, 5, 6])
+    r = map_fn.map_fn(lambda x: (x[1][0], (x[1][1], x[0])),
+                      (nums, (2 * nums, -nums)))
+    r = [r[0], r[1][0], r[1][1]]
+    self.assertEqual((6,), r[0].get_shape())
+    self.assertEqual((6,), r[1].get_shape())
+    self.assertEqual((6,), r[2].get_shape())
+    received = self.evaluate(r)
+    self.assertAllEqual(2 * nums, received[0])
+    self.assertAllEqual(-nums, received[1])
+    self.assertAllEqual(nums, received[2])
+
+  @test_util.run_in_graph_and_eager_modes
+  def testMapShape(self):
+    x = constant_op.constant([[1, 2, 3], [4, 5, 6]])
+    y = map_fn.map_fn(lambda e: e, x)
+    self.assertAllEqual(y.get_shape(), self.evaluate(y).shape)
+
+  @test_util.run_deprecated_v1
+  def testMapUnknownShape(self):
+    x = array_ops.placeholder(dtypes.float32)
+    y = map_fn.map_fn(lambda e: e, x)
+    self.assertIs(None, y.get_shape().dims)
+
+  # TODO(b/124383826): this test fails in eager: the iterable is of length 0 so
+  # so the body of the while loop never executes
+  @test_util.run_v1_only("b/120545219")
+  def testMapEmptyScalar(self):
+    map_return = map_fn.map_fn(lambda x: 1,
+                               constant_op.constant([], dtype=dtypes.int32))
+    self.assertAllEqual([0], map_return.get_shape().dims)
+    self.assertAllEqual([0], self.evaluate(map_return).shape)
+
+  # TODO(b/124383826): this test fails in eager: the iterable is of length 0 so
+  # so the body of the while loop never executes
+  @test_util.run_v1_only("b/120545219")
+  def testMapEmptyTensor(self):
+    with self.cached_session():
+      map_return = map_fn.map_fn(lambda x: array_ops.zeros([3, 2]),
+                                 constant_op.constant([]))
+      self.assertAllEqual([0, 3, 2], map_return.get_shape().dims)
+      self.assertAllEqual([0, 3, 2], self.evaluate(map_return).shape)
+
+
+if __name__ == "__main__":
+  test.main()
+
+# pylint: enable=invalid-name
diff --git a/tensorflow/python/kernel_tests/matrix_square_root_op_test.py b/tensorflow/python/kernel_tests/matrix_square_root_op_test.py
index 3edb390c724b6c71cd8849efc2b22a579e87247f..e09530b891504c652f60fa9558218df8135da778 100644
--- a/tensorflow/python/kernel_tests/matrix_square_root_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_square_root_op_test.py
@@ -114,7 +114,7 @@ class SquareRootOpTest(test.TestCase):
       sqrt2 = gen_linalg_ops.matrix_square_root(square2)
       all_ops = [sqrt1, sqrt2]
       sqrt = self.evaluate(all_ops)
-      self.assertAllEqual(sqrt[0], sqrt[1])
+      self.assertAllClose(sqrt[0], sqrt[1])
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/pad_op_test.py b/tensorflow/python/kernel_tests/pad_op_test.py
index 7b1b054ae0656ef8ae988c1a3220a2a643afbcab..6fb8a4b5d8678e54623d194ef97ae65f2e494b15 100644
--- a/tensorflow/python/kernel_tests/pad_op_test.py
+++ b/tensorflow/python/kernel_tests/pad_op_test.py
@@ -223,7 +223,7 @@ class PadOpTest(test.TestCase):
   def testIntTypes(self):
     # TODO(touts): Figure out why the padding tests do not work on GPU
     # for int types and rank > 2.
-    for t in [np.int8, np.int32, np.int64]:
+    for t in [np.int8, np.uint8, np.int32, np.int64]:
       self._testAll(
           np.random.randint(-100, 100, (4, 4, 3)).astype(t),
           [[1, 0], [2, 3], [0, 2]], 0)
diff --git a/tensorflow/python/kernel_tests/padding_fifo_queue_test.py b/tensorflow/python/kernel_tests/padding_fifo_queue_test.py
index e3999695d0605f49d1440c3305f020e4871940a3..214eaa0160efdb388f9fa79a63f60ee3e1904a8e 100644
--- a/tensorflow/python/kernel_tests/padding_fifo_queue_test.py
+++ b/tensorflow/python/kernel_tests/padding_fifo_queue_test.py
@@ -35,7 +35,7 @@ from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.platform import test
 
 
-@test_util.run_v1_only("b/120545219")
+@test_util.run_v1_only("PaddingFIFOQueue removed from v2")
 class PaddingFIFOQueueTest(test.TestCase):
 
   def testConstructor(self):
diff --git a/tensorflow/python/kernel_tests/partitioned_variables_test.py b/tensorflow/python/kernel_tests/partitioned_variables_test.py
index da79b4ecfc0a3972f610c1ed39cdd0201716bee4..edcbc2967e2fb14c8c2d3c6a3ae9b434876e02d5 100644
--- a/tensorflow/python/kernel_tests/partitioned_variables_test.py
+++ b/tensorflow/python/kernel_tests/partitioned_variables_test.py
@@ -323,26 +323,24 @@ class PartitionedVariablesTestCase(test.TestCase):
     for i in xrange(len(expected_specs)):
       self.assertEquals(expected_specs[i], slices[i]._save_slice_info.spec)
 
-  @test_util.run_deprecated_v1
   def testVecConstantInit(self):
     with self.cached_session():
       rnd_par = constant_op.constant([1, 2, 3, 4])
       vs = partitioned_variables.create_partitioned_variables([4], [4], rnd_par)
-      variables.global_variables_initializer().run()
-      val = array_ops.concat(vs, 0).eval()
+      self.evaluate(variables.global_variables_initializer())
+      val = array_ops.concat(vs, 0)
       rnd = self.evaluate(rnd_par)
       self.assertAllClose(rnd, val)
       self.assertEqual([dtypes.int32] * 4, [v.dtype.base_dtype for v in vs])
       self._TestSaveSpec(vs, ["4 0,1", "4 1,1", "4 2,1", "4 3,1"])
 
-  @test_util.run_deprecated_v1
   def testConstantInit(self):
     with self.cached_session():
       rnd_par = constant_op.constant([[1, 2, 3, 4], [5, 6, 7, 8]])
       vs = partitioned_variables.create_partitioned_variables([2, 4], [1, 2],
                                                               rnd_par)
-      variables.global_variables_initializer().run()
-      val = array_ops.concat(vs, 1).eval()
+      self.evaluate(variables.global_variables_initializer())
+      val = array_ops.concat(vs, 1)
       rnd = self.evaluate(rnd_par)
       self.assertAllClose(rnd, val)
       self.assertEqual([dtypes.int32] * 2, [v.dtype.base_dtype for v in vs])
@@ -356,7 +354,7 @@ class PartitionedVariablesTestCase(test.TestCase):
                                                                  rnd_par)
         vs2 = partitioned_variables.create_partitioned_variables([2, 4], [1, 2],
                                                                  rnd_par)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       var1_name = vs1[0]._save_slice_info.full_name
       var2_name = vs2[0]._save_slice_info.full_name
       self.assertEqual("hi/PartitionedVariable", var1_name)
@@ -376,7 +374,7 @@ class PartitionedVariablesTestCase(test.TestCase):
           vs, reuse=True, use_resource=use_resource):
         vs2 = partitioned_variables.create_partitioned_variables(
             [2, 4], [1, 2], rnd_par, dtype=dtypes.int32)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       var1_name = vs1[0]._save_slice_info.full_name
       var2_name = vs2[0]._save_slice_info.full_name
       self.assertEqual("hola/PartitionedVariable", var1_name)
@@ -393,7 +391,7 @@ class PartitionedVariablesTestCase(test.TestCase):
                                                                  rnd_par)
         vs2 = partitioned_variables.create_partitioned_variables([2, 4], [1, 2],
                                                                  rnd_par)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       var1_name = vs1[0]._save_slice_info.full_name
       var2_name = vs2[0]._save_slice_info.full_name
       # Currently, the name scope 'ola' has no effect.
@@ -408,18 +406,16 @@ class PartitionedVariablesTestCase(test.TestCase):
   def testName(self):
     self._testNameHelper(use_resource=False)
 
-  @test_util.run_deprecated_v1
   def testResourceName(self):
     self._testNameHelper(use_resource=True)
 
-  @test_util.run_v1_only("b/120545219")
   def testRandomInitValue(self):
     with self.cached_session():
       rnd = variables.Variable(random_ops.random_uniform([200, 40]))
       vs = partitioned_variables.create_partitioned_variables(
           rnd.get_shape(), [1, 10], rnd.initialized_value())
-      variables.global_variables_initializer().run()
-      val = array_ops.concat(vs, 1).eval()
+      self.evaluate(variables.global_variables_initializer())
+      val = array_ops.concat(vs, 1)
       rnd = self.evaluate(rnd)
       self.assertAllClose(rnd, val)
       self.assertEqual([dtypes.float32] * 10, [v.dtype.base_dtype for v in vs])
@@ -430,7 +426,6 @@ class PartitionedVariablesTestCase(test.TestCase):
           "200 40 0,200:36,4"
       ])
 
-  @test_util.run_v1_only("b/120545219")
   def testRandomInitUnevenPartitions(self):
     with self.cached_session():
       rnd = variables.Variable(
@@ -440,7 +435,7 @@ class PartitionedVariablesTestCase(test.TestCase):
               rnd.get_shape(), [1, i], rnd.initialized_value())
           for i in xrange(1, 10)
       ]
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       rnd_val = self.evaluate(rnd)
       # Only check the slice save specs for the first 5 tf.
       save_specs = [
@@ -462,33 +457,31 @@ class PartitionedVariablesTestCase(test.TestCase):
           ]
       ]
       for i, vs in enumerate(var_lists):
-        var_val = array_ops.concat(vs, 1).eval()
+        var_val = array_ops.concat(vs, 1)
         self.assertAllClose(rnd_val, var_val)
         self.assertEqual([dtypes.float64] * len(vs),
                          [v.dtype.base_dtype for v in vs])
         if i < len(save_specs):
           self._TestSaveSpec(vs, save_specs[i])
 
-  @test_util.run_v1_only("b/120545219")
   def testDegenerate(self):
     with self.cached_session():
       rnd = variables.Variable(random_ops.random_uniform([10, 43]))
       vs = partitioned_variables.create_partitioned_variables(
           rnd.get_shape(), [1, 1], rnd.initialized_value())
-      variables.global_variables_initializer().run()
-      val = array_ops.concat(vs, 0).eval()
+      self.evaluate(variables.global_variables_initializer())
+      val = array_ops.concat(vs, 0)
       rnd = self.evaluate(rnd)
       self.assertAllClose(rnd, val)
       self._TestSaveSpec(vs, ["10 43 0,10:0,43"])
 
-  @test_util.run_v1_only("b/120545219")
   def testSliceSizeOne(self):
     with self.cached_session():
       rnd = variables.Variable(random_ops.random_uniform([10, 43]))
       vs = partitioned_variables.create_partitioned_variables(
           rnd.get_shape(), [10, 1], rnd.initialized_value())
-      variables.global_variables_initializer().run()
-      val = array_ops.concat(vs, 0).eval()
+      self.evaluate(variables.global_variables_initializer())
+      val = array_ops.concat(vs, 0)
       rnd = self.evaluate(rnd)
       self.assertAllClose(rnd, val)
       self._TestSaveSpec(vs, [
@@ -497,7 +490,6 @@ class PartitionedVariablesTestCase(test.TestCase):
           "10 43 6,1:0,43", "10 43 7,1:0,43", "10 43 8,1:0,43", "10 43 9,1:0,43"
       ])
 
-  @test_util.run_deprecated_v1
   def testIotaInitializer(self):
     self.assertAllClose([0., 1., 2., 3.], _IotaInitializer([4]))
     self.assertAllClose([[0., 1.], [0., 10.], [0., 100.], [0., 1000.]],
@@ -505,11 +497,11 @@ class PartitionedVariablesTestCase(test.TestCase):
     with self.cached_session():
       vs = partitioned_variables.create_partitioned_variables([13, 5], [3, 1],
                                                               _IotaInitializer)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       slice0 = _IotaInitializer([5, 5])
       slice1 = _IotaInitializer([4, 5])
       slice2 = _IotaInitializer([4, 5])
-      val = array_ops.concat(vs, 0).eval()
+      val = array_ops.concat(vs, 0)
       self.assertAllClose(slice0 + slice1 + slice2, val)
       self._TestSaveSpec(vs, ["13 5 0,5:0,5", "13 5 5,4:0,5", "13 5 9,4:0,5"])
 
@@ -520,7 +512,7 @@ class PartitionedVariablesTestCase(test.TestCase):
     with self.cached_session():
       var0, var1 = partitioned_variables.create_partitioned_variables(
           [20, 12], [1, 2], init_ops.random_uniform_initializer())
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       val0, val1 = self.evaluate(var0).flatten(), self.evaluate(var1).flatten()
       self.assertTrue(np.linalg.norm(val0 - val1) > 1e-6)
     # Negative test that proves that slices have the same values if
@@ -528,7 +520,7 @@ class PartitionedVariablesTestCase(test.TestCase):
     with self.cached_session():
       var0, var1 = partitioned_variables.create_partitioned_variables(
           [20, 12], [1, 2], init_ops.random_uniform_initializer(seed=201))
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       val0, val1 = self.evaluate(var0).flatten(), self.evaluate(var1).flatten()
       self.assertAllClose(val0, val1)
 
@@ -607,8 +599,8 @@ class PartitionedVariablesTestCase(test.TestCase):
       self.assertTrue(
           c.op in concat_control_inputs,
           "var_x._concat() should get control dependencies from its scope.")
-      variables.global_variables_initializer().run()
-      self.assertAllClose(value.eval(), var_x.as_tensor().eval())
+      self.evaluate(variables.global_variables_initializer())
+      self.assertAllClose(value, var_x.as_tensor())
 
   def testMetaGraphSaveLoad(self):
     save_prefix = os.path.join(self.get_temp_dir(), "ckpt")
@@ -623,7 +615,7 @@ class PartitionedVariablesTestCase(test.TestCase):
         v0_part = v0._get_partitions()
         self.assertEqual(len(v0_list), 5)
         self.assertAllEqual(v0_part, (5, 1))
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
 
         save_graph.get_collection_ref("partvar").append(v0)
         saver = saver_lib.Saver()
diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py
index c33b59bb99b716b7164c82f6e640a8a3f4680351..0cd6495d812ddb9813473a8861463dadcff9782e 100644
--- a/tensorflow/python/kernel_tests/pooling_ops_test.py
+++ b/tensorflow/python/kernel_tests/pooling_ops_test.py
@@ -730,6 +730,7 @@ class PoolingTest(test.TestCase):
         t = nn_ops.max_pool(
             t, ksize=ksize, strides=strides, padding="SAME").eval()
 
+  @test_util.disable_xla("b/123338077")  # Passes with XLA
   def testDepthwiseMaxPoolInvalidConfigs(self):
     self._testDepthwiseMaxPoolInvalidConfig(
         [1, 2, 2, 4], [1, 2, 2, 2], [1, 1, 1, 2],
@@ -1351,6 +1352,7 @@ class PoolingTest(test.TestCase):
             use_gpu=use_gpu,
             v2=v2)
 
+  @test_util.disable_xla("b/123923733")  # NaNs handled differently
   def _testMaxPoolGradDirectWithNans2_1(self):
     input_data = [float("nan")] * 16
     output_backprop = [11.0, 12.0, 13.0, 15.0, 16.0, 17.0, 19.0, 20.0, 21.0]
@@ -1425,6 +1427,7 @@ class PoolingTest(test.TestCase):
     else:
       del os.environ["TF_ENABLE_MAXPOOL_NANPROP"]
 
+  @test_util.disable_xla("b/123923733")  # NaNs handled differently
   def _testMaxPoolGradDirectWithNans2_2(self):
     input_data = [float("nan")] * 16
     output_backprop = [
@@ -1818,6 +1821,7 @@ class PoolingTest(test.TestCase):
             padding="SAME")
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("b/123337890")  # Error messages differ
   def testOpEdgeCases(self):
     with self.session(use_gpu=test.is_gpu_available()) as sess:
       pool_funcs = [nn_ops.max_pool, nn_ops.avg_pool]
@@ -1893,9 +1897,18 @@ if __name__ == "__main__":
        padding_) in GetShrunkInceptionMaxPoolShapes():
     setattr(PoolingTest, "testMaxPoolFwd_" + name_,
             GetMaxPoolFwdTest(input_size_, filter_size_, stride_, padding_))
-    setattr(PoolingTest, "testMaxPoolGrad_" + name_,
-            GetMaxPoolGradTest(input_size_, filter_size_, output_size_, stride_,
-                               padding_))
+    if name_ == "maxpool5":
+      setattr(
+          PoolingTest, "testMaxPoolGrad_" + name_,
+          test_util.disable_xla(
+              "b/123926014: incorrect output with only constants")(
+                  GetMaxPoolGradTest(input_size_, filter_size_, output_size_,
+                                     stride_, padding_)))
+    else:
+      setattr(
+          PoolingTest, "testMaxPoolGrad_" + name_,
+          GetMaxPoolGradTest(input_size_, filter_size_, output_size_, stride_,
+                             padding_))
     setattr(PoolingTest, "testMaxPoolGradGrad_" + name_,
             GetMaxPoolGradGradTest(input_size_, filter_size_, output_size_,
                                    stride_, padding_))
diff --git a/tensorflow/python/kernel_tests/priority_queue_test.py b/tensorflow/python/kernel_tests/priority_queue_test.py
index 49ec7ee4836d40719971822aff9e063b7235dc8b..84f395dd3436f278442c56fbe77e9e6fd3de49e7 100644
--- a/tensorflow/python/kernel_tests/priority_queue_test.py
+++ b/tensorflow/python/kernel_tests/priority_queue_test.py
@@ -34,9 +34,9 @@ import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
 
 
+@test_util.run_v1_only("PriorityQueue removed from v2")
 class PriorityQueueTest(test.TestCase):
 
-  @test_util.run_v1_only("b/120545219")
   def testRoundTripInsertReadOnceSorts(self):
     with self.cached_session() as sess:
       q = data_flow_ops.PriorityQueue(2000, (dtypes.string, dtypes.string), (
@@ -114,7 +114,6 @@ class PriorityQueueTest(test.TestCase):
         missed.remove((dv0, dv1))
       self.assertEqual(missed, set())
 
-  @test_util.run_v1_only("b/120545219")
   def testRoundTripFillsCapacityMultiThreadedEnqueueAndDequeue(self):
     with self.cached_session() as sess:
       q = data_flow_ops.PriorityQueue(10, (dtypes.int64), (()))
@@ -270,7 +269,6 @@ class PriorityQueueTest(test.TestCase):
         missed.remove((dv0, dv1))
       self.assertEqual(missed, set())
 
-  @test_util.run_v1_only("b/120545219")
   def testRoundTripInsertOnceReadOnceSorts(self):
     with self.cached_session() as sess:
       q = data_flow_ops.PriorityQueue(2000, (dtypes.string, dtypes.string), (
@@ -292,7 +290,6 @@ class PriorityQueueTest(test.TestCase):
       for e, dv0, dv1 in zip(deq_elem, deq_value_0, deq_value_1):
         self.assertTrue((dv0, dv1) in allowed[e])
 
-  @test_util.run_v1_only("b/120545219")
   def testRoundTripInsertOnceReadManySorts(self):
     with self.cached_session():
       q = data_flow_ops.PriorityQueue(2000, (dtypes.int64), (()))
@@ -301,7 +298,6 @@ class PriorityQueueTest(test.TestCase):
       deq_values = np.hstack((q.dequeue_many(100)[0].eval() for _ in range(10)))
       self.assertAllEqual(deq_values, sorted(elem))
 
-  @test_util.run_v1_only("b/120545219")
   def testRoundTripInsertOnceReadOnceLotsSorts(self):
     with self.cached_session():
       q = data_flow_ops.PriorityQueue(2000, (dtypes.int64), (()))
@@ -317,7 +313,6 @@ class PriorityQueueTest(test.TestCase):
       with self.assertRaises(TypeError):
         q.enqueue_many((["a", "b", "c"], ["a", "b", "c"])).run()
 
-  @test_util.run_v1_only("b/120545219")
   def testInsertingNonScalarFails(self):
     with self.cached_session() as sess:
       input_priority = array_ops.placeholder(dtypes.int64)
diff --git a/tensorflow/python/kernel_tests/qr_op_test.py b/tensorflow/python/kernel_tests/qr_op_test.py
index 5adb95c7d60e88e43f6f171f6594c8542ef53143..f9b221a365821265dfccce63f2e018779a14eb5d 100644
--- a/tensorflow/python/kernel_tests/qr_op_test.py
+++ b/tensorflow/python/kernel_tests/qr_op_test.py
@@ -67,8 +67,8 @@ class QrOpTest(test.TestCase):
       val = self.evaluate(all_ops)
       for i in range(8):
         q = 4 * i
-        self.assertAllEqual(val[q], val[q + 2])  # q1 == q2
-        self.assertAllEqual(val[q + 1], val[q + 3])  # r1 == r2
+        self.assertAllClose(val[q], val[q + 2])  # q1 == q2
+        self.assertAllClose(val[q + 1], val[q + 3])  # r1 == r2
 
 
 def _GetQrOpTest(dtype_, shape_, full_matrices_, use_static_shape_):
diff --git a/tensorflow/python/kernel_tests/random/BUILD b/tensorflow/python/kernel_tests/random/BUILD
index dd81306db05aafac0d041320a193c7d92437a5fd..8452982a447ff5eaa1b4eaa11c5d6f8cbd6a7e8c 100644
--- a/tensorflow/python/kernel_tests/random/BUILD
+++ b/tensorflow/python/kernel_tests/random/BUILD
@@ -14,6 +14,14 @@ load("//tensorflow:tensorflow.bzl", "sycl_py_test")
 # Please avoid the py_tests and cuda_py_tests (plural) while we
 # fix the shared/overbroad dependencies.
 
+py_library(
+    name = "util",
+    srcs = ["util.py"],
+    deps = [
+        "//third_party/py/numpy",
+    ],
+)
+
 tf_py_test(
     name = "random_shuffle_queue_test",
     size = "small",
@@ -45,6 +53,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:random_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -64,6 +73,7 @@ cuda_py_test(
         "//tensorflow/python:random_ops",
     ],
     shard_count = 3,
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -75,6 +85,7 @@ cuda_py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:random_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -88,6 +99,7 @@ cuda_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:random_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -103,6 +115,7 @@ cuda_py_test(
         "//tensorflow/python:random_ops",
         "//tensorflow/python:stateless_random_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -110,6 +123,7 @@ cuda_py_test(
     size = "medium",
     srcs = ["random_gamma_test.py"],
     additional_deps = [
+        ":util",
         "//third_party/py/numpy",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -120,6 +134,7 @@ cuda_py_test(
     ],
     shard_count = 4,
     tags = ["nozapfhahn"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -137,6 +152,7 @@ cuda_py_test(
         "//tensorflow/python:random_grad",
         "//tensorflow/python:random_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_test(
@@ -144,6 +160,7 @@ cuda_py_test(
     size = "medium",
     srcs = ["random_poisson_test.py"],
     additional_deps = [
+        ":util",
         "//third_party/py/numpy",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -152,4 +169,5 @@ cuda_py_test(
         "//tensorflow/python:platform",
         "//tensorflow/python:random_ops",
     ],
+    xla_enable_strict_auto_jit = True,
 )
diff --git a/tensorflow/python/kernel_tests/random/random_gamma_test.py b/tensorflow/python/kernel_tests/random/random_gamma_test.py
index a5952a21968c79c8bfbcbfef2b09852f24f29923..5cc13f67777aef07ab40e8926effc3a2a0d6430b 100644
--- a/tensorflow/python/kernel_tests/random/random_gamma_test.py
+++ b/tensorflow/python/kernel_tests/random/random_gamma_test.py
@@ -18,8 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import math
-
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
@@ -27,6 +25,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import test_util
+from tensorflow.python.kernel_tests.random import util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
@@ -69,16 +68,6 @@ class RandomGammaTest(test.TestCase):
       tf_logging.warn("Cannot test moments: %s" % e)
       return
 
-    # Check the given array of samples matches the given theoretical moment
-    # function at different orders. The test is considered passing if the
-    # z-tests of all statistical moments are all below z_limit.
-    # Parameters:
-    #   max_moments: the largest moments of the distribution to be tested
-    #   stride: the distance between samples to check for statistical properties
-    #       0 means the n-th moment of each sample
-    #       any other strides tests for spatial correlation between samples;
-    #   z_limit: the maximum z-test we would consider the test to pass;
-
     # The moments test is a z-value test.  This is the largest z-value
     # we want to tolerate. Since the z-test approximates a unit normal
     # distribution, it should almost definitely never exceed 6.
@@ -94,46 +83,13 @@ class RandomGammaTest(test.TestCase):
           max_moment = min(6, scale // 2)
           sampler = self._Sampler(
               20000, alpha, 1 / scale, dt, use_gpu=False, seed=12345)
-          moments = [0] * (max_moment + 1)
-          moments_sample_count = [0] * (max_moment + 1)
-          x = np.array(sampler().flat)  # sampler does 10x samples
-          for k in range(len(x)):
-            moment = 1.
-            for i in range(max_moment + 1):
-              index = k + i * stride
-              if index >= len(x):
-                break
-              moments[i] += moment
-              moments_sample_count[i] += 1
-              moment *= x[index]
-          for i in range(max_moment + 1):
-            moments[i] /= moments_sample_count[i]
-          for i in range(1, max_moment + 1):
-            g = stats.gamma(alpha, scale=scale)
-            if stride == 0:
-              moments_i_mean = g.moment(i)
-              moments_i_squared = g.moment(2 * i)
-            else:
-              moments_i_mean = pow(g.moment(1), i)
-              moments_i_squared = pow(g.moment(2), i)
-            # Calculate moment variance safely:
-            # This is just
-            #  (moments_i_squared - moments_i_mean**2) / moments_sample_count[i]
-            normalized_moments_i_var = (
-                moments_i_mean / moments_sample_count[i] *
-                (moments_i_squared / moments_i_mean - moments_i_mean))
-            # Assume every operation has a small numerical error.
-            # It takes i multiplications to calculate one i-th moment.
-            error_per_moment = i * np.finfo(dt.as_numpy_dtype).eps
-            total_variance = (normalized_moments_i_var + error_per_moment)
-            tiny = np.finfo(dt.as_numpy_dtype).tiny
-            self.assertGreaterEqual(total_variance, 0)
-            if total_variance < tiny:
-              total_variance = tiny
-            # z_test is approximately a unit normal distribution.
-            z_test = abs(
-                (moments[i] - moments_i_mean) / math.sqrt(total_variance))
-            self.assertLess(z_test, z_limit)
+          z_scores = util.test_moment_matching(
+              sampler(),
+              max_moment,
+              stats.gamma(alpha, scale=scale),
+              stride=stride,
+          )
+          self.assertAllLess(z_scores, z_limit)
 
   def _testZeroDensity(self, alpha):
     """Zero isn't in the support of the gamma distribution.
diff --git a/tensorflow/python/kernel_tests/random/random_grad_test.py b/tensorflow/python/kernel_tests/random/random_grad_test.py
index aac6eeac06abca3148947901b92b43058fe76e3c..38fa44f37152bbc1cb720594d171142ec7af9007 100644
--- a/tensorflow/python/kernel_tests/random/random_grad_test.py
+++ b/tensorflow/python/kernel_tests/random/random_grad_test.py
@@ -79,7 +79,7 @@ class RandomGammaGradTest(test.TestCase):
     shape = [2, 3]
     alpha = array_ops.ones([2, 2])
     beta = array_ops.ones([1, 2])
-    sample = random_ops.random_gamma(shape, alpha, beta)
+    sample = random_ops.random_gamma(shape, alpha, beta, seed=12345)
     grads_alpha, grads_beta = gradients_impl.gradients(sample, [alpha, beta])
     self.assertAllEqual(grads_alpha.shape, alpha.shape)
     self.assertAllEqual(grads_beta.shape, beta.shape)
@@ -89,7 +89,7 @@ class RandomGammaGradTest(test.TestCase):
     shape = []
     alpha = array_ops.ones([2, 2])
     beta = array_ops.ones([1, 2])
-    sample = random_ops.random_gamma(shape, alpha, beta)
+    sample = random_ops.random_gamma(shape, alpha, beta, seed=12345)
     grads_alpha, grads_beta = gradients_impl.gradients(sample, [alpha, beta])
     self.assertAllEqual(grads_alpha.shape, alpha.shape)
     self.assertAllEqual(grads_beta.shape, beta.shape)
@@ -99,7 +99,7 @@ class RandomGammaGradTest(test.TestCase):
     shape = array_ops.placeholder(dtypes.int32)
     alpha = array_ops.placeholder(dtypes.float32)
     beta = array_ops.placeholder(dtypes.float32)
-    sample = random_ops.random_gamma(shape, alpha, beta)
+    sample = random_ops.random_gamma(shape, alpha, beta, seed=12345)
     grads_alpha, grads_beta = gradients_impl.gradients(sample, [alpha, beta])
 
     alpha_val = np.ones([1, 2])
@@ -129,7 +129,8 @@ class RandomGammaGradTest(test.TestCase):
 
       alpha_val = np.logspace(-2, 3, dtype=np_dtype)
       alpha = constant_op.constant(alpha_val)
-      sample = random_ops.random_gamma([], alpha, np_dtype(1.0), dtype=dtype)
+      sample = random_ops.random_gamma(
+          [], alpha, np_dtype(1.0), dtype=dtype, seed=12345)
       actual = gradients_impl.gradients(sample, alpha)[0]
 
       (sample_val, actual_val) = self.evaluate((sample, actual))
@@ -175,7 +176,8 @@ class RandomGammaGradTest(test.TestCase):
     """
     np_dtype = dtype.as_numpy_dtype
     alpha = constant_op.constant(np.logspace(-2, 3, dtype=np_dtype))
-    sample = random_ops.random_gamma([], alpha, np_dtype(1.0), dtype=dtype)
+    sample = random_ops.random_gamma(
+        [], alpha, np_dtype(1.0), dtype=dtype, seed=12345)
     actual = gradients_impl.gradients(sample, alpha)[0]
 
     sample_sg = array_ops.stop_gradient(sample)
@@ -207,9 +209,9 @@ class RandomGammaGradTest(test.TestCase):
     Here we verify that the rhs is fairly close to one.
     The convergence speed is not great, so we use many samples and loose bounds.
     """
-    num_samples = 1000
+    num_samples = 10000
     alpha = constant_op.constant([0.8, 1e1, 1e3], dtype=dtypes.float32)
-    sample = random_ops.random_gamma([num_samples], alpha)
+    sample = random_ops.random_gamma([num_samples], alpha, seed=12345)
     # We need to average the gradients, which is equivalent to averaging the
     # samples and then doing backprop.
     mean_sample = math_ops.reduce_mean(sample, axis=0)
@@ -234,13 +236,13 @@ class RandomGammaGradTest(test.TestCase):
     We compare the Monte-Carlo estimate of the expectation with the
     true gradient.
     """
-    num_samples = 1000
+    num_samples = 10000
     t = 0.3
     alpha = 0.5
     expected = 1 + 2 * alpha - 2 * t
 
     alpha = constant_op.constant(alpha)
-    sample = random_ops.random_gamma([num_samples], alpha, 1.0)
+    sample = random_ops.random_gamma([num_samples], alpha, 1.0, seed=12345)
     loss = math_ops.reduce_mean(math_ops.square(sample - t))
     dloss_dalpha = gradients_impl.gradients(loss, alpha)[0]
     dloss_dalpha_val = self.evaluate(dloss_dalpha)
diff --git a/tensorflow/python/kernel_tests/random/random_ops_test.py b/tensorflow/python/kernel_tests/random/random_ops_test.py
index 1384c3f446f97a76792a27cfc7f679e80402cbf0..68672a04bbdc48e066d90ceb5ff94ea705f75fd9 100644
--- a/tensorflow/python/kernel_tests/random/random_ops_test.py
+++ b/tensorflow/python/kernel_tests/random/random_ops_test.py
@@ -257,6 +257,7 @@ class TruncatedNormalTest(test.TestCase):
       self.assertAllEqual(rnd1, rnd2)
 
 
+@test_util.disable_all_xla("This never passed on XLA")
 class RandomUniformTest(RandomOpTestCommon):
 
   def _Sampler(self, num, minv, maxv, dtype, use_gpu, seed=None):
diff --git a/tensorflow/python/kernel_tests/random/random_poisson_test.py b/tensorflow/python/kernel_tests/random/random_poisson_test.py
index 0a6b004d682e5d810a5a3e09ca6dce867e5f41f1..51dd4cb47ca8561dfd01e20031651047fb2b70b9 100644
--- a/tensorflow/python/kernel_tests/random/random_poisson_test.py
+++ b/tensorflow/python/kernel_tests/random/random_poisson_test.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
+from tensorflow.python.kernel_tests.random import util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
@@ -49,14 +50,13 @@ class RandomPoissonTest(test.TestCase):
 
     return func
 
-  # TODO(srvasude): Factor this out along with the corresponding moment testing
-  # method in random_gamma_test into a single library.
   def testMoments(self):
     try:
       from scipy import stats  # pylint: disable=g-import-not-at-top
     except ImportError as e:
       tf_logging.warn("Cannot test moments: %s", e)
       return
+
     # The moments test is a z-value test.  This is the largest z-value
     # we want to tolerate. Since the z-test approximates a unit normal
     # distribution, it should almost definitely never exceed 6.
@@ -67,41 +67,13 @@ class RandomPoissonTest(test.TestCase):
         for lam in (3., 20):
           max_moment = 5
           sampler = self._Sampler(10000, lam, dt, use_gpu=False, seed=12345)
-          moments = [0] * (max_moment + 1)
-          moments_sample_count = [0] * (max_moment + 1)
-          x = np.array(sampler().flat)  # sampler does 10x samples
-          for k in range(len(x)):
-            moment = 1.
-            for i in range(max_moment + 1):
-              index = k + i * stride
-              if index >= len(x):
-                break
-              moments[i] += moment
-              moments_sample_count[i] += 1
-              moment *= x[index]
-          for i in range(max_moment + 1):
-            moments[i] /= moments_sample_count[i]
-          for i in range(1, max_moment + 1):
-            g = stats.poisson(lam)
-            if stride == 0:
-              moments_i_mean = g.moment(i)
-              moments_i_squared = g.moment(2 * i)
-            else:
-              moments_i_mean = pow(g.moment(1), i)
-              moments_i_squared = pow(g.moment(2), i)
-            moments_i_var = (
-                moments_i_squared - moments_i_mean * moments_i_mean)
-            # Assume every operation has a small numerical error.
-            # It takes i multiplications to calculate one i-th moment.
-            error_per_moment = i * 1e-6
-            total_variance = (
-                moments_i_var / moments_sample_count[i] + error_per_moment)
-            if not total_variance:
-              total_variance = 1e-10
-            # z_test is approximately a unit normal distribution.
-            z_test = abs(
-                (moments[i] - moments_i_mean) / np.sqrt(total_variance))
-            self.assertLess(z_test, z_limit)
+          z_scores = util.test_moment_matching(
+              sampler(),
+              max_moment,
+              stats.poisson(lam),
+              stride=stride,
+          )
+          self.assertAllLess(z_scores, z_limit)
 
   # Checks that the CPU and GPU implementation returns the same results,
   # given the same random seed
diff --git a/tensorflow/python/kernel_tests/random/random_shuffle_queue_test.py b/tensorflow/python/kernel_tests/random/random_shuffle_queue_test.py
index dd814a22b4e59261b33e1a57fd9014147792858b..4a8144fadb4dde86ab716c21d1366de0a498d461 100644
--- a/tensorflow/python/kernel_tests/random/random_shuffle_queue_test.py
+++ b/tensorflow/python/kernel_tests/random/random_shuffle_queue_test.py
@@ -35,7 +35,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging
 
 
-@test_util.run_v1_only("b/120545219")
+@test_util.run_v1_only("RandomShuffleQueue removed from v2")
 class RandomShuffleQueueTest(test.TestCase):
 
   def setUp(self):
@@ -1417,7 +1417,6 @@ class RandomShuffleQueueTest(test.TestCase):
 
       self.assertItemsEqual(elem, results)
 
-  @test_util.run_v1_only("b/120545219")
   def testBigDequeueMany(self):
     with self.cached_session() as sess:
       q = data_flow_ops.RandomShuffleQueue(2, 0, dtypes_lib.int32, ((),))
diff --git a/tensorflow/python/kernel_tests/random/util.py b/tensorflow/python/kernel_tests/random/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..67805c7f262480e18fd296e15fc4a436e70c0c58
--- /dev/null
+++ b/tensorflow/python/kernel_tests/random/util.py
@@ -0,0 +1,72 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for testing random variables."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+
+def test_moment_matching(
+    samples,
+    number_moments,
+    dist,
+    stride=0):
+  """Return z-test scores for sample moments to match analytic moments.
+
+  Given `samples`, check that the first sample `number_moments` match
+  the given  `dist` moments by doing a z-test.
+
+  Args:
+    samples: Samples from target distribution.
+    number_moments: Python `int` describing how many sample moments to check.
+    dist: SciPy distribution object that provides analytic moments.
+    stride: Distance between samples to check for statistical properties.
+      A stride of 0 means to use all samples, while other strides test for
+      spatial correlation.
+  Returns:
+    Array of z_test scores.
+  """
+
+  sample_moments = []
+  expected_moments = []
+  variance_sample_moments = []
+  x = samples.flat
+  for i in range(1, number_moments + 1):
+    strided_range = x[::(i - 1) * stride + 1]
+    sample_moments.append(np.mean(strided_range ** i))
+    expected_moments.append(dist.moment(i))
+    variance_sample_moments.append(
+        (dist.moment(2 * i) - dist.moment(i) ** 2) / len(strided_range))
+
+  z_test_scores = []
+  for i in range(1, number_moments + 1):
+    # Assume every operation has a small numerical error.
+    # It takes i multiplications to calculate one i-th moment.
+    total_variance = (
+        variance_sample_moments[i - 1] +
+        i * np.finfo(samples.dtype).eps)
+    tiny = np.finfo(samples.dtype).tiny
+    assert np.all(total_variance > 0)
+    if total_variance < tiny:
+      total_variance = tiny
+    # z_test is approximately a unit normal distribution.
+    z_test_scores.append(abs(
+        (sample_moments[i - 1] - expected_moments[i - 1]) / np.sqrt(
+            total_variance)))
+  return z_test_scores
+
diff --git a/tensorflow/python/kernel_tests/reduction_ops_test.py b/tensorflow/python/kernel_tests/reduction_ops_test.py
index 67a89461f3a885056f47c62af40bf6cfccd60583..5ab8bc3a0089742cfad891e772bd3a4ee447a55e 100644
--- a/tensorflow/python/kernel_tests/reduction_ops_test.py
+++ b/tensorflow/python/kernel_tests/reduction_ops_test.py
@@ -104,7 +104,8 @@ class ReductionUnknownShape(test.TestCase):
       for dtype, reductions in [(dtypes.float32,
                                  (math_ops.reduce_sum, math_ops.reduce_mean,
                                   math_ops.reduce_prod, math_ops.reduce_max,
-                                  math_ops.reduce_min)),
+                                  math_ops.reduce_min,
+                                  math_ops.reduce_euclidean_norm)),
                                 (dtypes.bool, (math_ops.reduce_all,
                                                math_ops.reduce_any))]:
         for reduction in reductions:
@@ -487,6 +488,79 @@ class MeanReductionTest(BaseReductionTest):
         self.assertTrue(np.all(np.isnan(y)))
 
 
+class EuclideanNormReductionTest(BaseReductionTest):
+
+  def _tf_reduce(self, x, reduction_axes, keepdims):
+    return math_ops.reduce_euclidean_norm(x, reduction_axes, keepdims)
+
+  def _np_reduce(self, x, reduction_axes, keepdims):
+    if isinstance(reduction_axes, list) or isinstance(reduction_axes,
+                                                      np.ndarray):
+      reduction_axes = tuple(reduction_axes)
+    if reduction_axes is None or reduction_axes != tuple():
+      np_fro = np.sqrt(
+          np.sum(x * np.conj(x), axis=reduction_axes, keepdims=keepdims))
+    else:
+      np_fro = x
+    if np.issubdtype(x.dtype, np.integer):
+      np_fro = np.floor(np_fro)
+    return np_fro
+
+  @test_util.run_deprecated_v1
+  def testAxesType(self):
+    for dtype in [dtypes.int64, dtypes.int32]:
+      with self.cached_session(use_gpu=True):
+        v = math_ops.reduce_mean([0, 0], constant_op.constant(0, dtype=dtype))
+        tf_v = self.evaluate(v)
+      self.assertAllEqual(tf_v, 0)
+
+  @test_util.run_deprecated_v1
+  def testInfinity(self):
+    for dtype in [np.float32, np.float64]:
+      for special_value_x in [-np.inf, np.inf]:
+        for special_value_y in [-np.inf, np.inf]:
+          np_arr = np.array([special_value_x, special_value_y]).astype(dtype)
+          self._compareAll(np_arr, None)
+
+  @test_util.run_deprecated_v1
+  def testInt32(self):
+    for rank in range(1, _MAX_RANK + 1):
+      np_arr = self._makeIncremental((2,) * rank, dtypes.int32)
+      self._compareAllAxes(np_arr)
+
+  @test_util.run_deprecated_v1
+  def testFloat32(self):
+    for rank in range(1, _MAX_RANK + 1):
+      np_arr = self._makeIncremental((2,) * rank, dtypes.float32)
+      self._compareAllAxes(np_arr)
+
+  @test_util.run_deprecated_v1
+  def testFloat64(self):
+    for rank in range(1, _MAX_RANK + 1):
+      np_arr = self._makeIncremental((2,) * rank, dtypes.float64)
+      self._compareAllAxes(np_arr)
+
+  @test_util.run_deprecated_v1
+  def testComplex64(self):
+    for rank in range(1, _MAX_RANK + 1):
+      np_arr = self._makeIncremental((2,) * rank, dtypes.complex64)
+      self._compareAllAxes(np_arr)
+
+  @test_util.run_deprecated_v1
+  def testComplex128(self):
+    for rank in range(1, _MAX_RANK + 1):
+      np_arr = self._makeIncremental((2,) * rank, dtypes.complex128)
+      self._compareAllAxes(np_arr)
+
+    with self.session(use_gpu=True):
+      for dtype in (dtypes.float16, dtypes.float32, dtypes.float64):
+        # A large number is needed to get Eigen to die
+        x = array_ops.zeros((0, 9938), dtype=dtype)
+        y = math_ops.reduce_euclidean_norm(x, [0]).eval()
+        self.assertEqual(y.shape, (9938,))
+        self.assertAllEqual(y, np.zeros(9938))
+
+
 class ProdReductionTest(BaseReductionTest):
 
   def _tf_reduce(self, x, reduction_axes, keepdims):
diff --git a/tensorflow/python/kernel_tests/reduction_ops_test_big.py b/tensorflow/python/kernel_tests/reduction_ops_test_big.py
index 1e8524f72a9760af90695b3b24c6dda3e9ba8c4a..2d5cff383e46c3aac83eab6b830859a7614fd803 100644
--- a/tensorflow/python/kernel_tests/reduction_ops_test_big.py
+++ b/tensorflow/python/kernel_tests/reduction_ops_test_big.py
@@ -21,6 +21,8 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
@@ -46,6 +48,7 @@ class BigReductionTest(BaseReductionTest):
   def _tf_reduce_sum(self, x, reduction_axes, keepdims):
     return math_ops.reduce_sum(x, reduction_axes, keepdims)
 
+  @test_util.run_deprecated_v1
   def testFloat32Sum(self):
     # make sure we test all possible kernel invocations
     # logic is the same for all ops, test just float32 for brevity
@@ -64,11 +67,13 @@ class BigReductionTest(BaseReductionTest):
         full_sum = np.ones([], dtype=np.float32) * size_x * size_y
 
         with self.session(graph=ops.Graph(), use_gpu=True) as sess:
-          tf_row_sum = self._tf_reduce_sum(arr, 1, False)
-          tf_col_sum = self._tf_reduce_sum(arr, 0, False)
-          tf_full_sum = self._tf_reduce_sum(arr, [0, 1], False)
+          arr_placeholder = array_ops.placeholder(dtype=np.float32,
+                                                  shape=(size_x, size_y))
+          tf_row_sum = self._tf_reduce_sum(arr_placeholder, 1, False)
+          tf_col_sum = self._tf_reduce_sum(arr_placeholder, 0, False)
+          tf_full_sum = self._tf_reduce_sum(arr_placeholder, [0, 1], False)
           tf_out_row, tf_out_col, tf_out_full = sess.run(
-              [tf_row_sum, tf_col_sum, tf_full_sum])
+              [tf_row_sum, tf_col_sum, tf_full_sum], {arr_placeholder: arr})
         self.assertAllClose(col_sum, tf_out_col)
         self.assertAllClose(row_sum, tf_out_row)
         self.assertAllClose(full_sum, tf_out_full)
@@ -82,12 +87,16 @@ class BigReductionTest(BaseReductionTest):
           sum_xz = np.ones([size_y], dtype=np.float32)
 
           with self.session(graph=ops.Graph(), use_gpu=True) as sess:
-            tf_sum_xz = self._tf_reduce_mean(arr, [0, 2], False)
-            tf_sum_y = self._tf_reduce_mean(arr, 1, False)
-            tf_out_sum_xz, tf_out_sum_y = sess.run([tf_sum_xz, tf_sum_y])
+            arr_placeholder = array_ops.placeholder(
+                dtype=np.float32, shape=(size_x, size_y, size_z))
+            tf_sum_xz = self._tf_reduce_mean(arr_placeholder, [0, 2], False)
+            tf_sum_y = self._tf_reduce_mean(arr_placeholder, 1, False)
+            tf_out_sum_xz, tf_out_sum_y = sess.run([tf_sum_xz, tf_sum_y],
+                                                   {arr_placeholder: arr})
           self.assertAllClose(sum_y, tf_out_sum_y)
           self.assertAllClose(sum_xz, tf_out_sum_xz)
 
+  @test_util.run_deprecated_v1
   def testFloat32Max(self):
     # make sure we test all possible kernel invocations
     # logic is the same for all ops, test just float32 for brevity
@@ -107,11 +116,13 @@ class BigReductionTest(BaseReductionTest):
         full_max = np.max(col_max)
 
         with self.session(graph=ops.Graph(), use_gpu=True) as sess:
-          tf_row_max = self._tf_reduce_max(arr, 1, False)
-          tf_col_max = self._tf_reduce_max(arr, 0, False)
-          tf_full_max = self._tf_reduce_max(arr, [0, 1], False)
+          arr_placeholder = array_ops.placeholder(dtype=np.float32,
+                                                  shape=(size_x, size_y))
+          tf_row_max = self._tf_reduce_max(arr_placeholder, 1, False)
+          tf_col_max = self._tf_reduce_max(arr_placeholder, 0, False)
+          tf_full_max = self._tf_reduce_max(arr_placeholder, [0, 1], False)
           tf_out_row, tf_out_col, tf_out_full = sess.run(
-              [tf_row_max, tf_col_max, tf_full_max])
+              [tf_row_max, tf_col_max, tf_full_max], {arr_placeholder: arr})
         self.assertAllClose(col_max, tf_out_col)
         self.assertAllClose(row_max, tf_out_row)
         self.assertAllClose(full_max, tf_out_full)
@@ -126,12 +137,16 @@ class BigReductionTest(BaseReductionTest):
           sum_xz = np.max(arr, axis=(0, 2))
 
           with self.session(graph=ops.Graph(), use_gpu=True) as sess:
-            tf_sum_xz = self._tf_reduce_max(arr, [0, 2], False)
-            tf_sum_y = self._tf_reduce_max(arr, 1, False)
-            tf_out_sum_xz, tf_out_sum_y = sess.run([tf_sum_xz, tf_sum_y])
+            arr_placeholder = array_ops.placeholder(
+                dtype=np.float32, shape=(size_x, size_y, size_z))
+            tf_sum_xz = self._tf_reduce_max(arr_placeholder, [0, 2], False)
+            tf_sum_y = self._tf_reduce_max(arr_placeholder, 1, False)
+            tf_out_sum_xz, tf_out_sum_y = sess.run(
+                [tf_sum_xz, tf_sum_y], {arr_placeholder: arr})
           self.assertAllClose(sum_y, tf_out_sum_y)
           self.assertAllClose(sum_xz, tf_out_sum_xz)
 
+  @test_util.run_deprecated_v1
   def testBooleanAll(self):
     # make sure we test all possible kernel invocations
     # test operation where T(0) is not the identity
@@ -150,11 +165,13 @@ class BigReductionTest(BaseReductionTest):
         full_sum = np.ones([1], dtype=np.bool).reshape([])
 
         with self.session(graph=ops.Graph(), use_gpu=True) as sess:
-          tf_row_sum = self._tf_reduce_all(arr, 1, False)
-          tf_col_sum = self._tf_reduce_all(arr, 0, False)
-          tf_full_sum = self._tf_reduce_all(arr, [0, 1], False)
+          arr_placeholder = array_ops.placeholder(dtype=np.bool,
+                                                  shape=(size_x, size_y))
+          tf_row_sum = self._tf_reduce_all(arr_placeholder, 1, False)
+          tf_col_sum = self._tf_reduce_all(arr_placeholder, 0, False)
+          tf_full_sum = self._tf_reduce_all(arr_placeholder, [0, 1], False)
           tf_out_row, tf_out_col, tf_out_full = sess.run(
-              [tf_row_sum, tf_col_sum, tf_full_sum])
+              [tf_row_sum, tf_col_sum, tf_full_sum], {arr_placeholder: arr})
         self.assertAllClose(col_sum, tf_out_col)
         self.assertAllClose(row_sum, tf_out_row)
         self.assertAllClose(full_sum, tf_out_full)
@@ -168,9 +185,12 @@ class BigReductionTest(BaseReductionTest):
           sum_xz = np.ones([size_y], dtype=np.bool)
 
           with self.session(graph=ops.Graph(), use_gpu=True) as sess:
-            tf_sum_xz = self._tf_reduce_all(arr, [0, 2], False)
-            tf_sum_y = self._tf_reduce_all(arr, 1, False)
-            tf_out_sum_xz, tf_out_sum_y = sess.run([tf_sum_xz, tf_sum_y])
+            arr_placeholder = array_ops.placeholder(
+                dtype=np.bool, shape=(size_x, size_y, size_z))
+            tf_sum_xz = self._tf_reduce_all(arr_placeholder, [0, 2], False)
+            tf_sum_y = self._tf_reduce_all(arr_placeholder, 1, False)
+            tf_out_sum_xz, tf_out_sum_y = sess.run(
+                [tf_sum_xz, tf_sum_y], {arr_placeholder: arr})
           self.assertAllClose(sum_y, tf_out_sum_y)
           self.assertAllClose(sum_xz, tf_out_sum_xz)
 
diff --git a/tensorflow/python/kernel_tests/relu_op_test.py b/tensorflow/python/kernel_tests/relu_op_test.py
index d4ba1ad77d5547ccb9fe4e2154d145751cf63514..ca02aa6e4e65919f26d330f623ba0e240cfc7f92 100644
--- a/tensorflow/python/kernel_tests/relu_op_test.py
+++ b/tensorflow/python/kernel_tests/relu_op_test.py
@@ -86,6 +86,7 @@ class ReluTest(test.TestCase):
     self.assertAllClose(np_relu, tf_relu)
     self.assertShapeEqual(np_relu, tf_relu)
 
+  @test_util.disable_xla("b/123338077")  # Passes with XLA
   def testReluInt8x4BadShape(self):
     if not test.is_gpu_available(cuda_only=True):
       self.skipTest("No GPU available")
diff --git a/tensorflow/python/kernel_tests/resource_variable_ops_test.py b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
index 433957fd1d38890c0952c443097e4955e1eb99cb..163d5a316ce088015ac00e9fc582a5b71865b3c0 100644
--- a/tensorflow/python/kernel_tests/resource_variable_ops_test.py
+++ b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
@@ -24,11 +24,15 @@ import pickle
 
 import numpy as np
 
+from tensorflow.core.framework import tensor_pb2
 from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import cpp_shape_inference_pb2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
@@ -36,6 +40,7 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import custom_gradient
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import list_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
@@ -285,12 +290,19 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       tmp_dir = self.get_temp_dir()
       fname = os.path.join(tmp_dir, "var.pickle")
       with open(fname, "wb") as f:
-        v = resource_variable_ops.ResourceVariable(10.0)
+        v = resource_variable_ops.ResourceVariable(
+            10.0,
+            dtype=dtypes.float16,
+            name="v")
         pickle.dump(v, f)
 
       with open(fname, "rb") as f:
-        v = pickle.load(f)
-        self.assertAllEqual(v.numpy(), 10.0)
+        new_v = pickle.load(f)
+        self.assertEqual(new_v.name, v.name)
+        self.assertEqual(new_v.shape, v.shape)
+        self.assertEqual(new_v.dtype, v.dtype)
+        self.assertEqual(new_v.trainable, v.trainable)
+        self.assertAllEqual(new_v.numpy(), v.numpy())
 
   @test_util.run_in_graph_and_eager_modes
   def testScatterDiv(self):
@@ -628,7 +640,6 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
           variable_def=other_v_def)
       self.assertTrue(other_v_prime._cached_value is not None)
 
-  @test_util.run_v1_only("b/120545219")
   def testVariableDefInitializedInstances(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
       v_def = resource_variable_ops.ResourceVariable(
@@ -688,7 +699,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
   def testToFromProto(self):
     with self.cached_session():
       v = resource_variable_ops.ResourceVariable(1.0)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       w = resource_variable_ops.ResourceVariable.from_proto(v.to_proto())
       self.assertEquals(2, math_ops.add(w, 1).eval())
@@ -792,11 +803,11 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       with self.assertRaises(ValueError):
         _ = w.value().op.get_attr("_class")
 
-  @test_util.run_v1_only("b/120545219")
+  @test_util.run_deprecated_v1
   def testSharedName(self):
     with self.cached_session():
       v = resource_variable_ops.ResourceVariable(300.0, name="var4")
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       w = resource_variable_ops.var_handle_op(
           dtype=v.dtype.base_dtype, shape=v.get_shape(), shared_name="var4",
@@ -953,16 +964,46 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       state_ops.scatter_sub(v, [1], [3])
       self.assertAllEqual([1.0, -1.0], v.numpy())
 
+  def testScatterUpdateVariant(self):
+    with context.eager_mode():
+      v = resource_variable_ops.ResourceVariable([
+          list_ops.empty_tensor_list(
+              element_dtype=dtypes.float32, element_shape=[])
+      ])
+      v.scatter_update(
+          ops.IndexedSlices(
+              list_ops.tensor_list_from_tensor([1., 2.], element_shape=[]), 0))
+      self.assertAllEqual(
+          list_ops.tensor_list_get_item(v[0], 0, element_dtype=dtypes.float32),
+          1.)
+
+  def testGroupDoesntForceRead(self):
+    with ops.Graph().as_default():
+      v = resource_variable_ops.ResourceVariable(1.0)
+      assign = v.assign_add(1.0)
+      g = control_flow_ops.group([assign])
+      self.assertEqual(g.control_inputs[0].type, "AssignAddVariableOp")
+
   def testScatterNdAddStateOps(self):
     with context.eager_mode():
       v = resource_variable_ops.ResourceVariable(
-          [1, 1, 1, 1, 1, 1, 1, 1], dtype=dtypes.float32, name="add")
+          [1, 2, 3, 4, 5, 6, 7, 8], dtype=dtypes.float32, name="add")
       indices = constant_op.constant([[4], [3], [1], [7]], dtype=dtypes.int32)
       updates = constant_op.constant([9, 10, 11, 12], dtype=dtypes.float32)
-      expected = np.array([1, 12, 1, 11, 10, 1, 1, 13])
+      expected = np.array([1, 13, 3, 14, 14, 6, 7, 20])
       state_ops.scatter_nd_add(v, indices, updates)
       self.assertAllClose(expected, v.numpy())
 
+  def testScatterNdSubStateOps(self):
+    with context.eager_mode():
+      v = resource_variable_ops.ResourceVariable(
+          [1, 2, 3, 4, 5, 6, 7, 8], dtype=dtypes.float32, name="sub")
+      indices = constant_op.constant([[4], [3], [1], [7]], dtype=dtypes.int32)
+      updates = constant_op.constant([9, 10, 11, 12], dtype=dtypes.float32)
+      expected = np.array([1, -9, 3, -6, -4, 6, 7, -4])
+      state_ops.scatter_nd_sub(v, indices, updates)
+      self.assertAllClose(expected, v.numpy())
+
   def testScatterUpdateCast(self):
     with context.eager_mode():
       v = resource_variable_ops.ResourceVariable([1.0, 2.0], name="update")
@@ -997,6 +1038,59 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
         with self.assertRaises(errors.InvalidArgumentError):
           session.run(copied.initializer)
 
+  def create_variant_shape_and_type_data(self):
+    variant_shape_and_type_data = (
+        cpp_shape_inference_pb2.CppShapeInferenceResult.HandleData())
+    variant_shape_and_type_data.is_set = True
+    stored_shape = tensor_shape.TensorShape([None, 4]).as_proto()
+    stored_dtype = dtypes.float32.as_datatype_enum
+    # NOTE(ebrevdo): shape_and_type lacks append() in some versions of protobuf.
+    variant_shape_and_type_data.shape_and_type.extend([
+        cpp_shape_inference_pb2.CppShapeInferenceResult.HandleShapeAndType(
+            shape=stored_shape, dtype=stored_dtype)])
+    return variant_shape_and_type_data
+
+  @def_function.function
+  def create_constant_variant(self, value):
+    value = constant_op.constant(
+        tensor_pb2.TensorProto(
+            dtype=dtypes.variant.as_datatype_enum,
+            tensor_shape=tensor_shape.TensorShape([]).as_proto(),
+            variant_val=[
+                tensor_pb2.VariantTensorDataProto(
+                    # Match registration in variant_op_registry.cc
+                    type_name=b"int",
+                    metadata=np.array(value, dtype=np.int32).tobytes())
+            ]))
+    return value
+
+  # TODO(ebrevdo): Add run_in_graph_and_eager_modes once we can create
+  # EagerTensor constants with TensorProto inputs.
+  @test_util.run_in_graph_and_eager_modes()
+  def testVariantInitializer(self):
+    variant_shape_and_type_data = self.create_variant_shape_and_type_data()
+    value = self.create_constant_variant(3)
+    initializer = array_ops.fill([3], value)
+    resource_variable_ops._set_handle_shapes_and_types(  # pylint: disable=protected-access
+        initializer, variant_shape_and_type_data,
+        graph_mode=not context.executing_eagerly())
+    v = resource_variable_ops.ResourceVariable(initializer)
+    read = array_ops.identity(v)
+    read_variant_shape_and_type = (
+        resource_variable_ops.get_eager_safe_handle_data(read))
+    self.assertEqual(
+        read_variant_shape_and_type, variant_shape_and_type_data)
+    gather = v.sparse_read([0])
+    gather_variant_shape_and_type = (
+        resource_variable_ops.get_eager_safe_handle_data(gather))
+    self.assertEqual(
+        gather_variant_shape_and_type, variant_shape_and_type_data)
+    # Make sure initializer runs.
+    if not context.executing_eagerly():
+      self.evaluate(v.initializer)
+      self.evaluate(read.op)
+      self.evaluate(gather.op)
+
 
 class _MixedPrecisionVariableTest(test_util.TensorFlowTestCase):
 
@@ -1054,6 +1148,11 @@ class _MixedPrecisionVariableTest(test_util.TensorFlowTestCase):
     self.assertEqual(NotImplemented,
                      v._dense_var_to_tensor(dtype=dtypes.float16, as_ref=True))
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testDistributeStrategy(self):
+    v = resource_variable_ops.ResourceVariable(1, dtype=dtypes.int32)
+    self.assertIsNone(v._distribute_strategy)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py b/tensorflow/python/kernel_tests/rnn_cell_test.py
similarity index 67%
rename from tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
rename to tensorflow/python/kernel_tests/rnn_cell_test.py
index ef372b947cedf71e9d44423f10cc43375b467cd9..c732c9be17a010a491d39c0da7d2deedaa7bd3f1 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
+++ b/tensorflow/python/kernel_tests/rnn_cell_test.py
@@ -12,25 +12,29 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for rnn module."""
+"""Tests for RNN cells."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 import itertools
+import os
 
+from absl.testing import parameterized
 import numpy as np
-from six.moves import xrange  # pylint: disable=redefined-builtin
 
-from tensorflow.contrib import rnn as rnn_lib
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops as ops_lib
+from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
+from tensorflow.python.keras import layers as keras_layers
+from tensorflow.python.layers import base as base_layer
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gradients_impl
@@ -38,16 +42,18 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import rnn
 from tensorflow.python.ops import rnn_cell
+from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging
+from tensorflow.python.training.tracking import util as trackable_utils
 from tensorflow.python.util import nest
 
 
-class Plus1RNNCell(rnn_lib.RNNCell):
+class Plus1RNNCell(rnn_cell.RNNCell):
   """RNN Cell generating (output, new_state) = (input + 1, state + 1)."""
 
   @property
@@ -62,7 +68,7 @@ class Plus1RNNCell(rnn_lib.RNNCell):
     return (input_ + 1, state + 1)
 
 
-class DummyMultiDimensionalLSTM(rnn_lib.RNNCell):
+class DummyMultiDimensionalLSTM(rnn_cell.RNNCell):
   """LSTM Cell generating (output, new_state) = (input + 1, state + 1).
 
   The input to this cell may have an arbitrary number of dimensions that follow
@@ -97,7 +103,7 @@ class DummyMultiDimensionalLSTM(rnn_lib.RNNCell):
     return (input_ + 1, (h + 1, c + 1))
 
 
-class NestedRNNCell(rnn_lib.RNNCell):
+class NestedRNNCell(rnn_cell.RNNCell):
   """RNN Cell generating (output, new_state) = (input + 1, state + 1).
 
   The input, output and state of this cell is a tuple of two tensors.
@@ -161,18 +167,19 @@ class TestStateSaverWithCounters(TestStateSaver):
   inherits from the TestStateSaver and adds the counters for calls of functions.
   """
 
+  @test_util.run_v1_only("b/124229375")
   def __init__(self, batch_size, state_size):
     super(TestStateSaverWithCounters, self).__init__(batch_size, state_size)
     self._num_state_calls = variables_lib.VariableV1(0)
     self._num_save_state_calls = variables_lib.VariableV1(0)
 
   def state(self, name):
-    with ops_lib.control_dependencies(
+    with ops.control_dependencies(
         [state_ops.assign_add(self._num_state_calls, 1)]):
       return super(TestStateSaverWithCounters, self).state(name)
 
   def save_state(self, name, state):
-    with ops_lib.control_dependencies([state_ops.assign_add(
+    with ops.control_dependencies([state_ops.assign_add(
         self._num_save_state_calls, 1)]):
       return super(TestStateSaverWithCounters, self).save_state(name, state)
 
@@ -191,12 +198,14 @@ class RNNTest(test.TestCase):
     self._seed = 23489
     np.random.seed(self._seed)
 
+  @test_util.run_v1_only("b/124229375")
   def testInvalidSequenceLengthShape(self):
     cell = Plus1RNNCell()
     inputs = [array_ops.placeholder(dtypes.float32, shape=(3, 4))]
     with self.assertRaisesRegexp(ValueError, "must be a vector"):
       rnn.static_rnn(cell, inputs, dtype=dtypes.float32, sequence_length=4)
 
+  @test_util.run_v1_only("b/124229375")
   def testRNN(self):
     cell = Plus1RNNCell()
     batch_size = 2
@@ -224,6 +233,7 @@ class RNNTest(test.TestCase):
                           max_length * np.ones(
                               (batch_size, input_size), dtype=np.float32))
 
+  @test_util.run_v1_only("b/124229375")
   def testDropout(self):
     cell = Plus1RNNCell()
     full_dropout_cell = rnn_cell.DropoutWrapper(
@@ -260,6 +270,7 @@ class RNNTest(test.TestCase):
       for d_v in full_dropout_values[:-1]:  # Add 1.0 to dropped_out (all zeros)
         self.assertAllClose(d_v, np.ones_like(input_value))
 
+  @test_util.run_v1_only("b/124229375")
   def testDynamicCalculation(self):
     cell = Plus1RNNCell()
     sequence_length = array_ops.placeholder(dtypes.int64)
@@ -310,7 +321,7 @@ class RNNTest(test.TestCase):
                                      1.0 * (2 + 1) * np.ones((input_size)))))
 
   def _testScope(self, factory, prefix="prefix", use_outer_scope=True):
-    with self.session(use_gpu=True, graph=ops_lib.Graph()):
+    with self.session(use_gpu=True, graph=ops.Graph()):
       if use_outer_scope:
         with variable_scope.variable_scope(prefix) as scope:
           factory(scope)
@@ -329,6 +340,7 @@ class RNNTest(test.TestCase):
         tf_logging.info(v.name)
       self.assertEqual(len(scope_vars), len(all_vars))
 
+  @test_util.run_v1_only("b/124229375")
   def testScope(self):
 
     def factory(scope):
@@ -358,21 +370,22 @@ class LSTMTest(test.TestCase):
     lstm = rnn_cell.LSTMCell(10)
     input_tensor = array_ops.ones([10, 50])
     lstm.build(input_tensor.get_shape())
-    self.assertEqual(lstm._bias.dtype, dtypes.float32_ref)
+    self.assertEqual(lstm._bias.dtype.base_dtype, dtypes.float32)
 
     # Explicitly pass dtype in constructor
     for dtype in [dtypes.float16, dtypes.float32, dtypes.float64]:
       lstm = rnn_cell.LSTMCell(10, dtype=dtype)
       input_tensor = array_ops.ones([10, 50])
       lstm.build(input_tensor.get_shape())
-      self.assertEqual(lstm._bias.dtype, dtype._as_ref)
+      self.assertEqual(lstm._bias.dtype.base_dtype, dtype)
 
+  @test_util.run_v1_only("b/124229375")
   def testNoProjNoSharding(self):
     num_units = 3
     input_size = 5
     batch_size = 2
     max_length = 8
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       cell = rnn_cell.LSTMCell(
@@ -389,12 +402,13 @@ class LSTMTest(test.TestCase):
       input_value = np.random.randn(batch_size, input_size)
       sess.run(outputs, feed_dict={inputs[0]: input_value})
 
+  @test_util.run_v1_only("b/124229375")
   def testCellClipping(self):
     num_units = 3
     input_size = 5
     batch_size = 2
     max_length = 8
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       cell = rnn_cell.LSTMCell(
@@ -419,12 +433,13 @@ class LSTMTest(test.TestCase):
       # if cell c is clipped to 0, tanh(c) = 0 => m==0
       self.assertAllEqual(value, np.zeros((batch_size, num_units)))
 
+  @test_util.run_v1_only("b/124229375")
   def testNoProjNoShardingSimpleStateSaver(self):
     num_units = 3
     input_size = 5
     batch_size = 2
     max_length = 8
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       state_saver = TestStateSaver(batch_size, 2 * num_units)
@@ -452,12 +467,13 @@ class LSTMTest(test.TestCase):
           })
       self.assertAllEqual(last_state_value, saved_state_value)
 
+  @test_util.run_v1_only("b/124229375")
   def testNoProjNoShardingTupleStateSaver(self):
     num_units = 3
     input_size = 5
     batch_size = 2
     max_length = 8
-    with self.session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       state_saver = TestStateSaver(batch_size, num_units)
@@ -486,12 +502,13 @@ class LSTMTest(test.TestCase):
       self.assertEqual(4, len(last_and_saved_states))
       self.assertAllEqual(last_and_saved_states[:2], last_and_saved_states[2:])
 
+  @test_util.run_v1_only("b/124229375")
   def testNoProjNoShardingNestedTupleStateSaver(self):
     num_units = 3
     input_size = 5
     batch_size = 2
     max_length = 8
-    with self.session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       state_saver = TestStateSaver(
@@ -556,13 +573,14 @@ class LSTMTest(test.TestCase):
         self.assertAllEqual(last_states[i],
                             named_saved_states[flat_state_names[i]])
 
+  @test_util.run_v1_only("b/124229375")
   def testProjNoSharding(self):
     num_units = 3
     input_size = 5
     batch_size = 2
     num_proj = 4
     max_length = 8
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       inputs = max_length * [
@@ -588,7 +606,7 @@ class LSTMTest(test.TestCase):
     num_proj = 4
     max_length = 8
     sequence_length = [4, 6]
-    with self.session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       inputs = max_length * [
@@ -630,7 +648,7 @@ class LSTMTest(test.TestCase):
       self.assertEqual(len(outputs_notuple), len(inputs))
       self.assertEqual(len(outputs_tuple), len(inputs))
       self.assertTrue(isinstance(state_tuple, tuple))
-      self.assertTrue(isinstance(state_notuple, ops_lib.Tensor))
+      self.assertTrue(isinstance(state_notuple, ops.Tensor))
 
       variables_lib.global_variables_initializer().run()
       input_value = np.random.randn(batch_size, input_size)
@@ -651,6 +669,7 @@ class LSTMTest(test.TestCase):
       state_tuple_v = sess.run(state_tuple, feed_dict={inputs[0]: input_value})
       self.assertAllEqual(state_notuple_v, np.hstack(state_tuple_v))
 
+  @test_util.run_v1_only("b/124229375")
   def testProjSharding(self):
     num_units = 3
     input_size = 5
@@ -659,7 +678,7 @@ class LSTMTest(test.TestCase):
     num_proj_shards = 3
     num_unit_shards = 2
     max_length = 8
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
 
@@ -684,6 +703,7 @@ class LSTMTest(test.TestCase):
       input_value = np.random.randn(batch_size, input_size)
       sess.run(outputs, feed_dict={inputs[0]: input_value})
 
+  @test_util.run_v1_only("b/124229375")
   def testDoubleInput(self):
     num_units = 3
     input_size = 5
@@ -692,7 +712,7 @@ class LSTMTest(test.TestCase):
     num_proj_shards = 3
     num_unit_shards = 2
     max_length = 8
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(-1, 1, seed=self._seed)
       inputs = max_length * [
           array_ops.placeholder(dtypes.float64, shape=(None, input_size))
@@ -720,6 +740,7 @@ class LSTMTest(test.TestCase):
       values = sess.run(outputs, feed_dict={inputs[0]: input_value})
       self.assertEqual(values[0].dtype, input_value.dtype)
 
+  @test_util.run_v1_only("b/124229375")
   def testShardNoShardEquivalentOutput(self):
     num_units = 3
     input_size = 5
@@ -728,7 +749,7 @@ class LSTMTest(test.TestCase):
     num_proj_shards = 3
     num_unit_shards = 2
     max_length = 8
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       inputs = max_length * [
           array_ops.placeholder(dtypes.float32, shape=(None, input_size))
       ]
@@ -774,6 +795,7 @@ class LSTMTest(test.TestCase):
       for (s_noshard, s_shard) in zip(state_values_noshard, state_values_shard):
         self.assertAllClose(s_noshard, s_shard, atol=1e-3)
 
+  @test_util.run_v1_only("b/124229375")
   def testDoubleInputWithDropoutAndDynamicCalculation(self):
     """Smoke test for using LSTM with doubles, dropout, dynamic calculation."""
 
@@ -784,7 +806,7 @@ class LSTMTest(test.TestCase):
     num_proj_shards = 3
     num_unit_shards = 2
     max_length = 8
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       sequence_length = array_ops.placeholder(dtypes.int64)
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
@@ -828,13 +850,14 @@ class LSTMTest(test.TestCase):
       self.assertEqual(values[0].dtype, input_value.dtype)
       self.assertEqual(state_value[0].dtype, input_value.dtype)
 
+  @test_util.run_v1_only("b/124229375")
   def testSharingWeightsWithReuse(self):
     num_units = 3
     input_size = 5
     batch_size = 2
     num_proj = 4
     max_length = 8
-    with self.session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(-1, 1, seed=self._seed)
       initializer_d = init_ops.random_uniform_initializer(
           -1, 1, seed=self._seed + 1)
@@ -878,13 +901,14 @@ class LSTMTest(test.TestCase):
         # Different weights used so outputs should be different.
         self.assertTrue(np.linalg.norm(o1 - o3) > 1e-6)
 
+  @test_util.run_v1_only("b/124229375")
   def testSharingWeightsWithDifferentNamescope(self):
     num_units = 3
     input_size = 5
     batch_size = 2
     num_proj = 4
     max_length = 8
-    with self.session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(-1, 1, seed=self._seed)
       inputs = max_length * [
           array_ops.placeholder(dtypes.float32, shape=(None, input_size))
@@ -896,10 +920,10 @@ class LSTMTest(test.TestCase):
           initializer=initializer,
           state_is_tuple=False)
 
-      with ops_lib.name_scope("scope0"):
+      with ops.name_scope("scope0"):
         with variable_scope.variable_scope("share_scope"):
           outputs0, _ = rnn.static_rnn(cell, inputs, dtype=dtypes.float32)
-      with ops_lib.name_scope("scope1"):
+      with ops.name_scope("scope1"):
         with variable_scope.variable_scope("share_scope", reuse=True):
           outputs1, _ = rnn.static_rnn(cell, inputs, dtype=dtypes.float32)
 
@@ -915,6 +939,7 @@ class LSTMTest(test.TestCase):
       for out0, out1 in zip(outputs0_values, outputs1_values):
         self.assertAllEqual(out0, out1)
 
+  @test_util.run_v1_only("b/124229375")
   def testDynamicRNNAllowsUnknownTimeDimension(self):
     inputs = array_ops.placeholder(dtypes.float32, shape=[1, None, 20])
     cell = rnn_cell.GRUCell(30)
@@ -930,7 +955,7 @@ class LSTMTest(test.TestCase):
     max_length = 8
     sequence_length = [4, 6]
     in_graph_mode = not context.executing_eagerly()
-    with self.session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       if in_graph_mode:
@@ -1006,7 +1031,7 @@ class LSTMTest(test.TestCase):
     max_length = 8
     sequence_length = [4, 6]
     in_graph_mode = not context.executing_eagerly()
-    with self.session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       initializer = init_ops.random_uniform_initializer(
           -0.01, 0.01, seed=self._seed)
       if in_graph_mode:
@@ -1117,7 +1142,7 @@ class LSTMTest(test.TestCase):
           state_is_tuple=False)
 
     ########### Step 1: Run static graph and generate readouts
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       if in_graph_mode:
         concat_inputs = array_ops.placeholder(
             dtypes.float32, shape=(time_steps, batch_size, input_size))
@@ -1154,8 +1179,8 @@ class LSTMTest(test.TestCase):
             for y in [outputs_static[0], outputs_static[-1], state_static]
         ])
         # Generate gradients of individual variables w.r.t. inputs
-        trainable_variables = ops_lib.get_collection(
-            ops_lib.GraphKeys.TRAINABLE_VARIABLES)
+        trainable_variables = ops.get_collection(
+            ops.GraphKeys.TRAINABLE_VARIABLES)
         assert len(trainable_variables) > 1, (
             "Count of trainable variables: %d" % len(trainable_variables))
         # pylint: disable=bad-builtin
@@ -1177,7 +1202,7 @@ class LSTMTest(test.TestCase):
             static_individual_variable_gradients, feed_dict=feeds)
 
     ########## Step 2: Run dynamic graph and generate readouts
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       if in_graph_mode:
         concat_inputs = array_ops.placeholder(
             dtypes.float32, shape=(time_steps, batch_size, input_size))
@@ -1225,8 +1250,8 @@ class LSTMTest(test.TestCase):
         ])
 
         # Generate gradients of individual variables w.r.t. inputs
-        trainable_variables = ops_lib.get_collection(
-            ops_lib.GraphKeys.TRAINABLE_VARIABLES)
+        trainable_variables = ops.get_collection(
+            ops.GraphKeys.TRAINABLE_VARIABLES)
         assert len(trainable_variables) > 1, (
             "Count of trainable variables: %d" % len(trainable_variables))
         dynamic_individual_variable_gradients = nest.flatten([
@@ -1259,12 +1284,12 @@ class LSTMTest(test.TestCase):
 
     self.assertEqual(len(values_static), len(values_dynamic))
     for (value_static, value_dynamic) in zip(values_static, values_dynamic):
-      self.assertAllEqual(value_static, value_dynamic)
-    self.assertAllEqual(state_value_static, state_value_dynamic)
+      self.assertAllClose(value_static, value_dynamic)
+    self.assertAllClose(state_value_static, state_value_dynamic)
 
     if in_graph_mode:
 
-      self.assertAllEqual(static_grad_values, dynamic_grad_values)
+      self.assertAllClose(static_grad_values, dynamic_grad_values)
 
       self.assertEqual(
           len(static_individual_grad_values),
@@ -1276,14 +1301,14 @@ class LSTMTest(test.TestCase):
       for i, (a, b) in enumerate(
           zip(static_individual_grad_values, dynamic_individual_grad_values)):
         tf_logging.info("Comparing individual gradients iteration %d" % i)
-        self.assertAllEqual(a, b)
+        self.assertAllClose(a, b)
 
       for i, (a, b) in enumerate(
           zip(static_individual_var_grad_values,
               dynamic_individual_var_grad_values)):
         tf_logging.info(
             "Comparing individual variable gradients iteration %d" % i)
-        self.assertAllEqual(a, b)
+        self.assertAllClose(a, b)
 
   @test_util.run_in_graph_and_eager_modes
   def testDynamicEquivalentToStaticRNN(self):
@@ -1337,7 +1362,7 @@ class BidirectionalRNNTest(test.TestCase):
     return input_value, inputs, outputs, state_fw, state_bw, sequence_length
 
   def _testBidirectionalRNN(self, use_shape):
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       input_value, inputs, outputs, state_fw, state_bw, sequence_length = (
           self._createBidirectionalRNN(use_shape, True))
       variables_lib.global_variables_initializer().run()
@@ -1358,33 +1383,33 @@ class BidirectionalRNNTest(test.TestCase):
       #
       # First sequence in batch is length=2
       # Check that the time=0 forward output is equal to time=1 backward output
-      self.assertEqual(out[0][0][0], out[1][0][3])
-      self.assertEqual(out[0][0][1], out[1][0][4])
-      self.assertEqual(out[0][0][2], out[1][0][5])
+      self.assertAllClose(out[0][0][0], out[1][0][3])
+      self.assertAllClose(out[0][0][1], out[1][0][4])
+      self.assertAllClose(out[0][0][2], out[1][0][5])
       # Check that the time=1 forward output is equal to time=0 backward output
-      self.assertEqual(out[1][0][0], out[0][0][3])
-      self.assertEqual(out[1][0][1], out[0][0][4])
-      self.assertEqual(out[1][0][2], out[0][0][5])
+      self.assertAllClose(out[1][0][0], out[0][0][3])
+      self.assertAllClose(out[1][0][1], out[0][0][4])
+      self.assertAllClose(out[1][0][2], out[0][0][5])
 
       # Second sequence in batch is length=3
       # Check that the time=0 forward output is equal to time=2 backward output
-      self.assertEqual(out[0][1][0], out[2][1][3])
-      self.assertEqual(out[0][1][1], out[2][1][4])
-      self.assertEqual(out[0][1][2], out[2][1][5])
+      self.assertAllClose(out[0][1][0], out[2][1][3])
+      self.assertAllClose(out[0][1][1], out[2][1][4])
+      self.assertAllClose(out[0][1][2], out[2][1][5])
       # Check that the time=1 forward output is equal to time=1 backward output
-      self.assertEqual(out[1][1][0], out[1][1][3])
-      self.assertEqual(out[1][1][1], out[1][1][4])
-      self.assertEqual(out[1][1][2], out[1][1][5])
+      self.assertAllClose(out[1][1][0], out[1][1][3])
+      self.assertAllClose(out[1][1][1], out[1][1][4])
+      self.assertAllClose(out[1][1][2], out[1][1][5])
       # Check that the time=2 forward output is equal to time=0 backward output
-      self.assertEqual(out[2][1][0], out[0][1][3])
-      self.assertEqual(out[2][1][1], out[0][1][4])
-      self.assertEqual(out[2][1][2], out[0][1][5])
+      self.assertAllClose(out[2][1][0], out[0][1][3])
+      self.assertAllClose(out[2][1][1], out[0][1][4])
+      self.assertAllClose(out[2][1][2], out[0][1][5])
       # Via the reasoning above, the forward and backward final state should be
       # exactly the same
       self.assertAllClose(s_fw, s_bw)
 
   def _testBidirectionalRNNWithoutSequenceLength(self, use_shape):
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       input_value, inputs, outputs, state_fw, state_bw, _ = (
           self._createBidirectionalRNN(use_shape, False))
       variables_lib.global_variables_initializer().run()
@@ -1402,22 +1427,19 @@ class BidirectionalRNNTest(test.TestCase):
       #
       # Both sequences in batch are length=8.  Check that the time=i
       # forward output is equal to time=8-1-i backward output
-      for i in xrange(8):
-        self.assertEqual(out[i][0][0], out[8 - 1 - i][0][3])
-        self.assertEqual(out[i][0][1], out[8 - 1 - i][0][4])
-        self.assertEqual(out[i][0][2], out[8 - 1 - i][0][5])
-      for i in xrange(8):
-        self.assertEqual(out[i][1][0], out[8 - 1 - i][1][3])
-        self.assertEqual(out[i][1][1], out[8 - 1 - i][1][4])
-        self.assertEqual(out[i][1][2], out[8 - 1 - i][1][5])
+      for i in range(8):
+        self.assertAllClose(out[i][0][0:3], out[8 - 1 - i][0][3:6])
+        self.assertAllClose(out[i][1][0:3], out[8 - 1 - i][1][3:6])
       # Via the reasoning above, the forward and backward final state should be
       # exactly the same
       self.assertAllClose(s_fw, s_bw)
 
+  @test_util.run_v1_only("b/124229375")
   def testBidirectionalRNN(self):
     self._testBidirectionalRNN(use_shape=False)
     self._testBidirectionalRNN(use_shape=True)
 
+  @test_util.run_v1_only("b/124229375")
   def testBidirectionalRNNWithoutSequenceLength(self):
     self._testBidirectionalRNNWithoutSequenceLength(use_shape=False)
     self._testBidirectionalRNNWithoutSequenceLength(use_shape=True)
@@ -1472,7 +1494,7 @@ class BidirectionalRNNTest(test.TestCase):
 
   def _testBidirectionalDynamicRNN(self, use_shape, use_state_tuple,
                                    use_time_major, use_sequence_length):
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       input_value, inputs, outputs, state_fw, state_bw, sequence_length = (
           self._createBidirectionalDynamicRNN(
               use_shape, use_state_tuple, use_time_major, use_sequence_length))
@@ -1534,6 +1556,7 @@ class BidirectionalRNNTest(test.TestCase):
           self.assertAllEqual(out[t, :, 0:3], out[max_length - t - 1, :, 3:6])
         self.assertAllClose(s_fw, s_bw)
 
+  @test_util.run_v1_only("b/124229375")
   def testBidirectionalDynamicRNN(self):
     # Generate 2^5 option values
     # from [True, True, True, True, True] to [False, False, False, False, False]
@@ -1549,7 +1572,7 @@ class BidirectionalRNNTest(test.TestCase):
     # REMARKS: factory(scope) is a function accepting a scope
     #          as an argument, such scope can be None, a string
     #          or a VariableScope instance.
-    with self.session(use_gpu=True, graph=ops_lib.Graph()):
+    with self.session(use_gpu=True, graph=ops.Graph()):
       if use_outer_scope:
         with variable_scope.variable_scope(prefix) as scope:
           factory(scope)
@@ -1568,6 +1591,7 @@ class BidirectionalRNNTest(test.TestCase):
         tf_logging.info(v.name)
       self.assertEqual(len(scope_vars), len(all_vars))
 
+  @test_util.run_v1_only("b/124229375")
   def testBidirectionalRNNScope(self):
 
     def factory(scope):
@@ -1578,6 +1602,7 @@ class BidirectionalRNNTest(test.TestCase):
     self._testScope(factory, use_outer_scope=False)
     self._testScope(factory, prefix=None, use_outer_scope=False)
 
+  @test_util.run_v1_only("b/124229375")
   def testBidirectionalDynamicRNNScope(self):
 
     def get_factory(use_time_major):
@@ -1606,13 +1631,14 @@ class MultiDimensionalLSTMTest(test.TestCase):
     self._seed = 23489
     np.random.seed(self._seed)
 
+  @test_util.run_v1_only("b/124229375")
   def testMultiDimensionalLSTMAllRNNContainers(self):
     feature_dims = (3, 4, 5)
     input_size = feature_dims
     batch_size = 2
     max_length = 8
     sequence_length = [4, 6]
-    with self.session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       inputs = max_length * [
           array_ops.placeholder(dtypes.float32, shape=(None,) + input_size)
       ]
@@ -1717,13 +1743,14 @@ class NestedLSTMTest(test.TestCase):
     self._seed = 23489
     np.random.seed(self._seed)
 
+  @test_util.run_v1_only("b/124229375")
   def testNestedIOLSTMAllRNNContainers(self):
     input_size = 5
     batch_size = 2
     state_size = 6
     max_length = 8
     sequence_length = [4, 6]
-    with self.session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       state_saver = TestStateSaver(batch_size, state_size)
       single_input = (array_ops.placeholder(
           dtypes.float32, shape=(None, input_size)),
@@ -1868,7 +1895,7 @@ class StateSaverRNNTest(test.TestCase):
     batch_size = 2
     state_saver = TestStateSaver(batch_size, 2 * num_units)
 
-    with self.session(use_gpu=True, graph=ops_lib.Graph()):
+    with self.session(use_gpu=True, graph=ops.Graph()):
       if use_outer_scope:
         with variable_scope.variable_scope(prefix) as scope:
           self._factory(scope=scope, state_saver=state_saver)
@@ -1900,6 +1927,7 @@ class StateSaverRNNTest(test.TestCase):
     have influence on number of calls to save_state and state methods of
     state_saver object (the number of calls should be same.)
     """
+    self.skipTest("b/124196246 Breakage for sess.run([out, ...]): 2 != 1")
 
     num_units = 3
     batch_size = 2
@@ -1935,6 +1963,7 @@ class GRUTest(test.TestCase):
     self._seed = 23489
     np.random.seed(self._seed)
 
+  @test_util.run_v1_only("b/124229375")
   def testDynamic(self):
     time_steps = 8
     num_units = 3
@@ -1945,7 +1974,7 @@ class GRUTest(test.TestCase):
 
     sequence_length = np.random.randint(0, time_steps, size=batch_size)
 
-    with self.session(use_gpu=True, graph=ops_lib.Graph()) as sess:
+    with self.session(use_gpu=True, graph=ops.Graph()) as sess:
       concat_inputs = array_ops.placeholder(
           dtypes.float32, shape=(time_steps, batch_size, input_size))
 
@@ -1967,7 +1996,7 @@ class GRUTest(test.TestCase):
       sess.run([outputs_dynamic, state_dynamic], feed_dict=feeds)
 
   def _testScope(self, factory, prefix="prefix", use_outer_scope=True):
-    with self.session(use_gpu=True, graph=ops_lib.Graph()):
+    with self.session(use_gpu=True, graph=ops.Graph()):
       if use_outer_scope:
         with variable_scope.variable_scope(prefix) as scope:
           factory(scope)
@@ -1986,6 +2015,7 @@ class GRUTest(test.TestCase):
         tf_logging.info(v.name)
       self.assertEqual(len(scope_vars), len(all_vars))
 
+  @test_util.run_v1_only("b/124229375")
   def testDynamicScope(self):
     time_steps = 8
     num_units = 3
@@ -2016,8 +2046,9 @@ class RawRNNTest(test.TestCase):
     self._seed = 23489
     np.random.seed(self._seed)
 
+  @test_util.run_v1_only("b/124229375")
   def _testRawRNN(self, max_time):
-    with self.session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       batch_size = 16
       input_depth = 4
       num_units = 3
@@ -2115,6 +2146,7 @@ class RawRNNTest(test.TestCase):
         for i in range(1, len(gradients_val)):
           self.assertAllClose(gradients_dynamic_rnn_val[i], gradients_val[i])
 
+  @test_util.run_v1_only("b/124229375")
   def testRawRNNZeroLength(self):
     # NOTE: Because with 0 time steps, raw_rnn does not have shape
     # information about the input, it is impossible to perform
@@ -2125,8 +2157,9 @@ class RawRNNTest(test.TestCase):
   def testRawRNN(self):
     self._testRawRNN(max_time=10)
 
+  @test_util.run_v1_only("b/124229375")
   def testLoopState(self):
-    with self.session(graph=ops_lib.Graph()):
+    with self.session(graph=ops.Graph()):
       max_time = 10
       batch_size = 16
       input_depth = 4
@@ -2161,8 +2194,9 @@ class RawRNNTest(test.TestCase):
       loop_state = r[-1]
       self.assertEqual([10], loop_state.eval())
 
+  @test_util.run_v1_only("b/124229375")
   def testLoopStateWithTensorArray(self):
-    with self.session(graph=ops_lib.Graph()):
+    with self.session(graph=ops.Graph()):
       max_time = 4
       batch_size = 16
       input_depth = 4
@@ -2204,8 +2238,9 @@ class RawRNNTest(test.TestCase):
       loop_state = loop_state.stack()
       self.assertAllEqual([1, 2, 2 + 2, 4 + 3, 7 + 4], loop_state.eval())
 
+  @test_util.run_v1_only("b/124229375")
   def testEmitDifferentStructureThanCellOutput(self):
-    with self.session(graph=ops_lib.Graph()) as sess:
+    with self.session(graph=ops.Graph()) as sess:
       max_time = 10
       batch_size = 16
       input_depth = 4
@@ -2253,7 +2288,7 @@ class RawRNNTest(test.TestCase):
           np.ones((max_time, batch_size, 1), np.int64), output_vals[1])
 
   def _testScope(self, factory, prefix="prefix", use_outer_scope=True):
-    with self.session(use_gpu=True, graph=ops_lib.Graph()):
+    with self.session(use_gpu=True, graph=ops.Graph()):
       if use_outer_scope:
         with variable_scope.variable_scope(prefix) as scope:
           factory(scope)
@@ -2272,6 +2307,7 @@ class RawRNNTest(test.TestCase):
         tf_logging.info(v.name)
       self.assertEqual(len(scope_vars), len(all_vars))
 
+  @test_util.run_v1_only("b/124229375")
   def testRawRNNScope(self):
     max_time = 10
     batch_size = 16
@@ -2329,7 +2365,7 @@ class DeviceWrapperCell(rnn_cell.RNNCell):
 
   def __call__(self, input_, state, scope=None):
     if self._device is not None:
-      with ops_lib.device(self._device):
+      with ops.device(self._device):
         return self._cell(input_, state, scope=scope)
     else:
       return self._cell(input_, state, scope=scope)
@@ -2353,11 +2389,11 @@ class TensorArrayOnCorrectDeviceTest(test.TestCase):
     sequence_length = np.random.randint(0, time_steps, size=batch_size)
 
     if input_device is not None:
-      with ops_lib.device(input_device):
+      with ops.device(input_device):
         inputs = constant_op.constant(inputs)
 
     if rnn_device is not None:
-      with ops_lib.device(rnn_device):
+      with ops.device(rnn_device):
         outputs, _ = rnn.dynamic_rnn(
             gpu_cell,
             inputs,
@@ -2443,5 +2479,876 @@ class TensorArrayOnCorrectDeviceTest(test.TestCase):
     _assert_in("TensorArray", gpu_stats, cpu_stats)
 
 
+class RNNCellTest(test.TestCase, parameterized.TestCase):
+
+  @test_util.run_v1_only("b/124229375")
+  def testBasicRNNCell(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 2])
+        m = array_ops.zeros([1, 2])
+        cell = rnn_cell_impl.BasicRNNCell(2)
+        g, _ = cell(x, m)
+        self.assertEqual([
+            "root/basic_rnn_cell/%s:0" % rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+            "root/basic_rnn_cell/%s:0" % rnn_cell_impl._BIAS_VARIABLE_NAME
+        ], [v.name for v in cell.trainable_variables])
+        self.assertFalse(cell.non_trainable_variables)
+        sess.run([variables_lib.global_variables_initializer()])
+        res = sess.run([g], {
+            x: np.array([[1., 1.]]),
+            m: np.array([[0.1, 0.1]])
+        })
+        self.assertEqual(res[0].shape, (1, 2))
+
+  @test_util.run_v1_only("b/124229375")
+  def testBasicRNNCellNotTrainable(self):
+    with self.cached_session() as sess:
+
+      def not_trainable_getter(getter, *args, **kwargs):
+        kwargs["trainable"] = False
+        return getter(*args, **kwargs)
+
+      with variable_scope.variable_scope(
+          "root",
+          initializer=init_ops.constant_initializer(0.5),
+          custom_getter=not_trainable_getter):
+        x = array_ops.zeros([1, 2])
+        m = array_ops.zeros([1, 2])
+        cell = rnn_cell_impl.BasicRNNCell(2)
+        g, _ = cell(x, m)
+        self.assertFalse(cell.trainable_variables)
+        self.assertEqual([
+            "root/basic_rnn_cell/%s:0" % rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+            "root/basic_rnn_cell/%s:0" % rnn_cell_impl._BIAS_VARIABLE_NAME
+        ], [v.name for v in cell.non_trainable_variables])
+        sess.run([variables_lib.global_variables_initializer()])
+        res = sess.run([g], {
+            x: np.array([[1., 1.]]),
+            m: np.array([[0.1, 0.1]])
+        })
+        self.assertEqual(res[0].shape, (1, 2))
+
+  @test_util.run_v1_only("b/124229375")
+  def testGRUCell(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 2])
+        m = array_ops.zeros([1, 2])
+        g, _ = rnn_cell_impl.GRUCell(2)(x, m)
+        sess.run([variables_lib.global_variables_initializer()])
+        res = sess.run([g], {
+            x: np.array([[1., 1.]]),
+            m: np.array([[0.1, 0.1]])
+        })
+        # Smoke test
+        self.assertAllClose(res[0], [[0.175991, 0.175991]])
+      with variable_scope.variable_scope(
+          "other", initializer=init_ops.constant_initializer(0.5)):
+        # Test GRUCell with input_size != num_units.
+        x = array_ops.zeros([1, 3])
+        m = array_ops.zeros([1, 2])
+        g, _ = rnn_cell_impl.GRUCell(2)(x, m)
+        sess.run([variables_lib.global_variables_initializer()])
+        res = sess.run([g], {
+            x: np.array([[1., 1., 1.]]),
+            m: np.array([[0.1, 0.1]])
+        })
+        # Smoke test
+        self.assertAllClose(res[0], [[0.156736, 0.156736]])
+
+  @test_util.run_v1_only("b/124229375")
+  def testBasicLSTMCell(self):
+    for dtype in [dtypes.float16, dtypes.float32]:
+      np_dtype = dtype.as_numpy_dtype
+      with self.session(graph=ops.Graph()) as sess:
+        with variable_scope.variable_scope(
+            "root", initializer=init_ops.constant_initializer(0.5)):
+          x = array_ops.zeros([1, 2], dtype=dtype)
+          m = array_ops.zeros([1, 8], dtype=dtype)
+          cell = rnn_cell_impl.MultiRNNCell(
+              [
+                  rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False)
+                  for _ in range(2)
+              ],
+              state_is_tuple=False)
+          self.assertEqual(cell.dtype, None)
+          self.assertEqual("cell-0", cell._checkpoint_dependencies[0].name)
+          self.assertEqual("cell-1", cell._checkpoint_dependencies[1].name)
+          cell.get_config()  # Should not throw an error
+          g, out_m = cell(x, m)
+          # Layer infers the input type.
+          self.assertEqual(cell.dtype, dtype.name)
+          expected_variable_names = [
+              "root/multi_rnn_cell/cell_0/basic_lstm_cell/%s:0" %
+              rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+              "root/multi_rnn_cell/cell_0/basic_lstm_cell/%s:0" %
+              rnn_cell_impl._BIAS_VARIABLE_NAME,
+              "root/multi_rnn_cell/cell_1/basic_lstm_cell/%s:0" %
+              rnn_cell_impl._WEIGHTS_VARIABLE_NAME,
+              "root/multi_rnn_cell/cell_1/basic_lstm_cell/%s:0" %
+              rnn_cell_impl._BIAS_VARIABLE_NAME
+          ]
+          self.assertEqual(expected_variable_names,
+                           [v.name for v in cell.trainable_variables])
+          self.assertFalse(cell.non_trainable_variables)
+          sess.run([variables_lib.global_variables_initializer()])
+          res = sess.run([g, out_m], {
+              x: np.array([[1., 1.]]),
+              m: 0.1 * np.ones([1, 8])
+          })
+          self.assertEqual(len(res), 2)
+          variables = variables_lib.global_variables()
+          self.assertEqual(expected_variable_names, [v.name for v in variables])
+          # The numbers in results were not calculated, this is just a
+          # smoke test.
+          self.assertAllClose(res[0], np.array(
+              [[0.240, 0.240]], dtype=np_dtype), 1e-2)
+          expected_mem = np.array(
+              [[0.689, 0.689, 0.448, 0.448, 0.398, 0.398, 0.240, 0.240]],
+              dtype=np_dtype)
+          self.assertAllClose(res[1], expected_mem, 1e-2)
+        with variable_scope.variable_scope(
+            "other", initializer=init_ops.constant_initializer(0.5)):
+          # Test BasicLSTMCell with input_size != num_units.
+          x = array_ops.zeros([1, 3], dtype=dtype)
+          m = array_ops.zeros([1, 4], dtype=dtype)
+          g, out_m = rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False)(x, m)
+          sess.run([variables_lib.global_variables_initializer()])
+          res = sess.run(
+              [g, out_m], {
+                  x: np.array([[1., 1., 1.]], dtype=np_dtype),
+                  m: 0.1 * np.ones([1, 4], dtype=np_dtype)
+              })
+          self.assertEqual(len(res), 2)
+
+  @test_util.run_v1_only("b/124229375")
+  def testBasicLSTMCellDimension0Error(self):
+    """Tests that dimension 0 in both(x and m) shape must be equal."""
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        num_units = 2
+        state_size = num_units * 2
+        batch_size = 3
+        input_size = 4
+        x = array_ops.zeros([batch_size, input_size])
+        m = array_ops.zeros([batch_size - 1, state_size])
+        with self.assertRaises(ValueError):
+          g, out_m = rnn_cell_impl.BasicLSTMCell(
+              num_units, state_is_tuple=False)(x, m)
+          sess.run([variables_lib.global_variables_initializer()])
+          sess.run(
+              [g, out_m], {
+                  x: 1 * np.ones([batch_size, input_size]),
+                  m: 0.1 * np.ones([batch_size - 1, state_size])
+              })
+
+  def testBasicLSTMCellStateSizeError(self):
+    """Tests that state_size must be num_units * 2."""
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        num_units = 2
+        state_size = num_units * 3  # state_size must be num_units * 2
+        batch_size = 3
+        input_size = 4
+        x = array_ops.zeros([batch_size, input_size])
+        m = array_ops.zeros([batch_size, state_size])
+        with self.assertRaises((ValueError, errors_impl.InvalidArgumentError)):
+          g, out_m = rnn_cell_impl.BasicLSTMCell(
+              num_units, state_is_tuple=False)(x, m)
+          sess.run([variables_lib.global_variables_initializer()])
+          sess.run(
+              [g, out_m], {
+                  x: 1 * np.ones([batch_size, input_size]),
+                  m: 0.1 * np.ones([batch_size, state_size])
+              })
+
+  @test_util.run_v1_only("b/124229375")
+  def testBasicLSTMCellStateTupleType(self):
+    with self.cached_session():
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 2])
+        m0 = (array_ops.zeros([1, 2]),) * 2
+        m1 = (array_ops.zeros([1, 2]),) * 2
+        cell = rnn_cell_impl.MultiRNNCell(
+            [rnn_cell_impl.BasicLSTMCell(2) for _ in range(2)],
+            state_is_tuple=True)
+        self.assertTrue(isinstance(cell.state_size, tuple))
+        self.assertTrue(
+            isinstance(cell.state_size[0], rnn_cell_impl.LSTMStateTuple))
+        self.assertTrue(
+            isinstance(cell.state_size[1], rnn_cell_impl.LSTMStateTuple))
+
+        # Pass in regular tuples
+        _, (out_m0, out_m1) = cell(x, (m0, m1))
+        self.assertTrue(isinstance(out_m0, rnn_cell_impl.LSTMStateTuple))
+        self.assertTrue(isinstance(out_m1, rnn_cell_impl.LSTMStateTuple))
+
+        # Pass in LSTMStateTuples
+        variable_scope.get_variable_scope().reuse_variables()
+        zero_state = cell.zero_state(1, dtypes.float32)
+        self.assertTrue(isinstance(zero_state, tuple))
+        self.assertTrue(isinstance(zero_state[0], rnn_cell_impl.LSTMStateTuple))
+        self.assertTrue(isinstance(zero_state[1], rnn_cell_impl.LSTMStateTuple))
+        _, (out_m0, out_m1) = cell(x, zero_state)
+        self.assertTrue(isinstance(out_m0, rnn_cell_impl.LSTMStateTuple))
+        self.assertTrue(isinstance(out_m1, rnn_cell_impl.LSTMStateTuple))
+
+  @test_util.run_v1_only("b/124229375")
+  def testBasicLSTMCellWithStateTuple(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 2])
+        m0 = array_ops.zeros([1, 4])
+        m1 = array_ops.zeros([1, 4])
+        cell = rnn_cell_impl.MultiRNNCell(
+            [
+                rnn_cell_impl.BasicLSTMCell(2, state_is_tuple=False)
+                for _ in range(2)
+            ],
+            state_is_tuple=True)
+        g, (out_m0, out_m1) = cell(x, (m0, m1))
+        sess.run([variables_lib.global_variables_initializer()])
+        res = sess.run(
+            [g, out_m0, out_m1], {
+                x: np.array([[1., 1.]]),
+                m0: 0.1 * np.ones([1, 4]),
+                m1: 0.1 * np.ones([1, 4])
+            })
+        self.assertEqual(len(res), 3)
+        # The numbers in results were not calculated, this is just a smoke test.
+        # Note, however, these values should match the original
+        # version having state_is_tuple=False.
+        self.assertAllClose(res[0], [[0.24024698, 0.24024698]])
+        expected_mem0 = np.array(
+            [[0.68967271, 0.68967271, 0.44848421, 0.44848421]])
+        expected_mem1 = np.array(
+            [[0.39897051, 0.39897051, 0.24024698, 0.24024698]])
+        self.assertAllClose(res[1], expected_mem0)
+        self.assertAllClose(res[2], expected_mem1)
+
+  @test_util.run_v1_only("b/124229375")
+  def testLSTMCell(self):
+    with self.cached_session() as sess:
+      num_units = 8
+      num_proj = 6
+      state_size = num_units + num_proj
+      batch_size = 3
+      input_size = 2
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([batch_size, input_size])
+        m = array_ops.zeros([batch_size, state_size])
+        cell = rnn_cell_impl.LSTMCell(
+            num_units=num_units,
+            num_proj=num_proj,
+            forget_bias=1.0,
+            state_is_tuple=False)
+        output, state = cell(x, m)
+        sess.run([variables_lib.global_variables_initializer()])
+        res = sess.run(
+            [output, state], {
+                x: np.array([[1., 1.], [2., 2.], [3., 3.]]),
+                m: 0.1 * np.ones((batch_size, state_size))
+            })
+        self.assertEqual(len(res), 2)
+        # The numbers in results were not calculated, this is mostly just a
+        # smoke test.
+        self.assertEqual(res[0].shape, (batch_size, num_proj))
+        self.assertEqual(res[1].shape, (batch_size, state_size))
+        # Different inputs so different outputs and states
+        for i in range(1, batch_size):
+          self.assertTrue(
+              float(np.linalg.norm((res[0][0, :] - res[0][i, :]))) > 1e-6)
+          self.assertTrue(
+              float(np.linalg.norm((res[1][0, :] - res[1][i, :]))) > 1e-6)
+
+  @test_util.run_v1_only("b/124229375")
+  def testLSTMCellVariables(self):
+    with self.cached_session():
+      num_units = 8
+      num_proj = 6
+      state_size = num_units + num_proj
+      batch_size = 3
+      input_size = 2
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([batch_size, input_size])
+        m = array_ops.zeros([batch_size, state_size])
+        cell = rnn_cell_impl.LSTMCell(
+            num_units=num_units,
+            num_proj=num_proj,
+            forget_bias=1.0,
+            state_is_tuple=False)
+        cell(x, m)  # Execute to create variables
+      variables = variables_lib.global_variables()
+      self.assertEquals(variables[0].op.name, "root/lstm_cell/kernel")
+      self.assertEquals(variables[1].op.name, "root/lstm_cell/bias")
+      self.assertEquals(variables[2].op.name,
+                        "root/lstm_cell/projection/kernel")
+
+  @test_util.run_in_graph_and_eager_modes
+  def testWrapperCheckpointing(self):
+    for wrapper_type in [
+        rnn_cell_impl.DropoutWrapper,
+        rnn_cell_impl.ResidualWrapper,
+        lambda cell: rnn_cell_impl.MultiRNNCell([cell])]:
+      cell = rnn_cell_impl.BasicRNNCell(1)
+      wrapper = wrapper_type(cell)
+      wrapper(array_ops.ones([1, 1]),
+              state=wrapper.zero_state(batch_size=1, dtype=dtypes.float32))
+      self.evaluate([v.initializer for v in cell.variables])
+      checkpoint = trackable_utils.Checkpoint(wrapper=wrapper)
+      prefix = os.path.join(self.get_temp_dir(), "ckpt")
+      self.evaluate(cell._bias.assign([40.]))
+      save_path = checkpoint.save(prefix)
+      self.evaluate(cell._bias.assign([0.]))
+      checkpoint.restore(save_path).assert_consumed().run_restore_ops()
+      self.assertAllEqual([40.], self.evaluate(cell._bias))
+
+  @parameterized.parameters(
+      [rnn_cell_impl.ResidualWrapper, rnn_cell_impl.ResidualWrapperV2])
+  @test_util.run_in_graph_and_eager_modes
+  def testResidualWrapper(self, wrapper_type):
+    x = ops.convert_to_tensor(np.array([[1., 1., 1.]]))
+    m = ops.convert_to_tensor(np.array([[0.1, 0.1, 0.1]]))
+    base_cell = rnn_cell_impl.GRUCell(
+        3, kernel_initializer=init_ops.constant_initializer(0.5),
+        bias_initializer=init_ops.constant_initializer(0.5))
+    g, m_new = base_cell(x, m)
+    wrapper_object = wrapper_type(base_cell)
+    (name, dep), = wrapper_object._checkpoint_dependencies
+    wrapper_object.get_config()  # Should not throw an error
+    self.assertIs(dep, base_cell)
+    self.assertEqual("cell", name)
+
+    g_res, m_new_res = wrapper_object(x, m)
+    self.evaluate([variables_lib.global_variables_initializer()])
+    res = self.evaluate([g, g_res, m_new, m_new_res])
+    # Residual connections
+    self.assertAllClose(res[1], res[0] + [1., 1., 1.])
+    # States are left untouched
+    self.assertAllClose(res[2], res[3])
+
+  @parameterized.parameters(
+      [rnn_cell_impl.ResidualWrapper, rnn_cell_impl.ResidualWrapperV2])
+  @test_util.run_in_graph_and_eager_modes
+  def testResidualWrapperWithSlice(self, wrapper_type):
+    x = ops.convert_to_tensor(np.array([[1., 1., 1., 1., 1.]]))
+    m = ops.convert_to_tensor(np.array([[0.1, 0.1, 0.1]]))
+    base_cell = rnn_cell_impl.GRUCell(
+        3, kernel_initializer=init_ops.constant_initializer(0.5),
+        bias_initializer=init_ops.constant_initializer(0.5))
+    g, m_new = base_cell(x, m)
+
+    def residual_with_slice_fn(inp, out):
+      inp_sliced = array_ops.slice(inp, [0, 0], [-1, 3])
+      return inp_sliced + out
+
+    g_res, m_new_res = wrapper_type(
+        base_cell, residual_with_slice_fn)(x, m)
+    self.evaluate([variables_lib.global_variables_initializer()])
+    res_g, res_g_res, res_m_new, res_m_new_res = self.evaluate(
+        [g, g_res, m_new, m_new_res])
+    # Residual connections
+    self.assertAllClose(res_g_res, res_g + [1., 1., 1.])
+    # States are left untouched
+    self.assertAllClose(res_m_new, res_m_new_res)
+
+  @test_util.run_v1_only("b/124229375")
+  def testDeviceWrapper(self):
+    with variable_scope.variable_scope(
+        "root", initializer=init_ops.constant_initializer(0.5)):
+      x = array_ops.zeros([1, 3])
+      m = array_ops.zeros([1, 3])
+      wrapped = rnn_cell_impl.GRUCell(3)
+      cell = rnn_cell_impl.DeviceWrapper(wrapped, "/cpu:14159")
+      (name, dep), = cell._checkpoint_dependencies
+      cell.get_config()  # Should not throw an error
+      self.assertIs(dep, wrapped)
+      self.assertEqual("cell", name)
+
+      outputs, _ = cell(x, m)
+      self.assertTrue("cpu:14159" in outputs.device.lower())
+
+  def _retrieve_cpu_gpu_stats(self, run_metadata):
+    cpu_stats = None
+    gpu_stats = None
+    step_stats = run_metadata.step_stats
+    for ds in step_stats.dev_stats:
+      if "cpu:0" in ds.device[-5:].lower():
+        cpu_stats = ds.node_stats
+      if "gpu:0" == ds.device[-5:].lower():
+        gpu_stats = ds.node_stats
+    return cpu_stats, gpu_stats
+
+  def testDeviceWrapperDynamicExecutionNodesAreAllProperlyLocated(self):
+    if not test.is_gpu_available():
+      # Can't perform this test w/o a GPU
+      return
+
+    gpu_dev = test.gpu_device_name()
+    with self.session(use_gpu=True) as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 1, 3])
+        cell = rnn_cell_impl.DeviceWrapper(rnn_cell_impl.GRUCell(3), gpu_dev)
+        with ops.device("/cpu:0"):
+          outputs, _ = rnn.dynamic_rnn(
+              cell=cell, inputs=x, dtype=dtypes.float32)
+        run_metadata = config_pb2.RunMetadata()
+        opts = config_pb2.RunOptions(
+            trace_level=config_pb2.RunOptions.FULL_TRACE)
+
+        sess.run([variables_lib.global_variables_initializer()])
+        _ = sess.run(outputs, options=opts, run_metadata=run_metadata)
+
+      cpu_stats, gpu_stats = self._retrieve_cpu_gpu_stats(run_metadata)
+      self.assertFalse([s for s in cpu_stats if "gru_cell" in s.node_name])
+      self.assertTrue([s for s in gpu_stats if "gru_cell" in s.node_name])
+
+  @test_util.run_v1_only("b/124229375")
+  def testMultiRNNCell(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 2])
+        m = array_ops.zeros([1, 4])
+        multi_rnn_cell = rnn_cell_impl.MultiRNNCell(
+            [rnn_cell_impl.GRUCell(2) for _ in range(2)],
+            state_is_tuple=False)
+        _, ml = multi_rnn_cell(x, m)
+        sess.run([variables_lib.global_variables_initializer()])
+        res = sess.run(ml, {
+            x: np.array([[1., 1.]]),
+            m: np.array([[0.1, 0.1, 0.1, 0.1]])
+        })
+        # The numbers in results were not calculated, this is just a smoke test.
+        self.assertAllClose(res, [[0.175991, 0.175991, 0.13248, 0.13248]])
+        self.assertEqual(len(multi_rnn_cell.weights), 2 * 4)
+        self.assertTrue(
+            [x.dtype == dtypes.float32 for x in multi_rnn_cell.weights])
+
+  @test_util.run_v1_only("b/124229375")
+  def testMultiRNNCellWithStateTuple(self):
+    with self.cached_session() as sess:
+      with variable_scope.variable_scope(
+          "root", initializer=init_ops.constant_initializer(0.5)):
+        x = array_ops.zeros([1, 2])
+        m_bad = array_ops.zeros([1, 4])
+        m_good = (array_ops.zeros([1, 2]), array_ops.zeros([1, 2]))
+
+        # Test incorrectness of state
+        with self.assertRaisesRegexp(ValueError, "Expected state .* a tuple"):
+          rnn_cell_impl.MultiRNNCell(
+              [rnn_cell_impl.GRUCell(2) for _ in range(2)],
+              state_is_tuple=True)(x, m_bad)
+
+        _, ml = rnn_cell_impl.MultiRNNCell(
+            [rnn_cell_impl.GRUCell(2) for _ in range(2)],
+            state_is_tuple=True)(x, m_good)
+
+        sess.run([variables_lib.global_variables_initializer()])
+        res = sess.run(
+            ml, {
+                x: np.array([[1., 1.]]),
+                m_good[0]: np.array([[0.1, 0.1]]),
+                m_good[1]: np.array([[0.1, 0.1]])
+            })
+
+        # The numbers in results were not calculated, this is just a
+        # smoke test.  However, these numbers should match those of
+        # the test testMultiRNNCell.
+        self.assertAllClose(res[0], [[0.175991, 0.175991]])
+        self.assertAllClose(res[1], [[0.13248, 0.13248]])
+
+  @parameterized.parameters(
+      [[rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2],
+       [rnn_cell_impl.ResidualWrapper, rnn_cell_impl.ResidualWrapperV2]])
+  @test_util.run_in_graph_and_eager_modes
+  def testWrapperKerasStyle(self, wrapper, wrapper_v2):
+    """Tests if wrapper cell is instantiated in keras style scope."""
+    wrapped_cell_v2 = wrapper_v2(rnn_cell_impl.BasicRNNCell(1))
+    self.assertTrue(wrapped_cell_v2._keras_style)
+
+    wrapped_cell = wrapper(rnn_cell_impl.BasicRNNCell(1))
+    self.assertFalse(wrapped_cell._keras_style)
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapperV2, rnn_cell_impl.ResidualWrapperV2])
+  @test_util.run_in_graph_and_eager_modes
+  def testWrapperV2VariableNames(self, wrapper):
+    """Tests that variables names do not depend on wrapper in RNN layer."""
+
+    def _rnn_input(apply_wrapper, name):
+      """Creates a RNN layer with/without wrapper and returns built rnn cell."""
+      with base_layer.keras_style_scope():
+        base_cell = rnn_cell_impl.MultiRNNCell(
+            [rnn_cell_impl.BasicRNNCell(1, name="basic_rnn_cell")
+             for _ in range(2)])
+      if apply_wrapper:
+        rnn_cell = wrapper(base_cell)
+      else:
+        rnn_cell = base_cell
+      rnn_layer = keras_layers.RNN(rnn_cell, name=name)
+      inputs = ops.convert_to_tensor([[[1]]], dtype=dtypes.float32)
+      _ = rnn_layer(inputs)
+      return base_cell._cells[0]
+
+    rnn_1 = _rnn_input(True, name="rnn_0")
+    rnn_2 = _rnn_input(False, name="rnn_1")
+
+    for i, cell in enumerate([rnn_1, rnn_2]):
+      var_prefix = "rnn_{}/cell_0/basic_rnn_cell/".format(i)
+      self.assertCountEqual([v.name for v in cell.weights],
+                            (var_prefix + "kernel:0", var_prefix + "bias:0"))
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapperV2, rnn_cell_impl.ResidualWrapperV2])
+  @test_util.run_in_graph_and_eager_modes
+  def testWrapperWeights(self, wrapper):
+    """Tests that wrapper weights contain wrapped cells weights."""
+
+    with base_layer.keras_style_scope():
+      base_cell = rnn_cell_impl.BasicRNNCell(1, name="basic_rnn_cell")
+    rnn_cell = wrapper(base_cell)
+    rnn_layer = keras_layers.RNN(rnn_cell)
+    inputs = ops.convert_to_tensor([[[1]]], dtype=dtypes.float32)
+    rnn_layer(inputs)
+
+    expected_weights = ["rnn/" + var for var in ("kernel:0", "bias:0")]
+    self.assertEqual(len(rnn_cell.weights), 2)
+    self.assertCountEqual([v.name for v in rnn_cell.weights], expected_weights)
+    self.assertCountEqual([v.name for v in rnn_cell.trainable_variables],
+                          expected_weights)
+    self.assertCountEqual([v.name for v in rnn_cell.non_trainable_variables],
+                          [])
+    self.assertCountEqual([v.name for v in rnn_cell._cell.weights],
+                          expected_weights)
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapperV2, rnn_cell_impl.ResidualWrapperV2])
+  @test_util.run_in_graph_and_eager_modes
+  def testWrapperV2Caller(self, wrapper):
+    """Tests that wrapper V2 is using the LayerRNNCell's caller."""
+
+    with base_layer.keras_style_scope():
+      base_cell = rnn_cell_impl.MultiRNNCell(
+          [rnn_cell_impl.BasicRNNCell(1) for _ in range(2)])
+    rnn_cell = wrapper(base_cell)
+    inputs = ops.convert_to_tensor([[1]], dtype=dtypes.float32)
+    state = ops.convert_to_tensor([[1]], dtype=dtypes.float32)
+    _ = rnn_cell(inputs, [state, state])
+    weights = base_cell._cells[0].weights
+    self.assertLen(weights, expected_len=2)
+    self.assertTrue(all(["_wrapper" in v.name for v in weights]))
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapperV2, rnn_cell_impl.ResidualWrapperV2])
+  @test_util.run_in_graph_and_eager_modes
+  def testWrapperV2Build(self, wrapper):
+    cell = rnn_cell_impl.LSTMCell(10)
+    wrapper = wrapper(cell)
+    wrapper.build((1,))
+    self.assertTrue(cell.built)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class DropoutWrapperTest(test.TestCase, parameterized.TestCase):
+
+  def _testDropoutWrapper(self,
+                          batch_size=None,
+                          time_steps=None,
+                          parallel_iterations=None,
+                          wrapper_type=None,
+                          scope="root",
+                          **kwargs):
+    if batch_size is None and time_steps is None:
+      # 2 time steps, batch size 1, depth 3
+      batch_size = 1
+      time_steps = 2
+      x = constant_op.constant(
+          [[[2., 2., 2.]], [[1., 1., 1.]]], dtype=dtypes.float32)
+      m = rnn_cell_impl.LSTMStateTuple(
+          *[constant_op.constant([[0.1, 0.1, 0.1]], dtype=dtypes.float32)] * 2)
+    else:
+      x = constant_op.constant(
+          np.random.randn(time_steps, batch_size, 3).astype(np.float32))
+      m = rnn_cell_impl.LSTMStateTuple(*[
+          constant_op.
+          constant([[0.1, 0.1, 0.1]] * batch_size, dtype=dtypes.float32)] * 2)
+    outputs, final_state = rnn.dynamic_rnn(
+        cell=wrapper_type(
+            rnn_cell_impl.LSTMCell(
+                3, initializer=init_ops.constant_initializer(0.5)),
+            dtype=x.dtype, **kwargs),
+        time_major=True,
+        parallel_iterations=parallel_iterations,
+        inputs=x,
+        initial_state=m,
+        scope=scope)
+    self.evaluate([variables_lib.global_variables_initializer()])
+    res = self.evaluate([outputs, final_state])
+    self.assertEqual(res[0].shape, (time_steps, batch_size, 3))
+    self.assertEqual(res[1].c.shape, (batch_size, 3))
+    self.assertEqual(res[1].h.shape, (batch_size, 3))
+    return res
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
+  def testDropoutWrapperProperties(self, wrapper_type):
+    cell = rnn_cell_impl.BasicRNNCell(10)
+    wrapper = wrapper_type(cell)
+    # Github issue 15810
+    self.assertEqual(wrapper.wrapped_cell, cell)
+    self.assertEqual(wrapper.state_size, 10)
+    self.assertEqual(wrapper.output_size, 10)
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
+  def testDropoutWrapperZeroState(self, wrapper_type):
+    class _Cell(rnn_cell_impl.BasicRNNCell):
+
+      def zero_state(self, batch_size=None, dtype=None):
+        return "wrapped_cell_zero_state"
+    wrapper = wrapper_type(_Cell(10))
+    self.assertEqual(wrapper.zero_state(10, dtypes.float32),
+                     "wrapped_cell_zero_state")
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
+  def testDropoutWrapperKeepAllConstantInput(self, wrapper_type):
+    keep = array_ops.ones([])
+    res = self._testDropoutWrapper(
+        input_keep_prob=keep, output_keep_prob=keep, state_keep_prob=keep,
+        wrapper_type=wrapper_type)
+    true_full_output = np.array(
+        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
+        dtype=np.float32)
+    true_full_final_c = np.array(
+        [[1.949385, 1.949385, 1.949385]], dtype=np.float32)
+    self.assertAllClose(true_full_output, res[0])
+    self.assertAllClose(true_full_output[1], res[1].h)
+    self.assertAllClose(true_full_final_c, res[1].c)
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
+  def testDropoutWrapperKeepAll(self, wrapper_type):
+    keep = variable_scope.get_variable("all", initializer=1.0)
+    res = self._testDropoutWrapper(
+        input_keep_prob=keep, output_keep_prob=keep, state_keep_prob=keep,
+        wrapper_type=wrapper_type)
+    true_full_output = np.array(
+        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
+        dtype=np.float32)
+    true_full_final_c = np.array(
+        [[1.949385, 1.949385, 1.949385]], dtype=np.float32)
+    self.assertAllClose(true_full_output, res[0])
+    self.assertAllClose(true_full_output[1], res[1].h)
+    self.assertAllClose(true_full_final_c, res[1].c)
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
+  def testDropoutWrapperWithSeed(self, wrapper_type):
+    keep_some = 0.5
+    random_seed.set_random_seed(2)
+    ## Use parallel_iterations = 1 in both calls to
+    ## _testDropoutWrapper to ensure the (per-time step) dropout is
+    ## consistent across both calls.  Otherwise the seed may not end
+    ## up being munged consistently across both graphs.
+    res_standard_1 = self._testDropoutWrapper(
+        input_keep_prob=keep_some,
+        output_keep_prob=keep_some,
+        state_keep_prob=keep_some,
+        seed=10,
+        parallel_iterations=1,
+        wrapper_type=wrapper_type,
+        scope="root_1")
+    random_seed.set_random_seed(2)
+    res_standard_2 = self._testDropoutWrapper(
+        input_keep_prob=keep_some,
+        output_keep_prob=keep_some,
+        state_keep_prob=keep_some,
+        seed=10,
+        parallel_iterations=1,
+        wrapper_type=wrapper_type,
+        scope="root_2")
+    self.assertAllClose(res_standard_1[0], res_standard_2[0])
+    self.assertAllClose(res_standard_1[1].c, res_standard_2[1].c)
+    self.assertAllClose(res_standard_1[1].h, res_standard_2[1].h)
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
+  def testDropoutWrapperKeepNoOutput(self, wrapper_type):
+    keep_all = variable_scope.get_variable("all", initializer=1.0)
+    keep_none = variable_scope.get_variable("none", initializer=1e-6)
+    res = self._testDropoutWrapper(
+        input_keep_prob=keep_all,
+        output_keep_prob=keep_none,
+        state_keep_prob=keep_all,
+        wrapper_type=wrapper_type)
+    true_full_output = np.array(
+        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
+        dtype=np.float32)
+    true_full_final_c = np.array(
+        [[1.949385, 1.949385, 1.949385]], dtype=np.float32)
+    self.assertAllClose(np.zeros(res[0].shape), res[0])
+    self.assertAllClose(true_full_output[1], res[1].h)
+    self.assertAllClose(true_full_final_c, res[1].c)
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
+  def testDropoutWrapperKeepNoStateExceptLSTMCellMemory(self, wrapper_type):
+    keep_all = variable_scope.get_variable("all", initializer=1.0)
+    keep_none = variable_scope.get_variable("none", initializer=1e-6)
+    # Even though we dropout state, by default DropoutWrapper never
+    # drops out the memory ("c") term of an LSTMStateTuple.
+    res = self._testDropoutWrapper(
+        input_keep_prob=keep_all,
+        output_keep_prob=keep_all,
+        state_keep_prob=keep_none,
+        wrapper_type=wrapper_type)
+    true_c_state = np.array([[1.713925, 1.713925, 1.713925]], dtype=np.float32)
+    true_full_output = np.array(
+        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
+        dtype=np.float32)
+    self.assertAllClose(true_full_output[0], res[0][0])
+    # Second output is modified by zero input state
+    self.assertGreater(np.linalg.norm(true_full_output[1] - res[0][1]), 1e-4)
+    # h state has been set to zero
+    self.assertAllClose(np.zeros(res[1].h.shape), res[1].h)
+    # c state of an LSTMStateTuple is NEVER modified.
+    self.assertAllClose(true_c_state, res[1].c)
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
+  def testDropoutWrapperKeepNoInput(self, wrapper_type):
+    keep_all = variable_scope.get_variable("all", initializer=1.0)
+    keep_none = variable_scope.get_variable("none", initializer=1e-6)
+    true_full_output = np.array(
+        [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
+        dtype=np.float32)
+    true_full_final_c = np.array(
+        [[1.949385, 1.949385, 1.949385]], dtype=np.float32)
+    # All outputs are different because inputs are zeroed out
+    res = self._testDropoutWrapper(
+        input_keep_prob=keep_none,
+        output_keep_prob=keep_all,
+        state_keep_prob=keep_all,
+        wrapper_type=wrapper_type)
+    self.assertGreater(np.linalg.norm(res[0] - true_full_output), 1e-4)
+    self.assertGreater(np.linalg.norm(res[1].h - true_full_output[1]), 1e-4)
+    self.assertGreater(np.linalg.norm(res[1].c - true_full_final_c), 1e-4)
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
+  def testDropoutWrapperRecurrentOutput(self, wrapper_type):
+    keep_some = 0.8
+    keep_all = variable_scope.get_variable("all", initializer=1.0)
+    res = self._testDropoutWrapper(
+        input_keep_prob=keep_all,
+        output_keep_prob=keep_some,
+        state_keep_prob=keep_all,
+        variational_recurrent=True,
+        wrapper_type=wrapper_type,
+        input_size=3,
+        batch_size=5,
+        time_steps=7)
+    # Ensure the same dropout pattern for all time steps
+    output_mask = np.abs(res[0]) > 1e-6
+    for m in output_mask[1:]:
+      self.assertAllClose(output_mask[0], m)
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
+  def testDropoutWrapperRecurrentStateInputAndOutput(self, wrapper_type):
+    keep_some = 0.9
+    res = self._testDropoutWrapper(
+        input_keep_prob=keep_some,
+        output_keep_prob=keep_some,
+        state_keep_prob=keep_some,
+        variational_recurrent=True,
+        wrapper_type=wrapper_type,
+        input_size=3,
+        batch_size=5,
+        time_steps=7)
+
+    # Smoke test for the state/input masks.
+    output_mask = np.abs(res[0]) > 1e-6
+    for time_step in output_mask:
+      # Ensure the same dropout output pattern for all time steps
+      self.assertAllClose(output_mask[0], time_step)
+      for batch_entry in time_step:
+        # Assert all batch entries get the same mask
+        self.assertAllClose(batch_entry, time_step[0])
+
+    # For state, ensure all batch entries have the same mask
+    state_c_mask = np.abs(res[1].c) > 1e-6
+    state_h_mask = np.abs(res[1].h) > 1e-6
+    for batch_entry in state_c_mask:
+      self.assertAllClose(batch_entry, state_c_mask[0])
+    for batch_entry in state_h_mask:
+      self.assertAllClose(batch_entry, state_h_mask[0])
+
+  @parameterized.parameters(
+      [rnn_cell_impl.DropoutWrapper, rnn_cell_impl.DropoutWrapperV2])
+  def testDropoutWrapperRecurrentStateInputAndOutputWithSeed(
+      self, wrapper_type):
+    keep_some = 0.9
+    random_seed.set_random_seed(2347)
+    np.random.seed(23487)
+    res0 = self._testDropoutWrapper(
+        input_keep_prob=keep_some,
+        output_keep_prob=keep_some,
+        state_keep_prob=keep_some,
+        variational_recurrent=True,
+        wrapper_type=wrapper_type,
+        input_size=3,
+        batch_size=5,
+        time_steps=7,
+        seed=-234987,
+        scope="root_0")
+    random_seed.set_random_seed(2347)
+    np.random.seed(23487)
+    res1 = self._testDropoutWrapper(
+        input_keep_prob=keep_some,
+        output_keep_prob=keep_some,
+        state_keep_prob=keep_some,
+        variational_recurrent=True,
+        wrapper_type=wrapper_type,
+        input_size=3,
+        batch_size=5,
+        time_steps=7,
+        seed=-234987,
+        scope="root_1")
+
+    output_mask = np.abs(res0[0]) > 1e-6
+    for time_step in output_mask:
+      # Ensure the same dropout output pattern for all time steps
+      self.assertAllClose(output_mask[0], time_step)
+      for batch_entry in time_step:
+        # Assert all batch entries get the same mask
+        self.assertAllClose(batch_entry, time_step[0])
+
+    # For state, ensure all batch entries have the same mask
+    state_c_mask = np.abs(res0[1].c) > 1e-6
+    state_h_mask = np.abs(res0[1].h) > 1e-6
+    for batch_entry in state_c_mask:
+      self.assertAllClose(batch_entry, state_c_mask[0])
+    for batch_entry in state_h_mask:
+      self.assertAllClose(batch_entry, state_h_mask[0])
+
+    # Ensure seeded calculation is identical.
+    self.assertAllClose(res0[0], res1[0])
+    self.assertAllClose(res0[1].c, res1[1].c)
+    self.assertAllClose(res0[1].h, res1[1].h)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/rnn_test.py b/tensorflow/python/kernel_tests/rnn_test.py
index a49496e4ef15bc2772fe7abdac4d801b77787079..12b69da6c2e4806110b4af93042f94d5248d64e5 100644
--- a/tensorflow/python/kernel_tests/rnn_test.py
+++ b/tensorflow/python/kernel_tests/rnn_test.py
@@ -172,6 +172,26 @@ class RNNTest(test.TestCase):
           dtype=dtypes.float32,
           sequence_length=[[4]])
 
+  @test_util.run_in_graph_and_eager_modes
+  def testInvalidDtype(self):
+    if context.executing_eagerly():
+      inputs = np.zeros((3, 4, 5), dtype=np.int32)
+    else:
+      inputs = array_ops.placeholder(dtypes.int32, shape=(3, 4, 5))
+
+    cells = [
+        rnn_cell_impl.BasicRNNCell,
+        rnn_cell_impl.GRUCell,
+        rnn_cell_impl.BasicLSTMCell,
+        rnn_cell_impl.LSTMCell,
+    ]
+    for cell_cls in cells:
+      with self.cached_session():
+        with self.assertRaisesRegexp(
+            ValueError, "RNN cell only supports floating"):
+          cell = cell_cls(2, dtype=dtypes.int32)
+          rnn.dynamic_rnn(cell, inputs, dtype=dtypes.int32)
+
   @test_util.run_in_graph_and_eager_modes
   def testBatchSizeFromInput(self):
     cell = Plus1RNNCell()
diff --git a/tensorflow/python/kernel_tests/scan_ops_test.py b/tensorflow/python/kernel_tests/scan_ops_test.py
index 33e491fee1dadbcce225dfa70310d47a21b6893c..2a3021f982149f619eef2d32edd7929e0c8b7603 100644
--- a/tensorflow/python/kernel_tests/scan_ops_test.py
+++ b/tensorflow/python/kernel_tests/scan_ops_test.py
@@ -134,6 +134,7 @@ class CumsumTest(test.TestCase):
         self._compareAll(x, axis)
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("b/123860949")  # The computation is constant folded
   def testLarge(self):
     for dtype in self.valid_dtypes:
       x = np.ones([1000000], dtype=dtype) / 1024
diff --git a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
index 8510a08f0c96dd9ae08a2ca3e782cc7d28e86264..5bc301b61360584969e391b093a3f488dec06925 100644
--- a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
+++ b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
@@ -24,6 +24,7 @@ import numpy as np
 
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -217,7 +218,7 @@ class StatefulScatterNdTest(test.TestCase):
   def testVariableRankAdd(self):
     self._VariableRankTests(_NumpyAdd, state_ops.scatter_nd_add)
 
-  @test_util.run_v1_only("b/120545219")
+  @test_util.run_deprecated_v1
   def testVariableRankSub(self):
     self._VariableRankTests(_NumpySub, state_ops.scatter_nd_sub)
 
@@ -295,6 +296,7 @@ class StatefulScatterNdTest(test.TestCase):
                                     updates).get_shape().as_list(), shape)
 
   @test_util.run_v1_only("b/120545219")
+  @test_util.disable_xla("b/123337890")  # Error messages differ
   def testResVarInvalidOutputShape(self):
     res = variables.Variable(
         initial_value=lambda: array_ops.zeros(shape=[], dtype=dtypes.float32),
@@ -325,7 +327,7 @@ class StatefulScatterNdTest(test.TestCase):
     shape = np.array([2, 2, 2])
     ref = variables.Variable(array_ops.zeros(shape, dtypes.int32))
     with self.assertRaisesWithPredicateMatch(
-        ValueError, "The outer \\d+ dimensions of indices\\.shape="):
+        ValueError, r"The outer \d+ dimensions of indices\.shape="):
       state_ops.scatter_nd_update(ref, indices, updates)
 
   @test_util.run_deprecated_v1
@@ -335,7 +337,7 @@ class StatefulScatterNdTest(test.TestCase):
     shape = np.array([2, 2, 2])
     ref = variables.Variable(array_ops.zeros(shape, dtypes.int32))
     with self.assertRaisesWithPredicateMatch(
-        ValueError, "The inner \\d+ dimensions of input\\.shape="):
+        ValueError, r"The inner \d+ dimensions of input\.shape="):
       state_ops.scatter_nd_update(ref, indices, updates)
 
   @test_util.run_deprecated_v1
@@ -539,7 +541,7 @@ class ScatterNdTest(test.TestCase):
     updates = array_ops.zeros([2, 2, 2], dtypes.int32)
     shape = np.array([2, 2, 2])
     with self.assertRaisesWithPredicateMatch(
-        ValueError, "The outer \\d+ dimensions of indices\\.shape="):
+        ValueError, r"The outer \d+ dimensions of indices\.shape="):
       self.scatter_nd(indices, updates, shape)
 
   @test_util.run_deprecated_v1
@@ -548,7 +550,7 @@ class ScatterNdTest(test.TestCase):
     updates = array_ops.zeros([2, 2], dtypes.int32)
     shape = np.array([2, 2, 2])
     with self.assertRaisesWithPredicateMatch(
-        ValueError, "The inner \\d+ dimensions of (input|output)\\.shape="):
+        ValueError, r"The inner \d+ dimensions of (input|output)\.shape="):
       self.scatter_nd(indices, updates, shape)
 
   @test_util.run_deprecated_v1
@@ -749,6 +751,16 @@ class ScatterNdTensorTest(test.TestCase):
       self.assertLess(err_added_wrt_updates, 2e-4)
       self.assertLess(err_subbed_wrt_updates, 2e-4)
 
+  def testTensorScatterUpdateWithForwarding(self):
+    @def_function.function
+    def _TestFn():
+      indices = constant_op.constant([[4], [3], [1], [7]])
+      updates = constant_op.constant([9, 10, 11, 12], dtype=dtypes.float32)
+      t = array_ops.ones([8], dtype=dtypes.float32)
+
+      return array_ops.tensor_scatter_update(t, indices, updates)
+
+    self.assertAllEqual(_TestFn(), [1, 11, 1, 10, 9, 1, 1, 12])
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/scatter_ops_test.py b/tensorflow/python/kernel_tests/scatter_ops_test.py
index 623c17d373cc7231d7191b715a77b6a3cf8701fc..ce7e0c04c861dcbeee85d496496b3e657b883e56 100644
--- a/tensorflow/python/kernel_tests/scatter_ops_test.py
+++ b/tensorflow/python/kernel_tests/scatter_ops_test.py
@@ -192,6 +192,10 @@ class ScatterTest(test.TestCase):
     if tf_scatter != state_ops.scatter_div:
       vtypes.append(np.int32)
 
+    if (tf_scatter == state_ops.scatter_min or
+        tf_scatter == state_ops.scatter_max):
+      vtypes.append(np.float16)
+
     for vtype in vtypes:
       for itype in (np.int32, np.int64):
         self._VariableRankTest(tf_scatter, vtype, itype, repeat_indices,
diff --git a/tensorflow/python/kernel_tests/session_ops_test.py b/tensorflow/python/kernel_tests/session_ops_test.py
index bc5d8e81511494ea82bbf703544ec36448b5e982..7d422278408207a3abcccf58921ec94b018a2cea 100644
--- a/tensorflow/python/kernel_tests/session_ops_test.py
+++ b/tensorflow/python/kernel_tests/session_ops_test.py
@@ -29,9 +29,9 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
-@test_util.run_v1_only("b/120545219")
 class SessionOpsTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testHandleBasic(self):
     with self.cached_session() as sess:
       # Return a handle.
@@ -46,6 +46,7 @@ class SessionOpsTest(test.TestCase):
       y = math_ops.multiply(x, 10)
       self.assertEqual(500, sess.run(y, feed_dict={f: h.handle}))
 
+  @test_util.run_deprecated_v1
   def testHandleEval(self):
     with self.cached_session() as sess:
       # Return a handle.
@@ -58,6 +59,7 @@ class SessionOpsTest(test.TestCase):
       # Get the tensor from its handle.
       self.assertEqual(50, h.eval())
 
+  @test_util.run_deprecated_v1
   def testHandleAndValue(self):
     with self.cached_session() as sess:
       # Return a handle and a value.
@@ -71,6 +73,7 @@ class SessionOpsTest(test.TestCase):
       self.assertEqual(50, h.eval())
       self.assertEqual(500, v)
 
+  @test_util.run_deprecated_v1
   def testHandleCond(self):
     with self.cached_session() as sess:
       # Return a handle and a value
@@ -91,6 +94,7 @@ class SessionOpsTest(test.TestCase):
 
       self.assertEqual(5000, result)
 
+  @test_util.run_deprecated_v1
   def testHandleForLoop(self):
     with self.cached_session() as sess:
       # Initialize a handle.
@@ -108,6 +112,7 @@ class SessionOpsTest(test.TestCase):
 
       self.assertEqual(100, h.eval())
 
+  @test_util.run_deprecated_v1
   def testHandleWhileLoop(self):
     with self.cached_session() as sess:
       # Initialize a handle.
@@ -128,6 +133,7 @@ class SessionOpsTest(test.TestCase):
 
       self.assertEqual(101, h.eval())
 
+  @test_util.run_deprecated_v1
   def testHandleMover(self):
     with self.cached_session() as sess:
       # Return a handle.
@@ -149,6 +155,7 @@ class SessionOpsTest(test.TestCase):
         h = self.evaluate(h)
         self.assertEqual(100, sess.run(y, feed_dict={f: h.handle}))
 
+  @test_util.run_deprecated_v1
   def testHandleDelete(self):
     with self.cached_session() as sess:
       # Return a handle.
@@ -158,6 +165,7 @@ class SessionOpsTest(test.TestCase):
       h = session_ops.get_session_handle(c)
       self.evaluate(h).delete()
 
+  @test_util.run_deprecated_v1
   def testHandleDeleteRaw(self):
     with self.cached_session() as sess:
       # Return a handle.
@@ -172,6 +180,7 @@ class SessionOpsTest(test.TestCase):
       f, x = session_ops.delete_session_tensor(raw_h)
       sess.run(x, feed_dict={f: raw_h})
 
+  @test_util.run_deprecated_v1
   def testMultiDevices(self):
     with self.cached_session() as sess:
       with ops.device(test.gpu_device_name()):
@@ -190,6 +199,7 @@ class SessionOpsTest(test.TestCase):
                      b_p: b_handle.handle})
       self.assertEqual(3.0, c_handle.eval())
 
+  @test_util.run_deprecated_v1
   def testHandleGC(self):
     with self.cached_session() as sess:
       # initial values live on CPU
@@ -214,6 +224,7 @@ class SessionOpsTest(test.TestCase):
             feed_dict={add_h1: one_handle.handle,
                        add_h2: x_handle.handle})
 
+  @test_util.run_deprecated_v1
   def testHandlePlacement(self):
     with self.cached_session() as sess:
       a = constant_op.constant(1.0)
@@ -234,7 +245,7 @@ class SessionOpsTest(test.TestCase):
                      b_p: b_handle.handle})
       self.assertEqual(3.0, c_handle.eval())
 
-  @test_util.run_v1_only("b/120545219")
+  @test_util.run_deprecated_v1
   def testFeedOneHandleDirectly(self):
     with self.cached_session() as sess:
       a = constant_op.constant(10.0)
@@ -246,7 +257,7 @@ class SessionOpsTest(test.TestCase):
 
       self.assertAllClose(2500.0, sess.run(d, feed_dict={c: h_c}))
 
-  @test_util.run_v1_only("b/120545219")
+  @test_util.run_deprecated_v1
   def testDirectHandleFeedOverlappingWithFetches(self):
     with self.cached_session() as sess:
       a = constant_op.constant(10.0)
@@ -273,6 +284,7 @@ class SessionOpsTest(test.TestCase):
       self.assertAllClose(50.0, c_val)
       self.assertAllClose(50.0, d_val)
 
+  @test_util.run_deprecated_v1
   def testFeedTwoHandlesDirectly(self):
     with self.cached_session() as sess:
       a = constant_op.constant(10.0)
@@ -287,7 +299,7 @@ class SessionOpsTest(test.TestCase):
       self.assertAllClose(48.0, sess.run(e, feed_dict={c: h_c, d: h_d}))
       self.assertAllClose(-48.0, sess.run(e, feed_dict={c: h_d, d: h_c}))
 
-  @test_util.run_v1_only("b/120545219")
+  @test_util.run_deprecated_v1
   def testFeedHandleToVariableDirectly(self):
     with self.cached_session() as sess:
       a = variables.Variable(12.0)
diff --git a/tensorflow/python/kernel_tests/signal/BUILD b/tensorflow/python/kernel_tests/signal/BUILD
index 8f4e31abe3c90af01029be719ee83c7c7dc42f0c..4caecc85ca5a1ab72648e015666a5666cf5335ab 100644
--- a/tensorflow/python/kernel_tests/signal/BUILD
+++ b/tensorflow/python/kernel_tests/signal/BUILD
@@ -29,6 +29,8 @@ cuda_py_tests(
         "//tensorflow/python:spectral_ops_test_util",
         "//tensorflow/python/ops/signal",
     ],
+    tags = ["no_rocm"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_tests(
@@ -44,7 +46,11 @@ cuda_py_tests(
         "//tensorflow/python/ops/signal",
     ],
     shard_count = 4,
-    tags = ["optonly"],
+    tags = [
+        "no_rocm",
+        "optonly",
+    ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_tests(
@@ -56,6 +62,7 @@ cuda_py_tests(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python/ops/signal",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_tests(
@@ -70,6 +77,8 @@ cuda_py_tests(
         "//tensorflow/python/ops/signal",
         "//tensorflow/python:spectral_ops_test_util",
     ],
+    tags = ["no_rocm"],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_tests(
@@ -87,6 +96,7 @@ cuda_py_tests(
         "//tensorflow/python/ops/signal",
         "//tensorflow/python:platform_test",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_tests(
@@ -104,6 +114,7 @@ cuda_py_tests(
         "//tensorflow/python/ops/signal",
         "//tensorflow/python:platform_test",
     ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_tests(
@@ -124,7 +135,11 @@ cuda_py_tests(
         "//tensorflow/python:spectral_ops_test_util",
         "//tensorflow/python/ops/signal",
     ],
-    tags = ["nomac"],
+    tags = [
+        "no_rocm",
+        "nomac",
+    ],
+    xla_enable_strict_auto_jit = True,
 )
 
 cuda_py_tests(
@@ -140,4 +155,5 @@ cuda_py_tests(
         "//tensorflow/python/ops/signal",
         "//tensorflow/python:platform_test",
     ],
+    xla_enable_strict_auto_jit = True,
 )
diff --git a/tensorflow/python/kernel_tests/signal/fft_ops_test.py b/tensorflow/python/kernel_tests/signal/fft_ops_test.py
index 5b1053428c0096c15fce7c4fa7b46d5999602057..4577587fa4e05b1e2fe3353c4a12ac37744bddb6 100644
--- a/tensorflow/python/kernel_tests/signal/fft_ops_test.py
+++ b/tensorflow/python/kernel_tests/signal/fft_ops_test.py
@@ -465,6 +465,7 @@ class RFFTOpsTest(BaseFFTOpsTest):
                 gen_complex(complex_dims), rank, (size,) * rank)
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("b/123738986")  # More assertions needed.
   def testError(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for rank in VALID_FFT_RANKS:
diff --git a/tensorflow/python/kernel_tests/softmax_op_test.py b/tensorflow/python/kernel_tests/softmax_op_test.py
index 707b8a429f2be1fcce39516d368e2b7a05570652..a82492996a48448c3e5829ee6a8cede0bf20ad92 100644
--- a/tensorflow/python/kernel_tests/softmax_op_test.py
+++ b/tensorflow/python/kernel_tests/softmax_op_test.py
@@ -201,6 +201,15 @@ class SoftmaxTest(test.TestCase):
         use_gpu=False)
     self._testOverflow(use_gpu=False)
 
+  def testAlongNegativeDimension(self):
+    self._testSoftmax(
+        np.array([[[1., 1., 1., 1.], [1., 2., 3., 4.]],
+                  [[2., 3., 4., 5.], [6., 7., 8., 9.]],
+                  [[5., 4., 3., 2.], [1., 2., 3., 4.]]]).astype(np.float32),
+        dim=-2,
+        use_gpu=False)
+    self._testOverflow(use_gpu=False)
+
   def testShapeInference(self):
     op = nn_ops.softmax([[[1., 1., 1., 1.], [1., 2., 3., 4.]],
                          [[2., 3., 4., 5.], [6., 7., 8., 9.]],
diff --git a/tensorflow/python/kernel_tests/spacetodepth_op_test.py b/tensorflow/python/kernel_tests/spacetodepth_op_test.py
index e96bc09f3652aaa4d41bddac6ad06daaff8bfbd6..69243afb69c90b53d9e470a2d81f4067d1c2191e 100644
--- a/tensorflow/python/kernel_tests/spacetodepth_op_test.py
+++ b/tensorflow/python/kernel_tests/spacetodepth_op_test.py
@@ -285,6 +285,7 @@ class SpaceToDepthTest(test.TestCase):
       actual_vals, expected_vals = self.evaluate([actual, expected])
       self.assertTrue(np.array_equal(actual_vals, expected_vals))
 
+  @test_util.disable_xla("b/123553551")  # Unsupported data format
   def testAgainstTranspose(self):
     self.compareToTranspose(3, 2, 3, 1, 2, "NHWC", False)
     self.compareToTranspose(1, 2, 3, 2, 2, "NHWC", False)
diff --git a/tensorflow/python/kernel_tests/sparse_ops_test.py b/tensorflow/python/kernel_tests/sparse_ops_test.py
index 7598991489ce6019352e19cb6c50819d91085b0d..ede12d1c83fb559f2164c0e7f46640315d0ced62 100644
--- a/tensorflow/python/kernel_tests/sparse_ops_test.py
+++ b/tensorflow/python/kernel_tests/sparse_ops_test.py
@@ -72,11 +72,10 @@ class SparseToIndicatorTest(test_util.TensorFlowTestCase):
         constant_op.constant(val, dtype),
         constant_op.constant(shape, dtypes.int64))
 
-  @test_util.run_deprecated_v1
   def testInt32(self):
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       sp_input = self._SparseTensor_5x6(dtypes.int32)
-      output = sparse_ops.sparse_to_indicator(sp_input, 50).eval()
+      output = sparse_ops.sparse_to_indicator(sp_input, 50)
 
       expected_output = np.zeros((5, 50), dtype=np.bool)
       expected_trues = ((0, 0), (1, 10), (1, 13), (1, 14), (3, 32), (3, 33))
@@ -85,11 +84,10 @@ class SparseToIndicatorTest(test_util.TensorFlowTestCase):
 
       self.assertAllEqual(output, expected_output)
 
-  @test_util.run_deprecated_v1
   def testInt64(self):
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       sp_input = self._SparseTensor_5x6(dtypes.int64)
-      output = sparse_ops.sparse_to_indicator(sp_input, 50).eval()
+      output = sparse_ops.sparse_to_indicator(sp_input, 50)
 
       expected_output = np.zeros((5, 50), dtype=np.bool)
       expected_trues = [(0, 0), (1, 10), (1, 13), (1, 14), (3, 32), (3, 33)]
@@ -98,11 +96,10 @@ class SparseToIndicatorTest(test_util.TensorFlowTestCase):
 
       self.assertAllEqual(output, expected_output)
 
-  @test_util.run_deprecated_v1
   def testHigherRank(self):
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       sp_input = self._SparseTensor_2x3x4(dtypes.int64)
-      output = sparse_ops.sparse_to_indicator(sp_input, 200).eval()
+      output = sparse_ops.sparse_to_indicator(sp_input, 200)
 
       expected_output = np.zeros((2, 3, 200), dtype=np.bool)
       expected_trues = [(0, 0, 1), (0, 1, 10), (0, 1, 12), (1, 0, 103),
@@ -151,7 +148,7 @@ class SparseMergeTest(test_util.TensorFlowTestCase):
   def testInt32AndFloat32(self):
     vocab_size = 50
     indices_v, values_v = self._SparseTensorValue_3x50(np.int32, np.float32)
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       for indices in (indices_v,
                       sparse_tensor.SparseTensor.from_value(indices_v)):
         for values in (values_v,
@@ -163,7 +160,7 @@ class SparseMergeTest(test_util.TensorFlowTestCase):
 
   def testInt64AndFloat32(self):
     vocab_size = 50
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       indices, values = self._SparseTensor_3x50(np.int64, np.float32)
       sp_output = sparse_ops.sparse_merge(indices, values, vocab_size)
 
@@ -172,7 +169,7 @@ class SparseMergeTest(test_util.TensorFlowTestCase):
 
   def testInt64AndFloat64(self):
     vocab_size = 50
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       indices, values = self._SparseTensor_3x50(np.int64, np.float64)
       sp_output = sparse_ops.sparse_merge(indices, values, vocab_size)
 
@@ -181,7 +178,7 @@ class SparseMergeTest(test_util.TensorFlowTestCase):
 
   def testInt32AndFloat32NonCanonicalOrder(self):
     vocab_size = 50
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       indices, values = self._SparseTensor_3x50(np.int32, np.float32)
       sp_output = sparse_ops.sparse_merge(
           indices, values, vocab_size, already_sorted=True)
@@ -191,7 +188,7 @@ class SparseMergeTest(test_util.TensorFlowTestCase):
 
   def testInt64AndFloat32NonCanonicalOrder(self):
     vocab_size = 50
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       indices, values = self._SparseTensor_3x50(np.int64, np.float32)
       sp_output = sparse_ops.sparse_merge(
           indices, values, vocab_size, already_sorted=True)
@@ -202,7 +199,7 @@ class SparseMergeTest(test_util.TensorFlowTestCase):
   def testInt64AndFloat64NonCanonicalOrder(self):
     vocab_size = 50
     vocab_size_tensor = constant_op.constant(vocab_size, dtypes.int64)
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       indices, values = self._SparseTensor_3x50(np.int64, np.float64)
       sp_output = sparse_ops.sparse_merge(
           indices, values, vocab_size_tensor, already_sorted=True)
@@ -261,7 +258,7 @@ class SparseMergeHighDimTest(test_util.TensorFlowTestCase):
 
   def testInt64AndFloat32(self):
     vocab_size = [50, 31]
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       indices, values = self._SparseTensor_3x50(np.int64, np.float32)
       sp_output = sparse_ops.sparse_merge(indices, values, vocab_size)
 
@@ -270,7 +267,7 @@ class SparseMergeHighDimTest(test_util.TensorFlowTestCase):
 
   def testInt64AndFloat64(self):
     vocab_size = [50, 31]
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       indices, values = self._SparseTensor_3x50(np.int64, np.float64)
       sp_output = sparse_ops.sparse_merge(indices, values, vocab_size)
 
@@ -279,7 +276,7 @@ class SparseMergeHighDimTest(test_util.TensorFlowTestCase):
 
   def testInt64AndFloat64Shape(self):
     vocab_size = [50, 30]
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       indices, values = self._SparseTensor_3x50(np.int64, np.float64)
       sp_output = sparse_ops.sparse_merge(indices, values, vocab_size)
 
@@ -300,9 +297,8 @@ class SparseRetainTest(test_util.TensorFlowTestCase):
   def _SparseTensor_5x6(self):
     return sparse_tensor.SparseTensor.from_value(self._SparseTensorValue_5x6())
 
-  @test_util.run_deprecated_v1
   def testBasic(self):
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       for sp_input in (self._SparseTensorValue_5x6(), self._SparseTensor_5x6()):
         to_retain = np.array([1, 0, 0, 1, 1, 0], dtype=np.bool)
         sp_output = sparse_ops.sparse_retain(sp_input, to_retain)
@@ -314,7 +310,7 @@ class SparseRetainTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(output.dense_shape, [5, 6])
 
   def testRetainNone(self):
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       sp_input = self._SparseTensor_5x6()
       to_retain = np.zeros((6,), dtype=np.bool)
       sp_output = sparse_ops.sparse_retain(sp_input, to_retain)
@@ -326,7 +322,7 @@ class SparseRetainTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(output.dense_shape, [5, 6])
 
   def testMismatchedRetainShape(self):
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       sp_input = self._SparseTensor_5x6()
       to_retain = np.array([1, 0, 0, 1, 0], dtype=np.bool)
       with self.assertRaises(ValueError):
@@ -358,16 +354,14 @@ class SparseResetShapeTest(test_util.TensorFlowTestCase):
     return sparse_tensor.SparseTensorValue(self._IND_2_5_6, self._VAL_2_5_6,
                                            self._SHP_2_5_6)
 
-  @test_util.run_deprecated_v1
   def testStaticShapeInfoPreservedWhenNewShapeIsProvidedAndStatic(self):
     sp_input = self._SparseTensor_2x5x6()
     new_shape = np.array([3, 6, 7], dtype=np.int64)
     sp_output = sparse_ops.sparse_reset_shape(sp_input, new_shape)
     self.assertAllEqual([3, 6, 7], sp_output.get_shape())
 
-  @test_util.run_deprecated_v1
   def testBasic(self):
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       sp_input = self._SparseTensor_2x5x6()
       new_shape = np.array([3, 6, 7], dtype=np.int64)
       sp_output = sparse_ops.sparse_reset_shape(sp_input, new_shape)
@@ -379,9 +373,8 @@ class SparseResetShapeTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(output.values, [0, 10, 13, 14, 32, 33])
       self.assertAllEqual(output.dense_shape, [3, 6, 7])
 
-  @test_util.run_deprecated_v1
   def testInputUnavailableInGraphConstructionOk(self):
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       sp_input = self._SparseTensorValue_2x5x6()
       new_shape = np.array([3, 6, 7], dtype=np.int64)
       sp_output = sparse_ops.sparse_reset_shape(sp_input, new_shape)
@@ -409,7 +402,7 @@ class SparseResetShapeTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(output.dense_shape, [3, 6, 7])
 
   def testTightBoundingBox(self):
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       sp_input = self._SparseTensor_2x5x6()
       sp_output = sparse_ops.sparse_reset_shape(sp_input)
 
@@ -421,7 +414,7 @@ class SparseResetShapeTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(output.dense_shape, [2, 4, 5])
 
   def testTightBoundingBoxEmpty(self):
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       sp_input = self._SparseTensor_2x5x6_Empty()
       sp_output = sparse_ops.sparse_reset_shape(sp_input)
 
@@ -431,9 +424,8 @@ class SparseResetShapeTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(output.values.shape, [0])
       self.assertAllEqual(output.dense_shape, [0, 0, 0])
 
-  @test_util.run_deprecated_v1
   def testInvalidRank(self):
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       sp_input = self._SparseTensor_2x5x6()
       new_shape = np.array([3, 7], dtype=np.int64)
 
@@ -450,7 +442,6 @@ class SparseResetShapeTest(test_util.TensorFlowTestCase):
       with self.assertRaisesOpError("x == y did not hold element-wise"):
         sess.run(out, feed_dict={new_shape: np.array([3, 7], dtype=np.int64)})
 
-  @test_util.run_deprecated_v1
   def testInvalidDimensionSizeStatic(self):
     sp_input = self._SparseTensor_2x5x6()
     new_shape = np.array([3, 7, 5], dtype=np.int64)
@@ -510,14 +501,13 @@ class SparseFillEmptyRowsTest(test_util.TensorFlowTestCase):
         constant_op.constant(val, dtypes.int32),
         constant_op.constant(shape, dtypes.int64))
 
-  @test_util.run_deprecated_v1
   def testFillNumber(self):
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       for sp_input in (self._SparseTensorValue_5x6(), self._SparseTensor_5x6()):
         sp_output, empty_row_indicator = (
             sparse_ops.sparse_fill_empty_rows(sp_input, -1))
 
-        output, empty_row_indicator_out = sess.run(
+        output, empty_row_indicator_out = self.evaluate(
             [sp_output, empty_row_indicator])
 
         self.assertAllEqual(
@@ -530,7 +520,7 @@ class SparseFillEmptyRowsTest(test_util.TensorFlowTestCase):
 
   @test_util.run_deprecated_v1
   def testFillFloat(self):
-    with self.session(use_gpu=False) as sess:
+    with self.session(use_gpu=False):
       values = constant_op.constant(
           [0.0, 10.0, 13.0, 14.0, 32.0, 33.0], dtype=dtypes.float64)
       default_value = constant_op.constant(-1.0, dtype=dtypes.float64)
@@ -540,7 +530,7 @@ class SparseFillEmptyRowsTest(test_util.TensorFlowTestCase):
           dense_shape=np.array([5, 6]))
       sp_output, empty_row_indicator = (sparse_ops.sparse_fill_empty_rows(
           sp_input, default_value))
-      output, empty_row_indicator_out = sess.run(
+      output, empty_row_indicator_out = self.evaluate(
           [sp_output, empty_row_indicator])
 
       self.assertAllEqual(output.indices, [[0, 0], [1, 0], [1, 3], [1, 4],
@@ -563,14 +553,13 @@ class SparseFillEmptyRowsTest(test_util.TensorFlowTestCase):
       self.assertGreater(default_value_grad_err, 0)
       self.assertLess(default_value_grad_err, 1e-8)
 
-  @test_util.run_deprecated_v1
   def testFillString(self):
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       sp_input = self._SparseTensor_String5x6()
       sp_output, empty_row_indicator = (
           sparse_ops.sparse_fill_empty_rows(sp_input, ""))
 
-      output, empty_row_indicator_out = sess.run(
+      output, empty_row_indicator_out = self.evaluate(
           [sp_output, empty_row_indicator])
 
       self.assertAllEqual(
@@ -582,14 +571,13 @@ class SparseFillEmptyRowsTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(empty_row_indicator_out,
                           np.array([0, 0, 1, 0, 1]).astype(np.bool))
 
-  @test_util.run_deprecated_v1
   def testNoEmptyRows(self):
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       sp_input = self._SparseTensor_2x6()
       sp_output, empty_row_indicator = (
           sparse_ops.sparse_fill_empty_rows(sp_input, -1))
 
-      output, empty_row_indicator_out = sess.run(
+      output, empty_row_indicator_out = self.evaluate(
           [sp_output, empty_row_indicator])
 
       self.assertAllEqual(output.indices, [[0, 0], [1, 0], [1, 3], [1, 4]])
@@ -600,7 +588,6 @@ class SparseFillEmptyRowsTest(test_util.TensorFlowTestCase):
 
 class SparseAddTest(test_util.TensorFlowTestCase):
 
-  @test_util.run_deprecated_v1
   def testValuesInVariable(self):
     indices = constant_op.constant([[1]], dtype=dtypes.int64)
     values = variables.Variable([1], trainable=False, dtype=dtypes.float32)
@@ -609,7 +596,7 @@ class SparseAddTest(test_util.TensorFlowTestCase):
     sp_input = sparse_tensor.SparseTensor(indices, values, shape)
     sp_output = sparse_ops.sparse_add(sp_input, sp_input)
 
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       self.evaluate(variables.global_variables_initializer())
       output = self.evaluate(sp_output)
       self.assertAllEqual(output.values, [2])
@@ -625,7 +612,7 @@ class SparseReduceTest(test_util.TensorFlowTestCase):
   dense_shape = np.array([2, 3]).astype(np.int64)
 
   def _compare(self, sp_t, reduction_axes, ndims, keep_dims, do_sum):
-    densified = sparse_ops.sparse_tensor_to_dense(sp_t).eval()
+    densified = self.evaluate(sparse_ops.sparse_tensor_to_dense(sp_t))
 
     np_ans = densified
     if reduction_axes is None:
@@ -665,7 +652,7 @@ class SparseReduceTest(test_util.TensorFlowTestCase):
                                                             reduction_axes,
                                                             keep_dims)
       # Convert to dense for comparison purposes.
-      out_sparse = sparse_ops.sparse_tensor_to_dense(tf_sparse_ans).eval()
+      out_sparse = sparse_ops.sparse_tensor_to_dense(tf_sparse_ans)
 
     self.assertAllClose(np_ans, out_dense)
     self.assertAllClose(np_ans, out_sparse)
@@ -676,14 +663,13 @@ class SparseReduceTest(test_util.TensorFlowTestCase):
     self._compare(sp_t, reduction_axes, ndims, True, False)
     self._compare(sp_t, reduction_axes, ndims, True, True)
 
-  @test_util.run_deprecated_v1
   def testSimpleAndRandomInputs(self):
     if np.__version__ == "1.13.0":
       self.skipTest("numpy 1.13.0 bug")
 
     sp_t = sparse_tensor.SparseTensor(self.ind, self.vals, self.dense_shape)
 
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       self._compare_all(sp_t, None, ndims=2)
       self._compare_all(sp_t, 0, ndims=2)
       self._compare_all(sp_t, [1], ndims=2)
@@ -694,7 +680,7 @@ class SparseReduceTest(test_util.TensorFlowTestCase):
 
     np.random.seed(1618)
     test_dims = [(1618, 1, 11, 7, 1), (1,), (1, 1, 1)]
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       for dims in test_dims:
         sp_t, unused_nnz = _sparsify(np.random.randn(*dims))
         # reduce all using None
@@ -706,15 +692,15 @@ class SparseReduceTest(test_util.TensorFlowTestCase):
 
   def testInvalidAxes(self):
     sp_t = sparse_tensor.SparseTensor(self.ind, self.vals, self.dense_shape)
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       with self.assertRaisesOpError("Invalid reduction dimension -3"):
-        sparse_ops.sparse_reduce_sum(sp_t, -3).eval()
+        self.evaluate(sparse_ops.sparse_reduce_sum(sp_t, -3))
       with self.assertRaisesOpError("Invalid reduction dimension 2"):
-        sparse_ops.sparse_reduce_sum(sp_t, 2).eval()
+        self.evaluate(sparse_ops.sparse_reduce_sum(sp_t, 2))
       with self.assertRaisesOpError("Invalid reduction dimension -3"):
-        sparse_ops.sparse_reduce_max(sp_t, -3).eval()
+        self.evaluate(sparse_ops.sparse_reduce_max(sp_t, -3))
       with self.assertRaisesOpError("Invalid reduction dimension 2"):
-        sparse_ops.sparse_reduce_max(sp_t, 2).eval()
+        self.evaluate(sparse_ops.sparse_reduce_max(sp_t, 2))
 
   @test_util.run_deprecated_v1
   def testGradient(self):
@@ -745,7 +731,7 @@ class SparseReduceTest(test_util.TensorFlowTestCase):
 
   def _testSparseReduceShape(self, sp_t, reduction_axes, ndims, keep_dims,
                              do_sum):
-    densified = sparse_ops.sparse_tensor_to_dense(sp_t).eval()
+    densified = self.evaluate(sparse_ops.sparse_tensor_to_dense(sp_t))
 
     np_op = np.sum
     tf_op = sparse_ops.sparse_reduce_sum
@@ -773,7 +759,7 @@ class SparseReduceTest(test_util.TensorFlowTestCase):
   def testSparseReduceSumOrMaxShape(self):
     sp_t = sparse_tensor.SparseTensor(self.ind, self.vals, self.dense_shape)
 
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       for do_sum in [True, False]:
         for keep_dims in [True, False]:
           self._testSparseReduceShape(sp_t, None, 2, keep_dims, do_sum)
@@ -790,19 +776,17 @@ class SparseMathOpsTest(test_util.TensorFlowTestCase):
   def _check(self, result_tensor, result_np, input_sp_t):
     self.assertTrue(isinstance(result_tensor, sparse_tensor.SparseTensor))
     self.assertTrue(isinstance(input_sp_t, sparse_tensor.SparseTensor))
-    self.assertAllEqual(input_sp_t.indices.eval(), result_tensor.indices.eval())
-    self.assertAllEqual(input_sp_t.dense_shape.eval(),
-                        result_tensor.dense_shape.eval())
+    self.assertAllEqual(input_sp_t.indices, result_tensor.indices)
+    self.assertAllEqual(input_sp_t.dense_shape, result_tensor.dense_shape)
 
-    res_densified = sparse_ops.sparse_to_dense(result_tensor.indices,
-                                               result_tensor.dense_shape,
-                                               result_tensor.values).eval()
+    res_densified = sparse_ops.sparse_to_dense(
+        result_tensor.indices, result_tensor.dense_shape, result_tensor.values)
     self.assertAllEqual(result_np, res_densified)
 
   @test_util.run_deprecated_v1
   def testCwiseShapeValidation(self):
     # Test case for GitHub 24072.
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       a = array_ops.ones([3, 4, 1], dtype=dtypes.int32)
       b = sparse_tensor.SparseTensor([[0, 0, 1, 0], [0, 0, 3, 0]], [10, 20],
                                      [1, 1, 4, 2])
@@ -810,21 +794,20 @@ class SparseMathOpsTest(test_util.TensorFlowTestCase):
       with self.assertRaisesRegexp(
           errors.InvalidArgumentError,
           "broadcasts dense to sparse only; got incompatible shapes"):
-        c.eval()
+        self.evaluate(c)
 
-  @test_util.run_deprecated_v1
   def testCwiseDivAndMul(self):
     np.random.seed(1618)
     sp_shapes = [(10, 10, 10), (5, 5), (1618,), (3, 3, 7)]
     dense_shapes = [(10, 10, 1), (5, 5), (1,), (1, 7)]
 
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       for dtype in [np.float32, np.float64, np.int32, np.int64]:
         for sp_shape, dense_shape in zip(sp_shapes, dense_shapes):
           sp_vals_np = np.random.rand(*sp_shape).astype(dtype) + 1
           dense_vals_np = np.random.rand(*dense_shape).astype(dtype) + 1
           sp_t, unused_nnz = _sparsify(sp_vals_np, thresh=1.5)
-          sp_t_densified = sparse_ops.sparse_tensor_to_dense(sp_t).eval()
+          sp_t_densified = sparse_ops.sparse_tensor_to_dense(sp_t)
           dense_t = constant_op.constant(dense_vals_np)
 
           self._check(sp_t / dense_t, sp_t_densified / dense_vals_np, sp_t)
@@ -834,11 +817,10 @@ class SparseMathOpsTest(test_util.TensorFlowTestCase):
 
           if dtype in [np.int32, np.int64]:
             res = sp_t / dense_t  # should invoke "__truediv__"
-            self.assertEqual(res.values.eval().dtype, np.float64)
+            self.assertEqual(res.values.dtype, np.float64)
 
-  @test_util.run_deprecated_v1
   def testCwiseAdd(self):
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       # Identity(2) + AllOnes(2,2).  Should be equal to 2 * Identity(2).
       indices = [[0, 0], [1, 1]]
       vals = [1, 1]
@@ -901,16 +883,15 @@ class SparseSoftmaxTest(test_util.TensorFlowTestCase):
       batched_sp_t, unused_nnz1 = _sparsify(
           sp_vals_np.reshape((1, n, m)), thresh=0.)  # No masking.
 
-      with self.cached_session(use_gpu=False):
+      with test_util.force_cpu():
         densified = constant_op.constant(sp_vals_np)
 
-        sp_result = sparse_ops.sparse_softmax(batched_sp_t).eval(
-        ).values.reshape((n, m))
+        sp_result = self.evaluate(
+            sparse_ops.sparse_softmax(batched_sp_t)).values.reshape((n, m))
         dense_result = nn_ops.softmax(densified)
 
-        self.assertAllClose(dense_result.eval(), sp_result)
+        self.assertAllClose(dense_result, sp_result)
 
-  @test_util.run_deprecated_v1
   def testHigherRanks(self):
     # For the first shape:
     # First batch:
@@ -933,11 +914,11 @@ class SparseSoftmaxTest(test_util.TensorFlowTestCase):
       sp_t, unused_nnz = _sparsify(values, thresh=1e-2)
       expected_values = [1., 1., 1., .5, .5]
 
-      with self.cached_session(use_gpu=False):
-        result = sparse_ops.sparse_softmax(sp_t).eval()
+      with test_util.force_cpu():
+        result = sparse_ops.sparse_softmax(sp_t)
 
         self.assertAllEqual(expected_values, result.values)
-        self.assertAllEqual(sp_t.indices.eval(), result.indices)
+        self.assertAllEqual(sp_t.indices, result.indices)
         self.assertAllEqual(shape, result.dense_shape)
 
   @test_util.run_deprecated_v1
@@ -960,25 +941,24 @@ class SparseMinimumMaximumTest(test_util.TensorFlowTestCase):
     self.assertAllEqual(a.values, b.values)
     self.assertAllEqual(a.dense_shape, b.dense_shape)
 
-  @test_util.run_deprecated_v1
   def testBasic(self):
-    with self.cached_session(use_gpu=False):
+    with test_util.force_cpu():
       # 1-D, values at index 0.
       sp_zero = sparse_tensor.SparseTensor([[0]], [0], [7])
       sp_one = sparse_tensor.SparseTensor([[0]], [1], [7])
-      max_tf = sparse_ops.sparse_maximum(sp_zero, sp_one).eval()
-      min_tf = sparse_ops.sparse_minimum(sp_zero, sp_one).eval()
-      self._assertSparseTensorValueEqual(sp_one.eval(), max_tf)
-      self._assertSparseTensorValueEqual(sp_zero.eval(), min_tf)
+      max_tf = sparse_ops.sparse_maximum(sp_zero, sp_one)
+      min_tf = sparse_ops.sparse_minimum(sp_zero, sp_one)
+      self._assertSparseTensorValueEqual(sp_one, max_tf)
+      self._assertSparseTensorValueEqual(sp_zero, min_tf)
 
       # Values at different indices.
       sp_zero = sparse_tensor.SparseTensor([[0]], [0], [7])
       sp_zero_2 = sparse_tensor.SparseTensor([[1]], [0], [7])
       expected = sparse_tensor.SparseTensor([[0], [1]], [0, 0], [7])
-      max_tf = sparse_ops.sparse_maximum(sp_zero, sp_zero_2).eval()
-      min_tf = sparse_ops.sparse_minimum(sp_zero, sp_zero_2).eval()
-      self._assertSparseTensorValueEqual(expected.eval(), max_tf)
-      self._assertSparseTensorValueEqual(expected.eval(), min_tf)
+      max_tf = sparse_ops.sparse_maximum(sp_zero, sp_zero_2)
+      min_tf = sparse_ops.sparse_minimum(sp_zero, sp_zero_2)
+      self._assertSparseTensorValueEqual(expected, max_tf)
+      self._assertSparseTensorValueEqual(expected, min_tf)
 
   @test_util.run_deprecated_v1
   def testRandom(self):
@@ -1008,37 +988,36 @@ class SparseMinimumMaximumTest(test_util.TensorFlowTestCase):
             np.minimum(a_densified, b_densified), minimum_tf_densified)
 
   def testMismatchedShapes(self):
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       sp_zero = sparse_tensor.SparseTensor([[0, 0]], [0], [1, 1])
       sp_one = sparse_tensor.SparseTensor([[0]], [1], [2])
       with self.assertRaisesOpError("Operands do not have the same ranks"):
-        sparse_ops.sparse_maximum(sp_zero, sp_one).eval()
+        self.evaluate(sparse_ops.sparse_maximum(sp_zero, sp_one))
 
       sp_zero = sparse_tensor.SparseTensor([[0]], [0], [1])
       sp_one = sparse_tensor.SparseTensor([[0]], [1], [2])
       with self.assertRaisesOpError("Operands' shapes do not match"):
-        sparse_ops.sparse_maximum(sp_zero, sp_one).eval()
+        self.evaluate(sparse_ops.sparse_maximum(sp_zero, sp_one))
 
 
 class SparseTransposeTest(test.TestCase):
 
-  @test_util.run_deprecated_v1
   def testTranspose(self):
     if np.__version__ == "1.13.0":
       self.skipTest("numpy 1.13.0 bug")
 
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       np.random.seed(1618)
       shapes = [np.random.randint(1, 10, size=rank) for rank in range(1, 6)]
       for shape in shapes:
         for dtype in [np.int32, np.int64, np.float32, np.float64]:
           dn_input = np.random.randn(*shape).astype(dtype)
-          rank = array_ops.rank(dn_input).eval()
+          rank = self.evaluate(array_ops.rank(dn_input))
           perm = np.random.choice(rank, rank, False)
           sp_input, unused_a_nnz = _sparsify(dn_input)
           sp_trans = sparse_ops.sparse_transpose(sp_input, perm=perm)
-          dn_trans = sparse_ops.sparse_tensor_to_dense(sp_trans).eval()
-          expected_trans = array_ops.transpose(dn_input, perm=perm).eval()
+          dn_trans = sparse_ops.sparse_tensor_to_dense(sp_trans)
+          expected_trans = array_ops.transpose(dn_input, perm=perm)
           self.assertAllEqual(expected_trans.shape, sp_trans.get_shape())
           self.assertAllEqual(dn_trans, expected_trans)
 
diff --git a/tensorflow/python/kernel_tests/split_op_test.py b/tensorflow/python/kernel_tests/split_op_test.py
index 517db3450f3c43ea0989b59db5ccc7c089e9cec3..42b4d1b778296e936def71e5f347ed5019faae26 100644
--- a/tensorflow/python/kernel_tests/split_op_test.py
+++ b/tensorflow/python/kernel_tests/split_op_test.py
@@ -373,6 +373,7 @@ class SplitOpTest(test.TestCase):
     assert s1.shape.as_list() == [1]
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("b/123337890")  # Error messages differ
   def testNonexistentDimTensor(self):
     x = array_ops.placeholder(dtypes.int32)
     values = np.zeros([5, 30])
diff --git a/tensorflow/python/kernel_tests/stack_op_test.py b/tensorflow/python/kernel_tests/stack_op_test.py
index ca3357a0ed8f87cfcccd08a62c5b8526a898b664..04d635cdb1e6f35db10193c74623b5aa1013ee9d 100644
--- a/tensorflow/python/kernel_tests/stack_op_test.py
+++ b/tensorflow/python/kernel_tests/stack_op_test.py
@@ -81,7 +81,7 @@ class StackOpTest(test.TestCase):
     np.random.seed(7)
     with self.session(use_gpu=True):
       for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
-        for dtype in [np.bool, np.float32, np.int32, np.int64]:
+        for dtype in [np.bool, np.float32, np.int16, np.int32, np.int64]:
           data = np.random.randn(*shape).astype(dtype)
           # Stack back into a single tensorflow tensor directly using np array
           c = array_ops.stack(data)
diff --git a/tensorflow/python/kernel_tests/stage_op_test.py b/tensorflow/python/kernel_tests/stage_op_test.py
index 83e06ba48bdbbe3189eafde7d0f42c2e4ced68ab..29cd00b78923cf7413114f858fe4c23a379a5af5 100644
--- a/tensorflow/python/kernel_tests/stage_op_test.py
+++ b/tensorflow/python/kernel_tests/stage_op_test.py
@@ -166,6 +166,7 @@ class StageTest(test.TestCase):
 
   @test_util.run_deprecated_v1
   def testCapacity(self):
+    self.skipTest('b/123423516 this test is flaky on gpu.')
     capacity = 3
 
     with ops.Graph().as_default() as G:
diff --git a/tensorflow/python/kernel_tests/summary_ops_test.py b/tensorflow/python/kernel_tests/summary_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8cc3299fe769794b80a135fbd168469abfd8fb8
--- /dev/null
+++ b/tensorflow/python/kernel_tests/summary_ops_test.py
@@ -0,0 +1,652 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for V2 summary ops from summary_ops_v2."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import time
+import unittest
+
+from tensorflow.core.framework import graph_pb2
+from tensorflow.core.framework import node_def_pb2
+from tensorflow.core.framework import step_stats_pb2
+from tensorflow.core.framework import summary_pb2
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.util import event_pb2
+from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_spec
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.engine.sequential import Sequential
+from tensorflow.python.keras.layers.core import Activation
+from tensorflow.python.keras.layers.core import Dense
+from tensorflow.python.lib.io import tf_record
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import summary_ops_v2 as summary_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import test
+
+
+class SummaryOpsCoreTest(test_util.TensorFlowTestCase):
+
+  def testWrite(self):
+    logdir = self.get_temp_dir()
+    with context.eager_mode():
+      with summary_ops.create_file_writer(logdir).as_default():
+        output = summary_ops.write('tag', 42, step=12)
+        self.assertTrue(output.numpy())
+    events = events_from_logdir(logdir)
+    self.assertEqual(2, len(events))
+    self.assertEqual(12, events[1].step)
+    value = events[1].summary.value[0]
+    self.assertEqual('tag', value.tag)
+    self.assertEqual(42, to_numpy(value))
+
+  def testWrite_fromFunction(self):
+    logdir = self.get_temp_dir()
+    @def_function.function
+    def f():
+      with summary_ops.create_file_writer(logdir).as_default():
+        return summary_ops.write('tag', 42, step=12)
+    with context.eager_mode():
+      output = f()
+      self.assertTrue(output.numpy())
+    events = events_from_logdir(logdir)
+    self.assertEqual(2, len(events))
+    self.assertEqual(12, events[1].step)
+    value = events[1].summary.value[0]
+    self.assertEqual('tag', value.tag)
+    self.assertEqual(42, to_numpy(value))
+
+  def testWrite_metadata(self):
+    logdir = self.get_temp_dir()
+    metadata = summary_pb2.SummaryMetadata()
+    metadata.plugin_data.plugin_name = 'foo'
+    with context.eager_mode():
+      with summary_ops.create_file_writer(logdir).as_default():
+        summary_ops.write('obj', 0, 0, metadata=metadata)
+        summary_ops.write('bytes', 0, 0, metadata=metadata.SerializeToString())
+        m = constant_op.constant(metadata.SerializeToString())
+        summary_ops.write('string_tensor', 0, 0, metadata=m)
+    events = events_from_logdir(logdir)
+    self.assertEqual(4, len(events))
+    self.assertEqual(metadata, events[1].summary.value[0].metadata)
+    self.assertEqual(metadata, events[2].summary.value[0].metadata)
+    self.assertEqual(metadata, events[3].summary.value[0].metadata)
+
+  def testWrite_name(self):
+    @def_function.function
+    def f():
+      output = summary_ops.write('tag', 42, step=12, name='anonymous')
+      self.assertTrue(output.name.startswith('anonymous'))
+    f()
+
+  def testWrite_ndarray(self):
+    logdir = self.get_temp_dir()
+    with context.eager_mode():
+      with summary_ops.create_file_writer(logdir).as_default():
+        summary_ops.write('tag', [[1, 2], [3, 4]], step=12)
+    events = events_from_logdir(logdir)
+    value = events[1].summary.value[0]
+    self.assertAllEqual([[1, 2], [3, 4]], to_numpy(value))
+
+  def testWrite_tensor(self):
+    logdir = self.get_temp_dir()
+    with context.eager_mode():
+      t = constant_op.constant([[1, 2], [3, 4]])
+      with summary_ops.create_file_writer(logdir).as_default():
+        summary_ops.write('tag', t, step=12)
+      expected = t.numpy()
+    events = events_from_logdir(logdir)
+    value = events[1].summary.value[0]
+    self.assertAllEqual(expected, to_numpy(value))
+
+  def testWrite_tensor_fromFunction(self):
+    logdir = self.get_temp_dir()
+    @def_function.function
+    def f(t):
+      with summary_ops.create_file_writer(logdir).as_default():
+        summary_ops.write('tag', t, step=12)
+    with context.eager_mode():
+      t = constant_op.constant([[1, 2], [3, 4]])
+      f(t)
+      expected = t.numpy()
+    events = events_from_logdir(logdir)
+    value = events[1].summary.value[0]
+    self.assertAllEqual(expected, to_numpy(value))
+
+  def testWrite_stringTensor(self):
+    logdir = self.get_temp_dir()
+    with context.eager_mode():
+      with summary_ops.create_file_writer(logdir).as_default():
+        summary_ops.write('tag', [b'foo', b'bar'], step=12)
+    events = events_from_logdir(logdir)
+    value = events[1].summary.value[0]
+    self.assertAllEqual([b'foo', b'bar'], to_numpy(value))
+
+  @test_util.also_run_as_tf_function
+  def testWrite_noDefaultWriter(self):
+    with context.eager_mode():
+      self.assertFalse(summary_ops.write('tag', 42, step=0))
+
+  def testWrite_recordIf_constant(self):
+    logdir = self.get_temp_dir()
+    with context.eager_mode():
+      with summary_ops.create_file_writer(logdir).as_default():
+        self.assertTrue(summary_ops.write('default', 1, step=0))
+        with summary_ops.record_if(True):
+          self.assertTrue(summary_ops.write('set_on', 1, step=0))
+        with summary_ops.record_if(False):
+          self.assertFalse(summary_ops.write('set_off', 1, step=0))
+    events = events_from_logdir(logdir)
+    self.assertEqual(3, len(events))
+    self.assertEqual('default', events[1].summary.value[0].tag)
+    self.assertEqual('set_on', events[2].summary.value[0].tag)
+
+  def testWrite_recordIf_constant_fromFunction(self):
+    logdir = self.get_temp_dir()
+    @def_function.function
+    def f():
+      with summary_ops.create_file_writer(logdir).as_default():
+        # Use assertAllEqual instead of assertTrue since it works in a defun.
+        self.assertAllEqual(summary_ops.write('default', 1, step=0), True)
+        with summary_ops.record_if(True):
+          self.assertAllEqual(summary_ops.write('set_on', 1, step=0), True)
+        with summary_ops.record_if(False):
+          self.assertAllEqual(summary_ops.write('set_off', 1, step=0), False)
+    with context.eager_mode():
+      f()
+    events = events_from_logdir(logdir)
+    self.assertEqual(3, len(events))
+    self.assertEqual('default', events[1].summary.value[0].tag)
+    self.assertEqual('set_on', events[2].summary.value[0].tag)
+
+  def testWrite_recordIf_callable(self):
+    logdir = self.get_temp_dir()
+    with context.eager_mode():
+      step = variables.Variable(-1, dtype=dtypes.int64)
+      def record_fn():
+        step.assign_add(1)
+        return int(step % 2) == 0
+      with summary_ops.create_file_writer(logdir).as_default():
+        with summary_ops.record_if(record_fn):
+          self.assertTrue(summary_ops.write('tag', 1, step=step))
+          self.assertFalse(summary_ops.write('tag', 1, step=step))
+          self.assertTrue(summary_ops.write('tag', 1, step=step))
+          self.assertFalse(summary_ops.write('tag', 1, step=step))
+          self.assertTrue(summary_ops.write('tag', 1, step=step))
+    events = events_from_logdir(logdir)
+    self.assertEqual(4, len(events))
+    self.assertEqual(0, events[1].step)
+    self.assertEqual(2, events[2].step)
+    self.assertEqual(4, events[3].step)
+
+  def testWrite_recordIf_callable_fromFunction(self):
+    logdir = self.get_temp_dir()
+    with context.eager_mode():
+      step = variables.Variable(-1, dtype=dtypes.int64)
+      @def_function.function
+      def record_fn():
+        step.assign_add(1)
+        return math_ops.equal(step % 2, 0)
+      @def_function.function
+      def f():
+        with summary_ops.create_file_writer(logdir).as_default():
+          with summary_ops.record_if(record_fn):
+            return [
+                summary_ops.write('tag', 1, step=step),
+                summary_ops.write('tag', 1, step=step),
+                summary_ops.write('tag', 1, step=step)]
+      self.assertAllEqual(f(), [True, False, True])
+      self.assertAllEqual(f(), [False, True, False])
+    events = events_from_logdir(logdir)
+    self.assertEqual(4, len(events))
+    self.assertEqual(0, events[1].step)
+    self.assertEqual(2, events[2].step)
+    self.assertEqual(4, events[3].step)
+
+  def testWrite_recordIf_tensorInput_fromFunction(self):
+    logdir = self.get_temp_dir()
+    @def_function.function(input_signature=[
+        tensor_spec.TensorSpec(shape=[], dtype=dtypes.int64)])
+    def f(step):
+      with summary_ops.create_file_writer(logdir).as_default():
+        with summary_ops.record_if(math_ops.equal(step % 2, 0)):
+          return summary_ops.write('tag', 1, step=step)
+    with context.eager_mode():
+      self.assertTrue(f(0))
+      self.assertFalse(f(1))
+      self.assertTrue(f(2))
+      self.assertFalse(f(3))
+      self.assertTrue(f(4))
+    events = events_from_logdir(logdir)
+    self.assertEqual(4, len(events))
+    self.assertEqual(0, events[1].step)
+    self.assertEqual(2, events[2].step)
+    self.assertEqual(4, events[3].step)
+
+  @test_util.also_run_as_tf_function
+  def testSummaryScope(self):
+    with summary_ops.summary_scope('foo') as (tag, scope):
+      self.assertEqual('foo', tag)
+      self.assertEqual('foo/', scope)
+      with summary_ops.summary_scope('bar') as (tag, scope):
+        self.assertEqual('foo/bar', tag)
+        self.assertEqual('foo/bar/', scope)
+      with summary_ops.summary_scope('with/slash') as (tag, scope):
+        self.assertEqual('foo/with/slash', tag)
+        self.assertEqual('foo/with/slash/', scope)
+      with ops.name_scope(None):
+        with summary_ops.summary_scope('unnested') as (tag, scope):
+          self.assertEqual('unnested', tag)
+          self.assertEqual('unnested/', scope)
+
+  @test_util.also_run_as_tf_function
+  def testSummaryScope_defaultName(self):
+    with summary_ops.summary_scope(None) as (tag, scope):
+      self.assertEqual('summary', tag)
+      self.assertEqual('summary/', scope)
+    with summary_ops.summary_scope(None, 'backup') as (tag, scope):
+      self.assertEqual('backup', tag)
+      self.assertEqual('backup/', scope)
+
+  @test_util.also_run_as_tf_function
+  def testSummaryScope_handlesCharactersIllegalForScope(self):
+    with summary_ops.summary_scope('f?o?o') as (tag, scope):
+      self.assertEqual('f?o?o', tag)
+      self.assertEqual('foo/', scope)
+    # If all characters aren't legal for a scope name, use default name.
+    with summary_ops.summary_scope('???', 'backup') as (tag, scope):
+      self.assertEqual('???', tag)
+      self.assertEqual('backup/', scope)
+
+  @test_util.also_run_as_tf_function
+  def testSummaryScope_nameNotUniquifiedForTag(self):
+    constant_op.constant(0, name='foo')
+    with summary_ops.summary_scope('foo') as (tag, _):
+      self.assertEqual('foo', tag)
+    with summary_ops.summary_scope('foo') as (tag, _):
+      self.assertEqual('foo', tag)
+    with ops.name_scope('with'):
+      constant_op.constant(0, name='slash')
+    with summary_ops.summary_scope('with/slash') as (tag, _):
+      self.assertEqual('with/slash', tag)
+
+
+class SummaryWriterTest(test_util.TensorFlowTestCase):
+
+  def testWriterInitAndClose(self):
+    logdir = self.get_temp_dir()
+    with context.eager_mode():
+      writer = summary_ops.create_file_writer(
+          logdir, max_queue=1000, flush_millis=1000000)
+      files = gfile.Glob(os.path.join(logdir, '*'))
+      self.assertEqual(1, len(files))
+      file1 = files[0]
+      self.assertEqual(1, len(events_from_file(file1)))  # file_version Event
+      # Calling init() again while writer is open has no effect
+      writer.init()
+      self.assertEqual(1, len(events_from_file(file1)))
+      with writer.as_default():
+        summary_ops.write('tag', 1, step=0)
+        self.assertEqual(1, len(events_from_file(file1)))
+        # Calling .close() should do an implicit flush
+        writer.close()
+        self.assertEqual(2, len(events_from_file(file1)))
+        # Calling init() on a closed writer should start a new file
+        time.sleep(1.1)  # Ensure filename has a different timestamp
+        writer.init()
+        files = gfile.Glob(os.path.join(logdir, '*'))
+        self.assertEqual(2, len(files))
+        files.remove(file1)
+        file2 = files[0]
+        self.assertEqual(1, len(events_from_file(file2)))  # file_version
+        self.assertEqual(2, len(events_from_file(file1)))  # should be unchanged
+
+  def testSharedName(self):
+    logdir = self.get_temp_dir()
+    with context.eager_mode():
+      # Create with default shared name (should match logdir)
+      writer1 = summary_ops.create_file_writer(logdir)
+      with writer1.as_default():
+        summary_ops.write('tag', 1, step=1)
+        summary_ops.flush()
+      # Create with explicit logdir shared name (should be same resource/file)
+      shared_name = 'logdir:' + logdir
+      writer2 = summary_ops.create_file_writer(logdir, name=shared_name)
+      with writer2.as_default():
+        summary_ops.write('tag', 1, step=2)
+        summary_ops.flush()
+      # Create with different shared name (should be separate resource/file)
+      time.sleep(1.1)  # Ensure filename has a different timestamp
+      writer3 = summary_ops.create_file_writer(logdir, name='other')
+      with writer3.as_default():
+        summary_ops.write('tag', 1, step=3)
+        summary_ops.flush()
+
+    event_files = iter(sorted(gfile.Glob(os.path.join(logdir, '*'))))
+
+    # First file has tags "one" and "two"
+    events = iter(events_from_file(next(event_files)))
+    self.assertEqual('brain.Event:2', next(events).file_version)
+    self.assertEqual(1, next(events).step)
+    self.assertEqual(2, next(events).step)
+    self.assertRaises(StopIteration, lambda: next(events))
+
+    # Second file has tag "three"
+    events = iter(events_from_file(next(event_files)))
+    self.assertEqual('brain.Event:2', next(events).file_version)
+    self.assertEqual(3, next(events).step)
+    self.assertRaises(StopIteration, lambda: next(events))
+
+    # No more files
+    self.assertRaises(StopIteration, lambda: next(event_files))
+
+  def testMaxQueue(self):
+    logdir = self.get_temp_dir()
+    with context.eager_mode():
+      with summary_ops.create_file_writer(
+          logdir, max_queue=1, flush_millis=999999).as_default():
+        get_total = lambda: len(events_from_logdir(logdir))
+        # Note: First tf.Event is always file_version.
+        self.assertEqual(1, get_total())
+        summary_ops.write('tag', 1, step=0)
+        self.assertEqual(1, get_total())
+        # Should flush after second summary since max_queue = 1
+        summary_ops.write('tag', 1, step=0)
+        self.assertEqual(3, get_total())
+
+  def testWriterFlush(self):
+    logdir = self.get_temp_dir()
+    get_total = lambda: len(events_from_logdir(logdir))
+    with context.eager_mode():
+      writer = summary_ops.create_file_writer(
+          logdir, max_queue=1000, flush_millis=1000000)
+      self.assertEqual(1, get_total())  # file_version Event
+      with writer.as_default():
+        summary_ops.write('tag', 1, step=0)
+        self.assertEqual(1, get_total())
+        writer.flush()
+        self.assertEqual(2, get_total())
+        summary_ops.write('tag', 1, step=0)
+        self.assertEqual(2, get_total())
+      # Exiting the "as_default()" should do an implicit flush
+      self.assertEqual(3, get_total())
+
+  def testFlushFunction(self):
+    logdir = self.get_temp_dir()
+    with context.eager_mode():
+      writer = summary_ops.create_file_writer(
+          logdir, max_queue=999999, flush_millis=999999)
+      with writer.as_default(), summary_ops.always_record_summaries():
+        get_total = lambda: len(events_from_logdir(logdir))
+        # Note: First tf.Event is always file_version.
+        self.assertEqual(1, get_total())
+        summary_ops.write('tag', 1, step=0)
+        summary_ops.write('tag', 1, step=0)
+        self.assertEqual(1, get_total())
+        summary_ops.flush()
+        self.assertEqual(3, get_total())
+        # Test "writer" parameter
+        summary_ops.write('tag', 1, step=0)
+        self.assertEqual(3, get_total())
+        summary_ops.flush(writer=writer)
+        self.assertEqual(4, get_total())
+        summary_ops.write('tag', 1, step=0)
+        self.assertEqual(4, get_total())
+        summary_ops.flush(writer=writer._resource)  # pylint:disable=protected-access
+        self.assertEqual(5, get_total())
+
+  @test_util.assert_no_new_pyobjects_executing_eagerly
+  def testEagerMemory(self):
+    logdir = self.get_temp_dir()
+    with summary_ops.create_file_writer(logdir).as_default():
+      summary_ops.write('tag', 1, step=0)
+
+  def testClose_closesOpenFile(self):
+    try:
+      import psutil  # pylint: disable=g-import-not-at-top
+    except ImportError:
+      raise unittest.SkipTest('test requires psutil')
+    proc = psutil.Process()
+    get_open_filenames = lambda: set(info[0] for info in proc.open_files())
+    logdir = self.get_temp_dir()
+    with context.eager_mode():
+      writer = summary_ops.create_file_writer(logdir)
+      files = gfile.Glob(os.path.join(logdir, '*'))
+      self.assertEqual(1, len(files))
+      eventfile = files[0]
+      self.assertIn(eventfile, get_open_filenames())
+      writer.close()
+      self.assertNotIn(eventfile, get_open_filenames())
+
+  def testDereference_closesOpenFile(self):
+    try:
+      import psutil  # pylint: disable=g-import-not-at-top
+    except ImportError:
+      raise unittest.SkipTest('test requires psutil')
+    proc = psutil.Process()
+    get_open_filenames = lambda: set(info[0] for info in proc.open_files())
+    logdir = self.get_temp_dir()
+    with context.eager_mode():
+      writer = summary_ops.create_file_writer(logdir)
+      files = gfile.Glob(os.path.join(logdir, '*'))
+      self.assertEqual(1, len(files))
+      eventfile = files[0]
+      self.assertIn(eventfile, get_open_filenames())
+      del writer
+      self.assertNotIn(eventfile, get_open_filenames())
+
+
+class SummaryOpsTest(test_util.TensorFlowTestCase):
+
+  def run_metadata(self, *args, **kwargs):
+    assert context.executing_eagerly()
+    logdir = self.get_temp_dir()
+    writer = summary_ops.create_file_writer(logdir)
+    with writer.as_default():
+      summary_ops.run_metadata(*args, **kwargs)
+    writer.close()
+    events = events_from_logdir(logdir)
+    return events[1].summary
+
+  def run_metadata_graphs(self, *args, **kwargs):
+    assert context.executing_eagerly()
+    logdir = self.get_temp_dir()
+    writer = summary_ops.create_file_writer(logdir)
+    with writer.as_default():
+      summary_ops.run_metadata_graphs(*args, **kwargs)
+    writer.close()
+    events = events_from_logdir(logdir)
+    return events[1].summary
+
+  def create_run_metadata(self):
+    step_stats = step_stats_pb2.StepStats(dev_stats=[
+        step_stats_pb2.DeviceStepStats(
+            device='cpu:0',
+            node_stats=[step_stats_pb2.NodeExecStats(node_name='hello')])
+    ])
+    return config_pb2.RunMetadata(
+        function_graphs=[
+            config_pb2.RunMetadata.FunctionGraphs(
+                pre_optimization_graph=graph_pb2.GraphDef(
+                    node=[node_def_pb2.NodeDef(name='foo')]))
+        ],
+        step_stats=step_stats)
+
+  def keras_model(self, *args, **kwargs):
+    logdir = self.get_temp_dir()
+    writer = summary_ops.create_file_writer(logdir)
+    with writer.as_default():
+      summary_ops.keras_model(*args, **kwargs)
+    writer.close()
+    events = events_from_logdir(logdir)
+    # The first event contains no summary values. The written content goes to
+    # the second event.
+    return events[1].summary
+
+  @test_util.run_v2_only
+  def testRunMetadata_usesNameAsTag(self):
+    meta = config_pb2.RunMetadata()
+
+    with ops.name_scope('foo'):
+      summary = self.run_metadata(name='my_name', data=meta, step=1)
+      first_val = summary.value[0]
+
+    self.assertEqual('foo/my_name', first_val.tag)
+
+  @test_util.run_v2_only
+  def testRunMetadata_summaryMetadata(self):
+    expected_summary_metadata = """
+      plugin_data {
+        plugin_name: "graph_run_metadata"
+        content: "1"
+      }
+    """
+    meta = config_pb2.RunMetadata()
+    summary = self.run_metadata(name='my_name', data=meta, step=1)
+    actual_summary_metadata = summary.value[0].metadata
+    self.assertProtoEquals(expected_summary_metadata, actual_summary_metadata)
+
+  @test_util.run_v2_only
+  def testRunMetadata_wholeRunMetadata(self):
+    expected_run_metadata = """
+      step_stats {
+        dev_stats {
+          device: "cpu:0"
+          node_stats {
+            node_name: "hello"
+          }
+        }
+      }
+      function_graphs {
+        pre_optimization_graph {
+          node {
+            name: "foo"
+          }
+        }
+      }
+    """
+    meta = self.create_run_metadata()
+    summary = self.run_metadata(name='my_name', data=meta, step=1)
+    first_val = summary.value[0]
+
+    actual_run_metadata = config_pb2.RunMetadata.FromString(
+        first_val.tensor.string_val[0])
+    self.assertProtoEquals(expected_run_metadata, actual_run_metadata)
+
+  @test_util.run_v2_only
+  def testRunMetadataGraph_usesNameAsTag(self):
+    meta = config_pb2.RunMetadata()
+
+    with ops.name_scope('foo'):
+      summary = self.run_metadata_graphs(name='my_name', data=meta, step=1)
+      first_val = summary.value[0]
+
+    self.assertEqual('foo/my_name', first_val.tag)
+
+  @test_util.run_v2_only
+  def testRunMetadataGraph_summaryMetadata(self):
+    expected_summary_metadata = """
+      plugin_data {
+        plugin_name: "graph_run_metadata_graph"
+        content: "1"
+      }
+    """
+    meta = config_pb2.RunMetadata()
+    summary = self.run_metadata_graphs(name='my_name', data=meta, step=1)
+    actual_summary_metadata = summary.value[0].metadata
+    self.assertProtoEquals(expected_summary_metadata, actual_summary_metadata)
+
+  @test_util.run_v2_only
+  def testRunMetadataGraph_runMetadataFragment(self):
+    expected_run_metadata = """
+      function_graphs {
+        pre_optimization_graph {
+          node {
+            name: "foo"
+          }
+        }
+      }
+    """
+    meta = self.create_run_metadata()
+
+    summary = self.run_metadata_graphs(name='my_name', data=meta, step=1)
+    first_val = summary.value[0]
+
+    actual_run_metadata = config_pb2.RunMetadata.FromString(
+        first_val.tensor.string_val[0])
+    self.assertProtoEquals(expected_run_metadata, actual_run_metadata)
+
+  @test_util.run_v2_only
+  def testKerasModel(self):
+    model = Sequential(
+        [Dense(10, input_shape=(100,)),
+         Activation('relu', name='my_relu')])
+    summary = self.keras_model(name='my_name', data=model, step=1)
+    first_val = summary.value[0]
+    self.assertEqual(model.to_json(), first_val.tensor.string_val[0])
+
+
+def events_from_file(filepath):
+  """Returns all events in a single event file.
+
+  Args:
+    filepath: Path to the event file.
+
+  Returns:
+    A list of all tf.Event protos in the event file.
+  """
+  records = list(tf_record.tf_record_iterator(filepath))
+  result = []
+  for r in records:
+    event = event_pb2.Event()
+    event.ParseFromString(r)
+    result.append(event)
+  return result
+
+
+def events_from_logdir(logdir):
+  """Returns all events in the single eventfile in logdir.
+
+  Args:
+    logdir: The directory in which the single event file is sought.
+
+  Returns:
+    A list of all tf.Event protos from the single event file.
+
+  Raises:
+    AssertionError: If logdir does not contain exactly one file.
+  """
+  assert gfile.Exists(logdir)
+  files = gfile.ListDirectory(logdir)
+  assert len(files) == 1, 'Found not exactly one file in logdir: %s' % files
+  return events_from_file(os.path.join(logdir, files[0]))
+
+
+def to_numpy(summary_value):
+  return tensor_util.MakeNdarray(summary_value.tensor)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/kernel_tests/template_test.py b/tensorflow/python/kernel_tests/template_test.py
index 3b2a56bd1ff6ef81ae17773fd5a23bc96778ce63..f587a7ec4329a1b9a4df5bbfb3d8edcc1773cbcb 100644
--- a/tensorflow/python/kernel_tests/template_test.py
+++ b/tensorflow/python/kernel_tests/template_test.py
@@ -160,6 +160,21 @@ class TemplateTest(test.TestCase):
     self.assertEqual(1, len(result))
     self.assertNotEqual(len(first), len(result))
 
+  @test_util.run_in_graph_and_eager_modes
+  def test_template_with_empty_name(self):
+    tpl = template.make_template("", variable_scoped_function)
+    with variable_scope.variable_scope("outer"):
+      x = variable_scope.get_variable("x", [])
+      v = tpl()
+    self.assertEqual("outer/", tpl.variable_scope_name)
+    self.assertEqual("outer//dummy:0", v.name)
+    if context.executing_eagerly():
+      # In eager mode `x` is not visible to the template since the template does
+      # not rely on global collections.
+      self.assertEqual([v], tpl.variables)
+    else:
+      self.assertEqual([x, v], tpl.variables)
+
   @test_util.run_in_graph_and_eager_modes
   def test_template_with_name(self):
     tmpl1 = template.make_template("s1", variable_scoped_function)
diff --git a/tensorflow/python/kernel_tests/tensor_array_ops_test.py b/tensorflow/python/kernel_tests/tensor_array_ops_test.py
index 88625841bcc982bf477b619f3da0b70498f0542f..056e3b9ff88af197995739a10a080b91aac2ee5e 100644
--- a/tensorflow/python/kernel_tests/tensor_array_ops_test.py
+++ b/tensorflow/python/kernel_tests/tensor_array_ops_test.py
@@ -32,6 +32,7 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import gen_data_flow_ops
 from tensorflow.python.ops import gradients_impl
@@ -184,8 +185,7 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual([[0.0, 0.0], [4.0, 5.0], [0.0, 0.0]],
                           self.evaluate(ta.write(1, [[4.0, 5.0]]).concat()))
 
-  @test_util.disable_control_flow_v2("b/118890905")
-  @test_util.run_v1_only("b/118890905")
+  @test_util.run_v1_only("b/122324791")
   def testTensorArrayReadOrPackNotAllValuesAvailableFillsZeros(self):
     self._testTensorArrayReadOrPackNotAllValuesAvailableFillsZeros()
 
@@ -201,11 +201,21 @@ class TensorArrayTest(test.TestCase):
     self.assertAllEqual([[0.0, 0.0], [4.0, 5.0], [0.0, 0.0]],
                         self.evaluate(ta.write(1, [[4.0, 5.0]]).concat()))
 
-  @test_util.disable_control_flow_v2("b/118890905")
-  @test_util.run_v1_only("b/118890905")
+  @test_util.run_v1_only("b/122324791")
   def testTensorArrayReadOrPackNotAllValuesAvailableInferShapeFillsZeros(self):
     self._testTensorArrayReadOrPackNotAllValuesAvailableInferShapeFillsZeros()
 
+  @test_util.run_v1_only("Uses placeholders")
+  def testSkipEagerTensorArrayReadUninitializedInferShapeFillsZeros(self):
+    with self.cached_session(use_gpu=True) as sess:
+      ta = tensor_array_ops.TensorArray(
+          dtype=dtypes.float32,
+          tensor_array_name="foo",
+          size=3)
+      val = array_ops.placeholder(dtypes.float32)
+      self.assertAllEqual(
+          [[0.0, 0.0]], sess.run(ta.write(1, val).read(0), {val: [[4.0, 5.0]]}))
+
   def _testTensorArrayUnpackRead(self, tf_dtype):
     with self.cached_session(use_gpu=True):
       convert = _make_converter(tf_dtype)
@@ -345,7 +355,7 @@ class TensorArrayTest(test.TestCase):
 
   @test_util.run_deprecated_v1
   def testSkipEagerTensorArrayGradGrad(self):
-    if not tensor_array_ops.ENABLE_TENSOR_ARRAY_V2:
+    if not control_flow_util.ENABLE_CONTROL_FLOW_V2:
       self.skipTest("Legacy TensorArray does not support double derivatives.")
     with self.test_session(use_gpu=True) as session:
       x = constant_op.constant(4.0)
@@ -424,12 +434,11 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual(t_g_ta_0, t_g_ta_1)
       self.assertAllEqual([[4.0, 5.0]], d_r1_0)
 
-  @test_util.run_v1_only("b/120545219")
   def testTensorArrayWriteWrongIndexOrDataTypeFails(self):
     with self.session(use_gpu=True):
       ta = _make_ta(3, "foo", dtype=dtypes.float32)
       # Test writing the wrong datatype
-      if (tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 and
+      if (control_flow_util.ENABLE_CONTROL_FLOW_V2 and
           not context.executing_eagerly()):
         error_msg = ("Invalid data types; op elements string but list elements "
                      "float")
@@ -440,7 +449,7 @@ class TensorArrayTest(test.TestCase):
       with self.assertRaisesOpError(error_msg):
         self.evaluate(ta.write(0, "wrong_type_scalar").flow)
 
-      if (tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 and
+      if (control_flow_util.ENABLE_CONTROL_FLOW_V2 and
           not context.executing_eagerly()):
         error_msg = "Trying to modify element -1 in a list with 3 elements."
       else:
@@ -448,7 +457,7 @@ class TensorArrayTest(test.TestCase):
       with self.assertRaisesOpError(error_msg):
         self.evaluate(ta.write(-1, 3.0).flow)
 
-      if (tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 and
+      if (control_flow_util.ENABLE_CONTROL_FLOW_V2 and
           not context.executing_eagerly()):
         error_msg = "Trying to modify element 3 in a list with 3 elements"
       else:
@@ -458,7 +467,6 @@ class TensorArrayTest(test.TestCase):
       with self.assertRaisesOpError(error_msg):
         self.evaluate(ta.write(3, 3.0).flow)
 
-  @test_util.run_v1_only("b/120545219")
   def testTensorArrayReadWrongIndexOrDataTypeFails(self):
     with self.session(use_gpu=True):
       ta = _make_ta(3, "foo", dtype=dtypes.float32)
@@ -467,14 +475,14 @@ class TensorArrayTest(test.TestCase):
 
       # Test reading wrong datatype (only possible when constructing graphs).
       if (not context.executing_eagerly() and
-          not tensor_array_ops.ENABLE_TENSOR_ARRAY_V2):
+          not control_flow_util.ENABLE_CONTROL_FLOW_V2):
         r0_bad = gen_data_flow_ops.tensor_array_read_v3(
             handle=w0.handle, index=0, dtype=dtypes.float64, flow_in=w0.flow)
         with self.assertRaisesOpError(
             "TensorArray dtype is float but Op requested dtype double."):
           self.evaluate(r0_bad)
 
-      if (tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 and
+      if (control_flow_util.ENABLE_CONTROL_FLOW_V2 and
           not context.executing_eagerly()):
         error_msg = "Trying to access element -1 in a list with 3 elements."
       else:
@@ -483,7 +491,7 @@ class TensorArrayTest(test.TestCase):
       with self.assertRaisesOpError(error_msg):
         self.evaluate(ta.read(-1))
 
-      if (tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 and
+      if (control_flow_util.ENABLE_CONTROL_FLOW_V2 and
           not context.executing_eagerly()):
         error_msg = "Trying to access element 3 in a list with 3 elements."
       else:
@@ -504,7 +512,6 @@ class TensorArrayTest(test.TestCase):
           "it has already been written to."):
         self.evaluate(ta.write(2, 3.0).write(2, 3.0).flow)
 
-  @test_util.run_v1_only("b/120545219")
   def testTensorArrayConcatIncompatibleShapesFails(self):
     with self.session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
@@ -533,10 +540,12 @@ class TensorArrayTest(test.TestCase):
 
       # The exact error messages differ between eager execution and graph
       # construction as the former bubbles up the error from array_op.concat.
-      with self.assertRaisesOpError("shape"):
+      error_msg = ("Incompatible ranks"
+                   if control_flow_util.ENABLE_CONTROL_FLOW_V2 and
+                   not context.executing_eagerly() else "shape")
+      with self.assertRaisesRegexp(errors.InvalidArgumentError, error_msg):
         self.evaluate(w3.concat())
 
-  @test_util.run_v1_only("b/120545219")
   def testTensorArraySplitIncompatibleShapesFails(self):
     with self.session(use_gpu=True):
       in_eager_mode = context.executing_eagerly()
@@ -550,7 +559,7 @@ class TensorArrayTest(test.TestCase):
           ta.split([1.0, 2.0, 3.0], lengths).flow.eval(feed_dict={lengths: 1})
 
       error_msg = ("Unused values in tensor. Length of tensor: 3 Values used: 1"
-                   if tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 and
+                   if control_flow_util.ENABLE_CONTROL_FLOW_V2 and
                    not in_eager_mode else
                    r"Expected sum of lengths to be equal to values.shape\[0\], "
                    r"but sum of lengths is 1 and value's shape is: \[3\]")
@@ -558,7 +567,7 @@ class TensorArrayTest(test.TestCase):
         self.evaluate(ta.split([1.0, 2.0, 3.0], [1]).flow)
 
       ta = _make_ta(1, "baz")
-      if tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 and not in_eager_mode:
+      if control_flow_util.ENABLE_CONTROL_FLOW_V2 and not in_eager_mode:
         with self.assertRaisesRegexp(
             ValueError, "Shape must be at least rank 1 but is rank 0"):
           self.evaluate(ta.split(1.0, [1]).flow)
@@ -568,7 +577,7 @@ class TensorArrayTest(test.TestCase):
         ):
           self.evaluate(ta.split(1.0, [1]).flow)
 
-      if not tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 or in_eager_mode:
+      if not control_flow_util.ENABLE_CONTROL_FLOW_V2 or in_eager_mode:
         ta = _make_ta(2, "buz")
         with self.assertRaisesOpError(
             r"TensorArray's size is not equal to the size of lengths "
@@ -958,7 +967,7 @@ class TensorArrayTest(test.TestCase):
         v0_grad = gradients_impl.gradients([vout], [v0], [grad_val])[0]
         state0_grad = gradients_impl.gradients([vout], [state0], [grad_val])[0]
         var_grad = gradients_impl.gradients([vout], [var], [grad_val])[0]
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
 
       state0_t, var_t, v0_t, vout_t, v0_grad_t, var_grad_t, state0_grad_t = (
           self.evaluate(
@@ -1003,28 +1012,11 @@ class TensorArrayTest(test.TestCase):
     # self._testWhileLoopWritePackGradients(
     #     dynamic_size=False, dtype=tf.int64)
 
-  @test_util.disable_control_flow_v2("Testing v1 while_loop with v2 TA")
-  @test_util.enable_tensor_array_v2
-  def testWhileLoopV1WithTensorArrayV2(self):
-    size = 3
-    ta = tensor_array_ops.TensorArray(
-        dtype=dtypes.int32, size=size, element_shape=tensor_shape.scalar())
-
-    def Body(counter, ta):
-      return counter + 1, ta.write(counter, counter)
-
-    _, ta = control_flow_ops.while_loop(lambda i, _: i < size, Body, [0, ta])
-
-    for i in range(size):
-      self.assertEqual(self.evaluate(ta.read(i)), i)
-
-  @test_util.disable_control_flow_v2("b/117943489 (dynamic_size)")
   @test_util.run_v1_only("b/117943489")
   def testSkipEagerWhileLoopDynamicWritePackGradients(self):
     self._testWhileLoopWritePackGradients(
         dynamic_size=True, dtype=dtypes.float32)
 
-  @test_util.disable_control_flow_v2("b/119323158")
   def testGradSerialTwoLoops(self):
     with self.session(use_gpu=True):
       def loop(x):
@@ -1225,11 +1217,14 @@ class TensorArrayTest(test.TestCase):
       c1 = constant_op.constant([4.0, 5.0])
       w1 = w0.write(3, c1)
 
-      with self.assertRaisesOpError(
-          r"Could not read index 0 twice because it was cleared after a "
-          r"previous read \(perhaps try setting clear_after_read = false\?\)"):
-        with ops.control_dependencies([r0]):
-          self.evaluate(w1.read(0))
+      if not control_flow_util.ENABLE_CONTROL_FLOW_V2:
+        # TensorArray v2 does not support clear_after_read.
+        with self.assertRaisesOpError(
+            r"Could not read index 0 twice because it was cleared after a "
+            r"previous read \(perhaps try setting clear_after_read = false\?\)"
+        ):
+          with ops.control_dependencies([r0]):
+            self.evaluate(w1.read(0))
 
       r1 = w1.read(1)
       self.assertAllEqual(c1.get_shape(), r1.shape)
@@ -1238,7 +1233,6 @@ class TensorArrayTest(test.TestCase):
       with self.assertRaises(ValueError):
         w1.write(4, c2)
 
-  @test_util.disable_control_flow_v2("b/117943489 (dynamic_size)")
   @test_util.run_v1_only("b/117943489")
   def testUnpackShape(self):
     self._testUnpackShape()
@@ -1270,7 +1264,7 @@ class TensorArrayTest(test.TestCase):
         self.assertEqual((2, 2), w0.read(1).get_shape())
       else:
         self.assertEqual(r0.get_shape().ndims, None)
-        if not tensor_array_ops.ENABLE_TENSOR_ARRAY_V2:
+        if not control_flow_util.ENABLE_CONTROL_FLOW_V2:
           self.assertEqual(
               tensor_shape.TensorShape(
                   ta1.handle.op.get_attr("element_shape")).ndims, None)
@@ -1299,13 +1293,23 @@ class TensorArrayTest(test.TestCase):
       grad_r0_vals = session.run(grad_r0)[0]
       self.assertAllEqual(grad_r0_vals, [1.0, 0.0])
 
-  # TODO(srbs): Figure out how to enable this. This is probably failing
-  # because we are trying to stack a TensorList with invalid tensors.
-  # That is because we do not receive gradients for all list indices.
-  # Figure out how TensorArray handles this.
-  def disabletestGradientWhenNotAllComponentsRead(self):
+  @test_util.run_deprecated_v1
+  def testSkipEagerGradientWhenNotAllComponentsRead(self):
     self._testGradientWhenNotAllComponentsRead()
 
+  @test_util.run_deprecated_v1
+  def testSkipEagerWriteButNotAllComponentsReadGrad(self):
+    with self.cached_session(use_gpu=True) as session:
+      x0 = constant_op.constant(5.0)
+      x1 = constant_op.constant(10.0)
+      ta = tensor_array_ops.TensorArray(
+          dtype=dtypes.float32, size=2).write(0, x0).write(1, x1)
+      r0 = ta.read(0)
+      # calculate (dr0/dx0, dr0/dx1).  since r0 = x0, gradients are (1, 0).
+      grad_r0_x1 = gradients_impl.gradients(ys=[r0], xs=[x0, x1], grad_ys=[1.0])
+      grad_r0_x1_vals = session.run(grad_r0_x1)
+      self.assertAllEqual(grad_r0_x1_vals, [1.0, 0.0])
+
   def _testTensorArrayUnpackDynamic(self):
     with self.cached_session(use_gpu=True) as sess:
       ta = tensor_array_ops.TensorArray(
@@ -1318,12 +1322,10 @@ class TensorArrayTest(test.TestCase):
       grad = gradients_impl.gradients(ys=[r], xs=[x])
       self.assertAllEqual(np.array([1.0, 1.0, 1.0]), self.evaluate(grad)[0])
 
-  @test_util.disable_control_flow_v2("b/117943489")
   @test_util.run_v1_only("b/117943489")
   def testSkipEagerTensorArrayUnpackDynamic(self):
     self._testTensorArrayUnpackDynamic()
 
-  @test_util.disable_control_flow_v2("b/117943489")
   @test_util.run_v1_only("b/117943489")
   def testSkipEagerTensorArraySplitDynamic(self):
     with self.session(use_gpu=True) as sess:
@@ -1341,14 +1343,14 @@ class TensorArrayTest(test.TestCase):
     with self.cached_session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, size=0, dynamic_size=False, infer_shape=False)
-      v2_msg = ("Tried to stack elements of a empty list with "
-                "non-fully-defined shape")
+      v2_msg = ("Tried to stack elements of an empty list with "
+                "non-fully-defined element_shape")
       v1_msg = (
           "TensorArray has size zero, but element shape <unknown> is not "
           "fully defined. Currently only static shapes are supported when "
           "packing zero-size TensorArrays.")
-      with self.assertRaisesOpError(v2_msg if tensor_array_ops
-                                    .ENABLE_TENSOR_ARRAY_V2 else v1_msg):
+      with self.assertRaisesOpError(
+          v2_msg if control_flow_util.ENABLE_CONTROL_FLOW_V2 else v1_msg):
         ta.stack().eval()
 
   @test_util.run_v1_only("b/120545219")
@@ -1363,7 +1365,10 @@ class TensorArrayTest(test.TestCase):
           dtype=dtypes.float32, size=0, dynamic_size=False, infer_shape=True)
       self.assertEqual(0, ta.size().eval())
       # Don't actually perform the pack.  This stores the static shape.
-      ta.unstack(array_ops.zeros([0, 3, 5])).mark_used()
+      if control_flow_util.ENABLE_CONTROL_FLOW_V2:
+        ta = ta.unstack(array_ops.zeros([0, 3, 5]))
+      else:
+        ta.unstack(array_ops.zeros([0, 3, 5])).mark_used()
       packed = ta.stack()
       concatenated = ta.concat()
       self.assertAllEqual([0, 3, 5], self.evaluate(packed).shape)
@@ -1371,12 +1376,10 @@ class TensorArrayTest(test.TestCase):
       # first dimension of zero
       self.assertAllEqual([0, 5], self.evaluate(concatenated).shape)
 
-  @test_util.disable_control_flow_v2("b/117943489")
   @test_util.run_v1_only("b/117943489")
   def testSkipEagerTensorArrayEvalEmptyWithDefault(self):
     self._testTensorArrayEvalEmptyWithDefault()
 
-  @test_util.disable_control_flow_v2("b/117943489")
   @test_util.run_v1_only("b/117943489")
   def testSkipEagerTensorArrayScatterReadAndGradients(self):
     with self.session(use_gpu=True) as session:
@@ -1404,8 +1407,43 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual([10.0, -10.0], read_vals[1])
       self.assertAllEqual([[2.0, 3.0], [4.0, 5.0]], grad_vals[0])
 
-  @test_util.disable_control_flow_v2("b/117943286")
-  @test_util.run_v1_only("b/117943286")
+  @test_util.run_v1_only("b/117943489")
+  def testSkipEagerTensorArrayScatterPartialReadAndGradients(self):
+    with self.session(use_gpu=True) as session:
+      ta = tensor_array_ops.TensorArray(
+          dtype=dtypes.float32,
+          tensor_array_name="foo",
+          size=0,
+          dynamic_size=True)
+
+      indices = constant_op.constant([1, 8])
+      value = constant_op.constant([[1.0, -1.0], [10.0, -10.0]])
+
+      w = ta.scatter(indices, value)
+      r0 = w.read(1)
+
+      # Test combined gradients + aggregation of read(0)
+      grad = gradients_impl.gradients(
+          ys=[r0], xs=[value], grad_ys=[[2.0, 3.0]])[0]
+      read_val, grad_val = session.run([r0, grad])
+
+      self.assertAllEqual([1.0, -1.0], read_val)
+      self.assertAllEqual([[2.0, 3.0], [0.0, 0.0]], grad_val)
+
+  def testScatterIntoExistingList(self):
+    ta = tensor_array_ops.TensorArray(
+        dtype=dtypes.float32, tensor_array_name="foo", size=5)
+
+    ta = ta.scatter(indices=[3, 4], value=array_ops.ones([2]))
+    self.assertAllEqual(ta.stack(), [0., 0., 0., 1., 1.])
+
+    ta = ta.scatter(indices=[1], value=array_ops.ones([1]))
+    self.assertAllEqual(ta.stack(), [0., 1., 0., 1., 1.])
+
+    ta = ta.scatter(indices=[0, 2], value=[5., 6.])
+    self.assertAllEqual(ta.stack(), [5., 1., 6., 1., 1.])
+
+  @test_util.run_v1_only("b/118890905")
   def testTensorArrayWriteGatherAndGradients(self):
     with self.session(use_gpu=True) as session:
       ta = tensor_array_ops.TensorArray(
@@ -1488,7 +1526,7 @@ class TensorArrayTest(test.TestCase):
       if "/task:1/" in d:
         self.assertTrue(
             [s for s in dev_stats[d] if "/TensorArray" in s.node_name])
-      else:
+      elif "/host:CPU" not in d:
         self.assertFalse(
             [s for s in dev_stats[d] if "/TensorArray" in s.node_name])
 
@@ -1592,7 +1630,7 @@ class TensorArrayTest(test.TestCase):
       self.assertEqual(tensor_shape.scalar(), read1.get_shape())
 
       if not context.executing_eagerly():
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
 
       read0_v, read1_v, size0_v, size1_v = self.evaluate((read0, read1, size0,
                                                           size1))
diff --git a/tensorflow/python/kernel_tests/tridiagonal_solve_op_test.py b/tensorflow/python/kernel_tests/tridiagonal_solve_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3fd603a9547ee6f318f55f5b790953a51e591b0e
--- /dev/null
+++ b/tensorflow/python/kernel_tests/tridiagonal_solve_op_test.py
@@ -0,0 +1,456 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.ops.linalg.linalg_impl.tridiagonal_solve."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.ops.linalg import linalg_impl
+from tensorflow.python.platform import benchmark
+from tensorflow.python.platform import test
+
+_sample_diags = np.array([[2, 1, 4, 0], [1, 3, 2, 2], [0, 1, -1, 1]])
+_sample_rhs = np.array([1, 2, 3, 4])
+_sample_result = np.array([-9, 5, -4, 4])
+
+
+def _tfconst(array):
+  return constant_op.constant(array, dtypes.float64)
+
+
+def _tf_ones(shape):
+  return array_ops.ones(shape, dtype=dtypes.float64)
+
+
+class TridiagonalSolveOpTest(test.TestCase):
+
+  def _test(self,
+            diags,
+            rhs,
+            expected,
+            diags_format="compact",
+            transpose_rhs=False,
+            conjugate_rhs=False):
+    with self.cached_session(use_gpu=False):
+      result = linalg_impl.tridiagonal_solve(diags, rhs, diags_format,
+                                             transpose_rhs, conjugate_rhs)
+      self.assertAllClose(self.evaluate(result), expected)
+
+  def _testWithLists(self,
+                     diags,
+                     rhs,
+                     expected,
+                     diags_format="compact",
+                     transpose_rhs=False,
+                     conjugate_rhs=False):
+    self._test(
+        _tfconst(diags), _tfconst(rhs), _tfconst(expected), diags_format,
+        transpose_rhs, conjugate_rhs)
+
+  def _assertRaises(self, diags, rhs, diags_format="compact"):
+    with self.assertRaises(ValueError):
+      linalg_impl.tridiagonal_solve(diags, rhs, diags_format)
+
+  # Tests with various dtypes
+
+  def testReal(self):
+    for dtype in dtypes.float32, dtypes.float64:
+      self._test(
+          diags=constant_op.constant(_sample_diags, dtype),
+          rhs=constant_op.constant(_sample_rhs, dtype),
+          expected=constant_op.constant(_sample_result, dtype))
+
+  def testComplex(self):
+    for dtype in dtypes.complex64, dtypes.complex128:
+      self._test(
+          diags=constant_op.constant(_sample_diags, dtype) * (1 + 1j),
+          rhs=constant_op.constant(_sample_rhs, dtype) * (1 - 1j),
+          expected=constant_op.constant(_sample_result, dtype) * (1 - 1j) /
+          (1 + 1j))
+
+  # Tests with small matrix sizes
+
+  def test3x3(self):
+    self._testWithLists(
+        diags=[[2, -1, 0], [1, 3, 1], [0, -1, -2]],
+        rhs=[1, 2, 3],
+        expected=[-3, 2, 7])
+
+  def test2x2(self):
+    self._testWithLists(
+        diags=[[2, 0], [1, 3], [0, 1]], rhs=[1, 4], expected=[-5, 3])
+
+  def test1x1(self):
+    self._testWithLists(diags=[[0], [3], [0]], rhs=[6], expected=[2])
+
+  def test0x0(self):
+    self._test(
+        diags=constant_op.constant(0, shape=(3, 0), dtype=dtypes.float32),
+        rhs=constant_op.constant(0, shape=(0, 1), dtype=dtypes.float32),
+        expected=constant_op.constant(0, shape=(0, 1), dtype=dtypes.float32))
+
+  # Other edge cases
+
+  def testCaseRequiringPivoting(self):
+    # Without partial pivoting (e.g. Thomas algorithm) this would fail.
+    self._testWithLists(
+        diags=[[2, -1, 1, 0], [1, 4, 1, -1], [0, 2, -2, 3]],
+        rhs=[1, 2, 3, 4],
+        expected=[8, -3.5, 0, -4])
+
+  def testCaseRequiringPivotingLastRows(self):
+    self._testWithLists(
+        diags=[[2, 1, -1, 0], [1, -1, 2, 1], [0, 1, -6, 1]],
+        rhs=[1, 2, -1, -2],
+        expected=[5, -2, -5, 3])
+
+  def testNotInvertible(self):
+    with self.assertRaises(errors_impl.InvalidArgumentError):
+      self._testWithLists(
+          diags=[[2, -1, 1, 0], [1, 4, 1, -1], [0, 2, 0, 3]],
+          rhs=[1, 2, 3, 4],
+          expected=[8, -3.5, 0, -4])
+
+  def testDiagonal(self):
+    self._testWithLists(
+        diags=[[0, 0, 0, 0], [1, 2, -1, -2], [0, 0, 0, 0]],
+        rhs=[1, 2, 3, 4],
+        expected=[1, 1, -3, -2])
+
+  def testUpperTriangular(self):
+    self._testWithLists(
+        diags=[[2, 4, -1, 0], [1, 3, 1, 2], [0, 0, 0, 0]],
+        rhs=[1, 6, 4, 4],
+        expected=[13, -6, 6, 2])
+
+  def testLowerTriangular(self):
+    self._testWithLists(
+        diags=[[0, 0, 0, 0], [2, -1, 3, 1], [0, 1, 4, 2]],
+        rhs=[4, 5, 6, 1],
+        expected=[2, -3, 6, -11])
+
+  # Multiple right-hand sides and batching
+
+  def testWithTwoRightHandSides(self):
+    self._testWithLists(
+        diags=_sample_diags,
+        rhs=np.transpose([_sample_rhs, 2 * _sample_rhs]),
+        expected=np.transpose([_sample_result, 2 * _sample_result]))
+
+  def testBatching(self):
+    self._testWithLists(
+        diags=np.array([_sample_diags, -_sample_diags]),
+        rhs=np.array([_sample_rhs, 2 * _sample_rhs]),
+        expected=np.array([_sample_result, -2 * _sample_result]))
+
+  def testBatchingAndTwoRightHandSides(self):
+    rhs = np.transpose([_sample_rhs, 2 * _sample_rhs])
+    expected_result = np.transpose([_sample_result, 2 * _sample_result])
+    self._testWithLists(
+        diags=np.array([_sample_diags, -_sample_diags]),
+        rhs=np.array([rhs, 2 * rhs]),
+        expected=np.array([expected_result, -2 * expected_result]))
+
+  # Various input formats
+
+  def testSequenceFormat(self):
+    self._test(
+        diags=(_tfconst([2, 1, 4]), _tfconst([1, 3, 2, 2]), _tfconst([1, -1,
+                                                                      1])),
+        rhs=_tfconst([1, 2, 3, 4]),
+        expected=_tfconst([-9, 5, -4, 4]),
+        diags_format="sequence")
+
+  def testSequenceFormatWithDummyElements(self):
+    dummy = 20
+    self._test(
+        diags=(_tfconst([2, 1, 4, dummy]), _tfconst([1, 3, 2, 2]),
+               _tfconst([dummy, 1, -1, 1])),
+        rhs=_tfconst([1, 2, 3, 4]),
+        expected=_tfconst([-9, 5, -4, 4]),
+        diags_format="sequence")
+
+  def testSequenceFormatWithBatching(self):
+    self._test(
+        diags=(_tfconst([[2, 1, 4], [-2, -1, -4]]),
+               _tfconst([[1, 3, 2, 2], [-1, -3, -2, -2]]),
+               _tfconst([[1, -1, 1], [-1, 1, -1]])),
+        rhs=_tfconst([[1, 2, 3, 4], [1, 2, 3, 4]]),
+        expected=_tfconst([[-9, 5, -4, 4], [9, -5, 4, -4]]),
+        diags_format="sequence")
+
+  def testMatrixFormat(self):
+    self._testWithLists(
+        diags=[[1, 2, 0, 0], [1, 3, 1, 0], [0, -1, 2, 4], [0, 0, 1, 2]],
+        rhs=[1, 2, 3, 4],
+        expected=[-9, 5, -4, 4],
+        diags_format="matrix")
+
+  def testMatrixFormatWithMultipleRightHandSides(self):
+    self._testWithLists(
+        diags=[[1, 2, 0, 0], [1, 3, 1, 0], [0, -1, 2, 4], [0, 0, 1, 2]],
+        rhs=[[1, -1], [2, -2], [3, -3], [4, -4]],
+        expected=[[-9, 9], [5, -5], [-4, 4], [4, -4]],
+        diags_format="matrix")
+
+  def testMatrixFormatWithBatching(self):
+    self._testWithLists(
+        diags=[[[1, 2, 0, 0], [1, 3, 1, 0], [0, -1, 2, 4], [0, 0, 1, 2]],
+               [[-1, -2, 0, 0], [-1, -3, -1, 0], [0, 1, -2, -4], [0, 0, -1,
+                                                                  -2]]],
+        rhs=[[1, 2, 3, 4], [1, 2, 3, 4]],
+        expected=[[-9, 5, -4, 4], [9, -5, 4, -4]],
+        diags_format="matrix")
+
+  def testRightHandSideAsColumn(self):
+    self._testWithLists(
+        diags=_sample_diags,
+        rhs=np.transpose([_sample_rhs]),
+        expected=np.transpose([_sample_result]),
+        diags_format="compact")
+
+  # Tests with transpose and adjoint
+
+  def testTransposeRhs(self):
+    self._testWithLists(
+        diags=_sample_diags,
+        rhs=np.array([_sample_rhs, 2 * _sample_rhs]),
+        expected=np.array([_sample_result, 2 * _sample_result]),
+        transpose_rhs=True)
+
+  def testConjugateRhs(self):
+    self._testWithLists(
+        diags=_sample_diags,
+        rhs=np.transpose([_sample_rhs * (1 + 1j), _sample_rhs * (1 - 2j)]),
+        expected=np.transpose(
+            [_sample_result * (1 - 1j), _sample_result * (1 + 2j)]),
+        conjugate_rhs=True)
+
+  def testAdjointRhs(self):
+    self._testWithLists(
+        diags=_sample_diags,
+        rhs=np.array([_sample_rhs * (1 + 1j), _sample_rhs * (1 - 2j)]),
+        expected=np.array(
+            [_sample_result * (1 - 1j), _sample_result * (1 + 2j)]),
+        transpose_rhs=True,
+        conjugate_rhs=True)
+
+  def testTransposeRhsWithBatching(self):
+    self._testWithLists(
+        diags=np.array([_sample_diags, -_sample_diags]),
+        rhs=np.array([[_sample_rhs, 2 * _sample_rhs],
+                      [3 * _sample_rhs, 4 * _sample_rhs]]),
+        expected=np.array([[_sample_result, 2 * _sample_result],
+                           [-3 * _sample_result, -4 * _sample_result]]),
+        transpose_rhs=True)
+
+  def testTransposeRhsWithRhsAsVector(self):
+    self._testWithLists(
+        diags=_sample_diags,
+        rhs=_sample_rhs,
+        expected=_sample_result,
+        transpose_rhs=True)
+
+  def testConjugateRhsWithRhsAsVector(self):
+    self._testWithLists(
+        diags=_sample_diags,
+        rhs=_sample_rhs * (1 + 1j),
+        expected=_sample_result * (1 - 1j),
+        conjugate_rhs=True)
+
+  def testTransposeRhsWithRhsAsVectorAndBatching(self):
+    self._testWithLists(
+        diags=np.array([_sample_diags, -_sample_diags]),
+        rhs=np.array([_sample_rhs, 2 * _sample_rhs]),
+        expected=np.array([_sample_result, -2 * _sample_result]),
+        transpose_rhs=True)
+
+  # Invalid input shapes
+
+  def testInvalidShapesCompactFormat(self):
+
+    def test_raises(diags_shape, rhs_shape):
+      self._assertRaises(_tf_ones(diags_shape), _tf_ones(rhs_shape), "compact")
+
+    test_raises((5, 4, 4), (5, 4))
+    test_raises((5, 3, 4), (4, 5))
+    test_raises((5, 3, 4), (5))
+    test_raises((5), (5, 4))
+
+  def testInvalidShapesSequenceFormat(self):
+
+    def test_raises(diags_tuple_shapes, rhs_shape):
+      diagonals = tuple(_tf_ones(shape) for shape in diags_tuple_shapes)
+      self._assertRaises(diagonals, _tf_ones(rhs_shape), "sequence")
+
+    test_raises(((5, 4), (5, 4)), (5, 4))
+    test_raises(((5, 4), (5, 4), (5, 6)), (5, 4))
+    test_raises(((5, 3), (5, 4), (5, 6)), (5, 4))
+    test_raises(((5, 6), (5, 4), (5, 3)), (5, 4))
+    test_raises(((5, 4), (7, 4), (5, 4)), (5, 4))
+    test_raises(((5, 4), (7, 4), (5, 4)), (3, 4))
+
+  def testInvalidShapesMatrixFormat(self):
+
+    def test_raises(diags_shape, rhs_shape):
+      self._assertRaises(_tf_ones(diags_shape), _tf_ones(rhs_shape), "matrix")
+
+    test_raises((5, 4, 7), (5, 4))
+    test_raises((5, 4, 4), (3, 4))
+    test_raises((5, 4, 4), (5, 3))
+
+  # Tests with placeholders
+
+  def _testWithPlaceholders(self,
+                            diags_shape,
+                            rhs_shape,
+                            diags_feed,
+                            rhs_feed,
+                            expected,
+                            diags_format="compact"):
+    if context.executing_eagerly():
+      return
+    diags = array_ops.placeholder(dtypes.float64, shape=diags_shape)
+    rhs = array_ops.placeholder(dtypes.float64, shape=rhs_shape)
+    x = linalg_impl.tridiagonal_solve(diags, rhs, diags_format)
+    with self.cached_session(use_gpu=False) as sess:
+      result = sess.run(x, feed_dict={diags: diags_feed, rhs: rhs_feed})
+      self.assertAllClose(result, expected)
+
+  def testCompactFormatAllDimsUnknown(self):
+    self._testWithPlaceholders(
+        diags_shape=[None, None],
+        rhs_shape=[None],
+        diags_feed=_sample_diags,
+        rhs_feed=_sample_rhs,
+        expected=_sample_result)
+
+  def testCompactFormatUnknownMatrixSize(self):
+    self._testWithPlaceholders(
+        diags_shape=[3, None],
+        rhs_shape=[4],
+        diags_feed=_sample_diags,
+        rhs_feed=_sample_rhs,
+        expected=_sample_result)
+
+  def testCompactFormatUnknownRhsCount(self):
+    self._testWithPlaceholders(
+        diags_shape=[3, 4],
+        rhs_shape=[4, None],
+        diags_feed=_sample_diags,
+        rhs_feed=np.transpose([_sample_rhs, 2 * _sample_rhs]),
+        expected=np.transpose([_sample_result, 2 * _sample_result]))
+
+  def testCompactFormatUnknownBatchSize(self):
+    self._testWithPlaceholders(
+        diags_shape=[None, 3, 4],
+        rhs_shape=[None, 4],
+        diags_feed=np.array([_sample_diags, -_sample_diags]),
+        rhs_feed=np.array([_sample_rhs, 2 * _sample_rhs]),
+        expected=np.array([_sample_result, -2 * _sample_result]))
+
+  def testMatrixFormatWithUnknownDims(self):
+    if context.executing_eagerly():
+      return
+
+    def test_with_matrix_shapes(matrix_shape):
+      matrix = np.array([[1, 2, 0, 0], [1, 3, 1, 0], [0, -1, 2, 4],
+                         [0, 0, 1, 2]])
+      rhs = np.array([1, 2, 3, 4])
+      x = np.array([-9, 5, -4, 4])
+      self._testWithPlaceholders(
+          diags_shape=matrix_shape,
+          rhs_shape=[None, None],
+          diags_feed=matrix,
+          rhs_feed=np.transpose([rhs, 2 * rhs]),
+          expected=np.transpose([x, 2 * x]),
+          diags_format="matrix")
+
+    test_with_matrix_shapes(matrix_shape=[4, 4])
+    test_with_matrix_shapes(matrix_shape=[None, 4])
+    test_with_matrix_shapes(matrix_shape=[4, None])
+    with self.assertRaises(ValueError):
+      test_with_matrix_shapes(matrix_shape=[None, None])
+
+  def testSequenceFormatWithUnknownDims(self):
+    if context.executing_eagerly():
+      return
+    superdiag = array_ops.placeholder(dtypes.float64, shape=[None])
+    diag = array_ops.placeholder(dtypes.float64, shape=[None])
+    subdiag = array_ops.placeholder(dtypes.float64, shape=[None])
+    rhs = array_ops.placeholder(dtypes.float64, shape=[None])
+
+    x = linalg_impl.tridiagonal_solve((superdiag, diag, subdiag),
+                                      rhs,
+                                      diagonals_format="sequence")
+    with self.cached_session(use_gpu=False) as sess:
+      result = sess.run(
+          x,
+          feed_dict={
+              subdiag: [20, 1, -1, 1],
+              diag: [1, 3, 2, 2],
+              superdiag: [2, 1, 4, 20],
+              rhs: [1, 2, 3, 4]
+          })
+      self.assertAllClose(result, [-9, 5, -4, 4])
+
+  # Benchmark
+
+  class TridiagonalSolveBenchmark(test.Benchmark):
+    sizes = [(100000, 1, 1), (1000000, 1, 1), (10000000, 1, 1), (100000, 10, 1),
+             (100000, 100, 1), (10000, 1, 100), (10000, 1, 1000),
+             (10000, 1, 10000)]
+
+    def _generateData(self, matrix_size, batch_size, num_rhs, seed=42):
+      data = random_ops.random_normal(
+          shape=(batch_size, 3 + num_rhs, matrix_size),
+          dtype=dtypes.float64,
+          seed=seed)
+      diags = array_ops.stack([data[:, 0], data[:, 1], data[:, 2]], axis=-2)
+      rhs = data[:, 3:, :]
+      return diags, rhs
+
+    def benchmarkTridiagonalSolveOp(self):
+      for matrix_size, batch_size, num_rhs in self.sizes:
+        with ops.Graph().as_default(), \
+                session.Session(config=benchmark.benchmark_config()) as sess, \
+                ops.device("/cpu:0"):
+          diags, rhs = self._generateData(matrix_size, batch_size, num_rhs)
+          x = linalg_impl.tridiagonal_solve(diags, rhs, transpose_rhs=True)
+          variables.global_variables_initializer().run()
+          self.run_op_benchmark(
+              sess,
+              control_flow_ops.group(x),
+              min_iters=10,
+              store_memory_usage=False,
+              name=("tridiagonal_solve_matrix_size_{}_batch_size_{}_"
+                    "num_rhs_{}").format(matrix_size, batch_size, num_rhs))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/unicode_decode_op_test.py b/tensorflow/python/kernel_tests/unicode_decode_op_test.py
index c165021eea3eba54fbc77aa328acebaccd844a74..9a59f8a7acb8f87381399a556411d523a49d5d37 100644
--- a/tensorflow/python/kernel_tests/unicode_decode_op_test.py
+++ b/tensorflow/python/kernel_tests/unicode_decode_op_test.py
@@ -19,134 +19,686 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
+import numpy as np
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import errors_impl as errors
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_string_ops
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_string_ops
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import test
 
 
-# Account for python2 and python3 execution of the test.
-def codepoint(s):
-  if isinstance(s, bytes):
-    return ord(s.decode("utf-8"))
-  elif isinstance(s, str):
-    return ord(s)
-
-
-class UnicodeDecodeTest(test.TestCase):
-
-  def testBatchDecode(self):
-    text = constant_op.constant(
-        ["仅今年前", "分享介面終於迎來更新"])
-    row_splits, utf8_text, offsets = gen_string_ops.unicode_decode_with_offsets(
-        text, "utf-8")
-
-    with self.test_session():
-      self.assertAllEqual([
-          codepoint("仅"),
-          codepoint("今"),
-          codepoint("年"),
-          codepoint("前"),
-          codepoint("分"),
-          codepoint("享"),
-          codepoint("介"),
-          codepoint("面"),
-          codepoint("終"),
-          codepoint("於"),
-          codepoint("迎"),
-          codepoint("來"),
-          codepoint("更"),
-          codepoint("新")
-      ],
-                          self.evaluate(utf8_text).tolist())
-      self.assertAllEqual([0, 4, 14], self.evaluate(row_splits).tolist())
-      self.assertAllEqual([0, 3, 6, 9, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27],
-                          self.evaluate(offsets).tolist())
-
-  def testBasicDecodeWithOffset(self):
-    text = constant_op.constant(["仅今年前"])
-    row_splits, utf8_text, starts = gen_string_ops.unicode_decode_with_offsets(
-        text, "utf-8")
-
-    with self.test_session():
-      self.assertAllEqual([
-          codepoint("仅"),
-          codepoint("今"),
-          codepoint("年"),
-          codepoint("前"),
-      ],
-                          self.evaluate(utf8_text).tolist())
-      self.assertAllEqual(self.evaluate(row_splits).tolist(), [0, 4])
-      self.assertAllEqual(self.evaluate(starts).tolist(), [0, 3, 6, 9])
-
-  @test_util.run_deprecated_v1
-  def testStrictError(self):
-    text = constant_op.constant([b"\xFEED"])
-    _, error, _ = gen_string_ops.unicode_decode_with_offsets(
-        text, "utf-8", errors="strict")
-
-    with self.assertRaises(errors.InvalidArgumentError):
-      with self.test_session():
-        self.evaluate(error)
-
-  def testReplaceOnError(self):
-    text = constant_op.constant([b"\xFE"])
-
-    _, utf8_text, _ = gen_string_ops.unicode_decode_with_offsets(
-        text, "utf-8", errors="replace")
-
-    with self.test_session():
-      self.assertAllEqual(self.evaluate(utf8_text).tolist(), [65533])
-
-  @test_util.run_deprecated_v1
-  def testBadReplacementChar(self):
-    text = constant_op.constant([b"\xFE"])
-    _, error, _ = gen_string_ops.unicode_decode_with_offsets(
-        text, "utf-8", errors="replace", replacement_char=11141111)
-
-    with self.assertRaises(errors.InvalidArgumentError):
-      with self.test_session():
-        self.evaluate(error)
-
-  def testIgnoreOnError(self):
-    text = constant_op.constant([b"\xFEhello"])
-
-    _, utf8_text, _ = gen_string_ops.unicode_decode_with_offsets(
-        text, "utf-8", errors="ignore")
-
-    with self.test_session():
-      self.assertAllEqual(self.evaluate(utf8_text).tolist(), [
-          codepoint("h"),
-          codepoint("e"),
-          codepoint("l"),
-          codepoint("l"),
-          codepoint("o")
-      ])
-
-  @test_util.run_deprecated_v1
-  def testBadErrorPolicy(self):
-    text = constant_op.constant(["hippopotamus"])
-
-    with self.assertRaises(ValueError):
-      _, _, _ = gen_string_ops.unicode_decode_with_offsets(
-          text, "utf-8", errors="oranguatan")
-
-  def testReplaceControlChars(self):
-    text = constant_op.constant(["\x02仅今年前"])
-    row_splits, utf8_text, _ = gen_string_ops.unicode_decode_with_offsets(
-        text, "utf-8", replace_control_characters=True)
-
-    with self.test_session():
-      self.assertAllEqual([
-          65533,
-          codepoint("仅"),
-          codepoint("今"),
-          codepoint("年"),
-          codepoint("前"),
-      ],
-                          self.evaluate(utf8_text).tolist())
-      self.assertAllEqual([0, 5], self.evaluate(row_splits).tolist())
+def _nested_encode(x, encoding):
+  """Encode each string in a nested list with `encoding`."""
+  if isinstance(x, list):
+    return [_nested_encode(v, encoding) for v in x]
+  else:
+    return x.encode(encoding)
+
+
+def _nested_codepoints(x):
+  """Replace each string in a nested list with a list of its codepoints."""
+  # Works for Python 2 and 3, and for both UCS2 and UCS4 builds
+  if isinstance(x, list):
+    return [_nested_codepoints(v) for v in x]
+  else:
+    b = list(x.encode("utf-32-be"))
+    if any(isinstance(c, str) for c in b):
+      b = [ord(c) for c in b]
+    return [(b0 << 24) + (b1 << 16) + (b2 << 8) + b3
+            for b0, b1, b2, b3 in zip(b[::4], b[1::4], b[2::4], b[3::4])]
+
+
+def _nested_offsets(x, encoding):
+  """Replace each string in a nested list with a list of start offsets."""
+  if isinstance(x, list):
+    return [_nested_offsets(v, encoding) for v in x]
+  else:
+    if not x:
+      return []
+    encoded_x = x.encode("utf-32-be")
+    encoded_chars = [encoded_x[i:i + 4] for i in range(0, len(encoded_x), 4)]
+    char_lens = [
+        len(c.decode("utf-32-be").encode(encoding)) for c in encoded_chars
+    ]
+    return [0] + np.cumsum(char_lens).tolist()[:-1]
+
+
+def _nested_splitchars(x, encoding):
+  """Replace each string in a nested list with a list of char substrings."""
+  if isinstance(x, list):
+    return [_nested_splitchars(v, encoding) for v in x]
+  else:
+    b = x.encode("utf-32-be")
+    chars = zip(b[::4], b[1::4], b[2::4], b[3::4])
+    if str is bytes:
+      return [b"".join(c).decode("utf-32-be").encode(encoding) for c in chars]
+    else:
+      return [bytes(c).decode("utf-32-be").encode(encoding) for c in chars]
+
+
+def _make_sparse_tensor(indices, values, dense_shape, dtype=np.int32):
+  return sparse_tensor.SparseTensorValue(
+      np.array(indices, np.int64), np.array(values, dtype),
+      np.array(dense_shape, np.int64))
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class UnicodeDecodeTest(ragged_test_util.RaggedTensorTestCase,
+                        parameterized.TestCase):
+
+  def testScalarDecode(self):
+    text = constant_op.constant(u"仅今年前".encode("utf-8"))
+    chars = ragged_string_ops.unicode_decode(text, "utf-8")
+    self.assertAllEqual(chars, [ord(c) for c in u"仅今年前"])
+
+  def testScalarDecodeWithOffset(self):
+    text = constant_op.constant(u"仅今年前".encode("utf-8"))
+    chars, starts = ragged_string_ops.unicode_decode_with_offsets(text, "utf-8")
+    self.assertAllEqual(chars, [ord(c) for c in u"仅今年前"])
+    self.assertAllEqual(starts, [0, 3, 6, 9])
+
+  def testVectorDecode(self):
+    text = constant_op.constant([u"仅今年前".encode("utf-8"), b"hello"])
+    chars = ragged_string_ops.unicode_decode(text, "utf-8")
+    expected_chars = [[ord(c) for c in u"仅今年前"],
+                      [ord(c) for c in u"hello"]]
+    self.assertRaggedEqual(chars, expected_chars)
+
+  def testVectorDecodeWithOffset(self):
+    text = constant_op.constant([u"仅今年前".encode("utf-8"), b"hello"])
+    chars, starts = ragged_string_ops.unicode_decode_with_offsets(text, "utf-8")
+    expected_chars = [[ord(c) for c in u"仅今年前"],
+                      [ord(c) for c in u"hello"]]
+    self.assertRaggedEqual(chars, expected_chars)
+    self.assertRaggedEqual(starts, [[0, 3, 6, 9], [0, 1, 2, 3, 4]])
+
+  @parameterized.parameters([
+      {"texts": u"仅今年前"},
+      {"texts": [u"G\xf6\xf6dnight", u"\U0001f60a"]},
+      {"texts": ["Hello", "world", "", u"👍"]},
+      {"texts": [["Hi", "there"], ["", u"\U0001f60a"]], "ragged_rank": 0},
+      {"texts": [["Hi", "there", ""], [u"😊"]], "ragged_rank": 1},
+      {"texts": [[[u"😊", u"🤠🧐"], []], [[u"🤓👻🤖"]]], "ragged_rank": 2},
+      {"texts": []}
+  ])  # pyformat: disable
+  def testBasicDecode(self, texts, ragged_rank=None):
+    input_tensor = ragged_factory_ops.constant_value(
+        _nested_encode(texts, "UTF-8"), ragged_rank=ragged_rank, dtype=bytes)
+    result = ragged_string_ops.unicode_decode(input_tensor, "UTF-8")
+    expected = _nested_codepoints(texts)
+    self.assertRaggedEqual(expected, result)
+
+  @parameterized.parameters([
+      {"texts": u"仅今年前"},
+      {"texts": [u"G\xf6\xf6dnight", u"\U0001f60a"]},
+      {"texts": ["Hello", "world", "", u"👍"]},
+      {"texts": [["Hi", "there"], ["", u"\U0001f60a"]], "ragged_rank": 0},
+      {"texts": [["Hi", "there", ""], [u"😊"]], "ragged_rank": 1},
+      {"texts": [[[u"😊", u"🤠🧐"], []], [[u"🤓👻🤖"]]], "ragged_rank": 2},
+      {"texts": []}
+  ])  # pyformat: disable
+  def testBasicDecodeWithOffsets(self, texts, ragged_rank=None):
+    input_tensor = ragged_factory_ops.constant_value(
+        _nested_encode(texts, "UTF-8"), ragged_rank=ragged_rank, dtype=bytes)
+    result = ragged_string_ops.unicode_decode_with_offsets(
+        input_tensor, "UTF-8")
+    expected_codepoints = _nested_codepoints(texts)
+    expected_offsets = _nested_offsets(texts, "UTF-8")
+    self.assertRaggedEqual(expected_codepoints, result[0])
+    self.assertRaggedEqual(expected_offsets, result[1])
+
+  def testDocstringExamples(self):
+    texts = [s.encode("utf8") for s in [u"G\xf6\xf6dnight", u"\U0001f60a"]]
+    codepoints1 = ragged_string_ops.unicode_decode(texts, "UTF-8")
+    codepoints2, offsets = ragged_string_ops.unicode_decode_with_offsets(
+        texts, "UTF-8")
+    self.assertRaggedEqual(
+        codepoints1, [[71, 246, 246, 100, 110, 105, 103, 104, 116], [128522]])
+    self.assertRaggedEqual(
+        codepoints2, [[71, 246, 246, 100, 110, 105, 103, 104, 116], [128522]])
+    self.assertRaggedEqual(offsets, [[0, 1, 3, 5, 6, 7, 8, 9, 10], [0]])
+
+  @parameterized.parameters([
+      dict(
+          texts=["Hello", "world", "", u"👍"],
+          expected=_make_sparse_tensor(
+              indices=[[0, 0], [0, 1], [0, 2], [0, 3], [0, 4], [1, 0], [1, 1],
+                       [1, 2], [1, 3], [1, 4], [3, 0]],
+              values=[72, 101, 108, 108, 111, 119, 111, 114, 108, 100, 128077],
+              dense_shape=[4, 5])),
+      dict(
+          texts=[["Hi", "there"], ["", u"\U0001f60a"]],
+          expected=_make_sparse_tensor(
+              indices=[[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 1, 1], [0, 1, 2],
+                       [0, 1, 3], [0, 1, 4], [1, 1, 0]],
+              values=[72, 105, 116, 104, 101, 114, 101, 128522],
+              dense_shape=[2, 2, 5])),
+      dict(
+          texts=[],
+          expected=_make_sparse_tensor(np.zeros([0, 2], np.int64), [], [0, 0])),
+  ])
+  def testDecodeWithSparseOutput(self, texts, expected):
+    input_tensor = np.array(_nested_encode(texts, "UTF-8"), dtype=bytes)
+    result = ragged_string_ops.unicode_decode(input_tensor, "UTF-8").to_sparse()
+    self.assertIsInstance(result, sparse_tensor.SparseTensor)
+    self.assertAllEqual(expected.indices, result.indices)
+    self.assertAllEqual(expected.values, result.values)
+    self.assertAllEqual(expected.dense_shape, result.dense_shape)
+
+  @parameterized.parameters([
+      dict(
+          texts=["Hello", "world", "", u"👍"],
+          expected=[[72, 101, 108, 108, 111], [119, 111, 114, 108, 100],
+                    [-1, -1, -1, -1, -1], [128077, -1, -1, -1, -1]]),
+      dict(
+          texts=[["Hi", "there"], ["", u"\U0001f60a"]],
+          expected=[[[72, 105, -1, -1, -1], [116, 104, 101, 114, 101]],
+                    [[-1, -1, -1, -1, -1], [128522, -1, -1, -1, -1]]],
+          ragged_rank=0),
+      dict(
+          texts=[["Hi", "there", ""], [u"😊"]],
+          expected=[[[72, 105, -1, -1, -1],
+                     [116, 104, 101, 114, 101],
+                     [-1, -1, -1, -1, -1]],
+                    [[128522, -1, -1, -1, -1],
+                     [-1, -1, -1, -1, -1],
+                     [-1, -1, -1, -1, -1]]]),
+      dict(
+          texts=[[[u"😊", u"🤠🧐"], []], [[u"🤓👻🤖"]]],
+          expected=[
+              [[[128522, -1, -1], [129312, 129488, -1]],
+               [[-1, -1, -1], [-1, -1, -1]]],
+              [[[129299, 128123, 129302], [-1, -1, -1]],
+               [[-1, -1, -1], [-1, -1, -1]]]]),
+      dict(texts=[], expected=np.zeros([0, 0], np.int64)),
+  ])  # pyformat: disable
+  def testDecodeWithPaddedOutput(self, texts, expected, ragged_rank=None):
+    input_tensor = ragged_factory_ops.constant_value(
+        _nested_encode(texts, "UTF-8"), ragged_rank=ragged_rank, dtype=bytes)
+    result = ragged_string_ops.unicode_decode(
+        input_tensor, "UTF-8").to_tensor(default_value=-1)
+    self.assertAllEqual(expected, result)
+
+  @parameterized.parameters([
+      dict(
+          input=[b"\xFE", b"hello", b"==\xFF==", b"world"],
+          input_encoding="UTF-8",
+          errors="replace",
+          expected=[[65533], [104, 101, 108, 108, 111],
+                    [61, 61, 65533, 61, 61], [119, 111, 114, 108, 100]]),
+      dict(
+          input=[b"\xFE", b"hello", b"==\xFF==", b"world"],
+          input_encoding="UTF-8",
+          errors="replace",
+          replacement_char=0,
+          expected=[[0], [104, 101, 108, 108, 111],
+                    [61, 61, 0, 61, 61], [119, 111, 114, 108, 100]]),
+      dict(
+          input=[b"\xFE", b"hello", b"==\xFF==", b"world"],
+          input_encoding="UTF-8",
+          errors="ignore",
+          expected=[[], [104, 101, 108, 108, 111],
+                    [61, 61, 61, 61], [119, 111, 114, 108, 100]]),
+      dict(
+          input=[b"\x00", b"hello", b"==\x01==", b"world"],
+          input_encoding="UTF-8",
+          replace_control_characters=True,
+          expected=[[65533], [104, 101, 108, 108, 111],
+                    [61, 61, 65533, 61, 61], [119, 111, 114, 108, 100]]),
+      dict(
+          input=[b"\x00", b"hello", b"==\x01==", b"world"],
+          input_encoding="UTF-8",
+          replace_control_characters=True,
+          replacement_char=0,
+          expected=[[0], [104, 101, 108, 108, 111],
+                    [61, 61, 0, 61, 61], [119, 111, 114, 108, 100]]),
+  ])  # pyformat: disable
+  def testErrorModes(self, expected=None, **args):
+    result = ragged_string_ops.unicode_decode(**args)
+    self.assertRaggedEqual(expected, result)
+
+  @parameterized.parameters([
+      dict(
+          input=[b"\xFE", b"hello", b"==\xFF==", b"world"],
+          input_encoding="UTF-8",
+          errors="replace",
+          expected=[[65533], [104, 101, 108, 108, 111],
+                    [61, 61, 65533, 61, 61], [119, 111, 114, 108, 100]],
+          expected_offsets=[[0], [0, 1, 2, 3, 4],
+                            [0, 1, 2, 3, 4], [0, 1, 2, 3, 4]]),
+      dict(
+          input=[b"\xFE", b"hello", b"==\xFF==", b"world"],
+          input_encoding="UTF-8",
+          errors="replace",
+          replacement_char=0,
+          expected=[[0], [104, 101, 108, 108, 111],
+                    [61, 61, 0, 61, 61], [119, 111, 114, 108, 100]],
+          expected_offsets=[[0], [0, 1, 2, 3, 4],
+                            [0, 1, 2, 3, 4], [0, 1, 2, 3, 4]]),
+      dict(
+          input=[b"\xFE", b"hello", b"==\xFF==", b"world"],
+          input_encoding="UTF-8",
+          errors="ignore",
+          expected=[[], [104, 101, 108, 108, 111],
+                    [61, 61, 61, 61], [119, 111, 114, 108, 100]],
+          expected_offsets=[[], [0, 1, 2, 3, 4],
+                            [0, 1, 3, 4], [0, 1, 2, 3, 4]]),
+      dict(
+          input=[b"\x00", b"hello", b"==\x01==", b"world"],
+          input_encoding="UTF-8",
+          replace_control_characters=True,
+          expected=[[65533], [104, 101, 108, 108, 111],
+                    [61, 61, 65533, 61, 61], [119, 111, 114, 108, 100]],
+          expected_offsets=[[0], [0, 1, 2, 3, 4],
+                            [0, 1, 2, 3, 4], [0, 1, 2, 3, 4]]),
+      dict(
+          input=[b"\x00", b"hello", b"==\x01==", b"world"],
+          input_encoding="UTF-8",
+          replace_control_characters=True,
+          replacement_char=0,
+          expected=[[0], [104, 101, 108, 108, 111],
+                    [61, 61, 0, 61, 61], [119, 111, 114, 108, 100]],
+          expected_offsets=[[0], [0, 1, 2, 3, 4],
+                            [0, 1, 2, 3, 4], [0, 1, 2, 3, 4]]),
+  ])  # pyformat: disable
+  def testErrorModesWithOffsets(self,
+                                expected=None,
+                                expected_offsets=None,
+                                **args):
+    result = ragged_string_ops.unicode_decode_with_offsets(**args)
+    self.assertRaggedEqual(result[0], expected)
+    self.assertRaggedEqual(result[1], expected_offsets)
+
+  @parameterized.parameters(
+      ("UTF-8", [u"こんにちは", u"你好", u"Hello"]),
+      ("UTF-16-BE", [u"こんにちは", u"你好", u"Hello"]),
+      ("UTF-32-BE", [u"こんにちは", u"你好", u"Hello"]),
+      ("US-ASCII", [u"Hello", "world"]),
+      ("ISO-8859-1", [u"ÀÈÓ", "AEO"]),
+      ("SHIFT-JIS", [u"Hello", u"こんにちは"]),
+  )
+  def testDecodeWithDifferentEncodings(self, encoding, texts):
+    expected = _nested_codepoints(texts)
+    input_tensor = constant_op.constant(_nested_encode(texts, encoding))
+    result = ragged_string_ops.unicode_decode(input_tensor, encoding)
+    self.assertRaggedEqual(expected, result)
+
+  @parameterized.parameters(
+      ("UTF-8", [u"こんにちは", u"你好", u"Hello"]),
+      ("UTF-16-BE", [u"こんにちは", u"你好", u"Hello"]),
+      ("UTF-32-BE", [u"こんにちは", u"你好", u"Hello"]),
+      ("US-ASCII", [u"Hello", "world"]),
+      ("ISO-8859-1", [u"ÀÈÓ", "AEO"]),
+      ("SHIFT-JIS", [u"Hello", u"こんにちは"]),
+  )
+  def testDecodeWithOffsetsWithDifferentEncodings(self, encoding, texts):
+    expected_codepoints = _nested_codepoints(texts)
+    expected_offsets = _nested_offsets(texts, encoding)
+    input_tensor = constant_op.constant(_nested_encode(texts, encoding))
+    result = ragged_string_ops.unicode_decode_with_offsets(
+        input_tensor, encoding)
+    self.assertRaggedEqual(expected_codepoints, result[0])
+    self.assertRaggedEqual(expected_offsets, result[1])
+
+  @parameterized.parameters([
+      dict(input=[b"\xFEED"],
+           errors="strict",
+           input_encoding="UTF-8",
+           exception=errors.InvalidArgumentError,
+           message="Invalid formatting on input string"),
+      dict(input="x",
+           input_encoding="UTF-8",
+           replacement_char=11141111,
+           exception=errors.InvalidArgumentError,
+           message="replacement_char out of unicode codepoint range"),
+      dict(input="x",
+           input_encoding="UTF-8",
+           errors="oranguatan",
+           exception=(ValueError, errors.InvalidArgumentError)),
+  ])  # pyformat: disable
+  def testExceptions(self, exception=None, message=None, **args):
+    with self.assertRaisesRegexp(exception, message):
+      self.evaluate(ragged_string_ops.unicode_decode(**args))
+
+  def testUnknownRankError(self):
+    if context.executing_eagerly():
+      return
+    s = array_ops.placeholder(dtypes.string)
+    message = "Rank of `input` must be statically known."
+    with self.assertRaisesRegexp(ValueError, message):
+      self.evaluate(ragged_string_ops.unicode_decode(s, input_encoding="UTF-8"))
+
+  @parameterized.parameters([
+      dict(
+          doc="Single string",
+          input=_nested_encode([u"仅今年前"], "utf-8"),
+          input_encoding="UTF-8",
+          expected_char_values=_nested_codepoints(u"仅今年前"),
+          expected_row_splits=[0, 4],
+          expected_char_to_byte_starts=[0, 3, 6, 9]),
+      dict(
+          doc="Multiple strings",
+          input=_nested_encode([u"仅今年前", u"你好"], "utf-8"),
+          input_encoding="UTF-8",
+          expected_char_values=_nested_codepoints(u"仅今年前你好"),
+          expected_row_splits=[0, 4, 6],
+          expected_char_to_byte_starts=[0, 3, 6, 9, 0, 3]),
+      dict(
+          doc="errors=replace",
+          input=b"=\xFE=",
+          input_encoding="UTF-8",
+          errors="replace",
+          expected_char_values=[61, 65533, 61],
+          expected_row_splits=[0, 3],
+          expected_char_to_byte_starts=[0, 1, 2]),
+      dict(
+          doc="errors=ignore",
+          input=b"=\xFE=",
+          input_encoding="UTF-8",
+          errors="ignore",
+          expected_char_values=[61, 61],
+          expected_row_splits=[0, 2],
+          expected_char_to_byte_starts=[0, 2]),
+  ])
+  def testDecodeGenOp(self,
+                      doc,
+                      expected_row_splits=None,
+                      expected_char_values=None,
+                      expected_char_to_byte_starts=None,
+                      **args):
+    """Test for the c++ interface (gen_string_ops.unicode_decode)."""
+    result = gen_string_ops.unicode_decode_with_offsets(**args)
+    self.assertAllEqual(expected_row_splits, result.row_splits)
+    self.assertAllEqual(expected_char_values, result.char_values)
+    self.assertAllEqual(expected_char_to_byte_starts,
+                        result.char_to_byte_starts)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class UnicodeSplitTest(ragged_test_util.RaggedTensorTestCase,
+                       parameterized.TestCase):
+
+  def testScalarSplit(self):
+    text = constant_op.constant(u"仅今年前".encode("UTF-8"))
+    chars = ragged_string_ops.unicode_split(text, "UTF-8")
+    self.assertAllEqual(chars, [c.encode("UTF-8") for c in u"仅今年前"])
+
+  def testScalarSplitWithOffset(self):
+    text = constant_op.constant(u"仅今年前".encode("UTF-8"))
+    chars, starts = ragged_string_ops.unicode_split_with_offsets(text, "UTF-8")
+    self.assertAllEqual(chars, [c.encode("UTF-8") for c in u"仅今年前"])
+    self.assertAllEqual(starts, [0, 3, 6, 9])
+
+  def testVectorSplit(self):
+    text = constant_op.constant([u"仅今年前".encode("UTF-8"), b"hello"])
+    chars = ragged_string_ops.unicode_split(text, "UTF-8")
+    expected_chars = [[c.encode("UTF-8") for c in u"仅今年前"],
+                      [c.encode("UTF-8") for c in u"hello"]]
+    self.assertRaggedEqual(chars, expected_chars)
+
+  def testVectorSplitWithOffset(self):
+    text = constant_op.constant([u"仅今年前".encode("UTF-8"), b"hello"])
+    chars, starts = ragged_string_ops.unicode_split_with_offsets(text, "UTF-8")
+    expected_chars = [[c.encode("UTF-8") for c in u"仅今年前"],
+                      [c.encode("UTF-8") for c in u"hello"]]
+    self.assertRaggedEqual(chars, expected_chars)
+    self.assertRaggedEqual(starts, [[0, 3, 6, 9], [0, 1, 2, 3, 4]])
+
+  @parameterized.parameters([
+      {"texts": u"仅今年前"},
+      {"texts": [u"G\xf6\xf6dnight", u"\U0001f60a"]},
+      {"texts": ["Hello", "world", "", u"👍"]},
+      {"texts": [["Hi", "there"], ["", u"\U0001f60a"]], "ragged_rank": 0},
+      {"texts": [["Hi", "there", ""], [u"😊"]], "ragged_rank": 1},
+      {"texts": [[[u"😊", u"🤠🧐"], []], [[u"🤓👻🤖"]]], "ragged_rank": 2},
+      {"texts": []}
+  ])  # pyformat: disable
+  def testBasicSplit(self, texts, ragged_rank=None):
+    input_tensor = ragged_factory_ops.constant_value(
+        _nested_encode(texts, "UTF-8"), ragged_rank=ragged_rank, dtype=bytes)
+    result = ragged_string_ops.unicode_split(input_tensor, "UTF-8")
+    expected = _nested_splitchars(texts, "UTF-8")
+    self.assertRaggedEqual(expected, result)
+
+  @parameterized.parameters([
+      {"texts": u"仅今年前"},
+      {"texts": [u"G\xf6\xf6dnight", u"\U0001f60a"]},
+      {"texts": ["Hello", "world", "", u"👍"]},
+      {"texts": [["Hi", "there"], ["", u"\U0001f60a"]], "ragged_rank": 0},
+      {"texts": [["Hi", "there", ""], [u"😊"]], "ragged_rank": 1},
+      {"texts": [[[u"😊", u"🤠🧐"], []], [[u"🤓👻🤖"]]], "ragged_rank": 2},
+      {"texts": []}
+  ])  # pyformat: disable
+  def testBasicSplitWithOffsets(self, texts, ragged_rank=None):
+    input_tensor = ragged_factory_ops.constant_value(
+        _nested_encode(texts, "UTF-8"), ragged_rank=ragged_rank, dtype=bytes)
+    result = ragged_string_ops.unicode_split_with_offsets(input_tensor, "UTF-8")
+    expected_codepoints = _nested_splitchars(texts, "UTF-8")
+    expected_offsets = _nested_offsets(texts, "UTF-8")
+    self.assertRaggedEqual(expected_codepoints, result[0])
+    self.assertRaggedEqual(expected_offsets, result[1])
+
+  def testDocstringExamples(self):
+    texts = [s.encode("utf8") for s in [u"G\xf6\xf6dnight", u"\U0001f60a"]]
+    codepoints1 = ragged_string_ops.unicode_split(texts, "UTF-8")
+    codepoints2, offsets = ragged_string_ops.unicode_split_with_offsets(
+        texts, "UTF-8")
+    self.assertRaggedEqual(
+        codepoints1,
+        [[b"G", b"\xc3\xb6", b"\xc3\xb6", b"d", b"n", b"i", b"g", b"h", b"t"],
+         [b"\xf0\x9f\x98\x8a"]])
+    self.assertRaggedEqual(
+        codepoints2,
+        [[b"G", b"\xc3\xb6", b"\xc3\xb6", b"d", b"n", b"i", b"g", b"h", b"t"],
+         [b"\xf0\x9f\x98\x8a"]])
+    self.assertRaggedEqual(offsets, [[0, 1, 3, 5, 6, 7, 8, 9, 10], [0]])
+
+  @parameterized.parameters([
+      dict(
+          texts=["Hello", "world", "", u"👍"],
+          expected=_make_sparse_tensor(
+              indices=[[0, 0], [0, 1], [0, 2], [0, 3], [0, 4], [1, 0], [1, 1],
+                       [1, 2], [1, 3], [1, 4], [3, 0]],
+              values=[b"H", b"e", b"l", b"l", b"o",
+                      b"w", b"o", b"r", b"l", b"d", b"\xf0\x9f\x91\x8d"],
+              dense_shape=[4, 5],
+              dtype=bytes)),
+      dict(
+          texts=[["Hi", "there"], ["", u"\U0001f60a"]],
+          expected=_make_sparse_tensor(
+              indices=[[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 1, 1], [0, 1, 2],
+                       [0, 1, 3], [0, 1, 4], [1, 1, 0]],
+              values=[b"H", b"i", b"t", b"h", b"e", b"r", b"e",
+                      b"\xf0\x9f\x98\x8a"],
+              dense_shape=[2, 2, 5],
+              dtype=bytes)),
+      dict(
+          texts=[],
+          expected=_make_sparse_tensor(
+              np.zeros([0, 2], np.int64), [], [0, 0], dtype=bytes)),
+  ])  # pyformat: disable
+  def testSplitWithSparseOutput(self, texts, expected):
+    input_tensor = np.array(_nested_encode(texts, "UTF-8"), dtype=bytes)
+    result = ragged_string_ops.unicode_split(input_tensor, "UTF-8").to_sparse()
+    self.assertIsInstance(result, sparse_tensor.SparseTensor)
+    self.assertAllEqual(expected.indices, result.indices)
+    self.assertAllEqual(expected.values, result.values)
+    self.assertAllEqual(expected.dense_shape, result.dense_shape)
+
+  @parameterized.parameters([
+      dict(
+          texts=["Hello", "world", "", u"👍"],
+          expected=[[b"H", b"e", b"l", b"l", b"o"],
+                    [b"w", b"o", b"r", b"l", b"d"],
+                    ["", "", "", "", ""],
+                    [b"\xf0\x9f\x91\x8d", "", "", "", ""]]),
+      dict(
+          texts=[["Hi", "there"], ["", u"\U0001f60a"]],
+          expected=[[[b"H", b"i", "", "", ""],
+                     [b"t", b"h", b"e", b"r", b"e"]],
+                    [["", "", "", "", ""],
+                     [b"\xf0\x9f\x98\x8a", "", "", "", ""]]],
+          ragged_rank=0),
+      dict(
+          texts=[["Hi", "there", ""], [u"😊"]],
+          expected=[[[b"H", b"i", "", "", ""],
+                     [b"t", b"h", b"e", b"r", b"e"],
+                     ["", "", "", "", ""]],
+                    [[b"\xf0\x9f\x98\x8a", "", "", "", ""],
+                     ["", "", "", "", ""],
+                     ["", "", "", "", ""]]]),
+      dict(
+          texts=[[[u"😊", u"🤠🧐"], []], [[u"🤓👻🤖"]]],
+          expected=[[[[b"\xf0\x9f\x98\x8a", "", ""],
+                      [b"\xf0\x9f\xa4\xa0", b"\xf0\x9f\xa7\x90", ""]],
+                     [["", "", ""],
+                      ["", "", ""]]],
+                    [[[b"\xf0\x9f\xa4\x93", b"\xf0\x9f\x91\xbb",
+                       b"\xf0\x9f\xa4\x96"],
+                      ["", "", ""]],
+                     [["", "", ""],
+                      ["", "", ""]]]]),
+      dict(texts=[], expected=np.zeros([0, 0], np.int64)),
+  ])  # pyformat: disable
+  def testSplitWithPaddedOutput(self, texts, expected, ragged_rank=None):
+    input_tensor = ragged_factory_ops.constant_value(
+        _nested_encode(texts, "UTF-8"), ragged_rank=ragged_rank, dtype=bytes)
+    result = ragged_string_ops.unicode_split(
+        input_tensor, "UTF-8").to_tensor(default_value="")
+    self.assertAllEqual(np.array(expected, dtype=bytes), result)
+
+  @parameterized.parameters([
+      dict(
+          input=[b"\xFE", b"hello", b"==\xFF==", b"world"],
+          input_encoding="UTF-8",
+          errors="replace",
+          expected=[[b"\xef\xbf\xbd"],
+                    [b"h", b"e", b"l", b"l", b"o"],
+                    [b"=", b"=", b"\xef\xbf\xbd", b"=", b"="],
+                    [b"w", b"o", b"r", b"l", b"d"]]),
+      dict(
+          input=[b"\xFE", b"hello", b"==\xFF==", b"world"],
+          input_encoding="UTF-8",
+          errors="replace",
+          replacement_char=0,
+          expected=[[b"\x00"],
+                    [b"h", b"e", b"l", b"l", b"o"],
+                    [b"=", b"=", b"\x00", b"=", b"="],
+                    [b"w", b"o", b"r", b"l", b"d"]]),
+      dict(
+          input=[b"\xFE", b"hello", b"==\xFF==", b"world"],
+          input_encoding="UTF-8",
+          errors="ignore",
+          expected=[[],
+                    [b"h", b"e", b"l", b"l", b"o"],
+                    [b"=", b"=", b"=", b"="],
+                    [b"w", b"o", b"r", b"l", b"d"]]),
+  ])  # pyformat: disable
+  def testErrorModes(self, expected=None, **args):
+    result = ragged_string_ops.unicode_split(**args)
+    self.assertRaggedEqual(expected, result)
+
+  @parameterized.parameters([
+      dict(
+          input=[b"\xFE", b"hello", b"==\xFF==", b"world"],
+          input_encoding="UTF-8",
+          errors="replace",
+          expected=[[b"\xef\xbf\xbd"],
+                    [b"h", b"e", b"l", b"l", b"o"],
+                    [b"=", b"=", b"\xef\xbf\xbd", b"=", b"="],
+                    [b"w", b"o", b"r", b"l", b"d"]],
+          expected_offsets=[[0], [0, 1, 2, 3, 4],
+                            [0, 1, 2, 3, 4], [0, 1, 2, 3, 4]]),
+      dict(
+          input=[b"\xFE", b"hello", b"==\xFF==", b"world"],
+          input_encoding="UTF-8",
+          errors="replace",
+          replacement_char=0,
+          expected=[[b"\x00"],
+                    [b"h", b"e", b"l", b"l", b"o"],
+                    [b"=", b"=", b"\x00", b"=", b"="],
+                    [b"w", b"o", b"r", b"l", b"d"]],
+          expected_offsets=[[0], [0, 1, 2, 3, 4],
+                            [0, 1, 2, 3, 4], [0, 1, 2, 3, 4]]),
+      dict(
+          input=[b"\xFE", b"hello", b"==\xFF==", b"world"],
+          input_encoding="UTF-8",
+          errors="ignore",
+          expected=[[],
+                    [b"h", b"e", b"l", b"l", b"o"],
+                    [b"=", b"=", b"=", b"="],
+                    [b"w", b"o", b"r", b"l", b"d"]],
+          expected_offsets=[[], [0, 1, 2, 3, 4],
+                            [0, 1, 3, 4], [0, 1, 2, 3, 4]]),
+  ])  # pyformat: disable
+  def testErrorModesWithOffsets(self,
+                                expected=None,
+                                expected_offsets=None,
+                                **args):
+    result = ragged_string_ops.unicode_split_with_offsets(**args)
+    self.assertRaggedEqual(expected, result[0])
+    self.assertRaggedEqual(expected_offsets, result[1])
+
+  @parameterized.parameters(
+      ("UTF-8", [u"こんにちは", u"你好", u"Hello"]),
+      ("UTF-16-BE", [u"こんにちは", u"你好", u"Hello"]),
+      ("UTF-32-BE", [u"こんにちは", u"你好", u"Hello"]),
+  )
+  def testSplitWithDifferentEncodings(self, encoding, texts):
+    expected = _nested_splitchars(texts, encoding)
+    input_tensor = constant_op.constant(_nested_encode(texts, encoding))
+    result = ragged_string_ops.unicode_split(input_tensor, encoding)
+    self.assertRaggedEqual(expected, result)
+
+  @parameterized.parameters(
+      ("UTF-8", [u"こんにちは", u"你好", u"Hello"]),
+      ("UTF-16-BE", [u"こんにちは", u"你好", u"Hello"]),
+      ("UTF-32-BE", [u"こんにちは", u"你好", u"Hello"]),
+  )
+  def testSplitWithOffsetsWithDifferentEncodings(self, encoding, texts):
+    expected_codepoints = _nested_splitchars(texts, encoding)
+    expected_offsets = _nested_offsets(texts, encoding)
+    input_tensor = constant_op.constant(_nested_encode(texts, encoding))
+    result = ragged_string_ops.unicode_split_with_offsets(
+        input_tensor, encoding)
+    self.assertRaggedEqual(expected_codepoints, result[0])
+    self.assertRaggedEqual(expected_offsets, result[1])
+
+  @parameterized.parameters([
+      dict(input=[b"\xFEED"],
+           errors="strict",
+           input_encoding="UTF-8",
+           exception=errors.InvalidArgumentError,
+           message="Invalid formatting on input string"),
+      dict(input="x",
+           input_encoding="UTF-8",
+           replacement_char=11141111,
+           exception=errors.InvalidArgumentError,
+           message="replacement_char out of unicode codepoint range"),
+      dict(input="x",
+           input_encoding="UTF-8",
+           errors="oranguatan",
+           exception=(ValueError, errors.InvalidArgumentError)),
+  ])  # pyformat: disable
+  def testExceptions(self, exception=None, message=None, **args):
+    with self.assertRaisesRegexp(exception, message):
+      self.evaluate(ragged_string_ops.unicode_split(**args))
+
+  def testUnknownRankError(self):
+    if context.executing_eagerly():
+      return
+    s = array_ops.placeholder(dtypes.string)
+    message = "Rank of `input` must be statically known."
+    with self.assertRaisesRegexp(ValueError, message):
+      self.evaluate(ragged_string_ops.unicode_decode(s, input_encoding="UTF-8"))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/variables_test.py b/tensorflow/python/kernel_tests/variables_test.py
index 336e9b0bca2339554339b655e2226ea35558bb00..028ef11fc496725fd6535dd28196e9fadcf2fee4 100644
--- a/tensorflow/python/kernel_tests/variables_test.py
+++ b/tensorflow/python/kernel_tests/variables_test.py
@@ -43,6 +43,11 @@ from tensorflow.python.util import compat
 
 class VariablesTestCase(test.TestCase):
 
+  @test_util.run_deprecated_v1
+  def testDistributeStrategy(self):
+    v = variables.VariableV1(0.0)
+    self.assertIsNone(v._distribute_strategy)
+
   @test_util.run_v1_only("b/120545219")
   def testInitialization(self):
     with self.cached_session():
@@ -66,7 +71,7 @@ class VariablesTestCase(test.TestCase):
       with self.assertRaisesOpError("Attempting to use uninitialized value"):
         self.evaluate(var1)
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       self.assertAllClose(0.0, self.evaluate(var0))
       self.assertAllClose(1.1, self.evaluate(var1))
@@ -96,11 +101,11 @@ class VariablesTestCase(test.TestCase):
       self.assertEqual([3, 6], depdep.get_shape())
       self.assertEqual([3, 6], depdep.shape)
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
-      self.assertAllClose(rnd.eval(), self.evaluate(dep))
-      self.assertAllClose(rnd.eval() + self.evaluate(dep) + 2.0,
-                          self.evaluate(depdep))
+      self.assertAllClose(self.evaluate(rnd), self.evaluate(dep))
+      self.assertAllClose(
+          self.evaluate(rnd) + self.evaluate(dep) + 2.0, self.evaluate(depdep))
 
   def testIterable(self):
     with self.assertRaisesRegexp(TypeError, "not iterable"):
@@ -117,7 +122,7 @@ class VariablesTestCase(test.TestCase):
       plus_one = var.assign_add(1.0)
       minus_one = var.assign_sub(2.0)
       four = var.assign(4.0)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllClose(0.0, self.evaluate(var))
 
       self.assertAllClose(1.0, self.evaluate(plus_one))
@@ -136,7 +141,7 @@ class VariablesTestCase(test.TestCase):
       plus_one = var.assign_add(1.0)
       minus_one = var.assign_sub(2.0)
       four = var.assign(4.0)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllClose(0.0, self.evaluate(var))
 
       self.evaluate(plus_one)
@@ -166,7 +171,7 @@ class VariablesTestCase(test.TestCase):
       var = variables.Variable(zero)
       count_up_to = var.count_up_to(3)
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertEqual(0, self.evaluate(var))
 
       self.assertEqual(0, self.evaluate(count_up_to))
@@ -264,10 +269,10 @@ class VariablesTestCase(test.TestCase):
     with self.cached_session():
       var_x = variables.Variable(2.0)
       var_y = variables.Variable(3.0)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllClose(2.0, self.evaluate(var_x))
       self.assertAllClose(3.0, self.evaluate(var_y))
-      self.assertAllClose(5.0, math_ops.add(var_x, var_y).eval())
+      self.assertAllClose(5.0, self.evaluate(math_ops.add(var_x, var_y)))
 
   @test_util.run_deprecated_v1
   def testZeroSizeVarSameAsConst(self):
@@ -277,9 +282,9 @@ class VariablesTestCase(test.TestCase):
       variable_mul = math_ops.matmul(zero_size_const, zero_size_var)
       const_mul = math_ops.matmul(
           zero_size_const, zero_size_const, transpose_b=True)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variable_output = self.evaluate(variable_mul)
-      self.assertAllClose(const_mul.eval(), variable_output)
+      self.assertAllClose(self.evaluate(const_mul), variable_output)
       self.assertAllClose([[0., 0.], [0., 0.]], variable_output)
 
   @test_util.run_deprecated_v1
@@ -372,7 +377,7 @@ class VariablesTestCase(test.TestCase):
       matmul = var_m.__matmul__([[10.0], [20.0]])
       rmatmul = var_m.__rmatmul__([[10.0], [20.0]])
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllClose([2.0], self.evaluate(add))
       self.assertAllClose([3.0], self.evaluate(radd))
       self.assertAllClose([1.0], self.evaluate(sub))
@@ -409,7 +414,7 @@ class VariablesTestCase(test.TestCase):
   def testSession(self):
     with self.cached_session() as sess:
       var = variables.Variable([1, 12])
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllClose([1, 12], self.evaluate(var))
 
   @test_util.run_v1_only("b/120545219")
@@ -431,7 +436,7 @@ class VariablesTestCase(test.TestCase):
       v1 = variables.Variable(initializer, dtype=dtypes.float32)
       self.assertEqual(shape, v1.get_shape())
       self.assertEqual(shape, v1.shape)
-      self.assertAllClose(value, v1.initial_value.eval())
+      self.assertAllClose(value, self.evaluate(v1.initial_value))
       with self.assertRaises(errors_impl.FailedPreconditionError):
         self.evaluate(v1)
 
@@ -439,11 +444,11 @@ class VariablesTestCase(test.TestCase):
           math_ops.negative(v1.initialized_value()), dtype=dtypes.float32)
       self.assertEqual(v1.get_shape(), v2.get_shape())
       self.assertEqual(v1.shape, v2.shape)
-      self.assertAllClose(np.negative(value), v2.initial_value.eval())
+      self.assertAllClose(np.negative(value), self.evaluate(v2.initial_value))
 
       with self.assertRaises(errors_impl.FailedPreconditionError):
         self.evaluate(v2)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllClose(np.negative(value), self.evaluate(v2))
 
   def testConstraintArg(self):
@@ -465,10 +470,10 @@ class VariablesTestCase(test.TestCase):
       a = variables.Variable([1, 2, 3], dtype=dtypes.float32)
       b = variables.Variable(a.initialized_value() + 2)
       c = variables.Variable(b.initialized_value() + 2)
-      variables.global_variables_initializer().run()
-      self.assertAllEqual(a.eval(), [1, 2, 3])
-      self.assertAllEqual(b.eval(), [3, 4, 5])
-      self.assertAllEqual(c.eval(), [5, 6, 7])
+      self.evaluate(variables.global_variables_initializer())
+      self.assertAllEqual(self.evaluate(a), [1, 2, 3])
+      self.assertAllEqual(self.evaluate(b), [3, 4, 5])
+      self.assertAllEqual(self.evaluate(c), [5, 6, 7])
 
   @test_util.run_deprecated_v1
   def testInitializerFunctionDevicePlacement(self):
@@ -503,7 +508,7 @@ class VariablesTestCase(test.TestCase):
       # initialized_value should not rerun the initializer_op if the variable
       # has already been initialized elsewhere.
       self.evaluate(v.assign(1.0))
-      self.assertEqual(1.0, v.initialized_value().eval())
+      self.assertEqual(1.0, self.evaluate(v.initialized_value()))
 
     v_def.ClearField("initial_value_name")
     with ops.Graph().as_default(), self.cached_session() as sess:
@@ -537,7 +542,7 @@ class VariablesTestCase(test.TestCase):
   def testLoad(self):
     with self.cached_session():
       var = variables.Variable(np.zeros((5, 5), np.float32))
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       var.load(np.ones((5, 5), np.float32))
 
       self.assertAllClose(np.ones((5, 5), np.float32), self.evaluate(var))
@@ -573,7 +578,7 @@ class IsInitializedTest(test.TestCase):
       _ = v, w
       uninited = variables.report_uninitialized_variables()
       self.assertAllEqual(np.array([b"v", b"w"]), self.evaluate(uninited))
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertEqual(0, self.evaluate(uninited).size)
 
   @test_util.run_v1_only("b/120545219")
@@ -601,20 +606,20 @@ class IsInitializedTest(test.TestCase):
       b = variables.Variable(array_ops.ones([2, 2]))
       objective = math_ops.reduce_sum(b + math_ops.matmul(
           a, a, transpose_a=True))
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       do_opt = gradient_descent.GradientDescentOptimizer(0.1).minimize(
           objective)
       self.evaluate([do_opt])
       self.assertAllClose([[0.9, 0.9], [0.9, 0.9]], self.evaluate(b))
 
 
+@test_util.run_v1_only("b/120545219")
 class ObsoleteIsInitializedTest(test.TestCase):
 
   def testNoVars(self):
     with ops.Graph().as_default():
       self.assertEqual(None, variables.assert_variables_initialized())
 
-  @test_util.run_v1_only("b/120545219")
   def testVariables(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
       v = variables.VariableV1([1, 2])
@@ -623,10 +628,9 @@ class ObsoleteIsInitializedTest(test.TestCase):
       inited = variables.assert_variables_initialized()
       with self.assertRaisesOpError("Attempting to use uninitialized value"):
         self.evaluate(inited)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.evaluate(inited)
 
-  @test_util.run_v1_only("b/120545219")
   def testVariableList(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
       v = variables.VariableV1([1, 2])
@@ -766,36 +770,36 @@ class PartitionedVariableTest(test.TestCase):
       assign_list = pv_1.assign([c_0, c_1])
       assign_part_value = pv_1.assign_add(assign_ones)
       assign_part_var = pv_1.assign_sub(pv_0)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
-      self.assertEqual([1.0], plus_delta[0].eval())
+      self.assertEqual([1.0], self.evaluate(plus_delta[0]))
       self.assertEqual([1.0], self.evaluate(v0))
-      self.assertEqual([3.0], plus_delta[1].eval())
+      self.assertEqual([3.0], self.evaluate(plus_delta[1]))
       self.assertEqual([3.0], self.evaluate(v1))
 
-      self.assertEqual([-2.0], minus_delta[0].eval())
+      self.assertEqual([-2.0], self.evaluate(minus_delta[0]))
       self.assertEqual([-2.0], self.evaluate(v0))
-      self.assertEqual([-1.0], minus_delta[1].eval())
+      self.assertEqual([-1.0], self.evaluate(minus_delta[1]))
       self.assertEqual([-1.0], self.evaluate(v1))
 
-      self.assertEqual([1.0], assign_ones[0].eval())
+      self.assertEqual([1.0], self.evaluate(assign_ones[0]))
       self.assertEqual([1.0], self.evaluate(v0))
-      self.assertEqual([1.0], assign_ones[1].eval())
+      self.assertEqual([1.0], self.evaluate(assign_ones[1]))
       self.assertEqual([1.0], self.evaluate(v1))
 
-      self.assertEqual([2.0], assign_list[0].eval())
+      self.assertEqual([2.0], self.evaluate(assign_list[0]))
       self.assertEqual([2.0], self.evaluate(v2))
-      self.assertEqual([3.0], assign_list[1].eval())
+      self.assertEqual([3.0], self.evaluate(assign_list[1]))
       self.assertEqual([3.0], self.evaluate(v3))
 
-      self.assertEqual([3.0], assign_part_value[0].eval())
+      self.assertEqual([3.0], self.evaluate(assign_part_value[0]))
       self.assertEqual([3.0], self.evaluate(v2))
-      self.assertEqual([4.0], assign_part_value[1].eval())
+      self.assertEqual([4.0], self.evaluate(assign_part_value[1]))
       self.assertEqual([4.0], self.evaluate(v3))
 
-      self.assertEqual([2.0], assign_part_var[0].eval())
+      self.assertEqual([2.0], self.evaluate(assign_part_var[0]))
       self.assertEqual([2.0], self.evaluate(v2))
-      self.assertEqual([3.0], assign_part_var[1].eval())
+      self.assertEqual([3.0], self.evaluate(assign_part_var[1]))
       self.assertEqual([3.0], self.evaluate(v3))
 
 
diff --git a/tensorflow/python/kernel_tests/while_v2_test.py b/tensorflow/python/kernel_tests/while_v2_test.py
index cae459a34e934cc804a56f5738202377a1227274..2c6d275900a2a3b6ec8988ebfcb0f73fc9833cb8 100644
--- a/tensorflow/python/kernel_tests/while_v2_test.py
+++ b/tensorflow/python/kernel_tests/while_v2_test.py
@@ -22,6 +22,9 @@ from absl.testing import parameterized
 
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import meta_graph
@@ -29,12 +32,11 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.grappler import tf_optimizer
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import list_ops
+from tensorflow.python.ops import map_fn
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import tensor_array_grad  # pylint: disable=unused-import
-from tensorflow.python.ops import tensor_array_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.ops import while_v2
 from tensorflow.python.ops.control_flow_ops import while_loop as while_loop_v1
 from tensorflow.python.ops.while_v2 import while_loop as while_loop_v2
@@ -66,6 +68,21 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
       self.assertEqual(16., eval_result[0])
       self.assertSequenceEqual(sess.run(grad), [32.])
 
+  def testGradientTapeResourceVariable(self):
+    with context.eager_mode():
+      v = variables.Variable(1.)
+
+      @def_function.function
+      def fnWithLoop():  # pylint: disable=invalid-name
+        with backprop.GradientTape() as tape:
+          _, x = while_loop_v2(
+              lambda i, _: i < 2,
+              lambda i, x: (i + 1, x * v),
+              [0, 2.])
+        return tape.gradient(x, v)
+
+      self.assertAllEqual(fnWithLoop(), 4.0)
+
   @test_util.run_deprecated_v1
   def testMultipleLoopVarsBasic(self):
     x = constant_op.constant(5.)
@@ -118,6 +135,18 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
       self.assertSequenceEqual(self.evaluate(grady_1), [6.])
       self.assertSequenceEqual(self.evaluate(grady_2), [61.])
 
+  @test_util.run_deprecated_v1
+  def testGradientTape(self):
+    with backprop.GradientTape() as t:
+      x = constant_op.constant(2.)
+      t.watch(x)
+      ret = while_loop_v2(
+          lambda v: v < 4., lambda v: v * v, [x],
+          return_same_structure=False)  # x**2
+    grad = t.gradient(ret, x)
+    with self.cached_session() as sess:
+      self.assertAllEqual(sess.run(grad), 4.0)
+
   @test_util.run_deprecated_v1
   def testMultipleWhileLoops(self):
     x = constant_op.constant(2.)
@@ -365,7 +394,7 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
     param = constant_op.constant(2.0)
     y0 = constant_op.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="elems")
     # map_fn uses TensorArray internally.
-    r = functional_ops.map_fn(lambda x: math_ops.multiply(x, param), y0)
+    r = map_fn.map_fn(lambda x: math_ops.multiply(x, param), y0)
     grad = gradients_impl.gradients(r, param)[0]
     self.assertAllClose([2.0, 4.0, 6.0, 8.0, 10.0, 12.0], self.evaluate(r))
     self.assertAllClose(21.0, self.evaluate(grad))
@@ -410,38 +439,6 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
       self.assertEqual(self.evaluate(ret), 16.)
       self.assertSequenceEqual(self.evaluate(grad), [32.])
 
-  @test_util.run_deprecated_v1
-  def testNestedWhileAndTensorArray(self):
-    n = constant_op.constant(3.0)
-
-    def Body(row, ta, n):
-
-      def InnerBody(row, col, ta, n):
-        # Note: row and col are 1-based.
-        ta = ta.write(
-            math_ops.cast(n * (row - 1.) + col - 1., dtypes.int32), row * col)
-        return row, col + 1., ta, n
-
-      # TODO(b/118457764): Remove n from loop_vars from both loops once fixed.
-      ta = while_loop_v2(
-          lambda _, col, _1, n: col <= n,
-          InnerBody, [row, constant_op.constant(1.), ta, n],
-          return_same_structure=False)[2]
-      return row + 1., ta, n
-
-    ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=9)
-    ta = while_loop_v2(
-        lambda row, _, _1: row <= n,
-        Body, [constant_op.constant(1.), ta, n],
-        return_same_structure=False)[1]
-
-    output = array_ops.reshape(ta.stack(), [3, 3])
-    self.assertAllEqual(
-        self.evaluate(output), [[1., 2., 3.], [2., 4., 6.], [3., 6., 9.]])
-    # TODO(b/117675481): This does not work with current TA. Enable with new TA.
-    # grad = gradients_impl.gradients(output, [n])
-    # self.assertEqual(self.evaluate(grad), 3.5)
-
   @test_util.run_deprecated_v1
   def testForwardPassRewrite(self):
     x = constant_op.constant(1.0, name="x")
diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py
index bfe591f875556c9dbcf3001bec4fe836bca3593f..bb50442dca1a5109b793a0daa950e4a2b9abd913 100644
--- a/tensorflow/python/layers/base.py
+++ b/tensorflow/python/layers/base.py
@@ -26,6 +26,7 @@ from tensorflow.python.keras.engine import base_layer
 from tensorflow.python.keras.engine import base_layer_utils
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.ops import variables as tf_variables
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import function_utils
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_contextlib
@@ -551,6 +552,10 @@ class Layer(base_layer.Layer):
         setattr(result, k, copy.deepcopy(v, memo))
     return result
 
+  def __setattr__(self, value, name):
+    # By-pass the automatic dependency tracking performed by the parent Layer.
+    super(trackable.Trackable, self).__setattr__(value, name)
+
 
 def _add_elements_to_collection(elements, collection_list):
   if context.executing_eagerly():
diff --git a/tensorflow/python/layers/convolutional.py b/tensorflow/python/layers/convolutional.py
index 5d4805e245e17376e8719466868326b34d7cf12d..03344c844d35aa74c09ccc9cc308fa921b4d1789 100644
--- a/tensorflow/python/layers/convolutional.py
+++ b/tensorflow/python/layers/convolutional.py
@@ -117,7 +117,7 @@ class Conv1D(keras_layers.Conv1D, base.Layer):
 
 @deprecation.deprecated(
     date=None,
-    instructions='Use keras.layers.conv1d instead.')
+    instructions='Use tf.keras.layers.Conv1D instead.')
 @tf_export(v1=['layers.conv1d'])
 def conv1d(inputs,
            filters,
@@ -316,7 +316,7 @@ class Conv2D(keras_layers.Conv2D, base.Layer):
 
 @deprecation.deprecated(
     date=None,
-    instructions='Use keras.layers.conv2d instead.')
+    instructions='Use tf.keras.layers.Conv2D instead.')
 @tf_export(v1=['layers.conv2d'])
 def conv2d(inputs,
            filters,
@@ -523,7 +523,7 @@ class Conv3D(keras_layers.Conv3D, base.Layer):
 
 @deprecation.deprecated(
     date=None,
-    instructions='Use keras.layers.conv3d instead.')
+    instructions='Use tf.keras.layers.Conv3D instead.')
 @tf_export(v1=['layers.conv3d'])
 def conv3d(inputs,
            filters,
@@ -853,7 +853,7 @@ class SeparableConv2D(keras_layers.SeparableConv2D, base.Layer):
 
 @deprecation.deprecated(
     date=None,
-    instructions='Use keras.layers.separable_conv1d instead.')
+    instructions='Use tf.keras.layers.SeparableConv1D instead.')
 @tf_export(v1=['layers.separable_conv1d'])
 def separable_conv1d(inputs,
                      filters,
@@ -973,7 +973,7 @@ def separable_conv1d(inputs,
 
 @deprecation.deprecated(
     date=None,
-    instructions='Use keras.layers.separable_conv2d instead.')
+    instructions='Use tf.keras.layers.SeparableConv2D instead.')
 @tf_export(v1=['layers.separable_conv2d'])
 def separable_conv2d(inputs,
                      filters,
@@ -1183,7 +1183,7 @@ class Conv2DTranspose(keras_layers.Conv2DTranspose, base.Layer):
 
 @deprecation.deprecated(
     date=None,
-    instructions='Use keras.layers.conv2d_transpose instead.')
+    instructions='Use tf.keras.layers.Conv2DTranspose instead.')
 @tf_export(v1=['layers.conv2d_transpose'])
 def conv2d_transpose(inputs,
                      filters,
@@ -1363,7 +1363,7 @@ class Conv3DTranspose(keras_layers.Conv3DTranspose, base.Layer):
 
 @deprecation.deprecated(
     date=None,
-    instructions='Use keras.layers.conv3d_transpose instead.')
+    instructions='Use tf.keras.layers.Conv3DTranspose instead.')
 @tf_export(v1=['layers.conv3d_transpose'])
 def conv3d_transpose(inputs,
                      filters,
diff --git a/tensorflow/python/layers/core.py b/tensorflow/python/layers/core.py
index b2d54a98272be53b69872e900901d9552177a172..7e12dcacd86a2f792743316f65a97806c7028fc0 100644
--- a/tensorflow/python/layers/core.py
+++ b/tensorflow/python/layers/core.py
@@ -64,7 +64,7 @@ class Dense(keras_layers.Dense, base.Layer):
       `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
     name: String, the name of the layer. Layers with the same name will
       share weights, but to avoid mistakes we require reuse=True in such cases.
-    reuse: Boolean, whether to reuse the weights of a previous layer
+    _reuse: Boolean, whether to reuse the weights of a previous layer
       by the same name.
 
   Properties:
diff --git a/tensorflow/python/lib/core/py_seq_tensor.cc b/tensorflow/python/lib/core/py_seq_tensor.cc
index f681cff6cff35bfd8ed0e3a880d26936a54fabee..77fbfd51bbbace5b043b719de45c474476f69fd4 100644
--- a/tensorflow/python/lib/core/py_seq_tensor.cc
+++ b/tensorflow/python/lib/core/py_seq_tensor.cc
@@ -64,6 +64,19 @@ bool IsPyFloat(PyObject* obj) {
          PyIsInstance(obj, &PyFloatingArrType_Type);  // NumPy float types
 }
 
+// If the input is a zero dimensional PyArray return it converted to a scalar.
+// Otherwise return the input and increment its reference count.
+// Users must Py_DECREF the output of this method.
+PyObject* ZeroDimArrayToScalar(PyObject* obj) {
+  if (PyArray_IsZeroDim(obj) && !PyArray_IsScalar(obj, Generic)) {
+    auto pyarray_obj = reinterpret_cast<PyArrayObject*>(obj);
+    obj = PyArray_ToScalar(PyArray_DATA(pyarray_obj), pyarray_obj);
+  } else {
+    Py_INCREF(obj);
+  }
+  return obj;
+}
+
 // Converts Python object `c` that should hold a Python string into a
 // C++ string in *out.  Returns nullptr on success, or a message on error.
 // Defined below, but forward declared here for use in PyRepr.
@@ -130,6 +143,10 @@ Status SampleElementFromSequence(PyObject* seq, PyObject** elem) {
 Status InferShapeAndType(PyObject* obj, TensorShape* shape, DataType* dtype) {
   std::vector<Safe_PyObjectPtr> refs_to_clean;
   while (true) {
+    // Convert any zero dimensional numpy arrays to scalars first of all.
+    // We also have to make sure a reference to the safe_obj is kept.
+    obj = ZeroDimArrayToScalar(obj);
+    refs_to_clean.push_back(make_safe(obj));
     // We test strings first, in case a string is considered a sequence.
     if (IsPyString(obj)) {
       *dtype = DT_STRING;
@@ -240,7 +257,9 @@ const char ErrorFoundFloat[] =
       }                                                                   \
       PyObject** l = PySequence_Fast_ITEMS(seq.get());                    \
       for (int64 i = 0; i < s; ++i) {                                     \
-        const char* error = CONVERT(l[i], *buf);                          \
+        auto scalar = ZeroDimArrayToScalar(l[i]);                         \
+        const char* error = CONVERT(scalar, *buf);                        \
+        Py_DECREF(scalar);                                                \
         if (TF_PREDICT_FALSE(error != nullptr)) return error;             \
         ++*buf;                                                           \
       }                                                                   \
@@ -253,7 +272,9 @@ const char ErrorFoundFloat[] =
     Tensor result(TYPE_ENUM, shape);                                      \
     if (shape.dims() == 0) { /* Scalar case */                            \
       TYPE value;                                                         \
-      const char* error = CONVERT(obj, &value);                           \
+      auto scalar = ZeroDimArrayToScalar(obj);                            \
+      const char* error = CONVERT(scalar, &value);                        \
+      Py_DECREF(scalar);                                                  \
       if (error != nullptr) return error;                                 \
       result.scalar<TYPE>()() = value;                                    \
     } else {                                                              \
@@ -331,8 +352,25 @@ DEFINE_HELPER(ConvertInt32, int32, DT_INT32, ConvertOneInt32);
 
 template <class T>
 const char* ConvertOneFloat(PyObject* v, T* out) {
+  if (PyErr_Occurred()) {
+    return nullptr;
+  }
   if (TF_PREDICT_TRUE(PyFloat_Check(v))) {
-    *out = PyFloat_AS_DOUBLE(v);
+    double as_double = PyFloat_AsDouble(v);
+    // Handle infinity.
+    if (as_double == std::numeric_limits<double>::infinity()) {
+      *out = std::numeric_limits<T>::infinity();
+      return nullptr;
+    } else if (as_double == -1 * std::numeric_limits<double>::infinity()) {
+      *out = -1 * std::numeric_limits<T>::infinity();
+      return nullptr;
+    }
+    // Check for overflow.
+    if (as_double > std::numeric_limits<T>::max() ||
+        as_double < std::numeric_limits<T>::lowest()) {
+      return ErrorOutOfRangeDouble;
+    }
+    *out = static_cast<T>(as_double);
     return nullptr;
   }
 #if PY_MAJOR_VERSION < 3
@@ -348,6 +386,9 @@ const char* ConvertOneFloat(PyObject* v, T* out) {
   }
   if (PyIsInstance(v, &PyFloatingArrType_Type)) {  // NumPy float types
     Safe_PyObjectPtr as_float = make_safe(PyNumber_Float(v));
+    if (PyErr_Occurred()) {
+      return nullptr;
+    }
     return ConvertOneFloat<T>(as_float.get(), out);
   }
   if (PyIsInstance(v, &PyIntegerArrType_Type)) {  // NumPy integers
@@ -356,6 +397,9 @@ const char* ConvertOneFloat(PyObject* v, T* out) {
 #else
     Safe_PyObjectPtr as_int = make_safe(PyNumber_Long(v));
 #endif
+    if (PyErr_Occurred()) {
+      return nullptr;
+    }
     return ConvertOneFloat<T>(as_int.get(), out);
   }
   return ErrorMixedTypes;
diff --git a/tensorflow/python/lib/io/file_io.i b/tensorflow/python/lib/io/file_io.i
index 0aa08ea3d15af40173186e0e1741a5b9f3d147bd..135e9e38d74b09975270242eb01a08038b5a2199 100644
--- a/tensorflow/python/lib/io/file_io.i
+++ b/tensorflow/python/lib/io/file_io.i
@@ -220,6 +220,16 @@ void AppendToFile(const string& file_content, tensorflow::WritableFile* file,
   }
 }
 
+int64 TellFile(tensorflow::WritableFile* file, TF_Status* out_status) {
+  int64 position = -1;
+  tensorflow::Status status = file->Tell(&position);
+  if (!status.ok()) {
+    Set_TF_Status_from_Status(out_status, status);
+  }
+  return position;
+}
+
+
 string ReadFromStream(tensorflow::io::BufferedInputStream* stream,
                       size_t bytes,
                       TF_Status* out_status) {
@@ -265,6 +275,7 @@ tensorflow::WritableFile* CreateWritableFile(const string& filename,
                                              TF_Status* out_status);
 void AppendToFile(const string& file_content, tensorflow::WritableFile* file,
                   TF_Status* out_status);
+int64 TellFile(tensorflow::WritableFile* file, TF_Status* out_status);
 string ReadFromStream(tensorflow::io::BufferedInputStream* stream,
                       size_t bytes,
                       TF_Status* out_status);
diff --git a/tensorflow/python/lib/io/file_io.py b/tensorflow/python/lib/io/file_io.py
index ee55d89bffcbaca2a68cbb028ae9ca5157e6f6df..2720962084b19a57ceefee64e604ce2376a53f78 100644
--- a/tensorflow/python/lib/io/file_io.py
+++ b/tensorflow/python/lib/io/file_io.py
@@ -196,8 +196,14 @@ class FileIO(object):
 
   def tell(self):
     """Returns the current position in the file."""
-    self._preread_check()
-    return self._read_buf.Tell()
+    if self._read_check_passed:
+      self._preread_check()
+      return self._read_buf.Tell()
+    else:
+      self._prewrite_check()
+
+      with errors.raise_exception_on_not_ok_status() as status:
+        return pywrap_tensorflow.TellFile(self._writable_file, status)
 
   def __enter__(self):
     """Make usable with "with" statement."""
diff --git a/tensorflow/python/module/BUILD b/tensorflow/python/module/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..55909cc2672b4e601f3a6e5607c1efe1b10e06cc
--- /dev/null
+++ b/tensorflow/python/module/BUILD
@@ -0,0 +1,31 @@
+package(default_visibility = ["//tensorflow:internal"])
+
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+py_library(
+    name = "module",
+    srcs = ["module.py"],
+    deps = [
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/training/tracking",
+        "@six_archive//:six",
+    ],
+)
+
+tf_py_test(
+    name = "module_test",
+    srcs = ["module_test.py"],
+    additional_deps = [
+        ":module",
+        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python/compat:v2_compat",
+        "//tensorflow/python:variables",
+    ],
+)
diff --git a/tensorflow/python/module/module.py b/tensorflow/python/module/module.py
new file mode 100644
index 0000000000000000000000000000000000000000..07455bd7f7cd9455c18f92a5eb4c898735d5e376
--- /dev/null
+++ b/tensorflow/python/module/module.py
@@ -0,0 +1,457 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Modules encapsulate building stateful components."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+import re
+import sys
+
+import six
+
+from tensorflow.python.eager import def_function
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import variables
+from tensorflow.python.training.tracking import tracking
+from tensorflow.python.util import nest
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_inspect
+from tensorflow.python.util.tf_export import tf_export
+
+NO_MODULE_NAME_SCOPE = "__no_module_name_scope__"
+
+
+class ModuleMetaclass(abc.ABCMeta):
+  """Metaclass for `tf.Module`."""
+
+  def __new__(mcs, name, bases, clsdict):
+    methods = []
+
+    for key, value in clsdict.items():
+      if key == "name_scope":
+        continue
+
+      elif key.startswith("__") and key != "__call__":
+        # Don't patch methods like `__getattr__` or `__del__`.
+        continue
+
+      elif tf_inspect.isfunction(value):
+        # We defer patching methods until after the type is created such that we
+        # can trigger the descriptor binding them to the class.
+        methods.append(key)
+
+      elif isinstance(value, property):
+        # TODO(tomhennigan) Preserve the type of property subclasses.
+        clsdict[key] = property(
+            value.fget if not value.fget else with_name_scope(value.fget),
+            value.fset if not value.fset else with_name_scope(value.fset),
+            value.fdel if not value.fdel else with_name_scope(value.fdel),
+            doc=value.__doc__)
+
+    cls = super(ModuleMetaclass, mcs).__new__(mcs, name, bases, clsdict)
+
+    for method_name in methods:
+      # Note: the below is quite subtle, we need to ensure that we're wrapping
+      # the method bound to the class. In some cases (e.g. `wrapt`) this is
+      # important since the method can trigger different behavior when it is
+      # bound (e.g. in wrapt `FunctionWrapper.__get__(None, cls)` produces a
+      # `BoundFunctionWrapper` which in turn populates the `instance` argument
+      # to decorator functions using args[0]).
+      # Equivalent to: `cls.__dict__[method_name].__get__(None, cls)`
+      method = getattr(cls, method_name)
+      method = with_name_scope(method)
+      setattr(cls, method_name, method)
+
+    return cls
+
+  def __call__(cls, *args, **kwargs):
+    # Call new such that we have an un-initialized module instance that we can
+    # still reference even if there is an exception during __init__. This is
+    # needed such that we can make sure the name_scope constructed in __init__
+    # is closed even if there is an exception.
+    module = cls.__new__(cls, *args, **kwargs)
+
+    # Now attempt to initialize the object.
+    try:
+      module.__init__(*args, **kwargs)
+    except:
+      # We must explicitly catch so that in Python 2 sys.exc_info() is populated
+      # before entering the finally block.
+      raise
+
+    finally:
+      # The base Module constructor enters the modules name scope before
+      # returning such that other functionality in the ctor happens within the
+      # modules name scope.
+      scope = getattr(module, "_ctor_name_scope", None)
+      exc_info = sys.exc_info()
+      if scope is None:
+        if exc_info[0] is None:
+          raise ValueError(
+              "Constructing a tf.Module without calling the super constructor "
+              "is not supported. Add the following as the first line in your "
+              "__init__ method:\n\n"
+              "super(%s, self).__init__()" % cls.__name__)
+      else:
+        scope.__exit__(*exc_info)
+        del module._ctor_name_scope
+
+    return module
+
+
+def wrap_with_name_scope(unbound_method):
+  """Patches the given method so it enters the modules name scope."""
+  def enter_name_scope(self, *args, **kwargs):
+    """Decorator that calls the given function in the module name scope.
+
+    Args:
+      self: Module instance.
+      *args: Positional arguments to `unbound_method`.
+      **kwargs: Keyword arguments to `unbound_method`.
+
+    Returns:
+      `with self.name_scope: return unbound_method(self, *args, **kwargs)`
+    """
+    try:
+      module_name_scope = self.name_scope
+    except AttributeError as exc_value_from:
+      exc_value = AttributeError(
+          "The super constructor must be called before any other methods in "
+          "your constructor. If this is not possible then annotate all the "
+          "methods called with `@no_module_name_scope`.")
+      six.raise_from(exc_value, exc_value_from)
+
+    with module_name_scope:
+      # tf.Module enters the module name scope for all methods. To disable this
+      # for a particular method annotate it with `@no_module_name_scope`.
+      return unbound_method(self, *args, **kwargs)
+
+  return enter_name_scope
+
+
+def wrap_with_name_scope_no_exception(unbound_method):
+  """Patches the given method so it enters the modules name scope."""
+  def enter_name_scope(self, *args, **kwargs):
+    with self.name_scope:
+      # tf.Module enters the module name scope for all methods. To disable this
+      # for a particular method annotate it with `@no_module_name_scope`.
+      return unbound_method(self, *args, **kwargs)
+  return enter_name_scope
+
+
+def with_name_scope(unbound_method):
+  """Patches the given method so it enters the modules name scope."""
+  if getattr(unbound_method, NO_MODULE_NAME_SCOPE, False):
+    # The function has been annotated to say that no autoscoping should be
+    # applied, so do not patch it.
+    return unbound_method
+
+  if isinstance(unbound_method, def_function.Function):
+    # Autograph cannot convert functions that have try/catch.
+    unbound_method._decorate(wrap_with_name_scope_no_exception)  # pylint: disable=protected-access
+    return unbound_method
+  else:
+    return tf_decorator.make_decorator(unbound_method,
+                                       wrap_with_name_scope(unbound_method))
+
+
+@tf_export("Module")
+class Module(six.with_metaclass(ModuleMetaclass, tracking.AutoTrackable)):
+  """Base neural network module class.
+
+  A module is a named container for `tf.Variable`s, other `tf.Module`s and
+  functions which apply to user input. For example a dense layer in a neural
+  network might be implemented as a `tf.Module`:
+
+  >>> class Dense(tf.Module):
+  ...   def __init__(self, in_features, output_features):
+  ...     super(Dense, self).__init__()
+  ...     self.w = tf.Variable(
+  ...         tf.random_normal([input_features, output_features]), name='w')
+  ...     self.b = tf.Variable(tf.zeros([output_features]), name='b')
+  ...
+  ...   def __call__(self, x):
+  ...     x = tf.convert_to_tensor(x, name='x')
+  ...     y = tf.matmul(x, self.w) + self.b
+  ...     return tf.nn.relu(y)
+
+  You can use the dense layer as you would expect:
+
+  >>> d = Dense(input_features=64, output_features=10)
+  >>> d(tf.ones([100, 64]))
+  <tf.Tensor: ...>
+
+  By subclassing `tf.Module` instead of `object` any variables created inside
+  the module are automatically created within the modules name scope:
+
+  >>> d.w.name
+  "dense/w:0"
+
+  In eager mode this is useful for debugging, and when used with `@tf.function`
+  the use of name scopes gives operations (e.g. matmul) useful names as well.
+
+  As well as automatic naming, the Dense module inherits methods for tracking
+  its variables:
+
+  >>> d.variables
+  (<tf.Variable 'dense/b:0' ...>, <tf.Variable 'dense/w:0' ...>)
+  """
+
+  def __init__(self, name=None):
+    if name is None:
+      name = camel_to_snake(type(self).__name__)
+    else:
+      if not valid_identifier(name):
+        raise ValueError(
+            "%r is not a valid module name. Module names must be valid Python "
+            "identifiers (e.g. a valid class name)." % name)
+
+    self._name = name
+    with ops.name_scope(name) as scope_name:
+      self._scope_name = scope_name
+
+    # Enter the name scope so subsequent code in the contructor (e.g. creating
+    # submodules) happens inside the modules name scope. This is exited when
+    # the subclass __init__ returns (this is implemented in ModuleMetaclass).
+    self._ctor_name_scope = self.name_scope
+    self._ctor_name_scope.__enter__()
+
+  @property
+  def name(self):
+    """Returns the name of this module as passed or determined in the ctor.
+
+    NOTE: This is not the same as the `self.name_scope.name` which includes
+    parent module names.
+    """
+    return self._name
+
+  @property
+  def name_scope(self):
+    """Returns a `tf.name_scope` instance for this class."""
+    # TODO(tomhennigan) Memoize once name scopes are re-entrant.
+    return ops.name_scope(self._scope_name)
+
+  @property
+  def variables(self):
+    """Sequence of variables owned by this module and it's submodules.
+
+    Note: this method uses reflection to find variables on the current instance
+    and submodules. For performance reasons you may wish to cache the result
+    of calling this method if you don't expect the return value to change.
+
+    Returns:
+      A sequence of variables for the current module (sorted by attribute
+      name) followed by variables from all submodules recursively (breadth
+      first).
+    """
+    return tuple(self._flatten(predicate=_IS_VARIABLE))
+
+  @property
+  def trainable_variables(self):
+    """Sequence of variables owned by this module and it's submodules.
+
+    Note: this method uses reflection to find variables on the current instance
+    and submodules. For performance reasons you may wish to cache the result
+    of calling this method if you don't expect the return value to change.
+
+    Returns:
+      A sequence of variables for the current module (sorted by attribute
+      name) followed by variables from all submodules recursively (breadth
+      first).
+    """
+    return tuple(self._flatten(predicate=_IS_TRAINABLE_VARIABLE))
+
+  @property
+  def submodules(self):
+    """Sequence of all sub-modules.
+
+    Submodules are modules which are properties of this module, or found as
+    properties of modules which are properties of this module (and so on).
+
+    >>> a = tf.Module()
+    >>> b = tf.Module()
+    >>> c = tf.Module()
+    >>> a.b = b
+    >>> b.c = c
+    >>> assert list(a.submodules) == [b, c]
+    >>> assert list(b.submodules) == [c]
+    >>> assert list(c.submodules) == []
+
+    Returns:
+      A sequence of all submodules.
+    """
+    return tuple(self._flatten(predicate=_IS_MODULE))
+
+  def _flatten(self,
+               recursive=True,
+               predicate=None,
+               attribute_traversal_key=None,
+               with_path=False):
+    """Flattened attribute values in sorted order by attribute name.
+
+    Modules are flattened by first walking their attributes in name order.
+    Each attribute value is then flattened to find leaf values. If flatten is
+    to be applied `recursive`ly then if the leaf is a `Module` it will also be
+    flattened to find leaves. Finally every leaf value is optionally tested
+    against the given `predicate` and finally yielded.
+
+    >>> class Foo(tf.Module):
+    ...   def __init__(self):
+    ...     super(Foo, self).__init__()
+    ...     self.x = [tf.constant('a'), tf.constant('b')]
+    ...     self.y = {'i': tf.constant('c'), 'j': tf.constant('d')}
+    ...     self.z = tf.constant('e')
+    ...
+    ...   @property
+    ...   def tensors(self):
+    ...     return tuple(self._flatten(predicate=is_tensor, with_path=True))
+
+    >>> foo = Foo()
+    >>> foo.tensors
+    ((('x', 0),   <tf.Tensor: ...'a'>),
+     (('x', 1),   <tf.Tensor: ...'b'>),
+     (('y', 'i'), <tf.Tensor: ...'c'>),
+     (('y', 'j'), <tf.Tensor: ...'d'>),
+     (('z',),     <tf.Tensor: ...'e'>))
+
+    `attribute_traversal_key` controls the order object properties are visited.
+    If not set objects are visited in ascending order by name.
+
+    Args:
+      recursive: Whether to recurse into child modules or not.
+      predicate: (Optional) If set then only values matching predicate are
+        yielded. A value of `None` (the default) means no items will be
+        filtered.
+      attribute_traversal_key: (Optional) Method to rekey object attributes
+        before they are sorted. Contract is the same as `key` argument to
+        builtin `sorted` and only applies to object properties.
+      with_path: (Optional) Whether to include the path to the object as well
+        as the object itself. If `with_path` is `True` then leaves will not be
+        de-duplicated (e.g. if the same leaf instance is reachable via multiple
+        modules then it will be yielded multiple times with different paths).
+
+    Returns:
+      Flat generator for leaves of the current module and optionally all
+      submodules.
+    """
+    if predicate is None:
+      predicate = lambda _: True
+
+    return _flatten_module(
+        self,
+        recursive=recursive,
+        predicate=predicate,
+        attribute_traversal_key=attribute_traversal_key,
+        with_path=with_path)
+
+  @classmethod
+  def no_name_scope(cls, method):
+    """Decorator to wrap a method, preventing automatic name scope wrapping.
+
+    By default, any method on a module is considered as a forwards function, and
+    so any variables / modules created by the method will be scoped as belonging
+    to the module. In some cases this is undesirable, for example when
+    implementing .clone() / .transpose(), as in those cases we want the new
+    module to have the scope of wherever the .transpose() call is made. To
+    allow this, decorate any methods with `no_module_name_scope`.
+
+    This logic is tied to ModuleMetaclass.__new__, if anything is
+    changed here corresponding changes will be needed there.
+
+    Args:
+      method: the method to wrap.
+
+    Returns:
+      The method, with a flag indicating no name scope wrapping should occur.
+    """
+    setattr(method, NO_MODULE_NAME_SCOPE, True)
+    return method
+
+_IS_VARIABLE = lambda o: isinstance(o, variables.Variable)
+_IS_TRAINABLE_VARIABLE = lambda o: (_IS_VARIABLE(o) and o.trainable)
+_IS_MODULE = lambda o: isinstance(o, Module)
+_CAMEL_TO_SNAKE_R = re.compile(r"((?<=[a-z0-9])[A-Z]|(?!^)[A-Z](?=[a-z]))")
+_VALID_IDENTIFIER = re.compile(r"^[a-zA-Z_]([a-zA-Z0-9_])*$")
+
+
+def valid_identifier(name):
+  return bool(_VALID_IDENTIFIER.match(name))
+
+
+def camel_to_snake(value):
+  return _CAMEL_TO_SNAKE_R.sub(r"_\1", value).lower()
+
+
+# AutoTrackable adds object attributes that users will not expect us to
+# include when flattening (these reference dependencies reachable via other
+# object attributes).
+AUTO_CHECKPOINTABLE_ATTRS = ("_unconditional_checkpoint_dependencies",
+                             "_unconditional_dependency_names")
+
+
+def _flatten_module(module,
+                    recursive,
+                    predicate,
+                    attribute_traversal_key,
+                    with_path,
+                    module_path=(),
+                    seen=None):
+  """Implementation of `flatten`."""
+  if seen is None:
+    seen = set([id(module)])
+
+  module_dict = vars(module)
+  submodules = []
+
+  for key in sorted(module_dict, key=attribute_traversal_key):
+    if key in AUTO_CHECKPOINTABLE_ATTRS:
+      continue
+
+    for leaf_path, leaf in nest.flatten_with_tuple_paths(module_dict[key]):
+      leaf_path = (key,) + leaf_path
+
+      # TODO(tomhennigan) Handle cycles for `with_path=True` (e.g. `a.a = a`).
+      if not with_path:
+        leaf_id = id(leaf)
+        if leaf_id in seen:
+          continue
+        seen.add(leaf_id)
+
+      if predicate(leaf):
+        if with_path:
+          yield module_path + leaf_path, leaf
+        else:
+          yield leaf
+
+      if recursive and isinstance(leaf, Module):
+        # Walk direct properties first then recurse.
+        submodules.append((module_path + leaf_path, leaf))
+
+  for submodule_path, submodule in submodules:
+    subvalues = _flatten_module(
+        submodule,
+        recursive=recursive,
+        predicate=predicate,
+        attribute_traversal_key=attribute_traversal_key,
+        with_path=with_path,
+        module_path=submodule_path,
+        seen=seen)
+
+    for subvalue in subvalues:
+      # Predicate is already tested for these values.
+      yield subvalue
diff --git a/tensorflow/python/module/module_test.py b/tensorflow/python/module/module_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..52bb97bc5bc6379e93ab586d5290be2be408485b
--- /dev/null
+++ b/tensorflow/python/module/module_test.py
@@ -0,0 +1,556 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.Module`."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+import collections
+
+from absl.testing import parameterized
+import six
+
+from tensorflow.python.compat import v2_compat
+from tensorflow.python.eager import def_function
+from tensorflow.python.framework import ops
+from tensorflow.python.module import module
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class TestModuleNaming(test.TestCase):
+
+  def test_single_name(self):
+    mod = module.Module(name="simple")
+    self.assertEqual(mod.name, "simple")
+    self.assertEqual(mod.name_scope.name, "simple/")
+
+  def test_construct_in_scope(self):
+    with ops.name_scope("foo"):
+      mod = module.Module(name="bar")
+    self.assertEqual(mod.name, "bar")
+    self.assertEqual(mod.name_scope.name, "foo/bar/")
+
+  def test_enters_name_scope_in_call(self):
+    mod = ReturnsNameScopeModule()
+    for _ in range(3):
+      self.assertEqual(mod(), mod.name_scope.name)
+
+  def test_enters_name_scope_in_other_method(self):
+    mod = ReturnsNameScopeModule()
+    for _ in range(3):
+      self.assertEqual(mod.alternative_forward(), mod.name_scope.name)
+
+  def test_subclassed_module(self):
+    mod = SubclassedReturnsNameScopeModule()
+    for _ in range(3):
+      self.assertEqual(mod.alternative_forward(), mod.name_scope.name)
+      self.assertEqual(mod.alternative_alternative_forward(),
+                       mod.name_scope.name)
+
+  def test_submodule_created_late(self):
+    m = TreeModule()
+    self.assertEqual(m.name, "tree_module")
+    self.assertEqual(m.name_scope.name, "tree_module/")
+    leaf1 = m.new_leaf()
+    self.assertEqual(leaf1.name, "tree_module")
+    self.assertEqual(leaf1.name_scope.name, "tree_module/tree_module/")
+
+  def test_does_not_evaluate_property_methods(self):
+    mod = PropertyThrowsWhenCalledModule()
+    with self.assertRaises(AssertionError):
+      mod.raise_assertion_error  # pylint: disable=pointless-statement
+
+  def test_overridden_name_scope(self):
+    mod = ModuleOverridingNameScope()
+    self.assertEqual(mod(), mod.name_scope.name)
+    self.assertEqual(mod.alternative_forward(), mod.name_scope.name)
+
+  def test_patched_callable(self):
+    with ops.name_scope("foo"):
+      mod = module.Module(name="bar")
+    mod.foo = get_name_scope
+    # `foo` is not a method so we do not re-enter the name scope.
+    self.assertEqual(mod.foo(), "")
+
+  def test_property(self):
+    mod = PropertyModule()
+    mod.some_property = None, None  # None, None for the linter.
+    getter_scope_name, setter_scope_name = mod.some_property
+    self.assertEqual(getter_scope_name, "property_module/")
+    self.assertEqual(setter_scope_name, "property_module/")
+
+  def test_property_no_name_scope(self):
+    mod = PropertyModule()
+    mod.no_name_scope_property = None, None  # None, None for the linter.
+    getter_scope_name, setter_scope_name = mod.no_name_scope_property
+    self.assertEqual(getter_scope_name, "")
+    self.assertEqual(setter_scope_name, "")
+
+  def test_invalid_name(self):
+    msg = ".* is not a valid module name"
+    with self.assertRaisesRegexp(ValueError, msg):
+      module.Module(name="$Foo")
+
+  def test_modules_not_numbered_in_eager(self):
+    mod = RecursiveModule(2)
+    self.assertEqual(mod.name_scope.name, "badger/")
+    self.assertEqual(mod.child.name_scope.name, "badger/badger/")
+
+    mod = RecursiveModule(2)
+    self.assertEqual(mod.name_scope.name, "badger/")
+    self.assertEqual(mod.child.name_scope.name, "badger/badger/")
+
+  def test_module_numbering_in_graph(self):
+    with ops.Graph().as_default():
+      mod = RecursiveModule(2)
+      self.assertEqual(mod.name_scope.name, "badger/")
+      self.assertEqual(mod.child.name_scope.name, "badger/badger/")
+
+      mod = RecursiveModule(2)
+      self.assertEqual(mod.name_scope.name, "badger_1/")
+      self.assertEqual(mod.child.name_scope.name, "badger_1/badger/")
+
+  def test_ctor_error_closes_name_scope(self):
+    with self.assertRaises(ErrorModuleError):
+      # If super constructor is called then a name scope is opened then an error
+      # is thrown. The metaclass should handle this and close the namescope
+      # before re-throwing the exception.
+      ErrorModule(call_super=True)
+
+    self.assertEqual("", get_name_scope())
+
+  def test_ctor_error_handles_ctor_not_opening_name_scope(self):
+    with self.assertRaises(ErrorModuleError):
+      # If super ctor is not called then the name scope isn't opened. We need to
+      # ensure that this doesn't trigger an exception (e.g. the metaclass trying
+      # to __exit__ a non-existant name scope).
+      ErrorModule(call_super=False)
+
+    self.assertEqual("", get_name_scope())
+
+  def test_forward_method_closes_name_scope(self):
+    mod = ErrorModule(call_super=True, raise_in_constructor=False)
+    with self.assertRaises(ErrorModuleError):
+      mod()
+
+    self.assertEqual("", get_name_scope())
+
+  def test_get_attr_doesnt_enter_name_scope(self):
+    scope_names = []
+
+    class GetAttrModule(module.Module):
+
+      def __getattr__(self, name):
+        scope_names.append((name, get_name_scope()))
+        return super(GetAttrModule, self).__getattr__(name)
+
+    mod = GetAttrModule()
+    with self.assertRaises(AttributeError):
+      mod.does_not_exist  # pylint: disable=pointless-statement
+    self.assertIn(("does_not_exist", ""), scope_names)
+
+  def test_get_attribute_doesnt_enter_name_scope(self):
+    scope_names = []
+
+    class GetAttributeModule(module.Module):
+
+      def __getattribute__(self, name):
+        scope_names.append((name, get_name_scope()))
+        return super(GetAttributeModule, self).__getattribute__(name)
+
+    mod = GetAttributeModule()
+    with self.assertRaises(AttributeError):
+      mod.does_not_exist  # pylint: disable=pointless-statement
+    self.assertIn(("does_not_exist", ""), scope_names)
+
+
+class VariableNamingTest(test.TestCase):
+
+  def test_variable_names(self):
+    mod = RecursiveModule(3)
+    self.assertEqual(mod.w.name, "badger/mushroom:0")
+    self.assertEqual(mod.child.w.name, "badger/badger/mushroom:0")
+    self.assertEqual(mod.child.child.w.name, "badger/badger/badger/mushroom:0")
+
+
+class VariableTrackingTest(test.TestCase):
+
+  def test_variables(self):
+    m = RecursiveModule(3)
+    self.assertEqual(m.variables, (m.w, m.child.w, m.child.child.w))
+    self.assertEqual(m.child.variables, (m.child.w, m.child.child.w))
+    self.assertEqual(m.child.child.variables, (m.child.child.w,))
+
+  def test_trainable_variables(self):
+    m = RecursiveModule(3)
+    self.assertEqual(m.trainable_variables,
+                     (m.w, m.child.w, m.child.child.w))
+    self.assertEqual(m.child.trainable_variables,
+                     (m.child.w, m.child.child.w))
+    self.assertEqual(m.child.child.trainable_variables, (m.child.child.w,))
+
+  def test_trainable_variables_ignores_non_trainable(self):
+    m = RecursiveModule(3, trainable=False)
+    self.assertEqual(len(m.trainable_variables), 0)
+    self.assertEqual(len(m.child.trainable_variables), 0)
+    self.assertEqual(len(m.child.child.trainable_variables), 0)
+
+
+class ModuleTrackingTest(test.TestCase):
+
+  def test_submodules(self):
+    m = RecursiveModule(3)
+    self.assertEqual(list(m.submodules), [m.child, m.child.child])
+    self.assertEqual(list(m.child.submodules), [m.child.child])
+    self.assertEqual(list(m.child.child.submodules), [])
+
+  def test_non_ctor_submodule(self):
+    m = TreeModule()
+    leaf1 = m.new_leaf()
+    self.assertEqual(set(m.submodules), {leaf1})
+    leaf2 = m.new_leaf()
+    self.assertEqual(set(m.submodules), {leaf1, leaf2})
+
+
+class CommonErrorsTest(test.TestCase):
+
+  def test_not_calling_super_constructor(self):
+    msg = ("Constructing a tf.Module without calling the super constructor is "
+           "not supported")
+    with self.assertRaisesRegexp(ValueError, msg):
+      DoesNotCallSuperConstructorModule()
+
+  def test_calls_method_before_super(self):
+    msg = "super constructor must be called before any other methods"
+    with self.assertRaisesRegexp(AttributeError, msg):
+      CallsMethodBeforeSuperConstructorModule(allowed_method=False)
+
+  def test_annotated_method_is_allowed(self):
+    self.assertIsNotNone(
+        CallsMethodBeforeSuperConstructorModule(allowed_method=True))
+
+
+class ForwardMethodsTest(test.TestCase):
+
+  def testFunctionType(self):
+    mod = ModuleWithFunctionAnnotatedCall()
+    self.assertTrue(isinstance(mod.forward, def_function.Function))
+    self.assertTrue(isinstance(mod.forward_ag, def_function.Function))
+
+  def testEntersNameScope_call(self):
+    mod = ModuleWithFunctionAnnotatedCall()
+    self.assertEqual(mod.forward().numpy(),
+                     b"module_with_function_annotated_call/")
+    self.assertEqual(mod.forward_ag().numpy(),
+                     b"module_with_function_annotated_call/")
+
+  def testEntersNameScope_concreteFunction(self):
+    mod = ModuleWithFunctionAnnotatedCall()
+    self.assertEqual(mod.forward.get_concrete_function()().numpy(),
+                     b"module_with_function_annotated_call/")
+    self.assertEqual(mod.forward_ag.get_concrete_function()().numpy(),
+                     b"module_with_function_annotated_call/")
+
+
+class AbcTest(test.TestCase):
+
+  def testAbstract(self):
+    msg = "Can't instantiate .* abstract methods"
+    with self.assertRaisesRegexp(TypeError, msg):
+      AbstractModule()  # pylint: disable=abstract-class-instantiated
+
+  def testConcrete(self):
+    mod = ConcreteModule()
+    x, scope_name = mod(2.)
+    self.assertEqual(x, 4.)
+    self.assertEqual(scope_name, "concrete_module/")
+    self.assertEqual(get_name_scope(), "")
+
+
+def get_name_scope():
+  with ops.name_scope("x") as ns:
+    return ns[:-2]
+
+
+class ErrorModuleError(Exception):
+  pass
+
+
+class ErrorModule(module.Module):
+
+  def __init__(self, call_super, raise_in_constructor=True):
+    if call_super:
+      super(ErrorModule, self).__init__()
+    if raise_in_constructor:
+      raise ErrorModuleError("Deliberate error!")
+
+  def __call__(self):
+    raise ErrorModuleError("Deliberate error!")
+
+
+class RecursiveModule(module.Module):
+
+  def __init__(self, depth, trainable=True):
+    super(RecursiveModule, self).__init__(name="badger")
+    self.child = None
+    if depth > 1:
+      self.child = RecursiveModule(depth - 1, trainable=trainable)
+    self.w = variables.Variable(1.0, trainable=trainable, name="mushroom")
+
+
+@six.add_metaclass(abc.ABCMeta)
+class AbstractModule(module.Module):
+
+  @abc.abstractmethod
+  def __call__(self, x):
+    pass
+
+
+class ConcreteModule(AbstractModule):
+
+  def __call__(self, x):
+    return x ** 2, get_name_scope()
+
+
+class TreeModule(module.Module):
+
+  def __init__(self, name=None):
+    super(TreeModule, self).__init__(name=name)
+    self._leaves = []
+
+  def new_leaf(self, name=None):
+    leaf = TreeModule(name=name)
+    self._leaves.append(leaf)
+    return leaf
+
+
+class ReturnsNameScopeModule(module.Module):
+
+  def alternative_forward(self):
+    return get_name_scope()
+
+  def __call__(self):
+    return get_name_scope()
+
+
+class SubclassedReturnsNameScopeModule(ReturnsNameScopeModule):
+
+  def alternative_alternative_forward(self):
+    return get_name_scope()
+
+
+class PropertyThrowsWhenCalledModule(module.Module):
+
+  @property
+  def raise_assertion_error(self):
+    raise AssertionError
+
+
+class ModuleOverridingNameScope(ReturnsNameScopeModule):
+
+  @property
+  def name_scope(self):
+    return ops.name_scope("yolo/")
+
+
+class DoesNotCallSuperConstructorModule(module.Module):
+
+  def __init__(self):
+    # NOTE: Intentionally does not call super constructor.
+    pass
+
+
+class CallsMethodBeforeSuperConstructorModule(module.Module):
+
+  def __init__(self, allowed_method):
+    if allowed_method:
+      self.no_name_scope()
+    else:
+      self.with_name_scope()
+    super(CallsMethodBeforeSuperConstructorModule, self).__init__()
+
+  @module.Module.no_name_scope
+  def no_name_scope(self):
+    pass
+
+  def with_name_scope(self):
+    pass
+
+
+class ModuleWithFunctionAnnotatedCall(module.Module):
+
+  @def_function.function(autograph=False)
+  def forward(self):
+    return get_name_scope()
+
+  @def_function.function(autograph=True)
+  def forward_ag(self):
+    return get_name_scope()
+
+
+class PropertyModule(module.Module):
+
+  def __init__(self):
+    super(PropertyModule, self).__init__()
+    self._setter_scope_name = None
+
+  @property
+  def some_property(self):
+    getter_scope_name = get_name_scope()
+    return getter_scope_name, self._setter_scope_name
+
+  @some_property.setter
+  def some_property(self, my_property):
+    self._setter_scope_name = get_name_scope()
+
+  @property
+  @module.Module.no_name_scope
+  def no_name_scope_property(self):
+    getter_scope_name = get_name_scope()
+    return getter_scope_name, self._setter_scope_name
+
+  @no_name_scope_property.setter
+  @module.Module.no_name_scope
+  def no_name_scope_property(self, my_property):
+    self._setter_scope_name = get_name_scope()
+
+NamedPair = collections.namedtuple("NamedPair", ("first", "second"))
+mk_index_dict = lambda v: dict(enumerate(v))
+
+
+class FlattenTest(parameterized.TestCase, test.TestCase):
+
+  @parameterized.parameters(lambda v: NamedPair(*v), list, tuple, mk_index_dict)
+  def test_flatten(self, container_type):
+    parent = SimpleModule(container_type=container_type)
+    child = parent.c
+
+    self.assertEqual(
+        list(parent._flatten(recursive=False, predicate=IS_MEMBER)),
+        [parent.a[0], parent.a[1], parent.z])
+
+    self.assertEqual(
+        list(parent._flatten(predicate=IS_MEMBER)),
+        [parent.a[0], parent.a[1], parent.z, child.a[0], child.a[1], child.z])
+
+  def test_attribute_traversal_key(self):
+    mod = LayerModule()
+    self.assertEqual(
+        mod.variables,
+        mod._trainable_variables + mod._non_trainable_variables + [mod._bonus])
+
+  def test_with_path(self):
+    mod = module.Module()
+    mod.w = variables.Variable(1.)
+    mod.encoder = module.Module()
+    mod.encoder.w = [({"k": mod.w}, {"k": mod.w})]
+    mod.decoder = mod.encoder
+
+    state_dict = dict(
+        mod._flatten(with_path=True, predicate=module._IS_VARIABLE))
+
+    self.assertEqual(state_dict,
+                     {("w",): mod.w,
+                      ("encoder", "w", 0, 0, "k"): mod.encoder.w[0][0]["k"],
+                      ("encoder", "w", 0, 1, "k"): mod.encoder.w[0][1]["k"],
+                      ("decoder", "w", 0, 0, "k"): mod.decoder.w[0][0]["k"],
+                      ("decoder", "w", 0, 1, "k"): mod.decoder.w[0][1]["k"]},)
+
+
+class LayerModule(module.Module):
+
+  def __init__(self):
+    super(LayerModule, self).__init__()
+    self._trainable_variables = [
+        variables.Variable(1., name="a"),
+        variables.Variable(2., name="b"),
+    ]
+    self._non_trainable_variables = [
+        variables.Variable(3., name="c"),
+        variables.Variable(4., name="d"),
+    ]
+    self._bonus = variables.Variable(5., name="e")
+
+  @property
+  def variables(self):
+    def key_function(name):
+      indexes = {"_trainable_variables": 0, "_non_trainable_variables": 1}
+      return indexes.get(name, 2), name
+
+    return list(self._flatten(predicate=module._IS_VARIABLE,
+                              attribute_traversal_key=key_function))
+
+
+class MemberType(object):
+  """A simple type to search for."""
+  pass
+
+
+class SimpleModule(module.Module):
+
+  def __init__(self, create_child=True, container_type=list):
+    super(SimpleModule, self).__init__()
+    self.z = MemberType()
+    self.a = container_type([MemberType(), MemberType()])
+    if create_child:
+      self.c = SimpleModule(create_child=False)
+
+
+IS_MEMBER = lambda v: isinstance(v, MemberType)
+IS_MODULE = lambda v: isinstance(v, module.Module)
+
+
+class CustomMetaclass(type):
+
+  TAG = "__custom_metaclass__"
+
+  def __new__(mcs, name, bases, clsdict):
+    new_type = super(CustomMetaclass, mcs).__new__(mcs, name, bases, clsdict)
+    setattr(new_type, CustomMetaclass.TAG, True)
+    return new_type
+
+
+class CombiningMetaclass(module.ModuleMetaclass, CustomMetaclass):
+
+  TAG = "__combining_metaclass__"
+
+  def __new__(mcs, name, bases, clsdict):
+    new_type = super(CombiningMetaclass, mcs).__new__(mcs, name, bases, clsdict)
+    setattr(new_type, CombiningMetaclass.TAG, True)
+    return new_type
+
+
+@six.add_metaclass(CombiningMetaclass)
+class ModuleWithCustomMetaclass(module.Module):
+
+  def __init__(self):
+    super(ModuleWithCustomMetaclass, self).__init__()
+    self.init_name_scope = get_name_scope()
+
+
+class CustomMetaclassTest(test.TestCase):
+
+  def testSupportsCustomMetaclass(self):
+    m = ModuleWithCustomMetaclass()
+    self.assertEqual(m.init_name_scope, "module_with_custom_metaclass/")
+    self.assertTrue(getattr(ModuleWithCustomMetaclass, CombiningMetaclass.TAG))
+    self.assertTrue(getattr(ModuleWithCustomMetaclass, CustomMetaclass.TAG))
+
+if __name__ == "__main__":
+  v2_compat.enable_v2_behavior()
+  test.main()
diff --git a/tensorflow/python/ops/array_grad.py b/tensorflow/python/ops/array_grad.py
index 45e741ef222b1dcde21b66ab6cdc3db9576a85ce..57572328f6d4a4f8284887f86ce3694bf0551f37 100644
--- a/tensorflow/python/ops/array_grad.py
+++ b/tensorflow/python/ops/array_grad.py
@@ -839,16 +839,11 @@ def _ScatterNdNonAliasingAddGrad(op, grad):
 def _BroadcastToGrad(op, grad):
   input_value = op.inputs[0]
   broadcast_shape = op.inputs[1]
-  # Assign ids for each position in input_value.
   input_value_shape = array_ops.shape(input_value)
-  input_value_size = array_ops.size(input_value)
-  ids = array_ops.reshape(math_ops.range(input_value_size), input_value_shape)
-  broadcast_ids = array_ops.broadcast_to(ids, broadcast_shape)
-  # Group by ids and sum its gradients.
-  grad_flatten = array_ops.reshape(grad, [-1])
-  broadcast_ids_flatten = array_ops.reshape(broadcast_ids, [-1])
-  updates_grad_flatten = math_ops.unsorted_segment_sum(grad_flatten,
-                                                       broadcast_ids_flatten,
-                                                       input_value_size)
-  updates_grad = array_ops.reshape(updates_grad_flatten, input_value_shape)
+  _, reduction_axes = gen_array_ops.broadcast_gradient_args(broadcast_shape,
+                                                            input_value_shape)
+  updates_grad_reshaped = math_ops.reduce_sum(grad,
+                                              axis=reduction_axes,
+                                              keepdims=True)
+  updates_grad = array_ops.reshape(updates_grad_reshaped, input_value_shape)
   return [updates_grad, None]
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index 9dabbffb138093db6d3bd0dcf983d2f6cfdc5081..8db23c467a85649422d376bcf8b1289aa3937e19 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -67,16 +67,20 @@ def identity(input, name=None):  # pylint: disable=redefined-builtin
   Returns:
     A `Tensor`. Has the same type as `input`.
   """
-  if context.executing_eagerly():
+  if context.executing_eagerly() and not hasattr(input, "graph"):
     input = ops.convert_to_tensor(input)
-    in_device = input.device
+    in_device = input.backing_device
     # TODO(ashankar): Does 'identity' need to invoke execution callbacks?
     context_device = context.context().device_name
     if not context_device:
       context_device = "/job:localhost/replica:0/task:0/device:CPU:0"
-    if context_device != in_device:
-      return input._copy()  # pylint: disable=protected-access
-    return input
+    if context_device == in_device:
+      return input
+    else:
+      copied = input._copy()  # pylint: disable=protected-access
+      if hasattr(copied, "_handle_data"):
+        copied._handle_data = input._handle_data  # pylint: disable=protected-access
+      return copied
   else:
     ret = gen_array_ops.identity(input, name=name)
     # Propagate handle data for happier shape inference for resource variables.
@@ -87,6 +91,7 @@ def identity(input, name=None):  # pylint: disable=redefined-builtin
 
 # pylint: disable=redefined-builtin,protected-access
 @tf_export(v1=["expand_dims"])
+@dispatch.add_dispatch_support
 @deprecation.deprecated_args(None, "Use the `axis` argument instead", "dim")
 def expand_dims(input, axis=None, name=None, dim=None):
   """Inserts a dimension of 1 into a tensor's shape.
@@ -356,12 +361,14 @@ def shape_n(input, out_type=dtypes.int32, name=None):
 
 
 @tf_export("size", v1=[])
+@dispatch.add_dispatch_support
 def size_v2(input, out_type=dtypes.int32, name=None):
   # pylint: disable=redefined-builtin
   return size(input, name, out_type)
 
 
 @tf_export(v1=["size"])
+@dispatch.add_dispatch_support
 def size(input, name=None, out_type=dtypes.int32):
   # pylint: disable=redefined-builtin
   """Returns the size of a tensor.
@@ -429,6 +436,7 @@ def size_internal(input, name=None, optimize=True, out_type=dtypes.int32):
 
 
 @tf_export("rank")
+@dispatch.add_dispatch_support
 def rank(input, name=None):
   # pylint: disable=redefined-builtin
   """Returns the rank of a tensor.
@@ -889,7 +897,7 @@ def _SliceHelperVar(var, slice_spec):
 
   """
 
-  return _slice_helper(var._AsTensor(), slice_spec, var)
+  return _slice_helper(var.value(), slice_spec, var)
 
 
 ops.Tensor._override_operator("__getitem__", _slice_helper)
@@ -2658,7 +2666,10 @@ def required_space_to_batch_paddings(input_shape,
 
 @tf_export(v1=["nn.space_to_batch", "space_to_batch"])
 @deprecation.deprecated_endpoints("space_to_batch")
-def space_to_batch(input, paddings, block_size, name=None):  # pylint: disable=redefined-builtin
+def space_to_batch(  # pylint: disable=missing-docstring
+    input, paddings, block_size=None, name=None, block_shape=None):  # pylint: disable=redefined-builtin
+  block_size = deprecation.deprecated_argument_lookup(
+      "block_shape", block_shape, "block_size", block_size)
   result = space_to_batch_nd(
       input,
       paddings=paddings,
@@ -2714,7 +2725,9 @@ depth_to_space_v2.__doc__ = gen_array_ops.depth_to_space.__doc__
 
 
 @tf_export(v1=["batch_to_space"])
-def batch_to_space(input, crops, block_size, name=None):  # pylint: disable=redefined-builtin
+def batch_to_space(input, crops, block_size, name=None, block_shape=None):  # pylint: disable=redefined-builtin,missing-docstring
+  block_size = deprecation.deprecated_argument_lookup(
+      "block_shape", block_shape, "block_size", block_size)
   result = batch_to_space_nd(
       input,
       crops=crops,
@@ -3189,7 +3202,7 @@ def where(condition, x=None, y=None, name=None):
 
   Returns:
     A `Tensor` with the same type and shape as `x`, `y` if they are non-None.
-    A `Tensor` with shape `(num_true, dim_size(condition))`.
+    Otherwise, a `Tensor` with shape `(num_true, rank(condition))`.
 
   Raises:
     ValueError: When exactly one of `x` or `y` is non-None.
@@ -3256,8 +3269,84 @@ reverse_sequence_v2.__doc__ = deprecation.rewrite_argument_docstring(
 
 
 @tf_export(v1=["gather"])
-def gather(params, indices, validate_indices=None, name=None, axis=0):
+@dispatch.add_dispatch_support
+def gather(params,
+           indices,
+           validate_indices=None,
+           name=None,
+           axis=None,
+           batch_dims=0):
+  r"""Gather slices from params axis axis according to indices.
+
+  Gather slices from params axis `axis` according to `indices`.  `indices` must
+  be an integer tensor of any dimension (usually 0-D or 1-D).
+
+  For 0-D (scalar) `indices`:
+
+  > `output`$$[p_0,          ..., p_{axis-1},        \hspace{5.1em}
+  >            p_{axis + 1}, ..., p_{N-1}]$$ =\
+  > `params`$$[p_0,          ..., p_{axis-1},        \hspace{1em}
+  >            indices,                              \hspace{1em}
+  >            p_{axis + 1}, ..., p_{N-1}]$$.
+
+  For 1-D (vector) `indices` with `batch_dims=0`:
+
+  > `output`$$[p_0,          ..., p_{axis-1},        \hspace{2.6em}
+  >            i,                                    \hspace{2.6em}
+  >            p_{axis + 1}, ..., p_{N-1}]$$ =\
+  > `params`$$[p_0,          ..., p_{axis-1},        \hspace{1em}
+  >            indices[i],                           \hspace{1em}
+  >            p_{axis + 1}, ..., p_{N-1}]$$.
+
+  In the general case, produces an output tensor where:
+
+  > `output`$$[p_0,             ..., p_{axis-1},     \hspace{1.2em}
+  >            i_{batch\_dims}, ..., i_{M-1},        \hspace{1.3em}
+  >            p_{axis + 1},    ..., p_{N-1}]$$ =\
+  > `params`$$[p_0,             ..., p_{axis-1},     \hspace{1em}
+  >            indices[i_0,     ..., i_{M-1}],       \hspace{1em}
+  >            p_{axis + 1},    ..., p_{N-1}]$$.
+
+  Where $$N$$=`ndims(params)` and $$M$$=`ndims(indices)`.
+  The shape of the output tensor is:
+
+  > `output.shape = params.shape[:axis] + indices.shape[batch_dims:] +
+  > params.shape[axis + 1:]`.
+
+  Note that on CPU, if an out of bound index is found, an error is returned.
+  On GPU, if an out of bound index is found, a 0 is stored in the corresponding
+  output value.
+
+  See also `tf.gather_nd`.
+
+  <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:100%" src="https://www.tensorflow.org/images/Gather.png"
+  alt>
+  </div>
+
+  Args:
+    params: The `Tensor` from which to gather values. Must be at least rank
+      `axis + 1`.
+    indices: The index `Tensor`.  Must be one of the following types: `int32`,
+      `int64`. Must be in range `[0, params.shape[axis])`.
+    validate_indices: Deprecated, does nothing.
+    name: A name for the operation (optional).
+    axis: A `Tensor`. Must be one of the following types: `int32`, `int64`. The
+      `axis` in `params` to gather `indices` from. Must be greater than or equal
+      to `batch_dims`.  Defaults to the first non-batch dimension. Supports
+      negative indexes.
+    batch_dims: An `integer`.  The number of batch dimensions.  Must be less
+      than `rank(indices)`.
+
+  Returns:
+    A `Tensor`. Has the same type as `params`.
+  """
   del validate_indices
+  if batch_dims != 0:
+    with ops.name_scope(name, "Gather", [params, indices, axis]):
+      return _batch_gather(params, indices, batch_dims, axis)
+  if axis is None:
+    axis = batch_dims
   if axis != 0:
     # Note that we do a sparse_read here to avoid snapshotting the entire
     # resource variable and doing a gather, which can be inefficient and lead to
@@ -3273,41 +3362,50 @@ def gather(params, indices, validate_indices=None, name=None, axis=0):
 
 @tf_export("gather", v1=[])
 @dispatch.add_dispatch_support
-def gather_v2(params, indices, validate_indices=None, axis=0, name=None):
+def gather_v2(params, indices, validate_indices=None, axis=None,
+              batch_dims=0, name=None):
   return gather(params, indices, validate_indices=validate_indices, name=name,
-                axis=axis)
+                axis=axis, batch_dims=batch_dims)
 
 
 gather.__doc__ = gather_v2.__doc__ = gen_array_ops.gather_v2.__doc__
 
 
-
-@tf_export("batch_gather")
+@tf_export(v1=["batch_gather"])
 @dispatch.add_dispatch_support
+@deprecation.deprecated(
+    "2017-10-25", "`tf.batch_gather` is deprecated, please use `tf.gather` "
+    "with `batch_dims` instead.")  # pylint: disable=missing-docstring
 def batch_gather(params, indices, name=None):
-  """Gather slices from `params` according to `indices` with leading batch dims.
-
-  This operation assumes that the leading dimensions of `indices` are dense,
-  and the gathers on the axis corresponding to the last dimension of `indices`.
-  More concretely it computes:
-
-  result[i1, ..., in] = params[i1, ..., in-1, indices[i1, ..., in]]
+  """Gather slices from params according to indices with leading batch dims."""
+  with ops.name_scope(name, "BatchGather", [params, indices]):
+    indices = ops.convert_to_tensor(indices, name="indices")
+    params = ops.convert_to_tensor(params, name="params")
+    if indices.shape.ndims is None:
+      raise ValueError(
+          "batch_gather does not allow indices with unknown shape.")
+    return _batch_gather(params, indices, batch_dims=indices.shape.ndims - 1)
 
-  Therefore `params` should be a Tensor of shape [A1, ..., AN, B1, ..., BM],
-  `indices` should be a Tensor of shape [A1, ..., AN-1, C] and `result` will be
-  a Tensor of size `[A1, ..., AN-1, C, B1, ..., BM]`.
 
-  In the case in which indices is a 1D tensor, this operation is equivalent to
-  `tf.gather`.
+def _batch_gather(params, indices, batch_dims, axis=None):
+  r"""Gather slices from params according to indices with leading batch dims.
 
-  See also `tf.gather` and `tf.gather_nd`.
+  This operation assumes that the leading `batch_dims` dimensions of `indices`
+  and `params` are batch dimensions; and performs a `tf.gather` operation within
+  each batch. (If `batch_dims` is not specified, then it defaults to
+  `rank(indices)-1`.)  In the case in which `batch_dims==0`, this operation
+  is equivalent to `tf.gather`.
 
   Args:
     params: A Tensor. The tensor from which to gather values.
     indices: A Tensor. Must be one of the following types: int32, int64. Index
-        tensor. Must be in range `[0, params.shape[axis]`, where `axis` is the
-        last dimension of `indices` itself.
-    name: A name for the operation (optional).
+      tensor. Must be in range `[0, params.shape[batch_dims]]`.
+    batch_dims: An integer or none.  The number of batch dimensions.  Must be
+      less than `rank(indices)`.  Defaults to `rank(indices) - 1` if None.
+    axis: A `Tensor`. Must be one of the following types: `int32`, `int64`. The
+      `axis` in `params` to gather `indices` from. Must be greater than or equal
+      to `batch_dims`.  Defaults to the first non-batch dimension. Supports
+      negative indexes.
 
   Returns:
     A Tensor. Has the same type as `params`.
@@ -3315,48 +3413,100 @@ def batch_gather(params, indices, name=None):
   Raises:
     ValueError: if `indices` has an unknown shape.
   """
-
-  with ops.name_scope(name):
-    indices = ops.convert_to_tensor(indices, name="indices")
-    params = ops.convert_to_tensor(params, name="params")
-    indices_shape = shape(indices)
-    params_shape = shape(params)
-
-    ndims = indices.shape.ndims
-    if ndims is None:
-      raise ValueError("batch_gather does not allow indices with unknown "
-                       "shape.")
-    batch_indices = indices
-    indices_dtype = indices.dtype.base_dtype
-    accum_dim_value = ones((), dtype=indices_dtype)
-    # Use correct type for offset index computation
-    casted_params_shape = gen_math_ops.cast(params_shape, indices_dtype)
-    for dim in range(ndims-1, 0, -1):
-      dim_value = casted_params_shape[dim-1]
-      accum_dim_value *= casted_params_shape[dim]
-      start = zeros((), dtype=indices_dtype)
-      step = ones((), dtype=indices_dtype)
-      dim_indices = gen_math_ops._range(start, dim_value, step)
-      dim_indices *= accum_dim_value
-      dim_shape = stack([1] * (dim - 1) + [dim_value] + [1] * (ndims - dim),
-                        axis=0)
-      batch_indices += reshape(dim_indices, dim_shape)
-
-    flat_indices = reshape(batch_indices, [-1])
-    outer_shape = params_shape[ndims:]
-    flat_inner_shape = gen_math_ops.prod(
-        params_shape[:ndims], [0], False)
-
-    flat_params = reshape(
-        params, concat([[flat_inner_shape], outer_shape], axis=0))
-    flat_result = gather(flat_params, flat_indices)
-    result = reshape(flat_result, concat([indices_shape, outer_shape], axis=0))
-    final_shape = indices.get_shape()[:ndims-1].merge_with(
-        params.get_shape()[:ndims -1])
-    final_shape = final_shape.concatenate(indices.get_shape().dims[ndims-1])
-    final_shape = final_shape.concatenate(params.get_shape()[ndims:])
-    result.set_shape(final_shape)
-    return result
+  if batch_dims is not None and not isinstance(batch_dims, int):
+    raise TypeError("batch_dims must be an int; got %r" % batch_dims)
+  indices = ops.convert_to_tensor(indices, name="indices")
+  params = ops.convert_to_tensor(params, name="params")
+
+  indices_ndims = indices.shape.ndims
+  if indices_ndims is None:
+    raise ValueError("tf.gather does not allow indices with unknown "
+                     "rank when batch_dims is specified.")
+  if batch_dims is None:
+    batch_dims = indices_ndims - 1
+  if batch_dims < 0:
+    batch_dims += indices_ndims
+  if batch_dims < 0 or batch_dims >= indices_ndims:
+    raise ValueError("batch_dims = %d must be less than rank(indices) = %d" %
+                     (batch_dims, indices_ndims))
+  if params.shape.ndims is not None and batch_dims >= params.shape.ndims:
+    raise ValueError("batch_dims = %d must be less than rank(params) = %d" %
+                     (batch_dims, params.shape.ndims))
+
+  # Handle axis by transposing the axis dimension to be the first non-batch
+  # dimension, recursively calling batch_gather with axis=0, and then
+  # transposing the result to put the pre-axis dimensions before the indices
+  # dimensions.
+  if axis is not None and axis != batch_dims:
+    # Adjust axis to be positive.
+    if not isinstance(axis, int):
+      axis = tf.where(axis < 0, axis + array_ops.rank(params), axis)
+    elif axis < 0 and params.shape.ndims is None:
+      axis = axis + array_ops.rank(params)
+    else:
+      if (axis < -params.shape.ndims) or (axis >= params.shape.ndims):
+        raise ValueError("axis (%d) out of range [%d, %d)" %
+                         (axis, -params.shape.ndims, params.shape.ndims))
+      if axis < 0:
+        axis += params.shape.ndims
+      if axis < batch_dims:
+        raise ValueError("batch_dims = %d must be less than or equal to "
+                         "axis = %d" % (batch_dims, axis))
+
+    # Move params[axis] up to params[batch_dims].
+    perm = [
+        list(range(batch_dims)), [axis],
+        gen_math_ops._range(batch_dims, axis, 1),
+        gen_math_ops._range(axis + 1, rank(params), 1)
+    ]
+    params = transpose(params, concat(perm, axis=0))
+
+    result = _batch_gather(params, indices, batch_dims=batch_dims)
+
+    # Move the result dimensions corresponding to params[batch_dims:axis]
+    # to just before the dimensions corresponding to indices[batch_dims:].
+    params_start = indices_ndims + axis - batch_dims
+    perm = [
+        list(range(batch_dims)),
+        gen_math_ops._range(indices_ndims, params_start, 1),
+        list(range(batch_dims, indices_ndims)),
+        gen_math_ops._range(params_start, rank(result), 1)
+    ]
+    return transpose(result, perm=concat(perm, axis=0))
+
+  indices_shape = shape(indices)
+  params_shape = shape(params)
+  batch_indices = indices
+  indices_dtype = indices.dtype.base_dtype
+  accum_dim_value = ones((), dtype=indices_dtype)
+  # Use correct type for offset index computation
+  casted_params_shape = gen_math_ops.cast(params_shape, indices_dtype)
+  for dim in range(batch_dims, 0, -1):
+    dim_value = casted_params_shape[dim - 1]
+    accum_dim_value *= casted_params_shape[dim]
+    start = zeros((), dtype=indices_dtype)
+    step = ones((), dtype=indices_dtype)
+    dim_indices = gen_math_ops._range(start, dim_value, step)
+    dim_indices *= accum_dim_value
+    dim_shape = stack(
+        [1] * (dim - 1) + [dim_value] + [1] * (indices_ndims - dim), axis=0)
+    batch_indices += reshape(dim_indices, dim_shape)
+
+  flat_indices = reshape(batch_indices, [-1])
+  outer_shape = params_shape[batch_dims + 1:]
+  flat_inner_shape = gen_math_ops.prod(params_shape[:batch_dims + 1], [0],
+                                       False)
+
+  flat_params = reshape(params, concat([[flat_inner_shape], outer_shape],
+                                       axis=0))
+  flat_result = gather(flat_params, flat_indices)
+  result = reshape(flat_result, concat([indices_shape, outer_shape], axis=0))
+  final_shape = indices.get_shape()[:batch_dims].merge_with(
+      params.get_shape()[:batch_dims])
+  final_shape = final_shape.concatenate(indices.get_shape().dims[batch_dims:])
+  final_shape = final_shape.concatenate(params.get_shape()[batch_dims + 1:])
+  result.set_shape(final_shape)
+  return result
 
 
 # Define quantize_v2 here in order to make name the second-to-last attribute,
@@ -3509,7 +3659,22 @@ def extract_image_patches_v2(
   return gen_array_ops.extract_image_patches(
       images, sizes, strides, rates, padding, name)
 
-extract_image_patches_deprecation = deprecation.deprecated_args(
+
+@tf_export(v1=["image.extract_image_patches", "extract_image_patches"])
+@deprecation.deprecated_args(
     None, "ksizes is deprecated, use sizes instead", "ksizes")
-tf_export(v1=["image.extract_image_patches", "extract_image_patches"])(
-    extract_image_patches_deprecation(gen_array_ops.extract_image_patches))
+def extract_image_patches(  # pylint: disable=missing-docstring
+    images,
+    ksizes=None,
+    strides=None,
+    rates=None,
+    padding=None,
+    name=None,
+    sizes=None):
+  ksizes = deprecation.deprecated_argument_lookup(
+      "sizes", sizes, "ksizes", ksizes)
+  return gen_array_ops.extract_image_patches(
+      images, ksizes, strides, rates, padding, name)
+
+
+extract_image_patches.__doc__ = gen_array_ops.extract_image_patches.__doc__
diff --git a/tensorflow/python/ops/boosted_trees_ops.py b/tensorflow/python/ops/boosted_trees_ops.py
index 37d649acf00c6905ae7330169321e5a5f8f487be..362c17ec6f40f7ea93802ef651365e7b4d061822 100644
--- a/tensorflow/python/ops/boosted_trees_ops.py
+++ b/tensorflow/python/ops/boosted_trees_ops.py
@@ -43,7 +43,7 @@ from tensorflow.python.ops.gen_boosted_trees_ops import is_boosted_trees_quantil
 # pylint: enable=unused-import
 
 from tensorflow.python.training import saver
-from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.training.tracking import tracking
 
 
 class PruningMode(object):
@@ -61,7 +61,36 @@ class PruningMode(object):
           sorted(cls._map))))
 
 
-class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject):
+class QuantileAccumulatorSaveable(saver.BaseSaverBuilder.SaveableObject):
+  """SaveableObject implementation for QuantileAccumulator."""
+
+  def __init__(self, resource_handle, create_op, num_streams, name):
+    self._resource_handle = resource_handle
+    self._num_streams = num_streams
+    self._create_op = create_op
+    bucket_boundaries = get_bucket_boundaries(self._resource_handle,
+                                              self._num_streams)
+    slice_spec = ''
+    specs = []
+
+    def make_save_spec(tensor, suffix):
+      return saver.BaseSaverBuilder.SaveSpec(tensor, slice_spec, name + suffix)
+
+    for i in range(self._num_streams):
+      specs += [
+          make_save_spec(bucket_boundaries[i], '_bucket_boundaries_' + str(i))
+      ]
+    super(QuantileAccumulatorSaveable, self).__init__(self._resource_handle,
+                                                      specs, name)
+
+  def restore(self, restored_tensors, unused_tensor_shapes):
+    bucket_boundaries = restored_tensors
+    with ops.control_dependencies([self._create_op]):
+      return quantile_resource_deserialize(
+          self._resource_handle, bucket_boundaries=bucket_boundaries)
+
+
+class QuantileAccumulator(tracking.TrackableResource):
   """SaveableObject implementation for QuantileAccumulator.
 
      The bucket boundaries are serialized and deserialized from checkpointing.
@@ -73,55 +102,58 @@ class QuantileAccumulator(saver.BaseSaverBuilder.SaveableObject):
                num_quantiles,
                name=None,
                max_elements=None):
+    self._eps = epsilon
+    self._num_streams = num_streams
+    self._num_quantiles = num_quantiles
+    super(QuantileAccumulator, self).__init__()
+
     with ops.name_scope(name, 'QuantileAccumulator') as name:
-      self._eps = epsilon
-      self._num_streams = num_streams
-      self._num_quantiles = num_quantiles
-      self._resource_handle = quantile_resource_handle_op(
-          container='', shared_name=name, name=name)
-      self._create_op = create_quantile_stream_resource(self._resource_handle,
-                                                        epsilon, num_streams)
-      is_initialized_op = is_quantile_resource_initialized(
-          self._resource_handle)
-      resources.register_resource(self._resource_handle, self._create_op,
-                                  is_initialized_op)
-      self._make_saveable(name)
-
-  def _make_saveable(self, name):
-    bucket_boundaries = get_bucket_boundaries(self._resource_handle,
-                                              self._num_streams)
-    slice_spec = ''
-    specs = []
-    for i in range(self._num_streams):
-      specs.append(
-          saver.BaseSaverBuilder.SaveSpec(
-              bucket_boundaries[i], slice_spec,
-              name + '_bucket_boundaries_' + str(i)))
-    super(QuantileAccumulator, self).__init__(self._resource_handle, specs,
-                                              name)
-    ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, self)
+      self._name = name
+      self._resource_handle = self.create_resource()
+      self._init_op = self.initialize()
+      is_initialized_op = self.is_initialized()
+    resources.register_resource(self.resource_handle, self._init_op,
+                                is_initialized_op)
+    self._saveable = QuantileAccumulatorSaveable(
+        self.resource_handle, self._init_op, self._num_streams,
+        self.resource_handle.name)
+    ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, self._saveable)
 
-  def restore(self, restored_tensors, unused_tensor_shapes):
-    bucket_boundaries = restored_tensors
-    with ops.control_dependencies([self._create_op]):
-      return quantile_resource_deserialize(
-          self._resource_handle, bucket_boundaries=bucket_boundaries)
+  def create_resource(self):
+    return quantile_resource_handle_op(
+        container='', shared_name=self._name, name=self._name)
+
+  def initialize(self):
+    return create_quantile_stream_resource(self.resource_handle, self._eps,
+                                           self._num_streams)
+
+  @property
+  def initializer(self):
+    if self._init_op is None:
+      self._init_op = self.initialize()
+    return self._init_op
+
+  def is_initialized(self):
+    return is_quantile_resource_initialized(self.resource_handle)
+
+  @property
+  def saveable(self):
+    return self._saveable
+
+  def _gather_saveables_for_checkpoint(self):
+    return {'quantile_accumulator', self._saveable}
 
   def add_summaries(self, float_columns, example_weights):
     summaries = make_quantile_summaries(float_columns, example_weights,
                                         self._eps)
-    summary_op = quantile_add_summaries(self._resource_handle, summaries)
+    summary_op = quantile_add_summaries(self.resource_handle, summaries)
     return summary_op
 
   def flush(self):
-    return quantile_flush(self._resource_handle, self._num_quantiles)
+    return quantile_flush(self.resource_handle, self._num_quantiles)
 
   def get_bucket_boundaries(self):
-    return get_bucket_boundaries(self._resource_handle, self._num_streams)
-
-  @property
-  def resource(self):
-    return self._resource_handle
+    return get_bucket_boundaries(self.resource_handle, self._num_streams)
 
 
 class _TreeEnsembleSavable(saver.BaseSaverBuilder.SaveableObject):
diff --git a/tensorflow/python/ops/check_ops.py b/tensorflow/python/ops/check_ops.py
index f1f36269cf2bd9bcd3d25638a82d776850bc6bb8..b452b4a0f341738aac1da7c7b78ba99a5a469e70 100644
--- a/tensorflow/python/ops/check_ops.py
+++ b/tensorflow/python/ops/check_ops.py
@@ -1526,6 +1526,25 @@ def _get_diff_for_monotonic_comparison(x):
     v1=['debugging.is_numeric_tensor', 'is_numeric_tensor'])
 @deprecation.deprecated_endpoints('is_numeric_tensor')
 def is_numeric_tensor(tensor):
+  """Returns `True` if the elements of `tensor` are numbers.
+
+  Specifically, returns `True` if the dtype of `tensor` is one of the following:
+
+  * `tf.float32`
+  * `tf.float64`
+  * `tf.int8`
+  * `tf.int16`
+  * `tf.int32`
+  * `tf.int64`
+  * `tf.uint8`
+  * `tf.qint8`
+  * `tf.qint32`
+  * `tf.quint8`
+  * `tf.complex64`
+
+  Returns `False` if `tensor` is of a non-numeric type or if `tensor` is not
+  a `tf.Tensor` object.
+  """
   return isinstance(tensor, ops.Tensor) and tensor.dtype in NUMERIC_TYPES
 
 
@@ -1702,7 +1721,7 @@ def assert_scalar_v2(tensor, message=None, name=None):
 @tf_export(v1=['debugging.assert_scalar', 'assert_scalar'])
 @deprecation.deprecated_endpoints('assert_scalar')
 def assert_scalar(tensor, name=None, message=None):
-  """Asserts that the given `tensor` is a scalar.
+  """Asserts that the given `tensor` is a scalar (i.e. zero-dimensional).
 
   This function raises `ValueError` unless it can be certain that the given
   `tensor` is a scalar. `ValueError` is also raised if the shape of `tensor` is
diff --git a/tensorflow/python/ops/clip_ops.py b/tensorflow/python/ops/clip_ops.py
index a237cfff826bf0fb4cacd0c25fe5d361e3d7b26e..767dcb9358e99f866d628b2aac3bd85ec78ef65a 100644
--- a/tensorflow/python/ops/clip_ops.py
+++ b/tensorflow/python/ops/clip_ops.py
@@ -126,8 +126,8 @@ def clip_by_norm(t, clip_norm, axes=None, name=None):
   In this case, the L2-norm of the output tensor is `clip_norm`.
 
   As another example, if `t` is a matrix and `axes == [1]`, then each row
-  of the output will have L2-norm equal to `clip_norm`. If `axes == [0]`
-  instead, each column of the output will be clipped.
+  of the output will have L2-norm less than or equal to `clip_norm`. If
+  `axes == [0]` instead, each column of the output will be clipped.
 
   This operation is typically used to clip gradients before applying them with
   an optimizer.
diff --git a/tensorflow/python/ops/clustering_ops.py b/tensorflow/python/ops/clustering_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d48b89cbacce34781819010addbcbd0ba66f9873
--- /dev/null
+++ b/tensorflow/python/ops/clustering_ops.py
@@ -0,0 +1,770 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Clustering Operations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_clustering_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_impl
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops.embedding_ops import embedding_lookup
+# go/tf-wildcard-import
+# pylint: disable=wildcard-import
+from tensorflow.python.ops.gen_clustering_ops import *
+# pylint: enable=wildcard-import
+
+# Euclidean distance between vectors U and V is defined as \\(||U - V||_F\\)
+# which is the square root of the sum of the absolute squares of the elements
+# difference.
+SQUARED_EUCLIDEAN_DISTANCE = 'squared_euclidean'
+# Cosine distance between vectors U and V is defined as
+# \\(1 - (U \dot V) / (||U||_F ||V||_F)\\)
+COSINE_DISTANCE = 'cosine'
+
+RANDOM_INIT = 'random'
+KMEANS_PLUS_PLUS_INIT = 'kmeans_plus_plus'
+KMC2_INIT = 'kmc2'
+
+# The name of the variable holding the cluster centers. Used by the Estimator.
+CLUSTERS_VAR_NAME = 'clusters'
+
+
+class KMeans(object):
+  """Creates the graph for k-means clustering."""
+
+  def __init__(self,
+               inputs,
+               num_clusters,
+               initial_clusters=RANDOM_INIT,
+               distance_metric=SQUARED_EUCLIDEAN_DISTANCE,
+               use_mini_batch=False,
+               mini_batch_steps_per_iteration=1,
+               random_seed=0,
+               kmeans_plus_plus_num_retries=2,
+               kmc2_chain_length=200):
+    """Creates an object for generating KMeans clustering graph.
+
+    This class implements the following variants of K-means algorithm:
+
+    If use_mini_batch is False, it runs standard full batch K-means. Each step
+    runs a single iteration of K-Means. This step can be run sharded across
+    multiple workers by passing a list of sharded inputs to this class. Note
+    however that a single step needs to process the full input at once.
+
+    If use_mini_batch is True, it runs a generalization of the mini-batch
+    K-means algorithm. It runs multiple iterations, where each iteration is
+    composed of mini_batch_steps_per_iteration steps. Two copies of cluster
+    centers are maintained: one that is updated at the end of each iteration,
+    and one that is updated every step. The first copy is used to compute
+    cluster allocations for each step, and for inference, while the second copy
+    is the one updated each step using the mini-batch update rule. After each
+    iteration is complete, this second copy is copied back the first copy.
+
+    Note that for use_mini_batch=True, when mini_batch_steps_per_iteration=1,
+    the algorithm reduces to the standard mini-batch algorithm. Also by setting
+    mini_batch_steps_per_iteration = num_inputs / batch_size, the algorithm
+    becomes an asynchronous version of the full-batch algorithm. Note however
+    that there is no guarantee by this implementation that each input is seen
+    exactly once per iteration. Also, different updates are applied
+    asynchronously without locking. So this asynchronous version may not behave
+    exactly like a full-batch version.
+
+    Args:
+      inputs: An input tensor or list of input tensors. It is assumed that the
+        data points have been previously randomly permuted.
+      num_clusters: An integer tensor specifying the number of clusters. This
+        argument is ignored if initial_clusters is a tensor or numpy array.
+      initial_clusters: Specifies the clusters used during initialization. One
+        of the following:
+        - a tensor or numpy array with the initial cluster centers.
+        - a function f(inputs, k) that returns up to k centers from `inputs`.
+        - "random": Choose centers randomly from `inputs`.
+        - "kmeans_plus_plus": Use kmeans++ to choose centers from `inputs`.
+        - "kmc2": Use the fast k-MC2 algorithm to choose centers from `inputs`.
+        In the last three cases, one batch of `inputs` may not yield
+        `num_clusters` centers, in which case initialization will require
+        multiple batches until enough centers are chosen. In the case of
+        "random" or "kmeans_plus_plus", if the input size is <= `num_clusters`
+        then the entire batch is chosen to be cluster centers.
+      distance_metric: Distance metric used for clustering. Supported options:
+        "squared_euclidean", "cosine".
+      use_mini_batch: If true, use the mini-batch k-means algorithm. Else assume
+        full batch.
+      mini_batch_steps_per_iteration: Number of steps after which the updated
+        cluster centers are synced back to a master copy.
+      random_seed: Seed for PRNG used to initialize seeds.
+      kmeans_plus_plus_num_retries: For each point that is sampled during
+        kmeans++ initialization, this parameter specifies the number of
+        additional points to draw from the current distribution before selecting
+        the best. If a negative value is specified, a heuristic is used to
+        sample O(log(num_to_sample)) additional points.
+      kmc2_chain_length: Determines how many candidate points are used by the
+        k-MC2 algorithm to produce one new cluster centers. If a (mini-)batch
+        contains less points, one new cluster center is generated from the
+        (mini-)batch.
+
+    Raises:
+      ValueError: An invalid argument was passed to initial_clusters or
+        distance_metric.
+    """
+    if isinstance(initial_clusters, str) and initial_clusters not in [
+        RANDOM_INIT, KMEANS_PLUS_PLUS_INIT, KMC2_INIT
+    ]:
+      raise ValueError(
+          "Unsupported initialization algorithm '%s'" % initial_clusters)
+    if distance_metric not in [SQUARED_EUCLIDEAN_DISTANCE, COSINE_DISTANCE]:
+      raise ValueError("Unsupported distance metric '%s'" % distance_metric)
+    self._inputs = inputs if isinstance(inputs, list) else [inputs]
+    self._num_clusters = num_clusters
+    self._initial_clusters = initial_clusters
+    self._distance_metric = distance_metric
+    self._use_mini_batch = use_mini_batch
+    self._mini_batch_steps_per_iteration = int(mini_batch_steps_per_iteration)
+    self._random_seed = random_seed
+    self._kmeans_plus_plus_num_retries = kmeans_plus_plus_num_retries
+    self._kmc2_chain_length = kmc2_chain_length
+
+  @classmethod
+  def _distance_graph(cls, inputs, clusters, distance_metric):
+    """Computes distance between each input and each cluster center.
+
+    Args:
+      inputs: list of input Tensors.
+      clusters: cluster Tensor.
+      distance_metric: distance metric used for clustering
+
+    Returns:
+      list of Tensors, where each element corresponds to each element in inputs.
+      The value is the distance of each row to all the cluster centers.
+      Currently only Euclidean distance and cosine distance are supported.
+    """
+    assert isinstance(inputs, list)
+    if distance_metric == SQUARED_EUCLIDEAN_DISTANCE:
+      return cls._compute_euclidean_distance(inputs, clusters)
+    elif distance_metric == COSINE_DISTANCE:
+      return cls._compute_cosine_distance(
+          inputs, clusters, inputs_normalized=True)
+    else:
+      assert False, str(distance_metric)
+
+  @classmethod
+  def _compute_euclidean_distance(cls, inputs, clusters):
+    """Computes Euclidean distance between each input and each cluster center.
+
+    Args:
+      inputs: list of input Tensors.
+      clusters: cluster Tensor.
+
+    Returns:
+      list of Tensors, where each element corresponds to each element in inputs.
+      The value is the distance of each row to all the cluster centers.
+    """
+    output = []
+    for inp in inputs:
+      with ops.colocate_with(inp, ignore_existing=True):
+        # Computes Euclidean distance. Note the first and third terms are
+        # broadcast additions.
+        squared_distance = (
+            math_ops.reduce_sum(math_ops.square(inp), 1, keepdims=True) -
+            2 * math_ops.matmul(inp, clusters, transpose_b=True) +
+            array_ops.transpose(
+                math_ops.reduce_sum(
+                    math_ops.square(clusters), 1, keepdims=True)))
+        output.append(squared_distance)
+
+    return output
+
+  @classmethod
+  def _compute_cosine_distance(cls, inputs, clusters, inputs_normalized=True):
+    """Computes cosine distance between each input and each cluster center.
+
+    Args:
+      inputs: list of input Tensor.
+      clusters: cluster Tensor
+      inputs_normalized: if True, it assumes that inp and clusters are
+      normalized and computes the dot product which is equivalent to the cosine
+      distance. Else it L2 normalizes the inputs first.
+
+    Returns:
+      list of Tensors, where each element corresponds to each element in inp.
+      The value is the distance of each row to all the cluster centers.
+    """
+    output = []
+    if not inputs_normalized:
+      with ops.colocate_with(clusters, ignore_existing=True):
+        clusters = nn_impl.l2_normalize(clusters, dim=1)
+    for inp in inputs:
+      with ops.colocate_with(inp, ignore_existing=True):
+        if not inputs_normalized:
+          inp = nn_impl.l2_normalize(inp, dim=1)
+        output.append(1 - math_ops.matmul(inp, clusters, transpose_b=True))
+    return output
+
+  def _infer_graph(self, inputs, clusters):
+    """Maps input to closest cluster and the score.
+
+    Args:
+      inputs: list of input Tensors.
+      clusters: Tensor of cluster centers.
+
+    Returns:
+      List of tuple, where each value in tuple corresponds to a value in inp.
+      The tuple has following three elements:
+      all_scores: distance of each input to each cluster center.
+      score: distance of each input to closest cluster center.
+      cluster_idx: index of cluster center closest to the corresponding input.
+    """
+    assert isinstance(inputs, list)
+    # Pairwise distances are used only by transform(). In all other cases, this
+    # sub-graph is not evaluated.
+    scores = self._distance_graph(inputs, clusters, self._distance_metric)
+    output = []
+    if (self._distance_metric == COSINE_DISTANCE and
+        not self._clusters_l2_normalized()):
+      # The cosine distance between normalized vectors x and y is the same as
+      # 2 * squared_euclidean_distance. We are using this fact and reusing the
+      # nearest_neighbors op.
+      # TODO(ands): Support COSINE distance in nearest_neighbors and remove
+      # this.
+      with ops.colocate_with(clusters, ignore_existing=True):
+        clusters = nn_impl.l2_normalize(clusters, dim=1)
+    for inp, score in zip(inputs, scores):
+      with ops.colocate_with(inp, ignore_existing=True):
+        (indices, distances) = gen_clustering_ops.nearest_neighbors(
+            inp, clusters, 1)
+        if self._distance_metric == COSINE_DISTANCE:
+          distances *= 0.5
+        output.append((score, array_ops.squeeze(distances, [-1]),
+                       array_ops.squeeze(indices, [-1])))
+    return zip(*output)
+
+  def _clusters_l2_normalized(self):
+    """Returns True if clusters centers are kept normalized."""
+    return (self._distance_metric == COSINE_DISTANCE and
+            (not self._use_mini_batch or
+             self._mini_batch_steps_per_iteration > 1))
+
+  def _create_variables(self, num_clusters):
+    """Creates variables.
+
+    Args:
+      num_clusters: an integer Tensor providing the number of clusters.
+
+    Returns:
+      Tuple with following elements:
+      - cluster_centers: a Tensor for storing cluster centers
+      - cluster_centers_initialized: bool Variable indicating whether clusters
+            are initialized.
+      - cluster_counts: a Tensor for storing counts of points assigned to this
+            cluster. This is used by mini-batch training.
+      - cluster_centers_updated: Tensor representing copy of cluster centers
+            that are updated every step.
+      - update_in_steps: numbers of steps left before we sync
+            cluster_centers_updated back to cluster_centers.
+    """
+    init_value = array_ops.constant([], dtype=dtypes.float32)
+    cluster_centers = variable_scope.variable(
+        init_value, name=CLUSTERS_VAR_NAME, validate_shape=False)
+    cluster_centers_initialized = variable_scope.variable(
+        False, dtype=dtypes.bool, name='initialized')
+
+    if self._use_mini_batch and self._mini_batch_steps_per_iteration > 1:
+      # Copy of cluster centers actively updated each step according to
+      # mini-batch update rule.
+      cluster_centers_updated = variable_scope.variable(
+          init_value, name='clusters_updated', validate_shape=False)
+      # How many steps till we copy the updated clusters to cluster_centers.
+      update_in_steps = variable_scope.variable(
+          self._mini_batch_steps_per_iteration,
+          dtype=dtypes.int64,
+          name='update_in_steps')
+      # Count of points assigned to cluster_centers_updated.
+      cluster_counts = variable_scope.variable(
+          array_ops.zeros([num_clusters], dtype=dtypes.int64))
+    else:
+      cluster_centers_updated = cluster_centers
+      update_in_steps = None
+      cluster_counts = (
+          variable_scope.variable(
+              array_ops.ones([num_clusters], dtype=dtypes.int64))
+          if self._use_mini_batch else None)
+    return (cluster_centers, cluster_centers_initialized, cluster_counts,
+            cluster_centers_updated, update_in_steps)
+
+  @classmethod
+  def _l2_normalize_data(cls, inputs):
+    """Normalized the input data."""
+    output = []
+    for inp in inputs:
+      with ops.colocate_with(inp, ignore_existing=True):
+        output.append(nn_impl.l2_normalize(inp, dim=1))
+    return output
+
+  def training_graph(self):
+    """Generate a training graph for kmeans algorithm.
+
+    This returns, among other things, an op that chooses initial centers
+    (init_op), a boolean variable that is set to True when the initial centers
+    are chosen (cluster_centers_initialized), and an op to perform either an
+    entire Lloyd iteration or a mini-batch of a Lloyd iteration (training_op).
+    The caller should use these components as follows. A single worker should
+    execute init_op multiple times until cluster_centers_initialized becomes
+    True. Then multiple workers may execute training_op any number of times.
+
+    Returns:
+      A tuple consisting of:
+      all_scores: A matrix (or list of matrices) of dimensions (num_input,
+        num_clusters) where the value is the distance of an input vector and a
+        cluster center.
+      cluster_idx: A vector (or list of vectors). Each element in the vector
+        corresponds to an input row in 'inp' and specifies the cluster id
+        corresponding to the input.
+      scores: Similar to cluster_idx but specifies the distance to the
+        assigned cluster instead.
+      cluster_centers_initialized: scalar indicating whether clusters have been
+        initialized.
+      init_op: an op to initialize the clusters.
+      training_op: an op that runs an iteration of training.
+    """
+    # Implementation of kmeans.
+    if (isinstance(self._initial_clusters, str) or
+        callable(self._initial_clusters)):
+      initial_clusters = self._initial_clusters
+      num_clusters = ops.convert_to_tensor(self._num_clusters)
+    else:
+      initial_clusters = ops.convert_to_tensor(self._initial_clusters)
+      num_clusters = array_ops.shape(initial_clusters)[0]
+
+    inputs = self._inputs
+    (cluster_centers_var, cluster_centers_initialized, total_counts,
+     cluster_centers_updated,
+     update_in_steps) = self._create_variables(num_clusters)
+    init_op = _InitializeClustersOpFactory(
+        self._inputs, num_clusters, initial_clusters, self._distance_metric,
+        self._random_seed, self._kmeans_plus_plus_num_retries,
+        self._kmc2_chain_length, cluster_centers_var, cluster_centers_updated,
+        cluster_centers_initialized).op()
+    cluster_centers = cluster_centers_var
+
+    if self._distance_metric == COSINE_DISTANCE:
+      inputs = self._l2_normalize_data(inputs)
+      if not self._clusters_l2_normalized():
+        cluster_centers = nn_impl.l2_normalize(cluster_centers, dim=1)
+
+    all_scores, scores, cluster_idx = self._infer_graph(inputs, cluster_centers)
+    if self._use_mini_batch:
+      sync_updates_op = self._mini_batch_sync_updates_op(
+          update_in_steps, cluster_centers_var, cluster_centers_updated,
+          total_counts)
+      assert sync_updates_op is not None
+      with ops.control_dependencies([sync_updates_op]):
+        training_op = self._mini_batch_training_op(
+            inputs, cluster_idx, cluster_centers_updated, total_counts)
+    else:
+      assert cluster_centers == cluster_centers_var
+      training_op = self._full_batch_training_op(
+          inputs, num_clusters, cluster_idx, cluster_centers_var)
+
+    return (all_scores, cluster_idx, scores, cluster_centers_initialized,
+            init_op, training_op)
+
+  def _mini_batch_sync_updates_op(self, update_in_steps, cluster_centers_var,
+                                  cluster_centers_updated, total_counts):
+    if self._use_mini_batch and self._mini_batch_steps_per_iteration > 1:
+      assert update_in_steps is not None
+      with ops.colocate_with(update_in_steps, ignore_existing=True):
+
+        def _f():
+          # Note that there is a race condition here, so we do a best effort
+          # updates here. We reset update_in_steps first so that other workers
+          # don't duplicate the updates. Also we update cluster_center_vars
+          # before resetting total_counts to avoid large updates to
+          # cluster_centers_updated based on partially updated
+          # cluster_center_vars.
+          with ops.control_dependencies([
+              state_ops.assign(update_in_steps,
+                               self._mini_batch_steps_per_iteration - 1)
+          ]):
+            with ops.colocate_with(
+                cluster_centers_updated, ignore_existing=True):
+              if self._distance_metric == COSINE_DISTANCE:
+                cluster_centers = nn_impl.l2_normalize(
+                    cluster_centers_updated, dim=1)
+              else:
+                cluster_centers = cluster_centers_updated
+            with ops.colocate_with(cluster_centers_var, ignore_existing=True):
+              with ops.control_dependencies(
+                  [state_ops.assign(cluster_centers_var, cluster_centers)]):
+                with ops.colocate_with(None, ignore_existing=True):
+                  with ops.control_dependencies([
+                      state_ops.assign(total_counts,
+                                       array_ops.zeros_like(total_counts))
+                  ]):
+                    return array_ops.identity(update_in_steps)
+
+        return control_flow_ops.cond(
+            update_in_steps <= 0, _f,
+            lambda: state_ops.assign_sub(update_in_steps, 1))
+    else:
+      return control_flow_ops.no_op()
+
+  def _mini_batch_training_op(self, inputs, cluster_idx_list, cluster_centers,
+                              total_counts):
+    """Creates an op for training for mini batch case.
+
+    Args:
+      inputs: list of input Tensors.
+      cluster_idx_list: A vector (or list of vectors). Each element in the
+        vector corresponds to an input row in 'inp' and specifies the cluster id
+        corresponding to the input.
+      cluster_centers: Tensor Ref of cluster centers.
+      total_counts: Tensor Ref of cluster counts.
+
+    Returns:
+      An op for doing an update of mini-batch k-means.
+    """
+    update_ops = []
+    for inp, cluster_idx in zip(inputs, cluster_idx_list):
+      with ops.colocate_with(inp, ignore_existing=True):
+        assert total_counts is not None
+        cluster_idx = array_ops.reshape(cluster_idx, [-1])
+        # Dedupe the unique ids of cluster_centers being updated so that updates
+        # can be locally aggregated.
+        unique_ids, unique_idx = array_ops.unique(cluster_idx)
+        num_unique_cluster_idx = array_ops.size(unique_ids)
+        # Fetch the old values of counts and cluster_centers.
+        with ops.colocate_with(total_counts, ignore_existing=True):
+          old_counts = array_ops.gather(total_counts, unique_ids)
+        # TODO(agarwal): This colocation seems to run into problems. Fix it.
+        with ops.colocate_with(cluster_centers, ignore_existing=True):
+          old_cluster_centers = array_ops.gather(cluster_centers, unique_ids)
+        # Locally aggregate the increment to counts.
+        count_updates = math_ops.unsorted_segment_sum(
+            array_ops.ones_like(unique_idx, dtype=total_counts.dtype),
+            unique_idx, num_unique_cluster_idx)
+        # Locally compute the sum of inputs mapped to each id.
+        # For a cluster with old cluster value x, old count n, and with data
+        # d_1,...d_k newly assigned to it, we recompute the new value as
+        # \\(x += (sum_i(d_i) - k * x) / (n + k)\\).
+        # Compute \\(sum_i(d_i)\\), see comment above.
+        cluster_center_updates = math_ops.unsorted_segment_sum(
+            inp, unique_idx, num_unique_cluster_idx)
+        # Shape to enable broadcasting count_updates and learning_rate to inp.
+        # It extends the shape with 1's to match the rank of inp.
+        broadcast_shape = array_ops.concat([
+            array_ops.reshape(num_unique_cluster_idx, [1]),
+            array_ops.ones(
+                array_ops.reshape(array_ops.rank(inp) - 1, [1]),
+                dtype=dtypes.int32)
+        ], 0)
+        # Subtract k * x, see comment above.
+        cluster_center_updates -= math_ops.cast(
+            array_ops.reshape(count_updates, broadcast_shape),
+            inp.dtype) * old_cluster_centers
+        learning_rate = math_ops.reciprocal(
+            math_ops.cast(old_counts + count_updates, inp.dtype))
+        learning_rate = array_ops.reshape(learning_rate, broadcast_shape)
+        # scale by 1 / (n + k), see comment above.
+        cluster_center_updates *= learning_rate
+        # Apply the updates.
+      update_counts = state_ops.scatter_add(total_counts, unique_ids,
+                                            count_updates)
+      update_cluster_centers = state_ops.scatter_add(
+          cluster_centers, unique_ids, cluster_center_updates)
+      update_ops.extend([update_counts, update_cluster_centers])
+    return control_flow_ops.group(*update_ops)
+
+  def _full_batch_training_op(self, inputs, num_clusters, cluster_idx_list,
+                              cluster_centers):
+    """Creates an op for training for full batch case.
+
+    Args:
+      inputs: list of input Tensors.
+      num_clusters: an integer Tensor providing the number of clusters.
+      cluster_idx_list: A vector (or list of vectors). Each element in the
+        vector corresponds to an input row in 'inp' and specifies the cluster id
+        corresponding to the input.
+      cluster_centers: Tensor Ref of cluster centers.
+
+    Returns:
+      An op for doing an update of mini-batch k-means.
+    """
+    cluster_sums = []
+    cluster_counts = []
+    epsilon = constant_op.constant(1e-6, dtype=inputs[0].dtype)
+    for inp, cluster_idx in zip(inputs, cluster_idx_list):
+      with ops.colocate_with(inp, ignore_existing=True):
+        cluster_sums.append(
+            math_ops.unsorted_segment_sum(inp, cluster_idx, num_clusters))
+        cluster_counts.append(
+            math_ops.unsorted_segment_sum(
+                array_ops.reshape(
+                    array_ops.ones(
+                        array_ops.reshape(array_ops.shape(inp)[0], [-1])),
+                    [-1, 1]), cluster_idx, num_clusters))
+    with ops.colocate_with(cluster_centers, ignore_existing=True):
+      new_clusters_centers = math_ops.add_n(cluster_sums) / (
+          math_ops.cast(math_ops.add_n(cluster_counts), cluster_sums[0].dtype) +
+          epsilon)
+      if self._clusters_l2_normalized():
+        new_clusters_centers = nn_impl.l2_normalize(new_clusters_centers, dim=1)
+    return state_ops.assign(cluster_centers, new_clusters_centers)
+
+
+class _InitializeClustersOpFactory(object):
+  """Internal class to create the op to initialize the clusters.
+
+    The op performs this algorithm (see constructor args):
+
+    num_remaining = num_clusters - length(cluster_centers)
+    if num_remaining == 0:
+      assert that cluster_centers_initialized is true
+    else:
+      assert that num_remaining > 0
+      new_centers = choose up to num_remaining initial centers
+      l2-normalize new_centers if using cosine distance
+      all_centers = concat(cluster_centers, new_centers)
+      cluster_centers := all_centers
+      if there is a cluster_centers_updated variable:
+        cluster_centers_updated := cluster_centers
+      num_now_remaining = num_clusters - length(cluster_centers)
+      if num_now_remaining == 0:
+        cluster_centers_initialized := true
+  """
+
+  # TODO(ccolby): Refactor this class so that kmc2 isn't so much a special case.
+
+  def __init__(self, inputs, num_clusters, initial_clusters, distance_metric,
+               random_seed, kmeans_plus_plus_num_retries, kmc2_chain_length,
+               cluster_centers, cluster_centers_updated,
+               cluster_centers_initialized):
+    """Creates an op factory.
+
+    Args:
+      inputs: See KMeans constructor.
+      num_clusters: An integer Tensor providing the number of clusters.
+      initial_clusters: See KMeans constructor.
+      distance_metric: See KMeans constructor.
+      random_seed: See KMeans constructor.
+      kmeans_plus_plus_num_retries: See KMeans constructor.
+      kmc2_chain_length: See KMeans constructor.
+      cluster_centers: The TF variable holding the initial centers. It may
+          already contain some centers when the op is executed.
+      cluster_centers_updated: A second TF variable to hold a copy of the
+          initial centers, used for full-batch mode. In mini-batch mode,
+          cluster_centers_updated is the same variable as cluster_centers.
+      cluster_centers_initialized: A boolean TF variable that will be set
+          to true when all the initial centers have been chosen.
+    """
+    # All of these instance variables are constants.
+    self._inputs = inputs
+    self._num_clusters = num_clusters
+    self._initial_clusters = initial_clusters
+    self._distance_metric = distance_metric
+    self._random_seed = random_seed
+    self._kmeans_plus_plus_num_retries = kmeans_plus_plus_num_retries
+    self._kmc2_chain_length = kmc2_chain_length
+    self._cluster_centers = cluster_centers
+    self._cluster_centers_updated = cluster_centers_updated
+    self._cluster_centers_initialized = cluster_centers_initialized
+
+    self._num_selected = array_ops.shape(self._cluster_centers)[0]
+    self._num_remaining = self._num_clusters - self._num_selected
+    self._num_data = math_ops.add_n(
+        [array_ops.shape(i)[0] for i in self._inputs])
+
+  def _random(self):
+    indices = random_ops.random_uniform(
+        array_ops.reshape(self._num_remaining, [-1]),
+        minval=0,
+        maxval=math_ops.cast(self._num_data, dtypes.int64),
+        seed=self._random_seed,
+        dtype=dtypes.int64)
+    return embedding_lookup(self._inputs, indices, partition_strategy='div')
+
+  def _kmeans_plus_plus(self):
+    # Points from only the first shard are used for initializing centers.
+    # TODO(ands): Use all points.
+    inp = self._inputs[0]
+    if self._distance_metric == COSINE_DISTANCE:
+      inp = nn_impl.l2_normalize(inp, dim=1)
+    return gen_clustering_ops.kmeans_plus_plus_initialization(
+        inp,
+        math_ops.to_int64(self._num_remaining), self._random_seed,
+        self._kmeans_plus_plus_num_retries)
+
+  def _kmc2_multiple_centers(self):
+    """Adds new initial cluster centers using the k-MC2 algorithm.
+
+    In each call to the op, the provided batch is split into subsets based on
+    the specified `kmc2_chain_length`. On each subset, a single Markov chain of
+    the k-MC2 algorithm is used to add *one* new center cluster center. If there
+    are less than `kmc2_chain_length` points in the subset, a single center is
+    added using one Markov chain on the full input. It is assumed that the
+    provided batch has previously been randomly permuted. Otherwise, k-MC2 may
+    return suboptimal centers.
+
+    Returns:
+      An op that adds new cluster centers.
+    """
+    # The op only operates on the first shard of data.
+    first_shard = self._inputs[0]
+    # Number of points in the input that can be used.
+    batch_size = array_ops.shape(first_shard)[0]
+    # Maximum number of subsets such that the size of each subset is at least
+    # `kmc2_chain_length`. Final subsets may be larger.
+    max_to_sample = math_ops.cast(
+        batch_size / self._kmc2_chain_length, dtype=dtypes.int32)
+    # We sample at least one new center and at most all remaining centers.
+    num_to_sample = math_ops.maximum(
+        math_ops.minimum(self._num_remaining, max_to_sample), 1)
+
+    def _cond(i, _):
+      """Stopping condition for the while loop."""
+      return math_ops.less(i, num_to_sample)
+
+    def _body(i, _):
+      """Body that adds a single new center based on a subset."""
+
+      def _sample_random():
+        """Returns a random point as a cluster center."""
+        # By assumption the batch is reshuffled and _sample_random is always
+        # called for i=0. Hence, we simply return the first point.
+        new_center = array_ops.reshape(first_shard[0], [1, -1])
+        if self._distance_metric == COSINE_DISTANCE:
+          new_center = nn_impl.l2_normalize(new_center, dim=1)
+        return new_center
+
+      def _sample_kmc2_chain():
+        """Returns previous centers as well as a new center sampled using k-MC2.
+        """
+        # Extract the subset from the underlying batch.
+        start = i * self._kmc2_chain_length
+        end = start + self._kmc2_chain_length
+        subset = first_shard[start:end]
+        # Compute the distances from points in the subset to previous centers.
+        _, distances = gen_clustering_ops.nearest_neighbors(
+            subset, self._cluster_centers, 1)
+        # Sample index of new center using k-MC2 Markov chain.
+        new_center_index = gen_clustering_ops.kmc2_chain_initialization(
+            array_ops.squeeze(distances), self._random_seed)
+        # Extract actual new center.
+        newly_sampled_center = array_ops.reshape(subset[new_center_index],
+                                                 [1, -1])
+        # Return concatenation with previously sampled centers.
+        if self._distance_metric == COSINE_DISTANCE:
+          newly_sampled_center = nn_impl.l2_normalize(
+              newly_sampled_center, dim=1)
+        return array_ops.concat([self._cluster_centers, newly_sampled_center],
+                                0)
+
+      # Obtain a random point if there are no previously sampled centers.
+      # Otherwise, construct a k-MC2 Markov chain.
+      new_centers = control_flow_ops.cond(
+          math_ops.equal(self._num_selected, 0), _sample_random,
+          _sample_kmc2_chain)
+      # Assign new cluster centers to underlying variable.
+      assigned_centers = state_ops.assign(
+          self._cluster_centers, new_centers, validate_shape=False)
+      if self._cluster_centers_updated is not self._cluster_centers:
+        assigned_centers = state_ops.assign(
+            self._cluster_centers_updated,
+            assigned_centers,
+            validate_shape=False)
+      return i + 1, self._num_clusters - array_ops.shape(assigned_centers)[0]
+
+    # Add num_to_sample new data points.
+    _, num_remaining = control_flow_ops.while_loop(_cond, _body, [0, 0])
+    return num_remaining
+
+  def _greedy_batch_sampler(self, sampler):
+    # If the input dataset size is smaller than the number of centers
+    # remaining, choose the entire input dataset as centers. This can happen
+    # with mini-batch. Otherwise, sample the batch according to the provided
+    # sampler.
+    return control_flow_ops.cond(self._num_data <= self._num_remaining,
+                                 lambda: array_ops.concat(self._inputs, 0),
+                                 sampler)
+
+  def _single_batch_sampler(self, sampler):
+    # Enforce that there are at least as many data points as centers
+    # remaining. This gives the provided sampler the chance to select all
+    # remaining centers from a single batch.
+    with ops.control_dependencies(
+        [check_ops.assert_greater_equal(self._num_data, self._num_remaining)]):
+      return sampler()
+
+  def _choose_initial_centers(self):
+    if isinstance(self._initial_clusters, str):
+      if self._initial_clusters == RANDOM_INIT:
+        return self._greedy_batch_sampler(self._random)
+      else:  # self._initial_clusters == KMEANS_PLUS_PLUS_INIT
+        return self._single_batch_sampler(self._kmeans_plus_plus)
+    elif callable(self._initial_clusters):
+      return self._initial_clusters(self._inputs, self._num_remaining)
+    else:
+      with ops.control_dependencies([
+          check_ops.assert_equal(self._num_remaining,
+                                 array_ops.shape(self._initial_clusters)[0])
+      ]):
+        return self._initial_clusters
+
+  def _add_new_centers(self):
+    """Adds some centers and returns the number of centers remaining."""
+    new_centers = self._choose_initial_centers()
+    if self._distance_metric == COSINE_DISTANCE:
+      new_centers = nn_impl.l2_normalize(new_centers, dim=1)
+    # If cluster_centers is empty, it doesn't have the right shape for concat.
+    all_centers = control_flow_ops.cond(
+        math_ops.equal(self._num_selected, 0), lambda: new_centers,
+        lambda: array_ops.concat([self._cluster_centers, new_centers], 0))
+    # TODO(ccolby): De-dupe all_centers?
+    a = state_ops.assign(
+        self._cluster_centers, all_centers, validate_shape=False)
+    if self._cluster_centers_updated is not self._cluster_centers:
+      a = state_ops.assign(
+          self._cluster_centers_updated, a, validate_shape=False)
+    return self._num_clusters - array_ops.shape(a)[0]
+
+  def _initialize(self):
+    with ops.control_dependencies([
+        check_ops.assert_positive(self._num_remaining),
+    ]):
+      if self._initial_clusters == KMC2_INIT:
+        num_now_remaining = self._kmc2_multiple_centers()
+      else:
+        num_now_remaining = self._add_new_centers()
+      return control_flow_ops.cond(
+          math_ops.equal(num_now_remaining, 0),
+          lambda: state_ops.assign(self._cluster_centers_initialized, True),
+          control_flow_ops.no_op)
+
+  def op(self):
+    """Returns the cluster initializer op."""
+    return control_flow_ops.cond(
+        math_ops.equal(self._num_remaining, 0),
+        lambda: check_ops.assert_equal(self._cluster_centers_initialized, True),
+        self._initialize)
diff --git a/tensorflow/python/ops/clustering_ops_test.py b/tensorflow/python/ops/clustering_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5804c660e67eedf09b0dec6e599d1cf644156a9d
--- /dev/null
+++ b/tensorflow/python/ops/clustering_ops_test.py
@@ -0,0 +1,212 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not
+# use this file except in compliance with the License.  You may obtain a copy of
+# the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+# License for the specific language governing permissions and limitations under
+# the License.
+# ==============================================================================
+"""Tests for clustering_ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import clustering_ops
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class KmeansPlusPlusInitializationTest(test.TestCase):
+
+  # All but one input point are close to (101, 1). With uniform random sampling,
+  # it is highly improbable for (-1, -1) to be selected.
+  def setUp(self):
+    self._points = np.array([[100., 0.],
+                             [101., 2.],
+                             [102., 0.],
+                             [100., 1.],
+                             [100., 2.],
+                             [101., 0.],
+                             [101., 0.],
+                             [101., 1.],
+                             [102., 0.],
+                             [-1., -1.]]).astype(np.float32)
+
+  def runTestWithSeed(self, seed):
+    with self.cached_session():
+      sampled_points = clustering_ops.kmeans_plus_plus_initialization(
+          self._points, 3, seed, (seed % 5) - 1)
+      self.assertAllClose(
+          sorted(self.evaluate(sampled_points).tolist()),
+          [[-1., -1.], [101., 1.], [101., 1.]],
+          atol=1.0)
+
+  def testBasic(self):
+    for seed in range(100):
+      self.runTestWithSeed(seed)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class KMC2InitializationTest(test.TestCase):
+
+  def runTestWithSeed(self, seed):
+    with self.cached_session():
+      distances = np.zeros(1000).astype(np.float32)
+      distances[6] = 10e7
+      distances[4] = 10e3
+
+      sampled_point = clustering_ops.kmc2_chain_initialization(distances, seed)
+      self.assertAllEqual(sampled_point, 6)
+      distances[6] = 0.0
+      sampled_point = clustering_ops.kmc2_chain_initialization(distances, seed)
+      self.assertAllEqual(sampled_point, 4)
+
+  def testBasic(self):
+    for seed in range(100):
+      self.runTestWithSeed(seed)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class KMC2InitializationLargeTest(test.TestCase):
+
+  def setUp(self):
+    self._distances = np.zeros(1001)
+    self._distances[500] = 100.0
+    self._distances[1000] = 50.0
+
+  def testBasic(self):
+    with self.cached_session():
+      counts = {}
+      seed = 0
+      for i in range(50):
+        sample = self.evaluate(
+            clustering_ops.kmc2_chain_initialization(self._distances, seed + i))
+        counts[sample] = counts.get(sample, 0) + 1
+      self.assertEquals(len(counts), 2)
+      self.assertTrue(500 in counts)
+      self.assertTrue(1000 in counts)
+      self.assertGreaterEqual(counts[500], 5)
+      self.assertGreaterEqual(counts[1000], 5)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class KMC2InitializationCornercaseTest(test.TestCase):
+
+  def setUp(self):
+    self._distances = np.zeros(10)
+
+  def runTestWithSeed(self, seed):
+    with self.cached_session():
+      sampled_point = clustering_ops.kmc2_chain_initialization(
+          self._distances, seed)
+      self.assertAllEqual(sampled_point, 0)
+
+  def testBasic(self):
+    for seed in range(100):
+      self.runTestWithSeed(seed)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+# A simple test that can be verified by hand.
+class NearestCentersTest(test.TestCase):
+
+  def setUp(self):
+    self._points = np.array([[100., 0.],
+                             [101., 2.],
+                             [99., 2.],
+                             [1., 1.]]).astype(np.float32)
+
+    self._centers = np.array([[100., 0.],
+                              [99., 1.],
+                              [50., 50.],
+                              [0., 0.],
+                              [1., 1.]]).astype(np.float32)
+
+  def testNearest1(self):
+    with self.cached_session():
+      [indices, distances] = clustering_ops.nearest_neighbors(self._points,
+                                                              self._centers, 1)
+      self.assertAllClose(indices, [[0], [0], [1], [4]])
+      self.assertAllClose(distances, [[0.], [5.], [1.], [0.]])
+
+  def testNearest2(self):
+    with self.cached_session():
+      [indices, distances] = clustering_ops.nearest_neighbors(self._points,
+                                                              self._centers, 2)
+      self.assertAllClose(indices, [[0, 1], [0, 1], [1, 0], [4, 3]])
+      self.assertAllClose(distances, [[0., 2.], [5., 5.], [1., 5.], [0., 2.]])
+
+
+@test_util.run_all_in_graph_and_eager_modes
+# A test with large inputs.
+class NearestCentersLargeTest(test.TestCase):
+
+  def setUp(self):
+    num_points = 1000
+    num_centers = 2000
+    num_dim = 100
+    max_k = 5
+    # Construct a small number of random points and later tile them.
+    points_per_tile = 10
+    assert num_points % points_per_tile == 0
+    points = np.random.standard_normal(
+        [points_per_tile, num_dim]).astype(np.float32)
+    # Construct random centers.
+    self._centers = np.random.standard_normal(
+        [num_centers, num_dim]).astype(np.float32)
+
+    # Exhaustively compute expected nearest neighbors.
+    def squared_distance(x, y):
+      return np.linalg.norm(x - y, ord=2)**2
+
+    nearest_neighbors = [
+        sorted([(squared_distance(point, self._centers[j]), j)
+                for j in range(num_centers)])[:max_k] for point in points
+    ]
+    expected_nearest_neighbor_indices = np.array(
+        [[i for _, i in nn] for nn in nearest_neighbors])
+    expected_nearest_neighbor_squared_distances = np.array(
+        [[dist for dist, _ in nn] for nn in nearest_neighbors])
+    # Tile points and expected results to reach requested size (num_points)
+    (self._points, self._expected_nearest_neighbor_indices,
+     self._expected_nearest_neighbor_squared_distances) = (
+         np.tile(x, (int(num_points / points_per_tile), 1))
+         for x in (points, expected_nearest_neighbor_indices,
+                   expected_nearest_neighbor_squared_distances))
+
+  def testNearest1(self):
+    with self.cached_session():
+      [indices, distances] = clustering_ops.nearest_neighbors(self._points,
+                                                              self._centers, 1)
+      self.assertAllClose(
+          indices,
+          self._expected_nearest_neighbor_indices[:, [0]])
+      self.assertAllClose(
+          distances,
+          self._expected_nearest_neighbor_squared_distances[:, [0]])
+
+  def testNearest5(self):
+    with self.cached_session():
+      [indices, distances] = clustering_ops.nearest_neighbors(self._points,
+                                                              self._centers, 5)
+      self.assertAllClose(
+          indices,
+          self._expected_nearest_neighbor_indices[:, 0:5])
+      self.assertAllClose(
+          distances,
+          self._expected_nearest_neighbor_squared_distances[:, 0:5])
+
+
+if __name__ == "__main__":
+  np.random.seed(0)
+  test.main()
diff --git a/tensorflow/python/ops/collective_ops.py b/tensorflow/python/ops/collective_ops.py
index 98668facd5bc56892fa00f258dfebcbe93c063da..32a71fc25d370f4e96ef4036f4fdee3c670502d2 100644
--- a/tensorflow/python/ops/collective_ops.py
+++ b/tensorflow/python/ops/collective_ops.py
@@ -48,7 +48,7 @@ def all_reduce(t, group_size, group_key, instance_key, merge_op, final_op,
   if not device.canonical_name(t.device):
     raise ValueError('Device assignment required for collective ops')
   if group_size <= 1:
-    raise ValueError('Parameter group_size to add_reduce must be at least 2.')
+    raise ValueError('Parameter group_size to all_reduce must be at least 2.')
   return gen_collective_ops.collective_reduce(t,
                                               group_size=group_size,
                                               group_key=group_key,
@@ -58,6 +58,35 @@ def all_reduce(t, group_size, group_key, instance_key, merge_op, final_op,
                                               subdiv_offsets=subdiv_offsets)
 
 
+def all_gather(t, group_size, group_key, instance_key):
+  """Accumulates tensors collectively, across devices, along first dimension.
+
+  Args:
+    t: the tensor to participate in the accumulation.
+    group_size: the total number of tensors to be collectively accumulated.
+      Each must reside on a different device.
+    group_key: an integer identifying the group of devices.
+    instance_key: an integer identifying the participating group of Ops.
+
+  Returns:
+    An Op implementing the distributed operation.
+
+  Raises:
+    ValueError: if any of the input parameter constraints are not met.
+  """
+  if not device.canonical_name(t.device):
+    raise ValueError('Device assignment required for collective ops')
+  if group_size <= 1:
+    raise ValueError('Parameter group_size to all_gather must be at least 2.')
+  dims = t.shape.as_list()
+  output_shape = [dims[0] * group_size] + dims[1:]
+  return gen_collective_ops.collective_gather(t,
+                                              shape=output_shape,
+                                              group_size=group_size,
+                                              group_key=group_key,
+                                              instance_key=instance_key)
+
+
 def broadcast_send(t, shape, dtype, group_size, group_key, instance_key):
   """Broadcasts one tensor to a group of others, across devices.
 
diff --git a/tensorflow/python/ops/collective_ops_test.py b/tensorflow/python/ops/collective_ops_test.py
index 0fd9368d2194e875aa5c4ddfb716f0898d6a9c49..c9b376caf8f07236c4dde2bda2ba43c89a0ad8c2 100644
--- a/tensorflow/python/ops/collective_ops_test.py
+++ b/tensorflow/python/ops/collective_ops_test.py
@@ -4,7 +4,7 @@
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+#    http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
@@ -25,8 +25,6 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.ops import collective_ops
 from tensorflow.python.platform import test
 
-# TODO(tucker): Make these ops work in eager mode. b/79776476
-
 
 class CollectiveOpTest(test.TestCase):
 
@@ -50,6 +48,24 @@ class CollectiveOpTest(test.TestCase):
     self.assertAllClose(results[0], expected, rtol=1e-5, atol=1e-5)
     self.assertAllClose(results[1], expected, rtol=1e-5, atol=1e-5)
 
+  def _testMultipleConcurrentCollectiveReduce(self, t0, t1, expected):
+    group_key = 1
+    group_size = 2
+    num_instances = 2
+    all_reduces = []
+    config = config_pb2.ConfigProto(device_count={'CPU': group_size})
+    config.experimental.collective_deterministic_sequential_execution = True
+    with self.session(config=config) as sess:
+      for cpu in range(group_size):
+        with ops.device('/CPU:%d' % cpu):
+          in_tensor = constant_op.constant(t0 if cpu == 0 else t1)
+          for instance in range(num_instances):
+            all_reduces.append(collective_ops.all_reduce(
+                in_tensor, group_size, group_key, instance, 'Add', 'Div'))
+      results = sess.run(all_reduces)
+    for i in range(group_size * num_instances):
+      self.assertAllClose(results[i], expected, rtol=1e-5, atol=1e-5)
+
   @test_util.run_deprecated_v1
   def testCollectiveReduce(self):
     self._testCollectiveReduce([0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1],
@@ -62,6 +78,13 @@ class CollectiveOpTest(test.TestCase):
                                [0.3, 1.3, 2.3, 3.3, 4.3, 5.3, 6.3, 7.3],
                                [0.2, 1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2], False)
 
+  @test_util.run_deprecated_v1
+  def testCollectiveMultipleConcurrentReduce(self):
+    self._testMultipleConcurrentCollectiveReduce(
+        [0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1],
+        [0.3, 1.3, 2.3, 3.3, 4.3, 5.3, 6.3, 7.3],
+        [0.2, 1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2])
+
   @test_util.run_deprecated_v1
   def testCollectiveReduceScalar(self):
     self._testCollectiveReduce(0.1, 0.3, 0.2, True)
@@ -89,6 +112,42 @@ class CollectiveOpTest(test.TestCase):
   def testCollectiveBroadcast(self):
     self._testCollectiveBroadcast([0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1])
 
+  def _testCollectiveGather(self, t0, t1, expected, set_graph_key):
+    group_key = 1
+    instance_key = 1
+    with self.session(
+        config=config_pb2.ConfigProto(device_count={'CPU': 2})) as sess:
+      with ops.device('/CPU:0'):
+        in0 = constant_op.constant(t0)
+        colred0 = collective_ops.all_gather(in0, 2, group_key, instance_key)
+      with ops.device('/CPU:1'):
+        in1 = constant_op.constant(t1)
+        colred1 = collective_ops.all_gather(in1, 2, group_key, instance_key)
+      run_options = config_pb2.RunOptions()
+      if set_graph_key:
+        run_options.experimental.collective_graph_key = 1
+      results = sess.run([colred0, colred1], options=run_options)
+    self.assertAllClose(results[0], expected, rtol=1e-5, atol=1e-5)
+    self.assertAllClose(results[1], expected, rtol=1e-5, atol=1e-5)
+
+  @test_util.run_deprecated_v1
+  def testCollectiveGather(self):
+    self._testCollectiveGather([0, 1, 2, 3, 4, 5, 6, 7],
+                               [10, 11, 12, 13, 14, 15, 16, 17],
+                               [0, 1, 2, 3, 4, 5, 6, 7,
+                                10, 11, 12, 13, 14, 15, 16, 17],
+                               True)
+    self._testCollectiveGather([[0, 1, 2, 3], [4, 5, 6, 7]],
+                               [[10, 11, 12, 13], [14, 15, 16, 17]],
+                               [[0, 1, 2, 3], [4, 5, 6, 7],
+                                [10, 11, 12, 13], [14, 15, 16, 17]],
+                               True)
+    self._testCollectiveGather([[[0, 1], [2, 3]], [[4, 5], [6, 7]]],
+                               [[[10, 11], [12, 13]], [[14, 15], [16, 17]]],
+                               [[[0, 1], [2, 3]], [[4, 5], [6, 7]],
+                                [[10, 11], [12, 13]], [[14, 15], [16, 17]]],
+                               True)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/ops/cond_v2.py b/tensorflow/python/ops/cond_v2.py
index abc99c1205159bd4eb87e3a378fe95693ac84aa7..74f5b52f18ce92a747f5dcdf40bed247a2fc4831 100644
--- a/tensorflow/python/ops/cond_v2.py
+++ b/tensorflow/python/ops/cond_v2.py
@@ -35,7 +35,8 @@ from tensorflow.python.ops import control_flow_util_v2 as util
 from tensorflow.python.ops import gen_dataset_ops
 from tensorflow.python.ops import gen_functional_ops
 from tensorflow.python.ops import gen_resource_variable_ops
-from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import gradients_util
+from tensorflow.python.ops import math_ops
 from tensorflow.python.util import nest
 
 
@@ -61,41 +62,40 @@ def cond_v2(pred, true_fn, false_fn, name="cond"):
 
     # Automatic control dependencies are added in defuns, but not in v1
     # graphs. Propagate that behavior here.
-    add_control_dependencies = util.in_defun()
+    add_control_dependencies = ops.get_default_graph()._add_control_dependencies
     pred = ops.convert_to_tensor(pred)
 
     true_graph = func_graph_module.func_graph_from_py_func(
         true_name,
         true_fn, [], {},
         func_graph=util.CondBranchFuncGraph(
-            true_name, read_only_collections=False),
+            true_name, collections=ops.get_default_graph()._collections),  # pylint: disable=protected-access
         add_control_dependencies=add_control_dependencies,
         op_return_value=pred)
     false_graph = func_graph_module.func_graph_from_py_func(
         false_name,
         false_fn, [], {},
         func_graph=util.CondBranchFuncGraph(
-            false_name, read_only_collections=False),
+            false_name, collections=ops.get_default_graph()._collections),  # pylint: disable=protected-access
         add_control_dependencies=add_control_dependencies,
         op_return_value=pred)
 
-    outputs = _build_cond(pred, true_graph, false_graph,
-                          true_graph.external_captures,
-                          false_graph.external_captures,
-                          name=scope)
-
-    return func_graph_module.pack_sequence_as(true_graph.structured_outputs,
-                                              outputs)
+    return _build_cond(pred, true_graph, false_graph,
+                       true_graph.external_captures,
+                       false_graph.external_captures,
+                       name=scope)
 
 
 @ops.RegisterGradient("If")
 def _IfGrad(op, *grads):  # pylint: disable=invalid-name
   """The gradient of an If op produced by cond_v2."""
-  true_graph, false_graph = _get_func_graphs(op)
+  # Get the if operator (this logic handles the case where op is a MockOp)
+  if_op = op.outputs[0].op
+  true_graph, false_graph = _get_func_graphs(if_op)
   # Note: op.graph != ops.get_default_graph() when we are computing the gradient
   # of a nested cond.
-  assert true_graph.outer_graph == op.graph
-  assert false_graph.outer_graph == op.graph
+  assert true_graph.outer_graph == if_op.graph
+  assert false_graph.outer_graph == if_op.graph
 
   # Create grad functions that compute the gradient of the true/false forward
   # graphs. These functions will capture tensors from the forward pass
@@ -105,9 +105,6 @@ def _IfGrad(op, *grads):  # pylint: disable=invalid-name
   false_grad_graph = _create_grad_func(
       false_graph, grads, util.unique_grad_fn_name(false_graph.name))
 
-  assert ([t.dtype for t in true_grad_graph.outputs] ==
-          [t.dtype for t in false_grad_graph.outputs])
-
   if (true_grad_graph.if_op_needs_rewrite or
       false_grad_graph.if_op_needs_rewrite):
     # Modify 'op' to output the intermediates needed by the grad functions. Note
@@ -140,11 +137,12 @@ def _IfGrad(op, *grads):  # pylint: disable=invalid-name
     true_graph.name += "_rewritten"
     false_graph.name += "_rewritten"
 
-    op._set_func_attr("then_branch", util.create_new_tf_function(true_graph))
-    op._set_func_attr("else_branch", util.create_new_tf_function(false_graph))
-    op._set_type_list_attr("Tout", true_graph.output_types)
-    op._set_shape_list_attr("output_shapes", true_graph.output_shapes)
-    op._add_outputs(
+    if_op._set_func_attr("then_branch", util.create_new_tf_function(true_graph))
+    if_op._set_func_attr("else_branch",
+                         util.create_new_tf_function(false_graph))
+    if_op._set_type_list_attr("Tout", true_graph.output_types)
+    if_op._set_shape_list_attr("output_shapes", true_graph.output_shapes)
+    if_op._add_outputs(
         [t.dtype for t in extra_true_outputs],
         [t.shape for t in extra_true_outputs])
 
@@ -153,7 +151,10 @@ def _IfGrad(op, *grads):  # pylint: disable=invalid-name
   true_grad_inputs = _resolve_grad_inputs(true_graph, true_grad_graph)
   false_grad_inputs = _resolve_grad_inputs(false_graph, false_grad_graph)
 
-  outputs = _build_cond(op.inputs[0], true_grad_graph, false_grad_graph,
+  # This modifies true_grad_graph and false_grad_graph.
+  _make_output_composite_tensors_match(true_grad_graph, false_grad_graph)
+
+  outputs = _build_cond(if_op.inputs[0], true_grad_graph, false_grad_graph,
                         true_grad_inputs, false_grad_inputs)
 
   # The predicate has no gradient.
@@ -216,7 +217,8 @@ def _build_cond(pred, true_graph, false_graph, true_inputs, false_inputs,
 
   # Prevent fetching since the variant outputs can't be fetched directly.
   if_op.graph.prevent_fetching(if_op)
-  return tensors
+  return func_graph_module.pack_sequence_as(true_graph.structured_outputs,
+                                            tensors)
 
 
 def _get_func_graphs(if_op):
@@ -275,7 +277,7 @@ def _grad_fn(func_graph, grads):
   ys = []
   grad_ys = []
   for y, grad_y in zip(func_graph.outputs, grads):
-    if not gradients_impl.IsTrainable(y):
+    if not gradients_util.IsTrainable(y):
       continue
     ys.append(y)
     grad_ys.append(grad_y)
@@ -284,7 +286,7 @@ def _grad_fn(func_graph, grads):
   # func_graph in the current graph, which requires capturing tensors from
   # func_graph. The captured func_graph tensors are resolved to external tensors
   # in _resolve_grad_inputs.
-  result = gradients_impl._GradientsHelper(
+  result = gradients_util._GradientsHelper(
       ys, func_graph.inputs, grad_ys=grad_ys,
       src_graph=func_graph)
 
@@ -472,6 +474,50 @@ def _make_inputs_match(true_graph, false_graph, true_inputs, false_inputs):
   return new_inputs
 
 
+def _make_output_composite_tensors_match(true_graph, false_graph):
+  """Rewrites {true,false}_graph's outputs to use the same _TensorLike classes.
+
+  Currently the only transformation implemented is turning a Tensor into an
+  equivalent IndexedSlices if the other branch returns an IndexedSlices.
+  Updates {true,false}_graph.{outputs,structured_outputs}.
+
+  Args:
+    true_graph: FuncGraph
+    false_graph: FuncGraph
+
+  Raises:
+    TypeError: if a pair of outputs cannot be rewritten.
+  """
+  # Note: since this is only used for gradient graphs, we do not expect the
+  # outputs to be structured (e.g. nested lists), and thus do not need to use
+  # nest.flatten, etc.
+  true_outputs = list(true_graph.structured_outputs)
+  false_outputs = list(false_graph.structured_outputs)
+  assert len(true_outputs) == len(false_outputs)
+
+  for idx, (true_out, false_out) in enumerate(zip(true_outputs, false_outputs)):
+    if type(true_out) == type(false_out):  # pylint: disable=unidiomatic-typecheck
+      continue
+    if (isinstance(true_out, ops.IndexedSlices) and
+        isinstance(false_out, ops.Tensor)):
+      with false_graph.as_default():
+        false_outputs[idx] = math_ops._as_indexed_slices(false_out)
+    elif (isinstance(true_out, ops.Tensor) and
+          isinstance(false_out, ops.IndexedSlices)):
+      with true_graph.as_default():
+        true_outputs[idx] = math_ops._as_indexed_slices(true_out)
+    else:
+      raise TypeError(
+          "Cannot reconcile tf.cond %i-th outputs:\n"
+          "  true_fn returned:  %s\n"
+          "  false_fn returned: %s" % (idx, true_out, false_out))
+
+  true_graph.structured_outputs = true_outputs
+  true_graph.outputs = func_graph_module.flatten(true_outputs)
+  false_graph.structured_outputs = false_outputs
+  false_graph.outputs = func_graph_module.flatten(false_outputs)
+
+
 def _wrap_intermediates(func_graph, intermediates):
   with func_graph.as_default():
     return [gen_dataset_ops.optional_from_value([t]) for t in intermediates]
@@ -515,23 +561,30 @@ def _create_fakeparams(func_graph, template_tensors):
 
 def _check_same_outputs(true_graph, false_graph):
   """Raises an error if true_graph and false_graph have different outputs."""
-  true_output_types = [t.dtype for t in true_graph.outputs]
-  false_output_types = [t.dtype for t in false_graph.outputs]
-  if (len(true_graph.outputs) != len(false_graph.outputs) or
-      true_output_types != false_output_types):
+
+  def error(error_detail):
     raise TypeError(
-        "true_fn() and false_fn() must return the same number and type of "
-        "arguments, got:\n"
-        "  true_fn: %s\n"
-        "  false_fn: %s" % (true_output_types, false_output_types))
+        "true_fn and false_fn arguments to tf.cond must have the same number, "
+        "type, and overall structure of return values.\n"
+        "\n"
+        "true_fn output:  %s\n"
+        "false_fn output: %s\n"
+        "\n"
+        "Error details:\n"
+        "%s" % (true_graph.structured_outputs, false_graph.structured_outputs,
+                error_detail))
 
-  # Make sure `structured_outputs` for both graphs have the same structure.
   try:
     nest.assert_same_structure(true_graph.structured_outputs,
-                               false_graph.structured_outputs)
+                               false_graph.structured_outputs,
+                               expand_composites=True)
   except (ValueError, TypeError) as e:
-    raise ValueError("Outputs of true_fn and false_fn must have the same "
-                     "structure: %s" % str(e))
+    error(str(e))
+
+  assert len(true_graph.outputs) == len(false_graph.outputs)
+  for true_out, false_out in zip(true_graph.outputs, false_graph.outputs):
+    if true_out.dtype != false_out.dtype:
+      error("%s and %s have different types" % (true_out, false_out))
 
 
 def _get_output_shapes(true_graph_outputs, false_graph_outputs):
@@ -554,7 +607,8 @@ class _CondGradFuncGraph(util.CondBranchFuncGraph):
   """
 
   def __init__(self, name, forward_graph):
-    super(_CondGradFuncGraph, self).__init__(name, read_only_collections=False)
+    super(_CondGradFuncGraph, self).__init__(
+        name, collections=ops.get_default_graph()._collections)  # pylint: disable=protected-access
     self.if_op_needs_rewrite = False
     self._forward_graph = forward_graph
     # Maps from forward intermediate tensor -> the unwrapped captured
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index b7e50c1dae5ac1dc0968a3badb8f017e6b0384e1..32a5db2c1ae0687f3c9954e943735a7748a2b777 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -24,19 +24,17 @@ from __future__ import print_function
 import abc
 import collections
 import functools
-import os
 
 import six
 
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.core.protobuf import control_flow_pb2
-from tensorflow.python import tf2
 from tensorflow.python.eager import context
+from tensorflow.python.framework import composite_tensor
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
@@ -61,19 +59,16 @@ from tensorflow.python.util.lazy_loader import LazyLoader
 from tensorflow.python.util.tf_export import tf_export
 
 # This is to avoid a circular dependency:
-# cond_v2 -> gradients_impl -> control_flow_ops
+# cond_v2 -> gradients_util -> control_flow_ops
 cond_v2 = LazyLoader("cond_v2", globals(),
                      "tensorflow.python.ops.cond_v2")
 
 # This is to avoid circular dependencies:
 # while_v2 -> control_flow_ops
-# while_v2 -> gradients_impl -> control_flow_ops
+# while_v2 -> gradients_util -> control_flow_ops
 while_v2 = LazyLoader("while_v2", globals(),
                       "tensorflow.python.ops.while_v2")
 
-ENABLE_COND_V2 = tf2.enabled() or os.getenv("TF_ENABLE_COND_V2", "0") != "0"
-ENABLE_WHILE_V2 = tf2.enabled() or os.getenv("TF_ENABLE_WHILE_V2", "0") != "0"
-
 # We override the 'tuple' for a control flow op, so we keep python's
 # existing 'tuple' for later use in this module.
 _basetuple = tuple
@@ -186,47 +181,29 @@ def _Identity(data, name=None):
   Returns:
     A Tensor with the same type and value as the input Tensor.
   """
-  data = ops.internal_convert_to_tensor_or_indexed_slices(data, as_ref=True)
+  data = ops.internal_convert_to_tensor_or_composite(data, as_ref=True)
   if isinstance(data, ops.Tensor):
     if data.dtype._is_ref_dtype:  # pylint: disable=protected-access
       return gen_array_ops.ref_identity(data, name=name)
     else:
       return array_ops.identity(data, name=name)
+  elif isinstance(data, composite_tensor.CompositeTensor):
+    return nest.map_structure(_Identity, data, expand_composites=True)
   else:
-    if not isinstance(data, (ops.IndexedSlices, sparse_tensor.SparseTensor)):
-      raise TypeError("Type %s not supported" % type(data))
-    values = _Identity(data.values, name=name)
-    indices = array_ops.identity(data.indices, name="indices")
-    if isinstance(data, ops.IndexedSlices):
-      dense_shape = data.dense_shape
-      if dense_shape is not None:
-        dense_shape = array_ops.identity(dense_shape, name="dense_shape")
-      return ops.IndexedSlices(values, indices, dense_shape)
-    else:
-      dense_shape = array_ops.identity(data.dense_shape, name="dense_shape")
-      return sparse_tensor.SparseTensor(indices, values, dense_shape)
+    raise TypeError("Type %s not supported" % type(data))
 
 
 def _NextIteration(data, name=None):
-  data = ops.internal_convert_to_tensor_or_indexed_slices(data, as_ref=True)
+  data = ops.internal_convert_to_tensor_or_composite(data, as_ref=True)
   if isinstance(data, ops.Tensor):
     if data.dtype._is_ref_dtype:  # pylint: disable=protected-access
       return ref_next_iteration(data, name=name)
     else:
       return next_iteration(data, name=name)
+  elif isinstance(data, composite_tensor.CompositeTensor):
+    return nest.map_structure(_NextIteration, data, expand_composites=True)
   else:
-    if not isinstance(data, (ops.IndexedSlices, sparse_tensor.SparseTensor)):
-      raise TypeError("Type %s not supported" % type(data))
-    values = _NextIteration(data.values, name=name)
-    indices = next_iteration(data.indices, name="indices")
-    if isinstance(data, ops.IndexedSlices):
-      dense_shape = data.dense_shape
-      if dense_shape is not None:
-        dense_shape = next_iteration(dense_shape, name="dense_shape")
-      return ops.IndexedSlices(values, indices, dense_shape)
-    else:
-      dense_shape = next_iteration(data.dense_shape, name="dense_shape")
-      return sparse_tensor.SparseTensor(indices, values, dense_shape)
+    raise TypeError("Type %s not supported" % type(data))
 
 
 def _Enter(data,
@@ -249,12 +226,13 @@ def _Enter(data,
     is_constant: If true, the output is constant within the child frame.
     parallel_iterations: The number of iterations allowed to run in parallel.
     use_ref: If true, use ref_enter if data is of ref type.
+    use_input_shape: If true, set the result's shape based on data's shape.
     name: A name for this operation (optional).
 
   Returns:
     The same tensor as `data`.
   """
-  data = ops.internal_convert_to_tensor_or_indexed_slices(data, as_ref=True)
+  data = ops.internal_convert_to_tensor_or_composite(data, as_ref=True)
   if isinstance(data, ops.Tensor):
     if data.dtype._is_ref_dtype and use_ref:  # pylint: disable=protected-access
       result = gen_control_flow_ops.ref_enter(
@@ -265,46 +243,13 @@ def _Enter(data,
     if use_input_shape:
       result.set_shape(data.get_shape())
     return result
+  elif isinstance(data, composite_tensor.CompositeTensor):
+    def enter_component(t):
+      return _Enter(t, frame_name, is_constant, parallel_iterations,
+                    use_ref, use_input_shape)
+    return nest.map_structure(enter_component, data, expand_composites=True)
   else:
-    if not isinstance(data, (ops.IndexedSlices, sparse_tensor.SparseTensor)):
-      raise TypeError("Type %s not supported" % type(data))
-    values = _Enter(
-        data.values,
-        frame_name,
-        is_constant,
-        parallel_iterations=parallel_iterations,
-        use_input_shape=use_input_shape,
-        name=name)
-    indices = gen_control_flow_ops.enter(
-        data.indices,
-        frame_name,
-        is_constant,
-        parallel_iterations,
-        name="indices")
-    if use_input_shape:
-      indices.set_shape(data.indices.get_shape())
-    if isinstance(data, ops.IndexedSlices):
-      dense_shape = data.dense_shape
-      if dense_shape is not None:
-        dense_shape = gen_control_flow_ops.enter(
-            dense_shape,
-            frame_name,
-            is_constant,
-            parallel_iterations,
-            name="dense_shape")
-        if use_input_shape:
-          dense_shape.set_shape(data.dense_shape.get_shape())
-      return ops.IndexedSlices(values, indices, dense_shape)
-    else:
-      dense_shape = gen_control_flow_ops.enter(
-          data.dense_shape,
-          frame_name,
-          is_constant,
-          parallel_iterations,
-          name="dense_shape")
-      if use_input_shape:
-        dense_shape.set_shape(data.dense_shape.get_shape())
-      return sparse_tensor.SparseTensor(indices, values, dense_shape)
+    raise TypeError("Type %s not supported" % type(data))
 
 
 def exit(data, name=None):  # pylint: disable=redefined-builtin
@@ -319,25 +264,16 @@ def exit(data, name=None):  # pylint: disable=redefined-builtin
   Returns:
     The same tensor as `data`.
   """
-  data = ops.internal_convert_to_tensor_or_indexed_slices(data, as_ref=True)
+  data = ops.internal_convert_to_tensor_or_composite(data, as_ref=True)
   if isinstance(data, ops.Tensor):
     if data.dtype._is_ref_dtype:  # pylint: disable=protected-access
       return gen_control_flow_ops.ref_exit(data, name)
     else:
       return gen_control_flow_ops._exit(data, name)
+  elif isinstance(data, composite_tensor.CompositeTensor):
+    return nest.map_structure(exit, data, expand_composites=True)
   else:
-    if not isinstance(data, (ops.IndexedSlices, sparse_tensor.SparseTensor)):
-      raise TypeError("Type %s not supported" % type(data))
-    values = exit(data.values, name=name)
-    indices = gen_control_flow_ops._exit(data.indices, name="indices")
-    if isinstance(data, ops.IndexedSlices):
-      dense_shape = data.dense_shape
-      if dense_shape is not None:
-        dense_shape = gen_control_flow_ops._exit(dense_shape, name)
-      return ops.IndexedSlices(values, indices, dense_shape)
-    else:
-      dense_shape = gen_control_flow_ops._exit(data.dense_shape, name)
-      return sparse_tensor.SparseTensor(indices, values, dense_shape)
+    raise TypeError("Type %s not supported" % type(data))
 
 
 def switch(data, pred, dtype=None, name=None):
@@ -360,32 +296,19 @@ def switch(data, pred, dtype=None, name=None):
     to `output_true`, otherwise it goes to `output_false`.
   """
   with ops.name_scope(name, "Switch", [data, pred]) as name:
-    data = ops.internal_convert_to_tensor_or_indexed_slices(
+    data = ops.internal_convert_to_tensor_or_composite(
         data, dtype=dtype, name="data", as_ref=True)
     pred = ops.convert_to_tensor(pred, name="pred")
     if isinstance(data, ops.Tensor):
       return gen_control_flow_ops.switch(data, pred, name=name)
     else:
-      if not isinstance(data, (ops.IndexedSlices, sparse_tensor.SparseTensor)):
+      if not isinstance(data, composite_tensor.CompositeTensor):
         raise TypeError("Type %s not supported" % type(data))
-      val, ind = data.values, data.indices
-      val_f, val_t = gen_control_flow_ops.switch(val, pred, name=name)
-      ind_f, ind_t = gen_control_flow_ops.switch(ind, pred, name="indices")
-      if isinstance(data, ops.IndexedSlices):
-        dense_shape = data.dense_shape
-        if dense_shape is not None:
-          dense_shape_f, dense_shape_t = gen_control_flow_ops.switch(
-              dense_shape, pred, name="dense_shape")
-        else:
-          dense_shape_f, dense_shape_t = None, None
-        return (ops.IndexedSlices(val_f, ind_f, dense_shape_f),
-                ops.IndexedSlices(val_t, ind_t, dense_shape_t))
-      else:
-        dense_shape = data.dense_shape
-        dense_shape_f, dense_shape_t = gen_control_flow_ops.switch(
-            data.dense_shape, pred, name="dense_shape")
-        return (sparse_tensor.SparseTensor(ind_f, val_f, dense_shape_f),
-                sparse_tensor.SparseTensor(ind_t, val_t, dense_shape_t))
+      tensors = nest.flatten(data, expand_composites=True)
+      mapped = [gen_control_flow_ops.switch(tensor, pred) for tensor in tensors]
+      mapped_f, mapped_t = zip(*mapped)
+      return (nest.pack_sequence_as(data, mapped_f, expand_composites=True),
+              nest.pack_sequence_as(data, mapped_t, expand_composites=True))
 
 
 def _SwitchRefOrTensor(data, pred, name="Switch"):
@@ -408,7 +331,7 @@ def _SwitchRefOrTensor(data, pred, name="Switch"):
   Raises:
     TypeError: if data is not a Tensor or IndexedSlices
   """
-  data = ops.convert_to_tensor_or_indexed_slices(data, name="data")
+  data = ops.convert_to_tensor_or_composite(data, name="data")
   # NOTE(vrv): ops.colocate_with(data, ignore_existing=True) below
   # addresses the following scenario.
   #
@@ -461,7 +384,7 @@ def merge(inputs, name=None):
     raise ValueError("At least one of the merge inputs is None: %s" % inputs)
   with ops.name_scope(name, "Merge", inputs) as name:
     inputs = [
-        ops.internal_convert_to_tensor_or_indexed_slices(inp, as_ref=True)
+        ops.internal_convert_to_tensor_or_composite(inp, as_ref=True)
         for inp in inputs
     ]
     if all(isinstance(v, ops.Tensor) for v in inputs):
@@ -469,30 +392,27 @@ def merge(inputs, name=None):
         return gen_control_flow_ops.ref_merge(inputs, name)
       else:
         return gen_control_flow_ops.merge(inputs, name)
-    elif all(isinstance(v, sparse_tensor.SparseTensor) for v in inputs):
-      # Only handle the case when all inputs are SparseTensor.
-      values, _ = merge([inp.values for inp in inputs], name=name)
-      indices, chosen_index = gen_control_flow_ops.merge(
-          [inp.indices for inp in inputs], name="indices")
-      dense_shape, _ = gen_control_flow_ops.merge(
-          [inp.dense_shape for inp in inputs], name="dense_shape")
-      return (sparse_tensor.SparseTensor(indices, values, dense_shape),
-              chosen_index)
     else:
-      # For now convert all the inputs as IndexedSlices.
-      inputs = math_ops._as_indexed_slices_list(inputs, optimize=False)
-      values, _ = merge([inp.values for inp in inputs], name=name)
-      indices, chosen_index = gen_control_flow_ops.merge(
-          [inp.indices for inp in inputs], name="indices")
-      if any(inp.dense_shape is not None for inp in inputs):
-        if any(inp.dense_shape is None for inp in inputs):
-          raise ValueError("Either all merged IndexedSlices must have a "
-                           "dense_shape, or none must have a dense_shape.")
-        dense_shape, _ = gen_control_flow_ops.merge(
-            [inp.dense_shape for inp in inputs], name="dense_shape")
-      else:
-        dense_shape = None
-      return ops.IndexedSlices(values, indices, dense_shape), chosen_index
+      # If there is a mix of tensors and indexed slices, then convert the
+      # tensors to indexed slices.
+      if all(isinstance(v, (ops.IndexedSlices, ops.Tensor)) for v in inputs):
+        inputs = math_ops._as_indexed_slices_list(inputs, optimize=False)
+
+      for v in inputs:
+        if not isinstance(v, composite_tensor.CompositeTensor):
+          raise TypeError("Type %s not supported" % type(v))
+
+      for v in inputs[1:]:
+        nest.assert_same_structure(inputs[0], v, expand_composites=True)
+
+      flat_inputs = [nest.flatten(v, expand_composites=True) for v in inputs]
+      merged_results = [gen_control_flow_ops.merge(component)
+                        for component in zip(*flat_inputs)]
+      flat_merged = [tensor for (tensor, _) in merged_results]
+      chosen_index = merged_results[0][1]
+      merged_inputs = nest.pack_sequence_as(inputs[0], flat_merged,
+                                            expand_composites=True)
+      return (merged_inputs, chosen_index)
 
 
 # pylint: enable=protected-access
@@ -542,6 +462,30 @@ def _ShapeLessThanOrEqual(shape1, shape2):
   return True
 
 
+def _get_shape_invariant(var, shape=None):
+  """Returns a shape invariant for the given variable.
+
+  If `var` is a `CompositeTensor`, then this uses
+  `_shape_invariant_to_components()` to get shape invariants for the
+  component tensors.
+
+  Args:
+    var: The tensor whose shape is described.
+    shape: The shape invariant for the tensor.  If not specified, then a default
+      shape invariant for `var` is returned.
+
+  Returns:
+    The shape invariant for `var` (if it is a `Tensor`), or the shape invariants
+    for the components that comprise `var` (if it is a `CompositeTensor`).
+  """
+  if isinstance(var, composite_tensor.CompositeTensor):
+    return var._shape_invariant_to_components(shape)  # pylint: disable=protected-access
+  elif shape is None:
+    return var.shape
+  else:
+    return shape
+
+
 def _SetShapeInvariants(input_vars, enter_vars, shapes):
   """Set the shapes of the tensors in `enter_vars` to `shapes`.
 
@@ -571,31 +515,7 @@ def _SetShapeInvariants(input_vars, enter_vars, shapes):
             (inp.name, inp.get_shape(), shape))
       var.set_shape(shape)
     else:
-      if not isinstance(var, (ops.IndexedSlices, sparse_tensor.SparseTensor)):
-        raise TypeError("Type %s not supported" % type(var))
-      if isinstance(var, ops.IndexedSlices):
-        if not _ShapeLessThanOrEqual(inp.values.get_shape(), shape):
-          raise ValueError(
-              "The shape invariant specified for %s is not compatible with "
-              "the initial shape of the values tensor of this IndexedSlices. "
-              "It enters the loop with shape %s, but the specified shape "
-              "invariant is %s." % (inp.values.name, inp.values.get_shape(),
-                                    shape))
-        var.values.set_shape(shape)
-        var.indices.set_shape(tensor_shape.TensorShape([shape[0]]))
-        if var.dense_shape is not None:
-          var.dense_shape.set_shape(tensor_shape.TensorShape([shape.ndims]))
-      else:
-        if not _ShapeLessThanOrEqual(inp.dense_shape.get_shape(), shape):
-          raise ValueError(
-              "The shape invariant specified for %s is not compatible with "
-              "the initial shape of the shape tensor of this SparseTensor. "
-              "It enters the loop with shape %s, but the specified shape "
-              "invariant is %s." % (inp.dense_shape.name,
-                                    inp.dense_shape.get_shape(), shape))
-        var.values.set_shape(tensor_shape.TensorShape([None]))
-        var.indices.set_shape(tensor_shape.TensorShape([None, shape.ndims]))
-        var.dense_shape.set_shape(shape)
+      raise TypeError("Type %s not supported" % type(var))
 
 
 def _EnforceShapeInvariant(merge_var, next_var):
@@ -624,49 +544,7 @@ def _EnforceShapeInvariant(merge_var, next_var):
           "use the `shape_invariants` argument of tf.while_loop to specify a "
           "less-specific shape." % (input_t.name, input_t.shape, n_shape))
   else:
-    if not isinstance(merge_var,
-                      (ops.IndexedSlices, sparse_tensor.SparseTensor)):
-      raise TypeError("Type %s not supported" % type(merge_var))
-    if isinstance(merge_var, ops.IndexedSlices):
-      m_values_shape = merge_var.values.get_shape()
-      m_indices_shape = merge_var.indices.get_shape()
-      m_shape_shape = tensor_shape.TensorShape(None)
-      if merge_var.dense_shape is not None:
-        m_shape_shape = merge_var.dense_shape.get_shape()
-      n_values_shape = next_var.values.get_shape()
-      n_indices_shape = next_var.indices.get_shape()
-      n_shape_shape = tensor_shape.TensorShape(None)
-      if next_var.dense_shape is not None:
-        n_shape_shape = next_var.dense_shape.get_shape()
-      if (not _ShapeLessThanOrEqual(n_values_shape, m_values_shape) or
-          not _ShapeLessThanOrEqual(n_indices_shape, m_indices_shape)):
-        if not _ShapeLessThanOrEqual(n_values_shape, m_values_shape):
-          raise ValueError(
-              "The shape for %s is not an invariant for the loop. It enters "
-              "the loop with shape (%s, %s, %s), but has shape (%s, %s, %s) "
-              "after one iteration. Provide shape invariants using either the "
-              "`shape_invariants` argument of tf.while_loop or set_shape() "
-              "on the loop variables." %
-              (merge_var.name, m_values_shape, m_indices_shape, m_shape_shape,
-               n_values_shape, n_indices_shape, n_shape_shape))
-    else:
-      m_values_shape = merge_var.values.get_shape()
-      m_indices_shape = merge_var.indices.get_shape()
-      m_shape_shape = merge_var.dense_shape.get_shape()
-      n_values_shape = next_var.values.get_shape()
-      n_indices_shape = next_var.indices.get_shape()
-      n_shape_shape = next_var.dense_shape.get_shape()
-      if (not _ShapeLessThanOrEqual(n_values_shape, m_values_shape) or
-          not _ShapeLessThanOrEqual(n_indices_shape, m_indices_shape) or
-          not _ShapeLessThanOrEqual(n_shape_shape, m_shape_shape)):
-        raise ValueError(
-            "The shape for %s is not an invariant for the loop. It enters "
-            "the loop with shape (%s, %s, %s), but has shape (%s, %s, %s) "
-            "after one iteration. Provide shape invariants using either "
-            "the `shape_invariants` argument of tf.while_loop or set_shape() "
-            "on the loop variables." %
-            (merge_var.name, m_values_shape, m_indices_shape, m_shape_shape,
-             n_values_shape, n_indices_shape, n_shape_shape))
+    raise TypeError("Type %s not supported" % type(merge_var))
 
 
 def _AddNextAndBackEdge(m, v, enforce_shape_invariant=True):
@@ -681,26 +559,15 @@ def _AddNextAndBackEdge(m, v, enforce_shape_invariant=True):
       # TODO(skyewm): call this for other cases below (needs testing)
       _EnforceShapeInvariant(m, v)
     m.op._update_input(1, v)  # pylint: disable=protected-access
-  elif isinstance(m, ops.IndexedSlices):
+  elif isinstance(m, composite_tensor.CompositeTensor):
     # pylint: disable=protected-access
-    v = math_ops._as_indexed_slices(v, optimize=False)
-    v = _NextIteration(v)
-    m.values.op._update_input(1, v.values)
-    m.indices.op._update_input(1, v.indices)
+    def update_component(m_component, v_component):
+      m_component.op._update_input(1, v_component)
+    if isinstance(m, ops.IndexedSlices):
+      v = math_ops._as_indexed_slices(v, optimize=False)
     # pylint: enable=protected-access
-    if m.dense_shape is not None:
-      if v.dense_shape is None:
-        raise ValueError("Must have dense shape: %s" % v.name)
-      m.dense_shape.op._update_input(1, v.dense_shape)
-  elif isinstance(m, sparse_tensor.SparseTensor):
-    if not isinstance(v, sparse_tensor.SparseTensor):
-      raise ValueError("Must be a sparse tensor: %s" % v.name)
     v = _NextIteration(v)
-    # pylint: disable=protected-access
-    m.values.op._update_input(1, v.values)
-    m.indices.op._update_input(1, v.indices)
-    m.dense_shape.op._update_input(1, v.dense_shape)
-    # pylint: enable=protected-access
+    return nest.map_structure(update_component, m, v, expand_composites=True)
   else:
     raise TypeError("Type %s not supported" % type(m))
   return v
@@ -1618,7 +1485,8 @@ class ControlFlowContext(object):
   def ExitResult(self, result):
     """Make a list of tensors available in the outer context."""
     if self._outer_context:
-      nest.map_structure(lambda x: self._outer_context.AddName(x.name), result)
+      nest.map_structure(lambda x: self._outer_context.AddName(x.name), result,
+                         expand_composites=True)
 
   def GetWhileContext(self):
     """Return the while context containing this context."""
@@ -1925,19 +1793,9 @@ class CondContext(ControlFlowContext):
     if isinstance(v, ops.Operation):
       # Use pivot as the proxy for this op.
       return with_dependencies([v], self._pivot)
-    elif isinstance(v, (ops.IndexedSlices, sparse_tensor.SparseTensor)):
-      values = self._ProcessOutputTensor(v.values)
-      indices = self._ProcessOutputTensor(v.indices)
-      if isinstance(v, ops.IndexedSlices):
-        dense_shape = v.dense_shape
-        if dense_shape is not None:
-          dense_shape = self._ProcessOutputTensor(dense_shape)
-        return ops.IndexedSlices(values, indices, dense_shape)
-      else:
-        dense_shape = self._ProcessOutputTensor(v.dense_shape)
-        return sparse_tensor.SparseTensor(indices, values, dense_shape)
     else:
-      v = nest.map_structure(_convert_tensorarray_to_flow, v)
+      v = nest.map_structure(_convert_tensorarray_to_flow, v,
+                             expand_composites=True)
       return self._ProcessOutputTensor(ops.convert_to_tensor(v))
 
   def BuildCondBranch(self, fn):
@@ -1954,11 +1812,13 @@ class CondContext(ControlFlowContext):
           return no_op(), None
         else:
           original_result = nest.map_structure(array_ops.identity,
-                                               original_result)
+                                               original_result,
+                                               expand_composites=True)
     if original_result is None:
       return None, None
 
-    result = nest.map_structure(self._BuildCondTensor, original_result)
+    result = nest.map_structure(self._BuildCondTensor, original_result,
+                                expand_composites=True)
     if not isinstance(result, (list, _basetuple)):
       result = [result]
     return original_result, result
@@ -2052,7 +1912,9 @@ def cond(pred,
   ```
 
   """
-  if ENABLE_COND_V2 and not context.executing_eagerly():
+  # Always enable control flow v2 if building a function, regardless of toggle.
+  if (util.EnableControlFlowV2(ops.get_default_graph()) and
+      not context.executing_eagerly()):
     return cond_v2.cond_v2(pred, true_fn, false_fn, name)
 
   # We needed to make true_fn/false_fn keyword arguments for
@@ -2123,7 +1985,8 @@ def cond(pred,
 
     # Check that the return values of the two branches have the same structure.
     try:
-      nest.assert_same_structure(orig_res_t, orig_res_f)
+      nest.assert_same_structure(orig_res_t, orig_res_f,
+                                 expand_composites=True)
     except TypeError as e:
       raise TypeError(
           "Incompatible return types of true_fn and false_fn: {}".format(e))
@@ -2135,24 +1998,21 @@ def cond(pred,
     if not res_t:
       raise ValueError("true_fn and false_fn must return at least one result.")
 
-    res_t_flat = nest.flatten(res_t)
-    res_f_flat = nest.flatten(res_f)
-
-    for x, y in zip(res_t_flat, res_f_flat):
-      assert ((isinstance(x, ops.IndexedSlices) and
-               isinstance(y, ops.IndexedSlices)) or
-              (isinstance(x, sparse_tensor.SparseTensor) and
-               isinstance(y, sparse_tensor.SparseTensor)) or
-              (isinstance(x, ops.Tensor) and isinstance(y, ops.Tensor)))
-      val_x = x if isinstance(x, ops.Tensor) else x.values
-      val_y = y if isinstance(y, ops.Tensor) else y.values
-      if val_x.dtype.base_dtype != val_y.dtype.base_dtype:
-        raise ValueError(
-            "Outputs of true_fn and false_fn must have the same type: %s, %s" %
-            (val_x.dtype.name, val_y.dtype.name))
+    res_t_flat = nest.flatten(res_t, expand_composites=True)
+    res_f_flat = nest.flatten(res_f, expand_composites=True)
+
+    for i, (x, y) in enumerate(zip(res_t_flat, res_f_flat)):
+      assert isinstance(x, ops.Tensor) and isinstance(y, ops.Tensor)
+      if x.dtype.base_dtype != y.dtype.base_dtype:
+        _cast_indexed_slice_indices(res_t, res_t_flat, res_f_flat)
+        if res_t_flat[i].dtype.base_dtype != res_f_flat[i].dtype.base_dtype:
+          raise ValueError(
+              "Outputs of true_fn and false_fn must have the same type: "
+              "%s, %s" % (x.dtype.name, y.dtype.name))
 
     merges = [merge(pair)[0] for pair in zip(res_f_flat, res_t_flat)]
-    merges = _convert_flows_to_tensorarrays(nest.flatten(orig_res_t), merges)
+    merges = _convert_flows_to_tensorarrays(
+        nest.flatten(orig_res_t, expand_composites=True), merges)
 
     # Only add non-nested conds to the collection. Any nested control flow will
     # be encapsulated in the root context.
@@ -2161,7 +2021,8 @@ def cond(pred,
       ops.add_to_collection(ops.GraphKeys.COND_CONTEXT, context_t)
       ops.add_to_collection(ops.GraphKeys.COND_CONTEXT, context_f)
 
-    merges = nest.pack_sequence_as(structure=orig_res_t, flat_sequence=merges)
+    merges = nest.pack_sequence_as(structure=orig_res_t, flat_sequence=merges,
+                                   expand_composites=True)
 
     # Singleton lists and tuples are automatically unpacked if strict == False.
     if not strict:
@@ -2169,6 +2030,48 @@ def cond(pred,
     return merges
 
 
+def _cast_indexed_slice_indices(structure, flat_a, flat_b):
+  """Cast IndexedSlice.indices from int32 to int64 where necessary.
+
+  For each `IndexedSlices` in the nested structure `structure`, find its
+  indices `Tensor` in the corresponding flattened lists `flat_a` and `flat_b`
+  (where composites have been expanded); and if those indices tensors have
+  different dtypes (i.e., if one is int64 but the other is int32), then cast
+  them to both be int64.
+
+  Args:
+    structure: The nested structure that was flattened.
+    flat_a: A flattened list of `Tensors` whose structure matches
+        `structure`.  Will be modified in place to cast `IndexedSlices`
+        indices tensors to int64, where necessary.
+    flat_a: A flattened list of `Tensors` whose structure matches
+        `structure`.  Will be modified in place to cast `IndexedSlices`
+        indices tensors to int64, where necessary.
+  """
+  # Find the locations (in flat_a and flat_b) of the IndexedSlices'
+  # indices tensors.
+  indexed_slice_indices = []
+  current_index = 0
+  for item in nest.flatten(structure, expand_composites=False):
+    if isinstance(item, ops.IndexedSlices):
+      # indices is the second component of the composite tensor.
+      indexed_slice_indices.append(current_index + 1)
+    if nest.is_sequence_or_composite(item):
+      current_index += len(nest.flatten(item, expand_composites=True))
+    else:
+      current_index += 1
+  assert current_index == len(flat_a)
+
+  for index in indexed_slice_indices:
+    assert flat_a[index].dtype in (dtypes.int32, dtypes.int64)
+    assert flat_b[index].dtype in (dtypes.int32, dtypes.int64)
+    if flat_a[index].dtype != flat_b[index].dtype:
+      if flat_b[index].dtype == dtypes.int32:
+        flat_b[index] = math_ops.cast(flat_b[index], dtypes.int64)
+      else:
+        flat_a[index] = math_ops.cast(flat_a[index], dtypes.int64)
+
+
 # pylint: enable=g-doc-args
 # pylint: enable=redefined-outer-name
 
@@ -2942,21 +2845,12 @@ class WhileContext(ControlFlowContext):
       if isinstance(x, ops.Tensor):
         self._values.add(x.name)
       else:
-        self._values.add(x.values.name)
-        self._values.add(x.indices.name)
-        if isinstance(x, ops.IndexedSlices):
-          dense_shape = x.dense_shape
-        elif isinstance(x, sparse_tensor.SparseTensor):
-          dense_shape = x.dense_shape
-        else:
-          raise TypeError("Type %s not supported" % type(x))
-        if dense_shape is not None:
-          self._values.add(dense_shape.name)
+        raise TypeError("Type %s not supported" % type(x))
 
   def _BuildLoop(self, pred, body, original_loop_vars, loop_vars,
                  shape_invariants):
     """Core: Add the loop termination condition and body to the graph."""
-    flat_loop_vars = nest.flatten(original_loop_vars)
+    flat_loop_vars = nest.flatten(original_loop_vars, expand_composites=True)
 
     # Let the context know the loop variables so the loop variables
     # would be added in the outer contexts properly.
@@ -3008,7 +2902,8 @@ class WhileContext(ControlFlowContext):
         _convert_flows_to_tensorarrays(flat_loop_vars, merge_vars))
     packed_vars = nest.pack_sequence_as(
         structure=original_loop_vars,
-        flat_sequence=merge_vars_with_tensor_arrays)
+        flat_sequence=merge_vars_with_tensor_arrays,
+        expand_composites=True)
     c = ops.convert_to_tensor(pred(*packed_vars))
     self._pivot = loop_cond(c, name="LoopCond")
     switch_vars = [_SwitchRefOrTensor(x, self._pivot) for x in merge_vars]
@@ -3022,11 +2917,12 @@ class WhileContext(ControlFlowContext):
         _convert_flows_to_tensorarrays(flat_loop_vars, vars_for_body))
     packed_vars_for_body = nest.pack_sequence_as(
         structure=original_loop_vars,
-        flat_sequence=vars_for_body_with_tensor_arrays)
+        flat_sequence=vars_for_body_with_tensor_arrays,
+        expand_composites=True)
     pre_summaries = ops.get_collection(ops.GraphKeys._SUMMARY_COLLECTION)  # pylint: disable=protected-access
     body_result = body(*packed_vars_for_body)
     post_summaries = ops.get_collection(ops.GraphKeys._SUMMARY_COLLECTION)  # pylint: disable=protected-access
-    if not nest.is_sequence(body_result):
+    if not nest.is_sequence_or_composite(body_result):
       body_result = [body_result]
     if len(post_summaries) > len(pre_summaries):
       new_summaries = post_summaries[len(pre_summaries):]
@@ -3040,20 +2936,24 @@ class WhileContext(ControlFlowContext):
             return x
           return array_ops.identity(x)
 
-        body_result = nest.map_structure(map_fn, body_result)
+        body_result = nest.map_structure(map_fn, body_result,
+                                         expand_composites=True)
 
     # Compare the structure types of input and output of body.
     # For backwards compatibility, the first layer is forced to a list
     # during this comparison, because inputs are typically lists and
     # outputs of the body are typically tuples.
-    nest.assert_same_structure(list(packed_vars_for_body), list(body_result))
+    nest.assert_same_structure(list(packed_vars_for_body), list(body_result),
+                               expand_composites=True)
 
     # Store body_result to keep track of TensorArrays returned by body
     original_body_result = body_result
     # Convert TensorArrays returned by body into their flow variables
-    result = nest.map_structure(_convert_tensorarray_to_flow,
-                                nest.flatten(body_result))
-    result = ops.convert_n_to_tensor_or_indexed_slices(result)
+    result = nest.map_structure(
+        _convert_tensorarray_to_flow,
+        nest.flatten(body_result, expand_composites=True),
+        expand_composites=True)
+    result = ops.convert_n_to_tensor_or_composite(result)
 
     # Add NextIteration and the back edges to complete the loop.
     if len(merge_vars) != len(result):
@@ -3079,9 +2979,15 @@ class WhileContext(ControlFlowContext):
     # Keep original_loop_vars to identify which are TensorArrays
     original_loop_vars = loop_vars
     # Convert TensorArrays to their flow variables
-    loop_vars = nest.map_structure(_convert_tensorarray_to_flow,
-                                   nest.flatten(loop_vars))
-    loop_vars = ops.convert_n_to_tensor_or_indexed_slices(loop_vars)
+    loop_vars = nest.map_structure(
+        _convert_tensorarray_to_flow,
+        nest.flatten(loop_vars, expand_composites=False),
+        expand_composites=True)
+    loop_vars = ops.convert_n_to_tensor_or_composite(loop_vars)
+    if shape_invariants is None:
+      shape_invariants = nest.map_structure(
+          _get_shape_invariant, loop_vars, expand_composites=False)
+    loop_vars = nest.flatten(loop_vars, expand_composites=True)
     try:
       self.Enter()
       # _BuildLoop calls _update_input in several places. _mutation_lock()
@@ -3093,14 +2999,15 @@ class WhileContext(ControlFlowContext):
     finally:
       self.Exit()
 
-    flat_result = nest.flatten(original_body_result)
+    flat_result = nest.flatten(original_body_result, expand_composites=True)
     # Convert TensorArray flow variables outside the context back into
     # their associated TensorArrays for returning to caller.
     exit_vars_with_tensor_arrays = (
         _convert_flows_to_tensorarrays(flat_result, exit_vars))
     packed_exit_vars = nest.pack_sequence_as(
         structure=original_body_result,
-        flat_sequence=exit_vars_with_tensor_arrays)
+        flat_sequence=exit_vars_with_tensor_arrays,
+        expand_composites=True)
 
     if return_same_structure:
       return packed_exit_vars
@@ -3114,12 +3021,7 @@ class WhileContext(ControlFlowContext):
       if isinstance(e, ops.Tensor):
         xs = [e]
       else:
-        if not isinstance(e, (ops.IndexedSlices, sparse_tensor.SparseTensor)):
-          raise TypeError("Type %s not supported" % type(e))
-        xs = [e.values, e.indices]
-        shape = e.dense_shape
-        if shape is not None:
-          xs.append(shape)
+        raise TypeError("Type %s not supported" % type(e))
       for x in xs:
         inp_op = x.op.inputs[0].op
         control_inputs = graph._control_dependencies_for_inputs([inp_op])
@@ -3487,12 +3389,15 @@ def while_loop(cond,
   ```
 
   """
-  if ENABLE_WHILE_V2 and not context.executing_eagerly():
+  # Always enable control flow v2 if building a function, regardless of toggle.
+  if (util.EnableControlFlowV2(ops.get_default_graph()) and
+      not context.executing_eagerly()):
     return while_v2.while_loop(
         cond,
         body,
         loop_vars,
         shape_invariants=shape_invariants,
+        parallel_iterations=parallel_iterations,
         maximum_iterations=maximum_iterations,
         name=name,
         return_same_structure=return_same_structure)
@@ -3538,6 +3443,12 @@ def while_loop(cond,
         if try_to_pack and not isinstance(loop_vars, (list, _basetuple)):
           packed = True
           loop_vars = (loop_vars,)
+
+      def convert(x):
+        if isinstance(x, tensor_array_ops.TensorArray):
+          return x
+        return ops.convert_to_tensor(x)
+      loop_vars = nest.map_structure(convert, loop_vars)
       if maximum_iterations is not None:
         return loop_vars[1]
       else:
@@ -3546,7 +3457,12 @@ def while_loop(cond,
     if shape_invariants is not None:
       if maximum_iterations is not None:
         shape_invariants = (tensor_shape.TensorShape([]), shape_invariants)
-      nest.assert_same_structure(loop_vars, shape_invariants)
+
+      nest.assert_same_structure(loop_vars, shape_invariants,
+                                 expand_composites=False)
+      shape_invariants = nest.map_structure(
+          _get_shape_invariant, loop_vars, shape_invariants,
+          expand_composites=False)
 
     loop_context = WhileContext(
         maximum_iterations=maximum_iterations,
@@ -3588,7 +3504,7 @@ def _AsTensorList(x, p):
   for v in x:
     if isinstance(v, ops.Operation):
       v = with_dependencies([v], p)
-    v = ops.convert_to_tensor_or_indexed_slices(v)
+    v = ops.convert_to_tensor_or_composite(v)
     if isinstance(v, ops.Tensor):
       l.append(array_ops.identity(v))
     else:
@@ -3636,7 +3552,7 @@ def with_dependencies(dependencies, output_tensor, name=None):
                       list(dependencies) + [output_tensor]) as name:
     with ops.colocate_with(output_tensor):
       with ops.control_dependencies(dependencies):
-        output_tensor = ops.convert_to_tensor_or_indexed_slices(output_tensor)
+        output_tensor = ops.convert_to_tensor_or_composite(output_tensor)
         if isinstance(output_tensor, ops.Tensor):
           return _Identity(output_tensor, name=name)
         else:
@@ -3687,7 +3603,7 @@ def group(*inputs, **kwargs):
 
     # Sorts *inputs according to their devices.
     ops_on_device = {}  # device -> operations specified on the device.
-    for inp in nest.flatten(inputs):
+    for inp in nest.flatten(inputs, expand_composites=True):
       if not hasattr(inp, "device"):
         raise TypeError("Expected tf.group() expected Tensor arguments not "
                         "'%s' with type '%s'" % (inp, type(inp)))
@@ -3709,7 +3625,7 @@ def group(*inputs, **kwargs):
       """A sort key that allows None to be compared to strings."""
       return "" if dev is None else dev
 
-    for dev in sorted(six.iterkeys(ops_on_device), key=device_key):
+    for dev in sorted(ops_on_device, key=device_key):
       deps.append(_GroupControlDeps(dev, ops_on_device[dev]))
 
     with ops.control_dependencies(deps):
diff --git a/tensorflow/python/ops/control_flow_ops_benchmark.py b/tensorflow/python/ops/control_flow_ops_benchmark.py
index 9ba5ff2c0f8af44e8536b49a3c0e7ef6bfae4d28..9dd1e6673b854c3cbc248f0e5a5be4c67d2bd72c 100644
--- a/tensorflow/python/ops/control_flow_ops_benchmark.py
+++ b/tensorflow/python/ops/control_flow_ops_benchmark.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
@@ -94,28 +95,28 @@ class CondWithManyIntermediatesBenchmark(test.Benchmark):
               iters=self.NUM_ITERS)
 
   def benchmark_cond_v1_defun(self):
-    old_val = control_flow_ops.ENABLE_COND_V2
-    control_flow_ops.ENABLE_COND_V2 = False
+    old_val = control_flow_util.ENABLE_CONTROL_FLOW_V2
+    control_flow_util.ENABLE_CONTROL_FLOW_V2 = False
     self._benchmark_defun()
-    control_flow_ops.ENABLE_COND_V2 = old_val
+    control_flow_util.ENABLE_CONTROL_FLOW_V2 = old_val
 
   def benchmark_cond_v2_defun(self):
-    old_val = control_flow_ops.ENABLE_COND_V2
-    control_flow_ops.ENABLE_COND_V2 = True
+    old_val = control_flow_util.ENABLE_CONTROL_FLOW_V2
+    control_flow_util.ENABLE_CONTROL_FLOW_V2 = True
     self._benchmark_defun()
-    control_flow_ops.ENABLE_COND_V2 = old_val
+    control_flow_util.ENABLE_CONTROL_FLOW_V2 = old_val
 
   def benchmark_cond_v1_graph(self):
-    old_val = control_flow_ops.ENABLE_COND_V2
-    control_flow_ops.ENABLE_COND_V2 = False
+    old_val = control_flow_util.ENABLE_CONTROL_FLOW_V2
+    control_flow_util.ENABLE_CONTROL_FLOW_V2 = False
     self._benchmark_graph()
-    control_flow_ops.ENABLE_COND_V2 = old_val
+    control_flow_util.ENABLE_CONTROL_FLOW_V2 = old_val
 
   def benchmark_cond_v2_graph(self):
-    old_val = control_flow_ops.ENABLE_COND_V2
-    control_flow_ops.ENABLE_COND_V2 = True
+    old_val = control_flow_util.ENABLE_CONTROL_FLOW_V2
+    control_flow_util.ENABLE_CONTROL_FLOW_V2 = True
     self._benchmark_graph()
-    control_flow_ops.ENABLE_COND_V2 = old_val
+    control_flow_util.ENABLE_CONTROL_FLOW_V2 = old_val
 
 if __name__ == "__main__":
   ops.enable_eager_execution()
diff --git a/tensorflow/python/ops/control_flow_ops_test.py b/tensorflow/python/ops/control_flow_ops_test.py
index 0c18b7208f5c4049722012504a26563f55aeca3c..f1dd4f529fc37c054a051d69f6aa1bec23c0805e 100644
--- a/tensorflow/python/ops/control_flow_ops_test.py
+++ b/tensorflow/python/ops/control_flow_ops_test.py
@@ -565,7 +565,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
                                         strict=strict)
 
     with self.cached_session() as sess:
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       true_feed_dict = {condition: True}
       true_feed_dict.update(feed_dict)
       result_cond, result_case = sess.run([output_cond, output_case],
diff --git a/tensorflow/python/ops/control_flow_util.py b/tensorflow/python/ops/control_flow_util.py
index cb628f4aa6441ec9cb03dfe873a79d06a66e37a1..e6fdbe34ec5e0504db749e273810ce1e8820d9da 100644
--- a/tensorflow/python/ops/control_flow_util.py
+++ b/tensorflow/python/ops/control_flow_util.py
@@ -23,10 +23,24 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
 import traceback
 
 from tensorflow.python.platform import tf_logging as logging
 
+ENABLE_CONTROL_FLOW_V2 = (os.getenv("TF_ENABLE_CONTROL_FLOW_V2", "0") != "0" or
+                          os.getenv("TF_ENABLE_COND_V2", "0") != "0" or
+                          os.getenv("TF_ENABLE_WHILE_V2", "0") != "0" or
+                          os.getenv("TF_ENABLE_TENSOR_ARRAY_V2", "0") != "0")
+
+
+def EnableControlFlowV2(graph):
+  """Returns whether control flow v2 should be used in `graph`."""
+  # Enable new control flow in FuncGraphs (but not legacy _FuncGraphs).
+  # TODO(skyewm): do something better than hasattr without messing up imports.
+  return ENABLE_CONTROL_FLOW_V2 or (
+      graph.building_function and not hasattr(graph, "_captured"))
+
 
 def IsInXLAContext(op):
   try:
@@ -43,6 +57,15 @@ def InXlaContext(graph):
   return GetContainingXLAContext(ctxt) is not None
 
 
+def GraphOrParentsInXlaContext(graph):
+  while True:
+    if InXlaContext(graph): return True
+    try:
+      graph = graph.outer_graph
+    except AttributeError:
+      return False
+
+
 def IsInWhileLoop(op):
   ctxt = op._get_control_flow_context()  # pylint: disable=protected-access
   return GetContainingWhileContext(ctxt) is not None
@@ -308,7 +331,7 @@ def CheckInputFromValidContext(op, input_op):
     if while_ctxt:
       error_msg = (
           "Cannot use '%s' as input to '%s' because they are in different while"
-          " loops." % (op.name, input_op.name))
+          " loops." % (input_op.name, op.name))
     else:
       error_msg = (
           "Cannot use '%s' as input to '%s' because '%s' is in a while loop."
diff --git a/tensorflow/python/ops/control_flow_util_v2.py b/tensorflow/python/ops/control_flow_util_v2.py
index 5f56850884a5e9e424c77515406ef8c9b513e972..58917ad264a56578bb4c98ff9a3ef0b63a3cbf12 100644
--- a/tensorflow/python/ops/control_flow_util_v2.py
+++ b/tensorflow/python/ops/control_flow_util_v2.py
@@ -114,7 +114,7 @@ def maybe_set_lowering_attr(op):
   Args:
     op: An `If` or `While` Operation.
   """
-  if (not control_flow_util.IsInXLAContext(op) and
+  if (not control_flow_util.GraphOrParentsInXlaContext(op.graph) and
       context.context().get_function_call_options().executor_type
       != "SINGLE_THREADED_EXECUTOR"):
     # pylint: disable=protected-access
diff --git a/tensorflow/contrib/framework/python/ops/critical_section_ops.py b/tensorflow/python/ops/critical_section_ops.py
similarity index 92%
rename from tensorflow/contrib/framework/python/ops/critical_section_ops.py
rename to tensorflow/python/ops/critical_section_ops.py
index 71ab755aa2948c548db89b330bb93c9524412fa6..21872ffff139b3f5b74d044746a83f3ce5ab265b 100644
--- a/tensorflow/contrib/framework/python/ops/critical_section_ops.py
+++ b/tensorflow/python/ops/critical_section_ops.py
@@ -31,6 +31,10 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_resource_variable_ops
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
+
+
+__all__ = ["CriticalSection"]
 
 
 # Graph Keys
@@ -66,6 +70,7 @@ def _get_colocation(op):
     return None
 
 
+@tf_export("CriticalSection")
 class CriticalSection(object):
   """Critical section.
 
@@ -179,37 +184,36 @@ class CriticalSection(object):
   def name(self):
     return self._handle.op.name
 
-  def execute(self, fn, *args, **kwargs):
-    """Execute function `fn(*args, **kwargs)` inside the CriticalSection.
+  def execute(self, fn, exclusive_resource_access=True, name=None):
+    """Execute function `fn()` inside the critical section.
+
+    `fn` should not accept any arguments.  To add extra arguments to when
+    calling `fn` in the critical section, create a lambda:
+
+    ```python
+    critical_section.execute(lambda: fn(*my_args, **my_kwargs))
+    ```
 
     Args:
       fn: The function to execute.  Must return at least one tensor.
-      *args: Additional positional arguments to `fn`.
-      **kwargs: Additional keyword arguments to `fn`.
-        Several keywords are reserved for `execute`.  These are:
-
-        - name; The name to use when creating the execute operation.
-        - exclusive_resource_access; Whether the resources required by
-          `fn` should be exclusive to this `CriticalSection`.  Default: `True`.
-          You may want to set this to `False` if you will be accessing a
-          resource in read-only mode in two different CriticalSections.
+      exclusive_resource_access: Whether the resources required by
+        `fn` should be exclusive to this `CriticalSection`.  Default: `True`.
+        You may want to set this to `False` if you will be accessing a
+        resource in read-only mode in two different CriticalSections.
+      name: The name to use when creating the execute operation.
 
     Returns:
-      The tensors returned from `fn(*args, **kwargs)`.
+      The tensors returned from `fn()`.
 
     Raises:
       ValueError: If `fn` attempts to lock this `CriticalSection` in any nested
         or lazy way that may cause a deadlock.
-      ValueError: If `exclusive_resource_access` is not provided (is `True`) and
+      ValueError: If `exclusive_resource_access == True` and
         another `CriticalSection` has an execution requesting the same
-        resources as in `*args`, `**kwargs`, and any additionally captured
-        inputs in `fn`.  Note, even if `exclusive_resource_access` is `True`,
-        if another execution in another `CriticalSection` was created without
-        `exclusive_resource_access=True`, a `ValueError` will be raised.
+        resources as `fn``.  Note, even if `exclusive_resource_access` is
+        `True`, if another execution in another `CriticalSection` was created
+        without `exclusive_resource_access=True`, a `ValueError` will be raised.
     """
-    name = kwargs.pop("name", None)
-    exclusive_resource_access = kwargs.pop("exclusive_resource_access", True)
-
     with ops.name_scope(name, "critical_section_execute", []):
 
       # Ensure that mutex locking only happens *after* all args and
@@ -222,7 +226,7 @@ class CriticalSection(object):
         with ops.get_default_graph()._lock:  # pylint: disable=protected-access
           existing_ops = ops.get_default_graph().get_operations()
           with ops.control_dependencies([lock]):
-            r = fn(*args, **kwargs)
+            r = fn()
           # TODO(ebrevdo): If creating critical sections in a python loop, this
           # makes graph creation time quadratic.  Revisit if this
           # becomes a problem.
@@ -230,7 +234,7 @@ class CriticalSection(object):
                          .difference(existing_ops))
       else:
         with ops.control_dependencies([lock]):
-          r = fn(*args, **kwargs)
+          r = fn()
 
       if not context.executing_eagerly():
         self._add_control_dependencies_to_lock(created_ops, lock.op)
diff --git a/tensorflow/python/ops/ctc_ops.py b/tensorflow/python/ops/ctc_ops.py
index 3a7eb9355a66a213d3d60f103b818ef22fd839bd..80502daaac3b0daba1e207c7ccd76f6ec6eb2f72 100644
--- a/tensorflow/python/ops/ctc_ops.py
+++ b/tensorflow/python/ops/ctc_ops.py
@@ -24,26 +24,30 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
 
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gen_ctc_ops
 from tensorflow.python.ops import inplace_ops
 from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import map_fn
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops.nn_grad import _BroadcastMul
+from tensorflow.python.util import deprecation
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
 
 # pylint: disable=protected-access, invalid-name
 @tf_export(v1=["nn.ctc_loss"])
-def ctc_loss(labels, inputs, sequence_length,
+def ctc_loss(labels, inputs=None, sequence_length=None,
              preprocess_collapse_repeated=False,
              ctc_merge_repeated=True,
-             ignore_longer_outputs_than_inputs=False, time_major=True):
+             ignore_longer_outputs_than_inputs=False, time_major=True,
+             logits=None):
   """Computes the CTC (Connectionist Temporal Classification) Loss.
 
   This op implements the CTC loss as presented in the article:
@@ -141,6 +145,7 @@ def ctc_loss(labels, inputs, sequence_length,
       avoids transposes at the beginning of the ctc_loss calculation.  However,
       most TensorFlow data is batch-major, so by this function also accepts
       inputs in batch-major form.
+    logits: Alias for inputs.
 
   Returns:
     A 1-D `float` `Tensor`, size `[batch]`, containing the negative log
@@ -155,6 +160,8 @@ def ctc_loss(labels, inputs, sequence_length,
     raise TypeError("Expected labels (first argument) to be a SparseTensor")
 
   # For internal calculations, we transpose to [time, batch, num_classes]
+  inputs = deprecation.deprecated_argument_lookup(
+      "logits", logits, "inputs", inputs)
   if not time_major:
     inputs = array_ops.transpose(inputs, [1, 0, 2])  # (B,T,N) => (T,B,N)
 
@@ -903,7 +910,7 @@ def ctc_unique_labels(labels, name=None):
           u.y, [[0, _get_dim(u.idx, 0) - _get_dim(u.y, 0)]])
       y = math_ops.cast(y, dtypes.int64)
       return [y, u.idx]
-    return functional_ops.map_fn(
+    return map_fn.map_fn(
         _unique, labels, dtype=[dtypes.int64, dtypes.int32])
 
 
@@ -1029,7 +1036,7 @@ def _scan(fn, elems, initial, reverse=False, inclusive=False, final_only=False):
   for the forward backward use case.
 
   Examples:
-    scan(lambda a, e: a + e, [1.0, 2.0, 3.0], 1.0) => [2.0, 3.0, 4.0]
+    scan(lambda a, e: a + e, [1.0, 2.0, 3.0], 1.0) => [2.0, 4.0, 7.0]
 
     Multiple accumulators:
       scan(lambda a, e: (a[0] + e, a[1] * e), [1.0, 2.0, 3.0], (0.0, 1.0))
@@ -1127,4 +1134,5 @@ def _scan(fn, elems, initial, reverse=False, inclusive=False, final_only=False):
 
 def _get_dim(tensor, i):
   """Get value of tensor shape[i] preferring static value if available."""
-  return tensor.shape[i].value or array_ops.shape(tensor)[i]
+  return tensor_shape.dimension_value(
+      tensor.shape[i]) or array_ops.shape(tensor)[i]
diff --git a/tensorflow/python/ops/cudnn_rnn_grad.py b/tensorflow/python/ops/cudnn_rnn_grad.py
index c618c470f201af14d26960efb6a68ace0ac29b88..d4c182a802ad52dc431dde5b184ebb79cb733dc5 100644
--- a/tensorflow/python/ops/cudnn_rnn_grad.py
+++ b/tensorflow/python/ops/cudnn_rnn_grad.py
@@ -71,3 +71,32 @@ def _cudnn_rnn_backward_v2(op, *grad):
       rnn_mode=op.get_attr("rnn_mode"),
       input_mode=op.get_attr("input_mode"),
       direction=op.get_attr("direction"))
+
+
+@ops.RegisterGradient("CudnnRNNV3")
+def _cudnn_rnn_backwardv3(op, *grads):
+  """Gradients for the CudnnRNNV3 op."""
+  if not op.get_attr("is_training"):
+    raise ValueError(
+        "To use CudnnRNNV3 in gradients, is_training must be set to"
+        " True.")
+  return gen_cudnn_rnn_ops.cudnn_rnn_backprop_v3(
+      input=op.inputs[0],
+      input_h=op.inputs[1],
+      input_c=op.inputs[2],
+      params=op.inputs[3],
+      sequence_lengths=op.inputs[4],
+      output=op.outputs[0],
+      output_h=op.outputs[1],
+      output_c=op.outputs[2],
+      output_backprop=grads[0],
+      output_h_backprop=grads[1],
+      output_c_backprop=grads[2],
+      reserve_space=op.outputs[3],
+      host_reserved=op.outputs[4],
+      dropout=op.get_attr("dropout"),
+      seed=op.get_attr("seed"),
+      seed2=op.get_attr("seed2"),
+      rnn_mode=op.get_attr("rnn_mode"),
+      input_mode=op.get_attr("input_mode"),
+      direction=op.get_attr("direction")) + (None,)
diff --git a/tensorflow/python/ops/custom_gradient.py b/tensorflow/python/ops/custom_gradient.py
index d96601ac21c7d7d62423b65a2e43d08449e23129..a2b7767dcd72e1b71251cd2d8d912bf942e2e4a5 100644
--- a/tensorflow/python/ops/custom_gradient.py
+++ b/tensorflow/python/ops/custom_gradient.py
@@ -56,7 +56,9 @@ def copy_handle_data(source_t, target_t):
       handle_data = source_t._handle_data  # pylint: disable=protected-access
     else:
       handle_data = resource_variable_ops.get_resource_handle_data(source_t)
-    if handle_data is not None and handle_data.is_set:
+    if (handle_data is not None
+        and handle_data.is_set
+        and handle_data.shape_and_type):
       # pylint: disable=protected-access
       pywrap_tensorflow.SetHandleShapeAndType(target_t.graph._c_graph,
                                               target_t._as_tf_output(),
@@ -183,7 +185,7 @@ def _graph_mode_decorator(f, *args, **kwargs):
                    current_var_scope.local_variables())
   new_vars = after_vars - before_vars
   for v in new_vars:
-    if not isinstance(v, resource_variable_ops.ResourceVariable):
+    if not resource_variable_ops.is_resource_variable(v):
       raise TypeError(
           "All variables used by a function wrapped with @custom_gradient must "
           "be `ResourceVariable`s. Ensure that no `variable_scope` is created "
diff --git a/tensorflow/python/ops/data_flow_ops.py b/tensorflow/python/ops/data_flow_ops.py
index 2030332e4eaec8574010217d26ef6ac52dd988d5..1557bdf0eda90c26a97ce83239190dd6f9023a58 100644
--- a/tensorflow/python/ops/data_flow_ops.py
+++ b/tensorflow/python/ops/data_flow_ops.py
@@ -113,8 +113,9 @@ def _shape_common(s1, s2):
 
 
 # pylint: disable=protected-access
-@tf_export("io.QueueBase", v1=["io.QueueBase", "QueueBase"])
-@deprecation.deprecated_endpoints("QueueBase")
+@tf_export("queue.QueueBase",
+           v1=["queue.QueueBase", "io.QueueBase", "QueueBase"])
+@deprecation.deprecated_endpoints(["io.QueueBase", "QueueBase"])
 class QueueBase(object):
   """Base class for queue implementations.
 
@@ -616,8 +617,11 @@ def _shared_name(shared_name):
 
 
 @tf_export(
-    "io.RandomShuffleQueue", v1=["io.RandomShuffleQueue", "RandomShuffleQueue"])
-@deprecation.deprecated_endpoints("RandomShuffleQueue")
+    "queue.RandomShuffleQueue",
+    v1=["queue.RandomShuffleQueue",
+        "io.RandomShuffleQueue", "RandomShuffleQueue"])
+@deprecation.deprecated_endpoints(
+    ["io.RandomShuffleQueue", "RandomShuffleQueue"])
 class RandomShuffleQueue(QueueBase):
   """A queue implementation that dequeues elements in a random order.
 
@@ -702,7 +706,8 @@ class RandomShuffleQueue(QueueBase):
     super(RandomShuffleQueue, self).__init__(dtypes, shapes, names, queue_ref)
 
 
-@tf_export("FIFOQueue")
+@tf_export("queue.FIFOQueue", v1=["queue.FIFOQueue", "FIFOQueue"])
+@deprecation.deprecated_endpoints("FIFOQueue")
 class FIFOQueue(QueueBase):
   """A queue implementation that dequeues elements in first-in first-out order.
 
@@ -760,8 +765,9 @@ class FIFOQueue(QueueBase):
 
 
 @tf_export(
-    "io.PaddingFIFOQueue", v1=["io.PaddingFIFOQueue", "PaddingFIFOQueue"])
-@deprecation.deprecated_endpoints("PaddingFIFOQueue")
+    "queue.PaddingFIFOQueue",
+    v1=["queue.PaddingFIFOQueue", "io.PaddingFIFOQueue", "PaddingFIFOQueue"])
+@deprecation.deprecated_endpoints(["io.PaddingFIFOQueue", "PaddingFIFOQueue"])
 class PaddingFIFOQueue(QueueBase):
   """A FIFOQueue that supports batching variable-sized tensors by padding.
 
@@ -835,8 +841,9 @@ class PaddingFIFOQueue(QueueBase):
     super(PaddingFIFOQueue, self).__init__(dtypes, shapes, names, queue_ref)
 
 
-@tf_export("io.PriorityQueue", v1=["io.PriorityQueue", "PriorityQueue"])
-@deprecation.deprecated_endpoints("PriorityQueue")
+@tf_export("queue.PriorityQueue",
+           v1=["queue.PriorityQueue", "io.PriorityQueue", "PriorityQueue"])
+@deprecation.deprecated_endpoints(["io.PriorityQueue", "PriorityQueue"])
 class PriorityQueue(QueueBase):
   """A queue implementation that dequeues elements in prioritized order.
 
diff --git a/tensorflow/python/ops/distributions/bijector_impl.py b/tensorflow/python/ops/distributions/bijector_impl.py
index 9c63385dd0152aae48b1f92fd8d350fc910fe564..a347cfdec1585f87ba0bf5e2e6fa604367604c7b 100644
--- a/tensorflow/python/ops/distributions/bijector_impl.py
+++ b/tensorflow/python/ops/distributions/bijector_impl.py
@@ -462,7 +462,7 @@ class Bijector(object):
 
 
   ```python
-  abs = tf.contrib.distributions.bijectors.AbsoluteValue()
+  abs = tfp.distributions.bijectors.AbsoluteValue()
 
   abs.forward(-1.)
   ==> 1.
diff --git a/tensorflow/python/ops/distributions/multinomial.py b/tensorflow/python/ops/distributions/multinomial.py
index 97d2b1b26c68dc53f0a77120c9d3820c1d0f017b..1b2dbfaf9fe4de3ca8f439737f36b71e26e8e368 100644
--- a/tensorflow/python/ops/distributions/multinomial.py
+++ b/tensorflow/python/ops/distributions/multinomial.py
@@ -23,7 +23,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import map_fn
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
@@ -266,7 +266,7 @@ class Multinomial(distribution.Distribution):
       x = math_ops.reduce_sum(array_ops.one_hot(x, depth=k), axis=-2)  # [n, k]
       return x
 
-    x = functional_ops.map_fn(
+    x = map_fn.map_fn(
         _sample_single, [flat_logits, flat_ndraws],
         dtype=self.dtype)  # [B1B2...Bm, n, k]
 
diff --git a/tensorflow/python/ops/distributions/normal.py b/tensorflow/python/ops/distributions/normal.py
index 9acc0469885c2463e84f875314f07d1f3d55481a..0b36054db2f15538037c2f5f64a2b762c58e5105 100644
--- a/tensorflow/python/ops/distributions/normal.py
+++ b/tensorflow/python/ops/distributions/normal.py
@@ -291,5 +291,5 @@ def _kl_normal_normal(n_a, n_b, name=None):
     s_a_squared = math_ops.square(n_a.scale)
     s_b_squared = math_ops.square(n_b.scale)
     ratio = s_a_squared / s_b_squared
-    return (math_ops.square(n_a.loc - n_b.loc) / (two * s_b_squared) +
-            half * (ratio - one - math_ops.log(ratio)))
+    return (math_ops.squared_difference(n_a.loc, n_b.loc) / (two * s_b_squared)
+            + half * (ratio - one - math_ops.log(ratio)))
diff --git a/tensorflow/python/ops/distributions/transformed_distribution.py b/tensorflow/python/ops/distributions/transformed_distribution.py
index 1becfc18778e998d1a84594273e1637e580f2aad..3c6476864a0bb05feec828d69de8fb8bc138a74b 100644
--- a/tensorflow/python/ops/distributions/transformed_distribution.py
+++ b/tensorflow/python/ops/distributions/transformed_distribution.py
@@ -167,7 +167,7 @@ class TransformedDistribution(distribution_lib.Distribution):
   distribution:
 
   ```python
-  ds = tf.contrib.distributions
+  ds = tfp.distributions
   log_normal = ds.TransformedDistribution(
     distribution=ds.Normal(loc=0., scale=1.),
     bijector=ds.bijectors.Exp(),
@@ -177,7 +177,7 @@ class TransformedDistribution(distribution_lib.Distribution):
   A `LogNormal` made from callables:
 
   ```python
-  ds = tf.contrib.distributions
+  ds = tfp.distributions
   log_normal = ds.TransformedDistribution(
     distribution=ds.Normal(loc=0., scale=1.),
     bijector=ds.bijectors.Inline(
@@ -191,7 +191,7 @@ class TransformedDistribution(distribution_lib.Distribution):
   Another example constructing a Normal from a StandardNormal:
 
   ```python
-  ds = tf.contrib.distributions
+  ds = tfp.distributions
   normal = ds.TransformedDistribution(
     distribution=ds.Normal(loc=0., scale=1.),
     bijector=ds.bijectors.Affine(
@@ -209,7 +209,7 @@ class TransformedDistribution(distribution_lib.Distribution):
   multivariate Normal as a `TransformedDistribution`.
 
   ```python
-  ds = tf.contrib.distributions
+  ds = tfp.distributions
   # We will create two MVNs with batch_shape = event_shape = 2.
   mean = [[-1., 0],      # batch:0
           [0., 1]]       # batch:1
diff --git a/tensorflow/python/ops/embedding_ops.py b/tensorflow/python/ops/embedding_ops.py
index d0291e2095bdb6574c707c7458e4cc335fc4b825..881466cb23a65ec7d1946183574eb3b76f1a08fc 100644
--- a/tensorflow/python/ops/embedding_ops.py
+++ b/tensorflow/python/ops/embedding_ops.py
@@ -558,7 +558,7 @@ def embedding_lookup_sparse_v2(params,
                                combiner=None,
                                max_norm=None,
                                name=None):
-  return embedding_lookup_sparse_v2(
+  return embedding_lookup_sparse(
       params, sp_ids, sp_weights, partition_strategy, name, combiner, max_norm)
 
 
diff --git a/tensorflow/python/ops/functional_ops.py b/tensorflow/python/ops/functional_ops.py
index 57542e3c7baa0f4eb3dc53431c9a3060f0998c5b..448e45cae37be372bc7742d047a24d6017735c3e 100644
--- a/tensorflow/python/ops/functional_ops.py
+++ b/tensorflow/python/ops/functional_ops.py
@@ -19,14 +19,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -38,8 +36,8 @@ from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.ops.gen_functional_ops import remote_call
 # pylint: enable=unused-import
 from tensorflow.python.ops.gen_functional_ops import symbolic_gradient
-from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
+from tensorflow.python.util import function_utils
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
@@ -143,7 +141,8 @@ def foldl(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
         lambda i, a: i < n, compute, [i, a],
         parallel_iterations=parallel_iterations,
         back_prop=back_prop,
-        swap_memory=swap_memory)
+        swap_memory=swap_memory,
+        maximum_iterations=n)
 
     # TODO(akshayka): Remove the in_graph_mode check once caching devices are
     # supported in Eager
@@ -253,263 +252,15 @@ def foldr(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
         compute, [i, a],
         parallel_iterations=parallel_iterations,
         back_prop=back_prop,
-        swap_memory=swap_memory)
-
-    # TODO(akshayka): Remove the in_graph_mode check once caching devices are
-    # supported in Eager
-    if in_graph_mode and varscope_caching_device_was_none:
-      varscope.set_caching_device(None)
-
-    return r_a
-
-
-@tf_export("map_fn")
-def map_fn(fn, elems, dtype=None, parallel_iterations=None, back_prop=True,
-           swap_memory=False, infer_shape=True, name=None):
-  """map on the list of tensors unpacked from `elems` on dimension 0.
-
-  The simplest version of `map_fn` repeatedly applies the callable `fn` to a
-  sequence of elements from first to last. The elements are made of the
-  tensors unpacked from `elems`. `dtype` is the data type of the return
-  value of `fn`. Users must provide `dtype` if it is different from
-  the data type of `elems`.
-
-  Suppose that `elems` is unpacked into `values`, a list of tensors. The shape
-  of the result tensor is `[values.shape[0]] + fn(values[0]).shape`.
-
-  This method also allows multi-arity `elems` and output of `fn`.  If `elems`
-  is a (possibly nested) list or tuple of tensors, then each of these tensors
-  must have a matching first (unpack) dimension.  The signature of `fn` may
-  match the structure of `elems`.  That is, if `elems` is
-  `(t1, [t2, t3, [t4, t5]])`, then an appropriate signature for `fn` is:
-  `fn = lambda (t1, [t2, t3, [t4, t5]]):`.
-
-  Furthermore, `fn` may emit a different structure than its input.  For example,
-  `fn` may look like: `fn = lambda t1: return (t1 + 1, t1 - 1)`.  In this case,
-  the `dtype` parameter is not optional: `dtype` must be a type or (possibly
-  nested) tuple of types matching the output of `fn`.
-
-  To apply a functional operation to the nonzero elements of a SparseTensor
-  one of the following methods is recommended. First, if the function is
-  expressible as TensorFlow ops, use
-
-  ```python
-    result = SparseTensor(input.indices, fn(input.values), input.dense_shape)
-  ```
-
-  If, however, the function is not expressible as a TensorFlow op, then use
-
-  ```python
-  result = SparseTensor(
-    input.indices, map_fn(fn, input.values), input.dense_shape)
-  ```
-
-  instead.
-
-  When executing eagerly, map_fn does not execute in parallel even if
-  `parallel_iterations` is set to a value > 1. You can still get the
-  performance benefits of running a function in parallel by using the
-  `tf.contrib.eager.defun` decorator,
-
-  ```python
-  # Assume the function being used in map_fn is fn.
-  # To ensure map_fn calls fn in parallel, use the defun decorator.
-  @tf.contrib.eager.defun
-  def func(tensor):
-    return tf.map_fn(fn, tensor)
-  ```
-
-  Note that if you use the defun decorator, any non-TensorFlow Python code
-  that you may have written in your function won't get executed. See
-  `tf.contrib.eager.defun` for more details. The recommendation would be to
-  debug without defun but switch to defun to get performance benefits of
-  running map_fn in parallel.
-
-  Args:
-    fn: The callable to be performed.  It accepts one argument, which will
-      have the same (possibly nested) structure as `elems`.  Its output
-      must have the same structure as `dtype` if one is provided, otherwise
-      it must have the same structure as `elems`.
-    elems: A tensor or (possibly nested) sequence of tensors, each of which
-      will be unpacked along their first dimension.  The nested sequence
-      of the resulting slices will be applied to `fn`.
-    dtype: (optional) The output type(s) of `fn`.  If `fn` returns a structure
-      of Tensors differing from the structure of `elems`, then `dtype` is not
-      optional and must have the same structure as the output of `fn`.
-    parallel_iterations: (optional) The number of iterations allowed to run
-      in parallel. When graph building, the default value is 10. While executing
-      eagerly, the default value is set to 1.
-    back_prop: (optional) True enables support for back propagation.
-    swap_memory: (optional) True enables GPU-CPU memory swapping.
-    infer_shape: (optional) False disables tests for consistent output shapes.
-    name: (optional) Name prefix for the returned tensors.
-
-  Returns:
-    A tensor or (possibly nested) sequence of tensors.  Each tensor packs the
-    results of applying `fn` to tensors unpacked from `elems` along the first
-    dimension, from first to last.
-
-  Raises:
-    TypeError: if `fn` is not callable or the structure of the output of
-      `fn` and `dtype` do not match, or if elems is a SparseTensor.
-    ValueError: if the lengths of the output of `fn` and `dtype` do not match.
-
-  Examples:
-    ```python
-    elems = np.array([1, 2, 3, 4, 5, 6])
-    squares = map_fn(lambda x: x * x, elems)
-    # squares == [1, 4, 9, 16, 25, 36]
-    ```
-
-    ```python
-    elems = (np.array([1, 2, 3]), np.array([-1, 1, -1]))
-    alternate = map_fn(lambda x: x[0] * x[1], elems, dtype=tf.int64)
-    # alternate == [-1, 2, -3]
-    ```
-
-    ```python
-    elems = np.array([1, 2, 3])
-    alternates = map_fn(lambda x: (x, -x), elems, dtype=(tf.int64, tf.int64))
-    # alternates[0] == [1, 2, 3]
-    # alternates[1] == [-1, -2, -3]
-    ```
-  """
-  if not callable(fn):
-    raise TypeError("fn must be callable.")
-
-  if isinstance(elems, sparse_tensor.SparseTensor):
-    raise TypeError(
-        "To perform a map on the values of a sparse tensor use either "
-        " SparseTensor(input.indices, fn(input.values), input.dense_shape) or "
-        " SparseTensor(input.indices, map_fn(fn, input.values), "
-        "input.dense_shape)")
-
-  in_graph_mode = not context.executing_eagerly()
-  # Set the default number of parallel_iterations depending on graph/eager mode.
-  if in_graph_mode and not parallel_iterations:
-    parallel_iterations = 10
-  elif not in_graph_mode and not parallel_iterations:
-    parallel_iterations = 1
-
-  if not in_graph_mode and parallel_iterations > 1:
-    logging.log_first_n(logging.WARN, "Setting parallel_iterations > 1 has no "
-                        "effect when executing eagerly. Consider calling map_fn"
-                        " with tf.contrib.eager.defun to execute fn in "
-                        "parallel.", 1)
-    parallel_iterations = 1
-
-  input_is_sequence = nest.is_sequence(elems)
-  input_flatten = lambda x: nest.flatten(x) if input_is_sequence else [x]
-  def input_pack(x):
-    return nest.pack_sequence_as(elems, x) if input_is_sequence else x[0]
-
-  if dtype is None:
-    output_is_sequence = input_is_sequence
-    output_flatten = input_flatten
-    output_pack = input_pack
-  else:
-    output_is_sequence = nest.is_sequence(dtype)
-    output_flatten = lambda x: nest.flatten(x) if output_is_sequence else [x]
-    def output_pack(x):
-      return (nest.pack_sequence_as(dtype, x)
-              if output_is_sequence else x[0])
-
-  elems_flat = input_flatten(elems)
-
-  with ops.name_scope(name, "map", elems_flat):
-    # TODO(akshayka): Remove the in_graph_mode check once caching devices are
-    # supported in Eager
-    if in_graph_mode:
-      # Any get_variable calls in fn will cache the first call locally
-      # and not issue repeated network I/O requests for each iteration.
-      varscope = vs.get_variable_scope()
-      varscope_caching_device_was_none = False
-      if varscope.caching_device is None:
-        # TODO(ebrevdo): Change to using colocate_with here and in other
-        # methods.
-        varscope.set_caching_device(lambda op: op.device)
-        varscope_caching_device_was_none = True
-
-    elems_flat = [
-        ops.convert_to_tensor(elem, name="elem") for elem in elems_flat]
-
-    dtype = dtype or input_pack([elem.dtype for elem in elems_flat])
-    dtype_flat = output_flatten(dtype)
-
-    # Convert elems to tensor array. n may be known statically.
-    static_shape = elems_flat[0].shape
-    if static_shape.ndims is not None and static_shape.ndims < 1:
-      if len(elems_flat) == 1:
-        raise ValueError("elems must be a 1+ dimensional Tensor, not a scalar")
-      else:
-        raise ValueError(
-            "elements in elems must be 1+ dimensional Tensors, not scalars"
-        )
-    n = (tensor_shape.dimension_value(static_shape[0])
-         or array_ops.shape(elems_flat[0])[0])
-
-    # TensorArrays are always flat
-    elems_ta = [
-        tensor_array_ops.TensorArray(dtype=elem.dtype, size=n,
-                                     dynamic_size=False,
-                                     infer_shape=True)
-        for elem in elems_flat]
-    # Unpack elements
-    elems_ta = [
-        elem_ta.unstack(elem) for elem_ta, elem in zip(elems_ta, elems_flat)]
-
-    i = constant_op.constant(0)
-
-    accs_ta = [
-        tensor_array_ops.TensorArray(dtype=dt, size=n,
-                                     dynamic_size=False,
-                                     infer_shape=infer_shape)
-        for dt in dtype_flat]
-
-    def compute(i, tas):
-      """The loop body of map_fn.
-
-      Args:
-        i: the loop counter
-        tas: the flat TensorArray accumulator list
-
-      Returns:
-        (i + 1, tas): the updated counter + updated TensorArrays
-
-      Raises:
-        TypeError: if dtype and packed_fn_values structure do not match
-        ValueType: if dtype and packed_fn_values lengths do not match
-      """
-      packed_values = input_pack([elem_ta.read(i) for elem_ta in elems_ta])
-      packed_fn_values = fn(packed_values)
-      nest.assert_same_structure(dtype or elems, packed_fn_values)
-      flat_fn_values = output_flatten(packed_fn_values)
-      tas = [ta.write(i, value) for (ta, value) in zip(tas, flat_fn_values)]
-      return (i + 1, tas)
-
-    _, r_a = control_flow_ops.while_loop(
-        lambda i, _: i < n, compute, (i, accs_ta),
-        parallel_iterations=parallel_iterations,
-        back_prop=back_prop,
         swap_memory=swap_memory,
         maximum_iterations=n)
-    results_flat = [r.stack() for r in r_a]
-
-    n_static = tensor_shape.Dimension(tensor_shape.dimension_value(
-        elems_flat[0].get_shape().with_rank_at_least(1)[0]))
-    for elem in elems_flat[1:]:
-      n_static.merge_with(tensor_shape.Dimension(tensor_shape.dimension_value(
-          elem.get_shape().with_rank_at_least(1)[0])))
-    for r in results_flat:
-      r.set_shape(tensor_shape.TensorShape(n_static).concatenate(
-          r.get_shape()[1:]))
 
     # TODO(akshayka): Remove the in_graph_mode check once caching devices are
     # supported in Eager
     if in_graph_mode and varscope_caching_device_was_none:
       varscope.set_caching_device(None)
 
-    return output_pack(results_flat)
+    return r_a
 
 
 @tf_export("scan")
@@ -646,13 +397,15 @@ def scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
         ops.convert_to_tensor(elem, name="elem") for elem in elems_flat]
 
     # Convert elems to tensor array. n may be known statically.
-    n = (tensor_shape.dimension_value(elems_flat[0].shape[0])
-         or array_ops.shape(elems_flat[0])[0])
+    n = tensor_shape.dimension_value(elems_flat[0].shape[0])
+    if n is None:
+      n = array_ops.shape(elems_flat[0])[0]
 
     # TensorArrays are always flat
     elems_ta = [
         tensor_array_ops.TensorArray(dtype=elem.dtype, size=n,
                                      dynamic_size=False,
+                                     element_shape=elem.shape[1:],
                                      infer_shape=True)
         for elem in elems_flat]
     # Unpack elements
@@ -1022,17 +775,6 @@ def For(start,
   return ret
 # pylint: enable=invalid-name,protected-access
 
-_rewriter_config_optimizer_disabled = None
-
-def _get_disabled_rewriter_config():
-  global _rewriter_config_optimizer_disabled
-  if _rewriter_config_optimizer_disabled is None:
-    config = config_pb2.ConfigProto()
-    rewriter_config = config.graph_options.rewrite_options
-    rewriter_config.disable_meta_optimizer = True
-    _rewriter_config_optimizer_disabled = config.SerializeToString()
-  return _rewriter_config_optimizer_disabled
-
 
 def partitioned_call(args, f, tout=None, executing_eagerly=None, config=None,
                      executor_type=None):
@@ -1069,7 +811,7 @@ def partitioned_call(args, f, tout=None, executing_eagerly=None, config=None,
     executing_eagerly = context.executing_eagerly()
 
   if config is None:
-    config = _get_disabled_rewriter_config()
+    config = function_utils.get_disabled_rewriter_config()
 
   if executor_type is None:
     executor_type = ""
@@ -1101,7 +843,8 @@ def partitioned_call(args, f, tout=None, executing_eagerly=None, config=None,
   # When running in graph mode, the graph and function graphs are optimized
   # (i.e. run through grappler) per the session options, so we can disable any
   # eager-specific rewriting.
-  config_proto = attr_value_pb2.AttrValue(s=_get_disabled_rewriter_config())
+  config_proto = attr_value_pb2.AttrValue(
+      s=function_utils.get_disabled_rewriter_config())
 
   graph = ops.get_default_graph()
   f.add_to_graph(graph)
diff --git a/tensorflow/python/ops/gradient_checker_v2.py b/tensorflow/python/ops/gradient_checker_v2.py
index 5d473eeb5f4f00087672da53c5fef3ab63bdbd08..41fcaaca6824611fb4212df1f444e72bffdf0ea4 100644
--- a/tensorflow/python/ops/gradient_checker_v2.py
+++ b/tensorflow/python/ops/gradient_checker_v2.py
@@ -66,20 +66,31 @@ def _eval_indexed_slices(a):
 
 
 def _to_numpy(a):
-  """Converts Tensors and EagerTensors to numpy arrays.
+  """Converts Tensors, EagerTensors, and IndexedSlicesValue to numpy arrays.
 
   Args:
     a: any value.
 
   Returns:
     If a is EagerTensor or Tensor, returns the evaluation of a by calling
-    numpy() or run(). Otherwise returns a unchanged.
+    numpy() or run(). If a is IndexedSlicesValue, constructs the corresponding
+    dense numpy array. Otherwise returns a unchanged.
   """
   if isinstance(a, ops.EagerTensor):
     return a.numpy()
   if isinstance(a, ops.Tensor):
     sess = ops.get_default_session()
     return sess.run(a)
+  if isinstance(a, ops.IndexedSlicesValue):
+    arr = np.zeros(a.dense_shape)
+    assert len(a.values) == len(a.indices), (
+        "IndexedSlicesValue has %s value slices but %s indices\n%s" %
+        (a.values, a.indices, a))
+    for values_slice, index in zip(a.values, a.indices):
+      assert 0 <= index < len(arr), (
+          "IndexedSlicesValue has invalid index %s\n%s" % (index, a))
+      arr[index] += values_slice
+    return arr
   return a
 
 
diff --git a/tensorflow/python/ops/gradients.py b/tensorflow/python/ops/gradients.py
index cd11447e1f963a62d79855cfd8af42a35e978c79..96389abded3acf3c58f90faa601d4cf1e5eb8619 100644
--- a/tensorflow/python/ops/gradients.py
+++ b/tensorflow/python/ops/gradients.py
@@ -22,7 +22,7 @@ from __future__ import print_function
 from tensorflow.python.eager import function
 from tensorflow.python.eager.backprop import GradientTape
 from tensorflow.python.ops.custom_gradient import custom_gradient
-from tensorflow.python.ops.gradients_impl import AggregationMethod
+from tensorflow.python.ops.gradients_util import AggregationMethod
 from tensorflow.python.ops.gradients_impl import gradients
 from tensorflow.python.ops.gradients_impl import hessians
 from tensorflow.python.ops.unconnected_gradients import UnconnectedGradients
diff --git a/tensorflow/python/ops/gradients_impl.py b/tensorflow/python/ops/gradients_impl.py
index 0a70d6ee61e64f94c41c1f1d0a5b6c3610b45c04..c66efad7d3131469c6d7abee51009eb015bc34d0 100644
--- a/tensorflow/python/ops/gradients_impl.py
+++ b/tensorflow/python/ops/gradients_impl.py
@@ -18,30 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-import contextlib
-import warnings
-
-import numpy as np
-import six
-from six.moves import xrange  # pylint: disable=redefined-builtin
-
-from tensorflow.core.framework import attr_value_pb2
-from tensorflow.python.eager import context
-from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import function as framework_function
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.framework.func_graph import FuncGraph
 from tensorflow.python.ops import array_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import control_flow_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import control_flow_util
-from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import gradients_util
 from tensorflow.python.ops import image_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import linalg_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import linalg_ops  # pylint: disable=unused-import
@@ -51,503 +35,11 @@ from tensorflow.python.ops import math_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import optional_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import random_grad  # pylint: disable=unused-import
-from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops.unconnected_gradients import UnconnectedGradients
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.util import compat
 from tensorflow.python.util.tf_export import tf_export
 
 
-# This is to avoid a circular dependency (eager.function depends on
-# gradients_impl). This is set in eager/function.py.
-_function = None
-
-# Warn the user if we convert a sparse representation to dense with at
-# least this number of elements.
-_LARGE_SPARSE_NUM_ELEMENTS = 100000000
-
-
-def _IndexedSlicesToTensor(value, dtype=None, name=None, as_ref=False):
-  """Converts an IndexedSlices object `value` to a Tensor.
-
-  NOTE(mrry): This function is potentially expensive.
-
-  Args:
-    value: An ops.IndexedSlices object.
-    dtype: The dtype of the Tensor to be returned.
-    name: Optional name to use for the returned Tensor.
-    as_ref: True if a ref is requested.
-
-  Returns:
-    A dense Tensor representing the values in the given IndexedSlices.
-
-  Raises:
-    ValueError: If the IndexedSlices does not have the same dtype.
-  """
-  _ = as_ref
-  if dtype and not dtype.is_compatible_with(value.dtype):
-    raise ValueError(
-        "Tensor conversion requested dtype %s for IndexedSlices with dtype %s" %
-        (dtype.name, value.dtype.name))
-  if value.dense_shape is None:
-    raise ValueError(
-        "Tensor conversion requested for IndexedSlices without dense_shape: %s"
-        % str(value))
-  # TODO(mrry): Consider adding static shape information to
-  # IndexedSlices, to avoid using numpy here.
-  if not context.executing_eagerly():
-    dense_shape_value = tensor_util.constant_value(value.dense_shape)
-    if dense_shape_value is not None:
-      num_elements = np.prod(dense_shape_value)
-      if num_elements >= _LARGE_SPARSE_NUM_ELEMENTS:
-        warnings.warn(
-            "Converting sparse IndexedSlices to a dense Tensor with %d "
-            "elements. This may consume a large amount of memory." %
-            num_elements)
-    else:
-      warnings.warn(
-          "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
-          "This may consume a large amount of memory.")
-  return math_ops.unsorted_segment_sum(
-      value.values, value.indices, value.dense_shape[0], name=name)
-
-
-ops.register_tensor_conversion_function(ops.IndexedSlices,
-                                        _IndexedSlicesToTensor)
-
-
-def _MarkReachedOps(from_ops, reached_ops, func_graphs):
-  """Mark all ops reached from "from_ops".
-
-  Args:
-    from_ops: list of Operations.
-    reached_ops: set of Operations.
-    func_graphs: list of FuncGraphs. This method will traverse through
-      these functions if they capture from_ops or any reachable ops.
-  """
-  queue = collections.deque()
-  queue.extend(from_ops)
-  while queue:
-    op = queue.popleft()
-    if op not in reached_ops:
-      reached_ops.add(op)
-      for output in op.outputs:
-        if _IsBackpropagatable(output):
-          queue.extend(_Consumers(output, func_graphs))
-
-
-def _PendingCount(to_ops, from_ops, colocate_gradients_with_ops, func_graphs,
-                  xs):
-  """Initialize the pending count for ops between two lists of Operations.
-
-  'pending_count[op]' indicates the number of backprop inputs
-  to this operation.
-
-  Args:
-    to_ops: list of Operations.
-    from_ops: list of Operations.
-    colocate_gradients_with_ops: Python bool.  See docstring of gradients().
-    func_graphs: list of FuncGraphs. This method will traverse through
-      these functions if they capture from_ops or any reachable ops. This is
-      useful if to_ops occur in a function and from_ops are in an outer function
-      or graph.
-    xs: list of Tensors.
-
-  Returns:
-    A tuple containing: (1) the subset of to_ops reachable from from_ops by a
-    path of zero or more backpropagatable tensors, (2) a mapping from operation
-    to the number of backprop inputs to that op, and (3) a ControlFlowState
-    object which is not None if the ops between from_ops and to_ops contain
-    control flow loops.
-  """
-  # Mark reachable ops from from_ops.
-  reached_ops = set()
-  _MarkReachedOps(from_ops, reached_ops, func_graphs)
-  # X in reached_ops iff X is reachable from from_ops by a path of zero or more
-  # backpropagatable tensors.
-
-  reachable_to_ops = set(op for op in to_ops if op in reached_ops)
-
-  # Mark between ops.
-  between_ops = set()
-  between_op_list = []
-  queue = collections.deque()
-  queue.extend(to_ops)
-  while queue:
-    op = queue.popleft()
-    # We are interested in this op.
-    if op in reached_ops:
-      between_ops.add(op)
-      between_op_list.append(op)
-      # Clear the boolean so we won't add the inputs again.
-      reached_ops.remove(op)
-      for inp in _NonEagerInputs(op, xs):
-        queue.append(inp.op)
-  # X in between_ops iff X is on a path of zero or more backpropagatable tensors
-  # between from_ops and to_ops
-
-  # 'loop_state' is None if there are no while loops.
-  loop_state = control_flow_ops.MaybeCreateControlFlowState(
-      between_op_list, between_ops, colocate_gradients_with_ops)
-
-  # Initialize pending count for between ops.
-  pending_count = collections.defaultdict(int)
-  for op in between_op_list:
-    for x in _NonEagerInputs(op, xs):
-      if x.op in between_ops:
-        pending_count[x.op] += 1
-
-  return reachable_to_ops, pending_count, loop_state
-
-
-def _AsList(x):
-  return x if isinstance(x, (list, tuple)) else [x]
-
-
-def _DefaultGradYs(grad_ys,
-                   ys,
-                   colocate_gradients_with_ops,
-                   gradient_uid="__unsupported__"):
-  """Fill in default values for grad_ys.
-
-  Args:
-    grad_ys: List of gradients, can contain None.
-    ys: List of tensors.
-    colocate_gradients_with_ops: If True, try colocating gradients with
-      the corresponding op.
-    gradient_uid: A unique identifier within the graph indicating
-      which invocation of gradients is being executed. Used to cluster
-      ops for compilation.
-
-  Returns:
-    A list of gradients to use, without None.
-
-  Raises:
-    ValueError: If sizes of gradients and inputs don't match
-    TypeError: If type of any gradient is not valid for its input.
-  """
-  if len(grad_ys) != len(ys):
-    raise ValueError("Passed %d grad_ys for %d ys" % (len(grad_ys), len(ys)))
-  grad_ys = ops.convert_n_to_tensor_or_indexed_slices(grad_ys, name="grad_y")
-  new_grad_ys = []
-  for i in xrange(len(grad_ys)):
-    grad_y = grad_ys[i]
-    y = ys[i]
-    with _maybe_colocate_with(y.op, gradient_uid, colocate_gradients_with_ops):
-      if grad_y is None:
-        if y.dtype.is_complex:
-          raise TypeError(
-              "Gradients of complex tensors must set grad_ys (y.dtype = %r)" %
-              y.dtype)
-        new_grad_ys.append(
-            array_ops.fill(
-                array_ops.shape(y),
-                constant_op.constant(1, dtype=y.dtype, name="grad_ys_%d" % i)))
-        continue
-      if y.dtype.is_floating or y.dtype.is_integer:
-        if not grad_y.dtype.is_floating and not grad_y.dtype.is_integer:
-          raise TypeError(
-              "Gradient type %s generated for real or "
-              "integer-valued tensor %s with type %s must be "
-              "real or integer" % (dtypes.as_dtype(grad_y.dtype).name, y,
-                                   dtypes.as_dtype(y.dtype).name))
-      elif y.dtype.is_complex:
-        if not grad_y.dtype.is_complex:
-          raise TypeError(
-              "Gradient type %s generated for complex-valued "
-              "tensor %s with type %s must be real" % (dtypes.as_dtype(
-                  grad_y.dtype).name, y, dtypes.as_dtype(y.dtype).name))
-      elif y.dtype == dtypes.variant:
-        if grad_y.dtype != dtypes.variant:
-          raise TypeError(
-              "Gradient type %s generated for variant "
-              "tensor %s with type %s must be variant" % (dtypes.as_dtype(
-                  grad_y.dtype).name, y, dtypes.as_dtype(y.dtype).name))
-      elif y.dtype == dtypes.resource:
-        # We assume y is the handle of a ResourceVariable. The gradient of a
-        # ResourceVariable should be a numeric value, not another resource.
-        if grad_y.dtype == dtypes.resource:
-          raise TypeError("Input gradient %s for resource tensor %s should not "
-                          "be a resource" % (grad_y, y))
-      else:
-        raise TypeError(
-            "Tensor %s with type %s must be numeric "
-            "to obtain a default gradient" % (y, dtypes.as_dtype(y.dtype).name))
-      # Create a grad_y tensor in the name scope of the gradient.
-      # Required for TensorArrays to identify which gradient call a
-      # grad_y value is coming from.
-      if isinstance(grad_y, ops.IndexedSlices):
-        new_grad_ys.append(
-            ops.IndexedSlices(
-                indices=(array_ops.identity(
-                    grad_y.indices, name="grad_ys_%d_indices" % i)
-                         if isinstance(grad_y.indices, ops.Tensor) else
-                         grad_y.indices),
-                values=(array_ops.identity(
-                    grad_y.values, name="grad_ys_%d_values" % i) if isinstance(
-                        grad_y.values, ops.Tensor) else grad_y.values),
-                dense_shape=(array_ops.identity(
-                    grad_y.dense_shape, name="grad_ys_%d_shape" % i)
-                             if isinstance(grad_y.dense_shape, ops.Tensor) else
-                             grad_y.dense_shape)))
-      else:
-        new_grad_ys.append(array_ops.identity(grad_y, name="grad_ys_%d" % i))
-
-  return new_grad_ys
-
-
-def IsTrainable(tensor_or_dtype):
-  if isinstance(tensor_or_dtype, ops.Tensor):
-    dtype = tensor_or_dtype.dtype
-  else:
-    dtype = tensor_or_dtype
-  dtype = dtypes.as_dtype(dtype)
-  return dtype.base_dtype in (dtypes.float16, dtypes.float32, dtypes.float64,
-                              dtypes.complex64, dtypes.complex128,
-                              dtypes.resource, dtypes.variant)
-
-
-def _IsBackpropagatable(tensor):
-  if IsTrainable(tensor):
-    return True
-  dtype = dtypes.as_dtype(tensor.dtype)
-  return dtype.base_dtype == dtypes.bfloat16
-
-
-def _VerifyGeneratedGradients(grads, op):
-  """Verify that gradients are valid in number and type.
-
-  Args:
-    grads: List of generated gradients.
-    op: Operation for which the gradients where generated.
-
-  Raises:
-    ValueError: if sizes of gradients and inputs don't match.
-    TypeError: if type of any gradient is not valid for its input.
-  """
-  # While ops have inputs added to them during the gradient computation, so we
-  # skip the below check. See while_v2 for details.
-  if op.type == "While": return
-
-  if len(grads) != len(op.inputs):
-    raise ValueError("Num gradients %d generated for op %s do not match num "
-                     "inputs %d" % (len(grads), op.node_def, len(op.inputs)))
-
-
-def _StopOps(from_ops, stop_gradient_ops, pending_count, xs):
-  """The set of ops that terminate the gradient computation.
-
-  This computes the frontier of the forward graph *before* which backprop
-  should stop. Operations in the returned set will not be differentiated.
-  This set is defined as the subset of `from_ops` containing ops that have
-  no predecessor in `from_ops`. `pending_count` is the result of
-  `_PendingCount(xs, from_ops)`. An 'op' has predecessors in `from_ops`
-  iff pending_count[op] > 0.
-
-  In addition, none of `stop_gradient_ops` will be differentiated.
-
-  Args:
-    from_ops: list of Operations.
-    stop_gradient_ops: list of Operations never to backprop through.
-    pending_count: mapping from operation to number of backprop inputs.
-    xs: list of Tensors.
-
-  Returns:
-    The set of operations.
-  """
-  stop_ops = set()
-  for op in from_ops:
-    is_stop_op = True
-    for inp in _NonEagerInputs(op, xs):
-      if pending_count[inp.op] > 0:
-        is_stop_op = False
-        break
-    if is_stop_op:
-      stop_ops.add(op)
-  stop_ops.update(op for op in stop_gradient_ops)
-  return stop_ops
-
-
-@contextlib.contextmanager
-def _maybe_colocate_with(op, gradient_uid, colocate_gradients_with_ops):  # pylint: disable=invalid-name
-  """Context to colocate with `op` if `colocate_gradients_with_ops`."""
-  if colocate_gradients_with_ops:
-    with ops._colocate_with_for_gradient(op, gradient_uid):  # pylint: disable=protected-access
-      yield
-  else:
-    yield
-
-
-def _IsPartitionedCall(op):
-  return op.type == "PartitionedCall" or op.type == "StatefulPartitionedCall"
-
-
-def _SymGrad(op, out_grads):
-  """Backprop through a function call node op given its outputs' gradients."""
-  f_in = [x for x in op.inputs] + out_grads
-  f_types = [x.dtype for x in op.inputs]
-  f = attr_value_pb2.NameAttrList()
-  if _IsPartitionedCall(op):
-    f.name = op.get_attr("f").name
-  else:
-    f.name = op.type
-  for k in op.node_def.attr:
-    f.attr[k].CopyFrom(op.node_def.attr[k])
-  # TODO(apassos) use a better dtype here
-  in_grads = functional_ops.symbolic_gradient(
-      input=f_in,
-      Tout=[x if x != dtypes.resource else dtypes.float32 for x in f_types],
-      f=f)
-  return in_grads
-
-
-def _MaybeCompile(scope, op, func, grad_fn):
-  """Compile the calculation in grad_fn if op was marked as compiled."""
-  scope = scope.rstrip("/").replace("/", "_")
-  if func is not None:
-    xla_compile = func.definition.attr["_XlaCompile"].b
-    xla_separate_compiled_gradients = func.definition.attr[
-        "_XlaSeparateCompiledGradients"].b
-    xla_scope = func.definition.attr["_XlaScope"].s.decode()
-  else:
-    try:
-      xla_compile = op.get_attr("_XlaCompile")
-      xla_separate_compiled_gradients = op.get_attr(
-          "_XlaSeparateCompiledGradients")
-      xla_scope = op.get_attr("_XlaScope").decode()
-    except ValueError:
-      return grad_fn()  # Exit early
-
-  if not xla_compile:
-    return grad_fn()  # Exit early
-
-  # If the gradients are supposed to be compiled separately, we give them a
-  # _XlaScope name that is based on the name_scope of the gradients.  Otherwise
-  # they just inherit the existing _XlaScope name, which lets them be merged
-  # together with the non-gradient computation.
-  if xla_separate_compiled_gradients:
-    xla_grad_scope = "%s_grad_%s" % (xla_scope, scope)
-  else:
-    xla_grad_scope = xla_scope
-
-  attrs = {
-      "_XlaCompile": attr_value_pb2.AttrValue(b=xla_compile),
-      "_XlaScope": attr_value_pb2.AttrValue(s=xla_grad_scope.encode())
-  }
-  with ops.get_default_graph()._attr_scope(attrs):  # pylint: disable=protected-access
-    return grad_fn()
-
-
-def _RaiseNoGradWrtInitialLoopValError(op, from_ops, xs):
-  """Raises an error if we backprop through a loop var."""
-  # Find the nearest 'to_op' reachable from 'op' to provide a more helpful error
-  # message.
-  target_op = None
-  queue = collections.deque([op])
-  visited = set()
-  while queue:
-    curr_op = queue.popleft()
-    if curr_op in visited: continue
-    visited.add(curr_op)
-    if curr_op in from_ops:
-      target_op = curr_op
-      break
-    queue.extend(t.op for t in _NonEagerInputs(curr_op, xs))
-  assert target_op
-  raise ValueError(
-      "Cannot compute gradient inside while loop with respect to op '%s'. "
-      "We do not support taking the gradient wrt or through the initial value "
-      "of a loop variable. Gradients can be computed through loop invariants "
-      "or wrt the input parameters to the loop body."
-      % target_op.name)
-
-
-def _IsFunction(graph):
-  return (isinstance(graph, FuncGraph) or
-          isinstance(graph, framework_function._FuncGraph))  # pylint: disable=protected-access
-
-
-def _Captures(func_graph):
-  if isinstance(func_graph, FuncGraph):
-    return func_graph.captures
-  else:
-    assert isinstance(func_graph, framework_function._FuncGraph)  # pylint: disable=protected-access
-    return func_graph._captured  # pylint: disable=protected-access
-
-
-def _MaybeCaptured(t):
-  """If t is a captured value placeholder, returns the original captured value.
-
-  Args:
-    t: Tensor
-
-  Returns:
-    A tensor, potentially from a different Graph/FuncGraph.
-  """
-  # pylint: disable=protected-access
-  if (not isinstance(t, ops.EagerTensor) and
-      _IsFunction(t.op.graph) and t.op.type == "Placeholder"):
-    for input_t, placeholder_t in _Captures(t.op.graph).items():
-      if t == placeholder_t:
-        return _MaybeCaptured(input_t)
-  # pylint: enable=protected-access
-  return t
-
-
-# TODO(skyewm): plumbing xs through everywhere is ugly, consider making
-# _GradientsHelper a class with xs as a member variable.
-def _NonEagerInputs(op, xs):
-  """Returns the inputs of op, crossing closure boundaries where necessary.
-
-  Does not return any captured EagerTensors, i.e., the number of tensors
-  returned may be less than than the actual number of inputs.
-
-  Args:
-    op: Operation
-    xs: list of Tensors we are differentiating w.r.t.
-
-  Returns:
-    A list of tensors. The tensors may be from multiple Graph/FuncGraphs if op
-    is in a FuncGraph and has captured inputs.
-  """
-  if _IsFunction(op.graph):  # pylint: disable=protected-access
-    inputs = []
-    for t in op.inputs:
-      # If we're differentiating w.r.t. `t`, do not attempt to traverse through
-      # it to a captured value. The algorithm needs to "see" `t` in this case,
-      # even if it's a function input for a captured value, whereas usually we'd
-      # like to traverse through these closures as if the captured value was the
-      # direct input to op.
-      if t not in xs:
-        t = _MaybeCaptured(t)
-        # Skip captured eager inputs.
-        if isinstance(t, ops.EagerTensor): continue
-      inputs.append(t)
-    return inputs
-  else:
-    return op.inputs
-
-
-def _Consumers(t, func_graphs):
-  """Returns the consumers of t, crossing closure boundaries where necessary.
-
-  Args:
-    t: Tensor
-    func_graphs: a list of FuncGraphs that may have captured t.
-
-  Returns:
-    A list of tensors. The tensors will be from the current graph and/or
-    func_graphs.
-  """
-  consumers = t.consumers()
-  for func in func_graphs:
-    for input_t, placeholder in _Captures(func).items():
-      if input_t == t:
-        consumers.extend(_Consumers(placeholder, func_graphs))
-  return consumers
-
-
 @tf_export(v1=["gradients"])
 def gradients(ys,
               xs,
@@ -658,10 +150,13 @@ def gradients(ys,
   # Creating the gradient graph for control flow mutates Operations.
   # _mutation_lock ensures a Session.run call cannot occur between creating and
   # mutating new ops.
-  with ops.get_default_graph()._mutation_lock():  # pylint: disable=protected-access
-    return _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops,
-                            gate_gradients, aggregation_method, stop_gradients,
-                            unconnected_gradients)
+  # pylint: disable=protected-access
+  with ops.get_default_graph()._mutation_lock():
+    return gradients_util._GradientsHelper(
+        ys, xs, grad_ys, name, colocate_gradients_with_ops,
+        gate_gradients, aggregation_method, stop_gradients,
+        unconnected_gradients)
+  # pylint: enable=protected-access
 
 
 @tf_export("gradients", v1=[])
@@ -771,540 +266,13 @@ def gradients_v2(ys,  # pylint: disable=invalid-name
   # Creating the gradient graph for control flow mutates Operations.
   # _mutation_lock ensures a Session.run call cannot occur between creating and
   # mutating new ops.
-  with ops.get_default_graph()._mutation_lock():  # pylint: disable=protected-access
-    return _GradientsHelper(ys, xs, grad_ys, name, True, gate_gradients,
-                            aggregation_method, stop_gradients,
-                            unconnected_gradients)
-
-
-def _GradientsHelper(ys,
-                     xs,
-                     grad_ys=None,
-                     name="gradients",
-                     colocate_gradients_with_ops=False,
-                     gate_gradients=False,
-                     aggregation_method=None,
-                     stop_gradients=None,
-                     unconnected_gradients=UnconnectedGradients.NONE,
-                     src_graph=None):
-  """Implementation of gradients()."""
-  if context.executing_eagerly():
-    raise RuntimeError("tf.gradients is not supported when eager execution "
-                       "is enabled. Use tf.GradientTape instead.")
-  if src_graph is None:
-    src_graph = ops.get_default_graph()
-  try:
-    unconnected_gradients = UnconnectedGradients(unconnected_gradients)
-  except ValueError:
-    raise ValueError(
-        "Unknown value for unconnected_gradients: %r" % unconnected_gradients)
-
-  # If src_graph is a _FuncGraph (i.e. a function body), gather it and all
-  # ancestor graphs. This is necessary for correctly handling captured values.
-  func_graphs = []
-  curr_graph = src_graph
-  while _IsFunction(curr_graph):
-    func_graphs.append(curr_graph)
-    if isinstance(curr_graph, FuncGraph):
-      curr_graph = curr_graph.outer_graph
-    else:
-      assert isinstance(curr_graph, framework_function._FuncGraph)  # pylint: disable=protected-access
-      curr_graph = curr_graph._outer_graph  # pylint: disable=protected-access
-
-  ys = _AsList(ys)
-  xs = _AsList(xs)
-  stop_gradients = [] if stop_gradients is None else _AsList(stop_gradients)
-  if grad_ys is None:
-    grad_ys = [None] * len(ys)
-  else:
-    grad_ys = _AsList(grad_ys)
-
-  with ops.name_scope(
-      name, "gradients",
-      list(ys) + list(xs) + list(stop_gradients) + list(grad_ys)) as grad_scope:
-    # Get a uid for this call to gradients that can be used to help
-    # cluster ops for compilation.
-    gradient_uid = ops.get_default_graph().unique_name("uid")
-    ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
-    xs = [
-        x.handle if resource_variable_ops.is_resource_variable(x) else x
-        for x in xs
-    ]
-    xs = ops.internal_convert_n_to_tensor_or_indexed_slices(
-        xs, name="x", as_ref=True)
-    grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops,
-                             gradient_uid)
-
-    # The approach we take here is as follows: Create a list of all ops in the
-    # subgraph between the ys and xs.  Visit these ops in reverse order of ids
-    # to ensure that when we visit an op the gradients w.r.t its outputs have
-    # been collected.  Then aggregate these gradients if needed, call the op's
-    # gradient function, and add the generated gradients to the gradients for
-    # its input.
-
-    # Initialize the pending count for ops in the connected subgraph from ys
-    # to the xs.
-    to_ops = [t.op for t in ys]
-    from_ops = [t.op for t in xs]
-    stop_gradient_ops = [t.op for t in stop_gradients]
-    reachable_to_ops, pending_count, loop_state = _PendingCount(
-        to_ops, from_ops, colocate_gradients_with_ops, func_graphs, xs)
-
-    # Iterate over the collected ops.
-    #
-    # grads: op => list of gradients received on each output endpoint of the
-    # op.  The gradients for each endpoint are initially collected as a list.
-    # When it is time to call the op's gradient function, for each endpoint we
-    # aggregate the list of received gradients into a Add() Operation if there
-    # is more than one.
-    grads = {}
-
-    # Add the initial gradients for the ys.
-    for y, grad_y in zip(ys, grad_ys):
-      _SetGrad(grads, y, grad_y)
-
-    # Initialize queue with to_ops.
-    queue = collections.deque()
-    # Add the ops in 'to_ops' into the queue.
-    to_ops_set = set()
-    for op in to_ops:
-      # 'ready' handles the case where one output gradient relies on
-      # another output's gradient.
-      ready = (pending_count[op] == 0)
-      if ready and op not in to_ops_set and op in reachable_to_ops:
-        to_ops_set.add(op)
-        queue.append(op)
-
-    if loop_state:
-      loop_exits = loop_state.ProcessUnusedLoopExits(pending_count, to_ops_set)
-      for y in loop_exits:
-        if IsTrainable(y):
-          _SetGrad(grads, y, loop_state.ZerosLikeForExit(y))
-          queue.append(y.op)
-
-    stop_ops = _StopOps(from_ops, stop_gradient_ops, pending_count, xs)
-    while queue:
-      # generate gradient subgraph for op.
-      op = queue.popleft()
-      with _maybe_colocate_with(op, gradient_uid, colocate_gradients_with_ops):
-        if loop_state:
-          loop_state.EnterGradWhileContext(op, before=True)
-        out_grads = _AggregatedGrads(grads, op, gradient_uid, loop_state,
-                                     aggregation_method)
-        if loop_state:
-          loop_state.ExitGradWhileContext(op, before=True)
-
-        grad_fn = None
-        func_call = None
-        is_partitioned_call = _IsPartitionedCall(op)
-        # pylint: disable=protected-access
-        is_func_call = (
-            src_graph._is_function(op.type) or is_partitioned_call)
-        # pylint: enable=protected-access
-        has_out_grads = any(isinstance(g, ops.Tensor) or g for g in out_grads)
-        if has_out_grads and (op not in stop_ops):
-          try:
-            grad_fn = ops.get_gradient_function(op)
-          except LookupError:
-            if is_func_call:
-              if is_partitioned_call:
-                func_call = src_graph._get_function(  # pylint: disable=protected-access
-                    compat.as_bytes(op.get_attr("f").name))
-              else:
-                func_call = src_graph._get_function(op.type)  # pylint: disable=protected-access
-              # Note that __defun is not set if the graph is
-              # imported. If it's set, we prefer to access the original
-              # defun.
-              func_call = getattr(op, "__defun", func_call)
-              grad_fn = func_call.python_grad_func
-            else:
-              raise LookupError(
-                  "No gradient defined for operation '%s' (op type: %s)" %
-                  (op.name, op.type))
-        if loop_state:
-          loop_state.EnterGradWhileContext(op, before=False)
-
-        # NOTE(skyewm): We don't support computing gradients wrt a loop variable
-        # unless it's within the context of a single iteration (i.e. the
-        # gradient is wrt to the loop parameter in the body function, not wrt or
-        # through the initial value). This means if we're in a while loop
-        # context, we should never see a switch node from this context.
-        # pylint: disable=protected-access
-        if (control_flow_util.IsSwitch(op) and
-            op._control_flow_context is not None and
-            op._control_flow_context.IsWhileContext() and
-            op._control_flow_context ==
-            ops.get_default_graph()._get_control_flow_context()):
-          _RaiseNoGradWrtInitialLoopValError(op, from_ops, xs)
-        # pylint: enable=protected-access
-
-        if (grad_fn or is_func_call) and has_out_grads:
-          # NOTE: If _AggregatedGrads didn't compute a value for the i'th
-          # output, it means that the cost does not depend on output[i],
-          # therefore dC/doutput[i] is 0.
-          for i, out_grad in enumerate(out_grads):
-            if (not isinstance(out_grad, ops.Tensor) and not out_grad) and (
-                (not grad_fn and is_func_call) or IsTrainable(op.outputs[i])):
-              # Only trainable outputs or outputs for a function call that
-              # will use SymbolicGradient get a zero gradient. Gradient
-              # functions should ignore the gradient for other outputs.
-              # TODO(apassos) gradients of resource handles might be an
-              # issue here because of zeros.
-              if loop_state:
-                out_grads[i] = loop_state.ZerosLike(op, i)
-              else:
-                out_grads[i] = control_flow_ops.ZerosLikeOutsideLoop(op, i)
-          with ops.name_scope(op.name + "_grad"):
-            # pylint: disable=protected-access
-            with src_graph._original_op(op):
-              # pylint: enable=protected-access
-              if grad_fn:
-                # If grad_fn was found, do not use SymbolicGradient even for
-                # functions.
-                in_grads = _MaybeCompile(grad_scope, op, func_call,
-                                         lambda: grad_fn(op, *out_grads))
-              else:
-                # For function call ops, we add a 'SymbolicGradient'
-                # node to the graph to compute gradients.
-                in_grads = _MaybeCompile(grad_scope, op, func_call,
-                                         lambda: _SymGrad(op, out_grads))
-              in_grads = _AsList(in_grads)
-              _VerifyGeneratedGradients(in_grads, op)
-              if gate_gradients and len([x for x in in_grads
-                                         if x is not None]) > 1:
-                with ops.device(None):
-                  with ops._colocate_with_for_gradient(  # pylint: disable=protected-access
-                      None,
-                      gradient_uid,
-                      ignore_existing=True):
-                    in_grads = control_flow_ops.tuple(in_grads)
-          _LogOpGradients(op, out_grads, in_grads)
-        else:
-          # If no grad_fn is defined or none of out_grads is available,
-          # just propagate a list of None backwards.
-          in_grads = [None] * len(_NonEagerInputs(op, xs))
-        for i, (t_in, in_grad) in enumerate(zip(_NonEagerInputs(op, xs),
-                                                in_grads)):
-          if in_grad is not None:
-            if (isinstance(in_grad, ops.Tensor) and
-                t_in.dtype != dtypes.resource):
-              try:
-                in_grad.set_shape(t_in.get_shape())
-              except ValueError:
-                raise ValueError(
-                    "Incompatible shapes between op input and calculated "
-                    "input gradient.  Forward operation: %s.  Input index: %d. "
-                    "Original input shape: %s.  "
-                    "Calculated input gradient shape: %s" %
-                    (op.name, i, t_in.shape, in_grad.shape))
-            _SetGrad(grads, t_in, in_grad)
-        if loop_state:
-          loop_state.ExitGradWhileContext(op, before=False)
-
-      # Update pending count for the inputs of op and enqueue ready ops.
-      _UpdatePendingAndEnqueueReady(grads, op, queue, pending_count, loop_state,
-                                    xs)
-
-  if loop_state:
-    loop_state.PostProcessing()
-  return [_GetGrad(grads, x, unconnected_gradients) for x in xs]
-
-
-def _HasAnyNotNoneGrads(grads, op):
-  """Return true iff op has real gradient."""
-  out_grads = _GetGrads(grads, op)
-  for out_grad in out_grads:
-    if isinstance(out_grad, (ops.Tensor, ops.IndexedSlices)):
-      return True
-    if out_grad and isinstance(out_grad, collections.Sequence):
-      if any(g is not None for g in out_grad):
-        return True
-  return False
-
-
-def _UpdatePendingAndEnqueueReady(grads, op, queue, pending_count, loop_state,
-                                  xs):
-  """Update pending count for the inputs of op and enqueue ready ops."""
-  for x in _NonEagerInputs(op, xs):
-    pending_count[x.op] -= 1
-    ready = (pending_count[x.op] == 0)
-    if loop_state and not ready:
-      ready = pending_count[x.op] > 0 and control_flow_util.IsLoopSwitch(x.op)
-    if ready:
-      if control_flow_util.IsLoopExit(x.op):
-        # if x is an exit without real gradient, defer processing them.
-        grad_state = loop_state.GetGradState(x.op, before=False)
-        grad_state.deferred_exits.append(x)
-        grad_state.pending_exits_count -= 1
-        if grad_state.pending_exits_count == 0:
-          # We now have all the exits so process them.
-          has_not_none_grad = False
-          for y in grad_state.deferred_exits:
-            if _HasAnyNotNoneGrads(grads, y.op):
-              has_not_none_grad = True
-              queue.append(y.op)
-            else:
-              grad_state.unused_exits.append(y)
-          if has_not_none_grad:
-            # For an unused exit, if it has trainable outputs, backprop
-            # a zero gradient. Otherwise, just ignore it.
-            for y in grad_state.unused_exits:
-              if IsTrainable(y):
-                _SetGrad(grads, y, loop_state.ZerosLikeForExit(y))
-              queue.append(y.op)
-          else:
-            # All exits are "unused" so use None as gradient.
-            for y in grad_state.unused_exits:
-              queue.append(y.op)
-      else:
-        queue.append(x.op)
-
-
-def _SetGrad(grads, t, grad):
-  """Sets gradient "grad" in "grads" for tensor "t"."""
-  op = t.op
-  op_grads = grads.get(op)
-  if not op_grads:
-    op_grads = [[] for _ in xrange(len(op.outputs))]
-    grads[op] = op_grads
-  t_grads = op_grads[t.value_index]
-  if isinstance(t_grads, list):
-    t_grads.append(grad)
-  else:
-    assert control_flow_util.IsLoopSwitch(op)
-    op_grads[t.value_index] = grad
-
-
-def _GetGrad(grads, t, unconnected_gradients):
-  """Gets gradient for tensor "t"."""
-  op = t.op
-  op_grads = grads.get(op)
-  if not op_grads:
-    if unconnected_gradients == UnconnectedGradients.ZERO:
-      t_dtype = t.dtype if t.dtype != dtypes.resource else dtypes.float32
-      return array_ops.zeros_like(t, dtype=t_dtype)
-    elif unconnected_gradients == UnconnectedGradients.NONE:
-      return None
-    else:
-      raise ValueError(
-          "Unknown value for unconnected_gradients: %r" % unconnected_gradients)
-
-  t_grad = op_grads[t.value_index]
-  assert not isinstance(
-      t_grad, list), ("gradients list should have been aggregated by now.")
-  return t_grad
-
-
-def _GetGrads(grads, op):
-  """Gets all gradients for op."""
-  if op in grads:
-    return grads[op]
-  else:
-    return [[] for _ in xrange(len(op.outputs))]
-
-
-def _HandleNestedIndexedSlices(grad):
-  assert isinstance(grad, ops.IndexedSlices)
-  if isinstance(grad.values, ops.Tensor):
-    return grad
-  else:
-    assert isinstance(grad.values, ops.IndexedSlices)
-    g = _HandleNestedIndexedSlices(grad.values)
-    return ops.IndexedSlices(g.values, array_ops.gather(
-        grad.indices, g.indices), g.dense_shape)
-
-
-def _AccumulatorShape(inputs):
-  shape = tensor_shape.unknown_shape()
-  for i in inputs:
-    if isinstance(i, ops.Tensor):
-      shape = shape.merge_with(i.get_shape())
-  return shape
-
-
-def _LogOpGradients(op, out_grads, in_grads):
-  """Log the in and out grads of an op."""
-  logging.vlog(1, "Gradient for '" + op.name + "'")
-
-  def _FilterGrad(x):
-    if x is None:
-      return False
-    if isinstance(x, (list, tuple)):
-      return bool(x)
-    else:
-      return True
-
-  logging.vlog(1, "  in  --> %s",
-               ", ".join([x.name for x in out_grads if _FilterGrad(x)]))
-  logging.vlog(1, "  out --> %s",
-               ", ".join([x.name for x in in_grads if _FilterGrad(x)]))
-
-
-def _MultiDeviceAddN(tensor_list, gradient_uid):
-  """Adds tensors from potentially multiple devices."""
-  # Basic function structure comes from control_flow_ops.group().
-  # Sort tensors according to their devices.
-  tensors_on_device = collections.defaultdict(lambda: [])
-  for tensor in tensor_list:
-    tensors_on_device[tensor.device].append(tensor)
-
-  # For each device, add the tensors on that device first.
-  # Then gather the partial sums from multiple devices.
-  # TODO(sjhwang): Create hierarchical aggregation tree as pbar's suggestion.
-  # E.g., aggregate per GPU, then per task, and so on.
-  summands = []
-
-  def DeviceKey(dev):
-    return "" if dev is None else dev
-
-  for dev in sorted(six.iterkeys(tensors_on_device), key=DeviceKey):
-    tensors = tensors_on_device[dev]
-    with ops._colocate_with_for_gradient(  # pylint: disable=protected-access
-        tensors[0].op,
-        gradient_uid,
-        ignore_existing=True):
-      summands.append(math_ops.add_n(tensors))
-
-  return math_ops.add_n(summands)
-
-
-@tf_export("AggregationMethod")
-class AggregationMethod(object):
-  """A class listing aggregation methods used to combine gradients.
-
-  Computing partial derivatives can require aggregating gradient
-  contributions. This class lists the various methods that can
-  be used to combine gradients in the graph:
-
-  *  `ADD_N`: All of the gradient terms are summed as part of one
-     operation using the "AddN" op. It has the property that all
-     gradients must be ready before any aggregation is performed.
-  *  `DEFAULT`: The system-chosen default aggregation method.
-  """
-  ADD_N = 0
-  DEFAULT = ADD_N
-  # The following are experimental and may not be supported in future releases.
-  EXPERIMENTAL_TREE = 1
-  EXPERIMENTAL_ACCUMULATE_N = 2
-
-
-def _AggregatedGrads(grads,
-                     op,
-                     gradient_uid,
-                     loop_state,
-                     aggregation_method=None):
-  """Get the aggregated gradients for op.
-
-  Args:
-    grads: The map of memoized gradients.
-    op: The op to get gradients for.
-    gradient_uid: A unique identifier within the graph indicating
-      which invocation of gradients is being executed. Used to cluster
-      ops for compilation.
-    loop_state: An object for maintaining the state of the while loops in the
-                graph. It is of type ControlFlowState. None if the graph
-                contains no while loops.
-    aggregation_method: Specifies the method used to combine gradient terms.
-      Accepted values are constants defined in the class `AggregationMethod`.
-
-  Returns:
-    A list of gradients, one per each output of `op`. If the gradients
-      for a particular output is a list, this function aggregates it
-      before returning.
-
-  Raises:
-    TypeError: if the incoming grads are not Tensors or IndexedSlices.
-    ValueError: if the arguments are invalid.
-
-  """
-  if aggregation_method is None:
-    aggregation_method = AggregationMethod.DEFAULT
-  if aggregation_method not in [
-      AggregationMethod.ADD_N, AggregationMethod.EXPERIMENTAL_TREE,
-      AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
-  ]:
-    raise ValueError(
-        "Invalid aggregation_method specified %s." % aggregation_method)
-  out_grads = _GetGrads(grads, op)
-  for i, out_grad in enumerate(out_grads):
-    if loop_state:
-      if isinstance(out_grad, (ops.Tensor, ops.IndexedSlices)):
-        assert control_flow_util.IsLoopSwitch(op)
-        continue
-    # Grads have to be Tensors or IndexedSlices
-    if (isinstance(out_grad, collections.Sequence) and not all(
-        isinstance(g, (ops.Tensor, ops.IndexedSlices))
-        for g in out_grad
-        if g is not None
-    )):
-      raise TypeError("gradients have to be either all Tensors "
-                      "or all IndexedSlices")
-    # Aggregate multiple gradients, and convert [] to None.
-    if out_grad:
-      if len(out_grad) < 2:
-        used = "nop"
-        out_grads[i] = out_grad[0]
-      elif all(isinstance(g, ops.Tensor) for g in out_grad if g is not None):
-        tensor_shape = _AccumulatorShape(out_grad)
-        if (aggregation_method == AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
-            and len(out_grad) > 2 and tensor_shape.is_fully_defined()):
-          # The benefit of using AccumulateN is that its inputs can be combined
-          # in any order and this can allow the expression to be evaluated with
-          # a smaller memory footprint.  When used with gpu_allocator_retry,
-          # it is possible to compute a sum of terms which are much larger than
-          # total GPU memory.
-          # AccumulateN can currently only be used if we know the shape for
-          # an accumulator variable.  If this is not known, or if we only have
-          # 2 grads then we fall through to the "tree" case below.
-          used = "accumulate_n"
-          out_grads[i] = math_ops.accumulate_n(out_grad)
-        elif aggregation_method in [
-            AggregationMethod.EXPERIMENTAL_TREE,
-            AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
-        ]:
-          # Aggregate all gradients by doing pairwise sums: this may
-          # reduce performance, but it can improve memory because the
-          # gradients can be released earlier.
-          #
-          # TODO(vrv): Consider replacing this with a version of
-          # tf.AddN() that eagerly frees its inputs as soon as they are
-          # ready, so the order of this tree does not become a problem.
-          used = "tree"
-          with ops.name_scope(op.name + "_gradient_sum"):
-            running_sum = out_grad[0]
-            for grad in out_grad[1:]:
-              running_sum = math_ops.add_n([running_sum, grad])
-            out_grads[i] = running_sum
-        else:
-          used = "add_n"
-          out_grads[i] = _MultiDeviceAddN(out_grad, gradient_uid)
-        logging.vlog(2, "  _AggregatedGrads %d x %s using %s", len(out_grad),
-                     tensor_shape, used)
-      else:
-        out_grads[i] = _AggregateIndexedSlicesGradients(out_grad)
-    else:  # not out_grad
-      # out_grads[i] is [], thus its aggregation is simply None.
-      out_grads[i] = None
-  return out_grads
-
-
-def _AggregateIndexedSlicesGradients(grads):
-  """Aggregates gradients of type `IndexedSlices` by concatenation."""
-  if len(grads) < 1:
-    return None
-  elif len(grads) == 1:
-    return grads[0]
-  else:
-    grads = math_ops._as_indexed_slices_list(  # pylint: disable=protected-access
-        [g for g in grads if g is not None])
-    grads = [_HandleNestedIndexedSlices(x) for x in grads]  # pylint: disable=protected-access
-    # Form IndexedSlices out of the concatenated values and indices.
-    concat_grad = ops.IndexedSlices(
-        array_ops.concat([x.values for x in grads], axis=0),
-        array_ops.concat([x.indices for x in grads], axis=0),
-        grads[0].dense_shape)
-
-    return concat_grad
+  # pylint: disable=protected-access
+  with ops.get_default_graph()._mutation_lock():
+    return gradients_util._GradientsHelper(
+        ys, xs, grad_ys, name, True, gate_gradients,
+        aggregation_method, stop_gradients,
+        unconnected_gradients)
+  # pylint: enable=protected-access
 
 
 # TODO(vrv): Make this available when we want to make it public.
@@ -1393,7 +361,7 @@ def hessians(ys,
     LookupError: if one of the operations between `xs` and `ys` does not
       have a registered gradient function.
   """
-  xs = _AsList(xs)
+  xs = gradients_util._AsList(xs)  # pylint: disable=protected-access
   kwargs = {
       "colocate_gradients_with_ops": colocate_gradients_with_ops,
       "gate_gradients": gate_gradients,
diff --git a/tensorflow/python/ops/gradients_test.py b/tensorflow/python/ops/gradients_test.py
index abdcbc7a3ac3b2e6d42bacf4ae454e277220f497..9caffa3ea8e8eee8132b24c67bb50b89f98c9a1b 100644
--- a/tensorflow/python/ops/gradients_test.py
+++ b/tensorflow/python/ops/gradients_test.py
@@ -45,6 +45,7 @@ from tensorflow.python.ops import data_flow_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import functional_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import gradients
 from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import gradients_util
 from tensorflow.python.ops import list_ops
 from tensorflow.python.ops import math_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import math_ops
@@ -1027,7 +1028,7 @@ class CustomGradientTest(test_util.TensorFlowTestCase):
           conditional, lambda: alpha * 2, lambda: alpha * 3)
 
       g, = gradients_impl.gradients(output, alpha)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllEqual(g.eval(), [2.0])
       self.assertAllEqual(g.eval(feed_dict={conditional: False}), [3.0])
 
@@ -1040,12 +1041,12 @@ class AggregateIndexedSlicesGradientsTest(test_util.TensorFlowTestCase):
         self.evaluate(ops.convert_to_tensor(right)))
 
   def testNoGradients(self):
-    self.assertIsNone(gradients_impl._AggregateIndexedSlicesGradients([]))
+    self.assertIsNone(gradients_util._AggregateIndexedSlicesGradients([]))
 
   def testOneGradient(self):
     t = math_ops._as_indexed_slices(constant_op.constant(
         [[1., 2.], [0, 0], [3., 4.]]))
-    result = gradients_impl._AggregateIndexedSlicesGradients([t])
+    result = gradients_util._AggregateIndexedSlicesGradients([t])
     self._assert_indexed_slices_equal(t, result)
 
   def testMultipleGradients(self):
@@ -1055,7 +1056,7 @@ class AggregateIndexedSlicesGradientsTest(test_util.TensorFlowTestCase):
         [[0., 0.], [5, 6], [7., 8.]]))
     total = constant_op.constant(
         [[1., 2.], [5, 6], [10., 12.]])
-    result = gradients_impl._AggregateIndexedSlicesGradients([t0, t1])
+    result = gradients_util._AggregateIndexedSlicesGradients([t0, t1])
     self._assert_indexed_slices_equal(total, result)
 
   def testMultipleGradientsWithNones(self):
@@ -1066,7 +1067,7 @@ class AggregateIndexedSlicesGradientsTest(test_util.TensorFlowTestCase):
     t3 = None
     total = constant_op.constant(
         [[1., 2.], [5, 6], [10., 12.]])
-    result = gradients_impl._AggregateIndexedSlicesGradients([t0, t1, t3])
+    result = gradients_util._AggregateIndexedSlicesGradients([t0, t1, t3])
     self._assert_indexed_slices_equal(total, result)
 
   def testMixedTensorAndIndexedSlices(self):
@@ -1076,7 +1077,7 @@ class AggregateIndexedSlicesGradientsTest(test_util.TensorFlowTestCase):
         [[0., 0.], [5, 6], [7., 8.]])
     total = constant_op.constant(
         [[1., 2.], [5, 6], [10., 12.]])
-    result = gradients_impl._AggregateIndexedSlicesGradients([t0, t1])
+    result = gradients_util._AggregateIndexedSlicesGradients([t0, t1])
     self._assert_indexed_slices_equal(total, result)
 
 
diff --git a/tensorflow/python/ops/gradients_util.py b/tensorflow/python/ops/gradients_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..64c199ad29d421ceaa7364fe229b846d76ca2f6b
--- /dev/null
+++ b/tensorflow/python/ops/gradients_util.py
@@ -0,0 +1,1061 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implements the graph generation for computation of gradients."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import contextlib
+import warnings
+
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.core.framework import attr_value_pb2
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function as framework_function
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.framework.func_graph import FuncGraph
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import control_flow_util
+from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops.unconnected_gradients import UnconnectedGradients
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import compat
+from tensorflow.python.util.tf_export import tf_export
+
+
+# Warn the user if we convert a sparse representation to dense with at
+# least this number of elements.
+_LARGE_SPARSE_NUM_ELEMENTS = 100000000
+
+
+def _IndexedSlicesToTensor(value, dtype=None, name=None, as_ref=False):
+  """Converts an IndexedSlices object `value` to a Tensor.
+
+  NOTE(mrry): This function is potentially expensive.
+
+  Args:
+    value: An ops.IndexedSlices object.
+    dtype: The dtype of the Tensor to be returned.
+    name: Optional name to use for the returned Tensor.
+    as_ref: True if a ref is requested.
+
+  Returns:
+    A dense Tensor representing the values in the given IndexedSlices.
+
+  Raises:
+    ValueError: If the IndexedSlices does not have the same dtype.
+  """
+  _ = as_ref
+  if dtype and not dtype.is_compatible_with(value.dtype):
+    raise ValueError(
+        "Tensor conversion requested dtype %s for IndexedSlices with dtype %s" %
+        (dtype.name, value.dtype.name))
+  if value.dense_shape is None:
+    raise ValueError(
+        "Tensor conversion requested for IndexedSlices without dense_shape: %s"
+        % str(value))
+  # TODO(mrry): Consider adding static shape information to
+  # IndexedSlices, to avoid using numpy here.
+  if not context.executing_eagerly():
+    dense_shape_value = tensor_util.constant_value(value.dense_shape)
+    if dense_shape_value is not None:
+      num_elements = np.prod(dense_shape_value)
+      if num_elements >= _LARGE_SPARSE_NUM_ELEMENTS:
+        warnings.warn(
+            "Converting sparse IndexedSlices to a dense Tensor with %d "
+            "elements. This may consume a large amount of memory." %
+            num_elements)
+    else:
+      warnings.warn(
+          "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
+          "This may consume a large amount of memory.")
+  return math_ops.unsorted_segment_sum(
+      value.values, value.indices, value.dense_shape[0], name=name)
+
+
+ops.register_tensor_conversion_function(ops.IndexedSlices,
+                                        _IndexedSlicesToTensor)
+
+
+def _MarkReachedOps(from_ops, reached_ops, func_graphs):
+  """Mark all ops reached from "from_ops".
+
+  Args:
+    from_ops: list of Operations.
+    reached_ops: set of Operations.
+    func_graphs: list of FuncGraphs. This method will traverse through
+      these functions if they capture from_ops or any reachable ops.
+  """
+  queue = collections.deque()
+  queue.extend(from_ops)
+  while queue:
+    op = queue.popleft()
+    if op not in reached_ops:
+      reached_ops.add(op)
+      for output in op.outputs:
+        if _IsBackpropagatable(output):
+          queue.extend(_Consumers(output, func_graphs))
+
+
+def _PendingCount(to_ops, from_ops, colocate_gradients_with_ops, func_graphs,
+                  xs):
+  """Initialize the pending count for ops between two lists of Operations.
+
+  'pending_count[op]' indicates the number of backprop inputs
+  to this operation.
+
+  Args:
+    to_ops: list of Operations.
+    from_ops: list of Operations.
+    colocate_gradients_with_ops: Python bool.  See docstring of gradients().
+    func_graphs: list of FuncGraphs. This method will traverse through
+      these functions if they capture from_ops or any reachable ops. This is
+      useful if to_ops occur in a function and from_ops are in an outer function
+      or graph.
+    xs: list of Tensors.
+
+  Returns:
+    A tuple containing: (1) the subset of to_ops reachable from from_ops by a
+    path of zero or more backpropagatable tensors, (2) a mapping from operation
+    to the number of backprop inputs to that op, and (3) a ControlFlowState
+    object which is not None if the ops between from_ops and to_ops contain
+    control flow loops.
+  """
+  # Mark reachable ops from from_ops.
+  reached_ops = set()
+  _MarkReachedOps(from_ops, reached_ops, func_graphs)
+  # X in reached_ops iff X is reachable from from_ops by a path of zero or more
+  # backpropagatable tensors.
+
+  reachable_to_ops = set(op for op in to_ops if op in reached_ops)
+
+  # Mark between ops.
+  between_ops = set()
+  between_op_list = []
+  queue = collections.deque()
+  queue.extend(to_ops)
+  while queue:
+    op = queue.popleft()
+    # We are interested in this op.
+    if op in reached_ops:
+      between_ops.add(op)
+      between_op_list.append(op)
+      # Clear the boolean so we won't add the inputs again.
+      reached_ops.remove(op)
+      for inp in _NonEagerInputs(op, xs):
+        queue.append(inp.op)
+  # X in between_ops iff X is on a path of zero or more backpropagatable tensors
+  # between from_ops and to_ops
+
+  # 'loop_state' is None if there are no while loops.
+  loop_state = control_flow_ops.MaybeCreateControlFlowState(
+      between_op_list, between_ops, colocate_gradients_with_ops)
+
+  # Initialize pending count for between ops.
+  pending_count = collections.defaultdict(int)
+  for op in between_op_list:
+    for x in _NonEagerInputs(op, xs):
+      if x.op in between_ops:
+        pending_count[x.op] += 1
+
+  return reachable_to_ops, pending_count, loop_state
+
+
+def _AsList(x):
+  return x if isinstance(x, (list, tuple)) else [x]
+
+
+def _DefaultGradYs(grad_ys,
+                   ys,
+                   colocate_gradients_with_ops,
+                   gradient_uid="__unsupported__"):
+  """Fill in default values for grad_ys.
+
+  Args:
+    grad_ys: List of gradients, can contain None.
+    ys: List of tensors.
+    colocate_gradients_with_ops: If True, try colocating gradients with
+      the corresponding op.
+    gradient_uid: A unique identifier within the graph indicating
+      which invocation of gradients is being executed. Used to cluster
+      ops for compilation.
+
+  Returns:
+    A list of gradients to use, without None.
+
+  Raises:
+    ValueError: If sizes of gradients and inputs don't match
+    TypeError: If type of any gradient is not valid for its input.
+  """
+  if len(grad_ys) != len(ys):
+    raise ValueError("Passed %d grad_ys for %d ys" % (len(grad_ys), len(ys)))
+  grad_ys = ops.convert_n_to_tensor_or_indexed_slices(grad_ys, name="grad_y")
+  new_grad_ys = []
+  for i in xrange(len(grad_ys)):
+    grad_y = grad_ys[i]
+    y = ys[i]
+    with _maybe_colocate_with(y.op, gradient_uid, colocate_gradients_with_ops):
+      if grad_y is None:
+        if y.dtype.is_complex:
+          raise TypeError(
+              "Gradients of complex tensors must set grad_ys (y.dtype = %r)" %
+              y.dtype)
+        new_grad_ys.append(
+            array_ops.fill(
+                array_ops.shape(y),
+                constant_op.constant(1, dtype=y.dtype, name="grad_ys_%d" % i)))
+        continue
+      if y.dtype.is_floating or y.dtype.is_integer:
+        if not grad_y.dtype.is_floating and not grad_y.dtype.is_integer:
+          raise TypeError(
+              "Gradient type %s generated for real or "
+              "integer-valued tensor %s with type %s must be "
+              "real or integer" % (dtypes.as_dtype(grad_y.dtype).name, y,
+                                   dtypes.as_dtype(y.dtype).name))
+      elif y.dtype.is_complex:
+        if not grad_y.dtype.is_complex:
+          raise TypeError(
+              "Gradient type %s generated for complex-valued "
+              "tensor %s with type %s must be real" % (dtypes.as_dtype(
+                  grad_y.dtype).name, y, dtypes.as_dtype(y.dtype).name))
+      elif y.dtype == dtypes.variant:
+        if grad_y.dtype != dtypes.variant:
+          raise TypeError(
+              "Gradient type %s generated for variant "
+              "tensor %s with type %s must be variant" % (dtypes.as_dtype(
+                  grad_y.dtype).name, y, dtypes.as_dtype(y.dtype).name))
+      elif y.dtype == dtypes.resource:
+        # We assume y is the handle of a ResourceVariable. The gradient of a
+        # ResourceVariable should be a numeric value, not another resource.
+        if grad_y.dtype == dtypes.resource:
+          raise TypeError("Input gradient %s for resource tensor %s should not "
+                          "be a resource" % (grad_y, y))
+      else:
+        raise TypeError(
+            "Tensor %s with type %s must be numeric "
+            "to obtain a default gradient" % (y, dtypes.as_dtype(y.dtype).name))
+      # Create a grad_y tensor in the name scope of the gradient.
+      # Required for TensorArrays to identify which gradient call a
+      # grad_y value is coming from.
+      if isinstance(grad_y, ops.IndexedSlices):
+        new_grad_ys.append(
+            ops.IndexedSlices(
+                indices=(array_ops.identity(
+                    grad_y.indices, name="grad_ys_%d_indices" % i)
+                         if isinstance(grad_y.indices, ops.Tensor) else
+                         grad_y.indices),
+                values=(array_ops.identity(
+                    grad_y.values, name="grad_ys_%d_values" % i) if isinstance(
+                        grad_y.values, ops.Tensor) else grad_y.values),
+                dense_shape=(array_ops.identity(
+                    grad_y.dense_shape, name="grad_ys_%d_shape" % i)
+                             if isinstance(grad_y.dense_shape, ops.Tensor) else
+                             grad_y.dense_shape)))
+      else:
+        new_grad_ys.append(array_ops.identity(grad_y, name="grad_ys_%d" % i))
+
+  return new_grad_ys
+
+
+def IsTrainable(tensor_or_dtype):
+  if isinstance(tensor_or_dtype, ops.Tensor):
+    dtype = tensor_or_dtype.dtype
+  else:
+    dtype = tensor_or_dtype
+  dtype = dtypes.as_dtype(dtype)
+  return dtype.base_dtype in (dtypes.float16, dtypes.float32, dtypes.float64,
+                              dtypes.complex64, dtypes.complex128,
+                              dtypes.resource, dtypes.variant)
+
+
+def _IsBackpropagatable(tensor):
+  if IsTrainable(tensor):
+    return True
+  dtype = dtypes.as_dtype(tensor.dtype)
+  return dtype.base_dtype == dtypes.bfloat16
+
+
+def _VerifyGeneratedGradients(grads, op):
+  """Verify that gradients are valid in number and type.
+
+  Args:
+    grads: List of generated gradients.
+    op: Operation for which the gradients where generated.
+
+  Raises:
+    ValueError: if sizes of gradients and inputs don't match.
+    TypeError: if type of any gradient is not valid for its input.
+  """
+  # While ops have inputs added to them during the gradient computation, so we
+  # skip the below check. See while_v2 for details.
+  if op.type == "While": return
+
+  if len(grads) != len(op.inputs):
+    raise ValueError("Num gradients %d generated for op %s do not match num "
+                     "inputs %d" % (len(grads), op.node_def, len(op.inputs)))
+
+
+def _StopOps(from_ops, stop_gradient_ops, pending_count, xs):
+  """The set of ops that terminate the gradient computation.
+
+  This computes the frontier of the forward graph *before* which backprop
+  should stop. Operations in the returned set will not be differentiated.
+  This set is defined as the subset of `from_ops` containing ops that have
+  no predecessor in `from_ops`. `pending_count` is the result of
+  `_PendingCount(xs, from_ops)`. An 'op' has predecessors in `from_ops`
+  iff pending_count[op] > 0.
+
+  In addition, none of `stop_gradient_ops` will be differentiated.
+
+  Args:
+    from_ops: list of Operations.
+    stop_gradient_ops: list of Operations never to backprop through.
+    pending_count: mapping from operation to number of backprop inputs.
+    xs: list of Tensors.
+
+  Returns:
+    The set of operations.
+  """
+  stop_ops = set()
+  for op in from_ops:
+    is_stop_op = True
+    for inp in _NonEagerInputs(op, xs):
+      if pending_count[inp.op] > 0:
+        is_stop_op = False
+        break
+    if is_stop_op:
+      stop_ops.add(op)
+  stop_ops.update(op for op in stop_gradient_ops)
+  return stop_ops
+
+
+@contextlib.contextmanager
+def _maybe_colocate_with(op, gradient_uid, colocate_gradients_with_ops):  # pylint: disable=invalid-name
+  """Context to colocate with `op` if `colocate_gradients_with_ops`."""
+  if colocate_gradients_with_ops:
+    with ops._colocate_with_for_gradient(op, gradient_uid):  # pylint: disable=protected-access
+      yield
+  else:
+    yield
+
+
+def _IsPartitionedCall(op):
+  return op.type == "PartitionedCall" or op.type == "StatefulPartitionedCall"
+
+
+def _SymGrad(op, out_grads):
+  """Backprop through a function call node op given its outputs' gradients."""
+  f_in = [x for x in op.inputs] + out_grads
+  f_types = [x.dtype for x in op.inputs]
+  f = attr_value_pb2.NameAttrList()
+  if _IsPartitionedCall(op):
+    f.name = op.get_attr("f").name
+  else:
+    f.name = op.type
+  for k in op.node_def.attr:
+    f.attr[k].CopyFrom(op.node_def.attr[k])
+  # TODO(apassos) use a better dtype here
+  in_grads = functional_ops.symbolic_gradient(
+      input=f_in,
+      Tout=[x if x != dtypes.resource else dtypes.float32 for x in f_types],
+      f=f)
+  return in_grads
+
+
+def _MaybeCompile(scope, op, func, grad_fn):
+  """Compile the calculation in grad_fn if op was marked as compiled."""
+  scope = scope.rstrip("/").replace("/", "_")
+  if func is not None:
+    xla_compile = func.definition.attr["_XlaCompile"].b
+    xla_separate_compiled_gradients = func.definition.attr[
+        "_XlaSeparateCompiledGradients"].b
+    xla_scope = func.definition.attr["_XlaScope"].s.decode()
+  else:
+    try:
+      xla_compile = op.get_attr("_XlaCompile")
+      xla_separate_compiled_gradients = op.get_attr(
+          "_XlaSeparateCompiledGradients")
+      xla_scope = op.get_attr("_XlaScope").decode()
+    except ValueError:
+      return grad_fn()  # Exit early
+
+  if not xla_compile:
+    return grad_fn()  # Exit early
+
+  # If the gradients are supposed to be compiled separately, we give them a
+  # _XlaScope name that is based on the name_scope of the gradients.  Otherwise
+  # they just inherit the existing _XlaScope name, which lets them be merged
+  # together with the non-gradient computation.
+  if xla_separate_compiled_gradients:
+    xla_grad_scope = "%s_grad_%s" % (xla_scope, scope)
+  else:
+    xla_grad_scope = xla_scope
+
+  attrs = {
+      "_XlaCompile": attr_value_pb2.AttrValue(b=xla_compile),
+      "_XlaScope": attr_value_pb2.AttrValue(s=xla_grad_scope.encode())
+  }
+  with ops.get_default_graph()._attr_scope(attrs):  # pylint: disable=protected-access
+    return grad_fn()
+
+
+def _RaiseNoGradWrtInitialLoopValError(op, from_ops, xs):
+  """Raises an error if we backprop through a loop var."""
+  # Find the nearest 'to_op' reachable from 'op' to provide a more helpful error
+  # message.
+  target_op = None
+  queue = collections.deque([op])
+  visited = set()
+  while queue:
+    curr_op = queue.popleft()
+    if curr_op in visited: continue
+    visited.add(curr_op)
+    if curr_op in from_ops:
+      target_op = curr_op
+      break
+    queue.extend(t.op for t in _NonEagerInputs(curr_op, xs))
+  assert target_op
+  raise ValueError(
+      "Cannot compute gradient inside while loop with respect to op '%s'. "
+      "We do not support taking the gradient wrt or through the initial value "
+      "of a loop variable. Gradients can be computed through loop invariants "
+      "or wrt the input parameters to the loop body."
+      % target_op.name)
+
+
+def _IsFunction(graph):
+  return (isinstance(graph, FuncGraph) or
+          isinstance(graph, framework_function._FuncGraph))  # pylint: disable=protected-access
+
+
+def _Captures(func_graph):
+  if isinstance(func_graph, FuncGraph):
+    return func_graph.captures
+  else:
+    assert isinstance(func_graph, framework_function._FuncGraph)  # pylint: disable=protected-access
+    return func_graph._captured  # pylint: disable=protected-access
+
+
+def _MaybeCaptured(t):
+  """If t is a captured value placeholder, returns the original captured value.
+
+  Args:
+    t: Tensor
+
+  Returns:
+    A tensor, potentially from a different Graph/FuncGraph.
+  """
+  # pylint: disable=protected-access
+  if (not isinstance(t, ops.EagerTensor) and
+      _IsFunction(t.op.graph) and t.op.type == "Placeholder"):
+    for input_t, placeholder_t in _Captures(t.op.graph).items():
+      if t == placeholder_t:
+        return _MaybeCaptured(input_t)
+  # pylint: enable=protected-access
+  return t
+
+
+# TODO(skyewm): plumbing xs through everywhere is ugly, consider making
+# _GradientsHelper a class with xs as a member variable.
+def _NonEagerInputs(op, xs):
+  """Returns the inputs of op, crossing closure boundaries where necessary.
+
+  Does not return any captured EagerTensors, i.e., the number of tensors
+  returned may be less than than the actual number of inputs.
+
+  Args:
+    op: Operation
+    xs: list of Tensors we are differentiating w.r.t.
+
+  Returns:
+    A list of tensors. The tensors may be from multiple Graph/FuncGraphs if op
+    is in a FuncGraph and has captured inputs.
+  """
+  if _IsFunction(op.graph):  # pylint: disable=protected-access
+    inputs = []
+    for t in op.inputs:
+      # If we're differentiating w.r.t. `t`, do not attempt to traverse through
+      # it to a captured value. The algorithm needs to "see" `t` in this case,
+      # even if it's a function input for a captured value, whereas usually we'd
+      # like to traverse through these closures as if the captured value was the
+      # direct input to op.
+      if t not in xs:
+        t = _MaybeCaptured(t)
+        # Skip captured eager inputs.
+        if isinstance(t, ops.EagerTensor): continue
+      inputs.append(t)
+    return inputs
+  else:
+    return op.inputs
+
+
+def _Consumers(t, func_graphs):
+  """Returns the consumers of t, crossing closure boundaries where necessary.
+
+  Args:
+    t: Tensor
+    func_graphs: a list of FuncGraphs that may have captured t.
+
+  Returns:
+    A list of tensors. The tensors will be from the current graph and/or
+    func_graphs.
+  """
+  consumers = t.consumers()
+  for func in func_graphs:
+    for input_t, placeholder in _Captures(func).items():
+      if input_t == t:
+        consumers.extend(_Consumers(placeholder, func_graphs))
+  return consumers
+
+
+def _GradientsHelper(ys,
+                     xs,
+                     grad_ys=None,
+                     name="gradients",
+                     colocate_gradients_with_ops=False,
+                     gate_gradients=False,
+                     aggregation_method=None,
+                     stop_gradients=None,
+                     unconnected_gradients=UnconnectedGradients.NONE,
+                     src_graph=None):
+  """Implementation of gradients()."""
+  if context.executing_eagerly():
+    raise RuntimeError("tf.gradients is not supported when eager execution "
+                       "is enabled. Use tf.GradientTape instead.")
+  if src_graph is None:
+    src_graph = ops.get_default_graph()
+  try:
+    unconnected_gradients = UnconnectedGradients(unconnected_gradients)
+  except ValueError:
+    raise ValueError(
+        "Unknown value for unconnected_gradients: %r" % unconnected_gradients)
+
+  # If src_graph is a _FuncGraph (i.e. a function body), gather it and all
+  # ancestor graphs. This is necessary for correctly handling captured values.
+  func_graphs = []
+  curr_graph = src_graph
+  while _IsFunction(curr_graph):
+    func_graphs.append(curr_graph)
+    if isinstance(curr_graph, FuncGraph):
+      curr_graph = curr_graph.outer_graph
+    else:
+      assert isinstance(curr_graph, framework_function._FuncGraph)  # pylint: disable=protected-access
+      curr_graph = curr_graph._outer_graph  # pylint: disable=protected-access
+
+  ys = _AsList(ys)
+  xs = _AsList(xs)
+  stop_gradients = [] if stop_gradients is None else _AsList(stop_gradients)
+  if grad_ys is None:
+    grad_ys = [None] * len(ys)
+  else:
+    grad_ys = _AsList(grad_ys)
+
+  with ops.name_scope(
+      name, "gradients",
+      list(ys) + list(xs) + list(stop_gradients) + list(grad_ys)) as grad_scope:
+    # Get a uid for this call to gradients that can be used to help
+    # cluster ops for compilation.
+    gradient_uid = ops.get_default_graph().unique_name("uid")
+    ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
+    xs = [
+        x.handle if resource_variable_ops.is_resource_variable(x) else x
+        for x in xs
+    ]
+    xs = ops.internal_convert_n_to_tensor_or_indexed_slices(
+        xs, name="x", as_ref=True)
+    grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops,
+                             gradient_uid)
+
+    # The approach we take here is as follows: Create a list of all ops in the
+    # subgraph between the ys and xs.  Visit these ops in reverse order of ids
+    # to ensure that when we visit an op the gradients w.r.t its outputs have
+    # been collected.  Then aggregate these gradients if needed, call the op's
+    # gradient function, and add the generated gradients to the gradients for
+    # its input.
+
+    # Initialize the pending count for ops in the connected subgraph from ys
+    # to the xs.
+    to_ops = [t.op for t in ys]
+    from_ops = [t.op for t in xs]
+    stop_gradient_ops = [t.op for t in stop_gradients]
+    reachable_to_ops, pending_count, loop_state = _PendingCount(
+        to_ops, from_ops, colocate_gradients_with_ops, func_graphs, xs)
+
+    # Iterate over the collected ops.
+    #
+    # grads: op => list of gradients received on each output endpoint of the
+    # op.  The gradients for each endpoint are initially collected as a list.
+    # When it is time to call the op's gradient function, for each endpoint we
+    # aggregate the list of received gradients into a Add() Operation if there
+    # is more than one.
+    grads = {}
+
+    # Add the initial gradients for the ys.
+    for y, grad_y in zip(ys, grad_ys):
+      _SetGrad(grads, y, grad_y)
+
+    # Initialize queue with to_ops.
+    queue = collections.deque()
+    # Add the ops in 'to_ops' into the queue.
+    to_ops_set = set()
+    for op in to_ops:
+      # 'ready' handles the case where one output gradient relies on
+      # another output's gradient.
+      ready = (pending_count[op] == 0)
+      if ready and op not in to_ops_set and op in reachable_to_ops:
+        to_ops_set.add(op)
+        queue.append(op)
+
+    if loop_state:
+      loop_exits = loop_state.ProcessUnusedLoopExits(pending_count, to_ops_set)
+      for y in loop_exits:
+        if IsTrainable(y):
+          _SetGrad(grads, y, loop_state.ZerosLikeForExit(y))
+          queue.append(y.op)
+
+    stop_ops = _StopOps(from_ops, stop_gradient_ops, pending_count, xs)
+    while queue:
+      # generate gradient subgraph for op.
+      op = queue.popleft()
+      with _maybe_colocate_with(op, gradient_uid, colocate_gradients_with_ops):
+        if loop_state:
+          loop_state.EnterGradWhileContext(op, before=True)
+        out_grads = _AggregatedGrads(grads, op, gradient_uid, loop_state,
+                                     aggregation_method)
+        if loop_state:
+          loop_state.ExitGradWhileContext(op, before=True)
+
+        grad_fn = None
+        func_call = None
+        is_partitioned_call = _IsPartitionedCall(op)
+        # pylint: disable=protected-access
+        is_func_call = (
+            src_graph._is_function(op.type) or is_partitioned_call)
+        # pylint: enable=protected-access
+        has_out_grads = any(isinstance(g, ops.Tensor) or g for g in out_grads)
+        if has_out_grads and (op not in stop_ops):
+          try:
+            grad_fn = ops.get_gradient_function(op)
+          except LookupError:
+            if is_func_call:
+              if is_partitioned_call:
+                func_call = src_graph._get_function(  # pylint: disable=protected-access
+                    compat.as_bytes(op.get_attr("f").name))
+              else:
+                func_call = src_graph._get_function(op.type)  # pylint: disable=protected-access
+              # Note that __defun is not set if the graph is
+              # imported. If it's set, we prefer to access the original
+              # defun.
+              func_call = getattr(op, "__defun", func_call)
+              grad_fn = func_call.python_grad_func
+            else:
+              raise LookupError(
+                  "No gradient defined for operation '%s' (op type: %s)" %
+                  (op.name, op.type))
+        if loop_state:
+          loop_state.EnterGradWhileContext(op, before=False)
+
+        # NOTE(skyewm): We don't support computing gradients wrt a loop variable
+        # unless it's within the context of a single iteration (i.e. the
+        # gradient is wrt to the loop parameter in the body function, not wrt or
+        # through the initial value). This means if we're in a while loop
+        # context, we should never see a switch node from this context.
+        # pylint: disable=protected-access
+        if (control_flow_util.IsSwitch(op) and
+            op._control_flow_context is not None and
+            op._control_flow_context.IsWhileContext() and
+            op._control_flow_context ==
+            ops.get_default_graph()._get_control_flow_context()):
+          _RaiseNoGradWrtInitialLoopValError(op, from_ops, xs)
+        # pylint: enable=protected-access
+
+        if (grad_fn or is_func_call) and has_out_grads:
+          # NOTE: If _AggregatedGrads didn't compute a value for the i'th
+          # output, it means that the cost does not depend on output[i],
+          # therefore dC/doutput[i] is 0.
+          for i, out_grad in enumerate(out_grads):
+            if (not isinstance(out_grad, ops.Tensor) and not out_grad) and (
+                (not grad_fn and is_func_call) or IsTrainable(op.outputs[i])):
+              # Only trainable outputs or outputs for a function call that
+              # will use SymbolicGradient get a zero gradient. Gradient
+              # functions should ignore the gradient for other outputs.
+              # TODO(apassos) gradients of resource handles might be an
+              # issue here because of zeros.
+              if loop_state:
+                out_grads[i] = loop_state.ZerosLike(op, i)
+              else:
+                out_grads[i] = control_flow_ops.ZerosLikeOutsideLoop(op, i)
+          with ops.name_scope(op.name + "_grad"):
+            # pylint: disable=protected-access
+            with src_graph._original_op(op):
+              # pylint: enable=protected-access
+              if grad_fn:
+                # If grad_fn was found, do not use SymbolicGradient even for
+                # functions.
+                in_grads = _MaybeCompile(grad_scope, op, func_call,
+                                         lambda: grad_fn(op, *out_grads))
+              else:
+                # For function call ops, we add a 'SymbolicGradient'
+                # node to the graph to compute gradients.
+                in_grads = _MaybeCompile(grad_scope, op, func_call,
+                                         lambda: _SymGrad(op, out_grads))
+              in_grads = _AsList(in_grads)
+              _VerifyGeneratedGradients(in_grads, op)
+              if gate_gradients and len([x for x in in_grads
+                                         if x is not None]) > 1:
+                with ops.device(None):
+                  with ops._colocate_with_for_gradient(  # pylint: disable=protected-access
+                      None,
+                      gradient_uid,
+                      ignore_existing=True):
+                    in_grads = control_flow_ops.tuple(in_grads)
+          _LogOpGradients(op, out_grads, in_grads)
+        else:
+          # If no grad_fn is defined or none of out_grads is available,
+          # just propagate a list of None backwards.
+          in_grads = [None] * len(_NonEagerInputs(op, xs))
+        for i, (t_in, in_grad) in enumerate(zip(_NonEagerInputs(op, xs),
+                                                in_grads)):
+          if in_grad is not None:
+            if (isinstance(in_grad, ops.Tensor) and
+                t_in.dtype != dtypes.resource):
+              try:
+                in_grad.set_shape(t_in.get_shape())
+              except ValueError:
+                raise ValueError(
+                    "Incompatible shapes between op input and calculated "
+                    "input gradient.  Forward operation: %s.  Input index: %d. "
+                    "Original input shape: %s.  "
+                    "Calculated input gradient shape: %s" %
+                    (op.name, i, t_in.shape, in_grad.shape))
+            _SetGrad(grads, t_in, in_grad)
+        if loop_state:
+          loop_state.ExitGradWhileContext(op, before=False)
+
+      # Update pending count for the inputs of op and enqueue ready ops.
+      _UpdatePendingAndEnqueueReady(grads, op, queue, pending_count, loop_state,
+                                    xs)
+
+  if loop_state:
+    loop_state.PostProcessing()
+  return [_GetGrad(grads, x, unconnected_gradients) for x in xs]
+
+
+def _HasAnyNotNoneGrads(grads, op):
+  """Return true iff op has real gradient."""
+  out_grads = _GetGrads(grads, op)
+  for out_grad in out_grads:
+    if isinstance(out_grad, (ops.Tensor, ops.IndexedSlices)):
+      return True
+    if out_grad and isinstance(out_grad, collections.Sequence):
+      if any(g is not None for g in out_grad):
+        return True
+  return False
+
+
+def _UpdatePendingAndEnqueueReady(grads, op, queue, pending_count, loop_state,
+                                  xs):
+  """Update pending count for the inputs of op and enqueue ready ops."""
+  for x in _NonEagerInputs(op, xs):
+    pending_count[x.op] -= 1
+    ready = (pending_count[x.op] == 0)
+    if loop_state and not ready:
+      ready = pending_count[x.op] > 0 and control_flow_util.IsLoopSwitch(x.op)
+    if ready:
+      if control_flow_util.IsLoopExit(x.op):
+        # if x is an exit without real gradient, defer processing them.
+        grad_state = loop_state.GetGradState(x.op, before=False)
+        grad_state.deferred_exits.append(x)
+        grad_state.pending_exits_count -= 1
+        if grad_state.pending_exits_count == 0:
+          # We now have all the exits so process them.
+          has_not_none_grad = False
+          for y in grad_state.deferred_exits:
+            if _HasAnyNotNoneGrads(grads, y.op):
+              has_not_none_grad = True
+              queue.append(y.op)
+            else:
+              grad_state.unused_exits.append(y)
+          if has_not_none_grad:
+            # For an unused exit, if it has trainable outputs, backprop
+            # a zero gradient. Otherwise, just ignore it.
+            for y in grad_state.unused_exits:
+              if IsTrainable(y):
+                _SetGrad(grads, y, loop_state.ZerosLikeForExit(y))
+              queue.append(y.op)
+          else:
+            # All exits are "unused" so use None as gradient.
+            for y in grad_state.unused_exits:
+              queue.append(y.op)
+      else:
+        queue.append(x.op)
+
+
+def _SetGrad(grads, t, grad):
+  """Sets gradient "grad" in "grads" for tensor "t"."""
+  op = t.op
+  op_grads = grads.get(op)
+  if not op_grads:
+    op_grads = [[] for _ in xrange(len(op.outputs))]
+    grads[op] = op_grads
+  t_grads = op_grads[t.value_index]
+  if isinstance(t_grads, list):
+    t_grads.append(grad)
+  else:
+    assert control_flow_util.IsLoopSwitch(op)
+    op_grads[t.value_index] = grad
+
+
+def _GetGrad(grads, t, unconnected_gradients):
+  """Gets gradient for tensor "t"."""
+  op = t.op
+  op_grads = grads.get(op)
+  if not op_grads:
+    if unconnected_gradients == UnconnectedGradients.ZERO:
+      t_dtype = t.dtype if t.dtype != dtypes.resource else dtypes.float32
+      return array_ops.zeros_like(t, dtype=t_dtype)
+    elif unconnected_gradients == UnconnectedGradients.NONE:
+      return None
+    else:
+      raise ValueError(
+          "Unknown value for unconnected_gradients: %r" % unconnected_gradients)
+
+  t_grad = op_grads[t.value_index]
+  assert not isinstance(
+      t_grad, list), ("gradients list should have been aggregated by now.")
+  return t_grad
+
+
+def _GetGrads(grads, op):
+  """Gets all gradients for op."""
+  if op in grads:
+    return grads[op]
+  else:
+    return [[] for _ in xrange(len(op.outputs))]
+
+
+def _HandleNestedIndexedSlices(grad):
+  assert isinstance(grad, ops.IndexedSlices)
+  if isinstance(grad.values, ops.Tensor):
+    return grad
+  else:
+    assert isinstance(grad.values, ops.IndexedSlices)
+    g = _HandleNestedIndexedSlices(grad.values)
+    return ops.IndexedSlices(g.values, array_ops.gather(
+        grad.indices, g.indices), g.dense_shape)
+
+
+def _AccumulatorShape(inputs):
+  shape = tensor_shape.unknown_shape()
+  for i in inputs:
+    if isinstance(i, ops.Tensor):
+      shape = shape.merge_with(i.get_shape())
+  return shape
+
+
+def _LogOpGradients(op, out_grads, in_grads):
+  """Log the in and out grads of an op."""
+  logging.vlog(1, "Gradient for '" + op.name + "'")
+
+  def _FilterGrad(x):
+    if x is None:
+      return False
+    if isinstance(x, (list, tuple)):
+      return bool(x)
+    else:
+      return True
+
+  logging.vlog(1, "  in  --> %s",
+               ", ".join([x.name for x in out_grads if _FilterGrad(x)]))
+  logging.vlog(1, "  out --> %s",
+               ", ".join([x.name for x in in_grads if _FilterGrad(x)]))
+
+
+def _MultiDeviceAddN(tensor_list, gradient_uid):
+  """Adds tensors from potentially multiple devices."""
+  # Basic function structure comes from control_flow_ops.group().
+  # Sort tensors according to their devices.
+  tensors_on_device = collections.defaultdict(lambda: [])
+  for tensor in tensor_list:
+    tensors_on_device[tensor.device].append(tensor)
+
+  # For each device, add the tensors on that device first.
+  # Then gather the partial sums from multiple devices.
+  # TODO(sjhwang): Create hierarchical aggregation tree as pbar's suggestion.
+  # E.g., aggregate per GPU, then per task, and so on.
+  summands = []
+
+  def DeviceKey(dev):
+    return "" if dev is None else dev
+
+  for dev in sorted(tensors_on_device, key=DeviceKey):
+    tensors = tensors_on_device[dev]
+    with ops._colocate_with_for_gradient(  # pylint: disable=protected-access
+        tensors[0].op,
+        gradient_uid,
+        ignore_existing=True):
+      summands.append(math_ops.add_n(tensors))
+
+  return math_ops.add_n(summands)
+
+
+@tf_export("AggregationMethod")
+class AggregationMethod(object):
+  """A class listing aggregation methods used to combine gradients.
+
+  Computing partial derivatives can require aggregating gradient
+  contributions. This class lists the various methods that can
+  be used to combine gradients in the graph:
+
+  *  `ADD_N`: All of the gradient terms are summed as part of one
+     operation using the "AddN" op. It has the property that all
+     gradients must be ready before any aggregation is performed.
+  *  `DEFAULT`: The system-chosen default aggregation method.
+  """
+  ADD_N = 0
+  DEFAULT = ADD_N
+  # The following are experimental and may not be supported in future releases.
+  EXPERIMENTAL_TREE = 1
+  EXPERIMENTAL_ACCUMULATE_N = 2
+
+
+def _AggregatedGrads(grads,
+                     op,
+                     gradient_uid,
+                     loop_state,
+                     aggregation_method=None):
+  """Get the aggregated gradients for op.
+
+  Args:
+    grads: The map of memoized gradients.
+    op: The op to get gradients for.
+    gradient_uid: A unique identifier within the graph indicating
+      which invocation of gradients is being executed. Used to cluster
+      ops for compilation.
+    loop_state: An object for maintaining the state of the while loops in the
+                graph. It is of type ControlFlowState. None if the graph
+                contains no while loops.
+    aggregation_method: Specifies the method used to combine gradient terms.
+      Accepted values are constants defined in the class `AggregationMethod`.
+
+  Returns:
+    A list of gradients, one per each output of `op`. If the gradients
+      for a particular output is a list, this function aggregates it
+      before returning.
+
+  Raises:
+    TypeError: if the incoming grads are not Tensors or IndexedSlices.
+    ValueError: if the arguments are invalid.
+
+  """
+  if aggregation_method is None:
+    aggregation_method = AggregationMethod.DEFAULT
+  if aggregation_method not in [
+      AggregationMethod.ADD_N, AggregationMethod.EXPERIMENTAL_TREE,
+      AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
+  ]:
+    raise ValueError(
+        "Invalid aggregation_method specified %s." % aggregation_method)
+  out_grads = _GetGrads(grads, op)
+  for i, out_grad in enumerate(out_grads):
+    if loop_state:
+      if isinstance(out_grad, (ops.Tensor, ops.IndexedSlices)):
+        assert control_flow_util.IsLoopSwitch(op)
+        continue
+    # Grads have to be Tensors or IndexedSlices
+    if (isinstance(out_grad, collections.Sequence) and not all(
+        isinstance(g, (ops.Tensor, ops.IndexedSlices))
+        for g in out_grad
+        if g is not None
+    )):
+      raise TypeError("gradients have to be either all Tensors "
+                      "or all IndexedSlices")
+    # Aggregate multiple gradients, and convert [] to None.
+    if out_grad:
+      if len(out_grad) < 2:
+        used = "nop"
+        out_grads[i] = out_grad[0]
+      elif all(isinstance(g, ops.Tensor) for g in out_grad if g is not None):
+        tensor_shape = _AccumulatorShape(out_grad)
+        if (aggregation_method == AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
+            and len(out_grad) > 2 and tensor_shape.is_fully_defined()):
+          # The benefit of using AccumulateN is that its inputs can be combined
+          # in any order and this can allow the expression to be evaluated with
+          # a smaller memory footprint.  When used with gpu_allocator_retry,
+          # it is possible to compute a sum of terms which are much larger than
+          # total GPU memory.
+          # AccumulateN can currently only be used if we know the shape for
+          # an accumulator variable.  If this is not known, or if we only have
+          # 2 grads then we fall through to the "tree" case below.
+          used = "accumulate_n"
+          out_grads[i] = math_ops.accumulate_n(out_grad)
+        elif aggregation_method in [
+            AggregationMethod.EXPERIMENTAL_TREE,
+            AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
+        ]:
+          # Aggregate all gradients by doing pairwise sums: this may
+          # reduce performance, but it can improve memory because the
+          # gradients can be released earlier.
+          #
+          # TODO(vrv): Consider replacing this with a version of
+          # tf.AddN() that eagerly frees its inputs as soon as they are
+          # ready, so the order of this tree does not become a problem.
+          used = "tree"
+          with ops.name_scope(op.name + "_gradient_sum"):
+            running_sum = out_grad[0]
+            for grad in out_grad[1:]:
+              running_sum = math_ops.add_n([running_sum, grad])
+            out_grads[i] = running_sum
+        else:
+          used = "add_n"
+          out_grads[i] = _MultiDeviceAddN(out_grad, gradient_uid)
+        logging.vlog(2, "  _AggregatedGrads %d x %s using %s", len(out_grad),
+                     tensor_shape, used)
+      else:
+        out_grads[i] = _AggregateIndexedSlicesGradients(out_grad)
+    else:  # not out_grad
+      # out_grads[i] is [], thus its aggregation is simply None.
+      out_grads[i] = None
+  return out_grads
+
+
+def _AggregateIndexedSlicesGradients(grads):
+  """Aggregates gradients of type `IndexedSlices` by concatenation."""
+  if len(grads) < 1:
+    return None
+  elif len(grads) == 1:
+    return grads[0]
+  else:
+    grads = math_ops._as_indexed_slices_list(  # pylint: disable=protected-access
+        [g for g in grads if g is not None])
+    grads = [_HandleNestedIndexedSlices(x) for x in grads]  # pylint: disable=protected-access
+    # Form IndexedSlices out of the concatenated values and indices.
+    concat_grad = ops.IndexedSlices(
+        array_ops.concat([x.values for x in grads], axis=0),
+        array_ops.concat([x.indices for x in grads], axis=0),
+        grads[0].dense_shape)
+
+    return concat_grad
diff --git a/tensorflow/python/ops/image_grad_test.py b/tensorflow/python/ops/image_grad_test.py
index c481266dd71c1300612dbc384d240d34b98b3599..f363f1b24a058b12740aabf06e93b562475285fa 100644
--- a/tensorflow/python/ops/image_grad_test.py
+++ b/tensorflow/python/ops/image_grad_test.py
@@ -28,6 +28,7 @@ from tensorflow.python.ops import image_ops
 from tensorflow.python.platform import test
 
 
+@test_util.disable_all_xla("b/124289666")  # align_corners=False unimplemented
 class ResizeNearestNeighborOpTest(test.TestCase):
 
   TYPES = [np.float32, np.float64]
@@ -149,6 +150,7 @@ class ResizeBilinearOpTest(test.TestCase):
     self.assertLess(err, 1e-3)
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("b/124290659")  # align_corners=False unimplemented
   def testCompareGpuVsCpu(self):
     in_shape = [2, 4, 6, 3]
     out_shape = [2, 8, 16, 3]
diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index 24d049b726fb93401d916d60c0d37fe85de30719..44bd92f422ab7051cbf0baa0393ea7d552d7cce9 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -1100,7 +1100,8 @@ def resize_images_v2(images,
 def resize_image_with_pad(image,
                           target_height,
                           target_width,
-                          method=ResizeMethod.BILINEAR):
+                          method=ResizeMethod.BILINEAR,
+                          align_corners=False):
   """Resizes and pads an image to a target width and height.
 
   Resizes an image to a target width and height by keeping
@@ -1115,6 +1116,9 @@ def resize_image_with_pad(image,
     target_height: Target height.
     target_width: Target width.
     method: Method to use for resizing image. See `resize_images()`
+    align_corners: bool.  If True, the centers of the 4 corner pixels of the
+        input and output tensors are aligned, preserving the values at the
+        corner pixels. Defaults to `False`.
 
   Raises:
     ValueError: if `target_height` or `target_width` are zero or negative.
@@ -1180,7 +1184,10 @@ def resize_image_with_pad(image,
     p_width = max_(0, math_ops.cast(f_padding_width, dtype=dtypes.int32))
 
     # Resize first, then pad to meet requested dimensions
-    resized = resize_images(image, [resized_height, resized_width], method)
+    resized = resize_images(
+        image, [resized_height, resized_width],
+        method,
+        align_corners=align_corners)
 
     padded = pad_to_bounding_box(resized, p_height, p_width, target_height,
                                  target_width)
@@ -1250,14 +1257,14 @@ def random_brightness(image, max_delta, seed=None):
   interval `[-max_delta, max_delta)`.
 
   Args:
-    image: An image.
+    image: An image or images to adjust.
     max_delta: float, must be non-negative.
     seed: A Python integer. Used to create a random seed. See
       `tf.set_random_seed`
       for behavior.
 
   Returns:
-    The brightness-adjusted image.
+    The brightness-adjusted image(s).
 
   Raises:
     ValueError: if `max_delta` is negative.
@@ -1271,7 +1278,7 @@ def random_brightness(image, max_delta, seed=None):
 
 @tf_export('image.random_contrast')
 def random_contrast(image, lower, upper, seed=None):
-  """Adjust the contrast of an image by a random factor.
+  """Adjust the contrast of an image or images by a random factor.
 
   Equivalent to `adjust_contrast()` but uses a `contrast_factor` randomly
   picked in the interval `[lower, upper]`.
@@ -1281,11 +1288,10 @@ def random_contrast(image, lower, upper, seed=None):
     lower: float.  Lower bound for the random contrast factor.
     upper: float.  Upper bound for the random contrast factor.
     seed: A Python integer. Used to create a random seed. See
-      `tf.set_random_seed`
-      for behavior.
+      `tf.set_random_seed` for behavior.
 
   Returns:
-    The contrast-adjusted tensor.
+    The contrast-adjusted image(s).
 
   Raises:
     ValueError: if `upper <= lower` or if `lower < 0`.
@@ -1305,19 +1311,19 @@ def random_contrast(image, lower, upper, seed=None):
 def adjust_brightness(image, delta):
   """Adjust the brightness of RGB or Grayscale images.
 
-  This is a convenience method that converts an RGB image to float
-  representation, adjusts its brightness, and then converts it back to the
-  original data type. If several adjustments are chained it is advisable to
+  This is a convenience method that converts RGB images to float
+  representation, adjusts their brightness, and then converts them back to the
+  original data type. If several adjustments are chained, it is advisable to
   minimize the number of redundant conversions.
 
-  The value `delta` is added to all components of the tensor `image`. Both
-  `image` and `delta` are converted to `float` before adding (and `image` is
-  scaled appropriately if it is in fixed-point representation). For regular
+  The value `delta` is added to all components of the tensor `image`. `image` is
+  converted to `float` and scaled appropriately if it is in fixed-point
+  representation, and `delta` is converted to the same data type. For regular
   images, `delta` should be in the range `[0,1)`, as it is added to the image in
   floating point representation, where pixel values are in the `[0,1)` range.
 
   Args:
-    image: A tensor.
+    image: RGB image or images to adjust.
     delta: A scalar. Amount to add to the pixel values.
 
   Returns:
@@ -1327,10 +1333,14 @@ def adjust_brightness(image, delta):
     image = ops.convert_to_tensor(image, name='image')
     # Remember original dtype to so we can convert back if needed
     orig_dtype = image.dtype
-    flt_image = convert_image_dtype(image, dtypes.float32)
+
+    if orig_dtype in [dtypes.float16, dtypes.float32]:
+      flt_image = image
+    else:
+      flt_image = convert_image_dtype(image, dtypes.float32)
 
     adjusted = math_ops.add(
-        flt_image, math_ops.cast(delta, dtypes.float32), name=name)
+        flt_image, math_ops.cast(delta, flt_image.dtype), name=name)
 
     return convert_image_dtype(adjusted, orig_dtype, saturate=True)
 
@@ -1339,9 +1349,9 @@ def adjust_brightness(image, delta):
 def adjust_contrast(images, contrast_factor):
   """Adjust contrast of RGB or grayscale images.
 
-  This is a convenience method that converts an RGB image to float
-  representation, adjusts its contrast, and then converts it back to the
-  original data type. If several adjustments are chained it is advisable to
+  This is a convenience method that converts RGB images to float
+  representation, adjusts their contrast, and then converts them back to the
+  original data type. If several adjustments are chained, it is advisable to
   minimize the number of redundant conversions.
 
   `images` is a tensor of at least 3 dimensions.  The last 3 dimensions are
@@ -1366,7 +1376,11 @@ def adjust_contrast(images, contrast_factor):
     images = ops.convert_to_tensor(images, name='images')
     # Remember original dtype to so we can convert back if needed
     orig_dtype = images.dtype
-    flt_images = convert_image_dtype(images, dtypes.float32)
+
+    if orig_dtype in (dtypes.float16, dtypes.float32):
+      flt_images = images
+    else:
+      flt_images = convert_image_dtype(images, dtypes.float32)
 
     adjusted = gen_image_ops.adjust_contrastv2(
         flt_images, contrast_factor=contrast_factor, name=name)
@@ -1560,7 +1574,7 @@ def grayscale_to_rgb(images, name=None):
 # pylint: disable=invalid-name
 @tf_export('image.random_hue')
 def random_hue(image, max_delta, seed=None):
-  """Adjust the hue of an RGB image by a random factor.
+  """Adjust the hue of RGB images by a random factor.
 
   Equivalent to `adjust_hue()` but uses a `delta` randomly
   picked in the interval `[-max_delta, max_delta]`.
@@ -1570,10 +1584,10 @@ def random_hue(image, max_delta, seed=None):
   Args:
     image: RGB image or images. Size of the last dimension must be 3.
     max_delta: float.  Maximum value for the random delta.
-    seed: An operation-specific seed. It will be used in conjunction
-      with the graph-level seed to determine the real seeds that will be
-      used in this operation. Please see the documentation of
-      set_random_seed for its interaction with the graph-level random seed.
+    seed: An operation-specific seed. It will be used in conjunction with the
+      graph-level seed to determine the real seeds that will be used in this
+      operation. Please see the documentation of set_random_seed for its
+      interaction with the graph-level random seed.
 
   Returns:
     Adjusted image(s), same shape and DType as `image`.
@@ -1593,7 +1607,7 @@ def random_hue(image, max_delta, seed=None):
 
 @tf_export('image.adjust_hue')
 def adjust_hue(image, delta, name=None):
-  """Adjust hue of an RGB image.
+  """Adjust hue of RGB images.
 
   This is a convenience method that converts an RGB image to float
   representation, converts it to HSV, add an offset to the hue channel, converts
@@ -1601,7 +1615,7 @@ def adjust_hue(image, delta, name=None):
   are chained it is advisable to minimize the number of redundant conversions.
 
   `image` is an RGB image.  The image hue is adjusted by converting the
-  image to HSV and rotating the hue channel (H) by
+  image(s) to HSV and rotating the hue channel (H) by
   `delta`.  The image is then converted back to RGB.
 
   `delta` must be in the interval `[-1, 1]`.
@@ -1618,7 +1632,10 @@ def adjust_hue(image, delta, name=None):
     image = ops.convert_to_tensor(image, name='image')
     # Remember original dtype to so we can convert back if needed
     orig_dtype = image.dtype
-    flt_image = convert_image_dtype(image, dtypes.float32)
+    if orig_dtype in (dtypes.float16, dtypes.float32):
+      flt_image = image
+    else:
+      flt_image = convert_image_dtype(image, dtypes.float32)
 
     rgb_altered = gen_image_ops.adjust_hue(flt_image, delta)
 
@@ -1696,7 +1713,7 @@ def adjust_jpeg_quality(image, jpeg_quality, name=None):
 
 @tf_export('image.random_saturation')
 def random_saturation(image, lower, upper, seed=None):
-  """Adjust the saturation of an RGB image by a random factor.
+  """Adjust the saturation of RGB images by a random factor.
 
   Equivalent to `adjust_saturation()` but uses a `saturation_factor` randomly
   picked in the interval `[lower, upper]`.
@@ -1705,10 +1722,10 @@ def random_saturation(image, lower, upper, seed=None):
     image: RGB image or images. Size of the last dimension must be 3.
     lower: float.  Lower bound for the random saturation factor.
     upper: float.  Upper bound for the random saturation factor.
-    seed: An operation-specific seed. It will be used in conjunction
-      with the graph-level seed to determine the real seeds that will be
-      used in this operation. Please see the documentation of
-      set_random_seed for its interaction with the graph-level random seed.
+    seed: An operation-specific seed. It will be used in conjunction with the
+      graph-level seed to determine the real seeds that will be used in this
+      operation. Please see the documentation of set_random_seed for its
+      interaction with the graph-level random seed.
 
   Returns:
     Adjusted image(s), same shape and DType as `image`.
@@ -1729,17 +1746,17 @@ def random_saturation(image, lower, upper, seed=None):
 
 @tf_export('image.adjust_saturation')
 def adjust_saturation(image, saturation_factor, name=None):
-  """Adjust saturation of an RGB image.
+  """Adjust saturation of RGB images.
 
-  This is a convenience method that converts an RGB image to float
-  representation, converts it to HSV, add an offset to the saturation channel,
+  This is a convenience method that converts RGB images to float
+  representation, converts them to HSV, add an offset to the saturation channel,
   converts back to RGB and then back to the original data type. If several
   adjustments are chained it is advisable to minimize the number of redundant
   conversions.
 
-  `image` is an RGB image.  The image saturation is adjusted by converting the
-  image to HSV and multiplying the saturation (S) channel by
-  `saturation_factor` and clipping. The image is then converted back to RGB.
+  `image` is an RGB image or images.  The image saturation is adjusted by
+  converting the images to HSV and multiplying the saturation (S) channel by
+  `saturation_factor` and clipping. The images are then converted back to RGB.
 
   Args:
     image: RGB image or images. Size of the last dimension must be 3.
@@ -1753,11 +1770,14 @@ def adjust_saturation(image, saturation_factor, name=None):
     image = ops.convert_to_tensor(image, name='image')
     # Remember original dtype to so we can convert back if needed
     orig_dtype = image.dtype
-    flt_image = convert_image_dtype(image, dtypes.float32)
+    if orig_dtype in (dtypes.float16, dtypes.float32):
+      flt_image = image
+    else:
+      flt_image = convert_image_dtype(image, dtypes.float32)
+
+    adjusted = gen_image_ops.adjust_saturation(flt_image, saturation_factor)
 
-    return convert_image_dtype(
-        gen_image_ops.adjust_saturation(flt_image, saturation_factor),
-        orig_dtype)
+    return convert_image_dtype(adjusted, orig_dtype)
 
 
 @tf_export('io.is_jpeg', 'image.is_jpeg', v1=['io.is_jpeg', 'image.is_jpeg'])
@@ -3053,7 +3073,222 @@ def crop_and_resize_v2(
       image, boxes, box_indices, crop_size, method, extrapolation_value, name)
 
 
-crop_and_resize_deprecation = deprecation.deprecated_args(
+@tf_export(v1=['image.crop_and_resize'])
+@deprecation.deprecated_args(
     None, 'box_ind is deprecated, use box_indices instead', 'box_ind')
-tf_export(v1=['image.crop_and_resize'])(
-    crop_and_resize_deprecation(gen_image_ops.crop_and_resize))
+def crop_and_resize_v1(   # pylint: disable=missing-docstring
+    image,
+    boxes,
+    box_ind=None,
+    crop_size=None,
+    method='bilinear',
+    extrapolation_value=0,
+    name=None,
+    box_indices=None):
+  box_ind = deprecation.deprecated_argument_lookup(
+      "box_indices", box_indices, "box_ind", box_ind)
+  return gen_image_ops.crop_and_resize(
+      image, boxes, box_ind, crop_size, method, extrapolation_value, name)
+
+crop_and_resize_v1.__doc__ = gen_image_ops.crop_and_resize.__doc__
+
+
+@tf_export(v1=['image.extract_glimpse'])
+def extract_glimpse(
+    input,  # pylint: disable=redefined-builtin
+    size,
+    offsets,
+    centered=True,
+    normalized=True,
+    uniform_noise=True,
+    name=None):
+  """Extracts a glimpse from the input tensor.
+
+  Returns a set of windows called glimpses extracted at location
+  `offsets` from the input tensor. If the windows only partially
+  overlaps the inputs, the non overlapping areas will be filled with
+  random noise.
+
+  The result is a 4-D tensor of shape `[batch_size, glimpse_height,
+  glimpse_width, channels]`. The channels and batch dimensions are the
+  same as that of the input tensor. The height and width of the output
+  windows are specified in the `size` parameter.
+
+  The argument `normalized` and `centered` controls how the windows are built:
+
+  * If the coordinates are normalized but not centered, 0.0 and 1.0
+    correspond to the minimum and maximum of each height and width
+    dimension.
+  * If the coordinates are both normalized and centered, they range from
+    -1.0 to 1.0. The coordinates (-1.0, -1.0) correspond to the upper
+    left corner, the lower right corner is located at (1.0, 1.0) and the
+    center is at (0, 0).
+  * If the coordinates are not normalized they are interpreted as
+    numbers of pixels.
+
+  Args:
+    input: A `Tensor` of type `float32`. A 4-D float tensor of shape
+      `[batch_size, height, width, channels]`.
+    size: A `Tensor` of type `int32`. A 1-D tensor of 2 elements containing the
+      size of the glimpses to extract.  The glimpse height must be specified
+      first, following by the glimpse width.
+    offsets: A `Tensor` of type `float32`. A 2-D integer tensor of shape
+      `[batch_size, 2]` containing the y, x locations of the center of each
+      window.
+    centered: An optional `bool`. Defaults to `True`. indicates if the offset
+      coordinates are centered relative to the image, in which case the (0, 0)
+      offset is relative to the center of the input images. If false, the (0,0)
+      offset corresponds to the upper left corner of the input images.
+    normalized: An optional `bool`. Defaults to `True`. indicates if the offset
+      coordinates are normalized.
+    uniform_noise: An optional `bool`. Defaults to `True`. indicates if the
+      noise should be generated using a uniform distribution or a Gaussian
+      distribution.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` of type `float32`.
+  """
+  return gen_image_ops.extract_glimpse(
+      input=input,
+      size=size,
+      offsets=offsets,
+      centered=centered,
+      normalized=normalized,
+      uniform_noise=uniform_noise,
+      name=name)
+
+
+@tf_export('image.extract_glimpse', v1=[])
+def extract_glimpse_v2(
+    input,  # pylint: disable=redefined-builtin
+    size,
+    offsets,
+    centered=True,
+    normalized=True,
+    noise='uniform',
+    name=None):
+  """Extracts a glimpse from the input tensor.
+
+  Returns a set of windows called glimpses extracted at location
+  `offsets` from the input tensor. If the windows only partially
+  overlaps the inputs, the non overlapping areas will be filled with
+  random noise.
+
+  The result is a 4-D tensor of shape `[batch_size, glimpse_height,
+  glimpse_width, channels]`. The channels and batch dimensions are the
+  same as that of the input tensor. The height and width of the output
+  windows are specified in the `size` parameter.
+
+  The argument `normalized` and `centered` controls how the windows are built:
+
+  * If the coordinates are normalized but not centered, 0.0 and 1.0
+    correspond to the minimum and maximum of each height and width
+    dimension.
+  * If the coordinates are both normalized and centered, they range from
+    -1.0 to 1.0. The coordinates (-1.0, -1.0) correspond to the upper
+    left corner, the lower right corner is located at (1.0, 1.0) and the
+    center is at (0, 0).
+  * If the coordinates are not normalized they are interpreted as
+    numbers of pixels.
+
+  Args:
+    input: A `Tensor` of type `float32`. A 4-D float tensor of shape
+      `[batch_size, height, width, channels]`.
+    size: A `Tensor` of type `int32`. A 1-D tensor of 2 elements containing the
+      size of the glimpses to extract.  The glimpse height must be specified
+      first, following by the glimpse width.
+    offsets: A `Tensor` of type `float32`. A 2-D integer tensor of shape
+      `[batch_size, 2]` containing the y, x locations of the center of each
+      window.
+    centered: An optional `bool`. Defaults to `True`. indicates if the offset
+      coordinates are centered relative to the image, in which case the (0, 0)
+      offset is relative to the center of the input images. If false, the (0,0)
+      offset corresponds to the upper left corner of the input images.
+    normalized: An optional `bool`. Defaults to `True`. indicates if the offset
+      coordinates are normalized.
+    noise: An optional `string`. Defaults to `uniform`. indicates if the noise
+      should be `uniform` (uniform distribution), `gaussian` (gaussian
+      distribution), or `zero` (zero padding).
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` of type `float32`.
+  """
+  return gen_image_ops.extract_glimpse(
+      input=input,
+      size=size,
+      offsets=offsets,
+      centered=centered,
+      normalized=normalized,
+      noise=noise,
+      uniform_noise=False,
+      name=name)
+
+
+@tf_export('image.combined_non_max_suppression')
+def combined_non_max_suppression(boxes,
+                                 scores,
+                                 max_output_size_per_class,
+                                 max_total_size,
+                                 iou_threshold=0.5,
+                                 score_threshold=float('-inf'),
+                                 pad_per_class=False,
+                                 name=None):
+  """Greedily selects a subset of bounding boxes in descending order of score.
+
+  This operation performs non_max_suppression on the inputs per batch, across
+  all classes.
+  Prunes away boxes that have high intersection-over-union (IOU) overlap
+  with previously selected boxes.  Bounding boxes are supplied as
+  [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+  diagonal pair of box corners and the coordinates can be provided as normalized
+  (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+  is agnostic to where the origin is in the coordinate system. Also note that
+  this algorithm is invariant to orthogonal transformations and translations
+  of the coordinate system; thus translating or reflections of the coordinate
+  system result in the same boxes being selected by the algorithm.
+  The output of this operation is the final boxes, scores and classes tensor
+  returned after performing non_max_suppression.
+
+  Args:
+    boxes: A 4-D float `Tensor` of shape `[batch_size, num_boxes, q, 4]`. If `q`
+      is 1 then same boxes are used for all classes otherwise, if `q` is equal
+      to number of classes, class-specific boxes are used.
+    scores: A 3-D float `Tensor` of shape `[batch_size, num_boxes, num_classes]`
+      representing a single score corresponding to each box (each row of boxes).
+    max_output_size_per_class: A scalar integer `Tensor` representing the
+      maximum number of boxes to be selected by non max suppression per class
+    max_total_size: A scalar representing maximum number of boxes retained over
+      all classes.
+    iou_threshold: A float representing the threshold for deciding whether boxes
+      overlap too much with respect to IOU.
+    score_threshold: A float representing the threshold for deciding when to
+      remove boxes based on score.
+    pad_per_class: If false, the output nmsed boxes, scores and classes are
+      padded/clipped to `max_total_size`. If true, the output nmsed boxes,
+      scores and classes are padded to be of length
+      `max_size_per_class`*`num_classes`, unless it exceeds `max_total_size` in
+      which case it is clipped to `max_total_size`. Defaults to false.
+    name: A name for the operation (optional).
+
+  Returns:
+    'nmsed_boxes': A [batch_size, max_detections, 4] float32 tensor
+      containing the non-max suppressed boxes.
+    'nmsed_scores': A [batch_size, max_detections] float32 tensor containing
+      the scores for the boxes.
+    'nmsed_classes': A [batch_size, max_detections] float32 tensor
+      containing the class for boxes.
+    'valid_detections': A [batch_size] int32 tensor indicating the number of
+      valid detections per batch item. Only the top valid_detections[i] entries
+      in nms_boxes[i], nms_scores[i] and nms_class[i] are valid. The rest of the
+      entries are zero paddings.
+  """
+  with ops.name_scope(name, 'combined_non_max_suppression'):
+    iou_threshold = ops.convert_to_tensor(
+        iou_threshold, dtype=dtypes.float32, name='iou_threshold')
+    score_threshold = ops.convert_to_tensor(
+        score_threshold, dtype=dtypes.float32, name='score_threshold')
+    return gen_image_ops.combined_non_max_suppression(
+        boxes, scores, max_output_size_per_class, max_total_size, iou_threshold,
+        score_threshold, pad_per_class)
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index e7249333bd35d07821004a39c3c78e52c1ee904d..490e80e09dbf026552f182d065a8dde9d8e5a429 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -61,7 +61,7 @@ class RGBToHSVTest(test_util.TensorFlowTestCase):
       inp = np.random.rand(*shape).astype(nptype)
 
       # Convert to HSV and back, as a batch and individually
-      with self.test_session(use_gpu=True) as sess:
+      with self.cached_session(use_gpu=True) as sess:
         batch0 = constant_op.constant(inp)
         batch1 = image_ops.rgb_to_hsv(batch0)
         batch2 = image_ops.hsv_to_rgb(batch1)
@@ -82,7 +82,7 @@ class RGBToHSVTest(test_util.TensorFlowTestCase):
     data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
     for nptype in [np.float32, np.float64]:
       rgb_np = np.array(data, dtype=nptype).reshape([2, 2, 3]) / 255.
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         hsv = image_ops.rgb_to_hsv(rgb_np)
         rgb = image_ops.hsv_to_rgb(hsv)
         rgb_tf = self.evaluate(rgb)
@@ -101,7 +101,7 @@ class RGBToYIQTest(test_util.TensorFlowTestCase):
       inp = np.random.rand(*shape).astype(nptype)
 
       # Convert to YIQ and back, as a batch and individually
-      with self.test_session(use_gpu=True) as sess:
+      with self.cached_session(use_gpu=True) as sess:
         batch0 = constant_op.constant(inp)
         batch1 = image_ops.rgb_to_yiq(batch0)
         batch2 = image_ops.yiq_to_rgb(batch1)
@@ -131,7 +131,7 @@ class RGBToYUVTest(test_util.TensorFlowTestCase):
       inp = np.random.rand(*shape).astype(nptype)
 
       # Convert to YUV and back, as a batch and individually
-      with self.test_session(use_gpu=True) as sess:
+      with self.cached_session(use_gpu=True) as sess:
         batch0 = constant_op.constant(inp)
         batch1 = image_ops.rgb_to_yuv(batch0)
         batch2 = image_ops.yuv_to_rgb(batch1)
@@ -173,7 +173,7 @@ class GrayscaleToRGBTest(test_util.TensorFlowTestCase):
   def _TestRGBToGrayscale(self, x_np):
     y_np = self._RGBToGrayscale(x_np)
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.rgb_to_grayscale(x_tf)
       y_tf = self.evaluate(y)
@@ -195,7 +195,7 @@ class GrayscaleToRGBTest(test_util.TensorFlowTestCase):
     y_np = np.array(
         [[1, 1, 1], [2, 2, 2]], dtype=np.uint8).reshape([1, 1, 2, 3])
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.grayscale_to_rgb(x_tf)
       y_tf = self.evaluate(y)
@@ -205,7 +205,7 @@ class GrayscaleToRGBTest(test_util.TensorFlowTestCase):
     x_np = np.array([[1, 2]], dtype=np.uint8).reshape([1, 2, 1])
     y_np = np.array([[1, 1, 1], [2, 2, 2]], dtype=np.uint8).reshape([1, 2, 3])
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.grayscale_to_rgb(x_tf)
       y_tf = self.evaluate(y)
@@ -216,23 +216,23 @@ class GrayscaleToRGBTest(test_util.TensorFlowTestCase):
     # Shape inference works and produces expected output where possible
     rgb_shape = [7, None, 19, 3]
     gray_shape = rgb_shape[:-1] + [1]
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       rgb_tf = array_ops.placeholder(dtypes.uint8, shape=rgb_shape)
       gray = image_ops.rgb_to_grayscale(rgb_tf)
       self.assertEqual(gray_shape, gray.get_shape().as_list())
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       gray_tf = array_ops.placeholder(dtypes.uint8, shape=gray_shape)
       rgb = image_ops.grayscale_to_rgb(gray_tf)
       self.assertEqual(rgb_shape, rgb.get_shape().as_list())
 
     # Shape inference does not break for unknown shapes
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       rgb_tf_unknown = array_ops.placeholder(dtypes.uint8)
       gray_unknown = image_ops.rgb_to_grayscale(rgb_tf_unknown)
       self.assertFalse(gray_unknown.get_shape())
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       gray_tf_unknown = array_ops.placeholder(dtypes.uint8)
       rgb_unknown = image_ops.grayscale_to_rgb(gray_tf_unknown)
       self.assertFalse(rgb_unknown.get_shape())
@@ -364,7 +364,7 @@ class AdjustHueTest(test_util.TensorFlowTestCase):
     y_data = [0, 13, 1, 54, 226, 59, 8, 234, 150, 255, 39, 1]
     y_np = np.array(y_data, dtype=np.uint8).reshape(x_shape)
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.adjust_hue(x, delta)
       y_tf = self.evaluate(y)
@@ -379,7 +379,7 @@ class AdjustHueTest(test_util.TensorFlowTestCase):
     y_data = [13, 0, 11, 226, 54, 221, 234, 8, 92, 1, 217, 255]
     y_np = np.array(y_data, dtype=np.uint8).reshape(x_shape)
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.adjust_hue(x, delta)
       y_tf = self.evaluate(y)
@@ -394,7 +394,7 @@ class AdjustHueTest(test_util.TensorFlowTestCase):
     y_data = [13, 0, 11, 226, 54, 221, 234, 8, 92, 1, 217, 255]
     y_np = np.array(y_data, dtype=np.uint8).reshape(x_shape)
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.adjust_hue(x, delta)
       y_tf = self.evaluate(y)
@@ -419,7 +419,7 @@ class AdjustHueTest(test_util.TensorFlowTestCase):
     return y_v.reshape(x_np.shape)
 
   def _adjustHueTf(self, x_np, delta_h):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x = constant_op.constant(x_np)
       y = image_ops.adjust_hue(x, delta_h)
       y_tf = self.evaluate(y)
@@ -850,7 +850,7 @@ class AdjustSaturationTest(test_util.TensorFlowTestCase):
     y_data = [6, 9, 13, 140, 180, 226, 135, 121, 234, 172, 255, 128]
     y_np = np.array(y_data, dtype=np.uint8).reshape(x_shape)
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.adjust_saturation(x, saturation_factor)
       y_tf = self.evaluate(y)
@@ -865,7 +865,7 @@ class AdjustSaturationTest(test_util.TensorFlowTestCase):
     y_data = [0, 5, 13, 0, 106, 226, 30, 0, 234, 89, 255, 0]
     y_np = np.array(y_data, dtype=np.uint8).reshape(x_shape)
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.adjust_saturation(x, saturation_factor)
       y_tf = self.evaluate(y)
@@ -880,50 +880,12 @@ class AdjustSaturationTest(test_util.TensorFlowTestCase):
     y_data = [6, 9, 13, 140, 180, 226, 135, 121, 234, 172, 255, 128]
     y_np = np.array(y_data, dtype=np.uint8).reshape(x_shape)
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.adjust_saturation(x, saturation_factor)
       y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
-  def _adjust_saturation(self, image, saturation_factor):
-    image = ops.convert_to_tensor(image, name="image")
-    orig_dtype = image.dtype
-    flt_image = image_ops.convert_image_dtype(image, dtypes.float32)
-    saturation_adjusted_image = gen_image_ops.adjust_saturation(
-        flt_image, saturation_factor)
-    return image_ops.convert_image_dtype(saturation_adjusted_image, orig_dtype)
-
-  def testHalfSaturationFused(self):
-    x_shape = [2, 2, 3]
-    x_rgb_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
-    x_np = np.array(x_rgb_data, dtype=np.uint8).reshape(x_shape)
-
-    saturation_factor = 0.5
-    y_rgb_data = [6, 9, 13, 140, 180, 226, 135, 121, 234, 172, 255, 128]
-    y_np = np.array(y_rgb_data, dtype=np.uint8).reshape(x_shape)
-
-    with self.test_session(use_gpu=True):
-      x = constant_op.constant(x_np, shape=x_shape)
-      y = self._adjust_saturation(x, saturation_factor)
-      y_tf = self.evaluate(y)
-      self.assertAllEqual(y_tf, y_np)
-
-  def testTwiceSaturationFused(self):
-    x_shape = [2, 2, 3]
-    x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
-    x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape)
-
-    saturation_factor = 2.0
-    y_data = [0, 5, 13, 0, 106, 226, 30, 0, 234, 89, 255, 0]
-    y_np = np.array(y_data, dtype=np.uint8).reshape(x_shape)
-
-    with self.test_session(use_gpu=True):
-      x = constant_op.constant(x_np, shape=x_shape)
-      y = self._adjust_saturation(x, saturation_factor)
-      y_tf = self.evaluate(y)
-      self.assertAllEqual(y_tf, y_np)
-
   def _adjustSaturationNp(self, x_np, scale):
     self.assertEqual(x_np.shape[-1], 3)
     x_v = x_np.reshape([-1, 3])
@@ -958,7 +920,7 @@ class AdjustSaturationTest(test_util.TensorFlowTestCase):
         "gb_same",
         "rgb_same",
     ]
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       for x_shape in x_shapes:
         for test_style in test_styles:
           x_np = np.random.rand(*x_shape) * 255.
@@ -977,7 +939,7 @@ class AdjustSaturationTest(test_util.TensorFlowTestCase):
           else:
             raise AssertionError("Invalid test style: %s" % (test_style))
           y_baseline = self._adjustSaturationNp(x_np, scale)
-          y_fused = self._adjust_saturation(x_np, scale).eval()
+          y_fused = image_ops.adjust_saturation(x_np, scale).eval()
           self.assertAllClose(y_fused, y_baseline, rtol=2e-5, atol=1e-5)
 
 
@@ -985,7 +947,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
 
   def testInvolutionLeftRight(self):
     x_np = np.array([[1, 2, 3], [1, 2, 3]], dtype=np.uint8).reshape([2, 3, 1])
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_left_right(image_ops.flip_left_right(x_tf))
       y_tf = self.evaluate(y)
@@ -995,7 +957,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     x_np = np.array(
         [[[1, 2, 3], [1, 2, 3]], [[1, 2, 3], [1, 2, 3]]],
         dtype=np.uint8).reshape([2, 2, 3, 1])
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_left_right(image_ops.flip_left_right(x_tf))
       y_tf = self.evaluate(y)
@@ -1006,7 +968,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     x_np = np.array([[1, 2, 3], [1, 2, 3]], dtype=np.uint8).reshape([2, 3, 1])
     y_np = np.array([[3, 2, 1], [3, 2, 1]], dtype=np.uint8).reshape([2, 3, 1])
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_left_right(x_tf)
       self.assertTrue(y.op.name.startswith("flip_left_right"))
@@ -1021,7 +983,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
         [[[3, 2, 1], [3, 2, 1]], [[3, 2, 1], [3, 2, 1]]],
         dtype=np.uint8).reshape([2, 2, 3, 1])
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_left_right(x_tf)
       y_tf = self.evaluate(y)
@@ -1033,7 +995,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     y_np = np.array([[3, 2, 1], [3, 2, 1]], dtype=np.uint8).reshape([2, 3, 1])
     seed = 42
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.random_flip_left_right(x_tf, seed=seed)
       self.assertTrue(y.op.name.startswith("random_flip_left_right"))
@@ -1073,7 +1035,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     x_np = np.vstack([x_np_raw for _ in range(batch_size)])
     y_np = np.vstack([y_np_raw for _ in range(batch_size)])
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.random_flip_left_right(x_tf, seed=seed)
       self.assertTrue(y.op.name.startswith("random_flip_left_right"))
@@ -1104,7 +1066,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
   def testInvolutionUpDown(self):
     x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1])
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_up_down(image_ops.flip_up_down(x_tf))
       y_tf = self.evaluate(y)
@@ -1115,7 +1077,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
         [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]],
         dtype=np.uint8).reshape([2, 2, 3, 1])
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_up_down(image_ops.flip_up_down(x_tf))
       y_tf = self.evaluate(y)
@@ -1126,7 +1088,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1])
     y_np = np.array([[4, 5, 6], [1, 2, 3]], dtype=np.uint8).reshape([2, 3, 1])
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_up_down(x_tf)
       self.assertTrue(y.op.name.startswith("flip_up_down"))
@@ -1141,7 +1103,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
         [[[4, 5, 6], [1, 2, 3]], [[10, 11, 12], [7, 8, 9]]],
         dtype=np.uint8).reshape([2, 2, 3, 1])
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_up_down(x_tf)
       y_tf = self.evaluate(y)
@@ -1154,7 +1116,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
 
     seed = 42
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.random_flip_up_down(x_tf, seed=seed)
       self.assertTrue(y.op.name.startswith("random_flip_up_down"))
@@ -1193,7 +1155,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     x_np = np.vstack([x_np_raw for _ in range(batch_size)])
     y_np = np.vstack([y_np_raw for _ in range(batch_size)])
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.random_flip_up_down(x_tf, seed=seed)
       self.assertTrue(y.op.name.startswith("random_flip_up_down"))
@@ -1224,7 +1186,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
   def testInvolutionTranspose(self):
     x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1])
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.transpose_image(image_ops.transpose_image(x_tf))
       y_tf = self.evaluate(y)
@@ -1235,7 +1197,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
         [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]],
         dtype=np.uint8).reshape([2, 2, 3, 1])
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.transpose_image(image_ops.transpose_image(x_tf))
       y_tf = self.evaluate(y)
@@ -1246,7 +1208,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1])
     y_np = np.array([[1, 4], [2, 5], [3, 6]], dtype=np.uint8).reshape([3, 2, 1])
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.transpose_image(x_tf)
       self.assertTrue(y.op.name.startswith("transpose"))
@@ -1262,7 +1224,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
         [[[1, 4], [2, 5], [3, 6]], [[7, 10], [8, 11], [9, 12]]],
         dtype=np.uint8).reshape([2, 3, 2, 1])
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.transpose_image(x_tf)
       y_tf = self.evaluate(y)
@@ -1313,7 +1275,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
 
   def testRot90GroupOrder(self):
     image = np.arange(24, dtype=np.uint8).reshape([2, 4, 3])
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       rotated = image
       for _ in xrange(4):
         rotated = image_ops.rot90(rotated)
@@ -1321,7 +1283,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
 
   def testRot90GroupOrderWithBatch(self):
     image = np.arange(48, dtype=np.uint8).reshape([2, 2, 4, 3])
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       rotated = image
       for _ in xrange(4):
         rotated = image_ops.rot90(rotated)
@@ -1330,7 +1292,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
   @test_util.run_deprecated_v1
   def testRot90NumpyEquivalence(self):
     image = np.arange(24, dtype=np.uint8).reshape([2, 4, 3])
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       k_placeholder = array_ops.placeholder(dtypes.int32, shape=[])
       y_tf = image_ops.rot90(image, k_placeholder)
       for k in xrange(4):
@@ -1340,7 +1302,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
   @test_util.run_deprecated_v1
   def testRot90NumpyEquivalenceWithBatch(self):
     image = np.arange(48, dtype=np.uint8).reshape([2, 2, 4, 3])
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       k_placeholder = array_ops.placeholder(dtypes.int32, shape=[])
       y_tf = image_ops.rot90(image, k_placeholder)
       for k in xrange(4):
@@ -1350,7 +1312,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
 class AdjustContrastTest(test_util.TensorFlowTestCase):
 
   def _testContrast(self, x_np, y_np, contrast_factor):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.adjust_contrast(x, contrast_factor)
       y_tf = self.evaluate(y)
@@ -1405,7 +1367,7 @@ class AdjustContrastTest(test_util.TensorFlowTestCase):
     return y_np
 
   def _adjustContrastTf(self, x_np, contrast_factor):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x = constant_op.constant(x_np)
       y = image_ops.adjust_contrast(x, contrast_factor)
       y_tf = self.evaluate(y)
@@ -1438,12 +1400,12 @@ class AdjustContrastTest(test_util.TensorFlowTestCase):
 
 class AdjustBrightnessTest(test_util.TensorFlowTestCase):
 
-  def _testBrightness(self, x_np, y_np, delta):
-    with self.test_session(use_gpu=True):
+  def _testBrightness(self, x_np, y_np, delta, tol=1e-6):
+    with self.cached_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.adjust_brightness(x, delta)
       y_tf = self.evaluate(y)
-      self.assertAllClose(y_tf, y_np, 1e-6)
+      self.assertAllClose(y_tf, y_np, tol)
 
   def testPositiveDeltaUint8(self):
     x_shape = [2, 2, 3]
@@ -1455,7 +1417,7 @@ class AdjustBrightnessTest(test_util.TensorFlowTestCase):
 
     self._testBrightness(x_np, y_np, delta=10. / 255.)
 
-  def testPositiveDeltaFloat(self):
+  def testPositiveDeltaFloat32(self):
     x_shape = [2, 2, 3]
     x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
     x_np = np.array(x_data, dtype=np.float32).reshape(x_shape) / 255.
@@ -1465,6 +1427,16 @@ class AdjustBrightnessTest(test_util.TensorFlowTestCase):
 
     self._testBrightness(x_np, y_np, delta=10. / 255.)
 
+  def testPositiveDeltaFloat16(self):
+    x_shape = [2, 2, 3]
+    x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
+    x_np = np.array(x_data, dtype=np.float16).reshape(x_shape) / 255.
+
+    y_data = [10, 15, 23, 64, 145, 236, 47, 18, 244, 100, 265, 11]
+    y_np = np.array(y_data, dtype=np.float16).reshape(x_shape) / 255.
+
+    self._testBrightness(x_np, y_np, delta=10. / 255., tol=1e-3)
+
   def testNegativeDelta(self):
     x_shape = [2, 2, 3]
     x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
@@ -1496,7 +1468,7 @@ class PerImageWhiteningTest(test_util.TensorFlowTestCase):
     x_np = np.arange(0, np.prod(x_shape), dtype=np.int32).reshape(x_shape)
     y_np = self._NumpyPerImageWhitening(x_np)
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.per_image_standardization(x)
       self.assertTrue(y.op.name.startswith("per_image_standardization"))
@@ -1507,14 +1479,14 @@ class PerImageWhiteningTest(test_util.TensorFlowTestCase):
     im_np = np.ones([19, 19, 3]).astype(np.float32) * 249
     im = constant_op.constant(im_np)
     whiten = image_ops.per_image_standardization(im)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       whiten_np = self.evaluate(whiten)
       self.assertFalse(np.any(np.isnan(whiten_np)))
 
   def testBatchWhitening(self):
     imgs_np = np.random.uniform(0., 255., [4, 24, 24, 3])
     whiten_np = [self._NumpyPerImageWhitening(img) for img in imgs_np]
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       imgs = constant_op.constant(imgs_np)
       whiten = image_ops.per_image_standardization(imgs)
       whiten_tf = self.evaluate(whiten)
@@ -1542,7 +1514,7 @@ class CropToBoundingBoxTest(test_util.TensorFlowTestCase):
     if not use_tensor_inputs:
       self.assertTrue(y.get_shape().is_fully_defined())
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       return y.eval(feed_dict=feed_dict)
 
   def _assertReturns(self,
@@ -1721,7 +1693,7 @@ class CentralCropTest(test_util.TensorFlowTestCase):
     for x_shape in x_shapes:
       x_np = np.ones(x_shape, dtype=np.float32)
       for use_gpu in [True, False]:
-        with self.test_session(use_gpu=use_gpu):
+        with self.cached_session(use_gpu=use_gpu):
           x = constant_op.constant(x_np, shape=x_shape)
           y = image_ops.central_crop(x, 1.0)
           y_tf = self.evaluate(y)
@@ -1736,7 +1708,7 @@ class CentralCropTest(test_util.TensorFlowTestCase):
         dtype=np.int32).reshape(x_shape)
     y_np = np.array([[3, 4, 5, 6], [3, 4, 5, 6]]).reshape([2, 4, 1])
     for use_gpu in [True, False]:
-      with self.test_session(use_gpu=use_gpu):
+      with self.cached_session(use_gpu=use_gpu):
         x = constant_op.constant(x_np, shape=x_shape)
         y = image_ops.central_crop(x, 0.5)
         y_tf = self.evaluate(y)
@@ -1752,7 +1724,7 @@ class CentralCropTest(test_util.TensorFlowTestCase):
         dtype=np.int32).reshape(x_shape)
     y_np = np.array([[[3, 4, 5, 6], [3, 4, 5, 6]],
                      [[6, 5, 4, 3], [6, 5, 4, 3]]]).reshape([2, 2, 4, 1])
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.central_crop(x, 0.5)
       y_tf = self.evaluate(y)
@@ -1769,7 +1741,7 @@ class CentralCropTest(test_util.TensorFlowTestCase):
       x_np = np.zeros(x_shape, dtype=np.int32)
       y_np = np.zeros(y_shape, dtype=np.int32)
       for use_gpu in [True, False]:
-        with self.test_session(use_gpu=use_gpu):
+        with self.cached_session(use_gpu=use_gpu):
           x = array_ops.placeholder(shape=x_shape, dtype=dtypes.int32)
           y = image_ops.central_crop(x, 0.33)
           y_tf = y.eval(feed_dict={x: x_np})
@@ -1820,7 +1792,7 @@ class CentralCropTest(test_util.TensorFlowTestCase):
     x_shape = [13, 9, 3]
     x_np = np.ones(x_shape, dtype=np.float32)
     for use_gpu in [True, False]:
-      with self.test_session(use_gpu=use_gpu):
+      with self.cached_session(use_gpu=use_gpu):
         x = constant_op.constant(x_np, shape=x_shape)
         with self.assertRaises(ValueError):
           _ = image_ops.central_crop(x, 0.0)
@@ -1832,7 +1804,7 @@ class CentralCropTest(test_util.TensorFlowTestCase):
     for x_shape in x_shapes:
       x_np = np.ones(x_shape, dtype=np.float32)
       for use_gpu in [True, False]:
-        with self.test_session(use_gpu=use_gpu):
+        with self.cached_session(use_gpu=use_gpu):
           x = constant_op.constant(x_np, shape=x_shape)
           with self.assertRaises(ValueError):
             _ = image_ops.central_crop(x, 0.5)
@@ -1842,7 +1814,7 @@ class CentralCropTest(test_util.TensorFlowTestCase):
     x_shape = [13, 9, 3]
     x_np = np.ones(x_shape, dtype=np.float32)
     for use_gpu in [True, False]:
-      with self.test_session(use_gpu=use_gpu):
+      with self.cached_session(use_gpu=use_gpu):
         y = image_ops.central_crop(x_np, 1.0)
         self.assertTrue(y.op.name.startswith("central_crop"))
 
@@ -1867,7 +1839,7 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase):
     if not use_tensor_inputs:
       self.assertTrue(y.get_shape().is_fully_defined())
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       return y.eval(feed_dict=feed_dict)
 
   def _assertReturns(self,
@@ -1927,7 +1899,7 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase):
 
     i = constant_op.constant([1, 0, 4, 3], dtype=dtypes.int64)
     y_tf = image_ops.pad_to_bounding_box(x, i[0], i[1], i[2], i[3])
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       self.assertAllClose(y, self.evaluate(y_tf))
 
   @test_util.run_deprecated_v1
@@ -2062,7 +2034,7 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
     fraction_object_covered = []
 
     num_iter = 1000
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       image_tf = constant_op.constant(image, shape=image.shape)
       image_size_tf = constant_op.constant(
           image_size_np, shape=image_size_np.shape)
@@ -2192,7 +2164,7 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
 
   @test_util.run_deprecated_v1
   def testSampleDistortedBoundingBoxShape(self):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       image_size = constant_op.constant(
           [40, 50, 1], shape=[3], dtype=dtypes.int32)
       bounding_box = constant_op.constant(
@@ -2230,7 +2202,7 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
 
   def testDefaultMinObjectCovered(self):
     # By default min_object_covered=0.1 if not provided
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       image_size = constant_op.constant(
           [40, 50, 1], shape=[3], dtype=dtypes.int32)
       bounding_box = constant_op.constant(
@@ -2303,7 +2275,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
       img_np = np.array(data, dtype=nptype).reshape(img_shape)
 
       for opt in self.OPTIONS:
-        with self.test_session(use_gpu=True) as sess:
+        with self.cached_session(use_gpu=True) as sess:
           image = constant_op.constant(img_np, shape=img_shape)
           y = image_ops.resize_images(image, [target_height, target_width], opt)
           yshape = array_ops.shape(y)
@@ -2312,7 +2284,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
           self.assertAllClose(resized, img_np, atol=1e-5)
 
       # Resizing with a single image must leave the shape unchanged also.
-      with self.test_session(use_gpu=True):
+      with self.cached_session(use_gpu=True):
         img_single = img_np.reshape(single_shape)
         image = constant_op.constant(img_single, shape=single_shape)
         y = image_ops.resize_images(image, [target_height, target_width],
@@ -2336,7 +2308,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
     img_np = np.array(data, dtype=np.uint8).reshape(img_shape)
 
     for opt in self.OPTIONS:
-      with self.test_session(use_gpu=True) as sess:
+      with self.cached_session(use_gpu=True) as sess:
         image = constant_op.constant(img_np, shape=img_shape)
         y = image_ops.resize_images(image, new_size, opt)
         yshape = array_ops.shape(y)
@@ -2345,7 +2317,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
         self.assertAllClose(resized, img_np, atol=1e-5)
 
     # Resizing with a single image must leave the shape unchanged also.
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       img_single = img_np.reshape(single_shape)
       image = constant_op.constant(img_single, shape=single_shape)
       y = image_ops.resize_images(image, new_size, self.OPTIONS[0])
@@ -2427,6 +2399,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(img_shape, newshape)
         self.assertAllClose(resized, img_np, atol=1e-5)
 
+  @test_util.disable_xla("b/124289666")  # align_corners=False unimplemented
   def testResizeDown(self):
     # This test is also conducted with int8, so 127 is the maximum
     # value that can be used.
@@ -2450,7 +2423,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
 
         for opt in self.OPTIONS:
           if test.is_gpu_available() and self.shouldRunOnGPU(opt, nptype):
-            with self.test_session(use_gpu=True):
+            with self.cached_session(use_gpu=True):
               image = constant_op.constant(img_np, shape=img_shape)
               y = image_ops.resize_images(image, [target_height, target_width],
                                           opt)
@@ -2458,6 +2431,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
               resized = self.evaluate(y)
               self.assertAllClose(resized, expected, atol=1e-5)
 
+  @test_util.disable_xla("b/124289666")  # align_corners=False unimplemented
   def testResizeUpAlignCornersFalse(self):
     img_shape = [1, 3, 2, 1]
     data = [64, 32, 32, 64, 50, 100]
@@ -2485,7 +2459,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
           image_ops.ResizeMethod.BILINEAR,
           image_ops.ResizeMethod.NEAREST_NEIGHBOR, image_ops.ResizeMethod.AREA
       ]:
-        with self.test_session(use_gpu=True):
+        with self.cached_session(use_gpu=True):
           img_np = np.array(data, dtype=nptype).reshape(img_shape)
           image = constant_op.constant(img_np, shape=img_shape)
           y = image_ops.resize_images(
@@ -2521,7 +2495,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
           image_ops.ResizeMethod.BILINEAR,
           image_ops.ResizeMethod.NEAREST_NEIGHBOR, image_ops.ResizeMethod.AREA
       ]:
-        with self.test_session(use_gpu=True):
+        with self.cached_session(use_gpu=True):
           img_np = np.array(data, dtype=nptype).reshape(img_shape)
           image = constant_op.constant(img_np, shape=img_shape)
           y = image_ops.resize_images(
@@ -2549,7 +2523,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
         75, 81, 80, 72, 69, 70, 105, 112, 75, 36, 45, 92, 111, 105
     ]
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       image = constant_op.constant(img_np, shape=img_shape)
       y = image_ops.resize_images(image, [target_height, target_width],
                                   image_ops.ResizeMethod.BICUBIC)
@@ -2572,7 +2546,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
         73, 33, 23, 39, 73, 33, 23, 39, 14, 16, 19, 21, 14, 16, 19, 21
     ]
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       image = constant_op.constant(img_np, shape=img_shape)
       y = image_ops.resize_images(image, [target_height, target_width],
                                   image_ops.ResizeMethod.AREA)
@@ -2581,6 +2555,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
       resized = self.evaluate(y)
       self.assertAllClose(resized, expected, atol=1)
 
+  @test_util.disable_xla("b/124289666")  # align_corners=False unimplemented
   def testCompareNearestNeighbor(self):
     if test.is_gpu_available():
       input_shape = [1, 5, 6, 3]
@@ -2590,7 +2565,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
         for align_corners in [True, False]:
           img_np = np.arange(
               0, np.prod(input_shape), dtype=nptype).reshape(input_shape)
-          with self.test_session(use_gpu=True):
+          with self.cached_session(use_gpu=True):
             image = constant_op.constant(img_np, shape=input_shape)
             new_size = constant_op.constant([target_height, target_width])
             out_op = image_ops.resize_images(
@@ -2599,7 +2574,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
                 image_ops.ResizeMethod.NEAREST_NEIGHBOR,
                 align_corners=align_corners)
             gpu_val = self.evaluate(out_op)
-          with self.test_session(use_gpu=False):
+          with self.cached_session(use_gpu=False):
             image = constant_op.constant(img_np, shape=input_shape)
             new_size = constant_op.constant([target_height, target_width])
             out_op = image_ops.resize_images(
@@ -2621,7 +2596,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
               0, np.prod(input_shape), dtype=nptype).reshape(input_shape)
           value = {}
           for use_gpu in [True, False]:
-            with self.test_session(use_gpu=use_gpu):
+            with self.cached_session(use_gpu=use_gpu):
               image = constant_op.constant(img_np, shape=input_shape)
               new_size = constant_op.constant([target_height, target_width])
               out_op = image_ops.resize_images(
@@ -2656,7 +2631,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
   @test_util.run_deprecated_v1
   def testNameScope(self):
     img_shape = [1, 3, 2, 1]
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       single_image = array_ops.placeholder(dtypes.float32, shape=[50, 60, 3])
       y = image_ops.resize_images(single_image, [55, 66])
       self.assertTrue(y.op.name.startswith("resize"))
@@ -2675,7 +2650,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
     y = image_ops.resize_images(x_tensor, target_max,
                                 preserve_aspect_ratio=preserve_aspect_ratio)
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       return y.eval(feed_dict=feed_dict)
 
   def _assertResizeEqual(self, x, x_shape, y, y_shape,
@@ -2773,7 +2748,7 @@ class ResizeImageWithPadTest(test_util.TensorFlowTestCase):
     if not use_tensor_inputs:
       self.assertTrue(y.get_shape().is_fully_defined())
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       return y.eval(feed_dict=feed_dict)
 
   def _assertReturns(self,
@@ -2871,7 +2846,7 @@ class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase):
     if not use_tensor_inputs:
       self.assertTrue(y.get_shape().is_fully_defined())
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       return y.eval(feed_dict=feed_dict)
 
   def _assertReturns(self,
@@ -3126,7 +3101,7 @@ class JpegTest(test_util.TensorFlowTestCase):
     # Read a real jpeg and verify shape
     path = ("tensorflow/core/lib/jpeg/testdata/"
             "jpeg_merge_test1.jpg")
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       jpeg0 = io_ops.read_file(path)
       image0 = image_ops.decode_jpeg(jpeg0)
       image1 = image_ops.decode_jpeg(image_ops.encode_jpeg(image0))
@@ -3142,7 +3117,7 @@ class JpegTest(test_util.TensorFlowTestCase):
     cmyk_path = os.path.join(base, "jpeg_merge_test1_cmyk.jpg")
     shape = 256, 128, 3
     for channels in 3, 0:
-      with self.test_session(use_gpu=True) as sess:
+      with self.cached_session(use_gpu=True) as sess:
         rgb = image_ops.decode_jpeg(
             io_ops.read_file(rgb_path), channels=channels)
         cmyk = image_ops.decode_jpeg(
@@ -3199,7 +3174,7 @@ class JpegTest(test_util.TensorFlowTestCase):
           self.evaluate(result)
 
   def testSynthetic(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       # Encode it, then decode it, then encode it
       image0 = constant_op.constant(_SimpleColorRamp())
       jpeg0 = image_ops.encode_jpeg(image0)
@@ -3220,7 +3195,7 @@ class JpegTest(test_util.TensorFlowTestCase):
       self.assertLessEqual(len(jpeg0), 6000)
 
   def testSyntheticFasterAlgorithm(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       # Encode it, then decode it, then encode it
       image0 = constant_op.constant(_SimpleColorRamp())
       jpeg0 = image_ops.encode_jpeg(image0)
@@ -3244,7 +3219,7 @@ class JpegTest(test_util.TensorFlowTestCase):
       self.assertLessEqual(len(jpeg0), 6000)
 
   def testDefaultDCTMethodIsIntegerFast(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       # Compare decoding with both dct_option=INTEGER_FAST and
       # default.  They should be the same.
       image0 = constant_op.constant(_SimpleColorRamp())
@@ -3258,7 +3233,7 @@ class JpegTest(test_util.TensorFlowTestCase):
 
   @test_util.run_deprecated_v1
   def testShape(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       jpeg = constant_op.constant("nonsense")
       for channels in 0, 1, 3:
         image = image_ops.decode_jpeg(jpeg, channels=channels)
@@ -3270,7 +3245,7 @@ class JpegTest(test_util.TensorFlowTestCase):
     # Read a real jpeg and verify shape.
     path = ("tensorflow/core/lib/jpeg/testdata/"
             "jpeg_merge_test1.jpg")
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       jpeg = io_ops.read_file(path)
       # Extract shape without decoding.
       [image_shape] = sess.run([image_ops.extract_jpeg_shape(jpeg)])
@@ -3281,7 +3256,7 @@ class JpegTest(test_util.TensorFlowTestCase):
     # Read a cmyk jpeg image, and verify its shape.
     path = ("tensorflow/core/lib/jpeg/testdata/"
             "jpeg_merge_test1_cmyk.jpg")
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       jpeg = io_ops.read_file(path)
       [image_shape] = sess.run([image_ops.extract_jpeg_shape(jpeg)])
       # Cmyk jpeg image has 4 channels.
@@ -3297,7 +3272,7 @@ class PngTest(test_util.TensorFlowTestCase):
               (3, "lena_palette.png"), (4, "lena_palette_trns.png"))
     for channels_in, filename in inputs:
       for channels in 0, 1, 3, 4:
-        with self.test_session(use_gpu=True) as sess:
+        with self.cached_session(use_gpu=True) as sess:
           png0 = io_ops.read_file(prefix + filename)
           image0 = image_ops.decode_png(png0, channels=channels)
           png0, image0 = self.evaluate([png0, image0])
@@ -3307,7 +3282,7 @@ class PngTest(test_util.TensorFlowTestCase):
             self.assertAllEqual(image0, self.evaluate(image1))
 
   def testSynthetic(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       # Encode it, then decode it
       image0 = constant_op.constant(_SimpleColorRamp())
       png0 = image_ops.encode_png(image0, compression=7)
@@ -3322,7 +3297,7 @@ class PngTest(test_util.TensorFlowTestCase):
       self.assertLessEqual(len(png0), 750)
 
   def testSyntheticUint16(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       # Encode it, then decode it
       image0 = constant_op.constant(_SimpleColorRamp(), dtype=dtypes.uint16)
       png0 = image_ops.encode_png(image0, compression=7)
@@ -3337,7 +3312,7 @@ class PngTest(test_util.TensorFlowTestCase):
       self.assertLessEqual(len(png0), 1500)
 
   def testSyntheticTwoChannel(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       # Strip the b channel from an rgb image to get a two-channel image.
       gray_alpha = _SimpleColorRamp()[:, :, 0:2]
       image0 = constant_op.constant(gray_alpha)
@@ -3348,7 +3323,7 @@ class PngTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(image0, image1)
 
   def testSyntheticTwoChannelUint16(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       # Strip the b channel from an rgb image to get a two-channel image.
       gray_alpha = _SimpleColorRamp()[:, :, 0:2]
       image0 = constant_op.constant(gray_alpha, dtype=dtypes.uint16)
@@ -3360,7 +3335,7 @@ class PngTest(test_util.TensorFlowTestCase):
 
   @test_util.run_deprecated_v1
   def testShape(self):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       png = constant_op.constant("nonsense")
       for channels in 0, 1, 3:
         image = image_ops.decode_png(png, channels=channels)
@@ -3378,7 +3353,7 @@ class GifTest(test_util.TensorFlowTestCase):
     STRIDE = 5
     shape = (12, HEIGHT, WIDTH, 3)
 
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       gif0 = io_ops.read_file(prefix + filename)
       image0 = image_ops.decode_gif(gif0)
       gif0, image0 = self.evaluate([gif0, image0])
@@ -3405,7 +3380,7 @@ class GifTest(test_util.TensorFlowTestCase):
 
   @test_util.run_deprecated_v1
   def testShape(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       gif = constant_op.constant("nonsense")
       image = image_ops.decode_gif(gif)
       self.assertEqual(image.get_shape().as_list(), [None, None, None, 3])
@@ -3417,7 +3392,7 @@ class ConvertImageTest(test_util.TensorFlowTestCase):
     x_np = np.array(original, dtype=original_dtype.as_numpy_dtype())
     y_np = np.array(expected, dtype=output_dtype.as_numpy_dtype())
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       image = constant_op.constant(x_np)
       y = image_ops.convert_image_dtype(image, output_dtype)
       self.assertTrue(y.dtype == output_dtype)
@@ -3433,7 +3408,7 @@ class ConvertImageTest(test_util.TensorFlowTestCase):
   @test_util.run_deprecated_v1
   def testNoConvert(self):
     # Make sure converting to the same data type creates only an identity op
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       image = constant_op.constant([1], dtype=dtypes.uint8)
       image_ops.convert_image_dtype(image, dtypes.uint8)
       y = image_ops.convert_image_dtype(image, dtypes.uint8)
@@ -3443,7 +3418,7 @@ class ConvertImageTest(test_util.TensorFlowTestCase):
   @test_util.run_deprecated_v1
   def testConvertBetweenInteger(self):
     # Make sure converting to between integer types scales appropriately
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       self._convert([0, 255], dtypes.uint8, dtypes.int16, [0, 255 * 128])
       self._convert([0, 32767], dtypes.int16, dtypes.uint8, [0, 255])
       self._convert([0, 2**32], dtypes.int64, dtypes.int32, [0, 1])
@@ -3452,7 +3427,7 @@ class ConvertImageTest(test_util.TensorFlowTestCase):
   @test_util.run_deprecated_v1
   def testConvertBetweenFloat(self):
     # Make sure converting to between float types does nothing interesting
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       self._convert([-1.0, 0, 1.0, 200000], dtypes.float32, dtypes.float64,
                     [-1.0, 0, 1.0, 200000])
       self._convert([-1.0, 0, 1.0, 200000], dtypes.float64, dtypes.float32,
@@ -3461,7 +3436,7 @@ class ConvertImageTest(test_util.TensorFlowTestCase):
   @test_util.run_deprecated_v1
   def testConvertBetweenIntegerAndFloat(self):
     # Make sure converting from and to a float type scales appropriately
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       self._convert([0, 1, 255], dtypes.uint8, dtypes.float32,
                     [0, 1.0 / 255.0, 1])
       self._convert([0, 1.1 / 255.0, 1], dtypes.float32, dtypes.uint8,
@@ -3469,7 +3444,7 @@ class ConvertImageTest(test_util.TensorFlowTestCase):
 
   @test_util.run_deprecated_v1
   def testConvertBetweenInt16AndInt8(self):
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       # uint8, uint16
       self._convert([0, 255 * 256], dtypes.uint16, dtypes.uint8, [0, 255])
       self._convert([0, 255], dtypes.uint8, dtypes.uint16, [0, 255 * 256])
@@ -3500,7 +3475,7 @@ class TotalVariationTest(test_util.TensorFlowTestCase):
     """
 
     # Create a TensorFlow session.
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       # Add a constant to the TensorFlow graph that holds the input.
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
 
@@ -3640,7 +3615,8 @@ class TotalVariationTest(test_util.TensorFlowTestCase):
     # If we negate all pixel-values then the total variation is unchanged.
     self._test(-a, tot_var)
 
-    # Scale the pixel-values by a float. This scales the total variation as well.
+    # Scale the pixel-values by a float. This scales the total variation as
+    # well.
     b = 1.1 * a
     self._test(b, 1.1 * tot_var)
 
@@ -3888,7 +3864,7 @@ class VerifyCompatibleImageShapesTest(test_util.TensorFlowTestCase):
     img = array_ops.placeholder(dtype=dtypes.float32)
     img_np = np.array((2, 2))
 
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       _, _, checks = image_ops_impl._verify_compatible_image_shapes(img, img)
       with self.assertRaises(errors.InvalidArgumentError):
         sess.run(checks, {img: img_np})
@@ -3901,7 +3877,7 @@ class VerifyCompatibleImageShapesTest(test_util.TensorFlowTestCase):
     img1_np = np.array([1, 2, 2, 1])
     img2_np = np.array([1, 3, 3, 1])
 
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       _, _, checks = image_ops_impl._verify_compatible_image_shapes(img1, img2)
       with self.assertRaises(errors.InvalidArgumentError):
         sess.run(checks, {img1: img1_np, img2: img2_np})
@@ -3919,7 +3895,7 @@ class PSNRTest(test_util.TensorFlowTestCase):
     return np.expand_dims(im, axis=0)
 
   def _LoadTestImages(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       q20 = self._LoadTestImage(sess, "cat_q20.jpg")
       q72 = self._LoadTestImage(sess, "cat_q72.jpg")
       q95 = self._LoadTestImage(sess, "cat_q95.jpg")
@@ -3940,7 +3916,7 @@ class PSNRTest(test_util.TensorFlowTestCase):
     image2 = self._RandomImage((8, 8, 1), 1)
     psnr = self._PSNR_NumPy(image1, image2, 1)
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       tf_image1 = constant_op.constant(image1, shape=image1.shape,
                                        dtype=dtypes.float32)
       tf_image2 = constant_op.constant(image2, shape=image2.shape,
@@ -3954,7 +3930,7 @@ class PSNRTest(test_util.TensorFlowTestCase):
     image2 = self._RandomImage((10, 8, 8, 1), 1)
     psnr = self._PSNR_NumPy(image1, image2, 1)
 
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       tf_image1 = constant_op.constant(image1, shape=image1.shape,
                                        dtype=dtypes.float32)
       tf_image2 = constant_op.constant(image2, shape=image2.shape,
@@ -3976,7 +3952,7 @@ class PSNRTest(test_util.TensorFlowTestCase):
     self.assertNear(35.302, psnr3, 0.001)
 
     # Test TensorFlow implementation.
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       tf_q20 = constant_op.constant(q20, shape=q20.shape, dtype=dtypes.float32)
       tf_q72 = constant_op.constant(q72, shape=q72.shape, dtype=dtypes.float32)
       tf_q95 = constant_op.constant(q95, shape=q95.shape, dtype=dtypes.float32)
@@ -3991,7 +3967,7 @@ class PSNRTest(test_util.TensorFlowTestCase):
   def testInfinity(self):
     q20, _, _ = self._LoadTestImages()
     psnr = self._PSNR_NumPy(q20, q20, 1)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       tf_q20 = constant_op.constant(q20, shape=q20.shape, dtype=dtypes.float32)
       tf_psnr = image_ops.psnr(tf_q20, tf_q20, 1, "psnr").eval()
       self.assertAllClose(psnr, tf_psnr, atol=0.001)
@@ -4006,7 +3982,7 @@ class PSNRTest(test_util.TensorFlowTestCase):
     img1 = image_ops.convert_image_dtype(img1, dtypes.float32)
     img2 = image_ops.convert_image_dtype(img2, dtypes.float32)
     psnr_float32 = image_ops.psnr(img1, img2, 1.0)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       self.assertAllClose(
           psnr_uint8.eval(), self.evaluate(psnr_float32), atol=0.001)
 
@@ -4031,7 +4007,7 @@ class SSIMTest(test_util.TensorFlowTestCase):
     return np.expand_dims(im, axis=0)
 
   def _LoadTestImages(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       return [self._LoadTestImage(sess, f) for f in self._filenames]
 
   def _RandomImage(self, shape, max_val):
@@ -4046,7 +4022,7 @@ class SSIMTest(test_util.TensorFlowTestCase):
 
     ph = [array_ops.placeholder(dtype=dtypes.float32) for _ in range(2)]
     ssim = image_ops.ssim(*ph, max_val=1.0)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       scores = [ssim.eval(dict(zip(ph, t)))
                 for t in itertools.combinations_with_replacement(img, 2)]
     self.assertAllClose(expected, np.squeeze(scores), atol=1e-4)
@@ -4061,7 +4037,7 @@ class SSIMTest(test_util.TensorFlowTestCase):
 
     ssim = image_ops.ssim(constant_op.constant(img1),
                           constant_op.constant(img2), 1.0)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       self.assertAllClose(expected, self.evaluate(ssim), atol=1e-4)
 
   def testBroadcast(self):
@@ -4073,7 +4049,7 @@ class SSIMTest(test_util.TensorFlowTestCase):
     img2 = array_ops.expand_dims(img, axis=1)  # batch dims: 2, 1.
 
     ssim = image_ops.ssim(img1, img2, 1.0)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       self.assertAllClose(expected, self.evaluate(ssim), atol=1e-4)
 
   @test_util.run_deprecated_v1
@@ -4088,7 +4064,7 @@ class SSIMTest(test_util.TensorFlowTestCase):
 
     ssim = image_ops.ssim(constant_op.constant(img1),
                           constant_op.constant(img2), 255)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       self.assertLess(ssim.eval(), 0)
 
   @test_util.run_deprecated_v1
@@ -4101,7 +4077,7 @@ class SSIMTest(test_util.TensorFlowTestCase):
     img1 = image_ops.convert_image_dtype(img1, dtypes.float32)
     img2 = image_ops.convert_image_dtype(img2, dtypes.float32)
     ssim_float32 = image_ops.ssim(img1, img2, 1.0)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       self.assertAllClose(
           ssim_uint8.eval(), self.evaluate(ssim_float32), atol=0.001)
 
@@ -4126,7 +4102,7 @@ class MultiscaleSSIMTest(test_util.TensorFlowTestCase):
     return np.expand_dims(im, axis=0)
 
   def _LoadTestImages(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       return [self._LoadTestImage(sess, f) for f in self._filenames]
 
   def _RandomImage(self, shape, max_val):
@@ -4144,7 +4120,7 @@ class MultiscaleSSIMTest(test_util.TensorFlowTestCase):
 
     ph = [array_ops.placeholder(dtype=dtypes.float32) for _ in range(2)]
     msssim = image_ops.ssim_multiscale(*ph, max_val=1.0)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       scores = [msssim.eval(dict(zip(ph, t)))
                 for t in itertools.combinations_with_replacement(img, 2)]
 
@@ -4159,7 +4135,7 @@ class MultiscaleSSIMTest(test_util.TensorFlowTestCase):
     msssim = image_ops.ssim_multiscale(*scaled_ph, max_val=1.0,
                                        power_factors=(1, 1, 1, 1, 1))
     grads = gradients.gradients(msssim, scalar)
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       np_grads = sess.run(grads, feed_dict={ph[0]: img[0], ph[1]: img[1]})
     self.assertTrue(np.isfinite(np_grads).all())
 
@@ -4174,7 +4150,7 @@ class MultiscaleSSIMTest(test_util.TensorFlowTestCase):
 
     msssim = image_ops.ssim_multiscale(constant_op.constant(img1),
                                        constant_op.constant(img2), 1.0)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       self.assertAllClose(expected, self.evaluate(msssim), 1e-4)
 
   def testBroadcast(self):
@@ -4187,7 +4163,7 @@ class MultiscaleSSIMTest(test_util.TensorFlowTestCase):
     img2 = array_ops.expand_dims(img, axis=1)  # batch dims: 2, 1.
 
     score_tensor = image_ops.ssim_multiscale(img1, img2, 1.0)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       self.assertAllClose(expected, self.evaluate(score_tensor), 1e-4)
 
   def testRange(self):
@@ -4197,7 +4173,7 @@ class MultiscaleSSIMTest(test_util.TensorFlowTestCase):
     If any of the value is negative so that the geometric mean is not
     well-defined, then treat the MS-SSIM score as zero.
     """
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       img1 = self._LoadTestImage(sess, "checkerboard1.png")
       img2 = self._LoadTestImage(sess, "checkerboard3.png")
       images = [img1, img2, np.zeros_like(img1),
@@ -4222,7 +4198,7 @@ class MultiscaleSSIMTest(test_util.TensorFlowTestCase):
     img1 = image_ops.convert_image_dtype(img1, dtypes.float32)
     img2 = image_ops.convert_image_dtype(img2, dtypes.float32)
     ssim_float32 = image_ops.ssim_multiscale(img1, img2, 1.0)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       self.assertAllClose(
           ssim_uint8.eval(), self.evaluate(ssim_float32), atol=0.001)
 
@@ -4263,7 +4239,7 @@ class ImageGradientsTest(test_util.TensorFlowTestCase):
     batch = constant_op.constant(batch)
     assert batch.get_shape().as_list() == [2, 2, 3, 2]
     dy, dx = image_ops.image_gradients(batch)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       actual_dy = self.evaluate(dy)
       actual_dx = self.evaluate(dx)
       self.assertAllClose(expected_dy, actual_dy)
@@ -4284,7 +4260,7 @@ class SobelEdgesTest(test_util.TensorFlowTestCase):
     expected = np.reshape([[[0, 0], [0, 12], [0, 0]],
                            [[0, 0], [0, 12], [0, 0]]], [1, 2, 3, 1, 2])
     sobel = image_ops.sobel_edges(img)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       actual_sobel = self.evaluate(sobel)
       self.assertAllClose(expected, actual_sobel)
 
@@ -4306,7 +4282,7 @@ class SobelEdgesTest(test_util.TensorFlowTestCase):
     expected_batch = np.concatenate([expected_two_channel] * batch_size, axis=0)
 
     sobel = image_ops.sobel_edges(img)
-    with self.test_session(use_gpu=True):
+    with self.cached_session(use_gpu=True):
       actual_sobel = self.evaluate(sobel)
       self.assertAllClose(expected_batch, actual_sobel)
 
@@ -4314,7 +4290,7 @@ class SobelEdgesTest(test_util.TensorFlowTestCase):
 class DecodeImageTest(test_util.TensorFlowTestCase):
 
   def testJpegUint16(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       base = "tensorflow/core/lib/jpeg/testdata"
       jpeg0 = io_ops.read_file(os.path.join(base, "jpeg_merge_test1.jpg"))
       image0 = image_ops.decode_image(jpeg0, dtype=dtypes.uint16)
@@ -4324,7 +4300,7 @@ class DecodeImageTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(image0, image1)
 
   def testPngUint16(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       base = "tensorflow/core/lib/png/testdata"
       png0 = io_ops.read_file(os.path.join(base, "lena_rgba.png"))
       image0 = image_ops.decode_image(png0, dtype=dtypes.uint16)
@@ -4334,7 +4310,7 @@ class DecodeImageTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(image0, image1)
 
   def testGifUint16(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       base = "tensorflow/core/lib/gif/testdata"
       gif0 = io_ops.read_file(os.path.join(base, "scan.gif"))
       image0 = image_ops.decode_image(gif0, dtype=dtypes.uint16)
@@ -4344,7 +4320,7 @@ class DecodeImageTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(image0, image1)
 
   def testBmpUint16(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       base = "tensorflow/core/lib/bmp/testdata"
       bmp0 = io_ops.read_file(os.path.join(base, "lena.bmp"))
       image0 = image_ops.decode_image(bmp0, dtype=dtypes.uint16)
@@ -4354,7 +4330,7 @@ class DecodeImageTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(image0, image1)
 
   def testJpegFloat32(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       base = "tensorflow/core/lib/jpeg/testdata"
       jpeg0 = io_ops.read_file(os.path.join(base, "jpeg_merge_test1.jpg"))
       image0 = image_ops.decode_image(jpeg0, dtype=dtypes.float32)
@@ -4364,7 +4340,7 @@ class DecodeImageTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(image0, image1)
 
   def testPngFloat32(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       base = "tensorflow/core/lib/png/testdata"
       png0 = io_ops.read_file(os.path.join(base, "lena_rgba.png"))
       image0 = image_ops.decode_image(png0, dtype=dtypes.float32)
@@ -4374,7 +4350,7 @@ class DecodeImageTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(image0, image1)
 
   def testGifFloat32(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       base = "tensorflow/core/lib/gif/testdata"
       gif0 = io_ops.read_file(os.path.join(base, "scan.gif"))
       image0 = image_ops.decode_image(gif0, dtype=dtypes.float32)
@@ -4384,7 +4360,7 @@ class DecodeImageTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(image0, image1)
 
   def testBmpFloat32(self):
-    with self.test_session(use_gpu=True) as sess:
+    with self.cached_session(use_gpu=True) as sess:
       base = "tensorflow/core/lib/bmp/testdata"
       bmp0 = io_ops.read_file(os.path.join(base, "lena.bmp"))
       image0 = image_ops.decode_image(bmp0, dtype=dtypes.float32)
diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py
index c0a4bcd51dd10f352366b74955241e5f97133130..035534ef49cc4c715b2101beb98e1d1aa6a72071 100644
--- a/tensorflow/python/ops/init_ops.py
+++ b/tensorflow/python/ops/init_ops.py
@@ -38,6 +38,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_linalg_ops
 from tensorflow.python.ops import linalg_ops_impl
@@ -46,10 +47,10 @@ from tensorflow.python.ops import random_ops
 from tensorflow.python.util import deprecation
 from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.deprecation import  deprecated_arg_values
+from tensorflow.python.util.deprecation import  deprecated_args
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("keras.initializers.Initializer")
 class Initializer(object):
   """Initializer base class: all initializers inherit from this class.
   """
@@ -96,11 +97,15 @@ class Initializer(object):
     return cls(**config)
 
 
-@tf_export("keras.initializers.Zeros", "initializers.zeros",
-           "zeros_initializer", "keras.initializers.zeros")
+@tf_export(v1=["initializers.zeros", "zeros_initializer"])
+@deprecation.deprecated_endpoints("initializers.zeros")
 class Zeros(Initializer):
   """Initializer that generates tensors initialized to 0."""
 
+  @deprecated_args(None,
+                   "Call initializer instance with the dtype argument instead "
+                   "of passing it to the constructor",
+                   "dtype")
   def __init__(self, dtype=dtypes.float32):
     self.dtype = dtypes.as_dtype(dtype)
 
@@ -113,11 +118,15 @@ class Zeros(Initializer):
     return {"dtype": self.dtype.name}
 
 
-@tf_export("keras.initializers.Ones", "initializers.ones", "ones_initializer",
-           "keras.initializers.ones")
+@tf_export(v1=["initializers.ones", "ones_initializer"])
+@deprecation.deprecated_endpoints("initializers.ones", "ones_initializer")
 class Ones(Initializer):
   """Initializer that generates tensors initialized to 1."""
 
+  @deprecated_args(None,
+                   "Call initializer instance with the dtype argument instead "
+                   "of passing it to the constructor",
+                   "dtype")
   def __init__(self, dtype=dtypes.float32):
     self.dtype = dtypes.as_dtype(dtype)
 
@@ -130,8 +139,8 @@ class Ones(Initializer):
     return {"dtype": self.dtype.name}
 
 
-@tf_export("keras.initializers.Constant", "initializers.constant",
-           "constant_initializer", "keras.initializers.constant")
+@tf_export(v1=["initializers.constant", "constant_initializer"])
+@deprecation.deprecated_endpoints("constant_initializer")
 class Constant(Initializer):
   """Initializer that generates tensors with constant values.
 
@@ -211,6 +220,14 @@ class Constant(Initializer):
   ```
   """
 
+  @deprecated_args(None,
+                   "Call initializer instance with the dtype argument instead "
+                   "of passing it to the constructor",
+                   "dtype")
+  @deprecated_args(None,
+                   "Objects must now be the required shape or no shape "
+                   "can be specified",
+                   "verify_shape")
   def __init__(self, value=0, dtype=dtypes.float32, verify_shape=False):
     if not (np.isscalar(value) or isinstance(value, (list, tuple, np.ndarray))):
       raise TypeError(
@@ -237,7 +254,8 @@ class Constant(Initializer):
     return {"value": self.value, "dtype": self.dtype.name}
 
 
-@tf_export("initializers.random_uniform", "random_uniform_initializer")
+@tf_export(v1=["initializers.random_uniform", "random_uniform_initializer"])
+@deprecation.deprecated_endpoints("initializers.random_uniform")
 class RandomUniform(Initializer):
   """Initializer that generates tensors with a uniform distribution.
 
@@ -253,6 +271,10 @@ class RandomUniform(Initializer):
       calling the initializer.
   """
 
+  @deprecated_args(None,
+                   "Call initializer instance with the dtype argument instead "
+                   "of passing it to the constructor",
+                   "dtype")
   def __init__(self, minval=0, maxval=None, seed=None, dtype=dtypes.float32):
     self.minval = minval
     self.maxval = maxval
@@ -274,7 +296,8 @@ class RandomUniform(Initializer):
     }
 
 
-@tf_export("initializers.random_normal", "random_normal_initializer")
+@tf_export(v1=["initializers.random_normal", "random_normal_initializer"])
+@deprecation.deprecated_endpoints("initializers.random_normal")
 class RandomNormal(Initializer):
   """Initializer that generates tensors with a normal distribution.
 
@@ -290,6 +313,10 @@ class RandomNormal(Initializer):
       calling the initializer. Only floating point types are supported.
   """
 
+  @deprecated_args(None,
+                   "Call initializer instance with the dtype argument instead "
+                   "of passing it to the constructor",
+                   "dtype")
   def __init__(self, mean=0.0, stddev=1.0, seed=None, dtype=dtypes.float32):
     self.mean = mean
     self.stddev = stddev
@@ -311,7 +338,9 @@ class RandomNormal(Initializer):
     }
 
 
-@tf_export("initializers.truncated_normal", "truncated_normal_initializer")
+@tf_export(v1=["initializers.truncated_normal", "truncated_normal_initializer"])
+@deprecation.deprecated_endpoints("initializers.truncated_normal",
+                                  "truncated_normal_initializer")
 class TruncatedNormal(Initializer):
   """Initializer that generates a truncated normal distribution.
 
@@ -332,6 +361,10 @@ class TruncatedNormal(Initializer):
       calling the initializer. Only floating point types are supported.
   """
 
+  @deprecated_args(None,
+                   "Call initializer instance with the dtype argument instead "
+                   "of passing it to the constructor",
+                   "dtype")
   def __init__(self, mean=0.0, stddev=1.0, seed=None, dtype=dtypes.float32):
     self.mean = mean
     self.stddev = stddev
@@ -353,12 +386,10 @@ class TruncatedNormal(Initializer):
     }
 
 
-@tf_export(
-    "initializers.uniform_unit_scaling",
-    v1=[
-        "initializers.uniform_unit_scaling", "uniform_unit_scaling_initializer"
-    ])
-@deprecation.deprecated_endpoints("uniform_unit_scaling_initializer")
+@tf_export(v1=["initializers.uniform_unit_scaling",
+               "uniform_unit_scaling_initializer"])
+@deprecation.deprecated_endpoints("uniform_unit_scaling_initializer",
+                                  "initializers.uniform_unit_scaling")
 class UniformUnitScaling(Initializer):
   """Initializer that generates tensors without scaling variance.
 
@@ -390,6 +421,10 @@ class UniformUnitScaling(Initializer):
       ([pdf](http://arxiv.org/pdf/1412.6558.pdf))
   """
 
+  @deprecated_args(None,
+                   "Call initializer instance with the dtype argument instead "
+                   "of passing it to the constructor",
+                   "dtype")
   @deprecated(None,
               "Use tf.initializers.variance_scaling instead with distribution="
               "uniform to get equivalent behavior.")
@@ -421,14 +456,9 @@ class UniformUnitScaling(Initializer):
     return {"factor": self.factor, "seed": self.seed, "dtype": self.dtype.name}
 
 
-@tf_export(
-    "keras.initializers.VarianceScaling",
-    "initializers.variance_scaling",
-    v1=[
-        "keras.initializers.VarianceScaling", "initializers.variance_scaling",
-        "variance_scaling_initializer"
-    ])
-@deprecation.deprecated_endpoints("variance_scaling_initializer")
+@tf_export(v1=["initializers.variance_scaling", "variance_scaling_initializer"])
+@deprecation.deprecated_endpoints("initializers.variance_scaling",
+                                  "variance_scaling_initializer")
 class VarianceScaling(Initializer):
   """Initializer capable of adapting its scale to the shape of weights tensors.
 
@@ -459,6 +489,10 @@ class VarianceScaling(Initializer):
       "distribution" arguments.
   """
 
+  @deprecated_args(None,
+                   "Call initializer instance with the dtype argument instead "
+                   "of passing it to the constructor",
+                   "dtype")
   @deprecated_arg_values(
       None,
       "`normal` is a deprecated alias for `truncated_normal`",
@@ -498,7 +532,7 @@ class VarianceScaling(Initializer):
     else:
       scale /= max(1., (fan_in + fan_out) / 2.)
     if self.distribution == "normal" or self.distribution == "truncated_normal":
-    # constant taken from scipy.stats.truncnorm.std(a=-2, b=2, loc=0., scale=1.)
+      # constant taken from scipy.stats.truncnorm.std(a=-2, b=2, loc=0., scale=1.)
       stddev = math.sqrt(scale) / .87962566103423978
       return random_ops.truncated_normal(
           shape, 0.0, stddev, dtype, seed=self.seed)
@@ -521,15 +555,9 @@ class VarianceScaling(Initializer):
     }
 
 
-@tf_export(
-    "keras.initializers.Orthogonal",
-    "initializers.orthogonal",
-    "keras.initializers.orthogonal",
-    v1=[
-        "keras.initializers.Orthogonal", "initializers.orthogonal",
-        "orthogonal_initializer", "keras.initializers.orthogonal"
-    ])
-@deprecation.deprecated_endpoints("orthogonal_initializer")
+@tf_export(v1=["initializers.orthogonal", "orthogonal_initializer"])
+@deprecation.deprecated_endpoints("initializers.orthogonal",
+                                  "orthogonal_initializer")
 class Orthogonal(Initializer):
   """Initializer that generates an orthogonal matrix.
 
@@ -557,6 +585,10 @@ class Orthogonal(Initializer):
       ([pdf](https://arxiv.org/pdf/1312.6120.pdf))
   """
 
+  @deprecated_args(None,
+                   "Call initializer instance with the dtype argument instead "
+                   "of passing it to the constructor",
+                   "dtype")
   def __init__(self, gain=1.0, seed=None, dtype=dtypes.float32):
     self.gain = gain
     self.dtype = _assert_float_dtype(dtypes.as_dtype(dtype))
@@ -574,9 +606,12 @@ class Orthogonal(Initializer):
     num_rows = 1
     for dim in shape[:-1]:
       num_rows *= dim
-    num_cols = shape[-1]
-    flat_shape = (num_cols, num_rows) if num_rows < num_cols else (num_rows,
-                                                                   num_cols)
+    num_rows = int(num_rows)
+    num_cols = int(shape[-1])
+    if num_rows < num_cols:
+      flat_shape = (num_cols, num_rows)
+    else:
+      flat_shape = (num_rows, num_cols)
 
     # Generate a random matrix
     a = random_ops.random_normal(flat_shape, dtype=dtype, seed=self.seed)
@@ -593,6 +628,8 @@ class Orthogonal(Initializer):
     return {"gain": self.gain, "seed": self.seed, "dtype": self.dtype.name}
 
 
+# Note these haven't been ported to TF2.0. They are not currently visible and
+# the tests are non trivial to port
 class ConvolutionDeltaOrthogonal(Initializer):
   """Initializer that generates a delta orthogonal kernel for ConvNets.
 
@@ -1144,8 +1181,8 @@ class ConvolutionOrthogonal3D(ConvolutionOrthogonal):
     return self._dict_to_tensor(p, ksize, ksize, ksize)
 
 
-@tf_export("keras.initializers.Identity", "initializers.identity",
-           "keras.initializers.identity")
+@tf_export(v1=["initializers.identity"])
+@deprecation.deprecated_endpoints("initializers.identity")
 class Identity(Initializer):
   """Initializer that generates the identity matrix.
 
@@ -1157,6 +1194,10 @@ class Identity(Initializer):
       calling the initializer. Only floating point types are supported.
   """
 
+  @deprecated_args(None,
+                   "Call initializer instance with the dtype argument instead "
+                   "of passing it to the constructor",
+                   "dtype")
   def __init__(self, gain=1.0, dtype=dtypes.float32):
     self.gain = gain
     self.dtype = _assert_float_dtype(dtypes.as_dtype(dtype))
@@ -1168,6 +1209,8 @@ class Identity(Initializer):
           "Identity matrix initializer can only be used for 2D matrices.")
     if dtype is None:
       dtype = self.dtype
+    if isinstance(full_shape, tensor_shape.TensorShape):
+      full_shape = full_shape.as_list()
     initializer = linalg_ops_impl.eye(*full_shape, dtype=dtype)
     if partition_info is not None:
       initializer = array_ops.slice(initializer, partition_info.var_offset,
@@ -1178,8 +1221,9 @@ class Identity(Initializer):
     return {"gain": self.gain, "dtype": self.dtype.name}
 
 
-@tf_export("glorot_uniform_initializer", "keras.initializers.glorot_uniform",
-           "initializers.glorot_uniform")
+@tf_export(v1=["glorot_uniform_initializer", "initializers.glorot_uniform"])
+@deprecation.deprecated_endpoints("glorot_uniform_initializer",
+                                  "initializers.glorot_uniform")
 class GlorotUniform(VarianceScaling):
   """The Glorot uniform initializer, also called Xavier uniform initializer.
 
@@ -1200,6 +1244,10 @@ class GlorotUniform(VarianceScaling):
       ([pdf](http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf))
   """
 
+  @deprecated_args(None,
+                   "Call initializer instance with the dtype argument instead "
+                   "of passing it to the constructor",
+                   "dtype")
   def __init__(self, seed=None, dtype=dtypes.float32):
     super(GlorotUniform, self).__init__(
         scale=1.0,
@@ -1212,21 +1260,17 @@ class GlorotUniform(VarianceScaling):
     return {"seed": self.seed, "dtype": self.dtype.name}
 
 
-@tf_export(
-    "keras.initializers.glorot_normal",
-    "initializers.glorot_normal",
-    v1=[
-        "glorot_normal_initializer", "keras.initializers.glorot_normal",
-        "initializers.glorot_normal"
-    ])
-@deprecation.deprecated_endpoints("glorot_normal_initializer")
+@tf_export(v1=["glorot_normal_initializer", "initializers.glorot_normal"])
+@deprecation.deprecated_endpoints("glorot_normal_initializer",
+                                  "initializers.glorot_normal")
 class GlorotNormal(VarianceScaling):
   """The Glorot normal initializer, also called Xavier normal initializer.
 
   It draws samples from a truncated normal distribution centered on 0
-  with `stddev = sqrt(2 / (fan_in + fan_out))`
-  where `fan_in` is the number of input units in the weight tensor
-  and `fan_out` is the number of output units in the weight tensor.
+  with standard deviation (after truncation) given by
+  `stddev = sqrt(2 / (fan_in + fan_out))` where `fan_in` is the number
+  of input units in the weight tensor and `fan_out` is the number of
+  output units in the weight tensor.
 
   Args:
     seed: A Python integer. Used to create random seeds. See
@@ -1239,6 +1283,10 @@ class GlorotNormal(VarianceScaling):
       ([pdf](http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf))
   """
 
+  @deprecated_args(None,
+                   "Call initializer instance with the dtype argument instead "
+                   "of passing it to the constructor",
+                   "dtype")
   def __init__(self, seed=None, dtype=dtypes.float32):
     super(GlorotNormal, self).__init__(
         scale=1.0,
@@ -1273,13 +1321,14 @@ convolutional_orthogonal_3d = ConvolutionOrthogonal3D
 # pylint: enable=invalid-name
 
 
-@tf_export("keras.initializers.lecun_normal", "initializers.lecun_normal")
+@tf_export(v1=["initializers.lecun_normal"])
 def lecun_normal(seed=None):
   """LeCun normal initializer.
 
   It draws samples from a truncated normal distribution centered on 0
-  with `stddev = sqrt(1 / fan_in)`
-  where `fan_in` is the number of input units in the weight tensor.
+  with standard deviation (after truncation) given by
+  `stddev = sqrt(1 / fan_in)` where `fan_in` is the number of
+  input units in the weight tensor.
 
   Arguments:
       seed: A Python integer. Used to seed the random generator.
@@ -1289,7 +1338,7 @@ def lecun_normal(seed=None):
 
   References:
       - Self-Normalizing Neural Networks,
-      [Klambauer et al., 2017](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks)
+      [Klambauer et al., 2017](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks)  # pylint: disable=line-too-long
       ([pdf](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks.pdf))
       - Efficient Backprop,
       [Lecun et al., 1998](http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf)
@@ -1298,7 +1347,7 @@ def lecun_normal(seed=None):
       scale=1., mode="fan_in", distribution="truncated_normal", seed=seed)
 
 
-@tf_export("keras.initializers.lecun_uniform", "initializers.lecun_uniform")
+@tf_export(v1=["initializers.lecun_uniform"])
 def lecun_uniform(seed=None):
   """LeCun uniform initializer.
 
@@ -1314,7 +1363,7 @@ def lecun_uniform(seed=None):
 
   References:
       - Self-Normalizing Neural Networks,
-      [Klambauer et al., 2017](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks)
+      [Klambauer et al., 2017](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks)  # pylint: disable=line-too-long
       ([pdf](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks.pdf))
       - Efficient Backprop,
       [Lecun et al., 1998](http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf)
@@ -1323,13 +1372,14 @@ def lecun_uniform(seed=None):
       scale=1., mode="fan_in", distribution="uniform", seed=seed)
 
 
-@tf_export("keras.initializers.he_normal", "initializers.he_normal")
+@tf_export(v1=["initializers.he_normal"])
 def he_normal(seed=None):
   """He normal initializer.
 
   It draws samples from a truncated normal distribution centered on 0
-  with `stddev = sqrt(2 / fan_in)`
-  where `fan_in` is the number of input units in the weight tensor.
+  with standard deviation (after truncation) given by
+  `stddev = sqrt(2 / fan_in)` where `fan_in` is the number of
+  input units in the weight tensor.
 
   Arguments:
       seed: A Python integer. Used to seed the random generator.
@@ -1338,14 +1388,15 @@ def he_normal(seed=None):
       An initializer.
 
   References:
-      [He et al., 2015](https://www.cv-foundation.org/openaccess/content_iccv_2015/html/He_Delving_Deep_into_ICCV_2015_paper.html)
+      [He et al., 2015]
+      (https://www.cv-foundation.org/openaccess/content_iccv_2015/html/He_Delving_Deep_into_ICCV_2015_paper.html)  # pylint: disable=line-too-long
       ([pdf](https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/He_Delving_Deep_into_ICCV_2015_paper.pdf))
   """
   return VarianceScaling(
       scale=2., mode="fan_in", distribution="truncated_normal", seed=seed)
 
 
-@tf_export("keras.initializers.he_uniform", "initializers.he_uniform")
+@tf_export(v1=["initializers.he_uniform"])
 def he_uniform(seed=None):
   """He uniform variance scaling initializer.
 
@@ -1360,7 +1411,8 @@ def he_uniform(seed=None):
       An initializer.
 
   References:
-      [He et al., 2015](https://www.cv-foundation.org/openaccess/content_iccv_2015/html/He_Delving_Deep_into_ICCV_2015_paper.html)
+      [He et al., 2015]
+      (https://www.cv-foundation.org/openaccess/content_iccv_2015/html/He_Delving_Deep_into_ICCV_2015_paper.html)  # pylint: disable=line-too-long
       ([pdf](https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/He_Delving_Deep_into_ICCV_2015_paper.pdf))
   """
   return VarianceScaling(
@@ -1377,7 +1429,7 @@ def _compute_fans(shape):
     shape: Integer shape tuple or TF tensor shape.
 
   Returns:
-    A tuple of scalars (fan_in, fan_out).
+    A tuple of integer scalars (fan_in, fan_out).
   """
   if len(shape) < 1:  # Just to avoid errors for constants.
     fan_in = fan_out = 1
@@ -1389,12 +1441,12 @@ def _compute_fans(shape):
   else:
     # Assuming convolution kernels (2D, 3D, or more).
     # kernel shape: (..., input_depth, depth)
-    receptive_field_size = 1.
+    receptive_field_size = 1
     for dim in shape[:-2]:
       receptive_field_size *= dim
     fan_in = shape[-2] * receptive_field_size
     fan_out = shape[-1] * receptive_field_size
-  return fan_in, fan_out
+  return int(fan_in), int(fan_out)
 
 
 def _assert_float_dtype(dtype):
diff --git a/tensorflow/python/ops/init_ops_test.py b/tensorflow/python/ops/init_ops_test.py
index 1f22248004697438d2c8c05dc0c6762a20902d31..1205f367bc99c8e07b97d45b6e4ae7089a089e13 100644
--- a/tensorflow/python/ops/init_ops_test.py
+++ b/tensorflow/python/ops/init_ops_test.py
@@ -24,13 +24,15 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape as tensor_shape_lib
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class InitializersTest(test.TestCase):
 
   def _runner(self,
@@ -40,13 +42,8 @@ class InitializersTest(test.TestCase):
               target_std=None,
               target_max=None,
               target_min=None):
-    variable = resource_variable_ops.ResourceVariable(init(shape))
-    if context.executing_eagerly():
-      output = variable.numpy()
-    else:
-      sess = ops.get_default_session()
-      self.evaluate(variable.initializer)
-      output = self.evaluate(variable)
+    output = self.evaluate(init(shape))
+    self.assertEqual(output.shape, shape)
     lim = 3e-2
     if target_std is not None:
       self.assertGreater(lim, abs(output.std() - target_std))
@@ -58,114 +55,126 @@ class InitializersTest(test.TestCase):
       self.assertGreater(lim, abs(output.min() - target_min))
 
   def test_uniform(self):
-    tensor_shape = (9, 6, 7)
+    shape = (9, 6, 99)
     with self.cached_session():
-      self._runner(
-          init_ops.RandomUniform(minval=-1, maxval=1, seed=124),
-          tensor_shape,
-          target_mean=0.,
-          target_max=1,
-          target_min=-1)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        self._runner(
+            init_ops.RandomUniform(minval=-1, maxval=1, seed=124),
+            tensor_shape,
+            target_mean=0.,
+            target_max=1,
+            target_min=-1)
 
   def test_normal(self):
-    tensor_shape = (8, 12, 99)
+    shape = (8, 12, 99)
     with self.cached_session():
-      self._runner(
-          init_ops.RandomNormal(mean=0, stddev=1, seed=153),
-          tensor_shape,
-          target_mean=0.,
-          target_std=1)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        self._runner(
+            init_ops.RandomNormal(mean=0, stddev=1, seed=153),
+            tensor_shape,
+            target_mean=0.,
+            target_std=1)
 
   def test_truncated_normal(self):
-    tensor_shape = (12, 99, 7)
+    shape = (12, 99, 7)
     with self.cached_session():
-      self._runner(
-          init_ops.TruncatedNormal(mean=0, stddev=1, seed=126),
-          tensor_shape,
-          target_mean=0.,
-          target_max=2,
-          target_min=-2)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        self._runner(
+            init_ops.TruncatedNormal(mean=0, stddev=1, seed=126),
+            tensor_shape,
+            target_mean=0.,
+            target_max=2,
+            target_min=-2)
 
   def test_constant(self):
-    tensor_shape = (5, 6, 4)
+    shape = (5, 6, 4)
     with self.cached_session():
-      self._runner(
-          init_ops.Constant(2),
-          tensor_shape,
-          target_mean=2,
-          target_max=2,
-          target_min=2)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        self._runner(
+            init_ops.Constant(2),
+            tensor_shape,
+            target_mean=2,
+            target_max=2,
+            target_min=2)
 
   def test_lecun_uniform(self):
-    tensor_shape = (5, 6, 4, 2)
+    shape = (5, 6, 4, 2)
     with self.cached_session():
-      fan_in, _ = init_ops._compute_fans(tensor_shape)
-      std = np.sqrt(1. / fan_in)
-      self._runner(
-          init_ops.lecun_uniform(seed=123),
-          tensor_shape,
-          target_mean=0.,
-          target_std=std)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        fan_in, _ = init_ops._compute_fans(tensor_shape)
+        std = np.sqrt(1. / fan_in)
+        self._runner(
+            init_ops.lecun_uniform(seed=123),
+            tensor_shape,
+            target_mean=0.,
+            target_std=std)
 
   def test_glorot_uniform_initializer(self):
-    tensor_shape = (5, 6, 4, 2)
+    shape = (5, 6, 4, 2)
     with self.cached_session():
-      fan_in, fan_out = init_ops._compute_fans(tensor_shape)
-      std = np.sqrt(2. / (fan_in + fan_out))
-      self._runner(
-          init_ops.glorot_uniform_initializer(seed=123),
-          tensor_shape,
-          target_mean=0.,
-          target_std=std)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        fan_in, fan_out = init_ops._compute_fans(tensor_shape)
+        std = np.sqrt(2. / (fan_in + fan_out))
+        self._runner(
+            init_ops.glorot_uniform_initializer(seed=123),
+            tensor_shape,
+            target_mean=0.,
+            target_std=std)
 
   def test_he_uniform(self):
-    tensor_shape = (5, 6, 4, 2)
+    shape = (5, 6, 4, 2)
     with self.cached_session():
-      fan_in, _ = init_ops._compute_fans(tensor_shape)
-      std = np.sqrt(2. / fan_in)
-      self._runner(
-          init_ops.he_uniform(seed=123),
-          tensor_shape,
-          target_mean=0.,
-          target_std=std)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        fan_in, _ = init_ops._compute_fans(tensor_shape)
+        std = np.sqrt(2. / fan_in)
+        self._runner(
+            init_ops.he_uniform(seed=123),
+            tensor_shape,
+            target_mean=0.,
+            target_std=std)
 
   def test_lecun_normal(self):
-    tensor_shape = (5, 6, 4, 2)
+    shape = (5, 6, 4, 2)
     with self.cached_session():
-      fan_in, _ = init_ops._compute_fans(tensor_shape)
-      std = np.sqrt(1. / fan_in)
-      self._runner(
-          init_ops.lecun_normal(seed=123),
-          tensor_shape,
-          target_mean=0.,
-          target_std=std)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        fan_in, _ = init_ops._compute_fans(tensor_shape)
+        std = np.sqrt(1. / fan_in)
+        self._runner(
+            init_ops.lecun_normal(seed=123),
+            tensor_shape,
+            target_mean=0.,
+            target_std=std)
 
   def test_glorot_normal_initializer(self):
-    tensor_shape = (5, 6, 4, 2)
+    shape = (5, 6, 4, 2)
     with self.cached_session():
-      fan_in, fan_out = init_ops._compute_fans(tensor_shape)
-      std = np.sqrt(2. / (fan_in + fan_out))
-      self._runner(
-          init_ops.glorot_normal_initializer(seed=123),
-          tensor_shape,
-          target_mean=0.,
-          target_std=std)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        fan_in, fan_out = init_ops._compute_fans(tensor_shape)
+        std = np.sqrt(2. / (fan_in + fan_out))
+        self._runner(
+            init_ops.glorot_normal_initializer(seed=123),
+            tensor_shape,
+            target_mean=0.,
+            target_std=std)
 
   def test_he_normal(self):
-    tensor_shape = (5, 6, 4, 2)
+    shape = (5, 6, 4, 2)
     with self.cached_session():
-      fan_in, _ = init_ops._compute_fans(tensor_shape)
-      std = np.sqrt(2. / fan_in)
-      self._runner(
-          init_ops.he_normal(seed=123),
-          tensor_shape,
-          target_mean=0.,
-          target_std=std)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        fan_in, _ = init_ops._compute_fans(tensor_shape)
+        std = np.sqrt(2. / fan_in)
+        self._runner(
+            init_ops.he_normal(seed=123),
+            tensor_shape,
+            target_mean=0.,
+            target_std=std)
 
   def test_Orthogonal(self):
-    tensor_shape = (20, 20)
+    shape = (20, 20)
     with self.cached_session():
-      self._runner(init_ops.Orthogonal(seed=123), tensor_shape, target_mean=0.)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        self._runner(
+            init_ops.Orthogonal(seed=123), tensor_shape, target_mean=0.)
 
   def testVariablePlacementWithOrthogonalInitializer(self):
     if not context.context().num_gpus():
@@ -203,31 +212,36 @@ class InitializersTest(test.TestCase):
 
   def test_Identity(self):
     with self.cached_session():
-      tensor_shape = (3, 4, 5)
-      with self.assertRaises(ValueError):
+      shape = (3, 4, 5)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        with self.assertRaises(ValueError):
+          self._runner(
+              init_ops.Identity(),
+              tensor_shape,
+              target_mean=1. / int(tensor_shape[0]),
+              target_max=1.)
+
+      shape = (3, 3)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
         self._runner(
             init_ops.Identity(),
             tensor_shape,
-            target_mean=1. / tensor_shape[0],
+            target_mean=1. / int(tensor_shape[0]),
             target_max=1.)
 
-      tensor_shape = (3, 3)
-      self._runner(
-          init_ops.Identity(),
-          tensor_shape,
-          target_mean=1. / tensor_shape[0],
-          target_max=1.)
-
   def test_Zeros(self):
-    tensor_shape = (4, 5)
+    shape = (4, 5)
     with self.cached_session():
-      self._runner(
-          init_ops.Zeros(), tensor_shape, target_mean=0., target_max=0.)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        self._runner(
+            init_ops.Zeros(), tensor_shape, target_mean=0., target_max=0.)
 
   def test_Ones(self):
-    tensor_shape = (4, 5)
+    shape = (4, 5)
     with self.cached_session():
-      self._runner(init_ops.Ones(), tensor_shape, target_mean=1., target_max=1.)
+      for tensor_shape in [shape, tensor_shape_lib.TensorShape(shape)]:
+        self._runner(
+            init_ops.Ones(), tensor_shape, target_mean=1., target_max=1.)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/init_ops_v2.py b/tensorflow/python/ops/init_ops_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e614e0012a279a2c4257a850579bc63577207b7
--- /dev/null
+++ b/tensorflow/python/ops/init_ops_v2.py
@@ -0,0 +1,764 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Operations often used for initializing tensors.
+
+All variable initializers returned by functions in this file should have the
+following signature:
+
+def _initializer(shape, dtype=dtypes.float32):
+  Args:
+    shape: List of `int` representing the shape of the output `Tensor`. Some
+      initializers may also be able to accept a `Tensor`.
+    dtype: (Optional) Type of the output `Tensor`.
+  Returns:
+    A `Tensor` of type `dtype` and `shape`.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_linalg_ops
+from tensorflow.python.ops import linalg_ops_impl
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+class Initializer(object):
+  """Initializer base class: all initializers inherit from this class.
+  """
+
+  def __call__(self, shape, dtype=None):
+    """Returns a tensor object initialized as specified by the initializer.
+
+    Args:
+      shape: Shape of the tensor.
+      dtype: Optional dtype of the tensor. If not provided will return tensor
+       of `tf.float32`.
+    """
+    raise NotImplementedError
+
+  def get_config(self):
+    """Returns the configuration of the initializer as a JSON-serializable dict.
+
+    Returns:
+      A JSON-serializable Python dict.
+    """
+    return {}
+
+  @classmethod
+  def from_config(cls, config):
+    """Instantiates an initializer from a configuration dictionary.
+
+    Example:
+
+    ```python
+    initializer = RandomUniform(-1, 1)
+    config = initializer.get_config()
+    initializer = RandomUniform.from_config(config)
+    ```
+
+    Args:
+      config: A Python dictionary.
+        It will typically be the output of `get_config`.
+
+    Returns:
+      An Initializer instance.
+    """
+    config.pop("dtype", None)
+    return cls(**config)
+
+
+@tf_export("zeros_initializer", v1=[])
+class Zeros(Initializer):
+  """Initializer that generates tensors initialized to 0."""
+
+  def __call__(self, shape, dtype=dtypes.float32):
+    dtype = dtypes.as_dtype(dtype)
+    return array_ops.zeros(shape, dtype)
+
+
+@tf_export("ones_initializer", v1=[])
+class Ones(Initializer):
+  """Initializer that generates tensors initialized to 1."""
+
+  def __call__(self, shape, dtype=dtypes.float32):
+    """Returns a tensor object initialized as specified by the initializer.
+
+    Args:
+      shape: Shape of the tensor.
+      dtype: Optional dtype of the tensor. Only numeric or boolean dtypes are
+       supported.
+
+    Raises:
+      ValuesError: If the dtype is not numeric or boolean.
+    """
+    dtype = dtypes.as_dtype(dtype)
+    if not dtype.is_numpy_compatible or dtype == dtypes.string:
+      raise ValueError("Expected numeric or boolean dtype, got %s." % dtype)
+    return array_ops.ones(shape, dtype)
+
+
+@tf_export("constant_initializer", v1=[])
+class Constant(Initializer):
+  """Initializer that generates tensors with constant values.
+
+  The resulting tensor is populated with values of type `dtype`, as
+  specified by arguments `value` following the desired `shape` of the
+  new tensor (see examples below).
+
+  The argument `value` can be a constant value, or a list of values of type
+  `dtype`. If `value` is a list, then the length of the list must be less
+  than or equal to the number of elements implied by the desired shape of the
+  tensor. In the case where the total number of elements in `value` is less
+  than the number of elements required by the tensor shape, the last element
+  in `value` will be used to fill the remaining entries. If the total number of
+  elements in `value` is greater than the number of elements required by the
+  tensor shape, the initializer will raise a `ValueError`.
+
+  Args:
+    value: A Python scalar, list or tuple of values, or a N-dimensional numpy
+      array. All elements of the initialized variable will be set to the
+      corresponding value in the `value` argument.
+
+  Raises:
+    TypeError: If the input `value` is not one of the expected types.
+
+  Examples:
+    The following example can be rewritten using a numpy.ndarray instead
+    of the `value` list, even reshaped, as shown in the two commented lines
+    below the `value` list initialization.
+
+  ```python
+    >>> import numpy as np
+    >>> import tensorflow as tf
+
+    >>> value = [0, 1, 2, 3, 4, 5, 6, 7]
+    >>> # value = np.array(value)
+    >>> # value = value.reshape([2, 4])
+    >>> init = tf.constant_initializer(value)
+
+    >>> print('fitting shape:')
+    >>> with tf.Session():
+    >>>   x = tf.get_variable('x', shape=[2, 4], initializer=init)
+    >>>   x.initializer.run()
+    >>>   print(x.eval())
+
+    fitting shape:
+    [[ 0.  1.  2.  3.]
+     [ 4.  5.  6.  7.]]
+
+    >>> print('larger shape:')
+    >>> with tf.Session():
+    >>>   x = tf.get_variable('x', shape=[3, 4], initializer=init)
+    >>>   x.initializer.run()
+    >>>   print(x.eval())
+
+    larger shape:
+    [[ 0.  1.  2.  3.]
+     [ 4.  5.  6.  7.]
+     [ 7.  7.  7.  7.]]
+
+    >>> print('smaller shape:')
+    >>> with tf.Session():
+    >>>   x = tf.get_variable('x', shape=[2, 3], initializer=init)
+
+    ValueError: Too many elements provided. Needed at most 6, but received 8
+  ```
+  """
+
+  def __init__(self, value=0):
+    if not (np.isscalar(value) or isinstance(value, (list, tuple, np.ndarray))):
+      raise TypeError(
+          "Invalid type for initial value: %s (expected Python scalar, list or "
+          "tuple of values, or numpy.ndarray)." % type(value))
+    self.value = value
+
+  def __call__(self, shape, dtype=None):
+    """Returns a tensor object initialized as specified by the initializer.
+
+    Args:
+      shape: Shape of the tensor.
+      dtype: Optional dtype of the tensor. If not provided the dtype of the
+       tensor created will be the type of the inital value.
+
+    Raises:
+      TypeError: If the initializer cannot create a tensor of the requested
+       dtype.
+    """
+    if dtype is not None:
+      dtype = dtypes.as_dtype(dtype)
+    return constant_op.constant(
+        self.value, dtype=dtype, shape=shape)
+
+  def get_config(self):
+    return {"value": self.value}
+
+
+@tf_export("random_uniform_initializer", v1=[])
+class RandomUniform(Initializer):
+  """Initializer that generates tensors with a uniform distribution.
+
+  Args:
+    minval: A python scalar or a scalar tensor. Lower bound of the range
+      of random values to generate.
+    maxval: A python scalar or a scalar tensor. Upper bound of the range
+      of random values to generate.  Defaults to 1 for float types.
+    seed: A Python integer. Used to create random seeds. See
+      `tf.set_random_seed`
+      for behavior.
+  """
+
+  def __init__(self, minval=-0.05, maxval=0.05, seed=None):
+    self.minval = minval
+    self.maxval = maxval
+    self.seed = seed
+
+  def __call__(self, shape, dtype=dtypes.float32):
+    """Returns a tensor object initialized as specified by the initializer.
+
+    Args:
+      shape: Shape of the tensor.
+      dtype: Optional dtype of the tensor. Only floating point and integer
+      types are supported.
+
+    Raises:
+      ValueError: If the dtype is not numeric.
+    """
+    dtype = dtypes.as_dtype(dtype)
+    if not dtype.is_floating and not dtype.is_integer:
+      raise ValueError("Expected float or integer dtype, got %s." % dtype)
+    return random_ops.random_uniform(
+        shape, self.minval, self.maxval, dtype, seed=self.seed)
+
+  def get_config(self):
+    return {
+        "minval": self.minval,
+        "maxval": self.maxval,
+        "seed": self.seed
+    }
+
+
+@tf_export("random_normal_initializer", v1=[])
+class RandomNormal(Initializer):
+  """Initializer that generates tensors with a normal distribution.
+
+  Args:
+    mean: a python scalar or a scalar tensor. Mean of the random values
+      to generate.
+    stddev: a python scalar or a scalar tensor. Standard deviation of the
+      random values to generate.
+    seed: A Python integer. Used to create random seeds. See
+      `tf.set_random_seed`
+      for behavior.
+  """
+
+  def __init__(self, mean=0.0, stddev=0.05, seed=None):
+    self.mean = mean
+    self.stddev = stddev
+    self.seed = seed
+
+  def __call__(self, shape, dtype=dtypes.float32):
+    """Returns a tensor object initialized as specified by the initializer.
+
+    Args:
+      shape: Shape of the tensor.
+      dtype: Optional dtype of the tensor. Only floating point types are
+       supported.
+
+    Raises:
+      ValueError: If the dtype is not floating point
+    """
+    dtype = _assert_float_dtype(dtype)
+    return random_ops.random_normal(
+        shape, self.mean, self.stddev, dtype, seed=self.seed)
+
+  def get_config(self):
+    return {
+        "mean": self.mean,
+        "stddev": self.stddev,
+        "seed": self.seed
+    }
+
+
+class TruncatedNormal(Initializer):
+  """Initializer that generates a truncated normal distribution.
+
+  These values are similar to values from a `random_normal_initializer`
+  except that values more than two standard deviations from the mean
+  are discarded and re-drawn. This is the recommended initializer for
+  neural network weights and filters.
+
+  Args:
+    mean: a python scalar or a scalar tensor. Mean of the random values
+      to generate.
+    stddev: a python scalar or a scalar tensor. Standard deviation of the
+      random values to generate.
+    seed: A Python integer. Used to create random seeds. See
+      `tf.set_random_seed`
+      for behavior.
+  """
+
+  def __init__(self, mean=0.0, stddev=0.05, seed=None):
+    self.mean = mean
+    self.stddev = stddev
+    self.seed = seed
+
+  def __call__(self, shape, dtype=dtypes.float32):
+    """Returns a tensor object initialized as specified by the initializer.
+
+    Args:
+      shape: Shape of the tensor.
+      dtype: Optional dtype of the tensor. Only floating point types are
+       supported.
+
+    Raises:
+      ValueError: If the dtype is not floating point
+    """
+    dtype = _assert_float_dtype(dtype)
+    return random_ops.truncated_normal(
+        shape, self.mean, self.stddev, dtype, seed=self.seed)
+
+  def get_config(self):
+    return {
+        "mean": self.mean,
+        "stddev": self.stddev,
+        "seed": self.seed
+    }
+
+
+class VarianceScaling(Initializer):
+  """Initializer capable of adapting its scale to the shape of weights tensors.
+
+  With `distribution="truncated_normal" or "untruncated_normal"`,
+  samples are drawn from a truncated/untruncated normal
+  distribution with a mean of zero and a standard deviation (after truncation,
+  if used) `stddev = sqrt(scale / n)`
+  where n is:
+    - number of input units in the weight tensor, if mode = "fan_in"
+    - number of output units, if mode = "fan_out"
+    - average of the numbers of input and output units, if mode = "fan_avg"
+
+  With `distribution="uniform"`, samples are drawn from a uniform distribution
+  within [-limit, limit], with `limit = sqrt(3 * scale / n)`.
+
+  Args:
+    scale: Scaling factor (positive float).
+    mode: One of "fan_in", "fan_out", "fan_avg".
+    distribution: Random distribution to use. One of "truncated_normal",
+      "untruncated_normal" and  "uniform".
+    seed: A Python integer. Used to create random seeds. See
+      `tf.set_random_seed`
+      for behavior.
+
+  Raises:
+    ValueError: In case of an invalid value for the "scale", mode" or
+      "distribution" arguments.
+  """
+
+  def __init__(self,
+               scale=1.0,
+               mode="fan_in",
+               distribution="truncated_normal",
+               seed=None):
+    if scale <= 0.:
+      raise ValueError("`scale` must be positive float.")
+    if mode not in {"fan_in", "fan_out", "fan_avg"}:
+      raise ValueError("Invalid `mode` argument:", mode)
+    distribution = distribution.lower()
+    if distribution not in {"uniform", "truncated_normal",
+                            "untruncated_normal"}:
+      raise ValueError("Invalid `distribution` argument:", distribution)
+    self.scale = scale
+    self.mode = mode
+    self.distribution = distribution
+    self.seed = seed
+
+  def __call__(self, shape, dtype=dtypes.float32):
+    """Returns a tensor object initialized as specified by the initializer.
+
+    Args:
+      shape: Shape of the tensor.
+      dtype: Optional dtype of the tensor. Only floating point types are
+       supported.
+
+    Raises:
+      ValueError: If the dtype is not floating point
+    """
+    partition_info = None  # Keeps logic so can be readded later if necessary
+    dtype = _assert_float_dtype(dtype)
+    scale = self.scale
+    scale_shape = shape
+    if partition_info is not None:
+      scale_shape = partition_info.full_shape
+    fan_in, fan_out = _compute_fans(scale_shape)
+    if self.mode == "fan_in":
+      scale /= max(1., fan_in)
+    elif self.mode == "fan_out":
+      scale /= max(1., fan_out)
+    else:
+      scale /= max(1., (fan_in + fan_out) / 2.)
+    if self.distribution == "truncated_normal":
+      # constant from scipy.stats.truncnorm.std(a=-2, b=2, loc=0., scale=1.)
+      stddev = math.sqrt(scale) / .87962566103423978
+      return random_ops.truncated_normal(
+          shape, 0.0, stddev, dtype, seed=self.seed)
+    elif self.distribution == "untruncated_normal":
+      stddev = math.sqrt(scale)
+      return random_ops.random_normal(
+          shape, 0.0, stddev, dtype, seed=self.seed)
+    else:
+      limit = math.sqrt(3.0 * scale)
+      return random_ops.random_uniform(
+          shape, -limit, limit, dtype, seed=self.seed)
+
+  def get_config(self):
+    return {
+        "scale": self.scale,
+        "mode": self.mode,
+        "distribution": self.distribution,
+        "seed": self.seed
+    }
+
+
+class Orthogonal(Initializer):
+  """Initializer that generates an orthogonal matrix.
+
+  If the shape of the tensor to initialize is two-dimensional, it is initialized
+  with an orthogonal matrix obtained from the QR decomposition of a matrix of
+  random numbers drawn from a normal distribution.
+  If the matrix has fewer rows than columns then the output will have orthogonal
+  rows. Otherwise, the output will have orthogonal columns.
+
+  If the shape of the tensor to initialize is more than two-dimensional,
+  a matrix of shape `(shape[0] * ... * shape[n - 2], shape[n - 1])`
+  is initialized, where `n` is the length of the shape vector.
+  The matrix is subsequently reshaped to give a tensor of the desired shape.
+
+  Args:
+    gain: multiplicative factor to apply to the orthogonal matrix
+    seed: A Python integer. Used to create random seeds. See
+      `tf.set_random_seed`
+    for behavior.
+
+  References:
+      [Saxe et al., 2014](https://openreview.net/forum?id=_wzZwKpTDF_9C)
+      ([pdf](https://arxiv.org/pdf/1312.6120.pdf))
+  """
+
+  def __init__(self, gain=1.0, seed=None):
+    self.gain = gain
+    self.seed = seed
+
+  def __call__(self, shape, dtype=dtypes.float32):
+    """Returns a tensor object initialized as specified by the initializer.
+
+    Args:
+      shape: Shape of the tensor.
+      dtype: Optional dtype of the tensor. Only floating point types are
+       supported.
+
+    Raises:
+      ValueError: If the dtype is not floating point or the input shape is not
+       valid.
+    """
+    dtype = _assert_float_dtype(dtype)
+    # Check the shape
+    if len(shape) < 2:
+      raise ValueError("The tensor to initialize must be "
+                       "at least two-dimensional")
+    # Flatten the input shape with the last dimension remaining
+    # its original shape so it works for conv2d
+    num_rows = 1
+    for dim in shape[:-1]:
+      num_rows *= dim
+    num_cols = shape[-1]
+    flat_shape = (max(num_cols, num_rows), min(num_cols, num_rows))
+
+    # Generate a random matrix
+    a = random_ops.random_normal(flat_shape, dtype=dtype, seed=self.seed)
+    # Compute the qr factorization
+    q, r = gen_linalg_ops.qr(a, full_matrices=False)
+    # Make Q uniform
+    d = array_ops.diag_part(r)
+    q *= math_ops.sign(d)
+    if num_rows < num_cols:
+      q = array_ops.matrix_transpose(q)
+    return self.gain * array_ops.reshape(q, shape)
+
+  def get_config(self):
+    return {"gain": self.gain, "seed": self.seed}
+
+
+class Identity(Initializer):
+  """Initializer that generates the identity matrix.
+
+  Only use for 2D matrices.
+
+  Args:
+    gain: Multiplicative factor to apply to the identity matrix.
+  """
+
+  def __init__(self, gain=1.0):
+    self.gain = gain
+
+  def __call__(self, shape, dtype=dtypes.float32):
+    """Returns a tensor object initialized as specified by the initializer.
+
+    Args:
+      shape: Shape of the tensor.
+      dtype: Optional dtype of the tensor. Only floating point types are
+       supported.
+
+    Raises:
+      ValueError: If the dtype is not floating point
+    """
+    partition_info = None  # Keeps logic so can be readded later if necessary
+    dtype = _assert_float_dtype(dtype)
+    full_shape = shape if partition_info is None else partition_info.full_shape
+    if len(full_shape) != 2:
+      raise ValueError(
+          "Identity matrix initializer can only be used for 2D matrices.")
+    initializer = linalg_ops_impl.eye(*full_shape, dtype=dtype)
+    if partition_info is not None:
+      initializer = array_ops.slice(initializer, partition_info.var_offset,
+                                    shape)
+    return self.gain * initializer
+
+  def get_config(self):
+    return {"gain": self.gain}
+
+
+class GlorotUniform(VarianceScaling):
+  """The Glorot uniform initializer, also called Xavier uniform initializer.
+
+  It draws samples from a uniform distribution within [-limit, limit]
+  where `limit` is `sqrt(6 / (fan_in + fan_out))`
+  where `fan_in` is the number of input units in the weight tensor
+  and `fan_out` is the number of output units in the weight tensor.
+
+  Args:
+    seed: A Python integer. Used to create random seeds. See
+      `tf.set_random_seed`
+      for behavior.
+
+  References:
+      [Glorot et al., 2010](http://proceedings.mlr.press/v9/glorot10a.html)
+      ([pdf](http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf))
+  """
+
+  def __init__(self, seed=None):
+    super(GlorotUniform, self).__init__(
+        scale=1.0,
+        mode="fan_avg",
+        distribution="uniform",
+        seed=seed)
+
+  def get_config(self):
+    return {"seed": self.seed}
+
+
+class GlorotNormal(VarianceScaling):
+  """The Glorot normal initializer, also called Xavier normal initializer.
+
+  It draws samples from a truncated normal distribution centered on 0
+  with `stddev = sqrt(2 / (fan_in + fan_out))`
+  where `fan_in` is the number of input units in the weight tensor
+  and `fan_out` is the number of output units in the weight tensor.
+
+  Args:
+    seed: A Python integer. Used to create random seeds. See
+      `tf.set_random_seed` for behavior.
+
+  References:
+      [Glorot et al., 2010](http://proceedings.mlr.press/v9/glorot10a.html)
+      ([pdf](http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf))
+  """
+
+  def __init__(self, seed=None):
+    super(GlorotNormal, self).__init__(
+        scale=1.0,
+        mode="fan_avg",
+        distribution="truncated_normal",
+        seed=seed)
+
+  def get_config(self):
+    return {"seed": self.seed, "dtype": self.dtype.name}
+
+
+# Aliases.
+
+# pylint: disable=invalid-name
+zeros_initializer = Zeros
+ones_initializer = Ones
+constant_initializer = Constant
+random_uniform_initializer = RandomUniform
+random_normal_initializer = RandomNormal
+truncated_normal_initializer = TruncatedNormal
+variance_scaling_initializer = VarianceScaling
+glorot_uniform_initializer = GlorotUniform
+glorot_normal_initializer = GlorotNormal
+orthogonal_initializer = Orthogonal
+identity_initializer = Identity
+# pylint: enable=invalid-name
+
+
+def lecun_normal(seed=None):
+  """LeCun normal initializer.
+
+  It draws samples from a truncated normal distribution centered on 0
+  with `stddev = sqrt(1 / fan_in)`
+  where `fan_in` is the number of input units in the weight tensor.
+
+  Arguments:
+      seed: A Python integer. Used to seed the random generator.
+
+  Returns:
+      An initializer.
+
+  References:
+      - Self-Normalizing Neural Networks,
+      [Klambauer et al., 2017]
+      (https://papers.nips.cc/paper/6698-self-normalizing-neural-networks)
+      ([pdf]
+      (https://papers.nips.cc/paper/6698-self-normalizing-neural-networks.pdf))
+      - Efficient Backprop,
+      [Lecun et al., 1998](http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf)
+  """
+  return VarianceScaling(
+      scale=1., mode="fan_in", distribution="truncated_normal", seed=seed)
+
+
+def lecun_uniform(seed=None):
+  """LeCun uniform initializer.
+
+  It draws samples from a uniform distribution within [-limit, limit]
+  where `limit` is `sqrt(3 / fan_in)`
+  where `fan_in` is the number of input units in the weight tensor.
+
+  Arguments:
+      seed: A Python integer. Used to seed the random generator.
+
+  Returns:
+      An initializer.
+
+  References:
+      - Self-Normalizing Neural Networks,
+      [Klambauer et al., 2017](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks) # pylint: disable=line-too-long
+      ([pdf](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks.pdf))
+      - Efficient Backprop,
+      [Lecun et al., 1998](http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf)
+  """
+  return VarianceScaling(
+      scale=1., mode="fan_in", distribution="uniform", seed=seed)
+
+
+def he_normal(seed=None):
+  """He normal initializer.
+
+  It draws samples from a truncated normal distribution centered on 0
+  with `stddev = sqrt(2 / fan_in)`
+  where `fan_in` is the number of input units in the weight tensor.
+
+  Arguments:
+      seed: A Python integer. Used to seed the random generator.
+
+  Returns:
+      An initializer.
+
+  References:
+      [He et al., 2015](https://www.cv-foundation.org/openaccess/content_iccv_2015/html/He_Delving_Deep_into_ICCV_2015_paper.html) # pylint: disable=line-too-long
+      ([pdf](https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/He_Delving_Deep_into_ICCV_2015_paper.pdf))
+  """
+  return VarianceScaling(
+      scale=2., mode="fan_in", distribution="truncated_normal", seed=seed)
+
+
+def he_uniform(seed=None):
+  """He uniform variance scaling initializer.
+
+  It draws samples from a uniform distribution within [-limit, limit]
+  where `limit` is `sqrt(6 / fan_in)`
+  where `fan_in` is the number of input units in the weight tensor.
+
+  Arguments:
+      seed: A Python integer. Used to seed the random generator.
+
+  Returns:
+      An initializer.
+
+  References:
+      [He et al., 2015](https://www.cv-foundation.org/openaccess/content_iccv_2015/html/He_Delving_Deep_into_ICCV_2015_paper.html) # pylint: disable=line-too-long
+      ([pdf](https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/He_Delving_Deep_into_ICCV_2015_paper.pdf))
+  """
+  return VarianceScaling(
+      scale=2., mode="fan_in", distribution="uniform", seed=seed)
+
+
+# Utility functions.
+
+
+def _compute_fans(shape):
+  """Computes the number of input and output units for a weight shape.
+
+  Args:
+    shape: Integer shape tuple or TF tensor shape.
+
+  Returns:
+    A tuple of scalars (fan_in, fan_out).
+  """
+  if len(shape) < 1:  # Just to avoid errors for constants.
+    fan_in = fan_out = 1
+  elif len(shape) == 1:
+    fan_in = fan_out = shape[0]
+  elif len(shape) == 2:
+    fan_in = shape[0]
+    fan_out = shape[1]
+  else:
+    # Assuming convolution kernels (2D, 3D, or more).
+    # kernel shape: (..., input_depth, depth)
+    receptive_field_size = 1.
+    for dim in shape[:-2]:
+      receptive_field_size *= dim
+    fan_in = shape[-2] * receptive_field_size
+    fan_out = shape[-1] * receptive_field_size
+  return fan_in, fan_out
+
+
+def _assert_float_dtype(dtype):
+  """Validate and return floating point type based on `dtype`.
+
+  `dtype` must be a floating point type.
+
+  Args:
+    dtype: The data type to validate.
+
+  Returns:
+    Validated type.
+
+  Raises:
+    ValueError: if `dtype` is not a floating point type.
+  """
+  dtype = dtypes.as_dtype(dtype)
+  if not dtype.is_floating:
+    raise ValueError("Expected floating point type, got %s." % dtype)
+  return dtype
diff --git a/tensorflow/python/ops/init_ops_v2_test.py b/tensorflow/python/ops/init_ops_v2_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..fceba1d04a25867a835e398889748bb3c2d3de3b
--- /dev/null
+++ b/tensorflow/python/ops/init_ops_v2_test.py
@@ -0,0 +1,512 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for initializers in init_ops_v2."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops_v2
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class InitializersTest(test.TestCase):
+
+  def _identical_test(self,
+                      init1,
+                      init2,
+                      assertion,
+                      shape=None,
+                      dtype=dtypes.float32):
+    if shape is None:
+      shape = [100]
+    t1 = self.evaluate(init1(shape, dtype))
+    t2 = self.evaluate(init2(shape, dtype))
+    self.assertEqual(tensor_shape.as_shape(shape), t1.shape)
+    self.assertEqual(tensor_shape.as_shape(shape), t2.shape)
+    self.assertEqual(assertion, np.allclose(t1, t2, rtol=1e-15, atol=1e-15))
+
+  def _duplicated_test(self,
+                       init,
+                       shape=None,
+                       dtype=dtypes.float32):
+    if shape is None:
+      shape = [100]
+    t1 = self.evaluate(init(shape, dtype))
+    t2 = self.evaluate(init(shape, dtype))
+    self.assertEqual(tensor_shape.as_shape(shape), t1.shape)
+    self.assertEqual(tensor_shape.as_shape(shape), t2.shape)
+    self.assertFalse(np.allclose(t1, t2, rtol=1e-15, atol=1e-15))
+
+  def _range_test(self,
+                  init,
+                  shape,
+                  target_mean=None,
+                  target_std=None,
+                  target_max=None,
+                  target_min=None):
+    output = self.evaluate(init(shape))
+    self.assertEqual(output.shape, shape)
+    lim = 3e-2
+    if target_std is not None:
+      self.assertGreater(lim, abs(output.std() - target_std))
+    if target_mean is not None:
+      self.assertGreater(lim, abs(output.mean() - target_mean))
+    if target_max is not None:
+      self.assertGreater(lim, abs(output.max() - target_max))
+    if target_min is not None:
+      self.assertGreater(lim, abs(output.min() - target_min))
+
+
+class ConstantInitializersTest(InitializersTest):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testZeros(self):
+    self._range_test(init_ops_v2.Zeros(), shape=(4, 5),
+                     target_mean=0., target_max=0.)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testOnes(self):
+    self._range_test(init_ops_v2.Ones(), shape=(4, 5),
+                     target_mean=1., target_max=1.)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testConstantInt(self):
+    self._range_test(
+        init_ops_v2.Constant(2),
+        shape=(5, 6, 4),
+        target_mean=2,
+        target_max=2,
+        target_min=2)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testConstantTuple(self):
+    init = init_ops_v2.constant_initializer((10, 20, 30))
+    tensor = init(shape=[3])
+    self.assertAllEqual(self.evaluate(tensor), [10, 20, 30])
+    self.assertEqual(tensor.shape, [3])
+
+  @test_util.run_in_graph_and_eager_modes
+  def testConstantInvalidValue(self):
+    c = constant_op.constant([1.0, 2.0, 3.0])
+    with self.assertRaisesRegexp(
+        TypeError, r"Invalid type for initial value: .*Tensor.*"):
+      init_ops_v2.constant_initializer(c)
+    v = variables.Variable([3.0, 2.0, 1.0])
+    with self.assertRaisesRegexp(
+        TypeError, r"Invalid type for initial value: .*Variable.*"):
+      init_ops_v2.constant_initializer(v)
+
+  def _testNDimConstantInitializer(self, value, shape, expected):
+    with test_util.use_gpu():
+      init = init_ops_v2.constant_initializer(value)
+      x = init(shape)
+
+      actual = self.evaluate(array_ops.reshape(x, [-1]))
+      self.assertEqual(len(actual), len(expected))
+      for a, e in zip(actual, expected):
+        self.assertEqual(a, e)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testNDimConstantInitializer(self):
+    value = [0, 1, 2, 3, 4, 5]
+    shape = [2, 3]
+    expected = list(value)
+
+    self._testNDimConstantInitializer(value, shape, expected)
+    self._testNDimConstantInitializer(np.asarray(value), shape, expected)
+    self._testNDimConstantInitializer(np.asarray(value).reshape(tuple(shape)),
+                                      shape, expected)
+
+  def _testNDimConstantInitializerIncorrectNumberValues(self, value, shape):
+    with test_util.use_gpu():
+      init = init_ops_v2.constant_initializer(value)
+      self.assertRaises(TypeError,
+                        init,
+                        shape=shape)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testNDimConstantInitializerIncorrectNumberValues(self):
+    value = [0, 1, 2, 3, 4, 5]
+
+    for shape in [[2, 4], [2, 2]]:
+      self._testNDimConstantInitializerIncorrectNumberValues(value, shape)
+      self._testNDimConstantInitializerIncorrectNumberValues(np.asarray(value),
+                                                             shape)
+      self._testNDimConstantInitializerIncorrectNumberValues(
+          np.asarray(value).reshape(tuple([2, 3])), shape)
+
+
+class RandomUniformInitializerTest(InitializersTest):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testRangeInitializer(self):
+    shape = (9, 6, 7)
+    self._range_test(
+        init_ops_v2.RandomUniform(minval=-1, maxval=1, seed=124),
+        shape,
+        target_mean=0.,
+        target_max=1,
+        target_min=-1)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testInitializerIdentical(self):
+    self.skipTest("Doesn't work without the graphs")
+    init1 = init_ops_v2.RandomUniform(0, 7, seed=1)
+    init2 = init_ops_v2.RandomUniform(0, 7, seed=1)
+    self._identical_test(init1, init2, True)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testInitializerDifferent(self):
+    init1 = init_ops_v2.RandomUniform(0, 7, seed=1)
+    init2 = init_ops_v2.RandomUniform(0, 7, seed=2)
+    self._identical_test(init1, init2, False)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDuplicatedInitializer(self):
+    init = init_ops_v2.RandomUniform(0.0, 1.0)
+    self._duplicated_test(init)
+
+
+class RandomNormalInitializerTest(InitializersTest):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testRangeInitializer(self):
+    self._range_test(
+        init_ops_v2.RandomNormal(mean=0, stddev=1, seed=153),
+        shape=(8, 12, 99),
+        target_mean=0.,
+        target_std=1)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testInitializerIdentical(self):
+    self.skipTest("Doesn't work without the graphs")
+    init1 = init_ops_v2.RandomNormal(0, 7, seed=1)
+    init2 = init_ops_v2.RandomNormal(0, 7, seed=1)
+    self._identical_test(init1, init2, True)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testInitializerDifferent(self):
+    init1 = init_ops_v2.RandomNormal(0, 7, seed=1)
+    init2 = init_ops_v2.RandomNormal(0, 7, seed=2)
+    self._identical_test(init1, init2, False)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDuplicatedInitializer(self):
+    init = init_ops_v2.RandomNormal(0.0, 1.0)
+    self._duplicated_test(init)
+
+
+class TruncatedNormalInitializerTest(InitializersTest):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testRangeInitializer(self):
+    self._range_test(
+        init_ops_v2.TruncatedNormal(mean=0, stddev=1, seed=126),
+        shape=(12, 99, 7),
+        target_mean=0.,
+        target_max=2,
+        target_min=-2)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testInitializerIdentical(self):
+    self.skipTest("Not seeming to work in Eager mode")
+    init1 = init_ops_v2.TruncatedNormal(0.0, 1.0, seed=1)
+    init2 = init_ops_v2.TruncatedNormal(0.0, 1.0, seed=1)
+    self._identical_test(init1, init2, True)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testInitializerDifferent(self):
+    init1 = init_ops_v2.TruncatedNormal(0.0, 1.0, seed=1)
+    init2 = init_ops_v2.TruncatedNormal(0.0, 1.0, seed=2)
+    self._identical_test(init1, init2, False)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDuplicatedInitializer(self):
+    init = init_ops_v2.TruncatedNormal(0.0, 1.0)
+    self._duplicated_test(init)
+
+  def testInvalidDataType(self):
+    init = init_ops_v2.TruncatedNormal(0.0, 1.0)
+    with self.assertRaises(ValueError):
+      init([1], dtype=dtypes.int32)
+
+
+class VarianceScalingInitializerTest(InitializersTest):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testTruncatedNormalDistribution(self):
+    shape = [100, 100]
+    expect_mean = 0.
+    expect_var = 1. / shape[0]
+    init = init_ops_v2.VarianceScaling(distribution="truncated_normal")
+
+    with test_util.use_gpu(), test.mock.patch.object(
+        random_ops, "truncated_normal",
+        wraps=random_ops.truncated_normal) as mock_truncated_normal:
+      x = self.evaluate(init(shape))
+      self.assertTrue(mock_truncated_normal.called)
+
+    self.assertNear(np.mean(x), expect_mean, err=1e-2)
+    self.assertNear(np.var(x), expect_var, err=1e-2)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testNormalDistribution(self):
+    shape = [100, 100]
+    expect_mean = 0.
+    expect_var = 1. / shape[0]
+    init = init_ops_v2.VarianceScaling(distribution="truncated_normal")
+
+    with test_util.use_gpu(), test.mock.patch.object(
+        random_ops, "truncated_normal",
+        wraps=random_ops.truncated_normal) as mock_truncated_normal:
+      x = self.evaluate(init(shape))
+      self.assertTrue(mock_truncated_normal.called)
+
+    self.assertNear(np.mean(x), expect_mean, err=1e-2)
+    self.assertNear(np.var(x), expect_var, err=1e-2)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testUntruncatedNormalDistribution(self):
+    shape = [100, 100]
+    expect_mean = 0.
+    expect_var = 1. / shape[0]
+    init = init_ops_v2.VarianceScaling(
+        distribution="untruncated_normal")
+
+    with test_util.use_gpu(), test.mock.patch.object(
+        random_ops, "random_normal",
+        wraps=random_ops.random_normal) as mock_random_normal:
+      x = self.evaluate(init(shape))
+      self.assertTrue(mock_random_normal.called)
+
+    self.assertNear(np.mean(x), expect_mean, err=1e-2)
+    self.assertNear(np.var(x), expect_var, err=1e-2)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testUniformDistribution(self):
+    shape = [100, 100]
+    expect_mean = 0.
+    expect_var = 1. / shape[0]
+    init = init_ops_v2.VarianceScaling(distribution="uniform")
+
+    with test_util.use_gpu():
+      x = self.evaluate(init(shape))
+
+    self.assertNear(np.mean(x), expect_mean, err=1e-2)
+    self.assertNear(np.var(x), expect_var, err=1e-2)
+
+
+class OrthogonalInitializerTest(InitializersTest):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testRangeInitializer(self):
+    self._range_test(init_ops_v2.Orthogonal(seed=123), shape=(20, 20),
+                     target_mean=0.)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testInitializerIdentical(self):
+    self.skipTest("Doesn't work without the graphs")
+    init1 = init_ops_v2.Orthogonal(seed=1)
+    init2 = init_ops_v2.Orthogonal(seed=1)
+    self._identical_test(init1, init2, True, (10, 10))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testInitializerDifferent(self):
+    init1 = init_ops_v2.Orthogonal(seed=1)
+    init2 = init_ops_v2.Orthogonal(seed=2)
+    self._identical_test(init1, init2, False, (10, 10))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDuplicatedInitializer(self):
+    init = init_ops_v2.Orthogonal()
+    self._duplicated_test(init, (10, 10))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testInvalidDataType(self):
+    init = init_ops_v2.Orthogonal()
+    self.assertRaises(ValueError, init, shape=(10, 10), dtype=dtypes.string)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testInvalidShape(self):
+    init = init_ops_v2.Orthogonal()
+    with test_util.use_gpu():
+      self.assertRaises(ValueError, init, shape=[5])
+
+  @test_util.run_in_graph_and_eager_modes
+  def testGain(self):
+    self.skipTest("Doesn't work without the graphs")
+    init1 = init_ops_v2.Orthogonal(seed=1)
+    init2 = init_ops_v2.Orthogonal(gain=3.14, seed=1)
+    with test_util.use_gpu():
+      t1 = self.evaluate(init1(shape=(10, 10)))
+      t2 = self.evaluate(init2(shape=(10, 10)))
+    self.assertAllClose(t1, t2 / 3.14)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testShapesValues(self):
+    for shape in [(10, 10), (10, 9, 8), (100, 5, 5), (50, 40), (40, 50)]:
+      init = init_ops_v2.Orthogonal()
+      tol = 1e-5
+      with test_util.use_gpu():
+        # Check the shape
+        t = self.evaluate(init(shape))
+        self.assertAllEqual(shape, t.shape)
+        # Check orthogonality by computing the inner product
+        t = t.reshape((np.prod(t.shape[:-1]), t.shape[-1]))
+        if t.shape[0] > t.shape[1]:
+          self.assertAllClose(
+              np.dot(t.T, t), np.eye(t.shape[1]), rtol=tol, atol=tol)
+        else:
+          self.assertAllClose(
+              np.dot(t, t.T), np.eye(t.shape[0]), rtol=tol, atol=tol)
+
+
+class IdentityInitializerTest(InitializersTest):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testRange(self):
+    with self.assertRaises(ValueError):
+      shape = (3, 4, 5)
+      self._range_test(
+          init_ops_v2.Identity(),
+          shape=shape,
+          target_mean=1. / shape[0],
+          target_max=1.)
+
+    shape = (3, 3)
+    self._range_test(
+        init_ops_v2.Identity(),
+        shape=shape,
+        target_mean=1. / shape[0],
+        target_max=1.)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testInvalidDataType(self):
+    init = init_ops_v2.Identity()
+    self.assertRaises(ValueError, init, shape=[10, 5], dtype=dtypes.int32)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testInvalidShape(self):
+    init = init_ops_v2.Identity()
+    with test_util.use_gpu():
+      self.assertRaises(ValueError, init, shape=[5, 7, 7])
+      self.assertRaises(ValueError, init, shape=[5])
+      self.assertRaises(ValueError, init, shape=[])
+
+  @test_util.run_in_graph_and_eager_modes
+  def testNonSquare(self):
+    init = init_ops_v2.Identity()
+    shape = (10, 5)
+    with test_util.use_gpu():
+      self.assertAllClose(self.evaluate(init(shape)), np.eye(*shape))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testGain(self):
+    shape = (10, 10)
+    for dtype in [dtypes.float32, dtypes.float64]:
+      init_default = init_ops_v2.Identity()
+      init_custom = init_ops_v2.Identity(gain=0.9)
+      with test_util.use_gpu():
+        self.assertAllClose(self.evaluate(init_default(shape, dtype=dtype)),
+                            np.eye(*shape))
+      with test_util.use_gpu():
+        self.assertAllClose(self.evaluate(init_custom(shape, dtype=dtype)),
+                            np.eye(*shape) * 0.9)
+
+
+class GlorotInitializersTest(InitializersTest):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testGlorotUniform(self):
+    shape = (5, 6, 4, 2)
+    fan_in, fan_out = init_ops_v2._compute_fans(shape)
+    std = np.sqrt(2. / (fan_in + fan_out))
+    self._range_test(
+        init_ops_v2.GlorotUniform(seed=123),
+        shape,
+        target_mean=0.,
+        target_std=std)
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_GlorotNormal(self):
+    shape = (5, 6, 4, 2)
+    fan_in, fan_out = init_ops_v2._compute_fans(shape)
+    std = np.sqrt(2. / (fan_in + fan_out))
+    self._range_test(
+        init_ops_v2.GlorotNormal(seed=123),
+        shape,
+        target_mean=0.,
+        target_std=std)
+
+
+class MethodInitializers(InitializersTest):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testLecunUniform(self):
+    shape = (5, 6, 4, 2)
+    fan_in, _ = init_ops_v2._compute_fans(shape)
+    std = np.sqrt(1. / fan_in)
+    self._range_test(
+        init_ops_v2.lecun_uniform(seed=123),
+        shape,
+        target_mean=0.,
+        target_std=std)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testHeUniform(self):
+    shape = (5, 6, 4, 2)
+    fan_in, _ = init_ops_v2._compute_fans(shape)
+    std = np.sqrt(2. / fan_in)
+    self._range_test(
+        init_ops_v2.he_uniform(seed=123),
+        shape,
+        target_mean=0.,
+        target_std=std)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testLecunNormal(self):
+    shape = (5, 6, 4, 2)
+    fan_in, _ = init_ops_v2._compute_fans(shape)
+    std = np.sqrt(1. / fan_in)
+    self._range_test(
+        init_ops_v2.lecun_normal(seed=123),
+        shape,
+        target_mean=0.,
+        target_std=std)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testHeNormal(self):
+    shape = (5, 6, 4, 2)
+    fan_in, _ = init_ops_v2._compute_fans(shape)
+    std = np.sqrt(2. / fan_in)
+    self._range_test(
+        init_ops_v2.he_normal(seed=123),
+        shape,
+        target_mean=0.,
+        target_std=std)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/ops/linalg/adjoint_registrations.py b/tensorflow/python/ops/linalg/adjoint_registrations.py
new file mode 100644
index 0000000000000000000000000000000000000000..59ec97d2758f80aaa90c52646430b0d9c5e642bd
--- /dev/null
+++ b/tensorflow/python/ops/linalg/adjoint_registrations.py
@@ -0,0 +1,127 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Registrations for LinearOperator.adjoint."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.linalg import linear_operator
+from tensorflow.python.ops.linalg import linear_operator_adjoint
+from tensorflow.python.ops.linalg import linear_operator_algebra
+from tensorflow.python.ops.linalg import linear_operator_block_diag
+from tensorflow.python.ops.linalg import linear_operator_circulant
+from tensorflow.python.ops.linalg import linear_operator_diag
+from tensorflow.python.ops.linalg import linear_operator_identity
+from tensorflow.python.ops.linalg import linear_operator_kronecker
+
+
+# By default, return LinearOperatorAdjoint which switched the .matmul
+# and .solve methods.
+@linear_operator_algebra.RegisterAdjoint(linear_operator.LinearOperator)
+def _adjoint_linear_operator(linop):
+  return linear_operator_adjoint.LinearOperatorAdjoint(
+      linop,
+      is_non_singular=linop.is_non_singular,
+      is_self_adjoint=linop.is_self_adjoint,
+      is_positive_definite=linop.is_positive_definite,
+      is_square=linop.is_square)
+
+
+@linear_operator_algebra.RegisterAdjoint(
+    linear_operator_adjoint.LinearOperatorAdjoint)
+def _adjoint_adjoint_linear_operator(linop):
+  return linop.operator
+
+
+@linear_operator_algebra.RegisterAdjoint(
+    linear_operator_identity.LinearOperatorIdentity)
+def _adjoint_identity(identity_operator):
+  return identity_operator
+
+
+@linear_operator_algebra.RegisterAdjoint(
+    linear_operator_identity.LinearOperatorScaledIdentity)
+def _adjoint_scaled_identity(identity_operator):
+  multiplier = identity_operator.multiplier
+  if multiplier.dtype.is_complex:
+    multiplier = math_ops.conj(multiplier)
+
+  return linear_operator_identity.LinearOperatorScaledIdentity(
+      num_rows=identity_operator._num_rows,  # pylint: disable=protected-access
+      multiplier=multiplier,
+      is_non_singular=identity_operator.is_non_singular,
+      is_self_adjoint=identity_operator.is_self_adjoint,
+      is_positive_definite=identity_operator.is_positive_definite,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterAdjoint(
+    linear_operator_diag.LinearOperatorDiag)
+def _adjoint_diag(diag_operator):
+  diag = diag_operator.diag
+  if diag.dtype.is_complex:
+    diag = math_ops.conj(diag)
+
+  return linear_operator_diag.LinearOperatorDiag(
+      diag=diag,
+      is_non_singular=diag_operator.is_non_singular,
+      is_self_adjoint=diag_operator.is_self_adjoint,
+      is_positive_definite=diag_operator.is_positive_definite,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterAdjoint(
+    linear_operator_block_diag.LinearOperatorBlockDiag)
+def _adjoint_block_diag(block_diag_operator):
+    # We take the adjoint of each block on the diagonal.
+  return linear_operator_block_diag.LinearOperatorBlockDiag(
+      operators=[
+          operator.adjoint() for operator in block_diag_operator.operators],
+      is_non_singular=block_diag_operator.is_non_singular,
+      is_self_adjoint=block_diag_operator.is_self_adjoint,
+      is_positive_definite=block_diag_operator.is_positive_definite,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterAdjoint(
+    linear_operator_kronecker.LinearOperatorKronecker)
+def _adjoint_kronecker(kronecker_operator):
+    # Adjoint of a Kronecker product is the Kronecker product
+    # of adjoints.
+  return linear_operator_kronecker.LinearOperatorKronecker(
+      operators=[
+          operator.adjoint() for operator in kronecker_operator.operators],
+      is_non_singular=kronecker_operator.is_non_singular,
+      is_self_adjoint=kronecker_operator.is_self_adjoint,
+      is_positive_definite=kronecker_operator.is_positive_definite,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterAdjoint(
+    linear_operator_circulant.LinearOperatorCirculant)
+def _adjoint_circulant(circulant_operator):
+  spectrum = circulant_operator.spectrum
+  if spectrum.dtype.is_complex:
+    spectrum = math_ops.conj(spectrum)
+
+  # Conjugating the spectrum is sufficient to get the adjoint.
+  return linear_operator_circulant.LinearOperatorCirculant(
+      spectrum=spectrum,
+      is_non_singular=circulant_operator.is_non_singular,
+      is_self_adjoint=circulant_operator.is_self_adjoint,
+      is_positive_definite=circulant_operator.is_positive_definite,
+      is_square=True)
diff --git a/tensorflow/python/ops/linalg/inverse_registrations.py b/tensorflow/python/ops/linalg/inverse_registrations.py
new file mode 100644
index 0000000000000000000000000000000000000000..12d1e7554cd59bf0f6d2754865090cf67e831da1
--- /dev/null
+++ b/tensorflow/python/ops/linalg/inverse_registrations.py
@@ -0,0 +1,114 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Registrations for LinearOperator.inverse."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops.linalg import linear_operator
+from tensorflow.python.ops.linalg import linear_operator_algebra
+from tensorflow.python.ops.linalg import linear_operator_block_diag
+from tensorflow.python.ops.linalg import linear_operator_circulant
+from tensorflow.python.ops.linalg import linear_operator_diag
+from tensorflow.python.ops.linalg import linear_operator_identity
+from tensorflow.python.ops.linalg import linear_operator_inversion
+from tensorflow.python.ops.linalg import linear_operator_kronecker
+
+
+# By default, return LinearOperatorInversion which switched the .matmul
+# and .solve methods.
+@linear_operator_algebra.RegisterInverse(linear_operator.LinearOperator)
+def _inverse_linear_operator(linop):
+  return linear_operator_inversion.LinearOperatorInversion(
+      linop,
+      is_non_singular=linop.is_non_singular,
+      is_self_adjoint=linop.is_self_adjoint,
+      is_positive_definite=linop.is_positive_definite,
+      is_square=linop.is_square)
+
+
+@linear_operator_algebra.RegisterInverse(
+    linear_operator_inversion.LinearOperatorInversion)
+def _inverse_inverse_linear_operator(linop_inversion):
+  return linop_inversion.operator
+
+
+@linear_operator_algebra.RegisterInverse(
+    linear_operator_diag.LinearOperatorDiag)
+def _inverse_diag(diag_operator):
+  return linear_operator_diag.LinearOperatorDiag(
+      1. / diag_operator.diag,
+      is_non_singular=diag_operator.is_non_singular,
+      is_self_adjoint=diag_operator.is_self_adjoint,
+      is_positive_definite=diag_operator.is_positive_definite,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterInverse(
+    linear_operator_identity.LinearOperatorIdentity)
+def _inverse_identity(identity_operator):
+  return identity_operator
+
+
+@linear_operator_algebra.RegisterInverse(
+    linear_operator_identity.LinearOperatorScaledIdentity)
+def _inverse_scaled_identity(identity_operator):
+  return linear_operator_identity.LinearOperatorScaledIdentity(
+      num_rows=identity_operator._num_rows,  # pylint: disable=protected-access
+      multiplier=1. / identity_operator.multiplier,
+      is_non_singular=identity_operator.is_non_singular,
+      is_self_adjoint=True,
+      is_positive_definite=identity_operator.is_positive_definite,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterInverse(
+    linear_operator_block_diag.LinearOperatorBlockDiag)
+def _inverse_block_diag(block_diag_operator):
+  # We take the inverse of each block on the diagonal.
+  return linear_operator_block_diag.LinearOperatorBlockDiag(
+      operators=[
+          operator.inverse() for operator in block_diag_operator.operators],
+      is_non_singular=block_diag_operator.is_non_singular,
+      is_self_adjoint=block_diag_operator.is_self_adjoint,
+      is_positive_definite=block_diag_operator.is_positive_definite,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterInverse(
+    linear_operator_kronecker.LinearOperatorKronecker)
+def _inverse_kronecker(kronecker_operator):
+  # Inverse decomposition of a Kronecker product is the Kronecker product
+  # of inverse decompositions.
+  return linear_operator_kronecker.LinearOperatorKronecker(
+      operators=[
+          operator.inverse() for operator in kronecker_operator.operators],
+      is_non_singular=kronecker_operator.is_non_singular,
+      is_self_adjoint=kronecker_operator.is_self_adjoint,
+      is_positive_definite=kronecker_operator.is_positive_definite,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterInverse(
+    linear_operator_circulant.LinearOperatorCirculant)
+def _inverse_circulant(circulant_operator):
+  # Inverting the spectrum is sufficient to get the inverse.
+  return linear_operator_circulant.LinearOperatorCirculant(
+      spectrum=1. / circulant_operator.spectrum,
+      is_non_singular=circulant_operator.is_non_singular,
+      is_self_adjoint=circulant_operator.is_self_adjoint,
+      is_positive_definite=circulant_operator.is_positive_definite,
+      is_square=True)
diff --git a/tensorflow/python/ops/linalg/linalg.py b/tensorflow/python/ops/linalg/linalg.py
index ac4fd4ebc6059a187828c757c852a470d8ee69a8..b9f8411c934aabfa30de2d684d5afcb354401509 100644
--- a/tensorflow/python/ops/linalg/linalg.py
+++ b/tensorflow/python/ops/linalg/linalg.py
@@ -20,7 +20,9 @@ from __future__ import print_function
 
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import,unused-import
+from tensorflow.python.ops.linalg import adjoint_registrations as _adjoint_registrations
 from tensorflow.python.ops.linalg import cholesky_registrations as _cholesky_registrations
+from tensorflow.python.ops.linalg import inverse_registrations as _inverse_registrations
 from tensorflow.python.ops.linalg import linear_operator_algebra as _linear_operator_algebra
 from tensorflow.python.ops.linalg import matmul_registrations as _matmul_registrations
 from tensorflow.python.ops.linalg.linalg_impl import *
diff --git a/tensorflow/python/ops/linalg/linalg_impl.py b/tensorflow/python/ops/linalg/linalg_impl.py
index df2bd887cdde6f651db572c2bdfebd2bc0170716..fec2b2713e3709d9104204412a8c52fd062e9336 100644
--- a/tensorflow/python/ops/linalg/linalg_impl.py
+++ b/tensorflow/python/ops/linalg/linalg_impl.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_linalg_ops
@@ -104,6 +105,7 @@ def adjoint(matrix, name=None):
   tf.linalg.adjoint(x)  # [[1 - 1j, 4 - 4j],
                         #  [2 - 2j, 5 - 5j],
                         #  [3 - 3j, 6 - 6j]]
+  ```
 
   Args:
     matrix:  A `Tensor`. Must be `float16`, `float32`, `float64`, `complex64`,
@@ -328,3 +330,189 @@ def matrix_exponential(input, name=None):  # pylint: disable=redefined-builtin
           result,
           array_ops.concat((batch_shape, array_ops.shape(result)[-2:]), axis=0))
     return array_ops.reshape(result, batch_shape.concatenate(result.shape[-2:]))
+
+
+@tf_export('linalg.tridiagonal_solve')
+def tridiagonal_solve(diagonals,
+                      rhs,
+                      diagonals_format='compact',
+                      transpose_rhs=False,
+                      conjugate_rhs=False,
+                      name=None):
+  r"""Solves tridiagonal systems of equations.
+
+  Solution is computed via Gaussian elemination with partial pivoting.
+
+  The input can be supplied in various formats: `matrix`, `tuple` and `compact`,
+  specified by the `diagonals_format` arg.
+
+  In `matrix` format, `diagonals` must be a tensor of shape `[..., M, M]`, with
+  two inner-most dimensions representing the square tridiagonal matrices.
+  Elements outside of the three diagonals will be ignored.
+
+  In `sequence` format, `diagonals` are supplied as a tuple or list of three
+  tensors of shapes `[..., N]`, `[..., M]`, `[..., N]` representing
+  superdiagonals, diagonals, and subdiagonals, respectively. `N` can be either
+  `M-1` or `M`; in the latter case, the last element of superdiagonal and the
+  first element of subdiagonal will be ignored.
+
+  In `compact` format the three diagonals are brought together into one tensor
+  of shape `[..., 3, M]`, with last two dimensions containing superdiagonals,
+  diagonals, and subdiagonals, in order. Similarly to `sequence` format,
+  elements `diagonals[..., 0, M-1]` and `diagonals[..., 2, 0]` are ignored.
+
+  The `compact` format is recommended as the one with best performance. In case
+  you need to cast a tensor into a compact format manually, use `tf.gather_nd`.
+  An example for a tensor of shape [m, m]:
+
+  ```python
+  rhs = tf.constant([...])
+  matrix = tf.constant([[...]])
+  m = matrix.shape[0]
+  dummy_idx = [0, 0]  # An arbitrary element to use as a dummy
+  indices = [[[i, i + 1] for i in range(m - 1)] + [dummy_idx],  # Superdiagonal
+           [[i, i] for i in range(m)],                          # Diagonal
+           [dummy_idx] + [[i + 1, i] for i in range(m - 1)]]    # Subdiagonal
+  diagonals=tf.gather_nd(matrix, indices)
+  x = tf.linalg.tridiagonal_solve(diagonals, rhs)
+  ```
+
+  Regardless of the `diagonals_format`, `rhs` is a tensor of shape `[..., M]` or
+  `[..., M, K]`. The latter allows to simultaneously solve K systems with the
+  same left-hand sides and K different right-hand sides. If `transpose_rhs`
+  is set to `True` the expected shape is `[..., M]` or `[..., K, M]`.
+
+  The batch dimensions, denoted as `...`, must be the same in `diagonals` and
+  `rhs`.
+
+  The output is a tensor of the same shape as `rhs`: either `[..., M]` or
+  `[..., M, K]`.
+
+  Args:
+    diagonals: A `Tensor` or tuple of `Tensor`s describing left-hand sides. The
+      shape depends of `diagonals_format`, see description above. Must be
+      `float32`, `float64`, `complex64`, or `complex128`.
+    rhs: A `Tensor` of shape [..., M] or [..., M, K] and with the same dtype as
+      `diagonals`.
+    diagonals_format: one of `matrix`, `sequence`, or `compact`. Default is
+      `compact`.
+    transpose_rhs: If `True`, `rhs` is transposed before solving (has no effect
+      if the shape of rhs is [..., M]).
+    conjugate_rhs: If `True`, `rhs` is conjugated before solving.
+    name:  A name to give this `Op` (optional).
+
+  Returns:
+    A `Tensor` of shape [..., M] or [..., M, K] containing the solutions.
+
+  Raises:
+    ValueError: An unsupported type is provided as input, or when the input
+    tensors have incorrect shapes.
+
+  """
+  if diagonals_format == 'compact':
+    return _tridiagonal_solve_compact_format(diagonals, rhs, transpose_rhs,
+                                             conjugate_rhs, name)
+
+  if diagonals_format == 'sequence':
+    if not isinstance(diagonals, (tuple, list)) or len(diagonals) != 3:
+      raise ValueError('Expected diagonals to be a sequence of length 3.')
+
+    superdiag, maindiag, subdiag = diagonals
+    if (not subdiag.shape[:-1].is_compatible_with(maindiag.shape[:-1]) or
+        not superdiag.shape[:-1].is_compatible_with(maindiag.shape[:-1])):
+      raise ValueError(
+          'Tensors representing the three diagonals must have the same shape,'
+          'except for the last dimension, got {}, {}, {}'.format(
+              subdiag.shape, maindiag.shape, superdiag.shape))
+
+    m = tensor_shape.dimension_value(maindiag.shape[-1])
+
+    def pad_if_necessary(t, name, last_dim_padding):
+      n = tensor_shape.dimension_value(t.shape[-1])
+      if not n or n == m:
+        return t
+      if n == m - 1:
+        paddings = (
+            [[0, 0] for _ in range(len(t.shape) - 1)] + [last_dim_padding])
+        return array_ops.pad(t, paddings)
+      raise ValueError('Expected {} to be have length {} or {}, got {}.'.format(
+          name, m, m - 1, n))
+
+    subdiag = pad_if_necessary(subdiag, 'subdiagonal', [1, 0])
+    superdiag = pad_if_necessary(superdiag, 'superdiagonal', [0, 1])
+
+    diagonals = array_ops.stack((superdiag, maindiag, subdiag), axis=-2)
+    return _tridiagonal_solve_compact_format(diagonals, rhs, transpose_rhs,
+                                             conjugate_rhs, name)
+
+  if diagonals_format == 'matrix':
+    m1 = tensor_shape.dimension_value(diagonals.shape[-1])
+    m2 = tensor_shape.dimension_value(diagonals.shape[-2])
+    if m1 and m2 and m1 != m2:
+      raise ValueError(
+          'Expected last two dimensions of diagonals to be same, got {} and {}'
+          .format(m1, m2))
+    m = m1 or m2
+    if not m:
+      raise ValueError('The size of the matrix needs to be known for '
+                       'diagonals_format="matrix"')
+
+    # Extract diagonals; use input[..., 0, 0] as "dummy" m-th elements of sub-
+    # and superdiagonal.
+    # gather_nd slices into first indices, whereas we need to slice into the
+    # last two, so transposing back and forth is necessary.
+    dummy_idx = [0, 0]
+    indices = ([[[1, 0], [0, 0], dummy_idx]] + [
+        [[i + 1, i], [i, i], [i - 1, i]] for i in range(1, m - 1)
+    ] + [[dummy_idx, [m - 1, m - 1], [m - 2, m - 1]]])
+    diagonals = array_ops.transpose(
+        array_ops.gather_nd(array_ops.transpose(diagonals), indices))
+    return _tridiagonal_solve_compact_format(diagonals, rhs, transpose_rhs,
+                                             conjugate_rhs, name)
+
+  raise ValueError('Unrecognized diagonals_format: {}'.format(diagonals_format))
+
+
+def _tridiagonal_solve_compact_format(diagonals,
+                                      rhs,
+                                      transpose_rhs=False,
+                                      conjugate_rhs=False,
+                                      name=None):
+  """Helper function used after the input has been cast to compact form."""
+  diags_rank, rhs_rank = len(diagonals.shape), len(rhs.shape)
+
+  if diags_rank < 2:
+    raise ValueError(
+        'Expected diagonals to have rank at least 2, got {}'.format(diags_rank))
+  if rhs_rank != diags_rank and rhs_rank != diags_rank - 1:
+    raise ValueError('Expected the rank of rhs to be {} or {}, got {}'.format(
+        diags_rank - 1, diags_rank, rhs_rank))
+  if diagonals.shape[-2] != 3:
+    raise ValueError('Expected 3 diagonals got {}'.format(diagonals.shape[-2]))
+  if not diagonals.shape[:-2].is_compatible_with(rhs.shape[:diags_rank - 2]):
+    raise ValueError('Batch shapes {} and {} are incompatible'.format(
+        diagonals.shape[:-2], rhs.shape[:diags_rank - 2]))
+
+  def check_num_lhs_matches_num_rhs():
+    if diagonals.shape[-1] != rhs.shape[-2]:
+      raise ValueError('Expected number of left-hand sided and right-hand '
+                       'sides to be equal, got {} and {}'.format(
+                           diagonals.shape[-1], rhs.shape[-2]))
+
+  if rhs_rank == diags_rank - 1:
+    # Rhs provided as a vector, ignoring transpose_rhs
+    if conjugate_rhs:
+      rhs = math_ops.conj(rhs)
+    rhs = array_ops.expand_dims(rhs, -1)
+    check_num_lhs_matches_num_rhs()
+    return array_ops.squeeze(
+        linalg_ops.tridiagonal_solve(diagonals, rhs, name), -1)
+
+  if transpose_rhs:
+    rhs = array_ops.matrix_transpose(rhs, conjugate=conjugate_rhs)
+  elif conjugate_rhs:
+    rhs = math_ops.conj(rhs)
+
+  check_num_lhs_matches_num_rhs()
+  result = linalg_ops.tridiagonal_solve(diagonals, rhs, name)
+  return array_ops.matrix_transpose(result) if transpose_rhs else result
diff --git a/tensorflow/python/ops/linalg/linear_operator.py b/tensorflow/python/ops/linalg/linear_operator.py
index 8efafda3a1e7424442163a76aca95d14af4b8a70..8fa9f63e043a59da5b3ea425883cb954a065c1ee 100644
--- a/tensorflow/python/ops/linalg/linear_operator.py
+++ b/tensorflow/python/ops/linalg/linear_operator.py
@@ -381,7 +381,10 @@ class LinearOperator(object):
       `Dimension` object.
     """
     # Derived classes get this "for free" once .shape is implemented.
-    return self.shape[-1]
+    if self.shape.rank is None:
+      return tensor_shape.Dimension(None)
+    else:
+      return self.shape.dims[-1]
 
   def domain_dimension_tensor(self, name="domain_dimension_tensor"):
     """Dimension (in the sense of vector spaces) of the domain of this operator.
@@ -844,6 +847,51 @@ class LinearOperator(object):
 
       return self._solvevec(rhs, adjoint=adjoint)
 
+  def adjoint(self, name="adjoint"):
+    """Returns the adjoint of the current `LinearOperator`.
+
+    Given `A` representing this `LinearOperator`, return `A*`.
+    Note that calling `self.adjoint()` and `self.H` are equivalent.
+
+    Args:
+      name:  A name for this `Op`.
+
+    Returns:
+      `LinearOperator` which represents the adjoint of this `LinearOperator`.
+    """
+    if self.is_self_adjoint is True:  # pylint: disable=g-bool-id-comparison
+      return self
+    with self._name_scope(name):
+      return linear_operator_algebra.adjoint(self)
+
+  # self.H is equivalent to self.adjoint().
+  H = property(adjoint, None)
+
+  def inverse(self, name="inverse"):
+    """Returns the Inverse of this `LinearOperator`.
+
+    Given `A` representing this `LinearOperator`, return a `LinearOperator`
+    representing `A^-1`.
+
+    Args:
+      name: A name scope to use for ops added by this method.
+
+    Returns:
+      `LinearOperator` representing inverse of this matrix.
+
+    Raises:
+      ValueError: When the `LinearOperator` is not hinted to be `non_singular`.
+    """
+    if self.is_square is False:  # pylint: disable=g-bool-id-comparison
+      raise ValueError("Cannot take the Inverse: This operator represents "
+                       "a non square matrix.")
+    if self.is_non_singular is False:  # pylint: disable=g-bool-id-comparison
+      raise ValueError("Cannot take the Inverse: This operator represents "
+                       "a singular matrix.")
+
+    with self._name_scope(name):
+      return linear_operator_algebra.inverse(self)
+
   def cholesky(self, name="cholesky"):
     """Returns a Cholesky factor as a `LinearOperator`.
 
diff --git a/tensorflow/python/ops/linalg/linear_operator_adjoint.py b/tensorflow/python/ops/linalg/linear_operator_adjoint.py
index 858e224b9adda57b4d472ae2f61b2b6cda74c243..7ee4752d264b73689c172240f10c89e1a52c5b68 100644
--- a/tensorflow/python/ops/linalg/linear_operator_adjoint.py
+++ b/tensorflow/python/ops/linalg/linear_operator_adjoint.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linalg_impl as linalg
 from tensorflow.python.ops.linalg import linear_operator
@@ -83,7 +84,7 @@ class LinearOperatorAdjoint(linear_operator.LinearOperator):
     r"""Initialize a `LinearOperatorAdjoint`.
 
     `LinearOperatorAdjoint` is initialized with an operator `A`.  The `solve`
-    and `matmul` methods effectively flip the `adjoint` argument.  E.g.
+    and `matmul` methods  effectively flip the `adjoint` argument.  E.g.
 
     ```
     A = MyLinearOperator(...)
@@ -175,15 +176,24 @@ class LinearOperatorAdjoint(linear_operator.LinearOperator):
     return self.operator.assert_self_adjoint()
 
   def _shape(self):
-    return self.operator.shape
+    # Rotate last dimension
+    shape = self.operator.shape
+    return shape[:-2].concatenate([shape[-1], shape[-2]])
 
   def _shape_tensor(self):
-    return self.operator.shape_tensor()
+    # Rotate last dimension
+    shape = self.operator.shape_tensor()
+    return array_ops.concat([
+        shape[:-2], [shape[-1], shape[-2]]], axis=-1)
 
   def _matmul(self, x, adjoint=False, adjoint_arg=False):
     return self.operator.matmul(
         x, adjoint=(not adjoint), adjoint_arg=adjoint_arg)
 
+  def _matvec(self, x, adjoint=False, adjoint_arg=False):
+    return self.operator.matvec(
+        x, adjoint=(not adjoint), adjoint_arg=adjoint_arg)
+
   def _determinant(self):
     if self.is_self_adjoint:
       return self.operator.determinant()
@@ -201,7 +211,14 @@ class LinearOperatorAdjoint(linear_operator.LinearOperator):
     return self.operator.solve(
         rhs, adjoint=(not adjoint), adjoint_arg=adjoint_arg)
 
+  def _solvevec(self, rhs, adjoint=False, adjoint_arg=False):
+    return self.operator.solvevec(
+        rhs, adjoint=(not adjoint), adjoint_arg=adjoint_arg)
+
   def _to_dense(self):
     if self.is_self_adjoint:
       return self.operator.to_dense()
     return linalg.adjoint(self.operator.to_dense())
+
+  def _add_to_tensor(self, x):
+    return self.to_dense() + x
diff --git a/tensorflow/python/ops/linalg/linear_operator_algebra.py b/tensorflow/python/ops/linalg/linear_operator_algebra.py
index 7b99066e4c121ebd7546dfad1039c0dfa46bca11..0d1eab4b735d64ad988507d6c52fc52202389fd0 100644
--- a/tensorflow/python/ops/linalg/linear_operator_algebra.py
+++ b/tensorflow/python/ops/linalg/linear_operator_algebra.py
@@ -25,8 +25,10 @@ from tensorflow.python.framework import ops
 from tensorflow.python.util import tf_inspect
 
 
+_ADJOINTS = {}
 _CHOLESKY_DECOMPS = {}
 _MATMUL = {}
+_INVERSES = {}
 
 
 def _registered_function(type_list, registry):
@@ -45,6 +47,11 @@ def _registered_function(type_list, registry):
   return registry.get(tuple(r[1] for r in registered_combination), None)
 
 
+def _registered_adjoint(type_a):
+  """Get the Adjoint function registered for class a."""
+  return _registered_function([type_a], _ADJOINTS)
+
+
 def _registered_cholesky(type_a):
   """Get the Cholesky function registered for class a."""
   return _registered_function([type_a], _CHOLESKY_DECOMPS)
@@ -55,6 +62,34 @@ def _registered_matmul(type_a, type_b):
   return _registered_function([type_a, type_b], _MATMUL)
 
 
+def _registered_inverse(type_a):
+  """Get the Cholesky function registered for class a."""
+  return _registered_function([type_a], _INVERSES)
+
+
+def adjoint(lin_op_a, name=None):
+  """Get the adjoint associated to lin_op_a.
+
+  Args:
+    lin_op_a: The LinearOperator to take the adjoint of.
+    name: Name to use for this operation.
+
+  Returns:
+    A LinearOperator that represents the adjoint of `lin_op_a`.
+
+  Raises:
+    NotImplementedError: If no Adjoint method is defined for the LinearOperator
+      type of `lin_op_a`.
+  """
+  adjoint_fn = _registered_adjoint(type(lin_op_a))
+  if adjoint_fn is None:
+    raise ValueError("No adjoint registered for {}".format(
+        type(lin_op_a)))
+
+  with ops.name_scope(name, "Adjoint"):
+    return adjoint_fn(lin_op_a)
+
+
 def cholesky(lin_op_a, name=None):
   """Get the Cholesky factor associated to lin_op_a.
 
@@ -103,6 +138,71 @@ def matmul(lin_op_a, lin_op_b, name=None):
     return matmul_fn(lin_op_a, lin_op_b)
 
 
+def inverse(lin_op_a, name=None):
+  """Get the Inverse associated to lin_op_a.
+
+  Args:
+    lin_op_a: The LinearOperator to decompose.
+    name: Name to use for this operation.
+
+  Returns:
+    A LinearOperator that represents the inverse of `lin_op_a`.
+
+  Raises:
+    NotImplementedError: If no Inverse method is defined for the LinearOperator
+      type of `lin_op_a`.
+  """
+  inverse_fn = _registered_inverse(type(lin_op_a))
+  if inverse_fn is None:
+    raise ValueError("No inverse registered for {}".format(
+        type(lin_op_a)))
+
+  with ops.name_scope(name, "Inverse"):
+    return inverse_fn(lin_op_a)
+
+
+class RegisterAdjoint(object):
+  """Decorator to register an Adjoint implementation function.
+
+  Usage:
+
+  @linear_operator_algebra.RegisterAdjoint(lin_op.LinearOperatorIdentity)
+  def _adjoint_identity(lin_op_a):
+    # Return the identity matrix.
+  """
+
+  def __init__(self, lin_op_cls_a):
+    """Initialize the LinearOperator registrar.
+
+    Args:
+      lin_op_cls_a: the class of the LinearOperator to decompose.
+    """
+    self._key = (lin_op_cls_a,)
+
+  def __call__(self, adjoint_fn):
+    """Perform the Adjoint registration.
+
+    Args:
+      adjoint_fn: The function to use for the Adjoint.
+
+    Returns:
+      adjoint_fn
+
+    Raises:
+      TypeError: if adjoint_fn is not a callable.
+      ValueError: if a Adjoint function has already been registered for
+        the given argument classes.
+    """
+    if not callable(adjoint_fn):
+      raise TypeError(
+          "adjoint_fn must be callable, received: {}".format(adjoint_fn))
+    if self._key in _ADJOINTS:
+      raise ValueError("Adjoint({}) has already been registered to: {}".format(
+          self._key[0].__name__, _ADJOINTS[self._key]))
+    _ADJOINTS[self._key] = adjoint_fn
+    return adjoint_fn
+
+
 class RegisterCholesky(object):
   """Decorator to register a Cholesky implementation function.
 
@@ -189,3 +289,45 @@ class RegisterMatmul(object):
           self._key[1].__name__))
     _MATMUL[self._key] = matmul_fn
     return matmul_fn
+
+
+class RegisterInverse(object):
+  """Decorator to register an Inverse implementation function.
+
+  Usage:
+
+  @linear_operator_algebra.RegisterInverse(lin_op.LinearOperatorIdentity)
+  def _inverse_identity(lin_op_a):
+    # Return the identity matrix.
+  """
+
+  def __init__(self, lin_op_cls_a):
+    """Initialize the LinearOperator registrar.
+
+    Args:
+      lin_op_cls_a: the class of the LinearOperator to decompose.
+    """
+    self._key = (lin_op_cls_a,)
+
+  def __call__(self, inverse_fn):
+    """Perform the Inverse registration.
+
+    Args:
+      inverse_fn: The function to use for the Inverse.
+
+    Returns:
+      inverse_fn
+
+    Raises:
+      TypeError: if inverse_fn is not a callable.
+      ValueError: if a Inverse function has already been registered for
+        the given argument classes.
+    """
+    if not callable(inverse_fn):
+      raise TypeError(
+          "inverse_fn must be callable, received: {}".format(inverse_fn))
+    if self._key in _INVERSES:
+      raise ValueError("Inverse({}) has already been registered to: {}".format(
+          self._key[0].__name__, _INVERSES[self._key]))
+    _INVERSES[self._key] = inverse_fn
+    return inverse_fn
diff --git a/tensorflow/python/ops/linalg/linear_operator_identity.py b/tensorflow/python/ops/linalg/linear_operator_identity.py
index 32b222cb2a685ee3254065dfc26a230482004182..694557e50ae62f15d66ef713aa8512f719f97b0b 100644
--- a/tensorflow/python/ops/linalg/linear_operator_identity.py
+++ b/tensorflow/python/ops/linalg/linear_operator_identity.py
@@ -588,12 +588,19 @@ class LinearOperatorScaledIdentity(BaseLinearOperatorIdentity):
     """
     self._assert_proper_shapes = assert_proper_shapes
 
-    if not is_square:
-      raise ValueError("A ScaledIdentity operator is always square.")
-
     with ops.name_scope(name, values=[multiplier, num_rows]):
       self._multiplier = ops.convert_to_tensor(multiplier, name="multiplier")
 
+      # Check and auto-set hints.
+      if not self._multiplier.dtype.is_complex:
+        if is_self_adjoint is False:  # pylint: disable=g-bool-id-comparison
+          raise ValueError("A real diagonal operator is always self adjoint.")
+        else:
+          is_self_adjoint = True
+
+      if not is_square:
+        raise ValueError("A ScaledIdentity operator is always square.")
+
       super(LinearOperatorScaledIdentity, self).__init__(
           dtype=self._multiplier.dtype,
           is_non_singular=is_non_singular,
diff --git a/tensorflow/python/ops/linalg/linear_operator_kronecker.py b/tensorflow/python/ops/linalg/linear_operator_kronecker.py
index f7e785caa5d8cc290f037944378f709633423a74..005b9b429b6327211feb9466bdca59b7a50870a7 100644
--- a/tensorflow/python/ops/linalg/linear_operator_kronecker.py
+++ b/tensorflow/python/ops/linalg/linear_operator_kronecker.py
@@ -71,7 +71,7 @@ class LinearOperatorKronecker(linear_operator.LinearOperator):
   `op1 x op2 x .. opJ` (we omit parentheses as the Kronecker product is
   associative).
 
-  If `opj` has shape `batch_shape_j` + [M_j, N_j`, then the composed operator
+  If `opj` has shape `batch_shape_j + [M_j, N_j]`, then the composed operator
   will have shape equal to `broadcast_batch_shape + [prod M_j, prod N_j]`,
   where the product is over all operators.
 
diff --git a/tensorflow/python/ops/linalg/linear_operator_test_util.py b/tensorflow/python/ops/linalg/linear_operator_test_util.py
index e50f572b5f431ae8b7cf3470ee799f170e83656c..0383098804f49bd768b122e94f6c5ff63b33a9a3 100644
--- a/tensorflow/python/ops/linalg/linear_operator_test_util.py
+++ b/tensorflow/python/ops/linalg/linear_operator_test_util.py
@@ -278,6 +278,23 @@ class LinearOperatorDerivedClassTest(test.TestCase):
     self._skip_if_tests_to_skip_contains("matmul_with_broadcast")
     self._test_matmul(with_batch=False)
 
+  def test_adjoint(self):
+    self._skip_if_tests_to_skip_contains("adjoint")
+    for use_placeholder in self._use_placeholder_options:
+      for build_info in self._operator_build_infos:
+        for dtype in self._dtypes_to_test:
+          with self.test_session(graph=ops.Graph()) as sess:
+            sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
+            operator, mat = self._operator_and_matrix(
+                build_info, dtype, use_placeholder=use_placeholder)
+            op_adjoint = operator.adjoint().to_dense()
+            op_adjoint_h = operator.H.to_dense()
+            mat_adjoint = linalg.adjoint(mat)
+            op_adjoint_v, op_adjoint_h_v, mat_adjoint_v = sess.run(
+                [op_adjoint, op_adjoint_h, mat_adjoint])
+            self.assertAC(mat_adjoint_v, op_adjoint_v)
+            self.assertAC(mat_adjoint_v, op_adjoint_h_v)
+
   def test_cholesky(self):
     self._skip_if_tests_to_skip_contains("cholesky")
     for use_placeholder in self._use_placeholder_options:
@@ -336,6 +353,22 @@ class LinearOperatorDerivedClassTest(test.TestCase):
     self._skip_if_tests_to_skip_contains("solve_with_broadcast")
     self._test_solve(with_batch=False)
 
+  def _test_inverse(self):
+    for use_placeholder in self._use_placeholder_options:
+      for build_info in self._operator_build_infos:
+        for dtype in self._dtypes_to_test:
+          with self.session(graph=ops.Graph()) as sess:
+            sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
+            operator, mat = self._operator_and_matrix(
+                build_info, dtype, use_placeholder=use_placeholder)
+            op_inverse_v, mat_inverse_v = sess.run([
+                operator.inverse().to_dense(), linalg.inv(mat)])
+            self.assertAC(op_inverse_v, mat_inverse_v)
+
+  def test_inverse(self):
+    self._skip_if_tests_to_skip_contains("inverse")
+    self._test_inverse()
+
   def test_trace(self):
     self._skip_if_tests_to_skip_contains("trace")
     for use_placeholder in self._use_placeholder_options:
@@ -463,7 +496,14 @@ class NonSquareLinearOperatorDerivedClassTest(LinearOperatorDerivedClassTest):
   @property
   def _tests_to_skip(self):
     """List of test names to skip."""
-    return ["cholesky", "solve", "solve_with_broadcast", "det", "log_abs_det"]
+    return [
+        "cholesky",
+        "inverse",
+        "solve",
+        "solve_with_broadcast",
+        "det",
+        "log_abs_det"
+    ]
 
   @property
   def _operator_build_infos(self):
diff --git a/tensorflow/python/ops/linalg/linear_operator_util.py b/tensorflow/python/ops/linalg/linear_operator_util.py
index 54d04e4a70bc65e0053575e7761680894e3702e5..6c18943dab03d434cb92d5510f48066f46615ba5 100644
--- a/tensorflow/python/ops/linalg/linear_operator_util.py
+++ b/tensorflow/python/ops/linalg/linear_operator_util.py
@@ -481,9 +481,9 @@ def _reshape_for_efficiency(a,
 
   # Permutation to put the extra dims at the end.
   perm = (
-      array_ops.concat(
-          (math_ops.range(b_extra_ndims, b.shape.ndims),
-           math_ops.range(0, b_extra_ndims)), 0))
+      np.concatenate(
+          (np.arange(b_extra_ndims, b.shape.ndims),
+           np.arange(0, b_extra_ndims)), 0))
   b_extra_on_end = array_ops.transpose(b, perm=perm)
 
   # Now squash this end into one long dim.
@@ -497,7 +497,7 @@ def _reshape_for_efficiency(a,
     y_extra_shape = array_ops.concat(
         (array_ops.shape(y)[:-1], [b_main_sh[-1]], b_extra_sh), 0)
     y_extra_on_end = array_ops.reshape(y, y_extra_shape)
-    return array_ops.transpose(
-        y_extra_on_end, perm=array_ops.invert_permutation(perm))
+    inverse_perm = np.argsort(perm)
+    return array_ops.transpose(y_extra_on_end, perm=inverse_perm)
 
   return a, b_squashed_end, reshape_inv, still_need_to_transpose
diff --git a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py
index 1a9e7112b45cacb711ac176b92cb3bef0dc72f00..66960304c783965a7bb6f0cea27b07df4e1d842c 100644
--- a/tensorflow/python/ops/linalg_ops.py
+++ b/tensorflow/python/ops/linalg_ops.py
@@ -24,9 +24,9 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gen_linalg_ops
 from tensorflow.python.ops import linalg_ops_impl
+from tensorflow.python.ops import map_fn
 from tensorflow.python.ops import math_ops
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_linalg_ops import *
@@ -595,14 +595,14 @@ def norm(tensor,
     if ord in ['fro', 'euclidean', 2, 2.0]:
       if is_matrix_norm and ord in [2, 2.0]:
         rank = array_ops.rank(tensor)
-        positive_axis = functional_ops.map_fn(
+        positive_axis = map_fn.map_fn(
             lambda i: control_flow_ops.cond(i >= 0, lambda: i, lambda: i + rank),
             ops.convert_to_tensor(axis))
         axes = math_ops.range(rank)
         perm_before = array_ops.concat(
             [array_ops.setdiff1d(axes, positive_axis)[0], positive_axis],
             axis=0)
-        perm_after = functional_ops.map_fn(
+        perm_after = map_fn.map_fn(
             lambda i: math_ops.cast(
                 array_ops.squeeze(
                     array_ops.where(math_ops.equal(perm_before, i))),
@@ -619,6 +619,8 @@ def norm(tensor,
         result = math_ops.sqrt(
             math_ops.reduce_sum(
                 tensor * math_ops.conj(tensor), axis, keepdims=True))
+        # TODO(rmlarsen): Replace with the following, once gradients are defined
+        # result = math_ops.reduce_euclidean_norm(tensor, axis, keepdims=True)
     else:
       result = math_ops.abs(tensor)
       if ord == 1:
diff --git a/tensorflow/python/ops/list_ops.py b/tensorflow/python/ops/list_ops.py
index dbaae886d43e46ac193d1e7f28a6367192d2a640..ee01ff7cf687c6ebfb2e7069534f52047fc0d9f7 100644
--- a/tensorflow/python/ops/list_ops.py
+++ b/tensorflow/python/ops/list_ops.py
@@ -28,6 +28,12 @@ from tensorflow.python.ops import gen_list_ops
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_list_ops import *
 # pylint: enable=wildcard-import
+from tensorflow.python.util.lazy_loader import LazyLoader
+
+# list_ops -> control_flow_ops -> tensor_array_ops -> list_ops
+control_flow_ops = LazyLoader(
+    "control_flow_ops", globals(),
+    "tensorflow.python.ops.control_flow_ops")
 
 
 ops.NotDifferentiable("TensorListConcatLists")
@@ -65,11 +71,77 @@ def tensor_list_from_tensor(tensor, element_shape, name=None):
       name=name)
 
 
-def tensor_list_concat(input_handle, element_dtype, name=None):
+def tensor_list_get_item(input_handle, index, element_dtype, element_shape=None,
+                         name=None):
+  return gen_list_ops.tensor_list_get_item(
+      input_handle=input_handle,
+      index=index,
+      element_shape=_build_element_shape(element_shape),
+      element_dtype=element_dtype,
+      name=name)
+
+
+def tensor_list_pop_back(input_handle, element_dtype, name=None):
+  return gen_list_ops.tensor_list_pop_back(
+      input_handle=input_handle,
+      element_shape=-1,
+      element_dtype=element_dtype,
+      name=name)
+
+
+def tensor_list_gather(input_handle,
+                       indices,
+                       element_dtype,
+                       element_shape=None,
+                       name=None):
+  return gen_list_ops.tensor_list_gather(
+      input_handle=input_handle,
+      indices=indices,
+      element_shape=_build_element_shape(element_shape),
+      element_dtype=element_dtype,
+      name=name)
+
+
+def tensor_list_scatter(tensor,
+                        indices,
+                        element_shape=None,
+                        input_handle=None,
+                        name=None):
+  if input_handle is not None:
+    return gen_list_ops.tensor_list_scatter_into_existing_list(
+        input_handle=input_handle, tensor=tensor, indices=indices, name=name)
+  else:
+    return gen_list_ops.tensor_list_scatter_v2(
+        tensor=tensor,
+        indices=indices,
+        element_shape=_build_element_shape(element_shape),
+        num_elements=-1,
+        name=name)
+
+
+def tensor_list_stack(input_handle,
+                      element_dtype,
+                      num_elements=-1,
+                      element_shape=None,
+                      name=None):
+  return gen_list_ops.tensor_list_stack(
+      input_handle=input_handle,
+      element_shape=_build_element_shape(element_shape),
+      element_dtype=element_dtype,
+      num_elements=num_elements,
+      name=name)
+
+
+def tensor_list_concat(input_handle, element_dtype, element_shape=None,
+                       name=None):
   # Ignore the lengths output of TensorListConcat. It is only used during
   # gradient computation.
-  return gen_list_ops.tensor_list_concat(
-      input_handle=input_handle, element_dtype=element_dtype, name=name)[0]
+  return gen_list_ops.tensor_list_concat_v2(
+      input_handle=input_handle,
+      element_dtype=element_dtype,
+      element_shape=_build_element_shape(element_shape),
+      leading_dims=ops.convert_to_tensor([], dtype=dtypes.int64),
+      name=name)[0]
 
 
 def tensor_list_split(tensor, element_shape, lengths, name=None):
@@ -80,10 +152,31 @@ def tensor_list_split(tensor, element_shape, lengths, name=None):
       name=name)
 
 
+def tensor_list_set_item(input_handle,
+                         index,
+                         item,
+                         resize_if_index_out_of_bounds=False,
+                         name=None):
+  """Sets `item` at `index` in input list."""
+  if resize_if_index_out_of_bounds:
+    input_list_size = gen_list_ops.tensor_list_length(input_handle)
+    # TODO(srbs): This could cause some slowdown. Consider fusing resize
+    # functionality in the SetItem op.
+    input_handle = control_flow_ops.cond(
+        index >= input_list_size,
+        lambda: gen_list_ops.tensor_list_resize(  # pylint: disable=g-long-lambda
+            input_handle, index + 1),
+        lambda: input_handle)
+  return gen_list_ops.tensor_list_set_item(
+      input_handle=input_handle, index=index, item=item, name=name)
+
+
 @ops.RegisterGradient("TensorListPushBack")
 def _PushBackGrad(op, dresult):
   return gen_list_ops.tensor_list_pop_back(
-      dresult, element_dtype=op.get_attr("element_dtype"))
+      dresult,
+      element_shape=array_ops.shape(op.inputs[1]),
+      element_dtype=op.get_attr("element_dtype"))
 
 
 @ops.RegisterGradient("TensorListPopBack")
@@ -93,47 +186,59 @@ def _PopBackGrad(op, dlist, delement):
         element_dtype=delement.dtype,
         element_shape=gen_list_ops.tensor_list_element_shape(
             op.outputs[0], shape_type=dtypes.int32))
-  return gen_list_ops.tensor_list_push_back(dlist, delement)
+  return gen_list_ops.tensor_list_push_back(dlist, delement), None
 
 
 @ops.RegisterGradient("TensorListStack")
 def _TensorListStackGrad(unused_op, dtensor):
-  return tensor_list_from_tensor(dtensor, element_shape=dtensor.shape[1:])
+  return tensor_list_from_tensor(dtensor, element_shape=dtensor.shape[1:]), None
 
 
 @ops.RegisterGradient("TensorListConcat")
+@ops.RegisterGradient("TensorListConcatV2")
 def _TensorListConcatGrad(op, dtensor, unused_dlengths):
-  # TODO(srbs): We lose the element_shape information in tensor_list_concat.
-  # Consider providing that as an output of TensorListConcat?
-  if dtensor.shape.rank is None:
-    element_shape = None
-  else:
-    element_shape = [None] + dtensor.shape.as_list()[1:]
-  return tensor_list_split(
+  """Gradient function for TensorListConcat."""
+  dlist = tensor_list_split(
       dtensor,
-      element_shape=_build_element_shape(element_shape),
+      element_shape=gen_list_ops.tensor_list_element_shape(
+          op.inputs[0], shape_type=dtypes.int32),
       lengths=op.outputs[1])
+  if op.type == "TensorListConcatV2":
+    return dlist, None, None
+  else:
+    return dlist
 
 
 @ops.RegisterGradient("TensorListSplit")
 def _TensorListSplitGrad(op, dlist):
-  return tensor_list_concat(dlist, element_dtype=op.inputs[0].dtype), None, None
+  tensor, _, lengths = op.inputs
+  element_shape = array_ops.slice(array_ops.shape(tensor), [1], [-1])
+  element_shape = array_ops.concat([[-1], element_shape], axis=0)
+  return gen_list_ops.tensor_list_concat_v2(
+      dlist,
+      element_shape=element_shape,
+      leading_dims=lengths,
+      element_dtype=op.inputs[0].dtype)[0], None, None
 
 
 @ops.RegisterGradient("TensorListFromTensor")
 def _TensorListFromTensorGrad(op, dlist):
   """Gradient for TensorListFromTensor."""
-  if op.inputs[0].shape.dims and op.inputs[0].shape.dims[0].value is not None:
-    num_elements = op.inputs[0].shape.dims[0].value
+  t = op.inputs[0]
+  if t.shape.dims and t.shape.dims[0].value is not None:
+    num_elements = t.shape.dims[0].value
   else:
     num_elements = None
   if dlist is None:
     dlist = empty_tensor_list(
-        element_dtype=op.inputs[0].dtype,
+        element_dtype=t.dtype,
         element_shape=gen_list_ops.tensor_list_element_shape(
             op.outputs[0], shape_type=dtypes.int32))
   tensor_grad = gen_list_ops.tensor_list_stack(
-      dlist, element_dtype=op.inputs[0].dtype, num_elements=num_elements)
+      dlist,
+      element_shape=array_ops.slice(array_ops.shape(t), [1], [-1]),
+      element_dtype=t.dtype,
+      num_elements=num_elements)
   shape_grad = None
   return tensor_grad, shape_grad
 
@@ -150,33 +255,74 @@ def _TensorListGetItemGrad(op, ditem):
       index=op.inputs[1],
       item=ditem)
   index_grad = None
-  return list_grad, index_grad
+  element_shape_grad = None
+  return list_grad, index_grad, element_shape_grad
 
 
 @ops.RegisterGradient("TensorListSetItem")
 def _TensorListSetItemGrad(op, dlist):
+  """Gradient function for TensorListSetItem."""
   _, index, item = op.inputs
   list_grad = gen_list_ops.tensor_list_set_item(
       dlist, index=index, item=array_ops.zeros_like(item))
   index_grad = None
-  element_grad = gen_list_ops.tensor_list_get_item(
-      dlist, index, element_dtype=item.dtype)
+  element_grad = tensor_list_get_item(
+      dlist,
+      index,
+      element_shape=array_ops.shape(item),
+      element_dtype=item.dtype)
   return list_grad, index_grad, element_grad
 
 
+@ops.RegisterGradient("TensorListResize")
+def _TensorListResizeGrad(op, dlist):
+  input_list, _ = op.inputs
+  input_list_size = gen_list_ops.tensor_list_length(input_list)
+  return gen_list_ops.tensor_list_resize(dlist, input_list_size), None
+
+
 @ops.RegisterGradient("TensorListGather")
 def _TensorListGatherGrad(op, dtensor):
-  _, indices = op.inputs
-  return gen_list_ops.tensor_list_scatter(
-      tensor=dtensor, indices=indices,
-      element_shape=ops.convert_to_tensor(-1, dtype=dtypes.int32)), None
+  """Gradient function for TensorListGather."""
+  input_list, indices, _ = op.inputs
+  element_shape = gen_list_ops.tensor_list_element_shape(
+      input_list, shape_type=dtypes.int32)
+  num_elements = gen_list_ops.tensor_list_length(input_list)
+  dlist = tensor_list_reserve(element_shape, num_elements, dtensor.dtype)
+  dlist = tensor_list_scatter(
+      tensor=dtensor, indices=indices, input_handle=dlist)
+  return dlist, None, None
 
 
 @ops.RegisterGradient("TensorListScatter")
+@ops.RegisterGradient("TensorListScatterV2")
 def _TensorListScatterGrad(op, dlist):
-  t, indices, _ = op.inputs
-  return gen_list_ops.tensor_list_gather(
-      dlist, indices, element_dtype=t.dtype), None
+  """Gradient function for TensorListScatter."""
+  tensor = op.inputs[0]
+  indices = op.inputs[1]
+  dtensor = gen_list_ops.tensor_list_gather(
+      dlist,
+      indices,
+      element_shape=array_ops.slice(array_ops.shape(tensor), [1], [-1]),
+      element_dtype=tensor.dtype)
+  if op.type == "TensorListScatterV2":
+    return dtensor, None, None, None
+  else:
+    return dtensor, None, None
+
+
+@ops.RegisterGradient("TensorListScatterIntoExistingList")
+def _TensorListScatterIntoExistingListGrad(op, dlist):
+  """Gradient function for TensorListScatterIntoExistingList."""
+  _, tensor, indices = op.inputs
+  dtensor = gen_list_ops.tensor_list_gather(
+      dlist,
+      indices,
+      element_shape=array_ops.slice(array_ops.shape(tensor), [1], [-1]),
+      element_dtype=tensor.dtype)
+  zeros = array_ops.zeros_like(tensor)
+  dlist = tensor_list_scatter(zeros, indices, indices, input_handle=dlist)
+  return dlist, dtensor, None
 
 
 def _build_element_shape(shape):
@@ -213,4 +359,13 @@ def _build_element_shape(shape):
   if not shape:
     return ops.convert_to_tensor(shape, dtype=dtypes.int32)
   # Shape is a sequence of dimensions. Convert None dims to -1.
-  return [d if d is not None else -1 for d in shape]
+  def convert(val):
+    if val is None:
+      return -1
+    if isinstance(val, ops.Tensor):
+      return val
+    if isinstance(val, tensor_shape.Dimension):
+      return val.value if val.value is not None else -1
+    return val
+
+  return [convert(d) for d in shape]
diff --git a/tensorflow/python/ops/logging_ops.py b/tensorflow/python/ops/logging_ops.py
index 5a948a21946d0b9ce867901a00425857e4f06b1f..f05fbf4dd5665596b2a03d5b580b13877d14218b 100644
--- a/tensorflow/python/ops/logging_ops.py
+++ b/tensorflow/python/ops/logging_ops.py
@@ -25,6 +25,7 @@ import sys
 
 import six
 
+from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
@@ -40,6 +41,14 @@ from tensorflow.python.util import nest
 from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.tf_export import tf_export
 
+# Register printing to the cell output if we are in a Colab or Jupyter Notebook.
+try:
+  get_ipython()  # Exists in an ipython env like Jupyter or Colab
+  pywrap_tensorflow.TFE_Py_EnableInteractivePythonLogging()
+except NameError:
+  pass
+
+
 # The python wrapper for Assert is in control_flow_ops, as the Assert
 # call relies on certain conditionals for its dependencies.  Use
 # control_flow_ops.Assert.
@@ -193,9 +202,8 @@ def print_v2(*inputs, **kwargs):
     (This prints "tensors: [0 1 2 ... 7 8 9] {2: [0 2 4 ... 14 16 18]}" to
     sys.stdout)
 
-  Note: This op is only partially compatible with Jupyter notebooks and colabs.
-    Because it prints to the C++ standard out / standard error, this will go
-    in the notebook kernel's console output, not in the notebook cell output.
+  Note: In Jupyter notebooks and colabs, this operator prints to the notebook
+    cell outputs. It will not write to the notebook kernel's console logs.
 
   Args:
     *inputs: Positional arguments that are the inputs to print. Inputs in the
@@ -263,7 +271,7 @@ def print_v2(*inputs, **kwargs):
   # If we are only printing a single string scalar, there is no need to format
   if (len(inputs) == 1 and tensor_util.is_tensor(inputs[0])
       and (not isinstance(inputs[0], sparse_tensor.SparseTensor))
-      and inputs[0].shape and (inputs[0].dtype == dtypes.string)):
+      and (inputs[0].shape.ndims == 0)and (inputs[0].dtype == dtypes.string)):
     formatted_string = inputs[0]
   # Otherwise, we construct an appropriate template for the tensors we are
   # printing, and format the template using those tensors.
diff --git a/tensorflow/python/ops/lookup_ops.py b/tensorflow/python/ops/lookup_ops.py
index 758cb8041da63956c7a451e2030b9e9d98016f42..d991a47b552f4f788dc9769ebded6ec1dd1e8978 100644
--- a/tensorflow/python/ops/lookup_ops.py
+++ b/tensorflow/python/ops/lookup_ops.py
@@ -38,9 +38,10 @@ from tensorflow.python.ops import string_ops
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_lookup_ops import *
+from tensorflow.python.training.saver import BaseSaverBuilder
 # pylint: enable=wildcard-import
-from tensorflow.python.training.checkpointable import base as checkpointable_base
-from tensorflow.python.training.checkpointable import tracking as checkpointable
+from tensorflow.python.training.tracking import base as trackable_base
+from tensorflow.python.training.tracking import tracking as trackable
 from tensorflow.python.util import compat
 from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.tf_export import tf_export
@@ -65,6 +66,9 @@ def initialize_all_tables(name="init_all_tables"):
 def tables_initializer(name="init_all_tables"):
   """Returns an Op that initializes all tables of the default graph.
 
+  See the [Low Level Intro](https://www.tensorflow.org/guide/low_level_intro#feature_columns)
+  guide, for an example of usage.
+
   Args:
     name: Optional name for the initialization op.
 
@@ -98,7 +102,7 @@ def _check_table_dtypes(table, key_dtype, value_dtype):
                     (table.value_dtype, value_dtype))
 
 
-class LookupInterface(checkpointable.TrackableResource):
+class LookupInterface(trackable.TrackableResource):
   """Represent a lookup table that persists across different steps."""
 
   def __init__(self, key_dtype, value_dtype):
@@ -161,8 +165,8 @@ class InitializableLookupTableBase(LookupInterface):
     self._default_value = ops.convert_to_tensor(
         default_value, dtype=self._value_dtype)
     self._default_value.get_shape().merge_with(tensor_shape.scalar())
-    if isinstance(initializer, checkpointable_base.CheckpointableBase):
-      self._initializer = self._track_checkpointable(
+    if isinstance(initializer, trackable_base.Trackable):
+      self._initializer = self._track_trackable(
           initializer, "_initializer")
     self._resource_handle = self.create_resource()
     self._init_op = self.initialize()
@@ -193,10 +197,8 @@ class InitializableLookupTableBase(LookupInterface):
     Returns:
       A scalar tensor containing the number of elements in this table.
     """
-    with ops.name_scope(name, "%s_Size" % self.name,
-                        [self.resource_handle]) as scope:
-      return gen_lookup_ops.lookup_table_size_v2(
-          self.resource_handle, name=scope)
+    with ops.name_scope(name, "%s_Size" % self.name, [self.resource_handle]):
+      return gen_lookup_ops.lookup_table_size_v2(self.resource_handle)
 
   def lookup(self, keys, name=None):
     """Looks up `keys` in a table, outputs the corresponding values.
@@ -224,9 +226,9 @@ class InitializableLookupTableBase(LookupInterface):
 
     with ops.name_scope(
         name, "%s_Lookup" % self.name,
-        (self.resource_handle, key_tensor, self._default_value)) as scope:
+        (self.resource_handle, key_tensor, self._default_value)):
       values = gen_lookup_ops.lookup_table_find_v2(
-          self.resource_handle, key_tensor, self._default_value, name=scope)
+          self.resource_handle, key_tensor, self._default_value)
 
     values.set_shape(key_tensor.get_shape())
     if isinstance(keys, sparse_tensor.SparseTensor):
@@ -249,7 +251,7 @@ class HashTable(InitializableLookupTableBase):
   ```
   """
 
-  def __init__(self, initializer, default_value, shared_name=None, name=None):
+  def __init__(self, initializer, default_value, name=None):
     """Creates a non-initialized `HashTable` object.
 
     Creates a table, the type of its keys and values are specified by the
@@ -261,8 +263,6 @@ class HashTable(InitializableLookupTableBase):
       initializer: The table initializer to use. See `HashTable` kernel for
         supported key and value types.
       default_value: The value to use if a key is missing in the table.
-      shared_name: If non-empty, this table will be shared under
-        the given name across multiple sessions.
       name: A name for the operation (optional).
 
     Returns:
@@ -270,21 +270,22 @@ class HashTable(InitializableLookupTableBase):
     """
     self._initializer = initializer
     self._default_value = default_value
-    self._shared_name = shared_name
-    self._name = name
-    self._table_name = ""
+    self._shared_name = self._initializer._shared_name  # pylint: disable=protected-access
+    self._name = name or "hash_table"
+    self._table_name = None
     super(HashTable, self).__init__(default_value, initializer)
     self._value_shape = self._default_value.get_shape()
 
   def create_resource(self):
-    with ops.name_scope(self._name, "hash_table",
-                        (self._initializer, self._default_value)) as scope:
-      table_ref = gen_lookup_ops.hash_table_v2(
-          shared_name=self._shared_name,
-          key_dtype=self._initializer.key_dtype,
-          value_dtype=self._initializer.value_dtype,
-          name=scope)
-      self._table_name = scope.split("/")[-2]
+    table_ref = gen_lookup_ops.hash_table_v2(
+        shared_name=self._shared_name,
+        key_dtype=self._initializer.key_dtype,
+        value_dtype=self._initializer.value_dtype,
+        name=self._name)
+    if context.executing_eagerly():
+      self._table_name = None
+    else:
+      self._table_name = table_ref.op.name.split("/")[-1]
     return table_ref
 
   @property
@@ -301,18 +302,16 @@ class HashTable(InitializableLookupTableBase):
       A pair of tensors with the first tensor containing all keys and the
         second tensors containing all values in the table.
     """
-    with ops.name_scope(name, "%s_Export" % self.name,
-                        [self.resource_handle]) as name:
-      with ops.colocate_with(self.resource_handle):
-        exported_keys, exported_values = gen_lookup_ops.lookup_table_export_v2(
-            self.resource_handle, self._key_dtype, self._value_dtype, name=name)
+    with ops.name_scope(name, "%s_Export" % self.name, [self.resource_handle]):
+      exported_keys, exported_values = gen_lookup_ops.lookup_table_export_v2(
+          self.resource_handle, self._key_dtype, self._value_dtype)
 
     exported_values.set_shape(exported_keys.get_shape().concatenate(
         self._value_shape))
     return exported_keys, exported_values
 
 
-class TableInitializerBase(checkpointable_base.CheckpointableBase):
+class TableInitializerBase(trackable_base.Trackable):
   """Base class for lookup table initializers."""
 
   def __init__(self, key_dtype, value_dtype):
@@ -339,6 +338,17 @@ class TableInitializerBase(checkpointable_base.CheckpointableBase):
     """Returns the table initialization op."""
     raise NotImplementedError
 
+  @property
+  def _shared_name(self):
+    """Returns a shared name to be used by the table."""
+    shared_name = ""
+    if context.executing_eagerly():
+      # Ensure a unique name when eager execution is enabled to avoid spurious
+      # sharing issues.
+      # TODO(rohanj): Use context.shared_name() instead.
+      shared_name += str(ops.uid())
+    return shared_name
+
 
 class KeyValueTensorInitializer(TableInitializerBase):
   """Table initializers given `keys` and `values` tensors."""
@@ -353,11 +363,15 @@ class KeyValueTensorInitializer(TableInitializerBase):
       value_dtype: The `values` data type. Used when `values` is a python array.
       name: A name for the operation (optional).
     """
-    with ops.name_scope(name, "key_value_init", [keys, values]) as scope:
-      self._keys = ops.convert_to_tensor(keys, dtype=key_dtype, name="keys")
-      self._values = ops.convert_to_tensor(
-          values, dtype=value_dtype, name="values")
-      self._name = scope
+    self._keys = ops.convert_to_tensor(keys, dtype=key_dtype, name="keys")
+    self._values = ops.convert_to_tensor(
+        values, dtype=value_dtype, name="values")
+    self._name = name if name is not None else "key_value_init"
+    if context.executing_eagerly():
+      # Ensure a unique name when eager execution is enabled to avoid spurious
+      # sharing issues.
+      # TODO(rohanj): Use context.shared_name() instead.
+      self._name += str(ops.uid())
 
     super(KeyValueTensorInitializer, self).__init__(self._keys.dtype,
                                                     self._values.dtype)
@@ -377,19 +391,14 @@ class KeyValueTensorInitializer(TableInitializerBase):
     """
     _check_table_dtypes(table, self._keys.dtype, self._values.dtype)
     with ops.name_scope(
-        self._name, values=(table.resource_handle, self._keys,
-                            self._values)) as scope:
-      if context.executing_eagerly():
-        # Ensure a unique name when eager execution is enabled to avoid spurious
-        # sharing issues.
-        scope += str(ops.uid())
+        self._name, values=(table.resource_handle, self._keys, self._values)):
       if fwd_compat.forward_compatible(2018, 9, 19):
         init_op = gen_lookup_ops.lookup_table_import_v2(
-            table.resource_handle, self._keys, self._values, name=scope)
+            table.resource_handle, self._keys, self._values)
       else:
         # To maintain forward compatibiltiy, use the old implementation.
-        init_op = gen_lookup_ops.initialize_table_v2(
-            table.resource_handle, self._keys, self._values, name=scope)
+        init_op = gen_lookup_ops.initialize_table_v2(table.resource_handle,
+                                                     self._keys, self._values)
     ops.add_to_collection(ops.GraphKeys.TABLE_INITIALIZERS, init_op)
     return init_op
 
@@ -498,6 +507,7 @@ class TextFileInitializer(TableInitializerBase):
     if not isinstance(filename, ops.Tensor) and not filename:
       raise ValueError("Filename required for %s." % name)
 
+    self._filename_arg = filename
     key_dtype = dtypes.as_dtype(key_dtype)
     value_dtype = dtypes.as_dtype(value_dtype)
 
@@ -530,8 +540,8 @@ class TextFileInitializer(TableInitializerBase):
     self._vocab_size = vocab_size
     self._delimiter = delimiter
     self._name = name
-    self._filename = self._track_checkpointable(
-        checkpointable.TrackableAsset(filename),
+    self._filename = self._track_trackable(
+        trackable.TrackableAsset(filename),
         "_filename")
 
     super(TextFileInitializer, self).__init__(key_dtype, value_dtype)
@@ -550,18 +560,12 @@ class TextFileInitializer(TableInitializerBase):
       key and value data types.
     """
     _check_table_dtypes(table, self.key_dtype, self.value_dtype)
-    with ops.name_scope(self._name, "text_file_init",
-                        (table.resource_handle,)) as scope:
+    with ops.name_scope(self._name, "text_file_init", (table.resource_handle,)):
       filename = ops.convert_to_tensor(
           self._filename, dtypes.string, name="asset_filepath")
       init_op = gen_lookup_ops.initialize_table_from_text_file_v2(
-          table.resource_handle,
-          filename,
-          self._key_index,
-          self._value_index,
-          -1 if self._vocab_size is None else self._vocab_size,
-          self._delimiter,
-          name=scope)
+          table.resource_handle, filename, self._key_index, self._value_index,
+          -1 if self._vocab_size is None else self._vocab_size, self._delimiter)
     ops.add_to_collection(ops.GraphKeys.TABLE_INITIALIZERS, init_op)
     # If the filename tensor is anything other than a string constant (e.g., if
     # it is a placeholder) then it does not make sense to track it as an asset.
@@ -569,6 +573,21 @@ class TextFileInitializer(TableInitializerBase):
       ops.add_to_collection(ops.GraphKeys.ASSET_FILEPATHS, filename)
     return init_op
 
+  @property
+  def _shared_name(self):
+    if self._vocab_size:
+      # Keep the shared_name:
+      # <table_type>_<filename>_<vocab_size>_<key_index>_<value_index>
+      shared_name = "hash_table_%s_%d_%s_%s" % (
+          self._filename_arg, self._vocab_size, self._key_index,
+          self._value_index)
+    else:
+      # Keep the shared_name
+      # <table_type>_<filename>_<key_index>_<value_index>
+      shared_name = "hash_table_%s_%s_%s" % (self._filename_arg,
+                                             self._key_index, self._value_index)
+    return shared_name
+
 
 class TextFileStringTableInitializer(TextFileInitializer):
   """Table initializer for `int64` IDs to string tables from a text file."""
@@ -819,7 +838,10 @@ class IdTableWithHashBuckets(LookupInterface):
       raise TypeError(
           "hasher_spec must be of type HasherSpec, got %s" % hasher_spec)
     self._hasher_spec = hasher_spec
-    self._table_name = name.split("/")[-1]
+    if name:
+      self._table_name = name.split("/")[-1]
+    else:
+      self._table_name = None
     super(IdTableWithHashBuckets, self).__init__(key_dtype, dtypes.int64)
 
   def create_resource(self):
@@ -857,9 +879,9 @@ class IdTableWithHashBuckets(LookupInterface):
 
   def size(self, name=None):
     """Compute the number of elements in this table."""
-    with ops.name_scope(name, "%s_Size" % self.name) as scope:
+    with ops.name_scope(name, "%s_Size" % self.name):
       if self._table:
-        tsize = self._table.size(scope)
+        tsize = self._table.size()
       else:
         tsize = ops.convert_to_tensor(0, dtype=dtypes.int64)
       return tsize + self._num_oov_buckets
@@ -905,7 +927,7 @@ class IdTableWithHashBuckets(LookupInterface):
       ids = self._table.lookup(values, name=name)
     else:
       # TODO(yleon): Consider moving this functionality to its own kernel.
-      with ops.name_scope(name, "%s_Lookup" % self.name) as scope:
+      with ops.name_scope(name, "%s_Lookup" % self.name):
         str_to_hash_bucket = self._get_string_to_hash_bucket_fn(
             self._hasher_spec)
         buckets = str_to_hash_bucket(
@@ -916,7 +938,7 @@ class IdTableWithHashBuckets(LookupInterface):
           ids = self._table.lookup(values)
           buckets = math_ops.add(buckets, self._table.size())
           is_id_non_default = math_ops.not_equal(ids, self._table.default_value)
-          ids = array_ops.where(is_id_non_default, ids, buckets, name=scope)
+          ids = array_ops.where(is_id_non_default, ids, buckets)
         else:
           ids = buckets
     if isinstance(keys, sparse_tensor.SparseTensor):
@@ -924,6 +946,191 @@ class IdTableWithHashBuckets(LookupInterface):
     return ids
 
 
+class StaticVocabularyTable(LookupInterface):
+  """String to Id table wrapper that assigns out-of-vocabulary keys to buckets.
+
+  For example, if an instance of `StaticVocabularyTable` is initialized with a
+  string-to-id initializer that maps:
+
+  * `emerson -> 0`
+  * `lake -> 1`
+  * `palmer -> 2`
+
+  The `Vocabulary` object will performs the following mapping:
+
+  * `emerson -> 0`
+  * `lake -> 1`
+  * `palmer -> 2`
+  * `<other term> -> bucket_id`, where bucket_id will be between `3` and
+  `3 + num_oov_buckets - 1`, calculated by:
+  `hash(<term>) % num_oov_buckets + vocab_size`
+
+  If input_tensor is `["emerson", "lake", "palmer", "king", "crimson"]`,
+  the lookup result is `[0, 1, 2, 4, 7]`.
+
+  If `initializer` is None, only out-of-vocabulary buckets are used.
+
+  Example usage:
+
+  ```python
+  num_oov_buckets = 3
+  input_tensor = tf.constant(["emerson", "lake", "palmer", "king", "crimnson"])
+  table = tf.lookup.StaticVocabularyTable(
+      tf.TextFileIdTableInitializer(filename), num_oov_buckets)
+  out = table.lookup(input_tensor).
+  table.init.run()
+  print(out.eval())
+  ```
+
+  The hash function used for generating out-of-vocabulary buckets ID is
+  Fingerprint64.
+  """
+
+  def __init__(self,
+               initializer,
+               num_oov_buckets,
+               lookup_key_dtype=None,
+               name=None):
+    """Construct a `StaticVocabularyTable` object.
+
+    Args:
+      initializer: A TableInitializerBase object that contains the data used to
+        initialize the table. If None, then we only use out-of-vocab buckets.
+      num_oov_buckets: Number of buckets to use for out-of-vocabulary keys. Must
+        be greater than zero.
+      lookup_key_dtype: Data type of keys passed to `lookup`. Defaults to
+        `initializer.key_dtype` if `initializer` is specified, otherwise
+        `tf.string`. Must be string or integer, and must be castable to
+        `initializer.key_dtype`.
+      name: A name for the operation (optional).
+
+    Raises:
+      ValueError: when `num_oov_buckets` is not positive.
+      TypeError: when lookup_key_dtype or initializer.key_dtype are not
+        integer or string. Also when initializer.value_dtype != int64.
+    """
+    if num_oov_buckets <= 0:
+      raise ValueError("oov_buckets must be > 0.")
+    # If a name ends with a '/' it is a "name scope", remove all trailing '/'
+    # characters to use as table name.
+    if name:
+      name = name.rstrip("/")
+    if initializer:
+      if lookup_key_dtype is None:
+        lookup_key_dtype = initializer.key_dtype
+      supported_table_key_dtypes = (dtypes.int64, dtypes.string)
+      if initializer.key_dtype not in supported_table_key_dtypes:
+        raise TypeError("Invalid key dtype, expected one of %s, but got %s." %
+                        (supported_table_key_dtypes, initializer.key_dtype))
+      if initializer.key_dtype.is_integer != lookup_key_dtype.is_integer:
+        raise TypeError(
+            "Invalid key dtype, expected %s but got %s." %
+            ("integer" if lookup_key_dtype.is_integer else "non-integer",
+             initializer.key_dtype))
+      if initializer.value_dtype != dtypes.int64:
+        raise TypeError("Invalid value dtype, expected %s but got %s." %
+                        (dtypes.int64, initializer.value_dtype))
+      self._table = HashTable(initializer, default_value=-1)
+      name = name or self._table.name
+    else:
+      lookup_key_dtype = dtypes.string
+      self._table = None
+      name = name or "hash_bucket"
+    if (not lookup_key_dtype.is_integer) and (dtypes.string !=
+                                              lookup_key_dtype):
+      raise TypeError("Invalid key_dtype, expected integer or string, got %s." %
+                      lookup_key_dtype)
+    self._num_oov_buckets = num_oov_buckets
+
+    self._table_name = None
+    if name is not None:
+      self._table_name = name.split("/")[-1]
+    super(StaticVocabularyTable, self).__init__(lookup_key_dtype, dtypes.int64)
+
+  def create_resource(self):
+    if self._table is not None:
+      return self._table.create_resource()
+    return None
+
+  def initialize(self):
+    if self._table is not None:
+      return self._table.initialize()
+    with ops.name_scope(None, "init"):
+      return control_flow_ops.no_op()
+
+  @property
+  def initializer(self):
+    if self._table is not None:
+      return self._table._init_op  # pylint: disable=protected-access
+    with ops.name_scope(None, "init"):
+      return control_flow_ops.no_op()
+
+  @property
+  @deprecated("2018-12-15", "Use `initializer` instead.")
+  def init(self):
+    return self.initializer
+
+  @property
+  def resource_handle(self):
+    if self._table is not None:
+      return self._table.resource_handle
+    return None
+
+  @property
+  def name(self):
+    return self._table_name
+
+  def size(self, name=None):
+    """Compute the number of elements in this table."""
+    with ops.name_scope(name, "%s_Size" % self.name):
+      if self._table:
+        tsize = self._table.size()
+      else:
+        tsize = ops.convert_to_tensor(0, dtype=dtypes.int64)
+      return tsize + self._num_oov_buckets
+
+  def lookup(self, keys, name=None):
+    """Looks up `keys` in the table, outputs the corresponding values.
+
+    It assigns out-of-vocabulary keys to buckets based in their hashes.
+
+    Args:
+      keys: Keys to look up. May be either a `SparseTensor` or dense `Tensor`.
+      name: Optional name for the op.
+
+    Returns:
+      A `SparseTensor` if keys are sparse, otherwise a dense `Tensor`.
+
+    Raises:
+      TypeError: when `keys` doesn't match the table key data type.
+    """
+    if keys.dtype.base_dtype != self._key_dtype:
+      raise TypeError("Signature mismatch. Keys must be dtype %s, got %s." %
+                      (self._key_dtype, keys.dtype))
+    values = keys
+    if isinstance(keys, sparse_tensor.SparseTensor):
+      values = keys.values
+    if self._table and (self._table.key_dtype.base_dtype == dtypes.int64):
+      values = math_ops.to_int64(values)
+
+    # TODO(yleon): Consider moving this functionality to its own kernel.
+    with ops.name_scope(name, "%s_Lookup" % self.name):
+      buckets = string_ops.string_to_hash_bucket_fast(
+          _as_string(values),
+          num_buckets=self._num_oov_buckets,
+          name="hash_bucket")
+      if self._table:
+        ids = self._table.lookup(values)
+        buckets = math_ops.add(buckets, self._table.size())
+        is_id_non_default = math_ops.not_equal(ids, self._table.default_value)
+        ids = array_ops.where(is_id_non_default, ids, buckets)
+      else:
+        ids = buckets
+    if isinstance(keys, sparse_tensor.SparseTensor):
+      return sparse_tensor.SparseTensor(keys.indices, ids, keys.dense_shape)
+    return ids
+
+
 def index_table_from_file(vocabulary_file=None,
                           num_oov_buckets=0,
                           vocab_size=None,
@@ -948,7 +1155,7 @@ def index_table_from_file(vocabulary_file=None,
   `[vocabulary size, vocabulary size + num_oov_buckets - 1]`.
 
   The underlying table must be initialized by calling
-  `tf.tables_initializer.run()` or `table.init.run()` once.
+  `session.run(tf.tables_initializer)` or `session.run(table.init)` once.
 
   To specify multi-column vocabulary files, use key_column_index and
   value_column_index and delimiter.
@@ -1020,22 +1227,9 @@ def index_table_from_file(vocabulary_file=None,
   if (not key_dtype.is_integer) and (dtypes.string != key_dtype.base_dtype):
     raise TypeError("Only integer and string keys are supported.")
 
-  with ops.name_scope(name, "string_to_index") as feat_to_id_scope:
+  with ops.name_scope(name, "string_to_index"):
     table = None
-    shared_name = ""
-    with ops.name_scope(None, "hash_table") as hash_table_scope:
-      if vocab_size:
-        # Keep the shared_name:
-        # <table_type>_<filename>_<vocab_size>_<key_index>_<value_index>
-        shared_name = "hash_table_%s_%d_%s_%s" % (vocabulary_file, vocab_size,
-                                                  key_column_index,
-                                                  value_column_index)
-      else:
-        # Keep the shared_name
-        # <table_type>_<filename>_<key_index>_<value_index>
-        shared_name = "hash_table_%s_%s_%s" % (vocabulary_file,
-                                               key_column_index,
-                                               value_column_index)
+    with ops.name_scope(None, "hash_table"):
       init = TextFileIdTableInitializer(
           vocabulary_file,
           vocab_size=vocab_size,
@@ -1045,14 +1239,12 @@ def index_table_from_file(vocabulary_file=None,
           value_column_index=value_column_index,
           delimiter=delimiter)
 
-      table = HashTable(
-          init, default_value, shared_name=shared_name, name=hash_table_scope)
+      table = HashTable(init, default_value)
     if num_oov_buckets:
       table = IdTableWithHashBuckets(
           table,
           num_oov_buckets=num_oov_buckets,
           hasher_spec=hasher_spec,
-          name=feat_to_id_scope,
           key_dtype=key_dtype)
 
     return table
@@ -1077,7 +1269,7 @@ def index_table_from_tensor(vocabulary_list,
   `[vocabulary list size, vocabulary list size + num_oov_buckets - 1]`.
 
   The underlying table must be initialized by calling
-  `tf.tables_initializer.run()` or `table.init.run()` once.
+  `session.run(tf.tables_initializer)` or `session.run(table.init)` once.
 
   Elements in `vocabulary_list` cannot have duplicates, otherwise when executing
   the table initializer op, it will throw a `FailedPreconditionError`.
@@ -1125,7 +1317,7 @@ def index_table_from_tensor(vocabulary_list,
   if (not dtype.is_integer) and (dtypes.string != dtype.base_dtype):
     raise TypeError("Only integer and string keys are supported.")
 
-  with ops.name_scope(name, "string_to_index") as feat_to_id_scope:
+  with ops.name_scope(name, "string_to_index"):
     keys = ops.convert_to_tensor(vocabulary_list)
     if keys.dtype.is_integer != dtype.is_integer:
       raise ValueError("Expected %s, got %s." %
@@ -1136,12 +1328,7 @@ def index_table_from_tensor(vocabulary_list,
     num_elements = array_ops.size(keys)
     values = math_ops.to_int64(math_ops.range(num_elements))
 
-    shared_name = ""
-    with ops.name_scope(None, "hash_table") as hash_table_scope:
-      if context.executing_eagerly():
-        # Ensure a unique name when eager execution is enabled to avoid spurious
-        # sharing issues.
-        shared_name += str(ops.uid())
+    with ops.name_scope(None, "hash_table"):
       table_keys = math_ops.to_int64(keys) if keys.dtype.is_integer else keys
       init = KeyValueTensorInitializer(
           table_keys,
@@ -1149,14 +1336,12 @@ def index_table_from_tensor(vocabulary_list,
           table_keys.dtype.base_dtype,
           dtypes.int64,
           name="table_init")
-      table = HashTable(
-          init, default_value, shared_name=shared_name, name=hash_table_scope)
+      table = HashTable(init, default_value)
     if num_oov_buckets:
       table = IdTableWithHashBuckets(
           table,
           num_oov_buckets=num_oov_buckets,
           hasher_spec=hasher_spec,
-          name=feat_to_id_scope,
           key_dtype=dtype)
     return table
 
@@ -1179,7 +1364,7 @@ def index_to_string_table_from_file(vocabulary_file,
   (an out-of-vocabulary entry) is assigned the `default_value`
 
   The underlying table must be initialized by calling
-  `tf.tables_initializer.run()` or `table.init.run()` once.
+  `session.run(tf.tables_initializer)` or `session.run(table.init)` once.
 
   To specify multi-column vocabulary files, use key_column_index and
   value_column_index and delimiter.
@@ -1238,18 +1423,7 @@ def index_to_string_table_from_file(vocabulary_file,
   if vocab_size is not None and vocab_size < 1:
     raise ValueError("vocab_size must be greater than 0, got %d." % vocab_size)
 
-  with ops.name_scope(name, "index_to_string") as scope:
-    shared_name = ""
-    if vocab_size:
-      # Keep a shared_name
-      # <table_type>_<filename>_<vocab_size>_<key_index>_<value_index>
-      shared_name = "hash_table_%s_%d_%s_%s" % (vocabulary_file, vocab_size,
-                                                key_column_index,
-                                                value_column_index)
-    else:
-      # Keep a shared_name <table_type>_<filename>_<key_index>_<value_index>
-      shared_name = "hash_table_%s_%s_%s" % (vocabulary_file, key_column_index,
-                                             value_column_index)
+  with ops.name_scope(name, "index_to_string"):
     init = TextFileStringTableInitializer(
         vocabulary_file,
         vocab_size=vocab_size,
@@ -1259,7 +1433,7 @@ def index_to_string_table_from_file(vocabulary_file,
         delimiter=delimiter)
 
     # TODO(yleon): Use a more effienct structure.
-    return HashTable(init, default_value, shared_name=shared_name, name=scope)
+    return HashTable(init, default_value)
 
 
 def index_to_string_table_from_tensor(vocabulary_list,
@@ -1276,7 +1450,7 @@ def index_to_string_table_from_tensor(vocabulary_list,
   (an out-of-vocabulary entry) is assigned the `default_value`
 
   The underlying table must be initialized by calling
-  `tf.tables_initializer.run()` or `table.init.run()` once.
+  `session.run(tf.tables_initializer)` or `session.run(table.init)` once.
 
   Elements in `vocabulary_list` cannot have duplicates, otherwise when executing
   the table initializer op, it will throw a `FailedPreconditionError`.
@@ -1312,16 +1486,501 @@ def index_to_string_table_from_tensor(vocabulary_list,
   if vocabulary_list is None:
     raise ValueError("vocabulary_list must be specified.")
 
-  with ops.name_scope(name, "index_to_string") as scope:
+  with ops.name_scope(name, "index_to_string"):
     vocabulary_list = ops.convert_to_tensor(vocabulary_list, dtypes.string)
     num_elements = array_ops.size(vocabulary_list)
     keys = math_ops.to_int64(math_ops.range(num_elements))
 
-    shared_name = ""
     init = KeyValueTensorInitializer(
         keys, vocabulary_list, dtypes.int64, dtypes.string, name="table_init")
     # TODO(yleon): Use a more effienct structure.
-    return HashTable(init, default_value, shared_name=shared_name, name=scope)
+    return HashTable(init, default_value)
+
+
+class MutableHashTable(LookupInterface):
+  """A generic mutable hash table implementation.
+
+  Data can be inserted by calling the insert method and removed by calling the
+  remove method. It does not support initialization via the init method.
+
+  Example usage:
+
+  ```python
+  table = tf.lookup.MutableHashTable(key_dtype=tf.string, value_dtype=tf.int64,
+                                     default_value=-1)
+  sess.run(table.insert(keys, values))
+  out = table.lookup(query_keys)
+  print(out.eval())
+  ```
+  """
+
+  def __init__(self,
+               key_dtype,
+               value_dtype,
+               default_value,
+               name="MutableHashTable",
+               checkpoint=True):
+    """Creates an empty `MutableHashTable` object.
+
+    Creates a table, the type of its keys and values are specified by key_dtype
+    and value_dtype, respectively.
+
+    Args:
+      key_dtype: the type of the key tensors.
+      value_dtype: the type of the value tensors.
+      default_value: The value to use if a key is missing in the table.
+      name: A name for the operation (optional).
+      checkpoint: if True, the contents of the table are saved to and restored
+        from checkpoints. If `shared_name` is empty for a checkpointed table, it
+        is shared using the table node name.
+
+    Returns:
+      A `MutableHashTable` object.
+
+    Raises:
+      ValueError: If checkpoint is True and no name was specified.
+    """
+    self._default_value = ops.convert_to_tensor(
+        default_value, dtype=value_dtype)
+    self._value_shape = self._default_value.get_shape()
+    self._checkpoint = checkpoint
+    self._key_dtype = key_dtype
+    self._value_dtype = value_dtype
+    self._name = name
+
+    self._shared_name = None
+    if context.executing_eagerly():
+      # TODO(allenl): This will leak memory due to kernel caching by the
+      # shared_name attribute value (but is better than the alternative of
+      # sharing everything by default when executing eagerly; hopefully creating
+      # tables in a loop is uncommon).
+      # TODO(rohanj): Use context.shared_name() instead.
+      self._shared_name = "table_%d" % (ops.uid(),)
+    super(MutableHashTable, self).__init__(key_dtype, value_dtype)
+
+    self._resource_handle = self.create_resource()
+    if checkpoint:
+      saveable = MutableHashTable._Saveable(self, name)
+      if not context.executing_eagerly():
+        ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
+
+  def create_resource(self):
+    # The table must be shared if checkpointing is requested for multi-worker
+    # training to work correctly. Use the node name if no shared_name has been
+    # explicitly specified.
+    use_node_name_sharing = self._checkpoint and self._shared_name is None
+    if self._default_value.get_shape().ndims == 0:
+      table_ref = gen_lookup_ops.mutable_hash_table_v2(
+          shared_name=self._shared_name,
+          use_node_name_sharing=use_node_name_sharing,
+          key_dtype=self._key_dtype,
+          value_dtype=self._value_dtype,
+          name=self._name)
+    else:
+      table_ref = gen_lookup_ops.mutable_hash_table_of_tensors_v2(
+          shared_name=self._shared_name,
+          use_node_name_sharing=use_node_name_sharing,
+          key_dtype=self._key_dtype,
+          value_dtype=self._value_dtype,
+          value_shape=self._default_value.get_shape(),
+          name=self._name)
+
+    if context.executing_eagerly():
+      self._table_name = None
+    else:
+      self._table_name = table_ref.op.name.split("/")[-1]
+    return table_ref
+
+  @property
+  def name(self):
+    return self._table_name
+
+  def size(self, name=None):
+    """Compute the number of elements in this table.
+
+    Args:
+      name: A name for the operation (optional).
+
+    Returns:
+      A scalar tensor containing the number of elements in this table.
+    """
+    with ops.name_scope(name, "%s_Size" % self.name, [self.resource_handle]):
+      with ops.colocate_with(self.resource_handle):
+        return gen_lookup_ops.lookup_table_size_v2(self.resource_handle)
+
+  def remove(self, keys, name=None):
+    """Removes `keys` and its associated values from the table.
+
+    If a key is not present in the table, it is silently ignored.
+
+    Args:
+      keys: Keys to remove. Can be a tensor of any shape. Must match the table's
+        key type.
+      name: A name for the operation (optional).
+
+    Returns:
+      The created Operation.
+
+    Raises:
+      TypeError: when `keys` do not match the table data types.
+    """
+    if keys.dtype != self._key_dtype:
+      raise TypeError("Signature mismatch. Keys must be dtype %s, got %s." %
+                      (self._key_dtype, keys.dtype))
+
+    with ops.name_scope(name, "%s_lookup_table_remove" % self.name,
+                        (self.resource_handle, keys, self._default_value)):
+      op = gen_lookup_ops.lookup_table_remove_v2(self.resource_handle, keys)
+
+    return op
+
+  def lookup(self, keys, name=None):
+    """Looks up `keys` in a table, outputs the corresponding values.
+
+    The `default_value` is used for keys not present in the table.
+
+    Args:
+      keys: Keys to look up. Can be a tensor of any shape. Must match the
+        table's key_dtype.
+      name: A name for the operation (optional).
+
+    Returns:
+      A tensor containing the values in the same shape as `keys` using the
+        table's value type.
+
+    Raises:
+      TypeError: when `keys` do not match the table data types.
+    """
+    with ops.name_scope(name, "%s_lookup_table_find" % self.name,
+                        (self.resource_handle, keys, self._default_value)):
+      keys = ops.convert_to_tensor(keys, dtype=self._key_dtype, name="keys")
+      with ops.colocate_with(self.resource_handle):
+        values = gen_lookup_ops.lookup_table_find_v2(self.resource_handle, keys,
+                                                     self._default_value)
+    return values
+
+  def insert(self, keys, values, name=None):
+    """Associates `keys` with `values`.
+
+    Args:
+      keys: Keys to insert. Can be a tensor of any shape. Must match the table's
+        key type.
+      values: Values to be associated with keys. Must be a tensor of the same
+        shape as `keys` and match the table's value type.
+      name: A name for the operation (optional).
+
+    Returns:
+      The created Operation.
+
+    Raises:
+      TypeError: when `keys` or `values` doesn't match the table data
+        types.
+    """
+    with ops.name_scope(name, "%s_lookup_table_insert" % self.name,
+                        [self.resource_handle, keys, values]):
+      keys = ops.convert_to_tensor(keys, self._key_dtype, name="keys")
+      values = ops.convert_to_tensor(values, self._value_dtype, name="values")
+      with ops.colocate_with(self.resource_handle):
+        # pylint: disable=protected-access
+        op = gen_lookup_ops.lookup_table_insert_v2(self.resource_handle, keys,
+                                                   values)
+    return op
+
+  def export(self, name=None):
+    """Returns tensors of all keys and values in the table.
+
+    Args:
+      name: A name for the operation (optional).
+
+    Returns:
+      A pair of tensors with the first tensor containing all keys and the
+        second tensors containing all values in the table.
+    """
+    with ops.name_scope(name, "%s_lookup_table_export_values" % self.name,
+                        [self.resource_handle]):
+      with ops.colocate_with(self.resource_handle):
+        exported_keys, exported_values = gen_lookup_ops.lookup_table_export_v2(
+            self.resource_handle, self._key_dtype, self._value_dtype)
+    return exported_keys, exported_values
+
+  def _gather_saveables_for_checkpoint(self):
+    """For object-based checkpointing."""
+    return {"table": functools.partial(MutableHashTable._Saveable, table=self)}
+
+  class _Saveable(BaseSaverBuilder.SaveableObject):
+    """SaveableObject implementation for MutableHashTable."""
+
+    def __init__(self, table, name):
+      tensors = table.export()
+      specs = [
+          BaseSaverBuilder.SaveSpec(tensors[0], "", name + "-keys"),
+          BaseSaverBuilder.SaveSpec(tensors[1], "", name + "-values")
+      ]
+      # pylint: disable=protected-access
+      super(MutableHashTable._Saveable, self).__init__(table, specs, name)
+
+    def restore(self, restored_tensors, restored_shapes, name=None):
+      del restored_shapes  # unused
+      # pylint: disable=protected-access
+      with ops.name_scope(name, "%s_table_restore" % self.name):
+        with ops.colocate_with(self.op.resource_handle):
+          return gen_lookup_ops.lookup_table_import_v2(
+              self.op.resource_handle, restored_tensors[0], restored_tensors[1])
+
+
+class MutableDenseHashTable(LookupInterface):
+  """A generic mutable hash table implementation using tensors as backing store.
+
+  Data can be inserted by calling the insert method and removed by calling the
+  remove method. It does not support initialization via the init method.
+
+  It uses "open addressing" with quadratic reprobing to resolve collisions.
+  Compared to `MutableHashTable` the insert, remove and lookup operations in a
+  `MutableDenseHashTable` are typically faster, but memory usage can be higher.
+  However, `MutableDenseHashTable` does not require additional memory for
+  temporary tensors created during checkpointing and restore operations.
+
+  Example usage:
+
+  ```python
+  table = tf.lookup.MutableDenseHashTable(key_dtype=tf.int64,
+                                          value_dtype=tf.int64,
+                                          default_value=-1,
+                                          empty_key=0,
+                                          deleted_key=-1)
+
+  sess.run(table.insert(keys, values))
+  out = table.lookup(query_keys)
+  print(out.eval())
+  ```
+  """
+
+  # TODO(andreasst): consider extracting common code with MutableHashTable into
+  # a common superclass.
+  def __init__(self,
+               key_dtype,
+               value_dtype,
+               default_value,
+               empty_key,
+               deleted_key,
+               initial_num_buckets=None,
+               name="MutableDenseHashTable",
+               checkpoint=True):
+    """Creates an empty `MutableDenseHashTable` object.
+
+    Creates a table, the type of its keys and values are specified by key_dtype
+    and value_dtype, respectively.
+
+    Args:
+      key_dtype: the type of the key tensors.
+      value_dtype: the type of the value tensors.
+      default_value: The value to use if a key is missing in the table.
+      empty_key: the key to use to represent empty buckets internally. Must not
+        be used in insert, remove or lookup operations.
+      deleted_key: the key to use to represent deleted buckets internally. Must
+        not be used in insert, remove or lookup operations and be different from
+        the empty_key.
+      initial_num_buckets: the initial number of buckets.
+      name: A name for the operation (optional).
+      checkpoint: if True, the contents of the table are saved to and restored
+        from checkpoints. If `shared_name` is empty for a checkpointed table, it
+        is shared using the table node name.
+
+    Returns:
+      A `MutableDenseHashTable` object.
+
+    Raises:
+      ValueError: If checkpoint is True and no name was specified.
+    """
+    self._default_value = ops.convert_to_tensor(
+        default_value, dtype=value_dtype, name="default_value")
+    self._key_dtype = key_dtype
+    self._value_dtype = value_dtype
+    self._initial_num_buckets = initial_num_buckets
+    self._value_shape = self._default_value.get_shape()
+    self._checkpoint = checkpoint
+    self._name = name
+
+    self._empty_key = ops.convert_to_tensor(
+        empty_key, dtype=key_dtype, name="empty_key")
+    self._deleted_key = ops.convert_to_tensor(
+        deleted_key, dtype=key_dtype, name="deleted_key")
+    self._shared_name = None
+    if context.executing_eagerly():
+      # TODO(allenl): This will leak memory due to kernel caching by the
+      # shared_name attribute value (but is better than the alternative of
+      # sharing everything by default when executing eagerly; hopefully creating
+      # tables in a loop is uncommon).
+      # TODO(rohanj): Use context.shared_name() instead.
+      self._shared_name = "table_%d" % (ops.uid(),)
+    super(MutableDenseHashTable, self).__init__(key_dtype, value_dtype)
+
+    self._resource_handle = self.create_resource()
+    if checkpoint:
+      saveable = MutableDenseHashTable._Saveable(self, name)
+      if not context.executing_eagerly():
+        ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
+
+  def create_resource(self):
+    # The table must be shared if checkpointing is requested for multi-worker
+    # training to work correctly. Use the node name if no shared_name has been
+    # explicitly specified.
+    use_node_name_sharing = self._checkpoint and self._shared_name is None
+    table_ref = gen_lookup_ops.mutable_dense_hash_table_v2(
+        empty_key=self._empty_key,
+        deleted_key=self._deleted_key,
+        shared_name=self._shared_name,
+        use_node_name_sharing=use_node_name_sharing,
+        value_dtype=self._value_dtype,
+        value_shape=self._value_shape,
+        initial_num_buckets=self._initial_num_buckets,
+        name=self._name)
+    if context.executing_eagerly():
+      self._table_name = None
+    else:
+      self._table_name = table_ref.op.name.split("/")[-1]
+    return table_ref
+
+  @property
+  def name(self):
+    return self._table_name
+
+  def size(self, name=None):
+    """Compute the number of elements in this table.
+
+    Args:
+      name: A name for the operation (optional).
+
+    Returns:
+      A scalar tensor containing the number of elements in this table.
+    """
+    with ops.name_scope(name, "%s_Size" % self.name, [self.resource_handle]):
+      with ops.colocate_with(self.resource_handle):
+        return gen_lookup_ops.lookup_table_size_v2(self.resource_handle)
+
+  def lookup(self, keys, name=None):
+    """Looks up `keys` in a table, outputs the corresponding values.
+
+    The `default_value` is used for keys not present in the table.
+
+    Args:
+      keys: Keys to look up. Can be a tensor of any shape. Must match the
+        table's key_dtype.
+      name: A name for the operation (optional).
+
+    Returns:
+      A tensor containing the values in the same shape as `keys` using the
+        table's value type.
+
+    Raises:
+      TypeError: when `keys` do not match the table data types.
+    """
+    with ops.name_scope(name, "%s_lookup_table_find" % self.name,
+                        [self.resource_handle, keys]):
+      keys = ops.convert_to_tensor(keys, dtype=self._key_dtype, name="keys")
+      with ops.colocate_with(self.resource_handle):
+        values = gen_lookup_ops.lookup_table_find_v2(self.resource_handle, keys,
+                                                     self._default_value)
+
+    return values
+
+  def insert(self, keys, values, name=None):
+    """Associates `keys` with `values`.
+
+    Args:
+      keys: Keys to insert. Can be a tensor of any shape. Must match the table's
+        key type.
+      values: Values to be associated with keys. Must be a tensor of the same
+        shape as `keys` and match the table's value type.
+      name: A name for the operation (optional).
+
+    Returns:
+      The created Operation.
+
+    Raises:
+      TypeError: when `keys` or `values` doesn't match the table data
+        types.
+    """
+    with ops.name_scope(name, "%s_lookup_table_insert" % self.name,
+                        [self.resource_handle, keys, values]):
+      keys = ops.convert_to_tensor(keys, dtype=self._key_dtype, name="keys")
+      values = ops.convert_to_tensor(
+          values, dtype=self._value_dtype, name="values")
+      with ops.colocate_with(self.resource_handle):
+        op = gen_lookup_ops.lookup_table_insert_v2(self.resource_handle, keys,
+                                                   values)
+      return op
+
+  def remove(self, keys, name=None):
+    """Removes `keys` and its associated values from the table.
+
+    If a key is not present in the table, it is silently ignored.
+
+    Args:
+      keys: Keys to remove. Can be a tensor of any shape. Must match the table's
+        key type.
+      name: A name for the operation (optional).
+
+    Returns:
+      The created Operation.
+
+    Raises:
+      TypeError: when `keys` do not match the table data types.
+    """
+    if keys.dtype != self._key_dtype:
+      raise TypeError("Signature mismatch. Keys must be dtype %s, got %s." %
+                      (self._key_dtype, keys.dtype))
+
+    with ops.name_scope(name, "%s_lookup_table_remove" % self.name,
+                        (self.resource_handle, keys, self._default_value)):
+      # pylint: disable=protected-access
+      op = gen_lookup_ops.lookup_table_remove_v2(self.resource_handle, keys)
+
+    return op
+
+  def export(self, name=None):
+    """Returns tensors of all keys and values in the table.
+
+    Args:
+      name: A name for the operation (optional).
+
+    Returns:
+      A pair of tensors with the first tensor containing all keys and the
+        second tensors containing all values in the table.
+    """
+    with ops.name_scope(name, "%s_lookup_table_export_values" % self.name,
+                        [self.resource_handle]):
+      with ops.colocate_with(self.resource_handle):
+        exported_keys, exported_values = gen_lookup_ops.lookup_table_export_v2(
+            self.resource_handle, self._key_dtype, self._value_dtype)
+
+    return exported_keys, exported_values
+
+  def _gather_saveables_for_checkpoint(self):
+    """For object-based checkpointing."""
+    return {
+        "table": functools.partial(MutableDenseHashTable._Saveable, table=self)
+    }
+
+  class _Saveable(BaseSaverBuilder.SaveableObject):
+    """SaveableObject implementation for MutableDenseHashTable."""
+
+    def __init__(self, table, name):
+      tensors = table.export()
+      specs = [
+          BaseSaverBuilder.SaveSpec(tensors[0], "", name + "-keys"),
+          BaseSaverBuilder.SaveSpec(tensors[1], "", name + "-values")
+      ]
+      # pylint: disable=protected-access
+      super(MutableDenseHashTable._Saveable, self).__init__(table, specs, name)
+
+    def restore(self, restored_tensors, restored_shapes, name=None):
+      del restored_shapes  # unused
+      # pylint: disable=protected-access
+      with ops.name_scope(name, "%s_table_restore" % self.name):
+        with ops.colocate_with(self.op.resource_handle):
+          return gen_lookup_ops.lookup_table_import_v2(
+              self.op.resource_handle, restored_tensors[0], restored_tensors[1])
 
 
 ops.NotDifferentiable("LookupTableFind")
diff --git a/tensorflow/python/ops/losses/BUILD b/tensorflow/python/ops/losses/BUILD
index 4aea0265a72dcd2b2358f063fb0a51a5877076e7..9155d890ded7782ef7d64e631540e98e07f34a80 100644
--- a/tensorflow/python/ops/losses/BUILD
+++ b/tensorflow/python/ops/losses/BUILD
@@ -29,6 +29,7 @@ py_library(
         "//tensorflow/python:platform",
         "//tensorflow/python:util",
         "//tensorflow/python:weights_broadcast_ops",
+        "//tensorflow/python/distribute:distribute_lib",
     ],
 )
 
diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py
index 20397612bca9a9b81d9816ac1626ce15024d45f6..6cd1d8e5f8baf39f9051f95ead2f47d30826c945 100644
--- a/tensorflow/python/ops/losses/losses_impl.py
+++ b/tensorflow/python/ops/losses/losses_impl.py
@@ -18,7 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import confusion_matrix
@@ -33,31 +35,6 @@ from tensorflow.python.util.deprecation import deprecated_argument_lookup
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("losses.Reduction", "keras.losses.Reduction", v1=[])
-class ReductionV2(object):
-  """Types of loss reduction.
-
-  Contains the following values:
-
-  * `NONE`: Un-reduced weighted losses with the same shape as input.
-  * `SUM`: Scalar sum of weighted losses.
-  * `SUM_OVER_BATCH_SIZE`: Scalar `SUM` divided by number of elements in losses.
-  """
-
-  NONE = "none"
-  SUM = "sum"
-  SUM_OVER_BATCH_SIZE = "sum_over_batch_size"
-
-  @classmethod
-  def all(cls):
-    return (cls.NONE, cls.SUM, cls.SUM_OVER_BATCH_SIZE)
-
-  @classmethod
-  def validate(cls, key):
-    if key not in cls.all():
-      raise ValueError("Invalid Reduction Key %s." % key)
-
-
 @tf_export(v1=["losses.Reduction"])
 class Reduction(object):
   """Types of loss reduction.
@@ -68,8 +45,13 @@ class Reduction(object):
   * `SUM`: Scalar sum of weighted losses.
   * `MEAN`: Scalar `SUM` divided by sum of weights. DEPRECATED.
   * `SUM_OVER_BATCH_SIZE`: Scalar `SUM` divided by number of elements in losses.
+     Note that when using `tf.distribute.Strategy`, this is the global batch
+     size across all the replicas that are contributing to a single step.
   * `SUM_OVER_NONZERO_WEIGHTS`: Scalar `SUM` divided by number of non-zero
      weights. DEPRECATED.
+     Note that when using `tf.distribute.Strategy`, this is scaled by the
+     number of replicas that are contributing to a single step to get an
+     approximation to the global batch size.
   * `SUM_BY_NONZERO_WEIGHTS`: Same as `SUM_OVER_NONZERO_WEIGHTS`.
   """
 
@@ -138,7 +120,7 @@ def _num_present(losses, weights, per_batch=False):
        and not math_ops.equal(weights, 0.0))):
     return _num_elements(losses)
   with ops.name_scope(None, "num_present", (losses, weights)) as scope:
-    weights = math_ops.to_float(weights)
+    weights = math_ops.cast(weights, dtype=dtypes.float32)
     present = array_ops.where(
         math_ops.equal(weights, 0.0),
         array_ops.zeros_like(weights),
@@ -197,31 +179,28 @@ def compute_weighted_loss(
   """
   Reduction.validate(reduction)
   with ops.name_scope(scope, "weighted_loss", (losses, weights)):
-    # Save the `reduction` argument for loss normalization when distributing
-    # to multiple replicas.
-    # TODO(josh11b): Associate it with the returned op for more precision.
-    ops.get_default_graph()._last_loss_reduction = reduction  # pylint: disable=protected-access
-
     with ops.control_dependencies((
         weights_broadcast_ops.assert_broadcastable(weights, losses),)):
       losses = ops.convert_to_tensor(losses)
       input_dtype = losses.dtype
-      losses = math_ops.to_float(losses)
-      weights = math_ops.to_float(weights)
+      losses = math_ops.cast(losses, dtype=dtypes.float32)
+      weights = math_ops.cast(weights, dtype=dtypes.float32)
       weighted_losses = math_ops.multiply(losses, weights)
       if reduction == Reduction.NONE:
         loss = weighted_losses
       else:
         loss = math_ops.reduce_sum(weighted_losses)
+        num_replicas = (  # Used to convert from local to global batch size.
+            distribution_strategy_context.get_strategy().num_replicas_in_sync)
         if reduction == Reduction.MEAN:
-          loss = _safe_mean(
-              loss,
-              math_ops.reduce_sum(array_ops.ones_like(losses) * weights))
+          denom = (num_replicas *
+                   math_ops.reduce_sum(array_ops.ones_like(losses) * weights))
+          loss = _safe_mean(loss, denom)
         elif (reduction == Reduction.SUM_BY_NONZERO_WEIGHTS or
               reduction == Reduction.SUM_OVER_NONZERO_WEIGHTS):
-          loss = _safe_mean(loss, _num_present(losses, weights))
+          loss = _safe_mean(loss, num_replicas * _num_present(losses, weights))
         elif reduction == Reduction.SUM_OVER_BATCH_SIZE:
-          loss = _safe_mean(loss, _num_elements(losses))
+          loss = _safe_mean(loss, num_replicas * _num_elements(losses))
 
       # Convert the result back to the input type.
       loss = math_ops.cast(loss, input_dtype)
@@ -274,8 +253,8 @@ def absolute_difference(
     raise ValueError("predictions must not be None.")
   with ops.name_scope(scope, "absolute_difference",
                       (predictions, labels, weights)) as scope:
-    predictions = math_ops.to_float(predictions)
-    labels = math_ops.to_float(labels)
+    predictions = math_ops.cast(predictions, dtype=dtypes.float32)
+    labels = math_ops.cast(labels, dtype=dtypes.float32)
     predictions.get_shape().assert_is_compatible_with(labels.get_shape())
     losses = math_ops.abs(math_ops.subtract(predictions, labels))
     return compute_weighted_loss(
@@ -328,8 +307,8 @@ def cosine_distance(
     raise ValueError("predictions must not be None.")
   with ops.name_scope(scope, "cosine_distance_loss",
                       (predictions, labels, weights)) as scope:
-    predictions = math_ops.to_float(predictions)
-    labels = math_ops.to_float(labels)
+    predictions = math_ops.cast(predictions, dtype=dtypes.float32)
+    labels = math_ops.cast(labels, dtype=dtypes.float32)
     predictions.get_shape().assert_is_compatible_with(labels.get_shape())
 
     radial_diffs = math_ops.multiply(predictions, labels)
@@ -376,8 +355,8 @@ def hinge_loss(labels, logits, weights=1.0, scope=None,
   if logits is None:
     raise ValueError("logits must not be None.")
   with ops.name_scope(scope, "hinge_loss", (logits, labels, weights)) as scope:
-    logits = math_ops.to_float(logits)
-    labels = math_ops.to_float(labels)
+    logits = math_ops.cast(logits, dtype=dtypes.float32)
+    labels = math_ops.cast(labels, dtype=dtypes.float32)
     logits.get_shape().assert_is_compatible_with(labels.get_shape())
     # We first need to convert binary labels to -1/1 labels (as floats).
     all_ones = array_ops.ones_like(labels)
@@ -445,8 +424,8 @@ def huber_loss(labels, predictions, weights=1.0, delta=1.0, scope=None,
     raise ValueError("predictions must not be None.")
   with ops.name_scope(scope, "huber_loss",
                       (predictions, labels, weights)) as scope:
-    predictions = math_ops.to_float(predictions)
-    labels = math_ops.to_float(labels)
+    predictions = math_ops.cast(predictions, dtype=dtypes.float32)
+    labels = math_ops.cast(labels, dtype=dtypes.float32)
     predictions.get_shape().assert_is_compatible_with(labels.get_shape())
     error = math_ops.subtract(predictions, labels)
     abs_error = math_ops.abs(error)
@@ -511,8 +490,8 @@ def log_loss(labels, predictions, weights=1.0, epsilon=1e-7, scope=None,
     raise ValueError("predictions must not be None.")
   with ops.name_scope(scope, "log_loss",
                       (predictions, labels, weights)) as scope:
-    predictions = math_ops.to_float(predictions)
-    labels = math_ops.to_float(labels)
+    predictions = math_ops.cast(predictions, dtype=dtypes.float32)
+    labels = math_ops.cast(labels, dtype=dtypes.float32)
     predictions.get_shape().assert_is_compatible_with(labels.get_shape())
     losses = -math_ops.multiply(
         labels,
@@ -579,11 +558,11 @@ def mean_pairwise_squared_error(
     raise ValueError("predictions must not be None.")
   with ops.name_scope(scope, "mean_pairwise_squared_error",
                       (predictions, labels, weights)) as scope:
-    weights = math_ops.to_float(weights)
-    labels = math_ops.to_float(labels)
+    weights = math_ops.cast(weights, dtype=dtypes.float32)
+    labels = math_ops.cast(labels, dtype=dtypes.float32)
     with ops.control_dependencies((
         weights_broadcast_ops.assert_broadcastable(weights, labels),)):
-      predictions = math_ops.to_float(predictions)
+      predictions = math_ops.cast(predictions, dtype=dtypes.float32)
       predictions.get_shape().assert_is_compatible_with(labels.get_shape())
 
       diffs = math_ops.subtract(predictions, labels)
@@ -664,8 +643,8 @@ def mean_squared_error(
     raise ValueError("predictions must not be None.")
   with ops.name_scope(scope, "mean_squared_error",
                       (predictions, labels, weights)) as scope:
-    predictions = math_ops.to_float(predictions)
-    labels = math_ops.to_float(labels)
+    predictions = math_ops.cast(predictions, dtype=dtypes.float32)
+    labels = math_ops.cast(labels, dtype=dtypes.float32)
     predictions.get_shape().assert_is_compatible_with(labels.get_shape())
     losses = math_ops.squared_difference(predictions, labels)
     return compute_weighted_loss(
@@ -793,7 +772,7 @@ def softmax_cross_entropy(
 
     if label_smoothing > 0:
       num_classes = math_ops.cast(
-          array_ops.shape(onehot_labels)[1], logits.dtype)
+          array_ops.shape(onehot_labels)[-1], logits.dtype)
       smooth_positives = 1.0 - label_smoothing
       smooth_negatives = label_smoothing / num_classes
       onehot_labels = onehot_labels * smooth_positives + smooth_negatives
diff --git a/tensorflow/python/ops/losses/util.py b/tensorflow/python/ops/losses/util.py
index 97bba46661d056fd336c68988e3bc17ef4232487..73f4c750b886e0548d0c008fb84058b9ddc8a39d 100644
--- a/tensorflow/python/ops/losses/util.py
+++ b/tensorflow/python/ops/losses/util.py
@@ -25,7 +25,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("losses.add_loss")
+@tf_export(v1=["losses.add_loss"])
 def add_loss(loss, loss_collection=ops.GraphKeys.LOSSES):
   """Adds a externally defined loss to the collection of losses.
 
@@ -40,7 +40,7 @@ def add_loss(loss, loss_collection=ops.GraphKeys.LOSSES):
     ops.add_to_collection(loss_collection, loss)
 
 
-@tf_export("losses.get_losses")
+@tf_export(v1=["losses.get_losses"])
 def get_losses(scope=None, loss_collection=ops.GraphKeys.LOSSES):
   """Gets the list of losses from the loss_collection.
 
@@ -54,7 +54,7 @@ def get_losses(scope=None, loss_collection=ops.GraphKeys.LOSSES):
   return ops.get_collection(loss_collection, scope)
 
 
-@tf_export("losses.get_regularization_losses")
+@tf_export(v1=["losses.get_regularization_losses"])
 def get_regularization_losses(scope=None):
   """Gets the list of regularization losses.
 
@@ -67,7 +67,7 @@ def get_regularization_losses(scope=None):
   return ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES, scope)
 
 
-@tf_export("losses.get_regularization_loss")
+@tf_export(v1=["losses.get_regularization_loss"])
 def get_regularization_loss(scope=None, name="total_regularization_loss"):
   """Gets the total regularization loss.
 
@@ -85,7 +85,7 @@ def get_regularization_loss(scope=None, name="total_regularization_loss"):
     return constant_op.constant(0.0)
 
 
-@tf_export("losses.get_total_loss")
+@tf_export(v1=["losses.get_total_loss"])
 def get_total_loss(add_regularization_losses=True, name="total_loss"):
   """Returns a tensor whose value represents the total loss.
 
diff --git a/tensorflow/python/ops/map_fn.py b/tensorflow/python/ops/map_fn.py
new file mode 100644
index 0000000000000000000000000000000000000000..6767e3eb228d9f934379db7acef2d935af71ff80
--- /dev/null
+++ b/tensorflow/python/ops/map_fn.py
@@ -0,0 +1,285 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Functional operations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import tensor_array_ops
+from tensorflow.python.ops import variable_scope as vs
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export("map_fn")
+def map_fn(fn, elems, dtype=None, parallel_iterations=None, back_prop=True,
+           swap_memory=False, infer_shape=True, name=None):
+  """map on the list of tensors unpacked from `elems` on dimension 0.
+
+  The simplest version of `map_fn` repeatedly applies the callable `fn` to a
+  sequence of elements from first to last. The elements are made of the
+  tensors unpacked from `elems`. `dtype` is the data type of the return
+  value of `fn`. Users must provide `dtype` if it is different from
+  the data type of `elems`.
+
+  Suppose that `elems` is unpacked into `values`, a list of tensors. The shape
+  of the result tensor is `[values.shape[0]] + fn(values[0]).shape`.
+
+  This method also allows multi-arity `elems` and output of `fn`.  If `elems`
+  is a (possibly nested) list or tuple of tensors, then each of these tensors
+  must have a matching first (unpack) dimension.  The signature of `fn` may
+  match the structure of `elems`.  That is, if `elems` is
+  `(t1, [t2, t3, [t4, t5]])`, then an appropriate signature for `fn` is:
+  `fn = lambda (t1, [t2, t3, [t4, t5]]):`.
+
+  Furthermore, `fn` may emit a different structure than its input.  For example,
+  `fn` may look like: `fn = lambda t1: return (t1 + 1, t1 - 1)`.  In this case,
+  the `dtype` parameter is not optional: `dtype` must be a type or (possibly
+  nested) tuple of types matching the output of `fn`.
+
+  To apply a functional operation to the nonzero elements of a SparseTensor
+  one of the following methods is recommended. First, if the function is
+  expressible as TensorFlow ops, use
+
+  ```python
+    result = SparseTensor(input.indices, fn(input.values), input.dense_shape)
+  ```
+
+  If, however, the function is not expressible as a TensorFlow op, then use
+
+  ```python
+  result = SparseTensor(
+    input.indices, map_fn(fn, input.values), input.dense_shape)
+  ```
+
+  instead.
+
+  When executing eagerly, map_fn does not execute in parallel even if
+  `parallel_iterations` is set to a value > 1. You can still get the
+  performance benefits of running a function in parallel by using the
+  `tf.contrib.eager.defun` decorator,
+
+  ```python
+  # Assume the function being used in map_fn is fn.
+  # To ensure map_fn calls fn in parallel, use the defun decorator.
+  @tf.contrib.eager.defun
+  def func(tensor):
+    return tf.map_fn(fn, tensor)
+  ```
+
+  Note that if you use the defun decorator, any non-TensorFlow Python code
+  that you may have written in your function won't get executed. See
+  `tf.contrib.eager.defun` for more details. The recommendation would be to
+  debug without defun but switch to defun to get performance benefits of
+  running map_fn in parallel.
+
+  Args:
+    fn: The callable to be performed.  It accepts one argument, which will
+      have the same (possibly nested) structure as `elems`.  Its output
+      must have the same structure as `dtype` if one is provided, otherwise
+      it must have the same structure as `elems`.
+    elems: A tensor or (possibly nested) sequence of tensors, each of which
+      will be unpacked along their first dimension.  The nested sequence
+      of the resulting slices will be applied to `fn`.
+    dtype: (optional) The output type(s) of `fn`.  If `fn` returns a structure
+      of Tensors differing from the structure of `elems`, then `dtype` is not
+      optional and must have the same structure as the output of `fn`.
+    parallel_iterations: (optional) The number of iterations allowed to run
+      in parallel. When graph building, the default value is 10. While executing
+      eagerly, the default value is set to 1.
+    back_prop: (optional) True enables support for back propagation.
+    swap_memory: (optional) True enables GPU-CPU memory swapping.
+    infer_shape: (optional) False disables tests for consistent output shapes.
+    name: (optional) Name prefix for the returned tensors.
+
+  Returns:
+    A tensor or (possibly nested) sequence of tensors.  Each tensor packs the
+    results of applying `fn` to tensors unpacked from `elems` along the first
+    dimension, from first to last.
+
+  Raises:
+    TypeError: if `fn` is not callable or the structure of the output of
+      `fn` and `dtype` do not match, or if elems is a SparseTensor.
+    ValueError: if the lengths of the output of `fn` and `dtype` do not match.
+
+  Examples:
+    ```python
+    elems = np.array([1, 2, 3, 4, 5, 6])
+    squares = map_fn(lambda x: x * x, elems)
+    # squares == [1, 4, 9, 16, 25, 36]
+    ```
+
+    ```python
+    elems = (np.array([1, 2, 3]), np.array([-1, 1, -1]))
+    alternate = map_fn(lambda x: x[0] * x[1], elems, dtype=tf.int64)
+    # alternate == [-1, 2, -3]
+    ```
+
+    ```python
+    elems = np.array([1, 2, 3])
+    alternates = map_fn(lambda x: (x, -x), elems, dtype=(tf.int64, tf.int64))
+    # alternates[0] == [1, 2, 3]
+    # alternates[1] == [-1, -2, -3]
+    ```
+  """
+  if not callable(fn):
+    raise TypeError("fn must be callable.")
+
+  if isinstance(elems, sparse_tensor.SparseTensor):
+    raise TypeError(
+        "To perform a map on the values of a sparse tensor use either "
+        " SparseTensor(input.indices, fn(input.values), input.dense_shape) or "
+        " SparseTensor(input.indices, map_fn(fn, input.values), "
+        "input.dense_shape)")
+
+  in_graph_mode = not context.executing_eagerly()
+  # Set the default number of parallel_iterations depending on graph/eager mode.
+  if in_graph_mode and not parallel_iterations:
+    parallel_iterations = 10
+  elif not in_graph_mode and not parallel_iterations:
+    parallel_iterations = 1
+
+  if not in_graph_mode and parallel_iterations > 1:
+    logging.log_first_n(logging.WARN, "Setting parallel_iterations > 1 has no "
+                        "effect when executing eagerly. Consider calling map_fn"
+                        " with tf.contrib.eager.defun to execute fn in "
+                        "parallel.", 1)
+    parallel_iterations = 1
+
+  input_is_sequence = nest.is_sequence(elems)
+  input_flatten = lambda x: nest.flatten(x) if input_is_sequence else [x]
+  def input_pack(x):
+    return nest.pack_sequence_as(elems, x) if input_is_sequence else x[0]
+
+  if dtype is None:
+    output_is_sequence = input_is_sequence
+    output_flatten = input_flatten
+    output_pack = input_pack
+  else:
+    output_is_sequence = nest.is_sequence(dtype)
+    output_flatten = lambda x: nest.flatten(x) if output_is_sequence else [x]
+    def output_pack(x):
+      return (nest.pack_sequence_as(dtype, x)
+              if output_is_sequence else x[0])
+
+  elems_flat = input_flatten(elems)
+
+  with ops.name_scope(name, "map", elems_flat):
+    # TODO(akshayka): Remove the in_graph_mode check once caching devices are
+    # supported in Eager
+    if in_graph_mode:
+      # Any get_variable calls in fn will cache the first call locally
+      # and not issue repeated network I/O requests for each iteration.
+      varscope = vs.get_variable_scope()
+      varscope_caching_device_was_none = False
+      if varscope.caching_device is None:
+        # TODO(ebrevdo): Change to using colocate_with here and in other
+        # methods.
+        varscope.set_caching_device(lambda op: op.device)
+        varscope_caching_device_was_none = True
+
+    elems_flat = [
+        ops.convert_to_tensor(elem, name="elem") for elem in elems_flat]
+
+    dtype = dtype or input_pack([elem.dtype for elem in elems_flat])
+    dtype_flat = output_flatten(dtype)
+
+    # Convert elems to tensor array. n may be known statically.
+    static_shape = elems_flat[0].shape
+    if static_shape.ndims is not None and static_shape.ndims < 1:
+      if len(elems_flat) == 1:
+        raise ValueError("elems must be a 1+ dimensional Tensor, not a scalar")
+      else:
+        raise ValueError(
+            "elements in elems must be 1+ dimensional Tensors, not scalars"
+        )
+    n = (tensor_shape.dimension_value(static_shape[0])
+         or array_ops.shape(elems_flat[0])[0])
+
+    # TensorArrays are always flat
+    elems_ta = [
+        tensor_array_ops.TensorArray(dtype=elem.dtype,
+                                     size=n,
+                                     dynamic_size=False,
+                                     infer_shape=True)
+        for elem in elems_flat]
+    # Unpack elements
+    elems_ta = [
+        elem_ta.unstack(elem) for elem_ta, elem in zip(elems_ta, elems_flat)]
+
+    i = constant_op.constant(0)
+
+    accs_ta = [
+        tensor_array_ops.TensorArray(dtype=dt,
+                                     size=n,
+                                     dynamic_size=False,
+                                     infer_shape=infer_shape)
+        for dt in dtype_flat]
+
+    def compute(i, tas):
+      """The loop body of map_fn.
+
+      Args:
+        i: the loop counter
+        tas: the flat TensorArray accumulator list
+
+      Returns:
+        (i + 1, tas): the updated counter + updated TensorArrays
+
+      Raises:
+        TypeError: if dtype and packed_fn_values structure do not match
+        ValueType: if dtype and packed_fn_values lengths do not match
+      """
+      packed_values = input_pack([elem_ta.read(i) for elem_ta in elems_ta])
+      packed_fn_values = fn(packed_values)
+      nest.assert_same_structure(dtype or elems, packed_fn_values)
+      flat_fn_values = output_flatten(packed_fn_values)
+      tas = [ta.write(i, value) for (ta, value) in zip(tas, flat_fn_values)]
+      return (i + 1, tas)
+
+    _, r_a = control_flow_ops.while_loop(
+        lambda i, _: i < n, compute, (i, accs_ta),
+        parallel_iterations=parallel_iterations,
+        back_prop=back_prop,
+        swap_memory=swap_memory,
+        maximum_iterations=n)
+    results_flat = [r.stack() for r in r_a]
+
+    n_static = tensor_shape.Dimension(tensor_shape.dimension_value(
+        elems_flat[0].get_shape().with_rank_at_least(1)[0]))
+    for elem in elems_flat[1:]:
+      n_static.merge_with(tensor_shape.Dimension(tensor_shape.dimension_value(
+          elem.get_shape().with_rank_at_least(1)[0])))
+    for r in results_flat:
+      r.set_shape(tensor_shape.TensorShape(n_static).concatenate(
+          r.get_shape()[1:]))
+
+    # TODO(akshayka): Remove the in_graph_mode check once caching devices are
+    # supported in Eager
+    if in_graph_mode and varscope_caching_device_was_none:
+      varscope.set_caching_device(None)
+
+    return output_pack(results_flat)
diff --git a/tensorflow/python/ops/math_grad.py b/tensorflow/python/ops/math_grad.py
index c7ec1c57d1b07232e2bdb05fc30f5456b792890f..42495b189d2c7e45ab9100e00f891974564196c2 100644
--- a/tensorflow/python/ops/math_grad.py
+++ b/tensorflow/python/ops/math_grad.py
@@ -47,6 +47,10 @@ def _ArgMinGrad(op, grad):
   return [None, None]
 
 
+# TODO(rmlarsen): Implement gradient.
+ops.NotDifferentiable("EuclideanNorm")
+
+
 @ops.RegisterGradient("Sum")
 def _SumGrad(op, grad):
   """Gradient for Sum."""
@@ -99,7 +103,7 @@ def _MinOrMaxGrad(op, grad):
   num_selected = array_ops.reshape(
       math_ops.reduce_sum(indicators, op.inputs[1]), output_shape_kept_dims)
 
-  return [math_ops.div(indicators, num_selected) * grad, None]
+  return [math_ops.divide(indicators, num_selected) * grad, None]
 
 
 @ops.RegisterGradient("Max")
@@ -196,7 +200,7 @@ def _SegmentMeanGrad(op, grad):
       array_ops.fill(array_ops.expand_dims(input_rank - 1, 0), 1)
   ], 0)
   ones = array_ops.fill(ones_shape, constant_op.constant(1, dtype=grad.dtype))
-  scaled_grad = math_ops.div(grad, math_ops.segment_sum(ones, op.inputs[1]))
+  scaled_grad = math_ops.divide(grad, math_ops.segment_sum(ones, op.inputs[1]))
   return array_ops.gather(scaled_grad, op.inputs[1]), None
 
 
@@ -260,7 +264,7 @@ def _SegmentMinOrMaxGrad(op, grad):
                                       op.inputs[1])
   # Compute the gradient for each segment. The gradient for the ith segment is
   # divided evenly among the selected elements in that segment.
-  weighted_grads = math_ops.div(grad, num_selected)
+  weighted_grads = math_ops.divide(grad, num_selected)
   gathered_grads = array_ops.gather(weighted_grads, op.inputs[1])
   return array_ops.where(is_selected, gathered_grads, zeros), None
 
@@ -314,7 +318,7 @@ def _UnsortedSegmentMinOrMaxGrad(op, grad):
       math_ops.cast(is_selected, grad.dtype), op.inputs[1], op.inputs[2])
   # Compute the gradient for each segment. The gradient for the ith segment is
   # divided evenly among the selected elements in that segment.
-  weighted_grads = math_ops.div(grad, num_selected)
+  weighted_grads = math_ops.divide(grad, num_selected)
   gathered_grads, _, _ = _GatherDropNegatives(weighted_grads, None,
                                               zero_clipped_indices,
                                               is_positive)
@@ -946,6 +950,26 @@ def _MulGrad(op, grad):
               math_ops.reduce_sum(gen_math_ops.mul(x, grad), ry), sy))
 
 
+@ops.RegisterGradient("MulNoNan")
+def _MulNoNanGrad(op, grad):
+  """The gradient of scalar multiplication with NaN-suppression."""
+  x = op.inputs[0]
+  y = op.inputs[1]
+  if (isinstance(grad, ops.Tensor) and
+      _ShapesFullySpecifiedAndEqual(x, y, grad)):
+    return gen_math_ops.mul_no_nan(grad, y), gen_math_ops.mul_no_nan(
+        x, grad)
+  assert x.dtype.base_dtype == y.dtype.base_dtype, (x.dtype, " vs. ", y.dtype)
+  sx = array_ops.shape(x)
+  sy = array_ops.shape(y)
+  rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
+  return (array_ops.reshape(
+      math_ops.reduce_sum(gen_math_ops.mul_no_nan(grad, y), rx), sx),
+          array_ops.reshape(
+              math_ops.reduce_sum(gen_math_ops.mul_no_nan(x, grad), ry),
+              sy))
+
+
 @ops.RegisterGradient("Div")
 def _DivGrad(op, grad):
   """The gradient for the Div operator."""
@@ -956,10 +980,11 @@ def _DivGrad(op, grad):
   rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
   x = math_ops.conj(x)
   y = math_ops.conj(y)
-  return (array_ops.reshape(math_ops.reduce_sum(math_ops.div(grad, y), rx), sx),
+  return (array_ops.reshape(
+      math_ops.reduce_sum(math_ops.divide(grad, y), rx), sx),
           array_ops.reshape(
-              math_ops.reduce_sum(grad * math_ops.div(math_ops.div(-x, y), y),
-                                  ry), sy))
+              math_ops.reduce_sum(
+                  grad * math_ops.divide(math_ops.divide(-x, y), y), ry), sy))
 
 
 @ops.RegisterGradient("FloorDiv")
@@ -1343,3 +1368,20 @@ def _CumprodGrad(op, grad):
   out = math_ops.cumsum(
       prod * grad, axis, exclusive=exclusive, reverse=not reverse)
   return [out / x, None]
+
+
+@ops.RegisterGradient("NextAfter")
+def _NextAfterGrad(op, grad):
+  """Returns gradient of nextafter(x1, x2) with respect to x1 and x2."""
+  x1 = op.inputs[0]
+  x2 = op.inputs[1]
+  s_x1 = array_ops.shape(x1)
+  s_x2 = array_ops.shape(x2)
+  r_x1, r_x2 = gen_array_ops.broadcast_gradient_args(s_x1, s_x2)
+  with ops.control_dependencies([grad]):
+    partial_x1 = array_ops.ones(s_x1, dtype=x1.dtype)
+    partial_x2 = array_ops.zeros(s_x2, dtype=x2.dtype)
+    return (array_ops.reshape(
+        math_ops.reduce_sum(partial_x1 * grad, r_x1), s_x1),
+            array_ops.reshape(
+                math_ops.reduce_sum(partial_x2 * grad, r_x2), s_x2))
diff --git a/tensorflow/python/ops/math_grad_test.py b/tensorflow/python/ops/math_grad_test.py
index 822f89768c53c45def3bb93a53382b2375944528..96c24c3c98f2ec6f52317f0f9c46380eb7fe35c5 100644
--- a/tensorflow/python/ops/math_grad_test.py
+++ b/tensorflow/python/ops/math_grad_test.py
@@ -29,10 +29,13 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
+from tensorflow.python.ops import gradient_checker_v2
 from tensorflow.python.ops import gradients
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
+RAISE = execution_callbacks.ExecutionCallback.RAISE
+
 
 class SquaredDifferenceOpTest(test.TestCase):
 
@@ -277,6 +280,31 @@ class DivNoNanGradientTest(test.TestCase):
       self.assertAllClose(dy.eval(), np.zeros(y.shape.as_list()))
 
 
+class MulNoNanGradientTest(test.TestCase):
+
+  @test_util.run_deprecated_v1
+  def testBasicGradient(self):
+    inputs = constant_op.constant(np.arange(-3, 3), dtype=dtypes.float32)
+    outputs = math_ops.mul_no_nan(inputs, 1 + math_ops.abs(inputs))
+    with self.cached_session():
+      error = gradient_checker.compute_gradient_error(
+          inputs,
+          inputs.get_shape().as_list(), outputs,
+          outputs.get_shape().as_list())
+      self.assertLess(error, 1e-4)
+
+  @test_util.run_deprecated_v1
+  def testGradientWithRhsIsZero(self):
+    x_vals = [0, 1.0, np.nan, np.inf, np.NINF]
+    x = constant_op.constant(x_vals, dtype=dtypes.float32)
+    y = array_ops.zeros_like(x, dtype=dtypes.float32)
+    outputs = math_ops.mul_no_nan(x, y)
+    with self.cached_session():
+      dx, dy = gradients.gradients(outputs, [x, y])
+      self.assertAllClose(dx.eval(), np.zeros(x.shape.as_list()))
+      self.assertAllClose(dy.eval(), x_vals)
+
+
 class XlogyTest(test.TestCase):
 
   def _xlogy_gradients(self, x, y):
@@ -385,7 +413,7 @@ class PowGradTest(test.TestCase):
     self.assertAllClose([-2., 0., 2.], g)
 
   def test_zero_grad_tape(self):
-    with execution_callbacks.errstate(inf_or_nan=execution_callbacks.RAISE):
+    with execution_callbacks.errstate(inf_or_nan=RAISE):
       x = constant_op.constant([-1, 0., 1.])
       with backprop.GradientTape() as tape:
         tape.watch(x)
@@ -393,5 +421,59 @@ class PowGradTest(test.TestCase):
       g = self.evaluate(g)
       self.assertAllClose([-2., 0., 2.], g)
 
+
+@test_util.run_all_in_graph_and_eager_modes
+class NextAfterTest(test.TestCase):
+
+  def _nextafter_gradient(self, x1, x2):
+    with backprop.GradientTape() as tape:
+      tape.watch(x1)
+      tape.watch(x2)
+      y = math_ops.nextafter(x1, x2)
+      return tape.gradient(y, [x1, x2])
+
+  def testBasic(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      x1 = constant_op.constant(0.1, dtype=dtype)
+      x2 = constant_op.constant(3.1, dtype=dtype)
+      dx1, dx2 = self._nextafter_gradient(x1, x2)
+      expected_dx1 = constant_op.constant(1, dtype=dtype)
+      expected_dx2 = constant_op.constant(0, dtype=dtype)
+      self.assertAllClose(expected_dx1, dx1)
+      self.assertAllClose(expected_dx2, dx2)
+
+  def testDynamicShapes(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      default_x1 = constant_op.constant(0.1, dtype=dtype)
+      default_x2 = constant_op.constant(3.1, dtype=dtype)
+      x1 = array_ops.placeholder_with_default(default_x1, shape=None)
+      x2 = array_ops.placeholder_with_default(default_x2, shape=None)
+      dx1, dx2 = self._nextafter_gradient(x1, x2)
+      expected_dx1 = constant_op.constant(1, dtype=dtype)
+      expected_dx2 = constant_op.constant(0, dtype=dtype)
+      self.assertAllClose(expected_dx1, dx1)
+      self.assertAllClose(expected_dx2, dx2)
+
+  def testWithGradientChecker(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        x1 = np.array([-1, 0, 1, 2, 3], dtype=dtype.as_numpy_dtype)
+        x2 = np.array([2, 2, 2, 2, 2], dtype=dtype.as_numpy_dtype)
+        err = gradient_checker_v2.max_error(
+            *gradient_checker_v2.compute_gradient(
+                lambda x: math_ops.nextafter(x, x2), [x1]))  # pylint: disable=cell-var-from-loop
+        self.assertLess(err, 1e-3)
+
+  def testBroadcastingWithGradientChecker(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        x1 = np.array([-1, 0, 1, 2, 3], dtype=dtype.as_numpy_dtype)
+        x2 = np.array([2], dtype=dtype.as_numpy_dtype)
+        err = gradient_checker_v2.max_error(
+            *gradient_checker_v2.compute_gradient(
+                lambda x: math_ops.nextafter(x, x2), [x1]))  # pylint: disable=cell-var-from-loop
+        self.assertLess(err, 1e-3)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index e2b634ee8f8d18e1e0e43a9e10cb7f2532bbbf12..ddb399222157de4c251198b398d5545450193598 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -12,9 +12,59 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Basic arithmetic operators.
+"""Math Operations.
+
+Note: Functions taking `Tensor` arguments can also take anything accepted by
+`tf.convert_to_tensor`.
+
+Note: Elementwise binary operations in TensorFlow follow [numpy-style
+broadcasting](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html).
+
+TensorFlow provides a variety of math functions including:
+
+* Basic arithmetic operators and trigonometric functions.
+* Special math functions (like: `tf.math.igamma` and `tf.math.zeta`)
+* Complex number functions (like: `tf.math.imag` and `tf.math.angle`)
+* Reductions and scans (like: `tf.math.reduce_mean` and `tf.math.cumsum`)
+* Segment functions (like: `tf.math.segment_sum`)
+
+See: `tf.linalg` for matrix and tensor functions.
+
+<a id=Segmentation></a>
+
+## About Segmentation
+
+TensorFlow provides several operations that you can use to perform common
+math computations on tensor segments.
+Here a segmentation is a partitioning of a tensor along
+the first dimension, i.e. it  defines a mapping from the first dimension onto
+`segment_ids`. The `segment_ids` tensor should be the size of
+the first dimension, `d0`, with consecutive IDs in the range `0` to `k`,
+where `k<d0`.
+In particular, a segmentation of a matrix tensor is a mapping of rows to
+segments.
+
+For example:
+
+```python
+c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
+tf.segment_sum(c, tf.constant([0, 0, 1]))
+#  ==>  [[0 0 0 0]
+#        [5 6 7 8]]
+```
+
+The standard `segment_*` functions assert that the segment indices are sorted.
+If you have unsorted indices use the equivalent `unsorted_segment_` function.
+Thses functions take an additional argument `num_segments` so that the output
+tensor can be efficiently allocated.
+
+``` python
+c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
+tf.unsorted_segment_sum(c, tf.constant([0, 1, 0]), num_segments=2)
+# ==> [[ 6,  8, 10, 12],
+#       [-1, -2, -3, -4]]
+```
 
-See the [python/math_ops](python/math_ops) guide.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -49,6 +99,7 @@ from tensorflow.python.util.tf_export import tf_export
 
 # Aliases for some automatically-generated names.
 linspace = gen_math_ops.lin_space
+nextafter = gen_math_ops.next_after
 
 arg_max = deprecation.deprecated(None, "Use `tf.math.argmax` instead")(arg_max)  # pylint: disable=used-before-assignment
 arg_min = deprecation.deprecated(None, "Use `tf.math.argmin` instead")(arg_min)  # pylint: disable=used-before-assignment
@@ -812,7 +863,8 @@ def _OverrideBinaryOperatorHelper(func, op_name, clazz_object=ops.Tensor):
         return func(x, y, name=name)
       elif not isinstance(y, sparse_tensor.SparseTensor):
         try:
-          y = ops.convert_to_tensor(y, dtype=x.dtype.base_dtype, name="y")
+          y = ops.convert_to_tensor_v2(y, dtype_hint=x.dtype.base_dtype,
+                                       name="y")
         except TypeError:
           # If the RHS is not a tensor, it might be a tensor aware object
           # that can implement the operator with knowledge of itself
@@ -1003,7 +1055,8 @@ def div(x, y, name=None):
   return _div_python2(x, y, name)
 
 
-@tf_export("div_no_nan")
+@tf_export("math.divide_no_nan", v1=["math.divide_no_nan", "div_no_nan"])
+@deprecation.deprecated_endpoints("div_no_nan")
 @dispatch.add_dispatch_support
 def div_no_nan(x, y, name=None):
   """Computes an unsafe divide which returns 0 if the y is zero.
@@ -1027,6 +1080,31 @@ def div_no_nan(x, y, name=None):
     return gen_math_ops.div_no_nan(x, y, name=name)
 
 
+@tf_export("math.multiply_no_nan")
+@dispatch.add_dispatch_support
+def multiply_no_nan(x, y, name=None):
+  """Computes the product of x and y and returns 0 if the y is zero, even if x is NaN or infinite.
+
+  Args:
+    x: A `Tensor`. Must be one of the following types: `float32`, `float64`.
+    y: A `Tensor` whose dtype is compatible with `x`.
+    name: A name for the operation (optional).
+
+  Returns:
+    The element-wise value of the x times y.
+  """
+
+  with ops.name_scope(name, "multiply_no_nan", [x, y]) as name:
+    x = ops.convert_to_tensor(x, name="x")
+    y = ops.convert_to_tensor(y, name="y", dtype=x.dtype.base_dtype)
+    x_dtype = x.dtype.base_dtype
+    y_dtype = y.dtype.base_dtype
+    if x_dtype != y_dtype:
+      raise TypeError(
+          "x and y must have the same dtype, got %r != %r" % (x_dtype, y_dtype))
+    return gen_math_ops.mul_no_nan(x, y, name=name)
+
+
 # TODO(aselle): This should be removed
 mod = gen_math_ops.floor_mod
 
@@ -1334,16 +1412,60 @@ def reduce_sum(input_tensor, axis=None, keepdims=False, name=None):
           name=name))
 
 
+@tf_export("math.reduce_euclidean_norm")
+def reduce_euclidean_norm(input_tensor, axis=None, keepdims=False, name=None):
+  """Computes the Euclidean norm of elements across dimensions of a tensor.
+
+  Reduces `input_tensor` along the dimensions given in `axis`.
+  Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+  entry in `axis`. If `keepdims` is true, the reduced dimensions
+  are retained with length 1.
+
+  If `axis` is None, all dimensions are reduced, and a
+  tensor with a single element is returned.
+
+  For example:
+
+  ```python
+  x = tf.constant([[1, 2, 3], [1, 1, 1]])
+  tf.reduce_euclidean_norm(x)  # sqrt(17)
+  tf.reduce_euclidean_norm(x, 0)  # [sqrt(2), sqrt(5), sqrt(10)]
+  tf.reduce_euclidean_norm(x, 1)  # [sqrt(14), sqrt(3)]
+  tf.reduce_euclidean_norm(x, 1, keepdims=True)  # [[sqrt(14)], [sqrt(3)]]
+  tf.reduce_euclidean_norm(x, [0, 1])  # sqrt(17)
+  ```
+
+  Args:
+    input_tensor: The tensor to reduce. Should have numeric type.
+    axis: The dimensions to reduce. If `None` (the default), reduces all
+      dimensions. Must be in the range `[-rank(input_tensor),
+      rank(input_tensor))`.
+    keepdims: If true, retains reduced dimensions with length 1.
+    name: A name for the operation (optional).
+
+  Returns:
+    The reduced tensor, of the same dtype as the input_tensor.
+  """
+  return _may_reduce_to_scalar(
+      keepdims, axis,
+      gen_math_ops.euclidean_norm(
+          input_tensor, _ReductionDims(input_tensor, axis), keepdims,
+          name=name))
+
+
 @tf_export(v1=["math.count_nonzero", "count_nonzero"])
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
-def count_nonzero(input_tensor,
+@deprecation.deprecated_args(
+    None, "reduction_indices is deprecated, use axis instead", "axis")
+def count_nonzero(input_tensor=None,
                   axis=None,
                   keepdims=None,
                   dtype=dtypes.int64,
                   name=None,
                   reduction_indices=None,
-                  keep_dims=None):
+                  keep_dims=None,
+                  input=None):  # pylint: disable=redefined-builtin
   """Computes number of nonzero elements across dimensions of a tensor.
 
   Reduces `input_tensor` along the dimensions given in `axis`.
@@ -1389,12 +1511,15 @@ def count_nonzero(input_tensor,
     name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
     keep_dims: Deprecated alias for `keepdims`.
+    input: Overrides input_tensor. For compatibility.
 
   Returns:
     The reduced tensor (number of nonzero values).
   """
   keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
                                                     "keep_dims", keep_dims)
+  input_tensor = deprecation.deprecated_argument_lookup(
+      "input", input, "input_tensor", input_tensor)
   axis = deprecation.deprecated_argument_lookup(
       "axis", axis,
       "reduction_indices", reduction_indices
@@ -1465,7 +1590,7 @@ def count_nonzero_v2(input,  # pylint: disable=redefined-builtin
     return cast(
         reduce_sum(
             # int64 reduction happens on GPU
-            to_int64(gen_math_ops.not_equal(input, zero)),
+            cast(gen_math_ops.not_equal(input, zero), dtypes.int64),
             axis=axis,
             keepdims=keepdims),
         dtype=dtype)
@@ -2640,6 +2765,8 @@ def _as_indexed_slices_list(inputs, optimize=True):
 def add_n(inputs, name=None):
   """Adds all input tensors element-wise.
 
+  Converts `IndexedSlices` objects into dense tensors prior to adding.
+
   Args:
     inputs: A list of `Tensor` or `IndexedSlices` objects, each with same shape
       and type.
@@ -2653,16 +2780,16 @@ def add_n(inputs, name=None):
     cannot be inferred.
   """
   if not inputs or not isinstance(inputs, (list, tuple)):
-    raise ValueError("inputs must be a list of at least one"
+    raise ValueError("inputs must be a list of at least one "
                      "Tensor/IndexedSlices with the same dtype and shape")
   inputs = ops.convert_n_to_tensor_or_indexed_slices(inputs)
   if not all(isinstance(x, (ops.Tensor, ops.IndexedSlices)) for x in inputs):
-    raise ValueError("inputs must be a list of at least one"
+    raise ValueError("inputs must be a list of at least one "
                      "Tensor/IndexedSlices with the same dtype and shape")
 
   if len(inputs) == 1:
     if isinstance(inputs[0], ops.IndexedSlices):
-      values = inputs[0].values
+      values = ops.convert_to_tensor(inputs[0])
     else:
       values = inputs[0]
     if name:
@@ -3061,8 +3188,8 @@ def reduced_shape(input_shape, axes):
     input_shape[axes] = 1
     return input_shape
 
-  input_shape = to_int32(input_shape)  # [2, 3, 5, 7]
-  axes = to_int32(axes)  # [1, 2]
+  input_shape = cast(input_shape, dtypes.int32)  # [2, 3, 5, 7]
+  axes = cast(axes, dtypes.int32)  # [1, 2]
 
   input_rank = array_ops.size(input_shape)  # 4
   axes = (axes + input_rank) % input_rank
@@ -3102,7 +3229,7 @@ def unsorted_segment_mean(data, segment_ids, num_segments, name=None):
   r"""Computes the mean along segments of a tensor.
 
   Read [the section on
-  segmentation](https://tensorflow.org/api_guides/python/math_ops#segmentation)
+  segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
   for an explanation of segments.
 
   This operator is similar to the unsorted segment sum operator found
@@ -3148,7 +3275,7 @@ def unsorted_segment_sqrt_n(data, segment_ids, num_segments, name=None):
   r"""Computes the sum along segments of a tensor divided by the sqrt(N).
 
   Read [the section on
-  segmentation](https://tensorflow.org/api_guides/python/math_ops#segmentation)
+  segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
   for an explanation of segments.
 
   This operator is similar to the unsorted segment sum operator found
@@ -3195,7 +3322,7 @@ def sparse_segment_sum(data, indices, segment_ids, name=None,
   r"""Computes the sum along sparse segments of a tensor.
 
   Read [the section on
-  segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+  segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
   for an explanation of segments.
 
   Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
@@ -3282,7 +3409,7 @@ def sparse_segment_mean(data,
   r"""Computes the mean along sparse segments of a tensor.
 
   Read [the section on
-  segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+  segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
   for an explanation of segments.
 
   Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
@@ -3327,7 +3454,7 @@ def sparse_segment_mean_v2(data,
   r"""Computes the mean along sparse segments of a tensor.
 
   Read [the section on
-  segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+  segmentation](https://tensorflow.org/api_docs/python/tf/math#Segmentation)
   for an explanation of segments.
 
   Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py
index e185dbcd230906270b6c92fe70e6a350c34f030f..fb7e4aacc5e72fd73691050afe05575db16e9de8 100644
--- a/tensorflow/python/ops/math_ops_test.py
+++ b/tensorflow/python/ops/math_ops_test.py
@@ -22,6 +22,7 @@ import numpy as np
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
@@ -403,6 +404,18 @@ class AddNTest(test_util.TensorFlowTestCase):
                             [g.eval() for g in add_n_grad])
 
 
+  @test_util.run_deprecated_v1
+  def testIndexedSlices(self):
+    slc = ops.IndexedSlices(
+        array_ops.constant([1, 2], shape=[1, 2]), array_ops.constant([1]),
+        array_ops.constant([2, 2]))
+    slc_as_dense = np.array([[0, 0], [1, 2]])
+    with self.test_session(use_gpu=True):
+      # add_n currently always converts IndexedSlices to dense
+      self.assertAllEqual(slc_as_dense, math_ops.add_n([slc]).eval())
+      self.assertAllEqual(2 * slc_as_dense, math_ops.add_n([slc, slc]).eval())
+
+
 class DivAndModTest(test_util.TensorFlowTestCase):
   # TODO(aselle): Test more types before exposing new division operators.
 
@@ -545,6 +558,22 @@ class DivNoNanTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(tf_result, np_result)
 
 
+class MultiplyNoNanTest(test_util.TensorFlowTestCase):
+
+  @test_util.run_deprecated_v1
+  def testBasic(self):
+    for dtype in [np.float32, np.float64]:
+      values = [0, 1, np.nan, np.inf, np.NINF]
+      x = constant_op.constant(values, dtype=dtype)
+      zeros = constant_op.constant(np.zeros((5,)), dtype=dtype)
+      ones = constant_op.constant(np.ones((5,)), dtype=dtype)
+      with self.cached_session(use_gpu=True):
+        tf_result_zeros = math_ops.multiply_no_nan(x, zeros).eval()
+        self.assertAllEqual(tf_result_zeros, zeros)
+        tf_result_ones = math_ops.multiply_no_nan(x, ones).eval()
+        self.assertAllEqual(tf_result_ones, x)
+
+
 class XlogyTest(test_util.TensorFlowTestCase):
 
   @test_util.run_in_graph_and_eager_modes
@@ -615,5 +644,59 @@ class XdivyTest(test_util.TensorFlowTestCase):
         self.assertAllClose(x_over_y, xdivy_tf_np[1])
 
 
+class NextAfterTest(test_util.TensorFlowTestCase):
+
+  # Basic NextAfter tests that replicate numpy nextafter tests.
+  @test_util.run_in_graph_and_eager_modes
+  def testBasic(self):
+
+    for dtype in [dtypes.float32, dtypes.float64]:
+      one = constant_op.constant([1], dtype=dtype)
+      two = constant_op.constant([2], dtype=dtype)
+      zero = constant_op.constant([0], dtype=dtype)
+      nan = constant_op.constant([np.nan], dtype=dtype)
+
+      eps = constant_op.constant([np.finfo(dtype.as_numpy_dtype).eps],
+                                 dtype=dtype)
+
+      self.assertAllEqual(math_ops.nextafter(one, two) - one, eps)
+      self.assertAllLess(math_ops.nextafter(one, zero) - one, 0)
+      self.assertAllEqual(
+          math_ops.is_nan(math_ops.nextafter(nan, one)), [True])
+      self.assertAllEqual(
+          math_ops.is_nan(math_ops.nextafter(one, nan)), [True])
+      self.assertAllEqual(math_ops.nextafter(one, one), one)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testBroadcasting(self):
+
+    for dtype in [dtypes.float32, dtypes.float64]:
+      one = constant_op.constant([1, 1], dtype=dtype)
+      two = constant_op.constant([2], dtype=dtype)
+
+      eps = np.finfo(dtype.as_numpy_dtype).eps
+
+      eps_const = constant_op.constant([eps, eps], dtype=dtype)
+
+      self.assertAllEqual(math_ops.nextafter(one, two) - one, eps_const)
+
+
+class BinaryOpsTest(test_util.TensorFlowTestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testErrorReceivedIfDtypeMismatchFromOp(self):
+    if context.executing_eagerly():
+      error = errors_impl.InvalidArgumentError
+      error_message = (
+          r"cannot compute Add as input #0\(zero-based\) was expected to be a "
+          r"float tensor but is a int32 tensor \[Op:Add\] name: add/")
+    else:
+      error = TypeError
+      error_message = ("Input 'y' of 'Add' Op has type float32 that does not "
+                       "match type int32 of argument 'x'.")
+    with self.assertRaisesRegexp(error, error_message):
+      a = array_ops.ones([1], dtype=dtypes.int32) + 1.0
+      self.evaluate(a)
+
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/ops/metrics_impl.py b/tensorflow/python/ops/metrics_impl.py
index ec39b1790e340a0d194dea8ab3419ca78fc9d126..e3cefb2e92e24c79125f84ab743cb75ea56ab214 100644
--- a/tensorflow/python/ops/metrics_impl.py
+++ b/tensorflow/python/ops/metrics_impl.py
@@ -621,7 +621,7 @@ def _confusion_matrix_at_thresholds(labels,
 
 
 def _aggregate_variable(v, collections):
-  f = lambda distribution, value: distribution.read_var(value)
+  f = lambda distribution, value: distribution.extended.read_var(value)
   return _aggregate_across_replicas(collections, f, v)
 
 
@@ -1295,7 +1295,7 @@ def mean_squared_error(labels,
 
   predictions, labels, weights = _remove_squeezable_dimensions(
       predictions=predictions, labels=labels, weights=weights)
-  squared_error = math_ops.square(labels - predictions)
+  squared_error = math_ops.squared_difference(labels, predictions)
   return mean(squared_error, weights, metrics_collections, updates_collections,
               name or 'mean_squared_error')
 
diff --git a/tensorflow/python/ops/nccl_ops.py b/tensorflow/python/ops/nccl_ops.py
index 6259ce0f948427cace576dbc3e21a410f531f4e2..6c8685cf63aeae5bb9f081a6a5282c472f724842 100644
--- a/tensorflow/python/ops/nccl_ops.py
+++ b/tensorflow/python/ops/nccl_ops.py
@@ -19,6 +19,8 @@ from __future__ import print_function
 
 import threading
 
+from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import device
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import gen_nccl_ops
@@ -211,19 +213,27 @@ def _apply_all_reduce(reduction, tensors):
     raise ValueError('Must pass >0 tensors to all reduce operations')
 
   shared_name = _get_shared_name()
-  res = []
 
-  for t in tensors:
-    _check_device(t)
-    with ops.device(t.device):
-      res.append(
-          gen_nccl_ops.nccl_all_reduce(
-              input=t,
-              reduction=reduction,
-              num_devices=len(tensors),
-              shared_name=shared_name))
-
-  return res
+  def _all_reduce():
+    """Call nccl allreduce."""
+    res = []
+    for t in tensors:
+      _check_device(t)
+      with ops.device(t.device):
+        res.append(
+            gen_nccl_ops.nccl_all_reduce(
+                input=t,
+                reduction=reduction,
+                num_devices=len(tensors),
+                shared_name=shared_name))
+    return res
+
+  if context.executing_eagerly():
+    # Nccl ops will block unless they are executed concurrently such as in a
+    # graph or a defun.
+    return def_function.function(_all_reduce)()
+  else:
+    return _all_reduce()
 
 
 def _apply_reduce(reduction, tensors):
diff --git a/tensorflow/python/ops/nn_batchnorm_test.py b/tensorflow/python/ops/nn_batchnorm_test.py
index e978f1d32601890f8eb9b54fdd5738f626b7f863..fedf8e44c3ddfdac9739b88e019ed6d1e4485ab2 100644
--- a/tensorflow/python/ops/nn_batchnorm_test.py
+++ b/tensorflow/python/ops/nn_batchnorm_test.py
@@ -206,6 +206,7 @@ class BatchNormalizationTest(test.TestCase):
                                   2)
 
   @test_util.run_deprecated_v1
+  @test_util.disable_xla("This test never passed for XLA")
   def testBatchNormGradImpl(self):
     x_shape = [7, 5, 4, 6]
     param_shape = [6]
diff --git a/tensorflow/python/ops/nn_fused_batchnorm_test.py b/tensorflow/python/ops/nn_fused_batchnorm_test.py
index 4bc33ff8bdb845510a9872db26c8adfdf1f50995..69e753aa956389a5dbfd132a09d6930fc5f4660c 100644
--- a/tensorflow/python/ops/nn_fused_batchnorm_test.py
+++ b/tensorflow/python/ops/nn_fused_batchnorm_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
@@ -436,6 +437,7 @@ class BatchNormalizationTest(test.TestCase):
       self._test_training(
           x_shape, dtype, [6], np.float32, use_gpu=False, data_format='NHWC')
 
+  @test_util.run_deprecated_v1
   def testBatchNormGradShape1(self):
     for is_training in [True, False]:
       x_shape = [1, 1, 6, 1]
@@ -463,6 +465,7 @@ class BatchNormalizationTest(test.TestCase):
             data_format='NHWC',
             is_training=is_training)
 
+  @test_util.run_deprecated_v1
   def testBatchNormGradShape2(self):
     for is_training in [True, False]:
       x_shape = [1, 1, 6, 2]
@@ -483,6 +486,7 @@ class BatchNormalizationTest(test.TestCase):
             data_format='NHWC',
             is_training=is_training)
 
+  @test_util.run_deprecated_v1
   def testBatchNormGradShape3(self):
     for is_training in [True, False]:
       x_shape = [1, 2, 1, 6]
@@ -496,6 +500,7 @@ class BatchNormalizationTest(test.TestCase):
               data_format='NCHW',
               is_training=is_training)
 
+  @test_util.run_deprecated_v1
   def testBatchNormGradShape4(self):
     for is_training in [True, False]:
       x_shape = [5, 7, 11, 4]
@@ -523,6 +528,8 @@ class BatchNormalizationTest(test.TestCase):
             data_format='NHWC',
             is_training=is_training)
 
+  @test_util.run_deprecated_v1
+  @test_util.disable_xla('This test never passed for XLA')
   def testBatchNormGradShape5(self):
     for is_training in [True, False]:
       x_shape = [0, 7, 11, 4]
@@ -581,6 +588,7 @@ class BatchNormalizationTest(test.TestCase):
           is_training=is_training,
           err_tolerance=err_tolerance)
 
+  @test_util.run_deprecated_v1
   def testBatchNormGradGradConfig1(self):
     config = {
         'shape': [2, 3, 4, 5],
@@ -589,6 +597,7 @@ class BatchNormalizationTest(test.TestCase):
     }
     self._testBatchNormGradGrad(config)
 
+  @test_util.run_deprecated_v1
   def testBatchNormGradGradConfig2(self):
     config = {
         'shape': [2, 3, 2, 2],
@@ -597,6 +606,7 @@ class BatchNormalizationTest(test.TestCase):
     }
     self._testBatchNormGradGrad(config)
 
+  @test_util.run_deprecated_v1
   def testBatchNormGradGradConfig3(self):
     config = {
         'shape': [2, 3, 4, 5],
@@ -605,6 +615,7 @@ class BatchNormalizationTest(test.TestCase):
     }
     self._testBatchNormGradGrad(config)
 
+  @test_util.run_deprecated_v1
   def testBatchNormGradGradConfig4(self):
     config = {
         'shape': [2, 3, 2, 2],
diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py
index 34404edc9a1250710d4cd7a50e04ad8d187a5d7f..e8b7b4c7151a8f280e3e2766ac76804659511ec4 100644
--- a/tensorflow/python/ops/nn_grad.py
+++ b/tensorflow/python/ops/nn_grad.py
@@ -50,7 +50,7 @@ def _Conv2DBackpropInputGrad(op, grad):
           strides=op.get_attr("strides"),
           padding=op.get_attr("padding"),
           use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"),
-          data_format=op.get_attr("data_format")),
+          data_format=op.get_attr("data_format").decode()),
       nn_ops.conv2d(
           grad,
           op.inputs[1],
@@ -58,7 +58,7 @@ def _Conv2DBackpropInputGrad(op, grad):
           strides=op.get_attr("strides"),
           padding=op.get_attr("padding"),
           use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"),
-          data_format=op.get_attr("data_format"))
+          data_format=op.get_attr("data_format").decode())
   ]
 
 
@@ -73,7 +73,7 @@ def _Conv2DBackpropFilterGrad(op, grad):
           strides=op.get_attr("strides"),
           padding=op.get_attr("padding"),
           use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"),
-          data_format=op.get_attr("data_format")), None,
+          data_format=op.get_attr("data_format").decode()), None,
       nn_ops.conv2d(
           op.inputs[0],
           grad,
@@ -81,13 +81,65 @@ def _Conv2DBackpropFilterGrad(op, grad):
           strides=op.get_attr("strides"),
           padding=op.get_attr("padding"),
           use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"),
+          data_format=op.get_attr("data_format").decode())
+  ]
+
+
+@ops.RegisterGradient("DepthwiseConv2dNativeBackpropInput")
+def _DepthwiseConv2dNativeBackpropInputGrad(op, grad):
+  """The derivatives for deconvolution.
+
+  Args:
+    op: the Deconvolution op.
+    grad: the tensor representing the gradient w.r.t. the output
+
+  Returns:
+    the gradients w.r.t. the input and the filter
+  """
+  return [
+      None,
+      nn_ops.depthwise_conv2d_native_backprop_filter(
+          grad,
+          array_ops.shape(op.inputs[1]),
+          op.inputs[2],
+          dilations=op.get_attr("dilations"),
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
+          data_format=op.get_attr("data_format")),
+      nn_ops.depthwise_conv2d_native(
+          grad,
+          op.inputs[1],
+          dilations=op.get_attr("dilations"),
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
+          data_format=op.get_attr("data_format"))
+  ]
+
+
+@ops.RegisterGradient("DepthwiseConv2dNativeBackpropFilter")
+def _DepthwiseConv2dNativeBackpropFilterGrad(op, grad):
+  return [
+      nn_ops.depthwise_conv2d_native_backprop_input(
+          array_ops.shape(op.inputs[0]),
+          grad,
+          op.inputs[2],
+          dilations=op.get_attr("dilations"),
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
+          data_format=op.get_attr("data_format")), None,
+      nn_ops.depthwise_conv2d_native(
+          op.inputs[0],
+          grad,
+          dilations=op.get_attr("dilations"),
+          strides=op.get_attr("strides"),
+          padding=op.get_attr("padding"),
           data_format=op.get_attr("data_format"))
   ]
 
 
 @ops.RegisterGradient("Conv3D")
 def _Conv3DGrad(op, grad):
-  data_format = op.get_attr("data_format")
+  data_format = op.get_attr("data_format").decode()
   return [
       nn_ops.conv3d_backprop_input_v2(
           array_ops.shape(op.inputs[0]),
@@ -110,7 +162,7 @@ def _Conv3DGrad(op, grad):
 
 @ops.RegisterGradient("Conv3DBackpropInputV2")
 def _Conv3DBackpropInputGrad(op, grad):
-  data_format = op.get_attr("data_format")
+  data_format = op.get_attr("data_format").decode()
   return [
       None,
       nn_ops.conv3d_backprop_filter_v2(
@@ -133,7 +185,7 @@ def _Conv3DBackpropInputGrad(op, grad):
 
 @ops.RegisterGradient("Conv3DBackpropFilterV2")
 def _Conv3DBackpropFilterGrad(op, grad):
-  data_format = op.get_attr("data_format")
+  data_format = op.get_attr("data_format").decode()
   return [
       nn_ops.conv3d_backprop_input_v2(
           array_ops.shape(op.inputs[0]),
@@ -161,7 +213,7 @@ def _AvgPool3DGrad(op, grad):
       ksize=op.get_attr("ksize"),
       strides=op.get_attr("strides"),
       padding=op.get_attr("padding"),
-      data_format=op.get_attr("data_format"))
+      data_format=op.get_attr("data_format").decode())
 
 
 @ops.RegisterGradient("AvgPool3DGrad")
@@ -172,7 +224,7 @@ def _AvgPool3DGradGrad(op, grad):
               op.get_attr("ksize"),
               op.get_attr("strides"),
               op.get_attr("padding"),
-              data_format=op.get_attr("data_format")))
+              data_format=op.get_attr("data_format").decode()))
 
 
 @ops.RegisterGradient("MaxPool3D")
@@ -184,7 +236,7 @@ def _MaxPool3DGrad(op, grad):
       ksize=op.get_attr("ksize"),
       strides=op.get_attr("strides"),
       padding=op.get_attr("padding"),
-      data_format=op.get_attr("data_format"))
+      data_format=op.get_attr("data_format").decode())
 
 
 @ops.RegisterGradient("MaxPool3DGrad")
@@ -200,7 +252,7 @@ def _MaxPool3DGradGrad(op, grad):
               op.get_attr("ksize"),
               op.get_attr("strides"),
               padding=op.get_attr("padding"),
-              data_format=op.get_attr("data_format")))
+              data_format=op.get_attr("data_format").decode()))
 
 
 @ops.RegisterGradient("MaxPool3DGradGrad")
@@ -216,7 +268,7 @@ def _MaxPool3DGradGradGrad(op, grad):
               op.get_attr("ksize"),
               op.get_attr("strides"),
               padding=op.get_attr("padding"),
-              data_format=op.get_attr("data_format")))
+              data_format=op.get_attr("data_format").decode()))
 
 
 @ops.RegisterGradient("Softmax")
@@ -232,8 +284,8 @@ def _SoftmaxGrad(op, grad_softmax):
 
   Args:
      op: the Softmax op.
-     grad_softmax:  the tensor representing the gradient w.r.t. the
-       softmax output.
+     grad_softmax:  the tensor representing the gradient w.r.t. the softmax
+       output.
 
   Returns:
      gradient w.r.t the input to the softmax
@@ -309,15 +361,14 @@ def _BiasAddGradGrad(op, received_grad):
     data_format = None
 
   shape = array_ops.shape(op.inputs[0])
-  rank = array_ops.rank(op.inputs[0])
   bias_shape = array_ops.shape(received_grad)
 
   if data_format == b"NCHW":
     expanded_shape = array_ops.concat([
-        array_ops.ones_like(shape[:-3]), bias_shape,
-        array_ops.ones_like(shape[-2:])
+        array_ops.ones_like(shape[:1]), bias_shape,
+        array_ops.ones_like(shape[2:])
     ], 0)
-    tile_mults = array_ops.concat([shape[:-3], [1], shape[-2:]], 0)
+    tile_mults = array_ops.concat([shape[:1], [1], shape[2:]], 0)
   else:
     expanded_shape = array_ops.concat(
         [array_ops.ones_like(shape[:-1]), bias_shape], 0)
@@ -360,9 +411,9 @@ def _ReluGrad(op, grad):
 def _EluGradGrad(op, grad):
   elu_x = op.inputs[1]
   return (gen_nn_ops.elu_grad(grad, op.outputs[0]),
-          array_ops.where(elu_x < 0, grad * op.inputs[0],
-                          array_ops.zeros(
-                              shape=array_ops.shape(elu_x), dtype=elu_x.dtype)))
+          array_ops.where(
+              elu_x < 0, grad * op.inputs[0],
+              array_ops.zeros(shape=array_ops.shape(elu_x), dtype=elu_x.dtype)))
 
 
 @ops.RegisterGradient("SeluGrad")
@@ -370,11 +421,9 @@ def _SeluGradGrad(op, grad):
   x = op.inputs[1]
   scale_alpha = 1.7580993408473768599402175208123
   return (gen_nn_ops.elu_grad(grad, op.outputs[0]),
-          array_ops.where(x < 0.,
-                          gen_nn_ops.elu_grad(grad,
-                                              op.outputs[0] + scale_alpha),
-                          array_ops.zeros(
-                              shape=array_ops.shape(x), dtype=x.dtype)))
+          array_ops.where(
+              x < 0., gen_nn_ops.elu_grad(grad, op.outputs[0] + scale_alpha),
+              array_ops.zeros(shape=array_ops.shape(x), dtype=x.dtype)))
 
 
 @ops.RegisterGradient("Relu6")
@@ -485,10 +534,10 @@ def _SoftmaxCrossEntropyWithLogitsGrad(op, grad_loss, grad_grad):
     softmax = nn_ops.softmax(logits)
 
     grad += ((grad_grad - array_ops.squeeze(
-        math_ops.matmul(array_ops.expand_dims(grad_grad, 1),
-                        array_ops.expand_dims(softmax, 2)),
-        axis=1)) *
-             softmax)
+        math_ops.matmul(
+            array_ops.expand_dims(grad_grad, 1),
+            array_ops.expand_dims(softmax, 2)),
+        axis=1)) * softmax)
 
   return grad, _BroadcastMul(grad_loss, -nn_ops.log_softmax(logits))
 
@@ -514,29 +563,40 @@ def _SparseSoftmaxCrossEntropyWithLogitsGrad(op, grad_0, _):
 
 @ops.RegisterGradient("Conv2D")
 def _Conv2DGrad(op, grad):
+  """Gradient function for Conv2D."""
   dilations = op.get_attr("dilations")
   strides = op.get_attr("strides")
   padding = op.get_attr("padding")
+  explicit_paddings = op.get_attr("explicit_paddings")
   use_cudnn_on_gpu = op.get_attr("use_cudnn_on_gpu")
   data_format = op.get_attr("data_format")
   shape_0, shape_1 = array_ops.shape_n([op.inputs[0], op.inputs[1]])
+
+  # We call the gen_nn_ops backprop functions instead of nn_ops backprop
+  # functions for performance reasons in Eager mode. gen_nn_ops functions take a
+  # `explicit_paddings` parameter, but nn_ops functions do not. So if were were
+  # to use the nn_ops functions, we would have to convert `padding` and
+  # `explicit_paddings` into a single `padding` parameter, increasing overhead
+  # in Eager mode.
   return [
-      nn_ops.conv2d_backprop_input(
+      gen_nn_ops.conv2d_backprop_input(
           shape_0,
           op.inputs[1],
           grad,
           dilations=dilations,
           strides=strides,
           padding=padding,
+          explicit_paddings=explicit_paddings,
           use_cudnn_on_gpu=use_cudnn_on_gpu,
           data_format=data_format),
-      nn_ops.conv2d_backprop_filter(
+      gen_nn_ops.conv2d_backprop_filter(
           op.inputs[0],
           shape_1,
           grad,
           dilations=dilations,
           strides=strides,
           padding=padding,
+          explicit_paddings=explicit_paddings,
           use_cudnn_on_gpu=use_cudnn_on_gpu,
           data_format=data_format)
   ]
@@ -774,9 +834,9 @@ def _BaseFusedBatchNormGrad(op, use_v2, *grad):
   Args:
     op: The BatchNormOp for which we need to compute gradients.
     use_v2: Boolean indicating whether to use the V2 version of the fused batch
-            norm gradient.
-    *grad: An argument list for tensors of gradients wrt the outputs
-          with grad[0] as grad_y.
+      norm gradient.
+    *grad: An argument list for tensors of gradients wrt the outputs with
+      grad[0] as grad_y.
 
   Returns:
     grad_x: gradient for x, which is scale * rsqrt(variance + epsilon) *
@@ -863,8 +923,7 @@ def _BatchNormGrad(grad_y,
     epsilon: A small float number added to the variance of x.
     data_format: The data format for input. Either b"NHWC" or b"NCHW".
     is_training: A bool value to indicate the operation is for training
-      (default)
-        or inference.
+      (default) or inference.
 
   Returns:
     A tuple (grad_x, grad_scale, grad_offset), where grad_x is the gradient
@@ -928,9 +987,9 @@ def _FusedBatchNormGradGrad(op, *grad):
 
   Args:
     op: The FusedBatchNormGradOp for which we need to compute gradients.
-    *grad: An argument list for tensors of gradients wrt the outputs
-          with grad[0] as grad_grad_x, grad[1] as grad_grad_scale,
-          grad[2] as grad_grad_offset.
+    *grad: An argument list for tensors of gradients wrt the outputs with
+      grad[0] as grad_grad_x, grad[1] as grad_grad_scale, grad[2] as
+      grad_grad_offset.
 
   Returns:
     A tuple (grad_grad_y, grad_x, grad_scale, None, None), where grad_grad_y
@@ -996,29 +1055,31 @@ def _TopKGrad(op, grad, _):
   ind_shape = array_ops.shape(op.outputs[1])
 
   # int32 is not supported on GPU hence up-casting
-  ind_lastdim = array_ops.gather(math_ops.cast(
-      ind_shape, dtypes.int64), array_ops.size(ind_shape) - 1)
+  ind_lastdim = array_ops.gather(
+      math_ops.cast(ind_shape, dtypes.int64),
+      array_ops.size(ind_shape) - 1)
   # Flatten indices to 2D.
   ind_2d = array_ops.reshape(op.outputs[1], array_ops.stack([-1, ind_lastdim]))
 
-  in_lastdim = array_ops.gather(math_ops.cast(
-      in_shape, dtypes.int64), array_ops.size(in_shape) - 1)
+  in_lastdim = array_ops.gather(
+      math_ops.cast(in_shape, dtypes.int64),
+      array_ops.size(in_shape) - 1)
   outerdim = array_ops.shape(ind_2d)[0]
   # Compute linear indices (flattened to 1D).
-  ind = array_ops.reshape(ind_2d + math_ops.cast(array_ops.expand_dims(
-      math_ops.range(0, math_ops.cast(outerdim, dtypes.int64)
-                     * in_lastdim, in_lastdim), -1), dtypes.int32), [-1])
+  ind = array_ops.reshape(
+      ind_2d + math_ops.cast(
+          array_ops.expand_dims(
+              math_ops.range(0,
+                             math_ops.cast(outerdim, dtypes.int64) * in_lastdim,
+                             in_lastdim), -1), dtypes.int32), [-1])
 
   # Substitute grad to appropriate locations and fill the rest with zeros,
   # finally reshaping it to the original input shape.
   return [
       array_ops.reshape(
           array_ops.scatter_nd(
-              array_ops.expand_dims(ind, -1),
-              array_ops.reshape(grad, [-1]),
-              [math_ops.reduce_prod(in_shape)]
-          ),
-          in_shape),
+              array_ops.expand_dims(ind, -1), array_ops.reshape(grad, [-1]),
+              [math_ops.reduce_prod(in_shape)]), in_shape),
       array_ops.zeros([], dtype=dtypes.int32)
   ]
 
diff --git a/tensorflow/python/ops/nn_grad_test.py b/tensorflow/python/ops/nn_grad_test.py
index 95e05a977b856505f0b608442e85fda8468ead1f..783656a86932019e373e42b236acfacf96245faf 100644
--- a/tensorflow/python/ops/nn_grad_test.py
+++ b/tensorflow/python/ops/nn_grad_test.py
@@ -23,9 +23,11 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import nn_grad  # pylint: disable=unused-import
+from tensorflow.python.ops import nn_impl
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.platform import test
 
@@ -49,5 +51,111 @@ class Relu6OpTest(test.TestCase):
       self.assertLess(error, 1e-4)
 
 
+class Conv2dOpTest(test.TestCase):
+
+  def run_test(self, x, y):
+    with self.test_session():
+      error = gradient_checker.compute_gradient_error(x,
+                                                      x.get_shape().as_list(),
+                                                      y,
+                                                      y.get_shape().as_list())
+      self.assertLess(error, 1e-3)
+
+  @test_util.run_deprecated_v1
+  def testConv2dGradWRTInput(self):
+    x = array_ops.placeholder(
+        dtype=dtypes.float32, shape=[1, 4, 4, 3], name='input')
+    f = constant_op.constant([0.5],
+                             dtype=dtypes.float32,
+                             shape=[2, 2, 3, 2],
+                             name='filter')
+    y = nn_ops.conv2d(x, f, [1, 1, 1, 1], 'SAME')
+    self.run_test(x, y)
+
+  @test_util.run_deprecated_v1
+  def testConv2dGradWRTFilter(self):
+    x = constant_op.constant([0.5],
+                             dtype=dtypes.float32,
+                             shape=[1, 4, 4, 3],
+                             name='input')
+    f = array_ops.placeholder(
+        dtype=dtypes.float32, shape=[2, 2, 3, 2], name='filter')
+    y = nn_ops.conv2d(x, f, [1, 1, 1, 1], 'SAME')
+    self.run_test(f, y)
+
+  @test_util.run_deprecated_v1
+  def testConv2dBackpropFilterGrad(self):
+    x = array_ops.placeholder(
+        dtype=dtypes.float32, shape=[1, 4, 4, 3], name='input')
+    f = constant_op.constant([0.5],
+                             dtype=dtypes.float32,
+                             shape=[2, 2, 3, 2],
+                             name='filter')
+    strides = [1, 1, 1, 1]
+    padding = 'SAME'
+    out = nn_impl.depthwise_conv2d(x, f, strides, padding)
+
+    grad_wrt_input = gradients_impl.gradients(out, x)[0]
+    self.run_test(f, grad_wrt_input)
+
+    grad_wrt_filter = gradients_impl.gradients(out, f)[0]
+    self.run_test(x, grad_wrt_filter)
+
+
+class DepthwiseConv2dTest(test.TestCase):
+
+  def run_test(self, x, y):
+    with self.test_session():
+      error = gradient_checker.compute_gradient_error(x,
+                                                      x.get_shape().as_list(),
+                                                      y,
+                                                      y.get_shape().as_list())
+      self.assertLess(error, 1e-3)
+
+  @test_util.run_deprecated_v1
+  def testDepthwiseConv2dGradWRTInput(self):
+    x = array_ops.placeholder(
+        dtype=dtypes.float32, shape=[1, 4, 4, 3], name='input')
+    f = constant_op.constant([0.5],
+                             dtype=dtypes.float32,
+                             shape=[2, 2, 3, 2],
+                             name='filter')
+    strides = [1, 1, 1, 1]
+    padding = 'SAME'
+    y = nn_impl.depthwise_conv2d(x, f, strides, padding)
+    self.run_test(x, y)
+
+  @test_util.run_deprecated_v1
+  def testDepthwiseConv2dGradWRTFilter(self):
+    x = constant_op.constant([0.5],
+                             dtype=dtypes.float32,
+                             shape=[1, 4, 4, 3],
+                             name='input')
+    f = array_ops.placeholder(
+        dtype=dtypes.float32, shape=[2, 2, 3, 2], name='filter')
+    strides = [1, 1, 1, 1]
+    padding = 'SAME'
+    y = nn_impl.depthwise_conv2d(x, f, strides, padding)
+    self.run_test(f, y)
+
+  @test_util.run_deprecated_v1
+  def testDepthwiseConv2dBackpropFilterGrad(self):
+    x = array_ops.placeholder(
+        dtype=dtypes.float32, shape=[1, 4, 4, 3], name='input')
+    f = constant_op.constant([0.5],
+                             dtype=dtypes.float32,
+                             shape=[2, 2, 3, 2],
+                             name='filter')
+    strides = [1, 1, 1, 1]
+    padding = 'SAME'
+    out = nn_impl.depthwise_conv2d(x, f, strides, padding)
+
+    grad_wrt_input = gradients_impl.gradients(out, x)[0]
+    self.run_test(f, grad_wrt_input)
+
+    grad_wrt_filter = gradients_impl.gradients(out, f)[0]
+    self.run_test(x, grad_wrt_filter)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py
index 48dcab4842864b7322610e4328c1771f95ee352d..dc252c7e7f550c2d1424938129280c54e7e9eab9 100644
--- a/tensorflow/python/ops/nn_impl.py
+++ b/tensorflow/python/ops/nn_impl.py
@@ -32,7 +32,7 @@ from tensorflow.python.ops import gen_array_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops import sparse_ops
+from tensorflow.python.ops import gen_sparse_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.util.deprecation import deprecated_args
 from tensorflow.python.util.deprecation import deprecated_argument_lookup
@@ -398,7 +398,7 @@ def _count_nonzero(input_tensor, dtype=dtypes.int64):
   Returns:
       number of nonzero values with type dtype
   """
-  with ops.name_scope("count_nonzero", [input_tensor]):
+  with ops.name_scope("count_nonzero", values=[input_tensor]):
     zero = array_ops.zeros([], dtype=input_tensor.dtype)
     nonzero_count = math_ops.reduce_sum(
         math_ops.cast(
@@ -456,7 +456,8 @@ def depthwise_conv2d(input,
                      padding,
                      rate=None,
                      name=None,
-                     data_format=None):
+                     data_format=None,
+                     dilations=None):
   """Depthwise 2-D convolution.
 
   Given a 4D input tensor ('NHWC' or 'NCHW' data formats)
@@ -467,7 +468,7 @@ def depthwise_conv2d(input,
   to `channel_multiplier` channels for each), then concatenates the results
   together.  The output has `in_channels * channel_multiplier` channels.
 
-  In detail,
+  In detail, with the default NHWC format,
 
       output[b, i, j, k * channel_multiplier + q] = sum_{di, dj}
            filter[di, dj, k, q] * input[b, strides[1] * i + rate[0] * di,
@@ -492,12 +493,14 @@ def depthwise_conv2d(input,
       greater than 1, then all values of strides must be 1.
     name: A name for this operation (optional).
     data_format: The data format for input. Either "NHWC" (default) or "NCHW".
+    dilations: Alias of rate.
 
   Returns:
     A 4-D `Tensor` with shape according to `data_format`.  E.g., for
     "NHWC" format, shape is
     `[batch, out_height, out_width, in_channels * channel_multiplier].`
   """
+  rate = deprecated_argument_lookup("dilations", dilations, "rate", rate)
   with ops.name_scope(name, "depthwise", [input, filter]) as name:
     input = ops.convert_to_tensor(input, name="tensor_in")
     filter = ops.convert_to_tensor(filter, name="filter_in")
@@ -540,7 +543,7 @@ def depthwise_conv2d_v2(input,
   to `channel_multiplier` channels for each), then concatenates the results
   together.  The output has `in_channels * channel_multiplier` channels.
 
-  In detail,
+  In detail, with the default NHWC format,
 
       output[b, i, j, k * channel_multiplier + q] = sum_{di, dj}
            filter[di, dj, k, q] * input[b, strides[1] * i + rate[0] * di,
@@ -591,7 +594,8 @@ def separable_conv2d(input,
                      padding,
                      rate=None,
                      name=None,
-                     data_format=None):
+                     data_format=None,
+                     dilations=None):
   """2-D convolution with separable filters.
 
   Performs a depthwise convolution that acts separately on channels followed by
@@ -599,7 +603,7 @@ def separable_conv2d(input,
   between dimensions `[1, 2]` and `3`, not spatial separability between
   dimensions `1` and `2`.
 
-  In detail,
+  In detail, with the default NHWC format,
 
       output[b, i, j, k] = sum_{di, dj, q, r}
           input[b, strides[1] * i + di, strides[2] * j + dj, q] *
@@ -631,12 +635,14 @@ def separable_conv2d(input,
       greater than 1, then all values of strides must be 1.
     name: A name for this operation (optional).
     data_format: The data format for input. Either "NHWC" (default) or "NCHW".
+    dilations: Alias of rate.
 
   Returns:
     A 4-D `Tensor` with shape according to 'data_format'. For
       example, with data_format="NHWC", shape is [batch, out_height,
       out_width, out_channels].
   """
+  rate = deprecated_argument_lookup("dilations", dilations, "rate", rate)
   with ops.name_scope(name, "separable_conv2d",
                       [input, depthwise_filter, pointwise_filter]) as name:
     input = ops.convert_to_tensor(input, name="tensor_in")
@@ -699,7 +705,7 @@ def separable_conv2d_v2(
   between dimensions `[1, 2]` and `3`, not spatial separability between
   dimensions `1` and `2`.
 
-  In detail,
+  In detail, with the default NHWC format,
 
       output[b, i, j, k] = sum_{di, dj, q, r}
           input[b, strides[1] * i + di, strides[2] * j + dj, q] *
@@ -751,7 +757,8 @@ def separable_conv2d_v2(
 
 
 @tf_export(v1=["nn.sufficient_statistics"])
-def sufficient_statistics(x, axes, shift=None, keep_dims=False, name=None):
+def sufficient_statistics(x, axes, shift=None, keep_dims=None, name=None,
+                          keepdims=None):
   """Calculate the sufficient statistics for the mean and variance of `x`.
 
   These sufficient statistics are computed using the one pass algorithm on
@@ -766,6 +773,7 @@ def sufficient_statistics(x, axes, shift=None, keep_dims=False, name=None):
       close to the true mean provides the most numerically stable results.
     keep_dims: produce statistics with the same dimensionality as the input.
     name: Name used to scope the operations that compute the sufficient stats.
+    keepdims: Alias for keep_dims.
 
   Returns:
     Four `Tensor` objects of the same type as `x`:
@@ -776,6 +784,10 @@ def sufficient_statistics(x, axes, shift=None, keep_dims=False, name=None):
     * the shift by which the mean must be corrected or None if `shift` is None.
   """
   axes = list(set(axes))
+  keep_dims = deprecated_argument_lookup(
+      "keepdims", keepdims, "keep_dims", keep_dims)
+  if keep_dims is None:
+    keep_dims = False
   with ops.name_scope(name, "sufficient_statistics", [x, shift]):
     x = ops.convert_to_tensor(x, name="x")
     x_shape = x.get_shape()
@@ -867,7 +879,8 @@ def moments(
     axes,
     shift=None,  # pylint: disable=unused-argument
     name=None,
-    keep_dims=False):
+    keep_dims=None,
+    keepdims=None):
   """Calculate the mean and variance of `x`.
 
   The mean and variance are calculated by aggregating the contents of `x`
@@ -890,10 +903,15 @@ def moments(
     shift: Not used in the current implementation
     name: Name used to scope the operations that compute the moments.
     keep_dims: produce moments with the same dimensionality as the input.
+    keepdims: Alias to keep_dims.
 
   Returns:
     Two `Tensor` objects: `mean` and `variance`.
   """
+  keep_dims = deprecated_argument_lookup(
+      "keepdims", keepdims, "keep_dims", keep_dims)
+  if keep_dims is None:
+    keep_dims = False
   with ops.name_scope(name, "moments", [x, axes]):
     # The dynamic range of fp16 is too limited to support the collection of
     # sufficient statistics. As a workaround we simply perform the operations
@@ -957,7 +975,8 @@ def moments_v2(
 
 
 @tf_export(v1=["nn.weighted_moments"])
-def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=False):
+def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=None,
+                     keepdims=None):
   """Returns the frequency-weighted mean and variance of `x`.
 
   Args:
@@ -968,10 +987,15 @@ def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=False):
       broadcast with x.
     name: Name used to scope the operation.
     keep_dims: Produce moments with the same dimensionality as the input.
+    keepdims: Alias of keep_dims.
 
   Returns:
     Two tensors: `weighted_mean` and `weighted_variance`.
   """
+  keep_dims = deprecated_argument_lookup(
+      "keepdims", keepdims, "keep_dims", keep_dims)
+  if keep_dims is None:
+    keep_dims = False
   with ops.name_scope(name, "weighted_moments", [x, frequency_weights, axes]):
     x = ops.convert_to_tensor(x, name="x")
     frequency_weights = ops.convert_to_tensor(
@@ -1184,14 +1208,17 @@ def fused_batch_norm(
 
 
 @tf_export(v1=["nn.batch_norm_with_global_normalization"])
-def batch_norm_with_global_normalization(t,
-                                         m,
-                                         v,
-                                         beta,
-                                         gamma,
-                                         variance_epsilon,
-                                         scale_after_normalization,
-                                         name=None):
+def batch_norm_with_global_normalization(t=None,
+                                         m=None,
+                                         v=None,
+                                         beta=None,
+                                         gamma=None,
+                                         variance_epsilon=None,
+                                         scale_after_normalization=None,
+                                         name=None,
+                                         input=None,  # pylint: disable=redefined-builtin
+                                         mean=None,
+                                         variance=None):
   """Batch normalization.
 
   This op is deprecated. See `tf.nn.batch_normalization`.
@@ -1213,10 +1240,16 @@ def batch_norm_with_global_normalization(t,
     scale_after_normalization: A bool indicating whether the resulted tensor
       needs to be multiplied with gamma.
     name: A name for this operation (optional).
+    input: Alias for t.
+    mean: Alias for m.
+    variance: Alias for v.
 
   Returns:
      A batch-normalized `t`.
   """
+  t = deprecated_argument_lookup("input", input, "t", t)
+  m = deprecated_argument_lookup("mean", mean, "m", m)
+  v = deprecated_argument_lookup("variance", variance, "v", v)
   return batch_normalization(t, m, v, beta, gamma if scale_after_normalization
                              else None, variance_epsilon, name)
 
@@ -1380,6 +1413,8 @@ def _compute_sampled_logits(weights,
     # weights shape is [num_classes, dim]
     all_w = embedding_ops.embedding_lookup(
         weights, all_ids, partition_strategy=partition_strategy)
+    if all_w.dtype != inputs.dtype:
+      all_w = math_ops.cast(all_w, inputs.dtype)
 
     # true_w shape is [batch_size * num_true, dim]
     true_w = array_ops.slice(all_w, [0, 0],
@@ -1397,6 +1432,8 @@ def _compute_sampled_logits(weights,
     # add the biases to the true and sampled logits.
     all_b = embedding_ops.embedding_lookup(
         biases, all_ids, partition_strategy=partition_strategy)
+    if all_b.dtype != inputs.dtype:
+      all_b = math_ops.cast(all_b, inputs.dtype)
     # true_b is a [batch_size * num_true] tensor
     # sampled_b is a [num_sampled] float tensor
     true_b = array_ops.slice(all_b, [0], array_ops.shape(labels_flat))
@@ -1436,7 +1473,7 @@ def _compute_sampled_logits(weights,
            array_ops.expand_dims(num_sampled, 0)], 0)
       if sampled_logits.dtype != acc_weights.dtype:
         acc_weights = math_ops.cast(acc_weights, sampled_logits.dtype)
-      sampled_logits += sparse_ops.sparse_to_dense(
+      sampled_logits += gen_sparse_ops.sparse_to_dense(
           sparse_indices,
           sampled_logits_shape,
           acc_weights,
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index 611bfdac9a1b10a808cafeed585ac6e3427d18e9..836e8713952149b6e34c1e4ae622e51c548e2ebb 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import numbers
 
 import numpy as np
@@ -52,6 +53,30 @@ local_response_normalization = gen_nn_ops.lrn
 # pylint: disable=protected-access
 
 
+def _get_sequence(value, n, channel_index, name):
+  """Formats a value input for gen_nn_ops."""
+  if value is None:
+    value = [1]
+  elif not isinstance(value, collections.Sized):
+    value = [value]
+
+  current_n = len(value)
+  if current_n == n + 2:
+    return value
+  elif current_n == 1:
+    value = list((value[0],) * n)
+  elif current_n == n:
+    value = list(value)
+  else:
+    raise ValueError("{} should be of length 1, {} or {} but was {}".format(
+        name, n, n + 2, current_n))
+
+  if channel_index == 1:
+    return [1, 1] + value
+  else:
+    return [1] + value + [1]
+
+
 def _non_atrous_convolution(
     input,  # pylint: disable=redefined-builtin
     filter,  # pylint: disable=redefined-builtin
@@ -171,7 +196,7 @@ class _NonAtrousConvolution(object):
         raise ValueError("data_format must be \"NHWC\" or \"NCHW\".")
       self.strides = strides
       self.data_format = data_format
-      self.conv_op = gen_nn_ops.conv2d
+      self.conv_op = conv2d
     elif conv_dims == 3:
       if data_format is None or data_format == "NDHWC":
         strides = [1] + list(strides) + [1]
@@ -275,6 +300,24 @@ def dilation2d_v2(
                                name=name)
 
 
+@tf_export(v1=["nn.dilation2d"])
+def dilation2d_v1(  # pylint: disable=missing-docstring
+    input,  # pylint: disable=redefined-builtin
+    filter=None,  # pylint: disable=redefined-builtin
+    strides=None,
+    rates=None,
+    padding=None,
+    name=None,
+    filters=None,
+    dilations=None):
+  filter = deprecated_argument_lookup("filters", filters, "filter", filter)
+  rates = deprecated_argument_lookup("dilations", dilations, "rates", rates)
+  return gen_nn_ops.dilation2d(input, filter, strides, rates, padding, name)
+
+
+dilation2d_v1.__doc__ = gen_nn_ops.dilation2d.__doc__
+
+
 @tf_export("nn.with_space_to_batch")
 def with_space_to_batch(
     input,  # pylint: disable=redefined-builtin
@@ -487,7 +530,7 @@ class _WithSpaceToBatch(object):
     if spatial_dims != orig_spatial_dims or any(x < 1 for x in spatial_dims):
       raise ValueError(
           "spatial_dims must be a montonically increasing sequence of positive "
-          "integers")  # pylint: disable=line-too-long
+          "integers")
 
     if data_format is not None and data_format.startswith("NC"):
       expected_input_rank = spatial_dims[-1]
@@ -721,8 +764,9 @@ def convolution(
     strides=None,
     dilation_rate=None,
     name=None,
-    data_format=None):
-  # pylint: disable=line-too-long
+    data_format=None,
+    filters=None,
+    dilations=None):
   """Computes sums of N-D convolutions (actually cross-correlation).
 
   This also supports either output striding via the optional `strides` parameter
@@ -807,6 +851,8 @@ def convolution(
       starts with "NC").  For N=1, the valid values are "NWC" (default) and
       "NCW".  For N=2, the valid values are "NHWC" (default) and "NCHW".
       For N=3, the valid values are "NDHWC" (default) and "NCDHW".
+    filters: Alias of filter.
+    dilations: Alias of dilation_rate.
 
   Returns:
     A `Tensor` with the same type as `input` of shape
@@ -834,21 +880,17 @@ def convolution(
       is other than `"VALID"` or `"SAME"`, or if data_format is invalid.
 
   """
-  # pylint: enable=line-too-long
-  with ops.name_scope(name, "convolution", [input, filter]) as name:
-    input = ops.convert_to_tensor(input, name="input")  # pylint: disable=redefined-builtin
-    input_shape = input.get_shape()
-    filter = ops.convert_to_tensor(filter, name="filter")  # pylint: disable=redefined-builtin
-    filter_shape = filter.get_shape()
-    op = Convolution(
-        input_shape,
-        filter_shape,
-        padding,
-        strides=strides,
-        dilation_rate=dilation_rate,
-        name=name,
-        data_format=data_format)
-    return op(input, filter)
+  filter = deprecated_argument_lookup("filters", filters, "filter", filter)
+  dilation_rate = deprecated_argument_lookup(
+      "dilations", dilations, "dilation_rate", dilation_rate)
+  return convolution_internal(
+      input,
+      filter,
+      strides=strides,
+      padding=padding,
+      data_format=data_format,
+      dilations=dilation_rate,
+      name=name)
 
 
 @tf_export("nn.convolution", v1=[])
@@ -860,14 +902,15 @@ def convolution_v2(
     data_format=None,
     dilations=None,
     name=None):
-  return convolution(
+  return convolution_internal(
       input,  # pylint: disable=redefined-builtin
       filters,
-      padding=padding,
       strides=strides,
-      dilation_rate=dilations,
-      name=name,
-      data_format=data_format)
+      padding=padding,
+      data_format=data_format,
+      dilations=dilations,
+      name=name)
+
 
 convolution_v2.__doc__ = deprecation.rewrite_argument_docstring(
     deprecation.rewrite_argument_docstring(
@@ -875,6 +918,75 @@ convolution_v2.__doc__ = deprecation.rewrite_argument_docstring(
     "filter", "filters")
 
 
+def convolution_internal(
+    input,  # pylint: disable=redefined-builtin
+    filters,
+    strides=None,
+    padding="VALID",
+    data_format=None,
+    dilations=None,
+    name=None):
+  """Internal function which performs rank agnostic convolution."""
+  with ops.name_scope(name, "convolution", [input, filters]) as name:
+    if isinstance(input.shape, tensor_shape.TensorShape) and \
+        input.shape.rank is not None:
+      n = len(input.shape) - 2
+    elif not isinstance(input.shape, tensor_shape.TensorShape) and \
+        input.shape is not None:
+      n = len(input.shape) - 2
+    elif isinstance(filters.shape, tensor_shape.TensorShape) and \
+        filters.shape.rank is not None:
+      n = len(filters.shape) - 2
+    elif not isinstance(filters.shape, tensor_shape.TensorShape) and \
+        filters.shape is not None:
+      n = len(filters.shape) - 2
+    else:
+      raise ValueError("rank of input or filter must be known")
+
+    if not 1 <= n <= 3:
+      raise ValueError(
+          "Input tensor must be of rank 3, 4 or 5 but was {}.".format(n + 2))
+
+    if data_format is None:
+      channel_index = n + 1
+    else:
+      channel_index = 1 if data_format.startswith("NC") else n + 1
+
+    strides = _get_sequence(strides, n, channel_index, "strides")
+    dilations = _get_sequence(dilations, n, channel_index, "dilations")
+
+    conv_ops = {1: conv1d, 2: gen_nn_ops.conv2d, 3: gen_nn_ops.conv3d}
+
+    if all(i == 1 for i in dilations):
+      # fast path if no dilation as gradient only supported on GPU for dilations
+      op = conv_ops[n]
+      return op(
+          input,
+          filters,
+          strides,
+          padding=padding,
+          data_format=data_format,
+          dilations=dilations,
+          name=name)
+    else:
+      if channel_index == 1:
+        strides = strides[2:]
+        dilations = dilations[2:]
+      else:
+        strides = strides[1:-1]
+        dilations = dilations[1:-1]
+
+      op = Convolution(
+          tensor_shape.as_shape(input.shape),
+          tensor_shape.as_shape(filters.shape),
+          padding,
+          strides=strides,
+          dilation_rate=dilations,
+          name=name,
+          data_format=data_format)
+      return op(input, filters)
+
+
 class Convolution(object):
   """Helper class for convolution.
 
@@ -975,8 +1087,8 @@ def pool(
     dilation_rate=None,
     strides=None,
     name=None,
-    data_format=None):
-  # pylint: disable=line-too-long
+    data_format=None,
+    dilations=None):
   """Performs an N-D pooling operation.
 
   In the case that `data_format` does not start with "NC", computes for
@@ -1032,6 +1144,7 @@ def pool(
       starts with "NC").  For N=1, the valid values are "NWC" (default) and
       "NCW".  For N=2, the valid values are "NHWC" (default) and "NCHW".
       For N=3, the valid values are "NDHWC" (default) and "NCDHW".
+    dilations: Alias for dilation_rate
 
   Returns:
     Tensor of rank N+2, of shape
@@ -1056,6 +1169,8 @@ def pool(
     ValueError: if arguments are invalid.
 
   """
+  dilation_rate = deprecated_argument_lookup(
+      "dilations", dilations, "dilation_rate", dilation_rate)
   # pylint: enable=line-too-long
   with ops.name_scope(name, "%s_pool" % (pooling_type.lower()),
                       [input]) as scope:
@@ -1373,6 +1488,288 @@ def atrous_conv2d(value, filters, rate, padding, name=None):
       name=name)
 
 
+def _convert_padding(padding):
+  """Converts Python padding to C++ padding for ops which take EXPLICIT padding.
+
+  Args:
+    padding: the `padding` argument for a Python op which supports EXPLICIT
+      padding.
+
+  Returns:
+    (padding, explicit_paddings) pair, which should be passed as attributes to a
+    C++ op.
+
+  Raises:
+    ValueError: If padding is invalid.
+  """
+  explicit_paddings = []
+  if padding == "EXPLICIT":
+    # Give a better error message if EXPLICIT is passed.
+    raise ValueError('"EXPLICIT" is not a valid value for the padding '
+                     "parameter. To use explicit padding, the padding "
+                     "parameter must be a list.")
+  if isinstance(padding, (list, tuple)):
+    for i, dim_paddings in enumerate(padding):
+      if not isinstance(dim_paddings, (list, tuple)):
+        raise ValueError("When padding is a list, each element of padding must "
+                         "be a list/tuple of size 2. Element with index %d of "
+                         "padding is not a list/tuple" % i)
+      if len(dim_paddings) != 2:
+        raise ValueError("When padding is a list, each element of padding must "
+                         "be a list/tuple of size 2. Element with index %d of "
+                         "padding has size %d" % (i, len(dim_paddings)))
+      explicit_paddings.extend(dim_paddings)
+    if len(padding) != 4:
+      raise ValueError("When padding is a list, it must be of size 4. Got "
+                       "padding of size: %d" % len(padding))
+    padding = "EXPLICIT"
+  return padding, explicit_paddings
+
+
+@tf_export(v1=["nn.conv1d"])
+@deprecation.deprecated_arg_values(
+    None,
+    "`NCHW` for data_format is deprecated, use `NCW` instead",
+    warn_once=True,
+    data_format="NCHW")
+@deprecation.deprecated_arg_values(
+    None,
+    "`NHWC` for data_format is deprecated, use `NWC` instead",
+    warn_once=True,
+    data_format="NHWC")
+def conv1d(
+    value=None,
+    filters=None,
+    stride=None,
+    padding=None,
+    use_cudnn_on_gpu=None,
+    data_format=None,
+    name=None,
+    input=None,  # pylint: disable=redefined-builtin
+    dilations=None):
+  r"""Computes a 1-D convolution given 3-D input and filter tensors.
+
+  Given an input tensor of shape
+    [batch, in_width, in_channels]
+  if data_format is "NWC", or
+    [batch, in_channels, in_width]
+  if data_format is "NCW",
+  and a filter / kernel tensor of shape
+  [filter_width, in_channels, out_channels], this op reshapes
+  the arguments to pass them to conv2d to perform the equivalent
+  convolution operation.
+
+  Internally, this op reshapes the input tensors and invokes `tf.nn.conv2d`.
+  For example, if `data_format` does not start with "NC", a tensor of shape
+    [batch, in_width, in_channels]
+  is reshaped to
+    [batch, 1, in_width, in_channels],
+  and the filter is reshaped to
+    [1, filter_width, in_channels, out_channels].
+  The result is then reshaped back to
+    [batch, out_width, out_channels]
+  \(where out_width is a function of the stride and padding as in conv2d\) and
+  returned to the caller.
+
+  Args:
+    value: A 3D `Tensor`.  Must be of type `float16`, `float32`, or `float64`.
+    filters: A 3D `Tensor`.  Must have the same type as `value`.
+    stride: An int or list of `ints` that has length `1` or `3`.  The number of
+      entries by which the filter is moved right at each step.
+    padding: 'SAME' or 'VALID'
+    use_cudnn_on_gpu: An optional `bool`.  Defaults to `True`.
+    data_format: An optional `string` from `"NWC", "NCW"`.  Defaults to `"NWC"`,
+      the data is stored in the order of [batch, in_width, in_channels].  The
+      `"NCW"` format stores data as [batch, in_channels, in_width].
+    name: A name for the operation (optional).
+    input: Alias for value.
+    dilations: An int or list of `ints` that has length `1` or `3` which
+      defaults to 1. The dilation factor for each dimension of input. If set to
+      k > 1, there will be k-1 skipped cells between each filter element on that
+      dimension. Dilations in the batch and depth dimensions must be 1.
+
+  Returns:
+    A `Tensor`.  Has the same type as input.
+
+  Raises:
+    ValueError: if `data_format` is invalid.
+  """
+  value = deprecation.deprecated_argument_lookup("input", input, "value", value)
+  with ops.name_scope(name, "conv1d", [value, filters]) as name:
+    # Reshape the input tensor to [batch, 1, in_width, in_channels]
+    if data_format is None or data_format == "NHWC" or data_format == "NWC":
+      data_format = "NHWC"
+      spatial_start_dim = 1
+      channel_index = 2
+    elif data_format == "NCHW" or data_format == "NCW":
+      data_format = "NCHW"
+      spatial_start_dim = 2
+      channel_index = 1
+    else:
+      raise ValueError("data_format must be \"NWC\" or \"NCW\".")
+    strides = [1] + _get_sequence(stride, 1, channel_index, "stride")
+    dilations = [1] + _get_sequence(dilations, 1, channel_index, "dilations")
+
+    value = array_ops.expand_dims(value, spatial_start_dim)
+    filters = array_ops.expand_dims(filters, 0)
+    result = gen_nn_ops.conv2d(
+        value,
+        filters,
+        strides,
+        padding,
+        use_cudnn_on_gpu=use_cudnn_on_gpu,
+        data_format=data_format,
+        dilations=dilations,
+        name=name)
+    return array_ops.squeeze(result, [spatial_start_dim])
+
+
+@tf_export("nn.conv1d", v1=[])
+def conv1d_v2(
+    input,  # pylint: disable=redefined-builtin
+    filters,
+    stride,
+    padding,
+    data_format="NWC",
+    dilations=None,
+    name=None):
+  r"""Computes a 1-D convolution given 3-D input and filter tensors.
+
+  Given an input tensor of shape
+    [batch, in_width, in_channels]
+  if data_format is "NWC", or
+    [batch, in_channels, in_width]
+  if data_format is "NCW",
+  and a filter / kernel tensor of shape
+  [filter_width, in_channels, out_channels], this op reshapes
+  the arguments to pass them to conv2d to perform the equivalent
+  convolution operation.
+
+  Internally, this op reshapes the input tensors and invokes `tf.nn.conv2d`.
+  For example, if `data_format` does not start with "NC", a tensor of shape
+    [batch, in_width, in_channels]
+  is reshaped to
+    [batch, 1, in_width, in_channels],
+  and the filter is reshaped to
+    [1, filter_width, in_channels, out_channels].
+  The result is then reshaped back to
+    [batch, out_width, out_channels]
+  \(where out_width is a function of the stride and padding as in conv2d\) and
+  returned to the caller.
+
+  Args:
+    input: A 3D `Tensor`.  Must be of type `float16`, `float32`, or `float64`.
+    filters: A 3D `Tensor`.  Must have the same type as `input`.
+    stride: An int or list of `ints` that has length `1` or `3`.  The number of
+      entries by which the filter is moved right at each step.
+    padding: 'SAME' or 'VALID'
+    data_format: An optional `string` from `"NWC", "NCW"`.  Defaults to `"NWC"`,
+      the data is stored in the order of [batch, in_width, in_channels].  The
+      `"NCW"` format stores data as [batch, in_channels, in_width].
+    dilations: An int or list of `ints` that has length `1` or `3` which
+      defaults to 1. The dilation factor for each dimension of input. If set to
+      k > 1, there will be k-1 skipped cells between each filter element on that
+      dimension. Dilations in the batch and depth dimensions must be 1.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`.  Has the same type as input.
+
+  Raises:
+    ValueError: if `data_format` is invalid.
+  """
+  return conv1d(
+      input,  # pylint: disable=redefined-builtin
+      filters,
+      stride,
+      padding,
+      use_cudnn_on_gpu=True,
+      data_format=data_format,
+      name=name,
+      dilations=dilations)
+
+
+@tf_export("nn.conv1d_transpose")
+def conv1d_transpose(
+    input,  # pylint: disable=redefined-builtin
+    filters,
+    output_shape,
+    strides,
+    padding="SAME",
+    data_format="NWC",
+    dilations=None,
+    name=None):
+  """The transpose of `conv1d`.
+
+  This operation is sometimes called "deconvolution" after [Deconvolutional
+  Networks](https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf),
+  but is really the transpose (gradient) of `conv1d` rather than an actual
+  deconvolution.
+
+  Args:
+    input: A 3-D `Tensor` of type `float` and shape
+      `[batch, in_width, in_channels]` for `NWC` data format or
+      `[batch, in_channels, in_width]` for `NCW` data format.
+    filters: A 3-D `Tensor` with the same type as `value` and shape
+      `[filter_width, output_channels, in_channels]`.  `filter`'s
+      `in_channels` dimension must match that of `value`.
+    output_shape: A 1-D `Tensor`, containing three elements, representing the
+      output shape of the deconvolution op.
+    strides: An int or list of `ints` that has length `1` or `3`.  The number of
+      entries by which the filter is moved right at each step.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
+      See the "returns" section of `tf.nn.convolution` for details.
+    data_format: A string. `'NWC'` and `'NCW'` are supported.
+    dilations: An int or list of `ints` that has length `1` or `3` which
+      defaults to 1. The dilation factor for each dimension of input. If set to
+      k > 1, there will be k-1 skipped cells between each filter element on that
+      dimension. Dilations in the batch and depth dimensions must be 1.
+    name: Optional name for the returned tensor.
+
+  Returns:
+    A `Tensor` with the same type as `value`.
+
+  Raises:
+    ValueError: If input/output depth does not match `filter`'s shape, if
+      `output_shape` is not at 3-element vector, if `padding` is other than
+      `'VALID'` or `'SAME'`, or if `data_format` is invalid.
+  """
+  with ops.name_scope(name, "conv1d_transpose",
+                      [input, filters, output_shape]) as name:
+    # The format could be either NWC or NCW, map to NHWC or NCHW
+    if data_format is None or data_format == "NWC":
+      data_format = "NHWC"
+      spatial_start_dim = 1
+      channel_index = 2
+    elif data_format == "NCW":
+      data_format = "NCHW"
+      spatial_start_dim = 2
+      channel_index = 1
+    else:
+      raise ValueError("data_format must be \"NWC\" or \"NCW\".")
+
+    # Reshape the input tensor to [batch, 1, in_width, in_channels]
+    strides = [1] + _get_sequence(strides, 1, channel_index, "stride")
+    dilations = [1] + _get_sequence(dilations, 1, channel_index, "dilations")
+
+    input = array_ops.expand_dims(input, spatial_start_dim)
+    filters = array_ops.expand_dims(filters, 0)
+    output_shape = list(output_shape)
+    output_shape = output_shape[: spatial_start_dim] + [1] + \
+                   output_shape[spatial_start_dim:]
+
+    result = gen_nn_ops.conv2d_backprop_input(
+        input_sizes=output_shape,
+        filter=filters,
+        out_backprop=input,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        dilations=dilations,
+        name=name)
+    return array_ops.squeeze(result, spatial_start_dim)
+
+
 @tf_export("nn.conv2d", v1=[])
 def conv2d_v2(input,  # pylint: disable=redefined-builtin
               filters,
@@ -1414,12 +1811,18 @@ def conv2d_v2(input,  # pylint: disable=redefined-builtin
     filters: A `Tensor`. Must have the same type as `input`.
       A 4-D tensor of shape
       `[filter_height, filter_width, in_channels, out_channels]`
-    strides: A list of `ints`.
-      1-D tensor of length 4.  The stride of the sliding window for each
-      dimension of `input`. The dimension order is determined by the value of
-      `data_format`, see below for details.
-    padding: A `string` from: `"SAME", "VALID"`.
-      The type of padding algorithm to use.
+    strides: An int or list of `ints` that has length `1`, `2` or `4`.  The
+      stride of the sliding window for each dimension of `input`. If a single
+      value is given it is replicated in the `H` and `W` dimension. By default
+      the `N` and `C` dimensions are set to 1. The dimension order is determined
+      by the value of `data_format`, see below for details.
+    padding: Either the `string `"SAME"` or `"VALID"` indicating the type of
+      padding algorithm to use, or a list indicating the explicit paddings at
+      the start and end of each dimension. When explicit padding is used and
+      data_format is `"NHWC"`, this should be in the form `[[0, 0], [pad_top,
+      pad_bottom], [pad_left, pad_right], [0, 0]]`. When explicit padding used
+      and data_format is `"NCHW"`, this should be in the form `[[0, 0], [0, 0],
+      [pad_top, pad_bottom], [pad_left, pad_right]]`.
     data_format: An optional `string` from: `"NHWC", "NCHW"`.
       Defaults to `"NHWC"`.
       Specify the data format of the input and output data. With the
@@ -1427,40 +1830,138 @@ def conv2d_v2(input,  # pylint: disable=redefined-builtin
           [batch, height, width, channels].
       Alternatively, the format could be "NCHW", the data storage order of:
           [batch, channels, height, width].
-    dilations: An optional list of `ints`. Defaults to `[1, 1, 1, 1]`.
-      1-D tensor of length 4.  The dilation factor for each dimension of
-      `input`. If set to k > 1, there will be k-1 skipped cells between each
-      filter element on that dimension. The dimension order is determined by the
-      value of `data_format`, see above for details. Dilations in the batch and
-      depth dimensions must be 1.
+    dilations: An int or list of `ints` that has length `1`, `2` or `4`,
+      defaults to 1. The dilation factor for each dimension of`input`. If a
+      single value is given it is replicated in the `H` and `W` dimension. By
+      default the `N` and `C` dimensions are set to 1. If set to k > 1, there
+      will be k-1 skipped cells between each filter element on that dimension.
+      The dimension order is determined by the value of `data_format`, see above
+      for details. Dilations in the batch and depth dimensions if a 4-d tensor
+      must be 1.
     name: A name for the operation (optional).
 
   Returns:
     A `Tensor`. Has the same type as `input`.
   """
   # pylint: enable=line-too-long
-  if dilations is None:
-    dilations = [1, 1, 1, 1]
+  return conv2d(input,  # pylint: disable=redefined-builtin
+                filters,
+                strides,
+                padding,
+                use_cudnn_on_gpu=True,
+                data_format=data_format,
+                dilations=dilations,
+                name=name)
+
+
+@tf_export(v1=["nn.conv2d"])
+def conv2d(  # pylint: disable=redefined-builtin,dangerous-default-value
+    input,
+    filter=None,
+    strides=None,
+    padding=None,
+    use_cudnn_on_gpu=True,
+    data_format="NHWC",
+    dilations=[1, 1, 1, 1],
+    name=None,
+    filters=None):
+  r"""Computes a 2-D convolution given 4-D `input` and `filter` tensors.
+
+  Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
+  and a filter / kernel tensor of shape
+  `[filter_height, filter_width, in_channels, out_channels]`, this op
+  performs the following:
+
+  1. Flattens the filter to a 2-D matrix with shape
+     `[filter_height * filter_width * in_channels, output_channels]`.
+  2. Extracts image patches from the input tensor to form a *virtual*
+     tensor of shape `[batch, out_height, out_width,
+     filter_height * filter_width * in_channels]`.
+  3. For each patch, right-multiplies the filter matrix and the image patch
+     vector.
+
+  In detail, with the default NHWC format,
+
+      output[b, i, j, k] =
+          sum_{di, dj, q} input[b, strides[1] * i + di, strides[2] * j + dj, q]
+                          * filter[di, dj, q, k]
+
+  Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
+  horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
+
+  Args:
+    input: A `Tensor`. Must be one of the following types:
+      `half`, `bfloat16`, `float32`, `float64`.
+      A 4-D tensor. The dimension order is interpreted according to the value
+      of `data_format`, see below for details.
+    filter: A `Tensor`. Must have the same type as `input`.
+      A 4-D tensor of shape
+      `[filter_height, filter_width, in_channels, out_channels]`
+    strides: An int or list of `ints` that has length `1`, `2` or `4`.  The
+      stride of the sliding window for each dimension of `input`. If a single
+      value is given it is replicated in the `H` and `W` dimension. By default
+      the `N` and `C` dimensions are set to 1. The dimension order is determined
+      by the value of `data_format`, see below for details.
+    padding: Either the `string `"SAME"` or `"VALID"` indicating the type of
+      padding algorithm to use, or a list indicating the explicit paddings at
+      the start and end of each dimension. When explicit padding is used and
+      data_format is `"NHWC"`, this should be in the form `[[0, 0], [pad_top,
+      pad_bottom], [pad_left, pad_right], [0, 0]]`. When explicit padding used
+      and data_format is `"NCHW"`, this should be in the form `[[0, 0], [0, 0],
+      [pad_top, pad_bottom], [pad_left, pad_right]]`.
+    use_cudnn_on_gpu: An optional `bool`. Defaults to `True`.
+    data_format: An optional `string` from: `"NHWC", "NCHW"`.
+      Defaults to `"NHWC"`.
+      Specify the data format of the input and output data. With the
+      default format "NHWC", the data is stored in the order of:
+          [batch, height, width, channels].
+      Alternatively, the format could be "NCHW", the data storage order of:
+          [batch, channels, height, width].
+    dilations: An int or list of `ints` that has length `1`, `2` or `4`,
+      defaults to 1. The dilation factor for each dimension of`input`. If a
+      single value is given it is replicated in the `H` and `W` dimension. By
+      default the `N` and `C` dimensions are set to 1. If set to k > 1, there
+      will be k-1 skipped cells between each filter element on that dimension.
+      The dimension order is determined by the value of `data_format`, see above
+      for details. Dilations in the batch and depth dimensions if a 4-d tensor
+      must be 1.
+    name: A name for the operation (optional).
+    filters: Alias for filter.
+
+  Returns:
+    A `Tensor`. Has the same type as `input`.
+  """
+  filter = deprecation.deprecated_argument_lookup(
+      "filters", filters, "filter", filter)
+  padding, explicit_paddings = _convert_padding(padding)
+  if data_format is None:
+    data_format = "NHWC"
+  channel_index = 1 if data_format.startswith("NC") else 3
+
+  strides = _get_sequence(strides, 2, channel_index, "strides")
+  dilations = _get_sequence(dilations, 2, channel_index, "dilations")
   return gen_nn_ops.conv2d(input,  # pylint: disable=redefined-builtin
-                           filters,
+                           filter,
                            strides,
                            padding,
-                           use_cudnn_on_gpu=True,
+                           use_cudnn_on_gpu=use_cudnn_on_gpu,
+                           explicit_paddings=explicit_paddings,
                            data_format=data_format,
                            dilations=dilations,
                            name=name)
-tf_export(v1=["nn.conv2d"])(gen_nn_ops.conv2d)
-
-
-@tf_export("nn.conv2d_backprop_filter", v1=[])
-def conv2d_backprop_filter_v2(input,  # pylint: disable=redefined-builtin
-                              filter_sizes,
-                              out_backprop,
-                              strides,
-                              padding,
-                              data_format="NHWC",
-                              dilations=None,
-                              name=None):
+
+
+@tf_export(v1=["nn.conv2d_backprop_filter"])
+def conv2d_backprop_filter(  # pylint: disable=redefined-builtin,dangerous-default-value
+    input,
+    filter_sizes,
+    out_backprop,
+    strides,
+    padding,
+    use_cudnn_on_gpu=True,
+    data_format="NHWC",
+    dilations=[1, 1, 1, 1],
+    name=None):
   r"""Computes the gradients of convolution with respect to the filter.
 
   Args:
@@ -1478,8 +1979,14 @@ def conv2d_backprop_filter_v2(input,  # pylint: disable=redefined-builtin
       The stride of the sliding window for each dimension of the input
       of the convolution. Must be in the same order as the dimension specified
       with format.
-    padding: A `string` from: `"SAME", "VALID"`.
-      The type of padding algorithm to use.
+    padding: Either the `string `"SAME"` or `"VALID"` indicating the type of
+      padding algorithm to use, or a list indicating the explicit paddings at
+      the start and end of each dimension. When explicit padding is used and
+      data_format is `"NHWC"`, this should be in the form `[[0, 0], [pad_top,
+      pad_bottom], [pad_left, pad_right], [0, 0]]`. When explicit padding used
+      and data_format is `"NCHW"`, this should be in the form `[[0, 0], [0, 0],
+      [pad_top, pad_bottom], [pad_left, pad_right]]`.
+    use_cudnn_on_gpu: An optional `bool`. Defaults to `True`.
     data_format: An optional `string` from: `"NHWC", "NCHW"`.
       Defaults to `"NHWC"`.
       Specify the data format of the input and output data. With the
@@ -1498,49 +2005,49 @@ def conv2d_backprop_filter_v2(input,  # pylint: disable=redefined-builtin
   Returns:
     A `Tensor`. Has the same type as `input`.
   """
-  if dilations is None:
-    dilations = [1, 1, 1, 1]
-  return gen_nn_ops.conv2d_backprop_filter(input,  # pylint: disable=redefined-builtin
-                                           filter_sizes,
-                                           out_backprop,
-                                           strides,
-                                           padding,
-                                           use_cudnn_on_gpu=True,
-                                           data_format=data_format,
-                                           dilations=dilations,
-                                           name=name)
-tf_export(v1=["nn.conv2d_backprop_filter"])(
-    gen_nn_ops.conv2d_backprop_filter)
-
-
-@tf_export("nn.conv2d_backprop_input", v1=[])
-def conv2d_backprop_input_v2(input_sizes,
-                             filters,
-                             out_backprop,
-                             strides,
-                             padding,
-                             data_format="NHWC",
-                             dilations=None,
-                             name=None):
+  padding, explicit_paddings = _convert_padding(padding)
+  return gen_nn_ops.conv2d_backprop_filter(
+      input, filter_sizes, out_backprop, strides, padding, use_cudnn_on_gpu,
+      explicit_paddings, data_format, dilations, name)
+
+
+@tf_export(v1=["nn.conv2d_backprop_input"])
+def conv2d_backprop_input(  # pylint: disable=redefined-builtin,dangerous-default-value
+    input_sizes,
+    filter=None,
+    out_backprop=None,
+    strides=None,
+    padding=None,
+    use_cudnn_on_gpu=True,
+    data_format="NHWC",
+    dilations=[1, 1, 1, 1],
+    name=None,
+    filters=None):
   r"""Computes the gradients of convolution with respect to the input.
 
   Args:
     input_sizes: A `Tensor` of type `int32`.
       An integer vector representing the shape of `input`,
       where `input` is a 4-D `[batch, height, width, channels]` tensor.
-    filters: A `Tensor`. Must be one of the following types:
+    filter: A `Tensor`. Must be one of the following types:
       `half`, `bfloat16`, `float32`, `float64`.
       4-D with shape
       `[filter_height, filter_width, in_channels, out_channels]`.
-    out_backprop: A `Tensor`. Must have the same type as `filters`.
+    out_backprop: A `Tensor`. Must have the same type as `filter`.
       4-D with shape `[batch, out_height, out_width, out_channels]`.
       Gradients w.r.t. the output of the convolution.
     strides: A list of `ints`.
       The stride of the sliding window for each dimension of the input
       of the convolution. Must be in the same order as the dimension specified
       with format.
-    padding: A `string` from: `"SAME", "VALID"`.
-      The type of padding algorithm to use.
+    padding: Either the `string `"SAME"` or `"VALID"` indicating the type of
+      padding algorithm to use, or a list indicating the explicit paddings at
+      the start and end of each dimension. When explicit padding is used and
+      data_format is `"NHWC"`, this should be in the form `[[0, 0], [pad_top,
+      pad_bottom], [pad_left, pad_right], [0, 0]]`. When explicit padding used
+      and data_format is `"NCHW"`, this should be in the form `[[0, 0], [0, 0],
+      [pad_top, pad_bottom], [pad_left, pad_right]]`.
+    use_cudnn_on_gpu: An optional `bool`. Defaults to `True`.
     data_format: An optional `string` from: `"NHWC", "NCHW"`.
       Defaults to `"NHWC"`.
       Specify the data format of the input and output data. With the
@@ -1555,39 +2062,36 @@ def conv2d_backprop_input_v2(input_sizes,
       the value of `data_format`, see above for details. Dilations in the batch
       and depth dimensions must be 1.
     name: A name for the operation (optional).
+    filters: Alias for filter.
 
   Returns:
-    A `Tensor`. Has the same type as `filters`.
+    A `Tensor`. Has the same type as `filter`.
   """
-  if dilations is None:
-    dilations = [1, 1, 1, 1]
-  return gen_nn_ops.conv2d_backprop_input(input_sizes,
-                                          filters,
-                                          out_backprop,
-                                          strides,
-                                          padding,
-                                          use_cudnn_on_gpu=True,
-                                          data_format=data_format,
-                                          dilations=dilations,
-                                          name=name)
-tf_export(v1=["nn.conv2d_backprop_input"])(
-    gen_nn_ops.conv2d_backprop_input)
+  filter = deprecation.deprecated_argument_lookup(
+      "filters", filters, "filter", filter)
+  padding, explicit_paddings = _convert_padding(padding)
+  return gen_nn_ops.conv2d_backprop_input(
+      input_sizes, filter, out_backprop, strides, padding, use_cudnn_on_gpu,
+      explicit_paddings, data_format, dilations, name)
 
 
 @tf_export(v1=["nn.conv2d_transpose"])
 def conv2d_transpose(
-    value,
-    filter,  # pylint: disable=redefined-builtin
-    output_shape,
-    strides,
+    value=None,
+    filter=None,  # pylint: disable=redefined-builtin
+    output_shape=None,
+    strides=None,
     padding="SAME",
     data_format="NHWC",
-    name=None):
+    name=None,
+    input=None,  # pylint: disable=redefined-builtin
+    filters=None,
+    dilations=None):
   """The transpose of `conv2d`.
 
   This operation is sometimes called "deconvolution" after [Deconvolutional
-  Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf), but is
-  actually the transpose (gradient) of `conv2d` rather than an actual
+  Networks](https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf),
+  but is really the transpose (gradient) of `conv2d` rather than an actual
   deconvolution.
 
   Args:
@@ -1599,12 +2103,25 @@ def conv2d_transpose(
       `in_channels` dimension must match that of `value`.
     output_shape: A 1-D `Tensor` representing the output shape of the
       deconvolution op.
-    strides: A list of ints. The stride of the sliding window for each
-      dimension of the input tensor.
+    strides: An int or list of `ints` that has length `1`, `2` or `4`.  The
+      stride of the sliding window for each dimension of `input`. If a single
+      value is given it is replicated in the `H` and `W` dimension. By default
+      the `N` and `C` dimensions are set to 0. The dimension order is determined
+      by the value of `data_format`, see below for details.
     padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
       See the "returns" section of `tf.nn.convolution` for details.
     data_format: A string. 'NHWC' and 'NCHW' are supported.
     name: Optional name for the returned tensor.
+    input: Alias for value.
+    filters: Alias for filter.
+    dilations: An int or list of `ints` that has length `1`, `2` or `4`,
+      defaults to 1. The dilation factor for each dimension of`input`. If a
+      single value is given it is replicated in the `H` and `W` dimension. By
+      default the `N` and `C` dimensions are set to 1. If set to k > 1, there
+      will be k-1 skipped cells between each filter element on that dimension.
+      The dimension order is determined by the value of `data_format`, see above
+      for details. Dilations in the batch and depth dimensions if a 4-d tensor
+      must be 1.
 
   Returns:
     A `Tensor` with the same type as `value`.
@@ -1613,70 +2130,90 @@ def conv2d_transpose(
     ValueError: If input/output depth does not match `filter`'s shape, or if
       padding is other than `'VALID'` or `'SAME'`.
   """
+  value = deprecated_argument_lookup("input", input, "value", value)
+  filter = deprecated_argument_lookup("filters", filters, "filter", filter)
   with ops.name_scope(name, "conv2d_transpose",
                       [value, filter, output_shape]) as name:
-    if data_format not in ("NCHW", "NHWC"):
-      raise ValueError("data_format has to be either NCHW or NHWC.")
-    value = ops.convert_to_tensor(value, name="value")
-    filter = ops.convert_to_tensor(filter, name="filter")  # pylint: disable=redefined-builtin
-    axis = 3 if data_format == "NHWC" else 1
-    if not value.get_shape().dims[axis].is_compatible_with(
-        filter.get_shape()[3]):
-      raise ValueError("input channels does not match filter's input channels, "
-                       "{} != {}".format(value.get_shape()[axis],
-                                         filter.get_shape()[3]))
-
-    output_shape_ = ops.convert_to_tensor(output_shape, name="output_shape")
-    if not output_shape_.get_shape().is_compatible_with(tensor_shape.vector(4)):
-      raise ValueError("output_shape must have shape (4,), got {}".format(
-          output_shape_.get_shape()))
-
-    if isinstance(output_shape, (list, np.ndarray)):
-      # output_shape's shape should be == [4] if reached this point.
-      if not filter.get_shape().dims[2].is_compatible_with(
-          output_shape[axis]):
-        raise ValueError(
-            "output_shape does not match filter's output channels, "
-            "{} != {}".format(output_shape[axis],
-                              filter.get_shape()[2]))
-
-    if padding != "VALID" and padding != "SAME":
-      raise ValueError("padding must be either VALID or SAME:"
-                       " {}".format(padding))
-
-    return gen_nn_ops.conv2d_backprop_input(
-        input_sizes=output_shape_,
-        filter=filter,
-        out_backprop=value,
-        strides=strides,
+    return conv2d_transpose_v2(
+        value,
+        filter,
+        output_shape,
+        strides,
         padding=padding,
         data_format=data_format,
+        dilations=dilations,
         name=name)
 
 
-# pylint: disable=redefined-builtin
 @tf_export("nn.conv2d_transpose", v1=[])
 def conv2d_transpose_v2(
-    input,
+    input,  # pylint: disable=redefined-builtin
     filters,  # pylint: disable=redefined-builtin
     output_shape,
     strides,
     padding="SAME",
     data_format="NHWC",
+    dilations=None,
     name=None):
-  return conv2d_transpose(
-      input,
-      filters,
-      output_shape,
-      strides,
-      padding=padding,
-      data_format=data_format,
-      name=name)
-# pylint: enable=redefined-builtin
-conv2d_transpose_v2.__doc__ = deprecation.rewrite_argument_docstring(
-    deprecation.rewrite_argument_docstring(
-        conv2d_transpose.__doc__, "filter", "filters"),
-    "value", "input")
+  """The transpose of `conv2d`.
+
+  This operation is sometimes called "deconvolution" after [Deconvolutional
+  Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf), but is
+  actually the transpose (gradient) of `conv2d` rather than an actual
+  deconvolution.
+
+  Args:
+    input: A 4-D `Tensor` of type `float` and shape `[batch, height, width,
+      in_channels]` for `NHWC` data format or `[batch, in_channels, height,
+      width]` for `NCHW` data format.
+    filters: A 4-D `Tensor` with the same type as `value` and shape `[height,
+      width, output_channels, in_channels]`.  `filter`'s `in_channels` dimension
+      must match that of `value`.
+    output_shape: A 1-D `Tensor` representing the output shape of the
+      deconvolution op.
+    strides: An int or list of `ints` that has length `1`, `2` or `4`.  The
+      stride of the sliding window for each dimension of `input`. If a single
+      value is given it is replicated in the `H` and `W` dimension. By default
+      the `N` and `C` dimensions are set to 0. The dimension order is determined
+      by the value of `data_format`, see below for details.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm. See
+      the "returns" section of `tf.nn.convolution` for details.
+    data_format: A string. 'NHWC' and 'NCHW' are supported.
+    dilations: An int or list of `ints` that has length `1`, `2` or `4`,
+      defaults to 1. The dilation factor for each dimension of`input`. If a
+      single value is given it is replicated in the `H` and `W` dimension. By
+      default the `N` and `C` dimensions are set to 1. If set to k > 1, there
+      will be k-1 skipped cells between each filter element on that dimension.
+      The dimension order is determined by the value of `data_format`, see above
+      for details. Dilations in the batch and depth dimensions if a 4-d tensor
+      must be 1.
+    name: Optional name for the returned tensor.
+
+  Returns:
+    A `Tensor` with the same type as `value`.
+
+  Raises:
+    ValueError: If input/output depth does not match `filter`'s shape, or if
+      padding is other than `'VALID'` or `'SAME'`.
+  """
+  with ops.name_scope(name, "conv2d_transpose",
+                      [input, filter, output_shape]) as name:
+    if data_format is None:
+      data_format = "NHWC"
+    channel_index = 1 if data_format.startswith("NC") else 3
+
+    strides = _get_sequence(strides, 2, channel_index, "strides")
+    dilations = _get_sequence(dilations, 2, channel_index, "dilations")
+
+    return gen_nn_ops.conv2d_backprop_input(
+        input_sizes=output_shape,
+        filter=filters,
+        out_backprop=input,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        dilations=dilations,
+        name=name)
 
 
 @tf_export("nn.atrous_conv2d_transpose")
@@ -1689,9 +2226,9 @@ def atrous_conv2d_transpose(value,
   """The transpose of `atrous_conv2d`.
 
   This operation is sometimes called "deconvolution" after [Deconvolutional
-  Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf), but is
-  actually the transpose (gradient) of `atrous_conv2d` rather than an actual
-  deconvolution.
+  Networks](https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf),
+  but is really the transpose (gradient) of `atrous_conv2d` rather than an
+  actual deconvolution.
 
   Args:
     value: A 4-D `Tensor` of type `float`. It needs to be in the default `NHWC`
@@ -1837,32 +2374,52 @@ def conv3d_v2(input,  # pylint: disable=redefined-builtin,missing-docstring
               name=None):
   if dilations is None:
     dilations = [1, 1, 1, 1, 1]
-  return gen_nn_ops.conv3d(input,  # pylint: disable=redefined-builtin
+  return gen_nn_ops.conv3d(input,
                            filters,
                            strides,
                            padding,
                            data_format=data_format,
                            dilations=dilations,
                            name=name)
-tf_export(v1=["nn.conv3d"])(gen_nn_ops.conv3d)
+
+
+@tf_export(v1=["nn.conv3d"])
+def conv3d_v1(  # pylint: disable=missing-docstring,dangerous-default-value
+    input,  # pylint: disable=redefined-builtin
+    filter=None,  # pylint: disable=redefined-builtin
+    strides=None,
+    padding=None,
+    data_format="NDHWC",
+    dilations=[1, 1, 1, 1, 1],
+    name=None,
+    filters=None):
+  filter = deprecated_argument_lookup("filters", filters, "filter", filter)
+  return gen_nn_ops.conv3d(
+      input, filter, strides, padding, data_format, dilations, name)
+
+
 conv3d_v2.__doc__ = deprecation.rewrite_argument_docstring(
     gen_nn_ops.conv3d.__doc__, "filter", "filters")
+conv3d_v1.__doc__ = gen_nn_ops.conv3d.__doc__
 
 
 @tf_export(v1=["nn.conv3d_transpose"])
 def conv3d_transpose(
     value,
-    filter,  # pylint: disable=redefined-builtin
-    output_shape,
-    strides,
+    filter=None,  # pylint: disable=redefined-builtin
+    output_shape=None,
+    strides=None,
     padding="SAME",
     data_format="NDHWC",
-    name=None):
+    name=None,
+    input=None,  # pylint: disable=redefined-builtin
+    filters=None,
+    dilations=None):
   """The transpose of `conv3d`.
 
   This operation is sometimes called "deconvolution" after [Deconvolutional
-  Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf), but is
-  actually the transpose (gradient) of `conv3d` rather than an actual
+  Networks](https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf),
+  but is really the transpose (gradient) of `conv3d` rather than an actual
   deconvolution.
 
   Args:
@@ -1880,6 +2437,16 @@ def conv3d_transpose(
     data_format: A string, either `'NDHWC'` or `'NCDHW`' specifying the layout
       of the input and output tensors. Defaults to `'NDHWC'`.
     name: Optional name for the returned tensor.
+    input: Alias of value.
+    filters: Alias of filter.
+    dilations: An int or list of `ints` that has length `1`, `3` or `5`,
+      defaults to 1. The dilation factor for each dimension of`input`. If a
+      single value is given it is replicated in the `D`, `H` and `W` dimension.
+      By default the `N` and `C` dimensions are set to 1. If set to k > 1, there
+      will be k-1 skipped cells between each filter element on that dimension.
+      The dimension order is determined by the value of `data_format`, see above
+      for details. Dilations in the batch and depth dimensions if a 5-d tensor
+      must be 1.
 
   Returns:
     A `Tensor` with the same type as `value`.
@@ -1888,68 +2455,166 @@ def conv3d_transpose(
     ValueError: If input/output depth does not match `filter`'s shape, or if
       padding is other than `'VALID'` or `'SAME'`.
   """
-  with ops.name_scope(name, "conv3d_transpose",
-                      [value, filter, output_shape]) as name:
-    value = ops.convert_to_tensor(value, name="value")
-    filter = ops.convert_to_tensor(filter, name="filter")  # pylint: disable=redefined-builtin
-    axis = 1 if data_format == "NCDHW" else 4
-    if not value.get_shape().dims[axis].is_compatible_with(
-        filter.get_shape()[4]):
-      raise ValueError("input channels does not match filter's input channels, "
-                       "{} != {}".format(value.get_shape()[axis],
-                                         filter.get_shape()[4]))
+  filter = deprecated_argument_lookup("filters", filters, "filter", filter)
+  value = deprecated_argument_lookup("input", input, "value", value)
+  return conv3d_transpose_v2(
+      value,
+      filter,
+      output_shape,
+      strides,
+      padding=padding,
+      data_format=data_format,
+      dilations=dilations,
+      name=name)
 
-    output_shape_ = ops.convert_to_tensor(output_shape, name="output_shape")
-    if not output_shape_.get_shape().is_compatible_with(tensor_shape.vector(5)):
-      raise ValueError("output_shape must have shape (5,), got {}".format(
-          output_shape_.get_shape()))
 
-    if isinstance(output_shape, (list, np.ndarray)):
-      # output_shape's shape should be == [5] if reached this point.
-      if not filter.get_shape().dims[3].is_compatible_with(
-          output_shape[axis]):
-        raise ValueError(
-            "output_shape does not match filter's output channels, "
-            "{} != {}".format(output_shape[axis],
-                              filter.get_shape()[3]))
+@tf_export("nn.conv3d_transpose", v1=[])
+def conv3d_transpose_v2(input,  # pylint: disable=redefined-builtin
+                        filters,
+                        output_shape,
+                        strides,
+                        padding="SAME",
+                        data_format="NDHWC",
+                        dilations=None,
+                        name=None):
+  """The transpose of `conv3d`.
 
-    if padding != "VALID" and padding != "SAME":
-      raise ValueError("padding must be either VALID or SAME:"
-                       " {}".format(padding))
+  This operation is sometimes called "deconvolution" after [Deconvolutional
+  Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf), but is
+  actually the transpose (gradient) of `conv2d` rather than an actual
+  deconvolution.
+
+  Args:
+    input: A 5-D `Tensor` of type `float` and shape `[batch, height, width,
+      in_channels]` for `NHWC` data format or `[batch, in_channels, height,
+      width]` for `NCHW` data format.
+    filters: A 5-D `Tensor` with the same type as `value` and shape `[height,
+      width, output_channels, in_channels]`.  `filter`'s `in_channels` dimension
+      must match that of `value`.
+    output_shape: A 1-D `Tensor` representing the output shape of the
+      deconvolution op.
+    strides: An int or list of `ints` that has length `1`, `3` or `5`.  The
+      stride of the sliding window for each dimension of `input`. If a single
+      value is given it is replicated in the `D`, `H` and `W` dimension. By
+      default the `N` and `C` dimensions are set to 0. The dimension order is
+      determined by the value of `data_format`, see below for details.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm. See
+      the "returns" section of `tf.nn.convolution` for details.
+    data_format: A string. 'NDHWC' and 'NCDHW' are supported.
+    dilations: An int or list of `ints` that has length `1`, `3` or `5`,
+      defaults to 1. The dilation factor for each dimension of`input`. If a
+      single value is given it is replicated in the `D`, `H` and `W` dimension.
+      By default the `N` and `C` dimensions are set to 1. If set to k > 1, there
+      will be k-1 skipped cells between each filter element on that dimension.
+      The dimension order is determined by the value of `data_format`, see above
+      for details. Dilations in the batch and depth dimensions if a 5-d tensor
+      must be 1.
+    name: Optional name for the returned tensor.
+
+  Returns:
+    A `Tensor` with the same type as `value`.
+  """
+  with ops.name_scope(name, "conv3d_transpose",
+                      [input, filter, output_shape]) as name:
+    if data_format is None:
+      data_format = "NDHWC"
+    channel_index = 1 if data_format.startswith("NC") else 4
+
+    strides = _get_sequence(strides, 3, channel_index, "strides")
+    dilations = _get_sequence(dilations, 3, channel_index, "dilations")
 
     return gen_nn_ops.conv3d_backprop_input_v2(
-        input_sizes=output_shape_,
-        filter=filter,
-        out_backprop=value,
+        input_sizes=output_shape,
+        filter=filters,
+        out_backprop=input,
         strides=strides,
         padding=padding,
         data_format=data_format,
+        dilations=dilations,
         name=name)
 
 
-# pylint: disable=redefined-builtin
-@tf_export("nn.conv3d_transpose", v1=[])
-def conv3d_transpose_v2(
-    input,
-    filters,
-    output_shape,
-    strides,
-    padding="SAME",
-    data_format="NDHWC",
-    name=None):
-  return conv3d_transpose(
-      input,
-      filters,
-      output_shape,
-      strides,
-      padding=padding,
-      data_format=data_format,
-      name=name)
-# pylint: enable=redefined-builtin
-conv3d_transpose_v2.__doc__ = deprecation.rewrite_argument_docstring(
-    deprecation.rewrite_argument_docstring(
-        conv3d_transpose.__doc__, "filter", "filters"),
-    "value", "input")
+CONV_TRANSPOSE_OPS = (
+    conv1d_transpose,
+    conv2d_transpose_v2,
+    conv3d_transpose_v2,
+)
+
+
+@tf_export("nn.conv_transpose")
+def conv_transpose(input,  # pylint: disable=redefined-builtin
+                   filters,
+                   output_shape,
+                   strides,
+                   padding="SAME",
+                   data_format=None,
+                   dilations=None,
+                   name=None):
+  """The transpose of `convolution`.
+
+  This operation is sometimes called "deconvolution" after [Deconvolutional
+  Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf), but is
+  actually the transpose (gradient) of `convolution` rather than an actual
+  deconvolution.
+
+  Args:
+    input: An N+2 dimensional `Tensor` of shape
+      `[batch_size] + input_spatial_shape + [in_channels]` if data_format does
+      not start with "NC" (default), or
+      `[batch_size, in_channels] + input_spatial_shape` if data_format starts
+      with "NC". It must be one of the following types:
+      `half`, `bfloat16`, `float32`, `float64`.
+    filters: An N+2 dimensional `Tensor` with the same type as `input` and
+      shape `spatial_filter_shape + [in_channels, out_channels]`.
+    output_shape: A 1-D `Tensor` representing the output shape of the
+      deconvolution op.
+    strides: An int or list of `ints` that has length `1`, `N` or `N+2`.  The
+      stride of the sliding window for each dimension of `input`. If a single
+      value is given it is replicated in the spatial dimensions. By default
+      the `N` and `C` dimensions are set to 0. The dimension order is determined
+      by the value of `data_format`, see below for details.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm. See
+      the "returns" section of `tf.nn.convolution` for details.
+    data_format: A string or None.  Specifies whether the channel dimension of
+      the `input` and output is the last dimension (default, or if `data_format`
+      does not start with "NC"), or the second dimension (if `data_format`
+      starts with "NC").  For N=1, the valid values are "NWC" (default) and
+      "NCW".  For N=2, the valid values are "NHWC" (default) and "NCHW".
+      For N=3, the valid values are "NDHWC" (default) and "NCDHW".
+    dilations: An int or list of `ints` that has length `1`, `N` or `N+2`,
+      defaults to 1. The dilation factor for each dimension of`input`. If a
+      single value is given it is replicated in the spatial dimensions. By
+      default the `N` and `C` dimensions are set to 1. If set to k > 1, there
+      will be k-1 skipped cells between each filter element on that dimension.
+      The dimension order is determined by the value of `data_format`, see above
+      for details.
+    name: A name for the operation (optional). If not specified "conv_transpose"
+      is used.
+
+  Returns:
+    A `Tensor` with the same type as `value`.
+  """
+  with ops.name_scope(name, "conv_transpose",
+                      [input, filter, output_shape]) as name:
+    if output_shape is not None:
+      n = len(output_shape) - 2
+    else:
+      raise ValueError("output_shape cannot be None")
+
+    if not 1 <= n <= 3:
+      raise ValueError(
+          "output_shape must be of length 3, 4 or 5 but was {}.".format(n + 2))
+
+    op = CONV_TRANSPOSE_OPS[n-1]
+    return op(
+        input,
+        filters,
+        output_shape,
+        strides,
+        padding=padding,
+        data_format=data_format,
+        dilations=dilations,
+        name=name)
 
 
 @tf_export("nn.bias_add")
@@ -1967,13 +2632,21 @@ def bias_add(value, bias, data_format=None, name=None):
     bias: A 1-D `Tensor` with size matching the last dimension of `value`.
       Must be the same type as `value` unless `value` is a quantized type,
       in which case a different quantized type may be used.
-    data_format: A string. 'NHWC' and 'NCHW' are supported.
+    data_format: A string. 'N...C' and 'NC...' are supported.
     name: A name for the operation (optional).
 
   Returns:
     A `Tensor` with the same type as `value`.
   """
   with ops.name_scope(name, "BiasAdd", [value, bias]) as name:
+    if data_format is not None:
+      if data_format.startswith("NC"):
+        data_format = "NCHW"
+      elif data_format.startswith("N") and data_format.endswith("C"):
+        data_format = "NHWC"
+      else:
+        raise ValueError("data_format must be of the form `N...C` or `NC...`")
+
     if not context.executing_eagerly():
       value = ops.convert_to_tensor(value, name="input")
       bias = ops.convert_to_tensor(bias, dtype=value.dtype, name="bias")
@@ -2082,7 +2755,7 @@ def leaky_relu(features, alpha=0.2, name=None):
       features = math_ops.to_float(features)
     if compat.forward_compatible(2018, 11, 1):
       if isinstance(alpha, np.ndarray):
-        alpha = np.asscalar(alpha)
+        alpha = alpha.item()
       return gen_nn_ops.leaky_relu(features, alpha=alpha, name=name)
     alpha = ops.convert_to_tensor(alpha, dtype=features.dtype, name="alpha")
     return math_ops.maximum(alpha * features, features, name=name)
@@ -2151,7 +2824,7 @@ def _softmax(logits, compute_op, dim=-1, name=None):
 
   # We need its original shape for shape inference.
   shape = logits.get_shape()
-  is_last_dim = (dim is -1) or (dim == shape.ndims - 1)
+  is_last_dim = (dim == -1) or (dim == shape.ndims - 1)
 
   if is_last_dim:
     return compute_op(logits, name=name)
@@ -2159,7 +2832,7 @@ def _softmax(logits, compute_op, dim=-1, name=None):
   dim_val = dim
   if isinstance(dim, ops.Tensor):
     dim_val = tensor_util.constant_value(dim)
-  if dim_val is not None and (dim_val < -shape.ndims or dim_val >= shape.ndims):
+  if dim_val is not None and not -shape.ndims <= dim_val < shape.ndims:
     raise errors_impl.InvalidArgumentError(
         None, None,
         "Dimension (%d) must be in the range [%d, %d) where %d is the number of"
@@ -2169,6 +2842,14 @@ def _softmax(logits, compute_op, dim=-1, name=None):
   # If dim is not the last dimension, we have to do a transpose so that we can
   # still perform softmax on its last dimension.
 
+  # In case dim is negative (and is not last dimension -1), add shape.ndims
+  ndims = array_ops.rank(logits)
+  if not isinstance(dim, ops.Tensor):
+    if dim < 0:
+      dim += ndims
+  else:
+    dim = array_ops.where(math_ops.less(dim, 0), dim + ndims, dim)
+
   # Swap logits' dimension of dim and its last dimension.
   input_rank = array_ops.rank(logits)
   dim_axis = dim % shape.ndims
@@ -2496,7 +3177,8 @@ def softmax_cross_entropy_with_logits(
     labels=None,
     logits=None,
     dim=-1,
-    name=None):
+    name=None,
+    axis=None):
   """Computes softmax cross entropy between `logits` and `labels`.
 
   Measures the probability error in discrete classification tasks in which the
@@ -2536,12 +3218,14 @@ def softmax_cross_entropy_with_logits(
     logits: Unscaled log probabilities.
     dim: The class dimension. Defaulted to -1 which is the last dimension.
     name: A name for the operation (optional).
+    axis: Alias for dim.
 
   Returns:
     A `Tensor` that contains the softmax cross entropy loss. Its type is the
     same as `logits` and its shape is the same as `labels` except that it does
     not have the last dimension of `labels`.
   """
+  dim = deprecated_argument_lookup("axis", axis, "dim", dim)
   _ensure_xent_args("softmax_cross_entropy_with_logits", _sentinel, labels,
                     logits)
 
@@ -2677,51 +3361,397 @@ def sparse_softmax_cross_entropy_with_logits(
         return cost
 
 
-@tf_export("nn.avg_pool")
-def avg_pool(value, ksize, strides, padding, data_format="NHWC", name=None):
-  """Performs the average pooling on the input.
+@tf_export("nn.avg_pool", v1=["nn.avg_pool_v2"])
+def avg_pool_v2(input, ksize, strides, padding, data_format=None, name=None):  # pylint: disable=redefined-builtin
+  """Performs the avg pooling on the input.
+
+  Each entry in `output` is the mean of the corresponding size `ksize`
+  window in `value`.
+
+  Args:
+    input:  Tensor of rank N+2, of shape `[batch_size] + input_spatial_shape +
+      [num_channels]` if `data_format` does not start with "NC" (default), or
+      `[batch_size, num_channels] + input_spatial_shape` if data_format starts
+      with "NC". Pooling happens over the spatial dimensions only.
+    ksize: An int or list of `ints` that has length `1`, `N` or `N+2`. The size
+      of the window for each dimension of the input tensor.
+    strides: An int or list of `ints` that has length `1`, `N` or `N+2`. The
+      stride of the sliding window for each dimension of the input tensor.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm. See
+      the "returns" section of `tf.nn.convolution` for details.
+    data_format: A string. Specifies the channel dimension. For N=1 it can be
+      either "NWC" (default) or "NCW", for N=2 it can be either "NHWC" (default)
+      or "NCHW" and for N=3 either "NDHWC" (default) or "NCDHW".
+    name: Optional name for the operation.
+
+  Returns:
+    A `Tensor` of format specified by `data_format`.
+    The average pooled output tensor.
+  """
+  if input.shape is not None:
+    n = len(input.shape) - 2
+  elif data_format is not None:
+    n = len(data_format) - 2
+  else:
+    raise ValueError(
+        "The input must have a rank or a data format must be given.")
+  if not 1 <= n <= 3:
+    raise ValueError(
+        "Input tensor must be of rank 3, 4 or 5 but was {}.".format(n + 2))
+
+  if data_format is None:
+    channel_index = n + 1
+  else:
+    channel_index = 1 if data_format.startswith("NC") else n + 1
+
+  ksize = _get_sequence(ksize, n, channel_index, "ksize")
+  strides = _get_sequence(strides, n, channel_index, "strides")
+
+  avg_pooling_ops = {
+      1: avg_pool1d,
+      2: gen_nn_ops.avg_pool,
+      3: gen_nn_ops.avg_pool3d
+  }
+
+  op = avg_pooling_ops[n]
+  return op(
+      input,
+      ksize=ksize,
+      strides=strides,
+      padding=padding,
+      data_format=data_format,
+      name=name)
+
+
+@tf_export(v1=["nn.avg_pool", "nn.avg_pool2d"])
+def avg_pool(value, ksize, strides, padding, data_format="NHWC",
+             name=None, input=None):  # pylint: disable=redefined-builtin
+  """Performs the average pooling on the input.
+
+  Each entry in `output` is the mean of the corresponding size `ksize`
+  window in `value`.
+
+  Args:
+    value: A 4-D `Tensor` of shape `[batch, height, width, channels]` and type
+      `float32`, `float64`, `qint8`, `quint8`, or `qint32`.
+    ksize: An int or list of `ints` that has length `1`, `2` or `4`. The size of
+      the window for each dimension of the input tensor.
+    strides: An int or list of `ints` that has length `1`, `2` or `4`. The
+      stride of the sliding window for each dimension of the input tensor.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
+      See the "returns" section of `tf.nn.convolution` for details.
+    data_format: A string. 'NHWC' and 'NCHW' are supported.
+    name: Optional name for the operation.
+    input: Alias for value.
+
+  Returns:
+    A `Tensor` with the same type as `value`.  The average pooled output tensor.
+  """
+  with ops.name_scope(name, "AvgPool", [value]) as name:
+    value = deprecation.deprecated_argument_lookup(
+        "input", input, "value", value)
+
+    if data_format is None:
+      data_format = "NHWC"
+    channel_index = 1 if data_format.startswith("NC") else 3
+
+    ksize = _get_sequence(ksize, 2, channel_index, "ksize")
+    strides = _get_sequence(strides, 2, channel_index, "strides")
+
+    return gen_nn_ops.avg_pool(
+        value,
+        ksize=ksize,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        name=name)
+
+
+@tf_export("nn.avg_pool2d", v1=[])
+def avg_pool2d(input, ksize, strides, padding, data_format="NHWC", name=None):  # pylint: disable=redefined-builtin
+  """Performs the average pooling on the input.
+
+  Each entry in `output` is the mean of the corresponding size `ksize`
+  window in `value`.
+
+  Args:
+    input: A 4-D `Tensor` of shape `[batch, height, width, channels]` and type
+      `float32`, `float64`, `qint8`, `quint8`, or `qint32`.
+    ksize: An int or list of `ints` that has length `1`, `2` or `4`. The size of
+      the window for each dimension of the input tensor.
+    strides: An int or list of `ints` that has length `1`, `2` or `4`. The
+      stride of the sliding window for each dimension of the input tensor.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
+      See the "returns" section of `tf.nn.convolution` for details.
+    data_format: A string. 'NHWC' and 'NCHW' are supported.
+    name: Optional name for the operation.
+
+  Returns:
+    A `Tensor` with the same type as `value`.  The average pooled output tensor.
+  """
+  with ops.name_scope(name, "AvgPool2D", [input]) as name:
+    if data_format is None:
+      data_format = "NHWC"
+    channel_index = 1 if data_format.startswith("NC") else 3
+
+    ksize = _get_sequence(ksize, 2, channel_index, "ksize")
+    strides = _get_sequence(strides, 2, channel_index, "strides")
+
+    return gen_nn_ops.avg_pool(
+        input,
+        ksize=ksize,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        name=name)
+
+
+@tf_export("nn.avg_pool1d")
+def avg_pool1d(input, ksize, strides, padding, data_format="NWC", name=None):  # pylint: disable=redefined-builtin
+  """Performs the average pooling on the input.
+
+  Each entry in `output` is the mean of the corresponding size `ksize`
+  window in `value`.
+
+  Note internally this op reshapes and uses the underlying 2d operation.
+
+  Args:
+    input: A 3-D `Tensor` of the format specified by `data_format`.
+    ksize: An int or list of `ints` that has length `1` or `3`. The size of the
+      window for each dimension of the input tensor.
+    strides: An int or list of `ints` that has length `1` or `3`. The stride of
+      the sliding window for each dimension of the input tensor.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm. See
+      the "returns" section of `tf.nn.convolution` for details.
+    data_format: An optional string from: "NWC", "NCW". Defaults to "NWC".
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` of format specified by `data_format`.
+    The max pooled output tensor.
+  """
+  with ops.name_scope(name, "AvgPool1D", [input]) as name:
+    if data_format is None:
+      data_format = "NWC"
+    channel_index = 1 if data_format.startswith("NC") else 2
+    ksize = [1] + _get_sequence(ksize, 1, channel_index, "ksize")
+    strides = [1] + _get_sequence(strides, 1, channel_index, "strides")
+
+    data_format = "NHWC" if data_format == "NWC" else "NCHW"
+    expanding_dim = 1 if data_format == "NWC" else 2
+
+    input = array_ops.expand_dims_v2(input, expanding_dim)
+    result = gen_nn_ops.avg_pool(
+        input,
+        ksize=ksize,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        name=name)
+    return array_ops.squeeze(result, expanding_dim)
+
+
+@tf_export("nn.avg_pool3d")
+def avg_pool3d(input, ksize, strides, padding, data_format="NDHWC", name=None):  # pylint: disable=redefined-builtin
+  """Performs the average pooling on the input.
+
+  Each entry in `output` is the mean of the corresponding size `ksize`
+  window in `value`.
+
+  Args:
+    input: A 5-D `Tensor` of shape `[batch, height, width, channels]` and type
+      `float32`, `float64`, `qint8`, `quint8`, or `qint32`.
+    ksize: An int or list of `ints` that has length `1`, `3` or `5`. The size of
+      the window for each dimension of the input tensor.
+    strides: An int or list of `ints` that has length `1`, `3` or `5`. The
+      stride of the sliding window for each dimension of the input tensor.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
+      See the "returns" section of `tf.nn.convolution` for details.
+    data_format: A string. 'NDHWC' and 'NCDHW' are supported.
+    name: Optional name for the operation.
+
+  Returns:
+    A `Tensor` with the same type as `value`.  The average pooled output tensor.
+  """
+  with ops.name_scope(name, "AvgPool3D", [input]) as name:
+    if data_format is None:
+      data_format = "NDHWC"
+    channel_index = 1 if data_format.startswith("NC") else 3
+
+    ksize = _get_sequence(ksize, 3, channel_index, "ksize")
+    strides = _get_sequence(strides, 3, channel_index, "strides")
+
+    return gen_nn_ops.avg_pool3d(
+        input,
+        ksize=ksize,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        name=name)
+
+
+# pylint: disable=redefined-builtin
+@tf_export("nn.max_pool", v1=["nn.max_pool_v2"])
+def max_pool_v2(input, ksize, strides, padding, data_format=None, name=None):
+  """Performs the max pooling on the input.
+
+  Args:
+    input:  Tensor of rank N+2, of shape `[batch_size] + input_spatial_shape +
+      [num_channels]` if `data_format` does not start with "NC" (default), or
+      `[batch_size, num_channels] + input_spatial_shape` if data_format starts
+      with "NC". Pooling happens over the spatial dimensions only.
+    ksize: An int or list of `ints` that has length `1`, `N` or `N+2`. The size
+      of the window for each dimension of the input tensor.
+    strides: An int or list of `ints` that has length `1`, `N` or `N+2`. The
+      stride of the sliding window for each dimension of the input tensor.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm. See
+      the "returns" section of `tf.nn.convolution` for details.
+    data_format: A string. Specifies the channel dimension. For N=1 it can be
+      either "NWC" (default) or "NCW", for N=2 it can be either "NHWC" (default)
+      or "NCHW" and for N=3 either "NDHWC" (default) or "NCDHW".
+    name: Optional name for the operation.
+
+  Returns:
+    A `Tensor` of format specified by `data_format`.
+    The max pooled output tensor.
+  """
+  if input.shape is not None:
+    n = len(input.shape) - 2
+  elif data_format is not None:
+    n = len(data_format) - 2
+  else:
+    raise ValueError(
+        "The input must have a rank or a data format must be given.")
+  if not 1 <= n <= 3:
+    raise ValueError(
+        "Input tensor must be of rank 3, 4 or 5 but was {}.".format(n + 2))
+
+  if data_format is None:
+    channel_index = n + 1
+  else:
+    channel_index = 1 if data_format.startswith("NC") else n + 1
+
+  ksize = _get_sequence(ksize, n, channel_index, "ksize")
+  strides = _get_sequence(strides, n, channel_index, "strides")
+
+  max_pooling_ops = {
+      1: max_pool1d,
+      2: gen_nn_ops.max_pool,
+      3: gen_nn_ops.max_pool3d
+  }
+
+  op = max_pooling_ops[n]
+  return op(
+      input,
+      ksize=ksize,
+      strides=strides,
+      padding=padding,
+      data_format=data_format,
+      name=name)
+# pylint: enable=redefined-builtin
+
+
+@tf_export(v1=["nn.max_pool"])
+def max_pool(value,
+             ksize,
+             strides,
+             padding,
+             data_format="NHWC",
+             name=None,
+             input=None):  # pylint: disable=redefined-builtin
+  """Performs the max pooling on the input.
+
+  Args:
+    value: A 4-D `Tensor` of the format specified by `data_format`.
+    ksize: An int or list of `ints` that has length `1`, `2` or `4`.
+      The size of the window for each dimension of the input tensor.
+    strides: An int or list of `ints` that has length `1`, `2` or `4`.
+      The stride of the sliding window for each dimension of the input tensor.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
+      See the "returns" section of `tf.nn.convolution` for details.
+    data_format: A string. 'NHWC', 'NCHW' and 'NCHW_VECT_C' are supported.
+    name: Optional name for the operation.
+    input: Alias for value.
+
+  Returns:
+    A `Tensor` of format specified by `data_format`.
+    The max pooled output tensor.
+  """
+  value = deprecation.deprecated_argument_lookup("input", input, "value", value)
+  with ops.name_scope(name, "MaxPool", [value]) as name:
+    if data_format is None:
+      data_format = "NHWC"
+    channel_index = 1 if data_format.startswith("NC") else 3
+
+    ksize = _get_sequence(ksize, 2, channel_index, "ksize")
+    strides = _get_sequence(strides, 2, channel_index, "strides")
+
+    return gen_nn_ops.max_pool(
+        value,
+        ksize=ksize,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        name=name)
+
+
+# pylint: disable=redefined-builtin
+@tf_export("nn.max_pool1d")
+def max_pool1d(input, ksize, strides, padding, data_format="NWC", name=None):
+  """Performs the max pooling on the input.
 
-  Each entry in `output` is the mean of the corresponding size `ksize`
-  window in `value`.
+  Note internally this op reshapes and uses the underlying 2d operation.
 
   Args:
-    value: A 4-D `Tensor` of shape `[batch, height, width, channels]` and type
-      `float32`, `float64`, `qint8`, `quint8`, or `qint32`.
-    ksize: A list or tuple of 4 ints. The size of the window for each dimension
-      of the input tensor.
-    strides: A list or tuple of 4 ints. The stride of the sliding window for
-      each dimension of the input tensor.
-    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
-      See the "returns" section of `tf.nn.convolution` for details.
-    data_format: A string. 'NHWC' and 'NCHW' are supported.
-    name: Optional name for the operation.
+    input: A 3-D `Tensor` of the format specified by `data_format`.
+    ksize: An int or list of `ints` that has length `1` or `3`. The size of the
+      window for each dimension of the input tensor.
+    strides: An int or list of `ints` that has length `1` or `3`. The stride of
+      the sliding window for each dimension of the input tensor.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm. See
+      the "returns" section of `tf.nn.convolution` for details.
+    data_format: An optional string from: "NWC", "NCW". Defaults to "NWC".
+    name: A name for the operation (optional).
 
   Returns:
-    A `Tensor` with the same type as `value`.  The average pooled output tensor.
+    A `Tensor` of format specified by `data_format`.
+    The max pooled output tensor.
   """
-  with ops.name_scope(name, "AvgPool", [value]) as name:
-    value = ops.convert_to_tensor(value, name="input")
-    return gen_nn_ops.avg_pool(
-        value,
+  with ops.name_scope(name, "MaxPool1d", [input]) as name:
+    if data_format is None:
+      data_format = "NWC"
+    channel_index = 1 if data_format.startswith("NC") else 2
+    ksize = [1] + _get_sequence(ksize, 1, channel_index, "ksize")
+    strides = [1] + _get_sequence(strides, 1, channel_index, "strides")
+
+    data_format = "NHWC" if data_format == "NWC" else "NCHW"
+    expanding_dim = 1 if data_format == "NWC" else 2
+
+    input = array_ops.expand_dims_v2(input, expanding_dim)
+    result = gen_nn_ops.max_pool(
+        input,
         ksize=ksize,
         strides=strides,
         padding=padding,
         data_format=data_format,
         name=name)
+    return array_ops.squeeze(result, expanding_dim)
+# pylint: enable=redefined-builtin
 
 
-@tf_export("nn.max_pool")
-def max_pool(value, ksize, strides, padding, data_format="NHWC", name=None):
+# pylint: disable=redefined-builtin
+@tf_export("nn.max_pool2d")
+def max_pool2d(input, ksize, strides, padding, data_format="NHWC", name=None):
   """Performs the max pooling on the input.
 
   Args:
-    value: A 4-D `Tensor` of the format specified by `data_format`.
-    ksize: A list or tuple of 4 ints. The size of the window for each dimension
-      of the input tensor.
-    strides: A list or tuple of 4 ints. The stride of the sliding window for
-      each dimension of the input tensor.
-    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
-      See the "returns" section of `tf.nn.convolution` for details.
+    input: A 4-D `Tensor` of the format specified by `data_format`.
+    ksize: An int or list of `ints` that has length `1`, `2` or `4`. The size of
+      the window for each dimension of the input tensor.
+    strides: An int or list of `ints` that has length `1`, `2` or `4`. The
+      stride of the sliding window for each dimension of the input tensor.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm. See
+      the "returns" section of `tf.nn.convolution` for details.
     data_format: A string. 'NHWC', 'NCHW' and 'NCHW_VECT_C' are supported.
     name: Optional name for the operation.
 
@@ -2729,20 +3759,69 @@ def max_pool(value, ksize, strides, padding, data_format="NHWC", name=None):
     A `Tensor` of format specified by `data_format`.
     The max pooled output tensor.
   """
-  with ops.name_scope(name, "MaxPool", [value]) as name:
-    value = ops.convert_to_tensor(value, name="input")
+  with ops.name_scope(name, "MaxPool2d", [input]) as name:
+    if data_format is None:
+      data_format = "NHWC"
+    channel_index = 1 if data_format.startswith("NC") else 3
+
+    ksize = _get_sequence(ksize, 2, channel_index, "ksize")
+    strides = _get_sequence(strides, 2, channel_index, "strides")
+
     return gen_nn_ops.max_pool(
-        value,
+        input,
         ksize=ksize,
         strides=strides,
         padding=padding,
         data_format=data_format,
         name=name)
+# pylint: enable=redefined-builtin
 
 
 # pylint: disable=redefined-builtin
+@tf_export("nn.max_pool3d")
+def max_pool3d(input, ksize, strides, padding, data_format="NDHWC", name=None):
+  """Performs the max pooling on the input.
+
+  Args:
+    input: A 5-D `Tensor` of the format specified by `data_format`.
+    ksize: An int or list of `ints` that has length `1`, `3` or `5`. The size of
+      the window for each dimension of the input tensor.
+    strides: An int or list of `ints` that has length `1`, `3` or `5`. The
+      stride of the sliding window for each dimension of the input tensor.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm. See
+      the "returns" section of `tf.nn.convolution` for details.
+    data_format: An optional string from: "NDHWC", "NCDHW". Defaults to "NDHWC".
+      The data format of the input and output data. With the default format
+      "NDHWC", the data is stored in the order of: [batch, in_depth, in_height,
+        in_width, in_channels]. Alternatively, the format could be "NCDHW", the
+      data storage order is: [batch, in_channels, in_depth, in_height,
+        in_width].
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` of format specified by `data_format`.
+    The max pooled output tensor.
+  """
+  with ops.name_scope(name, "MaxPool3D", [input]) as name:
+    if data_format is None:
+      data_format = "NDHWC"
+    channel_index = 1 if data_format.startswith("NC") else 4
+
+    ksize = _get_sequence(ksize, 3, channel_index, "ksize")
+    strides = _get_sequence(strides, 3, channel_index, "strides")
+
+    return gen_nn_ops.max_pool3d(
+        input,
+        ksize=ksize,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        name=name)
+# pylint: enable=redefined-builtin
+
+
 @tf_export("nn.max_pool_with_argmax", v1=[])
-def max_pool_with_argmax_v2(input,
+def max_pool_with_argmax_v2(input,  # pylint: disable=redefined-builtin
                             ksize,
                             strides,
                             padding,
@@ -2766,9 +3845,9 @@ def max_pool_with_argmax_v2(input,
       `int32`, `uint8`, `int16`, `int8`, `int64`, `bfloat16`, `uint16`, `half`,
       `uint32`, `uint64`.
       4-D with shape `[batch, height, width, channels]`.  Input to pool over.
-    ksize: A list of `ints` that has length `>= 4`.
+    ksize: An int or list of `ints` that has length `1`, `2` or `4`.
       The size of the window for each dimension of the input tensor.
-    strides: A list of `ints` that has length `>= 4`.
+    strides: An int or list of `ints` that has length `1`, `2` or `4`.
       The stride of the sliding window for each dimension of the
       input tensor.
     padding: A `string` from: `"SAME", "VALID"`.
@@ -2791,6 +3870,9 @@ def max_pool_with_argmax_v2(input,
   if data_format != "NHWC":
     raise ValueError("Data formats other than 'NHWC' are not yet supported")
 
+  ksize = _get_sequence(ksize, 2, 3, "ksize")
+  strides = _get_sequence(strides, 2, 3, "strides")
+
   return gen_nn_ops.max_pool_with_argmax(input=input,
                                          ksize=ksize,
                                          strides=strides,
@@ -2798,7 +3880,27 @@ def max_pool_with_argmax_v2(input,
                                          Targmax=output_dtype,
                                          name=name)
 
-# pylint: enable=redefined-builtin
+
+@tf_export(v1=["nn.max_pool_with_argmax"])
+def max_pool_with_argmax_v1(input,  # pylint: disable=missing-docstring,redefined-builtin,invalid-name
+                            ksize,
+                            strides,
+                            padding,
+                            data_format="NHWC",
+                            Targmax=None,
+                            name=None,
+                            output_dtype=None):
+  if data_format != "NHWC":
+    raise ValueError("Data formats other than 'NHWC' are not yet supported")
+
+  Targmax = deprecated_argument_lookup(
+      "output_dtype", output_dtype, "Targmax", Targmax)
+  if Targmax is None:
+    Targmax = dtypes.int64
+  return gen_nn_ops.max_pool_with_argmax(
+      input, ksize, strides, padding, Targmax, name)
+
+max_pool_with_argmax_v1.__doc__ = gen_nn_ops.max_pool_with_argmax.__doc__
 
 
 @ops.RegisterStatistics("Conv2D", "flops")
@@ -2868,7 +3970,7 @@ def xw_plus_b(x, weights, biases, name=None):  # pylint: disable=invalid-name
     return bias_add(mm, biases, name=name)
 
 
-def xw_plus_b_v1(x, weights, biases, name=None):  # pylint: disable=invalid-name
+def xw_plus_b_v1(x, weights, biases, name=None):
   """Computes matmul(x, weights) + biases.
 
   This is a deprecated version of that will soon be removed.
@@ -2922,7 +4024,7 @@ def _get_noise_shape(x, noise_shape):
                              "Rate should be set to `rate = 1 - keep_prob`.",
                              "keep_prob")
 def dropout(x, keep_prob=None, noise_shape=None, seed=None, name=None,
-            rate=None):  # pylint: disable=invalid-name
+            rate=None):
   """Computes dropout.
 
   For each element of `x`, with probability `rate`, outputs `0`, and otherwise
@@ -2972,7 +4074,7 @@ def dropout(x, keep_prob=None, noise_shape=None, seed=None, name=None,
 
 
 @tf_export("nn.dropout", v1=[])
-def dropout_v2(x, rate, noise_shape=None, seed=None, name=None):  # pylint: disable=invalid-name
+def dropout_v2(x, rate, noise_shape=None, seed=None, name=None):
   """Computes dropout.
 
   With probability `rate`, drops elements of `x`. Input that are kept are
@@ -3032,15 +4134,19 @@ def dropout_v2(x, rate, noise_shape=None, seed=None, name=None):  # pylint: disa
         return x
 
     noise_shape = _get_noise_shape(x, noise_shape)
-
-    keep_prob = 1 - rate
-    # uniform [keep_prob, 1.0 + keep_prob)
-    random_tensor = keep_prob
-    random_tensor += random_ops.random_uniform(
+    # Sample a uniform distribution on [0.0, 1.0) and select values larger than
+    # rate.
+    #
+    # NOTE: Random uniform actually can only generate 2^23 floats on [1.0, 2.0)
+    # and subtract 1.0.
+    random_tensor = random_ops.random_uniform(
         noise_shape, seed=seed, dtype=x.dtype)
-    # 0. if [keep_prob, 1.0) and 1. if [1.0, 1.0 + keep_prob)
-    binary_tensor = math_ops.floor(random_tensor)
-    ret = math_ops.divide(x, keep_prob) * binary_tensor
+    keep_prob = 1 - rate
+    scale = 1 / keep_prob
+    # NOTE: if (1.0 + rate) - 1 is equal to rate, then we want to consider that
+    # float to be selected, hence we use a >= comparison.
+    keep_mask = random_tensor >= rate
+    ret = x * scale * math_ops.cast(keep_mask, x.dtype)
     if not context.executing_eagerly():
       ret.set_shape(x.get_shape())
     return ret
@@ -3229,12 +4335,12 @@ def fractional_max_pool_v2(value,
 
   Args:
     value: A `Tensor`. 4-D with shape `[batch, height, width, channels]`.
-    pooling_ratio: A list of `floats` that has length >= 4.  Pooling ratio for
-      each dimension of `value`, currently only supports row and col dimension
-      and should be >= 1.0. For example, a valid pooling ratio looks like [1.0,
-      1.44, 1.73, 1.0]. The first and last elements must be 1.0 because we don't
-      allow pooling on batch and channels dimensions.  1.44 and 1.73 are pooling
-      ratio on height and width dimensions respectively.
+    pooling_ratio: An int or list of `ints` that has length `1`, `2` or `4`.
+      Pooling ratio for each dimension of `value`, currently only supports row
+      and col dimension and should be >= 1.0. For example, a valid pooling ratio
+      looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements must be 1.0
+      because we don't allow pooling on batch and channels dimensions.  1.44 and
+      1.73 are pooling ratio on height and width dimensions respectively.
     pseudo_random: An optional `bool`.  Defaults to `False`. When set to `True`,
       generates the pooling sequence in a pseudorandom fashion, otherwise, in a
       random fashion. Check paper [Benjamin Graham, Fractional
@@ -3260,6 +4366,8 @@ def fractional_max_pool_v2(value,
     row_pooling_sequence: A `Tensor` of type `int64`.
     col_pooling_sequence: A `Tensor` of type `int64`.
   """
+  pooling_ratio = _get_sequence(pooling_ratio, 2, 3, "pooling_ratio")
+
   if seed == 0:
     return gen_nn_ops.fractional_max_pool(value, pooling_ratio, pseudo_random,
                                           overlapping, deterministic=False,
@@ -3390,248 +4498,6 @@ def fractional_avg_pool_v2(value,
                                           seed=seed1, seed2=seed2, name=name)
 
 
-@tf_export(v1=["nn.conv1d"])
-@deprecation.deprecated_arg_values(
-    None,
-    "`NCHW` for data_format is deprecated, use `NCW` instead",
-    warn_once=True,
-    data_format="NCHW")
-@deprecation.deprecated_arg_values(
-    None,
-    "`NHWC` for data_format is deprecated, use `NWC` instead",
-    warn_once=True,
-    data_format="NHWC")
-def conv1d(value,
-           filters,
-           stride,
-           padding,
-           use_cudnn_on_gpu=None,
-           data_format=None,
-           name=None):
-  r"""Computes a 1-D convolution given 3-D input and filter tensors.
-
-  Given an input tensor of shape
-    [batch, in_width, in_channels]
-  if data_format is "NWC", or
-    [batch, in_channels, in_width]
-  if data_format is "NCW",
-  and a filter / kernel tensor of shape
-  [filter_width, in_channels, out_channels], this op reshapes
-  the arguments to pass them to conv2d to perform the equivalent
-  convolution operation.
-
-  Internally, this op reshapes the input tensors and invokes `tf.nn.conv2d`.
-  For example, if `data_format` does not start with "NC", a tensor of shape
-    [batch, in_width, in_channels]
-  is reshaped to
-    [batch, 1, in_width, in_channels],
-  and the filter is reshaped to
-    [1, filter_width, in_channels, out_channels].
-  The result is then reshaped back to
-    [batch, out_width, out_channels]
-  \(where out_width is a function of the stride and padding as in conv2d\) and
-  returned to the caller.
-
-  Args:
-    value: A 3D `Tensor`.  Must be of type `float16`, `float32`, or `float64`.
-    filters: A 3D `Tensor`.  Must have the same type as `value`.
-    stride: An `integer`.  The number of entries by which
-      the filter is moved right at each step.
-    padding: 'SAME' or 'VALID'
-    use_cudnn_on_gpu: An optional `bool`.  Defaults to `True`.
-    data_format: An optional `string` from `"NWC", "NCW"`.  Defaults
-      to `"NWC"`, the data is stored in the order of
-      [batch, in_width, in_channels].  The `"NCW"` format stores
-      data as [batch, in_channels, in_width].
-    name: A name for the operation (optional).
-
-  Returns:
-    A `Tensor`.  Has the same type as input.
-
-  Raises:
-    ValueError: if `data_format` is invalid.
-  """
-  with ops.name_scope(name, "conv1d", [value, filters]) as name:
-    # Reshape the input tensor to [batch, 1, in_width, in_channels]
-    if data_format is None or data_format == "NHWC" or data_format == "NWC":
-      data_format = "NHWC"
-      spatial_start_dim = 1
-      strides = [1, 1, stride, 1]
-    elif data_format == "NCHW" or data_format == "NCW":
-      data_format = "NCHW"
-      spatial_start_dim = 2
-      strides = [1, 1, 1, stride]
-    else:
-      raise ValueError("data_format must be \"NWC\" or \"NCW\".")
-    value = array_ops.expand_dims(value, spatial_start_dim)
-    filters = array_ops.expand_dims(filters, 0)
-    result = gen_nn_ops.conv2d(
-        value,
-        filters,
-        strides,
-        padding,
-        use_cudnn_on_gpu=use_cudnn_on_gpu,
-        data_format=data_format)
-    return array_ops.squeeze(result, [spatial_start_dim])
-
-
-@tf_export("nn.conv1d", v1=[])
-def conv1d_v2(input,  # pylint: disable=redefined-builtin
-              filters,
-              stride,
-              padding,
-              data_format=None,
-              name=None):
-  r"""Computes a 1-D convolution given 3-D input and filter tensors.
-
-  Given an input tensor of shape
-    [batch, in_width, in_channels]
-  if data_format is "NWC", or
-    [batch, in_channels, in_width]
-  if data_format is "NCW",
-  and a filter / kernel tensor of shape
-  [filter_width, in_channels, out_channels], this op reshapes
-  the arguments to pass them to conv2d to perform the equivalent
-  convolution operation.
-
-  Internally, this op reshapes the input tensors and invokes `tf.nn.conv2d`.
-  For example, if `data_format` does not start with "NC", a tensor of shape
-    [batch, in_width, in_channels]
-  is reshaped to
-    [batch, 1, in_width, in_channels],
-  and the filter is reshaped to
-    [1, filter_width, in_channels, out_channels].
-  The result is then reshaped back to
-    [batch, out_width, out_channels]
-  \(where out_width is a function of the stride and padding as in conv2d\) and
-  returned to the caller.
-
-  Args:
-    input: A 3D `Tensor`.  Must be of type `float16`, `float32`, or `float64`.
-    filters: A 3D `Tensor`.  Must have the same type as `input`.
-    stride: An `integer`.  The number of entries by which
-      the filter is moved right at each step.
-    padding: 'SAME' or 'VALID'
-    data_format: An optional `string` from `"NWC", "NCW"`.  Defaults
-      to `"NWC"`, the data is stored in the order of
-      [batch, in_width, in_channels].  The `"NCW"` format stores
-      data as [batch, in_channels, in_width].
-    name: A name for the operation (optional).
-
-  Returns:
-    A `Tensor`.  Has the same type as input.
-
-  Raises:
-    ValueError: if `data_format` is invalid.
-  """
-  return conv1d(input,  # pylint: disable=redefined-builtin
-                filters,
-                stride,
-                padding,
-                use_cudnn_on_gpu=True,
-                data_format=data_format,
-                name=name)
-
-
-def conv1d_transpose(
-    value,
-    filter,  # pylint: disable=redefined-builtin
-    output_shape,
-    stride,
-    padding="SAME",
-    data_format="NWC",
-    name=None):
-  """The transpose of `conv1d`.
-
-  This operation is sometimes called "deconvolution" after [Deconvolutional
-  Networks](http://www.matthewzeiler.com/pubs/cvpr2010/cvpr2010.pdf), but is
-  actually the transpose (gradient) of `conv1d` rather than an actual
-  deconvolution.
-
-  Args:
-    value: A 3-D `Tensor` of type `float` and shape
-      `[batch, in_width, in_channels]` for `NWC` data format or
-      `[batch, in_channels, in_width]` for `NCW` data format.
-    filter: A 3-D `Tensor` with the same type as `value` and shape
-      `[filter_width, output_channels, in_channels]`.  `filter`'s
-      `in_channels` dimension must match that of `value`.
-    output_shape: A 1-D `Tensor` representing the output shape of the
-      deconvolution op.
-    stride: An `integer`.  The number of entries by which
-      the filter is moved right at each step.
-    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
-      See the "returns" section of `tf.nn.convolution` for details.
-    data_format: A string. 'NHWC' and 'NCHW' are supported.
-    name: Optional name for the returned tensor.
-
-  Returns:
-    A `Tensor` with the same type as `value`.
-
-  Raises:
-    ValueError: If input/output depth does not match `filter`'s shape, or if
-      padding is other than `'VALID'` or `'SAME'`.
-  """
-  with ops.name_scope(name, "conv1d_transpose",
-                      [value, filter, output_shape]) as name:
-    output_shape_ = ops.convert_to_tensor(output_shape, name="output_shape")
-    if not output_shape_.get_shape().is_compatible_with(tensor_shape.vector(3)):
-      raise ValueError("output_shape must have shape (3,), got {}".format(
-          output_shape_.get_shape()))
-
-    # The format could be either NWC or NCW, map to NHWC or NCHW
-    if data_format is None or data_format == "NWC":
-      data_format_2d = "NHWC"
-      axis = 2
-    elif data_format == "NCW":
-      data_format_2d = "NCHW"
-      axis = 1
-    else:
-      raise ValueError("data_format must be \"NWC\" or \"NCW\".")
-
-    if not value.get_shape().dims[axis].is_compatible_with(
-        filter.get_shape()[2]):
-      raise ValueError("input channels does not match filter's input channels, "
-                       "{} != {}".format(value.get_shape()[axis],
-                                         filter.get_shape()[2]))
-
-    if isinstance(output_shape, (list, np.ndarray)):
-      # output_shape's shape should be == [3] if reached this point.
-      if not filter.get_shape().dims[1].is_compatible_with(
-          output_shape[axis]):
-        raise ValueError(
-            "output_shape does not match filter's output channels, "
-            "{} != {}".format(output_shape[axis],
-                              filter.get_shape()[1]))
-
-    if padding != "VALID" and padding != "SAME":
-      raise ValueError("padding must be either VALID or SAME:"
-                       " {}".format(padding))
-
-    # Reshape the input tensor to [batch, 1, in_width, in_channels]
-    if data_format_2d == "NHWC":
-      output_shape_ = array_ops.concat(
-          [output_shape_[:1], [1], output_shape_[1:]], axis=0)
-      spatial_start_dim = 1
-      strides = [1, 1, stride, 1]
-    else:
-      output_shape_ = array_ops.concat(
-          [output_shape_[:2], [1], output_shape_[2:]], axis=0)
-      spatial_start_dim = 2
-      strides = [1, 1, 1, stride]
-    value = array_ops.expand_dims(value, spatial_start_dim)
-    filter = array_ops.expand_dims(filter, 0)  # pylint: disable=redefined-builtin
-
-    result = gen_nn_ops.conv2d_backprop_input(
-        input_sizes=output_shape_,
-        filter=filter,
-        out_backprop=value,
-        strides=strides,
-        padding=padding,
-        data_format=data_format_2d,
-        name=name)
-    return array_ops.squeeze(result, [spatial_start_dim])
-
-
 @ops.RegisterStatistics("Dilation2D", "flops")
 def _calc_dilation2d_flops(graph, node):
   """Calculates the compute resources needed for Dilation2D."""
diff --git a/tensorflow/python/ops/nn_test.py b/tensorflow/python/ops/nn_test.py
index 82fab741830fddd4ee0ba5c8e2644702ec199b4d..d79e420589f7c8346a30281a88637ea5d8fc16d2 100644
--- a/tensorflow/python/ops/nn_test.py
+++ b/tensorflow/python/ops/nn_test.py
@@ -24,9 +24,11 @@ from absl.testing import parameterized
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
@@ -1019,8 +1021,8 @@ class SwishTest(test_lib.TestCase):
   @test_util.run_deprecated_v1
   def testValues(self):
     np_values = np.array(
-        [np.linspace(-10.0, 0.0, 100),
-         np.linspace(0.0, 10.0, 100)],
+        [np.linspace(-7.0, 0.0, 100),
+         np.linspace(0.0, 7.0, 100)],
         dtype=np.float32)
     tf_values = constant_op.constant(np_values)
     actual_tf_outputs = nn_impl.swish(tf_values)
@@ -1239,5 +1241,206 @@ class DataFormatVectorPermuteTest(test_lib.TestCase):
       self.assertAllEqual(y_val, [[7, 4], [4, 5], [5, 1], [9, 3]])
 
 
+@test_util.run_all_in_graph_and_eager_modes
+class AvgPoolTest(test_lib.TestCase):
+
+  def test1DTensor(self):
+    x = array_ops.ones([3, 6, 5])
+    ksize = 2
+    strides = 2
+
+    y1 = nn_ops.avg_pool_v2(x, ksize, strides, "SAME")
+    y2 = nn_ops.avg_pool1d(x, ksize, strides, "SAME")
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def test1DNumpy(self):
+    x = np.ones([3, 6, 5])
+    ksize = 2
+    strides = 2
+
+    y1 = nn_ops.avg_pool_v2(x, ksize, strides, "SAME")
+    y2 = nn_ops.avg_pool1d(x, ksize, strides, "SAME")
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def test2DTensor(self):
+    x = array_ops.ones([3, 6, 6, 5])
+    ksize = 2
+    strides = 2
+
+    y1 = nn_ops.avg_pool_v2(x, ksize, strides, "SAME")
+    y2 = nn_ops.avg_pool(x, ksize, strides, "SAME")
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def test2DNumpy(self):
+    x = np.ones([3, 6, 6, 5])
+    ksize = 2
+    strides = 2
+
+    y1 = nn_ops.avg_pool_v2(x, ksize, strides, "SAME")
+    y2 = nn_ops.avg_pool(x, ksize, strides, "SAME")
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def test3DTensor(self):
+    x = array_ops.ones([3, 7, 6, 6, 5])
+    ksize = 2
+    strides = 2
+
+    y1 = nn_ops.avg_pool_v2(x, ksize, strides, "SAME")
+    y2 = nn_ops.avg_pool3d(x, ksize, strides, "SAME")
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def test3DNumpy(self):
+    x = np.ones([3, 7, 6, 6, 5], dtype=np.float32)
+    ksize = 2
+    strides = 2
+
+    y1 = nn_ops.avg_pool_v2(x, ksize, strides, "SAME")
+    y2 = nn_ops.avg_pool3d(x, ksize, strides, "SAME")
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class MaxPoolTest(test_lib.TestCase):
+
+  def test1DTensor(self):
+    x = array_ops.ones([3, 6, 5])
+    ksize = 2
+    strides = 2
+
+    y1 = nn_ops.max_pool_v2(x, ksize, strides, "SAME")
+    y2 = nn_ops.max_pool1d(x, ksize, strides, "SAME")
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def test1DNumpy(self):
+    x = np.ones([3, 6, 5])
+    ksize = 2
+    strides = 2
+
+    y1 = nn_ops.max_pool_v2(x, ksize, strides, "SAME")
+    y2 = nn_ops.max_pool1d(x, ksize, strides, "SAME")
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def test2DTensor(self):
+    x = array_ops.ones([3, 6, 6, 5])
+    ksize = 2
+    strides = 2
+
+    y1 = nn_ops.max_pool_v2(x, ksize, strides, "SAME")
+    y2 = nn_ops.max_pool(x, ksize, strides, "SAME")
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def test2DNumpy(self):
+    x = np.ones([3, 6, 6, 5])
+    ksize = 2
+    strides = 2
+
+    y1 = nn_ops.max_pool_v2(x, ksize, strides, "SAME")
+    y2 = nn_ops.max_pool(x, ksize, strides, "SAME")
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def test3DTensor(self):
+    x = array_ops.ones([3, 7, 6, 6, 5])
+    ksize = 2
+    strides = 2
+
+    y1 = nn_ops.max_pool_v2(x, ksize, strides, "SAME")
+    y2 = nn_ops.max_pool3d(x, ksize, strides, "SAME")
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def test3DNumpy(self):
+    x = np.ones([3, 7, 6, 6, 5], dtype=np.float32)
+    ksize = 2
+    strides = 2
+
+    y1 = nn_ops.max_pool_v2(x, ksize, strides, "SAME")
+    y2 = nn_ops.max_pool3d(x, ksize, strides, "SAME")
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def testIncorrectSizeInputSmall(self):
+    x = array_ops.ones([3, 4])
+    with self.assertRaisesRegex(
+        ValueError, "Input tensor must be of rank 3, 4 or 5 but was 2."):
+      nn_ops.max_pool_v2(x, 2, 2, "SAME")
+
+  def testIncorrectSizeInput(self):
+    x = array_ops.ones([3, 4, 1, 2, 1, 2])
+    with self.assertRaisesRegex(
+        ValueError, "Input tensor must be of rank 3, 4 or 5 but was 6."):
+      nn_ops.max_pool_v2(x, 2, 2, "SAME")
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class ConvolutionTest(test_lib.TestCase):
+
+  def testUnknownSize(self):
+    x = tensor_spec.TensorSpec(None, dtypes.float32, name="x")
+    k = np.ones([3, 6, 6, 5])
+
+    @def_function.function
+    def F(value):
+      return nn_ops.convolution(value, k, "SAME")
+
+    F.get_concrete_function(x)
+
+
+class ConvTransposeTest(test_lib.TestCase):
+
+  def test1DTensor(self):
+    t = array_ops.ones([2, 4, 3])
+    v = array_ops.ones([2, 5, 3])
+    strides = 2
+
+    y1 = nn_ops.conv1d_transpose(t, v, [2, 8, 5], strides)
+    y2 = nn_ops.conv_transpose(t, v, [2, 8, 5], strides)
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def test2DTensor(self):
+    t = array_ops.ones([2, 4, 4, 3])
+    v = array_ops.ones([2, 2, 5, 3])
+    strides = 2
+
+    y1 = nn_ops.conv2d_transpose_v2(t, v, [2, 8, 8, 5], strides)
+    y2 = nn_ops.conv_transpose(t, v, [2, 8, 8, 5], strides)
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def test3DTensor(self):
+    t = array_ops.ones([2, 4, 4, 4, 3])
+    v = array_ops.ones([2, 2, 2, 5, 3])
+    strides = 2
+
+    y1 = nn_ops.conv3d_transpose_v2(t, v, [2, 8, 8, 8, 5], strides)
+    y2 = nn_ops.conv_transpose(t, v, [2, 8, 8, 8, 5], strides)
+
+    self.assertAllEqual(self.evaluate(y1), self.evaluate(y2))
+
+  def testIncorrectSizeInputSmall(self):
+    with self.assertRaisesRegex(
+        ValueError, "output_shape must be of length 3, 4 or 5 but was 2."):
+      nn_ops.conv_transpose(None, 2, [2, 3], "SAME")
+
+  def testIncorrectSizeInput(self):
+    with self.assertRaisesRegex(
+        ValueError, "output_shape must be of length 3, 4 or 5 but was 6."):
+      nn_ops.conv_transpose(None, 2, [2, 3, 4, 2, 5, 1], "SAME")
+
+  def testTensorsNoShape(self):
+    with self.assertRaisesRegex(ValueError, "output_shape cannot be None"):
+      nn_ops.conv_transpose(None, None, None, None)
+
+
 if __name__ == "__main__":
   test_lib.main()
diff --git a/tensorflow/python/ops/numerics.py b/tensorflow/python/ops/numerics.py
index 0ab39ad0a8edd60c78a6bea3ae31e4f025c9e0bd..2aba42ef8951d58be595dbe1208eba3a9fceb663 100644
--- a/tensorflow/python/ops/numerics.py
+++ b/tensorflow/python/ops/numerics.py
@@ -30,18 +30,23 @@ from tensorflow.python.util.tf_export import tf_export
 
 @tf_export(v1=["debugging.assert_all_finite", "verify_tensor_all_finite"])
 @deprecation.deprecated_endpoints("verify_tensor_all_finite")
-def verify_tensor_all_finite(t, msg, name=None):
+def verify_tensor_all_finite(t=None, msg=None, name=None, x=None, message=None):
   """Assert that the tensor does not contain any NaN's or Inf's.
 
   Args:
     t: Tensor to check.
     msg: Message to log on failure.
     name: A name for this operation (optional).
+    x: Alias for t.
+    message: Alias for msg.
 
   Returns:
     Same tensor as `t`.
   """
-  return verify_tensor_all_finite_v2(t, msg, name)
+  x = deprecation.deprecated_argument_lookup("x", x, "t", t)
+  message = deprecation.deprecated_argument_lookup(
+      "message", message, "msg", msg)
+  return verify_tensor_all_finite_v2(x, message, name)
 
 
 @tf_export("debugging.assert_all_finite", v1=[])
diff --git a/tensorflow/python/ops/parallel_for/BUILD b/tensorflow/python/ops/parallel_for/BUILD
index 07fc9433a2582225a8da687eb8c9563c8fcac9e2..0a2f3e25a29c5423915b5e9383867e5f690587b9 100644
--- a/tensorflow/python/ops/parallel_for/BUILD
+++ b/tensorflow/python/ops/parallel_for/BUILD
@@ -15,11 +15,13 @@ py_library(
         "control_flow_ops.py",
         "gradients.py",
         "pfor.py",
+        "test_util.py",
     ],
     srcs_version = "PY2AND3",
     deps = [
         ":control_flow_ops",
         ":gradients",
+        ":test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
         "//tensorflow/python:constant_op",
@@ -29,6 +31,7 @@ py_library(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:functional_ops",
         "//tensorflow/python:gradients",
+        "//tensorflow/python:map_fn",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:platform",
@@ -54,7 +57,7 @@ py_library(
         "//tensorflow/python:data_flow_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
-        "//tensorflow/python:functional_ops",
+        "//tensorflow/python:map_fn",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:platform",
@@ -83,12 +86,25 @@ py_library(
     ],
 )
 
+py_library(
+    name = "test_util",
+    srcs = ["test_util.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":pfor_lib",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variables",
+    ],
+)
+
 cuda_py_test(
     name = "control_flow_ops_test",
-    size = "large",
     srcs = ["control_flow_ops_test.py"],
     additional_deps = [
         ":control_flow_ops",
+        ":test_util",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:gradients",
@@ -99,6 +115,37 @@ cuda_py_test(
         "//tensorflow/python:random_ops",
         "//tensorflow/python:util",
     ],
+    tags = ["no_rocm"],
+)
+
+cuda_py_test(
+    name = "array_test",
+    srcs = ["array_test.py"],
+    additional_deps = [
+        ":control_flow_ops",
+        ":test_util",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python/eager:backprop",
+    ],
+    xla_enable_strict_auto_jit = True,
+)
+
+cuda_py_test(
+    name = "math_test",
+    srcs = ["math_test.py"],
+    additional_deps = [
+        ":control_flow_ops",
+        ":test_util",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:util",
+    ],
+    tags = ["optonly"],  # Too slow in non-opt mode
+    xla_enable_strict_auto_jit = True,
 )
 
 py_library(
@@ -115,7 +162,6 @@ py_library(
 
 cuda_py_test(
     name = "gradients_test",
-    size = "large",
     srcs = ["gradients_test.py"],
     additional_deps = [
         ":control_flow_ops",
@@ -128,4 +174,6 @@ cuda_py_test(
         "//tensorflow/python:random_ops",
         "//tensorflow/python/ops/losses",
     ],
+    tags = ["optonly"],  # Too slow in non-opt mode
+    xla_enable_strict_auto_jit = True,
 )
diff --git a/tensorflow/python/ops/parallel_for/array_test.py b/tensorflow/python/ops/parallel_for/array_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f0c0f5b992b3f005dc8b75a6d0207237a5205bb
--- /dev/null
+++ b/tensorflow/python/ops/parallel_for/array_test.py
@@ -0,0 +1,274 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for vectorization of array kernels."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import backprop
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import tensor_array_grad  # pylint: disable=unused-import
+from tensorflow.python.ops.parallel_for import control_flow_ops as pfor_control_flow_ops
+from tensorflow.python.ops.parallel_for.test_util import PForTestCase
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class ArrayTest(PForTestCase):
+
+  def test_gather(self):
+    x = random_ops.random_uniform([3, 3, 3])
+
+    def loop_fn(i):
+      outputs = []
+      x_i = array_ops.gather(x, i)
+      for y in [x, x_i]:
+        axes = [0, 2, -1] if y == x else [0]
+        for axis in axes:
+          outputs.append(array_ops.gather(y, 2, axis=axis))
+          outputs.append(array_ops.gather(y, i, axis=axis))
+          outputs.append(array_ops.gather(y, [i], axis=axis))
+          outputs.append(array_ops.gather(y, [i, 2], axis=axis))
+          outputs.append(array_ops.gather(y, [[2, i], [i, 1]], axis=axis))
+      return outputs
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 20)
+
+  def test_shape(self):
+    x = random_ops.random_uniform([3, 2, 3])
+
+    def loop_fn(i):
+      x_i = array_ops.gather(x, i)
+      return array_ops.shape(x_i), array_ops.shape(x_i, out_type=dtypes.int64)
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.int32, dtypes.int64])
+
+  def test_size(self):
+    x = random_ops.random_uniform([3, 2, 3])
+
+    def loop_fn(i):
+      x_i = array_ops.gather(x, i)
+      return array_ops.size(x_i), array_ops.size(x_i, out_type=dtypes.int64)
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.int32, dtypes.int64])
+
+  def test_rank(self):
+    x = random_ops.random_uniform([3, 2, 3])
+
+    def loop_fn(i):
+      x_i = array_ops.gather(x, i)
+      return array_ops.rank(x_i)
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.int32])
+
+  def test_shape_n(self):
+    x = random_ops.random_uniform([3, 2, 3])
+    y = random_ops.random_uniform([3])
+
+    def loop_fn(i):
+      x_i = array_ops.gather(x, i)
+      y_i = array_ops.gather(y, i)
+      return array_ops.shape_n([x_i, x, y, y_i]), array_ops.shape_n(
+          [x_i, x, y, y_i], out_type=dtypes.int64)
+
+    self._test_loop_fn(
+        loop_fn, 3, loop_fn_dtypes=[dtypes.int32] * 4 + [dtypes.int64] * 4)
+
+  def test_reshape(self):
+    x = random_ops.random_uniform([3, 2, 3])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      return array_ops.reshape(x1, [-1]), array_ops.reshape(x1, [1, 3, 1, -1])
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
+
+  def test_expand_dims(self):
+    x = random_ops.random_uniform([3, 2, 3])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      return array_ops.expand_dims(
+          x1, axis=-1), array_ops.expand_dims(
+              x1, axis=1)
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
+
+  def test_slice(self):
+    x = random_ops.random_uniform([3, 2, 3])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      return array_ops.slice(x1, begin=(0, 1), size=(2, 1))
+
+    self._test_loop_fn(loop_fn, 3)
+
+  def test_tile(self):
+    x = random_ops.random_uniform([3, 2, 3])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      return array_ops.tile(x1, [2, 1])
+
+    self._test_loop_fn(loop_fn, 3)
+
+  def test_tile_loop_dependent(self):
+    x = random_ops.random_uniform([3, 2, 3])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      return array_ops.tile(x1, [i, 1])
+
+    with self.assertRaisesRegexp(ValueError, "expected to be loop invariant"):
+      pfor_control_flow_ops.pfor(loop_fn, 2)
+
+  def test_pack(self):
+    x = random_ops.random_uniform([3, 2, 3])
+    y = random_ops.random_uniform([2, 3])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      return array_ops.stack([x1, y], axis=-1)
+
+    self._test_loop_fn(loop_fn, 1)
+
+  def test_unpack(self):
+    x = random_ops.random_uniform([3, 2, 3, 4])
+
+    def loop_fn(i):
+      x_i = array_ops.gather(x, i)
+      return array_ops.unstack(
+          x_i, 4, axis=-1), array_ops.unstack(
+              x_i, 3, axis=1)
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 7)
+
+  def test_pad(self):
+    x = random_ops.random_uniform([3, 2, 3])
+    padding = constant_op.constant([[1, 2], [3, 4]])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      return array_ops.pad(x1, padding, mode="CONSTANT")
+
+    self._test_loop_fn(loop_fn, 3)
+
+  def test_split(self):
+    x = random_ops.random_uniform([3, 2, 3])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      return array_ops.split(x1, 2, axis=0), array_ops.split(x1, 3, axis=-1)
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 5)
+
+  def test_split_v(self):
+    x = random_ops.random_uniform([3, 6, 3])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      return (array_ops.split(x1, [2, 1, 3], axis=0),
+              array_ops.split(x1, [3], axis=-1))
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 4)
+
+  def test_transpose(self):
+    x = random_ops.random_uniform([3, 2, 3, 4])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      return array_ops.transpose(x1, [2, 1, 0])
+
+    self._test_loop_fn(loop_fn, 3)
+
+  def test_zeros_like(self):
+    x = random_ops.random_uniform([3, 2, 3])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      z = array_ops.zeros_like(x1),
+      return z, z + x1
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
+
+  def test_concat_v2(self):
+    x = random_ops.random_uniform([3, 2, 3])
+    y = random_ops.random_uniform([2, 3])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      return array_ops.concat(
+          [x1, x1, y], axis=0), array_ops.concat(
+              [x1, x1, y], axis=-1)
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
+
+  def test_unary_cwise_ops(self):
+    for op in [array_ops.identity, array_ops.stop_gradient]:
+      with backprop.GradientTape(persistent=True) as g:
+        x = random_ops.random_uniform([3, 5])
+        g.watch(x)
+
+      # pylint: disable=cell-var-from-loop
+      def loop_fn(i):
+        with g:
+          x1 = array_ops.gather(x, i)
+          y = op(x1) + x1
+          loss = nn.l2_loss(y)
+        return op(x), y, g.gradient(loss, x1)
+
+      # pylint: enable=cell-var-from-loop
+
+      self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 3)
+
+  def test_identity_n(self):
+    x = random_ops.random_uniform([3, 4])
+
+    def loop_fn(i):
+      return array_ops.identity_n([x, array_ops.gather(x, i)])
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
+
+  def test_matrix_diag_part(self):
+    x = random_ops.random_uniform([3, 4, 2])
+
+    def loop_fn(i):
+      return array_ops.matrix_diag_part(array_ops.gather(x, i))
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32])
+
+  def test_strided_slice(self):
+    with backprop.GradientTape(persistent=True) as g:
+      x = random_ops.random_uniform([3, 3, 4, 4, 2, 2, 2])
+      g.watch(x)
+
+    def loop_fn(i):
+      with g:
+        x_i = array_ops.gather(x, i)
+        y = x_i[:2, ::2, 1::3, ..., array_ops.newaxis, 1]
+        loss = nn.l2_loss(y)
+      return y, g.gradient(loss, x_i)
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py
index 933bddd8ccaa830a394c8d69e4f1b33311315c99..1b1b336bd0e33f8b48a681e699c6f791fa1decc0 100644
--- a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py
+++ b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py
@@ -34,12 +34,11 @@ from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import bitwise_ops
-from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
-from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gradients as gradient_ops
 from tensorflow.python.ops import logging_ops
+from tensorflow.python.ops import map_fn
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import parsing_ops
@@ -50,40 +49,13 @@ from tensorflow.python.ops import tensor_array_grad  # pylint: disable=unused-im
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.ops.parallel_for import control_flow_ops as pfor_control_flow_ops
+from tensorflow.python.ops.parallel_for.test_util import PForTestCase
 from tensorflow.python.platform import test
 from tensorflow.python.util import nest
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class PForTest(test.TestCase):
-
-  def _run_targets(self, targets1, targets2=None, run_init=True):
-    targets1 = nest.flatten(targets1)
-    targets2 = ([] if targets2 is None else nest.flatten(targets2))
-    assert len(targets1) == len(targets2) or not targets2
-    if run_init:
-      init = variables.global_variables_initializer()
-      self.evaluate(init)
-    return self.evaluate(targets1 + targets2)
-
-  def run_and_assert_equal(self, targets1, targets2):
-    outputs = self._run_targets(targets1, targets2)
-    outputs = nest.flatten(outputs)  # flatten SparseTensorValues
-    n = len(outputs) // 2
-    for i in range(n):
-      if outputs[i + n].dtype != np.object:
-        self.assertAllClose(outputs[i + n], outputs[i], rtol=1e-4, atol=1e-5)
-      else:
-        self.assertAllEqual(outputs[i + n], outputs[i])
-
-  def _test_loop_fn(self, loop_fn, iters,
-                    loop_fn_dtypes=dtypes.float32,
-                    parallel_iterations=None):
-    t1 = pfor_control_flow_ops.pfor(loop_fn, iters=iters,
-                                    parallel_iterations=parallel_iterations)
-    t2 = pfor_control_flow_ops.for_loop(loop_fn, loop_fn_dtypes, iters=iters,
-                                        parallel_iterations=parallel_iterations)
-    self.run_and_assert_equal(t1, t2)
+class PForTest(PForTestCase):
 
   def test_op_conversion_fallback_to_while_loop(self):
     # Note that we used top_k op for this test. If a converter gets defined for
@@ -129,246 +101,7 @@ class PForTest(test.TestCase):
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class ArrayTest(PForTest):
-
-  def test_gather(self):
-    x = random_ops.random_uniform([3, 3, 3])
-
-    def loop_fn(i):
-      outputs = []
-      x_i = array_ops.gather(x, i)
-      for y in [x, x_i]:
-        axes = [0, 2, -1] if y == x else [0]
-        for axis in axes:
-          outputs.append(array_ops.gather(y, 2, axis=axis))
-          outputs.append(array_ops.gather(y, i, axis=axis))
-          outputs.append(array_ops.gather(y, [i], axis=axis))
-          outputs.append(array_ops.gather(y, [i, 2], axis=axis))
-          outputs.append(array_ops.gather(y, [[2, i], [i, 1]], axis=axis))
-      return outputs
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 20)
-
-  def test_shape(self):
-    x = random_ops.random_uniform([3, 2, 3])
-
-    def loop_fn(i):
-      x_i = array_ops.gather(x, i)
-      return array_ops.shape(x_i), array_ops.shape(x_i, out_type=dtypes.int64)
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.int32, dtypes.int64])
-
-  def test_size(self):
-    x = random_ops.random_uniform([3, 2, 3])
-
-    def loop_fn(i):
-      x_i = array_ops.gather(x, i)
-      return array_ops.size(x_i), array_ops.size(x_i, out_type=dtypes.int64)
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.int32, dtypes.int64])
-
-  def test_rank(self):
-    x = random_ops.random_uniform([3, 2, 3])
-
-    def loop_fn(i):
-      x_i = array_ops.gather(x, i)
-      return array_ops.rank(x_i)
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.int32])
-
-  def test_shape_n(self):
-    x = random_ops.random_uniform([3, 2, 3])
-    y = random_ops.random_uniform([3])
-
-    def loop_fn(i):
-      x_i = array_ops.gather(x, i)
-      y_i = array_ops.gather(y, i)
-      return array_ops.shape_n([x_i, x, y, y_i]), array_ops.shape_n(
-          [x_i, x, y, y_i], out_type=dtypes.int64)
-
-    self._test_loop_fn(
-        loop_fn, 3, loop_fn_dtypes=[dtypes.int32] * 4 + [dtypes.int64] * 4)
-
-  def test_reshape(self):
-    x = random_ops.random_uniform([3, 2, 3])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      return array_ops.reshape(x1, [-1]), array_ops.reshape(x1, [1, 3, 1, -1])
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
-
-  def test_expand_dims(self):
-    x = random_ops.random_uniform([3, 2, 3])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      return array_ops.expand_dims(
-          x1, axis=-1), array_ops.expand_dims(
-              x1, axis=1)
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
-
-  def test_slice(self):
-    x = random_ops.random_uniform([3, 2, 3])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      return array_ops.slice(x1, begin=(0, 1), size=(2, 1))
-
-    self._test_loop_fn(loop_fn, 3)
-
-  def test_tile(self):
-    x = random_ops.random_uniform([3, 2, 3])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      return array_ops.tile(x1, [2, 1])
-
-    self._test_loop_fn(loop_fn, 3)
-
-  def test_tile_loop_dependent(self):
-    x = random_ops.random_uniform([3, 2, 3])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      return array_ops.tile(x1, [i, 1])
-
-    with self.assertRaisesRegexp(ValueError, "expected to be loop invariant"):
-      pfor_control_flow_ops.pfor(loop_fn, 2)
-
-  def test_pack(self):
-    x = random_ops.random_uniform([3, 2, 3])
-    y = random_ops.random_uniform([2, 3])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      return array_ops.stack([x1, y], axis=-1)
-
-    self._test_loop_fn(loop_fn, 1)
-
-  def test_unpack(self):
-    x = random_ops.random_uniform([3, 2, 3, 4])
-
-    def loop_fn(i):
-      x_i = array_ops.gather(x, i)
-      return array_ops.unstack(
-          x_i, 4, axis=-1), array_ops.unstack(
-              x_i, 3, axis=1)
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 7)
-
-  def test_pad(self):
-    x = random_ops.random_uniform([3, 2, 3])
-    padding = constant_op.constant([[1, 2], [3, 4]])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      return array_ops.pad(x1, padding, mode="CONSTANT")
-
-    self._test_loop_fn(loop_fn, 3)
-
-  def test_split(self):
-    x = random_ops.random_uniform([3, 2, 3])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      return array_ops.split(x1, 2, axis=0), array_ops.split(x1, 3, axis=-1)
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 5)
-
-  def test_split_v(self):
-    x = random_ops.random_uniform([3, 6, 3])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      return (array_ops.split(x1, [2, 1, 3], axis=0),
-              array_ops.split(x1, [3], axis=-1))
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 4)
-
-  def test_transpose(self):
-    x = random_ops.random_uniform([3, 2, 3, 4])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      return array_ops.transpose(x1, [2, 1, 0])
-
-    self._test_loop_fn(loop_fn, 3)
-
-  def test_zeros_like(self):
-    x = random_ops.random_uniform([3, 2, 3])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      z = array_ops.zeros_like(x1),
-      return z, z + x1
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
-
-  def test_concat_v2(self):
-    x = random_ops.random_uniform([3, 2, 3])
-    y = random_ops.random_uniform([2, 3])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      return array_ops.concat(
-          [x1, x1, y], axis=0), array_ops.concat(
-              [x1, x1, y], axis=-1)
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
-
-  def test_unary_cwise_ops(self):
-    for op in [array_ops.identity, array_ops.stop_gradient]:
-      with backprop.GradientTape(persistent=True) as g:
-        x = random_ops.random_uniform([3, 5])
-        g.watch(x)
-
-      # pylint: disable=cell-var-from-loop
-      def loop_fn(i):
-        with g:
-          x1 = array_ops.gather(x, i)
-          y = op(x1) + x1
-          loss = nn.l2_loss(y)
-        return op(x), y, g.gradient(loss, x1)
-
-      # pylint: enable=cell-var-from-loop
-
-      self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 3)
-
-  def test_identity_n(self):
-    x = random_ops.random_uniform([3, 4])
-
-    def loop_fn(i):
-      return array_ops.identity_n([x, array_ops.gather(x, i)])
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
-
-  def test_matrix_diag_part(self):
-    x = random_ops.random_uniform([3, 4, 2])
-
-    def loop_fn(i):
-      return array_ops.matrix_diag_part(array_ops.gather(x, i))
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32])
-
-  def test_strided_slice(self):
-    with backprop.GradientTape(persistent=True) as g:
-      x = random_ops.random_uniform([3, 3, 4, 4, 2, 2, 2])
-      g.watch(x)
-
-    def loop_fn(i):
-      with g:
-        x_i = array_ops.gather(x, i)
-        y = x_i[:2, ::2, 1::3, ..., array_ops.newaxis, 1]
-        loss = nn.l2_loss(y)
-      return y, g.gradient(loss, x_i)
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
-
-
-@test_util.run_all_in_graph_and_eager_modes
-class BitwiseTest(PForTest):
+class BitwiseTest(PForTestCase):
 
   def test_unary_cwise(self):
     for op in [bitwise_ops.invert]:
@@ -408,376 +141,7 @@ class BitwiseTest(PForTest):
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class MathTest(PForTest):
-
-  def test_unary_cwise_ops(self):
-    complex_ops = [
-        math_ops.angle,
-        math_ops.imag,
-        math_ops.complex_abs,
-        math_ops.real,
-        math_ops.conj,
-    ]
-    real_ops = [
-        lambda x: math_ops.acosh(1 + math_ops.square(x)),
-        math_ops.abs,
-        math_ops.acos,
-        math_ops.asin,
-        math_ops.asinh,
-        math_ops.atan,
-        math_ops.atanh,
-        math_ops.bessel_i0e,
-        math_ops.bessel_i1e,
-        math_ops.cos,
-        math_ops.cosh,
-        math_ops.digamma,
-        math_ops.erf,
-        math_ops.erfc,
-        math_ops.exp,
-        math_ops.expm1,
-        math_ops.inv,
-        math_ops.is_finite,
-        math_ops.is_inf,
-        math_ops.lgamma,
-        math_ops.log,
-        math_ops.log1p,
-        math_ops.neg,
-        math_ops.negative,
-        math_ops.reciprocal,
-        math_ops.rint,
-        math_ops.round,
-        math_ops.rsqrt,
-        math_ops.sigmoid,
-        math_ops.sign,
-        math_ops.sin,
-        math_ops.sinh,
-        math_ops.sqrt,
-        math_ops.square,
-        math_ops.tan,
-        math_ops.tanh,
-        math_ops.tanh,
-        nn.elu,
-        nn.relu,
-        nn.relu6,
-        nn.selu,
-        nn.softplus,
-        nn.softsign,
-    ]
-    for op in complex_ops + real_ops:
-      with backprop.GradientTape(persistent=True) as g:
-        x = random_ops.random_uniform([3, 5])
-        g.watch(x)
-        if op in complex_ops:
-          y = random_ops.random_uniform([3, 5])
-          g.watch(y)
-          x = math_ops.complex(x, y)
-
-      # pylint: disable=cell-var-from-loop
-      output_dtypes = []
-      def loop_fn(i):
-        with g:
-          x1 = array_ops.gather(x, i)
-          y1 = op(x1)
-          outputs = [op(x), y1]
-          if y1.dtype == dtypes.float32:
-            loss = math_ops.reduce_sum(y1 * y1)
-          else:
-            loss = None
-        if loss is not None:
-          grad = g.gradient(loss, x1)
-          if grad is not None:
-            outputs.append(grad)
-        del output_dtypes[:]
-        output_dtypes.extend([t.dtype for t in outputs])
-        return outputs
-
-      # pylint: enable=cell-var-from-loop
-
-      self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=output_dtypes)
-
-  def test_unary_cwise_no_grad(self):
-    for op in [math_ops.ceil,
-               math_ops.floor,
-               math_ops.logical_not]:
-      x = random_ops.random_uniform([3, 5])
-      if op == math_ops.logical_not:
-        x = x > 0
-
-      # pylint: disable=cell-var-from-loop
-      def loop_fn(i):
-        return op(array_ops.gather(x, i))
-
-      # pylint: enable=cell-var-from-loop
-
-      self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=x.dtype)
-
-  def test_binary_cwise_ops(self):
-    logical_ops = [
-        math_ops.logical_and,
-        math_ops.logical_or,
-        math_ops.logical_xor
-    ]
-
-    # Wrapper functions restricting the range of inputs of zeta and polygamma.
-    def safe_polygamma(x, y):
-      return math_ops.polygamma(
-          math_ops.round(clip_ops.clip_by_value(y, 1, 10)),
-          x * x + 1)
-
-    def safe_zeta(x, y):
-      return math_ops.zeta(x * x + 1, y * y)
-
-    float_ops = [
-        math_ops.add,
-        math_ops.add_v2,
-        math_ops.atan2,
-        math_ops.complex,
-        math_ops.div,
-        math_ops.divide,
-        math_ops.div_no_nan,
-        math_ops.equal,
-        math_ops.floor_div,
-        math_ops.floor_mod,
-        math_ops.greater,
-        math_ops.greater_equal,
-        math_ops.igamma,
-        math_ops.igammac,
-        math_ops.igamma_grad_a,
-        math_ops.less,
-        math_ops.less_equal,
-        math_ops.maximum,
-        math_ops.minimum,
-        math_ops.mod,
-        math_ops.multiply,
-        math_ops.not_equal,
-        math_ops.pow,
-        math_ops.squared_difference,
-        math_ops.subtract,
-        math_ops.truncate_mod,
-        safe_polygamma,
-        safe_zeta,
-    ]
-    for op in logical_ops + float_ops:
-      x = random_ops.random_uniform([7, 3, 5])
-      y = random_ops.random_uniform([3, 5])
-      if op in logical_ops:
-        x = x > 0
-        y = y > 0
-
-      output_dtypes = []
-      # pylint: disable=cell-var-from-loop
-      def loop_fn(i):
-        x1 = array_ops.gather(x, i)
-        y1 = array_ops.gather(y, i)
-        outputs = [op(x, y), op(x1, y), op(x, y1), op(x1, y1), op(x1, x1)]
-        del output_dtypes[:]
-        output_dtypes.extend([t.dtype for t in outputs])
-        return outputs
-      # pylint: enable=cell-var-from-loop
-
-      self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=output_dtypes)
-
-  def test_approximate_equal(self):
-    x = random_ops.random_uniform([3, 5])
-    y = random_ops.random_uniform([3, 5])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      y1 = array_ops.gather(y, i)
-      return math_ops.approximate_equal(x1, y1)
-
-    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.bool])
-
-  def test_addn(self):
-    x = random_ops.random_uniform([2, 3, 5])
-    y = random_ops.random_uniform([3, 5])
-    z = random_ops.random_uniform([3, 5])
-
-    def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      return math_ops.add_n([x1, y, z])
-
-    self._test_loop_fn(loop_fn, 2)
-
-  def test_matmul(self):
-    for tr_a in (True, False):
-      for tr_b in (True, False):
-        for stack_a in (True, False):
-          for stack_b in (True, False):
-            shape_a = (5, 3) if tr_a else (3, 5)
-            if stack_a:
-              shape_a = (2,) + shape_a
-            shape_b = (7, 5) if tr_b else (5, 7)
-            if stack_b:
-              shape_b = (2,) + shape_b
-
-            x = random_ops.random_uniform(shape_a)
-            y = random_ops.random_uniform(shape_b)
-
-            # pylint: disable=cell-var-from-loop
-            def loop_fn(i):
-              a = array_ops.gather(x, i) if stack_a else x
-              b = array_ops.gather(y, i) if stack_b else y
-              return math_ops.matmul(a, b, transpose_a=tr_a, transpose_b=tr_b)
-
-            # pylint: enable=cell-var-from-loop
-
-            self._test_loop_fn(loop_fn, 2)
-
-  def test_batch_matmul(self):
-    for tr_a in (True, False):
-      for tr_b in (True, False):
-        for stack_a in (True, False):
-          for stack_b in (True, False):
-            shape_a = (4, 5, 3) if tr_a else (4, 3, 5)
-            if stack_a:
-              shape_a = (2,) + shape_a
-            shape_b = (4, 7, 5) if tr_b else (4, 5, 7)
-            if stack_b:
-              shape_b = (2,) + shape_b
-
-            x = random_ops.random_uniform(shape_a)
-            y = random_ops.random_uniform(shape_b)
-
-            # pylint: disable=cell-var-from-loop
-            def loop_fn(i):
-              a = array_ops.gather(x, i) if stack_a else x
-              b = array_ops.gather(y, i) if stack_b else y
-              return math_ops.matmul(a, b, transpose_a=tr_a, transpose_b=tr_b)
-
-            # pylint: enable=cell-var-from-loop
-
-            self._test_loop_fn(loop_fn, 2)
-
-  def test_reduction(self):
-    x = random_ops.random_uniform([2, 3, 4, 5])
-    for op in [
-        math_ops.reduce_sum, math_ops.reduce_prod, math_ops.reduce_max,
-        math_ops.reduce_min
-    ]:
-      for axis in ([1], None, [0, 2]):
-        for keepdims in (True, False):
-
-          # pylint: disable=cell-var-from-loop
-          def loop_fn(i):
-            a = array_ops.gather(x, i)
-            return op(a, axis=axis, keepdims=keepdims)
-
-          # pylint: enable=cell-var-from-loop
-
-          self._test_loop_fn(loop_fn, 2)
-
-  def test_cum_sum(self):
-    x = random_ops.random_uniform([2, 3, 4, 5])
-    for axis in (1, -2):
-      for exclusive in (True, False):
-        for reverse in (True, False):
-
-          # pylint: disable=cell-var-from-loop
-          def loop_fn(i):
-            a = array_ops.gather(x, i)
-            return math_ops.cumsum(
-                a, axis=axis, exclusive=exclusive, reverse=reverse)
-
-          # pylint: enable=cell-var-from-loop
-
-          self._test_loop_fn(loop_fn, 2)
-
-  def test_cum_prod(self):
-    x = random_ops.random_uniform([2, 3, 4, 5])
-    for axis in (1, -2):
-      for exclusive in (True, False):
-        for reverse in (True, False):
-
-          # pylint: disable=cell-var-from-loop
-          def loop_fn(i):
-            a = array_ops.gather(x, i)
-            return math_ops.cumprod(
-                a, axis=axis, exclusive=exclusive, reverse=reverse)
-
-          # pylint: enable=cell-var-from-loop
-
-          self._test_loop_fn(loop_fn, 2)
-
-  def test_bias_add(self):
-    x_shape = [2, 3, 4, 5, 6]
-    x = random_ops.random_uniform(x_shape)
-    for data_format in ("NCHW", "NHWC"):
-      with backprop.GradientTape(persistent=True) as g:
-        bias_dim = 2 if data_format == "NCHW" else -1
-        bias_shape = x_shape[bias_dim]
-        bias = random_ops.random_uniform([bias_shape])
-        g.watch(bias)
-
-      # pylint: disable=cell-var-from-loop
-      def loop_fn(i):
-        with g:
-          a = array_ops.gather(x, i)
-          y = nn.bias_add(a, bias, data_format=data_format)
-          loss = math_ops.reduce_sum(y * y)
-        return y, g.gradient(loss, bias)
-      # pylint: enable=cell-var-from-loop
-
-      self._test_loop_fn(
-          loop_fn, 2, loop_fn_dtypes=[dtypes.float32, dtypes.float32])
-
-  def test_unsorted_segment_sum(self):
-    t = random_ops.random_uniform([3, 3, 2])
-    segment_ids = constant_op.constant([[0, 0, 2], [0, 1, 2], [2, 2, 2]])
-    num_segments = 3
-
-    def loop_fn(i):
-      data = array_ops.gather(t, i)
-      data_0 = array_ops.gather(t, 0)
-      seg_ids = array_ops.gather(segment_ids, i)
-      return (math_ops.unsorted_segment_sum(data, seg_ids, num_segments),
-              math_ops.unsorted_segment_sum(data_0, seg_ids, num_segments))
-
-    self._test_loop_fn(loop_fn, 3, [dtypes.float32] * 2)
-
-  def test_cast(self):
-    x = constant_op.constant([[1], [2]])
-    y = constant_op.constant([[1.0], [2.0]])
-
-    def loop_fn(i):
-      return (math_ops.cast(array_ops.gather(x, i), dtypes.float32),
-              math_ops.cast(array_ops.gather(y, i), dtypes.int32))
-
-    self._test_loop_fn(
-        loop_fn, 2, loop_fn_dtypes=[dtypes.float32, dtypes.int32])
-
-  def test_tanh_axpy(self):
-    a = constant_op.constant(3.)
-    x = random_ops.random_uniform([4, 5])
-    y = random_ops.random_uniform([6, 5])
-    n = x.shape[0]
-
-    def loop_fn(i):
-      return math_ops.tanh(a * array_ops.gather(x, i) + array_ops.gather(y, i))
-
-    self._test_loop_fn(loop_fn, n)
-
-  def test_select(self):
-    cond = constant_op.constant([True, False])
-    a = random_ops.random_uniform([2, 3, 5])
-    b = random_ops.random_uniform([2, 3, 5])
-    for cond_shape in [2], [2, 3], [2, 3, 5]:
-      cond = random_ops.random_uniform(cond_shape) > 0.5
-
-      # pylint: disable=cell-var-from-loop
-      def loop_fn(i):
-        a_i = array_ops.gather(a, i)
-        b_i = array_ops.gather(b, i)
-        cond_i = array_ops.gather(cond, i)
-        return array_ops.where(cond_i, a_i, b_i)
-
-      # pylint: enable=cell-var-from-loop
-
-      self._test_loop_fn(loop_fn, 2)
-
-
-@test_util.run_all_in_graph_and_eager_modes
-class NNTest(PForTest):
+class NNTest(PForTestCase):
 
   def test_conv2d(self):
     x = random_ops.random_uniform([3, 2, 12, 12, 3])
@@ -887,6 +251,7 @@ class NNTest(PForTest):
 
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 3)
 
+  @test_util.disable_xla("This test never passed for XLA")
   def test_fused_batch_norm(self):
     data_formats = ["NHWC"]
     if test.is_gpu_available():
@@ -956,7 +321,7 @@ class NNTest(PForTest):
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
 
 
-class RandomTest(PForTest):
+class RandomTest(PForTestCase):
 
   # The random values generated in the two implementations are not guaranteed to
   # match. So we only check the returned shapes.
@@ -1009,8 +374,9 @@ class RandomTest(PForTest):
     self._test_loop_fn(loop_fn, 5)
 
 
-class LoggingTest(PForTest):
+class LoggingTest(PForTestCase):
 
+  @test_util.run_v1_only("b/122612051")
   def test_print(self):
     x = random_ops.random_uniform([3, 5])
 
@@ -1031,8 +397,9 @@ class LoggingTest(PForTest):
       sess.run(pfor_control_flow_ops.pfor(loop_fn, 3))
 
 
-class TensorArrayTest(PForTest):
+class TensorArrayTest(PForTestCase):
 
+  @test_util.run_v1_only("b/122612051")
   def test_create_outside_and_read(self):
 
     ta = tensor_array_ops.TensorArray(
@@ -1043,6 +410,7 @@ class TensorArrayTest(PForTest):
 
     self._test_loop_fn(loop_fn, 2, [dtypes.int32] * 2)
 
+  @test_util.run_v1_only("b/122612051")
   def test_create_outside_and_gather(self):
 
     ta = tensor_array_ops.TensorArray(
@@ -1053,6 +421,7 @@ class TensorArrayTest(PForTest):
 
     self._test_loop_fn(loop_fn, 2, [dtypes.int32] * 2)
 
+  @test_util.run_v1_only("b/122612051")
   def test_create_outside_and_write_and_scatter(self):
 
     t = tensor_array_ops.TensorArray(dtypes.int32, 10, clear_after_read=False)
@@ -1074,6 +443,7 @@ class TensorArrayTest(PForTest):
     output2 = self._run_targets(out2)
     self.assertAllClose(output2, output1)
 
+  @test_util.run_v1_only("b/122612051")
   def test_create_inside_and_write(self):
 
     def loop_fn(i):
@@ -1085,6 +455,7 @@ class TensorArrayTest(PForTest):
 
     self._test_loop_fn(loop_fn, 3, [dtypes.int32] * 2)
 
+  @test_util.run_v1_only("b/122612051")
   def test_create_inside_and_scatter(self):
 
     def loop_fn(i):
@@ -1097,6 +468,7 @@ class TensorArrayTest(PForTest):
 
     self._test_loop_fn(loop_fn, 3, [dtypes.int32] * 2)
 
+  @test_util.run_v1_only("b/122612051")
   def test_create_inside_and_read(self):
 
     def loop_fn(i):
@@ -1109,6 +481,7 @@ class TensorArrayTest(PForTest):
 
     self._test_loop_fn(loop_fn, 2, [dtypes.int32] * 3)
 
+  @test_util.run_v1_only("b/122612051")
   def test_create_inside_and_gather(self):
 
     def loop_fn(i):
@@ -1121,6 +494,7 @@ class TensorArrayTest(PForTest):
 
     self._test_loop_fn(loop_fn, 2, [dtypes.int32] * 3)
 
+  @test_util.run_v1_only("b/122612051")
   def test_grad(self):
     x = random_ops.random_uniform([3, 2])
     ta = tensor_array_ops.TensorArray(
@@ -1140,8 +514,9 @@ class TensorArrayTest(PForTest):
       self.assertAllClose(actual_grad, computed_grad)
 
 
-class StackTest(PForTest):
+class StackTest(PForTestCase):
 
+  @test_util.run_v1_only("b/122612051")
   def test_stack_inside_loop_invariant(self):
 
     def loop_fn(_):
@@ -1157,6 +532,7 @@ class StackTest(PForTest):
 
     self._test_loop_fn(loop_fn, 2, [dtypes.int32] * 2)
 
+  @test_util.run_v1_only("b/122612051")
   def test_stack_inside_push_loop_dependent(self):
 
     def loop_fn(i):
@@ -1172,6 +548,7 @@ class StackTest(PForTest):
 
     self._test_loop_fn(loop_fn, 2, [dtypes.int32] * 2)
 
+  @test_util.run_v1_only("b/122612051")
   def test_stack_outside_pop(self):
     s = data_flow_ops.stack_v2(max_size=4, elem_type=dtypes.int32)
     op = data_flow_ops.stack_push_v2(s, 5)
@@ -1195,6 +572,7 @@ class StackTest(PForTest):
     self.assertAllEqual([6, 6], v2)
     self.assertAllEqual(5, v3)
 
+  @test_util.run_v1_only("b/122612051")
   def test_stack_outside_push(self):
     s = data_flow_ops.stack_v2(max_size=4, elem_type=dtypes.int32)
 
@@ -1207,7 +585,7 @@ class StackTest(PForTest):
 
 # TODO(agarwal): test nested while_loops. This currently requires converting a
 # tf.cond.
-class ControlFlowTest(PForTest):
+class ControlFlowTest(PForTestCase):
 
   def test_while_outside_loop(self):
 
@@ -1218,6 +596,7 @@ class ControlFlowTest(PForTest):
 
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.int32])
 
+  @test_util.run_v1_only("b/122612051")
   def test_invariant_while(self):
 
     def loop_fn(_):
@@ -1225,6 +604,7 @@ class ControlFlowTest(PForTest):
 
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.int32])
 
+  @test_util.run_v1_only("b/122612051")
   def test_invariant_while_with_control_dependency(self):
 
     def loop_fn(i):
@@ -1234,6 +614,7 @@ class ControlFlowTest(PForTest):
 
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.int32])
 
+  @test_util.run_v1_only("b/122612051")
   def test_while_with_stateful_ops(self):
 
     def loop_fn(_):
@@ -1243,6 +624,7 @@ class ControlFlowTest(PForTest):
 
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.int32])
 
+  @test_util.run_v1_only("b/122612051")
   def test_while_unstacked_condition(self):
 
     def loop_fn(i):
@@ -1251,6 +633,7 @@ class ControlFlowTest(PForTest):
 
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.int32, dtypes.int32])
 
+  @test_util.run_v1_only("b/122612051")
   def test_while(self):
     x = random_ops.random_uniform([3, 5])
     lengths = constant_op.constant([4, 0, 2])
@@ -1266,6 +649,7 @@ class ControlFlowTest(PForTest):
 
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32])
 
+  @test_util.run_v1_only("b/122612051")
   def test_while_jacobian(self):
     x = random_ops.random_uniform([1, 3])
     y = random_ops.random_uniform([3, 3])
@@ -1293,6 +677,7 @@ class ControlFlowTest(PForTest):
       out, expected = sess.run([out, expected_output])
       self.assertAllClose(expected, out)
 
+  @test_util.run_v1_only("b/122612051")
   def test_tensor_array_as_loop_variable(self):
 
     def loop_fn(i):
@@ -1308,6 +693,7 @@ class ControlFlowTest(PForTest):
 
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.int32])
 
+  @test_util.run_v1_only("b/122612051")
   def test_read_tensor_array_partitioned_indices(self):
     # Note that tensor array values are pfor loop dependent, and the while loop
     # termination condition is also dependent on pfor iteration.
@@ -1325,6 +711,7 @@ class ControlFlowTest(PForTest):
 
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.int32])
 
+  @test_util.run_v1_only("b/122612051")
   def test_external_while_loop_grad(self):
     # Here we test that external while_loops that are extended from inside pfor
     # (due to gradient calls) are not actually converted. If the below was
@@ -1350,6 +737,7 @@ class ControlFlowTest(PForTest):
       self.assertAllEqual([1, 1, 1],
                           sess.run(pfor_control_flow_ops.pfor(loop_fn, 3)))
 
+  @test_util.run_v1_only("b/122612051")
   def test_tensor_array_grad(self):
     inp = constant_op.constant(np.random.rand(3, 4, 2), dtype=dtypes.float32)
     ta = tensor_array_ops.TensorArray(dtypes.float32, size=3)
@@ -1447,13 +835,15 @@ def create_dynamic_lstm(cell_fn, batch_size, state_size, max_steps):
   return pfor_output, tf_output
 
 
-class RNNTest(PForTest):
+class RNNTest(PForTestCase):
 
+  @test_util.run_v1_only("b/122612051")
   def test_dynamic_rnn(self):
     pfor_outputs, tf_outputs = create_dynamic_lstm(rnn_cell.BasicRNNCell,
                                                    3, 5, 7)
     self.run_and_assert_equal(pfor_outputs, tf_outputs)
 
+  @test_util.run_v1_only("b/122612051")
   def test_dynamic_lstm(self):
     pfor_outputs, tf_outputs = create_dynamic_lstm(rnn_cell.BasicLSTMCell,
                                                    3, 5, 7)
@@ -1538,15 +928,15 @@ class Benchmarks(test.Benchmark):
       b = 256
       params = 1000
       inp = random_ops.random_normal((b, params))
-      map_fn = lambda x: x * x
+      fn = lambda x: x * x
 
       def pfor_map_fn(f, x):
         return pfor_control_flow_ops.pfor(
             lambda i: f(array_ops.gather(x, i)),
             array_ops.shape(x)[0])
 
-      map_output = functional_ops.map_fn(map_fn, inp)
-      pfor_output = pfor_map_fn(map_fn, inp)
+      map_output = map_fn.map_fn(fn, inp)
+      pfor_output = pfor_map_fn(fn, inp)
 
       self._run(map_output, 100, name="tf_map_fn")
       self._run(pfor_output, 100, name="pfor_map_fn")
@@ -1576,8 +966,9 @@ class Benchmarks(test.Benchmark):
       self._run(tf_outputs, 100, name="tf_rnn")
 
 
-class SparseTest(PForTest):
+class SparseTest(PForTestCase):
 
+  @test_util.run_v1_only("b/122612051")
   def test_var_loop_len(self):
     num_iters = array_ops.placeholder(dtypes.int32)
 
@@ -1589,6 +980,7 @@ class SparseTest(PForTest):
     with self.cached_session() as sess:
       sess.run(pfor, feed_dict={num_iters: 3})
 
+  @test_util.run_v1_only("b/122612051")
   def test_sparse_result_none_stacked(self):
     num_iters = 10
 
@@ -1605,6 +997,7 @@ class SparseTest(PForTest):
     manual = sparse_tensor.SparseTensor(indices, values, dense_shapes)
     self.run_and_assert_equal(pfor, manual)
 
+  @test_util.run_v1_only("b/122612051")
   def test_sparse_result_all_stacked(self):
     num_iters = 10
 
@@ -1620,6 +1013,7 @@ class SparseTest(PForTest):
                                         (num_iters, num_iters))
     self.run_and_assert_equal(pfor, manual)
 
+  @test_util.run_v1_only("b/122612051")
   def test_sparse_result_indices_stacked(self):
     num_iters = 10
 
@@ -1634,6 +1028,7 @@ class SparseTest(PForTest):
                                         [1] * num_iters, (num_iters, num_iters))
     self.run_and_assert_equal(pfor, manual)
 
+  @test_util.run_v1_only("b/122612051")
   def test_sparse_result_values_stacked(self):
     num_iters = 10
 
@@ -1648,6 +1043,7 @@ class SparseTest(PForTest):
                                         (num_iters, num_iters))
     self.run_and_assert_equal(pfor, manual)
 
+  @test_util.run_v1_only("b/122612051")
   def test_sparse_result_shapes_stacked(self):
     num_iters = 10
 
@@ -1661,6 +1057,7 @@ class SparseTest(PForTest):
                                         [1] * num_iters, (num_iters, num_iters))
     self.run_and_assert_equal(pfor, manual)
 
+  @test_util.run_v1_only("b/122612051")
   def test_sparse_result_shapes_stacked_2D(self):
     num_iters = 10
 
@@ -1677,7 +1074,7 @@ class SparseTest(PForTest):
     self.run_and_assert_equal(pfor, manual)
 
 
-class ParsingTest(PForTest):
+class ParsingTest(PForTestCase):
 
   def test_decode_csv(self):
     csv_tensor = constant_op.constant([["1:2:3"], ["::"], ["7:8:9"]])
@@ -1689,6 +1086,7 @@ class ParsingTest(PForTest):
 
     self._test_loop_fn(loop_fn, iters=3, loop_fn_dtypes=[dtypes.int32] * 3)
 
+  @test_util.run_v1_only("b/122612051")
   def test_parse_single_example(self):
 
     def _int64_feature(*values):
diff --git a/tensorflow/python/ops/parallel_for/gradients_test.py b/tensorflow/python/ops/parallel_for/gradients_test.py
index 4342833e3eb362e81ff9f60b4649cc5b8de6250f..69635c5a79c032514cdcd83af7e52b6953b2dc0b 100644
--- a/tensorflow/python/ops/parallel_for/gradients_test.py
+++ b/tensorflow/python/ops/parallel_for/gradients_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.keras.engine import training as keras_training
 from tensorflow.python.layers import layers as tf_layers
 from tensorflow.python.ops import array_ops
@@ -69,9 +70,10 @@ def fully_connected_model_fn(batch_size, activation_size, num_layers):
   return inp, model(inp)
 
 
-def lstm_model_fn(batch_size, state_size, steps):
+def lstm_model_fn(batch_size, state_size, steps, inputs_size=None):
+  inputs_size = inputs_size or state_size
   inputs = [
-      random_ops.random_normal([batch_size, state_size]) for _ in range(steps)
+      random_ops.random_normal([batch_size, inputs_size]) for _ in range(steps)
   ]
   cell = rnn_cell.BasicLSTMCell(state_size)
   init_state = cell.zero_state(batch_size, dtypes.float32)
@@ -107,8 +109,9 @@ def create_fc_batch_jacobian(batch_size, activation_size, num_layers):
   return pfor_jacobian, while_jacobian
 
 
-def create_lstm_batch_jacobian(batch_size, state_size, steps):
-  inp, output = lstm_model_fn(batch_size, state_size, steps)
+def create_lstm_batch_jacobian(batch_size, state_size, steps, inputs_size=None):
+  inp, output = lstm_model_fn(batch_size, state_size, steps,
+                              inputs_size=inputs_size)
   pfor_jacobian = gradients.batch_jacobian(output, inp, use_pfor=True)
   while_jacobian = gradients.batch_jacobian(output, inp, use_pfor=False)
   return pfor_jacobian, while_jacobian
@@ -180,9 +183,10 @@ def create_fc_per_eg_grad(batch_size, activation_size, num_layers):
   return pfor_outputs, while_outputs
 
 
-def create_lstm_per_eg_grad(batch_size, state_size, steps):
+def create_lstm_per_eg_grad(batch_size, state_size, steps, inputs_size=None):
+  inputs_size = inputs_size or state_size
   inputs = [
-      random_ops.random_normal([batch_size, state_size]) for _ in range(steps)
+      random_ops.random_normal([batch_size, inputs_size]) for _ in range(steps)
   ]
   cell = rnn_cell.BasicLSTMCell(state_size)
   init_state = cell.zero_state(batch_size, dtypes.float32)
@@ -297,6 +301,16 @@ def create_mnist_per_eg_grad(batch_size, data_format, training):
   return pfor_outputs, while_outputs
 
 
+def create_mnist_batch_jacobian(batch_size, data_format, training):
+  images = random_ops.random_uniform([batch_size, 28, 28])
+  model = Mnist(data_format)
+  logits = model(images, training=training)
+
+  pfor_jacobian = gradients.batch_jacobian(logits, images, use_pfor=True)
+  while_jacobian = gradients.batch_jacobian(logits, images, use_pfor=False)
+  return pfor_jacobian, while_jacobian
+
+
 def create_mnist_per_eg_jacobian(batch_size, data_format, training):
   images = random_ops.random_uniform([batch_size, 28, 28])
   model = Mnist(data_format)
@@ -338,6 +352,7 @@ def create_fc_per_eg_jacobians(batch_size, activation_size, num_layers):
   return jacobians, per_eg_jacobians_pfor, per_eg_jacobians_while
 
 
+@test_util.run_v1_only("b/122612051")
 class GradientsTest(test.TestCase):
 
   def run_and_assert_equal(self, targets1, targets2, atol=1e-4, rtol=1e-4):
@@ -477,9 +492,11 @@ class GradientsTest(test.TestCase):
     self.run_and_assert_equal(pfor_jacobian, while_jacobian)
 
   def test_lstm_batch_jacobian(self):
-    pfor_jacobian, while_jacobian = create_lstm_batch_jacobian(8, 4, 2)
+    pfor_jacobian, while_jacobian = create_lstm_batch_jacobian(8, 4, 2,
+                                                               inputs_size=128)
     self.run_and_assert_equal(pfor_jacobian, while_jacobian)
 
+  @test_util.disable_xla("This test never passed for XLA")
   def test_dynamic_lstm_batch_jacobian(self):
     pfor_jacobian, while_gradients = create_dynamic_lstm_batch_jacobian(8, 4, 3)
     with session.Session() as sess:
@@ -566,7 +583,7 @@ class GradientsBenchmarks(test.Benchmark):
       for _ in range(iters):
         self.evaluate(targets)
       end = time.time()
-    avg_time_ms = 1000 * (end - begin) / iters
+    avg_time_ms = (1000 * (end - begin)) / iters
     self.report_benchmark(iters=iters, wall_time=avg_time_ms, name=name)
     return avg_time_ms
 
@@ -578,7 +595,8 @@ class GradientsBenchmarks(test.Benchmark):
 
   def benchmark_lstm_batch_jacobian(self):
     with ops.Graph().as_default():
-      pfor_jacobian, while_jacobian = create_lstm_batch_jacobian(100, 32, 8)
+      pfor_jacobian, while_jacobian = create_lstm_batch_jacobian(
+          100, 32, 8, inputs_size=128)
       self._run(pfor_jacobian, 100, name="lstm_batch_jacobian_pfor")
       self._run(while_jacobian, 20, name="lstm_batch_jacobian_while")
 
@@ -627,13 +645,26 @@ class GradientsBenchmarks(test.Benchmark):
 
   def benchmark_mnist_per_eg_jacobian(self):
     with ops.Graph().as_default():
-      data_format = ("channels_first"
-                     if test.is_gpu_available() else "channels_last")
+      if test.is_gpu_available():
+        data_format = "channels_first"
+      else:
+        data_format = "channels_last"
       pfor_outputs, while_outputs = create_mnist_per_eg_jacobian(
           16, data_format, training=True)
       self._run(pfor_outputs, 20, name="mnist_per_eg_jacobian_pfor")
       self._run(while_outputs, 20, name="mnist_per_eg_jacobian_while")
 
+  def benchmark_mnist_batch_jacobian(self):
+    with ops.Graph().as_default():
+      if test.is_gpu_available():
+        data_format = "channels_first"
+      else:
+        data_format = "channels_last"
+      pfor_outputs, while_outputs = create_mnist_batch_jacobian(
+          128, data_format, training=True)
+      self._run(pfor_outputs, 20, name="mnist_batch_jacobian_pfor")
+      self._run(while_outputs, 20, name="mnist_batch_jacobian_while")
+
   def benchmark_fc_per_eg_jacobian(self):
     with ops.Graph().as_default():
       jacobians, per_eg_jacobians_pfor, per_eg_jacobians_while = (
diff --git a/tensorflow/python/ops/parallel_for/math_test.py b/tensorflow/python/ops/parallel_for/math_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a081e194f14ddc1eed4aed846a02706c051a71a
--- /dev/null
+++ b/tensorflow/python/ops/parallel_for/math_test.py
@@ -0,0 +1,428 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for vectorization of math kernels."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import backprop
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import tensor_array_grad  # pylint: disable=unused-import
+from tensorflow.python.ops.parallel_for.test_util import PForTestCase
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class MathTest(PForTestCase):
+
+  def test_unary_cwise_ops(self):
+    complex_ops = [
+        math_ops.angle,
+        math_ops.imag,
+        math_ops.complex_abs,
+        math_ops.real,
+        math_ops.conj,
+    ]
+    real_ops = [
+        lambda x: math_ops.acosh(1 + math_ops.square(x)),
+        math_ops.abs,
+        math_ops.acos,
+        math_ops.asin,
+        math_ops.asinh,
+        math_ops.atan,
+        math_ops.atanh,
+        math_ops.bessel_i0e,
+        math_ops.bessel_i1e,
+        math_ops.cos,
+        math_ops.cosh,
+        math_ops.digamma,
+        math_ops.erf,
+        math_ops.erfc,
+        math_ops.exp,
+        math_ops.expm1,
+        math_ops.inv,
+        math_ops.is_finite,
+        math_ops.is_inf,
+        math_ops.lgamma,
+        math_ops.log,
+        math_ops.log1p,
+        math_ops.neg,
+        math_ops.negative,
+        math_ops.reciprocal,
+        math_ops.rint,
+        math_ops.round,
+        math_ops.rsqrt,
+        math_ops.sigmoid,
+        math_ops.sign,
+        math_ops.sin,
+        math_ops.sinh,
+        math_ops.sqrt,
+        math_ops.square,
+        math_ops.tan,
+        math_ops.tanh,
+        math_ops.tanh,
+        nn.elu,
+        nn.relu,
+        nn.relu6,
+        nn.selu,
+        nn.softplus,
+        nn.softsign,
+    ]
+    for op in complex_ops + real_ops:
+      with backprop.GradientTape(persistent=True) as g:
+        x = random_ops.random_uniform([3, 5])
+        g.watch(x)
+        if op in complex_ops:
+          y = random_ops.random_uniform([3, 5])
+          g.watch(y)
+          x = math_ops.complex(x, y)
+
+      # pylint: disable=cell-var-from-loop
+      output_dtypes = []
+      def loop_fn(i):
+        with g:
+          x1 = array_ops.gather(x, i)
+          y1 = op(x1)
+          outputs = [op(x), y1]
+          if y1.dtype == dtypes.float32:
+            loss = math_ops.reduce_sum(y1 * y1)
+          else:
+            loss = None
+        if loss is not None:
+          grad = g.gradient(loss, x1)
+          if grad is not None:
+            outputs.append(grad)
+        del output_dtypes[:]
+        output_dtypes.extend([t.dtype for t in outputs])
+        return outputs
+
+      # pylint: enable=cell-var-from-loop
+
+      self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=output_dtypes)
+
+  def test_unary_cwise_no_grad(self):
+    for op in [math_ops.ceil,
+               math_ops.floor,
+               math_ops.logical_not]:
+      x = random_ops.random_uniform([3, 5])
+      if op == math_ops.logical_not:
+        x = x > 0
+
+      # pylint: disable=cell-var-from-loop
+      def loop_fn(i):
+        return op(array_ops.gather(x, i))
+
+      # pylint: enable=cell-var-from-loop
+
+      self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=x.dtype)
+
+  def test_binary_cwise_ops(self):
+    logical_ops = [
+        math_ops.logical_and,
+        math_ops.logical_or,
+        math_ops.logical_xor
+    ]
+
+    # Wrapper functions restricting the range of inputs of zeta and polygamma.
+    def safe_polygamma(x, y):
+      return math_ops.polygamma(
+          math_ops.round(clip_ops.clip_by_value(y, 1, 10)),
+          x * x + 1)
+
+    def safe_zeta(x, y):
+      return math_ops.zeta(x * x + 1, y * y)
+
+    float_ops = [
+        math_ops.add,
+        math_ops.add_v2,
+        math_ops.atan2,
+        math_ops.complex,
+        math_ops.div,
+        math_ops.divide,
+        math_ops.div_no_nan,
+        math_ops.equal,
+        math_ops.floor_mod,
+        math_ops.greater,
+        math_ops.greater_equal,
+        math_ops.igamma,
+        math_ops.igammac,
+        math_ops.igamma_grad_a,
+        math_ops.less,
+        math_ops.less_equal,
+        math_ops.maximum,
+        math_ops.minimum,
+        math_ops.mod,
+        math_ops.multiply,
+        math_ops.not_equal,
+        math_ops.pow,
+        math_ops.squared_difference,
+        math_ops.subtract,
+        math_ops.truncate_mod,
+        safe_polygamma,
+        safe_zeta,
+    ]
+    # FloorDiv fails on XLA due floor's discontinuities exacerbating small
+    # division differences.
+    if not test_util.is_xla_enabled():
+      float_ops += [math_ops.floor_div]
+    for op in logical_ops + float_ops:
+      x = random_ops.random_uniform([7, 3, 5])
+      y = random_ops.random_uniform([3, 5])
+      if op in logical_ops:
+        x = x > 0
+        y = y > 0
+
+      output_dtypes = []
+      # pylint: disable=cell-var-from-loop
+      def loop_fn(i):
+        x1 = array_ops.gather(x, i)
+        y1 = array_ops.gather(y, i)
+        outputs = [op(x, y), op(x1, y), op(x, y1), op(x1, y1), op(x1, x1)]
+        del output_dtypes[:]
+        output_dtypes.extend([t.dtype for t in outputs])
+        return outputs
+      # pylint: enable=cell-var-from-loop
+
+      self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=output_dtypes)
+
+  def test_approximate_equal(self):
+    x = random_ops.random_uniform([3, 5])
+    y = random_ops.random_uniform([3, 5])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      y1 = array_ops.gather(y, i)
+      return math_ops.approximate_equal(x1, y1)
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.bool])
+
+  def test_addn(self):
+    x = random_ops.random_uniform([2, 3, 5])
+    y = random_ops.random_uniform([3, 5])
+    z = random_ops.random_uniform([3, 5])
+
+    def loop_fn(i):
+      x1 = array_ops.gather(x, i)
+      return math_ops.add_n([x1, y, z])
+
+    self._test_loop_fn(loop_fn, 2)
+
+  def test_matmul(self):
+    for tr_a in (True, False):
+      for tr_b in (True, False):
+        for stack_a in (True, False):
+          for stack_b in (True, False):
+            shape_a = (5, 3) if tr_a else (3, 5)
+            if stack_a:
+              shape_a = (2,) + shape_a
+            shape_b = (7, 5) if tr_b else (5, 7)
+            if stack_b:
+              shape_b = (2,) + shape_b
+
+            x = random_ops.random_uniform(shape_a)
+            y = random_ops.random_uniform(shape_b)
+
+            # pylint: disable=cell-var-from-loop
+            def loop_fn(i):
+              a = array_ops.gather(x, i) if stack_a else x
+              b = array_ops.gather(y, i) if stack_b else y
+              return math_ops.matmul(a, b, transpose_a=tr_a, transpose_b=tr_b)
+
+            # pylint: enable=cell-var-from-loop
+
+            self._test_loop_fn(loop_fn, 2)
+
+  def test_batch_matmul(self):
+    for tr_a in (True, False):
+      for tr_b in (True, False):
+        for stack_a in (True, False):
+          for stack_b in (True, False):
+            shape_a = (4, 5, 3) if tr_a else (4, 3, 5)
+            if stack_a:
+              shape_a = (2,) + shape_a
+            shape_b = (4, 7, 5) if tr_b else (4, 5, 7)
+            if stack_b:
+              shape_b = (2,) + shape_b
+
+            x = random_ops.random_uniform(shape_a)
+            y = random_ops.random_uniform(shape_b)
+
+            # pylint: disable=cell-var-from-loop
+            def loop_fn(i):
+              a = array_ops.gather(x, i) if stack_a else x
+              b = array_ops.gather(y, i) if stack_b else y
+              return math_ops.matmul(a, b, transpose_a=tr_a, transpose_b=tr_b)
+
+            # pylint: enable=cell-var-from-loop
+
+            self._test_loop_fn(loop_fn, 2)
+
+  def test_reduction(self):
+    x = random_ops.random_uniform([2, 3, 4, 5])
+    for op in [
+        math_ops.reduce_sum, math_ops.reduce_prod, math_ops.reduce_max,
+        math_ops.reduce_min, math_ops.reduce_mean,
+    ]:
+      for axis in ([1], None, [0, 2]):
+        for keepdims in (True, False):
+
+          # pylint: disable=cell-var-from-loop
+          def loop_fn(i):
+            a = array_ops.gather(x, i)
+            return op(a, axis=axis, keepdims=keepdims)
+
+          # pylint: enable=cell-var-from-loop
+
+          self._test_loop_fn(loop_fn, 2)
+
+  def test_cum_sum(self):
+    x = random_ops.random_uniform([2, 3, 4, 5])
+    for axis in (1, -2):
+      for exclusive in (True, False):
+        for reverse in (True, False):
+
+          # pylint: disable=cell-var-from-loop
+          def loop_fn(i):
+            a = array_ops.gather(x, i)
+            return math_ops.cumsum(
+                a, axis=axis, exclusive=exclusive, reverse=reverse)
+
+          # pylint: enable=cell-var-from-loop
+
+          self._test_loop_fn(loop_fn, 2)
+
+  def test_cum_prod(self):
+    x = random_ops.random_uniform([2, 3, 4, 5])
+    for axis in (1, -2):
+      for exclusive in (True, False):
+        for reverse in (True, False):
+
+          # pylint: disable=cell-var-from-loop
+          def loop_fn(i):
+            a = array_ops.gather(x, i)
+            return math_ops.cumprod(
+                a, axis=axis, exclusive=exclusive, reverse=reverse)
+
+          # pylint: enable=cell-var-from-loop
+
+          self._test_loop_fn(loop_fn, 2)
+
+  def test_bias_add(self):
+    for data_format in ("NCHW", "NHWC"):
+      for stacked_value in (True, False):
+        x_shape = [3, 4, 5, 6]
+        if stacked_value:
+          x_shape = [2] + x_shape
+        x = random_ops.random_uniform(x_shape)
+        for stacked_bias in (True, False):
+          if not (stacked_value or stacked_bias):
+            continue
+          with backprop.GradientTape(persistent=True) as g:
+            bias_dim = -1
+            if data_format == "NCHW":
+              bias_dim = 2 if stacked_value else 1
+            bias_shape = [x_shape[bias_dim]]
+            if stacked_bias:
+              bias_shape = [2] + bias_shape
+            bias = random_ops.random_uniform(bias_shape)
+            g.watch(bias)
+
+          # pylint: disable=cell-var-from-loop
+          def loop_fn(i):
+            with g:
+              a = array_ops.gather(x, i) if stacked_value else x
+              b = array_ops.gather(bias, i) if stacked_bias else bias
+              y = nn.bias_add(a, b, data_format=data_format)
+              loss = math_ops.reduce_sum(y * y)
+            grad = g.gradient(loss, bias)
+            if stacked_bias:
+              # If we gather over bias in loop_fn, the gradient will be an
+              # instance of `IndexedSlices` with attrs `values` and `indices`.
+              return y, grad.values, grad.indices
+            else:
+              return y, grad
+          # pylint: enable=cell-var-from-loop
+
+          out_dtypes = [dtypes.float32, dtypes.float32]
+          if stacked_bias:
+            out_dtypes = out_dtypes + [dtypes.int32]
+          self._test_loop_fn(
+              loop_fn, 2, loop_fn_dtypes=out_dtypes)
+
+  def test_unsorted_segment_sum(self):
+    t = random_ops.random_uniform([3, 3, 2])
+    segment_ids = constant_op.constant([[0, 0, 2], [0, 1, 2], [2, 2, 2]])
+    num_segments = 3
+
+    def loop_fn(i):
+      data = array_ops.gather(t, i)
+      data_0 = array_ops.gather(t, 0)
+      seg_ids = array_ops.gather(segment_ids, i)
+      return (math_ops.unsorted_segment_sum(data, seg_ids, num_segments),
+              math_ops.unsorted_segment_sum(data_0, seg_ids, num_segments))
+
+    self._test_loop_fn(loop_fn, 3, [dtypes.float32] * 2)
+
+  def test_cast(self):
+    x = constant_op.constant([[1], [2]])
+    y = constant_op.constant([[1.0], [2.0]])
+
+    def loop_fn(i):
+      return (math_ops.cast(array_ops.gather(x, i), dtypes.float32),
+              math_ops.cast(array_ops.gather(y, i), dtypes.int32))
+
+    self._test_loop_fn(
+        loop_fn, 2, loop_fn_dtypes=[dtypes.float32, dtypes.int32])
+
+  def test_tanh_axpy(self):
+    a = constant_op.constant(3.)
+    x = random_ops.random_uniform([4, 5])
+    y = random_ops.random_uniform([6, 5])
+    n = x.shape[0]
+
+    def loop_fn(i):
+      return math_ops.tanh(a * array_ops.gather(x, i) + array_ops.gather(y, i))
+
+    self._test_loop_fn(loop_fn, n)
+
+  def test_select(self):
+    cond = constant_op.constant([True, False])
+    a = random_ops.random_uniform([2, 3, 5])
+    b = random_ops.random_uniform([2, 3, 5])
+    for cond_shape in [2], [2, 3], [2, 3, 5]:
+      cond = random_ops.random_uniform(cond_shape) > 0.5
+
+      # pylint: disable=cell-var-from-loop
+      def loop_fn(i):
+        a_i = array_ops.gather(a, i)
+        b_i = array_ops.gather(b, i)
+        cond_i = array_ops.gather(cond, i)
+        return array_ops.where(cond_i, a_i, b_i)
+
+      # pylint: enable=cell-var-from-loop
+
+      self._test_loop_fn(loop_fn, 2)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/ops/parallel_for/pfor.py b/tensorflow/python/ops/parallel_for/pfor.py
index a22c1126c93915da7acc5221594567f855557b84..b9f7a0ffca5af9b1283da0d7b1f88d9bad8aba2b 100644
--- a/tensorflow/python/ops/parallel_for/pfor.py
+++ b/tensorflow/python/ops/parallel_for/pfor.py
@@ -32,9 +32,9 @@ from tensorflow.python.ops import bitwise_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
-from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gen_parsing_ops
 from tensorflow.python.ops import gen_sparse_ops
+from tensorflow.python.ops import map_fn
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import parsing_ops
@@ -42,6 +42,7 @@ from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.platform import flags
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import compat
 from tensorflow.python.util import nest
 
 flags.DEFINE_bool(
@@ -1037,7 +1038,7 @@ class PFor(object):
     if sparse_tensor_rank is not None:
       sparse_tensor_rank += 1
 
-    def map_fn(args):
+    def fn(args):
       res = gen_sparse_ops.serialize_sparse(
           args[0], args[1], args[2], out_type=dtypes.variant)
       return res
@@ -1046,8 +1047,8 @@ class PFor(object):
     # sparse tensor element and batch them all, then deserializes the batch.
     # TODO(rachelim): Try to do this without map_fn -- add the right offsets
     # to shape and indices tensors instead.
-    result = functional_ops.map_fn(
-        map_fn, [indices, values, shape], dtype=dtypes.variant)
+    result = map_fn.map_fn(
+        fn, [indices, values, shape], dtype=dtypes.variant)
     return sparse_ops.deserialize_sparse(
         result, dtype=values.dtype, rank=sparse_tensor_rank)
 
@@ -1876,6 +1877,7 @@ def _convert_batch_mat_mul(pfor_input):
 @RegisterPForWithArgs("Prod", math_ops.reduce_prod)
 @RegisterPForWithArgs("Max", math_ops.reduce_max)
 @RegisterPForWithArgs("Min", math_ops.reduce_min)
+@RegisterPForWithArgs("Mean", math_ops.reduce_mean)
 def _convert_reduction(pfor_input, _, op_func):
   t = pfor_input.stacked_input(0)
   indices = pfor_input.unstacked_input(1)
@@ -1899,17 +1901,30 @@ def _convert_cumfoo(pfor_input, _, op_func):
 
 @RegisterPFor("BiasAdd")
 def _convert_biasadd(pfor_input):
-  t = pfor_input.stacked_input(0)
-  bias = pfor_input.unstacked_input(1)
-  data_format = pfor_input.get_attr("data_format")
-  if data_format != b"NCHW":
+  t, t_stacked, _ = pfor_input.input(0)
+  bias, bias_stacked, _ = pfor_input.input(1)
+  data_format = pfor_input.get_attr("data_format").decode()
+  if bias_stacked:
+    # BiasAdd only supports 1-D biases, so cast bias to match value and use Add.
+    pfor_input.expanddim_inputs_for_broadcast()
+    t, _, _ = pfor_input.input(0)
+    bias = math_ops.cast(pfor_input.stacked_input(1), t.dtype)
+    if compat.as_bytes(data_format) == b"NCHW":
+      b_shape = array_ops.shape(bias)
+      new_b_shape = array_ops.concat(
+          [b_shape[:-3], b_shape[-1:], b_shape[-3:-1]], axis=0)
+      bias = array_ops.reshape(bias, new_b_shape)
+    return wrap(math_ops.add(t, bias), True)
+  else:
+    assert t_stacked, "At least one input to BiasAdd should be loop variant."
+    if compat.as_bytes(data_format) == b"NCHW":
+      shape = array_ops.shape(t)
+      flattened_shape = array_ops.concat([[-1], shape[2:]], axis=0)
+      t = array_ops.reshape(t, flattened_shape)
+      t = nn_ops.bias_add(t, bias, data_format="NCHW")
+      t = array_ops.reshape(t, shape)
+      return wrap(t, True)
     return wrap(nn_ops.bias_add(t, bias, data_format=data_format), True)
-  shape = array_ops.shape(t)
-  flattened_shape = array_ops.concat([[-1], shape[2:]], axis=0)
-  t = array_ops.reshape(t, flattened_shape)
-  t = nn_ops.bias_add(t, bias, data_format=b"NCHW")
-  t = array_ops.reshape(t, shape)
-  return wrap(t, True)
 
 
 @RegisterPFor("UnsortedSegmentSum")
diff --git a/tensorflow/python/ops/parallel_for/test_util.py b/tensorflow/python/ops/parallel_for/test_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b4ef2239e5dc2eb7614d167777821437ae1e812
--- /dev/null
+++ b/tensorflow/python/ops/parallel_for/test_util.py
@@ -0,0 +1,59 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test utility."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import variables
+from tensorflow.python.ops.parallel_for import control_flow_ops as pfor_control_flow_ops
+from tensorflow.python.platform import test
+from tensorflow.python.util import nest
+
+
+class PForTestCase(test.TestCase):
+  """Base class for test cases."""
+
+  def _run_targets(self, targets1, targets2=None, run_init=True):
+    targets1 = nest.flatten(targets1)
+    targets2 = ([] if targets2 is None else nest.flatten(targets2))
+    assert len(targets1) == len(targets2) or not targets2
+    if run_init:
+      init = variables.global_variables_initializer()
+      self.evaluate(init)
+    return self.evaluate(targets1 + targets2)
+
+  def run_and_assert_equal(self, targets1, targets2):
+    outputs = self._run_targets(targets1, targets2)
+    outputs = nest.flatten(outputs)  # flatten SparseTensorValues
+    n = len(outputs) // 2
+    for i in range(n):
+      if outputs[i + n].dtype != np.object:
+        self.assertAllClose(outputs[i + n], outputs[i], rtol=1e-4, atol=1e-5)
+      else:
+        self.assertAllEqual(outputs[i + n], outputs[i])
+
+  def _test_loop_fn(self, loop_fn, iters,
+                    loop_fn_dtypes=dtypes.float32,
+                    parallel_iterations=None):
+    t1 = pfor_control_flow_ops.pfor(loop_fn, iters=iters,
+                                    parallel_iterations=parallel_iterations)
+    t2 = pfor_control_flow_ops.for_loop(loop_fn, loop_fn_dtypes, iters=iters,
+                                        parallel_iterations=parallel_iterations)
+    self.run_and_assert_equal(t1, t2)
diff --git a/tensorflow/python/ops/ragged/BUILD b/tensorflow/python/ops/ragged/BUILD
index d88543c400f2432ea620ccddcab983337abe3fc2..4f29fcc41d07c90dadefb6fcb503f07b3ab62473 100644
--- a/tensorflow/python/ops/ragged/BUILD
+++ b/tensorflow/python/ops/ragged/BUILD
@@ -1,3 +1,5 @@
+load("//tensorflow:tensorflow.bzl", "py_test")
+
 package(
     default_visibility = [
         "//intelligence/datum/prensor:__pkg__",
@@ -11,8 +13,6 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load("//tensorflow:tensorflow.bzl", "py_test")
-
 #-------------------------------------------------------------------------------
 # RaggedTensor
 #-------------------------------------------------------------------------------
@@ -24,10 +24,14 @@ py_library(
     tags = ["nofixdeps"],
     deps = [
         ":ragged_array_ops",
+        ":ragged_batch_gather_ops",
+        ":ragged_batch_gather_with_default_op",
+        ":ragged_concat_ops",
         ":ragged_conversion_ops",
         ":ragged_dispatch",
         ":ragged_factory_ops",
         ":ragged_functional_ops",
+        ":ragged_gather_ops",
         ":ragged_getitem",
         ":ragged_map_ops",
         ":ragged_math_ops",
@@ -37,6 +41,7 @@ py_library(
         ":ragged_tensor_shape",
         ":ragged_tensor_value",
         ":ragged_util",
+        ":ragged_where_op",
         ":segment_id_ops",
         "//tensorflow/python:util",
     ],
@@ -48,7 +53,6 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":ragged_conversion_ops",
-        ":ragged_factory_ops",
         ":ragged_functional_ops",
         ":ragged_math_ops",
         ":ragged_tensor",
@@ -56,6 +60,7 @@ py_library(
         ":segment_id_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
+        "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
@@ -65,6 +70,68 @@ py_library(
     ],
 )
 
+py_library(
+    name = "ragged_batch_gather_ops",
+    srcs = ["ragged_batch_gather_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_array_ops",
+        ":ragged_concat_ops",
+        ":ragged_conversion_ops",
+        ":ragged_gather_ops",
+        ":ragged_tensor",
+        ":ragged_tensor_shape",
+        ":ragged_util",
+        ":ragged_where_op",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+    ],
+)
+
+py_library(
+    name = "ragged_batch_gather_with_default_op",
+    srcs = [
+        "ragged_batch_gather_with_default_op.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_array_ops",
+        ":ragged_batch_gather_ops",
+        ":ragged_concat_ops",
+        ":ragged_dispatch",
+        ":ragged_operators",
+        ":ragged_tensor",
+        ":ragged_tensor_shape",
+        ":ragged_where_op",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+    ],
+)
+
+py_library(
+    name = "ragged_concat_ops",
+    srcs = ["ragged_concat_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_array_ops",
+        ":ragged_conversion_ops",
+        ":ragged_gather_ops",
+        ":ragged_tensor",
+        ":ragged_util",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:tensor_shape",
+    ],
+)
+
 py_library(
     name = "ragged_conversion_ops",
     srcs = ["ragged_conversion_ops.py"],
@@ -82,6 +149,7 @@ py_library(
         "//tensorflow/python:ragged_conversion_ops_gen",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:util",
     ],
 )
 
@@ -95,6 +163,7 @@ py_library(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:tensor_util",
+        "//tensorflow/python:util",
         "//tensorflow/python/ops/ragged:ragged_tensor",
         "//tensorflow/python/ops/ragged:ragged_tensor_value",
         "//third_party/py/numpy",
@@ -110,6 +179,24 @@ py_library(
         ":ragged_tensor",
         ":ragged_util",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:util",
+    ],
+)
+
+py_library(
+    name = "ragged_gather_ops",
+    srcs = ["ragged_gather_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_array_ops",
+        ":ragged_conversion_ops",
+        ":ragged_tensor",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:ragged_array_ops_gen",
+        "//tensorflow/python:tensor_shape",
     ],
 )
 
@@ -118,8 +205,8 @@ py_library(
     srcs = ["ragged_getitem.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged_array_ops",
         ":ragged_factory_ops",
+        ":ragged_gather_ops",
         ":ragged_math_ops",
         ":ragged_tensor",
         "//tensorflow/python:array_ops",
@@ -147,6 +234,7 @@ py_library(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:ragged_math_ops_gen",
         "//tensorflow/python:tensor_util",
+        "//tensorflow/python:util",
     ],
 )
 
@@ -167,12 +255,15 @@ py_library(
     srcs = ["ragged_string_ops.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":ragged_array_ops",
         ":ragged_conversion_ops",
         ":ragged_factory_ops",
         ":ragged_tensor",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:string_ops",
         "//tensorflow/python:util",
     ],
@@ -187,8 +278,10 @@ py_library(
         ":ragged_util",
         ":segment_id_ops",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:ragged_conversion_ops_gen",
         "//tensorflow/python:session",
         "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:util",
     ],
 )
 
@@ -216,7 +309,10 @@ py_library(
     name = "ragged_tensor_value",
     srcs = ["ragged_tensor_value.py"],
     srcs_version = "PY2AND3",
-    deps = ["//third_party/py/numpy"],
+    deps = [
+        "//tensorflow/python:util",
+        "//third_party/py/numpy",
+    ],
 )
 
 py_library(
@@ -233,6 +329,22 @@ py_library(
     ],
 )
 
+py_library(
+    name = "ragged_where_op",
+    srcs = ["ragged_where_op.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_concat_ops",
+        ":ragged_functional_ops",
+        ":ragged_gather_ops",
+        ":ragged_tensor",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+    ],
+)
+
 py_library(
     name = "segment_id_ops",
     srcs = ["segment_id_ops.py"],
@@ -245,6 +357,7 @@ py_library(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python:tensor_util",
+        "//tensorflow/python:util",
     ],
 )
 
@@ -253,17 +366,17 @@ py_library(
     srcs = ["ragged_map_ops.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged_array_ops",
-        ":ragged_factory_ops",
         ":ragged_tensor",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
         "//tensorflow/python:platform",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:tensor_array_ops",
+        "//tensorflow/python:tensor_shape",
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/eager:context",
@@ -276,12 +389,13 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":ragged_array_ops",
-        ":ragged_factory_ops",
+        ":ragged_batch_gather_ops",
         ":ragged_math_ops",
         ":ragged_tensor",
         ":ragged_tensor_shape",
         ":ragged_util",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:bitwise_ops",
         "//tensorflow/python:clip_ops",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
@@ -289,6 +403,7 @@ py_library(
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:string_ops",
         "//tensorflow/python:util",
+        "//tensorflow/python:variables",
         "//third_party/py/numpy",
     ],
 )
@@ -306,7 +421,7 @@ py_library(
         ":ragged_tensor_value",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -319,14 +434,20 @@ py_test(
         "no_windows",
     ],
     deps = [
-        ":ragged",
+        ":ragged",  # fixdeps: keep
+        ":ragged_factory_ops",
+        ":ragged_math_ops",
+        ":ragged_tensor",
+        ":ragged_tensor_value",
         ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
+        "//tensorflow/python/eager:context",
         "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
     ],
@@ -338,10 +459,9 @@ py_test(
     srcs = ["ragged_eager_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged_factory_ops",
         ":ragged_test_util",
         "//tensorflow/python:framework_ops",
-        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
         "@absl_py//absl/testing:parameterized",
     ],
@@ -352,7 +472,7 @@ py_test(
     srcs = ["ragged_range_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged_math_ops",
         ":ragged_test_util",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_test_lib",
@@ -365,7 +485,8 @@ py_test(
     srcs = ["ragged_tensor_bounding_shape_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged_factory_ops",
+        ":ragged_tensor",
         ":ragged_test_util",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
@@ -377,8 +498,10 @@ py_test(
     srcs = ["ragged_row_lengths_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged_factory_ops",
+        ":ragged_tensor",
         ":ragged_test_util",
+        "//tensorflow/python:errors",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
         "@absl_py//absl/testing:parameterized",
@@ -390,7 +513,9 @@ py_test(
     srcs = ["ragged_gather_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged_array_ops",
+        ":ragged_factory_ops",
+        ":ragged_gather_ops",
         ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
@@ -399,6 +524,7 @@ py_test(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
+        "//tensorflow/python/eager:context",
     ],
 )
 
@@ -407,13 +533,18 @@ py_test(
     srcs = ["ragged_batch_gather_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged_array_ops",
+        ":ragged_batch_gather_ops",
+        ":ragged_batch_gather_with_default_op",
+        ":ragged_factory_ops",
+        ":ragged_tensor",
         ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
+        "//tensorflow/python/eager:context",
         "@absl_py//absl/testing:parameterized",
     ],
 )
@@ -423,12 +554,15 @@ py_test(
     srcs = ["ragged_gather_nd_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged_factory_ops",
+        ":ragged_gather_ops",
         ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
+        "//tensorflow/python/eager:context",
         "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
     ],
@@ -439,8 +573,8 @@ py_test(
     srcs = ["ragged_row_splits_to_segment_ids_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
         ":ragged_test_util",
+        ":segment_id_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
@@ -452,8 +586,8 @@ py_test(
     srcs = ["ragged_segment_ids_to_row_splits_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
         ":ragged_test_util",
+        ":segment_id_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
@@ -465,7 +599,7 @@ py_test(
     srcs = ["ragged_from_tensor_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged_tensor",
         ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
@@ -484,7 +618,10 @@ py_test(
         "no_windows",
     ],
     deps = [
-        ":ragged",
+        ":ragged",  # fixdeps: keep
+        ":ragged_factory_ops",
+        ":ragged_functional_ops",
+        ":ragged_tensor",
         ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
@@ -493,6 +630,7 @@ py_test(
         "//tensorflow/python:gradients_impl",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
+        "//tensorflow/python/eager:context",
     ],
 )
 
@@ -501,7 +639,7 @@ py_test(
     srcs = ["ragged_from_sparse_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged_tensor",
         ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
@@ -509,6 +647,7 @@ py_test(
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
         "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python/eager:context",
     ],
 )
 
@@ -517,7 +656,7 @@ py_test(
     srcs = ["ragged_to_tensor_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged_factory_ops",
         ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:framework_ops",
@@ -532,7 +671,9 @@ py_test(
     srcs = ["ragged_segment_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged_factory_ops",
+        ":ragged_math_ops",
+        ":ragged_tensor",
         ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
@@ -548,13 +689,15 @@ py_test(
     srcs = ["ragged_reduce_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged_factory_ops",
+        ":ragged_math_ops",
         ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
+        "//tensorflow/python/eager:context",
         "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
     ],
@@ -565,7 +708,9 @@ py_test(
     srcs = ["ragged_map_flat_values_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged_factory_ops",
+        ":ragged_functional_ops",
+        ":ragged_tensor",
         ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
@@ -574,7 +719,6 @@ py_test(
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
-        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -584,6 +728,8 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":ragged",
+        ":ragged_factory_ops",
+        ":ragged_tensor",
         ":ragged_test_util",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_test_lib",
@@ -600,7 +746,9 @@ py_test(
         "no_windows",
     ],
     deps = [
-        ":ragged",
+        ":ragged_factory_ops",
+        ":ragged_tensor",
+        ":ragged_tensor_value",
         ":ragged_test_util",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
@@ -614,7 +762,8 @@ py_test(
     srcs = ["convert_to_tensor_or_ragged_tensor_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged_factory_ops",
+        ":ragged_tensor",
         ":ragged_test_util",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
@@ -630,13 +779,15 @@ py_test(
     srcs = ["ragged_boolean_mask_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged_array_ops",
+        ":ragged_factory_ops",
         ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
+        "//tensorflow/python/eager:context",
         "@absl_py//absl/testing:parameterized",
     ],
 )
@@ -646,7 +797,8 @@ py_test(
     srcs = ["ragged_concat_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged_concat_ops",
+        ":ragged_factory_ops",
         ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
@@ -654,6 +806,7 @@ py_test(
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
+        "//tensorflow/python/eager:context",
         "@absl_py//absl/testing:parameterized",
     ],
 )
@@ -663,7 +816,8 @@ py_test(
     srcs = ["ragged_stack_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged_concat_ops",
+        ":ragged_factory_ops",
         ":ragged_test_util",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:framework_test_lib",
@@ -672,12 +826,26 @@ py_test(
     ],
 )
 
+py_test(
+    name = "ragged_rank_op_test",
+    srcs = ["ragged_rank_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_array_ops",
+        ":ragged_factory_ops",
+        ":ragged_test_util",
+        "//tensorflow/python:framework_test_lib",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
 py_test(
     name = "ragged_tile_op_test",
     srcs = ["ragged_tile_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged_array_ops",
+        ":ragged_factory_ops",
         ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
@@ -709,7 +877,8 @@ py_test(
     srcs = ["ragged_expand_dims_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged_array_ops",
+        ":ragged_factory_ops",
         ":ragged_test_util",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
@@ -722,8 +891,9 @@ py_test(
     srcs = ["ragged_where_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged_factory_ops",
         ":ragged_test_util",
+        ":ragged_where_op",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
         "@absl_py//absl/testing:parameterized",
@@ -735,7 +905,9 @@ py_test(
     srcs = ["ragged_dispatch_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged",  # fixdeps: keep
+        ":ragged_factory_ops",
+        ":ragged_tensor",
         ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:clip_ops",
@@ -746,7 +918,9 @@ py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python:platform_test",
+        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:string_ops",
+        "//tensorflow/python/eager:context",
         "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
     ],
@@ -757,7 +931,8 @@ py_test(
     srcs = ["ragged_operators_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged",  # fixdeps: keep
+        ":ragged_factory_ops",
         ":ragged_test_util",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
@@ -770,7 +945,12 @@ py_test(
     srcs = ["ragged_map_fn_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged",  # fixdeps: keep
+        ":ragged_factory_ops",
+        ":ragged_functional_ops",
+        ":ragged_map_ops",
+        ":ragged_math_ops",
+        ":ragged_tensor",
         ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
@@ -789,7 +969,10 @@ py_test(
     srcs = ["ragged_tensor_shape_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged",
+        ":ragged",  # fixdeps: keep
+        ":ragged_factory_ops",
+        ":ragged_tensor",
+        ":ragged_tensor_shape",
         ":ragged_test_util",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_test_lib",
@@ -798,3 +981,18 @@ py_test(
         "@absl_py//absl/testing:parameterized",
     ],
 )
+
+py_test(
+    name = "ragged_size_op_test",
+    srcs = ["ragged_size_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_array_ops",
+        ":ragged_factory_ops",
+        ":ragged_test_util",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
diff --git a/tensorflow/python/ops/ragged/__init__.py b/tensorflow/python/ops/ragged/__init__.py
index 3d915ee269b45571c9338ea1d734ddaa4b884a98..e9232a1c641c251ed61259ca6251f76fea785626 100644
--- a/tensorflow/python/ops/ragged/__init__.py
+++ b/tensorflow/python/ops/ragged/__init__.py
@@ -1,140 +1,52 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """Ragged Tensors.
 
-This package defines the `tf.RaggedTensor` class, which
-represents tensors with non-uniform shapes.  In particular, each `RaggedTensor`
+This package defines ops for manipulating ragged tensors (`tf.RaggedTensor`),
+which are tensors with non-uniform shapes.  In particular, each `RaggedTensor`
 has one or more *ragged dimensions*, which are dimensions whose slices may have
 different lengths.  For example, the inner (column) dimension of
 `rt=[[3, 1, 4, 1], [], [5, 9, 2], [6], []]` is ragged, since the column slices
 (`rt[0, :]`, ..., `rt[4, :]`) have different lengths.  For a more detailed
-description of ragged tensors, see the `tf.RaggedTensor`
-class documentation.
-
-<!-- Ragged Classes & related helper functions -->
-@@RaggedTensor
-@@RaggedTensorType
-@@RaggedTensorValue
-@@is_ragged
-
-<!-- Factory Ops -->
-@@ragged_factory_ops
-@@constant
-@@constant_value
-@@convert_to_tensor_or_ragged_tensor
-
-<!-- Conversion Ops -->
-@@from_tensor
-@@to_tensor
-@@from_sparse
-@@to_sparse
-@@row_splits_to_segment_ids
-@@segment_ids_to_row_splits
-
-<!-- Array Ops -->
-@@gather
-@@batch_gather
-@@gather_nd
-@@boolean_mask
-@@concat
-@@stack
-@@tile
-@@expand_dims
-@@where
-
-<!-- Math Ops -->
-@@range
-
-@@segment_sum
-@@segment_prod
-@@segment_min
-@@segment_max
-@@segment_mean
-@@segment_sqrt_n
-
-@@reduce_sum
-@@reduce_prod
-@@reduce_min
-@@reduce_max
-@@reduce_mean
-@@reduce_all
-@@reduce_any
-
-<!-- Functional Ops -->
-@@map_flat_values
-@@map_fn
-
-<!-- Shape & broadcasting -->
-@@RaggedTensorDynamicShape
-@@broadcast_to
-@@broadcast_dynamic_shape
+description of ragged tensors, see the `tf.RaggedTensor` class documentation
+and the [Ragged Tensor Guide](/guide/ragged_tensors).
 """
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_batch_gather_ops
+from tensorflow.python.ops.ragged import ragged_batch_gather_with_default_op
+from tensorflow.python.ops.ragged import ragged_concat_ops
+from tensorflow.python.ops.ragged import ragged_conversion_ops
 from tensorflow.python.ops.ragged import ragged_dispatch
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_functional_ops
+from tensorflow.python.ops.ragged import ragged_gather_ops
+from tensorflow.python.ops.ragged import ragged_getitem
+from tensorflow.python.ops.ragged import ragged_map_ops
+from tensorflow.python.ops.ragged import ragged_math_ops
 from tensorflow.python.ops.ragged import ragged_operators
 from tensorflow.python.ops.ragged import ragged_string_ops
-
-from tensorflow.python.ops.ragged.ragged_array_ops import batch_gather
-from tensorflow.python.ops.ragged.ragged_array_ops import boolean_mask
-from tensorflow.python.ops.ragged.ragged_array_ops import concat
-from tensorflow.python.ops.ragged.ragged_array_ops import expand_dims
-from tensorflow.python.ops.ragged.ragged_array_ops import gather
-from tensorflow.python.ops.ragged.ragged_array_ops import gather_nd
-from tensorflow.python.ops.ragged.ragged_array_ops import stack
-from tensorflow.python.ops.ragged.ragged_array_ops import tile
-from tensorflow.python.ops.ragged.ragged_array_ops import where
-
-from tensorflow.python.ops.ragged.ragged_conversion_ops import from_sparse
-from tensorflow.python.ops.ragged.ragged_conversion_ops import from_tensor
-from tensorflow.python.ops.ragged.ragged_conversion_ops import to_sparse
-from tensorflow.python.ops.ragged.ragged_conversion_ops import to_tensor
-
-from tensorflow.python.ops.ragged.ragged_factory_ops import constant
-from tensorflow.python.ops.ragged.ragged_factory_ops import constant_value
-
-from tensorflow.python.ops.ragged.ragged_functional_ops import map_flat_values
-
-from tensorflow.python.ops.ragged.ragged_map_ops import map_fn
-
-from tensorflow.python.ops.ragged.ragged_math_ops import range  # pylint: disable=redefined-builtin
-
-from tensorflow.python.ops.ragged.ragged_math_ops import reduce_all
-from tensorflow.python.ops.ragged.ragged_math_ops import reduce_any
-from tensorflow.python.ops.ragged.ragged_math_ops import reduce_max
-from tensorflow.python.ops.ragged.ragged_math_ops import reduce_mean
-from tensorflow.python.ops.ragged.ragged_math_ops import reduce_min
-from tensorflow.python.ops.ragged.ragged_math_ops import reduce_prod
-from tensorflow.python.ops.ragged.ragged_math_ops import reduce_sum
-
-from tensorflow.python.ops.ragged.ragged_math_ops import segment_max
-from tensorflow.python.ops.ragged.ragged_math_ops import segment_mean
-from tensorflow.python.ops.ragged.ragged_math_ops import segment_min
-from tensorflow.python.ops.ragged.ragged_math_ops import segment_prod
-from tensorflow.python.ops.ragged.ragged_math_ops import segment_sqrt_n
-from tensorflow.python.ops.ragged.ragged_math_ops import segment_sum
-
-from tensorflow.python.ops.ragged.ragged_tensor import convert_to_tensor_or_ragged_tensor
-from tensorflow.python.ops.ragged.ragged_tensor import is_ragged
-from tensorflow.python.ops.ragged.ragged_tensor import RaggedTensor
-from tensorflow.python.ops.ragged.ragged_tensor import RaggedTensorType
-
-from tensorflow.python.ops.ragged.ragged_tensor_shape import broadcast_dynamic_shape
-from tensorflow.python.ops.ragged.ragged_tensor_shape import broadcast_to
-from tensorflow.python.ops.ragged.ragged_tensor_shape import RaggedTensorDynamicShape
-
-from tensorflow.python.ops.ragged.ragged_tensor_value import RaggedTensorValue
-
-from tensorflow.python.ops.ragged.segment_id_ops import row_splits_to_segment_ids
-from tensorflow.python.ops.ragged.segment_id_ops import segment_ids_to_row_splits
-
-from tensorflow.python.util import all_util as _all_util
-
-
-# Register OpDispatchers that override standard TF ops to work w/ RaggedTensors.
-__doc__ += ragged_dispatch.register_dispatchers()  # pylint: disable=redefined-builtin
-
-# Any symbol that is not referenced (with "@@name") in the module docstring
-# above will be removed.
-_all_util.remove_undocumented(__name__)
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_tensor_shape
+from tensorflow.python.ops.ragged import ragged_tensor_value
+from tensorflow.python.ops.ragged import ragged_where_op
+from tensorflow.python.ops.ragged import segment_id_ops
+
+# Add a list of the ops that support Ragged Tensors.
+__doc__ += ragged_dispatch.ragged_op_list()  # pylint: disable=redefined-builtin
diff --git a/tensorflow/python/ops/ragged/convert_to_tensor_or_ragged_tensor_op_test.py b/tensorflow/python/ops/ragged/convert_to_tensor_or_ragged_tensor_op_test.py
index b88f18c8b61a2fbc33aeca1f799c8e518cac4bf6..be1ccd9c727d18cd00445f442583d92dad7a8f73 100644
--- a/tensorflow/python/ops/ragged/convert_to_tensor_or_ragged_tensor_op_test.py
+++ b/tensorflow/python/ops/ragged/convert_to_tensor_or_ragged_tensor_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for ragged.convert_to_tensor_or_ragged."""
+"""Tests for ragged_tensor.convert_to_tensor_or_ragged."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -24,7 +24,8 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
@@ -42,8 +43,8 @@ class RaggedConvertToTensorOrRaggedTensorTest(
       dict(pylist=[[1, 2], [3]], preferred_dtype=dtypes.string),
   ])
   def testConvertRaggedTensor(self, pylist, dtype=None, preferred_dtype=None):
-    rt = ragged.constant(pylist)
-    converted = ragged.convert_to_tensor_or_ragged_tensor(
+    rt = ragged_factory_ops.constant(pylist)
+    converted = ragged_tensor.convert_to_tensor_or_ragged_tensor(
         rt, dtype, preferred_dtype)
     self.assertIs(converted, rt)
 
@@ -64,34 +65,40 @@ class RaggedConvertToTensorOrRaggedTensorTest(
                                    message,
                                    dtype=None,
                                    preferred_dtype=None):
-    rt = ragged.constant(pylist)
+    rt = ragged_factory_ops.constant(pylist)
 
     with self.assertRaisesRegexp(ValueError, message):
-      ragged.convert_to_tensor_or_ragged_tensor(rt, dtype, preferred_dtype)
+      ragged_tensor.convert_to_tensor_or_ragged_tensor(rt, dtype,
+                                                       preferred_dtype)
 
   #=============================================================================
   # Tests where the 'value' param is a RaggedTensorValue
   #=============================================================================
-  @parameterized.parameters([
-      dict(
-          value=ragged.constant_value([[1, 2], [3]], dtype=np.int32),
-          expected_dtype=dtypes.int32),
-      dict(
-          value=ragged.constant_value([[b'a', b'b'], [b'c']]),
-          expected_dtype=dtypes.string),
-      dict(
-          value=ragged.constant_value([[1, 2], [3]], dtype=np.int32),
-          dtype=dtypes.float32,
-          expected_dtype=dtypes.float32),
-      dict(
-          value=ragged.constant_value([[1, 2], [3]], dtype=np.int32),
-          preferred_dtype=dtypes.float32,
-          expected_dtype=dtypes.float32),
-      dict(
-          value=ragged.constant_value([[1, 2], [3]], dtype=np.int32),
-          preferred_dtype=dtypes.string,
-          expected_dtype=dtypes.int32),
-  ])
+  @parameterized.parameters(
+      [
+          dict(
+              value=ragged_factory_ops.constant_value([[1, 2], [3]],
+                                                      dtype=np.int32),
+              expected_dtype=dtypes.int32),
+          dict(
+              value=ragged_factory_ops.constant_value([[b'a', b'b'], [b'c']]),
+              expected_dtype=dtypes.string),
+          dict(
+              value=ragged_factory_ops.constant_value([[1, 2], [3]],
+                                                      dtype=np.int32),
+              dtype=dtypes.float32,
+              expected_dtype=dtypes.float32),
+          dict(
+              value=ragged_factory_ops.constant_value([[1, 2], [3]],
+                                                      dtype=np.int32),
+              preferred_dtype=dtypes.float32,
+              expected_dtype=dtypes.float32),
+          dict(
+              value=ragged_factory_ops.constant_value([[1, 2], [3]],
+                                                      dtype=np.int32),
+              preferred_dtype=dtypes.string,
+              expected_dtype=dtypes.int32),
+      ])
   def testConvertRaggedTensorValue(self,
                                    value,
                                    dtype=None,
@@ -99,7 +106,7 @@ class RaggedConvertToTensorOrRaggedTensorTest(
                                    expected_dtype=None):
     if expected_dtype is None:
       expected_dtype = value.dtype if dtype is None else dtype
-    converted = ragged.convert_to_tensor_or_ragged_tensor(
+    converted = ragged_tensor.convert_to_tensor_or_ragged_tensor(
         value, dtype, preferred_dtype)
     self.assertEqual(value.ragged_rank, converted.ragged_rank)
     self.assertEqual(dtypes.as_dtype(expected_dtype), converted.dtype)
@@ -107,7 +114,8 @@ class RaggedConvertToTensorOrRaggedTensorTest(
 
   @parameterized.parameters([
       dict(
-          value=ragged.constant_value([['a', 'b'], ['c']], dtype=str),
+          value=ragged_factory_ops.constant_value([['a', 'b'], ['c']],
+                                                  dtype=str),
           dtype=dtypes.int32,
           message=r"invalid literal for int\(\) with base 10: 'a'"),
   ])
@@ -117,7 +125,8 @@ class RaggedConvertToTensorOrRaggedTensorTest(
                                         dtype=None,
                                         preferred_dtype=None):
     with self.assertRaisesRegexp(ValueError, message):
-      ragged.convert_to_tensor_or_ragged_tensor(value, dtype, preferred_dtype)
+      ragged_tensor.convert_to_tensor_or_ragged_tensor(value, dtype,
+                                                       preferred_dtype)
 
   #=============================================================================
   # Tests where the 'value' param is a Tensor
@@ -129,7 +138,7 @@ class RaggedConvertToTensorOrRaggedTensorTest(
   ])
   def testConvertTensor(self, pylist, dtype=None, preferred_dtype=None):
     tensor = constant_op.constant(pylist)
-    converted = ragged.convert_to_tensor_or_ragged_tensor(
+    converted = ragged_tensor.convert_to_tensor_or_ragged_tensor(
         tensor, dtype, preferred_dtype)
     self.assertIs(tensor, converted)
 
@@ -152,7 +161,8 @@ class RaggedConvertToTensorOrRaggedTensorTest(
                              preferred_dtype=None):
     tensor = constant_op.constant(pylist)
     with self.assertRaisesRegexp(ValueError, message):
-      ragged.convert_to_tensor_or_ragged_tensor(tensor, dtype, preferred_dtype)
+      ragged_tensor.convert_to_tensor_or_ragged_tensor(tensor, dtype,
+                                                       preferred_dtype)
 
   #=============================================================================
   # Tests where the 'value' param is a np.array
@@ -184,7 +194,7 @@ class RaggedConvertToTensorOrRaggedTensorTest(
                             expected_dtype=None):
     if expected_dtype is None:
       expected_dtype = value.dtype if dtype is None else dtype
-    converted = ragged.convert_to_tensor_or_ragged_tensor(
+    converted = ragged_tensor.convert_to_tensor_or_ragged_tensor(
         value, dtype, preferred_dtype)
     self.assertEqual(dtypes.as_dtype(expected_dtype), converted.dtype)
     self.assertAllEqual(value, converted)
@@ -201,7 +211,8 @@ class RaggedConvertToTensorOrRaggedTensorTest(
                                  dtype=None,
                                  preferred_dtype=None):
     with self.assertRaisesRegexp(ValueError, message):
-      ragged.convert_to_tensor_or_ragged_tensor(value, dtype, preferred_dtype)
+      ragged_tensor.convert_to_tensor_or_ragged_tensor(value, dtype,
+                                                       preferred_dtype)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_array_ops.py b/tensorflow/python/ops/ragged/ragged_array_ops.py
index b5917bc4ee6f6f5fb1d46f3e75cbdb66ef156bad..8c62cc4a7286c13d9c6aaa0da2e5a70d2abf1d32 100644
--- a/tensorflow/python/ops/ragged/ragged_array_ops.py
+++ b/tensorflow/python/ops/ragged/ragged_array_ops.py
@@ -20,11 +20,8 @@ from __future__ import print_function
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import gen_ragged_array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.ragged import ragged_conversion_ops
 from tensorflow.python.ops.ragged import ragged_functional_ops
@@ -34,322 +31,6 @@ from tensorflow.python.ops.ragged import ragged_util
 from tensorflow.python.ops.ragged import segment_id_ops
 
 
-#===============================================================================
-# ragged_gather
-#===============================================================================
-# TODO(edloper): Add an `axis` argument
-def gather(params, indices, validate_indices=None, axis=0, name=None):
-  """Gathers ragged slices from `params` axis `0` according to `indices`.
-
-  Returns `RaggedTensor` output, such that:
-
-  ```python
-  output.shape = indices.shape + params.shape[1:]
-  output.ragged_rank = indices.shape.ndims + params.ragged_rank
-  output[i...j, d0...dn] = params[indices[i...j], d0...dn]
-  ```
-
-  `params` may be ragged.  `indices` may be ragged.
-  `indices` must have dtype `int32` or `int64`. If any index is out of bounds,
-  then an error is returned.
-
-  Examples:
-
-  ```python
-  >>> params = tf.constant(['a', 'b', 'c', 'd', 'e'])
-  >>> indices = tf.constant([3, 1, 2, 1, 0])
-  >>> ragged_params = ragged.constant([['a', 'b', 'c'], ['d'], [], ['e']])
-  >>> ragged_indices = ragged.constant([[3, 1, 2], [1], [], [0]])
-
-  >>> print ragged.gather(params, ragged_indices)
-  [['d', 'b', 'c'], ['b'], [], ['a']]
-
-  >>> print ragged.gather(ragged_params, indices)
-  [['e'], ['d'], [], ['d'], ['a', 'b', 'c']]
-
-  >>> print ragged.gather(ragged_params, ragged_indices)
-  [[['e'], ['d'], []], [['d']], [], [['a', 'b', 'c']]]
-  ```
-
-  Args:
-    params: The potentially ragged tensor from which to gather values. Must be
-      at least rank 1.
-    indices: The potentially ragged tensor indicating which values to gather.
-      Must have dtype `int32` or `int64`.  Values must be in the range `[0,
-      params.shape[0]]`.
-    validate_indices: Ignored.
-    axis: Must be zero.
-    name: A name for the operation (optional).
-
-  Returns:
-    A `RaggedTensor`, where `output.dtype=params.dtype` and
-    `output.shape=indices.shape + params.shape[1:]` and
-    `output.ragged_rank=indices.shape.ndims + params.ragged_rank`.
-
-  Raises:
-    ValueError: If indices.shape.ndims is not known statically.
-  """
-  del validate_indices
-  if not isinstance(axis, int) or axis != 0:
-    raise ValueError('axis>0 is not supported for ragged gather yet.')
-  with ops.name_scope(name, 'RaggedGather', [params, indices]):
-    params = ragged_tensor.convert_to_tensor_or_ragged_tensor(
-        params, name='params')
-    indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
-        indices, name='indices')
-
-    if ragged_tensor.is_ragged(indices):
-      return indices.with_values(gather(params, indices.values))
-
-    if not ragged_tensor.is_ragged(params):
-      return array_ops.gather(params, indices)
-
-    indices = ops.convert_to_tensor(indices)
-    if indices.shape.ndims is None:
-      raise ValueError('indices.shape.ndims must be known statically')
-
-    result = gen_ragged_array_ops.ragged_gather(
-        indices=indices,
-        params_dense_values=params.flat_values,
-        params_nested_splits=params.nested_row_splits,
-        OUTPUT_RAGGED_RANK=indices.shape.ndims + len(params.nested_row_splits) -
-        1)
-
-    # Compose the RaggedTensor from splits & values.
-    return ragged_tensor.RaggedTensor.from_nested_row_splits(
-        result.output_dense_values, result.output_nested_splits)
-
-
-#===============================================================================
-# ragged.batch_gather
-#===============================================================================
-def batch_gather(params, indices, name=None):
-  """Gathers slices from `params` according to `indices` with batch dims.
-
-  This operation is similar to `gather`, but it assumes that the leading `N`
-  dimensions of `indices` and `params` are batch dimensions, and performs a
-  gather within each batch.  In particular, when using this operation with `N`
-  batch dimensions `B1...BN`:
-
-  * `indices` has shape `[B1...BN, I]`
-  * `params` has shape `[B1...BN, P1...PM]`.
-  * `result` has shape `[B1...BN, I, P2...PM]`.
-  * `result[b1...bN, i, p2...pM] =
-    params[b1...bN, indices[b1...bN, i], p2...pM]`
-
-  Args:
-    params: A potentially ragged tensor with shape `[B1...BN, P1...PM]` (`N>=0`,
-      `M>0`).
-    indices: A potentially ragged tensor with shape `[B1...BN, I]` (`N>=0`).
-    name: A name for the operation (optional).
-
-  Returns:
-    A potentially ragged tensor with shape `[B1...BN, I, P2...PM]`.
-    `result.ragged_rank = max(indices.ragged_rank, params.ragged_rank)`.
-
-  #### Example:
-    ```python
-    >>> params = ragged.constant([['a', 'b', 'c'], ['d'], [], ['e']])
-    >>> indices = ragged.constant([[1, 2, 0], [], [], [0, 0]])
-    >>> ragged.batch_gather(params, indices)
-    [['b', 'c', 'a'], [], [], ['e', 'e']]
-    ```
-  """
-  if not (ragged_tensor.is_ragged(params) or ragged_tensor.is_ragged(indices)):
-    return array_ops.batch_gather(params, indices, name)
-
-  with ops.name_scope(name, 'RaggedBatchGather', [params, indices]):
-    params = ragged_tensor.convert_to_tensor_or_ragged_tensor(
-        params, name='params')
-    indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
-        indices, name='indices')
-    indices_ndims = indices.shape.ndims
-    if indices_ndims is None:
-      raise ValueError(
-          'batch_gather does not allow indices with unknown shape.')
-    if indices_ndims == 0:
-      raise ValueError('indices.rank must be at least 1.')
-
-    if ragged_tensor.is_ragged(indices):
-      # If the outermost ragged dimension is a batch dimension, recurse.
-      if indices_ndims > 2:
-        if not ragged_tensor.is_ragged(params):
-          raise ValueError('batch shape from indices does '
-                           'not match params shape')
-        checks = [check_ops.assert_equal(params.row_splits, indices.row_splits)]
-        with ops.control_dependencies(checks):
-          return ragged_tensor.RaggedTensor.from_row_splits(
-              batch_gather(params.values, indices.values), indices.row_splits)
-
-      # Otherwise, indices is a 2D ragged tensor with 1 ragged dimension.
-      else:
-        # Ensure that `params` is ragged and has at least 2 dimensions.
-        if not ragged_tensor.is_ragged(params):
-          if params.shape.ndims is not None and params.shape.ndims < 2:
-            raise ValueError('batch shape from indices does '
-                             'not match params shape')
-          params = ragged_conversion_ops.from_tensor(params, ragged_rank=1)
-
-        # Adjust indices from within-batch to global (in params.values), and
-        # then use ragged.gather to gather them.
-        num_indices = indices.row_lengths()
-        params_starts = params.row_starts()
-        adjustments = ragged_util.repeat(params_starts, num_indices, axis=0)
-        adjusted_index_values = math_ops.to_int64(indices.values) + adjustments
-        return ragged_tensor.RaggedTensor.from_row_splits(
-            gather(params.values, adjusted_index_values), indices.row_splits)
-
-    else:  # params is a RaggedTensor and indices is a Tensor.
-      if indices_ndims == 1:
-        return gather(params, indices)
-      elif indices_ndims == 2:
-        # Adjust indices from batch-local to global (in params.values)
-        adjustments = array_ops.expand_dims(params.row_starts(), 1)
-        adjusted_indices = math_ops.to_int64(indices) + adjustments
-        return gather(params.values, adjusted_indices)
-      else:
-        raise ValueError('batch shape from indices does not match params shape')
-
-
-#===============================================================================
-# ragged.gather_nd
-#===============================================================================
-def gather_nd(params, indices, name=None):
-  """Gather slices from `params` using `n`-dimensional indices.
-
-  This operation is similar to `gather`, but it uses the innermost dimension
-  of `indices` to define a slice into `params`.  In particular, if:
-
-  * `indices` has shape `[A1...AN, I]`
-  * `params` has shape `[B1...BM]`
-
-  Then:
-
-  * `result` has shape `[A1...AN, B_{I+1}...BM]`.
-  * `result[a1...aN] = params[indices[a1...aN, :]]`
-
-  Args:
-    params: A potentially ragged tensor with shape `[A1...AN, I]`.
-    indices: A potentially ragged tensor with shape `[B1...BM]`.
-    name: A name for the operation (optional).
-
-  Returns:
-    A potentially ragged tensor with shape `[A1...AN, B_{I+1}...BM]`.
-
-  #### Examples:
-    ```python
-    >>> params = tf.ragged.constant_value(
-    ...     [ [ ['000', '001'], ['010'              ]          ],
-    ...       [ ['100'       ], ['110', '111', '112'], ['120'] ],
-    ...       [ [            ], ['210'              ]          ] ])
-
-    >>> # Gather 2D slices from a 3D tensor
-    >>> ragged.gather_nd(params, [[2], [0]])
-    [ [ [            ], ['210'] ]
-      [ ['000', '001'], ['010'] ] ]
-
-    >>> # Gather 1D slices from a 3D tensor
-    >>> ragged.gather_nd(params, [[2, 1], [0, 0]])
-    [['210'], ['000', '001']]
-
-    >>> # Gather scalars from a 3D tensor
-    >>> ragged.gather_nd(params, [[0, 0, 1], [1, 1, 2]])
-    ['001', '112']
-    ```
-  """
-  if not (ragged_tensor.is_ragged(params) or ragged_tensor.is_ragged(indices)):
-    return array_ops.gather_nd(params, indices, name)
-
-  with ops.name_scope(name, 'RaggedGatherNd', [params, indices]):
-
-    params = ragged_tensor.convert_to_tensor_or_ragged_tensor(
-        params, name='params')
-    indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
-        indices, name='indices')
-    indices_shape = indices.shape
-    indices_ndims = indices_shape.ndims
-    if indices_ndims is None:
-      raise ValueError('indices.rank be statically known.')
-    if indices_ndims == 0:
-      raise ValueError('indices.rank must be at least 1.')
-    if (ragged_tensor.is_ragged(indices) and
-        indices_ndims == indices.ragged_rank + 1):
-      raise ValueError('The innermost dimension of indices may not be ragged')
-
-    # `index_size` is the "n" in "gather_nd" -- i.e., the number of dimensions
-    # that each index slices into.
-    index_size = tensor_shape.dimension_value(indices_shape[-1])
-    if index_size is None:
-      raise ValueError('indices.shape[-1] must be statically known.')
-
-    # If `indices` has more than 2 dimensions, then recurse.  If `indices` is
-    # dense, then we convert it to ragged before recursing, and then convert
-    # the result back to `dense` if appropriate.
-    if indices_ndims > 2:
-      indices_is_dense = not ragged_tensor.is_ragged(indices)
-      if indices_is_dense:
-        indices = ragged_conversion_ops.from_tensor(
-            indices, ragged_rank=indices_ndims - 2)
-      result = indices.with_flat_values(gather_nd(params, indices.flat_values))
-      if (indices_is_dense and ragged_tensor.is_ragged(result) and
-          result.ragged_rank == indices_ndims - 2):
-        result = ragged_conversion_ops.to_tensor(result)
-      return result
-
-    # indices_ndims <= 2, and the innermost dimension of indices may not be
-    # ragged, so `indices` must not be ragged.
-    assert not ragged_tensor.is_ragged(indices)
-    assert ragged_tensor.is_ragged(params)
-
-    # Handle corner case: An empty index tuple selects the entire `params`
-    # value.  So if `index_size` is zero, then tile `params`.
-    if index_size == 0:
-      params_ndims = params.ragged_rank + array_ops.rank(params.flat_values)
-      for dim in range(indices_ndims - 1):
-        params = expand_dims(params, axis=0)
-      multiples = array_ops.concat([
-          array_ops.shape(indices)[:-1],
-          array_ops.ones([params_ndims], dtypes.int32)
-      ],
-                                   axis=0)
-      return tile(params, multiples)
-
-    # When index_size=1, we can just flatten the index tuples and use gather.
-    elif index_size == 1:
-      flattened_index_tuples = array_ops.reshape(indices, [-1])
-      return gather(params, flattened_index_tuples)
-
-    # Otherwise, params is a RaggedTensor, and indices is a 1D or 2D Tensor.
-    # Flatten both the index tuples and the params, such that the flattened
-    # index tuples point to the correct values in the flattened params; and
-    # then use ragged.gather on the flattened index tuples & params.
-    else:
-      indices = math_ops.to_int64(indices)
-
-      # Flatten the outermost 2 dimensions of the index tuples & params.
-      flattened_index_tuples = array_ops.gather(params.row_splits,
-                                                indices[..., 0])
-      flattened_index_tuples += indices[..., 1]
-      flattened_params = params.values
-
-      # Flatten any remaining dimensions.
-      for dim in range(2, index_size):
-        if not ragged_tensor.is_ragged(flattened_params):
-          flattened_index_tuples = array_ops.expand_dims(
-              flattened_index_tuples, axis=1)
-          flattened_index_tuples = array_ops.concat(
-              [flattened_index_tuples, indices[..., dim:]], axis=1)
-          return array_ops.gather_nd(flattened_params, flattened_index_tuples)
-
-        flattened_index_tuples = array_ops.gather(
-            flattened_params.row_starts(), flattened_index_tuples)
-        flattened_index_tuples += indices[..., dim]
-        flattened_params = flattened_params.values
-
-      # Gather using the flattened index tuples and params.
-      return gather(flattened_params, flattened_index_tuples)
-
-
 #===============================================================================
 # Masking
 #===============================================================================
@@ -444,7 +125,7 @@ def boolean_mask(data, mask, keepdims=False, name=None):
 
     # Get static rank of mask.
     if mask.shape.ndims is None:
-      raise ValueError('mask.shape.ndims must be kown statically.')
+      raise ValueError('mask.shape.ndims must be known statically.')
     elif mask.shape.ndims == 0:
       raise ValueError('mask cannot be scalar.')
 
@@ -543,260 +224,6 @@ def boolean_mask(data, mask, keepdims=False, name=None):
       return masked_values
 
 
-#===============================================================================
-# Concatenation and Stacking
-#===============================================================================
-def concat(values, axis, name=None):
-  """Concatenates potentially ragged tensors along one dimension.
-
-  Given a list of tensors with the same rank `K` (`K >= axis`), returns a
-  rank-`K` `RaggedTensor` `result` such that `result[i0...iaxis]` is the
-  concatenation of `[rt[i0...iaxis] for rt in values]`.
-
-  Args:
-    values: A list of potentially ragged tensors.  May not be empty. All
-      `values` must have the same rank and the same dtype; but unlike
-      `tf.concat`, they can have arbitrary shapes.
-    axis: A python integer, indicating the dimension along which to concatenate.
-      (Note: Unlike `tf.concat`, the `axis` parameter must be statically known.)
-        Negative values are supported only if the rank of at least one
-        `values` value is statically known.
-    name: A name prefix for the returned tensor (optional).
-
-  Returns:
-    A `RaggedTensor` with rank `K`.
-    `result.ragged_rank=max(axis, max(rt.ragged_rank for rt in values]))`.
-
-  Raises:
-    ValueError: If `values` is empty, if `axis` is out of bounds or if
-      the input tensors have different ranks.
-
-  #### Example:
-    ```python
-    >>> t1 = ragged.constant([[1, 2], [3, 4, 5]])
-    >>> t2 = ragged.constant([[6], [7, 8, 9]])
-    >>> ragged.concat([t1, t2], axis=0)
-    [[1, 2], [3, 4, 5], [6], [7, 8, 9]]
-    >>> ragged.concat([t1, t2], axis=1)
-    [[1, 2, 6], [3, 4, 5, 7, 8, 9]]
-    ```
-  """
-  if not isinstance(values, (list, tuple)):
-    values = [values]
-  with ops.name_scope(name, 'RaggedConcat', values):
-    return _ragged_stack_concat_helper(values, axis, stack_values=False)
-
-
-def stack(values, axis, name=None):
-  """Stacks potentially ragged tensors along one dimension.
-
-  Given a list of tensors with the same rank `K` (`K >= axis`), returns a
-  rank-`K+1` `RaggedTensor` `result` such that `result[i0...iaxis]` is the
-  list `[rt[i0...iaxis] for rt in values]`.
-
-  Args:
-    values: A list of potentially ragged tensors.  May not be empty. All
-      `values` must have the same rank and the same dtype; but unlike
-      `tf.concat`, they can have arbitrary shapes.
-    axis: A python integer, indicating the dimension along which to stack.
-      (Note: Unlike `tf.stack`, the `axis` parameter must be statically known.)
-        Negative values are supported only if the rank of at least one
-        `values` value is statically known.
-    name: A name prefix for the returned tensor (optional).
-
-  Returns:
-    A `RaggedTensor` with rank `K+1`.
-    `result.ragged_rank=max(axis, max(rt.ragged_rank for rt in values]))`.
-
-  Raises:
-    ValueError: If `values` is empty, if `axis` is out of bounds or if
-      the input tensors have different ranks.
-
-  #### Example:
-    ```python
-    >>> t1 = ragged.constant([[1, 2], [3, 4, 5]])
-    >>> t2 = ragged.constant([[6], [7, 8, 9]])
-    >>> ragged.stack([t1, t2], axis=0)
-    [[[1, 2], [3, 4, 5]], [[6], [7, 9, 0]]]
-    >>> ragged.stack([t1, t2], axis=1)
-    [[[1, 2], [6]], [[3, 4, 5], [7, 8, 9]]]
-    ```
-  """
-  if not isinstance(values, (list, tuple)):
-    values = [values]
-  with ops.name_scope(name, 'RaggedConcat', values):
-    return _ragged_stack_concat_helper(values, axis, stack_values=True)
-
-
-def _ragged_stack_concat_helper(rt_inputs, axis, stack_values):
-  """Helper function to concatenate or stack ragged tensors.
-
-  Args:
-    rt_inputs: A list of RaggedTensors or Tensors to combine.
-    axis: The axis along which to concatenate or stack.
-    stack_values: A boolean -- if true, then stack values; otherwise,
-      concatenate them.
-
-  Returns:
-    A RaggedTensor.
-  Raises:
-    ValueError: If rt_inputs is empty, or if axis is out of range.
-  """
-  # Validate parameters.
-  if not rt_inputs:
-    raise ValueError('rt_inputs may not be empty.')
-
-  # Convert input tensors.
-  rt_inputs = [
-      ragged_tensor.convert_to_tensor_or_ragged_tensor(
-          rt_input, name='rt_input') for rt_input in rt_inputs
-  ]
-
-  # Special case: if there's only one input, then return it as-is.
-  if len(rt_inputs) == 1:
-    if stack_values:
-      return expand_dims(rt_inputs[0], axis=0)
-    else:
-      return rt_inputs[0]
-
-  # Check the rank (number of dimensions) of the input tensors.
-  ndims = None
-  for rt in rt_inputs:
-    if ndims is None:
-      ndims = rt.shape.ndims
-    else:
-      rt.shape.assert_has_rank(ndims)
-
-  out_ndims = ndims if (ndims is None or not stack_values) else ndims + 1
-  axis = ragged_util.get_positive_axis(axis, out_ndims)
-
-  # If all the inputs are Tensors, and we're combining the final dimension,
-  # then we can delegate to the tf.stack/tf.concat operation, and return a
-  # Tensor.
-  if all(not ragged_tensor.is_ragged(rt) for rt in rt_inputs):
-    if ndims is not None and (axis == out_ndims - 1 or axis == ndims - 1):
-      if stack_values:
-        return array_ops.stack(rt_inputs, axis)
-      else:
-        return array_ops.concat(rt_inputs, axis)
-
-  # Convert any Tensor inputs to RaggedTensors.  This makes it
-  # possible to concatenate Tensors and RaggedTensors together.
-  for i in range(len(rt_inputs)):
-    if not ragged_tensor.is_ragged(rt_inputs[i]):
-      rt_inputs[i] = ragged_conversion_ops.from_tensor(
-          rt_inputs[i], ragged_rank=1)
-
-  # Convert the input tensors to all have the same ragged_rank.
-  ragged_rank = max(max(rt.ragged_rank for rt in rt_inputs), 1)
-  rt_inputs = [_increase_ragged_rank_to(rt, ragged_rank) for rt in rt_inputs]
-
-  if axis == 0:
-    return _ragged_stack_concat_axis_0(rt_inputs, stack_values)
-  elif axis == 1:
-    return _ragged_stack_concat_axis_1(rt_inputs, stack_values)
-  else:  # axis > 1: recurse.
-    values = [rt.values for rt in rt_inputs]
-    splits = [[rt_input.row_splits] for rt_input in rt_inputs]
-    with ops.control_dependencies(ragged_util.assert_splits_match(splits)):
-      return ragged_tensor.RaggedTensor.from_row_splits(
-          _ragged_stack_concat_helper(values, axis - 1, stack_values),
-          splits[0][0])
-
-
-def _ragged_stack_concat_axis_0(rt_inputs, stack_values):
-  """Helper function to concatenate or stack ragged tensors along axis 0.
-
-  Args:
-    rt_inputs: A list of RaggedTensors, all with the same rank and ragged_rank.
-    stack_values: Boolean.  If true, then stack values; otherwise, concatenate
-      them.
-
-  Returns:
-    A RaggedTensor.
-  """
-  # Concatenate the inner values together.
-  flat_values = [rt.flat_values for rt in rt_inputs]
-  concatenated_flat_values = array_ops.concat(flat_values, axis=0)
-
-  # Concatenate the splits together for each ragged dimension (adjusting
-  # split offsets as necessary).
-  nested_splits = [rt.nested_row_splits for rt in rt_inputs]
-  ragged_rank = rt_inputs[0].ragged_rank
-  concatenated_nested_splits = [
-      _concat_ragged_splits([ns[dim]
-                             for ns in nested_splits])
-      for dim in range(ragged_rank)
-  ]
-
-  # If we are performing a stack operation, then add another splits.
-  if stack_values:
-    stack_lengths = array_ops.stack([_nrows(rt) for rt in rt_inputs])
-    stack_splits = ragged_util.lengths_to_splits(stack_lengths)
-    concatenated_nested_splits.insert(0, stack_splits)
-
-  return ragged_tensor.RaggedTensor.from_nested_row_splits(
-      concatenated_flat_values, concatenated_nested_splits)
-
-
-def _ragged_stack_concat_axis_1(rt_inputs, stack_values):
-  """Helper function to concatenate or stack ragged tensors along axis 1.
-
-  Args:
-    rt_inputs: A list of RaggedTensors, all with the same rank and ragged_rank.
-    stack_values: Boolean.  If true, then stack values; otherwise, concatenate
-      them.
-
-  Returns:
-    A RaggedTensor.
-  """
-  num_inputs = len(rt_inputs)
-
-  rt_nrows = _nrows(rt_inputs[0])
-  nrows_msg = 'Input tensors have incompatible shapes.'
-  nrows_checks = [
-      check_ops.assert_equal(_nrows(rt), rt_nrows, message=nrows_msg)
-      for rt in rt_inputs[1:]
-  ]
-
-  with ops.control_dependencies(nrows_checks):
-    # Concatentate the inputs together to put them in a single ragged tensor.
-    concatenated_rt = _ragged_stack_concat_axis_0(rt_inputs, stack_values=False)
-
-    # Use ragged.gather to permute the rows of concatenated_rt.  In particular,
-    #   permuted_rt = [rt_inputs[0][0], ..., rt_inputs[N][0],
-    #                  rt_inputs[0][1], ..., rt_inputs[N][1],
-    #                      ...,
-    #                  rt_inputs[0][M], ..., rt_input[N][M]]
-    # where `N=num_inputs-1` and `M=rt_nrows-1`.
-    row_indices = math_ops.range(rt_nrows * num_inputs)
-    row_index_matrix = array_ops.reshape(row_indices, [num_inputs, -1])
-    transposed_row_index_matrix = array_ops.transpose(row_index_matrix)
-    row_permutation = array_ops.reshape(transposed_row_index_matrix, [-1])
-    permuted_rt = gather(concatenated_rt, row_permutation)
-
-    if stack_values:
-      # Add a new splits tensor to group together the values.
-      stack_splits = math_ops.range(0, rt_nrows * num_inputs + 1, num_inputs)
-      _copy_row_shape(rt_inputs, stack_splits)
-      return ragged_tensor.RaggedTensor.from_row_splits(permuted_rt,
-                                                        stack_splits)
-    else:
-      # Merge together adjacent rows by dropping the row-split indices that
-      # separate them.
-      concat_splits = permuted_rt.row_splits[::num_inputs]
-      _copy_row_shape(rt_inputs, concat_splits)
-      return ragged_tensor.RaggedTensor.from_row_splits(permuted_rt.values,
-                                                        concat_splits)
-
-
-def _copy_row_shape(rt_inputs, splits):
-  """Sets splits.shape to [rt[shape[0]+1] for each rt in rt_inputs."""
-  for rt in rt_inputs:
-    if rt.shape[0] is not None:
-      splits.set_shape(tensor_shape.TensorShape(rt.shape[0] + 1))
-
-
 #===============================================================================
 # Tiling
 #===============================================================================
@@ -819,7 +246,7 @@ def tile(input, multiples, name=None):  # pylint: disable=redefined-builtin
 
   #### Example:
     ```python
-    >>> rt = ragged.constant([[1, 2], [3]])
+    >>> rt = tf.ragged.constant([[1, 2], [3]])
     >>> ragged.tile(rt, [3, 2])
     [[1, 2, 1, 2], [3, 3], [1, 2, 1, 2], [3, 3], [1, 2, 1, 2], [3, 3]]
     ```
@@ -862,7 +289,7 @@ def _tile_ragged_values(rt_input, multiples, const_multiples=None):
 
   #### Example:
     ```python
-    >>> rt = ragged.constant([[1, 2], [3]])
+    >>> rt = tf.ragged.constant([[1, 2], [3]])
     >>> _tile_ragged_values(rt, [3, 2])
     [1, 2, 1, 2, 3, 3, 1, 2, 1, 2, 3, 3, 1, 2, 1, 2, 3, 3]
     ```
@@ -921,7 +348,7 @@ def _tile_ragged_splits(rt_input, multiples, const_multiples=None):
 
   #### Example:
     ```python
-    >>> rt = ragged.constant([[1, 2], [3]])
+    >>> rt = tf.ragged.constant([[1, 2], [3]])
     >>> _tile_ragged_splits(rt, [3, 2])
     [0, 4, 6, 10, 12, 16, 18]
     ```
@@ -1018,7 +445,7 @@ def expand_dims(input, axis, name=None):  # pylint: disable=redefined-builtin
 
   #### Examples:
     ```python
-    >>> rt = ragged.constant([[1, 2], [3]])
+    >>> rt = tf.ragged.constant([[1, 2], [3]])
     >>> print rt.shape
     TensorShape([2, None])
 
@@ -1058,134 +485,33 @@ def expand_dims(input, axis, name=None):  # pylint: disable=redefined-builtin
 
 
 #===============================================================================
-# ragged.where
+# RaggedTensor Size
 #===============================================================================
-def where(condition, x=None, y=None, name=None):
-  """Return the elements, either from `x` or `y`, depending on the `condition`.
-
-  : If both `x` and `y` are `None`:
-    Returns the coordinates of true elements of `condition`. The coordinates
-    are returned in a 2-D tensor with shape
-    `[num_true_values, dim_size(condition)]`, where `result[i]` is the
-    coordinates of the `i`th true value (in row-major order).
-
-  : If both `x` and `y` are non-`None`:
-    Returns a tensor formed by selecting values from `x` where condition is
-    true, and from `y` when condition is false.  In particular:
 
-    : If `condition`, `x`, and `y` all have the same shape:
 
-      * `result[i1...iN] = x[i1...iN]` if `condition[i1...iN]` is true.
-      * `result[i1...iN] = y[i1...iN]` if `condition[i1...iN]` is false.
+def size(input, out_type=dtypes.int32, name=None):  # pylint: disable=redefined-builtin
+  """Returns the size of a potentially ragged tensor.
 
-    : Otherwise:
-
-      * `condition` must be a vector.
-      * `x` and `y` must have the same number of dimensions.
-      * The outermost dimensions of `condition`, `x`, and `y` must all have the
-        same size.
-      * `result[i] = x[i]` if `condition[i]` is true.
-      * `result[i] = y[i]` if `condition[i]` is false.
+  The size of a ragged tensor is the size of its inner values.
 
   Args:
-    condition: A potentially ragged tensor of type `bool`
-    x: A potentially ragged tensor (optional).
-    y: A potentially ragged tensor (optional).  Must be specified if `x` is
-      specified.  Must have the same rank and type as `x`.
-    name: A name of the operation (optional)
+    input: A potentially ragged `Tensor`.
+    out_type: The numeric output type for the operation.
+    name: A name for the operation (optional).
 
   Returns:
-    : If both `x` and `y` are `None`:
-      A `Tensor` with shape `(num_true, dim_size(condition))`.
-    : Otherwise:
-      A potentially ragged tensor with the same type, rank, and outermost
-      dimension size as `x` and `y`.
-      `result.ragged_rank = max(x.ragged_rank, y.ragged_rank)`.
+    A Tensor of type `out_type`.
 
-  Raises:
-    ValueError: When exactly one of `x` or `y` is non-`None`; or when
-      `condition`, `x`, and `y` have incompatible shapes.
-
-  #### Examples:
+  #### Example:
     ```python
-    >>> # Coordinates where condition is true.
-    >>> condition = ragged.constant_value([[True, False, True], [False, True]])
-    >>> ragged.where(condition)
-    [[0, 0], [0, 2], [1, 1]]
-
-    >>> # Elementwise selection between x and y, based on condition.
-    >>> condition = ragged.constant_value([[True, False, True], [False, True]])
-    >>> x=ragged.constant_value([['A', 'B', 'C'], ['D', 'E']])
-    >>> y=ragged.constant_value([['a', 'b', 'c'], ['d', 'e']])
-    >>> ragged.where(condition, x, y)
-    [['A', 'b', 'C'], ['d', 'E']]
-
-    >>> # Row selection between x and y, based on condition.
-    >>> condition = [True, False]
-    >>> x=ragged.constant_value([['A', 'B', 'C'], ['D', 'E']])
-    >>> y=ragged.constant_value([['a', 'b', 'c'], ['d', 'e']])
-    >>> ragged.where(condition, x, y)
-    [['A', 'B', 'C'], ['d', 'e']]
+    >>> tf.size(tf.ragged.constant([[1, 2], [3]]))
+    3
     ```
   """
-  if (x is None) != (y is None):
-    raise ValueError('x and y must be either both None or both non-None')
-  with ops.name_scope('RaggedWhere', name, [condition, x, y]):
-    condition = ragged_tensor.convert_to_tensor_or_ragged_tensor(
-        condition, name='condition')
-    if x is None:
-      return _coordinate_where(condition)
-    else:
-      x = ragged_tensor.convert_to_tensor_or_ragged_tensor(x, name='x')
-      y = ragged_tensor.convert_to_tensor_or_ragged_tensor(y, name='y')
-      return _elementwise_where(condition, x, y)
-
-
-def _elementwise_where(condition, x, y):
-  """Ragged version of tf.where(condition, x, y)."""
-  condition_is_ragged = isinstance(condition, ragged_tensor.RaggedTensor)
-  x_is_ragged = isinstance(x, ragged_tensor.RaggedTensor)
-  y_is_ragged = isinstance(y, ragged_tensor.RaggedTensor)
-
-  if not (condition_is_ragged or x_is_ragged or y_is_ragged):
-    return array_ops.where(condition, x, y)
-
-  elif condition_is_ragged and x_is_ragged and y_is_ragged:
-    return ragged_functional_ops.map_flat_values(array_ops.where, condition, x,
-                                                 y)
-  elif not condition_is_ragged:
-    # Concatenate x and y, and then use `gather` to assemble the selected rows.
-    condition.shape.assert_has_rank(1)
-    x_nrows = _nrows(x)
-    x_and_y = concat([x, y], axis=0)
-    indices = array_ops.where(condition, math_ops.range(x_nrows),
-                              x_nrows + math_ops.range(_nrows(y)))
-    return gather(x_and_y, indices)
-
+  if ragged_tensor.is_ragged(input):
+    return array_ops.size(input.flat_values, out_type=out_type, name=name)
   else:
-    raise ValueError('Input shapes do not match.')
-
-
-def _coordinate_where(condition):
-  """Ragged version of tf.where(condition)."""
-  if not isinstance(condition, ragged_tensor.RaggedTensor):
-    return array_ops.where(condition)
-
-  # The coordinate for each `true` value in condition.values.
-  selected_coords = _coordinate_where(condition.values)
-
-  # Convert the first index in each coordinate to a row index and column index.
-  first_index = selected_coords[:, 0]
-  selected_rows = array_ops.gather(condition.value_rowids(), first_index)
-  selected_row_starts = array_ops.gather(condition.row_splits, selected_rows)
-  selected_cols = first_index - selected_row_starts
-
-  # Assemble the row & column index with the indices for inner dimensions.
-  return array_ops.concat([
-      array_ops.expand_dims(selected_rows, 1),
-      array_ops.expand_dims(selected_cols, 1), selected_coords[:, 1:]
-  ],
-                          axis=1)
+    return array_ops.size(input, out_type=out_type, name=name)
 
 
 #===============================================================================
@@ -1221,3 +547,32 @@ def _nrows(rt_input, out_type=dtypes.int64, name=None):
     with ops.name_scope(name, 'RaggedNRows', [rt_input]):
       return array_ops.shape(rt_input, out_type=out_type)[0]
 
+
+#===============================================================================
+# ragged.rank
+#===============================================================================
+def rank(input, name=None):  # pylint: disable=redefined-builtin
+  """Returns the rank of a RaggedTensor.
+
+  Returns a 0-D `int32` `Tensor` representing the rank of `input`.
+
+  For example:
+
+  ```python
+  # shape of tensor 't' is [2, None, None]
+  t = tf.ragged.constant([[[1], [2, 2]], [[3, 3, 3], [4, 4, 4, 4]]])
+  tf.rank(t)  # 3
+  ```
+
+  Args:
+    input: A `RaggedTensor`
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` of type `int32`.
+  """
+  with ops.name_scope(name, 'RaggedRank', [input]) as name:
+    if not ragged_tensor.is_ragged(input):
+      return array_ops.rank(input, name)
+
+    return input.ragged_rank + array_ops.rank(input.flat_values)
diff --git a/tensorflow/python/ops/ragged/ragged_batch_gather_op_test.py b/tensorflow/python/ops/ragged/ragged_batch_gather_op_test.py
index 79f1ae591f9f2c9dfcf5b405b1c4d7370ab853a6..17c55eb810ab3b9718a5d5f94af3dba67fb673e8 100644
--- a/tensorflow/python/ops/ragged/ragged_batch_gather_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_batch_gather_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for tf.ragged.batch_gather."""
+"""Tests for ragged_batch_gather_ops.batch_gather."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -21,11 +21,15 @@ from __future__ import print_function
 from absl.testing import parameterized
 
 from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_batch_gather_ops
+from tensorflow.python.ops.ragged import ragged_batch_gather_with_default_op
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
@@ -40,10 +44,12 @@ class RaggedBatchGatherOpTest(ragged_test_util.RaggedTensorTestCase,
       #=========================================================================
       dict(
           descr='Docstring example',
-          params=ragged.constant_value([['a', 'b', 'c'], ['d'], [], ['e']]),
-          indices=ragged.constant_value([[1, 2, 0], [], [], [0, 0]]),
-          expected=ragged.constant_value([[b'b', b'c', b'a'], [], [],
-                                          [b'e', b'e']])),
+          params=ragged_factory_ops.constant_value([['a', 'b', 'c'], ['d'], [],
+                                                    ['e']]),
+          indices=ragged_factory_ops.constant_value([[1, 2, 0], [], [], [0,
+                                                                         0]]),
+          expected=ragged_factory_ops.constant_value([[b'b', b'c', b'a'], [],
+                                                      [], [b'e', b'e']])),
       #=========================================================================
       # 0 Batch Dimensions
       #=========================================================================
@@ -54,9 +60,10 @@ class RaggedBatchGatherOpTest(ragged_test_util.RaggedTensorTestCase,
           expected=[b'd', b'c']),
       dict(
           descr='params: [P1, (P2)], indices: [I], result: [I, (P2)]',
-          params=ragged.constant_value([['a', 'b'], [], ['c'], ['d', 'e']]),
+          params=ragged_factory_ops.constant_value([['a', 'b'], [], ['c'],
+                                                    ['d', 'e']]),
           indices=[3, 2],
-          expected=ragged.constant_value([[b'd', b'e'], [b'c']])),
+          expected=ragged_factory_ops.constant_value([[b'd', b'e'], [b'c']])),
       #=========================================================================
       # 1 Batch Dimension
       #=========================================================================
@@ -67,22 +74,24 @@ class RaggedBatchGatherOpTest(ragged_test_util.RaggedTensorTestCase,
           expected=[[b'c', b'a'], [b'd', b'e'], [b'h', b'g']]),
       dict(
           descr='params: [B1, (P1)], indices: [B1, I], result: [B1, I]',
-          params=ragged.constant_value([['a', 'b', 'c'], ['d', 'e'], ['g']]),
+          params=ragged_factory_ops.constant_value([['a', 'b', 'c'], ['d', 'e'],
+                                                    ['g']]),
           indices=[[2, 0], [0, 1], [0, 0]],
           expected=[[b'c', b'a'], [b'd', b'e'], [b'g', b'g']]),
       dict(
           descr='params: [B1, P1], indices: [B1, (I)], result: [B1, (I)]',
           params=[['a', 'b', 'c'], ['d', 'e', 'f'], ['g', 'h', 'i']],
-          indices=ragged.constant_value([[2, 0, 2], [0], [1]]),
-          expected=ragged.constant_value([[b'c', b'a', b'c'], [b'd'], [b'h']])),
+          indices=ragged_factory_ops.constant_value([[2, 0, 2], [0], [1]]),
+          expected=ragged_factory_ops.constant_value([[b'c', b'a', b'c'],
+                                                      [b'd'], [b'h']])),
       dict(
           descr=('params: [B1, (P1), (P2), P3], indices: [B1, I], '
                  'result: [B1, I, (P2), P3]'),
-          params=ragged.constant_value(
+          params=ragged_factory_ops.constant_value(
               [[[['a']], [['b'], ['c']]], [[['d'], ['e']], [['f']]], [[['g']]]],
               ragged_rank=2),
           indices=[[1, 0], [0, 1], [0, 0]],
-          expected=ragged.constant_value(
+          expected=ragged_factory_ops.constant_value(
               [[[[b'b'], [b'c']], [[b'a']]], [[[b'd'], [b'e']], [[b'f']]],
                [[[b'g']], [[b'g']]]],
               ragged_rank=2)),
@@ -98,31 +107,31 @@ class RaggedBatchGatherOpTest(ragged_test_util.RaggedTensorTestCase,
       dict(
           descr=('params: [B1, (B2), P1], indices: [B1, (B2), I], '
                  'result: [B1, (B2), I]'),
-          params=ragged.constant_value(
+          params=ragged_factory_ops.constant_value(
               [[['a', 'b', 'c'], ['d', 'e', 'f']], [['g', 'h', 'i']]],
               ragged_rank=1),
-          indices=ragged.constant_value([[[2, 0], [0, 1]], [[1, 0]]],
-                                        ragged_rank=1),
-          expected=ragged.constant_value(
+          indices=ragged_factory_ops.constant_value(
+              [[[2, 0], [0, 1]], [[1, 0]]], ragged_rank=1),
+          expected=ragged_factory_ops.constant_value(
               [[[b'c', b'a'], [b'd', b'e']], [[b'h', b'g']]], ragged_rank=1)),
       dict(
           descr=('params: [B1, (B2), (P1)], indices: [B1, (B2), I], '
                  'result: [B1, (B2), I]'),
-          params=ragged.constant_value([[['a', 'b', 'c'], ['d']], [['e', 'f']]],
-                                       ragged_rank=2),
-          indices=ragged.constant_value([[[2, 0], [0, 0]], [[1, 0]]],
-                                        ragged_rank=1),
-          expected=ragged.constant_value(
+          params=ragged_factory_ops.constant_value(
+              [[['a', 'b', 'c'], ['d']], [['e', 'f']]], ragged_rank=2),
+          indices=ragged_factory_ops.constant_value(
+              [[[2, 0], [0, 0]], [[1, 0]]], ragged_rank=1),
+          expected=ragged_factory_ops.constant_value(
               [[[b'c', b'a'], [b'd', b'd']], [[b'f', b'e']]], ragged_rank=1)),
       dict(
           descr=('params: [B1, (B2), P1], indices: [B1, (B2), (I)], '
                  'result: [B1, (B2), (I)]'),
-          params=ragged.constant_value(
+          params=ragged_factory_ops.constant_value(
               [[['a', 'b', 'c'], ['d', 'e', 'f']], [['g', 'h', 'i']]],
               ragged_rank=1),
-          indices=ragged.constant_value([[[2, 1, 0], [0]], [[1, 1]]],
-                                        ragged_rank=2),
-          expected=ragged.constant_value(
+          indices=ragged_factory_ops.constant_value(
+              [[[2, 1, 0], [0]], [[1, 1]]], ragged_rank=2),
+          expected=ragged_factory_ops.constant_value(
               [[[b'c', b'b', b'a'], [b'd']], [[b'h', b'h']]], ragged_rank=2)),
       #=========================================================================
       # 3 Batch Dimensions
@@ -131,70 +140,395 @@ class RaggedBatchGatherOpTest(ragged_test_util.RaggedTensorTestCase,
           descr=(
               'params: [B1, (B2), (B3), (P1)], indices: [B1, (B2), (B3), I], '
               'result: [B1, (B2), (B3), I]'),
-          params=ragged.constant_value(
+          params=ragged_factory_ops.constant_value(
               [[[['a', 'b', 'c'], ['d']], [['e', 'f']]]], ragged_rank=3),
-          indices=ragged.constant_value([[[[2, 0], [0, 0]], [[1, 0]]]],
-                                        ragged_rank=2),
-          expected=ragged.constant_value(
+          indices=ragged_factory_ops.constant_value(
+              [[[[2, 0], [0, 0]], [[1, 0]]]], ragged_rank=2),
+          expected=ragged_factory_ops.constant_value(
               [[[[b'c', b'a'], [b'd', b'd']], [[b'f', b'e']]]], ragged_rank=2)),
   ])
   def testRaggedBatchGather(self, descr, params, indices, expected):
-    result = ragged.batch_gather(params, indices)
+    result = ragged_batch_gather_ops.batch_gather(params, indices)
     self.assertRaggedEqual(result, expected)
 
+  @parameterized.parameters([
+      # Docstring example:
+      dict(
+          descr='Docstring example',
+          params=[['a', 'b', 'c'], ['d'], [], ['e']],
+          indices=[[1, 2, -1], [], [], [0, 10]],
+          expected=[['b', 'c', 'FOO'], [], [], ['e', 'FOO']],
+          default_value='FOO',
+      ),
+      # Dimensions:
+      # indices: [4]
+      # params: [2, (d1), (d2)]
+      dict(
+          descr='params: [2, (d1), (d2), indices: [4]',
+          indices=[1, 100, 0, -1],
+          params=[[['The', 'deal', 'came', 'about', '18', 'months', 'after',
+                    'Yahoo', '!', 'rejected', 'a', '47.5', '-', 'billion', '-',
+                    'dollar', 'takeover', 'offer', 'from', 'Microsoft', '.'],
+                   ['Trumpty', 'Dumpty', 'sat', 'on', 'a', 'wall']],
+                  [["It's", 'always', 'darkest', 'before', 'the', 'dawn']]],
+          expected=[[["It's", 'always', 'darkest', 'before', 'the', 'dawn']],
+                    [['$NONE^']],
+                    [['The', 'deal', 'came', 'about', '18', 'months', 'after',
+                      'Yahoo', '!', 'rejected', 'a', '47.5', '-', 'billion',
+                      '-', 'dollar', 'takeover', 'offer', 'from', 'Microsoft',
+                      '.'],
+                     ['Trumpty', 'Dumpty', 'sat', 'on', 'a', 'wall']],
+                    [['$NONE^']]],
+      ),
+      # Dimensions:
+      # params: [1, (d1)]
+      # indices: [3]
+      dict(
+          descr='params: rank 2, indices: rank 1',
+          params=[
+              ['Bruce', 'Wayne'],
+          ],
+          indices=[-1, 0, 1000],
+          expected=[['$NONE^'], ['Bruce', 'Wayne'], ['$NONE^']]
+      ),
+      # Dimensions:
+      # params: [1, (d1)]
+      # indices: [1, (d2)]
+      dict(
+          descr='Test underbound indices of shape [1, (d2)]',
+          params=[
+              ['The', 'deal', 'came', 'about', '18', 'months', 'after', 'Yahoo',
+               '!', 'rejected', 'a', '47.5', '-', 'billion', '-', 'dollar',
+               'takeover', 'offer', 'from', 'Microsoft', '.'],
+          ],
+          indices=[[8, -1]],
+          expected=[['!', '$NONE^']],
+      ),
+      dict(
+          descr='Test underbound indices of shape [2, (d2)]',
+          params=[
+              ['The', 'deal', 'came', 'about', '18', 'months', 'after', 'Yahoo',
+               '!', 'rejected', 'a', '47.5', '-', 'billion', '-', 'dollar',
+               'takeover', 'offer', 'from', 'Microsoft', '.'],
+              ['Who', 'let', 'the', 'dogs', 'out', '?'],
+          ],
+          indices=[[8, -1], [1, 100]],
+          expected=[['!', '$NONE^'], ['let', '$NONE^']],
+      ),
+      # Dimensions:
+      # params: [2, (d1)]
+      # indices: [2, (d2)]
+      dict(
+          descr='Test underbound indices of rank 2',
+          params=[
+              ['The', 'deal', 'came', 'about', '18', 'months', 'after', 'Yahoo',
+               '!', 'rejected', 'a', '47.5', '-', 'billion', '-', 'dollar',
+               'takeover', 'offer', 'from', 'Microsoft', '.'],
+              ['He', 'left', 'us', '.', 'Little', 'boys', 'crowded', 'together',
+               'on', 'long', 'wooden', 'benches', ',', 'and', 'in', 'the',
+               'center', 'of', 'the', 'room', 'sat', 'the', 'teacher', '.',
+               'His', 'black', 'beard', 'dripped', 'down', 'over', 'the',
+               'front', 'of', 'his', 'coat', '.', 'One', 'white', 'hand',
+               'poised', 'a', 'stick', 'above', 'his', 'desk', '.', 'He',
+               'turned', 'his', 'surly', ',', 'half', '-', 'closed', 'eyes',
+               'toward', 'us', ',', 'stared', 'for', 'a', 'second', ',', 'then',
+               'shouted', 'in', 'Yiddish', ',', '``', 'One', ',', 'two', ',',
+               'three', "''", '!', '!', 'Rapping', 'the', 'stick', 'against',
+               'the', 'desk', '.', 'The', 'little', 'boys', 'shrilled', 'out',
+               'a', 'Yiddish', 'translation', 'or', 'interpretation', 'of',
+               'the', 'Five', 'Books', 'of', 'Moses', ',', 'which', 'they',
+               'had', 'previously', 'chanted', 'in', 'Hebrew', '.']],
+          indices=[[8, -1], [3, 23, 35, 45, 75, 83, -121]],
+          expected=[['!', '$NONE^'], ['.', '.', '.', '.', '!', '.', '$NONE^']],
+      ),
+      dict(
+          descr='Test overbound indices of rank 2',
+          params=[
+              ['The', 'deal', 'came', 'about', '18', 'months', 'after', 'Yahoo',
+               '!', 'rejected', 'a', '47.5', '-', 'billion', '-', 'dollar',
+               'takeover', 'offer', 'from', 'Microsoft', '.'],
+              ['He', 'left', 'us', '.', 'Little', 'boys', 'crowded', 'together',
+               'on', 'long', 'wooden', 'benches', ',', 'and', 'in', 'the',
+               'center', 'of', 'the', 'room', 'sat', 'the', 'teacher', '.',
+               'His', 'black', 'beard', 'dripped', 'down', 'over', 'the',
+               'front', 'of', 'his', 'coat', '.', 'One', 'white', 'hand',
+               'poised', 'a', 'stick', 'above', 'his', 'desk', '.', 'He',
+               'turned', 'his', 'surly', ',', 'half', '-', 'closed', 'eyes',
+               'toward', 'us', ',', 'stared', 'for', 'a', 'second', ',', 'then',
+               'shouted', 'in', 'Yiddish', ',', '``', 'One', ',', 'two', ',',
+               'three', "''", '!', '!', 'Rapping', 'the', 'stick', 'against',
+               'the', 'desk', '.', 'The', 'little', 'boys', 'shrilled', 'out',
+               'a', 'Yiddish', 'translation', 'or', 'interpretation', 'of',
+               'the', 'Five', 'Books', 'of', 'Moses', ',', 'which', 'they',
+               'had', 'previously', 'chanted', 'in', 'Hebrew', '.']],
+          indices=[[8, 8823], [3, 23, 35, 45, 75, 83, 1234]],
+          expected=[['!', '$NONE^'], ['.', '.', '.', '.', '!', '.', '$NONE^']],
+      ),
+      # Dimensions:
+      # params: [2, (d1), 2]
+      # indices: [2, (d2)]
+      dict(
+          descr='params: rank 3, indices: rank 2',
+          params=[
+              [['The', 'deal'], ['takeover', 'offer'], ['from', 'Microsoft']],
+              [['Who', 'let'], ['the', 'dogs'], ['out', '?']],
+          ],
+          ragged_rank=1,
+          indices=[[1, -1, 2, 30], [1, 100]],
+          indices_ragged_rank=1,
+          expected=[[['takeover', 'offer'],
+                     ['$NONE^', '$NONE^'],
+                     ['from', 'Microsoft'],
+                     ['$NONE^', '$NONE^']],
+                    [['the', 'dogs'],
+                     ['$NONE^', '$NONE^']]],
+          expected_ragged_rank=1,
+          default_value=['$NONE^', '$NONE^'],
+      ),
+      # Dimensions:
+      # params: [2, (d1), (d2)]
+      # indices: [2, (d3)]
+      dict(
+          descr='params: [2, (d1), (d2)], indices: [2, (d3)]',
+          params=[
+              [['The', 'deal', 'came', 'about', '18', 'months', 'after',
+                'Yahoo', '!', 'rejected', 'a', '47.5', '-', 'billion', '-',
+                'dollar', 'takeover', 'offer', 'from', 'Microsoft', '.'],
+               ['Trumpty', 'Dumpty', 'sat', 'on', 'a', 'wall'],
+              ],
+              [['It\'s', 'always', 'darkest', 'before', 'the', 'dawn']]
+          ],
+          indices=[[1, 100], [0, -1]],
+          expected=[[['Trumpty', 'Dumpty', 'sat', 'on', 'a', 'wall'],
+                     ['$NONE^']],
+                    [["It's", 'always', 'darkest', 'before', 'the', 'dawn'],
+                     ['$NONE^']]]
+      ),
+      # Dimensions:
+      # params: [2, (d1), (d2)]
+      # indices: [2, (d1), (d3)]
+      dict(
+          descr='Test overbound indices of rank 3',
+          params=[
+              [['The', 'deal', 'came', 'about', '18', 'months', 'after',
+                'Yahoo', '!', 'rejected', 'a', '47.5', '-', 'billion', '-',
+                'dollar', 'takeover', 'offer', 'from', 'Microsoft', '.'],
+               ['Foo', 'bar', 'mar']],
+              [['He', 'left', 'us', '.', 'Little', 'boys', 'crowded',
+                'together', 'on', 'long', 'wooden', 'benches', ',', 'and', 'in',
+                'the', 'center', 'of', 'the', 'room', 'sat', 'the', 'teacher',
+                '.', 'His', 'black', 'beard', 'dripped', 'down', 'over', 'the',
+                'front', 'of', 'his', 'coat', '.', 'One', 'white', 'hand',
+                'poised', 'a', 'stick', 'above', 'his', 'desk', '.', 'He',
+                'turned', 'his', 'surly', ',', 'half', '-', 'closed', 'eyes',
+                'toward', 'us', ',', 'stared', 'for', 'a', 'second', ',',
+                'then', 'shouted', 'in', 'Yiddish', ',', '``', 'One', ',',
+                'two', ',',
+                'three', "''", '!', '!', 'Rapping', 'the', 'stick', 'against',
+                'the', 'desk', '.', 'The', 'little', 'boys', 'shrilled', 'out',
+                'a', 'Yiddish', 'translation', 'or', 'interpretation', 'of',
+                'the', 'Five', 'Books', 'of', 'Moses', ',', 'which', 'they',
+                'had', 'previously', 'chanted', 'in', 'Hebrew', '.'],
+               ['I', 'too', 'was', 'hustled', 'scammed', 'bamboozled', 'hood',
+                'winked', 'lead', 'astray']]
+          ],
+          indices=[[[8, 8823], [0, 100]], [[3, 23, 35, 45, 75, 83, 1234], [5]]],
+          expected=[[['!', '$NONE^'], ['Foo', '$NONE^']],
+                    [['.', '.', '.', '.', '!', '.', '$NONE^'],
+                     ['bamboozled']]],
+      ),
+      # params.shape = [2, (d1), 8]
+      # indices.shape = [2, (d1), 3]
+      dict(
+          descr='params = [2, (2, 1), 8], indices = [2, (2, 1), 3]',
+          params=[[['h'] * 8, ['w'] * 8], [['b'] * 8]],
+          ragged_rank=1,
+          indices=[[[0, 100, 1], [0, 1, 0]], [[1, 0, 0]]],
+          indices_ragged_rank=1,
+          expected=[[['h', '$NONE^', 'h'], ['w', 'w', 'w']], [['b', 'b', 'b']]],
+          expected_ragged_rank=1,
+      ),
+  ])
+  def testRaggedBatchGatherWithDefault(
+      self, descr, params, indices, expected, indices_ragged_rank=None,
+      expected_ragged_rank=None, ragged_rank=None, default_value='$NONE^'):
+    params = ragged_factory_ops.constant(params, ragged_rank=ragged_rank)
+    indices = ragged_factory_ops.constant(
+        indices, ragged_rank=indices_ragged_rank or ragged_rank)
+    expected = ragged_factory_ops.constant(
+        expected, ragged_rank=expected_ragged_rank or ragged_rank)
+    result = ragged_batch_gather_with_default_op.batch_gather_with_default(
+        params, indices, default_value)
+    self.assertRaggedEqual(result, expected)
+
+  @parameterized.parameters([
+      # Dimensions:
+      #  params: dims [2, 5], indices: [2, 2]
+      dict(
+          descr='params: dims [2, 5], indices: [2, 2]',
+          params=[
+              ['The', 'deal', 'came', 'about', '18'],
+              ['He', 'left', 'us', '.', 'Little']],
+          indices=[[0, -1], [3, 121]],
+          expected=[['The', '$NONE^'], ['.', '$NONE^']],
+          default_value='$NONE^',
+      ),
+      # Dimensions:
+      #  params: dims [2, 2, 5], indices: [2, 2]
+      dict(
+          descr='params: dims [2, 2, 5], indices: [2, 2]',
+          params=[
+              [['The', 'deal', 'came', 'about', '18'],
+               ['The', 'deal', 'came', 'about', '19'],
+              ],
+              [['He', 'left', 'us', '.', 'Little'],
+               ['The', 'deal', 'came', 'about', '20'],
+              ]
+          ],
+          indices=[[0, -1], [0, 121]],
+          expected=[[['The', 'deal', 'came', 'about', '18'],
+                     ['$NONE^', '$NONE^', '$NONE^', '$NONE^', '$NONE^']],
+                    [['He', 'left', 'us', '.', 'Little'],
+                     ['$NONE^', '$NONE^', '$NONE^', '$NONE^', '$NONE^']]],
+          default_value='$NONE^',
+      ),
+      # Test default_value with shape [5]
+      dict(
+          descr='params: dims [2, 2, 5], indices: [2, 2]',
+          params=[
+              [['The', 'deal', 'came', 'about', '18'],
+               ['The', 'deal', 'came', 'about', '19'],
+              ],
+              [['He', 'left', 'us', '.', 'Little'],
+               ['The', 'deal', 'came', 'about', '20'],
+              ]
+          ],
+          indices=[[0, -1], [0, 121]],
+          expected=[[['The', 'deal', 'came', 'about', '18'],
+                     [':FOO:', ':FOO:', ':FOO:', ':FOO:', ':FOO:']],
+                    [['He', 'left', 'us', '.', 'Little'],
+                     [':FOO:', ':FOO:', ':FOO:', ':FOO:', ':FOO:']]],
+          default_value=[':FOO:', ':FOO:', ':FOO:', ':FOO:', ':FOO:'],
+      ),
+  ])
+  def testRaggedBatchGatherWithDefaultOnTensors(
+      self, descr, params, indices, expected, default_value):
+    params = constant_op.constant(params)
+    indices = constant_op.constant(indices)
+    expected = constant_op.constant(expected)
+    result = ragged_batch_gather_with_default_op.batch_gather_with_default(
+        params, indices, default_value)
+    self.assertAllEqual(expected, result)
+
+  @parameterized.parameters([
+      dict(
+          params=[['The', 'deal', 'came', 'about', '18', 'months', 'after',
+                   'Yahoo', '!', 'rejected', 'a', '47.5', '-', 'billion', '-',
+                   'dollar', 'takeover', 'offer', 'from', 'Microsoft', '.']],
+          indices=[[[8, -1]]],
+          # Exception here because different errors are thrown in eager vs
+          # graph mode.
+          error=Exception,
+          default_value='$NONE^',
+      ),
+  ])
+  def testRankMismatch(
+      self, params, indices, default_value, error):
+    params = ragged_factory_ops.constant(params)
+    indices = ragged_factory_ops.constant(indices)
+    with self.assertRaises(error):
+      _ = ragged_batch_gather_with_default_op.batch_gather_with_default(
+          params, indices, default_value)
+
+  @parameterized.parameters([
+      # Dimensions:
+      # params: [2, (d1), 2]
+      # indices: [2, (d2)]
+      # default_value: []
+      dict(
+          descr='params: rank 3, indices: rank 2, default: rank = [], but'
+          ' should be [2]',
+          params=[
+              [['The', 'deal'], ['takeover', 'offer'], ['from', 'Microsoft']],
+              [['Who', 'let'], ['the', 'dogs'], ['out', '?']],
+          ],
+          ragged_rank=1,
+          indices=[[1, -1, 2, 30], [1, 100]],
+          indices_ragged_rank=1,
+          default_value='$NONE^',
+          error=Exception,
+      )
+  ])
+  def testInvalidDefaultValueRank(
+      self, descr, params, indices, default_value, error, ragged_rank=None,
+      indices_ragged_rank=None):
+    params = ragged_factory_ops.constant(params, ragged_rank=ragged_rank)
+    indices = ragged_factory_ops.constant(
+        indices, ragged_rank=indices_ragged_rank)
+    with self.assertRaises(error):
+      _ = ragged_batch_gather_with_default_op.batch_gather_with_default(
+          params, indices, default_value)
+
   def testRaggedBatchGatherUnknownRankError(self):
     if context.executing_eagerly():
       return
     params = [['a', 'b'], ['c', 'd']]
     indices = array_ops.placeholder(dtypes.int32, shape=None)
-    ragged_indices = ragged.RaggedTensor.from_row_splits(indices, [0, 2, 4])
+    ragged_indices = ragged_tensor.RaggedTensor.from_row_splits(
+        indices, [0, 2, 4])
 
     with self.assertRaisesRegexp(
         ValueError, 'batch_gather does not allow indices with unknown shape.'):
-      ragged.batch_gather(params, indices)
+      ragged_batch_gather_ops.batch_gather(params, indices)
 
     with self.assertRaisesRegexp(
         ValueError, 'batch_gather does not allow indices with unknown shape.'):
-      ragged.batch_gather(params, ragged_indices)
+      ragged_batch_gather_ops.batch_gather(params, ragged_indices)
 
-  @parameterized.parameters([
-      dict(
-          params=ragged.constant_value([['a'], ['b'], ['c']]),
-          indices=ragged.constant_value([[0], [0]]),
-          message='Dimensions 3 and 2 are not compatible'),
-      dict(
-          params=[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
-          indices=ragged.constant_value([[[0, 0], [0, 0, 0]], [[0]]]),
-          message='batch shape from indices does not match params shape'),
-      dict(  # rank mismatch
-          params=ragged.constant_value([[[0, 0], [0, 0, 0]], [[0]]]),
-          indices=ragged.constant_value([[[0, 0]], [[0, 0, 0]], [[0]]]),
-          error=(ValueError, errors.InvalidArgumentError)),
-      dict(
-          params=ragged.constant_value([[[0, 0], [0, 0, 0]], [[0]], [[0]]]),
-          indices=ragged.constant_value([[[0, 0]], [[0, 0, 0]], [[0]]]),
-          error=errors.InvalidArgumentError,
-          message='.*Condition x == y did not hold.*'),
-      dict(
-          params=ragged.constant_value(['a', 'b', 'c']),
-          indices=ragged.constant_value([[0], [0]]),
-          message='batch shape from indices does not match params shape'),
-      dict(
-          params=ragged.constant_value([['a']]),
-          indices=0,
-          message='indices.rank must be at least 1.'),
-      dict(
-          params=ragged.constant_value([['a']]),
-          indices=[[[0]]],
-          message='batch shape from indices does not match params shape'),
-  ])
+  @parameterized.parameters(
+      [
+          dict(
+              params=ragged_factory_ops.constant_value([['a'], ['b'], ['c']]),
+              indices=ragged_factory_ops.constant_value([[0], [0]]),
+              message='Dimensions 3 and 2 are not compatible'),
+          dict(
+              params=[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
+              indices=ragged_factory_ops.constant_value([[[0, 0], [0, 0, 0]],
+                                                         [[0]]]),
+              message='batch shape from indices does not match params shape'),
+          dict(  # rank mismatch
+              params=ragged_factory_ops.constant_value([[[0, 0], [0, 0, 0]],
+                                                        [[0]]]),
+              indices=ragged_factory_ops.constant_value([[[0, 0]], [[0, 0, 0]],
+                                                         [[0]]]),
+              error=(ValueError, errors.InvalidArgumentError)),
+          dict(
+              params=ragged_factory_ops.constant_value([[[0, 0], [0, 0, 0]],
+                                                        [[0]], [[0]]]),
+              indices=ragged_factory_ops.constant_value([[[0, 0]], [[0, 0, 0]],
+                                                         [[0]]]),
+              error=errors.InvalidArgumentError,
+              message='.*Condition x == y did not hold.*'),
+          dict(
+              params=ragged_factory_ops.constant_value(['a', 'b', 'c']),
+              indices=ragged_factory_ops.constant_value([[0], [0]]),
+              message='batch shape from indices does not match params shape'),
+          dict(
+              params=ragged_factory_ops.constant_value([['a']]),
+              indices=0,
+              message='indices.rank must be at least 1.'),
+          dict(
+              params=ragged_factory_ops.constant_value([['a']]),
+              indices=[[[0]]],
+              message='batch shape from indices does not match params shape'),
+      ])
   def testRaggedBatchGatherStaticError(self,
                                        params,
                                        indices,
                                        message=None,
                                        error=ValueError):
     with self.assertRaisesRegexp(error, message):
-      ragged.batch_gather(params, indices)
+      ragged_batch_gather_ops.batch_gather(params, indices)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_batch_gather_ops.py b/tensorflow/python/ops/ragged/ragged_batch_gather_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c57aead9192f657442d8f6c86be267f83317b87
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_batch_gather_ops.py
@@ -0,0 +1,120 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Batch gather operations for RaggedTensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.ragged import ragged_conversion_ops
+from tensorflow.python.ops.ragged import ragged_gather_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_util
+
+
+#===============================================================================
+# ragged.batch_gather
+#===============================================================================
+def batch_gather(params, indices, name=None):
+  """Gathers slices from `params` according to `indices` with batch dims.
+
+  This operation is similar to `gather`, but it assumes that the leading `N`
+  dimensions of `indices` and `params` are batch dimensions, and performs a
+  gather within each batch.  In particular, when using this operation with `N`
+  batch dimensions `B1...BN`:
+
+  * `indices` has shape `[B1...BN, I]`
+  * `params` has shape `[B1...BN, P1...PM]`.
+  * `result` has shape `[B1...BN, I, P2...PM]`.
+  * `result[b1...bN, i, p2...pM] =
+    params[b1...bN, indices[b1...bN, i], p2...pM]`
+
+  Args:
+    params: A potentially ragged tensor with shape `[B1...BN, P1...PM]` (`N>=0`,
+      `M>0`).
+    indices: A potentially ragged tensor with shape `[B1...BN, I]` (`N>=0`).
+    name: A name for the operation (optional).
+
+  Returns:
+    A potentially ragged tensor with shape `[B1...BN, I, P2...PM]`.
+    `result.ragged_rank = max(indices.ragged_rank, params.ragged_rank)`.
+
+  #### Example:
+    ```python
+    >>> params = tf.ragged.constant([['a', 'b', 'c'], ['d'], [], ['e']])
+    >>> indices = tf.ragged.constant([[1, 2, 0], [], [], [0, 0]])
+    >>> tf.batch_gather(params, indices)
+    [['b', 'c', 'a'], [], [], ['e', 'e']]
+    ```
+  """
+  if not (ragged_tensor.is_ragged(params) or ragged_tensor.is_ragged(indices)):
+    return array_ops.batch_gather(params, indices, name)
+
+  with ops.name_scope(name, 'RaggedBatchGather', [params, indices]):
+    params = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        params, name='params')
+    indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        indices, name='indices')
+    indices_ndims = indices.shape.ndims
+    if indices_ndims is None:
+      raise ValueError(
+          'batch_gather does not allow indices with unknown shape.')
+    if indices_ndims == 0:
+      raise ValueError('indices.rank must be at least 1.')
+
+    if ragged_tensor.is_ragged(indices):
+      # If the outermost ragged dimension is a batch dimension, recurse.
+      if indices_ndims > 2:
+        if not ragged_tensor.is_ragged(params):
+          raise ValueError('batch shape from indices does '
+                           'not match params shape')
+        checks = [check_ops.assert_equal(params.row_splits, indices.row_splits)]
+        with ops.control_dependencies(checks):
+          return ragged_tensor.RaggedTensor.from_row_splits(
+              batch_gather(params.values, indices.values), indices.row_splits)
+
+      # Otherwise, indices is a 2D ragged tensor with 1 ragged dimension.
+      else:
+        # Ensure that `params` is ragged and has at least 2 dimensions.
+        if not ragged_tensor.is_ragged(params):
+          if params.shape.ndims is not None and params.shape.ndims < 2:
+            raise ValueError('batch shape from indices does '
+                             'not match params shape')
+          params = ragged_conversion_ops.from_tensor(params, ragged_rank=1)
+
+        # Adjust indices from within-batch to global (in params.values), and
+        # then use ragged.gather to gather them.
+        num_indices = indices.row_lengths()
+        params_starts = params.row_starts()
+        adjustments = ragged_util.repeat(params_starts, num_indices, axis=0)
+        adjusted_index_values = math_ops.to_int64(indices.values) + adjustments
+        return ragged_tensor.RaggedTensor.from_row_splits(
+            ragged_gather_ops.gather(params.values, adjusted_index_values),
+            indices.row_splits)
+
+    else:  # params is a RaggedTensor and indices is a Tensor.
+      if indices_ndims == 1:
+        return ragged_gather_ops.gather(params, indices)
+      elif indices_ndims == 2:
+        # Adjust indices from batch-local to global (in params.values)
+        adjustments = array_ops.expand_dims(params.row_starts(), 1)
+        adjusted_indices = math_ops.to_int64(indices) + adjustments
+        return ragged_gather_ops.gather(params.values, adjusted_indices)
+      else:
+        raise ValueError('batch shape from indices does not match params shape')
diff --git a/tensorflow/python/ops/ragged/ragged_batch_gather_with_default_op.py b/tensorflow/python/ops/ragged/ragged_batch_gather_with_default_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d99540e3291dd1774e340e119b38601a9e48a6d
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_batch_gather_with_default_op.py
@@ -0,0 +1,186 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Array operations for RaggedTensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_dispatch  # pylint: disable=unused-import
+from tensorflow.python.ops.ragged import ragged_operators  # pylint: disable=unused-import
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_tensor_shape
+from tensorflow.python.ops.ragged import ragged_where_op
+
+
+#===============================================================================
+# ragged.batch_gather_with_default
+#===============================================================================
+def batch_gather_with_default(params,
+                              indices,
+                              default_value='',
+                              name=None):
+  """Same as `batch_gather` but inserts `default_value` for invalid indices.
+
+  This operation is similar to `batch_gather` except that it will substitute
+  the value for invalid indices with `default_value` as the contents.
+  See `batch_gather` for more details.
+
+
+  Args:
+    params: A potentially ragged tensor with shape `[B1...BN, P1...PM]` (`N>=0`,
+      `M>0`).
+    indices: A potentially ragged tensor with shape `[B1...BN, I]` (`N>=0`).
+    default_value: A value to be inserted in places where `indices` are out of
+      bounds. Must be the same dtype as params and either a scalar or rank 1.
+    name: A name for the operation (optional).
+
+  Returns:
+    A potentially ragged tensor with shape `[B1...BN, I, P2...PM]`.
+    `result.ragged_rank = max(indices.ragged_rank, params.ragged_rank)`.
+
+  #### Example:
+    ```python
+    >>> params = tf.ragged.constant([
+          ['a', 'b', 'c'],
+          ['d'],
+          [],
+          ['e']])
+    >>> indices = tf.ragged.constant([[1, 2, -1], [], [], [0, 10]])
+    >>> batch_gather_with_default(params, indices, 'FOO')
+    [['b', 'c', 'FOO'], [], [], ['e', 'FOO']]
+  ```
+  """
+  with ops.name_scope(name, 'RaggedBatchGatherWithDefault'):
+    params = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        params, name='params',
+    )
+    indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        indices, name='indices',
+    )
+    default_value = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        default_value, name='default_value',
+    )
+    # TODO(hterry): lift this restriction and support default_values of
+    #               of rank > 1
+    if (default_value.shape.ndims is not 0
+        and default_value.shape.ndims is not 1):
+      raise ValueError('"default_value" must be a scalar or vector')
+    upper_bounds = None
+    if indices.shape.ndims is None:
+      raise ValueError('Indices must have a known rank.')
+    if params.shape.ndims is None:
+      raise ValueError('Params must have a known rank.')
+
+    num_batch_dimensions = indices.shape.ndims - 1
+    pad = None
+    # The logic for this works as follows:
+    # - create a padded params, where:
+    #    padded_params[b1...bn, 0] = default_value
+    #    padded_params[b1...bn, i] = params[b1...bn, i-1] (i>0)
+    # - create an `upper_bounds` Tensor that contains the number of elements
+    #   in each innermost rank. Broadcast `upper_bounds` to be the same shape
+    #   as `indices`.
+    # - check to see which index in `indices` are out of bounds and substitute
+    #   it with the index containing `default_value` (the first).
+    # - call batch_gather with the indices adjusted.
+    with ops.control_dependencies([
+        check_ops.assert_greater_equal(array_ops.rank(params),
+                                       array_ops.rank(indices))]):
+      if ragged_tensor.is_ragged(params):
+        row_lengths = ragged_array_ops.expand_dims(
+            params.row_lengths(axis=num_batch_dimensions),
+            axis=-1)
+        upper_bounds = math_ops.cast(row_lengths, indices.dtype)
+
+        pad_shape = _get_pad_shape(params, indices)
+
+        pad = ragged_tensor_shape.broadcast_to(
+            default_value, pad_shape)
+      else:
+        params_shape = array_ops.shape(params)
+        pad_shape = array_ops.concat([
+            params_shape[:num_batch_dimensions],
+            [1],
+            params_shape[num_batch_dimensions + 1:params.shape.ndims]
+        ], 0)
+        upper_bounds = params_shape[num_batch_dimensions]
+        pad = array_ops.broadcast_to(default_value, pad_shape)
+
+      # Add `default_value` as the first value in the innermost (ragged) rank.
+      pad = math_ops.cast(pad, params.dtype)
+      padded_params = array_ops.concat(
+          [pad, params], axis=num_batch_dimensions)
+
+      # Adjust the indices by substituting out-of-bound indices to the
+      # default-value index (which is the first element)
+      shifted_indices = indices + 1
+      is_out_of_bounds = (indices < 0) | (indices > upper_bounds)
+      adjusted_indices = ragged_where_op.where(
+          is_out_of_bounds,
+          x=array_ops.zeros_like(indices), y=shifted_indices,
+      )
+      return array_ops.batch_gather(
+          params=padded_params, indices=adjusted_indices, name=name)
+
+
+def _get_pad_shape(params, indices):
+  """Gets the RaggedTensorDynamicShape for the pad tensor."""
+  num_batch_dimensions = indices.shape.ndims - 1
+  params_shape = ragged_tensor_shape.RaggedTensorDynamicShape.from_tensor(
+      params)
+
+  # We want to create a pad tensor that can be concatenated with the params.
+  if params.shape.ndims == indices.shape.ndims:
+    # When params and indices are the same rank, the shape of the pad tensor is
+    # almost identical to params, except the last dimension which has size = 1.
+    if params_shape.num_inner_dimensions is 0:
+      pad_dims = params_shape.partitioned_dim_sizes[:-1] + (
+          array_ops.ones_like(params_shape.partitioned_dim_sizes[-1]),)
+      return ragged_tensor_shape.RaggedTensorDynamicShape(
+          pad_dims, [])
+    else:
+      return ragged_tensor_shape.RaggedTensorDynamicShape(
+          params_shape.partitioned_dim_sizes,
+          array_ops.concat([params_shape.inner_dim_sizes[:-1], [1]], axis=0))
+  else:
+    # When the rank of indices < params, the pad has the same dimension as
+    # params up to the 'num_batch_dimensions' rank. Every dimension after that
+    # has size 1.
+    pad_dims = None
+    if num_batch_dimensions == 0:
+      pad_dims = (constant_op.constant(1, dtype=dtypes.int64),) + (
+          constant_op.constant([1], dtype=dtypes.int64),) * (
+              params_shape.num_partitioned_dimensions -
+              num_batch_dimensions - 1)
+    else:
+      batch_dimensions = params_shape.partitioned_dim_sizes[
+          :num_batch_dimensions]
+      gather_dimension = params_shape.partitioned_dim_sizes[
+          num_batch_dimensions]
+      pad_dims = batch_dimensions + (
+          array_ops.ones_like(gather_dimension),) * (
+              params_shape.num_partitioned_dimensions - num_batch_dimensions)
+
+    return ragged_tensor_shape.RaggedTensorDynamicShape(
+        pad_dims, params_shape.inner_dim_sizes)
diff --git a/tensorflow/python/ops/ragged/ragged_boolean_mask_op_test.py b/tensorflow/python/ops/ragged/ragged_boolean_mask_op_test.py
index b0f7459322792aeafaadd4db18ecd30105e8e74c..6f5fad13fb4afe9fdc0591dce71b5d33d0f005dd 100644
--- a/tensorflow/python/ops/ragged/ragged_boolean_mask_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_boolean_mask_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for ragged.boolean_mask."""
+"""Tests for ragged_array_ops.boolean_mask."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -25,7 +25,8 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
@@ -54,25 +55,25 @@ class RaggedBooleanMaskOpTest(ragged_test_util.RaggedTensorTestCase,
           data=[[1, 2, 3], [4, 5, 6], [7, 8, 9]],
           mask=[[T, F, T], [F, F, F], [T, F, F]],
           keepdims=True,
-          expected=ragged.constant_value([[1, 3], [], [7]])),
+          expected=ragged_factory_ops.constant_value([[1, 3], [], [7]])),
       dict(
           descr='Docstring example 3',
-          data=ragged.constant_value([[1, 2, 3], [4], [5, 6]]),
-          mask=ragged.constant_value([[F, F, T], [F], [T, T]]),
+          data=ragged_factory_ops.constant_value([[1, 2, 3], [4], [5, 6]]),
+          mask=ragged_factory_ops.constant_value([[F, F, T], [F], [T, T]]),
           keepdims=False,
           expected=[3, 5, 6]),
       dict(
           descr='Docstring example 4',
-          data=ragged.constant_value([[1, 2, 3], [4], [5, 6]]),
-          mask=ragged.constant_value([[F, F, T], [F], [T, T]]),
+          data=ragged_factory_ops.constant_value([[1, 2, 3], [4], [5, 6]]),
+          mask=ragged_factory_ops.constant_value([[F, F, T], [F], [T, T]]),
           keepdims=True,
-          expected=ragged.constant_value([[3], [], [5, 6]])),
+          expected=ragged_factory_ops.constant_value([[3], [], [5, 6]])),
       dict(
           descr='Docstring example 5',
-          data=ragged.constant_value([[1, 2, 3], [4], [5, 6]]),
+          data=ragged_factory_ops.constant_value([[1, 2, 3], [4], [5, 6]]),
           mask=[True, False, True],
           keepdims=False,
-          expected=ragged.constant_value([[1, 2, 3], [5, 6]])),
+          expected=ragged_factory_ops.constant_value([[1, 2, 3], [5, 6]])),
       #=========================================================================
       # Uniform data and uniform mask.
       #=========================================================================
@@ -93,7 +94,8 @@ class RaggedBooleanMaskOpTest(ragged_test_util.RaggedTensorTestCase,
           data=[[1, 2, 3], [4, 5, 6], [7, 8, 9], [0, 1, 2], [3, 4, 5]],
           mask=[[F, F, F], [T, F, T], [T, T, T], [F, F, F], [T, T, F]],
           keepdims=True,
-          expected=ragged.constant_value([[], [4, 6], [7, 8, 9], [], [3, 4]])),
+          expected=ragged_factory_ops.constant_value(
+              [[], [4, 6], [7, 8, 9], [], [3, 4]])),
       dict(
           descr='data.shape=[3, 2, 2]; mask.shape=[3]; keepdims=True',
           data=[[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[2, 4], [6, 8]]],
@@ -111,8 +113,9 @@ class RaggedBooleanMaskOpTest(ragged_test_util.RaggedTensorTestCase,
           data=[[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[2, 4], [6, 8]]],
           mask=[[T, F], [T, T], [F, F]],
           keepdims=True,
-          expected=ragged.constant_value([[[1, 2]], [[5, 6], [7, 8]], []],
-                                         ragged_rank=1)),
+          expected=ragged_factory_ops.constant_value(
+              [[[1, 2]], [[5, 6], [7, 8]], []],
+              ragged_rank=1)),
       dict(
           descr='data.shape=[3, 2, 2]; mask.shape=[3, 2]; keepdims=False',
           data=[[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[2, 4], [6, 8]]],
@@ -124,7 +127,7 @@ class RaggedBooleanMaskOpTest(ragged_test_util.RaggedTensorTestCase,
           data=[[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[2, 4], [6, 8]]],
           mask=[[[T, T], [F, T]], [[F, F], [F, F]], [[T, F], [T, T]]],
           keepdims=True,
-          expected=ragged.constant_value(
+          expected=ragged_factory_ops.constant_value(
               [[[1, 2], [4]], [[], []], [[2], [6, 8]]])),
       dict(
           descr='data.shape=mask.shape=[2, 2, 2, 2]; keepdims=True',
@@ -133,7 +136,7 @@ class RaggedBooleanMaskOpTest(ragged_test_util.RaggedTensorTestCase,
           mask=[[[[T, T], [F, F]], [[T, F], [F, F]]],
                 [[[F, F], [F, F]], [[T, T], [T, F]]]],
           keepdims=True,
-          expected=ragged.constant_value(
+          expected=ragged_factory_ops.constant_value(
               [[[[1, 2], []], [[5], []]], [[[], []], [[1, 3], [5]]]])),
       dict(
           descr='data.shape=mask.shape=[2, 2, 2, 2]; keepdims=False',
@@ -149,63 +152,64 @@ class RaggedBooleanMaskOpTest(ragged_test_util.RaggedTensorTestCase,
       #=========================================================================
       dict(
           descr='data.shape=[5, (D2)]; mask.shape=[5, (D2)]',
-          data=ragged.constant_value(
+          data=ragged_factory_ops.constant_value(
               [[1, 2], [3, 4, 5, 6], [7, 8, 9], [], [1, 2, 3]]),
-          mask=ragged.constant_value(
+          mask=ragged_factory_ops.constant_value(
               [[F, F], [F, T, F, T], [F, F, F], [], [T, F, T]]),
           keepdims=True,
-          expected=ragged.constant_value([[], [4, 6], [], [], [1, 3]])),
+          expected=ragged_factory_ops.constant_value(
+              [[], [4, 6], [], [], [1, 3]])),
       dict(
           descr='data.shape=[3, (D2), (D3)]; mask.shape=[3, (D2)]',
-          data=ragged.constant_value(
+          data=ragged_factory_ops.constant_value(
               [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[2, 4], [6, 8]]]),
-          mask=ragged.constant_value([[T, F], [T, T], [F, F]]),
+          mask=ragged_factory_ops.constant_value([[T, F], [T, T], [F, F]]),
           keepdims=True,
-          expected=ragged.constant_value(
+          expected=ragged_factory_ops.constant_value(
               [[[1, 2]], [[5, 6], [7, 8]], []])),
       dict(
           descr='data.shape=[3, (D2), (D3)]; mask.shape=[3, (D2)]',
-          data=ragged.constant_value(
+          data=ragged_factory_ops.constant_value(
               [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[2, 4], [6, 8]]]),
-          mask=ragged.constant_value([[T, F], [T, T], [F, F]]),
+          mask=ragged_factory_ops.constant_value([[T, F], [T, T], [F, F]]),
           keepdims=False,
-          expected=ragged.constant_value([[1, 2], [5, 6], [7, 8]])),
+          expected=ragged_factory_ops.constant_value([[1, 2], [5, 6], [7, 8]])),
       dict(
           descr='data.shape=[3, (D2), D3]; mask.shape=[3, (D2)]',
-          data=ragged.constant_value(
+          data=ragged_factory_ops.constant_value(
               [[[1, 2], [3, 4]], [[5, 6], [7, 8], [2, 4]], [[6, 8]]],
               ragged_rank=1),
-          mask=ragged.constant_value([[T, F], [T, T, F], [F]]),
+          mask=ragged_factory_ops.constant_value([[T, F], [T, T, F], [F]]),
           keepdims=True,
-          expected=ragged.constant_value(
+          expected=ragged_factory_ops.constant_value(
               [[[1, 2]], [[5, 6], [7, 8]], []],
               ragged_rank=1)),
       dict(
           descr='data.shape=[3, (D2), D3]; mask.shape=[3, (D2)]',
-          data=ragged.constant_value(
+          data=ragged_factory_ops.constant_value(
               [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[2, 4], [6, 8]]],
               ragged_rank=1),
-          mask=ragged.constant_value([[T, F], [T, T], [F, F]]),
+          mask=ragged_factory_ops.constant_value([[T, F], [T, T], [F, F]]),
           keepdims=False,
           expected=[[1, 2], [5, 6], [7, 8]]),
       dict(
           descr='data.shape=[3, (D2), (D3)]; mask.shape=[3, (D2), (D3)]',
-          data=ragged.constant_value(
+          data=ragged_factory_ops.constant_value(
               [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[2, 4]]]),
-          mask=ragged.constant_value(
+          mask=ragged_factory_ops.constant_value(
               [[[T, T], [F, T]], [[F, F], [F, F]], [[T, F]]]),
           keepdims=True,
-          expected=ragged.constant_value(
+          expected=ragged_factory_ops.constant_value(
               [[[1, 2], [4]], [[], []], [[2]]])),
       dict(
           descr=('data.shape=[3, (D2), (D3), (D4)]; '
                  'mask.shape=[3, (D2), (D3), (D4)]'),
-          data=ragged.constant_value(
+          data=ragged_factory_ops.constant_value(
               [[[[1, 2], [3, 4]], [[5, 6]]], [[[2, 4], [6, 8]]]]),
-          mask=ragged.constant_value(
+          mask=ragged_factory_ops.constant_value(
               [[[[T, T], [F, F]], [[T, F]]], [[[F, F], [T, T]]]]),
           keepdims=True,
-          expected=ragged.constant_value(
+          expected=ragged_factory_ops.constant_value(
               [[[[1, 2], []], [[5]]], [[[], [6, 8]]]])),
 
       #=========================================================================
@@ -214,125 +218,132 @@ class RaggedBooleanMaskOpTest(ragged_test_util.RaggedTensorTestCase,
       dict(
           descr='data.shape=[2, 3]; mask.shape=[2, (3)]',
           data=[[1, 2, 3], [4, 5, 6]],
-          mask=ragged.constant_value([[T, F, F], [F, T, T]]),
+          mask=ragged_factory_ops.constant_value([[T, F, F], [F, T, T]]),
           keepdims=True,
-          expected=ragged.constant_value([[1], [5, 6]])),
+          expected=ragged_factory_ops.constant_value([[1], [5, 6]])),
       dict(
           descr='data.shape=[2, 3, 2]; mask.shape=[2, (3)]',
           data=[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 0], [2, 4]]],
-          mask=ragged.constant_value([[T, F, F], [F, T, T]]),
+          mask=ragged_factory_ops.constant_value([[T, F, F], [F, T, T]]),
           keepdims=True,
-          expected=ragged.constant_value(
+          expected=ragged_factory_ops.constant_value(
               [[[1, 2]], [[9, 0], [2, 4]]],
               ragged_rank=1)),
       dict(
           descr='data.shape=[2, 3, 2]; mask.shape=[2, (3), 2]',
           data=[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 0], [2, 4]]],
-          mask=ragged.constant_value(
+          mask=ragged_factory_ops.constant_value(
               [[[T, F], [F, F], [T, T]], [[T, F], [F, T], [F, F]]],
               ragged_rank=1),
           keepdims=True,
-          expected=ragged.constant_value([[[1], [], [5, 6]], [[7], [0], []]])),
+          expected=ragged_factory_ops.constant_value(
+              [[[1], [], [5, 6]], [[7], [0], []]])),
 
       #=========================================================================
       # Ragged data and uniform mask.
       #=========================================================================
       dict(
           descr='data.shape=[4, (D2)]; mask.shape=[4]',
-          data=ragged.constant_value([[1, 2, 3], [4], [], [5, 6]]),
+          data=ragged_factory_ops.constant_value([[1, 2, 3], [4], [], [5, 6]]),
           mask=[T, F, T, F],
           keepdims=False,
-          expected=ragged.constant_value([[1, 2, 3], []])),
+          expected=ragged_factory_ops.constant_value([[1, 2, 3], []])),
       dict(
           descr='data.shape=[4, (D2), (D3)]; mask.shape=[4]',
-          data=ragged.constant_value(
+          data=ragged_factory_ops.constant_value(
               [[[1, 2, 3]], [[4], []], [[5, 6]], []]),
           mask=[T, F, T, T],
           keepdims=False,
-          expected=ragged.constant_value([[[1, 2, 3]], [[5, 6]], []])),
+          expected=ragged_factory_ops.constant_value(
+              [[[1, 2, 3]], [[5, 6]], []])),
       dict(
           descr='data.shape=[4, (D2), 2]; mask.shape=[4]',
-          data=ragged.constant_value(
+          data=ragged_factory_ops.constant_value(
               [[[1, 2], [3, 4]], [], [[5, 6]], [[7, 8], [9, 0], [1, 2]]],
               ragged_rank=1),
           mask=[T, F, F, T],
           keepdims=False,
-          expected=ragged.constant_value(
+          expected=ragged_factory_ops.constant_value(
               [[[1, 2], [3, 4]], [[7, 8], [9, 0], [1, 2]]],
               ragged_rank=1)),
       dict(
           descr='data.shape=[4, (D2), 2]; mask.shape=[4]',
-          data=ragged.constant_value(
+          data=ragged_factory_ops.constant_value(
               [[[1, 2], [3, 4]], [], [[5, 6]], [[7, 8], [9, 0], [1, 2]]],
               ragged_rank=1),
           mask=[T, F, F, T],
           keepdims=True,
-          expected=ragged.constant_value(
+          expected=ragged_factory_ops.constant_value(
               [[[1, 2], [3, 4]], [[7, 8], [9, 0], [1, 2]]],
               ragged_rank=1)),
       dict(
           descr='data.shape=[1, (2)]; mask.shape=[1, 2]',
-          data=ragged.constant_value([[1, 2]]),
+          data=ragged_factory_ops.constant_value([[1, 2]]),
           mask=[[T, F]],
           keepdims=True,
-          expected=ragged.constant_value([[1]])),
+          expected=ragged_factory_ops.constant_value([[1]])),
       dict(
           descr='data.shape=[2, (2), (D3)]; mask.shape=[2, 2]',
-          data=ragged.constant_value([[[1], [2, 3]], [[], [4, 5, 6]]]),
+          data=ragged_factory_ops.constant_value(
+              [[[1], [2, 3]], [[], [4, 5, 6]]]),
           mask=[[T, F], [T, T]],
           keepdims=True,
-          expected=ragged.constant_value([[[1]], [[], [4, 5, 6]]])),
+          expected=ragged_factory_ops.constant_value([[[1]], [[], [4, 5, 6]]])),
       dict(
           descr='data.shape=[2, (2), 3]; mask.shape=[2, 2]',
-          data=ragged.constant_value(
+          data=ragged_factory_ops.constant_value(
               [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [2, 4, 6]]],
               ragged_rank=1),
           mask=[[T, F], [T, T]],
           keepdims=True,
-          expected=ragged.constant_value(
+          expected=ragged_factory_ops.constant_value(
               [[[1, 2, 3]], [[7, 8, 9], [2, 4, 6]]],
               ragged_rank=1)),
       dict(
           descr='data.shape=[2, (2), 3]; mask.shape=[2, 2, 3]',
-          data=ragged.constant_value(
+          data=ragged_factory_ops.constant_value(
               [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [2, 4, 6]]],
               ragged_rank=1),
           mask=[[[T, F, F], [T, F, T]], [[T, F, T], [F, F, F]]],
           keepdims=True,
-          expected=ragged.constant_value([[[1], [4, 6]], [[7, 9], []]])),
+          expected=ragged_factory_ops.constant_value(
+              [[[1], [4, 6]], [[7, 9], []]])),
   ])  # pyformat: disable
   def testBooleanMask(self, descr, data, mask, keepdims, expected):
-    actual = ragged.boolean_mask(data, mask, keepdims=keepdims)
+    actual = ragged_array_ops.boolean_mask(data, mask, keepdims=keepdims)
     self.assertRaggedEqual(actual, expected)
 
   def testErrors(self):
     if not context.executing_eagerly():
       self.assertRaisesRegexp(ValueError,
-                              r'mask\.shape\.ndims must be kown statically',
-                              ragged.boolean_mask, [[1, 2]],
+                              r'mask\.shape\.ndims must be known statically',
+                              ragged_array_ops.boolean_mask, [[1, 2]],
                               array_ops.placeholder(dtypes.bool))
 
-    self.assertRaises(TypeError, ragged.boolean_mask, [[1, 2]], [[0, 1]])
+    self.assertRaises(TypeError, ragged_array_ops.boolean_mask, [[1, 2]],
+                      [[0, 1]])
     self.assertRaisesRegexp(
         ValueError, 'Tensor conversion requested dtype bool for '
-        'RaggedTensor with dtype int32', ragged.boolean_mask,
-        ragged.constant([[1, 2]]), ragged.constant([[0, 0]]))
+        'RaggedTensor with dtype int32', ragged_array_ops.boolean_mask,
+        ragged_factory_ops.constant([[1, 2]]),
+        ragged_factory_ops.constant([[0, 0]]))
 
     self.assertRaisesRegexp(
         ValueError, r'Shapes \(1, 2\) and \(1, 3\) are incompatible',
-        ragged.boolean_mask, [[1, 2]], [[True, False, True]])
+        ragged_array_ops.boolean_mask, [[1, 2]], [[True, False, True]])
 
     self.assertRaisesRegexp(errors.InvalidArgumentError,
                             r'Inputs must have identical ragged splits',
-                            ragged.boolean_mask, ragged.constant([[1, 2]]),
-                            ragged.constant([[True, False, True]]))
+                            ragged_array_ops.boolean_mask,
+                            ragged_factory_ops.constant([[1, 2]]),
+                            ragged_factory_ops.constant([[True, False, True]]))
 
     self.assertRaisesRegexp(ValueError, 'mask cannot be scalar',
-                            ragged.boolean_mask, [[1, 2]], True)
+                            ragged_array_ops.boolean_mask, [[1, 2]], True)
 
-    self.assertRaisesRegexp(ValueError,
-                            'mask cannot be scalar', ragged.boolean_mask,
-                            ragged.constant([[1, 2]]), True)
+    self.assertRaisesRegexp(ValueError, 'mask cannot be scalar',
+                            ragged_array_ops.boolean_mask,
+                            ragged_factory_ops.constant([[1, 2]]), True)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_concat_op_test.py b/tensorflow/python/ops/ragged/ragged_concat_op_test.py
index e72afb0448f5e7f7f4ab9aebefb712bfd7816133..62989d3025562db9af4b19d5a2922988591fe521 100644
--- a/tensorflow/python/ops/ragged/ragged_concat_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_concat_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for ragged.concat."""
+"""Tests for ragged_array_ops.concat."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -26,7 +26,8 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_concat_ops
+from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
@@ -38,8 +39,8 @@ class RaggedConcatOpTest(ragged_test_util.RaggedTensorTestCase,
   def _rt_inputs_to_tensors(self, rt_inputs, ragged_ranks=None):
     if ragged_ranks is None:
       ragged_ranks = [None] * len(rt_inputs)
-    return [
-        ragged.constant(rt_input, ragged_rank=rrank)
+    return [  # pylint: disable=g-long-ternary
+        ragged_factory_ops.constant(rt_input, ragged_rank=rrank)
         if rrank != 0 else constant_op.constant(rt_input)
         for (rt_input, rrank) in zip(rt_inputs, ragged_ranks)
     ]
@@ -234,7 +235,7 @@ class RaggedConcatOpTest(ragged_test_util.RaggedTensorTestCase,
                        expected_ragged_rank=None,
                        expected_shape=None):
     rt_inputs = self._rt_inputs_to_tensors(rt_inputs, ragged_ranks)
-    concatenated = ragged.concat(rt_inputs, axis)
+    concatenated = ragged_concat_ops.concat(rt_inputs, axis)
     if expected_ragged_rank is not None:
       self.assertEqual(concatenated.ragged_rank, expected_ragged_rank)
     if expected_shape is not None:
@@ -275,7 +276,8 @@ class RaggedConcatOpTest(ragged_test_util.RaggedTensorTestCase,
                       message=None,
                       ragged_ranks=None):
     rt_inputs = self._rt_inputs_to_tensors(rt_inputs, ragged_ranks)
-    self.assertRaisesRegexp(error, message, ragged.concat, rt_inputs, axis)
+    self.assertRaisesRegexp(error, message, ragged_concat_ops.concat, rt_inputs,
+                            axis)
 
   @parameterized.parameters([
       dict(
@@ -292,7 +294,7 @@ class RaggedConcatOpTest(ragged_test_util.RaggedTensorTestCase,
     rt_inputs = [
         array_ops.placeholder_with_default(rt, shape=None) for rt in rt_inputs
     ]
-    concatenated = ragged.concat(rt_inputs, axis)
+    concatenated = ragged_concat_ops.concat(rt_inputs, axis)
     with self.assertRaisesRegexp(error, message):
       self.evaluate(concatenated)
 
@@ -305,7 +307,7 @@ class RaggedConcatOpTest(ragged_test_util.RaggedTensorTestCase,
     ]
     self.assertRaisesRegexp(
         ValueError, r'axis may only be negative if ndims is statically known.',
-        ragged.concat, rt_inputs, -1)
+        ragged_concat_ops.concat, rt_inputs, -1)
 
   def testSingleTensorInput(self):
     """Tests ragged_concat with a single tensor input.
@@ -314,8 +316,8 @@ class RaggedConcatOpTest(ragged_test_util.RaggedTensorTestCase,
     also pass in a single value (as with tf.concat), in which case it simply
     returns that tensor.  This test exercises that path.
     """
-    rt_inputs = ragged.constant([[1, 2], [3, 4]])
-    concatenated = ragged.concat(rt_inputs, 0)
+    rt_inputs = ragged_factory_ops.constant([[1, 2], [3, 4]])
+    concatenated = ragged_concat_ops.concat(rt_inputs, 0)
     self.assertRaggedEqual(concatenated, [[1, 2], [3, 4]])
 
 
diff --git a/tensorflow/python/ops/ragged/ragged_concat_ops.py b/tensorflow/python/ops/ragged/ragged_concat_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f86b05e178a98f5c0afa9c201f83bb652ad8deb
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_concat_ops.py
@@ -0,0 +1,302 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Concat and stack operations for RaggedTensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_conversion_ops
+from tensorflow.python.ops.ragged import ragged_gather_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_util
+
+
+def concat(values, axis, name=None):
+  """Concatenates potentially ragged tensors along one dimension.
+
+  Given a list of tensors with the same rank `K` (`K >= axis`), returns a
+  rank-`K` `RaggedTensor` `result` such that `result[i0...iaxis]` is the
+  concatenation of `[rt[i0...iaxis] for rt in values]`.
+
+  Args:
+    values: A list of potentially ragged tensors.  May not be empty. All
+      `values` must have the same rank and the same dtype; but unlike
+      `tf.concat`, they can have arbitrary shapes.
+    axis: A python integer, indicating the dimension along which to concatenate.
+      (Note: Unlike `tf.concat`, the `axis` parameter must be statically known.)
+        Negative values are supported only if the rank of at least one
+        `values` value is statically known.
+    name: A name prefix for the returned tensor (optional).
+
+  Returns:
+    A `RaggedTensor` with rank `K`.
+    `result.ragged_rank=max(axis, max(rt.ragged_rank for rt in values]))`.
+
+  Raises:
+    ValueError: If `values` is empty, if `axis` is out of bounds or if
+      the input tensors have different ranks.
+
+  #### Example:
+    ```python
+    >>> t1 = tf.ragged.constant([[1, 2], [3, 4, 5]])
+    >>> t2 = tf.ragged.constant([[6], [7, 8, 9]])
+    >>> ragged.concat([t1, t2], axis=0)
+    [[1, 2], [3, 4, 5], [6], [7, 8, 9]]
+    >>> ragged.concat([t1, t2], axis=1)
+    [[1, 2, 6], [3, 4, 5, 7, 8, 9]]
+    ```
+  """
+  if not isinstance(values, (list, tuple)):
+    values = [values]
+  with ops.name_scope(name, 'RaggedConcat', values):
+    return _ragged_stack_concat_helper(values, axis, stack_values=False)
+
+
+def stack(values, axis=0, name=None):
+  """Stacks potentially ragged tensors along one dimension.
+
+  Given a list of tensors with the same rank `K` (`K >= axis`), returns a
+  rank-`K+1` `RaggedTensor` `result` such that `result[i0...iaxis]` is the
+  list `[rt[i0...iaxis] for rt in values]`.
+
+  Args:
+    values: A list of potentially ragged tensors.  May not be empty. All
+      `values` must have the same rank and the same dtype; but unlike
+      `tf.concat`, they can have arbitrary shapes.
+    axis: A python integer, indicating the dimension along which to stack.
+      (Note: Unlike `tf.stack`, the `axis` parameter must be statically known.)
+        Negative values are supported only if the rank of at least one
+        `values` value is statically known.
+    name: A name prefix for the returned tensor (optional).
+
+  Returns:
+    A `RaggedTensor` with rank `K+1`.
+    `result.ragged_rank=max(axis, max(rt.ragged_rank for rt in values]))`.
+
+  Raises:
+    ValueError: If `values` is empty, if `axis` is out of bounds or if
+      the input tensors have different ranks.
+
+  #### Example:
+    ```python
+    >>> t1 = tf.ragged.constant([[1, 2], [3, 4, 5]])
+    >>> t2 = tf.ragged.constant([[6], [7, 8, 9]])
+    >>> ragged.stack([t1, t2], axis=0)
+    [[[1, 2], [3, 4, 5]], [[6], [7, 9, 0]]]
+    >>> ragged.stack([t1, t2], axis=1)
+    [[[1, 2], [6]], [[3, 4, 5], [7, 8, 9]]]
+    ```
+  """
+  if not isinstance(values, (list, tuple)):
+    values = [values]
+  with ops.name_scope(name, 'RaggedConcat', values):
+    return _ragged_stack_concat_helper(values, axis, stack_values=True)
+
+
+def _ragged_stack_concat_helper(rt_inputs, axis, stack_values):
+  """Helper function to concatenate or stack ragged tensors.
+
+  Args:
+    rt_inputs: A list of RaggedTensors or Tensors to combine.
+    axis: The axis along which to concatenate or stack.
+    stack_values: A boolean -- if true, then stack values; otherwise,
+      concatenate them.
+
+  Returns:
+    A RaggedTensor.
+  Raises:
+    ValueError: If rt_inputs is empty, or if axis is out of range.
+  """
+  # Validate parameters.
+  if not rt_inputs:
+    raise ValueError('rt_inputs may not be empty.')
+
+  # Convert input tensors.
+  rt_inputs = [
+      ragged_tensor.convert_to_tensor_or_ragged_tensor(
+          rt_input, name='rt_input') for rt_input in rt_inputs
+  ]
+
+  # Special case: if there's only one input, then return it as-is.
+  if len(rt_inputs) == 1:
+    if stack_values:
+      return ragged_array_ops.expand_dims(rt_inputs[0], axis=axis)
+    else:
+      return rt_inputs[0]
+
+  # Check the rank (number of dimensions) of the input tensors.
+  ndims = None
+  for rt in rt_inputs:
+    if ndims is None:
+      ndims = rt.shape.ndims
+    else:
+      rt.shape.assert_has_rank(ndims)
+
+  out_ndims = ndims if (ndims is None or not stack_values) else ndims + 1
+  axis = ragged_util.get_positive_axis(axis, out_ndims)
+
+  # If all the inputs are Tensors, and we're combining the final dimension,
+  # then we can delegate to the tf.stack/tf.concat operation, and return a
+  # Tensor.
+  if all(not ragged_tensor.is_ragged(rt) for rt in rt_inputs):
+    if ndims is not None and (axis == out_ndims - 1 or axis == ndims - 1):
+      if stack_values:
+        return array_ops.stack(rt_inputs, axis)
+      else:
+        return array_ops.concat(rt_inputs, axis)
+
+  # Convert any Tensor inputs to RaggedTensors.  This makes it
+  # possible to concatenate Tensors and RaggedTensors together.
+  for i in range(len(rt_inputs)):
+    if not ragged_tensor.is_ragged(rt_inputs[i]):
+      rt_inputs[i] = ragged_conversion_ops.from_tensor(
+          rt_inputs[i], ragged_rank=1)
+
+  # Convert the input tensors to all have the same ragged_rank.
+  ragged_rank = max(max(rt.ragged_rank for rt in rt_inputs), 1)
+  rt_inputs = [_increase_ragged_rank_to(rt, ragged_rank) for rt in rt_inputs]
+
+  if axis == 0:
+    return _ragged_stack_concat_axis_0(rt_inputs, stack_values)
+  elif axis == 1:
+    return _ragged_stack_concat_axis_1(rt_inputs, stack_values)
+  else:  # axis > 1: recurse.
+    values = [rt.values for rt in rt_inputs]
+    splits = [[rt_input.row_splits] for rt_input in rt_inputs]
+    with ops.control_dependencies(ragged_util.assert_splits_match(splits)):
+      return ragged_tensor.RaggedTensor.from_row_splits(
+          _ragged_stack_concat_helper(values, axis - 1, stack_values),
+          splits[0][0])
+
+
+def _ragged_stack_concat_axis_0(rt_inputs, stack_values):
+  """Helper function to concatenate or stack ragged tensors along axis 0.
+
+  Args:
+    rt_inputs: A list of RaggedTensors, all with the same rank and ragged_rank.
+    stack_values: Boolean.  If true, then stack values; otherwise, concatenate
+      them.
+
+  Returns:
+    A RaggedTensor.
+  """
+  # Concatenate the inner values together.
+  flat_values = [rt.flat_values for rt in rt_inputs]
+  concatenated_flat_values = array_ops.concat(flat_values, axis=0)
+
+  # Concatenate the splits together for each ragged dimension (adjusting
+  # split offsets as necessary).
+  nested_splits = [rt.nested_row_splits for rt in rt_inputs]
+  ragged_rank = rt_inputs[0].ragged_rank
+  concatenated_nested_splits = [
+      _concat_ragged_splits([ns[dim]
+                             for ns in nested_splits])
+      for dim in range(ragged_rank)
+  ]
+
+  # If we are performing a stack operation, then add another splits.
+  if stack_values:
+    stack_lengths = array_ops.stack([rt.nrows() for rt in rt_inputs])
+    stack_splits = ragged_util.lengths_to_splits(stack_lengths)
+    concatenated_nested_splits.insert(0, stack_splits)
+
+  return ragged_tensor.RaggedTensor.from_nested_row_splits(
+      concatenated_flat_values, concatenated_nested_splits)
+
+
+def _ragged_stack_concat_axis_1(rt_inputs, stack_values):
+  """Helper function to concatenate or stack ragged tensors along axis 1.
+
+  Args:
+    rt_inputs: A list of RaggedTensors, all with the same rank and ragged_rank.
+    stack_values: Boolean.  If true, then stack values; otherwise, concatenate
+      them.
+
+  Returns:
+    A RaggedTensor.
+  """
+  num_inputs = len(rt_inputs)
+
+  rt_nrows = rt_inputs[0].nrows()
+  nrows_msg = 'Input tensors have incompatible shapes.'
+  nrows_checks = [
+      check_ops.assert_equal(rt.nrows(), rt_nrows, message=nrows_msg)
+      for rt in rt_inputs[1:]
+  ]
+
+  with ops.control_dependencies(nrows_checks):
+    # Concatentate the inputs together to put them in a single ragged tensor.
+    concatenated_rt = _ragged_stack_concat_axis_0(rt_inputs, stack_values=False)
+
+    # Use ragged.gather to permute the rows of concatenated_rt.  In particular,
+    #   permuted_rt = [rt_inputs[0][0], ..., rt_inputs[N][0],
+    #                  rt_inputs[0][1], ..., rt_inputs[N][1],
+    #                      ...,
+    #                  rt_inputs[0][M], ..., rt_input[N][M]]
+    # where `N=num_inputs-1` and `M=rt_nrows-1`.
+    row_indices = math_ops.range(rt_nrows * num_inputs)
+    row_index_matrix = array_ops.reshape(row_indices, [num_inputs, -1])
+    transposed_row_index_matrix = array_ops.transpose(row_index_matrix)
+    row_permutation = array_ops.reshape(transposed_row_index_matrix, [-1])
+    permuted_rt = ragged_gather_ops.gather(concatenated_rt, row_permutation)
+
+    if stack_values:
+      # Add a new splits tensor to group together the values.
+      stack_splits = math_ops.range(0, rt_nrows * num_inputs + 1, num_inputs)
+      _copy_row_shape(rt_inputs, stack_splits)
+      return ragged_tensor.RaggedTensor.from_row_splits(permuted_rt,
+                                                        stack_splits)
+    else:
+      # Merge together adjacent rows by dropping the row-split indices that
+      # separate them.
+      concat_splits = permuted_rt.row_splits[::num_inputs]
+      _copy_row_shape(rt_inputs, concat_splits)
+      return ragged_tensor.RaggedTensor.from_row_splits(permuted_rt.values,
+                                                        concat_splits)
+
+
+def _copy_row_shape(rt_inputs, splits):
+  """Sets splits.shape to [rt[shape[0]+1] for each rt in rt_inputs."""
+  for rt in rt_inputs:
+    if rt.shape[0] is not None:
+      splits.set_shape(tensor_shape.TensorShape(rt.shape[0] + 1))
+
+
+def _increase_ragged_rank_to(rt_input, ragged_rank):
+  """Adds ragged dimensions to `rt_input` so it has the desired ragged rank."""
+  if ragged_rank > 0:
+    if not ragged_tensor.is_ragged(rt_input):
+      rt_input = ragged_conversion_ops.from_tensor(rt_input)
+    if rt_input.ragged_rank < ragged_rank:
+      rt_input = rt_input.with_values(
+          _increase_ragged_rank_to(rt_input.values, ragged_rank - 1))
+  return rt_input
+
+
+def _concat_ragged_splits(splits_list):
+  """Concatenates a list of RaggedTensor splits to form a single splits."""
+  pieces = [splits_list[0]]
+  splits_offset = splits_list[0][-1]
+  for splits in splits_list[1:]:
+    pieces.append(splits[1:] + splits_offset)
+    splits_offset += splits[-1]
+  return array_ops.concat(pieces, axis=0)
diff --git a/tensorflow/python/ops/ragged/ragged_const_op_test.py b/tensorflow/python/ops/ragged/ragged_const_op_test.py
index c014f7103016104d3cc2e3ecbd18bbf3337a0153..29a9bdf53db650ef3a075d564e056751f1f018bb 100644
--- a/tensorflow/python/ops/ragged/ragged_const_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_const_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for ragged.constant."""
+"""Tests for ragged_factory_ops.constant."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -23,6 +23,8 @@ from absl.testing import parameterized
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
@@ -157,7 +159,7 @@ class RaggedConstOpTest(ragged_test_util.RaggedTensorTestCase,
       expected_dtype: The expected dtype for the resulting ragged tensor (used
         to test default/inferred types when dtype=None).
     """
-    rt = ragged.constant(
+    rt = ragged_factory_ops.constant(
         pylist, dtype=dtype, ragged_rank=ragged_rank, inner_shape=inner_shape)
 
     # If dtype was explicitly specified, check it.
@@ -168,14 +170,14 @@ class RaggedConstOpTest(ragged_test_util.RaggedTensorTestCase,
 
     # If ragged_rank was explicitly specified, check it.
     if ragged_rank is not None:
-      if isinstance(rt, ragged.RaggedTensor):
+      if isinstance(rt, ragged_tensor.RaggedTensor):
         self.assertEqual(rt.ragged_rank, ragged_rank)
       else:
         self.assertEqual(0, ragged_rank)
 
     # If inner_shape was explicitly specified, check it.
     if inner_shape is not None:
-      if isinstance(rt, ragged.RaggedTensor):
+      if isinstance(rt, ragged_tensor.RaggedTensor):
         self.assertEqual(rt.flat_values.shape.as_list()[1:], list(inner_shape))
       else:
         self.assertEqual(rt.shape.as_list(), list(inner_shape))
@@ -257,7 +259,7 @@ class RaggedConstOpTest(ragged_test_util.RaggedTensorTestCase,
     self.assertRaisesRegexp(
         exception,
         message,
-        ragged.constant,
+        ragged_factory_ops.constant,
         pylist,
         dtype=dtype,
         ragged_rank=ragged_rank,
@@ -294,12 +296,12 @@ class RaggedConstOpTest(ragged_test_util.RaggedTensorTestCase,
                                   message=None):
     """Tests for the _find_scalar_and_max_depth helper function."""
     if exception is not None:
-      self.assertRaisesRegexp(
-          exception, message,
-          ragged.ragged_factory_ops._find_scalar_and_max_depth, pylist)
+      self.assertRaisesRegexp(exception, message,
+                              ragged_factory_ops._find_scalar_and_max_depth,
+                              pylist)
     else:
       self.assertEqual(
-          ragged.ragged_factory_ops._find_scalar_and_max_depth(pylist),
+          ragged_factory_ops._find_scalar_and_max_depth(pylist),
           (scalar_depth, max_depth))
 
   @parameterized.parameters([
diff --git a/tensorflow/python/ops/ragged/ragged_constant_value_op_test.py b/tensorflow/python/ops/ragged/ragged_constant_value_op_test.py
index 56768a9a479d0d3b568f4ff4b7f102837e26171d..7f474594b415cfd3e3b3e2b03df3bb84225cbdf2 100644
--- a/tensorflow/python/ops/ragged/ragged_constant_value_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_constant_value_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for ragged.constant_value."""
+"""Tests for ragged_factory_ops.constant_value."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -22,7 +22,8 @@ from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_tensor_value
 from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
@@ -147,7 +148,7 @@ class RaggedConstantValueOpTest(ragged_test_util.RaggedTensorTestCase,
                        expected_shape=None,
                        expected_dtype=None):
     """Tests that `ragged_value(pylist).to_list() == pylist`."""
-    rt = ragged.constant_value(
+    rt = ragged_factory_ops.constant_value(
         pylist, dtype=dtype, ragged_rank=ragged_rank, inner_shape=inner_shape)
 
     # If dtype was explicitly specified, check it.
@@ -158,14 +159,14 @@ class RaggedConstantValueOpTest(ragged_test_util.RaggedTensorTestCase,
 
     # If ragged_rank was explicitly specified, check it.
     if ragged_rank is not None:
-      if isinstance(rt, ragged.RaggedTensorValue):
+      if isinstance(rt, ragged_tensor_value.RaggedTensorValue):
         self.assertEqual(rt.ragged_rank, ragged_rank)
       else:
         self.assertEqual(0, ragged_rank)
 
     # If inner_shape was explicitly specified, check it.
     if inner_shape is not None:
-      if isinstance(rt, ragged.RaggedTensorValue):
+      if isinstance(rt, ragged_tensor_value.RaggedTensorValue):
         self.assertEqual(rt.flat_values.shape[1:], inner_shape)
       else:
         self.assertEqual(rt.shape, inner_shape)
@@ -174,7 +175,7 @@ class RaggedConstantValueOpTest(ragged_test_util.RaggedTensorTestCase,
       self.assertEqual(tuple(rt.shape), expected_shape)
 
     if rt.shape:
-      if isinstance(rt, ragged.RaggedTensorValue):
+      if isinstance(rt, ragged_tensor_value.RaggedTensorValue):
         self.assertEqual(rt.to_list(), pylist)
       else:
         self.assertEqual(rt.tolist(), pylist)
@@ -257,11 +258,11 @@ class RaggedConstantValueOpTest(ragged_test_util.RaggedTensorTestCase,
                             inner_shape=None,
                             exception=None,
                             message=None):
-    """Tests that `ragged.constant_value()` raises an expected exception."""
+    """Tests that `constant_value()` raises an expected exception."""
     self.assertRaisesRegexp(
         exception,
         message,
-        ragged.constant_value,
+        ragged_factory_ops.constant_value,
         pylist,
         dtype=dtype,
         ragged_rank=ragged_rank,
diff --git a/tensorflow/python/ops/ragged/ragged_dispatch.py b/tensorflow/python/ops/ragged/ragged_dispatch.py
index 7c74f7be62de0746418f57b2b2c06c31f2a5a4f5..3bda777482bf0965939c0a6a6d1a82c95d669aaf 100644
--- a/tensorflow/python/ops/ragged/ragged_dispatch.py
+++ b/tensorflow/python/ops/ragged/ragged_dispatch.py
@@ -21,19 +21,25 @@ from __future__ import print_function
 import collections
 import numpy as np
 
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import gen_bitwise_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_batch_gather_ops
+from tensorflow.python.ops.ragged import ragged_concat_ops
+from tensorflow.python.ops.ragged import ragged_gather_ops
 from tensorflow.python.ops.ragged import ragged_math_ops
 from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.ops.ragged import ragged_tensor_shape
 from tensorflow.python.ops.ragged import ragged_util
+from tensorflow.python.ops.ragged import ragged_where_op
 from tensorflow.python.util import dispatch
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_export
@@ -76,6 +82,8 @@ def _get_arg_infos(func, arg_names):
 
 def _is_convertible_to_tensor(value):
   """Returns true if `value` is convertible to a `Tensor`."""
+  if value is None:
+    return True
   if isinstance(value,
                 (ops.Tensor, variables.Variable, np.ndarray, int, float, str)):
     return True
@@ -280,6 +288,7 @@ _UNARY_ELEMENTWISE_OPS = [
     array_ops.zeros_like,
     array_ops.zeros_like_v2,
     clip_ops.clip_by_value,
+    gen_bitwise_ops.invert,
     math_ops.abs,
     math_ops.acos,
     math_ops.acosh,
@@ -346,6 +355,11 @@ _UNARY_LIST_ELEMENTWISE_OPS = [
 ]
 
 _BINARY_ELEMENTWISE_OPS = [
+    gen_bitwise_ops.bitwise_and,
+    gen_bitwise_ops.bitwise_or,
+    gen_bitwise_ops.bitwise_xor,
+    gen_bitwise_ops.left_shift,
+    gen_bitwise_ops.right_shift,
     math_ops.add,
     math_ops.atan2,
     math_ops.complex,
@@ -374,17 +388,58 @@ _BINARY_ELEMENTWISE_OPS = [
     math_ops.truncatemod,
 ]
 
+
+# We don't need to register a separate delegation handler for these v1 ops,
+# since they delegate to the v2 ops (which already have a handler).  But we
+# still want to include them in the ragged_op_list() output.
+_V1_OPS_THAT_DELEGATE_TO_V2_OPS = [
+    math_ops.reduce_sum,
+    math_ops.reduce_prod,
+    math_ops.reduce_min,
+    math_ops.reduce_max,
+    math_ops.reduce_mean,
+    math_ops.reduce_any,
+    math_ops.reduce_all,
+]
+
+
+def _ragged_gather_v1(params, indices, validate_indices=None, name=None,
+                      axis=0, batch_dims=0):
+  return ragged_gather_ops.gather(
+      params=params,
+      indices=indices,
+      validate_indices=validate_indices,
+      axis=axis,
+      batch_dims=batch_dims,
+      name=name)
+
+
+def _ragged_expand_dims_v1(input, axis=None, name=None, dim=None):  # pylint: disable=redefined-builtin
+  if dim is not None:
+    axis = dim
+  return ragged_array_ops.expand_dims(input=input, axis=axis, name=name)
+
+
+def _ragged_size_v1(input, name=None, out_type=dtypes.int32):  # pylint: disable=redefined-builtin
+  return ragged_array_ops.size(input=input, out_type=out_type, name=name)
+
+
 # (original_op, ragged_op, ragged_args)
 _RAGGED_DISPATCH_OPS = [
-    (array_ops.batch_gather, ragged_array_ops.batch_gather,
+    (array_ops.batch_gather, ragged_batch_gather_ops.batch_gather,
      ['params', 'indices']),
-    (array_ops.concat, ragged_array_ops.concat, ['values']),
+    (array_ops.concat, ragged_concat_ops.concat, ['[values]']),
+    (array_ops.expand_dims, _ragged_expand_dims_v1, ['input']),
     (array_ops.expand_dims_v2, ragged_array_ops.expand_dims, ['input']),
-    (array_ops.gather_v2, ragged_array_ops.gather, ['params', 'indices']),
-    (array_ops.gather_nd, ragged_array_ops.gather_nd, ['params', 'indices']),
-    (array_ops.stack, ragged_array_ops.stack, ['values']),
+    (array_ops.gather, _ragged_gather_v1, ['params', 'indices']),
+    (array_ops.gather_v2, ragged_gather_ops.gather, ['params', 'indices']),
+    (array_ops.gather_nd, ragged_gather_ops.gather_nd, ['params', 'indices']),
+    (array_ops.rank, ragged_array_ops.rank, ['input']),
+    (array_ops.size, _ragged_size_v1, ['input']),
+    (array_ops.size_v2, ragged_array_ops.size, ['input']),
+    (array_ops.stack, ragged_concat_ops.stack, ['[values]']),
     (array_ops.tile, ragged_array_ops.tile, ['input']),
-    (array_ops.where, ragged_array_ops.where, ['condition', 'x', 'y']),
+    (array_ops.where, ragged_where_op.where, ['condition', 'x', 'y']),
     (math_ops.unsorted_segment_sum, ragged_math_ops.segment_sum,
      ['data', 'segment_ids']),
     (math_ops.unsorted_segment_prod, ragged_math_ops.segment_prod,
@@ -415,7 +470,8 @@ def register_dispatchers():
       _BINARY_ELEMENTWISE_OPS + [x[0] for x in _RAGGED_DISPATCH_OPS])
   for op in op_list:
     _, undecorated_op = tf_decorator.unwrap(op)
-    if not hasattr(undecorated_op, tf_export.API_ATTRS['tensorflow'].names):
+    if not hasattr(undecorated_op,
+                   tf_export.API_ATTRS[tf_export.TENSORFLOW_API_NAME].names):
       raise AssertionError('Expected %s to be an exported symbol '
                            '(while adding a RaggedTensor dispatcher)')
 
@@ -431,10 +487,57 @@ def register_dispatchers():
   for (original_op, ragged_op, args) in _RAGGED_DISPATCH_OPS:
     RaggedDispatcher(original_op, ragged_op, args).register(original_op)
 
-  docstring = (
-      '\n\n### Additional ops that support `RaggedTensor`\n\n' + '\n'.join([
-          '* `tf.%s`' % tf_export.get_canonical_name_for_symbol(op)
-          for op in op_list
-      ]))
 
-  return docstring
+def _ragged_op_signature(op, ragged_args):
+  """Returns a signature for the given op, marking ragged args in bold."""
+  op_name = tf_export.get_canonical_name_for_symbol(op)
+  argspec = tf_inspect.getfullargspec(op)
+  arg_names = argspec.args
+
+  # Mark ragged arguments in bold.
+  for pos in ragged_args:
+    arg_names[pos] = '**' + arg_names[pos] + '**'
+
+  # Add argument defaults.
+  for pos in range(-1, -len(argspec.defaults) - 1, -1):
+    arg_names[pos] += '=`{!r}`'.format(argspec.defaults[pos])
+
+  # Add varargs and keyword args
+  if argspec.varargs:
+    arg_names.append('*' + argspec.varargs)
+  if argspec.varkw:
+    arg_names.append('**' + argspec.varkw)
+
+  return '* `tf.{}`({})'.format(op_name, ', '.join(arg_names))
+
+
+def _op_is_in_tf_version(op, version):
+  if version == 1:
+    return (tf_export.get_v1_names(tf_decorator.unwrap(op)[1]) or
+            op in _V1_OPS_THAT_DELEGATE_TO_V2_OPS)
+  elif version == 2:
+    return tf_export.get_v2_names(tf_decorator.unwrap(op)[1])
+  else:
+    raise ValueError('Expected version 1 or 2.')
+
+
+def ragged_op_list(tf_version=1):
+  """Returns a string listing operators that have dispathers registered."""
+  lines = []
+  for op in _UNARY_ELEMENTWISE_OPS + _UNARY_LIST_ELEMENTWISE_OPS:
+    if _op_is_in_tf_version(op, tf_version):
+      lines.append(_ragged_op_signature(op, [0]))
+  for op in _BINARY_ELEMENTWISE_OPS:
+    if _op_is_in_tf_version(op, tf_version):
+      lines.append(_ragged_op_signature(op, [0, 1]))
+  for op, _, ragged_args in _RAGGED_DISPATCH_OPS:
+    if _op_is_in_tf_version(op, tf_version):
+      arginfos = _get_arg_infos(op, ragged_args)
+      ragged_args = [arginfo.position for arginfo in arginfos]
+      lines.append(_ragged_op_signature(op, ragged_args))
+  return ('\n\n### Additional ops that support `RaggedTensor`\n\n'
+          'Arguments that accept `RaggedTensor`s are marked in **bold**.\n\n' +
+          '\n'.join(sorted(lines)) + 'n')
+
+
+register_dispatchers()
diff --git a/tensorflow/python/ops/ragged/ragged_dispatch_test.py b/tensorflow/python/ops/ragged/ragged_dispatch_test.py
index 82827aa2aafe22e7d6c61977ca6321cb69bd0db5..04ef0d7cd68a8f4e09d424584885c342d23a564c 100644
--- a/tensorflow/python/ops/ragged/ragged_dispatch_test.py
+++ b/tensorflow/python/ops/ragged/ragged_dispatch_test.py
@@ -29,10 +29,12 @@ from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import gen_bitwise_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import parsing_ops
-from tensorflow.python.ops import ragged
 from tensorflow.python.ops import string_ops
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
@@ -120,9 +122,15 @@ BINARY_BOOL_OPS = [
     math_ops.logical_xor,
 ]
 UNARY_INT_OPS = [
+    gen_bitwise_ops.invert,
     string_ops.unicode_script,
 ]
 BINARY_INT_OPS = [
+    gen_bitwise_ops.bitwise_and,
+    gen_bitwise_ops.bitwise_or,
+    gen_bitwise_ops.bitwise_xor,
+    gen_bitwise_ops.left_shift,
+    gen_bitwise_ops.right_shift,
     math_ops.truncatediv,
     math_ops.truncatemod,
 ]
@@ -134,8 +142,8 @@ class RaggedElementwiseOpsTest(ragged_test_util.RaggedTensorTestCase,
 
   def assertSameShape(self, x, y):
     """Checks that x and y have the same shape (including ragged shapes)."""
-    if isinstance(x, ragged.RaggedTensor):
-      self.assertIsInstance(y, ragged.RaggedTensor)
+    if isinstance(x, ragged_tensor.RaggedTensor):
+      self.assertIsInstance(y, ragged_tensor.RaggedTensor)
       self.assertEqual(x.ragged_rank, y.ragged_rank)
       for (x_splits, y_splits) in zip(x.nested_row_splits, y.nested_row_splits):
         self.assertAllEqual(x_splits, y_splits)
@@ -156,75 +164,85 @@ class RaggedElementwiseOpsTest(ragged_test_util.RaggedTensorTestCase,
           {'x': [1, -2, 3]},
           # 2-dimensional input
           {'x': [[-2, 3], [-3, 4]]},
-          {'x': ragged.constant_value([[-2, 3], [-3]], ragged_rank=1)},
+          {'x': ragged_factory_ops.constant_value(
+              [[-2, 3], [-3]], ragged_rank=1)},
           # 3-dimensional inputs
           {'x': [[[-2, 3], [3, 4]], [[7, 6], [5, 4]]]},
-          {'x': ragged.constant_value([[[-2, 3], [3, 4]], [[7, 6]]],
-                                      ragged_rank=1)},
-          {'x': ragged.constant_value([[[-2, 3, 4], []], [[7, 6]], []],
-                                      ragged_rank=2)},
+          {'x': ragged_factory_ops.constant_value(
+              [[[-2, 3], [3, 4]], [[7, 6]]],
+              ragged_rank=1)},
+          {'x': ragged_factory_ops.constant_value(
+              [[[-2, 3, 4], []], [[7, 6]], []],
+              ragged_rank=2)},
           ] +
       #=========================================================================
       # Test each unary op.
       #=========================================================================
-      [{'x': ragged.constant_value([[-2.0, 3.0], [-3.0]]), 'op': op}
+      [{'x': ragged_factory_ops.constant_value([[-2.0, 3.0], [-3.0]]), 'op': op}
        for op in UNARY_FLOAT_OPS] +
-      [{'x': ragged.constant_value([[True, False], [True]]), 'op': op}
+      [{'x': ragged_factory_ops.constant_value([[True, False], [True]]),
+        'op': op}
        for op in UNARY_BOOL_OPS] +
-      [{'x': ragged.constant_value([[18, 512], [12412]], np.int32), 'op': op}
+      [{'x': ragged_factory_ops.constant_value([[18, 512], [12412]], np.int32),
+        'op': op}
        for op in UNARY_INT_OPS] +
-      [{'x': ragged.constant_value([['abcd', 'efgh'], ['aabbccdd']]), 'op': op}
+      [{'x': ragged_factory_ops.constant_value([['abcd', 'efgh'],
+                                                ['aabbccdd']]),
+        'op': op}
        for op in UNARY_STRING_OPS] +
       [
           {'op': clip_ops.clip_by_value,
-           'x': ragged.constant_value([[-2.0, 3.0], [-3.0]]),
+           'x': ragged_factory_ops.constant_value([[-2.0, 3.0], [-3.0]]),
            'clip_value_min': 0.1, 'clip_value_max': 4.0},
           {'op': math_ops.cast,
-           'x': ragged.constant_value([[-2.0, 3.0], [-3.0]]),
+           'x': ragged_factory_ops.constant_value([[-2.0, 3.0], [-3.0]]),
            'dtype': dtypes.int32},
           {'op': math_ops.saturate_cast,
-           'x': ragged.constant_value([[-2.0, 3.0], [-3.0]]),
+           'x': ragged_factory_ops.constant_value([[-2.0, 3.0], [-3.0]]),
            'dtype': dtypes.int32},
           {'op': string_ops.string_to_hash_bucket,
-           'x': ragged.constant_value([['abcd', 'efgh'], ['aabbccdd']]),
+           'x': ragged_factory_ops.constant_value(
+               [['abcd', 'efgh'], ['aabbccdd']]),
            'num_buckets': 1000},
           {'op': string_ops.string_to_hash_bucket_fast,
-           'x': ragged.constant_value([['abcd', 'efgh'], ['aabbccdd']]),
+           'x': ragged_factory_ops.constant_value(
+               [['abcd', 'efgh'], ['aabbccdd']]),
            'num_buckets': 1000},
           {'op': string_ops.string_to_hash_bucket_strong,
-           'x': ragged.constant_value([['abcd', 'efgh'], ['aabbccdd']]),
+           'x': ragged_factory_ops.constant_value(
+               [['abcd', 'efgh'], ['aabbccdd']]),
            'num_buckets': 1000,
            'key': [1231, 12512]},
           {'op': string_ops.string_to_number,
-           'x': ragged.constant_value([['-2.0', '3.0'], ['-3.0']])},
+           'x': ragged_factory_ops.constant_value([['-2.0', '3.0'], ['-3.0']])},
           {'op': string_ops.regex_full_match,
-           'x': ragged.constant_value([['hello', '123'], ['1+1']]),
+           'x': ragged_factory_ops.constant_value([['hello', '123'], ['1+1']]),
            'pattern': r'\w+'},
           {'op': string_ops.regex_replace,
-           'x': ragged.constant_value([['hello', '123'], ['1+1']]),
+           'x': ragged_factory_ops.constant_value([['hello', '123'], ['1+1']]),
            'pattern': r'\d',
            'rewrite': '#'},
           {'op': string_ops.substr,
-           'x': ragged.constant_value([['hello', '123'], ['1+1']]),
+           'x': ragged_factory_ops.constant_value([['hello', '123'], ['1+1']]),
            'pos': 2, 'len': 3},
           {'op': array_ops.check_numerics,
-           'x': ragged.constant_value([[-2.0, 3.0], [-3.0]]),
+           'x': ragged_factory_ops.constant_value([[-2.0, 3.0], [-3.0]]),
            'message': 'check-numerics'},
       ]
       )  # pyformat: disable
   def testUnaryElementwiseOp(self, x, op=math_ops.abs, **extra_args):
-    x = ragged.convert_to_tensor_or_ragged_tensor(x)
+    x = ragged_tensor.convert_to_tensor_or_ragged_tensor(x)
     result = op(x, **extra_args)
 
     # Run the wrapped op on the dense values, for comparison.
-    dense_x = x.flat_values if isinstance(x, ragged.RaggedTensor) else x
+    dense_x = x.flat_values if isinstance(x, ragged_tensor.RaggedTensor) else x
     expected_flat_values = array_ops.reshape(op(dense_x, **extra_args), [-1])
 
     # Check that the result has the expected shape.
     self.assertSameShape(x, result)
 
     # Check that the result has the expected (flattened) values.
-    if isinstance(result, ragged.RaggedTensor):
+    if isinstance(result, ragged_tensor.RaggedTensor):
       result_flat_values = array_ops.reshape(result.flat_values, [-1])
     else:
       result_flat_values = array_ops.reshape(result, [-1])
@@ -245,19 +263,23 @@ class RaggedElementwiseOpsTest(ragged_test_util.RaggedTensorTestCase,
           {'x': [[-2, 3], [-3, -4]],
            'y': [[1, 2], [3, 4]]},
           # Shapes: x:(2, None), y:(2, None)
-          {'x': ragged.constant_value([[-2, 3], [-3]]),
-           'y': ragged.constant_value([[5, 6], [7]])},
+          {'x': ragged_factory_ops.constant_value([[-2, 3], [-3]]),
+           'y': ragged_factory_ops.constant_value([[5, 6], [7]])},
           # Shapes: x:(2, 2, 2), y:(2, 2, 2)
           {'x': [[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
            'y': [[[9, 3], [3, 4]], [[5, 2], [7, 6]]]},
           # Shapes: x:(2, None, None), y: (2, None, None)
-          {'x': ragged.constant_value([[[1, 2], [3], [4]], [[], [5, 7, 8]]]),
-           'y': ragged.constant_value([[[3, 8], [2], [5]], [[], [1, 9, 8]]])},
+          {'x': ragged_factory_ops.constant_value(
+              [[[1, 2], [3], [4]], [[], [5, 7, 8]]]),
+           'y': ragged_factory_ops.constant_value(
+               [[[3, 8], [2], [5]], [[], [1, 9, 8]]])},
           # Shapes: x:(2, None, 2), y: (2, None, 2)
-          {'x': ragged.constant_value([[[1, 2]], [[3, 4], [5, 6], [7, 8]]],
-                                      ragged_rank=1),
-           'y': ragged.constant_value([[[9, 3]], [[5, 2], [3, 4], [7, 6]]],
-                                      ragged_rank=1)},
+          {'x': ragged_factory_ops.constant_value(
+              [[[1, 2]], [[3, 4], [5, 6], [7, 8]]],
+              ragged_rank=1),
+           'y': ragged_factory_ops.constant_value(
+               [[[9, 3]], [[5, 2], [3, 4], [7, 6]]],
+               ragged_rank=1)},
 
           #=====================================================================
           # With broadcasting
@@ -279,47 +301,54 @@ class RaggedElementwiseOpsTest(ragged_test_util.RaggedTensorTestCase,
            'y': [[1, 2], [3, 4]]},
           # Shapes: x:(), y:(2, None)
           {'x': 10,                                 # Broadcast () -> (2, None)
-           'y': ragged.constant_value([[1, 2], [3]], dtype=np.int32)},
+           'y': ragged_factory_ops.constant_value(
+               [[1, 2], [3]], dtype=np.int32)},
           # TODO(edloper): Add tests for more advanced broadcasting, once we add
           # support for it.
 
           #=====================================================================
           # Keyword Args
           #=====================================================================
-          {'x': ragged.constant_value([[[1, 2], [3], [4]], [[], [5, 7, 8]]]),
-           'y': ragged.constant_value([[[3, 8], [2], [5]], [[], [1, 9, 8]]]),
+          {'x': ragged_factory_ops.constant_value(
+              [[[1, 2], [3], [4]], [[], [5, 7, 8]]]),
+           'y': ragged_factory_ops.constant_value(
+               [[[3, 8], [2], [5]], [[], [1, 9, 8]]]),
            'use_kwargs': ('x', 'y')},
-          {'x': ragged.constant_value([[[1, 2]], [[3, 4], [5, 6], [7, 8]]],
-                                      ragged_rank=1),
-           'y': ragged.constant_value([[[9, 3]], [[5, 2], [3, 4], [7, 6]]],
-                                      ragged_rank=1),
+          {'x': ragged_factory_ops.constant_value(
+              [[[1, 2]], [[3, 4], [5, 6], [7, 8]]],
+              ragged_rank=1),
+           'y': ragged_factory_ops.constant_value(
+               [[[9, 3]], [[5, 2], [3, 4], [7, 6]]],
+               ragged_rank=1),
            'use_kwargs': ('x', 'y')},
-          {'x': ragged.constant_value([[[1, 2]], [[3, 4], [5, 6], [7, 8]]],
-                                      ragged_rank=1),
-           'y': ragged.constant_value([[[9, 3]], [[5, 2], [3, 4], [7, 6]]],
-                                      ragged_rank=1),
+          {'x': ragged_factory_ops.constant_value(
+              [[[1, 2]], [[3, 4], [5, 6], [7, 8]]],
+              ragged_rank=1),
+           'y': ragged_factory_ops.constant_value(
+               [[[9, 3]], [[5, 2], [3, 4], [7, 6]]],
+               ragged_rank=1),
            'use_kwargs': ('x',)},
       ] +
       #=========================================================================
       # Test each unary op.
       #=========================================================================
-      [{'x': ragged.constant_value([[-2.0, 3.0], [-3.0]]),
-        'y': ragged.constant_value([[5.0, 1.0], [12.0]]),
+      [{'x': ragged_factory_ops.constant_value([[-2.0, 3.0], [-3.0]]),
+        'y': ragged_factory_ops.constant_value([[5.0, 1.0], [12.0]]),
         'op': op}
        for op in BINARY_FLOAT_OPS] +
-      [{'x': ragged.constant_value([[-2, 3], [-3]]),
-        'y': ragged.constant_value([[5, 1], [12]]),
+      [{'x': ragged_factory_ops.constant_value([[-2, 3], [-3]]),
+        'y': ragged_factory_ops.constant_value([[5, 1], [12]]),
         'op': op}
        for op in BINARY_INT_OPS] +
-      [{'x': ragged.constant_value([[True, True], [False]]),
-        'y': ragged.constant_value([[False, True], [False]]),
+      [{'x': ragged_factory_ops.constant_value([[True, True], [False]]),
+        'y': ragged_factory_ops.constant_value([[False, True], [False]]),
         'op': op}
        for op in BINARY_BOOL_OPS]
       )  # pyformat: disable
   def testBinaryElementwiseOp(self, x, y, op=math_ops.add, **extra_args):
     use_kwargs = extra_args.pop('use_kwargs', ())
-    x = ragged.convert_to_tensor_or_ragged_tensor(x)
-    y = ragged.convert_to_tensor_or_ragged_tensor(y)
+    x = ragged_tensor.convert_to_tensor_or_ragged_tensor(x)
+    y = ragged_tensor.convert_to_tensor_or_ragged_tensor(y)
     if 'x' in use_kwargs and 'y' in use_kwargs:
       result = op(x=x, y=y, **extra_args)
     elif 'y' in use_kwargs:
@@ -328,8 +357,8 @@ class RaggedElementwiseOpsTest(ragged_test_util.RaggedTensorTestCase,
       result = op(x, y, **extra_args)
 
     # Run the wrapped op on the dense values, for comparison.
-    dense_x = x.flat_values if isinstance(x, ragged.RaggedTensor) else x
-    dense_y = y.flat_values if isinstance(y, ragged.RaggedTensor) else y
+    dense_x = x.flat_values if isinstance(x, ragged_tensor.RaggedTensor) else x
+    dense_y = y.flat_values if isinstance(y, ragged_tensor.RaggedTensor) else y
     expected_flat_values = array_ops.reshape(
         op(dense_x, dense_y, **extra_args), [-1])
 
@@ -337,7 +366,7 @@ class RaggedElementwiseOpsTest(ragged_test_util.RaggedTensorTestCase,
     self.assertSameShape(y, result)
 
     # Check that the result has the expected (flattened) values.
-    if isinstance(result, ragged.RaggedTensor):
+    if isinstance(result, ragged_tensor.RaggedTensor):
       result_flat_values = array_ops.reshape(result.flat_values, [-1])
     else:
       result_flat_values = array_ops.reshape(result, [-1])
@@ -348,36 +377,44 @@ class RaggedElementwiseOpsTest(ragged_test_util.RaggedTensorTestCase,
           {'inputs': (12, 8, 3)},
           {'inputs': ([1, 2, 3], [7, 8, 9], [3, 6, 9])},
           {'inputs': ([[1, 2]], [[3, 4]], [[5, 6]])},
-          {'inputs': (ragged.constant_value([[1, 3], [-3]]),
-                      ragged.constant_value([[4, 7], [88]]),
-                      ragged.constant_value([[2, 9], [12]]))},
-          {'inputs': (ragged.constant_value([[[1, 3], [-3]], [[1]]]),
-                      ragged.constant_value([[[4, 7], [88]], [[2]]]),
-                      ragged.constant_value([[[2, 9], [12]], [[8]]]))},
-          {'inputs': (ragged.constant_value([[[1, 3], [3, 4]], [[1, 5]]],
-                                            ragged_rank=1),
-                      ragged.constant_value([[[4, 7], [1, 2]], [[2, 2]]],
-                                            ragged_rank=1),
-                      ragged.constant_value([[[2, 9], [5, 2]], [[8, 0]]],
-                                            ragged_rank=1))},
-          {'inputs': (ragged.constant_value([[[1, 3], [-3]], [[1]]]),
-                      ragged.constant_value([[[4, 7], [88]], [[2]]]),
-                      ragged.constant_value([[[2, 9], [12]], [[8]]])),
+          {'inputs': (ragged_factory_ops.constant_value([[1, 3], [-3]]),
+                      ragged_factory_ops.constant_value([[4, 7], [88]]),
+                      ragged_factory_ops.constant_value([[2, 9], [12]]))},
+          {'inputs': (ragged_factory_ops.constant_value(
+              [[[1, 3], [-3]], [[1]]]),
+                      ragged_factory_ops.constant_value(
+                          [[[4, 7], [88]], [[2]]]),
+                      ragged_factory_ops.constant_value(
+                          [[[2, 9], [12]], [[8]]]))},
+          {'inputs': (
+              ragged_factory_ops.constant_value([[[1, 3], [3, 4]], [[1, 5]]],
+                                                ragged_rank=1),
+              ragged_factory_ops.constant_value([[[4, 7], [1, 2]], [[2, 2]]],
+                                                ragged_rank=1),
+              ragged_factory_ops.constant_value([[[2, 9], [5, 2]], [[8, 0]]],
+                                                ragged_rank=1))},
+          {'inputs': (
+              ragged_factory_ops.constant_value([[[1, 3], [-3]], [[1]]]),
+              ragged_factory_ops.constant_value([[[4, 7], [88]], [[2]]]),
+              ragged_factory_ops.constant_value([[[2, 9], [12]], [[8]]])),
            'use_kwargs': True},
       ] + [
           {'op': math_ops.add_n,
-           'inputs': (ragged.constant_value([[1, 3], [-3]]),
-                      ragged.constant_value([[4, 7], [88]]),
-                      ragged.constant_value([[2, 9], [12]]))},
+           'inputs': (ragged_factory_ops.constant_value([[1, 3], [-3]]),
+                      ragged_factory_ops.constant_value([[4, 7], [88]]),
+                      ragged_factory_ops.constant_value([[2, 9], [12]]))},
           {'op': string_ops.string_join,
-           'inputs': (ragged.constant_value([['a', 'b'], ['c']]),
-                      ragged.constant_value([['foo', 'bar'], ['baz']]),
-                      ragged.constant_value([['2', '9'], ['12']]))},
+           'inputs': (
+               ragged_factory_ops.constant_value([['a', 'b'], ['c']]),
+               ragged_factory_ops.constant_value([['foo', 'bar'], ['baz']]),
+               ragged_factory_ops.constant_value([['2', '9'], ['12']]))},
       ])  # pyformat: disable
   def testListValuedElementwiseOp(self, inputs, op=math_ops.add_n,
                                   **extra_args):
     use_kwargs = extra_args.pop('use_kwargs', False)
-    inputs = [ragged.convert_to_tensor_or_ragged_tensor(x) for x in inputs]
+    inputs = [
+        ragged_tensor.convert_to_tensor_or_ragged_tensor(x) for x in inputs
+    ]
     if use_kwargs:
       result = op(inputs=inputs, **extra_args)
     else:
@@ -385,7 +422,7 @@ class RaggedElementwiseOpsTest(ragged_test_util.RaggedTensorTestCase,
 
     # Run the wrapped op on the dense values, for comparison.
     dense_inputs = [
-        x.flat_values if isinstance(x, ragged.RaggedTensor) else x
+        x.flat_values if isinstance(x, ragged_tensor.RaggedTensor) else x
         for x in inputs
     ]
     expected_flat_values = array_ops.reshape(
@@ -395,7 +432,7 @@ class RaggedElementwiseOpsTest(ragged_test_util.RaggedTensorTestCase,
     self.assertSameShape(inputs[0], result)
 
     # Check that the result has the expected (flattened) values.
-    if isinstance(result, ragged.RaggedTensor):
+    if isinstance(result, ragged_tensor.RaggedTensor):
       result_flat_values = array_ops.reshape(result.flat_values, [-1])
     else:
       result_flat_values = array_ops.reshape(result, [-1])
@@ -404,8 +441,8 @@ class RaggedElementwiseOpsTest(ragged_test_util.RaggedTensorTestCase,
   def testElementwiseOpUnknownRankError(self):
     if context.executing_eagerly():
       return
-    x = ragged.constant([[1, 2], [3]])
-    y = ragged.RaggedTensor.from_row_splits(
+    x = ragged_factory_ops.constant([[1, 2], [3]])
+    y = ragged_tensor.RaggedTensor.from_row_splits(
         array_ops.placeholder_with_default([1, 2, 3], shape=None), x.row_splits)
     with self.assertRaisesRegexp(ValueError,
                                  r'Unable to broadcast: unknown rank'):
@@ -413,32 +450,34 @@ class RaggedElementwiseOpsTest(ragged_test_util.RaggedTensorTestCase,
 
   @parameterized.parameters([
       dict(
-          x=ragged.constant_value([[1, 2], [3]]),
+          x=ragged_factory_ops.constant_value([[1, 2], [3]]),
           y=[[10]],
           expected=[[11, 12], [13]]),
       dict(
-          x=ragged.constant_value([[[1, 2], [3, 4]], [[5]]], ragged_rank=2),
-          y=ragged.constant_value([[[10], [20]], [[30]]], ragged_rank=1),
+          x=ragged_factory_ops.constant_value([[[1, 2], [3, 4]], [[5]]],
+                                              ragged_rank=2),
+          y=ragged_factory_ops.constant_value([[[10], [20]], [[30]]],
+                                              ragged_rank=1),
           expected=[[[11, 12], [23, 24]], [[35]]]),
       dict(
-          x=ragged.constant_value([[[1]]]),
-          y=ragged.constant_value([[1]]),
+          x=ragged_factory_ops.constant_value([[[1]]]),
+          y=ragged_factory_ops.constant_value([[1]]),
           expected=[[[2]]]),
   ])
   def testElementwiseOpBroadcast(self, x, y, expected):
-    x = ragged.convert_to_tensor_or_ragged_tensor(x, dtype=dtypes.int32)
-    y = ragged.convert_to_tensor_or_ragged_tensor(y, dtype=dtypes.int32)
+    x = ragged_tensor.convert_to_tensor_or_ragged_tensor(x, dtype=dtypes.int32)
+    y = ragged_tensor.convert_to_tensor_or_ragged_tensor(y, dtype=dtypes.int32)
     result = x + y
     self.assertRaggedEqual(result, expected)
 
   def testElementwiseOpShapeMismatch(self):
-    x = ragged.constant([[1, 2, 3], [4, 5]])
-    y = ragged.constant([[1, 2, 3], [4, 5, 6]])
+    x = ragged_factory_ops.constant([[1, 2, 3], [4, 5]])
+    y = ragged_factory_ops.constant([[1, 2, 3], [4, 5, 6]])
     with self.assertRaises(errors.InvalidArgumentError):
       self.evaluate(math_ops.add(x, y))
 
   def testBinaryOpSparseAndRagged(self):
-    x = ragged.constant([[1, 2, 3], [4, 5]])
+    x = ragged_factory_ops.constant([[1, 2, 3], [4, 5]])
     y = sparse_tensor.SparseTensor([[0, 0], [0, 1], [2, 0]], [1, 2, 3], [3, 2])
     with self.assertRaises((TypeError, ValueError)):
       self.evaluate(math_ops.add(x, y))
@@ -446,6 +485,222 @@ class RaggedElementwiseOpsTest(ragged_test_util.RaggedTensorTestCase,
     with self.assertRaises((TypeError, ValueError)):
       self.evaluate(math_ops.add_n([x, y]))
 
+  @parameterized.parameters([
+      dict(
+          op=array_ops.batch_gather,
+          args=(ragged_factory_ops.constant_value([[5, 6, 7], [8, 9]]),
+                ragged_factory_ops.constant_value([[2, 1, 0], [1]])),
+          expected=ragged_factory_ops.constant_value([[7, 6, 5], [9]])),
+      dict(
+          op=array_ops.concat,
+          args=([
+              ragged_factory_ops.constant_value([[1, 2, 3], [4]],
+                                                dtype=np.int32),
+              np.array([[5, 6]], dtype=np.int32)
+          ],),
+          kwargs={'axis': 0},
+          expected=ragged_factory_ops.constant_value([[1, 2, 3], [4], [5, 6]])),
+      dict(
+          op=array_ops.expand_dims,
+          kwargs={
+              'input': ragged_factory_ops.constant_value([[1, 2], [3]]),
+              'axis': 0
+          },
+          expected=ragged_factory_ops.constant_value([[[1, 2], [3]]])),
+      dict(
+          op=array_ops.expand_dims_v2,
+          kwargs={
+              'input': ragged_factory_ops.constant_value([[1, 2], [3]]),
+              'axis': -1
+          },
+          expected=ragged_factory_ops.constant_value([[[1], [2]], [[3]]],
+                                                     ragged_rank=1),
+      ),
+      dict(
+          op=array_ops.gather,
+          kwargs={
+              'params': ragged_factory_ops.constant_value([[1, 2], [3]]),
+              'indices': [1, 0, 1]
+          },
+          expected=ragged_factory_ops.constant_value([[3], [1, 2], [3]])),
+      dict(
+          op=array_ops.gather_v2,
+          kwargs={
+              'params': ragged_factory_ops.constant_value([[1, 2], [3]]),
+              'indices': ragged_factory_ops.constant_value([[1, 0], [1]])
+          },
+          expected=ragged_factory_ops.constant_value([[[3], [1, 2]], [[3]]])),
+      dict(
+          op=array_ops.gather_nd,
+          kwargs={
+              'params': ragged_factory_ops.constant_value([[7, 8], [9]]),
+              'indices': [[0, 1], [1, 0], [0, 0]]
+          },
+          expected=ragged_factory_ops.constant_value([8, 9, 7])),
+      dict(
+          op=array_ops.stack,
+          args=([
+              ragged_factory_ops.constant_value([[1, 2, 3], [4]],
+                                                dtype=np.int32),
+              np.array([[5, 6]], dtype=np.int32)
+          ],),
+          expected=ragged_factory_ops.constant_value([[[1, 2, 3], [4]],
+                                                      [[5, 6]]])),
+      dict(
+          op=array_ops.tile,
+          args=([
+              ragged_factory_ops.constant_value([[1, 2], [3]], dtype=np.int32),
+              [2, 3]
+          ]),
+          expected=ragged_factory_ops.constant_value([[1, 2, 1, 2, 1, 2],
+                                                      [3, 3, 3],
+                                                      [1, 2, 1, 2, 1, 2],
+                                                      [3, 3, 3]])),
+      dict(
+          op=array_ops.where,
+          args=(ragged_factory_ops.constant_value([[True, False], [True]]),
+                ragged_factory_ops.constant_value([[b'A', b'B'], [b'C']]),
+                ragged_factory_ops.constant_value([[b'a', b'b'], [b'c']])),
+          expected=ragged_factory_ops.constant_value([[b'A', b'b'], [b'C']])),
+      dict(
+          op=array_ops.where,
+          args=(ragged_factory_ops.constant_value([[True, False], [True]]),),
+          expected=[[0, 0], [1, 0]]),
+      dict(
+          op=math_ops.unsorted_segment_sum,
+          kwargs={
+              'data': ragged_factory_ops.constant_value([[1, 2], [3]]),
+              'segment_ids': ragged_factory_ops.constant_value([[0, 2], [0]]),
+              'num_segments': 3
+          },
+          expected=[4, 0, 2]),
+      dict(
+          op=math_ops.unsorted_segment_prod,
+          kwargs={
+              'data': ragged_factory_ops.constant_value([[1, 2], [3]]),
+              'segment_ids': ragged_factory_ops.constant_value([[0, 2], [0]]),
+              'num_segments': 3
+          },
+          expected=[3, 1, 2]),
+      dict(
+          op=math_ops.unsorted_segment_min,
+          kwargs={
+              'data': ragged_factory_ops.constant_value([[1, 2], [3]]),
+              'segment_ids': ragged_factory_ops.constant_value([[0, 1], [0]]),
+              'num_segments': 2
+          },
+          expected=[1, 2]),
+      dict(
+          op=math_ops.unsorted_segment_max,
+          kwargs={
+              'data': ragged_factory_ops.constant_value([[1, 2], [3]]),
+              'segment_ids': ragged_factory_ops.constant_value([[0, 1], [0]]),
+              'num_segments': 2
+          },
+          expected=[3, 2]),
+      dict(
+          op=math_ops.unsorted_segment_mean,
+          kwargs={
+              'data': ragged_factory_ops.constant_value([[1, 2], [3]]),
+              'segment_ids': ragged_factory_ops.constant_value([[0, 1], [0]]),
+              'num_segments': 2
+          },
+          expected=[2, 2]),
+      dict(
+          op=math_ops.unsorted_segment_sqrt_n,
+          kwargs={
+              'data':
+                  ragged_factory_ops.constant_value([[1.0, 2.0],
+                                                     [3.0, 4.0, 6.0]]),
+              'segment_ids':
+                  ragged_factory_ops.constant_value([[0, 1], [0, 0, 0]]),
+              'num_segments':
+                  2
+          },
+          expected=[7.0, 2.0]),
+      dict(
+          op=math_ops.reduce_sum,
+          kwargs={
+              'input_tensor':
+                  ragged_factory_ops.constant_value([[1, 2], [3, 4, 5]]),
+              'axis':
+                  1
+          },
+          expected=[3, 12]),
+      dict(
+          op=math_ops.reduce_prod,
+          kwargs={
+              'input_tensor':
+                  ragged_factory_ops.constant_value([[1, 2], [3, 4, 5]]),
+              'axis':
+                  1
+          },
+          expected=[2, 60]),
+      dict(
+          op=math_ops.reduce_min,
+          kwargs={
+              'input_tensor':
+                  ragged_factory_ops.constant_value([[1, 2], [3, 4, 5]]),
+              'axis':
+                  1
+          },
+          expected=[1, 3]),
+      dict(
+          op=math_ops.reduce_max,
+          kwargs={
+              'input_tensor':
+                  ragged_factory_ops.constant_value([[1, 2], [3, 4, 5]]),
+              'axis':
+                  1
+          },
+          expected=[2, 5]),
+      dict(
+          op=math_ops.reduce_mean,
+          kwargs={
+              'input_tensor':
+                  ragged_factory_ops.constant_value([[1, 3], [3, 4, 5]]),
+              'axis':
+                  1
+          },
+          expected=[2, 4]),
+      dict(
+          op=math_ops.reduce_any,
+          kwargs={
+              'input_tensor':
+                  ragged_factory_ops.constant_value([[True, False],
+                                                     [True, True, True]]),
+              'axis':
+                  1
+          },
+          expected=[True, True]),
+      dict(
+          op=math_ops.reduce_all,
+          kwargs={
+              'input_tensor':
+                  ragged_factory_ops.constant_value([[True, False],
+                                                     [True, True, True]]),
+              'axis':
+                  1
+          },
+          expected=[False, True]),
+      dict(
+          op=array_ops.rank,
+          kwargs={'input': ragged_factory_ops.constant_value([[8, 3], [5]])},
+          expected=2),
+      dict(
+          op=array_ops.size,
+          kwargs={'input': ragged_factory_ops.constant_value([[8, 3], [5]])},
+          expected=3),
+      dict(
+          op=array_ops.size_v2,
+          kwargs={'input': ragged_factory_ops.constant_value([[8, 3], [5]])},
+          expected=3),
+  ])
+  def testRaggedDispatch(self, op, expected, args=(), kwargs=None):
+    if kwargs is None: kwargs = {}
+    result = op(*args, **kwargs)
+    self.assertRaggedEqual(result, expected)
+
 
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_eager_test.py b/tensorflow/python/ops/ragged/ragged_eager_test.py
index f1befbf9613fefc4efd5efd3d8ebf17db9038581..86f01aace00d3b67bcaa78d4091d32fdab3242d7 100644
--- a/tensorflow/python/ops/ragged/ragged_eager_test.py
+++ b/tensorflow/python/ops/ragged/ragged_eager_test.py
@@ -21,7 +21,7 @@ from __future__ import print_function
 from absl.testing import parameterized
 
 from tensorflow.python.framework import ops
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
@@ -35,7 +35,7 @@ class RaggedTensorTest(ragged_test_util.RaggedTensorTestCase,
       dict(pylist=[[[1, 2], [3, 4]], [[5, 6], [], [7, 8]]], ragged_rank=1),
   ])
   def testRaggedTensorToList(self, pylist, ragged_rank=None):
-    rt = ragged.constant(pylist, ragged_rank)
+    rt = ragged_factory_ops.constant(pylist, ragged_rank)
     self.assertRaggedEqual(rt, pylist)
 
   @parameterized.parameters([
@@ -43,7 +43,7 @@ class RaggedTensorTest(ragged_test_util.RaggedTensorTestCase,
       dict(pylist=[[[1, 2], [3]], [[4, 5, 6], [], [7]]]),
   ])
   def testRaggedTensorStr(self, pylist):
-    rt = ragged.constant(pylist)
+    rt = ragged_factory_ops.constant(pylist)
     self.assertEqual(str(rt), '<tf.RaggedTensor %s>' % pylist)
 
 
diff --git a/tensorflow/python/ops/ragged/ragged_expand_dims_op_test.py b/tensorflow/python/ops/ragged/ragged_expand_dims_op_test.py
index 072f330e3c1c0a20ac7cecd84ec6b0e47003a3a0..c747bb304964b1fade5ddd701375a9e91de89c9e 100644
--- a/tensorflow/python/ops/ragged/ragged_expand_dims_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_expand_dims_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for ragged.expand_dims."""
+"""Tests for ragged_array_ops.expand_dims."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -21,7 +21,8 @@ from __future__ import print_function
 from absl.testing import parameterized
 
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
@@ -32,7 +33,7 @@ class RaggedExpandDimsOpTest(ragged_test_util.RaggedTensorTestCase,
 
   # An example 4-d ragged tensor with shape [3, (D2), (D3), 2], and the
   # expected result calling for expand_dims on each axis.  c.f. the table of
-  # expected result shapes in the ragged.expand_dims docstring.
+  # expected result shapes in the ragged_array_ops.expand_dims docstring.
   EXAMPLE4D = [[[[1, 1], [2, 2]], [[3, 3]]],
                [],
                [[], [[4, 4], [5, 5], [6, 6]]]]  # pyformat: disable
@@ -113,8 +114,8 @@ class RaggedExpandDimsOpTest(ragged_test_util.RaggedTensorTestCase,
                            expected,
                            ragged_rank=None,
                            expected_shape=None):
-    rt = ragged.constant(rt_input, ragged_rank=ragged_rank)
-    expanded = ragged.expand_dims(rt, axis=axis)
+    rt = ragged_factory_ops.constant(rt_input, ragged_rank=ragged_rank)
+    expanded = ragged_array_ops.expand_dims(rt, axis=axis)
     self.assertEqual(expanded.shape.ndims, rt.shape.ndims + 1)
     if expected_shape is not None:
       self.assertEqual(expanded.shape.as_list(), expected_shape)
diff --git a/tensorflow/python/ops/ragged/ragged_factory_ops.py b/tensorflow/python/ops/ragged/ragged_factory_ops.py
index 2c63e1c7994c31b6ed53e37e65498a843e2bb595..8cda98765bb1759f156693e759de73f1e2acad6c 100644
--- a/tensorflow/python/ops/ragged/ragged_factory_ops.py
+++ b/tensorflow/python/ops/ragged/ragged_factory_ops.py
@@ -24,11 +24,13 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.ops.ragged import ragged_tensor_value
+from tensorflow.python.util.tf_export import tf_export
 
 
 #===============================================================================
 # Op to construct a constant RaggedTensor from a nested Python list.
 #===============================================================================
+@tf_export("ragged.constant")
 def constant(pylist, dtype=None, ragged_rank=None, inner_shape=None, name=None):
   """Constructs a constant RaggedTensor from a nested Python list.
 
@@ -74,12 +76,13 @@ def constant(pylist, dtype=None, ragged_rank=None, inner_shape=None, name=None):
                            inner_shape)
 
 
+@tf_export(v1=["ragged.constant_value"])
 def constant_value(pylist, dtype=None, ragged_rank=None, inner_shape=None):
   """Constructs a RaggedTensorValue from a nested Python list.
 
-  > Warning: This function returns a `RaggedTensorValue`, not a `RaggedTensor`.
-  > If you wish to construct a constant `RaggedTensor`, use
-  > [`ragged.constant(...)`](constant.md) instead.
+  Warning: This function returns a `RaggedTensorValue`, not a `RaggedTensor`.
+  If you wish to construct a constant `RaggedTensor`, use
+  [`ragged.constant(...)`](constant.md) instead.
 
   Example:
 
diff --git a/tensorflow/python/ops/ragged/ragged_from_sparse_op_test.py b/tensorflow/python/ops/ragged/ragged_from_sparse_op_test.py
index 07cf910202770192f146328844dec8c12be542a7..c6998e274bed1bae78a156751785c7bb10a90abd 100644
--- a/tensorflow/python/ops/ragged/ragged_from_sparse_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_from_sparse_op_test.py
@@ -25,7 +25,7 @@ from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops.ragged import ragged_test_util
-from tensorflow.python.ops.ragged import RaggedTensor
+from tensorflow.python.ops.ragged.ragged_tensor import RaggedTensor
 from tensorflow.python.platform import googletest
 
 
diff --git a/tensorflow/python/ops/ragged/ragged_from_tensor_op_test.py b/tensorflow/python/ops/ragged/ragged_from_tensor_op_test.py
index 6a3d639c5e35f23db7d53994e0a0bfe5231e664b..68d3953f4cdf31458fc75397522b3f9fc8960098 100644
--- a/tensorflow/python/ops/ragged/ragged_from_tensor_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_from_tensor_op_test.py
@@ -25,7 +25,7 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.ragged import ragged_test_util
-from tensorflow.python.ops.ragged import RaggedTensor
+from tensorflow.python.ops.ragged.ragged_tensor import RaggedTensor
 from tensorflow.python.platform import googletest
 
 
diff --git a/tensorflow/python/ops/ragged/ragged_functional_ops.py b/tensorflow/python/ops/ragged/ragged_functional_ops.py
index 751f2c73592c676d0dd5eec4f9dc45430cd646b1..b6937a1c37940339f8ea451392b42718095c7e33 100644
--- a/tensorflow/python/ops/ragged/ragged_functional_ops.py
+++ b/tensorflow/python/ops/ragged/ragged_functional_ops.py
@@ -21,18 +21,20 @@ from __future__ import print_function
 from tensorflow.python.framework import ops
 from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.ops.ragged import ragged_util
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("ragged.map_flat_values")
 def map_flat_values(op, *args, **kwargs):
-  """Applies `op` to the inner values of one or more RaggedTensors.
+  """Applies `op` to the values of one or more RaggedTensors.
 
   Replaces any `RaggedTensor` in `args` or `kwargs` with its `flat_values`
   tensor, and then calls `op`.  Returns a `RaggedTensor` that is constructed
-  from the input `RaggedTensor`s' `splits` and the value returned by
+  from the input `RaggedTensor`s' `nested_row_splits` and the value returned by
   the `op`.
 
   If the input arguments contain multiple `RaggedTensor`s, then they must have
-  identical `splits`.
+  identical `nested_row_splits`.
 
   Examples:
 
diff --git a/tensorflow/python/ops/ragged/ragged_gather_nd_op_test.py b/tensorflow/python/ops/ragged/ragged_gather_nd_op_test.py
index 6673192752e613f671c175193fce83fbba60e48d..8e44368d4752ed01410de762b7cbda134ebfaa60 100644
--- a/tensorflow/python/ops/ragged/ragged_gather_nd_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_gather_nd_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for tf.ragged.gather_nd."""
+"""Tests for ragged_gather_ops.gather_nd."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -26,7 +26,8 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_gather_ops
 from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
@@ -45,18 +46,19 @@ class RaggedGatherNdOpTest(ragged_test_util.RaggedTensorTestCase,
       #=========================================================================
       dict(
           descr='Docstring example 1',
-          params=ragged.constant_value(DOCSTRING_PARAMS),
+          params=ragged_factory_ops.constant_value(DOCSTRING_PARAMS),
           indices=[[2], [0]],
-          expected=ragged.constant_value([[[], [b'210']],
-                                          [[b'000', b'001'], [b'010']]])),
+          expected=ragged_factory_ops.constant_value(
+              [[[], [b'210']], [[b'000', b'001'], [b'010']]])),
       dict(
           descr='Docstring example 2',
-          params=ragged.constant_value(DOCSTRING_PARAMS),
+          params=ragged_factory_ops.constant_value(DOCSTRING_PARAMS),
           indices=[[2, 1], [0, 0]],
-          expected=ragged.constant_value([[b'210'], [b'000', b'001']])),
+          expected=ragged_factory_ops.constant_value(
+              [[b'210'], [b'000', b'001']])),
       dict(
           descr='Docstring example 3',
-          params=ragged.constant_value(DOCSTRING_PARAMS),
+          params=ragged_factory_ops.constant_value(DOCSTRING_PARAMS),
           indices=[[0, 0, 1], [1, 1, 2]],
           expected=[b'001', b'112']),
       #=========================================================================
@@ -64,146 +66,157 @@ class RaggedGatherNdOpTest(ragged_test_util.RaggedTensorTestCase,
       #=========================================================================
       dict(
           descr='params: [B1, (B2)], indices: [0], result: [B1, (B2)]',
-          params=ragged.constant_value([['a', 'b', 'c'], ['d']]),
+          params=ragged_factory_ops.constant_value([['a', 'b', 'c'], ['d']]),
           indices=np.zeros([0], dtype=np.int32),
-          expected=ragged.constant_value([[b'a', b'b', b'c'], [b'd']])),
+          expected=ragged_factory_ops.constant_value(
+              [[b'a', b'b', b'c'], [b'd']])),
       dict(
           descr='params: [B1, (B2)], indices: [A1, 0], result: [A1, B1, (B2)]',
-          params=ragged.constant_value([['a', 'b', 'c'], ['d']]),
+          params=ragged_factory_ops.constant_value([['a', 'b', 'c'], ['d']]),
           indices=np.zeros([3, 0], dtype=np.int32),
-          expected=ragged.constant_value([[[b'a', b'b', b'c'], [b'd']],
-                                          [[b'a', b'b', b'c'], [b'd']],
-                                          [[b'a', b'b', b'c'], [b'd']]])),
+          expected=ragged_factory_ops.constant_value(
+              [[[b'a', b'b', b'c'], [b'd']],
+               [[b'a', b'b', b'c'], [b'd']],
+               [[b'a', b'b', b'c'], [b'd']]])),
       dict(
           descr=('params: [B1, (B2)], indices: [A1, A2, 0], '
                  'result: [A1, A2, B1, (B2)]'),
-          params=ragged.constant_value([['a', 'b', 'c'], ['d']]),
+          params=ragged_factory_ops.constant_value([['a', 'b', 'c'], ['d']]),
           indices=np.zeros([1, 3, 0], dtype=np.int32),
-          expected=ragged.constant_value([[[[b'a', b'b', b'c'], [b'd']],
-                                           [[b'a', b'b', b'c'], [b'd']],
-                                           [[b'a', b'b', b'c'], [b'd']]]])),
+          expected=ragged_factory_ops.constant_value(
+              [[[[b'a', b'b', b'c'], [b'd']],
+                [[b'a', b'b', b'c'], [b'd']],
+                [[b'a', b'b', b'c'], [b'd']]]])),
       dict(
           descr='params: [B1], indices: [A1, (A2), 0], result: [A1, (A2), B1]',
           params=['a'],
-          indices=ragged.constant_value([[[], []], [[]]],
-                                        ragged_rank=1,
-                                        dtype=np.int32),
-          expected=ragged.constant_value([[[b'a'], [b'a']], [[b'a']]],
-                                         ragged_rank=1)),
+          indices=ragged_factory_ops.constant_value(
+              [[[], []], [[]]],
+              ragged_rank=1,
+              dtype=np.int32),
+          expected=ragged_factory_ops.constant_value(
+              [[[b'a'], [b'a']], [[b'a']]],
+              ragged_rank=1)),
       #=========================================================================
       # Indices with 1 value (selects row from params)
       #=========================================================================
       dict(
           descr='params: [B1, (B2)], indices: [A1, 1], result: [A1, (B2)]',
-          params=ragged.constant_value([['a', 'b', 'c'], ['d']]),
+          params=ragged_factory_ops.constant_value([['a', 'b', 'c'], ['d']]),
           indices=[[1], [0]],
-          expected=ragged.constant_value([[b'd'], [b'a', b'b', b'c']])),
+          expected=ragged_factory_ops.constant_value(
+              [[b'd'], [b'a', b'b', b'c']])),
       dict(
           descr=('params: [B1, (B2), (B3)], indices: [A1, 1], '
                  'result: [A1, (B2), (B3)]'),
-          params=ragged.constant_value([[['a', 'b', 'c'], ['d']],
-                                        [['e', 'f']]]),
+          params=ragged_factory_ops.constant_value(
+              [[['a', 'b', 'c'], ['d']], [['e', 'f']]]),
           indices=[[1], [1]],
-          expected=ragged.constant_value([[[b'e', b'f']], [[b'e', b'f']]])),
+          expected=ragged_factory_ops.constant_value(
+              [[[b'e', b'f']], [[b'e', b'f']]])),
       dict(
           descr=('params: [B1, B2, B3], indices: [A1, (A2), 1], '
                  'result: [A1, (A2), B2, B3]'),
           params=[[['a']], [['b']]],
-          indices=ragged.constant_value([[[0]]], ragged_rank=1),
-          expected=ragged.constant_value([[[[b'a']]]], ragged_rank=1)),
+          indices=ragged_factory_ops.constant_value([[[0]]], ragged_rank=1),
+          expected=ragged_factory_ops.constant_value(
+              [[[[b'a']]]], ragged_rank=1)),
       #=========================================================================
       # Indices with 2 values (selects row & col from params)
       #=========================================================================
       dict(
           descr='params: [B1, (B2)], indices: [A1, 2], result: [A1]',
-          params=ragged.constant_value([['a', 'b', 'c'], ['d']]),
+          params=ragged_factory_ops.constant_value([['a', 'b', 'c'], ['d']]),
           indices=[[1, 0], [0, 0], [0, 2]],
-          expected=ragged.constant_value([b'd', b'a', b'c'])),
+          expected=ragged_factory_ops.constant_value([b'd', b'a', b'c'])),
       dict(
           descr=('params: [B1, (B2), (B3)], indices: [A1, 2], '
                  'result: [A1, (B3)]'),
-          params=ragged.constant_value([[['a', 'b', 'c'], ['d']],
-                                        [['e', 'f']]]),
+          params=ragged_factory_ops.constant_value(
+              [[['a', 'b', 'c'], ['d']], [['e', 'f']]]),
           indices=[[1, 0], [0, 1], [0, 0]],
-          expected=ragged.constant_value([[b'e', b'f'], [b'd'],
-                                          [b'a', b'b', b'c']])),
+          expected=ragged_factory_ops.constant_value(
+              [[b'e', b'f'], [b'd'], [b'a', b'b', b'c']])),
       dict(
           descr=('params: [B1, (B2), (B3)], indices: [A1, A2, 2], '
                  'result: [A1, (A2), (B3)]'),
-          params=ragged.constant_value([[['a', 'b', 'c'], ['d']],
-                                        [['e', 'f']]]),
+          params=ragged_factory_ops.constant_value(
+              [[['a', 'b', 'c'], ['d']], [['e', 'f']]]),
           indices=[[[1, 0], [0, 1], [0, 0]]],
-          expected=ragged.constant_value([[[b'e', b'f'], [b'd'],
-                                           [b'a', b'b', b'c']]])),
+          expected=ragged_factory_ops.constant_value(
+              [[[b'e', b'f'], [b'd'], [b'a', b'b', b'c']]])),
       dict(
           descr=('params: [B1, (B2), B3], indices: [A1, A2, 2], '
                  'result: [A1, A2, B3]'),
-          params=ragged.constant_value([[['a', 'b'], ['c', 'd']],
-                                        [['e', 'f']]],
-                                       ragged_rank=1),
+          params=ragged_factory_ops.constant_value(
+              [[['a', 'b'], ['c', 'd']],
+               [['e', 'f']]],
+              ragged_rank=1),
           indices=[[[1, 0], [0, 1], [0, 0]]],
           expected=[[[b'e', b'f'], [b'c', b'd'], [b'a', b'b']]]),
       dict(
           descr=('params: [B1, (B2), B3], indices: [A1, A2, A3, 2], '
                  'result: [A1, A2, A3, B3]'),
-          params=ragged.constant_value([[['a', 'b'], ['c', 'd']],
-                                        [['e', 'f']]],
-                                       ragged_rank=1),
+          params=ragged_factory_ops.constant_value(
+              [[['a', 'b'], ['c', 'd']],
+               [['e', 'f']]],
+              ragged_rank=1),
           indices=[[[[1, 0], [0, 1], [0, 0]]]],
           expected=[[[[b'e', b'f'], [b'c', b'd'], [b'a', b'b']]]]),
       dict(
           descr=('params: [B1, (B2), (B3)], indices: [A1, (A2), 2], '
                  'result: [A1, (A2), (B3)]'),
-          params=ragged.constant_value([[['a', 'b', 'c'], ['d']],
-                                        [['e', 'f']]]),
-          indices=ragged.constant_value([[[1, 0], [0, 1]], [[0, 0]]],
-                                        ragged_rank=1),
-          expected=ragged.constant_value([[[b'e', b'f'], [b'd']],
-                                          [[b'a', b'b', b'c']]])),
+          params=ragged_factory_ops.constant_value(
+              [[['a', 'b', 'c'], ['d']], [['e', 'f']]]),
+          indices=ragged_factory_ops.constant_value(
+              [[[1, 0], [0, 1]], [[0, 0]]],
+              ragged_rank=1),
+          expected=ragged_factory_ops.constant_value(
+              [[[b'e', b'f'], [b'd']], [[b'a', b'b', b'c']]])),
       #=========================================================================
       # Indices with 3 values
       #=========================================================================
       dict(
           descr=('params: [B1, (B2), (B3)], indices: [A1, 3], '
                  'result: [A1]'),
-          params=ragged.constant_value([[['a', 'b', 'c'], ['d']],
-                                        [['e', 'f']]]),
+          params=ragged_factory_ops.constant_value(
+              [[['a', 'b', 'c'], ['d']], [['e', 'f']]]),
           indices=[[1, 0, 1], [0, 0, 0], [0, 1, 0]],
           expected=[b'f', b'a', b'd']),
       dict(
           descr=('params: [B1, (B2), B3], indices: [A1, 3], '
                  'result: [A1]'),
-          params=ragged.constant_value([[['a', 'b'], ['c', 'd']],
-                                        [['e', 'f']]],
-                                       ragged_rank=1),
+          params=ragged_factory_ops.constant_value(
+              [[['a', 'b'], ['c', 'd']], [['e', 'f']]],
+              ragged_rank=1),
           indices=[[1, 0, 1], [0, 0, 0], [0, 1, 1]],
           expected=[b'f', b'a', b'd']),
       dict(
           descr=('params: [B1, (B2), (B3), B4], indices: [A1, 3], '
                  'result: [A1, B4]'),
-          params=ragged.constant_value([[[['a', 'b'], ['c', 'd']],
-                                         [['e', 'f']]]],
-                                       ragged_rank=2),
+          params=ragged_factory_ops.constant_value(
+              [[[['a', 'b'], ['c', 'd']], [['e', 'f']]]],
+              ragged_rank=2),
           indices=[[0, 0, 1], [0, 0, 0], [0, 1, 0]],
           expected=[[b'c', b'd'], [b'a', b'b'], [b'e', b'f']]),
   ])  # pyformat: disable
   def testRaggedGatherNd(self, descr, params, indices, expected):
-    result = ragged.gather_nd(params, indices)
+    result = ragged_gather_ops.gather_nd(params, indices)
     self.assertRaggedEqual(result, expected)
 
   def testRaggedGatherNdUnknownRankError(self):
     if context.executing_eagerly():
       return
-    params = ragged.constant([['a', 'b'], ['c', 'd']])
+    params = ragged_factory_ops.constant([['a', 'b'], ['c', 'd']])
     indices1 = array_ops.placeholder(dtypes.int32, shape=None)
     indices2 = array_ops.placeholder(dtypes.int32, shape=[None])
 
     with self.assertRaisesRegexp(ValueError,
                                  'indices.rank be statically known.'):
-      ragged.gather_nd(params, indices1)
+      ragged_gather_ops.gather_nd(params, indices1)
     with self.assertRaisesRegexp(
         ValueError, r'indices.shape\[-1\] must be statically known.'):
-      ragged.gather_nd(params, indices2)
+      ragged_gather_ops.gather_nd(params, indices2)
 
   @parameterized.parameters([
       dict(
@@ -211,12 +224,12 @@ class RaggedGatherNdOpTest(ragged_test_util.RaggedTensorTestCase,
           indices=0,
           error=(ValueError, errors.InvalidArgumentError)),
       dict(
-          params=ragged.constant_value([['a']]),
+          params=ragged_factory_ops.constant_value([['a']]),
           indices=0,
           message='indices.rank must be at least 1.'),
       dict(
           params=['a', 'b', 'c'],
-          indices=ragged.constant_value([[0]]),
+          indices=ragged_factory_ops.constant_value([[0]]),
           message='The innermost dimension of indices may not be ragged'),
   ])
   def testRaggedGatherNdStaticError(self,
@@ -225,7 +238,7 @@ class RaggedGatherNdOpTest(ragged_test_util.RaggedTensorTestCase,
                                     message=None,
                                     error=ValueError):
     with self.assertRaisesRegexp(error, message):
-      ragged.gather_nd(params, indices)
+      ragged_gather_ops.gather_nd(params, indices)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_gather_op_test.py b/tensorflow/python/ops/ragged/ragged_gather_op_test.py
index 42efdc8a7d384744041454b5e0bb90e5618b7184..eb64bb4ad1685dc1c9c850c4a9c9ef36e9ffa23f 100644
--- a/tensorflow/python/ops/ragged/ragged_gather_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_gather_op_test.py
@@ -12,12 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for ragged.gather."""
+"""Tests for ragged_array_ops.gather."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -25,7 +24,8 @@ from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_gather_ops
 from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
@@ -36,74 +36,79 @@ class RaggedGatherOpTest(ragged_test_util.RaggedTensorTestCase):
   def testDocStringExamples(self):
     params = constant_op.constant(['a', 'b', 'c', 'd', 'e'])
     indices = constant_op.constant([3, 1, 2, 1, 0])
-    ragged_params = ragged.constant([['a', 'b', 'c'], ['d'], [], ['e']])
-    ragged_indices = ragged.constant([[3, 1, 2], [1], [], [0]])
+    ragged_params = ragged_factory_ops.constant([['a', 'b', 'c'], ['d'], [],
+                                                 ['e']])
+    ragged_indices = ragged_factory_ops.constant([[3, 1, 2], [1], [], [0]])
     self.assertRaggedEqual(
-        ragged.gather(params, ragged_indices),
+        ragged_gather_ops.gather(params, ragged_indices),
         [[b'd', b'b', b'c'], [b'b'], [], [b'a']])
     self.assertRaggedEqual(
-        ragged.gather(ragged_params, indices),
+        ragged_gather_ops.gather(ragged_params, indices),
         [[b'e'], [b'd'], [], [b'd'], [b'a', b'b', b'c']])
     self.assertRaggedEqual(
-        ragged.gather(ragged_params, ragged_indices),
+        ragged_gather_ops.gather(ragged_params, ragged_indices),
         [[[b'e'], [b'd'], []], [[b'd']], [], [[b'a', b'b', b'c']]])
 
   def testTensorParamsAndTensorIndices(self):
     params = ['a', 'b', 'c', 'd', 'e']
     indices = [2, 0, 2, 1]
     self.assertRaggedEqual(
-        ragged.gather(params, indices), [b'c', b'a', b'c', b'b'])
-    self.assertIsInstance(ragged.gather(params, indices), ops.Tensor)
+        ragged_gather_ops.gather(params, indices), [b'c', b'a', b'c', b'b'])
+    self.assertIsInstance(ragged_gather_ops.gather(params, indices), ops.Tensor)
 
   def testRaggedParamsAndTensorIndices(self):
-    params = ragged.constant([['a', 'b'], ['c', 'd', 'e'], ['f'], [], ['g']])
+    params = ragged_factory_ops.constant([['a', 'b'], ['c', 'd', 'e'], ['f'],
+                                          [], ['g']])
     indices = [2, 0, 2, 1]
     self.assertRaggedEqual(
-        ragged.gather(params, indices),
+        ragged_gather_ops.gather(params, indices),
         [[b'f'], [b'a', b'b'], [b'f'], [b'c', b'd', b'e']])
 
   def testTensorParamsAndRaggedIndices(self):
     params = ['a', 'b', 'c', 'd', 'e']
-    indices = ragged.constant([[2, 1], [1, 2, 0], [3]])
+    indices = ragged_factory_ops.constant([[2, 1], [1, 2, 0], [3]])
     self.assertRaggedEqual(
-        ragged.gather(params, indices),
+        ragged_gather_ops.gather(params, indices),
         [[b'c', b'b'], [b'b', b'c', b'a'], [b'd']])
 
   def testRaggedParamsAndRaggedIndices(self):
-    params = ragged.constant([['a', 'b'], ['c', 'd', 'e'], ['f'], [], ['g']])
-    indices = ragged.constant([[2, 1], [1, 2, 0], [3]])
+    params = ragged_factory_ops.constant([['a', 'b'], ['c', 'd', 'e'], ['f'],
+                                          [], ['g']])
+    indices = ragged_factory_ops.constant([[2, 1], [1, 2, 0], [3]])
     self.assertRaggedEqual(
-        ragged.gather(params, indices),
+        ragged_gather_ops.gather(params, indices),
         [[[b'f'], [b'c', b'd', b'e']],                # [[p[2], p[1]      ],
          [[b'c', b'd', b'e'], [b'f'], [b'a', b'b']],  #  [p[1], p[2], p[0]],
          [[]]]                                        #  [p[3]            ]]
     )  # pyformat: disable
 
   def testRaggedParamsAndScalarIndices(self):
-    params = ragged.constant([['a', 'b'], ['c', 'd', 'e'], ['f'], [], ['g']])
+    params = ragged_factory_ops.constant([['a', 'b'], ['c', 'd', 'e'], ['f'],
+                                          [], ['g']])
     indices = 1
-    self.assertRaggedEqual(ragged.gather(params, indices), [b'c', b'd', b'e'])
+    self.assertRaggedEqual(
+        ragged_gather_ops.gather(params, indices), [b'c', b'd', b'e'])
 
   def test3DRaggedParamsAnd2DTensorIndices(self):
-    params = ragged.constant([[['a', 'b'], []], [['c', 'd'], ['e'], ['f']],
-                              [['g']]])
+    params = ragged_factory_ops.constant([[['a', 'b'], []],
+                                          [['c', 'd'], ['e'], ['f']], [['g']]])
     indices = [[1, 2], [0, 1], [2, 2]]
     self.assertRaggedEqual(
-        ragged.gather(params, indices),
+        ragged_gather_ops.gather(params, indices),
         [[[[b'c', b'd'], [b'e'], [b'f']], [[b'g']]],            # [[p1, p2],
          [[[b'a', b'b'], []], [[b'c', b'd'], [b'e'], [b'f']]],  #  [p0, p1],
          [[[b'g']], [[b'g']]]]                                  #  [p2, p2]]
     )  # pyformat: disable
 
   def testTensorParamsAnd4DRaggedIndices(self):
-    indices = ragged.constant(
+    indices = ragged_factory_ops.constant(
         [[[[3, 4], [0, 6]], []], [[[2, 1], [1, 0]], [[2, 5]], [[2, 3]]],
          [[[1, 0]]]],  # pyformat: disable
         ragged_rank=2,
         inner_shape=(2,))
     params = ['a', 'b', 'c', 'd', 'e', 'f', 'g']
     self.assertRaggedEqual(
-        ragged.gather(params, indices),
+        ragged_gather_ops.gather(params, indices),
         [[[[b'd', b'e'], [b'a', b'g']], []],
          [[[b'c', b'b'], [b'b', b'a']], [[b'c', b'f']], [[b'c', b'd']]],
          [[[b'b', b'a']]]])  # pyformat: disable
@@ -111,27 +116,27 @@ class RaggedGatherOpTest(ragged_test_util.RaggedTensorTestCase):
   def testOutOfBoundsError(self):
     tensor_params = ['a', 'b', 'c']
     tensor_indices = [0, 1, 2]
-    ragged_params = ragged.constant([['a', 'b'], ['c']])
-    ragged_indices = ragged.constant([[0, 3]])
+    ragged_params = ragged_factory_ops.constant([['a', 'b'], ['c']])
+    ragged_indices = ragged_factory_ops.constant([[0, 3]])
     with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                  r'indices\[1\] = 3 is not in \[0, 3\)'):
-      self.evaluate(ragged.gather(tensor_params, ragged_indices))
+      self.evaluate(ragged_gather_ops.gather(tensor_params, ragged_indices))
     with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                  r'indices\[2\] = 2 is not in \[0, 2\)'):
-      self.evaluate(ragged.gather(ragged_params, tensor_indices))
+      self.evaluate(ragged_gather_ops.gather(ragged_params, tensor_indices))
     with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                  r'indices\[1\] = 3 is not in \[0, 2\)'):
-      self.evaluate(ragged.gather(ragged_params, ragged_indices))
+      self.evaluate(ragged_gather_ops.gather(ragged_params, ragged_indices))
 
   def testUnknownIndicesRankError(self):
     if context.executing_eagerly():
       return
-    params = ragged.constant([], ragged_rank=1)
+    params = ragged_factory_ops.constant([], ragged_rank=1)
     indices = constant_op.constant([0], dtype=dtypes.int64)
     indices = array_ops.placeholder_with_default(indices, None)
     self.assertRaisesRegexp(ValueError,
                             r'indices\.shape\.ndims must be known statically',
-                            ragged.gather, params, indices)
+                            ragged_gather_ops.gather, params, indices)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_gather_ops.py b/tensorflow/python/ops/ragged/ragged_gather_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..b49e0e549ff8a3948c335e54a90deb5708d4b7cd
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_gather_ops.py
@@ -0,0 +1,258 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Gather operations for RaggedTensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_ragged_array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_conversion_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+
+
+#===============================================================================
+# ragged_gather
+#===============================================================================
+# TODO(edloper): Add an `axis` argument
+def gather(params, indices, validate_indices=None, axis=0, batch_dims=0,
+           name=None):
+  """Gathers ragged slices from `params` axis `0` according to `indices`.
+
+  Returns `RaggedTensor` output, such that:
+
+  ```python
+  output.shape = indices.shape + params.shape[1:]
+  output.ragged_rank = indices.shape.ndims + params.ragged_rank
+  output[i...j, d0...dn] = params[indices[i...j], d0...dn]
+  ```
+
+  `params` may be ragged.  `indices` may be ragged.
+  `indices` must have dtype `int32` or `int64`. If any index is out of bounds,
+  then an error is returned.
+
+  Examples:
+
+  ```python
+  >>> params = tf.constant(['a', 'b', 'c', 'd', 'e'])
+  >>> indices = tf.constant([3, 1, 2, 1, 0])
+  >>> ragged_params = tf.ragged.constant([['a', 'b', 'c'], ['d'], [], ['e']])
+  >>> ragged_indices = tf.ragged.constant([[3, 1, 2], [1], [], [0]])
+
+  >>> print ragged.gather(params, ragged_indices)
+  [['d', 'b', 'c'], ['b'], [], ['a']]
+
+  >>> print ragged.gather(ragged_params, indices)
+  [['e'], ['d'], [], ['d'], ['a', 'b', 'c']]
+
+  >>> print ragged.gather(ragged_params, ragged_indices)
+  [[['e'], ['d'], []], [['d']], [], [['a', 'b', 'c']]]
+  ```
+
+  Args:
+    params: The potentially ragged tensor from which to gather values. Must be
+      at least rank 1.
+    indices: The potentially ragged tensor indicating which values to gather.
+      Must have dtype `int32` or `int64`.  Values must be in the range `[0,
+      params.shape[0]]`.
+    validate_indices: Ignored.
+    axis: Must be zero.
+    batch_dims: Must be zero.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `RaggedTensor`, where `output.dtype=params.dtype` and
+    `output.shape=indices.shape + params.shape[1:]` and
+    `output.ragged_rank=indices.shape.ndims + params.ragged_rank`.
+
+  Raises:
+    ValueError: If indices.shape.ndims is not known statically.
+  """
+  del validate_indices
+  if not isinstance(axis, int) or axis != 0:
+    raise ValueError('axis != 0 is not supported for ragged gather yet.')
+  if not isinstance(batch_dims, int) or batch_dims != 0:
+    raise ValueError('batch_dims != 0 is not supported for ragged gather yet.')
+  with ops.name_scope(name, 'RaggedGather', [params, indices]):
+    params = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        params, name='params')
+    indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        indices, name='indices')
+
+    if ragged_tensor.is_ragged(indices):
+      return indices.with_values(gather(params, indices.values))
+
+    if not ragged_tensor.is_ragged(params):
+      return array_ops.gather(params, indices)
+
+    indices = ops.convert_to_tensor(indices)
+    if indices.shape.ndims is None:
+      raise ValueError('indices.shape.ndims must be known statically')
+
+    result = gen_ragged_array_ops.ragged_gather(
+        indices=indices,
+        params_dense_values=params.flat_values,
+        params_nested_splits=params.nested_row_splits,
+        OUTPUT_RAGGED_RANK=indices.shape.ndims + len(params.nested_row_splits) -
+        1)
+
+    # Compose the RaggedTensor from splits & values.
+    return ragged_tensor.RaggedTensor.from_nested_row_splits(
+        result.output_dense_values, result.output_nested_splits)
+
+
+#===============================================================================
+# ragged.gather_nd
+#===============================================================================
+def gather_nd(params, indices, name=None):
+  """Gather slices from `params` using `n`-dimensional indices.
+
+  This operation is similar to `gather`, but it uses the innermost dimension
+  of `indices` to define a slice into `params`.  In particular, if:
+
+  * `indices` has shape `[A1...AN, I]`
+  * `params` has shape `[B1...BM]`
+
+  Then:
+
+  * `result` has shape `[A1...AN, B_{I+1}...BM]`.
+  * `result[a1...aN] = params[indices[a1...aN, :]]`
+
+  Args:
+    params: A potentially ragged tensor with shape `[A1...AN, I]`.
+    indices: A potentially ragged tensor with shape `[B1...BM]`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A potentially ragged tensor with shape `[A1...AN, B_{I+1}...BM]`.
+
+  #### Examples:
+    ```python
+    >>> params = tf.ragged.constant_value(
+    ...     [ [ ['000', '001'], ['010'              ]          ],
+    ...       [ ['100'       ], ['110', '111', '112'], ['120'] ],
+    ...       [ [            ], ['210'              ]          ] ])
+
+    >>> # Gather 2D slices from a 3D tensor
+    >>> ragged.gather_nd(params, [[2], [0]])
+    [ [ [            ], ['210'] ]
+      [ ['000', '001'], ['010'] ] ]
+
+    >>> # Gather 1D slices from a 3D tensor
+    >>> ragged.gather_nd(params, [[2, 1], [0, 0]])
+    [['210'], ['000', '001']]
+
+    >>> # Gather scalars from a 3D tensor
+    >>> ragged.gather_nd(params, [[0, 0, 1], [1, 1, 2]])
+    ['001', '112']
+    ```
+  """
+  if not (ragged_tensor.is_ragged(params) or ragged_tensor.is_ragged(indices)):
+    return array_ops.gather_nd(params, indices, name)
+
+  with ops.name_scope(name, 'RaggedGatherNd', [params, indices]):
+
+    params = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        params, name='params')
+    indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        indices, name='indices')
+    indices_shape = indices.shape
+    indices_ndims = indices_shape.ndims
+    if indices_ndims is None:
+      raise ValueError('indices.rank be statically known.')
+    if indices_ndims == 0:
+      raise ValueError('indices.rank must be at least 1.')
+    if (ragged_tensor.is_ragged(indices) and
+        indices_ndims == indices.ragged_rank + 1):
+      raise ValueError('The innermost dimension of indices may not be ragged')
+
+    # `index_size` is the "n" in "gather_nd" -- i.e., the number of dimensions
+    # that each index slices into.
+    index_size = tensor_shape.dimension_value(indices_shape[-1])
+    if index_size is None:
+      raise ValueError('indices.shape[-1] must be statically known.')
+
+    # If `indices` has more than 2 dimensions, then recurse.  If `indices` is
+    # dense, then we convert it to ragged before recursing, and then convert
+    # the result back to `dense` if appropriate.
+    if indices_ndims > 2:
+      indices_is_dense = not ragged_tensor.is_ragged(indices)
+      if indices_is_dense:
+        indices = ragged_conversion_ops.from_tensor(
+            indices, ragged_rank=indices_ndims - 2)
+      result = indices.with_flat_values(gather_nd(params, indices.flat_values))
+      if (indices_is_dense and ragged_tensor.is_ragged(result) and
+          result.ragged_rank == indices_ndims - 2):
+        result = ragged_conversion_ops.to_tensor(result)
+      return result
+
+    # indices_ndims <= 2, and the innermost dimension of indices may not be
+    # ragged, so `indices` must not be ragged.
+    assert not ragged_tensor.is_ragged(indices)
+    assert ragged_tensor.is_ragged(params)
+
+    # Handle corner case: An empty index tuple selects the entire `params`
+    # value.  So if `index_size` is zero, then tile `params`.
+    if index_size == 0:
+      params_ndims = params.ragged_rank + array_ops.rank(params.flat_values)
+      for dim in range(indices_ndims - 1):
+        params = ragged_array_ops.expand_dims(params, axis=0)
+      multiples = array_ops.concat([
+          array_ops.shape(indices)[:-1],
+          array_ops.ones([params_ndims], dtypes.int32)
+      ],
+                                   axis=0)
+      return ragged_array_ops.tile(params, multiples)
+
+    # When index_size=1, we can just flatten the index tuples and use gather.
+    elif index_size == 1:
+      flattened_index_tuples = array_ops.reshape(indices, [-1])
+      return gather(params, flattened_index_tuples)
+
+    # Otherwise, params is a RaggedTensor, and indices is a 1D or 2D Tensor.
+    # Flatten both the index tuples and the params, such that the flattened
+    # index tuples point to the correct values in the flattened params; and
+    # then use ragged.gather on the flattened index tuples & params.
+    else:
+      indices = math_ops.to_int64(indices)
+
+      # Flatten the outermost 2 dimensions of the index tuples & params.
+      flattened_index_tuples = array_ops.gather(params.row_splits,
+                                                indices[..., 0])
+      flattened_index_tuples += indices[..., 1]
+      flattened_params = params.values
+
+      # Flatten any remaining dimensions.
+      for dim in range(2, index_size):
+        if not ragged_tensor.is_ragged(flattened_params):
+          flattened_index_tuples = array_ops.expand_dims(
+              flattened_index_tuples, axis=1)
+          flattened_index_tuples = array_ops.concat(
+              [flattened_index_tuples, indices[..., dim:]], axis=1)
+          return array_ops.gather_nd(flattened_params, flattened_index_tuples)
+
+        flattened_index_tuples = array_ops.gather(
+            flattened_params.row_starts(), flattened_index_tuples)
+        flattened_index_tuples += indices[..., dim]
+        flattened_params = flattened_params.values
+
+      # Gather using the flattened index tuples and params.
+      return gather(flattened_params, flattened_index_tuples)
diff --git a/tensorflow/python/ops/ragged/ragged_getitem.py b/tensorflow/python/ops/ragged/ragged_getitem.py
index 0fa72a36581150cd9408aa7bf12467bfaaab8893..d01cf67139b397977c30817fa515f5e30050b25b 100644
--- a/tensorflow/python/ops/ragged/ragged_getitem.py
+++ b/tensorflow/python/ops/ragged/ragged_getitem.py
@@ -18,12 +18,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_gather_ops
 from tensorflow.python.ops.ragged import ragged_math_ops
 from tensorflow.python.ops.ragged import ragged_tensor
 
@@ -38,7 +39,7 @@ def ragged_tensor_getitem(self, key):
   IndexError; (2) use a default value; or (3) skip that value and return a
   tensor with fewer rows than we started with.  Following the guiding
   principles of Python ("In the face of ambiguity, refuse the temptation to
-  guess" <go/pep20>), we simply disallow this operation.
+  guess"), we simply disallow this operation.
 
   Any dimensions added by `array_ops.newaxis` will be ragged if the following
   dimension is ragged.
@@ -150,6 +151,27 @@ def _ragged_getitem(rt_input, key_list):
   else:
     starts = rt_input.row_splits[:-1]
     limits = rt_input.row_splits[1:]
+    if context.executing_eagerly():
+      # In python, __getitem__ should throw IndexError for out of bound
+      # indices. This will allow iteration run correctly as python will
+      # translate IndexError into StopIteration for next()/__next__().
+      # Below is an example:
+      #    import tensorflow as tf
+      #    r = tf.ragged.constant([[1., 2.], [3., 4., 5.], [6.]])
+      #    for elem in r:
+      #      print(elem)
+      # In non eager mode, the exception is thrown when session runs
+      # so we don't know if out of bound happens before.
+      # In eager mode, however, it is possible to find out when to
+      # throw out of bound IndexError.
+      # In the following row_key >= len(starts) is checked. In case of
+      # TypeError which happens when row_key is not an integer, the exception
+      # will simply be ignored as it will be processed later anyway.
+      try:
+        if int(row_key) >= len(starts):
+          raise IndexError("Row key {} out of bounds".format(row_key))
+      except (TypeError, ValueError):
+        pass
     row = rt_input.values[starts[row_key]:limits[row_key]]
     return row.__getitem__(inner_keys)
 
@@ -344,7 +366,7 @@ def _build_ragged_tensor_from_value_ranges(starts, limits, step, values):
 
   # Use `ragged_gather` or `array_ops.gather` to collect the values.
   if isinstance(values, ragged_tensor.RaggedTensor):
-    gathered_values = ragged_array_ops.gather(
+    gathered_values = ragged_gather_ops.gather(
         params=values, indices=value_indices.values)
   else:
     gathered_values = array_ops.gather(
diff --git a/tensorflow/python/ops/ragged/ragged_map_flat_values_op_test.py b/tensorflow/python/ops/ragged/ragged_map_flat_values_op_test.py
index 8b28cac99db29e9ab2a2758db3449413b83cd747..e9a7cdf6c06269f3e9c879911631b2c089be23d5 100644
--- a/tensorflow/python/ops/ragged/ragged_map_flat_values_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_map_flat_values_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for ragged.map_flat_values."""
+"""Tests for ragged_functional_ops.map_flat_values."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -24,7 +24,9 @@ from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_functional_ops
+from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
@@ -38,68 +40,66 @@ class RaggedMapInnerValuesOpTest(ragged_test_util.RaggedTensorTestCase):
                                         args=(),
                                         kwargs=None):
     kwargs = kwargs or {}
-    result = ragged.map_flat_values(op, *args, **kwargs)
-    with self.test_session():
-      self.assertRaggedEqual(result, expected)
+    result = ragged_functional_ops.map_flat_values(op, *args, **kwargs)
+    self.assertRaggedEqual(result, expected)
 
   def testDocStringExamples(self):
     """Test the examples in apply_op_to_ragged_values.__doc__."""
-    rt = ragged.constant([[1, 2, 3], [], [4, 5], [6]])
-    v1 = ragged.map_flat_values(array_ops.ones_like, rt)
-    v2 = ragged.map_flat_values(math_ops.multiply, rt, rt)
-    v3 = ragged.map_flat_values(math_ops.add, rt, 5)
-    with self.test_session():
-      self.assertRaggedEqual(v1, [[1, 1, 1], [], [1, 1], [1]])
-      self.assertRaggedEqual(v2, [[1, 4, 9], [], [16, 25], [36]])
-      self.assertRaggedEqual(v3, [[6, 7, 8], [], [9, 10], [11]])
+    rt = ragged_factory_ops.constant([[1, 2, 3], [], [4, 5], [6]])
+    v1 = ragged_functional_ops.map_flat_values(array_ops.ones_like, rt)
+    v2 = ragged_functional_ops.map_flat_values(math_ops.multiply, rt, rt)
+    v3 = ragged_functional_ops.map_flat_values(math_ops.add, rt, 5)
+    self.assertRaggedEqual(v1, [[1, 1, 1], [], [1, 1], [1]])
+    self.assertRaggedEqual(v2, [[1, 4, 9], [], [16, 25], [36]])
+    self.assertRaggedEqual(v3, [[6, 7, 8], [], [9, 10], [11]])
 
   def testOpWithSingleRaggedTensorArg(self):
-    tensor = ragged.constant([[1, 2, 3], [], [4, 5]])
+    tensor = ragged_factory_ops.constant([[1, 2, 3], [], [4, 5]])
     self.assertRaggedMapInnerValuesReturns(
         op=array_ops.zeros_like,
         args=(tensor,),
         expected=[[0, 0, 0], [], [0, 0]])
 
   def testOpWithTwoRaggedTensorArgs(self):
-    x = ragged.constant([[3, 1, 4], [], [1, 5]])
-    y = ragged.constant([[1, 2, 3], [], [4, 5]])
+    x = ragged_factory_ops.constant([[3, 1, 4], [], [1, 5]])
+    y = ragged_factory_ops.constant([[1, 2, 3], [], [4, 5]])
     self.assertRaggedMapInnerValuesReturns(
         op=math_ops.multiply, args=(x, y), expected=[[3, 2, 12], [], [4, 25]])
 
   def testOpWithRaggedTensorAndScalarArgs(self):
-    y = ragged.constant([[1, 2, 3], [], [4, 5]])
+    y = ragged_factory_ops.constant([[1, 2, 3], [], [4, 5]])
     self.assertRaggedMapInnerValuesReturns(
         op=math_ops.multiply, args=(5, y), expected=[[5, 10, 15], [], [20, 25]])
 
   def testOpWithThreeRaggedTensorArgs(self):
-    condition = ragged.constant(
+    condition = ragged_factory_ops.constant(
         [[True, True, False], [], [True, False]])  # pyformat: disable
-    x = ragged.constant([['a', 'b', 'c'], [], ['d', 'e']])
-    y = ragged.constant([['A', 'B', 'C'], [], ['D', 'E']])
+    x = ragged_factory_ops.constant([['a', 'b', 'c'], [], ['d', 'e']])
+    y = ragged_factory_ops.constant([['A', 'B', 'C'], [], ['D', 'E']])
     self.assertRaggedMapInnerValuesReturns(
         op=array_ops.where,
         args=(condition, x, y),
         expected=[[b'a', b'b', b'C'], [], [b'd', b'E']])
 
   def testOpWithRaggedTensorListArg(self):
-    x = ragged.constant([[1, 2, 3], [], [4, 5]])
-    y = ragged.constant([[10, 20, 30], [], [40, 50]])
+    x = ragged_factory_ops.constant([[1, 2, 3], [], [4, 5]])
+    y = ragged_factory_ops.constant([[10, 20, 30], [], [40, 50]])
     self.assertRaggedMapInnerValuesReturns(
         op=math_ops.add_n,
         args=([x, y, x],),
         expected=[[12, 24, 36], [], [48, 60]])
 
   def testOpWithKeywordArgs(self):
-    x = ragged.constant([[3, 1, 4], [], [1, 5]])
-    y = ragged.constant([[1, 2, 3], [], [4, 5]])
+    x = ragged_factory_ops.constant([[3, 1, 4], [], [1, 5]])
+    y = ragged_factory_ops.constant([[1, 2, 3], [], [4, 5]])
     self.assertRaggedMapInnerValuesReturns(
         op=math_ops.multiply,
         kwargs=dict(x=x, y=y),
         expected=[[3, 2, 12], [], [4, 25]])
 
   def testOpWithMixedPositionalAndKeywordArgs(self):
-    x = ragged.constant([[3, 1, 4], [], [1, 5]])
-    y = ragged.constant([[1, 2, 3], [], [4, 5]])
+    x = ragged_factory_ops.constant([[3, 1, 4], [], [1, 5]])
+    y = ragged_factory_ops.constant([[1, 2, 3], [], [4, 5]])
     self.assertRaggedMapInnerValuesReturns(
         op=math_ops.multiply,
         args=(x,),
@@ -107,7 +107,7 @@ class RaggedMapInnerValuesOpTest(ragged_test_util.RaggedTensorTestCase):
         expected=[[3, 2, 12], [], [4, 25]])
 
   def testNonElementWiseOp(self):
-    x = ragged.constant(
+    x = ragged_factory_ops.constant(
         [[[3, 1, 4], [1, 5, 9], [2, 6, 5]], [], [[3, 5, 8], [9, 7, 9]]],
         ragged_rank=1)
     self.assertRaggedMapInnerValuesReturns(
@@ -122,21 +122,22 @@ class RaggedMapInnerValuesOpTest(ragged_test_util.RaggedTensorTestCase):
     # ragged_rank=0
     x0 = [3, 1, 4, 1, 5, 9, 2, 6, 5]
     y0 = [1, 2, 3, 4, 5, 6, 7, 8, 9]
-    with self.test_session():
-      self.assertRaggedEqual(
-          math_ops.multiply(x0, y0), [3, 2, 12, 4, 25, 54, 14, 48, 45])
+    self.assertRaggedEqual(
+        math_ops.multiply(x0, y0), [3, 2, 12, 4, 25, 54, 14, 48, 45])
 
     # ragged_rank=1
-    x1 = ragged.constant([[3, 1, 4], [], [1, 5], [9, 2], [6, 5]])
-    y1 = ragged.constant([[1, 2, 3], [], [4, 5], [6, 7], [8, 9]])
+    x1 = ragged_factory_ops.constant([[3, 1, 4], [], [1, 5], [9, 2], [6, 5]])
+    y1 = ragged_factory_ops.constant([[1, 2, 3], [], [4, 5], [6, 7], [8, 9]])
     self.assertRaggedMapInnerValuesReturns(
         op=math_ops.multiply,
         args=(x1, y1),
         expected=[[3, 2, 12], [], [4, 25], [54, 14], [48, 45]])
 
     # ragged_rank=2
-    x2 = ragged.constant([[[3, 1, 4]], [], [[], [1, 5]], [[9, 2], [6, 5]]])
-    y2 = ragged.constant([[[1, 2, 3]], [], [[], [4, 5]], [[6, 7], [8, 9]]])
+    x2 = ragged_factory_ops.constant([[[3, 1, 4]], [], [[], [1, 5]],
+                                      [[9, 2], [6, 5]]])
+    y2 = ragged_factory_ops.constant([[[1, 2, 3]], [], [[], [4, 5]],
+                                      [[6, 7], [8, 9]]])
     self.assertRaggedMapInnerValuesReturns(
         op=math_ops.multiply,
         args=(x2, y2),
@@ -147,10 +148,10 @@ class RaggedMapInnerValuesOpTest(ragged_test_util.RaggedTensorTestCase):
                  ])  # pyformat: disable
 
     # ragged_rank=3
-    x3 = ragged.constant([[[[3, 1, 4]], []], [], [[[], [1, 5]]],
-                          [[[9, 2], [6, 5]]]])
-    y3 = ragged.constant([[[[1, 2, 3]], []], [], [[[], [4, 5]]],
-                          [[[6, 7], [8, 9]]]])
+    x3 = ragged_factory_ops.constant([[[[3, 1, 4]], []], [], [[[], [1, 5]]],
+                                      [[[9, 2], [6, 5]]]])
+    y3 = ragged_factory_ops.constant([[[[1, 2, 3]], []], [], [[[], [4, 5]]],
+                                      [[[6, 7], [8, 9]]]])
     self.assertRaggedMapInnerValuesReturns(
         op=math_ops.multiply,
         args=(x3, y3),
@@ -162,8 +163,8 @@ class RaggedMapInnerValuesOpTest(ragged_test_util.RaggedTensorTestCase):
         ])  # pyformat: disable
 
   def testOpWithRaggedRankThree(self):
-    x = ragged.constant([[[3, 1, 4]], [], [[], [1, 5]]])
-    y = ragged.constant([[[1, 2, 3]], [], [[], [4, 5]]])
+    x = ragged_factory_ops.constant([[[3, 1, 4]], [], [[], [1, 5]]])
+    y = ragged_factory_ops.constant([[[1, 2, 3]], [], [[], [4, 5]]])
     self.assertRaggedMapInnerValuesReturns(
         op=math_ops.multiply,
         args=(x, y),
@@ -176,29 +177,30 @@ class RaggedMapInnerValuesOpTest(ragged_test_util.RaggedTensorTestCase):
         op=math_ops.multiply, args=(x, y), expected=[[2, 4], [6, 8], [10, 12]])
 
   def testRaggedTensorSplitsRaggedRankMismatchError(self):
-    x = ragged.constant([[3, 1, 4], [], [1, 5]])
-    y = ragged.constant([[[3, 1, 4], []], [], [[1, 5]]])
-    self.assertRaisesRegexp(ValueError,
-                            r'Inputs must have identical ragged splits.*',
-                            ragged.map_flat_values, math_ops.add, x, y)
+    x = ragged_factory_ops.constant([[3, 1, 4], [], [1, 5]])
+    y = ragged_factory_ops.constant([[[3, 1, 4], []], [], [[1, 5]]])
+    self.assertRaisesRegexp(
+        ValueError, r'Inputs must have identical ragged splits.*',
+        ragged_functional_ops.map_flat_values, math_ops.add, x, y)
 
   def testRaggedTensorSplitsValueMismatchError(self):
-    x = ragged.constant([[3, 1, 4], [], [1, 5]])
-    y = ragged.constant([[1], [2, 3], [4, 5]])
+    x = ragged_factory_ops.constant([[3, 1, 4], [], [1, 5]])
+    y = ragged_factory_ops.constant([[1], [2, 3], [4, 5]])
     self.assertRaisesRegexp(errors.InvalidArgumentError,
                             r'Inputs must have identical ragged splits.*',
-                            ragged.map_flat_values, math_ops.add, x, y)
+                            ragged_functional_ops.map_flat_values, math_ops.add,
+                            x, y)
 
   def testRaggedTensorSplitsMismatchErrorAtRuntime(self):
     splits1 = array_ops.placeholder_with_default(
         constant_op.constant([0, 3, 3, 5], dtypes.int64), None)
     splits2 = array_ops.placeholder_with_default(
         constant_op.constant([0, 1, 3, 5], dtypes.int64), None)
-    x = ragged.RaggedTensor.from_row_splits([3, 1, 4, 1, 5], splits1)
-    y = ragged.RaggedTensor.from_row_splits([1, 2, 3, 4, 5], splits2)
+    x = ragged_tensor.RaggedTensor.from_row_splits([3, 1, 4, 1, 5], splits1)
+    y = ragged_tensor.RaggedTensor.from_row_splits([1, 2, 3, 4, 5], splits2)
     with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                  r'.*Inputs must have identical ragged splits'):
-      self.evaluate(ragged.map_flat_values(math_ops.add, x, y))
+      self.evaluate(ragged_functional_ops.map_flat_values(math_ops.add, x, y))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_map_fn_op_test.py b/tensorflow/python/ops/ragged/ragged_map_fn_op_test.py
index 49c0996b24f30dd33219d3292446239717bbf487..15206404b2a54e2660113755f392eec190e148f9 100644
--- a/tensorflow/python/ops/ragged/ragged_map_fn_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_map_fn_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for ragged.map_fn."""
+"""Tests for ragged_map_ops.map_fn."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -21,12 +21,17 @@ from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras import backend
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops as mo
-from tensorflow.python.ops import ragged
 from tensorflow.python.ops import string_ops
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_functional_ops
+from tensorflow.python.ops.ragged import ragged_map_ops
+from tensorflow.python.ops.ragged import ragged_math_ops
+from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
@@ -65,7 +70,7 @@ class RaggedMapOpTest(ragged_test_util.RaggedTensorTestCase,
           elems=[[1, 2, 3], [4, 5], [6, 7]],
           expected_output=[[2, 3, 4], [5, 6], [7, 8]],
           dtype=dtypes.int64,
-          result_dtype=ragged.RaggedTensorType(
+          result_dtype=ragged_tensor.RaggedTensorType(
               dtype=dtypes.int64, ragged_rank=1),
       ),
       # [d1, (d2), d3] -> [d1, (d2), d3]
@@ -74,45 +79,45 @@ class RaggedMapOpTest(ragged_test_util.RaggedTensorTestCase,
           elems=[[[1, 2], [3, 4]], [], [[5, 6], [7, 8], [9, 0]]],
           elems_ragged_rank=1,
           expected_ragged_rank=1,
-          result_dtype=ragged.RaggedTensorType(
+          result_dtype=ragged_tensor.RaggedTensorType(
               dtype=dtypes.int64, ragged_rank=1),
           expected_output=[[[2, 3], [4, 5]], [], [[6, 7], [8, 9], [10, 1]]],
       ),
       # [d1, (d2)] -> [d1, (d2), (d3)]
       dict(
-          fn=lambda x: ragged.RaggedTensor.from_row_starts(x, [0]),
+          fn=lambda x: ragged_tensor.RaggedTensor.from_row_starts(x, [0]),
           elems=[[1, 2, 3], [4, 5], [6, 7]],
           expected_output=[[[1, 2, 3]], [[4, 5]], [[6, 7]]],
-          result_dtype=ragged.RaggedTensorType(
+          result_dtype=ragged_tensor.RaggedTensorType(
               dtype=dtypes.int64, ragged_rank=2),
       ),
       # [d1, (d2), (d3)] -> [d1, (d2), (d3)]
       dict(
-          fn=lambda x: ragged.map_flat_values(mo.add, x, 1),
+          fn=lambda x: ragged_functional_ops.map_flat_values(mo.add, x, 1),
           elems=[[[1, 2, 3]], [[4, 5], [6, 7]]],
           expected_output=[[[2, 3, 4]], [[5, 6], [7, 8]]],
-          result_dtype=ragged.RaggedTensorType(
+          result_dtype=ragged_tensor.RaggedTensorType(
               dtype=dtypes.int64, ragged_rank=2),
       ),
       # [d1, (d2), (d3)] -> [d1, (d2)]
       dict(
-          fn=lambda x: ragged.reduce_sum(x, axis=1),
+          fn=lambda x: ragged_math_ops.reduce_sum(x, axis=1),
           elems=[[[1, 2, 3]], [[4, 5], [6, 7]]],
           expected_output=[[6], [9, 13]],
-          result_dtype=ragged.RaggedTensorType(
+          result_dtype=ragged_tensor.RaggedTensorType(
               dtype=dtypes.int64, ragged_rank=1),
       ),
       # [d1, (d2), (d3)] -> [d1, (d3)]
       dict(
-          fn=lambda x: ragged.reduce_sum(x, axis=0),
+          fn=lambda x: ragged_math_ops.reduce_sum(x, axis=0),
           elems=[[[1, 2, 3]], [[4, 5], [6, 7]]],
           expected_output=[[1, 2, 3], [10, 12]],
-          result_dtype=ragged.RaggedTensorType(
+          result_dtype=ragged_tensor.RaggedTensorType(
               dtype=dtypes.int64, ragged_rank=1),
       ),
       # [d1, (d2), (d3)] -> [d1]
       dict(
-          fn=ragged.reduce_sum,
+          fn=ragged_math_ops.reduce_sum,
           elems=[[[1, 2, 3]], [[4, 5], [6, 7]]],
           expected_output=[6, 22],
           result_dtype=dtypes.int64,
@@ -122,16 +127,16 @@ class RaggedMapOpTest(ragged_test_util.RaggedTensorTestCase,
           fn=mo.range,
           elems=[4, 0, 2],
           expected_output=[[0, 1, 2, 3], [], [0, 1]],
-          result_dtype=ragged.RaggedTensorType(
+          result_dtype=ragged_tensor.RaggedTensorType(
               dtype=dtypes.int64, ragged_rank=1),
       ),
       # [d1] -> [d1, (d2), (d3)]
       dict(
-          fn=lambda x: ragged.range(mo.range(x)),
+          fn=lambda x: ragged_math_ops.range(mo.range(x)),
           elems=[5, 0, 3],
           expected_output=[[[], [0], [0, 1], [0, 1, 2], [0, 1, 2, 3]], [],
                            [[], [0], [0, 1]]],
-          result_dtype=ragged.RaggedTensorType(
+          result_dtype=ragged_tensor.RaggedTensorType(
               dtype=dtypes.int64, ragged_rank=2),
       ),
       # [d1, (d2), (d3), (d4a), (d5)] ->  [d1, (d2), (d3), (d4b), (d5)]
@@ -140,7 +145,7 @@ class RaggedMapOpTest(ragged_test_util.RaggedTensorTestCase,
           elems=[[[[[1, 2, 3]], [[4], [5]]]], [[[[6, 7]]], [[[8], []]]]],
           expected_output=[[[[[2, 3, 4]], [[5], [6]]]], [[[[7, 8]]], [[[9],
                                                                        []]]]],
-          result_dtype=ragged.RaggedTensorType(
+          result_dtype=ragged_tensor.RaggedTensorType(
               dtype=dtypes.int64, ragged_rank=4),
       ),
   ])
@@ -157,25 +162,25 @@ class RaggedMapOpTest(ragged_test_util.RaggedTensorTestCase,
       result_dtype=None,
       infer_shape=False,
   ):
-    elems = ragged.constant(elems, dtype, elems_ragged_rank)
-    output = ragged.map_fn(
+    elems = ragged_factory_ops.constant(elems, dtype, elems_ragged_rank)
+    output = ragged_map_ops.map_fn(
         fn=fn, elems=elems, dtype=result_dtype, infer_shape=infer_shape)
 
-    expected_rt = ragged.constant(
+    expected_rt = ragged_factory_ops.constant(
         expected_output, ragged_rank=expected_ragged_rank)
     self.assertRaggedEqual(expected_rt, output)
 
   def testRaggedMapOnStructure(self):
-    batman = ragged.constant([[1, 2, 3], [4], [5, 6, 7]])
+    batman = ragged_factory_ops.constant([[1, 2, 3], [4], [5, 6, 7]])
     # [[10, 20, 30], [40], [50, 60, 70]]
-    robin = ragged.map_flat_values(mo.multiply, batman, 10)
+    robin = ragged_functional_ops.map_flat_values(mo.multiply, batman, 10)
 
     features = {'batman': batman, 'robin': robin}
 
     def _reduce_sum_from_all(f):
       return mo.reduce_sum(f['batman']) + mo.reduce_sum(f['robin'])
 
-    output = ragged.map_fn(
+    output = ragged_map_ops.map_fn(
         fn=_reduce_sum_from_all,
         elems=features,
         dtype=dtypes.int32,
@@ -185,9 +190,9 @@ class RaggedMapOpTest(ragged_test_util.RaggedTensorTestCase,
 
   # Test mapping over a dict of RTs can produce a dict of RTs.
   def testRaggedMapOnStructure_RaggedOutputs(self):
-    batman = ragged.constant([[1, 2, 3], [4], [5, 6, 7]])
+    batman = ragged_factory_ops.constant([[1, 2, 3], [4], [5, 6, 7]])
     # [[10, 20, 30], [40], [50, 60, 70]]
-    robin = ragged.map_flat_values(mo.multiply, batman, 10)
+    robin = ragged_functional_ops.map_flat_values(mo.multiply, batman, 10)
 
     features = {'batman': batman, 'robin': robin}
 
@@ -197,15 +202,17 @@ class RaggedMapOpTest(ragged_test_util.RaggedTensorTestCase,
           'robin': f['robin'] + 1,
       }
 
-    output = ragged.map_fn(
+    output = ragged_map_ops.map_fn(
         fn=_increment,
         elems=features,
         infer_shape=False,
         dtype={
             'batman':
-                ragged.RaggedTensorType(dtype=dtypes.int32, ragged_rank=1),
+                ragged_tensor.RaggedTensorType(
+                    dtype=dtypes.int32, ragged_rank=1),
             'robin':
-                ragged.RaggedTensorType(dtype=dtypes.int32, ragged_rank=1)
+                ragged_tensor.RaggedTensorType(
+                    dtype=dtypes.int32, ragged_rank=1)
         },
     )
 
@@ -213,8 +220,8 @@ class RaggedMapOpTest(ragged_test_util.RaggedTensorTestCase,
     self.assertRaggedEqual(output['robin'], [[11, 21, 31], [41], [51, 61, 71]])
 
   def testZip(self):
-    x = ragged.constant([[10, 20], [30, 40], [50, 60], [70], [80, 90, 100]],
-                        dtypes.int64)
+    x = ragged_factory_ops.constant(
+        [[10, 20], [30, 40], [50, 60], [70], [80, 90, 100]], dtypes.int64)
     y = array_ops.expand_dims(mo.range(x.nrows(), dtype=dtypes.int64), axis=1)
 
     def _zip(foo):
@@ -222,9 +229,9 @@ class RaggedMapOpTest(ragged_test_util.RaggedTensorTestCase,
       bar = backend.tile(y_val, array_ops.shape(x_val))
       return array_ops.stack([bar, x_val], axis=1)
 
-    output = ragged.map_fn(
+    output = ragged_map_ops.map_fn(
         _zip, (y, x),
-        dtype=ragged.RaggedTensorType(dtype=dtypes.int64, ragged_rank=1),
+        dtype=ragged_tensor.RaggedTensorType(dtype=dtypes.int64, ragged_rank=1),
         infer_shape=False)
 
     self.assertRaggedEqual(
@@ -232,43 +239,58 @@ class RaggedMapOpTest(ragged_test_util.RaggedTensorTestCase,
                  [[3, 70]], [[4, 80], [4, 90], [4, 100]]])
 
   def testBatchGather(self):
-    tokens = ragged.constant([['hello', '.', 'there'], ['merhaba'],
-                              ['bonjour', '.', 'ca va', '?']])
-    indices = ragged.constant([[0, 2], [0], [0, 2]])
+    tokens = ragged_factory_ops.constant([['hello', '.', 'there'], ['merhaba'],
+                                          ['bonjour', '.', 'ca va', '?']])
+    indices = ragged_factory_ops.constant([[0, 2], [0], [0, 2]])
 
     def gather(x):
       tokens_val, indices_val = x
       return array_ops.gather(tokens_val, indices_val)
 
     data = tokens, indices
-    out = ragged.map_fn(
+    out = ragged_map_ops.map_fn(
         gather,
         data,
-        dtype=ragged.RaggedTensorType(dtype=dtypes.string, ragged_rank=1),
+        dtype=ragged_tensor.RaggedTensorType(
+            dtype=dtypes.string, ragged_rank=1),
         infer_shape=False)
 
     self.assertRaggedEqual(
         out, [[b'hello', b'there'], [b'merhaba'], [b'bonjour', b'ca va']])
 
   def testMismatchRaggedRank(self):
-    elems = ragged.constant([[[1, 2, 3]], [[4, 5], [6, 7]]])
-    fn = lambda x: ragged.reduce_sum(x, axis=0)
+    elems = ragged_factory_ops.constant([[[1, 2, 3]], [[4, 5], [6, 7]]])
+    fn = lambda x: ragged_math_ops.reduce_sum(x, axis=0)
     with self.assertRaisesWithLiteralMatch(
         ValueError, r'The declared ragged rank (23) mismatches the result (1)'):
-      _ = ragged.map_fn(
+      _ = ragged_map_ops.map_fn(
           fn,
           elems,
-          dtype=ragged.RaggedTensorType(dtype=dtypes.int64, ragged_rank=23))
+          dtype=ragged_tensor.RaggedTensorType(
+              dtype=dtypes.int64, ragged_rank=23))
 
   def testMismatchRaggedRank2(self):
-    elems = ragged.constant([[1, 2, 3], [4, 5], [6, 7]])
-    fn = lambda x: ragged.RaggedTensor.from_row_starts(x, [0])
+    elems = ragged_factory_ops.constant([[1, 2, 3], [4, 5], [6, 7]])
+    fn = lambda x: ragged_tensor.RaggedTensor.from_row_starts(x, [0])
     with self.assertRaisesWithLiteralMatch(
         ValueError, r'The declared ragged rank (10) mismatches the result (1)'):
-      _ = ragged.map_fn(
+      _ = ragged_map_ops.map_fn(
           fn,
           elems,
-          dtype=ragged.RaggedTensorType(dtype=dtypes.int64, ragged_rank=10))
+          dtype=ragged_tensor.RaggedTensorType(
+              dtype=dtypes.int64, ragged_rank=10))
+
+  def testMapOnSparseTensor(self):
+    s = sparse_tensor.SparseTensor(
+        indices=[[0, 0], [0, 1], [1, 0], [1, 1]],
+        values=[0, 5, 0, 4],
+        dense_shape=[2, 2],
+    )
+    t2 = ragged_tensor.RaggedTensor.from_sparse(s)
+    id_t2 = ragged_map_ops.map_fn(
+        lambda x: x, t2,
+    )
+    self.assertRaggedEqual(id_t2, [[0, 5], [0, 4]])
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_map_ops.py b/tensorflow/python/ops/ragged/ragged_map_ops.py
index af40352b1d02fe8ccce242d31fb33e2f8a21f1ce..1d342512c0206c8877f8e669c9e3df78d784fd8f 100644
--- a/tensorflow/python/ops/ragged/ragged_map_ops.py
+++ b/tensorflow/python/ops/ragged/ragged_map_ops.py
@@ -12,11 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Functional operations.
-
-See the [Higher Order
-Functions](https://tensorflow.org/api_guides/python/functional_ops) guide.
-"""
+"""Functional operations for RaggedTensors."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -30,6 +26,7 @@ from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.ops.ragged import ragged_tensor
@@ -238,6 +235,7 @@ def map_fn(fn,
       n = (tensor_shape.dimension_value(static_shape[0]) or
            array_ops.shape(elems_flat[0])[0])
 
+    n = math_ops.cast(n, dtype=dtypes.int32)
     # Create a flat list of TAs.
 
     # Flatten the dtype structure to a list.
@@ -254,7 +252,7 @@ def map_fn(fn,
         for t in dtype_components_flat
     ]
 
-    i = constant_op.constant(0)
+    i = constant_op.constant(0, dtype=dtypes.int32)
 
     def compute(i, tas):
       """The loop body of map_fn.
diff --git a/tensorflow/python/ops/ragged/ragged_math_ops.py b/tensorflow/python/ops/ragged/ragged_math_ops.py
index 92f82be84aca06ae723f00103dccbdeb5c64371f..02e927b6991f8d86176c347442a2f49cfdf4ce92 100644
--- a/tensorflow/python/ops/ragged/ragged_math_ops.py
+++ b/tensorflow/python/ops/ragged/ragged_math_ops.py
@@ -31,12 +31,14 @@ from tensorflow.python.ops.ragged import ragged_functional_ops
 from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.ops.ragged import ragged_util
 from tensorflow.python.ops.ragged import segment_id_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
 #===============================================================================
 # ragged.range
 #===============================================================================
 # pylint: disable=redefined-builtin
+@tf_export('ragged.range')
 def range(starts, limits=None, deltas=1, dtype=None, name=None):
   """Returns a `RaggedTensor` containing the specified sequences of numbers.
 
@@ -269,28 +271,32 @@ def segment_max(data, segment_ids, num_segments, name=None):
 
 
 def segment_mean(data, segment_ids, num_segments, name=None):
-  # For docs, see: _RAGGED_SEGMENT_DOCSTRING
+  """For docs, see: _RAGGED_SEGMENT_DOCSTRING."""
   with ops.name_scope(name, 'RaggedSegmentMean',
                       [data, segment_ids, num_segments]):
     total = segment_sum(data, segment_ids, num_segments)
     ones = ragged_tensor.RaggedTensor.from_nested_row_splits(
         array_ops.ones_like(data.flat_values), data.nested_row_splits)
     count = segment_sum(ones, segment_ids, num_segments)
-    return ragged_tensor.RaggedTensor.from_nested_row_splits(
-        total.flat_values / count.flat_values, total.nested_row_splits)
+    if ragged_tensor.is_ragged(total):
+      return total.with_flat_values(total.flat_values / count.flat_values)
+    else:
+      return total / count
 
 
 def segment_sqrt_n(data, segment_ids, num_segments, name=None):
-  # For docs, see: _RAGGED_SEGMENT_DOCSTRING
+  """For docs, see: _RAGGED_SEGMENT_DOCSTRING."""
   with ops.name_scope(name, 'RaggedSegmentSqrtN',
                       [data, segment_ids, num_segments]):
     total = segment_sum(data, segment_ids, num_segments)
     ones = ragged_tensor.RaggedTensor.from_nested_row_splits(
         array_ops.ones_like(data.flat_values), data.nested_row_splits)
     count = segment_sum(ones, segment_ids, num_segments)
-    return ragged_tensor.RaggedTensor.from_nested_row_splits(
-        total.flat_values / math_ops.sqrt(count.flat_values),
-        total.nested_row_splits)
+    if ragged_tensor.is_ragged(total):
+      return total.with_flat_values(
+          total.flat_values / math_ops.sqrt(count.flat_values))
+    else:
+      return total / math_ops.sqrt(count)
 
 
 def _set_ragged_segment_docstring(func, combination, combined):
@@ -465,11 +471,11 @@ def _ragged_reduce_aggregate(reduce_op,
         return _ragged_reduce_aggregate(reduce_op, unsorted_segment_op,
                                         inner_reduced, axis[:-1], keepdims)
 
-    axis = ragged_util.get_positive_axis(axis, rt_input.shape.ndims)
-
     rt_input = ragged_tensor.convert_to_tensor_or_ragged_tensor(
         rt_input, name='rt_input')
 
+    axis = ragged_util.get_positive_axis(axis, rt_input.shape.ndims)
+
     if axis == 0:
       # out[i_1, i_2, ..., i_N] = sum_{j} rt_input[j, i_1, i_2, ..., i_N]
       row_lengths = rt_input.row_splits[1:] - rt_input.row_splits[:-1]
diff --git a/tensorflow/python/ops/ragged/ragged_operators_test.py b/tensorflow/python/ops/ragged/ragged_operators_test.py
index 78bb37c341e9261a972445cbd34f8e1b0fc674d9..d1c6b902f2fa223b3fabfb4184e8ebb004b16a40 100644
--- a/tensorflow/python/ops/ragged/ragged_operators_test.py
+++ b/tensorflow/python/ops/ragged/ragged_operators_test.py
@@ -19,7 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
@@ -28,16 +28,16 @@ from tensorflow.python.platform import googletest
 class RaggedElementwiseOpsTest(ragged_test_util.RaggedTensorTestCase):
 
   def testOrderingOperators(self):
-    x = ragged.constant([[1, 5], [3]])
-    y = ragged.constant([[4, 5], [1]])
+    x = ragged_factory_ops.constant([[1, 5], [3]])
+    y = ragged_factory_ops.constant([[4, 5], [1]])
     self.assertRaggedEqual((x > y), [[False, False], [True]])
     self.assertRaggedEqual((x >= y), [[False, True], [True]])
     self.assertRaggedEqual((x < y), [[True, False], [False]])
     self.assertRaggedEqual((x <= y), [[True, True], [False]])
 
   def testArithmeticOperators(self):
-    x = ragged.constant([[1.0, -2.0], [8.0]])
-    y = ragged.constant([[4.0, 4.0], [2.0]])
+    x = ragged_factory_ops.constant([[1.0, -2.0], [8.0]])
+    y = ragged_factory_ops.constant([[4.0, 4.0], [2.0]])
     self.assertRaggedEqual(abs(x), [[1.0, 2.0], [8.0]])
 
     self.assertRaggedEqual((-x), [[-1.0, 2.0], [-8.0]])
@@ -70,8 +70,8 @@ class RaggedElementwiseOpsTest(ragged_test_util.RaggedTensorTestCase):
     self.assertRaggedEqual((x % 2.0), [[1.0, 0.0], [0.0]])
 
   def testLogicalOperators(self):
-    a = ragged.constant([[True, True], [False]])
-    b = ragged.constant([[True, False], [False]])
+    a = ragged_factory_ops.constant([[True, True], [False]])
+    b = ragged_factory_ops.constant([[True, False], [False]])
     self.assertRaggedEqual((~a), [[False, False], [True]])
 
     self.assertRaggedEqual((a & b), [[True, False], [False]])
@@ -87,7 +87,7 @@ class RaggedElementwiseOpsTest(ragged_test_util.RaggedTensorTestCase):
     self.assertRaggedEqual((True ^ b), [[False, True], [True]])
 
   def testDummyOperators(self):
-    a = ragged.constant([[True, True], [False]])
+    a = ragged_factory_ops.constant([[True, True], [False]])
     with self.assertRaisesRegexp(TypeError,
                                  'RaggedTensor may not be used as a boolean.'):
       bool(a)
diff --git a/tensorflow/python/ops/ragged/ragged_range_op_test.py b/tensorflow/python/ops/ragged/ragged_range_op_test.py
index 5ab3d4abc3988b05add4bf98e31e472d2d5b2e88..afe5866cff5002791a84a051f1a9fd1a9da06fb1 100644
--- a/tensorflow/python/ops/ragged/ragged_range_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_range_op_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_math_ops
 from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
@@ -30,32 +30,32 @@ class RaggedRangeOpTest(ragged_test_util.RaggedTensorTestCase):
 
   def testDocStringExamples(self):
     """Examples from ragged_range.__doc__."""
-    rt1 = ragged.range([3, 5, 2])
+    rt1 = ragged_math_ops.range([3, 5, 2])
     self.assertRaggedEqual(rt1, [[0, 1, 2], [0, 1, 2, 3, 4], [0, 1]])
 
-    rt2 = ragged.range([0, 5, 8], [3, 3, 12])
+    rt2 = ragged_math_ops.range([0, 5, 8], [3, 3, 12])
     self.assertRaggedEqual(rt2, [[0, 1, 2], [], [8, 9, 10, 11]])
 
-    rt3 = ragged.range([0, 5, 8], [3, 3, 12], 2)
+    rt3 = ragged_math_ops.range([0, 5, 8], [3, 3, 12], 2)
     self.assertRaggedEqual(rt3, [[0, 2], [], [8, 10]])
 
   def testBasicRanges(self):
     # Specify limits only.
     self.assertRaggedEqual(
-        ragged.range([0, 3, 5]),
+        ragged_math_ops.range([0, 3, 5]),
         [list(range(0)), list(range(3)),
          list(range(5))])
 
     # Specify starts and limits.
     self.assertRaggedEqual(
-        ragged.range([0, 3, 5], [2, 3, 10]),
+        ragged_math_ops.range([0, 3, 5], [2, 3, 10]),
         [list(range(0, 2)),
          list(range(3, 3)),
          list(range(5, 10))])
 
     # Specify starts, limits, and deltas.
     self.assertRaggedEqual(
-        ragged.range([0, 3, 5], [4, 4, 15], [2, 3, 4]),
+        ragged_math_ops.range([0, 3, 5], [4, 4, 15], [2, 3, 4]),
         [list(range(0, 4, 2)),
          list(range(3, 4, 3)),
          list(range(5, 15, 4))])
@@ -63,20 +63,21 @@ class RaggedRangeOpTest(ragged_test_util.RaggedTensorTestCase):
   def testFloatRanges(self):
     expected = [[0.0, 0.4, 0.8, 1.2, 1.6, 2.0, 2.4, 2.8, 3.2, 3.6], [3.0],
                 [5.0, 7.2, 9.4, 11.6, 13.8]]
-    actual = ragged.range([0.0, 3.0, 5.0], [3.9, 4.0, 15.0], [0.4, 1.5, 2.2])
+    actual = ragged_math_ops.range([0.0, 3.0, 5.0], [3.9, 4.0, 15.0],
+                                   [0.4, 1.5, 2.2])
     self.assertEqual(
         expected,
         [[round(v, 5) for v in row] for row in self.eval_to_list(actual)])
 
   def testNegativeDeltas(self):
     self.assertRaggedEqual(
-        ragged.range([0, 3, 5], limits=0, deltas=-1),
+        ragged_math_ops.range([0, 3, 5], limits=0, deltas=-1),
         [list(range(0, 0, -1)),
          list(range(3, 0, -1)),
          list(range(5, 0, -1))])
 
     self.assertRaggedEqual(
-        ragged.range([0, -3, 5], limits=0, deltas=[-1, 1, -2]),
+        ragged_math_ops.range([0, -3, 5], limits=0, deltas=[-1, 1, -2]),
         [list(range(0, 0, -1)),
          list(range(-3, 0, 1)),
          list(range(5, 0, -2))])
@@ -84,40 +85,43 @@ class RaggedRangeOpTest(ragged_test_util.RaggedTensorTestCase):
   def testBroadcast(self):
     # Specify starts and limits, broadcast deltas.
     self.assertRaggedEqual(
-        ragged.range([0, 3, 5], [4, 4, 15], 3),
+        ragged_math_ops.range([0, 3, 5], [4, 4, 15], 3),
         [list(range(0, 4, 3)),
          list(range(3, 4, 3)),
          list(range(5, 15, 3))])
 
     # Broadcast all arguments.
-    self.assertRaggedEqual(ragged.range(0, 5, 1), [list(range(0, 5, 1))])
+    self.assertRaggedEqual(
+        ragged_math_ops.range(0, 5, 1), [list(range(0, 5, 1))])
 
   def testEmptyRanges(self):
-    rt1 = ragged.range([0, 5, 3], [0, 3, 5])
-    rt2 = ragged.range([0, 5, 5], [0, 3, 5], -1)
+    rt1 = ragged_math_ops.range([0, 5, 3], [0, 3, 5])
+    rt2 = ragged_math_ops.range([0, 5, 5], [0, 3, 5], -1)
     self.assertRaggedEqual(rt1, [[], [], [3, 4]])
     self.assertRaggedEqual(rt2, [[], [5, 4], []])
 
   def testShapeFnErrors(self):
-    self.assertRaises((ValueError, errors.InvalidArgumentError), ragged.range,
-                      [[0]], 5)
-    self.assertRaises((ValueError, errors.InvalidArgumentError), ragged.range,
-                      0, [[5]])
-    self.assertRaises((ValueError, errors.InvalidArgumentError), ragged.range,
-                      0, 5, [[0]])
-    self.assertRaises((ValueError, errors.InvalidArgumentError), ragged.range,
-                      [0], [1, 2])
+    self.assertRaises((ValueError, errors.InvalidArgumentError),
+                      ragged_math_ops.range, [[0]], 5)
+    self.assertRaises((ValueError, errors.InvalidArgumentError),
+                      ragged_math_ops.range, 0, [[5]])
+    self.assertRaises((ValueError, errors.InvalidArgumentError),
+                      ragged_math_ops.range, 0, 5, [[0]])
+    self.assertRaises((ValueError, errors.InvalidArgumentError),
+                      ragged_math_ops.range, [0], [1, 2])
 
   def testKernelErrors(self):
     with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                  r'Requires delta != 0'):
-      self.evaluate(ragged.range(0, 0, 0))
+      self.evaluate(ragged_math_ops.range(0, 0, 0))
 
   def testShape(self):
-    self.assertRaggedEqual(ragged.range(0, 0, 1).shape.as_list(), [1, None])
-    self.assertRaggedEqual(ragged.range([1, 2, 3]).shape.as_list(), [3, None])
     self.assertRaggedEqual(
-        ragged.range([1, 2, 3], [4, 5, 6]).shape.as_list(), [3, None])
+        ragged_math_ops.range(0, 0, 1).shape.as_list(), [1, None])
+    self.assertRaggedEqual(
+        ragged_math_ops.range([1, 2, 3]).shape.as_list(), [3, None])
+    self.assertRaggedEqual(
+        ragged_math_ops.range([1, 2, 3], [4, 5, 6]).shape.as_list(), [3, None])
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_rank_op_test.py b/tensorflow/python/ops/ragged/ragged_rank_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..54eee3bc0425852e82858684509838e5812dffde
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_rank_op_test.py
@@ -0,0 +1,89 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.rank op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from absl.testing import parameterized
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.platform import googletest
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedRankOpTest(ragged_test_util.RaggedTensorTestCase,
+                       parameterized.TestCase):
+
+  @parameterized.parameters([
+      # Rank 0
+      dict(
+          test_input=1,
+          expected_rank=0,
+      ),
+      # Rank 1
+      dict(
+          test_input=[1],
+          expected_rank=1,
+      ),
+      dict(
+          test_input=[1, 2, 3, 4],
+          expected_rank=1,
+      ),
+      # Rank 2
+      dict(
+          test_input=[[1], [2], [3]],
+          expected_rank=2,
+      ),
+      # Rank 3
+      dict(
+          test_input=[[[1], [2, 3]], [[4], [5, 6, 7]]],
+          expected_rank=3,
+      ),
+      # Rank 3, ragged_rank=2
+      dict(
+          test_input=[[[1], [2, 3], [10, 20]],
+                      [[4], [5, 6, 7]]],
+          expected_rank=3,
+          ragged_rank=2,
+      ),
+      # Rank 4, ragged_rank=3 with dimensions: {2, (1, 2), (2), (1, 2)}
+      dict(
+          test_input=[[[[1], [2]]],
+                      [[[3, 4], [5, 6]], [[7, 8], [9, 10]]]],
+          expected_rank=4,
+      ),
+      # Rank 4, ragged_rank=2 with dimensions: {2, (1, 2), (1, 2), 2}
+      dict(
+          test_input=[
+              [[[1, 2]]],
+              [[[5, 6], [7, 8]],
+               [[9, 10], [11, 12]]]],
+          expected_rank=4,
+          ragged_rank=2,
+      ),
+
+  ])
+  def testRaggedRank(self, test_input, expected_rank, ragged_rank=None):
+    test_input = ragged_factory_ops.constant(
+        test_input, ragged_rank=ragged_rank)
+    self.assertAllEqual(ragged_array_ops.rank(
+        test_input), expected_rank)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_reduce_op_test.py b/tensorflow/python/ops/ragged/ragged_reduce_op_test.py
index 890460221bf9fdebe134d6ced77b1fca2dbdffd5..a9fa378eebc01e97390c48f5aaeebee7e9791359 100644
--- a/tensorflow/python/ops/ragged/ragged_reduce_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_reduce_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for ragged.reduce_<AGGREGATE> ops."""
+"""Tests for ragged_math_ops.reduce_<AGGREGATE> ops."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -26,7 +26,8 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_math_ops
 from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
@@ -52,88 +53,88 @@ class RaggedReduceOpsTest(ragged_test_util.RaggedTensorTestCase,
       #    [2, 6   ]]
       #=========================================================================
       dict(
-          ragged_reduce_op=ragged.reduce_sum,
+          ragged_reduce_op=ragged_math_ops.reduce_sum,
           rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
           axis=0,
           expected=[15, 12, 4]  # = [3+1+9+2, 1+5+6, 4]
       ),
       dict(
-          ragged_reduce_op=ragged.reduce_sum,
+          ragged_reduce_op=ragged_math_ops.reduce_sum,
           rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
           axis=-2,
           expected=[15, 12, 4]  # = [3+1+9+2, 1+5+6, 4]
       ),
       dict(
-          ragged_reduce_op=ragged.reduce_sum,
+          ragged_reduce_op=ragged_math_ops.reduce_sum,
           rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
           axis=1,
           expected=[8, 6, 9, 8]  # = [3+1+4, 1+5, 9, 2+6]
       ),
       dict(
-          ragged_reduce_op=ragged.reduce_sum,
+          ragged_reduce_op=ragged_math_ops.reduce_sum,
           rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
           axis=-1,
           expected=[8, 6, 9, 8]  # = [3+1+4, 1+5, 9, 2+6]
       ),
       dict(
-          ragged_reduce_op=ragged.reduce_prod,
+          ragged_reduce_op=ragged_math_ops.reduce_prod,
           rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
           axis=0,
           expected=[54, 30, 4]  # = [3*1*9*2, 1*5*6, 4]
       ),
       dict(
-          ragged_reduce_op=ragged.reduce_prod,
+          ragged_reduce_op=ragged_math_ops.reduce_prod,
           rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
           axis=1,
           expected=[12, 5, 9, 12]  # = [3*1*4, 1*5, 9, 2*6]
       ),
       dict(
-          ragged_reduce_op=ragged.reduce_min,
+          ragged_reduce_op=ragged_math_ops.reduce_min,
           rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
           axis=0,
           expected=[1, 1, 4]  # = [min(3, 1, 9, 2), min(1, 5, 6), 4]
       ),
       dict(
-          ragged_reduce_op=ragged.reduce_min,
+          ragged_reduce_op=ragged_math_ops.reduce_min,
           rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
           axis=1,
           expected=[1, 1, 9, 2]  # = [min(3, 1, 4), min(1, 5), 9, min(2, 6)]
       ),
       dict(
-          ragged_reduce_op=ragged.reduce_max,
+          ragged_reduce_op=ragged_math_ops.reduce_max,
           rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
           axis=0,
           expected=[9, 6, 4]  # = [max(3, 1, 9, 2), max(1, 5, 6), 4]
       ),
       dict(
-          ragged_reduce_op=ragged.reduce_max,
+          ragged_reduce_op=ragged_math_ops.reduce_max,
           rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
           axis=1,
           expected=[4, 5, 9, 6]  # = [max(3, 1, 4), max(1, 5), 9, max(2, 6)]
       ),
       dict(
-          ragged_reduce_op=ragged.reduce_mean,
+          ragged_reduce_op=ragged_math_ops.reduce_mean,
           rt_input=[[3, 1, 4], [1, 5], [9], [2, 6]],
           axis=0,
           expected=[3.75, 4, 4]  # = [mean(3, 1, 9, 2), mean(1, 5, 6), 4]
       ),
       dict(
-          ragged_reduce_op=ragged.reduce_any,
+          ragged_reduce_op=ragged_math_ops.reduce_any,
           rt_input=[[True, True], [True, True, False, True], [False, True]],
           axis=0,
           expected=[True, True, False, True]),
       dict(
-          ragged_reduce_op=ragged.reduce_any,
+          ragged_reduce_op=ragged_math_ops.reduce_any,
           rt_input=[[True, True], [True, True, False, True], [False, True]],
           axis=1,
           expected=[True, True, True]),
       dict(
-          ragged_reduce_op=ragged.reduce_all,
+          ragged_reduce_op=ragged_math_ops.reduce_all,
           rt_input=[[True, True], [True, True, False, True], [False, True]],
           axis=0,
           expected=[False, True, False, True]),
       dict(
-          ragged_reduce_op=ragged.reduce_all,
+          ragged_reduce_op=ragged_math_ops.reduce_all,
           rt_input=[[True, True], [True, True, False, True], [False, True]],
           axis=1,
           expected=[True, False, False]),
@@ -150,53 +151,53 @@ class RaggedReduceOpsTest(ragged_test_util.RaggedTensorTestCase,
 
       # axis=None
       dict(
-          ragged_reduce_op=ragged.reduce_sum,
+          ragged_reduce_op=ragged_math_ops.reduce_sum,
           rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
           axis=None,
           expected=0 + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9),
       dict(
-          ragged_reduce_op=ragged.reduce_prod,
+          ragged_reduce_op=ragged_math_ops.reduce_prod,
           rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
           axis=None,
           expected=0 * 1 * 2 * 3 * 4 * 5 * 6 * 7 * 8 * 9),
       dict(
-          ragged_reduce_op=ragged.reduce_min,
+          ragged_reduce_op=ragged_math_ops.reduce_min,
           rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
           axis=None,
           expected=min(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)),
       dict(
-          ragged_reduce_op=ragged.reduce_max,
+          ragged_reduce_op=ragged_math_ops.reduce_max,
           rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
           axis=None,
           expected=max(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)),
       dict(
-          ragged_reduce_op=ragged.reduce_mean,
+          ragged_reduce_op=ragged_math_ops.reduce_mean,
           rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
           axis=None,
           expected=mean(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)),
       # axis=0
       dict(
-          ragged_reduce_op=ragged.reduce_sum,
+          ragged_reduce_op=ragged_math_ops.reduce_sum,
           rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
           axis=0,
           expected=[0 + 4 + 5 + 7 + 8, 1 + 6 + 9, 2, 3]),
       dict(
-          ragged_reduce_op=ragged.reduce_prod,
+          ragged_reduce_op=ragged_math_ops.reduce_prod,
           rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
           axis=0,
           expected=[0 * 4 * 5 * 7 * 8, 1 * 6 * 9, 2, 3]),
       dict(
-          ragged_reduce_op=ragged.reduce_min,
+          ragged_reduce_op=ragged_math_ops.reduce_min,
           rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
           axis=0,
           expected=[min(0, 4, 5, 7, 8), min(1, 6, 9), 2, 3]),
       dict(
-          ragged_reduce_op=ragged.reduce_max,
+          ragged_reduce_op=ragged_math_ops.reduce_max,
           rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
           axis=0,
           expected=[max(0, 4, 5, 7, 8), max(1, 6, 9), 2, 3]),
       dict(
-          ragged_reduce_op=ragged.reduce_mean,
+          ragged_reduce_op=ragged_math_ops.reduce_mean,
           rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
           axis=0,
           expected=[mean(0, 4, 5, 7, 8),
@@ -205,24 +206,24 @@ class RaggedReduceOpsTest(ragged_test_util.RaggedTensorTestCase,
       # Note: we don't test mean here because it gives a NaN, and this will
       # cause assertEqual to fail (since NaN != NaN).  See testMeanNan().
       dict(
-          ragged_reduce_op=ragged.reduce_sum,
+          ragged_reduce_op=ragged_math_ops.reduce_sum,
           rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
           axis=1,
           expected=[0 + 1 + 2 + 3, 4, 0, 5 + 6, 7, 8 + 9]),
       dict(
-          ragged_reduce_op=ragged.reduce_prod,
+          ragged_reduce_op=ragged_math_ops.reduce_prod,
           rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
           axis=1,
           expected=[0 * 1 * 2 * 3, 4, 1, 5 * 6, 7, 8 * 9]),
       dict(
-          ragged_reduce_op=ragged.reduce_min,
+          ragged_reduce_op=ragged_math_ops.reduce_min,
           rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
           axis=1,
           expected=[min(0, 1, 2, 3), 4, _MAX_INT32,
                     min(5, 6), 7,
                     min(8, 9)]),
       dict(
-          ragged_reduce_op=ragged.reduce_max,
+          ragged_reduce_op=ragged_math_ops.reduce_max,
           rt_input=[[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]],
           axis=1,
           expected=[max(0, 1, 2, 3), 4, _MIN_INT32,
@@ -237,47 +238,47 @@ class RaggedReduceOpsTest(ragged_test_util.RaggedTensorTestCase,
       #  [[9   ]                ]]
       #=========================================================================
       dict(
-          ragged_reduce_op=ragged.reduce_sum,
+          ragged_reduce_op=ragged_math_ops.reduce_sum,
           rt_input=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]],
           axis=[],
           expected=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]]),
       dict(
-          ragged_reduce_op=ragged.reduce_sum,
+          ragged_reduce_op=ragged_math_ops.reduce_sum,
           rt_input=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]],
           axis=None,
           expected=sum([1, 2, 3, 4, 5, 6, 7, 8, 9])),
       dict(
-          ragged_reduce_op=ragged.reduce_sum,
+          ragged_reduce_op=ragged_math_ops.reduce_sum,
           rt_input=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]],
           axis=0,
           expected=[[1 + 6 + 9, 2 + 7], [], [3 + 8, 4, 5]]),
       dict(
-          ragged_reduce_op=ragged.reduce_sum,
+          ragged_reduce_op=ragged_math_ops.reduce_sum,
           rt_input=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]],
           axis=1,
           expected=[[1 + 3, 2 + 4, 5], [6 + 8, 7], [], [9]]),
       dict(
-          ragged_reduce_op=ragged.reduce_sum,
+          ragged_reduce_op=ragged_math_ops.reduce_sum,
           rt_input=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]],
           axis=2,
           expected=[[1 + 2, 0, 3 + 4 + 5], [6 + 7, 0, 8], [], [9]]),
       dict(
-          ragged_reduce_op=ragged.reduce_sum,
+          ragged_reduce_op=ragged_math_ops.reduce_sum,
           rt_input=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]],
           axis=[0, 1],
           expected=[1 + 3 + 6 + 8 + 9, 2 + 4 + 7, 5]),
       dict(
-          ragged_reduce_op=ragged.reduce_sum,
+          ragged_reduce_op=ragged_math_ops.reduce_sum,
           rt_input=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]],
           axis=[0, 2],
           expected=[1 + 6 + 9 + 2 + 7, 0, 3 + 8 + 4 + 5]),
       dict(
-          ragged_reduce_op=ragged.reduce_sum,
+          ragged_reduce_op=ragged_math_ops.reduce_sum,
           rt_input=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]],
           axis=[1, 2],
           expected=[1 + 2 + 3 + 4 + 5, 6 + 7 + 8, 0, 9]),
       dict(
-          ragged_reduce_op=ragged.reduce_sum,
+          ragged_reduce_op=ragged_math_ops.reduce_sum,
           rt_input=[[[1, 2], [], [3, 4, 5]], [[6, 7], [], [8]], [], [[9]]],
           axis=[0, 1, 2],
           expected=sum([1, 2, 3, 4, 5, 6, 7, 8, 9])),
@@ -289,23 +290,23 @@ class RaggedReduceOpsTest(ragged_test_util.RaggedTensorTestCase,
       #  [[9   ]          ]]
       #=========================================================================
       dict(
-          ragged_reduce_op=ragged.reduce_mean,
+          ragged_reduce_op=ragged_math_ops.reduce_mean,
           rt_input=[[[1, 2], [3, 4, 5]], [[6, 7], [8]], [[9]]],
           axis=0,
           expected=[[mean(1, 6, 9), mean(2, 7)], [mean(3, 8), 4, 5]]),
       dict(
-          ragged_reduce_op=ragged.reduce_mean,
+          ragged_reduce_op=ragged_math_ops.reduce_mean,
           rt_input=[[[1, 2], [3, 4, 5]], [[6, 7], [8]], [[9]]],
           axis=1,
           expected=[[mean(1, 3), mean(2, 4), 5], [mean(6, 8), 7], [9]]),
       dict(
-          ragged_reduce_op=ragged.reduce_mean,
+          ragged_reduce_op=ragged_math_ops.reduce_mean,
           rt_input=[[[1, 2], [3, 4, 5]], [[6, 7], [8]], [[9]]],
           axis=2,
           expected=[[mean(1, 2), mean(3, 4, 5)], [mean(6, 7), 8], [9]]),
   )
   def testReduce(self, ragged_reduce_op, rt_input, axis, expected):
-    rt_input = ragged.constant(rt_input)
+    rt_input = ragged_factory_ops.constant(rt_input)
     reduced = ragged_reduce_op(rt_input, axis)
     self.assertRaggedEqual(reduced, expected)
 
@@ -319,27 +320,26 @@ class RaggedReduceOpsTest(ragged_test_util.RaggedTensorTestCase,
     expected = (
         np.array([0 + 1 + 2 + 3, 4, 0, 5 + 6, 7, 8 + 9]) / np.array(
             [4, 1, 0, 2, 1, 2]))
-    rt_input = ragged.constant(rt_as_list)
-    reduced = ragged.reduce_mean(rt_input, axis=1)
+    rt_input = ragged_factory_ops.constant(rt_as_list)
+    reduced = ragged_math_ops.reduce_mean(rt_input, axis=1)
     self.assertEqualWithNan(self.evaluate(reduced), expected)
 
   def testMeanWithTensorInputs(self):
     tensor = [[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]]
     expected = [2.0, 20.0]
-    reduced = ragged.reduce_mean(tensor, axis=1)
+    reduced = ragged_math_ops.reduce_mean(tensor, axis=1)
     self.assertRaggedEqual(reduced, expected)
 
   def testErrors(self):
-    rt_input = ragged.constant([[1, 2, 3], [4, 5]])
+    rt_input = ragged_factory_ops.constant([[1, 2, 3], [4, 5]])
     axis = array_ops.placeholder_with_default(constant_op.constant([0]), None)
 
     if not context.executing_eagerly():
       self.assertRaisesRegexp(
           ValueError, r'axis must be known at graph construction time.',
-          ragged.reduce_sum, rt_input, axis)
-    self.assertRaisesRegexp(TypeError,
-                            r'axis must be an int; got str.*',
-                            ragged.reduce_sum, rt_input, ['x'])
+          ragged_math_ops.reduce_sum, rt_input, axis)
+    self.assertRaisesRegexp(TypeError, r'axis must be an int; got str.*',
+                            ragged_math_ops.reduce_sum, rt_input, ['x'])
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_row_lengths_op_test.py b/tensorflow/python/ops/ragged/ragged_row_lengths_op_test.py
index 15112d6c9c56b0e15247fc7c2f0b8410a5b9d376..8f8089c9bf351be819c1e6ece0cc0165da1de5fb 100644
--- a/tensorflow/python/ops/ragged/ragged_row_lengths_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_row_lengths_op_test.py
@@ -22,7 +22,8 @@ from absl.testing import parameterized
 
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
@@ -117,11 +118,11 @@ class RaggedRowLengthsOp(ragged_test_util.RaggedTensorTestCase,
                      axis=1,
                      ragged_rank=None,
                      expected_ragged_rank=None):
-    rt = ragged.constant(rt_input, ragged_rank=ragged_rank)
+    rt = ragged_factory_ops.constant(rt_input, ragged_rank=ragged_rank)
     lengths = rt.row_lengths(axis)
     self.assertRaggedEqual(lengths, expected)
     if expected_ragged_rank is not None:
-      if isinstance(lengths, ragged.RaggedTensor):
+      if isinstance(lengths, ragged_tensor.RaggedTensor):
         self.assertEqual(lengths.ragged_rank, expected_ragged_rank)
       else:
         self.assertEqual(0, expected_ragged_rank)
@@ -137,7 +138,7 @@ class RaggedRowLengthsOp(ragged_test_util.RaggedTensorTestCase,
           exception=(ValueError, errors.InvalidArgumentError)),
   ])
   def testErrors(self, rt_input, exception, message=None, axis=1):
-    rt = ragged.constant(rt_input)
+    rt = ragged_factory_ops.constant(rt_input)
     with self.assertRaisesRegexp(exception, message):
       rt.row_lengths(axis)
 
diff --git a/tensorflow/python/ops/ragged/ragged_row_splits_to_segment_ids_op_test.py b/tensorflow/python/ops/ragged/ragged_row_splits_to_segment_ids_op_test.py
index 2970540f3e585a7e9399dbe561f148a5abc9ee2c..5384f3ac09df6ce6a2cb9fc19409afd84b09fcc1 100644
--- a/tensorflow/python/ops/ragged/ragged_row_splits_to_segment_ids_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_row_splits_to_segment_ids_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for the ragged.row_splits_to_segment_ids() op."""
+"""Tests for the segment_id_ops.row_splits_to_segment_ids() op."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -20,8 +20,8 @@ from __future__ import print_function
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops import ragged
 from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.ops.ragged import segment_id_ops
 from tensorflow.python.platform import googletest
 
 
@@ -31,25 +31,25 @@ class RaggedSplitsToSegmentIdsOpTest(ragged_test_util.RaggedTensorTestCase):
   def testDocStringExample(self):
     splits = [0, 3, 3, 5, 6, 9]
     expected = [0, 0, 0, 2, 2, 3, 4, 4, 4]
-    segment_ids = ragged.row_splits_to_segment_ids(splits)
+    segment_ids = segment_id_ops.row_splits_to_segment_ids(splits)
     self.assertAllEqual(segment_ids, expected)
 
   def testEmptySplits(self):
     # Note: the splits for an empty ragged tensor contains a single zero.
-    segment_ids = ragged.row_splits_to_segment_ids([0])
+    segment_ids = segment_id_ops.row_splits_to_segment_ids([0])
     self.assertAllEqual(segment_ids, [])
 
   def testErrors(self):
     self.assertRaisesRegexp(ValueError, r'Invalid row_splits: \[\]',
-                            ragged.row_splits_to_segment_ids, [])
+                            segment_id_ops.row_splits_to_segment_ids, [])
     self.assertRaisesRegexp(
         ValueError, r'Tensor conversion requested dtype int64 for '
-        'Tensor with dtype float32', ragged.row_splits_to_segment_ids,
+        'Tensor with dtype float32', segment_id_ops.row_splits_to_segment_ids,
         constant_op.constant([0.5]))
     self.assertRaisesRegexp(ValueError, r'Shape \(\) must have rank 1',
-                            ragged.row_splits_to_segment_ids, 0)
+                            segment_id_ops.row_splits_to_segment_ids, 0)
     self.assertRaisesRegexp(ValueError, r'Shape \(1, 1\) must have rank 1',
-                            ragged.row_splits_to_segment_ids, [[0]])
+                            segment_id_ops.row_splits_to_segment_ids, [[0]])
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_segment_ids_to_row_splits_op_test.py b/tensorflow/python/ops/ragged/ragged_segment_ids_to_row_splits_op_test.py
index 4ed962676700ade62adb76b035a9b4e1dc5c5d73..73ee42a19dc204a006d41e8280efb6228be055ef 100644
--- a/tensorflow/python/ops/ragged/ragged_segment_ids_to_row_splits_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_segment_ids_to_row_splits_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for the ragged.segment_ids_to_row_splits() op."""
+"""Tests for the segment_id_ops.segment_ids_to_row_splits() op."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -20,8 +20,8 @@ from __future__ import print_function
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops import ragged
 from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.ops.ragged import segment_id_ops
 from tensorflow.python.platform import googletest
 
 
@@ -31,38 +31,38 @@ class RaggedSplitsToSegmentIdsOpTest(ragged_test_util.RaggedTensorTestCase):
   def testDocStringExample(self):
     segment_ids = [0, 0, 0, 2, 2, 3, 4, 4, 4]
     expected = [0, 3, 3, 5, 6, 9]
-    splits = ragged.segment_ids_to_row_splits(segment_ids)
+    splits = segment_id_ops.segment_ids_to_row_splits(segment_ids)
     self.assertAllEqual(splits, expected)
 
   def testEmptySegmentIds(self):
     # Note: the splits for an empty ragged tensor contains a single zero.
-    segment_ids = ragged.segment_ids_to_row_splits([])
+    segment_ids = segment_id_ops.segment_ids_to_row_splits([])
     self.assertAllEqual(segment_ids, [0])
 
   def testErrors(self):
     self.assertRaisesRegexp(TypeError,
                             r'segment_ids must be an integer tensor.*',
-                            ragged.segment_ids_to_row_splits,
+                            segment_id_ops.segment_ids_to_row_splits,
                             constant_op.constant([0.5]))
     self.assertRaisesRegexp(ValueError, r'Shape \(\) must have rank 1',
-                            ragged.segment_ids_to_row_splits, 0)
+                            segment_id_ops.segment_ids_to_row_splits, 0)
     self.assertRaisesRegexp(ValueError, r'Shape \(1, 1\) must have rank 1',
-                            ragged.segment_ids_to_row_splits, [[0]])
+                            segment_id_ops.segment_ids_to_row_splits, [[0]])
 
   def testNumSegments(self):
     segment_ids = [0, 0, 0, 2, 2, 3, 4, 4, 4]
     num_segments = 7
     expected = [0, 3, 3, 5, 6, 9, 9, 9]
-    splits = ragged.segment_ids_to_row_splits(segment_ids, num_segments)
+    splits = segment_id_ops.segment_ids_to_row_splits(segment_ids, num_segments)
     self.assertAllEqual(splits, expected)
 
   def testUnsortedSegmentIds(self):
     # Segment ids are not required to be sorted.
     segment_ids = [0, 4, 3, 2, 4, 4, 2, 0, 0]
-    splits1 = ragged.segment_ids_to_row_splits(segment_ids)
+    splits1 = segment_id_ops.segment_ids_to_row_splits(segment_ids)
     expected1 = [0, 3, 3, 5, 6, 9]
 
-    splits2 = ragged.segment_ids_to_row_splits(segment_ids, 7)
+    splits2 = segment_id_ops.segment_ids_to_row_splits(segment_ids, 7)
     expected2 = [0, 3, 3, 5, 6, 9, 9, 9]
     self.assertAllEqual(splits1, expected1)
     self.assertAllEqual(splits2, expected2)
diff --git a/tensorflow/python/ops/ragged/ragged_segment_op_test.py b/tensorflow/python/ops/ragged/ragged_segment_op_test.py
index be1f39afef0e720c0c23d9d8571fc70907696d6d..435ce87e00d56e6fa34ecfcaa6cb72bbb8c3cfe8 100644
--- a/tensorflow/python/ops/ragged/ragged_segment_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_segment_op_test.py
@@ -25,7 +25,9 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_math_ops
+from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
@@ -61,7 +63,7 @@ class RaggedSegmentOpsTest(ragged_test_util.RaggedTensorTestCase,
     Returns:
       The expected value, as a nested Python list.
     """
-    self.assertEqual(len(data), len(segment_ids))
+    self.assertLen(data, len(segment_ids))
 
     # Build an empty (num_segments x ncols) "grouped" matrix
     ncols = max(len(row) for row in data)
@@ -79,30 +81,30 @@ class RaggedSegmentOpsTest(ragged_test_util.RaggedTensorTestCase,
             for grouped_row in grouped]
 
   @parameterized.parameters(
-      (ragged.segment_sum, sum, [0, 0, 1, 1, 2, 2]),
-      (ragged.segment_sum, sum, [0, 0, 0, 1, 1, 1]),
-      (ragged.segment_sum, sum, [5, 4, 3, 2, 1, 0]),
-      (ragged.segment_sum, sum, [0, 0, 0, 10, 10, 10]),
-      (ragged.segment_prod, prod, [0, 0, 1, 1, 2, 2]),
-      (ragged.segment_prod, prod, [0, 0, 0, 1, 1, 1]),
-      (ragged.segment_prod, prod, [5, 4, 3, 2, 1, 0]),
-      (ragged.segment_prod, prod, [0, 0, 0, 10, 10, 10]),
-      (ragged.segment_min, min, [0, 0, 1, 1, 2, 2]),
-      (ragged.segment_min, min, [0, 0, 0, 1, 1, 1]),
-      (ragged.segment_min, min, [5, 4, 3, 2, 1, 0]),
-      (ragged.segment_min, min, [0, 0, 0, 10, 10, 10]),
-      (ragged.segment_max, max, [0, 0, 1, 1, 2, 2]),
-      (ragged.segment_max, max, [0, 0, 0, 1, 1, 1]),
-      (ragged.segment_max, max, [5, 4, 3, 2, 1, 0]),
-      (ragged.segment_max, max, [0, 0, 0, 10, 10, 10]),
-      (ragged.segment_mean, mean, [0, 0, 1, 1, 2, 2]),
-      (ragged.segment_mean, mean, [0, 0, 0, 1, 1, 1]),
-      (ragged.segment_mean, mean, [5, 4, 3, 2, 1, 0]),
-      (ragged.segment_mean, mean, [0, 0, 0, 10, 10, 10]),
+      (ragged_math_ops.segment_sum, sum, [0, 0, 1, 1, 2, 2]),
+      (ragged_math_ops.segment_sum, sum, [0, 0, 0, 1, 1, 1]),
+      (ragged_math_ops.segment_sum, sum, [5, 4, 3, 2, 1, 0]),
+      (ragged_math_ops.segment_sum, sum, [0, 0, 0, 10, 10, 10]),
+      (ragged_math_ops.segment_prod, prod, [0, 0, 1, 1, 2, 2]),
+      (ragged_math_ops.segment_prod, prod, [0, 0, 0, 1, 1, 1]),
+      (ragged_math_ops.segment_prod, prod, [5, 4, 3, 2, 1, 0]),
+      (ragged_math_ops.segment_prod, prod, [0, 0, 0, 10, 10, 10]),
+      (ragged_math_ops.segment_min, min, [0, 0, 1, 1, 2, 2]),
+      (ragged_math_ops.segment_min, min, [0, 0, 0, 1, 1, 1]),
+      (ragged_math_ops.segment_min, min, [5, 4, 3, 2, 1, 0]),
+      (ragged_math_ops.segment_min, min, [0, 0, 0, 10, 10, 10]),
+      (ragged_math_ops.segment_max, max, [0, 0, 1, 1, 2, 2]),
+      (ragged_math_ops.segment_max, max, [0, 0, 0, 1, 1, 1]),
+      (ragged_math_ops.segment_max, max, [5, 4, 3, 2, 1, 0]),
+      (ragged_math_ops.segment_max, max, [0, 0, 0, 10, 10, 10]),
+      (ragged_math_ops.segment_mean, mean, [0, 0, 1, 1, 2, 2]),
+      (ragged_math_ops.segment_mean, mean, [0, 0, 0, 1, 1, 1]),
+      (ragged_math_ops.segment_mean, mean, [5, 4, 3, 2, 1, 0]),
+      (ragged_math_ops.segment_mean, mean, [0, 0, 0, 10, 10, 10]),
   )
   def testRaggedSegment_Int(self, segment_op, combiner, segment_ids):
     rt_as_list = [[0, 1, 2, 3], [4], [], [5, 6], [7], [8, 9]]
-    rt = ragged.constant(rt_as_list)
+    rt = ragged_factory_ops.constant(rt_as_list)
     num_segments = max(segment_ids) + 1
     expected = self.expected_value(rt_as_list, segment_ids, num_segments,
                                    combiner)
@@ -111,34 +113,34 @@ class RaggedSegmentOpsTest(ragged_test_util.RaggedTensorTestCase,
     self.assertRaggedEqual(segmented, expected)
 
   @parameterized.parameters(
-      (ragged.segment_sum, sum, [0, 0, 1, 1, 2, 2]),
-      (ragged.segment_sum, sum, [0, 0, 0, 1, 1, 1]),
-      (ragged.segment_sum, sum, [5, 4, 3, 2, 1, 0]),
-      (ragged.segment_sum, sum, [0, 0, 0, 10, 10, 10]),
-      (ragged.segment_prod, prod, [0, 0, 1, 1, 2, 2]),
-      (ragged.segment_prod, prod, [0, 0, 0, 1, 1, 1]),
-      (ragged.segment_prod, prod, [5, 4, 3, 2, 1, 0]),
-      (ragged.segment_prod, prod, [0, 0, 0, 10, 10, 10]),
-      (ragged.segment_min, min, [0, 0, 1, 1, 2, 2]),
-      (ragged.segment_min, min, [0, 0, 0, 1, 1, 1]),
-      (ragged.segment_min, min, [5, 4, 3, 2, 1, 0]),
-      (ragged.segment_min, min, [0, 0, 0, 10, 10, 10]),
-      (ragged.segment_max, max, [0, 0, 1, 1, 2, 2]),
-      (ragged.segment_max, max, [0, 0, 0, 1, 1, 1]),
-      (ragged.segment_max, max, [5, 4, 3, 2, 1, 0]),
-      (ragged.segment_max, max, [0, 0, 0, 10, 10, 10]),
-      (ragged.segment_mean, mean, [0, 0, 1, 1, 2, 2]),
-      (ragged.segment_mean, mean, [0, 0, 0, 1, 1, 1]),
-      (ragged.segment_mean, mean, [5, 4, 3, 2, 1, 0]),
-      (ragged.segment_mean, mean, [0, 0, 0, 10, 10, 10]),
-      (ragged.segment_sqrt_n, sqrt_n, [0, 0, 1, 1, 2, 2]),
-      (ragged.segment_sqrt_n, sqrt_n, [0, 0, 0, 1, 1, 1]),
-      (ragged.segment_sqrt_n, sqrt_n, [5, 4, 3, 2, 1, 0]),
-      (ragged.segment_sqrt_n, sqrt_n, [0, 0, 0, 10, 10, 10]),
+      (ragged_math_ops.segment_sum, sum, [0, 0, 1, 1, 2, 2]),
+      (ragged_math_ops.segment_sum, sum, [0, 0, 0, 1, 1, 1]),
+      (ragged_math_ops.segment_sum, sum, [5, 4, 3, 2, 1, 0]),
+      (ragged_math_ops.segment_sum, sum, [0, 0, 0, 10, 10, 10]),
+      (ragged_math_ops.segment_prod, prod, [0, 0, 1, 1, 2, 2]),
+      (ragged_math_ops.segment_prod, prod, [0, 0, 0, 1, 1, 1]),
+      (ragged_math_ops.segment_prod, prod, [5, 4, 3, 2, 1, 0]),
+      (ragged_math_ops.segment_prod, prod, [0, 0, 0, 10, 10, 10]),
+      (ragged_math_ops.segment_min, min, [0, 0, 1, 1, 2, 2]),
+      (ragged_math_ops.segment_min, min, [0, 0, 0, 1, 1, 1]),
+      (ragged_math_ops.segment_min, min, [5, 4, 3, 2, 1, 0]),
+      (ragged_math_ops.segment_min, min, [0, 0, 0, 10, 10, 10]),
+      (ragged_math_ops.segment_max, max, [0, 0, 1, 1, 2, 2]),
+      (ragged_math_ops.segment_max, max, [0, 0, 0, 1, 1, 1]),
+      (ragged_math_ops.segment_max, max, [5, 4, 3, 2, 1, 0]),
+      (ragged_math_ops.segment_max, max, [0, 0, 0, 10, 10, 10]),
+      (ragged_math_ops.segment_mean, mean, [0, 0, 1, 1, 2, 2]),
+      (ragged_math_ops.segment_mean, mean, [0, 0, 0, 1, 1, 1]),
+      (ragged_math_ops.segment_mean, mean, [5, 4, 3, 2, 1, 0]),
+      (ragged_math_ops.segment_mean, mean, [0, 0, 0, 10, 10, 10]),
+      (ragged_math_ops.segment_sqrt_n, sqrt_n, [0, 0, 1, 1, 2, 2]),
+      (ragged_math_ops.segment_sqrt_n, sqrt_n, [0, 0, 0, 1, 1, 1]),
+      (ragged_math_ops.segment_sqrt_n, sqrt_n, [5, 4, 3, 2, 1, 0]),
+      (ragged_math_ops.segment_sqrt_n, sqrt_n, [0, 0, 0, 10, 10, 10]),
   )
   def testRaggedSegment_Float(self, segment_op, combiner, segment_ids):
     rt_as_list = [[0., 1., 2., 3.], [4.], [], [5., 6.], [7.], [8., 9.]]
-    rt = ragged.constant(rt_as_list)
+    rt = ragged_factory_ops.constant(rt_as_list)
     num_segments = max(segment_ids) + 1
     expected = self.expected_value(rt_as_list, segment_ids, num_segments,
                                    combiner)
@@ -147,14 +149,14 @@ class RaggedSegmentOpsTest(ragged_test_util.RaggedTensorTestCase,
     self.assertRaggedAlmostEqual(segmented, expected, places=5)
 
   def testRaggedRankTwo(self):
-    rt = ragged.constant([
+    rt = ragged_factory_ops.constant([
         [[111, 112, 113, 114], [121],],  # row 0
         [],                              # row 1
         [[], [321, 322], [331]],         # row 2
         [[411, 412]]                     # row 3
     ])  # pyformat: disable
     segment_ids1 = [0, 2, 2, 2]
-    segmented1 = ragged.segment_sum(rt, segment_ids1, 3)
+    segmented1 = ragged_math_ops.segment_sum(rt, segment_ids1, 3)
     expected1 = [[[111, 112, 113, 114], [121]],     # row 0
                  [],                                # row 1
                  [[411, 412], [321, 322], [331]]    # row 2
@@ -162,21 +164,21 @@ class RaggedSegmentOpsTest(ragged_test_util.RaggedTensorTestCase,
     self.assertRaggedEqual(segmented1, expected1)
 
     segment_ids2 = [1, 2, 1, 1]
-    segmented2 = ragged.segment_sum(rt, segment_ids2, 3)
+    segmented2 = ragged_math_ops.segment_sum(rt, segment_ids2, 3)
     expected2 = [[],
                  [[111+411, 112+412, 113, 114], [121+321, 322], [331]],
                  []]  # pyformat: disable
     self.assertRaggedEqual(segmented2, expected2)
 
   def testRaggedSegmentIds(self):
-    rt = ragged.constant([
+    rt = ragged_factory_ops.constant([
         [[111, 112, 113, 114], [121],],  # row 0
         [],                              # row 1
         [[], [321, 322], [331]],         # row 2
         [[411, 412]]                     # row 3
     ])  # pyformat: disable
-    segment_ids = ragged.constant([[1, 2], [], [1, 1, 2], [2]])
-    segmented = ragged.segment_sum(rt, segment_ids, 3)
+    segment_ids = ragged_factory_ops.constant([[1, 2], [], [1, 1, 2], [2]])
+    segmented = ragged_math_ops.segment_sum(rt, segment_ids, 3)
     expected = [[],
                 [111+321, 112+322, 113, 114],
                 [121+331+411, 412]]  # pyformat: disable
@@ -184,35 +186,35 @@ class RaggedSegmentOpsTest(ragged_test_util.RaggedTensorTestCase,
 
   def testShapeMismatchError1(self):
     dt = constant_op.constant([1, 2, 3, 4, 5, 6])
-    segment_ids = ragged.constant([[1, 2], []])
+    segment_ids = ragged_factory_ops.constant([[1, 2], []])
     self.assertRaisesRegexp(
         ValueError, 'segment_ids.shape must be a prefix of data.shape, '
-        'but segment_ids is ragged and data is not.', ragged.segment_sum, dt,
-        segment_ids, 3)
+        'but segment_ids is ragged and data is not.',
+        ragged_math_ops.segment_sum, dt, segment_ids, 3)
 
   def testShapeMismatchError2(self):
-    rt = ragged.constant([
+    rt = ragged_factory_ops.constant([
         [[111, 112, 113, 114], [121]],  # row 0
         [],                             # row 1
         [[], [321, 322], [331]],        # row 2
         [[411, 412]]                    # row 3
     ])  # pyformat: disable
-    segment_ids = ragged.constant([[1, 2], [1], [1, 1, 2], [2]])
+    segment_ids = ragged_factory_ops.constant([[1, 2], [1], [1, 1, 2], [2]])
 
     # Error is raised at graph-building time if we can detect it then.
     self.assertRaisesRegexp(
         errors.InvalidArgumentError,
         'segment_ids.shape must be a prefix of data.shape.*',
-        ragged.segment_sum, rt, segment_ids, 3)
+        ragged_math_ops.segment_sum, rt, segment_ids, 3)
 
     # Otherwise, error is raised when we run the graph.
-    segment_ids2 = ragged.RaggedTensor.from_row_splits(
+    segment_ids2 = ragged_tensor.RaggedTensor.from_row_splits(
         array_ops.placeholder_with_default(segment_ids.values, None),
         array_ops.placeholder_with_default(segment_ids.row_splits, None))
     with self.assertRaisesRegexp(
         errors.InvalidArgumentError,
         'segment_ids.shape must be a prefix of data.shape.*'):
-      self.evaluate(ragged.segment_sum(rt, segment_ids2, 3))
+      self.evaluate(ragged_math_ops.segment_sum(rt, segment_ids2, 3))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_size_op_test.py b/tensorflow/python/ops/ragged/ragged_size_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ffed11b13c0bc80dbfc45e1af79a808af3da7d1
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_size_op_test.py
@@ -0,0 +1,48 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged.size."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.platform import googletest
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedSizeOpTest(ragged_test_util.RaggedTensorTestCase,
+                       parameterized.TestCase):
+
+  @parameterized.parameters([
+      {'size': 1, 'test_input': 1},
+      {'size': 0, 'test_input': []},
+      {'size': 0, 'test_input': [], 'ragged_rank': 1},
+      {'size': 3, 'test_input': [1, 1, 1]},
+      {'size': 3, 'test_input': [[1, 1], [1]]},
+      {'size': 5, 'test_input': [[[1, 1, 1], [1]], [[1]]]},
+      {'size': 6, 'test_input': [[[1, 1], [1, 1]], [[1, 1]]], 'ragged_rank': 1},
+  ])
+  def testRaggedSize(self, test_input, size, ragged_rank=None):
+    input_rt = ragged_factory_ops.constant(test_input, ragged_rank=ragged_rank)
+    self.assertAllEqual(ragged_array_ops.size(input_rt), size)
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_stack_op_test.py b/tensorflow/python/ops/ragged/ragged_stack_op_test.py
index 17d80b5aadc936cfe11c3f65628cc57bf2c60361..e52ad4de20cd8697c7772123627f32d2b980b720 100644
--- a/tensorflow/python/ops/ragged/ragged_stack_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_stack_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for ragged.stack."""
+"""Tests for ragged_concat_ops.stack."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -22,7 +22,8 @@ from absl.testing import parameterized
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_concat_ops
+from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
@@ -32,6 +33,52 @@ class RaggedStackOpTest(ragged_test_util.RaggedTensorTestCase,
                         parameterized.TestCase):
 
   @parameterized.parameters(
+      dict(
+          descr='One rank-2 input (ragged_rank=1), axis=0',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21']],),   # shape=(3, None)
+          axis=0,
+          expected=[[[b'a00', b'a01'], [], [b'a20', b'a21']]]),
+      dict(
+          descr='One rank-2 input (ragged_rank=1), axis=1',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21', 'a22']],),   # shape=(3, None)
+          axis=1,
+          expected=[
+              [[b'a00', b'a01']],
+              [[]],
+              [[b'a20', b'a21', b'a22']]]),
+      dict(
+          descr='One rank-2 input (ragged_rank=1), axis=2',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21', 'a22']],),   # shape=(3, None)
+          axis=2,
+          expected=[
+              [[b'a00'], [b'a01']], [],
+              [[b'a20'], [b'a21'], [b'a22']]]),
+      dict(
+          descr='One rank-2 input (ragged_rank=1), axis=-3',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21']],),   # shape=(3, None)
+          axis=-3,
+          expected=[[[b'a00', b'a01'], [], [b'a20', b'a21']]]),
+      dict(
+          descr='One rank-2 input (ragged_rank=1), axis=-2',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21', 'a22']],),   # shape=(3, None)
+          axis=-2,
+          expected=[
+              [[b'a00', b'a01']],
+              [[]],
+              [[b'a20', b'a21', b'a22']]]),
+      dict(
+          descr='One rank-2 input (ragged_rank=1), axis=-1',
+          rt_inputs=(
+              [['a00', 'a01'], [], ['a20', 'a21', 'a22']],),  # shape=(3, None)
+          axis=-1,
+          expected=[
+              [[b'a00'], [b'a01']], [],
+              [[b'a20'], [b'a21'], [b'a22']]]),
       dict(
           descr='Two rank-2 inputs (ragged_rank=1), axis=0',
           rt_inputs=(
@@ -279,11 +326,11 @@ class RaggedStackOpTest(ragged_test_util.RaggedTensorTestCase,
     if ragged_ranks is None:
       ragged_ranks = [None] * len(rt_inputs)
     rt_inputs = [
-        ragged.constant(rt_input, ragged_rank=rrank)
+        ragged_factory_ops.constant(rt_input, ragged_rank=rrank)  # pylint: disable=g-long-ternary
         if rrank != 0 else constant_op.constant(rt_input)
         for (rt_input, rrank) in zip(rt_inputs, ragged_ranks)
     ]
-    stacked = ragged.stack(rt_inputs, axis)
+    stacked = ragged_concat_ops.stack(rt_inputs, axis)
     if expected_ragged_rank is not None:
       self.assertEqual(stacked.ragged_rank, expected_ragged_rank)
     if expected_shape is not None:
@@ -313,7 +360,8 @@ class RaggedStackOpTest(ragged_test_util.RaggedTensorTestCase,
           message='axis=3 out of bounds: expected -3<=axis<3'),
   )
   def testError(self, rt_inputs, axis, error, message):
-    self.assertRaisesRegexp(error, message, ragged.stack, rt_inputs, axis)
+    self.assertRaisesRegexp(error, message, ragged_concat_ops.stack, rt_inputs,
+                            axis)
 
   def testSingleTensorInput(self):
     """Tests ragged_stack with a single tensor input.
@@ -322,8 +370,8 @@ class RaggedStackOpTest(ragged_test_util.RaggedTensorTestCase,
     also pass in a single value (as with tf.stack), in which case it is
     equivalent to expand_dims(axis=0).  This test exercises that path.
     """
-    rt_inputs = ragged.constant([[1, 2], [3, 4]])
-    stacked = ragged.stack(rt_inputs, 0)
+    rt_inputs = ragged_factory_ops.constant([[1, 2], [3, 4]])
+    stacked = ragged_concat_ops.stack(rt_inputs, 0)
     self.assertRaggedEqual(stacked, [[[1, 2], [3, 4]]])
 
 
diff --git a/tensorflow/python/ops/ragged/ragged_string_ops.py b/tensorflow/python/ops/ragged/ragged_string_ops.py
index 1f9f0abe4f04bf0a9a2822df28af842cd18fc553..4b22c23d0b07abfed7841005af18ffdcd31a8696 100644
--- a/tensorflow/python/ops/ragged/ragged_string_ops.py
+++ b/tensorflow/python/ops/ragged/ragged_string_ops.py
@@ -22,6 +22,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_string_ops
+from tensorflow.python.ops.ragged import ragged_array_ops
 from tensorflow.python.ops.ragged import ragged_conversion_ops
 from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.util.tf_export import tf_export
@@ -29,16 +30,19 @@ from tensorflow.python.util.tf_export import tf_export
 
 # pylint: disable=redefined-builtin
 @tf_export("strings.unicode_encode")
-def unicode_encode(input, output_encoding, errors="replace",
-                   replacement_char=65533, name=None):
+def unicode_encode(input,
+                   output_encoding,
+                   errors="replace",
+                   replacement_char=65533,
+                   name=None):
   r"""Encodes each sequence of Unicode code points in `input` into a string.
 
   `result[i1...iN]` is the string formed by concatenating the Unicode
   codepoints `input[1...iN, :]`, encoded using `output_encoding`.
 
   Args:
-    input: An `N+1` dimensional potentially ragged integer tensor with
-        shape `[D1...DN, num_chars]`.
+    input: An `N+1` dimensional potentially ragged integer tensor with shape
+      `[D1...DN, num_chars]`.
     output_encoding: Unicode encoding that should be used to encode each
       codepoint sequence.  Can be `"UTF-8"`, `"UTF-16-BE"`, or `"UTF-32-BE"`.
     errors: Specifies the response when an invalid codepoint is encountered
@@ -92,8 +96,9 @@ def unicode_encode(input, output_encoding, errors="replace",
     else:
       if input_tensor.shape.ndims == 2:
         # The input tensor is of the correct 2-D shape, it's just not ragged.
-        return unicode_encode(ragged_conversion_ops.from_tensor(input_tensor),
-                              output_encoding, errors, replacement_char)
+        return unicode_encode(
+            ragged_conversion_ops.from_tensor(input_tensor), output_encoding,
+            errors, replacement_char)
       elif input_tensor.shape.ndims > 2:
         # We need to initially flatten the input tensor to 2-D, and then can
         # reshape the output of our processed flattened tensor.
@@ -116,3 +121,282 @@ def unicode_encode(input, output_encoding, errors="replace",
         output_tensor = unicode_encode(ragged_input_tensor, output_encoding,
                                        errors, replacement_char)
         return array_ops.reshape(output_tensor, [])
+
+
+# pylint: disable=redefined-builtin
+@tf_export("strings.unicode_decode")
+def unicode_decode(input,
+                   input_encoding,
+                   errors="replace",
+                   replacement_char=0xFFFD,
+                   replace_control_characters=False,
+                   name=None):
+  r"""Decodes each string in `input` into a sequence of Unicode code points.
+
+  `result[i1...iN, j]` is the Unicode codepoint for the `j`th character in
+  `input[i1...iN]`, when decoded using `input_encoding`.
+
+  Args:
+    input: An `N` dimensional potentially ragged `string` tensor with shape
+      `[D1...DN]`.  `N` must be statically known.
+    input_encoding: String name for the unicode encoding that should be used to
+      decode each string.
+    errors: Specifies the response when an input string can't be converted
+      using the indicated encoding. One of:
+      * `'strict'`: Raise an exception for any illegal substrings.
+      * `'replace'`: Replace illegal substrings with `replacement_char`.
+      * `'ignore'`: Skip illegal substrings.
+    replacement_char: The replacement codepoint to be used in place of invalid
+      substrings in `input` when `errors='replace'`; and in place of C0 control
+      characters in `input` when `replace_control_characters=True`.
+    replace_control_characters: Whether to replace the C0 control characters
+      `(U+0000 - U+001F)` with the `replacement_char`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `N+1` dimensional `int32` tensor with shape `[D1...DN, (num_chars)]`.
+    The returned tensor is a `tf.Tensor` if `input` is a scalar, or a
+    `tf.RaggedTensor` otherwise.
+
+  #### Example:
+    ```python
+    >>> input = [s.encode('utf8') for s in (u'G\xf6\xf6dnight', u'\U0001f60a')]
+    >>> tf.strings.unicode_decode(input, 'UTF-8').tolist()
+    [[71, 246, 246, 100, 110, 105, 103, 104, 116], [128522]]
+    ```
+  """
+  with ops.name_scope(name, "UnicodeDecode", [input]):
+    return _unicode_decode(input, input_encoding, errors, replacement_char,
+                           replace_control_characters, with_offsets=False)
+
+
+@tf_export("strings.unicode_decode_with_offsets")
+def unicode_decode_with_offsets(input,
+                                input_encoding,
+                                errors="replace",
+                                replacement_char=0xFFFD,
+                                replace_control_characters=False,
+                                name=None):
+  r"""Decodes each string into a sequence of code points with start offsets.
+
+  This op is similar to `tf.strings.decode(...)`, but it also returns the
+  start offset for each character in its respective string.  This information
+  can be used to align the characters with the original byte sequence.
+
+  Returns a tuple `(codepoints, start_offsets)` where:
+
+  * `codepoints[i1...iN, j]` is the Unicode codepoint for the `j`th character
+    in `input[i1...iN]`, when decoded using `input_encoding`.
+  * `start_offsets[i1...iN, j]` is the start byte offset for the `j`th
+    character in `input[i1...iN]`, when decoded using `input_encoding`.
+
+  Args:
+    input: An `N` dimensional potentially ragged `string` tensor with shape
+      `[D1...DN]`.  `N` must be statically known.
+    input_encoding: String name for the unicode encoding that should be used to
+      decode each string.
+    errors: Specifies the response when an input string can't be converted
+      using the indicated encoding. One of:
+      * `'strict'`: Raise an exception for any illegal substrings.
+      * `'replace'`: Replace illegal substrings with `replacement_char`.
+      * `'ignore'`: Skip illegal substrings.
+    replacement_char: The replacement codepoint to be used in place of invalid
+      substrings in `input` when `errors='replace'`; and in place of C0 control
+      characters in `input` when `replace_control_characters=True`.
+    replace_control_characters: Whether to replace the C0 control characters
+      `(U+0000 - U+001F)` with the `replacement_char`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A tuple of `N+1` dimensional tensors `(codepoints, start_offsets)`.
+
+    * `codepoints` is an `int32` tensor with shape `[D1...DN, (num_chars)]`.
+    * `offsets` is an `int64` tensor with shape `[D1...DN, (num_chars)]`.
+
+    The returned tensors are `tf.Tensor`s if `input` is a scalar, or
+    `tf.RaggedTensor`s otherwise.
+
+  #### Example:
+    ```python
+    >>> input = [s.encode('utf8') for s in (u'G\xf6\xf6dnight', u'\U0001f60a')]
+    >>> result = tf.strings.unicode_decode_with_offsets(input, 'UTF-8')
+    >>> result[0].tolist()  # codepoints
+    [[71, 246, 246, 100, 110, 105, 103, 104, 116], [128522]]
+    >>> result[1].tolist()  # offsets
+   [[0, 1, 3, 5, 6, 7, 8, 9, 10], [0]]
+    ```
+  """
+  with ops.name_scope(name, "UnicodeDecodeWithOffsets", [input]):
+    return _unicode_decode(input, input_encoding, errors, replacement_char,
+                           replace_control_characters, with_offsets=True)
+
+
+@tf_export("strings.unicode_split")
+def unicode_split(input,
+                  input_encoding,
+                  errors="replace",
+                  replacement_char=0xFFFD,
+                  name=None):
+  r"""Splits each string in `input` into a sequence of Unicode code points.
+
+  `result[i1...iN, j]` is the substring of `input[i1...iN]` that encodes its
+  `j`th character, when decoded using `input_encoding`.
+
+  Args:
+    input: An `N` dimensional potentially ragged `string` tensor with shape
+      `[D1...DN]`.  `N` must be statically known.
+    input_encoding: String name for the unicode encoding that should be used to
+      decode each string.
+    errors: Specifies the response when an input string can't be converted
+      using the indicated encoding. One of:
+      * `'strict'`: Raise an exception for any illegal substrings.
+      * `'replace'`: Replace illegal substrings with `replacement_char`.
+      * `'ignore'`: Skip illegal substrings.
+    replacement_char: The replacement codepoint to be used in place of invalid
+      substrings in `input` when `errors='replace'`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `N+1` dimensional `int32` tensor with shape `[D1...DN, (num_chars)]`.
+    The returned tensor is a `tf.Tensor` if `input` is a scalar, or a
+    `tf.RaggedTensor` otherwise.
+
+  #### Example:
+    ```python
+    >>> input = [s.encode('utf8') for s in (u'G\xf6\xf6dnight', u'\U0001f60a')]
+    >>> tf.strings.unicode_split(input, 'UTF-8').tolist()
+    [['G', '\xc3\xb6', '\xc3\xb6', 'd', 'n', 'i', 'g', 'h', 't'],
+     ['\xf0\x9f\x98\x8a']]
+    ```
+  """
+  with ops.name_scope(name, "UnicodeSplit", [input]):
+    codepoints = _unicode_decode(input, input_encoding, errors,
+                                 replacement_char, False, with_offsets=False)
+    return unicode_encode(
+        ragged_array_ops.expand_dims(codepoints, -1),
+        output_encoding=input_encoding,
+        errors=errors,
+        replacement_char=replacement_char)
+
+
+@tf_export("strings.unicode_split_with_offsets")
+def unicode_split_with_offsets(input,
+                               input_encoding,
+                               errors="replace",
+                               replacement_char=0xFFFD,
+                               name=None):
+  r"""Splits each string into a sequence of code points with start offsets.
+
+  This op is similar to `tf.strings.decode(...)`, but it also returns the
+  start offset for each character in its respective string.  This information
+  can be used to align the characters with the original byte sequence.
+
+  Returns a tuple `(chars, start_offsets)` where:
+
+  * `chars[i1...iN, j]` is the substring of `input[i1...iN]` that encodes its
+    `j`th character, when decoded using `input_encoding`.
+  * `start_offsets[i1...iN, j]` is the start byte offset for the `j`th
+    character in `input[i1...iN]`, when decoded using `input_encoding`.
+
+  Args:
+    input: An `N` dimensional potentially ragged `string` tensor with shape
+      `[D1...DN]`.  `N` must be statically known.
+    input_encoding: String name for the unicode encoding that should be used to
+      decode each string.
+    errors: Specifies the response when an input string can't be converted
+      using the indicated encoding. One of:
+      * `'strict'`: Raise an exception for any illegal substrings.
+      * `'replace'`: Replace illegal substrings with `replacement_char`.
+      * `'ignore'`: Skip illegal substrings.
+    replacement_char: The replacement codepoint to be used in place of invalid
+      substrings in `input` when `errors='replace'`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A tuple of `N+1` dimensional tensors `(codepoints, start_offsets)`.
+
+    * `codepoints` is an `int32` tensor with shape `[D1...DN, (num_chars)]`.
+    * `offsets` is an `int64` tensor with shape `[D1...DN, (num_chars)]`.
+
+    The returned tensors are `tf.Tensor`s if `input` is a scalar, or
+    `tf.RaggedTensor`s otherwise.
+
+  #### Example:
+    ```python
+    >>> input = [s.encode('utf8') for s in (u'G\xf6\xf6dnight', u'\U0001f60a')]
+    >>> result = tf.strings.unicode_split_with_offsets(input, 'UTF-8')
+    >>> result[0].tolist()  # character substrings
+    [['G', '\xc3\xb6', '\xc3\xb6', 'd', 'n', 'i', 'g', 'h', 't'],
+     ['\xf0\x9f\x98\x8a']]
+    >>> result[1].tolist()  # offsets
+   [[0, 1, 3, 5, 6, 7, 8, 9, 10], [0]]
+    ```
+  """
+  with ops.name_scope(name, "UnicodeSplitWithOffsets", [input]):
+    codepoints, offsets = _unicode_decode(input, input_encoding, errors,
+                                          replacement_char, False,
+                                          with_offsets=True)
+    chars = unicode_encode(
+        ragged_array_ops.expand_dims(codepoints, -1),
+        output_encoding=input_encoding,
+        errors=errors,
+        replacement_char=replacement_char)
+    return chars, offsets
+
+
+def _unicode_decode(input, input_encoding, errors, replacement_char,
+                    replace_control_characters, with_offsets):
+  """Decodes each string into a sequence of codepoints."""
+  input = ragged_tensor.convert_to_tensor_or_ragged_tensor(input, name="input")
+  input_ndims = input.shape.ndims
+  if input_ndims is None:
+    raise ValueError("Rank of `input` must be statically known.")
+
+  if input_ndims > 1:
+    # Convert to a ragged tensor with ragged_rank = input_ndims - 1.
+    if not ragged_tensor.is_ragged(input):
+      input = ragged_conversion_ops.from_tensor(
+          input, ragged_rank=input_ndims - 1)
+    elif input.ragged_rank < input_ndims - 1:
+      input = input.with_flat_values(
+          ragged_conversion_ops.from_tensor(
+              input.flat_values,
+              ragged_rank=input_ndims - input.ragged_rank + 1))
+
+  # Reshape the input to a flat vector, and apply the gen_string_ops op.
+  if ragged_tensor.is_ragged(input):
+    flat_input = array_ops.reshape(input.flat_values, [-1])
+  else:
+    flat_input = array_ops.reshape(input, [-1])
+
+  if with_offsets:
+    decode_op = gen_string_ops.unicode_decode_with_offsets
+  else:
+    decode_op = gen_string_ops.unicode_decode
+  flat_result = decode_op(
+      input=flat_input,
+      input_encoding=input_encoding,
+      errors=errors,
+      replacement_char=replacement_char,
+      replace_control_characters=replace_control_characters)
+
+  if input_ndims == 0:
+    codepoints = flat_result.char_values
+    if with_offsets:
+      offsets = flat_result.char_to_byte_starts
+  else:
+    codepoints = ragged_tensor.RaggedTensor.from_row_splits(
+        flat_result.char_values, flat_result.row_splits)
+    if input_ndims > 1:
+      codepoints = input.with_flat_values(codepoints)
+    if with_offsets:
+      offsets = ragged_tensor.RaggedTensor.from_row_splits(
+          flat_result.char_to_byte_starts, flat_result.row_splits)
+      if input_ndims > 1:
+        offsets = input.with_flat_values(offsets)
+
+  if with_offsets:
+    return codepoints, offsets
+  else:
+    return codepoints
+
diff --git a/tensorflow/python/ops/ragged/ragged_tensor.py b/tensorflow/python/ops/ragged/ragged_tensor.py
index 567c50203af592e57168063e20787b3ed621b8c8..8fb3c1f44ca2ddf3e83fff93dcd4eae3492adfa7 100644
--- a/tensorflow/python/ops/ragged/ragged_tensor.py
+++ b/tensorflow/python/ops/ragged/ragged_tensor.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.client import session
+from tensorflow.python.framework import composite_tensor
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -32,6 +33,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.ragged import ragged_tensor_value
 from tensorflow.python.ops.ragged import ragged_util
 from tensorflow.python.ops.ragged import segment_id_ops
+from tensorflow.python.util.tf_export import tf_export
 
 # pylint: disable=protected-access
 _eval_using_default_session = ops._eval_using_default_session
@@ -43,8 +45,9 @@ _eval_using_default_session = ops._eval_using_default_session
 #===============================================================================
 
 
-class RaggedTensor(object):
-  """Represents a ragged tensor (go/ragged).
+@tf_export("RaggedTensor")
+class RaggedTensor(composite_tensor.CompositeTensor):
+  """Represents a ragged tensor.
 
   A `RaggedTensor` is a tensor with one or more *ragged dimensions*, which are
   dimensions whose slices may have different lengths.  For example, the inner
@@ -1435,6 +1438,53 @@ class RaggedTensor(object):
       values = values.values
     return values
 
+  #=============================================================================
+  # Composite Tensor
+  #=============================================================================
+
+  def _to_components(self):
+    return (self.flat_values,) + self.nested_row_splits
+
+  @classmethod
+  def _from_components(cls, components):
+    return cls.from_nested_row_splits(components[0], components[1:])
+
+  def _shape_invariant_to_components(self, shape=None):
+    ragged_rank = self.ragged_rank
+    flat_values = self.flat_values
+
+    if shape is None:
+      # Default shape invariant
+      value_shape = flat_values.shape[1:]
+      values_shape = tensor_shape.TensorShape([None]).concatenate(value_shape)
+      return ((values_shape, self._row_splits.shape) +
+              tuple(tensor_shape.TensorShape([None])
+                    for i in range(1, ragged_rank)))
+    else:
+      # Explicitly specified shape invariant
+      if shape.ndims is not None and shape.ndims <= ragged_rank:
+        raise ValueError("Shape invariant %s does not have sufficient rank "
+                         "for a RaggedTensor with %d ragged dimensions." %
+                         (shape, self.ragged_rank))
+      if any(tensor_shape.dimension_value(shape[dim]) is not None
+             for dim in range(1, self.ragged_rank + 1)):
+        raise ValueError("Shape invariant dimension size must be None for "
+                         "ragged dimenions.")
+      nrows = tensor_shape.dimension_value(shape[0])
+      value_shape = shape[self.ragged_rank + 1:]
+      values_shape = tensor_shape.TensorShape([None]).concatenate(value_shape)
+      if nrows is None:
+        outer_splits_shape = tensor_shape.TensorShape([None])
+      else:
+        outer_splits_shape = tensor_shape.TensorShape([nrows + 1])
+      return ((values_shape, outer_splits_shape) +
+              tuple(tensor_shape.TensorShape([None])
+                    for i in range(1, ragged_rank)))
+
+  @property
+  def _is_graph_tensor(self):
+    return hasattr(self._values, 'graph')
+
 
 def is_ragged(value):
   """Returns true if `value` is a ragged tensor or ragged tensor value."""
diff --git a/tensorflow/python/ops/ragged/ragged_tensor_bounding_shape_op_test.py b/tensorflow/python/ops/ragged/ragged_tensor_bounding_shape_op_test.py
index 4e6ebdf332e6f53b7a3af5679af1cbf27ec9f792..025a221626cd580d07b8993e59328e798e830cfa 100644
--- a/tensorflow/python/ops/ragged/ragged_tensor_bounding_shape_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_tensor_bounding_shape_op_test.py
@@ -19,7 +19,8 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
@@ -29,30 +30,31 @@ class RaggedTensorBoundingShapeOp(ragged_test_util.RaggedTensorTestCase):
 
   def testDocStringExample(self):
     # This is the example from ragged.bounding_shape.__doc__.
-    rt = ragged.constant([[1, 2, 3, 4], [5], [], [6, 7, 8, 9], [10]])
+    rt = ragged_factory_ops.constant([[1, 2, 3, 4], [5], [], [6, 7, 8, 9],
+                                      [10]])
     self.assertRaggedEqual(rt.bounding_shape(), [5, 4])
 
   def test2DRaggedTensorWithOneRaggedDimension(self):
     values = ['a', 'b', 'c', 'd', 'e', 'f', 'g']
-    rt1 = ragged.RaggedTensor.from_row_splits(values, [0, 2, 5, 6, 6, 7])
-    rt2 = ragged.RaggedTensor.from_row_splits(values, [0, 7])
-    rt3 = ragged.RaggedTensor.from_row_splits(values, [0, 0, 7, 7])
+    rt1 = ragged_tensor.RaggedTensor.from_row_splits(values, [0, 2, 5, 6, 6, 7])
+    rt2 = ragged_tensor.RaggedTensor.from_row_splits(values, [0, 7])
+    rt3 = ragged_tensor.RaggedTensor.from_row_splits(values, [0, 0, 7, 7])
     self.assertRaggedEqual(rt1.bounding_shape(), [5, 3])
     self.assertRaggedEqual(rt2.bounding_shape(), [1, 7])
     self.assertRaggedEqual(rt3.bounding_shape(), [3, 7])
 
   def test3DRaggedTensorWithOneRaggedDimension(self):
     values = [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11], [12, 13]]
-    rt1 = ragged.RaggedTensor.from_row_splits(values, [0, 2, 5, 6, 6, 7])
-    rt2 = ragged.RaggedTensor.from_row_splits(values, [0, 7])
-    rt3 = ragged.RaggedTensor.from_row_splits(values, [0, 0, 7, 7])
+    rt1 = ragged_tensor.RaggedTensor.from_row_splits(values, [0, 2, 5, 6, 6, 7])
+    rt2 = ragged_tensor.RaggedTensor.from_row_splits(values, [0, 7])
+    rt3 = ragged_tensor.RaggedTensor.from_row_splits(values, [0, 0, 7, 7])
     self.assertRaggedEqual(rt1.bounding_shape(), [5, 3, 2])
     self.assertRaggedEqual(rt2.bounding_shape(), [1, 7, 2])
     self.assertRaggedEqual(rt3.bounding_shape(), [3, 7, 2])
 
   def testExplicitAxisOptimizations(self):
-    rt = ragged.RaggedTensor.from_row_splits(b'a b c d e f g'.split(),
-                                             [0, 2, 5, 6, 6, 7])
+    rt = ragged_tensor.RaggedTensor.from_row_splits(b'a b c d e f g'.split(),
+                                                    [0, 2, 5, 6, 6, 7])
     self.assertRaggedEqual(rt.bounding_shape(0), 5)
     self.assertRaggedEqual(rt.bounding_shape(1), 3)
     self.assertRaggedEqual(rt.bounding_shape([1, 0]), [3, 5])
diff --git a/tensorflow/python/ops/ragged/ragged_tensor_shape_test.py b/tensorflow/python/ops/ragged/ragged_tensor_shape_test.py
index ec06aeaea546d679d65c7c8d64357393afd3eae2..bc0139cffd846662fe2df990a0eaa511cd7f0f63 100644
--- a/tensorflow/python/ops/ragged/ragged_tensor_shape_test.py
+++ b/tensorflow/python/ops/ragged/ragged_tensor_shape_test.py
@@ -23,8 +23,11 @@ import numpy as np
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_tensor_shape
 from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.ops.ragged.ragged_tensor_shape import RaggedTensorDynamicShape
 from tensorflow.python.platform import googletest
 
 
@@ -33,8 +36,8 @@ class RaggedTensorBoundingShapeOp(ragged_test_util.RaggedTensorTestCase,
                                   parameterized.TestCase):
 
   def assertShapeEq(self, x, y):
-    assert isinstance(x, ragged.RaggedTensorDynamicShape)
-    assert isinstance(y, ragged.RaggedTensorDynamicShape)
+    assert isinstance(x, RaggedTensorDynamicShape)
+    assert isinstance(y, RaggedTensorDynamicShape)
     x_partitioned_dim_sizes = [
         self.eval_to_list(splits)  #
         for splits in x.partitioned_dim_sizes
@@ -54,39 +57,40 @@ class RaggedTensorBoundingShapeOp(ragged_test_util.RaggedTensorTestCase,
           value=[[['a', 'b', 'c'], ['d', 'e', 'f']]],
           expected_dim_sizes=[1, 2, 3]),
       dict(
-          value=ragged.constant_value([['a', 'b', 'c'], ['d', 'e']]),
+          value=ragged_factory_ops.constant_value([['a', 'b', 'c'], ['d',
+                                                                     'e']]),
           expected_dim_sizes=[2, [3, 2]]),
       dict(
-          value=ragged.constant_value([[['a', 'b', 'c'], ['d', 'e']]]),
+          value=ragged_factory_ops.constant_value([[['a', 'b', 'c'], ['d',
+                                                                      'e']]]),
           expected_dim_sizes=[1, [2], [3, 2]]),
       dict(
-          value=ragged.constant_value([[['a', 'b', 'c'], ['d', 'e', 'f']]],
-                                      ragged_rank=1),
+          value=ragged_factory_ops.constant_value(
+              [[['a', 'b', 'c'], ['d', 'e', 'f']]], ragged_rank=1),
           expected_dim_sizes=[1, [2], 3]),
       dict(
-          value=ragged.constant_value([[[[1], [2]], [[3], [4]]],
-                                       [[[5], [6]]]], ragged_rank=1),
+          value=ragged_factory_ops.constant_value(
+              [[[[1], [2]], [[3], [4]]], [[[5], [6]]]], ragged_rank=1),
           expected_dim_sizes=[2, [2, 1], 2, 1]),
       dict(
-          value=ragged.constant_value([[10, 20], [30]]),
+          value=ragged_factory_ops.constant_value([[10, 20], [30]]),
           expected_dim_sizes=[2, [2, 1]]),
       # Docstring examples:
       dict(value=[[1, 2, 3], [4, 5, 6]], expected_dim_sizes=[2, 3]),
       dict(
-          value=ragged.constant_value([[1, 2], [], [3, 4, 5]]),
+          value=ragged_factory_ops.constant_value([[1, 2], [], [3, 4, 5]]),
           expected_dim_sizes=[3, [2, 0, 3]]),
       dict(
-          value=ragged.constant_value([[[1, 2], [3, 4]], [[5, 6]]],
-                                      ragged_rank=1),
+          value=ragged_factory_ops.constant_value([[[1, 2], [3, 4]], [[5, 6]]],
+                                                  ragged_rank=1),
           expected_dim_sizes=[2, [2, 1], 2]),
       dict(
-          value=ragged.constant_value([[[1, 2], [3]], [[4, 5]]]),
+          value=ragged_factory_ops.constant_value([[[1, 2], [3]], [[4, 5]]]),
           expected_dim_sizes=[2, [2, 1], [2, 1, 2]]),
   ])
   def testFromTensor(self, value, expected_dim_sizes):
-    shape = ragged.RaggedTensorDynamicShape.from_tensor(value)
-    expected = ragged.RaggedTensorDynamicShape.from_dim_sizes(
-        expected_dim_sizes)
+    shape = RaggedTensorDynamicShape.from_tensor(value)
+    expected = RaggedTensorDynamicShape.from_dim_sizes(expected_dim_sizes)
     self.assertShapeEq(shape, expected)
 
   @parameterized.parameters([
@@ -106,9 +110,8 @@ class RaggedTensorBoundingShapeOp(ragged_test_util.RaggedTensorTestCase,
           expected_dim_sizes=[1, 3, [3, 2, 4], 2, 3]),
   ])
   def testBroadcastToRank(self, dim_sizes, rank, expected_dim_sizes):
-    shape = ragged.RaggedTensorDynamicShape.from_dim_sizes(dim_sizes)
-    expected = ragged.RaggedTensorDynamicShape.from_dim_sizes(
-        expected_dim_sizes)
+    shape = RaggedTensorDynamicShape.from_dim_sizes(dim_sizes)
+    expected = RaggedTensorDynamicShape.from_dim_sizes(expected_dim_sizes)
     broadcasted_shape = shape.broadcast_to_rank(rank)
     self.assertShapeEq(broadcasted_shape, expected)
     self.assertEqual(broadcasted_shape.rank, rank)
@@ -297,21 +300,19 @@ class RaggedTensorBoundingShapeOp(ragged_test_util.RaggedTensorTestCase,
         original_dim_sizes[axis] should be equal to `1` or `row_length`.
       broadcast_dim_sizes: THe dimension sizes after broadcasting.
     """
-    original_shape = ragged.RaggedTensorDynamicShape.from_dim_sizes(
-        original_dim_sizes)
-    broadcast_shape = ragged.RaggedTensorDynamicShape.from_dim_sizes(
-        broadcast_dim_sizes)
-    self.assertEqual(original_shape.rank, broadcast_shape.rank)
+    original_shape = RaggedTensorDynamicShape.from_dim_sizes(original_dim_sizes)
+    bcast_shape = RaggedTensorDynamicShape.from_dim_sizes(broadcast_dim_sizes)
+    self.assertEqual(original_shape.rank, bcast_shape.rank)
     # shape[axis].value == 1 and row_length > 1:
     bcast1 = original_shape.broadcast_dimension(axis, row_length)
     # shape[axis].value > 1 and row_length == shape[axis].value:
-    bcast2 = broadcast_shape.broadcast_dimension(axis, row_length)
+    bcast2 = bcast_shape.broadcast_dimension(axis, row_length)
     # shape[axis].value > 1 and row_length == 1:
-    bcast3 = broadcast_shape.broadcast_dimension(axis, 1)
+    bcast3 = bcast_shape.broadcast_dimension(axis, 1)
 
-    self.assertShapeEq(bcast1, broadcast_shape)
-    self.assertShapeEq(bcast2, broadcast_shape)
-    self.assertShapeEq(bcast3, broadcast_shape)
+    self.assertShapeEq(bcast1, bcast_shape)
+    self.assertShapeEq(bcast2, bcast_shape)
+    self.assertShapeEq(bcast3, bcast_shape)
 
   @parameterized.parameters(
       [
@@ -369,104 +370,115 @@ class RaggedTensorBoundingShapeOp(ragged_test_util.RaggedTensorTestCase,
               expected_dims=[2, (2, 1), 2, (2, 1, 2, 1, 2, 1)]),
       ])
   def testBroadcastDynamicShape(self, x_dims, y_dims, expected_dims):
-    x_shape = ragged.RaggedTensorDynamicShape.from_dim_sizes(x_dims)
-    y_shape = ragged.RaggedTensorDynamicShape.from_dim_sizes(y_dims)
-    expected = ragged.RaggedTensorDynamicShape.from_dim_sizes(expected_dims)
-    result1 = ragged.broadcast_dynamic_shape(x_shape, y_shape)
-    result2 = ragged.broadcast_dynamic_shape(y_shape, x_shape)
+    x_shape = RaggedTensorDynamicShape.from_dim_sizes(x_dims)
+    y_shape = RaggedTensorDynamicShape.from_dim_sizes(y_dims)
+    expected = RaggedTensorDynamicShape.from_dim_sizes(expected_dims)
+    result1 = ragged_tensor_shape.broadcast_dynamic_shape(x_shape, y_shape)
+    result2 = ragged_tensor_shape.broadcast_dynamic_shape(y_shape, x_shape)
     self.assertShapeEq(expected, result1)
     self.assertShapeEq(expected, result2)
 
   def testRepr(self):
-    shape = ragged.RaggedTensorDynamicShape.from_dim_sizes([2, (2, 1), 2, 1])
+    shape = RaggedTensorDynamicShape.from_dim_sizes([2, (2, 1), 2, 1])
     self.assertRegexpMatches(
         repr(shape),
         r'RaggedTensorDynamicShape\('
         r'partitioned_dim_sizes=\(<[^>]+>, <[^>]+>\), '
         r'inner_dim_sizes=<[^>]+>\)')
 
-  @parameterized.parameters([
-      dict(
-          x=[[10], [20], [30]],  # shape=[3, 1]
-          dim_sizes=[3, 2],
-          expected=[[10, 10], [20, 20], [30, 30]]),
-      dict(
-          x=[[10], [20], [30]],  # shape=[3, 1]
-          dim_sizes=[3, [3, 0, 2]],
-          expected=ragged.constant_value([[10, 10, 10], [], [30, 30]],
-                                         dtype=np.int32)),
-      dict(
-          x=[[[1, 2, 3]], [[4, 5, 6]]],  # shape = [2, 1, 3]
-          dim_sizes=[2, [2, 3], 3],
-          expected=ragged.constant_value(
-              [[[1, 2, 3], [1, 2, 3]], [[4, 5, 6], [4, 5, 6], [4, 5, 6]]],
-              dtype=np.int32,
-              ragged_rank=1)),
-      dict(
-          x=[[[1]], [[2]]],  # shape = [2, 1, 1]
-          dim_sizes=[2, [2, 3], [0, 2, 1, 2, 0]],
-          expected=ragged.constant_value([[[], [1, 1]], [[2], [2, 2], []]],
-                                         dtype=np.int32,
-                                         ragged_rank=2)),
-      dict(
-          x=10,
-          dim_sizes=[3, [3, 0, 2]],
-          expected=ragged.constant_value([[10, 10, 10], [], [10, 10]])),
-  ])
+  @parameterized.parameters(
+      [
+          dict(
+              x=[[10], [20], [30]],  # shape=[3, 1]
+              dim_sizes=[3, 2],
+              expected=[[10, 10], [20, 20], [30, 30]]),
+          dict(
+              x=[[10], [20], [30]],  # shape=[3, 1]
+              dim_sizes=[3, [3, 0, 2]],
+              expected=ragged_factory_ops.constant_value(
+                  [[10, 10, 10], [], [30, 30]], dtype=np.int32)),
+          dict(
+              x=[[[1, 2, 3]], [[4, 5, 6]]],  # shape = [2, 1, 3]
+              dim_sizes=[2, [2, 3], 3],
+              expected=ragged_factory_ops.constant_value(
+                  [[[1, 2, 3], [1, 2, 3]], [[4, 5, 6], [4, 5, 6], [4, 5, 6]]],
+                  dtype=np.int32,
+                  ragged_rank=1)),
+          dict(
+              x=[[[1]], [[2]]],  # shape = [2, 1, 1]
+              dim_sizes=[2, [2, 3], [0, 2, 1, 2, 0]],
+              expected=ragged_factory_ops.constant_value(
+                  [[[], [1, 1]], [[2], [2, 2], []]],
+                  dtype=np.int32,
+                  ragged_rank=2)),
+          dict(
+              x=10,
+              dim_sizes=[3, [3, 0, 2]],
+              expected=ragged_factory_ops.constant_value([[10, 10, 10], [],
+                                                          [10, 10]])),
+      ])
   def testRaggedBroadcastTo(self, x, dim_sizes, expected):
-    shape = ragged.RaggedTensorDynamicShape.from_dim_sizes(dim_sizes)
-    result = ragged.broadcast_to(x, shape)
+    shape = RaggedTensorDynamicShape.from_dim_sizes(dim_sizes)
+    result = ragged_tensor_shape.broadcast_to(x, shape)
     self.assertEqual(
         getattr(result, 'ragged_rank', 0), getattr(expected, 'ragged_rank', 0))
     self.assertRaggedEqual(result, expected)
 
-  @parameterized.parameters([
-      dict(
-          doc='x.shape=[3, (D1)]; y.shape=[3, 1]; bcast.shape=[3, (D1)]',
-          x=ragged.constant_value([[1, 2, 3], [], [4, 5]], dtype=np.int32),
-          y=[[10], [20], [30]],
-          expected=ragged.constant_value([[11, 12, 13], [], [34, 35]])),
-      dict(
-          doc='x.shape=[3, (D1)]; y.shape=[]; bcast.shape=[3, (D1)]',
-          x=ragged.constant_value([[1, 2, 3], [], [4, 5]], dtype=np.int32),
-          y=10,
-          expected=ragged.constant_value([[11, 12, 13], [], [14, 15]])),
-      dict(
-          doc='x.shape=[1, (D1)]; y.shape=[3, 1]; bcast.shape=[3, (D1)]',
-          x=ragged.constant_value([[1, 2, 3]], dtype=np.int32),
-          y=[[10], [20], [30]],
-          expected=ragged.constant_value(
-              [[11, 12, 13], [21, 22, 23], [31, 32, 33]], dtype=np.int32)),
-      dict(
-          doc=('x.shape=[2, (D1), 1]; y.shape=[1, (D2)]; '
-               'bcast.shape=[2, (D1), (D2)]'),
-          x=ragged.constant_value([[[1], [2], [3]], [[4]]], ragged_rank=1),
-          y=ragged.constant_value([[10, 20, 30]]),
-          expected=ragged.constant_value([[[11, 21, 31], [12, 22, 32],
-                                           [13, 23, 33]], [[14, 24, 34]]])),
-      dict(
-          doc=('x.shape=[2, (D1), 1]; y.shape=[1, 1, 4]; '
-               'bcast.shape=[2, (D1), 4]'),
-          x=ragged.constant_value([[[10], [20]], [[30]]], ragged_rank=1),
-          y=[[[1, 2, 3, 4]]],
-          expected=ragged.constant_value(
-              [[[11, 12, 13, 14], [21, 22, 23, 24]], [[31, 32, 33, 34]]],
-              ragged_rank=1)),
-      dict(
-          doc=('x.shape=[2, (D1), 2, 1]; y.shape=[2, (D2)]; '
-               'bcast.shape=[2, (D1), (2), (D2)'),
-          x=ragged.constant_value([[[[1], [2]], [[3], [4]]],
-                                   [[[5], [6]]]],
-                                  ragged_rank=1),
-          y=ragged.constant_value([[10, 20], [30]]),
-          expected=ragged.constant_value(
-              [[[[11, 21], [32]], [[13, 23], [34]]],
-               [[[15, 25], [36]]]])),
-  ])
+  @parameterized.parameters(
+      [
+          dict(
+              doc='x.shape=[3, (D1)]; y.shape=[3, 1]; bcast.shape=[3, (D1)]',
+              x=ragged_factory_ops.constant_value([[1, 2, 3], [], [4, 5]],
+                                                  dtype=np.int32),
+              y=[[10], [20], [30]],
+              expected=ragged_factory_ops.constant_value([[11, 12, 13], [],
+                                                          [34, 35]])),
+          dict(
+              doc='x.shape=[3, (D1)]; y.shape=[]; bcast.shape=[3, (D1)]',
+              x=ragged_factory_ops.constant_value([[1, 2, 3], [], [4, 5]],
+                                                  dtype=np.int32),
+              y=10,
+              expected=ragged_factory_ops.constant_value([[11, 12, 13], [],
+                                                          [14, 15]])),
+          dict(
+              doc='x.shape=[1, (D1)]; y.shape=[3, 1]; bcast.shape=[3, (D1)]',
+              x=ragged_factory_ops.constant_value([[1, 2, 3]], dtype=np.int32),
+              y=[[10], [20], [30]],
+              expected=ragged_factory_ops.constant_value(
+                  [[11, 12, 13], [21, 22, 23], [31, 32, 33]], dtype=np.int32)),
+          dict(
+              doc=('x.shape=[2, (D1), 1]; y.shape=[1, (D2)]; '
+                   'bcast.shape=[2, (D1), (D2)]'),
+              x=ragged_factory_ops.constant_value([[[1], [2], [3]], [[4]]],
+                                                  ragged_rank=1),
+              y=ragged_factory_ops.constant_value([[10, 20, 30]]),
+              expected=ragged_factory_ops.constant_value([[[11, 21, 31],
+                                                           [12, 22, 32],
+                                                           [13, 23, 33]],
+                                                          [[14, 24, 34]]])),
+          dict(
+              doc=('x.shape=[2, (D1), 1]; y.shape=[1, 1, 4]; '
+                   'bcast.shape=[2, (D1), 4]'),
+              x=ragged_factory_ops.constant_value([[[10], [20]], [[30]]],
+                                                  ragged_rank=1),
+              y=[[[1, 2, 3, 4]]],
+              expected=ragged_factory_ops.constant_value(
+                  [[[11, 12, 13, 14], [21, 22, 23, 24]], [[31, 32, 33, 34]]],
+                  ragged_rank=1)),
+          dict(
+              doc=('x.shape=[2, (D1), 2, 1]; y.shape=[2, (D2)]; '
+                   'bcast.shape=[2, (D1), (2), (D2)'),
+              x=ragged_factory_ops.constant_value(
+                  [[[[1], [2]], [[3], [4]]], [[[5], [6]]]], ragged_rank=1),
+              y=ragged_factory_ops.constant_value([[10, 20], [30]]),
+              expected=ragged_factory_ops.constant_value([[[[11, 21], [32]],
+                                                           [[13, 23], [34]]],
+                                                          [[[15, 25], [36]]]])),
+      ])
   def testRaggedAddWithBroadcasting(self, x, y, expected, doc):
     expected_rrank = getattr(expected, 'ragged_rank', 0)
-    x = ragged.convert_to_tensor_or_ragged_tensor(x, dtype=dtypes.int32)
-    y = ragged.convert_to_tensor_or_ragged_tensor(y, dtype=dtypes.int32)
+    x = ragged_tensor.convert_to_tensor_or_ragged_tensor(x, dtype=dtypes.int32)
+    y = ragged_tensor.convert_to_tensor_or_ragged_tensor(y, dtype=dtypes.int32)
     result = x + y
     result_rrank = getattr(result, 'ragged_rank', 0)
     self.assertEqual(expected_rrank, result_rrank)
diff --git a/tensorflow/python/ops/ragged/ragged_tensor_test.py b/tensorflow/python/ops/ragged/ragged_tensor_test.py
index b8f1d97137d22376a39d9fa0e098f8c364383b65..62b7a6b1bc7890e4776bc101ffaceb70401532ac 100644
--- a/tensorflow/python/ops/ragged/ragged_tensor_test.py
+++ b/tensorflow/python/ops/ragged/ragged_tensor_test.py
@@ -30,9 +30,11 @@ from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_math_ops
+from tensorflow.python.ops.ragged import ragged_tensor_value
 from tensorflow.python.ops.ragged import ragged_test_util
-from tensorflow.python.ops.ragged import RaggedTensor
+from tensorflow.python.ops.ragged.ragged_tensor import RaggedTensor
 from tensorflow.python.platform import googletest
 
 
@@ -176,7 +178,7 @@ class RaggedTensorTest(ragged_test_util.RaggedTensorTestCase,
     splits2 = np.array([0, 3, 5], dtype=np.int64)
 
     # Test construction of a RaggedTensorValue with ragged_rank=1.
-    rt_value = ragged.RaggedTensorValue(values, splits)
+    rt_value = ragged_tensor_value.RaggedTensorValue(values, splits)
     self.assertEqual(rt_value.row_splits.dtype, np.int64)
     self.assertEqual(rt_value.shape, (5, None))
     self.assertEqual(len(rt_value.nested_row_splits), 1)
@@ -186,8 +188,9 @@ class RaggedTensorTest(ragged_test_util.RaggedTensorTestCase,
     self.assertAllEqual(values, rt_value.flat_values)
 
     # Test construction of a RaggedTensorValue with ragged_rank=2.
-    rt_value = ragged.RaggedTensorValue(
-        values=ragged.RaggedTensorValue(values, splits), row_splits=splits2)
+    rt_value = ragged_tensor_value.RaggedTensorValue(
+        values=ragged_tensor_value.RaggedTensorValue(values, splits),
+        row_splits=splits2)
     self.assertEqual(rt_value.row_splits.dtype, np.int64)
     self.assertEqual(rt_value.shape, (2, None, None))
     self.assertEqual(len(rt_value.nested_row_splits), 2)
@@ -825,14 +828,14 @@ class RaggedTensorTest(ragged_test_util.RaggedTensorTestCase,
   # pylint: disable=invalid-slice-index
   @parameterized.parameters(
       # Tests for out-of-bound errors
-      (SLICE_BUILDER[5],
-       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
-      (SLICE_BUILDER[-6],
-       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
-      (SLICE_BUILDER[0, 2],
-       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
-      (SLICE_BUILDER[3, 0],
-       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
+      (SLICE_BUILDER[5], (IndexError, ValueError, errors.InvalidArgumentError),
+       '.*out of bounds.*'),
+      (SLICE_BUILDER[-6], (IndexError, ValueError, errors.InvalidArgumentError),
+       '.*out of bounds.*'),
+      (SLICE_BUILDER[0, 2], (IndexError, ValueError,
+                             errors.InvalidArgumentError), '.*out of bounds.*'),
+      (SLICE_BUILDER[3, 0], (IndexError, ValueError,
+                             errors.InvalidArgumentError), '.*out of bounds.*'),
 
       # Indexing into an inner ragged dimension
       (SLICE_BUILDER[:, 3], ValueError,
@@ -950,14 +953,15 @@ class RaggedTensorTest(ragged_test_util.RaggedTensorTestCase,
        'Cannot index into an inner ragged dimension.'),
 
       # Test for out-of-bounds errors.
-      (SLICE_BUILDER[1, 0],
-       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
+      (SLICE_BUILDER[1, 0], (IndexError, ValueError,
+                             errors.InvalidArgumentError), '.*out of bounds.*'),
       (SLICE_BUILDER[0, 0, 3],
-       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
-      (SLICE_BUILDER[5],
-       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
-      (SLICE_BUILDER[0, 5],
-       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
+       (IndexError, ValueError,
+        errors.InvalidArgumentError), '.*out of bounds.*'),
+      (SLICE_BUILDER[5], (IndexError, ValueError, errors.InvalidArgumentError),
+       '.*out of bounds.*'),
+      (SLICE_BUILDER[0, 5], (IndexError, ValueError,
+                             errors.InvalidArgumentError), '.*out of bounds.*'),
   )
   def testRaggedTensorGetItemErrorsWithRaggedRank2(self, slice_spec, expected,
                                                    message):
@@ -979,10 +983,10 @@ class RaggedTensorTest(ragged_test_util.RaggedTensorTestCase,
     self._TestGetItem(rt, slice_spec, expected)
 
   @parameterized.parameters(
-      (SLICE_BUILDER[0],
-       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
-      (SLICE_BUILDER[-1],
-       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
+      (SLICE_BUILDER[0], (IndexError, ValueError, errors.InvalidArgumentError),
+       '.*out of bounds.*'),
+      (SLICE_BUILDER[-1], (IndexError, ValueError, errors.InvalidArgumentError),
+       '.*out of bounds.*'),
   )
   def testRaggedTensorGetItemErrorsWithEmptyTensor(self, slice_spec, expected,
                                                    message):
@@ -1096,7 +1100,7 @@ class RaggedTensorTest(ragged_test_util.RaggedTensorTestCase,
   def testRaggedTensorValueStr(self):
     values = [b'a', b'b', b'c', b'd', b'e', b'f', b'g']
     row_splits = [0, 2, 5, 6, 6, 7]
-    rt = ragged.RaggedTensorValue(
+    rt = ragged_tensor_value.RaggedTensorValue(
         np.array(values), np.array(row_splits, dtype=np.int64))
     expected_str = '<tf.RaggedTensorValue {}>'.format([[b'a', b'b'],
                                                        [b'c', b'd', b'e'],
@@ -1111,8 +1115,9 @@ class RaggedTensorTest(ragged_test_util.RaggedTensorTestCase,
   #=============================================================================
 
   def testWithValues(self):
-    rt1 = ragged.constant([[1, 2], [3, 4, 5], [6], [], [7]])
-    rt2 = ragged.constant([[[1, 2], [3, 4, 5]], [[6]], [], [[], [7]]])
+    rt1 = ragged_factory_ops.constant([[1, 2], [3, 4, 5], [6], [], [7]])
+    rt2 = ragged_factory_ops.constant([[[1, 2], [3, 4, 5]], [[6]], [], [[],
+                                                                        [7]]])
 
     rt1_plus_10 = rt1.with_values(rt1.values + 10)
     rt2_times_10 = rt2.with_flat_values(rt2.flat_values * 10)
@@ -1135,8 +1140,8 @@ class RaggedTensorTest(ragged_test_util.RaggedTensorTestCase,
     if context.executing_eagerly():
       return
 
-    rt1 = ragged.constant([[1, 2, 3], [4]])
-    rt2 = ragged.constant([[[], [1, 2]], [[3]]])
+    rt1 = ragged_factory_ops.constant([[1, 2, 3], [4]])
+    rt2 = ragged_factory_ops.constant([[[], [1, 2]], [[3]]])
     with self.test_session() as session:
       result = session.run({'rt1': rt1, 'rt2': rt2})
       self.assertCountEqual(sorted(result.keys()), ['rt1', 'rt2'])
@@ -1156,8 +1161,8 @@ class RaggedTensorTest(ragged_test_util.RaggedTensorTestCase,
             array_ops.placeholder(dtypes.int64)
         ])
 
-    rt1_feed_val = ragged.constant_value([[1, 2, 3], [4]])
-    rt2_feed_val = ragged.constant_value([[[], [1, 2]], [[3]]])
+    rt1_feed_val = ragged_factory_ops.constant_value([[1, 2, 3], [4]])
+    rt2_feed_val = ragged_factory_ops.constant_value([[[], [1, 2]], [[3]]])
 
     with self.test_session() as session:
       result = session.run({
@@ -1186,13 +1191,13 @@ class RaggedTensorTest(ragged_test_util.RaggedTensorTestCase,
     c = array_ops.placeholder(dtypes.int32, shape=[], name='c')
 
     # Feed values for placeholder inputs.
-    a_val = ragged.constant_value([[1, 2, 3], [4]])
-    b_val = ragged.constant_value([[5, 4, 3], [2]])
+    a_val = ragged_factory_ops.constant_value([[1, 2, 3], [4]])
+    b_val = ragged_factory_ops.constant_value([[5, 4, 3], [2]])
     c_val = 3
 
     # Compute some values.
-    r1 = ragged.reduce_sum(a * b, axis=1)
-    r2 = ragged.reduce_sum(a + c, axis=1)
+    r1 = ragged_math_ops.reduce_sum(a * b, axis=1)
+    r2 = ragged_math_ops.reduce_sum(a + c, axis=1)
 
     with self.test_session() as session:
       handle = session.partial_run_setup([r1, r2], [a, b, c])
@@ -1203,5 +1208,17 @@ class RaggedTensorTest(ragged_test_util.RaggedTensorTestCase,
       res2 = session.partial_run(handle, r2, feed_dict={c: c_val})
       self.assertAllEqual(res2, [15, 7])
 
+  # Test case for GitHub issue 24679.
+  def testEagerForLoop(self):
+    if not context.executing_eagerly():
+      return
+
+    values = [[1., 2.], [3., 4., 5.], [6.]]
+    r = ragged_factory_ops.constant(values)
+    i = 0
+    for elem in r:
+      self.assertAllEqual(elem, values[i])
+      i += 1
+
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_tensor_value.py b/tensorflow/python/ops/ragged/ragged_tensor_value.py
index e94ca4afac63f3d1dafb148266683042c987934f..c5e498e95fb5bca7ba2d5496a8af33bd8b8eb0fd 100644
--- a/tensorflow/python/ops/ragged/ragged_tensor_value.py
+++ b/tensorflow/python/ops/ragged/ragged_tensor_value.py
@@ -20,11 +20,17 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.util.tf_export import tf_export
 
+
+@tf_export(v1=["ragged.RaggedTensorValue"])
 class RaggedTensorValue(object):
   """Represents the value of a `RaggedTensor`.
 
-  See `RaggedTensor` for a description of ragged tensors.
+  Warning: `RaggedTensorValue` should only be used in graph mode; in
+  eager mode, the `tf.RaggedTensor` class contains its value directly.
+
+  See `tf.RaggedTensor` for a description of ragged tensors.
   """
 
   def __init__(self, values, row_splits):
@@ -98,10 +104,3 @@ class RaggedTensorValue(object):
         values_as_list[self._row_splits[i]:self._row_splits[i + 1]]
         for i in range(len(self._row_splits) - 1)
     ]
-
-  def value_rowids(self, name=None):
-    del name
-    row_lengths = self._row_splits[1:] - self._row_splits[:-1]
-    nrows = self._row_splits.shape[-1] - 1
-    indices = np.arange(nrows)
-    return np.repeat(indices, repeats=row_lengths, axis=0)
diff --git a/tensorflow/python/ops/ragged/ragged_test_util.py b/tensorflow/python/ops/ragged/ragged_test_util.py
index 027417664d23683e0eb3906892b81c29c8847f6a..dcbab3021ecb483641e9376ec0cdfefa36fdd704 100644
--- a/tensorflow/python/ops/ragged/ragged_test_util.py
+++ b/tensorflow/python/ops/ragged/ragged_test_util.py
@@ -24,7 +24,8 @@ import numpy as np
 
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_tensor_value
 
 
 class RaggedTensorTestCase(test_util.TensorFlowTestCase):
@@ -32,14 +33,14 @@ class RaggedTensorTestCase(test_util.TensorFlowTestCase):
 
   def _GetPyList(self, a):
     """Converts a to a nested python list."""
-    if isinstance(a, ragged.RaggedTensor):
+    if isinstance(a, ragged_tensor.RaggedTensor):
       return self.evaluate(a).to_list()
     elif isinstance(a, ops.Tensor):
       a = self.evaluate(a)
       return a.tolist() if isinstance(a, np.ndarray) else a
     elif isinstance(a, np.ndarray):
       return a.tolist()
-    elif isinstance(a, ragged.RaggedTensorValue):
+    elif isinstance(a, ragged_tensor_value.RaggedTensorValue):
       return a.to_list()
     else:
       return np.array(a).tolist()
@@ -51,8 +52,8 @@ class RaggedTensorTestCase(test_util.TensorFlowTestCase):
     self.assertEqual(a_list, b_list)
 
     if not (isinstance(a, (list, tuple)) or isinstance(b, (list, tuple))):
-      a_ragged_rank = a.ragged_rank if ragged.is_ragged(a) else 0
-      b_ragged_rank = b.ragged_rank if ragged.is_ragged(b) else 0
+      a_ragged_rank = a.ragged_rank if ragged_tensor.is_ragged(a) else 0
+      b_ragged_rank = b.ragged_rank if ragged_tensor.is_ragged(b) else 0
       self.assertEqual(a_ragged_rank, b_ragged_rank)
 
   def assertRaggedAlmostEqual(self, a, b, places=7):
@@ -61,8 +62,8 @@ class RaggedTensorTestCase(test_util.TensorFlowTestCase):
     self.assertNestedListAlmostEqual(a_list, b_list, places, context='value')
 
     if not (isinstance(a, (list, tuple)) or isinstance(b, (list, tuple))):
-      a_ragged_rank = a.ragged_rank if ragged.is_ragged(a) else 0
-      b_ragged_rank = b.ragged_rank if ragged.is_ragged(b) else 0
+      a_ragged_rank = a.ragged_rank if ragged_tensor.is_ragged(a) else 0
+      b_ragged_rank = b.ragged_rank if ragged_tensor.is_ragged(b) else 0
       self.assertEqual(a_ragged_rank, b_ragged_rank)
 
   def assertNestedListAlmostEqual(self, a, b, places=7, context='value'):
@@ -79,7 +80,7 @@ class RaggedTensorTestCase(test_util.TensorFlowTestCase):
 
   def eval_to_list(self, tensor):
     value = self.evaluate(tensor)
-    if ragged.is_ragged(value):
+    if ragged_tensor.is_ragged(value):
       return value.to_list()
     elif isinstance(value, np.ndarray):
       return value.tolist()
@@ -87,8 +88,8 @@ class RaggedTensorTestCase(test_util.TensorFlowTestCase):
       return value
 
   def _eval_tensor(self, tensor):
-    if ragged.is_ragged(tensor):
-      return ragged.RaggedTensorValue(
+    if ragged_tensor.is_ragged(tensor):
+      return ragged_tensor_value.RaggedTensorValue(
           self._eval_tensor(tensor.values),
           self._eval_tensor(tensor.row_splits))
     else:
diff --git a/tensorflow/python/ops/ragged/ragged_tile_op_test.py b/tensorflow/python/ops/ragged/ragged_tile_op_test.py
index d3445571bff6c75e7a22e458bdf99d3886cd9614..8c03b166531c3ce07d7543677e70529413b37648 100644
--- a/tensorflow/python/ops/ragged/ragged_tile_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_tile_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for ragged.tile."""
+"""Tests for ragged_array_ops.tile."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -24,7 +24,8 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
@@ -189,7 +190,7 @@ class RaggedTileOpTest(ragged_test_util.RaggedTensorTestCase,
                      multiples,
                      expected,
                      ragged_rank=None):
-    rt = ragged.constant(rt_input, ragged_rank)
+    rt = ragged_factory_ops.constant(rt_input, ragged_rank)
 
     expected_shape = [
         None if dim is None else dim * multiple
@@ -203,7 +204,7 @@ class RaggedTileOpTest(ragged_test_util.RaggedTensorTestCase,
         const_multiples, shape=[len(multiples)])
 
     for multiples_tensor in (const_multiples, non_const_multiples):
-      tiled = ragged.tile(rt, multiples_tensor)
+      tiled = ragged_array_ops.tile(rt, multiples_tensor)
       self.assertEqual(tiled.ragged_rank, rt.ragged_rank)
       self.assertEqual(tiled.shape.ndims, rt.shape.ndims)
       if multiples_tensor is const_multiples:
@@ -213,7 +214,7 @@ class RaggedTileOpTest(ragged_test_util.RaggedTensorTestCase,
   def testRaggedTileWithTensorInput(self):
     # When the input is a `Tensor`, ragged_tile just delegates to tf.tile.
     dt = constant_op.constant([[1, 2], [3, 4]])
-    tiled = ragged.tile(dt, [3, 2])
+    tiled = ragged_array_ops.tile(dt, [3, 2])
     expected = [[1, 2, 1, 2], [3, 4, 3, 4],
                 [1, 2, 1, 2], [3, 4, 3, 4],
                 [1, 2, 1, 2], [3, 4, 3, 4]]  # pyformat: disable
diff --git a/tensorflow/python/ops/ragged/ragged_to_sparse_op_test.py b/tensorflow/python/ops/ragged/ragged_to_sparse_op_test.py
index 46d7a56a7c8e0fa7a008625314e30786ffbbfefe..92959a98bfeaa27f0db697656d51cf7e46b10327 100644
--- a/tensorflow/python/ops/ragged/ragged_to_sparse_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_to_sparse_op_test.py
@@ -25,7 +25,9 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_functional_ops
+from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
@@ -34,7 +36,7 @@ from tensorflow.python.platform import googletest
 class RaggedTensorToSparseOpTest(ragged_test_util.RaggedTensorTestCase):
 
   def testDocStringExample(self):
-    rt = ragged.constant([[1, 2, 3], [4], [], [5, 6]])
+    rt = ragged_factory_ops.constant([[1, 2, 3], [4], [], [5, 6]])
     st = self.evaluate(rt.to_sparse())
     self.assertAllEqual(st.indices,
                         [[0, 0], [0, 1], [0, 2], [1, 0], [3, 0], [3, 1]])
@@ -42,7 +44,8 @@ class RaggedTensorToSparseOpTest(ragged_test_util.RaggedTensorTestCase):
     self.assertAllEqual(st.dense_shape, [4, 3])
 
   def test2DRaggedTensorWithOneRaggedDimension(self):
-    rt = ragged.constant([['a', 'b'], ['c', 'd', 'e'], ['f'], [], ['g']])
+    rt = ragged_factory_ops.constant([['a', 'b'], ['c', 'd', 'e'], ['f'], [],
+                                      ['g']])
     st = self.evaluate(rt.to_sparse())
     self.assertAllEqual(
         st.indices, [[0, 0], [0, 1], [1, 0], [1, 1], [1, 2], [2, 0], [4, 0]])
@@ -50,9 +53,10 @@ class RaggedTensorToSparseOpTest(ragged_test_util.RaggedTensorTestCase):
     self.assertAllEqual(st.dense_shape, [5, 3])
 
   def test3DRaggedTensorWithOneRaggedDimension(self):
-    rt = ragged.constant([[[1, 2], [3, 4]], [[5, 6], [7, 8], [9, 10]],
-                          [[11, 12]], [], [[13, 14]]],
-                         ragged_rank=1)
+    rt = ragged_factory_ops.constant(
+        [[[1, 2], [3, 4]], [[5, 6], [7, 8], [9, 10]], [[11, 12]], [], [[13, 14]]
+        ],
+        ragged_rank=1)
     st = self.evaluate(rt.to_sparse())
     self.assertAllEqual(st.indices,
                         [[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 1, 1], [1, 0, 0],
@@ -63,7 +67,7 @@ class RaggedTensorToSparseOpTest(ragged_test_util.RaggedTensorTestCase):
     self.assertAllEqual(st.dense_shape, [5, 3, 2])
 
   def test4DRaggedTensorWithOneRaggedDimension(self):
-    rt = ragged.constant(
+    rt = ragged_factory_ops.constant(
         [[[[1, 2], [3, 4]], [[5, 6], [7, 8]]], [], [[[9, 10], [11, 12]]]],
         ragged_rank=1)
     st = self.evaluate(rt.to_sparse())
@@ -87,9 +91,10 @@ class RaggedTensorToSparseOpTest(ragged_test_util.RaggedTensorTestCase):
     self.assertAllEqual(st.dense_shape, [3, 2, 2, 2])
 
   def test4DRaggedTensorWithTwoRaggedDimensions(self):
-    rt = ragged.constant([[[[1, 2], [3, 4]], [[5, 6], [7, 8], [9, 10]]],
-                          [[[11, 12]], [], [[13, 14]]], []],
-                         ragged_rank=2)
+    rt = ragged_factory_ops.constant(
+        [[[[1, 2], [3, 4]], [[5, 6], [7, 8], [9, 10]]],
+         [[[11, 12]], [], [[13, 14]]], []],
+        ragged_rank=2)
     st = self.evaluate(rt.to_sparse())
     self.assertAllEqual(
         st.indices,
@@ -114,19 +119,20 @@ class RaggedTensorToSparseOpTest(ragged_test_util.RaggedTensorTestCase):
     self.assertAllEqual(st.dense_shape, [3, 3, 3, 2])
 
   def testShape(self):
-    rt = ragged.constant([[1, 2], [3, 4, 5], [6], [], [7]])
+    rt = ragged_factory_ops.constant([[1, 2], [3, 4, 5], [6], [], [7]])
     st = rt.to_sparse()
     self.assertEqual(st.indices.shape.as_list(), [7, 2])
     self.assertEqual(st.values.shape.as_list(), [7])
     self.assertEqual(st.dense_shape.shape.as_list(), [2])
 
-    rt = ragged.constant([[[1, 2]], [], [[3, 4]], []], ragged_rank=1)
+    rt = ragged_factory_ops.constant([[[1, 2]], [], [[3, 4]], []],
+                                     ragged_rank=1)
     st = rt.to_sparse()
     self.assertEqual(st.indices.shape.as_list(), [4, 3])
     self.assertEqual(st.values.shape.as_list(), [4])
     self.assertEqual(st.dense_shape.shape.as_list(), [3])
 
-    rt = ragged.constant([[[1], [2, 3, 4, 5, 6, 7]], [[]]])
+    rt = ragged_factory_ops.constant([[[1], [2, 3, 4, 5, 6, 7]], [[]]])
     st = rt.to_sparse()
     self.assertEqual(st.indices.shape.as_list(), [7, 3])
     self.assertEqual(st.values.shape.as_list(), [7])
@@ -138,17 +144,17 @@ class RaggedTensorToSparseOpTest(ragged_test_util.RaggedTensorTestCase):
     empty_vector = array_ops.placeholder_with_default(
         array_ops.zeros([0], dtypes.int64), shape=None)
 
-    bad_rt1 = ragged.RaggedTensor.from_row_splits(
+    bad_rt1 = ragged_tensor.RaggedTensor.from_row_splits(
         row_splits=[2, 3], values=[1, 2, 3])
     bad_split0 = r'First value of ragged splits must be 0.*'
     with self.assertRaisesRegexp(errors.InvalidArgumentError, bad_split0):
       self.evaluate(bad_rt1.to_sparse())
 
-    bad_rt2 = ragged.RaggedTensor.from_row_splits(
+    bad_rt2 = ragged_tensor.RaggedTensor.from_row_splits(
         row_splits=[0, 5], values=empty_vector)
-    bad_rt3 = ragged.RaggedTensor.from_row_splits(
+    bad_rt3 = ragged_tensor.RaggedTensor.from_row_splits(
         row_splits=[0, 1],
-        values=ragged.RaggedTensor.from_row_splits(
+        values=ragged_tensor.RaggedTensor.from_row_splits(
             row_splits=[0, 5], values=empty_vector))
     split_mismatch1_error = r'Final value of ragged splits must match.*'
     for rt in [bad_rt2, bad_rt3]:
@@ -156,16 +162,16 @@ class RaggedTensorToSparseOpTest(ragged_test_util.RaggedTensorTestCase):
                                    split_mismatch1_error):
         self.evaluate(rt.to_sparse())
 
-    bad_rt4 = ragged.RaggedTensor.from_row_splits(
+    bad_rt4 = ragged_tensor.RaggedTensor.from_row_splits(
         row_splits=[0, 5],
-        values=ragged.RaggedTensor.from_row_splits(
+        values=ragged_tensor.RaggedTensor.from_row_splits(
             row_splits=[0], values=empty_vector))
     split_mismatch2_error = r'Final value of ragged splits must match.*'
     with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                  split_mismatch2_error):
       self.evaluate(bad_rt4.to_sparse())
 
-    bad_rt5 = ragged.RaggedTensor.from_row_splits(
+    bad_rt5 = ragged_tensor.RaggedTensor.from_row_splits(
         row_splits=empty_vector, values=[])
     empty_splits_error = (r'ragged splits may not be empty.*')
     with self.assertRaisesRegexp(errors.InvalidArgumentError,
@@ -176,11 +182,11 @@ class RaggedTensorToSparseOpTest(ragged_test_util.RaggedTensorTestCase):
     if context.executing_eagerly():
       return
     # rt1.shape == rt2.shape == [2, (D2), (D3), 2].
-    rt1 = ragged.constant([[[[1.0, 2.0], [3.0, 4.0]], [[5.0, 6.0]]]],
-                          ragged_rank=2)
-    rt2 = ragged.constant([[[[9.0, 8.0], [7.0, 6.0]], [[5.0, 4.0]]]],
-                          ragged_rank=2)
-    rt = ragged.map_flat_values(math_ops.add, rt1, rt2 * 2.0)
+    rt1 = ragged_factory_ops.constant(
+        [[[[1.0, 2.0], [3.0, 4.0]], [[5.0, 6.0]]]], ragged_rank=2)
+    rt2 = ragged_factory_ops.constant(
+        [[[[9.0, 8.0], [7.0, 6.0]], [[5.0, 4.0]]]], ragged_rank=2)
+    rt = ragged_functional_ops.map_flat_values(math_ops.add, rt1, rt2 * 2.0)
     st = rt.to_sparse()
 
     g1, g2 = gradients_impl.gradients(st.values,
diff --git a/tensorflow/python/ops/ragged/ragged_to_tensor_op_test.py b/tensorflow/python/ops/ragged/ragged_to_tensor_op_test.py
index ffcc2be52e5538c6d99ee8bcb0ed5d368ac5ed42..ac75456813fc05f1ce74d2f5ea09283fec22de90 100644
--- a/tensorflow/python/ops/ragged/ragged_to_tensor_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_to_tensor_op_test.py
@@ -23,7 +23,7 @@ from absl.testing import parameterized
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
@@ -34,7 +34,7 @@ class RaggedTensorToTensorOpTest(ragged_test_util.RaggedTensorTestCase,
 
   def testDocStringExamples(self):
     """Example from ragged_to_tensor.__doc__."""
-    rt = ragged.constant([[9, 8, 7], [], [6, 5], [4]])
+    rt = ragged_factory_ops.constant([[9, 8, 7], [], [6, 5], [4]])
     dt = rt.to_tensor()
     self.assertAllEqual(dt, [[9, 8, 7], [0, 0, 0], [6, 5, 0], [4, 0, 0]])
 
@@ -100,7 +100,7 @@ class RaggedTensorToTensorOpTest(ragged_test_util.RaggedTensorTestCase,
                                ragged_rank=None,
                                default=None,
                                expected_shape=None):
-    rt = ragged.constant(rt_input, ragged_rank=ragged_rank)
+    rt = ragged_factory_ops.constant(rt_input, ragged_rank=ragged_rank)
     dt = rt.to_tensor(default)
     self.assertIsInstance(dt, ops.Tensor)
     self.assertEqual(rt.dtype, dt.dtype)
@@ -129,7 +129,7 @@ class RaggedTensorToTensorOpTest(ragged_test_util.RaggedTensorTestCase,
       },
   )
   def testError(self, rt_input, default, error, ragged_rank=None):
-    rt = ragged.constant(rt_input, ragged_rank=ragged_rank)
+    rt = ragged_factory_ops.constant(rt_input, ragged_rank=ragged_rank)
     with self.assertRaisesRegexp(error[0], error[1]):
       rt.to_tensor(default)
 
diff --git a/tensorflow/python/ops/ragged/ragged_util_test.py b/tensorflow/python/ops/ragged/ragged_util_test.py
index 72a4155930708a0e8eb5808807bf788c67de862f..ab5436a91cc8440373798c65bdac3648319316f3 100644
--- a/tensorflow/python/ops/ragged/ragged_util_test.py
+++ b/tensorflow/python/ops/ragged/ragged_util_test.py
@@ -92,8 +92,7 @@ class RaggedUtilTest(ragged_test_util.RaggedTensorTestCase,
   ])
   def testRepeat(self, data, repeats, expected, axis=None):
     result = ragged_util.repeat(data, repeats, axis)
-    with self.test_session():
-      self.assertAllEqual(result, expected)
+    self.assertAllEqual(result, expected)
 
   @parameterized.parameters([
       dict(mode=mode, **args)
@@ -158,8 +157,7 @@ class RaggedUtilTest(ragged_test_util.RaggedTensorTestCase,
       repeats = array_ops.placeholder_with_default(repeats, None)
 
     result = ragged_util.repeat(data, repeats, axis)
-    with self.test_session():
-      self.assertAllEqual(result, expected)
+    self.assertAllEqual(result, expected)
 
   @parameterized.parameters([
       dict(
diff --git a/tensorflow/python/ops/ragged/ragged_where_op.py b/tensorflow/python/ops/ragged/ragged_where_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..d60ee49ee8adb2e4b117f9009bd602ab36f84046
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_where_op.py
@@ -0,0 +1,166 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""where operation for RaggedTensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.ragged import ragged_concat_ops
+from tensorflow.python.ops.ragged import ragged_functional_ops
+from tensorflow.python.ops.ragged import ragged_gather_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+
+
+def where(condition, x=None, y=None, name=None):
+  """Return the elements, either from `x` or `y`, depending on the `condition`.
+
+  : If both `x` and `y` are `None`:
+    Returns the coordinates of true elements of `condition`. The coordinates
+    are returned in a 2-D tensor with shape
+    `[num_true_values, dim_size(condition)]`, where `result[i]` is the
+    coordinates of the `i`th true value (in row-major order).
+
+  : If both `x` and `y` are non-`None`:
+    Returns a tensor formed by selecting values from `x` where condition is
+    true, and from `y` when condition is false.  In particular:
+
+    : If `condition`, `x`, and `y` all have the same shape:
+
+      * `result[i1...iN] = x[i1...iN]` if `condition[i1...iN]` is true.
+      * `result[i1...iN] = y[i1...iN]` if `condition[i1...iN]` is false.
+
+    : Otherwise:
+
+      * `condition` must be a vector.
+      * `x` and `y` must have the same number of dimensions.
+      * The outermost dimensions of `condition`, `x`, and `y` must all have the
+        same size.
+      * `result[i] = x[i]` if `condition[i]` is true.
+      * `result[i] = y[i]` if `condition[i]` is false.
+
+  Args:
+    condition: A potentially ragged tensor of type `bool`
+    x: A potentially ragged tensor (optional).
+    y: A potentially ragged tensor (optional).  Must be specified if `x` is
+      specified.  Must have the same rank and type as `x`.
+    name: A name of the operation (optional)
+
+  Returns:
+    : If both `x` and `y` are `None`:
+      A `Tensor` with shape `(num_true, dim_size(condition))`.
+    : Otherwise:
+      A potentially ragged tensor with the same type, rank, and outermost
+      dimension size as `x` and `y`.
+      `result.ragged_rank = max(x.ragged_rank, y.ragged_rank)`.
+
+  Raises:
+    ValueError: When exactly one of `x` or `y` is non-`None`; or when
+      `condition`, `x`, and `y` have incompatible shapes.
+
+  #### Examples:
+    ```python
+    >>> # Coordinates where condition is true.
+    >>> condition = tf.ragged.constant_value(
+    ...     [[True, False, True], [False, True]])
+    >>> ragged.where(condition)
+    [[0, 0], [0, 2], [1, 1]]
+
+    >>> # Elementwise selection between x and y, based on condition.
+    >>> condition = tf.ragged.constant_value(
+    ...     [[True, False, True], [False, True]])
+    >>> x = tf.ragged.constant_value([['A', 'B', 'C'], ['D', 'E']])
+    >>> y = tf.ragged.constant_value([['a', 'b', 'c'], ['d', 'e']])
+    >>> ragged.where(condition, x, y)
+    [['A', 'b', 'C'], ['d', 'E']]
+
+    >>> # Row selection between x and y, based on condition.
+    >>> condition = [True, False]
+    >>> x = tf.ragged.constant_value([['A', 'B', 'C'], ['D', 'E']])
+    >>> y = tf.ragged.constant_value([['a', 'b', 'c'], ['d', 'e']])
+    >>> ragged.where(condition, x, y)
+    [['A', 'B', 'C'], ['d', 'e']]
+    ```
+  """
+  if (x is None) != (y is None):
+    raise ValueError('x and y must be either both None or both non-None')
+  with ops.name_scope('RaggedWhere', name, [condition, x, y]):
+    condition = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        condition, name='condition')
+    if x is None:
+      return _coordinate_where(condition)
+    else:
+      x = ragged_tensor.convert_to_tensor_or_ragged_tensor(x, name='x')
+      y = ragged_tensor.convert_to_tensor_or_ragged_tensor(y, name='y')
+      return _elementwise_where(condition, x, y)
+
+
+def _elementwise_where(condition, x, y):
+  """Ragged version of tf.where(condition, x, y)."""
+  condition_is_ragged = isinstance(condition, ragged_tensor.RaggedTensor)
+  x_is_ragged = isinstance(x, ragged_tensor.RaggedTensor)
+  y_is_ragged = isinstance(y, ragged_tensor.RaggedTensor)
+
+  if not (condition_is_ragged or x_is_ragged or y_is_ragged):
+    return array_ops.where(condition, x, y)
+
+  elif condition_is_ragged and x_is_ragged and y_is_ragged:
+    return ragged_functional_ops.map_flat_values(array_ops.where, condition, x,
+                                                 y)
+  elif not condition_is_ragged:
+    # Concatenate x and y, and then use `gather` to assemble the selected rows.
+    condition.shape.assert_has_rank(1)
+    x_nrows = _nrows(x)
+    x_and_y = ragged_concat_ops.concat([x, y], axis=0)
+    indices = array_ops.where(condition, math_ops.range(x_nrows),
+                              x_nrows + math_ops.range(_nrows(y)))
+    return ragged_gather_ops.gather(x_and_y, indices)
+
+  else:
+    raise ValueError('Input shapes do not match.')
+
+
+def _coordinate_where(condition):
+  """Ragged version of tf.where(condition)."""
+  if not isinstance(condition, ragged_tensor.RaggedTensor):
+    return array_ops.where(condition)
+
+  # The coordinate for each `true` value in condition.values.
+  selected_coords = _coordinate_where(condition.values)
+
+  # Convert the first index in each coordinate to a row index and column index.
+  first_index = selected_coords[:, 0]
+  selected_rows = array_ops.gather(condition.value_rowids(), first_index)
+  selected_row_starts = array_ops.gather(condition.row_splits, selected_rows)
+  selected_cols = first_index - selected_row_starts
+
+  # Assemble the row & column index with the indices for inner dimensions.
+  return array_ops.concat([
+      array_ops.expand_dims(selected_rows, 1),
+      array_ops.expand_dims(selected_cols, 1), selected_coords[:, 1:]
+  ],
+                          axis=1)
+
+
+def _nrows(rt_input, out_type=dtypes.int64, name=None):
+  if isinstance(rt_input, ragged_tensor.RaggedTensor):
+    return rt_input.nrows(out_type=out_type, name=name)
+  else:
+    with ops.name_scope(name, 'RaggedNRows', [rt_input]):
+      return array_ops.shape(rt_input, out_type=out_type)[0]
diff --git a/tensorflow/python/ops/ragged/ragged_where_op_test.py b/tensorflow/python/ops/ragged/ragged_where_op_test.py
index b3cd5a2debe0db0b1bac2b6396c78b9e94c3f671..e76a04072a5ae0f593a9897105962305a38c39bf 100644
--- a/tensorflow/python/ops/ragged/ragged_where_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_where_op_test.py
@@ -12,17 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for ragged.where."""
+"""Tests for ragged_array_ops.where."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-
 from absl.testing import parameterized
-
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.ops.ragged import ragged_where_op
 from tensorflow.python.platform import googletest
 
 
@@ -35,18 +34,24 @@ class RaggedWhereOpTest(ragged_test_util.RaggedTensorTestCase,
       # Docstring Examples
       #=========================================================================
       dict(  # shape=[D1, (D2)]
-          condition=ragged.constant_value([[True, False, True], [False, True]]),
+          condition=ragged_factory_ops.constant_value(
+              [[True, False, True], [False, True]]),
           expected=[[0, 0], [0, 2], [1, 1]]),
       dict(  # shape=[D1, (D2)]
-          condition=ragged.constant_value([[True, False, True], [False, True]]),
-          x=ragged.constant_value([['A', 'B', 'C'], ['D', 'E']]),
-          y=ragged.constant_value([['a', 'b', 'c'], ['d', 'e']]),
-          expected=ragged.constant_value([[b'A', b'b', b'C'], [b'd', b'E']])),
+          condition=ragged_factory_ops.constant_value(
+              [[True, False, True], [False, True]]),
+          x=ragged_factory_ops.constant_value(
+              [['A', 'B', 'C'], ['D', 'E']]),
+          y=ragged_factory_ops.constant_value(
+              [['a', 'b', 'c'], ['d', 'e']]),
+          expected=ragged_factory_ops.constant_value(
+              [[b'A', b'b', b'C'], [b'd', b'E']])),
       dict(  # shape=[D1, (D2)]
-          condition=ragged.constant_value([True, False]),
-          x=ragged.constant_value([['A', 'B', 'C'], ['D', 'E']]),
-          y=ragged.constant_value([['a', 'b', 'c'], ['d', 'e']]),
-          expected=ragged.constant_value([[b'A', b'B', b'C'], [b'd', b'e']])),
+          condition=ragged_factory_ops.constant_value([True, False]),
+          x=ragged_factory_ops.constant_value([['A', 'B', 'C'], ['D', 'E']]),
+          y=ragged_factory_ops.constant_value([['a', 'b', 'c'], ['d', 'e']]),
+          expected=ragged_factory_ops.constant_value(
+              [[b'A', b'B', b'C'], [b'd', b'e']])),
       #=========================================================================
       # Coordinate-retrieval mode
       #=========================================================================
@@ -57,24 +62,25 @@ class RaggedWhereOpTest(ragged_test_util.RaggedTensorTestCase,
           condition=[[True, False], [False, True]],
           expected=[[0, 0], [1, 1]]),
       dict(  # shape=[D1, (D2)]
-          condition=ragged.constant_value([[True, False, True], [False, True]]),
+          condition=ragged_factory_ops.constant_value(
+              [[True, False, True], [False, True]]),
           expected=[[0, 0], [0, 2], [1, 1]]),
       dict(  # shape=[D1, (D2), (D3)]
-          condition=ragged.constant_value([
+          condition=ragged_factory_ops.constant_value([
               [[True, False, True], [False, True]],
               [[True], [], [False], [False, True, False]]
           ]),
           expected=[[0, 0, 0], [0, 0, 2], [0, 1, 1],
                     [1, 0, 0], [1, 3, 1]]),
       dict(  # shape=[D1, (D2), D3]
-          condition=ragged.constant_value([
+          condition=ragged_factory_ops.constant_value([
               [[True, False], [False, True]],
               [[True, False], [False, False], [True, False], [False, True]]
           ], ragged_rank=1),
           expected=[[0, 0, 0], [0, 1, 1],
                     [1, 0, 0], [1, 2, 0], [1, 3, 1]]),
       dict(  # shape=[D1, (D2), (D3), (D4)]
-          condition=ragged.constant_value([
+          condition=ragged_factory_ops.constant_value([
               [[[], [True]]],
               [[[True, False, True], [False, True]],
                [[True], [], [False], [False, True, False]]]
@@ -101,44 +107,46 @@ class RaggedWhereOpTest(ragged_test_util.RaggedTensorTestCase,
           y=[['a', 'b'], ['d', 'e']],
           expected=[[b'A', b'b'], [b'd', b'E']]),
       dict(  # shape=[D1, (D2)]
-          condition=ragged.constant_value([[True, False, True], [False, True]]),
-          x=ragged.constant_value([['A', 'B', 'C'], ['D', 'E']]),
-          y=ragged.constant_value([['a', 'b', 'c'], ['d', 'e']]),
-          expected=ragged.constant_value([[b'A', b'b', b'C'], [b'd', b'E']])),
+          condition=ragged_factory_ops.constant_value(
+              [[True, False, True], [False, True]]),
+          x=ragged_factory_ops.constant_value([['A', 'B', 'C'], ['D', 'E']]),
+          y=ragged_factory_ops.constant_value([['a', 'b', 'c'], ['d', 'e']]),
+          expected=ragged_factory_ops.constant_value(
+              [[b'A', b'b', b'C'], [b'd', b'E']])),
       dict(  # shape=[D1, (D2), D3]
-          condition=ragged.constant_value([
+          condition=ragged_factory_ops.constant_value([
               [[True, False], [False, True]],
               [[True, False], [False, False], [True, False], [False, True]]
           ], ragged_rank=1),
-          x=ragged.constant_value([
+          x=ragged_factory_ops.constant_value([
               [['A', 'B'], ['C', 'D']],
               [['E', 'F'], ['G', 'H'], ['I', 'J'], ['K', 'L']]
           ], ragged_rank=1),
-          y=ragged.constant_value([
+          y=ragged_factory_ops.constant_value([
               [['a', 'b'], ['c', 'd']],
               [['e', 'f'], ['g', 'h'], ['i', 'j'], ['k', 'l']]
           ], ragged_rank=1),
-          expected=ragged.constant_value([
+          expected=ragged_factory_ops.constant_value([
               [[b'A', b'b'], [b'c', b'D']],
               [[b'E', b'f'], [b'g', b'h'], [b'I', b'j'], [b'k', b'L']]
           ], ragged_rank=1)),
       dict(  # shape=[D1, (D2), (D3), (D4)]
-          condition=ragged.constant_value([
+          condition=ragged_factory_ops.constant_value([
               [[[], [True]]],
               [[[True, False, True], [False, True]],
                [[True], [], [False], [False, True, False]]]
           ]),
-          x=ragged.constant_value([
+          x=ragged_factory_ops.constant_value([
               [[[], ['A']]],
               [[['B', 'C', 'D'], ['E', 'F']],
                [['G'], [], ['H'], ['I', 'J', 'K']]]
           ]),
-          y=ragged.constant_value([
+          y=ragged_factory_ops.constant_value([
               [[[], ['a']]],
               [[['b', 'c', 'd'], ['e', 'f']],
                [['g'], [], ['h'], ['i', 'j', 'k']]]
           ]),
-          expected=ragged.constant_value([
+          expected=ragged_factory_ops.constant_value([
               [[[], [b'A']]],
               [[[b'B', b'c', b'D'], [b'e', b'F']],
                [[b'G'], [], [b'h'], [b'i', b'J', b'k']]]
@@ -154,22 +162,25 @@ class RaggedWhereOpTest(ragged_test_util.RaggedTensorTestCase,
           expected=[[b'A', b'B'], [b'c', b'd'], [b'E', b'F']]),
       dict(  # shape=[D1, (D2)]
           condition=[True, False, True],
-          x=ragged.constant_value([['A', 'B', 'C'], ['D', 'E'], ['F', 'G']]),
-          y=ragged.constant_value([['a', 'b'], ['c'], ['d', 'e']]),
-          expected=ragged.constant_value([[b'A', b'B', b'C'], [b'c'],
-                                          [b'F', b'G']])),
+          x=ragged_factory_ops.constant_value(
+              [['A', 'B', 'C'], ['D', 'E'], ['F', 'G']]),
+          y=ragged_factory_ops.constant_value(
+              [['a', 'b'], ['c'], ['d', 'e']]),
+          expected=ragged_factory_ops.constant_value(
+              [[b'A', b'B', b'C'], [b'c'], [b'F', b'G']])),
       dict(  # shape=[D1, (D2), (D3), (D4)]
-          condition=ragged.constant_value([True, False]),
-          x=ragged.constant_value([
+          condition=ragged_factory_ops.constant_value([True, False]),
+          x=ragged_factory_ops.constant_value([
               [[[], ['A']]],
               [[['B', 'C', 'D'], ['E', 'F']],
                [['G'], [], ['H'], ['I', 'J', 'K']]]
           ]),
-          y=ragged.constant_value([[[['a']]], [[['b']]]]),
-          expected=ragged.constant_value([[[[], [b'A']]], [[[b'b']]]])),
+          y=ragged_factory_ops.constant_value([[[['a']]], [[['b']]]]),
+          expected=ragged_factory_ops.constant_value(
+              [[[[], [b'A']]], [[[b'b']]]])),
   ])   # pyformat: disable
   def testRaggedWhere(self, condition, expected, x=None, y=None):
-    result = ragged.where(condition, x, y)
+    result = ragged_where_op.where(condition, x, y)
     self.assertRaggedEqual(result, expected)
 
   @parameterized.parameters([
@@ -179,15 +190,16 @@ class RaggedWhereOpTest(ragged_test_util.RaggedTensorTestCase,
           error=ValueError,
           message='x and y must be either both None or both non-None'),
       dict(
-          condition=ragged.constant_value([[True, False, True], [False, True]]),
-          x=ragged.constant_value([['A', 'B', 'C'], ['D', 'E']]),
+          condition=ragged_factory_ops.constant_value([[True, False, True],
+                                                       [False, True]]),
+          x=ragged_factory_ops.constant_value([['A', 'B', 'C'], ['D', 'E']]),
           y=[['a', 'b'], ['d', 'e']],
           error=ValueError,
           message='Input shapes do not match.'),
   ])
   def testRaggedWhereErrors(self, condition, error, message, x=None, y=None):
     with self.assertRaisesRegexp(error, message):
-      ragged.where(condition, x, y)
+      ragged_where_op.where(condition, x, y)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/segment_id_ops.py b/tensorflow/python/ops/ragged/segment_id_ops.py
index fa2970c3e75af36d3f042ab23ab70c8d2cdb36ca..31e26e7c9d8b913e538b284654aa80d80c55150b 100644
--- a/tensorflow/python/ops/ragged/segment_id_ops.py
+++ b/tensorflow/python/ops/ragged/segment_id_ops.py
@@ -25,12 +25,14 @@ from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.ragged import ragged_util
+from tensorflow.python.util.tf_export import tf_export
 
 
 # For background on "segments" and "segment ids", see:
-# https://www.tensorflow.org/api_guides/python/math_ops#Segmentation
+# https://www.tensorflow.org/api_docs/python/tf/math#Segmentation
+@tf_export("ragged.row_splits_to_segment_ids")
 def row_splits_to_segment_ids(splits, name=None):
-  """Generates the segmentation corresponding to a RaggedTensor `splits` vector.
+  """Generates the segmentation corresponding to a RaggedTensor `row_splits`.
 
   Returns an integer vector `segment_ids`, where `segment_ids[i] == j` if
   `splits[j] <= i < splits[j+1]`.  Example:
@@ -62,9 +64,10 @@ def row_splits_to_segment_ids(splits, name=None):
 
 
 # For background on "segments" and "segment ids", see:
-# https://www.tensorflow.org/api_guides/python/math_ops#Segmentation
+# https://www.tensorflow.org/api_docs/python/tf/math#Segmentation
+@tf_export("ragged.segment_ids_to_row_splits")
 def segment_ids_to_row_splits(segment_ids, num_segments=None, name=None):
-  """Generates the RaggedTensor `splits` vector corresponding to a segmentation.
+  """Generates the RaggedTensor `row_splits` corresponding to a segmentation.
 
   Returns an integer vector `splits`, where `splits[0] = 0` and
   `splits[i] = splits[i-1] + count(segment_ids==i)`.  Example:
diff --git a/tensorflow/python/ops/raw_ops_test.py b/tensorflow/python/ops/raw_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..82ea137354e9fcbe0e2c9a88253f7f1e9b75722a
--- /dev/null
+++ b/tensorflow/python/ops/raw_ops_test.py
@@ -0,0 +1,42 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Raw ops tests."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RawOpsTest(test.TestCase):
+
+  def testSimple(self):
+
+    with self.assertRaises(TypeError):
+      _ = gen_math_ops.Add(1., 1.)
+
+    x = constant_op.constant(1)
+    self.assertEqual([2], self.evaluate(gen_math_ops.Add(x=x, y=x)))
+
+
+if __name__ == '__main__':
+  ops.enable_eager_execution()
+  test.main()
diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index 1066b357b43bb60d5e5b078846fcd82e12e941c3..afc9e978d0519645d448c227637544d893c0bd38 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -20,6 +20,7 @@ from __future__ import division
 from __future__ import print_function
 
 import contextlib
+import functools
 
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.core.framework import variable_pb2
@@ -36,13 +37,15 @@ from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gen_resource_variable_ops
 from tensorflow.python.ops import gen_state_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_resource_variable_ops import *
 # pylint: enable=wildcard-import
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import compat
+from tensorflow.python.util.deprecation import deprecated
 
 
 def get_resource_handle_data(graph_op):
@@ -55,8 +58,118 @@ def get_resource_handle_data(graph_op):
       compat.as_bytes(handle_data))
 
 
-def eager_safe_variable_handle(shape, dtype, shared_name, name, graph_mode):
-  """Creates a variable handle with information to do shape inference."""
+def get_eager_safe_handle_data(handle):
+  """Get the data handle from the Tensor `handle`."""
+  assert isinstance(handle, ops.Tensor)
+
+  if isinstance(handle, ops.EagerTensor):
+    return handle._handle_data  # pylint: disable=protected-access
+  else:
+    return get_resource_handle_data(handle)
+
+
+def _set_handle_shapes_and_types(tensor, handle_data, graph_mode):
+  """Sets the shape inference result HandleData on tensor.
+
+  Args:
+    tensor: A `Tensor` or `EagerTensor`.
+    handle_data: A `CppShapeInferenceResult.HandleData`.
+    graph_mode: A python bool.
+  """
+  tensor._handle_data = handle_data  # pylint: disable=protected-access
+  if not graph_mode:
+    return
+
+  # Not an EagerTensor, so a graph tensor.
+  shapes, types = zip(*[(pair.shape, pair.dtype)
+                        for pair in handle_data.shape_and_type])
+  ranks = [len(s.dim) if not s.unknown_rank else -1 for s in shapes]
+  shapes = [[d.size for d in s.dim]
+            if not s.unknown_rank else None for s in shapes]
+  pywrap_tensorflow.TF_GraphSetOutputHandleShapesAndTypes_wrapper(
+      tensor._op._graph._c_graph,  # pylint: disable=protected-access
+      tensor._as_tf_output(),  # pylint: disable=protected-access
+      shapes, ranks, types)
+
+
+def _combine_handle_data(handle, initial_value):
+  """Concats HandleData from tensors `handle` and `initial_value`.
+
+  Args:
+    handle: A `Tensor` of dtype `resource`.
+    initial_value: A `Tensor`.
+
+  Returns:
+    A `CppShapeInferenceResult.HandleData`.  If `initial_value` has dtype
+    `variant`, the `HandleData` contains the concatenation of the shape_and_type
+    from both `handle` and `initial_value`.
+
+  Raises:
+    RuntimeError: If handle, which was returned by VarHandleOp, either has
+      no handle data, or its len(handle_data.shape_and_type) != 1.
+  """
+  assert handle.dtype == dtypes.resource
+
+  variable_handle_data = get_eager_safe_handle_data(handle)
+
+  if initial_value.dtype != dtypes.variant:
+    return variable_handle_data
+
+  extra_handle_data = get_eager_safe_handle_data(initial_value)
+  if extra_handle_data is not None and extra_handle_data.is_set:
+    if (variable_handle_data is None
+        or not variable_handle_data.is_set
+        or len(variable_handle_data.shape_and_type) != 1):
+      raise RuntimeError(
+          "Expected VarHandleOp to return a length==1 shape_and_type, "
+          "but saw: '%s'" % (variable_handle_data,))
+    variable_handle_data.shape_and_type.extend(
+        extra_handle_data.shape_and_type)
+  return variable_handle_data
+
+
+def eager_safe_variable_handle(initial_value, shared_name, name, graph_mode):
+  """Creates a variable handle with information to do shape inference.
+
+  The shape and dtype are read from `initial_value` and stored in the returned
+  resource tensor's handle data.
+
+  If `initial_value.dtype == tf.variant`, we additionally extract the handle
+  data (if any) from `initial_value` and append it to the `handle_data`.
+  In this case, the returned tensor's handle data is in the form
+
+  ```
+  is_set: true
+  shape_and_type {
+    shape {
+      // initial_value.shape
+    }
+    dtype: DT_VARIANT
+  }
+  shape_and_type {
+    // handle_data(initial_value).shape_and_type[0]
+  }
+  shape_and_type {
+    // handle_data(initial_value).shape_and_type[1]
+  }
+  ...
+  ```
+
+  Ops that read from this tensor, such as `ReadVariableOp` and
+  `AssignVariableOp`, know that `handle_data(handle).shape_and_type[1:]`
+  correspond to the handle data of the variant(s) stored in the Variable.
+
+  Args:
+    initial_value: A `Tensor`.
+    shared_name: A string.
+    name: A string.
+    graph_mode: A python bool.
+
+  Returns:
+    The handle, a `Tensor` of type `resource`.
+  """
+  shape = initial_value.get_shape()
+  dtype = initial_value.dtype.base_dtype
   container = ops.get_default_graph()._container  # pylint: disable=protected-access
   if container is None:
     container = ""
@@ -64,35 +177,38 @@ def eager_safe_variable_handle(shape, dtype, shared_name, name, graph_mode):
                                                    shared_name=shared_name,
                                                    name=name,
                                                    container=container)
+
   if graph_mode:
-    handle._handle_data = get_resource_handle_data(handle)  # pylint: disable=protected-access
+    full_handle_data = _combine_handle_data(handle, initial_value)
+    _set_handle_shapes_and_types(handle, full_handle_data, graph_mode)
+    return handle
+  else:
+    # We do not want two distinct ResourceVariable objects for the same
+    # underlying resource in the runtime.
+    # When in eager mode, explicitly ensure so here. When in graph mode, it's
+    # ensured by always generating different variable names.
+    exists = gen_resource_variable_ops.var_is_initialized_op(handle)
+    if exists:
+      raise ValueError("variable object with name '%s' already created. Use "
+                       "get_variable() if reuse is desired." %
+                       shared_name)
+    with context.graph_mode(), ops.Graph().as_default() as graph:
+      h = gen_resource_variable_ops.var_handle_op(shape=shape, dtype=dtype,
+                                                  shared_name=shared_name,
+                                                  name=name,
+                                                  container=container)
+
+      # Tensor._handle_data contains information for the shape-inference code to
+      # know the shape and dtype of the variable pointed to by a handle. Since
+      # shape inference doesn't run in eager mode we copy this data here for
+      # when the handle is captured by an eager mode function.
+      # pylint: disable=protected-access
+      full_handle_data = _combine_handle_data(h, initial_value)
+      _set_handle_shapes_and_types(handle, full_handle_data, graph_mode)
+      # pylint: enable=protected-access
+    # Clean up op->graph->op reference cycles.
+    ops.dismantle_graph(graph)
     return handle
-
-  # We do not want two distinct ResourceVariable objects for the same
-  # underlying resource in the runtime.
-  # When in eager mode, explicitly ensure so here. When in graph mode, it's
-  # ensured by always generating different variable names.
-  exists = gen_resource_variable_ops.var_is_initialized_op(handle)
-  if exists:
-    raise ValueError("variable object with name '%s' already created. Use "
-                     "get_variable() if reuse is desired." %
-                     shared_name)
-  with context.graph_mode(), ops.Graph().as_default() as graph:
-    h = gen_resource_variable_ops.var_handle_op(shape=shape, dtype=dtype,
-                                                shared_name=shared_name,
-                                                name=name,
-                                                container=container)
-
-    # Tensor._handle_data contains information for the shape-inference code to
-    # know the shape and dtype of the variable pointed to by a handle. Since
-    # shape inference doesn't run in eager mode we copy this data here for when
-    # the handle is captured by an eager mode function.
-    # pylint: disable=protected-access
-    handle._handle_data = get_resource_handle_data(h)
-    # pylint: enable=protected-access
-  # Clean up op->graph->op reference cycles.
-  ops.dismantle_graph(graph)
-  return handle
 
 
 @contextlib.contextmanager
@@ -159,8 +275,19 @@ def shape_safe_assign_variable_handle(handle, shape, value, name=None):
                                                       name=name)
 
 
-# TODO(apassos) make this be variables.Variable
-class ResourceVariable(variables.RefVariable):
+def _maybe_set_handle_data(dtype, handle, tensor):
+  if dtype == dtypes.variant:
+    # For DT_VARIANT types, the handle's shape_and_type[1:] stores the
+    # variant's handle data.  Extract it.
+    handle_data = get_eager_safe_handle_data(handle)
+    if handle_data.is_set and len(handle_data.shape_and_type) > 1:
+      tensor._handle_data = (  # pylint: disable=protected-access
+          cpp_shape_inference_pb2.CppShapeInferenceResult.HandleData(
+              is_set=True,
+              shape_and_type=handle_data.shape_and_type[1:]))
+
+
+class ResourceVariable(variables.VariableV1):
   """Variable based on resource handles.
 
   See the [Variables How To](https://tensorflow.org/guide/variables)
@@ -217,19 +344,19 @@ class ResourceVariable(variables.RefVariable):
                initial_value=None,
                trainable=True,
                collections=None,
-               validate_shape=True,
+               validate_shape=True,  # pylint: disable=unused-argument
                caching_device=None,
                name=None,
                dtype=None,
                variable_def=None,
                import_scope=None,
-               constraint=None):
+               constraint=None,
+               distribute_strategy=None):
     """Creates a variable.
 
     Args:
       initial_value: A `Tensor`, or Python object convertible to a `Tensor`,
-        which is the initial value for the Variable. The initial value must have
-        a shape specified unless `validate_shape` is set to False. Can also be a
+        which is the initial value for the Variable. Can also be a
         callable with no argument that returns the initial value when called.
         (Note that initializer functions from init_ops.py must first be bound
          to a shape before being used here.)
@@ -262,6 +389,8 @@ class ResourceVariable(variables.RefVariable):
         variable and return the Tensor for the projected value
         (which must have the same shape). Constraints are not safe to
         use when doing asynchronous distributed training.
+      distribute_strategy: The tf.distribute.Strategy this variable is being
+        created inside of.
 
     Raises:
       ValueError: If the initial value is not specified, or does not have a
@@ -273,6 +402,7 @@ class ResourceVariable(variables.RefVariable):
     collections.
     @end_compatibility
     """
+    self._distribute_strategy = distribute_strategy
     if variable_def:
       if initial_value is not None:
         raise ValueError("variable_def and initial_value are mutually "
@@ -286,18 +416,24 @@ class ResourceVariable(variables.RefVariable):
           initial_value=initial_value,
           trainable=trainable,
           collections=collections,
-          validate_shape=validate_shape,
           caching_device=caching_device,
           name=name,
           dtype=dtype,
           constraint=constraint)
 
-  # pylint: disable=unused-argument
+  def __repr__(self):
+    if context.executing_eagerly() and not self._in_graph_mode:
+      return "<tf.Variable '%s' shape=%s dtype=%s, numpy=%s>" % (
+          self.name, self.get_shape(), self.dtype.name,
+          ops.numpy_text(self.read_value(), is_repr=True))
+    else:
+      return "<tf.Variable '%s' shape=%s dtype=%s>" % (
+          self.name, self.get_shape(), self.dtype.name)
+
   def _init_from_args(self,
                       initial_value=None,
                       trainable=True,
                       collections=None,
-                      validate_shape=True,
                       caching_device=None,
                       name=None,
                       dtype=None,
@@ -369,8 +505,8 @@ class ResourceVariable(variables.RefVariable):
     if constraint is not None and not callable(constraint):
       raise ValueError("The `constraint` argument must be a callable.")
 
-    if isinstance(initial_value, checkpointable.CheckpointInitialValue):
-      self._maybe_initialize_checkpointable()
+    if isinstance(initial_value, trackable.CheckpointInitialValue):
+      self._maybe_initialize_trackable()
       self._update_uid = initial_value.checkpoint_position.restore_uid
       initial_value = initial_value.wrapped_value
 
@@ -389,24 +525,27 @@ class ResourceVariable(variables.RefVariable):
         handle_name = ops._name_from_scope_name(name)
         if self._in_graph_mode:
           shared_name = handle_name
+          unique_id = shared_name
         else:
           # When in eager mode use a uid for the shared_name, to prevent
           # accidental sharing.
-          shared_name = "%s_%d" % (handle_name, ops.uid())
+          unique_id = "%s_%d" % (handle_name, ops.uid())
+          shared_name = context.shared_name()
         # Use attr_scope and device(None) to simulate the behavior of
         # colocate_with when the variable we want to colocate with doesn't
         # yet exist.
+        device_context_manager = (
+            ops.device if self._in_graph_mode else ops.NullContextmanager)
         attr = attr_value_pb2.AttrValue(
             list=attr_value_pb2.AttrValue.ListValue(
                 s=[compat.as_bytes("loc:@%s" % handle_name)]))
         with ops.get_default_graph()._attr_scope({"_class": attr}):
-          with ops.name_scope("Initializer"), ops.device(None):
+          with ops.name_scope("Initializer"), device_context_manager(None):
             initial_value = ops.convert_to_tensor(
                 initial_value() if init_from_fn else initial_value,
                 name="initial_value", dtype=dtype)
           self._handle = eager_safe_variable_handle(
-              shape=initial_value.get_shape(),
-              dtype=initial_value.dtype.base_dtype,
+              initial_value=initial_value,
               shared_name=shared_name,
               name=name,
               graph_mode=self._in_graph_mode)
@@ -420,7 +559,7 @@ class ResourceVariable(variables.RefVariable):
               "variable inside a loop or conditional, use a lambda as the "
               "initializer." % name)
         # pylint: enable=protected-access
-        self._unique_id = shared_name
+        self._unique_id = unique_id
         self._initial_value = initial_value if self._in_graph_mode else None
         self._handle_name = handle_name + ":0"
         self._dtype = initial_value.dtype.base_dtype
@@ -432,12 +571,15 @@ class ResourceVariable(variables.RefVariable):
                 gen_resource_variable_ops.var_is_initialized_op(self._handle))
           if initial_value is not None:
             with ops.name_scope("Assign") as n, ops.colocate_with(self._handle):
+              # pylint: disable=protected-access
               self._initializer_op = (
                   gen_resource_variable_ops.assign_variable_op(
                       self._handle,
-                      self._try_guard_against_uninitialized_dependencies(
+                      variables._try_guard_against_uninitialized_dependencies(
+                          name,
                           initial_value),
                       name=n))
+              # pylint: enable=protected-access
           with ops.name_scope("Read"), ops.colocate_with(self._handle):
             # Manually assign reads to the handle's device to avoid log
             # messages.
@@ -485,7 +627,6 @@ class ResourceVariable(variables.RefVariable):
       # all in graph mode.
       self._handle_deleter = EagerResourceDeleter(
           handle=self._handle, handle_device=self._handle.device)
-    self._cached_shape_as_list = None
 
   def _init_from_proto(self, variable_def, import_scope=None):
     """Initializes from `VariableDef` proto."""
@@ -543,7 +684,6 @@ class ResourceVariable(variables.RefVariable):
     self._caching_device = None
     self._dtype = dtypes.as_dtype(self._handle.op.get_attr("dtype"))
     self._constraint = None
-    self._cached_shape_as_list = None
 
   @contextlib.contextmanager
   def _assign_dependencies(self):
@@ -578,7 +718,8 @@ class ResourceVariable(variables.RefVariable):
         trainable=self._trainable,
         constraint=self._constraint,
         dtype=self._dtype,
-        name=self._shared_name + "_copy")
+        name=self._shared_name + "_copy",
+        distribute_strategy=self._distribute_strategy)
     memo[self._unique_id] = copied_variable
     return copied_variable
 
@@ -608,12 +749,9 @@ class ResourceVariable(variables.RefVariable):
     return self._shape
 
   def _shape_as_list(self):
-    if self._cached_shape_as_list:
-      return self._cached_shape_as_list
     if self.shape.ndims is None:
       return None
-    self._cached_shape_as_list = [dim.value for dim in self.shape.dims]
-    return self._cached_shape_as_list
+    return [dim.value for dim in self.shape.dims]
 
   def _shape_tuple(self):
     shape = self._shape_as_list()
@@ -673,6 +811,10 @@ class ResourceVariable(variables.RefVariable):
     """The op for this variable."""
     return self._handle.op
 
+  @property
+  def trainable(self):
+    return self._trainable
+
   def eval(self, session=None):
     """Evaluates and returns the value of this variable."""
     if context.executing_eagerly():
@@ -685,6 +827,7 @@ class ResourceVariable(variables.RefVariable):
     raise NotImplementedError(
         "numpy() is only available when eager execution is enabled.")
 
+  @deprecated(None, "Prefer Dataset.range instead.")
   def count_up_to(self, limit):
     """Increments this variable until it reaches `limit`.
 
@@ -708,22 +851,13 @@ class ResourceVariable(variables.RefVariable):
     return gen_state_ops.resource_count_up_to(self.handle, limit=limit,
                                               T=self.dtype)
 
-  def _set_save_slice_info(self, save_slice_info):
-    """Sets the slice info for this `ResourceVariable`.
-
-    Args:
-      save_slice_info: A `Variable.SaveSliceInfo` object.
-    """
-    self._save_slice_info = save_slice_info
-
-  def _get_save_slice_info(self):
-    return self._save_slice_info
-
   def _read_variable_op(self):
     if self.trainable:
       tape.variable_accessed(self)
     result = gen_resource_variable_ops.read_variable_op(self._handle,
                                                         self._dtype)
+    _maybe_set_handle_data(self._dtype, self._handle, result)
+
     if not context.executing_eagerly():
       # Note that if a control flow context is active the input of the read op
       # might not actually be the handle. This line bypasses it.
@@ -755,6 +889,17 @@ class ResourceVariable(variables.RefVariable):
         tape.variable_accessed(self)
       value = gen_resource_variable_ops.resource_gather(
           self._handle, indices, dtype=self._dtype, name=name)
+
+      if self._dtype == dtypes.variant:
+        # For DT_VARIANT types, the handle's shape_and_type[1:] stores the
+        # variant's handle data.  Extract it.
+        handle_data = get_eager_safe_handle_data(self._handle)
+        if handle_data.is_set and len(handle_data.shape_and_type) > 1:
+          value._handle_data = (  # pylint: disable=protected-access
+              cpp_shape_inference_pb2.CppShapeInferenceResult.HandleData(
+                  is_set=True,
+                  shape_and_type=handle_data.shape_and_type[1:]))
+
     return array_ops.identity(value)
 
   def to_proto(self, export_scope=None):
@@ -807,13 +952,6 @@ class ResourceVariable(variables.RefVariable):
     return ResourceVariable(
         variable_def=variable_def, import_scope=import_scope)
 
-  def _AsTensor(self):
-    return self.value()
-
-  def _ref(self):
-    """Unsupported."""
-    raise NotImplementedError("ResourceVariable does not implement _ref()")
-
   def set_shape(self, shape):
     """Unsupported."""
     raise NotImplementedError("ResourceVariable does not implement set_shape()")
@@ -921,7 +1059,15 @@ class ResourceVariable(variables.RefVariable):
     return assign_op
 
   def __reduce__(self):
-    return (ResourceVariable, (self.numpy(),))
+    # The implementation mirrors that of __deepcopy__.
+    return functools.partial(
+        ResourceVariable,
+        initial_value=self.numpy(),
+        trainable=self.trainable,
+        name=self._shared_name,
+        dtype=self.dtype,
+        constraint=self.constraint,
+        distribute_strategy=self._distribute_strategy), ()
 
   def scatter_sub(self, sparse_delta, use_locking=False, name=None):
     """Subtracts `IndexedSlices` from this variable.
@@ -986,6 +1132,55 @@ class ResourceVariable(variables.RefVariable):
         self.handle, sparse_delta.indices,
         ops.convert_to_tensor(sparse_delta.values, self.dtype), name=name))
 
+  def batch_scatter_update(self, sparse_delta, use_locking=False, name=None):
+    """Assigns `IndexedSlices` to this variable batch-wise.
+
+    Analogous to `batch_gather`. This assumes that this variable and the
+    sparse_delta IndexedSlices have a series of leading dimensions that are the
+    same for all of them, and the updates are performed on the last dimension of
+    indices. In other words, the dimensions should be the following:
+
+    `num_prefix_dims = sparse_delta.indices.ndims - 1`
+    `batch_dim = num_prefix_dims + 1`
+    `sparse_delta.updates.shape = sparse_delta.indices.shape + var.shape[
+         batch_dim:]`
+
+    where
+
+    `sparse_delta.updates.shape[:num_prefix_dims]`
+    `== sparse_delta.indices.shape[:num_prefix_dims]`
+    `== var.shape[:num_prefix_dims]`
+
+    And the operation performed can be expressed as:
+
+    `var[i_1, ..., i_n,
+         sparse_delta.indices[i_1, ..., i_n, j]] = sparse_delta.updates[
+            i_1, ..., i_n, j]`
+
+    When sparse_delta.indices is a 1D tensor, this operation is equivalent to
+    `scatter_update`.
+
+    To avoid this operation one can looping over the first `ndims` of the
+    variable and using `scatter_update` on the subtensors that result of slicing
+    the first dimension. This is a valid option for `ndims = 1`, but less
+    efficient than this implementation.
+
+    Args:
+      sparse_delta: `IndexedSlices` to be assigned to this variable.
+      use_locking: If `True`, use locking during the operation.
+      name: the name of the operation.
+
+    Returns:
+      A `Tensor` that will hold the new value of this variable after
+      the scattered subtraction has completed.
+
+    Raises:
+      ValueError: if `sparse_delta` is not an `IndexedSlices`.
+    """
+    return self._lazy_read(state_ops.batch_scatter_update(
+        self, sparse_delta.indices, sparse_delta.values,
+        use_locking=use_locking, name=name))
+
   def scatter_nd_sub(self, indices, updates, name=None):
     """Applies sparse subtraction to individual values or slices in a Variable.
 
@@ -1170,8 +1365,10 @@ class ResourceVariable(variables.RefVariable):
 
   def _dense_var_to_tensor(self, dtype=None, name=None, as_ref=False):
     del name
-    if dtype is not None and dtype != self.dtype:
-      return NotImplemented
+    if dtype is not None and not dtype.is_compatible_with(self.dtype):
+      raise ValueError(
+          "Incompatible type conversion requested to type {!r} for variable "
+          "of type {!r}".format(dtype.name, self.dtype.name))
     if as_ref:
       return self.read_value().op.inputs[0]
     else:
@@ -1223,6 +1420,12 @@ def _dense_var_to_tensor(var, dtype=None, name=None, as_ref=False):
   return var._dense_var_to_tensor(dtype=dtype, name=name, as_ref=as_ref)  # pylint: disable=protected-access
 
 
+# Register a conversion function which reads the value of the variable,
+# allowing instances of the class to be used as tensors.
+ops.register_tensor_conversion_function(ResourceVariable, _dense_var_to_tensor)
+ops.register_dense_tensor_like_type(ResourceVariable)
+
+
 class _UnreadVariable(ResourceVariable):
   """Represents a future for a read of a variable.
 
@@ -1271,19 +1474,18 @@ class _UnreadVariable(ResourceVariable):
 
   def _read_variable_op(self):
     with ops.control_dependencies([self._parent_op]):
-      return gen_resource_variable_ops.read_variable_op(self._handle,
-                                                        self._dtype)
+      result = gen_resource_variable_ops.read_variable_op(self._handle,
+                                                          self._dtype)
+      _maybe_set_handle_data(self._dtype, self._handle, result)
+      return result
 
-  def set_shape(self, shape):
-    self._shape = shape
-    self._cached_shape_as_list = None
 
   @property
   def op(self):
     """The op for this variable."""
     return self._parent_op
 
-ops.register_tensor_conversion_function(_UnreadVariable, _dense_var_to_tensor)
+
 ops.register_dense_tensor_like_type(_UnreadVariable)
 
 
@@ -1361,15 +1563,12 @@ class _MixedPrecisionVariable(ResourceVariable):
     with ops.colocate_with(self._handle):
       res = gen_resource_variable_ops.read_variable_op(self._handle,
                                                        self._dtype)
+      _maybe_set_handle_data(self._dtype, self._handle, res)
       if self._read_dtype != self._dtype:
         return math_ops.cast(res, self._read_dtype)
       else:
         return res
 
-  def set_shape(self, shape):
-    self._shape = shape
-    self._cached_shape_as_list = None
-
   @property
   def op(self):
     """The op for this variable."""
@@ -1382,29 +1581,15 @@ class _MixedPrecisionVariable(ResourceVariable):
 
   def _dense_var_to_tensor(self, dtype=None, name=None, as_ref=False):
     del name
-    dtype = dtype or self.read_dtype
-    if dtype != self.read_dtype or as_ref:
+    if (dtype is not None and
+        not dtype.is_compatible_with(self.read_dtype) or as_ref):
       return NotImplemented
-    else:
-      res = self.value()
-    return res
+    return self.value()
 
   def _should_act_as_resource_variable(self):
     """To pass resource_variable_ops.is_resource_variable check."""
     pass
 
-# Register a conversion function which reads the value of the variable,
-# allowing instances of the class to be used as tensors.
-
-# Note: registering for Variable after ResourceVariable because inheritance will
-# otherwise lead to the wrong behavior.
-ops.register_tensor_conversion_function(ResourceVariable, _dense_var_to_tensor)
-ops.register_tensor_conversion_function(
-    variables.Variable, variables.Variable._TensorConversionFunction)  # pylint: disable=protected-access
-
-# pylint: disable=protected-access
-ops.register_dense_tensor_like_type(ResourceVariable)
-
 
 @ops.RegisterGradient("ReadVariableOp")
 def _ReadGrad(_, grad):
@@ -1499,7 +1684,7 @@ def copy_to_graph_uninitialized(var):
       constraint=var._constraint,
       dtype=var.dtype,
       name=var._shared_name)
-  new_variable._maybe_initialize_checkpointable()
+  new_variable._maybe_initialize_trackable()
   # pylint: enable=protected-access
   return new_variable
 
diff --git a/tensorflow/python/ops/rnn_cell_impl.py b/tensorflow/python/ops/rnn_cell_impl.py
index ffc45619a74e9b527047f3e55e94664581cb6591..cb9377df363351dccffd1155a14bdf0a9f4abcfc 100644
--- a/tensorflow/python/ops/rnn_cell_impl.py
+++ b/tensorflow/python/ops/rnn_cell_impl.py
@@ -50,7 +50,7 @@ from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import nest
 from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.tf_export import tf_export
@@ -63,6 +63,15 @@ _WEIGHTS_VARIABLE_NAME = "kernel"
 ASSERT_LIKE_RNNCELL_ERROR_REGEXP = "is not an RNNCell"
 
 
+def _hasattr(obj, attr_name):
+  try:
+    getattr(obj, attr_name)
+  except AttributeError:
+    return False
+  else:
+    return True
+
+
 def assert_like_rnncell(cell_name, cell):
   """Raises a TypeError if cell is not like an RNNCell.
 
@@ -79,9 +88,9 @@ def assert_like_rnncell(cell_name, cell):
     TypeError: A human-friendly exception.
   """
   conditions = [
-      hasattr(cell, "output_size"),
-      hasattr(cell, "state_size"),
-      hasattr(cell, "get_initial_state") or hasattr(cell, "zero_state"),
+      _hasattr(cell, "output_size"),
+      _hasattr(cell, "state_size"),
+      _hasattr(cell, "get_initial_state") or _hasattr(cell, "zero_state"),
       callable(cell),
   ]
   errors = [
@@ -316,7 +325,7 @@ class RNNCell(base_layer.Layer):
     # zeros, especially when eager execution is enabled.
     state_size = self.state_size
     is_eager = context.executing_eagerly()
-    if is_eager and hasattr(self, "_last_zero_state"):
+    if is_eager and _hasattr(self, "_last_zero_state"):
       (last_state_size, last_batch_size, last_dtype,
        last_output) = getattr(self, "_last_zero_state")
       if (last_batch_size == batch_size and
@@ -405,6 +414,7 @@ class BasicRNNCell(LayerRNNCell):
                **kwargs):
     super(BasicRNNCell, self).__init__(
         _reuse=reuse, name=name, dtype=dtype, **kwargs)
+    _check_supported_dtypes(self.dtype)
     if context.executing_eagerly() and context.num_gpus() > 0:
       logging.warn("%s: Note that this cell is not optimized for performance. "
                    "Please use tf.contrib.cudnn_rnn.CudnnRNNTanh for better "
@@ -432,6 +442,7 @@ class BasicRNNCell(LayerRNNCell):
     if inputs_shape[-1] is None:
       raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s"
                        % str(inputs_shape))
+    _check_supported_dtypes(self.dtype)
 
     input_depth = inputs_shape[-1]
     self._kernel = self.add_variable(
@@ -446,7 +457,7 @@ class BasicRNNCell(LayerRNNCell):
 
   def call(self, inputs, state):
     """Most basic RNN: output = new_state = act(W * input + U * state + B)."""
-
+    _check_rnn_cell_input_dtypes([inputs, state])
     gate_inputs = math_ops.matmul(
         array_ops.concat([inputs, state], 1), self._kernel)
     gate_inputs = nn_ops.bias_add(gate_inputs, self._bias)
@@ -502,6 +513,7 @@ class GRUCell(LayerRNNCell):
                **kwargs):
     super(GRUCell, self).__init__(
         _reuse=reuse, name=name, dtype=dtype, **kwargs)
+    _check_supported_dtypes(self.dtype)
 
     if context.executing_eagerly() and context.num_gpus() > 0:
       logging.warn("%s: Note that this cell is not optimized for performance. "
@@ -531,7 +543,7 @@ class GRUCell(LayerRNNCell):
     if inputs_shape[-1] is None:
       raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s"
                        % str(inputs_shape))
-
+    _check_supported_dtypes(self.dtype)
     input_depth = inputs_shape[-1]
     self._gate_kernel = self.add_variable(
         "gates/%s" % _WEIGHTS_VARIABLE_NAME,
@@ -560,6 +572,7 @@ class GRUCell(LayerRNNCell):
 
   def call(self, inputs, state):
     """Gated recurrent unit (GRU) with nunits cells."""
+    _check_rnn_cell_input_dtypes([inputs, state])
 
     gate_inputs = math_ops.matmul(
         array_ops.concat([inputs, state], 1), self._gate_kernel)
@@ -675,6 +688,7 @@ class BasicLSTMCell(LayerRNNCell):
     """
     super(BasicLSTMCell, self).__init__(
         _reuse=reuse, name=name, dtype=dtype, **kwargs)
+    _check_supported_dtypes(self.dtype)
     if not state_is_tuple:
       logging.warn("%s: Using a concatenated state is slower and will soon be "
                    "deprecated.  Use state_is_tuple=True.", self)
@@ -708,7 +722,7 @@ class BasicLSTMCell(LayerRNNCell):
     if inputs_shape[-1] is None:
       raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s"
                        % str(inputs_shape))
-
+    _check_supported_dtypes(self.dtype)
     input_depth = inputs_shape[-1]
     h_depth = self._num_units
     self._kernel = self.add_variable(
@@ -736,6 +750,8 @@ class BasicLSTMCell(LayerRNNCell):
         `LSTMStateTuple` or a concatenated state, depending on
         `state_is_tuple`).
     """
+    _check_rnn_cell_input_dtypes([inputs, state])
+
     sigmoid = math_ops.sigmoid
     one = constant_op.constant(1, dtype=dtypes.int32)
     # Parameters of gates are concatenated into one multiply for efficiency.
@@ -858,6 +874,7 @@ class LSTMCell(LayerRNNCell):
     """
     super(LSTMCell, self).__init__(
         _reuse=reuse, name=name, dtype=dtype, **kwargs)
+    _check_supported_dtypes(self.dtype)
     if not state_is_tuple:
       logging.warn("%s: Using a concatenated state is slower and will soon be "
                    "deprecated.  Use state_is_tuple=True.", self)
@@ -913,7 +930,7 @@ class LSTMCell(LayerRNNCell):
     if inputs_shape[-1] is None:
       raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s"
                        % str(inputs_shape))
-
+    _check_supported_dtypes(self.dtype)
     input_depth = inputs_shape[-1]
     h_depth = self._num_units if self._num_proj is None else self._num_proj
     maybe_partitioner = (
@@ -979,6 +996,8 @@ class LSTMCell(LayerRNNCell):
       ValueError: If input size cannot be inferred from inputs via
         static shape inference.
     """
+    _check_rnn_cell_input_dtypes([inputs, state])
+
     num_proj = self._num_units if self._num_proj is None else self._num_proj
     sigmoid = math_ops.sigmoid
 
@@ -1066,8 +1085,107 @@ def _default_dropout_state_filter_visitor(substate):
   return True
 
 
-@tf_export("nn.rnn_cell.DropoutWrapper")
-class DropoutWrapper(RNNCell):
+class _RNNCellWrapperV1(RNNCell):
+  """Base class for cells wrappers V1 compatibility.
+
+  This class along with `_RNNCellWrapperV2` allows to define cells wrappers that
+  are compatible with V1 and V2, and defines helper methods for this purpose.
+  """
+
+  def __init__(self, cell):
+    super(_RNNCellWrapperV1, self).__init__()
+    self._cell = cell
+    if isinstance(cell, trackable.Trackable):
+      self._track_trackable(self._cell, name="cell")
+
+  def _call_wrapped_cell(self, inputs, state, cell_call_fn, **kwargs):
+    """Calls the wrapped cell and performs the wrapping logic.
+
+    This method is called from the wrapper's `call` or `__call__` methods.
+
+    Args:
+      inputs: A tensor with wrapped cell's input.
+      state: A tensor or tuple of tensors with wrapped cell's state.
+      cell_call_fn: Wrapped cell's method to use for step computation (cell's
+        `__call__` or 'call' method).
+      **kwargs: Additional arguments.
+
+    Returns:
+      A pair containing:
+      - Output: A tensor with cell's output.
+      - New state: A tensor or tuple of tensors with new wrapped cell's state.
+    """
+    raise NotImplementedError
+
+  def __call__(self, inputs, state, scope=None):
+    """Runs the RNN cell step computation.
+
+    We assume that the wrapped RNNCell is being built within its `__call__`
+    method. We directly use the wrapped cell's `__call__` in the overridden
+    wrapper `__call__` method.
+
+    This allows to use the wrapped cell and the non-wrapped cell equivalently
+    when using `__call__`.
+
+    Args:
+      inputs: A tensor with wrapped cell's input.
+      state: A tensor or tuple of tensors with wrapped cell's state.
+      scope: VariableScope for the subgraph created in the wrapped cells'
+        `__call__`.
+
+    Returns:
+      A pair containing:
+
+      - Output: A tensor with cell's output.
+      - New state: A tensor or tuple of tensors with new wrapped cell's state.
+    """
+    return self._call_wrapped_cell(
+        inputs, state, cell_call_fn=self._cell.__call__, scope=scope)
+
+
+class _RNNCellWrapperV2(LayerRNNCell, _RNNCellWrapperV1):
+  """Base class for cells wrappers V2 compatibility.
+
+  This class along with `_RNNCellWrapperV1` allows to define cells wrappers that
+  are compatible with V1 and V2, and defines helper methods for this purpose.
+  """
+
+  def __init__(self, *args, **kwargs):
+    super(_RNNCellWrapperV2, self).__init__(*args, **kwargs)
+    self._layers = [self._cell]
+
+  def call(self, inputs, state, **kwargs):
+    """Runs the RNN cell step computation.
+
+    When `call` is being used, we assume that the wrapper object has been built,
+    and therefore the wrapped cells has been built via its `build` method and
+    its `call` method can be used directly.
+
+    This allows to use the wrapped cell and the non-wrapped cell equivalently
+    when using `call` and `build`.
+
+    Args:
+      inputs: A tensor with wrapped cell's input.
+      state: A tensor or tuple of tensors with wrapped cell's state.
+      **kwargs: Additional arguments passed to the wrapped cell's `call`.
+
+    Returns:
+      A pair containing:
+
+      - Output: A tensor with cell's output.
+      - New state: A tensor or tuple of tensors with new wrapped cell's state.
+    """
+    return self._call_wrapped_cell(
+        inputs, state, cell_call_fn=self._cell.call, **kwargs)
+
+  def build(self, inputs_shape):
+    """Builds the wrapped cell."""
+    self._cell.build(inputs_shape)
+    self.built = True
+
+
+@tf_export(v1=["nn.rnn_cell.DropoutWrapper"])
+class DropoutWrapper(_RNNCellWrapperV1):
   """Operator adding dropout to inputs and outputs of the given cell."""
 
   def __init__(self, cell, input_keep_prob=1.0, output_keep_prob=1.0,
@@ -1137,7 +1255,7 @@ class DropoutWrapper(RNNCell):
         but not `callable`.
       ValueError: if any of the keep_probs are not between 0 and 1.
     """
-    super(DropoutWrapper, self).__init__()
+    super(DropoutWrapper, self).__init__(cell)
     assert_like_rnncell("cell", cell)
 
     if (dropout_state_filter_visitor is not None
@@ -1162,10 +1280,7 @@ class DropoutWrapper(RNNCell):
         else:
           setattr(self, "_%s" % attr, tensor_prob)
 
-    # Set cell, variational_recurrent, seed before running the code below
-    self._cell = cell
-    if isinstance(cell, checkpointable.CheckpointableBase):
-      self._track_checkpointable(self._cell, name="cell")
+    # Set variational_recurrent, seed before running the code below
     self._variational_recurrent = variational_recurrent
     self._seed = seed
 
@@ -1272,8 +1387,22 @@ class DropoutWrapper(RNNCell):
           shallow_filtered_substructure, dropout,
           *[shallow_filtered_substructure, values, recurrent_noise])
 
-  def __call__(self, inputs, state, scope=None):
-    """Run the cell with the declared dropouts."""
+  def _call_wrapped_cell(self, inputs, state, cell_call_fn, **kwargs):
+    """Runs the wrapped cell and applies dropout.
+
+    Args:
+      inputs: A tensor with wrapped cell's input.
+      state: A tensor or tuple of tensors with wrapped cell's state.
+      cell_call_fn: Wrapped cell's method to use for step computation (cell's
+        `__call__` or 'call' method).
+      **kwargs: Additional arguments.
+
+    Returns:
+      A pair containing:
+
+      - Output: A tensor with cell's output.
+      - New state: A tensor or tuple of tensors with new wrapped cell's state.
+    """
     def _should_dropout(p):
       return (not isinstance(p, float)) or p < 1
 
@@ -1281,7 +1410,7 @@ class DropoutWrapper(RNNCell):
       inputs = self._dropout(inputs, "input",
                              self._recurrent_input_noise,
                              self._input_keep_prob)
-    output, new_state = self._cell(inputs, state, scope=scope)
+    output, new_state = cell_call_fn(inputs, state, **kwargs)
     if _should_dropout(self._state_keep_prob):
       # Identify which subsets of the state to perform dropout on and
       # which ones to keep.
@@ -1298,8 +1427,96 @@ class DropoutWrapper(RNNCell):
     return output, new_state
 
 
-@tf_export("nn.rnn_cell.ResidualWrapper")
-class ResidualWrapper(RNNCell):
+@tf_export("rnn.DropoutWrapper", v1=[])
+class DropoutWrapperV2(_RNNCellWrapperV2, DropoutWrapper):
+  """Operator adding dropout to inputs and outputs of the given cell."""
+
+  def __init__(self, cell, input_keep_prob=1.0, output_keep_prob=1.0,
+               state_keep_prob=1.0, variational_recurrent=False,
+               input_size=None, dtype=None, seed=None,
+               dropout_state_filter_visitor=None):
+    """Create a cell with added input, state, and/or output dropout.
+
+    If `variational_recurrent` is set to `True` (**NOT** the default behavior),
+    then the same dropout mask is applied at every step, as described in:
+
+    Y. Gal, Z Ghahramani.  "A Theoretically Grounded Application of Dropout in
+    Recurrent Neural Networks".  https://arxiv.org/abs/1512.05287
+
+    Otherwise a different dropout mask is applied at every time step.
+
+    Note, by default (unless a custom `dropout_state_filter` is provided),
+    the memory state (`c` component of any `LSTMStateTuple`) passing through
+    a `DropoutWrapper` is never modified.  This behavior is described in the
+    above article.
+
+    Runs initialization in Keras style scope to use Keras-style variable
+    management.
+
+    Args:
+      cell: a LayerRNNCell, a projection to output_size is added to it.
+      input_keep_prob: unit Tensor or float between 0 and 1, input keep
+        probability; if it is constant and 1, no input dropout will be added.
+      output_keep_prob: unit Tensor or float between 0 and 1, output keep
+        probability; if it is constant and 1, no output dropout will be added.
+      state_keep_prob: unit Tensor or float between 0 and 1, output keep
+        probability; if it is constant and 1, no output dropout will be added.
+        State dropout is performed on the outgoing states of the cell.
+        **Note** the state components to which dropout is applied when
+        `state_keep_prob` is in `(0, 1)` are also determined by
+        the argument `dropout_state_filter_visitor` (e.g. by default dropout
+        is never applied to the `c` component of an `LSTMStateTuple`).
+      variational_recurrent: Python bool.  If `True`, then the same
+        dropout pattern is applied across all time steps per run call.
+        If this parameter is set, `input_size` **must** be provided.
+      input_size: (optional) (possibly nested tuple of) `TensorShape` objects
+        containing the depth(s) of the input tensors expected to be passed in to
+        the `DropoutWrapper`.  Required and used **iff**
+         `variational_recurrent = True` and `input_keep_prob < 1`.
+      dtype: (optional) The `dtype` of the input, state, and output tensors.
+        Required and used **iff** `variational_recurrent = True`.
+      seed: (optional) integer, the randomness seed.
+      dropout_state_filter_visitor: (optional), default: (see below).  Function
+        that takes any hierarchical level of the state and returns
+        a scalar or depth=1 structure of Python booleans describing
+        which terms in the state should be dropped out.  In addition, if the
+        function returns `True`, dropout is applied across this sublevel.  If
+        the function returns `False`, dropout is not applied across this entire
+        sublevel.
+        Default behavior: perform dropout on all terms except the memory (`c`)
+        state of `LSTMCellState` objects, and don't try to apply dropout to
+        `TensorArray` objects:
+        ```
+        def dropout_state_filter_visitor(s):
+          if isinstance(s, LSTMCellState):
+            # Never perform dropout on the c state.
+            return LSTMCellState(c=False, h=True)
+          elif isinstance(s, TensorArray):
+            return False
+          return True
+        ```
+
+    Raises:
+      TypeError: if `cell` is not an `RNNCell`, or `keep_state_fn` is provided
+        but not `callable`.
+      ValueError: if any of the keep_probs are not between 0 and 1.
+    """
+
+    with base_layer.keras_style_scope():
+      super(DropoutWrapperV2, self).__init__(
+          cell=cell,
+          input_keep_prob=input_keep_prob,
+          output_keep_prob=output_keep_prob,
+          state_keep_prob=state_keep_prob,
+          variational_recurrent=variational_recurrent,
+          input_size=input_size,
+          dtype=dtype,
+          seed=seed,
+          dropout_state_filter_visitor=dropout_state_filter_visitor)
+
+
+@tf_export(v1=["nn.rnn_cell.ResidualWrapper"])
+class ResidualWrapper(_RNNCellWrapperV1):
   """RNNCell wrapper that ensures cell inputs are added to the outputs."""
 
   def __init__(self, cell, residual_fn=None):
@@ -1312,10 +1529,7 @@ class ResidualWrapper(RNNCell):
         Defaults to calling nest.map_structure on (lambda i, o: i + o), inputs
         and outputs.
     """
-    super(ResidualWrapper, self).__init__()
-    self._cell = cell
-    if isinstance(cell, checkpointable.CheckpointableBase):
-      self._track_checkpointable(self._cell, name="cell")
+    super(ResidualWrapper, self).__init__(cell)
     self._residual_fn = residual_fn
 
   @property
@@ -1330,13 +1544,15 @@ class ResidualWrapper(RNNCell):
     with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]):
       return self._cell.zero_state(batch_size, dtype)
 
-  def __call__(self, inputs, state, scope=None):
+  def _call_wrapped_cell(self, inputs, state, cell_call_fn, **kwargs):
     """Run the cell and then apply the residual_fn on its inputs to its outputs.
 
     Args:
       inputs: cell inputs.
       state: cell state.
-      scope: optional cell scope.
+      cell_call_fn: Wrapped cell's method to use for step computation (cell's
+        `__call__` or 'call' method).
+      **kwargs: Additional arguments passed to the wrapped cell's `call`.
 
     Returns:
       Tuple of cell outputs and new state.
@@ -1345,7 +1561,7 @@ class ResidualWrapper(RNNCell):
       TypeError: If cell inputs and outputs have different structure (type).
       ValueError: If cell inputs and outputs have different structure (value).
     """
-    outputs, new_state = self._cell(inputs, state, scope=scope)
+    outputs, new_state = cell_call_fn(inputs, state, **kwargs)
     # Ensure shapes match
     def assert_shape_match(inp, out):
       inp.get_shape().assert_is_compatible_with(out.get_shape())
@@ -1357,6 +1573,29 @@ class ResidualWrapper(RNNCell):
     return (res_outputs, new_state)
 
 
+@tf_export("rnn.ResidualWrapper", v1=[])
+class ResidualWrapperV2(_RNNCellWrapperV2, ResidualWrapper):
+  """RNNCell wrapper that ensures cell inputs are added to the outputs."""
+
+  def __init__(self, cell, residual_fn=None):
+    """Constructs a `ResidualWrapperV2` for `cell`.
+
+    Runs initialization in Keras style scope to use Keras-style variable
+    management.
+
+    Args:
+      cell: An instance of `LayerRNNCell`.
+      residual_fn: (Optional) The function to map raw cell inputs and raw cell
+        outputs to the actual cell outputs of the residual network.
+        Defaults to calling nest.map_structure on (lambda i, o: i + o), inputs
+        and outputs.
+    """
+
+    with base_layer.keras_style_scope():
+      super(ResidualWrapperV2, self).__init__(
+          cell=cell, residual_fn=residual_fn)
+
+
 @tf_export("nn.rnn_cell.DeviceWrapper")
 class DeviceWrapper(RNNCell):
   """Operator that ensures an RNNCell runs on a particular device."""
@@ -1372,8 +1611,8 @@ class DeviceWrapper(RNNCell):
     """
     super(DeviceWrapper, self).__init__()
     self._cell = cell
-    if isinstance(cell, checkpointable.CheckpointableBase):
-      self._track_checkpointable(self._cell, name="cell")
+    if isinstance(cell, trackable.Trackable):
+      self._track_trackable(self._cell, name="cell")
     self._device = device
 
   @property
@@ -1439,11 +1678,11 @@ class MultiRNNCell(RNNCell):
 
     self._cells = cells
     for cell_number, cell in enumerate(self._cells):
-      # Add Checkpointable dependencies on these cells so their variables get
+      # Add Trackable dependencies on these cells so their variables get
       # saved with this object when using object-based saving.
-      if isinstance(cell, checkpointable.CheckpointableBase):
-        # TODO(allenl): Track down non-Checkpointable callers.
-        self._track_checkpointable(cell, name="cell-%d" % (cell_number,))
+      if isinstance(cell, trackable.Trackable):
+        # TODO(allenl): Track down non-Trackable callers.
+        self._track_trackable(cell, name="cell-%d" % (cell_number,))
     self._state_is_tuple = state_is_tuple
     if not state_is_tuple:
       if any(nest.is_sequence(c.state_size) for c in self._cells):
@@ -1519,3 +1758,31 @@ class MultiRNNCell(RNNCell):
                   array_ops.concat(new_states, 1))
 
     return cur_inp, new_states
+
+
+def _check_rnn_cell_input_dtypes(inputs):
+  """Check whether the input tensors are with supported dtypes.
+
+  Default RNN cells only support floats and complex as its dtypes since the
+  activation function (tanh and sigmoid) only allow those types. This function
+  will throw a proper error message if the inputs is not in a supported type.
+
+  Args:
+    inputs: tensor or nested structure of tensors that are feed to RNN cell as
+      input or state.
+
+  Raises:
+    ValueError: if any of the input tensor are not having dtypes of float or
+      complex.
+  """
+  for t in nest.flatten(inputs):
+    _check_supported_dtypes(t.dtype)
+
+
+def _check_supported_dtypes(dtype):
+  if dtype is None:
+    return
+  dtype = dtypes.as_dtype(dtype)
+  if not (dtype.is_floating or dtype.is_complex):
+    raise ValueError("RNN cell only supports floating point inputs, "
+                     "but saw dtype: %s" % dtype)
diff --git a/tensorflow/python/ops/script_ops.py b/tensorflow/python/ops/script_ops.py
index a5b31aff91660a6ac79c980dffb543e87fd40dfa..63b5eab56a3a6dc434ef03c3477945a9860bef65 100644
--- a/tensorflow/python/ops/script_ops.py
+++ b/tensorflow/python/ops/script_ops.py
@@ -391,12 +391,16 @@ def eager_py_func(func, inp, Tout, name=None):
 
 @deprecation.deprecated(
     date=None,
-    instructions="""tf.py_func is deprecated in TF V2. Instead, use
-    tf.py_function, which takes a python function which manipulates tf eager
+    instructions="""tf.py_func is deprecated in TF V2. Instead, there are two
+    options available in V2.
+    - tf.py_function takes a python function which manipulates tf eager
     tensors instead of numpy arrays. It's easy to convert a tf eager tensor to
     an ndarray (just call tensor.numpy()) but having access to eager tensors
     means `tf.py_function`s can use accelerators such as GPUs as well as
     being differentiable using a gradient tape.
+    - tf.numpy_function maintains the semantics of the deprecated tf.py_func
+    (it is not differentiable, and manipulates numpy arrays). It drops the
+    stateful argument making all functions stateful.
     """)
 @tf_export(v1=["py_func"])
 def py_func(func, inp, Tout, stateful=True, name=None):
@@ -467,6 +471,13 @@ def py_func(func, inp, Tout, stateful=True, name=None):
   return _internal_py_func(
       func=func, inp=inp, Tout=Tout, stateful=stateful, eager=False, name=name)
 
+@tf_export("numpy_function", v1=[])
+def numpy_function(func, inp, Tout, name=None):
+  return py_func(func, inp, Tout, stateful=True, name=name)
+
+numpy_function.__doc__ = py_func.__doc__.replace(
+    "py_func", "numpy_function")
+
 
 ops.NotDifferentiable("PyFunc")
 ops.NotDifferentiable("PyFuncStateless")
diff --git a/tensorflow/python/ops/sparse_grad.py b/tensorflow/python/ops/sparse_grad.py
index 2ca9c0c647d14b792b2575c8f977d9dbe39efb4b..bef0a8ea4eda3bc3a7d79b275fccf7fbfb1fc3af 100644
--- a/tensorflow/python/ops/sparse_grad.py
+++ b/tensorflow/python/ops/sparse_grad.py
@@ -278,7 +278,7 @@ def _SparseSoftmaxGrad(op, grad):
       indices, sp_output.values * sp_grad.values, shape)
 
   # [..., B, 1], dense.
-  sum_reduced = -sparse_ops.sparse_reduce_sum(sp_product, [-1], keep_dims=True)
+  sum_reduced = -sparse_ops.sparse_reduce_sum(sp_product, [-1], keepdims=True)
   # sparse [..., B, C] + dense [..., B, 1] with broadcast; outputs sparse.
   sp_sum = sparse_ops.sparse_dense_cwise_add(sp_grad, sum_reduced)
 
diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py
index 097b485a115fb8153f77d0ad24c63b872fb2e8ca..db05eb13154a4158f0dc3bf7f0876963bf02742c 100644
--- a/tensorflow/python/ops/sparse_ops.py
+++ b/tensorflow/python/ops/sparse_ops.py
@@ -197,7 +197,8 @@ def sparse_concat(axis,
                   sp_inputs,
                   name=None,
                   expand_nonconcat_dim=False,
-                  concat_dim=None):
+                  concat_dim=None,
+                  expand_nonconcat_dims=None):
   """Concatenates a list of `SparseTensor` along the specified dimension.
 
   Concatenation is with respect to the dense versions of each sparse input.
@@ -286,6 +287,7 @@ def sparse_concat(axis,
     expand_nonconcat_dim: Whether to allow the expansion in the non-concat
       dimensions. Defaulted to False.
     concat_dim: The old (deprecated) name for axis.
+    expand_nonconcat_dims: alias for expand_nonconcat_dim
 
   Returns:
     A `SparseTensor` with the concatenated output.
@@ -293,6 +295,11 @@ def sparse_concat(axis,
   Raises:
     TypeError: If `sp_inputs` is not a list of `SparseTensor`.
   """
+  expand_nonconcat_dim = deprecation.deprecated_argument_lookup(
+      "expand_nonconcat_dims", expand_nonconcat_dims,
+      "expand_nonconcat_dim", expand_nonconcat_dim)
+  if expand_nonconcat_dims is not None:
+    expand_nonconcat_dim = expand_nonconcat_dims
   axis = deprecation.deprecated_argument_lookup("axis", axis, "concat_dim",
                                                 concat_dim)
   return sparse_concat_v2(axis, sp_inputs, expand_nonconcat_dim, name)
@@ -806,8 +813,8 @@ def sparse_split(keyword_required=KeywordRequired(),
   Graphically the output tensors are:
 
       output_tensor[0] =
-      [    a ]
-      [b c   ]
+      [    a   ]
+      [b c     ]
 
       output_tensor[1] =
       [ d e  ]
@@ -1774,7 +1781,9 @@ def sparse_reset_shape(sp_input, new_shape=None):
     output_shape_tensor = math_ops.cast(output_shape_tensor, dtypes.int64)
     # For cases when shape is known during graph construction, this catches the
     # error before the sparse_tensor.SparseTensor catches it.
-    output_shape_tensor.get_shape()[0].merge_with(in_shape.get_shape()[0])
+    if output_shape_tensor.get_shape().rank is not None:
+      output_shape_tensor.get_shape().dims[0].merge_with(
+          in_shape.get_shape().dims[0])
 
     output_shape_tensor_const = tensor_util.constant_value(output_shape_tensor)
     # For cases where all shapes are known during graph construction
diff --git a/tensorflow/python/ops/special_math_ops_test.py b/tensorflow/python/ops/special_math_ops_test.py
index 94aaebed951a96a4aade8d05d36b3366e59708a5..41ba060a4b666ceb7ecdf04431b14dce48cfba4f 100644
--- a/tensorflow/python/ops/special_math_ops_test.py
+++ b/tensorflow/python/ops/special_math_ops_test.py
@@ -126,7 +126,9 @@ class LBetaTest(test.TestCase):
     x_b = [0.1]
     with self.session(use_gpu=True):
       self.assertAllClose(
-          1, self.evaluate(math_ops.exp(special_math_ops.lbeta(x_a))))
+          1,
+          self.evaluate(math_ops.exp(special_math_ops.lbeta(x_a))),
+          rtol=3e-6)
       self.assertAllClose(
           1, self.evaluate(math_ops.exp(special_math_ops.lbeta(x_b))))
       self.assertEqual((), special_math_ops.lbeta(x_a).get_shape())
diff --git a/tensorflow/python/ops/standard_ops.py b/tensorflow/python/ops/standard_ops.py
index c614d072badbdf7927d6c889288e1cf4e8d988ef..5e217d8ed2f3bbe427c144700e485d1be339545f 100644
--- a/tensorflow/python/ops/standard_ops.py
+++ b/tensorflow/python/ops/standard_ops.py
@@ -22,6 +22,8 @@ from __future__ import print_function
 
 import sys as _sys
 
+from tensorflow.python import autograph
+
 # pylint: disable=g-bad-import-order
 # Imports the following modules so that @RegisterGradient get executed.
 from tensorflow.python.ops import array_grad
@@ -52,6 +54,7 @@ from tensorflow.python.ops.control_flow_ops import tuple  # pylint: disable=rede
 # pylint: enable=redefined-builtin
 from tensorflow.python.eager import wrap_function
 from tensorflow.python.ops.control_flow_ops import while_loop
+from tensorflow.python.ops.critical_section_ops import *
 from tensorflow.python.ops.data_flow_ops import *
 from tensorflow.python.ops.functional_ops import *
 from tensorflow.python.ops.gradients import *
@@ -69,6 +72,8 @@ from tensorflow.python.ops.math_ops import *
 from tensorflow.python.ops.numerics import *
 from tensorflow.python.ops.parsing_ops import *
 from tensorflow.python.ops.partitioned_variables import *
+from tensorflow.python.ops.ragged import ragged_dispatch as _ragged_dispatch
+from tensorflow.python.ops.ragged import ragged_operators as _ragged_operators
 from tensorflow.python.ops.random_ops import *
 from tensorflow.python.ops.script_ops import py_func
 from tensorflow.python.ops.session_ops import *
@@ -100,3 +105,7 @@ from tensorflow.python.ops.variable_scope import *
 from tensorflow.python.ops.variables import *
 # pylint: enable=wildcard-import
 # pylint: enable=g-bad-import-order
+
+
+# These modules were imported to set up RaggedTensor operators and dispatchers:
+del _ragged_dispatch, _ragged_operators
diff --git a/tensorflow/python/ops/state_ops.py b/tensorflow/python/ops/state_ops.py
index 3ac69c1c202d71b91e42f0f4a5bdd80c881ef97d..be21263f4cbdbdd4a38b0e849e1fec15ba033712 100644
--- a/tensorflow/python/ops/state_ops.py
+++ b/tensorflow/python/ops/state_ops.py
@@ -33,6 +33,7 @@ from tensorflow.python.ops import gen_state_ops
 from tensorflow.python.ops.gen_state_ops import *
 # pylint: enable=wildcard-import
 from tensorflow.python.util import deprecation
+from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -224,6 +225,7 @@ def assign(ref, value, validate_shape=None, use_locking=None, name=None):
 
 
 @tf_export(v1=["count_up_to"])
+@deprecated(None, "Prefer Dataset.range instead.")
 def count_up_to(ref, limit, name=None):
   r"""Increments 'ref' until it reaches 'limit'.
 
@@ -430,19 +432,19 @@ def scatter_nd_add(ref, indices, updates, use_locking=False, name=None):
   `updates` is `Tensor` of rank `Q-1+P-K` with shape:
 
   ```
-  [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+  [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]]
   ```
 
   For example, say we want to add 4 scattered elements to a rank-1 tensor to
-  8 elements. In Python, that update would look like this:
+  8 elements. In Python, that addition would look like this:
 
   ```python
-      ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
-      indices = tf.constant([[4], [3], [1] ,[7]])
-      updates = tf.constant([9, 10, 11, 12])
-      add = tf.scatter_nd_add(ref, indices, updates)
-      with tf.Session() as sess:
-        print sess.run(add)
+  ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+  indices = tf.constant([[4], [3], [1], [7]])
+  updates = tf.constant([9, 10, 11, 12])
+  add = tf.scatter_nd_add(ref, indices, updates)
+  with tf.Session() as sess:
+    print sess.run(add)
   ```
 
   The resulting update to ref would look like this:
@@ -462,9 +464,8 @@ def scatter_nd_add(ref, indices, updates, use_locking=False, name=None):
     updates: A `Tensor`. Must have the same type as `ref`.
       A tensor of updated values to add to ref.
     use_locking: An optional `bool`. Defaults to `False`.
-      An optional bool. Defaults to True. If True, the assignment will
-      be protected by a lock; otherwise the behavior is undefined,
-      but may exhibit less contention.
+      If True, the assignment will be protected by a lock;
+      otherwise the behavior is undefined, but may exhibit less contention.
     name: A name for the operation (optional).
 
   Returns:
@@ -548,19 +549,19 @@ def scatter_nd_sub(ref, indices, updates, use_locking=False, name=None):
   `updates` is `Tensor` of rank `Q-1+P-K` with shape:
 
   ```
-  [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+  [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]]
   ```
 
   For example, say we want to subtract 4 scattered elements from a rank-1 tensor
-  to 8 elements. In Python, that update would look like this:
+  with 8 elements. In Python, that update would look like this:
 
   ```python
-      ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
-      indices = tf.constant([[4], [3], [1] ,[7]])
-      updates = tf.constant([9, 10, 11, 12])
-      op = tf.scatter_nd_sub(ref, indices, updates)
-      with tf.Session() as sess:
-        print sess.run(op)
+  ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+  indices = tf.constant([[4], [3], [1] ,[7]])
+  updates = tf.constant([9, 10, 11, 12])
+  op = tf.scatter_nd_sub(ref, indices, updates)
+  with tf.Session() as sess:
+    print sess.run(op)
   ```
 
   The resulting update to ref would look like this:
diff --git a/tensorflow/python/ops/stateful_random_ops.py b/tensorflow/python/ops/stateful_random_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..91ca25254bc28b865c29b42fd5fa8082a9f24241
--- /dev/null
+++ b/tensorflow/python/ops/stateful_random_ops.py
@@ -0,0 +1,260 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Operations for generating random numbers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import sys
+
+import numpy as np
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import gen_stateful_random_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.training.tracking import \
+tracking
+from tensorflow.python.util.tf_export import tf_export
+
+# A seed for random ops (stateful and stateless) will always be 1024
+# bits, all of which will be sent to the C++ code. The actual C++
+# implementation of some algorithms may only use a lower part of the bits.
+
+MAX_INT64 = 2**63 - 1
+MIN_INT64 = -(2**63)
+UINT64_SPAN = 2**64
+# 'Variable' doesn't support uint32 or uint64 yet (due to reasons explained in
+# b/111604096 and cl/171681867), so I use signed int here. I choose int64
+# instead of int32 here because `VarHandleOp` doesn't support int32 on GPU.
+SEED_TYPE = "int64"
+SEED_MIN = MIN_INT64
+SEED_MAX = MAX_INT64
+SEED_UINT_SPAN = UINT64_SPAN
+SEED_TYPE_BITS = 64
+SEED_BIT_MASK = 0xFFFFFFFFFFFFFFFF
+SEED_SIZE = 16  # in units of SEED_TYPE
+
+
+STATE_TYPE = SEED_TYPE
+ALGORITHM_TYPE = STATE_TYPE
+RNG_ALG_PHILOX = 1
+DEFAULT_ALGORITHM = RNG_ALG_PHILOX
+
+
+def non_deterministic_seed():
+  """Makes a non-deterministic seed.
+
+  The implementation will be changed soon from pure Python to an op.
+
+  Returns:
+    a 1-D tensor.
+  """
+  return np.random.randint(
+      low=SEED_MIN, high=SEED_MAX + 1, size=SEED_SIZE,
+      dtype=SEED_TYPE)
+
+
+def _uint_to_int(n):
+  if n > SEED_MAX:
+    n = n - SEED_UINT_SPAN
+  return n
+
+
+PHILOX_STATE_SIZE = 3
+
+
+def _make_philox_state(seed):
+  """Makes a RNG state for Philox algorithm.
+
+  Args:
+    seed: an integer or 1-D tensor.
+
+  Returns:
+    a 1-D tensor.
+  """
+  int_types = (int,) if sys.version_info >= (3, 0) else (int, long)
+  if isinstance(seed, int_types):
+    # chop the Python integer (infinite precision) into chunks of SEED_TYPE
+    ls = []
+    for _ in range(PHILOX_STATE_SIZE):
+      ls.append(seed & SEED_BIT_MASK)
+      seed >>= SEED_TYPE_BITS
+    seed = ls
+  # to avoid overflow error from np.asarray
+  seed = list(map(_uint_to_int, seed))
+  seed = np.asarray(seed, dtype=STATE_TYPE)
+  if len(seed.shape) != 1:
+    raise ValueError(
+        "seed should only have one dimension; got shape: %s" % seed.shape)
+  seed = seed[0:PHILOX_STATE_SIZE]
+  # Padding with zeros on the right if too short
+  seed_size = seed.shape[0]
+  if seed_size < PHILOX_STATE_SIZE:
+    seed = np.pad(
+        seed, [(0, PHILOX_STATE_SIZE - seed_size)],
+        mode="constant",
+        constant_values=0)
+  assert seed.shape == (PHILOX_STATE_SIZE,), "Wrong seed.shape: %s" % seed.shape
+  return seed
+
+
+def _make_state_from_seed(seed, algorithm):
+  if algorithm == RNG_ALG_PHILOX:
+    return _make_philox_state(seed)
+  else:
+    raise ValueError("Unsupported algorithm id: %s" % algorithm)
+
+
+@tf_export("random.create_rng_state")
+def create_rng_state(seed, algorithm):
+  """Creates a RNG state.
+
+  Args:
+    seed: an integer or 1-D tensor.
+    algorithm: an integer representing the RNG algorithm.
+
+  Returns:
+    a 1-D tensor whose size depends on the algorithm.
+  """
+  return _make_state_from_seed(seed, algorithm)
+
+
+def _shape_tensor(shape):
+  """Convert to an int32 or int64 tensor, defaulting to int64 if empty."""
+  if isinstance(shape, (tuple, list)) and not shape:
+    dtype = dtypes.int64
+  else:
+    dtype = None
+  return ops.convert_to_tensor(shape, dtype=dtype, name="shape")
+
+
+@tf_export("random.experimental.Generator")
+class Generator(tracking.AutoTrackable):
+  """Random-number generator.
+
+  It uses Variable to manage its internal state.
+  """
+
+  def __init__(self, copy_from=None, seed=None, algorithm=None):
+    """Creates a generator.
+
+    Args:
+      copy_from: (optional) a generator to be copied from.
+      seed: (optional) the seed for the RNG. If None, it will be chosen
+            nondeterministically
+      algorithm: (optional) the RNG algorithm. If None, it will be
+                 auto-selected.
+    """
+    if copy_from is None:
+      if seed is None:
+        seed = non_deterministic_seed()
+      if algorithm is None:
+        # TODO(wangpeng): more sophisticated algorithm selection
+        algorithm = DEFAULT_ALGORITHM
+      state = create_rng_state(seed, algorithm)
+      self._state_var = variables.Variable(state, dtype=STATE_TYPE)
+      self._alg_var = variables.Variable(initial_value=algorithm,
+                                         dtype=ALGORITHM_TYPE)
+    else:
+      assert seed is None
+      self._state_var = variables.Variable(copy_from.state, dtype=STATE_TYPE)
+      self._alg_var = variables.Variable(initial_value=copy_from.algorithm,
+                                         dtype=ALGORITHM_TYPE)
+
+  def reset(self, seed):
+    """Resets the generator.
+
+    This function is not thread-safe: if it is run concurrently with a call to
+    sampling, the latter might see the new algorithm but the old state or vice
+    versa.
+
+    Args:
+      seed: the seed to reset the RNG to.
+    """
+    algorithm = int(self.algorithm)
+    state = create_rng_state(seed, algorithm)
+    self._state_var.assign(state)
+
+  @property
+  def state(self):
+    return self._state_var
+
+  @property
+  def algorithm(self):
+    return self._alg_var
+
+  # The following functions return a tensor and as a side effect update
+  # self._state_var.
+  def standard_normal(self, shape, dtype=dtypes.float32):
+    output = gen_stateful_random_ops.stateful_standard_normal_v2(
+        self.state.handle, self.algorithm, shape, dtype)
+    return output
+
+  def normal(self, shape, mean=0.0, stddev=1.0, dtype=dtypes.float32,
+             name=None):
+    with ops.name_scope(name, "stateful_normal", [shape, mean, stddev]) as name:
+      shape = _shape_tensor(shape)
+      mean = ops.convert_to_tensor(mean, dtype=dtype, name="mean")
+      stddev = ops.convert_to_tensor(stddev, dtype=dtype, name="stddev")
+      rnd = self.standard_normal(shape, dtype)
+      return math_ops.add(rnd * stddev, mean, name=name)
+
+  # TODO(wangpeng): implement other distributions (`uniform`,
+  #   `truncated_normal`, etc.)
+  # TODO(wangpeng): implement `make_seeds`
+  # TODO(wangpeng): implement `make_generators`
+
+
+# It's not safe to create TF ops before `init_google` is called, so this is
+# initialized to None and get a value the first time `get_global_generator` is
+# called.
+global_generator = None
+
+
+@tf_export("random.experimental.get_global_generator")
+def get_global_generator():
+  global global_generator
+  if global_generator is None:
+    global_generator = Generator()
+  return global_generator
+
+
+@tf_export("random.experimental.set_global_generator")
+def set_global_generator(generator):
+  global global_generator
+  global_generator = generator
+
+
+# This function creates a new Generator object (and the Variable object within),
+# which does not work well with tf.function because (1) tf.function puts
+# restrictions on Variable creation thus reset_global_generator can't be freely
+# used inside tf.function; (2) redirecting a global variable to
+# a new object is problematic with tf.function because the old object may be
+# captured by a 'tf.function'ed function and still be used by it.
+# A 'tf.function'ed function only keeps weak references to variables,
+# so deleting a variable and then calling that function again may raise an
+# error, as demonstrated by
+# random_test.py/RandomTest.testResetGlobalGeneratorBadWithDefun .
+# The function 'set_global_generator' below also has this problem.
+@tf_export("random.experimental.reset_global_generator")
+def reset_global_generator(seed, algorithm=None):
+  global global_generator
+  if algorithm is None:
+    # preserve the old algorithm
+    algorithm = int(get_global_generator().algorithm)
+  global_generator = Generator(seed=seed, algorithm=algorithm)
diff --git a/tensorflow/python/ops/stateful_random_ops_test.py b/tensorflow/python/ops/stateful_random_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0436736e004ee7882a9b1835ad4076128a002548
--- /dev/null
+++ b/tensorflow/python/ops/stateful_random_ops_test.py
@@ -0,0 +1,214 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for stateful_random_ops.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.eager import def_function
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import gen_random_ops
+from tensorflow.python.ops import gen_stateful_random_ops
+from tensorflow.python.ops import logging_ops
+from tensorflow.python.ops import stateful_random_ops as \
+random
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class StatefulRandomOpsTest(test.TestCase):
+
+  def testCreateRNGStateIntSeed(self):
+    """Tests `create_rng_state` when `seed` is int."""
+    # using leading 'F' to test overflow tolerance
+    state = random.create_rng_state(0xFFFF222233334444FFAA666677778888,
+                                    random.RNG_ALG_PHILOX)
+    self.assertAllEqual(
+        list(map(random._uint_to_int,
+                 [0xFFAA666677778888, 0xFFFF222233334444] +
+                 [0] * (random.PHILOX_STATE_SIZE - 2))),
+        state)
+
+  @test_util.run_v2_only
+  @test_util.also_run_as_tf_function
+  def testEagerAndDefun(self):
+    """A simple test to make sure the op works in eager and defunned mode."""
+    random.get_global_generator().normal((3,))
+
+  @test_util.run_v2_only
+  def testOpSeedSelectionAfterSetSeed(self):
+    """Tests that op-seed selection is reset after reseting global generator.
+
+    Fixing GitHub issue 9171:
+    https://github.com/tensorflow/tensorflow/issues/9171
+    """
+    shape = (3,)
+    random.get_global_generator().reset(1)
+    a = random.get_global_generator().normal(shape)
+    random.get_global_generator().reset(1)
+    b = random.get_global_generator().normal(shape)
+    self.assertAllEqual(a, b)
+
+    # Now do the above again using accelerated ('defun'ed) computation
+    @def_function.function
+    def f():
+      return random.get_global_generator().normal(shape)
+
+    random.get_global_generator().reset(1)
+    c = f()
+    random.get_global_generator().reset(1)
+    d = f()
+    self.assertAllEqual(c, d)
+    self.assertAllEqual(a, c)
+
+  @test_util.run_v2_only
+  def testOpSeedSelectionNotSensitive(self):
+    """Test that op-seed selection is not sensitive to trivial changes.
+
+    Test that op-seed selection is not sensitive to trivial computation
+    (i.e. graph) changes.
+
+    Fixing b/32087099
+    """
+    def f(include_print):
+      shape = constant_op.constant([5])
+      if include_print:
+        shape = logging_ops.Print(shape, [shape])
+      return random.get_global_generator().normal(shape)
+
+    def compare(fst_includes_print, snd_includes_print):
+      random.get_global_generator().reset(50)
+      fst = f(fst_includes_print)
+      random.get_global_generator().reset(50)
+      snd = f(snd_includes_print)
+      self.assertAllEqual(fst, snd)
+      # Now do the above again using accelerated (defunned) 'f'.
+      # Running 'f' with two different Boolean arguments should cause
+      # two different graphs to be generated, hence demonstrating the
+      # insensitivity to graph changes.
+      f_acc = def_function.function(f)
+      random.get_global_generator().reset(50)
+      fst = f_acc(fst_includes_print)
+      random.get_global_generator().reset(50)
+      snd = f_acc(snd_includes_print)
+      self.assertAllEqual(fst, snd)
+
+    compare(False, False)
+    compare(True, True)
+    compare(True, False)
+
+  @test_util.run_v2_only
+  def testCPUSameAsOldRandomOps(self):
+    """Tests that the generated numbers are the same as the old random_ops.py.
+
+    The CPU version.
+    """
+    seed1, seed2 = 79, 25
+    # note how the two seeds for the old op correspond to the seed for the new
+    # op
+    with ops.device("/device:CPU:0"):
+      random.reset_global_generator([0, seed2, seed1])
+    shape = constant_op.constant([4, 7])
+    dtype = dtypes.float64
+
+    # create a graph for the old op in order to call it many times
+    @def_function.function
+    def old():
+      with ops.device("/device:CPU:0"):
+        return gen_random_ops.random_standard_normal(
+            shape, dtype=dtype, seed=seed1, seed2=seed2)
+
+    def new():
+      with ops.device("/device:CPU:0"):
+        return random.get_global_generator().standard_normal(shape, dtype=dtype)
+
+    for _ in range(100):
+      self.assertAllEqual(old(), new())
+
+  @test_util.run_v2_only
+  @test_util.run_cuda_only
+  def testGPUSameAsOldRandomOps(self):
+    """Tests that the generated numbers are the same as the old random_ops.py.
+
+    The GPU version.
+    """
+    seed1, seed2 = 79, 25
+    with ops.device(test_util.gpu_device_name()):
+      random.reset_global_generator([0, seed2, seed1])
+    shape = constant_op.constant([4, 7])
+    dtype = dtypes.float64
+
+    @def_function.function
+    def old():
+      with ops.device(test_util.gpu_device_name()):
+        return gen_random_ops.random_standard_normal(
+            shape, dtype=dtype, seed=seed1, seed2=seed2)
+
+    def new():
+      with ops.device(test_util.gpu_device_name()):
+        return random.get_global_generator().standard_normal(shape, dtype=dtype)
+
+    for _ in range(100):
+      self.assertAllEqual(old(), new())
+
+  @test_util.run_v2_only
+  def testStatefulStandardNormal(self):
+    """Tests that op 'StatefulStandardNormal' still works.
+    """
+    shape = constant_op.constant([4, 7])
+    dtype = dtypes.float64
+    seed = 1234
+    algorithm = random.RNG_ALG_PHILOX
+    state = random._make_state_from_seed(seed, algorithm)
+    with ops.device("/device:CPU:0"):
+      var1 = variables.Variable(
+          np.concatenate((np.array([algorithm], dtype=random.STATE_TYPE),
+                          state), axis=None),
+          dtype=random.STATE_TYPE)
+      var2 = variables.Variable(state, dtype=random.STATE_TYPE)
+      for _ in range(100):
+        t1 = gen_stateful_random_ops.stateful_standard_normal(
+            var1.handle, shape, dtype)
+        t2 = gen_stateful_random_ops.stateful_standard_normal_v2(
+            var2.handle, algorithm, shape, dtype)
+        self.assertAllEqual(t1, t2)
+
+  @test_util.run_v2_only
+  def testResetGlobalGeneratorBadWithDefun(self):
+    """Demonstrates that reset_global_generator don't work properly with defun.
+    """
+    shape = (3,)
+
+    @def_function.function
+    def f():
+      return random.get_global_generator().normal(shape)
+
+    random.reset_global_generator(50)
+    with self.assertRaisesWithPredicateMatch(
+        AssertionError, "variable.*deleted"):
+      a = f()
+      random.reset_global_generator(50)
+      b = f()
+      self.assertAllEqual(a, b)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/ops/string_ops.py b/tensorflow/python/ops/string_ops.py
index 046459706c0881bd9a3cbd68e4d5553d0547947c..f75634d856e8331069faa8f4d348f2ded3823467 100644
--- a/tensorflow/python/ops/string_ops.py
+++ b/tensorflow/python/ops/string_ops.py
@@ -321,7 +321,10 @@ def reduce_join(inputs, axis=None,  # pylint: disable=missing-docstring
                 keep_dims=False,
                 separator="",
                 name=None,
-                reduction_indices=None):
+                reduction_indices=None,
+                keepdims=None):
+  keep_dims = deprecation.deprecated_argument_lookup(
+      "keepdims", keepdims, "keep_dims", keep_dims)
   inputs_t = ops.convert_to_tensor(inputs)
   reduction_indices = _reduce_join_reduction_dims(
       inputs_t, axis, reduction_indices)
@@ -367,7 +370,7 @@ def string_length_v2(input, unit="BYTE", name=None):
 string_length.__doc__ = gen_string_ops.string_length.__doc__
 
 
-@tf_export("substr")
+@tf_export(v1=["substr"])
 @deprecation.deprecated(None, "Use `tf.strings.substr` instead of `tf.substr`.")
 def substr_deprecated(input, pos, len, name=None, unit="BYTE"):
   return substr(input, pos, len, name=name, unit=unit)
@@ -380,14 +383,15 @@ substr_deprecated.__doc__ = gen_string_ops.substr.__doc__
 def substr(input, pos, len, name=None, unit="BYTE"):
   return gen_string_ops.substr(input, pos, len, unit=unit, name=name)
 
+substr.__doc__ = gen_string_ops.substr.__doc__
+
 
 @tf_export("strings.substr", v1=[])
 @dispatch.add_dispatch_support
 def substr_v2(input, pos, len, unit="BYTE", name=None):
-  return substr(input, pos, len, name=name, unit=unit)
-
+  return gen_string_ops.substr(input, pos, len, unit=unit, name=name)
 
-substr.__doc__ = gen_string_ops.substr.__doc__
+substr_v2.__doc__ = gen_string_ops.substr.__doc__
 
 
 ops.NotDifferentiable("RegexReplace")
@@ -421,9 +425,19 @@ def string_to_number(input, out_type=dtypes.float32, name=None):
     A `Tensor` of type `out_type`.
   """
   return gen_parsing_ops.string_to_number(input, out_type, name)
-tf_export(v1=["strings.to_number", "string_to_number"])(
-    gen_parsing_ops.string_to_number
-    )
+
+
+@tf_export(v1=["strings.to_number", "string_to_number"])
+def string_to_number_v1(
+    string_tensor=None,
+    out_type=dtypes.float32,
+    name=None,
+    input=None):
+  string_tensor = deprecation.deprecated_argument_lookup(
+      "input", input, "string_tensor", string_tensor)
+  return gen_parsing_ops.string_to_number(string_tensor, out_type, name)
+
+string_to_number_v1.__doc__ = gen_parsing_ops.string_to_number.__doc__
 
 
 @tf_export("strings.to_hash_bucket", v1=[])
@@ -449,6 +463,16 @@ def string_to_hash_bucket(input, num_buckets, name=None):
   """
   # pylint: enable=line-too-long
   return gen_string_ops.string_to_hash_bucket(input, num_buckets, name)
-tf_export(v1=["strings.to_hash_bucket", "string_to_hash_bucket"])(
-    gen_string_ops.string_to_hash_bucket
-    )
+
+
+@tf_export(v1=["strings.to_hash_bucket", "string_to_hash_bucket"])
+def string_to_hash_bucket_v1(
+    string_tensor=None,
+    num_buckets=None,
+    name=None,
+    input=None):
+  string_tensor = deprecation.deprecated_argument_lookup(
+      "input", input, "string_tensor", string_tensor)
+  return gen_string_ops.string_to_hash_bucket(string_tensor, num_buckets, name)
+
+string_to_hash_bucket_v1.__doc__ = gen_string_ops.string_to_hash_bucket.__doc__
diff --git a/tensorflow/python/ops/summary_ops_v2.py b/tensorflow/python/ops/summary_ops_v2.py
index 3f99b9f8773b3d26cf334044e0d127bf7443bfea..0f7e8cf12cf6d13b275f7bab513990c1460675bd 100644
--- a/tensorflow/python/ops/summary_ops_v2.py
+++ b/tensorflow/python/ops/summary_ops_v2.py
@@ -27,6 +27,8 @@ import time
 import six
 
 from tensorflow.core.framework import graph_pb2
+from tensorflow.core.framework import summary_pb2
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -44,12 +46,6 @@ from tensorflow.python.util import deprecation
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util.tf_export import tf_export
 
-
-# Dictionary mapping graph keys to a boolean Tensor (or callable returning
-# a boolean Tensor) indicating whether we should record summaries for the
-# graph identified by the key of the dictionary.
-_SHOULD_RECORD_SUMMARIES = {}
-
 # A global dictionary mapping graph keys to a list of summary writer init ops.
 _SUMMARY_WRITER_INIT_OP = {}
 
@@ -58,40 +54,51 @@ _RUN_NAME_PATTERNS = re.compile(r"^[^\x00-\x1F<>]{0,512}$")
 _USER_NAME_PATTERNS = re.compile(r"^[a-z]([-a-z0-9]{0,29}[a-z0-9])?$", re.I)
 
 
+def _should_record_summaries_internal():
+  """Returns boolean Tensor if summaries should/shouldn't be recorded, or None.
+  """
+  condition = context.context().recording_summaries
+  return condition() if callable(condition) else condition
+
+
+def _should_record_summaries_v2():
+  """Returns boolean Tensor which is true if summaries should be recorded.
+
+  If no recording status has been set, this defaults to True, unlike the public
+  should_record_summaries().
+  """
+  result = _should_record_summaries_internal()
+  return True if result is None else result
+
+
 def should_record_summaries():
   """Returns boolean Tensor which is true if summaries should be recorded."""
-  global _SHOULD_RECORD_SUMMARIES
-  key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
-  should = _SHOULD_RECORD_SUMMARIES.setdefault(key, False)
-  return should() if callable(should) else should
+  result = _should_record_summaries_internal()
+  return False if result is None else result
 
 
+@tf_export("summary.record_if", v1=[])
 @tf_contextlib.contextmanager
-def _record_summaries(boolean=True):
+def record_if(condition):
   """Sets summary recording on or off per the provided boolean value.
 
   The provided value can be a python boolean, a scalar boolean Tensor, or
   or a callable providing such a value; if a callable is passed it will be
-  invoked each time should_record_summaries() is called to determine whether
-  summary writing should be enabled.
+  invoked on-demand to determine whether summary writing will occur.
 
   Args:
-    boolean: can be True, False, a bool Tensor, or a callable providing such.
-      Defaults to True.
+    condition: can be True, False, a bool Tensor, or a callable providing such.
 
   Yields:
     Returns a context manager that sets this value on enter and restores the
     previous value on exit.
   """
-  # TODO(nickfelt): make this threadlocal
-  global _SHOULD_RECORD_SUMMARIES
-  key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
-  old = _SHOULD_RECORD_SUMMARIES.setdefault(key, False)
+  old = context.context().recording_summaries
   try:
-    _SHOULD_RECORD_SUMMARIES[key] = boolean
+    context.context().recording_summaries = condition
     yield
   finally:
-    _SHOULD_RECORD_SUMMARIES[key] = old
+    context.context().recording_summaries = old
 
 
 # TODO(apassos) consider how to handle local step here.
@@ -103,17 +110,17 @@ def record_summaries_every_n_global_steps(n, global_step=None):
     should = lambda: math_ops.equal(global_step % n, 0)
     if not context.executing_eagerly():
       should = should()
-  return _record_summaries(should)
+  return record_if(should)
 
 
 def always_record_summaries():
   """Sets the should_record_summaries Tensor to always true."""
-  return _record_summaries(True)
+  return record_if(True)
 
 
 def never_record_summaries():
   """Sets the should_record_summaries Tensor to always false."""
-  return _record_summaries(False)
+  return record_if(False)
 
 
 @tf_export("summary.SummaryWriter", v1=[])
@@ -178,16 +185,19 @@ class SummaryWriter(object):
       return self._close()
 
 
+@tf_export(v1=["summary.initialize"])
 def initialize(
     graph=None,  # pylint: disable=redefined-outer-name
     session=None):
   """Initializes summary writing for graph execution mode.
 
+  This operation is a no-op when executing eagerly.
+
   This helper method provides a higher-level alternative to using
   `tf.contrib.summary.summary_writer_initializer_op` and
   `tf.contrib.summary.graph`.
 
-  Most users will also want to call `tf.train.create_global_step`
+  Most users will also want to call `tf.compat.v1.train.create_global_step`
   which can happen before or after this function is called.
 
   Args:
@@ -370,6 +380,98 @@ def summary_writer_initializer_op():
   return _SUMMARY_WRITER_INIT_OP.setdefault(key, [])
 
 
+_INVALID_SCOPE_CHARACTERS = re.compile(r"[^-_/.A-Za-z0-9]")
+
+
+@tf_export("summary.summary_scope", v1=[])
+@tf_contextlib.contextmanager
+def summary_scope(name, default_name="summary", values=None):
+  """A context manager for use when defining a custom summary op.
+
+  This behaves similarly to `tf.name_scope`, except that it returns a generated
+  summary tag in addition to the scope name. The tag is structurally similar to
+  the scope name - derived from the user-provided name, prefixed with enclosing
+  name scopes if any - but we relax the constraint that it be uniquified, as
+  well as the character set limitation (so the user-provided name can contain
+  characters not legal for scope names; in the scope name these are removed).
+
+  This makes the summary tag more predictable and consistent for the user.
+
+  For example, to define a new summary op called `my_op`:
+
+  ```python
+  def my_op(name, my_value, step):
+    with tf.summary.summary_scope(name, "MyOp", [my_value]) as (tag, scope):
+      my_value = tf.convert_to_tensor(my_value)
+      return tf.summary.write(tag, my_value, step=step)
+  ```
+
+  Args:
+    name: string name for the summary.
+    default_name: Optional; if provided, used as default name of the summary.
+    values: Optional; passed as `values` parameter to name_scope.
+
+  Yields:
+    A tuple `(tag, scope)` as described above.
+  """
+  name = name or default_name
+  current_scope = ops.get_name_scope()
+  tag = current_scope + "/" + name if current_scope else name
+  # Strip illegal characters from the scope name, and if that leaves nothing,
+  # use None instead so we pick up the default name.
+  name = _INVALID_SCOPE_CHARACTERS.sub("", name) or None
+  with ops.name_scope(name, default_name, values) as scope:
+    yield tag, scope
+
+
+@tf_export("summary.write", v1=[])
+def write(tag, tensor, step, metadata=None, name=None):
+  """Writes a generic summary to the default SummaryWriter if one exists.
+
+  This exists primarily to support the definition of type-specific summary ops
+  like scalar() and image(), and is not intended for direct use unless defining
+  a new type-specific summary op.
+
+  Args:
+    tag: string tag used to identify the summary (e.g. in TensorBoard), usually
+      generated with `tf.summary.summary_scope`
+    tensor: the Tensor holding the summary data to write
+    step: `int64`-castable monotic step value for this summary
+    metadata: Optional SummaryMetadata, as a proto or serialized bytes
+    name: Optional string name for this op.
+
+  Returns:
+    True on success, or false if no summary was written because no default
+    summary writer was available.
+  """
+  with ops.name_scope(name, "write_summary") as scope:
+    if context.context().summary_writer_resource is None:
+      return constant_op.constant(False)
+    if metadata is None:
+      serialized_metadata = constant_op.constant(b"")
+    elif hasattr(metadata, "SerializeToString"):
+      serialized_metadata = constant_op.constant(metadata.SerializeToString())
+    else:
+      serialized_metadata = metadata
+
+    def record():
+      """Record the actual summary and return True."""
+      # Note the identity to move the tensor to the CPU.
+      with ops.device("cpu:0"):
+        write_summary_op = gen_summary_ops.write_summary(
+            context.context().summary_writer_resource,
+            step,
+            array_ops.identity(tensor),
+            tag,
+            serialized_metadata,
+            name=scope)
+        with ops.control_dependencies([write_summary_op]):
+          return constant_op.constant(True)
+
+    return smart_cond.smart_cond(
+        _should_record_summaries_v2(), record, _nothing, name="summary_cond")
+
+
 def summary_writer_function(name, tensor, function, family=None):
   """Helper function to write summaries.
 
@@ -630,3 +732,105 @@ def _choose_step(step):
   if not isinstance(step, ops.Tensor):
     return ops.convert_to_tensor(step, dtypes.int64)
   return step
+
+
+def run_metadata(name, data, step):
+  """Writes entire RunMetadata summary.
+
+  A RunMetadata can contain DeviceStats, partition graphs, and function graphs.
+  Please refer to the proto for definition of each field.
+
+  Args:
+    name: A name for this summary. The summary tag used for TensorBoard will be
+      this name prefixed by any active name scopes.
+    data: A RunMetadata proto to write.
+    step: Required `int64`-castable monotonic step value.
+
+  Returns:
+    True on success, or false if no summary was written because no default
+    summary writer was available.
+  """
+  summary_metadata = summary_pb2.SummaryMetadata()
+  # Hard coding a plugin name. Please refer to go/tb-plugin-name-hardcode for
+  # the rationale.
+  summary_metadata.plugin_data.plugin_name = "graph_run_metadata"
+  # version number = 1
+  summary_metadata.plugin_data.content = b"1"
+
+  with summary_scope(name,
+                     "graph_run_metadata_summary",
+                     [data, step]) as (tag, _):
+    return write(
+        tag=tag,
+        tensor=constant_op.constant(
+            data.SerializeToString(), dtype=dtypes.string),
+        step=step,
+        metadata=summary_metadata)
+
+
+def run_metadata_graphs(name, data, step):
+  """Writes graphs from a RunMetadata summary.
+
+  Args:
+    name: A name for this summary. The summary tag used for TensorBoard will be
+      this name prefixed by any active name scopes.
+    data: A RunMetadata proto to write.
+    step: Required `int64`-castable monotonic step value.
+
+  Returns:
+    True on success, or false if no summary was written because no default
+    summary writer was available.
+  """
+  summary_metadata = summary_pb2.SummaryMetadata()
+  # Hard coding a plugin name. Please refer to go/tb-plugin-name-hardcode for
+  # the rationale.
+  summary_metadata.plugin_data.plugin_name = "graph_run_metadata_graph"
+  # version number = 1
+  summary_metadata.plugin_data.content = b"1"
+
+  data = config_pb2.RunMetadata(
+      function_graphs=data.function_graphs,
+      partition_graphs=data.partition_graphs)
+
+  with summary_scope(name,
+                     "graph_run_metadata_graph_summary",
+                     [data, step]) as (tag, _):
+    return write(
+        tag=tag,
+        tensor=constant_op.constant(
+            data.SerializeToString(), dtype=dtypes.string),
+        step=step,
+        metadata=summary_metadata)
+
+
+def keras_model(name, data, step):
+  """Writes a Keras model as JSON to as a Summary.
+
+  Writing the Keras model configuration allows the TensorBoard graph plugin to
+  render a conceptual graph, as opposed to graph of ops.
+
+  Args:
+    name: A name for this summary. The summary tag used for TensorBoard will be
+      this name prefixed by any active name scopes.
+    data: A Keras Model to write.
+    step: Required `int64`-castable monotonic step value.
+
+  Returns:
+    True on success, or false if no summary was written because no default
+    summary writer was available.
+  """
+  summary_metadata = summary_pb2.SummaryMetadata()
+  # Hard coding a plugin name. Please refer to go/tb-plugin-name-hardcode for
+  # the rationale.
+  summary_metadata.plugin_data.plugin_name = "graph_keras_model"
+  # version number = 1
+  summary_metadata.plugin_data.content = b"1"
+
+  json_string = data.to_json()
+
+  with summary_scope(name, "graph_keras_model", [data, step]) as (tag, _):
+    return write(
+        tag=tag,
+        tensor=constant_op.constant(json_string, dtype=dtypes.string),
+        step=step,
+        metadata=summary_metadata)
diff --git a/tensorflow/python/ops/template.py b/tensorflow/python/ops/template.py
index 7c2d3be338766a4e25a817f824e06c665059bc01..ff4f23a0e75157f00167bdd6d9001fc6fa53a0a0 100644
--- a/tensorflow/python/ops/template.py
+++ b/tensorflow/python/ops/template.py
@@ -26,8 +26,8 @@ from tensorflow.python.eager import function
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training.checkpointable import base as checkpointable
-from tensorflow.python.training.checkpointable import util as checkpointable_util
+from tensorflow.python.training.tracking import base as trackable
+from tensorflow.python.training.tracking import util as trackable_util
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util.deprecation import deprecated
@@ -232,7 +232,7 @@ def _skip_common_stack_elements(stacktrace, base_case):
   return stacktrace[-1:]
 
 
-class Template(checkpointable.CheckpointableBase):
+class Template(trackable.Trackable):
   """Wrap a function to aid in variable sharing.
 
   Templates are functions that create variables the first time they are called
@@ -306,8 +306,8 @@ class Template(checkpointable.CheckpointableBase):
         result = self._func(*args, **kwargs)
       else:
         # The first time we run, restore variables if necessary (via
-        # Checkpointable).
-        with checkpointable_util.capture_dependencies(template=self):
+        # Trackable).
+        with trackable_util.capture_dependencies(template=self):
           result = self._func(*args, **kwargs)
 
       if self._variables_created:
@@ -387,8 +387,11 @@ class Template(checkpointable.CheckpointableBase):
     """Returns the variable scope name created by this Template."""
     if self._variable_scope:
       name = self._variable_scope.name
-      # To prevent partial matches on the scope_name, we add '/' at the end.
-      return name if name[-1] == "/" else name + "/"
+      if not name or name[-1] == "/":
+        return name
+      else:
+        # To prevent partial matches on the scope_name, we add '/' at the end.
+        return name + "/"
 
   @property
   def variables(self):
@@ -574,8 +577,8 @@ class EagerTemplate(Template):
         result = self._func(*args, **kwargs)
       else:
         # The first time we run, restore variables if necessary (via
-        # Checkpointable).
-        with checkpointable_util.capture_dependencies(template=self):
+        # Trackable).
+        with trackable_util.capture_dependencies(template=self):
           result = self._func(*args, **kwargs)
 
       if self._variables_created:
@@ -646,29 +649,6 @@ class EagerTemplate(Template):
         with self._template_store.as_default():
           return self._call_func(args, kwargs)
 
-  @property
-  def name(self):
-    """Returns the name given to this Template."""
-    return self._name
-
-  @property
-  def func(self):
-    """Returns the func given to this Template."""
-    return self._func
-
-  @property
-  def variable_scope(self):
-    """Returns the variable scope object created by this Template."""
-    return self._variable_scope
-
-  @property
-  def variable_scope_name(self):
-    """Returns the variable scope name created by this Template."""
-    if self._variable_scope:
-      name = self._variable_scope.name
-      # To prevent partial matches on the scope_name, we add '/' at the end.
-      return name if name[-1] == "/" else name + "/"
-
   @property
   def variables(self):
     """Returns the list of variables created by the Template."""
diff --git a/tensorflow/python/ops/tensor_array_ops.py b/tensorflow/python/ops/tensor_array_ops.py
index d1516949517f1f5df9291add96756eeacea29f51..65dee7797cf82c5a834f38b3d1a9efe95230a707 100644
--- a/tensorflow/python/ops/tensor_array_ops.py
+++ b/tensorflow/python/ops/tensor_array_ops.py
@@ -20,10 +20,8 @@ from __future__ import division
 from __future__ import print_function
 
 import contextlib
-import os
 import weakref
 
-from tensorflow.python import tf2
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -32,6 +30,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import gen_control_flow_ops
 from tensorflow.python.ops import gen_data_flow_ops
 from tensorflow.python.ops import list_ops
@@ -40,10 +39,6 @@ from tensorflow.python.util import tf_should_use
 from tensorflow.python.util.tf_export import tf_export
 
 
-ENABLE_TENSOR_ARRAY_V2 = (
-    tf2.enabled() or os.getenv("TF_ENABLE_TENSOR_ARRAY_V2") is not None)
-
-
 # _GraphTensorArray accesses many of the hidden generated ops, but is in
 # fact built to wrap these methods.
 # pylint: disable=protected-access
@@ -119,6 +114,7 @@ class _GraphTensorArray(object):
 
     if clear_after_read is None:
       clear_after_read = True
+    self._dynamic_size = None
     dynamic_size = dynamic_size or False
 
     self._dtype = dtype
@@ -226,7 +222,9 @@ class _GraphTensorArray(object):
     """See TensorArray."""
     flow = array_ops.identity(self._flow)
     ta = TensorArray(
-        dtype=self._dtype, handle=self._handle, flow=flow,
+        dtype=self._dtype,
+        handle=self._handle,
+        flow=flow,
         infer_shape=self._infer_shape,
         colocate_with_first_write_call=self._colocate_with_first_write_call)
     ta._element_shape = self._element_shape
@@ -283,7 +281,9 @@ class _GraphTensorArray(object):
             flow_in=self._flow,
             name=name)
       ta = TensorArray(
-          dtype=self._dtype, handle=self._handle, flow=flow_out,
+          dtype=self._dtype,
+          handle=self._handle,
+          flow=flow_out,
           colocate_with_first_write_call=self._colocate_with_first_write_call)
       ta._infer_shape = self._infer_shape
       ta._element_shape = self._element_shape
@@ -354,7 +354,9 @@ class _GraphTensorArray(object):
             flow_in=self._flow,
             name=name)
       ta = TensorArray(
-          dtype=self._dtype, handle=self._handle, flow=flow_out,
+          dtype=self._dtype,
+          handle=self._handle,
+          flow=flow_out,
           colocate_with_first_write_call=self._colocate_with_first_write_call)
       ta._infer_shape = self._infer_shape
       ta._element_shape = self._element_shape
@@ -383,7 +385,9 @@ class _GraphTensorArray(object):
             flow_in=self._flow,
             name=name)
       ta = TensorArray(
-          dtype=self._dtype, handle=self._handle, flow=flow_out,
+          dtype=self._dtype,
+          handle=self._handle,
+          flow=flow_out,
           colocate_with_first_write_call=self._colocate_with_first_write_call)
       ta._infer_shape = self._infer_shape
       ta._element_shape = self._element_shape
@@ -453,7 +457,7 @@ class _GraphTensorArrayV2(object):
     del tensor_array_name
     del colocate_with_first_write_call
 
-    del dynamic_size  # TODO(b/117943489): Unused for now.
+    self._dynamic_size = dynamic_size
 
     if (flow is not None and
         (not isinstance(flow, ops.Tensor) or flow.dtype != dtypes.variant)):
@@ -530,10 +534,7 @@ class _GraphTensorArrayV2(object):
   def identity(self):
     """See TensorArray."""
     flow = array_ops.identity(self._flow)
-    ta = TensorArray(
-        dtype=self._dtype, flow=flow, infer_shape=self._infer_shape)
-    ta._element_shape = self._element_shape
-    return ta
+    return build_ta_with_new_flow(self, flow)
 
   def grad(self, source, flow=None, name=None):
     """Not supported."""
@@ -541,14 +542,20 @@ class _GraphTensorArrayV2(object):
 
   def read(self, index, name=None):
     """See TensorArray."""
-    value = list_ops.tensor_list_get_item(
-        input_handle=self._flow,
-        index=index,
-        element_dtype=self._dtype,
-        name=name)
-    if self._element_shape:
-      value.set_shape(self._element_shape[0].dims)
-    return value
+    with ops.name_scope(name, "TensorArrayV2Read", [self._flow, index]):
+      if self._element_shape:
+        element_shape = self._element_shape[0]
+      else:
+        element_shape = tensor_shape.TensorShape(None)
+      value = list_ops.tensor_list_get_item(
+          input_handle=self._flow,
+          index=index,
+          element_dtype=self._dtype,
+          element_shape=element_shape,
+          name=name)
+      if self._element_shape:
+        value.set_shape(self._element_shape[0].dims)
+      return value
 
   @tf_should_use.should_use_result
   def write(self, index, value, name=None):
@@ -558,27 +565,39 @@ class _GraphTensorArrayV2(object):
       if self._infer_shape:
         self._merge_element_shape(value.shape)
       flow_out = list_ops.tensor_list_set_item(
-          input_handle=self._flow, index=index, item=value, name=name)
-      ta = TensorArray(dtype=self._dtype, handle=None, flow=flow_out)
-      ta._infer_shape = self._infer_shape
-      ta._element_shape = self._element_shape
-      return ta
+          input_handle=self._flow,
+          index=index,
+          item=value,
+          resize_if_index_out_of_bounds=self._dynamic_size,
+          name=name)
+      return build_ta_with_new_flow(self, flow_out)
 
   def stack(self, name=None):
     """See TensorArray."""
     with ops.name_scope(name, "TensorArrayV2Stack", [self._flow]):
+      if self._element_shape:
+        element_shape = self._element_shape[0]
+      else:
+        element_shape = tensor_shape.TensorShape(None)
       value = list_ops.tensor_list_stack(
-          input_handle=self._flow, element_dtype=self._dtype)
+          input_handle=self._flow,
+          element_dtype=self._dtype,
+          element_shape=element_shape)
       if self._element_shape and self._element_shape[0].dims is not None:
         value.set_shape([None] + self._element_shape[0].dims)
       return value
 
   def gather(self, indices, name=None):
     """See TensorArray."""
+    if self._element_shape:
+      element_shape = self._element_shape[0]
+    else:
+      element_shape = tensor_shape.TensorShape(None)
     value = list_ops.tensor_list_gather(
         input_handle=self._flow,
         indices=indices,
         element_dtype=self._dtype,
+        element_shape=element_shape,
         name=name)
     if self._element_shape and self._element_shape[0].dims is not None:
       value.set_shape([None] + self._element_shape[0].dims)
@@ -586,10 +605,16 @@ class _GraphTensorArrayV2(object):
 
   def concat(self, name=None):
     """See TensorArray."""
-    value = list_ops.tensor_list_concat(
-        input_handle=self._flow, element_dtype=self._dtype, name=name)
     if self._element_shape and self._element_shape[0].dims is not None:
-      value.set_shape([None] + self._element_shape[0].dims[1:])
+      element_shape = [None] + self._element_shape[0].dims[1:]
+    else:
+      element_shape = None
+
+    value = list_ops.tensor_list_concat(
+        input_handle=self._flow,
+        element_dtype=self._dtype,
+        element_shape=element_shape,
+        name=name)
     return value
 
   @tf_should_use.should_use_result
@@ -601,15 +626,7 @@ class _GraphTensorArrayV2(object):
         self._merge_element_shape(value.shape[1:])
       flow_out = list_ops.tensor_list_from_tensor(
           tensor=value, element_shape=value.shape[1:])
-      ta = TensorArray(
-          dtype=self._dtype,
-          handle=self.handle,
-          flow=flow_out,
-          colocate_with_first_write_call=self._colocate_with_first_write_call)
-      ta._infer_shape = self._infer_shape
-      ta._element_shape = self._element_shape
-      ta._colocate_with = self._colocate_with
-      return ta
+      return build_ta_with_new_flow(self, flow_out)
 
   @tf_should_use.should_use_result
   def scatter(self, indices, value, name=None):
@@ -619,17 +636,10 @@ class _GraphTensorArrayV2(object):
       value = ops.convert_to_tensor(value, name="value")
       if self._infer_shape and not context.executing_eagerly():
         self._merge_element_shape(value.shape[1:])
+      element_shape = self._element_shape[0] if self._element_shape else None
       flow_out = list_ops.tensor_list_scatter(
-          tensor=value, indices=indices, element_shape=-1)
-      ta = TensorArray(
-          dtype=self._dtype,
-          handle=self.handle,
-          flow=flow_out,
-          colocate_with_first_write_call=self._colocate_with_first_write_call)
-      ta._infer_shape = self._infer_shape
-      ta._element_shape = self._element_shape
-      ta._colocate_with = self._colocate_with
-      return ta
+          tensor=value, indices=indices, input_handle=self._flow)
+      return build_ta_with_new_flow(self, flow_out)
 
   @tf_should_use.should_use_result
   def split(self, value, lengths, name=None):
@@ -649,15 +659,7 @@ class _GraphTensorArrayV2(object):
           lengths=lengths_64,
           element_shape=self._element_shape[0] if self._element_shape else None,
           name=name)
-      ta = TensorArray(
-          dtype=self._dtype,
-          handle=self.handle,
-          flow=flow_out,
-          colocate_with_first_write_call=self._colocate_with_first_write_call)
-      ta._infer_shape = self._infer_shape
-      ta._element_shape = self._element_shape
-      ta._colocate_with = self._colocate_with
-      return ta
+      return build_ta_with_new_flow(self, flow_out)
 
   def size(self, name=None):
     """See TensorArray."""
@@ -834,7 +836,7 @@ class _EagerTensorArray(object):
     if self._infer_shape:
       if self._element_shape is None:
         self._element_shape = value.shape
-      elif self._element_shape != value.shape:
+      elif not self._element_shape.is_compatible_with(value.shape):
         raise ValueError("Incompatible shape for value (%s), expected (%s)" %
                          (value.shape.as_list(), self._element_shape.as_list()))
 
@@ -863,12 +865,15 @@ class _EagerTensorArray(object):
     if self._tensor_array:
       for ix in range(len(self._tensor_array)):
         self._maybe_zero(ix)
-    return array_ops.stack(self._tensor_array, name=name)
+    return ops.convert_to_tensor(
+        self._tensor_array, name=name, dtype=self._dtype)
 
   def gather(self, indices, name=None):
     """See TensorArray."""
     del name  # not meaningful when executing eagerly.
-    return array_ops.stack([self._maybe_zero(i) for i in indices.numpy()])
+    if isinstance(indices, ops.EagerTensor):
+      indices = indices.numpy()
+    return array_ops.stack([self._maybe_zero(i) for i in indices])
 
   def concat(self, name=None):
     """See TensorArray."""
@@ -901,7 +906,9 @@ class _EagerTensorArray(object):
   def scatter(self, indices, value, name=None):
     """See TensorArray."""
     del name  # not meaningful when executing eagerly.
-    for index, val in zip(indices.numpy(), array_ops.unstack(value)):
+    if isinstance(indices, ops.EagerTensor):
+      indices = indices.numpy()
+    for index, val in zip(indices, array_ops.unstack(value)):
       self._write(index, val)  # pylint: disable=protected-access
     return self.parent()
 
@@ -1013,7 +1020,7 @@ class TensorArray(object):
     if context.executing_eagerly():
       implementation = _EagerTensorArray
     else:
-      if ENABLE_TENSOR_ARRAY_V2:
+      if control_flow_util.EnableControlFlowV2(ops.get_default_graph()):
         implementation = _GraphTensorArrayV2
       else:
         implementation = _GraphTensorArray
@@ -1047,6 +1054,10 @@ class TensorArray(object):
     """The reference to the TensorArray."""
     return self._implementation.handle
 
+  @property
+  def _dynamic_size(self):
+    return self._implementation._dynamic_size
+
   @property
   def _infer_shape(self):
     return self._implementation._infer_shape
@@ -1232,8 +1243,10 @@ class TensorArray(object):
 
 
 def build_ta_with_new_flow(old_ta, flow):
+  """Builds a TensorArray with a new `flow` tensor."""
   ta = TensorArray(
       dtype=old_ta.dtype,
+      dynamic_size=old_ta._dynamic_size,
       handle=old_ta.handle,
       flow=flow,
       infer_shape=old_ta._infer_shape,
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index ccce9e2f93bac26a69d8cadab9ece4cc2482c4e1..35c00778ae5c99cb5688c9ff1fa97b26c72dc855 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -842,8 +842,11 @@ class _VariableStore(object):
         if isinstance(var, resource_variable_ops.ResourceVariable):
           raise ValueError(err_msg)
         tb = var.op.traceback[::-1]
-        # Throw away internal tf entries and only take a few lines.
-        tb = [x for x in tb if "tensorflow/python" not in x[0]][:3]
+        # Throw away internal tf entries and only take a few lines. In some
+        # cases the traceback can be longer (e.g. if someone uses factory
+        # functions to create variables) so we take more than needed in the
+        # default case.
+        tb = [x for x in tb if "tensorflow/python" not in x[0]][:5]
         raise ValueError("%s Originally defined at:\n\n%s" % (err_msg, "".join(
             traceback.format_list(tb))))
       found_var = self._vars[name]
@@ -2480,12 +2483,13 @@ def default_variable_creator(next_creator=None, **kwargs):
     use_resource = _DEFAULT_USE_RESOURCE
   use_resource = use_resource or context.executing_eagerly()
   if use_resource:
+    distribute_strategy = kwargs.get("distribute_strategy", None)
     return resource_variable_ops.ResourceVariable(
         initial_value=initial_value, trainable=trainable,
         collections=collections, validate_shape=validate_shape,
         caching_device=caching_device, name=name, dtype=dtype,
         constraint=constraint, variable_def=variable_def,
-        import_scope=import_scope)
+        import_scope=import_scope, distribute_strategy=distribute_strategy)
   else:
     return variables.RefVariable(
         initial_value=initial_value, trainable=trainable,
@@ -2507,6 +2511,7 @@ def default_variable_creator_v2(next_creator=None, **kwargs):
   dtype = kwargs.get("dtype", None)
   import_scope = kwargs.get("import_scope", None)
   constraint = kwargs.get("constraint", None)
+  distribute_strategy = kwargs.get("distribute_strategy", None)
 
   # Set trainable value based on synchronization value.
   synchronization = kwargs.get("synchronization", VariableSynchronization.AUTO)
@@ -2517,7 +2522,7 @@ def default_variable_creator_v2(next_creator=None, **kwargs):
       initial_value=initial_value, trainable=trainable,
       validate_shape=validate_shape, caching_device=caching_device,
       name=name, dtype=dtype, constraint=constraint, variable_def=variable_def,
-      import_scope=import_scope)
+      import_scope=import_scope, distribute_strategy=distribute_strategy)
 
 
 variables.default_variable_creator = default_variable_creator
diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py
index a31ce655183f8fb7e6331c2d6a4b3af8076902c8..219ba7fbb2ef8de3f0ebf020b95b1b3c945cc12d 100644
--- a/tensorflow/python/ops/variables.py
+++ b/tensorflow/python/ops/variables.py
@@ -35,7 +35,7 @@ from tensorflow.python.ops import gen_state_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import compat
 from tensorflow.python.util import tf_should_use
 from tensorflow.python.util.deprecation import deprecated
@@ -59,21 +59,6 @@ def _make_getter(captured_getter, captured_previous):
   return getter
 
 
-def _has_cycle(op, path):
-  """Detect cycles in the dependencies of `initial_value`."""
-  if op.name in path:
-    return True
-  path.add(op.name)
-  for op_input in op.inputs:
-    if _has_cycle(op_input.op, path):
-      return True
-  for op_control_input in op.control_inputs:
-    if _has_cycle(op_control_input, path):
-      return True
-  path.remove(op.name)
-  return False
-
-
 @tf_export("VariableSynchronization")
 class VariableSynchronization(enum.Enum):
   """Indicates when a distributed variable will be synced.
@@ -153,7 +138,7 @@ class VariableMetaclass(type):
                         aggregation=VariableAggregation.NONE):
     """Call on Variable class. Useful to force the signature."""
     previous_getter = lambda **kwargs: default_variable_creator(None, **kwargs)
-    for getter in ops.get_default_graph()._variable_creator_stack:  # pylint: disable=protected-access
+    for _, getter in ops.get_default_graph()._variable_creator_stack:  # pylint: disable=protected-access
       previous_getter = _make_getter(getter, previous_getter)
 
     # Reset `aggregation` that is explicitly set as `None` to the enum NONE.
@@ -189,7 +174,7 @@ class VariableMetaclass(type):
                         aggregation=VariableAggregation.NONE):
     """Call on Variable class. Useful to force the signature."""
     previous_getter = lambda **kws: default_variable_creator_v2(None, **kws)
-    for getter in ops.get_default_graph()._variable_creator_stack:  # pylint: disable=protected-access
+    for _, getter in ops.get_default_graph()._variable_creator_stack:  # pylint: disable=protected-access
       previous_getter = _make_getter(getter, previous_getter)
 
     # Reset `aggregation` that is explicitly set as `None` to the enum NONE.
@@ -219,7 +204,7 @@ class VariableMetaclass(type):
 
 @tf_export("Variable", v1=[])
 class Variable(six.with_metaclass(VariableMetaclass,
-                                  checkpointable.CheckpointableBase)):
+                                  trackable.Trackable)):
   """See the [Variables Guide](https://tensorflow.org/guide/variables).
 
   A variable maintains state in the graph across calls to `run()`. You add a
@@ -319,8 +304,9 @@ class Variable(six.with_metaclass(VariableMetaclass,
   tf.cond(v, lambda: v.assign(False), my_false_fn)  # Note: this is broken.
   ```
 
-  Here replacing adding `use_resource=True` when constructing the variable will
+  Here, adding `use_resource=True` when constructing the variable will
   fix any nondeterminism issues:
+
   ```
   v = tf.Variable(True, use_resource=True)
   tf.cond(v, lambda: v.assign(False), my_false_fn)
@@ -487,6 +473,10 @@ class Variable(six.with_metaclass(VariableMetaclass,
     """
     raise NotImplementedError
 
+  @deprecated(
+      None,
+      "Use Variable.read_value. Variables in 2.X are initialized "
+      "automatically both in eager and graph (inside tf.defun) contexts.")
   def initialized_value(self):
     """Returns the value of the initialized variable.
 
@@ -506,7 +496,10 @@ class Variable(six.with_metaclass(VariableMetaclass,
       A `Tensor` holding the value of this variable after its initializer
       has run.
     """
-    raise NotImplementedError
+    with ops.init_scope():
+      return control_flow_ops.cond(is_variable_initialized(self),
+                                   self.read_value,
+                                   lambda: self.initial_value)
 
   @property
   def initial_value(self):
@@ -837,6 +830,7 @@ class Variable(six.with_metaclass(VariableMetaclass,
     """
     raise NotImplementedError
 
+  @deprecated(None, "Prefer Dataset.range instead.")
   def count_up_to(self, limit):
     """Increments this variable until it reaches `limit`.
 
@@ -859,6 +853,9 @@ class Variable(six.with_metaclass(VariableMetaclass,
     """
     raise NotImplementedError
 
+  @deprecated(
+      None,
+      "Prefer Variable.assign which has equivalent behavior in 2.X.")
   def load(self, value, session=None):
     """Load new value into this variable.
 
@@ -892,7 +889,15 @@ class Variable(six.with_metaclass(VariableMetaclass,
     Raises:
         ValueError: Session is not passed and no default session
     """
-    raise NotImplementedError
+    if context.executing_eagerly():
+      self.assign(value)
+    else:
+      session = session or ops.get_default_session()
+      if session is None:
+        raise ValueError(
+            "Either session argument should be provided or default session "
+            "should be established")
+      session.run(self.initializer, {self.initializer.inputs[1]: value})
 
   # Conversion to tensor.
   @staticmethod
@@ -931,7 +936,7 @@ class Variable(six.with_metaclass(VariableMetaclass,
 
     def _run_op(a, *args, **kwargs):
       # pylint: disable=protected-access
-      return tensor_oper(a._AsTensor(), *args, **kwargs)
+      return tensor_oper(a.value(), *args, **kwargs)
 
     functools.update_wrapper(_run_op, tensor_oper)
     setattr(cls, operator, _run_op)
@@ -962,6 +967,18 @@ class Variable(six.with_metaclass(VariableMetaclass,
     """The name of this variable."""
     raise NotImplementedError
 
+  @property
+  def _shared_name(self):
+    """The shared name of the variable.
+
+      Unlike name(), shared_name doesn't have ":0" suffix. It is user-specified
+      name with name scope prefix.
+
+    Returns:
+      variable name.
+    """
+    return self.name[:self.name.index(":")]
+
   @property
   def initializer(self):
     """The initializer operation for this variable."""
@@ -997,8 +1014,12 @@ class Variable(six.with_metaclass(VariableMetaclass,
     raise NotImplementedError
 
   def get_shape(self):
-    """Alias of Variable.shape."""
-    raise NotImplementedError
+    """Alias of `Variable.shape`."""
+    return self.shape
+
+  def _gather_saveables_for_checkpoint(self):
+    """For implementing `Trackable`. This object is saveable on its own."""
+    return {trackable.VARIABLE_VALUE_KEY: self}
 
   def to_proto(self, export_scope=None):
     """Converts a `Variable` to a `VariableDef` protocol buffer.
@@ -1018,6 +1039,17 @@ class Variable(six.with_metaclass(VariableMetaclass,
     return RefVariable(variable_def=variable_def,
                        import_scope=import_scope)
 
+  def _set_save_slice_info(self, save_slice_info):
+    """Sets the slice info for this `Variable`.
+
+    Args:
+      save_slice_info: A `Variable.SaveSliceInfo` object.
+    """
+    self._save_slice_info = save_slice_info
+
+  def _get_save_slice_info(self):
+    return self._save_slice_info
+
   class SaveSliceInfo(object):
     """Information on how to save this Variable as a slice.
 
@@ -1103,6 +1135,9 @@ class Variable(six.with_metaclass(VariableMetaclass,
         return None
 
 
+Variable._OverloadAllOperators()  # pylint: disable=protected-access
+
+
 @tf_export(v1=["Variable"])
 class VariableV1(Variable):
   """See the [Variables Guide](https://tensorflow.org/guide/variables).
@@ -1204,7 +1239,7 @@ class VariableV1(Variable):
   tf.cond(v, lambda: v.assign(False), my_false_fn)  # Note: this is broken.
   ```
 
-  Here replacing adding `use_resource=True` when constructing the variable will
+  Here, adding `use_resource=True` when constructing the variable will
   fix any nondeterminism issues:
   ```
   v = tf.Variable(True, use_resource=True)
@@ -1471,8 +1506,8 @@ class RefVariable(VariableV1):
     # Store the graph key so optimizers know how to only retrieve variables from
     # this graph.
     self._graph_key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
-    if isinstance(initial_value, checkpointable.CheckpointInitialValue):
-      self._maybe_initialize_checkpointable()
+    if isinstance(initial_value, trackable.CheckpointInitialValue):
+      self._maybe_initialize_trackable()
       self._update_uid = initial_value.checkpoint_position.restore_uid
       initial_value = initial_value.wrapped_value
 
@@ -1541,7 +1576,8 @@ class RefVariable(VariableV1):
         # using their initialized_value() method.
         self._initializer_op = state_ops.assign(
             self._variable,
-            self._try_guard_against_uninitialized_dependencies(
+            _try_guard_against_uninitialized_dependencies(
+                name,
                 self._initial_value),
             validate_shape=validate_shape).op
 
@@ -1602,16 +1638,6 @@ class RefVariable(VariableV1):
     """Conversion function for Graph.as_graph_element()."""
     return self._variable
 
-  def _AsTensor(self):  # pylint: disable=invalid-name
-    """Converts this variable to a Tensor.
-
-    See `tf.Variable.value`.
-
-    Returns:
-      A `Tensor` containing the value of the variable.
-    """
-    return self._snapshot
-
   def value(self):
     """Returns the last snapshot of this variable.
 
@@ -1703,30 +1729,6 @@ class RefVariable(VariableV1):
     """
     return self._variable.eval(session=session)
 
-  def initialized_value(self):
-    """Returns the value of the initialized variable.
-
-    You should use this instead of the variable itself to initialize another
-    variable with a value that depends on the value of this variable.
-
-    ```python
-    # Initialize 'v' with a random tensor.
-    v = tf.Variable(tf.truncated_normal([10, 40]))
-    # Use `initialized_value` to guarantee that `v` has been
-    # initialized before its value is used to initialize `w`.
-    # The random values are picked only once.
-    w = tf.Variable(v.initialized_value() * 2.0)
-    ```
-
-    Returns:
-      A `Tensor` holding the value of this variable after its initializer
-      has run.
-    """
-    with ops.init_scope():
-      return control_flow_ops.cond(is_variable_initialized(self),
-                                   self.read_value,
-                                   lambda: self.initial_value)
-
   @property
   def initial_value(self):
     """Returns the Tensor used as the initial value for the variable.
@@ -2117,6 +2119,7 @@ class RefVariable(VariableV1):
                                               new_axis_mask=new_axis_mask,
                                               shrink_axis_mask=shrink_axis_mask)
 
+  @deprecated(None, "Prefer Dataset.range instead.")
   def count_up_to(self, limit):
     """Increments this variable until it reaches `limit`.
 
@@ -2139,49 +2142,6 @@ class RefVariable(VariableV1):
     """
     return state_ops.count_up_to(self._variable, limit=limit)
 
-  def load(self, value, session=None):
-    """Load new value into this variable.
-
-    Writes new value to variable's memory. Doesn't add ops to the graph.
-
-    This convenience method requires a session where the graph
-    containing this variable has been launched. If no session is
-    passed, the default session is used.  See `tf.Session` for more
-    information on launching a graph and on sessions.
-
-    ```python
-    v = tf.Variable([1, 2])
-    init = tf.global_variables_initializer()
-
-    with tf.Session() as sess:
-        sess.run(init)
-        # Usage passing the session explicitly.
-        v.load([2, 3], sess)
-        print(v.eval(sess)) # prints [2 3]
-        # Usage with the default session.  The 'with' block
-        # above makes 'sess' the default session.
-        v.load([3, 4], sess)
-        print(v.eval()) # prints [3 4]
-    ```
-
-    Args:
-        value: New variable value
-        session: The session to use to evaluate this variable. If
-          none, the default session is used.
-
-    Raises:
-        ValueError: Session is not passed and no default session
-    """
-    if context.executing_eagerly():
-      self.assign(value)
-    else:
-      session = session or ops.get_default_session()
-      if session is None:
-        raise ValueError(
-            "Either session argument should be provided or default session "
-            "should be established")
-      session.run(self._initializer_op, {self._initializer_op.inputs[1]: value})
-
   # Conversion to tensor.
   @staticmethod
   def _TensorConversionFunction(v, dtype=None, name=None, as_ref=False):  # pylint: disable=invalid-name
@@ -2196,134 +2156,6 @@ class RefVariable(VariableV1):
     else:
       return v.value()
 
-  def _gather_saveables_for_checkpoint(self):
-    """For implementing `Checkpointable`. This object is saveable on its own."""
-    return {checkpointable.VARIABLE_VALUE_KEY: self}
-
-  def _try_guard_against_uninitialized_dependencies(self, initial_value):
-    """Attempt to guard against dependencies on uninitialized variables.
-
-    Replace references to variables in `initial_value` with references to the
-    variable's initialized values. The initialized values are essentially
-    conditional TensorFlow graphs that return a variable's value if it is
-    initialized or its `initial_value` if it hasn't been initialized. This
-    replacement is done on a best effort basis:
-
-    - If the `initial_value` graph contains cycles, we don't do any
-      replacements for that graph.
-    - If the variables that `initial_value` depends on are not present in the
-      `GLOBAL_VARIABLES` or `LOCAL_VARIABLES` we don't replace them.
-
-    In these cases, it is up to the caller to ensure that the `initial_value`
-    graph uses initialized variables or that they guard access to variables
-    using their `initialized_value` method.
-
-    Args:
-      initial_value: `Tensor`. The initial value.
-    Returns:
-      A `Tensor` suitable to initialize a variable.
-    Raises:
-      TypeError: If `initial_value` is not a `Tensor`.
-    """
-    if not isinstance(initial_value, ops.Tensor):
-      raise TypeError("initial_value needs to be a Tensor: %s" % initial_value)
-
-    # Don't modify initial_value if it contains any cyclic dependencies.
-    if _has_cycle(initial_value.op, path=set()):
-      return initial_value
-
-    return self._safe_initial_value_from_tensor(initial_value, op_cache={})
-
-  def _safe_initial_value_from_tensor(self, tensor, op_cache):
-    """Replace dependencies on variables with their initialized values.
-
-    Args:
-      tensor: A `Tensor`. The tensor to replace.
-      op_cache: A dict mapping operation names to `Operation`s. Used to memoize
-        the results so as to avoid creating redundant operations.
-    Returns:
-      A `Tensor` compatible with `tensor`. Any inputs that lead to variable
-      values will be replaced with a corresponding graph that uses the
-      variable's initialized values. This is done on a best-effort basis. If no
-      modifications need to be made then `tensor` will be returned unchanged.
-    """
-    op = tensor.op
-    new_op = op_cache.get(op.name)
-    if new_op is None:
-      new_op = self._safe_initial_value_from_op(op, op_cache)
-      op_cache[op.name] = new_op
-    return new_op.outputs[tensor.value_index]
-
-  def _safe_initial_value_from_op(self, op, op_cache):
-    """Replace dependencies on variables with their initialized values.
-
-    Args:
-      op: An `Operation`. The operation to replace.
-      op_cache: A dict mapping operation names to `Operation`s. Used to memoize
-        the results so as to avoid creating redundant operations.
-    Returns:
-      An `Operation` compatible with `op`. Any inputs that lead to variable
-      values will be replaced with a corresponding graph that uses the
-      variable's initialized values. This is done on a best-effort basis. If no
-      modifications need to be made then `op` will be returned unchanged.
-    """
-    op_type = op.node_def.op
-    if op_type in ("IsVariableInitialized", "VarIsInitializedOp",
-                   "ReadVariableOp"):
-      return op
-
-    # Attempt to find the initialized_value of any variable reference / handles.
-    # TODO(b/70206927): Fix handling of ResourceVariables.
-    if op_type in ("Variable", "VariableV2", "VarHandleOp"):
-      initialized_value = self._find_initialized_value_for_variable(op)
-      return op if initialized_value is None else initialized_value.op
-
-    # Recursively build initializer expressions for inputs.
-    modified = False
-    new_op_inputs = []
-    for op_input in op.inputs:
-      new_op_input = self._safe_initial_value_from_tensor(op_input, op_cache)
-      new_op_inputs.append(new_op_input)
-      modified = modified or (new_op_input != op_input)
-
-    # If at least one input was modified, replace the op.
-    if modified:
-      new_op_type = op_type
-      if new_op_type == "RefSwitch":
-        new_op_type = "Switch"
-      new_op_name = op.node_def.name + "_" + self.name
-      new_op_name = new_op_name.replace(":", "_")
-      return self.graph.create_op(
-          new_op_type, new_op_inputs,
-          op._output_types,  # pylint: disable=protected-access
-          name=new_op_name, attrs=op.node_def.attr)
-
-    return op
-
-  def _find_initialized_value_for_variable(self, variable_op):
-    """Find the initialized value for a variable op.
-
-    To do so, lookup the variable op in the variables collection.
-
-    Args:
-      variable_op: A variable `Operation`.
-    Returns:
-      A `Tensor` representing the initialized value for the variable or `None`
-      if the initialized value could not be found.
-    """
-    try:
-      var_names = [variable_op.node_def.name, variable_op.node_def.name + ":0"]
-      for collection_name in (ops.GraphKeys.GLOBAL_VARIABLES,
-                              ops.GraphKeys.LOCAL_VARIABLES):
-        for var in self.graph.get_collection(collection_name):
-          if var.name in var_names:
-            return var.initialized_value()
-    except AttributeError:
-      # Return None when an incomplete user-defined variable type was put in
-      # the collection.
-      return None
-    return None
-
   # NOTE(mrry): This enables the Variable's overloaded "right" binary
   # operators to run when the left operand is an ndarray, because it
   # accords the Variable class higher priority than an ndarray, or a
@@ -2338,18 +2170,6 @@ class RefVariable(VariableV1):
     """The name of this variable."""
     return self._variable.name
 
-  @property
-  def _shared_name(self):
-    """The shared name of the variable.
-
-      Unlike name(), shared_name doesn't have ":0" suffix. It is user-specified
-      name with name scope prefix.
-
-    Returns:
-      variable name.
-    """
-    return self.name[:-2]
-
   @property
   def initializer(self):
     """The initializer operation for this variable."""
@@ -2375,6 +2195,11 @@ class RefVariable(VariableV1):
     """The `Graph` of this variable."""
     return self._variable.graph
 
+  @property
+  def _distribute_strategy(self):
+    """The `tf.distribute.Strategy` that this variable was created under."""
+    return None   # Ref variables are never created inside a strategy.
+
   @property
   def shape(self):
     """The `TensorShape` of this variable.
@@ -2384,10 +2209,6 @@ class RefVariable(VariableV1):
     """
     return self._variable.get_shape()
 
-  def get_shape(self):
-    """Alias of Variable.shape."""
-    return self.shape
-
   def to_proto(self, export_scope=None):
     """Converts a `Variable` to a `VariableDef` protocol buffer.
 
@@ -2475,16 +2296,150 @@ class RefVariable(VariableV1):
         " if you want a new python Tensor object.", 1)
     return self ** other
 
-  def _set_save_slice_info(self, save_slice_info):
-    """Sets the slice info for this `Variable`.
 
-    Args:
-      save_slice_info: A `Variable.SaveSliceInfo` object.
-    """
-    self._save_slice_info = save_slice_info
+def _try_guard_against_uninitialized_dependencies(name, initial_value):
+  """Attempt to guard against dependencies on uninitialized variables.
+
+  Replace references to variables in `initial_value` with references to the
+  variable's initialized values. The initialized values are essentially
+  conditional TensorFlow graphs that return a variable's value if it is
+  initialized or its `initial_value` if it hasn't been initialized. This
+  replacement is done on a best effort basis:
+
+  - If the `initial_value` graph contains cycles, we don't do any
+    replacements for that graph.
+  - If the variables that `initial_value` depends on are not present in the
+    `GLOBAL_VARIABLES` or `LOCAL_VARIABLES` we don't replace them.
+
+  In these cases, it is up to the caller to ensure that the `initial_value`
+  graph uses initialized variables or that they guard access to variables
+  using their `initialized_value` method.
+
+  Args:
+    name: Variable name.
+    initial_value: `Tensor`. The initial value.
+  Returns:
+    A `Tensor` suitable to initialize a variable.
+  Raises:
+    TypeError: If `initial_value` is not a `Tensor`.
+  """
+  if not isinstance(initial_value, ops.Tensor):
+    raise TypeError("initial_value needs to be a Tensor: %s" % initial_value)
+
+  # Don't modify initial_value if it contains any cyclic dependencies.
+  if _has_cycle(initial_value.op, path=set()):
+    return initial_value
+  return _safe_initial_value_from_tensor(name, initial_value, op_cache={})
 
-  def _get_save_slice_info(self):
-    return self._save_slice_info
+
+def _has_cycle(op, path):
+  """Detect cycles in the dependencies of `initial_value`."""
+  if op.name in path:
+    return True
+  path.add(op.name)
+  for op_input in op.inputs:
+    if _has_cycle(op_input.op, path):
+      return True
+  for op_control_input in op.control_inputs:
+    if _has_cycle(op_control_input, path):
+      return True
+  path.remove(op.name)
+  return False
+
+
+def _safe_initial_value_from_tensor(name, tensor, op_cache):
+  """Replace dependencies on variables with their initialized values.
+
+  Args:
+    name: Variable name.
+    tensor: A `Tensor`. The tensor to replace.
+    op_cache: A dict mapping operation names to `Operation`s. Used to memoize
+      the results so as to avoid creating redundant operations.
+  Returns:
+    A `Tensor` compatible with `tensor`. Any inputs that lead to variable
+    values will be replaced with a corresponding graph that uses the
+    variable's initialized values. This is done on a best-effort basis. If no
+    modifications need to be made then `tensor` will be returned unchanged.
+  """
+  op = tensor.op
+  new_op = op_cache.get(op.name)
+  if new_op is None:
+    new_op = _safe_initial_value_from_op(name, op, op_cache)
+    op_cache[op.name] = new_op
+  return new_op.outputs[tensor.value_index]
+
+
+def _safe_initial_value_from_op(name, op, op_cache):
+  """Replace dependencies on variables with their initialized values.
+
+  Args:
+    name: Variable name.
+    op: An `Operation`. The operation to replace.
+    op_cache: A dict mapping operation names to `Operation`s. Used to memoize
+      the results so as to avoid creating redundant operations.
+  Returns:
+    An `Operation` compatible with `op`. Any inputs that lead to variable
+    values will be replaced with a corresponding graph that uses the
+    variable's initialized values. This is done on a best-effort basis. If no
+    modifications need to be made then `op` will be returned unchanged.
+  """
+  op_type = op.node_def.op
+  if op_type in ("IsVariableInitialized", "VarIsInitializedOp",
+                 "ReadVariableOp"):
+    return op
+
+  # Attempt to find the initialized_value of any variable reference / handles.
+  # TODO(b/70206927): Fix handling of ResourceVariables.
+  if op_type in ("Variable", "VariableV2", "VarHandleOp"):
+    initialized_value = _find_initialized_value_for_variable(op)
+    return op if initialized_value is None else initialized_value.op
+
+  # Recursively build initializer expressions for inputs.
+  modified = False
+  new_op_inputs = []
+  for op_input in op.inputs:
+    new_op_input = _safe_initial_value_from_tensor(name, op_input, op_cache)
+    new_op_inputs.append(new_op_input)
+    modified = modified or (new_op_input != op_input)
+
+  # If at least one input was modified, replace the op.
+  if modified:
+    new_op_type = op_type
+    if new_op_type == "RefSwitch":
+      new_op_type = "Switch"
+    new_op_name = op.node_def.name + "_" + name
+    new_op_name = new_op_name.replace(":", "_")
+    return op.graph.create_op(
+        new_op_type, new_op_inputs,
+        op._output_types,  # pylint: disable=protected-access
+        name=new_op_name, attrs=op.node_def.attr)
+
+  return op
+
+
+def _find_initialized_value_for_variable(variable_op):
+  """Find the initialized value for a variable op.
+
+  To do so, lookup the variable op in the variables collection.
+
+  Args:
+    variable_op: A variable `Operation`.
+  Returns:
+    A `Tensor` representing the initialized value for the variable or `None`
+    if the initialized value could not be found.
+  """
+  try:
+    var_names = [variable_op.node_def.name, variable_op.node_def.name + ":0"]
+    for collection_name in (ops.GraphKeys.GLOBAL_VARIABLES,
+                            ops.GraphKeys.LOCAL_VARIABLES):
+      for var in variable_op.graph.get_collection(collection_name):
+        if var.name in var_names:
+          return var.initialized_value()
+  except AttributeError:
+    # Return None when an incomplete user-defined variable type was put in
+    # the collection.
+    return None
+  return None
 
 
 class PartitionedVariable(object):
@@ -2637,6 +2592,12 @@ class PartitionedVariable(object):
   def shape(self):
     return self.get_shape()
 
+  @property
+  def _distribute_strategy(self):
+    """The `tf.distribute.Strategy` that this variable was created under."""
+    # NOTE(yuefengz): Today, no partitioned variables in a distribute strategy.
+    return None
+
   def get_shape(self):
     return self._shape
 
@@ -2699,6 +2660,15 @@ class PartitionedVariable(object):
       return assign_list
     return [assign.op for assign in assign_list]
 
+
+# Register a conversion function which reads the value of the variable,
+# allowing instances of the class to be used as tensors.
+ops.register_tensor_conversion_function(
+    RefVariable,
+    RefVariable._TensorConversionFunction)  # pylint: disable=protected-access
+ops.register_dense_tensor_like_type(RefVariable)
+
+
 @tf_export(v1=["global_variables"])
 def global_variables(scope=None):
   """Returns global variables.
@@ -3022,12 +2992,7 @@ def report_uninitialized_variables(var_list=None,
         # uninitialized variables.
         return array_ops.boolean_mask(variable_names_tensor, variables_mask)
 
-# pylint: disable=protected-access
-Variable._OverloadAllOperators()
 
 ops.register_tensor_conversion_function(
-    PartitionedVariable, PartitionedVariable._TensorConversionFunction)
-# pylint: enable=protected-access
-
-
-ops.register_dense_tensor_like_type(Variable)
+    PartitionedVariable,
+    PartitionedVariable._TensorConversionFunction)  # pylint: disable=protected-access
diff --git a/tensorflow/python/ops/while_v2.py b/tensorflow/python/ops/while_v2.py
index d00c158d156b225553b52437324accd019c76aee..68f1cbae0af740b8e184c60c1b48fcfcc504a493 100644
--- a/tensorflow/python/ops/while_v2.py
+++ b/tensorflow/python/ops/while_v2.py
@@ -38,10 +38,12 @@ from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import control_flow_util_v2 as util
 from tensorflow.python.ops import custom_gradient
 from tensorflow.python.ops import gen_functional_ops
-from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import gen_resource_variable_ops
+from tensorflow.python.ops import gradients_util
 from tensorflow.python.ops import list_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import tensor_array_ops
+from tensorflow.python.ops import while_v2_indexed_slices_rewriter
 from tensorflow.python.util import nest
 
 # pylint: disable=protected-access
@@ -52,18 +54,12 @@ from tensorflow.python.util import nest
 # to them and then pass those in as data inputs. This should probably be
 # handled in the CapturingGraph itself.
 
-# Op types that output a resource tensor representing a TensorArray handle.
-TENSOR_ARRAY_HANDLE_OPS = (
-    "TensorArrayV3",
-    "TensorArrayGradV3",
-    "TensorArrayGradWithShape",
-)
-
 
 def while_loop(cond,
                body,
                loop_vars,
                shape_invariants=None,
+               parallel_iterations=10,
                maximum_iterations=None,
                name=None,
                return_same_structure=True):
@@ -106,7 +102,7 @@ def while_loop(cond,
 
     # Automatic control dependencies are added in defuns, but not in v1
     # graphs. Propagate that behavior here.
-    add_control_dependencies = util.in_defun()
+    add_control_dependencies = ops.get_default_graph()._add_control_dependencies
 
     # Build a `cond` wrapper that can handle the extra counter loop_var.
     def wrapped_cond(loop_counter, *args):
@@ -122,42 +118,39 @@ def while_loop(cond,
             loop_counter < maximum_iterations,
             cond(*_pack_sequence_as(orig_loop_vars, args)))
 
+    # NOTE(skyewm): we set collections to the outer graph's collections for
+    # compatibility with TPUEstimator.
     cond_graph = func_graph_module.func_graph_from_py_func(
         cond_name,
         wrapped_cond,
-        loop_vars, {},
+        [],  # We provide signature instead of args.
+        {},
         signature=_build_signature(loop_vars, shape_invariants),
-        func_graph=util.WhileCondFuncGraph(cond_name),
+        func_graph=util.WhileCondFuncGraph(
+            cond_name, collections=ops.get_default_graph()._collections),  # pylint: disable=protected-access
         add_control_dependencies=add_control_dependencies)
 
-    # Add external_captures of cond to the list of loop vars.
-    # Note that external tensors will be treated as loop invariants, i.e.,
-    # the value of that tensor in each iteration is the same as it was at the
-    # beginning of the loop execution.
-    loop_vars = loop_vars + cond_graph.external_captures
-    shape_invariants = shape_invariants + type(shape_invariants)(
-        [t.shape for t in cond_graph.external_captures])
-
     def wrapped_body(loop_counter, *args):
       """Loop body augmented with counter update.
 
       Args:
         loop_counter: Loop counter which needs to be incremented in the body.
         *args: List of args
-          args[:len_orig_loop_vars] - Args for the original loop body.
-          args[len_orig_loop_vars:] - External captures of cond. These get
-            passed through as is.
 
       Returns:
         A list of tensors the same length as args.
       """
+      # Capture the tensors already captured in cond_graph so that they appear
+      # in the same order in body_graph.external_captures.
+      for t in cond_graph.external_captures:
+        ops.get_default_graph().capture(t)
+
       # Convert the flow variables in `args` to TensorArrays. `args` should
       # already have the same structure as `orig_loop_vars` but currently there
       # is no nest.zip so we call `_pack_sequence_as` which flattens both
       # `orig_loop_vars` and `args`, converts flows in `args` to TensorArrays
       # and packs it into the structure of `orig_loop_vars`.
-      outputs = body(
-          *_pack_sequence_as(orig_loop_vars, args[:len_orig_loop_vars]))
+      outputs = body(*_pack_sequence_as(orig_loop_vars, args))
       if not nest.is_sequence(outputs):
         outputs = [outputs]
       # Compare the structure of input and output of body converting the
@@ -166,19 +159,18 @@ def while_loop(cond,
 
       outputs = _tensor_array_to_flow(outputs)
 
-      # Return the external_captures of cond_graph as is, i.e., treat them as
-      # loop invariants.
       # TODO(srbs): Update lowering code to create _Enter nodes with
       # is_constant=True for inputs that are directly passed to outputs.
-      return [loop_counter + 1] + list(outputs) + list(
-          args[len_orig_loop_vars:])
+      return [loop_counter + 1] + list(outputs)
 
     body_graph = func_graph_module.func_graph_from_py_func(
         body_name,
         wrapped_body,
-        loop_vars, {},
+        [],  # We provide signature instead of args.
+        {},
         signature=_build_signature(loop_vars, shape_invariants),
-        func_graph=util.WhileBodyFuncGraph(body_name),
+        func_graph=util.WhileBodyFuncGraph(
+            body_name, collections=ops.get_default_graph()._collections),  # pylint: disable=protected-access
         add_control_dependencies=add_control_dependencies)
     # Add external captures of body to the list of loop vars.
     # Note that external tensors will be treated as loop invariants, i.e.,
@@ -189,17 +181,15 @@ def while_loop(cond,
     # is_constant=True for inputs that are directly passed to outputs.
     body_graph.outputs.extend(body_graph.internal_captures)
 
-    # Capture `external_captures` of `body_graph` in `cond_graph` so that it
-    # expects to receive those as arguments.
-    # TODO(b/118457764): Dedup tensors that are captured in both the cond and
-    # body. This logic already exists in cond_v2.
+    # Capture the extra `external_captures` of `body_graph` in `cond_graph` so
+    # that it expects to receive those as arguments.
     with cond_graph.as_default():
-      for external_capture in body_graph.external_captures:
-        assert external_capture not in cond_graph.captures, (
-            "Looks like both cond and body are capturing the same tensor %s. "
-            "This is not supported yet. For now consider passing,"
-            " this as a loop variable." % str(external_capture))
-        cond_graph.capture(external_capture)
+      num_cond_captures = len(cond_graph.external_captures)
+      assert (cond_graph.external_captures ==
+              body_graph.external_captures[:num_cond_captures])
+      for body_capture in body_graph.external_captures[num_cond_captures:]:
+        assert body_capture not in cond_graph.captures
+        cond_graph.capture(body_capture)
 
     # Make sure that the shapes of the loop outputs are compatible with the
     # shape invariants, or the shapes of the loop vars if the invariants are not
@@ -218,6 +208,7 @@ def while_loop(cond,
         util.create_new_tf_function(cond_graph),
         util.create_new_tf_function(body_graph),
         output_shapes=[t.shape for t in body_graph.outputs],
+        parallel_iterations=parallel_iterations,
         name=scope)
 
     _copy_handle_data(body_graph.outputs, outputs)
@@ -249,38 +240,25 @@ def while_loop(cond,
 @ops.RegisterGradient("While")
 def _WhileGrad(op, *grads):  # pylint: disable=invalid-name
   """The gradient of a While op produced by while_loop."""
-  cond_graph = _get_graph(op, "cond")
-  body_graph = _get_graph(op, "body")
+  # Note that op is not always the same as while_op because the gradient tape,
+  # for eager mode compatibility, forgets information about the proper op. Since
+  # the loop cannot run in eager mode, however, we can safely introspect into
+  # the graph here.
+  while_op = op.outputs[0].op
+  cond_graph = _get_graph(while_op, "cond")
+  body_graph = _get_graph(while_op, "body")
   orig_num_params = len(body_graph.outputs)
 
   maximum_iterations = op.get_attr(
       "_maximum_iterations") if _is_in_xla_context() else None
+  parallel_iterations = op.get_attr("parallel_iterations")
   assert not _is_in_xla_context() or maximum_iterations is not None
+  maximum_iterations = _validate_and_convert_to_tensor(maximum_iterations)
 
-  # Set the incoming gradient of TensorArray handles to None. The gradient
-  # implementation currently assumes all resource tensors correspond to float32
-  # ResourceVariables, which can lead to runtime shape errors when used with a
-  # TensorArray. This is a workaround until TensorArrays are reimplemented with
-  # TensorLists instead of resources.
-  # Also set the incoming gradient of non-trainable inputs to None. It is
-  # possible that we receive non-None gradients for non-trainable types in
-  # nested while loops because we accumulate outputs of the inner while as
-  # variant tensors which are trainable and hence receive zeros_like tensors in
-  # the gradient pass. The non-trainable tensors then receive the popped zeros
-  # tensor from this zeros variant. The gradient for the loop vars corresponding
-  # to these tensors is None or zeros (this happens only if the loop var is
-  # accumulated as well) in _grad_fn so we reset these.
-  # TODO(b/118712257): Remove the IsTrainable filter once we can handle None
-  # output grads in _grad_fn.
-  grads = [
-      None if _is_tensor_array_handle(output) or not _is_trainable(output)
-      else grad for grad, output in zip(grads, body_graph.outputs)
-  ]
+  grads = [_preprocess_grad(grad, body_out, while_out)
+           for grad, body_out, while_out
+           in zip(grads, body_graph.outputs, while_op.outputs)]
 
-  # Ensure that all non-resource trainable outputs have incoming gradients.
-  assert all(g is not None or o.dtype == dtypes.resource or not _is_trainable(o)
-             for o, g in zip(body_graph.outputs, grads)
-            ), "All trainable loop vars must receive incoming gradients."
   # We compute the gradient for the sub-graph between trainable ys and xs
   # with non-None incoming gradients. We later pad the None's to the list of
   # outputs.
@@ -303,18 +281,23 @@ def _WhileGrad(op, *grads):  # pylint: disable=invalid-name
     new_inputs = body_grad_graph.empty_tensor_lists
     new_outputs = body_graph.outputs[orig_num_params:]
 
-    op._set_func_attr("cond", util.create_new_tf_function(cond_graph))
-    op._set_func_attr("body", util.create_new_tf_function(body_graph))
-    op._set_type_list_attr("T", body_graph.output_types)
-    op._set_shape_list_attr("output_shapes", body_graph.output_shapes)
-    op._add_while_inputs(new_inputs)
-    op._add_outputs([t.dtype for t in new_outputs],
-                    [t.shape for t in new_outputs])
+    while_op._set_func_attr("cond", util.create_new_tf_function(cond_graph))
+    while_op._set_func_attr("body", util.create_new_tf_function(body_graph))
+    while_op._set_type_list_attr("T", body_graph.output_types)
+    while_op._set_shape_list_attr("output_shapes", body_graph.output_shapes)
+    while_op._add_while_inputs(new_inputs)
+    while_op._add_outputs([t.dtype for t in new_outputs],
+                          [t.shape for t in new_outputs])
     _copy_handle_data(new_outputs, op.outputs[orig_num_params:])
 
-  captured_inputs = _resolve_grad_captures(body_graph, body_grad_graph, op)
+  captured_inputs = _resolve_grad_captures(body_graph, body_grad_graph,
+                                           while_op)
   loop_vars = args + captured_inputs
 
+  # This modifies body_grad_graph.
+  loop_vars = while_v2_indexed_slices_rewriter.rewrite_grad_indexed_slices(
+      grads, body_grad_graph, loop_vars, while_op.inputs)
+
   def grad_cond(counter, max_iters, *unused_args):
     return counter < max_iters
 
@@ -330,33 +313,65 @@ def _WhileGrad(op, *grads):  # pylint: disable=invalid-name
       util.create_new_tf_function(cond_grad_graph),
       util.create_new_tf_function(body_grad_graph),
       output_shapes=[t.shape for t in body_grad_graph.outputs],
-      name="%s_grad" % op.name)
+      parallel_iterations=parallel_iterations,
+      name="%s_grad" % while_op.name)
+  grad_op = outputs[0].op
 
   _copy_handle_data(body_grad_graph.outputs, outputs)
-  util.maybe_set_lowering_attr(outputs[0].op)
-  _maybe_set_maximum_iterations_attr(outputs[0].op, maximum_iterations)
+  util.maybe_set_lowering_attr(grad_op)
+  _maybe_set_maximum_iterations_attr(grad_op, maximum_iterations)
 
   # See comment in while_loop.
   outputs = [array_ops.identity(t) for t in outputs]
+  return _get_structured_grad_output(outputs, grads, body_grad_graph)
+
+
+def _preprocess_grad(grad, body_graph_output, while_op_output):
+  """Returns the initial gradient to be used for a given output tensor.
+
+  Args:
+    grad: the original gradient Tensor passed to the gradient function.
+    body_graph_output: the corresponding Tensor in the body graph.
+    while_op_output: the corresponding Tensor output of the While op.
+
+  Returns:
+    A Tensor or None.
+  """
+  # Set the incoming gradient of non-trainable inputs to None. It is possible
+  # that we receive non-None gradients for non-trainable types in nested while
+  # loops because we accumulate outputs of the inner while as variant tensors
+  # which are trainable and hence receive zeros_like tensors in the gradient
+  # pass. The non-trainable tensors then receive the popped zeros tensor from
+  # this zeros variant. The gradient for the loop vars corresponding to these
+  # tensors is None or zeros (this happens only if the loop var is accumulated
+  # as well) in _grad_fn so we reset these.
+  # TODO(b/118712257): Remove once we can handle None output grads in _grad_fn.
+  if not _is_trainable(body_graph_output):
+    return None
+
+  # GradientTape initializes resource and variant grads as None instead of
+  # zeros. Set to zeros so _GradientsHelper computes the gradients instead of
+  # returning None.
+  if (while_op_output.dtype in (dtypes.resource, dtypes.variant)
+      and grad is None):
+    return _zeros_like(while_op_output)
+
+  return grad
 
-  # Set None as the output gradient for tensors with None input gradient
-  # e.g. TensorArray handles.
-  # outputs[0] is the loop counter.
-  # outputs[1] is the total number of loop iterations.
-  index = 2
-  none_padded_outputs = []
-  for g in grads:
-    if g is None:
-      none_padded_outputs.append(None)
-    else:
-      none_padded_outputs.append(outputs[index])
-      index += 1
-  return none_padded_outputs
+
+# TODO(skyewm): make this return constants if op_output's shape is fully
+# defined (this can be done by checking the "shape" attr of resource vars).
+def _zeros_like(op_output):
+  """Like array_ops.zeros_like() but also accepts resource var handles."""
+  if op_output.dtype == dtypes.resource:
+    return array_ops.zeros(
+        gen_resource_variable_ops.variable_shape(op_output))
+  return array_ops.zeros_like(op_output)
 
 
 def _is_trainable(tensor):
   """Returns whether the given tensor is trainable."""
-  if not gradients_impl.IsTrainable(tensor):
+  if not gradients_util.IsTrainable(tensor):
     return False
 
   # Special case: untrainable accumulator output. The gradients algorithm
@@ -367,7 +382,7 @@ def _is_trainable(tensor):
   if tensor.op.type == "TensorListPopBack" and tensor.value_index == 0:
     assert tensor.dtype == dtypes.variant
     element_type = tensor.op.get_attr("element_dtype")
-    return gradients_impl.IsTrainable(element_type)
+    return gradients_util.IsTrainable(element_type)
 
   return True
 
@@ -387,28 +402,30 @@ def _validate_and_convert_to_tensor(maximum_iterations):
   Raises:
     ValueError: If `maximum_iterations` is invalid.
   """
-  if _is_in_xla_context():
-    if maximum_iterations is None:
-      raise ValueError("maximum_iterations is None. It is required and must "
-                       "be statically known (e.g. a constant value or known "
-                       "shape dimension) when building while_loop in XLA "
-                       "context.")
-    if isinstance(maximum_iterations, ops.Tensor):
-      # Get the constant value from the `maximum_iterations` tensor to avoid
-      # capturing a Const tensor from outside this graph.
-      maximum_iterations = tensor_util.constant_value(maximum_iterations)
-      if maximum_iterations is None:
-        raise ValueError("maximum_iterations must be statically known (e.g. a "
-                         "constant value or known shape dimension) when "
-                         "building while_loop in XLA context.")
-
-  if maximum_iterations is not None:
-    # EmptyTensorList expects `max_num_elements` to be of type int32.
-    maximum_iterations = ops.convert_to_tensor(
-        maximum_iterations, dtype=dtypes.int32, name="maximum_iterations")
-    if maximum_iterations.shape.ndims != 0:
-      raise ValueError("maximum_iterations must be a scalar, saw shape: %s" %
-                       maximum_iterations.shape)
+  if maximum_iterations is None:
+    return None
+
+  if _is_in_xla_context() and isinstance(maximum_iterations, ops.Tensor):
+    # Get the constant value from the `maximum_iterations` tensor to avoid
+    # capturing a Const tensor from outside this graph.
+    value = tensor_util.constant_value(maximum_iterations)
+    if value is None:
+      # XLA requires maximum_iterations to be statically known (e.g. a
+      # constant value or known shape dimension) when intermediate values
+      # from the forward pass are needed in the gradients pass. However,
+      # maximum_iterations may not be required if the gradient isn't built
+      # or no intermediates are required, thus we return the tensor as is.
+      return maximum_iterations
+
+    maximum_iterations = value
+
+  # EmptyTensorList expects `max_num_elements` to be of type int32.
+  maximum_iterations = ops.convert_to_tensor(
+      maximum_iterations, dtype=dtypes.int32, name="maximum_iterations")
+  if maximum_iterations.shape.ndims != 0:
+    raise ValueError("maximum_iterations must be a scalar, saw shape: %s" %
+                     maximum_iterations.shape)
+
   return maximum_iterations
 
 
@@ -479,14 +496,15 @@ def _create_grad_func(ys, xs, grads, cond_graph, body_graph, name, while_op,
   # Add the popped accumulators to the list of outputs.
   for internal_capture in grad_func_graph.internal_captures:
     if internal_capture in grad_func_graph.popped_tensor_lists:
-      grad_func_graph.outputs.append(
-          grad_func_graph.popped_tensor_lists[internal_capture])
+      new_output = grad_func_graph.popped_tensor_lists[internal_capture]
     elif internal_capture.dtype == dtypes.resource:
-      grad_func_graph.outputs.append(internal_capture)
+      new_output = internal_capture
     else:
       raise ValueError("Tensor %s is in list of internal_captures but is"
                        " neither a resource nor is in popped_tensor_lists." %
                        str(internal_capture))
+    grad_func_graph.outputs.append(new_output)
+    grad_func_graph.structured_outputs.append(new_output)
 
   return grad_func_graph, args
 
@@ -516,7 +534,7 @@ def _grad_fn(ys, xs, args, func_graph):
   # func_graph. The captured func_graph tensors are resolved to external tensors
   # after the forward While op has been rewritten in _resolve_grad_captures.
   # TODO(srbs): Mark GradientsHelper as public?
-  grad_outs = gradients_impl._GradientsHelper(
+  grad_outs = gradients_util._GradientsHelper(
       ys, xs, grad_ys=grad_ys, src_graph=func_graph,
       unconnected_gradients="zero")
 
@@ -569,6 +587,45 @@ def _resolve_grad_captures(body_graph, body_grad_graph, while_op):
   return new_capture_inputs
 
 
+def _get_structured_grad_output(outputs, grads, body_grad_graph):
+  """Returns the values that should be returned from the while grad function.
+
+  Args:
+    outputs: the raw Tensor outputs of the grad While op.
+    grads: the input gradients to the gradient function.
+    body_grad_graph: _WhileBodyGradFuncGraph.
+
+  Returns:
+    A list of gradient values. May include Nones.
+  """
+  result = []
+  # outputs[0] is the loop counter.
+  # outputs[1] is the total number of loop iterations.
+  outputs_idx = 2
+  structured_outputs_idx = 2
+  for g in grads:
+    # Set None as the output gradient for tensors with None input gradient.
+    if g is None:
+      result.append(None)
+      continue
+    output = body_grad_graph.structured_outputs[structured_outputs_idx]
+    structured_outputs_idx += 1
+    if isinstance(output, ops.IndexedSlices):
+      # TODO(skyewm): is there a more robust way to determine the order of
+      # flattened IndexedSlices components?
+      result.append(ops.IndexedSlices(
+          values=outputs[outputs_idx],
+          indices=outputs[outputs_idx + 1],
+          dense_shape=outputs[outputs_idx + 2]))
+      outputs_idx += 3
+    else:
+      assert isinstance(output, ops.Tensor)
+      result.append(outputs[outputs_idx])
+      outputs_idx += 1
+
+  return result
+
+
 def _get_accumulator(tensor):
   r"""Returns TensorList if any containing accumulated values of tensor.
 
@@ -710,9 +767,9 @@ class _WhileBodyGradFuncGraph(util.WhileBodyFuncGraph):
     """
     if (not whitelisted and tensor.graph is not self and
         tensor.graph != self._forward_graph):
-      raise ValueError("Attempting to capture tensor", str(tensor),
-                       " which is not in the forward graph but in ",
-                       _graph_name(tensor.graph), ".")
+      raise ValueError("Attempting to capture tensor %s which is not in the "
+                       "forward graph but in %s." %
+                       (str(tensor), _graph_name(tensor.graph)))
     return super(_WhileBodyGradFuncGraph, self).capture(tensor, name)
 
   def _capture_helper(self, tensor, name):
@@ -728,31 +785,9 @@ class _WhileBodyGradFuncGraph(util.WhileBodyFuncGraph):
     if captured_tensor is not None:
       return captured_tensor
 
+    # Resource tensors are not accumulated and handled specially.
     if tensor.dtype == dtypes.resource:
-      # Resource-type tensors are not accumulated.
-      # If a resource tensor exists in the loop body it must either be a loop
-      # input or an output of a nested While op inside the loop body which
-      # had captured the external resource.
-      if tensor in self._forward_graph.inputs:
-        index = self._forward_graph.inputs.index(tensor)
-      elif tensor.op.type == "While":
-        # Captured resources occur at the same index in the lists of inputs and
-        # outputs of a while op. So we lookup the input of `tensor.op` at the
-        # same index as the index of `tensor` in the `tensor.op.outputs`.
-        index = self._forward_graph.inputs.index(
-            tensor.op.inputs[tensor.value_index])
-      else:
-        raise ValueError(
-            "Taking gradient of a while loop which creates"
-            " a resource in its body is not supported: %s" % str(tensor))
-      # This must be a loop invariant.
-      assert self._forward_graph.inputs[index] == self._forward_graph.outputs[
-          index], "Resource tensors must be loop invariants %s." % str(
-              self._forward_graph._while.inputs[index])
-      tensor_in_outer_graph = self._forward_graph._while.inputs[index]
-      self._indirect_captures[tensor] = self.capture(
-          tensor_in_outer_graph, whitelisted=True)
-      return self._indirect_captures[tensor]
+      return self._resource_capture_helper(tensor)
 
     # Create or find an existing accumulator output for `tensor` in the forward
     # graph, and fetch from this accumulator in the gradient graph to get the
@@ -793,6 +828,111 @@ class _WhileBodyGradFuncGraph(util.WhileBodyFuncGraph):
     self.popped_tensor_lists[captured_accumulator] = new_tensor_list
     return captured_tensor
 
+  def _resource_capture_helper(self, tensor):
+    """Returns the captured resource tensor.
+
+    Resource-type tensors are not accumulated. If a resource tensor exists in
+    the loop body it must either be a loop input or an output of a nested While
+    op inside the loop body which had captured the external resource.
+
+    Args:
+      tensor: the external resource Tensor to be captured.
+
+    Returns:
+      Tensor in this graph.
+    """
+    assert tensor.dtype == dtypes.resource
+
+    index = self._resource_input_index(
+        tensor.name,
+        [t.name for t in self._forward_graph.inputs],
+        {op.name: op.node_def for op in self._forward_graph.get_operations()},
+        self._forward_graph._functions)
+
+    input_placeholder = self._forward_graph.inputs[index]
+    tensor_in_outer_graph = self._forward_graph._while.inputs[index]
+
+    assert input_placeholder.dtype == dtypes.resource
+    assert tensor_in_outer_graph.dtype == dtypes.resource
+    # This must be a loop invariant.
+    assert input_placeholder == self._forward_graph.outputs[index], (
+        "Resource tensors must be loop invariants %s." %
+        tensor_in_outer_graph)
+
+    self._indirect_captures[tensor] = self.capture(
+        tensor_in_outer_graph, whitelisted=True)
+    return self._indirect_captures[tensor]
+
+  def _resource_input_index(self, tensor_name, input_names, node_defs,
+                            functions):
+    """Returns the index of the input corresponding to `tensor_name`.
+
+    This method is used to find the corresponding index of an arbitrary resource
+    tensor in a function (the function could be a loop body). We assume that
+    resource handles are never created in functions, so that every resource
+    tensor can be traced back to a function input.
+
+    The awkward signature of this method is to make it work with both FuncGraphs
+    and FunctionDefs. This is so we can recurse on function call ops without
+    building the corresponding FuncGraph (note that even if a FuncGraph for a
+    FunctionDef already exists, the input/output/node names may have been
+    changed when the FuncGraph was serialized to the FunctionDef, which makes it
+    unusable with this algorithm).
+
+    Args:
+      tensor_name: the name of the resource tensor to be resolved to an input.
+      input_names: a list of the names of all inputs to the function.
+      node_defs: a dict mapping op name -> NodeDef for every op in the function.
+      functions: a dict mapping function name -> _EagerDefinedFunction.
+
+    Returns:
+      The index into input_names corresponding to `tensor_name`.
+    """
+    while tensor_name not in input_names:
+      # FunctionDefs and graphs use different tensor naming conventions.
+      parts = tensor_name.split(":")
+      if len(parts) == 3:
+        op_name, _, output_idx = parts
+      elif len(parts) == 2:
+        op_name, output_idx = parts
+      else:
+        assert len(parts) == 1
+        op_name = parts[0]
+        output_idx = 0
+      output_idx = int(output_idx)
+      node_def = node_defs[op_name]
+
+      if node_def.op == "While":
+        # Captured resources occur at the same index in the lists of inputs and
+        # outputs of a while op. So we lookup the input of `tensor.op` at the
+        # same index as the index of `tensor` in the `tensor.op.outputs`.
+        tensor_name = node_def.input[output_idx]
+      elif node_def.op in ("PartitionedCall", "StatefulPartitionedCall"):
+        # Functions output any captured resource tensors used by their
+        # gradients.  `tensor_name` is one of these outputs from a nested
+        # function call, so recursively find the corresponding input in the
+        # nested FunctionDef.
+        func_name = node_def.attr["f"].func.name
+        fdef = functions[func_name].definition
+        output_arg_name = fdef.signature.output_arg[output_idx].name
+        output_tensor_name = fdef.ret[output_arg_name]
+        input_index = self._resource_input_index(
+            output_tensor_name,
+            [arg.name for arg in fdef.signature.input_arg],
+            {ndef.name: ndef for ndef in fdef.node_def},
+            functions)
+        tensor_name = node_def.input[input_index]
+      else:
+        # We assume there are no other ops types that will "forward" resource
+        # handles like this, so all other handles must have been created by the
+        # op. (Note that cond_v2 wraps resource handle outputs in optionals,
+        # which we'll end up accumulating).
+        raise ValueError(
+            "Taking gradient of a while loop which creates "
+            "a resource in its body is not supported: %s" % op_name)
+
+    return input_names.index(tensor_name)
+
 
 def _check_shapes_compat(output_tensors, shape_invariants, input_tensors):
   for (t, shape, input_t) in zip(output_tensors, shape_invariants,
@@ -813,7 +953,7 @@ def _check_num_inputs_outputs(cond_graph, body_graph, num_flattened_loop_vars):
   assert len(cond_graph.outputs) == 1, (
       "cond_graph has %d outputs; Expected: 1" % len(cond_graph.outputs))
   assert len(body_graph.inputs) == num_flattened_loop_vars, (
-      "body_graph takes %d inputs; Expected: %d" % (len(cond_graph.inputs),
+      "body_graph takes %d inputs; Expected: %d" % (len(body_graph.inputs),
                                                     num_flattened_loop_vars))
   assert len(body_graph.outputs) == num_flattened_loop_vars, (
       "body_graph has %d outputs; Expected: %d" % (len(body_graph.outputs),
@@ -826,7 +966,7 @@ def _copy_handle_data(src_tensors, tgt_tensors):
 
 
 def _maybe_set_maximum_iterations_attr(op, maximum_iterations):
-  if control_flow_util.IsInXLAContext(op):
+  if maximum_iterations is not None and control_flow_util.IsInXLAContext(op):
     # Store the maximum_iterations to use in the gradient pass.
     op._set_attr(  # pylint: disable=protected-access
         "_maximum_iterations",
@@ -853,45 +993,12 @@ def _graph_name(graph):
   return "Base"
 
 
-def _is_tensor_array_handle(tensor):
-  """Returns whether tensor is a TensorArray handle."""
-  if tensor.dtype != dtypes.resource:
-    return False
-
-  if tensor.op.type == "While":
-    # We assume that any resource outputs of a While op correspond to a captured
-    # resource input (as opposed to a loop variable specified by the user).
-    # NOTE(skyewm): we could actually check this, but I can't think of when you
-    # would have a resource loop variable.
-    tensor = tensor.op.inputs[tensor.value_index]
-
-  # TODO(b/118452219): add test coverage for this.
-  tensor = func_graph_module.maybe_captured(tensor)
-
-  if isinstance(tensor, ops.EagerTensor):
-    # Eager execution doesn't quite support legacy tensorarray
-    return False
-
-  return tensor.op.type in TENSOR_ARRAY_HANDLE_OPS
-
-
 def _pack_sequence_as(structure_with_tas, loop_vars):
   """Like `nest.pack_sequence_as` but also replaces flows with TensorArrays."""
 
   def flow_to_tensor_array(flow, ta):  # pylint: disable=missing-docstring
-    if isinstance(ta, tensor_array_ops.TensorArray):
-      # pylint: disable=protected-access
-      new_ta = tensor_array_ops.TensorArray(
-          dtype=ta.dtype,
-          handle=ta.handle,
-          flow=flow,
-          infer_shape=ta._infer_shape,
-          colocate_with_first_write_call=ta._colocate_with_first_write_call)
-      new_ta._colocate_with = ta._colocate_with
-      new_ta._element_shape = ta._element_shape
-      # pylint: enable=protected-access
-      return new_ta
-    return flow
+    return (tensor_array_ops.build_ta_with_new_flow(ta, flow) if isinstance(  # pylint: disable=g-long-ternary
+        ta, tensor_array_ops.TensorArray) else flow)
 
   flattened_loop_vars = [
       flow_to_tensor_array(*z)
diff --git a/tensorflow/python/ops/while_v2_indexed_slices_rewriter.py b/tensorflow/python/ops/while_v2_indexed_slices_rewriter.py
new file mode 100644
index 0000000000000000000000000000000000000000..30e9709d703bc44885971f5e7fe49986ae2f5f2b
--- /dev/null
+++ b/tensorflow/python/ops/while_v2_indexed_slices_rewriter.py
@@ -0,0 +1,279 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Methods for rewriting while_v2 grad functions with IndexedSlices output."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import func_graph
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_resource_variable_ops
+from tensorflow.python.util import nest
+
+
+def rewrite_grad_indexed_slices(grads, body_grad_graph, loop_vars,
+                                forward_inputs):
+  """Handles special case of IndexedSlices returned from while gradient.
+
+  Some gradient functions return IndexedSlices instead of a Tensor (e.g. the
+  gradient of Gather ops). When this happens in the gradient of a while body,
+  the resulting gradient body function will have mismatched inputs and outputs,
+  since the input is a single Tensor, but the IndexedSlices gets unnested into
+  three output Tensors.
+
+  This function fixes this by rewriting the gradient body to have three inputs
+  to match the three outputs, i.e., it effectively converts the input Tensor
+  into an input IndexedSlices. It also returns new `loop_vars` to reflect the
+  new inputs.
+
+  Args:
+    grads: the input gradient Tensors to the while gradient computation.
+    body_grad_graph: _WhileBodyGradFuncGraph.
+    loop_vars: list of Tensors. The inputs to body_grad_graph.
+    forward_inputs: list of Tensors. The (flat) inputs to the forward-pass
+      While op.
+
+  Returns:
+    The new loop_vars to pass to body_grad_graph.
+  """
+  # Match up body_grad_graph.structured_outputs with the corresponding
+  # forward_inputs.
+  #
+  # Note that we don't expect a gradient computation to have structured output
+  # (e.g. no nested lists), so no need to flatten
+  # body_grad_graph.structured_outputs. However, structured_outputs may still
+  # contain composite tensors such as IndexedSlices, unlike
+  # body_grad_graph.outputs, which contains flattened composite tensors.
+  inputs_with_grads = [t for g, t in zip(grads, forward_inputs)
+                       if g is not None]
+  # Skip loop counter and total number of loop iterations.
+  structured_outputs = body_grad_graph.structured_outputs[2:]
+
+  for forward_input, output in zip(inputs_with_grads, structured_outputs):
+    if not isinstance(output, ops.IndexedSlices): continue
+
+    if forward_input.dtype == dtypes.resource:
+      # TODO(skyewm): In theory we should use this for all captured inputs, not
+      # just resource handles (which can only be captured). We can do this by
+      # checking that forward_input is passed straight through to its output.
+      loop_vars = _rewrite_input_as_indexed_slices(body_grad_graph, output,
+                                                   forward_input, loop_vars)
+    else:
+      _rewrite_output_as_tensor(body_grad_graph, output)
+
+  return loop_vars
+
+
+def _rewrite_output_as_tensor(body_grad_graph, grad_output_slices):
+  """Rewrites grad_output_slices to be a Tensor output.
+
+  Args:
+    body_grad_graph: _WhileBodyGradFuncGraph.
+    grad_output_slices: IndexedSlices output of body_grad_graph.
+  """
+  with body_grad_graph.as_default():
+    new_output = ops.convert_to_tensor_v2(grad_output_slices)
+
+  idx = body_grad_graph.structured_outputs.index(grad_output_slices)
+  body_grad_graph.structured_outputs[idx] = new_output
+  body_grad_graph.outputs = func_graph.flatten(
+      body_grad_graph.structured_outputs)
+
+
+def _rewrite_input_as_indexed_slices(body_grad_graph, grad_output_slices,
+                                     forward_input, loop_vars):
+  """Rewrites grad_output_slices's corresponding input to be an IndexedSlices.
+
+  This rewrite requires that forward_input was captured in the forward loop,
+  i.e. is not a user-specified loop variable. This is important because the
+  rewrite assumes that forward_input is passed through to its corresponding
+  output unchanged. This assumption is used in _rewrite_input_as_indexed_slices,
+  which depends on the exact gradient structure produced by the input's fanout.
+
+  This can yield a more efficient computation than using
+  _rewrite_output_as_tensor, since it preserves the IndexedSlices structure
+  instead of converting the IndexedSlices to a dense Tensor.
+
+  Args:
+    body_grad_graph: _WhileBodyGradFuncGraph.
+    grad_output_slices: IndexedSlices output of body_grad_graph.
+    forward_input: the corresonding Tensor input to the forward loop.
+    loop_vars: list of Tensors. The inputs to body_grad_graph.
+
+  Returns:
+    The new loop_vars to pass to body_grad_graph.
+  """
+  # Create initial IndexedSlices that will be the input to the grad While
+  # op. This will start as zeros, and accumulate the IndexedSlices grad output.
+  # Note that because forward_input is captured and not a loop var, its incoming
+  # gradient should always be zero.
+  init_slices = _create_grad_indexed_slices_init(grad_output_slices,
+                                                 forward_input)
+
+  # Create a new version of grad_output_slices's gradient computation that uses
+  # the new IndexedSlices input instead of the original Tensor input. We'll
+  # return the new computation and leave the old computation as dead code.
+  # TODO(skyewm): considering pruning body_grad_graph to remove the old
+  # computation.
+  with body_grad_graph.as_default():
+    input_slices = ops.IndexedSlices(
+        values=body_grad_graph.capture(init_slices.values, whitelisted=True),
+        indices=body_grad_graph.capture(init_slices.indices, whitelisted=True),
+        dense_shape=body_grad_graph.capture(init_slices.dense_shape,
+                                            whitelisted=True))
+
+    # Remove the captured tensors from the function inputs. We'll add them back
+    # at the correct index in _update_indexed_slices_param.
+    for t in _flatten(init_slices):
+      captured_t = body_grad_graph.captures.pop(t)
+      body_grad_graph.inputs.remove(captured_t)
+
+    new_output_slices = _rewrite_grad_indexed_slices_output(grad_output_slices,
+                                                            input_slices)
+
+  # Update body_grad_graph's inputs and outputs to reflect the new
+  # IndexedSlices computation.
+  return _update_indexed_slices_param(
+      body_grad_graph, loop_vars, init_slices, input_slices, new_output_slices,
+      grad_output_slices)
+
+
+def _create_grad_indexed_slices_init(grad_output_slices, forward_input):
+  """Creates an IndexedSlices to pass as input to the while grad function.
+
+  Args:
+    grad_output_slices: IndexedSlices. The corresponding while grad function
+      output.
+    forward_input: Tensor. The corresonding input to the forward while op.
+
+  Returns:
+    Zeros IndexedSlices, created in current Graph.
+  """
+  assert isinstance(grad_output_slices, ops.IndexedSlices)
+  assert isinstance(forward_input, ops.Tensor)
+  values_out = grad_output_slices.values
+  indices_out = grad_output_slices.indices
+
+  # Create the initial values tensor.
+  if values_out.shape.is_fully_defined():
+    values_shape = tensor_shape.TensorShape([0] +
+                                            values_out.shape.as_list()[1:])
+    values = array_ops.zeros(values_shape, dtype=values_out.dtype,
+                             name="values_init")
+  else:
+    if forward_input.dtype == dtypes.resource:
+      forward_shape = gen_resource_variable_ops.variable_shape(forward_input)
+    else:
+      forward_shape = array_ops.shape(forward_input)
+    values_shape = array_ops.concat([[0], forward_shape[1:]], 0)
+    values = array_ops.zeros(values_shape, dtype=values_out.dtype,
+                             name="values_init")
+
+  # Create the initial indices tensor.
+  indices = constant_op.constant([], indices_out.dtype, name="indices_init")
+
+  # Create the initial dense_shape tensor. We assume is the same shape as
+  # forward_input, since captured tensors don't change shape across loop
+  # iterations.
+  if forward_input.dtype == dtypes.resource:
+    shape = gen_resource_variable_ops.variable_shape(forward_input,
+                                                     name="shape_init")
+  else:
+    shape = array_ops.shape(forward_input, name="shape_init")
+
+  return ops.IndexedSlices(values=values, indices=indices, dense_shape=shape)
+
+
+def _rewrite_grad_indexed_slices_output(old_output_slices, new_input_slices):
+  """Creates a new verson of old_output_slices with new_input_slices as input.
+
+  This method assumes that old_output_slices.{values,indices} are produced by
+  concatenating the incoming gradient Tensor input with the IndexedSlices
+  produced by the gradient computation of the while body. See
+  gradients_impl._AggregateIndexedSlicesGradients for where these concats are
+  constructed. We build new concats that use new_input_slices instead of the
+  original Tensor input.
+
+  Args:
+    old_output_slices: original IndexedSlices output of while gradient.
+    new_input_slices: new IndexedSlices to use as input to while gradient.
+
+  Returns:
+    A new IndexedSlices to replace old_output_slices.
+  """
+
+  def rewrite(old_output, new_input):
+    assert old_output.type == "Identity"
+    concat_op = old_output.inputs[0].op
+    assert concat_op.type == "ConcatV2"
+    # Don't include axis arg
+    old_concat_args = concat_op.inputs[:-1]
+    # We assume that the original gradient input was the first argument to the
+    # concat op.
+    # TODO(skyewm): do this in a more robust way.
+    return array_ops.concat([new_input] + old_concat_args[1:], 0)
+
+  values = rewrite(old_output_slices.values.op, new_input_slices.values)
+  indices = rewrite(old_output_slices.indices.op, new_input_slices.indices)
+  return ops.IndexedSlices(values=values, indices=indices,
+                           dense_shape=new_input_slices.dense_shape)
+
+
+def _update_indexed_slices_param(graph, loop_vars, init_slices, input_slices,
+                                 output_slices, old_output_slices):
+  """Updates graph with new IndexedSlices input/output.
+
+  Updates graph's metadata to output the gradient computation defined by
+  init_slices, input_slices, and output_slices, instead of outputting
+  old_output_slices. Also returns a new version of loop_vars with init_slices
+  replacing the old input.
+
+  Args:
+    graph: _WhileBodyGradFuncGraph.
+    loop_vars: the inputs to graph.
+    init_slices: the new IndexedSlices to use as input to graph.
+    input_slices: the new IndexedSlices in graph that should be fed by
+      init_slices.
+    output_slices: the new IndexedSlices in graph that should be the
+      corresonding output to input_slices.
+    old_output_slices: the IndexedSlices in graph that are currently
+      being output.
+
+  Returns:
+    New loop_vars to pass to graph.
+  """
+  structured_idx = graph.structured_outputs.index(old_output_slices)
+  # We assume that the component tensors of old_output_slices appear
+  # sequentially in graph.outputs. We use the first of these tensors
+  # as the reference index.
+  flat_idx = graph.outputs.index(func_graph.flatten(old_output_slices)[0])
+
+  graph.structured_outputs[structured_idx] = output_slices
+  graph.outputs = func_graph.flatten(
+      graph.structured_outputs)
+
+  graph.inputs = (graph.inputs[:flat_idx] + _flatten(input_slices) +
+                  graph.inputs[flat_idx + 1:])
+
+  return loop_vars[:flat_idx] + _flatten(init_slices) + loop_vars[flat_idx + 1:]
+
+
+def _flatten(arg):
+  return nest.flatten(arg, expand_composites=True)
diff --git a/tensorflow/python/platform/app.py b/tensorflow/python/platform/app.py
index 7b917235c0a73421552b7aebaa3192de969e5f3a..303b70ff57e4eba5d1338e4ea30fbe5a0c8b652e 100644
--- a/tensorflow/python/platform/app.py
+++ b/tensorflow/python/platform/app.py
@@ -18,109 +18,23 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import errno as _errno
 import sys as _sys
 
+from absl.app import run as _run
+
 from tensorflow.python.platform import flags
 from tensorflow.python.util.tf_export import tf_export
 
 
-def _usage(shorthelp):
-  """Writes __main__'s docstring to stdout with some help text.
-
-  Args:
-    shorthelp: bool, if True, prints only flags from the main module,
-        rather than all flags.
-  """
-  doc = _sys.modules['__main__'].__doc__
-  if not doc:
-    doc = '\nUSAGE: %s [flags]\n' % _sys.argv[0]
-    doc = flags.text_wrap(doc, indent='       ', firstline_indent='')
-  else:
-    # Replace all '%s' with sys.argv[0], and all '%%' with '%'.
-    num_specifiers = doc.count('%') - 2 * doc.count('%%')
-    try:
-      doc %= (_sys.argv[0],) * num_specifiers
-    except (OverflowError, TypeError, ValueError):
-      # Just display the docstring as-is.
-      pass
-  if shorthelp:
-    flag_str = flags.FLAGS.main_module_help()
-  else:
-    flag_str = str(flags.FLAGS)
-  try:
-    _sys.stdout.write(doc)
-    if flag_str:
-      _sys.stdout.write('\nflags:\n')
-      _sys.stdout.write(flag_str)
-    _sys.stdout.write('\n')
-  except IOError as e:
-    # We avoid printing a huge backtrace if we get EPIPE, because
-    # "foo.par --help | less" is a frequent use case.
-    if e.errno != _errno.EPIPE:
-      raise
-
-
-class _HelpFlag(flags.BooleanFlag):
-  """Special boolean flag that displays usage and raises SystemExit."""
-  NAME = 'help'
-  SHORT_NAME = 'h'
-
-  def __init__(self):
-    super(_HelpFlag, self).__init__(
-        self.NAME, False, 'show this help', short_name=self.SHORT_NAME)
-
-  def parse(self, arg):
-    if arg:
-      _usage(shorthelp=True)
-      print()
-      print('Try --helpfull to get a list of all flags.')
-      _sys.exit(1)
-
-
-class _HelpshortFlag(_HelpFlag):
-  """--helpshort is an alias for --help."""
-  NAME = 'helpshort'
-  SHORT_NAME = None
-
-
-class _HelpfullFlag(flags.BooleanFlag):
-  """Display help for flags in main module and all dependent modules."""
-
-  def __init__(self):
-    super(_HelpfullFlag, self).__init__('helpfull', False, 'show full help')
-
-  def parse(self, arg):
-    if arg:
-      _usage(shorthelp=False)
-      _sys.exit(1)
-
-
-_define_help_flags_called = False
-
-
-def _define_help_flags():
-  global _define_help_flags_called
-  if not _define_help_flags_called:
-    flags.DEFINE_flag(_HelpFlag())
-    flags.DEFINE_flag(_HelpfullFlag())
-    flags.DEFINE_flag(_HelpshortFlag())
-    _define_help_flags_called = True
+def _parse_flags_tolerate_undef(argv):
+  """Parse args, returning any unknown flags (ABSL defaults to crashing)."""
+  return flags.FLAGS(_sys.argv if argv is None else argv, known_only=True)
 
 
 @tf_export(v1=['app.run'])
 def run(main=None, argv=None):
   """Runs the program with an optional 'main' function and 'argv' list."""
 
-  # Define help flags.
-  _define_help_flags()
-
-  # Parse known flags.
-  argv = flags.FLAGS(_sys.argv if argv is None else argv, known_only=True)
-
   main = main or _sys.modules['__main__'].main
 
-  # Call the main function, passing through any arguments
-  # to the final program.
-  _sys.exit(main(argv))
-
+  _run(main=main, argv=argv, flags_parser=_parse_flags_tolerate_undef)
diff --git a/tensorflow/python/platform/gfile.py b/tensorflow/python/platform/gfile.py
index d0159e9e9816ba730c843d2b46936b142d47ff79..dd2c615e9e0ca193b68c4242cb64163bc9266762 100644
--- a/tensorflow/python/platform/gfile.py
+++ b/tensorflow/python/platform/gfile.py
@@ -37,7 +37,7 @@ from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export(v1=['gfile.GFile', 'gfile.Open'], v2=['io.gfile.GFile'])
+@tf_export('io.gfile.GFile', v1=['gfile.GFile', 'gfile.Open', 'io.gfile.GFile'])
 class GFile(_FileIO):
   """File I/O wrappers without thread locking.
 
diff --git a/tensorflow/python/platform/googletest.py b/tensorflow/python/platform/googletest.py
index 5b20e36a693b2ae283ffe4cefa2210c0cb61dcfc..802721e34b04d87fc095f6d6900dd2d99b14faef 100644
--- a/tensorflow/python/platform/googletest.py
+++ b/tensorflow/python/platform/googletest.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Imports unittest as a replacement for testing.pybase.googletest."""
+"""Imports absltest as a replacement for testing.pybase.googletest."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -26,7 +26,7 @@ import tempfile
 
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
-from unittest import *
+from absl.testing.absltest import *
 # pylint: enable=wildcard-import
 
 from tensorflow.python.framework import errors
@@ -41,7 +41,7 @@ from tensorflow.python.util.tf_export import tf_export
 
 Benchmark = benchmark.TensorFlowBenchmark  # pylint: disable=invalid-name
 
-unittest_main = main
+absltest_main = main
 
 # We keep a global variable in this module to make sure we create the temporary
 # directory only once per test binary invocation.
@@ -51,7 +51,7 @@ _googletest_temp_dir = ''
 # pylint: disable=invalid-name
 # pylint: disable=undefined-variable
 def g_main(argv):
-  """Delegate to unittest.main after redefining testLoader."""
+  """Delegate to absltest.main after redefining testLoader."""
   if 'TEST_SHARD_STATUS_FILE' in os.environ:
     try:
       f = None
@@ -67,7 +67,7 @@ def g_main(argv):
 
   if ('TEST_TOTAL_SHARDS' not in os.environ or
       'TEST_SHARD_INDEX' not in os.environ):
-    return unittest_main(argv=argv)
+    return absltest_main(argv=argv)
 
   total_shards = int(os.environ['TEST_TOTAL_SHARDS'])
   shard_index = int(os.environ['TEST_SHARD_INDEX'])
@@ -87,7 +87,7 @@ def g_main(argv):
   # Override getTestCaseNames
   base_loader.getTestCaseNames = getShardedTestCaseNames
 
-  unittest_main(argv=argv, testLoader=base_loader)
+  absltest_main(argv=argv, testLoader=base_loader)
 
 
 # Redefine main to allow running benchmarks
@@ -112,6 +112,9 @@ def GetTempDir():
                               os.path.basename(tf_inspect.getfile(first_frame)))
       temp_dir = tempfile.mkdtemp(prefix=temp_dir.rstrip('.py'))
 
+    # Make sure we have the correct path separators.
+    temp_dir = temp_dir.replace('/', os.sep)
+
     def delete_temp_dir(dirname=temp_dir):
       try:
         file_io.delete_recursively(dirname)
@@ -119,6 +122,7 @@ def GetTempDir():
         logging.error('Error removing %s: %s', dirname, e)
 
     atexit.register(delete_temp_dir)
+
     _googletest_temp_dir = temp_dir
 
   return _googletest_temp_dir
diff --git a/tensorflow/python/profiler/BUILD b/tensorflow/python/profiler/BUILD
index 0654104a3436366bb5fe88e2c3415cc957cbfde8..fcab57c12c95cd18fd5e32279a1a42b296a4d130 100644
--- a/tensorflow/python/profiler/BUILD
+++ b/tensorflow/python/profiler/BUILD
@@ -61,6 +61,7 @@ cuda_py_test(
         "no_pip",
         "oss_serial",
     ],
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
 
 cuda_py_test(
@@ -76,6 +77,7 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     tags = ["no_pip"],
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
 
 py_library(
@@ -130,6 +132,7 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     tags = ["no_pip"],
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
 
 py_library(
diff --git a/tensorflow/python/profiler/internal/BUILD b/tensorflow/python/profiler/internal/BUILD
index 994206cd63a915de93bc109e7b217ad997c787a7..0a6ba12094b5e2d4374acbe0d23e8355c3b309c2 100644
--- a/tensorflow/python/profiler/internal/BUILD
+++ b/tensorflow/python/profiler/internal/BUILD
@@ -69,4 +69,5 @@ cuda_py_test(
     tags = [
         "no_pip",
     ],
+    xla_enable_strict_auto_jit = False,  # Node names are different with autojit
 )
diff --git a/tensorflow/python/profiler/internal/run_metadata_test.py b/tensorflow/python/profiler/internal/run_metadata_test.py
index f96d721f46e162ee6753377569aacb439cd591d5..9e92a8f5f3670cb30d910456789b6d186f26a66b 100644
--- a/tensorflow/python/profiler/internal/run_metadata_test.py
+++ b/tensorflow/python/profiler/internal/run_metadata_test.py
@@ -50,7 +50,7 @@ def _extract_node(run_meta, node_name):
       dev = dev[dev.find('cpu:'):]
     elif dev.find('gpu:') > 0:
       dev = dev[dev.find('gpu:'):]
-    else:
+    elif '/host:cpu' not in dev:
       assert False, 'Unrecognized device name: %s' % dev
 
     for node_stat in dev_stat.node_stats:
diff --git a/tensorflow/python/pywrap_tfe.i b/tensorflow/python/pywrap_tfe.i
index adbce95c6f9f54909bbca2fdd3e31142bb2e6bc9..fe7a41afb421015a0b57335989a5aad441fbf91d 100755
--- a/tensorflow/python/pywrap_tfe.i
+++ b/tensorflow/python/pywrap_tfe.i
@@ -22,8 +22,11 @@ limitations under the License.
 %rename("%s") TFE_ContextListDevices;
 %rename("%s") TFE_ContextAddFunction;
 %rename("%s") TFE_ContextAddFunctionDef;
+%rename("%s") TFE_ContextHasFunction;
 %rename("%s") TFE_ContextEnableRunMetadata;
 %rename("%s") TFE_ContextDisableRunMetadata;
+%rename("%s") TFE_ContextEnableGraphCollection;
+%rename("%s") TFE_ContextDisableGraphCollection;
 %rename("%s") TFE_ContextExportRunMetadata;
 %rename("%s") TFE_ContextClearCaches;
 %rename("%s") TFE_ContextGetDevicePlacementPolicy;
@@ -32,6 +35,15 @@ limitations under the License.
 %rename("%s") TFE_ContextSetServerDef;
 %rename("%s") TFE_ContextAsyncWait;
 %rename("%s") TFE_ContextAsyncClearError;
+%rename("%s") TFE_NewProfiler;
+%rename("%s") TFE_ProfilerIsOk;
+%rename("%s") TFE_DeleteProfiler;
+%rename("%s") TFE_ProfilerSerializeToString;
+%rename("%s") TFE_NewProfilerContext;
+%rename("%s") TFE_ProfilerContextSetEagerContext;
+%rename("%s") TFE_DeleteProfilerContext;
+%rename("%s") TFE_StartProfilerServer;
+%rename("%s") TFE_ProfilerClientStartTracing;
 %rename("%s") TFE_OpNameGetAttrType;
 %rename("%s") TFE_Py_InitEagerTensor;
 %rename("%s") TFE_Py_SetEagerTensorProfiler;
@@ -64,13 +76,18 @@ limitations under the License.
 %rename("%s") TFE_DeleteContextOptions;
 %rename("%s") TFE_Py_TensorShapeSlice;
 %rename("%s") TFE_Py_TensorShapeOnDevice;
+%rename("%s") TFE_Py_EnableInteractivePythonLogging;
 %rename("%s") TFE_ContextStartStep;
 %rename("%s") TFE_ContextEndStep;
 %rename("%s") TFE_Py_RegisterVSpace;
 %rename("%s") TFE_Py_EncodeArg;
+%rename("%s") TFE_EnableCollectiveOps;
+%rename("%s") TF_PickUnusedPortOrDie;
 
 %{
 #include "tensorflow/python/eager/pywrap_tfe.h"
+#include "tensorflow/c/c_api_experimental.h"
+#include "tensorflow/c/eager/c_api_experimental.h"
 %}
 
 %typemap(in) (const void* proto) {
@@ -133,6 +150,34 @@ limitations under the License.
   $1 = const_cast<char*>(TFE_GetPythonString($input));
 }
 
+// For const parameters in a function, SWIG pretty much ignores the const.
+// See: http://www.swig.org/Doc2.0/SWIG.html#SWIG_nn13
+// Hence the 'const_cast'.
+%typemap(in) const char* name {
+  $1 = const_cast<char*>(TFE_GetPythonString($input));
+}
+
+// For const parameters in a function, SWIG pretty much ignores the const.
+// See: http://www.swig.org/Doc2.0/SWIG.html#SWIG_nn13
+// Hence the 'const_cast'.
+%typemap(in) const char* service_addr {
+  $1 = const_cast<char*>(TFE_GetPythonString($input));
+}
+
+// For const parameters in a function, SWIG pretty much ignores the const.
+// See: http://www.swig.org/Doc2.0/SWIG.html#SWIG_nn13
+// Hence the 'const_cast'.
+%typemap(in) const char* logdir {
+  $1 = const_cast<char*>(TFE_GetPythonString($input));
+}
+
+// For const parameters in a function, SWIG pretty much ignores the const.
+// See: http://www.swig.org/Doc2.0/SWIG.html#SWIG_nn13
+// Hence the 'const_cast'.
+%typemap(in) const char* worker_list {
+  $1 = const_cast<char*>(TFE_GetPythonString($input));
+}
+
 %typemap(in) (TFE_Context*) {
   $1 = (TFE_Context*)PyCapsule_GetPointer($input, nullptr);
 
@@ -169,6 +214,25 @@ limitations under the License.
       }
       if (EagerTensor_CheckExact(elem)) {
         (*$1)[i] = EagerTensor_Handle(elem);
+      } else if (tensorflow::swig::IsTensor(elem)) {
+        // If it isnt an EagerTensor, but is still a Tensor, it must be a graph
+        // tensor.
+        SWIG_exception_fail(
+            SWIG_TypeError,
+            tensorflow::strings::StrCat(
+                "An op outside of the function building code is being passed\n"
+                "a \"Graph\" tensor. It is possible to have Graph tensors\n"
+                "leak out of the function building context by including a\n"
+                "tf.init_scope in your function building code.\n"
+                "For example, the following function will fail:\n",
+                "  @tf.function\n",
+                "  def has_init_scope():\n",
+                "    my_constant = tf.constant(1.)\n",
+                "    with tf.init_scope():\n",
+                "      added = my_constant * 2\n",
+                "The graph tensor has name: ",
+                TFE_GetPythonString(PyObject_GetAttrString(elem, "name")))
+                .c_str());
       } else {
         SWIG_exception_fail(
             SWIG_TypeError,
@@ -226,6 +290,8 @@ limitations under the License.
 %native(TFE_Py_FastPathExecute) TFE_Py_FastPathExecute_C;
 
 %include "tensorflow/python/eager/pywrap_tfe.h"
+%include "tensorflow/c/c_api_experimental.h"
+%include "tensorflow/c/eager/c_api_experimental.h"
 
 // Clear all typemaps.
 %typemap(out) TF_DataType;
diff --git a/tensorflow/python/saved_model/BUILD b/tensorflow/python/saved_model/BUILD
index 53d0640542f257bff707047cd405a0dad5055449..01d4818879f3e1d8d64b8393b15103214e2b581e 100644
--- a/tensorflow/python/saved_model/BUILD
+++ b/tensorflow/python/saved_model/BUILD
@@ -11,7 +11,7 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_additional_all_protos")
 
@@ -71,7 +71,7 @@ py_library(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:lib",
         "//tensorflow/python:platform",
-        "//tensorflow/python:training",
+        "//tensorflow/python:saver",
         "//tensorflow/python:util",
         "//tensorflow/python:variables",
     ],
@@ -98,17 +98,16 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "loader_test",
     size = "small",
     srcs = ["loader_test.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//visibility:private"],
-    deps = [
+    additional_deps = [
         ":builder",
         ":loader",
         ":signature_def_utils",
         ":utils",
+        "@absl_py//absl/testing:parameterized",
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:control_flow_ops",
@@ -118,7 +117,6 @@ py_test(
         "//tensorflow/python:state_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
-        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -155,15 +153,11 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "saved_model_test",
     size = "small",
     srcs = ["saved_model_test.py"],
-    data = ["//tensorflow/cc/saved_model:saved_model_half_plus_two"],
-    srcs_version = "PY2AND3",
-    tags = ["no_windows"],
-    visibility = ["//visibility:private"],
-    deps = [
+    additional_deps = [
         ":builder",
         ":constants",
         ":loader",
@@ -186,6 +180,8 @@ py_test(
         "//tensorflow/python:util",
         "//tensorflow/python:variables",
     ],
+    data = ["//tensorflow/cc/saved_model:saved_model_half_plus_two"],
+    tags = ["no_windows"],
 )
 
 py_library(
@@ -205,13 +201,11 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "utils_test",
     size = "small",
     srcs = ["utils_test.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//visibility:private"],
-    deps = [
+    additional_deps = [
         ":utils",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
@@ -237,13 +231,11 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "signature_def_utils_test",
     size = "small",
     srcs = ["signature_def_utils_test.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//visibility:private"],
-    deps = [
+    additional_deps = [
         ":signature_constants",
         ":signature_def_utils",
         ":utils",
@@ -254,12 +246,11 @@ py_test(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "simple_save_test",
     size = "small",
     srcs = ["simple_save_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":loader",
         ":signature_constants",
         ":simple_save",
@@ -270,12 +261,22 @@ py_test(
     ],
 )
 
-tf_proto_library(
-    name = "saved_object_graph",
-    srcs = ["saved_object_graph.proto"],
-    cc_api_version = 2,
-    protodeps = tf_additional_all_protos(),
-    visibility = ["//tensorflow:internal"],
+py_library(
+    name = "signature_serialization",
+    srcs = [
+        "signature_serialization.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":revived_types",
+        ":signature_constants",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:tensor_spec",
+        "//tensorflow/python:util",
+        "//tensorflow/python/eager:def_function",
+        "//tensorflow/python/eager:function",
+        "//tensorflow/python/training/tracking:base",
+    ],
 )
 
 py_library(
@@ -287,14 +288,19 @@ py_library(
     deps = [
         ":builder",
         ":constants",
-        ":loader",
-        ":saved_object_graph_py",
+        ":function_serialization",
+        ":nested_structure_coder",
+        ":revived_types",
         ":signature_constants",
         ":signature_def_utils",
+        ":signature_serialization",
         ":tag_constants",
         ":utils",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:framework",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:lib",
@@ -303,23 +309,26 @@ py_library(
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:def_function",
         "//tensorflow/python/eager:function",
-        "//tensorflow/python/training/checkpointable:base",
-        "//tensorflow/python/training/checkpointable:util",
+        "//tensorflow/python/training/saving:functional_saver",
+        "//tensorflow/python/training/tracking",
+        "//tensorflow/python/training/tracking:base",
+        "//tensorflow/python/training/tracking:graph_view",
+        "//tensorflow/python/training/tracking:object_identity",
+        "//tensorflow/python/training/tracking:util",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "save_test",
     srcs = ["save_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":loader",
         ":save",
         ":signature_constants",
         ":tag_constants",
+        "@absl_py//absl/testing:parameterized",
         "//tensorflow/python/eager:def_function",
         "//tensorflow/python/eager:test",
-        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -330,25 +339,152 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":constants",
+        ":function_deserialization",
+        ":load_v1_in_v2",
         ":loader",
-        ":saved_object_graph_py",
+        ":nested_structure_coder",
+        ":revived_types",
+        ":utils",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:init_ops",
         "//tensorflow/python:lib",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:tensor_util",
         "//tensorflow/python:util",
-        "//tensorflow/python/training/checkpointable:tracking",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/training/tracking",
+        "//tensorflow/python/training/tracking:base",
+        "//tensorflow/python/training/tracking:graph_view",
+        "//tensorflow/python/training/tracking:util",
     ],
 )
 
-py_test(
-    name = "load_test",
-    srcs = ["load_test.py"],
+py_library(
+    name = "load_v1_in_v2",
+    srcs = [
+        "load_v1_in_v2.py",
+    ],
     srcs_version = "PY2AND3",
     deps = [
+        ":loader",
+        ":signature_serialization",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:saver",
+        "//tensorflow/python/eager:wrap_function",
+        "//tensorflow/python/training/tracking",
+    ],
+)
+
+tf_py_test(
+    name = "load_test",
+    srcs = ["load_test.py"],
+    additional_deps = [
         ":load",
         ":save",
+        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
+        "//tensorflow/python:lib",
         "//tensorflow/python:tensor_spec",
         "//tensorflow/python/eager:def_function",
         "//tensorflow/python/eager:test",
-        "//tensorflow/python/training/checkpointable:tracking",
+        "//tensorflow/python/training/tracking:tracking",
+    ],
+)
+
+tf_py_test(
+    name = "load_v1_in_v2_test",
+    srcs = ["load_v1_in_v2_test.py"],
+    additional_deps = [
+        ":builder",
+        ":load",
+        ":save",
+        ":signature_def_utils",
+        ":simple_save",
+        ":utils",
+        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:lib",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python:tensor_spec",
+        "//tensorflow/python/eager:def_function",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python/training/tracking:tracking",
+        "//tensorflow/python:variables",
+    ],
+)
+
+py_library(
+    name = "revived_types",
+    srcs = [
+        "revived_types.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/core:protos_all_py",
+    ],
+)
+
+tf_py_test(
+    name = "revived_types_test",
+    srcs = ["revived_types_test.py"],
+    additional_deps = [
+        ":revived_types",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
+py_library(
+    name = "function_serialization",
+    srcs = [
+        "function_serialization.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":nested_structure_coder",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python/eager:def_function",
+        "//tensorflow/python/eager:function",
+    ],
+)
+
+py_library(
+    name = "function_deserialization",
+    srcs = [
+        "function_deserialization.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":nested_structure_coder",
+        "//tensorflow/python/eager:def_function",
+    ],
+)
+
+py_library(
+    name = "nested_structure_coder",
+    srcs = ["nested_structure_coder.py"],
+    deps = [
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:framework",
+        "@six_archive//:six",
+    ],
+)
+
+tf_py_test(
+    name = "nested_structure_coder_test",
+    srcs = ["nested_structure_coder_test.py"],
+    additional_deps = [
+        ":nested_structure_coder",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:framework",
+        "//tensorflow/python/eager:test",
     ],
 )
diff --git a/tensorflow/python/saved_model/builder_impl.py b/tensorflow/python/saved_model/builder_impl.py
index f37d283a2a2cbb50faf62f1ae24cd69bd0f29d74..37af428dcb97d77f85e0555edcc1ca959a479943 100644
--- a/tensorflow/python/saved_model/builder_impl.py
+++ b/tensorflow/python/saved_model/builder_impl.py
@@ -95,11 +95,13 @@ class _SavedModelBuilder(object):
 
     self._export_dir = export_dir
     if file_io.file_exists(export_dir):
-      raise AssertionError(
-          "Export directory already exists. Please specify a different export "
-          "directory: %s" % export_dir)
-
-    file_io.recursive_create_dir(self._export_dir)
+      if file_io.list_directory(export_dir):
+        raise AssertionError(
+            "Export directory already exists, and isn't empty. Please choose "
+            "a different export directory, or delete all the contents of the "
+            "specified directory: %s" % export_dir)
+    else:
+      file_io.recursive_create_dir(self._export_dir)
 
     # Boolean to track whether variables and assets corresponding to the
     # SavedModel have been saved. Specifically, the first meta graph to be added
diff --git a/tensorflow/python/saved_model/function_deserialization.py b/tensorflow/python/saved_model/function_deserialization.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1ad37180e334b6d1d8719ba4b4c0819bfa8ba02
--- /dev/null
+++ b/tensorflow/python/saved_model/function_deserialization.py
@@ -0,0 +1,372 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tools for deserializing `Function`s."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import re
+
+from tensorflow.core.framework import function_pb2
+from tensorflow.python.eager import def_function
+from tensorflow.python.eager import function as function_lib
+from tensorflow.python.framework import func_graph as func_graph_lib
+from tensorflow.python.framework import function_def_to_graph as function_def_lib
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_spec
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.saved_model import nested_structure_coder
+from tensorflow.python.util import compat
+from tensorflow.python.util import nest
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_inspect
+
+
+def _is_tensor(t):
+  return isinstance(t, (ops.Tensor, resource_variable_ops.ResourceVariable))
+
+
+def _call_concrete_function(function, inputs):
+  """Calls a restored Function with structured inputs.
+
+  This differs from `function.__call__` in that inputs and outputs are
+  structured and that it casts inputs to tensors if needed.
+
+  Note: this does not checks that non-tensor inputs match. That should be
+  done before via `_concrete_function_callable_with`.
+
+  Args:
+    function: ConcreteFunction to call.
+    inputs: Structured inputs compatible with
+        `function.graph.structured_input_signature`.
+
+  Returns:
+    The structured function output.
+  """
+  expected_structure = function.graph.structured_input_signature
+  flatten_inputs = nest.flatten_up_to(expected_structure, inputs)
+  tensor_inputs = []
+  for arg, expected in zip(flatten_inputs, nest.flatten(expected_structure)):
+    if isinstance(expected, tensor_spec.TensorSpec):
+      tensor_inputs.append(
+          ops.convert_to_tensor(arg, dtype_hint=expected.dtype))
+  result = function._call_flat(tensor_inputs)  # pylint: disable=protected-access
+  if isinstance(result, ops.Operation):
+    return None
+  return result
+
+
+def _try_convert_to_tensor_spec(arg, dtype_hint):
+  """Returns None or TensorSpec obtained if `arg` is converted to tensor."""
+  try:
+    # Note: try conversion in a FuncGraph to avoid poluting current context.
+    with func_graph_lib.FuncGraph(name="guess_conversion").as_default():
+      result = ops.convert_to_tensor(arg, dtype_hint=dtype_hint)
+      return tensor_spec.TensorSpec(shape=result.shape, dtype=result.dtype)
+  except (TypeError, ValueError):
+    return None
+
+
+def _concrete_function_callable_with(function, inputs, allow_conversion):
+  """Returns whether concrete `function` can be called with `inputs`."""
+  expected_structure = function.graph.structured_input_signature
+  try:
+    flatten_inputs = nest.flatten_up_to(expected_structure, inputs)
+  except (TypeError, ValueError):
+    return False
+  for arg, expected in zip(flatten_inputs, nest.flatten(expected_structure)):
+    if isinstance(expected, tensor_spec.TensorSpec):
+      if allow_conversion:
+        arg = _try_convert_to_tensor_spec(arg, dtype_hint=expected.dtype)
+      if not _is_tensor(arg) and not isinstance(arg, tensor_spec.TensorSpec):
+        return False
+      if arg.dtype != expected.dtype:
+        return False
+      if not expected.shape.is_compatible_with(arg.shape):
+        return False
+    else:
+      if arg != expected:
+        return False
+  return True
+
+
+def _deserialize_function_spec(function_spec_proto, coder):
+  """Deserialize a FunctionSpec object from its proto representation."""
+  typeless_fullargspec = coder.decode_proto(function_spec_proto.fullargspec)
+  fullargspec = tf_inspect.FullArgSpec(
+      args=typeless_fullargspec.args,
+      varargs=typeless_fullargspec.varargs,
+      varkw=typeless_fullargspec.varkw,
+      defaults=typeless_fullargspec.defaults,
+      kwonlyargs=typeless_fullargspec.kwonlyargs,
+      kwonlydefaults=typeless_fullargspec.kwonlydefaults,
+      annotations=typeless_fullargspec.annotations)
+  is_method = function_spec_proto.is_method
+  args_to_prepend = coder.decode_proto(function_spec_proto.args_to_prepend)
+  kwargs_to_include = coder.decode_proto(function_spec_proto.kwargs_to_include)
+  input_signature = coder.decode_proto(function_spec_proto.input_signature)
+  return function_lib.FunctionSpec(fullargspec, is_method, args_to_prepend,
+                                   kwargs_to_include, input_signature)
+
+
+# TODO(allenl): The fact that we can't derive ConcreteFunction calling
+# conventions from the serialized input spec right now is unfortunate. Merging
+# these would be good, maybe by adding TensorSpec names to cache keys so renamed
+# keyword arguments would yield different ConcreteFunctions.
+def setup_bare_concrete_function(saved_bare_concrete_function,
+                                 concrete_functions):
+  """Makes a restored bare concrete function callable."""
+  # Bare concrete functions accept only flat lists of Tensors with unique
+  # names.
+  concrete_function = concrete_functions[
+      saved_bare_concrete_function.concrete_function_name]
+  # pylint: disable=protected-access
+  concrete_function._arg_keywords = (
+      saved_bare_concrete_function.argument_keywords)
+  concrete_function._num_positional_args = (
+      saved_bare_concrete_function.allowed_positional_arguments)
+  # pylint: enable=protected-access
+  concrete_function.add_to_graph()
+  return concrete_function
+
+
+class RestoredFunction(def_function.Function):
+  """Wrapper class for a function that has been restored from saved state.
+
+  See `def_function.Function`.
+  """
+
+  def __init__(self, python_function, name, function_spec, concrete_functions):
+    # TODO(mdan): We may enable autograph once exceptions are supported.
+    super(RestoredFunction, self).__init__(
+        python_function, name, autograph=False)
+    self._concrete_functions = concrete_functions
+    # This does not propagate to stateful and stateless functions of the
+    # RestoredFunction, which will have seen only defunned
+    # restored_function_body(*args, **kwargs). That's why we have to
+    # canonicalize inputs inside restored_function_body.
+    self._function_spec = function_spec
+
+  def _list_all_concrete_functions_for_serialization(self):
+    return self._concrete_functions
+
+
+def recreate_function(saved_function, concrete_functions):
+  """Creates a `Function` from a `SavedFunction`.
+
+  Args:
+    saved_function: `SavedFunction` proto.
+    concrete_functions: map from function name to `ConcreteFunction`.
+
+  Returns:
+    A `Function`.
+  """
+  # TODO(andresp): Construct a `Function` with the cache populated
+  # instead of creating a new `Function` backed by a Python layer to
+  # glue things together. Current approach is nesting functions deeper for each
+  # serialization cycle.
+
+  coder = nested_structure_coder.StructureCoder()
+  function_spec = _deserialize_function_spec(saved_function.function_spec,
+                                             coder)
+
+  def restored_function_body(*args, **kwargs):
+    """Calls a restored function."""
+    # TODO(allenl): Functions saved with input_signatures should revive with
+    # input_signatures.
+    try:
+      canonicalized_inputs = function_spec.canonicalize_function_inputs(
+          *args, **kwargs)
+    except ValueError as e:
+      raise ValueError(
+          "Cannot canonicalize input args %r and kwargs %r. Error: %r." %
+          (args, kwargs, e))
+
+    # First try to find a concrete function that can be called without input
+    # conversions. This allows one to pick a more specific trace in case there
+    # was also a more expensive one that supported tensors.
+    for allow_conversion in [False, True]:
+      for function_name in saved_function.concrete_functions:
+        function = concrete_functions[function_name]
+        if _concrete_function_callable_with(function,
+                                            canonicalized_inputs,
+                                            allow_conversion):
+          return _call_concrete_function(function, canonicalized_inputs)
+
+    available_signatures = [
+        concrete_functions[function_name].graph.structured_input_signature
+        for function_name in saved_function.concrete_functions
+    ]
+    raise ValueError(
+        "Could not find matching function to call for canonicalized inputs %r. "
+        "Only existing signatures are %r."
+        % (canonicalized_inputs, available_signatures))
+
+  concrete_function_objects = []
+  for concrete_function_name in saved_function.concrete_functions:
+    concrete_function_objects.append(concrete_functions[concrete_function_name])
+
+  restored_function = RestoredFunction(
+      restored_function_body,
+      restored_function_body.__name__,
+      function_spec,
+      concrete_function_objects)
+
+  return tf_decorator.make_decorator(
+      restored_function_body,
+      restored_function,
+      decorator_argspec=function_spec.fullargspec)
+
+
+def load_function_def_library(library):
+  """Load a set of functions as concrete functions without captured inputs.
+
+  Functions names are manipulated during load such that they do not overlap
+  with previously created ones.
+
+  Args:
+    library: FunctionDefLibrary proto message.
+
+  Returns:
+    Map of original function names in the library to instances of
+    `ConcreteFunction` without captured inputs.
+
+  Raises:
+    ValueError: if functions dependencies have a cycle.
+  """
+  functions = {}
+
+  load_shared_name_suffix = "_load_{}".format(ops.uid())
+  for fdef in _sort_function_defs(library):
+    copy = _fix_fdef(fdef, functions, load_shared_name_suffix)
+
+    func_graph = function_def_lib.function_def_to_graph(copy)
+    for dep in _list_function_deps(fdef):
+      functions[dep].add_to_graph(func_graph)
+    func = function_lib.ConcreteFunction(func_graph)
+    func.add_to_graph()
+
+    functions[fdef.signature.name] = func
+
+    # Also register the gradients in the current root context.
+    with ops.init_scope():
+      func._register_gradient()  # pylint: disable=protected-access
+
+  return functions
+
+
+def _sort_function_defs(library):
+  """Return a topologic sort of FunctionDefs in a library."""
+  edges = collections.defaultdict(list)
+  in_count = collections.defaultdict(lambda: 0)
+
+  for fdef in library.function:
+    for dep in _list_function_deps(fdef):
+      edges[dep].append(fdef.signature.name)
+      in_count[fdef.signature.name] += 1
+
+  ready = [
+      fdef.signature.name
+      for fdef in library.function
+      if in_count[fdef.signature.name] == 0
+  ]
+  output = []
+  while ready:
+    node = ready.pop()
+    output.append(node)
+    for dest in edges[node]:
+      in_count[dest] -= 1
+      if not in_count[dest]:
+        ready.append(dest)
+
+  if len(output) != len(library.function):
+    failed_to_resolve = sorted(set(in_count.keys()) - set(output))
+    raise ValueError("There is a cyclic-dependency between functions. ",
+                     "Could not resolve %r." % (failed_to_resolve,))
+
+  reverse = {fdef.signature.name: fdef for fdef in library.function}
+  return [reverse[x] for x in output]
+
+
+def _fix_fdef(orig_fdef, functions, shared_name_suffix):
+  """Fixes a FunctionDef proto to be loaded in current context.
+
+  In particular, when loading a function library into an eager context, one
+  must rename the functions to avoid conflicts with existent functions.
+
+  Args:
+    orig_fdef: FunctionDef proto to fix. It is not modified.
+    functions: map from function name to a ConcreteFunction instance.
+    shared_name_suffix: A unique string for this load which helps to avoid
+      `shared_name` collisions across loads. Two functions from the same load
+      using the same `shared_name` still need to share, but functions from
+      different loads with the same `shared_name` should not.
+
+  Returns:
+    A fixed copy of the original FunctionDef.
+  """
+  fdef = function_pb2.FunctionDef()
+  fdef.CopyFrom(orig_fdef)
+  for node_def in fdef.node_def:
+    if "_gradient_op_type" in node_def.attr:
+      if node_def.op in ["StatefulPartitionedCall", "PartitionedCall"]:
+        # TODO(andresp): This code assumes that the gradient registered for this
+        # function call is the default gradient for the function and not a
+        # custom one.
+        fname = node_def.attr["f"].func.name
+        node_def.attr["_gradient_op_type"].s = compat.as_bytes(
+            functions[fname]._gradient_name)  # pylint: disable=protected-access
+      else:
+        logging.warning("Importing a function (%s) with ops with custom "
+                        "gradients. Will likely fail if a gradient is "
+                        "requested.", fdef.signature.name)
+    for _, attr_value in node_def.attr.items():
+      if attr_value.func.name:
+        attr_value.func.name = functions[attr_value.func.name].name
+
+    # TODO(b/124205571): Avoid accidental sharing and destruction of restored
+    # resources. For now uniquify "shared_name" when loading functions to avoid
+    # sharing.
+    if "shared_name" in node_def.attr:
+      node_def.attr["shared_name"].s += compat.as_bytes(shared_name_suffix)
+
+  fdef.signature.name = _clean_function_name(fdef.signature.name)
+  return fdef
+
+
+def _list_function_deps(fdef):
+  # TODO(andresp): Recurse into list attributes and into NameAttrList attrs both
+  # when listing deps and when fixing them. `function_def_to_graph` also
+  # requires fixes.
+  deps = set()
+  for node_def in fdef.node_def:
+    for _, attr_value in node_def.attr.items():
+      if attr_value.WhichOneof("value") == "func":
+        deps.add(attr_value.func.name)
+  return deps
+
+
+def _clean_function_name(name):
+  """Vanity function to keep the function names comprehensible."""
+  # Note: each time a function is wrapped into `function_lib.ConcreteFunction`
+  # its name becomes "__inference_<orig>_xyz".
+  match = re.search(r"^__inference_(.*)_\d+$", name)
+  if match:
+    return match.group(1)
+  else:
+    return name
diff --git a/tensorflow/python/saved_model/function_serialization.py b/tensorflow/python/saved_model/function_serialization.py
new file mode 100644
index 0000000000000000000000000000000000000000..e876eef8b349ac17a42cb284a861784b4d941998
--- /dev/null
+++ b/tensorflow/python/saved_model/function_serialization.py
@@ -0,0 +1,87 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tools for serializing `Function`s."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.core.protobuf import saved_object_graph_pb2
+from tensorflow.python.framework import func_graph as func_graph_module
+from tensorflow.python.saved_model import nested_structure_coder
+
+
+def _serialize_function_spec(function_spec, coder):
+  """Serialize a FunctionSpec object into its proto representation."""
+  proto = saved_object_graph_pb2.FunctionSpec()
+  proto.fullargspec.CopyFrom(coder.encode_structure(function_spec.fullargspec))
+  proto.is_method = function_spec.is_method
+  proto.args_to_prepend.CopyFrom(
+      coder.encode_structure(function_spec.args_to_prepend))
+  proto.kwargs_to_include.CopyFrom(
+      coder.encode_structure(function_spec.kwargs_to_include))
+  proto.input_signature.CopyFrom(
+      coder.encode_structure(function_spec.input_signature))
+  return proto
+
+
+def serialize_concrete_function(concrete_function, node_ids, coder):
+  """Build a SavedConcreteFunction."""
+  bound_inputs = []
+  try:
+    for capture in concrete_function.captured_inputs:
+      bound_inputs.append(node_ids[capture])
+  except KeyError:
+    raise KeyError(
+        "Failed to add concrete function %s to object based saved model as it "
+        "captures tensor %s which is unsupported or not reachable from root. "
+        "One reason could be that a stateful object or a variable that the "
+        "function depends on is not assigned to an attribute of the serialized "
+        "trackable object "
+        "(see SaveTest.test_captures_unreachable_variable)."
+        % (concrete_function.name, capture))
+  concrete_function_proto = saved_object_graph_pb2.SavedConcreteFunction()
+  structured_outputs = func_graph_module.convert_structure_to_signature(
+      concrete_function.structured_outputs)
+  concrete_function_proto.canonicalized_input_signature.CopyFrom(
+      coder.encode_structure(concrete_function.structured_input_signature))
+  concrete_function_proto.output_signature.CopyFrom(
+      coder.encode_structure(structured_outputs))
+  concrete_function_proto.bound_inputs.extend(bound_inputs)
+  return concrete_function_proto
+
+
+def serialize_bare_concrete_function(concrete_function):
+  """Build a SavedBareConcreteFunction."""
+  # pylint: disable=protected-access
+  return saved_object_graph_pb2.SavedBareConcreteFunction(
+      concrete_function_name=concrete_function.name,
+      allowed_positional_arguments=concrete_function._num_positional_args,
+      argument_keywords=concrete_function._arg_keywords)
+  # pylint: enable=protected-access
+
+
+def serialize_function(function):
+  """Build a SavedFunction proto."""
+  coder = nested_structure_coder.StructureCoder()
+  proto = saved_object_graph_pb2.SavedFunction()
+
+  function_spec_proto = _serialize_function_spec(function.function_spec, coder)
+  proto.function_spec.CopyFrom(function_spec_proto)
+  all_concrete_functions = \
+      function._list_all_concrete_functions_for_serialization()  # pylint: disable=protected-access
+  for concrete_function in all_concrete_functions:
+    proto.concrete_functions.append(concrete_function.name)
+  return proto
diff --git a/tensorflow/python/saved_model/load.py b/tensorflow/python/saved_model/load.py
index e3095f4ee5e09ae0973164acc748e2d922e8a991..f37678b093d379c71e4f6313046189515b706415 100644
--- a/tensorflow/python/saved_model/load.py
+++ b/tensorflow/python/saved_model/load.py
@@ -12,46 +12,182 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Import a checkpointable object from a SavedModel."""
+"""Import a trackable object from a SavedModel."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
 import os
 
-from tensorflow.python.lib.io import file_io
-from tensorflow.python.saved_model import constants
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.saved_model import function_deserialization
+from tensorflow.python.saved_model import load_v1_in_v2
 from tensorflow.python.saved_model import loader_impl
-from tensorflow.python.saved_model import saved_object_graph_pb2
+from tensorflow.python.saved_model import nested_structure_coder
+from tensorflow.python.saved_model import revived_types
 from tensorflow.python.saved_model import utils_impl as saved_model_utils
-from tensorflow.python.training.checkpointable import tracking
-from tensorflow.python.util import compat
+from tensorflow.python.training.tracking import base
+from tensorflow.python.training.tracking import graph_view
+from tensorflow.python.training.tracking import tracking
+from tensorflow.python.training.tracking import util
+from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
 
 
 class _Loader(object):
   """Helper class to load an object-based SavedModel."""
 
   def __init__(self, object_graph_proto, saved_model_proto, export_dir):
-    self._asset_file_def = saved_model_proto.meta_graphs[0].asset_file_def
+    meta_graph = saved_model_proto.meta_graphs[0]
+    self._asset_file_def = meta_graph.asset_file_def
+    self._operation_attributes = {
+        node.name: node.attr for node in meta_graph.graph_def.node}
     self._proto = object_graph_proto
     self._export_dir = export_dir
+    self._concrete_functions = (
+        function_deserialization.load_function_def_library(
+            meta_graph.graph_def.library))
     self._load_all()
+    # TODO(b/124045874): There are limitations with functions whose captures
+    # trigger other functions to be executed. For now it is only guaranteed to
+    # work if the captures of a function only trigger functions without
+    # captures.
+    self._setup_functions_structures()
+    self._setup_functions_captures()
+    self._restore_checkpoint()
+
+    for node in self._nodes:
+      if isinstance(node, tracking.TrackableResource):
+        init_op = node.initialize()
+        ops.add_to_collection(ops.GraphKeys.TABLE_INITIALIZERS, init_op)
+
+  def _setup_functions_structures(self):
+    """Setup structure for inputs and outputs of restored functions."""
+    coder = nested_structure_coder.StructureCoder()
+    for name, proto in sorted(self._proto.concrete_functions.items()):
+      concrete_function = self._concrete_functions[name]
+      # By setting the structured_outputs directly, we can rely on this
+      # function_lib.ConcreteFunction object to perform the output repacking
+      # logic. The only limitation of that logic is that it only works
+      # with output that is convertible to Tensors and the conversion
+      # always happens. For example tf.TensorShape([2, 3]) will be
+      # converted to Tensor representing [2, 3].
+      original_outputs = coder.decode_proto(proto.output_signature)
+      # The original_outputs here had Tensors converted to TensorSpecs, so
+      # the restored function's structured_outputs field will not be
+      # exactly the same. Fortunately the repacking logic cares only about
+      # the structure.
+      # TODO(vbardiovsky): Should we just replicate the structures, with
+      # Nones instead of real objects?
+      concrete_function._func_graph.structured_outputs = original_outputs  # pylint: disable=protected-access
+      concrete_function._func_graph.structured_input_signature = (  # pylint: disable=protected-access
+          coder.decode_proto(proto.canonicalized_input_signature))
+
+  def _setup_functions_captures(self):
+    """Setup captures and variables in restored functions."""
+    concrete_functions = sorted(self._proto.concrete_functions.items())
+    for name, proto in concrete_functions:
+      concrete_function = self._concrete_functions[name]
+      bound_inputs = [
+          self._get_tensor_from_node(node_id)
+          for node_id in proto.bound_inputs]
+      bound_variables = [
+          self._nodes[node_id]
+          for node_id in proto.bound_inputs
+          if self._proto.nodes[node_id].WhichOneof("kind") == "variable"
+      ]
+      # TODO(andresp): This is only injecting the captured inputs into the
+      # concrete function, note that we did not modify the FuncGraph
+      # itself.
+      concrete_function._captured_inputs = bound_inputs  # pylint: disable=protected-access
+      concrete_function._func_graph.variables = bound_variables  # pylint: disable=protected-access
+
+  def _get_tensor_from_node(self, node_id):
+    """Resolves a node id into a tensor to be captured for a function."""
+    with ops.init_scope():
+      obj = self._nodes[node_id]
+      if resource_variable_ops.is_resource_variable(obj):
+        return obj.handle
+      elif isinstance(obj, tracking.TrackableAsset):
+        return obj.asset_path
+      elif tensor_util.is_tensor(obj):
+        return obj
+      elif isinstance(obj, tracking.TrackableResource):
+        # Note: this executes restored functions in the TrackableResource.
+        return obj.resource_handle
+      raise ValueError("Can't convert node %s to tensor" % (type(obj)))
 
   def _load_all(self):
-    self._nodes = [self._recreate(proto) for proto in self._proto.nodes]
+    """Load all saved objects and wire their properties."""
+    self._nodes = []
+    node_setters = []
+    for proto in self._proto.nodes:
+      node, setter = self._recreate(proto)
+      self._nodes.append(node)
+      node_setters.append(setter)
     # After creating the objects, construct the edges between the objects.
-    for obj, object_proto in zip(self._nodes, self._proto.nodes):
+    for obj, object_proto, setter in zip(self._nodes, self._proto.nodes,
+                                         node_setters):
       for reference in object_proto.children:
-        setattr(obj, reference.local_name, self._nodes[reference.node_id])
+        setter(obj, reference.local_name, self._nodes[reference.node_id])
+        # Note: if an object has an attribute `__call__` add a class method
+        # that allows `obj()` syntax to work. This is done per-instance to
+        # allow `callable` to be used to find out if an object is callable.
+        if reference.local_name == "__call__":
+          setattr(type(obj), "__call__", _call_attribute)
+
+  def _restore_checkpoint(self):
+    """Load state from checkpoint into the deserialized objects."""
+    variables_path = saved_model_utils.get_variables_path(self._export_dir)
+    # TODO(andresp): Clean use of private methods of TrackableSaver.
+    # pylint: disable=protected-access
+    saver = util.TrackableSaver(graph_view.ObjectGraphView(self.get(0)))
+    saver._file_prefix_placeholder = constant_op.constant(variables_path)
+    load_status = saver.restore(variables_path)
+    load_status.assert_existing_objects_matched()
+    checkpoint = load_status._checkpoint
+
+    # When running in eager mode, the `restore` call above has already run and
+    # restored the state of trackables, call `position.restore_ops()` will
+    # return an empty list as there is nothing left to do. In graph mode, that
+    # will return the list of ops that must run to restore the object on that
+    # position. We have to wire them in the initializers of the objects so that
+    # they get initialized properly when using common practices (e.g. the ones
+    # used by ManagedSession) without further user action.
+    for object_id, obj in dict(checkpoint.object_by_proto_id).items():
+      position = base.CheckpointPosition(checkpoint=checkpoint,
+                                         proto_id=object_id)
+      restore_ops = position.restore_ops()
+      if restore_ops:
+        if resource_variable_ops.is_resource_variable(obj):
+          obj._initializer_op = restore_ops
+        else:
+          raise NotImplementedError(
+              ("Missing functionality to restore state of object "
+               "%r from the checkpoint." % obj))
 
   def get(self, node_id):
     return self._nodes[node_id]
 
   def _recreate(self, proto):
+    """Creates a Python object from a SavedObject protocol buffer."""
     factory = {
         "user_object": lambda: self._recreate_user_object(proto.user_object),
         "asset": lambda: self._recreate_asset(proto.asset),
+        "function": lambda: self._recreate_function(proto.function),
+        "bare_concrete_function": functools.partial(
+            self._recreate_bare_concrete_function,
+            proto.bare_concrete_function),
+        "variable": lambda: self._recreate_variable(proto.variable),
+        "constant": lambda: self._recreate_constant(proto.constant),
+        "resource": lambda: self._recreate_resource(proto.resource),
     }
     kind = proto.WhichOneof("kind")
     if kind not in factory:
@@ -59,38 +195,135 @@ class _Loader(object):
     return factory[kind]()
 
   def _recreate_user_object(self, proto):
-    del proto
-    return tracking.Checkpointable()
+    """Instantiates a SavedUserObject."""
+    looked_up = revived_types.deserialize(proto)
+    if looked_up is None:
+      # Note: each user object has its own class. This allows to make each one
+      # individually callable by adding a `__call__` method to the classes of
+      # the objects instances that have a `__call__` property.
+
+      class _UserObject(tracking.AutoTrackable):
+        pass
+
+      return _UserObject(), setattr
+    return looked_up
 
   def _recreate_asset(self, proto):
     filename = os.path.join(
         saved_model_utils.get_assets_dir(self._export_dir),
         self._asset_file_def[proto.asset_file_def_index].filename)
-    return tracking.TrackableAsset(filename)
+    return tracking.TrackableAsset(filename), setattr
+
+  def _recreate_function(self, proto):
+    return function_deserialization.recreate_function(
+        proto, self._concrete_functions), setattr
+
+  def _recreate_bare_concrete_function(self, proto):
+    return function_deserialization.setup_bare_concrete_function(
+        proto, self._concrete_functions), setattr
+
+  def _recreate_variable(self, proto):
+    # TODO(andresp): Can we use the checkpointed value as initializer?
+    dummy_value = init_ops.Zeros(dtype=proto.dtype)(shape=proto.shape)
+    return variables.Variable(dummy_value, trainable=proto.trainable), setattr
+
+  def _recreate_constant(self, proto):
+    tensor_proto = self._operation_attributes[proto.operation]["value"].tensor
+    imported_constant = constant_op.constant(
+        tensor_util.MakeNdarray(tensor_proto))
+    return imported_constant, setattr
+
+  def _recreate_resource(self, proto):
+    del proto
+    return _RestoredResource(), setattr
+
+
+# TODO(b/124205571,b/124092991): Solve destruction of resources.
+class _RestoredResource(tracking.TrackableResource):
+  """Restored SavedResource."""
+
+  def create_resource(self):
+    raise RuntimeError()
+
+  def initialize(self):
+    raise RuntimeError()
+
+  def _list_functions_for_serialization(self):
+    # Overwrite this method to avoid the implementation of
+    # base class to re-wrap the polymorphic functions into
+    # another layer of `tf.function`.
+    return {
+        "create_resource": self.create_resource,
+        "initialize": self.initialize,
+    }
+
+
+def _call_attribute(instance, *args, **kwargs):
+  return instance.__call__(*args, **kwargs)
+
+
+@tf_export("saved_model.load", v1=["saved_model.load_v2"])
+def load(export_dir, tags=None):
+  """Load a SavedModel from `export_dir`.
+
+  Signatures associated with the SavedModel are available as functions:
+
+  ```python
+  imported = tf.saved_model.load(path)
+  f = imported.signatures["serving_default"]
+  print(f(x=tf.constant([[1.]])))
+  ```
+
+  Objects exported with `tf.saved_model.save` additionally have trackable
+  objects and functions assigned to attributes:
 
+  ```python
+  exported = tf.train.Checkpoint(v=tf.Variable(3.))
+  exported.f = tf.function(
+      lambda x: exported.v * x,
+      input_signature=[tf.TensorSpec(shape=None, dtype=tf.float32)])
+  tf.saved_model.save(exported, path)
+  imported = tf.saved_model.load(path)
+  assert 3. == imported.v.numpy()
+  assert 6. == imported.f(x=tf.constant(2.)).numpy()
+  ```
 
-def _load_saved_object_graph_proto(filename):
-  with file_io.FileIO(filename, "rb") as f:
-    contents = f.read()
-    return saved_object_graph_pb2.SavedObjectGraph.FromString(contents)
+  Args:
+    export_dir: The SavedModel directory to load from.
+    tags: A tag or sequence of tags identifying the MetaGraph to load. Optional
+      if the SavedModel contains a single MetaGraph, as for those exported from
+      `tf.saved_model.load`.
 
+  Returns:
+    A trackable object with a `signatures` attribute mapping from signature
+    keys to functions. If the SavedModel was exported by `tf.saved_model.load`,
+    it also points to trackable objects and functions which were attached
+    to the exported object.
 
-def load(export_dir):
-  """Load a SavedModel from `export_dir`."""
+  Raises:
+    ValueError: If `tags` don't match a MetaGraph in the SavedModel.
+  """
+  if tags is not None:
+    # Supports e.g. tags=SERVING and tags=[SERVING]
+    tags = nest.flatten(tags)
   saved_model_proto = loader_impl.parse_saved_model(export_dir)
-  object_graph_filename = os.path.join(
-      compat.as_bytes(export_dir),
-      compat.as_bytes(constants.EXTRA_ASSETS_DIRECTORY),
-      compat.as_bytes("object_graph.pb"))
-  if file_io.file_exists(object_graph_filename):
-    object_graph_proto = _load_saved_object_graph_proto(object_graph_filename)
-    loader = _Loader(object_graph_proto,
-                     saved_model_proto,
-                     export_dir)
-    root = loader.get(0)
+  if (len(saved_model_proto.meta_graphs) == 1
+      and saved_model_proto.meta_graphs[0].HasField("object_graph_def")):
+    meta_graph_def = saved_model_proto.meta_graphs[0]
+    if (tags is not None
+        and set(tags) != set(meta_graph_def.meta_info_def.tags)):
+      raise ValueError(
+          ("The SavedModel at {} has one MetaGraph with tags {}, but got an "
+           "incompatible argument tags={} to tf.saved_model.load. You may omit "
+           "it, pass 'None', or pass matching tags.")
+          .format(export_dir, meta_graph_def.meta_info_def.tags, tags))
+    object_graph_proto = meta_graph_def.object_graph_def
+    with ops.init_scope():
+      loader = _Loader(object_graph_proto,
+                       saved_model_proto,
+                       export_dir)
+      root = loader.get(0)
   else:
-    raise NotImplementedError(
-        "Currently only SavedModels exported with `tf.saved_model.save` may be "
-        "imported. Other SavedModels may eventually be supported via load().")
-  # TODO(allenl): load functions from the SavedModel into the eager context
+    with ops.init_scope():
+      root = load_v1_in_v2.load(export_dir, tags)
   return root
diff --git a/tensorflow/python/saved_model/load_test.py b/tensorflow/python/saved_model/load_test.py
index a2971101cdb5ae93613df65f0379866244a7a3fe..845fc40e5c57e60dc209b5b0b3cb4310b05c80c9 100644
--- a/tensorflow/python/saved_model/load_test.py
+++ b/tensorflow/python/saved_model/load_test.py
@@ -12,56 +12,98 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for checkpointable object SavedModel loading."""
+"""Tests for trackable object SavedModel loading."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import os
 import tempfile
 
+from absl.testing import parameterized
+
+from tensorflow.python.eager import backprop
 from tensorflow.python.eager import def_function
 from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_spec
 from tensorflow.python.lib.io import file_io
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import lookup_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.saved_model import load
 from tensorflow.python.saved_model import save
-from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.saved_model import tag_constants
+from tensorflow.python.training import monitored_session
+from tensorflow.python.training.tracking import tracking
+from tensorflow.python.training.tracking import util
+from tensorflow.python.util import tf_inspect
 
 
-class LoadTest(test.TestCase):
+@parameterized.named_parameters(
+    dict(testcase_name="ReloadOnce", cycles=1),
+    dict(testcase_name="ReloadTwice", cycles=2),
+    dict(testcase_name="ReloadThrice", cycles=3))
+class LoadTest(test.TestCase, parameterized.TestCase):
 
-  def test_structure_import(self):
-    root = tracking.Checkpointable()
-    root.f = def_function.function(
-        lambda x: 2. * x,
-        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
-    root.dep_one = tracking.Checkpointable()
-    root.dep_two = tracking.Checkpointable()
-    root.dep_two.dep = tracking.Checkpointable()
+  def cycle(self, obj, cycles=1, signatures=None):
+    to_save = obj
+    # TODO(vbardiovsky): It would be nice if exported protos reached a fixed
+    # point w.r.t. saving/restoring, ideally after 2nd saving.
+    for _ in range(cycles):
+      path = tempfile.mkdtemp(prefix=self.get_temp_dir())
+      save.save(to_save, path, signatures)
+      loaded = load.load(path)
+      to_save = loaded
+    return loaded
+
+  def test_structure_import(self, cycles):
+    root = tracking.AutoTrackable()
+    root.dep_one = tracking.AutoTrackable()
+    root.dep_two = tracking.AutoTrackable()
+    root.dep_two.dep = tracking.AutoTrackable()
     root.dep_three = root.dep_two.dep
-    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
-    save.save(root, save_dir)
-    imported = load.load(save_dir)
+    imported = self.cycle(root, cycles)
     self.assertIs(imported.dep_three, imported.dep_two.dep)
     self.assertIsNot(imported.dep_one, imported.dep_two)
 
+  def test_variables(self, cycles):
+    root = tracking.AutoTrackable()
+    root.v1 = variables.Variable(1., trainable=True)
+    root.v2 = variables.Variable(2., trainable=False)
+    imported = self.cycle(root, cycles)
+    self.assertEqual(imported.v1.numpy(), 1.0)
+    self.assertTrue(imported.v1.trainable)
+    self.assertEqual(imported.v2.numpy(), 2.0)
+    self.assertFalse(imported.v2.trainable)
+
+  def test_capture_variables(self, cycles):
+    root = tracking.AutoTrackable()
+    root.weights = variables.Variable(2.)
+    root.f = def_function.function(
+        lambda x: root.weights * x,
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
+    imported = self.cycle(root, cycles)
+    self.assertEqual(4., imported.f(constant_op.constant(2.)).numpy())
+    imported.weights.assign(4.0)
+    self.assertEqual(8., imported.f(constant_op.constant(2.)).numpy())
+
   def _make_asset(self, contents):
     filename = tempfile.mktemp(prefix=self.get_temp_dir())
     with open(filename, "w") as f:
       f.write(contents)
     return filename
 
-  def test_assets_import(self):
+  def test_assets(self, cycles):
     file1 = self._make_asset("contents 1")
     file2 = self._make_asset("contents 2")
 
-    root = tracking.Checkpointable()
-    root.f = def_function.function(
-        lambda x: 2. * x,
-        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
+    root = tracking.AutoTrackable()
     root.asset1 = tracking.TrackableAsset(file1)
     root.asset2 = tracking.TrackableAsset(file2)
 
@@ -75,27 +117,997 @@ class LoadTest(test.TestCase):
 
     imported = load.load(load_dir)
     with open(imported.asset1.asset_path.numpy(), "r") as f:
-      self.assertEquals("contents 1", f.read())
+      self.assertEqual("contents 1", f.read())
     with open(imported.asset2.asset_path.numpy(), "r") as f:
-      self.assertEquals("contents 2", f.read())
+      self.assertEqual("contents 2", f.read())
 
-  def test_assets_dedup(self):
-    vocab = self._make_asset("contents")
-    root = tracking.Checkpointable()
+  def test_capture_assets(self, cycles):
+    root = tracking.AutoTrackable()
+    root.vocab = tracking.TrackableAsset(self._make_asset("contents"))
     root.f = def_function.function(
-        lambda x: 2. * x,
-        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
+        lambda: root.vocab.asset_path,
+        input_signature=[])
+    imported = self.cycle(root, cycles)
+    original_output = root.f().numpy()
+    imported_output = imported.f().numpy()
+    self.assertNotEqual(original_output, imported_output)
+    with open(imported_output, "r") as f:
+      self.assertEqual("contents", f.read())
 
-    root.asset1 = tracking.TrackableAsset(vocab)
-    root.asset2 = tracking.TrackableAsset(vocab)
+  def test_capture_assets_in_graph(self, cycles):
+    root = tracking.AutoTrackable()
+    root.vocab = tracking.TrackableAsset(self._make_asset("contents"))
+    root.f = def_function.function(
+        lambda: root.vocab.asset_path,
+        input_signature=[])
+
+    original_output = root.f().numpy()
+
+    if cycles > 1:
+      root = self.cycle(root, cycles - 1)
+    path = tempfile.mkdtemp(prefix=self.get_temp_dir())
+    save.save(root, path)
 
-    export_dir = os.path.join(self.get_temp_dir(), "save_dir")
-    save.save(root, export_dir)
-    imported = load.load(export_dir)
+    with ops.Graph().as_default():
+      imported = load.load(path)
+      imported_tensor = imported.f()
+      with monitored_session.MonitoredSession() as sess:
+        imported_output = sess.run(imported_tensor)
+        self.assertNotEqual(original_output, imported_output)
+        with open(imported_output, "r") as f:
+          self.assertEqual("contents", f.read())
 
+  def test_dedup_assets(self, cycles):
+    vocab = self._make_asset("contents")
+    root = tracking.AutoTrackable()
+    root.asset1 = tracking.TrackableAsset(vocab)
+    root.asset2 = tracking.TrackableAsset(vocab)
+    imported = self.cycle(root, cycles)
     self.assertEqual(imported.asset1.asset_path.numpy(),
                      imported.asset2.asset_path.numpy())
 
+  def test_implicit_input_signature(self, cycles):
+    @def_function.function
+    def func(x):
+      return 2 * x
+
+    root = tracking.AutoTrackable()
+    root.f = func
+
+    # Add two traces.
+    root.f(constant_op.constant(1.))
+    root.f(constant_op.constant(1))
+
+    imported = self.cycle(root, cycles)
+
+    self.assertEqual(4., imported.f(constant_op.constant(2.)).numpy())
+    self.assertEqual(14, imported.f(constant_op.constant(7)).numpy())
+
+  def test_explicit_input_signature(self, cycles):
+    @def_function.function(
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
+    def func(x):
+      return 2 * x
+
+    root = tracking.AutoTrackable()
+    root.f = func
+
+    imported = self.cycle(root, cycles)
+    self.assertEqual(4., imported.f(constant_op.constant(2.0)).numpy())
+
+  def test_explicit_save_signature(self, cycles):
+    @def_function.function
+    def func(x):
+      return 2 * x
+
+    root = tracking.AutoTrackable()
+    root.f = func
+
+    imported = self.cycle(
+        root, cycles, {
+            "f":
+                root.f.get_concrete_function(
+                    tensor_spec.TensorSpec(None, dtypes.float32))
+        })
+    self.assertEqual(4., imported.f(constant_op.constant(2.0)).numpy())
+
+  def test_nested_functions(self, cycles):
+    f = def_function.function(
+        lambda x: x*2.0,
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
+    g = def_function.function(
+        lambda x: f(x) + 1.0,
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
+
+    root = tracking.AutoTrackable()
+    root.g = g
+    imported = self.cycle(root, cycles)
+    imported.g(constant_op.constant([1.0]))
+
+  def test_function_with_default_bool_input(self, cycles):
+
+    def func(x, training=False):
+      if training:
+        return 2 * x
+      else:
+        return 7
+
+    root = tracking.AutoTrackable()
+    root.f = def_function.function(func)
+
+    self.assertEqual(20, root.f(constant_op.constant(10), True).numpy())
+    self.assertEqual(7, root.f(constant_op.constant(1)).numpy())
+    self.assertEqual(2, root.f(constant_op.constant(1), True).numpy())
+
+    imported = self.cycle(root, cycles)
+
+    self.assertEqual(4, imported.f(constant_op.constant(2), True).numpy())
+    self.assertEqual(7, imported.f(constant_op.constant(2)).numpy())
+
+  def test_function_with_default_none_input(self, cycles):
+
+    def func(x, dtype=None):
+      if dtype:
+        return array_ops.zeros(shape=x.shape, dtype=dtype)
+      else:
+        return array_ops.zeros(shape=x.shape, dtype=dtypes.float32)
+
+    root = tracking.AutoTrackable()
+    root.f = def_function.function(func)
+
+    self.assertAllEqual([0.0, 0.0, 0.0],
+                        root.f(constant_op.constant([1, 2, 3])).numpy())
+    self.assertAllEqual([0.0, 0.0, 0.0],
+                        root.f(constant_op.constant([1.0, 2.0, 3.0])).numpy())
+    self.assertAllEqual([0.0, 0.0, 0.0, 0.0],
+                        root.f(constant_op.constant([1, 2, 3, 4])).numpy())
+    self.assertAllEqual([0, 0, 0],
+                        root.f(
+                            constant_op.constant([1.0, 2.0, 3.0]),
+                            dtype=dtypes.int32).numpy())
+
+    concrete_functions = root.f._list_all_concrete_functions_for_serialization()  # pylint: disable=protected-access
+    self.assertEqual(4, len(concrete_functions))
+
+    imported = self.cycle(root, cycles)
+
+    self.assertAllEqual([0.0, 0.0, 0.0],
+                        imported.f(constant_op.constant([1, 2, 3]),
+                                   None).numpy())
+    self.assertAllEqual([0.0, 0.0, 0.0],
+                        imported.f(constant_op.constant([1.0, 2.0,
+                                                         3.0])).numpy())
+    self.assertAllEqual([0.0, 0.0, 0.0, 0.0],
+                        imported.f(constant_op.constant([1, 2, 3, 4])).numpy())
+    self.assertAllEqual([0, 0, 0],
+                        imported.f(
+                            constant_op.constant([1.0, 2.0, 3.0]),
+                            dtype=dtypes.int32).numpy())
+
+  def test_function_no_return(self, cycles):
+
+    class TrackableWithOneVariable(tracking.AutoTrackable):
+
+      def __init__(self, initial_value=0.0):
+        super(TrackableWithOneVariable, self).__init__()
+        self.variable = variables.Variable(initial_value)
+
+      @def_function.function
+      def increase(self, by=1.0):
+        self.variable.assign_add(by)
+
+    obj = TrackableWithOneVariable(5.0)
+
+    obj.increase(constant_op.constant(10.0))
+    self.assertEqual(15.0, obj.variable.numpy())
+    obj.increase()
+    self.assertEqual(16.0, obj.variable.numpy())
+
+    imported = self.cycle(obj, cycles)
+
+    imported.increase(constant_op.constant(10.0))
+    self.assertEqual(26.0, imported.variable.numpy())
+    imported.increase(constant_op.constant(1.0))
+    self.assertEqual(27.0, imported.variable.numpy())
+
+  def test_structured_inputs(self, cycles):
+
+    def func(x, training=True):
+      # x is a nested structure, we care about one particular tensor.
+      _, (a, b) = x
+      if training:
+        return 2 * a["a"] + b
+      else:
+        return 7
+
+    root = tracking.AutoTrackable()
+    root.f = def_function.function(func)
+
+    x = constant_op.constant(10)
+    y = constant_op.constant(11)
+
+    input1 = [6, ({"a": x}, y)]
+    input2 = [7, ({"a": x}, y)]  # Not compatible with input1 signature.
+    input3 = [6, ({"a": y}, x)]  # Compatible with input1 signature.
+
+    # Note: by only calling f(input1) before serialization, only inputs with
+    # matching signature will be valid on the loaded model.
+    self.assertEqual(31, root.f(input1).numpy())
+
+    imported = self.cycle(root, cycles)
+
+    with self.assertRaisesRegexp(ValueError,
+                                 "Could not find matching function to call"):
+      imported.f(input2)
+
+    self.assertEqual(31, imported.f(input1).numpy())
+    self.assertEqual(32, imported.f(input3).numpy())
+
+  def test_structured_output(self, cycles):
+
+    # Use fields with non-alphabetical order
+    named_tuple_type = collections.namedtuple("NamedTupleHello", ["b", "a"])
+
+    def func(input1, input2):
+      named_tuple = named_tuple_type(a=input1 + input2, b=input1 * input2)
+      return [named_tuple, input2, {"x": 0.5}]
+
+    root = tracking.AutoTrackable()
+    root.f = def_function.function(func)
+
+    result = root.f(constant_op.constant(2), constant_op.constant(3))
+
+    self.assertEqual(5, result[0].a.numpy())
+    self.assertEqual(6, result[0].b.numpy())
+    self.assertEqual(["b", "a"], list(result[0]._asdict().keys()))
+    self.assertEqual(3, result[1].numpy())
+    self.assertEqual(0.5, result[2]["x"].numpy())
+
+    imported = self.cycle(root, cycles)
+
+    result = imported.f(constant_op.constant(2), constant_op.constant(5))
+    self.assertEqual(7, result[0].a.numpy())
+    self.assertEqual(10, result[0].b.numpy())
+    self.assertEqual(["b", "a"], list(result[0]._asdict().keys()))
+    self.assertEqual(5, result[1].numpy())
+    self.assertEqual(0.5, result[2]["x"].numpy())
+
+  def test_positional_arguments(self, cycles):
+    def func(x, training=False, abc=7.1, defg=7.7):
+      del abc
+      if training:
+        return 2 * x
+      if defg == 7:
+        return 6
+      else:
+        return 7
+
+    root = tracking.AutoTrackable()
+    root.f = def_function.function(func)
+
+    self.assertEqual(20, root.f(constant_op.constant(10), True).numpy())
+    self.assertEqual(7, root.f(constant_op.constant(1)).numpy())
+    self.assertEqual(2, root.f(constant_op.constant(1), True).numpy())
+    self.assertEqual(6, root.f(constant_op.constant(1), defg=7.0).numpy())
+
+    imported = self.cycle(root, cycles)
+
+    self.assertEqual(4, imported.f(constant_op.constant(2), True).numpy())
+    self.assertEqual(7, imported.f(constant_op.constant(2)).numpy())
+    self.assertEqual(6, imported.f(constant_op.constant(1), defg=7.0).numpy())
+
+  def test_additional_kwargs(self, cycles):
+    def func(x, training=False, **options):
+      del options
+      if training:
+        return 2 * x
+      else:
+        return 7
+
+    root = tracking.AutoTrackable()
+    root.f = def_function.function(func)
+
+    x = constant_op.constant(10)
+    self.assertEqual(7, root.f(x, learning_rate=0.5, epochs=3).numpy())
+
+    imported = self.cycle(root, cycles)
+
+    with self.assertRaisesRegexp(ValueError,
+                                 "Could not find matching function to call.*"):
+      imported.f(x, learning_rate=0.5, epochs=4)
+
+    self.assertEqual(7, imported.f(x, learning_rate=0.5, epochs=3).numpy())
+
+  def test_member_function(self, cycles):
+    class TrackableWithMember(tracking.AutoTrackable):
+
+      def __init__(self):
+        super(TrackableWithMember, self).__init__()
+        self._some_value = 20
+
+      @def_function.function
+      def f(self, x, training=False):
+        if training:
+          return 2 * x
+        else:
+          return 7 + self._some_value
+
+    root = TrackableWithMember()
+
+    self.assertEqual(20, root.f(constant_op.constant(10), True).numpy())
+    self.assertEqual(27, root.f(constant_op.constant(1)).numpy())
+    self.assertEqual(2, root.f(constant_op.constant(1), True).numpy())
+
+    imported = self.cycle(root, cycles)
+
+    self.assertEqual(4, imported.f(constant_op.constant(2), True).numpy())
+    self.assertEqual(27, imported.f(constant_op.constant(2)).numpy())
+
+  def test_side_effect_listing(self, cycles):
+    class M(tracking.AutoTrackable):
+
+      def __init__(self):
+        super(M, self).__init__()
+        self.var = None
+
+      @def_function.function(
+          input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
+      def f(self, x):
+        if self.var is None:
+          self.var = variables.Variable(2.)
+        return x * self.var
+
+    m = M()
+    self.cycle(m)
+    self.assertEqual(4.0, m.f(constant_op.constant(2.0)).numpy())
+
+  def test_basic_backprop(self, cycles):
+    weight = variables.Variable(1., trainable=True)
+    bias = variables.Variable(0., trainable=True)
+    g = def_function.function(
+        lambda x: x*weight + bias,
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
+
+    root = tracking.AutoTrackable()
+    root.weight = weight
+    root.bias = bias
+    root.g = g
+    imported = self.cycle(root, cycles)
+    with backprop.GradientTape() as t:
+      x = constant_op.constant([3.5])
+      loss = imported.g(x)
+      grad = t.gradient(loss, [imported.weight, imported.bias])
+      self.assertAllClose(grad, [3.5, 1.0])
+
+  def test_nested_backprop(self, cycles):
+    weight = variables.Variable(1., trainable=True)
+    bias = variables.Variable(0., trainable=True)
+
+    # Note: this function gets called from other function defs via a
+    # "PartitionedCall" op node.
+    @def_function.function(input_signature=[
+        tensor_spec.TensorSpec(None, dtypes.float32),
+        tensor_spec.TensorSpec(None, dtypes.float32)])
+    def mul(x, y):
+      return x * y
+
+    # Note: this function gets called from other function defs via a
+    # "StatefulPartitionedCall" op node.
+    @def_function.function(input_signature=[
+        tensor_spec.TensorSpec(None, dtypes.float32)])
+    def f(x):
+      return mul(weight.read_value(), x)
+
+    @def_function.function(input_signature=[
+        tensor_spec.TensorSpec(None, dtypes.float32)])
+    def g(x):
+      return f(x) + bias,
+
+    @def_function.function(input_signature=[
+        tensor_spec.TensorSpec(None, dtypes.float32)])
+    def h(x):
+      return g(x) + bias,
+
+    root = tracking.AutoTrackable()
+    root.weight = weight
+    root.bias = bias
+    root.g = h
+
+    imported = self.cycle(root, cycles)
+    with backprop.GradientTape() as t:
+      x = constant_op.constant([3.5])
+      loss = imported.g(x)
+    grad = t.gradient(loss, [imported.weight, imported.bias])
+    self.assertAllClose(grad, [3.5, 2.0])
+
+  def test_callable(self, cycles):
+    class M1(tracking.AutoTrackable):
+
+      @def_function.function(
+          input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
+      def __call__(self, x):
+        return x
+
+    root = tracking.AutoTrackable()
+    root.m1 = M1()
+    root.m2 = tracking.AutoTrackable()
+    root.m2.__call__ = def_function.function(
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])(
+            lambda x: x*3.0)
+    imported = self.cycle(root, cycles)
+    x = constant_op.constant(1.0)
+
+    self.assertTrue(callable(imported.m1))
+    self.assertAllEqual(root.m1(x), imported.m1(x))
+
+    # Note: `root.m2` was not callable since `__call__` attribute was set
+    # into the instance and not on the class. But after a serialization cycle
+    # that starts to work.
+    self.assertTrue(callable(imported.m2))
+    self.assertAllEqual(root.m2.__call__(x), imported.m2(x))
+
+    # Verify that user objects without `__call__` attribute are not callable.
+    self.assertFalse(callable(imported))
+
+  def test_chain_callable(self, cycles):
+    func = def_function.function(
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])(
+            lambda x: x*3.0)
+    root = tracking.AutoTrackable()
+    root.__call__ = tracking.AutoTrackable()
+    root.__call__.__call__ = tracking.AutoTrackable()
+    root.__call__.__call__.__call__ = func
+
+    imported = self.cycle(root, cycles)
+    self.assertTrue(callable(imported))
+    x = constant_op.constant(1.0)
+    self.assertAllEqual(imported(x).numpy(), 3.0)
+
+  def test_load_in_graph_mode(self, cycles):
+    root = tracking.AutoTrackable()
+    root.v1 = variables.Variable(1.)
+    root.v2 = variables.Variable(2.)
+    root.f = def_function.function(
+        lambda x: root.v2 * x,
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
+
+    if cycles > 1:
+      root = self.cycle(root, cycles - 1)
+    path = tempfile.mkdtemp(prefix=self.get_temp_dir())
+    save.save(root, path)
+
+    with ops.Graph().as_default():
+      imported = load.load(path)
+      var_v1 = imported.v1
+      output = imported.f(constant_op.constant(2.))
+      with monitored_session.MonitoredSession() as sess:
+        self.assertEqual(1.0, sess.run(var_v1))
+        self.assertEqual(4.0, sess.run(output))
+
+  def test_load_in_func_graph(self, cycles):
+    root = tracking.AutoTrackable()
+    root.v1 = variables.Variable(1.)
+    root.v2 = variables.Variable(2.)
+    root.f = def_function.function(
+        lambda x: root.v2 * x,
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
+
+    if cycles > 1:
+      root = self.cycle(root, cycles - 1)
+    path = tempfile.mkdtemp(prefix=self.get_temp_dir())
+    save.save(root, path)
+
+    closure = tracking.AutoTrackable()
+    @def_function.function
+    def func(x):
+      if not hasattr(closure, "model"):
+        closure.model = load.load(path)
+      return closure.model.f(x)
+
+    inputs = constant_op.constant(2.)
+    self.assertEqual(4.0, func(inputs).numpy())
+
+  def test_soft_matching(self, cycles):
+
+    @def_function.function(
+        input_signature=[tensor_spec.TensorSpec([None], dtypes.int32)])
+    def func(x):
+      return 2 * x
+
+    root = tracking.AutoTrackable()
+    root.f = func
+
+    self.assertAllEqual([2], root.f(constant_op.constant([1])).numpy())
+    self.assertAllEqual([2, 4], root.f(constant_op.constant([1, 2])).numpy())
+
+    concrete_functions = root.f._list_all_concrete_functions_for_serialization()  # pylint: disable=protected-access
+    self.assertEqual(1, len(concrete_functions))
+
+    imported = self.cycle(root, cycles)
+
+    with self.assertRaisesRegexp(ValueError, "Cannot canonicalize"):
+      # We cannot call the function with a constant of shape ().
+      self.assertEqual(7, imported.f(constant_op.constant(2)).numpy())
+
+    # TODO(vbardiovsky): When classes are revived with input_signatures, we
+    # should also check that the calls below are not generating any more
+    # concrete functions.
+    self.assertAllEqual([2, 4, 6, 8],
+                        imported.f(constant_op.constant([1, 2, 3, 4])).numpy())
+    self.assertAllEqual([2, 4, 6],
+                        imported.f(constant_op.constant([1, 2, 3])).numpy())
+
+  def test_get_concrete_function(self, cycles):
+
+    @def_function.function
+    def func(x, training=False):
+      if training:
+        return 2 * x
+      else:
+        return 3 * x
+
+    func.get_concrete_function(
+        tensor_spec.TensorSpec([None], dtypes.int32), True)
+    func.get_concrete_function(tensor_spec.TensorSpec([None], dtypes.float32))
+
+    root = tracking.AutoTrackable()
+    root.f = func
+
+    imported = self.cycle(root, cycles)
+
+    concrete = imported.f.get_concrete_function(
+        training=True, x=tensor_spec.TensorSpec([None], dtypes.int32))
+
+    self.assertAllEqual([2, 4, 6, 8],
+                        concrete(x=constant_op.constant([1, 2, 3, 4])).numpy())
+    with self.assertRaisesRegexp(ValueError,
+                                 "Could not find matching function to call"):
+      imported.f.get_concrete_function(
+          tensor_spec.TensorSpec([None], dtypes.int32))
+    imported.f.get_concrete_function(
+        tensor_spec.TensorSpec([None], dtypes.int32), True)
+
+  def test_concrete_function(self, cycles):
+
+    @def_function.function(
+        input_signature=[tensor_spec.TensorSpec([None], dtypes.int32)])
+    def func(x):
+      return 2 * x
+
+    root = tracking.AutoTrackable()
+    root.f = func.get_concrete_function()
+
+    self.assertAllEqual([2], root.f(constant_op.constant([1])).numpy())
+    self.assertAllEqual([2, 4], root.f(constant_op.constant([1, 2])).numpy())
+
+    # TODO(andresp): Fix exporting of loaded concrete functions as signatures.
+    imported = self.cycle(root, cycles, signatures={})
+
+    self.assertAllEqual([2, 4, 6, 8],
+                        imported.f(constant_op.constant([1, 2, 3, 4])).numpy())
+    self.assertAllEqual([2, 4, 6],
+                        imported.f(constant_op.constant([1, 2, 3])).numpy())
+
+  def test_concrete_function_arg_names(self, cycles):
+
+    @def_function.function(
+        input_signature=[tensor_spec.TensorSpec([None], dtypes.int32)])
+    def func(x):
+      return 2 * x
+
+    root = tracking.AutoTrackable()
+    root.f = func.get_concrete_function()
+
+    self.assertAllEqual([2], root.f(constant_op.constant([1])).numpy())
+
+    # TODO(andresp): Fix exporting of loaded concrete functions as signatures.
+    imported = self.cycle(root, cycles, signatures={})
+
+    self.assertAllEqual([2, 4, 6],
+                        imported.f(x=constant_op.constant([1, 2, 3])).numpy())
+
+  def test_concrete_function_no_signature(self, cycles):
+    @def_function.function
+    def func(x):
+      return 2 * x
+
+    root = tracking.AutoTrackable()
+    root.f = func.get_concrete_function(constant_op.constant([1]))
+    self.assertAllEqual([4], root.f(constant_op.constant([2])).numpy())
+    # TODO(andresp): Fix exporting of loaded concrete functions as signatures.
+    imported = self.cycle(root, cycles, signatures={})
+    self.assertAllEqual([6],
+                        imported.f(constant_op.constant([3])).numpy())
+
+  def test_concrete_function_backprop(self, cycles):
+    @def_function.function(
+        input_signature=[tensor_spec.TensorSpec([None], dtypes.float32)])
+    def func(x):
+      return x ** 2.
+    root = tracking.AutoTrackable()
+    root.f = func.get_concrete_function()
+
+    def _compute_gradient(function):
+      with backprop.GradientTape() as tape:
+        inp = constant_op.constant(1.)
+        tape.watch(inp)
+        output = function(inp)
+      return tape.gradient(output, inp)
+
+    self.assertEqual(2., _compute_gradient(root.f).numpy())
+    # TODO(andresp): Fix exporting of loaded concrete functions as signatures.
+    imported = self.cycle(root, cycles, signatures={})
+    self.assertEqual(2., _compute_gradient(imported.f).numpy())
+
+  def test_revived_concrete_function_kwargs(self, cycles):
+
+    @def_function.function
+    def func(x, y):
+      return x * (y + 1.)
+    root = tracking.AutoTrackable()
+    root.f = func.get_concrete_function(
+        tensor_spec.TensorSpec([], dtypes.float32),
+        tensor_spec.TensorSpec([], dtypes.float32))
+    self.assertEqual(8., root.f(y=constant_op.constant(3.),
+                                x=constant_op.constant(2.)).numpy())
+    # TODO(andresp): Fix exporting of loaded concrete functions as signatures.
+    imported = self.cycle(root, cycles, signatures={})
+    self.assertEqual(8., imported.f(y=constant_op.constant(3.),
+                                    x=constant_op.constant(2.)).numpy())
+
+  def test_revived_concrete_function_tensorspec_kwargs(self, cycles):
+
+    @def_function.function
+    def func(*args):
+      x, y = args
+      return x * (y + 1.)
+    root = tracking.AutoTrackable()
+    root.f = func.get_concrete_function(
+        tensor_spec.TensorSpec([], dtypes.float32, name="x"),
+        tensor_spec.TensorSpec([], dtypes.float32, name="y"))
+    self.assertEqual(8., root.f(y=constant_op.constant(3.),
+                                x=constant_op.constant(2.)).numpy())
+    imported = self.cycle(root, cycles, signatures={})
+    self.assertEqual(8., imported.f(y=constant_op.constant(3.),
+                                    x=constant_op.constant(2.)).numpy())
+
+  def test_concrete_function_variable_argument(self, cycles):
+    # TODO(allenl): Fix variables in input signatures.
+    self.skipTest("Need to fix encoding of variables in inputs signatures")
+    capture = variables.Variable(0)
+
+    @def_function.function
+    def func(v):
+      v.assign_add(1)
+      capture.assign_sub(1)
+
+    vsave = variables.Variable(1)
+    root = tracking.AutoTrackable()
+    root.f = func.get_concrete_function(vsave)
+    root.capture = capture
+    self.assertEqual(1, vsave.numpy())
+    root.f(vsave)
+    self.assertEqual(2, vsave.numpy())
+    self.assertEqual(-1, capture.numpy())
+    imported = self.cycle(root, cycles)
+
+    vload = variables.Variable(1)
+    imported.f(vload)
+    self.assertEqual(2, vload.numpy())
+    imported.f(v=vload)
+    self.assertEqual(3, vload.numpy())
+    self.assertEqual(-3, imported.capture.numpy())
+    self.assertEqual(-1, capture.numpy())
+
+  def test_function_and_component(self, cycles):
+
+    @def_function.function
+    def func(v):
+      return v + 1
+
+    root = tracking.AutoTrackable()
+    root.func = func
+    root.concrete_func = func.get_concrete_function(
+        tensor_spec.TensorSpec(None, dtypes.int32))
+    one = constant_op.constant(1)
+    self.assertEqual(2, root.func(one).numpy())
+    self.assertEqual(2, root.concrete_func(one).numpy())
+    imported = self.cycle(root, cycles)
+    self.assertEqual(2, imported.func(one).numpy())
+    self.assertEqual(2, imported.concrete_func(one).numpy())
+
+  def test_dict(self, cycles):
+    root = tracking.AutoTrackable()
+    root.variables = dict(a=variables.Variable(1.))
+    root.variables["b"] = variables.Variable(2.)
+    root.variables["c"] = 1
+    root.funcs = dict(
+        a=def_function.function(lambda: constant_op.constant(100.)))
+    root.funcs["conc"] = root.funcs["a"].get_concrete_function()
+    imported = self.cycle(root, cycles)
+    self.assertEqual(1., imported.variables["a"].numpy())
+    self.assertEqual(2., imported.variables["b"].numpy())
+    self.assertEqual(set(["a", "b"]), set(imported.variables.keys()))
+    self.assertEqual(100., imported.funcs["a"]().numpy())
+    self.assertEqual(100., imported.funcs["conc"]().numpy())
+
+  def test_list(self, cycles):
+    root = tracking.AutoTrackable()
+    root.variables = [variables.Variable(1.)]
+    root.variables.append(1)
+    root.variables.append(variables.Variable(3.))
+    imported = self.cycle(root, cycles)
+    self.assertEqual(1., imported.variables[0].numpy())
+    self.assertEqual(3., imported.variables[2].numpy())
+    self.assertIs(None, imported.variables[1])
+    self.assertEqual(3, len(imported.variables))
+
+  def test_functions_list(self, cycles):
+    root = tracking.AutoTrackable()
+    v1 = variables.Variable(1.)
+    root.losses = [def_function.function(lambda: math_ops.reduce_sum(v1 ** 2))]
+    root.variables = [v1]
+
+    @def_function.function
+    def _v2_loss():
+      if len(root.variables) == 1:
+        v2 = variables.Variable(2.)
+        root.variables.append(v2)
+      return math_ops.reduce_sum(root.variables[1] ** 2)
+
+    root.losses.append(_v2_loss)
+    self.assertAllClose([1., 4.], [loss() for loss in root.losses])
+    imported = self.cycle(root, cycles)
+    self.assertAllClose([1., 4.], [loss() for loss in imported.losses])
+    imported.variables[0].assign(3.)
+    imported.variables[1].assign(4.)
+    self.assertAllClose([9., 16.], [loss() for loss in imported.losses])
+
+  def test_captured_constant(self, cycles):
+    const = array_ops.zeros([100])
+    root = tracking.AutoTrackable()
+    root.f = def_function.function(lambda: const + 1.)
+    root.g = def_function.function(lambda: const + 2.)
+    self.assertAllClose(array_ops.ones([100]), root.f())
+    self.assertAllClose(2. * array_ops.ones([100]), root.g())
+    imported = self.cycle(root, cycles)
+    self.assertAllClose(array_ops.ones([100]), imported.f())
+    self.assertAllClose(2. * array_ops.ones([100]), imported.g())
+    # TODO(b/123408994): Use the public get_concrete_function.
+    f_concrete = imported.f._list_all_concrete_functions_for_serialization()[0]
+    g_concrete = imported.g._list_all_concrete_functions_for_serialization()[0]
+    self.assertLen(f_concrete.captured_inputs, 1)
+    self.assertLen(g_concrete.captured_inputs, 1)
+    # We should be using the same captured EagerTensor in both functions, not
+    # duplicating the constant.
+    self.assertIs(f_concrete.captured_inputs[0],
+                  g_concrete.captured_inputs[0])
+
+  def test_functions_accessed_once(self, cycles):
+
+    class Exported(tracking.AutoTrackable):
+
+      def __init__(self):
+        self._counter = 0
+
+      @property
+      def make_func(self):
+        @def_function.function
+        def f():
+          return constant_op.constant(self._counter)
+        f.get_concrete_function()  # force a trace
+        self._counter += 1
+        return f
+
+    exported = Exported()
+    imported = self.cycle(exported, cycles)
+    self.assertEqual(0, imported.make_func().numpy())
+    self.assertEqual(1, exported.make_func().numpy())
+
+  def test_overwritten_signatures_error(self, cycles):
+    exported = tracking.AutoTrackable()
+    exported.f = def_function.function(lambda: constant_op.constant(1.))
+    imported = self.cycle(
+        exported, cycles,
+        signatures={"key": exported.f.get_concrete_function()})
+    self.assertEqual(1., imported.signatures["key"]()["output_0"].numpy())
+    imported.signatures = {"key1": imported.signatures["key"]}
+    with self.assertRaisesRegexp(ValueError, "signatures"):
+      save.save(imported, tempfile.mkdtemp(prefix=self.get_temp_dir()))
+
+  def test_signature_loading(self, cycles):
+
+    class Exported(tracking.AutoTrackable):
+
+      def __init__(self):
+        self.v = variables.Variable(3.)
+
+      @def_function.function
+      def do(self, x):
+        return self.v * x
+
+    exported = Exported()
+    imported = self.cycle(
+        exported,
+        signatures=exported.do.get_concrete_function(
+            tensor_spec.TensorSpec(None, dtypes.float32)))
+    for _ in range(cycles - 1):
+      imported = self.cycle(imported, signatures=imported.signatures)
+    self.assertEqual(["serving_default"], list(imported.signatures.keys()))
+    imported_function = imported.signatures["serving_default"]
+    two = constant_op.constant(2.)
+    self.assertEqual(6., imported_function(x=two)["output_0"].numpy())
+    imported.v.assign(4.)
+    self.assertEqual(8., imported_function(x=two)["output_0"].numpy())
+    self.assertEqual(8., imported_function(two)["output_0"].numpy())
+    with self.assertRaises(TypeError):
+      # The signatures mapping is immutable
+      imported.signatures["random_key"] = 3
+
+  def test_multiple_argument_signatures_no_positional(self, cycles):
+
+    class Exported(tracking.AutoTrackable):
+
+      @def_function.function
+      def do(self, x, y):
+        return x + y
+
+    exported = Exported()
+    imported = self.cycle(
+        exported, signatures=exported.do.get_concrete_function(
+            tensor_spec.TensorSpec(None, dtypes.float32),
+            tensor_spec.TensorSpec(None, dtypes.float32)))
+    for _ in range(cycles - 1):
+      imported = self.cycle(imported, signatures=imported.signatures)
+    with self.assertRaises(TypeError):
+      imported.signatures["serving_default"](
+          constant_op.constant(1.),
+          y=constant_op.constant(2.))
+    self.assertEqual(
+        {"output_0": 3.},
+        self.evaluate(imported.signatures["serving_default"](
+            x=constant_op.constant(1.),
+            y=constant_op.constant(2.))))
+
+  def _make_model_with_tables(self):
+    default_val = -1
+    keys = constant_op.constant(["brain", "salad", "surgery"])
+    values = constant_op.constant([0, 1, 2], dtypes.int64)
+    table1_initializer = lookup_ops.KeyValueTensorInitializer(keys, values)
+    table1 = lookup_ops.HashTable(table1_initializer, default_val)
+
+    table2_file = self._make_asset("test\nfoo\nbrain\n")
+    table2_initializer = lookup_ops.TextFileIdTableInitializer(table2_file)
+    table2 = lookup_ops.HashTable(table2_initializer, default_val)
+
+    def _make_lookup_function(table):
+      signature = [tensor_spec.TensorSpec(None, dtypes.string)]
+      return def_function.function(input_signature=signature)(
+          lambda x: table.lookup(x))  # pylint: disable=unnecessary-lambda
+
+    root = tracking.AutoTrackable()
+    root.table1 = table1
+    root.lookup1 = _make_lookup_function(table1)
+    root.table2 = table2
+    root.lookup2 = _make_lookup_function(table2)
+    return root
+
+  def test_table(self, cycles):
+    root = self._make_model_with_tables()
+    imported = self.cycle(root, cycles, signatures={})
+    keys = constant_op.constant(["brain", "test", "foo", "surgery"])
+    self.assertAllEqual([0, -1, -1, 2], imported.lookup1(keys).numpy())
+    self.assertAllEqual([2, 0, 1, -1], imported.lookup2(keys).numpy())
+
+  def test_table_in_graph(self, cycles):
+    root = self._make_model_with_tables()
+
+    if cycles > 1:
+      root = self.cycle(root, cycles - 1)
+    path = tempfile.mkdtemp(prefix=self.get_temp_dir())
+    save.save(root, path)
+    imported = self.cycle(root, 1)
+
+    with ops.Graph().as_default():
+      imported = load.load(path)
+      keys = constant_op.constant(["brain", "test", "foo", "surgery"])
+      output1 = imported.lookup1(keys)
+      output2 = imported.lookup2(keys)
+      with monitored_session.MonitoredSession() as sess:
+        self.assertAllEqual([0, -1, -1, 2], sess.run(output1))
+        self.assertAllEqual([2, 0, 1, -1], sess.run(output2))
+
+  def test_perserve_argspec(self, cycles):
+    def f(a, b, c):  # pylint: disable=unused-argument
+      return None
+
+    original_fullargspec = tf_inspect.getfullargspec(f)
+
+    root = tracking.AutoTrackable()
+    root.f = def_function.function(f)
+    imported = self.cycle(root, cycles)
+
+    restored_fullargspec = tf_inspect.getfullargspec(imported.f)
+    self.assertEqual(original_fullargspec, restored_fullargspec)
+
+  def test_canonicalize_inputs(self, cycles):
+    @def_function.function(autograph=False)
+    def func(a=1, b=2, c=3, training=True):
+      if training:
+        return [a, b, c, training]
+      else:
+        return [c, b, a, training]
+
+    # TODO(b/123501567): Work-around to trigger generic traces of a function
+    # with extra non tensor args.
+    signature = 3*[tensor_spec.TensorSpec(None, dtypes.float32)]
+    @def_function.function(input_signature=signature)
+    def trigger(a, b, c):
+      func(a, b, c, True)
+      func(a, b, c, False)
+
+    trigger.get_concrete_function()
+
+    root = tracking.AutoTrackable()
+    root.f = func
+    root = self.cycle(root, cycles)
+    self.assertAllEqual(root.f(), [1.0, 2.0, 3.0, True])
+    self.assertAllEqual(root.f(-1.0, training=False), [3.0, 2.0, -1.0, False])
+
+    with self.assertRaisesRegexp(ValueError,
+                                 "Could not find matching function"):
+      root.f(["hello", 1.0])
+
+  def test_prefer_specific_trace(self, cycles):
+    @def_function.function(autograph=False)
+    def func(a):
+      if isinstance(a, int):
+        return a
+      else:
+        return a + 1
+
+    self.assertAllEqual(2, func(2).numpy())
+    self.assertAllEqual(3, func(constant_op.constant(2)).numpy())
+
+    root = tracking.AutoTrackable()
+    root.f = func
+    root = self.cycle(root, cycles)
+    self.assertAllEqual(2, root.f(2).numpy())
+    self.assertAllEqual(4, root.f(3).numpy())
+    self.assertAllEqual(3, root.f(constant_op.constant(2)).numpy())
+    self.assertAllEqual(4, root.f(constant_op.constant(3)).numpy())
+
+
+class SingleCycleTests(test.TestCase, parameterized.TestCase):
+
+  def test_load_with_tags(self):
+    root = tracking.AutoTrackable()
+    path = tempfile.mkdtemp(prefix=self.get_temp_dir())
+    save.save(root, path)
+    with self.assertRaises(ValueError):
+      load.load(path, tags=[tag_constants.EVAL])
+    load.load(path, tags=[tag_constants.SERVING])
+    load.load(path, tags=tag_constants.SERVING)
+
+  def test_docstring_examples(self):
+    path = tempfile.mkdtemp(prefix=self.get_temp_dir())
+    exported = util.Checkpoint(v=variables.Variable(3.))
+    exported.f = def_function.function(
+        lambda x: exported.v * x,
+        input_signature=[
+            tensor_spec.TensorSpec(shape=None, dtype=dtypes.float32)])
+    save.save(exported, path)
+    imported = load.load(path)
+    self.assertEqual(3., imported.v.numpy())
+    self.assertEqual(6., imported.f(x=constant_op.constant(2.)).numpy())
+
+    save.save(exported, path, exported.f.get_concrete_function())
+    imported = load.load(path)
+    f = imported.signatures["serving_default"]
+    self.assertAllEqual(
+        [[-3.]],
+        f(x=constant_op.constant([[-1.]]))["output_0"].numpy())
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/saved_model/load_v1_in_v2.py b/tensorflow/python/saved_model/load_v1_in_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb8ed7cfcce8d01ed5d5726f66a387a6239792f4
--- /dev/null
+++ b/tensorflow/python/saved_model/load_v1_in_v2.py
@@ -0,0 +1,159 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Import a TF v1-style SavedModel when executing eagerly."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+
+from tensorflow.python.eager import wrap_function
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.saved_model import loader_impl
+from tensorflow.python.saved_model import signature_serialization
+from tensorflow.python.training import saver as tf_saver
+from tensorflow.python.training.tracking import tracking
+
+
+class _Initializer(tracking.TrackableResource):
+  """Represents an initialization operation restored from a SavedModel.
+
+  Without this object re-export of imported 1.x SavedModels would omit the
+  original SavedModel's initialization procedure.
+
+  Created when `tf.saved_model.load` loads a TF 1.x-style SavedModel with an
+  initialization op. This object holds a function which runs the
+  initialization. It does not require any manual user intervention;
+  `tf.saved_model.save` will see this object and automatically add it to the
+  exported SavedModel, and `tf.saved_model.load` runs the initialization
+  function automatically.
+  """
+
+  def __init__(self, init_fn, asset_paths):
+    super(_Initializer, self).__init__()
+    self._asset_paths = asset_paths
+    self._init_fn = init_fn
+
+  def create_resource(self):
+    return array_ops.placeholder(
+        dtype=dtypes.resource, shape=[], name="unused_resource")
+
+  def initialize(self):
+    self._init_fn(*[path.asset_path for path in self._asset_paths])
+
+
+class _EagerSavedModelLoader(loader_impl.SavedModelLoader):
+  """Loads a SavedModel without using Sessions."""
+
+  def get_meta_graph_def_from_tags(self, tags):
+    """Override to support implicit one-MetaGraph loading with tags=None."""
+    if tags is None:
+      if len(self._saved_model.meta_graphs) != 1:
+        tag_sets = [mg.meta_info_def.tags
+                    for mg in self._saved_model.meta_graphs]
+        raise ValueError(
+            ("Importing a SavedModel with tf.saved_model.load requires a "
+             "'tags=' argument if there is more than one MetaGraph. Got "
+             "'tags=None', but there are {} MetaGraphs in the SavedModel with "
+             "tag sets {}. Pass a 'tags=' argument to load this SavedModel.")
+            .format(len(self._saved_model.meta_graphs), tag_sets))
+      return self._saved_model.meta_graphs[0]
+    return super(_EagerSavedModelLoader, self).get_meta_graph_def_from_tags(
+        tags)
+
+  def load_graph(self, returns, meta_graph_def):
+    """Called from wrap_function to import `meta_graph_def`."""
+    # pylint: disable=protected-access
+    saver, _ = tf_saver._import_meta_graph_with_return_elements(
+        meta_graph_def)
+    # pylint: enable=protected-access
+    returns[0] = saver
+
+  def restore_variables(self, wrapped, saver):
+    """Restores variables from the checkpoint."""
+    if saver is not None:
+      saver_def = saver.saver_def
+      restore_fn = wrapped.prune(
+          feeds=[wrapped.graph.as_graph_element(
+              saver_def.filename_tensor_name)],
+          fetches=[wrapped.graph.as_graph_element(saver_def.restore_op_name)])
+      restore_fn(constant_op.constant(self._variables_path))
+
+  def _extract_signatures(self, wrapped, meta_graph_def):
+    """Creates ConcreteFunctions for signatures in `meta_graph_def`."""
+    signature_functions = {}
+    for signature_key, signature_def in meta_graph_def.signature_def.items():
+      input_names, input_specs = zip(*signature_def.inputs.items())
+      # TODO(allenl): Support optional arguments
+      signature_fn = wrapped.prune(
+          feeds=[wrapped.graph.as_graph_element(inp.name)
+                 for inp in input_specs],
+          fetches={name: wrapped.graph.as_graph_element(out.name)
+                   for name, out in signature_def.outputs.items()})
+      # pylint: disable=protected-access
+      signature_fn._arg_keywords = input_names
+      if len(input_names) == 1:
+        # Allowing positional arguments does not create any ambiguity if there's
+        # only one.
+        signature_fn._num_positional_args = 1
+      else:
+        signature_fn._num_positional_args = 0
+      # pylint: enable=protected-access
+      signature_functions[signature_key] = signature_fn
+    return signature_functions
+
+  def load(self, tags):
+    """Creates an object from the MetaGraph identified by `tags`."""
+    meta_graph_def = self.get_meta_graph_def_from_tags(tags)
+    load_graph_returns = [None]
+    wrapped = wrap_function.wrap_function(
+        functools.partial(self.load_graph, load_graph_returns, meta_graph_def),
+        signature=[])
+    saver, = load_graph_returns
+    self.restore_variables(wrapped, saver)
+    with wrapped.graph.as_default():
+      init_op = loader_impl.get_init_op(meta_graph_def)
+    root = tracking.AutoTrackable()
+    if init_op is not None:
+      asset_feed_tensors = []
+      asset_paths = []
+      for tensor_name, value in loader_impl.get_asset_tensors(
+          self._export_dir, meta_graph_def).items():
+        asset_feed_tensors.append(wrapped.graph.as_graph_element(tensor_name))
+        asset_paths.append(tracking.TrackableAsset(value))
+      init_fn = wrapped.prune(
+          feeds=asset_feed_tensors,
+          fetches=[wrapped.graph.as_graph_element(init_op)])
+      initializer = _Initializer(init_fn, asset_paths)
+      initializer.initialize()
+      root.initializer = initializer
+      root.asset_paths = asset_paths
+    else:
+      root.asset_paths = []
+    signature_functions = self._extract_signatures(wrapped, meta_graph_def)
+
+    root.signatures = signature_serialization.create_signature_map(
+        signature_functions)
+    root.variables = list(wrapped.graph.variables)
+    return root
+
+
+def load(export_dir, tags):
+  """Load a v1-style SavedModel as an object."""
+  loader = _EagerSavedModelLoader(export_dir)
+  return loader.load(tags=tags)
diff --git a/tensorflow/python/saved_model/load_v1_in_v2_test.py b/tensorflow/python/saved_model/load_v1_in_v2_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce2c0cd960e5da06cf7cd28b5db556471c1b891e
--- /dev/null
+++ b/tensorflow/python/saved_model/load_v1_in_v2_test.py
@@ -0,0 +1,208 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for importing a TF v1-style SavedModel when executing eagerly."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import shutil
+
+from tensorflow.python.client import session as session_lib
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.lib.io import file_io
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import lookup_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.saved_model import builder_impl
+from tensorflow.python.saved_model import load
+from tensorflow.python.saved_model import save
+from tensorflow.python.saved_model import signature_def_utils
+from tensorflow.python.saved_model import simple_save
+from tensorflow.python.saved_model import utils_impl
+
+
+class LoadTest(test.TestCase):
+
+  def _v1_single_metagraph_saved_model(self, use_resource):
+    export_graph = ops.Graph()
+    with export_graph.as_default():
+      start = array_ops.placeholder(
+          shape=[None], dtype=dtypes.float32, name="start")
+      if use_resource:
+        distractor = variables.RefVariable(-1., name="distractor")
+        v = resource_variable_ops.ResourceVariable(3., name="v")
+      else:
+        # "distractor" gets saved in the checkpoint and so used in the restore
+        # function, but not in the pruned function for the signature. This tests
+        # node naming: it needs to be consistent (and ideally always the same as
+        # the node in the original GraphDef) for the resource manager to find
+        # the right variable.
+        distractor = variables.RefVariable(-1., name="distractor")
+        v = variables.RefVariable(3., name="v")
+      local_variable = variables.VariableV1(
+          1.,
+          collections=[ops.GraphKeys.LOCAL_VARIABLES],
+          trainable=False,
+          use_resource=True)
+      output = array_ops.identity(start * v * local_variable, name="output")
+      with session_lib.Session() as session:
+        session.run([v.initializer, distractor.initializer,
+                     local_variable.initializer])
+        path = os.path.join(self.get_temp_dir(), "saved_model", str(ops.uid()))
+        simple_save.simple_save(
+            session,
+            path,
+            inputs={"start": start},
+            outputs={"output": output},
+            legacy_init_op=local_variable.initializer)
+    return path
+
+  def test_resource_variable_import(self):
+    imported = load.load(self._v1_single_metagraph_saved_model(
+        use_resource=True))
+    fn = imported.signatures["serving_default"]
+    self.assertEqual({"output": 6.},
+                     self.evaluate(fn(constant_op.constant(2.))))
+    self.assertAllEqual([3., 1.], self.evaluate(imported.variables))
+    imported.variables[0].assign(4.)
+    self.assertEqual({"output": 8.},
+                     self.evaluate(fn(start=constant_op.constant(2.))))
+    imported.variables[1].assign(2.)
+    self.assertEqual({"output": 24.},
+                     self.evaluate(fn(start=constant_op.constant(3.))))
+    self.assertTrue(imported.variables[0].trainable)
+    self.assertFalse(imported.variables[1].trainable)
+    with backprop.GradientTape() as tape:
+      output = fn(start=constant_op.constant(4.))
+    self.assertEqual(imported.variables[:1], list(tape.watched_variables()))
+    self.assertEqual(8., tape.gradient(output, imported.variables[0]).numpy())
+
+  def test_ref_variable_import(self):
+    saved = self._v1_single_metagraph_saved_model(use_resource=False)
+    imported = load.load(saved)
+    fn = imported.signatures["serving_default"]
+    self.assertEqual(6., fn(start=constant_op.constant(2.))["output"].numpy())
+
+  def _v1_multi_metagraph_saved_model(self):
+    export_graph = ops.Graph()
+    with export_graph.as_default():
+      start = array_ops.placeholder(
+          shape=[None], dtype=dtypes.float32, name="start")
+      v = resource_variable_ops.ResourceVariable(21.)
+      first_output = array_ops.identity(start * v, name="first_output")
+      second_output = array_ops.identity(v, name="second_output")
+      with session_lib.Session() as session:
+        session.run(v.initializer)
+        path = os.path.join(self.get_temp_dir(), "saved_model", str(ops.uid()))
+        builder = builder_impl.SavedModelBuilder(path)
+        builder.add_meta_graph_and_variables(
+            session, tags=["first"],
+            signature_def_map={
+                "first_key": signature_def_utils.build_signature_def(
+                    {"first_start": utils_impl.build_tensor_info(start)},
+                    {"first_output": utils_impl.build_tensor_info(
+                        first_output)})})
+        builder.add_meta_graph(
+            tags=["second"],
+            signature_def_map={
+                "second_key": signature_def_utils.build_signature_def(
+                    {"second_start": utils_impl.build_tensor_info(start)},
+                    {"second_output": utils_impl.build_tensor_info(
+                        second_output)})})
+        builder.save()
+    return path
+
+  def test_multi_meta_graph_loading(self):
+    with self.assertRaisesRegexp(ValueError, "2 MetaGraphs"):
+      load.load(self._v1_multi_metagraph_saved_model())
+    first_imported = load.load(self._v1_multi_metagraph_saved_model(),
+                               tags=["first"])
+    self.assertEqual({"first_output": 42.},
+                     self.evaluate(first_imported.signatures["first_key"](
+                         first_start=constant_op.constant(2.))))
+    second_imported = load.load(self._v1_multi_metagraph_saved_model(),
+                                tags=["second"])
+    with self.assertRaisesRegexp(TypeError, "second_start"):
+      second_imported.signatures["second_key"](x=constant_op.constant(2.))
+    with self.assertRaisesRegexp(TypeError, "second_start"):
+      second_imported.signatures["second_key"](
+          second_start=constant_op.constant(2.),
+          x=constant_op.constant(2.))
+    self.assertEqual({"second_output": 21.},
+                     self.evaluate(second_imported.signatures["second_key"](
+                         second_start=constant_op.constant(2.))))
+
+  def _v1_asset_saved_model(self):
+    export_graph = ops.Graph()
+    vocab_path = os.path.join(self.get_temp_dir(), "vocab.txt")
+    with open(vocab_path, "w") as f:
+      f.write("alpha\nbeta\ngamma\n")
+    with export_graph.as_default():
+      initializer = lookup_ops.TextFileInitializer(
+          vocab_path,
+          key_dtype=dtypes.string,
+          key_index=lookup_ops.TextFileIndex.WHOLE_LINE,
+          value_dtype=dtypes.int64,
+          value_index=lookup_ops.TextFileIndex.LINE_NUMBER)
+      table = lookup_ops.HashTable(
+          initializer, default_value=-1)
+      start = array_ops.placeholder(
+          shape=None, dtype=dtypes.string, name="in")
+      output = table.lookup(start, name="out")
+      with session_lib.Session() as session:
+        session.run([table.initializer])
+        path = os.path.join(self.get_temp_dir(), "saved_model", str(ops.uid()))
+        simple_save.simple_save(
+            session,
+            path,
+            inputs={"start": start},
+            outputs={"output": output},
+            legacy_init_op=table.initializer)
+    file_io.delete_file(vocab_path)
+    return path
+
+  def test_asset_loading(self):
+    first_path = self._v1_asset_saved_model()
+    imported = load.load(first_path)
+    fn = imported.signatures["serving_default"]
+    self.assertAllClose({"output": [2, 0]},
+                        fn(start=constant_op.constant(["gamma", "alpha"])))
+    second_path = os.path.join(self.get_temp_dir(), "saved_model",
+                               str(ops.uid()))
+    save.save(imported, second_path, signatures=imported.signatures)
+    shutil.rmtree(first_path)
+    second_import = load.load(second_path)
+    fn = second_import.signatures["serving_default"]
+    self.assertAllClose({"output": [2, 0]},
+                        fn(start=constant_op.constant(["gamma", "alpha"])))
+
+    third_path = os.path.join(self.get_temp_dir(), "saved_model",
+                              str(ops.uid()))
+    save.save(second_import, third_path, signatures=second_import.signatures)
+    shutil.rmtree(second_path)
+    third_import = load.load(third_path)
+    fn = third_import.signatures["serving_default"]
+    self.assertAllClose({"output": [2, 0]},
+                        fn(start=constant_op.constant(["gamma", "alpha"])))
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/saved_model/loader_impl.py b/tensorflow/python/saved_model/loader_impl.py
index e5be03aae4905f4465ac87590da610a7d46e2ae4..bfabef9174de2b7ae7a330785d735c7193569683 100644
--- a/tensorflow/python/saved_model/loader_impl.py
+++ b/tensorflow/python/saved_model/loader_impl.py
@@ -88,7 +88,7 @@ def parse_saved_model(export_dir):
 _parse_saved_model = parse_saved_model
 
 
-def _get_asset_tensors(export_dir, meta_graph_def_to_load, import_scope=None):
+def get_asset_tensors(export_dir, meta_graph_def_to_load, import_scope=None):
   """Gets the asset tensors, if defined in the meta graph def to load.
 
   Args:
@@ -393,7 +393,7 @@ class SavedModelLoader(object):
     meta_graph_def = self.get_meta_graph_def_from_tags(tags)
     with sess.graph.as_default():
       # Get asset tensors, if any.
-      asset_tensors_dictionary = _get_asset_tensors(
+      asset_tensors_dictionary = get_asset_tensors(
           self._export_dir, meta_graph_def, import_scope=import_scope)
 
       init_op = get_init_op(meta_graph_def, import_scope)
diff --git a/tensorflow/python/saved_model/model_utils/BUILD b/tensorflow/python/saved_model/model_utils/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..493574a225d16fdada0ce08b569e06bf0aa06e16
--- /dev/null
+++ b/tensorflow/python/saved_model/model_utils/BUILD
@@ -0,0 +1,117 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Description:
+#   Keras saving and loading libraries.
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+package(default_visibility = ["//tensorflow:__subpackages__"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+py_library(
+    name = "model_utils",
+    srcs = ["__init__.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":export_output",
+        ":export_utils",
+        ":mode_keys",
+    ],
+)
+
+py_library(
+    name = "export_output",
+    srcs = ["export_output.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python/saved_model:signature_def_utils",
+    ],
+)
+
+py_test(
+    name = "export_output_test",
+    srcs = ["export_output_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":export_output",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python/keras",
+        "//tensorflow/python/saved_model:signature_constants",
+    ],
+)
+
+py_library(
+    name = "export_utils",
+    srcs = ["export_utils.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":export_output",
+        ":mode_keys",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:util",
+        "//tensorflow/python/saved_model:signature_constants",
+        "//tensorflow/python/saved_model:signature_def_utils",
+        "//tensorflow/python/saved_model:tag_constants",
+    ],
+)
+
+py_test(
+    name = "export_test",
+    srcs = ["export_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":export_utils",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python/keras",
+        "//tensorflow/python/saved_model:signature_constants",
+        "//tensorflow/python/saved_model:signature_def_utils",
+    ],
+)
+
+py_library(
+    name = "mode_keys",
+    srcs = ["mode_keys.py"],
+    srcs_version = "PY2AND3",
+)
+
+py_test(
+    name = "mode_keys_test",
+    srcs = ["mode_keys_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":mode_keys",
+        "//tensorflow/python:client_testlib",
+    ],
+)
diff --git a/tensorflow/python/saved_model/model_utils/__init__.py b/tensorflow/python/saved_model/model_utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f54c96def1bc10d334b62b9c4b0f201b2850a07
--- /dev/null
+++ b/tensorflow/python/saved_model/model_utils/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utils for saving a Keras Model or Estimator to the SavedModel format."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=wildcard-import
+from tensorflow.python.saved_model.model_utils.export_output import *
+from tensorflow.python.saved_model.model_utils.export_utils import build_all_signature_defs
+from tensorflow.python.saved_model.model_utils.export_utils import export_outputs_for_mode
+from tensorflow.python.saved_model.model_utils.export_utils import EXPORT_TAG_MAP
+from tensorflow.python.saved_model.model_utils.export_utils import get_export_outputs
+from tensorflow.python.saved_model.model_utils.export_utils import get_temp_export_dir
+from tensorflow.python.saved_model.model_utils.export_utils import get_timestamped_export_dir
+from tensorflow.python.saved_model.model_utils.export_utils import SIGNATURE_KEY_MAP
+# pylint: enable=wildcard-import
diff --git a/tensorflow/python/saved_model/model_utils/export_output.py b/tensorflow/python/saved_model/model_utils/export_output.py
new file mode 100644
index 0000000000000000000000000000000000000000..b571bad067ebd0cbfdd3bfd94ee76d002d5f1880
--- /dev/null
+++ b/tensorflow/python/saved_model/model_utils/export_output.py
@@ -0,0 +1,407 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Classes for different types of export output."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+
+import six
+
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.saved_model import signature_def_utils
+
+
+class ExportOutput(object):
+  """Represents an output of a model that can be served.
+
+  These typically correspond to model heads.
+  """
+
+  __metaclass__ = abc.ABCMeta
+
+  _SEPARATOR_CHAR = '/'
+
+  @abc.abstractmethod
+  def as_signature_def(self, receiver_tensors):
+    """Generate a SignatureDef proto for inclusion in a MetaGraphDef.
+
+    The SignatureDef will specify outputs as described in this ExportOutput,
+    and will use the provided receiver_tensors as inputs.
+
+    Args:
+      receiver_tensors: a `Tensor`, or a dict of string to `Tensor`, specifying
+        input nodes that will be fed.
+    """
+    pass
+
+  def _check_output_key(self, key, error_label):
+    # For multi-head models, the key can be a tuple.
+    if isinstance(key, tuple):
+      key = self._SEPARATOR_CHAR.join(key)
+
+    if not isinstance(key, six.string_types):
+      raise ValueError(
+          '{} output key must be a string; got {}.'.format(error_label, key))
+    return key
+
+  def _wrap_and_check_outputs(
+      self, outputs, single_output_default_name, error_label=None):
+    """Wraps raw tensors as dicts and checks type.
+
+    Note that we create a new dict here so that we can overwrite the keys
+    if necessary.
+
+    Args:
+      outputs: A `Tensor` or a dict of string to `Tensor`.
+      single_output_default_name: A string key for use in the output dict
+        if the provided `outputs` is a raw tensor.
+      error_label: descriptive string for use in error messages. If none,
+        single_output_default_name will be used.
+
+    Returns:
+      A dict of tensors
+
+    Raises:
+      ValueError: if the outputs dict keys are not strings or tuples of strings
+        or the values are not Tensors.
+    """
+    if not isinstance(outputs, dict):
+      outputs = {single_output_default_name: outputs}
+
+    output_dict = {}
+    for key, value in outputs.items():
+      error_name = error_label or single_output_default_name
+      key = self._check_output_key(key, error_name)
+      if not isinstance(value, ops.Tensor):
+        raise ValueError(
+            '{} output value must be a Tensor; got {}.'.format(
+                error_name, value))
+
+      output_dict[key] = value
+    return output_dict
+
+
+class ClassificationOutput(ExportOutput):
+  """Represents the output of a classification head.
+
+  Either classes or scores or both must be set.
+
+  The classes `Tensor` must provide string labels, not integer class IDs.
+
+  If only classes is set, it is interpreted as providing top-k results in
+  descending order.
+
+  If only scores is set, it is interpreted as providing a score for every class
+  in order of class ID.
+
+  If both classes and scores are set, they are interpreted as zipped, so each
+  score corresponds to the class at the same index.  Clients should not depend
+  on the order of the entries.
+  """
+
+  def __init__(self, scores=None, classes=None):
+    """Constructor for `ClassificationOutput`.
+
+    Args:
+      scores: A float `Tensor` giving scores (sometimes but not always
+          interpretable as probabilities) for each class.  May be `None`, but
+          only if `classes` is set.  Interpretation varies-- see class doc.
+      classes: A string `Tensor` giving predicted class labels.  May be `None`,
+          but only if `scores` is set.  Interpretation varies-- see class doc.
+
+    Raises:
+      ValueError: if neither classes nor scores is set, or one of them is not a
+          `Tensor` with the correct dtype.
+    """
+    if (scores is not None
+        and not (isinstance(scores, ops.Tensor)
+                 and scores.dtype.is_floating)):
+      raise ValueError('Classification scores must be a float32 Tensor; '
+                       'got {}'.format(scores))
+    if (classes is not None
+        and not (isinstance(classes, ops.Tensor)
+                 and dtypes.as_dtype(classes.dtype) == dtypes.string)):
+      raise ValueError('Classification classes must be a string Tensor; '
+                       'got {}'.format(classes))
+    if scores is None and classes is None:
+      raise ValueError('At least one of scores and classes must be set.')
+
+    self._scores = scores
+    self._classes = classes
+
+  @property
+  def scores(self):
+    return self._scores
+
+  @property
+  def classes(self):
+    return self._classes
+
+  def as_signature_def(self, receiver_tensors):
+    if len(receiver_tensors) != 1:
+      raise ValueError('Classification input must be a single string Tensor; '
+                       'got {}'.format(receiver_tensors))
+    (_, examples), = receiver_tensors.items()
+    if dtypes.as_dtype(examples.dtype) != dtypes.string:
+      raise ValueError('Classification input must be a single string Tensor; '
+                       'got {}'.format(receiver_tensors))
+    return signature_def_utils.classification_signature_def(
+        examples, self.classes, self.scores)
+
+
+class RegressionOutput(ExportOutput):
+  """Represents the output of a regression head."""
+
+  def __init__(self, value):
+    """Constructor for `RegressionOutput`.
+
+    Args:
+      value: a float `Tensor` giving the predicted values.  Required.
+
+    Raises:
+      ValueError: if the value is not a `Tensor` with dtype tf.float32.
+    """
+    if not (isinstance(value, ops.Tensor) and value.dtype.is_floating):
+      raise ValueError('Regression output value must be a float32 Tensor; '
+                       'got {}'.format(value))
+    self._value = value
+
+  @property
+  def value(self):
+    return self._value
+
+  def as_signature_def(self, receiver_tensors):
+    if len(receiver_tensors) != 1:
+      raise ValueError('Regression input must be a single string Tensor; '
+                       'got {}'.format(receiver_tensors))
+    (_, examples), = receiver_tensors.items()
+    if dtypes.as_dtype(examples.dtype) != dtypes.string:
+      raise ValueError('Regression input must be a single string Tensor; '
+                       'got {}'.format(receiver_tensors))
+    return signature_def_utils.regression_signature_def(examples, self.value)
+
+
+class PredictOutput(ExportOutput):
+  """Represents the output of a generic prediction head.
+
+  A generic prediction need not be either a classification or a regression.
+
+  Named outputs must be provided as a dict from string to `Tensor`,
+  """
+  _SINGLE_OUTPUT_DEFAULT_NAME = 'output'
+
+  def __init__(self, outputs):
+    """Constructor for PredictOutput.
+
+    Args:
+      outputs: A `Tensor` or a dict of string to `Tensor` representing the
+        predictions.
+
+    Raises:
+      ValueError: if the outputs is not dict, or any of its keys are not
+          strings, or any of its values are not `Tensor`s.
+    """
+
+    self._outputs = self._wrap_and_check_outputs(
+        outputs, self._SINGLE_OUTPUT_DEFAULT_NAME, error_label='Prediction')
+
+  @property
+  def outputs(self):
+    return self._outputs
+
+  def as_signature_def(self, receiver_tensors):
+    return signature_def_utils.predict_signature_def(receiver_tensors,
+                                                     self.outputs)
+
+
+class _SupervisedOutput(ExportOutput):
+  """Represents the output of a supervised training or eval process."""
+  __metaclass__ = abc.ABCMeta
+
+  LOSS_NAME = 'loss'
+  PREDICTIONS_NAME = 'predictions'
+  METRICS_NAME = 'metrics'
+
+  METRIC_VALUE_SUFFIX = 'value'
+  METRIC_UPDATE_SUFFIX = 'update_op'
+
+  _loss = None
+  _predictions = None
+  _metrics = None
+
+  def __init__(self, loss=None, predictions=None, metrics=None):
+    """Constructor for SupervisedOutput (ie, Train or Eval output).
+
+    Args:
+      loss: dict of Tensors or single Tensor representing calculated loss.
+      predictions: dict of Tensors or single Tensor representing model
+        predictions.
+      metrics: Dict of metric results keyed by name.
+        The values of the dict can be one of the following:
+        (1) instance of `Metric` class.
+        (2) (metric_value, update_op) tuples, or a single tuple.
+        metric_value must be a Tensor, and update_op must be a Tensor or Op.
+
+    Raises:
+      ValueError: if any of the outputs' dict keys are not strings or tuples of
+        strings or the values are not Tensors (or Operations in the case of
+        update_op).
+    """
+
+    if loss is not None:
+      loss_dict = self._wrap_and_check_outputs(loss, self.LOSS_NAME)
+      self._loss = self._prefix_output_keys(loss_dict, self.LOSS_NAME)
+    if predictions is not None:
+      pred_dict = self._wrap_and_check_outputs(
+          predictions, self.PREDICTIONS_NAME)
+      self._predictions = self._prefix_output_keys(
+          pred_dict, self.PREDICTIONS_NAME)
+    if metrics is not None:
+      self._metrics = self._wrap_and_check_metrics(metrics)
+
+  def _prefix_output_keys(self, output_dict, output_name):
+    """Prepend output_name to the output_dict keys if it doesn't exist.
+
+    This produces predictable prefixes for the pre-determined outputs
+    of SupervisedOutput.
+
+    Args:
+      output_dict: dict of string to Tensor, assumed valid.
+      output_name: prefix string to prepend to existing keys.
+
+    Returns:
+      dict with updated keys and existing values.
+    """
+
+    new_outputs = {}
+    for key, val in output_dict.items():
+      key = self._prefix_key(key, output_name)
+      new_outputs[key] = val
+    return new_outputs
+
+  def _prefix_key(self, key, output_name):
+    if key.find(output_name) != 0:
+      key = output_name + self._SEPARATOR_CHAR + key
+    return key
+
+  def _wrap_and_check_metrics(self, metrics):
+    """Handle the saving of metrics.
+
+    Metrics is either a tuple of (value, update_op), or a dict of such tuples.
+    Here, we separate out the tuples and create a dict with names to tensors.
+
+    Args:
+      metrics: Dict of metric results keyed by name.
+        The values of the dict can be one of the following:
+        (1) instance of `Metric` class.
+        (2) (metric_value, update_op) tuples, or a single tuple.
+        metric_value must be a Tensor, and update_op must be a Tensor or Op.
+
+    Returns:
+      dict of output_names to tensors
+
+    Raises:
+      ValueError: if the dict key is not a string, or the metric values or ops
+        are not tensors.
+    """
+    if not isinstance(metrics, dict):
+      metrics = {self.METRICS_NAME: metrics}
+
+    outputs = {}
+    for key, value in metrics.items():
+      if isinstance(value, tuple):
+        metric_val, metric_op = value
+      else:  # value is a keras.Metrics object
+        metric_val = value.result()
+        assert len(value.updates) == 1  # We expect only one update op.
+        metric_op = value.updates[0]
+      key = self._check_output_key(key, self.METRICS_NAME)
+      key = self._prefix_key(key, self.METRICS_NAME)
+
+      val_name = key + self._SEPARATOR_CHAR + self.METRIC_VALUE_SUFFIX
+      op_name = key + self._SEPARATOR_CHAR + self.METRIC_UPDATE_SUFFIX
+      if not isinstance(metric_val, ops.Tensor):
+        raise ValueError(
+            '{} output value must be a Tensor; got {}.'.format(
+                key, metric_val))
+      if (not isinstance(metric_op, ops.Tensor) and
+          not isinstance(metric_op, ops.Operation)):
+        raise ValueError(
+            '{} update_op must be a Tensor or Operation; got {}.'.format(
+                key, metric_op))
+
+      # We must wrap any ops in a Tensor before export, as the SignatureDef
+      # proto expects tensors only. See b/109740581
+      metric_op_tensor = metric_op
+      if isinstance(metric_op, ops.Operation):
+        with ops.control_dependencies([metric_op]):
+          metric_op_tensor = constant_op.constant([], name='metric_op_wrapper')
+
+      outputs[val_name] = metric_val
+      outputs[op_name] = metric_op_tensor
+
+    return outputs
+
+  @property
+  def loss(self):
+    return self._loss
+
+  @property
+  def predictions(self):
+    return self._predictions
+
+  @property
+  def metrics(self):
+    return self._metrics
+
+  @abc.abstractmethod
+  def _get_signature_def_fn(self):
+    """Returns a function that produces a SignatureDef given desired outputs."""
+    pass
+
+  def as_signature_def(self, receiver_tensors):
+    signature_def_fn = self._get_signature_def_fn()
+    return signature_def_fn(
+        receiver_tensors, self.loss, self.predictions, self.metrics)
+
+
+class TrainOutput(_SupervisedOutput):
+  """Represents the output of a supervised training process.
+
+  This class generates the appropriate signature def for exporting
+  training output by type-checking and wrapping loss, predictions, and metrics
+  values.
+  """
+
+  def _get_signature_def_fn(self):
+    return signature_def_utils.supervised_train_signature_def
+
+
+class EvalOutput(_SupervisedOutput):
+  """Represents the output of a supervised eval process.
+
+  This class generates the appropriate signature def for exporting
+  eval output by type-checking and wrapping loss, predictions, and metrics
+  values.
+  """
+
+  def _get_signature_def_fn(self):
+    return signature_def_utils.supervised_eval_signature_def
diff --git a/tensorflow/python/saved_model/model_utils/export_output_test.py b/tensorflow/python/saved_model/model_utils/export_output_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5262e9fa1e959f0845f9783fdb3fd3ed1a739b46
--- /dev/null
+++ b/tensorflow/python/saved_model/model_utils/export_output_test.py
@@ -0,0 +1,405 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for export."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.core.framework import tensor_shape_pb2
+from tensorflow.core.framework import types_pb2
+from tensorflow.core.protobuf import meta_graph_pb2
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.keras import metrics as metrics_module
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.platform import test
+from tensorflow.python.saved_model import signature_constants
+from tensorflow.python.saved_model.model_utils import export_output as export_output_lib
+
+
+class ExportOutputTest(test.TestCase):
+
+  def test_regress_value_must_be_float(self):
+    with context.graph_mode():
+      value = array_ops.placeholder(dtypes.string, 1, name='output-tensor-1')
+      with self.assertRaisesRegexp(
+          ValueError, 'Regression output value must be a float32 Tensor'):
+        export_output_lib.RegressionOutput(value)
+
+  def test_classify_classes_must_be_strings(self):
+    with context.graph_mode():
+      classes = array_ops.placeholder(dtypes.float32, 1, name='output-tensor-1')
+      with self.assertRaisesRegexp(
+          ValueError, 'Classification classes must be a string Tensor'):
+        export_output_lib.ClassificationOutput(classes=classes)
+
+  def test_classify_scores_must_be_float(self):
+    with context.graph_mode():
+      scores = array_ops.placeholder(dtypes.string, 1, name='output-tensor-1')
+      with self.assertRaisesRegexp(
+          ValueError, 'Classification scores must be a float32 Tensor'):
+        export_output_lib.ClassificationOutput(scores=scores)
+
+  def test_classify_requires_classes_or_scores(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'At least one of scores and classes must be set.'):
+      export_output_lib.ClassificationOutput()
+
+  def test_build_standardized_signature_def_regression(self):
+    with context.graph_mode():
+      input_tensors = {
+          'input-1':
+              array_ops.placeholder(
+                  dtypes.string, 1, name='input-tensor-1')
+      }
+      value = array_ops.placeholder(dtypes.float32, 1, name='output-tensor-1')
+
+      export_output = export_output_lib.RegressionOutput(value)
+      actual_signature_def = export_output.as_signature_def(input_tensors)
+
+      expected_signature_def = meta_graph_pb2.SignatureDef()
+      shape = tensor_shape_pb2.TensorShapeProto(
+          dim=[tensor_shape_pb2.TensorShapeProto.Dim(size=1)])
+      dtype_float = types_pb2.DataType.Value('DT_FLOAT')
+      dtype_string = types_pb2.DataType.Value('DT_STRING')
+      expected_signature_def.inputs[
+          signature_constants.REGRESS_INPUTS].CopyFrom(
+              meta_graph_pb2.TensorInfo(name='input-tensor-1:0',
+                                        dtype=dtype_string,
+                                        tensor_shape=shape))
+      expected_signature_def.outputs[
+          signature_constants.REGRESS_OUTPUTS].CopyFrom(
+              meta_graph_pb2.TensorInfo(name='output-tensor-1:0',
+                                        dtype=dtype_float,
+                                        tensor_shape=shape))
+
+      expected_signature_def.method_name = (
+          signature_constants.REGRESS_METHOD_NAME)
+      self.assertEqual(actual_signature_def, expected_signature_def)
+
+  def test_build_standardized_signature_def_classify_classes_only(self):
+    """Tests classification with one output tensor."""
+    with context.graph_mode():
+      input_tensors = {
+          'input-1':
+              array_ops.placeholder(
+                  dtypes.string, 1, name='input-tensor-1')
+      }
+      classes = array_ops.placeholder(dtypes.string, 1, name='output-tensor-1')
+
+      export_output = export_output_lib.ClassificationOutput(classes=classes)
+      actual_signature_def = export_output.as_signature_def(input_tensors)
+
+      expected_signature_def = meta_graph_pb2.SignatureDef()
+      shape = tensor_shape_pb2.TensorShapeProto(
+          dim=[tensor_shape_pb2.TensorShapeProto.Dim(size=1)])
+      dtype_string = types_pb2.DataType.Value('DT_STRING')
+      expected_signature_def.inputs[
+          signature_constants.CLASSIFY_INPUTS].CopyFrom(
+              meta_graph_pb2.TensorInfo(name='input-tensor-1:0',
+                                        dtype=dtype_string,
+                                        tensor_shape=shape))
+      expected_signature_def.outputs[
+          signature_constants.CLASSIFY_OUTPUT_CLASSES].CopyFrom(
+              meta_graph_pb2.TensorInfo(name='output-tensor-1:0',
+                                        dtype=dtype_string,
+                                        tensor_shape=shape))
+
+      expected_signature_def.method_name = (
+          signature_constants.CLASSIFY_METHOD_NAME)
+      self.assertEqual(actual_signature_def, expected_signature_def)
+
+  def test_build_standardized_signature_def_classify_both(self):
+    """Tests multiple output tensors that include classes and scores."""
+    with context.graph_mode():
+      input_tensors = {
+          'input-1':
+              array_ops.placeholder(
+                  dtypes.string, 1, name='input-tensor-1')
+      }
+      classes = array_ops.placeholder(dtypes.string, 1,
+                                      name='output-tensor-classes')
+      scores = array_ops.placeholder(dtypes.float32, 1,
+                                     name='output-tensor-scores')
+
+      export_output = export_output_lib.ClassificationOutput(
+          scores=scores, classes=classes)
+      actual_signature_def = export_output.as_signature_def(input_tensors)
+
+      expected_signature_def = meta_graph_pb2.SignatureDef()
+      shape = tensor_shape_pb2.TensorShapeProto(
+          dim=[tensor_shape_pb2.TensorShapeProto.Dim(size=1)])
+      dtype_float = types_pb2.DataType.Value('DT_FLOAT')
+      dtype_string = types_pb2.DataType.Value('DT_STRING')
+      expected_signature_def.inputs[
+          signature_constants.CLASSIFY_INPUTS].CopyFrom(
+              meta_graph_pb2.TensorInfo(name='input-tensor-1:0',
+                                        dtype=dtype_string,
+                                        tensor_shape=shape))
+      expected_signature_def.outputs[
+          signature_constants.CLASSIFY_OUTPUT_CLASSES].CopyFrom(
+              meta_graph_pb2.TensorInfo(name='output-tensor-classes:0',
+                                        dtype=dtype_string,
+                                        tensor_shape=shape))
+      expected_signature_def.outputs[
+          signature_constants.CLASSIFY_OUTPUT_SCORES].CopyFrom(
+              meta_graph_pb2.TensorInfo(name='output-tensor-scores:0',
+                                        dtype=dtype_float,
+                                        tensor_shape=shape))
+
+      expected_signature_def.method_name = (
+          signature_constants.CLASSIFY_METHOD_NAME)
+      self.assertEqual(actual_signature_def, expected_signature_def)
+
+  def test_build_standardized_signature_def_classify_scores_only(self):
+    """Tests classification without classes tensor."""
+    with context.graph_mode():
+      input_tensors = {
+          'input-1':
+              array_ops.placeholder(
+                  dtypes.string, 1, name='input-tensor-1')
+      }
+
+      scores = array_ops.placeholder(dtypes.float32, 1,
+                                     name='output-tensor-scores')
+
+      export_output = export_output_lib.ClassificationOutput(
+          scores=scores)
+      actual_signature_def = export_output.as_signature_def(input_tensors)
+
+      expected_signature_def = meta_graph_pb2.SignatureDef()
+      shape = tensor_shape_pb2.TensorShapeProto(
+          dim=[tensor_shape_pb2.TensorShapeProto.Dim(size=1)])
+      dtype_float = types_pb2.DataType.Value('DT_FLOAT')
+      dtype_string = types_pb2.DataType.Value('DT_STRING')
+      expected_signature_def.inputs[
+          signature_constants.CLASSIFY_INPUTS].CopyFrom(
+              meta_graph_pb2.TensorInfo(name='input-tensor-1:0',
+                                        dtype=dtype_string,
+                                        tensor_shape=shape))
+      expected_signature_def.outputs[
+          signature_constants.CLASSIFY_OUTPUT_SCORES].CopyFrom(
+              meta_graph_pb2.TensorInfo(name='output-tensor-scores:0',
+                                        dtype=dtype_float,
+                                        tensor_shape=shape))
+
+      expected_signature_def.method_name = (
+          signature_constants.CLASSIFY_METHOD_NAME)
+      self.assertEqual(actual_signature_def, expected_signature_def)
+
+  def test_predict_outputs_valid(self):
+    """Tests that no errors are raised when provided outputs are valid."""
+    outputs = {
+        'output0': constant_op.constant([0]),
+        u'output1': constant_op.constant(['foo']),
+    }
+    export_output_lib.PredictOutput(outputs)
+
+    # Single Tensor is OK too
+    export_output_lib.PredictOutput(constant_op.constant([0]))
+
+  def test_predict_outputs_invalid(self):
+    with self.assertRaisesRegexp(
+        ValueError,
+        'Prediction output key must be a string'):
+      export_output_lib.PredictOutput({1: constant_op.constant([0])})
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        'Prediction output value must be a Tensor'):
+      export_output_lib.PredictOutput({
+          'prediction1': sparse_tensor.SparseTensor(
+              indices=[[0, 0]], values=[1], dense_shape=[1, 1]),
+      })
+
+
+class MockSupervisedOutput(export_output_lib._SupervisedOutput):
+  """So that we can test the abstract class methods directly."""
+
+  def _get_signature_def_fn(self):
+    pass
+
+
+class SupervisedOutputTest(test.TestCase):
+
+  def test_supervised_outputs_valid(self):
+    """Tests that no errors are raised when provided outputs are valid."""
+    with context.graph_mode():
+      loss = {'my_loss': constant_op.constant([0])}
+      predictions = {u'output1': constant_op.constant(['foo'])}
+      metric_obj = metrics_module.Mean()
+      metric_obj.update_state(constant_op.constant([0]))
+      metrics = {
+          'metrics': metric_obj,
+          'metrics2': (constant_op.constant([0]), constant_op.constant([10]))
+      }
+
+      outputter = MockSupervisedOutput(loss, predictions, metrics)
+      self.assertEqual(outputter.loss['loss/my_loss'], loss['my_loss'])
+      self.assertEqual(
+          outputter.predictions['predictions/output1'], predictions['output1'])
+      self.assertEqual(outputter.metrics['metrics/update_op'].name,
+                       'metric_op_wrapper:0')
+      self.assertEqual(
+          outputter.metrics['metrics2/update_op'], metrics['metrics2'][1])
+
+      # Single Tensor is OK too
+      outputter = MockSupervisedOutput(
+          loss['my_loss'], predictions['output1'], metrics['metrics'])
+      self.assertEqual(outputter.loss, {'loss': loss['my_loss']})
+      self.assertEqual(
+          outputter.predictions, {'predictions': predictions['output1']})
+      self.assertEqual(outputter.metrics['metrics/update_op'].name,
+                       'metric_op_wrapper_1:0')
+
+  def test_supervised_outputs_none(self):
+    outputter = MockSupervisedOutput(
+        constant_op.constant([0]), None, None)
+    self.assertEqual(len(outputter.loss), 1)
+    self.assertEqual(outputter.predictions, None)
+    self.assertEqual(outputter.metrics, None)
+
+  def test_supervised_outputs_invalid(self):
+    with self.assertRaisesRegexp(ValueError, 'predictions output value must'):
+      MockSupervisedOutput(constant_op.constant([0]), [3], None)
+    with self.assertRaisesRegexp(ValueError, 'loss output value must'):
+      MockSupervisedOutput('str', None, None)
+    with self.assertRaisesRegexp(ValueError, 'metrics output value must'):
+      MockSupervisedOutput(None, None, (15.3, 4))
+    with self.assertRaisesRegexp(ValueError, 'loss output key must'):
+      MockSupervisedOutput({25: 'Tensor'}, None, None)
+
+  def test_supervised_outputs_tuples(self):
+    """Tests that no errors are raised when provided outputs are valid."""
+    with context.graph_mode():
+      loss = {('my', 'loss'): constant_op.constant([0])}
+      predictions = {(u'output1', '2'): constant_op.constant(['foo'])}
+      metric_obj = metrics_module.Mean()
+      metric_obj.update_state(constant_op.constant([0]))
+      metrics = {
+          ('metrics', '1'):
+              metric_obj,
+          ('metrics', '2'): (constant_op.constant([0]),
+                             constant_op.constant([10]))
+      }
+
+      outputter = MockSupervisedOutput(loss, predictions, metrics)
+      self.assertEqual(set(outputter.loss.keys()), set(['loss/my/loss']))
+      self.assertEqual(set(outputter.predictions.keys()),
+                       set(['predictions/output1/2']))
+      self.assertEqual(
+          set(outputter.metrics.keys()),
+          set([
+              'metrics/1/value', 'metrics/1/update_op', 'metrics/2/value',
+              'metrics/2/update_op'
+          ]))
+
+  def test_supervised_outputs_no_prepend(self):
+    """Tests that no errors are raised when provided outputs are valid."""
+    with context.graph_mode():
+      loss = {'loss': constant_op.constant([0])}
+      predictions = {u'predictions': constant_op.constant(['foo'])}
+      metric_obj = metrics_module.Mean()
+      metric_obj.update_state(constant_op.constant([0]))
+      metrics = {
+          'metrics_1': metric_obj,
+          'metrics_2': (constant_op.constant([0]), constant_op.constant([10]))
+      }
+
+      outputter = MockSupervisedOutput(loss, predictions, metrics)
+      self.assertEqual(set(outputter.loss.keys()), set(['loss']))
+      self.assertEqual(set(outputter.predictions.keys()), set(['predictions']))
+      self.assertEqual(
+          set(outputter.metrics.keys()),
+          set([
+              'metrics_1/value', 'metrics_1/update_op', 'metrics_2/update_op',
+              'metrics_2/value'
+          ]))
+
+  def test_train_signature_def(self):
+    with context.graph_mode():
+      loss = {'my_loss': constant_op.constant([0])}
+      predictions = {u'output1': constant_op.constant(['foo'])}
+      metric_obj = metrics_module.Mean()
+      metric_obj.update_state(constant_op.constant([0]))
+      metrics = {
+          'metrics_1': metric_obj,
+          'metrics_2': (constant_op.constant([0]), constant_op.constant([10]))
+      }
+
+      outputter = export_output_lib.TrainOutput(loss, predictions, metrics)
+
+      receiver = {u'features': constant_op.constant(100, shape=(100, 2)),
+                  'labels': constant_op.constant(100, shape=(100, 1))}
+      sig_def = outputter.as_signature_def(receiver)
+
+      self.assertTrue('loss/my_loss' in sig_def.outputs)
+      self.assertTrue('metrics_1/value' in sig_def.outputs)
+      self.assertTrue('metrics_2/value' in sig_def.outputs)
+      self.assertTrue('predictions/output1' in sig_def.outputs)
+      self.assertTrue('features' in sig_def.inputs)
+
+  def test_eval_signature_def(self):
+    with context.graph_mode():
+      loss = {'my_loss': constant_op.constant([0])}
+      predictions = {u'output1': constant_op.constant(['foo'])}
+
+      outputter = export_output_lib.EvalOutput(loss, predictions, None)
+
+      receiver = {u'features': constant_op.constant(100, shape=(100, 2)),
+                  'labels': constant_op.constant(100, shape=(100, 1))}
+      sig_def = outputter.as_signature_def(receiver)
+
+      self.assertTrue('loss/my_loss' in sig_def.outputs)
+      self.assertFalse('metrics/value' in sig_def.outputs)
+      self.assertTrue('predictions/output1' in sig_def.outputs)
+      self.assertTrue('features' in sig_def.inputs)
+
+  def test_metric_op_is_tensor(self):
+    """Tests that ops.Operation is wrapped by a tensor for metric_ops."""
+    with context.graph_mode():
+      loss = {'my_loss': constant_op.constant([0])}
+      predictions = {u'output1': constant_op.constant(['foo'])}
+      metric_obj = metrics_module.Mean()
+      metric_obj.update_state(constant_op.constant([0]))
+      metrics = {
+          'metrics_1': metric_obj,
+          'metrics_2': (constant_op.constant([0]), control_flow_ops.no_op())
+      }
+
+      outputter = MockSupervisedOutput(loss, predictions, metrics)
+
+      self.assertTrue(outputter.metrics['metrics_1/update_op'].name.startswith(
+          'metric_op_wrapper'))
+      self.assertTrue(
+          isinstance(outputter.metrics['metrics_1/update_op'], ops.Tensor))
+      self.assertTrue(
+          isinstance(outputter.metrics['metrics_1/value'], ops.Tensor))
+
+      self.assertEqual(outputter.metrics['metrics_2/value'],
+                       metrics['metrics_2'][0])
+      self.assertTrue(outputter.metrics['metrics_2/update_op'].name.startswith(
+          'metric_op_wrapper'))
+      self.assertTrue(
+          isinstance(outputter.metrics['metrics_2/update_op'], ops.Tensor))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/saved_model/model_utils/export_test.py b/tensorflow/python/saved_model/model_utils/export_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c87d2ee6ae703d50c916dbedc7fcc03936880f71
--- /dev/null
+++ b/tensorflow/python/saved_model/model_utils/export_test.py
@@ -0,0 +1,288 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for export utils."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import tempfile
+import time
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+from tensorflow.python.saved_model import signature_constants
+from tensorflow.python.saved_model import signature_def_utils
+from tensorflow.python.saved_model.model_utils import export_output
+from tensorflow.python.saved_model.model_utils import export_utils
+from tensorflow.python.saved_model.model_utils.mode_keys import KerasModeKeys
+
+
+class ExportTest(test_util.TensorFlowTestCase):
+
+  @test_util.deprecated_graph_mode_only
+  def test_build_all_signature_defs_without_receiver_alternatives(self):
+    receiver_tensor = array_ops.placeholder(dtypes.string)
+    output_1 = constant_op.constant([1.])
+    output_2 = constant_op.constant(["2"])
+    output_3 = constant_op.constant(["3"])
+    export_outputs = {
+        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
+            export_output.RegressionOutput(value=output_1),
+        "head-2": export_output.ClassificationOutput(classes=output_2),
+        "head-3": export_output.PredictOutput(outputs={
+            "some_output_3": output_3
+        }),
+    }
+
+    signature_defs = export_utils.build_all_signature_defs(
+        receiver_tensor, export_outputs)
+
+    expected_signature_defs = {
+        "serving_default":
+            signature_def_utils.regression_signature_def(receiver_tensor,
+                                                         output_1),
+        "head-2":
+            signature_def_utils.classification_signature_def(receiver_tensor,
+                                                             output_2, None),
+        "head-3":
+            signature_def_utils.predict_signature_def({
+                "input": receiver_tensor
+            }, {"some_output_3": output_3})
+    }
+
+    self.assertDictEqual(expected_signature_defs, signature_defs)
+
+  @test_util.deprecated_graph_mode_only
+  def test_build_all_signature_defs_with_dict_alternatives(self):
+    receiver_tensor = array_ops.placeholder(dtypes.string)
+    receiver_tensors_alternative_1 = {
+        "foo": array_ops.placeholder(dtypes.int64),
+        "bar": array_ops.sparse_placeholder(dtypes.float32)}
+    receiver_tensors_alternatives = {"other": receiver_tensors_alternative_1}
+    output_1 = constant_op.constant([1.])
+    output_2 = constant_op.constant(["2"])
+    output_3 = constant_op.constant(["3"])
+    export_outputs = {
+        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
+            export_output.RegressionOutput(value=output_1),
+        "head-2": export_output.ClassificationOutput(classes=output_2),
+        "head-3": export_output.PredictOutput(outputs={
+            "some_output_3": output_3
+        }),
+    }
+
+    signature_defs = export_utils.build_all_signature_defs(
+        receiver_tensor, export_outputs, receiver_tensors_alternatives)
+
+    expected_signature_defs = {
+        "serving_default":
+            signature_def_utils.regression_signature_def(
+                receiver_tensor,
+                output_1),
+        "head-2":
+            signature_def_utils.classification_signature_def(
+                receiver_tensor,
+                output_2, None),
+        "head-3":
+            signature_def_utils.predict_signature_def(
+                {"input": receiver_tensor},
+                {"some_output_3": output_3}),
+        "other:head-3":
+            signature_def_utils.predict_signature_def(
+                receiver_tensors_alternative_1,
+                {"some_output_3": output_3})
+
+        # Note that the alternatives 'other:serving_default' and
+        # 'other:head-2' are invalid, because regession and classification
+        # signatures must take a single string input.  Here we verify that
+        # these invalid signatures are not included in the export_utils.
+    }
+
+    self.assertDictEqual(expected_signature_defs, signature_defs)
+
+  @test_util.deprecated_graph_mode_only
+  def test_build_all_signature_defs_with_single_alternatives(self):
+    receiver_tensor = array_ops.placeholder(dtypes.string)
+    receiver_tensors_alternative_1 = array_ops.placeholder(dtypes.int64)
+    receiver_tensors_alternative_2 = array_ops.sparse_placeholder(
+        dtypes.float32)
+    # Note we are passing single Tensors as values of
+    # receiver_tensors_alternatives, where normally that is a dict.
+    # In this case a dict will be created using the default receiver tensor
+    # name "input".
+    receiver_tensors_alternatives = {"other1": receiver_tensors_alternative_1,
+                                     "other2": receiver_tensors_alternative_2}
+    output_1 = constant_op.constant([1.])
+    output_2 = constant_op.constant(["2"])
+    output_3 = constant_op.constant(["3"])
+    export_outputs = {
+        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
+            export_output.RegressionOutput(value=output_1),
+        "head-2": export_output.ClassificationOutput(classes=output_2),
+        "head-3": export_output.PredictOutput(outputs={
+            "some_output_3": output_3
+        }),
+    }
+
+    signature_defs = export_utils.build_all_signature_defs(
+        receiver_tensor, export_outputs, receiver_tensors_alternatives)
+
+    expected_signature_defs = {
+        "serving_default":
+            signature_def_utils.regression_signature_def(
+                receiver_tensor,
+                output_1),
+        "head-2":
+            signature_def_utils.classification_signature_def(
+                receiver_tensor,
+                output_2, None),
+        "head-3":
+            signature_def_utils.predict_signature_def(
+                {"input": receiver_tensor},
+                {"some_output_3": output_3}),
+        "other1:head-3":
+            signature_def_utils.predict_signature_def(
+                {"input": receiver_tensors_alternative_1},
+                {"some_output_3": output_3}),
+        "other2:head-3":
+            signature_def_utils.predict_signature_def(
+                {"input": receiver_tensors_alternative_2},
+                {"some_output_3": output_3})
+
+        # Note that the alternatives 'other:serving_default' and 'other:head-2'
+        # are invalid, because regession and classification signatures must take
+        # a single string input.  Here we verify that these invalid signatures
+        # are not included in the export_utils.
+    }
+
+    self.assertDictEqual(expected_signature_defs, signature_defs)
+
+  def test_build_all_signature_defs_export_outputs_required(self):
+    receiver_tensor = constant_op.constant(["11"])
+
+    with self.assertRaises(ValueError) as e:
+      export_utils.build_all_signature_defs(receiver_tensor, None)
+
+    self.assertTrue(str(e.exception).startswith(
+        "export_outputs must be a dict"))
+
+  def test_get_timestamped_export_dir(self):
+    export_dir_base = tempfile.mkdtemp() + "export/"
+    export_dir_1 = export_utils.get_timestamped_export_dir(
+        export_dir_base)
+    time.sleep(2)
+    export_dir_2 = export_utils.get_timestamped_export_dir(
+        export_dir_base)
+    time.sleep(2)
+    export_dir_3 = export_utils.get_timestamped_export_dir(
+        export_dir_base)
+
+    # Export directories should be named using a timestamp that is seconds
+    # since epoch.  Such a timestamp is 10 digits long.
+    time_1 = os.path.basename(export_dir_1)
+    self.assertEqual(10, len(time_1))
+    time_2 = os.path.basename(export_dir_2)
+    self.assertEqual(10, len(time_2))
+    time_3 = os.path.basename(export_dir_3)
+    self.assertEqual(10, len(time_3))
+
+    self.assertTrue(int(time_1) < int(time_2))
+    self.assertTrue(int(time_2) < int(time_3))
+
+  @test_util.deprecated_graph_mode_only
+  def test_build_all_signature_defs_serving_only(self):
+    receiver_tensor = {"input": array_ops.placeholder(dtypes.string)}
+    output_1 = constant_op.constant([1.])
+    export_outputs = {
+        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
+            export_output.PredictOutput(outputs=output_1),
+        "train": export_output.TrainOutput(loss=output_1),
+    }
+
+    signature_defs = export_utils.build_all_signature_defs(
+        receiver_tensor, export_outputs)
+
+    expected_signature_defs = {
+        "serving_default": signature_def_utils.predict_signature_def(
+            receiver_tensor, {"output": output_1})
+    }
+
+    self.assertDictEqual(expected_signature_defs, signature_defs)
+
+    signature_defs = export_utils.build_all_signature_defs(
+        receiver_tensor, export_outputs, serving_only=False)
+
+    expected_signature_defs.update({
+        "train": signature_def_utils.supervised_train_signature_def(
+            receiver_tensor, loss={"loss": output_1})
+    })
+
+    self.assertDictEqual(expected_signature_defs, signature_defs)
+
+  @test_util.deprecated_graph_mode_only
+  def test_export_outputs_for_mode(self):
+    predictions = {"predictions": constant_op.constant([1.])}
+    loss = {"loss": constant_op.constant([2.])}
+    metrics = {
+        "metrics": (constant_op.constant([3.]), constant_op.constant([4.]))}
+    expected_metrics = {
+        "metrics/value": metrics["metrics"][0],
+        "metrics/update_op": metrics["metrics"][1]
+    }
+
+    def _build_export_output(mode):
+      return export_utils.export_outputs_for_mode(
+          mode, None, predictions, loss, metrics)
+
+    ret = _build_export_output(KerasModeKeys.TRAIN)
+    self.assertIn(signature_constants.DEFAULT_TRAIN_SIGNATURE_DEF_KEY, ret)
+    export_out = ret[signature_constants.DEFAULT_TRAIN_SIGNATURE_DEF_KEY]
+    self.assertIsInstance(export_out, export_output.TrainOutput)
+    self.assertEqual(export_out.predictions, predictions)
+    self.assertEqual(export_out.loss, loss)
+    self.assertEqual(export_out.metrics, expected_metrics)
+
+    ret = _build_export_output(KerasModeKeys.TEST)
+    self.assertIn(signature_constants.DEFAULT_EVAL_SIGNATURE_DEF_KEY, ret)
+    export_out = ret[signature_constants.DEFAULT_EVAL_SIGNATURE_DEF_KEY]
+    self.assertIsInstance(export_out, export_output.EvalOutput)
+    self.assertEqual(export_out.predictions, predictions)
+    self.assertEqual(export_out.loss, loss)
+    self.assertEqual(export_out.metrics, expected_metrics)
+
+    ret = _build_export_output(KerasModeKeys.PREDICT)
+    self.assertIn(signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY, ret)
+    export_out = ret[signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
+    self.assertIsInstance(export_out, export_output.PredictOutput)
+    self.assertEqual(export_out.outputs, predictions)
+
+    classes = constant_op.constant(["class5"])
+    ret = export_utils.export_outputs_for_mode(
+        KerasModeKeys.PREDICT,
+        {"classify": export_output.ClassificationOutput(
+            classes=classes)})
+    self.assertIn("classify", ret)
+    export_out = ret["classify"]
+    self.assertIsInstance(export_out, export_output.ClassificationOutput)
+    self.assertEqual(export_out.classes, classes)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/saved_model/model_utils/export_utils.py b/tensorflow/python/saved_model/model_utils/export_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..adb6bf26677e0ff0e465291cdfc08e92a27ee85d
--- /dev/null
+++ b/tensorflow/python/saved_model/model_utils/export_utils.py
@@ -0,0 +1,354 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for creating SavedModels."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import os
+import time
+
+import six
+
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.saved_model import signature_constants
+from tensorflow.python.saved_model import signature_def_utils
+from tensorflow.python.saved_model import tag_constants
+from tensorflow.python.saved_model.model_utils import export_output as export_output_lib
+from tensorflow.python.saved_model.model_utils import mode_keys
+from tensorflow.python.saved_model.model_utils.mode_keys import KerasModeKeys as ModeKeys
+from tensorflow.python.util import compat
+
+
+# Mapping of the modes to appropriate MetaGraph tags in the SavedModel.
+EXPORT_TAG_MAP = mode_keys.ModeKeyMap(**{
+    ModeKeys.PREDICT: [tag_constants.SERVING],
+    ModeKeys.TRAIN: [tag_constants.TRAINING],
+    ModeKeys.TEST: [tag_constants.EVAL]})
+
+# For every exported mode, a SignatureDef map should be created using the
+# functions `export_outputs_for_mode` and `build_all_signature_defs`. By
+# default, this map will contain a single Signature that defines the input
+# tensors and output predictions, losses, and/or metrics (depending on the mode)
+# The default keys used in the SignatureDef map are defined below.
+SIGNATURE_KEY_MAP = mode_keys.ModeKeyMap(**{
+    ModeKeys.PREDICT: signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY,
+    ModeKeys.TRAIN: signature_constants.DEFAULT_TRAIN_SIGNATURE_DEF_KEY,
+    ModeKeys.TEST: signature_constants.DEFAULT_EVAL_SIGNATURE_DEF_KEY})
+
+# Default names used in the SignatureDef input map, which maps strings to
+# TensorInfo protos.
+SINGLE_FEATURE_DEFAULT_NAME = 'feature'
+SINGLE_RECEIVER_DEFAULT_NAME = 'input'
+SINGLE_LABEL_DEFAULT_NAME = 'label'
+
+### Below utilities are specific to SavedModel exports.
+
+
+def build_all_signature_defs(receiver_tensors,
+                             export_outputs,
+                             receiver_tensors_alternatives=None,
+                             serving_only=True):
+  """Build `SignatureDef`s for all export outputs.
+
+  Args:
+    receiver_tensors: a `Tensor`, or a dict of string to `Tensor`, specifying
+      input nodes where this receiver expects to be fed by default.  Typically,
+      this is a single placeholder expecting serialized `tf.Example` protos.
+    export_outputs: a dict of ExportOutput instances, each of which has
+      an as_signature_def instance method that will be called to retrieve
+      the signature_def for all export output tensors.
+    receiver_tensors_alternatives: a dict of string to additional
+      groups of receiver tensors, each of which may be a `Tensor` or a dict of
+      string to `Tensor`.  These named receiver tensor alternatives generate
+      additional serving signatures, which may be used to feed inputs at
+      different points within the input receiver subgraph.  A typical usage is
+      to allow feeding raw feature `Tensor`s *downstream* of the
+      tf.parse_example() op.  Defaults to None.
+    serving_only: boolean; if true, resulting signature defs will only include
+      valid serving signatures. If false, all requested signatures will be
+      returned.
+
+  Returns:
+    signature_def representing all passed args.
+
+  Raises:
+    ValueError: if export_outputs is not a dict
+  """
+  if not isinstance(receiver_tensors, dict):
+    receiver_tensors = {SINGLE_RECEIVER_DEFAULT_NAME: receiver_tensors}
+  if export_outputs is None or not isinstance(export_outputs, dict):
+    raise ValueError('export_outputs must be a dict and not'
+                     '{}'.format(type(export_outputs)))
+
+  signature_def_map = {}
+  excluded_signatures = {}
+  for output_key, export_output in export_outputs.items():
+    signature_name = '{}'.format(output_key or 'None')
+    try:
+      signature = export_output.as_signature_def(receiver_tensors)
+      signature_def_map[signature_name] = signature
+    except ValueError as e:
+      excluded_signatures[signature_name] = str(e)
+
+  if receiver_tensors_alternatives:
+    for receiver_name, receiver_tensors_alt in (
+        six.iteritems(receiver_tensors_alternatives)):
+      if not isinstance(receiver_tensors_alt, dict):
+        receiver_tensors_alt = {
+            SINGLE_RECEIVER_DEFAULT_NAME: receiver_tensors_alt
+        }
+      for output_key, export_output in export_outputs.items():
+        signature_name = '{}:{}'.format(receiver_name or 'None', output_key or
+                                        'None')
+        try:
+          signature = export_output.as_signature_def(receiver_tensors_alt)
+          signature_def_map[signature_name] = signature
+        except ValueError as e:
+          excluded_signatures[signature_name] = str(e)
+
+  _log_signature_report(signature_def_map, excluded_signatures)
+
+  # The above calls to export_output_lib.as_signature_def should return only
+  # valid signatures; if there is a validity problem, they raise a ValueError,
+  # in which case we exclude that signature from signature_def_map above.
+  # The is_valid_signature check ensures that the signatures produced are
+  # valid for serving, and acts as an additional sanity check for export
+  # signatures produced for serving. We skip this check for training and eval
+  # signatures, which are not intended for serving.
+  if serving_only:
+    signature_def_map = {
+        k: v
+        for k, v in signature_def_map.items()
+        if signature_def_utils.is_valid_signature(v)
+    }
+  return signature_def_map
+
+
+_FRIENDLY_METHOD_NAMES = {
+    signature_constants.CLASSIFY_METHOD_NAME: 'Classify',
+    signature_constants.REGRESS_METHOD_NAME: 'Regress',
+    signature_constants.PREDICT_METHOD_NAME: 'Predict',
+    signature_constants.SUPERVISED_TRAIN_METHOD_NAME: 'Train',
+    signature_constants.SUPERVISED_EVAL_METHOD_NAME: 'Eval',
+}
+
+
+def _log_signature_report(signature_def_map, excluded_signatures):
+  """Log a report of which signatures were produced."""
+  sig_names_by_method_name = collections.defaultdict(list)
+
+  # We'll collect whatever method_names are present, but also we want to make
+  # sure to output a line for each of the three standard methods even if they
+  # have no signatures.
+  for method_name in _FRIENDLY_METHOD_NAMES:
+    sig_names_by_method_name[method_name] = []
+
+  for signature_name, sig in signature_def_map.items():
+    sig_names_by_method_name[sig.method_name].append(signature_name)
+
+  # TODO(b/67733540): consider printing the full signatures, not just names
+  for method_name, sig_names in sig_names_by_method_name.items():
+    if method_name in _FRIENDLY_METHOD_NAMES:
+      method_name = _FRIENDLY_METHOD_NAMES[method_name]
+    logging.info('Signatures INCLUDED in export for {}: {}'.format(
+        method_name, sig_names if sig_names else 'None'))
+
+  if excluded_signatures:
+    logging.info('Signatures EXCLUDED from export because they cannot be '
+                 'be served via TensorFlow Serving APIs:')
+    for signature_name, message in excluded_signatures.items():
+      logging.info('\'{}\' : {}'.format(signature_name, message))
+
+  if not signature_def_map:
+    logging.warn('Export includes no signatures!')
+  elif (signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY not in
+        signature_def_map):
+    logging.warn('Export includes no default signature!')
+
+
+# When we create a timestamped directory, there is a small chance that the
+# directory already exists because another process is also creating these
+# directories. In this case we just wait one second to get a new timestamp and
+# try again. If this fails several times in a row, then something is seriously
+# wrong.
+MAX_DIRECTORY_CREATION_ATTEMPTS = 10
+
+
+def get_timestamped_export_dir(export_dir_base):
+  """Builds a path to a new subdirectory within the base directory.
+
+  Each export is written into a new subdirectory named using the
+  current time.  This guarantees monotonically increasing version
+  numbers even across multiple runs of the pipeline.
+  The timestamp used is the number of seconds since epoch UTC.
+
+  Args:
+    export_dir_base: A string containing a directory to write the exported
+        graph and checkpoints.
+  Returns:
+    The full path of the new subdirectory (which is not actually created yet).
+
+  Raises:
+    RuntimeError: if repeated attempts fail to obtain a unique timestamped
+      directory name.
+  """
+  attempts = 0
+  while attempts < MAX_DIRECTORY_CREATION_ATTEMPTS:
+    timestamp = int(time.time())
+
+    result_dir = os.path.join(
+        compat.as_bytes(export_dir_base), compat.as_bytes(str(timestamp)))
+    if not gfile.Exists(result_dir):
+      # Collisions are still possible (though extremely unlikely): this
+      # directory is not actually created yet, but it will be almost
+      # instantly on return from this function.
+      return result_dir
+    time.sleep(1)
+    attempts += 1
+    logging.warn('Directory {} already exists; retrying (attempt {}/{})'.format(
+        result_dir, attempts, MAX_DIRECTORY_CREATION_ATTEMPTS))
+  raise RuntimeError('Failed to obtain a unique export directory name after '
+                     '{} attempts.'.format(MAX_DIRECTORY_CREATION_ATTEMPTS))
+
+
+def get_temp_export_dir(timestamped_export_dir):
+  """Builds a directory name based on the argument but starting with 'temp-'.
+
+  This relies on the fact that TensorFlow Serving ignores subdirectories of
+  the base directory that can't be parsed as integers.
+
+  Args:
+    timestamped_export_dir: the name of the eventual export directory, e.g.
+      /foo/bar/<timestamp>
+
+  Returns:
+    A sister directory prefixed with 'temp-', e.g. /foo/bar/temp-<timestamp>.
+  """
+  (dirname, basename) = os.path.split(timestamped_export_dir)
+  temp_export_dir = os.path.join(
+      compat.as_bytes(dirname), compat.as_bytes('temp-{}'.format(basename)))
+  return temp_export_dir
+
+
+def export_outputs_for_mode(
+    mode, serving_export_outputs=None, predictions=None, loss=None,
+    metrics=None):
+  """Util function for constructing a `ExportOutput` dict given a mode.
+
+  The returned dict can be directly passed to `build_all_signature_defs` helper
+  function as the `export_outputs` argument, used for generating a SignatureDef
+  map.
+
+  Args:
+    mode: A `ModeKeys` specifying the mode.
+    serving_export_outputs: Describes the output signatures to be exported to
+      `SavedModel` and used during serving. Should be a dict or None.
+    predictions: A dict of Tensors or single Tensor representing model
+        predictions. This argument is only used if serving_export_outputs is not
+        set.
+    loss: A dict of Tensors or single Tensor representing calculated loss.
+    metrics: A dict of (metric_value, update_op) tuples, or a single tuple.
+      metric_value must be a Tensor, and update_op must be a Tensor or Op
+
+  Returns:
+    Dictionary mapping the a key to an `tf.estimator.export.ExportOutput` object
+    The key is the expected SignatureDef key for the mode.
+
+  Raises:
+    ValueError: if an appropriate ExportOutput cannot be found for the mode.
+  """
+  if mode not in SIGNATURE_KEY_MAP:
+    raise ValueError(
+        'Export output type not found for mode: {}. Expected one of: {}.\n'
+        'One likely error is that V1 Estimator Modekeys were somehow passed to '
+        'this function. Please ensure that you are using the new ModeKeys.'
+        .format(mode, SIGNATURE_KEY_MAP.keys()))
+  signature_key = SIGNATURE_KEY_MAP[mode]
+  if mode_keys.is_predict(mode):
+    return get_export_outputs(serving_export_outputs, predictions)
+  elif mode_keys.is_train(mode):
+    return {signature_key: export_output_lib.TrainOutput(
+        loss=loss, predictions=predictions, metrics=metrics)}
+  else:
+    return {signature_key: export_output_lib.EvalOutput(
+        loss=loss, predictions=predictions, metrics=metrics)}
+
+
+def get_export_outputs(export_outputs, predictions):
+  """Validate export_outputs or create default export_outputs.
+
+  Args:
+    export_outputs: Describes the output signatures to be exported to
+      `SavedModel` and used during serving. Should be a dict or None.
+    predictions:  Predictions `Tensor` or dict of `Tensor`.
+
+  Returns:
+    Valid export_outputs dict
+
+  Raises:
+    TypeError: if export_outputs is not a dict or its values are not
+      ExportOutput instances.
+  """
+  if export_outputs is None:
+    default_output = export_output_lib.PredictOutput(predictions)
+    export_outputs = {
+        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: default_output}
+
+  if not isinstance(export_outputs, dict):
+    raise TypeError('export_outputs must be dict, given: {}'.format(
+        export_outputs))
+  for v in six.itervalues(export_outputs):
+    if not isinstance(v, export_output_lib.ExportOutput):
+      raise TypeError(
+          'Values in export_outputs must be ExportOutput objects. '
+          'Given: {}'.format(export_outputs))
+
+  _maybe_add_default_serving_output(export_outputs)
+
+  return export_outputs
+
+
+def _maybe_add_default_serving_output(export_outputs):
+  """Add a default serving output to the export_outputs if not present.
+
+  Args:
+    export_outputs: Describes the output signatures to be exported to
+      `SavedModel` and used during serving. Should be a dict.
+
+  Returns:
+    export_outputs dict with default serving signature added if necessary
+
+  Raises:
+    ValueError: if multiple export_outputs were provided without a default
+      serving key.
+  """
+  if len(export_outputs) == 1:
+    (key, value), = export_outputs.items()
+    if key != signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
+      export_outputs[
+          signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY] = value
+  if len(export_outputs) > 1:
+    if (signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
+        not in export_outputs):
+      raise ValueError(
+          'Multiple export_outputs were provided, but none of them is '
+          'specified as the default.  Do this by naming one of them with '
+          'signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY.')
+
+  return export_outputs
diff --git a/tensorflow/python/saved_model/model_utils/mode_keys.py b/tensorflow/python/saved_model/model_utils/mode_keys.py
new file mode 100644
index 0000000000000000000000000000000000000000..2912de7210f8b8900f7383b537d13bc664f15158
--- /dev/null
+++ b/tensorflow/python/saved_model/model_utils/mode_keys.py
@@ -0,0 +1,109 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utils for managing different mode strings used by Keras and Estimator models.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+
+class KerasModeKeys(object):
+  """Standard names for model modes.
+
+  The following standard keys are defined:
+
+  * `TRAIN`: training/fitting mode.
+  * `TEST`: testing/evaluation mode.
+  * `PREDICT`: prediction/inference mode.
+  """
+
+  TRAIN = 'train'
+  TEST = 'test'
+  PREDICT = 'predict'
+
+
+# TODO(kathywu): Remove copy in Estimator after nightlies
+class EstimatorModeKeys(object):
+  """Standard names for Estimator model modes.
+
+  The following standard keys are defined:
+
+  * `TRAIN`: training/fitting mode.
+  * `EVAL`: testing/evaluation mode.
+  * `PREDICT`: predication/inference mode.
+  """
+
+  TRAIN = 'train'
+  EVAL = 'eval'
+  PREDICT = 'infer'
+
+
+def is_predict(mode):
+  return mode in [KerasModeKeys.PREDICT, EstimatorModeKeys.PREDICT]
+
+
+def is_eval(mode):
+  return mode in [KerasModeKeys.TEST, EstimatorModeKeys.EVAL]
+
+
+def is_train(mode):
+  return mode in [KerasModeKeys.TRAIN, EstimatorModeKeys.TRAIN]
+
+
+class ModeKeyMap(collections.Mapping):
+  """Map using ModeKeys as keys.
+
+  This class creates an immutable mapping from modes to values. For example,
+  SavedModel export of Keras and Estimator models use this to map modes to their
+  corresponding MetaGraph tags/SignatureDef keys.
+
+  Since this class uses modes, rather than strings, as keys, both "predict"
+  (Keras's PREDICT ModeKey) and "infer" (Estimator's PREDICT ModeKey) map to the
+  same value.
+  """
+
+  def __init__(self, **kwargs):
+    self._internal_dict = {}
+    self._keys = []
+    for key in kwargs:
+      self._keys.append(key)
+      dict_key = self._get_internal_key(key)
+      if dict_key in self._internal_dict:
+        raise ValueError(
+            'Error creating ModeKeyMap. Multiple keys/values found for {} mode.'
+            .format(dict_key))
+      self._internal_dict[dict_key] = kwargs[key]
+
+  def _get_internal_key(self, key):
+    """Return keys used for the internal dictionary."""
+    if is_train(key):
+      return KerasModeKeys.TRAIN
+    if is_eval(key):
+      return KerasModeKeys.TEST
+    if is_predict(key):
+      return KerasModeKeys.PREDICT
+    raise ValueError('Invalid mode key: {}.'.format(key))
+
+  def __getitem__(self, key):
+    return self._internal_dict[self._get_internal_key(key)]
+
+  def __iter__(self):
+    return iter(self._keys)
+
+  def __len__(self):
+    return len(self._keys)
diff --git a/tensorflow/python/saved_model/model_utils/mode_keys_test.py b/tensorflow/python/saved_model/model_utils/mode_keys_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..26795ef8b16a08e6426fa8399a38135dc8a4ac7c
--- /dev/null
+++ b/tensorflow/python/saved_model/model_utils/mode_keys_test.py
@@ -0,0 +1,65 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""ModeKey Tests."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.platform import test
+from tensorflow.python.saved_model.model_utils import mode_keys
+
+
+class ModeKeyMapTest(test.TestCase):
+
+  def test_map(self):
+    mode_map = mode_keys.ModeKeyMap(**{
+        mode_keys.KerasModeKeys.PREDICT: 3,
+        mode_keys.KerasModeKeys.TEST: 1
+    })
+
+    # Test dictionary __getitem__
+    self.assertEqual(3, mode_map[mode_keys.KerasModeKeys.PREDICT])
+    self.assertEqual(3, mode_map[mode_keys.EstimatorModeKeys.PREDICT])
+    self.assertEqual(1, mode_map[mode_keys.KerasModeKeys.TEST])
+    self.assertEqual(1, mode_map[mode_keys.EstimatorModeKeys.EVAL])
+    with self.assertRaises(KeyError):
+      _ = mode_map[mode_keys.KerasModeKeys.TRAIN]
+    with self.assertRaises(KeyError):
+      _ = mode_map[mode_keys.EstimatorModeKeys.TRAIN]
+    with self.assertRaisesRegexp(ValueError, 'Invalid mode'):
+      _ = mode_map['serve']
+
+    # Test common dictionary methods
+    self.assertLen(mode_map, 2)
+    self.assertEqual({1, 3}, set(mode_map.values()))
+    self.assertEqual(
+        {mode_keys.KerasModeKeys.TEST, mode_keys.KerasModeKeys.PREDICT},
+        set(mode_map.keys()))
+
+    # Map is immutable
+    with self.assertRaises(TypeError):
+      mode_map[mode_keys.KerasModeKeys.TEST] = 1
+
+  def test_invalid_init(self):
+    with self.assertRaisesRegexp(ValueError, 'Multiple keys/values found'):
+      _ = mode_keys.ModeKeyMap(**{
+          mode_keys.KerasModeKeys.PREDICT: 3,
+          mode_keys.EstimatorModeKeys.PREDICT: 1
+      })
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/saved_model/nested_structure_coder.py b/tensorflow/python/saved_model/nested_structure_coder.py
new file mode 100644
index 0000000000000000000000000000000000000000..59a2687edafdf7f8b98a6a00670ad0d975bbf1d7
--- /dev/null
+++ b/tensorflow/python/saved_model/nested_structure_coder.py
@@ -0,0 +1,436 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Module that encodes (decodes) nested structures into (from) protos.
+
+The intended use is to serialize everything needed to restore a `Function` that
+was saved into a SavedModel. This may include concrete function inputs and
+outputs, signatures, function specs, etc.
+
+Example use:
+coder = nested_structure_coder.StructureCoder()
+# Encode into proto.
+signature_proto = coder.encode_structure(function.input_signature)
+# Decode into a Python object.
+restored_signature = coder.decode_proto(signature_proto)
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import functools
+import six
+
+from tensorflow.core.protobuf import struct_pb2
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_spec
+from tensorflow.python.util import compat
+
+
+class NotEncodableError(Exception):
+  """Error raised when a coder cannot encode an object."""
+
+
+class StructureCoder(object):
+  """Encoder and decoder for nested structures into protos."""
+
+  _codecs = []
+
+  @classmethod
+  def register_codec(cls, x):
+    cls._codecs.append(x)
+
+  @classmethod
+  def _get_encoders(cls):
+    return [(c.can_encode, c.do_encode) for c in cls._codecs]
+
+  @classmethod
+  def _get_decoders(cls):
+    return [(c.can_decode, c.do_decode) for c in cls._codecs]
+
+  def _map_structure(self, pyobj, coders):
+    for can, do in coders:
+      if can(pyobj):
+        recursion_fn = functools.partial(self._map_structure, coders=coders)
+        return do(pyobj, recursion_fn)
+    raise NotEncodableError(
+        "No encoder for object [%s] of type [%s]." % (str(pyobj), type(pyobj)))
+
+  def encode_structure(self, nested_structure):
+    """Encodes nested structures composed of encodable types into a proto.
+
+    Args:
+      nested_structure: Structure to encode.
+
+    Returns:
+      Encoded proto.
+
+    Raises:
+      NotEncodableError: For values for which there are no encoders.
+    """
+    return self._map_structure(nested_structure, self._get_encoders())
+
+  def can_encode(self, nested_structure):
+    """Determines whether a nested structure can be encoded into a proto.
+
+    Args:
+      nested_structure: Structure to encode.
+
+    Returns:
+      True if the nested structured can be encoded.
+    """
+    try:
+      self.encode_structure(nested_structure)
+    except NotEncodableError:
+      return False
+    return True
+
+  def decode_proto(self, proto):
+    """Decodes proto representing a nested structure.
+
+    Args:
+      proto: Proto to decode.
+
+    Returns:
+      Decoded structure.
+
+    Raises:
+      NotEncodableError: For values for which there are no encoders.
+    """
+    return self._map_structure(proto, self._get_decoders())
+
+
+class _ListCodec(object):
+  """Codec for lists."""
+
+  def can_encode(self, pyobj):
+    return isinstance(pyobj, list)
+
+  def do_encode(self, list_value, encode_fn):
+    encoded_list = struct_pb2.StructuredValue()
+    encoded_list.list_value.CopyFrom(struct_pb2.ListValue())
+    for element in list_value:
+      encoded_list.list_value.values.add().CopyFrom(encode_fn(element))
+    return encoded_list
+
+  def can_decode(self, value):
+    return value.HasField("list_value")
+
+  def do_decode(self, value, decode_fn):
+    return [decode_fn(element) for element in value.list_value.values]
+
+
+StructureCoder.register_codec(_ListCodec())
+
+
+def _is_tuple(obj):
+  return not _is_named_tuple(obj) and isinstance(obj, tuple)
+
+
+def _is_named_tuple(instance):
+  """Returns True iff `instance` is a `namedtuple`.
+
+  Args:
+    instance: An instance of a Python object.
+
+  Returns:
+    True if `instance` is a `namedtuple`.
+  """
+  if not isinstance(instance, tuple):
+    return False
+  return (hasattr(instance, "_fields") and
+          isinstance(instance._fields, collections.Sequence) and
+          all(isinstance(f, six.string_types) for f in instance._fields))
+
+
+class _TupleCodec(object):
+  """Codec for tuples."""
+
+  def can_encode(self, pyobj):
+    return _is_tuple(pyobj)
+
+  def do_encode(self, tuple_value, encode_fn):
+    encoded_tuple = struct_pb2.StructuredValue()
+    encoded_tuple.tuple_value.CopyFrom(struct_pb2.TupleValue())
+    for element in tuple_value:
+      encoded_tuple.tuple_value.values.add().CopyFrom(encode_fn(element))
+    return encoded_tuple
+
+  def can_decode(self, value):
+    return value.HasField("tuple_value")
+
+  def do_decode(self, value, decode_fn):
+    return tuple(decode_fn(element) for element in value.tuple_value.values)
+
+
+StructureCoder.register_codec(_TupleCodec())
+
+
+class _DictCodec(object):
+  """Codec for dicts."""
+
+  def can_encode(self, pyobj):
+    return isinstance(pyobj, dict)
+
+  def do_encode(self, dict_value, encode_fn):
+    encoded_dict = struct_pb2.StructuredValue()
+    encoded_dict.dict_value.CopyFrom(struct_pb2.DictValue())
+    for key, value in dict_value.items():
+      encoded_dict.dict_value.fields[key].CopyFrom(encode_fn(value))
+    return encoded_dict
+
+  def can_decode(self, value):
+    return value.HasField("dict_value")
+
+  def do_decode(self, value, decode_fn):
+    return {key: decode_fn(val) for key, val in value.dict_value.fields.items()}
+
+
+StructureCoder.register_codec(_DictCodec())
+
+
+class _NamedTupleCodec(object):
+  """Codec for namedtuples.
+
+  Encoding and decoding a namedtuple reconstructs a namedtuple with a different
+  actual Python type, but with same `typename` and `fields`.
+  """
+
+  def can_encode(self, pyobj):
+    return _is_named_tuple(pyobj)
+
+  def do_encode(self, named_tuple_value, encode_fn):
+    encoded_named_tuple = struct_pb2.StructuredValue()
+    encoded_named_tuple.named_tuple_value.CopyFrom(struct_pb2.NamedTupleValue())
+    encoded_named_tuple.named_tuple_value.name = \
+      named_tuple_value.__class__.__name__
+    for key in named_tuple_value._fields:
+      pair = encoded_named_tuple.named_tuple_value.values.add()
+      pair.key = key
+      pair.value.CopyFrom(encode_fn(named_tuple_value._asdict()[key]))
+    return encoded_named_tuple
+
+  def can_decode(self, value):
+    return value.HasField("named_tuple_value")
+
+  def do_decode(self, value, decode_fn):
+    key_value_pairs = value.named_tuple_value.values
+    items = [(pair.key, decode_fn(pair.value)) for pair in key_value_pairs]
+    named_tuple_type = collections.namedtuple(value.named_tuple_value.name,
+                                              [item[0] for item in items])
+    return named_tuple_type(**dict(items))
+
+
+StructureCoder.register_codec(_NamedTupleCodec())
+
+
+class _Float64Codec(object):
+  """Codec for floats."""
+
+  def can_encode(self, pyobj):
+    return isinstance(pyobj, float)
+
+  def do_encode(self, float64_value, encode_fn):
+    del encode_fn
+    value = struct_pb2.StructuredValue()
+    value.float64_value = float64_value
+    return value
+
+  def can_decode(self, value):
+    return value.HasField("float64_value")
+
+  def do_decode(self, value, decode_fn):
+    del decode_fn
+    return value.float64_value
+
+
+StructureCoder.register_codec(_Float64Codec())
+
+
+class _Int64Codec(object):
+  """Codec for Python integers (limited to 64 bit values)."""
+
+  def can_encode(self, pyobj):
+    return not isinstance(pyobj, bool) and isinstance(pyobj, int)
+
+  def do_encode(self, int_value, encode_fn):
+    del encode_fn
+    value = struct_pb2.StructuredValue()
+    value.int64_value = int_value
+    return value
+
+  def can_decode(self, value):
+    return value.HasField("int64_value")
+
+  def do_decode(self, value, decode_fn):
+    del decode_fn
+    return int(value.int64_value)
+
+
+StructureCoder.register_codec(_Int64Codec())
+
+
+class _StringCodec(object):
+  """Codec for strings.
+
+  See StructuredValue.string_value in proto/struct.proto for more detailed
+  explanation.
+  """
+
+  def can_encode(self, pyobj):
+    return isinstance(pyobj, str)
+
+  def do_encode(self, string_value, encode_fn):
+    del encode_fn
+    value = struct_pb2.StructuredValue()
+    value.string_value = string_value
+    return value
+
+  def can_decode(self, value):
+    return value.HasField("string_value")
+
+  def do_decode(self, value, decode_fn):
+    del decode_fn
+    return compat.as_str(value.string_value)
+
+
+StructureCoder.register_codec(_StringCodec())
+
+
+class _NoneCodec(object):
+  """Codec for None."""
+
+  def can_encode(self, pyobj):
+    return pyobj is None
+
+  def do_encode(self, none_value, encode_fn):
+    del encode_fn, none_value
+    value = struct_pb2.StructuredValue()
+    value.none_value.CopyFrom(struct_pb2.NoneValue())
+    return value
+
+  def can_decode(self, value):
+    return value.HasField("none_value")
+
+  def do_decode(self, value, decode_fn):
+    del decode_fn, value
+    return None
+
+
+StructureCoder.register_codec(_NoneCodec())
+
+
+class _BoolCodec(object):
+  """Codec for booleans."""
+
+  def can_encode(self, pyobj):
+    return isinstance(pyobj, bool)
+
+  def do_encode(self, bool_value, encode_fn):
+    del encode_fn
+    value = struct_pb2.StructuredValue()
+    value.bool_value = bool_value
+    return value
+
+  def can_decode(self, value):
+    return value.HasField("bool_value")
+
+  def do_decode(self, value, decode_fn):
+    del decode_fn
+    return value.bool_value
+
+
+StructureCoder.register_codec(_BoolCodec())
+
+
+class _TensorShapeCodec(object):
+  """Codec for `TensorShape`."""
+
+  def can_encode(self, pyobj):
+    return isinstance(pyobj, tensor_shape.TensorShape)
+
+  def do_encode(self, tensor_shape_value, encode_fn):
+    del encode_fn
+    encoded_tensor_shape = struct_pb2.StructuredValue()
+    encoded_tensor_shape.tensor_shape_value.CopyFrom(
+        tensor_shape_value.as_proto())
+    return encoded_tensor_shape
+
+  def can_decode(self, value):
+    return value.HasField("tensor_shape_value")
+
+  def do_decode(self, value, decode_fn):
+    del decode_fn
+    return tensor_shape.TensorShape(value.tensor_shape_value)
+
+
+StructureCoder.register_codec(_TensorShapeCodec())
+
+
+class _TensorTypeCodec(object):
+  """Codec for `TensorType`."""
+
+  def can_encode(self, pyobj):
+    return isinstance(pyobj, dtypes.DType)
+
+  def do_encode(self, tensor_dtype_value, encode_fn):
+    del encode_fn
+    encoded_tensor_type = struct_pb2.StructuredValue()
+    encoded_tensor_type.tensor_dtype_value = tensor_dtype_value.as_datatype_enum
+    return encoded_tensor_type
+
+  def can_decode(self, value):
+    return value.HasField("tensor_dtype_value")
+
+  def do_decode(self, value, decode_fn):
+    del decode_fn
+    return dtypes.DType(value.tensor_dtype_value)
+
+
+StructureCoder.register_codec(_TensorTypeCodec())
+
+
+class _TensorSpecCodec(object):
+  """Codec for `TensorSpec`."""
+
+  def can_encode(self, pyobj):
+    return isinstance(pyobj, tensor_spec.TensorSpec)
+
+  def do_encode(self, tensor_spec_value, encode_fn):
+    encoded_tensor_spec = struct_pb2.StructuredValue()
+    encoded_tensor_spec.tensor_spec_value.CopyFrom(
+        struct_pb2.TensorSpecProto(
+            shape=encode_fn(tensor_spec_value.shape).tensor_shape_value,
+            dtype=encode_fn(tensor_spec_value.dtype).tensor_dtype_value,
+            name=tensor_spec_value.name))
+    return encoded_tensor_spec
+
+  def can_decode(self, value):
+    return value.HasField("tensor_spec_value")
+
+  def do_decode(self, value, decode_fn):
+    return tensor_spec.TensorSpec(
+        shape=decode_fn(
+            struct_pb2.StructuredValue(
+                tensor_shape_value=value.tensor_spec_value.shape)),
+        dtype=decode_fn(
+            struct_pb2.StructuredValue(
+                tensor_dtype_value=value.tensor_spec_value.dtype)),
+        name=value.tensor_spec_value.name)
+
+
+StructureCoder.register_codec(_TensorSpecCodec())
diff --git a/tensorflow/python/saved_model/nested_structure_coder_test.py b/tensorflow/python/saved_model/nested_structure_coder_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1538fbf1271dadaa9bb7d82359f8ea38fcb95a01
--- /dev/null
+++ b/tensorflow/python/saved_model/nested_structure_coder_test.py
@@ -0,0 +1,183 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for nested structure coding."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+from tensorflow.core.protobuf import struct_pb2
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_spec
+from tensorflow.python.platform import test
+from tensorflow.python.saved_model import nested_structure_coder
+
+
+class NestedStructureTest(test.TestCase):
+
+  def setUp(self):
+    self._coder = nested_structure_coder.StructureCoder()
+
+  def testEncodeDecodeList(self):
+    structure = [1.5, 2.5, 3.0]
+    self.assertTrue(self._coder.can_encode(structure))
+    encoded = self._coder.encode_structure(structure)
+    expected = struct_pb2.StructuredValue()
+    expected.list_value.values.add().float64_value = 1.5
+    expected.list_value.values.add().float64_value = 2.5
+    expected.list_value.values.add().float64_value = 3.0
+    self.assertEqual(expected, encoded)
+    decoded = self._coder.decode_proto(encoded)
+    self.assertEqual(structure, decoded)
+
+  def testEncodeDecodeTuple(self):
+    structure = ("hello", [3, (2, 1)])
+    self.assertTrue(self._coder.can_encode(structure))
+    encoded = self._coder.encode_structure(structure)
+    expected = struct_pb2.StructuredValue()
+    expected.tuple_value.values.add().string_value = "hello"
+    list_value = expected.tuple_value.values.add().list_value
+    list_value.values.add().int64_value = 3
+    tuple_value = list_value.values.add().tuple_value
+    tuple_value.values.add().int64_value = 2
+    tuple_value.values.add().int64_value = 1
+    self.assertEqual(expected, encoded)
+    decoded = self._coder.decode_proto(encoded)
+    self.assertEqual(structure, decoded)
+
+  def testEncodeDecodeDict(self):
+    structure = dict(a=3, b=[7, 2.5])
+    self.assertTrue(self._coder.can_encode(structure))
+    encoded = self._coder.encode_structure(structure)
+    expected = struct_pb2.StructuredValue()
+    expected.dict_value.fields["a"].int64_value = 3
+    list_value = expected.dict_value.fields["b"].list_value
+    list_value.values.add().int64_value = 7
+    list_value.values.add().float64_value = 2.5
+    self.assertEqual(expected, encoded)
+    decoded = self._coder.decode_proto(encoded)
+    self.assertIsInstance(decoded["a"], int)
+    self.assertEqual(structure, decoded)
+
+  def testEncodeDecodeTensorShape(self):
+    structure = [tensor_shape.TensorShape([1, 2, 3]), "hello"]
+    self.assertTrue(self._coder.can_encode(structure))
+    encoded = self._coder.encode_structure(structure)
+    expected = struct_pb2.StructuredValue()
+    expected_list = expected.list_value
+    expected_tensor_shape = expected_list.values.add().tensor_shape_value
+    expected_tensor_shape.dim.add().size = 1
+    expected_tensor_shape.dim.add().size = 2
+    expected_tensor_shape.dim.add().size = 3
+    expected_tensor_shape = expected_list.values.add().string_value = "hello"
+    self.assertEqual(expected, encoded)
+    decoded = self._coder.decode_proto(encoded)
+    self.assertEqual(structure, decoded)
+
+  def testEncodeDecodeNamedTuple(self):
+    named_tuple_type = collections.namedtuple("NamedTuple", ["x", "y"])
+    named_tuple = named_tuple_type(x=[1, 2], y="hello")
+    self.assertTrue(self._coder.can_encode(named_tuple))
+    encoded = self._coder.encode_structure(named_tuple)
+    expected = struct_pb2.StructuredValue()
+    expected_named_tuple = expected.named_tuple_value
+    expected_named_tuple.name = "NamedTuple"
+    key_value_pair = expected_named_tuple.values.add()
+    key_value_pair.key = "x"
+    list_value = key_value_pair.value.list_value
+    list_value.values.add().int64_value = 1
+    list_value.values.add().int64_value = 2
+    key_value_pair = expected_named_tuple.values.add()
+    key_value_pair.key = "y"
+    key_value_pair.value.string_value = "hello"
+    self.assertEqual(expected, encoded)
+    decoded = self._coder.decode_proto(encoded)
+    self.assertEqual(named_tuple._asdict(), decoded._asdict())
+    self.assertEqual(named_tuple.__class__.__name__, decoded.__class__.__name__)
+
+  def testNone(self):
+    structure = [1.0, None]
+    self.assertTrue(self._coder.can_encode(structure))
+    encoded = self._coder.encode_structure(structure)
+    expected = struct_pb2.StructuredValue()
+    expected.list_value.values.add().float64_value = 1.0
+    expected.list_value.values.add().none_value.CopyFrom(struct_pb2.NoneValue())
+    self.assertEqual(expected, encoded)
+    decoded = self._coder.decode_proto(encoded)
+    self.assertEqual(structure, decoded)
+
+  def testBool(self):
+    structure = [False]
+    self.assertTrue(self._coder.can_encode(structure))
+    encoded = self._coder.encode_structure(structure)
+    expected = struct_pb2.StructuredValue()
+    expected.list_value.values.add().bool_value = False
+    self.assertEqual(expected, encoded)
+    decoded = self._coder.decode_proto(encoded)
+    self.assertEqual(structure, decoded)
+
+  def testEmptyStructures(self):
+    structure = [list(), dict(), tuple()]
+    self.assertTrue(self._coder.can_encode(structure))
+    encoded = self._coder.encode_structure(structure)
+    expected = struct_pb2.StructuredValue()
+    expected.list_value.values.add().list_value.CopyFrom(struct_pb2.ListValue())
+    expected.list_value.values.add().dict_value.CopyFrom(struct_pb2.DictValue())
+    expected.list_value.values.add().tuple_value.CopyFrom(
+        struct_pb2.TupleValue())
+    self.assertEqual(expected, encoded)
+    decoded = self._coder.decode_proto(encoded)
+    self.assertEqual(structure, decoded)
+
+  def testDtype(self):
+    structure = [dtypes.int64]
+    self.assertTrue(self._coder.can_encode(structure))
+    encoded = self._coder.encode_structure(structure)
+    expected = struct_pb2.StructuredValue()
+    list_value = expected.list_value.values.add()
+    list_value.tensor_dtype_value = dtypes.int64.as_datatype_enum
+    self.assertEqual(expected, encoded)
+    decoded = self._coder.decode_proto(encoded)
+    self.assertEqual(structure, decoded)
+
+  def testEncodeDecodeTensorSpec(self):
+    structure = [tensor_spec.TensorSpec([1, 2, 3], dtypes.int64, "hello")]
+    self.assertTrue(self._coder.can_encode(structure))
+    encoded = self._coder.encode_structure(structure)
+    expected = struct_pb2.StructuredValue()
+    expected_list = expected.list_value
+    expected_tensor_spec = expected_list.values.add().tensor_spec_value
+    expected_tensor_spec.shape.dim.add().size = 1
+    expected_tensor_spec.shape.dim.add().size = 2
+    expected_tensor_spec.shape.dim.add().size = 3
+    expected_tensor_spec.name = "hello"
+    expected_tensor_spec.dtype = dtypes.int64.as_datatype_enum
+    self.assertEqual(expected, encoded)
+    decoded = self._coder.decode_proto(encoded)
+    self.assertEqual(structure, decoded)
+
+  def testNotEncodable(self):
+
+    class NotEncodable(object):
+      pass
+
+    self.assertFalse(self._coder.can_encode([NotEncodable()]))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/saved_model/revived_types.py b/tensorflow/python/saved_model/revived_types.py
new file mode 100644
index 0000000000000000000000000000000000000000..3140d3d617d458dccb1bb6107b679b927d0d151e
--- /dev/null
+++ b/tensorflow/python/saved_model/revived_types.py
@@ -0,0 +1,167 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Handles types registrations for tf.saved_model.load."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.core.framework import versions_pb2
+from tensorflow.core.protobuf import saved_object_graph_pb2
+
+
+class VersionedTypeRegistration(object):
+  """Holds information about one version of a revived type."""
+
+  def __init__(self, object_factory, version, min_producer_version,
+               min_consumer_version, bad_consumers=None, setter=setattr):
+    """Identify a revived type version.
+
+    Args:
+      object_factory: A callable which takes a SavedUserObject proto and returns
+        a trackable object. Dependencies are added later via `setter`.
+      version: An integer, the producer version of this wrapper type. When
+        making incompatible changes to a wrapper, add a new
+        `VersionedTypeRegistration` with an incremented `version`. The most
+        recent version will be saved, and all registrations with a matching
+        identifier will be searched for the highest compatible version to use
+        when loading.
+      min_producer_version: The minimum producer version number required to use
+        this `VersionedTypeRegistration` when loading a proto.
+      min_consumer_version: `VersionedTypeRegistration`s with a version number
+        less than `min_consumer_version` will not be used to load a proto saved
+        with this object. `min_consumer_version` should be set to the lowest
+        version number which can successfully load protos saved by this
+        object. If no matching registration is available on load, the object
+        will be revived with a generic trackable type.
+
+        `min_consumer_version` and `bad_consumers` are a blunt tool, and using
+        them will generally break forward compatibility: previous versions of
+        TensorFlow will revive newly saved objects as opaque trackable
+        objects rather than wrapped objects. When updating wrappers, prefer
+        saving new information but preserving compatibility with previous
+        wrapper versions. They are, however, useful for ensuring that
+        previously-released buggy wrapper versions degrade gracefully rather
+        than throwing exceptions when presented with newly-saved SavedModels.
+      bad_consumers: A list of consumer versions which are incompatible (in
+        addition to any version less than `min_consumer_version`).
+      setter: A callable with the same signature as `setattr` to use when adding
+        dependencies to generated objects.
+    """
+    self.setter = setter
+    self.identifier = None  # Set after registration
+    self._object_factory = object_factory
+    self.version = version
+    self._min_consumer_version = min_consumer_version
+    self._min_producer_version = min_producer_version
+    if bad_consumers is None:
+      bad_consumers = []
+    self._bad_consumers = bad_consumers
+
+  def to_proto(self):
+    """Create a SavedUserObject proto."""
+    # For now wrappers just use dependencies to save their state, so the
+    # SavedUserObject doesn't depend on the object being saved.
+    # TODO(allenl): Add a wrapper which uses its own proto.
+    return saved_object_graph_pb2.SavedUserObject(
+        identifier=self.identifier,
+        version=versions_pb2.VersionDef(
+            producer=self.version,
+            min_consumer=self._min_consumer_version,
+            bad_consumers=self._bad_consumers))
+
+  def from_proto(self, proto):
+    """Recreate a trackable object from a SavedUserObject proto."""
+    return self._object_factory(proto)
+
+  def should_load(self, proto):
+    """Checks if this object should load the SavedUserObject `proto`."""
+    if proto.identifier != self.identifier:
+      return False
+    if self.version < proto.version.min_consumer:
+      return False
+    if proto.version.producer < self._min_producer_version:
+      return False
+    for bad_version in proto.version.bad_consumers:
+      if self.version == bad_version:
+        return False
+    return True
+
+
+# string identifier -> (predicate, [VersionedTypeRegistration])
+_REVIVED_TYPE_REGISTRY = {}
+_TYPE_IDENTIFIERS = []
+
+
+def register_revived_type(identifier, predicate, versions):
+  """Register a type for revived objects.
+
+  Args:
+    identifier: A unique string identifying this class of objects.
+    predicate: A Boolean predicate for this registration. Takes a
+      trackable object as an argument. If True, `type_registration` may be
+      used to save and restore the object.
+    versions: A list of `VersionedTypeRegistration` objects.
+  """
+  # Keep registrations in order of version. We always use the highest matching
+  # version (respecting the min consumer version and bad consumers).
+  versions.sort(key=lambda reg: reg.version, reverse=True)
+  if not versions:
+    raise AssertionError("Need at least one version of a registered type.")
+  version_numbers = set()
+  for registration in versions:
+    # Copy over the identifier for use in generating protos
+    registration.identifier = identifier
+    if registration.version in version_numbers:
+      raise AssertionError(
+          "Got multiple registrations with version {} for type {}".format(
+              registration.version, identifier))
+    version_numbers.add(registration.version)
+  if identifier in _REVIVED_TYPE_REGISTRY:
+    raise AssertionError(
+        "Duplicate registrations for type {}".format(identifier))
+
+  _REVIVED_TYPE_REGISTRY[identifier] = (predicate, versions)
+  _TYPE_IDENTIFIERS.append(identifier)
+
+
+def serialize(obj):
+  """Create a SavedUserObject from a trackable object."""
+  for identifier in _TYPE_IDENTIFIERS:
+    predicate, versions = _REVIVED_TYPE_REGISTRY[identifier]
+    if predicate(obj):
+      # Always uses the most recent version to serialize.
+      return versions[0].to_proto()
+  return None
+
+
+def deserialize(proto):
+  """Create a trackable object from a SavedUserObject proto.
+
+  Args:
+    proto: A SavedUserObject to deserialize.
+
+  Returns:
+    A tuple of (trackable, assignment_fn) where assignment_fn has the same
+    signature as setattr and should be used to add dependencies to
+    `trackable` when they are available.
+  """
+  _, type_registrations = _REVIVED_TYPE_REGISTRY.get(
+      proto.identifier, (None, None))
+  if type_registrations is not None:
+    for type_registration in type_registrations:
+      if type_registration.should_load(proto):
+        return (type_registration.from_proto(proto), type_registration.setter)
+  return None
diff --git a/tensorflow/python/saved_model/revived_types_test.py b/tensorflow/python/saved_model/revived_types_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..8bd806f315aa7899d7e816739191a6c1af111912
--- /dev/null
+++ b/tensorflow/python/saved_model/revived_types_test.py
@@ -0,0 +1,110 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for revived type matching."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.core.framework import versions_pb2
+from tensorflow.core.protobuf import saved_object_graph_pb2
+from tensorflow.python.platform import test
+from tensorflow.python.saved_model import revived_types
+from tensorflow.python.training.tracking import tracking
+
+
+class CustomTestClass(tracking.AutoTrackable):
+
+  def __init__(self, version):
+    self.version = version
+
+
+revived_types.register_revived_type(
+    "test_type",
+    lambda obj: isinstance(obj, CustomTestClass),
+    versions=[
+        revived_types.VersionedTypeRegistration(
+            object_factory=lambda _: CustomTestClass(1),
+            version=1, min_producer_version=1,
+            min_consumer_version=1),
+        revived_types.VersionedTypeRegistration(
+            object_factory=lambda _: CustomTestClass(2),
+            version=2, min_producer_version=2, min_consumer_version=1),
+        revived_types.VersionedTypeRegistration(
+            object_factory=lambda _: CustomTestClass(3),
+            version=3, min_producer_version=3, min_consumer_version=2),
+        revived_types.VersionedTypeRegistration(
+            object_factory=lambda _: CustomTestClass(4),
+            version=4, min_producer_version=4, min_consumer_version=2,
+            bad_consumers=[3]),
+    ]
+)
+
+
+class RegistrationMatchingTest(test.TestCase):
+
+  def test_save_typecheck(self):
+    self.assertIs(revived_types.serialize(tracking.AutoTrackable()), None)
+
+  def test_load_identifier_not_found(self):
+    nothing_matches = revived_types.deserialize(
+        saved_object_graph_pb2.SavedUserObject(
+            identifier="_unregistered_type",
+            version=versions_pb2.VersionDef(
+                producer=1,
+                min_consumer=1,
+                bad_consumers=[])))
+    self.assertIs(nothing_matches, None)
+
+  def test_most_recent_version_saved(self):
+    serialized = revived_types.serialize(CustomTestClass(None))
+    self.assertEqual([3], serialized.version.bad_consumers)
+    deserialized, _ = revived_types.deserialize(serialized)
+    self.assertIsInstance(deserialized, CustomTestClass)
+    self.assertEqual(4, deserialized.version)
+
+  def test_min_consumer_version(self):
+    nothing_matches = revived_types.deserialize(
+        saved_object_graph_pb2.SavedUserObject(
+            identifier="test_type",
+            version=versions_pb2.VersionDef(
+                producer=5,
+                min_consumer=5,
+                bad_consumers=[])))
+    self.assertIs(nothing_matches, None)
+
+  def test_bad_versions(self):
+    deserialized, _ = revived_types.deserialize(
+        saved_object_graph_pb2.SavedUserObject(
+            identifier="test_type",
+            version=versions_pb2.VersionDef(
+                producer=5,
+                min_consumer=1,
+                bad_consumers=[4, 3])))
+    self.assertEqual(2, deserialized.version)
+
+  def test_min_producer_version(self):
+    deserialized, _ = revived_types.deserialize(
+        saved_object_graph_pb2.SavedUserObject(
+            identifier="test_type",
+            version=versions_pb2.VersionDef(
+                producer=3,
+                min_consumer=0,
+                bad_consumers=[])))
+    self.assertEqual(3, deserialized.version)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/saved_model/save.py b/tensorflow/python/saved_model/save.py
index ab6fcb7196fcc243d69b53b595b53b0dd00071f4..6cdbee4187cd3250c11736606d7003a26cd226b2 100644
--- a/tensorflow/python/saved_model/save.py
+++ b/tensorflow/python/saved_model/save.py
@@ -12,18 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Exports a SavedModel from a Checkpointable Python object."""
+"""Exports a SavedModel from a Trackable Python object."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 import collections
-import functools
 import os
 
+from tensorflow.core.framework import versions_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.core.protobuf import saved_model_pb2
+from tensorflow.core.protobuf import saved_object_graph_pb2
 from tensorflow.python.eager import context
 from tensorflow.python.eager import def_function
 from tensorflow.python.eager import function as defun
@@ -31,153 +32,220 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_spec
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.saved_model import builder_impl
 from tensorflow.python.saved_model import constants
-from tensorflow.python.saved_model import saved_object_graph_pb2
+from tensorflow.python.saved_model import function_serialization
+from tensorflow.python.saved_model import nested_structure_coder
+from tensorflow.python.saved_model import revived_types
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import signature_def_utils
+from tensorflow.python.saved_model import signature_serialization
 from tensorflow.python.saved_model import tag_constants
 from tensorflow.python.saved_model import utils_impl
-from tensorflow.python.training.checkpointable import base
-from tensorflow.python.training.checkpointable import tracking
-from tensorflow.python.training.checkpointable import util
+from tensorflow.python.training.saving import functional_saver
+from tensorflow.python.training.tracking import base
+from tensorflow.python.training.tracking import graph_view
+from tensorflow.python.training.tracking import object_identity
+from tensorflow.python.training.tracking import tracking
+from tensorflow.python.training.tracking import util
 from tensorflow.python.util import compat
-from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
+_UNCOPIABLE_DTYPES = frozenset((dtypes.resource, dtypes.variant))
 
-def _check_for_functional_keras_model(root):
-  """Makes an export signature for `root` if it's a functional Keras Model."""
-  # If nothing is decorated yet but this is a functional Keras Model (duck
-  # typed), we'll try to make a signature ourselves.
-  try:
-    inputs = root.inputs
-    input_names = root.input_names
-  except AttributeError:
-    return None
-  input_signature = []
-  for input_tensor, input_name in zip(inputs, input_names):
-    input_signature.append(tensor_spec.TensorSpec(
-        shape=input_tensor.shape, dtype=input_tensor.dtype,
-        name=input_name))
-
-  @def_function.function(input_signature=input_signature)
-  def _wrapped_model(*args):
-    outputs_list = nest.flatten(root(inputs=list(args)))
-    return {name: output for name, output
-            in zip(root.output_names, outputs_list)}
-  return _wrapped_model
-
-
-def _find_function_to_export(root):
-  """Iterate over `root`'s attributes, finding traced functions."""
-  exported_function = None
-  previous_attribute_name = None
-  for attribute_name in dir(root):
-    attribute_value = getattr(root, attribute_name, None)
-    if isinstance(attribute_value, def_function.PolymorphicFunction):
-      if exported_function is not None:
-        raise ValueError(
-            ("Exporting an object with no "
-             "tf.saved_model.save(..., signatures=...) "
-             "argument specified, and with more than one "
-             "@tf.function-decorated method attached to it: {}. The signature "
-             "keys for these functions are ambiguous. Specify signature "
-             "functions explicitly.").format(
-                 [previous_attribute_name, attribute_name]))
-      exported_function = attribute_value
-      previous_attribute_name = attribute_name
-  if exported_function is None:
-    exported_function = _check_for_functional_keras_model(root)
-  if exported_function is None:
-    raise ValueError(
-        ("Exporting an object with no tf.saved_model.save(..., signatures=...) "
-         "argument specified, and with no @tf.function-decorated methods "
-         "attached to it. In the future this will be a supported use-case for "
-         "Python re-import, but at the moment saving a SavedModel without "
-         "signatures does not make sense, as the only consumers will expect "
-         "signatures. Either decorate a method or specify a signature function "
-         "explicitly."))
-  return exported_function
-
-
-def _canonicalize_signatures(signatures):
-  """Converts `signatures` into a dictionary of concrete functions."""
-  if not isinstance(signatures, collections.Mapping):
-    signatures = {
-        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: signatures}
-  concrete_signatures = {}
-  for serving_key, signature_function in signatures.items():
-    if isinstance(signature_function, (defun.PolymorphicFunction,
-                                       def_function.PolymorphicFunction)):
-      input_signature = signature_function._input_signature  # pylint: disable=protected-access
-      if input_signature is None:
-        raise ValueError(
-            ("Unable to use the function {} as a signature directly. Functions "
-             "used to generate serving signatures must either have an "
-             "`input_signature=` specified when constructed, or must be "
-             "converted to concrete functions using "
-             "`f.get_concrete_function(...)`.").format(signature_function))
-      signature_function = signature_function.get_concrete_function()
-    elif not isinstance(signature_function, defun.Function):
-      raise ValueError(
-          ("Expected a TensorFlow function to generate a signature for, but "
-           "got {}. Python functions may be decorated with "
-           "`@tf.function(input_signature=...)` and passed as signatures "
-           "directly, or created without a signature using `@tf.function` "
-           "and then converted to a concrete TensorFlow function using "
-           "`f.get_concrete_function(...)`.").format(signature_function))
-    concrete_signatures[serving_key] = signature_function
-  return concrete_signatures
-
-
-def _is_flat(sequence):
-  sequence_flat = nest.flatten(sequence)
-  try:
-    nest.assert_same_structure(sequence_flat, sequence)
-    return True
-  except ValueError:
-    return False
-  except TypeError:
-    return False
-
-
-def _normalize_outputs(outputs, function_name, signature_key):
-  """Construct an output dictionary from unnormalized function outputs."""
-  if isinstance(outputs, collections.Mapping):
-    for key, value in outputs.items():
-      if not isinstance(value, ops.Tensor):
-        raise ValueError(
-            ("Got a dictionary containing non-Tensor value {} for key {} "
-             "in the output of the function {} used to generate a SavedModel "
-             "signature. Dictionaries outputs for functions used as signatures "
-             "should have one Tensor output per string key.")
-            .format(value, key, compat.as_str_any(function_name)))
-    return outputs
-  else:
-    original_outputs = outputs
-    if not isinstance(outputs, collections.Sequence):
-      outputs = [outputs]
-    if not _is_flat(outputs):
-      raise ValueError(
-          ("Got non-flat outputs '{}' from '{}' for SavedModel "
-           "signature '{}'. Signatures have one Tensor per output, so "
-           "to have predictable names Python functions used to generate "
-           "these signatures should avoid outputting Tensors in nested "
-           "structures.")
-          .format(original_outputs, function_name, signature_key))
-    return {("output_{}".format(output_index)): output
-            for output_index, output
-            in enumerate(outputs)}
+
+# A container for an EagerTensor constant which has been copied to the exported
+# Graph.
+_CapturedConstant = collections.namedtuple(
+    "_CapturedConstant", ["eager_tensor", "graph_tensor"])
+
+
+class _AugmentedGraphView(graph_view.ObjectGraphView):
+  """An extendable graph which also tracks functions attached to objects.
+
+  Extensions through `add_object` appear in the object graph and any checkpoints
+  generated from it, even if they are not dependencies of the node they were
+  attached to in the saving program. For example a `.signatures` attribute is
+  added to exported SavedModel root objects without modifying the root object
+  itself.
+
+  Also tracks functions attached to objects in the graph, through the caching
+  `list_functions` method. Enumerating functions only through this method
+  ensures that we get a consistent view of functions, even if object attributes
+  create new functions every time they are accessed.
+  """
+
+  def __init__(self, root):
+    super(_AugmentedGraphView, self).__init__(root)
+    # Object -> (name -> dep)
+    self._extra_dependencies = object_identity.ObjectIdentityDictionary()
+    self._functions = object_identity.ObjectIdentityDictionary()
+
+  def add_object(self, parent_node, name_in_parent, subgraph_root):
+    """Attach an object to `parent_node`, overriding any existing dependency."""
+    self._extra_dependencies.setdefault(
+        parent_node, {})[name_in_parent] = subgraph_root
+
+  def list_dependencies(self, obj):
+    """Overrides a parent method to include `add_object` objects."""
+    extra_dependencies = self._extra_dependencies.get(obj, {})
+    used_names = set()
+    for name, dep in super(_AugmentedGraphView, self).list_dependencies(obj):
+      used_names.add(name)
+      if name in extra_dependencies:
+        yield base.TrackableReference(name, extra_dependencies[name])
+      else:
+        yield base.TrackableReference(name, dep)
+    for name, dep in extra_dependencies.items():
+      if name in used_names:
+        continue
+      yield base.TrackableReference(name, dep)
+
+  def list_functions(self, obj):
+    obj_functions = self._functions.get(obj, None)
+    if obj_functions is None:
+      obj_functions = obj._list_functions_for_serialization()  # pylint: disable=protected-access
+      self._functions[obj] = obj_functions
+    return obj_functions
+
+
+class _SaveableView(object):
+  """Provides a frozen view over a trackable root.
+
+  This class helps creating a single stable view over an object to save. The
+  saving code should access properties and functions via this class and not via
+  the original object as there are cases where an object construct their
+  trackable attributes and functions dynamically per call and will yield
+  different objects if invoked more than once.
+
+  Changes to the graph, for example adding objects, must happen in
+  `checkpoint_view` (an `_AugmentedGraphView`) before the `_SaveableView` is
+  constructed. Changes after the `_SaveableView` has been constructed will be
+  ignored.
+  """
+
+  def __init__(self, checkpoint_view):
+    self.checkpoint_view = checkpoint_view
+    trackable_objects, node_ids, slot_variables = (
+        self.checkpoint_view.objects_ids_and_slot_variables())
+    self.nodes = trackable_objects
+    self.node_ids = node_ids
+    self.captured_tensor_node_ids = object_identity.ObjectIdentityDictionary()
+    self.slot_variables = slot_variables
+    self.concrete_functions = []
+
+    # Also add `Function`s as nodes.
+    nodes_without_functions = list(self.nodes)
+    seen_function_names = set()
+    for node in nodes_without_functions:
+      for function in checkpoint_view.list_functions(node).values():
+        if function not in self.node_ids:
+          self.node_ids[function] = len(self.nodes)
+          self.nodes.append(function)
+        if isinstance(function, def_function.Function):
+          # Force listing the concrete functions for the side effects:
+          #  - populate the cache for functions that have an input_signature
+          #  and have not been called.
+          #  - force side effects of creation of concrete functions, e.g. create
+          #  variables on first run.
+          concrete_functions = (
+              function._list_all_concrete_functions_for_serialization())  # pylint: disable=protected-access
+        else:
+          concrete_functions = [function]
+        for concrete_function in concrete_functions:
+          if concrete_function.name not in seen_function_names:
+            seen_function_names.add(concrete_function.name)
+            self.concrete_functions.append(concrete_function)
+
+  @property
+  def root(self):
+    return self.nodes[0]
+
+  def fill_object_graph_proto(self, proto):
+    """Populate the nodes, children and slot_variables of a SavedObjectGraph."""
+    for node_id, node in enumerate(self.nodes):
+      assert self.node_ids[node] == node_id
+      object_proto = proto.nodes.add()
+      object_proto.slot_variables.extend(self.slot_variables.get(node, ()))
+      if isinstance(node, (def_function.Function, defun.ConcreteFunction,
+                           _CapturedConstant)):
+        continue
+      for child in self.checkpoint_view.list_dependencies(node):
+        child_proto = object_proto.children.add()
+        child_proto.node_id = self.node_ids[child.ref]
+        child_proto.local_name = child.name
+      for local_name, ref_function in (
+          self.checkpoint_view.list_functions(node).items()):
+        child_proto = object_proto.children.add()
+        child_proto.node_id = self.node_ids[ref_function]
+        child_proto.local_name = local_name
+
+  def map_resources(self):
+    """Makes new resource handle ops corresponding to existing resource tensors.
+
+    Creates resource handle ops in the current default graph, whereas
+    `accessible_objects` will be from an eager context. Resource mapping adds
+    resource handle ops to the main GraphDef of a SavedModel, which allows the
+    C++ loader API to interact with variables.
+
+    Returns:
+      A tuple of (object_map, resource_map, asset_info):
+        object_map: A dictionary mapping from object in `accessible_objects` to
+          replacement objects created to hold the new resource tensors.
+        resource_map: A dictionary mapping from resource tensors extracted from
+          `accessible_objects` to newly created resource tensors.
+        asset_info: An _AssetInfo tuple describing external assets referenced
+          from accessible_objects.
+    """
+    # Only makes sense when adding to the export Graph
+    assert not context.executing_eagerly()
+    # TODO(allenl): Handle MirroredVariables and other types of variables which
+    # may need special casing.
+    object_map = object_identity.ObjectIdentityDictionary()
+    resource_map = {}
+    asset_info = _AssetInfo(
+        asset_defs=[],
+        asset_initializers_by_resource={},
+        asset_filename_map={},
+        asset_index={})
+    for node_id, obj in enumerate(self.nodes):
+      if isinstance(obj, tracking.TrackableResource):
+        new_resource = obj.create_resource()
+        resource_map[obj.resource_handle] = new_resource
+        self.captured_tensor_node_ids[obj.resource_handle] = node_id
+      elif resource_variable_ops.is_resource_variable(obj):
+        new_variable = resource_variable_ops.copy_to_graph_uninitialized(obj)
+        object_map[obj] = new_variable
+        resource_map[obj.handle] = new_variable.handle
+        self.captured_tensor_node_ids[obj.handle] = node_id
+      elif isinstance(obj, tracking.TrackableAsset):
+        _process_asset(obj, asset_info, resource_map)
+        self.captured_tensor_node_ids[obj.asset_path] = node_id
+
+    for concrete_function in self.concrete_functions:
+      for capture in concrete_function.captured_inputs:
+        if (isinstance(capture, ops.EagerTensor)
+            and capture.dtype not in _UNCOPIABLE_DTYPES
+            and capture not in self.captured_tensor_node_ids):
+          copied_tensor = constant_op.constant(capture.numpy())
+          node_id = len(self.nodes)
+          node = _CapturedConstant(
+              eager_tensor=capture, graph_tensor=copied_tensor)
+          self.nodes.append(node)
+          self.node_ids[capture] = node_id
+          self.node_ids[node] = node_id
+          self.captured_tensor_node_ids[capture] = node_id
+          resource_map[capture] = copied_tensor
+
+    return object_map, resource_map, asset_info
 
 
 def _tensor_dict_to_tensorinfo(tensor_dict):
-  return {key: utils_impl.build_tensor_info(value)
+  return {key: utils_impl.build_tensor_info_internal(value)
           for key, value in tensor_dict.items()}
 
 
@@ -204,18 +272,12 @@ def _map_captures_to_created_tensors(
   for exterior, interior in original_captures.items():
     mapped_resource = resource_map.get(exterior, None)
     if mapped_resource is None:
-      if exterior.dtype == dtypes.resource:
-        raise AssertionError(
-            ("Tried to export a function which references untracked stateful "
-             "object {}. Stateful TensorFlow objects (e.g. tf.Variable) must "
-             "be tracked by the main object. Objects may be tracked by "
-             "assigning them to an attribute of another tracked object, or to "
-             "an attribute of the main object directly.")
-            .format(interior))
-      else:
-        # This is a captured Tensor, but it's not a resource. We'll just add it
-        # to the graph as a constant.
-        mapped_resource = constant_op.constant(exterior.numpy())
+      raise AssertionError(
+          ("Tried to export a function which references untracked object {}."
+           "TensorFlow objects (e.g. tf.Variable) captured by functions must "
+           "be tracked by assigning them to an attribute of a tracked object "
+           "or assigned to an attribute of the main object directly.")
+          .format(interior))
     export_captures.append(mapped_resource)
   return export_captures
 
@@ -309,8 +371,8 @@ def _generate_signatures(signature_functions, resource_map):
 
   Args:
     signature_functions: A dictionary mapping string keys to concrete TensorFlow
-      functions (e.g. from `_canonicalize_signatures`) which will be used to
-      generate SignatureDefs.
+      functions (e.g. from `signature_serialization.canonicalize_signatures`)
+      which will be used to generate SignatureDefs.
     resource_map: A dictionary mapping from resource tensors in the eager
       context to resource tensors in the Graph being exported. This dictionary
       is used to re-bind resources captured by functions to tensors which will
@@ -341,13 +403,12 @@ def _generate_signatures(signature_functions, resource_map):
     mapped_inputs, exterior_argument_placeholders = (
         _map_function_arguments_to_created_inputs(
             argument_inputs, signature_key, function.name))
-    outputs = _normalize_outputs(
-        _call_function_with_mapped_captures(
-            function, mapped_inputs, resource_map),
-        function.name, signature_key)
+    outputs = _call_function_with_mapped_captures(
+        function, mapped_inputs, resource_map)
     signatures[signature_key] = signature_def_utils.build_signature_def(
         _tensor_dict_to_tensorinfo(exterior_argument_placeholders),
-        _tensor_dict_to_tensorinfo(outputs))
+        _tensor_dict_to_tensorinfo(outputs),
+        method_name=signature_constants.PREDICT_METHOD_NAME)
   return signatures
 
 
@@ -359,10 +420,13 @@ def _trace_resource_initializers(accessible_objects):
     obj.initialize()
     return constant_op.constant(1.)  # Dummy control output
 
+  def _wrap_obj_initializer(obj):
+    return lambda: _wrap_initializer(obj)
+
   for obj in accessible_objects:
     if isinstance(obj, tracking.TrackableResource):
       resource_initializers.append(def_function.function(
-          functools.partial(_wrap_initializer, obj),
+          _wrap_obj_initializer(obj),
           # All inputs are captures.
           input_signature=[]).get_concrete_function())
   return resource_initializers
@@ -402,79 +466,33 @@ def _process_asset(trackable_asset, asset_info, resource_map):
   asset_def.filename = path
   asset_def.tensor_info.name = asset_path_initializer.name
   asset_info.asset_defs.append(asset_def)
-  asset_info.asset_initializers_by_resource[original_variable.handle] = (
+  asset_info.asset_initializers_by_resource[original_variable] = (
       asset_variable.initializer)
   asset_info.asset_index[trackable_asset] = len(asset_info.asset_defs) - 1
-  resource_map[original_variable.handle] = asset_variable.handle
-
+  resource_map[original_variable] = asset_variable
 
-def _map_resources(accessible_objects):
-  """Makes new resource handle ops corresponding to existing resource tensors.
 
-  Creates resource handle ops in the current default graph, whereas
-  `accessible_objects` will be from an eager context. Resource mapping adds
-  resource handle ops to the main GraphDef of a SavedModel, which allows the C++
-  loader API to interact with variables.
-
-  Args:
-    accessible_objects: A list of objects, some of which may contain resources,
-      to create replacements for.
-
-  Returns:
-    A tuple of (object_map, resource_map, asset_info):
-      object_map: A dictionary mapping from object in `accessible_objects` to
-        replacement objects created to hold the new resource tensors.
-      resource_map: A dictionary mapping from resource tensors extracted from
-        `accessible_objects` to newly created resource tensors.
-      asset_info: An _AssetInfo tuple describing external assets referenced from
-        accessible_objects.
-  """
-  # TODO(allenl): Handle MirroredVariables and other types of variables which
-  # may need special casing.
-  object_map = {}
-  resource_map = {}
-  asset_info = _AssetInfo(
-      asset_defs=[],
-      asset_initializers_by_resource={},
-      asset_filename_map={},
-      asset_index={})
-  for obj in accessible_objects:
-    if isinstance(obj, tracking.TrackableResource):
-      new_resource = obj.create_resource()
-      resource_map[obj.resource_handle] = new_resource
-    elif resource_variable_ops.is_resource_variable(obj):
-      new_variable = resource_variable_ops.copy_to_graph_uninitialized(obj)
-      object_map[obj] = new_variable
-      resource_map[obj.handle] = new_variable.handle
-    elif isinstance(obj, tracking.TrackableAsset):
-      _process_asset(obj, asset_info, resource_map)
-  return object_map, resource_map, asset_info
-
-
-def _fill_meta_graph_def(meta_graph_def, obj, signature_functions,
-                         object_saver):
+def _fill_meta_graph_def(meta_graph_def, saveable_view, signature_functions):
   """Generates a MetaGraph which calls `signature_functions`.
 
   Args:
     meta_graph_def: The MetaGraphDef proto to fill.
-    obj: The checkpointable object being exported.
+    saveable_view: The _SaveableView being exported.
     signature_functions: A dictionary mapping signature keys to concrete
       functions containing signatures to add to the MetaGraph.
-    object_saver: A CheckpointableSaver to add to the MetaGraph.
 
   Returns:
     An _AssetInfo, which contains information to help creating the SavedModel.
   """
-  signatures = {}
   # List objects from the eager context to make sure Optimizers give us the
   # right Graph-dependent variables.
-  accessible_objects = util.list_objects(obj)
+  accessible_objects = saveable_view.nodes
   resource_initializer_functions = _trace_resource_initializers(
       accessible_objects)
   exported_graph = ops.Graph()
   resource_initializer_ops = []
   with exported_graph.as_default():
-    object_map, resource_map, asset_info = _map_resources(accessible_objects)
+    object_map, resource_map, asset_info = saveable_view.map_resources()
     for resource_initializer_function in resource_initializer_functions:
       asset_dependencies = []
       for capture in resource_initializer_function.graph.external_captures:
@@ -486,6 +504,8 @@ def _fill_meta_graph_def(meta_graph_def, obj, signature_functions,
         resource_initializer_ops.append(
             _call_function_with_mapped_captures(
                 resource_initializer_function, [], resource_map))
+    resource_initializer_ops.extend(
+        asset_info.asset_initializers_by_resource.values())
     with ops.control_dependencies(resource_initializer_ops):
       init_op = control_flow_ops.no_op()
     # Add the same op to the main_op collection and to the init_op
@@ -501,15 +521,17 @@ def _fill_meta_graph_def(meta_graph_def, obj, signature_functions,
   # gathering from the eager context so Optimizers save the right set of
   # variables, but want any operations associated with the save/restore to be in
   # the exported graph (thus the `to_graph` argument).
-  saver = object_saver.freeze(object_map=object_map, to_graph=exported_graph)
+  saver = functional_saver.Saver(
+      saveable_view.checkpoint_view.frozen_saveable_objects(
+          object_map=object_map, to_graph=exported_graph))
+
   with exported_graph.as_default():
     signatures = _generate_signatures(signature_functions, resource_map)
+    for concrete_function in saveable_view.concrete_functions:
+      concrete_function.add_to_graph()
     saver_def = saver.to_proto()
     meta_graph_def.saver_def.CopyFrom(saver_def)
   graph_def = exported_graph.as_graph_def(add_shapes=True)
-  # Clean reference cycles so repeated export()s don't make work for the garbage
-  # collector.
-  ops.dismantle_graph(exported_graph)
 
   meta_graph_def.graph_def.CopyFrom(graph_def)
   meta_graph_def.meta_info_def.tags.append(tag_constants.SERVING)
@@ -517,29 +539,27 @@ def _fill_meta_graph_def(meta_graph_def, obj, signature_functions,
   for signature_key, signature in signatures.items():
     meta_graph_def.signature_def[signature_key].CopyFrom(signature)
   meta_graph.strip_graph_default_valued_attrs(meta_graph_def)
-  return asset_info
+  return asset_info, exported_graph
 
 
-def _write_object_graph(root, export_dir, asset_file_def_index):
+def _serialize_object_graph(saveable_view, asset_file_def_index):
   """Save a SavedObjectGraph proto for `root`."""
-  # SavedObjectGraph is similar to the CheckpointableObjectGraph proto in the
+  # SavedObjectGraph is similar to the TrackableObjectGraph proto in the
   # checkpoint. It will eventually go into the SavedModel.
   proto = saved_object_graph_pb2.SavedObjectGraph()
+  saveable_view.fill_object_graph_proto(proto)
 
-  checkpointable_objects, node_ids, slot_variables = util.find_objects(root)
-  util.fill_object_graph_proto(checkpointable_objects, node_ids, slot_variables,
-                               proto)
+  coder = nested_structure_coder.StructureCoder()
+  for concrete_function in saveable_view.concrete_functions:
+    serialized = function_serialization.serialize_concrete_function(
+        concrete_function, saveable_view.captured_tensor_node_ids, coder)
+    if serialized is not None:
+      proto.concrete_functions[concrete_function.name].CopyFrom(
+          serialized)
 
-  for obj, obj_proto in zip(checkpointable_objects, proto.nodes):
+  for obj, obj_proto in zip(saveable_view.nodes, proto.nodes):
     _write_object_proto(obj, obj_proto, asset_file_def_index)
-
-  extra_asset_dir = os.path.join(
-      compat.as_bytes(export_dir),
-      compat.as_bytes(constants.EXTRA_ASSETS_DIRECTORY))
-  file_io.recursive_create_dir(extra_asset_dir)
-  object_graph_filename = os.path.join(
-      extra_asset_dir, compat.as_bytes("object_graph.pb"))
-  file_io.write_string_to_file(object_graph_filename, proto.SerializeToString())
+  return proto
 
 
 def _write_object_proto(obj, proto, asset_file_def_index):
@@ -547,15 +567,37 @@ def _write_object_proto(obj, proto, asset_file_def_index):
   if isinstance(obj, tracking.TrackableAsset):
     proto.asset.SetInParent()
     proto.asset.asset_file_def_index = asset_file_def_index[obj]
+  elif resource_variable_ops.is_resource_variable(obj):
+    proto.variable.SetInParent()
+    proto.variable.trainable = obj.trainable
+    proto.variable.dtype = obj.dtype.as_datatype_enum
+    proto.variable.shape.CopyFrom(obj.shape.as_proto())
+  elif isinstance(obj, def_function.Function):
+    proto.function.CopyFrom(
+        function_serialization.serialize_function(obj))
+  elif isinstance(obj, defun.ConcreteFunction):
+    proto.bare_concrete_function.CopyFrom(
+        function_serialization.serialize_bare_concrete_function(obj))
+  elif isinstance(obj, _CapturedConstant):
+    proto.constant.operation = obj.graph_tensor.op.name
+  elif isinstance(obj, tracking.TrackableResource):
+    proto.resource.SetInParent()
   else:
-    proto.user_object.SetInParent()
+    registered_type_proto = revived_types.serialize(obj)
+    if registered_type_proto is None:
+      # Fallback for types with no matching registration
+      registered_type_proto = saved_object_graph_pb2.SavedUserObject(
+          identifier="_generic_user_object",
+          version=versions_pb2.VersionDef(
+              producer=1, min_consumer=1, bad_consumers=[]))
+    proto.user_object.CopyFrom(registered_type_proto)
 
 
 @tf_export("saved_model.save",
            v1=["saved_model.save", "saved_model.experimental.save"])
 def save(obj, export_dir, signatures=None):
   # pylint: disable=line-too-long
-  """Exports the Checkpointable object `obj` to [SavedModel format](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md).
+  """Exports the Trackable object `obj` to [SavedModel format](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md).
 
   Example usage:
 
@@ -599,7 +641,11 @@ def save(obj, export_dir, signatures=None):
   which case outputs will be numbered, or a dictionary mapping string keys to
   `Tensor`, in which case the keys will be used to name outputs.
 
-  Since `tf.keras.Model` objects are also Checkpointable, this function can be
+  Signatures are available in objects returned by `tf.saved_model.load` as a
+  `.signatures` attribute. This is a reserved attribute: `tf.saved_model.save`
+  on an object with a custom `.signatures` attribute will raise an exception.
+
+  Since `tf.keras.Model` objects are also Trackable, this function can be
   used to export Keras models. For example, exporting with a signature
   specified:
 
@@ -685,7 +731,7 @@ def save(obj, export_dir, signatures=None):
   prior to the TensorFlow 2.0 release.
 
   Args:
-    obj: A checkpointable object to export.
+    obj: A trackable object to export.
     export_dir: A directory in which to write the SavedModel.
     signatures: Optional, either a `tf.function` with an input signature
       specified or the result of `f.get_concrete_function` on a
@@ -698,26 +744,58 @@ def save(obj, export_dir, signatures=None):
       `tf.saved_model.signature_constants` module.
 
   Raises:
-    ValueError: If `obj` is not checkpointable.
+    ValueError: If `obj` is not trackable.
+
+  @compatibility(eager)
+  Not supported when graph building. From TensorFlow 1.x,
+  `tf.enable_eager_execution()` must run first. May not be called from within a
+  function body.
+  @end_compatibility
   """
+  if not context.executing_eagerly():
+    with ops.init_scope():
+      if context.executing_eagerly():
+        raise AssertionError(
+            "tf.saved_model.save is not supported inside a traced "
+            "@tf.function. Move the call to the outer eagerly-executed "
+            "context.")
+      else:
+        raise AssertionError(
+            "tf.saved_model.save is not supported when graph building. "
+            "tf.enable_eager_execution() must run first when calling it from "
+            "TensorFlow 1.x.")
   # pylint: enable=line-too-long
-  if not isinstance(obj, base.CheckpointableBase):
+  if not isinstance(obj, base.Trackable):
     raise ValueError(
-        "Expected a Checkpointable object for export, got {}.".format(obj))
+        "Expected a Trackable object for export, got {}.".format(obj))
+
+  checkpoint_graph_view = _AugmentedGraphView(obj)
   if signatures is None:
-    # Note that we run this before saving the checkpoint, since looping over
-    # attributes may have the side effect of creating variables in some cases.
-    signatures = _find_function_to_export(obj)
+    signatures = signature_serialization.find_function_to_export(
+        checkpoint_graph_view)
+
+  signatures = signature_serialization.canonicalize_signatures(signatures)
+  signature_serialization.validate_saveable_view(checkpoint_graph_view)
+  signature_map = signature_serialization.create_signature_map(signatures)
+  checkpoint_graph_view.add_object(
+      parent_node=checkpoint_graph_view.root,
+      name_in_parent=signature_serialization.SIGNATURE_ATTRIBUTE_NAME,
+      subgraph_root=signature_map)
+
+  # Use _SaveableView to provide a frozen listing of properties and functions.
+  # Note we run this twice since, while constructing the view the first time
+  # there can be side effects of creating variables.
+  _ = _SaveableView(checkpoint_graph_view)
+  saveable_view = _SaveableView(checkpoint_graph_view)
 
-  signatures = _canonicalize_signatures(signatures)
   # TODO(allenl): Factor out some subset of SavedModelBuilder which is 2.x
   # compatible (no sessions) and share it with this export API rather than
   # making a SavedModel proto and writing it directly.
   saved_model = saved_model_pb2.SavedModel()
   meta_graph_def = saved_model.meta_graphs.add()
-  object_saver = util.CheckpointableSaver(obj)
-  asset_info = _fill_meta_graph_def(
-      meta_graph_def, obj, signatures, object_saver)
+  object_saver = util.TrackableSaver(checkpoint_graph_view)
+  asset_info, exported_graph = _fill_meta_graph_def(
+      meta_graph_def, saveable_view, signatures)
   saved_model.saved_model_schema_version = (
       constants.SAVED_MODEL_SCHEMA_VERSION)
   # So far we've just been generating protocol buffers with no I/O. Now we write
@@ -730,5 +808,11 @@ def save(obj, export_dir, signatures=None):
   path = os.path.join(
       compat.as_bytes(export_dir),
       compat.as_bytes(constants.SAVED_MODEL_FILENAME_PB))
+  object_graph_proto = _serialize_object_graph(
+      saveable_view, asset_info.asset_index)
+  meta_graph_def.object_graph_def.CopyFrom(object_graph_proto)
   file_io.write_string_to_file(path, saved_model.SerializeToString())
-  _write_object_graph(obj, export_dir, asset_info.asset_index)
+  # Clean reference cycles so repeated export()s don't make work for the garbage
+  # collector. Before this point we need to keep references to captured
+  # constants in the saved graph.
+  ops.dismantle_graph(exported_graph)
diff --git a/tensorflow/python/saved_model/save_test.py b/tensorflow/python/saved_model/save_test.py
index 97218a98eae38decc9c296a420074b7d4ec1f5e3..ca1d5738ed7a7b0d0bf8ee2488fbedba258e2c3c 100644
--- a/tensorflow/python/saved_model/save_test.py
+++ b/tensorflow/python/saved_model/save_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for checkpointable object SavedModel save."""
+"""Tests for trackable object SavedModel save."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -21,23 +21,19 @@ from __future__ import print_function
 import os
 import sys
 
-import numpy
-
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import def_function
+from tensorflow.python.eager import function
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import test_util
-from tensorflow.python.keras.engine import input_layer
-from tensorflow.python.keras.engine import training
 from tensorflow.python.keras.layers import core
-from tensorflow.python.keras.layers import merge
+from tensorflow.python.keras.optimizer_v2 import adam
 from tensorflow.python.lib.io import file_io
-from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
@@ -45,17 +41,16 @@ from tensorflow.python.saved_model import loader
 from tensorflow.python.saved_model import save
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import tag_constants
-from tensorflow.python.training import adam
-from tensorflow.python.training.checkpointable import tracking
-from tensorflow.python.training.checkpointable import util
+from tensorflow.python.training.tracking import tracking
+from tensorflow.python.training.tracking import util
+from tensorflow.python.util import compat
 
 
-class _ModelWithOptimizer(training.Model):
+class _ModelWithOptimizer(util.Checkpoint):
 
   def __init__(self):
-    super(_ModelWithOptimizer, self).__init__()
     self.dense = core.Dense(1)
-    self.optimizer = adam.AdamOptimizer(0.01)
+    self.optimizer = adam.Adam(0.01)
 
   @def_function.function(
       input_signature=(tensor_spec.TensorSpec([None, 2], dtypes.float32),
@@ -63,7 +58,7 @@ class _ModelWithOptimizer(training.Model):
   def call(self, x, y):
     with backprop.GradientTape() as tape:
       loss = math_ops.reduce_mean((self.dense(x) - y) ** 2.)
-    trainable_variables = self.trainable_variables
+    trainable_variables = self.dense.trainable_variables
     gradients = tape.gradient(loss, trainable_variables)
     self.optimizer.apply_gradients(zip(gradients, trainable_variables))
     return {"loss": loss}
@@ -92,7 +87,7 @@ def _import_and_infer(
 class SaveTest(test.TestCase):
 
   def test_method_save_signature(self):
-    root = tracking.Checkpointable()
+    root = tracking.AutoTrackable()
     root.f = def_function.function(
         lambda x: 2. * x,
         input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
@@ -104,7 +99,7 @@ class SaveTest(test.TestCase):
         _import_and_infer(save_dir, {"x": 1.}))
 
   def test_method_save_concrete(self):
-    root = tracking.Checkpointable()
+    root = tracking.AutoTrackable()
     root.f = def_function.function(
         lambda z: {"out": 2. * z})
     root.f(constant_op.constant(1.))
@@ -120,16 +115,35 @@ class SaveTest(test.TestCase):
             save_dir, {"z": 1.}, signature_key="non_default_key"))
 
   def test_non_concrete_error(self):
-    root = tracking.Checkpointable()
+    root = tracking.AutoTrackable()
     root.f = def_function.function(lambda x: 2. * x)
     root.f(constant_op.constant(1.))
     save_dir = os.path.join(self.get_temp_dir(), "saved_model")
     with self.assertRaisesRegexp(
-        ValueError, "must be converted to concrete functions"):
+        ValueError, "Expected a TensorFlow function"):
       save.save(root, save_dir, root.f)
 
+  def test_captures_unreachable_variable(self):
+    root = tracking.AutoTrackable()
+    unreachable_variable = variables.Variable([5.0, 2.0])
+    root.reachable_variable = variables.Variable([1.0, 3.0])
+
+    @def_function.function
+    def increase_variable(x):
+      return 2 * unreachable_variable * x + root.reachable_variable
+
+    root.f = increase_variable
+
+    self.assertAllEqual([101.0, 83.0],
+                        root.f(constant_op.constant([10.0, 20.0])).numpy())
+
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+
+    with self.assertRaisesRegexp(KeyError, "not reachable from root"):
+      save.save(root, save_dir)
+
   def test_nested_inputs(self):
-    root = tracking.Checkpointable()
+    root = tracking.AutoTrackable()
     root.f = def_function.function(
         lambda x: 2. * x[0],
         input_signature=([tensor_spec.TensorSpec(None, dtypes.float32),
@@ -142,7 +156,7 @@ class SaveTest(test.TestCase):
       root.f.get_concrete_function()
 
   def test_nested_outputs(self):
-    root = tracking.Checkpointable()
+    root = tracking.AutoTrackable()
     root.f = def_function.function(lambda x: (2. * x, (3. * x, 4. * x)))
     root.f(constant_op.constant(1.))
     to_save = root.f.get_concrete_function(constant_op.constant(1.))
@@ -163,7 +177,7 @@ class SaveTest(test.TestCase):
       save.save(root, save_dir, to_save)
 
   def test_variable(self):
-    root = tracking.Checkpointable()
+    root = tracking.AutoTrackable()
     root.v1 = variables.Variable(3.)
     root.v2 = variables.Variable(2.)
     root.f = def_function.function(
@@ -179,25 +193,20 @@ class SaveTest(test.TestCase):
     x = constant_op.constant([[3., 4.]])
     y = constant_op.constant([2.])
     model = _ModelWithOptimizer()
-    first_loss = model(x, y)
+    first_loss = model.call(x, y)
     save_dir = os.path.join(self.get_temp_dir(), "saved_model")
     save.save(model, save_dir, model.call)
-    second_loss = model(x, y)
+    second_loss = model.call(x, y)
     self.assertNotEqual(first_loss, second_loss)
     self.assertAllClose(
         second_loss,
         _import_and_infer(save_dir, {"x": [[3., 4.]], "y": [2.]}))
 
-  def test_trivial_save_exception(self):
-    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
-    with self.assertRaisesRegexp(ValueError, "signature"):
-      save.save(tracking.Checkpointable(), save_dir)
-
   def test_single_method_default_signature(self):
     model = _ModelWithOptimizer()
     x = constant_op.constant([[3., 4.]])
     y = constant_op.constant([2.])
-    model(x, y)
+    model.call(x, y)
     save_dir = os.path.join(self.get_temp_dir(), "saved_model")
     save.save(model, save_dir)
     self.assertIn("loss",
@@ -205,7 +214,7 @@ class SaveTest(test.TestCase):
                                     {"x": [[3., 4.]], "y": [2.]}))
 
   def test_single_function_default_signature(self):
-    model = tracking.Checkpointable()
+    model = tracking.AutoTrackable()
     model.f = def_function.function(lambda: 3., input_signature=())
     model.f()
     save_dir = os.path.join(self.get_temp_dir(), "saved_model")
@@ -213,28 +222,26 @@ class SaveTest(test.TestCase):
     self.assertAllClose({"output_0": 3.},
                         _import_and_infer(save_dir, {}))
 
-  def test_ambiguous_signatures(self):
-    model = _ModelWithOptimizer()
-    x = constant_op.constant([[3., 4.]])
-    y = constant_op.constant([2.])
-    model(x, y)
-    model.second_function = def_function.function(lambda: 1.)
+  def test_single_function_no_signature(self):
+    model = tracking.AutoTrackable()
+    model.f = def_function.function(lambda: 3.)
     save_dir = os.path.join(self.get_temp_dir(), "saved_model")
-    with self.assertRaisesRegexp(ValueError, "call.*second_function"):
-      save.save(model, save_dir)
+    save.save(model, save_dir)
 
-  def test_subclassed_no_signature(self):
+  def test_find_default_save_function(self):
 
-    class Subclassed(training.Model):
+    class ObjWithDefaultSignature(util.Checkpoint):
 
-      def call(self, inputs):
-        return inputs * 2.
+      @def_function.function(input_signature=[tensor_spec.TensorSpec(
+          shape=None, dtype=dtypes.float32)])
+      def _default_save_signature(self, x):
+        return x + x + 1
 
+    obj = ObjWithDefaultSignature()
     save_dir = os.path.join(self.get_temp_dir(), "saved_model")
-    model = Subclassed()
-    with self.assertRaisesRegexp(
-        ValueError, "no @tf.function-decorated methods"):
-      save.save(model, save_dir)
+    save.save(obj, save_dir)
+    self.assertAllClose(
+        {"output_0": 7.}, _import_and_infer(save_dir, {"x": 3.}))
 
   def test_docstring(self):
 
@@ -252,6 +259,27 @@ class SaveTest(test.TestCase):
     self.assertAllClose({"output_0": 7.},
                         _import_and_infer(save_dir, {"x": 3.}))
 
+  def test_datastructures(self):
+
+    class HasDatastructures(util.Checkpoint):
+
+      def __init__(self):
+        self.a = [1.]
+        self.a.append(variables.Variable(2.))
+        self.b = {"a": variables.Variable(3.)}
+
+      @def_function.function(input_signature=[tensor_spec.TensorSpec(
+          shape=None, dtype=dtypes.float32)])
+      def add(self, x):
+        return x + math_ops.add_n(self.a) + self.b["a"]
+
+    to_save = HasDatastructures()
+    to_save.add(constant_op.constant(1.))
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    save.save(to_save, save_dir)
+    self.assertAllClose({"output_0": 10.},
+                        _import_and_infer(save_dir, {"x": 4.}))
+
   def test_default_attr_stripping(self):
 
     class Complex(util.Checkpoint):
@@ -270,51 +298,19 @@ class SaveTest(test.TestCase):
     graph = ops.Graph()
     with graph.as_default(), self.session(graph) as session:
       loader.load(session, [tag_constants.SERVING], save_dir)
-      func, = graph._functions.values()
+      func, = [f for name, f in graph._functions.items() if "call" in name]
       complex_node, = [
           node for node in func.definition.node_def if node.op == "Complex"]
       self.assertNotIn("T", complex_node.attr)
       self.assertNotIn("Tout", complex_node.attr)
 
-  def test_export_functional_keras_model(self):
-    x = input_layer.Input((4,), name="x")
-    y = core.Dense(4, name="out")(x)
-    model = training.Model(x, y)
-    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
-    save.save(model, save_dir)
-    self.assertAllClose(
-        {"out": model(array_ops.ones([1, 4]))},
-        _import_and_infer(save_dir, {"x": [[1., 1., 1., 1.]]}))
-
-  @test_util.run_v1_only("b/120545219")
-  def test_export_functional_keras_model_after_fit(self):
-    x = input_layer.Input((1,))
-    y = core.Dense(1, name="y")(x)
-    model = training.Model(x, y)
-    model.compile(optimizer="sgd", loss="mse")
-    model.fit(x=numpy.array([[1.]]),
-              y=numpy.array([2.]), epochs=2)
-    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
-    save.save(model, save_dir)
-    self.assertAllClose(
-        {"y": model(constant_op.constant([[1.], [2.]]))},
-        _import_and_infer(save_dir, {"input_1": [[1.], [2.]]}))
-
-  def test_export_multi_input_functional_keras_model(self):
-    x1 = input_layer.Input((2,), name="x1")
-    x2 = input_layer.Input((2,), name="x2")
-    y1 = core.Dense(4)(merge.Add()([x1, x2]))
-    y2 = core.Dense(4)(merge.Multiply()([x1, x2]))
-    model = training.Model([x1, x2], [y1, y2])
+  def test_signature_attribute_reserved(self):
+    root = util.Checkpoint(signatures=variables.Variable(1.))
     save_dir = os.path.join(self.get_temp_dir(), "saved_model")
-    save.save(model, save_dir)
-    outputs = model([array_ops.ones([1, 2]), 2. * array_ops.ones([1, 2])])
-    self.assertAllClose(
-        {"dense": outputs[0], "dense_1": outputs[1]},
-        _import_and_infer(
-            save_dir,
-            {"x1": [[1., 1.]],
-             "x2": [[2., 2.]]}))
+    with self.assertRaisesRegexp(ValueError, "del obj.signatures"):
+      save.save(root, save_dir)
+    del root.signatures
+    save.save(root, save_dir)
 
 
 class AssetTests(test.TestCase):
@@ -325,6 +321,18 @@ class AssetTests(test.TestCase):
     with open(self._vocab_path, "w") as f:
       f.write("alpha\nbeta\ngamma\n")
 
+  def test_asset_path_returned(self):
+    root = tracking.AutoTrackable()
+    root.path = tracking.TrackableAsset(self._vocab_path)
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    root.get_asset = def_function.function(lambda: root.path.asset_path)
+    save.save(root, save_dir, signatures=root.get_asset.get_concrete_function())
+    second_dir = os.path.join(self.get_temp_dir(), "second_dir")
+    file_io.rename(save_dir, second_dir)
+    imported_path = _import_and_infer(second_dir, {})["output_0"]
+    self.assertIn(compat.as_str_any(second_dir),
+                  compat.as_str_any(imported_path))
+
   def test_table(self):
     initializer = lookup_ops.TextFileInitializer(
         self._vocab_path,
@@ -354,7 +362,7 @@ class AssetTests(test.TestCase):
         _import_and_infer(second_dir, {"keys": ["gamma", "beta"]}))
 
   def test_unused_asset(self):
-    root = tracking.Checkpointable()
+    root = tracking.AutoTrackable()
     root.f = def_function.function(
         lambda x: 2. * x,
         input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
@@ -366,17 +374,54 @@ class AssetTests(test.TestCase):
         {"output_0": [0.2]},
         _import_and_infer(export_dir, {"x": [0.1]}))
 
+  def test_sensible_graph_building_exception(self):
+    root = util.Checkpoint(v=variables.Variable(2.))
+    root.f = def_function.function(
+        lambda x: 2. * root.v,
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
+    export_dir = os.path.join(self.get_temp_dir(), "save_dir")
+    @def_function.function
+    def _calls_save():
+      save.save(root, export_dir)
+    with self.assertRaisesRegexp(AssertionError, "tf.function"):
+      _calls_save()
+    with ops.Graph().as_default():
+      with self.assertRaisesRegexp(AssertionError, "enable_eager_execution"):
+        save.save(root, export_dir)
+
+
+class _ModelWithOptimizerUsingDefun(util.Checkpoint):
+
+  def __init__(self):
+    self.dense = core.Dense(1)
+    self.optimizer = adam.Adam(0.01)
+
+  # Using defun due to control flow v2 cycles, b/121159261. def_function uses
+  # conds to gate variable initialization and so triggers cond reference cycles,
+  # but the thing being wrapped here does not use cond itself.
+  @function.defun(
+      input_signature=(tensor_spec.TensorSpec([None, 2], dtypes.float32),
+                       tensor_spec.TensorSpec([None], dtypes.float32)),
+  )
+  def call(self, x, y):
+    with backprop.GradientTape() as tape:
+      loss = math_ops.reduce_mean((self.dense(x) - y) ** 2.)
+    trainable_variables = self.dense.trainable_variables
+    gradients = tape.gradient(loss, trainable_variables)
+    self.optimizer.apply_gradients(zip(gradients, trainable_variables))
+    return {"loss": loss}
+
 
 class MemoryTests(test.TestCase):
 
   def setUp(self):
-    self._model = _ModelWithOptimizer()
+    self._model = _ModelWithOptimizerUsingDefun()
 
   @test_util.assert_no_garbage_created
   def test_no_reference_cycles(self):
     x = constant_op.constant([[3., 4.]])
     y = constant_op.constant([2.])
-    self._model(x, y)
+    self._model.call(x, y)
     if sys.version_info[0] < 3:
       # TODO(allenl): debug reference cycles in Python 2.x
       self.skipTest("This test only works in Python 3+. Reference cycles are "
diff --git a/tensorflow/python/saved_model/saved_model.py b/tensorflow/python/saved_model/saved_model.py
index fcde6b47e4ff10dbd84801e08597591a10818d51..9c926d789f4199666e2ffb68bdc9134751ba17e8 100644
--- a/tensorflow/python/saved_model/saved_model.py
+++ b/tensorflow/python/saved_model/saved_model.py
@@ -29,6 +29,7 @@ from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import signature_def_utils
 from tensorflow.python.saved_model import tag_constants
 from tensorflow.python.saved_model import utils
+from tensorflow.python.saved_model.load import load
 from tensorflow.python.saved_model.save import save
 # pylint: enable=unused-import
 # pylint: disable=wildcard-import
diff --git a/tensorflow/python/saved_model/saved_model_test.py b/tensorflow/python/saved_model/saved_model_test.py
index 8d94c7c989d12df965bd5cc5954d30972238ff3c..e36b8b30bf25c0d6f9b78cfdc2afee31f106f632 100644
--- a/tensorflow/python/saved_model/saved_model_test.py
+++ b/tensorflow/python/saved_model/saved_model_test.py
@@ -1084,7 +1084,7 @@ class SavedModelTest(SavedModelTestBase):
       # CheckpointedOp is a key-value table that can be saved across sessions.
       # The table register itself in SAVEABLE_OBJECTS collection.
       v1 = saver_test_utils.CheckpointedOp(name="v1")
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       v1.insert("k1", 3.0).run()
       # Once the table is restored, we can access it through this reference.
       ops.add_to_collection("table_ref", v1.table_ref)
diff --git a/tensorflow/python/saved_model/saved_object_graph.proto b/tensorflow/python/saved_model/saved_object_graph.proto
deleted file mode 100644
index 3991fbede42655e39bec93226b6295603c394cf4..0000000000000000000000000000000000000000
--- a/tensorflow/python/saved_model/saved_object_graph.proto
+++ /dev/null
@@ -1,73 +0,0 @@
-syntax = "proto3";
-
-import "tensorflow/core/protobuf/checkpointable_object_graph.proto";
-
-option cc_enable_arenas = true;
-
-package tensorflow;
-
-// A SavedObjectGraph is part of object-based SavedModels in TF 2.0. It
-// describes the directed graph of Python objects (or equivalent in other
-// languages) that make up a model, with nodes[0] at the root.
-
-// SavedObjectGraph shares some structure with CheckpointableObjectGraph, but
-// ObjectGraph belongs to the SavedModel and contains pointers to functions and
-// type information, while CheckpointableObjectGraph lives in the checkpoint and
-// contains pointers only to variable values.
-
-// NOTE: This protocol buffer format is experimental and subject to change.
-
-message SavedObjectGraph {
-  // List of objects in the SavedModel.
-  //
-  // The position of the object in this list indicates its id.
-  // Nodes[0] is considered the root node.
-  repeated SavedObject nodes = 1;
-}
-
-message SavedObject {
-  // Objects which this object depends on: named edges in the dependency
-  // graph.
-  //
-  // Note: only valid if kind == "object".
-  repeated CheckpointableObjectGraph.CheckpointableObject.ObjectReference
-      children = 1;
-
-  // Removed when forking from CheckpointableObjectGraph.
-  reserved "attributes";
-  reserved 2;
-
-  // Slot variables owned by this object. This describes the three-way
-  // (optimizer, variable, slot variable) relationship; none of the three
-  // depend on the others directly.
-  //
-  // Note: only valid if kind == "object".
-  repeated CheckpointableObjectGraph.CheckpointableObject.SlotVariableReference
-      slot_variables = 3;
-
-  oneof kind {
-    SavedUserObject user_object = 4;
-    SavedAsset asset = 5;
-  }
-}
-
-// A SavedUserObject is an object (in the object-oriented language of the
-// TensorFlow program) of some user- or framework-defined class other than
-// those handled specifically by the other kinds of SavedObjects.
-//
-// This object cannot be evaluated as a tensor, and therefore cannot be bound
-// to an input of a function.
-message SavedUserObject {}
-
-// A SavedAsset represents a file in a SavedModel.
-//
-// When bound to a function this object evaluates to a Variable from which the
-// absolute filename can be read. Users should not expect the filename to be
-// maintained.
-message SavedAsset {
-  // Index into `MetaGraphDef.asset_file_def[]` that describes the Asset.
-  //
-  // Only the field `AssetFileDef.filename` is used. Other fields, such as
-  // `AssetFileDef.tensor_info`, MUST be ignored.
-  uint32 asset_file_def_index = 1;
-}
diff --git a/tensorflow/python/saved_model/signature_constants.py b/tensorflow/python/saved_model/signature_constants.py
index 0efe1763430eade223801b63f958405212eebe34..525d18d18e186c3a9bc551150a7fe5fcd60f0356 100644
--- a/tensorflow/python/saved_model/signature_constants.py
+++ b/tensorflow/python/saved_model/signature_constants.py
@@ -136,6 +136,9 @@ tf_export(
 ################################################################################
 # Train/Eval API constants.
 # Not exported while export_all_saved_models is experimental.
+DEFAULT_TRAIN_SIGNATURE_DEF_KEY = "train"
+
+DEFAULT_EVAL_SIGNATURE_DEF_KEY = "eval"
 
 SUPERVISED_TRAIN_METHOD_NAME = "tensorflow/supervised/training"
 
diff --git a/tensorflow/python/saved_model/signature_def_utils_impl.py b/tensorflow/python/saved_model/signature_def_utils_impl.py
index f6e6e1d13ecdea684f14dcaaa39f1c66f72ac352..2e0a0afeec630eb97467d6967d989dd9bf5ce898 100644
--- a/tensorflow/python/saved_model/signature_def_utils_impl.py
+++ b/tensorflow/python/saved_model/signature_def_utils_impl.py
@@ -30,7 +30,6 @@ from tensorflow.python.util.tf_export import tf_export
 
 
 @tf_export(
-    'saved_model.build_signature_def',
     v1=[
         'saved_model.build_signature_def',
         'saved_model.signature_def_utils.build_signature_def'
@@ -63,7 +62,6 @@ def build_signature_def(inputs=None, outputs=None, method_name=None):
 
 
 @tf_export(
-    'saved_model.regression_signature_def',
     v1=[
         'saved_model.regression_signature_def',
         'saved_model.signature_def_utils.regression_signature_def'
@@ -112,7 +110,6 @@ def regression_signature_def(examples, predictions):
 
 
 @tf_export(
-    'saved_model.classification_signature_def',
     v1=[
         'saved_model.classification_signature_def',
         'saved_model.signature_def_utils.classification_signature_def'
@@ -172,7 +169,6 @@ def classification_signature_def(examples, classes, scores):
 
 
 @tf_export(
-    'saved_model.predict_signature_def',
     v1=[
         'saved_model.predict_signature_def',
         'saved_model.signature_def_utils.predict_signature_def'
@@ -270,7 +266,6 @@ def _supervised_signature_def(
 
 
 @tf_export(
-    'saved_model.is_valid_signature',
     v1=[
         'saved_model.is_valid_signature',
         'saved_model.signature_def_utils.is_valid_signature'
diff --git a/tensorflow/python/saved_model/signature_serialization.py b/tensorflow/python/saved_model/signature_serialization.py
new file mode 100644
index 0000000000000000000000000000000000000000..9cfd5b14eff25644b55dbed0c78f5d4ff24b6891
--- /dev/null
+++ b/tensorflow/python/saved_model/signature_serialization.py
@@ -0,0 +1,263 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Helpers for working with signatures in tf.saved_model.save."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+from tensorflow.python.eager import def_function
+from tensorflow.python.eager import function as defun
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_spec
+from tensorflow.python.saved_model import revived_types
+from tensorflow.python.saved_model import signature_constants
+from tensorflow.python.training.tracking import base
+from tensorflow.python.util import compat
+from tensorflow.python.util import nest
+
+
+DEFAULT_SIGNATURE_ATTR = "_default_save_signature"
+SIGNATURE_ATTRIBUTE_NAME = "signatures"
+
+
+def _get_signature(function):
+  if (isinstance(function, (defun.Function, def_function.Function)) and
+      function._input_signature is not None):  # pylint: disable=protected-access
+    function = function.get_concrete_function()
+  if not isinstance(function, defun.ConcreteFunction):
+    return None
+  return function
+
+
+def _valid_signature(concrete_function):
+  """Returns whether concrete function can be converted to a signature."""
+  if not concrete_function.outputs:
+    # Functions without outputs don't make sense as signatures. We just don't
+    # have any way to run an Operation with no outputs as a SignatureDef in the
+    # 1.x style.
+    return False
+  try:
+    _normalize_outputs(concrete_function.structured_outputs, "unused", "unused")
+  except ValueError:
+    return False
+  return True
+
+
+def find_function_to_export(saveable_view):
+  """Function to export, None if no suitable function was found."""
+  # If the user did not specify signatures, check the root object for a function
+  # that can be made into a signature.
+  functions = saveable_view.list_functions(saveable_view.root)
+  signature = functions.get(DEFAULT_SIGNATURE_ATTR, None)
+  if signature is not None:
+    return signature
+
+  # TODO(andresp): Discuss removing this behaviour. It can lead to WTFs when a
+  # user decides to annotate more functions with tf.function and suddenly
+  # serving that model way later in the process stops working.
+  possible_signatures = []
+  for function in functions.values():
+    concrete = _get_signature(function)
+    if concrete is not None and _valid_signature(concrete):
+      possible_signatures.append(concrete)
+  if len(possible_signatures) == 1:
+    single_function = possible_signatures[0]
+    signature = _get_signature(single_function)
+    if signature and  _valid_signature(signature):
+      return signature
+  return None
+
+
+def canonicalize_signatures(signatures):
+  """Converts `signatures` into a dictionary of concrete functions."""
+  if signatures is None:
+    return {}
+  if not isinstance(signatures, collections.Mapping):
+    signatures = {
+        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: signatures}
+  concrete_signatures = {}
+  for signature_key, function in signatures.items():
+    signature_function = _get_signature(function)
+    if signature_function is None:
+      raise ValueError(
+          ("Expected a TensorFlow function to generate a signature for, but "
+           "got {}. Only `tf.functions` with an input signature or "
+           "concrete functions can be used as a signature.").format(function))
+
+    # Re-wrap the function so that it returns a dictionary of Tensors. This
+    # matches the format of 1.x-style signatures.
+    # pylint: disable=cell-var-from-loop
+    @def_function.function
+    def signature_wrapper(**kwargs):
+      structured_outputs = signature_function(**kwargs)
+      return _normalize_outputs(
+          structured_outputs, signature_function.name, signature_key)
+    # TODO(b/123902469): Use ConcreteFunction.structured_inputs once their names
+    # always match keyword arguments.
+    tensor_spec_signature = {}
+    for keyword, tensor in zip(
+        signature_function._arg_keywords,  # pylint: disable=protected-access
+        signature_function.inputs):
+      keyword = compat.as_str(keyword)
+      tensor_spec_signature[keyword] = tensor_spec.TensorSpec.from_tensor(
+          tensor, name=keyword)
+    final_concrete = signature_wrapper.get_concrete_function(
+        **tensor_spec_signature)
+    # pylint: disable=protected-access
+    if len(final_concrete._arg_keywords) == 1:
+      # If there is only one input to the signature, a very common case, then
+      # ordering is unambiguous and we can let people pass a positional
+      # argument. Since SignatureDefs are unordered (protobuf "map") multiple
+      # arguments means we need to be keyword-only.
+      final_concrete._num_positional_args = 1
+    else:
+      final_concrete._num_positional_args = 0
+    # pylint: enable=protected-access
+    concrete_signatures[signature_key] = final_concrete
+    # pylint: enable=cell-var-from-loop
+  return concrete_signatures
+
+
+def _is_flat(sequence):
+  sequence_flat = nest.flatten(sequence)
+  try:
+    nest.assert_same_structure(sequence_flat, sequence)
+    return True
+  except ValueError:
+    return False
+  except TypeError:
+    return False
+
+
+def _normalize_outputs(outputs, function_name, signature_key):
+  """Construct an output dictionary from unnormalized function outputs."""
+  if isinstance(outputs, collections.Mapping):
+    for key, value in outputs.items():
+      if not isinstance(value, ops.Tensor):
+        raise ValueError(
+            ("Got a dictionary containing non-Tensor value {} for key {} "
+             "in the output of the function {} used to generate a SavedModel "
+             "signature. Dictionaries outputs for functions used as signatures "
+             "should have one Tensor output per string key.")
+            .format(value, key, compat.as_str_any(function_name)))
+    return outputs
+  else:
+    original_outputs = outputs
+    if not isinstance(outputs, collections.Sequence):
+      outputs = [outputs]
+    if not _is_flat(outputs):
+      raise ValueError(
+          ("Got non-flat outputs '{}' from '{}' for SavedModel "
+           "signature '{}'. Signatures have one Tensor per output, so "
+           "to have predictable names Python functions used to generate "
+           "these signatures should avoid outputting Tensors in nested "
+           "structures.")
+          .format(original_outputs, function_name, signature_key))
+    return {("output_{}".format(output_index)): output
+            for output_index, output
+            in enumerate(outputs)}
+
+
+# _SignatureMap is immutable to ensure that users do not expect changes to be
+# reflected in the SavedModel. Using public APIs, tf.saved_model.load() is the
+# only way to create a _SignatureMap and there is no way to modify it. So we can
+# safely ignore/overwrite ".signatures" attributes attached to objects being
+# saved if they contain a _SignatureMap. A ".signatures" attribute containing
+# any other type (e.g. a regular dict) will raise an exception asking the user
+# to first "del obj.signatures" if they want it overwritten.
+class _SignatureMap(collections.Mapping, base.Trackable):
+  """A collection of SavedModel signatures."""
+
+  def __init__(self):
+    self._signatures = {}
+
+  def _add_signature(self, name, concrete_function):
+    """Adds a signature to the _SignatureMap."""
+    # Ideally this object would be immutable, but restore is streaming so we do
+    # need a private API for adding new signatures to an existing object.
+    self._signatures[name] = concrete_function
+
+  def __getitem__(self, key):
+    return self._signatures[key]
+
+  def __iter__(self):
+    return iter(self._signatures)
+
+  def __len__(self):
+    return len(self._signatures)
+
+  def __repr__(self):
+    return "_SignatureMap({})".format(self._signatures)
+
+  def _list_functions_for_serialization(self):
+    return {
+        key: value for key, value in self.items()
+        if isinstance(value, (def_function.Function, defun.ConcreteFunction))
+    }
+
+
+revived_types.register_revived_type(
+    "signature_map",
+    lambda obj: isinstance(obj, _SignatureMap),
+    versions=[revived_types.VersionedTypeRegistration(
+        # Standard dependencies are enough to reconstruct the trackable
+        # items in dictionaries, so we don't need to save any extra information.
+        object_factory=lambda proto: _SignatureMap(),
+        version=1,
+        min_producer_version=1,
+        min_consumer_version=1,
+        setter=_SignatureMap._add_signature  # pylint: disable=protected-access
+    )])
+
+
+def create_signature_map(signatures):
+  """Creates an object containing `signatures`."""
+  signature_map = _SignatureMap()
+  for name, func in signatures.items():
+    # This true of any signature that came from canonicalize_signatures. Here as
+    # a sanity check on saving; crashing on load (e.g. in _add_signature) would
+    # be more problematic in case future export changes violated these
+    # assertions.
+    assert isinstance(func, defun.ConcreteFunction)
+    assert isinstance(func.structured_outputs, collections.Mapping)
+    # pylint: disable=protected-access
+    if len(func._arg_keywords) == 1:
+      assert 1 == func._num_positional_args
+    else:
+      assert 0 == func._num_positional_args
+    signature_map._add_signature(name, func)
+    # pylint: enable=protected-access
+  return signature_map
+
+
+def validate_saveable_view(saveable_view):
+  """Performs signature-related sanity checks on `saveable_view`."""
+  for name, dep in saveable_view.list_dependencies(
+      saveable_view.root):
+    if name == SIGNATURE_ATTRIBUTE_NAME:
+      if not isinstance(dep, _SignatureMap):
+        raise ValueError(
+            ("Exporting an object {} which has an attribute named "
+             "'{signatures}'. This is a reserved attribute used to store "
+             "SavedModel signatures in objects which come from "
+             "`tf.saved_model.load`. Delete this attribute "
+             "(e.g. 'del obj.{signatures}') before saving if this shadowing is "
+             "acceptable.").format(
+                 saveable_view.root,
+                 signatures=SIGNATURE_ATTRIBUTE_NAME))
+      break
diff --git a/tensorflow/python/saved_model/tag_constants.py b/tensorflow/python/saved_model/tag_constants.py
index 8c84c9fbe4d8e65273433dc98f9da34a2183f90e..7793d4921444de45966023aff087e7865068e251 100644
--- a/tensorflow/python/saved_model/tag_constants.py
+++ b/tensorflow/python/saved_model/tag_constants.py
@@ -33,7 +33,7 @@ tf_export(
 # Tag for the `training` graph.
 TRAINING = "train"
 tf_export(
-    "saved_model.TRANING",
+    "saved_model.TRAINING",
     v1=["saved_model.TRAINING",
         "saved_model.tag_constants.TRAINING"]).export_constant(
             __name__, "TRAINING")
diff --git a/tensorflow/python/saved_model/utils_impl.py b/tensorflow/python/saved_model/utils_impl.py
index 5caabe59fec1a0819629bd9ff16ad5be19f0890a..2e7b2080574e875233181a1476eb328a07e718c5 100644
--- a/tensorflow/python/saved_model/utils_impl.py
+++ b/tensorflow/python/saved_model/utils_impl.py
@@ -22,6 +22,7 @@ import os
 
 from tensorflow.core.framework import types_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
@@ -49,11 +50,21 @@ def build_tensor_info(tensor):
   Args:
     tensor: Tensor or SparseTensor whose name, dtype and shape are used to
         build the TensorInfo. For SparseTensors, the names of the three
-        constitutent Tensors are used.
+        constituent Tensors are used.
 
   Returns:
     A TensorInfo protocol buffer constructed based on the supplied argument.
+
+  Raises:
+    RuntimeError: If eager execution is enabled.
   """
+  if context.executing_eagerly():
+    raise RuntimeError("build_tensor_info is not supported in Eager mode.")
+  return build_tensor_info_internal(tensor)
+
+
+def build_tensor_info_internal(tensor):
+  """Utility function to build TensorInfo proto from a Tensor."""
   tensor_info = meta_graph_pb2.TensorInfo(
       dtype=dtypes.as_dtype(tensor.dtype).as_datatype_enum,
       tensor_shape=tensor.get_shape().as_proto())
diff --git a/tensorflow/python/saved_model/utils_test.py b/tensorflow/python/saved_model/utils_test.py
index 2afe8abfd646f26f0562d7cc56b82c5781a586ef..1e12de91b8652328632010d716f75f551aaab2db 100644
--- a/tensorflow/python/saved_model/utils_test.py
+++ b/tensorflow/python/saved_model/utils_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.core.framework import types_pb2
+from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -81,6 +82,12 @@ class UtilsTest(test.TestCase):
     self.assertEqual(42, x_tensor_info.tensor_shape.dim[0].size)
     self.assertEqual(69, x_tensor_info.tensor_shape.dim[1].size)
 
+  def testBuildTensorInfoEager(self):
+    x = constant_op.constant(1, name="x")
+    with context.eager_mode(), self.assertRaisesRegexp(
+        RuntimeError, "build_tensor_info is not supported in Eager mode"):
+      utils.build_tensor_info(x)
+
   @test_util.run_v1_only("b/120545219")
   def testGetTensorFromInfoDense(self):
     expected = array_ops.placeholder(dtypes.float32, 1, name="x")
diff --git a/tensorflow/python/summary/summary.py b/tensorflow/python/summary/summary.py
index 0c13016712f316e113723c4c0c250ef636a3fcf0..a01feb3dde041de2ca33f5f4d9fea6a1b6869d41 100644
--- a/tensorflow/python/summary/summary.py
+++ b/tensorflow/python/summary/summary.py
@@ -13,9 +13,10 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Tensor summaries for exporting information about a model.
+"""Operations for writing summary data, for use in analysis and visualization.
 
-See the [Summary](https://tensorflow.org/api_guides/python/summary) guide.
+See the [Summaries and
+TensorBoard](https://www.tensorflow.org/guide/summaries_and_tensorboard) guide.
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/python/summary/summary_iterator.py b/tensorflow/python/summary/summary_iterator.py
index 321b11ffb73487405428340df94010ed8ddbfcd4..3675c235cfba1063bf2e338fd223dce6c540bec6 100644
--- a/tensorflow/python/summary/summary_iterator.py
+++ b/tensorflow/python/summary/summary_iterator.py
@@ -24,7 +24,7 @@ from tensorflow.python.lib.io import tf_record
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('train.summary_iterator')
+@tf_export(v1=['train.summary_iterator'])
 def summary_iterator(path):
   # pylint: disable=line-too-long
   """An iterator for reading `Event` protocol buffers from an event file.
diff --git a/tensorflow/python/summary/writer/writer.py b/tensorflow/python/summary/writer/writer.py
index 78217b503ffac90811c6ae8316bc0c0b907e7bf7..a66be4f833713d106deda15fef56f48ef4a321d3 100644
--- a/tensorflow/python/summary/writer/writer.py
+++ b/tensorflow/python/summary/writer/writer.py
@@ -279,7 +279,7 @@ class SummaryToEventTransformer(object):
     self.event_writer.add_event(event)
 
 
-@tf_export("summary.FileWriter")
+@tf_export(v1=["summary.FileWriter"])
 class FileWriter(SummaryToEventTransformer):
   """Writes `Summary` protocol buffers to event files.
 
diff --git a/tensorflow/python/summary/writer/writer_cache.py b/tensorflow/python/summary/writer/writer_cache.py
index 645fa28a37fb125b6b1224961251bc8879d5fe6d..c62a7ce1a3f6eb6cd223f70dabd478b2dba24394 100644
--- a/tensorflow/python/summary/writer/writer_cache.py
+++ b/tensorflow/python/summary/writer/writer_cache.py
@@ -25,7 +25,7 @@ from tensorflow.python.summary.writer.writer import FileWriter
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('summary.FileWriterCache')
+@tf_export(v1=['summary.FileWriterCache'])
 class FileWriterCache(object):
   """Cache for file writers.
 
diff --git a/tensorflow/python/tools/BUILD b/tensorflow/python/tools/BUILD
index 901d6bc335f3a10439e2f02d0db2b237a89fece0..e483155dcfbc9e93c8b8aa28e83b6122ec99822e 100644
--- a/tensorflow/python/tools/BUILD
+++ b/tensorflow/python/tools/BUILD
@@ -13,15 +13,19 @@ load("//tensorflow:tensorflow.bzl", "py_binary")
 # Transitive dependencies of this target will be included in the pip package.
 py_library(
     name = "tools_pip",
-    deps = [
+    data = [
         ":freeze_graph",
         ":import_pb_to_tensorboard",
         ":inspect_checkpoint",
         ":optimize_for_inference",
         ":print_selective_registration_header",
         ":saved_model_cli",
-        ":saved_model_utils",
         ":strip_unused",
+        # Include the TF upgrade script to users can run it directly after install TF
+        "//tensorflow/tools/compatibility:tf_upgrade_v2",
+    ],
+    deps = [
+        ":saved_model_utils",
         # The following py_library are needed because
         # py_binary may not depend on them when --define=no_tensorflow_py_deps=true
         # is specified. See https://github.com/tensorflow/tensorflow/issues/22390
@@ -29,8 +33,6 @@ py_library(
         ":optimize_for_inference_lib",
         ":selective_registration_header_lib",
         ":strip_unused_lib",
-        # Include the TF upgrade script to users can run it directly after install TF
-        "//tensorflow/tools/compatibility:tf_upgrade_v2",
     ],
 )
 
@@ -38,7 +40,20 @@ py_library(
     name = "saved_model_utils",
     srcs = ["saved_model_utils.py"],
     srcs_version = "PY2AND3",
-    deps = ["//tensorflow/contrib/saved_model:reader"],
+)
+
+py_test(
+    name = "saved_model_utils_test",
+    size = "small",
+    srcs = ["saved_model_utils_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
+    visibility = ["//visibility:private"],
+    deps = [
+        ":saved_model_utils",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python/saved_model",
+    ],
 )
 
 py_library(
@@ -64,6 +79,13 @@ py_binary(
     name = "freeze_graph",
     srcs = ["freeze_graph.py"],
     srcs_version = "PY2AND3",
+    deps = [":freeze_graph_main_lib"],
+)
+
+py_library(
+    name = "freeze_graph_main_lib",
+    srcs = ["freeze_graph.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":freeze_graph_lib",
     ],
@@ -73,6 +95,13 @@ py_binary(
     name = "import_pb_to_tensorboard",
     srcs = ["import_pb_to_tensorboard.py"],
     srcs_version = "PY2AND3",
+    deps = [":import_pb_to_tensorboard_lib"],
+)
+
+py_library(
+    name = "import_pb_to_tensorboard_lib",
+    srcs = ["import_pb_to_tensorboard.py"],
+    srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python",
@@ -90,7 +119,7 @@ py_test(
     srcs = ["freeze_graph_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":freeze_graph",
+        ":freeze_graph_lib",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
@@ -107,6 +136,13 @@ py_binary(
     name = "inspect_checkpoint",
     srcs = ["inspect_checkpoint.py"],
     srcs_version = "PY2AND3",
+    deps = [":inspect_checkpoint_lib"],
+)
+
+py_library(
+    name = "inspect_checkpoint_lib",
+    srcs = ["inspect_checkpoint.py"],
+    srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/python",  # TODO(b/34059704): remove when fixed
         "//tensorflow/python:platform",
@@ -180,6 +216,13 @@ py_binary(
     name = "optimize_for_inference",
     srcs = ["optimize_for_inference.py"],
     srcs_version = "PY2AND3",
+    deps = [":optimize_for_inference_main_lib"],
+)
+
+py_library(
+    name = "optimize_for_inference_main_lib",
+    srcs = ["optimize_for_inference.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":optimize_for_inference_lib",
         "//tensorflow/core:protos_all_py",
@@ -227,6 +270,14 @@ py_binary(
     srcs = ["print_selective_registration_header.py"],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
+    deps = [":print_selective_registration_header_lib"],
+)
+
+py_library(
+    name = "print_selective_registration_header_lib",
+    srcs = ["print_selective_registration_header.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
     deps = [
         ":selective_registration_header_lib",
         "//tensorflow/python:platform",
@@ -248,9 +299,15 @@ py_binary(
     name = "saved_model_cli",
     srcs = ["saved_model_cli.py"],
     srcs_version = "PY2AND3",
+    deps = [":saved_model_cli_lib"],
+)
+
+py_library(
+    name = "saved_model_cli_lib",
+    srcs = ["saved_model_cli.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":saved_model_utils",
-        "//tensorflow/contrib/saved_model:saved_model_py",
         "//tensorflow/python",
         "//tensorflow/python/debug:local_cli_wrapper",
     ],
@@ -268,7 +325,7 @@ py_test(
         "no-internal-py3",
     ],
     deps = [
-        ":saved_model_cli",
+        ":saved_model_cli_lib",
         "//tensorflow/core:protos_all_py",
     ],
 )
diff --git a/tensorflow/python/tools/api/generator/BUILD b/tensorflow/python/tools/api/generator/BUILD
index 9fd069c5be0e61083e38ecdb2f974f9d38ee9216..109c71b41d02ce6a84653044449baf9df5f088da 100644
--- a/tensorflow/python/tools/api/generator/BUILD
+++ b/tensorflow/python/tools/api/generator/BUILD
@@ -6,6 +6,8 @@ licenses(["notice"])  # Apache 2.0
 load("//tensorflow:tensorflow.bzl", "py_test")
 load("//tensorflow/python/tools/api/generator:api_init_files.bzl", "TENSORFLOW_API_INIT_FILES")
 load("//tensorflow/python/tools/api/generator:api_init_files_v1.bzl", "TENSORFLOW_API_INIT_FILES_V1")
+load("//tensorflow/python/tools/api/generator:api_init_files.bzl", "KERAS_API_INIT_FILES")
+load("//tensorflow/python/tools/api/generator:api_init_files_v1.bzl", "KERAS_API_INIT_FILES_V1")
 
 exports_files(
     [
@@ -55,7 +57,7 @@ py_test(
     args = [
         "--package=tensorflow.python",
         "--api_name=tensorflow",
-    ] + TENSORFLOW_API_INIT_FILES + TENSORFLOW_API_INIT_FILES_V1,
+    ] + KERAS_API_INIT_FILES + KERAS_API_INIT_FILES_V1 + TENSORFLOW_API_INIT_FILES + TENSORFLOW_API_INIT_FILES_V1,
     main = "doc_srcs_test.py",
     srcs_version = "PY2AND3",
     deps = [
diff --git a/tensorflow/python/tools/api/generator/api_init_files.bzl b/tensorflow/python/tools/api/generator/api_init_files.bzl
index 0245ac50a65a99a4e93733de17d680fe816e7db1..a1dd37f42f2d29cfe964e9155e31d4d335028019 100644
--- a/tensorflow/python/tools/api/generator/api_init_files.bzl
+++ b/tensorflow/python/tools/api/generator/api_init_files.bzl
@@ -4,12 +4,18 @@
 TENSORFLOW_API_INIT_FILES = [
     # BEGIN GENERATED FILES
     "__init__.py",
+    "audio/__init__.py",
+    "autograph/__init__.py",
+    "autograph/experimental/__init__.py",
     "bitwise/__init__.py",
     "compat/__init__.py",
+    "config/__init__.py",
     "data/__init__.py",
     "data/experimental/__init__.py",
     "debugging/__init__.py",
     "distribute/__init__.py",
+    "distribute/cluster_resolver/__init__.py",
+    "distribute/experimental/__init__.py",
     "dtypes/__init__.py",
     "errors/__init__.py",
     "experimental/__init__.py",
@@ -18,7 +24,36 @@ TENSORFLOW_API_INIT_FILES = [
     "graph_util/__init__.py",
     "image/__init__.py",
     "io/__init__.py",
-    "initializers/__init__.py",
+    "queue/__init__.py",
+    "linalg/__init__.py",
+    "lite/__init__.py",
+    "lite/constants/__init__.py",
+    "lite/experimental/__init__.py",
+    "lite/experimental/nn/__init__.py",
+    "math/__init__.py",
+    "nest/__init__.py",
+    "nn/__init__.py",
+    "nn/rnn_cell/__init__.py",
+    "quantization/__init__.py",
+    "ragged/__init__.py",
+    "random/__init__.py",
+    "raw_ops/__init__.py",
+    "rnn/__init__.py",
+    "saved_model/__init__.py",
+    "sets/__init__.py",
+    "signal/__init__.py",
+    "sparse/__init__.py",
+    "strings/__init__.py",
+    "summary/__init__.py",
+    "sysconfig/__init__.py",
+    "test/__init__.py",
+    "train/__init__.py",
+    "version/__init__.py",
+    # END GENERATED FILES
+]
+
+KERAS_API_INIT_FILES = [
+    "__init__.py",
     "keras/__init__.py",
     "keras/activations/__init__.py",
     "keras/applications/__init__.py",
@@ -47,10 +82,12 @@ TENSORFLOW_API_INIT_FILES = [
     "keras/experimental/__init__.py",
     "keras/initializers/__init__.py",
     "keras/layers/__init__.py",
+    "keras/layers/experimental/__init__.py",
     "keras/losses/__init__.py",
     "keras/metrics/__init__.py",
     "keras/models/__init__.py",
     "keras/optimizers/__init__.py",
+    "keras/optimizers/schedules/__init__.py",
     "keras/preprocessing/__init__.py",
     "keras/preprocessing/image/__init__.py",
     "keras/preprocessing/sequence/__init__.py",
@@ -59,24 +96,4 @@ TENSORFLOW_API_INIT_FILES = [
     "keras/utils/__init__.py",
     "keras/wrappers/__init__.py",
     "keras/wrappers/scikit_learn/__init__.py",
-    "linalg/__init__.py",
-    "lite/__init__.py",
-    "lite/constants/__init__.py",
-    "losses/__init__.py",
-    "math/__init__.py",
-    "nn/__init__.py",
-    "nn/rnn_cell/__init__.py",
-    "quantization/__init__.py",
-    "random/__init__.py",
-    "saved_model/__init__.py",
-    "sets/__init__.py",
-    "signal/__init__.py",
-    "sparse/__init__.py",
-    "strings/__init__.py",
-    "summary/__init__.py",
-    "sysconfig/__init__.py",
-    "test/__init__.py",
-    "train/__init__.py",
-    "version/__init__.py",
-    # END GENERATED FILES
 ]
diff --git a/tensorflow/python/tools/api/generator/api_init_files_v1.bzl b/tensorflow/python/tools/api/generator/api_init_files_v1.bzl
index e35b9c43740d4e59e9478cca978b15c7451ac96e..43ef09b9fa8e12e2e5bd16e11e117a1909a5bf11 100644
--- a/tensorflow/python/tools/api/generator/api_init_files_v1.bzl
+++ b/tensorflow/python/tools/api/generator/api_init_files_v1.bzl
@@ -5,12 +5,18 @@ TENSORFLOW_API_INIT_FILES_V1 = [
     # BEGIN GENERATED FILES
     "__init__.py",
     "app/__init__.py",
+    "audio/__init__.py",
+    "autograph/__init__.py",
+    "autograph/experimental/__init__.py",
     "bitwise/__init__.py",
     "compat/__init__.py",
+    "config/__init__.py",
     "data/__init__.py",
     "data/experimental/__init__.py",
     "debugging/__init__.py",
     "distribute/__init__.py",
+    "distribute/cluster_resolver/__init__.py",
+    "distribute/experimental/__init__.py",
     "distributions/__init__.py",
     "dtypes/__init__.py",
     "errors/__init__.py",
@@ -21,63 +27,29 @@ TENSORFLOW_API_INIT_FILES_V1 = [
     "graph_util/__init__.py",
     "image/__init__.py",
     "io/__init__.py",
+    "queue/__init__.py",
     "initializers/__init__.py",
-    "keras/__init__.py",
-    "keras/activations/__init__.py",
-    "keras/applications/__init__.py",
-    "keras/applications/densenet/__init__.py",
-    "keras/applications/inception_resnet_v2/__init__.py",
-    "keras/applications/inception_v3/__init__.py",
-    "keras/applications/mobilenet/__init__.py",
-    "keras/applications/mobilenet_v2/__init__.py",
-    "keras/applications/nasnet/__init__.py",
-    "keras/applications/resnet50/__init__.py",
-    "keras/applications/vgg16/__init__.py",
-    "keras/applications/vgg19/__init__.py",
-    "keras/applications/xception/__init__.py",
-    "keras/backend/__init__.py",
-    "keras/callbacks/__init__.py",
-    "keras/constraints/__init__.py",
-    "keras/datasets/__init__.py",
-    "keras/datasets/boston_housing/__init__.py",
-    "keras/datasets/cifar10/__init__.py",
-    "keras/datasets/cifar100/__init__.py",
-    "keras/datasets/fashion_mnist/__init__.py",
-    "keras/datasets/imdb/__init__.py",
-    "keras/datasets/mnist/__init__.py",
-    "keras/datasets/reuters/__init__.py",
-    "keras/estimator/__init__.py",
-    "keras/experimental/__init__.py",
-    "keras/initializers/__init__.py",
-    "keras/layers/__init__.py",
-    "keras/losses/__init__.py",
-    "keras/metrics/__init__.py",
-    "keras/models/__init__.py",
-    "keras/optimizers/__init__.py",
-    "keras/preprocessing/__init__.py",
-    "keras/preprocessing/image/__init__.py",
-    "keras/preprocessing/sequence/__init__.py",
-    "keras/preprocessing/text/__init__.py",
-    "keras/regularizers/__init__.py",
-    "keras/utils/__init__.py",
-    "keras/wrappers/__init__.py",
-    "keras/wrappers/scikit_learn/__init__.py",
     "layers/__init__.py",
     "layers/experimental/__init__.py",
     "linalg/__init__.py",
     "lite/__init__.py",
     "lite/constants/__init__.py",
+    "lite/experimental/__init__.py",
+    "lite/experimental/nn/__init__.py",
     "logging/__init__.py",
     "losses/__init__.py",
     "manip/__init__.py",
     "math/__init__.py",
     "metrics/__init__.py",
+    "nest/__init__.py",
     "nn/__init__.py",
     "nn/rnn_cell/__init__.py",
     "profiler/__init__.py",
     "python_io/__init__.py",
     "quantization/__init__.py",
+    "ragged/__init__.py",
     "random/__init__.py",
+    "raw_ops/__init__.py",
     "resource_loader/__init__.py",
     "strings/__init__.py",
     "saved_model/__init__.py",
@@ -103,3 +75,49 @@ TENSORFLOW_API_INIT_FILES_V1 = [
     "version/__init__.py",
     # END GENERATED FILES
 ]
+
+KERAS_API_INIT_FILES_V1 = [
+    "__init__.py",
+    "keras/__init__.py",
+    "keras/activations/__init__.py",
+    "keras/applications/__init__.py",
+    "keras/applications/densenet/__init__.py",
+    "keras/applications/inception_resnet_v2/__init__.py",
+    "keras/applications/inception_v3/__init__.py",
+    "keras/applications/mobilenet/__init__.py",
+    "keras/applications/mobilenet_v2/__init__.py",
+    "keras/applications/nasnet/__init__.py",
+    "keras/applications/resnet50/__init__.py",
+    "keras/applications/vgg16/__init__.py",
+    "keras/applications/vgg19/__init__.py",
+    "keras/applications/xception/__init__.py",
+    "keras/backend/__init__.py",
+    "keras/callbacks/__init__.py",
+    "keras/constraints/__init__.py",
+    "keras/datasets/__init__.py",
+    "keras/datasets/boston_housing/__init__.py",
+    "keras/datasets/cifar10/__init__.py",
+    "keras/datasets/cifar100/__init__.py",
+    "keras/datasets/fashion_mnist/__init__.py",
+    "keras/datasets/imdb/__init__.py",
+    "keras/datasets/mnist/__init__.py",
+    "keras/datasets/reuters/__init__.py",
+    "keras/estimator/__init__.py",
+    "keras/experimental/__init__.py",
+    "keras/initializers/__init__.py",
+    "keras/layers/__init__.py",
+    "keras/layers/experimental/__init__.py",
+    "keras/losses/__init__.py",
+    "keras/metrics/__init__.py",
+    "keras/models/__init__.py",
+    "keras/optimizers/__init__.py",
+    "keras/optimizers/schedules/__init__.py",
+    "keras/preprocessing/__init__.py",
+    "keras/preprocessing/image/__init__.py",
+    "keras/preprocessing/sequence/__init__.py",
+    "keras/preprocessing/text/__init__.py",
+    "keras/regularizers/__init__.py",
+    "keras/utils/__init__.py",
+    "keras/wrappers/__init__.py",
+    "keras/wrappers/scikit_learn/__init__.py",
+]
diff --git a/tensorflow/python/tools/api/generator/doc_srcs.py b/tensorflow/python/tools/api/generator/doc_srcs.py
index abb5886deb3d9dd2e6981ee5822b0323a87eef1d..28bf0e9d015e6f4b28e8cfbf0dbb5a3ccec66f11 100644
--- a/tensorflow/python/tools/api/generator/doc_srcs.py
+++ b/tensorflow/python/tools/api/generator/doc_srcs.py
@@ -54,12 +54,14 @@ _TENSORFLOW_DOC_SOURCES = {
     'nn': DocSource(docstring_module_name='ops.nn_ops'),
     'nn.rnn_cell': DocSource(docstring_module_name='ops.rnn_cell'),
     'python_io': DocSource(docstring_module_name='lib.io.python_io'),
+    'ragged': DocSource(docstring_module_name='ops.ragged'),
     'resource_loader': DocSource(
         docstring_module_name='platform.resource_loader'),
     'sets': DocSource(docstring_module_name='ops.sets'),
     'signal': DocSource(docstring_module_name='ops.signal.signal'),
     'sparse': DocSource(docstring_module_name='ops.sparse_ops'),
     'strings': DocSource(docstring_module_name='ops.string_ops'),
+    'summary': DocSource(docstring_module_name='summary.summary'),
     'sysconfig': DocSource(docstring_module_name='platform.sysconfig'),
     'test': DocSource(docstring_module_name='platform.test'),
     'train': DocSource(docstring_module_name='training.training'),
diff --git a/tensorflow/python/tools/api/generator/output_init_files_test.py b/tensorflow/python/tools/api/generator/output_init_files_test.py
index ab154af9101e32ecacda276004b0e2c39ced0b83..7013f007e583b7d35dcb6f8bfdbea2fefdbb3101 100644
--- a/tensorflow/python/tools/api/generator/output_init_files_test.py
+++ b/tensorflow/python/tools/api/generator/output_init_files_test.py
@@ -45,7 +45,7 @@ def _get_modules(package, attr_name, constants_attr_name):
       API constant names.
 
   Returns:
-    Set of TensorFow API modules.
+    Set of TensorFlow API modules.
   """
   modules = set()
   # TODO(annarev): split up the logic in create_python_api.py so that
diff --git a/tensorflow/python/tools/freeze_graph.py b/tensorflow/python/tools/freeze_graph.py
index 893309f35afe96361dd639444d736f01cfc0b593..ab82ee9fd410e646c0c1f9b302d47bb3021bb514 100644
--- a/tensorflow/python/tools/freeze_graph.py
+++ b/tensorflow/python/tools/freeze_graph.py
@@ -240,13 +240,13 @@ def freeze_graph_with_def_protos(input_graph_def,
 
 
 def _parse_input_graph_proto(input_graph, input_binary):
-  """Parser input tensorflow graph into GraphDef proto."""
+  """Parses input tensorflow graph into GraphDef proto."""
   if not gfile.Exists(input_graph):
     print("Input graph file '" + input_graph + "' does not exist!")
     return -1
   input_graph_def = graph_pb2.GraphDef()
   mode = "rb" if input_binary else "r"
-  with gfile.FastGFile(input_graph, mode) as f:
+  with gfile.GFile(input_graph, mode) as f:
     if input_binary:
       input_graph_def.ParseFromString(f.read())
     else:
@@ -255,13 +255,13 @@ def _parse_input_graph_proto(input_graph, input_binary):
 
 
 def _parse_input_meta_graph_proto(input_graph, input_binary):
-  """Parser input tensorflow graph into MetaGraphDef proto."""
+  """Parses input tensorflow graph into MetaGraphDef proto."""
   if not gfile.Exists(input_graph):
     print("Input meta graph file '" + input_graph + "' does not exist!")
     return -1
   input_meta_graph_def = MetaGraphDef()
   mode = "rb" if input_binary else "r"
-  with gfile.FastGFile(input_graph, mode) as f:
+  with gfile.GFile(input_graph, mode) as f:
     if input_binary:
       input_meta_graph_def.ParseFromString(f.read())
     else:
@@ -271,12 +271,12 @@ def _parse_input_meta_graph_proto(input_graph, input_binary):
 
 
 def _parse_input_saver_proto(input_saver, input_binary):
-  """Parser input tensorflow Saver into SaverDef proto."""
+  """Parses input tensorflow Saver into SaverDef proto."""
   if not gfile.Exists(input_saver):
     print("Input saver file '" + input_saver + "' does not exist!")
     return -1
   mode = "rb" if input_binary else "r"
-  with gfile.FastGFile(input_saver, mode) as f:
+  with gfile.GFile(input_saver, mode) as f:
     saver_def = saver_pb2.SaverDef()
     if input_binary:
       saver_def.ParseFromString(f.read())
diff --git a/tensorflow/python/tools/import_pb_to_tensorboard.py b/tensorflow/python/tools/import_pb_to_tensorboard.py
index 6d2fec3ad6ea193dd72bb29a5f5450f5356d4f1a..edfdb77b90bd26cf6979c5462ccf4703b85b8185 100644
--- a/tensorflow/python/tools/import_pb_to_tensorboard.py
+++ b/tensorflow/python/tools/import_pb_to_tensorboard.py
@@ -53,7 +53,7 @@ def import_to_tensorboard(model_dir, log_dir):
     View your imported `.pb` model as a graph.
   """
   with session.Session(graph=ops.Graph()) as sess:
-    with gfile.FastGFile(model_dir, "rb") as f:
+    with gfile.GFile(model_dir, "rb") as f:
       graph_def = graph_pb2.GraphDef()
       graph_def.ParseFromString(f.read())
       importer.import_graph_def(graph_def)
diff --git a/tensorflow/python/tools/optimize_for_inference.py b/tensorflow/python/tools/optimize_for_inference.py
index fbf8c2d70999cc5a92c220754b0f8e2287fb6644..693e34348bd14ce7de96d9ed12516267594d5abf 100644
--- a/tensorflow/python/tools/optimize_for_inference.py
+++ b/tensorflow/python/tools/optimize_for_inference.py
@@ -92,7 +92,7 @@ def main(unused_args):
       FLAGS.toco_compatible)
 
   if FLAGS.frozen_graph:
-    f = gfile.FastGFile(FLAGS.output, "w")
+    f = gfile.GFile(FLAGS.output, "w")
     f.write(output_graph_def.SerializeToString())
   else:
     graph_io.write_graph(output_graph_def,
diff --git a/tensorflow/python/tools/saved_model_cli.py b/tensorflow/python/tools/saved_model_cli.py
index c4c3756c0407f2ed6a6a411b6778b2431428eea6..cdef42e2bf8df4834677bb809194183332c6f279 100644
--- a/tensorflow/python/tools/saved_model_cli.py
+++ b/tensorflow/python/tools/saved_model_cli.py
@@ -30,9 +30,8 @@ import sys
 import warnings
 
 import numpy as np
-
 from six import integer_types
-from tensorflow.contrib.saved_model.python.saved_model import reader
+
 from tensorflow.core.example import example_pb2
 from tensorflow.core.framework import types_pb2
 from tensorflow.python.client import session
@@ -56,7 +55,7 @@ def _show_tag_sets(saved_model_dir):
   Args:
     saved_model_dir: Directory containing the SavedModel to inspect.
   """
-  tag_sets = reader.get_saved_model_tag_sets(saved_model_dir)
+  tag_sets = saved_model_utils.get_saved_model_tag_sets(saved_model_dir)
   print('The given SavedModel contains the following tag-sets:')
   for tag_set in sorted(tag_sets):
     print(', '.join(sorted(tag_set)))
@@ -190,7 +189,7 @@ def _show_all(saved_model_dir):
   Args:
     saved_model_dir: Directory containing the SavedModel to inspect.
   """
-  tag_sets = reader.get_saved_model_tag_sets(saved_model_dir)
+  tag_sets = saved_model_utils.get_saved_model_tag_sets(saved_model_dir)
   for tag_set in sorted(tag_sets):
     print("\nMetaGraphDef with tag-set: '%s' "
           "contains the following SignatureDefs:" % ', '.join(tag_set))
@@ -654,11 +653,33 @@ def scan(args):
     scan_meta_graph_def(
         saved_model_utils.get_meta_graph_def(args.dir, args.tag_set))
   else:
-    saved_model = reader.read_saved_model(args.dir)
+    saved_model = saved_model_utils.read_saved_model(args.dir)
     for meta_graph_def in saved_model.meta_graphs:
       scan_meta_graph_def(meta_graph_def)
 
 
+def convert_with_tensorrt(args):
+  """Function triggered by 'convert tensorrt' command.
+
+  Args:
+    args: A namespace parsed from command line.
+  """
+  # Import here instead of at top, because this will crash if TensorRT is
+  # not installed
+  from tensorflow.contrib import tensorrt  # pylint: disable=g-import-not-at-top
+  tensorrt.create_inference_graph(
+      None,
+      None,
+      max_batch_size=args.max_batch_size,
+      max_workspace_size_bytes=args.max_workspace_size_bytes,
+      precision_mode=args.precision_mode,
+      minimum_segment_size=args.minimum_segment_size,
+      is_dynamic_op=args.is_dynamic_op,
+      input_saved_model_dir=args.dir,
+      input_saved_model_tags=args.tag_set.split(','),
+      output_saved_model_dir=args.output_dir)
+
+
 def create_parser():
   """Creates a parser that parse the command line arguments.
 
@@ -812,6 +833,71 @@ def create_parser():
       help='tag-set of graph in SavedModel to scan, separated by \',\'')
   parser_scan.set_defaults(func=scan)
 
+  # convert command
+  convert_msg = ('Usage example:\n'
+                 'To convert the SavedModel to one that have TensorRT ops:\n'
+                 '$saved_model_cli convert \\\n'
+                 '   --dir /tmp/saved_model \\\n'
+                 '   --tag_set serve \\\n'
+                 '   --output_dir /tmp/saved_model_trt \\\n'
+                 '   tensorrt \n')
+  parser_convert = subparsers.add_parser(
+      'convert',
+      description=convert_msg,
+      formatter_class=argparse.RawTextHelpFormatter)
+  parser_convert.add_argument(
+      '--dir',
+      type=str,
+      required=True,
+      help='directory containing the SavedModel to convert')
+  parser_convert.add_argument(
+      '--output_dir',
+      type=str,
+      required=True,
+      help='output directory for the converted SavedModel')
+  parser_convert.add_argument(
+      '--tag_set',
+      type=str,
+      required=True,
+      help='tag-set of graph in SavedModel to convert, separated by \',\'')
+  convert_subparsers = parser_convert.add_subparsers(
+      title='conversion methods',
+      description='valid conversion methods',
+      help='the conversion to run with the SavedModel')
+  parser_convert_with_tensorrt = convert_subparsers.add_parser(
+      'tensorrt',
+      description='Convert the SavedModel with Tensorflow-TensorRT integration',
+      formatter_class=argparse.RawTextHelpFormatter)
+  parser_convert_with_tensorrt.add_argument(
+      '--max_batch_size',
+      type=int,
+      default=1,
+      help='max size for the input batch')
+  parser_convert_with_tensorrt.add_argument(
+      '--max_workspace_size_bytes',
+      type=int,
+      default=2 << 20,
+      help=('the maximum GPU temporary memory which the TRT engine can use at '
+            'execution time'))
+  parser_convert_with_tensorrt.add_argument(
+      '--precision_mode',
+      type=str,
+      default='FP32',
+      help='one of FP32, FP16 and INT8')
+  parser_convert_with_tensorrt.add_argument(
+      '--minimum_segment_size',
+      type=int,
+      default=3,
+      help=('the minimum number of nodes required for a subgraph to be replaced'
+            'in a TensorRT node'))
+  parser_convert_with_tensorrt.add_argument(
+      '--is_dynamic_op',
+      type=bool,
+      default=False,
+      help=('whether to generate dynamic TRT ops which will build the TRT '
+            'network and engine at run time'))
+  parser_convert_with_tensorrt.set_defaults(func=convert_with_tensorrt)
+
   return parser
 
 
diff --git a/tensorflow/python/tools/saved_model_utils.py b/tensorflow/python/tools/saved_model_utils.py
index c27d7a2658a096d1f5ce515dbc1f86423eb113de..17c4b8cb8319363a4a2d422a563ae1227d673366 100644
--- a/tensorflow/python/tools/saved_model_utils.py
+++ b/tensorflow/python/tools/saved_model_utils.py
@@ -18,7 +18,78 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.saved_model.python.saved_model import reader
+import os
+
+from google.protobuf import message
+from google.protobuf import text_format
+from tensorflow.core.protobuf import saved_model_pb2
+from tensorflow.python.lib.io import file_io
+from tensorflow.python.saved_model import constants
+from tensorflow.python.util import compat
+
+
+def read_saved_model(saved_model_dir):
+  """Reads the savedmodel.pb or savedmodel.pbtxt file containing `SavedModel`.
+
+  Args:
+    saved_model_dir: Directory containing the SavedModel file.
+
+  Returns:
+    A `SavedModel` protocol buffer.
+
+  Raises:
+    IOError: If the file does not exist, or cannot be successfully parsed.
+  """
+  # Build the path to the SavedModel in pbtxt format.
+  path_to_pbtxt = os.path.join(
+      compat.as_bytes(saved_model_dir),
+      compat.as_bytes(constants.SAVED_MODEL_FILENAME_PBTXT))
+  # Build the path to the SavedModel in pb format.
+  path_to_pb = os.path.join(
+      compat.as_bytes(saved_model_dir),
+      compat.as_bytes(constants.SAVED_MODEL_FILENAME_PB))
+
+  # Ensure that the SavedModel exists at either path.
+  if not file_io.file_exists(path_to_pbtxt) and not file_io.file_exists(
+      path_to_pb):
+    raise IOError("SavedModel file does not exist at: %s" % saved_model_dir)
+
+  # Parse the SavedModel protocol buffer.
+  saved_model = saved_model_pb2.SavedModel()
+  if file_io.file_exists(path_to_pb):
+    try:
+      file_content = file_io.FileIO(path_to_pb, "rb").read()
+      saved_model.ParseFromString(file_content)
+      return saved_model
+    except message.DecodeError as e:
+      raise IOError("Cannot parse file %s: %s." % (path_to_pb, str(e)))
+  elif file_io.file_exists(path_to_pbtxt):
+    try:
+      file_content = file_io.FileIO(path_to_pbtxt, "rb").read()
+      text_format.Merge(file_content.decode("utf-8"), saved_model)
+      return saved_model
+    except text_format.ParseError as e:
+      raise IOError("Cannot parse file %s: %s." % (path_to_pbtxt, str(e)))
+  else:
+    raise IOError("SavedModel file does not exist at: %s/{%s|%s}" %
+                  (saved_model_dir, constants.SAVED_MODEL_FILENAME_PBTXT,
+                   constants.SAVED_MODEL_FILENAME_PB))
+
+
+def get_saved_model_tag_sets(saved_model_dir):
+  """Retrieves all the tag-sets available in the SavedModel.
+
+  Args:
+    saved_model_dir: Directory containing the SavedModel.
+
+  Returns:
+    String representation of all tag-sets in the SavedModel.
+  """
+  saved_model = read_saved_model(saved_model_dir)
+  all_tags = []
+  for meta_graph_def in saved_model.meta_graphs:
+    all_tags.append(list(meta_graph_def.meta_info_def.tags))
+  return all_tags
 
 
 def get_meta_graph_def(saved_model_dir, tag_set):
@@ -39,7 +110,7 @@ def get_meta_graph_def(saved_model_dir, tag_set):
   Returns:
     A MetaGraphDef corresponding to the tag-set.
   """
-  saved_model = reader.read_saved_model(saved_model_dir)
+  saved_model = read_saved_model(saved_model_dir)
   set_of_tags = set(tag_set.split(','))
   for meta_graph_def in saved_model.meta_graphs:
     if set(meta_graph_def.meta_info_def.tags) == set_of_tags:
diff --git a/tensorflow/python/tools/saved_model_utils_test.py b/tensorflow/python/tools/saved_model_utils_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5512dea1f74c8a27045c0036fb0d6df9681169bf
--- /dev/null
+++ b/tensorflow/python/tools/saved_model_utils_test.py
@@ -0,0 +1,116 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for SavedModel utils."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.lib.io import file_io
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.saved_model import builder as saved_model_builder
+from tensorflow.python.saved_model import tag_constants
+from tensorflow.python.tools import saved_model_utils
+
+
+def tearDownModule():
+  file_io.delete_recursively(test.get_temp_dir())
+
+
+class SavedModelUtilTest(test.TestCase):
+
+  def _init_and_validate_variable(self, sess, variable_name, variable_value):
+    v = variables.Variable(variable_value, name=variable_name)
+    sess.run(variables.global_variables_initializer())
+    self.assertEqual(variable_value, v.eval())
+
+  @test_util.deprecated_graph_mode_only
+  def testReadSavedModelValid(self):
+    saved_model_dir = os.path.join(test.get_temp_dir(), "valid_saved_model")
+    builder = saved_model_builder.SavedModelBuilder(saved_model_dir)
+    with self.session(graph=ops.Graph()) as sess:
+      self._init_and_validate_variable(sess, "v", 42)
+      builder.add_meta_graph_and_variables(sess, [tag_constants.TRAINING])
+    builder.save()
+
+    actual_saved_model_pb = saved_model_utils.read_saved_model(saved_model_dir)
+    self.assertEqual(len(actual_saved_model_pb.meta_graphs), 1)
+    self.assertEqual(
+        len(actual_saved_model_pb.meta_graphs[0].meta_info_def.tags), 1)
+    self.assertEqual(actual_saved_model_pb.meta_graphs[0].meta_info_def.tags[0],
+                     tag_constants.TRAINING)
+
+  def testReadSavedModelInvalid(self):
+    saved_model_dir = os.path.join(test.get_temp_dir(), "invalid_saved_model")
+    with self.assertRaisesRegexp(
+        IOError, "SavedModel file does not exist at: %s" % saved_model_dir):
+      saved_model_utils.read_saved_model(saved_model_dir)
+
+  @test_util.deprecated_graph_mode_only
+  def testGetSavedModelTagSets(self):
+    saved_model_dir = os.path.join(test.get_temp_dir(), "test_tags")
+    builder = saved_model_builder.SavedModelBuilder(saved_model_dir)
+
+    # Graph with a single variable. SavedModel invoked to:
+    # - add with weights.
+    # - a single tag (from predefined constants).
+    with self.session(graph=ops.Graph()) as sess:
+      self._init_and_validate_variable(sess, "v", 42)
+      builder.add_meta_graph_and_variables(sess, [tag_constants.TRAINING])
+
+    # Graph that updates the single variable. SavedModel invoked to:
+    # - simply add the model (weights are not updated).
+    # - a single tag (from predefined constants).
+    with self.session(graph=ops.Graph()) as sess:
+      self._init_and_validate_variable(sess, "v", 43)
+      builder.add_meta_graph([tag_constants.SERVING])
+
+    # Graph that updates the single variable. SavedModel is invoked:
+    # - to add the model (weights are not updated).
+    # - multiple predefined tags.
+    with self.session(graph=ops.Graph()) as sess:
+      self._init_and_validate_variable(sess, "v", 44)
+      builder.add_meta_graph([tag_constants.SERVING, tag_constants.GPU])
+
+    # Graph that updates the single variable. SavedModel is invoked:
+    # - to add the model (weights are not updated).
+    # - multiple predefined tags for serving on TPU.
+    with self.session(graph=ops.Graph()) as sess:
+      self._init_and_validate_variable(sess, "v", 44)
+      builder.add_meta_graph([tag_constants.SERVING, tag_constants.TPU])
+
+    # Graph that updates the single variable. SavedModel is invoked:
+    # - to add the model (weights are not updated).
+    # - multiple custom tags.
+    with self.session(graph=ops.Graph()) as sess:
+      self._init_and_validate_variable(sess, "v", 45)
+      builder.add_meta_graph(["foo", "bar"])
+
+    # Save the SavedModel to disk.
+    builder.save()
+
+    actual_tags = saved_model_utils.get_saved_model_tag_sets(saved_model_dir)
+    expected_tags = [["train"], ["serve"], ["serve", "gpu"], ["serve", "tpu"],
+                     ["foo", "bar"]]
+    self.assertEqual(expected_tags, actual_tags)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/tools/strip_unused_lib.py b/tensorflow/python/tools/strip_unused_lib.py
index b1d195607604b406f68b28824564afc642cc43ad..decd7e2fc83f1ffefda187ac80cf9c11edda01da 100644
--- a/tensorflow/python/tools/strip_unused_lib.py
+++ b/tensorflow/python/tools/strip_unused_lib.py
@@ -102,7 +102,7 @@ def strip_unused_from_files(input_graph, input_binary, output_graph,
 
   input_graph_def = graph_pb2.GraphDef()
   mode = "rb" if input_binary else "r"
-  with gfile.FastGFile(input_graph, mode) as f:
+  with gfile.GFile(input_graph, mode) as f:
     if input_binary:
       input_graph_def.ParseFromString(f.read())
     else:
diff --git a/tensorflow/python/tpu/BUILD b/tensorflow/python/tpu/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..a76c6205648d75146782684c171cc345e7fff394
--- /dev/null
+++ b/tensorflow/python/tpu/BUILD
@@ -0,0 +1,334 @@
+# Description: Operations defined for Cloud TPUs
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_custom_op_library",
+    "tf_gen_op_libs",
+    "tf_gen_op_wrapper_py",
+    "tf_py_test",
+)
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
+
+licenses(["notice"])  # Apache 2.0
+
+package(
+    default_visibility = [
+        "//cloud/vmm/testing/tests/tpu:__subpackages__",
+        "//knowledge/cerebra/sense/im2query:__subpackages__",
+        "//learning/brain:__subpackages__",
+        "//learning/deepmind:__subpackages__",
+        "//medical/pathology:__subpackages__",
+        "//tensorflow:__subpackages__",
+        "//vr/perception:__subpackages__",
+    ],
+)
+
+py_library(
+    name = "tpu_py",
+    srcs = ["ops/tpu_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:tpu_ops_gen",
+    ],
+)
+
+py_library(
+    name = "async_checkpoint",
+    srcs = ["async_checkpoint.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:summary",
+        "//tensorflow/python:summary_ops_v2",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/estimator:estimator_py",
+    ],
+)
+
+py_library(
+    name = "tpu_estimator",
+    srcs = [
+        "_tpu_estimator_embedding.py",
+        "error_handling.py",
+        "tpu_config.py",
+        "tpu_context.py",
+        "tpu_estimator.py",
+        "util.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":async_checkpoint",
+        ":feature_column",
+        ":functional",
+        ":tpu_embedding",
+        ":tpu_lib",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:function",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:session",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:summary",
+        "//tensorflow/python:summary_ops_v2",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/estimator:estimator_py",
+        "//tensorflow/python/estimator:util",
+        "@six_archive//:six",
+    ],
+)
+
+py_library(
+    name = "functional",
+    srcs = ["functional.py"],
+    srcs_version = "PY2AND3",
+    visibility = [
+        "//visibility:public",
+    ],
+    deps = [
+        "//tensorflow/python:tpu_ops_gen",
+    ],
+)
+
+py_library(
+    name = "tpu",
+    srcs = [
+        "__init__.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":feature_column",
+        ":tpu_embedding",
+        ":tpu_estimator",
+        ":tpu_lib",
+    ],
+)
+
+py_library(
+    name = "tpu_lib",
+    srcs = [
+        "__init__.py",
+        "bfloat16.py",
+        "device_assignment.py",
+        "session_support.py",
+        "tensor_tracer.py",
+        "topology.py",
+        "tpu.py",
+        "tpu_feed.py",
+        "tpu_function.py",
+        "tpu_optimizer.py",
+        "tpu_sharding.py",
+        "tpu_system_metadata.py",
+        "training_loop.py",
+        "xla.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":datasets",
+        ":functional",
+        ":tpu_py",
+        "//tensorflow/compiler/xla/experimental/xla_sharding",
+        "//tensorflow/compiler/xla/python_api:xla_shape",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/core/protobuf/tpu:compilation_result_proto_py",
+        "//tensorflow/core/protobuf/tpu:dynamic_padding_proto_py",
+        "//tensorflow/core/protobuf/tpu:optimization_parameters_proto_py",
+        "//tensorflow/core/protobuf/tpu:topology_proto_py",
+        "//tensorflow/core/protobuf/tpu:tpu_embedding_configuration_proto_py",
+        "//tensorflow/core/protobuf/tpu:tpu_embedding_output_layout_proto_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:control_flow_util",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:tpu_ops_gen",
+        "//tensorflow/python:training",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/ops/losses",
+        "//tensorflow/python/tpu/profiler",
+    ],
+)
+
+py_library(
+    name = "datasets",
+    srcs = [
+        "datasets.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:function",
+        "//tensorflow/python:functional_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
+        "//tensorflow/python/data/ops:readers",
+    ],
+)
+
+tf_py_test(
+    name = "datasets_test",
+    size = "medium",
+    srcs = ["datasets_test.py"],
+    additional_deps = [
+        "//tensorflow/python:client_testlib",
+        ":datasets",
+    ],
+    grpc_enabled = True,
+    shard_count = 4,
+    tags = ["no_oss"],
+)
+
+tf_py_test(
+    name = "tpu_test",
+    size = "small",
+    srcs = ["tpu_test.py"],
+    additional_deps = [
+        ":tpu",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:layers",
+    ],
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
+)
+
+tf_py_test(
+    name = "tpu_sharding_test",
+    size = "small",
+    srcs = ["tpu_sharding_test.py"],
+    additional_deps = [
+        ":tpu",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+    ],
+)
+
+tf_py_test(
+    name = "bfloat16_test",
+    size = "small",
+    srcs = ["bfloat16_test.py"],
+    additional_deps = [
+        ":tpu",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+    ],
+)
+
+tf_py_test(
+    name = "tpu_infeed_test",
+    size = "small",
+    srcs = ["tpu_infeed_test.py"],
+    additional_deps = [
+        ":tpu",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+    ],
+)
+
+tf_py_test(
+    name = "tpu_config_test",
+    size = "small",
+    srcs = ["tpu_config_test.py"],
+    additional_deps = [
+        ":tpu_estimator",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+    ],
+)
+
+tf_py_test(
+    name = "tpu_estimator_signals_test",
+    size = "small",
+    srcs = ["tpu_estimator_signals_test.py"],
+    additional_deps = [
+        ":tpu_estimator",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+    ],
+    # TODO(jhseu): Remove. Fails in OSS on Python 3.
+    tags = ["no_oss"],
+)
+
+tf_py_test(
+    name = "topology_test",
+    size = "medium",
+    srcs = ["topology_test.py"],
+    additional_deps = [
+        ":tpu",
+        "//tensorflow/python:framework_test_lib",
+    ],
+)
+
+py_library(
+    name = "tpu_embedding",
+    srcs = [
+        "tpu_embedding.py",
+        "tpu_embedding_gradient.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":tpu_lib",
+        "//tensorflow/core/protobuf/tpu:tpu_embedding_configuration_proto_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:partitioned_variables",
+        "//tensorflow/python:tpu_ops_gen",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+        "@six_archive//:six",
+    ],
+)
+
+py_library(
+    name = "feature_column",
+    srcs = ["feature_column.py"],
+    deps = [
+        ":tpu_lib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/feature_column",
+        "//tensorflow/python/feature_column:feature_column_py",
+    ],
+)
+
+tf_py_test(
+    name = "feature_column_test",
+    srcs = [
+        "feature_column_test.py",
+    ],
+    additional_deps = [
+        ":feature_column",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:lookup_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/feature_column",
+        "//tensorflow/python/feature_column:feature_column_py",
+    ],
+    main = "feature_column_test.py",
+)
diff --git a/tensorflow/python/tpu/__init__.py b/tensorflow/python/tpu/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0dffd7064b19f353aed6afa3ad383564643a4a90
--- /dev/null
+++ b/tensorflow/python/tpu/__init__.py
@@ -0,0 +1,20 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Ops related to Tensor Processing Units."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/python/tpu/_tpu_estimator_embedding.py b/tensorflow/python/tpu/_tpu_estimator_embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..08e0e968a648ebdd51de1719cb8c4784c8fdf286
--- /dev/null
+++ b/tensorflow/python/tpu/_tpu_estimator_embedding.py
@@ -0,0 +1,334 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===================================================================
+"""Tooling for support TPU embedding in TPUEstimator."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.feature_column import feature_column as core_fc
+from tensorflow.python.feature_column import feature_column_lib as core_fc_lib
+from tensorflow.python.tpu import feature_column as tpu_fc
+from tensorflow.python.tpu import tpu_embedding
+
+# pylint: disable=protected-access
+_TPU_EMBEDDING_COLUMN_CLASSES = (tpu_fc._TPUEmbeddingColumn,
+                                 tpu_fc._TPUSharedEmbeddingColumn)
+_EMBEDDING_COLUMN_CLASSES = (core_fc._EmbeddingColumn,
+                             core_fc_lib.EmbeddingColumn,
+                             core_fc._SharedEmbeddingColumn)
+_SUPPORTED_FEATURE_COLUMNS = (core_fc._NumericColumn, core_fc_lib.NumericColumn)
+
+# pylint: enable=protected-access
+
+_TABLE_NAME_PREFIX = 'tbl_'
+_LEN_TABLE_NAME_PREFIX = len(_TABLE_NAME_PREFIX)
+
+
+def _get_table_name_from_embedding_var_name(embedding_var_name):
+  return '{}{}'.format(_TABLE_NAME_PREFIX, embedding_var_name)
+
+
+def _get_embedding_var_name_from_table_name(table_name):
+  return table_name[_LEN_TABLE_NAME_PREFIX:]
+
+
+def _get_embedding_variable_name(scope_name, var_name):
+  return '{}/{}'.format(scope_name, var_name)
+
+
+def _get_slot_variable_names(scope_name, var_name, optimization_parameters):
+  """Return embedding variable names which are consistent with CPU runs."""
+  if isinstance(optimization_parameters, tpu_embedding.AdagradParameters):
+    return tpu_embedding.AdagradSlotVariableName(
+        '{}/{}/Adagrad'.format(scope_name, var_name)
+    )
+  elif isinstance(optimization_parameters, tpu_embedding.AdamParameters):
+    return tpu_embedding.AdamSlotVariableNames(
+        '{}/{}/Adam/m'.format(scope_name, var_name),
+        '{}/{}/Adam/v'.format(scope_name, var_name)
+    )
+  elif isinstance(optimization_parameters,
+                  tpu_embedding.StochasticGradientDescentParameters):
+    return None
+  else:
+    raise ValueError('Support to infer full variable name '
+                     'for optimization_parameter {} has not been added.'
+                     .format(optimization_parameters))
+
+
+def get_full_variable_names(
+    graph, table_to_config_dict, optimization_parameters):
+  """Return embedding variable names and slot variables which are consistent with CPU runs."""
+  collection = graph.get_collection_ref(tpu_fc._TPU_FC_TO_SCOPE)  # pylint: disable=protected-access
+  if not collection:
+    raise RuntimeError(
+        'Embedding feature column did not capture any thing. Make sure the '
+        'feature columns passed to TPUEstimator constructor is properly '
+        'used in model_fn.')
+
+  embedding_variable_name_by_table = {}
+  slot_variable_names_by_table = {}
+  for table_name in table_to_config_dict:
+    embedding_var_name = _get_embedding_var_name_from_table_name(table_name)
+    (scope_name, var_name) = collection[0][embedding_var_name]
+    embedding_variable_name_by_table[table_name] = (
+        _get_embedding_variable_name(scope_name, var_name))
+    slot_variable_names_by_table[table_name] = _get_slot_variable_names(
+        scope_name, var_name, optimization_parameters)
+
+  graph.clear_collection(tpu_fc._TPU_FC_TO_SCOPE)  # pylint: disable=protected-access
+  return embedding_variable_name_by_table, slot_variable_names_by_table
+
+
+def get_tpu_embedding_config_from_feature_columns(feature_columns):
+  """Create configs for TPUEmbedding from a list of feature columns.
+
+  This function will place one embedding tensor per table and the return is
+  intended to be used as input to TPUEmbedding.
+
+  Args:
+    feature_columns: a list of supported feature columns.
+
+  Returns:
+    A pair of dicts, the first maps tables to their config, the second maps
+    features to tables.
+  """
+
+  allowed = (tpu_fc._TPUEmbeddingColumn, tpu_fc._TPUSharedEmbeddingColumn)  # pylint: disable=protected-access
+
+  for column in feature_columns:
+    if not isinstance(column, allowed):
+      raise TypeError(
+          'Unsupported feature column {}. Supported types are {}.'.format(
+              type(column), allowed))
+
+  table_to_config = {}
+  feature_to_table = {}
+  for column in feature_columns:
+    feature_name = column.get_feature_key_name()
+    table_name = _get_table_name_from_embedding_var_name(
+        column.get_embedding_var_name())
+    if feature_name in feature_to_table:
+      raise ValueError(
+          'Feature column {} is used with multiple embeddings and this is '
+          'not supported.'.format(feature_name))
+    feature_to_table[feature_name] = table_name
+    vocabulary_size, dimension = column.get_embedding_table_size()
+    table_to_config[table_name] = tpu_embedding.TableConfig(
+        vocabulary_size=vocabulary_size,
+        dimension=dimension,
+        initializer=column.get_initializer(),
+        combiner=column.get_combiner())
+
+  return table_to_config, feature_to_table
+
+
+def _get_tpu_embedding_optimization_parameters(embedding_config_spec):
+  """Get tpu_embedding._OptimizationParameters from EmbeddingConfigSpec."""
+  if embedding_config_spec.optimizer_type == 'adagrad':
+    return tpu_embedding.AdagradParameters(
+        embedding_config_spec.learning_rate,
+        embedding_config_spec.adagrad_initial_accumulator,
+        embedding_config_spec.use_gradient_accumulation)
+  elif embedding_config_spec.optimizer_type == 'sgd':
+    return tpu_embedding.StochasticGradientDescentParameters(
+        embedding_config_spec.learning_rate,
+        embedding_config_spec.use_gradient_accumulation)
+  elif embedding_config_spec.optimizer_type == 'adam':
+    return tpu_embedding.AdamParameters(
+        embedding_config_spec.learning_rate,
+        embedding_config_spec.adam_parameters.beta1,
+        embedding_config_spec.adam_parameters.beta2,
+        embedding_config_spec.adam_parameters.epsilon,
+        use_gradient_accumulation=embedding_config_spec
+        .use_gradient_accumulation)
+  else:
+    raise ValueError('optimizer_type must be adagrad or sgd or adam for now.')
+
+
+AdamParameters = collections.namedtuple('AdamParameters',
+                                        ['beta1', 'beta2', 'epsilon'])
+
+
+# TODO(shizhiw): Improve the API to support more optimizer parameters in API.
+class EmbeddingConfigSpec(
+    collections.namedtuple('EmbeddingConfigSpec', [
+        'feature_columns', 'learning_rate', 'optimizer_type',
+        'adagrad_initial_accumulator', 'clipping_limit',
+        'use_gradient_accumulation', 'adam_parameters'
+    ])):
+  """Class to keep track of embedding config specification."""
+
+  def __new__(cls,
+              feature_columns,
+              learning_rate,
+              optimizer_type='adagrad',
+              adagrad_initial_accumulator=None,
+              clipping_limit=None,
+              use_gradient_accumulation=False,
+              adam_parameters=None):
+    """Creates an EmbeddingConfigSpec instance.
+
+    Args:
+      feature_columns: All `FeatureColumn`s used by model.
+      learning_rate: embedding optimizer learning rate.
+      optimizer_type: (String) Name of the optimizer for embedding gradients
+        updates. Must be either 'adagrad' ( `tf.train.AdagradOptimizer`, default
+        value), 'sgd' (`tf.train.GradientDescentOptimizer`), or 'adam'
+        (`tf.contrib.opt.LazyAdamOptimizer`) for lazy Adam. This optimizer will
+        be applied to all embedding variables specified by `feature_columns`.
+      adagrad_initial_accumulator: Initial accumulator for Adagrad. Used when
+        optimizer_type is 'adagrad'. Default is `0.1`.
+      clipping_limit: (Optional) Clipping limit (absolute value).
+      use_gradient_accumulation: (Experimental) Whether to accumulate the
+        gradients across TPU embedding mini-batches. Gradient accumulation does
+        not affect SGD and therefore this is applicable only for Adagrad.
+      adam_parameters: AdamParameters. Used when optimizer_type is 'adam'.
+        Default is 0.9 for beta1, 0.999 for beta2 and 1e-8 for epsilon.
+
+    Returns:
+      An EmbeddingConfigSpec instance.
+
+    Raises:
+      ValueError: If the feature_columns are not specified.
+      TypeError: If the feature columns are not of ths correct type (one of
+        _SUPPORTED_FEATURE_COLUMNS, _TPU_EMBEDDING_COLUMN_CLASSES OR
+        _EMBEDDING_COLUMN_CLASSES).
+      ValueError: If use_gradient_accumulation is True for SGD.
+      ValueError: If `optimizer_type` is not one of "adagrad" or "sgd" or
+        "adam".
+    """
+    if not feature_columns:
+      raise ValueError('`feature_columns` cannot be `None` or empty.')
+
+    # It is unknown at this moment, whether the TPUEstimator is running in CPU
+    # or TPU mode. So allow non-TPU embedding columns also.
+    supported_classes = tuple(
+        list(_SUPPORTED_FEATURE_COLUMNS) + list(_TPU_EMBEDDING_COLUMN_CLASSES) +
+        list(_EMBEDDING_COLUMN_CLASSES))
+
+    for column in feature_columns:
+      if not isinstance(column, supported_classes):
+        raise TypeError(
+            'All feature columns must be supported types in {}. Got {}'.format(
+                supported_classes, type(column)))
+
+    if optimizer_type == 'adagrad':
+      if adagrad_initial_accumulator is None:
+        adagrad_initial_accumulator = 0.1
+      if adagrad_initial_accumulator <= 0:
+        raise ValueError('Adagrad initial_accumulator must be positive')
+    elif optimizer_type == 'sgd':
+      if use_gradient_accumulation:
+        raise ValueError('Gradient accumulation makes sense for Adagrad only.')
+    elif optimizer_type == 'adam':
+      if adam_parameters is None:
+        adam_parameters = AdamParameters(0.9, 0.999, 1e-8)
+      if adam_parameters.beta1 < 0. or adam_parameters.beta1 >= 1.:
+        raise ValueError('beta1 must be between 0. and 1; got {}.'.format(
+            adam_parameters.beta1))
+      if adam_parameters.beta2 < 0. or adam_parameters.beta2 >= 1.:
+        raise ValueError('beta2 must be between 0. and 1; got {}.'.format(
+            adam_parameters.beta2))
+      if adam_parameters.epsilon <= 0.:
+        raise ValueError('epsilon must be positive; got {}.'.format(
+            adam_parameters.epsilon))
+    else:
+      raise ValueError('optimizer_type must be adagrad or sgd or adam for now.')
+
+    return super(EmbeddingConfigSpec, cls).__new__(
+        cls,
+        feature_columns=feature_columns,
+        learning_rate=learning_rate,
+        optimizer_type=optimizer_type,
+        adagrad_initial_accumulator=adagrad_initial_accumulator,
+        clipping_limit=clipping_limit,
+        use_gradient_accumulation=use_gradient_accumulation,
+        adam_parameters=adam_parameters)
+
+
+class EmbeddingConfig(object):
+  """This is the internal immutable object for embedding config.
+
+  `_EmbeddingConfig` is responsible to _translate_ user provided
+  `EmbeddingConfigSpec` to internal data structures, mostly constructor
+  arguments of `TPUEmbedding`.
+  """
+
+  def __init__(self, embedding_config_spec, train_batch_size, eval_batch_size,
+               num_hosts, num_cores, master):
+    self._embedding_config_spec = embedding_config_spec
+    self._train_batch_size = train_batch_size
+    self._eval_batch_size = eval_batch_size
+    self._num_hosts = num_hosts
+    self._num_cores = num_cores
+    self._master = master
+
+    self._table_to_config_dict, self._feature_to_table_dict = (
+        get_tpu_embedding_config_from_feature_columns(
+            embedding_config_spec.feature_columns))
+    self._optimization_parameters = _get_tpu_embedding_optimization_parameters(
+        self._embedding_config_spec)
+    self._mode_to_tpu_embedding_dict = {}
+    self.dummy_table_variables = None
+
+  def has_embedding_tables(self):
+    return bool(self._table_to_config_dict)
+
+  def _create_tpu_embedding(self, mode):
+    """Create tpu_embedding.TPUEmbedding based on mode."""
+    if mode == model_fn_lib.ModeKeys.TRAIN:
+      batch_size = self._train_batch_size
+    else:
+      batch_size = self._eval_batch_size
+
+    if mode == model_fn_lib.ModeKeys.TRAIN:
+      tpu_embedding_mode = tpu_embedding.TRAINING
+    elif (mode == model_fn_lib.ModeKeys.EVAL or
+          mode == model_fn_lib.ModeKeys.PREDICT):
+      tpu_embedding_mode = tpu_embedding.INFERENCE
+    else:
+      raise ValueError('Mode {} is not supported.'.format(mode))
+
+    tpu_embedding_ = tpu_embedding.TPUEmbedding(
+        self._table_to_config_dict,
+        self._feature_to_table_dict,
+        batch_size,
+        tpu_embedding_mode,
+        self._master,
+        self._optimization_parameters,
+    )
+    return tpu_embedding_
+
+  def get_tpu_embedding(self, mode):
+    if mode not in self._mode_to_tpu_embedding_dict:
+      self._mode_to_tpu_embedding_dict[mode] = (
+          self._create_tpu_embedding(mode))
+    return self._mode_to_tpu_embedding_dict[mode]
+
+
+def split_inputs(ctx, features, labels):
+  """Splits the dense and sparse tensors inside the features and labels."""
+  sparse_features = collections.OrderedDict()
+  if ctx.embedding_config:
+    tpu_embedding_ = ctx.embedding_config.tpu_embedding
+    for feature_key in tpu_embedding_.feature_to_table_dict:
+      sparse_features[feature_key] = features.pop(feature_key)
+
+  return features, labels, sparse_features
diff --git a/tensorflow/python/tpu/async_checkpoint.py b/tensorflow/python/tpu/async_checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..1b09ce173a64ba3f93ec019c8fd65dc4710f0fcf
--- /dev/null
+++ b/tensorflow/python/tpu/async_checkpoint.py
@@ -0,0 +1,212 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ======================================
+"""Hook for asynchronous checkpointing.
+
+This hook dispatches checkpoint writing operations in a separate thread to
+allow execution to continue on the main thread.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import threading
+import time
+
+from tensorflow.core.util.event_pb2 import SessionLog
+from tensorflow.python.framework import meta_graph
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import basic_session_run_hooks
+from tensorflow.python.training import training_util
+from tensorflow.python.training.session_run_hook import SessionRunArgs
+from tensorflow.python.training.summary_io import SummaryWriterCache
+
+
+class AsyncCheckpointSaverHook(basic_session_run_hooks.CheckpointSaverHook):
+  """Saves checkpoints every N steps or seconds."""
+
+  def __init__(self,
+               checkpoint_dir,
+               save_secs=None,
+               save_steps=None,
+               saver=None,
+               checkpoint_basename="model.ckpt",
+               scaffold=None,
+               listeners=None):
+    """Initializes a `CheckpointSaverHook`.
+
+    Args:
+      checkpoint_dir: `str`, base directory for the checkpoint files.
+      save_secs: `int`, save every N secs.
+      save_steps: `int`, save every N steps.
+      saver: `Saver` object, used for saving.
+      checkpoint_basename: `str`, base name for the checkpoint files.
+      scaffold: `Scaffold`, use to get saver object.
+      listeners: List of `CheckpointSaverListener` subclass instances. Used for
+        callbacks that run immediately before or after this hook saves the
+        checkpoint.
+
+    Raises:
+      ValueError: One of `save_steps` or `save_secs` should be set.
+      ValueError: At most one of `saver` or `scaffold` should be set.
+    """
+    logging.info("Create AsyncCheckpointSaverHook.")
+    if saver is not None and scaffold is not None:
+      raise ValueError("You cannot provide both saver and scaffold.")
+    self._saver = saver
+    self._save_thread = None
+    self._write_graph_thread = None
+    self._checkpoint_dir = checkpoint_dir
+    self._save_path = os.path.join(checkpoint_dir, checkpoint_basename)
+    self._scaffold = scaffold
+    self._timer = basic_session_run_hooks.SecondOrStepTimer(
+        every_secs=save_secs, every_steps=save_steps)
+    self._listeners = listeners or []
+    self._steps_per_run = 1
+    self._summary_writer = None
+    self._global_step_tensor = None
+
+    self._last_checkpoint_step = None
+
+  def _set_steps_per_run(self, steps_per_run):
+    self._steps_per_run = steps_per_run
+
+  def begin(self):
+    self._summary_writer = SummaryWriterCache.get(self._checkpoint_dir)
+    self._global_step_tensor = training_util._get_or_create_global_step_read()  # pylint: disable=protected-access
+    if self._global_step_tensor is None:
+      raise RuntimeError(
+          "Global step should be created to use CheckpointSaverHook.")
+    for l in self._listeners:
+      l.begin()
+
+  def after_create_session(self, session, coord):
+    global_step = session.run(self._global_step_tensor)
+
+    # We do write graph and saver_def at the first call of before_run.
+    # We cannot do this in begin, since we let other hooks to change graph and
+    # add variables in begin. Graph is finalized after all begin calls.
+    def _write_graph_fn(self):
+      training_util.write_graph(
+          ops.get_default_graph().as_graph_def(add_shapes=True),
+          self._checkpoint_dir, "graph.pbtxt")
+    self._write_graph_thread = threading.Thread(target=_write_graph_fn,
+                                                args=[self])
+    self._write_graph_thread.start()
+
+    saver_def = self._get_saver().saver_def if self._get_saver() else None
+    graph = ops.get_default_graph()
+    meta_graph_def = meta_graph.create_meta_graph_def(
+        graph_def=graph.as_graph_def(add_shapes=True), saver_def=saver_def)
+    self._summary_writer.add_graph(graph)
+    self._summary_writer.add_meta_graph(meta_graph_def)
+    # The checkpoint saved here is the state at step "global_step".
+    self._save(session, global_step)
+    self._timer.update_last_triggered_step(global_step)
+
+  def before_run(self, run_context):  # pylint: disable=unused-argument
+    return SessionRunArgs(self._global_step_tensor)
+
+  def after_run(self, run_context, run_values):
+    global_step = run_context.session.run(self._global_step_tensor)
+    if self._timer.should_trigger_for_step(global_step):
+      self._timer.update_last_triggered_step(global_step)
+      logging.info("Triggering checkpoint. %s", global_step)
+      if self._save(run_context.session, global_step):
+        run_context.request_stop()
+
+  def end(self, session):
+    if self._save_thread:
+      logging.info("Waiting for any pending checkpoints to finish.")
+      self._save_thread.join()
+    if self._write_graph_thread:
+      logging.info("Waiting for any pending write_graph to finish.")
+      self._write_graph_thread.join()
+
+    last_step = session.run(self._global_step_tensor)
+
+    if self._last_checkpoint_step != last_step:
+      self._save(session, last_step, asynchronous=False)
+
+    for l in self._listeners:
+      l.end(session, last_step)
+
+  def _save(self, session, step, asynchronous=True):
+    """Saves the latest checkpoint, returns should_stop."""
+
+    # Skip saving on step 0
+    if step == 0:
+      return
+
+    def _save_fn():
+      """Run the saver process."""
+      logging.info("Saving checkpoints for %d into %s.", step, self._save_path)
+
+      start_time = time.time()
+      for l in self._listeners:
+        l.before_save(session, step)
+
+      self._get_saver().save(session, self._save_path, global_step=step)
+      self._summary_writer.add_session_log(
+          SessionLog(
+              status=SessionLog.CHECKPOINT, checkpoint_path=self._save_path),
+          step)
+
+      for l in self._listeners:
+        l.after_save(session, step)
+
+      end_time = time.time()
+      logging.info("Checkpoint actual writing time: (%.3f sec)",
+                   end_time - start_time)
+      logging.info("Checkpoint finished for %d into %s.", step, self._save_path)
+
+    if not asynchronous:
+      self._last_checkpoint_step = step
+      _save_fn()
+      return
+
+    if self._save_thread is not None:
+      self._save_thread.join(timeout=0.1)
+      if self._save_thread.is_alive():
+        logging.info("Saver thread still in progress, skipping checkpoint.")
+        return
+
+    self._last_checkpoint_step = step
+    self._save_thread = threading.Thread(target=_save_fn)
+    self._save_thread.start()
+
+  def _get_saver(self):
+    if self._saver is not None:
+      return self._saver
+    elif self._scaffold is not None:
+      return self._scaffold.saver
+
+    # Get saver from the SAVERS collection if present.
+    collection_key = ops.GraphKeys.SAVERS
+    savers = ops.get_collection(collection_key)
+    if not savers:
+      raise RuntimeError(
+          "No items in collection {}. Please add a saver to the collection "
+          "or provide a saver or scaffold.".format(collection_key))
+    elif len(savers) > 1:
+      raise RuntimeError(
+          "More than one item in collection {}. "
+          "Please indicate which one to use by passing it to the constructor."
+          .format(collection_key))
+
+    self._saver = savers[0]
+    return savers[0]
diff --git a/tensorflow/python/tpu/bfloat16.py b/tensorflow/python/tpu/bfloat16.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa74f651aa63c72d14eb78c8af479263810e9b7d
--- /dev/null
+++ b/tensorflow/python/tpu/bfloat16.py
@@ -0,0 +1,77 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Helper context for running models with bfloat16."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.util import tf_contextlib
+
+
+def _get_custom_getter():
+  """Returns a custom getter that this class's methods must be called under.
+
+  All methods of this class must be called under a variable scope that was
+  passed this custom getter. Example:
+
+  ```python
+  network = ConvNetBuilder(...)
+  with tf.variable_scope('cg', custom_getter=network.get_custom_getter()):
+    network.conv(...)
+    # Call more methods of network here
+  ```
+
+  Currently, this custom getter only does anything if self.use_tf_layers is
+  True. In that case, it causes variables to be stored as dtype
+  self.variable_type, then casted to the requested dtype, instead of directly
+  storing the variable as the requested dtype.
+  """
+
+  def inner_custom_getter(getter, *args, **kwargs):
+    """Custom getter that forces variables to have type self.variable_type."""
+    cast_to_bfloat16 = False
+    requested_dtype = kwargs['dtype']
+    if requested_dtype == dtypes.bfloat16:
+      # Only change the variable dtype if doing so does not decrease variable
+      # precision.
+      kwargs['dtype'] = dtypes.float32
+      cast_to_bfloat16 = True
+    var = getter(*args, **kwargs)
+    # This if statement is needed to guard the cast, because batch norm
+    # assigns directly to the return value of this custom getter. The cast
+    # makes the return value not a variable so it cannot be assigned. Batch
+    # norm variables are always in fp32 so this if statement is never
+    # triggered for them.
+    if cast_to_bfloat16:
+      var = math_ops.cast(var, dtypes.bfloat16)
+    return var
+
+  return inner_custom_getter
+
+
+@tf_contextlib.contextmanager
+def bfloat16_scope():
+  """Scope class for bfloat16 variables so that the model uses custom getter.
+
+  This enables variables to be read as bfloat16 type when using get_variable.
+  """
+  with variable_scope.variable_scope(
+      '', custom_getter=_get_custom_getter()) as varscope:
+    yield varscope
diff --git a/tensorflow/contrib/tpu/python/tpu/bfloat16_test.py b/tensorflow/python/tpu/bfloat16_test.py
similarity index 96%
rename from tensorflow/contrib/tpu/python/tpu/bfloat16_test.py
rename to tensorflow/python/tpu/bfloat16_test.py
index 26fd3768278cacd076e5fee8bdad75d0486678d0..3308e01700770c6ab4b0616a83b1207e49c82777 100644
--- a/tensorflow/contrib/tpu/python/tpu/bfloat16_test.py
+++ b/tensorflow/python/tpu/bfloat16_test.py
@@ -19,11 +19,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.tpu.python.tpu import bfloat16
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import variable_scope
-
 from tensorflow.python.platform import test
+from tensorflow.python.tpu import bfloat16
 
 
 class BFloat16ScopeTest(test.TestCase):
diff --git a/tensorflow/python/tpu/datasets.py b/tensorflow/python/tpu/datasets.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc0cd41d210ac6f8de1b20ebf744ee1e1dd04137
--- /dev/null
+++ b/tensorflow/python/tpu/datasets.py
@@ -0,0 +1,191 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ======================================
+"""Library of Cloud TPU helper functions for data loading."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.experimental.ops import interleave_ops
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.data.ops import readers
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import functional_ops
+
+
+def _TextLineDataset(filename):
+  buffer_size = 8 * 1024 * 1024  # 8 MiB per file
+  dataset = readers.TextLineDataset(filename, buffer_size=buffer_size)
+  return dataset
+
+
+def _TFRecordDataset(filename):
+  buffer_size = 8 * 1024 * 1024  # 8 MiB per file
+  dataset = readers.TFRecordDataset(filename, buffer_size=buffer_size)
+  return dataset
+
+
+_FILETYPE_MAP = {
+    'tfrecord': _TFRecordDataset,
+    'textline': _TextLineDataset,
+    'text': _TextLineDataset,
+}
+
+
+def StreamingFilesDataset(files,
+                          filetype=None,
+                          file_reader_job=None,
+                          worker_job=None,
+                          num_epochs=None,
+                          filename_shuffle_buffer_size=None,
+                          num_parallel_reads=None,
+                          batch_transfer_size=None,
+                          sloppy=None):
+  """StreamingFilesDataset constructs a dataset to stream from workers (GCE VM).
+
+  Because Cloud TPUs are allocated over the network, a Cloud TPU cannot read
+  files local to your GCE VM. In order to train using files stored on your local
+  VM (e.g. on local SSD for extreme performance), use the StreamingFilesDataset
+  helper to generate a dataset to feed your Cloud TPU with files from your GCE
+  VM.
+
+  The resulting dataset may return an OutOfRangeError if there are no files
+  found as a result of the fileglob expansion.
+
+  Note: StreamingFilesDataset assumes that the session is using a
+  TPUClusterResolver and has therefore a worker and a coordinator job. File
+  loading will be done on the coordinator job.
+
+  Args:
+    files: A string glob to match files, or a `tf.data.Dataset` generating file
+      names.
+    filetype: A string (one of 'tfrecord', or 'textline') or a single-argument
+      TensorFlow function that when given a filename returns a dataset.
+    file_reader_job: An optional string that corresponds to the job that should
+      perform the file reads.
+    worker_job: An optional string that corresponds to the job that should
+      process the tensors (i.e. your GPU or TPU worker).
+    num_epochs: The number of epochs through the training set that should be
+      generated. By default, it will repeat infinitely.
+    filename_shuffle_buffer_size: An optional integer whose value controls the
+      shuffling of the file names. If you would like to read from the files in
+      the same order, set to 0 or False.
+    num_parallel_reads: An optional integer controlling the number of files to
+      read from concurrently. (Set to 1 for no parallelism.)
+    batch_transfer_size: An optional integer controlling the batching used to
+      amortize the remote function invocation overhead. Set to a very large
+      number to increase throughput. Set to a very small number to reduce memory
+      consumption. Set to False to skip batching.
+    sloppy: (Optional.) If `False`, read input data while maintaining a
+      deterministic order. (This may have significant performance impacts.)
+      sloppy defaults to: True.
+  Returns:
+    A `tf.data.Dataset` with an infinite stream of elements generated by a
+    parallel interleaving of the set of files matched (or generated) by `files`
+    with a type is the output of the dataset specified by `filetype`.
+
+  Raises:
+    ValueError: if any argument is not of the expected type.
+  """
+  if filetype is None:
+    filetype = 'tfrecord'
+
+  if isinstance(filetype, str):
+    if filetype not in _FILETYPE_MAP:
+      raise ValueError('Unexpected filetype: %s' % filetype)
+    reader_fn = _FILETYPE_MAP[filetype]
+  elif callable(filetype):
+    reader_fn = filetype
+  else:
+    raise ValueError('filetype should be a string or a callable')
+
+  file_reader_job = file_reader_job or 'coordinator'
+
+  worker_job = worker_job or 'worker'
+
+  if filename_shuffle_buffer_size is None:
+    filename_shuffle_buffer_size = 4096
+
+  num_parallel_reads = num_parallel_reads or 8
+
+  if batch_transfer_size is None:
+    batch_transfer_size = 256
+
+  if sloppy is None:
+    sloppy = True
+
+  with ops.device('/job:%s' % file_reader_job):
+    if isinstance(files, str):
+      source_dataset = dataset_ops.Dataset.list_files(files)
+    elif isinstance(files, dataset_ops.DatasetV2):
+      source_dataset = files
+    else:
+      raise ValueError('files was not a string or a dataset: %s' % files)
+
+    if filename_shuffle_buffer_size:
+      source_dataset = source_dataset.shuffle(
+          buffer_size=filename_shuffle_buffer_size)
+
+    source_dataset = source_dataset.apply(
+        interleave_ops.parallel_interleave(
+            reader_fn, cycle_length=num_parallel_reads, sloppy=sloppy))
+
+    source_dataset = source_dataset.repeat(num_epochs)
+
+    if batch_transfer_size:
+      source_dataset = source_dataset.batch(batch_transfer_size)
+
+    source_dataset = source_dataset.prefetch(1)
+
+    source_iterator = dataset_ops.make_one_shot_iterator(source_dataset)
+    source_handle = source_iterator.string_handle()
+
+  @function.Defun(dtypes.string)
+  def LoadingFunc(h):
+    remote_iterator = iterator_ops.Iterator.from_string_handle(
+        h, source_dataset.output_types, source_dataset.output_shapes)
+    return remote_iterator.get_next()
+
+  def MapFn(unused_input):
+    if isinstance(source_dataset.output_types, dtypes.DType):
+      output_types = [source_dataset.output_types]
+    elif isinstance(source_dataset.output_types, (list, tuple)):
+      output_types = source_dataset.output_types
+    else:
+      raise ValueError('source dataset has invalid output types')
+    remote_calls = functional_ops.remote_call(
+        args=[source_handle],
+        Tout=output_types,
+        f=LoadingFunc,
+        target='/job:%s/replica:0/task:0/cpu:0' % file_reader_job)
+    if len(remote_calls) == 1:
+      return remote_calls[0]
+    else:
+      return remote_calls
+
+  with ops.device('/job:%s' % worker_job):
+    output_dataset = dataset_ops.Dataset.range(2).repeat().map(
+        MapFn, num_parallel_calls=4 if sloppy else None)
+    output_dataset = output_dataset.prefetch(1)
+
+    if batch_transfer_size:
+      # Undo the batching used during the transfer.
+      output_dataset = output_dataset.apply(batching.unbatch()).prefetch(1)
+
+  return output_dataset
diff --git a/tensorflow/contrib/tpu/python/tpu/datasets_test.py b/tensorflow/python/tpu/datasets_test.py
similarity index 91%
rename from tensorflow/contrib/tpu/python/tpu/datasets_test.py
rename to tensorflow/python/tpu/datasets_test.py
index 52d87b800401c3e584da9843916cfc7a767c082a..416dd9496cc18af8354d4d961b54a50f3db99a24 100644
--- a/tensorflow/contrib/tpu/python/tpu/datasets_test.py
+++ b/tensorflow/python/tpu/datasets_test.py
@@ -20,16 +20,17 @@ from __future__ import print_function
 
 import os
 
-from tensorflow.contrib.tpu.python.tpu import datasets
 from tensorflow.core.protobuf import cluster_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import readers
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.lib.io import python_io
 from tensorflow.python.platform import test
+from tensorflow.python.tpu import datasets
 from tensorflow.python.training import server_lib
 from tensorflow.python.util import compat
 
@@ -55,6 +56,7 @@ class DatasetsTest(test.TestCase):
     session_config = config_pb2.ConfigProto(cluster_def=self._cluster_def)
 
     self._sess = session.Session(self._worker.target, config=session_config)
+    self._worker_device = '/job:' + worker_job.name
 
   def testTextLineDataset(self):
     all_contents = []
@@ -70,7 +72,8 @@ class DatasetsTest(test.TestCase):
     dataset = datasets.StreamingFilesDataset(
         os.path.join(self.get_temp_dir(), 'text_line.*.txt'), filetype='text')
 
-    iterator = dataset_ops.make_initializable_iterator(dataset)
+    with ops.device(self._worker_device):
+      iterator = dataset_ops.make_initializable_iterator(dataset)
     self._sess.run(iterator.initializer)
     get_next = iterator.get_next()
 
@@ -94,7 +97,8 @@ class DatasetsTest(test.TestCase):
     dataset = datasets.StreamingFilesDataset(
         os.path.join(self.get_temp_dir(), 'tf_record*'), filetype='tfrecord')
 
-    iterator = dataset_ops.make_initializable_iterator(dataset)
+    with ops.device(self._worker_device):
+      iterator = dataset_ops.make_initializable_iterator(dataset)
     self._sess.run(iterator.initializer)
     get_next = iterator.get_next()
 
@@ -121,7 +125,8 @@ class DatasetsTest(test.TestCase):
 
     dataset = datasets.StreamingFilesDataset(filenames, filetype='tfrecord')
 
-    iterator = dataset_ops.make_initializable_iterator(dataset)
+    with ops.device(self._worker_device):
+      iterator = dataset_ops.make_initializable_iterator(dataset)
     self._sess.run(iterator.initializer)
     get_next = iterator.get_next()
 
@@ -154,7 +159,8 @@ class DatasetsTest(test.TestCase):
         os.path.join(self.get_temp_dir(), 'fixed_length*'),
         filetype=FixedLengthFile)
 
-    iterator = dataset_ops.make_initializable_iterator(dataset)
+    with ops.device(self._worker_device):
+      iterator = dataset_ops.make_initializable_iterator(dataset)
     self._sess.run(iterator.initializer)
     get_next = iterator.get_next()
 
@@ -177,7 +183,8 @@ class DatasetsTest(test.TestCase):
     dataset = datasets.StreamingFilesDataset(
         dataset_ops.Dataset.range(10), filetype=gen_dataset)
 
-    iterator = dataset_ops.make_initializable_iterator(dataset)
+    with ops.device(self._worker_device):
+      iterator = dataset_ops.make_initializable_iterator(dataset)
     self._sess.run(iterator.initializer)
     get_next = iterator.get_next()
 
diff --git a/tensorflow/python/tpu/device_assignment.py b/tensorflow/python/tpu/device_assignment.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd8a7a348cf18fdeb91b769570029fdef68c1bf3
--- /dev/null
+++ b/tensorflow/python/tpu/device_assignment.py
@@ -0,0 +1,313 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ======================================
+"""Library of TPU helper functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.python.tpu.topology import Topology
+
+
+SINGLE_CORE_ASSIGNMENT = [[[0, 0, 0]]]
+
+
+def _compute_task_and_cores_to_replicas(core_assignment, topology):
+  """Computes a nested dict which maps task and logical core to replicas."""
+  task_and_cores_to_replicas = {}
+  for replica in xrange(core_assignment.shape[0]):
+    for logical_core in xrange(core_assignment.shape[1]):
+      coordinates = core_assignment[replica, logical_core, :]
+      task_id = topology.task_ordinal_at_coordinates(coordinates)
+      if task_id not in task_and_cores_to_replicas:
+        task_and_cores_to_replicas[task_id] = {}
+      if logical_core not in task_and_cores_to_replicas[task_id]:
+        task_and_cores_to_replicas[task_id][logical_core] = set()
+
+      task_and_cores_to_replicas[task_id][logical_core].add(replica)
+
+  task_to_sorted_replica_id = {}
+
+  for task, core_to_replicas in task_and_cores_to_replicas.items():
+    core_to_sorted_replicas = {}
+    for core, replicas in core_to_replicas.items():
+      core_to_sorted_replicas[core] = sorted(replicas)
+
+    task_to_sorted_replica_id[task] = core_to_sorted_replicas
+  return task_to_sorted_replica_id
+
+
+class DeviceAssignment(object):
+  """Mapping from logical cores in a computation to the physical TPU topology.
+
+  Prefer to use the `device_assignment()` helper to construct a
+  `DeviceAssignment`; it is easier if less flexible than constructing a
+  `DeviceAssignment` directly.
+  """
+
+  def __init__(self, topology, core_assignment):
+    """Constructs a `DeviceAssignment` object.
+
+    Args:
+      topology: A `Topology` object that describes the physical TPU topology.
+      core_assignment: A logical to physical core mapping, represented as a
+        rank 3 numpy array. See the description of the `core_assignment`
+        property for more details.
+
+    Raises:
+      ValueError: If `topology` is not `Topology` object.
+      ValueError: If `core_assignment` is not a rank 3 numpy array.
+    """
+    if not isinstance(topology, Topology):
+      raise ValueError("topology must be a Topology object, got {}".format(
+          type(topology)))
+    core_assignment = np.asarray(core_assignment, dtype=np.int32)
+
+    self._topology = topology
+
+    if core_assignment.ndim != 3:
+      raise ValueError("core_assignment must be a rank 3 numpy array, "
+                       "got shape {}".format(core_assignment.shape))
+
+    self._num_replicas = core_assignment.shape[0]
+    self._num_cores_per_replica = core_assignment.shape[1]
+
+    if core_assignment.shape[-1] != topology.mesh_rank:
+      raise ValueError(
+          "minor dimension of core_assignment must have size equal to topology "
+          "rank ({}), got shape {}".format(topology.mesh_rank,
+                                           core_assignment.shape))
+
+    self._core_assignment = core_assignment
+    self._task_and_cores_to_replicas = _compute_task_and_cores_to_replicas(
+        self._core_assignment, topology)
+
+  @property
+  def topology(self):
+    """A `Topology` that describes the TPU topology."""
+    return self._topology
+
+  @property
+  def num_cores_per_replica(self):
+    """The number of cores per replica."""
+    return self._num_cores_per_replica
+
+  @property
+  def num_replicas(self):
+    """The number of replicas of the computation."""
+    return self._num_replicas
+
+  @property
+  def core_assignment(self):
+    """The logical to physical core mapping.
+
+    Returns:
+      An integer numpy array of rank 3, with shape
+      `[num_replicas, num_cores_per_replica, topology_rank]`. Maps
+      (replica, logical core) pairs to physical topology coordinates.
+    """
+    return self._core_assignment
+
+  def coordinates(self, replica, logical_core):
+    """Returns the physical topology coordinates of a logical core."""
+    return tuple(self.core_assignment[replica, logical_core, :])
+
+  def lookup_replicas(self, task_id, logical_core):
+    """Lookup replica ids by task number and logical core.
+
+    Args:
+      task_id: TensorFlow task number.
+      logical_core: An integer, identifying a logical core.
+    Returns:
+      A sorted list of the replicas that are attached to that task and
+      logical_core.
+    Raises:
+      ValueError: If no replica exists in the task which contains the logical
+      core.
+    """
+    try:
+      return self._task_and_cores_to_replicas[task_id][logical_core]
+    except KeyError:
+      raise ValueError(
+          "Can not find any replica in task: {} contains logical_core: {} ".
+          format(task_id, logical_core))
+
+  def tpu_ordinal(self, replica=0, logical_core=0):
+    """Returns the ordinal of the TPU device assigned to a logical core."""
+    coordinates = self.coordinates(replica, logical_core)
+    return self._topology.tpu_device_ordinal_at_coordinates(coordinates)
+
+  def host_device(self, replica=0, logical_core=0, job=None):
+    """Returns the CPU device attached to a logical core."""
+    coordinates = self.coordinates(replica, logical_core)
+    return self._topology.cpu_device_name_at_coordinates(coordinates, job=job)
+
+  def tpu_device(self, replica=0, logical_core=0, job=None):
+    """Returns the name of the TPU device assigned to a logical core."""
+    coordinates = self.coordinates(replica, logical_core)
+    return self._topology.tpu_device_name_at_coordinates(coordinates, job=job)
+
+
+def device_assignment(topology,
+                      computation_shape=None,
+                      computation_stride=None,
+                      num_replicas=1):
+  """Computes a device_assignment of a computation across a TPU topology.
+
+  Attempts to choose a compact grid of cores for locality.
+
+  Returns a `DeviceAssignment` that describes the cores in the topology assigned
+  to each core of each replica.
+
+  `computation_shape` and `computation_stride` values should be powers of 2 for
+  optimal packing.
+
+  Args:
+    topology: A `Topology` object that describes the TPU cluster topology.
+      To obtain a TPU topology, evaluate the `Tensor` returned by
+      `initialize_system` using `Session.run`. Either a serialized
+      `TopologyProto` or a `Topology` object may be passed. Note: you must
+      evaluate the `Tensor` first; you cannot pass an unevaluated `Tensor` here.
+    computation_shape: A rank 1 int32 numpy array with size equal to the
+      topology rank, describing the shape of the computation's block of cores.
+      If None, the `computation_shape` is `[1] * topology_rank`.
+    computation_stride: A rank 1 int32 numpy array of size `topology_rank`,
+      describing the inter-core spacing of the `computation_shape` cores in the
+      TPU topology. If None, the `computation_stride` is `[1] * topology_rank`.
+    num_replicas: The number of computation replicas to run. The replicas will
+      be packed into the free spaces of the topology.
+
+  Returns:
+    A DeviceAssignment object, which describes the mapping between the logical
+    cores in each computation replica and the physical cores in the TPU
+    topology.
+
+  Raises:
+    ValueError: If `topology` is not a valid `Topology` object.
+    ValueError: If `computation_shape` or `computation_stride` are not 1D int32
+      numpy arrays with shape [3] where all values are positive.
+    ValueError: If computation's replicas cannot fit into the TPU topology.
+  """
+  # Deserialize the Topology proto, if it is a string.
+  if isinstance(topology, bytes):
+    topology = Topology(serialized=topology)
+
+  if not isinstance(topology, Topology):
+    raise ValueError("`topology` is not a Topology object; got {}".format(
+        type(topology)))
+
+  topology_rank = len(topology.mesh_shape)
+  mesh_shape = topology.mesh_shape
+  if computation_shape is None:
+    computation_shape = np.array([1] * topology_rank, dtype=np.int32)
+  else:
+    computation_shape = np.asarray(computation_shape, dtype=np.int32)
+
+  if computation_stride is None:
+    computation_stride = np.array([1] * topology_rank, dtype=np.int32)
+  else:
+    computation_stride = np.asarray(computation_stride, dtype=np.int32)
+
+  if computation_shape.shape != (topology_rank,):
+    raise ValueError("computation_shape must have shape [{}]; got {}".format(
+        topology_rank, computation_shape.shape))
+  if computation_stride.shape != (topology_rank,):
+    raise ValueError("computation_stride must have shape [{}]; got {}".format(
+        topology_rank, computation_stride.shape))
+
+  if any(computation_shape < 1):
+    raise ValueError(
+        "computation_shape must be positive; got computation_shape={}".format(
+            computation_shape))
+  if any(computation_stride < 1):
+    raise ValueError(
+        "computation_stride must be positive; got computation_stride={}".format(
+            computation_stride))
+
+  # Computes the physical size of one computation instance.
+  computation_footprint = computation_shape * computation_stride
+  if any(computation_footprint > mesh_shape):
+    raise ValueError(
+        "computation footprint {} does not fit in TPU topology shape {}".format(
+            computation_footprint, mesh_shape))
+
+  # Computes how many copies of the computation footprint fit in the mesh.
+  block_counts = mesh_shape // computation_footprint
+
+  replica_counts = block_counts * computation_stride
+  max_replicas = np.prod(replica_counts)
+  if num_replicas > max_replicas:
+    raise ValueError(
+        "requested {} replicas but only {} replicas with shape {} and "
+        "computation_stride {} fit in a TPU mesh of shape {}".format(
+            num_replicas, max_replicas, computation_shape, computation_stride,
+            mesh_shape))
+
+  def ceil_of_ratio(n, m):
+    return (n + m - 1) // m
+
+  replica_shape = [0] * topology_rank
+  if num_replicas > 0:
+    remaining_replicas = num_replicas
+    remaining_dims = topology_rank
+
+    # Choose dimensions as close to an equal cube as possible, in order of
+    # increasing dimension size. By visiting dimensions in increasing size, we
+    # assign the most constrained dimension first, so we won't make infeasible
+    # choices.
+    #
+    # As a secondary sort order, visit the dimensions in reverse order. This
+    # means we try to use both cores on the same chip in preference to two cores
+    # on different chips.
+    for x, ni in sorted(((x, -i) for (i, x) in enumerate(replica_counts))):
+      i = -ni
+      target_size = int(math.ceil(remaining_replicas**(1.0 / remaining_dims)))
+      replica_shape[i] = min(target_size, x)
+      remaining_replicas = ceil_of_ratio(remaining_replicas, replica_shape[i])
+      remaining_dims -= 1
+
+    assert remaining_replicas == 1 and remaining_dims == 0
+
+  # Assigns an offset to each replica such that no two replicas overlap.
+  replica_offsets = np.full([num_replicas, topology_rank], -1, dtype=np.int32)
+  for replica in xrange(num_replicas):
+    # Chooses a replica number in each axis.
+    t = replica
+    pos = []
+    for dim in replica_shape[::-1]:
+      pos.append(t % dim)
+      t //= dim
+    replica_pos = np.array(pos[::-1], dtype=np.int32)
+
+    # Determines where that replica starts in each axis.
+    outer = replica_pos // computation_stride
+    inner = replica_pos % computation_stride
+    replica_offsets[replica, :] = outer * computation_footprint + inner
+
+  # Computes a complete logical core -> physical core mapping for each replica.
+  indices = [
+      np.arange(0, computation_shape[i] * computation_stride[i],
+                computation_stride[i]) for i in xrange(topology_rank)
+  ]
+  indices = np.concatenate(
+      [i[..., np.newaxis] for i in np.meshgrid(*indices, indexing="ij")],
+      axis=-1)
+  indices = indices.reshape((-1, topology_rank))
+  assignment = indices + replica_offsets[:, np.newaxis, :]
+  return DeviceAssignment(topology, core_assignment=assignment)
diff --git a/tensorflow/python/tpu/error_handling.py b/tensorflow/python/tpu/error_handling.py
new file mode 100644
index 0000000000000000000000000000000000000000..52e1ea42370d653d1de7c12eee4b456ec7ce921c
--- /dev/null
+++ b/tensorflow/python/tpu/error_handling.py
@@ -0,0 +1,132 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===================================================================
+"""ErrorRendezvous handler for collecting errors from multiple threads."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import contextlib
+import sys
+import threading
+import time
+
+import six
+
+from tensorflow.python.framework import errors
+from tensorflow.python.platform import tf_logging as logging
+
+_UNINTERESTING_ERRORS = (errors.CancelledError,)
+
+
+class ErrorRendezvous(object):
+  """Resolve errors from multiple threads during TPU execution.
+
+  TPU errors can occur on the infeed or outfeed threads as well as the main
+  training thread.
+
+  Depending on which thread "wins" and receives the session error first, we may
+  end up showing users a confusing and non-actionable error message (session
+  cancelled) instead of a root cause (e.g. a bad filename).
+
+  The rendezvous object provides a location to capture these errors until all
+  threads terminate.  At that point we can choose the most informative error
+  to report.
+  """
+
+  def __init__(self, num_sources):
+    # string -> (message, traceback)
+    self._errors = {}
+    self._num_sources = num_sources
+    self._session_cancel_timer = None
+
+  def record_error(self, source, exc_info, session=None):
+    """Report an exception from the given source.
+
+    If a session is passed, a timer will be registered to close it after a few
+    seconds.  This is necessary to ensure the main training loop does not hang
+    if an infeed/oufeed error occurs.  We sleep a few seconds to allow a more
+    interesting error from another thread to propagate.
+
+    Args:
+      source: string, source of the error
+      exc_info: Output from `sys.exc_info` (type, value, traceback)
+      session: Session to close after delay.
+    """
+    _, value, _ = exc_info
+    self._errors[source] = exc_info
+    logging.info('Error recorded from %s: %s', source, value)
+
+    if session is not None and self._session_cancel_timer is None:
+
+      def _cancel_session():
+        time.sleep(5)
+        try:
+          session.close()
+        except:  # pylint: disable=bare-except
+          pass
+
+      self._session_cancel_timer = threading.Thread(target=_cancel_session,)
+      self._session_cancel_timer.daemon = True
+      self._session_cancel_timer.start()
+
+  def record_done(self, source):
+    """Mark execution source `source` as done.
+
+    If an error was originally reported from `source` it is left intact.
+
+    Args:
+      source: `str`, source being recorded
+    """
+    logging.info('%s marked as finished', source)
+    if source not in self._errors:
+      self._errors[source] = None
+
+  @contextlib.contextmanager
+  def catch_errors(self, source, session=None):
+    """Context manager to report any errors within a block."""
+    try:
+      yield
+    except Exception:  # pylint: disable=broad-except
+      self.record_error(source, sys.exc_info(), session)
+
+  def raise_errors(self, timeout_sec=0):
+    """Wait for up to `timeout` seconds for all error sources to finish.
+
+    Preferentially raise "interesting" errors (errors not in the
+    _UNINTERESTING_ERRORS) set.
+
+    Args:
+      timeout_sec: Seconds to wait for other error sources.
+    """
+    for _ in range(timeout_sec):
+      if len(self._errors) == self._num_sources:
+        break
+      time.sleep(1)
+
+    kept_errors = [(k, v) for (k, v) in self._errors.items() if v is not None]
+
+    # First check for any interesting errors, then fall back on the session
+    # cancelled errors etc.
+    for k, (typ, value, traceback) in kept_errors:
+      if isinstance(value, _UNINTERESTING_ERRORS):
+        continue
+      else:
+        logging.warn('Reraising captured error')
+        six.reraise(typ, value, traceback)
+
+    for k, (typ, value, traceback) in kept_errors:
+      logging.warn('Reraising captured error')
+      six.reraise(typ, value, traceback)
diff --git a/tensorflow/python/tpu/feature_column.py b/tensorflow/python/tpu/feature_column.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f7e93910c834831b5aa79440de3660f974a2310
--- /dev/null
+++ b/tensorflow/python/tpu/feature_column.py
@@ -0,0 +1,435 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===================================================================
+"""TPU Feature Column Library."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+
+from tensorflow.python.feature_column import feature_column as fc
+from tensorflow.python.feature_column import feature_column_lib as fc_lib
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.tpu import tpu
+from tensorflow.python.tpu import tpu_function
+# pylint: disable=protected-access
+
+
+_TPU_FC_TO_SCOPE = '_tpu_feature_column_scope'
+_SUPPORTED_CATEGORICAL_COLUMNS = (fc._IdentityCategoricalColumn,
+                                  fc._VocabularyFileCategoricalColumn,
+                                  fc._VocabularyListCategoricalColumn,
+                                  fc._WeightedCategoricalColumn,
+                                  fc_lib.IdentityCategoricalColumn,
+                                  fc_lib.VocabularyFileCategoricalColumn,
+                                  fc_lib.VocabularyListCategoricalColumn,
+                                  fc_lib.WeightedCategoricalColumn)
+
+
+def embedding_column(categorical_column,
+                     dimension,
+                     combiner='mean',
+                     initializer=None):
+  """TPU embedding_column for `tf.feature_column.embedding_column`.
+
+  Note that the interface for TPU embedding_column is different from the non-TPU
+  version. The following args available for the non-TPU version are NOT
+  supported: ckpt_to_load_from, tensor_name_in_ckp, max_norm and trainable.
+
+  Args:
+    categorical_column: A categorical_column returned from
+        categorical_column_with_identity,  weighted_categorical_column,
+        categorical_column_with_vocabulary_list or
+        categorical_column_with_vocabulary_file.
+    dimension: An integer specifying dimension of the embedding, must be > 0.
+    combiner: A string specifying how to reduce if there are multiple entries
+      in a single row. For more information, see
+      `tf.feature_column.embedding_column`.
+    initializer: A variable initializer function to be used in embedding
+      variable initialization. If not specified, defaults to
+      `tf.truncated_normal_initializer` with mean `0.0` and standard deviation
+      `1/sqrt(dimension)`.
+
+  Returns:
+    A  _TPUEmbeddingColumn.
+
+  Raises:
+    ValueError: if `dimension` not > 0.
+    ValueError: if `initializer` is specified but not callable.
+  """
+  if not isinstance(categorical_column, _SUPPORTED_CATEGORICAL_COLUMNS):
+    raise TypeError(
+        'categorical_column for tpu '
+        ' embedding_column must be type %s, got %s.' % (' or '.join([
+            cc.__name__ for cc in _SUPPORTED_CATEGORICAL_COLUMNS
+        ]), type(categorical_column)))
+  if (dimension is None) or (dimension < 1):
+    raise ValueError('Invalid dimension {}.'.format(dimension))
+
+  if (initializer is not None) and (not callable(initializer)):
+    raise ValueError('initializer must be callable if specified. '
+                     'Embedding of column_name: {}'.format(
+                         categorical_column.name))
+  if initializer is None:
+    initializer = init_ops.truncated_normal_initializer(
+        mean=0.0, stddev=1 / math.sqrt(dimension))
+
+  embedding_shape = categorical_column._num_buckets, dimension  # pylint: disable=protected-access
+
+  def _creator(weight_collections, scope):
+    embedding_column_layer = fc._EmbeddingColumnLayer(
+        embedding_shape=embedding_shape,
+        initializer=initializer,
+        weight_collections=weight_collections,
+        trainable=True,
+        name='embedding_column_layer')
+    return embedding_column_layer(None, scope=scope)  # pylint: disable=not-callable
+
+  column = _TPUEmbeddingColumn(
+      categorical_column=categorical_column,
+      dimension=dimension,
+      combiner=combiner,
+      layer_creator=_creator,
+      ckpt_to_load_from=None,
+      tensor_name_in_ckpt=None,
+      max_norm=None,
+      trainable=True)
+  # For Embedding column, the initializer is hidden inside the creator Fn, which
+  # is not accessiable later. So, we attach it to a speicial field. Also note
+  # that non-TPU Embedding column and non-TPU shared Embedding column handle the
+  # initializer differently. See shared_embedding_columns for details.
+  column._tpu_initializer = initializer
+  return column
+
+
+def shared_embedding_columns(categorical_columns,
+                             dimension,
+                             combiner='mean',
+                             initializer=None,
+                             shared_embedding_collection_name=None):
+  """List of dense columns that convert from sparse, categorical input."""
+  for categorical_column in categorical_columns:
+    if not isinstance(categorical_column, _SUPPORTED_CATEGORICAL_COLUMNS):
+      raise TypeError(
+          'categorical_column for tpu '
+          ' shared_embedding_columns must be type %s, got %s.' % (' or '.join([
+              cc.__name__ for cc in _SUPPORTED_CATEGORICAL_COLUMNS
+          ]), type(categorical_column)))
+  columns = fc_lib.shared_embedding_columns(
+      categorical_columns,
+      dimension,
+      combiner=combiner,
+      initializer=initializer,
+      shared_embedding_collection_name=shared_embedding_collection_name,
+      ckpt_to_load_from=None,
+      tensor_name_in_ckpt=None,
+      max_norm=None,
+      trainable=True)
+
+  # Use the initializer and shared_embedding_collection_name to create TPU
+  # version
+  initializer = columns[0].initializer
+  shared_embedding_collection_name = columns[0].shared_embedding_collection_name
+  tpu_columns = []
+
+  # Create the state (_SharedEmbeddingColumnLayer) here.
+  for categorical_column in categorical_columns:
+    column = _TPUSharedEmbeddingColumn(
+        categorical_column=categorical_column,
+        dimension=dimension,
+        combiner=combiner,
+        initializer=initializer,
+        shared_embedding_collection_name=shared_embedding_collection_name,
+        ckpt_to_load_from=None,
+        tensor_name_in_ckpt=None,
+        max_norm=None,
+        trainable=True)
+    tpu_columns.append(column)
+
+  return tpu_columns
+
+
+class _TPUBaseEmbeddingColumn(object):
+  """Base class for TPU Embedding Column."""
+
+  def __init__(self, categorical_column):
+    self._tpu_categorical_column = categorical_column
+
+  def get_combiner(self):
+    """Returns the embedding combiner."""
+    raise NotImplementedError('not implemented')
+
+  def get_embedding_table_size(self):
+    """Returns the embedding table size, tuple of vocab size and dimension."""
+    raise NotImplementedError('not implemented')
+
+  def get_feature_key_name(self):
+    """Returns the feature key name in the features dict."""
+    raise NotImplementedError('not impl')
+
+  def get_weight_key_name(self):
+    """Return the key name for weights."""
+    raise NotImplementedError('not impl')
+
+  def get_embedding_var_name(self):
+    """Returns the embedding variable name.
+
+    Feature key name and embedding variable name are usually one-to-one mapping.
+    But for shared embedding columns, it is many-to-one mapping.
+    """
+    raise NotImplementedError('not impl')
+
+  def get_initializer(self):
+    """Returns the initializer."""
+    raise NotImplementedError('not impl')
+
+  def is_categorical_column_weighted(self):
+    """Check if the categorical column of the embedding column is weighted."""
+    raise NotImplementedError('not impl')
+
+
+class _TPUEmbeddingColumn(_TPUBaseEmbeddingColumn, fc._EmbeddingColumn):
+  """Core Embedding Column."""
+
+  def __new__(cls,
+              categorical_column,
+              dimension,
+              combiner='mean',
+              layer_creator=None,
+              ckpt_to_load_from=None,
+              tensor_name_in_ckpt=None,
+              max_norm=None,
+              trainable=True):
+    # Note, args ckpt_to_load_from, tensor_name_in_ckpt, max_norm and trainable
+    # are not supported on TPU. They are solely for matching the signature of
+    # __new__ of parent class fc._EmbeddingColumn.
+    return fc._EmbeddingColumn.__new__(
+        cls,
+        categorical_column,
+        dimension,
+        combiner=combiner,
+        layer_creator=layer_creator,
+        ckpt_to_load_from=ckpt_to_load_from,
+        tensor_name_in_ckpt=tensor_name_in_ckpt,
+        max_norm=max_norm,
+        trainable=trainable)
+
+  def __init__(self,
+               categorical_column,
+               dimension,
+               combiner='mean',
+               layer_creator=None,
+               ckpt_to_load_from=None,
+               tensor_name_in_ckpt=None,
+               max_norm=None,
+               trainable=True):
+    _TPUBaseEmbeddingColumn.__init__(self, categorical_column)
+    self._key = None
+
+  def get_combiner(self):
+    return self.combiner
+
+  def get_embedding_table_size(self):
+    """Returns num_ids and width."""
+    return (self.categorical_column._num_buckets, self.dimension)
+
+  def get_feature_key_name(self):
+    """get_feature_key_name."""
+    if self.is_categorical_column_weighted():
+      return self.categorical_column.categorical_column.name
+    return self.categorical_column.name
+
+  def get_weight_key_name(self):
+    """get_weight_key_name."""
+    if self.is_categorical_column_weighted():
+      return self.categorical_column.weight_feature_key
+    return None
+
+  def get_embedding_var_name(self):
+    """get_embedding_var_name."""
+    return self.categorical_column.name
+
+  def get_initializer(self):
+    return self._tpu_initializer
+
+  def is_categorical_column_weighted(self):
+    """Check if the categorical column of the embedding column is weighted."""
+    if isinstance(
+        self.categorical_column,
+        (
+            fc._WeightedCategoricalColumn,  # pylint: disable=protected-access
+            fc_lib.WeightedCategoricalColumn)):
+      return True
+    return False
+
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+    if tpu.under_tpu_inference_context():
+      def host_computation():
+        return fc._EmbeddingColumn._get_dense_tensor(
+            self, inputs, weight_collections, trainable)
+      return tpu.outside_compilation(host_computation)
+
+    if _is_running_on_cpu():
+      return fc._EmbeddingColumn._get_dense_tensor(
+          self, inputs, weight_collections, trainable)
+
+    # TPU mode
+    # Get the embeddings from the LazyBuilder.
+    tensor = inputs.get(self.get_feature_key_name())
+
+    # Add to collection for _create_tpu_embedding_variables_and_ops
+    _record_variable_scope_and_name(self.get_embedding_var_name(),
+                                    'embedding_weights')
+
+    return tensor
+
+
+class _TPUSharedEmbeddingColumn(_TPUBaseEmbeddingColumn,
+                                fc._SharedEmbeddingColumn):
+  """Core Shared Embedding Column."""
+
+  def __new__(cls,
+              categorical_column,
+              dimension,
+              combiner='mean',
+              initializer=None,
+              shared_embedding_collection_name=None,
+              ckpt_to_load_from=None,
+              tensor_name_in_ckpt=None,
+              max_norm=None,
+              trainable=True):
+    return fc._SharedEmbeddingColumn.__new__(
+        cls,
+        categorical_column,
+        dimension,
+        combiner=combiner,
+        initializer=initializer,
+        shared_embedding_collection_name=shared_embedding_collection_name,
+        ckpt_to_load_from=ckpt_to_load_from,
+        tensor_name_in_ckpt=tensor_name_in_ckpt,
+        max_norm=max_norm,
+        trainable=trainable)
+
+  def __init__(self,
+               categorical_column,
+               dimension,
+               combiner='mean',
+               initializer=None,
+               shared_embedding_collection_name=None,
+               ckpt_to_load_from=None,
+               tensor_name_in_ckpt=None,
+               max_norm=None,
+               trainable=True):
+
+    _TPUBaseEmbeddingColumn.__init__(self, categorical_column)
+    self._key = None
+
+  def get_combiner(self):
+    return self.combiner
+
+  def get_embedding_table_size(self):
+    """Returns num_ids and width."""
+    return (self.categorical_column._num_buckets, self.dimension)
+
+  def get_feature_key_name(self):
+    """get_feature_key_name."""
+    if self.is_categorical_column_weighted():
+      return self.categorical_column.categorical_column.name
+    return self.categorical_column.name
+
+  def get_weight_key_name(self):
+    """get_weight_key_name."""
+    if self.is_categorical_column_weighted():
+      return self.categorical_column.weight_feature_key
+    return None
+
+  def get_embedding_var_name(self):
+    """get_embedding_var_name."""
+    return self.shared_embedding_collection_name
+
+  def get_initializer(self):
+    return self.initializer
+
+  def is_categorical_column_weighted(self):
+    """Check if the categorical column of the embedding column is weighted."""
+    if isinstance(
+        self.categorical_column,
+        (
+            fc._WeightedCategoricalColumn,  # pylint: disable=protected-access
+            fc_lib.WeightedCategoricalColumn)):
+      return True
+    return False
+
+  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+    if tpu.under_tpu_inference_context():
+      def host_computation():
+        return fc._SharedEmbeddingColumn._get_dense_tensor(
+            self, inputs, weight_collections, trainable)
+      return tpu.outside_compilation(host_computation)
+
+    if _is_running_on_cpu():
+      return fc._SharedEmbeddingColumn._get_dense_tensor(
+          self, inputs, weight_collections, trainable)
+
+    # TPU mode
+    # Get the embeddings from the LazyBuilder.
+    tensor = inputs.get(self.get_feature_key_name())
+
+    # Add to collection for _create_tpu_embedding_variables_and_ops
+    _record_variable_scope_and_name(
+        self.get_embedding_var_name(),
+        'embedding_weights',
+        is_shared_embedding=True)
+    return tensor
+
+
+def _record_variable_scope_and_name(embedding_var_name,
+                                    embedding_var_name_in_fc,
+                                    is_shared_embedding=False):
+  """Add embedding variable name and scope to collection."""
+  g = ops.get_default_graph()
+  collection = g.get_collection_ref(_TPU_FC_TO_SCOPE)
+  if not collection:
+    collection.append({})
+
+  var_def_dict = collection[0]
+
+  captured_scope = variable_scope.get_variable_scope()
+  captured_scope_name = captured_scope.name
+
+  if embedding_var_name in var_def_dict:
+    if (var_def_dict[embedding_var_name][0] != captured_scope_name
+        and not is_shared_embedding):
+      raise ValueError(
+          'For embedding var name {}, the variable scope name is different, '
+          'got {}; expected {}'.format(embedding_var_name,
+                                       captured_scope_name,
+                                       var_def_dict[embedding_var_name][0]))
+    if var_def_dict[embedding_var_name][1] != embedding_var_name_in_fc:
+      raise ValueError(
+          'For embedding var name {}, the embedding name is different, '
+          'got {}; expected {}'.format(embedding_var_name,
+                                       embedding_var_name_in_fc,
+                                       var_def_dict[embedding_var_name][1]))
+  else:
+    var_def_dict[embedding_var_name] = (captured_scope_name,
+                                        embedding_var_name_in_fc)
+
+
+def _is_running_on_cpu():
+  """Returns True if the current context is CPU model."""
+  return tpu_function.get_tpu_context().number_of_shards is None
diff --git a/tensorflow/python/tpu/feature_column_test.py b/tensorflow/python/tpu/feature_column_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..6feeb0198de76efc373e8be11765508220935605
--- /dev/null
+++ b/tensorflow/python/tpu/feature_column_test.py
@@ -0,0 +1,286 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===================================================================
+"""Tests for python.tpu.feature_column."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.feature_column import feature_column as fc
+from tensorflow.python.feature_column import feature_column_lib as fc_lib
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import lookup_ops
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.ops import variables as variables_lib
+from tensorflow.python.platform import test
+from tensorflow.python.tpu import feature_column as tpu_fc
+
+
+def _initialized_session():
+  sess = session.Session()
+  sess.run(variables_lib.global_variables_initializer())
+  sess.run(lookup_ops.tables_initializer())
+  return sess
+
+
+class EmbeddingColumnTest(test.TestCase):
+
+  def test_defaults(self):
+    categorical_column = fc_lib.categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    embedding_dimension = 2
+    embedding_column = tpu_fc.embedding_column(
+        categorical_column, dimension=embedding_dimension)
+    self.assertIs(categorical_column, embedding_column.categorical_column)
+    self.assertEqual(embedding_dimension, embedding_column.dimension)
+    self.assertEqual('mean', embedding_column.combiner)
+    self.assertEqual('aaa_embedding', embedding_column.name)
+    self.assertEqual('aaa_embedding', embedding_column._var_scope_name)
+    self.assertEqual((embedding_dimension,), embedding_column._variable_shape)
+    self.assertEqual({
+        'aaa': parsing_ops.VarLenFeature(dtypes.int64)
+    }, embedding_column._parse_example_spec)
+
+  def test_all_constructor_args(self):
+    categorical_column = fc_lib.categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    embedding_dimension = 2
+    embedding_column = tpu_fc.embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
+        combiner='my_combiner',
+        initializer=lambda: 'my_initializer')
+    self.assertIs(categorical_column, embedding_column.categorical_column)
+    self.assertEqual(embedding_dimension, embedding_column.dimension)
+    self.assertEqual('my_combiner', embedding_column.combiner)
+    self.assertEqual('aaa_embedding', embedding_column.name)
+    self.assertEqual('aaa_embedding', embedding_column._var_scope_name)
+    self.assertEqual((embedding_dimension,), embedding_column._variable_shape)
+    self.assertEqual({
+        'aaa': parsing_ops.VarLenFeature(dtypes.int64)
+    }, embedding_column._parse_example_spec)
+
+  def test_get_dense_tensor(self):
+    # Inputs.
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        # example 2, ids []
+        # example 3, ids [1]
+        indices=((0, 0), (1, 0), (1, 4), (3, 0)),
+        values=(2, 0, 1, 1),
+        dense_shape=(4, 5))
+
+    # Embedding variable.
+    embedding_dimension = 2
+    embedding_values = (
+        (1., 2.),  # id 0
+        (3., 5.),  # id 1
+        (7., 11.)  # id 2
+    )
+
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return embedding_values
+
+    # Expected lookup result, using combiner='mean'.
+    expected_lookups = (
+        # example 0, ids [2], embedding = [7, 11]
+        (7., 11.),
+        # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
+        (2., 3.5),
+        # example 2, ids [], embedding = [0, 0]
+        (0., 0.),
+        # example 3, ids [1], embedding = [3, 5]
+        (3., 5.),
+    )
+
+    # Build columns.
+    categorical_column = fc_lib.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column = tpu_fc.embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
+        initializer=_initializer)
+
+    # Provide sparse input and get dense result.
+    embedding_lookup = embedding_column._get_dense_tensor(
+        fc._LazyBuilder({
+            'aaa': sparse_input
+        }))
+
+    # Assert expected embedding variable and lookups.
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(('embedding_weights:0',),
+                          tuple([v.name for v in global_vars]))
+    with _initialized_session():
+      self.assertAllEqual(embedding_values, global_vars[0].eval())
+      self.assertAllEqual(expected_lookups, embedding_lookup.eval())
+
+
+class SharedEmbeddingColumnTest(test.TestCase):
+
+  def test_defaults(self):
+    categorical_column_a = fc_lib.categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    categorical_column_b = fc_lib.categorical_column_with_identity(
+        key='bbb', num_buckets=3)
+    embedding_dimension = 2
+    embedding_column_b, embedding_column_a = tpu_fc.shared_embedding_columns(
+        [categorical_column_b, categorical_column_a],
+        dimension=embedding_dimension)
+    self.assertIs(categorical_column_a, embedding_column_a.categorical_column)
+    self.assertIs(categorical_column_b, embedding_column_b.categorical_column)
+    self.assertEqual(embedding_dimension, embedding_column_a.dimension)
+    self.assertEqual(embedding_dimension, embedding_column_b.dimension)
+    self.assertEqual('mean', embedding_column_a.combiner)
+    self.assertEqual('mean', embedding_column_b.combiner)
+    self.assertIsNotNone(embedding_column_a.initializer)
+    self.assertIsNotNone(embedding_column_b.initializer)
+    self.assertEqual('aaa_bbb_shared_embedding',
+                     embedding_column_a.shared_embedding_collection_name)
+    self.assertEqual('aaa_bbb_shared_embedding',
+                     embedding_column_b.shared_embedding_collection_name)
+    self.assertEqual('aaa_shared_embedding', embedding_column_a.name)
+    self.assertEqual('bbb_shared_embedding', embedding_column_b.name)
+    self.assertEqual('aaa_bbb_shared_embedding',
+                     embedding_column_a._var_scope_name)
+    self.assertEqual('aaa_bbb_shared_embedding',
+                     embedding_column_b._var_scope_name)
+    self.assertEqual((embedding_dimension,), embedding_column_a._variable_shape)
+    self.assertEqual((embedding_dimension,), embedding_column_b._variable_shape)
+    self.assertEqual({
+        'aaa': parsing_ops.VarLenFeature(dtypes.int64)
+    }, embedding_column_a._parse_example_spec)
+    self.assertEqual({
+        'bbb': parsing_ops.VarLenFeature(dtypes.int64)
+    }, embedding_column_b._parse_example_spec)
+
+  def test_all_constructor_args(self):
+    categorical_column_a = fc_lib.categorical_column_with_identity(
+        key='aaa', num_buckets=3)
+    categorical_column_b = fc_lib.categorical_column_with_identity(
+        key='bbb', num_buckets=3)
+    embedding_dimension = 2
+    embedding_column_a, embedding_column_b = tpu_fc.shared_embedding_columns(
+        [categorical_column_a, categorical_column_b],
+        dimension=embedding_dimension,
+        combiner='my_combiner',
+        initializer=lambda: 'my_initializer',
+        shared_embedding_collection_name='var_scope_name')
+    self.assertIs(categorical_column_a, embedding_column_a.categorical_column)
+    self.assertIs(categorical_column_b, embedding_column_b.categorical_column)
+    self.assertEqual(embedding_dimension, embedding_column_a.dimension)
+    self.assertEqual(embedding_dimension, embedding_column_b.dimension)
+    self.assertEqual('my_combiner', embedding_column_a.combiner)
+    self.assertEqual('my_combiner', embedding_column_b.combiner)
+    self.assertEqual('my_initializer', embedding_column_a.initializer())
+    self.assertEqual('my_initializer', embedding_column_b.initializer())
+    self.assertEqual('var_scope_name',
+                     embedding_column_a.shared_embedding_collection_name)
+    self.assertEqual('var_scope_name',
+                     embedding_column_b.shared_embedding_collection_name)
+    self.assertEqual('aaa_shared_embedding', embedding_column_a.name)
+    self.assertEqual('bbb_shared_embedding', embedding_column_b.name)
+    self.assertEqual('var_scope_name', embedding_column_a._var_scope_name)
+    self.assertEqual('var_scope_name', embedding_column_b._var_scope_name)
+    self.assertEqual((embedding_dimension,), embedding_column_a._variable_shape)
+    self.assertEqual((embedding_dimension,), embedding_column_b._variable_shape)
+    self.assertEqual({
+        'aaa': parsing_ops.VarLenFeature(dtypes.int64)
+    }, embedding_column_a._parse_example_spec)
+    self.assertEqual({
+        'bbb': parsing_ops.VarLenFeature(dtypes.int64)
+    }, embedding_column_b._parse_example_spec)
+
+  def test_get_dense_tensor(self):
+    # Inputs.
+    vocabulary_size = 3
+    # -1 values are ignored.
+    input_a = np.array([
+        [2, -1, -1],  # example 0, ids [2]
+        [0, 1, -1]
+    ])  # example 1, ids [0, 1]
+    input_b = np.array([
+        [0, -1, -1],  # example 0, ids [0]
+        [-1, -1, -1]
+    ])  # example 1, ids []
+    input_features = {'aaa': input_a, 'bbb': input_b}
+
+    # Embedding variable.
+    embedding_dimension = 2
+    embedding_values = (
+        (1., 2.),  # id 0
+        (3., 5.),  # id 1
+        (7., 11.)  # id 2
+    )
+
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return embedding_values
+
+    # Expected lookup result, using combiner='mean'.
+    expected_lookups_a = (
+        # example 0:
+        (7., 11.),  # ids [2], embedding = [7, 11]
+        # example 1:
+        (2., 3.5),  # ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
+    )
+    expected_lookups_b = (
+        # example 0:
+        (1., 2.),  # ids [0], embedding = [1, 2]
+        # example 1:
+        (0., 0.),  # ids [], embedding = [0, 0]
+    )
+
+    # Build columns.
+    categorical_column_a = fc_lib.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    categorical_column_b = fc_lib.categorical_column_with_identity(
+        key='bbb', num_buckets=vocabulary_size)
+    embedding_column_a, embedding_column_b = tpu_fc.shared_embedding_columns(
+        [categorical_column_a, categorical_column_b],
+        dimension=embedding_dimension,
+        initializer=_initializer)
+
+    # Provide sparse input and get dense result.
+    embedding_lookup_a = embedding_column_a._get_dense_tensor(
+        fc._LazyBuilder(input_features))
+    embedding_lookup_b = embedding_column_b._get_dense_tensor(
+        fc._LazyBuilder(input_features))
+
+    # Assert expected embedding variable and lookups.
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(('embedding_weights:0',),
+                          tuple([v.name for v in global_vars]))
+    embedding_var = global_vars[0]
+    with _initialized_session():
+      self.assertAllEqual(embedding_values, embedding_var.eval())
+      self.assertAllEqual(expected_lookups_a, embedding_lookup_a.eval())
+      self.assertAllEqual(expected_lookups_b, embedding_lookup_b.eval())
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/tpu/functional.py b/tensorflow/python/tpu/functional.py
new file mode 100644
index 0000000000000000000000000000000000000000..045ec523bbbd1a16f616fbf2c3b11b66053968f5
--- /dev/null
+++ b/tensorflow/python/tpu/functional.py
@@ -0,0 +1,23 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Functional operations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.tpu.ops import tpu_ops
+
+TPUPartitionedCall = tpu_ops.tpu_partitioned_call  # pylint: disable=invalid-name
diff --git a/tensorflow/python/tpu/ops/tpu_ops.py b/tensorflow/python/tpu/ops/tpu_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..38dd2734ac201c65413f835325c6a06b7821763c
--- /dev/null
+++ b/tensorflow/python/tpu/ops/tpu_ops.py
@@ -0,0 +1,428 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Operations for TPUs."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.python.ops import gen_tpu_ops
+from tensorflow.python.ops.gen_tpu_ops import *
+# pylint: enable=wildcard-import,unused-import
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.tpu import tpu_function
+
+
+def _create_default_group_assignment():
+  num_shards = tpu_function.get_tpu_context().number_of_shards
+  if num_shards is None:
+    logging.warning(
+        "cross_replica_sum should be used within a tpu_shard_context, but "
+        "got unset number_of_shards. Assuming 1.")
+    num_shards = 1
+  group_assignment = [list(range(num_shards))]
+  return group_assignment
+
+
+def all_to_all(x,
+               concat_dimension,
+               split_dimension,
+               split_count,
+               group_assignment=None,
+               name=None):
+  """Exchange data across TPU replicas.
+
+  Args:
+    x: The local tensor.
+    concat_dimension: The dimension number to concatenate.
+    split_dimension: The dimension number to split.
+    split_count: The number of splits, this number must equal to the sub-group
+      size(group_assignment.get_shape()[1])
+    group_assignment: Optional 2d int32 lists with shape [num_groups,
+      num_replicas_per_group]. `group_assignment[i]` represents the replica
+      ids in the ith subgroup.
+    name: Optional op name.
+
+  Returns:
+    A `Tensor` which is concatenated by data from different replicas.
+  """
+  if group_assignment is None:
+    group_assignment = _create_default_group_assignment()
+  return gen_tpu_ops.all_to_all(
+      x,
+      group_assignment,
+      concat_dimension=concat_dimension,
+      split_dimension=split_dimension,
+      split_count=split_count,
+      name=name)
+
+
+@ops.RegisterGradient("AllToAll")
+def _all_to_all_grad(op, grad):
+  # The gradient of a all-to-all is also a all-to-all but the
+  # split_dimension and concat_dimension is swapped.
+  # The graident with respect to group_assignment is None.
+  return [
+      gen_tpu_ops.all_to_all(
+          grad,
+          op.inputs[1],
+          concat_dimension=op.get_attr("split_dimension"),
+          split_dimension=op.get_attr("concat_dimension"),
+          split_count=op.get_attr("split_count")), None
+  ]
+
+
+def cross_replica_sum(x, group_assignment=None, name=None):
+  """Sum the input tensor across replicas according to group_assignment.
+
+  Args:
+    x: The local tensor to the sum.
+    group_assignment: Optional 2d int32 lists with shape [num_groups,
+      num_replicas_per_group]. `group_assignment[i]` represents the replica
+      ids in the ith subgroup.
+    name: Optional op name.
+
+  Returns:
+    A `Tensor` which is summed across replicas.
+  """
+  if group_assignment is None:
+    group_assignment = _create_default_group_assignment()
+
+  return gen_tpu_ops.cross_replica_sum(x, group_assignment, name=name)
+
+
+def collective_permute(x, source_target_pairs, name=None):
+  """Permute the input tensor across replicas given source_target_pairs.
+
+  For each source_target_pair <a, b>, we send replica a's input to replica b.
+  Each replica id must only appear once in the source column. Also it must
+  only appear once in the target column.
+  For the replica id not in the target column, this op returns a zero tensor
+  with the same shape and dtype of the input x.
+
+  For example, suppose there are 4 TPU instances: `[A, B, C, D]`. Passing
+  source_target_pairs=`[[0,1],[1,2],[2,3]]` gets the outputs:
+  `[0, A, B, C]`.
+
+  Args:
+    x: The local tensor to be permuted.
+    source_target_pairs: 2d int lists with shape [num_pairs, 2].
+      source_target_pairs[i][0] represents the source replica id and
+      source_target_pairs[i][1] represents the target replica id.
+    name: Optional op name.
+
+  Returns:
+    A `Tensor` which is permuted.
+  """
+  return gen_tpu_ops.collective_permute(x, source_target_pairs, name=name)
+
+
+@ops.RegisterGradient("CollectivePermute")
+def _collective_permute_grad(op, grad):
+  # The gradient of a collective permute operation is also a collective
+  # permute, but with source/target pairs reversed. The gradient with respect
+  # to input argument `source_target_pairs` is `None`.
+  source_target_pairs = op.inputs[1][:, ::-1]
+  return [gen_tpu_ops.collective_permute(grad, source_target_pairs), None]
+
+
+@ops.RegisterGradient("CrossReplicaSum")
+def _cross_replica_sum_grad(op, grad):
+  # The gradient of a cross replica sum is also a cross-replica sum.
+  # The gradient with respect to group_assignment is None.
+  return [gen_tpu_ops.cross_replica_sum(grad, op.inputs[1]), None]
+
+
+# This extra type checking exists to give a more helpful error message in
+# the common case that uint8 and int64 values are infed. Remove when both
+# types are supported.
+
+_SUPPORTED_INFEED_DTYPES = set([
+    dtypes.bool, dtypes.int32, dtypes.int64, dtypes.bfloat16, dtypes.float32,
+    dtypes.complex64, dtypes.uint32
+])
+
+
+@ops.RegisterGradient("TPUEmbeddingActivations")
+def _embedding_activations_grad(activations_op, grad_wrt_activations):
+  """Saves the gradient of embedding activations ops in a graph collection."""
+  g = ops.get_default_graph()
+  table_id = activations_op.get_attr("table_id")
+  lookup_id = activations_op.get_attr("lookup_id")
+  table_gradients = g.get_collection_ref(
+      "tpu_embedding_gradients_table_%d" % table_id)
+
+  if not table_gradients:
+    raise RuntimeError(
+        "Gradients for TPUEmbedding have been generated in non-training mode."
+        "This is not expected. Consider putting your Optimizer.minimize code "
+        "behind the training mode condition check. For Estimator, you can "
+        "do \n\n"
+        "    if mode == tf.estimator.ModeKeys.TRAIN:\n"
+        "        train_op = opt.minimize(loss)\n"
+        "\n")
+
+  table_gradients[lookup_id] = array_ops.identity(grad_wrt_activations)
+  return [
+      # RegisterGradient requires that value be returned for all inputs. Since
+      # the first argument (tpu_gradient_variable_{table_name}) has shape [1],
+      # we will return zeros(shape=[1]). The actual gradient w.r.t. the
+      # embedding activations (grad_wrt_activations) has the same shape as the
+      # activations returned by  embedding_activations.
+      array_ops.zeros(arg.shape, dtype=dtypes.float32)
+      for arg in activations_op.inputs
+  ]
+
+
+def infeed_dequeue(dtype, shape, name=None):
+  """A placeholder op for a value that will be fed into the computation.
+
+  Args:
+    dtype: A `tf.DType`. The type of elements in the tensor.
+    shape: A `tf.TensorShape` or list of `ints`. The shape of the tensor.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` of type `dtype`.
+    A tensor that will be provided using the infeed mechanism.
+
+  Raises:
+    TypeError: If 'dtype` is not a supported infeed type.
+  """
+  if dtype not in _SUPPORTED_INFEED_DTYPES:
+    raise TypeError(
+        "{} is not a supported TPU infeed type. Supported types are: "
+        "{}".format(dtype, list(_SUPPORTED_INFEED_DTYPES)))
+
+  return gen_tpu_ops.infeed_dequeue(dtype, shape, name=name)
+
+
+# pylint: disable=redefined-outer-name
+def infeed_dequeue_tuple(dtypes, shapes, name=None):
+  """A placeholder op for values fed into the TPU simultaneously as a tuple.
+
+  Args:
+    dtypes: A list of `tf.DType`s that has length `>= 1`.
+      The element types of each element in `outputs`.
+    shapes: A list of shapes (each a `tf.TensorShape` or list of `ints`).
+      The shapes of each tensor in `outputs`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A list of `Tensor` objects of type `dtypes`.
+    A list of tensors that will be provided using the infeed mechanism.
+
+  Raises:
+    TypeError: If a type in 'dtypes` is not a supported infeed type.
+  """
+  for dtype in dtypes:
+    if dtype not in _SUPPORTED_INFEED_DTYPES:
+      raise TypeError(
+          "{} is not a supported TPU infeed type. Supported types are: "
+          "{}".format(dtype, list(_SUPPORTED_INFEED_DTYPES)))
+  return gen_tpu_ops.infeed_dequeue_tuple(dtypes, shapes, name=name)
+# pylint: enable=redefined-outer-name
+
+
+# pylint: disable=protected-access
+def send_tpu_embedding_gradients(inputs,
+                                 config,
+                                 learning_rates=None,
+                                 name=None):
+  """A placeholder op for feeding per-sample gradients to the embedding layer.
+
+  Args:
+    inputs: A TensorList of gradients with which to update embedding tables.
+        This argument has the same length and shapes as the return value of
+        RecvTPUEmbeddingActivations, but contains gradients of the model's
+        loss with respect to the embedding activations. The embedding tables
+        are updated from these gradients via the optimizers specified in the
+        TPU embedding configuration given to tpu.initialize_system.
+    config: Serialized TPUEmbeddingConfiguration proto.
+    learning_rates: A TensorList of float32 scalars, one for each dynamic
+        learning rate tag: see the comments in
+        //third_party/tensorflow/core/protobuf/tpu/
+                                             optimization_parameters.proto.
+        Multiple tables can share the same dynamic learning rate tag as
+        specified in the configuration. If the learning rates for all tables
+        are constant, this list should be empty.
+    name: A name for the operation (optional).
+
+  Returns:
+    A SendTPUEmbeddingGradients operation.
+  """
+  if learning_rates is None:
+    learning_rates = []
+  return gen_tpu_ops.send_tpu_embedding_gradients(
+      inputs=inputs, learning_rates=learning_rates, config=config, name=name)
+
+
+send_tpu_embedding_gradients.__doc__ = (
+    gen_tpu_ops.send_tpu_embedding_gradients.__doc__)
+
+
+# pylint: disable=protected-access
+def enqueue_tpu_embedding_integer_batch(batch,
+                                        device_ordinal,
+                                        mode_override=None,
+                                        name=None):
+  """A placeholder op for enqueueing embedding IDs to the TPU.
+
+  Args:
+    batch: A list of 1D tensors, one for each embedding table, containing the
+      indices into the tables.
+    device_ordinal: The TPU device to use. Should be >= 0 and less than the
+      number of TPU cores in the task on which the node is placed.
+    mode_override: A string input that overrides the mode specified in the
+      TPUEmbeddingConfiguration. Supported values are {'unspecified',
+      'inference', 'training', 'backward_pass_only'}. When set to
+      'unspecified', the mode set in TPUEmbeddingConfiguration is used,
+      otherwise mode_override is used (optional).
+    name: A name for the operation (optional).
+
+  Returns:
+    An EnqueueTPUEmbeddingIntegerBatch operation.
+  """
+  if mode_override is None:
+    mode_override = "unspecified"
+  return gen_tpu_ops.enqueue_tpu_embedding_integer_batch(
+      batch=batch,
+      device_ordinal=device_ordinal,
+      mode_override=mode_override,
+      name=name)
+
+
+enqueue_tpu_embedding_integer_batch.__doc__ = (
+    gen_tpu_ops.enqueue_tpu_embedding_integer_batch.__doc__)
+
+
+# pylint: disable=protected-access
+def enqueue_tpu_embedding_sparse_batch(sample_indices,
+                                       embedding_indices,
+                                       aggregation_weights,
+                                       device_ordinal,
+                                       combiners=None,
+                                       mode_override=None,
+                                       name=None):
+  """A placeholder op for enqueueing embedding IDs to the TPU.
+
+  Args:
+    sample_indices: A list of rank 1 Tensors specifying the training example
+      and feature to which the corresponding embedding_indices and
+      aggregation_weights values belong. sample_indices[i] must equal b * nf +
+      f, where nf is the number of features from the corresponding table, f is
+      in [0, nf), and b is in [0, batch size).
+    embedding_indices: A list of rank 1 Tensors, indices into the embedding
+      tables.
+    aggregation_weights: A list of rank 1 Tensors containing per sample --
+      i.e. per (training example, feature) -- aggregation weights.
+    device_ordinal: The TPU device to use. Should be >= 0 and less than the
+      number of TPU cores in the task on which the node is placed.
+    combiners: A list of string scalars, one for each embedding table that
+      specify how to normalize the embedding activations after weighted
+      summation. Supported combiners are 'mean', 'sum', or 'sqrtn'. It is
+      invalid to have the sum of the weights be 0 for 'mean' or the sum of the
+      squared weights be 0 for 'sqrtn'. If combiners isn't passed, the default
+      is to use 'sum' for all tables (optional).
+    mode_override: A string input that overrides the mode specified in the
+      TPUEmbeddingConfiguration. Supported values are {'unspecified',
+      'inference', 'training', 'backward_pass_only'}. When set to
+      'unspecified', the mode set in TPUEmbeddingConfiguration is used,
+      otherwise mode_override is used (optional).
+    name: A name for the operation (optional).
+
+  Returns:
+    An EnqueueTPUEmbeddingSparseBatch operation.
+  """
+  if mode_override is None:
+    mode_override = "unspecified"
+  return gen_tpu_ops.enqueue_tpu_embedding_sparse_batch(
+      sample_indices=sample_indices,
+      embedding_indices=embedding_indices,
+      aggregation_weights=aggregation_weights,
+      device_ordinal=device_ordinal,
+      combiners=combiners,
+      mode_override=mode_override,
+      name=name)
+
+
+enqueue_tpu_embedding_sparse_batch.__doc__ = (
+    gen_tpu_ops.enqueue_tpu_embedding_sparse_batch.__doc__)
+
+
+# pylint: disable=protected-access
+def enqueue_tpu_embedding_sparse_tensor_batch(sample_indices,
+                                              embedding_indices,
+                                              aggregation_weights,
+                                              table_ids,
+                                              device_ordinal,
+                                              combiners=None,
+                                              mode_override=None,
+                                              name=None):
+  """A placeholder op for enqueueing embedding IDs to the TPU.
+
+  Args:
+    sample_indices: A list of rank 1 Tensors specifying the training example
+      to which the corresponding embedding_indices and aggregation_weights
+      values belong. It corresponds to sp_ids.indices[:,0] in
+      embedding_lookup_sparse().
+    embedding_indices: A list of rank 1 Tensors, indices into the embedding
+      tables. It corresponds to sp_ids.values in embedding_lookup_sparse().
+    aggregation_weights: A list of rank 1 Tensors containing per training
+      example aggregation weights. It corresponds to sp_weights.values in
+      embedding_lookup_sparse().
+    table_ids: A list of integers specifying the identifier of the embedding
+      table (offset of TableDescriptor in the TPUEmbeddingConfiguration) to
+      lookup the corresponding input. The ith input is looked up using
+      table_ids[i]. The size of the table_ids list must be equal to that of
+      sample_indices, embedding_indices and aggregation_weights.
+    device_ordinal: The TPU device to use. Should be >= 0 and less than the
+      number of TPU cores in the task on which the node is placed.
+    combiners: A list of string scalars, one for each embedding table that
+      specify how to normalize the embedding activations after weighted
+      summation. Supported combiners are 'mean', 'sum', or 'sqrtn'. It is
+      invalid to have the sum of the weights be 0 for 'mean' or the sum of the
+      squared weights be 0 for 'sqrtn'. If combiners isn't passed, the default
+      is to use 'sum' for all tables (optional).
+    mode_override: A string input that overrides the mode specified in the
+      TPUEmbeddingConfiguration. Supported values are {'unspecified',
+      'inference', 'training', 'backward_pass_only'}. When set to
+      'unspecified', the mode set in TPUEmbeddingConfiguration is used,
+      otherwise mode_override is used (optional).
+    name: A name for the operation (optional).
+
+  Returns:
+    An EnqueueTPUEmbeddingSparseTensorBatch operation.
+  """
+  if mode_override is None:
+    mode_override = "unspecified"
+  return gen_tpu_ops.enqueue_tpu_embedding_sparse_tensor_batch(
+      sample_indices=sample_indices,
+      embedding_indices=embedding_indices,
+      aggregation_weights=aggregation_weights,
+      table_ids=table_ids,
+      device_ordinal=device_ordinal,
+      combiners=combiners,
+      mode_override=mode_override,
+      name=name)
+
+
+enqueue_tpu_embedding_sparse_tensor_batch.__doc__ = (
+    gen_tpu_ops.enqueue_tpu_embedding_sparse_tensor_batch.__doc__)
diff --git a/tensorflow/python/tpu/ops/tpu_ordinal_selector_op.py b/tensorflow/python/tpu/ops/tpu_ordinal_selector_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f2dce26cd5dd1183d51491715186f57fbe95fab
--- /dev/null
+++ b/tensorflow/python/tpu/ops/tpu_ordinal_selector_op.py
@@ -0,0 +1,20 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Operations to select TPU core to run."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/tensorflow/python/tpu/profiler/BUILD b/tensorflow/python/tpu/profiler/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..bfe79454cc16db7dedadcddb1aad20c3d7792528
--- /dev/null
+++ b/tensorflow/python/tpu/profiler/BUILD
@@ -0,0 +1,27 @@
+licenses(["notice"])  # Apache 2.0
+
+package(
+    default_visibility = [
+        "//tensorflow:__subpackages__",
+    ],
+)
+
+py_library(
+    name = "profiler",
+    srcs = ["__init__.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":tpu_profiler_analysis_pb2_grpc",
+        "//tensorflow/core/profiler:profiler_analysis_proto_py",
+        "//tensorflow/core/profiler:protos_all_py",
+        "//tensorflow/python:util",
+    ],
+)
+
+py_library(
+    name = "tpu_profiler_analysis_pb2_grpc",
+    srcs = ["tpu_profiler_analysis_pb2_grpc.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = ["//tensorflow/core/profiler:profiler_analysis_proto_py"],
+)
diff --git a/tensorflow/python/tpu/profiler/__init__.py b/tensorflow/python/tpu/profiler/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c183aaf53543f7bb38475525d1777048925ff62
--- /dev/null
+++ b/tensorflow/python/tpu/profiler/__init__.py
@@ -0,0 +1,31 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Classes for TPU trace events."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=wildcard-import,unused-import
+from tensorflow.core.profiler.trace_events_pb2 import *
+from tensorflow.core.profiler.profiler_analysis_pb2 import *
+# pylint: enable=wildcard-import,unused-import
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = ['Trace', 'Resource', 'Device', 'TraceEvent']
+
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/tpu/profiler/tpu_profiler_analysis_pb2_grpc.py b/tensorflow/python/tpu/profiler/tpu_profiler_analysis_pb2_grpc.py
similarity index 100%
rename from tensorflow/contrib/tpu/profiler/tpu_profiler_analysis_pb2_grpc.py
rename to tensorflow/python/tpu/profiler/tpu_profiler_analysis_pb2_grpc.py
diff --git a/tensorflow/python/tpu/session_support.py b/tensorflow/python/tpu/session_support.py
new file mode 100644
index 0000000000000000000000000000000000000000..3df7185be0828df13f7088689e9891f985450182
--- /dev/null
+++ b/tensorflow/python/tpu/session_support.py
@@ -0,0 +1,438 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ======================================
+"""Operations for handling session logging and shutdown notifications."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import threading
+
+import time
+from google.protobuf import text_format
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.util import event_pb2
+from tensorflow.python.client import session as session_lib
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.tpu.ops import tpu_ops
+from tensorflow.python.training import session_run_hook
+from tensorflow.python.training import training_util
+
+_WATCHDOG = None
+
+
+class CoordinatorShutdownException(Exception):
+  """Raised when the coordinator needs to shutdown."""
+  pass
+
+
+def _clone_session(session, graph=None):
+  return session_lib.Session(
+      target=session.sess_str,
+      config=session._config,  # pylint: disable=protected-access
+      graph=graph if graph else session.graph)
+
+
+def _make_heartbeat_op(session, device, request_ph):
+  """Return a heartbeat op or None if heartbeats are not supported by device."""
+  try:
+    # Test if we can connect in a isolated graph + session
+    with ops.Graph().as_default():
+      with _clone_session(session) as temp_session:
+        with ops.device(device):
+          heartbeat_op = tpu_ops.worker_heartbeat('')
+          options = config_pb2.RunOptions(timeout_in_ms=5000)
+          temp_session.run(heartbeat_op, options=options)
+  except errors.InvalidArgumentError as _:
+    logging.warning('Error running heartbeat on %s', device)
+    return None
+  except errors.DeadlineExceededError as _:
+    logging.warning('Timeout connecting to %s when testing heartbeat', device)
+    return None
+
+  # If we successfully connected and pinged the worker, go ahead and construct
+  # the operation.
+  with ops.device(device):
+    return tpu_ops.worker_heartbeat(request_ph)
+
+
+class WorkerHeartbeatManager(object):
+  """Manages the status/heartbeat monitor for a set of workers."""
+
+  def __init__(self, session, devices, heartbeat_ops, request_placeholder):
+    """Construct a new WorkerHeartbeatManager.
+
+    (Prefer using `WorkerHeartbeatManager.from_devices` when possible.)
+
+    Args:
+      session: `tf.Session`, session to use for heartbeat operations.
+      devices: `list[string]` Set of devices to connect to.
+      heartbeat_ops: `list[tf.Operation]` Heartbeat operations.
+      request_placeholder: `tf.Placeholder[String]` Placeholder used to specify
+        the WorkerHeartbeatRequest protocol buffer.
+    """
+    self._session = session
+    self._devices = devices
+    self._ops = heartbeat_ops
+    self._request_placeholder = request_placeholder
+
+  @staticmethod
+  def from_devices(session, devices):
+    """Construct a heartbeat manager for the given devices."""
+    if not devices:
+      logging.error('Trying to create heartbeat manager with no devices?')
+
+    logging.info('Creating heartbeat manager for %s', devices)
+    request_placeholder = array_ops.placeholder(
+        name='worker_heartbeat_request', dtype=dtypes.string)
+
+    heartbeat_ops = []
+    kept_devices = []
+    for device in devices:
+      heartbeat_op = _make_heartbeat_op(session, device, request_placeholder)
+      if heartbeat_op is not None:
+        kept_devices.append(device)
+        heartbeat_ops.append(heartbeat_op)
+      else:
+        logging.warning('Heartbeat support not available for %s', device)
+
+    return WorkerHeartbeatManager(session, kept_devices, heartbeat_ops,
+                                  request_placeholder)
+
+  def num_workers(self):
+    return len(self._devices)
+
+  def configure(self, message):
+    """Configure heartbeat manager for all devices.
+
+    Args:
+      message: `event_pb2.WorkerHeartbeatRequest`
+    Returns: `None`
+    """
+    logging.info('Configuring worker heartbeat: %s',
+                 text_format.MessageToString(message))
+    self._session.run(self._ops,
+                      {self._request_placeholder: message.SerializeToString()})
+
+  def ping(self, request=None, timeout_in_ms=5000):
+    """Ping all workers, returning the parsed status results."""
+    if request is None:
+      request = event_pb2.WorkerHeartbeatRequest()
+
+    options = config_pb2.RunOptions(timeout_in_ms=timeout_in_ms)
+    results = self._session.run(
+        self._ops,
+        feed_dict={self._request_placeholder: request.SerializeToString()},
+        options=options)
+    parsed_results = [
+        event_pb2.WorkerHeartbeatResponse.FromString(res_pb)
+        for res_pb in results
+    ]
+    logging.debug('Ping results: %s', parsed_results)
+    return parsed_results
+
+  def lame_workers(self):
+    """Ping all workers, returning manager containing lame workers (or None)."""
+    ping_results = self.ping()
+    lame_workers = []
+
+    for ping_response, device, op in zip(ping_results, self._devices,
+                                         self._ops):
+      if ping_response.health_status != event_pb2.OK:
+        lame_workers.append((device, op))
+
+    if not lame_workers:
+      return None
+
+    bad_devices, bad_ops = zip(*lame_workers)
+    return WorkerHeartbeatManager(self._session, bad_devices, bad_ops,
+                                  self._request_placeholder)
+
+  def __repr__(self):
+    return 'HeartbeatManager(%s)' % ','.join(self._devices)
+
+  def shutdown(self, timeout_ms=10000):
+    """Shutdown all workers after `shutdown_timeout_secs`."""
+    logging.info('Shutting down %s.', self)
+    req = event_pb2.WorkerHeartbeatRequest(
+        watchdog_config=event_pb2.WatchdogConfig(timeout_ms=timeout_ms),
+        shutdown_mode=event_pb2.WAIT_FOR_COORDINATOR)
+    self.configure(req)
+
+    # Wait for workers to shutdown.  This isn't strictly required
+    # but it avoids triggering multiple checkpoints with the same lame worker.
+    logging.info('Waiting %dms for worker shutdown.', timeout_ms)
+    time.sleep(timeout_ms / 1000)
+
+
+def all_worker_devices(session):
+  """Return a list of devices for each worker in the system."""
+  devices = session.list_devices()
+  return [
+      device.name
+      for device in devices
+      if ':CPU:' in device.name and 'coordinator' not in device.name
+  ]
+
+
+class WatchdogManager(threading.Thread):
+  """Configures worker watchdog timer and handles periodic pings.
+
+  Usage:
+    # Ping workers every minute, shutting down workers if they haven't received
+    # a ping after 1 hour.
+    watchdog_manager = WatchdogManager(
+      ping_interval=60, shutdown_timeout=3600
+    )
+
+    # Use as a context manager, resetting watchdog on context exit:
+    with watchdog_manager:
+      session.run(...)
+
+    # Or setup globally; watchdog will remain active until program exit.
+    watchdog_manager.configure_and_run()
+  """
+
+  def __init__(self,
+               session,
+               devices=None,
+               ping_interval=60,
+               shutdown_timeout=3600):
+    """Initialize a watchdog manager.
+
+    Args:
+      session: Session connected to worker devices.  A cloned session and graph
+        will be created for managing worker pings.
+      devices: Set of devices to monitor.  If none, all workers will be
+        monitored.
+      ping_interval: Time, in seconds, between watchdog pings.
+      shutdown_timeout: Time, in seconds, before watchdog timeout.
+    """
+    threading.Thread.__init__(self)
+    self.ping_interval = ping_interval
+    self.shutdown_timeout = shutdown_timeout
+    self.daemon = True
+    self._config = session._config  # pylint: disable=protected-access
+    self._target = session.sess_str
+    self._running = False
+    self._devices = devices
+
+    self._graph = None
+    self._session = None
+    self._worker_manager = None
+
+  def _reset_manager(self):
+    """Reset the graph, session and worker manager."""
+    self._graph = ops.Graph()
+    self._session = session_lib.Session(
+        target=self._target,
+        graph=self._graph,
+        config=self._config,
+    )
+
+    if self._devices is None:
+      self._devices = all_worker_devices(self._session)
+
+    with self._graph.as_default():
+      self._worker_manager = WorkerHeartbeatManager.from_devices(
+          self._session, self._devices)
+
+    self._worker_manager.configure(
+        event_pb2.WorkerHeartbeatRequest(
+            watchdog_config=event_pb2.WatchdogConfig(
+                timeout_ms=self.shutdown_timeout * 1000,),
+            shutdown_mode=event_pb2.WAIT_FOR_COORDINATOR))
+
+  def configure_and_run(self):
+    logging.info(
+        'Enabling watchdog timer with %d second timeout '
+        'and %d second ping interval.', self.shutdown_timeout,
+        self.ping_interval)
+    self._reset_manager()
+    self._running = True
+    self.start()
+
+  def stop(self):
+    logging.info('Stopping worker watchdog.')
+    self._worker_manager.configure(
+        event_pb2.WorkerHeartbeatRequest(
+            watchdog_config=event_pb2.WatchdogConfig(timeout_ms=-1,),
+            shutdown_mode=event_pb2.NOT_CONFIGURED))
+    self._running = False
+    self.join()
+
+  def __enter__(self):
+    self.configure_and_run()
+
+  def __exit__(self, exc_type, exc_val, exc_tb):
+    self.stop()
+
+  def run(self):
+    # Don't fetch logs or adjust timing: just ping the watchdog.
+    #
+    # If we hit an exception, reset our session as it is likely broken.
+    while self._running:
+      try:
+        self._worker_manager.ping(request=None)
+        time.sleep(self.ping_interval)
+      except errors.OpError as e:
+        # Catch any TF errors that occur so we don't stop sending heartbeats
+        logging.debug('Caught error while sending heartbeat: %s', e)
+        self._reset_manager()
+
+
+def start_worker_watchdog(session,
+                          devices=None,
+                          ping_interval=60,
+                          shutdown_timeout=3600):
+  """Start global worker watchdog to shutdown workers on coordinator exit."""
+  global _WATCHDOG
+  if _WATCHDOG is None:
+    # Ensure we can send a few pings before we timeout!
+    ping_interval = min(shutdown_timeout / 10., ping_interval)
+    _WATCHDOG = WatchdogManager(session, devices, ping_interval,
+                                shutdown_timeout)
+    _WATCHDOG.configure_and_run()
+
+
+class GracefulShutdownHook(session_run_hook.SessionRunHook):
+  """Session hook that watches for shutdown events.
+
+  If a shutdown is indicated, `saver.save(checkpoint_prefix)` is executed, and a
+  SystemShutdown exception is raised to terminate the main session.  If `saver`
+  is None the `SAVERS` collection will be read to find a saver.
+
+  `on_shutdown_hooks` is an optional list of functions that should be called
+  after checkpointing.  The function is called with (`run_context`,
+  `all_workers`, `lame_workers`).
+
+  If `heartbeat_group` is not specified, it will default to all CPU workers
+  in the system.
+  """
+
+  def __init__(self, checkpoint_prefix, saver=None, on_shutdown_hooks=None):
+    self._saver = saver
+    self._checkpoint_prefix = checkpoint_prefix
+    self._on_shutdown_hooks = on_shutdown_hooks if on_shutdown_hooks else []
+
+    # Worker heartbeats are managed independently of the main training graph.
+    self._graph = ops.Graph()
+    self._workers = None
+    self._session = None
+    self._heartbeat_supported = False
+
+  def after_create_session(self, training_session, coord):  # pylint: disable=unused-argument
+    # N.B. We have to pull the global step here to avoid it being unavailable
+    # at checkpoint time; the graph has been frozen at that point.
+    if training_util.get_global_step() is None and self.saver() is not None:
+      raise ValueError(
+          'Saver defined but no global step.  Run `get_or_create_global_step()`'
+          ' in your model definition to allow checkpointing.')
+
+    with self._graph.as_default():
+      logging.info('Installing graceful shutdown hook.')
+      self._session = _clone_session(training_session, self._graph)
+      self._workers = WorkerHeartbeatManager.from_devices(
+          self._session, all_worker_devices(self._session))
+      self._heartbeat_supported = self._workers.num_workers() > 0
+      if self._heartbeat_supported:
+        self._workers.configure(
+            event_pb2.WorkerHeartbeatRequest(
+                shutdown_mode=event_pb2.WAIT_FOR_COORDINATOR))
+      else:
+        logging.warn(
+            'No workers support hearbeats. Failure handling will be disabled.')
+
+  def saver(self):
+    if self._saver:
+      return self._saver
+
+    savers = ops.get_collection(ops.GraphKeys.SAVERS)
+    if not savers:
+      return None
+
+    if not isinstance(savers, list):
+      return savers
+
+    if len(savers) > 1:
+      logging.error(
+          'Multiple savers in the SAVERS collection.  On-demand checkpointing '
+          'will be disabled. Pass an explicit `saver` to the constructor to '
+          'override this behavior.')
+      return None
+
+    return savers[0]
+
+  def after_run(self, run_context, run_values):
+    del run_values
+
+    if not self._heartbeat_supported:
+      return
+
+    lame_workers = self._workers.lame_workers()
+    if lame_workers:
+      logging.info('ShutdownHook: lame workers found: %s', lame_workers)
+
+      if self.saver():
+        logging.info('ShutdownHook: saving checkpoint to %s',
+                     self._checkpoint_prefix)
+        self.saver().save(
+            run_context.session,
+            self._checkpoint_prefix,
+            global_step=training_util.get_global_step(),
+            write_state=True,
+        )
+      else:
+        logging.info('ShutdownHook: no Saver defined.')
+
+      for fn in self._on_shutdown_hooks:
+        fn(run_context, self._workers, lame_workers)
+
+
+class RestartComputation(object):
+  """Restart the entire computation.
+
+  This hook shuts down all workers and returns control to the top-level by
+  throwing a CoordinatorShutdownException.
+  """
+
+  def __init__(self, timeout_ms=10000):
+    self.timeout_ms = timeout_ms
+
+  def __call__(self, run_context, all_workers, lame_workers):
+    del run_context, lame_workers
+    all_workers.shutdown(timeout_ms=self.timeout_ms)
+
+    logging.info('Terminating coordinator.')
+    raise CoordinatorShutdownException()
+
+
+class ShutdownLameWorkers(object):
+  """Shutdown lamed workers.
+
+  Processing will continue normally (typically by waiting for the down
+  workers to be restarted).
+  """
+
+  def __init__(self, timeout_ms=10000):
+    self.timeout_in_ms = timeout_ms
+
+  def __call__(self, run_context, all_workers, lame_workers):
+    lame_workers.shutdown(timeout_ms=self.timeout_in_ms)
diff --git a/tensorflow/python/tpu/tensor_tracer.py b/tensorflow/python/tpu/tensor_tracer.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d41d8768a3acbfd3521e04967b59e356c0f2943
--- /dev/null
+++ b/tensorflow/python/tpu/tensor_tracer.py
@@ -0,0 +1,1638 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ========================================================================
+"""A utility to trace tensor values on TPU."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import os.path
+import re
+import sys
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import graph_io
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import control_flow_util
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import logging_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.tpu import tpu
+from tensorflow.python.tpu.ops import tpu_ops
+
+_TRACER_LOG_PREFIX = ' [>>>TT>>>]'
+_DEVICE_TYPE_TPU = 'tpu'
+_DEVICE_TYPE_CPU = 'cpu'
+_TRACE_MODE_NAN_INF = 'nan-inf'
+_TRACE_MODE_PART_TENSOR = 'part-tensor'
+_TRACE_MODE_PART_TENSOR_SIZE = 3
+_TRACE_MODE_FULL_TENSOR = 'full-tensor'
+_TRACE_MODE_NORM = 'norm'
+_TRACE_MODE_MAX_ABS = 'max-abs'
+_SUBMODE_BRIEF = 'brief'
+_SUBMODE_DETAILED = 'detailed'
+_REASON_OUTSIDE_OP_RANGE = 'not-traced-outside-op-range'
+_REASON_UNSAFE_OP = 'not-traced-unsafe-op'
+_REASON_WHILELOOP_OP = 'not-traced-special-whileloop-op'
+_REASON_UNSAFE_SCALAR = 'not-traced-unsafe-scalar'
+_REASON_LESS_INTERESTING_OP = 'not-traced-less-interesting-op'
+_REASON_DEVICE_MISMATCH = 'not-traced-device-mismatch'
+_REASON_DYNAMIC_SHAPE = 'not-traced-dynamic-shape'
+_REASON_SCALAR_GET_TRACED = 'traced-scalar'
+_REASON_TENSOR_GET_TRACED = 'traced-tensor'
+_REASON_USER_INCLUDED = 'traced-user-included'
+_REASON_USER_EXCLUDED = 'not-traced-user-excluded'
+_REASON_NOT_EXECUTED = 'not-traced-not-in-exec-path'
+_REASON_NON_NUMERIC_TENSOR = 'not-traced-non-numeric-tensor'
+_REASON_FEEDS_WHILELOOP_OP = 'not-traced-feeds-special-whileloop-op'
+_MARKER_SECTION_BEGIN = '!!!!!!! section-begin:'
+_MARKER_SECTION_END = '!!!!!!! section-end:'
+_SECTION_NAME_CONFIG = 'configuration'
+_SECTION_NAME_REASON = 'reason'
+_SECTION_NAME_OP_LIST = 'op-list'
+_SECTION_NAME_TENSOR_LIST = 'tensor-list'
+_SECTION_NAME_CACHE_INDEX_MAP = 'cache-index-map'
+_SECTION_NAME_GRAPH = 'graph'
+_FIELD_NAME_VERSION = 'version:'
+_FIELD_NAME_DEVICE = 'device:'
+_FIELD_NAME_TRACE_MODE = 'trace-mode:'
+_FIELD_NAME_SUBMODE = 'submode:'
+_FIELD_NAME_NUM_REPLICAS = 'num-replicas:'
+_FIELD_NAME_NUM_REPLICAS_PER_HOST = 'num-replicas-per-host:'
+_FIELD_NAME_NUM_HOSTS = 'num-hosts:'
+_FIELD_NAME_NUM_OPS = 'number-of-ops:'
+_FIELD_NAME_NUM_TENSORS = 'number-of-tensors:'
+_FIELD_NAME_NUM_CACHE_INDICES = 'number-of-indices:'
+_FIELD_NAME_TOPOLOGICAL_SORT_SUCCEED = 'topological-sort-succeed:'
+_FLAGS_ENV_VAR = 'TENSOR_TRACER_FLAGS'
+_FLAG_SINGLE_QUOTE_PAT = re.compile(r"\s*--([^=]+)='([^']*)'")
+_FLAG_DOUBLE_QUOTE_PAT = re.compile(r'\s*--([^=]+)="([^"]*)"')
+_FLAG_NO_QUOTE_PAT = re.compile(r'\s*--([^=]+)=(\S*)')
+_FLAG_NO_EQUAL_PAT = re.compile(r'\s*--([^=]+)\s*')
+_FLAG_NAME_ENABLE = 'enable'
+_FLAG_NAME_TRACE_MODE = 'trace_mode'
+_FLAG_NAME_USE_COMPACT_TRACE = 'compact_trace'
+_FLAG_NAME_SUBMODE = 'submode'
+_FLAG_NAME_INCLUDE_LESS_INTERESTING_OPS = 'include_less_interesting_ops'
+_FLAG_NAME_EXCLUDED_OPNAMES = 'excluded_opnames'
+_FLAG_NAME_EXCLUDED_OPTYPES = 'excluded_optypes'
+_FLAG_NAME_INCLUDED_OPNAMES = 'included_opnames'
+_FLAG_NAME_INCLUDED_OPTYPES = 'included_optypes'
+_FLAG_NAME_TRACE_DIR = 'trace_dir'
+_FLAG_NAME_REPORT_FILE = 'report_file'
+_FLAG_NAME_USE_TEST_UNDECLARED_OUTPUTS_DIR = 'use_test_undeclared_outputs_dir'
+_FLAG_NAME_OP_RANGE = 'op_range'
+# Folder to dump the pre (before tensor tracer updates) and post graphs (after
+# tensor tracer updates).
+_FLAG_DUMP_BEFORE_AFTER_GRAPHS = 'dump_graphs'
+_OP_RANGE_PAT = re.compile(r'(\d+):(\d+)')
+_OUTPUT_STREAM_ESCAPE = 'file://'
+_TEST_UNDECLARED_OUTPUTS_DIR_ENV_VAR = 'TEST_UNDECLARED_OUTPUTS_DIR'
+_TENSOR_TRACER_COLLECTION = 'tensor_tracer_variables'
+_TENSOR_TRACER_CHECKPOINT = 'tensor_tracer_checkpoint'
+_TRACE_FILE_NAME = 'trace.all'
+_COMPACT_TRACE_FILE_PREFIX = 'compact_trace.'
+_COMPACT_TRACE_ENTRY_INIT_VALUE = -1.0
+_TENSOR_TRACER_STORAGE = 'tensor_tracer_storage'
+_TENSOR_VALUES_CACHE = 'tensor_values_cache'
+_REPLICA_ID_TAG = '#replica-id: '
+
+
+def tensor_tracepoint(tensor, checkpoint_name):
+  """Adds a checkpoint with the given checkpoint name for the given tensor.
+
+  The tensor will be added to the list of tensors that will be traced by the
+  tensor tracer.
+
+  Args:
+     tensor: the tensor object for which the tracing is requested.
+     checkpoint_name: a string name for the checkpoint. This name has to be a
+     unique name if used within model comparison. The tensors that have the same
+     checkpoint identifier is compared in model comparison.
+  Returns:
+    The provided tensor.
+  """
+
+  tensor.graph.get_collection(_TENSOR_TRACER_COLLECTION)
+  tensor.graph.add_to_collection(_TENSOR_TRACER_COLLECTION,
+                                 (tensor, checkpoint_name))
+  return tensor
+
+
+def keras_layer_tracepoint(layer, checkpoint_name):
+  """An interface for adding the tensor outputs of a keras layer.
+
+  Encapsulates tensor_tracepoint.
+
+  Args:
+     layer: A keras layer.
+     checkpoint_name: a string name for the checkpoint. This name has to be a
+     unique name if used within model comparison. The tensors that have the same
+     checkpoint identifier is compared in model comparison.
+
+  Returns:
+    The provided layer.
+  """
+  try:
+    outputs = layer.output
+    if tensor_util.is_tensor(outputs):
+      tensor_tracepoint(outputs, '%s' % (checkpoint_name))
+    else:
+      idx = 0
+      for output_tensor in outputs:
+        if tensor_util.is_tensor(outputs):
+          tensor_tracepoint(output_tensor, '%s_%d' % (checkpoint_name, idx))
+        idx += 1
+  except AttributeError:
+    pass
+  except RuntimeError:
+    pass
+  return layer
+
+
+def _trace_files_need_precreated(output_dir):
+  """Return True if trace files must be pre-created by users."""
+
+  if not output_dir.startswith('/'):
+    return False
+  if len(output_dir) < 5:
+    return False
+  if output_dir[2] != 'n':
+    return False
+  if output_dir[3] != 's':
+    return False
+  if output_dir[1] != 'c':
+    return False
+  if output_dir[4] != '/':
+    return False
+  return True
+
+
+def _get_tensor_values_cache(graph=None):
+  """Returns the variable that implements tensor-value caching."""
+
+  graph = graph or ops.get_default_graph()
+  collection = graph.get_collection(_TENSOR_TRACER_STORAGE)
+  if len(collection) == 1:
+    return collection[0]
+  elif not collection:
+    raise RuntimeError('%s has not been created'%_TENSOR_VALUES_CACHE)
+  else:
+    raise RuntimeError('Multiple %s created'%_TENSOR_VALUES_CACHE)
+  return None
+
+
+def _create_tensor_values_cache(graph, num_tensors):
+  """Creates a variable as the cache to store intermediate tensor values."""
+  graph = graph or ops.get_default_graph()
+  # Create in proper graph and base name_scope.
+  with graph.as_default() as g, g.name_scope(None):
+    return variable_scope.get_variable(
+        _TENSOR_VALUES_CACHE,
+        shape=[num_tensors],
+        dtype=dtypes.float32,
+        initializer=init_ops.constant_initializer(
+            _COMPACT_TRACE_ENTRY_INIT_VALUE),
+        trainable=False,
+        use_resource=True,
+        collections=[_TENSOR_TRACER_STORAGE, ops.GraphKeys.GLOBAL_VARIABLES])
+
+
+class TensorTracer(object):
+  """A software construct for tracing tensor values in a TF graph on TPU.
+
+  This utility is disabled by default. It can be enabled by setting
+  the TENSOR_TRACER_FLAGS env variable as:
+    export TENSOR_TRACER_FLAGS="--enable=1"
+  If it is enabled, it will trace the output tensor values of
+  selected Ops in the graph. It has two outputs: (1) the traces and (2)
+  a report. The traces are dumped to a specified local file on the TPU
+  host. The report is printed to the log.info of the TPU job.
+  By passing options via the env variable, users can change:
+     (1) the trace mode (e.g., detecting NaN/Inf, printing partial or
+         full tensor values)
+     (2) which Ops to be traced (via op.name or op.type)
+     (3) output trace file path.
+  """
+  # The set of graphs that are rewritten by tensor tracer.
+  _traced_graphs = set()
+  @staticmethod
+  def _match_next_flag(flags, pos):
+    """Returns the match for the next TensorTracer flag.
+
+    Args:
+       flags: a string that contains the flags.
+       pos: where in flags to start the search.
+
+    Returns:
+       A pair where the first element is the regular-expression
+       match found and the second element indicates if the match
+       has a value.
+    """
+
+    match = _FLAG_DOUBLE_QUOTE_PAT.match(flags, pos)
+    if match:
+      return match, True
+    match = _FLAG_SINGLE_QUOTE_PAT.match(flags, pos)
+    if match:
+      return match, True
+    match = _FLAG_NO_QUOTE_PAT.match(flags, pos)
+    if match:
+      return match, True
+    match = _FLAG_NO_EQUAL_PAT.match(flags, pos)
+    if match:
+      # The flag is found but is not given a value.
+      return match, False
+    # The flag is not found.
+    return None, False
+
+  @staticmethod
+  def validate_flag_names():
+    """Validates if the TensorTrace flags passed are valid."""
+    valid_flag_names = [_FLAG_NAME_ENABLE, _FLAG_NAME_TRACE_MODE,
+                        _FLAG_NAME_USE_COMPACT_TRACE,
+                        _FLAG_NAME_SUBMODE,
+                        _FLAG_NAME_EXCLUDED_OPNAMES,
+                        _FLAG_NAME_EXCLUDED_OPTYPES,
+                        _FLAG_NAME_INCLUDED_OPNAMES,
+                        _FLAG_NAME_INCLUDED_OPTYPES,
+                        _FLAG_NAME_TRACE_DIR,
+                        _FLAG_NAME_REPORT_FILE,
+                        _FLAG_NAME_USE_TEST_UNDECLARED_OUTPUTS_DIR,
+                        _FLAG_NAME_INCLUDE_LESS_INTERESTING_OPS,
+                        _FLAG_NAME_OP_RANGE,
+                        _FLAG_DUMP_BEFORE_AFTER_GRAPHS]
+    tensor_tracer_flags = os.environ.get(_FLAGS_ENV_VAR)
+    if not tensor_tracer_flags:
+      return
+    pos = 0
+    while True:
+      match, _ = TensorTracer._match_next_flag(tensor_tracer_flags, pos)
+      if not match:
+        break
+      flag_name = match.group(1)
+      if flag_name not in valid_flag_names:
+        raise ValueError(
+            'The flag name "%s" passed via the environment variable "%s" '
+            'is invalid. Valid flag names are:'
+            '\n%s'%(flag_name, _FLAGS_ENV_VAR, valid_flag_names))
+      pos = match.end()
+
+  @staticmethod
+  def print_flag_values():
+    """Prints all TensorTracer flags passed via environment variables."""
+
+    tensor_tracer_flags = os.environ.get(_FLAGS_ENV_VAR)
+    if not tensor_tracer_flags:
+      return 'Env variable "%s" is not set'%_FLAGS_ENV_VAR
+    result = 'Env variable "%s" is set to "%s"\n'%(_FLAGS_ENV_VAR,
+                                                   tensor_tracer_flags)
+    result += 'Individual flag value:\n'
+    pos = 0
+    while True:
+      match, has_value = TensorTracer._match_next_flag(
+          tensor_tracer_flags, pos)
+      if not match:
+        break
+      flag_name = match.group(1)
+      if has_value:
+        flag_value = match.group(2)
+      else:
+        flag_value = None
+      result += '  %s: %s\n'%(flag_name, flag_value)
+      pos = match.end()
+    result += '\n'
+    return result
+
+  @staticmethod
+  def get_flag_value(wanted_flag_name):
+    """Returns the value of a TensorTracer flags.
+
+    Args:
+      wanted_flag_name: the name the the flag we are looking for.
+
+    Returns:
+      A pair where the first element indicates if the flag is
+      found and the second element is the value of the flag.
+
+    Raises:
+      RuntimeError: If supposedly deadcode is reached.
+    """
+
+    tensor_tracer_flags = os.getenv(_FLAGS_ENV_VAR)
+    if not tensor_tracer_flags:
+      return False, None
+    pos = 0
+    while True:
+      match, has_value = TensorTracer._match_next_flag(
+          tensor_tracer_flags, pos)
+      if not match:
+        return False, None
+      flag_name = match.group(1)
+      if has_value:
+        flag_value = match.group(2)
+      else:
+        flag_value = None
+      if flag_name == wanted_flag_name:
+        return True, flag_value
+      pos = match.end()
+    raise RuntimeError('Should not reach here.')
+
+  @staticmethod
+  def flag_value_to_re_list(flag_name):
+    """Converts list of strings to compiled RE."""
+
+    re_list = []
+    found, flag_value = TensorTracer.get_flag_value(flag_name)
+    if not found or not flag_value:
+      return re_list
+    list_of_values = flag_value.split()
+    for v in list_of_values:
+      r = re.compile(v)
+      re_list.append(r)
+    return re_list
+
+  @staticmethod
+  def _is_flag_on(flag_name):
+    """Returns True if the given flag is on."""
+
+    found, flag_value = TensorTracer.get_flag_value(flag_name)
+    if not found:
+      return False
+    if flag_value is None:
+      return True
+    # Depends on the flag value.
+    flag_value = flag_value.lower()
+    enabled = flag_value in ['1', 't', 'true', 'y', 'yes']
+    return enabled
+
+  @staticmethod
+  def is_enabled():
+    """Returns True if TensorTracer is enabled."""
+
+    return TensorTracer._is_flag_on(_FLAG_NAME_ENABLE)
+
+  @staticmethod
+  def use_test_undeclared_outputs_dir():
+    """Decides the output directory of the report and trace files.
+
+    Args:
+       None.
+
+    Returns:
+       True if the output files should be written to the
+       test-undeclared-outputs-directory defined via an
+       env variable.
+    """
+
+    return TensorTracer._is_flag_on(
+        _FLAG_NAME_USE_TEST_UNDECLARED_OUTPUTS_DIR)
+
+  @staticmethod
+  def use_compact_trace():
+    return TensorTracer._is_flag_on(
+        _FLAG_NAME_USE_COMPACT_TRACE)
+
+  @staticmethod
+  def check_device_type(device_type):
+    """Checks if the given device type is valid."""
+
+    if device_type not in [_DEVICE_TYPE_TPU, _DEVICE_TYPE_CPU]:
+      raise ValueError('Invalid device_type "%s"'%device_type)
+
+  @staticmethod
+  def check_trace_mode(trace_mode):
+    """Checks if the given trace mode is valid."""
+
+    valid_trace_modes = [_TRACE_MODE_NAN_INF, _TRACE_MODE_PART_TENSOR,
+                         _TRACE_MODE_FULL_TENSOR, _TRACE_MODE_NORM,
+                         _TRACE_MODE_MAX_ABS]
+    if trace_mode not in valid_trace_modes:
+      raise ValueError('Invalid trace mode "%s" given to the Tensor_Tracer.'
+                       'Valid trace modes are: %s'%(trace_mode,
+                                                    valid_trace_modes))
+
+  @staticmethod
+  def check_submode(submode):
+    """Checks if the given submode is valid."""
+
+    if not submode:
+      return
+    valid_submodes = [_SUBMODE_DETAILED, _SUBMODE_BRIEF]
+    if submode not in valid_submodes:
+      raise ValueError('Invalid submode "%s" given to the Tensor_Tracer.'
+                       'Valid submodes are: %s'%(submode,
+                                                 valid_submodes))
+
+  @staticmethod
+  def loop_cond_op(op):
+    return op.type in ('LoopCond', 'RefLoopCond')
+
+  @staticmethod
+  def while_loop_op(op):
+    """Returns true if op is one of the special ops of in a while loop.
+
+    Args:
+       op: A tf.Operation.
+
+    Returns:
+       True if the given op is one of [Switch, Merge, Enter, Exit,
+       NextIteration, LoopCond], which are all building blocks for TF while
+       loops.
+    """
+    return  (control_flow_util.IsLoopSwitch(op) or
+             control_flow_util.IsLoopMerge(op) or
+             control_flow_util.IsLoopEnter(op) or
+             control_flow_util.IsLoopExit(op) or
+             TensorTracer.loop_cond_op(op) or
+             op.type in ('RefNextIteration', 'NextIteration'))
+
+  @staticmethod
+  def unsafe_op(op):
+    """Returns True if this op is not safe to be traced."""
+
+    if control_flow_util.IsInCond(op):
+      return True
+    # Reasons for not including following op types:
+    #    Assign: cause incorrect result with CPU tracing.
+    if op.type in ['Assign']:
+      return True
+    return False
+
+  @staticmethod
+  def device_mismatch(device_type, op):
+    if device_type == _DEVICE_TYPE_TPU:
+      # pylint: disable=protected-access
+      return tpu._TPU_REPLICATE_ATTR not in op.node_def.attr
+      # pylint: enable=protected-access
+    return False
+
+  @staticmethod
+  def unsafe_scalar_trace(op):
+    """Return true if scalar output tensor from Op is not safe to be traced."""
+
+    # Tracing the following causes cycle in the graph on TPU.
+    if op.type in ['LoopCond', 'Enter', 'Merge', 'Const',
+                   'Switch', 'Less', 'ReadVariableOp']:
+      return True
+    # Tracing the following will cause casting-issue
+    # with the norm tracing mode or other compilation issues on CPU.
+    if op.type in ['VarHandleOp', 'IteratorToStringHandle',
+                   'IteratorGetNext', 'OneShotIterator',
+                   'IteratorV2', 'MakeIterator',
+                   'BatchDatasetV2', 'MapDataset',
+                   'FixedLengthRecordDataset', 'TakeDataset', 'ZipDataset',
+                   'Placeholder', 'PlaceholderWithDefault', 'StridedSlice']:
+      return True
+    return False
+
+  @staticmethod
+  def less_interesting_op(op):
+    """Returns True if the given Op is not an interesting one to be traced."""
+
+    found, _ = TensorTracer.get_flag_value(
+        _FLAG_NAME_INCLUDE_LESS_INTERESTING_OPS)
+    if found:
+      # users force to include all ops.
+      return False
+    # Following ops are highly unlikey to cause bugs.
+    return op.type in ['Const', 'Identity', 'Cast', 'Shape']
+
+  @staticmethod
+  def reason(op_idx, details):
+    """Returns reason why the Op at op_idx is traced or not."""
+
+    return '%d %s'%(op_idx, details)
+
+  @staticmethod
+  def topological_sort(g):
+    """Performs topological sort on the given graph.
+
+    Args:
+       g: the graph.
+
+    Returns:
+       A pair where the first element indicates if the topological
+       sort succeeded (True if there is no cycle found; False if a
+       cycle is found) and the second element is either the sorted
+       list of nodes or the cycle of nodes found.
+    """
+
+    def visit(op, cycle, permanently_marked_ops,
+              temporarily_marked_ops, sorted_ops):
+      """Recursively visits all Ops in a graph.
+
+      Args:
+         op: the current Op being visited.
+         cycle: a cycle of Ops found.
+         permanently_marked_ops: the set of Ops that were already visited.
+         temporarily_marked_ops: the set of Ops that we have visited during
+                                 the current descent.
+         sorted_ops: the list of Ops sorted in topological order.
+      """
+
+      if cycle:
+        return
+      if op in permanently_marked_ops:
+        return
+      if op in temporarily_marked_ops:
+        cycle = temporarily_marked_ops
+        return
+      temporarily_marked_ops.add(op)
+      for i in range(len(op.outputs)):
+        out_tensor = op.outputs[i]
+        for consumer_op in out_tensor.consumers():
+          visit(consumer_op, cycle, permanently_marked_ops,
+                temporarily_marked_ops, sorted_ops)
+      # pylint: disable=protected-access
+      for ctrl_output_op in op._control_outputs:
+        # pylint: enable=protected-access
+        visit(ctrl_output_op, cycle, permanently_marked_ops,
+              temporarily_marked_ops, sorted_ops)
+      temporarily_marked_ops.remove(op)
+      permanently_marked_ops.add(op)
+      sorted_ops.insert(0, op)
+
+    graph_cycle = set([])
+    sorted_ops = []
+    permanently_marked_ops = set([])
+    temporarily_marked_ops = set([])
+    unsorted_ops = g.get_operations()
+    for op in unsorted_ops:
+      visit(op, graph_cycle, permanently_marked_ops,
+            temporarily_marked_ops, sorted_ops)
+    if graph_cycle:
+      return (False, graph_cycle)
+    else:
+      assert len(unsorted_ops) == len(sorted_ops)
+      return (True, sorted_ops)
+
+  @staticmethod
+  def _make_op_and_tensor_maps(op_list):
+    """Creates various maps and lists from op_list.
+
+    Args:
+       op_list: a list of Ops
+
+    Returns:
+       opname_idx_map: a map from Op's name to its index in op_list.
+       tensor_list: a list of output tensors of the Ops in op_list.
+       tensorname_idx_map: a map from output tensor name to its index
+                           in tensor_list.
+    """
+
+    opname_idx_map = {}
+    tensor_list = []
+    tensorname_idx_map = {}
+    for op_id, op in enumerate(op_list):
+      if op.name in opname_idx_map:
+        raise ValueError('Duplicated Op name: %s'%op.name)
+      opname_idx_map[op.name] = op_id
+      for output_tensor in op.outputs:
+        if output_tensor.name not in tensorname_idx_map:
+          tensor_list.append(output_tensor)
+          tensorname_idx_map[output_tensor.name] = len(tensor_list)-1
+    return (opname_idx_map, tensor_list, tensorname_idx_map)
+
+  def __init__(self):
+    """Initializes a TensorTracer.
+
+    Sets the various member fields from the flags (if given) or the defaults.
+    """
+    self._version = 'use-outside-compilation'
+    self._device_type = None
+    TensorTracer.validate_flag_names()
+    found, self._trace_mode = TensorTracer.get_flag_value(_FLAG_NAME_TRACE_MODE)
+    if not found or not self._trace_mode:
+      self._trace_mode = _TRACE_MODE_NAN_INF
+    TensorTracer.check_trace_mode(self._trace_mode)
+    found, self._submode = TensorTracer.get_flag_value(_FLAG_NAME_SUBMODE)
+    if not found or not self._submode:
+      self._submode = _SUBMODE_DETAILED
+    TensorTracer.check_submode(self._submode)
+    self._part_tensor_size = _TRACE_MODE_PART_TENSOR_SIZE
+    self._instrument_records = {}
+    self._set_trace_dir()
+    self._set_report_file()
+    self._set_op_range()
+    self._set_excluded_opnames()
+    self._set_excluded_optypes()
+    self._set_included_opnames()
+    self._set_included_optypes()
+    self._num_replicas = None
+    self._num_replicas_per_host = None
+    self._num_hosts = None
+    self._replica_id = None
+    _, self._graph_dump_path = TensorTracer.get_flag_value(
+        _FLAG_DUMP_BEFORE_AFTER_GRAPHS)
+
+  def _add_replica_id_to_graph(self):
+    """Adds nodes for computing the replica ID to the graph."""
+
+    if self._num_replicas:
+      with ops.control_dependencies(None):
+        # Uses None as dependency to run outside of TPU graph rewrites.
+        self._replica_id = tpu_ops.tpu_replicated_input(
+            list(range(self._num_replicas)),
+            name='tt_replica_id')
+    else:
+      self._replica_id = 'unknown'
+
+  def _set_trace_dir(self):
+    found, self._trace_dir = TensorTracer.get_flag_value(_FLAG_NAME_TRACE_DIR)
+    if found and self._trace_dir \
+       and TensorTracer.use_test_undeclared_outputs_dir():
+      raise ValueError('Cannot not use --%s and --%s at the same time'
+                       %(_FLAG_NAME_TRACE_DIR,
+                         _FLAG_NAME_USE_TEST_UNDECLARED_OUTPUTS_DIR))
+    if TensorTracer.use_test_undeclared_outputs_dir():
+      self._trace_dir = os.environ.get(_TEST_UNDECLARED_OUTPUTS_DIR_ENV_VAR)
+
+  def _set_report_file(self):
+    """Sets the path of the output report file."""
+
+    found, self._report_file_path = TensorTracer.get_flag_value(
+        _FLAG_NAME_REPORT_FILE)
+    if found and self._report_file_path \
+       and TensorTracer.use_test_undeclared_outputs_dir():
+      if os.path.isabs(self._report_file_path):
+        raise ValueError('If use_test_undeclared_outputs_dir is set,'
+                         'report_file_path cannot be an absolute path (%s)'
+                         %self._report_file_path)
+      outputs_dir = os.environ.get(_TEST_UNDECLARED_OUTPUTS_DIR_ENV_VAR)
+      self._report_file_path = os.path.join(outputs_dir,
+                                            self._report_file_path)
+    if not self._report_file_path:
+      self._report_file = None
+      return
+    try:
+      self._report_file = gfile.Open(self._report_file_path, 'w')
+    except IOError as e:
+      raise e
+
+  def _close_report_file(self):
+    if self._report_file:
+      self._report_file.close()
+
+  def _set_op_range(self):
+    """Sets the index range of the Ops that we will consider tracing."""
+
+    found, op_range = TensorTracer.get_flag_value(_FLAG_NAME_OP_RANGE)
+    if not found or not op_range:
+      self._op_range = (-1, -1)  # this means including all ops.
+      return
+    match = _OP_RANGE_PAT.match(op_range)
+    if not match:
+      self._op_range = (-1, -1)  # this means including all ops.
+      return
+    self._op_range = (int(match.group(1)), int(match.group(2)))
+
+  def _inside_op_range(self, idx):
+    """Return True if the given index is inside the selected range."""
+
+    if idx < self._op_range[0]:
+      return False
+    return self._op_range[1] < 0 or idx <= self._op_range[1]
+
+  def _set_excluded_opnames(self):
+    self._excluded_opname_re_list = TensorTracer.flag_value_to_re_list(
+        _FLAG_NAME_EXCLUDED_OPNAMES)
+
+  def _set_excluded_optypes(self):
+    self._excluded_optype_re_list = TensorTracer.flag_value_to_re_list(
+        _FLAG_NAME_EXCLUDED_OPTYPES)
+
+  def _set_included_opnames(self):
+    self._included_opname_re_list = TensorTracer.flag_value_to_re_list(
+        _FLAG_NAME_INCLUDED_OPNAMES)
+
+  def _set_included_optypes(self):
+    self._included_optype_re_list = TensorTracer.flag_value_to_re_list(
+        _FLAG_NAME_INCLUDED_OPTYPES)
+
+  def _is_user_included_op(self, op):
+    for opname_re in self._included_opname_re_list:
+      if opname_re.match(op.name):
+        return True
+    for optype_re in self._included_optype_re_list:
+      if optype_re.match(op.type):
+        return True
+    return False
+
+  def _is_user_excluded_op(self, op):
+    for opname_re in self._excluded_opname_re_list:
+      if opname_re.match(op.name):
+        return True
+    for optype_re in self._excluded_optype_re_list:
+      if optype_re.match(op.type):
+        return True
+    return False
+
+  def _use_tensor_values_cache(self):
+    """Returns True if immediate tensors should be first saved to a cache."""
+
+    if self._trace_mode not in set([_TRACE_MODE_NAN_INF,
+                                    _TRACE_MODE_NORM, _TRACE_MODE_MAX_ABS]):
+      return False
+    if self._trace_dir and _trace_files_need_precreated(self._trace_dir):
+      return True
+    if TensorTracer.use_compact_trace():
+      return True
+    return False
+
+  def _save_tensor_value_to_cache_op(self, graph, cache_idx, updates):
+    """Returns an Op that will save the given updates to an entry in the cache."""
+
+    cache = _get_tensor_values_cache(graph)
+    indices = constant_op.constant([cache_idx])
+    return state_ops.scatter_update(cache, indices, updates).op
+
+  def _write_report(self, content):
+    """Writes the given content to the report."""
+
+    line = '%s %s'%(_TRACER_LOG_PREFIX, content)
+    if self._report_file:
+      self._report_file.write(line)
+    else:
+      logging.info(line)
+
+  def _write_config_section(self):
+    """Writes the config section of the report."""
+
+    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_CONFIG))
+    self._write_report('%s %s\n'%(_FIELD_NAME_VERSION, self._version))
+    self._write_report('%s %s\n'%(_FIELD_NAME_DEVICE, self._device_type))
+    self._write_report('%s %s\n'%(_FIELD_NAME_TRACE_MODE, self._trace_mode))
+    self._write_report('%s %s\n'%(_FIELD_NAME_SUBMODE, self._submode))
+    self._write_report('%s %s\n'%(_FIELD_NAME_NUM_REPLICAS, self._num_replicas))
+    self._write_report('%s %s\n'%(_FIELD_NAME_NUM_REPLICAS_PER_HOST,
+                                  self._num_replicas_per_host))
+    self._write_report('%s %s\n'%(_FIELD_NAME_NUM_HOSTS, self._num_hosts))
+    self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_CONFIG))
+
+  def _write_reason_section(self):
+    """Writes the reason section of the report."""
+
+    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_REASON))
+    for key in sorted(self._instrument_records):
+      self._write_report('"%s" %s\n'%(key, self._instrument_records[key]))
+    self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_REASON))
+
+  def _write_op_list_section(self, op_list):
+    """Writes the Op-list section of the report."""
+
+    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_OP_LIST))
+    self._write_report('%s %d\n'%(_FIELD_NAME_NUM_OPS, len(op_list)))
+    for i in range(0, len(op_list)):
+      op = op_list[i]
+      line = '%d "%s" %s'%(i, op.name, op.type)
+      for out_tensor in op.outputs:
+        if out_tensor.name not in self._tensorname_idx_map:
+          raise ValueError(
+              'out_tensor %s is not in tensorname_idx_map'%out_tensor.name)
+        line += ' %d'%self._tensorname_idx_map[out_tensor.name]
+      line += '\n'
+      self._write_report(line)
+    self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_OP_LIST))
+
+  def _write_tensor_list_section(self, tensor_list, opname_idx_map):
+    """Writes the tensor-list section of the report."""
+
+    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN,
+                                  _SECTION_NAME_TENSOR_LIST))
+    self._write_report('%s %d\n'%(_FIELD_NAME_NUM_TENSORS, len(tensor_list)))
+    for i in range(0, len(tensor_list)):
+      tensor = tensor_list[i]
+      line = '%d "%s"'%(i, tensor.name)
+      for consumer_op in tensor.consumers():
+        if consumer_op.name not in opname_idx_map:
+          raise ValueError(
+              'consumer_op %s is not in opname_idx_map'%consumer_op.name)
+        line += ' %d'%opname_idx_map[consumer_op.name]
+      line += '\n'
+      self._write_report(line)
+    self._write_report('%s %s\n'%(_MARKER_SECTION_END,
+                                  _SECTION_NAME_TENSOR_LIST))
+
+  def _write_cache_index_map_section(self):
+    """Writes the mapping from cache index to tensor index to the report."""
+
+    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN,
+                                  _SECTION_NAME_CACHE_INDEX_MAP))
+    self._write_report('%s %d\n'%(_FIELD_NAME_NUM_CACHE_INDICES,
+                                  len(self._cache_idx_to_tensor_idx)))
+    for cache_idx in range(0, len(self._cache_idx_to_tensor_idx)):
+      tensor_idx = self._cache_idx_to_tensor_idx[cache_idx]
+      line = '%d %d\n'%(cache_idx, tensor_idx)
+      self._write_report(line)
+    self._write_report('%s %s\n'%(_MARKER_SECTION_END,
+                                  _SECTION_NAME_CACHE_INDEX_MAP))
+
+  def _write_graph_section(self, succeed, sorted_or_cycle):
+    """Writes the graph section of the report."""
+
+    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN, _SECTION_NAME_GRAPH))
+    self._write_report('%s %s\n'%(_FIELD_NAME_TOPOLOGICAL_SORT_SUCCEED,
+                                  succeed))
+    l = list(sorted_or_cycle)
+    for i in range(0, len(l)):
+      self._write_report('%d "%s"\n'%(i, l[i].name))
+    self._write_report('%s %s\n'%(_MARKER_SECTION_END, _SECTION_NAME_GRAPH))
+
+  def _preprocess_traced_tensor(self, tensor):
+    """Computes NAN/Norm/Max on TPUs before sending to CPU.
+
+    Args:
+      tensor: The tensor to be traced.
+    Returns:
+      A tensor that should be input to the trace_function.
+    Raises:
+      RuntimeError: If the trace mode is invalid.
+    """
+
+    def _detect_nan_inf(tensor):
+      """Trace function for detecting any NaN/Inf in the tensor."""
+
+      if tensor.dtype.is_floating:
+        mask = math_ops.reduce_any(
+            gen_math_ops.logical_or(
+                gen_math_ops.is_nan(tensor), gen_math_ops.is_inf(tensor)))
+        output_tensor = control_flow_ops.cond(mask,
+                                              lambda: constant_op.constant(1.0),
+                                              lambda: constant_op.constant(0.0))
+      else:
+        output_tensor = constant_op.constant(0.0)
+      # The shape has to be 1. Set it if it does not have the information.
+      output_tensor = array_ops.reshape(output_tensor, [1])
+      return output_tensor
+
+    def _show_norm(tensor):
+      tensor = math_ops.cast(tensor, dtypes.float32)
+      output_tensor = linalg_ops.norm(tensor)
+      # The shape has to be 1. Set it if it does not have the information.
+      output_tensor = array_ops.reshape(output_tensor, [1])
+      return output_tensor
+
+    def _show_max_abs(tensor):
+      tensor = math_ops.cast(tensor, dtypes.float32)
+      output_tensor = math_ops.reduce_max(math_ops.abs(tensor))
+      zero = constant_op.constant(0, dtypes.float32)
+      output_tensor = gen_math_ops.maximum(zero, output_tensor)
+      # The shape has to be 1. Set it if it does not have the information.
+      output_tensor = array_ops.reshape(output_tensor, [1])
+      return output_tensor
+
+    if self._trace_mode == _TRACE_MODE_NAN_INF:
+      return _detect_nan_inf(tensor)
+    if self._trace_mode == _TRACE_MODE_PART_TENSOR:
+      return tensor
+    if self._trace_mode == _TRACE_MODE_FULL_TENSOR:
+      return tensor
+    if self._trace_mode == _TRACE_MODE_NORM:
+      return _show_norm(tensor)
+    if self._trace_mode == _TRACE_MODE_MAX_ABS:
+      return _show_max_abs(tensor)
+    raise RuntimeError(
+        'Tensor trace fun for %s is not yet implemented' % self._trace_mode)
+
+  def _make_tensor_trace_fun(self, tensor_name):
+    """Makes the tensor tracing function called by outside compilation.
+
+    Args:
+      tensor_name: name of the tensor being traced.
+
+    Returns:
+      A function to be passed as the first argument to outside compilation.
+
+    Raises:
+      RuntimeError: If the trace mode is invalid.
+    """
+
+    def _print_tensor(tensor_name, num_elements, tensor, output_tensor):
+      """Prints a tensor value to a file.
+
+      Args:
+        tensor_name: name of the tensor being traced.
+        num_elements: number of elements to print (-1 means print all).
+        tensor: the tensor needs to be returned.
+        output_tensor: the tensor needs to be printed.
+
+      Returns:
+        The same tensor passed via the "tensor" argument.
+
+      Raises:
+        ValueError: If tensor_name is not already in
+                    self._tensorname_idx_map.
+      """
+
+      if self._submode == _SUBMODE_BRIEF:
+        if tensor_name not in self._tensorname_idx_map:
+          raise ValueError(
+              'Tensor name %s is not in the tensorname_idx_map'%tensor_name)
+        msg = '%d'%self._tensorname_idx_map[tensor_name]
+      else:
+        msg = '"%s"'%tensor_name
+
+      if self._trace_dir:
+        output_path = os.path.join(self._trace_dir, _TRACE_FILE_NAME)
+        output_stream = _OUTPUT_STREAM_ESCAPE + output_path
+      else:
+        output_stream = sys.stderr
+      return logging_ops.print_v2(msg, array_ops.shape(output_tensor),
+                                  '@', self._replica_id,
+                                  '\n', output_tensor, '\n',
+                                  summarize=num_elements,
+                                  output_stream=output_stream)
+
+    def _show_part_tensor(tensor):
+      """Trace function for printing part of the tensor."""
+
+      return _print_tensor(tensor_name, self._part_tensor_size,
+                           tensor, tensor)
+
+    def _show_full_tensor(tensor):
+      """Trace function for printing the entire tensor."""
+
+      return _print_tensor(tensor_name, -1, tensor, tensor)
+
+    if self._trace_mode == _TRACE_MODE_PART_TENSOR:
+      return _show_part_tensor
+    # The input tensor has a shape of "[1]" for _TRACE_MODE_NAN_INF,
+    # _TRACE_MODE_NORM, and _TRACE_MODE_MAX_ABS, as related computations are
+    # performed within TPUs and only their results are transferred to CPU.
+    # Simply, print the full tensor for these trace modes.
+    if self._trace_mode in [
+        _TRACE_MODE_NAN_INF, _TRACE_MODE_NORM, _TRACE_MODE_FULL_TENSOR,
+        _TRACE_MODE_MAX_ABS
+    ]:
+      return _show_full_tensor
+
+    raise RuntimeError('Tensor trace fun for %s is not yet implemented'
+                       %self._trace_mode)
+
+  def _skip_op(self, op_id, op, user_included, user_excluded,
+               in_exec_path=True):
+    """Returns True if we should not trace Op."""
+
+    if TensorTracer.while_loop_op(op):
+      self._instrument_records[op.name] = TensorTracer.reason(
+          op_id, _REASON_WHILELOOP_OP)
+      return True
+    if TensorTracer.unsafe_op(op):
+      self._instrument_records[op.name] = TensorTracer.reason(
+          op_id, _REASON_UNSAFE_OP)
+      return True
+    if TensorTracer.device_mismatch(self._device_type, op):
+      self._instrument_records[op.name] = TensorTracer.reason(
+          op_id, _REASON_DEVICE_MISMATCH)
+      return True
+    if not in_exec_path:
+      self._instrument_records[op.name] = TensorTracer.reason(
+          op_id, _REASON_NOT_EXECUTED)
+      return True
+
+    if not self._inside_op_range(op_id):
+      self._instrument_records[op.name] = TensorTracer.reason(
+          op_id, _REASON_OUTSIDE_OP_RANGE)
+      return True
+    if TensorTracer.less_interesting_op(op):
+      self._instrument_records[op.name] = TensorTracer.reason(
+          op_id, _REASON_LESS_INTERESTING_OP)
+      return True
+    if user_included:
+      self._instrument_records[op.name] = TensorTracer.reason(
+          op_id, _REASON_USER_INCLUDED)
+      return False
+    if user_excluded:
+      self._instrument_records[op.name] = TensorTracer.reason(
+          op_id, _REASON_USER_EXCLUDED)
+      return True
+    return False
+
+  def _skip_tensor(self, op_id, out_tensor, user_included,
+                   user_excluded):
+    """Returns True if we should not trace out_tensor."""
+
+    # Skips a tensor if the tensor has a non-numeric type.
+    #   Note: we cannot use check_ops.is_numeric_tensor(out_tensor)
+    #         because it also excludes tensors with dtypes, bool, and
+    #         float32_ref, which we actually want to trace.
+    non_numeric_tensor_types = set([dtypes.variant, dtypes.resource,
+                                    dtypes.string])
+    if out_tensor.dtype in non_numeric_tensor_types:
+      self._instrument_records[out_tensor.name] = TensorTracer.reason(
+          op_id, _REASON_NON_NUMERIC_TENSOR)
+      return True
+    # Skip a tensor if it feeds a special while loop op.
+    if [consumer for consumer in out_tensor.consumers() if
+        TensorTracer.while_loop_op(consumer)]:
+      self._instrument_records[out_tensor.name] = TensorTracer.reason(
+          op_id, _REASON_FEEDS_WHILELOOP_OP)
+      return True
+    if user_included:
+      self._instrument_records[out_tensor.name] = TensorTracer.reason(
+          op_id, _REASON_USER_INCLUDED)
+      return False
+    if user_excluded:
+      self._instrument_records[out_tensor.name] = TensorTracer.reason(
+          op_id, _REASON_USER_EXCLUDED)
+      return True
+    if not out_tensor.get_shape().is_fully_defined():
+      # If trace mode is nan-inf, norm or max, then the tensor will be reduced
+      # to a scalar before the outside compilation call.
+      if self._trace_mode in [
+          _TRACE_MODE_NAN_INF, _TRACE_MODE_NORM, _TRACE_MODE_MAX_ABS
+      ]:
+        self._instrument_records[out_tensor.name] = TensorTracer.reason(
+            op_id, _REASON_TENSOR_GET_TRACED)
+        return False
+      else:
+        self._instrument_records[out_tensor.name] = TensorTracer.reason(
+            op_id, _REASON_DYNAMIC_SHAPE)
+        return True
+    rank = len(out_tensor.shape)
+    if rank < 1:
+      # scalar
+      if TensorTracer.unsafe_scalar_trace(out_tensor.op):
+        self._instrument_records[out_tensor.name] = TensorTracer.reason(
+            op_id, _REASON_UNSAFE_SCALAR)
+        return True
+      else:
+        self._instrument_records[out_tensor.name] = TensorTracer.reason(
+            op_id, _REASON_SCALAR_GET_TRACED)
+        return False
+    else:
+      # tensor
+      self._instrument_records[out_tensor.name] = TensorTracer.reason(
+          op_id, _REASON_TENSOR_GET_TRACED)
+      return False
+
+  def _filter_execution_path_operations(self, operations, fetches):
+    """Returns the set of ops in the execution path to compute given fetches."""
+
+    # If no fetch provided, then return all operations.
+    if fetches is None:
+      return set(operations)
+    # Convert to list, if a single element is provided.
+    if not isinstance(fetches, (list, tuple)):
+      fetches = [fetches]
+    # If a tensor is given as fetch, convert it to op.
+    op_fetches = []
+    for fetch in fetches:
+      if isinstance(fetch, ops.Operation):
+        op_fetches.append(fetch)
+      elif isinstance(fetch, ops.Tensor):
+        op_fetches.append(fetch.op)
+      else:
+        raise RuntimeError('Given fetch:%s is neither a tensor nor an op.'
+                           %fetch)
+
+    execution_path_operations = set(op_fetches)
+    traverse_stack = list(op_fetches)
+    while True:
+      if not traverse_stack:
+        break
+      head_op = traverse_stack.pop()
+      input_ops = [tensor_input.op for tensor_input in head_op.inputs]
+      input_ops.extend(head_op.control_inputs)
+
+      for input_op in input_ops:
+        if input_op not in execution_path_operations:
+          # Filter out loop condition operations, tracing them causes a cycle.
+          # Trace only the loop-body.
+          if TensorTracer.loop_cond_op(input_op):
+            continue
+          execution_path_operations.add(input_op)
+          traverse_stack.append(input_op)
+    return execution_path_operations
+
+  def _determine_traced_tensors(self, graph, ops_in_exec_path):
+    """Determines the tensors that will be traced."""
+
+    self._traced_tensorname_to_cache_idx_map = {}
+    self._cache_idx_to_tensor_idx = []
+    operations = graph.get_operations()
+    checkpoint_operations = self._get_checkpoints(graph)
+    for op_id, op in enumerate(operations):
+      if checkpoint_operations and op.name not in checkpoint_operations:
+        continue
+      user_included = self._is_user_included_op(op)
+      user_excluded = self._is_user_excluded_op(op)
+      in_exec_path = op in ops_in_exec_path
+      if self._skip_op(op_id, op, user_included, user_excluded, in_exec_path):
+        continue
+      for i in range(len(op.outputs)):
+        out_tensor = op.outputs[i]
+        if self._skip_tensor(op_id, out_tensor, user_included,
+                             user_excluded):
+          continue
+        tensor_name = out_tensor.name
+        if tensor_name in self._traced_tensorname_to_cache_idx_map:
+          raise ValueError(
+              'Tensor name %s should not be already in '
+              'traced_tensorname_to_cache_idx_map'%tensor_name)
+        if tensor_name not in self._tensorname_idx_map:
+          raise ValueError(
+              'Tensor name %s is not in the tensorname_idx_map'%tensor_name)
+        tensor_idx = self._tensorname_idx_map[tensor_name]
+        cache_idx = len(self._traced_tensorname_to_cache_idx_map)
+        self._traced_tensorname_to_cache_idx_map[tensor_name] = cache_idx
+        self._cache_idx_to_tensor_idx.append(tensor_idx)
+        if len(self._traced_tensorname_to_cache_idx_map) != len(
+            self._cache_idx_to_tensor_idx):
+          raise RuntimeError('len(self._traced_tensorname_to_cache_idx_map) != '
+                             'len(self._cache_idx_to_tensor_idx')
+
+  def _check_trace_files(self):
+    """Checks if any requirements for trace files are satisfied."""
+
+    if not self._trace_dir:
+      # traces will be written to stderr. No need to check trace files.
+      return
+    if _trace_files_need_precreated(self._trace_dir):
+      for replica_id in range(0, self._num_replicas):
+        trace_file_path = os.path.join(
+            self._trace_dir,
+            _COMPACT_TRACE_FILE_PREFIX) + '%d'%replica_id
+        if not gfile.Exists(trace_file_path):
+          raise RuntimeError(
+              '%s must be pre-created with the '
+              'appropriate properties.'%trace_file_path)
+    else:
+      if not gfile.Exists(self._trace_dir):
+        gfile.MkDir(self._trace_dir)
+        if not gfile.Exists(self._trace_dir):
+          raise RuntimeError('Failed to create %s'%self._trace_dir)
+
+  def _pre_tracing(self, graph, fetches):
+    """Work needs to be done prior to TPU or CPU tracing."""
+
+    self._check_trace_files()
+    operations = graph.get_operations()
+    (opname_idx_map, tensor_list, self._tensorname_idx_map) = (
+        TensorTracer._make_op_and_tensor_maps(operations))
+    self._write_config_section()
+    self._write_op_list_section(operations)
+    self._write_tensor_list_section(tensor_list, opname_idx_map)
+    # Filter out the operations that won't be executed.
+    # if fetches=None, then ops_in_exec_path = set(operations)
+    ops_in_exec_path = self._filter_execution_path_operations(operations,
+                                                              fetches)
+    self._determine_traced_tensors(graph, ops_in_exec_path)
+    self._write_cache_index_map_section()
+    # Does the topological sort before adding any nodes to the graph.
+    (succeed, sorted_or_cycle) = TensorTracer.topological_sort(graph)
+    if self._use_tensor_values_cache():
+      _create_tensor_values_cache(graph,
+                                  len(self._cache_idx_to_tensor_idx))
+    return (ops_in_exec_path, succeed, sorted_or_cycle)
+
+  def _post_tracing(self, succeed, sorted_or_cycle):
+    """Work needs to be done after TPU or CPU tracing."""
+
+    self._write_reason_section()
+    self._write_graph_section(succeed, sorted_or_cycle)
+    self._close_report_file()
+
+  def _get_checkpoints(self, graph):
+    """Returns the list of Ops that produce the tensors traced with API.
+
+    Args:
+      graph: the graph of Ops.
+
+    Returns:
+      A set of operation names which should be traced.
+    """
+
+    self._write_report('%s %s\n'%(_MARKER_SECTION_BEGIN,
+                                  _TENSOR_TRACER_CHECKPOINT))
+    checkpoint_operations = set()
+    tensor_tracer_variables = graph.get_collection(_TENSOR_TRACER_COLLECTION)
+    for (tensor, checkpoint_name) in tensor_tracer_variables:
+      self._write_report('%s %s\n'%(tensor.name, checkpoint_name))
+      checkpoint_operations.add(tensor.op.name)
+    self._write_report('%s %s\n'%(_MARKER_SECTION_END,
+                                  _TENSOR_TRACER_CHECKPOINT))
+    return checkpoint_operations
+
+  def _generate_flush_cache_op(self, graph, start_replica, on_tpu):
+    """Generates an Op that will flush the cache to file.
+
+    Args:
+      graph: the graph of Ops
+      start_replica: the ID of the first replica being flushed by this Op.
+      on_tpu: if the graph is executed on TPU.
+
+    Returns:
+      The Op to flush the cache to file.
+    """
+    def _make_flush_fun(replica_id):
+      """Makes a function for flushing the cache for the given replica."""
+
+      def _fun():
+        """A function that flushes the cache to a file."""
+
+        def _flush_fun(cache):
+          """Flushes the cache to a file."""
+
+          if isinstance(replica_id, str):
+            replica_id_str = replica_id
+          else:
+            replica_id_str = '%d'%replica_id
+          if self._trace_dir:
+            output_path = os.path.join(self._trace_dir,
+                                       _COMPACT_TRACE_FILE_PREFIX) \
+                                       + replica_id_str
+            output_stream = _OUTPUT_STREAM_ESCAPE + output_path
+          else:
+            output_stream = sys.stderr
+          new_step_line = _REPLICA_ID_TAG + replica_id_str
+          print_op = logging_ops.print_v2(
+              new_step_line, '\n',
+              cache, '\n',
+              summarize=-1,
+              output_stream=output_stream)
+          with ops.control_dependencies([print_op]):
+            return constant_op.constant(0).op
+
+        cache = _get_tensor_values_cache(graph)
+        if on_tpu:
+          flush_op = tpu.outside_compilation(_flush_fun, cache.value())
+        else:
+          flush_op = _flush_fun(cache.value())
+        with ops.control_dependencies([flush_op]):
+          reset_value = constant_op.constant(_COMPACT_TRACE_ENTRY_INIT_VALUE,
+                                             dtype=cache.dtype,
+                                             shape=cache.shape)
+          assign_op = state_ops.assign(cache, reset_value).op
+          with ops.control_dependencies([assign_op]):
+            return flush_op.outputs[0]
+
+      return _fun
+
+    def _f(replica_id):
+      return _make_flush_fun(replica_id)
+    def _eq(x):
+      return math_ops.equal(x, self._replica_id)
+    def _do_nothing():
+      return constant_op.constant(0)
+
+    return control_flow_ops.case({\
+                                  _eq(start_replica): _f(start_replica), \
+                                  _eq(start_replica+1): _f(start_replica+1), \
+                                  _eq(start_replica+2): _f(start_replica+2), \
+                                  _eq(start_replica+3): _f(start_replica+3), \
+                                  _eq(start_replica+4): _f(start_replica+4), \
+                                  _eq(start_replica+5): _f(start_replica+5), \
+                                  _eq(start_replica+6): _f(start_replica+6), \
+                                  _eq(start_replica+7): _f(start_replica+7), \
+    },
+                                 default=_do_nothing,
+                                 exclusive=True).op
+
+  def _flush_tensor_values_cache(self, graph, tensor_fetches, op_fetches,
+                                 on_tpu):
+    """Flushes the intermediate tensor values in the graph to the cache.
+
+    Args:
+      graph: the graph of Ops
+      tensor_fetches: list of tensor results returned by the model_fn.
+      op_fetches: list of ops that are returned by the model_fn, e.g., train_op.
+      on_tpu: if the graph is executed on TPU.
+
+    Returns:
+      An identical copy of tensor_fetches.
+    """
+    # Add a dependency to op and tensor fetches to make sure that all tracing
+    # ops are executed before flushing trace results.
+    with ops.control_dependencies(op_fetches +
+                                  [tensor.op for tensor in tensor_fetches]):
+      flush_cache_op_list = []
+      for host in range(self._num_hosts):
+        start_replica = host * 8
+        flush_op = self._generate_flush_cache_op(graph, start_replica, on_tpu)
+        flush_cache_op_list.append(flush_op)
+      return control_flow_ops.tuple(tensor_fetches,
+                                    control_inputs=flush_cache_op_list)
+
+  def _process_tensor_fetches(self, tensor_fetches):
+    """Check that tensor_fetches is not empty and have valid tensors."""
+    # If none or empty list.
+    if tensor_fetches is None:
+      raise RuntimeError('tensor_fetches provided to tensor_tracer cannot be '
+                         'None.')
+    if not isinstance(tensor_fetches, (list, tuple)):
+      tensor_fetches = [tensor_fetches]
+    elif not tensor_fetches:
+      raise RuntimeError('tensor_fetches provided to tensor_tracer cannot be '
+                         'empty list.')
+    fetches = []
+    for fetch in tensor_fetches:
+      if isinstance(fetch, ops.Tensor):
+        fetches.append(fetch)
+      else:
+        raise RuntimeError('Given tensor_fetch:%s is not a tensor.' % fetch)
+    return fetches
+
+  def _process_op_fetches(self, op_fetches):
+    """Check that op_fetches have valid ops."""
+    if op_fetches is None:
+      return []
+
+    if not isinstance(op_fetches, (list, tuple)):
+      op_fetches = [op_fetches]
+
+    fetches = []
+    for fetch in op_fetches:
+      if isinstance(fetch, ops.Operation):
+        fetches.append(fetch)
+      else:
+        logging.warning('Ignoring the given op_fetch:%s, which is not an op.' %
+                        fetch)
+    return fetches
+
+  def _convert_fetches_to_input_format(self, input_fetches, current_fetches):
+    """Changes current_fetches' format, so that it matches input_fetches."""
+    if isinstance(input_fetches, ops.Tensor):
+      if len(current_fetches) != 1:
+        raise RuntimeError('Tensor tracer input/output fetches do not match.')
+      return current_fetches[0]
+    else:
+      if len(current_fetches) != len(current_fetches):
+        raise RuntimeError('Tensor tracer input/output fetches do not match.')
+      elif isinstance(input_fetches, tuple):
+        return tuple(current_fetches)
+      else:
+        return current_fetches
+
+  def _get_op_control_flow_context(self, op):
+    """Returns the control flow of the given op.
+
+    Args:
+      op: tf.Operation for which the control flow context is requested.
+    Returns:
+      op_control_flow_context: which the is control flow context of the given
+      op. If the operation type is LoopExit, returns the outer control flow
+      context.
+    """
+    # pylint: disable=protected-access
+    op_control_flow_context = op._control_flow_context
+    # pylint: enable=protected-access
+    if control_flow_util.IsLoopExit(op):
+      op_control_flow_context = op_control_flow_context.outer_context
+    return op_control_flow_context
+
+  def _trace_execution(self, graph,
+                       tensor_fetches,
+                       op_fetches=None,
+                       on_tpu=True):
+    """Commong tracing function for both CPU and TPUs.
+
+    The caller function should set _device_type, _num_replicas,
+    _num_replicas_per_host, _num_hosts and _replica_id before calling
+    _trace_execution.
+
+
+    Args:
+      graph: the graph of Ops executed on the TPU.
+      tensor_fetches: a (list,tuple,or a single object) of tensor fetches
+        returned by model_fn given to session.run. Function must be provided
+        with as least one tensor to fetch.
+      op_fetches: A list of op fetches returned by model_fn given to
+        session.run. op_fetches and tensor_fetches are used to determine the
+        nodes that will be executed. Can be None.
+      on_tpu: True if executing on TPU.
+
+    Returns:
+      tensor_fetches: an exact copy of tensor_fetches that has additional
+                      dependencies.
+    Raises:
+      RuntimeError: If tensor_fetches is None or empty.
+    """
+    def _cast_unsupported_dtypes(tensor):
+      """Casts tensor to a supported type."""
+
+      if tensor.dtype.__eq__(dtypes.int64):
+        # outside-compilation doesn't support int64 input yet.
+        return math_ops.cast(tensor, dtypes.int32)
+      if tensor.dtype.__eq__(dtypes.bfloat16) or tensor.dtype.__eq__(
+          dtypes.float16):
+        # Since host can't handle bf16, convert tensor to f32.
+        return math_ops.cast(tensor, dtypes.float32)
+      return tensor
+
+    TensorTracer.check_device_type(self._device_type)
+    # Check in_tensor_fetches, and op_fetches and convert them to lists.
+    processed_t_fetches = self._process_tensor_fetches(tensor_fetches)
+    op_fetches = self._process_op_fetches(op_fetches)
+    all_fetches = op_fetches + [tensor.op for tensor in processed_t_fetches]
+
+    # Filter the set of ops that will be executed, and topological sort.
+    (exec_op_set, succeed, sorted_or_cycle) = self._pre_tracing(graph,
+                                                                all_fetches)
+
+    tensor_fetch_set = set(processed_t_fetches)
+    tracing_ops = []
+
+    # pylint: disable=protected-access
+    current_control_flow_context = graph._get_control_flow_context()
+    # pylint: enable=protected-access
+
+    # Trace ops only if they are in the execution path.
+    for op in exec_op_set:
+      for i in range(len(op.outputs)):
+        out_tensor = op.outputs[i]
+        tensor_name = out_tensor.name
+        if tensor_name not in self._traced_tensorname_to_cache_idx_map:
+          continue
+        # Create the list of consumers before calling _preprocess_traced_tensor.
+        # Otherwise, adding control input below, will introduce a cycle in the
+        # graph.
+        consumers = out_tensor.consumers()
+        # Not all consumers may be in the exec path. Filter out the consumers
+        # to keep the graph simpler.
+        consumers = [cop for cop in consumers if cop in exec_op_set]
+
+        # If there is no consumer of the tensor, there is no need to trace it;
+        # unless the tensor itself is one of the fetches.
+        is_a_fetched_tensor = out_tensor in tensor_fetch_set
+        if (not consumers) and (not is_a_fetched_tensor):
+          continue
+
+        op_control_flow_context = self._get_op_control_flow_context(op)
+        # pylint: disable=protected-access
+        graph._set_control_flow_context(op_control_flow_context)
+        # pylint: enable=protected-access
+        processed_out_tensor = self._preprocess_traced_tensor(out_tensor)
+
+        if on_tpu:
+          processed_out_tensor = _cast_unsupported_dtypes(processed_out_tensor)
+
+        if self._use_tensor_values_cache():
+          cache_idx = self._traced_tensorname_to_cache_idx_map[tensor_name]
+          trace_op = self._save_tensor_value_to_cache_op(graph,
+                                                         cache_idx,
+                                                         processed_out_tensor)
+        elif on_tpu:
+          trace_op = tpu.outside_compilation(
+              self._make_tensor_trace_fun(tensor_name), processed_out_tensor)
+        else:
+          trace_fun = self._make_tensor_trace_fun(tensor_name)
+          trace_op = trace_fun(processed_out_tensor)
+
+        if is_a_fetched_tensor:
+          tracing_ops.append(trace_op)
+          continue
+        # Add it to all consumers, as some consumers may not be executed if they
+        # are in a control flow.
+        for consumer_op in consumers:
+          # pylint: disable=protected-access
+          consumer_op._add_control_input(trace_op)
+          # pylint: enable=protected-access
+
+    # pylint: disable=protected-access
+    graph._set_control_flow_context(current_control_flow_context)
+    # pylint: enable=protected-access
+    if tracing_ops:
+      # If we are tracing a fetched tensor, their dependency is stored in
+      # tracing_ops.
+      processed_t_fetches = control_flow_ops.tuple(processed_t_fetches,
+                                                   control_inputs=tracing_ops)
+    if self._use_tensor_values_cache():
+      processed_t_fetches = self._flush_tensor_values_cache(graph,
+                                                            processed_t_fetches,
+                                                            op_fetches,
+                                                            on_tpu=on_tpu)
+    self._post_tracing(succeed, sorted_or_cycle)
+    # processed_t_fetches is a list at this point. Convert it to the same
+    # format as given in tensor_fetches.
+    return self._convert_fetches_to_input_format(tensor_fetches,
+                                                 processed_t_fetches)
+
+  def trace_tpu(self, graph,
+                tensor_fetches,
+                op_fetches=None,
+                num_replicas=None,
+                num_replicas_per_host=None,
+                num_hosts=None):
+    """Traces the tensors generated by TPU Ops in a TF graph.
+
+    Args:
+      graph: the graph of Ops executed on the TPU.
+      tensor_fetches: a (list,tuple,or a single object) of tensor fetches
+        returned by model_fn given to session.run. Function must be provided
+        with as least one tensor to fetch.
+      op_fetches: A list of op fetches returned by model_fn given to
+        session.run. op_fetches and tensor_fetches are used to determine the
+        nodes that will be executed. Can be None.
+      num_replicas: number of replicas used on the TPU.
+      num_replicas_per_host: number of replicas per TPU host.
+      num_hosts: total number of TPU hosts.
+
+    Returns:
+      tensor_fetches: an exact copy of tensor_fetches that has additional
+                      dependencies.
+    Raises:
+      RuntimeError: If num_replicas_per_host > 8.
+      RuntimeError: If tensor_fetches is None or empty.
+    """
+
+    if graph in TensorTracer._traced_graphs:
+      logging.warning('Graph is already rewritten with tensor tracer, ignoring '
+                      'multiple calls.')
+      return tensor_fetches
+    else:
+      TensorTracer._traced_graphs.add(graph)
+    self._device_type = _DEVICE_TYPE_TPU
+    self._num_replicas = num_replicas
+    self._num_replicas_per_host = num_replicas_per_host
+    self._num_hosts = num_hosts
+    if self._num_replicas is not None:
+      if self._num_replicas_per_host is None:
+        self._num_replicas_per_host = 8
+      if self._num_hosts is None:
+        self._num_hosts = num_replicas // self._num_replicas_per_host + \
+            (num_replicas % self._num_replicas_per_host > 0)
+
+    if self._num_replicas_per_host > 8:
+      # Checks for the assumption in _generate_flush_cache_op().
+      raise RuntimeError('num_replicas_per_host (%d) is '
+                         'greater than 8'%self._num_replicas_per_host)
+    if self._graph_dump_path:
+      graph_io.write_graph(graph, self._graph_dump_path,
+                           'graph_before_tt.pbtxt')
+    with graph.as_default():
+      self._add_replica_id_to_graph()
+      tensor_fetches = self._trace_execution(graph, tensor_fetches, op_fetches,
+                                             on_tpu=True)
+    if self._graph_dump_path:
+      graph_io.write_graph(graph, self._graph_dump_path,
+                           'graph_after_tt.pbtxt')
+    return tensor_fetches
+
+  def trace_cpu(self, graph, tensor_fetches, op_fetches=None):
+    """Traces the tensors generated by CPU Ops in a TF graph.
+
+    Args:
+      graph: the graph of Ops executed on the CPU.
+      tensor_fetches: a (list,tuple,or a single object) of tensor fetches
+        returned by model_fn given to session.run. Function must be provided
+        with as least one tensor to fetch.
+      op_fetches: A list of op fetches returned by model_fn given to
+        session.run. op_fetches and tensor_fetches are used to determine the
+        nodes that will be executed. Can be None.
+
+    Returns:
+      tensor_fetches: an exact copy of tensor_fetches that has additional
+                      dependencies.
+    Raises:
+      RuntimeError: If tensor_fetches is None or empty.
+    """
+
+    if graph in TensorTracer._traced_graphs:
+      logging.warning('Graph is already rewritten with tensor tracer, ignoring '
+                      'multiple calls.')
+      return tensor_fetches
+    else:
+      TensorTracer._traced_graphs.add(graph)
+
+    self._device_type = _DEVICE_TYPE_CPU
+    self._num_replicas = 1
+    self._num_replicas_per_host = 1
+    self._num_hosts = 1
+    self._replica_id = 0
+    if self._graph_dump_path:
+      graph_io.write_graph(graph, self._graph_dump_path,
+                           'graph_before_tt.pbtxt')
+    with graph.as_default():
+      tensor_fetches = self._trace_execution(graph, tensor_fetches, op_fetches,
+                                             on_tpu=False)
+    if self._graph_dump_path:
+      graph_io.write_graph(graph, self._graph_dump_path,
+                           'graph_after_tt.pbtxt')
+    return tensor_fetches
+
+
diff --git a/tensorflow/python/tpu/topology.py b/tensorflow/python/tpu/topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..00ee21e694d15d2e795b0b35289e1e116b9e76cf
--- /dev/null
+++ b/tensorflow/python/tpu/topology.py
@@ -0,0 +1,220 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ======================================
+"""Defines the `Topology` class, that describes a TPU fabric topology."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.core.protobuf.tpu import topology_pb2
+
+
+def _tpu_device_name(job, task, device):
+  """Returns the device name for the TPU `device` on `task` of `job`."""
+  if job is None:
+    return "/task:%d/device:TPU:%d" % (task, device)
+  else:
+    return "/job:%s/task:%d/device:TPU:%d" % (job, task, device)
+
+
+def _tpu_host_device_name(job, task):
+  """Returns the device name for the CPU device on `task` of `job`."""
+  if job is None:
+    return "/task:%d/device:CPU:0" % task
+  else:
+    return "/job:%s/task:%d/device:CPU:0" % (job, task)
+
+
+class Topology(object):
+  """Describes a set of TPU devices.
+
+  Represents both the shape of the physical mesh, and the mapping between
+  TensorFlow TPU devices to physical mesh coordinates.
+  """
+
+  def __init__(self, serialized=None, mesh_shape=None, device_coordinates=None):
+    """Builds a Topology object.
+
+    If `serialized` is not `None`, the topology is parsed from `serialized` and
+    the other arguments are ignored. Otherwise, the topology is computed from
+    `mesh_shape` and `device_coordinates`.
+
+    Args:
+      serialized: A serialized `TopologyProto`, or `None`. If not `None`, the
+        serialized proto is parsed to discover the topology.
+      mesh_shape: A sequence of 3 positive integers, or `None`. If not `None`,
+        the shape of the TPU topology, in number of cores. Ignored if
+        `serialized` is not `None`.
+      device_coordinates: A rank 3 numpy array that describes the mapping from
+        TensorFlow TPU devices to TPU fabric coordinates, or `None`. Ignored
+        if `serialized is not `None`.
+
+    Raises:
+      ValueError: If `serialized` does not describe a well-formed topology.
+      ValueError: If `serialized` is `None` and `mesh_shape` is not a sequence
+        of 3 positive integers.
+      ValueError: If `serialized` is `None` and `device_coordinates` is not a
+        rank 3 numpy int32 array that describes a valid coordinate mapping.
+    """
+
+    self._serialized = serialized
+
+    if serialized:
+      self._parse_topology(serialized)
+    else:
+      self._mesh_shape = np.asarray(mesh_shape, dtype=np.int32)
+      self._device_coordinates = np.asarray(device_coordinates, np.int32)
+      if len(self._mesh_shape) != 3 or any(self._mesh_shape < 1):
+        raise ValueError("`mesh_shape` must be a sequence of 3 positive "
+                         "entries; got {}".format(self._mesh_shape))
+
+      if (len(self._device_coordinates.shape) != 3 or
+          self._device_coordinates.shape[2] != len(self._mesh_shape)):
+        raise ValueError("`device_coordinates` must be a rank 3 int32 array "
+                         "with minor dimension equal to the mesh shape rank")
+
+    self._topology_tasks, self._topology_devices = self._invert_topology()
+
+  def _parse_topology(self, serialized):
+    """Parses a serialized `TopologyProto` into `self`."""
+    proto = topology_pb2.TopologyProto()
+    proto.ParseFromString(serialized)
+
+    self._mesh_shape = np.array(proto.mesh_shape, dtype=np.int32)
+    if len(self._mesh_shape) != 3 or any(self._mesh_shape < 1):
+      raise ValueError("`mesh_shape` must be a vector of size 3 with positive "
+                       "entries; got {}".format(self._mesh_shape))
+
+    if proto.num_tasks < 0:
+      raise ValueError("`num_tasks` must be >= 0; got {}".format(
+          proto.num_tasks))
+    if proto.num_tpu_devices_per_task < 0:
+      raise ValueError("`num_tpu_devices_per_task` must be >= 0; got {}".format(
+          proto.num_tpu_devices_per_task))
+
+    expected_coordinates_size = (
+        proto.num_tasks * proto.num_tpu_devices_per_task * len(
+            proto.mesh_shape))
+    if len(proto.device_coordinates) != expected_coordinates_size:
+      raise ValueError("`device_coordinates` must have shape num_tasks ({}) * "
+                       "num_tpu_devices_per_task ({}) * len(mesh_shape) ({}); "
+                       "got shape {}".format(proto.num_tasks,
+                                             proto.num_tpu_devices_per_task,
+                                             proto.mesh_shape,
+                                             len(proto.device_coordinates)))
+
+    coords = np.array(proto.device_coordinates, dtype=np.int32)
+    if any(coords < 0):
+      raise ValueError("`device_coordinates` must be >= 0")
+    coords = coords.reshape((proto.num_tasks, proto.num_tpu_devices_per_task,
+                             len(proto.mesh_shape)))
+    self._device_coordinates = coords
+
+  def _invert_topology(self):
+    """Inverts a [task,device,axis] topology to [x,y,z] -> task/device maps."""
+    tasks = np.full(list(self.mesh_shape), -1, dtype=np.int32)
+    devices = np.full(list(self.mesh_shape), -1, dtype=np.int32)
+    for task in xrange(self.device_coordinates.shape[0]):
+      for device in xrange(self.device_coordinates.shape[1]):
+        x, y, z = self.device_coordinates[task, device, :]
+        tasks[x, y, z] = task
+        devices[x, y, z] = device
+    return tasks, devices
+
+  @property
+  def mesh_shape(self):
+    """A rank 1 int32 array describing the shape of the TPU topology."""
+    return self._mesh_shape
+
+  @property
+  def mesh_rank(self):
+    """Returns the number of dimensions in the mesh."""
+    return len(self._mesh_shape)
+
+  @property
+  def device_coordinates(self):
+    """Describes the mapping from TPU devices to topology coordinates.
+
+    Returns:
+      A rank 3 int32 array with shape `[tasks, devices, axis]`.
+      `tasks` is the number of tasks in the TPU cluster, `devices` is the number
+      of TPU devices per task, and `axis` is the number of axes in the TPU
+      cluster topology. Each entry gives the `axis`-th coordinate in the
+      topology of a task/device pair. TPU topologies are 3-dimensional, with
+      dimensions `(x, y, core number)`.
+    """
+    return self._device_coordinates
+
+  def task_ordinal_at_coordinates(self, device_coordinates):
+    """Returns the TensorFlow task number attached to `device_coordinates`.
+
+    Args:
+      device_coordinates: An integer sequence describing a device's physical
+        coordinates in the TPU fabric.
+
+    Returns:
+      Returns the TensorFlow task number that contains the TPU device with those
+      physical coordinates.
+    """
+    return self._topology_tasks[tuple(device_coordinates)]
+
+  def tpu_device_ordinal_at_coordinates(self, device_coordinates):
+    """Returns the TensorFlow device number at `device_coordinates`.
+
+    Args:
+      device_coordinates: An integer sequence describing a device's physical
+        coordinates in the TPU fabric.
+
+    Returns:
+      Returns the TensorFlow device number within the task corresponding to
+      attached to the device with those physical coordinates.
+    """
+    return self._topology_devices[tuple(device_coordinates)]
+
+  def cpu_device_name_at_coordinates(self, device_coordinates, job=None):
+    """Returns the CPU device attached to a logical core."""
+    return _tpu_host_device_name(
+        job, self._topology_tasks[tuple(device_coordinates)])
+
+  def tpu_device_name_at_coordinates(self, device_coordinates, job=None):
+    """Returns the name of the TPU device assigned to a logical core."""
+    return _tpu_device_name(job,
+                            self._topology_tasks[tuple(device_coordinates)],
+                            self._topology_devices[tuple(device_coordinates)])
+
+  @property
+  def num_tasks(self):
+    """Returns the number of TensorFlow tasks in the TPU slice."""
+    return self._device_coordinates.shape[0]
+
+  @property
+  def num_tpus_per_task(self):
+    """Returns the number of TPU devices per task in the TPU slice."""
+    return self._device_coordinates.shape[1]
+
+  def serialized(self):
+    """Returns the serialized form of the topology."""
+    if self._serialized is None:
+      proto = topology_pb2.TopologyProto()
+      proto.mesh_shape[:] = list(self._mesh_shape)
+      proto.num_tasks = self._device_coordinates.shape[0]
+      proto.num_tpu_devices_per_task = self._device_coordinates.shape[1]
+      proto.device_coordinates.extend(list(self._device_coordinates.flatten()))
+      self._serialized = proto.SerializeToString()
+
+    return self._serialized
diff --git a/tensorflow/contrib/tpu/python/tpu/topology_test.py b/tensorflow/python/tpu/topology_test.py
similarity index 96%
rename from tensorflow/contrib/tpu/python/tpu/topology_test.py
rename to tensorflow/python/tpu/topology_test.py
index fafe3254d84551d3d7ed8a9d3346849411714f97..9e1b7de859703f6859017d57cd73b4fbda7237b4 100644
--- a/tensorflow/contrib/tpu/python/tpu/topology_test.py
+++ b/tensorflow/python/tpu/topology_test.py
@@ -19,9 +19,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.tpu.python.tpu import topology
-
 from tensorflow.python.platform import test
+from tensorflow.python.tpu import topology
 
 
 class TopologyTest(test.TestCase):
diff --git a/tensorflow/python/tpu/tpu.py b/tensorflow/python/tpu/tpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..02489a9b10e81d818f6d02d72e9e8c13988805c3
--- /dev/null
+++ b/tensorflow/python/tpu/tpu.py
@@ -0,0 +1,1576 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ======================================
+
+"""Library of TPU helper functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.core.framework import attr_value_pb2
+from tensorflow.core.protobuf.tpu import dynamic_padding_pb2 as dynamic_padding
+from tensorflow.python.compat import compat as api_compat
+from tensorflow.python.framework import device as pydev
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.tpu import tpu_function
+from tensorflow.python.tpu import xla
+from tensorflow.python.tpu.ops import tpu_ops
+from tensorflow.python.util import compat
+from tensorflow.python.util import nest
+
+
+# Operations that indicate some error in the users graph, e.g. a placeholder
+# that's introduced outside of the infeed.
+_BLACKLISTED_OPS = set([
+    "Placeholder",
+])
+
+# XLA doesn't currently support reading of intermediate tensors, thus some ops
+# are not supported.
+_UNSUPPORTED_OPS = set([
+    "AudioSummary",
+    "AudioSummaryV2",
+    "HistogramSummary",
+    "ImageSummary",
+    "MergeSummary",
+    "Print",
+    "ScalarSummary",
+    "TensorSummary",
+    "TensorSummaryV2",
+    ])
+
+_MAX_WARNING_LINES = 5
+
+_TPU_REPLICATE_ATTR = "_tpu_replicate"
+_TPU_COMPILATION_STATUS_ATTR = "_tpu_compilation_status"
+_OUTSIDE_COMPILATION_ATTR = "_xla_outside_compilation"
+
+
+def _tpu_system_device_name(job):
+  """Returns the device name for the TPU_SYSTEM device of `job`."""
+  if job is None:
+    return "/device:TPU_SYSTEM:0"
+  else:
+    return "/job:%s/device:TPU_SYSTEM:0" % job
+
+
+def initialize_system(embedding_config=None, job=None):
+  """Initializes a distributed TPU system for use with TensorFlow.
+
+  Args:
+    embedding_config: If not None, a `TPUEmbeddingConfiguration` proto
+      describing the desired configuration of the hardware embedding lookup
+      tables. If embedding_config is None, no hardware embeddings can be used.
+    job: The job (the XXX in TensorFlow device specification /job:XXX) that
+      contains the TPU devices that will be initialized. If job=None it is
+      assumed there is only one job in the TensorFlow flock, and an error will
+      be returned if this assumption does not hold.
+  Returns:
+    A serialized `TopologyProto` that describes the TPU system. Note:
+      the topology must be evaluated using `Session.run` before it can be used.
+  """
+  config_string = ("" if embedding_config is None else
+                   embedding_config.SerializeToString())
+  with ops.device(_tpu_system_device_name(job)):
+    return tpu_ops.configure_distributed_tpu(embedding_config=config_string)
+
+
+def shutdown_system(job=None):
+  """Shuts down a running a distributed TPU system."""
+  with ops.device(_tpu_system_device_name(job)):
+    shutdown_distributed_tpu = tpu_ops.shutdown_distributed_tpu()
+  return shutdown_distributed_tpu
+
+
+def core(num):
+  """Returns the device name for a core in a replicated TPU computation.
+
+  Args:
+    num: the virtual core number within each replica to which operators should
+    be assigned.
+  Returns:
+    A device name, suitable for passing to `tf.device()`.
+  """
+  return "device:TPU_REPLICATED_CORE:{}".format(num)
+
+
+class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
+  """A `ControlFlowContext` for nodes inside a TPU computation.
+
+  The primary role of `TPUReplicateContext` is to mark operators inside a
+  tpu.replicate() computation with the attribute "_tpu_replicate=XYZ", where XYZ
+  is a unique name.
+
+  We use a `ControlFlowContext` to perform the annotation since it integrates
+  with Tensorflow constructs like ResourceVariables. For example, if a
+  `ResourceVariable` is constructed inside a tpu.replicate() block, the
+  `ResourceVariable` implementation can use
+  `with ops.control_dependencies(None)` to build the variable's definition
+  outside the replicated computation.
+  """
+
+  def __init__(self, name, num_replicas, pivot):
+    """Builds a new TPUReplicateContext.
+
+    Args:
+      name: a unique name for the context, used to populate the `_tpu_replicate`
+        attribute.
+      num_replicas: an integer that gives the number of replicas for the
+        computation.
+      pivot: a pivot node. Nodes in the TPUReplicateContext that do not have any
+        inputs will have a control dependency on the pivot node. This ensures
+        that nodes are correctly included in any enclosing control flow
+        contexts.
+    """
+    super(TPUReplicateContext, self).__init__()
+    self._num_replicas = num_replicas
+    self._outer_device_function_stack = None
+    self._oc_dev_fn_stack = None
+    self._outside_compilation_cluster = None
+    self._outside_compilation_counter = 0
+    self._in_gradient_colocation = None
+    self._gradient_colocation_stack = []
+    self._host_compute_core = []
+    self._name = name
+    self._name_as_bytes = compat.as_bytes(name)
+    self._unsupported_ops = []
+    self._pivot = pivot
+    self._replicated_vars = {}
+
+  def get_replicated_var_handle(self, name, vars_):
+    """Returns a variable handle for replicated TPU variable 'var'.
+
+    This is a method used by an experimental replicated variable implementation
+    and is not intended as a public API.
+
+    Args:
+      name: The common name of the variable.
+      vars_: The replicated TPU variables.
+
+    Returns:
+      The handle of the TPU replicated input node.
+    """
+    handle = self._replicated_vars.get(name)
+    if handle is not None:
+      return handle
+
+    # Builds a TPUReplicatedInput node for the variable, if one does not already
+    # exist. The TPUReplicatedInput node must belong to the enclosing
+    # control-flow scope of the TPUReplicateContext.
+    # TODO(phawkins): consider changing the contract of the TPU encapsulation
+    # so the TPUReplicatedInput nodes go inside the TPUReplicateContext scope
+    # instead.
+
+    # pylint: disable=protected-access
+    graph = ops.get_default_graph()
+    saved_context = graph._get_control_flow_context()
+    graph._set_control_flow_context(self.outer_context)
+    handle = tpu_ops.tpu_replicated_input(
+        [v.handle for v in vars_], name=name + "/handle")
+    graph._set_control_flow_context(saved_context)
+    # pylint: enable=protected-access
+    self._replicated_vars[name] = handle
+    return handle
+
+  def report_unsupported_operations(self):
+    if self._unsupported_ops:
+      op_str = "\n".join(["  %s (%s)" % (op.type, op.name)
+                          for op in self._unsupported_ops[:_MAX_WARNING_LINES]])
+      logging.warning("%d unsupported operations found: \n%s",
+                      len(self._unsupported_ops), op_str)
+      if len(self._unsupported_ops) > _MAX_WARNING_LINES:
+        logging.warning("... and %d more" %
+                        (len(self._unsupported_ops) - _MAX_WARNING_LINES))
+
+  def EnterGradientColocation(self, op, gradient_uid):
+    if op is not None:
+      self._gradient_colocation_stack.append(op)
+      if not self._outside_compilation_cluster:
+        try:
+          outside_attr = op.get_attr(_OUTSIDE_COMPILATION_ATTR)
+          if self._in_gradient_colocation:
+            raise NotImplementedError(
+                "Cannot nest gradient colocation operations outside compilation"
+            )
+          if gradient_uid == "__unsupported__":
+            raise NotImplementedError(
+                "No gradient_uid calling gradient within outside_compilation")
+          # When we take the gradient of an op X in an outside_compilation
+          # cluster C in a forward computation we would like to put the ops
+          # corresponding to the gradient of X into a new outside_compilation
+          # cluster C'. However, if we take the gradient of X twice, the second
+          # one should get yet another new outside_compilation cluster C''.
+          #
+          # The mechanism we adopt is to use a 'root_cluster' which is the
+          # cluster that X was in before we took gradients, and a 'gradient_uid'
+          # which is different for every invocation of gradients, and put the
+          # gradient of X in cluster 'root_cluster.gradient_uid'.
+          #
+          # When taking a gradient of a gradient, some ops will be colocated
+          # with Op in the forward pass (e.g., cluster root_cluster) and some in
+          # the backward pass (e.g., cluster root_cluster.initial_gradient_uid).
+          # We need all of the grad-of-grad ops to be in the same cluster to
+          # avoid cyclic dependencies between clusters. We adopt a heuristic
+          # that puts any op clustered with root_cluster.<xxx> in
+          # root_cluster.gradient_uid, even if xxx was initial_gradient_uid.
+          self._in_gradient_colocation = op
+          parts = outside_attr.split(".")
+          cluster = parts[0] + "." + gradient_uid
+          self._EnterOutsideCompilationScope(cluster=cluster)
+        except ValueError:
+          # The attr was not present: do nothing.
+          pass
+
+  def ExitGradientColocation(self, op, gradient_uid):
+    if op is not None:
+      if not self._gradient_colocation_stack:
+        raise errors.InternalError(
+            op.node_def, op,
+            "Badly nested gradient colocation: empty stack when popping Op " +
+            op.name)
+      last_op = self._gradient_colocation_stack.pop()
+      if op is last_op:
+        if op is self._in_gradient_colocation:
+          self._in_gradient_colocation = None
+          self._ExitOutsideCompilationScope()
+      else:
+        raise errors.InternalError(
+            op.node_def, op, "Badly nested gradient colocation, expected " +
+            last_op + ", got " + op.name)
+
+  def _EnterOutsideCompilationScope(self, cluster=None):
+
+    class FakeOp(object):
+      """A helper class to determine the current device.
+
+      Supports only the type and device set/get methods needed to run the
+      graph's _apply_device_function method.
+      """
+
+      def __init__(self):
+        self._device = ""
+
+      @property
+      def type(self):
+        return "FakeOp"
+
+      @property
+      def device(self):
+        return self._device
+
+      def _set_device(self, device):
+        if isinstance(device, pydev.DeviceSpec):
+          self._device = device.to_string()
+        else:
+          self._device = device
+
+    if self._outside_compilation_cluster:
+      raise NotImplementedError("Cannot nest outside_compilation clusters")
+    if cluster:
+      self._outside_compilation_cluster = cluster
+    else:
+      self._outside_compilation_cluster = str(self._outside_compilation_counter)
+      self._outside_compilation_counter += 1
+    graph = ops.get_default_graph()
+    fake_op = FakeOp()
+    graph._apply_device_functions(fake_op)  # pylint: disable=protected-access
+    device = pydev.DeviceSpec.from_string(fake_op.device)
+    if (device.device_type == "TPU_REPLICATED_CORE" and
+        device.device_index is not None):
+      self._host_compute_core.append(self._outside_compilation_cluster + ":" +
+                                     str(device.device_index))
+    self._oc_dev_fn_stack = graph._device_function_stack  # pylint: disable=protected-access
+    graph._device_function_stack = self._outer_device_function_stack  # pylint: disable=protected-access
+
+  def _ExitOutsideCompilationScope(self):
+    if not self._outside_compilation_cluster:
+      raise NotImplementedError(
+          "Attempted to exit outside_compilation scope when not in scope")
+    self._outside_compilation_cluster = None
+    graph = ops.get_default_graph()
+    graph._device_function_stack = self._oc_dev_fn_stack  # pylint: disable=protected-access
+
+  def Enter(self):
+    if not self._outer_device_function_stack:
+      # Capture the device function stack at the time of first entry
+      # since that is the stack that will be used outside_compilation.
+      graph = ops.get_default_graph()
+      # pylint: disable=protected-access
+      self._outer_device_function_stack = graph._device_function_stack.copy()
+      # pylint: enable=protected-access
+    super(TPUReplicateContext, self).Enter()
+
+  def HostComputeCore(self):
+    return self._host_compute_core
+
+  def _RemoveExternalControlEdges(self, op):
+    """Remove any external control dependency on this op."""
+    internal_control_inputs = []
+    external_control_inputs = []
+    for x in op.control_inputs:
+      # pylint: disable=protected-access
+      is_internal_op = False
+      ctxt = x._get_control_flow_context()
+      while ctxt is not None:
+        if ctxt == self:
+          is_internal_op = True
+          break
+        ctxt = ctxt._outer_context
+      if is_internal_op:
+        internal_control_inputs.append(x)
+      else:
+        external_control_inputs.append(x)
+      # pylint: enable=protected-access
+    # pylint: disable=protected-access
+    op._remove_all_control_inputs()
+    op._add_control_inputs(internal_control_inputs)
+    # pylint: enable=protected-access
+    return internal_control_inputs, external_control_inputs
+
+  def AddOp(self, op):
+    # pylint: disable=protected-access
+    if op.type in _BLACKLISTED_OPS:
+      logging.error("Operation of type %s (%s) is not supported on the TPU. "
+                    "Execution will fail if this op is used in the graph. " %
+                    (op.type, op.name))
+
+    if op.type in _UNSUPPORTED_OPS:
+      self._unsupported_ops.append(op)
+
+    if any(x.dtype._is_ref_dtype for x in op.inputs):
+      raise NotImplementedError(
+          "Non-resource Variables are not supported inside TPU computations "
+          "(operator name: %s)" % op.name)
+    if _TPU_REPLICATE_ATTR in op.node_def.attr:
+      raise ValueError("TPU computations cannot be nested")
+    op._set_attr(_TPU_REPLICATE_ATTR,
+                 attr_value_pb2.AttrValue(s=self._name_as_bytes))
+    if self._outside_compilation_cluster:
+      op._set_attr(
+          _OUTSIDE_COMPILATION_ATTR,
+          attr_value_pb2.AttrValue(
+              s=compat.as_bytes(self._outside_compilation_cluster)))
+    if self._num_replicas > 1 or not self._outside_compilation_cluster:
+      # Prevent feeding or fetching anything that is being compiled,
+      # and any replicated outside_compilation Op.
+      op.graph.prevent_feeding(op)
+      op.graph.prevent_fetching(op)
+
+    # Remove any control edges from outer control flow contexts. These may cause
+    # mismatched frame errors.
+    (internal_control_inputs,
+     external_control_inputs) = self._RemoveExternalControlEdges(op)
+
+    if not op.inputs:
+      # Add a control edge from the control pivot to this op.
+      if not internal_control_inputs:
+        # pylint: disable=protected-access
+        op._add_control_input(self.GetControlPivot())
+        # pylint: enable=protected-access
+    else:
+      for index in xrange(len(op.inputs)):
+        x = op.inputs[index]
+        real_x = self.AddValue(x)
+        if real_x != x:
+          op._update_input(index, real_x)  # pylint: disable=protected-access
+
+    if external_control_inputs:
+      # Use an identity to pull control inputs as data inputs. Note that we
+      # ignore ops which don't have outputs. TODO(phawkins): fix that.
+      with ops.control_dependencies(None):
+        self.Enter()
+        external_control_inputs = [
+            array_ops.identity(x.outputs[0]).op
+            for x in external_control_inputs
+            if x.outputs
+        ]
+        self.Exit()
+      # pylint: disable=protected-access
+      op._add_control_inputs(external_control_inputs)
+      # pylint: enable=protected-access
+
+    # Mark op's outputs as seen by this context and any outer contexts.
+    output_names = [x.name for x in op.outputs]
+    context = self
+    while context is not None:
+      # pylint: disable=protected-access
+      context._values.update(output_names)
+      context = context._outer_context
+      # pylint: enable=protected-access
+
+    if self._outer_context:
+      self._outer_context.AddInnerOp(op)
+
+  def AddValue(self, val):
+    """Add `val` to the current context and its outer context recursively."""
+    if val.name in self._values:
+      # Use the real value if it comes from outer context.
+      result = self._external_values.get(val.name)
+      return val if result is None else result
+
+    result = val
+    self._values.add(val.name)
+    if self._outer_context:
+      result = self._outer_context.AddValue(val)
+      self._values.add(result.name)
+
+    self._external_values[val.name] = result
+
+    return result
+
+  def AddInnerOp(self, op):
+    self.AddOp(op)
+    if self._outer_context:
+      self._outer_context.AddInnerOp(op)
+
+  @property
+  def grad_state(self):
+    # Define the gradient loop state associated with the TPUReplicateContext to
+    # be None as the TPUReplicateContext does not get nested nor does the
+    # grad_state outside the TPUReplicateContext affect the graph inside so the
+    # grad_state should be as if this is the top-level gradient state.
+    return None
+
+  @property
+  def back_prop(self):
+    """Forwards to the enclosing while context, if any."""
+    if self.GetWhileContext():
+      return self.GetWhileContext().back_prop
+    return False
+
+  def GetControlPivot(self):
+    return self._pivot
+
+
+def outside_compilation(computation, *args, **kwargs):
+  """Builds part of a computation outside any current TPU replicate scope.
+
+  Args:
+    computation: A Python function that builds the computation to
+      place on the host.
+    *args: the positional arguments for the computation.
+    **kwargs: the keyword arguments for the computation.
+
+  Returns:
+    The Tensors returned by computation.
+  """
+  args = [] if args is None else args
+  graph = ops.get_default_graph()
+
+  # If we are in a TPUReplicateContext, signal that we are now
+  # outside_compilation
+  initial_context = graph._get_control_flow_context()  # pylint: disable=protected-access
+  context = initial_context
+  while context:
+    if isinstance(context, TPUReplicateContext):
+      context._EnterOutsideCompilationScope()  # pylint: disable=protected-access
+    context = context.outer_context
+
+  retval = computation(*args, **kwargs)
+
+  # If we are in a TPUReplicateContext, signal that we are no longer
+  # outside_compilation
+  final_context = graph._get_control_flow_context()  # pylint: disable=protected-access
+  if initial_context is not final_context:
+    raise NotImplementedError(
+        "Control-flow context cannot be different at start and end of an "
+        "outside_compilation scope")
+  context = initial_context
+  while context:
+    if isinstance(context, TPUReplicateContext):
+      context._ExitOutsideCompilationScope()  # pylint: disable=protected-access
+    context = context.outer_context
+
+  return retval
+
+
+def replicate(computation,
+              inputs=None,
+              infeed_queue=None,
+              device_assignment=None,
+              name=None,
+              maximum_shapes=None):
+  """Builds a graph operator that runs a replicated TPU computation.
+
+  Args:
+    computation: A Python function that builds the computation to replicate.
+    inputs: A list of lists of input tensors or `None` (equivalent to
+      `[[]]`), indexed by `[replica_num][input_num]`. All replicas must
+      have the same number of inputs. Each input can be a nested structure
+      containing values that are convertible to tensors. Note that passing an
+      N-dimension list of compatible values will result in a N-dimention list of
+      scalar tensors rather than a single Rank-N tensors. If you need different
+      behavior, convert part of inputs to tensors with `tf.convert_to_tensor`.
+    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
+      of arguments as inputs to computation.
+    device_assignment: If not `None`, a `DeviceAssignment` describing the
+      mapping between logical cores in the computation with physical cores in
+      the TPU topology. Uses a default device assignment if `None`. The
+      `DeviceAssignment` may be omitted if each replica of the computation uses
+      only one core, and there is either only one replica, or the number of
+      replicas is equal to the number of cores in the TPU system.
+    name: (Deprecated) Does nothing.
+    maximum_shapes: A nested structure of tf.TensorShape representing the shape
+      to which the respective component of each input element in each replica
+      should be padded. Any unknown dimensions (e.g. tf.Dimension(None) in a
+      tf.TensorShape or -1 in a tensor-like object) will be padded to the
+      maximum size of that dimension over all replicas. Note that if the input
+      dimension is already static, we won't do padding on it and we require the
+      maximum_shapes to have the same value or None on that dimension. The
+      structure of `maximum_shapes` needs to be the same as `inputs[0]`.
+  Returns:
+    A list of outputs, indexed by `[replica_num]` each output can be a nested
+    structure same as what computation() returns with a few exceptions.
+
+    Exceptions include:
+      1) None output: a NoOp would be returned which control-depends on
+         computation.
+      2) Single value output: A tuple containing the value would be returned.
+      3) Operation-only outputs: a NoOp would be returned which
+         control-depends on computation.
+      TODO(b/121383831): Investigate into removing these special cases.
+
+  Raises:
+    ValueError: If all replicas do not have equal numbers of input tensors.
+    ValueError: If the number of inputs per replica does not match
+      the number of formal parameters to `computation`.
+    ValueError: If the static `inputs` dimensions don't match with the values
+      given in `maximum_shapes`.
+    ValueError: If the structure of inputs per replica does not match
+      the structure of `maximum_shapes`.
+  """
+  return split_compile_and_replicate(
+      computation,
+      inputs,
+      infeed_queue,
+      device_assignment,
+      name,
+      maximum_shapes=maximum_shapes)[1]
+
+
+def _pad_all_input(inputs, padded_shapes):
+  """Pad all input tensors given padded_shapes.
+
+  The real shape tensors will be concatenated with the padded original inputs.
+
+  Args:
+    inputs: The original inputs.
+    padded_shapes: A list of padded shapes for each input.
+
+  Returns:
+    The padded inputs and a PaddingMap list which maps the padded input
+    dimension to the real shape argument index.
+  """
+  input_shape_tensors = []
+  for core_idx, inputs_per_core in enumerate(inputs):
+    for idx, input_tensor in enumerate(inputs_per_core):
+      if core_idx == 0:
+        input_shape_tensors.append([])
+      input_shape_tensors[idx].append(array_ops.shape(input_tensor))
+
+  maximum_shapes = []
+  for shapes_per_input in input_shape_tensors:
+    maximum_shapes.append(
+        math_ops.reduce_max(array_ops.stack(shapes_per_input), axis=0))
+
+  padded_inputs = []
+  real_shapes = []
+  padding_maps = []
+  for core_idx, inputs_per_core in enumerate(inputs):
+    padded_inputs.append([])
+    real_shapes.append([])
+    real_shape_idx = len(inputs_per_core) - 1
+    for idx, input_tensor in enumerate(inputs_per_core):
+      input_shape_tensor = input_shape_tensors[idx][core_idx]
+      input_shape = input_tensor.get_shape()
+      padded_shape = padded_shapes[idx]
+
+      # The static shape of inputs should be compatible with the given padded
+      # shapes.
+      input_shape.assert_is_compatible_with(padded_shape)
+
+      if input_shape.is_fully_defined():
+        # Do nothing if the shape of the whole tensor is already static.
+        padded_inputs[core_idx].append(input_tensor)
+      else:
+        # Only pad the non static shape dimension.
+        for i, s in enumerate(input_shape):
+          if s.value is None:
+            if core_idx == 0:
+              real_shape_idx += 1
+              padding_map = dynamic_padding.PaddingMap()
+              padding_map.arg_index = idx
+              padding_map.shape_index = i
+              padding_map.padding_arg_index = real_shape_idx
+              padding_maps.append(padding_map)
+            real_shapes[core_idx].append(
+                math_ops.cast(input_shape_tensor[i], dtypes.uint32))
+
+        paddings = []
+        for i, s in enumerate(padded_shape):
+          if input_shape[i].value:
+            # Don't pad if input shape is already static.
+            padding = [0, 0]
+          else:
+            if s.value:
+              # Pad to the given maximum value.
+              padding = [0, s.value - input_shape_tensor[i]]
+            else:
+              # If maximum value is not given, then pad to the maximum dimension
+              # among all the cores.
+              padding = [0, maximum_shapes[idx][i] - input_shape_tensor[i]]
+          paddings.append(padding)
+
+        padded_input = array_ops.pad(input_tensor, paddings)
+        padded_inputs[core_idx].append(padded_input)
+
+  num_replicas = len(padded_inputs)
+  for i in range(num_replicas):
+    padded_inputs[i].extend(real_shapes[i])
+
+  return padded_inputs, padding_maps
+
+
+def split_compile_and_replicate(computation,
+                                inputs=None,
+                                infeed_queue=None,
+                                device_assignment=None,
+                                name=None,
+                                use_tpu=True,
+                                maximum_shapes=None):
+  """Builds graph operators that runs compilation and replicated computation.
+
+  This is a lower level interface than replicate that returns a separate compile
+  and execute output tensor. In the generated graph the compile op feeds into
+  the execute op and no additional compilation is incurred when running the
+  compile op before the execute op. The compile op returns additional
+  information about the compilation but does not return the compiled program.
+
+  Args:
+    computation: A Python function that builds the computation to replicate.
+    inputs: A list of lists of input tensors or `None` (equivalent to
+      `[[]]`), indexed by `[replica_num][input_num]`. All replicas must
+      have the same number of inputs. Each input can be a nested structure
+      containing values that are convertible to tensors. Note that passing an
+      N-dimension list of compatible values will result in a N-dimention list of
+      scalar tensors rather than a single Rank-N tensors. If you need different
+      behavior, convert part of inputs to tensors with `tf.convert_to_tensor`.
+    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
+      of arguments as inputs to computation.
+    device_assignment: If not `None`, a `DeviceAssignment` describing the
+      mapping between logical cores in the computation with physical cores in
+      the TPU topology. Uses a default device assignment if `None`. The
+      `DeviceAssignment` may be omitted if each replica of the computation uses
+      only one core, and there is either only one replica, or the number of
+      replicas is equal to the number of cores in the TPU system.
+    name: (Deprecated) Does nothing.
+    use_tpu: When false, the input `computation` is executed on the XLA CPU/GPU
+      backends. Currently, only supports a default placement (computation is
+      placed on GPU if one is available, and on CPU if not).
+    maximum_shapes: A nested structure of tf.TensorShape representing the shape
+      to which the respective component of each input element in each replica
+      should be padded. Any unknown dimensions (e.g. tf.Dimension(None) in a
+      tf.TensorShape or -1 in a tensor-like object) will be padded to the
+      maximum size of that dimension over all replicas. Note that if the input
+      dimension is already static, we won't do padding on it and we require the
+      maximum_shapes to have the same value or None on that dimension. The
+      structure of `maximum_shapes` needs to be the same as `inputs[0]`.
+
+  Returns:
+    A list of lists with the first list corresponding to the compile op and the
+    second a list of output tensors, indexed by `[replica_num][output_num]`.
+  Raises:
+    ValueError: If all replicas do not have equal numbers of input tensors.
+    ValueError: If the number of inputs per replica does not match
+      the number of formal parameters to `computation`.
+    ValueError: If the static `inputs` dimensions don't match with the values
+      given in `maximum_shapes`.
+    ValueError: If the structure of inputs per replica does not match
+      the structure of `maximum_shapes`.
+  """
+  del name
+  inputs = [[]] if inputs is None else inputs
+
+  metadata_kwargs = {}
+  if device_assignment is not None:
+    # Turn the Numpy array into a flattened list so we can pass it as an
+    # operator attribute.
+    metadata_kwargs = {
+        "topology":
+            device_assignment.topology.serialized(),
+        "device_assignment":
+            device_assignment.core_assignment.flatten().tolist()
+    }
+    # TODO(phawkins): remove this case after the forward compatibility window
+    # expires on 2018-10-5.
+    if api_compat.forward_compatible(2018, 10, 5):
+      metadata_kwargs["num_cores_per_replica"] = (
+          device_assignment.num_cores_per_replica)
+    else:
+      metadata_kwargs["computation_shape"] = [
+          device_assignment.num_cores_per_replica
+      ]
+
+  if ((not isinstance(inputs, list)) or
+      any(not isinstance(inp, (list, tuple)) for inp in inputs)):
+    raise TypeError("tpu.replicate() inputs must be a list of lists/tuples")
+
+  num_replicas = len(inputs)
+
+  # No replicas? Nothing to do.
+  if num_replicas == 0:
+    return []
+
+  # Checks all replicas have the same structure.
+  for i in xrange(1, num_replicas):
+    nest.assert_same_structure(inputs[0], inputs[i])
+
+  # Flatten inputs.
+  flat_inputs = [
+      nest.flatten(per_replica_input) for per_replica_input in inputs
+  ]
+  # Converts inputs to Tensors.
+  flat_inputs = [[ops.convert_to_tensor(x) for x in inp] for inp in flat_inputs]
+
+  # Verifies that all replicas have matching numbers and types of inputs
+  flat_input_types = [x.dtype for x in flat_inputs[0]]
+  input_arity = len(inputs[0])
+  flat_input_arity = len(flat_input_types)
+  for i in range(num_replicas):
+    if len(inputs[i]) != input_arity:
+      raise ValueError("Replicas must have the same number of inputs. "
+                       "Replica 0 had {} inputs, replica {} had {} "
+                       "inputs.".format(input_arity, i, len(inputs[i])))
+
+    types = [x.dtype for x in flat_inputs[i]]
+    if types != flat_input_types:
+      raise ValueError("Replicas must have matching input types. Replica 0 had "
+                       "input types {}, replica {} had input types {}".format(
+                           flat_input_types, i, types))
+
+  arg_error = xla.check_function_argument_count(
+      computation, input_arity, infeed_queue)
+  if arg_error is not None:
+    if infeed_queue is None:
+      raise TypeError(
+          "Supplied computation cannot be called with the specified inputs. "
+          "You specified %d inputs: %s, but the computation needs %s" % (
+              input_arity, str([i.name for i in inputs[0]]), arg_error))
+    else:
+      raise TypeError(
+          "Supplied computation cannot be called with the specified inputs. "
+          "You specified %d inputs: %s and %d additional inputs from infeed,"
+          " but the computation needs %s" % (input_arity, str(
+              [i.name
+               for i in inputs[0]]), infeed_queue.number_of_tuple_elements,
+                                             arg_error))
+
+  if maximum_shapes:
+    if infeed_queue:
+      raise ValueError(
+          "Dynamic input shapes are not supported with infeed queues")
+
+    # Make sure maximum_shapes has the same structure as inputs.
+    nest.assert_same_structure(inputs[0], maximum_shapes, check_types=False)
+
+    # Flatten padded shapes.
+    flat_maximum_shapes = nest.flatten(maximum_shapes)
+    flat_maximum_shapes = [
+        tensor_shape.TensorShape(s) for s in flat_maximum_shapes
+    ]
+
+    flat_inputs, padding_maps = _pad_all_input(flat_inputs, flat_maximum_shapes)
+
+    serialized_padding_maps = []
+    for padding_map in padding_maps:
+      serialized_padding_maps.append(padding_map.SerializeToString())
+    metadata_kwargs["padding_map"] = serialized_padding_maps
+
+  metadata_kwargs["step_marker_location"] = getattr(
+      computation, "step_marker_location", "STEP_MARK_AT_ENTRY")
+
+  graph = ops.get_default_graph()
+
+  # Fan-in: Builds a TPUReplicatedInput node for each input.
+  flat_replicated_inputs = []
+  for i in range(0, len(flat_inputs[0])):
+    replicas = [flat_inputs[replica][i] for replica in xrange(num_replicas)]
+    flat_replicated_inputs.append(
+        tpu_ops.tpu_replicated_input(replicas, name="input{}".format(i)))
+
+  cluster_name = graph.unique_name("cluster")
+  pivot = control_flow_ops.no_op(name=cluster_name + "/pivot")
+  context = TPUReplicateContext(
+      name=cluster_name, num_replicas=num_replicas, pivot=pivot)
+  try:
+    context.Enter()
+
+    metadata = tpu_ops.tpu_replicate_metadata(
+        num_replicas=num_replicas, use_tpu=use_tpu, **metadata_kwargs)
+
+    with tpu_function.tpu_shard_context(
+        num_replicas), ops.control_dependencies([metadata]):
+
+      # Add identity ops so even unused inputs are "consumed" by the
+      # computation. This is to avoid orphaned TPUReplicatedInput nodes.
+      # TODO(phawkins): consider instead pruning unused TPUReplicatedInput
+      # and eliding trivial TPUReplicatedInput/TPUReplicatedOutput pairs.
+      flat_replicated_inputs = [
+          array_ops.identity(x, name="replicated_input_{}".format(i))
+          for i, x in enumerate(flat_replicated_inputs)
+      ]
+      for i in flat_replicated_inputs:
+        # pylint: disable=protected-access
+        # Add an attribute to the identity node so that they could be removed in
+        # encapsulate TPU computation pass if unused. However we don't remove
+        # inputs when dynamic padding is enabled.
+        # TODO(rxsang): Use other ways except argument index in padding_map so
+        # outside compilation can work with dynamic padding correctly.
+        if maximum_shapes is None:
+          i.op._set_attr("_tpu_input_identity",
+                         attr_value_pb2.AttrValue(b=True))
+        # pylint: enable=protected-access
+
+      # Unflatten the computation inputs to match original input structure.
+      computation_inputs = nest.pack_sequence_as(
+          structure=inputs[0],
+          flat_sequence=flat_replicated_inputs[:flat_input_arity])
+
+      # If there is an infeed queue, adds the dequeued values to the
+      # computation's inputs.
+      if infeed_queue is not None:
+        infeed_queue.set_number_of_shards(num_replicas)
+        for t in infeed_queue.generate_dequeue_op():
+          computation_inputs.append(t)
+
+      # Only resource variables work inside a TPU computation, so turn on
+      # resource variables for the computation.
+      # TODO(phawkins): consider removing this code. It will
+      # be less confusing to clients if they knowingly choose to use resource
+      # variables.
+      # Partitioned variables is not supported (b/112311320).
+      vscope = variable_scope.get_variable_scope()
+      saved_use_resource = vscope.use_resource
+      saved_custom_getter = vscope.custom_getter
+
+      def custom_getter(getter, name, *args, **kwargs):
+        """Variables on TPU have a few restrictions."""
+        partitioner = kwargs["partitioner"]
+        if partitioner is not None:
+          kwargs["partitioner"] = None
+          logging.warning(
+              "Partitioned variables are not supported on TPU. Got "
+              "`partitioner` that is {} for variable {}. "
+              "Setting `partitioner` to `None`."
+              .format(partitioner, name))
+        if saved_custom_getter is None:
+          return getter(name, *args, **kwargs)
+        else:
+          return saved_custom_getter(getter, name, *args, **kwargs)
+
+      vscope.set_use_resource(True)
+      vscope.set_custom_getter(custom_getter)
+
+      outputs = computation(*computation_inputs)
+
+      vscope.set_use_resource(saved_use_resource)
+      vscope.set_custom_getter(saved_custom_getter)
+
+    outputs_is_flat = xla.is_flat(outputs)
+    if outputs_is_flat:
+      output_tensors, control_deps = _postprocess_flat_outputs(outputs)
+    else:
+      output_tensors, control_deps = _postprocess_non_flat_outputs(outputs)
+
+    # tensor_tracer imports tpu.py. Local import to tensor_tracer to avoid
+    # import-cycle
+    # pylint: disable=g-import-not-at-top
+    from tensorflow.python.tpu import tensor_tracer
+    # pylint: enable=g-import-not-at-top
+    if tensor_tracer.TensorTracer.is_enabled():
+      tt = tensor_tracer.TensorTracer()
+      output_tensors = tt.trace_tpu(ops.get_default_graph(),
+                                    output_tensors, control_deps,
+                                    num_replicas)
+
+    context.ExitResult(output_tensors)
+  finally:
+    context.report_unsupported_operations()
+    context.Exit()
+    host_compute_core = context.HostComputeCore()
+
+  if host_compute_core:
+    attr_value = attr_value_pb2.AttrValue()
+    attr_value.list.s.extend([compat.as_bytes(x) for x in host_compute_core])
+    metadata._set_attr("host_compute_core", attr_value)  # pylint: disable=protected-access
+
+  with ops.control_dependencies([metadata]):
+    if use_tpu:
+      compile_status = tpu_ops.tpu_compilation_result()
+      op = compile_status.op
+      attr_value = attr_value_pb2.AttrValue(s=compat.as_bytes(cluster_name))
+      op._set_attr(_TPU_COMPILATION_STATUS_ATTR, attr_value)  # pylint: disable=protected-access
+    else:
+      compile_status = control_flow_ops.no_op(name="compilation_status")
+
+  if not output_tensors:
+    # Returns a list of NoOps dependent on the replication Op, indexed by
+    # [replica_num].
+    return [
+        compile_status,
+        [
+            control_flow_ops.group(control_deps, name="shard_%d" % i)
+            for i in range(num_replicas)
+        ]
+    ]
+
+  # Fan-out: Builds a TPUReplicatedOutput node for each output.
+  replicated_outputs = [[] for i in xrange(num_replicas)]
+  for i, t in enumerate(output_tensors):
+    # Fan-out: Builds a TPUReplicatedOutput node for each output.
+    ys = tpu_ops.tpu_replicated_output(
+        t, num_replicas, name="output{}".format(i))
+
+    # Wraps the outputs in identity operators so the names of any possible
+    # `fetch` nodes are preserved by the replication rewrite.
+    with ops.control_dependencies(control_deps):
+      for replica in xrange(num_replicas):
+        replicated_outputs[replica].append(
+            array_ops.identity(
+                ys[replica], name="output_%d_shard_%d" % (i, replica)))
+
+  if not outputs_is_flat:
+    replicated_outputs = [
+        nest.pack_sequence_as(outputs, replica_outs)
+        for replica_outs in replicated_outputs
+    ]
+
+  return [compile_status, replicated_outputs]
+
+
+def _postprocess_flat_outputs(outputs):
+  """Validates non-flat outputs, add backs device assignments and other attrs.
+
+  Args:
+    outputs: Output from `computation` inside `tpu.rewrite`.
+
+  Returns:
+    Tensors and Operations extracted from outputs.
+  """
+  # Following code segment is to preserve legacy behavior. Previously we only
+  # supported flat outputs and thus for consistency it was nice to convert even
+  # single element into a tuple. But now that we support arbitrary output
+  # structure, this is no longer necessary.
+  # TODO(b/121383831): Migrate all legacy use cases and delete this special
+  # case.
+  # If the computation returns `None`, make it an empty tuple.
+  if outputs is None:
+    outputs = tuple()
+  # If the computation only returned one value, makes it a tuple.
+  if not isinstance(outputs, collections.Sequence):
+    outputs = (outputs,)
+
+  # Append `no_op` here so that fetching any return value of this function
+  # will trigger TPUExecute node.
+  outputs += (control_flow_ops.no_op(),)
+  try:
+    with ops.device(core(0)):
+      outputs = [
+          o if isinstance(o, ops.Operation) else ops.convert_to_tensor(o)
+          for o in outputs
+      ]
+  except Exception as e:
+    raise ValueError(
+        "TPU function return values must all either be Operations or "
+        "convertible to Tensors. Got '%s'" % str(e))
+
+  # Separates the returned Operations and Tensors.
+  output_operations = [o for o in outputs if isinstance(o, ops.Operation)]
+  output_tensors = [o for o in outputs if not isinstance(o, ops.Operation)]
+
+  if outputs != output_tensors + output_operations:
+    raise ValueError(
+        "TPU functions must return zero-or more Tensor values followed by "
+        "zero or more Operations.")
+
+  # Wraps outputs in Identity ops. Otherwise a replicated input copied
+  # straight to an output would bypass the replicate(). This would be bad
+  # because the TPUReplicatedInput/TPUReplicatedOutput operator would not
+  # be rewritten away, leading to a runtime error.
+  # TODO(phawkins): extend the rewrite to elide these nodes instead.
+  new_output_tensors = []
+  for t in output_tensors:
+    with ops.device(t.device if t.device else core(0)):
+      o = array_ops.identity(t)
+      # pylint: disable=protected-access
+      o.op._set_attr("_tpu_output_identity", attr_value_pb2.AttrValue(b=True))
+      # pylint: enable=protected-access
+      new_output_tensors.append(o)
+  return new_output_tensors, output_operations
+
+
+def _postprocess_non_flat_outputs(outputs):
+  """Validates non-flat outputs, add backs device assignments and other attrs.
+
+  Args:
+    outputs: Output from `computation` inside `tpu.rewrite`.
+
+  Returns:
+    Tensors extracted from outputs and an empty list because Operations are not
+    allowed in non-flat outputs..
+  """
+
+  # Flatten output items.
+  flat_outputs = nest.flatten(outputs)
+
+  # Convert all non-Operation outputs to Tensors.
+  for i, o in enumerate(flat_outputs):
+    if isinstance(o, ops.Operation):
+      raise ValueError(
+          "tpu.rewrite does not support Operation as return value in non-flat "
+          "output structure. You can set returned Operations as control "
+          "dependencies of returned Tensors so Operations are triggered when "
+          'Tensors are evaluated. Operation found: "%s"' % o.name)
+
+    try:
+      o = ops.convert_to_tensor(o)
+    except Exception as e:
+      raise ValueError(
+          "TPU function return values must all either be Operations or "
+          'convertible to Tensors. Got error: "%s"' % str(e))
+
+    # Wraps outputs in Identity ops. Otherwise a replicated input copied
+    # straight to an output would bypass the replicate(). This would be bad
+    # because the TPUReplicatedInput/TPUReplicatedOutput operator would not
+    # be rewritten away, leading to a runtime error.
+    # TODO(phawkins): extend the rewrite to elide these nodes instead.
+    with ops.device(core(0)):
+      o = array_ops.identity(o)
+      # pylint: disable=protected-access
+      o.op._set_attr("_tpu_output_identity", attr_value_pb2.AttrValue(b=True))
+      # pylint: enable=protected-access
+      flat_outputs[i] = array_ops.identity(o)
+
+  # All flat_outputs are Tensors, and no Operations.
+  return flat_outputs, []
+
+
+def split_compile_and_shard(computation,
+                            inputs=None,
+                            num_shards=1,
+                            input_shard_axes=None,
+                            outputs_from_all_shards=True,
+                            output_shard_axes=None,
+                            infeed_queue=None,
+                            device_assignment=None,
+                            name=None):
+  """Shards `computation` for parallel execution.
+
+  `inputs` must be a list of Tensors or None (equivalent to an empty list), each
+  of which has a corresponding split axis (from `input_shard_axes`). Each input
+  is split into `num_shards` pieces along the corresponding axis, and
+  computation is applied to each shard in parallel.
+
+  Tensors are broadcast to all shards if they are lexically captured by
+  `computation`. e.g.,
+
+  x = tf.constant(7)
+  def computation():
+    return x + 3
+  ... = shard(computation, ...)
+
+  If `outputs_from_all_shards` is true, the outputs from all shards of
+  `computation` are concatenated back together along their `output_shards_axes`.
+  Otherwise, each output is taken from an arbitrary shard.
+
+  Inputs and outputs of the computation must be at least rank-1 Tensors.
+
+  Args:
+    computation: A Python function that builds a computation to apply to each
+      shard of the input.
+    inputs: A list of input tensors or None (equivalent to an empty list). Each
+      input tensor has a corresponding shard axes, given by `input_shard_axes`,
+      which must have size divisible by `num_shards`.
+    num_shards: The number of shards.
+    input_shard_axes: A list of dimensions along which to shard `inputs`, or
+      `None`. `None` means "shard all inputs along dimension 0". If not `None`,
+      there must be one dimension per input.
+    outputs_from_all_shards: Boolean or list of boolean. For each output, if
+      `True`, outputs from all shards are concatenated along the corresponding
+      `output_shard_axes` entry. Otherwise, each output is taken
+      from an arbitrary shard. If the argument is a boolean, the argument's
+      value is used for each output.
+    output_shard_axes: A list of dimensions along which to concatenate the
+      outputs of `computation`, or `None`. `None` means "concatenate all outputs
+      along dimension 0". If not `None`, there must be one dimension per output.
+      Ignored if `outputs_from_all_shards` is False.
+    infeed_queue: If not `None`, the `InfeedQueue` to use to augment the inputs
+      of `computation`.
+    device_assignment: If not `None`, a `DeviceAssignment` describing the
+      mapping between logical cores in the computation with physical cores in
+      the TPU topology. Uses a default device assignment if `None`. The
+      `DeviceAssignment` may be omitted if each shard of the computation uses
+      only one core, and there is either only one shard, or the number of shards
+      is equal to the number of cores in the TPU system.
+    name: (Deprecated) Does nothing.
+  Returns:
+    A tuple of (compile op, [output tensors]).
+  Raises:
+    ValueError: If num_shards <= 0
+    ValueError: If len(input_shard_axes) != len(inputs)
+    ValueError: If len(output_shard_axes) != len(outputs from `computation`)
+  """
+  # TODO(phawkins): consider adding support for broadcasting Tensors passed as
+  # inputs.
+
+  if num_shards <= 0:
+    raise ValueError("num_shards must be a positive integer.")
+
+  inputs = [] if inputs is None else inputs
+  if not isinstance(inputs, list):
+    raise TypeError("tpu.shard()'s inputs must be a list of Tensors or None.")
+
+  # Converts inputs to Tensors.
+  inputs = [ops.convert_to_tensor(x) for x in inputs]
+
+  if input_shard_axes is None:
+    input_shard_axes = [0] * len(inputs)
+  if len(inputs) != len(input_shard_axes):
+    raise ValueError("Length of input_shard_axes must be equal to the number "
+                     "of inputs.")
+
+  if inputs:
+    # Splits the `inputs` along the corresponding `input_shard_axes`, giving
+    # lists with layout [input][shard]
+    split_inputs = [
+        array_ops.split(x, num_shards, axis=axis)
+        for (axis, x) in zip(input_shard_axes, inputs)]
+
+    # Transposes the input lists to have layout [shard][input]
+    transposed_inputs = [list(i) for i in zip(*split_inputs)]
+  else:
+    transposed_inputs = [[]] * num_shards
+
+  compile_op, outputs = split_compile_and_replicate(
+      computation,
+      transposed_inputs,
+      infeed_queue=infeed_queue,
+      device_assignment=device_assignment,
+      name=name)
+
+  # There must be at least one shard since num_shards > 0.
+  # TODO(b/36647078) remove disable when pylint bug is fixed.
+  # pylint: disable=indexing-exception
+  if isinstance(outputs[0], ops.Operation):
+    # pylint: enable=indexing-exception
+    # There were no outputs from the computation and replicate returned a list
+    # of NoOps with control dependencies on the computation. Return the first
+    # one so it can be used as a control dependency or fetch node.
+    # TODO(b/36647078) remove disable when pylint bug is fixed.
+    # pylint: disable=indexing-exception
+    return compile_op, [outputs[0]]
+    # pylint: enable=indexing-exception
+
+  # TODO(b/36647078) remove disable when pylint bug is fixed.
+  # pylint: disable=indexing-exception
+  num_outputs = len(outputs[0])
+  # pylint: enable=indexing-exception
+
+  if output_shard_axes is None:
+    output_shard_axes = [0] * num_outputs
+  if num_outputs != len(output_shard_axes):
+    raise ValueError("Length of output_shard_axes must be equal to the number "
+                     "of outputs.")
+
+  if isinstance(outputs_from_all_shards, bool):
+    outputs_from_all_shards = [outputs_from_all_shards] * num_outputs
+
+  if num_outputs != len(outputs_from_all_shards):
+    raise ValueError("Length of outputs_from_all_shards must be equal to the "
+                     "number of outputs.")
+
+  results = []
+  for (axis, all_shards, x) in zip(output_shard_axes, outputs_from_all_shards,
+                                   zip(*outputs)):
+    if all_shards:
+      # Concatenate all of the outputs together (use stack for scalars).
+      shape = x[0].shape
+      is_scalar = shape is not None and (shape.ndims == 0)
+      results.append((array_ops.stack(list(x)) if is_scalar
+                      else array_ops.concat(list(x), axis=axis)))
+    else:
+      # TODO(phawkins): use a smarter policy, e.g., round-robin across shards.
+      results.append(x[0])
+
+  return compile_op, results
+
+
+def shard(computation,
+          inputs=None,
+          num_shards=1,
+          input_shard_axes=None,
+          outputs_from_all_shards=True,
+          output_shard_axes=None,
+          infeed_queue=None,
+          device_assignment=None,
+          name=None):
+  """Shards `computation` for parallel execution.
+
+  `inputs` must be a list of Tensors or None (equivalent to an empty list), each
+  of which has a corresponding split axis (from `input_shard_axes`). Each input
+  is split into `num_shards` pieces along the corresponding axis, and
+  computation is applied to each shard in parallel.
+
+  Tensors are broadcast to all shards if they are lexically captured by
+  `computation`. e.g.,
+
+  x = tf.constant(7)
+  def computation():
+    return x + 3
+  ... = shard(computation, ...)
+
+  TODO(phawkins): consider adding support for broadcasting Tensors passed
+  as inputs.
+
+  If `outputs_from_all_shards` is true, the outputs from all shards of
+  `computation` are concatenated back together along their `output_shards_axes`.
+  Otherwise, each output is taken from an arbitrary shard.
+
+  Inputs and outputs of the computation must be at least rank-1 Tensors.
+
+  Args:
+    computation: A Python function that builds a computation to apply to each
+      shard of the input.
+    inputs: A list of input tensors or None (equivalent to an empty list). Each
+      input tensor has a corresponding shard axes, given by `input_shard_axes`,
+      which must have size divisible by `num_shards`.
+    num_shards: The number of shards.
+    input_shard_axes: A list of dimensions along which to shard `inputs`, or
+      `None`. `None` means "shard all inputs along dimension 0". If not `None`,
+      there must be one dimension per input.
+    outputs_from_all_shards: Boolean or list of boolean. For each output, if
+      `True`, outputs from all shards are concatenated along the corresponding
+      `output_shard_axes` entry. Otherwise, each output is taken
+      from an arbitrary shard. If the argument is a boolean, the argument's
+      value is used for each output.
+    output_shard_axes: A list of dimensions along which to concatenate the
+      outputs of `computation`, or `None`. `None` means "concatenate all outputs
+      along dimension 0". If not `None`, there must be one dimension per output.
+      Ignored if `outputs_from_all_shards` is False.
+    infeed_queue: If not `None`, the `InfeedQueue` to use to augment the inputs
+      of `computation`.
+    device_assignment: If not `None`, a `DeviceAssignment` describing the
+      mapping between logical cores in the computation with physical cores in
+      the TPU topology. Uses a default device assignment if `None`. The
+      `DeviceAssignment` may be omitted if each shard of the computation uses
+      only one core, and there is either only one shard, or the number of shards
+      is equal to the number of cores in the TPU system.
+    name: (Deprecated) Does nothing.
+  Returns:
+    A list of output tensors.
+  Raises:
+    ValueError: If num_shards <= 0
+    ValueError: If len(input_shard_axes) != len(inputs)
+    ValueError: If len(output_shard_axes) != len(outputs from `computation`)
+  """
+  return split_compile_and_shard(
+      computation,
+      inputs=inputs,
+      num_shards=num_shards,
+      input_shard_axes=input_shard_axes,
+      outputs_from_all_shards=outputs_from_all_shards,
+      output_shard_axes=output_shard_axes,
+      infeed_queue=infeed_queue,
+      device_assignment=device_assignment,
+      name=name)[1]
+
+
+def batch_parallel(computation,
+                   inputs=None,
+                   num_shards=1,
+                   infeed_queue=None,
+                   device_assignment=None,
+                   name=None):
+  """Shards `computation` along the batch dimension for parallel execution.
+
+  Convenience wrapper around shard().
+
+  `inputs` must be a list of Tensors or None (equivalent to an empty list).
+  Each input is split into `num_shards` pieces along the 0-th dimension, and
+  computation is applied to each shard in parallel.
+
+  Tensors are broadcast to all shards if they are lexically captured by
+  `computation`. e.g.,
+
+  x = tf.constant(7)
+  def computation():
+    return x + 3
+  ... = shard(computation, ...)
+
+  The outputs from all shards are concatenated back together along their 0-th
+  dimension.
+
+  Inputs and outputs of the computation must be at least rank-1 Tensors.
+
+  Args:
+    computation: A Python function that builds a computation to apply to each
+      shard of the input.
+    inputs: A list of input tensors or None (equivalent to an empty list). The
+      0-th dimension of each Tensor must have size divisible by `num_shards`.
+    num_shards: The number of shards.
+    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
+      of arguments as inputs to `computation`.
+    device_assignment: If not `None`, a `DeviceAssignment` describing the
+      mapping between logical cores in the computation with physical cores in
+      the TPU topology. Uses a default device assignment if `None`. The
+      `DeviceAssignment` may be omitted if each shard of the computation uses
+      only one core, and there is either only one shard, or the number of shards
+      is equal to the number of cores in the TPU system.
+    name: (Deprecated) Does nothing.
+  Returns:
+    A list of output tensors.
+  Raises:
+    ValueError: If `num_shards <= 0`
+  """
+  return shard(
+      computation,
+      inputs,
+      num_shards=num_shards,
+      infeed_queue=infeed_queue,
+      device_assignment=device_assignment,
+      name=name)
+
+
+def rewrite(computation,
+            inputs=None,
+            infeed_queue=None,
+            device_assignment=None,
+            name=None):
+  """Rewrites `computation` for execution on a TPU system.
+
+  Args:
+    computation: A Python function that builds a computation to apply to the
+      input. If the function takes n inputs, 'inputs' should be a list of n
+      tensors.
+
+      `computation` may return a list of operations and tensors. Tensors must
+      come before operations in the returned list.  The return value of
+      `rewrite` is a list of tensors corresponding to the tensors from the
+      output of `computation`.
+
+      All `Operation`s constructed during `computation` will be executed when
+      evaluating any of the returned output tensors, not just the ones returned.
+    inputs: A list of input tensors or `None` (equivalent to an empty list).
+      Each input can be a nested structure containing values that are
+      convertible to tensors. Note that passing an N-dimension list of
+      compatible values will result in a N-dimention list of scalar tensors
+      rather than a single Rank-N tensors. If you need different behavior,
+      convert part of inputs to tensors with `tf.convert_to_tensor`.
+    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
+      of arguments as inputs to `computation`.
+    device_assignment: if not `None`, a `DeviceAssignment` describing the
+      mapping between logical cores in the computation with physical cores in
+      the TPU topology. May be omitted for a single-core computation, in which
+      case the core attached to task 0, TPU device 0 is used.
+    name: (Deprecated) Does nothing.
+  Returns:
+    Same data structure as if computation(*inputs) is called directly with some
+    exceptions for correctness. Exceptions include:
+      1) None output: a NoOp would be returned which control-depends on
+         computation.
+      2) Single value output: A tuple containing the value would be returned.
+      3) Operation-only outputs: a NoOp would be returned which
+         control-depends on computation.
+      TODO(b/121383831): Investigate into removing these special cases.
+  """
+  # TODO(b/36647078) remove disable when pylint bug is fixed.
+  # pylint: disable=indexing-exception
+  return replicate(
+      computation,
+      None if inputs is None else [inputs],
+      infeed_queue=infeed_queue,
+      device_assignment=device_assignment,
+      name=name)[0]
+  # pylint: enable=indexing-exception
+
+  # Operations that indicate some error in the user's inference graph.
+_BLACKLISTED_INFERENCE_OPS = set([
+    "ReadVariableOp",
+    "AssignVariableOp",
+    "AssignAddVariableOp",
+    "AssignSubVariableOp",
+    "VarHandleOp",
+    "Variable",
+    "VariableV2",
+])
+
+
+def under_tpu_inference_context():
+  """Check if it is currently under `tpu.rewrite_for_inference()`."""
+  graph = ops.get_default_graph()
+
+  context = graph._get_control_flow_context()  # pylint: disable=protected-access
+  while context:
+    if isinstance(context, _TPUInferenceContext):
+      return True
+    context = context.outer_context
+
+  return False
+
+
+class _TPUInferenceContext(control_flow_ops.XLAControlFlowContext):
+  """A `ControlFlowContext` for nodes inside a TPU inference computation.
+
+  The primary role of `TPUReplicateContext` is to sanity check operators inside
+  a tpu.rewrite_for_inference() computation.
+  """
+
+  def __init__(self, name):
+    super(_TPUInferenceContext, self).__init__()
+    self._name = name
+
+  def AddOp(self, op):
+    self._AddOpInternal(op)
+
+  def _AddOpInternal(self, op):
+    # pylint: disable=protected-access
+    if op.type in _BLACKLISTED_INFERENCE_OPS:
+      raise NotImplementedError(
+          "Operation of type %s (%s) is not supported on the TPU for inference."
+          " Execution will fail if this op is used in the graph. Make sure your"
+          " variables are using variable_scope." % (op.type, op.name))
+    if self._outer_context:
+      self._outer_context.AddInnerOp(op)
+
+  def AddValue(self, val):
+    result = val
+    if self._outer_context:
+      result = self._outer_context.AddValue(val)
+    return result
+
+  def AddInnerOp(self, op):
+    self._AddOpInternal(op)
+
+  @property
+  def grad_state(self):
+    return None
+
+
+def validate_inference_rewrite_for_variables(graph):
+  """Validates whether rewrite_for_inference() 'worked' for variables.
+
+     The rewrite_for_inference() method is supposed to append GuaranteeConstOps
+     after ReadVariableOps, but this mechanism works only if you are using
+     tf.get_variable() to create and access variables in your tpu computation.
+     This validation method can be called immediately after calling
+     tpu.rewrite_for_inference() to check whether GuaranteeConstOps where added
+     to the graph.
+
+     Typical usages:
+       tpu.validate_inference_rewrite_for_variables(tf.get_default_graph())
+
+       tpu.validate_inference_rewrite_for_variables(sess.graph)
+
+  Args:
+    graph: The graph which needs to be validated.
+  Raises:
+    RuntimeError: if validation failed.
+  """
+  if not any(x.type == "GuaranteeConst" for x in graph.get_operations()):
+    raise RuntimeError(
+        "No GuaranteeConst ops found in the graph after running "
+        "tpu.rewrite_for_inference(...). Please check that you are using "
+        "tf.get_variable() to create and access variables in your tpu "
+        "computation.")
+
+
+def rewrite_for_inference(computation,
+                          inputs=None,
+                          infeed_queue=None,
+                          device_assignment=None,
+                          name=None):
+  """Rewrites `computation` for inference on a TPU system.
+
+     Other than 'rewriting' the computation to run on a TPU, if using variables
+     in your computation, it moves the ReadVariableOps outside the TPU
+     computation, and adds GuaranteeConst ops just after the ReadVariableOps.
+     This mechanism works only if you are using tf.get_variable() to create and
+     access variables in your tpu computation. You can validate whether this
+     worked, by calling validate_inference_rewrite_for_variables() method
+     immediately after this method to check whether GuaranteeConstOps where
+     added to the graph.
+
+  Args:
+    computation: A Python function that builds a computation to apply to the
+      input. If the function takes n inputs, 'inputs' should be a list of n
+      tensors. If the function returns m outputs, rewrite will return a list of
+      m tensors.
+    inputs: A list of input tensors or `None` (equivalent to an empty list).
+    infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
+      of arguments as inputs to `computation`.
+    device_assignment: if not `None`, a `DeviceAssignment` describing the
+      mapping between logical cores in the computation with physical cores in
+      the TPU topology. May be omitted for a single-core computation, in which
+      case the core attached to task 0, TPU device 0 is used.
+    name: The name of the operator.
+  Returns:
+    A list of output tensors.
+  """
+
+  def guarantee_const_getter(getter, name, *args, **kwargs):
+    with ops.control_dependencies(None):
+      return array_ops.guarantee_const(
+          getter(name, *args, **kwargs), name=name + "/GuaranteeConst")
+
+  def wrapped_computation(*args, **kwargs):
+    """Execute computation under `_TPUInferenceContext`."""
+    context = _TPUInferenceContext(
+        name=ops.get_default_graph().unique_name("rewrite_for_inference"))
+    try:
+      context.Enter()
+
+      vscope = variable_scope.get_variable_scope()
+      prev_custom_getter = vscope.custom_getter
+      prev_caching_device = vscope.caching_device
+      vscope.set_custom_getter(guarantee_const_getter)
+      vscope.set_caching_device(lambda op: op.device)
+
+      result = computation(*args, **kwargs)
+
+      vscope.set_custom_getter(prev_custom_getter)
+      vscope.set_caching_device(prev_caching_device)
+    finally:
+      context.Exit()
+    return result
+
+  # pylint: disable=undefined-variable
+  return rewrite(
+      wrapped_computation,
+      inputs=inputs,
+      infeed_queue=infeed_queue,
+      device_assignment=device_assignment,
+      name=name)
+  # pylint: enable=undefined-variable
diff --git a/tensorflow/python/tpu/tpu_config.py b/tensorflow/python/tpu/tpu_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9038a95221249b84ba0b64f66ec7fb24e1877a1
--- /dev/null
+++ b/tensorflow/python/tpu/tpu_config.py
@@ -0,0 +1,276 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===================================================================
+
+"""A RunConfig subclass with TPU support."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import json
+import os
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.estimator import run_config as run_config_lib
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.tpu import util as util_lib
+
+# pylint: disable=protected-access
+_TF_CONFIG_ENV = run_config_lib._TF_CONFIG_ENV
+_SERVICE_KEY = run_config_lib._SERVICE_KEY
+_TPU_WORKER_JOB_NAME = 'tpu_worker_job_name'
+# pylint: enable=protected-access
+
+
+class InputPipelineConfig(object):
+  r"""Please see the definition of these values in TPUConfig."""
+  PER_SHARD_V1 = 1
+  PER_HOST_V1 = 2
+  PER_HOST_V2 = 3
+  BROADCAST = 4
+
+
+class TPUConfig(
+    collections.namedtuple('TPUConfig', [
+        'iterations_per_loop',
+        'num_shards',
+        'num_cores_per_replica',
+        'per_host_input_for_training',
+        'tpu_job_name',
+        'initial_infeed_sleep_secs',
+        'input_partition_dims',
+    ])):
+  r"""TPU related configuration required by `TPUEstimator`.
+
+  Args:
+    iterations_per_loop: This is the number of train steps running in TPU
+      system before returning to CPU host for each `Session.run`. This means
+      global step is increased `iterations_per_loop` times in one `Session.run`.
+      It is recommended to be set as number of global steps for next checkpoint.
+    num_shards: (Deprecated, ignored by TPUEstimator).
+      The number of model replicas in the system. For non-model-parallelism
+      case, this number equals the total number of TPU cores. For
+      model-parallelism, the total number of TPU cores equals
+      num_cores_per_replica * num_shards.
+    num_cores_per_replica: Defaults to `None`, which disables model parallelism.
+      An integer which describes the number of TPU cores per model replica. This
+      is required by model-parallelism which enables partitioning
+      the model to multiple cores. Currently num_cores_per_replica must be
+      1, 2, 4, or 8.
+    per_host_input_for_training: If `True`, `PER_HOST_V1`, or `PER_HOST_V2`,
+      `input_fn` is invoked once on each host. With the per-core input pipeline
+      configuration, it is invoked once for each core.
+      With a global batch size `train_batch_size` in `TPUEstimator` constructor,
+      the batch size for each shard is `train_batch_size` // #hosts in the
+      `True` or `PER_HOST_V1` mode. In `PER_HOST_V2` mode, it is
+      `train_batch_size` // #cores. In `BROADCAST` mode, `input_fn` is only
+      invoked once on host 0 and the tensors are broadcasted to all other
+      replicas. The batch size equals to train_batch_size`. With the per-core
+      input pipeline configuration, the shard batch size is also
+      `train_batch_size` // #cores.
+      Note: per_host_input_for_training==PER_SHARD_V1 only supports mode.TRAIN.
+    tpu_job_name: The name of the TPU job. Typically, this name is auto-inferred
+      within TPUEstimator, however when using ClusterSpec propagation in more
+      esoteric cluster configurations, you may need to specify the job name as a
+      string.
+    initial_infeed_sleep_secs: The number of seconds the infeed thread should
+      wait before enqueueing the first batch. This helps avoid timeouts for
+      models that require a long compilation time.
+    input_partition_dims: A nested list to describe the partition dims
+      for all the tensors from input_fn(). The structure of
+      input_partition_dims must match the structure of `features` and
+      `labels` from input_fn(). The total number of partitions must match
+      `num_cores_per_replica`. For example, if input_fn() returns two tensors:
+      images with shape [N, H, W, C] and labels [N].
+      input_partition_dims = [[1, 2, 2, 1], None] will split the images to 4
+      pieces and feed into 4 TPU cores. labels tensor are directly broadcasted
+      to all the TPU cores since the partition dims is `None`.
+      Current limitations: This feature is only supported with the PER_HOST_V2
+      input mode.
+
+    Raises:
+      ValueError: If `num_cores_per_replica` is not 1, 2, 4, 8 or 16.
+  """
+
+  def __new__(cls,
+              iterations_per_loop=2,
+              num_shards=None,
+              num_cores_per_replica=None,
+              per_host_input_for_training=True,
+              tpu_job_name=None,
+              initial_infeed_sleep_secs=None,
+              input_partition_dims=None):
+
+    # Check iterations_per_loop.
+    util_lib.check_positive_integer(iterations_per_loop,
+                                    'TPUConfig iterations_per_loop')
+
+    # Check num_shards.
+    if num_shards is not None:
+      util_lib.check_positive_integer(num_shards, 'TPUConfig num_shards')
+
+    if input_partition_dims is not None:
+      if len(input_partition_dims) != 1 and len(input_partition_dims) != 2:
+        raise ValueError(
+            'input_partition_dims must be a list/tuple with one or two'
+            ' elements.')
+
+      if per_host_input_for_training is not InputPipelineConfig.PER_HOST_V2:
+        raise ValueError(
+            'input_partition_dims is only supported in PER_HOST_V2 mode.')
+
+      if num_cores_per_replica is None:
+        raise ValueError(
+            'input_partition_dims requires setting num_cores_per_replica.')
+
+    # Check num_cores_per_replica
+    if num_cores_per_replica is not None:
+      if num_cores_per_replica not in [1, 2, 4, 8, 16]:
+        raise ValueError(
+            'num_cores_per_replica must be 1, 2, 4, 8, or 16; got {}'.format(
+                str(num_cores_per_replica)))
+
+    # per_host_input_for_training may be True, False, or integer in [1..3].
+    # Map legacy values (True, False) to numeric values.
+    if per_host_input_for_training is False:
+      per_host_input_for_training = InputPipelineConfig.PER_SHARD_V1
+    elif per_host_input_for_training is True:
+      per_host_input_for_training = InputPipelineConfig.PER_HOST_V1
+
+    # Check initial_infeed_sleep_secs.
+    if initial_infeed_sleep_secs:
+      util_lib.check_positive_integer(initial_infeed_sleep_secs,
+                                      'TPUConfig initial_infeed_sleep_secs')
+
+    tpu_job_name = tpu_job_name or _get_tpu_job_name_from_tf_config()
+
+    return super(TPUConfig, cls).__new__(
+        cls,
+        iterations_per_loop=iterations_per_loop,
+        num_shards=num_shards,
+        num_cores_per_replica=num_cores_per_replica,
+        per_host_input_for_training=per_host_input_for_training,
+        tpu_job_name=tpu_job_name,
+        initial_infeed_sleep_secs=initial_infeed_sleep_secs,
+        input_partition_dims=input_partition_dims)
+
+
+class RunConfig(run_config_lib.RunConfig):
+  """RunConfig with TPU support."""
+
+  def __init__(self,
+               tpu_config=None,
+               evaluation_master=None,
+               master=None,
+               cluster=None,
+               **kwargs):
+    """Constructs a RunConfig.
+
+    Args:
+      tpu_config: the TPUConfig that specifies TPU-specific configuration.
+      evaluation_master: a string. The address of the master to use for eval.
+        Defaults to master if not set.
+      master: a string. The address of the master to use for training.
+      cluster: a ClusterResolver
+      **kwargs: keyword config parameters.
+
+    Raises:
+      ValueError: if cluster is not None and the provided session_config has a
+        cluster_def already.
+    """
+    super(RunConfig, self).__init__(**kwargs)
+    self._tpu_config = tpu_config or TPUConfig()
+    self._cluster = cluster
+
+    # If user sets master and/or evaluation_master explicitly, including empty
+    # string '', take it. Otherwise, take the values set by parent class.
+    if master is not None:
+      if cluster is not None:
+        raise ValueError('Both master and cluster are set.')
+      self._master = master
+    else:
+      if cluster:
+        self._master = cluster.master()
+
+    if evaluation_master is not None:
+      self._evaluation_master = evaluation_master
+    elif (not self._evaluation_master and
+          self.task_type != run_config_lib.TaskType.EVALUATOR):
+      # If the task type is EVALUATOR, it means some cluster manager sets the
+      # TF_CONFIG. In that case, we respect the configuration in TF_CONFIG.
+      #
+      # Otherwise, it means user executes the code without external cluster
+      # manager. For that, we optimize the user experience by setting
+      # evaluation_master to master, unless user overwrites it.
+      self._evaluation_master = self._master
+
+    # Set the ClusterSpec to use
+    if cluster:
+      self._cluster_spec = cluster.cluster_spec()
+
+      # Merge the cluster_def into the ConfigProto.
+      if self._session_config is None:  # pylint: disable=access-member-before-definition
+        self._session_config = config_pb2.ConfigProto(
+            allow_soft_placement=True, isolate_session_state=True)
+      if self._session_config.HasField('cluster_def'):
+        raise ValueError(
+            'You cannot provide a ClusterResolver and '
+            'session_config.cluster_def.')
+      if self._cluster_spec:
+        self._session_config.cluster_def.CopyFrom(
+            self._cluster_spec.as_cluster_def())
+
+  def _maybe_overwrite_session_config_for_distributed_training(self):
+    # Overrides the parent class session_config overwrite for between-graph. TPU
+    # runs with in-graph, which should not have device filter. Doing nothing
+    # ("pass") basically disables it.
+    pass
+
+  @property
+  def evaluation_master(self):
+    return self._evaluation_master
+
+  @property
+  def master(self):
+    return self._master
+
+  @property
+  def tpu_config(self):
+    return self._tpu_config
+
+  @property
+  def cluster(self):
+    return self._cluster
+
+  def replace(self, **kwargs):
+    if 'tpu_config' not in kwargs:
+      return super(RunConfig, self).replace(**kwargs)
+
+    tpu_config = kwargs.pop('tpu_config')
+    new_instance = super(RunConfig, self).replace(**kwargs)
+    new_instance._tpu_config = tpu_config  # pylint: disable=protected-access
+    return new_instance
+
+
+def _get_tpu_job_name_from_tf_config():
+  """Extracts the TPU job name from TF_CONFIG env variable."""
+  # TODO(xiejw): Extends this to support both TF_CONFIG env variable and cluster
+  # spec propagation.
+  tf_config = json.loads(os.environ.get(_TF_CONFIG_ENV, '{}'))
+  tpu_job_name = tf_config.get(_SERVICE_KEY, {}).get(_TPU_WORKER_JOB_NAME)
+  if tpu_job_name:
+    logging.info('Load TPU job name from TF_CONFIG: %s', tpu_job_name)
+  return tpu_job_name
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_config_test.py b/tensorflow/python/tpu/tpu_config_test.py
similarity index 98%
rename from tensorflow/contrib/tpu/python/tpu/tpu_config_test.py
rename to tensorflow/python/tpu/tpu_config_test.py
index b2fe0a688861503ae0bc55208f5dfc4d664419fd..22fb3032169851e5ee58d6b40bef52ece8593ba1 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_config_test.py
+++ b/tensorflow/python/tpu/tpu_config_test.py
@@ -20,10 +20,10 @@ from __future__ import print_function
 
 import json
 
-from tensorflow.contrib.tpu.python.tpu import tpu_config as tpu_config_lib
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.estimator import run_config as run_config_lib
 from tensorflow.python.platform import test
+from tensorflow.python.tpu import tpu_config as tpu_config_lib
 
 
 def _set_tf_config_env_variable(tf_config):
diff --git a/tensorflow/python/tpu/tpu_context.py b/tensorflow/python/tpu/tpu_context.py
new file mode 100644
index 0000000000000000000000000000000000000000..2511e427a26fbfd58b3b3c7a2179f2cd74626ce2
--- /dev/null
+++ b/tensorflow/python/tpu/tpu_context.py
@@ -0,0 +1,763 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===================================================================
+"""TPU system metadata and associated tooling."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from contextlib import contextmanager
+import copy
+
+from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.tpu import _tpu_estimator_embedding
+from tensorflow.python.tpu import device_assignment as tpu_device_assignment
+from tensorflow.python.tpu import tpu_config
+from tensorflow.python.tpu import tpu_system_metadata as tpu_system_metadata_lib
+
+
+_DEFAULT_JOB_NAME = 'tpu_worker'
+_DEFAULT_COORDINATOR_JOB_NAME = 'coordinator'
+_LOCAL_MASTERS = ('', 'local')
+_NUM_CORES_TO_COMPUTATION_SHAPE = {
+    1: [1, 1, 1],
+    2: [1, 1, 2],
+    4: [1, 2, 2],
+    8: [2, 2, 2],
+    16: [4, 2, 2],
+}
+
+
+class TPUContext(object):
+  """A context that holds the current configuration of the TPU computation."""
+
+  def __init__(self,
+               internal_ctx,
+               input_device=None,
+               invocation_index=None,
+               call_from_input_fn=True):
+    self._internal_ctx = internal_ctx
+    self._input_device = input_device
+    self._invocation_index = invocation_index
+    self._call_from_input_fn = call_from_input_fn
+
+  def current_input_fn_deployment(self):
+    """The configuration of the current input_fn invocation.
+
+    The configuration depends on `TPUConfig.per_host_input_for_training`. See
+    `TPUConfig` for details.
+
+    Only set in params dict of input_fn
+
+    Returns:
+      A tuple of
+        1. Device spec string: String, is the current CPU host where the
+           input_fn is invoked.
+        2. Current invocation index: Int, 0-based index of the input_fn
+           invocation. See next item for details.
+        3. Total invocation count: Int, the total number of times to invoke the
+           input_fn on all CPU hosts. Each invocation will be passed with a new
+           `TPUContext` instance with current invocation index set properly.
+        4. Total number of replicas consumed by current_invocation: Int, the
+           number of replicas fed by the data returned by current input_fn. For
+           example, for per_core input pipeline deployment
+           and non-model-parallelism, total invocation count is equal to
+           the number of cores in the system and num replicas consumed by
+           current invocation is 1. For per-host v2 input pipeline deployment,
+           total invocation count is equal to the number of hosts in the system
+           and num replicas consumed by current invocation is equal to number of
+           cores per host.
+
+    Raises:
+      RuntimeError: If this method must not be called from input_fn.
+    """
+    if not self._call_from_input_fn:
+      raise RuntimeError('This TPUContext instance must not be called from'
+                         ' model_fn.')
+
+    if self._internal_ctx.is_input_sharded_per_core():
+      total_invocation_count = (self._internal_ctx.num_hosts
+                                * self._internal_ctx.num_of_replicas_per_host)
+      replicas_consumed = 1
+    elif self._internal_ctx.is_input_broadcast_with_iterators():
+      total_invocation_count = 1
+      replicas_consumed = self._internal_ctx.num_replicas
+    else:
+      total_invocation_count = self._internal_ctx.num_hosts
+      replicas_consumed = self._internal_ctx.num_of_replicas_per_host
+    return (self._input_device, self._invocation_index,
+            total_invocation_count, replicas_consumed)
+
+  @property
+  def num_replicas(self):
+    """The total number of replicas.
+
+    For non-model-parallelism, num_replicas should be the total num of TPU
+    cores in the system.
+
+    Returns:
+      The number of replicas.
+    """
+    return self._internal_ctx.num_replicas
+
+  @property
+  def num_hosts(self):
+    """The number of hosts for the TPU system."""
+    return self._internal_ctx.num_hosts
+
+  @property
+  def current_host(self):
+    """The current host index for the TPU system."""
+    return self._invocation_index
+
+  @property
+  def num_of_replicas_per_host(self):
+    """The number of replicas for each host."""
+    if self._internal_ctx.model_parallelism_enabled:
+      raise ValueError(
+          'num_of_replicas_per_host is not supported for model_parallelism')
+    return self._internal_ctx.num_of_replicas_per_host
+
+  @property
+  def device_assignment(self):
+    """Returns device_assignment object."""
+    if self._call_from_input_fn:
+      raise RuntimeError('This TPUContext instance must not be called from'
+                         ' input_fn.')
+    return self._internal_ctx.device_assignment
+
+  def device_for_replica(self, replica_id):
+    """Returns the tuple of (CPU device and device ordinal) for replica.
+
+    This should be used for full replicate for non-model-parallelism.
+
+    Args:
+       replica_id: Int, the replica index.
+
+    Returns:
+       A tuple of device spec for CPU device and int device ordinal.
+    """
+    # Note that: For the non-model parallelism, the mapping could be
+    # a random permutation. The order should not matter in most cases
+    # as far as model is replicated to all cores in the system.
+    return self._internal_ctx.device_for_replica(replica_id)
+
+  @property
+  def tpu_host_placement_function(self):
+    """Returns the TPU host place function.
+
+    The place function takes host_id as the input and returns the TF device
+    for the correspoding host.
+    """
+
+    def _placement_function(host_id):
+      """Return the host device given host_id."""
+      return self._internal_ctx.tpu_host_placement_function(host_id=host_id)
+
+    return _placement_function
+
+
+class _InternalTPUContext(object):
+  """A context holds immutable states of TPU computation.
+
+  This immutable object holds TPUEstimator config, train/eval batch size, and
+  `TPUEstimator.use_tpu`, which is expected to be passed around. It also
+  provides utility functions, based on the current state, to determine other
+  information commonly required by TPU computation, such as TPU device names,
+  TPU hosts, shard batch size, etc.
+
+  if eval_on_tpu is False, then execution of eval on TPU is disabled.
+  if eval_on_tpu is True, but use_tpu is False, a warning is issued,
+  and TPU execution is disabled for all modes.
+
+  N.B. As `mode` is not immutable state in Estimator, but essential to
+  distinguish between TPU training and evaluation, a common usage for
+  _InternalTPUContext with `mode` is as follows:
+  ```
+  with _ctx.with_mode(mode) as ctx:
+    if ctx.is_running_on_cpu():
+       ...
+  ```
+  """
+
+  def __init__(self,
+               config,
+               train_batch_size,
+               eval_batch_size,
+               predict_batch_size,
+               use_tpu,
+               eval_on_tpu=True,
+               embedding_config_spec=None):
+    self._config = config
+    self._train_batch_size = train_batch_size
+    self._eval_batch_size = eval_batch_size
+    self._predict_batch_size = predict_batch_size
+    self._use_tpu = use_tpu
+    logging.info('_TPUContext: eval_on_tpu %s', eval_on_tpu)
+    if not use_tpu and eval_on_tpu:
+      logging.warning('eval_on_tpu ignored because use_tpu is False.')
+
+    self._eval_on_tpu = eval_on_tpu
+    self._model_parallelism_enabled = (
+        use_tpu and config.tpu_config.num_cores_per_replica)
+    self._mode = None
+    num_cores_per_replica = config.tpu_config.num_cores_per_replica
+    if self._model_parallelism_enabled:
+      self._computation_shape = _NUM_CORES_TO_COMPUTATION_SHAPE[
+          num_cores_per_replica]
+    else:
+      self._computation_shape = None
+    self._lazy_tpu_system_metadata_dict = {}  # key by master address
+    self._lazy_device_assignment_dict = {}  # key by master address
+    self._lazy_validation_dict = {}  # key by ModeKeys
+    self._embedding_config_spec = embedding_config_spec
+    self._lazy_embedding_config_dict = {}  # key by master address
+
+  def _assert_mode(self):
+    if self._mode is None:
+      raise RuntimeError(
+          '`mode` needs to be set via contextmanager `with_mode`.')
+    return self._mode
+
+  @contextmanager
+  def with_mode(self, mode):
+    # NOTE(xiejw): Shallow copy is enough. It will share he lazy dictionaries,
+    # such as _lazy_tpu_system_metadata_dict between new copy and the original
+    # one. Note that all lazy states stored in properties _lazy_foo are sort of
+    # immutable as they should be same for the process lifetime.
+    new_ctx = copy.copy(self)
+    new_ctx._mode = mode  # pylint: disable=protected-access
+    yield new_ctx
+
+  @property
+  def mode(self):
+    return self._assert_mode()
+
+  def _get_master_address(self):
+    mode = self._assert_mode()
+    config = self._config
+    master = (
+        config.master
+        if mode != model_fn_lib.ModeKeys.EVAL else config.evaluation_master)
+    return master
+
+  def _get_tpu_system_metadata(self):
+    """Gets the (maybe cached) TPU system metadata."""
+    master = self._get_master_address()
+    tpu_system_metadata = self._lazy_tpu_system_metadata_dict.get(master)
+    if tpu_system_metadata is not None:
+      return tpu_system_metadata
+
+    cluster_def = None
+    if (self._config.session_config and
+        self._config.session_config.cluster_def.job):
+      cluster_def = self._config.session_config.cluster_def
+
+    # pylint: disable=protected-access
+    tpu_system_metadata = (
+        tpu_system_metadata_lib._query_tpu_system_metadata(
+            master,
+            cluster_def=cluster_def,
+            query_topology=self.model_parallelism_enabled))
+
+    self._lazy_tpu_system_metadata_dict[master] = tpu_system_metadata
+    return tpu_system_metadata
+
+  def _get_device_assignment(self):
+    """Gets the (maybe cached) TPU device assignment."""
+    master = self._get_master_address()
+    device_assignment = self._lazy_device_assignment_dict.get(master)
+    if device_assignment is not None:
+      return device_assignment
+
+    tpu_system_metadata = self._get_tpu_system_metadata()
+
+    device_assignment = tpu_device_assignment.device_assignment(
+        tpu_system_metadata.topology,
+        computation_shape=self._computation_shape,
+        num_replicas=self.num_replicas)
+
+    logging.info('num_cores_per_replica: %s',
+                 str(self._config.tpu_config.num_cores_per_replica))
+    logging.info('computation_shape: %s', str(self._computation_shape))
+    logging.info('num_replicas: %d', self.num_replicas)
+    logging.info('device_assignment.topology.device_coordinates: %s',
+                 str(device_assignment.topology.device_coordinates))
+    logging.info('device_assignment.core_assignment: %s',
+                 str(device_assignment.core_assignment))
+
+    self._lazy_device_assignment_dict[master] = device_assignment
+    return device_assignment
+
+  @property
+  def embedding_config(self):
+    """Returns the embedding config based on current mode."""
+    master = self._get_master_address()
+    if master in self._lazy_embedding_config_dict:
+      embedding_config = self._lazy_embedding_config_dict[master]
+    else:
+      embedding_config = None
+      if self._use_tpu and self._embedding_config_spec:
+        embedding_config = _tpu_estimator_embedding.EmbeddingConfig(
+            self._embedding_config_spec, self._train_batch_size,
+            self._eval_batch_size, self.num_hosts, self.num_cores, master)
+        if not embedding_config.has_embedding_tables():
+          embedding_config = None
+      self._lazy_embedding_config_dict[master] = embedding_config
+
+    if embedding_config is not None:
+      mode = self._assert_mode()
+      # Dynamically attach tpu_embedding based on mode. With
+      # this, we could keep embedding_config immutable but call site always
+      # accesses the unified API '.tpu_embedding'.
+      embedding_config.tpu_embedding = embedding_config.get_tpu_embedding(mode)
+    return embedding_config
+
+  @property
+  def model_parallelism_enabled(self):
+    return self._model_parallelism_enabled
+
+  @property
+  def input_partition_dims(self):
+    return self._config.tpu_config.input_partition_dims
+
+  @property
+  def device_assignment(self):
+    return (self._get_device_assignment()
+            if self._model_parallelism_enabled else None)
+
+  @property
+  def num_of_cores_per_host(self):
+    metadata = self._get_tpu_system_metadata()
+    return metadata.num_of_cores_per_host
+
+  @property
+  def num_cores(self):
+    metadata = self._get_tpu_system_metadata()
+    return metadata.num_cores
+
+  @property
+  def num_of_replicas_per_host(self):
+    """Return the number of replicas per host."""
+    if self.model_parallelism_enabled:
+      return self.num_replicas // self.num_hosts
+    else:
+      return self.num_of_cores_per_host
+
+  @property
+  def num_replicas(self):
+    num_cores_in_system = self.num_cores
+
+    if self.model_parallelism_enabled:
+      num_cores_per_replica = self._config.tpu_config.num_cores_per_replica
+      if num_cores_per_replica > num_cores_in_system:
+        raise ValueError(
+            'The num of cores required by the model parallelism, specified by '
+            'TPUConfig.num_cores_per_replica, is larger than the total num of '
+            'TPU cores in the system. num_cores_per_replica: {}, num cores '
+            'in the system: {}'.format(num_cores_per_replica,
+                                       num_cores_in_system))
+
+      if num_cores_in_system % num_cores_per_replica != 0:
+        raise RuntimeError(
+            'The num of cores in the system ({}) is not divisible by the num '
+            'of cores ({}) required by the model parallelism, specified by '
+            'TPUConfig.num_cores_per_replica. This should never happen!'.format(
+                num_cores_in_system, num_cores_per_replica))
+
+      return num_cores_in_system // num_cores_per_replica
+    else:
+      return num_cores_in_system
+
+  @property
+  def num_hosts(self):
+    metadata = self._get_tpu_system_metadata()
+    return metadata.num_hosts
+
+  @property
+  def config(self):
+    return self._config
+
+  def is_input_sharded_per_core(self):
+    """Return true if input_fn is invoked per-core (other than per-host)."""
+    mode = self._assert_mode()
+    return (mode == model_fn_lib.ModeKeys.TRAIN and
+            (self._config.tpu_config.per_host_input_for_training is
+             tpu_config.InputPipelineConfig.PER_SHARD_V1))
+
+  def is_input_per_host_with_iterators(self):
+    """Return true if input_fn should be run in the per-host v2 config."""
+    return (self._config.tpu_config.per_host_input_for_training is
+            tpu_config.InputPipelineConfig.PER_HOST_V2)
+
+  def is_input_broadcast_with_iterators(self):
+    """Return true if input_fn should be run in the full_replicae config."""
+    return (self._config.tpu_config.per_host_input_for_training is
+            tpu_config.InputPipelineConfig.BROADCAST)
+
+  def is_running_on_cpu(self, is_export_mode=False):
+    """Determines whether the input_fn and model_fn should be invoked on CPU.
+
+    This API also validates user provided configuration, such as batch size,
+    according the lazy initialized TPU system metadata.
+
+    Args:
+      is_export_mode: Indicates whether the current mode is for exporting the
+        model, when mode == PREDICT. Only with this bool, we could
+        tell whether user is calling the Estimator.predict or
+        Estimator.export_savedmodel, which are running on TPU and CPU
+        respectively. Parent class Estimator does not distinguish these two.
+
+    Returns:
+      bool, whether current input_fn or model_fn should be running on CPU.
+
+    Raises:
+      ValueError: any configuration is invalid.
+    """
+
+    is_running_on_cpu = self._is_running_on_cpu(is_export_mode)
+    if not is_running_on_cpu:
+      self._validate_tpu_configuration()
+    return is_running_on_cpu
+
+  def _is_running_on_cpu(self, is_export_mode):
+    """Determines whether the input_fn and model_fn should be invoked on CPU."""
+    mode = self._assert_mode()
+
+    if not self._use_tpu:
+      return True
+
+    if mode == model_fn_lib.ModeKeys.EVAL and not self._eval_on_tpu:
+      logging.info('_is_running_on_cpu: eval_on_tpu disabled')
+      return True
+
+    if is_export_mode:
+      return True
+
+    return False
+
+  @property
+  def global_batch_size(self):
+    mode = self._assert_mode()
+    if mode == model_fn_lib.ModeKeys.TRAIN:
+      return self._train_batch_size
+    elif mode == model_fn_lib.ModeKeys.EVAL:
+      return self._eval_batch_size
+    elif mode == model_fn_lib.ModeKeys.PREDICT:
+      return self._predict_batch_size
+    else:
+      return None
+
+  @property
+  def batch_size_for_input_fn(self):
+    """Returns the shard batch size for `input_fn`."""
+    global_batch_size = self.global_batch_size
+
+    if (self.is_running_on_cpu() or self.is_input_broadcast_with_iterators()):
+      return global_batch_size
+
+    # On TPU
+    if self.is_input_sharded_per_core() or (
+        self.is_input_per_host_with_iterators()):
+      return global_batch_size // self.num_replicas
+    else:
+      return global_batch_size // self.num_hosts
+
+  @property
+  def batch_size_for_model_fn(self):
+    """Returns the shard batch size for `model_fn`."""
+    global_batch_size = self.global_batch_size
+
+    if (self.is_running_on_cpu() or self.is_input_broadcast_with_iterators()):
+      return global_batch_size
+
+    # On TPU. always sharded per shard.
+    return global_batch_size // self.num_replicas
+
+  @property
+  def master_job(self):
+    """Returns the job name to use to place TPU computations on.
+
+    Returns:
+      A string containing the job name, or None if no job should be specified.
+
+    Raises:
+      ValueError: If the user needs to specify a tpu_job_name, because we are
+        unable to infer the job name automatically, or if the user-specified job
+        names are inappropriate.
+    """
+    run_config = self._config
+    # If the user specifies the tpu_job_name, use that.
+    if run_config.tpu_config.tpu_job_name:
+      return run_config.tpu_config.tpu_job_name
+
+    # The tpu job is determined by the run_config. Right now, this method is
+    # required as tpu_config is not part of the RunConfig.
+    mode = self._assert_mode()
+    master = (
+        run_config.evaluation_master
+        if mode == model_fn_lib.ModeKeys.EVAL else run_config.master)
+    if master in _LOCAL_MASTERS:
+      return None
+
+    if (not run_config.session_config or
+        not run_config.session_config.cluster_def.job):
+      return _DEFAULT_JOB_NAME
+    cluster_def = run_config.session_config.cluster_def
+    job_names = set([job.name for job in cluster_def.job])
+    if _DEFAULT_JOB_NAME in job_names:
+      # b/37868888 tracks allowing ClusterSpec propagation to reuse job names.
+      raise ValueError('Currently, tpu_worker is not an allowed job name.')
+    if len(job_names) == 1:
+      return cluster_def.job[0].name
+    if len(job_names) == 2:
+      if _DEFAULT_COORDINATOR_JOB_NAME in job_names:
+        job_names.remove(_DEFAULT_COORDINATOR_JOB_NAME)
+        return job_names.pop()
+      # TODO(b/67716447): Include more sophisticated heuristics.
+    raise ValueError(
+        'Could not infer TPU job name. Please specify a tpu_job_name as part '
+        'of your TPUConfig.')
+
+  @property
+  def tpu_host_placement_function(self):
+    """Returns the TPU host place function."""
+
+    master = self.master_job
+
+    def _placement_function(_sentinal=None, replica_id=None, host_id=None):  # pylint: disable=invalid-name
+      """Return the host device given replica_id or host_id."""
+      assert _sentinal is None
+      if replica_id is not None and host_id is not None:
+        raise RuntimeError(
+            'replica_id and host_id can have only one non-None value.')
+
+      if master is None:
+        return '/replica:0/task:0/device:CPU:0'
+      else:
+        if replica_id is not None:
+          if self.model_parallelism_enabled:
+            return self.device_assignment.host_device(
+                replica=replica_id, job=master)
+          else:
+            host_id = replica_id / self.num_of_cores_per_host
+
+        return '/job:%s/task:%d/device:CPU:0' % (master, host_id)
+
+    return _placement_function
+
+  @property
+  def tpu_device_placement_function(self):
+    """Returns a TPU device placement Fn."""
+    master = self.master_job
+    job_device = '' if master is None else ('/job:%s' % master)
+
+    def _placement_function(i):
+      if self.model_parallelism_enabled:
+        return self.device_assignment.tpu_device(replica=i, job=master)
+      else:
+        num_of_cores_per_host = self.num_of_cores_per_host
+        host_id = i / num_of_cores_per_host
+        ordinal_id = i % num_of_cores_per_host
+        return '%s/task:%d/device:TPU:%d' % (job_device, host_id, ordinal_id)
+
+    return _placement_function
+
+  def tpu_ordinal_function(self, host_id):
+    """Returns the TPU ordinal fn."""
+
+    def _tpu_ordinal_function(shard_index_in_host):
+      """Return the TPU ordinal associated with a shard.
+
+      Required because the enqueue ops are placed on CPU.
+
+      Args:
+        shard_index_in_host: the shard index
+
+      Returns:
+        The ordinal of the TPU device the shard's infeed should be placed on.
+      """
+      if self.model_parallelism_enabled:
+        # We put both enqueue/dequeue ops at tpu.core(0) in each replica.
+        replica = self.device_assignment.lookup_replicas(host_id,
+                                                         0)[shard_index_in_host]
+        return self.device_assignment.tpu_ordinal(replica=replica)
+      else:
+        return shard_index_in_host % self.num_of_cores_per_host
+
+    return _tpu_ordinal_function
+
+  def _validate_tpu_configuration(self):
+    """Validates the configuration based on the TPU system metadata."""
+    mode = self._assert_mode()
+    if self._lazy_validation_dict.get(mode):
+      return
+
+    # All following information is obtained from TPU system metadata.
+    num_cores = self.num_cores
+    num_replicas = self.num_replicas
+    num_hosts = self.num_hosts
+
+    if not num_cores:
+      tpu_system_metadata = self._get_tpu_system_metadata()
+      raise RuntimeError(
+          'Cannot find any TPU cores in the system. Please double check '
+          'Tensorflow master address and TPU worker(s). Available devices '
+          'are {}.'.format(tpu_system_metadata.devices))
+
+    if self._config.tpu_config.num_shards:
+      user_provided_num_replicas = self._config.tpu_config.num_shards
+      if user_provided_num_replicas != num_replicas:
+        message = (
+            'TPUConfig.num_shards is not set correctly. According to TPU '
+            'system metadata for Tensorflow master ({}): num_replicas should '
+            'be ({}), got ({}). For non-model-parallelism, num_replicas should '
+            'be the total num of TPU cores in the system. For '
+            'model-parallelism, the total number of TPU cores should be '
+            'num_cores_per_replica * num_replicas. Please set it '
+            'accordingly or leave it as `None`'.format(
+                self._get_master_address(), num_replicas,
+                user_provided_num_replicas))
+
+        raise ValueError(message)
+
+    if self._config.tpu_config.num_cores_per_replica:
+      num_cores_per_replica = self._config.tpu_config.num_cores_per_replica
+      num_cores_per_host = self._get_tpu_system_metadata().num_of_cores_per_host
+      if num_cores_per_replica > num_cores_per_host:
+        raise ValueError(
+            'The num of cores required by the model parallelism, specified by '
+            'TPUConfig.num_cores_per_replica, is larger than the '
+            'num_cores_per_host. num_cores_per_replica: {}, '
+            'num_cores_per_host: {}'.format(num_cores_per_replica,
+                                            num_cores_per_host))
+
+    if mode == model_fn_lib.ModeKeys.TRAIN:
+      if (self._train_batch_size % num_replicas != 0 and
+          not self.is_input_broadcast_with_iterators()):
+        raise ValueError(
+            'train batch size {} must be divisible by number of replicas {}'
+            .format(self._train_batch_size, num_replicas))
+
+    elif mode == model_fn_lib.ModeKeys.EVAL:
+      if self._eval_batch_size is None:
+        raise ValueError(
+            'eval_batch_size in TPUEstimator constructor cannot be `None`'
+            'if .evaluate is running on TPU.')
+      if (self._eval_batch_size % num_replicas != 0 and
+          not self.is_input_broadcast_with_iterators()):
+        raise ValueError(
+            'eval batch size {} must be divisible by number of replicas {}'
+            .format(self._eval_batch_size, num_replicas))
+      if num_hosts > 1 and not self.is_input_broadcast_with_iterators():
+        raise ValueError(
+            'TPUEstimator.evaluate should be running on single TPU'
+            ' instead of a Pod.')
+    else:
+      assert mode == model_fn_lib.ModeKeys.PREDICT
+      if self._predict_batch_size is None:
+        raise ValueError(
+            'predict_batch_size in TPUEstimator constructor should not be '
+            '`None` if .predict is running on TPU.')
+      if (self._predict_batch_size % num_replicas != 0 and
+          not self.is_input_broadcast_with_iterators()):
+        raise ValueError(
+            'predict batch size {} must be divisible by number of replicas {}'
+            .format(self._predict_batch_size, num_replicas))
+      if num_hosts > 1 and not self.is_input_broadcast_with_iterators():
+        raise ValueError(
+            'TPUEstimator.predict should be running on single TPU worker. '
+            'got {}.'.format(num_hosts))
+
+    # Record the state "validated" into lazy dictionary.
+    self._lazy_validation_dict[mode] = True
+
+  def device_for_replica(self, replica_id):
+    """Returns the tuple of (CPU device and device ordinal) for replica.
+
+    This should be used for full replicate for non-model-parallelism.
+
+    Args:
+       replica_id: Int, the replica index.
+
+    Returns:
+       A tuple of device spec for CPU device and int device ordinal.
+    """
+    master = self.master_job
+
+    if self.model_parallelism_enabled:
+      return (self.device_assignment.host_device(
+          replica=replica_id, job=master),
+              self.device_assignment.tpu_ordinal(replica=replica_id))
+
+    job_device = '' if master is None else ('/job:%s' % master)
+
+    num_of_replicas_per_host = self.num_of_replicas_per_host
+    host_id = replica_id / num_of_replicas_per_host
+    ordinal_id = replica_id % num_of_replicas_per_host
+
+    host_device = '%s/task:%d/device:CPU:0' % (job_device, host_id)
+    return (host_device, ordinal_id)
+
+
+class _OneCoreTPUContext(_InternalTPUContext):
+  """Special _InternalTPUContext for one core usage."""
+
+  def __init__(self, config, train_batch_size, eval_batch_size,
+               predict_batch_size, use_tpu):
+
+    super(_OneCoreTPUContext, self).__init__(
+        config, train_batch_size, eval_batch_size,
+        predict_batch_size, use_tpu)
+
+  def _get_tpu_system_metadata(self):
+    """Gets the (maybe cached) TPU system metadata."""
+    master = self._get_master_address()
+    tpu_system_metadata = self._lazy_tpu_system_metadata_dict.get(master)
+    if tpu_system_metadata is not None:
+      return tpu_system_metadata
+
+    tpu_system_metadata = (
+        tpu_system_metadata_lib._TPUSystemMetadata(  # pylint: disable=protected-access
+            num_cores=1,
+            num_hosts=1,
+            num_of_cores_per_host=1,
+            topology=None,
+            devices=[]))
+
+    self._lazy_tpu_system_metadata_dict[master] = tpu_system_metadata
+    return tpu_system_metadata
+
+
+def _get_tpu_context(config, train_batch_size, eval_batch_size,
+                     predict_batch_size, use_tpu, eval_on_tpu,
+                     embedding_config_spec):
+  """Returns an instance of `_InternalTPUContext`."""
+
+  if (config.tpu_config.num_shards == 1 and
+      config.tpu_config.num_cores_per_replica is None):
+    if embedding_config_spec is not None:
+      raise ValueError('Setting TPUConfig.num_shards==1 is unsupported '
+                       'when embedding_config_spec is not None.')
+    logging.warning(
+        'Setting TPUConfig.num_shards==1 is an unsupported behavior. '
+        'Please fix as soon as possible (leaving num_shards as None.)')
+    return _OneCoreTPUContext(config, train_batch_size, eval_batch_size,
+                              predict_batch_size, use_tpu)
+
+  return _InternalTPUContext(config, train_batch_size, eval_batch_size,
+                             predict_batch_size, use_tpu, eval_on_tpu,
+                             embedding_config_spec)
diff --git a/tensorflow/python/tpu/tpu_embedding.py b/tensorflow/python/tpu/tpu_embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7f17395f11899450968ec08c8440b10b5386e45
--- /dev/null
+++ b/tensorflow/python/tpu/tpu_embedding.py
@@ -0,0 +1,1102 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TPU embedding APIs."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import copy
+import math
+import re
+import six
+
+from tensorflow.core.protobuf.tpu import optimization_parameters_pb2
+from tensorflow.core.protobuf.tpu import tpu_embedding_configuration_pb2 as elc
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import partitioned_variables
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.tpu import tpu_system_metadata as tpu_system_metadata_lib
+from tensorflow.python.tpu.ops import tpu_ops
+
+TRAINING = elc.TPUEmbeddingConfiguration.TRAINING
+INFERENCE = elc.TPUEmbeddingConfiguration.INFERENCE
+
+
+class TableConfig(
+    collections.namedtuple(
+        'TableConfig',
+        ['vocabulary_size', 'dimension', 'initializer', 'combiner'])):
+  """Embedding table configuration."""
+
+  def __new__(cls,
+              vocabulary_size,
+              dimension,
+              initializer=None,
+              combiner='mean'):
+    """Embedding table configuration.
+
+    Args:
+      vocabulary_size: Number of vocabulary (/rows) in the table.
+      dimension: The embedding dimension.
+      initializer: A variable initializer function to be used in embedding
+        variable initialization. If not specified, defaults to
+        `tf.truncated_normal_initializer` with mean `0.0` and standard deviation
+        `1/sqrt(dimension)`.
+      combiner: A string specifying how to reduce if there are multiple entries
+        in a single row. Currently 'mean', 'sqrtn' and 'sum' are supported, with
+        'mean' the default. 'sqrtn' often achieves good accuracy, in particular
+        with bag-of-words columns. For more information, see
+        `tf.nn.embedding_lookup_sparse`.
+
+    Returns:
+      `TableConfig`.
+
+    Raises:
+      ValueError: if `vocabulary_size` is not positive integer.
+      ValueError: if `dimension` is not positive integer.
+      ValueError: if `initializer` is specified and is not callable.
+      ValueError: if `combiner` is not supported.
+    """
+    if not isinstance(vocabulary_size, int) or vocabulary_size < 1:
+      raise ValueError('Invalid vocabulary_size {}.'.format(vocabulary_size))
+
+    if not isinstance(dimension, int) or dimension < 1:
+      raise ValueError('Invalid dimension {}.'.format(dimension))
+
+    if (initializer is not None) and (not callable(initializer)):
+      raise ValueError('initializer must be callable if specified.')
+    if initializer is None:
+      initializer = init_ops.truncated_normal_initializer(
+          mean=0.0, stddev=1 / math.sqrt(dimension))
+
+    if combiner not in ('mean', 'sum', 'sqrtn'):
+      raise ValueError('Invalid combiner {}'.format(combiner))
+
+    return super(TableConfig, cls).__new__(cls, vocabulary_size, dimension,
+                                           initializer, combiner)
+
+
+AdamSlotVariableNames = collections.namedtuple(
+    'AdamSlotVariableNames', ['m', 'v'])
+
+AdagradSlotVariableName = collections.namedtuple(
+    'AdagradSlotVariableName', ['accumulator'])
+
+AdamSlotVariables = collections.namedtuple(
+    'AdamSlotVariables', ['m', 'v'])
+
+AdagradSlotVariable = collections.namedtuple(
+    'AdagradSlotVariable', ['accumulator'])
+
+VariablesAndOps = collections.namedtuple(
+    'VariablesAndOps',
+    ['embedding_variables_by_table', 'slot_variables_by_table',
+     'load_ops', 'retrieve_ops']
+)
+
+
+# TODO(shizhiw): Factor `use_gradient_accumulation` and
+# `pipeline_execution_with_tensor_core` out of `_OptimizationParameters`.
+class _OptimizationParameters(object):
+  """Parameters common to all optimizations."""
+
+  def __init__(self, learning_rate, use_gradient_accumulation,
+               pipeline_execution_with_tensor_core):
+    self.learning_rate = learning_rate
+    self.use_gradient_accumulation = use_gradient_accumulation
+    self.pipeline_execution_with_tensor_core = (
+        pipeline_execution_with_tensor_core)
+
+
+class AdagradParameters(_OptimizationParameters):
+  """Optimization parameters for Adagrad."""
+
+  def __init__(self, learning_rate, initial_accumulator,
+               use_gradient_accumulation=False,
+               pipeline_execution_with_tensor_core=True):
+    """Optimization parameters for Adagrad.
+
+    Args:
+      learning_rate: used for updating embedding table.
+      initial_accumulator: initial accumulator for Adagrad.
+      use_gradient_accumulation: setting this to `True` makes embedding
+         gradients calculation more accurate but slower. Please see
+         `optimization_parameters.proto` for details.
+         for details.
+      pipeline_execution_with_tensor_core: setting this to `True` makes training
+        faster, but trained model will be different if step N and step N+1
+        involve the same set of embedding ID. Please see
+        `tpu_embedding_configuration.proto` for details.
+    """
+    super(AdagradParameters, self).__init__(learning_rate,
+                                            use_gradient_accumulation,
+                                            pipeline_execution_with_tensor_core)
+    self.initial_accumulator = initial_accumulator
+
+
+class AdamParameters(_OptimizationParameters):
+  """Optimization parameters for Adam."""
+
+  def __init__(self, learning_rate,
+               beta1=0.9,
+               beta2=0.999,
+               epsilon=1e-08,
+               lazy_adam=True,
+               sum_inside_sqrt=True,
+               use_gradient_accumulation=False,
+               pipeline_execution_with_tensor_core=True):
+    """Optimization parameters for Adam.
+
+    Args:
+      learning_rate: a floating point value. The learning rate.
+      beta1: A float value.
+        The exponential decay rate for the 1st moment estimates.
+      beta2: A float value.
+        The exponential decay rate for the 2nd moment estimates.
+      epsilon: A small constant for numerical stability.
+      lazy_adam: Use lazy Adam instead of Adam. Lazy Adam trains faster.
+        Please see `optimization_parameters.proto` for details.
+      sum_inside_sqrt: This improves training speed. Please see
+        `optimization_parameters.proto` for details.
+      use_gradient_accumulation: setting this to `True` makes embedding
+        gradients calculation more accurate but slower. Please see
+        `optimization_parameters.proto` for details.
+        for details.
+      pipeline_execution_with_tensor_core: setting this to `True` makes training
+        faster, but trained model will be different if step N and step N+1
+        involve the same set of embedding ID. Please see
+        `tpu_embedding_configuration.proto` for details.
+    """
+    super(AdamParameters, self).__init__(learning_rate,
+                                         use_gradient_accumulation,
+                                         pipeline_execution_with_tensor_core)
+    self.beta1 = beta1
+    self.beta2 = beta2
+    self.epsilon = epsilon
+    self.lazy_adam = lazy_adam
+    self.sum_inside_sqrt = sum_inside_sqrt
+
+
+class StochasticGradientDescentParameters(_OptimizationParameters):
+  """Optimization parameters for stochastic gradient descent.
+
+  Args:
+    learning_rate: a floating point value. The learning rate.
+    use_gradient_accumulation: setting this to `True` makes embedding
+      gradients calculation more accurate but slower. Please see
+         `optimization_parameters.proto` for details.
+    pipeline_execution_with_tensor_core: setting this to `True` makes training
+      faster, but trained model will be different if step N and step N+1
+      involve the same set of embedding ID. Please see
+      `tpu_embedding_configuration.proto` for details.
+    """
+
+  def __init__(self, learning_rate, use_gradient_accumulation=False,
+               pipeline_execution_with_tensor_core=True):
+    super(StochasticGradientDescentParameters, self).__init__(
+        learning_rate, use_gradient_accumulation,
+        pipeline_execution_with_tensor_core)
+
+
+class TPUEmbedding(object):
+  """API for using TPU for embedding.
+
+    Example:
+    ```
+    table_config_user = tpu_embedding.TableConfig(
+        vocabulary_size=4, dimension=2,
+        initializer=initializer, combiner='mean')
+    table_to_config_dict = {'video': table_config_video,
+                          'user': table_config_user}
+    feature_to_table_dict = {'watched': 'video',
+                             'favorited': 'video',
+                             'friends': 'user'}
+    batch_size = 4
+    num_hosts = 1
+    optimization_parameters = tpu_embedding.AdagradParameters(1., 1.)
+    mode = tpu_embedding.TRAINING
+    embedding = tpu_embedding.TPUEmbedding(
+        table_to_config_dict, feature_to_table_dict,
+        batch_size, num_hosts, mode, optimization_parameters)
+
+    batch_size_per_core = embedding.batch_size_per_core
+    sparse_features_list = []
+    for host in hosts:
+      with ops.device(host):
+        for _ in range(embedding.num_cores_per_host):
+          sparse_features = {}
+          sparse_features['watched'] = sparse_tensor.SparseTensor(...)
+          sparse_features['favorited'] = sparse_tensor.SparseTensor(...)
+          sparse_features['friends'] = sparse_tensor.SparseTensor(...)
+          sparse_features_list.append(sparse_features)
+
+    enqueue_ops = embedding.generate_enqueue_ops(sparse_features_list)
+    embedding_variables_and_ops = embedding.create_variables_and_ops()
+
+    def computation():
+      activations = embedding.get_activations()
+      loss = compute_loss(activations)
+
+      base_optimizer = gradient_descent.GradientDescentOptimizer(
+          learning_rate=1)
+      cross_shard_optimizer = tpu_optimizer.CrossShardOptimizer(
+          base_optimizer)
+
+      train_op = cross_shard_optimizer.minimize(loss)
+      gradients = (
+          tpu_embedding_gradient.get_gradients_through_compute_gradients(
+              cross_shard_optimizer, loss, activations)
+      send_gradients_op = embedding.generate_send_gradients_op(gradients)
+      with ops.control_dependencies([train_op, send_gradients_op]):
+        loss = array_ops.identity(loss)
+
+    loss = tpu.shard(computation,
+                     num_shards=embedding.num_cores)
+
+    with self.test_session() as sess:
+      sess.run(tpu.initialize_system(embedding_config=
+                                     embedding.config_proto))
+      sess.run(variables.global_variables_initializer())
+      sess.run(embedding_variables_and_ops.load_ops())
+      sess.run(enqueue_ops)
+      loss_val = sess.run(loss)
+    ```
+  """
+
+  # TODO(shizhiw): Instead of `feature_to_table_dict` which maps to table
+  # name, consider `feature_to_config_dict` which maps to `FeatureConfig`.
+  # `FeatureConfig` could have fields other than table name. For example, it
+  # could have a field to indicate that the feature should not be used to
+  # update embedding table (cr/204852758, cr/204940540). Also, this can support
+  # different combiners for different features within the same table.
+  # TODO(shizhiw, b/118512626): Remove `batch_size` from `__init__` and move it
+  # to `FeatureConfig`?
+
+  # TODO(shizhiw): will it be cleaner to make `table_to_config_dict` and
+  # `feature_to_table_dict` lists of `TableSpec` and `FeatureSpec` respectively?
+
+  # TODO(shizhiw): Consider adding `input_fn` as an option to remove boilerplate
+  # for-loops around construction of inputs.
+
+  # `optimization_parameter` applies to all tables. If the need arises,
+  # we can add `optimization_parameters` to `TableConfig` to override this
+  # global setting.
+  def __init__(self,
+               table_to_config_dict,
+               feature_to_table_dict,
+               batch_size,
+               mode,
+               master,
+               optimization_parameters=None):
+    """API for using TPU for embedding lookups.
+
+    Args:
+      table_to_config_dict: A dictionary mapping from string of table name to
+        `TableConfig`. Table refers to an embedding table, e.g. `params`
+        argument to `tf.nn.embedding_lookup_sparse()`.
+      feature_to_table_dict: A dictionary mapping from string of feature name
+        to string of table name. Feature refers to ids to lookup in embedding
+        table, e.g. `sp_ids` argument to `tf.nn.embedding_lookup_sparse()`.
+      batch_size: An `int` representing the global batch size.
+      mode: `TRAINING` or `INFERENCE`.
+      master: A `string` representing the TensorFlow master to use.
+      optimization_parameters: `AdagradParameters`, `AdamParameters`,
+        `Stochasticgradientdescentparameters`. Must be set in training and must
+        be `None` in inference.
+
+    Raises:
+      ValueError: if any input is invalid.
+    """
+    _validate_table_to_config_dict(table_to_config_dict)
+    # Avoid nondeterminism from `Dict` iteration order by using `OrderedDict`.
+    self._table_to_config_dict = _create_ordered_dict(table_to_config_dict)
+    self._combiners = _create_combiners(self._table_to_config_dict)
+
+    _validate_feature_to_table_dict(table_to_config_dict, feature_to_table_dict)
+    self._feature_to_table_dict = _create_ordered_dict(feature_to_table_dict)
+    self._table_to_features_dict = _create_table_to_features_dict(
+        self._feature_to_table_dict)
+
+    self._batch_size = batch_size
+
+    self._master = master
+    self._tpu_system_metadata = (
+        tpu_system_metadata_lib._query_tpu_system_metadata(self._master))  # pylint: disable=protected-access
+    if self._tpu_system_metadata.num_cores == 0:
+      raise ValueError('TPUEmbedding needs TPUs, but master {} does not have '
+                       'TPUs.'.format(self._master))
+    self._num_hosts = self._tpu_system_metadata.num_hosts
+    self._hosts = [device.name for device in self._tpu_system_metadata.devices
+                   if 'device:CPU:' in device.name]
+    self._num_cores_per_host = self._tpu_system_metadata.num_of_cores_per_host
+    self._num_cores = self._tpu_system_metadata.num_cores
+
+    _validate_batch_size(self._batch_size, self._num_cores)
+    self._batch_size_per_core = self._batch_size // self._num_cores
+
+    # TODO(shizhiw): remove `mode`?
+    if mode == TRAINING:
+      _validate_optimization_parameters(optimization_parameters)
+      self._optimization_parameters = optimization_parameters
+    elif mode == INFERENCE:
+      if optimization_parameters is not None:
+        raise ValueError('`optimization_parameters` should be `None` '
+                         'for inference mode.')
+      self._optimization_parameters = (
+          StochasticGradientDescentParameters(1.))
+    else:
+      raise ValueError('`mode` only supports {} and {}; got {}.'
+                       .format(TRAINING, INFERENCE, mode))
+    self._mode = mode
+
+    # TODO(shizhiw): move `optimization_parameters` into `_optimizer_handler`
+    # and create special handler for inference that inherits from
+    # StochasticGradientDescentHandler with more user-friendly error message
+    # on get_slot().
+    self._optimizer_handler = _get_optimization_handler(
+        self._optimization_parameters)
+
+    self._config_proto = self._create_config_proto()
+
+  @property
+  def hosts(self):
+    """A list of device names for CPU hosts.
+
+    Returns:
+      A list of device names for CPU hosts.
+    """
+    return copy.copy(self._hosts)
+
+  # TODO(shizhiw): change to num_tensor_cores_per_host to be more explicit and
+  # to be consistent with `tpu_embedding_configuration.proto`.
+  @property
+  def num_cores_per_host(self):
+    """Number of TPU cores on a CPU host.
+
+    Returns:
+      Number of TPU cores on a CPU host.
+    """
+    return self._num_cores_per_host
+
+  @property
+  def num_cores(self):
+    """Total number of TPU cores on all hosts.
+
+    Returns:
+      Total number of TPU cores on all hosts.
+    """
+    return self._num_cores
+
+  @property
+  def batch_size_per_core(self):
+    """Batch size for each TPU core.
+
+    The sparse tensors in `sparse_features_list` to `generate_enqueue_ops`
+       must have batch dimension equal to this.
+
+    Returns:
+      Batch size for each TPU core.
+    """
+    return self._batch_size_per_core
+
+  @property
+  def config_proto(self):
+    """Create embedding config proto for `tpu.initialize_system()`.
+
+    Returns:
+      an `TPUEmbeddingConfiguration` proto describing the desired
+         configuration of the hardware embedding lookup tables, which
+         is passed to `tpu.initialize_system()`.
+    """
+    return self._config_proto
+
+  @property
+  def table_to_config_dict(self):
+    return copy.copy(self._table_to_config_dict)
+
+  @property
+  def feature_to_table_dict(self):
+    return copy.copy(self._feature_to_table_dict)
+
+  @property
+  def table_to_features_dict(self):
+    return copy.copy(self._table_to_features_dict)
+
+  @property
+  def optimization_parameters(self):
+    return self._optimization_parameters
+
+  def _create_config_proto(self):
+    """Create `TPUEmbeddingConfiguration`."""
+    config_proto = elc.TPUEmbeddingConfiguration()
+    for table in self._table_to_config_dict:
+      table_descriptor = config_proto.table_descriptor.add()
+      table_descriptor.name = table
+
+      table_config = self._table_to_config_dict[table]
+      table_descriptor.vocabulary_size = table_config.vocabulary_size
+      table_descriptor.dimension = table_config.dimension
+
+      features_for_table = self._table_to_features_dict[table]
+      table_descriptor.num_features = len(features_for_table)
+
+      table_descriptor.optimization_parameters.learning_rate.constant = (
+          self._optimization_parameters.learning_rate)
+      table_descriptor.optimization_parameters.gradient_accumulation_status = (
+          optimization_parameters_pb2.GradientAccumulationStatus.ENABLED
+          if self._optimization_parameters.use_gradient_accumulation else
+          optimization_parameters_pb2.GradientAccumulationStatus.DISABLED)
+      self._optimizer_handler.set_optimization_parameters(table_descriptor)
+
+    config_proto.mode = self._mode
+    config_proto.batch_size_per_tensor_core = self._batch_size_per_core
+    config_proto.num_hosts = self._num_hosts
+    config_proto.num_tensor_cores = self._num_cores
+    config_proto.sharding_strategy = elc.TPUEmbeddingConfiguration.DIV_DEFAULT
+    config_proto.pipeline_execution_with_tensor_core = (
+        self._optimization_parameters.pipeline_execution_with_tensor_core)
+
+    return config_proto
+
+  def create_variables_and_ops(self, embedding_variable_name_by_table=None,
+                               slot_variable_names_by_table=None):
+    """Create embedding and slot variables, with ops to load and retrieve them.
+
+    Args:
+      embedding_variable_name_by_table: A dictionary mapping from string of
+        table name to string of embedding variable name. If `None`,
+        defaults from `get_default_slot_variable_names()` will be used.
+      slot_variable_names_by_table: A dictionary mapping from string of table
+        name to `AdamSlotVariableNames`, `AdagradSlotVariableNames` etc. If
+        `None`, defaults from `get_default_slot_variable_names()` will be used.
+
+    Returns:
+      `tpu_embedding.VariablesAndOps` with:
+        A dictionary mapping from string of table name to embedding variables,
+        A dictionary mapping from string of table name to AdagradSlotVariable,
+         AdamSlotVariables etc with slot variables,
+        A function which returns a list of ops to load embedding and slot
+         variables from TPU to CPU.
+        A function which returns a list of ops to retrieve embedding and slot
+         variables from TPU to CPU.
+    """
+    embedding_variables_by_table = {}
+    slot_variables_by_table = {}
+    load_op_fns = []
+    retrieve_op_fns = []
+    for table in self._table_to_config_dict:
+      if embedding_variable_name_by_table:
+        embedding_variable_name = embedding_variable_name_by_table[table]
+      else:
+        embedding_variable_name = table
+      if slot_variable_names_by_table:
+        slot_variable_names = slot_variable_names_by_table[table]
+      else:
+        slot_variable_names = (
+            self._optimizer_handler.get_default_slot_variable_names(table))
+
+      device_fn = _create_device_fn(self._hosts)
+      with ops.device(device_fn):
+        table_variables = _create_partitioned_variables(
+            name=embedding_variable_name,
+            num_hosts=self._num_hosts,
+            vocabulary_size=self._table_to_config_dict[table].vocabulary_size,
+            embedding_dimension=self._table_to_config_dict[table].dimension,
+            initializer=self._table_to_config_dict[table].initializer,
+            collections=[ops.GraphKeys.GLOBAL_VARIABLES])
+        embedding_variables_by_table[table] = table_variables
+
+        slot_variables_for_table, load_ops_fn, retrieve_ops_fn = (
+            self._optimizer_handler.create_variables_and_ops(
+                table, slot_variable_names, self._num_hosts,
+                self._table_to_config_dict[table], table_variables)
+        )
+        slot_variables_by_table[table] = slot_variables_for_table
+        load_op_fns.append(load_ops_fn)
+        retrieve_op_fns.append(retrieve_ops_fn)
+
+    def load_ops():
+      """Calls and returns the load ops for each embedding table.
+
+      Returns:
+        A list of ops to load embedding and slot variables from CPU to TPU.
+      """
+      load_ops_list = []
+      for load_op_fn in load_op_fns:
+        load_ops_list.extend(load_op_fn())
+      return load_ops_list
+
+    def retrieve_ops():
+      """Calls and returns the retrieve ops for each embedding table.
+
+      Returns:
+        A list of ops to retrieve embedding and slot variables from TPU to CPU.
+      """
+      retrieve_ops_list = []
+      for retrieve_op_fn in retrieve_op_fns:
+        retrieve_ops_list.extend(retrieve_op_fn())
+      return retrieve_ops_list
+
+    return VariablesAndOps(embedding_variables_by_table,
+                           slot_variables_by_table,
+                           load_ops, retrieve_ops)
+
+  def generate_enqueue_ops(self, sparse_features_list):
+    """Generate enqueue ops.
+
+    Args:
+      sparse_features_list: a list of dictionary mapping from string
+        of feature names to sparse tensor. Each dictionary is for one
+        TPU core. Dictionaries for the same core should be contiguous
+        on the list.
+
+    Returns:
+      Ops to enqueue to TPU for embedding.
+    """
+    self._validate_generate_enqueue_ops_sparse_features_list(
+        sparse_features_list)
+    return [
+        self._generate_enqueue_op(
+            sparse_features, device_ordinal=i % self._num_cores_per_host)
+        for i, sparse_features in enumerate(sparse_features_list)
+    ]
+
+  def _validate_generate_enqueue_ops_sparse_features_list(
+      self, sparse_features_list):
+    """Validate `sparse_features_list`."""
+    if len(sparse_features_list) != self._num_cores:
+      raise ValueError('Length of `sparse_features_list` should match the '
+                       'number of cores; '
+                       '`len(sparse_features_list)` is {}, '
+                       'number of cores is {}.'.format(
+                           len(sparse_features_list), self._num_cores))
+
+    feature_set = set(self._feature_to_table_dict.keys())
+    contiguous_device = None
+    for i, sparse_features in enumerate(sparse_features_list):
+      used_feature_set = set(sparse_features.keys())
+
+      # Check features are valid.
+      missing_feature_set = feature_set - used_feature_set
+      if missing_feature_set:
+        raise ValueError('`sparse_features_list[{}]` misses a feature that is '
+                         'in `feature_to_config_dict`: {}.'.format(
+                             i, missing_feature_set))
+
+      extra_feature_set = used_feature_set - feature_set
+      if extra_feature_set:
+        raise ValueError('`sparse_features_list[{}]` has a feature that is not '
+                         'in `feature_to_config_dict`: {}.'.format(
+                             i, extra_feature_set))
+
+      device = None
+      device_feature = None
+      for feature, tensor in six.iteritems(sparse_features):
+        if not isinstance(tensor, sparse_tensor.SparseTensor):
+          raise ValueError('`sparse_features_list[{}]` has a feature that is '
+                           'not mapped to `SparseTensor`. '
+                           '`feature`: {}, type: {}'.format(
+                               i, feature, type(tensor)))
+
+        # Check all features are on the same device.
+        if device is None:
+          device = tensor.op.device
+          device_feature = feature
+        else:
+          if device != tensor.op.device:
+            raise ValueError('Devices are different between features in '
+                             '`sparse_features_list[{}]`; '
+                             'devices: {}, {}; features: {}, {}.'.format(
+                                 i, device, tensor.op.device, feature,
+                                 device_feature))
+
+      if i % self._num_cores_per_host:
+        if device != contiguous_device:
+          raise ValueError('We expect the `sparse_features` which are on the '
+                           'same host to be contiguous in '
+                           '`sparse_features_list`, '
+                           '`sparse_features_list[{}]` is on device {}, '
+                           'but is expected to be on device {}.'.format(
+                               i, device, contiguous_device))
+      else:
+        contiguous_device = device
+
+  def _generate_enqueue_op(self, sparse_features, device_ordinal):
+    with ops.colocate_with(list(sparse_features.values())[0]):
+      sample_idcs, embedding_idcs, aggregation_weights = (
+          self._format_for_tpu_embedding_sparse_batch(sparse_features))
+      return tpu_ops.enqueue_tpu_embedding_sparse_batch(
+          sample_idcs,
+          embedding_idcs,
+          aggregation_weights,
+          combiners=self._combiners,
+          device_ordinal=device_ordinal)
+
+  def _format_for_tpu_embedding_sparse_batch(self, sparse_features):
+    """Format sparse features for `enqueue_tpu_embedding_sparse_batch()`.
+
+    Args:
+      sparse_features: a `Dict` of `SparseTensor`s for embedding.
+
+    Returns:
+      Arguments for `enqueue_tpu_embedding_sparse_batch()`.
+    """
+
+    sample_idcs, embedding_idcs, aggregation_weights = list(), list(), list()
+    for table in self._table_to_features_dict:
+      sample_t, indices_t, weights_t = list(), list(), list()
+
+      features = self._table_to_features_dict[table]
+      for i, feature in enumerate(features):
+        tensor = sparse_features[feature]
+        sample_indices = tensor.indices[:, 0]
+        embedding_indices = tensor.values
+        weights = array_ops.ones_like(embedding_indices)
+        sample_t.append(i * self._batch_size_per_core + sample_indices)
+        indices_t.append(embedding_indices)
+        weights_t.append(weights)
+
+      sample_idcs.append(
+          math_ops.cast(array_ops.concat(sample_t, axis=0), dtype=dtypes.int32))
+      embedding_idcs.append(
+          math_ops.cast(
+              array_ops.concat(indices_t, axis=0), dtype=dtypes.int32))
+      aggregation_weights.append(
+          math_ops.cast(
+              array_ops.concat(weights_t, axis=0), dtype=dtypes.float32))
+
+    return sample_idcs, embedding_idcs, aggregation_weights
+
+  def get_activations(self):
+    """Get activations for features.
+
+    This should be called within `computation` that is passed to
+      `tpu.replicate` and friends.
+
+    Returns:
+      A dictionary mapping from `String` of feature name to `Tensor`
+        of activation.
+    """
+    recv_activations = tpu_ops.recv_tpu_embedding_activations(
+        num_outputs=len(self._table_to_config_dict),
+        config=self._config_proto.SerializeToString())
+
+    activations = collections.OrderedDict()
+    for table_id, table in enumerate(self._table_to_features_dict):
+      features = self._table_to_features_dict[table]
+      for lookup_id, feature in enumerate(features):
+        start_row = lookup_id * self._batch_size_per_core
+        end_row = start_row + self._batch_size_per_core
+        activations[feature] = recv_activations[table_id][start_row:end_row, :]
+    return activations
+
+  def generate_send_gradients_op(self, feature_to_gradient_dict):
+    """Send gradient to TPU embedding.
+
+    Args:
+      feature_to_gradient_dict: dict mapping feature names to gradient wrt
+        activations.
+
+    Returns:
+      SendTPUEmbeddingGradients Op.
+
+    Raises:
+      RuntimeError: If `mode` is not `TRAINING`.
+    """
+    if self._mode != TRAINING:
+      raise RuntimeError('Only in training mode gradients need to '
+                         'be sent to TPU embedding; got mode {}.'
+                         .format(self._mode))
+    gradients = []
+    for table in self._table_to_features_dict:
+      features = self._table_to_features_dict[table]
+      table_gradients = [
+          feature_to_gradient_dict[feature] for feature in features
+      ]
+      concat_table_grads = array_ops.concat(table_gradients, axis=0)
+      gradients.append(concat_table_grads)
+    return tpu_ops.send_tpu_embedding_gradients(
+        inputs=gradients, config=self.config_proto.SerializeToString())
+
+
+def _validate_table_to_config_dict(table_to_config_dict):
+  """Validate `table_to_config_dict`."""
+  for k, v in six.iteritems(table_to_config_dict):
+    if not isinstance(v, TableConfig):
+      raise ValueError('Value of `table_to_config_dict` must be of type '
+                       '`TableConfig`, got {} for {}.'.format(type(v), k))
+
+
+def _validate_feature_to_table_dict(table_to_config_dict,
+                                    feature_to_table_dict):
+  """Validate `feature_to_table_dict`."""
+  used_table_set = set(feature_to_table_dict.values())
+  table_set = set(table_to_config_dict.keys())
+
+  unused_table_set = table_set - used_table_set
+  if unused_table_set:
+    raise ValueError('`table_to_config_dict` specifies table that is not '
+                     'used in `feature_to_table_dict`: {}.'
+                     .format(unused_table_set))
+
+  extra_table_set = used_table_set - table_set
+  if extra_table_set:
+    raise ValueError('`feature_to_table_dict` refers to a table that is not '
+                     'specified in `table_to_config_dict`: {}.'
+                     .format(extra_table_set))
+
+
+def _validate_batch_size(batch_size, num_cores):
+  if batch_size % num_cores:
+    raise ValueError('`batch_size` is not a multiple of number of '
+                     'cores. `batch_size`={}, `_num_cores`={}.'.format(
+                         batch_size, num_cores))
+
+
+def _validate_optimization_parameters(optimization_parameters):
+  if not isinstance(optimization_parameters, _OptimizationParameters):
+    raise ValueError('`optimization_parameters` must inherit from '
+                     '`_OptimizationPramaters`. '
+                     '`type(optimization_parameters)`={}'.format(
+                         type(optimization_parameters)))
+
+
+class _OptimizerHandler(object):
+  """Interface class for handling optimizer specific logic."""
+
+  def __init__(self, optimization_parameters):
+    self._optimization_parameters = optimization_parameters
+
+  def set_optimization_parameters(self, table_descriptor):
+    raise NotImplementedError()
+
+  def get_default_slot_variable_names(self, table):
+    raise NotImplementedError()
+
+  def create_variables_and_ops(self, table, slot_variable_names, num_hosts,
+                               table_config, table_variables):
+    raise NotImplementedError()
+
+
+class _AdagradHandler(_OptimizerHandler):
+  """Handles Adagrad specific logic."""
+
+  def __init__(self, optimization_parameters):
+    super(_AdagradHandler, self).__init__(optimization_parameters)
+    self._table_to_accumulator_variables_dict = {}
+
+  def set_optimization_parameters(self, table_descriptor):
+    table_descriptor.optimization_parameters.adagrad.SetInParent()
+
+  def get_default_slot_variable_names(self, table):
+    return AdagradSlotVariableName('{}/{}'.format(table, 'Adagrad'))
+
+  def create_variables_and_ops(self, table, slot_variable_names, num_hosts,
+                               table_config, table_variables):
+    accumulator_initializer = init_ops.constant_initializer(
+        self._optimization_parameters.initial_accumulator)
+    accumulator_variables = _create_partitioned_variables(
+        name=slot_variable_names.accumulator,
+        num_hosts=num_hosts,
+        vocabulary_size=table_config.vocabulary_size,
+        embedding_dimension=table_config.dimension,
+        collections=[ops.GraphKeys.GLOBAL_VARIABLES],
+        initializer=accumulator_initializer)
+    slot_variables = AdagradSlotVariable(accumulator_variables)
+
+    def load_ops_fn():
+      """Returns the retrieve ops for AdaGrad embedding tables.
+
+      Returns:
+        A list of ops to load embedding and slot variables from CPU to TPU.
+      """
+      load_op_list = []
+      for host_id, table_variable, accumulator_variable in (zip(
+          range(num_hosts), table_variables, accumulator_variables)):
+        with ops.colocate_with(table_variable):
+          load_parameters_op = (
+              tpu_ops.load_tpu_embedding_adagrad_parameters(
+                  parameters=table_variable,
+                  accumulators=accumulator_variable,
+                  table_name=table,
+                  num_shards=num_hosts,
+                  shard_id=host_id))
+        load_op_list.append(load_parameters_op)
+      return load_op_list
+
+    def retrieve_ops_fn():
+      """Returns the retrieve ops for AdaGrad embedding tables.
+
+      Returns:
+        A list of ops to retrieve embedding and slot variables from TPU to CPU.
+      """
+      retrieve_op_list = []
+      for host_id, table_variable, accumulator_variable in (zip(
+          range(num_hosts), table_variables, accumulator_variables)):
+        with ops.colocate_with(table_variable):
+          retrieved_table, retrieved_accumulator = (
+              tpu_ops.retrieve_tpu_embedding_adagrad_parameters(
+                  table_name=table,
+                  num_shards=num_hosts,
+                  shard_id=host_id))
+          retrieve_parameters_op = control_flow_ops.group(
+              state_ops.assign(table_variable, retrieved_table),
+              state_ops.assign(accumulator_variable, retrieved_accumulator))
+        retrieve_op_list.append(retrieve_parameters_op)
+      return retrieve_op_list
+
+    return slot_variables, load_ops_fn, retrieve_ops_fn
+
+
+class _AdamHandler(_OptimizerHandler):
+  """Handles Adam specific logic."""
+
+  def __init__(self, optimization_parameters):
+    super(_AdamHandler, self).__init__(optimization_parameters)
+    self._table_to_m_variables_dict = {}
+    self._table_to_v_variables_dict = {}
+
+  def set_optimization_parameters(self, table_descriptor):
+    table_descriptor.optimization_parameters.adam.beta1 = (
+        self._optimization_parameters.beta1)
+    table_descriptor.optimization_parameters.adam.beta2 = (
+        self._optimization_parameters.beta2)
+    table_descriptor.optimization_parameters.adam.epsilon = (
+        self._optimization_parameters.epsilon)
+    table_descriptor.optimization_parameters.adam.use_non_lazy_adam = (
+        not self._optimization_parameters.lazy_adam)
+    table_descriptor.optimization_parameters.adam.use_sum_inside_sqrt = (
+        self._optimization_parameters.sum_inside_sqrt)
+
+  def get_default_slot_variable_names(self, table):
+    return AdamSlotVariableNames('{}/{}/m'.format(table, 'Adam'),
+                                 '{}/{}/v'.format(table, 'Adam'))
+
+  def create_variables_and_ops(self, table, slot_variable_names, num_hosts,
+                               table_config, table_variables):
+    m_initializer = init_ops.zeros_initializer()
+    m_variables = _create_partitioned_variables(
+        name=slot_variable_names.m,
+        num_hosts=num_hosts,
+        vocabulary_size=table_config.vocabulary_size,
+        embedding_dimension=table_config.dimension,
+        collections=[ops.GraphKeys.GLOBAL_VARIABLES],
+        initializer=m_initializer)
+    v_initializer = init_ops.zeros_initializer()
+    v_variables = _create_partitioned_variables(
+        name=slot_variable_names.v,
+        num_hosts=num_hosts,
+        vocabulary_size=table_config.vocabulary_size,
+        embedding_dimension=table_config.dimension,
+        collections=[ops.GraphKeys.GLOBAL_VARIABLES],
+        initializer=v_initializer)
+    slot_variables = AdamSlotVariables(m_variables, v_variables)
+
+    def load_ops_fn():
+      """Returns the retrieve ops for AdaGrad embedding tables.
+
+      Returns:
+        A list of ops to load embedding and slot variables from CPU to TPU.
+      """
+      load_op_list = []
+      for host_id, table_variable, m_variable, v_variable in (zip(
+          range(num_hosts), table_variables,
+          m_variables, v_variables)):
+        with ops.colocate_with(table_variable):
+          load_parameters_op = (
+              tpu_ops.load_tpu_embedding_adam_parameters(
+                  parameters=table_variable,
+                  momenta=m_variable,
+                  velocities=v_variable,
+                  table_name=table,
+                  num_shards=num_hosts,
+                  shard_id=host_id))
+
+      load_op_list.append(load_parameters_op)
+      return load_op_list
+
+    def retrieve_ops_fn():
+      """Returns the retrieve ops for Adam embedding tables.
+
+      Returns:
+        A list of ops to retrieve embedding and slot variables from TPU to CPU.
+      """
+
+      retrieve_op_list = []
+      for host_id, table_variable, m_variable, v_variable in (zip(
+          range(num_hosts), table_variables,
+          m_variables, v_variables)):
+        with ops.colocate_with(table_variable):
+          retrieved_table, retrieved_m, retrieved_v = (
+              tpu_ops.retrieve_tpu_embedding_adam_parameters(
+                  table_name=table,
+                  num_shards=num_hosts,
+                  shard_id=host_id))
+          retrieve_parameters_op = control_flow_ops.group(
+              state_ops.assign(table_variable, retrieved_table),
+              state_ops.assign(m_variable, retrieved_m),
+              state_ops.assign(v_variable, retrieved_v))
+
+        retrieve_op_list.append(retrieve_parameters_op)
+      return retrieve_op_list
+
+    return slot_variables, load_ops_fn, retrieve_ops_fn
+
+
+class _StochasticGradientDescentHandler(_OptimizerHandler):
+  """Handles stochastic gradient descent specific logic."""
+
+  def set_optimization_parameters(self, table_descriptor):
+    (table_descriptor.optimization_parameters.stochastic_gradient_descent
+     .SetInParent())
+
+  def get_default_slot_variable_names(self, table):
+    return None
+
+  def create_variables_and_ops(self, table, slot_variable_names, num_hosts,
+                               table_config, table_variables):
+    del table_config
+
+    def load_ops_fn():
+      """Returns the retrieve ops for AdaGrad embedding tables.
+
+      Returns:
+        A list of ops to load embedding and slot variables from CPU to TPU.
+      """
+      load_op_list = []
+      for host_id, table_variable in (zip(
+          range(num_hosts), table_variables)):
+        with ops.colocate_with(table_variable):
+          load_parameters_op = (
+              tpu_ops
+              .load_tpu_embedding_stochastic_gradient_descent_parameters(
+                  parameters=table_variable,
+                  table_name=table,
+                  num_shards=num_hosts,
+                  shard_id=host_id))
+
+        load_op_list.append(load_parameters_op)
+      return load_op_list
+
+    def retrieve_ops_fn():
+      """Returns the retrieve ops for SGD embedding tables.
+
+      Returns:
+        A list of ops to retrieve embedding and slot variables from TPU to CPU.
+      """
+
+      retrieve_op_list = []
+      for host_id, table_variable in (zip(
+          range(num_hosts), table_variables)):
+        with ops.colocate_with(table_variable):
+          retrieved_table = (
+              tpu_ops
+              .retrieve_tpu_embedding_stochastic_gradient_descent_parameters(
+                  table_name=table,
+                  num_shards=num_hosts,
+                  shard_id=host_id))
+          retrieve_parameters_op = control_flow_ops.group(
+              state_ops.assign(table_variable, retrieved_table))
+
+        retrieve_op_list.append(retrieve_parameters_op)
+      return retrieve_op_list
+
+    return None, load_ops_fn, retrieve_ops_fn
+
+
+def _get_optimization_handler(optimization_parameters):
+  if isinstance(optimization_parameters, AdagradParameters):
+    return _AdagradHandler(optimization_parameters)
+  elif isinstance(optimization_parameters, AdamParameters):
+    return _AdamHandler(optimization_parameters)
+  elif isinstance(optimization_parameters, StochasticGradientDescentParameters):
+    return _StochasticGradientDescentHandler(optimization_parameters)
+  else:
+    return NotImplementedError()
+
+
+def _create_ordered_dict(d):
+  """Create an OrderedDict from Dict."""
+  return collections.OrderedDict((k, d[k]) for k in sorted(d))
+
+
+def _create_combiners(table_to_config_dict):
+  return [table_to_config_dict[t].combiner for t in table_to_config_dict]
+
+
+def _create_table_to_features_dict(feature_to_table_dict):
+  """Create mapping from table to a list of its features."""
+  table_to_features_dict_tmp = {}
+  for feature, table in six.iteritems(feature_to_table_dict):
+    if table in table_to_features_dict_tmp:
+      table_to_features_dict_tmp[table].append(feature)
+    else:
+      table_to_features_dict_tmp[table] = [feature]
+
+  table_to_features_dict = collections.OrderedDict()
+  for table in sorted(table_to_features_dict_tmp):
+    table_to_features_dict[table] = sorted(table_to_features_dict_tmp[table])
+  return table_to_features_dict
+
+
+def _create_device_fn(hosts):
+  """Create device_fn() to use with _create_partitioned_variables()."""
+
+  def device_fn(op):
+    """Returns the `device` for `op`."""
+    part_match = re.match(r'.*/part_(\d+)(/|$)', op.name)
+
+    if part_match:
+      idx = int(part_match.group(1))
+    else:
+      raise RuntimeError('Internal Error: '
+                         'Expected %s to contain /part_*.' % op.name)
+
+    device = hosts[idx]
+    return device
+
+  return device_fn
+
+
+def _create_partitioned_variables(name,
+                                  num_hosts,
+                                  vocabulary_size,
+                                  embedding_dimension,
+                                  initializer,
+                                  collections=None):  # pylint: disable=redefined-outer-name
+  """Creates ParitionedVariables based on `num_hosts` for `table`."""
+  # TODO(shizhiw): automatically place embedding lookup elsewhere?
+  if vocabulary_size < num_hosts:
+    raise ValueError('`vocabulary_size`({}) is smaller than `num_hosts`({}). '
+                     'As TPU embedding is not optimized for small tables, '
+                     'please consider other ways for this embedding lookup.')
+
+  return list(variable_scope.get_variable(
+      name,
+      shape=(vocabulary_size, embedding_dimension),
+      partitioner=partitioned_variables.fixed_size_partitioner(num_hosts),
+      dtype=dtypes.float32,
+      initializer=initializer,
+      collections=collections,
+      trainable=False))
diff --git a/tensorflow/python/tpu/tpu_embedding_gradient.py b/tensorflow/python/tpu/tpu_embedding_gradient.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7de661cc35ff5439f9ce5a88fc5642cdeb07daf
--- /dev/null
+++ b/tensorflow/python/tpu/tpu_embedding_gradient.py
@@ -0,0 +1,153 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===================================================================
+"""Optional helper for gradient handling."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.tpu.ops import tpu_ops
+
+
+def get_gradients_through_compute_gradients(optimizer, loss, activations):
+  """Compute gradients to send to TPU embedding.
+
+  Args:
+    optimizer: a subclass of optimizer.Optimizer, usually CrossShardOptimizer.
+      Used to call compute_gradients().
+    loss: a Tensor to call optimizer.compute_gradients() on.
+    activations: an OrderedDict mapping feature_name to Tensors of activations.
+
+  Returns:
+    An OrderedDict mapping from feature name Strings to Tensors of gradients of
+      the loss wrt the activations of the features.
+  """
+  activation_list = activations.values()
+  grads_and_vars = optimizer.compute_gradients(loss, activation_list)
+  grads = [grad for grad, _ in grads_and_vars]
+  feature_to_gradient_dict = collections.OrderedDict(
+      zip(activations.keys(), grads))
+  return feature_to_gradient_dict
+
+
+def create_dummy_table_variables(tpu_embedding):
+  """Create dummy embedding table variables.
+
+  The sole purpose of these dummy variables are to trigger gradient
+  calcuation wrt them so that the gradients wrt activation can be captured
+  and later sent to TPU embedding.
+
+  Args:
+    tpu_embedding: TPUEmbedding, dummy table variables will be created for use
+      with tpu_embedding.
+
+  Returns:
+    A tuple of dummy variables and their initializer.
+
+  Raises:
+    RuntimeError: if collection to store gradients already exists and is not
+    empty.
+  """
+  dummy_table_variables = collections.OrderedDict()
+  for table_id, table in enumerate(tpu_embedding.table_to_features_dict):
+    dummy_table_variables[table] = (
+        # Explicitly specifying collections prevents this variable from
+        # being added to the GLOBAL_VARIABLES collection, so that Saver()
+        # ignores it.
+        # But Tensorflow optimizer creates slot variable for these dummy
+        # variable, e.g. tpu_embedding_dummy_table_variable_mlp_user/Adam{_1},
+        # which will be in GLOBAL_VARIABLES collection,
+        variable_scope.get_variable(
+            'tpu_embedding_dummy_table_variable_{}'.format(table),
+            dtype=dtypes.float32,
+            shape=[1],
+            use_resource=True,
+            trainable=True,
+            collections=['tpu_embedding_dummy_table_variables']))
+
+    g = ops.get_default_graph()
+    table_gradients = g.get_collection_ref(
+        'tpu_embedding_gradients_table_{}'.format(table_id))
+    if table_gradients:
+      raise RuntimeError(
+          'tpu_embedding_gradients_table_{} is not empty.'.format(table_id))
+    table_gradients.extend(
+        [None] * len(tpu_embedding.table_to_features_dict[table]))
+
+  return (dummy_table_variables,
+          variables.variables_initializer(
+              dummy_table_variables.values(),
+              name='tpu_embedding_dummy_table_variables_init'))
+
+
+def hook_dummy_table_variables_to_activations(tpu_embedding, activations,
+                                              dummy_table_variables):
+  """Have activations depend on dummy table variables for gradient intercept.
+
+  Args:
+    tpu_embedding: TPUEmbedding, activations and dummy_table_variables are from
+      tpu_embedding.
+    activations: An OrderedDict of feature name String to activation tensors.
+    dummy_table_variables: An OrderedDict of table name String to dummy table
+      variables.
+
+  Returns:
+    An OrderedDict of feature name String to activation tensors, which can be
+      used just as the activations input.
+  """
+  new_activations = collections.OrderedDict()
+  for feature in activations:
+    table = tpu_embedding.feature_to_table_dict[feature]
+    new_activations[feature] = tpu_ops.tpu_embedding_activations(
+        dummy_table_variables[table],
+        activations[feature],
+        table_id=tpu_embedding.table_to_config_dict.keys().index(table),
+        lookup_id=tpu_embedding.table_to_features_dict[table].index(feature))
+  return new_activations
+
+
+def get_gradients_through_dummy_table_variables(tpu_embedding):
+  """Get gradients wrt the activations of each feature.
+
+  Args:
+    tpu_embedding: TPUEmbedding, create dummy table variable to be used with
+      tpu_embedding.
+
+  Returns:
+    An OrderedDict mapping feature name to gradient.
+
+  Raises:
+    ValueError: if some gradients are not defined.
+  """
+  g = ops.get_default_graph()
+  feature_to_gradient_dict = collections.OrderedDict()
+  for table_id, table in enumerate(tpu_embedding.table_to_config_dict):
+    table_gradients = g.get_collection(
+        'tpu_embedding_gradients_table_{}'.format(table_id))
+    if any(gradient is None for gradient in table_gradients):
+      raise ValueError(
+          'Table {} with id {} has undefined gradients: this is probably '
+          'because the model asked TPUEmbedding to compute activations that '
+          'were not used.'.format(table, table_id))
+    for feature, gradient in zip(tpu_embedding.table_to_features_dict[table],
+                                 table_gradients):
+      feature_to_gradient_dict[feature] = gradient
+  return feature_to_gradient_dict
diff --git a/tensorflow/python/tpu/tpu_estimator.py b/tensorflow/python/tpu/tpu_estimator.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea6b297ac3b993720e63a532649f5e174e91c9cf
--- /dev/null
+++ b/tensorflow/python/tpu/tpu_estimator.py
@@ -0,0 +1,3760 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===================================================================
+"""TPUEstimator class."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import copy
+import os
+import signal
+import sys
+import threading
+import time
+
+import numpy as np
+import six
+from six.moves import queue as Queue  # pylint: disable=redefined-builtin
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.core.framework import variable_pb2
+from tensorflow.core.framework.summary_pb2 import Summary
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf.tpu import compilation_result_pb2 as tpu_compilation_result
+from tensorflow.python.client import session as tf_session
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import nest as data_nest
+from tensorflow.python.estimator import estimator as estimator_lib
+from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.estimator.export import export_output as export_output_lib
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import function
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import summary_ops_v2 as contrib_summary
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.saved_model import tag_constants
+from tensorflow.python.summary import summary
+from tensorflow.python.tpu import _tpu_estimator_embedding
+from tensorflow.python.tpu import error_handling
+from tensorflow.python.tpu import functional as tpu_functional
+from tensorflow.python.tpu import session_support
+from tensorflow.python.tpu import tensor_tracer
+from tensorflow.python.tpu import tpu
+from tensorflow.python.tpu import tpu_config
+from tensorflow.python.tpu import tpu_context
+from tensorflow.python.tpu import tpu_embedding_gradient
+from tensorflow.python.tpu import tpu_feed
+from tensorflow.python.tpu import tpu_function
+from tensorflow.python.tpu import training_loop
+from tensorflow.python.tpu import util as util_lib
+from tensorflow.python.tpu._tpu_estimator_embedding import AdamParameters  # pylint: disable=unused-import
+from tensorflow.python.tpu._tpu_estimator_embedding import EmbeddingConfigSpec  # pylint: disable=unused-import
+from tensorflow.python.tpu.ops import tpu_ops
+from tensorflow.python.training import basic_session_run_hooks
+from tensorflow.python.training import evaluation
+from tensorflow.python.training import session_run_hook
+from tensorflow.python.training import training
+from tensorflow.python.training import training_util
+from tensorflow.python.util import function_utils
+from tensorflow.python.util import nest
+from tensorflow.python.util import tf_inspect
+
+_INITIAL_LOSS = 1e7
+_ZERO_LOSS = 0.
+_TPU_ESTIMATOR = 'tpu_estimator'
+_ITERATIONS_PER_LOOP_VAR = 'iterations_per_loop'
+_BATCH_SIZE_KEY = 'batch_size'
+_CTX_KEY = 'context'
+_USE_TPU_KEY = 'use_tpu'
+_CROSS_REPLICA_SUM_OP = 'CrossReplicaSum'
+_ONE_GIGABYTE = 1024 * 1024 * 1024
+_TPU_ENQUEUE_OPS = '_tpu_enqueue_ops'
+_TPU_TRAIN_OP = '_tpu_train_op'
+_REWRITE_FOR_INFERENCE_MODE = '_rewrite_for_inference'
+_KEY_WHEN_PREDICTIONS_IS_A_TENSOR = '_key_when_predictions_is_a_tensor'
+
+# Ideally _USE_TPU_KEY should be reserved as well. However there are already
+# models that make use of this key, thus it can not be reserved now to prevent
+# breakage. In the long run, we would like to mitigate this by migrating models
+# off of using _USE_TPU_KEY.
+_RESERVED_PARAMS_KEYS = [_BATCH_SIZE_KEY, _CTX_KEY]
+
+# TODO(b/65703635): Flip the value and remove all dead code. Currently, this is
+# only used for per-core based deployments. For per-host based pipelines, if a
+# user returns a Dataset instance it will be automatically wrapped in a
+# tf.while_loop (This can be disabled by returning features and labels
+# explicitly).
+_WRAP_INPUT_FN_INTO_WHILE_LOOP = False
+
+ops.register_proto_function(
+    '{}_{}'.format(_TPU_ESTIMATOR, _ITERATIONS_PER_LOOP_VAR),
+    proto_type=variable_pb2.VariableDef,
+    to_proto=resource_variable_ops._to_proto_fn,  # pylint: disable=protected-access
+    from_proto=resource_variable_ops._from_proto_fn)  # pylint: disable=protected-access
+
+
+def _is_iterable(obj):
+  """A Python 2 and 3 compatible util to check whether `obj` is iterable."""
+  try:
+    iter(obj)
+    return True
+  except TypeError:
+    return False
+
+
+class CatchInvalidHostcallFunctions(control_flow_ops.XLAControlFlowContext):
+
+  def AddOp(self, op):
+    if op.type in [
+        'AudioSummary', 'AudioSummaryV2', 'HistogramSummary', 'ImageSummary',
+        'MergeSummary', 'ScalarSummary', 'TensorSummary', 'TensorSummaryV2'
+    ]:
+      raise ValueError('Use tf.contrib.summary inside of host_calls.')
+
+
+def _create_global_step(graph):
+  graph = graph or ops.get_default_graph()
+  if training.get_global_step(graph) is not None:
+    raise ValueError('"global_step" already exists.')
+  # Create in proper graph and base name_scope.
+  with graph.as_default() as g, g.name_scope(None):
+    return variable_scope.get_variable(
+        ops.GraphKeys.GLOBAL_STEP,
+        shape=[],
+        dtype=dtypes.int64,
+        initializer=init_ops.zeros_initializer(),
+        trainable=False,
+        use_resource=True,
+        collections=[ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.GLOBAL_STEP])
+
+
+def _create_or_get_iterations_per_loop():
+  """Creates or gets the iterations_per_loop variable.
+
+  In TPUEstimator, the user provided computation, the model_fn, is wrapped
+  inside a tf.while_loop for peak performance. The iterations of the loop are
+  specified by this variable, which adjusts its value on the CPU after each TPU
+  program execution and before the next TPU execution.
+
+  The purpose of using a variable, rather then a constant, is to allow
+  TPUEstimator adapt the TPU training iterations according to the final steps
+  specified by users. For example, if the user sets the iterations_per_loop as 4
+  in TPUConfig and steps as 10 in TPUEstimator.train(), the iterations_per_loop
+  variable will have the following value before each TPU training.
+
+      - 1-th TPU execution: iterations_per_loop = 4
+      - 2-th TPU execution: iterations_per_loop = 4
+      - 3-th TPU execution: iterations_per_loop = 2
+
+  As model_fn increases the global step once per train_op invocation, the global
+  step is 10 after all TPU executions, matching the steps=10 inputs passed in by
+  users.
+
+  Returns:
+    A TF non-trainable resource variable.
+
+  Raises:
+    RuntimeError: If multi iterations_per_loop variables were found.
+  """
+  graph = ops.get_default_graph()
+  collection_name = '{}_{}'.format(_TPU_ESTIMATOR, _ITERATIONS_PER_LOOP_VAR)
+  iter_vars = graph.get_collection(collection_name)
+  if len(iter_vars) == 1:
+    return iter_vars[0]
+  elif len(iter_vars) > 1:
+    raise RuntimeError('Multiple iterations_per_loop_var in collection.')
+
+  with ops.colocate_with(training_util.get_global_step()):
+    with variable_scope.variable_scope(
+        _TPU_ESTIMATOR, reuse=variable_scope.AUTO_REUSE):
+      return variable_scope.get_variable(
+          _ITERATIONS_PER_LOOP_VAR,
+          initializer=init_ops.zeros_initializer(),
+          shape=[],
+          dtype=dtypes.int32,
+          trainable=False,
+          collections=[collection_name, ops.GraphKeys.LOCAL_VARIABLES],
+          use_resource=True)
+
+
+def _sync_variables_ops(ctx):
+  """Create varriables synchronization ops.
+
+  Gets the variables back from TPU nodes. This means the variables updated
+  by TPU will now be *synced* to host memory.
+  In BROADCAST mode, we skip this sync since the variables are ususally too
+  big to transmit via RPC.
+
+  Args:
+    ctx: A `_InternalTPUContext` instance with mode.
+
+  Returns:
+    A list of sync ops.
+  """
+
+  if not ctx.is_input_broadcast_with_iterators():
+    return [
+        array_ops.check_numerics(v.read_value(),
+                                 'Gradient for %s is NaN' % v.name).op
+        for v in variables.trainable_variables()
+    ]
+  else:
+    return [control_flow_ops.no_op()]
+
+
+def _increase_eval_step_op(iterations_per_loop):
+  """Returns an op to increase the eval step for TPU evaluation.
+
+  Args:
+    iterations_per_loop: Tensor. The number of eval steps running in TPU system
+      before returning to CPU host for each `Session.run`.
+
+  Returns:
+    An operation
+  """
+  eval_step = evaluation._get_or_create_eval_step()  # pylint: disable=protected-access
+  # Estimator evaluate increases 1 by default. So, we increase the difference.
+  return state_ops.assign_add(
+      eval_step,
+      math_ops.cast(iterations_per_loop - 1, dtype=eval_step.dtype),
+      use_locking=True)
+
+
+def _extract_key_names(tensor_or_dict):
+  if isinstance(tensor_or_dict, dict):
+    return sorted(tensor_or_dict.keys())
+  return []
+
+
+class _SIGNAL(object):
+  """Signal used to control the thread of infeed/outfeed.
+
+  All preserved signals must be negative numbers. Positive numbers are used to
+  indicate the number of iterations for next training/evaluation loop.
+  """
+  NEXT_BATCH = -1
+  STOP = -2
+
+
+class TPUEstimatorSpec(model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
+  """Ops and objects returned from a `model_fn` and passed to `TPUEstimator`.
+
+  See `EstimatorSpec` for `mode`, `predictions`, `loss`, `train_op`, and
+  `export_outputs`.
+
+  For evaluation, `eval_metrics `is a tuple of `metric_fn` and `tensors`, where
+  `metric_fn` runs on CPU to generate metrics and `tensors` represents the
+  `Tensor`s transferred from TPU system to CPU host and passed to `metric_fn`.
+  To be precise, TPU evaluation expects a slightly different signature from the
+  `tf.estimator.Estimator`. While `EstimatorSpec.eval_metric_ops` expects a
+  dict, `TPUEstimatorSpec.eval_metrics` is a tuple of `metric_fn` and `tensors`.
+  The `tensors` could be a list of `Tensor`s or dict of names to `Tensor`s. The
+  `tensors` usually specify the model logits, which are transferred back from
+  TPU system to CPU host. All tensors must have be batch-major, i.e., the batch
+  size is the first dimension. Once all tensors are available at CPU host from
+  all shards, they are concatenated (on CPU) and passed as positional arguments
+  to the `metric_fn` if `tensors` is list or keyword arguments if `tensors` is
+  a dict. `metric_fn` takes the `tensors` and returns a dict from metric string
+  name to the result of calling a metric function, namely a `(metric_tensor,
+  update_op)` tuple. See `TPUEstimator` for MNIST example how to specify the
+  `eval_metrics`.
+
+  `scaffold_fn` is a function running on CPU to generate the `Scaffold`. This
+  function should not capture any Tensors in `model_fn`.
+
+  `host_call` is a tuple of a `function` and a list or dictionary of `tensors`
+  to pass to that function and returns a list of Tensors. `host_call` currently
+  works for train() and evaluate(). The Tensors returned by the function is
+  executed on the CPU on every step, so there is communication overhead when
+  sending tensors from TPU to CPU. To reduce the overhead, try reducing the
+  size of the tensors. The `tensors` are concatenated along their major (batch)
+  dimension, and so must be >= rank 1. The `host_call` is useful for writing
+  summaries with `tf.contrib.summary.create_file_writer`.
+  """
+
+  def __new__(cls,
+              mode,
+              predictions=None,
+              loss=None,
+              train_op=None,
+              eval_metrics=None,
+              export_outputs=None,
+              scaffold_fn=None,
+              host_call=None,
+              training_hooks=None,
+              evaluation_hooks=None,
+              prediction_hooks=None):
+    """Creates a validated `TPUEstimatorSpec` instance."""
+    host_calls = {}
+    if eval_metrics is not None:
+      host_calls['eval_metrics'] = eval_metrics
+    if host_call is not None:
+      host_calls['host_call'] = host_call
+    _OutfeedHostCall.validate(host_calls)
+
+    training_hooks = tuple(training_hooks or [])
+    evaluation_hooks = tuple(evaluation_hooks or [])
+    prediction_hooks = tuple(prediction_hooks or [])
+
+    for hook in training_hooks + evaluation_hooks + prediction_hooks:
+      if not isinstance(hook, session_run_hook.SessionRunHook):
+        raise TypeError('All hooks must be SessionRunHook instances, given: {}'
+                        .format(hook))
+
+    return super(TPUEstimatorSpec, cls).__new__(
+        cls,
+        mode=mode,
+        predictions=predictions,
+        loss=loss,
+        train_op=train_op,
+        eval_metrics=eval_metrics,
+        export_outputs=export_outputs,
+        scaffold_fn=scaffold_fn,
+        host_call=host_call,
+        training_hooks=training_hooks,
+        evaluation_hooks=evaluation_hooks,
+        prediction_hooks=prediction_hooks)
+
+  def as_estimator_spec(self):
+    """Creates an equivalent `EstimatorSpec` used by CPU train/eval."""
+    host_calls = {}
+    if self.eval_metrics is not None:
+      host_calls['eval_metrics'] = self.eval_metrics
+    if self.host_call is not None:
+      host_calls['host_call'] = self.host_call
+    host_call_ret = _OutfeedHostCall.create_cpu_hostcall(host_calls)
+    eval_metric_ops = None
+    if self.eval_metrics is not None:
+      eval_metric_ops = host_call_ret['eval_metrics']
+    hooks = None
+    if self.host_call is not None:
+      hooks = [_OutfeedHostCallHook(host_call_ret['host_call'])]
+    loss = self.loss
+    if tensor_tracer.TensorTracer.is_enabled() \
+       and self.train_op is not None:
+      tt = tensor_tracer.TensorTracer()
+      loss = tt.trace_cpu(ops.get_default_graph(), loss, self.train_op)
+
+    hooks = tuple(hooks or [])
+    scaffold = self.scaffold_fn() if self.scaffold_fn else None
+    return model_fn_lib.EstimatorSpec(
+        mode=self.mode,
+        predictions=self.predictions,
+        loss=loss,
+        train_op=self.train_op,
+        eval_metric_ops=eval_metric_ops,
+        export_outputs=self.export_outputs,
+        scaffold=scaffold,
+        training_hooks=self.training_hooks + hooks,
+        evaluation_hooks=self.evaluation_hooks + hooks,
+        prediction_hooks=self.prediction_hooks + hooks)
+
+
+class _OpQueueContext(object):
+  """Manages work queue and thread for a infeed/outfeed thread."""
+
+  def __init__(self, name, target, args):
+    self._name = name
+    self._queue = Queue.Queue()
+    args = (self,) + args
+    self._thread = threading.Thread(name=name, target=target, args=args)
+    self._thread.daemon = True
+    self._thread.start()
+
+  def stop(self):
+    self._queue.put(_SIGNAL.STOP)
+
+  def send_next_batch_signal(self, iterations):
+    self._queue.put(iterations)
+
+  def read_iteration_counts(self):
+    while True:
+      iterations = self._queue.get(block=True)
+      logging.debug('%s read iterations %s', self._name, iterations)
+      if iterations == _SIGNAL.STOP:
+        logging.info('%s received shutdown signal, stopping.', self._name)
+        return
+      yield iterations
+
+  def join(self):
+    logging.info('Shutting down %s thread.', self._name)
+    self.stop()
+    self._thread.join()
+
+
+class _OpSignalOnceQueueContext(_OpQueueContext):
+  """Manages work queue and thread for a infeed/outfeed thread.
+
+  This subclass only signals once.
+  """
+
+  def __init__(self, name, target, args):
+    super(_OpSignalOnceQueueContext, self).__init__(name, target, args)
+    self._has_signaled = False
+
+  def send_next_batch_signal(self, iterations):
+    if not self._has_signaled:
+      self._queue.put(iterations)
+      self._has_signaled = True
+
+
+class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
+  """A Session hook setting up the TPU initialization, infeed, and outfeed.
+
+  This hook does two major things:
+  1. initialize and shutdown TPU system.
+  2. launch and join the threads for infeed enqueue and (optional) outfeed
+     dequeue.
+  """
+
+  def __init__(self,
+               ctx,
+               enqueue_ops,
+               dequeue_ops,
+               tpu_compile_op,
+               run_infeed_loop_on_coordinator=True,
+               rendezvous=None,
+               master=None,
+               session_config=None,
+               tpu_init_ops=None):
+    self._master_job = ctx.master_job
+    self._enqueue_ops = enqueue_ops
+    self._dequeue_ops = dequeue_ops
+    self._rendezvous = rendezvous
+    self._master = master
+    self._session_config = session_config
+    self._init_ops = list(tpu_init_ops or [])
+    if ctx.embedding_config is None:
+      self._embedding_layer_config = None
+    else:
+      self._embedding_layer_config = (
+          ctx.embedding_config.tpu_embedding.config_proto)
+    self._run_infeed_loop_on_coordinator = run_infeed_loop_on_coordinator
+    self._initial_infeed_sleep_secs = (
+        ctx.config.tpu_config.initial_infeed_sleep_secs)
+
+    self._feed_error = None
+    self._finished = False
+    # When using model parallelism, the TPU is pre-initialized at startup to
+    # fetch mesh information.  We skip re-initializing it here to avoid
+    # suspected issues due to the mesh layout changing on the second
+    # initialization.
+    self._should_initialize_tpu = not ctx.model_parallelism_enabled
+    self._tpu_compile_op = tpu_compile_op
+
+  def begin(self):
+    logging.info('TPU job name %s', self._master_job)
+    self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
+    if self._should_initialize_tpu:
+      self._finalize_ops = [tpu.shutdown_system(job=self._master_job)]
+    else:
+      self._finalize_ops = []
+
+    summary_writer_init_ops = contrib_summary.summary_writer_initializer_op()
+    self._init_ops.extend(summary_writer_init_ops)
+    # Get all the writer resources from the initializer, so we know what to
+    # flush.
+    for op in summary_writer_init_ops:
+      self._finalize_ops.append(contrib_summary.flush(writer=op.inputs[0]))
+
+  def _run_infeed(self, queue_ctx, session):
+    logging.info('Starting infeed thread controller.')
+    if self._initial_infeed_sleep_secs:
+      logging.info('Infeed thread sleeping for %d seconds.',
+                   self._initial_infeed_sleep_secs)
+      time.sleep(self._initial_infeed_sleep_secs)
+      logging.info('Infeed thread starting after sleep')
+
+    with self._rendezvous.catch_errors(source='infeed', session=session):
+      if self._run_infeed_loop_on_coordinator:
+        for count, steps in enumerate(queue_ctx.read_iteration_counts()):
+          for i in xrange(steps):
+            logging.debug('Infeed enqueue for iteration (%d, %d)', count, i)
+            session.run(self._enqueue_ops)
+      else:
+        for _ in queue_ctx.read_iteration_counts():
+          session.run(self._enqueue_ops)
+      logging.info('Infeed thread finished, shutting down.')
+
+  def _run_outfeed(self, queue_ctx, session):
+    logging.info('Starting outfeed thread controller.')
+    with self._rendezvous.catch_errors(source='outfeed', session=session):
+      for count, steps in enumerate(queue_ctx.read_iteration_counts()):
+        for i in xrange(steps):
+          logging.debug('Outfeed dequeue for iteration (%d, %d)', count, i)
+          session.run(self._dequeue_ops)
+      logging.info('Outfeed thread finished, shutting down.')
+
+  def _create_infeed_controller(self, name, target, args):
+    return _OpQueueContext(name=name, target=target, args=args)
+
+  def _assertCompilationSucceeded(self, result, coord):
+    proto = tpu_compilation_result.CompilationResultProto()
+    proto.ParseFromString(result)
+    if proto.status_error_message:
+      logging.error('Compilation failed: {}'.format(proto.status_error_message))
+      coord.request_stop()
+    else:
+      logging.info('Compilation succeeded')
+
+  def after_create_session(self, session, coord):
+    if self._should_initialize_tpu:
+      logging.info('Init TPU system')
+      start = time.time()
+      with ops.Graph().as_default():
+        with tf_session.Session(
+            self._master, config=self._session_config) as sess:
+          sess.run(
+              tpu.initialize_system(
+                  job=self._master_job,
+                  embedding_config=self._embedding_layer_config))
+      logging.info('Initialized TPU in %d seconds', time.time() - start)
+
+    session.run(self._init_ops,
+                options=config_pb2.RunOptions(timeout_in_ms=5 * 60 * 1000))
+
+    if os.environ.get('TPU_SPLIT_COMPILE_AND_EXECUTE', '') == '1':
+      logging.info('Compiling user program: this may take a while...')
+      self._assertCompilationSucceeded(session.run(self._tpu_compile_op), coord)
+
+    self._infeed_controller = self._create_infeed_controller(
+        name='InfeedController', target=self._run_infeed, args=(session,))
+
+    self._outfeed_controller = _OpQueueContext(
+        name='OutfeedController', target=self._run_outfeed, args=(session,))
+
+    # Enable the worker watchdog to terminate workers on coordinator exit.
+    watchdog_timeout = int(os.environ.get('TF_TPU_WATCHDOG_TIMEOUT', '0'))
+    if watchdog_timeout > 0:
+      session_support.start_worker_watchdog(session,
+                                            shutdown_timeout=watchdog_timeout)
+
+  def before_run(self, run_context):
+    self._feed_error = None
+
+    iterations = run_context.session.run(self._iterations_per_loop_var)
+
+    logging.info('Enqueue next (%d) batch(es) of data to infeed.', iterations)
+    self._infeed_controller.send_next_batch_signal(iterations)
+
+    logging.info('Dequeue next (%d) batch(es) of data from outfeed.',
+                 iterations)
+    self._outfeed_controller.send_next_batch_signal(iterations)
+
+  def end(self, session):
+    self._finished = True
+    logging.info('Stop infeed thread controller')
+    self._infeed_controller.join()
+    self._rendezvous.record_done('infeed')
+
+    logging.info('Stop output thread controller')
+    self._outfeed_controller.join()
+    self._rendezvous.record_done('outfeed')
+
+    logging.info('Shutdown TPU system.')
+    session.run(self._finalize_ops)
+
+
+class TPUInfeedOutfeedSessionHookForPrediction(TPUInfeedOutfeedSessionHook):
+
+  def __init__(self, ctx, enqueue_ops, dequeue_ops, tpu_compile_op,
+               rendezvous=None, master=None, session_config=None):
+    super(TPUInfeedOutfeedSessionHookForPrediction, self).__init__(
+        ctx,
+        enqueue_ops,
+        dequeue_ops,
+        tpu_compile_op=tpu_compile_op,
+        run_infeed_loop_on_coordinator=False,
+        rendezvous=rendezvous,
+        master=master,
+        session_config=session_config)
+
+  def _create_infeed_controller(self, name, target, args):
+    return _OpSignalOnceQueueContext(name=name, target=target, args=args)
+
+
+class _TPUStopAtStepHook(session_run_hook.SessionRunHook):
+  """Hook that requests stop at a specified step.
+
+  This hook is similar to the `session_run_hook._StopAfterNEvalsHook` with
+  following differences for TPU training:
+
+  1. This hook sets the variable for iterations_per_loop, which is used by
+     `TPUInfeedOutfeedSessionHook` to control the iterations for infeed/outfeed.
+     As the hook execution order is not guaranteed, the variable update is
+     handled in `after_create_session` and `after_run` as
+     `TPUInfeedOutfeedSessionHook` reads the variable value in `before_run`.
+
+  2. For each training loop (session.run), the global step could be increased
+     multiple times on TPU. The global step tensor value will be explicitly read
+     again in `after_run` to ensure the latest value is retrieved to avoid race
+     condition.
+  """
+
+  def __init__(self, iterations, num_steps=None, last_step=None):
+    """Initializes a `StopAtStepHook`.
+
+    Args:
+      iterations: The number of iterations to run optimizer per training loop.
+      num_steps: Number of steps to execute.
+      last_step: Step after which to stop.
+
+    Raises:
+      ValueError: If one of the arguments is invalid.
+    """
+    if num_steps is None and last_step is None:
+      raise ValueError('One of num_steps or last_step must be specified.')
+    if num_steps is not None and last_step is not None:
+      raise ValueError('Only one of num_steps or last_step can be specified.')
+    self._num_steps = num_steps
+    self._last_step = last_step
+    self._iterations = iterations
+
+  def _next_iterations(self, global_step, last_step):
+    gap = last_step - global_step
+    return min(gap, self._iterations)
+
+  def begin(self):
+    self._global_step_tensor = training_util.get_global_step()
+    if self._global_step_tensor is None:
+      raise RuntimeError('Global step should be created.')
+
+    self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
+
+  def after_create_session(self, session, coord):
+    global_step = session.run(self._global_step_tensor)
+    if self._last_step is None:
+      self._last_step = global_step + self._num_steps
+
+    iterations = self._next_iterations(global_step, self._last_step)
+
+    self._iterations_per_loop_var.load(iterations, session=session)
+
+  def after_run(self, run_context, run_values):
+    # Global step cannot be retrieved via SessionRunArgs and before_run due to
+    # race condition.
+    global_step = run_context.session.run(self._global_step_tensor)
+    if global_step >= self._last_step:
+      run_context.request_stop()
+    else:
+      iterations = self._next_iterations(global_step, self._last_step)
+      self._iterations_per_loop_var.load(
+          iterations, session=run_context.session)
+
+
+class _SetEvalIterationsHook(session_run_hook.SessionRunHook):
+  """Hook that requests stop at a specified step."""
+
+  def __init__(self, num_steps):
+    """Initializes a `_SetEvalIterationsHook`.
+
+    Args:
+      num_steps: Number of steps to execute.
+    """
+    self._num_steps = num_steps
+
+  def begin(self):
+    self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
+
+  def after_create_session(self, session, coord):
+    self._iterations_per_loop_var.load(self._num_steps, session=session)
+
+
+class _StoppingPredictHook(session_run_hook.SessionRunHook):
+  """Hook that requests stop according to the stopping signal in prediction."""
+
+  def __init__(self, scalar_stopping_signal):
+    self._scalar_stopping_signal = scalar_stopping_signal
+
+  def begin(self):
+    self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
+
+  def after_create_session(self, session, coord):
+    # This is not necessary as we do not run infeed enqueue and outfeed dequeue
+    # in side threads for prediction model. But it makes the
+    # TPUInfeedOutfeedSessionHook prints nice message.
+    self._iterations_per_loop_var.load(1, session=session)
+
+  def before_run(self, run_context):
+    return session_run_hook.SessionRunArgs(self._scalar_stopping_signal)
+
+  def after_run(self, run_context, run_values):
+    _ = run_context
+    scalar_stopping_signal = run_values.results
+    if _StopSignals.should_stop(scalar_stopping_signal):
+      # NOTE(xiejw): In prediction, stopping signals are inserted for each
+      # batch. And we append one more batch to signal the system it should stop.
+      # The data flow might look like
+      #
+      #  batch   0: images, labels, stop = 0  (user provided)
+      #  batch   1: images, labels, stop = 0  (user provided)
+      #  ...
+      #  batch  99: images, labels, stop = 0  (user provided)
+      #  batch 100: images, labels, stop = 1  (TPUEstimator appended)
+      #
+      # where the final batch (id = 100) is appended by TPUEstimator, so we
+      # should drop it before returning the predictions to user.
+      # To achieve that, we throw the OutOfRangeError in after_run. Once
+      # Monitored Session sees this error in SessionRunHook.after_run, the
+      # "current" prediction, i.e., batch with id=100, will be discarded
+      # immediately
+      raise errors.OutOfRangeError(None, None, 'Stopped by stopping signal.')
+
+
+def generate_per_core_enqueue_ops_fn_for_host(
+    ctx, input_fn, inputs_structure_recorder, host_device, host_id):
+  """Generates infeed enqueue ops for per-core input_fn on a single host."""
+  captured_infeed_queue = _CapturedObject()
+  tpu_ordinal_function_impl = ctx.tpu_ordinal_function(host_id)
+
+  def enqueue_ops_fn():
+    """A fn returns enqueue_ops."""
+    num_cores_per_host = ctx.num_of_cores_per_host
+    per_host_sharded_inputs = []
+    for core_ordinal in range(num_cores_per_host):
+      with ops.name_scope('ordinal_%d' % (core_ordinal)):
+        user_context = tpu_context.TPUContext(
+            internal_ctx=ctx,
+            input_device=host_device,
+            invocation_index=host_id * ctx.num_of_cores_per_host + core_ordinal)
+        inputs = _Inputs.from_input_fn(input_fn(user_context))
+        if inputs.is_dataset:
+          raise TypeError(
+              '`input_fn` returning `Dataset`  is not yet supported in '
+              'per-Core input pipeline deployment yet. Please set '
+              'TPUConfig.per_host_input_for_training to True or return '
+              '`features` and `labels` from `input_fn`')
+        features, labels = inputs.features_and_labels()
+
+        inputs_structure_recorder.validate_and_record_structure(
+            features, labels)
+        flattened_inputs = (
+            inputs_structure_recorder.flatten_features_and_labels(
+                features, labels))
+        per_host_sharded_inputs.append(flattened_inputs)
+
+    infeed_queue = tpu_feed.InfeedQueue(
+        number_of_tuple_elements=len(per_host_sharded_inputs[0]))
+    captured_infeed_queue.capture(infeed_queue)
+
+    per_host_enqueue_ops = infeed_queue.generate_enqueue_ops(
+        per_host_sharded_inputs, tpu_ordinal_function=tpu_ordinal_function_impl)
+    return per_host_enqueue_ops
+
+  return enqueue_ops_fn, captured_infeed_queue
+
+
+def generate_per_host_enqueue_ops_fn_for_host(
+    ctx, input_fn, inputs_structure_recorder, batch_axis, device, host_id):
+  """Generates infeed enqueue ops for per-host input_fn on a single host."""
+  captured_infeed_queue = _CapturedObject()
+
+  dataset_initializer = None
+
+  with ops.device(device):
+    user_context = tpu_context.TPUContext(
+        internal_ctx=ctx, input_device=device, invocation_index=host_id)
+    inputs = _Inputs.from_input_fn(input_fn(user_context))
+
+    is_dataset = inputs.is_dataset
+    if ctx.mode == model_fn_lib.ModeKeys.PREDICT:
+      if not is_dataset:
+        raise TypeError(
+            'For mode PREDICT, `input_fn` must return `Dataset` instead of '
+            '`features` and `labels`.')
+      if batch_axis is not None:
+        raise TypeError('For mode PREDICT, batch_axis is not supported yet.')
+      inputs = _InputsWithStoppingSignals(
+          dataset=inputs.dataset,
+          batch_size=ctx.batch_size_for_input_fn,
+          add_padding=True)
+
+    if is_dataset:
+      dataset_initializer = inputs.dataset_initializer()
+
+    tpu_ordinal_function_impl = ctx.tpu_ordinal_function(host_id)
+
+  def enqueue_ops_fn():
+    """A Fn returning the TPU infeed enqueue ops.
+
+    By providing as a Fn, it can be invoked inside the tf.while_loop such that
+    the input pipeline for multiple iterations can be executed by one
+    Session.run call.
+
+    Returns:
+      list of dict of ops.
+    """
+    with ops.device(device):
+      num_of_replicas_per_host = ctx.num_of_replicas_per_host
+      # Convert user input to features and labels.  If the user returns a
+      # dataset, it is initialized and the features and labels extracted via
+      # `dataset.iterator.get_next()`
+      features, labels = inputs.features_and_labels()
+      signals = inputs.signals()
+
+      inputs_structure_recorder.validate_and_record_structure(features, labels)
+      unsharded_tensor_list = (
+          inputs_structure_recorder.flatten_features_and_labels(
+              features, labels, signals))
+
+      infeed_queue = tpu_feed.InfeedQueue(
+          tuple_types=[t.dtype for t in unsharded_tensor_list],
+          tuple_shapes=[t.shape for t in unsharded_tensor_list],
+          shard_dimensions=batch_axis)
+      captured_infeed_queue.capture(infeed_queue)
+      infeed_queue.set_number_of_shards(num_of_replicas_per_host)
+      per_host_enqueue_ops = (
+          infeed_queue.split_inputs_and_generate_enqueue_ops(
+              unsharded_tensor_list,
+              placement_function=lambda x: device,
+              tpu_ordinal_function=tpu_ordinal_function_impl))
+      if signals is None:
+        return per_host_enqueue_ops
+      else:
+        return {
+            'ops': per_host_enqueue_ops,
+            'signals': signals,
+        }
+
+  return enqueue_ops_fn, captured_infeed_queue, dataset_initializer
+
+
+def generate_per_host_v2_enqueue_ops_fn_for_host(
+    ctx, input_fn, inputs_structure_recorder, device, host_id):
+  """Generates infeed enqueue ops for per-host input_fn on a single host."""
+  captured_infeed_queue = _CapturedObject()
+  dataset_initializer = None
+
+  with ops.device(device):
+    user_context = tpu_context.TPUContext(
+        internal_ctx=ctx, input_device=device, invocation_index=host_id)
+    inputs = _Inputs.from_input_fn(input_fn(user_context))
+
+    is_dataset = inputs.is_dataset
+    if not is_dataset:
+      raise TypeError('`input_fn` must return a `Dataset` for the PER_HOST_V2 '
+                      'input pipeline configuration.')
+
+    if ctx.mode == model_fn_lib.ModeKeys.PREDICT:
+      inputs = _InputsWithStoppingSignals(
+          dataset=inputs.dataset,
+          batch_size=ctx.batch_size_for_input_fn,
+          add_padding=True,
+          num_invocations_per_step=ctx.num_of_replicas_per_host)
+
+    dataset_initializer = inputs.dataset_initializer()
+    tpu_ordinal_function_impl = ctx.tpu_ordinal_function(host_id)
+
+  def enqueue_ops_fn():
+    """Generates the per_host enqueue ops."""
+    control_deps = []
+    per_host_sharded_inputs = []
+    sparse_features_list = []
+    num_replicas_per_host = ctx.num_of_replicas_per_host
+    cached_signals = None
+    with ops.device(device):
+      if not inputs.is_dataset:
+        raise TypeError('`input_fn` must return a `Dataset` for this mode.')
+      for _ in range(num_replicas_per_host):
+        # Use control dependencies to ensure a deterministic ordering.
+        with ops.control_dependencies(control_deps):
+          features, labels = inputs.features_and_labels()  # Calls get_next()
+          signals = inputs.signals()
+
+          # All the replicas share the replica 0's stopping singal.
+          # This avoids inconsistent state among different model replcias.
+          if cached_signals:
+            signals['stopping'] = cached_signals['stopping']
+          else:
+            cached_signals = signals
+
+        features, labels, sparse_features = (
+            _tpu_estimator_embedding.split_inputs(ctx, features, labels))
+        sparse_features_list.append(sparse_features)
+
+        inputs_structure_recorder.validate_and_record_structure(
+            features, labels)
+        flattened_inputs = (
+            inputs_structure_recorder.flatten_features_and_labels(
+                features, labels, signals))
+        control_deps.extend(flattened_inputs)
+        per_host_sharded_inputs.append(flattened_inputs)
+
+      if inputs_structure_recorder.flattened_input_dims:
+        input_partition_dims = inputs_structure_recorder.flattened_input_dims
+        if signals:
+          input_partition_dims += [None] * len(signals)
+        # pylint: disable=protected-access
+        infeed_queue = tpu_feed._PartitionedInfeedQueue(
+            number_of_tuple_elements=len(per_host_sharded_inputs[0]),
+            host_id=host_id,
+            input_partition_dims=input_partition_dims,
+            device_assignment=ctx.device_assignment)
+        per_host_enqueue_ops = infeed_queue.generate_enqueue_ops(
+            per_host_sharded_inputs)
+      else:
+        infeed_queue = tpu_feed.InfeedQueue(
+            number_of_tuple_elements=len(per_host_sharded_inputs[0]))
+        per_host_enqueue_ops = infeed_queue.generate_enqueue_ops(
+            per_host_sharded_inputs,
+            tpu_ordinal_function=tpu_ordinal_function_impl)
+      captured_infeed_queue.capture(infeed_queue)
+
+    if ctx.embedding_config:
+      per_host_enqueue_ops.extend(
+          ctx.embedding_config.tpu_embedding.generate_enqueue_ops(
+              sparse_features_list))
+
+    if signals is None:
+      return per_host_enqueue_ops
+    else:
+      return {
+          'ops': per_host_enqueue_ops,
+          'signals': signals,
+      }
+
+  return enqueue_ops_fn, captured_infeed_queue, dataset_initializer
+
+
+def generate_broadcast_enqueue_ops_fn(ctx, input_fn, inputs_structure_recorder,
+                                      num_hosts):
+  """Generates infeed enqueue ops for one input_fn on all the hosts."""
+  captured_infeed_queue = _CapturedObject()
+  dataset_initializer = None
+  device_0 = ctx.tpu_host_placement_function(host_id=0)
+  with ops.device(device_0):
+    user_context = tpu_context.TPUContext(
+        internal_ctx=ctx, input_device=device_0, invocation_index=0)
+    inputs = _Inputs.from_input_fn(input_fn(user_context))
+
+    is_dataset = inputs.is_dataset
+    if ctx.mode == model_fn_lib.ModeKeys.PREDICT:
+      if not is_dataset:
+        raise TypeError(
+            'For mode PREDICT, `input_fn` must return `Dataset` instead of '
+            '`features` and `labels`.')
+
+      inputs = _InputsWithStoppingSignals(
+          dataset=inputs.dataset,
+          batch_size=ctx.batch_size_for_input_fn,
+          add_padding=True)
+
+    if is_dataset:
+      dataset_initializer = inputs.dataset_initializer()
+    num_replicas_per_host = ctx.num_of_replicas_per_host
+
+  def tpu_ordinal_function_impl(replica_id):
+    if ctx.device_assignment:
+      return ctx.device_assignment.tpu_ordinal(replica=replica_id)
+    else:
+      return replica_id % num_replicas_per_host
+
+  def device_function_impl(replica_id):
+    return ctx.tpu_host_placement_function(replica_id=replica_id)
+
+  def enqueue_ops_fn():
+    """Generates enqueue ops for all the hosts."""
+    broadcasted_inputs = []
+    flattened_inputs = None  # Cache result from input_fn.
+    signals = None
+    for host_id in xrange(num_hosts):
+      with ops.device(ctx.tpu_host_placement_function(host_id=host_id)):
+        for _ in xrange(ctx.num_of_replicas_per_host):
+          # Note: input_fn is only called once at host 0 for the first replica.
+          # The features and labels returned from that invocation are
+          # broadcasted to other replicas(including the replicas on other
+          # hosts).
+          if flattened_inputs is None:
+            features, labels = inputs.features_and_labels()  # Calls get_next()
+            signals = inputs.signals()
+
+            inputs_structure_recorder.validate_and_record_structure(
+                features, labels)
+            flattened_inputs = (
+                inputs_structure_recorder.flatten_features_and_labels(
+                    features, labels, signals))
+          broadcasted_inputs.append(flattened_inputs)
+
+    infeed_queue = tpu_feed.InfeedQueue(
+        number_of_tuple_elements=len(broadcasted_inputs[0]))
+    captured_infeed_queue.capture(infeed_queue)
+    enqueue_ops = infeed_queue.generate_enqueue_ops(
+        broadcasted_inputs,
+        tpu_ordinal_function=tpu_ordinal_function_impl,
+        placement_function=device_function_impl)
+
+    if signals is None:
+      return enqueue_ops
+    else:
+      return {
+          'ops': enqueue_ops,
+          'signals': signals,
+      }
+
+  return enqueue_ops_fn, captured_infeed_queue, dataset_initializer
+
+
+class _InputPipeline(object):
+  """`_InputPipeline` handles invoking `input_fn` and piping to infeed queue.
+
+  `_InputPipeline` abstracts the per-core/per-host `input_fn` invocation from
+  call site.  To be precise, based on the configuration in
+  `_InternalTPUContext`,  it invokes `input_fn` for all cores (usually
+  multi-host TPU training) or for one host (usually for single-host TPU
+  evaluation), and sends all `features` and `labels` returned by `input_fn` to
+  TPU infeed. For per-core invocation, `features` and `labels` are piped to
+  infeed directly, one tuple for each core. For per-host invocation,  `features`
+  and `labels` are split at host (with respect to `batch_axis`) and piped to all
+  cores accordingly.
+
+  In addition, flatten/unflatten are handled by `_InputPipeline` also.  Model
+  inputs returned by the `input_fn` can have one of the following forms:
+  1. features
+  2. (features, labels)
+  3. ((arbitrarily nested structure of features), labels)
+
+  Internally, form 1 is reformed to `(features, None)` as features and labels
+  are passed separately to underlying methods. For TPU training, TPUEstimator
+  may expect multiple `features` and `labels` tuples one for each core.
+
+  TPUEstimator allows various different structures for inputs (namely `features`
+  and `labels`).  Both `features` and `labels` can be any nested sturcture
+  supported by TF nest (namely, dict, tuples, namedtuples or any nested
+  structure of such of Tensors).  `labels` could be `None` as well.
+
+  These are flattened before they are passed to the infeed/outfeed library
+  as that expectes flattend lists.
+  """
+
+  class InputsStructureRecorder(object):
+    """The recorder to record inputs structure."""
+
+    def __init__(self, input_partition_dims=None):
+      # Holds the structure of inputs
+      self._feature_structure = {}
+      self._flattened_input_dims = None
+
+      if input_partition_dims:
+        # This should have been validated in TPUConfig.
+        assert len(input_partition_dims) <= 2, 'must have 1 or 2 elements.'
+        if len(input_partition_dims) == 2:
+          self._feature_dims, self._label_dims = input_partition_dims
+        else:
+          self._feature_dims = input_partition_dims[0]
+          self._label_dims = None
+
+        assert self._feature_dims is not None, ('input_partition_dims[0] must '
+                                                'not be None')
+      else:
+        self._feature_dims = None
+        self._label_dims = None
+
+      # Internal state.
+      self._initialized = False
+
+    @property
+    def flattened_input_dims(self):
+      assert self._initialized, 'InputsStructureRecorder is not initialized.'
+      return self._flattened_input_dims
+
+    def has_labels(self):
+      return 'labels' in self._feature_structure
+
+    def _flatten_input_dims(self, feature_dims, feature_dims_names, label_dims,
+                            label_dims_names, label_names, has_labels):
+      """Flatten input dims with the same order as flattened input tensors."""
+      flattened_input_dims = []
+      if feature_dims_names:
+        # We need a fixed ordering for matching the tensors in features.
+        flattened_input_dims.extend(
+            [feature_dims[name] for name in feature_dims_names])
+      else:
+        flattened_input_dims.append(feature_dims)
+
+      if label_dims_names:
+        # We need a fixed ordering for matching the tensors in labels.
+        flattened_input_dims.extend(
+            [label_dims[name] for name in label_dims_names])
+      else:
+        if label_names:
+          num_tensors_in_label = len(label_names)
+        else:
+          num_tensors_in_label = int(has_labels)
+        # Setting `None` in input_partition_dims[1] will apply `None` to
+        # all the tensors in labels, regardless of internal structure.
+        flattened_input_dims.extend([label_dims] * num_tensors_in_label)
+
+      return flattened_input_dims
+
+    def validate_and_record_structure(self, features, labels):
+      """Validates and records the structure of `features` and `labels`."""
+      # Extract structure.
+      has_labels = labels is not None
+      feature_names = _extract_key_names(features)
+      label_names = _extract_key_names(labels)
+
+      if not self._initialized:
+        # Record structure.
+        self._initialized = True
+        if self._feature_dims is not None:
+          feature_dims_names = _extract_key_names(self._feature_dims)
+          if feature_dims_names != feature_names:
+            raise ValueError(
+                'TPUConfig.input_partition_dims[0] mismatched feature'
+                ' keys. Expected {}, got {}'.format(feature_names,
+                                                    feature_dims_names))
+
+          label_dims_names = _extract_key_names(self._label_dims)
+          if self._label_dims is not None and label_dims_names != label_names:
+            raise ValueError(
+                'TPUConfig.input_partition_dims[1] mismatched label'
+                ' keys. Expected {}, got {}'.format(label_names,
+                                                    label_dims_names))
+
+          self._flattened_input_dims = self._flatten_input_dims(
+              self._feature_dims, feature_dims_names, self._label_dims,
+              label_dims_names, label_names, has_labels)
+
+    def flatten_features_and_labels(self, features, labels, signals=None):
+      """Flattens the `features` and `labels` to a single tensor list."""
+      self._feature_structure['features'] = features
+      if labels is not None:
+        self._feature_structure['labels'] = labels
+      if signals is not None:
+        self._feature_structure['signals'] = signals
+      return data_nest.flatten(self._feature_structure)
+
+    def unflatten_features_and_labels(self, flattened_inputs):
+      """Restores the flattened inputs to original features and labels form.
+
+      Args:
+        flattened_inputs: Flattened inputs for each shard.
+
+      Returns:
+        A tuple of (`features`, `labels`), where `labels` could be None.
+        Each one, if present, should have identical structure (single tensor vs
+        dict) as the one returned by input_fn.
+
+      Raises:
+        ValueError: If the number of expected tensors from `flattened_inputs`
+          mismatches the recorded structure.
+      """
+
+      unflattened_inputs = data_nest.pack_sequence_as(self._feature_structure,
+                                                      flattened_inputs)
+      return _Inputs(
+          unflattened_inputs['features'],
+          unflattened_inputs.get('labels'),
+          signals=unflattened_inputs.get('signals'))
+
+  def __init__(self, input_fn, batch_axis, ctx):
+    """Constructor.
+
+    Args:
+      input_fn: input fn for train or eval.
+      batch_axis: A python tuple of int values describing how each tensor
+        produced by the Estimator `input_fn` should be split across the TPU
+        compute shards.
+      ctx: A `_InternalTPUContext` instance with mode.
+
+    Raises:
+      ValueError: If both `sharded_features` and `num_cores` are `None`.
+    """
+    self._inputs_structure_recorder = _InputPipeline.InputsStructureRecorder(
+        ctx.input_partition_dims)
+
+    self._sharded_per_core = ctx.is_input_sharded_per_core()
+    self._input_fn = input_fn
+    self._infeed_queue = None
+    self._ctx = ctx
+    self._batch_axis = batch_axis
+
+  def generate_infeed_enqueue_ops_and_dequeue_fn(self):
+    """Generates infeed enqueue ops and dequeue_fn."""
+    # While tf.while_loop is called, the body function, which invokes
+    # `enqueue_fn` passed in, is called to construct the graph. So, input_fn
+    # structure is recorded.
+    enqueue_ops, all_hooks, run_infeed_loop_on_coordinator = (
+        self._invoke_input_fn_and_record_structure())
+
+    self._validate_input_pipeline()
+
+    def dequeue_fn():
+      """dequeue_fn is used by TPU to retrieve the tensors."""
+      # In the model-parallel case, both the host-side and device-side
+      # computations must agree on the core on which infeed takes place. We
+      # choose to perform infeed on logical core 0 of each replica.
+      values = self._infeed_queue.generate_dequeue_op(tpu_device=0)
+      # The unflatten process uses the structure information recorded above.
+      return self._inputs_structure_recorder.unflatten_features_and_labels(
+          values)
+
+    return (enqueue_ops, dequeue_fn, all_hooks, run_infeed_loop_on_coordinator)
+
+  def _invoke_input_fn_and_record_structure(self):
+    """Deploys the input pipeline and record input structure."""
+    enqueue_ops = []
+    infeed_queues = []
+    all_dataset_initializers = []
+    num_hosts = self._ctx.num_hosts
+    tpu_host_placement_fn = self._ctx.tpu_host_placement_function
+
+    run_infeed_loop_on_coordinator = True
+
+    if self._sharded_per_core:
+      # Per-Core input pipeline deployment.
+      # Invoke input pipeline for each core and placed on the corresponding
+      # host.
+      for host_id in range(num_hosts):
+        host_device = tpu_host_placement_fn(host_id=host_id)
+        with ops.device(host_device):
+          with ops.name_scope('input_pipeline_task%d' % (host_id)):
+            enqueue_ops_fn, captured_infeed_queue = (
+                generate_per_core_enqueue_ops_fn_for_host(
+                    self._ctx, self._input_fn, self._inputs_structure_recorder,
+                    host_device, host_id))
+
+            if _WRAP_INPUT_FN_INTO_WHILE_LOOP:
+              run_infeed_loop_on_coordinator = False
+              enqueue_ops.append(
+                  _wrap_computation_in_while_loop(
+                      device=host_device, op_fn=enqueue_ops_fn))
+            else:
+              enqueue_ops.append(enqueue_ops_fn())
+            # Infeed_queue_getter must be called after enqueue_ops_fn is called.
+            infeed_queues.append(captured_infeed_queue.get())
+
+    elif self._ctx.is_input_broadcast_with_iterators():
+      # Only calls input_fn in host 0.
+      host_device = tpu_host_placement_fn(host_id=0)
+      enqueue_ops_fn, captured_infeed_queue, dataset_initializer = (
+          generate_broadcast_enqueue_ops_fn(self._ctx, self._input_fn,
+                                            self._inputs_structure_recorder,
+                                            num_hosts))
+      if dataset_initializer:
+        all_dataset_initializers.append(dataset_initializer)
+        run_infeed_loop_on_coordinator = False
+        wrap_fn = (
+            _wrap_computation_in_while_loop
+            if self._ctx.mode != model_fn_lib.ModeKeys.PREDICT else
+            _wrap_computation_in_while_loop_with_stopping_signals)
+        enqueue_ops.append(wrap_fn(device=host_device, op_fn=enqueue_ops_fn))
+      else:
+        enqueue_ops.append(enqueue_ops_fn())
+      infeed_queues.append(captured_infeed_queue.get())
+    else:
+      for host_id in range(num_hosts):
+        host_device = tpu_host_placement_fn(host_id=host_id)
+        with ops.device(host_device):
+          with ops.name_scope('input_pipeline_task%d' % (host_id)):
+            if self._ctx.is_input_per_host_with_iterators():
+              enqueue_ops_fn, captured_infeed_queue, dataset_initializer = (
+                  generate_per_host_v2_enqueue_ops_fn_for_host(
+                      self._ctx, self._input_fn,
+                      self._inputs_structure_recorder, host_device, host_id))
+            else:
+              enqueue_ops_fn, captured_infeed_queue, dataset_initializer = (
+                  generate_per_host_enqueue_ops_fn_for_host(
+                      self._ctx, self._input_fn,
+                      self._inputs_structure_recorder, self._batch_axis,
+                      host_device, host_id))
+
+            # NOTE(xiejw): We dispatch here based on the return type of the
+            # users `input_fn`.
+            #
+            # 1. If input_fn returns a Dataset instance, we initialize the
+            # iterator outside of tf.while_loop, and call the iterator.get_next
+            # inside tf.while_loop.  This should be always safe.
+            #
+            # 2. If input_fn returns (features, labels), it is too late to wrap
+            # them inside tf.while_loop, as resource initialization cannot be
+            # handled in TF control flow properly. In this case, we will use
+            # python loop to enqueue the data into TPU system.  This may be
+            # slow compared to the previous case.
+            if dataset_initializer:
+              all_dataset_initializers.append(dataset_initializer)
+              run_infeed_loop_on_coordinator = False
+              wrap_fn = (
+                  _wrap_computation_in_while_loop
+                  if self._ctx.mode != model_fn_lib.ModeKeys.PREDICT else
+                  _wrap_computation_in_while_loop_with_stopping_signals)
+              enqueue_ops.append(
+                  wrap_fn(device=host_device, op_fn=enqueue_ops_fn))
+            else:
+              enqueue_ops.append(enqueue_ops_fn())
+            infeed_queues.append(captured_infeed_queue.get())
+    # infeed_queue is used to generate dequeue ops. The only thing it uses for
+    # dequeue is dtypes and types. So, any one can be used. Here, grab the
+    # first one.
+    self._infeed_queue = infeed_queues[0]
+    return enqueue_ops, [
+        util_lib.MultiHostDatasetInitializerHook(all_dataset_initializers)
+    ], run_infeed_loop_on_coordinator
+
+  def _validate_input_pipeline(self):
+    """Validates the input pipeline.
+
+    Perform some sanity checks to log user friendly information. We should
+    error out to give users better error message. But, if
+    _WRAP_INPUT_FN_INTO_WHILE_LOOP is False (legacy behavior), we cannot break
+    user code, so, log a warning.
+
+    Raises:
+      RuntimeError: If the validation failed.
+    """
+    if ops.get_default_graph().get_collection(ops.GraphKeys.QUEUE_RUNNERS):
+      err_msg = ('Input pipeline contains one or more QueueRunners. '
+                 'It could be slow and not scalable. Please consider '
+                 'converting your input pipeline to use `tf.data` instead (see '
+                 'https://www.tensorflow.org/guide/datasets for '
+                 'instructions.')
+      if _WRAP_INPUT_FN_INTO_WHILE_LOOP:
+        raise RuntimeError(err_msg)
+      else:
+        logging.warn(err_msg)
+
+
+def call_computation(computation,
+                     experimental_exported_model_uses_all_cores=True):
+  """Call computation.
+
+  computation uses a single-core for TPU inference. If
+  `experimental_exported_model_uses_all_cores` is `True`, this function will
+  round-robin
+  computation among all TPU cores visible to the host; otherwise, it will use
+  a single core.
+
+  Args:
+    computation: A Python function that takes no inputs and builds computation
+      graph. If `computation` returns m outputs, this function will return a
+      list of m Tensors.
+    experimental_exported_model_uses_all_cores: Whether to round-robin among all
+      cores visible to the host, or to use a single core.
+
+  Returns:
+    A list of output tensors.
+  """
+  if experimental_exported_model_uses_all_cores:
+    # Using `TPUPartitionedCall` makes it possible to target a different
+    # TPU core with every `Session.run()` call. Note that the entire inference
+    # graph executes on a single core, and that invocations of this graph
+    # will round-robin among the cores attached to a host.
+    @function.Defun(capture_resource_var_by_value=False)
+    def tpu_subgraph():
+      return computation()
+
+    return tpu_functional.TPUPartitionedCall(
+        args=tpu_subgraph.captured_inputs,
+        device_ordinal=tpu_ops.tpu_ordinal_selector(),
+        Tout=[o.type for o in tpu_subgraph.definition.signature.output_arg],
+        f=tpu_subgraph)
+  else:
+    return computation()
+
+
+class _ModelFnWrapper(object):
+  """A `model_fn` wrapper.
+
+  This makes calling model_fn on CPU and TPU easier and more consistent and
+  performs necessary check and mutation required by TPU training and evaluation.
+
+  In addition, this wrapper manages converting the `model_fn` to a single TPU
+  train and eval step.
+  """
+
+  def __init__(self, model_fn, config, params, ctx):
+    self._model_fn = model_fn
+    self._config = config
+    self._params = params
+    self._ctx = ctx
+
+  def call_without_tpu(self, features, labels, is_export_mode):
+    return self._call_model_fn(features, labels, is_export_mode=is_export_mode)
+
+  def _add_embedding_features(self, features, hook_dummy_table_variables):
+    """Add embedding features, optionally add hook to intercept gradient."""
+    if self._ctx.embedding_config:
+      tpu_embedding_ = self._ctx.embedding_config.tpu_embedding
+      embedding_activations = tpu_embedding_.get_activations()
+      if hook_dummy_table_variables:
+        new_embedding_activations = (
+            tpu_embedding_gradient.hook_dummy_table_variables_to_activations(
+                tpu_embedding_, embedding_activations,
+                self._ctx.embedding_config.dummy_table_variables))
+        features.update(new_embedding_activations)
+      else:
+        features.update(embedding_activations)
+
+  def convert_to_single_tpu_train_step(self, dequeue_fn):
+    """Converts user provided model_fn` as a single train step on TPU.
+
+    The user provided `model_fn` takes input tuple
+    (features, labels) and produces the EstimatorSpec with train_op and loss for
+    train `mode`. This usually represents a single train computation on CPU.
+
+    For TPU training, a train (computation) step is first wrapped in a
+    tf.while_loop control flow to repeat for many times and then replicated to
+    all TPU shards. Besides the input should be taken from TPU infeed rather
+    than input pipeline (input_fn) directly. To fit TPU loop and replicate
+    pattern, the original train computation should be reformed, which is the
+    returned `train_step`.
+
+    Args:
+      dequeue_fn: The function to retrieve inputs, features and labels, from TPU
+        infeed dequeue channel.
+
+    Returns:
+      A tuple of train_fn, host_calls, and captured scaffold_fn. The train_fn
+      representing the train step for TPU.
+    """
+
+    host_call = _OutfeedHostCall(self._ctx)
+    captured_scaffold_fn = _CapturedObject()
+    captured_training_hooks = _CapturedObject()
+
+    def train_step(loss):
+      """Training step function for use inside a while loop."""
+      del loss  # unused; required in function signature.
+      inputs = dequeue_fn()
+      features, labels = inputs.features_and_labels()
+      self._add_embedding_features(features, True)
+
+      estimator_spec = self._verify_estimator_spec(
+          self._call_model_fn(features, labels))
+      loss, train_op = estimator_spec.loss, estimator_spec.train_op
+
+      if isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
+        captured_scaffold_fn.capture(estimator_spec.scaffold_fn)
+      else:
+        captured_scaffold_fn.capture(None)
+
+      captured_training_hooks.capture(estimator_spec.training_hooks)
+
+      if self._ctx.embedding_config is None:
+        apply_sparse_grads = []
+      else:
+        tpu_embedding_ = self._ctx.embedding_config.tpu_embedding
+        gradients = (
+            tpu_embedding_gradient.get_gradients_through_dummy_table_variables(
+                tpu_embedding_)
+        )
+        apply_sparse_grads = [
+            tpu_embedding_.generate_send_gradients_op(gradients)
+        ]
+
+      # We must run train_op to update the variables prior to running the
+      # outfeed.
+      with ops.control_dependencies([train_op] + apply_sparse_grads):
+        host_call_outfeed_ops = []
+        if (isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec)  # pylint: disable=protected-access
+            and estimator_spec.host_call is not None):
+          host_call.record({'host_call': estimator_spec.host_call})
+          host_call_outfeed_ops = host_call.create_enqueue_op()
+        with ops.control_dependencies(host_call_outfeed_ops):
+          return array_ops.identity(loss)
+
+    return (train_step, host_call, captured_scaffold_fn,
+            captured_training_hooks)
+
+  def convert_to_single_tpu_eval_step(self, dequeue_fn):
+    """Converts user provided model_fn` as a single eval step on TPU.
+
+    Similar to training, the user provided `model_fn` takes input tuple
+    (features, labels) and produces the TPUEstimatorSpec with eval_metrics for
+    eval `mode`. This usually represents a single evaluation computation on CPU.
+
+    For TPU evaluation, a eval (computation) step is first wrapped in a
+    tf.while_loop control flow to repeat for many times and then replicated to
+    all TPU shards. Besides the input and output are slightly different. Input,
+    features and labels, should be taken from TPU infeed rather than input
+    pipeline (input_fn) directly. Output is managed in two stages.  First, the
+    model outputs as the result of evaluation computation, usually model logits,
+    should be transferred from TPU system to CPU. Then, all model outputs are
+    concatenated first on CPU and sent to the metric_fn for metrics computation.
+    To fit TPU evaluation pattern, the original eval computation should be
+    reformed, which is the returned `eval_step`.
+
+    Args:
+      dequeue_fn: The function to retrieve inputs, features and labels, from TPU
+        infeed dequeue channel.
+
+    Returns:
+      A tuple of eval_fn, host_calls, and captured scaffold_fn. The eval_fn
+      representing the eval step for TPU.
+    """
+    host_calls = _OutfeedHostCall(self._ctx)
+    captured_scaffold_fn = _CapturedObject()
+    captured_eval_hooks = _CapturedObject()
+
+    def eval_step(total_loss):
+      """Evaluation step function for use inside a while loop."""
+      inputs = dequeue_fn()
+      features, labels = inputs.features_and_labels()
+      self._add_embedding_features(features, False)
+
+      tpu_estimator_spec = self._call_model_fn(features, labels)
+      if not isinstance(tpu_estimator_spec, model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
+        raise RuntimeError(
+            'estimator_spec used by TPU evaluation must have type'
+            '`TPUEstimatorSpec`. Got {}'.format(type(tpu_estimator_spec)))
+
+      loss = tpu_estimator_spec.loss
+      captured_scaffold_fn.capture(tpu_estimator_spec.scaffold_fn)
+      captured_eval_hooks.capture(tpu_estimator_spec.evaluation_hooks)
+
+      to_record = {}
+      if tpu_estimator_spec.eval_metrics:
+        to_record['eval_metrics'] = tpu_estimator_spec.eval_metrics
+      if tpu_estimator_spec.host_call is not None:
+        # We assume that evaluate won't update global step, so we don't wrap
+        # this host_call.
+        to_record['host_call'] = tpu_estimator_spec.host_call
+      host_calls.record(to_record)
+
+      with ops.control_dependencies(host_calls.create_enqueue_op()):
+        return math_ops.add(total_loss, loss)
+
+    return eval_step, host_calls, captured_scaffold_fn, captured_eval_hooks
+
+  def convert_to_single_tpu_predict_step(self, dequeue_fn):
+    """Converts user provided model_fn` as a single predict step on TPU.
+
+    Args:
+      dequeue_fn: The function to retrieve inputs, features and labels, from TPU
+        infeed dequeue channel.
+
+    Returns:
+      A tuple of predict_fn, host_calls, and captured scaffold_fn. The
+      predict_fn representing the predict step for TPU.
+    """
+    host_calls = _OutfeedHostCall(self._ctx)
+    captured_scaffold_fn = _CapturedObject()
+    captured_predict_hooks = _CapturedObject()
+
+    def predict_step(unused_scalar_stopping_signal):
+      """Evaluation step function for use inside a while loop."""
+      inputs = dequeue_fn()
+      features, labels = inputs.features_and_labels()
+      stopping_signals = inputs.signals()
+
+      assert stopping_signals is not None, (
+          'Internal Error: `signals` is missing.')
+
+      tpu_estimator_spec = self._call_model_fn(
+          features, labels, is_export_mode=False)
+      if not isinstance(tpu_estimator_spec, model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
+        raise RuntimeError(
+            'estimator_spec used by TPU prediction must have type'
+            '`TPUEstimatorSpec`. Got {}'.format(type(tpu_estimator_spec)))
+
+      self._verify_tpu_spec_predictions(tpu_estimator_spec.predictions)
+
+      captured_scaffold_fn.capture(tpu_estimator_spec.scaffold_fn)
+      captured_predict_hooks.capture(tpu_estimator_spec.prediction_hooks)
+      to_record = {}
+      identity_fn = lambda **kwargs: kwargs
+      to_record['predictions'] = [identity_fn, tpu_estimator_spec.predictions]
+      to_record['signals'] = [identity_fn, stopping_signals]
+      if tpu_estimator_spec.host_call is not None:
+        to_record['host_call'] = tpu_estimator_spec.host_call
+      host_calls.record(to_record)
+
+      with ops.control_dependencies(host_calls.create_enqueue_op()):
+        return _StopSignals.as_scalar_stopping_signal(stopping_signals)
+
+    return (predict_step, host_calls, captured_scaffold_fn,
+            captured_predict_hooks)
+
+  def _verify_tpu_spec_predictions(self, predictions):
+    """Validates TPUEstimatorSpec.predictions dict."""
+    # TODO(xiejw): Adds validation for prediction dictionrary.
+    # TODO(xiejw): Adds support for single tensor as predictions.
+    if not isinstance(predictions, dict):
+      raise TypeError('TPUEstimatorSpec.predictions must be dict of Tensors.')
+
+    for (key, tensor) in predictions.items():
+      if tensor.shape.dims[0].value is None:
+        raise ValueError(
+            'The tensor with key ({}) in TPUEstimatorSpec.predictions has '
+            'dynamic shape (should be static). Tensor: {}'.format(key, tensor))
+    return predictions
+
+  def _validate_model_features_and_labels(self, features, labels,
+                                          is_export_mode):
+    """Validates that the features and labels for the model function are valid.
+
+    A valid features/labels object is the one with:
+    - Type: A tensor or any nested structure of tensors supported by TF nest,
+        namely nested dictionary, tuple, namedtuple, or sequence of tensors.
+    - Static shape if is_export_mode is False.
+
+    Args:
+      features: the features that would be input to the model function.
+      labels: the labels that would be input to the model function.
+      is_export_mode: boolean value specifying if in export mode.
+
+    Raises:
+      TypeError: If features/labels are not of the correct type.
+      ValueError: If features/labels have dynamic shape.
+    """
+
+    def validate(obj, obj_name):
+      """Helper validate function."""
+      if is_export_mode or self._ctx.is_running_on_cpu(is_export_mode):
+        return
+      if isinstance(obj, ops.Tensor):
+        if not obj.get_shape().is_fully_defined():
+          raise ValueError(
+              'The {} to the model returned by input_fn must have static shape.'
+              ' Tensor: {}'.format(obj_name, obj))
+      else:
+        for tensor in data_nest.flatten(obj):
+          if not tensor.get_shape().is_fully_defined():
+            raise ValueError(
+                ('The {} to the model returned by input_fn must have static '
+                 'shape. Tensor: {}').format(obj_name, tensor))
+
+    validate(features, 'features')
+    if labels is not None:
+      validate(labels, 'labels')
+
+  def _call_model_fn(self, features, labels, is_export_mode=False):
+    """Calls the model_fn with required parameters."""
+    self._validate_model_features_and_labels(features, labels, is_export_mode)
+    model_fn_args = function_utils.fn_args(self._model_fn)
+    kwargs = {}
+
+    # Makes deep copy with `config` and params` in case user mutates them.
+    config = copy.deepcopy(self._config)
+    params = copy.deepcopy(self._params)
+
+    if 'labels' in model_fn_args:
+      kwargs['labels'] = labels
+    elif labels is not None:
+      raise ValueError(
+          'model_fn does not take labels, but input_fn returns labels.')
+    if 'mode' in model_fn_args:
+      kwargs['mode'] = self._ctx.mode
+    if 'config' in model_fn_args:
+      kwargs['config'] = config
+    if 'params' in model_fn_args:
+      kwargs['params'] = params
+
+    if 'params' not in model_fn_args:
+      raise ValueError('model_fn ({}) does not include params argument, '
+                       'required by TPUEstimator to pass batch size as '
+                       'params[\'batch_size\']'.format(self._model_fn))
+
+    if is_export_mode:
+      batch_size_for_model_fn = None
+    else:
+      batch_size_for_model_fn = self._ctx.batch_size_for_model_fn
+
+    if batch_size_for_model_fn is not None:
+      _add_item_to_params(params, _BATCH_SIZE_KEY, batch_size_for_model_fn)
+
+    running_on_cpu = self._ctx.is_running_on_cpu(is_export_mode)
+    _add_item_to_params(params, _USE_TPU_KEY, not running_on_cpu)
+
+    if not running_on_cpu:
+      user_context = tpu_context.TPUContext(
+          internal_ctx=self._ctx, call_from_input_fn=False)
+      _add_item_to_params(params, _CTX_KEY, user_context)
+
+    estimator_spec = self._model_fn(features=features, **kwargs)
+    if (running_on_cpu and
+        isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec)):  # pylint: disable=protected-access
+      # The estimator_spec will be passed to `Estimator` directly, which expects
+      # type `EstimatorSpec`.
+      return estimator_spec.as_estimator_spec()
+    else:
+      return estimator_spec
+
+  def _verify_estimator_spec(self, estimator_spec):
+    """Validates the estimator_spec."""
+    if isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
+      return estimator_spec
+
+    err_msg = '{} returned by EstimatorSpec is not supported in TPUEstimator.'
+    if estimator_spec.training_chief_hooks:
+      raise ValueError(
+          err_msg.format('training_chief_hooks') + 'If you want' +
+          ' to pass training hooks, please pass via training_hooks.')
+
+    if estimator_spec.scaffold:
+      logging.warning('EstimatorSpec.Scaffold is ignored by TPU train/eval. '
+                      'Please use TPUEstimatorSpec.')
+    return estimator_spec
+
+
+class _OutfeedHostCall(object):
+  """Support for `eval_metrics` and `host_call` in TPUEstimatorSpec."""
+
+  def __init__(self, ctx):
+    self._ctx = ctx
+    self._names = []
+    # All of these are dictionaries of lists keyed on the name.
+    self._host_fns = {}
+    self._tensor_keys = collections.defaultdict(list)
+    self._tensors = collections.defaultdict(list)
+    self._tensor_dtypes = collections.defaultdict(list)
+    self._tensor_shapes = collections.defaultdict(list)
+
+  @staticmethod
+  def validate(host_calls):
+    """Validates the `eval_metrics` and `host_call` in `TPUEstimatorSpec`."""
+
+    for name, host_call in host_calls.items():
+      if not isinstance(host_call, (tuple, list)):
+        raise ValueError('{} should be tuple or list'.format(name))
+      if len(host_call) != 2:
+        raise ValueError('{} should have two elements.'.format(name))
+      if not callable(host_call[0]):
+        raise TypeError('{}[0] should be callable.'.format(name))
+      if not isinstance(host_call[1], (tuple, list, dict)):
+        raise ValueError('{}[1] should be tuple or list, or dict.'.format(name))
+
+      if isinstance(host_call[1], (tuple, list)):
+        fullargspec = tf_inspect.getfullargspec(host_call[0])
+        fn_args = function_utils.fn_args(host_call[0])
+        # wrapped_hostcall_with_global_step uses varargs, so we allow that.
+        if fullargspec.varargs is None and len(host_call[1]) != len(fn_args):
+          raise RuntimeError(
+              'In TPUEstimatorSpec.{}, length of tensors {} does not match '
+              'method args of the function, which takes {}.'.format(
+                  name, len(host_call[1]), len(fn_args)))
+
+  @staticmethod
+  def create_cpu_hostcall(host_calls):
+    """Runs on the host_call on CPU instead of TPU when use_tpu=False."""
+
+    _OutfeedHostCall.validate(host_calls)
+    ret = {}
+    for name, host_call in host_calls.items():
+      host_fn, tensors = host_call
+      if isinstance(tensors, (tuple, list)):
+        ret[name] = host_fn(*tensors)
+      else:
+        # Must be dict.
+        try:
+          ret[name] = host_fn(**tensors)
+        except TypeError as e:
+          logging.warning(
+              'Exception while calling %s: %s. It is likely the tensors '
+              '(%s[1]) do not match the '
+              'function\'s arguments', name, e, name)
+          raise
+    return ret
+
+  def record(self, host_calls):
+    """Records the host_call structure."""
+
+    for name, host_call in host_calls.items():
+      host_fn, tensor_list_or_dict = host_call
+      self._names.append(name)
+      self._host_fns[name] = host_fn
+
+      if isinstance(tensor_list_or_dict, dict):
+        for (key, tensor) in six.iteritems(tensor_list_or_dict):
+          self._tensor_keys[name].append(key)
+          self._tensors[name].append(tensor)
+          self._tensor_dtypes[name].append(tensor.dtype)
+          self._tensor_shapes[name].append(tensor.shape)
+      else:
+        # List or tuple.
+        self._tensor_keys[name] = None
+        for tensor in tensor_list_or_dict:
+          self._tensors[name].append(tensor)
+          self._tensor_dtypes[name].append(tensor.dtype)
+          self._tensor_shapes[name].append(tensor.shape)
+
+  def create_enqueue_op(self):
+    """Create the op to enqueue the recorded host_calls.
+
+    Returns:
+      A list of enqueue ops, which is empty if there are no host calls.
+    """
+    if not self._names:
+      return []
+
+    tensors = []
+    # TODO(jhseu): Consider deduping tensors.
+    for name in self._names:
+      tensors.extend(self._tensors[name])
+
+    with ops.device(tpu.core(0)):
+      return [tpu_ops.outfeed_enqueue_tuple(tensors)]
+
+  def create_tpu_hostcall(self):
+    """Sends the tensors through outfeed and runs the host_fn on CPU.
+
+    The tensors are concatenated along dimension 0 to form a global tensor
+    across all shards. The concatenated function is passed to the host_fn and
+    executed on the first host.
+
+    Returns:
+      A dictionary mapping name to the return type of the host_call by that
+      name.
+
+    Raises:
+      RuntimeError: If outfeed tensor is scalar.
+    """
+    if not self._names:
+      return {}
+
+    ret = {}
+    # For each i, dequeue_ops[i] is a list containing the tensors from all
+    # shards. This list is concatenated later.
+    dequeue_ops = []
+    tensor_dtypes = []
+    tensor_shapes = []
+    for name in self._names:
+      for _ in self._tensors[name]:
+        dequeue_ops.append([])
+      for dtype in self._tensor_dtypes[name]:
+        tensor_dtypes.append(dtype)
+      for shape in self._tensor_shapes[name]:
+        tensor_shapes.append(shape)
+
+    # Outfeed ops execute on each replica's first logical core. Note: we must
+    # constraint it such that we have at most one outfeed dequeue and enqueue
+    # per replica.
+    for i in xrange(self._ctx.num_replicas):
+      host_device, ordinal_id = self._ctx.device_for_replica(i)
+      with ops.device(host_device):
+        outfeed_tensors = tpu_ops.outfeed_dequeue_tuple(
+            dtypes=tensor_dtypes,
+            shapes=tensor_shapes,
+            device_ordinal=ordinal_id)
+        for j, item in enumerate(outfeed_tensors):
+          dequeue_ops[j].append(item)
+
+    # Deconstruct dequeue ops.
+    flat_dequeue_ops = []
+    for l in dequeue_ops:
+      flat_dequeue_ops.extend(l)
+
+    dequeue_ops_by_name = {}
+    pos = 0
+    for name in self._names:
+      dequeue_ops_by_name[name] = dequeue_ops[pos:pos +
+                                              len(self._tensors[name])]
+      pos += len(self._tensors[name])
+
+    def _call_host_fn(fn, *args, **kw):
+      context = CatchInvalidHostcallFunctions()
+      context.Enter()
+      result = fn(*args, **kw)
+      context.Exit()
+      context.ExitResult(result)
+      return result
+
+    # It is assumed evaluation always happens on single host TPU system. So,
+    # place all ops on tpu host if possible.
+    #
+    # TODO(jhseu): Evaluate whether this is right for summaries.
+    with ops.device(self._ctx.tpu_host_placement_function(replica_id=0)):
+      for name in self._names:
+        dequeue_ops = dequeue_ops_by_name[name]
+        for i, item in enumerate(dequeue_ops):
+          if dequeue_ops[i][0].shape.ndims == 0:
+            raise RuntimeError(
+                'All tensors outfed from TPU should preserve batch size '
+                'dimension, but got scalar {}'.format(dequeue_ops[i][0]))
+          # TODO(xiejw): Make the specification of the outfeed combinaton
+          # function more explicit and well-documented.  We may want to give the
+          # user the option of concatenating along any axis.
+          if (self._ctx.config.tpu_config.per_host_input_for_training is
+              tpu_config.InputPipelineConfig.BROADCAST):
+            # If the infeed is in BROADCAST mode (each core recieving the same
+            # input), then we assume that the cores also produce identical
+            # copies of the same output, and we simply take the output from
+            # the first core.  This mode is used by Mesh-TensorFlow.
+            with ops.control_dependencies(dequeue_ops[i]):
+              dequeue_ops[i] = array_ops.identity(dequeue_ops[i][0])
+          else:
+            # Assume that the input has been batch-split and that axis 0 of the
+            # output tensors represents the batch size.  Concatenate along
+            # the axis 0 to re-combine the batch.
+            dequeue_ops[i] = array_ops.concat(dequeue_ops[i], axis=0)
+
+        if self._tensor_keys[name] is not None:
+          # The user-provided eval_metrics[1] is a dict.
+          dequeue_ops = dict(zip(self._tensor_keys[name], dequeue_ops))
+          try:
+            ret[name] = _call_host_fn(self._host_fns[name], **dequeue_ops)
+          except TypeError as e:
+            logging.warning(
+                'Exception while calling %s: %s. It is likely the tensors '
+                '(%s[1]) do not match the '
+                'function\'s arguments', name, e, name)
+            raise
+        else:
+          ret[name] = _call_host_fn(self._host_fns[name], *dequeue_ops)
+
+    # force all dequeue operations to be run if not consumed by the host calls
+    ret['__force_dequeue'] = control_flow_ops.group(*flat_dequeue_ops)
+    return ret
+
+
+class _OutfeedHostCallHook(session_run_hook.SessionRunHook):
+  """Hook to run host calls when use_tpu=False."""
+
+  def __init__(self, tensors):
+    self._tensors = tensors
+
+  def begin(self):
+    # We duplicate this code from the TPUInfeedOutfeedSessionHook rather than
+    # create a separate hook to guarantee execution order, because summaries
+    # need to be initialized before the outfeed thread starts.
+    # TODO(jhseu): Make a wrapper hook instead?
+    self._init_ops = contrib_summary.summary_writer_initializer_op()
+    # Get all the writer resources from the initializer, so we know what to
+    # flush.
+    self._finalize_ops = []
+    for op in self._init_ops:
+      self._finalize_ops.append(contrib_summary.flush(writer=op.inputs[0]))
+
+  def after_create_session(self, session, coord):
+    session.run(self._init_ops)
+
+  def before_run(self, run_context):
+    return basic_session_run_hooks.SessionRunArgs(self._tensors)
+
+  def end(self, session):
+    session.run(self._finalize_ops)
+
+
+class ExamplesPerSecondHook(basic_session_run_hooks.StepCounterHook):
+  """Calculate and report global_step/sec and examples/sec during runtime."""
+
+  def __init__(self,
+               batch_size,
+               every_n_steps=100,
+               every_n_secs=None,
+               output_dir=None,
+               summary_writer=None):
+    self._batch_size = batch_size
+    super(ExamplesPerSecondHook, self).__init__(
+        every_n_steps=every_n_steps,
+        every_n_secs=every_n_secs,
+        output_dir=output_dir,
+        summary_writer=summary_writer)
+
+  def _log_and_record(self, elapsed_steps, elapsed_time, global_step):
+    global_step_per_sec = elapsed_steps / elapsed_time
+    examples_per_sec = self._batch_size * global_step_per_sec
+    if self._summary_writer is not None:
+      global_step_summary = Summary(value=[
+          Summary.Value(tag='global_step/sec', simple_value=global_step_per_sec)
+      ])
+      example_summary = Summary(value=[
+          Summary.Value(tag='examples/sec', simple_value=examples_per_sec)
+      ])
+      self._summary_writer.add_summary(global_step_summary, global_step)
+      self._summary_writer.add_summary(example_summary, global_step)
+    logging.info('global_step/sec: %g', global_step_per_sec)
+    logging.info('examples/sec: %g', examples_per_sec)
+
+
+class InstallSignalHandlerHook(session_run_hook.SessionRunHook):
+  """Change SIGINT (CTRL^C) handler to force quit the process.
+
+  The default behavior often results in hanging processes.
+  The original handler is restored after training/evaluation.
+  """
+
+  def __init__(self):
+    self._signal_fn = signal.getsignal(signal.SIGINT)
+
+  def before_run(self, run_context):
+    signal.signal(signal.SIGINT, signal.SIG_DFL)
+
+  def end(self, session):
+    signal.signal(signal.SIGINT, self._signal_fn)
+
+
+class TPUEstimator(estimator_lib.Estimator):
+  """Estimator with TPU support.
+
+  TPUEstimator also supports training on CPU and GPU. You don't need to define
+  a separate `tf.estimator.Estimator`.
+
+  TPUEstimator handles many of the details of running on TPU devices, such as
+  replicating inputs and models for each core, and returning to host
+  periodically to run hooks.
+
+  TPUEstimator transforms a global batch size in params to a per-shard batch
+  size when calling the `input_fn` and `model_fn`. Users should specify
+  global batch size in constructor, and then get the batch size for each shard
+  in `input_fn` and `model_fn` by `params['batch_size']`.
+
+  - For training, `model_fn` gets per-core batch size; `input_fn` may get
+    per-core or per-host batch size depending on `per_host_input_for_training`
+    in `TPUConfig` (See docstring for TPUConfig for details).
+
+  - For evaluation and prediction, `model_fn` gets per-core batch size and
+    `input_fn` get per-host batch size.
+
+  Evaluation
+  ==========
+
+  `model_fn` should return `TPUEstimatorSpec`, which expects the `eval_metrics`
+  for TPU evaluation. If eval_on_tpu is False, the evaluation will execute on
+  CPU or GPU; in this case the following discussion on TPU evaluation does not
+  apply.
+
+  `TPUEstimatorSpec.eval_metrics` is a tuple of `metric_fn` and `tensors`, where
+  `tensors` could be a list of any nested structure of `Tensor`s (See
+  `TPUEstimatorSpec` for details).  `metric_fn` takes the `tensors` and returns
+  a dict from metric string name to the result of calling a metric function,
+  namely a `(metric_tensor, update_op)` tuple.
+
+  One can set `use_tpu` to `False` for testing. All training, evaluation, and
+  predict will be executed on CPU. `input_fn` and `model_fn` will receive
+  `train_batch_size` or `eval_batch_size` unmodified as `params['batch_size']`.
+
+  Current limitations:
+  --------------------
+
+  1. TPU evaluation only works on a single host (one TPU worker) except
+     BROADCAST mode.
+
+  2. `input_fn` for evaluation should **NOT** raise an end-of-input exception
+     (`OutOfRangeError` or `StopIteration`). And all evaluation steps and all
+     batches should have the same size.
+
+  Example (MNIST):
+  ----------------
+
+  ```
+  # The metric Fn which runs on CPU.
+  def metric_fn(labels, logits):
+    predictions = tf.argmax(logits, 1)
+    return {
+      'accuracy': tf.metrics.precision(
+          labels=labels, predictions=predictions),
+    }
+
+  # Your model Fn which runs on TPU (eval_metrics is list in this example)
+  def model_fn(features, labels, mode, config, params):
+    ...
+    logits = ...
+
+    if mode = tf.estimator.ModeKeys.EVAL:
+      return tpu_estimator.TPUEstimatorSpec(
+          mode=mode,
+          loss=loss,
+          eval_metrics=(metric_fn, [labels, logits]))
+
+  # or specify the eval_metrics tensors as dict.
+  def model_fn(features, labels, mode, config, params):
+    ...
+    final_layer_output = ...
+
+    if mode = tf.estimator.ModeKeys.EVAL:
+      return tpu_estimator.TPUEstimatorSpec(
+          mode=mode,
+          loss=loss,
+          eval_metrics=(metric_fn, {
+              'labels': labels,
+              'logits': final_layer_output,
+          }))
+  ```
+
+  Prediction
+  ==========
+
+  Prediction on TPU is an experimental feature to support large batch inference.
+  It is not designed for latency-critical system. In addition, due to some
+  usability issues, for prediction with small dataset, CPU `.predict`, i.e.,
+  creating a new `TPUEstimator` instance with `use_tpu=False`, might be more
+  convenient.
+
+  Note: In contrast to TPU training/evaluation, the `input_fn` for prediction
+  *should* raise an end-of-input exception (`OutOfRangeError` or
+  `StopIteration`), which serves as the stopping signal to `TPUEstimator`. To be
+  precise, the ops created by `input_fn` produce one batch of the data.
+  The `predict()` API processes one batch at a time. When reaching the end of
+  the data source, an end-of-input exception should be raised by one of these
+  operations. The user usually does not need to do this manually. As long as the
+  dataset is not repeated forever, the `tf.data` API will raise an end-of-input
+  exception automatically after the last batch has been produced.
+
+  Note: Estimator.predict returns a Python generator. Please consume all the
+  data from the generator so that TPUEstimator can shutdown the TPU system
+  properly for user.
+
+  Current limitations:
+  --------------------
+  1. TPU prediction only works on a single host (one TPU worker).
+
+  2. `input_fn` must return a `Dataset` instance rather than `features`. In
+  fact, .train() and .evaluate() also support Dataset as return value.
+
+  Example (MNIST):
+  ----------------
+  ```
+  height = 32
+  width = 32
+  total_examples = 100
+
+  def predict_input_fn(params):
+    batch_size = params['batch_size']
+
+    images = tf.random_uniform(
+        [total_examples, height, width, 3], minval=-1, maxval=1)
+
+    dataset = tf.data.Dataset.from_tensor_slices(images)
+    dataset = dataset.map(lambda images: {'image': images})
+
+    dataset = dataset.batch(batch_size)
+    return dataset
+
+  def model_fn(features, labels, params, mode):
+     # Generate predictions, called 'output', from features['image']
+
+    if mode == tf.estimator.ModeKeys.PREDICT:
+      return tf.contrib.tpu.TPUEstimatorSpec(
+          mode=mode,
+          predictions={
+              'predictions': output,
+              'is_padding': features['is_padding']
+          })
+
+  tpu_est = TPUEstimator(
+      model_fn=model_fn,
+      ...,
+      predict_batch_size=16)
+
+  # Fully consume the generator so that TPUEstimator can shutdown the TPU
+  # system.
+  for item in tpu_est.predict(input_fn=input_fn):
+    # Filter out item if the `is_padding` is 1.
+    # Process the 'predictions'
+  ```
+
+  Exporting
+  =========
+
+  `export_savedmodel` exports 2 metagraphs, one with `tag_constants.SERVING`,
+  and another with `tag_constants.SERVING` and `tag_constants.TPU`.
+  At serving time, these tags are used to select metagraph to load.
+
+  Before running the graph on TPU, TPU system needs to be initialized. If
+  TensorFlow Serving model-server is used, this is done automatically. If
+  not, please call `session.run(tpu.initialize_system())`.
+
+  `tpu.outside_compilation` can be used to wrap TPU incompatible ops in
+  `model_fn`.
+
+  Example:
+  ----------------
+
+  ```
+  def model_fn(features, labels, mode, config, params):
+    ...
+    logits = ...
+    export_outputs = {
+      'logits': export_output_lib.PredictOutput(
+        {'logits': logits})
+    }
+
+    def host_call(logits):
+      class_ids = math_ops.argmax(logits)
+      classes = string_ops.as_string(class_ids)
+      export_outputs['classes'] =
+        export_output_lib.ClassificationOutput(classes=classes)
+
+    tpu.outside_compilation(host_call, logits)
+
+    ...
+  ```
+
+  """
+
+  def __init__(self,
+               model_fn=None,
+               model_dir=None,
+               config=None,
+               params=None,
+               use_tpu=True,
+               train_batch_size=None,
+               eval_batch_size=None,
+               predict_batch_size=None,
+               batch_axis=None,
+               eval_on_tpu=True,
+               export_to_tpu=True,
+               export_to_cpu=True,
+               warm_start_from=None,
+               experimental_exported_model_uses_all_cores=False,
+               experimental_export_device_assignment=False,
+               experimental_embedding_config_spec=None):
+    """Constructs an `TPUEstimator` instance.
+
+    Args:
+      model_fn: Model function as required by `Estimator` which returns
+        EstimatorSpec or TPUEstimatorSpec. `training_hooks`, 'evaluation_hooks',
+        and `prediction_hooks` must not capure any TPU Tensor inside the
+        model_fn.
+      model_dir: Directory to save model parameters, graph and etc. This can
+        also be used to load checkpoints from the directory into a estimator to
+        continue training a previously saved model. If `None`, the model_dir in
+        `config` will be used if set. If both are set, they must be same. If
+        both are `None`, a temporary directory will be used.
+      config: An `tpu_config.RunConfig` configuration object. Cannot be `None`.
+      params: An optional `dict` of hyper parameters that will be passed into
+        `input_fn` and `model_fn`.  Keys are names of parameters, values are
+        basic python types. There are reserved keys for `TPUEstimator`,
+        including 'batch_size'.
+      use_tpu: A bool indicating whether TPU support is enabled. Currently, -
+        TPU training and evaluation respect this bit, but eval_on_tpu can
+        override execution of eval. See below. - Predict still happens on CPU.
+      train_batch_size: An int representing the global training batch size.
+        TPUEstimator transforms this global batch size to a per-shard batch
+        size, as params['batch_size'], when calling `input_fn` and `model_fn`.
+        Cannot be `None` if `use_tpu` is `True`. Must be divisible by total
+        number of replicas.
+      eval_batch_size: An int representing evaluation batch size. Must be
+        divisible by total number of replicas.
+      predict_batch_size: An int representing the prediction batch size. Must be
+        divisible by total number of replicas.
+      batch_axis: A python tuple of int values describing how each tensor
+        produced by the Estimator `input_fn` should be split across the TPU
+        compute shards. For example, if your input_fn produced (images, labels)
+        where the images tensor is in `HWCN` format, your shard dimensions would
+        be [3, 0], where 3 corresponds to the `N` dimension of your images
+        Tensor, and 0 corresponds to the dimension along which to split the
+        labels to match up with the corresponding images. If None is supplied,
+        and per_host_input_for_training is True, batches will be sharded based
+        on the major dimension. If tpu_config.per_host_input_for_training is
+        False or `PER_HOST_V2`, batch_axis is ignored.
+      eval_on_tpu: If False, evaluation runs on CPU or GPU. In this case, the
+        model_fn must return `EstimatorSpec` when called with `mode` as `EVAL`.
+      export_to_tpu: If True, `export_savedmodel()` exports a metagraph for
+        serving on TPU. Note that unsupported export modes such as EVAL will be
+        ignored. For those modes, only a CPU model will be exported.
+        Currently, export_to_tpu only supports PREDICT.
+      export_to_cpu: If True, `export_savedmodel()` exports a metagraph for
+        serving on CPU.
+      warm_start_from: Optional string filepath to a checkpoint or SavedModel to
+        warm-start from, or a `tf.estimator.WarmStartSettings` object to fully
+        configure warm-starting.  If the string filepath is provided instead of
+        a `WarmStartSettings`, then all variables are warm-started, and it is
+        assumed that vocabularies and Tensor names are unchanged.
+      experimental_exported_model_uses_all_cores: Whether to round-robin among
+        all cores visible to the host which is serving the saved model, or to
+        use a single core. This is a temporary flag to enable using all TPU
+        cores for inference with TPUPartitionedCall(). Once outside compilation
+        is supported in TPUPartitionedCall(), this flag will be enabled by
+        default.
+      experimental_export_device_assignment: Whether to include the device
+        assignment in the exported model. Doing so is useful in case of model
+        parallel inference but will tie the exported model to the TPU topology
+        used to export the model.
+      experimental_embedding_config_spec: Optional EmbeddingConfigSpec instance
+        to support using TPU embedding. IT IS STILL WORK IN PROGRESS, SO PLEASE
+        DO NOT USE.
+
+    Raises:
+      ValueError: `params` has reserved keys already.
+    """
+    if config is None or not isinstance(config, tpu_config.RunConfig):
+      raise ValueError(
+          '`config` must be provided with type `tpu_config.RunConfig`')
+
+    if params is not None and any(k in params for k in _RESERVED_PARAMS_KEYS):
+      raise ValueError('{} are reserved keys but existed in params {}.'.format(
+          _RESERVED_PARAMS_KEYS, params))
+
+    if use_tpu:
+      # Perform some very basic validations. More validations will be found in
+      # _InternalTPUContext.
+      if train_batch_size is None:
+        raise ValueError('`train_batch_size` cannot be `None`')
+      util_lib.check_positive_integer(train_batch_size, 'train_batch_size')
+
+      if (config.tpu_config.per_host_input_for_training is
+          tpu_config.InputPipelineConfig.PER_SHARD_V1 and
+          config.tpu_config.num_cores_per_replica):
+        raise ValueError(
+            'Model parallelism only supports per host input for training. '
+            'Please adjust TPURunconfig.per_host_input_for_training.')
+
+      if eval_batch_size is not None:
+        util_lib.check_positive_integer(eval_batch_size, 'eval_batch_size')
+
+      if predict_batch_size is not None:
+        util_lib.check_positive_integer(predict_batch_size,
+                                        'predict_batch_size')
+
+    # Verifies the model_fn signature according to Estimator framework.
+    estimator_lib._verify_model_fn_args(model_fn, params)  # pylint: disable=protected-access
+    # We cannot store config and params in this constructor as parent
+    # constructor might change them, such as assigning a temp dir for
+    # config.model_dir.
+    model_function = self._augment_model_fn(model_fn, batch_axis)
+
+    # Overwrite log_step_count_steps to disable TensorLoggingHook and
+    # StepCounterHook from being created in Estimator. TPUEstimator already
+    # added equivalent hooks in _augment_model_fn above.
+    self._log_every_n_steps = config.log_step_count_steps
+    config = config.replace(log_step_count_steps=None)
+
+    # Passing non-None params as wrapped model_fn has it.
+    params = params or {}
+    super(TPUEstimator, self).__init__(
+        model_fn=model_function,
+        model_dir=model_dir,
+        config=config,
+        params=params,
+        warm_start_from=warm_start_from)
+    self._iterations_per_training_loop = (
+        self._config.tpu_config.iterations_per_loop)
+
+    # All properties passed to _InternalTPUContext are immutable.
+    # pylint: disable=protected-access
+    self._ctx = tpu_context._get_tpu_context(
+        self._config, train_batch_size, eval_batch_size, predict_batch_size,
+        use_tpu, eval_on_tpu, experimental_embedding_config_spec)
+
+    self._export_to_cpu = export_to_cpu
+    self._export_to_tpu = export_to_tpu
+    self._experimental_exported_model_uses_all_cores = (
+        experimental_exported_model_uses_all_cores)
+    self._experimental_export_device_assignment = (
+        experimental_export_device_assignment)
+    if (experimental_exported_model_uses_all_cores and
+        experimental_export_device_assignment):
+      raise ValueError('experimental_exported_model_uses_all_cores and '
+                       'experimental_export_device_assignment is not supported '
+                       'at the same time.')
+
+    self._is_input_fn_invoked = None
+    self._rendezvous = {}
+
+  def _add_meta_graph_for_mode(self,
+                               builder,
+                               input_receiver_fn_map,
+                               checkpoint_path,
+                               save_variables=True,
+                               mode=model_fn_lib.ModeKeys.PREDICT,
+                               export_tags=None,
+                               check_variables=True):
+    if self._export_to_tpu and mode != model_fn_lib.ModeKeys.PREDICT:
+      logging.warning('TPUEstimator only handles mode PREDICT for exporting '
+                      'when `export_to_tpu` is `True`; Mode {} will be ignored '
+                      'for TPU.'.format(mode))
+
+    if not self._export_to_cpu and not self._export_to_tpu:
+      raise ValueError('One of export_to_cpu and export_to_tpu must be true.')
+
+    if self._export_to_cpu:
+      (super(TPUEstimator, self)._add_meta_graph_for_mode(
+          builder,
+          input_receiver_fn_map,
+          checkpoint_path,
+          save_variables,
+          mode=mode,
+          export_tags=export_tags,
+          check_variables=check_variables))
+
+    if self._export_to_tpu and mode == model_fn_lib.ModeKeys.PREDICT:
+      input_receiver_fn_map = {
+          _REWRITE_FOR_INFERENCE_MODE: input_receiver_fn_map[mode]
+      }
+      export_tags = [tag_constants.SERVING, tag_constants.TPU]
+      mode = _REWRITE_FOR_INFERENCE_MODE
+
+      # See b/110052256 for why `check_variables` is `False`.
+      if not self._export_to_cpu:
+        check_variables = save_variables = True
+      else:
+        check_variables = save_variables = False
+      (super(TPUEstimator, self)._add_meta_graph_for_mode(
+          builder,
+          input_receiver_fn_map,
+          checkpoint_path,
+          save_variables=save_variables,
+          mode=mode,
+          export_tags=export_tags,
+          check_variables=check_variables))
+
+  def _call_model_fn(self, features, labels, mode, config):
+    if mode == _REWRITE_FOR_INFERENCE_MODE:
+      return self._call_model_fn_for_inference(features, labels, mode, config)
+    else:
+      return super(TPUEstimator, self)._call_model_fn(features, labels, mode,
+                                                      config)
+
+  def _call_model_fn_for_inference(self, features, labels, mode, config):
+    """Wraps `_call_model_fn` for `export_savedmodel`."""
+    if mode != _REWRITE_FOR_INFERENCE_MODE:
+      raise ValueError('mode must be {}; '
+                       'got {}.'.format(_REWRITE_FOR_INFERENCE_MODE, mode))
+
+    computation, capture = self._build_computation_for_inference(
+        features, labels, mode, config)
+    tensors = call_computation(
+        computation,
+        experimental_exported_model_uses_all_cores=self
+        ._experimental_exported_model_uses_all_cores)
+    estimator_spec, export_outputs_dict, predictions_dict, none_indices = (
+        capture.get())
+    predictions_list = tensors[:len(predictions_dict)]
+    export_outputs_list_without_none = tensors[len(predictions_dict):]
+
+    # Reinsert `None`s which we've taken out in
+    # `_build_computation_for_inference()`.
+    export_outputs_list = []
+    while none_indices or export_outputs_list_without_none:
+      if none_indices and none_indices[0] == len(export_outputs_list):
+        export_outputs_list.append(None)
+        none_indices.pop(0)
+      else:
+        export_outputs_list.append(export_outputs_list_without_none.pop(0))
+
+    # Reconstruct `export_outputs` with updated tensors.
+    new_export_outputs_dict = nest.pack_sequence_as(export_outputs_dict,
+                                                    export_outputs_list)
+    export_outputs = estimator_spec.export_outputs
+    new_export_outputs = collections.OrderedDict(
+        (k, _clone_export_output_with_tensors(export_outputs[k], v))
+        for k, v in six.iteritems(new_export_outputs_dict))
+    # Reconstruct `predictions` with updated tensors.
+    new_predictions = nest.pack_sequence_as(predictions_dict, predictions_list)
+    if (len(new_predictions) == 1 and
+        _KEY_WHEN_PREDICTIONS_IS_A_TENSOR in new_predictions):
+      new_predictions = new_predictions[_KEY_WHEN_PREDICTIONS_IS_A_TENSOR]
+
+    return estimator_spec._replace(
+        export_outputs=new_export_outputs, predictions=new_predictions)
+
+  def _build_computation_for_inference(self, features, labels, mode, config):
+    capture = _CapturedObject()
+
+    def computation():
+      """Computation to be passed to `TPUPartitionedCall()`."""
+      tpu_computation, tpu_capture = self._build_tpu_computation_for_inference(
+          features, labels, mode, config)
+
+      if self._experimental_export_device_assignment:
+        # Export the device assignment as part of the model. This is useful for
+        # model parallel usecases where the model relies on the mapping between
+        # logical and physical devices.
+        with self._ctx.with_mode(mode) as ctx:
+          device_assignment = ctx.device_assignment
+      else:
+        device_assignment = None
+
+      if self._experimental_exported_model_uses_all_cores:
+        tensors_on_cpu = tpu.rewrite(
+            tpu_computation, device_assignment=device_assignment)
+      else:
+        tensors_on_cpu = tpu.rewrite_for_inference(
+            tpu_computation, device_assignment=device_assignment)
+
+      (estimator_spec, export_outputs_dict, export_outputs_list,
+       predictions_dict) = (
+           tpu_capture.get())
+      predictions_list = tensors_on_cpu[:len(predictions_dict)]
+      export_outputs_tpu_on_cpu_list = tensors_on_cpu[len(predictions_dict):]
+
+      # Reconstruct tensors used in export_outputs, with TPU tensors replaced
+      # with their CPU counterpart returned from `rewrite_for_inference()`.
+      # `function.Defun()` does not like `None`s in return values, so we leave
+      # `None`s out but record their positions for later reconstruction.
+      export_outputs_list_without_none = []
+      none_indices = []
+      for i, t in enumerate(export_outputs_list):
+        if t is None:
+          none_indices.append(i)
+        else:
+          export_outputs_list_without_none.append(
+              export_outputs_tpu_on_cpu_list.pop(0))
+
+      capture.capture((estimator_spec, export_outputs_dict, predictions_dict,
+                       none_indices))
+      return predictions_list + export_outputs_list_without_none
+
+    return computation, capture
+
+  def _build_tpu_computation_for_inference(self, features, labels, mode,
+                                           config):
+    capture = _CapturedObject()
+
+    def computation():
+      """Compute tpu tensors used in export_outputs.
+
+      Passed to rewrite_for_inference so that model_fn will be called under
+      the rewriting contexts. Only tpu tensors are returned, but export_outputs
+      and scaffold are captured.
+
+      Returns:
+         A list of Tensors used in export_outputs and not marked for
+         outside_compilation.
+      """
+      # We should only call model fn once and it should be inside `computation`
+      # so that building the graph will happen under `rewrite_for_inference`.
+      mode = model_fn_lib.ModeKeys.PREDICT
+      estimator_spec = self._call_model_fn(features, labels, mode, config)
+
+      # We pick the TPU tensors out from `export_output` and later return them
+      # from `computation` for rewriting.
+      export_outputs_dict = collections.OrderedDict(
+          (k, _export_output_to_tensors(v))
+          for k, v in six.iteritems(estimator_spec.export_outputs))
+      export_outputs_list = nest.flatten(export_outputs_dict)
+      export_outputs_tpu_list = [
+          t for t in export_outputs_list if t is not None
+      ]
+
+      if isinstance(estimator_spec.predictions, dict):
+        predictions_dict = collections.OrderedDict(
+            (k, v) for k, v in six.iteritems(estimator_spec.predictions))
+      else:
+        predictions_dict = {
+            _KEY_WHEN_PREDICTIONS_IS_A_TENSOR: estimator_spec.predictions
+        }
+      predictions_list = nest.flatten(predictions_dict)
+
+      # We cannot return everything we want through the return values, so
+      # capture the rest here for later use.
+      capture.capture((estimator_spec, export_outputs_dict, export_outputs_list,
+                       predictions_dict))
+      return predictions_list + export_outputs_tpu_list
+
+    return computation, capture
+
+  def _create_global_step(self, graph):
+    """Creates a global step suitable for TPUs.
+
+    Args:
+      graph: The graph in which to create the global step.
+
+    Returns:
+      A global step `Tensor`.
+
+    Raises:
+      ValueError: if the global step tensor is already defined.
+    """
+    return _create_global_step(graph)
+
+  def _convert_train_steps_to_hooks(self, steps, max_steps):
+    with self._ctx.with_mode(model_fn_lib.ModeKeys.TRAIN) as ctx:
+      if ctx.is_running_on_cpu():
+        return super(TPUEstimator, self)._convert_train_steps_to_hooks(
+            steps, max_steps)
+
+    # On TPU.
+    if steps is None and max_steps is None:
+      raise ValueError(
+          'For TPU training, one of `steps` or `max_steps` must be set. '
+          'Cannot be both `None`.')
+
+    # Estimator.train has explicit positiveness check.
+    if steps is not None:
+      util_lib.check_positive_integer(steps, 'Train steps')
+    if max_steps is not None:
+      util_lib.check_positive_integer(max_steps, 'Train max_steps')
+
+    return [
+        _TPUStopAtStepHook(self._iterations_per_training_loop, steps, max_steps)
+    ]
+
+  def _convert_eval_steps_to_hooks(self, steps):
+    with self._ctx.with_mode(model_fn_lib.ModeKeys.EVAL) as ctx:
+      if ctx.is_running_on_cpu():
+        return super(TPUEstimator, self)._convert_eval_steps_to_hooks(steps)
+
+    if steps is None:
+      raise ValueError('Evaluate `steps` must be set on TPU. Cannot be `None`.')
+
+    util_lib.check_positive_integer(steps, 'Eval steps')
+
+    return [
+        evaluation._StopAfterNEvalsHook(  # pylint: disable=protected-access
+            num_evals=steps),
+        _SetEvalIterationsHook(steps)
+    ]
+
+  def _call_input_fn(self, input_fn, mode):
+    """Calls the input function.
+
+    Args:
+      input_fn: The input function.
+      mode: ModeKeys
+
+    Returns:
+      In TPU mode, returns an input_fn to be called later in model_fn.
+      Otherwise, calls the input_fn and returns either fatures or
+        (features, labels).
+
+    Raises:
+      ValueError: if input_fn takes invalid arguments or does not have `params`.
+    """
+    input_fn_args = function_utils.fn_args(input_fn)
+    config = self.config  # a deep copy.
+    kwargs = {}
+    if 'params' in input_fn_args:
+      kwargs['params'] = self.params  # a deep copy.
+    else:
+      raise ValueError('input_fn ({}) does not include params argument, '
+                       'required by TPUEstimator to pass batch size as '
+                       'params["batch_size"]'.format(input_fn))
+    if 'config' in input_fn_args:
+      kwargs['config'] = config
+
+    if 'mode' in input_fn_args:
+      kwargs['mode'] = mode
+
+    # Records the fact input_fn has been invoked.
+    self._is_input_fn_invoked = True
+
+    with self._ctx.with_mode(mode) as ctx:
+      # Setting the batch size in params first. This helps user to have same
+      # input_fn for use_tpu=True/False.
+      batch_size_for_input_fn = ctx.batch_size_for_input_fn
+      if batch_size_for_input_fn is not None:
+        _add_item_to_params(kwargs['params'], _BATCH_SIZE_KEY,
+                            batch_size_for_input_fn)
+
+      # For export_savedmodel, input_fn is never passed to Estimator. So,
+      # `is_export_mode` must be False.
+      if ctx.is_running_on_cpu(is_export_mode=False):
+        with ops.device('/device:CPU:0'):
+          return input_fn(**kwargs)
+
+      # For TPU computation, input_fn should be invoked in a tf.while_loop for
+      # performance. While constructing the tf.while_loop, the structure of
+      # inputs returned by the `input_fn` needs to be recorded. The structure
+      # includes whether features or labels is dict or single Tensor, dict keys,
+      # tensor shapes, and dtypes. The recorded structure is used to create the
+      # infeed dequeue ops, which must be wrapped and passed as a Fn, called
+      # inside the TPU computation, as the TPU computation is wrapped inside a
+      # tf.while_loop also. So, we either pass input_fn to model_fn or pass
+      # dequeue_fn to model_fn. Here, `input_fn` is passed directly as
+      # `features` in `model_fn` signature.
+      def _input_fn(ctx):
+        _add_item_to_params(kwargs['params'], _CTX_KEY, ctx)
+        return input_fn(**kwargs)
+
+      return _input_fn
+
+  def _validate_features_in_predict_input(self, result):
+    """Skip the validation.
+
+    For TPUEstimator, we do not need to check the result type. `_InputPipeline`
+    has stronger check. Parent class's check generates confusing warning msg.
+
+    Args:
+      result: `features` returned by input_fn.
+    """
+    pass
+
+  def train(self,
+            input_fn,
+            hooks=None,
+            steps=None,
+            max_steps=None,
+            saving_listeners=None):
+    rendezvous = error_handling.ErrorRendezvous(num_sources=3)
+    self._rendezvous[model_fn_lib.ModeKeys.TRAIN] = rendezvous
+    try:
+      return super(TPUEstimator, self).train(
+          input_fn=input_fn,
+          hooks=hooks,
+          steps=steps,
+          max_steps=max_steps,
+          saving_listeners=saving_listeners)
+    except Exception:  # pylint: disable=broad-except
+      rendezvous.record_error('training_loop', sys.exc_info())
+    finally:
+      rendezvous.record_done('training_loop')
+      rendezvous.raise_errors()
+
+  def evaluate(self,
+               input_fn,
+               steps=None,
+               hooks=None,
+               checkpoint_path=None,
+               name=None):
+    rendezvous = error_handling.ErrorRendezvous(num_sources=3)
+    self._rendezvous[model_fn_lib.ModeKeys.EVAL] = rendezvous
+    try:
+      return super(TPUEstimator, self).evaluate(
+          input_fn,
+          steps=steps,
+          hooks=hooks,
+          checkpoint_path=checkpoint_path,
+          name=name)
+    except Exception:  # pylint: disable=broad-except
+      rendezvous.record_error('evaluation_loop', sys.exc_info())
+    finally:
+      rendezvous.record_done('evaluation_loop')
+      rendezvous.raise_errors()
+
+  def predict(self,
+              input_fn,
+              predict_keys=None,
+              hooks=None,
+              checkpoint_path=None,
+              yield_single_examples=True):
+    rendezvous = error_handling.ErrorRendezvous(num_sources=3)
+    self._rendezvous[model_fn_lib.ModeKeys.PREDICT] = rendezvous
+    try:
+      for result in super(TPUEstimator, self).predict(
+          input_fn=input_fn,
+          predict_keys=predict_keys,
+          hooks=hooks,
+          checkpoint_path=checkpoint_path,
+          yield_single_examples=yield_single_examples):
+        yield result
+    except Exception:  # pylint: disable=broad-except
+      rendezvous.record_error('prediction_loop', sys.exc_info())
+    finally:
+      rendezvous.record_done('prediction_loop')
+      rendezvous.raise_errors()
+
+    rendezvous.record_done('prediction_loop')
+    rendezvous.raise_errors()
+
+  def _augment_model_fn(self, model_fn, batch_axis):
+    """Returns a new model_fn, which wraps the TPU support."""
+
+    def _model_fn(features, labels, mode, config, params):
+      """A Estimator `model_fn` for TPUEstimator."""
+      with self._ctx.with_mode(mode) as ctx:
+        model_fn_wrapper = _ModelFnWrapper(model_fn, config, params, ctx)
+
+        # `input_fn` is called in `train()`, `evaluate()`, and `predict()`,
+        # but not in `export_savedmodel()`.
+        if self._is_input_fn_invoked:
+          is_export_mode = False
+        else:
+          is_export_mode = True
+
+        # Clear the bit.
+        self._is_input_fn_invoked = None
+
+        # examples_hook is added to training_hooks for both CPU and TPU
+        # execution.
+        if self._log_every_n_steps is not None:
+          examples_hook = ExamplesPerSecondHook(
+              ctx.global_batch_size,
+              # pylint:disable=g-long-ternary
+              output_dir=(self.model_dir
+                          if not config or config.save_summary_steps
+                          else None),
+              # pylint:enable=g-long-ternary
+              every_n_steps=self._log_every_n_steps)
+
+        if ctx.is_running_on_cpu(is_export_mode=is_export_mode):
+          logging.info('Running %s on CPU', mode)
+          estimator_spec = model_fn_wrapper.call_without_tpu(
+              features, labels, is_export_mode=is_export_mode)
+          if self._log_every_n_steps is not None:
+            estimator_spec = estimator_spec._replace(
+                training_hooks=estimator_spec.training_hooks + (examples_hook,))
+          return estimator_spec
+
+        assert labels is None, '`labels` passed to `model_fn` must be `None`.'
+        # TPUEstimator._call_input_fn passes `input_fn` as features to here.
+        assert callable(features), '`input_fn` is not callable.'
+        input_fn = features
+
+        tpu_init_ops = []
+        if ctx.embedding_config and mode == model_fn_lib.ModeKeys.TRAIN:
+          dummy_table_variables, dummy_table_variables_init = (
+              tpu_embedding_gradient.create_dummy_table_variables(
+                  ctx.embedding_config.tpu_embedding))
+          ctx.embedding_config.dummy_table_variables = dummy_table_variables
+          tpu_init_ops.append(dummy_table_variables_init)
+
+        input_holders = _InputPipeline(input_fn, batch_axis, ctx)
+        enqueue_ops, dequeue_fn, input_hooks, run_infeed_loop_on_coordinator = (
+            input_holders.generate_infeed_enqueue_ops_and_dequeue_fn())
+
+        graph = ops.get_default_graph()
+        for enqueue_op in enqueue_ops:
+          if isinstance(enqueue_op, list):
+            graph.get_collection_ref(_TPU_ENQUEUE_OPS).extend(enqueue_op)
+          else:
+            graph.add_to_collection(_TPU_ENQUEUE_OPS, enqueue_op)
+
+        if mode == model_fn_lib.ModeKeys.TRAIN:
+          compile_op, loss, host_call, scaffold, training_hooks = (
+              _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn))
+          if ctx.embedding_config:
+            g = ops.get_default_graph()
+            table_to_config_dict = (
+                ctx.embedding_config.tpu_embedding.table_to_config_dict)
+            optimization_parameters = (
+                ctx.embedding_config.tpu_embedding.optimization_parameters)
+            embedding_variable_name_by_table, slot_variable_names_by_table = (
+                _tpu_estimator_embedding.get_full_variable_names(
+                    g, table_to_config_dict, optimization_parameters
+                )
+            )
+            embedding_variables_and_ops = (
+                ctx.embedding_config.tpu_embedding.create_variables_and_ops(
+                    embedding_variable_name_by_table,
+                    slot_variable_names_by_table
+                ))
+            tpu_init_ops.extend(embedding_variables_and_ops.load_ops())
+
+          host_ops = host_call.create_tpu_hostcall()
+          if host_ops is None:
+            host_ops = []
+
+          shutdown_hooks = []
+          shutdown_mode = os.environ.get('TF_TPU_GRACEFUL_SHUTDOWN_MODE',
+                                         'shutdown_worker')
+          if shutdown_mode:
+            if shutdown_mode == 'shutdown_worker':
+              finalizer_hooks = [
+                  session_support.ShutdownLameWorkers(timeout_ms=60 * 1000),
+              ]
+            elif shutdown_mode == 'shutdown_computation':
+              finalizer_hooks = [
+                  session_support.RestartComputation(timeout_ms=60 * 1000),
+              ]
+            else:
+              raise ValueError(
+                  'Unknown TF_TPU_GRACEFUL_SHUTDOWN_MODE "%s"' % shutdown_mode)
+
+            shutdown_hooks.append(
+                session_support.GracefulShutdownHook(
+                    checkpoint_prefix=self.model_dir + '/model.ckpt',
+                    on_shutdown_hooks=finalizer_hooks))
+
+          with ops.control_dependencies([loss]):
+            global_step = array_ops.identity(training.get_global_step())
+          hooks = input_hooks + shutdown_hooks
+          hooks.extend([
+              TPUInfeedOutfeedSessionHook(
+                  ctx,
+                  enqueue_ops,
+                  host_ops,
+                  tpu_compile_op=compile_op,
+                  run_infeed_loop_on_coordinator=(
+                      run_infeed_loop_on_coordinator),
+                  rendezvous=self._rendezvous[mode],
+                  master=self._config.master,
+                  session_config=self._session_config,
+                  tpu_init_ops=tpu_init_ops),
+              InstallSignalHandlerHook()
+          ])
+          if self._log_every_n_steps is not None:
+            logging_hook_frequency = (  # Divide and round up
+                (self._log_every_n_steps +
+                 self._config.tpu_config.iterations_per_loop - 1) //
+                self._config.tpu_config.iterations_per_loop)
+            hooks.append(
+                training.LoggingTensorHook({
+                    'loss': array_ops.identity(loss),
+                    'step': global_step,
+                },
+                                           every_n_iter=logging_hook_frequency))
+            examples_hook._set_steps_per_run(  # pylint: disable=protected-access
+                self._config.tpu_config.iterations_per_loop)
+            hooks.append(examples_hook)
+
+          if training_hooks:
+            hooks.extend(training_hooks)
+
+          chief_hooks = []
+          if (self._config.save_checkpoints_secs or
+              self._config.save_checkpoints_steps):
+            checkpoint_hook = training.CheckpointSaverHook(
+                self.model_dir,
+                save_secs=self._config.save_checkpoints_secs,
+                save_steps=self._config.save_checkpoints_steps,
+                scaffold=scaffold)
+            checkpoint_hook._set_steps_per_run(  # pylint: disable=protected-access
+                self._config.tpu_config.iterations_per_loop)
+            chief_hooks.append(checkpoint_hook)
+
+          summary.scalar(model_fn_lib.LOSS_METRIC_KEY, loss)
+          with ops.control_dependencies([loss]):
+            update_ops = _sync_variables_ops(ctx)
+            if ctx.embedding_config:
+              update_ops.extend(embedding_variables_and_ops.retrieve_ops())
+
+          # Validate the TPU training graph to catch basic errors
+          _validate_tpu_training_graph()
+
+          train_op = control_flow_ops.group(*update_ops)
+          graph.add_to_collection(_TPU_TRAIN_OP, train_op)
+
+          return model_fn_lib.EstimatorSpec(
+              mode,
+              loss=loss,
+              training_chief_hooks=chief_hooks,
+              training_hooks=hooks,
+              train_op=train_op,
+              scaffold=scaffold)
+
+        if mode == model_fn_lib.ModeKeys.EVAL:
+          compile_op, total_loss, host_calls, scaffold, eval_hooks = (
+              _eval_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn))
+          iterations_per_loop_var = _create_or_get_iterations_per_loop()
+          mean_loss = math_ops.div(
+              total_loss,
+              math_ops.cast(iterations_per_loop_var, dtype=total_loss.dtype))
+
+          with ops.control_dependencies([mean_loss]):
+            # After TPU evaluation computation is done (the mean_loss tensor),
+            # reads all variables back from TPU and updates the eval step
+            # counter properly
+            internal_ops_to_run = _sync_variables_ops(ctx)
+            internal_ops_to_run.append(
+                _increase_eval_step_op(iterations_per_loop_var))
+
+          host_call_ret = host_calls.create_tpu_hostcall()
+          eval_metric_ops = {}
+          eval_update_ops = []
+
+          eval_metrics = host_call_ret.get('eval_metrics', {})
+          if eval_metrics:
+            # Creates a dummy metric update_op for all metrics. Estimator
+            # expects all metrics in `eval_metric_ops` have update_op and calls
+            # them one by one. The real metric update_ops are invoked in a
+            # separated thread. So, here give Estimator the dummy op for all
+            # metrics.
+            with ops.control_dependencies(internal_ops_to_run):
+              dummy_update_op = control_flow_ops.no_op()
+
+            for k, v in eval_metrics.items():
+              eval_metric_ops[k] = (v[0], dummy_update_op)
+              eval_update_ops.append(v[1])
+          else:
+            # If no eval metrics are passed, create an identity node for the
+            # loss and add `internal_ops_to_run` to its dependencies. So
+            # `internal_ops_to_run` can be executed.
+            with ops.control_dependencies(internal_ops_to_run):
+              mean_loss = array_ops.identity(mean_loss)
+
+          if 'host_call' not in host_call_ret:
+            host_ops = []
+          else:
+            host_ops = host_call_ret['host_call']
+          hooks = [
+              TPUInfeedOutfeedSessionHook(
+                  ctx,
+                  enqueue_ops,
+                  eval_update_ops + host_ops,
+                  tpu_compile_op=compile_op,
+                  run_infeed_loop_on_coordinator=(
+                      run_infeed_loop_on_coordinator),
+                  rendezvous=self._rendezvous[mode],
+                  master=self._config.evaluation_master,
+                  session_config=self._session_config,
+                  tpu_init_ops=tpu_init_ops)
+          ] + input_hooks
+
+          if eval_hooks:
+            hooks.extend(eval_hooks)
+
+          return model_fn_lib.EstimatorSpec(
+              mode,
+              loss=mean_loss,
+              evaluation_hooks=hooks,
+              eval_metric_ops=eval_metric_ops,
+              scaffold=scaffold)
+
+        # Predict
+        assert mode == model_fn_lib.ModeKeys.PREDICT
+
+        (compile_op, dummy_predict_op, host_calls,
+         scaffold, prediction_hooks) = _predict_on_tpu_system(
+             ctx, model_fn_wrapper, dequeue_fn)
+        with ops.control_dependencies([dummy_predict_op]):
+          internal_ops_to_run = _sync_variables_ops(ctx)
+          with ops.control_dependencies(internal_ops_to_run):
+            dummy_predict_op = control_flow_ops.no_op()
+
+        # In train and evaluation, the main TPU program is passed to monitored
+        # training session to run. Infeed enqueue and outfeed dequeue are
+        # executed in side threads. This is not the configuration for
+        # prediction mode.
+        #
+        # For prediction, the Estimator executes the EstimatorSpec.predictions
+        # directly and yield the element (via generator) to call site. So, the
+        # outfeed based prediction must be passed to MonitoredSession directly.
+        # Other parts of the TPU execution are organized as follows.
+        #
+        # 1. All outfeed based Tensors must be grouped with predictions Tensors
+        #    to form a single invocation. This avoid the issue we might trigger
+        #    multiple outfeeds incorrectly. To achieve this, `host_call` is
+        #    placed in control_dependencies of `stopping_signals`, and
+        #    `stopping_signals` is passed into _StoppingPredictHook, which sets
+        #    the `stopping_signals` as SessionRunArgs. MonitoredSession merges
+        #    all SessionRunArgs with the fetch in session.run together.
+        #
+        # 2. The TPU program (dummy_predict_op) and enqueue_ops (infeed Enqueue)
+        #    are grouped together. They will be launched once and only once in
+        #    side threads and they quit naturally according to the SAME stopping
+        #    condition.
+        enqueue_ops.append(dummy_predict_op)
+
+        host_call_ret = host_calls.create_tpu_hostcall()
+        if 'host_call' not in host_call_ret:
+          host_ops = []
+        else:
+          host_ops = host_call_ret['host_call']
+
+        predictions = host_call_ret['predictions']
+        _verify_cross_hosts_transfer_size(
+            predictions,
+            message=(
+                'The estimated size for TPUEstimatorSpec.predictions is too '
+                'large.'))
+        signals = host_call_ret['signals']
+
+        with ops.control_dependencies(host_ops):
+          host_ops = []  # Empty, we do do not need it anymore.
+          scalar_stopping_signal = _StopSignals.as_scalar_stopping_signal(
+              signals)
+          predictions = _PaddingSignals.slice_tensor_or_dict(
+              predictions, signals)
+
+        hooks = [
+            _StoppingPredictHook(scalar_stopping_signal),
+            TPUInfeedOutfeedSessionHookForPrediction(
+                ctx, enqueue_ops, host_ops, rendezvous=self._rendezvous[mode],
+                tpu_compile_op=compile_op,
+                master=self._config.master,
+                session_config=self._session_config),
+        ] + input_hooks
+
+        if prediction_hooks:
+          hooks.extend(prediction_hooks)
+
+        return model_fn_lib.EstimatorSpec(
+            mode,
+            prediction_hooks=hooks,
+            predictions=predictions,
+            scaffold=scaffold)
+
+    return _model_fn
+
+
+def _export_output_to_tensors(export_output):
+  """Get a list of `Tensors` used in `export_output`.
+
+  Args:
+    export_output: an `ExportOutput` object such as `ClassificationOutput`,
+      `RegressionOutput`, or `PredictOutput`.
+
+  Returns:
+    a list of tensors used in export_output.
+
+  Raises:
+    ValueError: if `export_output` is not one of `ClassificationOutput`,
+        `RegressionOutput`, or `PredictOutput`.
+  """
+  if isinstance(export_output, export_output_lib.ClassificationOutput):
+    return [export_output.scores, export_output.classes]
+  elif isinstance(export_output, export_output_lib.RegressionOutput):
+    return [export_output.value]
+  elif isinstance(export_output, export_output_lib.PredictOutput):
+    return list(export_output.outputs.values())
+  else:
+    raise ValueError(
+        '`export_output` must be have type `ClassificationOutput`, '
+        '`RegressionOutput`, or `PredictOutput`; got {}.'.format(export_output))
+
+
+def _clone_export_output_with_tensors(export_output, tensors):
+  """Clones `export_output` but with new `tensors`.
+
+  Args:
+    export_output: an `ExportOutput` object such as `ClassificationOutput`,
+      `RegressionOutput`, or `PredictOutput`.
+    tensors: a list of `Tensors` used to construct a new `export_output`.
+
+  Returns:
+    A dict similar to `export_output` but with `tensors`.
+
+  Raises:
+    ValueError: if `export_output` is not one of `ClassificationOutput`,
+        `RegressionOutput`, or `PredictOutput`.
+  """
+  if isinstance(export_output, export_output_lib.ClassificationOutput):
+    if len(tensors) != 2:
+      raise ValueError('tensors must be of length 2; '
+                       'got {}.'.format(len(tensors)))
+    return export_output_lib.ClassificationOutput(*tensors)
+  elif isinstance(export_output, export_output_lib.RegressionOutput):
+    if len(tensors) != 1:
+      raise ValueError('tensors must be of length 1; '
+                       'got {}'.format(len(tensors)))
+    return export_output_lib.RegressionOutput(*tensors)
+  elif isinstance(export_output, export_output_lib.PredictOutput):
+    return export_output_lib.PredictOutput(
+        dict(zip(export_output.outputs.keys(), tensors)))
+  else:
+    raise ValueError(
+        '`export_output` must be have type `ClassificationOutput`, '
+        '`RegressionOutput`, or `PredictOutput`; got {}.'.format(export_output))
+
+
+def _eval_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
+  """Executes `model_fn_wrapper` multiple times on all TPU shards."""
+  iterations_per_loop_var = _create_or_get_iterations_per_loop()
+
+  (single_tpu_eval_step, host_calls, captured_scaffold_fn, captured_eval_hooks
+  ) = model_fn_wrapper.convert_to_single_tpu_eval_step(dequeue_fn)
+
+  def multi_tpu_eval_steps_on_single_shard():
+    return training_loop.repeat(iterations_per_loop_var, single_tpu_eval_step,
+                                [_ZERO_LOSS])
+
+  (compile_op, loss,) = tpu.split_compile_and_shard(
+      multi_tpu_eval_steps_on_single_shard,
+      inputs=[],
+      num_shards=ctx.num_replicas,
+      outputs_from_all_shards=False,
+      device_assignment=ctx.device_assignment)
+
+  loss = loss[0]
+  scaffold = _get_scaffold(captured_scaffold_fn)
+  return compile_op, loss, host_calls, scaffold, captured_eval_hooks.get()
+
+
+def _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
+  """Executes `model_fn_wrapper` multiple times on all TPU shards."""
+  iterations_per_loop_var = _create_or_get_iterations_per_loop()
+
+  (single_tpu_train_step, host_call, captured_scaffold_fn,
+   captured_training_hooks) = (
+       model_fn_wrapper.convert_to_single_tpu_train_step(dequeue_fn))
+
+  @tpu_function.on_device_training_loop
+  def multi_tpu_train_steps_on_single_shard():
+    return training_loop.repeat(iterations_per_loop_var, single_tpu_train_step,
+                                [_INITIAL_LOSS])
+
+  (compile_op, loss,) = tpu.split_compile_and_shard(
+      multi_tpu_train_steps_on_single_shard,
+      inputs=[],
+      num_shards=ctx.num_replicas,
+      outputs_from_all_shards=False,
+      device_assignment=ctx.device_assignment)
+
+  loss = loss[0]
+  scaffold = _get_scaffold(captured_scaffold_fn)
+  return compile_op, loss, host_call, scaffold, captured_training_hooks.get()
+
+
+def _predict_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
+  """Executes `model_fn_wrapper` multiple times on all TPU shards."""
+  (single_tpu_predict_step, host_calls, captured_scaffold_fn,
+   captured_predict_hooks
+  ) = model_fn_wrapper.convert_to_single_tpu_predict_step(dequeue_fn)
+
+  @tpu_function.on_device_training_loop
+  def multi_tpu_predict_steps_on_single_shard():
+
+    def cond(scalar_stopping_signal):
+      return math_ops.logical_not(
+          _StopSignals.should_stop(scalar_stopping_signal))
+
+    inputs = [_StopSignals.NON_STOPPING_SIGNAL]
+    outputs = training_loop.while_loop(
+        cond, single_tpu_predict_step, inputs=inputs, name=b'loop')
+    return outputs
+
+  (compile_op, dummy_predict_op,) = tpu.split_compile_and_shard(
+      multi_tpu_predict_steps_on_single_shard,
+      inputs=[],
+      num_shards=ctx.num_replicas,
+      outputs_from_all_shards=False,
+      device_assignment=ctx.device_assignment)
+
+  dummy_predict_op = dummy_predict_op[0]
+  scaffold = _get_scaffold(captured_scaffold_fn)
+  return (compile_op, dummy_predict_op, host_calls, scaffold,
+          captured_predict_hooks.get())
+
+
+def _wrap_computation_in_while_loop(device, op_fn):
+  """Wraps the ops generated by `op_fn` in tf.while_loop."""
+
+  def computation(i):
+    with ops.control_dependencies(op_fn()):
+      return i + 1
+
+  iterations_per_loop_var = _create_or_get_iterations_per_loop()
+  # By setting parallel_iterations=1, the parallel execution in while_loop is
+  # basically turned off.
+  with ops.device(device):
+    iterations = array_ops.identity(iterations_per_loop_var)
+    return control_flow_ops.while_loop(
+        lambda i: i < iterations,
+        computation, [constant_op.constant(0)],
+        parallel_iterations=1)
+
+
+def _wrap_computation_in_while_loop_with_stopping_signals(device, op_fn):
+  """Wraps the ops generated by `op_fn` in tf.while_loop."""
+
+  def cond(scalar_stopping_signal):
+    return math_ops.logical_not(
+        _StopSignals.should_stop(scalar_stopping_signal))
+
+  def computation(unused_scalar_stopping_signal):
+    return_value = op_fn()
+    execute_ops = return_value['ops']
+    signals = return_value['signals']
+    with ops.control_dependencies(execute_ops):
+      return _StopSignals.as_scalar_stopping_signal(signals)
+
+  # By setting parallel_iterations=1, the parallel execution in while_loop is
+  # basically turned off.
+  with ops.device(device):
+    return control_flow_ops.while_loop(
+        cond,
+        computation, [_StopSignals.NON_STOPPING_SIGNAL],
+        parallel_iterations=1)
+
+
+def _validate_tpu_training_graph():
+  """Validate graph before running distributed training.
+
+  Raises:
+    ValueError: If the graph seems invalid for running on device
+  """
+  operations = ops.get_default_graph().get_operations()
+
+  # Check if there is atleast one CrossReplicaSum operation in the graph
+  # This should be introduced by using the CrossShardOptimizer wrapper
+  cross_replica_sum_ops = [
+      o for o in operations if o.type == _CROSS_REPLICA_SUM_OP
+  ]
+  if not cross_replica_sum_ops:
+    raise ValueError(
+        'CrossShardOptimizer must be used for model training on TPUs.')
+
+
+class _CapturedObject(object):
+  """A placeholder to capture an object.
+
+  This is useful when we need to capture a Python object in the Tensorflow
+  control flow body function and use it outside the control flow.
+  """
+
+  def __init__(self):
+    self._object = None
+    self._captured = False
+
+  def capture(self, o):
+    if self._captured:
+      raise RuntimeError(
+          'InternalError: Object can capture only once. Please file bug.')
+
+    self._captured = True
+    self._object = o
+
+  def get(self):
+    if not self._captured:
+      raise RuntimeError(
+          'InternalError: Object is not captured properly before `get`. '
+          'Please file bug.')
+    return self._object
+
+
+def _get_scaffold(captured_scaffold_fn):
+  """Retrieves the Scaffold from `captured_scaffold_fn`."""
+  with _CapturingContext(message='Inside scaffold_fn'):
+    scaffold_fn = captured_scaffold_fn.get()
+    if scaffold_fn:
+      scaffold = scaffold_fn()
+      if scaffold is None:
+        raise ValueError(
+            'TPUEstimatorSpec.scaffold_fn returns None, which is not allowed')
+    else:
+      scaffold = None
+
+  if scaffold:
+    wrapped_finalize = scaffold.finalize
+
+    def _finalize():
+      with _CapturingContext('Inside Scaffold.finalize'):
+        wrapped_finalize()
+
+    scaffold.finalize = _finalize
+  return scaffold
+
+
+class _CapturingContext(control_flow_ops.ControlFlowContext):
+  """Tracks references to Tensors defined in TPU replication."""
+
+  def __init__(self, message):
+    control_flow_ops.ControlFlowContext.__init__(self)
+    self._message = message
+
+  def to_control_flow_context_def(self, context_def, export_scope=None):
+    # pylint: disable=useless-super-delegation
+    # NOTE(slebedev): the method is required by `ControlFlowContext`.
+    super(_CapturingContext, self).to_control_flow_context_def(
+        context_def, export_scope)
+
+  def AddOp(self, op):  # pylint: disable=invalid-name
+    for c in op.inputs:
+      if tpu._TPU_REPLICATE_ATTR in c.op.node_def.attr:  # pylint: disable=protected-access
+        raise ValueError('{}: Op {} depends on TPU computation {}, '
+                         'which is not allowed.'.format(self._message, op, c))
+
+  def __enter__(self):
+    # pylint: disable=protected-access
+    self._g = ops.get_default_graph()
+    self._old = self._g._get_control_flow_context()
+    self._g._set_control_flow_context(self)
+    # pylint: enable=protected-access
+
+  def __exit__(self, _, __, ___):  # pylint: disable=invalid-name
+    self._g._set_control_flow_context(self._old)  # pylint: disable=protected-access
+
+
+class _Inputs(object):
+  """A data structure representing the input_fn returned values.
+
+  This also supports the returned value from input_fn as `Dataset`.
+  """
+
+  def __init__(self, features=None, labels=None, dataset=None, signals=None):
+    if dataset is not None and (features is not None or labels is not None or
+                                signals is not None):
+      raise RuntimeError('Internal Error: Either (features and labels) or '
+                         'dataset should be provided, not both. Please file '
+                         'bug')
+
+    self._features = features
+    self._labels = labels
+    self._signals = signals
+
+    self._dataset = dataset
+    self._iterator = None
+
+  @staticmethod
+  def from_input_fn(return_values):
+    """Returns an `_Inputs` instance according to `input_fn` return value."""
+    if isinstance(return_values, dataset_ops.DatasetV2):
+      dataset = return_values
+      return _Inputs(dataset=dataset)
+
+    features, labels = _Inputs._parse_inputs(return_values)
+    return _Inputs(features, labels)
+
+  @staticmethod
+  def _parse_inputs(return_values):
+    if isinstance(return_values, tuple):
+      features, labels = return_values
+    else:
+      features, labels = return_values, None
+    return features, labels
+
+  @property
+  def is_dataset(self):
+    """Returns True if the return value from input_fn is Dataset."""
+    return self._dataset is not None
+
+  def dataset_initializer(self):
+    """Returns the dataset's initializer.
+
+    The initializer must be run before calling `features_and_labels`.
+    """
+    self._iterator = dataset_ops.make_initializable_iterator(self._dataset)
+    return self._iterator.initializer
+
+  def features_and_labels(self):
+    """Gets `features` and `labels`."""
+    if self.is_dataset:
+      if self._iterator is None:
+        raise RuntimeError('Internal error: Must run dataset_initializer '
+                           'before calling features_and_labels(). Please file '
+                           'a bug!')
+      return _Inputs._parse_inputs(self._iterator.get_next())
+
+    return (self._features, self._labels)
+
+  def signals(self):
+    return self._signals
+
+  @property
+  def dataset(self):
+    return self._dataset
+
+
+class _InputsWithStoppingSignals(_Inputs):
+  """Inputs with `_StopSignals` inserted into the dataset."""
+
+  def __init__(self,
+               dataset,
+               batch_size,
+               add_padding=False,
+               num_invocations_per_step=1):
+
+    assert dataset is not None
+    user_provided_dataset = dataset.map(
+        _InputsWithStoppingSignals.insert_stopping_signal(
+            stop=False, batch_size=batch_size, add_padding=add_padding))
+    if num_invocations_per_step == 1:
+      final_batch_dataset = dataset.take(1).map(
+          _InputsWithStoppingSignals.insert_stopping_signal(
+              stop=True, batch_size=batch_size, add_padding=add_padding))
+    else:
+      # We append (2 * num_invocations_per_step - 1) batches for exhausting the
+      # user_provided_dataset and stop properly.
+      # For example, if num_invocations_per_step is 2, we append 3 additional
+      # padding batches: b1, b2, b3.
+      # If user_provided_dataset contains two batches: a1, a2
+      # Step 1: [a1, a2]
+      # Step 2: [b1, b2] -> STOP
+      # If user_provided_dataset contains three batches: a1, a2, a3.
+      # The training loops:
+      # Step 1: [a1, a2]
+      # Step 2: [a3, b1]
+      # Step 3: [b2, b3] -> STOP.
+      final_batch_dataset = dataset.take(1).map(
+          _InputsWithStoppingSignals.insert_stopping_signal(
+              stop=True, batch_size=batch_size, add_padding=add_padding))
+      final_batch_dataset = final_batch_dataset.repeat(
+          2 * num_invocations_per_step - 1)
+
+      def _set_mask(data_dict):
+        signals = data_dict['signals']
+        signals['padding_mask'] = array_ops.ones_like(signals['padding_mask'])
+        data_dict['signals'] = signals
+        return data_dict
+
+      # Mask out the extra batch.
+      final_batch_dataset = final_batch_dataset.map(_set_mask)
+
+    dataset = user_provided_dataset.concatenate(final_batch_dataset).prefetch(2)
+
+    super(_InputsWithStoppingSignals, self).__init__(dataset=dataset)
+    self._current_inputs = None
+
+  def features_and_labels(self):
+    if self._current_inputs is not None:
+      raise RuntimeError(
+          'Internal Error: The previous inputs have not been properly '
+          'consumed. First call features_and_labels, then call signals.')
+
+    inputs_with_signals = self._iterator.get_next()
+    features = inputs_with_signals['features']
+    labels = inputs_with_signals.get('labels')
+
+    self._current_inputs = inputs_with_signals
+    return features, labels
+
+  def signals(self):
+    """Returns the `Signals` from `_Inputs`."""
+    if self._current_inputs is None:
+      raise RuntimeError(
+          'Internal Error: The current inputs have not been properly '
+          'generated. First call features_and_labels, then call signals.')
+    signals = self._current_inputs['signals']
+    self._current_inputs = None
+    return signals
+
+  @staticmethod
+  def insert_stopping_signal(stop, batch_size, add_padding=False):
+    """Inserts stopping_signal into dataset via _map_fn.
+
+    Here we change the data structure in the dataset, such that the return value
+    is a dictionary now and `features`, `labels`, and `signals` are three
+    distinguished keys in that dict. This provides a better structure, which
+    eases the process to decompose the inputs (see `features_and_labels`).
+
+    Args:
+      stop: bool, state of current stopping signals.
+      batch_size: int, batch size.
+      add_padding: bool, whether to pad the tensor to full batch size.
+
+    Returns:
+      A map_fn passed to dataset.map API.
+    """
+
+    def _map_fn(*args):
+      """The map fn to insert signals."""
+      if len(args) == 1:
+        # Unpack the single Tensor/dict argument as features. This is required
+        # for the input_fn returns no labels.
+        args = args[0]
+      features, labels = _Inputs._parse_inputs(args)
+      new_input_dict = {}
+
+      if add_padding:
+        padding_mask, features, labels = (
+            _PaddingSignals.pad_features_and_labels(features, labels,
+                                                    batch_size))
+
+        new_input_dict['features'] = features
+        if labels is not None:
+          new_input_dict['labels'] = labels
+
+      else:
+        new_input_dict['features'] = features
+        if labels is not None:
+          new_input_dict['labels'] = labels
+        padding_mask = None
+
+      new_input_dict['signals'] = _StopSignals(
+          stop=stop, batch_size=batch_size,
+          padding_mask=padding_mask).as_dict()
+
+      return new_input_dict
+
+    return _map_fn
+
+
+class _StopSignals(object):
+  """Signals class holding all logic to handle TPU stopping condition."""
+
+  NON_STOPPING_SIGNAL = False
+  STOPPING_SIGNAL = True
+
+  def __init__(self, stop, batch_size, padding_mask=None):
+    self._stop = stop
+    self._batch_size = batch_size
+    self._padding_mask = padding_mask
+
+  def as_dict(self):
+    """Returns the signals as Python dict."""
+    shape = [self._batch_size, 1]
+    dtype = dtypes.bool
+
+    if self._stop:
+      stopping = array_ops.ones(shape=shape, dtype=dtype)
+    else:
+      stopping = array_ops.zeros(shape=shape, dtype=dtype)
+
+    signals = {'stopping': stopping}
+    if self._padding_mask is not None:
+      signals['padding_mask'] = self._padding_mask
+    return signals
+
+  @staticmethod
+  def as_scalar_stopping_signal(signals):
+    return array_ops.identity(signals['stopping'][0][0])
+
+  @staticmethod
+  def should_stop(scalar_stopping_signal):
+    """Detects whether scalar_stopping_signal indicates stopping."""
+    if isinstance(scalar_stopping_signal, ops.Tensor):
+      # STOPPING_SIGNAL is a constant True. Here, the logical_and is just the TF
+      # way to express the bool check whether scalar_stopping_signal is True.
+      return math_ops.logical_and(scalar_stopping_signal,
+                                  _StopSignals.STOPPING_SIGNAL)
+    else:
+      # For non Tensor case, it is used in SessionRunHook. So, we cannot modify
+      # the graph anymore. Here, we use pure Python.
+      return bool(scalar_stopping_signal)
+
+
+class _PaddingSignals(object):
+  """Signals class holding all logic to handle padding."""
+
+  @staticmethod
+  def pad_features_and_labels(features, labels, batch_size):
+    """Pads out the batch dimension of features and labels."""
+    real_batch_size = array_ops.shape(
+        _PaddingSignals._find_any_tensor(features))[0]
+
+    batch_size_tensor = constant_op.constant(batch_size, dtypes.int32)
+
+    check_greater = check_ops.assert_greater_equal(
+        batch_size_tensor,
+        real_batch_size,
+        data=(batch_size_tensor, real_batch_size),
+        message='The real batch size should not be greater than batch_size.')
+
+    with ops.control_dependencies([check_greater]):
+      missing_count = batch_size_tensor - real_batch_size
+
+    def pad_single_tensor(tensor):
+      """Pads out the batch dimension of a tensor to the complete batch_size."""
+      rank = len(tensor.shape)
+      assert rank > 0
+      padding = array_ops.stack([[0, missing_count]] + [[0, 0]] * (rank - 1))
+      padded_shape = (batch_size,) + tuple(tensor.shape[1:])
+      padded_tensor = array_ops.pad(tensor, padding)
+      padded_tensor.set_shape(padded_shape)
+      return padded_tensor
+
+    def nest_pad(tensor_or_dict):
+      return nest.map_structure(pad_single_tensor, tensor_or_dict)
+
+    features = nest_pad(features)
+    if labels is not None:
+      labels = nest_pad(labels)
+
+    padding_mask = _PaddingSignals._padding_mask(real_batch_size, missing_count,
+                                                 batch_size)
+
+    return padding_mask, features, labels
+
+  @staticmethod
+  def slice_tensor_or_dict(tensor_or_dict, signals):
+    """Slice the real Tensors according to padding mask in signals."""
+
+    padding_mask = signals['padding_mask']
+    batch_size = array_ops.shape(padding_mask)[0]
+
+    def verify_batch_size(tensor):
+      check_batch_size = math_ops.equal(batch_size, tensor.shape[0])
+      with ops.control_dependencies([check_batch_size]):
+        return array_ops.identity(tensor)
+
+    def slice_single_tensor(tensor):
+      rank = len(tensor.shape)
+      assert rank > 0
+      real_batch_size = batch_size - math_ops.reduce_sum(padding_mask)
+      return verify_batch_size(tensor)[0:real_batch_size]
+
+    # As we split the Tensors to all TPU cores and concat them back, it is
+    # important to ensure the real data is placed before padded ones, i.e.,
+    # order is preserved. By that, the sliced padding mask should have all 0's.
+    # If this assertion failed, # the slice logic here would not hold.
+    sliced_padding_mask = slice_single_tensor(padding_mask)
+    assert_padding_mask = math_ops.equal(
+        math_ops.reduce_sum(sliced_padding_mask), 0)
+
+    with ops.control_dependencies([assert_padding_mask]):
+      should_stop = _StopSignals.should_stop(
+          _StopSignals.as_scalar_stopping_signal(signals))
+
+    is_full_batch = math_ops.equal(math_ops.reduce_sum(padding_mask), 0)
+
+    def slice_fn(tensor):
+      # If the current batch is full batch or part of stopping signals, we do
+      # not need to slice to save performance.
+      return control_flow_ops.cond(
+          math_ops.logical_or(should_stop, is_full_batch),
+          (lambda: verify_batch_size(tensor)),
+          (lambda: slice_single_tensor(tensor)))
+
+    return nest.map_structure(slice_fn, tensor_or_dict)
+
+  @staticmethod
+  def _find_any_tensor(batch_features):
+    tensors = [
+        x for x in nest.flatten(batch_features) if isinstance(x, ops.Tensor)
+    ]
+    if not tensors:
+      raise ValueError('Cannot find any Tensor in features dict.')
+    return tensors[0]
+
+  @staticmethod
+  def _padding_mask(real_batch_size, missing_count, batch_size):
+    padding_mask = array_ops.concat([
+        array_ops.zeros((real_batch_size,), dtype=dtypes.int32),
+        array_ops.ones((missing_count,), dtype=dtypes.int32)
+    ],
+                                    axis=0)
+    padding_mask.set_shape((batch_size,))
+    return padding_mask
+
+
+def _verify_cross_hosts_transfer_size(tensor_dict, message):
+  total_size = 0
+  tensor_structure = {}
+  for key, tensor in tensor_dict.items():
+    shape = tensor.shape
+    size = np.product(shape) * tensor.dtype.size
+    tensor_structure[key] = shape
+    total_size += size
+  if total_size >= _ONE_GIGABYTE:
+    raise ValueError(
+        '{} The transfer size is larger than the protobuf limit. Please '
+        'consider to use Tensors with smaller shapes or reduce batch '
+        'size. Given:\n'
+        '{}'.format(
+            message, '\n'.join([
+                ' -- Key: {}, Shape: {}'.format(k, v)
+                for k, v in tensor_structure.items()
+            ])))
+
+
+def _add_item_to_params(params, key, value):
+  """Adds a new item into `params`."""
+  if hasattr(params, 'set_hparam'):
+    # For HParams, we need to use special API.
+    if key in params:
+      params.set_hparam(key, value)
+    else:
+      params.add_hparam(key, value)
+  else:
+    # Now params is Python dict.
+    params[key] = value
+
+
+def export_estimator_savedmodel(estimator,
+                                export_dir_base,
+                                serving_input_receiver_fn,
+                                assets_extra=None,
+                                as_text=False,
+                                checkpoint_path=None,
+                                strip_default_attrs=False):
+  """Export `Estimator` trained model for TPU inference.
+
+  Args:
+    estimator: `Estimator` with which model has been trained.
+    export_dir_base: A string containing a directory in which to create
+      timestamped subdirectories containing exported SavedModels.
+    serving_input_receiver_fn: A function that takes no argument and returns a
+      `ServingInputReceiver` or `TensorServingInputReceiver`.
+    assets_extra: A dict specifying how to populate the assets.extra directory
+      within the exported SavedModel, or `None` if no extra assets are needed.
+    as_text: whether to write the SavedModel proto in text format.
+    checkpoint_path: The checkpoint path to export.  If `None` (the default),
+      the most recent checkpoint found within the model directory is chosen.
+    strip_default_attrs: Boolean. If `True`, default-valued attributes will be
+      removed from the NodeDefs.
+
+  Returns:
+    The string path to the exported directory.
+  """
+  # `TPUEstimator` requires `tpu_config.RunConfig`, so we cannot use
+  # `estimator.config`.
+  config = tpu_config.RunConfig(model_dir=estimator.model_dir)
+  est = TPUEstimator(
+      estimator._model_fn,  # pylint: disable=protected-access
+      config=config,
+      params=estimator.params,
+      use_tpu=True,
+      train_batch_size=2048,  # Does not matter.
+      eval_batch_size=2048,  # Does not matter.
+  )
+  return est.export_savedmodel(export_dir_base, serving_input_receiver_fn,
+                               assets_extra, as_text, checkpoint_path,
+                               strip_default_attrs)
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator_signals_test.py b/tensorflow/python/tpu/tpu_estimator_signals_test.py
similarity index 99%
rename from tensorflow/contrib/tpu/python/tpu/tpu_estimator_signals_test.py
rename to tensorflow/python/tpu/tpu_estimator_signals_test.py
index e3ea983abfd24d03c964fbc647b56262e15e0a96..ca3eeaa9c9ace9bdbf6a3c6efa8b84eeecc7a60f 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator_signals_test.py
+++ b/tensorflow/python/tpu/tpu_estimator_signals_test.py
@@ -20,12 +20,12 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.tpu.python.tpu import tpu_estimator
 from tensorflow.python.client import session
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import test
+from tensorflow.python.tpu import tpu_estimator
 
 
 def make_input_fn(num_samples):
diff --git a/tensorflow/python/tpu/tpu_feed.py b/tensorflow/python/tpu/tpu_feed.py
new file mode 100644
index 0000000000000000000000000000000000000000..de1adc80e6015a8418bd3ea470d1a17561eb542b
--- /dev/null
+++ b/tensorflow/python/tpu/tpu_feed.py
@@ -0,0 +1,919 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===================================================================
+
+"""Helper library for handling infeed between hosts and TPUs.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.compiler.xla.experimental.xla_sharding import xla_sharding
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops
+from tensorflow.python.tpu import tpu
+from tensorflow.python.tpu import tpu_sharding
+from tensorflow.python.tpu.ops import tpu_ops
+
+from tensorflow.python.util import nest
+
+
+def partition_or_replicate_on_host(tensor, dims):
+  """Partitions or replicates the input tensor.
+
+    The ops inside this function are placed on the host side.
+
+  Args:
+    tensor: The input tensor which will be partioned or replicated.
+    dims: A list of integer describes how to partition the input tensor.
+
+  Returns:
+    An iterator of `Tensor`s or a list of partioned tensors.
+  """
+  if dims is None:
+    return itertools.repeat(tensor)
+  dims = np.array(dims)
+  output = [tensor]
+  shape_list = np.array(tensor.shape.as_list())
+  quotients, remainders = np.divmod(shape_list, dims)
+  for axis, (quotient, remainder, dim, original_size) in enumerate(
+      zip(quotients, remainders, dims, shape_list)):
+    if dim <= 1:
+      continue
+    if remainder > 0:
+      # For each dimension, when it cannot be evenly partitioned, XLA assumes
+      # tensors are partitioned in a greedy manner by using
+      # ceil_ratio(size/dim) first. E.g. 2D tensor with shape (5, 14) and dims
+      # are (2, 4). Since 5 % 2 = 1 and 14 % 4 = 2, [5, 14] =>
+      # [[(3, 4), (3, 4), (2, 4), (2, 2)],
+      # [(2, 4), (2, 4), (2, 4), (2, 2)]]
+      ceil_ratio = quotient + 1
+      num_full_slots, left_over = np.divmod(original_size, ceil_ratio)
+      num_or_size_splits = [ceil_ratio] * num_full_slots + [left_over]
+      if len(num_or_size_splits) < dim:
+        num_or_size_splits += [0] * (dim - len(num_or_size_splits))
+      new_output = []
+      for x in output:
+        new_output.append(
+            array_ops.split(
+                x, num_or_size_splits=num_or_size_splits, axis=axis))
+      output = new_output
+    else:
+      output = [array_ops.split(x, dim, axis=axis) for x in output]
+    output = nest.flatten(output)
+  return output
+
+
+def _tag_sharding_attribute_for_dequeued_tensor(tensor, dims):
+  """Tags appropriate XLA sharding attribute to the dequeued tensor.
+
+  Args:
+    tensor: The dequeued tensor on TPU.
+    dims: A list of integer describes how the tensor is partitioned.
+
+  Returns:
+    The same tensor with the xla_sharding attribute.
+  """
+  if dims is None:
+    return xla_sharding.replicate(tensor)
+  elif np.prod(dims) == 1:
+    return xla_sharding.assign_device(tensor, 0)
+  else:
+    tile_assignment = np.arange(np.prod(dims)).reshape(dims)
+    return xla_sharding.tile(tensor=tensor, tile_assignment=tile_assignment)
+
+
+def tag_sharding_attribute_for_dequeued_tensors(dequeues, dims):
+  """Tags appropriate XLA sharding attribute to the dequeued tensors.
+
+  Args:
+    dequeues: A list of dequeued tensors on TPU.
+    dims: A list of integer describes how the tensor is partitioned.
+
+  Returns:
+    The same dequeues with appropriate xla_sharding attribute.
+  """
+  nest.assert_shallow_structure(dequeues, dims)
+  return nest.map_structure_up_to(
+      dequeues, _tag_sharding_attribute_for_dequeued_tensor, dequeues, dims)
+
+
+class InfeedQueue(object):
+  """A helper object to build a device infeed queue.
+
+  The InfeedQueue builds the host-side and device-side Ops to enqueue and
+  dequeue elements, respectively, and ensures that their types and
+  shapes match.
+  """
+
+  def __init__(self,
+               number_of_tuple_elements=None,
+               tuple_types=None,
+               tuple_shapes=None,
+               shard_dimensions=None,
+               name=None):
+    """Creates a new InfeedQueue with the given configuration.
+
+    The configuration need not be fully specified at creation since it
+    can be modified subsequently by methods that set the values
+    explicitly or infer them from the shapes of inputs.
+
+    Args:
+      number_of_tuple_elements: the number of Tensors fed atomically through the
+        queue, must be present unless it can be inferred from other arguments.
+      tuple_types: if not None, a list of types of the elements of the queue.
+      tuple_shapes: if not None, a list of shapes of the elements of the queue.
+      shard_dimensions: if not None, a list of dimensions on which the
+        elements of the queue should be sharded during automatic
+        parallelization.
+      name: the name of the queue.
+
+    Raises:
+      ValueError: if number_of_tuple_elements <= 0; or
+        number_of_tuple_arguments, tuple_types, tuple_shapes, and
+        shard_dimensions are all None; or the length of tuple_types,
+        tuple_shapes, or shard_dimensions is not equal to
+        number_of_tuple_elements; or any element of shard_dimensions
+        can't be converted to a Dimension.
+      TypeError: if any element of tuple_types or tuple_shapes can't
+        be converted to a dtype or TensorShape, respectively.
+    """
+    self._frozen = False
+    self._generated_enqueue_ops = False
+    self._generated_dequeue_op = False
+    self._name = "InfeedQueue" if name is None else name
+    if number_of_tuple_elements is None:
+      if tuple_types is not None:
+        number_of_tuple_elements = len(tuple_types)
+      elif tuple_shapes is not None:
+        number_of_tuple_elements = len(tuple_shapes)
+      elif shard_dimensions is not None:
+        number_of_tuple_elements = len(shard_dimensions)
+      else:
+        raise ValueError(
+            "number of tuple elements cannot be inferred from InfeedQueue "
+            "constructor")
+    if number_of_tuple_elements <= 0:
+      raise ValueError("number_of_tuple_elements %d must be > 0" %
+                       number_of_tuple_elements)
+    # Make an empty sharding policy for each tuple element.
+    self._sharding_policies = [
+        tpu_sharding.ShardingPolicy()
+        for _ in xrange(number_of_tuple_elements)
+    ]
+    if tuple_types is not None:
+      self.set_tuple_types(tuple_types)
+    else:
+      self._tuple_types = None
+    if tuple_shapes is not None:
+      self.set_tuple_shapes(tuple_shapes)
+    else:
+      self._tuple_shapes = None
+    if shard_dimensions is not None:
+      self.set_shard_dimensions(shard_dimensions)
+    self._validate()
+
+  def _validate(self):
+    """Checks that the configuration is self-consistent.
+
+    Raises:
+      ValueError: if the shapes and sharding policies don't match.
+    """
+    if self.tuple_shapes is not None:
+      for (policy, shape) in zip(self._sharding_policies, self._tuple_shapes):
+        # Raise an error if the policy is incompatible with the shape.
+        _ = policy.get_sharded_shape(shape)
+
+  @property
+  def number_of_tuple_elements(self):
+    """Returns the number of InfeedQueue tuple elements."""
+    return len(self._sharding_policies)
+
+  @property
+  def tuple_types(self):
+    """Returns the types of the InfeedQueue tuple elements."""
+    return self._tuple_types
+
+  def set_tuple_types(self, tuple_types):
+    """Sets the type of each element of the queue.
+
+    tuple_types must be a list of length
+    self.number_of_tuple_elements, and each element must be
+    convertible to a dtype.
+
+    Args:
+      tuple_types: the types of each queue element.
+
+    Raises:
+      ValueError: if tuple_types is not of length
+        self.number_of_tuple_elements.
+      TypeError: if an element of tuple_types cannot be converted to a
+        dtype.
+    """
+    if len(tuple_types) != self.number_of_tuple_elements:
+      raise ValueError("tuple_types is %s, but must be a list of length %d" %
+                       (str(tuple_types), self.number_of_tuple_elements))
+    if self._frozen:
+      for (frozen, updated) in zip(self._tuple_types, tuple_types):
+        if frozen != updated:
+          raise ValueError(
+              "Trying to update InfeedQueue with frozen configuration with an "
+              "incompatible type. Frozen types are %s, updated types are %s" % (
+                  str(self._tuple_types), str(tuple_types)))
+    else:
+      try:
+        self._tuple_types = [dtypes.as_dtype(t) for t in tuple_types]
+      except (TypeError) as e:
+        raise TypeError(
+            "tuple_types is %s, but must be a list of elements each "
+            "convertible to dtype: got error %s" % (str(tuple_types), str(e)))
+
+  @property
+  def tuple_shapes(self):
+    """Returns the shapes of the InfeedQueue tuple elements."""
+    return self._tuple_shapes
+
+  def set_tuple_shapes(self, tuple_shapes):
+    """Sets the shape of each element of the queue.
+
+    tuple_shapes must be a list of length
+    self.number_of_tuple_elements, and each element must be
+    convertible to a TensorShape.
+
+    Args:
+      tuple_shapes: the shapes of each queue element.
+
+    Raises:
+      ValueError: if tuple_shapes is not of length
+        self.number_of_tuple_elements.
+      TypeError: if an element of tuple_shapes cannot be converted to
+        a TensorShape.
+    """
+    if len(tuple_shapes) != self.number_of_tuple_elements:
+      raise ValueError("tuple_shapes is %s, but must be a list of length %d" %
+                       (str(tuple_shapes), self.number_of_tuple_elements))
+    try:
+      tuple_shapes = [tensor_shape.as_shape(shape) for shape in tuple_shapes]
+    except (ValueError, TypeError) as e:
+      raise TypeError(
+          "tuple_shapes is %s, but must be a list of elements each "
+          "convertible to TensorShape: got error %s" % (str(tuple_shapes),
+                                                        str(e)))
+    if self._frozen:
+      for (frozen, updated) in zip(self._tuple_shapes, tuple_shapes):
+        if frozen != updated:
+          raise ValueError(
+              "Trying to update InfeedQueue with frozen configuration with an "
+              "incompatible shape. Frozen shapes are %s, updated shapes are %s"
+              % (str(self._tuple_shapes), str(tuple_shapes)))
+    else:
+      self._tuple_shapes = tuple_shapes
+    self._validate()
+
+  @property
+  def sharding_policies(self):
+    """Returns the sharding policies of the InfeedQueue tuple elements."""
+    return self._sharding_policies
+
+  @property
+  def shard_dimensions(self):
+    """Gets the shard dimension of each tuple element.
+
+    Returns:
+      A list of length number_of_tuple_elements, where each list entry
+      is the shard dimension of that tuple element or None if the
+      shard dimension has not been set.
+    """
+    # The number of shards is always the same for all the policies.
+    return [policy.shard_dimension for policy in self._sharding_policies]
+
+  def set_shard_dimensions(self, shard_dimensions):
+    """Sets the shard_dimension of each element of the queue.
+
+    shard_dimensions must be a list of length
+    self.number_of_tuple_elements, and each element must be
+    convertible to a Dimension compatible with self.tuple_shapes.
+
+    Args:
+      shard_dimensions: the dimensions of each queue element.
+
+    Raises:
+      ValueError: if shard_dimensions is not of length
+        self.number_of_tuple_elements; or an element of
+        shard_dimensions cannot be converted to a Dimension; or an
+        element of shard_dimensions is a Dimension that is out of
+        range for the corresponding tuple element shape.
+    """
+    if len(shard_dimensions) != self.number_of_tuple_elements:
+      raise ValueError("shard_dimensions is %s, but must be a list of length %d"
+                       % (str(shard_dimensions),
+                          self.number_of_tuple_elements))
+    for (policy, dimension) in zip(self._sharding_policies, shard_dimensions):
+      policy.set_shard_dimension(dimension)
+    self._validate()
+
+  @property
+  def number_of_shards(self):
+    """Gets the number of shards to use for the InfeedQueue.
+
+    Returns:
+      Number of shards or None if the number of shards has not been set.
+    """
+    # The number of shards is always the same for all the policies.
+    return self._sharding_policies[0].number_of_shards
+
+  def set_number_of_shards(self, number_of_shards):
+    """Sets the number of shards to use for the InfeedQueue.
+
+    Args:
+      number_of_shards: number of ways to shard the InfeedQueue.
+
+    Raises:
+      ValueError: if number_of_shards is not > 0; or the policies have
+        been frozen and number_of_shards was already set to something
+        else.
+    """
+    for policy in self._sharding_policies:
+      policy.set_number_of_shards(number_of_shards)
+    self._validate()
+
+  def set_configuration_from_input_tensors(self, input_tensors):
+    """Sets the shapes and types of the queue tuple elements.
+
+    input_tensors is a list of Tensors whose types and shapes are used
+    to set the queue configuration.
+
+    Args:
+      input_tensors: list of Tensors of the same types and shapes as
+        the desired queue Tuple.
+
+    Raises:
+      ValueError: if input_tensors is not a list of length
+        self.number_of_tuple_elements
+    """
+    if len(input_tensors) != self.number_of_tuple_elements:
+      raise ValueError("input_tensors is %s, but should be a list of %d Tensors"
+                       % (str(input_tensors), self.number_of_tuple_elements))
+    self.set_tuple_shapes([t.shape for t in input_tensors])
+    self.set_tuple_types([t.dtype for t in input_tensors])
+
+  def set_configuration_from_sharded_input_tensors(self, input_tensors):
+    """Sets the shapes and types of the queue tuple elements.
+
+    input_tensors is a list of lists of Tensors whose types and shapes are used
+    to set the queue configuration. The length of the outer list is the number
+    of shards required, and each inner list is the tuple of Tensors to use to
+    determine the types and shapes of the corresponding shard. This method
+    depends on the shard dimension, and calling it freezes the shard policy.
+
+    Args:
+      input_tensors: list of lists of Tensors. The outer list length corresponds
+        to the desired number of shards, and each inner list is the size
+        and shape of the desired configuration of the corresponding shard.
+
+    Raises:
+      ValueError: if any inner list is not a list of length
+        self.number_of_tuple_elements; or the inner lists do not combine to
+        form a consistent unsharded shape.
+      TypeError: if the types of the Tensors in the inner lists do not match.
+    """
+    if not self._frozen:
+      # Unset the tuple shapes in case the configuration becomes
+      # transiently inconsistent.
+      self._tuple_shapes = None
+    number_of_shards = len(input_tensors)
+    self.set_number_of_shards(number_of_shards)
+    for t in input_tensors:
+      if len(t) != self.number_of_tuple_elements:
+        raise ValueError(
+            "input_tensors is %s but must be a list of lists, where each inner"
+            " list has length number_of_tuple_elements=%d" % (
+                str(input_tensors), self.number_of_tuple_elements))
+    # Transpose the inputs to make a list of shard shapes for each tuple
+    # element.
+    sharded_shapes = [[t[i].shape for t in input_tensors]
+                      for i in xrange(self.number_of_tuple_elements)]
+    # For each tuple, get the unsharded shape using that tuple's policy.
+    unsharded_shapes = [
+        policy.get_unsharded_shape(s)
+        for (policy, s) in zip(self._sharding_policies, sharded_shapes)
+    ]
+    self.set_tuple_shapes(unsharded_shapes)
+    for i in xrange(1, self.number_of_shards):
+      for (t1, t2) in zip(input_tensors[0], input_tensors[i]):
+        if t1.dtype != t2.dtype:
+          raise TypeError(
+              "types of the tuple elements of input_tensors %s are not "
+              "consistent" % str(input_tensors))
+    self.set_tuple_types([t.dtype for t in input_tensors[0]])
+
+  def freeze(self):
+    """Freezes the InfeedQueue so it can no longer be modified.
+
+    The configuration is implicitly frozen before any host-side or
+    device-side Ops are generated. The configuration cannot be frozen
+    until the types and shapes of the tuple elements have been set.
+
+    Raises:
+      ValueError: if the types or shapes of the tuple elements have not been
+      set.
+    """
+    self._frozen = True
+    if self._tuple_types is None:
+      raise ValueError(
+          "Can't freeze an InfeedQueue without setting all tuple types.")
+    if self._tuple_shapes is None:
+      raise ValueError(
+          "Can't freeze an InfeedQueue without setting all tuple shapes.")
+    for shape in self._tuple_shapes:
+      if shape.dims is None:
+        raise ValueError(
+            "Can't freeze an InfeedQueue without setting all tuple shapes.")
+    for policy in self._sharding_policies:
+      policy.freeze()
+    self._validate()
+
+  def generate_dequeue_op(self, tpu_device=0):
+    """Generates the device-side Op to dequeue a tuple from the queue.
+
+    Implicitly freezes the queue configuration if it is not already
+    frozen, which will raise errors if the shapes and types have not
+    been fully specified.
+
+    Args:
+      tpu_device: The TPU device ordinal where the infeed instruction should be
+        placed. If None, no explicit placement will be performed, and it is up
+        to the user to call this API from within a proper TPU device scope.
+        The XLA code will fail if the TPU dequeue instruction is not bound to
+        any device.
+
+    Returns:
+      A list of Outputs corresponding to a shard of infeed dequeued
+      into XLA, suitable for use within a replicated block.
+
+    Raises:
+      ValueError: if the types or shapes of the tuple elements have not been
+      set; or if a dequeue op has already been generated.
+    """
+    self.freeze()
+    if self._generated_dequeue_op:
+      raise ValueError("Can't generate two dequeue Ops from the same queue")
+    self._generated_dequeue_op = True
+    full_name = "%s/dequeue" % self._name
+    sharded_shapes = [
+        policy.get_sharded_shape(shape)
+        for (shape, policy) in zip(self._tuple_shapes, self._sharding_policies)
+    ]
+    if tpu_device is not None:
+      with ops.device(tpu.core(tpu_device)):
+        return tpu_ops.infeed_dequeue_tuple(
+            dtypes=self._tuple_types, shapes=sharded_shapes, name=full_name)
+    else:
+      return tpu_ops.infeed_dequeue_tuple(
+          dtypes=self._tuple_types, shapes=sharded_shapes, name=full_name)
+
+  def _generate_enqueue_op(self,
+                           inputs,
+                           name_prefix,
+                           index,
+                           device=None,
+                           tpu_ordinal=-1):
+    """Generate a host-side Op to enqueue a tuple to the queue.
+
+    If device is None the inputs are all required to have the same
+    device specification, and the enqueue Op is colocated with
+    inputs[0]. Otherwise the enqueue Op is placed on 'device'.
+
+    Args:
+      inputs: a list of Tensors with the types and shapes of the tuple elements.
+      name_prefix: the base name for the Op.
+      index: the shard index, used to uniquify the Op name.
+      device: device to place the Op on, or None if it should be
+        colocated with the inputs.
+      tpu_ordinal: ordinal of the TPU device on the host to use for
+      infeed if device is a CPU device. Should be set to -1 if device
+      is a TPU device.
+
+    Returns:
+      An Op corresponding to a shard of infeed enqueued at the host,
+      suitable for use within a replicated block.
+
+    Raises:
+      ValueError: if device is None and inputs do not all have the
+        same device specification.
+    """
+    full_name = "%s/%d" % (name_prefix, index)
+    shapes = [t.shape for t in inputs]
+    if device is None:
+      devices = [t.device for t in inputs]
+      for i in xrange(1, self.number_of_tuple_elements):
+        if devices[0] != devices[i]:
+          raise ValueError(
+              "input devices for shard %d are %s, but should all be the same" %
+              (index, str(devices)))
+      with ops.colocate_with(inputs[0]):
+        return tpu_ops.infeed_enqueue_tuple(
+            inputs=inputs,
+            shapes=shapes,
+            name=full_name,
+            device_ordinal=tpu_ordinal)
+    else:
+      with ops.device(device):
+        return tpu_ops.infeed_enqueue_tuple(
+            inputs=inputs,
+            shapes=shapes,
+            name=full_name,
+            device_ordinal=tpu_ordinal)
+
+  def generate_enqueue_ops(self,
+                           sharded_inputs,
+                           tpu_ordinal_function=None,
+                           placement_function=None):
+    """Generates the host-side Ops to enqueue the shards of a tuple.
+
+    sharded_inputs is a list, one for each shard, of lists of
+    Tensors. sharded_inputs[0] is the tuple of Tensors to use to feed
+    shard 0 if the queue. Returns the host-side Ops that must be run to
+    enqueue the sharded tuple. The Op for shard i is colocated with the inputs
+    for shard i.
+
+    Implicitly freezes the queue configuration if it is not already
+    frozen. If the configuration has already been frozen, and is not
+    compatible with the types and shapes of sharded_inputs, an error
+    will be raised.
+
+    Args:
+      sharded_inputs: a list of lists of Tensors. The length of the outer list
+        determines the number of shards. Each inner list indicates the types
+        and shapes of the tuples in the corresponding shard.
+      tpu_ordinal_function: if not None, a function that takes the
+        shard index as input and returns the ordinal of the TPU device
+        the shard's infeed should be placed on. tpu_ordinal_function must be
+        set if the inputs are placed on CPU devices.
+      placement_function: if not None, a function that takes the shard index as
+        input and returns the host device where the enqueue op should be placed
+        on.
+
+    Returns:
+      A list of host-side Ops, one for each shard, that when executed together
+      will enqueue a full-size element of infeed.
+
+    Raises:
+      ValueError: if the queue configuration has previously been frozen and the
+        shapes of the elements of sharded_inputs are not compatible with the
+        frozen configuration; or if the shapes of the elements of sharded_inputs
+        don't form a consistent unsharded tuple; or if the elements of a tuple
+        have different device constraints.
+      TypeError: if the queue configuration has previously been frozen and the
+        types of the elements of sharded_inputs are not compatible with the
+        frozen configuration; or if the types of the elements of sharded_inputs
+        don't form a consistent unsharded tuple.
+    """
+    self.set_configuration_from_sharded_input_tensors(sharded_inputs)
+    self.freeze()
+    if self._generated_enqueue_ops:
+      raise ValueError("Can't generate two enqueue Ops from the same queue")
+    self._generated_enqueue_ops = True
+    if tpu_ordinal_function is None:
+      tpu_ordinal_function = lambda index: -1
+    name_prefix = "%s/enqueue" % self._name
+    return [
+        self._generate_enqueue_op(
+            shard,
+            name_prefix,
+            index,
+            tpu_ordinal=tpu_ordinal_function(index),
+            device=placement_function(index) if placement_function else None)
+        for (shard, index) in zip(sharded_inputs, xrange(self.number_of_shards))
+    ]
+
+  # TODO(misard) Generalize this to the case of systems that don't
+  # have 8 devices per host, and figure out what to do with
+  # model-parallelism.
+  def _default_placement_function(self, index):
+    return "/task:%d/device:CPU:0" % (index / 8)
+
+  def _default_ordinal_function(self, index):
+    return index % 8
+
+  # TODO(b/36470756) remove this from tutorials once we have a better story
+  # for automatic placement of input pipelines.
+  def split_inputs_and_generate_enqueue_ops(self,
+                                            inputs,
+                                            device_assignment=None,
+                                            placement_function=None,
+                                            tpu_ordinal_function=None):
+    """POORLY-PERFORMING ON MULTI-HOST SYSTEMS.
+
+    Generates the host-side Ops to enqueue a tuple.
+
+    This method performs poorly because it takes an entire input on a single
+    host, splits it, and distributes it to all of the cores. It is present only
+    to simplify tutorial examples.
+
+    inputs is a list of Tensors to use to feed the queue. Each input is split
+    into self.number_of_shards shards. Returns an Op for each shard to enqueue
+    the shard. The Op for shard i is placed on device placement_function(i).
+
+    Implicitly freezes the queue configuration if it is not already
+    frozen. If the configuration has already been frozen, and is not
+    compatible with the types and shapes of inputs, an error
+    will be raised.
+
+    Args:
+      inputs: a list of Tensors which indicates the types and shapes of the
+        queue tuple.
+     device_assignment: if not `None`, a TPU `DeviceAssignment`. If
+        device_assignment is not `None`, but `placement_function` and
+        `ordinal_function` are None, then `device_assignment` will be used to
+        place infeeds on the first k TPU shards, where k is the number of shards
+        in the queue. If all three are `None`, then default placement and
+        ordinal functions are used.
+      placement_function: if not None, a function that takes the shard
+        index as input and returns a device string indicating which
+        device the shard's infeed should be placed on. If placement_function
+        and tpu_ordinal_function are None, inputs are sharded round-robin
+        across the devices in the system.
+      tpu_ordinal_function: if not None, a function that takes the
+        shard index as input and returns the ordinal of the TPU device
+        the shard's infeed should be placed on. If placement_function
+        and tpu_ordinal_function are None, inputs are sharded round-robin
+        across the devices in the system.
+
+    Returns:
+      A list of host-side Ops, one for each shard, that when executed together
+      will enqueue a full-size element of infeed.
+
+    Raises:
+      ValueError: if the queue configuration has previously been frozen and the
+        shapes of the elements of inputs are not compatible with the frozen
+        configuration.
+      TypeError: if the queue configuration has previously been frozen and the
+        types of the elements of inputs are not compatible with the frozen
+        configuration.
+    """
+    if device_assignment is None:
+      if placement_function is None:
+        placement_function = self._default_placement_function
+      if tpu_ordinal_function is None:
+        tpu_ordinal_function = self._default_ordinal_function
+    else:
+
+      def _placement_function_from_map(index):
+        return device_assignment.host_device(replica=index)
+
+      def _ordinal_function_from_map(index):
+        return device_assignment.tpu_ordinal(replica=index)
+
+      if placement_function is None:
+        placement_function = _placement_function_from_map
+      if tpu_ordinal_function is None:
+        tpu_ordinal_function = _ordinal_function_from_map
+    self.set_configuration_from_input_tensors(inputs)
+    self.freeze()
+    if self._generated_enqueue_ops:
+      raise ValueError("Can't generate two enqueue Ops from the same queue")
+    self._generated_enqueue_ops = True
+    split_name_prefix = "%s/split" % self._name
+    if self.number_of_shards == 1:
+      transposed_sharded_inputs = [[inp] for inp in inputs]
+    else:
+
+      def split_fn(inp, num_shards, axis, name):
+        with ops.colocate_with(inp):
+          return array_ops.split(inp, num_shards, axis=axis, name=name)
+
+      transposed_sharded_inputs = [
+          split_fn(
+              inp,
+              self.number_of_shards,
+              axis=policy.shard_dimension,
+              name="%s/%d" % (split_name_prefix, index))
+          for (inp, policy, index) in zip(inputs, self._sharding_policies,
+                                          xrange(self.number_of_tuple_elements))
+      ]
+    sharded_inputs = [[shard[i] for shard in transposed_sharded_inputs]
+                      for i in xrange(self.number_of_shards)]
+    name_prefix = "%s/enqueue" % self._name
+    return [
+        self._generate_enqueue_op(
+            shard,
+            name_prefix,
+            index,
+            device=placement_function(index),
+            tpu_ordinal=tpu_ordinal_function(index))
+        for (shard, index) in zip(sharded_inputs, xrange(self.number_of_shards))
+    ]
+
+
+class _PartitionedInfeedQueue(InfeedQueue):
+  """A helper object to build a device infeed queue with input partition.
+
+  Args:
+    number_of_tuple_elements: the number of Tensors fed atomically through the
+      queue, must be present unless it can be inferred from other arguments.
+    device_assignment: A TPU `DeviceAssignment` which is used to place all the
+      partitions to different TPU infeed queues.
+    host_id: The id of the host machine.
+    input_partition_dims: A nested list/tuple of integers. Each inner
+      list/tuple describes how to partition the corresponding input tensor.
+    tuple_types: If not None, a list of types of the elements of the queue.
+    tuple_shapes: If not None, a list of shapes of the elements of the queue.
+    name: The name of the queue.
+  """
+
+  def __init__(self,
+               number_of_tuple_elements,
+               device_assignment,
+               host_id,
+               input_partition_dims=None,
+               tuple_types=None,
+               tuple_shapes=None,
+               name=None):
+    super(_PartitionedInfeedQueue, self).__init__(
+        number_of_tuple_elements=number_of_tuple_elements,
+        tuple_types=tuple_types,
+        tuple_shapes=None,
+        shard_dimensions=None,
+        name="PartitionedInfeedQueue" if name is None else name)
+    self._input_partition_dims = input_partition_dims
+    self._host_id = host_id
+    self._device_assignment = device_assignment
+
+  def generate_dequeue_op(self, tpu_device=0):
+    """Generate TPU dequeue ops.
+
+    Args:
+      tpu_device: The TPU device ordinal where the infeed instruction should be
+        placed.
+
+    Returns:
+      A list of Outputs corresponding to a partition of infeed dequeued
+      into XLA, suitable for use within a replicated block.
+
+    Raises:
+      ValueError: if the types or shapes of the tuple elements have not been
+      set; or if a dequeue op has already been generated.
+    """
+    self.freeze()
+    if self._generated_dequeue_op:
+      raise ValueError("Can't generate two dequeue Ops from the same queue")
+    self._generated_dequeue_op = True
+    full_name = "%s/dequeue" % self._name
+    sharded_shapes = [
+        policy.get_sharded_shape(shape)
+        for (shape, policy) in zip(self._tuple_shapes, self._sharding_policies)
+    ]
+    with ops.device(tpu.core(tpu_device)):
+      values = tpu_ops.infeed_dequeue_tuple(
+          dtypes=self._tuple_types, shapes=sharded_shapes, name=full_name)
+    return tag_sharding_attribute_for_dequeued_tensors(
+        values, self._input_partition_dims)
+
+  def generate_enqueue_ops(self, per_host_sharded_inputs):
+    """Generates the host-side Ops to enqueue the partitioned inputs.
+
+    per_host_sharded_inputs is a list, one for each replica, of lists of
+    Tensors. sharded_inputs[i] is the tuple of Tensors to use to feed
+    replica i.
+    sharded_inputs[i][j] is partitioned by self._input_partition_dims[j].
+
+    For example, if sharded_inputs[i][j] is a 2-D Tensor:
+    [[A, B, C, D],
+     [E ,F, G, H]]
+    self._input_partition_dims[j] is [2, 4].
+
+    sharded_inputs[i][j] will be partitioned and flattened into:
+    [A, B, C, D, E, F, G, H] and fed into the logical core ids:
+    [0, 1, 2, 3, 4, 5, 6, 7] respectively.
+
+    Args:
+      per_host_sharded_inputs: a list of lists of Tensors. The length of the
+        outer list determines the number of shards. Each inner list indicates
+        the types and shapes of the tuples in the corresponding shard.
+
+    Returns:
+      A list of host-side Ops, one for each shard, that when executed together
+      will enqueue a full-size element of infeed.
+
+    Raises:
+      ValueError: if the queue configuration has previously been frozen and the
+        shapes of the elements of sharded_inputs are not compatible with the
+        frozen configuration; or if the shapes of the elements of sharded_inputs
+        don't form a consistent unsharded tuple; or if the elements of a tuple
+        have different device constraints; or if the partition dims are invalid.
+      TypeError: if the queue configuration has previously been frozen and the
+        types of the elements of sharded_inputs are not compatible with the
+        frozen configuration; or if the types of the elements of sharded_inputs
+        don't form a consistent unsharded tuple.
+    """
+    self.set_configuration_from_sharded_input_tensors(per_host_sharded_inputs)
+    number_of_replicas_per_host = len(per_host_sharded_inputs)
+    number_of_tuple_elements = len(per_host_sharded_inputs[0])
+
+    assert len(self._input_partition_dims) == number_of_tuple_elements
+    per_host_enqueue_ops = []
+
+    for replica_index in range(number_of_replicas_per_host):
+      flattened_inputs = per_host_sharded_inputs[replica_index]
+      inputs_part_dims_flat = nest.flatten_up_to(flattened_inputs,
+                                                 self._input_partition_dims)
+      inputs_parted_iters = [
+          iter(self._check_dims_and_partition_or_replicate_on_host(x, dims))
+          for x, dims in zip(per_host_sharded_inputs[replica_index],
+                             inputs_part_dims_flat)
+      ]
+
+      for logical_core in xrange(self._device_assignment.num_cores_per_replica):
+        # Places different partitions to different logic cores.
+        replica_id = self._device_assignment.lookup_replicas(
+            self._host_id, logical_core)[replica_index]
+        ordinal = self._device_assignment.tpu_ordinal(
+            replica=replica_id, logical_core=logical_core)
+        infeed_inputs = []
+        for it in inputs_parted_iters:
+          input_for_device = next(it, None)
+          if input_for_device is not None:
+            infeed_inputs.append(input_for_device)
+
+        if infeed_inputs:
+          per_host_enqueue_ops.append(
+              tpu_ops.infeed_enqueue_tuple(
+                  inputs=infeed_inputs,
+                  shapes=[x.shape for x in infeed_inputs],
+                  name="enqueue/replica_{0}/input_{1}".format(
+                      replica_index, logical_core),
+                  device_ordinal=ordinal))
+    return per_host_enqueue_ops
+
+  def _check_input_partition_dims(self, tensor, dims):
+    """Checks that input partition dims are valid for the `Tensor`.
+
+    Args:
+      tensor: Input tensor for partitioning.
+      dims: A list of integer describes how to partition the input tensor.
+
+    Raises:
+      ValueError: If the tensor can't be partitioned by dims or the
+        num_cores_per_replica doesn't match the number of
+        partitions(dims.prod()).
+    """
+    # No partitioning specified, so don't perform further checks.
+    if dims is None:
+      return
+
+    dims = np.array(dims)
+
+    if (dims < 1).any():
+      raise ValueError("All input partition dims must be >= 1.")
+
+    # No partitioning, so don't perform further checks.
+    if dims.prod() == 1:
+      return
+
+    if dims.prod() != self._device_assignment.num_cores_per_replica:
+      raise ValueError(
+          "The product of each input parition dim should equal to "
+          "num_cores_per_replica. (dim = {}, num_cores_per_replica "
+          "= {})".format(dims, self._device_assignment.num_cores_per_replica))
+    if dims.shape[0] != tensor.shape.ndims:
+      raise ValueError(
+          "Input partition dims must have the same number of dimensions "
+          "as the `Tensor` to be partitioned. (tensor shape = {}, input "
+          "partition dims = {}).".format(tensor.shape.as_list(), dims))
+
+    tensor.shape.assert_is_fully_defined()
+
+  def _check_dims_and_partition_or_replicate_on_host(self, tensor, dims):
+    """Checks dims and partitions or replicates the input tensor.
+
+      The ops inside this function are placed on the host side.
+
+    Args:
+      tensor: The input tensor which will be partioned or replicated.
+      dims: A list of integer describes how to partition the input tensor.
+
+    Returns:
+      An iterator of `Tensor`s or a list of partioned tensors.
+    """
+    self._check_input_partition_dims(tensor, dims)
+    return partition_or_replicate_on_host(tensor, dims)
diff --git a/tensorflow/python/tpu/tpu_function.py b/tensorflow/python/tpu/tpu_function.py
new file mode 100644
index 0000000000000000000000000000000000000000..422c7d3b26ffb4ad1b72450c4803ac2eb87cea3b
--- /dev/null
+++ b/tensorflow/python/tpu/tpu_function.py
@@ -0,0 +1,66 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Helper library for functions used during TPU compilation."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import contextlib
+
+
+class TpuContext(object):
+  """A context object holding state about the TPU computation being built."""
+
+  def __init__(self):
+    """Creates a new TpuContext."""
+    self._number_of_shards = None
+
+  @property
+  def number_of_shards(self):
+    return self._number_of_shards
+
+  def set_number_of_shards(self, number_of_shards):
+    self._number_of_shards = number_of_shards
+
+
+# The Tpu context holds the number of shards when a sharded computation is
+# being built, or None if no computation is being built.
+_current_tpu_context = TpuContext()
+
+
+@contextlib.contextmanager
+def tpu_shard_context(number_of_shards):
+  if _current_tpu_context.number_of_shards is not None:
+    raise NotImplementedError("tpu_shard_context cannot be nested.")
+  try:
+    _current_tpu_context.set_number_of_shards(number_of_shards)
+    yield
+  finally:
+    _current_tpu_context.set_number_of_shards(None)
+
+
+def get_tpu_context():
+  return _current_tpu_context
+
+
+# Decorator function for tpu computation func that was passed to tpu.rewrite()
+# if there is an embedded training loop in this func, trace tools will generate
+# step markers for each iteration.
+def on_device_training_loop(func):
+  # Value for this attribute is from xla.DebugOptions.StepMarkerLocation.
+  setattr(func, "step_marker_location", "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP")
+  return func
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_infeed_test.py b/tensorflow/python/tpu/tpu_infeed_test.py
similarity index 99%
rename from tensorflow/contrib/tpu/python/tpu/tpu_infeed_test.py
rename to tensorflow/python/tpu/tpu_infeed_test.py
index a41ff60d0af6c89fa9825d557aceefc9f6b8098d..3e90979157f891a989209fea4e56ff7090dde837 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_infeed_test.py
+++ b/tensorflow/python/tpu/tpu_infeed_test.py
@@ -19,11 +19,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.tpu.python.tpu import tpu_feed
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.platform import test
+from tensorflow.python.tpu import tpu_feed
 
 
 class InfeedTest(test.TestCase):
diff --git a/tensorflow/python/tpu/tpu_optimizer.py b/tensorflow/python/tpu/tpu_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..22c409eaa1cd4d499b72dbfbf429324d5f641e7c
--- /dev/null
+++ b/tensorflow/python/tpu/tpu_optimizer.py
@@ -0,0 +1,203 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Optimizer that implements cross-shard gradient reduction for TPU."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops.losses import losses
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.tpu import tpu_function
+from tensorflow.python.tpu.ops import tpu_ops
+from tensorflow.python.training import optimizer
+
+
+class CrossShardOptimizer(optimizer.Optimizer):
+  """An optimizer that averages gradients across TPU shards."""
+
+  def __init__(self,
+               opt,
+               reduction=losses.Reduction.MEAN,
+               name="CrossShardOptimizer",
+               group_assignment=None):
+    """Construct a new cross-shard optimizer.
+
+    Args:
+      opt: An existing `Optimizer` to encapsulate.
+      reduction: The reduction to apply to the shard losses.
+      name: Optional name prefix for the operations created when applying
+        gradients. Defaults to "CrossShardOptimizer".
+      group_assignment: Optional 2d int32 lists with shape
+        [num_groups, num_replicas_per_group] which describles how to apply
+        optimizer to subgroups.
+
+    Raises:
+      ValueError: If reduction is not a valid cross-shard reduction.
+    """
+    if reduction not in (losses.Reduction.SUM, losses.Reduction.MEAN):
+      raise ValueError("Unsupported reduction: %s." % reduction)
+
+    super(CrossShardOptimizer, self).__init__(False, name)
+    self._opt = opt
+    self._reduction = reduction
+    self._group_assignment = group_assignment
+
+  def _verify_and_get_subgroup_size(self, group_assignment, num_shards):
+    """Verify group_assignment and get the subgroup size".
+
+    Args:
+      group_assignment: list of group ids for applying the optimizer
+        to subgroups.
+      num_shards: The number of TPU shards.
+
+    Returns:
+      The size of one subgroup in group_assignment.
+
+    Raises:
+      ValueError: If group_assignment is invalid.
+    """
+    if not group_assignment:
+      return None
+    if not (isinstance(group_assignment, list) and
+            all(isinstance(i, list) for i in group_assignment)):
+      raise ValueError("group_assignment must be a list of list. Got {}".format(
+          group_assignment))
+
+    replica_ids = set()
+    for g in group_assignment:
+      for i in g:
+        replica_ids.add(i)
+
+    if set(range(num_shards)) != replica_ids:
+      raise ValueError("group_assignment must be a permutation of range({0})."
+                       " Got group_assignment={1}".format(
+                           num_shards, group_assignment))
+
+    subgroup_size_list = [len(group) for group in group_assignment]
+    if all(subgroup_size_list[0] == size for size in subgroup_size_list):
+      return subgroup_size_list[0]
+    else:
+      raise ValueError("The size of each subgroup in group_assignment must "
+                       "be equal. Got group_assignment={}".format(
+                           self._group_assignment))
+
+  def compute_gradients(self, loss, var_list=None, **kwargs):
+    """Compute gradients of "loss" for the variables in "var_list".
+
+    This simply wraps the compute_gradients() from the real optimizer. The
+    gradients will be aggregated in the apply_gradients() so that user can
+    modify the gradients like clipping with per replica global norm if needed.
+    The global norm with aggregated gradients can be bad as one replica's huge
+    gradients can hurt the gradients from other replicas.
+
+    Args:
+      loss: A Tensor containing the value to minimize.
+      var_list: Optional list or tuple of `tf.Variable` to update to minimize
+        `loss`.  Defaults to the list of variables collected in the graph
+        under the key `GraphKey.TRAINABLE_VARIABLES`.
+      **kwargs: Keyword arguments for compute_gradients().
+
+    Returns:
+      A list of (gradient, variable) pairs.
+
+    Raises:
+      ValueError: If not within a tpu_shard_context or group_assignment is
+        invalid.
+    """
+    num_shards = tpu_function.get_tpu_context().number_of_shards
+    if num_shards is None:
+      logging.warning(
+          "CrossShardOptimizer should be used within a tpu_shard_context, but "
+          "got unset number_of_shards. Assuming 1.")
+      num_shards = 1
+
+    subgroup_size = self._verify_and_get_subgroup_size(self._group_assignment,
+                                                       num_shards)
+
+    if num_shards > 1 and self._reduction == losses.Reduction.MEAN:
+      if self._group_assignment:
+        scale = 1.0 / subgroup_size
+      else:
+        scale = 1.0 / num_shards
+      loss *= scale
+
+    return self._opt.compute_gradients(loss, var_list=var_list, **kwargs)
+
+  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
+    """Apply gradients to variables.
+
+    Calls tpu_ops.cross_replica_sum() to sum gradient contributions across
+    replicas, and then applies the real optimizer.
+
+    Args:
+      grads_and_vars: List of (gradient, variable) pairs as returned by
+        compute_gradients().
+      global_step: Optional Variable to increment by one after the
+        variables have been updated.
+      name: Optional name for the returned operation.  Default to the
+        name passed to the Optimizer constructor.
+
+    Returns:
+      An `Operation` that applies the gradients. If `global_step` was not None,
+      that operation also increments `global_step`.
+
+    Raises:
+      ValueError: If the grads_and_vars is malformed.
+    """
+    summed_grads_and_vars = []
+    for (grad, var) in grads_and_vars:
+      if grad is None:
+        summed_grads_and_vars.append((grad, var))
+      else:
+        with ops.colocate_with(grad):
+          summed_grads_and_vars.append((tpu_ops.cross_replica_sum(
+              grad, self._group_assignment), var))
+    return self._opt.apply_gradients(summed_grads_and_vars, global_step, name)
+
+  def get_slot(self, *args, **kwargs):
+    """Return a slot named "name" created for "var" by the Optimizer.
+
+    This simply wraps the get_slot() from the actual optimizer.
+
+    Args:
+      *args: Arguments for get_slot().
+      **kwargs: Keyword arguments for get_slot().
+
+    Returns:
+      The `Variable` for the slot if it was created, `None` otherwise.
+    """
+    return self._opt.get_slot(*args, **kwargs)
+
+  def get_slot_names(self, *args, **kwargs):
+    """Return a list of the names of slots created by the `Optimizer`.
+
+    This simply wraps the get_slot_names() from the actual optimizer.
+
+    Args:
+      *args: Arguments for get_slot().
+      **kwargs: Keyword arguments for get_slot().
+
+    Returns:
+      A list of strings.
+    """
+    return self._opt.get_slot_names(*args, **kwargs)
+
+  def variables(self):
+    """Forwarding the variables from the underlying optimizer."""
+    return self._opt.variables()
diff --git a/tensorflow/python/tpu/tpu_sharding.py b/tensorflow/python/tpu/tpu_sharding.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5af03f33ca8f13af517007672e9ce0e12be6205
--- /dev/null
+++ b/tensorflow/python/tpu/tpu_sharding.py
@@ -0,0 +1,253 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Helper library for sharding during TPU compilation."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.python.framework import tensor_shape
+
+_DEFAULT_NUMBER_OF_SHARDS = 1
+_DEFAULT_SHARD_DIMENSION = 0
+
+
+# TODO(b/36777903) change other parts of tpu.py to use this class.
+class ShardingPolicy(object):
+  """An object use to hold the sharding policy for a Tensor.
+  """
+
+  def __init__(self):
+    self._number_of_shards = None
+    self._shard_dimension = None
+    self._frozen = False
+
+  def __str__(self):
+    if self.number_of_shards is None or self.shard_dimension is None:
+      return "ShardingPolicy(unset)"
+    else:
+      return ("ShardingPolicy(%d shards dimension %d)" %
+              (self.number_of_shards, self.shard_dimension))
+
+  def _fill_default_values(self):
+    if self._number_of_shards is None:
+      self._number_of_shards = _DEFAULT_NUMBER_OF_SHARDS
+    if self._shard_dimension is None:
+      self._shard_dimension = tensor_shape.as_dimension(
+          _DEFAULT_SHARD_DIMENSION)
+
+  def freeze(self):
+    """Prevents further modification to the sharding policy.
+
+    Any values that have not been set when freeze is called are set to
+    defaults. If the ShardingPolicy is already frozen, this is a NoOp.
+    """
+    if not self._frozen:
+      self._fill_default_values()
+      self._frozen = True
+
+  @property
+  def number_of_shards(self):
+    """Returns the number of shards in the policy or None if unspecified."""
+    return self._number_of_shards
+
+  def set_number_of_shards(self, number_of_shards):
+    """Sets the number of shards for the current policy.
+
+    If the policy has been frozen then number_of_shards must match the
+    existing setting.
+
+    Args:
+      number_of_shards: The number of shards to use in the policy.
+
+    Raises:
+      ValueError: If the policy has been frozen and number_of_shards
+        differs from the frozen value; or number_of_shards <= 0.
+    """
+    if self._frozen:
+      if self._number_of_shards != number_of_shards:
+        raise ValueError(
+            "Can't set sharding policy to use %d shards since it has been "
+            "frozen to use %d." % (number_of_shards, self._number_of_shards))
+    else:
+      if number_of_shards > 0:
+        self._number_of_shards = number_of_shards
+      else:
+        raise ValueError(
+            "Can't set sharding policy to use %s shards; value must be >0",
+            str(number_of_shards))
+
+  @property
+  def shard_dimension(self):
+    """Returns the shard dimension of the policy or None if unspecified."""
+    return self._shard_dimension
+
+  def set_shard_dimension(self, shard_dimension):
+    """Sets the shard dimension for the current policy.
+
+    If the policy has been frozen then shard_dimension must match the
+    existing setting.
+
+    Args:
+      shard_dimension: The shard dimension to use in the policy.
+
+    Raises:
+      ValueError: If the policy has been frozen and shard_dimension
+        differs from the frozen value, or shard_dimension can't be
+        interpreted as a Dimension.
+    """
+    if self._frozen:
+      if self._shard_dimension != shard_dimension:
+        raise ValueError(
+            "Can't set shard dimension to %d since it has been frozen to "
+            "use %d." % (shard_dimension, self._shard_dimension))
+    else:
+      self._shard_dimension = tensor_shape.as_dimension(shard_dimension)
+
+  def merge(self, other):
+    """Merges the policy of another policy into the current policy.
+
+    Args:
+      other: The policy to merge into this one.
+
+    Raises:
+      ValueError: If this policy has been frozen and the merge conflicts with
+      the frozen policy.
+    """
+    if other.number_of_shards is not None:
+      self.set_number_of_shards(other.number_of_shards)
+    if other.shard_dimension is not None:
+      self.set_shard_dimension(other.shard_dimension)
+
+  def get_sharded_shape(self, shape, shard_index=None):
+    """Returns the shape of a shard of a full Tensor.
+
+    When given the shape of a 'full-size' Tensor, returns the shape of
+    the sub-Tensor after it has been sharded. Freezes the policy if it
+    has not yet been frozen.
+
+    Args:
+      shape: The shape of the full-size Tensor to be sharded.
+      shard_index: The index of the shard whose shape should be returned.
+        shard_index can be None for sharding policies that use the same
+        shape for every shard.
+      freeze_config:
+
+    Returns:
+      The shape of the sharded version of the Tensor.
+
+    Raises:
+      ValueError: If shard_index is None when shards are of different
+        shapes; or shard_index is not None and
+        !(0<=shard_index<number_of_shards); or shape does not have at
+        least self.shard_dimension+1 dimensions; or the value of
+        shape's shard dimension is not a multiple of
+        self.number_of_shards
+    """
+    if self._shard_dimension is None or self._number_of_shards is None:
+      # Don't raise an error if the config is unset.
+      return None
+    if shard_index is not None:
+      if shard_index < 0 or shard_index >= self.number_of_shards:
+        raise ValueError("shard_index %d, but must be in [0,%d)." %
+                         (shard_index, self._number_of_shards))
+    shape = tensor_shape.as_shape(shape)
+    if self._number_of_shards == 1:
+      # Don't do anything when there's only one shard.
+      return shape
+    ndims = shape.ndims
+    if ndims is None:
+      raise ValueError("shape must be a specified shape not Unknown")
+    if ndims <= self._shard_dimension:
+      raise ValueError("shape %s does not contain shard_dimension %d" %
+                       (shape.as_list(), self._shard_dimension))
+    dims = shape.as_list()
+    if dims[self._shard_dimension] is None:
+      raise ValueError("shape %s must have a fixed size for dimension %d "
+                       "that is known at graph construction time." %
+                       (shape.as_list(), self._shard_dimension))
+    if (dims[self._shard_dimension] % self._number_of_shards) != 0:
+      raise ValueError("shape %s cannot be sharded %d ways along dimension %d" %
+                       (shape.as_list(), self._number_of_shards,
+                        self._shard_dimension))
+    dims[self._shard_dimension] /= self._number_of_shards
+    return tensor_shape.as_shape(dims)
+
+  def _unshard_shape(self, shape):
+    """Return the unsharded shape that would generate a given sharded shape.
+
+    Args:
+      shape: the sharded shape to unshard
+
+    Returns:
+      The unsharded shape.
+
+    Raises:
+      ValueError: if shape is unknown or does not contain
+        self.shard_dimension
+      TypeError: if shape is not convertible to a TensorShape
+    """
+    shape = tensor_shape.as_shape(shape)
+    if self._number_of_shards == 1:
+      # Don't do anything when there's only one shard.
+      return shape
+    ndims = shape.ndims
+    if ndims is None:
+      raise ValueError("shape must be a specified shape not Unknown")
+    if ndims <= self._shard_dimension:
+      raise ValueError("shape %s does not contain shard_dimension %d" %
+                       (shape.as_list(), self._shard_dimension))
+    dims = shape.as_list()
+    dims[self._shard_dimension] *= self._number_of_shards
+    return tensor_shape.as_shape(dims)
+
+  def get_unsharded_shape(self, shapes):
+    """Returns the shape of an unsharded Tensor given a list of shards.
+
+    When given a list of shapes of shards, returns the shape of the
+    unsharded Tensor that would generate the shards. Sets defaults for the
+    policy if number_of_shards or shard_dimension is None.
+
+    Args:
+      shapes: The shapes of the Tensor shards to be combined.
+
+    Returns:
+      The shape of the unsharded version of the Tensor.
+
+    Raises:
+      ValueError: if shapes is not a list of length
+        self.number_of_shards; or any element of shapes is not a valid
+        shape consistent with the sharding policy; or the list of
+        shapes is not a valid sharding of a full shape.
+      TypeError: if an element of shapes is not convertible to a
+        TensorShape
+    """
+    self._fill_default_values()
+    if len(shapes) != self.number_of_shards:
+      raise ValueError(
+          "shapes is %s but must be a list of length number_of_shards=%d" % (
+              str(shapes), self.number_of_shards))
+    unsharded_shapes = [self._unshard_shape(s) for s in shapes]
+    for i in xrange(self.number_of_shards - 1):
+      if not unsharded_shapes[i].is_compatible_with(
+          unsharded_shapes[self.number_of_shards - 1]):
+        raise ValueError(
+            "sharded shapes %s are not consistent shards of a full shape "
+            "sharded %d ways along dimension %d" % (
+                str(shapes), self.number_of_shards, self.shard_dimension))
+    return unsharded_shapes[0]
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_sharding_test.py b/tensorflow/python/tpu/tpu_sharding_test.py
similarity index 98%
rename from tensorflow/contrib/tpu/python/tpu/tpu_sharding_test.py
rename to tensorflow/python/tpu/tpu_sharding_test.py
index b0a5511d2d7683a5e0f527e49651df236c7a68d4..21d2a0897a0ff938359a4ca29e077678778ddc56 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_sharding_test.py
+++ b/tensorflow/python/tpu/tpu_sharding_test.py
@@ -19,10 +19,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.tpu.python.tpu import tpu_sharding
 
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.platform import test
+from tensorflow.python.tpu import tpu_sharding
 
 
 class ShardingTest(test.TestCase):
diff --git a/tensorflow/python/tpu/tpu_system_metadata.py b/tensorflow/python/tpu/tpu_system_metadata.py
new file mode 100644
index 0000000000000000000000000000000000000000..866895922a42e0db8c46f515a817a31fc54d3401
--- /dev/null
+++ b/tensorflow/python/tpu/tpu_system_metadata.py
@@ -0,0 +1,156 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===================================================================
+"""TPU system metadata and associated tooling."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import re
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.client import session as session_lib
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.tpu import tpu
+
+_PINGING_MASTER_TIMEOUT_IN_MS = 60 * 1000  # 1 min
+_RETRY_TIMES = 120
+_INITIAL_TPU_SYSTEM_TIMEOUT_IN_MS = 300 * 1000  # 5 mins
+
+_TPU_DEVICE_REG = re.compile(r'.*task:(\d+)/.*device:TPU:(\d+)$')
+
+# _TPUSystemMetadata is used by TPUEstimator to hold TPU configuration,
+# including num_cores and num_hosts.
+_TPUSystemMetadata = collections.namedtuple('_TPUSystemMetadata', [
+    'num_cores',
+    'num_hosts',
+    'num_of_cores_per_host',
+    'topology',
+    'devices',
+])
+
+
+def _query_tpu_system_metadata(master_address, cluster_def=None,
+                               query_topology=False):
+  """Automatically detects the TPU system metadata in the system."""
+  tpu_core_count = 0
+  devices = []
+  device_dict = collections.defaultdict(list)
+
+  # TODO(b/120564445): Replace with standard library for retries.
+  retry_count = 1
+  while True:
+    logging.info('Querying Tensorflow master (%s) for TPU system metadata.',
+                 master_address)
+    try:
+      with ops.Graph().as_default():
+        with session_lib.Session(
+            master_address,
+            config=get_session_config_with_timeout(
+                _PINGING_MASTER_TIMEOUT_IN_MS,
+                cluster_def)) as sess:
+          devices = sess.list_devices()
+          for device in devices:
+            match = _TPU_DEVICE_REG.match(device.name)
+            if match:
+              host_id = match.group(1)
+              core_id = match.group(2)
+              device_dict[host_id].append(core_id)
+              tpu_core_count += 1
+          break
+    except errors.DeadlineExceededError:
+      msg = ('Failed to connect to the Tensorflow master. The TPU worker may '
+             'not be ready (still scheduling) or the Tensorflow master address '
+             'is incorrect: got (%s).' %
+             (master_address))
+
+      # TODO(xiejw): For local or grpc master we might not need retry logic
+      # here.
+      if retry_count <= _RETRY_TIMES:
+        logging.warning('%s', msg)
+        logging.warning('Retrying (%d/%d).', retry_count, _RETRY_TIMES)
+        retry_count += 1
+      else:
+        raise ValueError(msg)
+
+  num_of_cores_per_host = 0
+  if tpu_core_count:
+    num_cores_per_host_set = set(
+        [len(core_ids) for core_ids in device_dict.values()])
+    if len(num_cores_per_host_set) != 1:
+      raise RuntimeError(
+          'TPU cores on each host is not same. This should not happen!. '
+          'devices: {}'.format(devices))
+    num_of_cores_per_host = num_cores_per_host_set.pop()
+
+  topology = None
+  if query_topology:
+    if not tpu_core_count:
+      raise RuntimeError(
+          'Cannot find any TPU cores in the system (master address {}). '
+          'This usually means the master address is incorrect or the '
+          'TPU worker has some problems. Available devices: {}'.format(
+              master_address, devices))
+
+    topology = _obtain_topology(master_address, cluster_def)
+
+  metadata = _TPUSystemMetadata(
+      num_cores=tpu_core_count,
+      num_hosts=len(device_dict),
+      num_of_cores_per_host=num_of_cores_per_host,
+      topology=topology,
+      devices=devices)
+
+  if tpu_core_count:
+    logging.info('Found TPU system:')
+    logging.info('*** Num TPU Cores: %d', metadata.num_cores)
+    logging.info('*** Num TPU Workers: %d', metadata.num_hosts)
+    logging.info('*** Num TPU Cores Per Worker: %d',
+                 metadata.num_of_cores_per_host)
+    for device in metadata.devices:
+      logging.info('*** Available Device: %s', device)
+  else:
+    logging.info('Failed to find TPU: %s', metadata)
+  return metadata
+
+
+def _obtain_topology(master_address, cluster_def):
+  """Obtains TPU fabric topology."""
+  try:
+    logging.info('Initializing TPU system (master: %s) to fetch topology '
+                 'for model parallelism. This might take a while.',
+                 master_address)
+    with ops.Graph().as_default():
+      session_config = get_session_config_with_timeout(
+          _INITIAL_TPU_SYSTEM_TIMEOUT_IN_MS, cluster_def)
+      with session_lib.Session(
+          master_address, config=session_config) as sess:
+        topology = sess.run(tpu.initialize_system())
+        return topology
+  except errors.DeadlineExceededError:
+    raise ValueError(
+        'Fail to initialize TPU system with master (%s). '
+        'Please double check the TPU system is functional.' % (
+            master_address))
+
+
+def get_session_config_with_timeout(timeout_in_secs, cluster_def):
+  """Returns a session given a timeout and a cluster configuration."""
+  config = config_pb2.ConfigProto(
+      operation_timeout_in_ms=timeout_in_secs, cluster_def=cluster_def)
+  return config
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_test.py b/tensorflow/python/tpu/tpu_test.py
similarity index 94%
rename from tensorflow/contrib/tpu/python/tpu/tpu_test.py
rename to tensorflow/python/tpu/tpu_test.py
index 6bdaa528f9f946ae4b9813d554409da2406b1f8d..69b03811daa7055e20471474767242788eecdc0a 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_test.py
+++ b/tensorflow/python/tpu/tpu_test.py
@@ -19,18 +19,16 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.tpu.python.tpu import tpu
-from tensorflow.contrib.tpu.python.tpu import tpu_feed
-from tensorflow.contrib.tpu.python.tpu import training_loop
-
 from tensorflow.python.framework import dtypes
 from tensorflow.python.layers import convolutional
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import math_ops
-
 from tensorflow.python.platform import test
+from tensorflow.python.tpu import tpu
+from tensorflow.python.tpu import tpu_feed
+from tensorflow.python.tpu import training_loop
 
 
 class TPUContextTest(test.TestCase):
diff --git a/tensorflow/python/tpu/training_loop.py b/tensorflow/python/tpu/training_loop.py
new file mode 100644
index 0000000000000000000000000000000000000000..cffeb7e915a06a513e96e1ed60beabf6b79b6518
--- /dev/null
+++ b/tensorflow/python/tpu/training_loop.py
@@ -0,0 +1,222 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Library for constructing a training loop, suitable for TPUs."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.tpu import tensor_tracer
+from tensorflow.python.tpu import tpu_function
+from tensorflow.python.tpu import xla
+
+
+def while_loop(condition, body, inputs=None, infeed_queue=None, name=None):
+  """Builds a training loop for TPUs.
+
+  The set of loop-carried tensors corresponds to `inputs`.  Both
+  `condition` and `body` take the current value of the loop-carried
+  tensors. 'body' additionally takes a tuple of infeed from
+  infeed_queue if infeed_queue is not None. `condition` must return a
+  single boolean value that determines whether iteration
+  continues. `body` must return an updated list of values for the
+  loop-carried tensors.
+
+  Args:
+    condition: a Python function that builds the loop condition.
+    body: a Python function that builds the loop body.
+    inputs: a list of initial values passed into the training loop, or
+      None (equivalent to an empty list).
+    infeed_queue: if not None, the infeed queue from which to append a tuple
+      of arguments as inputs to condition.
+    name: (Deprecated) Does nothing.
+
+  Returns:
+    The final values of the loop-carried tensors.
+
+  Raises:
+    TypeError: if body or condition has the wrong signature.
+  """
+  del name
+  # Converts inputs to Tensors.
+  inputs = [] if inputs is None else [ops.convert_to_tensor(x) for
+                                      x in inputs]
+  input_types = [x.dtype for x in inputs]
+  input_arity = len(inputs)
+
+  body_arg_error = xla.check_function_argument_count(
+      body, input_arity, infeed_queue)
+  if body_arg_error is not None:
+    if infeed_queue is None:
+      raise TypeError(
+          "Supplied loop body function cannot be called with the specified "
+          "inputs. You specified %d inputs: %s, but the loop body needs %s" % (
+              input_arity, str([i.name for i in inputs]), body_arg_error))
+    else:
+      raise TypeError(
+          "Supplied loop body function cannot be called with the specified "
+          "inputs. You specified %d inputs: %s and %d additional inputs from "
+          "infeed, but the computation needs %s" % (input_arity, str(
+              [i.name for i in inputs]), infeed_queue.number_of_tuple_elements,
+                                                    body_arg_error))
+  condition_arg_error = xla.check_function_argument_count(
+      condition, input_arity, None)
+  if condition_arg_error is not None:
+    if infeed_queue is None:
+      raise TypeError(
+          "Supplied loop condition function cannot be called with the "
+          "specified inputs. You specified %d inputs: %s, but the loop "
+          "condition needs %s" % (input_arity, str([i.name for i in inputs]),
+                                  condition_arg_error))
+    else:
+      raise TypeError(
+          "Supplied loop condition function cannot be called with the "
+          "specified inputs. You specified %d inputs: %s, but the loop "
+          "condition needs %s. Note that infeed is not passed to the loop "
+          "condition." % (input_arity, str([i.name for i in inputs]),
+                          condition_arg_error))
+
+  def condition_wrapper(*inputs):
+    # Discards the dummy output added for arity-0 loops.
+    if input_arity == 0:
+      inputs = []
+    return condition(*inputs)
+
+  def body_wrapper(*inputs):
+    """Wrapper around `body` that handles infeed queues and control deps."""
+    inputs = list(inputs)
+
+    # Discards the dummy output added for arity-0 loops.
+    if input_arity == 0:
+      inputs = []
+
+    # Runs `body` with the dequeue_ops appended.
+    if infeed_queue:
+      number_of_shards = tpu_function.get_tpu_context().number_of_shards
+      if number_of_shards is None:
+        raise ValueError("Can't build training loop with infeed when there is "
+                         "no tpu_shard_context. Are you building a loop or "
+                         "graph directly rather than from inside tpu.rewrite, "
+                         "tpu.batch_parallel, tpu.shard, or tpu.replicate?")
+      infeed_queue.set_number_of_shards(number_of_shards)
+      dequeue_ops = [d for d in infeed_queue.generate_dequeue_op()]
+    else:
+      dequeue_ops = []
+    outputs = body(*(inputs + dequeue_ops))
+
+    # If the computation only returned one value, make it a tuple.
+    if not isinstance(outputs, (list, tuple)):
+      outputs = (outputs,)
+
+    outputs = [
+        o if isinstance(o, ops.Operation) else ops.convert_to_tensor(o)
+        for o in outputs
+    ]
+
+    # Separates the returned Operations and Tensors.
+    output_operations = [o for o in outputs if isinstance(o, ops.Operation)]
+    output_tensors = [o for o in outputs
+                      if not isinstance(o, ops.Operation)]
+
+    if outputs != output_tensors + output_operations:
+      raise ValueError(
+          "TPU training loop body must return zero or more Tensor values "
+          "followed by zero or more Operations.")
+
+    output_types = [op.dtype for op in output_tensors]
+    if input_types != output_types:
+      raise TypeError(
+          "Mismatch between input types and output types for training loop "
+          "body: {} vs {}".format(input_types, output_types))
+
+    # Add the dequeue operations to output_operations to ensure they are run
+    # by the loop, even if the programmer's loop body does not use them.
+    output_operations += dequeue_ops
+
+    # Add a dummy output, if needed.
+    if not output_tensors:
+      output_tensors = array_ops.constant(0)
+
+    if output_operations:
+      # TODO(phawkins): in principle this is too restrictive since it serializes
+      # the training loop steps. In practice it does not matter since this loop
+      # will be compiled by XLA.
+      output_tensors = control_flow_ops.tuple(output_tensors,
+                                              control_inputs=output_operations)
+
+    if tensor_tracer.TensorTracer.is_enabled():
+      num_replicas = tpu_function.get_tpu_context().number_of_shards
+      if num_replicas is None:
+        num_replicas = 1
+      tt = tensor_tracer.TensorTracer()
+      output_tensors = tt.trace_tpu(ops.get_default_graph(),
+                                    output_tensors, None,
+                                    num_replicas)
+    return output_tensors
+
+  # If the body has arity 0, add a dummy loop-carried value to which we can add
+  # control dependencies from any side-effecting operations.
+  if input_arity == 0:
+    inputs = [array_ops.constant(0)]
+  return control_flow_ops.while_loop(
+      condition_wrapper, body_wrapper, inputs, name="", parallel_iterations=1)
+
+
+def repeat(n, body, inputs=None, infeed_queue=None, name=None):
+  """Builds a training loop that executes a fixed number of iterations.
+
+  The set of loop-carried tensors correspond to `inputs`.
+  `body` must be a function that takes and returns the values of the
+  loop-carried tensors.
+
+  Args:
+    n: the number of loop iterations
+    body: a Python function that builds the loop body.
+    inputs: a list of initial values passed into the training loop or
+      None (equivalent to an empty list).
+    infeed_queue: if not None, the infeed queue from which to append a tuple
+      of arguments as inputs to condition.
+    name: (Deprecated) Does nothing.
+  Returns:
+    The final values of the loop-carried tensors.
+  Raises:
+    ValueError: if there is a type error.
+  """
+  def _convert_to_list(xs):
+    if not isinstance(xs, (list, tuple)):
+      return [xs]
+    else:
+      return list(xs)
+
+  def cond(i, *args):
+    del args
+    return i < n
+
+  def body_wrapper(i, *args):
+    return [i + 1] + _convert_to_list(body(*args))
+
+  inputs = [0] if inputs is None else [0] + _convert_to_list(inputs)
+  outputs = while_loop(
+      cond, body_wrapper, inputs=inputs, infeed_queue=infeed_queue, name=name)
+  outputs = _convert_to_list(outputs)
+  if len(outputs) == 1:
+    # Returns the Op rather than an empty list.
+    return outputs[0].op
+  else:
+    return outputs[1:]
diff --git a/tensorflow/python/tpu/util.py b/tensorflow/python/tpu/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..dfb8ce1d1821da05c853bb0d10b1db3a857ccb1b
--- /dev/null
+++ b/tensorflow/python/tpu/util.py
@@ -0,0 +1,51 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===================================================================
+
+"""Utilities for the functionalities."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+import six
+
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import training
+
+def check_positive_integer(value, name):
+  """Checks whether `value` is a positive integer."""
+  if not isinstance(value, six.integer_types):
+    raise TypeError('{} must be int, got {}'.format(name, type(value)))
+
+  if value <= 0:
+    raise ValueError('{} must be positive, got {}'.format(name, value))
+
+
+# TODO(b/118302029) Remove this copy of MultiHostDatasetInitializerHook after we
+# release a tensorflow_estimator with MultiHostDatasetInitializerHook in
+# python/estimator/util.py.
+class MultiHostDatasetInitializerHook(training.SessionRunHook):
+  """Creates a SessionRunHook that initializes all passed iterators."""
+
+  def __init__(self, dataset_initializers):
+    self._initializers = dataset_initializers
+
+  def after_create_session(self, session, coord):
+    del coord
+    start = time.time()
+    session.run(self._initializers)
+    logging.info('Initialized dataset iterators in %d seconds',
+                 time.time() - start)
diff --git a/tensorflow/python/tpu/xla.py b/tensorflow/python/tpu/xla.py
new file mode 100644
index 0000000000000000000000000000000000000000..58476fae3d132aeeac7c23f176e23ea609478b15
--- /dev/null
+++ b/tensorflow/python/tpu/xla.py
@@ -0,0 +1,106 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""XLA utility functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+from tensorflow.python.util import tf_inspect
+
+
+def is_flat(outputs):
+  """Checks if outputs is a flat structure.
+
+    Following structures and values are considered flat:
+    1) None
+    2) A single object
+    3) A list or tuple of Tensors/Operations
+
+    The only structures that this function understands are sequences and
+    dictionaries.  E.g. this means that if outputs contains a single
+    user-defined Object, it is considered to be flat. Errors are raised later on
+    if that Object cannot be converted to a Tensor.
+
+  Args:
+    outputs: Output from `computation` inside `xla.compile`.
+
+  Returns:
+    A boolean indicates whether outputs is flat.
+  """
+  # If outputs is a list or tuple, check if it has any nested structure. If
+  # there is, then outputs is non-flat.
+  if isinstance(outputs, collections.Sequence):
+    for o in outputs:
+      if isinstance(o, collections.Sequence) or isinstance(o, dict):
+        return False
+
+  # If outputs is a dict, it is non-flat.
+  if isinstance(outputs, dict):
+    return False
+
+  # Getting here means either outputs itself is a single non-structured value
+  # or it is a flat list of single non-structured values.
+  return True
+
+
+def check_function_argument_count(func, input_arity, infeed_queue):
+  """Validate the number of input arguments to an XLA function.
+
+  Args:
+    func: the Python function that will be called to generate the body of an XLA
+      computation graph.
+    input_arity: the number of explicit arguments supplied by the caller.
+    infeed_queue: if not None, the infeed queue that will supply
+      additional arguments to the function.
+
+  Returns:
+    None if function can be called with the supplied number of
+      arguments, or an error string if it cannot.
+  """
+  def format_error(complaint, quantity):
+    return '%s %d argument%s' % (complaint, quantity, ''
+                                 if quantity == 1 else 's')
+
+  num_args_supplied = input_arity
+  if infeed_queue is not None:
+    num_args_supplied += infeed_queue.number_of_tuple_elements
+  arg_spec = tf_inspect.getargspec(func)
+  num_func_args = len(arg_spec.args)
+  if arg_spec.defaults is None:
+    num_func_defaults = 0
+  else:
+    num_func_defaults = len(arg_spec.defaults)
+  min_func_args = num_func_args - num_func_defaults
+  if num_args_supplied < min_func_args:
+    # The required number of arguments is not enough to call the function.
+    if num_func_defaults == 0 and arg_spec.varargs is None:
+      return format_error('exactly', num_func_args)
+    else:
+      return format_error('at least', min_func_args)
+  if arg_spec.varargs is None and num_args_supplied > num_func_args:
+    # The required number of arguments is too many to call the function.
+    if num_func_defaults == 0:
+      return format_error('exactly', num_func_args)
+    else:
+      return format_error('at most', num_func_args)
+  # Reaching here means either
+  # 1) There are varargs, func can accept any number of arguments greater than
+  # the minimum.
+  # 2) Number of supplied arguments falls in range of acceptable argument count
+  # of func.
+  return None
diff --git a/tensorflow/python/training/adagrad_test.py b/tensorflow/python/training/adagrad_test.py
index 1e2d29b337338985fb8ac27ab11d65667d22ee21..3528fdaa8b09b588d594d1aef61812a41c1ce373 100644
--- a/tensorflow/python/training/adagrad_test.py
+++ b/tensorflow/python/training/adagrad_test.py
@@ -106,7 +106,7 @@ class AdagradOptimizerTest(test.TestCase):
         pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
         loss = pred * pred
         sgd_op = adagrad.AdagradOptimizer(1.0).minimize(loss)
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
         # Fetch params to validate initial values
         self.assertAllCloseAccordingToType([[1.0, 2.0], [3.0, 4.0]],
                                            self.evaluate(var0))
@@ -129,7 +129,7 @@ class AdagradOptimizerTest(test.TestCase):
             constant_op.constant(3.0), initial_accumulator_value=0.1)
         ada_update = ada_opt.apply_gradients(
             zip([grads0, grads1], [var0, var1]))
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
         # Fetch params to validate initial values
         self.assertAllClose([1.0, 2.0], self.evaluate(var0))
         self.assertAllClose([3.0, 4.0], self.evaluate(var1))
@@ -163,7 +163,7 @@ class AdagradOptimizerTest(test.TestCase):
         ada_opt = adagrad.AdagradOptimizer(3.0, initial_accumulator_value=0.1)
         ada_update = ada_opt.apply_gradients(
             zip([grads0, grads1], [var0, var1]))
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
         # Fetch params to validate initial values
         self.assertAllClose([[1.0], [2.0]], self.evaluate(var0))
         self.assertAllClose([[3.0], [4.0]], self.evaluate(var1))
@@ -198,7 +198,7 @@ class AdagradOptimizerTest(test.TestCase):
             [(grad_repeated_index, repeated_index_update_var)])
         aggregated_update = adagrad.AdagradOptimizer(3.0).apply_gradients(
             [(grad_aggregated, aggregated_update_var)])
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
         self.assertAllClose(aggregated_update_var.eval(),
                             self.evaluate(repeated_index_update_var))
         for _ in range(3):
@@ -223,7 +223,7 @@ class AdagradOptimizerTest(test.TestCase):
             2.0).minimize(loss_repeated)
         update_op_aggregated = adagrad.AdagradOptimizer(
             2.0).minimize(loss_aggregated)
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
         self.assertAllCloseAccordingToType(
             self.evaluate(var_repeated), self.evaluate(var_aggregated))
         for _ in range(3):
@@ -289,7 +289,7 @@ class AdagradOptimizerTest(test.TestCase):
         self.assertEquals(slot0.get_shape(), var0.get_shape())
         slot1 = ada_opt.get_slot(var1, "accumulator")
         self.assertEquals(slot1.get_shape(), var1.get_shape())
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
 
         # Fetch params to validate initial values.
         self.assertAllClose([1.0, 2.0], self.evaluate(var0))
diff --git a/tensorflow/python/training/adam.py b/tensorflow/python/training/adam.py
index 0c701f47122caf7ae561ddfa84b98925226930e0..46ec3be54ec6851bd096d59f7298b6202608c53b 100644
--- a/tensorflow/python/training/adam.py
+++ b/tensorflow/python/training/adam.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Adam for TensorFlow."""
 from __future__ import absolute_import
 from __future__ import division
@@ -37,9 +36,14 @@ class AdamOptimizer(optimizer.Optimizer):
   ([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
   """
 
-  def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
-               use_locking=False, name="Adam"):
-    """Construct a new Adam optimizer.
+  def __init__(self,
+               learning_rate=0.001,
+               beta1=0.9,
+               beta2=0.999,
+               epsilon=1e-8,
+               use_locking=False,
+               name="Adam"):
+    r"""Construct a new Adam optimizer.
 
     Initialization:
 
@@ -48,7 +52,7 @@ class AdamOptimizer(optimizer.Optimizer):
     $$t := 0 \text{(Initialize timestep)}$$
 
     The update rule for `variable` with gradient `g` uses an optimization
-    described at the end of section2 of the paper:
+    described at the end of section 2 of the paper:
 
     $$t := t + 1$$
     $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
@@ -75,23 +79,20 @@ class AdamOptimizer(optimizer.Optimizer):
 
     Args:
       learning_rate: A Tensor or a floating point value.  The learning rate.
-      beta1: A float value or a constant float tensor.
-        The exponential decay rate for the 1st moment estimates.
-      beta2: A float value or a constant float tensor.
-        The exponential decay rate for the 2nd moment estimates.
+      beta1: A float value or a constant float tensor. The exponential decay
+        rate for the 1st moment estimates.
+      beta2: A float value or a constant float tensor. The exponential decay
+        rate for the 2nd moment estimates.
       epsilon: A small constant for numerical stability. This epsilon is
         "epsilon hat" in the Kingma and Ba paper (in the formula just before
         Section 2.1), not the epsilon in Algorithm 1 of the paper.
       use_locking: If True use locks for update operations.
       name: Optional name for the operations created when applying gradients.
-        Defaults to "Adam".
-
-    @compatibility(eager)
-    When eager execution is enabled, `learning_rate`, `beta1`, `beta2`, and
-    `epsilon` can each be a callable that takes no arguments and returns the
-    actual value to use. This can be useful for changing these values across
-    different invocations of optimizer functions.
-    @end_compatibility
+        Defaults to "Adam".  @compatibility(eager) When eager execution is
+        enabled, `learning_rate`, `beta1`, `beta2`, and `epsilon` can each be a
+        callable that takes no arguments and returns the actual value to use.
+        This can be useful for changing these values across different
+        invocations of optimizer functions. @end_compatibility
     """
     super(AdamOptimizer, self).__init__(use_locking, name)
     self._lr = learning_rate
@@ -105,9 +106,6 @@ class AdamOptimizer(optimizer.Optimizer):
     self._beta2_t = None
     self._epsilon_t = None
 
-    # Created in SparseApply if needed.
-    self._updated_lr = None
-
   def _get_beta_accumulators(self):
     with ops.init_scope():
       if context.executing_eagerly():
@@ -123,12 +121,10 @@ class AdamOptimizer(optimizer.Optimizer):
     # workers (these need to go on the same PS, otherwise some updates are
     # silently ignored).
     first_var = min(var_list, key=lambda x: x.name)
-    self._create_non_slot_variable(initial_value=self._beta1,
-                                   name="beta1_power",
-                                   colocate_with=first_var)
-    self._create_non_slot_variable(initial_value=self._beta2,
-                                   name="beta2_power",
-                                   colocate_with=first_var)
+    self._create_non_slot_variable(
+        initial_value=self._beta1, name="beta1_power", colocate_with=first_var)
+    self._create_non_slot_variable(
+        initial_value=self._beta2, name="beta2_power", colocate_with=first_var)
 
     # Create slots for the first and second moments.
     for v in var_list:
@@ -151,28 +147,34 @@ class AdamOptimizer(optimizer.Optimizer):
     v = self.get_slot(var, "v")
     beta1_power, beta2_power = self._get_beta_accumulators()
     return training_ops.apply_adam(
-        var, m, v,
+        var,
+        m,
+        v,
         math_ops.cast(beta1_power, var.dtype.base_dtype),
         math_ops.cast(beta2_power, var.dtype.base_dtype),
         math_ops.cast(self._lr_t, var.dtype.base_dtype),
         math_ops.cast(self._beta1_t, var.dtype.base_dtype),
         math_ops.cast(self._beta2_t, var.dtype.base_dtype),
         math_ops.cast(self._epsilon_t, var.dtype.base_dtype),
-        grad, use_locking=self._use_locking).op
+        grad,
+        use_locking=self._use_locking).op
 
   def _resource_apply_dense(self, grad, var):
     m = self.get_slot(var, "m")
     v = self.get_slot(var, "v")
     beta1_power, beta2_power = self._get_beta_accumulators()
     return training_ops.resource_apply_adam(
-        var.handle, m.handle, v.handle,
+        var.handle,
+        m.handle,
+        v.handle,
         math_ops.cast(beta1_power, grad.dtype.base_dtype),
         math_ops.cast(beta2_power, grad.dtype.base_dtype),
         math_ops.cast(self._lr_t, grad.dtype.base_dtype),
         math_ops.cast(self._beta1_t, grad.dtype.base_dtype),
         math_ops.cast(self._beta2_t, grad.dtype.base_dtype),
         math_ops.cast(self._epsilon_t, grad.dtype.base_dtype),
-        grad, use_locking=self._use_locking)
+        grad,
+        use_locking=self._use_locking)
 
   def _apply_sparse_shared(self, grad, var, indices, scatter_add):
     beta1_power, beta2_power = self._get_beta_accumulators()
@@ -186,8 +188,7 @@ class AdamOptimizer(optimizer.Optimizer):
     # m_t = beta1 * m + (1 - beta1) * g_t
     m = self.get_slot(var, "m")
     m_scaled_g_values = grad * (1 - beta1_t)
-    m_t = state_ops.assign(m, m * beta1_t,
-                           use_locking=self._use_locking)
+    m_t = state_ops.assign(m, m * beta1_t, use_locking=self._use_locking)
     with ops.control_dependencies([m_t]):
       m_t = scatter_add(m, indices, m_scaled_g_values)
     # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
@@ -197,26 +198,29 @@ class AdamOptimizer(optimizer.Optimizer):
     with ops.control_dependencies([v_t]):
       v_t = scatter_add(v, indices, v_scaled_g_values)
     v_sqrt = math_ops.sqrt(v_t)
-    var_update = state_ops.assign_sub(var,
-                                      lr * m_t / (v_sqrt + epsilon_t),
-                                      use_locking=self._use_locking)
+    var_update = state_ops.assign_sub(
+        var, lr * m_t / (v_sqrt + epsilon_t), use_locking=self._use_locking)
     return control_flow_ops.group(*[var_update, m_t, v_t])
 
   def _apply_sparse(self, grad, var):
     return self._apply_sparse_shared(
-        grad.values, var, grad.indices,
+        grad.values,
+        var,
+        grad.indices,
         lambda x, i, v: state_ops.scatter_add(  # pylint: disable=g-long-lambda
-            x, i, v, use_locking=self._use_locking))
+            x,
+            i,
+            v,
+            use_locking=self._use_locking))
 
   def _resource_scatter_add(self, x, i, v):
     with ops.control_dependencies(
-        [resource_variable_ops.resource_scatter_add(
-            x.handle, i, v)]):
+        [resource_variable_ops.resource_scatter_add(x.handle, i, v)]):
       return x.value()
 
   def _resource_apply_sparse(self, grad, var, indices):
-    return self._apply_sparse_shared(
-        grad, var, indices, self._resource_scatter_add)
+    return self._apply_sparse_shared(grad, var, indices,
+                                     self._resource_scatter_add)
 
   def _finish(self, update_ops, name_scope):
     # Update the power accumulators.
@@ -227,5 +231,5 @@ class AdamOptimizer(optimizer.Optimizer):
             beta1_power * self._beta1_t, use_locking=self._use_locking)
         update_beta2 = beta2_power.assign(
             beta2_power * self._beta2_t, use_locking=self._use_locking)
-    return control_flow_ops.group(*update_ops + [update_beta1, update_beta2],
-                                  name=name_scope)
+    return control_flow_ops.group(
+        *update_ops + [update_beta1, update_beta2], name=name_scope)
diff --git a/tensorflow/python/training/adam_test.py b/tensorflow/python/training/adam_test.py
index b0bae275773cf05b4e6233706b60f60ca13c9ac0..15958112bd8ca25a5dc434f0630da0c6685f130c 100644
--- a/tensorflow/python/training/adam_test.py
+++ b/tensorflow/python/training/adam_test.py
@@ -68,8 +68,8 @@ class AdamOptimizerTest(test.TestCase):
           var0 = resource_variable_ops.ResourceVariable(var0_np)
           var1 = resource_variable_ops.ResourceVariable(var1_np)
         else:
-          var0 = variables.Variable(var0_np)
-          var1 = variables.Variable(var1_np)
+          var0 = variables.RefVariable(var0_np)
+          var1 = variables.RefVariable(var1_np)
         grads0_np_indices = np.array([0, 1], dtype=np.int32)
         grads0 = ops.IndexedSlices(
             constant_op.constant(grads0_np),
@@ -156,6 +156,9 @@ class AdamOptimizerTest(test.TestCase):
                               self.evaluate(repeated_index_update_var))
 
   def doTestBasic(self, use_resource=False, use_callable_params=False):
+    if context.executing_eagerly() and not use_resource:
+      self.skipTest(
+          "Skipping test with use_resource=False and executing eagerly.")
     for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
       with self.session(graph=ops.Graph()):
         # Initialize variables for numpy implementation.
@@ -171,8 +174,8 @@ class AdamOptimizerTest(test.TestCase):
           var1 = resource_variable_ops.ResourceVariable(
               var1_np, name="var1_%d" % i)
         else:
-          var0 = variables.Variable(var0_np)
-          var1 = variables.Variable(var1_np)
+          var0 = variables.RefVariable(var0_np)
+          var1 = variables.RefVariable(var1_np)
         grads0 = constant_op.constant(grads0_np)
         grads1 = constant_op.constant(grads1_np)
 
@@ -194,6 +197,14 @@ class AdamOptimizerTest(test.TestCase):
         self.assertTrue(beta2_power is not None)
         self.assertIn(beta1_power, opt_variables)
         self.assertIn(beta2_power, opt_variables)
+        # Ensure that non-slot variables are the same type as the requested
+        # variables.
+        self.assertEqual(
+            use_resource,
+            resource_variable_ops.is_resource_variable(beta1_power))
+        self.assertEqual(
+            use_resource,
+            resource_variable_ops.is_resource_variable(beta2_power))
 
         if not context.executing_eagerly():
           with ops.Graph().as_default():
diff --git a/tensorflow/python/training/basic_session_run_hooks_test.py b/tensorflow/python/training/basic_session_run_hooks_test.py
index 1af27626ba764b0bf4a2787e492983a72c1491e9..55ef162eb1516fadc3d6ceaeb5ef44caee175b88 100644
--- a/tensorflow/python/training/basic_session_run_hooks_test.py
+++ b/tensorflow/python/training/basic_session_run_hooks_test.py
@@ -1122,7 +1122,7 @@ class StepCounterHookTest(test.TestCase):
         self.assertGreater(summary_value.simple_value, 0)
 
 
-@test_util.run_v1_only('b/120545219')
+@test_util.run_deprecated_v1
 class SummarySaverHookTest(test.TestCase):
 
   def setUp(self):
@@ -1404,7 +1404,7 @@ class FinalOpsHookTest(test.TestCase):
                              hook.final_ops_values.tolist())
 
 
-@test_util.run_v1_only('b/120545219')
+@test_util.run_deprecated_v1
 class ResourceSummarySaverHookTest(test.TestCase):
 
   def setUp(self):
diff --git a/tensorflow/python/training/checkpoint_management.py b/tensorflow/python/training/checkpoint_management.py
index f745ab4824ac364b51758e6c3fb60a5679d210fb..21fa6b3b5d3f8c306f0116f4d21940164c28b104 100644
--- a/tensorflow/python/training/checkpoint_management.py
+++ b/tensorflow/python/training/checkpoint_management.py
@@ -56,10 +56,6 @@ def _GetCheckpointFilename(save_dir, latest_filename):
   return os.path.join(save_dir, latest_filename)
 
 
-@deprecation.deprecated(
-    date=None,
-    instructions=("Use tf.train.CheckpointManager to manage checkpoints rather "
-                  "than editing the Checkpoint proto manually."))
 @tf_export(v1=["train.generate_checkpoint_state_proto"])
 def generate_checkpoint_state_proto(save_dir,
                                     model_checkpoint_path,
@@ -625,7 +621,8 @@ class CheckpointManager(object):
                >= self._last_preserved_timestamp)):
         self._last_preserved_timestamp = timestamp
         continue
-      remove_checkpoint(filename)
+      _delete_file_if_exists(filename + ".index")
+      _delete_file_if_exists(filename + ".data-?????-of-?????")
 
   def _record_state(self):
     """Saves the `CheckpointManager`'s state in `directory`."""
diff --git a/tensorflow/python/training/checkpoint_management_test.py b/tensorflow/python/training/checkpoint_management_test.py
index 8606ec4a206ffbce85cf4071934deeb5a545b055..053298d1a592df821cd56e15d9026f6386f0e502 100644
--- a/tensorflow/python/training/checkpoint_management_test.py
+++ b/tensorflow/python/training/checkpoint_management_test.py
@@ -38,7 +38,7 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_management
 from tensorflow.python.training import saver as saver_module
 from tensorflow.python.training.checkpoint_state_pb2 import CheckpointState
-from tensorflow.python.training.checkpointable import util
+from tensorflow.python.training.tracking import util
 
 
 class LatestCheckpointWithRelativePaths(test.TestCase):
diff --git a/tensorflow/python/training/checkpoint_ops_test.py b/tensorflow/python/training/checkpoint_ops_test.py
index c48154713929b91050e070051add9fee7c428805..a0fd2dc6bae9b4a3376dffc67355de289e59c00e 100644
--- a/tensorflow/python/training/checkpoint_ops_test.py
+++ b/tensorflow/python/training/checkpoint_ops_test.py
@@ -154,7 +154,7 @@ class LoadAndRemapWrappersTest(test.TestCase):
         partitioner=partitioned_variables.fixed_size_partitioner(2))
 
     with self.cached_session():
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllClose(expected_remapped_matrix,
                           remapped_matrix.as_tensor().eval())
 
@@ -188,7 +188,7 @@ class LoadAndRemapWrappersTest(test.TestCase):
         partitioner=partitioned_variables.fixed_size_partitioner(2))
 
     with self.cached_session():
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllClose(expected_remapped_matrix,
                           remapped_matrix.as_tensor().eval())
 
@@ -226,7 +226,7 @@ class LoadAndRemapWrappersTest(test.TestCase):
         partitioner=partitioned_variables.fixed_size_partitioner(2))
 
     with self.cached_session():
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllClose(expected_remapped_matrix,
                           remapped_matrix.as_tensor().eval())
 
@@ -262,7 +262,7 @@ class LoadAndRemapWrappersTest(test.TestCase):
         partitioner=partitioned_variables.fixed_size_partitioner(2))
 
     with self.cached_session():
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllClose(expected_remapped_matrix,
                           remapped_matrix.as_tensor().eval())
 
@@ -296,7 +296,7 @@ class LoadAndRemapWrappersTest(test.TestCase):
         partitioner=partitioned_variables.fixed_size_partitioner(2))
 
     with self.cached_session():
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllClose(expected_remapped_embeddings,
                           remapped_embeddings.as_tensor().eval())
 
@@ -342,7 +342,7 @@ class LoadAndRemapWrappersTest(test.TestCase):
         partitioner=partitioned_variables.fixed_size_partitioner(2))
 
     with self.cached_session():
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllClose(expected_remapped_embeddings,
                           remapped_embeddings.as_tensor().eval())
 
@@ -380,7 +380,7 @@ class LoadAndRemapWrappersTest(test.TestCase):
         partitioner=partitioned_variables.fixed_size_partitioner(2))
 
     with self.cached_session():
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllClose(expected_remapped_embeddings,
                           remapped_embeddings.as_tensor().eval())
 
diff --git a/tensorflow/python/training/checkpoint_utils.py b/tensorflow/python/training/checkpoint_utils.py
index 74b46179e75423b530191cce5a52034879712eaa..5e18f4b722b402a892125903ac82bf5991c385cd 100644
--- a/tensorflow/python/training/checkpoint_utils.py
+++ b/tensorflow/python/training/checkpoint_utils.py
@@ -180,8 +180,8 @@ def init_from_checkpoint(ckpt_dir_or_file, assignment_map):
       (in default graph).
 
   Raises:
-    tf.errors.OpError: If missing checkpoints or tensors in checkpoints.
-    ValueError: If missing variables in current graph.
+    ValueError: If missing variables in current graph, or if missing
+      checkpoints or tensors in checkpoints.
   """
   if distribution_strategy_context.get_cross_replica_context():
     _init_from_checkpoint(None, ckpt_dir_or_file, assignment_map)
diff --git a/tensorflow/python/training/evaluation.py b/tensorflow/python/training/evaluation.py
index a10178f8cfe3af1ac45a5084b8e16abe1beee267..37d46795b16cb4b4ed5ce2b4f5cf9b17cdcafab3 100644
--- a/tensorflow/python/training/evaluation.py
+++ b/tensorflow/python/training/evaluation.py
@@ -253,7 +253,7 @@ def _evaluate_once(checkpoint_path,
       if isinstance(h, (_StopAfterNEvalsHook, _MultiStepStopAfterNEvalsHook)):
         h._set_evals_completed_tensor(eval_step_value)  # pylint: disable=protected-access
 
-  logging.info('Starting evaluation at ' + time.strftime('%Y-%m-%d-%H:%M:%S',
+  logging.info('Starting evaluation at ' + time.strftime('%Y-%m-%dT%H:%M:%SZ',
                                                          time.gmtime()))
 
   # Prepare the session creator.
diff --git a/tensorflow/python/training/evaluation_test.py b/tensorflow/python/training/evaluation_test.py
index 3de4ceda759d927aaf743a0aa0159c50b0dbefb7..690c97e3db196ddeb5a212e3b254cf6c01907789 100644
--- a/tensorflow/python/training/evaluation_test.py
+++ b/tensorflow/python/training/evaluation_test.py
@@ -26,10 +26,10 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
+from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.layers import layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import metrics
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops.losses import losses
@@ -117,16 +117,18 @@ class EvaluateOnceTest(test.TestCase):
     logits = logistic_classifier(inputs)
     predictions = math_ops.round(logits)
 
-    accuracy, update_op = metrics.accuracy(
-        predictions=predictions, labels=labels)
+    accuracy = metrics_module.Accuracy()
+    update_op = accuracy.update_state(labels, predictions)
 
     checkpoint_path = saver.latest_checkpoint(checkpoint_dir)
 
     final_ops_values = evaluation._evaluate_once(
         checkpoint_path=checkpoint_path,
         eval_ops=update_op,
-        final_ops={'accuracy': accuracy},
-        hooks=[evaluation._StopAfterNEvalsHook(1),])
+        final_ops={'accuracy': (accuracy.result(), update_op)},
+        hooks=[
+            evaluation._StopAfterNEvalsHook(1),
+        ])
     self.assertTrue(final_ops_values['accuracy'] > .99)
 
   def testEvaluateWithFiniteInputs(self):
@@ -148,17 +150,21 @@ class EvaluateOnceTest(test.TestCase):
     logits = logistic_classifier(inputs)
     predictions = math_ops.round(logits)
 
-    accuracy, update_op = metrics.accuracy(
-        predictions=predictions, labels=labels)
+    accuracy = metrics_module.Accuracy()
+    update_op = accuracy.update_state(labels, predictions)
 
     checkpoint_path = saver.latest_checkpoint(checkpoint_dir)
 
     final_ops_values = evaluation._evaluate_once(
         checkpoint_path=checkpoint_path,
         eval_ops=update_op,
-        final_ops={'accuracy': accuracy,
-                   'eval_steps': evaluation._get_or_create_eval_step()},
-        hooks=[evaluation._StopAfterNEvalsHook(None),])
+        final_ops={
+            'accuracy': (accuracy.result(), update_op),
+            'eval_steps': evaluation._get_or_create_eval_step()
+        },
+        hooks=[
+            evaluation._StopAfterNEvalsHook(None),
+        ])
     self.assertTrue(final_ops_values['accuracy'] > .99)
     # Runs evaluation for 4 iterations. First 2 evaluate full batch of 6 inputs
     # each; the 3rd iter evaluates the remaining 4 inputs, and the last one
diff --git a/tensorflow/python/training/input_test.py b/tensorflow/python/training/input_test.py
index d89f5f3bbd879a32ab55cf70e366c5c82ef0f266..5efc15d56f9530569b98a9cde975d74de1f110ef 100644
--- a/tensorflow/python/training/input_test.py
+++ b/tensorflow/python/training/input_test.py
@@ -58,7 +58,7 @@ class MatchFilenamesOnceTest(test_lib.TestCase):
       question = inp.match_filenames_once(
           os.path.join(self.get_temp_dir(), "match_filenames.?"))
       one = inp.match_filenames_once(additional[1])
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       self.assertItemsEqual(
           map(compat.as_bytes, filenames), self.evaluate(star))
@@ -84,7 +84,7 @@ class LimitEpochsTest(test_lib.TestCase):
     with self.cached_session():
       love_me = constant_op.constant("Love Me")
       love_me_two_times = inp.limit_epochs(love_me, num_epochs=2)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       self.assertEqual(b"Love Me", self.evaluate(love_me_two_times))
       self.assertEqual(b"Love Me", self.evaluate(love_me_two_times))
@@ -105,7 +105,7 @@ class InputProducerTest(test_lib.TestCase):
           input_tensor, num_epochs=num_epochs, shuffle=False)
       dequeue_many = queue.dequeue_many(len(input_tensor) * num_epochs)
       dequeue = queue.dequeue()
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -132,7 +132,7 @@ class InputProducerTest(test_lib.TestCase):
           input_tensor, element_shape=[4], num_epochs=num_epochs, shuffle=False)
       dequeue_many = queue.dequeue_many(len(input_value) * num_epochs)
       dequeue = queue.dequeue()
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -163,7 +163,7 @@ class StringInputProducerTest(test_lib.TestCase):
           strings, num_epochs=num_epochs, shuffle=False)
       dequeue_many = queue.dequeue_many(len(strings) * num_epochs)
       dequeue = queue.dequeue()
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -186,7 +186,7 @@ class StringInputProducerTest(test_lib.TestCase):
           strings, num_epochs=num_epochs, shuffle=True, seed=271828)
       dequeue_many = queue.dequeue_many(len(strings))
       dequeue = queue.dequeue()
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -234,7 +234,7 @@ class StringInputProducerTest(test_lib.TestCase):
           constant_op.constant(
               [], dtype=dtypes.string))
       dequeue = queue.dequeue()
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners(coord=coord)
       with self.assertRaises(errors_impl.OutOfRangeError):
@@ -284,7 +284,7 @@ class RangeInputProducerTest(test_lib.TestCase):
           range_size, num_epochs=num_epochs, shuffle=False)
       dequeue_many = queue.dequeue_many(range_size * num_epochs)
       dequeue = queue.dequeue()
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -307,7 +307,7 @@ class RangeInputProducerTest(test_lib.TestCase):
           range_size, num_epochs=num_epochs, shuffle=True, seed=314159)
       dequeue_many = queue.dequeue_many(range_size)
       dequeue = queue.dequeue()
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -358,7 +358,7 @@ class SliceInputProducerTest(test_lib.TestCase):
       source_ints = [2, 3, 5, 7]
       slices = inp.slice_input_producer(
           [source_strings, source_ints], num_epochs=num_epochs, shuffle=False)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -386,7 +386,7 @@ class SliceInputProducerTest(test_lib.TestCase):
           num_epochs=num_epochs,
           shuffle=True,
           seed=161803)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -487,7 +487,7 @@ class BatchTest(test_lib.TestCase):
         batched = inp.batch(
             [counter, sparse_counter, "string"], batch_size=batch_size)
         batched_fetch = batched
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -555,7 +555,7 @@ class BatchTest(test_lib.TestCase):
       counter = examples.count_up_to(num_batches * batch_size)
       string = array_ops.tile(["string"],
                               math_ops.to_int32(array_ops.stack([counter])))
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       batched = inp.batch(
           [counter, string], batch_size=batch_size, dynamic_pad=True)
@@ -590,7 +590,7 @@ class BatchTest(test_lib.TestCase):
           dense_shape=[1])
       pre_batched = inp.batch([counter, sparse_counter, "string"], batch_size=2)
       batched = inp.batch(pre_batched, enqueue_many=True, batch_size=batch_size)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -629,7 +629,7 @@ class BatchTest(test_lib.TestCase):
           [counter, sparse_counter, "string"],
           batch_size=batch_size,
           num_threads=4)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -672,7 +672,7 @@ class BatchTest(test_lib.TestCase):
           [counter, sparse_counter, "string"],
           batch_size=batch_size,
           allow_smaller_final_batch=True)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -730,7 +730,7 @@ class BatchTest(test_lib.TestCase):
           batch_size=batch_size,
           num_threads=4,
           allow_smaller_final_batch=True)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -1058,7 +1058,7 @@ class BatchJoinTest(test_lib.TestCase):
                           batched_fetch[1].dense_shape.get_shape().as_list())
       self.assertAllEqual((batch_size,), batched_fetch[2].get_shape().as_list())
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -1157,7 +1157,7 @@ class BatchJoinTest(test_lib.TestCase):
       self.assertAllEqual((batch_size,), batched[0].get_shape().as_list())
       self.assertAllEqual((batch_size, None), batched[1].get_shape().as_list())
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -1244,7 +1244,7 @@ class BatchJoinTest(test_lib.TestCase):
       self.assertAllEqual((2,), batched[1].dense_shape.get_shape().as_list())
       self.assertAllEqual((None,), batched[2].get_shape().as_list())
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -1339,7 +1339,7 @@ class BatchJoinTest(test_lib.TestCase):
       self.assertAllEqual((None,), batched[0].get_shape().as_list())
       self.assertAllEqual((None, None), batched[1].get_shape().as_list())
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -1644,7 +1644,7 @@ class ShuffleBatchTest(test_lib.TestCase):
             min_after_dequeue=16,
             seed=141421)
         batched_fetch = batched
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -1702,7 +1702,7 @@ class ShuffleBatchTest(test_lib.TestCase):
           seed=141421,
           allow_smaller_final_batch=True)
       batched_fetch = batched
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -1756,7 +1756,7 @@ class ShuffleBatchTest(test_lib.TestCase):
           min_after_dequeue=16,
           seed=173205,
           num_threads=4)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -1807,7 +1807,7 @@ class ShuffleBatchTest(test_lib.TestCase):
           seed=173205,
           num_threads=4,
           allow_smaller_final_batch=True)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -2070,7 +2070,7 @@ class ShuffleBatchJoinTest(test_lib.TestCase):
                           batched_fetch[1].dense_shape.get_shape().as_list())
       self.assertAllEqual((batch_size,), batched_fetch[2].get_shape().as_list())
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
@@ -2165,7 +2165,7 @@ class ShuffleBatchJoinTest(test_lib.TestCase):
       self.assertAllEqual((2,), batched[1].dense_shape.get_shape().as_list())
       self.assertAllEqual((None,), batched[2].get_shape().as_list())
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners()
 
diff --git a/tensorflow/python/training/learning_rate_decay.py b/tensorflow/python/training/learning_rate_decay.py
index c52e89db1f47eb303b7160cef77c01bcb46aebba..ab9d923bedc721413a2120fc5be3ce302fef4e0f 100644
--- a/tensorflow/python/training/learning_rate_decay.py
+++ b/tensorflow/python/training/learning_rate_decay.py
@@ -17,8 +17,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
+
 from tensorflow.python.eager import context
-from tensorflow.python.training import learning_rate_decay_v2
+from tensorflow.python.keras.optimizer_v2 import learning_rate_schedule
+from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -88,15 +91,15 @@ def exponential_decay(learning_rate,
   the learning rate value across different invocations of optimizer functions.
   @end_compatibility
   """
-  decayed_lr = learning_rate_decay_v2.exponential_decay(learning_rate,
-                                                        global_step,
-                                                        decay_steps,
-                                                        decay_rate,
-                                                        staircase=staircase,
-                                                        name=name)
+  decayed_lr = learning_rate_schedule.ExponentialDecay(learning_rate,
+                                                       decay_steps,
+                                                       decay_rate,
+                                                       staircase=staircase,
+                                                       name=name)
   if not context.executing_eagerly():
-    decayed_lr = decayed_lr()
-
+    decayed_lr = decayed_lr(global_step)
+  else:
+    decayed_lr = functools.partial(decayed_lr, global_step)
   return decayed_lr
 
 
@@ -143,11 +146,12 @@ def piecewise_constant(x, boundaries, values, name=None):
   the learning rate value across different invocations of optimizer functions.
   @end_compatibility
   """
-  decayed_lr = learning_rate_decay_v2.piecewise_constant(x, boundaries, values,
-                                                         name=name)
+  decayed_lr = learning_rate_schedule.PiecewiseConstantDecay(
+      boundaries, values, name=name)
   if not context.executing_eagerly():
-    decayed_lr = decayed_lr()
-
+    decayed_lr = decayed_lr(x)
+  else:
+    decayed_lr = functools.partial(decayed_lr, x)
   return decayed_lr
 
 
@@ -236,9 +240,8 @@ def polynomial_decay(learning_rate,
   the learning rate value across different invocations of optimizer functions.
   @end_compatibility
   """
-  decayed_lr = learning_rate_decay_v2.polynomial_decay(
+  decayed_lr = learning_rate_schedule.PolynomialDecay(
       learning_rate,
-      global_step,
       decay_steps,
       end_learning_rate=end_learning_rate,
       power=power,
@@ -246,8 +249,9 @@ def polynomial_decay(learning_rate,
       name=name)
 
   if not context.executing_eagerly():
-    decayed_lr = decayed_lr()
-
+    decayed_lr = decayed_lr(global_step)
+  else:
+    decayed_lr = functools.partial(decayed_lr, global_step)
   return decayed_lr
 
 
@@ -323,13 +327,15 @@ def natural_exp_decay(learning_rate,
   the learning rate value across different invocations of optimizer functions.
   @end_compatibility
   """
-  decayed_lr = learning_rate_decay_v2.natural_exp_decay(
-      learning_rate, global_step, decay_steps, decay_rate, staircase=staircase,
+  natural_exp_rate = math_ops.exp(math_ops.negative(decay_rate))
+  decayed_lr = learning_rate_schedule.ExponentialDecay(
+      learning_rate, decay_steps, natural_exp_rate, staircase=staircase,
       name=name)
 
   if not context.executing_eagerly():
-    decayed_lr = decayed_lr()
-
+    decayed_lr = decayed_lr(global_step)
+  else:
+    decayed_lr = functools.partial(decayed_lr, global_step)
   return decayed_lr
 
 
@@ -405,17 +411,17 @@ def inverse_time_decay(learning_rate,
   the learning rate value across different invocations of optimizer functions.
   @end_compatibility
   """
-  decayed_lr = learning_rate_decay_v2.inverse_time_decay(
+  decayed_lr = learning_rate_schedule.InverseTimeDecay(
       learning_rate,
-      global_step,
       decay_steps,
       decay_rate,
       staircase=staircase,
       name=name)
 
   if not context.executing_eagerly():
-    decayed_lr = decayed_lr()
-
+    decayed_lr = decayed_lr(global_step)
+  else:
+    decayed_lr = functools.partial(decayed_lr, global_step)
   return decayed_lr
 
 
@@ -468,12 +474,13 @@ def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0, name=None):
   the learning rate value across different invocations of optimizer functions.
   @end_compatibility
   """
-  decayed_lr = learning_rate_decay_v2.cosine_decay(
-      learning_rate, global_step, decay_steps, alpha=alpha, name=name)
+  decayed_lr = learning_rate_schedule.CosineDecay(
+      learning_rate, decay_steps, alpha=alpha, name=name)
 
   if not context.executing_eagerly():
-    decayed_lr = decayed_lr()
-
+    decayed_lr = decayed_lr(global_step)
+  else:
+    decayed_lr = functools.partial(decayed_lr, global_step)
   return decayed_lr
 
 
@@ -535,9 +542,8 @@ def cosine_decay_restarts(learning_rate,
   the learning rate value across different invocations of optimizer functions.
   @end_compatibility
   """
-  decayed_lr = learning_rate_decay_v2.cosine_decay_restarts(
+  decayed_lr = learning_rate_schedule.CosineDecayRestarts(
       learning_rate,
-      global_step,
       first_decay_steps,
       t_mul=t_mul,
       m_mul=m_mul,
@@ -545,8 +551,9 @@ def cosine_decay_restarts(learning_rate,
       name=name)
 
   if not context.executing_eagerly():
-    decayed_lr = decayed_lr()
-
+    decayed_lr = decayed_lr(global_step)
+  else:
+    decayed_lr = functools.partial(decayed_lr, global_step)
   return decayed_lr
 
 
@@ -617,9 +624,8 @@ def linear_cosine_decay(learning_rate,
   the learning rate value across different invocations of optimizer functions.
   @end_compatibility
   """
-  decayed_lr = learning_rate_decay_v2.linear_cosine_decay(
+  decayed_lr = learning_rate_schedule.LinearCosineDecay(
       learning_rate,
-      global_step,
       decay_steps,
       num_periods=num_periods,
       alpha=alpha,
@@ -627,8 +633,9 @@ def linear_cosine_decay(learning_rate,
       name=name)
 
   if not context.executing_eagerly():
-    decayed_lr = decayed_lr()
-
+    decayed_lr = decayed_lr(global_step)
+  else:
+    decayed_lr = functools.partial(decayed_lr, global_step)
   return decayed_lr
 
 
@@ -707,8 +714,8 @@ def noisy_linear_cosine_decay(learning_rate,
   the learning rate value across different invocations of optimizer functions.
   @end_compatibility
   """
-  decayed_lr = learning_rate_decay_v2.noisy_linear_cosine_decay(
-      learning_rate, global_step,
+  decayed_lr = learning_rate_schedule.NoisyLinearCosineDecay(
+      learning_rate,
       decay_steps,
       initial_variance=initial_variance,
       variance_decay=variance_decay,
@@ -718,6 +725,7 @@ def noisy_linear_cosine_decay(learning_rate,
       name=name)
 
   if not context.executing_eagerly():
-    decayed_lr = decayed_lr()
-
+    decayed_lr = decayed_lr(global_step)
+  else:
+    decayed_lr = functools.partial(decayed_lr, global_step)
   return decayed_lr
diff --git a/tensorflow/python/training/learning_rate_decay_v2.py b/tensorflow/python/training/learning_rate_decay_v2.py
deleted file mode 100644
index eb69feb17d3983ddb494cdf63ae30edee7062915..0000000000000000000000000000000000000000
--- a/tensorflow/python/training/learning_rate_decay_v2.py
+++ /dev/null
@@ -1,898 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Various learning rate decay functions."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import functools
-import math
-
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import random_ops
-from tensorflow.python.util.tf_export import tf_export
-
-
-@tf_export("train.exponential_decay", v1=[])
-def exponential_decay(learning_rate,
-                      global_step,
-                      decay_steps,
-                      decay_rate,
-                      staircase=False,
-                      name=None):
-  """Applies exponential decay to the learning rate.
-
-  When training a model, it is often recommended to lower the learning rate as
-  the training progresses.  This function applies an exponential decay function
-  to a provided initial learning rate.  It requires a `global_step` value to
-  compute the decayed learning rate.  You can just pass a TensorFlow variable
-  that you increment at each training step.
-
-  The function returns a no-arg function that produces the decayed learning
-  rate. This can be useful for changing the learning rate value across
-  different invocations of optimizer functions.
-  It is computed as:
-
-  ```python
-  decayed_learning_rate = learning_rate *
-                          decay_rate ^ (global_step / decay_steps)
-  ```
-
-  If the argument `staircase` is `True`, then `global_step / decay_steps` is an
-  integer division and the decayed learning rate follows a staircase function.
-
-  Example: decay every 100000 steps with a base of 0.96:
-
-  ```python
-  ...
-  global_step = tf.Variable(0, trainable=False)
-  starter_learning_rate = 0.1
-  learning_rate_fn = tf.train.exponential_decay(starter_learning_rate,
-                                                global_step, 100000, 0.96,
-                                                staircase=True)
-  # Passing global_step to minimize() will increment it at each step.
-  learning_step = (
-      tf.train.GradientDescentOptimizer(learning_rate_fn)
-      .minimize(...my loss..., global_step=global_step)
-  )
-  ```
-
-  Args:
-    learning_rate: A scalar `float32` or `float64` `Tensor` or a
-      Python number.  The initial learning rate.
-    global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
-      Global step to use for the decay computation.  Must not be negative.
-    decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
-      Must be positive.  See the decay computation above.
-    decay_rate: A scalar `float32` or `float64` `Tensor` or a
-      Python number.  The decay rate.
-    staircase: Boolean.  If `True` decay the learning rate at discrete intervals
-    name: String.  Optional name of the operation.  Defaults to
-      'ExponentialDecay'.
-
-  Returns:
-    A no-arg function that outputs the decayed learning rate, a scalar `Tensor`
-    of the same type as `learning_rate`.
-
-  Raises:
-    ValueError: if `global_step` is not supplied.
-  """
-  if global_step is None:
-    raise ValueError("global_step is required for exponential_decay.")
-  def decayed_lr(learning_rate, global_step, decay_steps, decay_rate,
-                 staircase, name):
-    """Helper to recompute learning rate; most helpful in eager-mode."""
-    with ops.name_scope(
-        name, "ExponentialDecay",
-        [learning_rate, global_step, decay_steps, decay_rate]) as name:
-      learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
-      dtype = learning_rate.dtype
-      decay_steps = math_ops.cast(decay_steps, dtype)
-      decay_rate = math_ops.cast(decay_rate, dtype)
-
-      global_step_recomp = math_ops.cast(global_step, dtype)
-      p = global_step_recomp / decay_steps
-      if staircase:
-        p = math_ops.floor(p)
-      return math_ops.multiply(
-          learning_rate, math_ops.pow(decay_rate, p), name=name)
-
-  return functools.partial(decayed_lr, learning_rate, global_step, decay_steps,
-                           decay_rate, staircase, name)
-
-
-@tf_export("train.piecewise_constant_decay", v1=[])
-def piecewise_constant(x, boundaries, values, name=None):
-  """Piecewise constant from boundaries and interval values.
-
-  This function returns a no-arg callable to compute the piecewise constant.
-  This can be useful for changing the learning rate value across
-  different invocations of optimizer functions.
-
-  Example: use a learning rate that's 1.0 for the first 100001 steps, 0.5
-    for the next 10000 steps, and 0.1 for any additional steps.
-
-  ```python
-  global_step = tf.Variable(0, trainable=False)
-  boundaries = [100000, 110000]
-  values = [1.0, 0.5, 0.1]
-  learning_rate_fn = tf.train.piecewise_constant(global_step, boundaries,
-    values)
-  learning_rate = learning_rate_fn()
-
-  # Later, whenever we perform an optimization step, we increment global_step.
-  ```
-
-  Args:
-    x: A 0-D scalar `Tensor`. Must be one of the following types: `float32`,
-      `float64`, `uint8`, `int8`, `int16`, `int32`, `int64`.
-    boundaries: A list of `Tensor`s or `int`s or `float`s with strictly
-      increasing entries, and with all elements having the same type as `x`.
-    values: A list of `Tensor`s or `float`s or `int`s that specifies the values
-      for the intervals defined by `boundaries`. It should have one more element
-      than `boundaries`, and all elements should have the same type.
-    name: A string. Optional name of the operation. Defaults to
-      'PiecewiseConstant'.
-
-  Returns:
-    A no-arg function that outputs a 0-D Tensor. The output of the no-arg
-    function is `values[0]` when `x <= boundaries[0]`,
-    `values[1]` when `x > boundaries[0]` and `x <= boundaries[1]`, ...,
-    and values[-1] when `x > boundaries[-1]`.
-
-  Raises:
-    ValueError: if types of `x` and `boundaries` do not match, or types of all
-        `values` do not match or
-        the number of elements in the lists does not match.
-  """
-  if len(boundaries) != len(values) - 1:
-    raise ValueError(
-        "The length of boundaries should be 1 less than the length of values")
-  def decayed_lr(x, boundaries, values, name):
-    """Helper to recompute learning rate; most helpful in eager-mode."""
-    with ops.name_scope(name, "PiecewiseConstant",
-                        [x, boundaries, values, name]) as name:
-      boundaries = ops.convert_n_to_tensor(boundaries)
-      values = ops.convert_n_to_tensor(values)
-      x_recomp = ops.convert_to_tensor(x)
-      # Avoid explicit conversion to x's dtype. This could result in faulty
-      # comparisons, for example if floats are converted to integers.
-      for i, b in enumerate(boundaries):
-        if b.dtype.base_dtype != x_recomp.dtype.base_dtype:
-          # We can promote int32 boundaries to int64 without loss of precision.
-          # This covers the most common case where the user passes in boundaries
-          # as an array of Python integers.
-          if (b.dtype.base_dtype == dtypes.int32 and
-              x_recomp.dtype.base_dtype == dtypes.int64):
-            b = math_ops.cast(b, x_recomp.dtype.base_dtype)
-            boundaries[i] = b
-          else:
-            raise ValueError(
-                "Boundaries (%s) must have the same dtype as x (%s)." %
-                (b.dtype.base_dtype, x_recomp.dtype.base_dtype))
-      # TODO(rdipietro): Ensure that boundaries' elements strictly increases.
-      for v in values[1:]:
-        if v.dtype.base_dtype != values[0].dtype.base_dtype:
-          raise ValueError(
-              "Values must have elements all with the same dtype (%s vs %s)." %
-              (values[0].dtype.base_dtype, v.dtype.base_dtype))
-      pred_fn_pairs = []
-      pred_fn_pairs.append((x_recomp <= boundaries[0], lambda: values[0]))
-      pred_fn_pairs.append((x_recomp > boundaries[-1], lambda: values[-1]))
-      for low, high, v in zip(boundaries[:-1], boundaries[1:], values[1:-1]):
-        # Need to bind v here; can do this with lambda v=v: ...
-        pred = (x_recomp > low) & (x_recomp <= high)
-        pred_fn_pairs.append((pred, lambda v=v: v))
-
-      # The default isn't needed here because our conditions are mutually
-      # exclusive and exhaustive, but tf.case requires it.
-      default = lambda: values[0]
-      return control_flow_ops.case(pred_fn_pairs, default, exclusive=True)
-
-  return functools.partial(decayed_lr, x, boundaries, values, name)
-
-
-@tf_export("train.polynomial_decay", v1=[])
-def polynomial_decay(learning_rate,
-                     global_step,
-                     decay_steps,
-                     end_learning_rate=0.0001,
-                     power=1.0,
-                     cycle=False,
-                     name=None):
-  """Applies a polynomial decay to the learning rate.
-
-  It is commonly observed that a monotonically decreasing learning rate, whose
-  degree of change is carefully chosen, results in a better performing model.
-  This function applies a polynomial decay function to a provided initial
-  `learning_rate` to reach an `end_learning_rate` in the given `decay_steps`.
-
-  It requires a `global_step` value to compute the decayed learning rate.  You
-  can just pass a TensorFlow variable that you increment at each training step.
-
-  The function returns a no-arg callable that outputs the decayed learning
-  rate. This can be useful for changing the learning rate value across
-  different invocations of optimizer functions. It is computed as:
-
-  ```python
-  global_step = min(global_step, decay_steps)
-  decayed_learning_rate = (learning_rate - end_learning_rate) *
-                          (1 - global_step / decay_steps) ^ (power) +
-                          end_learning_rate
-
-  ```
-
-  If `cycle` is True then a multiple of `decay_steps` is used, the first one
-  that is bigger than `global_steps`.
-
-  ```python
-  decay_steps = decay_steps * ceil(global_step / decay_steps)
-  decayed_learning_rate_fn = (learning_rate - end_learning_rate) *
-                          (1 - global_step / decay_steps) ^ (power) +
-                          end_learning_rate
-  decayed_learning_rate = decayed_learning_rate_fn()
-
-  ```
-
-  Example: decay from 0.1 to 0.01 in 10000 steps using sqrt (i.e. power=0.5):
-
-  ```python
-  ...
-  global_step = tf.Variable(0, trainable=False)
-  starter_learning_rate = 0.1
-  end_learning_rate = 0.01
-  decay_steps = 10000
-  learning_rate_fn = tf.train.polynomial_decay(starter_learning_rate,
-                                               global_step, decay_steps,
-                                               end_learning_rate,
-                                               power=0.5)
-  # Passing global_step to minimize() will increment it at each step.
-  learning_step = (
-      tf.train.GradientDescentOptimizer(learning_rate_fn)
-      .minimize(...my loss..., global_step=global_step)
-  )
-  ```
-
-  Args:
-    learning_rate: A scalar `float32` or `float64` `Tensor` or a
-      Python number.  The initial learning rate.
-    global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
-      Global step to use for the decay computation.  Must not be negative.
-    decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
-      Must be positive.  See the decay computation above.
-    end_learning_rate: A scalar `float32` or `float64` `Tensor` or a
-      Python number.  The minimal end learning rate.
-    power: A scalar `float32` or `float64` `Tensor` or a
-      Python number.  The power of the polynomial. Defaults to linear, 1.0.
-    cycle: A boolean, whether or not it should cycle beyond decay_steps.
-    name: String.  Optional name of the operation. Defaults to
-      'PolynomialDecay'.
-
-  Returns:
-    A no-arg function that outputs the decayed learning rate, a scalar `Tensor`
-    of the same type as `learning_rate`.
-
-  Raises:
-    ValueError: if `global_step` is not supplied.
-  """
-  if global_step is None:
-    raise ValueError("global_step is required for polynomial_decay.")
-  def decayed_lr(learning_rate, global_step, decay_steps, end_learning_rate,
-                 power, cycle, name):
-    """Helper to recompute learning rate; most helpful in eager-mode."""
-    with ops.name_scope(
-        name, "PolynomialDecay",
-        [learning_rate, global_step, decay_steps, end_learning_rate, power]
-    ) as name:
-      learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
-      dtype = learning_rate.dtype
-      end_learning_rate = math_ops.cast(end_learning_rate, dtype)
-      power = math_ops.cast(power, dtype)
-
-      global_step_recomp = math_ops.cast(global_step, dtype)
-      decay_steps_recomp = math_ops.cast(decay_steps, dtype)
-      if cycle:
-        # Find the first multiple of decay_steps that is bigger than
-        # global_step. If global_step is zero set the multiplier to 1
-        multiplier = control_flow_ops.cond(
-            math_ops.equal(global_step_recomp, 0), lambda: 1.0,
-            lambda: math_ops.ceil(global_step_recomp / decay_steps))
-        decay_steps_recomp = math_ops.multiply(decay_steps_recomp, multiplier)
-      else:
-        # Make sure that the global_step used is not bigger than decay_steps.
-        global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)
-
-      p = math_ops.div(global_step_recomp, decay_steps_recomp)
-      return math_ops.add(
-          math_ops.multiply(learning_rate - end_learning_rate,
-                            math_ops.pow(1 - p, power)),
-          end_learning_rate,
-          name=name)
-
-  return functools.partial(
-      decayed_lr, learning_rate, global_step, decay_steps, end_learning_rate,
-      power, cycle, name)
-
-
-@tf_export("train.natural_exp_decay", v1=[])
-def natural_exp_decay(learning_rate,
-                      global_step,
-                      decay_steps,
-                      decay_rate,
-                      staircase=False,
-                      name=None):
-  """Applies natural exponential decay to the initial learning rate.
-
-  When training a model, it is often recommended to lower the learning rate as
-  the training progresses.  This function applies an exponential decay function
-  to a provided initial learning rate.  It requires an `global_step` value to
-  compute the decayed learning rate.  You can just pass a TensorFlow variable
-  that you increment at each training step.
-
-  The function returns a no-arg callable that produces the decayed learning
-  rate. This can be useful for changing the learning rate value across
-  different invocations of optimizer functions. It is computed as:
-
-  ```python
-  decayed_learning_rate = learning_rate * exp(-decay_rate * global_step /
-  decay_step)
-  ```
-
-  or, if `staircase` is `True`, as:
-
-  ```python
-  decayed_learning_rate = learning_rate * exp(-decay_rate * floor(global_step /
-  decay_step))
-  ```
-
-  Example: decay exponentially with a base of 0.96:
-
-  ```python
-  ...
-  global_step = tf.Variable(0, trainable=False)
-  learning_rate = 0.1
-  decay_steps = 5
-  k = 0.5
-  learning_rate_fn = tf.train.natural_exp_decay(learning_rate, global_step,
-                                                decay_steps, k)
-
-  # Passing global_step to minimize() will increment it at each step.
-  learning_step = (
-      tf.train.GradientDescentOptimizer(learning_rate_fn)
-      .minimize(...my loss..., global_step=global_step)
-  )
-  ```
-
-  Args:
-    learning_rate: A scalar `float32` or `float64` `Tensor` or a
-      Python number.  The initial learning rate.
-    global_step: A Python number.
-      Global step to use for the decay computation.  Must not be negative.
-    decay_steps: How often to apply decay.
-    decay_rate: A Python number.  The decay rate.
-    staircase: Whether to apply decay in a discrete staircase, as opposed to
-      continuous, fashion.
-    name: String.  Optional name of the operation.  Defaults to
-      'ExponentialTimeDecay'.
-
-  Returns:
-    A no-arg function that outputs the decayed learning rate, a scalar `Tensor`
-    of the same type as `learning_rate`.
-
-  Raises:
-    ValueError: if `global_step` is not supplied.
-  """
-  if global_step is None:
-    raise ValueError("global_step is required for natural_exp_decay.")
-  def decayed_lr(learning_rate, global_step, decay_steps, decay_rate, staircase,
-                 name):
-    """Helper to recompute learning rate; most helpful in eager-mode."""
-    with ops.name_scope(name, "NaturalExpDecay",
-                        [learning_rate, global_step, decay_rate]) as name:
-      learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
-      dtype = learning_rate.dtype
-      decay_steps = math_ops.cast(decay_steps, dtype)
-      decay_rate = math_ops.cast(decay_rate, dtype)
-
-      global_step_recomp = math_ops.cast(global_step, dtype)
-      p = global_step_recomp / decay_steps
-      if staircase:
-        p = math_ops.floor(p)
-      exponent = math_ops.exp(
-          math_ops.multiply(math_ops.negative(decay_rate), p))
-      return math_ops.multiply(learning_rate, exponent, name=name)
-
-  return functools.partial(decayed_lr, learning_rate, global_step, decay_steps,
-                           decay_rate, staircase, name)
-
-
-@tf_export("train.inverse_time_decay", v1=[])
-def inverse_time_decay(learning_rate,
-                       global_step,
-                       decay_steps,
-                       decay_rate,
-                       staircase=False,
-                       name=None):
-  """Applies inverse time decay to the initial learning rate.
-
-  When training a model, it is often recommended to lower the learning rate as
-  the training progresses.  This function applies an inverse decay function
-  to a provided initial learning rate.  It requires an `global_step` value to
-  compute the decayed learning rate.  You can just pass a TensorFlow variable
-  that you increment at each training step.
-
-  The function returns a no-arg callable that produces the decayed learning
-  rate. This can be useful for changing the learning rate value across
-  different invocations of optimizer functions. It is computed as:
-
-  ```python
-  decayed_learning_rate = learning_rate / (1 + decay_rate * global_step /
-  decay_step)
-  ```
-
-  or, if `staircase` is `True`, as:
-
-  ```python
-  decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step /
-  decay_step))
-  ```
-
-  Example: decay 1/t with a rate of 0.5:
-
-  ```python
-  ...
-  global_step = tf.Variable(0, trainable=False)
-  learning_rate = 0.1
-  decay_steps = 1.0
-  decay_rate = 0.5
-  learning_rate_fn = tf.train.inverse_time_decay(learning_rate, global_step,
-  decay_steps, decay_rate)
-
-  # Passing global_step to minimize() will increment it at each step.
-  learning_step = (
-      tf.train.GradientDescentOptimizer(learning_rate_fn)
-      .minimize(...my loss..., global_step=global_step)
-  )
-  ```
-
-  Args:
-    learning_rate: A scalar `float32` or `float64` `Tensor` or a
-      Python number.  The initial learning rate.
-    global_step: A Python number.
-      Global step to use for the decay computation.  Must not be negative.
-    decay_steps: How often to apply decay.
-    decay_rate: A Python number.  The decay rate.
-    staircase: Whether to apply decay in a discrete staircase, as opposed to
-      continuous, fashion.
-    name: String.  Optional name of the operation.  Defaults to
-      'InverseTimeDecay'.
-
-  Returns:
-    A no-arg function that outputs the decayed learning rate, a scalar `Tensor`
-    of the same type as `learning_rate`.
-
-  Raises:
-    ValueError: if `global_step` is not supplied.
-  """
-  if global_step is None:
-    raise ValueError("global_step is required for inverse_time_decay.")
-  def decayed_lr(learning_rate, global_step, decay_steps, decay_rate, staircase,
-                 name):
-    """Helper to recompute learning rate; most helpful in eager-mode."""
-    with ops.name_scope(name, "InverseTimeDecay",
-                        [learning_rate, global_step, decay_rate]) as name:
-      learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
-      dtype = learning_rate.dtype
-      decay_steps = math_ops.cast(decay_steps, dtype)
-      decay_rate = math_ops.cast(decay_rate, dtype)
-
-      global_step_recomp = math_ops.cast(global_step, dtype)
-      p = global_step_recomp / decay_steps
-      if staircase:
-        p = math_ops.floor(p)
-      const = math_ops.cast(constant_op.constant(1), dtype)
-      denom = math_ops.add(const, math_ops.multiply(decay_rate, p))
-      return math_ops.div(learning_rate, denom, name=name)
-
-  return functools.partial(decayed_lr, learning_rate, global_step, decay_steps,
-                           decay_rate, staircase, name)
-
-
-@tf_export("train.cosine_decay", v1=[])
-def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0,
-                 name=None):
-  """Applies cosine decay to the learning rate.
-
-  See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent
-  with Warm Restarts. https://arxiv.org/abs/1608.03983
-
-  When training a model, it is often recommended to lower the learning rate as
-  the training progresses.  This function applies a cosine decay function
-  to a provided initial learning rate.  It requires a `global_step` value to
-  compute the decayed learning rate.  You can just pass a TensorFlow variable
-  that you increment at each training step.
-
-  The function returns a no-arg callable that produces the decayed learning
-  rate. This can be useful for changing the learning rate value across
-  different invocations of optimizer functions. It is computed as:
-
-  ```python
-  global_step = min(global_step, decay_steps)
-  cosine_decay = 0.5 * (1 + cos(pi * global_step / decay_steps))
-  decayed = (1 - alpha) * cosine_decay + alpha
-  decayed_learning_rate = learning_rate * decayed
-  ```
-
-  Example usage:
-  ```python
-  decay_steps = 1000
-  lr_decayed_fn = tf.train.cosine_decay(learning_rate, global_step, decay_steps)
-  ```
-
-  Args:
-    learning_rate: A scalar `float32` or `float64` Tensor or a Python number.
-      The initial learning rate.
-    global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
-      Global step to use for the decay computation.
-    decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
-      Number of steps to decay over.
-    alpha: A scalar `float32` or `float64` Tensor or a Python number.
-      Minimum learning rate value as a fraction of learning_rate.
-    name: String. Optional name of the operation.  Defaults to 'CosineDecay'.
-  Returns:
-    A no-arg function that outputs the decayed learning rate, a scalar `Tensor`
-    of the same type as `learning_rate`.
-  Raises:
-    ValueError: if `global_step` is not supplied.
-  """
-  if global_step is None:
-    raise ValueError("cosine decay requires global_step")
-  def decayed_lr(learning_rate, global_step, decay_steps, alpha, name):
-    """Helper to recompute learning rate; most helpful in eager-mode."""
-    with ops.name_scope(name, "CosineDecay",
-                        [learning_rate, global_step]) as name:
-      learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
-      dtype = learning_rate.dtype
-      decay_steps = math_ops.cast(decay_steps, dtype)
-
-      global_step_recomp = math_ops.cast(global_step, dtype)
-      global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)
-      completed_fraction = global_step_recomp / decay_steps
-      cosine_decayed = 0.5 * (1.0 + math_ops.cos(
-          constant_op.constant(math.pi) * completed_fraction))
-
-      decayed = (1 - alpha) * cosine_decayed + alpha
-      return math_ops.multiply(learning_rate, decayed)
-
-  return functools.partial(decayed_lr, learning_rate, global_step, decay_steps,
-                           alpha, name)
-
-
-@tf_export("train.cosine_decay_restarts", v1=[])
-def cosine_decay_restarts(learning_rate,
-                          global_step,
-                          first_decay_steps,
-                          t_mul=2.0,
-                          m_mul=1.0,
-                          alpha=0.0,
-                          name=None):
-  """Applies cosine decay with restarts to the learning rate.
-
-  See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent
-  with Warm Restarts. https://arxiv.org/abs/1608.03983
-
-  When training a model, it is often recommended to lower the learning rate as
-  the training progresses.  This function applies a cosine decay function with
-  restarts to a provided initial learning rate.  It requires a `global_step`
-  value to compute the decayed learning rate.  You can just pass a TensorFlow
-  variable that you increment at each training step.
-
-  The function returns a no-arg callable that produces the decayed learning
-  rate while taking into account possible warm restarts. This can be useful for
-  changing the learning rate value across different invocations of optimizer
-  functions.
-
-  The learning rate multiplier first decays
-  from 1 to `alpha` for `first_decay_steps` steps. Then, a warm
-  restart is performed. Each new warm restart runs for `t_mul` times more steps
-  and with `m_mul` times smaller initial learning rate.
-
-  Example usage:
-  ```python
-  first_decay_steps = 1000
-  lr_decayed_fn = tf.train.cosine_decay_restarts(learning_rate, global_step,
-                                     first_decay_steps)
-  ```
-
-  Args:
-    learning_rate: A scalar `float32` or `float64` Tensor or a Python number.
-      The initial learning rate.
-    global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
-      Global step to use for the decay computation.
-    first_decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
-      Number of steps to decay over.
-    t_mul: A scalar `float32` or `float64` `Tensor` or a Python number.
-      Used to derive the number of iterations in the i-th period
-    m_mul: A scalar `float32` or `float64` `Tensor` or a Python number.
-      Used to derive the initial learning rate of the i-th period:
-    alpha: A scalar `float32` or `float64` Tensor or a Python number.
-      Minimum learning rate value as a fraction of the learning_rate.
-    name: String. Optional name of the operation.  Defaults to 'SGDRDecay'.
-  Returns:
-    A no-arg function that outputs the decayed learning rate, a scalar `Tensor`
-    of the same type as `learning_rate`.
-
-  Raises:
-    ValueError: if `global_step` is not supplied.
-  """
-  if global_step is None:
-    raise ValueError("cosine decay restarts requires global_step")
-  def decayed_lr(learning_rate, global_step, first_decay_steps, t_mul, m_mul,
-                 alpha, name):
-    """Helper to recompute learning rate; most helpful in eager-mode."""
-    with ops.name_scope(name, "SGDRDecay", [learning_rate, global_step]
-                       ) as name:
-      learning_rate = ops.convert_to_tensor(
-          learning_rate, name="initial_learning_rate")
-      dtype = learning_rate.dtype
-      first_decay_steps = math_ops.cast(first_decay_steps, dtype)
-      alpha = math_ops.cast(alpha, dtype)
-      t_mul = math_ops.cast(t_mul, dtype)
-      m_mul = math_ops.cast(m_mul, dtype)
-
-      global_step_recomp = math_ops.cast(global_step, dtype)
-      completed_fraction = global_step_recomp / first_decay_steps
-
-      def compute_step(completed_fraction, geometric=False):
-        """Helper for `cond` operation."""
-        if geometric:
-          i_restart = math_ops.floor(
-              math_ops.log(1.0 - completed_fraction * (1.0 - t_mul)) /
-              math_ops.log(t_mul))
-
-          sum_r = (1.0 - t_mul**i_restart) / (1.0 - t_mul)
-          completed_fraction = (completed_fraction - sum_r) / t_mul**i_restart
-
-        else:
-          i_restart = math_ops.floor(completed_fraction)
-          completed_fraction -= i_restart
-
-        return i_restart, completed_fraction
-
-      i_restart, completed_fraction = control_flow_ops.cond(
-          math_ops.equal(t_mul, 1.0),
-          lambda: compute_step(completed_fraction, geometric=False),
-          lambda: compute_step(completed_fraction, geometric=True))
-
-      m_fac = m_mul**i_restart
-      cosine_decayed = 0.5 * m_fac * (1.0 + math_ops.cos(
-          constant_op.constant(math.pi) * completed_fraction))
-      decayed = (1 - alpha) * cosine_decayed + alpha
-
-      return math_ops.multiply(learning_rate, decayed, name=name)
-
-  return functools.partial(decayed_lr, learning_rate, global_step,
-                           first_decay_steps, t_mul, m_mul, alpha, name)
-
-
-@tf_export("train.linear_cosine_decay", v1=[])
-def linear_cosine_decay(learning_rate,
-                        global_step,
-                        decay_steps,
-                        num_periods=0.5,
-                        alpha=0.0,
-                        beta=0.001,
-                        name=None):
-  """Applies linear cosine decay to the learning rate.
-
-  See [Bello et al., ICML2017] Neural Optimizer Search with RL.
-  https://arxiv.org/abs/1709.07417
-
-  For the idea of warm starts here controlled by `num_periods`,
-  see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent
-  with Warm Restarts. https://arxiv.org/abs/1608.03983
-
-  Note that linear cosine decay is more aggressive than cosine decay and
-  larger initial learning rates can typically be used.
-
-  When training a model, it is often recommended to lower the learning rate as
-  the training progresses.  This function applies a linear cosine decay function
-  to a provided initial learning rate.  It requires a `global_step` value to
-  compute the decayed learning rate.  You can just pass a TensorFlow variable
-  that you increment at each training step.
-
-  The function returns a no-arg callable that produces the decayed learning
-  rate. This can be useful for changing the learning rate value across
-  different invocations of optimizer functions. It is computed as:
-
-  ```python
-  global_step = min(global_step, decay_steps)
-  linear_decay = (decay_steps - global_step) / decay_steps)
-  cosine_decay = 0.5 * (
-      1 + cos(pi * 2 * num_periods * global_step / decay_steps))
-  decayed = (alpha + linear_decay) * cosine_decay + beta
-  decayed_learning_rate = learning_rate * decayed
-  ```
-
-  Example usage:
-  ```python
-  decay_steps = 1000
-  lr_decayed_fn = tf.train.linear_cosine_decay(learning_rate, global_step,
-                                               decay_steps)
-  ```
-
-  Args:
-    learning_rate: A scalar `float32` or `float64` Tensor or a Python number.
-      The initial learning rate.
-    global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
-      Global step to use for the decay computation.
-    decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
-      Number of steps to decay over.
-    num_periods: Number of periods in the cosine part of the decay.
-      See computation above.
-    alpha: See computation above.
-    beta: See computation above.
-    name: String.  Optional name of the operation.  Defaults to
-      'LinearCosineDecay'.
-  Returns:
-    A no-arg function that outputs the decayed learning rate, a scalar `Tensor`
-    of the same type as `learning_rate`.
-  Raises:
-    ValueError: if `global_step` is not supplied.
-  """
-  if global_step is None:
-    raise ValueError("linear cosine decay requires global_step")
-  def decayed_lr(learning_rate, global_step, decay_steps, num_periods, alpha,
-                 beta, name):
-    """Helper to recompute learning rate; most helpful in eager-mode."""
-    with ops.name_scope(name, "LinearCosineDecay",
-                        [learning_rate, global_step]) as name:
-      learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
-      dtype = learning_rate.dtype
-      decay_steps = math_ops.cast(decay_steps, dtype)
-      num_periods = math_ops.cast(num_periods, dtype)
-      alpha = math_ops.cast(alpha, dtype)
-      beta = math_ops.cast(beta, dtype)
-
-      global_step_recomp = math_ops.cast(global_step, dtype)
-      global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)
-      linear_decayed = (decay_steps - global_step_recomp) / decay_steps
-      completed_fraction = global_step_recomp / decay_steps
-      fraction = 2.0 * num_periods * completed_fraction
-      cosine_decayed = 0.5 * (
-          1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))
-
-      linear_cosine_decayed = (alpha + linear_decayed) * cosine_decayed + beta
-      return math_ops.multiply(learning_rate, linear_cosine_decayed, name=name)
-
-  return functools.partial(decayed_lr, learning_rate, global_step, decay_steps,
-                           num_periods, alpha, beta, name)
-
-
-@tf_export("train.noisy_linear_cosine_decay", v1=[])
-def noisy_linear_cosine_decay(learning_rate,
-                              global_step,
-                              decay_steps,
-                              initial_variance=1.0,
-                              variance_decay=0.55,
-                              num_periods=0.5,
-                              alpha=0.0,
-                              beta=0.001,
-                              name=None):
-  """Applies noisy linear cosine decay to the learning rate.
-
-  See [Bello et al., ICML2017] Neural Optimizer Search with RL.
-  https://arxiv.org/abs/1709.07417
-
-  For the idea of warm starts here controlled by `num_periods`,
-  see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent
-  with Warm Restarts. https://arxiv.org/abs/1608.03983
-
-  Note that linear cosine decay is more aggressive than cosine decay and
-  larger initial learning rates can typically be used.
-
-  When training a model, it is often recommended to lower the learning rate as
-  the training progresses.  This function applies a noisy linear
-  cosine decay function to a provided initial learning rate.
-  It requires a `global_step` value to compute the decayed learning rate.
-  You can just pass a TensorFlow variable that you increment at each
-  training step.
-
-  The function returns a no-arg callable that produces the decayed learning
-  rate. This can be useful for changing the learning rate value across
-  different invocations of optimizer functions. It is computed as:
-
-  ```python
-  global_step = min(global_step, decay_steps)
-  linear_decay = (decay_steps - global_step) / decay_steps)
-  cosine_decay = 0.5 * (
-      1 + cos(pi * 2 * num_periods * global_step / decay_steps))
-  decayed = (alpha + linear_decay + eps_t) * cosine_decay + beta
-  decayed_learning_rate = learning_rate * decayed
-  ```
-  where eps_t is 0-centered gaussian noise with variance
-  initial_variance / (1 + global_step) ** variance_decay
-
-  Example usage:
-  ```python
-  decay_steps = 1000
-  lr_decayed_fn = tf.train.noisy_linear_cosine_decay(learning_rate, global_step,
-                                                     decay_steps)
-  ```
-
-  Args:
-    learning_rate: A scalar `float32` or `float64` Tensor or a Python number.
-      The initial learning rate.
-    global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
-      Global step to use for the decay computation.
-    decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
-      Number of steps to decay over.
-    initial_variance: initial variance for the noise. See computation above.
-    variance_decay: decay for the noise's variance. See computation above.
-    num_periods: Number of periods in the cosine part of the decay.
-      See computation above.
-    alpha: See computation above.
-    beta: See computation above.
-    name: String.  Optional name of the operation.  Defaults to
-      'NoisyLinearCosineDecay'.
-  Returns:
-    A no-arg function that outputs the decayed learning rate, a scalar `Tensor`
-    of the same type as `learning_rate`.
-  Raises:
-    ValueError: if `global_step` is not supplied.
-  """
-  if global_step is None:
-    raise ValueError("noisy linear cosine decay requires global_step")
-  def decayed_lr(learning_rate, global_step, decay_steps, initial_variance,
-                 variance_decay, num_periods, alpha, beta, name):
-    """Helper to recompute learning rate; most helpful in eager-mode."""
-    with ops.name_scope(name, "NoisyLinearCosineDecay",
-                        [learning_rate, global_step]) as name:
-      learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
-      dtype = learning_rate.dtype
-      decay_steps = math_ops.cast(decay_steps, dtype)
-      initial_variance = math_ops.cast(initial_variance, dtype)
-      variance_decay = math_ops.cast(variance_decay, dtype)
-      num_periods = math_ops.cast(num_periods, dtype)
-      alpha = math_ops.cast(alpha, dtype)
-      beta = math_ops.cast(beta, dtype)
-
-      global_step_recomp = math_ops.cast(global_step, dtype)
-      global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)
-      linear_decayed = (decay_steps - global_step_recomp) / decay_steps
-      variance = initial_variance / (
-          math_ops.pow(1.0 + global_step_recomp, variance_decay))
-      std = math_ops.sqrt(variance)
-      noisy_linear_decayed = (
-          linear_decayed + random_ops.random_normal(
-              linear_decayed.shape, stddev=std))
-
-      completed_fraction = global_step_recomp / decay_steps
-      fraction = 2.0 * num_periods * completed_fraction
-      cosine_decayed = 0.5 * (
-          1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))
-      noisy_linear_cosine_decayed = (
-          (alpha + noisy_linear_decayed) * cosine_decayed + beta)
-
-      return math_ops.multiply(
-          learning_rate, noisy_linear_cosine_decayed, name=name)
-
-  return functools.partial(decayed_lr, learning_rate, global_step, decay_steps,
-                           initial_variance, variance_decay, num_periods, alpha,
-                           beta, name)
diff --git a/tensorflow/python/training/learning_rate_decay_v2_test.py b/tensorflow/python/training/learning_rate_decay_v2_test.py
deleted file mode 100644
index cb96773e299a37db1d5792c84d6a837147e09d04..0000000000000000000000000000000000000000
--- a/tensorflow/python/training/learning_rate_decay_v2_test.py
+++ /dev/null
@@ -1,497 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""Functional test for learning rate decay."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import math
-
-from tensorflow.python.eager import context
-from tensorflow.python.framework import test_util
-# Import resource_variable_ops for the variables-to-tensor implicit conversion.
-from tensorflow.python.ops import resource_variable_ops  # pylint: disable=unused-import
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import googletest
-from tensorflow.python.training import learning_rate_decay_v2
-
-
-class LRDecayTestV2(test_util.TensorFlowTestCase):
-
-  @test_util.run_in_graph_and_eager_modes
-  def testContinuous(self):
-    self.evaluate(variables.global_variables_initializer())
-    step = 5
-    decayed_lr = learning_rate_decay_v2.exponential_decay(0.05, step, 10, 0.96)
-    expected = .05 * 0.96**(5.0 / 10.0)
-    self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testStaircase(self):
-    if context.executing_eagerly():
-      step = resource_variable_ops.ResourceVariable(0)
-      self.evaluate(variables.global_variables_initializer())
-      decayed_lr = learning_rate_decay_v2.exponential_decay(
-          .1, step, 3, 0.96, staircase=True)
-
-      # No change to learning rate due to staircase
-      expected = .1
-      self.evaluate(step.assign(1))
-      self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-      expected = .1
-      self.evaluate(step.assign(2))
-      self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-      # Decayed learning rate
-      expected = .1 * 0.96 ** (100 // 3)
-      self.evaluate(step.assign(100))
-      self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-  @test_util.run_deprecated_v1
-  def testVariables(self):
-    step = variables.Variable(1)
-    assign_1 = step.assign(1)
-    assign_2 = step.assign(2)
-    assign_100 = step.assign(100)
-    decayed_lr = learning_rate_decay_v2.exponential_decay(
-        .1, step, 3, 0.96, staircase=True)
-    self.evaluate(variables.global_variables_initializer())
-    # No change to learning rate
-    self.evaluate(assign_1.op)
-    self.assertAllClose(self.evaluate(decayed_lr()), .1, 1e-6)
-    self.evaluate(assign_2.op)
-    self.assertAllClose(self.evaluate(decayed_lr()), .1, 1e-6)
-    # Decayed learning rate
-    self.evaluate(assign_100.op)
-    expected = .1 * 0.96**(100 // 3)
-    self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testPiecewiseConstant(self):
-    x = resource_variable_ops.ResourceVariable(-999)
-    decayed_lr = learning_rate_decay_v2.piecewise_constant(
-        x, [100, 110, 120], [1.0, 0.1, 0.01, 0.001])
-
-    self.evaluate(variables.global_variables_initializer())
-
-    self.assertAllClose(self.evaluate(decayed_lr()), 1.0, 1e-6)
-    self.evaluate(x.assign(100))
-    self.assertAllClose(self.evaluate(decayed_lr()), 1.0, 1e-6)
-    self.evaluate(x.assign(105))
-    self.assertAllClose(self.evaluate(decayed_lr()), 0.1, 1e-6)
-    self.evaluate(x.assign(110))
-    self.assertAllClose(self.evaluate(decayed_lr()), 0.1, 1e-6)
-    self.evaluate(x.assign(120))
-    self.assertAllClose(self.evaluate(decayed_lr()), 0.01, 1e-6)
-    self.evaluate(x.assign(999))
-    self.assertAllClose(self.evaluate(decayed_lr()), 0.001, 1e-6)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testPiecewiseConstantEdgeCases(self):
-    x_int = resource_variable_ops.ResourceVariable(
-        0, dtype=variables.dtypes.int32)
-    boundaries, values = [-1.0, 1.0], [1, 2, 3]
-    with self.assertRaises(ValueError):
-      decayed_lr = learning_rate_decay_v2.piecewise_constant(
-          x_int, boundaries, values)
-      decayed_lr()
-
-    x = resource_variable_ops.ResourceVariable(0.0)
-    boundaries, values = [-1.0, 1.0], [1.0, 2, 3]
-    with self.assertRaises(ValueError):
-      decayed_lr = learning_rate_decay_v2.piecewise_constant(
-          x, boundaries, values)()
-      decayed_lr()
-
-    # Test that ref types are valid.
-    if not context.executing_eagerly():
-      x = variables.Variable(0.0)
-      x_ref = x.op.outputs[0]   # float32_ref tensor should be accepted
-      boundaries, values = [1.0, 2.0], [1, 2, 3]
-      learning_rate_decay_v2.piecewise_constant(x_ref, boundaries, values)
-
-    # Test casting boundaries from int32 to int64.
-    x_int64 = resource_variable_ops.ResourceVariable(
-        0, dtype=variables.dtypes.int64)
-    boundaries, values = [1, 2, 3], [0.4, 0.5, 0.6, 0.7]
-    decayed_lr = learning_rate_decay_v2.piecewise_constant(
-        x_int64, boundaries, values)
-
-    self.evaluate(variables.global_variables_initializer())
-    self.assertAllClose(self.evaluate(decayed_lr()), 0.4, 1e-6)
-    self.evaluate(x_int64.assign(1))
-    self.assertAllClose(self.evaluate(decayed_lr()), 0.4, 1e-6)
-    self.evaluate(x_int64.assign(2))
-    self.assertAllClose(self.evaluate(decayed_lr()), 0.5, 1e-6)
-    self.evaluate(x_int64.assign(3))
-    self.assertAllClose(self.evaluate(decayed_lr()), 0.6, 1e-6)
-    self.evaluate(x_int64.assign(4))
-    self.assertAllClose(self.evaluate(decayed_lr()), 0.7, 1e-6)
-
-
-class LinearDecayTestV2(test_util.TensorFlowTestCase):
-
-  @test_util.run_in_graph_and_eager_modes
-  def testHalfWay(self):
-    step = 5
-    lr = 0.05
-    end_lr = 0.0
-    decayed_lr = learning_rate_decay_v2.polynomial_decay(lr, step, 10, end_lr)
-    expected = lr * 0.5
-    self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testEnd(self):
-    step = 10
-    lr = 0.05
-    end_lr = 0.001
-    decayed_lr = learning_rate_decay_v2.polynomial_decay(lr, step, 10, end_lr)
-    expected = end_lr
-    self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testHalfWayWithEnd(self):
-    step = 5
-    lr = 0.05
-    end_lr = 0.001
-    decayed_lr = learning_rate_decay_v2.polynomial_decay(lr, step, 10, end_lr)
-    expected = (lr + end_lr) * 0.5
-    self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testBeyondEnd(self):
-    step = 15
-    lr = 0.05
-    end_lr = 0.001
-    decayed_lr = learning_rate_decay_v2.polynomial_decay(lr, step, 10, end_lr)
-    expected = end_lr
-    self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testBeyondEndWithCycle(self):
-    step = 15
-    lr = 0.05
-    end_lr = 0.001
-    decayed_lr = learning_rate_decay_v2.polynomial_decay(
-        lr, step, 10, end_lr, cycle=True)
-    expected = (lr - end_lr) * 0.25 + end_lr
-    self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-
-class SqrtDecayTestV2(test_util.TensorFlowTestCase):
-
-  @test_util.run_in_graph_and_eager_modes
-  def testHalfWay(self):
-    step = 5
-    lr = 0.05
-    end_lr = 0.0
-    power = 0.5
-    decayed_lr = learning_rate_decay_v2.polynomial_decay(
-        lr, step, 10, end_lr, power=power)
-    expected = lr * 0.5**power
-    self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testEnd(self):
-    step = 10
-    lr = 0.05
-    end_lr = 0.001
-    power = 0.5
-    decayed_lr = learning_rate_decay_v2.polynomial_decay(
-        lr, step, 10, end_lr, power=power)
-    expected = end_lr
-    self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testHalfWayWithEnd(self):
-    step = 5
-    lr = 0.05
-    end_lr = 0.001
-    power = 0.5
-    decayed_lr = learning_rate_decay_v2.polynomial_decay(
-        lr, step, 10, end_lr, power=power)
-    expected = (lr - end_lr) * 0.5**power + end_lr
-    self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testBeyondEnd(self):
-    step = 15
-    lr = 0.05
-    end_lr = 0.001
-    power = 0.5
-    decayed_lr = learning_rate_decay_v2.polynomial_decay(
-        lr, step, 10, end_lr, power=power)
-    expected = end_lr
-    self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testBeyondEndWithCycle(self):
-    step = 15
-    lr = 0.05
-    end_lr = 0.001
-    power = 0.5
-    decayed_lr = learning_rate_decay_v2.polynomial_decay(
-        lr, step, 10, end_lr, power=power, cycle=True)
-    expected = (lr - end_lr) * 0.25**power + end_lr
-    self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-
-class PolynomialDecayTestV2(test_util.TensorFlowTestCase):
-
-  @test_util.run_in_graph_and_eager_modes
-  def testBeginWithCycle(self):
-    lr = 0.001
-    decay_steps = 10
-    step = 0
-    decayed_lr = learning_rate_decay_v2.polynomial_decay(
-        lr, step, decay_steps, cycle=True)
-    expected = lr
-    self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-
-class ExponentialDecayTestV2(test_util.TensorFlowTestCase):
-
-  @test_util.run_in_graph_and_eager_modes
-  def testDecay(self):
-    initial_lr = 0.1
-    k = 10
-    decay_rate = 0.96
-    step = resource_variable_ops.ResourceVariable(0)
-    decayed_lr = learning_rate_decay_v2.natural_exp_decay(initial_lr, step, k,
-                                                          decay_rate)
-
-    self.evaluate(variables.global_variables_initializer())
-    for i in range(k + 1):
-      expected = initial_lr * math.exp(-i / k * decay_rate)
-      self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-      self.evaluate(step.assign_add(1))
-
-  @test_util.run_in_graph_and_eager_modes
-  def testStaircase(self):
-    initial_lr = 0.1
-    k = 10
-    decay_rate = 0.96
-    step = resource_variable_ops.ResourceVariable(0)
-    decayed_lr = learning_rate_decay_v2.natural_exp_decay(
-        initial_lr, step, k, decay_rate, staircase=True)
-
-    self.evaluate(variables.global_variables_initializer())
-    for i in range(k + 1):
-      expected = initial_lr * math.exp(-decay_rate * (i // k))
-      self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-      self.evaluate(step.assign_add(1))
-
-
-class InverseDecayTestV2(test_util.TensorFlowTestCase):
-
-  @test_util.run_in_graph_and_eager_modes
-  def testDecay(self):
-    initial_lr = 0.1
-    k = 10
-    decay_rate = 0.96
-    step = resource_variable_ops.ResourceVariable(0)
-    decayed_lr = learning_rate_decay_v2.inverse_time_decay(initial_lr, step, k,
-                                                           decay_rate)
-
-    self.evaluate(variables.global_variables_initializer())
-    for i in range(k + 1):
-      expected = initial_lr / (1 + i / k * decay_rate)
-      self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-      self.evaluate(step.assign_add(1))
-
-  @test_util.run_in_graph_and_eager_modes
-  def testStaircase(self):
-    initial_lr = 0.1
-    k = 10
-    decay_rate = 0.96
-    step = resource_variable_ops.ResourceVariable(0)
-    decayed_lr = learning_rate_decay_v2.inverse_time_decay(
-        initial_lr, step, k, decay_rate, staircase=True)
-
-    self.evaluate(variables.global_variables_initializer())
-    for i in range(k + 1):
-      expected = initial_lr / (1 + decay_rate * (i // k))
-      self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-      self.evaluate(step.assign_add(1))
-
-
-class CosineDecayTestV2(test_util.TensorFlowTestCase):
-
-  def np_cosine_decay(self, step, decay_steps, alpha=0.0):
-    step = min(step, decay_steps)
-    completed_fraction = step / decay_steps
-    decay = 0.5 * (1.0 + math.cos(math.pi * completed_fraction))
-    return (1.0 - alpha) * decay + alpha
-
-  @test_util.run_in_graph_and_eager_modes
-  def testDecay(self):
-    num_training_steps = 1000
-    initial_lr = 1.0
-    for step in range(0, 1500, 250):
-      decayed_lr = learning_rate_decay_v2.cosine_decay(initial_lr, step,
-                                                       num_training_steps)
-      expected = self.np_cosine_decay(step, num_training_steps)
-      self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testAlpha(self):
-    num_training_steps = 1000
-    initial_lr = 1.0
-    alpha = 0.1
-    for step in range(0, 1500, 250):
-      decayed_lr = learning_rate_decay_v2.cosine_decay(initial_lr, step,
-                                                       num_training_steps,
-                                                       alpha)
-      expected = self.np_cosine_decay(step, num_training_steps, alpha)
-      self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-
-class CosineDecayRestartsTestV2(test_util.TensorFlowTestCase):
-
-  def np_cosine_decay_restarts(self, step, decay_steps, t_mul=2.0, m_mul=1.0,
-                               alpha=0.0):
-    fac = 1.0
-    while step >= decay_steps:
-      step -= decay_steps
-      decay_steps *= t_mul
-      fac *= m_mul
-
-    completed_fraction = step / decay_steps
-    decay = fac * 0.5 * (1.0 + math.cos(math.pi * completed_fraction))
-    return (1.0 - alpha) * decay + alpha
-
-  @test_util.run_in_graph_and_eager_modes
-  def testDecay(self):
-    num_training_steps = 1000
-    initial_lr = 1.0
-    for step in range(0, 1500, 250):
-      decayed_lr = learning_rate_decay_v2.cosine_decay_restarts(
-          initial_lr, step, num_training_steps)
-      expected = self.np_cosine_decay_restarts(step, num_training_steps)
-      self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testAlpha(self):
-    num_training_steps = 1000
-    initial_lr = 1.0
-    alpha = 0.1
-    for step in range(0, 1500, 250):
-      decayed_lr = learning_rate_decay_v2.cosine_decay_restarts(
-          initial_lr, step, num_training_steps, alpha=alpha)
-      expected = self.np_cosine_decay_restarts(
-          step, num_training_steps, alpha=alpha)
-      self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testMMul(self):
-    num_training_steps = 1000
-    initial_lr = 1.0
-    m_mul = 0.9
-    for step in range(0, 1500, 250):
-      decayed_lr = learning_rate_decay_v2.cosine_decay_restarts(
-          initial_lr, step, num_training_steps, m_mul=m_mul)
-      expected = self.np_cosine_decay_restarts(
-          step, num_training_steps, m_mul=m_mul)
-      self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testTMul(self):
-    num_training_steps = 1000
-    initial_lr = 1.0
-    t_mul = 1.0
-    for step in range(0, 1500, 250):
-      decayed_lr = learning_rate_decay_v2.cosine_decay_restarts(
-          initial_lr, step, num_training_steps, t_mul=t_mul)
-      expected = self.np_cosine_decay_restarts(
-          step, num_training_steps, t_mul=t_mul)
-      self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-
-class LinearCosineDecayTestV2(test_util.TensorFlowTestCase):
-
-  def np_linear_cosine_decay(self,
-                             step,
-                             decay_steps,
-                             alpha=0.0,
-                             beta=0.001,
-                             num_periods=0.5):
-    step = min(step, decay_steps)
-    linear_decayed = float(decay_steps - step) / decay_steps
-    fraction = 2.0 * num_periods * step / float(decay_steps)
-    cosine_decayed = 0.5 * (1.0 + math.cos(math.pi * fraction))
-    return (alpha + linear_decayed) * cosine_decayed + beta
-
-  @test_util.run_in_graph_and_eager_modes
-  def testDefaultDecay(self):
-    num_training_steps = 1000
-    initial_lr = 1.0
-    for step in range(0, 1500, 250):
-      decayed_lr = learning_rate_decay_v2.linear_cosine_decay(
-          initial_lr, step, num_training_steps)
-      expected = self.np_linear_cosine_decay(step, num_training_steps)
-      self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testNonDefaultDecay(self):
-    num_training_steps = 1000
-    initial_lr = 1.0
-    for step in range(0, 1500, 250):
-      decayed_lr = learning_rate_decay_v2.linear_cosine_decay(
-          initial_lr,
-          step,
-          num_training_steps,
-          alpha=0.1,
-          beta=1e-4,
-          num_periods=5)
-      expected = self.np_linear_cosine_decay(
-          step, num_training_steps, alpha=0.1, beta=1e-4, num_periods=5)
-      self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
-
-
-class NoisyLinearCosineDecayTestV2(test_util.TensorFlowTestCase):
-
-  @test_util.run_in_graph_and_eager_modes
-  def testDefaultNoisyLinearCosine(self):
-    num_training_steps = 1000
-    initial_lr = 1.0
-    for step in range(0, 1500, 250):
-      # No numerical check because of noise
-      decayed_lr = learning_rate_decay_v2.noisy_linear_cosine_decay(
-          initial_lr, step, num_training_steps)
-      # Cannot be deterministically tested
-      self.evaluate(decayed_lr())
-
-  @test_util.run_in_graph_and_eager_modes
-  def testNonDefaultNoisyLinearCosine(self):
-    num_training_steps = 1000
-    initial_lr = 1.0
-    for step in range(0, 1500, 250):
-      # No numerical check because of noise
-      decayed_lr = learning_rate_decay_v2.noisy_linear_cosine_decay(
-          initial_lr,
-          step,
-          num_training_steps,
-          initial_variance=0.5,
-          variance_decay=0.1,
-          alpha=0.1,
-          beta=1e-4,
-          num_periods=5)
-      # Cannot be deterministically tested
-      self.evaluate(decayed_lr())
-
-if __name__ == "__main__":
-  googletest.main()
diff --git a/tensorflow/python/training/monitored_session.py b/tensorflow/python/training/monitored_session.py
index 6a7d27df5c322bfad37cf1ef207f66353d636111..7d7e95c7e60fe379ded98c60ca89a71a288b3e50 100644
--- a/tensorflow/python/training/monitored_session.py
+++ b/tensorflow/python/training/monitored_session.py
@@ -41,6 +41,8 @@ from tensorflow.python.training import queue_runner
 from tensorflow.python.training import saver as training_saver
 from tensorflow.python.training import session_manager as sm
 from tensorflow.python.training import session_run_hook
+from tensorflow.python.training.tracking import graph_view
+from tensorflow.python.training.tracking import util as trackable_util
 from tensorflow.python.util import function_utils
 from tensorflow.python.util.tf_export import tf_export
 
@@ -136,6 +138,16 @@ class Scaffold(object):
         string tensor containing a serialized `Summary` proto.
       saver: Optional `tf.train.Saver` object to use to save and restore
         variables.
+
+        May also be a `tf.train.Checkpoint` object, in which case object-based
+        checkpoints are saved. This will also load some object-based checkpoints
+        saved from elsewhere, but that loading may be fragile since it uses
+        fixed keys rather than performing a full graph-based match. For example
+        if a variable has two paths from the `Checkpoint` object because two
+        `Model` objects share the `Layer` object that owns it, removing one
+        `Model` may change the keys and break checkpoint loading through this
+        API, whereas a graph-based match would match the variable through the
+        other `Model`.
       copy_from_scaffold: Optional scaffold object to copy fields from. Its
         fields will be overwritten by the provided fields in this function.
     """
@@ -216,7 +228,13 @@ class Scaffold(object):
     if self._saver is None:
       self._saver = training_saver._get_saver_or_default()  # pylint: disable=protected-access
     # pylint: enable=g-long-lambda
-    self._saver.build()
+    if isinstance(self._saver, trackable_util.Checkpoint):
+      self._saver = training_saver.Saver(
+          var_list=graph_view.ObjectGraphView(
+              self._saver).frozen_saveable_objects(),
+          sharded=True)
+    else:
+      self._saver.build()
 
     ops.get_default_graph().finalize()
     logging.info('Graph was finalized.')
@@ -1392,9 +1410,11 @@ class _HookedSession(_WrappedSession):
     options.output_partition_graphs = max(
         options.output_partition_graphs,
         incoming_options.output_partition_graphs)
-
     options.debug_options.debug_tensor_watch_opts.extend(
         incoming_options.debug_options.debug_tensor_watch_opts)
     options.debug_options.reset_disk_byte_usage = (
         options.debug_options.reset_disk_byte_usage or
         incoming_options.debug_options.reset_disk_byte_usage)
+    options.report_tensor_allocations_upon_oom = (
+        options.report_tensor_allocations_upon_oom or
+        incoming_options.report_tensor_allocations_upon_oom)
diff --git a/tensorflow/python/training/monitored_session_test.py b/tensorflow/python/training/monitored_session_test.py
index 99ee9ea7e2e4d32f9a24513d9c46f9de4fa2d797..6d24f8e17e797cc7e525d7a359010be45ca7b71c 100644
--- a/tensorflow/python/training/monitored_session_test.py
+++ b/tensorflow/python/training/monitored_session_test.py
@@ -1364,11 +1364,13 @@ class RunOptionsMetadataHook(session_run_hook.SessionRunHook):
   """A hook that observes & optionally modifies RunOptions and RunMetadata."""
 
   def __init__(self, trace_level, timeout_in_ms, output_partition_graphs,
-               debug_tensor_watch):
+               debug_tensor_watch, report_tensor_allocations_upon_oom):
     self._trace_level = trace_level
     self._timeout_in_ms = timeout_in_ms
     self._output_partition_graphs = output_partition_graphs
     self._debug_tensor_watch = debug_tensor_watch
+    self._report_tensor_allocations_upon_oom = (
+        report_tensor_allocations_upon_oom)
 
     self.run_options_list = []
     self.run_metadata_list = []
@@ -1377,7 +1379,9 @@ class RunOptionsMetadataHook(session_run_hook.SessionRunHook):
     options = config_pb2.RunOptions(
         trace_level=self._trace_level,
         timeout_in_ms=self._timeout_in_ms,
-        output_partition_graphs=self._output_partition_graphs)
+        output_partition_graphs=self._output_partition_graphs,
+        report_tensor_allocations_upon_oom=self
+        ._report_tensor_allocations_upon_oom)
     options.debug_options.debug_tensor_watch_opts.extend(
         [self._debug_tensor_watch])
     return session_run_hook.SessionRunArgs(None, None, options=options)
@@ -1746,13 +1750,13 @@ class MonitoredSessionTest(test.TestCase):
           output_slot=0,
           debug_ops=['DebugIdentity'],
           debug_urls=[])
-      hook_a = RunOptionsMetadataHook(2, 30000, False, watch_a)
+      hook_a = RunOptionsMetadataHook(2, 30000, False, watch_a, False)
       watch_b = debug_pb2.DebugTensorWatch(
           node_name='my_const_2',
           output_slot=0,
           debug_ops=['DebugIdentity'],
           debug_urls=[])
-      hook_b = RunOptionsMetadataHook(3, 60000, True, watch_b)
+      hook_b = RunOptionsMetadataHook(3, 60000, True, watch_b, True)
       with monitored_session.MonitoredSession(
           hooks=[hook_a, hook_b]) as session:
         self.assertEqual(42, session.run(my_const))
@@ -1761,16 +1765,15 @@ class MonitoredSessionTest(test.TestCase):
         # timeout_in_ms=60000 should have overridden 30000;
         # output_partition_graphs=True should have overridden False.
         # The two debug tensor watches should have been merged.
-        self.assertEqual(
-            [
-                config_pb2.RunOptions(
-                    trace_level=3,
-                    timeout_in_ms=60000,
-                    output_partition_graphs=True,
-                    debug_options=debug_pb2.DebugOptions(
-                        debug_tensor_watch_opts=[watch_a, watch_b]))
-            ],
-            hook_b.run_options_list)
+        self.assertEqual([
+            config_pb2.RunOptions(
+                trace_level=3,
+                timeout_in_ms=60000,
+                output_partition_graphs=True,
+                debug_options=debug_pb2.DebugOptions(
+                    debug_tensor_watch_opts=[watch_a, watch_b]),
+                report_tensor_allocations_upon_oom=True),
+        ], hook_b.run_options_list)
         self.assertEqual(1, len(hook_b.run_metadata_list))
         self.assertTrue(
             isinstance(hook_b.run_metadata_list[0], config_pb2.RunMetadata))
@@ -1788,7 +1791,7 @@ class MonitoredSessionTest(test.TestCase):
           output_slot=0,
           debug_ops=['DebugIdentity'],
           debug_urls=[])
-      hook = RunOptionsMetadataHook(2, 60000, False, hook_watch)
+      hook = RunOptionsMetadataHook(2, 60000, False, hook_watch, False)
       with monitored_session.MonitoredSession(hooks=[hook]) as session:
         caller_watch = debug_pb2.DebugTensorWatch(
             node_name='my_const',
@@ -1796,7 +1799,10 @@ class MonitoredSessionTest(test.TestCase):
             debug_ops=['DebugIdentity'],
             debug_urls=[])
         caller_options = config_pb2.RunOptions(
-            trace_level=3, timeout_in_ms=30000, output_partition_graphs=True)
+            trace_level=3,
+            timeout_in_ms=30000,
+            output_partition_graphs=True,
+            report_tensor_allocations_upon_oom=True)
         caller_options.debug_options.debug_tensor_watch_opts.extend(
             [caller_watch])
         self.assertEqual(42, session.run(my_const, options=caller_options))
@@ -1807,16 +1813,15 @@ class MonitoredSessionTest(test.TestCase):
         # from the hook.
         # The two debug watches from the caller and the hook should be merged,
         # in that order.
-        self.assertEqual(
-            [
-                config_pb2.RunOptions(
-                    trace_level=3,
-                    timeout_in_ms=60000,
-                    output_partition_graphs=True,
-                    debug_options=debug_pb2.DebugOptions(
-                        debug_tensor_watch_opts=[caller_watch, hook_watch]))
-            ],
-            hook.run_options_list)
+        self.assertEqual([
+            config_pb2.RunOptions(
+                trace_level=3,
+                timeout_in_ms=60000,
+                output_partition_graphs=True,
+                debug_options=debug_pb2.DebugOptions(
+                    debug_tensor_watch_opts=[caller_watch, hook_watch]),
+                report_tensor_allocations_upon_oom=True),
+        ], hook.run_options_list)
         self.assertEqual(1, len(hook.run_metadata_list))
         self.assertTrue(
             isinstance(hook.run_metadata_list[0], config_pb2.RunMetadata))
diff --git a/tensorflow/python/training/moving_averages.py b/tensorflow/python/training/moving_averages.py
index 72670f0ca39f67b151abcb1813ede7ee36c6544b..6efcab28c5249fe943f6d4a1b0b6b7866271571f 100644
--- a/tensorflow/python/training/moving_averages.py
+++ b/tensorflow/python/training/moving_averages.py
@@ -98,12 +98,12 @@ def assign_moving_average(variable, value, decay, zero_debias=True, name=None):
       def merge_fn(strategy, v, value):
         value = strategy.extended.reduce_to(
             ds_reduce_util.ReduceOp.MEAN, value, v)
-        return strategy.update(v, update_fn, value)
+        return strategy.extended.update(v, update_fn, args=(value,))
 
       return replica_context.merge_call(merge_fn, args=(variable, value))
     else:
       strategy = distribution_strategy_context.get_cross_replica_context()
-      return strategy.update(variable, update_fn, value)
+      return strategy.extended.update(variable, update_fn, args=(value,))
 
 
 def weighted_moving_average(value,
@@ -505,13 +505,13 @@ class ExponentialMovingAverage(object):
     ```
     Args:
       moving_avg_variables: a list of variables that require to use of the
-        moving variable name to be restored. If None, it will default to
+        moving average variable name to be restored. If None, it will default to
         variables.moving_average_variables() + variables.trainable_variables()
 
     Returns:
-      A map from restore_names to variables. The restore_name can be the
-      moving_average version of the variable name if it exist, or the original
-      variable name.
+      A map from restore_names to variables. The restore_name is either the
+      original or the moving average version of the variable name, depending
+      on whether the variable name is in the `moving_avg_variables`.
     """
     name_map = {}
     if moving_avg_variables is None:
diff --git a/tensorflow/python/training/moving_averages_test.py b/tensorflow/python/training/moving_averages_test.py
index 03bcde9c8498ed03d2eaf52c7f1e2d4211e0ddc6..0a7cff4f56207dcfadf095da5e03371730417ad2 100644
--- a/tensorflow/python/training/moving_averages_test.py
+++ b/tensorflow/python/training/moving_averages_test.py
@@ -43,7 +43,7 @@ class MovingAveragesTest(test.TestCase):
       decay = 0.25
       assign = moving_averages.assign_moving_average(
           var, val, decay, zero_debias=False)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllClose([10.0, 11.0], self.evaluate(var))
       assign.op.run()
       self.assertAllClose(
@@ -57,7 +57,7 @@ class MovingAveragesTest(test.TestCase):
       val = constant_op.constant([1.0, 2.0], dtypes.float32)
       decay = 0.25
       assign = moving_averages.assign_moving_average(var, val, decay)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllClose([0.0, 0.0], self.evaluate(var))
       assign.op.run()
       self.assertAllClose(
@@ -98,7 +98,7 @@ class MovingAveragesTest(test.TestCase):
       val = array_ops.placeholder(dtypes.float32, [])
 
       wma = moving_averages.weighted_moving_average(val, decay, weight)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       # Get the first weighted moving average.
       val_1 = 3.0
@@ -125,7 +125,7 @@ class MovingAveragesTest(test.TestCase):
       val = array_ops.placeholder(dtypes.bfloat16, [])
 
       wma = moving_averages.weighted_moving_average(val, decay, weight)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       # Get the first weighted moving average.
       val_1 = 3.0
@@ -164,7 +164,7 @@ class ExponentialMovingAverageTest(test.TestCase):
     thirties = _Repeat(30.0, dim)
     var0 = variables.Variable(tens, name="v0")
     var1 = variables.Variable(thirties, name="v1")
-    variables.global_variables_initializer().run()
+    self.evaluate(variables.global_variables_initializer())
     # Note that tensor2 is not a Variable but just a plain Tensor resulting
     # from the sum operation.
     tensor2 = var0 + var1
@@ -178,7 +178,7 @@ class ExponentialMovingAverageTest(test.TestCase):
     self.assertFalse(avg0 in variables.trainable_variables())
     self.assertFalse(avg1 in variables.trainable_variables())
     self.assertFalse(avg2 in variables.trainable_variables())
-    variables.global_variables_initializer().run()
+    self.evaluate(variables.global_variables_initializer())
 
     self.assertEqual("v0/ExponentialMovingAverage:0", avg0.name)
     self.assertEqual("v1/ExponentialMovingAverage:0", avg1.name)
diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py
index d9ebdcad1f3c83c0e0d4b8496d601fce2669fbff..4361f07e196050c87338d0f7102f530d2c2c9be7 100644
--- a/tensorflow/python/training/optimizer.py
+++ b/tensorflow/python/training/optimizer.py
@@ -24,7 +24,6 @@ import abc
 
 import six
 
-from tensorflow.python.distribute import distribute_lib
 from tensorflow.python.distribute import distribution_strategy_context as distribute_ctx
 from tensorflow.python.distribute import reduce_util as ds_reduce_util
 from tensorflow.python.eager import backprop
@@ -40,7 +39,7 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.training import slot_creator
-from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
@@ -215,10 +214,10 @@ def _get_processor(v):
 
 @tf_export(v1=["train.Optimizer"])
 class Optimizer(
-    # Optimizers inherit from CheckpointableBase rather than Checkpointable
+    # Optimizers inherit from Trackable rather than AutoTrackable
     # since they do most of their dependency management themselves (slot
     # variables are special-cased, and non-slot variables are keyed to graphs).
-    checkpointable.CheckpointableBase):
+    trackable.Trackable):
   """Base class for optimizers.
 
   This class defines the API to add Ops to train a model.  You never use this
@@ -334,9 +333,9 @@ class Optimizer(
     #   ... }
     self._slots = {}
     self._non_slot_dict = {}
-    # For implementing Checkpointable. Stores information about how to restore
+    # For implementing Trackable. Stores information about how to restore
     # slot variables which have not yet been created
-    # (checkpointable._CheckpointPosition objects).
+    # (trackable._CheckpointPosition objects).
     #  {slot_name :
     #      {_var_key(variable_to_train): [checkpoint_position, ... ], ... },
     #   ... }
@@ -461,12 +460,6 @@ class Optimizer(
           tape.watch(var_list)
         loss_value = loss()
 
-        # Scale loss if using a "mean" loss reduction and multiple replicas.
-        # Have to be careful to call distribute_lib.get_loss_reduction()
-        # *after* loss() is evaluated, so we know what loss reduction it uses.
-        # TODO(josh11b): Test that we handle weight decay in a reasonable way.
-        loss_value = self._scale_loss(loss_value)
-
       if var_list is None:
         var_list = tape.watched_variables()
       # TODO(jhseu): Figure out why GradientTape's gradients don't require loss
@@ -481,9 +474,6 @@ class Optimizer(
           "`loss` passed to Optimizer.compute_gradients should "
           "be a function when eager execution is enabled.")
 
-    # Scale loss if using a "mean" loss reduction and multiple replicas.
-    loss = self._scale_loss(loss)
-
     if gate_gradients not in [Optimizer.GATE_NONE, Optimizer.GATE_OP,
                               Optimizer.GATE_GRAPH]:
       raise ValueError("gate_gradients must be one of: Optimizer.GATE_NONE, "
@@ -518,15 +508,6 @@ class Optimizer(
          if g is not None and v.dtype != dtypes.resource])
     return grads_and_vars
 
-  @staticmethod
-  def _scale_loss(loss_value):
-    if distribute_lib.get_loss_reduction() == ds_reduce_util.ReduceOp.MEAN:
-      num_replicas = \
-        distribute_ctx.get_distribution_strategy().num_replicas_in_sync
-      if num_replicas > 1:
-        loss_value *= (1. / num_replicas)
-    return loss_value
-
   def apply_gradients(self, grads_and_vars, global_step=None, name=None):
     """Apply gradients to variables.
 
@@ -554,14 +535,15 @@ class Optimizer(
     # by most optimizers.  It relies on the subclass implementing the following
     # methods: _create_slots(), _prepare(), _apply_dense(), and _apply_sparse().
 
-    # Handle DistributionStrategy case.
-    if distribute_ctx.get_cross_replica_context():
-      raise RuntimeError("Use `_distributed_apply()` instead of "
-                         "`apply_gradients()` in a cross-replica context.")
-    # TODO(isaprykin): Get rid of `has_distribution_strategy()` check by
+    # TODO(isaprykin): Get rid of `has_strategy()` check by
     # always calling _distributed_apply(), using the default distribution
     # as needed.
-    if distribute_ctx.has_distribution_strategy():
+    if distribute_ctx.has_strategy():
+      # Handle DistributionStrategy case.
+      if distribute_ctx.in_cross_replica_context():
+        raise RuntimeError("Use `_distributed_apply()` instead of "
+                           "`apply_gradients()` in a cross-replica context.")
+
       grads_and_vars = get_filtered_grad_fn(lambda: grads_and_vars)()
       return distribute_ctx.get_replica_context().merge_call(
           self._distributed_apply, args=(grads_and_vars, global_step, name))
@@ -814,29 +796,32 @@ class Optimizer(
     key = (name, graph)
     v = self._non_slot_dict.get(key, None)
     if v is None:
-      self._maybe_initialize_checkpointable()
-      distribution_strategy = distribute_ctx.get_distribution_strategy()
-      with distribution_strategy.colocate_vars_with(colocate_with):
+      self._maybe_initialize_trackable()
+      distribution_strategy = distribute_ctx.get_strategy()
+      with distribution_strategy.extended.colocate_vars_with(colocate_with):
         if eager:
           restored_initial_value = self._preload_simple_restoration(
               name=name, shape=None)
           if restored_initial_value is not None:
             initial_value = restored_initial_value
-        v = variable_scope.variable(initial_value, name=name, trainable=False)
+        v = variable_scope.variable(
+            initial_value, name=name, trainable=False,
+            use_resource=resource_variable_ops.is_resource_variable(
+                colocate_with))
       # Restore this variable by name if necessary, but don't add a
-      # Checkpointable dependency. Optimizers return the current graph's
+      # Trackable dependency. Optimizers return the current graph's
       # non-slot variables from _checkpoint_dependencies explicitly rather
       # than unconditionally adding dependencies (since there may be multiple
       # non-slot variables with the same name in different graphs, trying to
       # save all of them would result in errors).
-      self._handle_deferred_dependencies(name=name, checkpointable=v)
+      self._handle_deferred_dependencies(name=name, trackable=v)
       self._non_slot_dict[key] = v
 
     return v
 
   @property
   def _checkpoint_dependencies(self):
-    """From Checkpointable. Gather graph-specific non-slot variables to save."""
+    """From Trackable. Gather graph-specific non-slot variables to save."""
     current_graph_non_slot_variables = []
     current_graph_key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
     for (name, _), variable_object in sorted(self._non_slot_dict.items(),
@@ -844,13 +829,13 @@ class Optimizer(
                                              key=lambda item: item[0][0]):
       if variable_object._graph_key == current_graph_key:  # pylint: disable=protected-access
         current_graph_non_slot_variables.append(
-            checkpointable.CheckpointableReference(
+            trackable.TrackableReference(
                 name=name, ref=variable_object))
     return (super(Optimizer, self)._checkpoint_dependencies
             + current_graph_non_slot_variables)
 
   def _lookup_dependency(self, name):
-    """From Checkpointable. Find a non-slot variable in the current graph."""
+    """From Trackable. Find a non-slot variable in the current graph."""
     unconditional = super(Optimizer, self)._lookup_dependency(name)
     if unconditional is not None:
       return unconditional
@@ -1155,7 +1140,7 @@ class Optimizer(
     return named_slots[_var_key(var)]
 
   # --------------
-  # For implementing the Checkpointable interface.
+  # For implementing the Trackable interface.
   # --------------
 
   def _restore_slot_variable(self, slot_name, variable, slot_variable):
@@ -1186,8 +1171,8 @@ class Optimizer(
     slot variable needs to be restored).
 
     Args:
-      slot_variable_position: A `checkpointable._CheckpointPosition` object
-        indicating the slot variable `Checkpointable` object to be restored.
+      slot_variable_position: A `trackable._CheckpointPosition` object
+        indicating the slot variable `Trackable` object to be restored.
       slot_name: The name of this `Optimizer`'s slot to restore into.
       variable: The variable object this slot is being created for.
     """
@@ -1205,7 +1190,7 @@ class Optimizer(
         # (aside from double initialization), and makes variable creator scopes
         # behave the same way they do when graph building.
         and not ops.get_default_graph()._variable_creator_stack):  # pylint: disable=protected-access
-      initializer = checkpointable.CheckpointInitialValue(
+      initializer = trackable.CheckpointInitialValue(
           checkpoint_position=slot_variable_position)
       slot_variable = self._get_or_make_slot(
           var=variable,
diff --git a/tensorflow/python/training/optimizer_test.py b/tensorflow/python/training/optimizer_test.py
index e175b5a79989e4c7b6b4c736eefe0250e9ebbcc9..ac831cb6422f8995b81c81e86f038041e4fb2567 100644
--- a/tensorflow/python/training/optimizer_test.py
+++ b/tensorflow/python/training/optimizer_test.py
@@ -24,7 +24,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
-from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import gradients_util
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
@@ -75,7 +75,7 @@ class OptimizerTest(test.TestCase):
         opt_op = sgd_op.minimize(
             cost,
             global_step, [var0, var1],
-            aggregation_method=gradients_impl.AggregationMethod.
+            aggregation_method=gradients_util.AggregationMethod.
             EXPERIMENTAL_ACCUMULATE_N)
 
         variables.global_variables_initializer().run()
diff --git a/tensorflow/python/training/proximal_gradient_descent.py b/tensorflow/python/training/proximal_gradient_descent.py
index 369b6cbb50e5c621737c095a24eeb473f3870534..6eca0e6cb5f32a34b178c14c9fe86d00fdd0fdfe 100644
--- a/tensorflow/python/training/proximal_gradient_descent.py
+++ b/tensorflow/python/training/proximal_gradient_descent.py
@@ -27,7 +27,7 @@ from tensorflow.python.training import training_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("train.ProximalGradientDescentOptimizer")
+@tf_export(v1=["train.ProximalGradientDescentOptimizer"])
 class ProximalGradientDescentOptimizer(optimizer.Optimizer):
   # pylint: disable=line-too-long
   """Optimizer that implements the proximal gradient descent algorithm.
diff --git a/tensorflow/python/training/queue_runner_test.py b/tensorflow/python/training/queue_runner_test.py
index 2f6e924f98e5068d9f50e6efe93c58771b9acade..2868e7bcc69c0d81bd46a4db0239f76fb75a12a1 100644
--- a/tensorflow/python/training/queue_runner_test.py
+++ b/tensorflow/python/training/queue_runner_test.py
@@ -39,9 +39,9 @@ from tensorflow.python.training import queue_runner_impl
 _MockOp = collections.namedtuple("MockOp", ["name"])
 
 
+@test_util.run_v1_only("QueueRunner removed from v2")
 class QueueRunnerTest(test.TestCase):
 
-  @test_util.run_v1_only("b/120545219")
   def testBasic(self):
     with self.cached_session() as sess:
       # CountUpTo will raise OUT_OF_RANGE when it reaches the count.
@@ -49,7 +49,7 @@ class QueueRunnerTest(test.TestCase):
       var = variables.VariableV1(zero64)
       count_up_to = var.count_up_to(3)
       queue = data_flow_ops.FIFOQueue(10, dtypes.float32)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       qr = queue_runner_impl.QueueRunner(queue, [count_up_to])
       threads = qr.create_threads(sess)
       self.assertEqual(sorted(t.name for t in threads),
@@ -62,7 +62,6 @@ class QueueRunnerTest(test.TestCase):
       # The variable should be 3.
       self.assertEqual(3, self.evaluate(var))
 
-  @test_util.run_v1_only("b/120545219")
   def testTwoOps(self):
     with self.cached_session() as sess:
       # CountUpTo will raise OUT_OF_RANGE when it reaches the count.
@@ -77,7 +76,7 @@ class QueueRunnerTest(test.TestCase):
       self.assertEqual(sorted(t.name for t in threads),
                        ["QueueRunnerThread-fifo_queue-CountUpTo:0",
                         "QueueRunnerThread-fifo_queue-CountUpTo_1:0"])
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       for t in threads:
         t.start()
       for t in threads:
@@ -86,14 +85,13 @@ class QueueRunnerTest(test.TestCase):
       self.assertEqual(3, self.evaluate(var0))
       self.assertEqual(30, self.evaluate(var1))
 
-  @test_util.run_deprecated_v1
   def testExceptionsCaptured(self):
     with self.cached_session() as sess:
       queue = data_flow_ops.FIFOQueue(10, dtypes.float32)
       qr = queue_runner_impl.QueueRunner(queue, [_MockOp("i fail"),
                                                  _MockOp("so fail")])
       threads = qr.create_threads(sess)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       for t in threads:
         t.start()
       for t in threads:
@@ -103,7 +101,6 @@ class QueueRunnerTest(test.TestCase):
       self.assertTrue("Operation not in the graph" in str(exceptions[0]))
       self.assertTrue("Operation not in the graph" in str(exceptions[1]))
 
-  @test_util.run_deprecated_v1
   def testRealDequeueEnqueue(self):
     with self.cached_session() as sess:
       q0 = data_flow_ops.FIFOQueue(3, dtypes.float32)
@@ -132,7 +129,6 @@ class QueueRunnerTest(test.TestCase):
       with self.assertRaisesRegexp(errors_impl.OutOfRangeError, "is closed"):
         self.evaluate(dequeue1)
 
-  @test_util.run_v1_only("b/120545219")
   def testRespectCoordShouldStop(self):
     with self.cached_session() as sess:
       # CountUpTo will raise OUT_OF_RANGE when it reaches the count.
@@ -140,7 +136,7 @@ class QueueRunnerTest(test.TestCase):
       var = variables.VariableV1(zero64)
       count_up_to = var.count_up_to(3)
       queue = data_flow_ops.FIFOQueue(10, dtypes.float32)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       qr = queue_runner_impl.QueueRunner(queue, [count_up_to])
       # As the coordinator to stop.  The queue runner should
       # finish immediately.
@@ -157,7 +153,6 @@ class QueueRunnerTest(test.TestCase):
       # The variable should be 0.
       self.assertEqual(0, self.evaluate(var))
 
-  @test_util.run_deprecated_v1
   def testRequestStopOnException(self):
     with self.cached_session() as sess:
       queue = data_flow_ops.FIFOQueue(10, dtypes.float32)
@@ -170,7 +165,6 @@ class QueueRunnerTest(test.TestCase):
       with self.assertRaisesRegexp(ValueError, "Operation not in the graph"):
         coord.join()
 
-  @test_util.run_deprecated_v1
   def testGracePeriod(self):
     with self.cached_session() as sess:
       # The enqueue will quickly block.
@@ -188,7 +182,6 @@ class QueueRunnerTest(test.TestCase):
       # the queue to be closed and the enqueue to terminate.
       coord.join(stop_grace_period_secs=1.0)
 
-  @test_util.run_deprecated_v1
   def testMultipleSessions(self):
     with self.cached_session() as sess:
       with session.Session() as other_sess:
@@ -196,7 +189,7 @@ class QueueRunnerTest(test.TestCase):
         var = variables.VariableV1(zero64)
         count_up_to = var.count_up_to(3)
         queue = data_flow_ops.FIFOQueue(10, dtypes.float32)
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
         coord = coordinator.Coordinator()
         qr = queue_runner_impl.QueueRunner(queue, [count_up_to])
         # NOTE that this test does not actually start the threads.
@@ -204,7 +197,6 @@ class QueueRunnerTest(test.TestCase):
         other_threads = qr.create_threads(other_sess, coord=coord)
         self.assertEqual(len(threads), len(other_threads))
 
-  @test_util.run_deprecated_v1
   def testIgnoreMultiStarts(self):
     with self.cached_session() as sess:
       # CountUpTo will raise OUT_OF_RANGE when it reaches the count.
@@ -212,7 +204,7 @@ class QueueRunnerTest(test.TestCase):
       var = variables.VariableV1(zero64)
       count_up_to = var.count_up_to(3)
       queue = data_flow_ops.FIFOQueue(10, dtypes.float32)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       coord = coordinator.Coordinator()
       qr = queue_runner_impl.QueueRunner(queue, [count_up_to])
       threads = []
@@ -221,7 +213,6 @@ class QueueRunnerTest(test.TestCase):
       new_threads = qr.create_threads(sess, coord=coord)
       self.assertEqual([], new_threads)
 
-  @test_util.run_v1_only("b/120545219")
   def testThreads(self):
     with self.cached_session() as sess:
       # CountUpTo will raise OUT_OF_RANGE when it reaches the count.
@@ -229,7 +220,7 @@ class QueueRunnerTest(test.TestCase):
       var = variables.VariableV1(zero64)
       count_up_to = var.count_up_to(3)
       queue = data_flow_ops.FIFOQueue(10, dtypes.float32)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       qr = queue_runner_impl.QueueRunner(queue, [count_up_to,
                                                  _MockOp("bad_op")])
       threads = qr.create_threads(sess, start=True)
@@ -249,7 +240,6 @@ class QueueRunnerTest(test.TestCase):
       self.assertEqual(1, len(exceptions))
       self.assertTrue("Operation not in the graph" in str(exceptions[0]))
 
-  @test_util.run_deprecated_v1
   def testName(self):
     with ops.name_scope("scope"):
       queue = data_flow_ops.FIFOQueue(10, dtypes.float32, name="queue")
@@ -259,7 +249,6 @@ class QueueRunnerTest(test.TestCase):
     self.assertEqual(
         1, len(ops.get_collection(ops.GraphKeys.QUEUE_RUNNERS, "scope")))
 
-  @test_util.run_deprecated_v1
   def testStartQueueRunners(self):
     # CountUpTo will raise OUT_OF_RANGE when it reaches the count.
     zero64 = constant_op.constant(0, dtype=dtypes.int64)
@@ -278,7 +267,6 @@ class QueueRunnerTest(test.TestCase):
       # The variable should be 3.
       self.assertEqual(3, self.evaluate(var))
 
-  @test_util.run_deprecated_v1
   def testStartQueueRunnersRaisesIfNotASession(self):
     zero64 = constant_op.constant(0, dtype=dtypes.int64)
     var = variables.VariableV1(zero64)
@@ -292,7 +280,6 @@ class QueueRunnerTest(test.TestCase):
       with self.assertRaisesRegexp(TypeError, "tf.Session"):
         queue_runner_impl.start_queue_runners("NotASession")
 
-  @test_util.run_deprecated_v1
   def testStartQueueRunnersIgnoresMonitoredSession(self):
     zero64 = constant_op.constant(0, dtype=dtypes.int64)
     var = variables.VariableV1(zero64)
@@ -307,7 +294,6 @@ class QueueRunnerTest(test.TestCase):
           monitored_session.MonitoredSession())
       self.assertFalse(threads)
 
-  @test_util.run_deprecated_v1
   def testStartQueueRunnersNonDefaultGraph(self):
     # CountUpTo will raise OUT_OF_RANGE when it reaches the count.
     graph = ops.Graph()
diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py
index 348b8bf1ef0a89a971eb26c9cb7e5f9d01c51a4b..215fc3965248c6582993bae3844cc0e3067acce3 100644
--- a/tensorflow/python/training/saver.py
+++ b/tensorflow/python/training/saver.py
@@ -17,7 +17,7 @@
 """Save and restore variables.
 
 Symbols in this file are deprecated. See replacements in
-tensorflow/python/training/checkpointable and tensorflow/python/training/saving.
+tensorflow/python/training/trackable and tensorflow/python/training/saving.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -29,10 +29,9 @@ import time
 import uuid
 
 import numpy as np
-
-from tensorflow.core.protobuf import checkpointable_object_graph_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.core.protobuf import saver_pb2
+from tensorflow.core.protobuf import trackable_object_graph_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
@@ -51,9 +50,9 @@ from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_management
 from tensorflow.python.training import training_util
-from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.training.saving import saveable_object
 from tensorflow.python.training.saving import saveable_object_util
+from tensorflow.python.training.tracking import base as trackable
 from tensorflow.python.util import compat
 from tensorflow.python.util.tf_export import tf_export
 
@@ -1078,7 +1077,8 @@ class Saver(object):
            meta_graph_suffix="meta",
            write_meta_graph=True,
            write_state=True,
-           strip_default_attrs=False):
+           strip_default_attrs=False,
+           save_debug_info=False):
     # pylint: disable=line-too-long
     """Saves variables.
 
@@ -1108,6 +1108,10 @@ class Saver(object):
       strip_default_attrs: Boolean. If `True`, default-valued attributes will be
         removed from the NodeDefs. For a detailed guide, see
         [Stripping Default-Valued Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
+      save_debug_info: If `True`, save the GraphDebugInfo to a separate file,
+        which in the same directory of save_path and with `_debug` added before
+        the file extension. This is only enabled when `write_meta_graph` is
+        `True`
 
     Returns:
       A string: path prefix used for the checkpoint files.  If the saver is
@@ -1193,7 +1197,8 @@ class Saver(object):
       if not context.executing_eagerly():
         with sess.graph.as_default():
           self.export_meta_graph(
-              meta_graph_filename, strip_default_attrs=strip_default_attrs)
+              meta_graph_filename, strip_default_attrs=strip_default_attrs,
+              save_debug_info=save_debug_info)
 
     if self._is_empty:
       return None
@@ -1207,7 +1212,8 @@ class Saver(object):
                         export_scope=None,
                         clear_devices=False,
                         clear_extraneous_savers=False,
-                        strip_default_attrs=False):
+                        strip_default_attrs=False,
+                        save_debug_info=False):
     # pylint: disable=line-too-long
     """Writes `MetaGraphDef` to save_path/filename.
 
@@ -1224,6 +1230,9 @@ class Saver(object):
       strip_default_attrs: Boolean. If `True`, default-valued attributes will be
         removed from the NodeDefs. For a detailed guide, see
         [Stripping Default-Valued Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
+      save_debug_info: If `True`, save the GraphDebugInfo to a separate file,
+        which in the same directory of filename and with `_debug` added before
+        the file extension.
 
     Returns:
       A `MetaGraphDef` proto.
@@ -1238,7 +1247,8 @@ class Saver(object):
         export_scope=export_scope,
         clear_devices=clear_devices,
         clear_extraneous_savers=clear_extraneous_savers,
-        strip_default_attrs=strip_default_attrs)
+        strip_default_attrs=strip_default_attrs,
+        save_debug_info=save_debug_info)
 
   def restore(self, sess, save_path):
     """Restores previously saved variables.
@@ -1497,6 +1507,7 @@ def export_meta_graph(filename=None,
                       clear_devices=False,
                       clear_extraneous_savers=False,
                       strip_default_attrs=False,
+                      save_debug_info=False,
                       **kwargs):
   # pylint: disable=line-too-long
   """Returns `MetaGraphDef` proto. Optionally writes it to filename.
@@ -1527,6 +1538,9 @@ def export_meta_graph(filename=None,
     strip_default_attrs: Boolean. If `True`, default-valued attributes will be
       removed from the NodeDefs. For a detailed guide, see
       [Stripping Default-Valued Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
+    save_debug_info: If `True`, save the GraphDebugInfo to a separate file,
+      which in the same directory of filename and with `_debug` added before
+      the file extend.
     **kwargs: Optional keyed arguments.
 
   Returns:
@@ -1537,12 +1551,13 @@ def export_meta_graph(filename=None,
     RuntimeError: If called with eager execution enabled.
 
   @compatibility(eager)
-  Exporting/importing meta graphs is not supported. No graph exists when eager
-  execution is enabled.
+  Exporting/importing meta graphs is not supported unless both `graph_def` and
+  `graph` are provided. No graph exists when eager execution is enabled.
   @end_compatibility
   """
   # pylint: enable=line-too-long
-  if context.executing_eagerly():
+  if context.executing_eagerly() and not (graph_def is not None and
+                                          graph is not None):
     raise RuntimeError("Exporting/importing meta graphs is not supported when "
                        "eager execution is enabled. No graph exists when eager "
                        "execution is enabled.")
@@ -1558,6 +1573,7 @@ def export_meta_graph(filename=None,
       clear_devices=clear_devices,
       clear_extraneous_savers=clear_extraneous_savers,
       strip_default_attrs=strip_default_attrs,
+      save_debug_info=save_debug_info,
       **kwargs)
   return meta_graph_def
 
@@ -1588,9 +1604,9 @@ def object_graph_key_mapping(checkpoint_path):
   """
   reader = pywrap_tensorflow.NewCheckpointReader(checkpoint_path)
   object_graph_string = reader.get_tensor(
-      checkpointable.OBJECT_GRAPH_PROTO_KEY)
+      trackable.OBJECT_GRAPH_PROTO_KEY)
   object_graph_proto = (
-      checkpointable_object_graph_pb2.CheckpointableObjectGraph())
+      trackable_object_graph_pb2.TrackableObjectGraph())
   object_graph_proto.ParseFromString(object_graph_string)
   names_to_keys = {}
   for node in object_graph_proto.nodes:
diff --git a/tensorflow/python/training/saver_test.py b/tensorflow/python/training/saver_test.py
index 95c21cb815fd8cf9aa5e9efb98efd6be7108f51a..9b2a1da7c29723b589b67484bd2e1d880ef1363d 100644
--- a/tensorflow/python/training/saver_test.py
+++ b/tensorflow/python/training/saver_test.py
@@ -73,9 +73,9 @@ from tensorflow.python.training import queue_runner_impl
 from tensorflow.python.training import saver as saver_module
 from tensorflow.python.training import saver_test_utils
 from tensorflow.python.training import training_util
-from tensorflow.python.training.checkpointable import base as checkpointable_base
-from tensorflow.python.training.checkpointable import tracking as checkpointable_tracking
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import base as trackable_base
+from tensorflow.python.training.tracking import tracking as trackable_tracking
+from tensorflow.python.training.tracking import util as trackable_utils
 from tensorflow.python.util import compat
 
 
@@ -124,8 +124,8 @@ class SaverTest(test.TestCase):
       if not context.executing_eagerly():
         self.assertEqual(
             len(variables.report_uninitialized_variables().eval()), 2)
-        self.assertEqual(0, len(v2.keys().eval()))
-        self.assertEqual(0, len(v2.values().eval()))
+        self.assertEqual(0, len(self.evaluate(v2.keys())))
+        self.assertEqual(0, len(self.evaluate(v2.values())))
       # Restore the saved values in the parameter nodes.
       save = saver_module.Saver({"v0": v0, "v1": v1, "v2": v2.saveable})
       save.restore(sess, save_path)
@@ -331,10 +331,10 @@ class SaverTest(test.TestCase):
       self.evaluate(init_all_op)
 
       # Check that the parameter nodes have been initialized.
-      self.assertEqual(10.0, v0.eval())
-      self.assertEqual(20.0, v1.eval())
-      self.assertEqual(b"k1", v2.keys().eval())
-      self.assertEqual(30.0, v2.values().eval())
+      self.assertEqual(10.0, self.evaluate(v0))
+      self.assertEqual(20.0, self.evaluate(v1))
+      self.assertEqual(b"k1", self.evaluate(v2.keys()))
+      self.assertEqual(30.0, self.evaluate(v2.values()))
 
       # Save the initialized values in the file at "save_path"
       val = save.save(sess, save_path1)
@@ -360,16 +360,16 @@ class SaverTest(test.TestCase):
       # Assert that the variables are not initialized.
       self.assertEqual(
           len(variables.report_uninitialized_variables().eval()), 2)
-      self.assertEqual(0, len(v2.keys().eval()))
-      self.assertEqual(0, len(v2.values().eval()))
+      self.assertEqual(0, len(self.evaluate(v2.keys())))
+      self.assertEqual(0, len(self.evaluate(v2.values())))
 
       # Restore the saved values in the parameter nodes.
       save.restore(sess, save_path2)
       # Check that the parameter nodes have been restored.
-      self.assertEqual(10.0, v0.eval())
-      self.assertEqual(20.0, v1.eval())
-      self.assertEqual(b"k1", v2.keys().eval())
-      self.assertEqual(30.0, v2.values().eval())
+      self.assertEqual(10.0, self.evaluate(v0))
+      self.assertEqual(20.0, self.evaluate(v1))
+      self.assertEqual(b"k1", self.evaluate(v2.keys()))
+      self.assertEqual(30.0, self.evaluate(v2.values()))
 
   @test_util.run_deprecated_v1
   def testFilenameTensor(self):
@@ -398,7 +398,7 @@ class SaverTest(test.TestCase):
       # Build a graph with 1 node, and save and restore for them.
       v = variables.VariableV1(np.int64(15), name="v")
       save = saver_module.Saver({"v": v}, restore_sequentially=True)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       # Save the initialized values in the file at "save_path"
       val = save.save(sess, save_path)
@@ -416,7 +416,7 @@ class SaverTest(test.TestCase):
       # Restore the saved values in the parameter nodes.
       save.restore(sess, save_path)
       # Check that the parameter nodes have been restored.
-      self.assertEqual(np.int64(15), v.eval())
+      self.assertEqual(np.int64(15), self.evaluate(v))
 
   def testSomeErrors(self):
     with ops_lib.Graph().as_default():
@@ -478,14 +478,14 @@ class SaverTest(test.TestCase):
       v2 = saver_test_utils.CheckpointedOp(name="v2")
       v2_init = v2.insert("k1", 30.0)
       save = saver_module.Saver([v0, v1, v2.saveable])
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       v2_init.run()
 
       # Check that the parameter nodes have been initialized.
-      self.assertEqual(10.0, v0.eval())
-      self.assertEqual(20.0, v1.eval())
-      self.assertEqual(b"k1", v2.keys().eval())
-      self.assertEqual(30.0, v2.values().eval())
+      self.assertEqual(10.0, self.evaluate(v0))
+      self.assertEqual(20.0, self.evaluate(v1))
+      self.assertEqual(b"k1", self.evaluate(v2.keys()))
+      self.assertEqual(30.0, self.evaluate(v2.values()))
 
       # Save the initialized values in the file at "save_path"
       val = save.save(sess, save_path)
@@ -506,16 +506,16 @@ class SaverTest(test.TestCase):
       with self.assertRaisesWithPredicateMatch(
           errors_impl.OpError, lambda e: "uninitialized value v1" in e.message):
         self.evaluate(v1)
-      self.assertEqual(0, len(v2.keys().eval()))
-      self.assertEqual(0, len(v2.values().eval()))
+      self.assertEqual(0, len(self.evaluate(v2.keys())))
+      self.assertEqual(0, len(self.evaluate(v2.values())))
 
       # Restore the saved values in the parameter nodes.
       save.restore(sess, save_path)
       # Check that the parameter nodes have been restored.
-      self.assertEqual(10.0, v0.eval())
-      self.assertEqual(20.0, v1.eval())
-      self.assertEqual(b"k1", v2.keys().eval())
-      self.assertEqual(30.0, v2.values().eval())
+      self.assertEqual(10.0, self.evaluate(v0))
+      self.assertEqual(20.0, self.evaluate(v1))
+      self.assertEqual(b"k1", self.evaluate(v2.keys()))
+      self.assertEqual(30.0, self.evaluate(v2.values()))
 
     # Build another graph with 2 nodes, initialized
     # differently, and a Restore node for them.
@@ -525,20 +525,20 @@ class SaverTest(test.TestCase):
       v2_2 = saver_test_utils.CheckpointedOp(name="v2")
       save2 = saver_module.Saver([v0_2, v1_2, v2_2.saveable])
       v2_2.insert("k1000", 3000.0).run()
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       # Check that the parameter nodes have been initialized.
-      self.assertEqual(1000.0, v0_2.eval())
-      self.assertEqual(2000.0, v1_2.eval())
-      self.assertEqual(b"k1000", v2_2.keys().eval())
-      self.assertEqual(3000.0, v2_2.values().eval())
+      self.assertEqual(1000.0, self.evaluate(v0_2))
+      self.assertEqual(2000.0, self.evaluate(v1_2))
+      self.assertEqual(b"k1000", self.evaluate(v2_2.keys()))
+      self.assertEqual(3000.0, self.evaluate(v2_2.values()))
       # Restore the values saved earlier in the parameter nodes.
       save2.restore(sess, save_path)
       # Check that the parameter nodes have been restored.
-      self.assertEqual(10.0, v0_2.eval())
-      self.assertEqual(20.0, v1_2.eval())
-      self.assertEqual(b"k1", v2_2.keys().eval())
-      self.assertEqual(30.0, v2_2.values().eval())
+      self.assertEqual(10.0, self.evaluate(v0_2))
+      self.assertEqual(20.0, self.evaluate(v1_2))
+      self.assertEqual(b"k1", self.evaluate(v2_2.keys()))
+      self.assertEqual(30.0, self.evaluate(v2_2.values()))
 
   def _SaveAndLoad(self, var_name, var_value, other_value, save_path):
     with self.session(graph=ops_lib.Graph()) as sess:
@@ -582,14 +582,14 @@ class SaverTest(test.TestCase):
       with sess.graph.device(test.gpu_device_name()):
         v0_1 = variables.VariableV1(123.45)
       save = saver_module.Saver({"v0": v0_1})
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       save.save(sess, save_path)
 
     with session.Session("", graph=ops_lib.Graph()) as sess:
       with sess.graph.device(test.gpu_device_name()):
         v0_2 = variables.VariableV1(543.21)
       save = saver_module.Saver({"v0": v0_2})
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
   def testSharedServerOnGPU(self):
     if not test.is_gpu_available():
@@ -599,14 +599,14 @@ class SaverTest(test.TestCase):
       with sess.graph.device(test.gpu_device_name()):
         v0_1 = variables.VariableV1(123.45)
       save = saver_module.Saver({"v0": v0_1}, sharded=True, allow_empty=True)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       save.save(sess, save_path)
 
     with session.Session("", graph=ops_lib.Graph()) as sess:
       with sess.graph.device(test.gpu_device_name()):
         v0_2 = variables.VariableV1(543.21)
       save = saver_module.Saver({"v0": v0_2}, sharded=True, allow_empty=True)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
   def testVariables(self):
     save_path = os.path.join(self.get_temp_dir(), "variables")
@@ -627,10 +627,10 @@ class SaverTest(test.TestCase):
       # Saver with no arg, defaults to 'all variables'.
       save = saver_module.Saver()
       save.restore(sess, save_path)
-      self.assertAllClose(1.0, one.eval())
-      self.assertAllClose([2.0, 2.0, 2.0], twos.eval())
-      self.assertEqual(b"k1", v2.keys().eval())
-      self.assertEqual(3.0, v2.values().eval())
+      self.assertAllClose(1.0, self.evaluate(one))
+      self.assertAllClose([2.0, 2.0, 2.0], self.evaluate(twos))
+      self.assertEqual(b"k1", self.evaluate(v2.keys()))
+      self.assertEqual(3.0, self.evaluate(v2.values()))
 
   def testVarListShouldBeEmptyInDeferredBuild(self):
     with ops_lib.Graph().as_default():
@@ -664,8 +664,8 @@ class SaverTest(test.TestCase):
       # Saver with no arg, defaults to 'all variables'.
       save = saver_module.Saver()
       save.restore(sess, save_path)
-      self.assertAllClose(1.0, one.eval())
-      self.assertAllClose([2.0, 2.0, 2.0], twos.eval())
+      self.assertAllClose(1.0, self.evaluate(one))
+      self.assertAllClose([2.0, 2.0, 2.0], self.evaluate(twos))
 
   @test_util.run_v1_only("b/120545219")
   def testReshape(self):
@@ -691,7 +691,8 @@ class SaverTest(test.TestCase):
       var = variables.VariableV1([[0.0, 0.0], [0.0, 0.0], [0.0, 0.0]])
       save = saver_module.Saver(reshape=True)
       save.restore(sess, save_path)
-      self.assertAllClose([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], var.eval())
+      self.assertAllClose([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]],
+                          self.evaluate(var))
 
   @test_util.run_in_graph_and_eager_modes
   def testSaveWithGlobalStep(self, pad_step_number=False):
@@ -726,7 +727,6 @@ class SaverTest(test.TestCase):
   def testSaveWithGlobalStepWithPadding(self):
     self.testSaveWithGlobalStep(pad_step_number=True)
 
-  @test_util.run_v1_only("b/120545219")
   def testSaveToNonexistingPath(self):
     file_io.write_string_to_file(
         os.path.join(self.get_temp_dir(), "actually_a_file"), "")
@@ -753,8 +753,8 @@ class SaverTest(test.TestCase):
           self.evaluate(init_all_op)
 
           # Check that the parameter nodes have been initialized.
-          self.assertEqual(10.0, v0.eval())
-          self.assertEqual(20.0, v1.eval())
+          self.assertEqual(10.0, self.evaluate(v0))
+          self.assertEqual(20.0, self.evaluate(v1))
 
           # Save the graph.
           save.save(sess, save_path)
@@ -763,13 +763,12 @@ class SaverTest(test.TestCase):
           # Restore the saved values in the parameter nodes.
           save.restore(sess, save_path)
           # Check that the parameter nodes have been restored.
-          self.assertEqual(10.0, v0.eval())
-          self.assertEqual(20.0, v1.eval())
+          self.assertEqual(10.0, self.evaluate(v0))
+          self.assertEqual(20.0, self.evaluate(v1))
       except ValueError as exc:
         error_msg_template = "Parent directory of {} doesn't exist, can't save."
         self.assertEqual(error_msg_template.format(save_path), str(exc))
 
-  @test_util.run_deprecated_v1
   def testSaveToURI(self):
     # ParseURI functions don't work on Windows yet.
     # TODO(jhseu): Remove this check when it works.
@@ -789,8 +788,8 @@ class SaverTest(test.TestCase):
       self.evaluate(init_all_op)
 
       # Check that the parameter nodes have been initialized.
-      self.assertEqual(10.0, v0.eval())
-      self.assertEqual(20.0, v1.eval())
+      self.assertEqual(10.0, self.evaluate(v0))
+      self.assertEqual(20.0, self.evaluate(v1))
       save.save(sess, save_path)
 
   def testSaveRestoreAndValidateVariableDtype(self):
@@ -835,7 +834,7 @@ class SaverTest(test.TestCase):
       orig_vars = _model()
       self.evaluate(variables.global_variables_initializer())
       save = saver_module.Saver(max_to_keep=1)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       save.save(sess, save_dir)
       orig_vals = self.evaluate(orig_vars)
 
@@ -882,7 +881,7 @@ class SaveRestoreShardedTest(test.TestCase):
           },
           write_version=self._WRITE_VERSION,
           sharded=True)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       t0.insert("k1", 30.0).run()
       t1.insert("k2", 40.0).run()
       val = save.save(sess, save_path)
@@ -908,15 +907,15 @@ class SaveRestoreShardedTest(test.TestCase):
             },
             write_version=self._WRITE_VERSION,
             sharded=True)
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
         t0.insert("k11", 33.0).run()
-        self.assertEqual(111, v0.eval())
-        self.assertEqual(b"k11", t0.keys().eval())
-        self.assertEqual(33.0, t0.values().eval())
+        self.assertEqual(111, self.evaluate(v0))
+        self.assertEqual(b"k11", self.evaluate(t0.keys()))
+        self.assertEqual(33.0, self.evaluate(t0.values()))
         save.restore(sess, save_path + "-00000-of-00002")
-        self.assertEqual(10, v0.eval())
-        self.assertEqual(b"k1", t0.keys().eval())
-        self.assertEqual(30.0, t0.values().eval())
+        self.assertEqual(10, self.evaluate(v0))
+        self.assertEqual(b"k1", self.evaluate(t0.keys()))
+        self.assertEqual(30.0, self.evaluate(t0.values()))
 
       # Restore different ops from shard 1 of the saved files.
       with session.Session(
@@ -932,15 +931,15 @@ class SaveRestoreShardedTest(test.TestCase):
             },
             write_version=self._WRITE_VERSION,
             sharded=True)
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
         t1.insert("k22", 44.0).run()
-        self.assertEqual(222, v1.eval())
-        self.assertEqual(b"k22", t1.keys().eval())
-        self.assertEqual(44.0, t1.values().eval())
+        self.assertEqual(222, self.evaluate(v1))
+        self.assertEqual(b"k22", self.evaluate(t1.keys()))
+        self.assertEqual(44.0, self.evaluate(t1.values()))
         save.restore(sess, save_path + "-00001-of-00002")
-        self.assertEqual(20, v1.eval())
-        self.assertEqual(b"k2", t1.keys().eval())
-        self.assertEqual(40.0, t1.values().eval())
+        self.assertEqual(20, self.evaluate(v1))
+        self.assertEqual(b"k2", self.evaluate(t1.keys()))
+        self.assertEqual(40.0, self.evaluate(t1.values()))
 
     # Now try a restore with the sharded filename.
     with session.Session(
@@ -961,26 +960,26 @@ class SaveRestoreShardedTest(test.TestCase):
           },
           write_version=self._WRITE_VERSION,
           sharded=True)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       t0.insert("k11", 33.0).run()
       t1.insert("k22", 44.0).run()
-      self.assertEqual(111, v0.eval())
-      self.assertEqual(222, v1.eval())
-      self.assertEqual(b"k11", t0.keys().eval())
-      self.assertEqual(33.0, t0.values().eval())
-      self.assertEqual(b"k22", t1.keys().eval())
-      self.assertEqual(44.0, t1.values().eval())
+      self.assertEqual(111, self.evaluate(v0))
+      self.assertEqual(222, self.evaluate(v1))
+      self.assertEqual(b"k11", self.evaluate(t0.keys()))
+      self.assertEqual(33.0, self.evaluate(t0.values()))
+      self.assertEqual(b"k22", self.evaluate(t1.keys()))
+      self.assertEqual(44.0, self.evaluate(t1.values()))
       save_path = os.path.join(self.get_temp_dir(), "sharded_basics")
       if save._write_version is saver_pb2.SaverDef.V1:
         save.restore(sess, save_path + "-?????-of-?????")
       else:
         save.restore(sess, save_path)
-      self.assertEqual(10, v0.eval())
-      self.assertEqual(20, v1.eval())
-      self.assertEqual(b"k1", t0.keys().eval())
-      self.assertEqual(30.0, t0.values().eval())
-      self.assertEqual(b"k2", t1.keys().eval())
-      self.assertEqual(40.0, t1.values().eval())
+      self.assertEqual(10, self.evaluate(v0))
+      self.assertEqual(20, self.evaluate(v1))
+      self.assertEqual(b"k1", self.evaluate(t0.keys()))
+      self.assertEqual(30.0, self.evaluate(t0.values()))
+      self.assertEqual(b"k2", self.evaluate(t1.keys()))
+      self.assertEqual(40.0, self.evaluate(t1.values()))
 
     if save._write_version is saver_pb2.SaverDef.V1:
       self.assertEqual(
@@ -1028,7 +1027,7 @@ class SaveRestoreShardedTest(test.TestCase):
           else:
             vs = [variables.VariableV1(rnd, name=var_name)]
 
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
         if call_saver_with_dict:
           saver = saver_module.Saver({var_name: vs[0]})
         else:
@@ -1056,7 +1055,7 @@ class SaveRestoreShardedTest(test.TestCase):
                   name=var_name)
           ]
 
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
         if call_saver_with_dict:
           saver = saver_module.Saver({
               var_name: new_vs[0]
@@ -1203,7 +1202,7 @@ class MaxToKeepTest(test.TestCase):
     with self.cached_session() as sess:
       v = variables.VariableV1(10.0, name="v")
       save = saver_module.Saver({"v": v}, max_to_keep=2)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertEqual([], save.last_checkpoints)
 
       s1 = save.save(sess, os.path.join(save_dir, "s1"))
@@ -1388,7 +1387,7 @@ class MaxToKeepTest(test.TestCase):
               "v0": v0,
               "v1": v1
           }, sharded=True, max_to_keep=2)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertEqual([], save.last_checkpoints)
 
       s1 = save.save(sess, os.path.join(save_dir, "s1"))
@@ -1434,14 +1433,13 @@ class MaxToKeepTest(test.TestCase):
       self.assertTrue(
           gfile.Exists(checkpoint_management.meta_graph_filename(s3)))
 
-  @test_util.run_deprecated_v1
   def testNoMaxToKeep(self):
     save_dir = self._get_test_dir("no_max_to_keep")
     save_dir2 = self._get_test_dir("max_to_keep_0")
 
     with self.cached_session() as sess:
       v = variables.VariableV1(10.0, name="v")
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       # Test max_to_keep being None.
       save = saver_module.Saver({"v": v}, max_to_keep=None)
@@ -1463,14 +1461,13 @@ class MaxToKeepTest(test.TestCase):
       self.assertEqual([], save2.last_checkpoints)
       self.assertTrue(checkpoint_management.checkpoint_exists(s2))
 
-  @test_util.run_deprecated_v1
   def testNoMetaGraph(self):
     save_dir = self._get_test_dir("no_meta_graph")
 
     with self.cached_session() as sess:
       v = variables.VariableV1(10.0, name="v")
       save = saver_module.Saver({"v": v})
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       s1 = save.save(sess, os.path.join(save_dir, "s1"), write_meta_graph=False)
       self.assertTrue(checkpoint_management.checkpoint_exists(s1))
@@ -1487,7 +1484,6 @@ class KeepCheckpointEveryNHoursTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
   @test.mock.patch.object(saver_module, "time")
-  @test_util.run_deprecated_v1
   def testNonSharded(self, mock_time):
     save_dir = self._get_test_dir("keep_checkpoint_every_n_hours")
 
@@ -1607,7 +1603,6 @@ class SaveRestoreWithVariableNameMap(test.TestCase):
       self.assertEqual(20.0, self.evaluate(v1))
 
   @test_util.run_in_graph_and_eager_modes
-  @test_util.run_v1_only("b/120545219")
   def testNonReshapeResourceVariable(self):
     self._testNonReshape(resource_variable_ops.ResourceVariable)
 
@@ -1714,7 +1709,7 @@ class MetaGraphTest(test.TestCase):
       saver1 = saver_module.Saver({"v1": v1}, name="saver1")
       ops_lib.add_to_collection("savers", saver0)
       ops_lib.add_to_collection("savers", saver1)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       # Saves to different checkpoints.
       saver0.save(sess, saver0_ckpt)
       saver1.save(sess, saver1_ckpt)
@@ -1760,7 +1755,8 @@ class MetaGraphTest(test.TestCase):
       new_saver0.restore(sess, saver0_ckpt)
       v0 = sess.graph.get_tensor_by_name("v0:0")
       v1 = sess.graph.get_tensor_by_name("v1:0")
-      self.assertAllEqual([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], v0.eval())
+      self.assertAllEqual([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]],
+                          self.evaluate(v0))
       self.assertEqual([3, 2], v0.get_shape())
       self.assertEqual([], v1.get_shape())
       with self.assertRaisesWithPredicateMatch(
@@ -1770,7 +1766,7 @@ class MetaGraphTest(test.TestCase):
       new_saver1 = savers[1]
       new_saver1.restore(sess, saver1_ckpt)
       v1 = sess.graph.get_tensor_by_name("v1:0")
-      self.assertEqual(11.0, v1.eval())
+      self.assertEqual(11.0, self.evaluate(v1))
 
   @test_util.run_v1_only("b/120545219")
   def testMultiSaverCollection(self):
@@ -1794,7 +1790,7 @@ class MetaGraphTest(test.TestCase):
       saver1 = saver_module.Saver({"v1": v1}, name="saver1")
       ops_lib.add_to_collection("savers", saver0)
       ops_lib.add_to_collection("savers", saver1)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       # Saves to different checkpoints.
       saver0.save(sess, saver0_ckpt)
@@ -1878,7 +1874,7 @@ class MetaGraphTest(test.TestCase):
 
       # The names are different and will work.
       slice_saver = saver_module.Saver({"first": v1, "second": v2})
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       # Exports to meta_graph
       meta_graph_def = slice_saver.export_meta_graph(filename)
 
@@ -2093,7 +2089,6 @@ class MetaGraphTest(test.TestCase):
       return i + 1, x + r
     self._testWhileLoopAndGradientSerDes(body)
 
-  @test_util.run_deprecated_v1
   def testNestedControlFlowSerDes(self):
     # Test while loop in a cond in a while loop.
     # pylint: disable=g-long-lambda
@@ -2745,7 +2740,7 @@ class ScopedGraphTest(test.TestCase):
       graph.add_to_collection(ops_lib.GraphKeys.SAVERS, saver2)
 
     with self.session(graph=graph) as sess:
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       saver1.save(sess, saver1_ckpt, write_state=False)
       saver2.save(sess, saver2_ckpt, write_state=False)
 
@@ -2762,7 +2757,7 @@ class ScopedGraphTest(test.TestCase):
 
     with self.session(graph=graph1) as sess:
       saver_list1[0].restore(sess, saver1_ckpt)
-      self.assertEqual(1.0, var_dict1["variable1:0"].eval())
+      self.assertEqual(1.0, self.evaluate(var_dict1["variable1:0"]))
 
     graph2 = ops_lib.Graph()
     var_dict2 = meta_graph.copy_scoped_meta_graph(
@@ -2777,18 +2772,18 @@ class ScopedGraphTest(test.TestCase):
 
     with self.session(graph=graph2) as sess:
       saver_list2[0].restore(sess, saver2_ckpt)
-      self.assertEqual(2.0, var_dict2["variable2:0"].eval())
+      self.assertEqual(2.0, self.evaluate(var_dict2["variable2:0"]))
 
 
-class _OwnsAVariableSimple(checkpointable_base.CheckpointableBase):
-  """A Checkpointable object which can be saved using a tf.train.Saver."""
+class _OwnsAVariableSimple(trackable_base.Trackable):
+  """A Trackable object which can be saved using a tf.train.Saver."""
 
   def __init__(self):
     self.non_dep_variable = variable_scope.get_variable(
         name="non_dep_variable", initializer=6., use_resource=True)
 
   def _gather_saveables_for_checkpoint(self):
-    return {checkpointable_base.VARIABLE_VALUE_KEY: self.non_dep_variable}
+    return {trackable_base.VARIABLE_VALUE_KEY: self.non_dep_variable}
 
   # The Saver sorts by name before parsing, so we need a name property.
   @property
@@ -2813,8 +2808,8 @@ class _MirroringSaveable(
         self._mirrored_variable.assign(tensor))
 
 
-class _OwnsMirroredVariables(checkpointable_base.CheckpointableBase):
-  """A Checkpointable object which returns a more complex SaveableObject."""
+class _OwnsMirroredVariables(trackable_base.Trackable):
+  """A Trackable object which returns a more complex SaveableObject."""
 
   def __init__(self):
     self.non_dep_variable = variable_scope.get_variable(
@@ -2828,7 +2823,7 @@ class _OwnsMirroredVariables(checkpointable_base.CheckpointableBase):
           primary_variable=self.non_dep_variable,
           mirrored_variable=self.mirrored,
           name=name)
-    return {checkpointable_base.VARIABLE_VALUE_KEY: _saveable_factory}
+    return {trackable_base.VARIABLE_VALUE_KEY: _saveable_factory}
 
   # The Saver sorts by name before parsing, so we need a name property.
   @property
@@ -2836,11 +2831,11 @@ class _OwnsMirroredVariables(checkpointable_base.CheckpointableBase):
     return self.non_dep_variable.name
 
 
-class NonLayerCheckpointable(checkpointable_tracking.Checkpointable):
+class NonLayerTrackable(trackable_tracking.AutoTrackable):
 
   def __init__(self):
-    super(NonLayerCheckpointable, self).__init__()
-    self.a_variable = checkpointable_utils.add_variable(
+    super(NonLayerTrackable, self).__init__()
+    self.a_variable = trackable_utils.add_variable(
         self, name="a_variable", shape=[])
 
 
@@ -2851,19 +2846,19 @@ class MyModel(training.Model):
     super(MyModel, self).__init__()
     self._named_dense = core.Dense(1, use_bias=True)
     self._second = core.Dense(1, use_bias=False)
-    # We can still track Checkpointables which aren't Layers.
-    self._non_layer = NonLayerCheckpointable()
+    # We can still track Trackables which aren't Layers.
+    self._non_layer = NonLayerTrackable()
 
   def call(self, values):
     ret = self._second(self._named_dense(values))
     return ret
 
 
-class CheckpointableCompatibilityTests(test.TestCase):
+class TrackableCompatibilityTests(test.TestCase):
 
   # TODO(allenl): Track down python3 reference cycles in these tests.
   @test_util.run_in_graph_and_eager_modes
-  def testNotSaveableButIsCheckpointable(self):
+  def testNotSaveableButIsTrackable(self):
     v = _OwnsAVariableSimple()
     test_dir = self.get_temp_dir()
     prefix = os.path.join(test_dir, "ckpt")
@@ -2928,13 +2923,13 @@ class CheckpointableCompatibilityTests(test.TestCase):
     model = MyModel()
     optimizer = adam.AdamOptimizer(0.001)
     optimizer_step = training_util.get_or_create_global_step()
-    root_checkpointable = checkpointable_utils.Checkpoint(
+    root_trackable = trackable_utils.Checkpoint(
         optimizer=optimizer, model=model, optimizer_step=optimizer_step)
     train_op = optimizer.minimize(
         functools.partial(model, input_value),
         global_step=optimizer_step)
-    self.evaluate(checkpointable_utils.gather_initializers(
-        root_checkpointable))
+    self.evaluate(trackable_utils.gather_initializers(
+        root_trackable))
     self.evaluate(train_op)
     # A regular variable, a slot variable, and a non-slot Optimizer variable
     # with known values to check when loading.
@@ -2943,24 +2938,24 @@ class CheckpointableCompatibilityTests(test.TestCase):
         var=model._named_dense.bias, name="m").assign([2.]))
     beta1_power, _ = optimizer._get_beta_accumulators()
     self.evaluate(beta1_power.assign(3.))
-    return root_checkpointable
+    return root_trackable
 
-  def _set_sentinels(self, root_checkpointable):
-    self.evaluate(root_checkpointable.model._named_dense.bias.assign([101.]))
+  def _set_sentinels(self, root_trackable):
+    self.evaluate(root_trackable.model._named_dense.bias.assign([101.]))
     self.evaluate(
-        root_checkpointable.optimizer.get_slot(
-            var=root_checkpointable.model._named_dense.bias, name="m")
+        root_trackable.optimizer.get_slot(
+            var=root_trackable.model._named_dense.bias, name="m")
         .assign([102.]))
-    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+    beta1_power, _ = root_trackable.optimizer._get_beta_accumulators()
     self.evaluate(beta1_power.assign(103.))
 
-  def _check_sentinels(self, root_checkpointable):
+  def _check_sentinels(self, root_trackable):
     self.assertAllEqual(
-        [1.], self.evaluate(root_checkpointable.model._named_dense.bias))
+        [1.], self.evaluate(root_trackable.model._named_dense.bias))
     self.assertAllEqual([2.], self.evaluate(
-        root_checkpointable.optimizer.get_slot(
-            var=root_checkpointable.model._named_dense.bias, name="m")))
-    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+        root_trackable.optimizer.get_slot(
+            var=root_trackable.model._named_dense.bias, name="m")))
+    beta1_power, _ = root_trackable.optimizer._get_beta_accumulators()
     self.assertAllEqual(3., self.evaluate(beta1_power))
 
   def testVariableNotFoundErrorRaised(self):
@@ -3010,7 +3005,6 @@ class CheckpointableCompatibilityTests(test.TestCase):
             "a mismatch between the current graph and the graph"):
           a_saver.restore(sess=sess, save_path=save_path)
 
-  @test_util.run_v1_only("b/120545219")
   def testLoadFromObjectBasedGraph(self):
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
@@ -3018,13 +3012,13 @@ class CheckpointableCompatibilityTests(test.TestCase):
     save_graph = ops_lib.Graph()
     with save_graph.as_default(), self.session(graph=save_graph) as sess:
       root = self._initialized_model()
-      object_saver = checkpointable_utils.CheckpointableSaver(root)
+      object_saver = trackable_utils.Checkpoint(root=root)
       save_path = object_saver.save(file_prefix=checkpoint_prefix)
 
       # An incompatible object-based checkpoint to check error messages
       var = resource_variable_ops.ResourceVariable(1., name="a")
       self.evaluate(var.initializer)
-      second_saver = checkpointable_utils.CheckpointableSaver(var)
+      second_saver = trackable_utils.Checkpoint(v=var)
       second_path = second_saver.save(file_prefix=os.path.join(
           checkpoint_directory, "second"))
 
@@ -3052,7 +3046,7 @@ class CheckpointableCompatibilityTests(test.TestCase):
     save_graph = ops_lib.Graph()
     with save_graph.as_default(), self.session(graph=save_graph):
       root = self._initialized_model()
-      object_saver = checkpointable_utils.CheckpointableSaver(root)
+      object_saver = trackable_utils.Checkpoint(root=root)
       save_path = object_saver.save(file_prefix=checkpoint_prefix)
 
     with context.eager_mode():
diff --git a/tensorflow/python/training/saving/BUILD b/tensorflow/python/training/saving/BUILD
index 67ccd59b88c289a11791c9098a2014c48e6c33fb..adb50f9aa6b2e3c19c9d616cf996d37ae503d7a4 100644
--- a/tensorflow/python/training/saving/BUILD
+++ b/tensorflow/python/training/saving/BUILD
@@ -49,7 +49,7 @@ py_library(
     deps = [
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:variables",
-        "//tensorflow/python/training/checkpointable:base",
+        "//tensorflow/python/training/tracking:base",
         "@six_archive//:six",
     ],
 )
diff --git a/tensorflow/python/training/saving/functional_saver.py b/tensorflow/python/training/saving/functional_saver.py
index 7eed3336626ef63942a40702f9787e6b5847b97b..4ff2742c2f1b8b68528914c5c23414b1f87c957b 100644
--- a/tensorflow/python/training/saving/functional_saver.py
+++ b/tensorflow/python/training/saving/functional_saver.py
@@ -18,12 +18,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.core.protobuf import saver_pb2
+from tensorflow.python.eager import def_function
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_spec
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import io_ops
 from tensorflow.python.training.saving import saveable_object
-from tensorflow.python.training.saving import saveable_object_util
+from tensorflow.python.util import nest
 
 
 class Saver(object):
@@ -47,7 +50,29 @@ class Saver(object):
             "Saver expected a list of SaveableObjects, got %s." % (saveable,))
     self._saveable_objects = saveable_objects
 
-  # TODO(b/120569892): Use tf.function here
+  def to_proto(self):
+    """Serializes to a SaverDef referencing the current graph."""
+    filename_tensor = array_ops.placeholder(
+        shape=[], dtype=dtypes.string, name="saver_filename")
+    # TODO(allenl): Add save and restore function names to the proto directly.
+    signature = (tensor_spec.TensorSpec(shape=(), dtype=dtypes.string),)
+    # Autograph is off because of reference cycles which must be collected when
+    # a function is created and destroyed (as in tf.saved_model.save). It's also
+    # not necessary, so having it off may be slightly faster.
+    #
+    # TODO(b/121302372): We should be able to decorate save() and restore()
+    # unconditionally.
+    save_tensor = def_function.function(
+        self.save, input_signature=signature, autograph=False)(filename_tensor)
+    restore_op = def_function.function(
+        self.restore, input_signature=signature, autograph=False)(
+            filename_tensor).op
+    return saver_pb2.SaverDef(
+        filename_tensor_name=filename_tensor.name,
+        save_tensor_name=save_tensor.name,
+        restore_op_name=restore_op.name,
+        version=saver_pb2.SaverDef.V2)
+
   def save(self, file_prefix):
     """Save the saveable objects to a checkpoint with `file_prefix`.
 
@@ -66,11 +91,11 @@ class Saver(object):
         tensor_names.append(spec.name)
         tensors.append(spec.tensor)
         tensor_slices.append(spec.slice_spec)
-    with ops.control_dependencies(
-        [io_ops.save_v2(file_prefix, tensor_names, tensor_slices, tensors)]):
-      return array_ops.identity(file_prefix)
+    with ops.device("cpu:0"):
+      with ops.control_dependencies([io_ops.save_v2(
+          file_prefix, tensor_names, tensor_slices, tensors)]):
+        return array_ops.identity(file_prefix)
 
-  # TODO(b/120569892): Use tf.function here
   def restore(self, file_prefix):
     """Restore the saveable objects from a checkpoint with `file_prefix`.
 
@@ -79,23 +104,35 @@ class Saver(object):
         files to read from.
 
     Returns:
-      An operation which restores the `Saver`'s `SaveableObject`s when run, or
-      None if executing eagerly.
+      A scalar string Tensor containing `file_prefix` with control dependencies
+      on the restore ops.
     """
-    restore_ops = []
-    for saveable in self._saveable_objects:
-      if saveable.device:
-        device = saveable_object_util.set_cpu0(saveable.device)
-      else:
-        device = None
-      with ops.device(device):
-        tensors = []
-        for spec in saveable.specs:
-          tensors.append(
-              io_ops.restore_v2(
-                  file_prefix,
-                  [spec.name],
-                  [spec.slice_spec],
-                  [spec.dtype])[0])
-        restore_ops.append(saveable.restore(tensors, restored_shapes=None))
-    return control_flow_ops.group(restore_ops)
+    restore_ops = restore_from_saveable_objects(
+        file_prefix, self._saveable_objects)
+    with ops.device("cpu:0"):
+      with ops.control_dependencies(restore_ops):
+        return array_ops.identity(file_prefix)
+
+
+def restore_from_saveable_objects(file_prefix, saveable_objects):
+  """Reads from a checkpoint and returns restore ops for `saveable_objects`s."""
+  restore_specs = []
+  tensor_structure = []
+  for saveable in saveable_objects:
+    saveable_tensor_structure = []
+    tensor_structure.append(saveable_tensor_structure)
+    for spec in saveable.specs:
+      saveable_tensor_structure.append(spec.name)
+      restore_specs.append((spec.name, spec.slice_spec, spec.dtype))
+  tensor_names, tensor_slices, tensor_dtypes = zip(*restore_specs)
+  with ops.device("cpu:0"):
+    restored_tensors = io_ops.restore_v2(
+        file_prefix, tensor_names, tensor_slices, tensor_dtypes)
+  structured_restored_tensors = nest.pack_sequence_as(
+      tensor_structure, restored_tensors)
+  restore_ops = []
+  for saveable, restored_tensors in zip(saveable_objects,
+                                        structured_restored_tensors):
+    restore_ops.append(saveable.restore(restored_tensors,
+                                        restored_shapes=None))
+  return restore_ops
diff --git a/tensorflow/python/training/saving/functional_saver_test.py b/tensorflow/python/training/saving/functional_saver_test.py
index 40002255aacd4b3579bab6ea44bc9e5ee98f9177..a394f0b5c6695d5ae0165628b7691b6c7757b4e8 100644
--- a/tensorflow/python/training/saving/functional_saver_test.py
+++ b/tensorflow/python/training/saving/functional_saver_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import os
 
 from tensorflow.python.eager import test
+from tensorflow.python.eager import wrap_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.training.saving import functional_saver
@@ -45,6 +46,34 @@ class SaverTest(test.TestCase):
     second_saver.restore(save_path)
     self.assertEqual(2., self.evaluate(v2))
 
+  def test_to_proto(self):
+    v1 = resource_variable_ops.ResourceVariable(2.)
+    saver = functional_saver.Saver(
+        saveable_object_util.saveable_objects_for_op(v1, "x"))
+    prefix = os.path.join(self.get_temp_dir(), "ckpt")
+
+    proto_accumulator = []
+    wrapped = wrap_function.wrap_function(
+        lambda: proto_accumulator.append(saver.to_proto()), signature=())
+    self.assertEqual(1, len(proto_accumulator))
+    proto = proto_accumulator[0]
+    save = wrapped.prune(
+        feeds=wrapped.graph.get_tensor_by_name(proto.filename_tensor_name),
+        fetches=wrapped.graph.get_tensor_by_name(proto.save_tensor_name))
+    restore = wrapped.prune(
+        feeds=wrapped.graph.get_tensor_by_name(proto.filename_tensor_name),
+        fetches=wrapped.graph.get_operation_by_name(proto.restore_op_name))
+    save_path = save(constant_op.constant(prefix))
+    v1.assign(1.)
+    restore(constant_op.constant(save_path))
+    self.assertEqual(2., self.evaluate(v1))
+
+    v2 = resource_variable_ops.ResourceVariable(3.)
+    second_saver = functional_saver.Saver(
+        saveable_object_util.saveable_objects_for_op(v2, "x"))
+    second_saver.restore(save_path)
+    self.assertEqual(2., self.evaluate(v2))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/training/saving/saveable_object.py b/tensorflow/python/training/saving/saveable_object.py
index 4b19294b6545de8105443a46a112a416f6bf481c..981d4580fcb3390380e58e90a2edefa2cae5f066 100644
--- a/tensorflow/python/training/saving/saveable_object.py
+++ b/tensorflow/python/training/saving/saveable_object.py
@@ -66,6 +66,11 @@ class SaveableObject(object):
     self.name = name
     self._device = None
 
+  @property
+  def optional_restore(self):
+    """A hint to restore assertions that this object is optional."""
+    return False  # Default to required
+
   @property
   def device(self):
     """The device for SaveSpec Tensors."""
diff --git a/tensorflow/python/training/saving/saveable_object_util.py b/tensorflow/python/training/saving/saveable_object_util.py
index fa88d2c6ebd2f29c2d2de7583a918dcbc6b28b51..eeec19e5886d48828b85ab7aa6931db38a561613 100644
--- a/tensorflow/python/training/saving/saveable_object_util.py
+++ b/tensorflow/python/training/saving/saveable_object_util.py
@@ -26,8 +26,8 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
-from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.training.saving import saveable_object
+from tensorflow.python.training.tracking import base as trackable
 
 
 # Op names which identify variable reads which should be saved.
@@ -137,7 +137,7 @@ def saveable_objects_for_op(op, name):
   if not isinstance(name, six.string_types):
     raise TypeError(
         "names_to_saveables must be a dict mapping string names to "
-        "checkpointable operations. Name is not a string: %s" % name)
+        "trackable operations. Name is not a string: %s" % name)
   if isinstance(op, saveable_object.SaveableObject):
     yield op
   elif isinstance(op, (list, tuple, variables.PartitionedVariable)):
@@ -165,11 +165,11 @@ def saveable_objects_for_op(op, name):
         yield ResourceVariableSaveable(
             variable, variable._save_slice_info.spec, name)
     # pylint: enable=protected-access
-  elif isinstance(op, checkpointable.CheckpointableBase) and not isinstance(
+  elif isinstance(op, trackable.Trackable) and not isinstance(
       op, variables.Variable):
     # pylint: disable=protected-access
     for attr, factory in op._gather_saveables_for_checkpoint().items():
-      if attr == checkpointable.VARIABLE_VALUE_KEY:
+      if attr == trackable.VARIABLE_VALUE_KEY:
         # Keep original name for classes masquerading as variables.
         full_name = name
       else:
@@ -250,15 +250,18 @@ def op_list_to_dict(op_list, convert_variable_to_tensor=True):
         names_to_saveables[name].append(var)
       else:
         names_to_saveables[name] = [var]
-    elif (isinstance(var, checkpointable.CheckpointableBase)
+    elif (isinstance(var, trackable.Trackable)
           and not isinstance(var, variables.Variable)):
-      checkpointable_saveables = [
+      trackable_saveables = [
           (factory() if callable(factory) else factory)
           for factory in var._gather_saveables_for_checkpoint().values()]
       names_to_saveables.update(
-          op_list_to_dict(checkpointable_saveables))
+          op_list_to_dict(trackable_saveables))
     else:
-      if context.executing_eagerly():
+      # Variables (reference and resource) have an _in_graph_mode property
+      # indicating whether they were created in a graph building context. We
+      # also get Tensors when graph building, which do not have this property.
+      if not getattr(var, "_in_graph_mode", True):
         if not isinstance(var, resource_variable_ops.ResourceVariable):
           raise ValueError(
               "Can only save/restore ResourceVariables when eager execution "
@@ -323,7 +326,7 @@ def validate_and_slice_inputs(names_to_saveables):
 
   Raises:
     TypeError: If any of the keys are not strings or any of the
-      values are not one of Tensor or Variable or a checkpointable operation.
+      values are not one of Tensor or Variable or a trackable operation.
     ValueError: If the same operation is given in more than one value
       (this also applies to slices of SlicedVariables).
   """
diff --git a/tensorflow/python/training/server_lib.py b/tensorflow/python/training/server_lib.py
index 302ca2dd44b99d2a5cfeffa163d95634513f9eaa..c5ca2ac403567c237307b12662fd6277afa794fa 100644
--- a/tensorflow/python/training/server_lib.py
+++ b/tensorflow/python/training/server_lib.py
@@ -23,6 +23,7 @@ from tensorflow.core.protobuf import tensorflow_server_pb2
 from tensorflow.python import pywrap_tensorflow as c_api
 from tensorflow.python.framework import errors
 from tensorflow.python.util import compat
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -93,7 +94,8 @@ def _make_server_def(server_or_cluster_def, job_name, task_index, protocol,
   return server_def
 
 
-@tf_export("train.Server")
+@tf_export("distribute.Server", v1=["distribute.Server", "train.Server"])
+@deprecation.deprecated_endpoints("train.Server")
 class Server(object):
   """An in-process TensorFlow server, for use in distributed training.
 
@@ -342,6 +344,9 @@ class ClusterSpec(object):
     ret = {}
     for job in self.jobs:
       task_indices = self.task_indices(job)
+      if len(task_indices) == 0:
+        ret[job] = {}
+        continue
       if max(task_indices) + 1 == len(task_indices):
         # Return a list because the task indices are dense. This
         # matches the behavior of `as_dict()` before support for
diff --git a/tensorflow/python/training/server_lib_test.py b/tensorflow/python/training/server_lib_test.py
index 92cdc1c4ad0832fc3f8593bebabe76d4e6dc0cc0..db45d80bd2b890d8a8fcc5aaff55b0a3a720a167 100644
--- a/tensorflow/python/training/server_lib_test.py
+++ b/tensorflow/python/training/server_lib_test.py
@@ -453,6 +453,29 @@ class ClusterSpecTest(test.TestCase):
                          tasks { key: 2 value: 'worker2:2222' } }
     """
 
+    self.assertProtoEquals(expected_proto, cluster_spec.as_cluster_def())
+    self.assertProtoEquals(
+        expected_proto,
+        server_lib.ClusterSpec(cluster_spec).as_cluster_def())
+    self.assertProtoEquals(
+        expected_proto,
+        server_lib.ClusterSpec(cluster_spec.as_cluster_def()).as_cluster_def())
+    self.assertProtoEquals(
+        expected_proto,
+        server_lib.ClusterSpec(cluster_spec.as_dict()).as_cluster_def())
+
+  def testProtoDictDefEquivalencesWithZeroWorker(self):
+    cluster_spec = server_lib.ClusterSpec({
+        "ps": ["ps0:2222", "ps1:2222"],
+        "worker": []
+    })
+
+    expected_proto = """
+    job { name: 'ps' tasks { key: 0 value: 'ps0:2222' }
+                     tasks { key: 1 value: 'ps1:2222' } }
+    job { name: 'worker' }
+    """
+
     self.assertProtoEquals(expected_proto, cluster_spec.as_cluster_def())
     self.assertProtoEquals(
         expected_proto, server_lib.ClusterSpec(cluster_spec).as_cluster_def())
diff --git a/tensorflow/python/training/session_manager.py b/tensorflow/python/training/session_manager.py
index 0f68fcfe8bb4cb81e54ba27d35bfb0b2e3888a1b..104247e60ece2477506e94c152bf9b4f26a806cd 100644
--- a/tensorflow/python/training/session_manager.py
+++ b/tensorflow/python/training/session_manager.py
@@ -21,6 +21,7 @@ import time
 import numpy as np
 
 from tensorflow.python.client import session
+from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import tf_logging as logging
@@ -181,8 +182,16 @@ class SessionManager(object):
         set.
     """
     self._target = master
-    sess = session.Session(self._target, graph=self._graph, config=config)
 
+    # This is required to so that we initialize the TPU device before
+    # restoring from checkpoint since we'll be placing variables on the device
+    # and TPUInitialize wipes out the memory of the device.
+    strategy = distribution_strategy_context.get_strategy()
+    if strategy and hasattr(strategy.extended,
+                            "_experimental_initialize_system"):
+      strategy.extended._experimental_initialize_system()  # pylint: disable=protected-access
+
+    sess = session.Session(self._target, graph=self._graph, config=config)
     if checkpoint_dir and checkpoint_filename_with_path:
       raise ValueError("Can not provide both checkpoint_dir and "
                        "checkpoint_filename_with_path.")
diff --git a/tensorflow/python/training/session_run_hook.py b/tensorflow/python/training/session_run_hook.py
index e9a61def7430fec0190c8f7b788fd7b72492e432..886ca46ed59d7626b970261c531e7087da4b411e 100644
--- a/tensorflow/python/training/session_run_hook.py
+++ b/tensorflow/python/training/session_run_hook.py
@@ -94,7 +94,7 @@ import collections
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("train.SessionRunHook")
+@tf_export(v1=["train.SessionRunHook"])
 class SessionRunHook(object):
   """Hook to extend calls to MonitoredSession.run()."""
 
diff --git a/tensorflow/python/training/slot_creator.py b/tensorflow/python/training/slot_creator.py
index ecf5a96ed49146fe4cafce6a809925aab5bdc6fb..0868cfdea8896e00b4348919b43d948ae30ee956 100644
--- a/tensorflow/python/training/slot_creator.py
+++ b/tensorflow/python/training/slot_creator.py
@@ -58,10 +58,19 @@ def _create_slot_var(primary, val, scope, validate_shape, shape, dtype):
   # When init from val instead of callable initializer, the shape is expected to
   # be None, not <unknown> or any fully defined shape.
   shape = shape if callable(val) else None
+  if resource_variable_ops.is_resource_variable(primary):
+    use_resource = True
+  elif isinstance(primary, variables.RefVariable):
+    use_resource = False
+  else:
+    use_resource = None
   slot = variable_scope.get_variable(
-      scope, initializer=val, trainable=False,
-      use_resource=resource_variable_ops.is_resource_variable(primary),
-      shape=shape, dtype=dtype,
+      scope,
+      initializer=val,
+      trainable=False,
+      use_resource=use_resource,
+      shape=shape,
+      dtype=dtype,
       validate_shape=validate_shape)
   variable_scope.get_variable_scope().set_partitioner(current_partitioner)
 
@@ -112,9 +121,8 @@ def create_slot(primary, val, name, colocate_with_primary=True):
     prefix = primary.op.name
   with variable_scope.variable_scope(None, prefix + "/" + name):
     if colocate_with_primary:
-      distribution_strategy = (
-          distribution_strategy_context.get_distribution_strategy())
-      with distribution_strategy.colocate_vars_with(primary):
+      distribution_strategy = distribution_strategy_context.get_strategy()
+      with distribution_strategy.extended.colocate_vars_with(primary):
         return _create_slot_var(primary, val, "", validate_shape, None, None)
     else:
       return _create_slot_var(primary, val, "", validate_shape, None, None)
@@ -150,9 +158,8 @@ def create_slot_with_initializer(primary, initializer, shape, dtype, name,
     prefix = primary.op.name
   with variable_scope.variable_scope(None, prefix + "/" + name):
     if colocate_with_primary:
-      distribution_strategy = (
-          distribution_strategy_context.get_distribution_strategy())
-      with distribution_strategy.colocate_vars_with(primary):
+      distribution_strategy = distribution_strategy_context.get_strategy()
+      with distribution_strategy.extended.colocate_vars_with(primary):
         return _create_slot_var(primary, initializer, "", validate_shape, shape,
                                 dtype)
     else:
diff --git a/tensorflow/python/training/slot_creator_test.py b/tensorflow/python/training/slot_creator_test.py
index f1f0d58a6913a542093ada7a948969f47928a43b..ec2eec39324eaed08406d6301b8a329d4888d688 100644
--- a/tensorflow/python/training/slot_creator_test.py
+++ b/tensorflow/python/training/slot_creator_test.py
@@ -38,7 +38,7 @@ class SlotCreatorTest(test.TestCase):
       v = variables.Variable([1.0, 2.5], name="var")
       slot = slot_creator.create_slot(v, v.initialized_value(), name="slot")
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       self.assertEqual("var/slot", slot.op.name)
       self.assertEqual([2], slot.get_shape().as_list())
@@ -51,7 +51,7 @@ class SlotCreatorTest(test.TestCase):
       v = constant_op.constant([1.0, 2.5], name="const")
       slot = slot_creator.create_slot(v, v * 2, name="slot")
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       self.assertEqual("const/slot", slot.op.name)
       self.assertEqual([2], slot.get_shape().as_list())
@@ -66,7 +66,7 @@ class SlotCreatorTest(test.TestCase):
         slot = slot_creator.create_zeros_slot(
             v, name="slot", dtype=dtypes.float64)
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       self.assertEqual("var/slot", slot.op.name)
       self.assertEqual([2], slot.get_shape().as_list())
@@ -88,7 +88,7 @@ class SlotCreatorTest(test.TestCase):
         slot = slot_creator.create_zeros_slot(
             v, name="slot", dtype=dtypes.float64)
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       self.assertEqual("var/slot", slot.op.name)
       self.assertEqual([2], array_ops.shape(slot).eval())
@@ -102,7 +102,7 @@ class SlotCreatorTest(test.TestCase):
       with ops.control_dependencies(None):
         slot = slot_creator.create_zeros_slot(v, name="slot")
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       self.assertEqual("const/slot", slot.op.name)
       self.assertEqual([2], slot.get_shape().as_list())
@@ -118,7 +118,7 @@ class SlotCreatorTest(test.TestCase):
         slot = slot_creator.create_zeros_slot(
             v, name="slot", dtype=dtypes.float64)
 
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       self.assertEqual("const/slot", slot.op.name)
       self.assertEqual([2], array_ops.shape(slot).eval())
diff --git a/tensorflow/python/training/sync_replicas_optimizer.py b/tensorflow/python/training/sync_replicas_optimizer.py
index cd4590db7f6550f8790ad683c9aaecf145ad12da..21e9a99e7ceeebc6b021bb899fd77faa5e19ed48 100644
--- a/tensorflow/python/training/sync_replicas_optimizer.py
+++ b/tensorflow/python/training/sync_replicas_optimizer.py
@@ -260,9 +260,8 @@ class SyncReplicasOptimizer(optimizer.Optimizer):
     # local_anchor op will be placed on this worker task by default.
     local_anchor = control_flow_ops.no_op()
     # Colocating local_step variable prevents it being placed on the PS.
-    distribution_strategy = (
-        distribution_strategy_context.get_distribution_strategy())
-    with distribution_strategy.colocate_vars_with(local_anchor):
+    distribution_strategy = distribution_strategy_context.get_strategy()
+    with distribution_strategy.extended.colocate_vars_with(local_anchor):
       self._local_step = variable_scope.variable(
           initial_value=0,
           trainable=False,
diff --git a/tensorflow/python/training/tensorboard_logging_test.py b/tensorflow/python/training/tensorboard_logging_test.py
index 5088ab07e5e387c880aadc8de7385b53df911a29..ffc7eb5b96e4e536ed88cc43d76d075d012a77ee 100644
--- a/tensorflow/python/training/tensorboard_logging_test.py
+++ b/tensorflow/python/training/tensorboard_logging_test.py
@@ -33,7 +33,7 @@ from tensorflow.python.summary.writer import writer
 from tensorflow.python.training import tensorboard_logging
 
 
-@test_util.run_v1_only("b/120545219")
+@test_util.run_deprecated_v1
 class EventLoggingTest(test.TestCase):
 
   def setUp(self):
@@ -87,7 +87,6 @@ class EventLoggingTest(test.TestCase):
                                   (event_pb2.LogMessage.ERROR, "format")])
     self.assertEqual(2, self.logged_message_count)
 
-  @test_util.run_v1_only("b/120545219")
   def testVerbosity(self):
     tensorboard_logging.set_summary_writer(self._sw)
     tensorboard_logging.set_verbosity(tensorboard_logging.ERROR)
@@ -115,7 +114,6 @@ class EventLoggingTest(test.TestCase):
     tensorboard_logging.warn("this should work")
     self.assertEqual(1, self.logged_message_count)
 
-  @test_util.run_v1_only("b/120545219")
   def testSummaryWriterFailsAfterClear(self):
     tensorboard_logging._clear_summary_writer()
     with self.assertRaises(RuntimeError):
diff --git a/tensorflow/python/training/checkpointable/BUILD b/tensorflow/python/training/tracking/BUILD
similarity index 58%
rename from tensorflow/python/training/checkpointable/BUILD
rename to tensorflow/python/training/tracking/BUILD
index 26a0ac35b763e4b8a2c9143d88a2a97259715262..40a6e93e09c8a5c603e183f87a15652c08eaca44 100644
--- a/tensorflow/python/training/checkpointable/BUILD
+++ b/tensorflow/python/training/tracking/BUILD
@@ -11,7 +11,8 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+load("//tensorflow/compiler/tests:build_defs.bzl", "tf_xla_py_test")
 
 py_library(
     name = "base",
@@ -28,14 +29,14 @@ py_library(
         "//tensorflow/python:util",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/training/saving:saveable_object",
+        "@six_archive//:six",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "base_test",
     srcs = ["base_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":base",
         "//tensorflow/python:client_testlib",
     ],
@@ -51,11 +52,10 @@ py_library(
     ],
 )
 
-py_test(
+tf_py_test(
     name = "tracking_test",
     srcs = ["tracking_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":base",
         ":tracking",
         "//tensorflow/python:client_testlib",
@@ -75,14 +75,14 @@ py_library(
     deps = [
         ":base",
         ":layer_utils",
+        "//tensorflow/python/saved_model:revived_types",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "data_structures_test",
     srcs = ["data_structures_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
+    additional_deps = [
         ":data_structures",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:framework_test_lib",
@@ -95,6 +95,29 @@ py_test(
     ],
 )
 
+py_library(
+    name = "object_identity",
+    srcs = ["object_identity.py"],
+    srcs_version = "PY2AND3",
+)
+
+py_library(
+    name = "graph_view",
+    srcs = ["graph_view.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":base",
+        ":object_identity",
+        ":tracking",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python/training/saving:saveable_object",
+        "//tensorflow/python/training/saving:saveable_object_util",
+    ],
+)
+
 py_library(
     name = "util",
     srcs = ["util.py"],
@@ -102,17 +125,19 @@ py_library(
     deps = [
         ":base",
         ":data_structures",
+        ":graph_view",
+        ":object_identity",
         ":tracking",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:checkpoint_management",
         "//tensorflow/python:constant_op",
-        "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:init_ops",
         "//tensorflow/python:io_ops_gen",
+        "//tensorflow/python:lib",
         "//tensorflow/python:pywrap_tensorflow",
         "//tensorflow/python:saver",
         "//tensorflow/python:session",
@@ -123,20 +148,82 @@ py_library(
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:def_function",
         "//tensorflow/python/training/saving:functional_saver",
-        "//tensorflow/python/training/saving:saveable_object",
         "//tensorflow/python/training/saving:saveable_object_util",
     ],
 )
 
-py_test(
+tf_py_test(
     name = "util_test",
     srcs = ["util_test.py"],
-    srcs_version = "PY2AND3",
+    additional_deps = [
+        ":base",
+        ":graph_view",
+        ":tracking",
+        ":util",
+        "@absl_py//absl/testing:parameterized",
+        "@six_archive//:six",
+        "//tensorflow/python/keras/optimizer_v2",
+        "//tensorflow/python:checkpoint_management",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:pywrap_tensorflow",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:saver",
+        "//tensorflow/python:session",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:template",
+        "//tensorflow/python:training_util",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/eager:backprop",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:def_function",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python/keras:engine",
+        "//tensorflow/python/keras:layers",
+        "//tensorflow/python:variables",
+    ],
     tags = ["notsan"],  # b/74395663
+)
+
+tf_xla_py_test(
+    name = "util_xla_test",
+    srcs = ["util_xla_test.py"],
+    tags = [
+        "no_pip",
+        "no_rocm",
+        "nomac",
+        "notsan",  # b/74395663
+    ],
     deps = [
+        ":tracking",
+        ":util",
+        "//tensorflow/compiler/tests:xla_test",
+        "//tensorflow/python:checkpoint_management",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python/eager:backprop",
+        "//tensorflow/python/keras:engine",
+        "//tensorflow/python/keras:layers",
+        "//tensorflow/python/keras/optimizer_v2",
+    ],
+)
+
+tf_py_test(
+    name = "util_with_v1_optimizers_test",
+    srcs = ["util_with_v1_optimizers_test.py"],
+    additional_deps = [
         ":base",
+        ":graph_view",
         ":tracking",
         ":util",
+        "@absl_py//absl/testing:parameterized",
+        "@six_archive//:six",
         "//tensorflow/python:checkpoint_management",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:control_flow_ops",
@@ -153,12 +240,16 @@ py_test(
         "//tensorflow/python:training",
         "//tensorflow/python:training_util",
         "//tensorflow/python:variable_scope",
+        "//tensorflow/python/distribute:mirrored_strategy",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:def_function",
         "//tensorflow/python/eager:test",
         "//tensorflow/python/keras:engine",
         "//tensorflow/python/keras:layers",
-        "@six_archive//:six",
+    ],
+    tags = [
+        "no_windows",  # b/124401331
+        "notsan",  # b/74395663
     ],
 )
diff --git a/tensorflow/python/training/checkpointable/base.py b/tensorflow/python/training/tracking/base.py
similarity index 79%
rename from tensorflow/python/training/checkpointable/base.py
rename to tensorflow/python/training/tracking/base.py
index 3cd1c6f9c8b0b5b5acf517e5f5801db66d0045b2..4a8960d3cabdac1852f42e89a6c2e61d0ca4577e 100644
--- a/tensorflow/python/training/checkpointable/base.py
+++ b/tensorflow/python/training/tracking/base.py
@@ -44,18 +44,18 @@ OBJECT_GRAPH_PROTO_KEY = "_CHECKPOINTABLE_OBJECT_GRAPH"
 
 
 # A key indicating a variable's value in an object's checkpointed Tensors
-# (Checkpointable._gather_saveables_for_checkpoint). If this is the only key and
+# (Trackable._gather_saveables_for_checkpoint). If this is the only key and
 # the object has no dependencies, then its value may be restored on object
 # creation (avoiding double assignment when executing eagerly).
 VARIABLE_VALUE_KEY = "VARIABLE_VALUE"
 OBJECT_CONFIG_JSON_KEY = "OBJECT_CONFIG_JSON"
 
-CheckpointableReference = collections.namedtuple(
-    "CheckpointableReference",
+TrackableReference = collections.namedtuple(
+    "TrackableReference",
     [
         # The local name for this dependency.
         "name",
-        # The Checkpointable object being referenced.
+        # The Trackable object being referenced.
         "ref"
     ])
 
@@ -142,9 +142,14 @@ class PythonStringStateSaveable(PythonStateSaveable):
       state_callback: A function taking no arguments which returns a
         string. This function is run every time a checkpoint is written.
       restore_callback: A function taking a Python string, used to restore
-        state. Optional; defaults to doing nothing.
+        state. Optional; defaults to doing nothing, in which case it is ignored
+        by status assertions such as assert_consumed().
     """
-    self._state_callback = state_callback
+    self._has_trivial_state_callback = (restore_callback is None)
+    def _state_callback_wrapper():
+      with ops.init_scope():
+        return state_callback()
+    self._state_callback = _state_callback_wrapper
     self._restore_callback = restore_callback
     with ops.device("/cpu:0"):
       self._save_string = constant_op.constant("", dtype=dtypes.string)
@@ -153,14 +158,21 @@ class PythonStringStateSaveable(PythonStateSaveable):
     super(PythonStringStateSaveable, self).__init__(
         self._save_string, [spec], name)
 
+  @property
+  def optional_restore(self):
+    """For values with no restore, relaxes assert_consumed()."""
+    return self._has_trivial_state_callback
+
   def feed_dict_additions(self):
     """When running a graph, indicates fresh state to feed."""
     return {self._save_string: self._state_callback()}
 
   def freeze(self):
     """Create a frozen `SaveableObject` which saves the current state."""
+    def _constant_state():
+      return constant_op.constant(self._state_callback(), dtype=dtypes.string)
     return NoRestoreSaveable(
-        tensor=self._state_callback,
+        tensor=_constant_state,
         dtype=dtypes.string,
         name=self.name)
 
@@ -175,34 +187,34 @@ class PythonStringStateSaveable(PythonStateSaveable):
     return control_flow_ops.no_op()
 
 
-class _CheckpointPosition(object):
-  """Indicates a position within a `_Checkpoint`."""
+class CheckpointPosition(object):
+  """Indicates a position within a `_CheckpointRestoreCoordinator`."""
 
   def __init__(self, checkpoint, proto_id):
     """Specify an object within a checkpoint.
 
     Args:
-      checkpoint: A _Checkpoint object.
-      proto_id: The index of this object in CheckpointableObjectGraph.nodes.
+      checkpoint: A _CheckpointRestoreCoordinator object.
+      proto_id: The index of this object in TrackableObjectGraph.nodes.
     """
     self._checkpoint = checkpoint
     self._proto_id = proto_id
 
-  def restore(self, checkpointable):
-    """Restore this value into `checkpointable`."""
+  def restore(self, trackable):
+    """Restore this value into `trackable`."""
     with ops.init_scope():
-      if self.bind_object(checkpointable):
+      if self.bind_object(trackable):
         # This object's correspondence with a checkpointed object is new, so
         # process deferred restorations for it and its dependencies.
-        restore_ops = checkpointable._restore_from_checkpoint_position(self)  # pylint: disable=protected-access
+        restore_ops = trackable._restore_from_checkpoint_position(self)  # pylint: disable=protected-access
         if restore_ops:
           self._checkpoint.new_restore_ops(restore_ops)
 
-  def bind_object(self, checkpointable):
+  def bind_object(self, trackable):
     """Set a checkpoint<->object correspondence and process slot variables.
 
     Args:
-      checkpointable: The object to record a correspondence for.
+      trackable: The object to record a correspondence for.
     Returns:
       True if this is a new assignment, False if this object has already been
       mapped to a checkpointed `Object` proto.
@@ -210,14 +222,14 @@ class _CheckpointPosition(object):
       AssertionError: If another object is already bound to the `Object` proto.
     """
     checkpoint = self.checkpoint
-    checkpoint.all_python_objects.add(checkpointable)
+    checkpoint.all_python_objects.add(trackable)
     current_assignment = checkpoint.object_by_proto_id.get(self._proto_id, None)
     if current_assignment is None:
-      checkpoint.object_by_proto_id[self._proto_id] = checkpointable
+      checkpoint.object_by_proto_id[self._proto_id] = trackable
       for deferred_slot_restoration in (
           checkpoint.deferred_slot_restorations.pop(self._proto_id, ())):
-        checkpointable._create_or_restore_slot_variable(  # pylint: disable=protected-access
-            slot_variable_position=_CheckpointPosition(
+        trackable._create_or_restore_slot_variable(  # pylint: disable=protected-access
+            slot_variable_position=CheckpointPosition(
                 checkpoint=checkpoint,
                 proto_id=deferred_slot_restoration.slot_variable_id),
             variable=deferred_slot_restoration.original_variable,
@@ -232,15 +244,15 @@ class _CheckpointPosition(object):
           checkpoint.deferred_slot_restorations.setdefault(
               slot_restoration.optimizer_id, []).append(
                   _DeferredSlotVariableRestoration(
-                      original_variable=checkpointable,
+                      original_variable=trackable,
                       slot_variable_id=slot_restoration.slot_variable_id,
                       slot_name=slot_restoration.slot_name))
         else:
           optimizer_object._create_or_restore_slot_variable(  # pylint: disable=protected-access
-              slot_variable_position=_CheckpointPosition(
+              slot_variable_position=CheckpointPosition(
                   checkpoint=checkpoint,
                   proto_id=slot_restoration.slot_variable_id),
-              variable=checkpointable,
+              variable=trackable,
               slot_name=slot_restoration.slot_name)
       return True  # New assignment
     else:
@@ -248,14 +260,14 @@ class _CheckpointPosition(object):
       # we don't need to do anything besides check that the mapping is
       # consistent (if the dependency DAG is not a tree then there are
       # multiple paths to the same object).
-      if current_assignment is not checkpointable:
+      if current_assignment is not trackable:
         logging.warning(
             ("Inconsistent references when loading the checkpoint into this "
-             "object graph. Either the Checkpointable object references in the "
+             "object graph. Either the Trackable object references in the "
              "Python program have changed in an incompatible way, or the "
              "checkpoint was generated in an incompatible program.\n\nTwo "
              "checkpoint references resolved to different objects (%s and %s).")
-            % (current_assignment, checkpointable))
+            % (current_assignment, trackable))
       return False  # Not a new assignment
 
   def is_simple_variable(self):
@@ -294,7 +306,7 @@ class _CheckpointPosition(object):
 
   def _gather_ops_or_named_saveables(self):
     """Looks up or creates SaveableObjects which don't have cached ops."""
-    saveables = self.checkpointable._gather_saveables_for_checkpoint()  # pylint: disable=protected-access
+    saveables = self.trackable._gather_saveables_for_checkpoint()  # pylint: disable=protected-access
     # Name saveables based on the name this object had when it was checkpointed.
     named_saveables = {}
     python_saveables = []
@@ -313,15 +325,16 @@ class _CheckpointPosition(object):
       # the SaveableObject itself has been cached. If not, we'll make it, and
       # either way we'll extract new ops from it (or if it has Python state to
       # restore, we'll run that).
-      if self._checkpoint.saveable_object_cache is None:
+      saveables_cache = self._checkpoint.graph_view.saveables_cache
+      if saveables_cache is None:
         # No SaveableObject caching when executing eagerly.
         saveable = None
       else:
         # If we've already created and cached a SaveableObject for this
         # attribute, we can re-use it to avoid re-creating some ops when graph
         # building.
-        saveable_list = self._checkpoint.saveable_object_cache.get(
-            self.checkpointable, {}).get(serialized_tensor.name, (None,))
+        saveable_list = saveables_cache.get(
+            self.trackable, {}).get(serialized_tensor.name, (None,))
         if len(saveable_list) == 1:
           # Almost every attribute will have exactly one SaveableObject.
           saveable, = saveable_list
@@ -335,7 +348,7 @@ class _CheckpointPosition(object):
         # the SaveableObject.
         if serialized_tensor.checkpoint_key not in saveable.name:
           saveable = None
-          del self._checkpoint.saveable_object_cache[self.checkpointable]
+          del saveables_cache[self.trackable]
           break
       if saveable is None:
         # If there was no cached SaveableObject, we should check if the Python
@@ -346,16 +359,17 @@ class _CheckpointPosition(object):
           # added or deleted. Stores unused attributes so an exception can be
           # raised if the user decides to check that everything in the
           # checkpoint was loaded.
-          self._checkpoint.unused_attributes.setdefault(
-              self.checkpointable, []).append(serialized_tensor.name)
+          if not serialized_tensor.optional_restore:
+            self._checkpoint.unused_attributes.setdefault(
+                self.trackable, []).append(serialized_tensor.name)
           continue
         if callable(saveable_factory):
           saveable = saveable_factory(name=serialized_tensor.checkpoint_key)
         else:
           saveable = saveable_factory
-        if self._checkpoint.saveable_object_cache is not None:
-          self._checkpoint.saveable_object_cache.setdefault(
-              self.checkpointable, {})[serialized_tensor.name] = [saveable]
+        if saveables_cache is not None:
+          saveables_cache.setdefault(
+              self.trackable, {})[serialized_tensor.name] = [saveable]
       if isinstance(saveable, PythonStateSaveable):
         python_saveables.append(saveable)
       else:
@@ -365,7 +379,7 @@ class _CheckpointPosition(object):
   def restore_ops(self):
     """Create or fetch restore ops for this object's attributes.
 
-    Requires that the `Checkpointable` Python object has been bound to an object
+    Requires that the `Trackable` Python object has been bound to an object
     ID in the checkpoint.
 
     Returns:
@@ -384,7 +398,7 @@ class _CheckpointPosition(object):
     return self._checkpoint
 
   @property
-  def checkpointable(self):
+  def trackable(self):
     return self._checkpoint.object_by_proto_id[self._proto_id]
 
   @property
@@ -422,11 +436,11 @@ _SlotVariableRestoration = collections.namedtuple(
 def no_automatic_dependency_tracking(method):
   """Disables automatic dependency tracking on attribute assignment.
 
-  Use to decorate any method of a Checkpointable object. Attribute assignment in
+  Use to decorate any method of a Trackable object. Attribute assignment in
   that method will not add dependencies (also respected in Model). Harmless if
   used in a class which does not do automatic dependency tracking (which means
   it's safe to use in base classes which may have subclasses which also inherit
-  from Checkpointable).
+  from Trackable).
 
   Args:
     method: The method to decorate.
@@ -439,46 +453,47 @@ def no_automatic_dependency_tracking(method):
     previous_value = getattr(self, "_setattr_tracking", True)
     self._setattr_tracking = False  # pylint: disable=protected-access
     try:
-      method(self, *args, **kwargs)
+      result = method(self, *args, **kwargs)
     finally:
       self._setattr_tracking = previous_value  # pylint: disable=protected-access
+    return result
 
   return tf_decorator.make_decorator(
       target=method, decorator_func=_method_wrapper)
 
 
-class CheckpointableBase(object):
-  """Base class for `Checkpointable` objects without automatic dependencies.
+class Trackable(object):
+  """Base class for `Trackable` objects without automatic dependencies.
 
   This class has no __setattr__ override for performance reasons. Dependencies
   must be added explicitly. Unless attribute assignment is performance-critical,
-  use `Checkpointable` instead. Use `CheckpointableBase` for `isinstance`
+  use `AutoTrackable` instead. Use `Trackable` for `isinstance`
   checks.
   """
 
-  # CheckpointableBase does not do automatic dependency tracking, but uses the
+  # Trackable does not do automatic dependency tracking, but uses the
   # no_automatic_dependency_tracking decorator so it can avoid adding
-  # dependencies if a subclass is Checkpointable / inherits from Model (both of
+  # dependencies if a subclass is Trackable / inherits from Model (both of
   # which have __setattr__ overrides).
   @no_automatic_dependency_tracking
-  def _maybe_initialize_checkpointable(self):
+  def _maybe_initialize_trackable(self):
     """Initialize dependency management.
 
     Not __init__, since most objects will forget to call it.
     """
     if hasattr(self, "_unconditional_checkpoint_dependencies"):
       # __init__ already called. This check means that we don't need
-      # Checkpointable.__init__() in the constructor of every TensorFlow object.
+      # Trackable.__init__() in the constructor of every TensorFlow object.
       return
-    # A list of CheckpointableReference objects. Some classes implementing
-    # `Checkpointable`, notably `Optimizer`s, may override the
+    # A list of TrackableReference objects. Some classes implementing
+    # `Trackable`, notably `Optimizer`s, may override the
     # _checkpoint_dependencies property with conditional dependencies
     # (e.g. based on the current graph when saving).
     self._unconditional_checkpoint_dependencies = []
-    # Maps names -> Checkpointable objects
+    # Maps names -> Trackable objects
     self._unconditional_dependency_names = {}
-    # Restorations for other Checkpointable objects on which this object may
-    # eventually depend. Maps local name -> _CheckpointPosition list. Optimizers
+    # Restorations for other Trackable objects on which this object may
+    # eventually depend. Maps local name -> CheckpointPosition list. Optimizers
     # tack on conditional dependencies, and so need separate management of
     # deferred dependencies too.
     self._unconditional_deferred_dependencies = {}
@@ -516,8 +531,8 @@ class CheckpointableBase(object):
     May be overridden to include conditional dependencies.
 
     Returns:
-      A list of `CheckpointableReference` objects indicating named
-      `Checkpointable` dependencies which should be saved along with this
+      A list of `TrackableReference` objects indicating named
+      `Trackable` dependencies which should be saved along with this
       object.
     """
     return self._unconditional_checkpoint_dependencies
@@ -526,13 +541,13 @@ class CheckpointableBase(object):
   def _deferred_dependencies(self):
     """A dictionary with deferred dependencies.
 
-    Stores restorations for other Checkpointable objects on which this object
+    Stores restorations for other Trackable objects on which this object
     may eventually depend. May be overridden by sub-classes (e.g. Optimizers use
     conditional dependencies based the current graph, and so need separate
     management of deferred dependencies too).
 
     Returns:
-      A dictionary mapping from local name to a list of _CheckpointPosition
+      A dictionary mapping from local name to a list of CheckpointPosition
       objects.
     """
     return self._unconditional_deferred_dependencies
@@ -545,7 +560,7 @@ class CheckpointableBase(object):
     Args:
       name: The local name of the dependency.
     Returns:
-      A `Checkpointable` object, or `None` if no dependency by this name was
+      A `Trackable` object, or `None` if no dependency by this name was
       found.
     """
     return self._unconditional_dependency_names.get(name, None)
@@ -554,9 +569,9 @@ class CheckpointableBase(object):
       self, name, shape=None, dtype=dtypes.float32,
       initializer=None, getter=None, overwrite=False,
       **kwargs_for_getter):
-    """Restore-on-create for a variable be saved with this `Checkpointable`.
+    """Restore-on-create for a variable be saved with this `Trackable`.
 
-    If the user has requested that this object or another `Checkpointable` which
+    If the user has requested that this object or another `Trackable` which
     depends on this object be restored from a checkpoint (deferred loading
     before variable object creation), `initializer` may be ignored and the value
     from the checkpoint used instead.
@@ -578,7 +593,7 @@ class CheckpointableBase(object):
     Raises:
       ValueError: If the variable name is not unique.
     """
-    self._maybe_initialize_checkpointable()
+    self._maybe_initialize_trackable()
     with ops.init_scope():
       if context.executing_eagerly():
         # If this is a variable with a single Tensor stored in the checkpoint,
@@ -594,11 +609,11 @@ class CheckpointableBase(object):
               isinstance(initializer, CheckpointInitialValue)
               and (initializer.restore_uid
                    > checkpoint_initializer.restore_uid))):
-        # If multiple Checkpointable objects are "creating" the same variable
+        # If multiple Trackable objects are "creating" the same variable
         # via the magic of custom getters, the one with the highest restore UID
         # (the one called last) has to make the final initializer. If another
         # custom getter interrupts this process by overwriting the initializer,
-        # then we'll catch that when we call _track_checkpointable. So this is
+        # then we'll catch that when we call _track_trackable. So this is
         # "best effort" to set the initializer with the highest restore UID.
         initializer = checkpoint_initializer
         shape = None
@@ -610,12 +625,12 @@ class CheckpointableBase(object):
     # assign again. It will add this variable to our dependencies, and if there
     # is a non-trivial restoration queued, it will handle that. This also
     # handles slot variables.
-    if not overwrite or isinstance(new_variable, CheckpointableBase):
-      return self._track_checkpointable(new_variable, name=name,
-                                        overwrite=overwrite)
+    if not overwrite or isinstance(new_variable, Trackable):
+      return self._track_trackable(new_variable, name=name,
+                                   overwrite=overwrite)
     else:
       # TODO(allenl): Some variable types are not yet supported. Remove this
-      # fallback once all get_variable() return types are Checkpointable.
+      # fallback once all get_variable() return types are Trackable.
       return new_variable
 
   def _preload_simple_restoration(self, name, shape):
@@ -654,46 +669,46 @@ class CheckpointableBase(object):
     return CheckpointInitialValue(
         checkpoint_position=checkpoint_position, shape=shape)
 
-  def _track_checkpointable(self, checkpointable, name, overwrite=False):
-    """Declare a dependency on another `Checkpointable` object.
+  def _track_trackable(self, trackable, name, overwrite=False):
+    """Declare a dependency on another `Trackable` object.
 
     Indicates that checkpoints for this object should include variables from
-    `checkpointable`.
+    `trackable`.
 
-    Variables in a checkpoint are mapped to `Checkpointable`s based on the names
+    Variables in a checkpoint are mapped to `Trackable`s based on the names
     provided when the checkpoint was written. To avoid breaking existing
     checkpoints when modifying a class, neither variable names nor dependency
-    names (the names passed to `_track_checkpointable`) may change.
+    names (the names passed to `_track_trackable`) may change.
 
     Args:
-      checkpointable: A `Checkpointable` which this object depends on.
-      name: A local name for `checkpointable`, used for loading checkpoints into
+      trackable: A `Trackable` which this object depends on.
+      name: A local name for `trackable`, used for loading checkpoints into
         the correct objects.
       overwrite: Boolean, whether silently replacing dependencies is OK. Used
         for __setattr__, where throwing an error on attribute reassignment would
         be inappropriate.
 
     Returns:
-      `checkpointable`, for convenience when declaring a dependency and
+      `trackable`, for convenience when declaring a dependency and
       assigning to a member variable in one statement.
 
     Raises:
-      TypeError: If `checkpointable` does not inherit from `Checkpointable`.
+      TypeError: If `trackable` does not inherit from `Trackable`.
       ValueError: If another object is already tracked by this name.
     """
-    self._maybe_initialize_checkpointable()
-    if not isinstance(checkpointable, CheckpointableBase):
+    self._maybe_initialize_trackable()
+    if not isinstance(trackable, Trackable):
       raise TypeError(
-          ("Checkpointable._track_checkpointable() passed type %s, not a "
-           "Checkpointable.") % (type(checkpointable),))
-    new_reference = CheckpointableReference(name=name, ref=checkpointable)
+          ("Trackable._track_trackable() passed type %s, not a "
+           "Trackable.") % (type(trackable),))
+    new_reference = TrackableReference(name=name, ref=trackable)
     current_object = self._lookup_dependency(name)
     if (current_object is not None
-        and current_object is not checkpointable):
+        and current_object is not trackable):
       if not overwrite:
         raise ValueError(
-            ("Called Checkpointable._track_checkpointable() with name='%s', "
-             "but a Checkpointable with this name is already declared as a "
+            ("Called Trackable._track_trackable() with name='%s', "
+             "but a Trackable with this name is already declared as a "
              "dependency. Names must be unique (or overwrite=True).") % (name,))
       # This is a weird thing to do, but we're not going to stop people from
       # using __setattr__.
@@ -704,20 +719,20 @@ class CheckpointableBase(object):
     elif current_object is None:
       self._unconditional_checkpoint_dependencies.append(new_reference)
       self._handle_deferred_dependencies(
-          name=name, checkpointable=checkpointable)
-    self._unconditional_dependency_names[name] = checkpointable
-    return checkpointable
+          name=name, trackable=trackable)
+    self._unconditional_dependency_names[name] = trackable
+    return trackable
 
-  def _handle_deferred_dependencies(self, name, checkpointable):
-    """Pop and load any deferred checkpoint restores into `checkpointable`.
+  def _handle_deferred_dependencies(self, name, trackable):
+    """Pop and load any deferred checkpoint restores into `trackable`.
 
-    This method does not add a new dependency on `checkpointable`, but it does
+    This method does not add a new dependency on `trackable`, but it does
     check if any outstanding/deferred dependencies have been queued waiting for
     this dependency to be added (matched based on `name`). If so,
-    `checkpointable` and its dependencies are restored. The restorations are
+    `trackable` and its dependencies are restored. The restorations are
     considered fulfilled and so are deleted.
 
-    `_track_checkpointable` is more appropriate for adding a
+    `_track_trackable` is more appropriate for adding a
     normal/unconditional dependency, and includes handling for deferred
     restorations. This method allows objects such as `Optimizer` to use the same
     restoration logic while managing conditional dependencies themselves, by
@@ -727,25 +742,25 @@ class CheckpointableBase(object):
 
     Args:
       name: The name of the dependency within this object (`self`), used to
-        match `checkpointable` with values saved in a checkpoint.
-      checkpointable: The Checkpointable object to restore (inheriting from
-        `CheckpointableBase`).
+        match `trackable` with values saved in a checkpoint.
+      trackable: The Trackable object to restore (inheriting from
+        `Trackable`).
     """
-    self._maybe_initialize_checkpointable()
-    checkpointable._maybe_initialize_checkpointable()  # pylint: disable=protected-access
+    self._maybe_initialize_trackable()
+    trackable._maybe_initialize_trackable()  # pylint: disable=protected-access
     deferred_dependencies_list = self._deferred_dependencies.pop(name, ())
     for checkpoint_position in sorted(
         deferred_dependencies_list,
         key=lambda restore: restore.checkpoint.restore_uid,
         reverse=True):
-      checkpoint_position.restore(checkpointable)
+      checkpoint_position.restore(trackable)
 
     # Pass on any name-based restores queued in this object.
     for name_based_restore in sorted(
         self._name_based_restores,
         key=lambda checkpoint: checkpoint.restore_uid,
         reverse=True):
-      checkpointable._name_based_attribute_restore(name_based_restore)  # pylint: disable=protected-access
+      trackable._name_based_attribute_restore(name_based_restore)  # pylint: disable=protected-access
 
   def _restore_from_checkpoint_position(self, checkpoint_position):
     """Restore this object and its dependencies (may be deferred)."""
@@ -758,7 +773,7 @@ class CheckpointableBase(object):
     while visit_queue:
       current_position = visit_queue.popleft()
       restore_ops.extend(nest.flatten(
-          current_position.checkpointable  # pylint: disable=protected-access
+          current_position.trackable  # pylint: disable=protected-access
           ._single_restoration_from_checkpoint_position(
               checkpoint_position=current_position,
               visit_queue=visit_queue)))
@@ -767,7 +782,7 @@ class CheckpointableBase(object):
   def _single_restoration_from_checkpoint_position(
       self, checkpoint_position, visit_queue):
     """Restore this object, and either queue its dependencies or defer them."""
-    self._maybe_initialize_checkpointable()
+    self._maybe_initialize_trackable()
     checkpoint = checkpoint_position.checkpoint
     # If the UID of this restore is lower than our current update UID, we don't
     # need to actually restore the object. However, we should pass the
@@ -778,7 +793,7 @@ class CheckpointableBase(object):
     else:
       restore_ops = ()
     for child in checkpoint_position.object_proto.children:
-      child_position = _CheckpointPosition(
+      child_position = CheckpointPosition(
           checkpoint=checkpoint,
           proto_id=child.node_id)
       local_object = self._lookup_dependency(child.local_name)
@@ -788,7 +803,7 @@ class CheckpointableBase(object):
         self._deferred_dependencies.setdefault(child.local_name, []).append(
             child_position)
       else:
-        if child_position.bind_object(checkpointable=local_object):
+        if child_position.bind_object(trackable=local_object):
           # This object's correspondence is new, so dependencies need to be
           # visited. Delay doing it so that we get a breadth-first dependency
           # resolution order (shallowest paths first). The caller is responsible
@@ -804,7 +819,7 @@ class CheckpointableBase(object):
     or variables easily converted to `SaveableObject`s (as in `tf.train.Saver`'s
     `var_list` constructor argument).
 
-    `SaveableObjects` have a name set, which Checkpointable needs to generate
+    `SaveableObjects` have a name set, which Trackable needs to generate
     itself. So rather than returning `SaveableObjects` directly, this method
     should return a dictionary of callables which take `name` arguments and
     return `SaveableObjects` with that name.
@@ -833,13 +848,28 @@ class CheckpointableBase(object):
       return {}
     weak_self = weakref.ref(self)
     def _state_callback():
+      """Serializes `self.get_config()` for saving."""
       dereferenced_self = weak_self()
       if dereferenced_self:
-        return json.dumps(dereferenced_self,
-                          default=serialization.get_json_type,
-                          sort_keys=True).encode("utf8")
+        return json.dumps(
+            dereferenced_self,
+            default=serialization.get_json_type,
+            sort_keys=True).encode("utf8")
       else:
         return ""
     return {OBJECT_CONFIG_JSON_KEY: functools.partial(
         PythonStringStateSaveable,
         state_callback=_state_callback)}
+
+  def _list_functions_for_serialization(self):
+    """Lists the functions of this trackable to serialize.
+
+    Internal sub-classes can override this with specific logic. E.g.
+    `AutoTrackable` provides an implementation that returns the `attr`
+    that return functions.
+
+    Returns:
+        A dictionary mapping attribute names to `Function` or
+        `ConcreteFunction`.
+    """
+    return dict()
diff --git a/tensorflow/python/training/checkpointable/base_test.py b/tensorflow/python/training/tracking/base_test.py
similarity index 59%
rename from tensorflow/python/training/checkpointable/base_test.py
rename to tensorflow/python/training/tracking/base_test.py
index fd935ac559ed7cd607145e7b2433a00c1f8431ea..4a74417e3ba9a081ad2a6c7150e63ffd3aa898fa 100644
--- a/tensorflow/python/training/checkpointable/base_test.py
+++ b/tensorflow/python/training/tracking/base_test.py
@@ -16,32 +16,35 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
+import os
+
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import test
-from tensorflow.python.training.checkpointable import base
-from tensorflow.python.training.checkpointable import util
+from tensorflow.python.training.tracking import base
+from tensorflow.python.training.tracking import util
 
 
 class InterfaceTests(test.TestCase):
 
   def testOverwrite(self):
-    root = base.CheckpointableBase()
-    leaf = base.CheckpointableBase()
-    root._track_checkpointable(leaf, name="leaf")
+    root = base.Trackable()
+    leaf = base.Trackable()
+    root._track_trackable(leaf, name="leaf")
     (current_name, current_dependency), = root._checkpoint_dependencies
     self.assertIs(leaf, current_dependency)
     self.assertEqual("leaf", current_name)
-    duplicate_name_dep = base.CheckpointableBase()
+    duplicate_name_dep = base.Trackable()
     with self.assertRaises(ValueError):
-      root._track_checkpointable(duplicate_name_dep, name="leaf")
-    root._track_checkpointable(duplicate_name_dep, name="leaf", overwrite=True)
+      root._track_trackable(duplicate_name_dep, name="leaf")
+    root._track_trackable(duplicate_name_dep, name="leaf", overwrite=True)
     (current_name, current_dependency), = root._checkpoint_dependencies
     self.assertIs(duplicate_name_dep, current_dependency)
     self.assertEqual("leaf", current_name)
 
   def testAddVariableOverwrite(self):
-    root = base.CheckpointableBase()
+    root = base.Trackable()
     a = root._add_variable_with_custom_getter(
         name="v", shape=[], getter=variable_scope.get_variable)
     self.assertEqual([root, a], util.list_objects(root))
@@ -57,5 +60,30 @@ class InterfaceTests(test.TestCase):
             name="v", shape=[], overwrite=False,
             getter=variable_scope.get_variable)
 
+  def testAssertConsumedWithUnusedPythonState(self):
+    has_config = base.Trackable()
+    has_config.get_config = lambda: {}
+    saved = util.Checkpoint(obj=has_config)
+    save_path = saved.save(os.path.join(self.get_temp_dir(), "ckpt"))
+    restored = util.Checkpoint(obj=base.Trackable())
+    restored.restore(save_path).assert_consumed()
+
+  def testAssertConsumedFailsWithUsedPythonState(self):
+    has_config = base.Trackable()
+    attributes = {
+        "foo_attr": functools.partial(
+            base.PythonStringStateSaveable,
+            state_callback=lambda: "",
+            restore_callback=lambda x: None)}
+    has_config._gather_saveables_for_checkpoint = lambda: attributes
+    saved = util.Checkpoint(obj=has_config)
+    save_path = saved.save(os.path.join(self.get_temp_dir(), "ckpt"))
+    restored = util.Checkpoint(obj=base.Trackable())
+    status = restored.restore(save_path)
+    with self.assertRaisesRegexp(AssertionError, "foo_attr"):
+      status.assert_consumed()
+
+
 if __name__ == "__main__":
+  ops.enable_eager_execution()
   test.main()
diff --git a/tensorflow/python/training/checkpointable/data_structures.py b/tensorflow/python/training/tracking/data_structures.py
similarity index 73%
rename from tensorflow/python/training/checkpointable/data_structures.py
rename to tensorflow/python/training/tracking/data_structures.py
index 817552f32696e34d123d1da5057388c1bd96139c..7f1cc50710452ec7262be03434f4d1dc450b5442 100644
--- a/tensorflow/python/training/checkpointable/data_structures.py
+++ b/tensorflow/python/training/tracking/data_structures.py
@@ -1,4 +1,4 @@
-"""Checkpointable data structures."""
+"""Trackable data structures."""
 # Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -19,20 +19,25 @@ from __future__ import print_function
 
 import collections
 import copy
+import operator
+import sys
 
 import six
 
+from tensorflow.python.eager import def_function
+from tensorflow.python.eager import function as defun
 from tensorflow.python.ops import variables
-from tensorflow.python.training.checkpointable import base
-from tensorflow.python.training.checkpointable import layer_utils
+from tensorflow.python.saved_model import revived_types
+from tensorflow.python.training.tracking import base
+from tensorflow.python.training.tracking import layer_utils
 
 
 class NoDependency(object):
-  """Allows attribute assignment to `Checkpointable` objects with no dependency.
+  """Allows attribute assignment to `Trackable` objects with no dependency.
 
   Example usage:
   ```python
-  obj = Checkpointable()
+  obj = Trackable()
   obj.has_dependency = tf.Variable(0., name="dep")
   obj.no_dependency = NoDependency(tf.Variable(1., name="nodep"))
   assert obj.no_dependency.name == "nodep:0"
@@ -54,16 +59,22 @@ class NoDependency(object):
 
 def _wrap_or_unwrap(value):
   """Wraps basic data structures, unwraps NoDependency objects."""
+  # pylint: disable=unidiomatic-typecheck
+  # Exact type checking to avoid mucking up custom logic in list/dict
+  # subclasses, e.g. collections.Counter.
   if isinstance(value, NoDependency):
     return value.value
-  if isinstance(value, base.CheckpointableBase):
-    return value  # Skip conversion for already checkpointable objects.
-  elif isinstance(value, dict):
+  if isinstance(value, base.Trackable):
+    return value  # Skip conversion for already trackable objects.
+  elif type(value) == dict:
     return _DictWrapper(value)
-  elif isinstance(value, list):
+  elif type(value) == collections.OrderedDict:
+    return _DictWrapper(value)
+  elif type(value) == list:
     return _ListWrapper(value)
   else:
     return value
+  # pylint: enable=unidiomatic-typecheck
   # TODO(allenl): Handle other common data structures. Tuples will require
   # special casing (tuple subclasses are not weak referenceable, so replacement
   # with a wrapper that subclasses tuple on attribute assignment works poorly,
@@ -72,19 +83,19 @@ def _wrap_or_unwrap(value):
   # come up with names. Dictionaries should look like lists.
 
 
-def sticky_attribute_assignment(checkpointable, name, value):
+def sticky_attribute_assignment(trackable, name, value):
   """Adds dependencies, generally called from __setattr__.
 
-  This behavior is shared between Checkpointable and Model.
+  This behavior is shared between Trackable and Model.
 
-  Respects NoDependency indicators, but otherwise makes checkpointable objects
+  Respects NoDependency indicators, but otherwise makes trackable objects
   out of common data structures and tracks objects by their attribute names.
 
   Args:
-    checkpointable: The object to add dependencies to (generally the one having
+    trackable: The object to add dependencies to (generally the one having
       an attribute assigned).
     name: The attribute name being assigned.
-    value: The value being assigned. Not necessarily a checkpointable object.
+    value: The value being assigned. Not necessarily a trackable object.
 
   Returns:
     The value which should be stored in the attribute (unwrapped from a
@@ -97,18 +108,18 @@ def sticky_attribute_assignment(checkpointable, name, value):
   value = _wrap_or_unwrap(value)
   if not add_dependency:
     return value
-  if isinstance(value, base.CheckpointableBase):
-    checkpointable._track_checkpointable(  # pylint: disable=protected-access
+  if isinstance(value, base.Trackable):
+    trackable._track_trackable(  # pylint: disable=protected-access
         value, name=name,
-        # Allow the user to switch the Checkpointable which is tracked by this
+        # Allow the user to switch the Trackable which is tracked by this
         # name, since assigning a new variable to an attribute has
         # historically been fine (e.g. Adam did this).
         overwrite=True)
   return value
 
 
-class CheckpointableDataStructure(base.CheckpointableBase):
-  """Base class for data structures which contain checkpointable objects."""
+class TrackableDataStructure(base.Trackable):
+  """Base class for data structures which contain trackable objects."""
 
   def __init__(self):
     self.trainable = True
@@ -117,14 +128,14 @@ class CheckpointableDataStructure(base.CheckpointableBase):
   def _track_value(self, value, name):
     """Add a dependency on `value`."""
     value = sticky_attribute_assignment(
-        checkpointable=self, value=value, name=name)
+        trackable=self, value=value, name=name)
     if isinstance(value, variables.Variable):
       self._extra_variables.append(value)
-    if not isinstance(value, base.CheckpointableBase):
+    if not isinstance(value, base.Trackable):
       raise ValueError(
-          ("Only checkpointable objects (such as Layers or Optimizers) may be "
+          ("Only trackable objects (such as Layers or Optimizers) may be "
            "stored in a List object. Got %s, which does not inherit from "
-           "CheckpointableBase.") % (value,))
+           "Trackable.") % (value,))
     if hasattr(value, "_use_resource_variables"):
       # In subclassed models, legacy layers (tf.layers) must always use
       # resource variables.
@@ -133,7 +144,7 @@ class CheckpointableDataStructure(base.CheckpointableBase):
 
   @property
   def _values(self):
-    """An iterable/sequence which may contain checkpointable objects."""
+    """An iterable/sequence which may contain trackable objects."""
     raise NotImplementedError("Abstract method")
 
   @property
@@ -143,7 +154,7 @@ class CheckpointableDataStructure(base.CheckpointableBase):
     # they're wrapping if out of sync.
     collected = []
     for obj in self._values:
-      if (isinstance(obj, CheckpointableDataStructure)
+      if (isinstance(obj, TrackableDataStructure)
           or layer_utils.is_layer(obj)
           or layer_utils.has_weights(obj)):
         collected.append(obj)
@@ -210,19 +221,19 @@ class CheckpointableDataStructure(base.CheckpointableBase):
     return id(self)
 
   def __eq__(self, other):
-    # Similar to Tensors, checkpointable data structures use object-identity
+    # Similar to Tensors, trackable data structures use object-identity
     # equality to support set/dict membership.
     return self is other
 
 
-class List(CheckpointableDataStructure, collections.Sequence):
-  """An append-only sequence type which is checkpointable.
+class List(TrackableDataStructure, collections.Sequence):
+  """An append-only sequence type which is trackable.
 
   Maintains checkpoint dependencies on its contents (which must also be
-  checkpointable), and forwards any `Layer` metadata such as updates and losses.
+  trackable), and forwards any `Layer` metadata such as updates and losses.
 
   Note that `List` is purely a container. It lets a `tf.keras.Model` or
-  other checkpointable object know about its contents, but does not call any
+  other trackable object know about its contents, but does not call any
   `Layer` instances which are added to it. To indicate a sequence of `Layer`
   instances which should be called sequentially, use `tf.keras.Sequential`.
 
@@ -243,7 +254,7 @@ class List(CheckpointableDataStructure, collections.Sequence):
       return aggregation
   ```
 
-  This kind of wrapping is necessary because `Checkpointable` objects do not
+  This kind of wrapping is necessary because `Trackable` objects do not
   (yet) deeply inspect regular Python data structures, so for example assigning
   a regular list (`self.layer_list = [layers.Dense(3)]`) does not create a
   checkpoint dependency and does not add the `Layer` instance's weights to its
@@ -258,9 +269,12 @@ class List(CheckpointableDataStructure, collections.Sequence):
       self._storage[index] = self._track_value(
           element, name=self._name_element(index))
 
-  def __copy__(self):
+  def copy(self):
     return type(self)(copy.copy(self._storage))
 
+  def __copy__(self):
+    return self.copy()
+
   def __deepcopy__(self, memo):
     return type(self)(copy.deepcopy(self._storage, memo))
 
@@ -276,39 +290,61 @@ class List(CheckpointableDataStructure, collections.Sequence):
     return self
 
   def append(self, value):
-    """Add a new checkpointable value."""
+    """Add a new trackable value."""
     value = self._track_value(value, self._name_element(len(self._storage)))
     self._storage.append(value)
 
   def extend(self, values):
-    """Add a sequence of checkpointable values."""
+    """Add a sequence of trackable values."""
     for value in values:
-      self._storage.append(self._track_value(
-          value, name=self._name_element(len(self._storage))))
+      self.append(value)
 
   def __iadd__(self, values):
     self.extend(values)
     return self
 
   def __add__(self, other):
-    if isinstance(other, List):
-      return self.__class__(self._storage + other._storage)  # pylint: disable=protected-access
-    else:
-      return self.__class__(self._storage + other)
+    return self.__class__(self._storage + getattr(other, "_storage", other))
+
+  def __imul__(self, y):
+    if y <= 0:
+      raise ValueError(
+          "List only supports append, multiplying in place by %d removes "
+          "elements." % y)
+
+    n = len(self._storage)
+    for _ in range(y - 1):
+      for i in range(n):
+        self.append(self._storage[i])
+
+    return self
+
+  def __mul__(self, n):
+    return self.__class__(self._storage * n)
+
+  def __rmul__(self, n):
+    return self * n
 
   def __radd__(self, other):
-    return self + other
+    return self.__class__(other) + self
 
   def __getitem__(self, key):
     return self._storage[key]
 
+  def __getslice__(self, i, j):
+    return self._storage[slice(i, j)]
+
   def __len__(self):
     return len(self._storage)
 
   def __repr__(self):
     return "List(%s)" % (repr(self._storage),)
 
+  def __sizeof__(self):
+    return super(List, self).__sizeof__() + sys.getsizeof(self._storage)
 
+
+# TODO(tomhennigan) Update to collections.UserList?
 class _ListWrapper(List, collections.MutableSequence,
                    # Shadowed, but there for isinstance checks.
                    list):
@@ -320,7 +356,7 @@ class _ListWrapper(List, collections.MutableSequence,
   occupied, meaning both elements get the same names at different times) and
   refuses to save.
 
-  On assignment to an attribute of a Model or Checkpointable object, Python
+  On assignment to an attribute of a Model or Trackable object, Python
   lists are replaced with _ListWrapper. Wrapping a list in a
   `tf.contrib.checkpoint.NoDependency` object prevents this.
   """
@@ -380,17 +416,17 @@ class _ListWrapper(List, collections.MutableSequence,
     if self._non_append_mutation:
       raise ValueError(
           ("Unable to save the object %s (a list wrapper constructed to track "
-           "checkpointable TensorFlow objects). A list element was replaced "
-           "(__setitem__), deleted, or inserted. In order to support "
-           "restoration on object creation, tracking is exclusively for "
-           "append-only data structures.\n\nIf you don't need this list "
-           "checkpointed, wrap it in a tf.contrib.checkpoint.NoDependency "
-           "object; it will be automatically un-wrapped and subsequently "
-           "ignored." % (self,)))
+           "trackable TensorFlow objects). A list element was replaced "
+           "(__setitem__, __setslice__), deleted (__delitem__, __delslice__), "
+           "or moved (sort). In order to support restoration on object "
+           "creation, tracking is exclusively for append-only data structures."
+           "\n\nIf you don't need this list checkpointed, wrap it in a "
+           "tf.contrib.checkpoint.NoDependency object; it will be "
+           "automatically un-wrapped and subsequently ignored." % (self,)))
     if self._external_modification:
       raise ValueError(
           ("Unable to save the object %s (a list wrapper constructed to track "
-           "checkpointable TensorFlow objects). The wrapped list was modified "
+           "trackable TensorFlow objects). The wrapped list was modified "
            "outside the wrapper (its final value was %s, its value when a "
            "checkpoint dependency was added was %s), which breaks restoration "
            "on object creation.\n\nIf you don't need this list checkpointed, "
@@ -404,17 +440,43 @@ class _ListWrapper(List, collections.MutableSequence,
     del self._storage[key]
 
   def __setitem__(self, key, value):
-    self._non_append_mutation = True
-    self._storage[key] = value
+    self._check_external_modification()
+
+    if isinstance(key, slice):
+      # Note: this is quite inefficient, but the list API supports a broad range
+      # of slice setters (e.g. truncate, extend, replace) and immitating this
+      # for a range of Python versions is non-trivial.
+      storage_copy = list(self._storage)
+      self._storage[key] = value
+
+      len_before = len(storage_copy)
+      len_now = len(self._storage)
+      for i in range(max(len_before, len_now)):
+        value_now = self._storage[i] if i < len_now else None
+        value_before = storage_copy[i] if i < len_before else None
+
+        if isinstance(value_before, base.Trackable):
+          self._non_append_mutation = True
+
+        if value_now is not None and value_now != value_before:
+          self._storage[i] = self._track_value(self._storage[i],
+                                               self._name_element(i))
+
+    else:
+      if isinstance(self._storage[key], base.Trackable):
+        self._non_append_mutation = True
+      self._storage[key] = self._track_value(value, self._name_element(key))
+
+    self._update_snapshot()
 
   def append(self, value):
-    """Add a new checkpointable value."""
+    """Add a new trackable value."""
     self._check_external_modification()
     super(_ListWrapper, self).append(value)
     self._update_snapshot()
 
   def extend(self, values):
-    """Add a sequence of checkpointable values."""
+    """Add a sequence of trackable values."""
     self._check_external_modification()
     super(_ListWrapper, self).extend(values)
     self._update_snapshot()
@@ -446,26 +508,43 @@ class _ListWrapper(List, collections.MutableSequence,
     self._non_append_mutation = True
     self._storage.insert(index, obj)
 
+  def sort(self):
+    self._non_append_mutation = True
+    self._storage.sort()
+
+  def __setslice__(self, i, j, y):
+    self.__setitem__(slice(i, j), y)
+
+  def __delslice__(self, i, j):
+    self._non_append_mutation = True
+    del self._storage[slice(i, j)]
+
   def _track_value(self, value, name):
-    """Allows storage of non-checkpointable objects."""
+    """Allows storage of non-trackable objects."""
     try:
       value = super(_ListWrapper, self)._track_value(value=value, name=name)
     except ValueError:
-      # Even if this value isn't checkpointable, we need to make sure
+      # Even if this value isn't trackable, we need to make sure
       # NoDependency objects get unwrapped.
       value = sticky_attribute_assignment(
-          checkpointable=self, value=value, name=name)
+          trackable=self, value=value, name=name)
     return value
 
   def __repr__(self):
     return "ListWrapper(%s)" % (repr(self._storage),)
 
+  def _list_functions_for_serialization(self):
+    return {
+        str(key): value for key, value in enumerate(self)
+        if _is_function(value)
+    }
+
 
-class Mapping(CheckpointableDataStructure, collections.Mapping):
-  """An append-only checkpointable mapping data structure with string keys.
+class Mapping(TrackableDataStructure, collections.Mapping):
+  """An append-only trackable mapping data structure with string keys.
 
   Maintains checkpoint dependencies on its contents (which must also be
-  checkpointable), named based on its keys.
+  trackable), named based on its keys.
 
   Note that once a key has been added, it may not be deleted or replaced. If
   names may not be unique, see `tf.contrib.checkpoint.UniqueNameTracker`.
@@ -542,7 +621,7 @@ class Mapping(CheckpointableDataStructure, collections.Mapping):
 # patching all of the "wrapped" dict's methods instead of creating a wrapper
 # object is an option, but not a very attractive one (replacing methods without
 # creating reference cycles is difficult, and then dicts would need to be
-# special cased everywhere as being checkpointable).
+# special cased everywhere as being trackable).
 class _DictWrapper(Mapping, collections.MutableMapping):
   """Wraps built-in dicts to support restore-on-create for variables.
 
@@ -598,7 +677,7 @@ class _DictWrapper(Mapping, collections.MutableMapping):
       raise ValueError(
           "Unable to save the object %s (a dictionary wrapper constructed "
           "automatically on attribute assignment). The wrapped dictionary "
-          "contains a non-string key which maps to a checkpointable object or "
+          "contains a non-string key which maps to a trackable object or "
           "mutable data structure.\n\nIf you don't need this dictionary "
           "checkpointed, wrap it in a tf.contrib.checkpoint.NoDependency "
           "object; it will be automatically un-wrapped and subsequently "
@@ -607,7 +686,7 @@ class _DictWrapper(Mapping, collections.MutableMapping):
       raise ValueError(
           "Unable to save the object %s (a dictionary wrapper constructed "
           "automatically on attribute assignment). A key mapping to a "
-          "checkpointable object was overwritten or deleted, which would "
+          "trackable object was overwritten or deleted, which would "
           "cause problems for restoration.\n\nIf you don't need this "
           "dictionary checkpointed, wrap it in a "
           "tf.contrib.checkpoint.NoDependency object; it will be automatically "
@@ -648,7 +727,7 @@ class _DictWrapper(Mapping, collections.MutableMapping):
     self._last_wrapped_dict_snapshot = dict(self)
 
   def _track_value(self, value, name):
-    """Allows storage of non-checkpointable objects."""
+    """Allows storage of non-trackable objects."""
     if isinstance(name, six.string_types):
       string_key = True
     else:
@@ -658,15 +737,15 @@ class _DictWrapper(Mapping, collections.MutableMapping):
       no_dependency = isinstance(value, NoDependency)
       value = super(_DictWrapper, self)._track_value(value=value, name=name)
       if not (string_key or no_dependency):
-        # A non-string key maps to a checkpointable value. This data structure
+        # A non-string key maps to a trackable value. This data structure
         # is not saveable.
         self._non_string_key = True
       return value
     except ValueError:
-      # Even if this value isn't checkpointable, we need to make sure
+      # Even if this value isn't trackable, we need to make sure
       # NoDependency objects get unwrapped.
       return sticky_attribute_assignment(
-          checkpointable=self, value=value, name=name)
+          trackable=self, value=value, name=name)
 
   def _name_element(self, key):
     """Don't throw errors for non-string keys."""
@@ -685,31 +764,32 @@ class _DictWrapper(Mapping, collections.MutableMapping):
     else:
       value = _wrap_or_unwrap(value)
       existing_dependency = None
-      if not no_dep and isinstance(value, base.CheckpointableBase):
+      if not no_dep and isinstance(value, base.Trackable):
         # Non-string keys are OK as long as we have no reason to add a
         # dependency on the value (either because the value is not
-        # checkpointable, or because it was wrapped in a NoDependency object).
+        # trackable, or because it was wrapped in a NoDependency object).
         self._non_string_key = True
-    current_value = self._storage.setdefault(key, value)
-    if current_value is not value:
-      if ((not no_dep and isinstance(value, base.CheckpointableBase))
-          # We don't want to just check that the existing object is
-          # checkpointable, since it may have been wrapped in a NoDependency
-          # object.
-          or existing_dependency is not None):
-        # A checkpointable object was replaced under the same key; this means
-        # that restoring would be error-prone, so we'll throw an exception on
-        # save.
-        self._non_append_mutation = True
-      self._storage[key] = value
+    if key in self._storage:
+      previous_value = self._storage[key]
+      if previous_value is not value:
+        if ((not no_dep and isinstance(value, base.Trackable))
+            # We don't want to just check that the existing object is
+            # trackable, since it may have been wrapped in a NoDependency
+            # object.
+            or existing_dependency is not None):
+          # A trackable object was replaced under the same key; this means
+          # that restoring would be error-prone, so we'll throw an exception on
+          # save.
+          self._non_append_mutation = True
+    self._storage[key] = value
 
     self._update_snapshot()
 
   def __delitem__(self, key):
     self._check_external_modification()
     existing_value = self[key]
-    if isinstance(existing_value, base.CheckpointableBase):
-      # Deleting tracked checkpointable values means restoring is problematic,
+    if isinstance(existing_value, base.Trackable):
+      # Deleting tracked trackable values means restoring is problematic,
       # so we'll throw an exception on save.
       self._non_append_mutation = True
     del self._storage[key]
@@ -727,3 +807,43 @@ class _DictWrapper(Mapping, collections.MutableMapping):
   def update(self, *args, **kwargs):
     for key, value in dict(*args, **kwargs).items():
       self[key] = value
+
+  def _list_functions_for_serialization(self):
+    return {
+        key: value for key, value in self.items()
+        if _is_function(value)
+    }
+
+
+def _is_function(x):
+  return isinstance(x, (def_function.Function, defun.ConcreteFunction))
+
+revived_types.register_revived_type(
+    "trackable_dict_wrapper",
+    lambda obj: isinstance(obj, _DictWrapper),
+    versions=[revived_types.VersionedTypeRegistration(
+        # Standard dependencies are enough to reconstruct the trackable
+        # items in dictionaries, so we don't need to save any extra information.
+        object_factory=lambda proto: _DictWrapper({}),
+        version=1,
+        min_producer_version=1,
+        min_consumer_version=1,
+        setter=operator.setitem)])
+
+
+def _set_list_item(list_object, index_string, value):
+  item_index = int(index_string)
+  if len(list_object) <= item_index:
+    list_object.extend([None] * (1 + item_index - len(list_object)))
+  list_object[item_index] = value
+
+
+revived_types.register_revived_type(
+    "trackable_list_wrapper",
+    lambda obj: isinstance(obj, _ListWrapper),
+    versions=[revived_types.VersionedTypeRegistration(
+        object_factory=lambda proto: _ListWrapper([]),
+        version=1,
+        min_producer_version=1,
+        min_consumer_version=1,
+        setter=_set_list_item)])
diff --git a/tensorflow/python/training/checkpointable/data_structures_test.py b/tensorflow/python/training/tracking/data_structures_test.py
similarity index 76%
rename from tensorflow/python/training/checkpointable/data_structures_test.py
rename to tensorflow/python/training/tracking/data_structures_test.py
index bcec6e01001eec6c164cf4bb17db3d4ed55b0935..a03614bd60e60d66c781d321e4029f7cc0c4f5fe 100644
--- a/tensorflow/python/training/checkpointable/data_structures_test.py
+++ b/tensorflow/python/training/tracking/data_structures_test.py
@@ -34,9 +34,9 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variables
-from tensorflow.python.training.checkpointable import data_structures
-from tensorflow.python.training.checkpointable import tracking
-from tensorflow.python.training.checkpointable import util
+from tensorflow.python.training.tracking import data_structures
+from tensorflow.python.training.tracking import tracking
+from tensorflow.python.training.tracking import util
 
 
 class HasList(training.Model):
@@ -145,12 +145,12 @@ class ListTests(test.TestCase):
     model.l2.append(second_layer)
     self.assertEqual([first_layer, second_layer], model.layers)
 
-  def testNotCheckpointable(self):
-    class NotCheckpointable(object):
+  def testNotTrackable(self):
+    class NotTrackable(object):
       pass
 
     with self.assertRaises(ValueError):
-      data_structures.List([NotCheckpointable()])
+      data_structures.List([NotTrackable()])
 
   def testCallNotImplemented(self):
     with self.assertRaisesRegexp(TypeError, "not callable"):
@@ -207,11 +207,88 @@ class ListTests(test.TestCase):
     self.assertEqual([v], l.trainable_weights)
     self.assertEqual([v2], l.non_trainable_weights)
 
+  def testCopy(self):
+    v1 = resource_variable_ops.ResourceVariable(1.)
+    v2 = resource_variable_ops.ResourceVariable(1.)
+    v3 = resource_variable_ops.ResourceVariable(1.)
+
+    l1 = data_structures.List([v1, v2])
+    l2 = l1.copy()
+    l2.append(v3)
+    self.assertEqual(list(l1), [v1, v2])
+    self.assertEqual(list(l2), [v1, v2, v3])
+
+  def testSlicing(self):
+    v1 = resource_variable_ops.ResourceVariable(1.)
+    v2 = resource_variable_ops.ResourceVariable(1.)
+    v3 = resource_variable_ops.ResourceVariable(1.)
+    v4 = resource_variable_ops.ResourceVariable(1.)
+
+    l = data_structures.List([v1, v2, v3, v4])
+    self.assertEqual(l[1:], [v2, v3, v4])
+    self.assertEqual(l[1:-1], [v2, v3])
+    self.assertEqual(l[:-1], [v1, v2, v3])
+
+  def testHash(self):
+    has_sequences = set([data_structures.List(),
+                         data_structures.List()])
+    self.assertEqual(2, len(has_sequences))
+    self.assertNotIn(data_structures.List(), has_sequences)
+
+  def testIMul_zero(self):
+    l = data_structures.List([])
+    with self.assertRaisesRegexp(ValueError, "List only supports append"):
+      l *= 0
+
+  def testIMul(self):
+    v = resource_variable_ops.ResourceVariable(1.)
+    l = data_structures.List([v])
+    l *= 2
+    self.assertEqual(list(l), [v] * 2)
+
+  def testMul(self):
+    v = resource_variable_ops.ResourceVariable(1.)
+    l = data_structures.List([v, v, v])
+    self.assertEqual(list(l * 2), [v, v, v] * 2)
+
+  def testRMul(self):
+    v = resource_variable_ops.ResourceVariable(1.)
+    l = data_structures.List([v, v, v])
+    self.assertEqual(list(2 * l), [v, v, v] * 2)
+
+
+class ListWrapperTest(test.TestCase):
+
+  IGNORED = ("__new__", "__init__", "__subclasshook__", "__getattribute__")
+
+  def test_overrides_all_list_methods(self):
+    not_overridden = []
+
+    for name in dir(list):
+      if name in ListWrapperTest.IGNORED:
+        continue
+
+      list_method = getattr(list, name)
+
+      if not callable(list_method):
+        continue
+
+      object_method = getattr(object, name, None)
+      if object_method is not None and object_method == list_method:
+        # Skip methods that aren't overridden from object.
+        continue
+
+      if list_method == getattr(data_structures._ListWrapper, name):
+        not_overridden.append(name)
+
+    if not_overridden:
+      self.fail("_ListWrapper does not override %s" % (not_overridden))
+
   def testListWrapperBasic(self):
     # _ListWrapper, unlike List, compares like the built-in list type (since it
     # is used to automatically replace lists).
-    a = tracking.Checkpointable()
-    b = tracking.Checkpointable()
+    a = tracking.AutoTrackable()
+    b = tracking.AutoTrackable()
     self.assertEqual([a, a],
                      [a, a])
     self.assertEqual(data_structures._ListWrapper([a, a]),
@@ -244,6 +321,10 @@ class ListTests(test.TestCase):
     self.assertEqual([a, a], [a] + data_structures._ListWrapper([a]))
     self.assertIsInstance(data_structures._ListWrapper([a]), list)
 
+  def testAcceptsNonTrackableContent(self):
+    l = data_structures._ListWrapper([1, 2, 3])
+    self.assertEqual(l, [1, 2, 3])
+
   def testWrapperChangesList(self):
     l = []
     l_wrapper = data_structures._ListWrapper(l)
@@ -263,13 +344,61 @@ class ListTests(test.TestCase):
     l.append(layer)
     self.assertEqual([layer], l_wrapper.layers)
 
-  def testHashing(self):
-    has_sequences = set([data_structures.List(),
-                         data_structures.List()])
-    self.assertEqual(2, len(has_sequences))
-    self.assertNotIn(data_structures.List(), has_sequences)
+  def testNotHashable(self):
     with self.assertRaises(TypeError):
-      has_sequences.add(data_structures._ListWrapper([]))
+      hash(data_structures._ListWrapper())
+
+  def testDelItem(self):
+    l = data_structures._ListWrapper([1, 2, 3, 4])
+    del l[0]
+    self.assertEqual(l, [2, 3, 4])
+    self.assertUnableToSave(l, "Unable to save .*__delitem__")
+
+  def testDelSlice(self):
+    l = data_structures._ListWrapper([1, 2, 3, 4])
+    del l[2:3]
+    self.assertEqual(l, [1, 2, 4])
+    self.assertUnableToSave(l, "Unable to save .*__delslice__")
+
+  def testSetSlice_canSaveForNonTrackableItems(self):
+    l = data_structures._ListWrapper([1, 2, 3, 4])
+    l[:] = 2, 8, 9, 0
+    self.assertEqual(l, [2, 8, 9, 0])
+    l._maybe_initialize_trackable()  # pylint: disable=protected-access
+    self.assertEqual(len(l._checkpoint_dependencies), 0)  # pylint: disable=protected-access
+
+  def testSetSlice_cannotSaveIfTrackableModified(self):
+    v1 = resource_variable_ops.ResourceVariable(1.)
+    v2 = resource_variable_ops.ResourceVariable(1.)
+    l = data_structures._ListWrapper([1, 2, v1, v2])
+    l[:] = 2, 8, 9, v2
+    self.assertEqual(l, [2, 8, 9, v2])
+    self.assertUnableToSave(l, "Unable to save .*__setslice__")
+
+  def testSetSlice_truncate(self):
+    l = data_structures._ListWrapper([1, 2, 3, 4])
+    l[:] = []
+    self.assertEqual(l, [])
+
+  def testSetSlice_extend(self):
+    l = data_structures._ListWrapper([1, 2, 3, 4])
+    l[2:] = 1, 2, 3, 4
+    self.assertEqual(l, [1, 2, 1, 2, 3, 4])
+
+  def testSort(self):
+    l = data_structures._ListWrapper([1, 2, 3, 4])
+    l.sort()
+    self.assertEqual(l, [1, 2, 3, 4])
+    # Regardless of being a no-op for the input list, we still refuse to save.
+    # This is intentional since otherwise we would end up with a hard to debug
+    # case for users (e.g. sometimes sort on a ListWrapper is trackable and
+    # other times it is not).
+    self.assertUnableToSave(l, "Unable to save .*sort")
+
+  def assertUnableToSave(self, l, msg):
+    l._maybe_initialize_trackable()  # pylint: disable=protected-access
+    with self.assertRaisesRegexp(ValueError, msg):
+      return l._checkpoint_dependencies  # pylint: disable=protected-access
 
 
 class HasMapping(training.Model):
@@ -337,7 +466,7 @@ class MappingTests(test.TestCase):
 
   def testLayerCollectionWithExternalMutation(self):
     d = {}
-    root = tracking.Checkpointable()
+    root = tracking.AutoTrackable()
     root.wrapper = d
     self.assertEqual([], root.wrapper.layers)
     self.assertEqual([], root.wrapper.trainable_weights)
@@ -355,7 +484,7 @@ class MappingTests(test.TestCase):
     self.assertEqual(2, len(has_mappings))
     self.assertNotIn(data_structures.Mapping(), has_mappings)
     # In contrast to Mapping, dict wrappers are not hashable
-    a = tracking.Checkpointable()
+    a = tracking.AutoTrackable()
     a.d = {}
     self.assertEqual({}, a.d)
     self.assertFalse({} != a.d)  # pylint: disable=g-explicit-bool-comparison
@@ -364,7 +493,7 @@ class MappingTests(test.TestCase):
       set([a.d])
 
   def testDictWrapperBadKeys(self):
-    a = tracking.Checkpointable()
+    a = tracking.AutoTrackable()
     a.d = {}
     a.d[1] = data_structures.List()
     model = training.Model()
@@ -374,7 +503,7 @@ class MappingTests(test.TestCase):
       model.save_weights(save_path)
 
   def testDictWrapperNoDependency(self):
-    a = tracking.Checkpointable()
+    a = tracking.AutoTrackable()
     a.d = data_structures.NoDependency({})
     a.d[1] = [3]
     self.assertEqual([a], util.list_objects(a))
@@ -384,8 +513,8 @@ class MappingTests(test.TestCase):
     model.save_weights(save_path)
     model.load_weights(save_path)
 
-  def testNonStringKeyNotCheckpointableValue(self):
-    a = tracking.Checkpointable()
+  def testNonStringKeyNotTrackableValue(self):
+    a = tracking.AutoTrackable()
     a.d = {}
     a.d["a"] = [3]
     a.d[1] = data_structures.NoDependency([3])
@@ -396,18 +525,18 @@ class MappingTests(test.TestCase):
     model.save_weights(save_path)
     model.load_weights(save_path)
 
-  def testNonAppendNotCheckpointable(self):
+  def testNonAppendNotTrackable(self):
     # Non-append mutations (deleting or overwriting values) are OK when the
     # values aren't tracked.
-    a = tracking.Checkpointable()
+    a = tracking.AutoTrackable()
     a.d = {}
     a.d["a"] = [3]
     a.d[1] = 3
     a.d[1] = 2
     self.assertEqual(2, a.d[1])
     del a.d[1]
-    a.d[2] = data_structures.NoDependency(tracking.Checkpointable())
-    second = tracking.Checkpointable()
+    a.d[2] = data_structures.NoDependency(tracking.AutoTrackable())
+    second = tracking.AutoTrackable()
     a.d[2] = data_structures.NoDependency(second)
     self.assertIs(second, a.d[2])
     self.assertEqual([a, a.d, a.d["a"]], util.list_objects(a))
@@ -469,7 +598,7 @@ class MappingTests(test.TestCase):
     self.assertEqual({1: 3}, new_dict)
 
   def testListShallowCopy(self):
-    root = tracking.Checkpointable()
+    root = tracking.AutoTrackable()
     orig_list = [[1.]]
     root.a = orig_list
     copied = copy.copy(root.a)
@@ -486,7 +615,7 @@ class MappingTests(test.TestCase):
       util.list_objects(copy.copy(root.a))
 
   def testListDeepCopy(self):
-    root = tracking.Checkpointable()
+    root = tracking.AutoTrackable()
     orig_list = [[1.]]
     root.a = orig_list
     copied = copy.deepcopy(root.a)
@@ -503,7 +632,7 @@ class MappingTests(test.TestCase):
       util.list_objects(copy.deepcopy(root.a))
 
   def testDictShallowCopy(self):
-    root = tracking.Checkpointable()
+    root = tracking.AutoTrackable()
     orig_dict = {"a": [1.]}
     root.a = orig_dict
     copied = copy.copy(root.a)
@@ -520,7 +649,7 @@ class MappingTests(test.TestCase):
       util.list_objects(copy.copy(root.a))
 
   def testDictDeepCopy(self):
-    root = tracking.Checkpointable()
+    root = tracking.AutoTrackable()
     orig_dict = {"a": [1.]}
     root.a = orig_dict
     copied = copy.deepcopy(root.a)
@@ -536,9 +665,9 @@ class MappingTests(test.TestCase):
     with self.assertRaises(ValueError):
       util.list_objects(copy.deepcopy(root.a))
 
-  def testShallowCopyCheckpointable(self):
-    original = tracking.Checkpointable()
-    original_sub = tracking.Checkpointable()
+  def testShallowCopyTrackable(self):
+    original = tracking.AutoTrackable()
+    original_sub = tracking.AutoTrackable()
     original.a = [[1.]]
     original.b = {"a": original_sub}
     shallow_copied = copy.copy(original)
@@ -550,16 +679,16 @@ class MappingTests(test.TestCase):
     self.assertIn(shallow_copied.b, shallow_deps)
     self.assertIn(shallow_copied.b["a"], shallow_deps)
 
-  def testDeepCopyCheckpointable(self):
-    original = tracking.Checkpointable()
-    original_sub = tracking.Checkpointable()
+  def testDeepCopyTrackable(self):
+    original = tracking.AutoTrackable()
+    original_sub = tracking.AutoTrackable()
     original.a = [[1.]]
     original.b = {"a": original_sub}
     deep_copied = copy.deepcopy(original)
     self.assertIsNot(original, deep_copied)
     self.assertIsNot(original_sub, deep_copied.b["a"])
     self.assertEqual([[1.]], deep_copied.a)
-    self.assertIsInstance(deep_copied.b["a"], tracking.Checkpointable)
+    self.assertIsInstance(deep_copied.b["a"], tracking.AutoTrackable)
     deps = util.list_objects(deep_copied)
     self.assertIn(deep_copied.a, deps)
     self.assertIn(deep_copied.b, deps)
@@ -571,5 +700,17 @@ class MappingTests(test.TestCase):
     self.assertIsInstance(result, dict)
     self.assertEqual({1: 2, 3: 4}, result)
 
+  def testListAddOrder(self):
+    self.assertEqual([1., 2.],
+                     data_structures._ListWrapper([1.])
+                     + data_structures._ListWrapper([2.]))
+    self.assertEqual([1., 2.],
+                     data_structures._ListWrapper([1.])
+                     + [2.])
+    self.assertEqual([1., 2.],
+                     [1.]
+                     + data_structures._ListWrapper([2.]))
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/training/tracking/graph_view.py b/tensorflow/python/training/tracking/graph_view.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba2387870182cef6e578c7b947f07a4957fdf22c
--- /dev/null
+++ b/tensorflow/python/training/tracking/graph_view.py
@@ -0,0 +1,431 @@
+"""Manages a graph of Trackable objects."""
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import weakref
+
+from tensorflow.core.protobuf import trackable_object_graph_pb2
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.training import optimizer as optimizer_v1
+from tensorflow.python.training.saving import saveable_object as saveable_object_lib
+from tensorflow.python.training.saving import saveable_object_util
+from tensorflow.python.training.tracking import base
+from tensorflow.python.training.tracking import object_identity
+from tensorflow.python.training.tracking import tracking
+
+
+_ESCAPE_CHAR = "."  # For avoiding conflicts with user-specified names.
+
+# Keyword for identifying that the next bit of a checkpoint variable name is a
+# slot name. Checkpoint names for slot variables look like:
+#
+#   <path to variable>/<_OPTIMIZER_SLOTS_NAME>/<path to optimizer>/<slot name>
+#
+# Where <path to variable> is a full path from the checkpoint root to the
+# variable being slotted for.
+_OPTIMIZER_SLOTS_NAME = _ESCAPE_CHAR + "OPTIMIZER_SLOT"
+# Keyword for separating the path to an object from the name of an
+# attribute in checkpoint names. Used like:
+#   <path to variable>/<_OBJECT_ATTRIBUTES_NAME>/<name of attribute>
+_OBJECT_ATTRIBUTES_NAME = _ESCAPE_CHAR + "ATTRIBUTES"
+
+
+def _escape_local_name(name):
+  # We need to support slashes in local names for compatibility, since this
+  # naming scheme is being patched in to things like Layer.add_variable where
+  # slashes were previously accepted. We also want to use slashes to indicate
+  # edges traversed to reach the variable, so we escape forward slashes in
+  # names.
+  return (name.replace(_ESCAPE_CHAR, _ESCAPE_CHAR + _ESCAPE_CHAR)
+          .replace(r"/", _ESCAPE_CHAR + "S"))
+
+
+def _object_prefix_from_path(path_to_root):
+  return "/".join(
+      (_escape_local_name(trackable.name)
+       for trackable in path_to_root))
+
+
+def _slot_variable_naming_for_optimizer(optimizer_path):
+  """Make a function for naming slot variables in an optimizer."""
+  # Name slot variables:
+  #
+  #   <variable name>/<_OPTIMIZER_SLOTS_NAME>/<optimizer path>/<slot name>
+  #
+  # where <variable name> is exactly the checkpoint name used for the original
+  # variable, including the path from the checkpoint root and the local name in
+  # the object which owns it. Note that we only save slot variables if the
+  # variable it's slotting for is also being saved.
+
+  optimizer_identifier = "/%s/%s/" % (_OPTIMIZER_SLOTS_NAME, optimizer_path)
+
+  def _name_slot_variable(variable_path, slot_name):
+    """With an optimizer specified, name a slot variable."""
+    return (variable_path
+            + optimizer_identifier
+            + _escape_local_name(slot_name))
+
+  return _name_slot_variable
+
+
+def _serialize_slot_variables(trackable_objects, node_ids, object_names):
+  """Gather and name slot variables."""
+  non_slot_objects = list(trackable_objects)
+  slot_variables = object_identity.ObjectIdentityDictionary()
+  for trackable in non_slot_objects:
+    if (isinstance(trackable, optimizer_v1.Optimizer)
+        # TODO(b/110718070): Fix Keras imports.
+        or hasattr(trackable, "_create_or_restore_slot_variable")):
+      naming_scheme = _slot_variable_naming_for_optimizer(
+          optimizer_path=object_names[trackable])
+      slot_names = trackable.get_slot_names()
+      for slot_name in slot_names:
+        for original_variable_node_id, original_variable in enumerate(
+            non_slot_objects):
+          try:
+            slot_variable = trackable.get_slot(
+                original_variable, slot_name)
+          except (AttributeError, KeyError):
+            slot_variable = None
+          if slot_variable is None:
+            continue
+          slot_variable._maybe_initialize_trackable()  # pylint: disable=protected-access
+          if slot_variable._checkpoint_dependencies:  # pylint: disable=protected-access
+            # TODO(allenl): Gather dependencies of slot variables.
+            raise NotImplementedError(
+                "Currently only variables with no dependencies can be saved as "
+                "slot variables. File a feature request if this limitation "
+                "bothers you.")
+          if slot_variable in node_ids:
+            raise NotImplementedError(
+                "A slot variable was re-used as a dependency of a "
+                "Trackable object. This is not currently allowed. File a "
+                "feature request if this limitation bothers you.")
+          checkpoint_name = naming_scheme(
+              variable_path=object_names[original_variable],
+              slot_name=slot_name)
+          object_names[slot_variable] = checkpoint_name
+          slot_variable_node_id = len(trackable_objects)
+          node_ids[slot_variable] = slot_variable_node_id
+          trackable_objects.append(slot_variable)
+          slot_variable_proto = (
+              trackable_object_graph_pb2.TrackableObjectGraph
+              .TrackableObject.SlotVariableReference(
+                  slot_name=slot_name,
+                  original_variable_node_id=original_variable_node_id,
+                  slot_variable_node_id=slot_variable_node_id))
+          slot_variables.setdefault(trackable, []).append(
+              slot_variable_proto)
+  return slot_variables
+
+
+class ObjectGraphView(object):
+  """Gathers and serializes an object graph."""
+
+  def __init__(self, root, saveables_cache=None):
+    """Configure the graph view.
+
+    Args:
+      root: A `Trackable` object whose variables (including the variables
+        of dependencies, recursively) should be saved. May be a weak reference.
+      saveables_cache: A dictionary mapping `Trackable` objects ->
+        attribute names -> SaveableObjects, used to avoid re-creating
+        SaveableObjects when graph building.
+    """
+    self._root_ref = root
+    self._saveables_cache = saveables_cache
+
+  def list_dependencies(self, obj):
+    # pylint: disable=protected-access
+    obj._maybe_initialize_trackable()
+    return obj._checkpoint_dependencies
+    # pylint: enable=protected-access
+
+  @property
+  def saveables_cache(self):
+    """Maps Trackable objects -> attribute names -> list(SaveableObjects).
+
+    Used to avoid re-creating SaveableObjects when graph building. None when
+    executing eagerly.
+
+    Returns:
+      The cache (an object-identity dictionary), or None if caching is disabled.
+    """
+    return self._saveables_cache
+
+  @property
+  def root(self):
+    if isinstance(self._root_ref, weakref.ref):
+      derefed = self._root_ref()
+      assert derefed is not None
+      return derefed
+    else:
+      return self._root_ref
+
+  def _breadth_first_traversal(self):
+    """Find shortest paths to all dependencies of self.root."""
+    bfs_sorted = []
+    to_visit = collections.deque([self.root])
+    path_to_root = object_identity.ObjectIdentityDictionary()
+    path_to_root[self.root] = ()
+    while to_visit:
+      current_trackable = to_visit.popleft()
+      if isinstance(current_trackable, tracking.NotTrackable):
+        raise NotImplementedError(
+            ("The object %s does not support object-based saving. File a "
+             "feature request if this limitation bothers you. In the meantime, "
+             "you can remove the dependency on this object and save everything "
+             "else.")
+            % (current_trackable,))
+      bfs_sorted.append(current_trackable)
+      for name, dependency in self.list_dependencies(current_trackable):
+        if dependency not in path_to_root:
+          path_to_root[dependency] = (
+              path_to_root[current_trackable] + (
+                  base.TrackableReference(name, dependency),))
+          to_visit.append(dependency)
+    return bfs_sorted, path_to_root
+
+  def _add_attributes_to_object_graph(
+      self, trackable_objects, object_graph_proto, node_ids, object_names,
+      object_map):
+    """Create SaveableObjects and corresponding SerializedTensor protos."""
+    named_saveable_objects = []
+    if self._saveables_cache is None:
+      # No SaveableObject caching. Either we're executing eagerly, or building a
+      # static save which is specialized to the current Python state.
+      feed_additions = None
+    else:
+      # If we are caching SaveableObjects, we need to build up a feed_dict with
+      # functions computing volatile Python state to be saved with the
+      # checkpoint.
+      feed_additions = {}
+    for checkpoint_id, (trackable, object_proto) in enumerate(
+        zip(trackable_objects, object_graph_proto.nodes)):
+      assert node_ids[trackable] == checkpoint_id
+      object_name = object_names[trackable]
+      if object_map is None:
+        object_to_save = trackable
+      else:
+        object_to_save = object_map.get(trackable, trackable)
+      if self._saveables_cache is not None:
+        cached_attributes = self._saveables_cache.setdefault(object_to_save, {})
+      else:
+        cached_attributes = None
+
+      for name, saveable_factory in (
+          object_to_save._gather_saveables_for_checkpoint().items()):  # pylint: disable=protected-access
+        attribute = object_proto.attributes.add()
+        attribute.name = name
+        attribute.checkpoint_key = "%s/%s/%s" % (
+            object_name, _OBJECT_ATTRIBUTES_NAME, _escape_local_name(name))
+        if cached_attributes is None:
+          saveables = None
+        else:
+          saveables = cached_attributes.get(name, None)
+          if saveables is not None:
+            for saveable in saveables:
+              if attribute.checkpoint_key not in saveable.name:
+                # The checkpoint key for this SaveableObject is different. We
+                # need to re-create it.
+                saveables = None
+                del cached_attributes[name]
+                break
+        if saveables is None:
+          if callable(saveable_factory):
+            maybe_saveable = saveable_factory(name=attribute.checkpoint_key)
+          else:
+            maybe_saveable = saveable_factory
+          if isinstance(maybe_saveable, saveable_object_lib.SaveableObject):
+            saveables = (maybe_saveable,)
+          else:
+            # Figure out the name-based Saver's name for this variable. If it's
+            # already a SaveableObject we'd just get the checkpoint key back, so
+            # we leave full_name blank.
+            saver_dict = saveable_object_util.op_list_to_dict(
+                [maybe_saveable], convert_variable_to_tensor=False)
+            full_name, = saver_dict.keys()
+            saveables = tuple(saveable_object_util.saveable_objects_for_op(
+                op=maybe_saveable, name=attribute.checkpoint_key))
+            for saveable in saveables:
+              saveable.full_name = full_name
+          for saveable in saveables:
+            if attribute.checkpoint_key not in saveable.name:
+              raise AssertionError(
+                  ("The object %s produced a SaveableObject with name '%s' for "
+                   "attribute '%s'. Expected a name containing '%s'.")
+                  % (trackable, name, saveable.name,
+                     attribute.checkpoint_key))
+          if cached_attributes is not None:
+            cached_attributes[name] = saveables
+
+        optional_restore = None
+        for saveable in saveables:
+          if optional_restore is None:
+            optional_restore = saveable.optional_restore
+          else:
+            optional_restore = optional_restore and saveable.optional_restore
+
+          if hasattr(saveable, "full_name"):
+            attribute.full_name = saveable.full_name
+          if isinstance(saveable, base.PythonStateSaveable):
+            if feed_additions is None:
+              assert self._saveables_cache is None
+              # If we're not caching saveables, then we're either executing
+              # eagerly or building a static save/restore (e.g. for a
+              # SavedModel). In either case, we should embed the current Python
+              # state in the graph rather than relying on a feed dict.
+              saveable = saveable.freeze()
+            else:
+              saveable_feed_dict = saveable.feed_dict_additions()
+              for new_feed_key in saveable_feed_dict.keys():
+                if new_feed_key in feed_additions:
+                  raise AssertionError(
+                      ("The object %s tried to feed a value for the Tensor %s "
+                       "when saving, but another object is already feeding a "
+                       "value.")
+                      % (trackable, new_feed_key))
+              feed_additions.update(saveable_feed_dict)
+          named_saveable_objects.append(saveable)
+        if optional_restore is None:
+          optional_restore = False
+        attribute.optional_restore = optional_restore
+
+    return named_saveable_objects, feed_additions
+
+  def _fill_object_graph_proto(self, trackable_objects,
+                               node_ids,
+                               slot_variables,
+                               object_graph_proto=None):
+    """Name non-slot `Trackable`s and add them to `object_graph_proto`."""
+    if object_graph_proto is None:
+      object_graph_proto = (
+          trackable_object_graph_pb2.TrackableObjectGraph())
+    for checkpoint_id, trackable in enumerate(trackable_objects):
+      assert node_ids[trackable] == checkpoint_id
+      object_proto = object_graph_proto.nodes.add()
+      object_proto.slot_variables.extend(slot_variables.get(trackable, ()))
+      for child in self.list_dependencies(trackable):
+        child_proto = object_proto.children.add()
+        child_proto.node_id = node_ids[child.ref]
+        child_proto.local_name = child.name
+    return object_graph_proto
+
+  def _serialize_gathered_objects(self, trackable_objects, path_to_root,
+                                  object_map=None):
+    """Create SaveableObjects and protos for gathered objects."""
+    object_names = object_identity.ObjectIdentityDictionary()
+    for obj, path in path_to_root.items():
+      object_names[obj] = _object_prefix_from_path(path)
+    node_ids = object_identity.ObjectIdentityDictionary()
+    for node_id, node in enumerate(trackable_objects):
+      node_ids[node] = node_id
+    slot_variables = _serialize_slot_variables(
+        trackable_objects=trackable_objects,
+        node_ids=node_ids,
+        object_names=object_names)
+    object_graph_proto = self._fill_object_graph_proto(
+        trackable_objects=trackable_objects,
+        node_ids=node_ids,
+        slot_variables=slot_variables)
+    named_saveable_objects, feed_additions = (
+        self._add_attributes_to_object_graph(
+            trackable_objects=trackable_objects,
+            object_graph_proto=object_graph_proto,
+            node_ids=node_ids,
+            object_names=object_names,
+            object_map=object_map))
+    return named_saveable_objects, object_graph_proto, feed_additions
+
+  def serialize_object_graph(self):
+    """Determine checkpoint keys for variables and build a serialized graph.
+
+    Non-slot variables are keyed based on a shortest path from the root saveable
+    to the object which owns the variable (i.e. the one which called
+    `Trackable._add_variable` to create it).
+
+    Slot variables are keyed based on a shortest path to the variable being
+    slotted for, a shortest path to their optimizer, and the slot name.
+
+    Returns:
+      A tuple of (named_variables, object_graph_proto, feed_additions):
+        named_variables: A dictionary mapping names to variable objects.
+        object_graph_proto: A TrackableObjectGraph protocol buffer
+          containing the serialized object graph and variable references.
+        feed_additions: A dictionary mapping from Tensors to values which should
+          be fed when saving.
+
+    Raises:
+      ValueError: If there are invalid characters in an optimizer's slot names.
+    """
+    trackable_objects, path_to_root = self._breadth_first_traversal()
+    return self._serialize_gathered_objects(
+        trackable_objects, path_to_root)
+
+  def frozen_saveable_objects(self, object_map=None, to_graph=None):
+    """Creates SaveableObjects with the current object graph frozen."""
+    trackable_objects, path_to_root = self._breadth_first_traversal()
+    if to_graph:
+      target_context = to_graph.as_default
+    else:
+      target_context = ops.NullContextmanager
+    with target_context():
+      named_saveable_objects, graph_proto, _ = self._serialize_gathered_objects(
+          trackable_objects,
+          path_to_root,
+          object_map)
+      with ops.device("/cpu:0"):
+        object_graph_tensor = constant_op.constant(
+            graph_proto.SerializeToString(), dtype=dtypes.string)
+      named_saveable_objects.append(
+          base.NoRestoreSaveable(
+              tensor=object_graph_tensor,
+              name=base.OBJECT_GRAPH_PROTO_KEY))
+    return named_saveable_objects
+
+  def objects_ids_and_slot_variables(self):
+    """Traverse the object graph and list all accessible objects.
+
+    Looks for `Trackable` objects which are dependencies of
+    `root_trackable`. Includes slot variables only if the variable they are
+    slotting for and the optimizer are dependencies of `root_trackable`
+    (i.e. if they would be saved with a checkpoint).
+
+    Returns:
+      A tuple of (trackable objects, object -> node id, slot variables)
+    """
+    trackable_objects, path_to_root = self._breadth_first_traversal()
+    object_names = object_identity.ObjectIdentityDictionary()
+    for obj, path in path_to_root.items():
+      object_names[obj] = _object_prefix_from_path(path)
+    node_ids = object_identity.ObjectIdentityDictionary()
+    for node_id, node in enumerate(trackable_objects):
+      node_ids[node] = node_id
+    slot_variables = _serialize_slot_variables(
+        trackable_objects=trackable_objects,
+        node_ids=node_ids,
+        object_names=object_names)
+    return trackable_objects, node_ids, slot_variables
+
+  def list_objects(self):
+    """Traverse the object graph and list all accessible objects."""
+    trackable_objects, _, _ = self.objects_ids_and_slot_variables()
+    return trackable_objects
diff --git a/tensorflow/python/training/checkpointable/layer_utils.py b/tensorflow/python/training/tracking/layer_utils.py
similarity index 88%
rename from tensorflow/python/training/checkpointable/layer_utils.py
rename to tensorflow/python/training/tracking/layer_utils.py
index ec764bca895e6c008e6f7049746953e04250159d..818563c32fa6ed726156781704b869978409652c 100644
--- a/tensorflow/python/training/checkpointable/layer_utils.py
+++ b/tensorflow/python/training/tracking/layer_utils.py
@@ -21,13 +21,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.training.tracking import object_identity
+
 
 def is_layer(obj):
   """Implicit check for Layer-like objects."""
   # TODO(b/110718070): Replace with isinstance(obj, base_layer.Layer).
-  return (hasattr(obj, "call")
-          and hasattr(obj, "build")
-          and hasattr(obj, "variables"))
+  return hasattr(obj, "_is_layer")
 
 
 def has_weights(obj):
@@ -38,15 +38,21 @@ def has_weights(obj):
 
 
 def filter_empty_layer_containers(layer_list):
-  """Filter out empty Layer-like containers."""
+  """Filter out empty Layer-like containers and uniquify."""
+  existing = object_identity.ObjectIdentitySet()
+  to_visit = layer_list[::-1]
   filtered = []
-  for obj in layer_list:
+  while to_visit:
+    obj = to_visit.pop()
+    if obj in existing:
+      continue
+    existing.add(obj)
     if is_layer(obj):
       filtered.append(obj)
     elif hasattr(obj, "layers"):
-      # Checkpointable data structures will not show up in ".layers" lists, but
+      # Trackable data structures will not show up in ".layers" lists, but
       # the layers they contain will.
-      filtered.extend(obj.layers)
+      to_visit.extend(obj.layers[::-1])
   return filtered
 
 
diff --git a/tensorflow/python/training/tracking/object_identity.py b/tensorflow/python/training/tracking/object_identity.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4eef5b34b55dbf41bea09c5ac6ec7aadaac70ad
--- /dev/null
+++ b/tensorflow/python/training/tracking/object_identity.py
@@ -0,0 +1,156 @@
+"""Utilities for collecting objects based on "is" comparison."""
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import weakref
+
+
+class _ObjectIdentityWrapper(object):
+  """Wraps an object, mapping __eq__ on wrapper to "is" on wrapped.
+
+  Since __eq__ is based on object identity, it's safe to also define __hash__
+  based on object ids. This lets us add unhashable types like trackable
+  _ListWrapper objects to object-identity collections.
+  """
+
+  def __init__(self, wrapped):
+    self._wrapped = wrapped
+
+  @property
+  def unwrapped(self):
+    return self._wrapped
+
+  def __eq__(self, other):
+    if isinstance(other, _ObjectIdentityWrapper):
+      return self._wrapped is other._wrapped  # pylint: disable=protected-access
+    return self._wrapped is other
+
+  def __hash__(self):
+    # Wrapper id() is also fine for weakrefs. In fact, we rely on
+    # id(weakref.ref(a)) == id(weakref.ref(a)) and weakref.ref(a) is
+    # weakref.ref(a) in _WeakObjectIdentityWrapper.
+    return id(self._wrapped)
+
+
+class _WeakObjectIdentityWrapper(_ObjectIdentityWrapper):
+
+  def __init__(self, wrapped):
+    super(_WeakObjectIdentityWrapper, self).__init__(weakref.ref(wrapped))
+
+  @property
+  def unwrapped(self):
+    return self._wrapped()
+
+
+class ObjectIdentityDictionary(collections.MutableMapping):
+  """A mutable mapping data structure which compares using "is".
+
+  This is necessary because we have trackable objects (_ListWrapper) which
+  have behavior identical to built-in Python lists (including being unhashable
+  and comparing based on the equality of their contents by default).
+  """
+
+  def __init__(self):
+    self._storage = {}
+
+  def _wrap_key(self, key):
+    return _ObjectIdentityWrapper(key)
+
+  def __getitem__(self, key):
+    return self._storage[self._wrap_key(key)]
+
+  def __setitem__(self, key, value):
+    self._storage[self._wrap_key(key)] = value
+
+  def __delitem__(self, key):
+    del self._storage[self._wrap_key(key)]
+
+  def __len__(self):
+    return len(self._storage)
+
+  def __iter__(self):
+    for key in self._storage:
+      yield key.unwrapped
+
+
+class ObjectIdentityWeakKeyDictionary(ObjectIdentityDictionary):
+  """Like weakref.WeakKeyDictionary, but compares objects with "is"."""
+
+  def _wrap_key(self, key):
+    return _WeakObjectIdentityWrapper(key)
+
+  def __len__(self):
+    # Iterate, discarding old weak refs
+    return len(list(self._storage))
+
+  def __iter__(self):
+    keys = self._storage.keys()
+    for key in keys:
+      unwrapped = key.unwrapped
+      if unwrapped is None:
+        del self[key]
+      else:
+        yield unwrapped
+
+
+class ObjectIdentitySet(collections.MutableSet):
+  """Like the built-in set, but compares objects with "is"."""
+
+  def __init__(self, *args):
+    self._storage = set([self._wrap_key(obj) for obj in list(*args)])
+
+  def _wrap_key(self, key):
+    return _ObjectIdentityWrapper(key)
+
+  def __contains__(self, key):
+    return self._wrap_key(key) in self._storage
+
+  def discard(self, key):
+    self._storage.discard(self._wrap_key(key))
+
+  def add(self, key):
+    self._storage.add(self._wrap_key(key))
+
+  def __len__(self):
+    return len(self._storage)
+
+  def __iter__(self):
+    keys = list(self._storage)
+    for key in keys:
+      yield key.unwrapped
+
+
+class ObjectIdentityWeakSet(ObjectIdentitySet):
+  """Like weakref.WeakSet, but compares objects with "is"."""
+
+  def _wrap_key(self, key):
+    return _WeakObjectIdentityWrapper(key)
+
+  def __len__(self):
+    # Iterate, discarding old weak refs
+    return len([_ for _ in self])
+
+  def __iter__(self):
+    keys = list(self._storage)
+    for key in keys:
+      unwrapped = key.unwrapped
+      if unwrapped is None:
+        self.discard(key)
+      else:
+        yield unwrapped
diff --git a/tensorflow/python/training/checkpointable/tracking.py b/tensorflow/python/training/tracking/tracking.py
similarity index 58%
rename from tensorflow/python/training/checkpointable/tracking.py
rename to tensorflow/python/training/tracking/tracking.py
index 4e96aee0c51d441c4a32ce68943e27dbf592349c..c8024e14ab14e0fa6a254458de184131d0d5ab91 100644
--- a/tensorflow/python/training/checkpointable/tracking.py
+++ b/tensorflow/python/training/tracking/tracking.py
@@ -1,4 +1,4 @@
-"""Dependency tracking for checkpointable objects."""
+"""Dependency tracking for trackable objects."""
 # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -17,12 +17,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
+from tensorflow.python.eager import function as defun
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.training.checkpointable import base
-from tensorflow.python.training.checkpointable import data_structures
+from tensorflow.python.training.tracking import base
+from tensorflow.python.training.tracking import data_structures
 from tensorflow.python.util import tf_contextlib
 
 
@@ -30,21 +30,21 @@ from tensorflow.python.util import tf_contextlib
 _RESOURCE_TRACKER_STACK = []
 
 
-class NotCheckpointable(object):
+class NotTrackable(object):
   """Marks instances of child classes as unsaveable using an object-based API.
 
-  Useful for marking objects which would otherwise look checkpointable because
-  of inheritance (e.g. through `Layer`) as not checkpointable. Inheriting from
-  `NotCheckpointable` does not prevent an object from being assigned to any
+  Useful for marking objects which would otherwise look trackable because
+  of inheritance (e.g. through `Layer`) as not trackable. Inheriting from
+  `NotTrackable` does not prevent an object from being assigned to any
   attributes, but will throw an error on save/restore.
   """
   pass
 
 
-class Checkpointable(base.CheckpointableBase):
+class AutoTrackable(base.Trackable):
   """Manages dependencies on other objects.
 
-  `Checkpointable` objects may have dependencies: other `Checkpointable` objects
+  `Trackable` objects may have dependencies: other `Trackable` objects
   which should be saved if the object declaring the dependency is saved. A
   correctly saveable program has a dependency graph such that if changing a
   global variable affects an object (e.g. changes the behavior of any of its
@@ -52,34 +52,60 @@ class Checkpointable(base.CheckpointableBase):
   the variable.
 
   Dependency edges have names, and are created implicitly when a
-  `Checkpointable` object is assigned to an attribute of another
-  `Checkpointable` object. For example:
+  `Trackable` object is assigned to an attribute of another
+  `Trackable` object. For example:
 
   ```
-  obj = Checkpointable()
+  obj = Trackable()
   obj.v = ResourceVariable(0.)
   ```
 
-  The `Checkpointable` object `obj` now has a dependency named "v" on a
+  The `Trackable` object `obj` now has a dependency named "v" on a
   variable.
 
-  `Checkpointable` objects may specify `Tensor`s to be saved and restored
+  `Trackable` objects may specify `Tensor`s to be saved and restored
   directly (e.g. a `Variable` indicating how to save itself) rather than through
   dependencies on other objects. See
-  `Checkpointable._gather_saveables_for_checkpoint` for details.
+  `Trackable._gather_saveables_for_checkpoint` for details.
   """
 
   def __setattr__(self, name, value):
-    """Support self.foo = checkpointable syntax."""
+    """Support self.foo = trackable syntax."""
     if getattr(self, "_setattr_tracking", True):
       value = data_structures.sticky_attribute_assignment(
-          checkpointable=self, value=value, name=name)
-    super(Checkpointable, self).__setattr__(name, value)
+          trackable=self, value=value, name=name)
+    super(AutoTrackable, self).__setattr__(name, value)
+
+  def __delattr__(self, name):
+    self._maybe_initialize_trackable()
+    if name in self._unconditional_dependency_names:
+      del self._unconditional_dependency_names[name]
+      for index, (dep_name, _) in enumerate(
+          self._unconditional_checkpoint_dependencies):
+        if dep_name == name:
+          del self._unconditional_checkpoint_dependencies[index]
+          break
+    super(AutoTrackable, self).__delattr__(name)
 
   def _no_dependency(self, value):
-    """Override to allow CheckpointableBase to disable dependency tracking."""
+    """Override to allow TrackableBase to disable dependency tracking."""
     return data_structures.NoDependency(value)
 
+  def _list_functions_for_serialization(self):
+    """Return a dict of `Function`s of a trackable."""
+    functions = dict()
+    for attribute_name in dir(self):
+      try:
+        attribute_value = getattr(self, attribute_name, None)
+      except Exception:  # pylint: disable=broad-except
+        # We really don't want to throw an exception just because some object's
+        # attribute accessor is broken.
+        attribute_value = None
+      if isinstance(attribute_value, (def_function.Function,
+                                      defun.ConcreteFunction)):
+        functions[attribute_name] = attribute_value
+    return functions
+
 
 class ResourceTracker(object):
   """An object that tracks a list of resources."""
@@ -124,7 +150,7 @@ def resource_tracker_scope(resource_tracker):
     _RESOURCE_TRACKER_STACK = old
 
 
-class TrackableResource(base.CheckpointableBase):
+class TrackableResource(base.Trackable):
   """Base class for all resources that need to be tracked."""
 
   def __init__(self):
@@ -150,29 +176,34 @@ class TrackableResource(base.CheckpointableBase):
       self._resource_handle = self.create_resource()
     return self._resource_handle
 
+  def _list_functions_for_serialization(self):
+    @def_function.function(input_signature=[], autograph=False)
+    def _creator():
+      resource = self.create_resource()
+      return resource
+
+    @def_function.function(input_signature=[], autograph=False)
+    def _initializer():
+      self.initialize()
+      return 1  # Dummy return
+
+    return {
+        "create_resource": _creator,
+        "initialize": _initializer,
+    }
+
 
-class TrackableAsset(base.CheckpointableBase):
+class TrackableAsset(base.Trackable):
   """Base class for asset files which need to be tracked."""
 
   def __init__(self, path):
     """Record the full path to the asset."""
-    # We use a variable here so that @tf.functions do not capture a literal
-    # value. The init_scope prevents functions from capturing `path` in an
+    # The init_scope prevents functions from capturing `path` in an
     # initialization graph, since it is transient and should not end up in a
-    # serialized function body. When serialized in a SavedModel, the variable
-    # will be set during the loading process to its location in the assets/
-    # directory.
+    # serialized function body.
     with ops.init_scope():
-      if context.executing_eagerly():
-        self._path = self._no_dependency(
-            resource_variable_ops.ResourceVariable(
-                path, dtype=dtypes.string,
-                name="asset_path"))
-      else:
-        # Adding a variable is too disruptive when v1-style graph building,
-        # since things may get fed and local variable initializers would then
-        # need to be run.
-        self._path = path
+      self._path = ops.internal_convert_to_tensor(path, dtype=dtypes.string,
+                                                  name="asset_path")
 
   @property
   def asset_path(self):
diff --git a/tensorflow/python/training/checkpointable/tracking_test.py b/tensorflow/python/training/tracking/tracking_test.py
similarity index 81%
rename from tensorflow/python/training/checkpointable/tracking_test.py
rename to tensorflow/python/training/tracking/tracking_test.py
index 17c5461bc25e5e409cc04d0182603e8406dc7d47..37e14174368f074c1e9f430d2001a5e3f57008c6 100644
--- a/tensorflow/python/training/checkpointable/tracking_test.py
+++ b/tensorflow/python/training/tracking/tracking_test.py
@@ -25,35 +25,35 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.keras.engine import training
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
-from tensorflow.python.training.checkpointable import base
-from tensorflow.python.training.checkpointable import data_structures
-from tensorflow.python.training.checkpointable import tracking
-from tensorflow.python.training.checkpointable import util
+from tensorflow.python.training.tracking import base
+from tensorflow.python.training.tracking import data_structures
+from tensorflow.python.training.tracking import tracking
+from tensorflow.python.training.tracking import util
 from tensorflow.python.util import nest
 
 
 class InterfaceTests(test.TestCase):
 
   def testMultipleAssignment(self):
-    root = tracking.Checkpointable()
-    root.leaf = tracking.Checkpointable()
+    root = tracking.AutoTrackable()
+    root.leaf = tracking.AutoTrackable()
     root.leaf = root.leaf
-    duplicate_name_dep = tracking.Checkpointable()
+    duplicate_name_dep = tracking.AutoTrackable()
     with self.assertRaisesRegexp(ValueError, "already declared"):
-      root._track_checkpointable(duplicate_name_dep, name="leaf")
+      root._track_trackable(duplicate_name_dep, name="leaf")
     # No error; we're overriding __setattr__, so we can't really stop people
     # from doing this while maintaining backward compatibility.
     root.leaf = duplicate_name_dep
-    root._track_checkpointable(duplicate_name_dep, name="leaf", overwrite=True)
+    root._track_trackable(duplicate_name_dep, name="leaf", overwrite=True)
     self.assertIs(duplicate_name_dep, root._lookup_dependency("leaf"))
     (_, dep_object), = root._checkpoint_dependencies
     self.assertIs(duplicate_name_dep, dep_object)
 
   def testNoDependency(self):
-    root = tracking.Checkpointable()
-    hasdep = tracking.Checkpointable()
+    root = tracking.AutoTrackable()
+    hasdep = tracking.AutoTrackable()
     root.hasdep = hasdep
-    nodep = tracking.Checkpointable()
+    nodep = tracking.AutoTrackable()
     root.nodep = data_structures.NoDependency(nodep)
     self.assertEqual(1, len(root._checkpoint_dependencies))
     self.assertIs(root._checkpoint_dependencies[0].ref, root.hasdep)
@@ -66,16 +66,31 @@ class InterfaceTests(test.TestCase):
       def __init__(self):
         super(NoDependencyModel, self).__init__()
         self.a = []
-        self.b = tracking.Checkpointable()
+        self.b = tracking.AutoTrackable()
 
     nodeps = NoDependencyModel()
     self.assertEqual([nodeps], util.list_objects(nodeps))
 
+  def testRemoveDependency(self):
+    root = tracking.AutoTrackable()
+    root.a = tracking.AutoTrackable()
+    self.assertEqual(1, len(root._checkpoint_dependencies))
+    self.assertEqual(1, len(root._unconditional_checkpoint_dependencies))
+    self.assertIs(root.a, root._checkpoint_dependencies[0].ref)
+    del root.a
+    self.assertFalse(hasattr(root, "a"))
+    self.assertEqual(0, len(root._checkpoint_dependencies))
+    self.assertEqual(0, len(root._unconditional_checkpoint_dependencies))
+    root.a = tracking.AutoTrackable()
+    self.assertEqual(1, len(root._checkpoint_dependencies))
+    self.assertEqual(1, len(root._unconditional_checkpoint_dependencies))
+    self.assertIs(root.a, root._checkpoint_dependencies[0].ref)
+
   def testListBasic(self):
-    a = tracking.Checkpointable()
-    b = tracking.Checkpointable()
+    a = tracking.AutoTrackable()
+    b = tracking.AutoTrackable()
     a.l = [b]
-    c = tracking.Checkpointable()
+    c = tracking.AutoTrackable()
     a.l.append(c)
     a_deps = util.list_objects(a)
     self.assertIn(b, a_deps)
@@ -87,10 +102,10 @@ class InterfaceTests(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
   def testMutationDirtiesList(self):
-    a = tracking.Checkpointable()
-    b = tracking.Checkpointable()
+    a = tracking.AutoTrackable()
+    b = tracking.AutoTrackable()
     a.l = [b]
-    c = tracking.Checkpointable()
+    c = tracking.AutoTrackable()
     a.l.insert(0, c)
     checkpoint = util.Checkpoint(a=a)
     with self.assertRaisesRegexp(ValueError, "A list element was replaced"):
@@ -98,11 +113,11 @@ class InterfaceTests(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
   def testOutOfBandEditDirtiesList(self):
-    a = tracking.Checkpointable()
-    b = tracking.Checkpointable()
+    a = tracking.AutoTrackable()
+    b = tracking.AutoTrackable()
     held_reference = [b]
     a.l = held_reference
-    c = tracking.Checkpointable()
+    c = tracking.AutoTrackable()
     held_reference.append(c)
     checkpoint = util.Checkpoint(a=a)
     with self.assertRaisesRegexp(ValueError, "The wrapped list was modified"):
@@ -110,25 +125,25 @@ class InterfaceTests(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
   def testNestedLists(self):
-    a = tracking.Checkpointable()
+    a = tracking.AutoTrackable()
     a.l = []
-    b = tracking.Checkpointable()
+    b = tracking.AutoTrackable()
     a.l.append([b])
-    c = tracking.Checkpointable()
+    c = tracking.AutoTrackable()
     a.l[0].append(c)
     a_deps = util.list_objects(a)
     self.assertIn(b, a_deps)
     self.assertIn(c, a_deps)
     a.l[0].append(1)
-    d = tracking.Checkpointable()
+    d = tracking.AutoTrackable()
     a.l[0].append(d)
     a_deps = util.list_objects(a)
     self.assertIn(d, a_deps)
     self.assertIn(b, a_deps)
     self.assertIn(c, a_deps)
     self.assertNotIn(1, a_deps)
-    e = tracking.Checkpointable()
-    f = tracking.Checkpointable()
+    e = tracking.AutoTrackable()
+    f = tracking.AutoTrackable()
     a.l1 = [[], [e]]
     a.l1[0].append(f)
     a_deps = util.list_objects(a)
@@ -183,7 +198,7 @@ class InterfaceTests(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
   def testAssertions(self):
-    a = tracking.Checkpointable()
+    a = tracking.AutoTrackable()
     a.l = {"k": [numpy.zeros([2, 2])]}
     self.assertAllEqual(nest.flatten({"k": [numpy.zeros([2, 2])]}),
                         nest.flatten(a.l))
diff --git a/tensorflow/python/training/checkpointable/util.py b/tensorflow/python/training/tracking/util.py
similarity index 62%
rename from tensorflow/python/training/checkpointable/util.py
rename to tensorflow/python/training/tracking/util.py
index a54f41a54fa1364af417a85e7faa9ee0693fada1..f5f70a443c4eb84d7783410909ee0698f03ab3c8 100644
--- a/tensorflow/python/training/checkpointable/util.py
+++ b/tensorflow/python/training/tracking/util.py
@@ -1,4 +1,4 @@
-"""Utilities for saving/loading Checkpointable objects."""
+"""Utilities for saving/loading Trackable objects."""
 # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -18,11 +18,10 @@ from __future__ import division
 from __future__ import print_function
 
 import abc
-import collections
 import os
 import weakref
 
-from tensorflow.core.protobuf import checkpointable_object_graph_pb2
+from tensorflow.core.protobuf import trackable_object_graph_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.eager import context
@@ -32,6 +31,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_io_ops as io_ops
@@ -39,45 +39,29 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.training import checkpoint_management
-from tensorflow.python.training import optimizer as optimizer_lib
 from tensorflow.python.training import saver as v1_saver_lib
-from tensorflow.python.training.checkpointable import base
-from tensorflow.python.training.checkpointable import data_structures
-from tensorflow.python.training.checkpointable import tracking
 from tensorflow.python.training.saving import functional_saver
-from tensorflow.python.training.saving import saveable_object as saveable_object_lib
 from tensorflow.python.training.saving import saveable_object_util
+from tensorflow.python.training.tracking import base
+from tensorflow.python.training.tracking import data_structures
+from tensorflow.python.training.tracking import graph_view as graph_view_lib
+from tensorflow.python.training.tracking import object_identity
+from tensorflow.python.training.tracking import tracking
 from tensorflow.python.util import compat
 from tensorflow.python.util import deprecation
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util.tf_export import tf_export
 
 
-_ESCAPE_CHAR = "."  # For avoiding conflicts with user-specified names.
-
-# Keyword for identifying that the next bit of a checkpoint variable name is a
-# slot name. Checkpoint names for slot variables look like:
-#
-#   <path to variable>/<_OPTIMIZER_SLOTS_NAME>/<path to optimizer>/<slot name>
-#
-# Where <path to variable> is a full path from the checkpoint root to the
-# variable being slotted for.
-_OPTIMIZER_SLOTS_NAME = _ESCAPE_CHAR + "OPTIMIZER_SLOT"
-# Keyword for separating the path to an object from the name of an
-# attribute in checkpoint names. Used like:
-#   <path to variable>/<_OBJECT_ATTRIBUTES_NAME>/<name of attribute>
-_OBJECT_ATTRIBUTES_NAME = _ESCAPE_CHAR + "ATTRIBUTES"
-
-
 class _CheckpointRestoreCoordinator(object):
   """Holds the status of an object-based checkpoint load."""
 
   def __init__(self, object_graph_proto, save_path, save_path_tensor,
-               restore_op_cache, saveable_object_cache):
+               restore_op_cache, graph_view):
     """Specify the checkpoint being loaded.
 
     Args:
-      object_graph_proto: The CheckpointableObjectGraph protocol buffer
+      object_graph_proto: The TrackableObjectGraph protocol buffer
         associated with this checkpoint.
       save_path: A string, the path to the checkpoint, as returned by
         `tf.train.latest_checkpoint`.
@@ -87,10 +71,8 @@ class _CheckpointRestoreCoordinator(object):
         `_CheckpointRestoreCoordinator`s for the same Python objects, used to
         look up restore ops by name to avoid re-creating them across multiple
         `restore()` calls.
-      saveable_object_cache: A mapping of checkpointable objects -> attribute
-        names -> list(`SaveableObject`s), used when `SaveableObjects` must be
-        referenced every restore (e.g. for Python state); otherwise they would
-        create their own ops every restore.
+      graph_view: A graph_view_lib.ObjectGraphView object for the restored
+        objects.
     """
     self.object_graph_proto = object_graph_proto
     self.restore_uid = ops.uid()
@@ -98,7 +80,7 @@ class _CheckpointRestoreCoordinator(object):
     # not loaded into any object, for error checking.
     self.unused_attributes = weakref.WeakKeyDictionary()
     # Dictionary mapping from an id in the protocol buffer flat array to
-    # Checkpointable Python objects. This mapping may be deferred if a
+    # Trackable Python objects. This mapping may be deferred if a
     # checkpoint is restored before all dependencies have been tracked. Uses
     # weak references so that partial restorations don't create reference cycles
     # (as objects with deferred dependencies will generally have references to
@@ -108,7 +90,7 @@ class _CheckpointRestoreCoordinator(object):
     # use them (for example because of inconsistent references when
     # loading). Used to make status assertions fail when loading checkpoints
     # that don't quite match.
-    self.all_python_objects = _ObjectIdentityWeakSet()
+    self.all_python_objects = object_identity.ObjectIdentityWeakSet()
     self.save_path_tensor = save_path_tensor
     self.save_path_string = save_path
     self.dtype_map = pywrap_tensorflow.NewCheckpointReader(
@@ -119,7 +101,7 @@ class _CheckpointRestoreCoordinator(object):
     # this checkpoint.
     self.restore_ops = []
     self.restore_ops_by_name = restore_op_cache
-    self.saveable_object_cache = saveable_object_cache
+    self.graph_view = graph_view
     self.new_restore_ops_callback = None
     # A mapping from optimizer proto ids to lists of slot variables to be
     # restored when the optimizer is tracked. Only includes slot variables whose
@@ -176,25 +158,13 @@ class _CheckpointRestoreCoordinator(object):
         raise AssertionError(
             ("Saveable keys changed when validating. Got back %s, was "
              "expecting %s") % (tensor_saveables.keys(), validated_names))
-      for saveable in validated_saveables:
-        if saveable.device:
-          device = saveable_object_util.set_cpu0(saveable.device)
-        else:
-          device = None
-        with ops.device(device):
-          tensors = []
-          for spec in saveable.specs:
-            tensors.append(
-                io_ops.restore_v2(
-                    self.save_path_tensor,
-                    [spec.name],
-                    [spec.slice_spec],
-                    [spec.dtype])[0])
-          restore_op = saveable.restore(tensors, restored_shapes=None)
-        if not context.executing_eagerly():
+      new_restore_ops = functional_saver.restore_from_saveable_objects(
+          self.save_path_tensor, validated_saveables)
+      if not context.executing_eagerly():
+        restore_ops.extend(new_restore_ops)
+        for saveable, restore_op in zip(validated_saveables, new_restore_ops):
           assert saveable.name not in self.restore_ops_by_name
           self.restore_ops_by_name[saveable.name] = restore_op
-          restore_ops.append(restore_op)
     return restore_ops
 
 
@@ -207,7 +177,7 @@ class _NameBasedRestoreCoordinator(object):
     self.unused_attributes = weakref.WeakKeyDictionary()
     self.restore_uid = ops.uid()
 
-  def globally_named_object_attributes(self, checkpointable):
+  def globally_named_object_attributes(self, trackable):
     """Create globally named SaveableObjects from attributes.
 
     If an object's attribute has no global name specified (default construction
@@ -216,13 +186,13 @@ class _NameBasedRestoreCoordinator(object):
     fail; see `NameBasedSaverStatus`).
 
     Args:
-      checkpointable: An object to save.
+      trackable: An object to save.
 
     Yields:
-      SaveableObjects for `checkpointable`'s attributes.
+      SaveableObjects for `trackable`'s attributes.
     """
     for attribute_name, saveable_factory in (
-        checkpointable._gather_saveables_for_checkpoint().items()):  # pylint: disable=protected-access
+        trackable._gather_saveables_for_checkpoint().items()):  # pylint: disable=protected-access
       if callable(saveable_factory):
         try:
           # This saveable object factory does not have a default name= argument,
@@ -231,7 +201,7 @@ class _NameBasedRestoreCoordinator(object):
           # fails.
           saveable = saveable_factory()
         except TypeError:
-          self.unused_attributes.setdefault(checkpointable, []).append(
+          self.unused_attributes.setdefault(trackable, []).append(
               attribute_name)
           continue
       else:
@@ -244,14 +214,14 @@ class _NameBasedRestoreCoordinator(object):
             op=op, name=name):
           yield saveable_object
 
-  def eager_restore(self, checkpointable):
-    """Runs restore ops for `checkpointable`'s attributes."""
+  def eager_restore(self, trackable):
+    """Runs restore ops for `trackable`'s attributes."""
     # When graph building, we don't add any restore ops to the graph until
     # run_restore_ops/initialize_or_restore on the status object for name-based
     # checkpoints.
     assert context.executing_eagerly()
     for saveable in self.globally_named_object_attributes(
-        checkpointable):
+        trackable):
       restored_tensors = []
       tensor_missing = False
       for spec in saveable.specs:
@@ -311,10 +281,10 @@ def _default_getter(name, shape, dtype, initializer=None,
     )
 
 
-def add_variable(checkpointable, name, shape=None, dtype=dtypes.float32,
+def add_variable(trackable, name, shape=None, dtype=dtypes.float32,
                  initializer=None):
-  """Add a variable to a Checkpointable with no scope influence."""
-  return checkpointable._add_variable_with_custom_getter(  # pylint: disable=protected-access
+  """Add a variable to a Trackable with no scope influence."""
+  return trackable._add_variable_with_custom_getter(  # pylint: disable=protected-access
       name=name, shape=shape, dtype=dtype,
       initializer=initializer, getter=_default_getter)
 
@@ -337,7 +307,7 @@ def object_metadata(save_path):
     save_path: The path to the checkpoint, as returned by `save` or
       `tf.train.latest_checkpoint`.
   Returns:
-    A parsed `tf.contrib.checkpoint.CheckpointableObjectGraph` protocol buffer.
+    A parsed `tf.contrib.checkpoint.TrackableObjectGraph` protocol buffer.
   Raises:
     ValueError: If an object graph was not found in the checkpoint.
   """
@@ -352,492 +322,44 @@ def object_metadata(save_path):
          'saver and does not contain an object dependency graph.') % (
              save_path, base.OBJECT_GRAPH_PROTO_KEY))
   object_graph_proto = (
-      checkpointable_object_graph_pb2.CheckpointableObjectGraph())
+      trackable_object_graph_pb2.TrackableObjectGraph())
   object_graph_proto.ParseFromString(object_graph_string)
   return object_graph_proto
 
 
-class _ObjectIdentityWrapper(object):
-  """Wraps an object, mapping __eq__ on wrapper to "is" on wrapped.
-
-  Since __eq__ is based on object identity, it's safe to also define __hash__
-  based on object ids. This lets us add unhashable types like checkpointable
-  _ListWrapper objects to object-identity collections.
-  """
-
-  def __init__(self, wrapped):
-    self._wrapped = wrapped
-
-  @property
-  def unwrapped(self):
-    return self._wrapped
-
-  def __eq__(self, other):
-    if isinstance(other, _ObjectIdentityWrapper):
-      return self._wrapped is other._wrapped  # pylint: disable=protected-access
-    return self._wrapped is other
-
-  def __hash__(self):
-    # Wrapper id() is also fine for weakrefs. In fact, we rely on
-    # id(weakref.ref(a)) == id(weakref.ref(a)) and weakref.ref(a) is
-    # weakref.ref(a) in _WeakObjectIdentityWrapper.
-    return id(self._wrapped)
-
-
-class _WeakObjectIdentityWrapper(_ObjectIdentityWrapper):
-
-  def __init__(self, wrapped):
-    super(_WeakObjectIdentityWrapper, self).__init__(weakref.ref(wrapped))
-
-  @property
-  def unwrapped(self):
-    return self._wrapped()
-
-
-class _ObjectIdentityDictionary(collections.MutableMapping):
-  """A mutable mapping data structure which compares using "is".
-
-  This is necessary because we have checkpointable objects (_ListWrapper) which
-  have behavior identical to built-in Python lists (including being unhashable
-  and comparing based on the equality of their contents by default).
-  """
-
-  def __init__(self):
-    self._storage = {}
-
-  def _wrap_key(self, key):
-    return _ObjectIdentityWrapper(key)
-
-  def __getitem__(self, key):
-    return self._storage[self._wrap_key(key)]
-
-  def __setitem__(self, key, value):
-    self._storage[self._wrap_key(key)] = value
-
-  def __delitem__(self, key):
-    del self._storage[self._wrap_key(key)]
-
-  def __len__(self):
-    return len(self._storage)
-
-  def __iter__(self):
-    for key in self._storage:
-      yield key.unwrapped
-
-
-class _ObjectIdentityWeakKeyDictionary(_ObjectIdentityDictionary):
-  """Like weakref.WeakKeyDictionary, but compares objects with "is"."""
-
-  def _wrap_key(self, key):
-    return _WeakObjectIdentityWrapper(key)
-
-  def __len__(self):
-    # Iterate, discarding old weak refs
-    return len(list(self._storage))
-
-  def __iter__(self):
-    keys = self._storage.keys()
-    for key in keys:
-      unwrapped = key.unwrapped
-      if unwrapped is None:
-        del self[key]
-      else:
-        yield unwrapped
-
-
-class _ObjectIdentitySet(collections.MutableSet):
-  """Like the built-in set, but compares objects with "is"."""
-
-  def __init__(self, *args):
-    self._storage = set([self._wrap_key(obj) for obj in list(*args)])
-
-  def _wrap_key(self, key):
-    return _ObjectIdentityWrapper(key)
-
-  def __contains__(self, key):
-    return self._wrap_key(key) in self._storage
-
-  def discard(self, key):
-    self._storage.discard(self._wrap_key(key))
-
-  def add(self, key):
-    self._storage.add(self._wrap_key(key))
-
-  def __len__(self):
-    return len(self._storage)
-
-  def __iter__(self):
-    keys = list(self._storage)
-    for key in keys:
-      yield key.unwrapped
-
-
-class _ObjectIdentityWeakSet(_ObjectIdentitySet):
-  """Like weakref.WeakSet, but compares objects with "is"."""
-
-  def _wrap_key(self, key):
-    return _WeakObjectIdentityWrapper(key)
-
-  def __len__(self):
-    # Iterate, discarding old weak refs
-    return len([_ for _ in self])
-
-  def __iter__(self):
-    keys = list(self._storage)
-    for key in keys:
-      unwrapped = key.unwrapped
-      if unwrapped is None:
-        self.discard(key)
-      else:
-        yield unwrapped
-
-
-def _breadth_first_checkpointable_traversal(root_checkpointable):
-  """Find shortest paths to all variables owned by dependencies of root."""
-  bfs_sorted = []
-  to_visit = collections.deque([root_checkpointable])
-  path_to_root = _ObjectIdentityDictionary()
-  path_to_root[root_checkpointable] = ()
-  while to_visit:
-    current_checkpointable = to_visit.popleft()
-    if isinstance(current_checkpointable, tracking.NotCheckpointable):
-      raise NotImplementedError(
-          ("The object %s does not support object-based saving. File a feature "
-           "request if this limitation bothers you. In the meantime, you can "
-           "remove the dependency on this object and save everything else.")
-          % (current_checkpointable,))
-    current_checkpointable._maybe_initialize_checkpointable()  # pylint: disable=protected-access
-    bfs_sorted.append(current_checkpointable)
-    for child_checkpointable in (
-        current_checkpointable._checkpoint_dependencies):  # pylint: disable=protected-access
-      if child_checkpointable.ref not in path_to_root:
-        path_to_root[child_checkpointable.ref] = (
-            path_to_root[current_checkpointable] + (child_checkpointable,))
-        to_visit.append(child_checkpointable.ref)
-  return bfs_sorted, path_to_root
-
-
-def _escape_local_name(name):
-  # We need to support slashes in local names for compatibility, since this
-  # naming scheme is being patched in to things like Layer.add_variable where
-  # slashes were previously accepted. We also want to use slashes to indicate
-  # edges traversed to reach the variable, so we escape forward slashes in
-  # names.
-  return (name.replace(_ESCAPE_CHAR, _ESCAPE_CHAR + _ESCAPE_CHAR)
-          .replace(r"/", _ESCAPE_CHAR + "S"))
-
-
-def _object_prefix_from_path(path_to_root):
-  return "/".join(
-      (_escape_local_name(checkpointable.name)
-       for checkpointable in path_to_root))
-
-
-def _slot_variable_naming_for_optimizer(optimizer_path):
-  """Make a function for naming slot variables in an optimizer."""
-  # Name slot variables:
-  #
-  #   <variable name>/<_OPTIMIZER_SLOTS_NAME>/<optimizer path>/<slot name>
-  #
-  # where <variable name> is exactly the checkpoint name used for the original
-  # variable, including the path from the checkpoint root and the local name in
-  # the object which owns it. Note that we only save slot variables if the
-  # variable it's slotting for is also being saved.
-
-  optimizer_identifier = "/%s/%s/" % (_OPTIMIZER_SLOTS_NAME, optimizer_path)
-
-  def _name_slot_variable(variable_path, slot_name):
-    """With an optimizer specified, name a slot variable."""
-    return (variable_path
-            + optimizer_identifier
-            + _escape_local_name(slot_name))
-
-  return _name_slot_variable
-
-
-def _serialize_slot_variables(checkpointable_objects, node_ids, object_names):
-  """Gather and name slot variables."""
-  non_slot_objects = list(checkpointable_objects)
-  slot_variables = _ObjectIdentityDictionary()
-  for checkpointable in non_slot_objects:
-    if isinstance(checkpointable, optimizer_lib.Optimizer):
-      naming_scheme = _slot_variable_naming_for_optimizer(
-          optimizer_path=object_names[checkpointable])
-      slot_names = checkpointable.get_slot_names()
-      for slot_name in slot_names:
-        for original_variable_node_id, original_variable in enumerate(
-            non_slot_objects):
-          try:
-            slot_variable = checkpointable.get_slot(
-                original_variable, slot_name)
-          except AttributeError:
-            slot_variable = None
-          if slot_variable is None:
-            continue
-          slot_variable._maybe_initialize_checkpointable()  # pylint: disable=protected-access
-          if slot_variable._checkpoint_dependencies:  # pylint: disable=protected-access
-            # TODO(allenl): Gather dependencies of slot variables.
-            raise NotImplementedError(
-                "Currently only variables with no dependencies can be saved as "
-                "slot variables. File a feature request if this limitation "
-                "bothers you.")
-          if slot_variable in node_ids:
-            raise NotImplementedError(
-                "A slot variable was re-used as a dependency of a "
-                "Checkpointable object. This is not currently allowed. File a "
-                "feature request if this limitation bothers you.")
-          checkpoint_name = naming_scheme(
-              variable_path=object_names[original_variable],
-              slot_name=slot_name)
-          object_names[slot_variable] = checkpoint_name
-          slot_variable_node_id = len(checkpointable_objects)
-          node_ids[slot_variable] = slot_variable_node_id
-          checkpointable_objects.append(slot_variable)
-          slot_variable_proto = (
-              checkpointable_object_graph_pb2.CheckpointableObjectGraph
-              .CheckpointableObject.SlotVariableReference(
-                  slot_name=slot_name,
-                  original_variable_node_id=original_variable_node_id,
-                  slot_variable_node_id=slot_variable_node_id))
-          slot_variables.setdefault(checkpointable, []).append(
-              slot_variable_proto)
-  return slot_variables
-
-
-def _add_attributes_to_object_graph(
-    checkpointable_objects, object_graph_proto, node_ids, object_names,
-    saveables_cache, object_map):
-  """Create SaveableObjects and corresponding SerializedTensor protos."""
-  named_saveable_objects = []
-  if saveables_cache is None:
-    # No SaveableObject caching. Either we're executing eagerly, or building a
-    # static save which is specialized to the current Python state.
-    feed_additions = None
-  else:
-    # If we are caching SaveableObjects, we need to build up a feed_dict with
-    # functions computing volatile Python state to be saved with the checkpoint.
-    feed_additions = {}
-  for checkpoint_id, (checkpointable, object_proto) in enumerate(
-      zip(checkpointable_objects, object_graph_proto.nodes)):
-    assert node_ids[checkpointable] == checkpoint_id
-    object_name = object_names[checkpointable]
-    if object_map:
-      object_to_save = object_map.get(checkpointable, checkpointable)
-    else:
-      object_to_save = checkpointable
-    if saveables_cache is not None:
-      cached_attributes = saveables_cache.setdefault(object_to_save, {})
-    else:
-      cached_attributes = None
-
-    for name, saveable_factory in (
-        object_to_save._gather_saveables_for_checkpoint().items()):  # pylint: disable=protected-access
-      attribute = object_proto.attributes.add()
-      attribute.name = name
-      attribute.checkpoint_key = "%s/%s/%s" % (
-          object_name, _OBJECT_ATTRIBUTES_NAME, _escape_local_name(name))
-      if cached_attributes is None:
-        saveables = None
-      else:
-        saveables = cached_attributes.get(name, None)
-        if saveables is not None:
-          for saveable in saveables:
-            if attribute.checkpoint_key not in saveable.name:
-              # The checkpoint key for this SaveableObject is different. We need
-              # to re-create it.
-              saveables = None
-              del cached_attributes[name]
-              break
-      if saveables is None:
-        if callable(saveable_factory):
-          maybe_saveable = saveable_factory(name=attribute.checkpoint_key)
-        else:
-          maybe_saveable = saveable_factory
-        if isinstance(maybe_saveable, saveable_object_lib.SaveableObject):
-          saveables = (maybe_saveable,)
-        else:
-          # Figure out the name-based Saver's name for this variable. If it's
-          # already a SaveableObject we'd just get the checkpoint key back, so
-          # we leave full_name blank.
-          saver_dict = saveable_object_util.op_list_to_dict(
-              [maybe_saveable], convert_variable_to_tensor=False)
-          full_name, = saver_dict.keys()
-          saveables = tuple(saveable_object_util.saveable_objects_for_op(
-              op=maybe_saveable, name=attribute.checkpoint_key))
-          for saveable in saveables:
-            saveable.full_name = full_name
-        for saveable in saveables:
-          if attribute.checkpoint_key not in saveable.name:
-            raise AssertionError(
-                ("The object %s produced a SaveableObject with name '%s' for "
-                 "attribute '%s'. Expected a name containing '%s'.")
-                % (checkpointable, name, saveable.name,
-                   attribute.checkpoint_key))
-        if cached_attributes is not None:
-          cached_attributes[name] = saveables
-
-      for saveable in saveables:
-        if hasattr(saveable, "full_name"):
-          attribute.full_name = saveable.full_name
-        if isinstance(saveable, base.PythonStateSaveable):
-          if feed_additions is None:
-            assert saveables_cache is None
-            # If we're not caching saveables, then we're either executing
-            # eagerly or building a static save/restore (e.g. for a
-            # SavedModel). In either case, we should embed the current Python
-            # state in the graph rather than relying on a feed dict.
-            saveable = saveable.freeze()
-          else:
-            saveable_feed_dict = saveable.feed_dict_additions()
-            for new_feed_key in saveable_feed_dict.keys():
-              if new_feed_key in feed_additions:
-                raise AssertionError(
-                    ("The object %s tried to feed a value for the Tensor %s "
-                     "when saving, but another object is already feeding a "
-                     "value.")
-                    % (checkpointable, new_feed_key))
-            feed_additions.update(saveable_feed_dict)
-        named_saveable_objects.append(saveable)
-
-  return named_saveable_objects, feed_additions
-
-
-def fill_object_graph_proto(checkpointable_objects,
-                            node_ids,
-                            slot_variables,
-                            object_graph_proto=None):
-  """Name non-slot `Checkpointable`s and add them to `object_graph_proto`."""
-  if object_graph_proto is None:
-    object_graph_proto = (
-        checkpointable_object_graph_pb2.CheckpointableObjectGraph())
-  for checkpoint_id, checkpointable in enumerate(checkpointable_objects):
-    assert node_ids[checkpointable] == checkpoint_id
-    object_proto = object_graph_proto.nodes.add()
-    object_proto.slot_variables.extend(slot_variables.get(checkpointable, ()))
-    for child in checkpointable._checkpoint_dependencies:  # pylint: disable=protected-access
-      child_proto = object_proto.children.add()
-      child_proto.node_id = node_ids[child.ref]
-      child_proto.local_name = child.name
-  return object_graph_proto
-
-
-def _serialize_gathered_objects(
-    checkpointable_objects, path_to_root, saveables_cache, object_map):
-  """Create SaveableObjects and protos for gathered objects."""
-  object_names = _ObjectIdentityDictionary()
-  for obj, path in path_to_root.items():
-    object_names[obj] = _object_prefix_from_path(path)
-  node_ids = _ObjectIdentityDictionary()
-  for node_id, node in enumerate(checkpointable_objects):
-    node_ids[node] = node_id
-  slot_variables = _serialize_slot_variables(
-      checkpointable_objects=checkpointable_objects,
-      node_ids=node_ids,
-      object_names=object_names)
-  object_graph_proto = fill_object_graph_proto(
-      checkpointable_objects=checkpointable_objects,
-      node_ids=node_ids,
-      slot_variables=slot_variables)
-  named_saveable_objects, feed_additions = _add_attributes_to_object_graph(
-      checkpointable_objects=checkpointable_objects,
-      object_graph_proto=object_graph_proto,
-      node_ids=node_ids,
-      object_names=object_names,
-      saveables_cache=saveables_cache,
-      object_map=object_map)
-  return named_saveable_objects, object_graph_proto, feed_additions
-
-
-def _serialize_object_graph(root_checkpointable, saveables_cache):
-  """Determine checkpoint keys for variables and build a serialized graph.
-
-  Non-slot variables are keyed based on a shortest path from the root saveable
-  to the object which owns the variable (i.e. the one which called
-  `Checkpointable._add_variable` to create it).
-
-  Slot variables are keyed based on a shortest path to the variable being
-  slotted for, a shortest path to their optimizer, and the slot name.
-
-  Args:
-    root_checkpointable: A `Checkpointable` object whose variables (including
-      the variables of dependencies, recursively) should be saved.
-    saveables_cache: A dictionary mapping `Checkpointable` objects -> attribute
-      names -> SaveableObjects, used to avoid re-creating SaveableObjects when
-      graph building.
-
-  Returns:
-    A tuple of (named_variables, object_graph_proto, feed_additions):
-      named_variables: A dictionary mapping names to variable objects.
-      object_graph_proto: A CheckpointableObjectGraph protocol buffer containing
-        the serialized object graph and variable references.
-      feed_additions: A dictionary mapping from Tensors to values which should
-        be fed when saving.
-
-  Raises:
-    ValueError: If there are invalid characters in an optimizer's slot names.
-  """
-  checkpointable_objects, path_to_root = (
-      _breadth_first_checkpointable_traversal(root_checkpointable))
-  return _serialize_gathered_objects(
-      checkpointable_objects, path_to_root, saveables_cache, object_map=None)
-
-
-def named_saveables(root_checkpointable):
-  """Gather list of all SaveableObjects in the Checkpointable object."""
-  return _serialize_object_graph(root_checkpointable, None)[0]
-
-
-def find_objects(root_checkpointable):
-  """Find and number objects which are dependencies of `root_checkpointable`."""
-  checkpointable_objects, path_to_root = (
-      _breadth_first_checkpointable_traversal(root_checkpointable))
-  object_names = _ObjectIdentityDictionary()
-  for obj, path in path_to_root.items():
-    object_names[obj] = _object_prefix_from_path(path)
-  node_ids = _ObjectIdentityDictionary()
-  for node_id, node in enumerate(checkpointable_objects):
-    node_ids[node] = node_id
-  slot_variables = _serialize_slot_variables(
-      checkpointable_objects=checkpointable_objects,
-      node_ids=node_ids,
-      object_names=object_names)
-  return checkpointable_objects, node_ids, slot_variables
-
-
-def list_objects(root_checkpointable):
+def list_objects(root_trackable):
   """Traverse the object graph and list all accessible objects.
 
-  Looks for `Checkpointable` objects which are dependencies of
-  `root_checkpointable`. Includes slot variables only if the variable they are
-  slotting for and the optimizer are dependencies of `root_checkpointable`
+  Looks for `Trackable` objects which are dependencies of
+  `root_trackable`. Includes slot variables only if the variable they are
+  slotting for and the optimizer are dependencies of `root_trackable`
   (i.e. if they would be saved with a checkpoint).
 
   Args:
-    root_checkpointable: A `Checkpointable` object whose dependencies should be
+    root_trackable: A `Trackable` object whose dependencies should be
       flattened.
   Returns:
     A flat list of objects.
   """
-  checkpointable_objects, _, _ = find_objects(root_checkpointable)
-  return checkpointable_objects
+  return graph_view_lib.ObjectGraphView(root_trackable).list_objects()
 
 
-def gather_initializers(root_checkpointable):
+def gather_initializers(root_trackable):
   """Traverse the object graph and find initialization ops.
 
-  Looks for `Checkpointable` objects which are dependencies of
-  `root_checkpointable` and which have an `initializer` property. Includes
+  Looks for `Trackable` objects which are dependencies of
+  `root_trackable` and which have an `initializer` property. Includes
   initializers for slot variables only if the variable they are slotting for and
-  the optimizer are dependencies of `root_checkpointable` (i.e. if they would be
+  the optimizer are dependencies of `root_trackable` (i.e. if they would be
   saved with a checkpoint).
 
   Args:
-    root_checkpointable: A `Checkpointable` object to gather initializers for.
+    root_trackable: A `Trackable` object to gather initializers for.
   Returns:
     A list of initialization ops.
   """
-  checkpointable_objects = list_objects(root_checkpointable)
-  return [c.initializer for c in checkpointable_objects
+  trackable_objects = list_objects(root_trackable)
+  return [c.initializer for c in trackable_objects
           if hasattr(c, "initializer") and c.initializer is not None]
 
 
@@ -847,7 +369,7 @@ def capture_dependencies(template):
 
   Requires that `template.variable_scope` is active.
 
-  This scope is intended as a compatibility measure, allowing a checkpointable
+  This scope is intended as a compatibility measure, allowing a trackable
   object to add dependencies on variables created in a block of code which is
   not aware of object-based saving (and instead uses variable names
   heavily). This is how `Template` objects add dependencies on variables and
@@ -861,17 +383,17 @@ def capture_dependencies(template):
   """
   name_prefix = template.variable_scope.name
 
-  def _checkpointable_custom_creator(next_creator, name, initial_value,
-                                     checkpointable_parent=None, **kwargs):
-    """A variable creation hook which adds Checkpointable dependencies.
+  def _trackable_custom_creator(next_creator, name, initial_value,
+                                trackable_parent=None, **kwargs):
+    """A variable creation hook which adds Trackable dependencies.
 
     Set for example during a `Template`'s first wrapped function
-    execution. Ensures that (a) `template` depends on any checkpointable
+    execution. Ensures that (a) `template` depends on any trackable
     objects using their own `capture_dependencies` scope inside this scope which
     create variables, and (b) that any variables not in a more deeply nested
     scope are added as dependencies directly.
 
-    The `checkpointable_parent` argument is passed between custom creators but
+    The `trackable_parent` argument is passed between custom creators but
     ignored when the variable object itself is created. This argument indicates
     (if not `None`) that a more deeply nested scope has already added the
     variable as a dependency, and that parent scopes should add a dependency on
@@ -885,8 +407,8 @@ def capture_dependencies(template):
         but scopes opened within this scope are respected.
       initial_value: See `variable_scope.variable_creator_scope`. Taken
         explicitly so the argument can be re-named and used with
-        `Checkpointable._add_variable_with_custom_getter`.
-      checkpointable_parent: If not None, a more deeply nested checkpointable
+        `Trackable._add_variable_with_custom_getter`.
+      trackable_parent: If not None, a more deeply nested trackable
         object and its name prefix which were passed to `capture_dependencies`
         to add a dependency on (rather than depending on the variable directly).
       **kwargs: Passed through to the next creator.
@@ -903,28 +425,28 @@ def capture_dependencies(template):
           **inner_kwargs)
     if name is not None and name.startswith(name_prefix):
       scope_stripped_name = name[len(name_prefix) + 1:]
-      if not checkpointable_parent:
+      if not trackable_parent:
         return template._add_variable_with_custom_getter(  # pylint: disable=protected-access
             initializer=initial_value,
             name=scope_stripped_name,
             getter=_call_next_creator_renaming_initializer,
-            # Disable error checking for Checkpointable. Exceptions are instead
+            # Disable error checking for Trackable. Exceptions are instead
             # raised if necessary when the object-based saver tries to
             # save/restore the object.
             overwrite=True,
-            checkpointable_parent=(template, name_prefix),
+            trackable_parent=(template, name_prefix),
             **kwargs)
       else:
-        parent_object, parent_name_prefix = checkpointable_parent
-        template._track_checkpointable(  # pylint: disable=protected-access
+        parent_object, parent_name_prefix = trackable_parent
+        template._track_trackable(  # pylint: disable=protected-access
             parent_object,
             name=parent_name_prefix[len(name_prefix) + 1:],
             overwrite=True)
     return next_creator(
         name=name, initial_value=initial_value,
-        checkpointable_parent=(template, name_prefix), **kwargs)
+        trackable_parent=(template, name_prefix), **kwargs)
 
-  with variable_scope.variable_creator_scope(_checkpointable_custom_creator):
+  with variable_scope.variable_creator_scope(_trackable_custom_creator):
     yield
 
 
@@ -1000,10 +522,10 @@ class CheckpointLoadStatus(_LoadStatus):
   See `Saver.restore` for usage examples.
   """
 
-  def __init__(self, checkpoint, feed_dict, root_checkpointable):
+  def __init__(self, checkpoint, feed_dict, graph_view):
     self._checkpoint = checkpoint
     self._feed_dict = feed_dict
-    self._root_checkpointable = root_checkpointable
+    self._graph_view = graph_view
 
   def assert_consumed(self):
     """Asserts that all objects in the checkpoint have been created/matched.
@@ -1018,8 +540,8 @@ class CheckpointLoadStatus(_LoadStatus):
     """
     self.assert_existing_objects_matched()
     for node_id, node in enumerate(self._checkpoint.object_graph_proto.nodes):
-      checkpointable = self._checkpoint.object_by_proto_id.get(node_id, None)
-      if checkpointable is None:
+      trackable = self._checkpoint.object_by_proto_id.get(node_id, None)
+      if trackable is None:
         raise AssertionError("Unresolved object in checkpoint: %s" % (node,))
     if self._checkpoint.slot_restorations:
       # Sanity check; this collection should be clear if everything has been
@@ -1030,11 +552,11 @@ class CheckpointLoadStatus(_LoadStatus):
       raise AssertionError(
           ("Unused attributes in these objects (the attributes exist in the "
            "checkpoint but not in the objects): %s") % (
-               self._checkpoint.unused_attributes.items(),))
+               list(self._checkpoint.unused_attributes.items()),))
     return self
 
   def assert_existing_objects_matched(self):
-    """Asserts that checkpointable Python objects have been matched.
+    """Asserts that trackable Python objects have been matched.
 
     Note that this is a weaker assertion than `assert_consumed`. It will only
     fail for existing Python objects which are (transitive) dependencies of the
@@ -1051,22 +573,23 @@ class CheckpointLoadStatus(_LoadStatus):
         of the root object but does not have a value in the checkpoint.
     """
     for node_id, node in enumerate(self._checkpoint.object_graph_proto.nodes):
-      checkpointable = self._checkpoint.object_by_proto_id.get(node_id, None)
-      if (checkpointable is not None
-          and checkpointable._update_uid < self._checkpoint.restore_uid):  # pylint: disable=protected-access
+      trackable = self._checkpoint.object_by_proto_id.get(node_id, None)
+      if (trackable is not None
+          and trackable._update_uid < self._checkpoint.restore_uid):  # pylint: disable=protected-access
         raise AssertionError(
             "Object not assigned a value from checkpoint: %s" % (node,))
-    for checkpointable_object in list_objects(self._root_checkpointable):
+    for trackable_object in self._graph_view.list_objects():
       # Remove data structures that do not contain any variables from
       # restoration checks.
-      if (isinstance(checkpointable_object,
-                     data_structures.CheckpointableDataStructure) and
-          not checkpointable_object._checkpoint_dependencies):
+      if (isinstance(trackable_object,
+                     data_structures.TrackableDataStructure) and
+          not trackable_object._checkpoint_dependencies):
         continue
-      self._checkpoint.all_python_objects.add(checkpointable_object)
+      self._checkpoint.all_python_objects.add(trackable_object)
     unused_python_objects = (
-        _ObjectIdentitySet(self._checkpoint.all_python_objects)
-        - _ObjectIdentitySet(self._checkpoint.object_by_proto_id.values()))
+        object_identity.ObjectIdentitySet(self._checkpoint.all_python_objects)
+        - object_identity.ObjectIdentitySet(
+            self._checkpoint.object_by_proto_id.values()))
     if unused_python_objects:
       raise AssertionError(
           ("Some Python objects were not bound to checkpointed values, likely "
@@ -1076,12 +599,14 @@ class CheckpointLoadStatus(_LoadStatus):
 
   def assert_nontrivial_match(self):
     """Raises an exception if only the root object matched."""
-    for checkpointable_object in list_objects(self._root_checkpointable):
-      self._checkpoint.all_python_objects.add(checkpointable_object)
+    for trackable_object in self._graph_view.list_objects():
+      self._checkpoint.all_python_objects.add(trackable_object)
     if len(self._checkpoint.object_by_proto_id) <= 1:
       unused_python_objects = (
-          _ObjectIdentitySet(self._checkpoint.all_python_objects)
-          - _ObjectIdentitySet(self._checkpoint.object_by_proto_id.values()))
+          object_identity.ObjectIdentitySet(
+              self._checkpoint.all_python_objects)
+          - object_identity.ObjectIdentitySet(
+              self._checkpoint.object_by_proto_id.values()))
       if unused_python_objects:
         raise AssertionError(
             ("Nothing except the root object matched a checkpointed value. "
@@ -1091,7 +616,7 @@ class CheckpointLoadStatus(_LoadStatus):
       else:
         raise AssertionError(
             "Nothing to load. No dependencies have been added to %s yet." % (
-                self._root_checkpointable,))
+                self._graph_view.root,))
     return self
 
   def run_restore_ops(self, session=None):
@@ -1121,8 +646,8 @@ class CheckpointLoadStatus(_LoadStatus):
       return  # Initialization and restoration ops are run eagerly
     if session is None:
       session = ops.get_default_session()
-    all_objects = list_objects(self._root_checkpointable)
-    already_initialized_objects = _ObjectIdentitySet(
+    all_objects = self._graph_view.list_objects()
+    already_initialized_objects = object_identity.ObjectIdentitySet(
         self._checkpoint.object_by_proto_id.values())
     initializers_for_non_restored_variables = [
         c.initializer for c in all_objects
@@ -1144,9 +669,9 @@ class InitializationOnlyStatus(_LoadStatus):
   otherwise.
   """
 
-  def __init__(self, root_checkpointable, restore_uid):
+  def __init__(self, graph_view, restore_uid):
     self._restore_uid = restore_uid
-    self._root_checkpointable = root_checkpointable
+    self._graph_view = graph_view
 
   def assert_consumed(self):
     """Assertion for consistency with `CheckpointLoadStatus`. Always fails."""
@@ -1194,9 +719,9 @@ class InitializationOnlyStatus(_LoadStatus):
       return  # run eagerly
     if session is None:
       session = ops.get_default_session()
-    checkpointable_objects = list_objects(self._root_checkpointable)
+    trackable_objects = self._graph_view.list_objects()
     initializers = [
-        c.initializer for c in checkpointable_objects
+        c.initializer for c in trackable_objects
         if hasattr(c, "initializer") and c.initializer is not None
         and (getattr(c, "_update_uid", self._restore_uid - 1)
              < self._restore_uid)]
@@ -1219,9 +744,9 @@ class NameBasedSaverStatus(_LoadStatus):
   # interferes with isinstance checks.
   @deprecation.deprecated(
       date=None, instructions=_DEPRECATED_RESTORE_INSTRUCTIONS)
-  def __init__(self, checkpoint, root_checkpointable):
+  def __init__(self, checkpoint, graph_view):
     self._checkpoint = checkpoint
-    self._root_checkpointable = root_checkpointable
+    self._graph_view = graph_view
 
   def assert_consumed(self):
     """Raises an exception if any variables/objects are unmatched."""
@@ -1230,11 +755,11 @@ class NameBasedSaverStatus(_LoadStatus):
       raise AssertionError(
           "Some objects had attributes which were not restored: %s"
           % (unused_attributes,))
-    for checkpointable in list_objects(self._root_checkpointable):
+    for trackable in self._graph_view.list_objects():
       # pylint: disable=protected-access
-      checkpointable._maybe_initialize_checkpointable()
-      if checkpointable._update_uid < self._checkpoint.restore_uid:
-        raise AssertionError("Object not restored: %s" % (checkpointable,))
+      trackable._maybe_initialize_trackable()
+      if trackable._update_uid < self._checkpoint.restore_uid:
+        raise AssertionError("Object not restored: %s" % (trackable,))
       # pylint: enable=protected-access
     return self
 
@@ -1256,19 +781,19 @@ class NameBasedSaverStatus(_LoadStatus):
 
   def _gather_saveable_objects(self):
     """Walk the object graph, using global names for SaveableObjects."""
-    objects = list_objects(self._root_checkpointable)
+    objects = self._graph_view.list_objects()
     saveable_objects = []
-    for checkpointable in objects:
+    for trackable in objects:
       # pylint: disable=protected-access
-      checkpointable._maybe_initialize_checkpointable()
-      if checkpointable._update_uid < self._checkpoint.restore_uid:
-        checkpointable._update_uid = self._checkpoint.restore_uid
+      trackable._maybe_initialize_trackable()
+      if trackable._update_uid < self._checkpoint.restore_uid:
+        trackable._update_uid = self._checkpoint.restore_uid
       else:
         continue
       # pylint: enable=protected-access
       saveable_objects.extend(
           self._checkpoint.globally_named_object_attributes(
-              checkpointable))
+              trackable))
     return saveable_objects
 
   def run_restore_ops(self, session=None):
@@ -1304,36 +829,32 @@ class _SessionWithFeedDictAdditions(session_lib.SessionInterface):
         fetches=fetches, feed_dict=feed_dict, **kwargs)
 
 
-class CheckpointableSaver(object):
-  """Saves and restores a `Checkpointable` object and its dependencies.
+class TrackableSaver(object):
+  """Saves and restores a `Trackable` object and its dependencies.
 
-  See `Checkpointable` for details of dependency management. `Saver` wraps
+  See `Trackable` for details of dependency management. `Saver` wraps
   `tf.train.Saver` for saving, including extra information about the graph of
   dependencies between Python objects. When restoring, it uses this information
   about the save-time dependency graph to more robustly match objects with their
   checkpointed values. When executing eagerly, it supports restoring variables
   on object creation (see `Saver.restore`).
 
-  Values in a checkpoint are mapped to `Checkpointable` Python objects
+  Values in a checkpoint are mapped to `Trackable` Python objects
   (`Variable`s, `Optimizer`s, `Layer`s) based on the names provided when the
   checkpoint was written. To avoid breaking existing checkpoints when modifying
-  a class, dependency names (the names of attributes to which `Checkpointable`
+  a class, dependency names (the names of attributes to which `Trackable`
   objects are assigned) may not change. These names are local to objects, in
   contrast to the `Variable.name`-based save/restore from `tf.train.Saver`, and
   so allow additional program transformations.
   """
 
-  def __init__(self, root_checkpointable):
+  def __init__(self, graph_view):
     """Configure saving.
 
     Args:
-      root_checkpointable: The root of the object graph to save/restore. This
-        object and all of its dependencies are saved in the checkpoint. When
-        restoring, objects are matched and restored starting from this root.
+      graph_view: A `GraphView` object containing a description of the object
+        graph to save.
     """
-    # Allow passing in a weak reference to avoid reference cycles when
-    # `Checkpointable` objects save themselves.
-    self._root_checkpointable_ref = root_checkpointable
     # The file prefix placeholder is created lazily when graph building (and not
     # at all when executing eagerly) to avoid creating ops in the constructor
     # (when they may never be necessary).
@@ -1347,34 +868,13 @@ class CheckpointableSaver(object):
 
     # Op caching for restore, shared between _CheckpointRestoreCoordinators
     self._restore_op_cache = {}
-
-    if context.executing_eagerly():
-      # SaveableObjects are always recreated when executing eagerly.
-      self._saveable_object_cache = None
-    else:
-      # Maps Checkpointable objects -> attribute names -> list(SaveableObjects),
-      # to avoid re-creating SaveableObjects when graph building.
-      self._saveable_object_cache = _ObjectIdentityWeakKeyDictionary()
-
-  @property
-  def _root_checkpointable(self):
-    if isinstance(self._root_checkpointable_ref, weakref.ref):
-      derefed = self._root_checkpointable_ref()
-      assert derefed is not None
-      return derefed
-    else:
-      return self._root_checkpointable_ref
+    self._graph_view = graph_view
 
   def _gather_saveables(
-      self, object_graph_tensor=None, saveable_object_cache=None):
+      self, object_graph_tensor=None):
     """Wraps _serialize_object_graph to include the object graph proto."""
-    assert ((object_graph_tensor is None and saveable_object_cache is None)
-            or (object_graph_tensor is not None
-                and saveable_object_cache is not None))
     (named_saveable_objects, graph_proto,
-     feed_additions) = _serialize_object_graph(
-         self._root_checkpointable,
-         saveables_cache=saveable_object_cache)
+     feed_additions) = self._graph_view.serialize_object_graph()
     if object_graph_tensor is None:
       with ops.device("/cpu:0"):
         object_graph_tensor = constant_op.constant(
@@ -1389,50 +889,16 @@ class CheckpointableSaver(object):
             name=base.OBJECT_GRAPH_PROTO_KEY))
     return named_saveable_objects, graph_proto, feed_additions
 
-  def freeze(self, object_map=None, to_graph=None):
-    """Creates a `tf.train.Saver` with the current object graph frozen."""
-    checkpointable_objects, path_to_root = (
-        _breadth_first_checkpointable_traversal(self._root_checkpointable))
-    if to_graph:
-      target_context = to_graph.as_default
-    else:
-      target_context = ops.NullContextmanager
-    with target_context():
-      named_saveable_objects, graph_proto, _ = _serialize_gathered_objects(
-          checkpointable_objects,
-          path_to_root,
-          saveables_cache=None,
-          object_map=object_map)
-      with ops.device("/cpu:0"):
-        object_graph_tensor = constant_op.constant(
-            graph_proto.SerializeToString(), dtype=dtypes.string)
-      named_saveable_objects.append(
-          base.NoRestoreSaveable(
-              tensor=object_graph_tensor,
-              name=base.OBJECT_GRAPH_PROTO_KEY))
-      # TODO(allenl): Swap in a function-based saver here once it can serialize
-      # to a SaverDef.
-      return v1_saver_lib.Saver(
-          var_list=named_saveable_objects, max_to_keep=None)
-
   def _save_cached_when_graph_building(
       self,
       file_prefix,
-      object_graph_tensor=None,
-      saveable_object_cache=None):
+      object_graph_tensor=None):
     """Create or retrieve save ops.
 
-    When graph building, `saveable_object_cache` will typically be non-`None`,
-    meaning that existing `SaveableObject`s are re-used across calls to
-    `_prepare_save` even if the object graph has grown. This avoids
-    unnecessarily re-creating save ops.
-
     Args:
       file_prefix: The prefix for saved checkpoint files.
       object_graph_tensor: A `Tensor` to which the current object graph will be
         fed.
-      saveable_object_cache: A dictionary; if specified, used to cache
-        `SaveableObject`s.
 
     Returns:
       A two-element tuple with a filename tensor and a feed_dict of tensors to
@@ -1442,14 +908,14 @@ class CheckpointableSaver(object):
     """
     (named_saveable_objects, graph_proto,
      feed_additions) = self._gather_saveables(
-         object_graph_tensor=object_graph_tensor,
-         saveable_object_cache=saveable_object_cache)
+         object_graph_tensor=object_graph_tensor)
     if (self._last_save_object_graph != graph_proto
         # When executing eagerly, we need to re-create SaveableObjects each time
         # save() is called so they pick up new Tensors passed to their
         # constructors. That means the Saver needs to be copied with a new
         # var_list.
-        or context.executing_eagerly()):
+        or context.executing_eagerly()
+        or ops.inside_function()):
       saver = functional_saver.Saver(named_saveable_objects)
       with ops.device("/cpu:0"):
         self._cached_save_operation = saver.save(file_prefix)
@@ -1460,7 +926,7 @@ class CheckpointableSaver(object):
     """Save a training checkpoint.
 
     The saved checkpoint includes variables created by this object and any
-    Checkpointable objects it depends on at the time `Saver.save()` is called.
+    Trackable objects it depends on at the time `Saver.save()` is called.
 
     Args:
       file_prefix: A prefix to use for the checkpoint filenames
@@ -1469,8 +935,8 @@ class CheckpointableSaver(object):
       checkpoint_number: An integer variable or Tensor, used to number
         checkpoints. Typically this value is saved along with other variables in
         training checkpoints, which will happen automatically if it was created
-        by `root_checkpointable` or one of its dependencies (via
-        `Checkpointable._add_variable`).
+        by `root_trackable` or one of its dependencies (via
+        `Trackable._add_variable`).
       session: The session to evaluate variables in. Ignored when executing
         eagerly. If not provided when graph building, the default session is
         used.
@@ -1479,10 +945,11 @@ class CheckpointableSaver(object):
       The full path to the checkpoint.
     """
     feed_dict = {}
-    graph_building = not context.executing_eagerly()
+    use_session = (not context.executing_eagerly()
+                   and not ops.inside_function())
     if checkpoint_number:
       file_prefix = "%s-%d" % (file_prefix, checkpoint_number)
-    if graph_building:
+    if use_session:
       if self._object_graph_feed_tensor is None:
         with ops.device("/cpu:0"):
           self._object_graph_feed_tensor = constant_op.constant(
@@ -1501,28 +968,26 @@ class CheckpointableSaver(object):
     file_io.recursive_create_dir(os.path.dirname(file_prefix))
     save_path, new_feed_additions = self._save_cached_when_graph_building(
         file_prefix=file_prefix_tensor,
-        object_graph_tensor=object_graph_tensor,
-        saveable_object_cache=self._saveable_object_cache)
+        object_graph_tensor=object_graph_tensor)
     if new_feed_additions:
       feed_dict.update(new_feed_additions)
-    if not graph_building:
+    if not use_session:
       session = None
     elif session is None:
       session = ops.get_default_session()
 
     if session:
-      save_path = session.run(save_path, feed_dict=feed_dict)
+      return session.run(save_path, feed_dict=feed_dict)
     else:
-      save_path = save_path.numpy()
-    return save_path
+      return save_path
 
   def restore(self, save_path):
     """Restore a training checkpoint.
 
-    Restores `root_checkpointable` and any objects that it tracks
+    Restores `root_trackable` and any objects that it tracks
     (transitive). Either assigns values immediately if variables to restore have
     been created already, or defers restoration until the variables are
-    created. Dependencies added to the `root_checkpointable` passed to the
+    created. Dependencies added to the `root_trackable` passed to the
     constructor after this call will be matched if they have a corresponding
     object in the checkpoint.
 
@@ -1575,7 +1040,7 @@ class CheckpointableSaver(object):
       object is returned which runs restore ops from a name-based saver.
     """
     if save_path is None:
-      return InitializationOnlyStatus(self._root_checkpointable, ops.uid())
+      return InitializationOnlyStatus(self._graph_view, ops.uid())
     reader = pywrap_tensorflow.NewCheckpointReader(save_path)
     graph_building = not context.executing_eagerly()
     if graph_building:
@@ -1591,15 +1056,15 @@ class CheckpointableSaver(object):
       restore_coordinator = _NameBasedRestoreCoordinator(
           save_path=save_path, dtype_map=dtype_map)
       if not graph_building:
-        for existing_checkpointable in list_objects(self._root_checkpointable):
+        for existing_trackable in self._graph_view.list_objects():
           # pylint: disable=protected-access
-          existing_checkpointable._maybe_initialize_checkpointable()
-          existing_checkpointable._name_based_restores.add(restore_coordinator)
-          existing_checkpointable._name_based_attribute_restore(
+          existing_trackable._maybe_initialize_trackable()
+          existing_trackable._name_based_restores.add(restore_coordinator)
+          existing_trackable._name_based_attribute_restore(
               restore_coordinator)
           # pylint: enable=protected-access
       return NameBasedSaverStatus(
-          restore_coordinator, root_checkpointable=self._root_checkpointable)
+          restore_coordinator, graph_view=self._graph_view)
 
     if graph_building:
       if self._file_prefix_placeholder is None:
@@ -1612,25 +1077,25 @@ class CheckpointableSaver(object):
         file_prefix_tensor = constant_op.constant(save_path)
       file_prefix_feed_dict = None
     object_graph_proto = (
-        checkpointable_object_graph_pb2.CheckpointableObjectGraph())
+        trackable_object_graph_pb2.TrackableObjectGraph())
     object_graph_proto.ParseFromString(object_graph_string)
     checkpoint = _CheckpointRestoreCoordinator(
         object_graph_proto=object_graph_proto,
         save_path=save_path,
         save_path_tensor=file_prefix_tensor,
         restore_op_cache=self._restore_op_cache,
-        saveable_object_cache=self._saveable_object_cache)
-    base._CheckpointPosition(  # pylint: disable=protected-access
-        checkpoint=checkpoint, proto_id=0).restore(self._root_checkpointable)
+        graph_view=self._graph_view)
+    base.CheckpointPosition(checkpoint=checkpoint, proto_id=0).restore(
+        self._graph_view.root)
     load_status = CheckpointLoadStatus(
         checkpoint,
-        root_checkpointable=self._root_checkpointable,
+        graph_view=self._graph_view,
         feed_dict=file_prefix_feed_dict)
     return load_status
 
 
-def frozen_saver(root_checkpointable):
-  """Creates a static `tf.train.Saver` from a checkpointable object.
+def frozen_saver(root_trackable):
+  """Creates a static `tf.train.Saver` from a trackable object.
 
   The returned `Saver` saves object-based checkpoints, but these checkpoints
   will no longer reflect structural changes to the object graph, only changes to
@@ -1644,21 +1109,34 @@ def frozen_saver(root_checkpointable):
   object graph and the current Python object graph.
 
   Args:
-    root_checkpointable: A checkpointable object to save.
+    root_trackable: A trackable object to save.
 
   Returns:
-    A `tf.train.Saver` which saves object-based checkpoints for the object graph
-    frozen at the time `frozen_saver` was called.
+    A saver which saves object-based checkpoints for the object graph frozen at
+    the time `frozen_saver` was called.
   """
-  return CheckpointableSaver(root_checkpointable).freeze()
+  named_saveable_objects = graph_view_lib.ObjectGraphView(
+      root_trackable).frozen_saveable_objects()
+  return functional_saver.Saver(named_saveable_objects)
+
+
+def saver_with_op_caching(obj):
+  """A TrackableSaver with a SaveableObject cache when graph building."""
+  if context.executing_eagerly():
+    saveables_cache = None
+  else:
+    saveables_cache = object_identity.ObjectIdentityWeakKeyDictionary()
+  return TrackableSaver(graph_view_lib.ObjectGraphView(
+      weakref.ref(obj),
+      saveables_cache=saveables_cache))
 
 
 @tf_export("train.Checkpoint")
-class Checkpoint(tracking.Checkpointable):
-  """Groups checkpointable objects, saving and restoring them.
+class Checkpoint(tracking.AutoTrackable):
+  """Groups trackable objects, saving and restoring them.
 
   `Checkpoint`'s constructor accepts keyword arguments whose values are types
-  that contain checkpointable state, such as `tf.train.Optimizer`
+  that contain trackable state, such as `tf.train.Optimizer`
   implementations, `tf.Variable`, `tf.keras.Layer` implementations, or
   `tf.keras.Model` implementations. It saves these values with a checkpoint, and
   maintains a `save_counter` for numbering checkpoints.
@@ -1750,24 +1228,23 @@ class Checkpoint(tracking.Checkpointable):
 
     Args:
       **kwargs: Keyword arguments are set as attributes of this object, and are
-        saved with the checkpoint. Values must be checkpointable objects.
+        saved with the checkpoint. Values must be trackable objects.
     Raises:
-      ValueError: If objects in `kwargs` are not checkpointable.
+      ValueError: If objects in `kwargs` are not trackable.
     """
     super(Checkpoint, self).__init__()
     for k, v in sorted(kwargs.items(), key=lambda item: item[0]):
-      if not isinstance(v, (base.CheckpointableBase,
-                            def_function.PolymorphicFunction)):
+      if not isinstance(v, (base.Trackable, def_function.Function)):
         raise ValueError(
-            ("`Checkpoint` was expecting a checkpointable object (an object "
-             "derived from `CheckpointableBase`), got %s. If you believe this "
-             "object should be checkpointable (i.e. it is part of the "
+            ("`Checkpoint` was expecting a trackable object (an object "
+             "derived from `TrackableBase`), got %s. If you believe this "
+             "object should be trackable (i.e. it is part of the "
              "TensorFlow Python API and manages state), please open an issue.")
             % (v,))
       setattr(self, k, v)
     self._save_counter = None  # Created lazily for restore-on-create.
     self._save_assign_op = None
-    self._saver = CheckpointableSaver(weakref.ref(self))
+    self._saver = saver_with_op_caching(self)
 
   def _maybe_create_save_counter(self):
     """Create a save counter if it does not yet exist."""
@@ -1784,7 +1261,7 @@ class Checkpoint(tracking.Checkpointable):
     """Writes a training checkpoint.
 
     The checkpoint includes variables created by this object and any
-    checkpointable objects it depends on at the time `Checkpoint.write()` is
+    trackable objects it depends on at the time `Checkpoint.write()` is
     called.
 
     `write` does not number checkpoints, increment `save_counter`, or update the
@@ -1802,9 +1279,18 @@ class Checkpoint(tracking.Checkpointable):
     Returns:
       The full path to the checkpoint (i.e. `file_prefix`).
     """
-    return compat.as_str(self._saver.save(
+    output = self._saver.save(
         file_prefix=file_prefix,
-        session=session))
+        session=session)
+    if tensor_util.is_tensor(output):
+      if context.executing_eagerly():
+        return compat.as_str(output.numpy())
+      else:
+        # Function building
+        return output
+    else:
+      # Graph + Session, so we already session.ran it.
+      return compat.as_str(output)
 
   @property
   def save_counter(self):
@@ -1822,7 +1308,7 @@ class Checkpoint(tracking.Checkpointable):
     """Saves a training checkpoint and provides basic checkpoint management.
 
     The saved checkpoint includes variables created by this object and any
-    checkpointable objects it depends on at the time `Checkpoint.save()` is
+    trackable objects it depends on at the time `Checkpoint.save()` is
     called.
 
     `save` is a basic convenience wrapper around the `write` method,
@@ -1845,6 +1331,14 @@ class Checkpoint(tracking.Checkpointable):
     """
     graph_building = not context.executing_eagerly()
     if graph_building:
+      if ops.inside_function():
+        raise NotImplementedError(
+            "Calling tf.train.Checkpoint.save() from a function is not "
+            "supported, as save() modifies saving metadata in ways not "
+            "supported by TensorFlow Operations. Consider using "
+            "tf.train.Checkpoint.write(), a lower-level API which does not "
+            "update metadata. tf.train.latest_checkpoint and related APIs will "
+            "not see this checkpoint.")
       if session is None:
         session = ops.get_default_session()
       if self._save_counter is None:
@@ -1863,10 +1357,11 @@ class Checkpoint(tracking.Checkpointable):
       checkpoint_number = assign_op.numpy()
     file_path = self.write("%s-%d" % (file_prefix, checkpoint_number),
                            session=session)
-    checkpoint_management.update_checkpoint_state(
+    checkpoint_management.update_checkpoint_state_internal(
         save_dir=os.path.dirname(file_prefix),
         model_checkpoint_path=file_path,
-        all_model_checkpoint_paths=[file_path])
+        all_model_checkpoint_paths=[file_path],
+        save_relative_paths=True)
     return file_path
 
   def restore(self, save_path):
@@ -1878,7 +1373,7 @@ class Checkpoint(tracking.Checkpointable):
     restore have been created already, or defers restoration until the variables
     are created. Dependencies added after this call will be matched if they have
     a corresponding object in the checkpoint (the restore request will queue in
-    any checkpointable object waiting for the expected dependency to be added).
+    any trackable object waiting for the expected dependency to be added).
 
     When graph building, restoration ops are added to the graph but not run
     immediately.
diff --git a/tensorflow/python/training/checkpointable/util_test.py b/tensorflow/python/training/tracking/util_test.py
similarity index 64%
rename from tensorflow/python/training/checkpointable/util_test.py
rename to tensorflow/python/training/tracking/util_test.py
index 3bdab4cb0bf990543a18cab885f540b8d1f78ed8..e2878915f8e15d49162b1d97969496c794fc110d 100644
--- a/tensorflow/python/training/checkpointable/util_test.py
+++ b/tensorflow/python/training/tracking/util_test.py
@@ -20,10 +20,10 @@ import functools
 import json
 import os
 
+from absl.testing import parameterized
 import six
 
 from tensorflow.python import pywrap_tensorflow
-from tensorflow.python.client import session as session_lib
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.eager import def_function
@@ -35,27 +35,28 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.keras.engine import sequential
 from tensorflow.python.keras.engine import training
 from tensorflow.python.keras.layers import core
+from tensorflow.python.keras.optimizer_v2 import adam
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import template
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables
-from tensorflow.python.training import adam
+from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.training import checkpoint_management
 from tensorflow.python.training import saver as saver_lib
 from tensorflow.python.training import training_util
-from tensorflow.python.training.checkpointable import base
-from tensorflow.python.training.checkpointable import tracking
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.tracking import base
+from tensorflow.python.training.tracking import graph_view
+from tensorflow.python.training.tracking import tracking
+from tensorflow.python.training.tracking import util as trackable_utils
 
 
-class NonLayerCheckpointable(tracking.Checkpointable):
+class NonLayerTrackable(tracking.AutoTrackable):
 
   def __init__(self):
-    super(NonLayerCheckpointable, self).__init__()
-    self.a_variable = checkpointable_utils.add_variable(
+    super(NonLayerTrackable, self).__init__()
+    self.a_variable = trackable_utils.add_variable(
         self, name="a_variable", shape=[])
 
 
@@ -67,8 +68,8 @@ class MyModel(training.Model):
     super(MyModel, self).__init__()
     self._named_dense = core.Dense(1, use_bias=True)
     self._second = core.Dense(1, use_bias=False)
-    # We can still track Checkpointables which aren't Layers.
-    self._non_layer = NonLayerCheckpointable()
+    # We can still track Trackables which aren't Layers.
+    self._non_layer = NonLayerTrackable()
 
   def call(self, values):
     ret = self._second(self._named_dense(values))
@@ -77,21 +78,30 @@ class MyModel(training.Model):
 
 class InterfaceTests(test.TestCase):
 
+  def testLayerDeduplication(self):
+    model = training.Model()
+    layer_one = core.Dense(1)
+    layer_two = core.Dense(1)
+    model.other_path = [layer_one, layer_two]
+    model.l2 = layer_two
+    model.l1 = layer_one
+    self.assertEqual([layer_one, layer_two], model.layers)
+
   @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def testAddVariable(self):
-    obj = NonLayerCheckpointable()
+    obj = NonLayerTrackable()
     with self.assertRaisesRegexp(ValueError, "do not specify shape"):
-      checkpointable_utils.add_variable(
+      trackable_utils.add_variable(
           obj, name="shape_specified_twice", shape=[], initializer=1)
-    constant_initializer = checkpointable_utils.add_variable(
+    constant_initializer = trackable_utils.add_variable(
         obj, name="constant_initializer", initializer=1)
     with variable_scope.variable_scope("some_variable_scope"):
-      ones_initializer = checkpointable_utils.add_variable(
+      ones_initializer = trackable_utils.add_variable(
           obj,
           name="ones_initializer",
           shape=[2],
           initializer=init_ops.ones_initializer(dtype=dtypes.float32))
-    bare_initializer = checkpointable_utils.add_variable(
+    bare_initializer = trackable_utils.add_variable(
         obj,
         name="bare_initializer",
         shape=[2, 2],
@@ -102,12 +112,12 @@ class InterfaceTests(test.TestCase):
     # naming conflicts within an object.
     other_duplicate = resource_variable_ops.ResourceVariable(
         name="duplicate", initial_value=1.)
-    duplicate = checkpointable_utils.add_variable(
+    duplicate = trackable_utils.add_variable(
         obj, name="duplicate", shape=[])
     with self.assertRaisesRegexp(ValueError, "'duplicate'.*already declared"):
-      checkpointable_utils.add_variable(obj, name="duplicate", shape=[])
+      trackable_utils.add_variable(obj, name="duplicate", shape=[])
 
-    self.evaluate(checkpointable_utils.gather_initializers(obj))
+    self.evaluate(trackable_utils.gather_initializers(obj))
     self.assertEqual("constant_initializer:0", constant_initializer.name)
     self.assertEqual(1, self.evaluate(constant_initializer))
     self.assertEqual("some_variable_scope/ones_initializer:0",
@@ -125,8 +135,8 @@ class InterfaceTests(test.TestCase):
       # The .name attribute may be globally influenced, but the checkpoint name
       # won't be (tested below).
       self.assertEqual("duplicate_1:0", duplicate.name)
-    named_variables, _, _ = checkpointable_utils._serialize_object_graph(
-        obj, saveables_cache=None)
+    named_variables, _, _ = (
+        graph_view.ObjectGraphView(obj).serialize_object_graph())
     expected_checkpoint_names = (
         "a_variable/.ATTRIBUTES/VARIABLE_VALUE",
         "bare_initializer/.ATTRIBUTES/VARIABLE_VALUE",
@@ -139,20 +149,20 @@ class InterfaceTests(test.TestCase):
 
   def testInitNotCalled(self):
 
-    class NoInit(tracking.Checkpointable):
+    class NoInit(tracking.AutoTrackable):
 
       def __init__(self):
         pass
 
-    # __init__ for Checkpointable will be called implicitly.
-    checkpointable_utils.add_variable(NoInit(), "var", shape=[])
+    # __init__ for Trackable will be called implicitly.
+    trackable_utils.add_variable(NoInit(), "var", shape=[])
 
   def testShapeDtype(self):
-    root = tracking.Checkpointable()
-    v1 = checkpointable_utils.add_variable(
+    root = tracking.AutoTrackable()
+    v1 = trackable_utils.add_variable(
         root, name="v1", initializer=3., dtype=dtypes.float64)
     self.assertEqual(dtypes.float64, v1.dtype)
-    v2 = checkpointable_utils.add_variable(
+    v2 = trackable_utils.add_variable(
         root,
         name="v2",
         shape=[3],
@@ -166,34 +176,34 @@ class InterfaceTests(test.TestCase):
       checkpoint_directory = self.get_temp_dir()
       checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
       dense = core.Dense(1)
-      checkpoint = checkpointable_utils.Checkpoint(dense=dense)
+      checkpoint = trackable_utils.Checkpoint(dense=dense)
       dense(constant_op.constant([[1.]]))
       save_path = checkpoint.save(checkpoint_prefix)
 
-    objects = checkpointable_utils.object_metadata(save_path)
+    objects = trackable_utils.object_metadata(save_path)
     all_variable_names = []
     for obj in objects.nodes:
       for attribute in obj.attributes:
         all_variable_names.append(attribute.full_name)
     self.assertIn("dense/kernel", all_variable_names)
 
-  def testNotCheckpointable(self):
+  def testNotTrackable(self):
 
     class CallsFunctionalStuff(
-        tracking.NotCheckpointable, tracking.Checkpointable):
+        tracking.NotTrackable, tracking.AutoTrackable):
       pass
 
     test_dir = self.get_temp_dir()
     prefix = os.path.join(test_dir, "ckpt")
-    checkpoint = checkpointable_utils.Checkpoint(x=CallsFunctionalStuff())
+    checkpoint = trackable_utils.Checkpoint(x=CallsFunctionalStuff())
     with self.assertRaises(NotImplementedError):
       checkpoint.save(prefix)
 
     class CallsFunctionalStuffOtherMRO(
-        tracking.Checkpointable, tracking.NotCheckpointable):
+        tracking.AutoTrackable, tracking.NotTrackable):
       pass
 
-    checkpoint_reversed = checkpointable_utils.Checkpoint(
+    checkpoint_reversed = trackable_utils.Checkpoint(
         x=CallsFunctionalStuffOtherMRO())
     with self.assertRaises(NotImplementedError):
       checkpoint_reversed.save(prefix)
@@ -220,8 +230,8 @@ class _MirroringSaveable(saver_lib.BaseSaverBuilder.SaveableObject):
         self._mirrored_variable.assign(tensor))
 
 
-class _OwnsMirroredVariables(base.CheckpointableBase):
-  """A Checkpointable object which returns a more complex SaveableObject."""
+class _OwnsMirroredVariables(base.Trackable):
+  """A Trackable object which returns a more complex SaveableObject."""
 
   def __init__(self):
     self.non_dep_variable = variable_scope.get_variable(
@@ -243,7 +253,7 @@ class _OwnsMirroredVariables(base.CheckpointableBase):
     return self.non_dep_variable.name
 
 
-class CheckpointingTests(test.TestCase):
+class CheckpointingTests(parameterized.TestCase, test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def testNamingWithOptimizer(self):
@@ -252,41 +262,31 @@ class CheckpointingTests(test.TestCase):
     # A nuisance Model using the same optimizer. Its slot variables should not
     # go in the checkpoint, since it is never depended on.
     other_model = MyModel()
-    optimizer = adam.AdamOptimizer(0.001)
-    optimizer_step = training_util.get_or_create_global_step()
-    root_checkpointable = checkpointable_utils.Checkpoint(
-        optimizer=optimizer, model=model, optimizer_step=optimizer_step)
-    if context.executing_eagerly():
-      optimizer.minimize(
-          lambda: model(input_value),
-          global_step=optimizer_step)
-      optimizer.minimize(
-          lambda: other_model(input_value),
-          global_step=optimizer_step)
-    else:
-      train_op = optimizer.minimize(
-          model(input_value), global_step=optimizer_step)
-      optimizer.minimize(
-          other_model(input_value),
-          global_step=optimizer_step)
-      self.evaluate(checkpointable_utils.gather_initializers(
-          root_checkpointable))
-      self.evaluate(train_op)
-    named_variables, serialized_graph, _ = (
-        checkpointable_utils._serialize_object_graph(
-            root_checkpointable, saveables_cache=None))
-    expected_checkpoint_names = (
-        # Created in the root node, so no prefix.
-        "optimizer_step",
-        "model/_second/kernel",
-        "model/_named_dense/kernel",
-        "model/_named_dense/bias",
-        # non-Layer dependency of the model
-        "model/_non_layer/a_variable",
-        # The optimizer creates two non-slot variables
-        "optimizer/beta1_power",
-        "optimizer/beta2_power",
-        # Slot variables
+    optimizer = adam.Adam(0.001)
+    step = training_util.get_or_create_global_step()
+    root_trackable = trackable_utils.Checkpoint(
+        optimizer=optimizer, model=model, step=step)
+
+    with backprop.GradientTape() as tape:
+      loss = model(input_value)
+    variables = model.trainable_variables
+    gradients = tape.gradient(loss, variables)
+    train_op = control_flow_ops.group(
+        optimizer.apply_gradients(zip(gradients, variables)),
+        step.assign_add(1))
+
+    with backprop.GradientTape() as tape:
+      loss = other_model(input_value)
+    variables = other_model.trainable_variables
+    gradients = tape.gradient(loss, variables)
+    optimizer.apply_gradients(zip(gradients, variables))
+
+    self.evaluate(trackable_utils.gather_initializers(
+        root_trackable))
+    self.evaluate(train_op)
+    named_variables, serialized_graph, _ = graph_view.ObjectGraphView(
+        root_trackable).serialize_object_graph()
+    expected_slot_keys = (
         "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/m",
         "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/v",
         "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/m",
@@ -294,9 +294,26 @@ class CheckpointingTests(test.TestCase):
         "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/m",
         "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/v",
     )
+    expected_checkpoint_names = (
+        # Created in the root node, so no prefix.
+        "step",
+        "model/_second/kernel",
+        "model/_named_dense/kernel",
+        "model/_named_dense/bias",
+        # non-Layer dependency of the model
+        "model/_non_layer/a_variable",
+        "optimizer/learning_rate",
+        "optimizer/beta_1",
+        "optimizer/beta_2",
+        "optimizer/epsilon",
+        "optimizer/iter",
+        "optimizer/decay",
+    ) + expected_slot_keys
     suffix = "/.ATTRIBUTES/VARIABLE_VALUE"
     expected_checkpoint_names = [
         name + suffix for name in expected_checkpoint_names]
+    expected_checkpoint_names.append(
+        "optimizer/.ATTRIBUTES/OBJECT_CONFIG_JSON")
     # The Dense layers also save get_config() JSON
     expected_checkpoint_names.extend(
         ["model/_second/.ATTRIBUTES/OBJECT_CONFIG_JSON",
@@ -307,7 +324,7 @@ class CheckpointingTests(test.TestCase):
     # Check that we've mapped to the right variable objects (not exhaustive)
     self.assertEqual(
         "global_step",
-        named_variables["optimizer_step" + suffix].full_name)
+        named_variables["step" + suffix].full_name)
     self.assertEqual(
         "my_model/dense_1/kernel",
         named_variables["model/_second/kernel" + suffix].full_name)
@@ -315,53 +332,36 @@ class CheckpointingTests(test.TestCase):
         "my_model/dense/kernel",
         named_variables["model/_named_dense/kernel" + suffix].full_name)
     self.assertEqual(
-        "beta1_power",
-        named_variables["optimizer/beta1_power" + suffix].full_name)
+        "beta_1",
+        named_variables["optimizer/beta_1" + suffix].full_name)
     self.assertEqual(
-        "beta2_power",
-        named_variables["optimizer/beta2_power" + suffix].full_name)
+        "beta_2",
+        named_variables["optimizer/beta_2" + suffix].full_name)
     # Spot check the generated protocol buffers.
     self.assertEqual("optimizer",
                      serialized_graph.nodes[0].children[1].local_name)
     optimizer_node = serialized_graph.nodes[serialized_graph.nodes[0].children[
         1].node_id]
-    self.assertEqual("beta1_power",
-                     optimizer_node.children[0].local_name)
-    self.assertEqual("beta1_power",
-                     serialized_graph.nodes[optimizer_node.children[0].node_id]
-                     .attributes[0].full_name)
-    self.assertEqual(
-        "my_model/dense/kernel",
-        serialized_graph.nodes[optimizer_node.slot_variables[0]
-                               .original_variable_node_id]
-        .attributes[0].full_name)
-    # We strip off the :0 suffix, as variable.name-based saving does.
-    self.assertEqual(
-        "my_model/dense/kernel/Adam",
-        serialized_graph.nodes[optimizer_node.slot_variables[0]
-                               .slot_variable_node_id]
-        .attributes[0].full_name)
-    self.assertEqual(
-        "my_model/dense/kernel/Adam:0",
-        optimizer.get_slot(
-            var=model._named_dense.kernel,
-            name="m").name)
-    self.assertEqual(
-        "model/_named_dense/kernel" + suffix,
-        serialized_graph.nodes[
-            optimizer_node.slot_variables[0]
-            .original_variable_node_id].attributes[0].checkpoint_key)
-    self.assertEqual("m", optimizer_node.slot_variables[0].slot_name)
-    self.assertEqual(
-        "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/m" + suffix,
-        serialized_graph.nodes[
-            optimizer_node.slot_variables[0]
-            .slot_variable_node_id].attributes[0].checkpoint_key)
+    children = [node.local_name for node in optimizer_node.children]
+    six.assertCountEqual(
+        self,
+        # Non-slot dependencies
+        ["beta_1", "beta_2", "iter", "decay", "epsilon", "learning_rate"],
+        children)
+    serialized_slot_keys = []
+    for slot in optimizer_node.slot_variables:
+      for attribute in (
+          serialized_graph.nodes[slot.slot_variable_node_id].attributes):
+        serialized_slot_keys.append(attribute.checkpoint_key)
+    six.assertCountEqual(
+        self,
+        [key + suffix for key in expected_slot_keys],
+        serialized_slot_keys)
 
   @test_util.run_in_graph_and_eager_modes
   def testMoreComplexSaveableReturned(self):
     v = _OwnsMirroredVariables()
-    checkpoint = checkpointable_utils.Checkpoint(v=v)
+    checkpoint = trackable_utils.Checkpoint(v=v)
     test_dir = self.get_temp_dir()
     prefix = os.path.join(test_dir, "ckpt")
     self.evaluate(v.non_dep_variable.assign(42.))
@@ -397,44 +397,40 @@ class CheckpointingTests(test.TestCase):
   @test_util.run_in_graph_and_eager_modes
   def testSaveRestore(self):
     model = MyModel()
-    optimizer = adam.AdamOptimizer(0.001)
-    root_checkpointable = checkpointable_utils.Checkpoint(
+    optimizer = adam.Adam(0.001)
+    root_trackable = trackable_utils.Checkpoint(
         optimizer=optimizer, model=model)
     input_value = constant_op.constant([[3.]])
-    if context.executing_eagerly():
-      optimizer.minimize(
-          lambda: model(input_value))
-    else:
-      train_op = optimizer.minimize(model(input_value))
-      # TODO(allenl): Make initialization more pleasant when graph building.
-      root_checkpointable.save_counter  # pylint: disable=pointless-statement
-      self.evaluate(checkpointable_utils.gather_initializers(
-          root_checkpointable))
-      self.evaluate(train_op)
+    with backprop.GradientTape() as tape:
+      loss = model(input_value)
+    variables = model.trainable_variables
+    gradients = tape.gradient(loss, variables)
+    train_op = optimizer.apply_gradients(zip(gradients, variables))
+    root_trackable.save_counter  # pylint: disable=pointless-statement
+    self.evaluate(trackable_utils.gather_initializers(
+        root_trackable))
+    self.evaluate(train_op)
     prefix = os.path.join(self.get_temp_dir(), "ckpt")
     self.evaluate(state_ops.assign(model._named_dense.variables[1], [42.]))
     m_bias_slot = optimizer.get_slot(model._named_dense.variables[1], "m")
     self.evaluate(state_ops.assign(m_bias_slot, [1.5]))
-    save_path = root_checkpointable.save(file_prefix=prefix)
+    save_path = root_trackable.save(file_prefix=prefix)
     self.evaluate(state_ops.assign(model._named_dense.variables[1], [43.]))
-    self.evaluate(state_ops.assign(root_checkpointable.save_counter, 3))
-    optimizer_variables = self.evaluate(optimizer.variables())
+    self.evaluate(state_ops.assign(root_trackable.save_counter, 3))
+    optimizer_variables = self.evaluate(
+        sorted(optimizer.variables(), key=lambda v: v.name))
     self.evaluate(state_ops.assign(m_bias_slot, [-2.]))
     # Immediate restoration
-    status = root_checkpointable.restore(save_path=save_path).assert_consumed()
+    status = root_trackable.restore(save_path=save_path).assert_consumed()
     status.run_restore_ops()
     self.assertAllEqual([42.], self.evaluate(model._named_dense.variables[1]))
-    self.assertAllEqual(1, self.evaluate(root_checkpointable.save_counter))
+    self.assertAllEqual(1, self.evaluate(root_trackable.save_counter))
     self.assertAllEqual([1.5], self.evaluate(m_bias_slot))
     if not context.executing_eagerly():
       return  # Restore-on-create is only supported when executing eagerly
     on_create_model = MyModel()
-    on_create_optimizer = adam.AdamOptimizer(
-        0.001,
-        # Preserve beta1_power and beta2_power when appying gradients so we can
-        # test that they've been restored correctly.
-        beta1=1.0, beta2=1.0)
-    on_create_root = checkpointable_utils.Checkpoint(
+    on_create_optimizer = adam.Adam(0.001)
+    on_create_root = trackable_utils.Checkpoint(
         optimizer=on_create_optimizer, model=on_create_model)
     # Deferred restoration
     status = on_create_root.restore(save_path=save_path)
@@ -450,20 +446,21 @@ class CheckpointingTests(test.TestCase):
     on_create_m_bias_slot = on_create_optimizer.get_slot(
         on_create_model._named_dense.variables[1], "m")
     status.assert_existing_objects_matched()
-    with self.assertRaises(AssertionError):
-      status.assert_consumed()
+    if not context.executing_eagerly():
+      with self.assertRaises(AssertionError):
+        status.assert_consumed()
     # Optimizer slot variables are created when the original variable is
     # restored.
     self.assertAllEqual([1.5], self.evaluate(on_create_m_bias_slot))
-    self.assertAllEqual(optimizer_variables[2:],
-                        self.evaluate(on_create_optimizer.variables()))
     dummy_var = resource_variable_ops.ResourceVariable([1.])
-    on_create_optimizer.minimize(loss=dummy_var.read_value)
+    on_create_optimizer.minimize(loss=dummy_var.read_value,
+                                 var_list=[dummy_var])
     status.assert_existing_objects_matched()
     status.assert_consumed()
-    beta1_power, beta2_power = on_create_optimizer._get_beta_accumulators()
-    self.assertAllEqual(optimizer_variables[0], self.evaluate(beta1_power))
-    self.assertAllEqual(optimizer_variables[1], self.evaluate(beta2_power))
+    self.assertAllEqual(
+        optimizer_variables,
+        # Creation order is different, so .variables() needs to be re-sorted.
+        self.evaluate(sorted(optimizer.variables(), key=lambda v: v.name)))
 
   # TODO(allenl): Debug garbage created by this test in python3.
   def testDeferredRestorationUsageEager(self):
@@ -473,21 +470,22 @@ class CheckpointingTests(test.TestCase):
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
     for training_continuation in range(3):
       model = MyModel()
-      optimizer = adam.AdamOptimizer(0.001)
-      root = checkpointable_utils.Checkpoint(
-          optimizer=optimizer, model=model,
-          optimizer_step=training_util.get_or_create_global_step())
+      optimizer = adam.Adam(0.001)
+      root = trackable_utils.Checkpoint(
+          optimizer=optimizer, model=model)
       root.restore(checkpoint_management.latest_checkpoint(
           checkpoint_directory))
       for _ in range(num_training_steps):
         # TODO(allenl): Use a Dataset and serialize/checkpoint it.
         input_value = constant_op.constant([[3.]])
-        optimizer.minimize(
-            lambda: model(input_value),  # pylint: disable=cell-var-from-loop
-            global_step=root.optimizer_step)
+        with backprop.GradientTape() as tape:
+          loss = model(input_value)
+        variables = model.trainable_variables
+        gradients = tape.gradient(loss, variables)
+        optimizer.apply_gradients(zip(gradients, variables))
       root.save(file_prefix=checkpoint_prefix)
       self.assertEqual((training_continuation + 1) * num_training_steps,
-                       root.optimizer_step.numpy())
+                       root.optimizer.iterations.numpy())
 
   def testUsageGraph(self):
     """Expected usage when graph building."""
@@ -498,14 +496,16 @@ class CheckpointingTests(test.TestCase):
       for training_continuation in range(3):
         with ops.Graph().as_default():
           model = MyModel()
-          optimizer = adam.AdamOptimizer(0.001)
-          root = checkpointable_utils.Checkpoint(
-              optimizer=optimizer, model=model,
-              global_step=training_util.get_or_create_global_step())
+          optimizer = adam.Adam(0.001)
+          root = trackable_utils.Checkpoint(
+              optimizer=optimizer, model=model)
           input_value = constant_op.constant([[3.]])
-          train_op = optimizer.minimize(
-              model(input_value),
-              global_step=root.global_step)
+          with backprop.GradientTape() as tape:
+            loss = model(input_value)
+          variables = model.trainable_variables
+          gradients = tape.gradient(loss, variables)
+          train_op = optimizer.apply_gradients(zip(gradients, variables))
+
           checkpoint_path = checkpoint_management.latest_checkpoint(
               checkpoint_directory)
           with self.session(graph=ops.get_default_graph()) as session:
@@ -524,7 +524,7 @@ class CheckpointingTests(test.TestCase):
               session.run(train_op)
             root.save(file_prefix=checkpoint_prefix, session=session)
             self.assertEqual((training_continuation + 1) * num_training_steps,
-                             session.run(root.global_step))
+                             session.run(root.optimizer.iterations))
             self.assertEqual(training_continuation + 1,
                              session.run(root.save_counter))
 
@@ -534,21 +534,23 @@ class CheckpointingTests(test.TestCase):
     # Does create garbage when executing eagerly due to ops.Graph() creation.
     num_training_steps = 10
     checkpoint_directory = self.get_temp_dir()
+    def _train_fn(model, input_value):
+      with backprop.GradientTape() as tape:
+        loss = model(input_value)
+      variables = model.trainable_variables
+      gradients = tape.gradient(loss, variables)
+      return optimizer.apply_gradients(zip(gradients, variables))
     for training_continuation in range(3):
       with test_util.device(use_gpu=True):
         model = MyModel()
-        optimizer = adam.AdamOptimizer(0.001)
-        root = checkpointable_utils.Checkpoint(
-            optimizer=optimizer, model=model,
-            global_step=training_util.get_or_create_global_step())
+        optimizer = adam.Adam(0.001)
+        root = trackable_utils.Checkpoint(
+            optimizer=optimizer, model=model)
         manager = checkpoint_management.CheckpointManager(
             root, checkpoint_directory, max_to_keep=1)
         status = root.restore(save_path=manager.latest_checkpoint)
         input_value = constant_op.constant([[3.]])
-        train_fn = functools.partial(
-            optimizer.minimize,
-            functools.partial(model, input_value),
-            global_step=root.global_step)
+        train_fn = functools.partial(_train_fn, model, input_value)
         if not context.executing_eagerly():
           train_fn = functools.partial(self.evaluate, train_fn())
         status.initialize_or_restore()
@@ -556,40 +558,42 @@ class CheckpointingTests(test.TestCase):
           train_fn()
         manager.save()
         self.assertEqual((training_continuation + 1) * num_training_steps,
-                         self.evaluate(root.global_step))
+                         self.evaluate(root.optimizer.iterations))
         self.assertEqual(training_continuation + 1,
                          self.evaluate(root.save_counter))
 
   @test_util.run_in_graph_and_eager_modes
   def testFreezing(self):
-    with self.cached_session(use_gpu=True) as session:
+    with test_util.use_gpu():
       # Save an object-based checkpoint using a frozen saver
       directory = self.get_temp_dir()
       prefix = os.path.join(directory, "ckpt")
       v = resource_variable_ops.ResourceVariable(0, dtype=dtypes.int64)
-      checkpoint = checkpointable_utils.Checkpoint(v=v)
+      checkpoint = trackable_utils.Checkpoint(v=v)
       self.evaluate(v.assign(3))
       # Create the save counter so assert_consumed doesn't complain about it not
       # existing in the checkpoint on restore.
       self.evaluate(checkpoint.save_counter.assign(12))
-      saver = checkpointable_utils.frozen_saver(checkpoint)
-      save_path = saver.save(session, prefix)
+      saver = trackable_utils.frozen_saver(checkpoint)
+      with ops.device("cpu:0"):
+        prefix_tensor = constant_op.constant(prefix)
+      save_path = self.evaluate(saver.save(prefix_tensor))
       self.evaluate(v.assign(10))
       # Use the frozen saver to restore the same object graph
-      saver.restore(session, save_path)
+      self.evaluate(saver.restore(prefix_tensor))
       self.assertEqual(3, self.evaluate(v))
 
       # Restore using another frozen saver on an identical object graph
       del v, checkpoint, saver
       v = resource_variable_ops.ResourceVariable(0, dtype=dtypes.int64)
-      checkpoint = checkpointable_utils.Checkpoint(v=v)
-      saver = checkpointable_utils.frozen_saver(checkpoint)
-      saver.restore(session, save_path)
+      checkpoint = trackable_utils.Checkpoint(v=v)
+      saver = trackable_utils.frozen_saver(checkpoint)
+      self.evaluate(saver.restore(prefix_tensor))
       self.assertEqual(3, self.evaluate(v))
 
       # Restore as an object-based checkpoint
       del v, checkpoint, saver
-      checkpoint = checkpointable_utils.Checkpoint()
+      checkpoint = trackable_utils.Checkpoint()
       status = checkpoint.restore(save_path)
       v = resource_variable_ops.ResourceVariable(0, dtype=dtypes.int64)
       if context.executing_eagerly():
@@ -605,7 +609,7 @@ class CheckpointingTests(test.TestCase):
     directory = self.get_temp_dir()
     prefix = os.path.join(directory, "ckpt")
     step = resource_variable_ops.ResourceVariable(0, dtype=dtypes.int64)
-    checkpoint = checkpointable_utils.Checkpoint(step=step)
+    checkpoint = trackable_utils.Checkpoint(step=step)
     self.evaluate(step.initializer)
     for i in range(5):
       path = checkpoint.write("%s-%d" % (prefix, self.evaluate(step)))
@@ -625,10 +629,9 @@ class CheckpointingTests(test.TestCase):
       with test_util.device(use_gpu=True):
         model = MyModel()
         # Don't actually train so we can test variable values
-        optimizer = adam.AdamOptimizer(0.)
-        root = checkpointable_utils.Checkpoint(
-            optimizer=optimizer, model=model,
-            global_step=training_util.get_or_create_global_step())
+        optimizer = adam.Adam(0.)
+        root = trackable_utils.Checkpoint(
+            optimizer=optimizer, model=model)
         checkpoint_path = checkpoint_management.latest_checkpoint(
             checkpoint_directory)
         status = root.restore(save_path=checkpoint_path)
@@ -639,8 +642,7 @@ class CheckpointingTests(test.TestCase):
           with backprop.GradientTape() as tape:
             loss = _call_model(constant_op.constant([[3.]]))
           gradients = tape.gradient(loss, model.variables)
-          return optimizer.apply_gradients(zip(gradients, model.variables),
-                                           global_step=root.global_step)
+          return optimizer.apply_gradients(zip(gradients, model.variables))
         if not context.executing_eagerly():
           train_fn = functools.partial(
               self.evaluate, train_fn())
@@ -654,17 +656,17 @@ class CheckpointingTests(test.TestCase):
           self.evaluate(model.variables[0].assign([[42.]]))
         root.save(file_prefix=checkpoint_prefix)
         self.assertEqual((training_continuation + 1) * num_training_steps,
-                         self.evaluate(root.global_step))
+                         self.evaluate(optimizer.iterations))
         self.assertEqual(training_continuation + 1,
                          self.evaluate(root.save_counter))
   # pylint: enable=cell-var-from-loop
 
   def _get_checkpoint_name(self, name):
-    root = tracking.Checkpointable()
-    checkpointable_utils.add_variable(
+    root = tracking.AutoTrackable()
+    trackable_utils.add_variable(
         root, name=name, shape=[1, 2], dtype=dtypes.float64)
-    (named_variable,), _, _ = checkpointable_utils._serialize_object_graph(
-        root, saveables_cache=None)
+    (named_variable,), _, _ = graph_view.ObjectGraphView(
+        root).serialize_object_graph()
     with ops.name_scope("root/" + named_variable.name):
       pass  # Make sure we can use this as an op name if we prefix it.
     return named_variable.name
@@ -681,23 +683,23 @@ class CheckpointingTests(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def testNumberedPath(self):
-    root = tracking.Checkpointable()
-    leaf = tracking.Checkpointable()
+    root = tracking.AutoTrackable()
+    leaf = tracking.AutoTrackable()
     root.leaf = leaf
-    checkpointable_utils.add_variable(leaf, name="v", shape=[])
-    (named_variable,), _, _ = checkpointable_utils._serialize_object_graph(
-        root, saveables_cache=None)
+    trackable_utils.add_variable(leaf, name="v", shape=[])
+    (named_variable,), _, _ = graph_view.ObjectGraphView(
+        root).serialize_object_graph()
     self.assertEqual(r"leaf/v/.ATTRIBUTES/VARIABLE_VALUE", named_variable.name)
 
   @test_util.run_in_graph_and_eager_modes
   def testLocalNameValidation(self):
-    root = tracking.Checkpointable()
-    leaf = tracking.Checkpointable()
+    root = tracking.AutoTrackable()
+    leaf = tracking.AutoTrackable()
     # Dots are escaped, which avoids conflicts with reserved names.
-    root._track_checkpointable(leaf, name=".ATTRIBUTES")
-    checkpointable_utils.add_variable(checkpointable=leaf, name="a", shape=[])
-    (named_variable,), _, _ = checkpointable_utils._serialize_object_graph(
-        root, saveables_cache=None)
+    root._track_trackable(leaf, name=".ATTRIBUTES")
+    trackable_utils.add_variable(trackable=leaf, name="a", shape=[])
+    (named_variable,), _, _ = graph_view.ObjectGraphView(
+        root).serialize_object_graph()
     self.assertEqual("..ATTRIBUTES/a/.ATTRIBUTES/VARIABLE_VALUE",
                      named_variable.name)
 
@@ -716,10 +718,10 @@ class CheckpointingTests(test.TestCase):
 
     with context.eager_mode():
       model = Model()
-      optimizer = adam.AdamOptimizer(learning_rate=0.05)
+      optimizer = adam.Adam(learning_rate=0.05)
       checkpoint_directory = self.get_temp_dir()
       checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-      checkpoint = checkpointable_utils.Checkpoint(
+      checkpoint = trackable_utils.Checkpoint(
           model=model, optimizer=optimizer)
       for _ in range(2):
         checkpoint.save(checkpoint_prefix)
@@ -733,13 +735,13 @@ class CheckpointingTests(test.TestCase):
   @test_util.run_in_graph_and_eager_modes
   def testLateDependencyTracking(self):
 
-    class Dependency(tracking.Checkpointable):
+    class Dependency(tracking.AutoTrackable):
 
       def build(self):
-        self.var = checkpointable_utils.add_variable(
+        self.var = trackable_utils.add_variable(
             self, "var", initializer=0.)
 
-    class LateDependencies(tracking.Checkpointable):
+    class LateDependencies(trackable_utils.Checkpoint):
 
       def add_dep(self):
         self.dep = Dependency()
@@ -750,11 +752,9 @@ class CheckpointingTests(test.TestCase):
     self.evaluate(state_ops.assign(original.dep.var, 123.))
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    save_path = checkpointable_utils.CheckpointableSaver(
-        original).save(checkpoint_prefix)
+    save_path = original.save(checkpoint_prefix)
     load_into = LateDependencies()
-    status = checkpointable_utils.CheckpointableSaver(
-        load_into).restore(save_path)
+    status = load_into.restore(save_path)
     status.assert_existing_objects_matched()
     with self.assertRaises(AssertionError):
       status.assert_consumed()
@@ -766,13 +766,13 @@ class CheckpointingTests(test.TestCase):
   @test_util.run_in_graph_and_eager_modes
   def testDepAfterVar(self):
 
-    class Dependency(tracking.Checkpointable):
+    class Dependency(tracking.AutoTrackable):
 
       def build(self):
-        self.var = checkpointable_utils.add_variable(
+        self.var = trackable_utils.add_variable(
             self, "var", initializer=0.)
 
-    class DepAfterVar(tracking.Checkpointable):
+    class DepAfterVar(trackable_utils.Checkpoint):
 
       def add_dep(self):
         dep = Dependency()
@@ -784,12 +784,10 @@ class CheckpointingTests(test.TestCase):
     self.evaluate(state_ops.assign(dep_after_var.dep.var, -14.))
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    save_path = checkpointable_utils.CheckpointableSaver(dep_after_var).save(
-        checkpoint_prefix)
+    save_path = dep_after_var.save(checkpoint_prefix)
 
     loaded_dep_after_var = DepAfterVar()
-    status = checkpointable_utils.CheckpointableSaver(
-        loaded_dep_after_var).restore(save_path)
+    status = loaded_dep_after_var.restore(save_path)
     loaded_dep_after_var.add_dep()
     status.assert_consumed()
     status.run_restore_ops()
@@ -799,89 +797,86 @@ class CheckpointingTests(test.TestCase):
   def testDeferredSlotRestoration(self):
     checkpoint_directory = self.get_temp_dir()
 
-    root = tracking.Checkpointable()
-    root.var = checkpointable_utils.add_variable(
+    root = trackable_utils.Checkpoint()
+    root.var = trackable_utils.add_variable(
         root, name="var", initializer=0.)
-    optimizer = adam.AdamOptimizer(0.1)
-    if context.executing_eagerly():
-      optimizer.minimize(root.var.read_value)
-    else:
-      train_op = optimizer.minimize(root.var)
-      # Note that `optimizer` has not been added as a dependency of
-      # `root`. Create a one-off grouping so that slot variables for `root.var`
-      # get initialized too.
-      self.evaluate(checkpointable_utils.gather_initializers(
-          checkpointable_utils.Checkpoint(root=root, optimizer=optimizer)))
-      self.evaluate(train_op)
+    optimizer = adam.Adam(0.1)
+    variables = [root.var]
+    gradients = [1.]
+    train_op = optimizer.apply_gradients(zip(gradients, variables))
+    # Note that `optimizer` has not been added as a dependency of
+    # `root`. Create a one-off grouping so that slot variables for `root.var`
+    # get initialized too.
+    self.evaluate(trackable_utils.gather_initializers(
+        trackable_utils.Checkpoint(root=root, optimizer=optimizer)))
+    self.evaluate(train_op)
     self.evaluate(state_ops.assign(root.var, 12.))
-    no_slots_path = checkpointable_utils.CheckpointableSaver(root).save(
-        os.path.join(checkpoint_directory, "no_slots"))
+    no_slots_path = root.save(os.path.join(checkpoint_directory, "no_slots"))
     root.optimizer = optimizer
     self.evaluate(state_ops.assign(root.var, 13.))
-    self.evaluate(state_ops.assign(optimizer.get_slot(name="m", var=root.var),
-                                   14.))
-    slots_path = checkpointable_utils.CheckpointableSaver(root).save(
-        os.path.join(checkpoint_directory, "with_slots"))
-    new_root = tracking.Checkpointable()
+    self.evaluate(state_ops.assign(
+        optimizer.get_slot(slot_name="m", var=root.var),
+        14.))
+    slots_path = root.save(os.path.join(checkpoint_directory, "with_slots"))
+    new_root = trackable_utils.Checkpoint()
     # Load the slot-containing checkpoint (deferred), then immediately overwrite
     # the non-slot variable (also deferred).
-    slot_status = checkpointable_utils.CheckpointableSaver(
-        new_root).restore(slots_path)
-    no_slot_status = checkpointable_utils.CheckpointableSaver(
-        new_root).restore(no_slots_path)
+    slot_status = new_root.restore(slots_path)
+    no_slot_status = new_root.restore(no_slots_path)
     with self.assertRaises(AssertionError):
       no_slot_status.assert_consumed()
-    new_root.var = checkpointable_utils.add_variable(
+    new_root.var = trackable_utils.add_variable(
         new_root, name="var", shape=[])
     no_slot_status.assert_consumed()
     no_slot_status.run_restore_ops()
     self.assertEqual(12., self.evaluate(new_root.var))
-    new_root.optimizer = adam.AdamOptimizer(0.1)
+    new_root.optimizer = adam.Adam(0.1)
     slot_status.assert_existing_objects_matched()
-    with self.assertRaisesRegexp(AssertionError, "beta1_power"):
-      slot_status.assert_consumed()
+    if not context.executing_eagerly():
+      with self.assertRaisesRegexp(AssertionError, "Unresolved object"):
+        slot_status.assert_consumed()
     self.assertEqual(12., self.evaluate(new_root.var))
     if context.executing_eagerly():
       # Slot variables are only created with restoring initializers when
       # executing eagerly.
       self.assertEqual(14., self.evaluate(
-          new_root.optimizer.get_slot(name="m", var=new_root.var)))
+          new_root.optimizer.get_slot(slot_name="m", var=new_root.var)))
     else:
-      self.assertIs(new_root.optimizer.get_slot(name="m", var=new_root.var),
-                    None)
-    if context.executing_eagerly():
-      new_root.optimizer.minimize(new_root.var.read_value)
-    else:
-      train_op = new_root.optimizer.minimize(new_root.var)
-      # The slot variable now exists; restore() didn't create it, but we should
-      # now have a restore op for it.
-      slot_status.run_restore_ops()
+      # Slot variables are not created eagerly when graph building.
+      with self.assertRaises(KeyError):
+        new_root.optimizer.get_slot(slot_name="m", var=new_root.var)
+    variables = [new_root.var]
+    gradients = [1.]
+    train_op = new_root.optimizer.apply_gradients(zip(gradients, variables))
+    # The slot variable now exists; restore() didn't create it, but we should
+    # now have a restore op for it.
+    slot_status.run_restore_ops()
+    if not context.executing_eagerly():
+      # The train op hasn't run when graph building, so the slot variable has
+      # its restored value. It has run in eager, so the value will be different.
       self.assertEqual(14., self.evaluate(
-          new_root.optimizer.get_slot(name="m", var=new_root.var)))
-      self.evaluate(train_op)
+          new_root.optimizer.get_slot(slot_name="m", var=new_root.var)))
+    self.evaluate(train_op)
     slot_status.assert_consumed()
 
   @test_util.run_in_graph_and_eager_modes
   def testOverlappingRestores(self):
     checkpoint_directory = self.get_temp_dir()
-    save_root = tracking.Checkpointable()
-    save_root.dep = tracking.Checkpointable()
-    save_root.dep.var = checkpointable_utils.add_variable(
+    save_root = trackable_utils.Checkpoint()
+    save_root.dep = tracking.AutoTrackable()
+    save_root.dep.var = trackable_utils.add_variable(
         save_root.dep, name="var", initializer=0.)
     self.evaluate(state_ops.assign(save_root.dep.var, 12.))
-    saver = checkpointable_utils.CheckpointableSaver(save_root)
-    first_path = saver.save(os.path.join(checkpoint_directory, "first"))
+    first_path = save_root.save(os.path.join(checkpoint_directory, "first"))
     self.evaluate(state_ops.assign(save_root.dep.var, 13.))
-    second_path = saver.save(os.path.join(checkpoint_directory, "second"))
-
-    first_root = tracking.Checkpointable()
-    second_root = tracking.Checkpointable()
-    first_status = checkpointable_utils.CheckpointableSaver(
-        first_root).restore(first_path)
-    second_status = checkpointable_utils.CheckpointableSaver(
-        second_root).restore(second_path)
-    load_dep = tracking.Checkpointable()
-    load_dep.var = checkpointable_utils.add_variable(
+    second_path = save_root.save(os.path.join(checkpoint_directory, "second"))
+
+    first_root = trackable_utils.Checkpoint()
+    second_root = trackable_utils.Checkpoint()
+    first_status = first_root.restore(first_path)
+    second_status = second_root.restore(second_path)
+    load_dep = tracking.AutoTrackable()
+    load_dep.var = trackable_utils.add_variable(
         load_dep, name="var", shape=[])
     first_root.dep = load_dep
     first_status.assert_consumed()
@@ -894,14 +889,12 @@ class CheckpointingTests(test.TestCase):
 
     # Try again with the order of the restore() reversed. The last restore
     # determines the final value.
-    first_root = tracking.Checkpointable()
-    second_root = tracking.Checkpointable()
-    second_status = checkpointable_utils.CheckpointableSaver(
-        second_root).restore(second_path)
-    first_status = checkpointable_utils.CheckpointableSaver(
-        first_root).restore(first_path)
-    load_dep = tracking.Checkpointable()
-    load_dep.var = checkpointable_utils.add_variable(
+    first_root = trackable_utils.Checkpoint()
+    second_root = trackable_utils.Checkpoint()
+    second_status = second_root.restore(second_path)
+    first_status = first_root.restore(first_path)
+    load_dep = tracking.AutoTrackable()
+    load_dep.var = trackable_utils.add_variable(
         load_dep, name="var", shape=[])
     first_root.dep = load_dep
     first_status.assert_consumed()
@@ -916,24 +909,22 @@ class CheckpointingTests(test.TestCase):
   def testAmbiguousLoad(self):
     # Not OK to split one checkpoint object into two
     checkpoint_directory = self.get_temp_dir()
-    save_root = tracking.Checkpointable()
-    save_root.dep_one = tracking.Checkpointable()
-    save_root.dep_two = tracking.Checkpointable()
-    dep_three = tracking.Checkpointable()
+    save_root = trackable_utils.Checkpoint()
+    save_root.dep_one = tracking.AutoTrackable()
+    save_root.dep_two = tracking.AutoTrackable()
+    dep_three = tracking.AutoTrackable()
     save_root.dep_one.dep_three = dep_three
     save_root.dep_two.dep_three = dep_three
-    checkpointable_utils.add_variable(dep_three, name="var", initializer=0.)
-    self.evaluate(checkpointable_utils.gather_initializers(save_root))
-    save_path = checkpointable_utils.CheckpointableSaver(save_root).save(
-        os.path.join(checkpoint_directory, "ckpt"))
-    load_root = tracking.Checkpointable()
-    status = checkpointable_utils.CheckpointableSaver(load_root).restore(
-        save_path)
-    load_root.dep_one = tracking.Checkpointable()
-    load_root.dep_two = tracking.Checkpointable()
-    load_root.dep_one.dep_three = tracking.Checkpointable()
-    load_root.dep_two.dep_three = tracking.Checkpointable()
-    checkpointable_utils.add_variable(
+    trackable_utils.add_variable(dep_three, name="var", initializer=0.)
+    self.evaluate(trackable_utils.gather_initializers(save_root))
+    save_path = save_root.save(os.path.join(checkpoint_directory, "ckpt"))
+    load_root = trackable_utils.Checkpoint()
+    status = load_root.restore(save_path)
+    load_root.dep_one = tracking.AutoTrackable()
+    load_root.dep_two = tracking.AutoTrackable()
+    load_root.dep_one.dep_three = tracking.AutoTrackable()
+    load_root.dep_two.dep_three = tracking.AutoTrackable()
+    trackable_utils.add_variable(
         load_root.dep_one.dep_three, name="var", initializer=0.)
     with self.assertRaises(AssertionError):
       status.assert_consumed()
@@ -944,24 +935,23 @@ class CheckpointingTests(test.TestCase):
   def testObjectsCombined(self):
     # Currently fine to load two checkpoint objects into one Python object
     checkpoint_directory = self.get_temp_dir()
-    save_root = tracking.Checkpointable()
-    save_root.dep_one = tracking.Checkpointable()
-    save_root.dep_two = tracking.Checkpointable()
-    checkpointable_utils.add_variable(
+    save_root = trackable_utils.Checkpoint()
+    save_root.dep_one = tracking.AutoTrackable()
+    save_root.dep_two = tracking.AutoTrackable()
+    trackable_utils.add_variable(
         save_root.dep_one, name="var1", initializer=32., dtype=dtypes.float64)
-    checkpointable_utils.add_variable(
+    trackable_utils.add_variable(
         save_root.dep_two, name="var2", initializer=64., dtype=dtypes.float64)
-    self.evaluate(checkpointable_utils.gather_initializers(save_root))
-    save_path = checkpointable_utils.CheckpointableSaver(save_root).save(
-        os.path.join(checkpoint_directory, "ckpt"))
-    load_root = tracking.Checkpointable()
-    load_root.dep_one = tracking.Checkpointable()
+    self.evaluate(trackable_utils.gather_initializers(save_root))
+    save_path = save_root.save(os.path.join(checkpoint_directory, "ckpt"))
+    load_root = trackable_utils.Checkpoint()
+    load_root.dep_one = tracking.AutoTrackable()
     load_root.dep_two = load_root.dep_one
-    v1 = checkpointable_utils.add_variable(
+    v1 = trackable_utils.add_variable(
         load_root.dep_one, name="var1", shape=[], dtype=dtypes.float64)
-    v2 = checkpointable_utils.add_variable(
+    v2 = trackable_utils.add_variable(
         load_root.dep_one, name="var2", shape=[], dtype=dtypes.float64)
-    status = checkpointable_utils.CheckpointableSaver(load_root).restore(
+    status = load_root.restore(
         save_path).assert_consumed().assert_existing_objects_matched()
     status.run_restore_ops()
     self.assertEqual(32., self.evaluate(v1))
@@ -971,31 +961,29 @@ class CheckpointingTests(test.TestCase):
   def testDependencyLoop(self):
     # Note: this test creates garbage during eager execution because it
     # purposefully creates a reference cycle.
-    first = tracking.Checkpointable()
-    second = tracking.Checkpointable()
+    first = trackable_utils.Checkpoint()
+    second = trackable_utils.Checkpoint()
     first.second = second
     second.first = first
-    first.v = checkpointable_utils.add_variable(
+    first.v = trackable_utils.add_variable(
         first, "v1", initializer=[3., 1., 4.])
-    second.v = checkpointable_utils.add_variable(
+    second.v = trackable_utils.add_variable(
         second, "v2", initializer=[1., 1., 2., 3.])
-    self.evaluate(checkpointable_utils.gather_initializers(first))
+    self.evaluate(trackable_utils.gather_initializers(first))
     checkpoint_directory = self.get_temp_dir()
-    save_path = checkpointable_utils.CheckpointableSaver(first).save(
-        os.path.join(checkpoint_directory, "ckpt"))
+    save_path = first.save(os.path.join(checkpoint_directory, "ckpt"))
 
     # Test deferred loading
-    first_load = tracking.Checkpointable()
-    status = checkpointable_utils.CheckpointableSaver(
-        first_load).restore(save_path)
-    second_load = tracking.Checkpointable()
+    first_load = trackable_utils.Checkpoint()
+    status = first_load.restore(save_path)
+    second_load = tracking.AutoTrackable()
     first_load.second = second_load
     second_load.first = first_load
     with self.assertRaises(AssertionError):
       status.assert_consumed()
-    first_load.v = checkpointable_utils.add_variable(
+    first_load.v = trackable_utils.add_variable(
         first_load, "v1", shape=[3])
-    second_load.v = checkpointable_utils.add_variable(
+    second_load.v = trackable_utils.add_variable(
         second_load, "v2", shape=[4])
     status.assert_consumed()
     status.run_restore_ops()
@@ -1007,8 +995,7 @@ class CheckpointingTests(test.TestCase):
     self.assertAllEqual([2., 7., 1.], self.evaluate(first_load.v))
     self.evaluate(second_load.v.assign([2., 7., 1., 8.]))
     self.assertAllEqual([2., 7., 1., 8.], self.evaluate(second_load.v))
-    status = checkpointable_utils.CheckpointableSaver(first_load).restore(
-        save_path).assert_consumed()
+    status = first_load.restore(save_path).assert_consumed()
     status.run_restore_ops()
     self.assertAllEqual([3., 1., 4.], self.evaluate(first_load.v))
     self.assertAllEqual([1., 1., 2., 3.], self.evaluate(second_load.v))
@@ -1017,19 +1004,17 @@ class CheckpointingTests(test.TestCase):
   def testRestoreOnAssign(self):
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    first = tracking.Checkpointable()
-    first.var1 = variables.Variable(0., name="outside_var")
-    first.var2 = variables.Variable(0., name="blah")
+    first = trackable_utils.Checkpoint()
+    first.var1 = variables_lib.Variable(0., name="outside_var")
+    first.var2 = variables_lib.Variable(0., name="blah")
     self.evaluate(first.var1.assign(4.))
     self.evaluate(first.var2.assign(8.))
-    save_path = checkpointable_utils.CheckpointableSaver(first).save(
-        checkpoint_prefix)
-
-    second = tracking.Checkpointable()
-    second.var2 = variables.Variable(0., name="blah")
-    status = checkpointable_utils.CheckpointableSaver(
-        second).restore(save_path)
-    recreated_var1 = variables.Variable(0., name="outside_var")
+    save_path = first.save(checkpoint_prefix)
+
+    second = trackable_utils.Checkpoint()
+    second.var2 = variables_lib.Variable(0., name="blah")
+    status = second.restore(save_path)
+    recreated_var1 = variables_lib.Variable(0., name="outside_var")
     status.run_restore_ops()
     self.assertEqual(8., self.evaluate(second.var2))
     self.evaluate(recreated_var1.assign(-2.))
@@ -1045,26 +1030,26 @@ class CheckpointingTests(test.TestCase):
       with graph.as_default(), self.session(graph):
         checkpoint_directory = self.get_temp_dir()
         checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-        obj = tracking.Checkpointable()
-        obj.var = variable_scope.get_variable(name="v", initializer=0.)
-        obj.opt = adam.AdamOptimizer(0.1)
-        obj.opt.minimize(obj.var.read_value())
-        self.evaluate(checkpointable_utils.gather_initializers(obj))
-        saver = checkpointable_utils.CheckpointableSaver(obj)
-        saver.save(checkpoint_prefix)
-        before_ops = graph.get_operations()
-        saver.save(checkpoint_prefix)
-        self.assertEqual(before_ops, graph.get_operations())
+        obj = trackable_utils.Checkpoint()
+        obj.var = variables_lib.Variable(0., name="v")
+        obj.opt = adam.Adam(0.1)
+        variables = [obj.var]
+        gradients = [1.]
+        obj.opt.apply_gradients(zip(gradients, variables))
+        self.evaluate(trackable_utils.gather_initializers(obj))
+        obj.save(checkpoint_prefix)
+        graph.finalize()
+        obj.save(checkpoint_prefix)
 
   @test_util.run_in_graph_and_eager_modes
   def testCheckpointState(self):
     # No checkpoints are deleted by default
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    obj = tracking.Checkpointable()
+    obj = tracking.AutoTrackable()
     obj.var = variable_scope.get_variable(name="v", initializer=0.)
-    self.evaluate(checkpointable_utils.gather_initializers(obj))
-    saver = checkpointable_utils.Checkpoint(obj=obj)
+    self.evaluate(trackable_utils.gather_initializers(obj))
+    saver = trackable_utils.Checkpoint(obj=obj)
     for _ in range(10):
       saver.save(checkpoint_prefix)
     expected_filenames = ["checkpoint"]
@@ -1081,10 +1066,10 @@ class CheckpointingTests(test.TestCase):
   def testCheckpointStateChangingVarList(self):
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    obj = tracking.Checkpointable()
+    obj = tracking.AutoTrackable()
     obj.var = variable_scope.get_variable(name="v", initializer=0.)
-    self.evaluate(checkpointable_utils.gather_initializers(obj))
-    checkpoint = checkpointable_utils.Checkpoint(obj=obj)
+    self.evaluate(trackable_utils.gather_initializers(obj))
+    checkpoint = trackable_utils.Checkpoint(obj=obj)
     looped_variables = []
     for iteration in range(10):
       new_variable = resource_variable_ops.ResourceVariable(iteration)
@@ -1134,80 +1119,22 @@ class CheckpointingTests(test.TestCase):
       with graph.as_default(), self.session(graph):
         checkpoint_directory = self.get_temp_dir()
         checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-        obj = tracking.Checkpointable()
-        obj.var = variable_scope.get_variable(name="v", initializer=0.)
-        obj.opt = adam.AdamOptimizer(0.1)
-        obj.opt.minimize(obj.var.read_value())
-        self.evaluate(checkpointable_utils.gather_initializers(obj))
-        saver = checkpointable_utils.CheckpointableSaver(obj)
-        save_path = saver.save(checkpoint_prefix)
-        saver.restore(save_path)
-        before_ops = graph.get_operations()
-        saver.restore(save_path)
-        self.assertEqual(before_ops, graph.get_operations())
-
-  def testMultipleGraphsNonSlotVariables(self):
-    with context.graph_mode():
-      checkpoint_directory = self.get_temp_dir()
-      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-      optimizer = adam.AdamOptimizer(0.001)
-      # Construct a model in one graph
-      first_graph = ops.Graph()
-      first_session = session_lib.Session(graph=first_graph)
-      with first_graph.as_default(), first_session.as_default():
-        first_variable = resource_variable_ops.ResourceVariable([1.])
-        first_root_checkpointable = checkpointable_utils.Checkpoint(
-            optimizer=optimizer, variable=first_variable)
-        train_op = optimizer.minimize(first_variable.read_value)
-        self.evaluate(checkpointable_utils.gather_initializers(
-            first_root_checkpointable))
-        self.evaluate(train_op)
-        self.evaluate(first_variable.assign([1.]))
-        self.evaluate(optimizer.get_slot(
-            var=first_variable, name="m").assign([2.]))
-        beta1_power, _ = optimizer._get_beta_accumulators()
-        self.evaluate(beta1_power.assign(3.))
-
-      # Save and load in a second graph
-      second_graph = ops.Graph()
-      with second_graph.as_default(), session_lib.Session(graph=second_graph):
-        second_variable = resource_variable_ops.ResourceVariable([1.])
-        second_root_checkpointable = checkpointable_utils.Checkpoint(
-            optimizer=optimizer, variable=second_variable)
-        train_op = optimizer.minimize(second_variable.read_value)
-        second_root_checkpointable.restore(None).initialize_or_restore()
-        self.evaluate(train_op)
-        self.evaluate(second_variable.assign([4.]))
-        self.evaluate(optimizer.get_slot(
-            var=second_variable, name="m").assign([5.]))
-        beta1_power, _ = optimizer._get_beta_accumulators()
-        self.evaluate(beta1_power.assign(6.))
-        save_path = second_root_checkpointable.save(checkpoint_prefix)
-        self.evaluate(second_variable.assign([7.]))
-        self.evaluate(optimizer.get_slot(
-            var=second_variable, name="m").assign([8.]))
-        beta1_power, _ = optimizer._get_beta_accumulators()
-        self.assertAllEqual(6., self.evaluate(beta1_power))
-        status = second_root_checkpointable.restore(save_path)
-        status.assert_consumed().run_restore_ops()
-        self.assertAllEqual([4.], self.evaluate(second_variable))
-        self.assertAllEqual([5.], self.evaluate(optimizer.get_slot(
-            var=second_variable, name="m")))
-        beta1_power, _ = optimizer._get_beta_accumulators()
-        self.assertAllEqual(6., self.evaluate(beta1_power))
-
-      # Check that the first graph is unmolested
-      with first_graph.as_default(), first_session.as_default():
-        self.assertAllEqual([1.], self.evaluate(first_variable))
-        self.assertAllEqual([2.], self.evaluate(optimizer.get_slot(
-            var=first_variable, name="m")))
-        beta1_power, _ = optimizer._get_beta_accumulators()
-        self.assertAllEqual(3., self.evaluate(beta1_power))
+        obj = trackable_utils.Checkpoint()
+        obj.var = variables_lib.Variable(0., name="v")
+        obj.opt = adam.Adam(0.1)
+        variables = [obj.var]
+        gradients = [1.]
+        obj.opt.apply_gradients(zip(gradients, variables))
+        self.evaluate(trackable_utils.gather_initializers(obj))
+        save_path = obj.save(checkpoint_prefix)
+        obj.restore(save_path)
+        graph.finalize()
+        obj.restore(save_path)
 
   @test_util.run_in_graph_and_eager_modes
   def test_sequential(self):
     model = sequential.Sequential()
-    checkpoint = checkpointable_utils.Checkpoint(model=model)
+    checkpoint = trackable_utils.Checkpoint(model=model)
     model.add(core.Dense(4))
     second_dense = core.Dense(5)
     model.add(second_dense)
@@ -1224,7 +1151,7 @@ class CheckpointingTests(test.TestCase):
     self.assertAllEqual([1., 2., 3., 4., 5.], self.evaluate(second_dense.bias))
 
     deferred_sequential = sequential.Sequential()
-    deferred_sequential_checkpoint = checkpointable_utils.Checkpoint(
+    deferred_sequential_checkpoint = trackable_utils.Checkpoint(
         model=deferred_sequential)
     status = deferred_sequential_checkpoint.restore(save_path)
     deferred_sequential.add(core.Dense(4))
@@ -1243,81 +1170,93 @@ class CheckpointingTests(test.TestCase):
     optimizer_only_prefix = os.path.join(checkpoint_directory, "opt")
     with test_util.device(use_gpu=True):
       model = MyModel()
-      optimizer = adam.AdamOptimizer(0.001)
-      root = checkpointable_utils.Checkpoint(
-          model=model,  # Do not save the optimizer with the checkpoint.
-          global_step=training_util.get_or_create_global_step())
-      optimizer_checkpoint = checkpointable_utils.Checkpoint(
+      optimizer = adam.Adam(0.001)
+      root = trackable_utils.Checkpoint(
+          model=model)  # Do not save the optimizer with the checkpoint.
+      optimizer_checkpoint = trackable_utils.Checkpoint(
           optimizer=optimizer)
 
       checkpoint_path = checkpoint_management.latest_checkpoint(
           checkpoint_directory)
       status = root.restore(save_path=checkpoint_path)
       input_value = constant_op.constant([[3.]])
-      train_fn = functools.partial(
-          optimizer.minimize,
-          functools.partial(model, input_value),
-          global_step=root.global_step)
+      def train_fn():
+        with backprop.GradientTape() as tape:
+          loss = model(input_value)
+        variables = model.trainable_variables
+        gradients = tape.gradient(loss, variables)
+        return optimizer.apply_gradients(zip(gradients, variables))
       if not context.executing_eagerly():
         train_fn = functools.partial(self.evaluate, train_fn())
       status.initialize_or_restore()
-      self.evaluate([v.initializer for v in optimizer.variables()])
+      # TODO(tanzheny): Add hyper variables to .variables(), and set them with
+      # set_weights etc.
+      variables_not_in_the_variables_property = [
+          obj for obj in optimizer._hyper.values()
+          if isinstance(obj, variables_lib.Variable)]
+      self.evaluate([v.initializer for v
+                     in optimizer.variables()
+                     + variables_not_in_the_variables_property])
       train_fn()
       model_save_path = root.save(file_prefix=checkpoint_prefix)
-      self.evaluate(optimizer.variables()[0].assign(42.))
+      self.evaluate(optimizer.beta_1.assign(42.))
       optimizer_save_path = optimizer_checkpoint.save(optimizer_only_prefix)
+    del train_fn
 
     # Restore into a graph with the optimizer
     with test_util.device(use_gpu=True):
       model = MyModel()
-      optimizer = adam.AdamOptimizer(0.001)
-      root = checkpointable_utils.Checkpoint(
-          optimizer=optimizer, model=model,
-          global_step=training_util.get_or_create_global_step())
+      optimizer = adam.Adam(0.001)
+      root = trackable_utils.Checkpoint(
+          optimizer=optimizer, model=model)
       status = root.restore(save_path=model_save_path)
       input_value = constant_op.constant([[3.]])
-      train_fn = functools.partial(
-          optimizer.minimize,
-          functools.partial(model, input_value),
-          global_step=root.global_step)
+      def train_fn1():
+        with backprop.GradientTape() as tape:
+          loss = model(input_value)
+        variables = model.trainable_variables
+        gradients = tape.gradient(loss, variables)
+        return optimizer.apply_gradients(zip(gradients, variables))
       if not context.executing_eagerly():
-        train_fn = functools.partial(self.evaluate, train_fn())
+        train_fn1 = functools.partial(self.evaluate, train_fn1())
       status.initialize_or_restore()
-      train_fn()
+      train_fn1()
       with self.assertRaises(AssertionError):
         status.assert_existing_objects_matched()
       with self.assertRaises(AssertionError):
         status.assert_consumed()
+    del train_fn1
 
     # Make sure initialization doesn't clobber later restores
     with test_util.device(use_gpu=True):
       model = MyModel()
-      optimizer = adam.AdamOptimizer(0.001, beta1=1.0)
-      root = checkpointable_utils.Checkpoint(
-          optimizer=optimizer, model=model,
-          global_step=training_util.get_or_create_global_step())
-      opt_root = checkpointable_utils.Checkpoint(
+      optimizer = adam.Adam(0.001, beta_1=1.0)
+      root = trackable_utils.Checkpoint(
+          optimizer=optimizer, model=model)
+      opt_root = trackable_utils.Checkpoint(
           optimizer=optimizer)
       status = root.restore(save_path=model_save_path)
       init_only_optimizer_status = opt_root.restore(save_path=None)
       optimizer_status = opt_root.restore(save_path=optimizer_save_path)
       input_value = constant_op.constant([[3.]])
-      train_fn = functools.partial(
-          optimizer.minimize,
-          functools.partial(model, input_value),
-          global_step=root.global_step)
+      def train_fn2():
+        with backprop.GradientTape() as tape:
+          loss = model(input_value)
+        variables = model.trainable_variables
+        gradients = tape.gradient(loss, variables)
+        return optimizer.apply_gradients(zip(gradients, variables))
       if not context.executing_eagerly():
-        train_fn = functools.partial(self.evaluate, train_fn())
+        train_fn2 = functools.partial(self.evaluate, train_fn2())
       optimizer_status.run_restore_ops()
       status.initialize_or_restore()
       init_only_optimizer_status.initialize_or_restore()
-      train_fn()
-      self.assertEqual(42., self.evaluate(optimizer.variables()[0]))
+      train_fn2()
+      self.assertEqual(42., self.evaluate(optimizer.beta_1))
 
   @test_util.run_in_graph_and_eager_modes
-  def test_restore_after_adding_empty_checkpointable_data_structure(self):
-    model = NonLayerCheckpointable()
-    checkpoint = checkpointable_utils.Checkpoint(model=model)
+  def test_restore_after_adding_empty_trackable_data_structure(self):
+    model = NonLayerTrackable()
+    checkpoint = trackable_utils.Checkpoint(model=model)
     checkpoint.restore(None).initialize_or_restore()
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
@@ -1325,30 +1264,53 @@ class CheckpointingTests(test.TestCase):
 
     del model, checkpoint
 
-    model = NonLayerCheckpointable()
+    model = NonLayerTrackable()
     model.dict = {"a": 1}
     model.list = {"b": 1}
-    checkpoint = checkpointable_utils.Checkpoint(model=model)
+    checkpoint = trackable_utils.Checkpoint(model=model)
     load_status = checkpoint.restore(save_path)
     load_status.assert_existing_objects_matched().run_restore_ops()
 
-
-class _ManualScope(tracking.Checkpointable):
+  @test_util.run_in_graph_and_eager_modes
+  def test_write_checkpoint_from_function(self):
+    checkpoint_prefix = os.path.join(self.get_temp_dir(), "ckpt")
+    save_checkpoint = trackable_utils.Checkpoint(
+        v=variables_lib.Variable(1.))
+
+    @def_function.function
+    def _write_checkpoint():
+      save_path = save_checkpoint.write(checkpoint_prefix)
+      return save_path
+
+    self.evaluate([save_checkpoint.v.initializer])
+    self.evaluate(_write_checkpoint())
+    load_checkpoint = trackable_utils.Checkpoint(
+        v=variables_lib.Variable(0.))
+    load_checkpoint.restore(checkpoint_prefix).run_restore_ops()
+    self.assertEqual(1., self.evaluate(load_checkpoint.v))
+    self.evaluate(save_checkpoint.v.assign(3.))
+    self.evaluate(_write_checkpoint())
+    self.evaluate(save_checkpoint.v.assign(0.))
+    load_checkpoint.restore(checkpoint_prefix).run_restore_ops()
+    self.assertEqual(3., self.evaluate(load_checkpoint.v))
+
+
+class _ManualScope(tracking.AutoTrackable):
 
   def __call__(self):
     with variable_scope.variable_scope("ManualScope") as vs:
       self.variable_scope = vs
-      with checkpointable_utils.capture_dependencies(template=self):
+      with trackable_utils.capture_dependencies(template=self):
         return self._build()
 
   def _build(self):
     return variable_scope.get_variable(name="in_manual_scope", shape=[])
 
 
-class TemplateTests(test.TestCase):
+class TemplateTests(parameterized.TestCase, test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
-  def test_checkpointable_save_restore(self):
+  def test_trackable_save_restore(self):
 
     def _templated():
       v = variable_scope.get_variable(
@@ -1365,14 +1327,15 @@ class TemplateTests(test.TestCase):
     six.assertCountEqual(
         self,
         [v1_save, v2_save, manual_scope, manual_scope_v, save_template],
-        checkpointable_utils.list_objects(save_template))
+        trackable_utils.list_objects(save_template))
     manual_dep, = manual_scope._checkpoint_dependencies
     self.assertEqual("in_manual_scope", manual_dep.name)
     self.assertIs(manual_scope_v, manual_dep.ref)
-    optimizer = adam.AdamOptimizer(0.0)
-    save_root = checkpointable_utils.Checkpoint(
+    optimizer = adam.Adam(0.0)
+    save_root = trackable_utils.Checkpoint(
         my_template=save_template, optimizer=optimizer)
-    optimizer.minimize(v1_save.read_value)
+    optimizer.minimize(v1_save.read_value,
+                       var_list=[v1_save])
     self.evaluate([v.initializer for v in save_template.variables])
     self.evaluate([v.initializer for v in optimizer.variables()])
     self.evaluate(v1_save.assign([12.]))
@@ -1382,13 +1345,13 @@ class TemplateTests(test.TestCase):
     save_path = save_root.save(checkpoint_prefix)
 
     load_template = template.make_template("s2", _templated)
-    load_optimizer = adam.AdamOptimizer(0.0)
-    load_root = checkpointable_utils.Checkpoint(
+    load_optimizer = adam.Adam(0.0)
+    load_root = trackable_utils.Checkpoint(
         my_template=load_template, optimizer=load_optimizer)
     status = load_root.restore(save_path)
     var, var_plus_one, var2, _, _ = load_template()
-    load_optimizer.minimize(var.read_value)
-    self.assertEqual(3, len(load_template._checkpoint_dependencies))
+    load_optimizer.minimize(var.read_value, var_list=[var])
+    self.assertLen(load_template._checkpoint_dependencies, 3)
     self.assertEqual("v", load_template._checkpoint_dependencies[0].name)
     self.assertEqual("v2", load_template._checkpoint_dependencies[1].name)
     self.assertEqual("ManualScope",
@@ -1399,7 +1362,7 @@ class TemplateTests(test.TestCase):
     self.assertAllEqual([14.], self.evaluate(var2))
 
   @test_util.run_in_graph_and_eager_modes
-  def test_checkpointable_save_restore_nested(self):
+  def test_trackable_save_restore_nested(self):
 
     def _inner_template():
       v = variable_scope.get_variable(
@@ -1416,7 +1379,7 @@ class TemplateTests(test.TestCase):
 
     with variable_scope.variable_scope("ignored"):
       save_template = template.make_template("s1", _outer_template)
-      save_root = checkpointable_utils.Checkpoint(my_template=save_template)
+      save_root = trackable_utils.Checkpoint(my_template=save_template)
       (inner_template_one, inner_template_two), _ = save_template()
     self.evaluate(inner_template_one.variables[0].assign([20.]))
     self.evaluate(inner_template_two.variables[0].assign([25.]))
@@ -1425,18 +1388,18 @@ class TemplateTests(test.TestCase):
     save_path = save_root.save(checkpoint_prefix)
 
     load_template = template.make_template("s2", _outer_template)
-    load_root = checkpointable_utils.Checkpoint(my_template=load_template)
+    load_root = trackable_utils.Checkpoint(my_template=load_template)
     status = load_root.restore(save_path)
     (inner_template_one, inner_template_two), (v1, v2, v3) = load_template()
     outer_template_dependencies = load_root.my_template._checkpoint_dependencies
-    self.assertEqual(2, len(outer_template_dependencies))
+    self.assertLen(outer_template_dependencies, 2)
     self.assertEqual("i1", outer_template_dependencies[0].name)
     self.assertIs(inner_template_one, outer_template_dependencies[0].ref)
     self.assertEqual("i2", outer_template_dependencies[1].name)
     self.assertIs(inner_template_two, outer_template_dependencies[1].ref)
-    self.assertEqual(1, len(inner_template_one._checkpoint_dependencies))
+    self.assertLen(inner_template_one._checkpoint_dependencies, 1)
     self.assertEqual("v", inner_template_one._checkpoint_dependencies[0].name)
-    self.assertEqual(1, len(inner_template_two._checkpoint_dependencies))
+    self.assertLen(inner_template_two._checkpoint_dependencies, 1)
     self.assertEqual("v", inner_template_two._checkpoint_dependencies[0].name)
     status.assert_consumed().run_restore_ops()
     self.assertAllEqual([20.], self.evaluate(v1))
@@ -1449,42 +1412,41 @@ class CheckpointCompatibilityTests(test.TestCase):
   def _initialized_model(self):
     input_value = constant_op.constant([[3.]])
     model = MyModel()
-    optimizer = adam.AdamOptimizer(0.001)
-    optimizer_step = training_util.get_or_create_global_step()
-    root_checkpointable = checkpointable_utils.Checkpoint(
-        optimizer=optimizer, model=model, optimizer_step=optimizer_step)
-    train_op = optimizer.minimize(
-        functools.partial(model, input_value),
-        global_step=optimizer_step)
-    self.evaluate(checkpointable_utils.gather_initializers(
-        root_checkpointable))
+    optimizer = adam.Adam(0.001)
+    root_trackable = trackable_utils.Checkpoint(
+        optimizer=optimizer, model=model)
+    with backprop.GradientTape() as tape:
+      loss = model(input_value)
+    variables = model.trainable_variables
+    gradients = tape.gradient(loss, variables)
+    train_op = optimizer.apply_gradients(zip(gradients, variables))
+    self.evaluate(trackable_utils.gather_initializers(
+        root_trackable))
     self.evaluate(train_op)
     # A regular variable, a slot variable, and a non-slot Optimizer variable
     # with known values to check when loading.
     self.evaluate(model._named_dense.bias.assign([1.]))
     self.evaluate(optimizer.get_slot(
-        var=model._named_dense.bias, name="m").assign([2.]))
-    beta1_power, _ = optimizer._get_beta_accumulators()
-    self.evaluate(beta1_power.assign(3.))
-    return root_checkpointable
+        var=model._named_dense.bias, slot_name="m").assign([2.]))
+    self.evaluate(optimizer.beta_1.assign(3.))
+    return root_trackable
 
-  def _set_sentinels(self, root_checkpointable):
-    self.evaluate(root_checkpointable.model._named_dense.bias.assign([101.]))
+  def _set_sentinels(self, root_trackable):
+    self.evaluate(root_trackable.model._named_dense.bias.assign([101.]))
     self.evaluate(
-        root_checkpointable.optimizer.get_slot(
-            var=root_checkpointable.model._named_dense.bias, name="m")
+        root_trackable.optimizer.get_slot(
+            var=root_trackable.model._named_dense.bias, slot_name="m")
         .assign([102.]))
-    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
-    self.evaluate(beta1_power.assign(103.))
+    self.evaluate(root_trackable.optimizer.beta_1.assign(103.))
 
-  def _check_sentinels(self, root_checkpointable):
+  def _check_sentinels(self, root_trackable):
     self.assertAllEqual(
-        [1.], self.evaluate(root_checkpointable.model._named_dense.bias))
+        [1.], self.evaluate(root_trackable.model._named_dense.bias))
     self.assertAllEqual([2.], self.evaluate(
-        root_checkpointable.optimizer.get_slot(
-            var=root_checkpointable.model._named_dense.bias, name="m")))
-    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
-    self.assertAllEqual(3., self.evaluate(beta1_power))
+        root_trackable.optimizer.get_slot(
+            var=root_trackable.model._named_dense.bias, slot_name="m")))
+    self.assertAllEqual(3.,
+                        self.evaluate(root_trackable.optimizer.beta_1))
 
   def _write_name_based_checkpoint(self):
     checkpoint_directory = self.get_temp_dir()
@@ -1497,7 +1459,7 @@ class CheckpointCompatibilityTests(test.TestCase):
         name_saver = saver_lib.Saver()
         return name_saver.save(
             sess=session, save_path=checkpoint_prefix,
-            global_step=root.optimizer_step)
+            global_step=root.optimizer.iterations)
 
   @test_util.run_in_graph_and_eager_modes
   def testLoadFromNameBasedSaver(self):
@@ -1508,7 +1470,8 @@ class CheckpointCompatibilityTests(test.TestCase):
       self._set_sentinels(root)
       with self.assertRaises(AssertionError):
         self._check_sentinels(root)
-      object_saver = checkpointable_utils.CheckpointableSaver(root)
+      object_saver = trackable_utils.TrackableSaver(
+          graph_view.ObjectGraphView(root))
       self._set_sentinels(root)
       status = object_saver.restore(save_path)
       if context.executing_eagerly():
@@ -1580,7 +1543,7 @@ class PythonMetadataTests(test.TestCase):
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
     dense = core.Dense(1)
-    checkpoint = checkpointable_utils.Checkpoint(dense=dense)
+    checkpoint = trackable_utils.Checkpoint(dense=dense)
     dense(constant_op.constant([[1.]]))
     checkpoint.restore(None).initialize_or_restore()
     save_path = checkpoint.save(checkpoint_prefix)
@@ -1600,7 +1563,7 @@ class PythonMetadataTests(test.TestCase):
       return json.loads(layer_json.decode("utf-8"))
 
     layer_data = _get_dense_node_from_object_graph(
-        checkpointable_utils.object_metadata(save_path))
+        trackable_utils.object_metadata(save_path))
     self.assertEqual("Dense", layer_data["class_name"])
     self.assertEqual(1, layer_data["config"]["units"])
 
@@ -1610,7 +1573,7 @@ class PythonMetadataTests(test.TestCase):
     dense.units = 42
     save_path = checkpoint.save(checkpoint_prefix)
     layer_data = _get_dense_node_from_object_graph(
-        checkpointable_utils.object_metadata(save_path))
+        trackable_utils.object_metadata(save_path))
     self.assertEqual("Dense", layer_data["class_name"])
     self.assertEqual(42, layer_data["config"]["units"])
 
diff --git a/tensorflow/python/training/tracking/util_with_v1_optimizers_test.py b/tensorflow/python/training/tracking/util_with_v1_optimizers_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..793929380ff210ff3f8fdb873d66b81ad6556f27
--- /dev/null
+++ b/tensorflow/python/training/tracking/util_with_v1_optimizers_test.py
@@ -0,0 +1,933 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for object-based saving which use tf.train.* optimizers."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import os
+
+import six
+
+from tensorflow.python.client import session as session_lib
+from tensorflow.python.distribute import mirrored_strategy
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
+from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.engine import training
+from tensorflow.python.keras.layers import core
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import template
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.training import adam
+from tensorflow.python.training import checkpoint_management
+from tensorflow.python.training import saver as saver_lib
+from tensorflow.python.training import training_util
+from tensorflow.python.training.tracking import graph_view
+from tensorflow.python.training.tracking import tracking
+from tensorflow.python.training.tracking import util as trackable_utils
+
+
+class NonLayerTrackable(tracking.AutoTrackable):
+
+  def __init__(self):
+    super(NonLayerTrackable, self).__init__()
+    self.a_variable = trackable_utils.add_variable(
+        self, name="a_variable", shape=[])
+
+
+# pylint: disable=not-callable
+class MyModel(training.Model):
+  """A concrete Model for testing."""
+
+  def __init__(self):
+    super(MyModel, self).__init__()
+    self._named_dense = core.Dense(1, use_bias=True)
+    self._second = core.Dense(1, use_bias=False)
+    # We can still track Trackables which aren't Layers.
+    self._non_layer = NonLayerTrackable()
+
+  def call(self, values):
+    ret = self._second(self._named_dense(values))
+    return ret
+
+
+class CheckpointingTests(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  def testNamingWithOptimizer(self):
+    input_value = constant_op.constant([[3.]])
+    model = MyModel()
+    # A nuisance Model using the same optimizer. Its slot variables should not
+    # go in the checkpoint, since it is never depended on.
+    other_model = MyModel()
+    optimizer = adam.AdamOptimizer(0.001)
+    optimizer_step = training_util.get_or_create_global_step()
+    root_trackable = trackable_utils.Checkpoint(
+        optimizer=optimizer, model=model, optimizer_step=optimizer_step)
+    if context.executing_eagerly():
+      optimizer.minimize(
+          lambda: model(input_value),
+          global_step=optimizer_step)
+      optimizer.minimize(
+          lambda: other_model(input_value),
+          global_step=optimizer_step)
+    else:
+      train_op = optimizer.minimize(
+          model(input_value), global_step=optimizer_step)
+      optimizer.minimize(
+          other_model(input_value),
+          global_step=optimizer_step)
+      self.evaluate(trackable_utils.gather_initializers(
+          root_trackable))
+      self.evaluate(train_op)
+    named_variables, serialized_graph, _ = graph_view.ObjectGraphView(
+        root_trackable).serialize_object_graph()
+    expected_checkpoint_names = (
+        # Created in the root node, so no prefix.
+        "optimizer_step",
+        "model/_second/kernel",
+        "model/_named_dense/kernel",
+        "model/_named_dense/bias",
+        # non-Layer dependency of the model
+        "model/_non_layer/a_variable",
+        # The optimizer creates two non-slot variables
+        "optimizer/beta1_power",
+        "optimizer/beta2_power",
+        # Slot variables
+        "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/m",
+        "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/v",
+        "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/m",
+        "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/v",
+        "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/m",
+        "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/v",
+    )
+    suffix = "/.ATTRIBUTES/VARIABLE_VALUE"
+    expected_checkpoint_names = [
+        name + suffix for name in expected_checkpoint_names]
+    # The Dense layers also save get_config() JSON
+    expected_checkpoint_names.extend(
+        ["model/_second/.ATTRIBUTES/OBJECT_CONFIG_JSON",
+         "model/_named_dense/.ATTRIBUTES/OBJECT_CONFIG_JSON"])
+    named_variables = {v.name: v for v in named_variables}
+    six.assertCountEqual(self, expected_checkpoint_names,
+                         named_variables.keys())
+    # Check that we've mapped to the right variable objects (not exhaustive)
+    self.assertEqual(
+        "global_step",
+        named_variables["optimizer_step" + suffix].full_name)
+    self.assertEqual(
+        "my_model/dense_1/kernel",
+        named_variables["model/_second/kernel" + suffix].full_name)
+    self.assertEqual(
+        "my_model/dense/kernel",
+        named_variables["model/_named_dense/kernel" + suffix].full_name)
+    self.assertEqual(
+        "beta1_power",
+        named_variables["optimizer/beta1_power" + suffix].full_name)
+    self.assertEqual(
+        "beta2_power",
+        named_variables["optimizer/beta2_power" + suffix].full_name)
+    # Spot check the generated protocol buffers.
+    self.assertEqual("optimizer",
+                     serialized_graph.nodes[0].children[1].local_name)
+    optimizer_node = serialized_graph.nodes[serialized_graph.nodes[0].children[
+        1].node_id]
+    self.assertEqual("beta1_power",
+                     optimizer_node.children[0].local_name)
+    self.assertEqual("beta1_power",
+                     serialized_graph.nodes[optimizer_node.children[0].node_id]
+                     .attributes[0].full_name)
+    self.assertEqual(
+        "my_model/dense/kernel",
+        serialized_graph.nodes[optimizer_node.slot_variables[0]
+                               .original_variable_node_id]
+        .attributes[0].full_name)
+    # We strip off the :0 suffix, as variable.name-based saving does.
+    self.assertEqual(
+        "my_model/dense/kernel/Adam",
+        serialized_graph.nodes[optimizer_node.slot_variables[0]
+                               .slot_variable_node_id]
+        .attributes[0].full_name)
+    self.assertEqual(
+        "my_model/dense/kernel/Adam:0",
+        optimizer.get_slot(
+            var=model._named_dense.kernel,
+            name="m").name)
+    self.assertEqual(
+        "model/_named_dense/kernel" + suffix,
+        serialized_graph.nodes[
+            optimizer_node.slot_variables[0]
+            .original_variable_node_id].attributes[0].checkpoint_key)
+    self.assertEqual("m", optimizer_node.slot_variables[0].slot_name)
+    self.assertEqual(
+        "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/m" + suffix,
+        serialized_graph.nodes[
+            optimizer_node.slot_variables[0]
+            .slot_variable_node_id].attributes[0].checkpoint_key)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testSaveRestore(self):
+    model = MyModel()
+    optimizer = adam.AdamOptimizer(0.001)
+    root_trackable = trackable_utils.Checkpoint(
+        optimizer=optimizer, model=model)
+    input_value = constant_op.constant([[3.]])
+    if context.executing_eagerly():
+      optimizer.minimize(
+          lambda: model(input_value))
+    else:
+      train_op = optimizer.minimize(model(input_value))
+      # TODO(allenl): Make initialization more pleasant when graph building.
+      root_trackable.save_counter  # pylint: disable=pointless-statement
+      self.evaluate(trackable_utils.gather_initializers(
+          root_trackable))
+      self.evaluate(train_op)
+    prefix = os.path.join(self.get_temp_dir(), "ckpt")
+    self.evaluate(state_ops.assign(model._named_dense.variables[1], [42.]))
+    m_bias_slot = optimizer.get_slot(model._named_dense.variables[1], "m")
+    self.evaluate(state_ops.assign(m_bias_slot, [1.5]))
+    save_path = root_trackable.save(file_prefix=prefix)
+    self.evaluate(state_ops.assign(model._named_dense.variables[1], [43.]))
+    self.evaluate(state_ops.assign(root_trackable.save_counter, 3))
+    optimizer_variables = self.evaluate(optimizer.variables())
+    self.evaluate(state_ops.assign(m_bias_slot, [-2.]))
+    # Immediate restoration
+    status = root_trackable.restore(save_path=save_path).assert_consumed()
+    status.run_restore_ops()
+    self.assertAllEqual([42.], self.evaluate(model._named_dense.variables[1]))
+    self.assertAllEqual(1, self.evaluate(root_trackable.save_counter))
+    self.assertAllEqual([1.5], self.evaluate(m_bias_slot))
+    if not context.executing_eagerly():
+      return  # Restore-on-create is only supported when executing eagerly
+    on_create_model = MyModel()
+    on_create_optimizer = adam.AdamOptimizer(
+        0.001,
+        # Preserve beta1_power and beta2_power when appying gradients so we can
+        # test that they've been restored correctly.
+        beta1=1.0, beta2=1.0)
+    on_create_root = trackable_utils.Checkpoint(
+        optimizer=on_create_optimizer, model=on_create_model)
+    # Deferred restoration
+    status = on_create_root.restore(save_path=save_path)
+    status.assert_nontrivial_match()
+    status.assert_existing_objects_matched()
+    with self.assertRaises(AssertionError):
+      status.assert_consumed()
+    on_create_model(constant_op.constant([[3.]]))  # create variables
+    self.assertAllEqual(1, self.evaluate(on_create_root.save_counter))
+    self.assertAllEqual([42.],
+                        self.evaluate(
+                            on_create_model._named_dense.variables[1]))
+    on_create_m_bias_slot = on_create_optimizer.get_slot(
+        on_create_model._named_dense.variables[1], "m")
+    status.assert_existing_objects_matched()
+    with self.assertRaises(AssertionError):
+      status.assert_consumed()
+    # Optimizer slot variables are created when the original variable is
+    # restored.
+    self.assertAllEqual([1.5], self.evaluate(on_create_m_bias_slot))
+    self.assertAllEqual(optimizer_variables[2:],
+                        self.evaluate(on_create_optimizer.variables()))
+    dummy_var = resource_variable_ops.ResourceVariable([1.])
+    on_create_optimizer.minimize(loss=dummy_var.read_value)
+    status.assert_existing_objects_matched()
+    status.assert_consumed()
+    beta1_power, beta2_power = on_create_optimizer._get_beta_accumulators()
+    self.assertAllEqual(optimizer_variables[0], self.evaluate(beta1_power))
+    self.assertAllEqual(optimizer_variables[1], self.evaluate(beta2_power))
+
+  # TODO(allenl): Debug garbage created by this test in python3.
+  def testDeferredRestorationUsageEager(self):
+    """An idiomatic eager execution example."""
+    num_training_steps = 10
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    for training_continuation in range(3):
+      model = MyModel()
+      optimizer = adam.AdamOptimizer(0.001)
+      root = trackable_utils.Checkpoint(
+          optimizer=optimizer, model=model,
+          optimizer_step=training_util.get_or_create_global_step())
+      root.restore(checkpoint_management.latest_checkpoint(
+          checkpoint_directory))
+      for _ in range(num_training_steps):
+        # TODO(allenl): Use a Dataset and serialize/checkpoint it.
+        input_value = constant_op.constant([[3.]])
+        optimizer.minimize(
+            lambda: model(input_value),  # pylint: disable=cell-var-from-loop
+            global_step=root.optimizer_step)
+      root.save(file_prefix=checkpoint_prefix)
+      self.assertEqual((training_continuation + 1) * num_training_steps,
+                       root.optimizer_step.numpy())
+
+  def testEagerDistributionStrategy(self):
+    num_training_steps = 10
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+
+    def _train_fn(optimizer, model):
+      input_value = constant_op.constant([[3.]])
+      optimizer.minimize(
+          functools.partial(model, input_value),
+          global_step=root.optimizer_step)
+
+    for training_continuation in range(3):
+      strategy = mirrored_strategy.MirroredStrategy()
+      with strategy.scope():
+        model = MyModel()
+        optimizer = adam.AdamOptimizer(0.001)
+        root = trackable_utils.Checkpoint(
+            optimizer=optimizer, model=model,
+            optimizer_step=training_util.get_or_create_global_step())
+        root.restore(checkpoint_management.latest_checkpoint(
+            checkpoint_directory))
+
+        for _ in range(num_training_steps):
+          strategy.extended.call_for_each_replica(
+              functools.partial(_train_fn, optimizer, model))
+        root.save(file_prefix=checkpoint_prefix)
+        self.assertEqual((training_continuation + 1) * num_training_steps,
+                         root.optimizer_step.numpy())
+
+  def testGraphDistributionStrategy(self):
+    self.skipTest("b/121381184")
+    num_training_steps = 10
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+
+    def _train_fn(optimizer, model):
+      input_value = constant_op.constant([[3.]])
+      return optimizer.minimize(
+          functools.partial(model, input_value),
+          global_step=root.optimizer_step)
+
+    for training_continuation in range(3):
+      with ops.Graph().as_default():
+        strategy = mirrored_strategy.MirroredStrategy()
+        with strategy.scope():
+          model = MyModel()
+          optimizer = adam.AdamOptimizer(0.001)
+          root = trackable_utils.Checkpoint(
+              optimizer=optimizer, model=model,
+              optimizer_step=training_util.get_or_create_global_step())
+          status = root.restore(checkpoint_management.latest_checkpoint(
+              checkpoint_directory))
+          train_op = strategy.extended.call_for_each_replica(
+              functools.partial(_train_fn, optimizer, model))
+          with self.session() as session:
+            if training_continuation > 0:
+              status.assert_consumed()
+            status.initialize_or_restore()
+            for _ in range(num_training_steps):
+              session.run(train_op)
+            root.save(file_prefix=checkpoint_prefix)
+        self.assertEqual((training_continuation + 1) * num_training_steps,
+                         root.optimizer_step.numpy())
+
+  def testUsageGraph(self):
+    """Expected usage when graph building."""
+    with context.graph_mode():
+      num_training_steps = 10
+      checkpoint_directory = self.get_temp_dir()
+      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+      for training_continuation in range(3):
+        with ops.Graph().as_default():
+          model = MyModel()
+          optimizer = adam.AdamOptimizer(0.001)
+          root = trackable_utils.Checkpoint(
+              optimizer=optimizer, model=model,
+              global_step=training_util.get_or_create_global_step())
+          input_value = constant_op.constant([[3.]])
+          train_op = optimizer.minimize(
+              model(input_value),
+              global_step=root.global_step)
+          checkpoint_path = checkpoint_management.latest_checkpoint(
+              checkpoint_directory)
+          with self.session(graph=ops.get_default_graph()) as session:
+            status = root.restore(save_path=checkpoint_path)
+            status.initialize_or_restore(session=session)
+            if checkpoint_path is None:
+              self.assertEqual(0, training_continuation)
+              with self.assertRaises(AssertionError):
+                status.assert_consumed()
+              with self.assertRaises(AssertionError):
+                status.assert_existing_objects_matched()
+            else:
+              status.assert_consumed()
+              status.assert_existing_objects_matched()
+            for _ in range(num_training_steps):
+              session.run(train_op)
+            root.save(file_prefix=checkpoint_prefix, session=session)
+            self.assertEqual((training_continuation + 1) * num_training_steps,
+                             session.run(root.global_step))
+            self.assertEqual(training_continuation + 1,
+                             session.run(root.save_counter))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testAgnosticUsage(self):
+    """Graph/eager agnostic usage."""
+    # Does create garbage when executing eagerly due to ops.Graph() creation.
+    num_training_steps = 10
+    checkpoint_directory = self.get_temp_dir()
+    for training_continuation in range(3):
+      with test_util.device(use_gpu=True):
+        model = MyModel()
+        optimizer = adam.AdamOptimizer(0.001)
+        root = trackable_utils.Checkpoint(
+            optimizer=optimizer, model=model,
+            global_step=training_util.get_or_create_global_step())
+        manager = checkpoint_management.CheckpointManager(
+            root, checkpoint_directory, max_to_keep=1)
+        status = root.restore(save_path=manager.latest_checkpoint)
+        input_value = constant_op.constant([[3.]])
+        train_fn = functools.partial(
+            optimizer.minimize,
+            functools.partial(model, input_value),
+            global_step=root.global_step)
+        if not context.executing_eagerly():
+          train_fn = functools.partial(self.evaluate, train_fn())
+        status.initialize_or_restore()
+        for _ in range(num_training_steps):
+          train_fn()
+        manager.save()
+        self.assertEqual((training_continuation + 1) * num_training_steps,
+                         self.evaluate(root.global_step))
+        self.assertEqual(training_continuation + 1,
+                         self.evaluate(root.save_counter))
+
+  # pylint: disable=cell-var-from-loop
+  @test_util.run_in_graph_and_eager_modes
+  def testWithDefun(self):
+    num_training_steps = 2
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    for training_continuation in range(3):
+      with test_util.device(use_gpu=True):
+        model = MyModel()
+        # Don't actually train so we can test variable values
+        optimizer = adam.AdamOptimizer(0.)
+        root = trackable_utils.Checkpoint(
+            optimizer=optimizer, model=model,
+            global_step=training_util.get_or_create_global_step())
+        checkpoint_path = checkpoint_management.latest_checkpoint(
+            checkpoint_directory)
+        status = root.restore(save_path=checkpoint_path)
+        def train_fn():
+          @def_function.function
+          def _call_model(x):
+            return model(x)
+          with backprop.GradientTape() as tape:
+            loss = _call_model(constant_op.constant([[3.]]))
+          gradients = tape.gradient(loss, model.variables)
+          return optimizer.apply_gradients(zip(gradients, model.variables),
+                                           global_step=root.global_step)
+        if not context.executing_eagerly():
+          train_fn = functools.partial(
+              self.evaluate, train_fn())
+        status.initialize_or_restore()
+        for _ in range(num_training_steps):
+          train_fn()
+        if training_continuation > 0:
+          status.assert_consumed()
+          self.assertAllClose([[42.]], self.evaluate(model.variables[0]))
+        else:
+          self.evaluate(model.variables[0].assign([[42.]]))
+        root.save(file_prefix=checkpoint_prefix)
+        self.assertEqual((training_continuation + 1) * num_training_steps,
+                         self.evaluate(root.global_step))
+        self.assertEqual(training_continuation + 1,
+                         self.evaluate(root.save_counter))
+  # pylint: enable=cell-var-from-loop
+
+  def _get_checkpoint_name(self, name):
+    root = tracking.AutoTrackable()
+    trackable_utils.add_variable(
+        root, name=name, shape=[1, 2], dtype=dtypes.float64)
+    (named_variable,), _, _ = trackable_utils._serialize_object_graph(
+        root, saveables_cache=None)
+    with ops.name_scope("root/" + named_variable.name):
+      pass  # Make sure we can use this as an op name if we prefix it.
+    return named_variable.name
+
+  def testAnonymousVarsInInit(self):
+
+    class Model(training.Model):
+
+      def __init__(self):
+        super(Model, self).__init__()
+        self.w = resource_variable_ops.ResourceVariable(0.0)
+        self.b = resource_variable_ops.ResourceVariable(0.0)
+        self.vars = [self.w, self.b]
+
+      def call(self, x):
+        return x * self.w + self.b
+
+    with context.eager_mode():
+      model = Model()
+      optimizer = adam.AdamOptimizer(learning_rate=0.05)
+      checkpoint_directory = self.get_temp_dir()
+      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+      checkpoint = trackable_utils.Checkpoint(
+          model=model, optimizer=optimizer)
+      for _ in range(2):
+        checkpoint.save(checkpoint_prefix)
+        with backprop.GradientTape() as tape:
+          loss = (constant_op.constant(1.)
+                  - model(constant_op.constant(1.))) ** 2
+        grad = tape.gradient(loss, model.vars)
+        optimizer.apply_gradients(
+            [(g, v) for g, v in zip(grad, model.vars)])
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDeferredSlotRestoration(self):
+    checkpoint_directory = self.get_temp_dir()
+
+    root = trackable_utils.Checkpoint()
+    root.var = trackable_utils.add_variable(
+        root, name="var", initializer=0.)
+    optimizer = adam.AdamOptimizer(0.1)
+    if context.executing_eagerly():
+      optimizer.minimize(root.var.read_value)
+    else:
+      train_op = optimizer.minimize(root.var)
+      # Note that `optimizer` has not been added as a dependency of
+      # `root`. Create a one-off grouping so that slot variables for `root.var`
+      # get initialized too.
+      self.evaluate(trackable_utils.gather_initializers(
+          trackable_utils.Checkpoint(root=root, optimizer=optimizer)))
+      self.evaluate(train_op)
+    self.evaluate(state_ops.assign(root.var, 12.))
+    no_slots_path = root.save(os.path.join(checkpoint_directory, "no_slots"))
+    root.optimizer = optimizer
+    self.evaluate(state_ops.assign(root.var, 13.))
+    self.evaluate(state_ops.assign(optimizer.get_slot(name="m", var=root.var),
+                                   14.))
+    slots_path = root.save(os.path.join(checkpoint_directory, "with_slots"))
+    new_root = trackable_utils.Checkpoint()
+    # Load the slot-containing checkpoint (deferred), then immediately overwrite
+    # the non-slot variable (also deferred).
+    slot_status = new_root.restore(slots_path)
+    no_slot_status = new_root.restore(no_slots_path)
+    with self.assertRaises(AssertionError):
+      no_slot_status.assert_consumed()
+    new_root.var = trackable_utils.add_variable(
+        new_root, name="var", shape=[])
+    no_slot_status.assert_consumed()
+    no_slot_status.run_restore_ops()
+    self.assertEqual(12., self.evaluate(new_root.var))
+    new_root.optimizer = adam.AdamOptimizer(0.1)
+    slot_status.assert_existing_objects_matched()
+    with self.assertRaisesRegexp(AssertionError, "beta1_power"):
+      slot_status.assert_consumed()
+    self.assertEqual(12., self.evaluate(new_root.var))
+    if context.executing_eagerly():
+      # Slot variables are only created with restoring initializers when
+      # executing eagerly.
+      self.assertEqual(14., self.evaluate(
+          new_root.optimizer.get_slot(name="m", var=new_root.var)))
+    else:
+      self.assertIs(new_root.optimizer.get_slot(name="m", var=new_root.var),
+                    None)
+    if context.executing_eagerly():
+      new_root.optimizer.minimize(new_root.var.read_value)
+    else:
+      train_op = new_root.optimizer.minimize(new_root.var)
+      # The slot variable now exists; restore() didn't create it, but we should
+      # now have a restore op for it.
+      slot_status.run_restore_ops()
+      self.assertEqual(14., self.evaluate(
+          new_root.optimizer.get_slot(name="m", var=new_root.var)))
+      self.evaluate(train_op)
+    slot_status.assert_consumed()
+
+  def testManySavesGraph(self):
+    """Saves after the first should not modify the graph."""
+    with context.graph_mode():
+      graph = ops.Graph()
+      with graph.as_default(), self.session(graph):
+        checkpoint_directory = self.get_temp_dir()
+        checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+        obj = trackable_utils.Checkpoint()
+        obj.var = variable_scope.get_variable(name="v", initializer=0.)
+        obj.opt = adam.AdamOptimizer(0.1)
+        obj.opt.minimize(obj.var.read_value())
+        self.evaluate(trackable_utils.gather_initializers(obj))
+        obj.save(checkpoint_prefix)
+        before_ops = graph.get_operations()
+        obj.save(checkpoint_prefix)
+        self.assertEqual(before_ops, graph.get_operations())
+
+  def testManyRestoresGraph(self):
+    """Restores after the first should not modify the graph."""
+    with context.graph_mode():
+      graph = ops.Graph()
+      with graph.as_default(), self.session(graph):
+        checkpoint_directory = self.get_temp_dir()
+        checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+        obj = trackable_utils.Checkpoint()
+        obj.var = variable_scope.get_variable(name="v", initializer=0.)
+        obj.opt = adam.AdamOptimizer(0.1)
+        obj.opt.minimize(obj.var.read_value())
+        self.evaluate(trackable_utils.gather_initializers(obj))
+        save_path = obj.save(checkpoint_prefix)
+        obj.restore(save_path)
+        before_ops = graph.get_operations()
+        obj.restore(save_path)
+        self.assertEqual(before_ops, graph.get_operations())
+
+  def testMultipleGraphsNonSlotVariables(self):
+    with context.graph_mode():
+      checkpoint_directory = self.get_temp_dir()
+      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+      optimizer = adam.AdamOptimizer(0.001)
+      # Construct a model in one graph
+      first_graph = ops.Graph()
+      first_session = session_lib.Session(graph=first_graph)
+      with first_graph.as_default(), first_session.as_default():
+        first_variable = resource_variable_ops.ResourceVariable([1.])
+        first_root_trackable = trackable_utils.Checkpoint(
+            optimizer=optimizer, variable=first_variable)
+        train_op = optimizer.minimize(first_variable.read_value)
+        self.evaluate(trackable_utils.gather_initializers(
+            first_root_trackable))
+        self.evaluate(train_op)
+        self.evaluate(first_variable.assign([1.]))
+        self.evaluate(optimizer.get_slot(
+            var=first_variable, name="m").assign([2.]))
+        beta1_power, _ = optimizer._get_beta_accumulators()
+        self.evaluate(beta1_power.assign(3.))
+
+      # Save and load in a second graph
+      second_graph = ops.Graph()
+      with second_graph.as_default(), session_lib.Session(graph=second_graph):
+        second_variable = resource_variable_ops.ResourceVariable([1.])
+        second_root_trackable = trackable_utils.Checkpoint(
+            optimizer=optimizer, variable=second_variable)
+        train_op = optimizer.minimize(second_variable.read_value)
+        second_root_trackable.restore(None).initialize_or_restore()
+        self.evaluate(train_op)
+        self.evaluate(second_variable.assign([4.]))
+        self.evaluate(optimizer.get_slot(
+            var=second_variable, name="m").assign([5.]))
+        beta1_power, _ = optimizer._get_beta_accumulators()
+        self.evaluate(beta1_power.assign(6.))
+        save_path = second_root_trackable.save(checkpoint_prefix)
+        self.evaluate(second_variable.assign([7.]))
+        self.evaluate(optimizer.get_slot(
+            var=second_variable, name="m").assign([8.]))
+        beta1_power, _ = optimizer._get_beta_accumulators()
+        self.assertAllEqual(6., self.evaluate(beta1_power))
+        status = second_root_trackable.restore(save_path)
+        status.assert_consumed().run_restore_ops()
+        self.assertAllEqual([4.], self.evaluate(second_variable))
+        self.assertAllEqual([5.], self.evaluate(optimizer.get_slot(
+            var=second_variable, name="m")))
+        beta1_power, _ = optimizer._get_beta_accumulators()
+        self.assertAllEqual(6., self.evaluate(beta1_power))
+
+      # Check that the first graph is unmolested
+      with first_graph.as_default(), first_session.as_default():
+        self.assertAllEqual([1.], self.evaluate(first_variable))
+        self.assertAllEqual([2.], self.evaluate(optimizer.get_slot(
+            var=first_variable, name="m")))
+        beta1_power, _ = optimizer._get_beta_accumulators()
+        self.assertAllEqual(3., self.evaluate(beta1_power))
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_initialize_if_not_restoring(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    optimizer_only_prefix = os.path.join(checkpoint_directory, "opt")
+    with test_util.device(use_gpu=True):
+      model = MyModel()
+      optimizer = adam.AdamOptimizer(0.001)
+      root = trackable_utils.Checkpoint(
+          model=model,  # Do not save the optimizer with the checkpoint.
+          global_step=training_util.get_or_create_global_step())
+      optimizer_checkpoint = trackable_utils.Checkpoint(
+          optimizer=optimizer)
+
+      checkpoint_path = checkpoint_management.latest_checkpoint(
+          checkpoint_directory)
+      status = root.restore(save_path=checkpoint_path)
+      input_value = constant_op.constant([[3.]])
+      train_fn = functools.partial(
+          optimizer.minimize,
+          functools.partial(model, input_value),
+          global_step=root.global_step)
+      if not context.executing_eagerly():
+        train_fn = functools.partial(self.evaluate, train_fn())
+      status.initialize_or_restore()
+      self.evaluate([v.initializer for v in optimizer.variables()])
+      train_fn()
+      model_save_path = root.save(file_prefix=checkpoint_prefix)
+      self.evaluate(optimizer.variables()[0].assign(42.))
+      optimizer_save_path = optimizer_checkpoint.save(optimizer_only_prefix)
+
+    # Restore into a graph with the optimizer
+    with test_util.device(use_gpu=True):
+      model = MyModel()
+      optimizer = adam.AdamOptimizer(0.001)
+      root = trackable_utils.Checkpoint(
+          optimizer=optimizer, model=model,
+          global_step=training_util.get_or_create_global_step())
+      status = root.restore(save_path=model_save_path)
+      input_value = constant_op.constant([[3.]])
+      train_fn = functools.partial(
+          optimizer.minimize,
+          functools.partial(model, input_value),
+          global_step=root.global_step)
+      if not context.executing_eagerly():
+        train_fn = functools.partial(self.evaluate, train_fn())
+      status.initialize_or_restore()
+      train_fn()
+      with self.assertRaises(AssertionError):
+        status.assert_existing_objects_matched()
+      with self.assertRaises(AssertionError):
+        status.assert_consumed()
+
+    # Make sure initialization doesn't clobber later restores
+    with test_util.device(use_gpu=True):
+      model = MyModel()
+      optimizer = adam.AdamOptimizer(0.001, beta1=1.0)
+      root = trackable_utils.Checkpoint(
+          optimizer=optimizer, model=model,
+          global_step=training_util.get_or_create_global_step())
+      opt_root = trackable_utils.Checkpoint(
+          optimizer=optimizer)
+      status = root.restore(save_path=model_save_path)
+      init_only_optimizer_status = opt_root.restore(save_path=None)
+      optimizer_status = opt_root.restore(save_path=optimizer_save_path)
+      input_value = constant_op.constant([[3.]])
+      train_fn = functools.partial(
+          optimizer.minimize,
+          functools.partial(model, input_value),
+          global_step=root.global_step)
+      if not context.executing_eagerly():
+        train_fn = functools.partial(self.evaluate, train_fn())
+      optimizer_status.run_restore_ops()
+      status.initialize_or_restore()
+      init_only_optimizer_status.initialize_or_restore()
+      train_fn()
+      self.assertEqual(42., self.evaluate(optimizer.variables()[0]))
+
+
+class _ManualScope(tracking.AutoTrackable):
+
+  def __call__(self):
+    with variable_scope.variable_scope("ManualScope") as vs:
+      self.variable_scope = vs
+      with trackable_utils.capture_dependencies(template=self):
+        return self._build()
+
+  def _build(self):
+    return variable_scope.get_variable(name="in_manual_scope", shape=[])
+
+
+class TemplateTests(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_trackable_save_restore(self):
+
+    def _templated():
+      v = variable_scope.get_variable(
+          "v", shape=[1], initializer=init_ops.zeros_initializer(),
+          use_resource=True)
+      v2 = variable_scope.get_variable(
+          "v2", shape=[1], initializer=init_ops.zeros_initializer(),
+          use_resource=True)
+      manual = _ManualScope()
+      return v, v + 1., v2, manual, manual()
+
+    save_template = template.make_template("s1", _templated)
+    v1_save, _, v2_save, manual_scope, manual_scope_v = save_template()
+    six.assertCountEqual(
+        self,
+        [v1_save, v2_save, manual_scope, manual_scope_v, save_template],
+        trackable_utils.list_objects(save_template))
+    manual_dep, = manual_scope._checkpoint_dependencies
+    self.assertEqual("in_manual_scope", manual_dep.name)
+    self.assertIs(manual_scope_v, manual_dep.ref)
+    optimizer = adam.AdamOptimizer(0.0)
+    save_root = trackable_utils.Checkpoint(
+        my_template=save_template, optimizer=optimizer)
+    optimizer.minimize(v1_save.read_value)
+    self.evaluate([v.initializer for v in save_template.variables])
+    self.evaluate([v.initializer for v in optimizer.variables()])
+    self.evaluate(v1_save.assign([12.]))
+    self.evaluate(v2_save.assign([14.]))
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    save_path = save_root.save(checkpoint_prefix)
+
+    load_template = template.make_template("s2", _templated)
+    load_optimizer = adam.AdamOptimizer(0.0)
+    load_root = trackable_utils.Checkpoint(
+        my_template=load_template, optimizer=load_optimizer)
+    status = load_root.restore(save_path)
+    var, var_plus_one, var2, _, _ = load_template()
+    load_optimizer.minimize(var.read_value)
+    self.assertEqual(3, len(load_template._checkpoint_dependencies))
+    self.assertEqual("v", load_template._checkpoint_dependencies[0].name)
+    self.assertEqual("v2", load_template._checkpoint_dependencies[1].name)
+    self.assertEqual("ManualScope",
+                     load_template._checkpoint_dependencies[2].name)
+    status.assert_consumed().run_restore_ops()
+    self.assertAllEqual([12.], self.evaluate(var))
+    self.assertAllEqual([13.], self.evaluate(var_plus_one))
+    self.assertAllEqual([14.], self.evaluate(var2))
+
+
+class CheckpointCompatibilityTests(test.TestCase):
+
+  def _initialized_model(self):
+    input_value = constant_op.constant([[3.]])
+    model = MyModel()
+    optimizer = adam.AdamOptimizer(0.001)
+    optimizer_step = training_util.get_or_create_global_step()
+    root_trackable = trackable_utils.Checkpoint(
+        optimizer=optimizer, model=model, optimizer_step=optimizer_step)
+    train_op = optimizer.minimize(
+        functools.partial(model, input_value),
+        global_step=optimizer_step)
+    self.evaluate(trackable_utils.gather_initializers(
+        root_trackable))
+    self.evaluate(train_op)
+    # A regular variable, a slot variable, and a non-slot Optimizer variable
+    # with known values to check when loading.
+    self.evaluate(model._named_dense.bias.assign([1.]))
+    self.evaluate(optimizer.get_slot(
+        var=model._named_dense.bias, name="m").assign([2.]))
+    beta1_power, _ = optimizer._get_beta_accumulators()
+    self.evaluate(beta1_power.assign(3.))
+    return root_trackable
+
+  def _set_sentinels(self, root_trackable):
+    self.evaluate(root_trackable.model._named_dense.bias.assign([101.]))
+    self.evaluate(
+        root_trackable.optimizer.get_slot(
+            var=root_trackable.model._named_dense.bias, name="m")
+        .assign([102.]))
+    beta1_power, _ = root_trackable.optimizer._get_beta_accumulators()
+    self.evaluate(beta1_power.assign(103.))
+
+  def _check_sentinels(self, root_trackable):
+    self.assertAllEqual(
+        [1.], self.evaluate(root_trackable.model._named_dense.bias))
+    self.assertAllEqual([2.], self.evaluate(
+        root_trackable.optimizer.get_slot(
+            var=root_trackable.model._named_dense.bias, name="m")))
+    beta1_power, _ = root_trackable.optimizer._get_beta_accumulators()
+    self.assertAllEqual(3., self.evaluate(beta1_power))
+
+  def _write_name_based_checkpoint(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    with context.graph_mode():
+      save_graph = ops.Graph()
+      with save_graph.as_default(), self.session(
+          graph=save_graph) as session:
+        root = self._initialized_model()
+        name_saver = saver_lib.Saver()
+        return name_saver.save(
+            sess=session, save_path=checkpoint_prefix,
+            global_step=root.optimizer_step)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testLoadFromNameBasedSaver(self):
+    """Save a name-based checkpoint, load it using the object-based API."""
+    with test_util.device(use_gpu=True):
+      save_path = self._write_name_based_checkpoint()
+      root = self._initialized_model()
+      self._set_sentinels(root)
+      with self.assertRaises(AssertionError):
+        self._check_sentinels(root)
+      object_saver = trackable_utils.TrackableSaver(
+          graph_view.ObjectGraphView(root))
+      self._set_sentinels(root)
+      status = object_saver.restore(save_path)
+      if context.executing_eagerly():
+        self._check_sentinels(root)
+      if context.executing_eagerly():
+        with self.assertRaisesRegexp(AssertionError, "OBJECT_CONFIG_JSON"):
+          status.assert_consumed()
+        with self.assertRaisesRegexp(AssertionError, "OBJECT_CONFIG_JSON"):
+          status.assert_existing_objects_matched()
+        with self.assertRaisesRegexp(AssertionError, "OBJECT_CONFIG_JSON"):
+          status.assert_nontrivial_match()
+      else:
+        # When graph building, we haven't read any keys, so we don't know
+        # whether the restore will be complete.
+        with self.assertRaisesRegexp(AssertionError, "not restored"):
+          status.assert_consumed()
+        with self.assertRaisesRegexp(AssertionError, "not restored"):
+          status.assert_existing_objects_matched()
+        with self.assertRaisesRegexp(AssertionError, "not restored"):
+          status.assert_nontrivial_match()
+      status.run_restore_ops()
+      self._check_sentinels(root)
+      self._set_sentinels(root)
+      status = object_saver.restore(save_path)
+      status.initialize_or_restore()
+      self._check_sentinels(root)
+      # Check that there is no error when keys are missing from the name-based
+      # checkpoint.
+      root.not_in_name_checkpoint = resource_variable_ops.ResourceVariable([1.])
+      status = object_saver.restore(save_path)
+      with self.assertRaises(AssertionError):
+        status.assert_existing_objects_matched()
+
+  def testSaveGraphLoadEager(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    with context.graph_mode():
+      save_graph = ops.Graph()
+      with save_graph.as_default(), self.session(
+          graph=save_graph) as session:
+        root = self._initialized_model()
+        save_path = root.save(session=session, file_prefix=checkpoint_prefix)
+    with context.eager_mode():
+      root = self._initialized_model()
+      self._set_sentinels(root)
+      root.restore(save_path).assert_consumed()
+      self._check_sentinels(root)
+
+  def testSaveEagerLoadGraph(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    with context.eager_mode():
+      root = self._initialized_model()
+      save_path = root.save(file_prefix=checkpoint_prefix)
+    with context.graph_mode():
+      save_graph = ops.Graph()
+      with save_graph.as_default(), self.session(
+          graph=save_graph):
+        root = self._initialized_model()
+        self._set_sentinels(root)
+        root.restore(save_path).assert_consumed().run_restore_ops()
+        self._check_sentinels(root)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/training/tracking/util_xla_test.py b/tensorflow/python/training/tracking/util_xla_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e8dd0a6fd3231b335758a7e0fb05c7db37ac95c
--- /dev/null
+++ b/tensorflow/python/training/tracking/util_xla_test.py
@@ -0,0 +1,84 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.compiler.tests import xla_test
+from tensorflow.python.eager import backprop
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.keras.engine import training
+from tensorflow.python.keras.layers import core
+from tensorflow.python.keras.optimizer_v2 import adam
+from tensorflow.python.platform import test
+from tensorflow.python.training import checkpoint_management
+from tensorflow.python.training.tracking import tracking
+from tensorflow.python.training.tracking import util as trackable_utils
+
+
+class NonLayerTrackable(tracking.AutoTrackable):
+
+  def __init__(self):
+    super(NonLayerTrackable, self).__init__()
+    self.a_variable = trackable_utils.add_variable(
+        self, name="a_variable", shape=[])
+
+
+class Subclassed(training.Model):
+  """A concrete Model for testing."""
+
+  def __init__(self):
+    super(Subclassed, self).__init__()
+    self._named_dense = core.Dense(1, use_bias=True)
+    self._second = core.Dense(1, use_bias=False)
+    # We can still track Trackables which aren't Layers.
+    self._non_layer = NonLayerTrackable()
+
+  def call(self, values):
+    ret = self._second(self._named_dense(values))
+    return ret
+
+
+class CheckpointingTests(xla_test.XLATestCase):
+
+  def testDeferredRestorationUsageEager(self):
+    """An idiomatic eager execution example."""
+    num_training_steps = 10
+    checkpoint_directory = self.get_temp_dir()
+    for training_continuation in range(3):
+      with self.test_scope():
+        model = Subclassed()
+        optimizer = adam.Adam(0.001)
+        root = trackable_utils.Checkpoint(
+            optimizer=optimizer, model=model)
+        manager = checkpoint_management.CheckpointManager(
+            root, checkpoint_directory, max_to_keep=2)
+        root.restore(manager.latest_checkpoint)
+        for _ in range(num_training_steps):
+          input_value = constant_op.constant([[3.]])
+          with backprop.GradientTape() as tape:
+            loss = model(input_value)
+          variables = model.trainable_variables
+          gradients = tape.gradient(loss, variables)
+          optimizer.apply_gradients(zip(gradients, variables))
+        manager.save()
+        self.assertEqual((training_continuation + 1) * num_training_steps,
+                         root.optimizer.iterations.numpy())
+
+
+if __name__ == "__main__":
+  ops.enable_eager_execution()
+  test.main()
diff --git a/tensorflow/python/training/training.py b/tensorflow/python/training/training.py
index ae71a628c1f9e1e7e86a25cbcacab0bd400ed279..5a1527888977ec0e2d88dfe313ee4767bf59cba4 100644
--- a/tensorflow/python/training/training.py
+++ b/tensorflow/python/training/training.py
@@ -68,7 +68,7 @@ from tensorflow.python.training.basic_session_run_hooks import FinalOpsHook
 from tensorflow.python.training.basic_session_run_hooks import FeedFnHook
 from tensorflow.python.training.basic_session_run_hooks import ProfilerHook
 from tensorflow.python.training.basic_loops import basic_train_loop
-from tensorflow.python.training.checkpointable.util import Checkpoint
+from tensorflow.python.training.tracking.util import Checkpoint
 from tensorflow.python.training.checkpoint_utils import init_from_checkpoint
 from tensorflow.python.training.checkpoint_utils import list_variables
 from tensorflow.python.training.checkpoint_utils import load_checkpoint
diff --git a/tensorflow/python/training/training_ops_test.py b/tensorflow/python/training/training_ops_test.py
index ba0f40999b48ffb8411c2cd0e7f4608f84ff292b..8ba6abdcf956bdebc00145a53ca34322847c180f 100644
--- a/tensorflow/python/training/training_ops_test.py
+++ b/tensorflow/python/training/training_ops_test.py
@@ -53,7 +53,7 @@ class TrainingOpsTest(TensorFlowTestCase):
     self.setUp()
     with self.session(use_gpu=use_gpu):
       var = variables.VariableV1(x)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllCloseAccordingToType(x, self.evaluate(var))
       apply_sgd = training_ops.apply_gradient_descent(var, alpha, delta)
       out = self.evaluate(apply_sgd)
@@ -74,7 +74,7 @@ class TrainingOpsTest(TensorFlowTestCase):
     with self.session(use_gpu=use_gpu):
       var = variables.VariableV1(x)
       accum = variables.VariableV1(y)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       self.assertAllCloseAccordingToType(x, self.evaluate(var))
       apply_adagrad = training_ops.apply_adagrad(var, accum, lr, grad)
@@ -99,7 +99,7 @@ class TrainingOpsTest(TensorFlowTestCase):
       var = variables.VariableV1(x)
       accum = variables.VariableV1(y)
       linear = variables.VariableV1(z)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       self.assertAllCloseAccordingToType(x, self.evaluate(var))
       apply_ftrl = training_ops.apply_ftrl(var, accum, linear, grad, lr, l1, l2,
@@ -156,7 +156,7 @@ class TrainingOpsTest(TensorFlowTestCase):
     with self.session(use_gpu=False):
       var = variables.VariableV1(x)
       accum = variables.VariableV1(y)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       self.assertAllCloseAccordingToType(x, self.evaluate(var))
       sparse_apply_adagrad = training_ops.sparse_apply_adagrad(
@@ -187,7 +187,7 @@ class TrainingOpsTest(TensorFlowTestCase):
       var = variables.VariableV1(x)
       accum = variables.VariableV1(y)
       linear = variables.VariableV1(z)
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       self.assertAllCloseAccordingToType(x, self.evaluate(var))
       sparse_apply_ftrl = training_ops.sparse_apply_ftrl(
@@ -285,7 +285,7 @@ class TrainingOpsTest(TensorFlowTestCase):
       beta2_power_t = variables.VariableV1(beta2_power)
       lr_t = constant_op.constant(lr, self._toType(var.dtype), [])
       epsilon_t = constant_op.constant(epsilon, self._toType(var.dtype), [])
-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
 
       self.assertAllCloseAccordingToType(var, self.evaluate(var_t))
       new_var, _, _ = self._adamUpdateNumpy(var, grad, t, m, v, lr, beta1,
diff --git a/tensorflow/python/util/deprecation.py b/tensorflow/python/util/deprecation.py
index 9aaf0c2de9756718645e77de416c653182994019..f8e8d4c28a50629f108abeb0700d82fba311666c 100644
--- a/tensorflow/python/util/deprecation.py
+++ b/tensorflow/python/util/deprecation.py
@@ -100,7 +100,13 @@ def _validate_deprecation_args(date, instructions):
 def _call_location(outer=False):
   """Returns call location given level up from current call."""
   stack = tf_stack.extract_stack()
-  frame = stack[-4 if outer else -3]
+  length = len(stack)
+  if length == 0:  # should never happen as we're in a function
+    return 'UNKNOWN'
+  index = length-4 if outer else length-3
+  if index < 0:
+    index = 0
+  frame = stack[index]
   return '{filename}:{lineno}'.format(filename=frame[0], lineno=frame[1])
 
 
diff --git a/tensorflow/python/util/example_parser_configuration.py b/tensorflow/python/util/example_parser_configuration.py
index e3fdcf956e543c516335762a7c47e5547256a2a7..dc8937a31995c1752ea49638ff23ff805a39753f 100644
--- a/tensorflow/python/util/example_parser_configuration.py
+++ b/tensorflow/python/util/example_parser_configuration.py
@@ -101,7 +101,7 @@ def extract_example_parser_configuration(parse_example_op, sess):
     fixed_config.shape.CopyFrom(
         tensor_shape.TensorShape(dense_shapes[i]).as_proto())
 
-    fixed_config.dtype = int(dense_types[i])
+    fixed_config.dtype = dense_types[i].as_datatype_enum
     # Get the output tensor name.
     fixed_config.values_output_tensor_name = parse_example_op.outputs[
         dense_values_start + i].name
@@ -111,7 +111,7 @@ def extract_example_parser_configuration(parse_example_op, sess):
     key = fetched[sparse_keys_start + i]
     feature_config = config.feature_map[key]
     var_len_feature = feature_config.var_len_feature
-    var_len_feature.dtype = int(sparse_types[i])
+    var_len_feature.dtype = sparse_types[i].as_datatype_enum
     var_len_feature.indices_output_tensor_name = parse_example_op.outputs[
         sparse_indices_start + i].name
     var_len_feature.values_output_tensor_name = parse_example_op.outputs[
diff --git a/tensorflow/python/util/function_utils.py b/tensorflow/python/util/function_utils.py
index a56dfbff8e383134f3ad475736b7679dcceb055f..84e45bec6fc58f18a6ce6f0e8576e2cdb135ed8d 100644
--- a/tensorflow/python/util/function_utils.py
+++ b/tensorflow/python/util/function_utils.py
@@ -22,6 +22,7 @@ import functools
 
 import six
 
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
 
@@ -113,3 +114,16 @@ def get_func_code(func):
       return None
   else:
     raise ValueError('Argument must be callable')
+
+
+_rewriter_config_optimizer_disabled = None
+
+
+def get_disabled_rewriter_config():
+  global _rewriter_config_optimizer_disabled
+  if _rewriter_config_optimizer_disabled is None:
+    config = config_pb2.ConfigProto()
+    rewriter_config = config.graph_options.rewrite_options
+    rewriter_config.disable_meta_optimizer = True
+    _rewriter_config_optimizer_disabled = config.SerializeToString()
+  return _rewriter_config_optimizer_disabled
diff --git a/tensorflow/python/util/lazy_loader.py b/tensorflow/python/util/lazy_loader.py
index 6d2622b1c0472d14481f67e612c1bf276a5a16ab..a9499f8334c518ae2f427b53c4913ed0e036bd30 100644
--- a/tensorflow/python/util/lazy_loader.py
+++ b/tensorflow/python/util/lazy_loader.py
@@ -31,17 +31,25 @@ class LazyLoader(types.ModuleType):
   """
 
   # The lint error here is incorrect.
-  def __init__(self, local_name, parent_module_globals, name):  # pylint: disable=super-on-old-class
+  def __init__(self, local_name, parent_module_globals, name, warning=None):  # pylint: disable=super-on-old-class
     self._local_name = local_name
     self._parent_module_globals = parent_module_globals
+    self._warning = warning
 
     super(LazyLoader, self).__init__(name)
 
   def _load(self):
+    """Load the module and insert it into the parent's globals."""
     # Import the target module and insert it into the parent's namespace
     module = importlib.import_module(self.__name__)
     self._parent_module_globals[self._local_name] = module
 
+    # Emit a warning if one was specified
+    if self._warning:
+      print(self._warning)
+      # Make sure to only warn once.
+      self._warning = None
+
     # Update this object's dict so that if someone keeps a reference to the
     #   LazyLoader, lookups are efficient (__getattr__ is only called on lookups
     #   that fail).
diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py
index be8b0f1949ff7655d14c81ce29d643a919176fe6..9a1241cff3f0879166f46c3e56f54b0658f2ac7d 100644
--- a/tensorflow/python/util/nest.py
+++ b/tensorflow/python/util/nest.py
@@ -39,18 +39,47 @@ import collections as _collections
 import six as _six
 
 from tensorflow.python import pywrap_tensorflow as _pywrap_tensorflow
+from tensorflow.python.util.tf_export import tf_export
 
 
-def _get_attrs_values(obj):
-  """Returns the list of values from an attrs instance."""
+_SHALLOW_TREE_HAS_INVALID_KEYS = (
+    "The shallow_tree's keys are not a subset of the input_tree's keys. The "
+    "shallow_tree has the following keys that are not in the input_tree: {}.")
+
+_STRUCTURES_HAVE_MISMATCHING_TYPES = (
+    "The two structures don't have the same sequence type. Input structure has "
+    "type {shallow_type}, while shallow structure has type {input_type}.")
+
+_INPUT_TREE_SMALLER_THAN_SHALLOW_TREE = (
+    "The input_tree has fewer elements than the input_tree. Input structure "
+    "has length {input_size}, while shallow structure has length "
+    "{shallow_size}.")
+
+_IF_SHALLOW_IS_SEQ_INPUT_MUST_BE_SEQ = (
+    "If shallow structure is a sequence, input must also be a sequence. "
+    "Input has type: {}.")
+
+
+def _get_attrs_items(obj):
+  """Returns a list of (name, value) pairs from an attrs instance.
+
+  The list will be sorted by name.
+
+  Args:
+    obj: an object.
+
+  Returns:
+    A list of (attr_name, attr_value) pairs, sorted by attr_name.
+  """
   attrs = getattr(obj.__class__, "__attrs_attrs__")
-  return [getattr(obj, a.name) for a in attrs]
+  attr_names = sorted([a.name for a in attrs])
+  return [(attr_name, getattr(obj, attr_name)) for attr_name in attr_names]
 
 
 def _sorted(dict_):
   """Returns a sorted list of the dict keys, with error if keys not sortable."""
   try:
-    return sorted(_six.iterkeys(dict_))
+    return sorted(dict_)
   except TypeError:
     raise TypeError("nest only supports dicts with sortable keys.")
 
@@ -74,6 +103,7 @@ def _is_namedtuple(instance, strict=False):
 # See the swig file (util.i) for documentation.
 _is_mapping = _pywrap_tensorflow.IsMapping
 _is_attrs = _pywrap_tensorflow.IsAttrs
+_is_composite_tensor = _pywrap_tensorflow.IsCompositeTensor
 
 
 def _sequence_like(instance, args):
@@ -94,30 +124,56 @@ def _sequence_like(instance, args):
     # ordered and plain dicts (e.g., flattening a dict but using a
     # corresponding `OrderedDict` to pack it back).
     result = dict(zip(_sorted(instance), args))
-    return type(instance)((key, result[key]) for key in _six.iterkeys(instance))
+    return type(instance)((key, result[key]) for key in instance)
   elif _is_namedtuple(instance) or _is_attrs(instance):
     return type(instance)(*args)
+  elif _is_composite_tensor(instance):
+    return instance._from_components(args)  # pylint: disable=protected-access
   else:
     # Not a namedtuple
     return type(instance)(args)
 
 
 def _yield_value(iterable):
-  """Yields the next value from the given iterable."""
-  if _is_mapping(iterable):
+  for _, v in _yield_sorted_items(iterable):
+    yield v
+
+
+def _yield_sorted_items(iterable):
+  """Yield (key, value) pairs for `iterable` in a deterministic order.
+
+  For Sequences, the key will be an int, the array index of a value.
+  For Mappings, the key will be the dictionary key.
+  For objects (e.g. namedtuples), the key will be the attribute name.
+
+  In all cases, the keys will be iterated in sorted order.
+
+  Args:
+    iterable: an iterable.
+
+  Yields:
+    The iterable's (key, value) pairs, in order of sorted keys.
+  """
+  if isinstance(iterable, _collections.Mapping):
     # Iterate through dictionaries in a deterministic order by sorting the
     # keys. Notice this means that we ignore the original order of `OrderedDict`
     # instances. This is intentional, to avoid potential bugs caused by mixing
     # ordered and plain dicts (e.g., flattening a dict but using a
     # corresponding `OrderedDict` to pack it back).
     for key in _sorted(iterable):
-      yield iterable[key]
+      yield key, iterable[key]
   elif _is_attrs(iterable):
-    for value in _get_attrs_values(iterable):
-      yield value
+    for item in _get_attrs_items(iterable):
+      yield item
+  elif _is_namedtuple(iterable):
+    for field in iterable._fields:
+      yield field, getattr(iterable, field)
+  elif _is_composite_tensor(iterable):
+    for item in enumerate(iterable._to_components()):  # pylint: disable=protected-access
+      yield item
   else:
-    for value in iterable:
-      yield value
+    for item in enumerate(iterable):
+      yield item
 
 
 # See the swig file (util.i) for documentation.
@@ -125,7 +181,55 @@ is_sequence = _pywrap_tensorflow.IsSequence
 
 
 # See the swig file (util.i) for documentation.
-flatten = _pywrap_tensorflow.Flatten
+is_sequence_or_composite = _pywrap_tensorflow.IsSequenceOrComposite
+
+
+@tf_export("nest.is_nested")
+def is_nested(seq):
+  """Returns true if its input is a collections.Sequence (except strings).
+
+  Args:
+    seq: an input sequence.
+
+  Returns:
+    True if the sequence is a not a string and is a collections.Sequence or a
+    dict.
+  """
+  return is_sequence(seq)
+
+
+@tf_export("nest.flatten")
+def flatten(structure, expand_composites=False):
+  """Returns a flat list from a given nested structure.
+
+  If nest is not a sequence, tuple, or dict, then returns a single-element list:
+  [nest].
+
+  In the case of dict instances, the sequence consists of the values, sorted by
+  key to ensure deterministic behavior. This is true also for OrderedDict
+  instances: their sequence order is ignored, the sorting order of keys is used
+  instead. The same convention is followed in pack_sequence_as. This correctly
+  repacks dicts and OrderedDicts after they have been flattened, and also allows
+  flattening an OrderedDict and then repacking it back using a corresponding
+  plain dict, or vice-versa. Dictionaries with non-sortable keys cannot be
+  flattened.
+
+  Users must not modify any collections used in nest while this function is
+  running.
+
+  Args:
+    structure: an arbitrarily nested structure or a scalar object. Note, numpy
+      arrays are considered scalars.
+    expand_composites: If true, then composite tensors such as tf.SparseTensor
+       and tf.RaggedTensor are expanded into their component tensors.
+
+  Returns:
+    A Python list, the flattened version of the input.
+
+  Raises:
+    TypeError: The nest is or contains a dict with non-sortable keys.
+  """
+  return _pywrap_tensorflow.Flatten(structure, expand_composites)
 
 
 # See the swig file (util.i) for documentation.
@@ -144,12 +248,14 @@ class _DotString(object):
 _DOT = _DotString()
 
 
-def assert_same_structure(nest1, nest2, check_types=True):
+@tf_export("nest.assert_same_structure")
+def assert_same_structure(nest1, nest2, check_types=True,
+                          expand_composites=False):
   """Asserts that two structures are nested in the same way.
 
   Note that namedtuples with identical name and fields are always considered
   to have the same shallow structure (even with `check_types=True`).
-  For intance, this code will print `True`:
+  For instance, this code will print `True`:
 
   ```python
   def nt(a, b):
@@ -166,8 +272,10 @@ def assert_same_structure(nest1, nest2, check_types=True):
         size. Note that namedtuples with identical name and fields are always
         considered to have the same shallow structure. Two types will also be
         considered the same if they are both list subtypes (which allows "list"
-        and "_ListWrapper" from checkpointable dependency tracking to compare
+        and "_ListWrapper" from trackable dependency tracking to compare
         equal).
+    expand_composites: If true, then composite tensors such as `tf.SparseTensor`
+        and `tf.RaggedTensor` are expanded into their component tensors.
 
   Raises:
     ValueError: If the two structures do not have the same number of elements or
@@ -176,7 +284,8 @@ def assert_same_structure(nest1, nest2, check_types=True):
       their substructures. Only possible if `check_types` is `True`.
   """
   try:
-    _pywrap_tensorflow.AssertSameStructure(nest1, nest2, check_types)
+    _pywrap_tensorflow.AssertSameStructure(nest1, nest2, check_types,
+                                           expand_composites)
   except (ValueError, TypeError) as e:
     str1 = str(map_structure(lambda _: _DOT, nest1))
     str2 = str(map_structure(lambda _: _DOT, nest2))
@@ -242,13 +351,14 @@ def flatten_dict_items(dictionary):
   return flat_dictionary
 
 
-def _packed_nest_with_indices(structure, flat, index):
+def _packed_nest_with_indices(structure, flat, index, is_seq):
   """Helper function for pack_sequence_as.
 
   Args:
     structure: Substructure (list / tuple / dict) to mimic.
     flat: Flattened values to output substructure for.
     index: Index at which to start reading from flat.
+    is_seq: Function used to test if a value should be treated as a sequence.
 
   Returns:
     The tuple (new_index, child), where:
@@ -263,8 +373,8 @@ def _packed_nest_with_indices(structure, flat, index):
   """
   packed = []
   for s in _yield_value(structure):
-    if is_sequence(s):
-      new_index, child = _packed_nest_with_indices(s, flat, index)
+    if is_seq(s):
+      new_index, child = _packed_nest_with_indices(s, flat, index, is_seq)
       packed.append(_sequence_like(s, child))
       index = new_index
     else:
@@ -273,7 +383,8 @@ def _packed_nest_with_indices(structure, flat, index):
   return index, packed
 
 
-def pack_sequence_as(structure, flat_sequence):
+@tf_export("nest.pack_sequence_as")
+def pack_sequence_as(structure, flat_sequence, expand_composites=False):
   """Returns a given flattened sequence packed into a given structure.
 
   If `structure` is a scalar, `flat_sequence` must be a single-element list;
@@ -293,6 +404,8 @@ def pack_sequence_as(structure, flat_sequence):
         tuples, and dicts. Note: numpy arrays and strings are considered
         scalars.
     flat_sequence: flat sequence to pack.
+    expand_composites: If true, then composite tensors such as `tf.SparseTensor`
+        and `tf.RaggedTensor` are expanded into their component tensors.
 
   Returns:
     packed: `flat_sequence` converted to have the same recursive structure as
@@ -303,17 +416,19 @@ def pack_sequence_as(structure, flat_sequence):
       element counts.
     TypeError: `structure` is or contains a dict with non-sortable keys.
   """
-  if not is_sequence(flat_sequence):
+  is_seq = is_sequence_or_composite if expand_composites else is_sequence
+  if not is_seq(flat_sequence):
     raise TypeError("flat_sequence must be a sequence")
 
-  if not is_sequence(structure):
+  if not is_seq(structure):
     if len(flat_sequence) != 1:
       raise ValueError("Structure is a scalar but len(flat_sequence) == %d > 1"
                        % len(flat_sequence))
     return flat_sequence[0]
 
   try:
-    final_index, packed = _packed_nest_with_indices(structure, flat_sequence, 0)
+    final_index, packed = _packed_nest_with_indices(structure, flat_sequence,
+                                                    0, is_seq)
     if final_index < len(flat_sequence):
       raise IndexError
   except IndexError:
@@ -326,7 +441,8 @@ def pack_sequence_as(structure, flat_sequence):
   return _sequence_like(structure, packed)
 
 
-def map_structure(func, *structure, **check_types_dict):
+@tf_export("nest.map_structure")
+def map_structure(func, *structure, **kwargs):
   """Applies `func` to each entry in `structure` and returns a new structure.
 
   Applies `func(x[0], x[1], ...)` where x[i] is an entry in
@@ -337,12 +453,18 @@ def map_structure(func, *structure, **check_types_dict):
     func: A callable that accepts as many arguments as there are structures.
     *structure: scalar, or tuple or list of constructed scalars and/or other
       tuples/lists, or scalars.  Note: numpy arrays are considered as scalars.
-    **check_types_dict: only valid keyword argument is `check_types`. If set to
-      `True` (default) the types of iterables within the structures have to be
-      same (e.g. `map_structure(func, [1], (1,))` raises a `TypeError`
-      exception). To allow this set this argument to `False`.
-      Note that namedtuples with identical name and fields are always
-      considered to have the same shallow structure.
+    **kwargs: Valid keyword args are:
+
+      * `check_types`: If set to `True` (default) the types of
+        iterables within the structures have to be same (e.g.
+        `map_structure(func, [1], (1,))` raises a `TypeError`
+        exception). To allow this set this argument to `False`.
+        Note that namedtuples with identical name and fields are always
+        considered to have the same shallow structure.
+      * `expand_composites`: If set to `True`, then composite tensors such
+        as `tf.SparseTensor` and `tf.RaggedTensor` are expanded into their
+        component tensors.  If `False` (the default), then composite tensors
+        are not expanded.
 
   Returns:
     A new structure with the same arity as `structure`, whose values correspond
@@ -364,21 +486,25 @@ def map_structure(func, *structure, **check_types_dict):
   if not structure:
     raise ValueError("Must provide at least one structure")
 
-  if check_types_dict:
-    if "check_types" not in check_types_dict or len(check_types_dict) > 1:
-      raise ValueError("Only valid keyword argument is check_types")
-    check_types = check_types_dict["check_types"]
-  else:
-    check_types = True
+  check_types = True
+  expand_composites = False
+  if kwargs:
+    check_types = kwargs.pop("check_types", check_types)
+    expand_composites = kwargs.pop("expand_composites", expand_composites)
+    if kwargs:
+      raise ValueError("Only valid keyword arguments are check_types "
+                       "and expand_composites")
 
   for other in structure[1:]:
-    assert_same_structure(structure[0], other, check_types=check_types)
+    assert_same_structure(structure[0], other, check_types=check_types,
+                          expand_composites=expand_composites)
 
-  flat_structure = [flatten(s) for s in structure]
+  flat_structure = [flatten(s, expand_composites) for s in structure]
   entries = zip(*flat_structure)
 
   return pack_sequence_as(
-      structure[0], [func(*x) for x in entries])
+      structure[0], [func(*x) for x in entries],
+      expand_composites=expand_composites)
 
 
 def map_structure_with_paths(func, *structure, **kwargs):
@@ -413,39 +539,86 @@ def map_structure_with_paths(func, *structure, **kwargs):
       the type of sequence in any of their substructures.
     ValueError: If no structures are provided.
   """
-  if not callable(func):
-    raise TypeError("func must be callable, got: %s" % func)
-  if not structure:
-    raise ValueError("Must provide at least one structure")
+  def wrapper_func(tuple_path, *inputs, **kwargs):
+    string_path = "/".join(str(s) for s in tuple_path)
+    return func(string_path, *inputs, **kwargs)
 
-  check_types = kwargs.pop("check_types", True)
-  for other in structure[1:]:
-    assert_same_structure(structure[0], other, check_types=check_types)
+  return map_structure_with_tuple_paths_up_to(structure[0],
+                                              wrapper_func,
+                                              *structure,
+                                              **kwargs)
 
-  # First set paths_and_values to:
-  # [[(p11, v11), ... (p1n, v1n)], ... [(pm1, vm1), ... (pmn, vmn)]]
-  paths_and_values = [flatten_with_joined_string_paths(s) for s in structure]
 
-  # Now zip(*paths_and_values) would be:
-  # [((p11, v11), ... (pm1, vm1)), ... ((p1n, v1n), ... (pmn, vmn))]
-  # so grouped_by_path is set to:
-  # [[(p11, ... pm1), (v11, ... vm1)], ... [(p1n, ... pmn), (v1n, ... vmn)]]
-  # Note that p1i, ... pmi must all be equal since the structures are the same.
-  grouped_by_path = [zip(*p_v) for p_v in zip(*paths_and_values)]
+def map_structure_with_tuple_paths(func, *structure, **kwargs):
+  """Applies `func` to each entry in `structure` and returns a new structure.
 
-  return pack_sequence_as(structure[0], [
-      func(paths[0], *values, **kwargs) for paths, values in grouped_by_path])
+  Applies `func(tuple_path, x[0], x[1], ..., **kwargs)` where `x[i]` is an entry
+  in `structure[i]` and `tuple_path` is a tuple of indices and/or dictionary
+  keys (as returned by `nest.yield_flat_paths`), which uniquely specifies the
+  common path to x[i] in the structures. All structures in `structure` must have
+  the same arity, and the return value will contain the results in the same
+  structure. Special kwarg `check_types` determines whether the types of
+  iterables within the structure must be the same-- see **kwargs definition
+  below.
 
+  Args:
+    func: A callable with the signature `func(tuple_path, *values, **kwargs)`
+      that is evaluated on the leaves of the structure.
+    *structure: A variable number of compatible structures to process.
+    **kwargs: Optional kwargs to be passed through to func. Special kwarg
+      `check_types` is not passed to func, but instead determines whether the
+      types of iterables within the structures have to be same (e.g.
+      `map_structure(func, [1], (1,))` raises a `TypeError` exception). To allow
+      this set this argument to `False`.
 
-def _yield_flat_up_to(shallow_tree, input_tree):
-  """Yields elements `input_tree` partially flattened up to `shallow_tree`."""
-  if is_sequence(shallow_tree):
-    for shallow_branch, input_branch in zip(_yield_value(shallow_tree),
-                                            _yield_value(input_tree)):
-      for input_leaf in _yield_flat_up_to(shallow_branch, input_branch):
-        yield input_leaf
+  Returns:
+    A structure of the same form as the input structures whose leaves are the
+    result of evaluating func on corresponding leaves of the input structures.
+
+  Raises:
+    TypeError: If `func` is not callable or if the structures do not match
+      each other by depth tree.
+    TypeError: If `check_types` is not `False` and the two structures differ in
+      the type of sequence in any of their substructures.
+    ValueError: If no structures are provided.
+  """
+  return map_structure_with_tuple_paths_up_to(structure[0],
+                                              func,
+                                              *structure,
+                                              **kwargs)
+
+
+def _yield_flat_up_to(shallow_tree, input_tree, path=()):
+  """Yields (path, value) pairs of input_tree flattened up to shallow_tree.
+
+  Args:
+    shallow_tree: Nested structure. Traverse no further than its leaf nodes.
+    input_tree: Nested structure. Return the paths and values from this tree.
+      Must have the same upper structure as shallow_tree.
+    path: Tuple. Optional argument, only used when recursing. The path from the
+      root of the original shallow_tree, down to the root of the shallow_tree
+      arg of this recursive call.
+
+  Yields:
+    Pairs of (path, value), where path the tuple path of a leaf node in
+    shallow_tree, and value is the value of the corresponding node in
+    input_tree.
+  """
+  if (isinstance(shallow_tree, _six.string_types) or
+      not any([isinstance(shallow_tree, _collections.Sequence),
+               isinstance(shallow_tree, _collections.Mapping),
+               _is_namedtuple(shallow_tree),
+               _is_attrs(shallow_tree)])):
+    yield (path, input_tree)
   else:
-    yield input_tree
+    input_tree = dict(_yield_sorted_items(input_tree))
+    for shallow_key, shallow_subtree in _yield_sorted_items(shallow_tree):
+      subpath = path + (shallow_key,)
+      input_subtree = input_tree[shallow_key]
+      for leaf_path, leaf_value in _yield_flat_up_to(shallow_subtree,
+                                                     input_subtree,
+                                                     path=subpath):
+        yield (leaf_path, leaf_value)
 
 
 def assert_shallow_structure(shallow_tree, input_tree, check_types=True):
@@ -459,15 +632,15 @@ def assert_shallow_structure(shallow_tree, input_tree, check_types=True):
 
   The following code will raise an exception:
   ```python
-    shallow_tree = ["a", "b"]
-    input_tree = ["c", ["d", "e"], "f"]
+    shallow_tree = {"a": "A", "b": "B"}
+    input_tree = {"a": 1, "c": 2}
     assert_shallow_structure(shallow_tree, input_tree)
   ```
 
   The following code will not raise an exception:
   ```python
     shallow_tree = ["a", "b"]
-    input_tree = ["c", ["d", "e"]]
+    input_tree = ["c", ["d", "e"], "f"]
     assert_shallow_structure(shallow_tree, input_tree)
   ```
 
@@ -499,40 +672,34 @@ def assert_shallow_structure(shallow_tree, input_tree, check_types=True):
       input_is_namedtuple = _is_namedtuple(input_tree, False)
       if shallow_is_namedtuple and input_is_namedtuple:
         if not _same_namedtuples(shallow_tree, input_tree):
-          raise TypeError(
-              "The two namedtuples don't have the same sequence type. Input "
-              "structure has type %s, while shallow structure has type %s."
-              % (type(input_tree), type(shallow_tree)))
+          raise TypeError(_STRUCTURES_HAVE_MISMATCHING_TYPES.format(
+              input_type=type(input_tree),
+              shallow_type=type(shallow_tree)))
+
       elif not (isinstance(shallow_tree, _collections.Mapping)
                 and isinstance(input_tree, _collections.Mapping)):
-        raise TypeError(
-            "The two structures don't have the same sequence type. Input "
-            "structure has type %s, while shallow structure has type %s."
-            % (type(input_tree), type(shallow_tree)))
-
-    if len(input_tree) != len(shallow_tree):
-      raise ValueError(
-          "The two structures don't have the same sequence length. Input "
-          "structure has length %s, while shallow structure has length %s."
-          % (len(input_tree), len(shallow_tree)))
+        raise TypeError(_STRUCTURES_HAVE_MISMATCHING_TYPES.format(
+            input_type=type(input_tree),
+            shallow_type=type(shallow_tree)))
 
-    if check_types and isinstance(shallow_tree, (dict, _collections.Mapping)):
-      if set(input_tree) != set(shallow_tree):
-        raise ValueError(
-            "The two structures don't have the same keys. Input "
-            "structure has keys %s, while shallow structure has keys %s." %
-            (list(_six.iterkeys(input_tree)),
-             list(_six.iterkeys(shallow_tree))))
+    if len(input_tree) < len(shallow_tree):
+      raise ValueError(_INPUT_TREE_SMALLER_THAN_SHALLOW_TREE.format(
+          input_size=len(input_tree),
+          shallow_size=len(shallow_tree)))
 
-      input_tree = list(sorted(_six.iteritems(input_tree)))
-      shallow_tree = list(sorted(_six.iteritems(shallow_tree)))
+    if isinstance(shallow_tree, _collections.Mapping):
+      absent_keys = set(shallow_tree) - set(input_tree)
+      if absent_keys:
+        raise ValueError(_SHALLOW_TREE_HAS_INVALID_KEYS
+                         .format(sorted(absent_keys)))
 
-    for shallow_branch, input_branch in zip(shallow_tree, input_tree):
+    for shallow_branch, input_branch in zip(_yield_value(shallow_tree),
+                                            _yield_value(input_tree)):
       assert_shallow_structure(shallow_branch, input_branch,
                                check_types=check_types)
 
 
-def flatten_up_to(shallow_tree, input_tree):
+def flatten_up_to(shallow_tree, input_tree, check_types=True):
   """Flattens `input_tree` up to `shallow_tree`.
 
   Any further depth in structure in `input_tree` is retained as elements in the
@@ -589,6 +756,8 @@ def flatten_up_to(shallow_tree, input_tree):
     shallow_tree: a possibly pruned structure of input_tree.
     input_tree: an arbitrarily nested structure or a scalar object.
       Note, numpy arrays are considered scalars.
+    check_types: bool. If True, check that each node in shallow_tree has the
+      same type as the corresponding node in input_tree.
 
   Returns:
     A Python list, the partially flattened version of `input_tree` according to
@@ -601,11 +770,106 @@ def flatten_up_to(shallow_tree, input_tree):
     ValueError: If the sequence lengths of `shallow_tree` are different from
       `input_tree`.
   """
-  assert_shallow_structure(shallow_tree, input_tree)
+  assert_shallow_structure(shallow_tree, input_tree, check_types)
+  # Discard paths returned by _yield_flat_up_to.
+  return list(v for _, v in _yield_flat_up_to(shallow_tree, input_tree))
+
+
+def flatten_with_tuple_paths_up_to(shallow_tree, input_tree, check_types=True):
+  """Flattens `input_tree` up to `shallow_tree`.
+
+  Any further depth in structure in `input_tree` is retained as elements in the
+  partially flattened output.
+
+  Returns a list of (path, value) pairs, where value a leaf node in the
+  flattened tree, and path is the tuple path of that leaf in input_tree.
+
+  If `shallow_tree` and `input_tree` are not sequences, this returns a
+  single-element list: `[((), input_tree)]`.
+
+  Use Case:
+
+  Sometimes we may wish to partially flatten a nested sequence, retaining some
+  of the nested structure. We achieve this by specifying a shallow structure,
+  `shallow_tree`, we wish to flatten up to.
+
+  The input, `input_tree`, can be thought of as having the same structure as
+  `shallow_tree`, but with leaf nodes that are themselves tree structures.
+
+  Examples:
+
+  ```python
+  input_tree = [[[2, 2], [3, 3]], [[4, 9], [5, 5]]]
+  shallow_tree = [[True, True], [False, True]]
+
+  flattened_input_tree = flatten_with_tuple_paths_up_to(shallow_tree,
+                                                        input_tree)
+  flattened_shallow_tree = flatten_with_tuple_paths_up_to(shallow_tree,
+                                                          shallow_tree)
+
+  # Output is:
+  # [((0, 0), [2, 2]),
+  #  ((0, 1), [3, 3]),
+  #  ((1, 0), [4, 9]),
+  #  ((1, 1), [5, 5])]
+  #
+  # [((0, 0), True),
+  #  ((0, 1), True),
+  #  ((1, 0), False),
+  #  ((1, 1), True)]
+  ```
+
+  ```python
+  input_tree = [[('a', 1), [('b', 2), [('c', 3), [('d', 4)]]]]]
+  shallow_tree = [['level_1', ['level_2', ['level_3', ['level_4']]]]]
+
+  input_tree_flattened_as_shallow_tree = flatten_up_to(shallow_tree, input_tree)
+  input_tree_flattened = flatten(input_tree)
+
+  # Output is:
+  # [((0, 0), ('a', 1)),
+  #  ((0, 1, 0), ('b', 2)),
+  #  ((0, 1, 1, 0), ('c', 3)),
+  #  ((0, 1, 1, 1), ('d', 4))]
+  # ['a', 1, 'b', 2, 'c', 3, 'd', 4]
+  ```
+
+  Non-Sequence Edge Cases:
+
+  ```python
+  flatten_with_tuple_paths_up_to(0, 0)  # Output: [(), 0]
+
+  flatten_with_tuple_paths_up_to(0, [0, 1, 2])  # Output: [(), [0, 1, 2]]
+
+  flatten_with_tuple_paths_up_to([0, 1, 2], 0)  # Output: TypeError
+
+  flatten_with_tuple_paths_up_to([0, 1, 2], [0, 1, 2])
+  # Output: [((0,) 0), ((1,), 1), ((2,), 2)]
+  ```
+
+  Args:
+    shallow_tree: a possibly pruned structure of input_tree.
+    input_tree: an arbitrarily nested structure or a scalar object.
+      Note, numpy arrays are considered scalars.
+    check_types: bool. If True, check that each node in shallow_tree has the
+      same type as the corresponding node in input_tree.
+
+  Returns:
+    A Python list, the partially flattened version of `input_tree` according to
+    the structure of `shallow_tree`.
+
+  Raises:
+    TypeError: If `shallow_tree` is a sequence but `input_tree` is not.
+    TypeError: If the sequence types of `shallow_tree` are different from
+      `input_tree`.
+    ValueError: If the sequence lengths of `shallow_tree` are different from
+      `input_tree`.
+  """
+  assert_shallow_structure(shallow_tree, input_tree, check_types=check_types)
   return list(_yield_flat_up_to(shallow_tree, input_tree))
 
 
-def map_structure_up_to(shallow_tree, func, *inputs):
+def map_structure_up_to(shallow_tree, func, *inputs, **kwargs):
   """Applies a function or op to a number of partially flattened inputs.
 
   The `inputs` are flattened up to `shallow_tree` before being mapped.
@@ -625,6 +889,14 @@ def map_structure_up_to(shallow_tree, func, *inputs):
 
   Examples:
 
+  ```python
+  shallow_tree = [None, None]
+  inp_val = [1, 2, 3]
+  out = map_structure_up_to(shallow_tree, lambda x: 2 * x, inp_val)
+
+  # Output is: [2, 4]
+  ```
+
   ```python
   ab_tuple = collections.namedtuple("ab_tuple", "a, b")
   op_tuple = collections.namedtuple("op_tuple", "add, mul")
@@ -654,6 +926,11 @@ def map_structure_up_to(shallow_tree, func, *inputs):
         shallow_tree. The function `func` is applied to corresponding
         partially flattened elements of each input, so the function must support
         arity of `len(inputs)`.
+    **kwargs: kwargs to feed to func(). Special kwarg
+      `check_types` is not passed to func, but instead determines whether the
+      types of iterables within the structures have to be same (e.g.
+      `map_structure(func, [1], (1,))` raises a `TypeError` exception). To allow
+      this set this argument to `False`.
 
   Raises:
     TypeError: If `shallow_tree` is a sequence but `input_tree` is not.
@@ -666,16 +943,93 @@ def map_structure_up_to(shallow_tree, func, *inputs):
     result of repeatedly applying `func`, with same structure as
     `shallow_tree`.
   """
+  return map_structure_with_tuple_paths_up_to(
+      shallow_tree,
+      lambda _, *values: func(*values),  # Discards the path arg.
+      *inputs,
+      **kwargs)
+
+
+def map_structure_with_tuple_paths_up_to(shallow_tree, func, *inputs, **kwargs):
+  """Applies a function or op to a number of partially flattened inputs.
+
+  Like map_structure_up_to(), except that the 'func' argument takes a path
+  tuple as its first argument, followed by the corresponding values from
+  *inputs.
+
+  Example:
+
+  lowercase = {'a': 'a', 'b': ('b0', 'b1')}
+  uppercase = {'a': 'A', 'b': ('B0', 'B1')}
+
+  def print_path_and_values(path, *values):
+    print("path: {}, values: {}".format(path, values))
+
+  shallow_tree = {'a': None}
+  map_structure_with_tuple_paths_up_to(shallow_tree,
+                                       print_path_and_values,
+                                       lowercase,
+                                       uppercase)
+  >>> path: ('a',), values: ('a', 'A')
+  >>> path: ('b', 0), values: ('b0', 'B0')
+  >>> path: ('b', 1), values: ('b1', 'B1')
+
+  shallow_tree = {'b': None}
+  map_structure_with_tuple_paths_up_to(shallow_tree,
+                                       print_path_and_values,
+                                       lowercase,
+                                       uppercase,
+                                       check_types=False)
+  >>> path: ('b', 1), values: (('bo', 'b1'), ('B0', 'B1'))
+
+  shallow_tree = {'a': None, 'b': {1: None}}
+  map_structure_with_tuple_paths_up_to(shallow_tree,
+                                       print_path_and_values,
+                                       lowercase,
+                                       uppercase,
+                                       check_types=False)
+  >>> path: ('a',), values: ('a', 'A')
+  >>> path: ('b', 1), values: ('b1', B1')
+
+  Args:
+    shallow_tree: a shallow tree, common to all the inputs.
+    func: callable that takes args (path, inputs_0_value, ... , inputs_N_value),
+      where path is a tuple path to a leaf node in shallow_tree, and
+      inputs_i_value is the corresponding value from inputs[i].
+    *inputs: nested structures that are all structurally compatible with
+        shallow_tree.
+    **kwargs: kwargs to feed to func(). Special kwarg
+      `check_types` is not passed to func, but instead determines whether the
+      types of iterables within the structures have to be same (e.g.
+      `map_structure(func, [1], (1,))` raises a `TypeError` exception). To allow
+      this set this argument to `False`.
+
+  Raises:
+    TypeError: If `shallow_tree` is a sequence but one of `*inputs` is not.
+    TypeError: If the sequence types of `shallow_tree` are different from
+      `input_tree`.
+    ValueError: If the sequence lengths of `shallow_tree` are different from
+      `input_tree`.
+
+  Returns:
+    Result of repeatedly applying `func`. Has same structure as `shallow_tree`.
+  """
   if not inputs:
     raise ValueError("Cannot map over no sequences")
+
+  check_types = kwargs.pop("check_types", True)
+
   for input_tree in inputs:
-    assert_shallow_structure(shallow_tree, input_tree)
+    assert_shallow_structure(shallow_tree, input_tree, check_types=check_types)
 
   # Flatten each input separately, apply the function to corresponding elements,
   # then repack based on the structure of the first input.
-  all_flattened_up_to = [flatten_up_to(shallow_tree, input_tree)
-                         for input_tree in inputs]
-  results = [func(*tensors) for tensors in zip(*all_flattened_up_to)]
+  flat_value_lists = [flatten_up_to(shallow_tree, input_tree, check_types)
+                      for input_tree in inputs]
+  flat_path_list = [path for path, _
+                    in _yield_flat_up_to(shallow_tree, inputs[0])]
+  results = [func(*args, **kwargs) for args in zip(flat_path_list,
+                                                   *flat_value_lists)]
   return pack_sequence_as(structure=shallow_tree, flat_sequence=results)
 
 
@@ -774,27 +1128,8 @@ def yield_flat_paths(nest):
     Tuples containing index or key values which form the path to a specific
       leaf value in the nested structure.
   """
-
-  # The _maybe_add_final_path_element function is used below in order to avoid
-  # adding trailing slashes when the sub-element recursed into is a leaf.
-  if isinstance(nest, (dict, _collections.Mapping)):
-    for key in _sorted(nest):
-      value = nest[key]
-      for sub_path in yield_flat_paths(value):
-        yield (key,) + sub_path
-  elif _is_namedtuple(nest):
-    for key in nest._fields:
-      value = getattr(nest, key)
-      for sub_path in yield_flat_paths(value):
-        yield (key,) + sub_path
-  elif isinstance(nest, _six.string_types):
-    yield ()
-  elif isinstance(nest, _collections.Sequence):
-    for idx, value in enumerate(nest):
-      for sub_path in yield_flat_paths(value):
-        yield (idx,) + sub_path
-  else:
-    yield ()
+  for k, _ in _yield_flat_up_to(nest, nest):
+    yield k
 
 
 def flatten_with_joined_string_paths(structure, separator="/"):
@@ -820,5 +1155,24 @@ def flatten_with_joined_string_paths(structure, separator="/"):
   return list(zip(flat_string_paths, flatten(structure)))
 
 
+def flatten_with_tuple_paths(structure):
+  """Returns a list of `(tuple_path, leaf_element)` tuples.
+
+  The order of pairs produced matches that of `nest.flatten`. This allows you
+  to flatten a nested structure while keeping information about where in the
+  structure each data element was located. See `nest.yield_flat_paths`
+  for more information about tuple paths.
+
+  Args:
+    structure: the nested structure to flatten.
+
+  Returns:
+    A list of `(tuple_path, leaf_element)` tuples. Each `tuple_path` is a tuple
+    of indices and/or dictionary keys that uniquely specify the path to
+    `leaf_element` within `structure`.
+  """
+  return list(zip(yield_flat_paths(structure), flatten(structure)))
+
+
 _pywrap_tensorflow.RegisterType("Mapping", _collections.Mapping)
 _pywrap_tensorflow.RegisterType("Sequence", _collections.Sequence)
diff --git a/tensorflow/python/util/nest_test.py b/tensorflow/python/util/nest_test.py
index d0d0c5f7935ba0a4d2b867b3c6fb6bd52c7cd54a..0540f71f7a98b3fd574c98ae5d0406a4b5d94ff5 100644
--- a/tensorflow/python/util/nest_test.py
+++ b/tensorflow/python/util/nest_test.py
@@ -209,12 +209,12 @@ class NestTest(parameterized.TestCase, test.TestCase):
   def testFlatten_numpyIsNotFlattened(self):
     structure = np.array([1, 2, 3])
     flattened = nest.flatten(structure)
-    self.assertEqual(len(flattened), 1)
+    self.assertLen(flattened, 1)
 
   def testFlatten_stringIsNotFlattened(self):
     structure = "lots of letters"
     flattened = nest.flatten(structure)
-    self.assertEqual(len(flattened), 1)
+    self.assertLen(flattened, 1)
     unflattened = nest.pack_sequence_as("goodbye", flattened)
     self.assertEqual(structure, unflattened)
 
@@ -231,17 +231,17 @@ class NestTest(parameterized.TestCase, test.TestCase):
                             ["and", "goodbye", "again"])
 
   @test_util.assert_no_new_pyobjects_executing_eagerly
-  def testIsSequence(self):
-    self.assertFalse(nest.is_sequence("1234"))
-    self.assertTrue(nest.is_sequence([1, 3, [4, 5]]))
-    self.assertTrue(nest.is_sequence(((7, 8), (5, 6))))
-    self.assertTrue(nest.is_sequence([]))
-    self.assertTrue(nest.is_sequence({"a": 1, "b": 2}))
-    self.assertFalse(nest.is_sequence(set([1, 2])))
+  def testIsNested(self):
+    self.assertFalse(nest.is_nested("1234"))
+    self.assertTrue(nest.is_nested([1, 3, [4, 5]]))
+    self.assertTrue(nest.is_nested(((7, 8), (5, 6))))
+    self.assertTrue(nest.is_nested([]))
+    self.assertTrue(nest.is_nested({"a": 1, "b": 2}))
+    self.assertFalse(nest.is_nested(set([1, 2])))
     ones = array_ops.ones([2, 3])
-    self.assertFalse(nest.is_sequence(ones))
-    self.assertFalse(nest.is_sequence(math_ops.tanh(ones)))
-    self.assertFalse(nest.is_sequence(np.ones((4, 5))))
+    self.assertFalse(nest.is_nested(ones))
+    self.assertFalse(nest.is_nested(math_ops.tanh(ones)))
+    self.assertFalse(nest.is_nested(np.ones((4, 5))))
 
   @parameterized.parameters({"mapping_type": _CustomMapping},
                             {"mapping_type": dict})
@@ -510,30 +510,28 @@ class NestTest(parameterized.TestCase, test.TestCase):
   def testAssertShallowStructure(self):
     inp_ab = ["a", "b"]
     inp_abc = ["a", "b", "c"]
-    expected_message = (
-        "The two structures don't have the same sequence length. Input "
-        "structure has length 2, while shallow structure has length 3.")
-    with self.assertRaisesRegexp(ValueError, expected_message):
-      nest.assert_shallow_structure(inp_abc, inp_ab)
+    with self.assertRaisesWithLiteralMatch(
+        ValueError,
+        nest._INPUT_TREE_SMALLER_THAN_SHALLOW_TREE.format(
+            shallow_size=len(inp_abc),
+            input_size=len(inp_ab))):
+      nest.assert_shallow_structure(shallow_tree=inp_abc, input_tree=inp_ab)
 
     inp_ab1 = [(1, 1), (2, 2)]
     inp_ab2 = [[1, 1], [2, 2]]
-    expected_message = (
-        "The two structures don't have the same sequence type. Input structure "
-        "has type <(type|class) 'tuple'>, while shallow structure has type "
-        "<(type|class) 'list'>.")
-    with self.assertRaisesRegexp(TypeError, expected_message):
+    with self.assertRaisesWithLiteralMatch(
+        TypeError,
+        nest._STRUCTURES_HAVE_MISMATCHING_TYPES.format(
+            shallow_type=type(inp_ab2[0]),
+            input_type=type(inp_ab1[0]))):
       nest.assert_shallow_structure(inp_ab2, inp_ab1)
     nest.assert_shallow_structure(inp_ab2, inp_ab1, check_types=False)
 
     inp_ab1 = {"a": (1, 1), "b": {"c": (2, 2)}}
     inp_ab2 = {"a": (1, 1), "b": {"d": (2, 2)}}
-    expected_message = (
-        r"The two structures don't have the same keys. Input "
-        r"structure has keys \['c'\], while shallow structure has "
-        r"keys \['d'\].")
-
-    with self.assertRaisesRegexp(ValueError, expected_message):
+    with self.assertRaisesWithLiteralMatch(
+        ValueError,
+        nest._SHALLOW_TREE_HAS_INVALID_KEYS.format(["d"])):
       nest.assert_shallow_structure(inp_ab2, inp_ab1)
 
     inp_ab = collections.OrderedDict([("a", 1), ("b", (2, 3))])
@@ -688,6 +686,244 @@ class NestTest(parameterized.TestCase, test.TestCase):
     flattened_shallow_tree = nest.flatten_up_to(shallow_tree, shallow_tree)
     self.assertEqual(flattened_shallow_tree, shallow_tree)
 
+  def testFlattenWithTuplePathsUpTo(self):
+    def get_paths_and_values(shallow_tree, input_tree):
+      path_value_pairs = nest.flatten_with_tuple_paths_up_to(shallow_tree,
+                                                             input_tree)
+      paths = [p for p, _ in path_value_pairs]
+      values = [v for _, v in path_value_pairs]
+      return paths, values
+
+    # Shallow tree ends at scalar.
+    input_tree = [[[2, 2], [3, 3]], [[4, 9], [5, 5]]]
+    shallow_tree = [[True, True], [False, True]]
+    (flattened_input_tree_paths,
+     flattened_input_tree) = get_paths_and_values(shallow_tree, input_tree)
+    (flattened_shallow_tree_paths,
+     flattened_shallow_tree) = get_paths_and_values(shallow_tree, shallow_tree)
+    self.assertEqual(flattened_input_tree_paths,
+                     [(0, 0), (0, 1), (1, 0), (1, 1)])
+    self.assertEqual(flattened_input_tree, [[2, 2], [3, 3], [4, 9], [5, 5]])
+    self.assertEqual(flattened_shallow_tree_paths,
+                     [(0, 0), (0, 1), (1, 0), (1, 1)])
+    self.assertEqual(flattened_shallow_tree, [True, True, False, True])
+
+    # Shallow tree ends at string.
+    input_tree = [[("a", 1), [("b", 2), [("c", 3), [("d", 4)]]]]]
+    shallow_tree = [["level_1", ["level_2", ["level_3", ["level_4"]]]]]
+    (input_tree_flattened_as_shallow_tree_paths,
+     input_tree_flattened_as_shallow_tree) = get_paths_and_values(shallow_tree,
+                                                                  input_tree)
+    input_tree_flattened_paths = [p for p, _ in
+                                  nest.flatten_with_tuple_paths(input_tree)]
+    input_tree_flattened = nest.flatten(input_tree)
+    self.assertEqual(input_tree_flattened_as_shallow_tree_paths,
+                     [(0, 0), (0, 1, 0), (0, 1, 1, 0), (0, 1, 1, 1, 0)])
+    self.assertEqual(input_tree_flattened_as_shallow_tree,
+                     [("a", 1), ("b", 2), ("c", 3), ("d", 4)])
+
+    self.assertEqual(input_tree_flattened_paths,
+                     [(0, 0, 0), (0, 0, 1),
+                      (0, 1, 0, 0), (0, 1, 0, 1),
+                      (0, 1, 1, 0, 0), (0, 1, 1, 0, 1),
+                      (0, 1, 1, 1, 0, 0), (0, 1, 1, 1, 0, 1)])
+    self.assertEqual(input_tree_flattened, ["a", 1, "b", 2, "c", 3, "d", 4])
+
+    # Make sure dicts are correctly flattened, yielding values, not keys.
+    input_tree = {"a": 1, "b": {"c": 2}, "d": [3, (4, 5)]}
+    shallow_tree = {"a": 0, "b": 0, "d": [0, 0]}
+    (input_tree_flattened_as_shallow_tree_paths,
+     input_tree_flattened_as_shallow_tree) = get_paths_and_values(shallow_tree,
+                                                                  input_tree)
+    self.assertEqual(input_tree_flattened_as_shallow_tree_paths,
+                     [("a",), ("b",), ("d", 0), ("d", 1)])
+    self.assertEqual(input_tree_flattened_as_shallow_tree,
+                     [1, {"c": 2}, 3, (4, 5)])
+
+    # Namedtuples.
+    ab_tuple = collections.namedtuple("ab_tuple", "a, b")
+    input_tree = ab_tuple(a=[0, 1], b=2)
+    shallow_tree = ab_tuple(a=0, b=1)
+    (input_tree_flattened_as_shallow_tree_paths,
+     input_tree_flattened_as_shallow_tree) = get_paths_and_values(shallow_tree,
+                                                                  input_tree)
+    self.assertEqual(input_tree_flattened_as_shallow_tree_paths,
+                     [("a",), ("b",)])
+    self.assertEqual(input_tree_flattened_as_shallow_tree,
+                     [[0, 1], 2])
+
+    # Nested dicts, OrderedDicts and namedtuples.
+    input_tree = collections.OrderedDict(
+        [("a", ab_tuple(a=[0, {"b": 1}], b=2)),
+         ("c", {"d": 3, "e": collections.OrderedDict([("f", 4)])})])
+    shallow_tree = input_tree
+    (input_tree_flattened_as_shallow_tree_paths,
+     input_tree_flattened_as_shallow_tree) = get_paths_and_values(shallow_tree,
+                                                                  input_tree)
+    self.assertEqual(input_tree_flattened_as_shallow_tree_paths,
+                     [("a", "a", 0),
+                      ("a", "a", 1, "b"),
+                      ("a", "b"),
+                      ("c", "d"),
+                      ("c", "e", "f")])
+    self.assertEqual(input_tree_flattened_as_shallow_tree, [0, 1, 2, 3, 4])
+    shallow_tree = collections.OrderedDict([("a", 0), ("c", {"d": 3, "e": 1})])
+    (input_tree_flattened_as_shallow_tree_paths,
+     input_tree_flattened_as_shallow_tree) = get_paths_and_values(shallow_tree,
+                                                                  input_tree)
+    self.assertEqual(input_tree_flattened_as_shallow_tree_paths,
+                     [("a",),
+                      ("c", "d"),
+                      ("c", "e")])
+    self.assertEqual(input_tree_flattened_as_shallow_tree,
+                     [ab_tuple(a=[0, {"b": 1}], b=2),
+                      3,
+                      collections.OrderedDict([("f", 4)])])
+    shallow_tree = collections.OrderedDict([("a", 0), ("c", 0)])
+    (input_tree_flattened_as_shallow_tree_paths,
+     input_tree_flattened_as_shallow_tree) = get_paths_and_values(shallow_tree,
+                                                                  input_tree)
+    self.assertEqual(input_tree_flattened_as_shallow_tree_paths,
+                     [("a",), ("c",)])
+    self.assertEqual(input_tree_flattened_as_shallow_tree,
+                     [ab_tuple(a=[0, {"b": 1}], b=2),
+                      {"d": 3, "e": collections.OrderedDict([("f", 4)])}])
+
+    ## Shallow non-list edge-case.
+    # Using iterable elements.
+    input_tree = ["input_tree"]
+    shallow_tree = "shallow_tree"
+    (flattened_input_tree_paths,
+     flattened_input_tree) = get_paths_and_values(shallow_tree, input_tree)
+    (flattened_shallow_tree_paths,
+     flattened_shallow_tree) = get_paths_and_values(shallow_tree, shallow_tree)
+    self.assertEqual(flattened_input_tree_paths, [()])
+    self.assertEqual(flattened_input_tree, [input_tree])
+    self.assertEqual(flattened_shallow_tree_paths, [()])
+    self.assertEqual(flattened_shallow_tree, [shallow_tree])
+
+    input_tree = ["input_tree_0", "input_tree_1"]
+    shallow_tree = "shallow_tree"
+    (flattened_input_tree_paths,
+     flattened_input_tree) = get_paths_and_values(shallow_tree, input_tree)
+    (flattened_shallow_tree_paths,
+     flattened_shallow_tree) = get_paths_and_values(shallow_tree, shallow_tree)
+    self.assertEqual(flattened_input_tree_paths, [()])
+    self.assertEqual(flattened_input_tree, [input_tree])
+    self.assertEqual(flattened_shallow_tree_paths, [()])
+    self.assertEqual(flattened_shallow_tree, [shallow_tree])
+
+    # Test case where len(shallow_tree) < len(input_tree)
+    input_tree = {"a": "A", "b": "B", "c": "C"}
+    shallow_tree = {"a": 1, "c": 2}
+    (flattened_input_tree_paths,
+     flattened_input_tree) = get_paths_and_values(shallow_tree, input_tree)
+    (flattened_shallow_tree_paths,
+     flattened_shallow_tree) = get_paths_and_values(shallow_tree, shallow_tree)
+    self.assertEqual(flattened_input_tree_paths, [("a",), ("c",)])
+    self.assertEqual(flattened_input_tree, ["A", "C"])
+    self.assertEqual(flattened_shallow_tree_paths, [("a",), ("c",)])
+    self.assertEqual(flattened_shallow_tree, [1, 2])
+
+    # Using non-iterable elements.
+    input_tree = [0]
+    shallow_tree = 9
+    (flattened_input_tree_paths,
+     flattened_input_tree) = get_paths_and_values(shallow_tree, input_tree)
+    (flattened_shallow_tree_paths,
+     flattened_shallow_tree) = get_paths_and_values(shallow_tree, shallow_tree)
+    self.assertEqual(flattened_input_tree_paths, [()])
+    self.assertEqual(flattened_input_tree, [input_tree])
+    self.assertEqual(flattened_shallow_tree_paths, [()])
+    self.assertEqual(flattened_shallow_tree, [shallow_tree])
+
+    input_tree = [0, 1]
+    shallow_tree = 9
+    (flattened_input_tree_paths,
+     flattened_input_tree) = get_paths_and_values(shallow_tree, input_tree)
+    (flattened_shallow_tree_paths,
+     flattened_shallow_tree) = get_paths_and_values(shallow_tree, shallow_tree)
+    self.assertEqual(flattened_input_tree_paths, [()])
+    self.assertEqual(flattened_input_tree, [input_tree])
+    self.assertEqual(flattened_shallow_tree_paths, [()])
+    self.assertEqual(flattened_shallow_tree, [shallow_tree])
+
+    ## Both non-list edge-case.
+    # Using iterable elements.
+    input_tree = "input_tree"
+    shallow_tree = "shallow_tree"
+    (flattened_input_tree_paths,
+     flattened_input_tree) = get_paths_and_values(shallow_tree, input_tree)
+    (flattened_shallow_tree_paths,
+     flattened_shallow_tree) = get_paths_and_values(shallow_tree, shallow_tree)
+    self.assertEqual(flattened_input_tree_paths, [()])
+    self.assertEqual(flattened_input_tree, [input_tree])
+    self.assertEqual(flattened_shallow_tree_paths, [()])
+    self.assertEqual(flattened_shallow_tree, [shallow_tree])
+
+    # Using non-iterable elements.
+    input_tree = 0
+    shallow_tree = 0
+    (flattened_input_tree_paths,
+     flattened_input_tree) = get_paths_and_values(shallow_tree, input_tree)
+    (flattened_shallow_tree_paths,
+     flattened_shallow_tree) = get_paths_and_values(shallow_tree, shallow_tree)
+    self.assertEqual(flattened_input_tree_paths, [()])
+    self.assertEqual(flattened_input_tree, [input_tree])
+    self.assertEqual(flattened_shallow_tree_paths, [()])
+    self.assertEqual(flattened_shallow_tree, [shallow_tree])
+
+    ## Input non-list edge-case.
+    # Using iterable elements.
+    input_tree = "input_tree"
+    shallow_tree = ["shallow_tree"]
+    with self.assertRaisesWithLiteralMatch(
+        TypeError,
+        nest._IF_SHALLOW_IS_SEQ_INPUT_MUST_BE_SEQ.format(type(input_tree))):
+      (flattened_input_tree_paths,
+       flattened_input_tree) = get_paths_and_values(shallow_tree, input_tree)
+    (flattened_shallow_tree_paths,
+     flattened_shallow_tree) = get_paths_and_values(shallow_tree, shallow_tree)
+    self.assertEqual(flattened_shallow_tree_paths, [(0,)])
+    self.assertEqual(flattened_shallow_tree, shallow_tree)
+
+    input_tree = "input_tree"
+    shallow_tree = ["shallow_tree_9", "shallow_tree_8"]
+    with self.assertRaisesWithLiteralMatch(
+        TypeError,
+        nest._IF_SHALLOW_IS_SEQ_INPUT_MUST_BE_SEQ.format(type(input_tree))):
+      (flattened_input_tree_paths,
+       flattened_input_tree) = get_paths_and_values(shallow_tree, input_tree)
+    (flattened_shallow_tree_paths,
+     flattened_shallow_tree) = get_paths_and_values(shallow_tree, shallow_tree)
+    self.assertEqual(flattened_shallow_tree_paths, [(0,), (1,)])
+    self.assertEqual(flattened_shallow_tree, shallow_tree)
+
+    # Using non-iterable elements.
+    input_tree = 0
+    shallow_tree = [9]
+    with self.assertRaisesWithLiteralMatch(
+        TypeError,
+        nest._IF_SHALLOW_IS_SEQ_INPUT_MUST_BE_SEQ.format(type(input_tree))):
+      (flattened_input_tree_paths,
+       flattened_input_tree) = get_paths_and_values(shallow_tree, input_tree)
+    (flattened_shallow_tree_paths,
+     flattened_shallow_tree) = get_paths_and_values(shallow_tree, shallow_tree)
+    self.assertEqual(flattened_shallow_tree_paths, [(0,)])
+    self.assertEqual(flattened_shallow_tree, shallow_tree)
+
+    input_tree = 0
+    shallow_tree = [9, 8]
+    with self.assertRaisesWithLiteralMatch(
+        TypeError,
+        nest._IF_SHALLOW_IS_SEQ_INPUT_MUST_BE_SEQ.format(type(input_tree))):
+      (flattened_input_tree_paths,
+       flattened_input_tree) = get_paths_and_values(shallow_tree, input_tree)
+    (flattened_shallow_tree_paths,
+     flattened_shallow_tree) = get_paths_and_values(shallow_tree, shallow_tree)
+    self.assertEqual(flattened_shallow_tree_paths, [(0,), (1,)])
+    self.assertEqual(flattened_shallow_tree, shallow_tree)
+
   def testMapStructureUpTo(self):
     # Named tuples.
     ab_tuple = collections.namedtuple("ab_tuple", "a, b")
@@ -719,7 +955,9 @@ class NestTest(parameterized.TestCase, test.TestCase):
     # Non-equal dicts.
     inp_val = dict(a=2, b=3)
     inp_ops = dict(a=dict(add=1, mul=2), c=dict(add=2, mul=3))
-    with self.assertRaisesRegexp(ValueError, "same keys"):
+    with self.assertRaisesWithLiteralMatch(
+        ValueError,
+        nest._SHALLOW_TREE_HAS_INVALID_KEYS.format(["b"])):
       nest.map_structure_up_to(
           inp_val,
           lambda val, ops: (val + ops["add"]) * ops["mul"], inp_val, inp_ops)
@@ -736,7 +974,9 @@ class NestTest(parameterized.TestCase, test.TestCase):
     # Non-equal dict/mapping.
     inp_val = dict(a=2, b=3)
     inp_ops = _CustomMapping(a=dict(add=1, mul=2), c=dict(add=2, mul=3))
-    with self.assertRaisesRegexp(ValueError, "same keys"):
+    with self.assertRaisesWithLiteralMatch(
+        ValueError,
+        nest._SHALLOW_TREE_HAS_INVALID_KEYS.format(["b"])):
       nest.map_structure_up_to(
           inp_val,
           lambda val, ops: (val + ops["add"]) * ops["mul"], inp_val, inp_ops)
@@ -791,37 +1031,46 @@ class NestTest(parameterized.TestCase, test.TestCase):
       expected = inputs_expected["expected"]
       self.assertEqual(list(nest.yield_flat_paths(inputs)), expected)
 
-  def testFlattenWithStringPaths(self):
-    for inputs_expected in (
-        {"inputs": [], "expected": []},
-        {"inputs": [23, "42"], "expected": [("0", 23), ("1", "42")]},
-        {"inputs": [[[[108]]]], "expected": [("0/0/0/0", 108)]}):
-      inputs = inputs_expected["inputs"]
-      expected = inputs_expected["expected"]
-      self.assertEqual(
-          nest.flatten_with_joined_string_paths(inputs, separator="/"),
-          expected)
-
-  # Need a separate test for namedtuple as we can't declare tuple definitions
-  # in the @parameterized arguments.
-  def testFlattenNamedTuple(self):
-    # pylint: disable=invalid-name
-    Foo = collections.namedtuple("Foo", ["a", "b"])
-    Bar = collections.namedtuple("Bar", ["c", "d"])
-    # pylint: enable=invalid-name
-    test_cases = [
-        (Foo(a=3, b=Bar(c=23, d=42)),
-         [("a", 3), ("b/c", 23), ("b/d", 42)]),
-        (Foo(a=Bar(c=23, d=42), b=Bar(c=0, d="something")),
-         [("a/c", 23), ("a/d", 42), ("b/c", 0), ("b/d", "something")]),
-        (Bar(c=42, d=43),
-         [("c", 42), ("d", 43)]),
-        (Bar(c=[42], d=43),
-         [("c/0", 42), ("d", 43)]),
-    ]
-    for inputs, expected in test_cases:
-      self.assertEqual(
-          list(nest.flatten_with_joined_string_paths(inputs)), expected)
+  # We cannot define namedtuples within @parameterized argument lists.
+  # pylint: disable=invalid-name
+  Foo = collections.namedtuple("Foo", ["a", "b"])
+  Bar = collections.namedtuple("Bar", ["c", "d"])
+  # pylint: enable=invalid-name
+
+  @parameterized.parameters([
+      dict(inputs=[], expected=[]),
+      dict(inputs=[23, "42"], expected=[("0", 23), ("1", "42")]),
+      dict(inputs=[[[[108]]]], expected=[("0/0/0/0", 108)]),
+      dict(inputs=Foo(a=3, b=Bar(c=23, d=42)),
+           expected=[("a", 3), ("b/c", 23), ("b/d", 42)]),
+      dict(inputs=Foo(a=Bar(c=23, d=42), b=Bar(c=0, d="thing")),
+           expected=[("a/c", 23), ("a/d", 42), ("b/c", 0), ("b/d", "thing")]),
+      dict(inputs=Bar(c=42, d=43),
+           expected=[("c", 42), ("d", 43)]),
+      dict(inputs=Bar(c=[42], d=43),
+           expected=[("c/0", 42), ("d", 43)]),
+  ])
+  def testFlattenWithStringPaths(self, inputs, expected):
+    self.assertEqual(
+        nest.flatten_with_joined_string_paths(inputs, separator="/"),
+        expected)
+
+  @parameterized.parameters([
+      dict(inputs=[], expected=[]),
+      dict(inputs=[23, "42"], expected=[((0,), 23), ((1,), "42")]),
+      dict(inputs=[[[[108]]]], expected=[((0, 0, 0, 0), 108)]),
+      dict(inputs=Foo(a=3, b=Bar(c=23, d=42)),
+           expected=[(("a",), 3), (("b", "c"), 23), (("b", "d"), 42)]),
+      dict(inputs=Foo(a=Bar(c=23, d=42), b=Bar(c=0, d="thing")),
+           expected=[(("a", "c"), 23), (("a", "d"), 42), (("b", "c"), 0),
+                     (("b", "d"), "thing")]),
+      dict(inputs=Bar(c=42, d=43),
+           expected=[(("c",), 42), (("d",), 43)]),
+      dict(inputs=Bar(c=[42], d=43),
+           expected=[(("c", 0), 42), (("d",), 43)]),
+  ])
+  def testFlattenWithTuplePaths(self, inputs, expected):
+    self.assertEqual(nest.flatten_with_tuple_paths(inputs), expected)
 
   @parameterized.named_parameters(
       ("tuples", (1, 2), (3, 4), True, (("0", 4), ("1", 6))),
@@ -840,18 +1089,55 @@ class NestTest(parameterized.TestCase, test.TestCase):
     self.assertEqual(expected, result)
 
   @parameterized.named_parameters(
-      ("tuples", (1, 2), (3, 4, 5), ValueError),
+      ("tuples", (1, 2, 3), (4, 5), ValueError),
       ("dicts", {"a": 1}, {"b": 2}, ValueError),
       ("mixed", (1, 2), [3, 4], TypeError),
       ("nested",
-       {"a": [2, 3], "b": [1, 3]},
-       {"b": [5, 6, 7], "a": [8, 9]},
+       {"a": [2, 3, 4], "b": [1, 3]},
+       {"b": [5, 6], "a": [8, 9]},
        ValueError
       ))
   def testMapWithPathsIncompatibleStructures(self, s1, s2, error_type):
     with self.assertRaises(error_type):
       nest.map_structure_with_paths(lambda path, *s: 0, s1, s2)
 
+  @parameterized.named_parameters([
+      dict(testcase_name="Tuples", s1=(1, 2), s2=(3, 4),
+           check_types=True, expected=(((0,), 4), ((1,), 6))),
+      dict(testcase_name="Dicts", s1={"a": 1, "b": 2}, s2={"b": 4, "a": 3},
+           check_types=True, expected={"a": (("a",), 4), "b": (("b",), 6)}),
+      dict(testcase_name="Mixed", s1=(1, 2), s2=[3, 4],
+           check_types=False, expected=(((0,), 4), ((1,), 6))),
+      dict(testcase_name="Nested",
+           s1={"a": [2, 3], "b": [1, 2, 3]},
+           s2={"b": [5, 6, 7], "a": [8, 9]},
+           check_types=True,
+           expected={"a": [(("a", 0), 10), (("a", 1), 12)],
+                     "b": [(("b", 0), 6), (("b", 1), 8), (("b", 2), 10)]}),
+  ])
+  def testMapWithTuplePathsCompatibleStructures(
+      self, s1, s2, check_types, expected):
+    def path_and_sum(path, *values):
+      return path, sum(values)
+    result = nest.map_structure_with_tuple_paths(
+        path_and_sum, s1, s2, check_types=check_types)
+    self.assertEqual(expected, result)
+
+  @parameterized.named_parameters([
+      dict(testcase_name="Tuples", s1=(1, 2, 3), s2=(4, 5),
+           error_type=ValueError),
+      dict(testcase_name="Dicts", s1={"a": 1}, s2={"b": 2},
+           error_type=ValueError),
+      dict(testcase_name="Mixed", s1=(1, 2), s2=[3, 4], error_type=TypeError),
+      dict(testcase_name="Nested",
+           s1={"a": [2, 3, 4], "b": [1, 3]},
+           s2={"b": [5, 6], "a": [8, 9]},
+           error_type=ValueError)
+  ])
+  def testMapWithTuplePathsIncompatibleStructures(self, s1, s2, error_type):
+    with self.assertRaises(error_type):
+      nest.map_structure_with_tuple_paths(lambda path, *s: 0, s1, s2)
+
 
 class NestBenchmark(test.Benchmark):
 
diff --git a/tensorflow/python/util/serialization.py b/tensorflow/python/util/serialization.py
index cff864c0304b02aaa6339efb403388c65ab6fec4..2164ba4dbf22b46e7fad3ac45a164ddbdd2f01c0 100644
--- a/tensorflow/python/util/serialization.py
+++ b/tensorflow/python/util/serialization.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
+
 import numpy as np
 
 from tensorflow.python.framework import tensor_shape
@@ -61,4 +63,7 @@ def get_json_type(obj):
   if isinstance(obj, tensor_shape.TensorShape):
     return obj.as_list()
 
+  if isinstance(obj, collections.Mapping):
+    return dict(obj)
+
   raise TypeError('Not JSON Serializable:', obj)
diff --git a/tensorflow/python/util/tf_decorator.py b/tensorflow/python/util/tf_decorator.py
index 0cfc836246d2d885c28d168fe90b08a325cf6ded..f5ce5bd42a5b1c844931131ea9a969431fa2ef49 100644
--- a/tensorflow/python/util/tf_decorator.py
+++ b/tensorflow/python/util/tf_decorator.py
@@ -59,7 +59,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import functools as _functools
 import traceback as _traceback
 
 
@@ -95,9 +94,17 @@ def make_decorator(target,
     decorator_func.__name__ = target.__name__
   if hasattr(target, '__module__'):
     decorator_func.__module__ = target.__module__
+  if hasattr(target, '__dict__'):
+    # Copy dict entries from target which are not overridden by decorator_func.
+    for name in target.__dict__:
+      if name not in decorator_func.__dict__:
+        decorator_func.__dict__[name] = target.__dict__[name]
   if hasattr(target, '__doc__'):
     decorator_func.__doc__ = decorator.__doc__
   decorator_func.__wrapped__ = target
+  # Keeping a second handle to `target` allows callers to detect whether the
+  # decorator was modified using `rewrap`.
+  decorator_func.__original_wrapped__ = target
   return decorator_func
 
 
@@ -173,6 +180,8 @@ def unwrap(maybe_tf_decorator):
       decorators.append(getattr(cur, '_tf_decorator'))
     else:
       break
+    if not hasattr(decorators[-1], 'decorated_target'):
+      break
     cur = decorators[-1].decorated_target
   return decorators, cur
 
@@ -202,8 +211,8 @@ class TFDecorator(object):
     else:
       self.__doc__ = ''
 
-  def __get__(self, obj, objtype):
-    return _functools.partial(self.__call__, obj)
+  def __get__(self, instance, owner):
+    return self._decorated_target.__get__(instance, owner)
 
   def __call__(self, *args, **kwargs):
     return self._decorated_target(*args, **kwargs)
diff --git a/tensorflow/python/util/tf_decorator_test.py b/tensorflow/python/util/tf_decorator_test.py
index 9198f0b3fad1590bedac71b30cf332e35cb489fe..48d735189cdb0acb394747aa3a99864393ccda7b 100644
--- a/tensorflow/python/util/tf_decorator_test.py
+++ b/tensorflow/python/util/tf_decorator_test.py
@@ -170,6 +170,17 @@ class TfDecoratorTest(test.TestCase):
     self.assertEqual('Return parameters.',
                      TestDecoratedClass().return_params.__doc__)
 
+  def testTarget__get__IsProxied(self):
+    class Descr(object):
+
+      def __get__(self, instance, owner):
+        return self
+
+    class Foo(object):
+      foo = tf_decorator.TFDecorator('Descr', Descr())
+
+    self.assertIsInstance(Foo.foo, Descr)
+
 
 def test_wrapper(*args, **kwargs):
   return test_function(*args, **kwargs)
@@ -199,6 +210,20 @@ class TfMakeDecoratorTest(test.TestCase):
     decorator = getattr(decorated, '_tf_decorator')
     self.assertEqual('test decorator doc', decorator.decorator_doc)
 
+  def testUpdatesDictWithMissingEntries(self):
+    test_function.foobar = True
+    decorated = tf_decorator.make_decorator(test_function, test_wrapper)
+    self.assertTrue(decorated.foobar)
+    del test_function.foobar
+
+  def testUpdatesDict_doesNotOverridePresentEntries(self):
+    test_function.foobar = True
+    test_wrapper.foobar = False
+    decorated = tf_decorator.make_decorator(test_function, test_wrapper)
+    self.assertFalse(decorated.foobar)
+    del test_function.foobar
+    del test_wrapper.foobar
+
   def testSetsTFDecoratorArgSpec(self):
     argspec = tf_inspect.ArgSpec(
         args=['a', 'b', 'c'],
diff --git a/tensorflow/python/util/tf_export.py b/tensorflow/python/util/tf_export.py
index ec70cae7d2fc00f793e8ffa0aec331e32e11115f..7b44d72ab94d675f4305b9dbdac46628dde01d4a 100644
--- a/tensorflow/python/util/tf_export.py
+++ b/tensorflow/python/util/tf_export.py
@@ -46,8 +46,10 @@ import functools
 import sys
 
 from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_inspect
 
 ESTIMATOR_API_NAME = 'estimator'
+KERAS_API_NAME = 'keras'
 TENSORFLOW_API_NAME = 'tensorflow'
 
 # List of subpackage names used by TensorFlow components. Have to check that
@@ -64,7 +66,10 @@ API_ATTRS = {
         '_tf_api_constants'),
     ESTIMATOR_API_NAME: _Attributes(
         '_estimator_api_names',
-        '_estimator_api_constants')
+        '_estimator_api_constants'),
+    KERAS_API_NAME: _Attributes(
+        '_keras_api_names',
+        '_keras_api_constants')
 }
 
 API_ATTRS_V1 = {
@@ -73,7 +78,10 @@ API_ATTRS_V1 = {
         '_tf_api_constants_v1'),
     ESTIMATOR_API_NAME: _Attributes(
         '_estimator_api_names_v1',
-        '_estimator_api_constants_v1')
+        '_estimator_api_constants_v1'),
+    KERAS_API_NAME: _Attributes(
+        '_keras_api_names_v1',
+        '_keras_api_constants_v1')
 }
 
 
@@ -147,10 +155,104 @@ def get_canonical_name(api_names, deprecated_api_names):
   return None
 
 
+def get_v1_names(symbol):
+  """Get a list of TF 1.* names for this symbol.
+
+  Args:
+    symbol: symbol to get API names for.
+
+  Returns:
+    List of all API names for this symbol including TensorFlow and
+    Estimator names.
+  """
+  names_v1 = []
+  tensorflow_api_attr_v1 = API_ATTRS_V1[TENSORFLOW_API_NAME].names
+  estimator_api_attr_v1 = API_ATTRS_V1[ESTIMATOR_API_NAME].names
+  keras_api_attr_v1 = API_ATTRS_V1[KERAS_API_NAME].names
+
+  if not hasattr(symbol, tensorflow_api_attr_v1):
+    return names_v1
+  if tensorflow_api_attr_v1 in symbol.__dict__:
+    names_v1.extend(getattr(symbol, tensorflow_api_attr_v1))
+  if estimator_api_attr_v1 in symbol.__dict__:
+    names_v1.extend(getattr(symbol, estimator_api_attr_v1))
+  if keras_api_attr_v1 in symbol.__dict__:
+    names_v1.extend(getattr(symbol, keras_api_attr_v1))
+  return names_v1
+
+
+def get_v2_names(symbol):
+  """Get a list of TF 2.0 names for this symbol.
+
+  Args:
+    symbol: symbol to get API names for.
+
+  Returns:
+    List of all API names for this symbol including TensorFlow and
+    Estimator names.
+  """
+  names_v2 = []
+  tensorflow_api_attr = API_ATTRS[TENSORFLOW_API_NAME].names
+  estimator_api_attr = API_ATTRS[ESTIMATOR_API_NAME].names
+  keras_api_attr = API_ATTRS[KERAS_API_NAME].names
+
+  if not hasattr(symbol, tensorflow_api_attr):
+    return names_v2
+  if tensorflow_api_attr in symbol.__dict__:
+    names_v2.extend(getattr(symbol, tensorflow_api_attr))
+  if estimator_api_attr in symbol.__dict__:
+    names_v2.extend(getattr(symbol, estimator_api_attr))
+  if keras_api_attr in symbol.__dict__:
+    names_v2.extend(getattr(symbol, keras_api_attr))
+  return names_v2
+
+
+def get_v1_constants(module):
+  """Get a list of TF 1.* constants in this module.
+
+  Args:
+    module: TensorFlow module.
+
+  Returns:
+    List of all API constants under the given module including TensorFlow and
+    Estimator constants.
+  """
+  constants_v1 = []
+  tensorflow_constants_attr_v1 = API_ATTRS_V1[TENSORFLOW_API_NAME].constants
+  estimator_constants_attr_v1 = API_ATTRS_V1[ESTIMATOR_API_NAME].constants
+
+  if hasattr(module, tensorflow_constants_attr_v1):
+    constants_v1.extend(getattr(module, tensorflow_constants_attr_v1))
+  if hasattr(module, estimator_constants_attr_v1):
+    constants_v1.extend(getattr(module, estimator_constants_attr_v1))
+  return constants_v1
+
+
+def get_v2_constants(module):
+  """Get a list of TF 2.0 constants in this module.
+
+  Args:
+    module: TensorFlow module.
+
+  Returns:
+    List of all API constants under the given module including TensorFlow and
+    Estimator constants.
+  """
+  constants_v2 = []
+  tensorflow_constants_attr = API_ATTRS[TENSORFLOW_API_NAME].constants
+  estimator_constants_attr = API_ATTRS[ESTIMATOR_API_NAME].constants
+
+  if hasattr(module, tensorflow_constants_attr):
+    constants_v2.extend(getattr(module, tensorflow_constants_attr))
+  if hasattr(module, estimator_constants_attr):
+    constants_v2.extend(getattr(module, estimator_constants_attr))
+  return constants_v2
+
+
 class api_export(object):  # pylint: disable=invalid-name
   """Provides ways to export symbols to the TensorFlow API."""
 
-  def __init__(self, *args, **kwargs):
+  def __init__(self, *args, **kwargs):  # pylint: disable=g-doc-args
     """Export under the names *args (first one is considered canonical).
 
     Args:
@@ -168,6 +270,10 @@ class api_export(object):  # pylint: disable=invalid-name
     """
     self._names = args
     self._names_v1 = kwargs.get('v1', args)
+    if 'v2' in kwargs:
+      raise ValueError('You passed a "v2" argument to tf_export. This is not '
+                       'what you want. Pass v2 names directly as positional '
+                       'arguments instead.')
     self._api_name = kwargs.get('api_name', TENSORFLOW_API_NAME)
     self._overrides = kwargs.get('overrides', [])
     self._allow_multiple_exports = kwargs.get('allow_multiple_exports', False)
@@ -274,5 +380,15 @@ class api_export(object):  # pylint: disable=invalid-name
         (self._names_v1, name))
 
 
+def kwarg_only(f):
+  """A wrapper that throws away all non-kwarg arguments."""
+  def wrapper(**kwargs):
+    return f(**kwargs)
+
+  return tf_decorator.make_decorator(
+      f, wrapper, decorator_argspec=tf_inspect.getargspec(f))
+
+
 tf_export = functools.partial(api_export, api_name=TENSORFLOW_API_NAME)
 estimator_export = functools.partial(api_export, api_name=ESTIMATOR_API_NAME)
+keras_export = functools.partial(api_export, api_name=KERAS_API_NAME)
diff --git a/tensorflow/python/util/tf_export_test.py b/tensorflow/python/util/tf_export_test.py
index a0fac8bf362627e6802821e3b33c0f107c5c97ce..20625792e9bf88ebca34ba00a885742c6d6f745f 100644
--- a/tensorflow/python/util/tf_export_test.py
+++ b/tensorflow/python/util/tf_export_test.py
@@ -62,6 +62,10 @@ class ValidateExportTest(test.TestCase):
         del symbol._tf_api_names
       if hasattr(symbol, '_tf_api_names_v1'):
         del symbol._tf_api_names_v1
+      if hasattr(symbol, '_estimator_api_names'):
+        del symbol._estimator_api_names
+      if hasattr(symbol, '_estimator_api_names_v1'):
+        del symbol._estimator_api_names_v1
 
   def _CreateMockModule(self, name):
     mock_module = self.MockModule(name)
@@ -74,6 +78,10 @@ class ValidateExportTest(test.TestCase):
     decorated_function = export_decorator(_test_function)
     self.assertEquals(decorated_function, _test_function)
     self.assertEquals(('nameA', 'nameB'), decorated_function._tf_api_names)
+    self.assertEquals(['nameA', 'nameB'],
+                      tf_export.get_v1_names(decorated_function))
+    self.assertEquals(['nameA', 'nameB'],
+                      tf_export.get_v2_names(decorated_function))
 
   def testExportMultipleFunctions(self):
     export_decorator1 = tf_export.tf_export('nameA', 'nameB')
@@ -95,6 +103,22 @@ class ValidateExportTest(test.TestCase):
     export_decorator_b(TestClassB)
     self.assertEquals(('TestClassA1',), TestClassA._tf_api_names)
     self.assertEquals(('TestClassB1',), TestClassB._tf_api_names)
+    self.assertEquals(['TestClassA1'], tf_export.get_v1_names(TestClassA))
+    self.assertEquals(['TestClassB1'], tf_export.get_v1_names(TestClassB))
+
+  def testExportClassInEstimator(self):
+    export_decorator_a = tf_export.tf_export('TestClassA1')
+    export_decorator_a(TestClassA)
+    self.assertEquals(('TestClassA1',), TestClassA._tf_api_names)
+
+    export_decorator_b = tf_export.estimator_export(
+        'estimator.TestClassB1')
+    export_decorator_b(TestClassB)
+    self.assertTrue('_tf_api_names' not in TestClassB.__dict__)
+    self.assertEquals(('TestClassA1',), TestClassA._tf_api_names)
+    self.assertEquals(['TestClassA1'], tf_export.get_v1_names(TestClassA))
+    self.assertEquals(['estimator.TestClassB1'],
+                      tf_export.get_v1_names(TestClassB))
 
   def testExportSingleConstant(self):
     module1 = self._CreateMockModule('module1')
@@ -103,6 +127,10 @@ class ValidateExportTest(test.TestCase):
     export_decorator.export_constant('module1', 'test_constant')
     self.assertEquals([(('NAME_A', 'NAME_B'), 'test_constant')],
                       module1._tf_api_constants)
+    self.assertEquals([(('NAME_A', 'NAME_B'), 'test_constant')],
+                      tf_export.get_v1_constants(module1))
+    self.assertEquals([(('NAME_A', 'NAME_B'), 'test_constant')],
+                      tf_export.get_v2_constants(module1))
 
   def testExportMultipleConstants(self):
     module1 = self._CreateMockModule('module1')
diff --git a/tensorflow/python/util/tf_inspect.py b/tensorflow/python/util/tf_inspect.py
index 5f1e776640df3e2b75e6a0b8accfce40098cf36c..3a1c4a6e12d66d7395f9c6b29ab53b3ce05da793 100644
--- a/tensorflow/python/util/tf_inspect.py
+++ b/tensorflow/python/util/tf_inspect.py
@@ -36,6 +36,19 @@ else:
       'annotations'
   ])
 
+
+def _convert_maybe_argspec_to_fullargspec(argspec):
+  if isinstance(argspec, FullArgSpec):
+    return argspec
+  return FullArgSpec(
+      args=argspec.args,
+      varargs=argspec.varargs,
+      varkw=argspec.keywords,
+      defaults=argspec.defaults,
+      kwonlyargs=[],
+      kwonlydefaults=None,
+      annotations={})
+
 if hasattr(_inspect, 'getfullargspec'):
   _getfullargspec = _inspect.getfullargspec  # pylint: disable=invalid-name
 
@@ -74,16 +87,7 @@ else:
     Returns:
       A FullArgSpec with empty kwonlyargs, kwonlydefaults and annotations.
     """
-    argspecs = getargspec(target)
-    fullargspecs = FullArgSpec(
-        args=argspecs.args,
-        varargs=argspecs.varargs,
-        varkw=argspecs.keywords,
-        defaults=argspecs.defaults,
-        kwonlyargs=[],
-        kwonlydefaults=None,
-        annotations={})
-    return fullargspecs
+    return _convert_maybe_argspec_to_fullargspec(getargspec(target))
 
 
 def currentframe():
@@ -238,7 +242,7 @@ def getfullargspec(obj):
     directly on the callable.
   """
   decorators, target = tf_decorator.unwrap(obj)
-  return next((d.decorator_argspec
+  return next((_convert_maybe_argspec_to_fullargspec(d.decorator_argspec)
                for d in decorators
                if d.decorator_argspec is not None), _getfullargspec(target))
 
@@ -380,3 +384,22 @@ def isroutine(object):  # pylint: disable=redefined-builtin
 def stack(context=1):
   """TFDecorator-aware replacement for inspect.stack."""
   return _inspect.stack(context)[1:]
+
+
+def getsource_no_unwrap(obj):
+  """Return source code for an object. Does not unwrap TFDecorators.
+
+  The source code is returned literally, including indentation for functions not
+  at the top level. This function is analogous to inspect.getsource, with one
+  key difference - it doesn't unwrap decorators. For simplicity, support for
+  some Python object types is dropped (tracebacks, frames, code objects).
+
+  Args:
+      obj: a class, method, or function object.
+
+  Returns:
+      source code as a string
+
+  """
+  lines, lnum = _inspect.findsource(obj)
+  return ''.join(_inspect.getblock(lines[lnum:]))
diff --git a/tensorflow/python/util/tf_inspect_test.py b/tensorflow/python/util/tf_inspect_test.py
index 02d075cdff97fc11274186b42e10d71744234364..910848e67f970ee8abf79e05414cd03c81bcdfab 100644
--- a/tensorflow/python/util/tf_inspect_test.py
+++ b/tensorflow/python/util/tf_inspect_test.py
@@ -727,6 +727,73 @@ class TfInspectGetCallArgsTest(test.TestCase):
         'c': 'goodbye'
     }, tf_inspect.getcallargs(decorated, 4, c='goodbye'))
 
+  def testGetSourceNoUnwrapHandlesPlainDecorator(self):
+    def dec(f):
+      def wrapper(*args, **kwargs):
+        return f(*args, **kwargs)
+      return wrapper
+
+    @dec
+    def f():
+      return 1
+
+    source = tf_inspect.getsource_no_unwrap(f)
+    self.assertNotIn('dec', source)
+    self.assertIn('wrapper', source)
+    self.assertNotIn('return 1', source)
+
+  def testGetSourceNoUnwrapHandlesFunctoolsDecorator(self):
+    def dec(f):
+      @functools.wraps(f)
+      def wrapper(*args, **kwargs):
+        return f(*args, **kwargs)
+      return wrapper
+
+    @dec
+    def f():
+      return 1
+
+    source = tf_inspect.getsource_no_unwrap(f)
+    self.assertNotIn('dec', source)
+    self.assertIn('wrapper', source)
+    self.assertNotIn('return 1', source)
+
+  def testGetSourceNoUnwrapHandlesPlainDecoratorFactory(self):
+    def dec_factory():
+      def dec(f):
+        def wrapper(*args, **kwargs):
+          return f(*args, **kwargs)
+        return wrapper
+      return dec
+
+    @dec_factory()
+    def f():
+      return 1
+
+    source = tf_inspect.getsource_no_unwrap(f)
+    self.assertNotIn('factory', source)
+    self.assertNotIn('dec', source)
+    self.assertIn('wrapper', source)
+    self.assertNotIn('return 1', source)
+
+  def testGetSourceNoUnwrapHandlesFunctoolsDecoratorFactory(self):
+    def dec_factory():
+      def dec(f):
+        @functools.wraps(f)
+        def wrapper(*args, **kwargs):
+          return f(*args, **kwargs)
+        return wrapper
+      return dec
+
+    @dec_factory()
+    def f():
+      return 1
+
+    source = tf_inspect.getsource_no_unwrap(f)
+    self.assertNotIn('factory', source)
+    self.assertNotIn('dec', source)
+    self.assertIn('wrapper', source)
+    self.assertNotIn('return 1', source)
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/util/util.cc b/tensorflow/python/util/util.cc
index e69eec73a0ef8b37f042d9a0f5bf63569b6f5b39..bda0cba82fa31528337cd35d26f5daa577a43d55 100644
--- a/tensorflow/python/util/util.cc
+++ b/tensorflow/python/util/util.cc
@@ -455,6 +455,14 @@ class SparseTensorValueIterator : public ValueIterator {
   Safe_PyObjectPtr tensor_;
 };
 
+// Returns nullptr (to raise an exception) when next() is called.  Caller
+// should have already called PyErr_SetString.
+class ErrorValueIterator : public ValueIterator {
+ public:
+  ErrorValueIterator() {}
+  Safe_PyObjectPtr next() override { return nullptr; }
+};
+
 class AttrsValueIterator : public ValueIterator {
  public:
   explicit AttrsValueIterator(PyObject* nested) : nested_(nested) {
@@ -497,6 +505,35 @@ bool IsSparseTensorValueType(PyObject* o) {
              o, reinterpret_cast<PyTypeObject*>(sparse_tensor_value_type)) == 1;
 }
 
+// Returns 1 if `o` is an instance of CompositeTensor.
+// Returns 0 otherwise.
+// Returns -1 if an error occurred.
+bool IsCompositeTensorHelper(PyObject* o) {
+  static auto* const check_cache = new CachedTypeCheck([](PyObject* to_check) {
+    PyObject* composite_tensor_type = GetRegisteredType("CompositeTensor");
+    if (TF_PREDICT_FALSE(composite_tensor_type == nullptr)) {
+      PyErr_SetString(PyExc_RuntimeError,
+                      tensorflow::strings::StrCat(
+                          "CompositeTensor type has not been set. "
+                          "Please register the type with the identifier "
+                          "\"CompositeTensor\" using RegisterType.")
+                          .c_str());
+      return -1;
+    }
+    int is_instance = PyObject_IsInstance(to_check, composite_tensor_type);
+
+    // Don't cache a failed is_instance check.
+    if (is_instance == -1) return -1;
+
+    return static_cast<int>(is_instance != 0);
+  });
+  return check_cache->CachedLookup(o);
+}
+
+int IsSequenceOrCompositeHelper(PyObject* o) {
+  return IsSequence(o) || IsCompositeTensor(o);
+}
+
 int IsSequenceForDataHelper(PyObject* o) {
   return IsSequenceHelper(o) == 1 && !PyList_Check(o) &&
          !IsSparseTensorValueType(o);
@@ -529,6 +566,18 @@ ValueIteratorPtr GetValueIteratorForData(PyObject* nested) {
   }
 }
 
+// Similar to GetValueIterator above, but expands CompositeTensors.
+ValueIteratorPtr GetValueIteratorForComposite(PyObject* nested) {
+  if (IsCompositeTensor(nested)) {
+    static char expand_method_name[] = "_to_components";
+    nested = PyObject_CallMethod(nested, expand_method_name, nullptr);
+    if (PyErr_Occurred() || nested == nullptr) {
+      return absl::make_unique<ErrorValueIterator>();
+    }
+  }
+  return GetValueIterator(nested);
+}
+
 bool FlattenHelper(
     PyObject* nested, PyObject* list,
     const std::function<int(PyObject*)>& is_sequence_helper,
@@ -596,7 +645,8 @@ void SetDifferentKeysError(PyObject* dict1, PyObject* dict2, string* error_msg,
 bool AssertSameStructureHelper(
     PyObject* o1, PyObject* o2, bool check_types, string* error_msg,
     bool* is_type_error,
-    const std::function<int(PyObject*)>& is_sequence_helper) {
+    const std::function<int(PyObject*)>& is_sequence_helper,
+    const std::function<ValueIteratorPtr(PyObject*)>& value_iterator_getter) {
   DCHECK(error_msg);
   DCHECK(is_type_error);
   const bool is_seq1 = is_sequence_helper(o1);
@@ -702,8 +752,8 @@ bool AssertSameStructureHelper(
     }
   }
 
-  ValueIteratorPtr iter1 = GetValueIterator(o1);
-  ValueIteratorPtr iter2 = GetValueIterator(o2);
+  ValueIteratorPtr iter1 = value_iterator_getter(o1);
+  ValueIteratorPtr iter2 = value_iterator_getter(o2);
 
   if (!iter1->valid() || !iter2->valid()) return false;
 
@@ -714,9 +764,9 @@ bool AssertSameStructureHelper(
       if (Py_EnterRecursiveCall(" in assert_same_structure")) {
         return false;
       }
-      bool no_internal_errors =
-          AssertSameStructureHelper(v1.get(), v2.get(), check_types, error_msg,
-                                    is_type_error, is_sequence_helper);
+      bool no_internal_errors = AssertSameStructureHelper(
+          v1.get(), v2.get(), check_types, error_msg, is_type_error,
+          is_sequence_helper, value_iterator_getter);
       Py_LeaveRecursiveCall();
       if (!no_internal_errors) return false;
       if (!error_msg->empty()) return true;
@@ -742,9 +792,13 @@ bool IsAttrs(PyObject* o) { return IsAttrsHelper(o) == 1; }
 bool IsTensor(PyObject* o) { return IsTensorHelper(o) == 1; }
 bool IsIndexedSlices(PyObject* o) { return IsIndexedSlicesHelper(o) == 1; }
 
-PyObject* Flatten(PyObject* nested) {
+PyObject* Flatten(PyObject* nested, bool expand_composites) {
   PyObject* list = PyList_New(0);
-  if (FlattenHelper(nested, list, IsSequenceHelper, GetValueIterator)) {
+  const std::function<int(PyObject*)>& is_sequence_helper =
+      expand_composites ? IsSequenceOrCompositeHelper : IsSequenceHelper;
+  const std::function<ValueIteratorPtr(PyObject*)>& get_value_iterator =
+      expand_composites ? GetValueIteratorForComposite : GetValueIterator;
+  if (FlattenHelper(nested, list, is_sequence_helper, get_value_iterator)) {
     return list;
   } else {
     Py_DECREF(list);
@@ -752,6 +806,12 @@ PyObject* Flatten(PyObject* nested) {
   }
 }
 
+bool IsSequenceOrComposite(PyObject* o) {
+  return IsSequenceOrCompositeHelper(o) == 1;
+}
+
+bool IsCompositeTensor(PyObject* o) { return IsCompositeTensorHelper(o) == 1; }
+
 bool IsSequenceForData(PyObject* o) { return IsSequenceForDataHelper(o) == 1; }
 
 PyObject* FlattenForData(PyObject* nested) {
@@ -850,11 +910,16 @@ PyObject* SameNamedtuples(PyObject* o1, PyObject* o2) {
   }
 }
 
-PyObject* AssertSameStructure(PyObject* o1, PyObject* o2, bool check_types) {
+PyObject* AssertSameStructure(PyObject* o1, PyObject* o2, bool check_types,
+                              bool expand_composites) {
+  const std::function<int(PyObject*)>& is_sequence_helper =
+      expand_composites ? IsSequenceOrCompositeHelper : IsSequenceHelper;
+  const std::function<ValueIteratorPtr(PyObject*)>& get_value_iterator =
+      expand_composites ? GetValueIteratorForComposite : GetValueIterator;
   string error_msg;
   bool is_type_error = false;
   AssertSameStructureHelper(o1, o2, check_types, &error_msg, &is_type_error,
-                            IsSequenceHelper);
+                            is_sequence_helper, get_value_iterator);
   if (PyErr_Occurred()) {
     // Don't hide Python exceptions while checking (e.g. errors fetching keys
     // from custom mappings).
@@ -878,7 +943,7 @@ PyObject* AssertSameStructureForData(PyObject* o1, PyObject* o2,
   string error_msg;
   bool is_type_error = false;
   AssertSameStructureHelper(o1, o2, check_types, &error_msg, &is_type_error,
-                            IsSequenceForDataHelper);
+                            IsSequenceForDataHelper, GetValueIterator);
   if (PyErr_Occurred()) {
     // Don't hide Python exceptions while checking (e.g. errors fetching keys
     // from custom mappings).
diff --git a/tensorflow/python/util/util.h b/tensorflow/python/util/util.h
index f37cd527d819fad36bcac7b914e416bf788c8cb3..4a5db93401c328c056d80f678dd47d66306d53b3 100644
--- a/tensorflow/python/util/util.h
+++ b/tensorflow/python/util/util.h
@@ -33,6 +33,30 @@ namespace swig {
 //   dict.
 bool IsSequence(PyObject* o);
 
+// Implements the same interface as nest.is_sequence_or_composite
+// Returns a true if its input is a collections.Sequence (except strings)
+// or a CompositeTensor.
+//
+// Args:
+//   seq: an input sequence.
+//
+// Returns:
+//   True if the sequence is a not a string and is a collections.Sequence or a
+//   dict or a CompositeTensor.
+bool IsSequenceOrComposite(PyObject* o);
+
+// Implements the same interface as nest.is_sequence_or_composite
+// Returns a true if its input is a collections.Sequence (except strings)
+// or a CompositeTensor.
+//
+// Args:
+//   seq: an input sequence.
+//
+// Returns:
+//   True if the sequence is a not a string and is a collections.Sequence or a
+//   dict or a CompositeTensor.
+bool IsCompositeTensor(PyObject* o);
+
 // Implements the same interface as tensorflow.util.nest._is_namedtuple
 // Returns Py_True iff `instance` should be considered a `namedtuple`.
 //
@@ -118,7 +142,8 @@ PyObject* SameNamedtuples(PyObject* o1, PyObject* o2);
 //
 // Returns:
 //  Py_None on success, nullptr on error.
-PyObject* AssertSameStructure(PyObject* o1, PyObject* o2, bool check_types);
+PyObject* AssertSameStructure(PyObject* o1, PyObject* o2, bool check_types,
+                              bool expand_composites);
 
 // Implements the same interface as tensorflow.util.nest.flatten
 //
@@ -139,6 +164,9 @@ PyObject* AssertSameStructure(PyObject* o1, PyObject* o2, bool check_types);
 // Args:
 //   nest: an arbitrarily nested structure or a scalar object. Note, numpy
 //       arrays are considered scalars.
+//   expand_composites: If true, then composite tensors (such as
+//       `tf.SparseTensor` and `tf.RaggedTensor` are flattened into their
+//       component tensors.
 //
 // Returns:
 //   A Python list, the flattened version of the input.
@@ -146,7 +174,7 @@ PyObject* AssertSameStructure(PyObject* o1, PyObject* o2, bool check_types);
 //
 // Raises:
 //   TypeError: The nest is or contains a dict with non-sortable keys.
-PyObject* Flatten(PyObject* nested);
+PyObject* Flatten(PyObject* nested, bool expand_composites = false);
 
 // The tensorflow.python.data package has its own nest utility that follows very
 // slightly different semantics for its functions than the tensorflow.python
diff --git a/tensorflow/python/util/util.i b/tensorflow/python/util/util.i
index 4d34d61eee65ea48ad4fbb2894699695110fc76c..6e2a3d8ccfc48bd9234e0c42229fb37dd9fa1ce4 100644
--- a/tensorflow/python/util/util.i
+++ b/tensorflow/python/util/util.i
@@ -35,7 +35,7 @@ limitations under the License.
 %noexception tensorflow::swig::IsTensor;
 
 %feature("docstring") tensorflow::swig::IsSequence
-"""Returns a true if its input is a collections.Sequence (except strings).
+"""Returns true if its input is a collections.Sequence (except strings).
 
 Args:
   seq: an input sequence.
@@ -47,6 +47,31 @@ Returns:
 %unignore tensorflow::swig::IsSequence;
 %noexception tensorflow::swig::IsSequence;
 
+%feature("docstring") tensorflow::swig::IsSequenceOrComposite
+"""Returns true if its input is a sequence or a `CompositeTensor`.
+
+Args:
+  seq: an input sequence.
+
+Returns:
+  True if the sequence is a not a string and is a collections.Sequence or a
+  dict or a CompositeTensor.
+"""
+%unignore tensorflow::swig::IsSequenceOrComposite;
+%noexception tensorflow::swig::IsSequenceOrComposite;
+
+%feature("docstring") tensorflow::swig::IsCompositeTensor
+"""Returns true if its input is a `CompositeTensor`.
+
+Args:
+  seq: an input sequence.
+
+Returns:
+  True if the sequence is a CompositeTensor.
+"""
+%unignore tensorflow::swig::IsCompositeTensor;
+%noexception tensorflow::swig::IsCompositeTensor;
+
 %unignore tensorflow::swig::IsNamedtuple;
 %noexception tensorflow::swig::IsNamedtuple;
 
@@ -103,6 +128,8 @@ running.
 Args:
   nest: an arbitrarily nested structure or a scalar object. Note, numpy
       arrays are considered scalars.
+  expand_composites: If true, then composite tensors such as `tf.SparseTensor`
+      and `tf.RaggedTensor` are expanded into their component tensors.
 
 Returns:
   A Python list, the flattened version of the input.
@@ -112,6 +139,7 @@ Raises:
 """
 %unignore tensorflow::swig::Flatten;
 %noexception tensorflow::swig::Flatten;
+%feature("kwargs") tensorflow::swig::Flatten;
 
 %feature("docstring") tensorflow::swig::IsSequenceForData
 """Returns a true if `seq` is a Sequence or dict (except strings/lists).
diff --git a/tensorflow/stream_executor/BUILD b/tensorflow/stream_executor/BUILD
index 4c764a7b099010a980c007c5cdff7f20f7ba2106..03c8d6f535f09a40cd4e0ae3fe52279eda22f607 100644
--- a/tensorflow/stream_executor/BUILD
+++ b/tensorflow/stream_executor/BUILD
@@ -1,115 +1,683 @@
-licenses(["restricted"])
+# GPU executor library for data-parallel kernel launches and cross-platform
+# HPC-library APIs.
+#
+# Throughout this file, all targets are built with the standard crosstool and
+# do not link against restricted binary blobs.
 
-load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured")
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
-load("//tensorflow/core:platform/default/build_config.bzl", "tf_additional_all_protos")
 load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
-load("//tensorflow:tensorflow.bzl", "cc_header_only_library")
+load("//tensorflow/stream_executor:build_defs.bzl", "stream_executor_friends")
+
+package_group(
+    name = "friends",
+    packages = stream_executor_friends(),
+)
+
+package(
+    default_visibility = [":friends"],
+)
+
+# Filegroup used to collect source files for the dependency check.
+filegroup(
+    name = "c_srcs",
+    data = glob([
+        "**/*.cc",
+        "**/*.h",
+    ]),
+)
+
+cc_library(
+    name = "launch_dim",
+    hdrs = [
+        "gpu_launch_dim.h",
+        "launch_dim.h",
+    ],
+    deps = [
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
+    name = "device_description",
+    srcs = ["device_description.cc"],
+    hdrs = ["device_description.h"],
+    deps = [
+        ":launch_dim",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
+    name = "event",
+    srcs = [
+        "blas.h",
+        "device_description.h",
+        "device_options.h",
+        "dnn.h",
+        "event.cc",
+        "fft.h",
+        "kernel_cache_config.h",
+        "launch_dim.h",
+        "plugin.h",
+        "plugin_registry.h",
+        "rng.h",
+        "shared_memory_config.h",
+        "stream_executor_pimpl.h",
+        "temporary_device_memory.h",
+        "temporary_memory_manager.h",
+        "trace_listener.h",
+    ],
+    hdrs = [
+        "device_memory.h",
+        "event.h",
+        "kernel.h",
+        "kernel_spec.h",
+        "platform.h",
+        "stream.h",
+        "stream_executor_internal.h",
+    ],
+    deps = [
+        ":allocator_stats",
+        ":dnn_proto_cc",
+        ":host_or_device_scalar",
+        ":stream_executor_headers",
+        "//tensorflow/core:lib",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+cc_library(
+    name = "kernel",
+    srcs = [
+        "dnn.h",
+        "fft.h",
+        "kernel.cc",
+        "plugin.h",
+        "rng.h",
+        "stream.h",
+        "stream_executor_pimpl.h",
+        "temporary_device_memory.h",
+        "temporary_memory_manager.h",
+    ],
+    hdrs = [
+        "blas.h",
+        "device_description.h",
+        "device_options.h",
+        "event.h",
+        "kernel.h",
+        "kernel_spec.h",
+        "launch_dim.h",
+        "multi_platform_manager.h",
+        "platform.h",
+        "plugin_registry.h",
+        "shared_memory_config.h",
+        "stream_executor.h",
+        "stream_executor_internal.h",
+        "timer.h",
+        "trace_listener.h",
+    ],
+    deps = [
+        ":allocator_stats",
+        ":device_memory",
+        ":dnn_proto_cc",
+        ":host_or_device_scalar",
+        ":kernel_cache_config",
+        ":stream_executor_headers",
+        "//tensorflow/core:lib",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+cc_library(
+    name = "kernel_spec",
+    srcs = ["kernel_spec.cc"],
+    hdrs = ["kernel_spec.h"],
+    deps = [
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
+    name = "kernel_cache_config",
+    hdrs = ["kernel_cache_config.h"],
+)
+
+cc_library(
+    name = "module_spec",
+    hdrs = ["module_spec.h"],
+    deps = [
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+    ],
+)
+
+cc_library(
+    name = "shared_memory_config",
+    hdrs = ["shared_memory_config.h"],
+)
+
+cc_library(
+    name = "stream_header",
+    hdrs = [
+        "blas.h",
+        "device_memory.h",
+        "dnn.h",
+        "event.h",
+        "fft.h",
+        "gpu_launch_dim.h",
+        "kernel.h",
+        "kernel_cache_config.h",
+        "launch_dim.h",
+        "stream.h",
+        "temporary_device_memory.h",
+        "temporary_memory_manager.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":allocator_stats",
+        ":dnn_proto_cc",
+        ":host_or_device_scalar",
+        ":stream_executor_headers",
+        "//tensorflow/core:lib",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+# It implements :stream_header
+cc_library(
+    name = "stream",
+    srcs = [
+        "stream.cc",
+    ],
+    hdrs = ["stream.h"],
+    deps = [
+        ":blas",
+        ":device_memory",
+        ":dnn",
+        ":event",
+        ":fft",
+        ":host_or_device_scalar",
+        ":kernel",
+        ":launch_dim",
+        ":platform",
+        ":rng",
+        ":stream_executor_headers",
+        ":stream_executor_internal",
+        ":stream_executor_pimpl",
+        ":temporary_memory_manager",
+        "//tensorflow/core:lib",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "//third_party/eigen3",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
+    name = "timer",
+    srcs = [
+        "device_description.h",
+        "kernel_cache_config.h",
+        "timer.cc",
+    ],
+    hdrs = [
+        "blas.h",
+        "kernel.h",
+        "stream.h",
+        "stream_executor.h",
+        "timer.h",
+    ],
+    deps = [
+        ":host_or_device_scalar",
+        ":platform",
+        ":stream_executor_headers",
+        ":stream_executor_internal",
+        ":stream_executor_pimpl_header",
+        "//tensorflow/core:lib",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
+    name = "platform",
+    srcs = ["platform.cc"],
+    hdrs = ["platform.h"],
+    deps = [
+        ":plugin",
+        ":stream_executor_headers",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
+    name = "rng",
+    srcs = ["rng.cc"],
+    hdrs = ["rng.h"],
+    deps = ["//tensorflow/stream_executor/platform"],
+)
+
+cc_library(
+    name = "temporary_device_memory",
+    srcs = [
+        "event.h",
+        "temporary_device_memory.cc",
+        "temporary_memory_manager.h",
+    ],
+    hdrs = ["temporary_device_memory.h"],
+    deps = [
+        ":device_memory",
+        ":stream_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+    ],
+)
+
+cc_library(
+    name = "temporary_memory_manager",
+    srcs = ["temporary_memory_manager.cc"],
+    hdrs = ["temporary_memory_manager.h"],
+    deps = [
+        ":device_memory",
+        ":stream_executor_pimpl_header",
+        ":stream_header",
+        ":temporary_device_memory",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/strings",
+    ],
+)
 
-STREAM_EXECUTOR_HEADERS = glob([
-    "*.h",
-    "cuda/*.h",
-    "host/*.h",
-    "lib/*.h",
-    "lib/gtl/*.h",
-    "platform/**/*.h",
-])
+cc_library(
+    name = "fft",
+    hdrs = ["fft.h"],
+    deps = [
+        "//tensorflow/stream_executor/platform",
+    ],
+)
+
+cc_library(
+    name = "blas",
+    srcs = ["blas.cc"],
+    hdrs = ["blas.h"],
+    deps = [
+        ":host_or_device_scalar",
+        ":stream_executor_headers",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
+    name = "device_memory",
+    hdrs = ["device_memory.h"],
+    deps = ["//tensorflow/stream_executor/platform"],
+)
+
+cc_library(
+    name = "host_or_device_scalar",
+    hdrs = ["host_or_device_scalar.h"],
+    deps = [
+        ":device_memory",
+        "//tensorflow/stream_executor/platform",
+    ],
+)
+
+cc_library(
+    name = "device_options",
+    hdrs = ["device_options.h"],
+    deps = [
+        "//tensorflow/stream_executor/platform",
+    ],
+)
+
+cc_library(
+    name = "executor_cache",
+    srcs = [
+        "device_description.h",
+        "device_memory.h",
+        "device_options.h",
+        "event.h",
+        "executor_cache.cc",
+        "launch_dim.h",
+        "plugin.h",
+        "plugin_registry.h",
+        "rng.h",
+        "stream_executor_pimpl.h",
+        "temporary_device_memory.h",
+        "temporary_memory_manager.h",
+    ],
+    hdrs = [
+        "blas.h",
+        "dnn.h",
+        "executor_cache.h",
+        "fft.h",
+        "kernel.h",
+        "kernel_cache_config.h",
+        "kernel_spec.h",
+        "platform.h",
+        "shared_memory_config.h",
+        "stream.h",
+        "stream_executor_internal.h",
+        "trace_listener.h",
+    ],
+    deps = [
+        ":allocator_stats",
+        ":dnn_proto_cc",
+        ":host_or_device_scalar",
+        ":stream_executor_headers",
+        "//tensorflow/core:lib",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+cc_library(
+    name = "multi_platform_manager",
+    srcs = ["multi_platform_manager.cc"],
+    hdrs = ["multi_platform_manager.h"],
+    deps = [
+        ":platform",
+        ":stream_executor_headers",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/synchronization",
+    ],
+)
+
+cc_library(
+    name = "plugin",
+    srcs = ["plugin.cc"],
+    hdrs = ["plugin.h"],
+)
+
+cc_library(
+    name = "plugin_registry",
+    srcs = ["plugin_registry.cc"],
+    hdrs = ["plugin_registry.h"],
+    deps = [
+        ":blas",
+        ":dnn",
+        ":fft",
+        ":multi_platform_manager",
+        ":platform",
+        ":plugin",
+        ":stream_executor_headers",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/base:core_headers",
+    ],
+)
+
+cc_library(
+    name = "scratch_allocator",
+    srcs = ["scratch_allocator.cc"],
+    hdrs = ["scratch_allocator.h"],
+    deps = [
+        ":device_memory",
+        ":stream_header",
+        ":temporary_device_memory",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+    ],
+)
 
 tf_proto_library(
     name = "dnn_proto",
     srcs = ["dnn.proto"],
     cc_api_version = 2,
     default_header = True,
-    protodeps = tf_additional_all_protos(),
+    provide_cc_alias = True,
+)
+
+tf_proto_library(
+    name = "logging_proto",
+    srcs = ["logging.proto"],
+    cc_api_version = 2,
+    protodeps = [":dnn_proto"],
+    provide_cc_alias = True,
+    visibility = [":friends"],
 )
 
 cc_library(
-    name = "stream_executor_impl",
-    srcs = glob(
-        [
-            "*.cc",
-            "host/*.cc",
-            "cuda/cuda_platform_id.cc",
-            "lib/*.cc",
-            "platform/default/*.cc",
-        ],
-        exclude = [
-            "**/*_test.cc",
-        ],
-    ),
-    hdrs = STREAM_EXECUTOR_HEADERS,
-    linkopts = select({
-        "//tensorflow:freebsd": [],
-        "//tensorflow:windows": [],
-        "//conditions:default": ["-ldl"],
-    }),
-    visibility = ["//visibility:public"],
+    name = "dnn",
+    srcs = ["dnn.cc"],
+    hdrs = ["dnn.h"],
     deps = [
-        ":dnn_proto_cc_impl",
+        ":device_memory",
+        ":dnn_proto_cc",
+        ":stream_executor_headers",
         "//tensorflow/core:lib",
-        "//tensorflow/core:ptr_util",
-        "@com_google_absl//absl/container:flat_hash_map",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
         "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/strings:str_format",
-        "@com_google_absl//absl/synchronization",
-        "@local_config_cuda//cuda:cuda_headers",
+        "@com_google_absl//absl/types:optional",
+        "@com_google_absl//absl/types:span",
     ],
-    alwayslink = 1,
 )
 
 cc_library(
-    name = "stream_executor",
-    hdrs = STREAM_EXECUTOR_HEADERS,
+    name = "stream_executor_internal",
+    srcs = [
+        "dnn.h",
+        "stream_executor_internal.cc",
+    ],
+    hdrs = [
+        "shared_memory_config.h",
+        "stream_executor_internal.h",
+    ],
+    deps = [
+        ":allocator_stats",
+        ":device_description",
+        ":device_memory",
+        ":device_options",
+        ":dnn_proto_cc",
+        ":kernel",
+        ":kernel_cache_config",
+        ":kernel_spec",
+        ":launch_dim",
+        ":plugin_registry",
+        ":stream_executor_headers",
+        "//tensorflow/core:lib",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/types:optional",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+cc_library(
+    name = "stream_executor_pimpl_header",
+    hdrs = [
+        "device_description.h",
+        "dnn.h",
+        "kernel.h",
+        "kernel_cache_config.h",
+        "shared_memory_config.h",
+        "stream_executor_pimpl.h",
+    ],
     visibility = ["//visibility:public"],
     deps = [
+        ":allocator_stats",
         ":dnn_proto_cc",
+        ":platform",
+        ":stream_executor_headers",
+        ":stream_executor_internal",
         "//tensorflow/core:lib",
-        "//tensorflow/core:ptr_util",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/strings",
-        "@local_config_cuda//cuda:cuda_headers",
-    ] + if_static([":stream_executor_impl"]),
+        "@com_google_absl//absl/types:optional",
+        "@com_google_absl//absl/types:span",
+    ],
 )
 
-cc_header_only_library(
-    name = "stream_executor_headers_lib",
-    visibility = ["//visibility:public"],
+# It implements :stream_executor_pimpl_header
+cc_library(
+    name = "stream_executor_pimpl",
+    srcs = ["stream_executor_pimpl.cc"],
+    hdrs = ["stream_executor_pimpl.h"],
     deps = [
-        ":stream_executor",
+        ":blas",
+        ":executor_cache",
+        ":fft",
+        ":kernel",
+        ":platform",
+        ":rng",
+        ":stream_executor_headers",
+        ":stream_header",
+        ":timer",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
     ],
 )
 
+# The stream_executor_headers target does not prescribe an implementation.
+#
+# TODO(b/25131218) this is OBSOLETE/DEPRECATED -- get rid of this target altogether
 cc_library(
-    name = "cuda_platform",
-    srcs = if_cuda_is_configured(
-        glob(
-            [
-                "cuda/*.cc",
-            ],
-            exclude = [
-                "cuda/*_test.cc",
-                "cuda/cuda_platform_id.cc",
-            ],
-        ),
-    ),
-    copts = select({
-        "//tensorflow:windows": ["/DNOGDI"],
-        "//conditions:default": [],
-    }),
-    linkopts = select({
-        "//tensorflow:freebsd": [],
-        "//tensorflow:windows": [],
-        "//conditions:default": ["-ldl"],
-    }),
+    name = "stream_executor_headers",
+    hdrs = [
+        "blas.h",
+        "device_description.h",
+        "device_memory.h",
+        "device_options.h",
+        "dnn.h",
+        "event.h",
+        "executor_cache.h",
+        "fft.h",
+        "gpu_launch_dim.h",
+        "kernel.h",
+        "kernel_cache_config.h",
+        "kernel_spec.h",
+        "launch_dim.h",
+        "module_spec.h",
+        "multi_platform_manager.h",
+        "platform.h",
+        "plugin.h",
+        "plugin_registry.h",
+        "rng.h",
+        "shared_memory_config.h",
+        "stream.h",
+        "stream_executor.h",
+        "stream_executor_internal.h",
+        "stream_executor_pimpl.h",
+        "temporary_device_memory.h",
+        "temporary_memory_manager.h",
+        "timer.h",
+        "trace_listener.h",
+    ],
     visibility = ["//visibility:public"],
     deps = [
-        ":stream_executor",
+        ":allocator_stats",
+        ":dnn_proto_cc",
+        ":host_or_device_scalar",
         "//tensorflow/core:lib",
-        "//tensorflow/core/kernels:ops_util",
-        "@local_config_cuda//cuda:cuda_headers",
-    ] + if_cuda_is_configured([
-        "//tensorflow/core:cuda",
-        "@local_config_cuda//cuda:cuda_driver",
-        "@local_config_cuda//cuda:cudnn",
-    ]),
-    alwayslink = 1,
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+cc_library(
+    name = "stream_executor",
+    hdrs = ["stream_executor.h"],
+    deps = [":stream_executor_headers"] + if_static([":stream_executor_impl"]),
+)
+
+cc_library(
+    name = "stream_executor_impl",
+    deps = [
+        ":device_description",
+        ":device_memory",
+        ":dnn_proto_cc",
+        ":dnn_proto_cc_impl",
+        ":event",
+        ":kernel",
+        ":launch_dim",
+        ":multi_platform_manager",
+        ":platform",
+        ":stream",
+        ":stream_executor_headers",
+        ":stream_executor_pimpl",
+        ":timer",
+    ],
+)
+
+cc_library(
+    name = "allocator_stats",
+    srcs = [
+        "allocator_stats.cc",
+    ],
+    hdrs = ["allocator_stats.h"],
+    deps = [
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/types:optional",
+    ],
+)
+
+tf_cc_test(
+    name = "stream_test",
+    size = "small",
+    srcs = ["stream_test.cc"],
+    deps = [
+        ":stream_executor",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/stream_executor/host:host_platform",
+    ],
+)
+
+alias(
+    name = "cuda_platform",
+    actual = "//tensorflow/stream_executor/cuda:all_runtime",
+)
+
+alias(
+    name = "rocm_platform",
+    actual = "//tensorflow/stream_executor/rocm:all_runtime",
 )
diff --git a/tensorflow/stream_executor/allocator_stats.cc b/tensorflow/stream_executor/allocator_stats.cc
new file mode 100644
index 0000000000000000000000000000000000000000..440d6f46a3cbd8740c55a239865ce5f89b96b4f3
--- /dev/null
+++ b/tensorflow/stream_executor/allocator_stats.cc
@@ -0,0 +1,32 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/stream_executor/allocator_stats.h"
+#include "absl/strings/str_format.h"
+
+namespace stream_executor {
+
+string AllocatorStats::DebugString() const {
+  return absl::StrFormat(
+      "Limit:        %20lld\n"
+      "InUse:        %20lld\n"
+      "MaxInUse:     %20lld\n"
+      "NumAllocs:    %20lld\n"
+      "MaxAllocSize: %20lld\n",
+      this->bytes_limit ? *this->bytes_limit : 0, this->bytes_in_use,
+      this->peak_bytes_in_use, this->num_allocs, this->largest_alloc_size);
+}
+
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/allocator_stats.h b/tensorflow/stream_executor/allocator_stats.h
new file mode 100644
index 0000000000000000000000000000000000000000..786ceb0fdd6fdea829d095923dc774d63a5de625
--- /dev/null
+++ b/tensorflow/stream_executor/allocator_stats.h
@@ -0,0 +1,50 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_ALLOCATOR_STATS_H_
+#define TENSORFLOW_STREAM_EXECUTOR_ALLOCATOR_STATS_H_
+
+#include <string>
+
+#include "absl/types/optional.h"
+#include "tensorflow/stream_executor/platform/port.h"
+
+namespace stream_executor {
+
+// Runtime statistics collected by an allocator. Exactly the same as
+// tensorflow::AllocatorStats, but independently defined to preserve the mutual
+// independence of StreamExecutor and TensorFlow.
+struct AllocatorStats {
+  int64 num_allocs;          // Number of allocations.
+  int64 bytes_in_use;        // Number of bytes in use.
+  int64 peak_bytes_in_use;   // The peak bytes in use.
+  int64 largest_alloc_size;  // The largest single allocation seen.
+
+  // The upper limit of bytes of user allocatable device memory, if such a limit
+  // is known.
+  absl::optional<int64> bytes_limit;
+
+  AllocatorStats()
+      : num_allocs(0),
+        bytes_in_use(0),
+        peak_bytes_in_use(0),
+        largest_alloc_size(0) {}
+
+  string DebugString() const;
+};
+
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_ALLOCATOR_STATS_H_
diff --git a/tensorflow/stream_executor/build_defs.bzl b/tensorflow/stream_executor/build_defs.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..b2da13e5ae708e40d8cb86ce6f1e2b8f1f0bb7a4
--- /dev/null
+++ b/tensorflow/stream_executor/build_defs.bzl
@@ -0,0 +1,21 @@
+load("@local_config_cuda//cuda:build_defs.bzl", "cuda_is_configured")
+load("@local_config_rocm//rocm:build_defs.bzl", "rocm_is_configured")
+
+def stream_executor_friends():
+    return ["//tensorflow/..."]
+
+def tf_additional_cuda_platform_deps():
+    return []
+
+# Use dynamic loading, therefore should be empty.
+def tf_additional_cuda_driver_deps():
+    return []
+
+def tf_additional_cudnn_plugin_deps():
+    return []
+
+# Returns whether any GPU backend is configuered.
+def if_gpu_is_configured(x):
+    if cuda_is_configured() or rocm_is_configured():
+        return x
+    return []
diff --git a/tensorflow/stream_executor/cuda/BUILD b/tensorflow/stream_executor/cuda/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..03dd92e19c1b1429d6cfa742c111b22d4d802e87
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/BUILD
@@ -0,0 +1,411 @@
+# Description:
+#   CUDA-platform specific StreamExecutor support code.
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load(
+    "//tensorflow/stream_executor:build_defs.bzl",
+    "stream_executor_friends",
+    "tf_additional_cuda_driver_deps",
+    "tf_additional_cuda_platform_deps",
+    "tf_additional_cudnn_plugin_deps",
+)
+load("//tensorflow:tensorflow.bzl", "tf_copts")
+load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured")
+load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
+
+package_group(
+    name = "friends",
+    packages = stream_executor_friends(),
+)
+
+package(
+    default_visibility = [":friends"],
+)
+
+# Filegroup used to collect source files for the dependency check.
+filegroup(
+    name = "c_srcs",
+    data = glob([
+        "**/*.cc",
+        "**/*.h",
+    ]),
+)
+
+cc_library(
+    name = "cuda_platform_id",
+    srcs = ["cuda_platform_id.cc"],
+    hdrs = ["cuda_platform_id.h"],
+    deps = ["//tensorflow/stream_executor:platform"],
+)
+
+cc_library(
+    name = "cuda_platform",
+    srcs = if_cuda_is_configured(["cuda_platform.cc"]),
+    hdrs = if_cuda_is_configured(["cuda_platform.h"]),
+    visibility = ["//visibility:public"],
+    deps = if_cuda_is_configured(
+        [
+            ":cuda_driver",
+            ":cuda_gpu_executor",
+            ":cuda_platform_id",
+            "//tensorflow/stream_executor",  # buildcleaner: keep
+            "//tensorflow/stream_executor:executor_cache",
+            "//tensorflow/stream_executor:multi_platform_manager",
+            "//tensorflow/stream_executor:stream_executor_pimpl_header",
+            "//tensorflow/stream_executor/lib",
+            "//tensorflow/stream_executor/platform",
+        ],
+    ) + tf_additional_cuda_platform_deps(),
+    alwayslink = True,  # Registers itself with the MultiPlatformManager.
+)
+
+cc_library(
+    name = "cuda_diagnostics",
+    srcs = if_cuda_is_configured(["cuda_diagnostics.cc"]),
+    hdrs = if_cuda_is_configured(["cuda_diagnostics.h"]),
+    deps = if_cuda_is_configured([
+        "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/strings",
+        "//tensorflow/stream_executor/gpu:gpu_diagnostics_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+    ]),
+)
+
+cc_library(
+    name = "cuda_driver",
+    srcs = if_cuda_is_configured(["cuda_driver.cc"]),
+    hdrs = if_cuda_is_configured([
+        "cuda_driver.h",
+        "cuda_driver_wrapper.h",
+    ]),
+    deps = if_cuda_is_configured([
+        ":cuda_diagnostics",
+        "@com_google_absl//absl/base",
+        "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/strings",
+        "@local_config_cuda//cuda:cuda_headers",
+        "//tensorflow/stream_executor:device_options",
+        "//tensorflow/stream_executor/gpu:gpu_driver_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "//tensorflow/stream_executor/platform:dso_loader",
+    ] + tf_additional_cuda_driver_deps()) + select({
+        # include dynamic loading implementation only when if_cuda_is_configured and build dynamically
+        "//tensorflow:using_cuda_nvcc_with_dynamic_build": ["cudart_stub"],
+        "//tensorflow:using_cuda_clang_with_dynamic_build": ["cudart_stub"],
+        "//conditions:default": ["//tensorflow/core:cuda"],
+    }),
+)
+
+cc_library(
+    name = "cudart_stub",
+    srcs = select({
+        # include dynamic loading implementation only when if_cuda_is_configured and build dynamically
+        "//tensorflow:using_cuda_nvcc_with_dynamic_build": ["cudart_stub.cc"],
+        "//tensorflow:using_cuda_clang_with_dynamic_build": ["cudart_stub.cc"],
+        "//conditions:default": [],
+    }),
+    visibility = ["//visibility:public"],
+    deps = select({
+        "//tensorflow:using_cuda_nvcc_with_dynamic_build": [
+            "@local_config_cuda//cuda:cuda_headers",
+            "//tensorflow/stream_executor/lib",
+            "//tensorflow/stream_executor/platform:dso_loader",
+        ],
+        "//tensorflow:using_cuda_clang_with_dynamic_build": [
+            "@local_config_cuda//cuda:cuda_headers",
+            "//tensorflow/stream_executor/lib",
+            "//tensorflow/stream_executor/platform:dso_loader",
+        ],
+        "//conditions:default": [],
+    }),
+)
+
+# The activation library is tightly coupled to the executor library.
+# TODO(leary) split up cuda_gpu_executor.cc so that this can stand alone.
+cc_library(
+    name = "cuda_activation_header",
+    hdrs = ["cuda_activation.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/stream_executor/gpu:gpu_activation_header",
+        "//tensorflow/stream_executor/platform",
+    ],
+)
+
+cc_library(
+    name = "cuda_activation",
+    srcs = [],
+    hdrs = if_cuda_is_configured(["cuda_activation.h"]),
+    deps = if_cuda_is_configured([
+        ":cuda_driver",
+        "@local_config_cuda//cuda:cuda_headers",
+        "//tensorflow/stream_executor",
+        "//tensorflow/stream_executor:stream_executor_internal",
+        "//tensorflow/stream_executor/gpu:gpu_activation",
+        "//tensorflow/stream_executor/platform",
+    ]),
+)
+
+cc_library(
+    name = "cuda_gpu_executor_header",
+    textual_hdrs = if_cuda_is_configured(["cuda_gpu_executor.h"]),
+    visibility = ["//visibility:public"],
+    deps = if_cuda_is_configured([
+        ":cuda_kernel",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor/gpu:gpu_executor_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+    ]),
+)
+
+cc_library(
+    name = "cublas_plugin",
+    srcs = if_cuda_is_configured(["cuda_blas.cc"]),
+    hdrs = if_cuda_is_configured(["cuda_blas.h"]),
+    visibility = ["//visibility:public"],
+    deps = if_cuda_is_configured([
+        ":cuda_activation",
+        ":cuda_gpu_executor",
+        ":cuda_platform_id",
+        ":cuda_stream",
+        ":cuda_timer",
+        ":cuda_helpers",
+        "@com_google_absl//absl/strings",
+        "//third_party/eigen3",
+        "@local_config_cuda//cuda:cuda_headers",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/stream_executor",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:host_or_device_scalar",
+        "//tensorflow/stream_executor:plugin_registry",
+        "//tensorflow/stream_executor:scratch_allocator",
+        "//tensorflow/stream_executor:timer",
+        "//tensorflow/stream_executor/gpu:gpu_helpers_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "//tensorflow/stream_executor/platform:dso_loader",
+    ] + if_static(["@local_config_cuda//cuda:cublas"])),
+    alwayslink = True,
+)
+
+cc_library(
+    name = "cufft_plugin",
+    srcs = if_cuda_is_configured(["cuda_fft.cc"]),
+    hdrs = if_cuda_is_configured(["cuda_fft.h"]),
+    visibility = ["//visibility:public"],
+    deps = if_cuda_is_configured([
+        ":cuda_activation_header",
+        ":cuda_gpu_executor_header",
+        ":cuda_platform_id",
+        ":cuda_stream",
+        ":cuda_helpers",
+        "@local_config_cuda//cuda:cuda_headers",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:fft",
+        "//tensorflow/stream_executor:plugin_registry",
+        "//tensorflow/stream_executor:scratch_allocator",
+        "//tensorflow/stream_executor/gpu:gpu_helpers_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "//tensorflow/stream_executor/platform:dso_loader",
+    ] + if_static(["@local_config_cuda//cuda:cufft"])),
+    alwayslink = True,
+)
+
+cc_library(
+    name = "cudnn_plugin",
+    srcs = if_cuda_is_configured(["cuda_dnn.cc"]),
+    hdrs = if_cuda_is_configured(["cuda_dnn.h"]),
+    copts = [
+        # STREAM_EXECUTOR_CUDNN_WRAP would fail on Clang with the default
+        # setting of template depth 256
+        "-ftemplate-depth-512",
+    ],
+    visibility = ["//visibility:public"],
+    deps = if_cuda_is_configured([
+        ":cuda_activation",
+        ":cuda_diagnostics",
+        ":cuda_driver",
+        ":cuda_gpu_executor",
+        ":cuda_platform_id",
+        ":cuda_stream",
+        ":cuda_timer",
+        ":cudnn_version",
+        "@com_google_absl//absl/strings",
+        "//third_party/eigen3",
+        "@local_config_cuda//cuda:cuda_headers",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:logger",
+        "//tensorflow/stream_executor:dnn",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:logging_proto_cc",
+        "//tensorflow/stream_executor:plugin_registry",
+        "//tensorflow/stream_executor:scratch_allocator",
+        "//tensorflow/stream_executor:stream_executor_pimpl_header",
+        "//tensorflow/stream_executor:temporary_device_memory",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "//tensorflow/stream_executor/platform:dso_loader",
+    ]) + tf_additional_cudnn_plugin_deps() + if_cuda_is_configured(if_static([
+        "@local_config_cuda//cuda:cudnn",
+    ])),
+    alwayslink = True,
+)
+
+cc_library(
+    name = "curand_plugin",
+    srcs = if_cuda_is_configured(["cuda_rng.cc"]),
+    hdrs = if_cuda_is_configured(["cuda_rng.h"]),
+    deps = if_cuda_is_configured([
+        ":cuda_activation",
+        ":cuda_gpu_executor",
+        ":cuda_platform_id",
+        ":cuda_stream",
+        ":cuda_helpers",
+        "@local_config_cuda//cuda:cuda_headers",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:plugin_registry",
+        "//tensorflow/stream_executor:rng",
+        "//tensorflow/stream_executor/gpu:gpu_helpers_header",
+        "//tensorflow/stream_executor/gpu:gpu_rng_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "//tensorflow/stream_executor/platform:dso_loader",
+    ] + if_static(["@local_config_cuda//cuda:curand"])),
+    alwayslink = True,
+)
+
+cc_library(
+    name = "cuda_kernel",
+    srcs = if_cuda_is_configured(["cuda_kernel.cc"]),
+    hdrs = if_cuda_is_configured(["cuda_kernel.h"]),
+    deps = if_cuda_is_configured([
+        ":cuda_driver",
+        "@local_config_cuda//cuda:cuda_headers",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:stream_executor_pimpl_header",
+        "//tensorflow/stream_executor/gpu:gpu_kernel_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+    ]),
+)
+
+# TODO(leary) we likely need to canonicalize/eliminate this.
+cc_library(
+    name = "cuda_helpers",
+    textual_hdrs = if_cuda_is_configured(["cuda_helpers.h"]),
+    deps = if_cuda_is_configured([
+        "//tensorflow/stream_executor/gpu:gpu_helpers_header",
+    ]),
+)
+
+cc_library(
+    name = "cuda_event",
+    srcs = if_cuda_is_configured(["cuda_event.cc"]),
+    hdrs = if_cuda_is_configured(["cuda_event.h"]),
+    deps = if_cuda_is_configured([
+        ":cuda_driver",
+        ":cuda_gpu_executor_header",
+        ":cuda_stream",
+        "//tensorflow/stream_executor:stream_executor_headers",
+        "//tensorflow/stream_executor/gpu:gpu_event",
+        "//tensorflow/stream_executor/gpu:gpu_stream_header",
+        "//tensorflow/stream_executor/lib",
+    ]),
+)
+
+cc_library(
+    name = "cuda_stream",
+    srcs = [],
+    hdrs = if_cuda_is_configured(["cuda_stream.h"]),
+    deps = if_cuda_is_configured([
+        ":cuda_driver",
+        ":cuda_gpu_executor_header",
+        "//tensorflow/stream_executor:stream_executor_headers",
+        "//tensorflow/stream_executor:stream_header",
+        "//tensorflow/stream_executor/gpu:gpu_stream",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+    ]),
+)
+
+cc_library(
+    name = "cuda_timer",
+    srcs = [],
+    hdrs = if_cuda_is_configured(["cuda_timer.h"]),
+    deps = if_cuda_is_configured([
+        ":cuda_driver",
+        ":cuda_gpu_executor_header",
+        ":cuda_stream",
+        "//tensorflow/stream_executor:stream_executor_headers",
+        "//tensorflow/stream_executor/gpu:gpu_timer",
+        "//tensorflow/stream_executor/lib",
+    ]),
+)
+
+cc_library(
+    name = "cuda_gpu_executor",
+    srcs = if_cuda_is_configured(["cuda_gpu_executor.cc"]),
+    hdrs = if_cuda_is_configured(["cuda_gpu_executor.h"]),
+    deps = if_cuda_is_configured([
+        ":cuda_activation",
+        ":cuda_diagnostics",
+        ":cuda_driver",
+        ":cuda_event",
+        ":cuda_kernel",
+        ":cuda_platform_id",
+        ":cuda_stream",
+        ":cuda_timer",
+        "@com_google_absl//absl/strings",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:plugin_registry",
+        "//tensorflow/stream_executor:stream_executor_internal",
+        "//tensorflow/stream_executor:stream_executor_pimpl_header",
+        "//tensorflow/stream_executor:timer",
+        "//tensorflow/stream_executor/gpu:gpu_executor_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "//tensorflow/stream_executor/platform:dso_loader",
+    ]),
+    alwayslink = True,
+)
+
+cc_library(
+    name = "cudnn_version",
+    srcs = ["cudnn_version.cc"],
+    hdrs = ["cudnn_version.h"],
+    deps = [
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "cudnn_version_test",
+    srcs = ["cudnn_version_test.cc"],
+    deps = [
+        ":cudnn_version",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+cc_library(
+    name = "all_runtime",
+    copts = tf_copts(),
+    visibility = ["//visibility:public"],
+    deps = [
+        ":cublas_plugin",
+        ":cuda_driver",
+        ":cuda_platform",
+        ":cudnn_plugin",
+        ":cufft_plugin",
+        ":curand_plugin",
+    ],
+    alwayslink = 1,
+)
diff --git a/tensorflow/stream_executor/cuda/cuda_activation.h b/tensorflow/stream_executor/cuda/cuda_activation.h
index ef9807820fda493a9ab926ae0509beaafeebdf2e..2b80ae094d17bc8ad957044545ff46daf4aeb103 100644
--- a/tensorflow/stream_executor/cuda/cuda_activation.h
+++ b/tensorflow/stream_executor/cuda/cuda_activation.h
@@ -17,13 +17,13 @@ limitations under the License.
 // It reaches into the CUDA implementation to activate an underlying CUDA
 // context.
 //
-// Having this file separate from cuda_gpu_executor.h means that dependent
+// Having this file separate from cuda/cuda_gpu_executor.h means that dependent
 // code does not also have to depend on cuda.h.
 
 #ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_ACTIVATION_H_
 #define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_ACTIVATION_H_
 
-#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/gpu/gpu_activation.h"
 
 namespace stream_executor {
 
@@ -31,29 +31,7 @@ class StreamExecutor;
 
 namespace cuda {
 
-class CUDAExecutor;
-class ScopedActivateContext;
-
-// Activates a CUDA context within an enclosing scope.
-class ScopedActivateExecutorContext {
- public:
-  // Form that takes a CUDA executor implementation.
-  explicit ScopedActivateExecutorContext(CUDAExecutor* cuda_exec);
-
-  // Form that takes a pImpl executor and extracts a CUDA implementation --
-  // fatal failure if it is not CUDA inside.
-  explicit ScopedActivateExecutorContext(StreamExecutor* stream_exec);
-
-  ScopedActivateExecutorContext(ScopedActivateExecutorContext&& other);
-
-  ~ScopedActivateExecutorContext();
-
- private:
-  // The cuda.h-using datatype that we wrap.
-  ScopedActivateContext* driver_scoped_activate_context_;
-
-  SE_DISALLOW_COPY_AND_ASSIGN(ScopedActivateExecutorContext);
-};
+using ScopedActivateExecutorContext = gpu::ScopedActivateExecutorContext;
 
 }  // namespace cuda
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc
index 957f6c98da564500f81d7185ce6a151003549ee5..5bbb98664e80287410264fc1a288d4e0fc5e480e 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.cc
+++ b/tensorflow/stream_executor/cuda/cuda_blas.cc
@@ -58,16 +58,12 @@ limitations under the License.
 #include "tensorflow/stream_executor/cuda/cuda_stream.h"
 #include "tensorflow/stream_executor/cuda/cuda_timer.h"
 #include "tensorflow/stream_executor/device_memory.h"
-
-#ifndef PLATFORM_GOOGLE
-#include "tensorflow/stream_executor/dso_loader.h"
-#endif
-
 #include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/initialize.h"
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/lib/status_macros.h"
 #include "tensorflow/stream_executor/lib/stringprintf.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/plugin_registry.h"
@@ -75,7 +71,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream_executor.h"
 
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 
 PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuBlasPlugin);
 
@@ -261,8 +257,8 @@ namespace wrap {
   struct WrapperShim__##__name {                                    \
     static const char *kName;                                       \
     template <typename... Args>                                     \
-    cublasStatus_t operator()(CUDAExecutor *parent, Args... args) { \
-      cuda::ScopedActivateExecutorContext sac{parent};              \
+    cublasStatus_t operator()(GpuExecutor *parent, Args... args) { \
+      gpu::ScopedActivateExecutorContext sac{parent};              \
       return ::__name(args...);                                     \
     }                                                               \
   } __name;                                                         \
@@ -294,8 +290,8 @@ namespace wrap {
       return f;                                                           \
     }                                                                     \
     template <typename... Args>                                           \
-    cublasStatus_t operator()(CUDAExecutor* parent, Args... args) {       \
-      cuda::ScopedActivateExecutorContext sac{parent};                    \
+    cublasStatus_t operator()(GpuExecutor* parent, Args... args) {        \
+      gpu::ScopedActivateExecutorContext sac{parent};                     \
       return DynLoad()(args...);                                          \
     }                                                                     \
   } __name;                                                               \
@@ -399,7 +395,7 @@ class ScopedCublasPointerMode {
   //
   // Parameters:
   //  handle: The cublas library handle to act upon in setting the pointer mode.
-  explicit ScopedCublasPointerMode(CUDAExecutor *parent, cublasHandle_t handle)
+  explicit ScopedCublasPointerMode(GpuExecutor *parent, cublasHandle_t handle)
       : parent_(parent), handle_(handle), ok_(false) {}
 
   // Attempts the switch to the requested scoped pointer mode, new_mode.
@@ -437,7 +433,7 @@ class ScopedCublasPointerMode {
   }
 
  private:
-  CUDAExecutor *parent_;   // Executor establishing this pointer mode for.
+  GpuExecutor *parent_;   // Executor establishing this pointer mode for.
   cublasHandle_t handle_;  // Handle to the cuBLAS instance of interest.
   cublasPointerMode_t old_mode_;  // Prior cuBLAS pointer mode, to be restored.
   bool ok_;                       // Whether the change was successful.
@@ -460,7 +456,7 @@ class ScopedCublasMathMode {
   //
   // Parameters:
   //  handle: The cublas library handle to act upon in setting the math mode.
-  explicit ScopedCublasMathMode(CUDAExecutor *parent, cublasHandle_t handle)
+  explicit ScopedCublasMathMode(GpuExecutor *parent, cublasHandle_t handle)
       : parent_(parent), handle_(handle), ok_(false) {}
 
   // Attempts the switch to the requested scoped math mode, new_mode.
@@ -497,7 +493,7 @@ class ScopedCublasMathMode {
   }
 
  private:
-  CUDAExecutor *parent_;   // Executor establishing this math mode for.
+  GpuExecutor *parent_;   // Executor establishing this math mode for.
   cublasHandle_t handle_;  // Handle to the cuBLAS instance of interest.
   cublasMath_t old_mode_;  // Prior cuBLAS math mode, to be restored.
   bool ok_;                // Whether the change was successful.
@@ -514,7 +510,7 @@ bool CUDABlas::Init() {
   return true;
 }
 
-CUDABlas::CUDABlas(cuda::CUDAExecutor *parent)
+CUDABlas::CUDABlas(gpu::GpuExecutor *parent)
     : parent_(CHECK_NOTNULL(parent)), blas_(nullptr) {}
 
 CUDABlas::~CUDABlas() {
@@ -525,10 +521,10 @@ CUDABlas::~CUDABlas() {
 
 bool CUDABlas::SetStream(Stream *stream) {
   CHECK(stream != nullptr);
-  CHECK(AsCUDAStreamValue(stream) != nullptr);
+  CHECK(AsGpuStreamValue(stream) != nullptr);
   CHECK(blas_ != nullptr);
   cublasStatus_t ret =
-      wrap::cublasSetStream(parent_, blas_, AsCUDAStreamValue(stream));
+      wrap::cublasSetStream(parent_, blas_, AsGpuStreamValue(stream));
   if (ret != CUBLAS_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for cuBLAS calls: " << ToString(ret);
     return false;
@@ -706,7 +702,7 @@ bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
                           DeviceMemory<float> *result) {
   return DoBlasInternal(wrap::cublasSasum, stream,
                         false /* = pointer_mode_host */, elem_count,
-                        CUDAMemory(x), incx, CUDAMemoryMutable(result));
+                        GpuMemory(x), incx, GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
@@ -714,7 +710,7 @@ bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
                           DeviceMemory<double> *result) {
   return DoBlasInternal(wrap::cublasDasum, stream,
                         false /* = pointer_mode_host */, elem_count,
-                        CUDAMemory(x), incx, CUDAMemoryMutable(result));
+                        GpuMemory(x), incx, GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
@@ -722,7 +718,7 @@ bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
                           DeviceMemory<float> *result) {
   return DoBlasInternal(
       wrap::cublasScasum, stream, false /* = pointer_mode_host */,
-      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      elem_count, GpuComplex(GpuMemory(x)), incx, GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
@@ -730,7 +726,7 @@ bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
                           DeviceMemory<double> *result) {
   return DoBlasInternal(
       wrap::cublasDzasum, stream, false /* = pointer_mode_host */,
-      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      elem_count, GpuComplex(GpuMemory(x)), incx, GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count, float alpha,
@@ -738,7 +734,7 @@ bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count, float alpha,
                           DeviceMemory<float> *y, int incy) {
   return DoBlasInternal(wrap::cublasSaxpy, stream,
                         true /* = pointer_mode_host */, elem_count, &alpha,
-                        CUDAMemory(x), incx, CUDAMemoryMutable(y), incy);
+                        GpuMemory(x), incx, GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count, double alpha,
@@ -746,7 +742,7 @@ bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count, double alpha,
                           DeviceMemory<double> *y, int incy) {
   return DoBlasInternal(wrap::cublasDaxpy, stream,
                         true /* = pointer_mode_host */, elem_count, &alpha,
-                        CUDAMemory(x), incx, CUDAMemoryMutable(y), incy);
+                        GpuMemory(x), incx, GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count,
@@ -755,8 +751,8 @@ bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count,
                           DeviceMemory<std::complex<float>> *y, int incy) {
   return DoBlasInternal(wrap::cublasCaxpy, stream,
                         true /* = pointer_mode_host */, elem_count,
-                        CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx,
-                        CUDAComplex(CUDAMemoryMutable(y)), incy);
+                        GpuComplex(&alpha), GpuComplex(GpuMemory(x)), incx,
+                        GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count,
@@ -765,8 +761,8 @@ bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count,
                           DeviceMemory<std::complex<double>> *y, int incy) {
   return DoBlasInternal(wrap::cublasZaxpy, stream,
                         true /* = pointer_mode_host */, elem_count,
-                        CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx,
-                        CUDAComplex(CUDAMemoryMutable(y)), incy);
+                        GpuComplex(&alpha), GpuComplex(GpuMemory(x)), incx,
+                        GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count,
@@ -774,7 +770,7 @@ bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count,
                           DeviceMemory<float> *y, int incy) {
   return DoBlasInternal(wrap::cublasScopy, stream,
                         true /* = pointer_mode_host */, elem_count,
-                        CUDAMemory(x), incx, CUDAMemoryMutable(y), incy);
+                        GpuMemory(x), incx, GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count,
@@ -782,7 +778,7 @@ bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count,
                           DeviceMemory<double> *y, int incy) {
   return DoBlasInternal(wrap::cublasDcopy, stream,
                         true /* = pointer_mode_host */, elem_count,
-                        CUDAMemory(x), incx, CUDAMemoryMutable(y), incy);
+                        GpuMemory(x), incx, GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count,
@@ -790,8 +786,8 @@ bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count,
                           DeviceMemory<std::complex<float>> *y, int incy) {
   return DoBlasInternal(wrap::cublasCcopy, stream,
                         true /* = pointer_mode_host */, elem_count,
-                        CUDAComplex(CUDAMemory(x)), incx,
-                        CUDAComplex(CUDAMemoryMutable(y)), incy);
+                        GpuComplex(GpuMemory(x)), incx,
+                        GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count,
@@ -799,8 +795,8 @@ bool CUDABlas::DoBlasCopy(Stream *stream, uint64 elem_count,
                           DeviceMemory<std::complex<double>> *y, int incy) {
   return DoBlasInternal(wrap::cublasZcopy, stream,
                         true /* = pointer_mode_host */, elem_count,
-                        CUDAComplex(CUDAMemory(x)), incx,
-                        CUDAComplex(CUDAMemoryMutable(y)), incy);
+                        GpuComplex(GpuMemory(x)), incx,
+                        GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasDot(Stream *stream, uint64 elem_count,
@@ -809,7 +805,7 @@ bool CUDABlas::DoBlasDot(Stream *stream, uint64 elem_count,
                          DeviceMemory<float> *result) {
   return DoBlasInternal(
       wrap::cublasSdot, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAMemory(x), incx, CUDAMemory(y), incy, CUDAMemoryMutable(result));
+      GpuMemory(x), incx, GpuMemory(y), incy, GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasDot(Stream *stream, uint64 elem_count,
@@ -818,7 +814,7 @@ bool CUDABlas::DoBlasDot(Stream *stream, uint64 elem_count,
                          DeviceMemory<double> *result) {
   return DoBlasInternal(
       wrap::cublasDdot, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAMemory(x), incx, CUDAMemory(y), incy, CUDAMemoryMutable(result));
+      GpuMemory(x), incx, GpuMemory(y), incy, GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasDotc(Stream *stream, uint64 elem_count,
@@ -827,8 +823,8 @@ bool CUDABlas::DoBlasDotc(Stream *stream, uint64 elem_count,
                           DeviceMemory<std::complex<float>> *result) {
   return DoBlasInternal(
       wrap::cublasCdotc, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy,
-      CUDAComplex(CUDAMemoryMutable(result)));
+      GpuComplex(GpuMemory(x)), incx, GpuComplex(GpuMemory(y)), incy,
+      GpuComplex(GpuMemoryMutable(result)));
 }
 
 bool CUDABlas::DoBlasDotc(Stream *stream, uint64 elem_count,
@@ -837,8 +833,8 @@ bool CUDABlas::DoBlasDotc(Stream *stream, uint64 elem_count,
                           DeviceMemory<std::complex<double>> *result) {
   return DoBlasInternal(
       wrap::cublasZdotc, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy,
-      CUDAComplex(CUDAMemoryMutable(result)));
+      GpuComplex(GpuMemory(x)), incx, GpuComplex(GpuMemory(y)), incy,
+      GpuComplex(GpuMemoryMutable(result)));
 }
 
 bool CUDABlas::DoBlasDotu(Stream *stream, uint64 elem_count,
@@ -847,8 +843,8 @@ bool CUDABlas::DoBlasDotu(Stream *stream, uint64 elem_count,
                           DeviceMemory<std::complex<float>> *result) {
   return DoBlasInternal(
       wrap::cublasCdotu, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy,
-      CUDAComplex(CUDAMemoryMutable(result)));
+      GpuComplex(GpuMemory(x)), incx, GpuComplex(GpuMemory(y)), incy,
+      GpuComplex(GpuMemoryMutable(result)));
 }
 
 bool CUDABlas::DoBlasDotu(Stream *stream, uint64 elem_count,
@@ -857,8 +853,8 @@ bool CUDABlas::DoBlasDotu(Stream *stream, uint64 elem_count,
                           DeviceMemory<std::complex<double>> *result) {
   return DoBlasInternal(
       wrap::cublasZdotu, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy,
-      CUDAComplex(CUDAMemoryMutable(result)));
+      GpuComplex(GpuMemory(x)), incx, GpuComplex(GpuMemory(y)), incy,
+      GpuComplex(GpuMemoryMutable(result)));
 }
 
 bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
@@ -866,7 +862,7 @@ bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
                           DeviceMemory<float> *result) {
   return DoBlasInternal(wrap::cublasSnrm2, stream,
                         false /* = pointer_mode_host */, elem_count,
-                        CUDAMemory(x), incx, CUDAMemoryMutable(result));
+                        GpuMemory(x), incx, GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
@@ -874,7 +870,7 @@ bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
                           DeviceMemory<double> *result) {
   return DoBlasInternal(wrap::cublasDnrm2, stream,
                         false /* = pointer_mode_host */, elem_count,
-                        CUDAMemory(x), incx, CUDAMemoryMutable(result));
+                        GpuMemory(x), incx, GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
@@ -882,7 +878,7 @@ bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
                           DeviceMemory<float> *result) {
   return DoBlasInternal(
       wrap::cublasScnrm2, stream, false /* = pointer_mode_host */,
-      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      elem_count, GpuComplex(GpuMemory(x)), incx, GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
@@ -890,7 +886,7 @@ bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
                           DeviceMemory<double> *result) {
   return DoBlasInternal(
       wrap::cublasDznrm2, stream, false /* = pointer_mode_host */,
-      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      elem_count, GpuComplex(GpuMemory(x)), incx, GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
@@ -898,7 +894,7 @@ bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
                          DeviceMemory<float> *y, int incy, float c, float s) {
   return DoBlasInternal(
       wrap::cublasSrot, stream, true /* = pointer_mode_host */, elem_count,
-      CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy, &c, &s);
+      GpuMemoryMutable(x), incx, GpuMemoryMutable(y), incy, &c, &s);
 }
 
 bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
@@ -907,7 +903,7 @@ bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
                          double s) {
   return DoBlasInternal(
       wrap::cublasDrot, stream, true /* = pointer_mode_host */, elem_count,
-      CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy, &c, &s);
+      GpuMemoryMutable(x), incx, GpuMemoryMutable(y), incy, &c, &s);
 }
 
 bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
@@ -916,8 +912,8 @@ bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
                          float c, float s) {
   return DoBlasInternal(wrap::cublasCsrot, stream,
                         true /* = pointer_mode_host */, elem_count,
-                        CUDAComplex(CUDAMemoryMutable(x)), incx,
-                        CUDAComplex(CUDAMemoryMutable(y)), incy, &c, &s);
+                        GpuComplex(GpuMemoryMutable(x)), incx,
+                        GpuComplex(GpuMemoryMutable(y)), incy, &c, &s);
 }
 
 bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
@@ -926,17 +922,17 @@ bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
                          double c, double s) {
   return DoBlasInternal(wrap::cublasZdrot, stream,
                         true /* = pointer_mode_host */, elem_count,
-                        CUDAComplex(CUDAMemoryMutable(x)), incx,
-                        CUDAComplex(CUDAMemoryMutable(y)), incy, &c, &s);
+                        GpuComplex(GpuMemoryMutable(x)), incx,
+                        GpuComplex(GpuMemoryMutable(y)), incy, &c, &s);
 }
 
 bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<float> *a,
                           DeviceMemory<float> *b, DeviceMemory<float> *c,
                           DeviceMemory<float> *s) {
   return DoBlasInternal(wrap::cublasSrotg, stream,
-                        false /* = pointer_mode_host */, CUDAMemoryMutable(a),
-                        CUDAMemoryMutable(b), CUDAMemoryMutable(c),
-                        CUDAMemoryMutable(s));
+                        false /* = pointer_mode_host */, GpuMemoryMutable(a),
+                        GpuMemoryMutable(b), GpuMemoryMutable(c),
+                        GpuMemoryMutable(s));
 }
 
 bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<double> *a,
@@ -944,8 +940,8 @@ bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<double> *a,
                           DeviceMemory<double> *s) {
   return DoBlasInternal(wrap::cublasDrotg, stream,
                         false /* = pointer_mode_host */,
-                        CUDAComplex(CUDAMemoryMutable(a)), CUDAMemoryMutable(b),
-                        CUDAMemoryMutable(c), CUDAMemoryMutable(s));
+                        GpuComplex(GpuMemoryMutable(a)), GpuMemoryMutable(b),
+                        GpuMemoryMutable(c), GpuMemoryMutable(s));
 }
 
 bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<std::complex<float>> *a,
@@ -954,8 +950,8 @@ bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<std::complex<float>> *a,
                           DeviceMemory<std::complex<float>> *s) {
   return DoBlasInternal(
       wrap::cublasCrotg, stream, false /* = pointer_mode_host */,
-      CUDAComplex(CUDAMemoryMutable(a)), CUDAComplex(CUDAMemoryMutable(b)),
-      CUDAComplex(CUDAMemoryMutable(c)), CUDAComplex(CUDAMemoryMutable(s)));
+      GpuComplex(GpuMemoryMutable(a)), GpuComplex(GpuMemoryMutable(b)),
+      GpuComplex(GpuMemoryMutable(c)), GpuComplex(GpuMemoryMutable(s)));
 }
 
 bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<std::complex<double>> *a,
@@ -964,8 +960,8 @@ bool CUDABlas::DoBlasRotg(Stream *stream, DeviceMemory<std::complex<double>> *a,
                           DeviceMemory<std::complex<double>> *s) {
   return DoBlasInternal(
       wrap::cublasZrotg, stream, false /* = pointer_mode_host */,
-      CUDAComplex(CUDAMemoryMutable(a)), CUDAComplex(CUDAMemoryMutable(b)),
-      CUDAComplex(CUDAMemoryMutable(c)), CUDAComplex(CUDAMemoryMutable(s)));
+      GpuComplex(GpuMemoryMutable(a)), GpuComplex(GpuMemoryMutable(b)),
+      GpuComplex(GpuMemoryMutable(c)), GpuComplex(GpuMemoryMutable(s)));
 }
 
 bool CUDABlas::DoBlasRotm(Stream *stream, uint64 elem_count,
@@ -974,8 +970,8 @@ bool CUDABlas::DoBlasRotm(Stream *stream, uint64 elem_count,
                           const DeviceMemory<float> &param) {
   return DoBlasInternal(wrap::cublasSrotm, stream,
                         false /* = pointer_mode_host */, elem_count,
-                        CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy,
-                        CUDAMemory(param));
+                        GpuMemoryMutable(x), incx, GpuMemoryMutable(y), incy,
+                        GpuMemory(param));
 }
 
 bool CUDABlas::DoBlasRotm(Stream *stream, uint64 elem_count,
@@ -984,8 +980,8 @@ bool CUDABlas::DoBlasRotm(Stream *stream, uint64 elem_count,
                           const DeviceMemory<double> &param) {
   return DoBlasInternal(wrap::cublasDrotm, stream,
                         false /* = pointer_mode_host */, elem_count,
-                        CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy,
-                        CUDAMemory(param));
+                        GpuMemoryMutable(x), incx, GpuMemoryMutable(y), incy,
+                        GpuMemory(param));
 }
 
 bool CUDABlas::DoBlasRotmg(Stream *stream, DeviceMemory<float> *d1,
@@ -993,9 +989,9 @@ bool CUDABlas::DoBlasRotmg(Stream *stream, DeviceMemory<float> *d1,
                            const DeviceMemory<float> &y1,
                            DeviceMemory<float> *param) {
   return DoBlasInternal(wrap::cublasSrotmg, stream,
-                        false /* = pointer_mode_host */, CUDAMemoryMutable(d1),
-                        CUDAMemoryMutable(d2), CUDAMemoryMutable(x1),
-                        CUDAMemory(y1), CUDAMemoryMutable(param));
+                        false /* = pointer_mode_host */, GpuMemoryMutable(d1),
+                        GpuMemoryMutable(d2), GpuMemoryMutable(x1),
+                        GpuMemory(y1), GpuMemoryMutable(param));
 }
 
 bool CUDABlas::DoBlasRotmg(Stream *stream, DeviceMemory<double> *d1,
@@ -1003,37 +999,37 @@ bool CUDABlas::DoBlasRotmg(Stream *stream, DeviceMemory<double> *d1,
                            const DeviceMemory<double> &y1,
                            DeviceMemory<double> *param) {
   return DoBlasInternal(wrap::cublasDrotmg, stream,
-                        false /* = pointer_mode_host */, CUDAMemoryMutable(d1),
-                        CUDAMemoryMutable(d2), CUDAMemoryMutable(x1),
-                        CUDAMemory(y1), CUDAMemoryMutable(param));
+                        false /* = pointer_mode_host */, GpuMemoryMutable(d1),
+                        GpuMemoryMutable(d2), GpuMemoryMutable(x1),
+                        GpuMemory(y1), GpuMemoryMutable(param));
 }
 
 bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, float alpha,
                           DeviceMemory<float> *x, int incx) {
   return DoBlasInternal(wrap::cublasSscal, stream,
                         true /* = pointer_mode_host */, elem_count, &alpha,
-                        CUDAMemoryMutable(x), incx);
+                        GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, double alpha,
                           DeviceMemory<double> *x, int incx) {
   return DoBlasInternal(wrap::cublasDscal, stream,
                         true /* = pointer_mode_host */, elem_count, &alpha,
-                        CUDAMemoryMutable(x), incx);
+                        GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, float alpha,
                           DeviceMemory<std::complex<float>> *x, int incx) {
   return DoBlasInternal(
       wrap::cublasCsscal, stream, true /* = pointer_mode_host */, elem_count,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemoryMutable(x)), incx);
+      GpuComplex(&alpha), GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count, double alpha,
                           DeviceMemory<std::complex<double>> *x, int incx) {
   return DoBlasInternal(
       wrap::cublasZdscal, stream, true /* = pointer_mode_host */, elem_count,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemoryMutable(x)), incx);
+      GpuComplex(&alpha), GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count,
@@ -1041,7 +1037,7 @@ bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count,
                           DeviceMemory<std::complex<float>> *x, int incx) {
   return DoBlasInternal(
       wrap::cublasCscal, stream, true /* = pointer_mode_host */, elem_count,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemoryMutable(x)), incx);
+      GpuComplex(&alpha), GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count,
@@ -1049,7 +1045,7 @@ bool CUDABlas::DoBlasScal(Stream *stream, uint64 elem_count,
                           DeviceMemory<std::complex<double>> *x, int incx) {
   return DoBlasInternal(
       wrap::cublasZscal, stream, true /* = pointer_mode_host */, elem_count,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemoryMutable(x)), incx);
+      GpuComplex(&alpha), GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count,
@@ -1057,7 +1053,7 @@ bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count,
                           DeviceMemory<float> *y, int incy) {
   return DoBlasInternal(wrap::cublasSswap, stream,
                         true /* = pointer_mode_host */, elem_count,
-                        CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy);
+                        GpuMemoryMutable(x), incx, GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count,
@@ -1065,7 +1061,7 @@ bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count,
                           DeviceMemory<double> *y, int incy) {
   return DoBlasInternal(wrap::cublasDswap, stream,
                         true /* = pointer_mode_host */, elem_count,
-                        CUDAMemoryMutable(x), incx, CUDAMemoryMutable(y), incy);
+                        GpuMemoryMutable(x), incx, GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count,
@@ -1073,8 +1069,8 @@ bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count,
                           DeviceMemory<std::complex<float>> *y, int incy) {
   return DoBlasInternal(wrap::cublasCswap, stream,
                         true /* = pointer_mode_host */, elem_count,
-                        CUDAComplex(CUDAMemoryMutable(x)), incx,
-                        CUDAComplex(CUDAMemoryMutable(y)), incy);
+                        GpuComplex(GpuMemoryMutable(x)), incx,
+                        GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count,
@@ -1082,8 +1078,8 @@ bool CUDABlas::DoBlasSwap(Stream *stream, uint64 elem_count,
                           DeviceMemory<std::complex<double>> *y, int incy) {
   return DoBlasInternal(wrap::cublasZswap, stream,
                         true /* = pointer_mode_host */, elem_count,
-                        CUDAComplex(CUDAMemoryMutable(x)), incx,
-                        CUDAComplex(CUDAMemoryMutable(y)), incy);
+                        GpuComplex(GpuMemoryMutable(x)), incx,
+                        GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
@@ -1091,7 +1087,7 @@ bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
                            DeviceMemory<int> *result) {
   return DoBlasInternal(wrap::cublasIsamax, stream,
                         false /* = pointer_mode_host */, elem_count,
-                        CUDAMemory(x), incx, CUDAMemoryMutable(result));
+                        GpuMemory(x), incx, GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
@@ -1099,7 +1095,7 @@ bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
                            DeviceMemory<int> *result) {
   return DoBlasInternal(wrap::cublasIdamax, stream,
                         false /* = pointer_mode_host */, elem_count,
-                        CUDAMemory(x), incx, CUDAMemoryMutable(result));
+                        GpuMemory(x), incx, GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
@@ -1107,7 +1103,7 @@ bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
                            DeviceMemory<int> *result) {
   return DoBlasInternal(
       wrap::cublasIcamax, stream, false /* = pointer_mode_host */,
-      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      elem_count, GpuComplex(GpuMemory(x)), incx, GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
@@ -1115,7 +1111,7 @@ bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
                            int incx, DeviceMemory<int> *result) {
   return DoBlasInternal(
       wrap::cublasIzamax, stream, false /* = pointer_mode_host */,
-      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      elem_count, GpuComplex(GpuMemory(x)), incx, GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
@@ -1123,7 +1119,7 @@ bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
                            DeviceMemory<int> *result) {
   return DoBlasInternal(
       wrap::cublasIsamin, stream, false /* = pointer_mode_host */,
-      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      elem_count, GpuComplex(GpuMemory(x)), incx, GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
@@ -1131,7 +1127,7 @@ bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
                            DeviceMemory<int> *result) {
   return DoBlasInternal(
       wrap::cublasIdamin, stream, false /* = pointer_mode_host */,
-      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      elem_count, GpuComplex(GpuMemory(x)), incx, GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
@@ -1139,7 +1135,7 @@ bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
                            DeviceMemory<int> *result) {
   return DoBlasInternal(
       wrap::cublasIcamin, stream, false /* = pointer_mode_host */,
-      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      elem_count, GpuComplex(GpuMemory(x)), incx, GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
@@ -1147,7 +1143,7 @@ bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
                            int incx, DeviceMemory<int> *result) {
   return DoBlasInternal(
       wrap::cublasIzamin, stream, false /* = pointer_mode_host */,
-      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      elem_count, GpuComplex(GpuMemory(x)), incx, GpuMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
@@ -1157,8 +1153,8 @@ bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
                           DeviceMemory<float> *y, int incy) {
   return DoBlasInternal(
       wrap::cublasSgbmv, stream, true /* = pointer_mode_host */,
-      CUDABlasTranspose(trans), m, n, kl, ku, &alpha, CUDAMemory(a), lda,
-      CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy);
+      CUDABlasTranspose(trans), m, n, kl, ku, &alpha, GpuMemory(a), lda,
+      GpuMemory(x), incx, &beta, GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
@@ -1168,8 +1164,8 @@ bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
                           DeviceMemory<double> *y, int incy) {
   return DoBlasInternal(
       wrap::cublasDgbmv, stream, true /* = pointer_mode_host */,
-      CUDABlasTranspose(trans), m, n, kl, ku, &alpha, CUDAMemory(a), lda,
-      CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy);
+      CUDABlasTranspose(trans), m, n, kl, ku, &alpha, GpuMemory(a), lda,
+      GpuMemory(x), incx, &beta, GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
@@ -1181,9 +1177,9 @@ bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
                           DeviceMemory<std::complex<float>> *y, int incy) {
   return DoBlasInternal(
       wrap::cublasCgbmv, stream, true /* = pointer_mode_host */,
-      CUDABlasTranspose(trans), m, n, kl, ku, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy);
+      CUDABlasTranspose(trans), m, n, kl, ku, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(x)), incx,
+      GpuComplex(&beta), GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
@@ -1195,9 +1191,9 @@ bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
                           DeviceMemory<std::complex<double>> *y, int incy) {
   return DoBlasInternal(
       wrap::cublasZgbmv, stream, true /* = pointer_mode_host */,
-      CUDABlasTranspose(trans), m, n, kl, ku, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy);
+      CUDABlasTranspose(trans), m, n, kl, ku, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(x)), incx,
+      GpuComplex(&beta), GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
@@ -1206,8 +1202,8 @@ bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
                           float beta, DeviceMemory<float> *y, int incy) {
   return DoBlasInternal(
       wrap::cublasSgemv, stream, true /* = pointer_mode_host */,
-      CUDABlasTranspose(trans), m, n, &alpha, CUDAMemory(a), lda, CUDAMemory(x),
-      incx, &beta, CUDAMemoryMutable(y), incy);
+      CUDABlasTranspose(trans), m, n, &alpha, GpuMemory(a), lda, GpuMemory(x),
+      incx, &beta, GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
@@ -1216,8 +1212,8 @@ bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
                           double beta, DeviceMemory<double> *y, int incy) {
   return DoBlasInternal(
       wrap::cublasDgemv, stream, true /* = pointer_mode_host */,
-      CUDABlasTranspose(trans), m, n, &alpha, CUDAMemory(a), lda, CUDAMemory(x),
-      incx, &beta, CUDAMemoryMutable(y), incy);
+      CUDABlasTranspose(trans), m, n, &alpha, GpuMemory(a), lda, GpuMemory(x),
+      incx, &beta, GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
@@ -1228,9 +1224,9 @@ bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
                           DeviceMemory<std::complex<float>> *y, int incy) {
   return DoBlasInternal(
       wrap::cublasCgemv, stream, true /* = pointer_mode_host */,
-      CUDABlasTranspose(trans), m, n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy);
+      CUDABlasTranspose(trans), m, n, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(x)), incx,
+      GpuComplex(&beta), GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
@@ -1241,9 +1237,9 @@ bool CUDABlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
                           DeviceMemory<std::complex<double>> *y, int incy) {
   return DoBlasInternal(
       wrap::cublasZgemv, stream, true /* = pointer_mode_host */,
-      CUDABlasTranspose(trans), m, n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy);
+      CUDABlasTranspose(trans), m, n, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(x)), incx,
+      GpuComplex(&beta), GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasGer(Stream *stream, uint64 m, uint64 n, float alpha,
@@ -1252,7 +1248,7 @@ bool CUDABlas::DoBlasGer(Stream *stream, uint64 m, uint64 n, float alpha,
                          DeviceMemory<float> *a, int lda) {
   return DoBlasInternal(
       wrap::cublasSger, stream, true /* = pointer_mode_host */, m, n, &alpha,
-      CUDAMemory(x), incx, CUDAMemory(y), incy, CUDAMemoryMutable(a), lda);
+      GpuMemory(x), incx, GpuMemory(y), incy, GpuMemoryMutable(a), lda);
 }
 
 bool CUDABlas::DoBlasGer(Stream *stream, uint64 m, uint64 n, double alpha,
@@ -1261,7 +1257,7 @@ bool CUDABlas::DoBlasGer(Stream *stream, uint64 m, uint64 n, double alpha,
                          DeviceMemory<double> *a, int lda) {
   return DoBlasInternal(
       wrap::cublasDger, stream, true /* = pointer_mode_host */, m, n, &alpha,
-      CUDAMemory(x), incx, CUDAMemory(y), incy, CUDAMemoryMutable(a), lda);
+      GpuMemory(x), incx, GpuMemory(y), incy, GpuMemoryMutable(a), lda);
 }
 
 bool CUDABlas::DoBlasGerc(Stream *stream, uint64 m, uint64 n,
@@ -1271,8 +1267,8 @@ bool CUDABlas::DoBlasGerc(Stream *stream, uint64 m, uint64 n,
                           DeviceMemory<std::complex<float>> *a, int lda) {
   return DoBlasInternal(
       wrap::cublasCgerc, stream, true /* = pointer_mode_host */, m, n,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(CUDAMemory(y)), incy, CUDAComplex(CUDAMemoryMutable(a)), lda);
+      GpuComplex(&alpha), GpuComplex(GpuMemory(x)), incx,
+      GpuComplex(GpuMemory(y)), incy, GpuComplex(GpuMemoryMutable(a)), lda);
 }
 
 bool CUDABlas::DoBlasGerc(Stream *stream, uint64 m, uint64 n,
@@ -1282,8 +1278,8 @@ bool CUDABlas::DoBlasGerc(Stream *stream, uint64 m, uint64 n,
                           DeviceMemory<std::complex<double>> *a, int lda) {
   return DoBlasInternal(
       wrap::cublasZgerc, stream, true /* = pointer_mode_host */, m, n,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(CUDAMemory(y)), incy, CUDAComplex(CUDAMemoryMutable(a)), lda);
+      GpuComplex(&alpha), GpuComplex(GpuMemory(x)), incx,
+      GpuComplex(GpuMemory(y)), incy, GpuComplex(GpuMemoryMutable(a)), lda);
 }
 
 bool CUDABlas::DoBlasGeru(Stream *stream, uint64 m, uint64 n,
@@ -1293,8 +1289,8 @@ bool CUDABlas::DoBlasGeru(Stream *stream, uint64 m, uint64 n,
                           DeviceMemory<std::complex<float>> *a, int lda) {
   return DoBlasInternal(
       wrap::cublasCgeru, stream, true /* = pointer_mode_host */, m, n,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(CUDAMemory(y)), incy, CUDAComplex(CUDAMemoryMutable(a)), lda);
+      GpuComplex(&alpha), GpuComplex(GpuMemory(x)), incx,
+      GpuComplex(GpuMemory(y)), incy, GpuComplex(GpuMemoryMutable(a)), lda);
 }
 
 bool CUDABlas::DoBlasGeru(Stream *stream, uint64 m, uint64 n,
@@ -1304,8 +1300,8 @@ bool CUDABlas::DoBlasGeru(Stream *stream, uint64 m, uint64 n,
                           DeviceMemory<std::complex<double>> *a, int lda) {
   return DoBlasInternal(
       wrap::cublasZgeru, stream, true /* = pointer_mode_host */, m, n,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(CUDAMemory(y)), incy, CUDAComplex(CUDAMemoryMutable(a)), lda);
+      GpuComplex(&alpha), GpuComplex(GpuMemory(x)), incx,
+      GpuComplex(GpuMemory(y)), incy, GpuComplex(GpuMemoryMutable(a)), lda);
 }
 
 bool CUDABlas::DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1316,9 +1312,9 @@ bool CUDABlas::DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                           DeviceMemory<std::complex<float>> *y, int incy) {
   return DoBlasInternal(
       wrap::cublasChbmv, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, k, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy);
+      CUDABlasUpperLower(uplo), n, k, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(x)), incx,
+      GpuComplex(&beta), GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1329,9 +1325,9 @@ bool CUDABlas::DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                           DeviceMemory<std::complex<double>> *y, int incy) {
   return DoBlasInternal(
       wrap::cublasZhbmv, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, k, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy);
+      CUDABlasUpperLower(uplo), n, k, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(x)), incx,
+      GpuComplex(&beta), GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1342,9 +1338,9 @@ bool CUDABlas::DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n,
                           DeviceMemory<std::complex<float>> *y, int incy) {
   return DoBlasInternal(
       wrap::cublasChemv, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy);
+      CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(x)), incx,
+      GpuComplex(&beta), GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1355,9 +1351,9 @@ bool CUDABlas::DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n,
                           DeviceMemory<std::complex<double>> *y, int incy) {
   return DoBlasInternal(
       wrap::cublasZhemv, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy);
+      CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(x)), incx,
+      GpuComplex(&beta), GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1366,8 +1362,8 @@ bool CUDABlas::DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n,
                          DeviceMemory<std::complex<float>> *a, int lda) {
   return DoBlasInternal(
       wrap::cublasCher, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, &alpha, CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(CUDAMemoryMutable(a)), lda);
+      CUDABlasUpperLower(uplo), n, &alpha, GpuComplex(GpuMemory(x)), incx,
+      GpuComplex(GpuMemoryMutable(a)), lda);
 }
 
 bool CUDABlas::DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1376,8 +1372,8 @@ bool CUDABlas::DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n,
                          DeviceMemory<std::complex<double>> *a, int lda) {
   return DoBlasInternal(
       wrap::cublasZher, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, &alpha, CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(CUDAMemoryMutable(a)), lda);
+      CUDABlasUpperLower(uplo), n, &alpha, GpuComplex(GpuMemory(x)), incx,
+      GpuComplex(GpuMemoryMutable(a)), lda);
 }
 
 bool CUDABlas::DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1387,9 +1383,9 @@ bool CUDABlas::DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n,
                           DeviceMemory<std::complex<float>> *a, int lda) {
   return DoBlasInternal(
       wrap::cublasCher2, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy,
-      CUDAComplex(CUDAMemoryMutable(a)), lda);
+      CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(x)), incx, GpuComplex(GpuMemory(y)), incy,
+      GpuComplex(GpuMemoryMutable(a)), lda);
 }
 
 bool CUDABlas::DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1399,9 +1395,9 @@ bool CUDABlas::DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n,
                           DeviceMemory<std::complex<double>> *a, int lda) {
   return DoBlasInternal(
       wrap::cublasZher2, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy,
-      CUDAComplex(CUDAMemoryMutable(a)), lda);
+      CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(x)), incx, GpuComplex(GpuMemory(y)), incy,
+      GpuComplex(GpuMemoryMutable(a)), lda);
 }
 
 bool CUDABlas::DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1412,9 +1408,9 @@ bool CUDABlas::DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                           DeviceMemory<std::complex<float>> *y, int incy) {
   return DoBlasInternal(
       wrap::cublasChpmv, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(ap)), CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy);
+      CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(ap)), GpuComplex(GpuMemory(x)), incx,
+      GpuComplex(&beta), GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1425,9 +1421,9 @@ bool CUDABlas::DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                           DeviceMemory<std::complex<double>> *y, int incy) {
   return DoBlasInternal(
       wrap::cublasZhpmv, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(ap)), CUDAComplex(CUDAMemory(x)), incx,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(y)), incy);
+      CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(ap)), GpuComplex(GpuMemory(x)), incx,
+      GpuComplex(&beta), GpuComplex(GpuMemoryMutable(y)), incy);
 }
 
 bool CUDABlas::DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1436,8 +1432,8 @@ bool CUDABlas::DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n,
                          DeviceMemory<std::complex<float>> *ap) {
   return DoBlasInternal(
       wrap::cublasChpr, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemoryMutable(ap)));
+      CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(x)), incx, GpuComplex(GpuMemoryMutable(ap)));
 }
 
 bool CUDABlas::DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1446,8 +1442,8 @@ bool CUDABlas::DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n,
                          DeviceMemory<std::complex<double>> *ap) {
   return DoBlasInternal(
       wrap::cublasZhpr, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemoryMutable(ap)));
+      CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(x)), incx, GpuComplex(GpuMemoryMutable(ap)));
 }
 
 bool CUDABlas::DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1457,9 +1453,9 @@ bool CUDABlas::DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
                           DeviceMemory<std::complex<float>> *ap) {
   return DoBlasInternal(
       wrap::cublasChpr2, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy,
-      CUDAComplex(CUDAMemoryMutable(ap)));
+      CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(x)), incx, GpuComplex(GpuMemory(y)), incy,
+      GpuComplex(GpuMemoryMutable(ap)));
 }
 
 bool CUDABlas::DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1469,9 +1465,9 @@ bool CUDABlas::DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
                           DeviceMemory<std::complex<double>> *ap) {
   return DoBlasInternal(
       wrap::cublasZhpr2, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(x)), incx, CUDAComplex(CUDAMemory(y)), incy,
-      CUDAComplex(CUDAMemoryMutable(ap)));
+      CUDABlasUpperLower(uplo), n, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(x)), incx, GpuComplex(GpuMemory(y)), incy,
+      GpuComplex(GpuMemoryMutable(ap)));
 }
 
 bool CUDABlas::DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1480,8 +1476,8 @@ bool CUDABlas::DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                           float beta, DeviceMemory<float> *y, int incy) {
   return DoBlasInternal(
       wrap::cublasSsbmv, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, k, &alpha, CUDAMemory(a), lda, CUDAMemory(x),
-      incx, &beta, CUDAMemoryMutable(y), incy);
+      CUDABlasUpperLower(uplo), n, k, &alpha, GpuMemory(a), lda, GpuMemory(x),
+      incx, &beta, GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1490,8 +1486,8 @@ bool CUDABlas::DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                           double beta, DeviceMemory<double> *y, int incy) {
   return DoBlasInternal(
       wrap::cublasDsbmv, stream, true /* = pointer_mode_host */,
-      CUDABlasUpperLower(uplo), n, k, &alpha, CUDAMemory(a), lda, CUDAMemory(x),
-      incx, &beta, CUDAMemoryMutable(y), incy);
+      CUDABlasUpperLower(uplo), n, k, &alpha, GpuMemory(a), lda, GpuMemory(x),
+      incx, &beta, GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1500,8 +1496,8 @@ bool CUDABlas::DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                           DeviceMemory<float> *y, int incy) {
   return DoBlasInternal(wrap::cublasSspmv, stream,
                         true /* = pointer_mode_host */,
-                        CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(ap),
-                        CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy);
+                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(ap),
+                        GpuMemory(x), incx, &beta, GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1510,8 +1506,8 @@ bool CUDABlas::DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
                           DeviceMemory<double> *y, int incy) {
   return DoBlasInternal(wrap::cublasDspmv, stream,
                         true /* = pointer_mode_host */,
-                        CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(ap),
-                        CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy);
+                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(ap),
+                        GpuMemory(x), incx, &beta, GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1519,8 +1515,8 @@ bool CUDABlas::DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n,
                          DeviceMemory<float> *ap) {
   return DoBlasInternal(wrap::cublasSspr, stream,
                         true /* = pointer_mode_host */,
-                        CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x),
-                        incx, CUDAMemoryMutable(ap));
+                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x),
+                        incx, GpuMemoryMutable(ap));
 }
 
 bool CUDABlas::DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1528,8 +1524,8 @@ bool CUDABlas::DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n,
                          DeviceMemory<double> *ap) {
   return DoBlasInternal(wrap::cublasDspr, stream,
                         true /* = pointer_mode_host */,
-                        CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x),
-                        incx, CUDAMemoryMutable(ap));
+                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x),
+                        incx, GpuMemoryMutable(ap));
 }
 
 bool CUDABlas::DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1538,8 +1534,8 @@ bool CUDABlas::DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
                           DeviceMemory<float> *ap) {
   return DoBlasInternal(wrap::cublasSspr2, stream,
                         true /* = pointer_mode_host */,
-                        CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x),
-                        incx, CUDAMemory(y), incy, CUDAMemoryMutable(ap));
+                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x),
+                        incx, GpuMemory(y), incy, GpuMemoryMutable(ap));
 }
 
 bool CUDABlas::DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1548,8 +1544,8 @@ bool CUDABlas::DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
                           DeviceMemory<double> *ap) {
   return DoBlasInternal(wrap::cublasDspr2, stream,
                         true /* = pointer_mode_host */,
-                        CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x),
-                        incx, CUDAMemory(y), incy, CUDAMemoryMutable(ap));
+                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x),
+                        incx, GpuMemory(y), incy, GpuMemoryMutable(ap));
 }
 
 bool CUDABlas::DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1558,8 +1554,8 @@ bool CUDABlas::DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n,
                           DeviceMemory<float> *y, int incy) {
   return DoBlasInternal(wrap::cublasSsymv, stream,
                         true /* = pointer_mode_host */,
-                        CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(a), lda,
-                        CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy);
+                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(a), lda,
+                        GpuMemory(x), incx, &beta, GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1568,8 +1564,8 @@ bool CUDABlas::DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n,
                           DeviceMemory<double> *y, int incy) {
   return DoBlasInternal(wrap::cublasDsymv, stream,
                         true /* = pointer_mode_host */,
-                        CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(a), lda,
-                        CUDAMemory(x), incx, &beta, CUDAMemoryMutable(y), incy);
+                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(a), lda,
+                        GpuMemory(x), incx, &beta, GpuMemoryMutable(y), incy);
 }
 
 bool CUDABlas::DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1577,8 +1573,8 @@ bool CUDABlas::DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n,
                          DeviceMemory<float> *a, int lda) {
   return DoBlasInternal(wrap::cublasSsyr, stream,
                         true /* = pointer_mode_host */,
-                        CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x),
-                        incx, CUDAMemoryMutable(a), lda);
+                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x),
+                        incx, GpuMemoryMutable(a), lda);
 }
 
 bool CUDABlas::DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1586,8 +1582,8 @@ bool CUDABlas::DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n,
                          DeviceMemory<double> *a, int lda) {
   return DoBlasInternal(wrap::cublasDsyr, stream,
                         true /* = pointer_mode_host */,
-                        CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x),
-                        incx, CUDAMemoryMutable(a), lda);
+                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x),
+                        incx, GpuMemoryMutable(a), lda);
 }
 
 bool CUDABlas::DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1596,8 +1592,8 @@ bool CUDABlas::DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n,
                           DeviceMemory<float> *a, int lda) {
   return DoBlasInternal(wrap::cublasSsyr2, stream,
                         true /* = pointer_mode_host */,
-                        CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x),
-                        incx, CUDAMemory(y), incy, CUDAMemoryMutable(a), lda);
+                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x),
+                        incx, GpuMemory(y), incy, GpuMemoryMutable(a), lda);
 }
 
 bool CUDABlas::DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n,
@@ -1606,8 +1602,8 @@ bool CUDABlas::DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n,
                           DeviceMemory<double> *a, int lda) {
   return DoBlasInternal(wrap::cublasDsyr2, stream,
                         true /* = pointer_mode_host */,
-                        CUDABlasUpperLower(uplo), n, &alpha, CUDAMemory(x),
-                        incx, CUDAMemory(y), incy, CUDAMemoryMutable(a), lda);
+                        CUDABlasUpperLower(uplo), n, &alpha, GpuMemory(x),
+                        incx, GpuMemory(y), incy, GpuMemoryMutable(a), lda);
 }
 
 bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
@@ -1617,8 +1613,8 @@ bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasStbmv, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, k, CUDAMemory(a), lda,
-                        CUDAMemoryMutable(x), incx);
+                        CUDABlasDiagonal(diag), n, k, GpuMemory(a), lda,
+                        GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
@@ -1628,8 +1624,8 @@ bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasDtbmv, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, k, CUDAMemory(a), lda,
-                        CUDAMemoryMutable(x), incx);
+                        CUDABlasDiagonal(diag), n, k, GpuMemory(a), lda,
+                        GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
@@ -1640,8 +1636,8 @@ bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(
       wrap::cublasCtbmv, stream, true /* = pointer_mode_host */,
       CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-      CUDABlasDiagonal(diag), n, k, CUDAComplex(CUDAMemory(a)), lda,
-      CUDAComplex(CUDAMemoryMutable(x)), incx);
+      CUDABlasDiagonal(diag), n, k, GpuComplex(GpuMemory(a)), lda,
+      GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
@@ -1652,8 +1648,8 @@ bool CUDABlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(
       wrap::cublasZtbmv, stream, true /* = pointer_mode_host */,
       CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-      CUDABlasDiagonal(diag), n, k, CUDAComplex(CUDAMemory(a)), lda,
-      CUDAComplex(CUDAMemoryMutable(x)), incx);
+      CUDABlasDiagonal(diag), n, k, GpuComplex(GpuMemory(a)), lda,
+      GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
@@ -1663,8 +1659,8 @@ bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasStbsv, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, k, CUDAMemory(a), lda,
-                        CUDAMemoryMutable(x), incx);
+                        CUDABlasDiagonal(diag), n, k, GpuMemory(a), lda,
+                        GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
@@ -1674,8 +1670,8 @@ bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasDtbsv, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, k, CUDAMemory(a), lda,
-                        CUDAMemoryMutable(x), incx);
+                        CUDABlasDiagonal(diag), n, k, GpuMemory(a), lda,
+                        GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
@@ -1686,8 +1682,8 @@ bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(
       wrap::cublasCtbsv, stream, true /* = pointer_mode_host */,
       CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-      CUDABlasDiagonal(diag), n, k, CUDAComplex(CUDAMemory(a)), lda,
-      CUDAComplex(CUDAMemoryMutable(x)), incx);
+      CUDABlasDiagonal(diag), n, k, GpuComplex(GpuMemory(a)), lda,
+      GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
@@ -1698,8 +1694,8 @@ bool CUDABlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(
       wrap::cublasZtbsv, stream, true /* = pointer_mode_host */,
       CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-      CUDABlasDiagonal(diag), n, k, CUDAComplex(CUDAMemory(a)), lda,
-      CUDAComplex(CUDAMemoryMutable(x)), incx);
+      CUDABlasDiagonal(diag), n, k, GpuComplex(GpuMemory(a)), lda,
+      GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
@@ -1709,7 +1705,7 @@ bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(
       wrap::cublasStpmv, stream, true /* = pointer_mode_host */,
       CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-      CUDABlasDiagonal(diag), n, CUDAMemory(ap), CUDAMemoryMutable(x), incx);
+      CUDABlasDiagonal(diag), n, GpuMemory(ap), GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
@@ -1719,7 +1715,7 @@ bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(
       wrap::cublasDtpmv, stream, true /* = pointer_mode_host */,
       CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-      CUDABlasDiagonal(diag), n, CUDAMemory(ap), CUDAMemoryMutable(x), incx);
+      CUDABlasDiagonal(diag), n, GpuMemory(ap), GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
@@ -1729,8 +1725,8 @@ bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasCtpmv, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(ap)),
-                        CUDAComplex(CUDAMemoryMutable(x)), incx);
+                        CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(ap)),
+                        GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
@@ -1740,8 +1736,8 @@ bool CUDABlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasZtpmv, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(ap)),
-                        CUDAComplex(CUDAMemoryMutable(x)), incx);
+                        CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(ap)),
+                        GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
@@ -1751,7 +1747,7 @@ bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(
       wrap::cublasStpsv, stream, true /* = pointer_mode_host */,
       CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-      CUDABlasDiagonal(diag), n, CUDAMemory(ap), CUDAMemoryMutable(x), incx);
+      CUDABlasDiagonal(diag), n, GpuMemory(ap), GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
@@ -1761,7 +1757,7 @@ bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(
       wrap::cublasDtpsv, stream, true /* = pointer_mode_host */,
       CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-      CUDABlasDiagonal(diag), n, CUDAMemory(ap), CUDAMemoryMutable(x), incx);
+      CUDABlasDiagonal(diag), n, GpuMemory(ap), GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
@@ -1771,8 +1767,8 @@ bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasCtpsv, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(ap)),
-                        CUDAComplex(CUDAMemoryMutable(x)), incx);
+                        CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(ap)),
+                        GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
@@ -1782,8 +1778,8 @@ bool CUDABlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasZtpsv, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(ap)),
-                        CUDAComplex(CUDAMemoryMutable(x)), incx);
+                        CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(ap)),
+                        GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
@@ -1793,8 +1789,8 @@ bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasStrmv, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, CUDAMemory(a), lda,
-                        CUDAMemoryMutable(x), incx);
+                        CUDABlasDiagonal(diag), n, GpuMemory(a), lda,
+                        GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
@@ -1804,8 +1800,8 @@ bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasDtrmv, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, CUDAMemory(a), lda,
-                        CUDAMemoryMutable(x), incx);
+                        CUDABlasDiagonal(diag), n, GpuMemory(a), lda,
+                        GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
@@ -1815,8 +1811,8 @@ bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasCtrmv, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(a)),
-                        lda, CUDAComplex(CUDAMemoryMutable(x)), incx);
+                        CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(a)),
+                        lda, GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
@@ -1826,8 +1822,8 @@ bool CUDABlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasZtrmv, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(a)),
-                        lda, CUDAComplex(CUDAMemoryMutable(x)), incx);
+                        CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(a)),
+                        lda, GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
@@ -1837,8 +1833,8 @@ bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasStrsv, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, CUDAMemory(a), lda,
-                        CUDAMemoryMutable(x), incx);
+                        CUDABlasDiagonal(diag), n, GpuMemory(a), lda,
+                        GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
@@ -1848,8 +1844,8 @@ bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasDtrsv, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, CUDAMemory(a), lda,
-                        CUDAMemoryMutable(x), incx);
+                        CUDABlasDiagonal(diag), n, GpuMemory(a), lda,
+                        GpuMemoryMutable(x), incx);
 }
 
 bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
@@ -1859,8 +1855,8 @@ bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasCtrsv, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(a)),
-                        lda, CUDAComplex(CUDAMemoryMutable(x)), incx);
+                        CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(a)),
+                        lda, GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
@@ -1870,8 +1866,8 @@ bool CUDABlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasZtrsv, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans),
-                        CUDABlasDiagonal(diag), n, CUDAComplex(CUDAMemory(a)),
-                        lda, CUDAComplex(CUDAMemoryMutable(x)), incx);
+                        CUDABlasDiagonal(diag), n, GpuComplex(GpuMemory(a)),
+                        lda, GpuComplex(GpuMemoryMutable(x)), incx);
 }
 
 bool CUDABlas::DoBlasGemm(
@@ -1925,9 +1921,9 @@ bool CUDABlas::DoBlasGemm(
   return DoBlasInternalImpl(
       wrap::cublasSgemmEx, stream, true /* = pointer_mode_host */,
       true /* = err_on_failure= */, use_tensor_ops, CUDABlasTranspose(transa),
-      CUDABlasTranspose(transb), m, n, k, &alpha, CUDAMemory(a),
-      SE_CUDA_DATA_HALF, lda, CUDAMemory(b), SE_CUDA_DATA_HALF, ldb, &beta,
-      CUDAMemoryMutable(c), SE_CUDA_DATA_HALF, ldc);
+      CUDABlasTranspose(transb), m, n, k, &alpha, GpuMemory(a),
+      SE_CUDA_DATA_HALF, lda, GpuMemory(b), SE_CUDA_DATA_HALF, ldb, &beta,
+      GpuMemoryMutable(c), SE_CUDA_DATA_HALF, ldc);
 
 #else
   LOG(ERROR) << "fp16 sgemm is not implemented in this cuBLAS version "
@@ -1972,7 +1968,7 @@ bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
   return DoBlasInternal(
       wrap::cublasSgemm, stream, true /* = pointer_mode_host */,
       CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha,
-      CUDAMemory(a), lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc);
+      GpuMemory(a), lda, GpuMemory(b), ldb, &beta, GpuMemoryMutable(c), ldc);
 }
 
 bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
@@ -1983,7 +1979,7 @@ bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
   return DoBlasInternal(
       wrap::cublasDgemm, stream, true /* = pointer_mode_host */,
       CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha,
-      CUDAMemory(a), lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc);
+      GpuMemory(a), lda, GpuMemory(b), ldb, &beta, GpuMemoryMutable(c), ldc);
 }
 
 bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
@@ -1996,9 +1992,9 @@ bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
   return DoBlasInternal(
       wrap::cublasCgemm, stream, true /* = pointer_mode_host */,
       CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda,
-      CUDAComplex(CUDAMemory(b)), ldb, CUDAComplex(&beta),
-      CUDAComplex(CUDAMemoryMutable(c)), ldc);
+      GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
+      GpuComplex(GpuMemory(b)), ldb, GpuComplex(&beta),
+      GpuComplex(GpuMemoryMutable(c)), ldc);
 }
 
 bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
@@ -2011,9 +2007,9 @@ bool CUDABlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
   return DoBlasInternal(
       wrap::cublasZgemm, stream, true /* = pointer_mode_host */,
       CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda,
-      CUDAComplex(CUDAMemory(b)), ldb, CUDAComplex(&beta),
-      CUDAComplex(CUDAMemoryMutable(c)), ldc);
+      GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
+      GpuComplex(GpuMemory(b)), ldb, GpuComplex(&beta),
+      GpuComplex(GpuMemoryMutable(c)), ldc);
 }
 
 bool CUDABlas::DoBlasGemvWithProfiling(
@@ -2120,10 +2116,10 @@ bool CUDABlas::DoBlasGemvWithProfilingImpl(
     const DeviceMemory<T> &a, int lda, const DeviceMemory<T> &x, int incx,
     const T &beta, DeviceMemory<T> *y, int incy,
     blas::ProfileResult *output_profile_result) {
-  std::unique_ptr<CUDATimer, TimerDeleter> timer;
+  std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
   if (output_profile_result != nullptr) {
-    timer.reset(new CUDATimer(parent_));
-    if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
+    timer.reset(new GpuTimer(parent_));
+    if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
       return false;
     }
   }
@@ -2133,9 +2129,9 @@ bool CUDABlas::DoBlasGemvWithProfilingImpl(
       DoBlasGemv(stream, trans, m, n, alpha, a, lda, x, incx, beta, y, incy);
 
   if (timer != nullptr && result) {
-    // CUDATimer will CHECK-fail if we Stop() it while the stream is in an error
+    // GpuTimer will CHECK-fail if we Stop() it while the stream is in an error
     // state.
-    if (!timer->Stop(AsCUDAStream(stream))) {
+    if (!timer->Stop(AsGpuStream(stream))) {
       return false;
     }
     output_profile_result->set_is_valid(true);
@@ -2152,10 +2148,10 @@ bool CUDABlas::DoBlasGemmWithProfilingImpl(
     uint64 n, uint64 k, const ParamType &alpha, const DeviceMemory<T> &a,
     int lda, const DeviceMemory<T> &b, int ldb, const ParamType &beta,
     DeviceMemory<T> *c, int ldc, blas::ProfileResult *output_profile_result) {
-  std::unique_ptr<CUDATimer, TimerDeleter> timer;
+  std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
   if (output_profile_result != nullptr) {
-    timer.reset(new CUDATimer(parent_));
-    if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
+    timer.reset(new GpuTimer(parent_));
+    if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
       return false;
     }
   }
@@ -2165,9 +2161,9 @@ bool CUDABlas::DoBlasGemmWithProfilingImpl(
                            ldb, beta, c, ldc);
 
   if (timer != nullptr && result) {
-    // CUDATimer will CHECK-fail if we Stop() it while the stream is in an error
+    // GpuTimer will CHECK-fail if we Stop() it while the stream is in an error
     // state.
-    if (!timer->Stop(AsCUDAStream(stream))) {
+    if (!timer->Stop(AsGpuStream(stream))) {
       return false;
     }
     output_profile_result->set_is_valid(true);
@@ -2242,13 +2238,13 @@ bool CUDABlas::DoBlasGemmWithAlgorithmImpl(
     return false;
   }
 
-  std::unique_ptr<CUDATimer, TimerDeleter> timer;
+  std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
   if (output_profile_result != nullptr) {
-    timer.reset(new CUDATimer(parent_));
-    if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
+    timer.reset(new GpuTimer(parent_));
+    if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
       VLOG(2) << "DoBlasGemmWithAlgorithm returning false because "
                  "output_profile_result was given, but we were unable to "
-                 "create a CUDATimer.";
+                 "create a GpuTimer.";
       return false;
     }
   }
@@ -2274,19 +2270,19 @@ bool CUDABlas::DoBlasGemmWithAlgorithmImpl(
   bool result = DoBlasInternalFailureOK(
       wrap::cublasGemmEx, stream, /* pointer_mode_host = */ !alpha.is_pointer(),
       CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
-      alpha.is_pointer() ? CUDAMemory(alpha.pointer()) : &alpha.value(),
-      CUDAMemory(a), cuda_in_type, lda, CUDAMemory(b), cuda_in_type, ldb,
-      beta.is_pointer() ? CUDAMemory(beta.pointer()) : &beta.value(),
-      CUDAMemoryMutable(c), CUDADataType<OutT>::type, ldc,
+      alpha.is_pointer() ? GpuMemory(alpha.pointer()) : &alpha.value(),
+      GpuMemory(a), cuda_in_type, lda, GpuMemory(b), cuda_in_type, ldb,
+      beta.is_pointer() ? GpuMemory(beta.pointer()) : &beta.value(),
+      GpuMemoryMutable(c), CUDADataType<OutT>::type, ldc,
       CUDAComputationType(computation_type),
       static_cast<cublasGemmAlgo_t>(algorithm));
 
   if (timer != nullptr && result) {
-    // CUDATimer will CHECK-fail if we Stop() it while the stream is in an error
+    // GpuTimer will CHECK-fail if we Stop() it while the stream is in an error
     // state.
-    if (!timer->Stop(AsCUDAStream(stream))) {
+    if (!timer->Stop(AsGpuStream(stream))) {
       VLOG(2) << "DoBlasGemmWithAlgorithm returning false; unable to stop "
-                 "CUDATimer.";
+                 "GpuTimer.";
       return false;
     }
     output_profile_result->set_is_valid(true);
@@ -2474,7 +2470,7 @@ port::Status CUDABlas::DoBlasGemmBatchedInternal(
     c_raw_ptrs.push_back(static_cast<T *>(c_ptrs_to_wrappers[i]->opaque()));
   }
 
-  typedef typename HalfAsFloat<typename CUDAComplexT<T>::type>::type CUDA_T;
+  typedef typename HalfAsFloat<typename GpuComplexT<T>::type>::type CUDA_T;
 
   const size_t size = batch_count * sizeof(CUDA_T *);
 
@@ -2539,11 +2535,11 @@ port::Status CUDABlas::DoBlasGemmBatchedInternal(
     cudaDataType_t compute_type =
         (data_type == CUDA_R_16F ? CUDA_R_32F : data_type);
     const void **a_void_ptrs = reinterpret_cast<const void **>(
-        const_cast<const CUDA_T **>(CUDAMemory(a)));
+        const_cast<const CUDA_T **>(GpuMemory(a)));
     const void **b_void_ptrs = reinterpret_cast<const void **>(
-        const_cast<const CUDA_T **>(CUDAMemory(b)));
+        const_cast<const CUDA_T **>(GpuMemory(b)));
     void **c_void_ptrs =
-        reinterpret_cast<void **>(const_cast<CUDA_T **>(CUDAMemory(c)));
+        reinterpret_cast<void **>(const_cast<CUDA_T **>(GpuMemory(c)));
     bool ok;
     ok = DoBlasInternalImpl(
         wrap::cublasGemmBatchedEx, stream, true /* = pointer_mode_host */,
@@ -2563,9 +2559,9 @@ port::Status CUDABlas::DoBlasGemmBatchedInternal(
     bool ok = DoBlasInternal(
         cublas_func, stream, true /* = pointer_mode_host */,
         CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
-        CUDAComplex(&alpha), const_cast<const CUDA_T **>(CUDAMemory(a)), lda,
-        const_cast<const CUDA_T **>(CUDAMemory(b)), ldb, CUDAComplex(&beta),
-        const_cast<CUDA_T **>(CUDAMemory(c)), ldc, batch_count);
+        GpuComplex(&alpha), const_cast<const CUDA_T **>(GpuMemory(a)), lda,
+        const_cast<const CUDA_T **>(GpuMemory(b)), ldb, GpuComplex(&beta),
+        const_cast<CUDA_T **>(GpuMemory(c)), ldc, batch_count);
     if (ok) {
       return port::Status::OK();
     }
@@ -2697,8 +2693,8 @@ bool CUDABlas::DoBlasGemmStridedBatched(
           wrap::cublasGemmStridedBatchedEx, stream,
           true /* = pointer_mode_host */, true /* = err_on_failure */,
           use_tensor_ops, CUDABlasTranspose(transa), CUDABlasTranspose(transb),
-          m, n, k, &alpha, CUDAMemory(a), CUDA_R_16F, lda, stride_a,
-          CUDAMemory(b), CUDA_R_16F, ldb, stride_b, &beta, CUDAMemoryMutable(c),
+          m, n, k, &alpha, GpuMemory(a), CUDA_R_16F, lda, stride_a,
+          GpuMemory(b), CUDA_R_16F, ldb, stride_b, &beta, GpuMemoryMutable(c),
           CUDA_R_16F, ldc, stride_c, batch_count, CUDA_R_32F, algo);
       if (ok) {
         return true;
@@ -2712,11 +2708,11 @@ bool CUDABlas::DoBlasGemmStridedBatched(
   // Either CUDA_VERSION < 9.1 or SM < 5.0. Fall back to a loop.
   for (int batch = 0; batch < batch_count; ++batch) {
     const auto *a_matrix =
-        reinterpret_cast<const __half *>(CUDAMemory(a) + batch * stride_a);
+        reinterpret_cast<const __half *>(GpuMemory(a) + batch * stride_a);
     const auto *b_matrix =
-        reinterpret_cast<const __half *>(CUDAMemory(b) + batch * stride_b);
+        reinterpret_cast<const __half *>(GpuMemory(b) + batch * stride_b);
     auto *c_matrix =
-        reinterpret_cast<__half *>(CUDAMemoryMutable(c) + batch * stride_c);
+        reinterpret_cast<__half *>(GpuMemoryMutable(c) + batch * stride_c);
     bool ok = DoBlasInternalImpl(
         wrap::cublasSgemmEx, stream, true /* = pointer_mode_host */,
         true /* = err_on_failure= */, use_tensor_ops, CUDABlasTranspose(transa),
@@ -2740,8 +2736,8 @@ bool CUDABlas::DoBlasGemmStridedBatched(
   return DoBlasInternal(
       wrap::cublasSgemmStridedBatched, stream, true /* = pointer_mode_host */,
       CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha,
-      CUDAMemory(a), lda, stride_a, CUDAMemory(b), ldb, stride_b, &beta,
-      CUDAMemoryMutable(c), ldc, stride_c, batch_count);
+      GpuMemory(a), lda, stride_a, GpuMemory(b), ldb, stride_b, &beta,
+      GpuMemoryMutable(c), ldc, stride_c, batch_count);
 }
 
 bool CUDABlas::DoBlasGemmStridedBatched(
@@ -2753,8 +2749,8 @@ bool CUDABlas::DoBlasGemmStridedBatched(
   return DoBlasInternal(
       wrap::cublasDgemmStridedBatched, stream, true /* = pointer_mode_host */,
       CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k, &alpha,
-      CUDAMemory(a), lda, stride_a, CUDAMemory(b), ldb, stride_b, &beta,
-      CUDAMemoryMutable(c), ldc, stride_c, batch_count);
+      GpuMemory(a), lda, stride_a, GpuMemory(b), ldb, stride_b, &beta,
+      GpuMemoryMutable(c), ldc, stride_c, batch_count);
 }
 
 bool CUDABlas::DoBlasGemmStridedBatched(
@@ -2767,9 +2763,9 @@ bool CUDABlas::DoBlasGemmStridedBatched(
   return DoBlasInternal(
       wrap::cublasCgemmStridedBatched, stream, true /* = pointer_mode_host */,
       CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, stride_a,
-      CUDAComplex(CUDAMemory(b)), ldb, stride_b, CUDAComplex(&beta),
-      CUDAComplex(CUDAMemoryMutable(c)), ldc, stride_c, batch_count);
+      GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda, stride_a,
+      GpuComplex(GpuMemory(b)), ldb, stride_b, GpuComplex(&beta),
+      GpuComplex(GpuMemoryMutable(c)), ldc, stride_c, batch_count);
 }
 
 bool CUDABlas::DoBlasGemmStridedBatched(
@@ -2782,9 +2778,9 @@ bool CUDABlas::DoBlasGemmStridedBatched(
   return DoBlasInternal(
       wrap::cublasZgemmStridedBatched, stream, true /* = pointer_mode_host */,
       CUDABlasTranspose(transa), CUDABlasTranspose(transb), m, n, k,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, stride_a,
-      CUDAComplex(CUDAMemory(b)), ldb, stride_b, CUDAComplex(&beta),
-      CUDAComplex(CUDAMemoryMutable(c)), ldc, stride_c, batch_count);
+      GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda, stride_a,
+      GpuComplex(GpuMemory(b)), ldb, stride_b, GpuComplex(&beta),
+      GpuComplex(GpuMemoryMutable(c)), ldc, stride_c, batch_count);
 }
 
 bool CUDABlas::DoBlasHemm(Stream *stream, blas::Side side,
@@ -2796,9 +2792,9 @@ bool CUDABlas::DoBlasHemm(Stream *stream, blas::Side side,
                           DeviceMemory<std::complex<float>> *c, int ldc) {
   return DoBlasInternal(
       wrap::cublasChemm, stream, true /* = pointer_mode_host */,
-      CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(b)), ldb,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(c)), ldc);
+      CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(b)), ldb,
+      GpuComplex(&beta), GpuComplex(GpuMemoryMutable(c)), ldc);
 }
 
 bool CUDABlas::DoBlasHemm(Stream *stream, blas::Side side,
@@ -2810,9 +2806,9 @@ bool CUDABlas::DoBlasHemm(Stream *stream, blas::Side side,
                           DeviceMemory<std::complex<double>> *c, int ldc) {
   return DoBlasInternal(
       wrap::cublasZhemm, stream, true /* = pointer_mode_host */,
-      CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(b)), ldb,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(c)), ldc);
+      CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(b)), ldb,
+      GpuComplex(&beta), GpuComplex(GpuMemoryMutable(c)), ldc);
 }
 
 bool CUDABlas::DoBlasHerk(Stream *stream, blas::UpperLower uplo,
@@ -2824,8 +2820,8 @@ bool CUDABlas::DoBlasHerk(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasCherk, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
-                        k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda,
-                        &beta, CUDAComplex(CUDAMemoryMutable(c)), ldc);
+                        k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
+                        &beta, GpuComplex(GpuMemoryMutable(c)), ldc);
 }
 
 bool CUDABlas::DoBlasHerk(Stream *stream, blas::UpperLower uplo,
@@ -2837,8 +2833,8 @@ bool CUDABlas::DoBlasHerk(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasZherk, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
-                        k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda,
-                        &beta, CUDAComplex(CUDAMemoryMutable(c)), ldc);
+                        k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
+                        &beta, GpuComplex(GpuMemoryMutable(c)), ldc);
 }
 
 bool CUDABlas::DoBlasHer2k(Stream *stream, blas::UpperLower uplo,
@@ -2851,9 +2847,9 @@ bool CUDABlas::DoBlasHer2k(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasCher2k, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
-                        k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda,
-                        CUDAComplex(CUDAMemory(b)), ldb, &beta,
-                        CUDAComplex(CUDAMemoryMutable(c)), ldc);
+                        k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
+                        GpuComplex(GpuMemory(b)), ldb, &beta,
+                        GpuComplex(GpuMemoryMutable(c)), ldc);
 }
 
 bool CUDABlas::DoBlasHer2k(Stream *stream, blas::UpperLower uplo,
@@ -2866,9 +2862,9 @@ bool CUDABlas::DoBlasHer2k(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasZher2k, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
-                        k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda,
-                        CUDAComplex(CUDAMemory(b)), ldb, &beta,
-                        CUDAComplex(CUDAMemoryMutable(c)), ldc);
+                        k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
+                        GpuComplex(GpuMemory(b)), ldb, &beta,
+                        GpuComplex(GpuMemoryMutable(c)), ldc);
 }
 
 bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
@@ -2878,8 +2874,8 @@ bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
                           DeviceMemory<float> *c, int ldc) {
   return DoBlasInternal(
       wrap::cublasSsymm, stream, true /* = pointer_mode_host */,
-      CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, &alpha, CUDAMemory(a),
-      lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc);
+      CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, &alpha, GpuMemory(a),
+      lda, GpuMemory(b), ldb, &beta, GpuMemoryMutable(c), ldc);
 }
 
 bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
@@ -2889,8 +2885,8 @@ bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
                           DeviceMemory<double> *c, int ldc) {
   return DoBlasInternal(
       wrap::cublasDsymm, stream, true /* = pointer_mode_host */,
-      CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, &alpha, CUDAMemory(a),
-      lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc);
+      CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, &alpha, GpuMemory(a),
+      lda, GpuMemory(b), ldb, &beta, GpuMemoryMutable(c), ldc);
 }
 
 bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
@@ -2902,9 +2898,9 @@ bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
                           DeviceMemory<std::complex<float>> *c, int ldc) {
   return DoBlasInternal(
       wrap::cublasCsymm, stream, true /* = pointer_mode_host */,
-      CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(b)), ldb,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(c)), ldc);
+      CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(b)), ldb,
+      GpuComplex(&beta), GpuComplex(GpuMemoryMutable(c)), ldc);
 }
 
 bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
@@ -2916,9 +2912,9 @@ bool CUDABlas::DoBlasSymm(Stream *stream, blas::Side side,
                           DeviceMemory<std::complex<double>> *c, int ldc) {
   return DoBlasInternal(
       wrap::cublasZsymm, stream, true /* = pointer_mode_host */,
-      CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemory(b)), ldb,
-      CUDAComplex(&beta), CUDAComplex(CUDAMemoryMutable(c)), ldc);
+      CUDABlasSide(side), CUDABlasUpperLower(uplo), m, n, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemory(b)), ldb,
+      GpuComplex(&beta), GpuComplex(GpuMemoryMutable(c)), ldc);
 }
 
 bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
@@ -2928,7 +2924,7 @@ bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(
       wrap::cublasSsyrk, stream, true /* = pointer_mode_host */,
       CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k, &alpha,
-      CUDAMemory(a), lda, &beta, CUDAMemoryMutable(c), ldc);
+      GpuMemory(a), lda, &beta, GpuMemoryMutable(c), ldc);
 }
 
 bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
@@ -2938,7 +2934,7 @@ bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(
       wrap::cublasDsyrk, stream, true /* = pointer_mode_host */,
       CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k, &alpha,
-      CUDAMemory(a), lda, &beta, CUDAMemoryMutable(c), ldc);
+      GpuMemory(a), lda, &beta, GpuMemoryMutable(c), ldc);
 }
 
 bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
@@ -2950,8 +2946,8 @@ bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(
       wrap::cublasCsyrk, stream, true /* = pointer_mode_host */,
       CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(&beta),
-      CUDAComplex(CUDAMemoryMutable(c)), ldc);
+      GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda, GpuComplex(&beta),
+      GpuComplex(GpuMemoryMutable(c)), ldc);
 }
 
 bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
@@ -2963,8 +2959,8 @@ bool CUDABlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(
       wrap::cublasZsyrk, stream, true /* = pointer_mode_host */,
       CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k,
-      CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(&beta),
-      CUDAComplex(CUDAMemoryMutable(c)), ldc);
+      GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda, GpuComplex(&beta),
+      GpuComplex(GpuMemoryMutable(c)), ldc);
 }
 
 bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
@@ -2975,7 +2971,7 @@ bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(
       wrap::cublasSsyr2k, stream, true /* = pointer_mode_host */,
       CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k, &alpha,
-      CUDAMemory(a), lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc);
+      GpuMemory(a), lda, GpuMemory(b), ldb, &beta, GpuMemoryMutable(c), ldc);
 }
 
 bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
@@ -2986,7 +2982,7 @@ bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(
       wrap::cublasDsyr2k, stream, true /* = pointer_mode_host */,
       CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n, k, &alpha,
-      CUDAMemory(a), lda, CUDAMemory(b), ldb, &beta, CUDAMemoryMutable(c), ldc);
+      GpuMemory(a), lda, GpuMemory(b), ldb, &beta, GpuMemoryMutable(c), ldc);
 }
 
 bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
@@ -2999,9 +2995,9 @@ bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasCsyr2k, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
-                        k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda,
-                        CUDAComplex(CUDAMemory(b)), ldb, CUDAComplex(&beta),
-                        CUDAComplex(CUDAMemoryMutable(c)), ldc);
+                        k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
+                        GpuComplex(GpuMemory(b)), ldb, GpuComplex(&beta),
+                        GpuComplex(GpuMemoryMutable(c)), ldc);
 }
 
 bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
@@ -3014,9 +3010,9 @@ bool CUDABlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
   return DoBlasInternal(wrap::cublasZsyr2k, stream,
                         true /* = pointer_mode_host */,
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(trans), n,
-                        k, CUDAComplex(&alpha), CUDAComplex(CUDAMemory(a)), lda,
-                        CUDAComplex(CUDAMemory(b)), ldb, CUDAComplex(&beta),
-                        CUDAComplex(CUDAMemoryMutable(c)), ldc);
+                        k, GpuComplex(&alpha), GpuComplex(GpuMemory(a)), lda,
+                        GpuComplex(GpuMemory(b)), ldb, GpuComplex(&beta),
+                        GpuComplex(GpuMemoryMutable(c)), ldc);
 }
 
 bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
@@ -3027,8 +3023,8 @@ bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
   return DoBlasInternal(
       wrap::cublasStrmm, stream, true /* = pointer_mode_host */,
       CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa),
-      CUDABlasDiagonal(diag), m, n, &alpha, CUDAMemory(a), lda,
-      CUDAMemoryMutable(b), ldb, CUDAMemoryMutable(b), ldb);
+      CUDABlasDiagonal(diag), m, n, &alpha, GpuMemory(a), lda,
+      GpuMemoryMutable(b), ldb, GpuMemoryMutable(b), ldb);
 }
 
 bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
@@ -3039,8 +3035,8 @@ bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
   return DoBlasInternal(
       wrap::cublasDtrmm, stream, true /* = pointer_mode_host */,
       CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa),
-      CUDABlasDiagonal(diag), m, n, &alpha, CUDAMemory(a), lda,
-      CUDAMemoryMutable(b), ldb, CUDAMemoryMutable(b), ldb);
+      CUDABlasDiagonal(diag), m, n, &alpha, GpuMemory(a), lda,
+      GpuMemoryMutable(b), ldb, GpuMemoryMutable(b), ldb);
 }
 
 bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
@@ -3052,9 +3048,9 @@ bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
   return DoBlasInternal(
       wrap::cublasCtrmm, stream, true /* = pointer_mode_host */,
       CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa),
-      CUDABlasDiagonal(diag), m, n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemoryMutable(b)), ldb,
-      CUDAComplex(CUDAMemoryMutable(b)), ldb);
+      CUDABlasDiagonal(diag), m, n, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemoryMutable(b)), ldb,
+      GpuComplex(GpuMemoryMutable(b)), ldb);
 }
 
 bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
@@ -3066,9 +3062,9 @@ bool CUDABlas::DoBlasTrmm(Stream *stream, blas::Side side,
   return DoBlasInternal(
       wrap::cublasZtrmm, stream, true /* = pointer_mode_host */,
       CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa),
-      CUDABlasDiagonal(diag), m, n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemoryMutable(b)), ldb,
-      CUDAComplex(CUDAMemoryMutable(b)), ldb);
+      CUDABlasDiagonal(diag), m, n, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemoryMutable(b)), ldb,
+      GpuComplex(GpuMemoryMutable(b)), ldb);
 }
 
 bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
@@ -3079,8 +3075,8 @@ bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
   return DoBlasInternal(wrap::cublasStrsm, stream,
                         true /* = pointer_mode_host */, CUDABlasSide(side),
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(transa),
-                        CUDABlasDiagonal(diag), m, n, &alpha, CUDAMemory(a),
-                        lda, CUDAMemoryMutable(b), ldb);
+                        CUDABlasDiagonal(diag), m, n, &alpha, GpuMemory(a),
+                        lda, GpuMemoryMutable(b), ldb);
 }
 
 bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
@@ -3091,8 +3087,8 @@ bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
   return DoBlasInternal(wrap::cublasDtrsm, stream,
                         true /* = pointer_mode_host */, CUDABlasSide(side),
                         CUDABlasUpperLower(uplo), CUDABlasTranspose(transa),
-                        CUDABlasDiagonal(diag), m, n, &alpha, CUDAMemory(a),
-                        lda, CUDAMemoryMutable(b), ldb);
+                        CUDABlasDiagonal(diag), m, n, &alpha, GpuMemory(a),
+                        lda, GpuMemoryMutable(b), ldb);
 }
 
 bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
@@ -3104,8 +3100,8 @@ bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
   return DoBlasInternal(
       wrap::cublasCtrsm, stream, true /* = pointer_mode_host */,
       CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa),
-      CUDABlasDiagonal(diag), m, n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemoryMutable(b)), ldb);
+      CUDABlasDiagonal(diag), m, n, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemoryMutable(b)), ldb);
 }
 
 bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
@@ -3117,19 +3113,19 @@ bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side,
   return DoBlasInternal(
       wrap::cublasZtrsm, stream, true /* = pointer_mode_host */,
       CUDABlasSide(side), CUDABlasUpperLower(uplo), CUDABlasTranspose(transa),
-      CUDABlasDiagonal(diag), m, n, CUDAComplex(&alpha),
-      CUDAComplex(CUDAMemory(a)), lda, CUDAComplex(CUDAMemoryMutable(b)), ldb);
+      CUDABlasDiagonal(diag), m, n, GpuComplex(&alpha),
+      GpuComplex(GpuMemory(a)), lda, GpuComplex(GpuMemoryMutable(b)), ldb);
 }
 
-}  // namespace cuda
+}  // namespace gpu
 
 void initialize_cublas() {
   port::Status status =
       PluginRegistry::Instance()->RegisterFactory<PluginRegistry::BlasFactory>(
-          cuda::kCudaPlatformId, cuda::kCuBlasPlugin, "cuBLAS",
+          cuda::kCudaPlatformId, gpu::kCuBlasPlugin, "cuBLAS",
           [](internal::StreamExecutorInterface *parent) -> blas::BlasSupport * {
-            cuda::CUDAExecutor *cuda_executor =
-                dynamic_cast<cuda::CUDAExecutor *>(parent);
+            gpu::GpuExecutor *cuda_executor =
+                dynamic_cast<gpu::GpuExecutor *>(parent);
             if (cuda_executor == nullptr) {
               LOG(ERROR)
                   << "Attempting to initialize an instance of the cuBLAS "
@@ -3137,7 +3133,7 @@ void initialize_cublas() {
               return nullptr;
             }
 
-            cuda::CUDABlas *blas = new cuda::CUDABlas(cuda_executor);
+            gpu::CUDABlas *blas = new gpu::CUDABlas(cuda_executor);
             if (!blas->Init()) {
               // Note: Init() will log a more specific error.
               delete blas;
@@ -3152,7 +3148,7 @@ void initialize_cublas() {
   }
 
   PluginRegistry::Instance()->SetDefaultFactory(
-      cuda::kCudaPlatformId, PluginKind::kBlas, cuda::kCuBlasPlugin);
+      cuda::kCudaPlatformId, PluginKind::kBlas, gpu::kCuBlasPlugin);
 }
 
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_blas.h b/tensorflow/stream_executor/cuda/cuda_blas.h
index 0fb05089d7530aa298a332e4e6c714eddd7799e9..63d03056d911fe807617f0987e751825248ae607 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.h
+++ b/tensorflow/stream_executor/cuda/cuda_blas.h
@@ -33,26 +33,26 @@ namespace stream_executor {
 
 class Stream;
 
-namespace cuda {
+namespace gpu {
 
 // Opaque and unique identifier for the cuBLAS plugin.
 extern const PluginId kCuBlasPlugin;
 
-class CUDAExecutor;
+class GpuExecutor;
 
 // BLAS plugin for CUDA platform via cuBLAS library.
 //
 // This satisfies the platform-agnostic BlasSupport interface.
 //
 // Note that the cuBLAS handle that this encapsulates is implicitly tied to the
-// context (and, as a result, the device) that the parent CUDAExecutor is tied
+// context (and, as a result, the device) that the parent GpuExecutor is tied
 // to. This simply happens as an artifact of creating the cuBLAS handle when a
 // CUDA context is active.
 //
 // Thread-safe post-initialization.
 class CUDABlas : public blas::BlasSupport {
  public:
-  explicit CUDABlas(CUDAExecutor *parent);
+  explicit CUDABlas(GpuExecutor *parent);
 
   // Allocates a cuBLAS handle.
   bool Init();
@@ -145,9 +145,9 @@ class CUDABlas : public blas::BlasSupport {
   // mutex that guards the cuBLAS handle for this device.
   mutex mu_;
 
-  // CUDAExecutor which instantiated this CUDABlas.
+  // GpuExecutor which instantiated this CUDABlas.
   // Immutable post-initialization.
-  CUDAExecutor *parent_;
+  GpuExecutor *parent_;
 
   // cuBLAS library handle on the device.
   cublasHandle_t blas_ GUARDED_BY(mu_);
@@ -155,7 +155,7 @@ class CUDABlas : public blas::BlasSupport {
   SE_DISALLOW_COPY_AND_ASSIGN(CUDABlas);
 };
 
-}  // namespace cuda
+}  // namespace gpu
 }  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_BLAS_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
index 6af71b6c9d194182e79decd3f1beeb96d8141974..e58ebee80da613a63e00d7627abf4e8f8c99bc5b 100644
--- a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
+++ b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
@@ -52,13 +52,6 @@ limitations under the License.
 namespace stream_executor {
 namespace cuda {
 
-#ifdef __APPLE__
-static const CFStringRef kDriverKextIdentifier = CFSTR("com.nvidia.CUDA");
-#elif !defined(PLATFORM_WINDOWS)
-static const char *kDriverVersionPath = "/proc/driver/nvidia/version";
-#endif
-
-
 string DriverVersionToString(DriverVersion version) {
   return port::Printf("%d.%d.%d", std::get<0>(version), std::get<1>(version), std::get<2>(version));
 }
@@ -112,6 +105,18 @@ port::StatusOr<DriverVersion> StringToDriverVersion(const string &value) {
   return result;
 }
 
+}  // namespace cuda
+}  // namespace stream_executor
+
+namespace stream_executor {
+namespace gpu {
+
+#ifdef __APPLE__
+static const CFStringRef kDriverKextIdentifier = CFSTR("com.nvidia.CUDA");
+#elif !defined(PLATFORM_WINDOWS)
+static const char *kDriverVersionPath = "/proc/driver/nvidia/version";
+#endif
+
 // -- class Diagnostician
 
 string Diagnostician::GetDevNodePath(int dev_node_ordinal) {
@@ -190,11 +195,11 @@ void Diagnostician::LogDiagnosticInformation() {
   }
   port::StatusOr<DriverVersion> dso_version = FindDsoVersion();
   LOG(INFO) << "libcuda reported version is: "
-            << DriverVersionStatusToString(dso_version);
+            << cuda::DriverVersionStatusToString(dso_version);
 
   port::StatusOr<DriverVersion> kernel_version = FindKernelDriverVersion();
   LOG(INFO) << "kernel reported version is: "
-	  << DriverVersionStatusToString(kernel_version);
+            << cuda::DriverVersionStatusToString(kernel_version);
 #endif
 
   // OS X kernel driver does not report version accurately
@@ -232,7 +237,7 @@ port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() {
     }
     const size_t length = suffix_pos - start;
     const string version = path.substr(start, length);
-    result = StringToDriverVersion(version);
+    result = cuda::StringToDriverVersion(version);
   }
 #else
 #if !defined(PLATFORM_WINDOWS) && !defined(ANDROID_TEGRA)
@@ -260,7 +265,7 @@ port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() {
       // TODO(b/22689637): Eliminate the explicit namespace if possible.
       auto stripped_dso_version = port::StripSuffixString(dso_version, ".ld64");
       auto result = static_cast<port::StatusOr<DriverVersion> *>(data);
-      *result = StringToDriverVersion(stripped_dso_version);
+      *result = cuda::StringToDriverVersion(stripped_dso_version);
       return 1;
     }
     return 0;
@@ -292,7 +297,7 @@ port::StatusOr<DriverVersion> Diagnostician::FindKernelModuleVersion(
   // TODO(b/22689637): Eliminate the explicit namespace if possible.
   auto stripped_kernel_version =
       port::StripSuffixString(kernel_version, ".ld64");
-  return StringToDriverVersion(stripped_kernel_version);
+  return cuda::StringToDriverVersion(stripped_kernel_version);
 }
 
 void Diagnostician::WarnOnDsoKernelMismatch(
@@ -301,12 +306,12 @@ void Diagnostician::WarnOnDsoKernelMismatch(
   if (kernel_version.ok() && dso_version.ok() &&
       dso_version.ValueOrDie() == kernel_version.ValueOrDie()) {
     LOG(INFO) << "kernel version seems to match DSO: "
-              << DriverVersionToString(kernel_version.ValueOrDie());
+              << cuda::DriverVersionToString(kernel_version.ValueOrDie());
   } else {
     LOG(ERROR) << "kernel version "
-               << DriverVersionStatusToString(kernel_version)
+               << cuda::DriverVersionStatusToString(kernel_version)
                << " does not match DSO version "
-               << DriverVersionStatusToString(dso_version)
+               << cuda::DriverVersionStatusToString(dso_version)
                << " -- cannot find working devices in this configuration";
   }
 }
@@ -336,9 +341,9 @@ port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() {
     // see
     // https://developer.apple.com/library/mac/documentation/CoreFoundation/Conceptual/CFStrings/Articles/AccessingContents.html#//apple_ref/doc/uid/20001184-100980-TPXREF112
     if (version == NULL) {
-      return StringToDriverVersion("");
+      return cuda::StringToDriverVersion("");
     }
-    return StringToDriverVersion(version);
+    return cuda::StringToDriverVersion(version);
   }
   CFRelease(kext_infos);
   auto status = port::Status(
@@ -387,6 +392,5 @@ port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() {
 #endif
 }
 
-
-}  // namespace cuda
+}  // namespace gpu
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_diagnostics.h b/tensorflow/stream_executor/cuda/cuda_diagnostics.h
index f2db2eb20a18c671e055b910809dfde940a5e3f8..0837e136fd428570cb0d4ebddc85bedf66375f1a 100644
--- a/tensorflow/stream_executor/cuda/cuda_diagnostics.h
+++ b/tensorflow/stream_executor/cuda/cuda_diagnostics.h
@@ -16,17 +16,13 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DIAGNOSTICS_H_
 #define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DIAGNOSTICS_H_
 
-#include "tensorflow/stream_executor/platform/port.h"
-#include <tuple>
-
-#include "tensorflow/stream_executor/lib/statusor.h"
-#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/gpu/gpu_diagnostics.h"
 
 namespace stream_executor {
 namespace cuda {
 
 // e.g. DriverVersion{346, 3, 4}
-using DriverVersion = std::tuple<int, int, int>;
+using DriverVersion = gpu::DriverVersion;
 
 // Converts a parsed driver version to string form.
 string DriverVersionToString(DriverVersion version);
@@ -35,61 +31,9 @@ string DriverVersionToString(DriverVersion version);
 string DriverVersionStatusToString(port::StatusOr<DriverVersion> version);
 
 // Converts a string of a form like "331.79" to a DriverVersion{331, 79}.
-port::StatusOr<DriverVersion> StringToDriverVersion(const string &value);
-
-class Diagnostician {
- public:
-  // Logs diagnostic information when CUDA appears to be misconfigured (e.g. is
-  // not initializing).
-  //
-  // Note: if we're running on a machine that has no GPUs, we don't want to
-  // produce very much log spew beyond saying, "looks like there's no CUDA
-  // kernel
-  // module running".
-  //
-  // Note: we use non-Google-File:: API here because we may be called before
-  // InitGoogle has completed.
-  static void LogDiagnosticInformation();
-
-  // Given the driver version file contents, finds the kernel module version and
-  // returns it as a string.
-  //
-  // This is solely used for more informative log messages when the user is
-  // running on a machine that happens to have a libcuda/kernel driver mismatch.
-  static port::StatusOr<DriverVersion> FindKernelModuleVersion(
-      const string &driver_version_file_contents);
-
-  // Extracts the kernel driver version from the current host.
-  static port::StatusOr<DriverVersion> FindKernelDriverVersion();
-
-  // Iterates through loaded DSOs with DlIteratePhdrCallback to find the
-  // driver-interfacing DSO version number. Returns it as a string.
-  static port::StatusOr<DriverVersion> FindDsoVersion();
-
-  // Logs information about the kernel driver version and userspace driver
-  // library version.
-  static void LogDriverVersionInformation();
-
- private:
-
-  // Given the DSO version number and the driver version file contents, extracts
-  // the driver version and compares, warning the user in the case of
-  // incompatibility.
-  //
-  // This is solely used for more informative log messages when the user is
-  // running on a machine that happens to have a libcuda/kernel driver mismatch.
-  static void WarnOnDsoKernelMismatch(
-      port::StatusOr<DriverVersion> dso_version,
-      port::StatusOr<DriverVersion> kernel_version);
-
-  // Logs information about the dev nodes present on this machine: their
-  // existence, permissions, accessibility from this uid/gid.
-  static void LogDevNodeDiagnosticInformation();
-
-  static string GetDevNodePath(int dev_node_ordinal);
+port::StatusOr<DriverVersion> StringToDriverVersion(const string& value);
 
-  SE_DISALLOW_COPY_AND_ASSIGN(Diagnostician);
-};
+using Diagnostician = gpu::Diagnostician;
 
 }  // namespace cuda
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 1f2e2f48bbddf5f638135129e502cfe233d5952f..bae71b4e8d3b95629f93e77e925d1877ee6d5d6d 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/logger.h"
 #include "tensorflow/core/util/env_var.h"
 #include "tensorflow/stream_executor/cuda/cuda_activation.h"
 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
@@ -38,6 +39,8 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/initialize.h"
 #include "tensorflow/stream_executor/lib/mathutil.h"
 #include "tensorflow/stream_executor/lib/threadpool.h"
+#include "tensorflow/stream_executor/logging.pb.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/plugin_registry.h"
 #include "tensorflow/stream_executor/scratch_allocator.h"
@@ -48,8 +51,14 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 // clang-format on
 
+#pragma clang diagnostic push
+
+// Make sure that Eigen::half forward declaration in dnn.h matches the
+// declaration in Eigen.
+#pragma clang diagnostic warning "-Wmismatched-tags"
+
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 
 PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuDnnPlugin);
 
@@ -73,17 +82,6 @@ static_assert(CUDNN_VERSION >= 6000, "cuDNN needs to be version 6.0 or higher");
     }                                                                    \
   } while (false)
 
-// Returns whether status is 'ok', and potentially logs the error.
-bool IsStatusOk(const port::Status& status, bool report_error) {
-  if (status.ok()) {
-    return true;
-  }
-  if (report_error) {
-    LOG(ERROR) << status.error_message();
-  }
-  return false;
-}
-
 // Converts (via narrowing) a type T value to a type U, and checks that the
 // value has no value change due to the conversion.
 template <typename WideT, typename NarrowT>
@@ -139,7 +137,7 @@ class CudnnHandle {
  public:
   // Takes ownership of the executor context and the lock to access cuDNN
   // using handle.
-  CudnnHandle(cuda::ScopedActivateExecutorContext context, mutex_lock lock,
+  CudnnHandle(gpu::ScopedActivateExecutorContext context, mutex_lock lock,
               cudnnHandle_t handle)
       : context_(std::move(context)), lock_(std::move(lock)), handle_(handle) {}
 
@@ -148,13 +146,163 @@ class CudnnHandle {
   cudnnHandle_t handle() const { return handle_; }
 
  private:
-  cuda::ScopedActivateExecutorContext context_;
+  gpu::ScopedActivateExecutorContext context_;
   mutex_lock lock_;
   cudnnHandle_t handle_;  // Not owned.
 };
 
 }  // namespace
 
+#ifdef PLATFORM_GOOGLE
+// This macro wraps a global identifier, given by __name, in a callable
+// structure that loads the DLL symbol out of the DSO handle in a thread-safe
+// manner on first use. This dynamic loading technique is used to avoid DSO
+// dependencies on vendor libraries which may or may not be available in the
+// deployed binary environment.
+#define STREAM_EXECUTOR_CUDNN_WRAP(__name)   \
+  struct WrapperShim__##__name {             \
+    template <typename... Args>              \
+    cudnnStatus_t operator()(Args... args) { \
+      return ::__name(args...);              \
+    }                                        \
+  } __name;
+
+#else
+#define STREAM_EXECUTOR_CUDNN_WRAP(__name)                                \
+  struct DynLoadShim__##__name {                                          \
+    static const char* kName;                                             \
+    using FuncPtrT = std::add_pointer<decltype(::__name)>::type;          \
+    static void* GetDsoHandle() {                                         \
+      auto s = internal::CachedDsoLoader::GetCudnnDsoHandle();            \
+      return s.ValueOrDie();                                              \
+    }                                                                     \
+    static FuncPtrT LoadOrDie() {                                         \
+      void* f;                                                            \
+      auto s = port::Env::Default()->GetSymbolFromLibrary(GetDsoHandle(), \
+                                                          kName, &f);     \
+      CHECK(s.ok()) << "could not find " << kName                         \
+                    << " in cudnn DSO; dlerror: " << s.error_message();   \
+      return reinterpret_cast<FuncPtrT>(f);                               \
+    }                                                                     \
+    static FuncPtrT DynLoad() {                                           \
+      static FuncPtrT f = LoadOrDie();                                    \
+      return f;                                                           \
+    }                                                                     \
+    template <typename... Args>                                           \
+    cudnnStatus_t operator()(Args... args) {                              \
+      return DynLoad()(args...);                                          \
+    }                                                                     \
+  } __name;                                                               \
+  const char* DynLoadShim__##__name::kName = #__name;
+#endif
+
+// clang-format off
+#define CUDNN_ROUTINE_EACH_V7000_UNDER(__macro)               \
+  __macro(cudnnActivationForward)                             \
+  __macro(cudnnAddTensor)                                     \
+  __macro(cudnnBatchNormalizationBackward)                    \
+  __macro(cudnnBatchNormalizationForwardInference)            \
+  __macro(cudnnBatchNormalizationForwardTraining)             \
+  __macro(cudnnConvolutionBackwardBias)                       \
+  __macro(cudnnConvolutionBackwardData)                       \
+  __macro(cudnnConvolutionBackwardFilter)                     \
+  __macro(cudnnConvolutionBiasActivationForward)              \
+  __macro(cudnnConvolutionForward)                            \
+  __macro(cudnnCreate)                                        \
+  __macro(cudnnCreateActivationDescriptor)                    \
+  __macro(cudnnCreateConvolutionDescriptor)                   \
+  __macro(cudnnCreateDropoutDescriptor)                       \
+  __macro(cudnnCreateFilterDescriptor)                        \
+  __macro(cudnnCreateLRNDescriptor)                           \
+  __macro(cudnnCreatePersistentRNNPlan)                       \
+  __macro(cudnnCreatePoolingDescriptor)                       \
+  __macro(cudnnCreateRNNDescriptor)                           \
+  __macro(cudnnCreateTensorDescriptor)                        \
+  __macro(cudnnDestroy)                                       \
+  __macro(cudnnDestroyActivationDescriptor)                   \
+  __macro(cudnnDestroyConvolutionDescriptor)                  \
+  __macro(cudnnDestroyDropoutDescriptor)                      \
+  __macro(cudnnDestroyFilterDescriptor)                       \
+  __macro(cudnnDestroyLRNDescriptor)                          \
+  __macro(cudnnDestroyPersistentRNNPlan)                      \
+  __macro(cudnnDestroyPoolingDescriptor)                      \
+  __macro(cudnnDestroyRNNDescriptor)                          \
+  __macro(cudnnDestroyTensorDescriptor)                       \
+  __macro(cudnnDropoutGetStatesSize)                          \
+  __macro(cudnnGetActivationDescriptor)                       \
+  __macro(cudnnGetConvolutionBackwardDataAlgorithm)           \
+  __macro(cudnnGetConvolutionBackwardDataWorkspaceSize)       \
+  __macro(cudnnGetConvolutionBackwardFilterAlgorithm)         \
+  __macro(cudnnGetConvolutionBackwardFilterWorkspaceSize)     \
+  __macro(cudnnGetConvolutionForwardAlgorithm)                \
+  __macro(cudnnGetConvolutionForwardWorkspaceSize)            \
+  __macro(cudnnGetConvolutionNdDescriptor)                    \
+  __macro(cudnnGetConvolutionNdForwardOutputDim)              \
+  __macro(cudnnGetFilterNdDescriptor)                         \
+  __macro(cudnnGetProperty)                                   \
+  __macro(cudnnGetRNNLinLayerBiasParams)                      \
+  __macro(cudnnGetRNNLinLayerMatrixParams)                    \
+  __macro(cudnnGetRNNParamsSize)                              \
+  __macro(cudnnGetRNNTrainingReserveSize)                     \
+  __macro(cudnnGetRNNWorkspaceSize)                           \
+  __macro(cudnnLRNCrossChannelBackward)                       \
+  __macro(cudnnLRNCrossChannelForward)                        \
+  __macro(cudnnPoolingBackward)                               \
+  __macro(cudnnPoolingForward)                                \
+  __macro(cudnnRNNBackwardData)                               \
+  __macro(cudnnRNNBackwardWeights)                            \
+  __macro(cudnnRNNForwardInference)                           \
+  __macro(cudnnRNNForwardTraining)                            \
+  __macro(cudnnSetActivationDescriptor)                       \
+  __macro(cudnnSetConvolutionNdDescriptor)                    \
+  __macro(cudnnSetDropoutDescriptor)                          \
+  __macro(cudnnSetFilterNdDescriptor)                         \
+  __macro(cudnnSetLRNDescriptor)                              \
+  __macro(cudnnSetPersistentRNNPlan)                          \
+  __macro(cudnnSetPoolingNdDescriptor)                        \
+  __macro(cudnnSetRNNDescriptor)                              \
+  __macro(cudnnSetRNNDescriptor_v6)                           \
+  __macro(cudnnSetStream)                                     \
+  __macro(cudnnSetTensor4dDescriptor)                         \
+  __macro(cudnnSetTensorNdDescriptor)                         \
+  __macro(cudnnTransformTensor)
+
+// clang-format on
+
+CUDNN_ROUTINE_EACH_V7000_UNDER(STREAM_EXECUTOR_CUDNN_WRAP)
+#undef CUDNN_ROUTINE_EACH_V7000_UNDER
+
+#if CUDNN_VERSION >= 7000
+// clang-format off
+#define CUDNN_ROUTINE_EACH_V7000(__macro)                    \
+  __macro(cudnnSetRNNMatrixMathType)                         \
+  __macro(cudnnSetConvolutionMathType)                       \
+  __macro(cudnnSetConvolutionGroupCount)
+
+// clang-format on
+
+CUDNN_ROUTINE_EACH_V7000(STREAM_EXECUTOR_CUDNN_WRAP)
+#undef CUDNN_ROUTINE_EACH_V7000
+#endif
+
+#if CUDNN_VERSION >= 7201
+// clang-format off
+#define CUDNN_ROUTINE_EACH_V7210(__macro)                     \
+  __macro(cudnnCreateRNNDataDescriptor)                       \
+  __macro(cudnnDestroyRNNDataDescriptor)                      \
+  __macro(cudnnRNNBackwardDataEx)                             \
+  __macro(cudnnRNNBackwardWeightsEx)                          \
+  __macro(cudnnRNNForwardInferenceEx)                         \
+  __macro(cudnnRNNForwardTrainingEx)                          \
+  __macro(cudnnSetRNNDataDescriptor)                          \
+  __macro(cudnnSetRNNPaddingMode)
+
+// clang-format on
+
+CUDNN_ROUTINE_EACH_V7210(STREAM_EXECUTOR_CUDNN_WRAP)
+#undef CUDNN_ROUTINE_EACH_V7210
+#endif
+
 // Wraps a cuDNN handle and provides access to it through CudnnHandle
 // instances, which also locks a mutex, acquires the CUDA context, and sets
 // the stream that cuDNN should use to enqueue any work.
@@ -186,10 +334,10 @@ class CudnnAccess {
   // The legacy default stream synchronizes with all other streams and it is
   // therefore a bad idea (performance wise) to call any cuDNN APIs that
   // enqueue work in the stream.
-  CudnnHandle GetHandle(CUDAExecutor* executor, Stream* stream) {
+  CudnnHandle GetHandle(GpuExecutor* executor, Stream* stream) {
     mutex_lock lock(mutex_);
-    cuda::ScopedActivateExecutorContext context(executor);
-    CUstream cu_stream = stream ? AsCUDAStreamValue(stream) : cudaStreamLegacy;
+    gpu::ScopedActivateExecutorContext context(executor);
+    CUstream cu_stream = stream ? AsGpuStreamValue(stream) : cudaStreamLegacy;
     auto status = cudnnSetStream(handle_, cu_stream);
     CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << "Failed to set cuDNN stream.";
     return CudnnHandle(std::move(context), std::move(lock), handle_);
@@ -300,7 +448,7 @@ port::Status GetLoadedCudnnVersion(CudnnVersion* version) {
 
 }  // namespace
 
-CudnnSupport::CudnnSupport(CUDAExecutor* parent) : parent_(parent) {}
+CudnnSupport::CudnnSupport(GpuExecutor* parent) : parent_(parent) {}
 
 port::Status CudnnSupport::Init() {
   ScopedActivateExecutorContext context(parent_);
@@ -333,14 +481,14 @@ port::Status CudnnSupport::Init() {
   CHECK_EQ(cudnn_handle, nullptr);
   LOG(ERROR) << "Could not create cudnn handle: " << ToString(status);
   if (status == CUDNN_STATUS_NOT_INITIALIZED) {
-    auto result = cuda::Diagnostician::FindKernelDriverVersion();
+    auto result = gpu::Diagnostician::FindKernelDriverVersion();
     if (!result.ok()) {
       LOG(ERROR) << "Error retrieving driver version: "
-                 << DriverVersionStatusToString(result);
+                 << cuda::DriverVersionStatusToString(result);
     } else {
       const auto& version = result.ValueOrDie();
       LOG(ERROR) << "Possibly insufficient driver version: "
-                 << DriverVersionToString(version);
+                 << cuda::DriverVersionToString(version);
     }
   }
 
@@ -365,6 +513,13 @@ struct TensorDescriptorDeleter {
     CHECK_CUDNN_OK(cudnnDestroyTensorDescriptor(descriptor));
   }
 };
+#if CUDNN_VERSION >= 7201
+struct RNNDataDescriptorDeleter {
+  void operator()(cudnnRNNDataDescriptor_t descriptor) const {
+    CHECK_CUDNN_OK(cudnnDestroyRNNDataDescriptor(descriptor));
+  }
+};
+#endif
 struct FilterDescriptorDeleter {
   void operator()(cudnnFilterDescriptor_t descriptor) const {
     CHECK_CUDNN_OK(cudnnDestroyFilterDescriptor(descriptor));
@@ -410,6 +565,10 @@ struct PersistentRnnPlanDeleter {
 // RAII wrappers for cuDNN types.
 using TensorDescriptor =
     std::unique_ptr<cudnnTensorStruct, TensorDescriptorDeleter>;
+#if CUDNN_VERSION >= 7201
+using RNNDataDescriptor =
+    std::unique_ptr<cudnnRNNDataStruct, RNNDataDescriptorDeleter>;
+#endif
 using FilterDescriptor =
     std::unique_ptr<cudnnFilterStruct, FilterDescriptorDeleter>;
 using ConvolutionDescriptor =
@@ -431,6 +590,13 @@ TensorDescriptor CreateTensorDescriptor() {
   CHECK_CUDNN_OK(cudnnCreateTensorDescriptor(&result));
   return TensorDescriptor(result);
 }
+#if CUDNN_VERSION >= 7201
+RNNDataDescriptor CreateRNNDataDescriptor() {
+  cudnnRNNDataDescriptor_t result;
+  CHECK_CUDNN_OK(cudnnCreateRNNDataDescriptor(&result));
+  return RNNDataDescriptor(result);
+}
+#endif
 FilterDescriptor CreateFilterDescriptor() {
   cudnnFilterDescriptor_t result;
   CHECK_CUDNN_OK(cudnnCreateFilterDescriptor(&result));
@@ -617,6 +783,18 @@ bool BatchnormSpatialPersistentEnabled() {
   return is_enabled;
 }
 
+// A helper function to decide whether to enable deterministic functionality.
+bool RequireDeterminism() {
+  static bool is_enabled = [] {
+    bool is_enabled = false;
+    TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_CUDNN_DETERMINISTIC",
+                                               /*default_val=*/false,
+                                               &is_enabled));
+    return is_enabled;
+  }();
+  return is_enabled;
+}
+
 // Turns a ConvolutionDescriptor structure into a cudnn convolution handle
 // within a scope.
 class CudnnConvolutionDescriptor {
@@ -708,10 +886,13 @@ class CudnnPoolingDescriptor {
     std::transform(shape64.cbegin(), shape64.cend(), shape.begin(),
                    &CheckedNarrowing<int64, int>);
     bool propagate_nans = pooling_descriptor.propagate_nans();
+    auto cudnn_max_pooling_mode = RequireDeterminism()
+                                      ? CUDNN_POOLING_MAX_DETERMINISTIC
+                                      : CUDNN_POOLING_MAX;
     CHECK_CUDNN_OK(cudnnSetPoolingNdDescriptor(
         handle_.get(),
         (pooling_descriptor.mode() == dnn::PoolingMode::kMaximum
-             ? CUDNN_POOLING_MAX
+             ? cudnn_max_pooling_mode
              : CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING),
         propagate_nans ? CUDNN_PROPAGATE_NAN : CUDNN_NOT_PROPAGATE_NAN, nd,
         shape.data(), padding.data(), strides.data()));
@@ -818,9 +999,11 @@ cudnnDataType_t ToCudnnDataType(
     dnn::DataLayout data_layout = dnn::DataLayout::kBatchDepthYX) {
   switch (data_type) {
     case dnn::DataType::kFloat:
+      return CUDNN_DATA_FLOAT;
     case dnn::DataType::kDouble:
+      return CUDNN_DATA_DOUBLE;
     case dnn::DataType::kHalf:
-      return static_cast<cudnnDataType_t>(data_type);
+      return CUDNN_DATA_HALF;
     case dnn::DataType::kInt8:
       return data_layout == dnn::DataLayout::kBatchDepthYX4 ? CUDNN_DATA_INT8x4
                                                             : CUDNN_DATA_INT8;
@@ -831,6 +1014,15 @@ cudnnDataType_t ToCudnnDataType(
   }
 }
 
+cudnnDataType_t ToCudnnDataType(dnn::DataType data_type,
+                                dnn::FilterLayout filter_layout) {
+  if (data_type == dnn::DataType::kInt8 &&
+      filter_layout == dnn::FilterLayout::kOutputInputYX4) {
+    return CUDNN_DATA_INT8x4;
+  }
+  return ToCudnnDataType(data_type);
+}
+
 template <typename T>
 cudnnDataType_t GetCudnnDataType(
     dnn::DataLayout data_layout = dnn::DataLayout::kBatchDepthYX) {
@@ -961,7 +1153,7 @@ class CudnnRnnParamsDescriptor {
 }  // namespace
 
 class CudnnRnnDescriptor : public dnn::RnnDescriptor {
-  CudnnRnnDescriptor(const CudnnHandle& cudnn, cuda::RnnDescriptor rnn_desc,
+  CudnnRnnDescriptor(const CudnnHandle& cudnn, gpu::RnnDescriptor rnn_desc,
                      PersistentRnnPlan rnn_plan, int num_layers,
                      int hidden_size, int input_size, int batch_size,
                      cudnnRNNInputMode_t input_mode,
@@ -1001,7 +1193,7 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor {
         CudnnDropoutDescriptor dropout_desc,
         CudnnDropoutDescriptor::Create(cudnn, dropout, seed, state_allocator));
 
-    cuda::RnnDescriptor rnn_desc = CreateRnnDescriptor();
+    gpu::RnnDescriptor rnn_desc = CreateRnnDescriptor();
     cudnnRNNAlgo_t rnn_algo = ToCudnnRNNAlgo(algorithm_config.algorithm());
 
     // TODO: allow the user to choose an algorithm.
@@ -1012,6 +1204,14 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor {
         /*mode=*/rnn_mode, /*algo=*/rnn_algo,
         /*dataType=*/compute_type));
 
+    // TODO: For now, we only use cudnnRNN**Ex API to process padded inputs.
+    // But in the future if these APIs are used to process full length arrays,
+    // we need to distinguish when to set it.
+#if CUDNN_VERSION >= 7201
+    RETURN_IF_CUDNN_ERROR(
+        cudnnSetRNNPaddingMode(rnn_desc.get(), CUDNN_RNN_PADDED_IO_ENABLED));
+#endif
+
     port::StatusOr<PersistentRnnPlan> rnn_plan_wrapper;
     PersistentRnnPlan rnn_plan;
     if (rnn_algo == CUDNN_RNN_ALGO_PERSIST_DYNAMIC) {
@@ -1084,7 +1284,7 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor {
   }
 
  private:
-  cuda::RnnDescriptor rnn_desc_;
+  gpu::RnnDescriptor rnn_desc_;
   PersistentRnnPlan rnn_plan_;
   int num_layers_;
   int hidden_size_;
@@ -1162,14 +1362,21 @@ port::StatusOr<CudnnRnnParamsDescriptor> CudnnRnnParamsDescriptor::Create(
     for (int region = 0; region < region_count_per_layer; region++) {
       for (int type = 0; type < 2; type++) {
         void* offset = nullptr;
-        RETURN_IF_CUDNN_ERROR((type == 0 ? cudnnGetRNNLinLayerMatrixParams
-                                         : cudnnGetRNNLinLayerBiasParams)(
-            /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc,
-            /*layer=*/layer, /*xDesc=*/input_desc.get(),
-            /*wDesc=*/filter_desc.get(),
-            /*w=*/nullptr, /*linLayerID=*/region,
-            /*linLayerMatDesc=*/region_desc_handle.get(),
-            /*linLayerMat or linLayerBias=*/&offset));
+        RETURN_IF_CUDNN_ERROR(
+            type == 0 ? cudnnGetRNNLinLayerMatrixParams(
+                            /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc,
+                            /*layer=*/layer, /*xDesc=*/input_desc.get(),
+                            /*wDesc=*/filter_desc.get(),
+                            /*w=*/nullptr, /*linLayerID=*/region,
+                            /*linLayerMatDesc=*/region_desc_handle.get(),
+                            /*linLayerMat or linLayerBias=*/&offset)
+                      : cudnnGetRNNLinLayerBiasParams(
+                            /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc,
+                            /*layer=*/layer, /*xDesc=*/input_desc.get(),
+                            /*wDesc=*/filter_desc.get(),
+                            /*w=*/nullptr, /*linLayerID=*/region,
+                            /*linLayerMatDesc=*/region_desc_handle.get(),
+                            /*linLayerMat or linLayerBias=*/&offset));
         int dims[] = {1, 1, 1};
         cudnnDataType_t data_type;
         cudnnTensorFormat_t tensor_format;
@@ -1196,26 +1403,32 @@ port::StatusOr<CudnnRnnParamsDescriptor> CudnnRnnParamsDescriptor::Create(
 
 class CudnnRnnSequenceTensorDescriptor
     : public dnn::RnnSequenceTensorDescriptor {
-  CudnnRnnSequenceTensorDescriptor(CUDAExecutor* parent, int seq_length,
+  CudnnRnnSequenceTensorDescriptor(GpuExecutor* parent, int max_seq_length,
                                    int batch_size, int data_size,
                                    cudnnDataType_t data_type,
+#if CUDNN_VERSION >= 7201
+                                   RNNDataDescriptor data_handle,
+#endif
                                    TensorDescriptor handle)
-      : parent_(parent),
-        seq_length_(seq_length),
+      : max_seq_length_(max_seq_length),
         batch_size_(batch_size),
         data_size_(data_size),
         data_type_(data_type),
         handle_(std::move(handle)),
-        handles_(seq_length, handle_.get()) {}
+#if CUDNN_VERSION >= 7201
+        rnn_data_handle_(std::move(data_handle)),
+#endif
+        handles_(max_seq_length, handle_.get()) {
+  }
 
  public:
   CudnnRnnSequenceTensorDescriptor(CudnnRnnSequenceTensorDescriptor&&) =
       default;
 
   static port::StatusOr<CudnnRnnSequenceTensorDescriptor> Create(
-      CUDAExecutor* parent, int seq_length, int batch_size, int data_size,
+      GpuExecutor* parent, int max_seq_length, int batch_size, int data_size,
       cudnnDataType_t data_type) {
-    CHECK_GT(seq_length, 0);
+    CHECK_GT(max_seq_length, 0);
     int dims[] = {batch_size, data_size, 1};
     int strides[] = {dims[1] * dims[2], dims[2], 1};
     TensorDescriptor tensor_desc = CreateTensorDescriptor();
@@ -1223,37 +1436,85 @@ class CudnnRnnSequenceTensorDescriptor
         /*tensorDesc=*/tensor_desc.get(), /*dataType=*/data_type,
         /*nbDims=*/sizeof(dims) / sizeof(dims[0]), /*dimA=*/dims,
         /*strideA=*/strides));
-    return CudnnRnnSequenceTensorDescriptor(parent, seq_length, batch_size,
+    return CudnnRnnSequenceTensorDescriptor(parent, max_seq_length, batch_size,
                                             data_size, data_type,
+#if CUDNN_VERSION >= 7201
+                                            nullptr,
+#endif
                                             std::move(tensor_desc));
   }
 
+  static port::StatusOr<CudnnRnnSequenceTensorDescriptor> Create(
+      GpuExecutor* parent, int max_seq_length, int batch_size, int data_size,
+      const absl::Span<const int>& seq_lengths, cudnnDataType_t data_type) {
+#if CUDNN_VERSION >= 7201
+    CHECK_GT(max_seq_length, 0);
+    int dims[] = {batch_size, data_size, 1};
+    int strides[] = {dims[1] * dims[2], dims[2], 1};
+    TensorDescriptor tensor_desc = CreateTensorDescriptor();
+    RETURN_IF_CUDNN_ERROR(cudnnSetTensorNdDescriptor(
+        /*tensorDesc=*/tensor_desc.get(), /*dataType=*/data_type,
+        /*nbDims=*/sizeof(dims) / sizeof(dims[0]), /*dimA=*/dims,
+        /*strideA=*/strides));
+    const int* seq_lengths_array = seq_lengths.data();
+    RNNDataDescriptor data_desc = CreateRNNDataDescriptor();
+    float padding_fill = 0.0f;
+    RETURN_IF_CUDNN_ERROR(cudnnSetRNNDataDescriptor(
+        /*RNNDataDesc=*/data_desc.get(), /*dataType*/ data_type,
+        /*layout=*/CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED,
+        /*maxSeqLength=*/max_seq_length,
+        /*batchSize=*/batch_size, /*vectorSize=*/data_size,
+        /*seqLengthArray=*/seq_lengths_array,
+        /*paddingFill*/ (void*)&padding_fill));
+    return CudnnRnnSequenceTensorDescriptor(
+        parent, max_seq_length, batch_size, data_size, data_type,
+        std::move(data_desc), std::move(tensor_desc));
+#else
+    return port::Status(port::error::INVALID_ARGUMENT,
+                        "No supported cudnnSetRNNDataDescriptor when "
+                        "CUDNN_VERSION < 7.2.1");
+#endif
+  }
+
   const cudnnTensorDescriptor_t* handles() const {
     return handles_.data();
   }
+#if CUDNN_VERSION >= 7201
+  const cudnnRNNDataDescriptor_t data_handle() const {
+    return rnn_data_handle_.get();
+  }
+#endif
 
-  int seq_length() const { return seq_length_; }
+  int max_seq_length() const { return max_seq_length_; }
   int batch_size() const { return batch_size_; }
   int data_size() const { return data_size_; }
+  bool is_var_seq_lengths() const {
+#if CUDNN_VERSION >= 7201
+    return rnn_data_handle_ != nullptr;
+#else
+    return false;
+#endif
+  }
 
  private:
-  CUDAExecutor* parent_;
-  int seq_length_;
+  int max_seq_length_;
   int batch_size_;
   int data_size_;
   cudnnDataType_t data_type_;
   TensorDescriptor handle_;
+#if CUDNN_VERSION >= 7201
+  RNNDataDescriptor rnn_data_handle_;
+#endif
   std::vector<cudnnTensorDescriptor_t> handles_;  // Copies of handle_.
   SE_DISALLOW_COPY_AND_ASSIGN(CudnnRnnSequenceTensorDescriptor);
 };
 
 class CudnnRnnStateTensorDescriptor : public dnn::RnnStateTensorDescriptor {
  public:
-  CudnnRnnStateTensorDescriptor(CUDAExecutor* parent, int num_layers,
+  CudnnRnnStateTensorDescriptor(GpuExecutor* parent, int num_layers,
                                 int batch_size, int data_size,
                                 cudnnDataType_t data_type)
-      : parent_(parent),
-        handle_(CreateTensorDescriptor()),
+      : handle_(CreateTensorDescriptor()),
         num_layers_(num_layers),
         batch_size_(batch_size),
         data_size_(data_size),
@@ -1273,7 +1534,6 @@ class CudnnRnnStateTensorDescriptor : public dnn::RnnStateTensorDescriptor {
   int data_size() const { return data_size_; }
 
  private:
-  CUDAExecutor* parent_;
   TensorDescriptor handle_;
   int num_layers_;
   int batch_size_;
@@ -1287,7 +1547,7 @@ namespace {
 struct RnnModelDims {
   int num_layers = 0;
   int batch_size = 0;
-  int seq_length = 0;
+  int max_seq_length = 0;
   int hidden_size = 0;
   int input_size = 0;
   int dir_count = 0;
@@ -1312,7 +1572,7 @@ port::StatusOr<RnnModelDims> ExtractAndCheckRnnForward(
   RnnModelDims model_dims;
   model_dims.num_layers = rnn_desc.num_layers();
   model_dims.batch_size = input_desc.batch_size();
-  model_dims.seq_length = input_desc.seq_length();
+  model_dims.max_seq_length = input_desc.max_seq_length();
   model_dims.hidden_size = rnn_desc.hidden_size();
   model_dims.input_size = input_desc.data_size();
   model_dims.dir_count =
@@ -1330,7 +1590,7 @@ port::StatusOr<RnnModelDims> ExtractAndCheckRnnForward(
         input_h_desc.data_size() == input_c_desc.data_size())) {
     return port::Status(port::error::INVALID_ARGUMENT, "Invalid input_c shape");
   }
-  if (!(output_desc.seq_length() == model_dims.seq_length &&
+  if (!(output_desc.max_seq_length() == model_dims.max_seq_length &&
         output_desc.batch_size() == model_dims.batch_size &&
         output_desc.data_size() ==
             model_dims.hidden_size * model_dims.dir_count)) {
@@ -1377,7 +1637,7 @@ port::StatusOr<DeviceMemory<uint8>> CreateRnnWorkspace(
   size_t workspace_size_in_bytes = 0;
   RETURN_IF_CUDNN_ERROR(cudnnGetRNNWorkspaceSize(
       /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(),
-      /*seqLength=*/input_desc.seq_length(), /*xDesc=*/input_desc.handles(),
+      /*seqLength=*/input_desc.max_seq_length(), /*xDesc=*/input_desc.handles(),
       /*sizeInBytes=*/&workspace_size_in_bytes));
   // Allocate the workspace.
   if (workspace_size_in_bytes == 0) {
@@ -1427,7 +1687,7 @@ port::Status CudnnSupport::DoRnnForwardImpl(
     size_t reserve_space_size_in_bytes = 0;
     RETURN_IF_CUDNN_ERROR(cudnnGetRNNTrainingReserveSize(
         /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(),
-        /*seqLength=*/model_dims.seq_length, /*xDesc=*/input_desc.handles(),
+        /*seqLength=*/model_dims.max_seq_length, /*xDesc=*/input_desc.handles(),
         /*sizeInBytes=*/&reserve_space_size_in_bytes));
 
     if (reserve_space_size_in_bytes > 0) {
@@ -1437,48 +1697,99 @@ port::Status CudnnSupport::DoRnnForwardImpl(
     }
   }
 
-  std::unique_ptr<CUDATimer, TimerDeleter> timer;
+  std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
   const bool is_profiling = output_profile_result != nullptr;
   if (is_profiling) {
-    timer.reset(new CUDATimer(parent_));
+    timer.reset(new GpuTimer(parent_));
     // The start and stop of the timer should be as close to the Cudnn call as
     // possible. It is still possible for other threads to issue workload on
     // to this stream. So it could take multiple profiling measurements.
-    if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
+    if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
       return port::Status(port::error::INTERNAL, "Failed to start timer");
     }
   }
 
   if (!is_training) {
-    RETURN_IF_CUDNN_ERROR(cudnnRNNForwardInference(
-        /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(),
-        /*seqLength=*/model_dims.seq_length, /*xDesc=*/input_desc.handles(),
-        /*x=*/input_data.opaque(), /*hxDesc=*/input_h_desc.handle(),
-        /*hx=*/input_h_data.opaque(), /*cxDesc=*/input_c_desc.handle(),
-        /*cx=*/input_c_data.opaque(), /*wDesc=*/rnn_desc.params_handle(),
-        /*w=*/params.opaque(), /*yDesc=*/output_desc.handles(),
-        /*y=*/output_data->opaque(), /*hyDesc=*/output_h_desc.handle(),
-        /*hy=*/output_h_data->opaque(), /*cyDesc=*/output_c_desc.handle(),
-        /*cy=*/output_c_data->opaque(), /*workspace=*/workspace.opaque(),
-        /*workSpaceSizeInBytes=*/workspace.size()));
+    if (input_desc.is_var_seq_lengths()) {
+#if CUDNN_VERSION >= 7201
+      RETURN_IF_CUDNN_ERROR(cudnnRNNForwardInferenceEx(
+          /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(),
+          /*xDesc=*/input_desc.data_handle(), /*x=*/input_data.opaque(),
+          /*hxDesc=*/input_h_desc.handle(), /*hx=*/input_h_data.opaque(),
+          /*cxDesc=*/input_c_desc.handle(), /*cx=*/input_c_data.opaque(),
+          /*wDesc=*/rnn_desc.params_handle(), /*w=*/params.opaque(),
+          /*yDesc=*/output_desc.data_handle(),
+          /*y=*/output_data->opaque(),
+          /*hyDesc=*/output_h_desc.handle(), /*hy=*/output_h_data->opaque(),
+          /*cyDesc=*/output_c_desc.handle(), /*cy=*/output_c_data->opaque(),
+          nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
+          nullptr,
+          /*workspace=*/workspace.opaque(),
+          /*workSpaceSizeInBytes=*/workspace.size()));
+#else
+      return port::Status(port::error::INVALID_ARGUMENT,
+                          "No supported cudnnRNNForwardInferenceEx when "
+                          "CUDNN_VERSION < 7.2.1");
+#endif
+    } else {
+      RETURN_IF_CUDNN_ERROR(cudnnRNNForwardInference(
+          /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(),
+          /*seqLength=*/model_dims.max_seq_length,
+          /*xDesc=*/input_desc.handles(),
+          /*x=*/input_data.opaque(), /*hxDesc=*/input_h_desc.handle(),
+          /*hx=*/input_h_data.opaque(), /*cxDesc=*/input_c_desc.handle(),
+          /*cx=*/input_c_data.opaque(), /*wDesc=*/rnn_desc.params_handle(),
+          /*w=*/params.opaque(), /*yDesc=*/output_desc.handles(),
+          /*y=*/output_data->opaque(), /*hyDesc=*/output_h_desc.handle(),
+          /*hy=*/output_h_data->opaque(), /*cyDesc=*/output_c_desc.handle(),
+          /*cy=*/output_c_data->opaque(), /*workspace=*/workspace.opaque(),
+          /*workSpaceSizeInBytes=*/workspace.size()));
+    }
   } else {
-    RETURN_IF_CUDNN_ERROR(cudnnRNNForwardTraining(
-        /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(),
-        /*seqLength=*/model_dims.seq_length, /*xDesc=*/input_desc.handles(),
-        /*x=*/input_data.opaque(), /*hxDesc=*/input_h_desc.handle(),
-        /*hx=*/input_h_data.opaque(), /*cxDesc=*/input_c_desc.handle(),
-        /*cx=*/input_c_data.opaque(), /*wDesc=*/rnn_desc.params_handle(),
-        /*w=*/params.opaque(), /*yDesc=*/output_desc.handles(),
-        /*y=*/output_data->opaque(), /*hyDesc=*/output_h_desc.handle(),
-        /*hy=*/output_h_data->opaque(), /*cyDesc=*/output_c_desc.handle(),
-        /*cy=*/output_c_data->opaque(), /*workspace=*/workspace.opaque(),
-        /*workSpaceSizeInBytes=*/workspace.size(),
-        /*reserveSpace=*/reserve_space.opaque(),
-        /*reserveSpaceSizeInBytes=*/reserve_space.size()));
+    if (input_desc.is_var_seq_lengths()) {
+#if CUDNN_VERSION >= 7201
+      // cudnnSetRNNPaddingMode(rnn_desc.handle(), CUDNN_RNN_PADDED_IO_ENABLED);
+      RETURN_IF_CUDNN_ERROR(cudnnRNNForwardTrainingEx(
+          /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(),
+          /*xDesc=*/input_desc.data_handle(), /*x=*/input_data.opaque(),
+          /*hxDesc=*/input_h_desc.handle(), /*hx=*/input_h_data.opaque(),
+          /*cxDesc=*/input_c_desc.handle(), /*cx=*/input_c_data.opaque(),
+          /*wDesc=*/rnn_desc.params_handle(), /*w=*/params.opaque(),
+          /*yDesc=*/output_desc.data_handle(),
+          /*y=*/output_data->opaque(),
+          /*hyDesc=*/output_h_desc.handle(), /*hy=*/output_h_data->opaque(),
+          /*cyDesc=*/output_c_desc.handle(), /*cy=*/output_c_data->opaque(),
+          nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
+          nullptr,
+          /*workspace=*/workspace.opaque(),
+          /*workSpaceSizeInBytes=*/workspace.size(),
+          /*reserveSpace=*/reserve_space.opaque(),
+          /*reserveSpaceSizeInBytes=*/reserve_space.size()));
+#else
+      return port::Status(port::error::INVALID_ARGUMENT,
+                          "No supported cudnnRNNForwardTrainingEx when "
+                          "CUDNN_VERSION < 7.2.1");
+#endif
+    } else {
+      RETURN_IF_CUDNN_ERROR(cudnnRNNForwardTraining(
+          /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(),
+          /*seqLength=*/model_dims.max_seq_length,
+          /*xDesc=*/input_desc.handles(),
+          /*x=*/input_data.opaque(), /*hxDesc=*/input_h_desc.handle(),
+          /*hx=*/input_h_data.opaque(), /*cxDesc=*/input_c_desc.handle(),
+          /*cx=*/input_c_data.opaque(), /*wDesc=*/rnn_desc.params_handle(),
+          /*w=*/params.opaque(), /*yDesc=*/output_desc.handles(),
+          /*y=*/output_data->opaque(), /*hyDesc=*/output_h_desc.handle(),
+          /*hy=*/output_h_data->opaque(), /*cyDesc=*/output_c_desc.handle(),
+          /*cy=*/output_c_data->opaque(), /*workspace=*/workspace.opaque(),
+          /*workSpaceSizeInBytes=*/workspace.size(),
+          /*reserveSpace=*/reserve_space.opaque(),
+          /*reserveSpaceSizeInBytes=*/reserve_space.size()));
+    }
   }
 
   if (is_profiling) {
-    if (!timer->Stop(AsCUDAStream(stream))) {
+    if (!timer->Stop(AsGpuStream(stream))) {
       return port::Status(port::error::INTERNAL, "Failed to stop timer");
     }
     auto algo_desc = *rnn_desc.algorithm_config().algorithm();
@@ -1529,59 +1840,113 @@ port::Status CudnnSupport::DoRnnBackwardImpl(
                       CreateRnnWorkspace(stream, cudnn, rnn_desc, input_desc,
                                          workspace_allocator));
 
-  std::unique_ptr<CUDATimer, TimerDeleter> timer;
+  std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
   const bool is_profiling = output_profile_result != nullptr;
   if (is_profiling) {
-    timer.reset(new CUDATimer(parent_));
+    timer.reset(new GpuTimer(parent_));
     // The start and stop of the timer should be as close to the Cudnn call as
     // possible. It is still possible for other threads to issue workload on
     // to this stream. So it could take multiple profiling measurements.
-    if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
+    if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
       return port::Status(port::error::INTERNAL, "Failed to start timer");
     }
   }
 
-  RETURN_IF_CUDNN_ERROR(cudnnRNNBackwardData(
-      /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(),
-      /*seqLength=*/model_dims.seq_length, /*yDesc=*/output_desc.handles(),
-      /*y=*/output_data.opaque(), /*dyDesc=*/output_desc.handles(),
-      /*dy=*/output_backprop_data.opaque(),
-      /*dhyDesc=*/output_h_desc.handle(),
-      /*dhy=*/output_h_backprop_data.opaque(),
-      /*dcyDesc=*/output_c_desc.handle(),
-      /*dcy=*/output_c_backprop_data.opaque(),
-      /*wDesc=*/rnn_desc.params_handle(), /*w=*/params.opaque(),
-      /*hxDesc=*/input_h_desc.handle(), /*hx=*/input_h_data.opaque(),
-      /*cxDesc=*/input_c_desc.handle(), /*cx=*/input_c_data.opaque(),
-      /*dxDesc=*/input_desc.handles(), /*dx=*/input_backprop_data->opaque(),
-      /*dhxDesc=*/input_h_desc.handle(),
-      /*dhx=*/input_h_backprop_data->opaque(),
-      /*dcxDesc=*/input_c_desc.handle(),
-      /*dcx=*/input_c_backprop_data->opaque(),
-      /*workspace=*/workspace.opaque(),
-      /*workSpaceSizeInBytes=*/workspace.size(),
-      /*reserveSpace=*/reserve_space_data->opaque(),
-      /*reserveSpaceSizeInBytes=*/reserve_space_data->size()));
-
-  if (params_backprop_data != nullptr) {
-    // Clear the dw to zeros.
-    stream->ThenMemZero(params_backprop_data, params_backprop_data->size());
-    // make the backward weight call
-    RETURN_IF_CUDNN_ERROR(cudnnRNNBackwardWeights(
+  if (input_desc.is_var_seq_lengths()) {
+#if CUDNN_VERSION >= 7201
+    RETURN_IF_CUDNN_ERROR(cudnnRNNBackwardDataEx(
+        /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(),
+        /*yDesc=*/output_desc.data_handle(), /*y=*/output_data.opaque(),
+        /*dyDesc=*/output_desc.data_handle(),
+        /*dy=*/output_backprop_data.opaque(), nullptr, nullptr,
+        /*dhyDesc=*/output_h_desc.handle(),
+        /*dhy=*/output_h_backprop_data.opaque(),
+        /*dcyDesc=*/output_c_desc.handle(),
+        /*dcy=*/output_c_backprop_data.opaque(),
+        /*wDesc=*/rnn_desc.params_handle(), /*w=*/params.opaque(),
+        /*hxDesc=*/input_h_desc.handle(), /*hx=*/input_h_data.opaque(),
+        /*cxDesc=*/input_c_desc.handle(), /*cx=*/input_c_data.opaque(),
+        /*dxDesc=*/input_desc.data_handle(),
+        /*dx=*/input_backprop_data->opaque(),
+        /*dhxDesc=*/input_h_desc.handle(),
+        /*dhx=*/input_h_backprop_data->opaque(),
+        /*dcxDesc=*/input_c_desc.handle(),
+        /*dcx=*/input_c_backprop_data->opaque(), nullptr, nullptr,
+        /*workspace=*/workspace.opaque(),
+        /*workSpaceSizeInBytes=*/workspace.size(),
+        /*reserveSpace=*/reserve_space_data->opaque(),
+        /*reserveSpaceSizeInBytes=*/reserve_space_data->size()));
+#else
+    return port::Status(port::error::INVALID_ARGUMENT,
+                        "No supported cudnnRNNBackwardDataEx when "
+                        "CUDNN_VERSION < 7.2.1");
+#endif
+  } else {
+    RETURN_IF_CUDNN_ERROR(cudnnRNNBackwardData(
         /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(),
-        /*seqLength=*/model_dims.seq_length, /*xDesc=*/input_desc.handles(),
-        /*x=*/input_data.opaque(), /*hxDesc=*/input_h_desc.handle(),
-        /*hx=*/input_h_data.opaque(), /*yDesc=*/output_desc.handles(),
-        /*y=*/output_data.opaque(), /*workspace=*/workspace.opaque(),
+        /*seqLength=*/model_dims.max_seq_length,
+        /*yDesc=*/output_desc.handles(),
+        /*y=*/output_data.opaque(), /*dyDesc=*/output_desc.handles(),
+        /*dy=*/output_backprop_data.opaque(),
+        /*dhyDesc=*/output_h_desc.handle(),
+        /*dhy=*/output_h_backprop_data.opaque(),
+        /*dcyDesc=*/output_c_desc.handle(),
+        /*dcy=*/output_c_backprop_data.opaque(),
+        /*wDesc=*/rnn_desc.params_handle(), /*w=*/params.opaque(),
+        /*hxDesc=*/input_h_desc.handle(), /*hx=*/input_h_data.opaque(),
+        /*cxDesc=*/input_c_desc.handle(), /*cx=*/input_c_data.opaque(),
+        /*dxDesc=*/input_desc.handles(), /*dx=*/input_backprop_data->opaque(),
+        /*dhxDesc=*/input_h_desc.handle(),
+        /*dhx=*/input_h_backprop_data->opaque(),
+        /*dcxDesc=*/input_c_desc.handle(),
+        /*dcx=*/input_c_backprop_data->opaque(),
+        /*workspace=*/workspace.opaque(),
         /*workSpaceSizeInBytes=*/workspace.size(),
-        /*dwDesc=*/rnn_desc.params_handle(),
-        /*dw=*/params_backprop_data->opaque(),
         /*reserveSpace=*/reserve_space_data->opaque(),
         /*reserveSpaceSizeInBytes=*/reserve_space_data->size()));
   }
 
+  if (params_backprop_data != nullptr) {
+    // Clear the dw to zeros.
+    stream->ThenMemZero(params_backprop_data, params_backprop_data->size());
+    if (input_desc.is_var_seq_lengths()) {
+#if CUDNN_VERSION >= 7201
+      RETURN_IF_CUDNN_ERROR(cudnnRNNBackwardWeightsEx(
+          /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(),
+          /*xDesc=*/input_desc.data_handle(), /*x=*/input_data.opaque(),
+          /*hxDesc=*/input_h_desc.handle(), /*hx=*/input_h_data.opaque(),
+          /*yDesc=*/output_desc.data_handle(),
+          /*y=*/output_data.opaque(),
+          /*workspace=*/workspace.opaque(),
+          /*workSpaceSizeInBytes=*/workspace.size(),
+          /*dwDesc=*/rnn_desc.params_handle(),
+          /*dw=*/params_backprop_data->opaque(),
+          /*reserveSpace=*/reserve_space_data->opaque(),
+          /*reserveSpaceSizeInBytes=*/reserve_space_data->size()));
+#else
+      return port::Status(port::error::INVALID_ARGUMENT,
+                          "No supported cudnnRNNBackwardWeightsEx when "
+                          "CUDNN_VERSION < 7.2.1");
+#endif
+    } else {
+      // make the backward weight call
+      RETURN_IF_CUDNN_ERROR(cudnnRNNBackwardWeights(
+          /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(),
+          /*seqLength=*/model_dims.max_seq_length,
+          /*xDesc=*/input_desc.handles(),
+          /*x=*/input_data.opaque(), /*hxDesc=*/input_h_desc.handle(),
+          /*hx=*/input_h_data.opaque(), /*yDesc=*/output_desc.handles(),
+          /*y=*/output_data.opaque(), /*workspace=*/workspace.opaque(),
+          /*workSpaceSizeInBytes=*/workspace.size(),
+          /*dwDesc=*/rnn_desc.params_handle(),
+          /*dw=*/params_backprop_data->opaque(),
+          /*reserveSpace=*/reserve_space_data->opaque(),
+          /*reserveSpaceSizeInBytes=*/reserve_space_data->size()));
+    }
+  }
+
   if (is_profiling) {
-    if (!timer->Stop(AsCUDAStream(stream))) {
+    if (!timer->Stop(AsGpuStream(stream))) {
       return port::Status(port::error::INTERNAL, "Failed to stop timer");
     }
     auto algo_desc = *rnn_desc.algorithm_config().algorithm();
@@ -1616,17 +1981,29 @@ CudnnSupport::createRnnDescriptor(
 }
 
 port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>>
-CudnnSupport::createRnnSequenceTensorDescriptor(int seq_length, int batch_size,
-                                                int data_size,
+CudnnSupport::createRnnSequenceTensorDescriptor(int max_seq_length,
+                                                int batch_size, int data_size,
                                                 dnn::DataType data_type) {
   SE_ASSIGN_OR_RETURN(CudnnRnnSequenceTensorDescriptor descriptor,
                       CudnnRnnSequenceTensorDescriptor::Create(
-                          parent_, seq_length, batch_size, data_size,
+                          parent_, max_seq_length, batch_size, data_size,
                           ToCudnnDataType(data_type)));
   return std::unique_ptr<dnn::RnnSequenceTensorDescriptor>(
       new CudnnRnnSequenceTensorDescriptor(std::move(descriptor)));
 }
 
+port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>>
+CudnnSupport::createRnnSequenceTensorDescriptor(
+    int max_seq_length, int batch_size, int data_size,
+    const absl::Span<const int>& seq_lengths, dnn::DataType data_type) {
+  SE_ASSIGN_OR_RETURN(CudnnRnnSequenceTensorDescriptor descriptor,
+                      CudnnRnnSequenceTensorDescriptor::Create(
+                          parent_, max_seq_length, batch_size, data_size,
+                          seq_lengths, ToCudnnDataType(data_type)));
+  return std::unique_ptr<dnn::RnnSequenceTensorDescriptor>(
+      new CudnnRnnSequenceTensorDescriptor(std::move(descriptor)));
+}
+
 port::StatusOr<std::unique_ptr<dnn::RnnStateTensorDescriptor>>
 CudnnSupport::createRnnStateTensorDescriptor(int num_layer, int batch_size,
                                              int data_size,
@@ -1668,7 +2045,6 @@ bool CudnnSupport::DoRnnForward(
       static_cast<const CudnnRnnStateTensorDescriptor&>(output_h_desc);
   const CudnnRnnStateTensorDescriptor& cudnn_output_c_desc =
       static_cast<const CudnnRnnStateTensorDescriptor&>(output_c_desc);
-
   return IsStatusOk(
       DoRnnForwardImpl<Eigen::half>(
           stream, cudnn_rnn_desc, cudnn_input_desc, input_data,
@@ -1710,7 +2086,6 @@ bool CudnnSupport::DoRnnForward(
       static_cast<const CudnnRnnStateTensorDescriptor&>(output_h_desc);
   const CudnnRnnStateTensorDescriptor& cudnn_output_c_desc =
       static_cast<const CudnnRnnStateTensorDescriptor&>(output_c_desc);
-
   return IsStatusOk(
       DoRnnForwardImpl<float>(
           stream, cudnn_rnn_desc, cudnn_input_desc, input_data,
@@ -1753,7 +2128,6 @@ bool CudnnSupport::DoRnnForward(
       static_cast<const CudnnRnnStateTensorDescriptor&>(output_h_desc);
   const CudnnRnnStateTensorDescriptor& cudnn_output_c_desc =
       static_cast<const CudnnRnnStateTensorDescriptor&>(output_c_desc);
-
   return IsStatusOk(
       DoRnnForwardImpl<double>(
           stream, cudnn_rnn_desc, cudnn_input_desc, input_data,
@@ -1803,7 +2177,6 @@ bool CudnnSupport::DoRnnBackward(
       static_cast<const CudnnRnnStateTensorDescriptor&>(output_h_desc);
   const CudnnRnnStateTensorDescriptor& cudnn_output_c_desc =
       static_cast<const CudnnRnnStateTensorDescriptor&>(output_c_desc);
-
   return IsStatusOk(
       DoRnnBackwardImpl<Eigen::half>(
           stream, cudnn_rnn_desc, cudnn_input_desc, input_data,
@@ -1855,7 +2228,6 @@ bool CudnnSupport::DoRnnBackward(
       static_cast<const CudnnRnnStateTensorDescriptor&>(output_h_desc);
   const CudnnRnnStateTensorDescriptor& cudnn_output_c_desc =
       static_cast<const CudnnRnnStateTensorDescriptor&>(output_c_desc);
-
   return IsStatusOk(
       DoRnnBackwardImpl<float>(
           stream, cudnn_rnn_desc, cudnn_input_desc, input_data,
@@ -1908,7 +2280,6 @@ bool CudnnSupport::DoRnnBackward(
       static_cast<const CudnnRnnStateTensorDescriptor&>(output_h_desc);
   const CudnnRnnStateTensorDescriptor& cudnn_output_c_desc =
       static_cast<const CudnnRnnStateTensorDescriptor&>(output_c_desc);
-
   return IsStatusOk(
       DoRnnBackwardImpl<double>(
           stream, cudnn_rnn_desc, cudnn_input_desc, input_data,
@@ -2350,6 +2721,23 @@ cudnnDataType_t GetRnnComputeType(dnn::DataType data_type) {
   }
 }
 
+dnn::DataType GetConvAccumulatorType(dnn::DataType data_type) {
+  switch (data_type) {
+    case dnn::DataType::kFloat:
+    case dnn::DataType::kDouble:
+      return data_type;
+    case dnn::DataType::kHalf:
+      return CudnnEnvVar<ConvDoFP32ComputationFP16Input>::IsEnabled()
+                 ? dnn::DataType::kFloat
+                 : dnn::DataType::kHalf;
+    case dnn::DataType::kInt8:
+    case dnn::DataType::kInt32:
+      return dnn::DataType::kInt32;
+    default:
+      LOG(FATAL) << "Invalid DNN data type: " << static_cast<int>(data_type);
+  }
+}
+
 // Determines whether we can safely perform a winograd non-fused convolution for
 // the given input and output shapes.  This works around b/68264959, an integer
 // overflow in cuDNNv5 and cuDNNv6.
@@ -2377,23 +2765,137 @@ bool ShouldIncludeWinogradNonfusedAlgo(
 }
 #endif
 
+dnn::ConvolutionProto GenerateConvProto(
+    dnn::ConvolutionKind kind, dnn::DataType element_type,
+    const dnn::BatchDescriptor& input_descriptor,
+    const dnn::FilterDescriptor& filter_descriptor,
+    const dnn::BatchDescriptor& output_descriptor, dnn::AlgorithmDesc algorithm,
+    const dnn::ConvolutionDescriptor& convolution_descriptor, double conv_scale,
+    double side_value_scale, dnn::DataType acc_type,
+    dnn::ActivationMode activation) {
+  dnn::ConvolutionProto conv_config;
+  conv_config.set_kind(kind);
+  *conv_config.mutable_input() = input_descriptor.ToProto(element_type);
+  *conv_config.mutable_filter() = filter_descriptor.ToProto(element_type);
+  *conv_config.mutable_output() = output_descriptor.ToProto(element_type);
+  *conv_config.mutable_algorithm() = algorithm.ToProto();
+  *conv_config.mutable_conv_desc() = convolution_descriptor.ToProto();
+  conv_config.mutable_conv_desc()->set_compute_mode(acc_type);
+  conv_config.set_conv_scale(conv_scale);
+  conv_config.set_side_value_scale(side_value_scale);
+  conv_config.set_activation(activation);
+  return conv_config;
+}
+
+void LogCudaProto(const dnn::ConvolutionProto& conv, float profile_time_ms,
+                  StreamExecutor* stream_executor) {
+  {
+    // For rolling-out, temporarily cap the number of logs per process.
+    // TODO(timshen): remove it.
+    static int count_down = 200;
+    if (count_down == 0) {
+      return;
+    }
+    count_down--;
+  }
+
+  ConvLogEntry conv_log;
+  *conv_log.mutable_convolution() = conv;
+  conv_log.set_profile_time_ms(profile_time_ms);
+
+  auto info = conv_log.mutable_cuda_info();
+  int cc_major, cc_minor;
+  stream_executor->GetDeviceDescription().cuda_compute_capability(&cc_major,
+                                                                  &cc_minor);
+  info->mutable_compute_capability()->set_major(cc_major);
+  info->mutable_compute_capability()->set_minor(cc_minor);
+
+  if (auto* dnn = stream_executor->AsDnn()) {
+    port::StatusOr<dnn::VersionInfo> version_or = dnn->GetVersion();
+    if (version_or.ok()) {
+      const auto& version = version_or.ValueOrDie();
+      info->mutable_cudnn_version()->set_major(version.major_version());
+      info->mutable_cudnn_version()->set_minor(version.minor_version());
+      info->mutable_cudnn_version()->set_patch(version.patch());
+    }
+  }
+  tensorflow::Logger::Singleton()->LogProto(conv_log);
+}
+
 }  // namespace
 
-template <class T>
-port::Status CudnnSupport::DoConvolveImpl(
-    Stream* stream, const dnn::BatchDescriptor& input_descriptor,
-    const DeviceMemory<T>& input_data,
+port::Status CudnnSupport::DoPrepareForConvolution(
+    dnn::ConvolutionKind kind, dnn::DataType element_type, Stream* stream,
+    const dnn::BatchDescriptor& input_descriptor, DeviceMemoryBase input_data,
     const dnn::FilterDescriptor& filter_descriptor,
-    const DeviceMemory<T>& filter_data,
+    DeviceMemoryBase filter_data, const dnn::BatchDescriptor& output_descriptor,
+    DeviceMemoryBase output_data,
     const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const dnn::BatchDescriptor& output_descriptor, DeviceMemory<T>* output_data,
-    dnn::DataType accumulator_type, ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
+    ScratchAllocator* scratch_allocator, dnn::AlgorithmDesc* algorithm_desc,
+    DeviceMemory<uint8>* scratch_memory) {
+  CudnnTensorDescriptor input_nd(
+      input_descriptor,
+      ToCudnnDataType(element_type, input_descriptor.layout()));
+  CudnnFilterDescriptor filter_nd(
+      filter_descriptor,
+      ToCudnnDataType(element_type, filter_descriptor.layout()));
+  CudnnTensorDescriptor output_nd(
+      output_descriptor,
+      ToCudnnDataType(element_type, output_descriptor.layout()));
+  CudnnConvolutionDescriptor conv(
+      convolution_descriptor,
+      ToCudnnDataType(GetConvAccumulatorType(element_type)));
+
+  auto cudnn = cudnn_->GetHandle(parent_, stream);
+
+  switch (kind) {
+    case dnn::ConvolutionKind::FORWARD: {
+      SE_ASSIGN_OR_RETURN(
+          *algorithm_desc,
+          GetCudnnConvolutionForwardAlgorithm(
+              stream, cudnn, algorithm_config, input_nd, filter_nd, conv,
+              output_nd, scratch_allocator, scratch_memory));
+      break;
+    }
+    case dnn::ConvolutionKind::BACKWARD_DATA: {
+      SE_ASSIGN_OR_RETURN(
+          *algorithm_desc,
+          GetCudnnConvolutionBackwardDataAlgorithm(
+              stream, cudnn, algorithm_config, input_nd, filter_nd, conv,
+              output_nd, scratch_allocator, scratch_memory));
+      break;
+    }
+    case dnn::ConvolutionKind::BACKWARD_FILTER: {
+      SE_ASSIGN_OR_RETURN(
+          *algorithm_desc,
+          GetCudnnConvolutionBackwardFilterAlgorithm(
+              stream, cudnn, algorithm_config, input_nd, filter_nd, conv,
+              output_nd, scratch_allocator, scratch_memory));
+      break;
+    }
+    default:
+      return port::InternalError(
+          absl::StrCat("Unexpected convolution kind ", static_cast<int>(kind)));
+  }
+
+  return port::Status::OK();
+}
+
+port::Status CudnnSupport::DoConvolve(
+    dnn::ConvolutionKind kind, dnn::DataType element_type, Stream* stream,
+    const dnn::BatchDescriptor& input_descriptor, DeviceMemoryBase input_data,
+    const dnn::FilterDescriptor& filter_descriptor,
+    DeviceMemoryBase filter_data, const dnn::BatchDescriptor& output_descriptor,
+    DeviceMemoryBase output_data,
+    const dnn::ConvolutionDescriptor& convolution_descriptor,
+    dnn::AlgorithmDesc algorithm_desc, DeviceMemory<uint8> scratch_memory,
     dnn::ProfileResult* output_profile_result) {
-  cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
+  cudnnDataType_t cudnn_type = ToCudnnDataType(element_type);
   CudnnTensorDescriptor input_nd(input_descriptor, cudnn_type);
   CudnnTensorDescriptor output_nd(output_descriptor, cudnn_type);
-  CudnnFilterDescriptor filter(filter_descriptor, cudnn_type);
+  CudnnFilterDescriptor filter_nd(filter_descriptor, cudnn_type);
+  auto accumulator_type = GetConvAccumulatorType(element_type);
   CudnnConvolutionDescriptor conv(convolution_descriptor,
                                   ToCudnnDataType(accumulator_type));
 
@@ -2411,28 +2913,22 @@ port::Status CudnnSupport::DoConvolveImpl(
 
   const bool is_profiling = output_profile_result != nullptr;
 
-  DeviceMemory<uint8> scratch;
-  SE_ASSIGN_OR_RETURN(dnn::AlgorithmDesc algo_desc,
-                      GetCudnnConvolutionForwardAlgorithm(
-                          stream, cudnn, algorithm_config, input_nd, filter,
-                          conv, output_nd, scratch_allocator, &scratch));
-
-  std::unique_ptr<CUDATimer, TimerDeleter> timer;
+  std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
   if (is_profiling) {
-    timer.reset(new CUDATimer(parent_));  // NOLINT
+    timer.reset(new GpuTimer(parent_));  // NOLINT
     // The start and stop of the timer should be as close to the Cudnn call as
     // possible. It is still possible for other threads to issue workload on
     // to this stream. So it could take multiple profiling measurements.
-    if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
+    if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
       return port::Status(port::error::INTERNAL, "Failed to start timer");
     }
   }
 
-  // Report an error if we might be hitting a cuDNN bug that accesses illegal
-  // memory. See nvbugs/2138754, b/80018418.
-  if (CUDNN_VERSION < 7300) {
-    SE_RETURN_IF_ERROR([&] {
-      if (algo_desc.algo_id() != CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) {
+  auto get_fwd_bugs = [&]() -> port::Status {
+    // Report an error if we might be hitting a cuDNN bug that accesses illegal
+    // memory. See nvbugs/2138754, b/80018418.
+    if (CUDNN_VERSION < 7300) {
+      if (algorithm_desc.algo_id() != CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) {
         return port::Status::OK();
       }
       if (input_descriptor.ndims() < 3) {
@@ -2454,33 +2950,178 @@ port::Status CudnnSupport::DoConvolveImpl(
       SE_RETURN_IF_ERROR(check_sizes(input_descriptor.count(),
                                      output_descriptor.feature_map_count()));
       return port::Status::OK();
-    }());
-  }
+    }
+    if (algorithm_desc.algo_id() ==
+            CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
+        !ShouldIncludeWinogradNonfusedAlgo(input_descriptor,
+                                           output_descriptor)) {
+      return port::Status(
+          port::error::FAILED_PRECONDITION,
+          "This configuration has potential integer overflow in "
+          "cuDNNv5 and cuDNNv6. See b/68264959.");
+    }
+    return port::Status::OK();
+  };
 
-  if (algo_desc.algo_id() == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
-      !ShouldIncludeWinogradNonfusedAlgo(input_descriptor, output_descriptor)) {
-    return port::Status(port::error::FAILED_PRECONDITION,
-                        "This configuration has potential integer overflow in "
-                        "cuDNNv5 and cuDNNv6. See b/68264959.");
-  }
+  auto get_bwd_data_bugs = [&]() -> port::Status {
+    if (algorithm_desc.algo_id() ==
+            CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
+        !ShouldIncludeWinogradNonfusedAlgo(input_descriptor,
+                                           output_descriptor)) {
+      return port::Status(
+          port::error::FAILED_PRECONDITION,
+          "This configuration has potential integer overflow in "
+          "cuDNNv5 and cuDNNv6. See b/68264959.");
+    }
 
-  RETURN_IF_CUDNN_ERROR(cudnnConvolutionForward(
-      cudnn.handle(),
-      /*alpha=*/alpha, /*srcDesc=*/input_nd.handle(),
-      /*srcData=*/input_data.opaque(), /*filterDesc=*/filter.handle(),
-      /*filterData=*/filter_data.opaque(), /*convDesc=*/conv.handle(),
-      /*algo=*/ToConvForwardAlgo(algo_desc), /*workSpace=*/scratch.opaque(),
-      /*workSpaceSizeInBytes=*/scratch.size(), /*beta=*/beta,
-      /*yDesc=*/output_nd.handle(), /*y=*/output_data->opaque()));
+    // Cudnn 7.1.4 has a bug if the workspace of the following convolution is
+    // not zero-initialized, nvbugs/2254619.
+    if (CUDNN_VERSION >= 7000 && CUDNN_VERSION < 7300 &&
+        algorithm_desc.algo_id() == CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 &&
+        cudnn_type == CUDNN_DATA_HALF && algorithm_desc.tensor_ops_enabled() &&
+        input_descriptor.layout() == dnn::DataLayout::kBatchYXDepth &&
+        filter_descriptor.layout() == dnn::FilterLayout::kOutputInputYX &&
+        output_descriptor.layout() == dnn::DataLayout::kBatchDepthYX &&
+        (convolution_descriptor.vertical_filter_stride() > 1 ||
+         convolution_descriptor.horizontal_filter_stride() > 1)) {
+      stream->ThenMemZero(&scratch_memory, scratch_memory.size());
+    }
+    return port::Status::OK();
+  };
+
+  auto get_bwd_filter_bugs = [&]() -> port::Status {
+    // Report an error if we might be hitting a cuDNN bug that produces
+    // incorrect results. See nvbugs/2072856
+    if (CUDNN_VERSION < 7300) {
+      SE_RETURN_IF_ERROR([&] {
+        if (algorithm_desc.algo_id() !=
+            CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING) {
+          return port::Status::OK();
+        }
+        if (output_descriptor.height() > 1 && output_descriptor.width() > 1) {
+          return port::Status::OK();
+        }
+        int convolution_size = output_descriptor.height() > 1
+                                   ? filter_descriptor.input_filter_height()
+                                   : filter_descriptor.input_filter_width();
+        if (convolution_size <= 32) {
+          return port::Status::OK();
+        }
+        cudnnConvolutionMode_t convolution_mode;
+        cudnnDataType_t compute_type;
+        RETURN_IF_CUDNN_ERROR(cudnnGetConvolutionNdDescriptor(
+            conv.handle(), 0, nullptr, nullptr, nullptr, nullptr,
+            &convolution_mode, &compute_type));
+        if (convolution_mode != CUDNN_CONVOLUTION) {
+          return port::Status::OK();
+        }
+        return port::Status(
+            port::error::FAILED_PRECONDITION,
+            "This configuration potentially produces incorrect results.");
+      }());
+    }
+
+    if (algorithm_desc.algo_id() ==
+            CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
+        !ShouldIncludeWinogradNonfusedAlgo(input_descriptor,
+                                           output_descriptor)) {
+      return port::Status(
+          port::error::FAILED_PRECONDITION,
+          "This configuration has potential integer overflow in "
+          "cuDNNv5 and cuDNNv6. See b/68264959.");
+    }
+
+    // Zero out the result buffer for strided conv backward filter for NHWC
+    // layouts. cuDNN 7.1.4 and 7.2 has non-determinisic bug if the buffer is
+    // not zeroed.
+    //
+    // This wrong result caused by the bug is very flaky. It needs to be run for
+    // up to 20 times to produce a mismatch.
+    //
+    // See nvbugs/2379553.
+    if (CUDNN_VERSION >= 7100 && CUDNN_VERSION < 7300 &&
+        algorithm_desc.algo_id() == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 &&
+        cudnn_type == CUDNN_DATA_HALF &&
+        input_descriptor.layout() == dnn::DataLayout::kBatchYXDepth &&
+        filter_descriptor.layout() == dnn::FilterLayout::kOutputYXInput &&
+        output_descriptor.layout() == dnn::DataLayout::kBatchYXDepth &&
+        (convolution_descriptor.vertical_filter_stride() > 1 ||
+         convolution_descriptor.horizontal_filter_stride() > 1)) {
+      stream->ThenMemZero(&filter_data, filter_data.size());
+    }
+    return port::Status::OK();
+  };
+
+  switch (kind) {
+    case dnn::ConvolutionKind::FORWARD: {
+      SE_RETURN_IF_ERROR(get_fwd_bugs());
+      RETURN_IF_CUDNN_ERROR(cudnnConvolutionForward(
+          cudnn.handle(),
+          /*alpha=*/alpha, /*srcDesc=*/input_nd.handle(),
+          /*srcData=*/input_data.opaque(), /*filterDesc=*/filter_nd.handle(),
+          /*filterData=*/filter_data.opaque(), /*convDesc=*/conv.handle(),
+          /*algo=*/ToConvForwardAlgo(algorithm_desc),
+          /*workSpace=*/scratch_memory.opaque(),
+          /*workSpaceSizeInBytes=*/scratch_memory.size(), /*beta=*/beta,
+          /*yDesc=*/output_nd.handle(), /*y=*/output_data.opaque()));
+      break;
+    }
+    case dnn::ConvolutionKind::BACKWARD_DATA: {
+      SE_RETURN_IF_ERROR(get_bwd_data_bugs());
+      RETURN_IF_CUDNN_ERROR(cudnnConvolutionBackwardData(
+          cudnn.handle(),
+          /*alpha=*/alpha,
+          /*wDesc=*/filter_nd.handle(),
+          /*w=*/filter_data.opaque(),
+          /*dyDesc=*/output_nd.handle(),
+          /*dy=*/output_data.opaque(),
+          /*convDesc=*/conv.handle(),
+          /*algo=*/ToConvBackwardDataAlgo(algorithm_desc),
+          /*workSpace=*/scratch_memory.opaque(),
+          /*workSpaceSizeInBytes=*/scratch_memory.size(),
+          /*beta=*/beta,
+          /*dxDesc=*/input_nd.handle(),
+          /*dx=*/input_data.opaque()));
+      break;
+    }
+    case dnn::ConvolutionKind::BACKWARD_FILTER: {
+      SE_RETURN_IF_ERROR(get_bwd_filter_bugs());
+      RETURN_IF_CUDNN_ERROR(cudnnConvolutionBackwardFilter(
+          cudnn.handle(),
+          /*alpha=*/alpha,
+          /*srcDesc=*/input_nd.handle(),
+          /*srcData=*/input_data.opaque(),
+          /*diffDesc=*/output_nd.handle(),
+          /*diffData=*/output_data.opaque(),
+          /*convDesc=*/conv.handle(),
+          /*algo=*/ToConvBackwardFilterAlgo(algorithm_desc),
+          /*workSpace=*/scratch_memory.opaque(),
+          /*workSpaceSizeInBytes=*/scratch_memory.size(),
+          /*beta=*/beta,
+          /*gradDesc=*/filter_nd.handle(),
+          /*dw=*/filter_data.opaque()));
+      break;
+    }
+    default:
+      return port::InternalError(
+          absl::StrCat("Unexpected convolution kind ", static_cast<int>(kind)));
+  }
 
   if (is_profiling) {
-    if (!timer->Stop(AsCUDAStream(stream))) {
+    if (!timer->Stop(AsGpuStream(stream))) {
       return port::Status(port::error::INTERNAL, "Failed to stop timer");
     }
-    output_profile_result->set_algorithm(algo_desc);
+    output_profile_result->set_algorithm(algorithm_desc);
     output_profile_result->set_elapsed_time_in_ms(
         timer->GetElapsedMilliseconds());
-    output_profile_result->set_scratch_size(scratch.size());
+    output_profile_result->set_scratch_size(scratch_memory.size());
+
+    LogCudaProto(
+        GenerateConvProto(kind, element_type, input_descriptor,
+                          filter_descriptor, output_descriptor, algorithm_desc,
+                          convolution_descriptor, dalpha, dbeta,
+                          accumulator_type, dnn::ActivationMode::kNone),
+        output_profile_result->elapsed_time_in_ms(), stream->parent());
   }
 
   return port::Status::OK();
@@ -2532,13 +3173,13 @@ port::Status CudnnSupport::DoFusedConvolveImpl(
           stream, cudnn, algorithm_config, conv_input_nd, filter, conv,
           output_nd, scratch_allocator, &scratch));
 
-  std::unique_ptr<CUDATimer, TimerDeleter> timer;
+  std::unique_ptr<GpuTimer, GpuTimerDeleter> timer;
   if (is_profiling) {
-    timer.reset(new CUDATimer(parent_));  // NOLINT
+    timer.reset(new GpuTimer(parent_));  // NOLINT
     // The start and stop of the timer should be as close to the Cudnn call as
     // possible. It is still possible for other threads to issue workload on
     // to this stream. So it could take multiple profiling measurements.
-    if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
+    if (!timer->Init() || !timer->Start(AsGpuStream(stream))) {
       return port::Status(port::error::INTERNAL, "Failed to start timer");
     }
   }
@@ -2591,21 +3232,42 @@ port::Status CudnnSupport::DoFusedConvolveImpl(
       /*yDesc=*/output_nd.handle(), /*y=*/output_data->opaque()));
 
   if (is_profiling) {
-    if (!timer->Stop(AsCUDAStream(stream))) {
+    if (!timer->Stop(AsGpuStream(stream))) {
       return port::Status(port::error::INTERNAL, "Failed to stop timer");
     }
     output_profile_result->set_algorithm(algo_desc);
     output_profile_result->set_elapsed_time_in_ms(
         timer->GetElapsedMilliseconds());
     output_profile_result->set_scratch_size(scratch.size());
+
+    LogCudaProto(
+        GenerateConvProto(
+            dnn::ConvolutionKind::FORWARD, dnn::ToDataType<ElementType>::value,
+            conv_input_descriptor, filter_descriptor, output_descriptor,
+            algo_desc, convolution_descriptor, conv_input_scale,
+            side_input_scale, accumulator_type, activation_mode),
+        output_profile_result->elapsed_time_in_ms(), stream->parent());
   }
 
   return port::Status::OK();
 }
 
+inline bool TensorOpMathAvailable(int cc_major) {
+  return cc_major >= 7 && CUDNN_VERSION >= 7000 && TensorOpMathEnabled();
+}
+
 bool CudnnSupport::GetConvolveAlgorithms(
     bool with_winograd_nonfused, int cc_major, int cc_minor,
     std::vector<dnn::AlgorithmDesc>* out_algorithms) {
+  bool tensor_op_math_available = TensorOpMathAvailable(cc_major);
+  out_algorithms->clear();
+
+  if (RequireDeterminism()) {
+    out_algorithms->push_back({CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM,
+                               tensor_op_math_available});
+    return true;
+  }
+
   std::vector<dnn::AlgorithmDesc::Index> algo_types = {
     // clang-format off
     CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM,
@@ -2623,13 +3285,13 @@ bool CudnnSupport::GetConvolveAlgorithms(
     algo_types.push_back(CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED);
   }
 
-  out_algorithms->clear();
   for (auto i : algo_types) {
     out_algorithms->push_back({i, /*use_tensor_ops=*/false});
-    if (cc_major >= 7 && CUDNN_VERSION >= 7000 && TensorOpMathEnabled()) {
+    if (tensor_op_math_available) {
       out_algorithms->push_back({i, /*use_tensor_ops=*/true});
     }
   }
+
   return true;
 }
 
@@ -2658,6 +3320,15 @@ bool CudnnSupport::GetRnnAlgorithms(
 bool CudnnSupport::GetConvolveBackwardDataAlgorithms(
     bool with_winograd_nonfused, int cc_major, int cc_minor,
     std::vector<dnn::AlgorithmDesc>* out_algorithms) {
+  bool tensor_op_math_available = TensorOpMathAvailable(cc_major);
+  out_algorithms->clear();
+
+  if (RequireDeterminism()) {
+    out_algorithms->push_back(
+        {CUDNN_CONVOLUTION_BWD_DATA_ALGO_1, tensor_op_math_available});
+    return true;
+  }
+
   std::vector<dnn::AlgorithmDesc::Index> algo_types = {
       // clang-format off
     CUDNN_CONVOLUTION_BWD_DATA_ALGO_0,
@@ -2671,19 +3342,28 @@ bool CudnnSupport::GetConvolveBackwardDataAlgorithms(
     algo_types.push_back(CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED);
   }
 
-  out_algorithms->clear();
   for (auto i : algo_types) {
     out_algorithms->push_back({i, /*use_tensor_ops=*/false});
-    if (cc_major >= 7 && CUDNN_VERSION >= 7000 && TensorOpMathEnabled()) {
+    if (tensor_op_math_available) {
       out_algorithms->push_back({i, /*use_tensor_ops=*/true});
     }
   }
+
   return true;
 }
 
 bool CudnnSupport::GetConvolveBackwardFilterAlgorithms(
     bool with_winograd_nonfused, int cc_major, int cc_minor,
     std::vector<dnn::AlgorithmDesc>* out_algorithms) {
+  bool tensor_op_math_available = TensorOpMathAvailable(cc_major);
+  out_algorithms->clear();
+
+  if (RequireDeterminism()) {
+    out_algorithms->push_back(
+        {CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, tensor_op_math_available});
+    return true;
+  }
+
   std::vector<dnn::AlgorithmDesc::Index> algo_types = {
       // clang-format off
       CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0,
@@ -2702,13 +3382,13 @@ bool CudnnSupport::GetConvolveBackwardFilterAlgorithms(
     algo_types.push_back(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED);
   }
 
-  out_algorithms->clear();
   for (auto i : algo_types) {
     out_algorithms->push_back({i, /*use_tensor_ops=*/false});
-    if (cc_major >= 7 && CUDNN_VERSION >= 7000 && TensorOpMathEnabled()) {
+    if (tensor_op_math_available) {
       out_algorithms->push_back({i, /*use_tensor_ops=*/true});
     }
   }
+
   return true;
 }
 
@@ -2877,85 +3557,27 @@ port::Status CudnnSupport::DoBatchNormalizationBackwardImpl(
   return port::Status::OK();
 }
 
-bool CudnnSupport::DoConvolve(
-    Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
-    const DeviceMemory<float>& input_data,
+bool CudnnSupport::DoFusedConvolve(
+    Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
+    const DeviceMemory<double>& conv_input_data, double conv_input_scale,
     const dnn::FilterDescriptor& filter_descriptor,
-    const DeviceMemory<float>& filter_data,
+    const DeviceMemory<double>& filter_data,
     const dnn::ConvolutionDescriptor& convolution_descriptor,
+    const DeviceMemory<double>& side_input_data, double side_input_scale,
+    const dnn::BatchDescriptor& bias_descriptor,
+    const DeviceMemory<double>& biases, dnn::ActivationMode activation_mode,
     const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<float>* output_data, ScratchAllocator* scratch_allocator,
+    DeviceMemory<double>* output_data, ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   return IsStatusOk(
-      DoConvolveImpl(stream, batch_descriptor, input_data, filter_descriptor,
-                     filter_data, convolution_descriptor, output_descriptor,
-                     output_data, dnn::DataType::kFloat, scratch_allocator,
-                     algorithm_config, output_profile_result),
-      /*report_error=*/!output_profile_result);
-}
-
-bool CudnnSupport::DoConvolve(
-    Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
-    const DeviceMemory<double>& input_data,
-    const dnn::FilterDescriptor& filter_descriptor,
-    const DeviceMemory<double>& filter_data,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<double>* output_data, ScratchAllocator* scratch_allocator,
-    const dnn::AlgorithmConfig& algorithm_config,
-    dnn::ProfileResult* output_profile_result) {
-  return IsStatusOk(
-      DoConvolveImpl(stream, batch_descriptor, input_data, filter_descriptor,
-                     filter_data, convolution_descriptor, output_descriptor,
-                     output_data, dnn::DataType::kDouble, scratch_allocator,
-                     algorithm_config, output_profile_result),
-      /*report_error=*/!output_profile_result);
-}
-
-bool CudnnSupport::DoConvolve(
-    Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
-    const DeviceMemory<Eigen::half>& input_data,
-    const dnn::FilterDescriptor& filter_descriptor,
-    const DeviceMemory<Eigen::half>& filter_data,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<Eigen::half>* output_data, ScratchAllocator* scratch_allocator,
-    const dnn::AlgorithmConfig& algorithm_config,
-    dnn::ProfileResult* output_profile_result) {
-  dnn::DataType acc_type =
-      CudnnEnvVar<ConvDoFP32ComputationFP16Input>::IsEnabled()
-          ? dnn::DataType::kFloat
-          : dnn::DataType::kHalf;
-  return IsStatusOk(
-      DoConvolveImpl(stream, batch_descriptor, input_data, filter_descriptor,
-                     filter_data, convolution_descriptor, output_descriptor,
-                     output_data, acc_type, scratch_allocator, algorithm_config,
-                     output_profile_result),
-      /*report_error=*/!output_profile_result);
-}
-
-bool CudnnSupport::DoFusedConvolve(
-    Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
-    const DeviceMemory<double>& conv_input_data, double conv_input_scale,
-    const dnn::FilterDescriptor& filter_descriptor,
-    const DeviceMemory<double>& filter_data,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const DeviceMemory<double>& side_input_data, double side_input_scale,
-    const dnn::BatchDescriptor& bias_descriptor,
-    const DeviceMemory<double>& biases, dnn::ActivationMode activation_mode,
-    const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<double>* output_data, ScratchAllocator* scratch_allocator,
-    const dnn::AlgorithmConfig& algorithm_config,
-    dnn::ProfileResult* output_profile_result) {
-  return IsStatusOk(
-      DoFusedConvolveImpl(stream, conv_input_descriptor, conv_input_data,
-                          conv_input_scale, filter_descriptor, filter_data,
-                          convolution_descriptor, side_input_data,
-                          side_input_scale, bias_descriptor, biases,
-                          activation_mode, output_descriptor, output_data,
-                          dnn::DataType::kDouble, scratch_allocator,
-                          algorithm_config, output_profile_result),
+      DoFusedConvolveImpl(
+          stream, conv_input_descriptor, conv_input_data, conv_input_scale,
+          filter_descriptor, filter_data, convolution_descriptor,
+          side_input_data, side_input_scale, bias_descriptor, biases,
+          activation_mode, output_descriptor, output_data,
+          GetConvAccumulatorType(dnn::DataType::kDouble), scratch_allocator,
+          algorithm_config, output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -2973,13 +3595,13 @@ bool CudnnSupport::DoFusedConvolve(
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   return IsStatusOk(
-      DoFusedConvolveImpl(stream, conv_input_descriptor, conv_input_data,
-                          conv_input_scale, filter_descriptor, filter_data,
-                          convolution_descriptor, side_input_data,
-                          side_input_scale, bias_descriptor, biases,
-                          activation_mode, output_descriptor, output_data,
-                          dnn::DataType::kFloat, scratch_allocator,
-                          algorithm_config, output_profile_result),
+      DoFusedConvolveImpl(
+          stream, conv_input_descriptor, conv_input_data, conv_input_scale,
+          filter_descriptor, filter_data, convolution_descriptor,
+          side_input_data, side_input_scale, bias_descriptor, biases,
+          activation_mode, output_descriptor, output_data,
+          GetConvAccumulatorType(dnn::DataType::kFloat), scratch_allocator,
+          algorithm_config, output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -2997,17 +3619,14 @@ bool CudnnSupport::DoFusedConvolve(
     DeviceMemory<Eigen::half>* output_data, ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
-  dnn::DataType acc_type =
-      CudnnEnvVar<ConvDoFP32ComputationFP16Input>::IsEnabled()
-          ? dnn::DataType::kFloat
-          : dnn::DataType::kHalf;
   return IsStatusOk(
       DoFusedConvolveImpl(
           stream, conv_input_descriptor, conv_input_data, conv_input_scale,
           filter_descriptor, filter_data, convolution_descriptor,
           side_input_data, side_input_scale, bias_descriptor, biases,
-          activation_mode, output_descriptor, output_data, acc_type,
-          scratch_allocator, algorithm_config, output_profile_result),
+          activation_mode, output_descriptor, output_data,
+          GetConvAccumulatorType(dnn::DataType::kHalf), scratch_allocator,
+          algorithm_config, output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -3033,13 +3652,13 @@ bool CudnnSupport::DoFusedConvolve(
     return false;
   }
   return IsStatusOk(
-      DoFusedConvolveImpl(stream, conv_input_descriptor, conv_input_data,
-                          conv_input_scale, filter_descriptor, filter_data,
-                          convolution_descriptor, side_input_data,
-                          side_input_scale, bias_descriptor, biases,
-                          activation_mode, output_descriptor, output_data,
-                          dnn::DataType::kInt32, scratch_allocator,
-                          algorithm_config, output_profile_result),
+      DoFusedConvolveImpl(
+          stream, conv_input_descriptor, conv_input_data, conv_input_scale,
+          filter_descriptor, filter_data, convolution_descriptor,
+          side_input_data, side_input_scale, bias_descriptor, biases,
+          activation_mode, output_descriptor, output_data,
+          GetConvAccumulatorType(dnn::DataType::kInt8), scratch_allocator,
+          algorithm_config, output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -3065,368 +3684,6 @@ bool CudnnSupport::DoTransformTensor(Stream* stream,
   return IsStatusOk(status, /*report_error=*/true);
 }
 
-template <class T>
-port::Status CudnnSupport::DoConvolveBackwardDataImpl(
-    Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
-    const DeviceMemory<T>& filter_data,
-    const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<T> backward_output_data,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const dnn::BatchDescriptor& input_descriptor,
-    DeviceMemory<T>* backward_input_data, dnn::DataType accumulator_type,
-    ScratchAllocator* scratch_allocator,
-    const dnn::AlgorithmConfig& algorithm_config,
-    dnn::ProfileResult* output_profile_result) {
-  cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
-  // Alpha is the scaling factor for input.
-  float falpha = 1.0;
-  double dalpha = 1.0;
-  void* alpha = cudnn_type == CUDNN_DATA_DOUBLE ? static_cast<void*>(&dalpha)
-                                                : static_cast<void*>(&falpha);
-  // Beta is the scaling factor for output.
-  float fbeta = 0.0;
-  double dbeta = 0.0;
-  void* beta = cudnn_type == CUDNN_DATA_DOUBLE ? static_cast<void*>(&dbeta)
-                                               : static_cast<void*>(&fbeta);
-
-  auto cudnn = cudnn_->GetHandle(parent_, stream);
-
-  CudnnTensorDescriptor out_back_nd(output_descriptor, cudnn_type);
-  CudnnTensorDescriptor in_back_nd(input_descriptor, cudnn_type);
-  CudnnFilterDescriptor filter(filter_descriptor, cudnn_type);
-  CudnnConvolutionDescriptor conv(convolution_descriptor,
-                                  ToCudnnDataType(accumulator_type));
-
-  const bool is_profiling = output_profile_result != nullptr;
-
-  DeviceMemory<uint8> scratch;
-  SE_ASSIGN_OR_RETURN(dnn::AlgorithmDesc algo_desc,
-                      GetCudnnConvolutionBackwardDataAlgorithm(
-                          stream, cudnn, algorithm_config, in_back_nd, filter,
-                          conv, out_back_nd, scratch_allocator, &scratch));
-
-  std::unique_ptr<CUDATimer, TimerDeleter> timer;
-  if (is_profiling) {
-    timer.reset(new CUDATimer(parent_));  // NOLINT
-    // The start and stop of the timer should be as close to the Cudnn call as
-    // possible. It is still possible for other threads to issue workload on
-    // to this stream. So it could take multiple profiling measurements.
-    if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
-      return port::Status(port::error::INTERNAL, "Failed to start timer");
-    }
-  }
-
-  if (algo_desc.algo_id() == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
-      !ShouldIncludeWinogradNonfusedAlgo(input_descriptor, output_descriptor)) {
-    return port::Status(port::error::FAILED_PRECONDITION,
-                        "This configuration has potential integer overflow in "
-                        "cuDNNv5 and cuDNNv6. See b/68264959.");
-  }
-
-  // Cudnn 7.1.4 has a bug if the workspace of the following convolution is not
-  // zero-initialized, nvbugs/2254619.
-  if (CUDNN_VERSION >= 7000 && CUDNN_VERSION < 7300 &&
-      algo_desc.algo_id() == CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 &&
-      cudnn_type == CUDNN_DATA_HALF && algo_desc.tensor_ops_enabled() &&
-      input_descriptor.layout() == dnn::DataLayout::kBatchYXDepth &&
-      filter_descriptor.layout() == dnn::FilterLayout::kOutputInputYX &&
-      output_descriptor.layout() == dnn::DataLayout::kBatchDepthYX &&
-      (convolution_descriptor.vertical_filter_stride() > 1 ||
-       convolution_descriptor.horizontal_filter_stride() > 1)) {
-    stream->ThenMemZero(&scratch, scratch.size());
-  }
-
-  RETURN_IF_CUDNN_ERROR(
-      cudnnConvolutionBackwardData(cudnn.handle(),
-                                   /*alpha=*/alpha,
-                                   /*wDesc=*/filter.handle(),
-                                   /*w=*/filter_data.opaque(),
-                                   /*dyDesc=*/out_back_nd.handle(),
-                                   /*dy=*/backward_output_data.opaque(),
-                                   /*convDesc=*/conv.handle(),
-                                   /*algo=*/ToConvBackwardDataAlgo(algo_desc),
-                                   /*workSpace=*/scratch.opaque(),
-                                   /*workSpaceSizeInBytes=*/scratch.size(),
-                                   /*beta=*/beta,
-                                   /*dxDesc=*/in_back_nd.handle(),
-                                   /*dx=*/backward_input_data->opaque()));
-  if (is_profiling) {
-    if (!timer->Stop(AsCUDAStream(stream))) {
-      return port::Status(port::error::INTERNAL, "Failed to stop timer");
-    }
-    output_profile_result->set_algorithm(algo_desc);
-    output_profile_result->set_elapsed_time_in_ms(
-        timer->GetElapsedMilliseconds());
-    output_profile_result->set_scratch_size(scratch.size());
-  }
-
-  return port::Status::OK();
-}
-
-bool CudnnSupport::DoConvolveBackwardData(
-    Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
-    const DeviceMemory<double>& filter_data,
-    const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<double> backward_output_data,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const dnn::BatchDescriptor& input_descriptor,
-    DeviceMemory<double>* backward_input_data,
-    ScratchAllocator* scratch_allocator,
-    const dnn::AlgorithmConfig& algorithm_config,
-    dnn::ProfileResult* output_profile_result) {
-  return IsStatusOk(
-      DoConvolveBackwardDataImpl(
-          stream, filter_descriptor, filter_data, output_descriptor,
-          backward_output_data, convolution_descriptor, input_descriptor,
-          backward_input_data, dnn::DataType::kDouble, scratch_allocator,
-          algorithm_config, output_profile_result),
-      /*report_error=*/!output_profile_result);
-}
-
-bool CudnnSupport::DoConvolveBackwardData(
-    Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
-    const DeviceMemory<float>& filter_data,
-    const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<float> backward_output_data,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const dnn::BatchDescriptor& input_descriptor,
-    DeviceMemory<float>* backward_input_data,
-    ScratchAllocator* scratch_allocator,
-    const dnn::AlgorithmConfig& algorithm_config,
-    dnn::ProfileResult* output_profile_result) {
-  return IsStatusOk(
-      DoConvolveBackwardDataImpl(
-          stream, filter_descriptor, filter_data, output_descriptor,
-          backward_output_data, convolution_descriptor, input_descriptor,
-          backward_input_data, dnn::DataType::kFloat, scratch_allocator,
-          algorithm_config, output_profile_result),
-      /*report_error=*/!output_profile_result);
-}
-
-bool CudnnSupport::DoConvolveBackwardData(
-    Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
-    const DeviceMemory<Eigen::half>& filter_data,
-    const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<Eigen::half> backward_output_data,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const dnn::BatchDescriptor& input_descriptor,
-    DeviceMemory<Eigen::half>* backward_input_data,
-    ScratchAllocator* scratch_allocator,
-    const dnn::AlgorithmConfig& algorithm_config,
-    dnn::ProfileResult* output_profile_result) {
-  dnn::DataType acc_type =
-      CudnnEnvVar<ConvDoFP32ComputationFP16Input>::IsEnabled()
-          ? dnn::DataType::kFloat
-          : dnn::DataType::kHalf;
-  return IsStatusOk(
-      DoConvolveBackwardDataImpl(
-          stream, filter_descriptor, filter_data, output_descriptor,
-          backward_output_data, convolution_descriptor, input_descriptor,
-          backward_input_data, acc_type, scratch_allocator, algorithm_config,
-          output_profile_result),
-      /*report_error=*/!output_profile_result);
-}
-
-template <class T>
-port::Status CudnnSupport::DoConvolveBackwardFilterImpl(
-    Stream* stream, const dnn::BatchDescriptor& input_descriptor,
-    const DeviceMemory<T>& input_data,
-    const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<T> backward_output_data,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const dnn::FilterDescriptor& filter_descriptor,
-    DeviceMemory<T>* backward_filter_data, dnn::DataType accumulator_type,
-    ScratchAllocator* scratch_allocator,
-    const dnn::AlgorithmConfig& algorithm_config,
-    dnn::ProfileResult* output_profile_result) {
-  cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
-  // Alpha is the scaling factor for input.
-  float falpha = 1.0;
-  double dalpha = 1.0;
-  void* alpha = cudnn_type == CUDNN_DATA_DOUBLE ? static_cast<void*>(&dalpha)
-                                                : static_cast<void*>(&falpha);
-  // Beta is the scaling factor for output.
-  float fbeta = 0.0;
-  double dbeta = 0.0;
-  void* beta = cudnn_type == CUDNN_DATA_DOUBLE ? static_cast<void*>(&dbeta)
-                                               : static_cast<void*>(&fbeta);
-
-  auto cudnn = cudnn_->GetHandle(parent_, stream);
-
-  CudnnTensorDescriptor out_back_nd(output_descriptor, cudnn_type);
-  CudnnTensorDescriptor input_nd(input_descriptor, cudnn_type);
-  CudnnFilterDescriptor filter(filter_descriptor, cudnn_type);
-  CudnnConvolutionDescriptor conv(convolution_descriptor,
-                                  ToCudnnDataType(accumulator_type));
-
-  const bool is_profiling = output_profile_result != nullptr;
-
-  DeviceMemory<uint8> scratch;
-  SE_ASSIGN_OR_RETURN(dnn::AlgorithmDesc algo_desc,
-                      GetCudnnConvolutionBackwardFilterAlgorithm(
-                          stream, cudnn, algorithm_config, input_nd, filter,
-                          conv, out_back_nd, scratch_allocator, &scratch));
-
-  std::unique_ptr<CUDATimer, TimerDeleter> timer;
-  if (is_profiling) {
-    timer.reset(new CUDATimer(parent_));  // NOLINT
-    // The start and stop of the timer should be as close to the Cudnn call as
-    // possible. It is still possible for other threads to issue workload on
-    // to this stream. So it could take multiple profiling measurements.
-    if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
-      return port::Status(port::error::INTERNAL, "Failed to start timer");
-    }
-  }
-
-  // Report an error if we might be hitting a cuDNN bug that produces incorrect
-  // results. See nvbugs/2072856
-  if (CUDNN_VERSION < 7300) {
-    SE_RETURN_IF_ERROR([&] {
-      if (algo_desc.algo_id() != CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING) {
-        return port::Status::OK();
-      }
-      if (output_descriptor.height() > 1 && output_descriptor.width() > 1) {
-        return port::Status::OK();
-      }
-      int convolution_size = output_descriptor.height() > 1
-                                 ? filter_descriptor.input_filter_height()
-                                 : filter_descriptor.input_filter_width();
-      if (convolution_size <= 32) {
-        return port::Status::OK();
-      }
-      cudnnConvolutionMode_t convolution_mode;
-      cudnnDataType_t compute_type;
-      RETURN_IF_CUDNN_ERROR(cudnnGetConvolutionNdDescriptor(
-          conv.handle(), 0, nullptr, nullptr, nullptr, nullptr,
-          &convolution_mode, &compute_type));
-      if (convolution_mode != CUDNN_CONVOLUTION) {
-        return port::Status::OK();
-      }
-      return port::Status(
-          port::error::FAILED_PRECONDITION,
-          "This configuration potentially produces incorrect results.");
-    }());
-  }
-
-  if (algo_desc.algo_id() == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
-      !ShouldIncludeWinogradNonfusedAlgo(input_descriptor, output_descriptor)) {
-    return port::Status(port::error::FAILED_PRECONDITION,
-                        "This configuration has potential integer overflow in "
-                        "cuDNNv5 and cuDNNv6. See b/68264959.");
-  }
-
-  // Zero out the result buffer for strided conv backward filter for NHWC
-  // layouts. cuDNN 7.1.4 and 7.2 has non-determinisic bug if the buffer is not
-  // zeroed.
-  //
-  // This wrong result caused by the bug is very flaky. It needs to be run for
-  // up to 20 times to produce a mismatch.
-  //
-  // See nvbugs/2379553.
-  if (CUDNN_VERSION >= 7100 && CUDNN_VERSION < 7300 &&
-      algo_desc.algo_id() == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 &&
-      cudnn_type == CUDNN_DATA_HALF &&
-      input_descriptor.layout() == dnn::DataLayout::kBatchYXDepth &&
-      filter_descriptor.layout() == dnn::FilterLayout::kOutputYXInput &&
-      output_descriptor.layout() == dnn::DataLayout::kBatchYXDepth &&
-      (convolution_descriptor.vertical_filter_stride() > 1 ||
-       convolution_descriptor.horizontal_filter_stride() > 1)) {
-    stream->ThenMemZero(backward_filter_data, backward_filter_data->size());
-  }
-
-  RETURN_IF_CUDNN_ERROR(cudnnConvolutionBackwardFilter(
-      cudnn.handle(),
-      /*alpha=*/alpha,
-      /*srcDesc=*/input_nd.handle(),
-      /*srcData=*/input_data.opaque(),
-      /*diffDesc=*/out_back_nd.handle(),
-      /*diffData=*/backward_output_data.opaque(),
-      /*convDesc=*/conv.handle(),
-      /*algo=*/ToConvBackwardFilterAlgo(algo_desc),
-      /*workSpace=*/scratch.opaque(),
-      /*workSpaceSizeInBytes=*/scratch.size(),
-      /*beta=*/beta,
-      /*gradDesc=*/filter.handle(),
-      /*dw=*/backward_filter_data->opaque()));
-  if (is_profiling) {
-    if (!timer->Stop(AsCUDAStream(stream))) {
-      return port::Status(port::error::INTERNAL, "Failed to stop timer");
-    }
-    output_profile_result->set_algorithm(algo_desc);
-    output_profile_result->set_elapsed_time_in_ms(
-        timer->GetElapsedMilliseconds());
-    output_profile_result->set_scratch_size(scratch.size());
-  }
-
-  return port::Status::OK();
-}
-
-bool CudnnSupport::DoConvolveBackwardFilter(
-    Stream* stream, const dnn::BatchDescriptor& input_descriptor,
-    const DeviceMemory<double>& input_data,
-    const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<double> backward_output_data,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const dnn::FilterDescriptor& filter_descriptor,
-    DeviceMemory<double>* backward_filter_data,
-    ScratchAllocator* scratch_allocator,
-    const dnn::AlgorithmConfig& algorithm_config,
-    dnn::ProfileResult* output_profile_result) {
-  return IsStatusOk(
-      DoConvolveBackwardFilterImpl(
-          stream, input_descriptor, input_data, output_descriptor,
-          backward_output_data, convolution_descriptor, filter_descriptor,
-          backward_filter_data, dnn::DataType::kDouble,
-
-          scratch_allocator, algorithm_config, output_profile_result),
-      /*report_error=*/!output_profile_result);
-}
-
-bool CudnnSupport::DoConvolveBackwardFilter(
-    Stream* stream, const dnn::BatchDescriptor& input_descriptor,
-    const DeviceMemory<float>& input_data,
-    const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<float> backward_output_data,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const dnn::FilterDescriptor& filter_descriptor,
-    DeviceMemory<float>* backward_filter_data,
-    ScratchAllocator* scratch_allocator,
-    const dnn::AlgorithmConfig& algorithm_config,
-    dnn::ProfileResult* output_profile_result) {
-  return IsStatusOk(DoConvolveBackwardFilterImpl(
-                        stream, input_descriptor, input_data, output_descriptor,
-                        backward_output_data, convolution_descriptor,
-                        filter_descriptor, backward_filter_data,
-
-                        dnn::DataType::kFloat, scratch_allocator,
-                        algorithm_config, output_profile_result),
-                    /*report_error=*/!output_profile_result);
-}
-
-bool CudnnSupport::DoConvolveBackwardFilter(
-    Stream* stream, const dnn::BatchDescriptor& input_descriptor,
-    const DeviceMemory<Eigen::half>& input_data,
-    const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<Eigen::half> backward_output_data,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const dnn::FilterDescriptor& filter_descriptor,
-    DeviceMemory<Eigen::half>* backward_filter_data,
-    ScratchAllocator* scratch_allocator,
-    const dnn::AlgorithmConfig& algorithm_config,
-    dnn::ProfileResult* output_profile_result) {
-  dnn::DataType acc_type =
-      CudnnEnvVar<ConvDoFP32ComputationFP16Input>::IsEnabled()
-          ? dnn::DataType::kFloat
-          : dnn::DataType::kHalf;
-  return IsStatusOk(
-      DoConvolveBackwardFilterImpl(
-          stream, input_descriptor, input_data, output_descriptor,
-          backward_output_data, convolution_descriptor, filter_descriptor,
-          backward_filter_data, acc_type, scratch_allocator, algorithm_config,
-          output_profile_result),
-      /*report_error=*/!output_profile_result);
-}
-
 template <class T>
 port::Status CudnnSupport::DoConvolveBackwardBiasImpl(
     Stream* stream, const dnn::BatchDescriptor& input_descriptor,
@@ -3760,6 +4017,31 @@ bool CudnnSupport::DoPoolForward(
   return IsStatusOk(status, /*report_error=*/true);
 }
 
+bool CudnnSupport::DoPoolForward(
+    Stream* stream, const dnn::PoolingDescriptor& pooling_dimensions,
+    const dnn::BatchDescriptor& input_dimensions,
+    const DeviceMemory<int8>& input_data,
+    const dnn::BatchDescriptor& output_dimensions,
+    DeviceMemory<int8>* output_data, ScratchAllocator* workspace_allocator) {
+  // Alpha is the scaling factor for input.
+  float alpha = 1.0;
+  // Beta is the scaling factor for output.
+  float beta = 0.0;
+
+  CudnnTensorDescriptor src_desc(input_dimensions, CUDNN_DATA_INT8);
+  CudnnTensorDescriptor dest_desc(output_dimensions, CUDNN_DATA_INT8);
+  CudnnPoolingDescriptor pooling_desc(pooling_dimensions);
+
+  auto cudnn = cudnn_->GetHandle(parent_, stream);
+  auto status = [&] {
+    RETURN_IF_CUDNN_ERROR(cudnnPoolingForward(
+        cudnn.handle(), pooling_desc.handle(), &alpha, src_desc.handle(),
+        input_data.opaque(), &beta, dest_desc.handle(), output_data->opaque()));
+    return port::Status::OK();
+  }();
+  return IsStatusOk(status, /*report_error=*/true);
+}
+
 bool CudnnSupport::DoPoolBackward(
     Stream* stream, const dnn::PoolingDescriptor& pooling_dimensions,
     const dnn::BatchDescriptor& input_dimensions,
@@ -3850,13 +4132,6 @@ bool CudnnSupport::DoPoolBackward(
   return IsStatusOk(status, /*report_error=*/true);
 }
 
-bool CudnnSupport::DoNormalize(
-    Stream* stream, const dnn::NormalizeDescriptor& normalize_descriptor,
-    const DeviceMemory<float>& input_data, DeviceMemory<float>* output_data) {
-  LOG(FATAL) << "not yet implemented";  // TODO(leary)
-  return false;
-}
-
 bool CudnnSupport::DoNormalizeWithDimensions(
     Stream* stream, const dnn::NormalizeDescriptor& normalize_descriptor,
     const dnn::BatchDescriptor& dimensions,
@@ -4055,22 +4330,22 @@ bool CudnnSupport::DeriveOutputBatchDescriptor(
   return IsStatusOk(status, /*report_error=*/true);
 }
 
-}  // namespace cuda
+}  // namespace gpu
 
 void initialize_cudnn() {
   port::Status status =
       PluginRegistry::Instance()->RegisterFactory<PluginRegistry::DnnFactory>(
-          cuda::kCudaPlatformId, cuda::kCuDnnPlugin, "cuDNN",
+          cuda::kCudaPlatformId, gpu::kCuDnnPlugin, "cuDNN",
           [](internal::StreamExecutorInterface* parent) -> dnn::DnnSupport* {
-            cuda::CUDAExecutor* cuda_executor =
-                dynamic_cast<cuda::CUDAExecutor*>(parent);
+            gpu::GpuExecutor* cuda_executor =
+                dynamic_cast<gpu::GpuExecutor*>(parent);
             if (cuda_executor == nullptr) {
               LOG(ERROR) << "Attempting to initialize an instance of the cuDNN "
                          << "support library with a non-CUDA StreamExecutor";
               return nullptr;
             }
 
-            cuda::CudnnSupport* dnn = new cuda::CudnnSupport(cuda_executor);
+            gpu::CudnnSupport* dnn = new gpu::CudnnSupport(cuda_executor);
             if (!dnn->Init().ok()) {
               // Note: Init() will log a more specific error.
               delete dnn;
@@ -4085,10 +4360,12 @@ void initialize_cudnn() {
   }
 
   PluginRegistry::Instance()->SetDefaultFactory(
-      cuda::kCudaPlatformId, PluginKind::kDnn, cuda::kCuDnnPlugin);
+      cuda::kCudaPlatformId, PluginKind::kDnn, gpu::kCuDnnPlugin);
 }
 
 }  // namespace stream_executor
 
+#pragma clang diagnostic pop
+
 REGISTER_MODULE_INITIALIZER(register_cudnn,
                             { stream_executor::initialize_cudnn(); });
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h
index 0641be140d2f19651696b0bcac498870a4db2960..d8a8ddf6a566580531783c3ac4b9cc26406409ea 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.h
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.h
@@ -28,9 +28,9 @@ limitations under the License.
 #include "tensorflow/stream_executor/temporary_device_memory.h"
 
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 
-class CUDAExecutor;
+class GpuExecutor;
 class CudnnRnnDescriptor;
 class CudnnRnnSequenceTensorDescriptor;
 class CudnnRnnStateTensorDescriptor;
@@ -42,7 +42,7 @@ extern const PluginId kCuDnnPlugin;
 // functions, see dnn.h.
 class CudnnSupport : public dnn::DnnSupport {
  public:
-  explicit CudnnSupport(CUDAExecutor* parent);
+  explicit CudnnSupport(GpuExecutor* parent);
 
   port::Status Init() override;
   port::StatusOr<perftools::gputools::dnn::VersionInfo> GetVersion() override;
@@ -55,10 +55,16 @@ class CudnnSupport : public dnn::DnnSupport {
       ScratchAllocator* state_allocator) override;
 
   port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>>
-  createRnnSequenceTensorDescriptor(int seq_length, int batch_size,
+  createRnnSequenceTensorDescriptor(int max_seq_length, int batch_size,
                                     int data_size,
                                     dnn::DataType data_type) override;
 
+  port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>>
+  createRnnSequenceTensorDescriptor(int max_seq_length, int batch_size,
+                                    int data_size,
+                                    const absl::Span<const int>& seq_lengths,
+                                    dnn::DataType data_type) override;
+
   port::StatusOr<std::unique_ptr<dnn::RnnStateTensorDescriptor>>
   createRnnStateTensorDescriptor(int num_layer, int batch_size, int data_size,
                                  dnn::DataType data_type) override;
@@ -252,38 +258,16 @@ class CudnnSupport : public dnn::DnnSupport {
       DeviceMemory<float>* scale_backprop,
       DeviceMemory<float>* offset_backprop) override;
 
-  bool DoConvolve(Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
-                  const DeviceMemory<float>& input_data,
-                  const dnn::FilterDescriptor& filter_descriptor,
-                  const DeviceMemory<float>& filter_data,
-                  const dnn::ConvolutionDescriptor& convolution_descriptor,
-                  const dnn::BatchDescriptor& output_descriptor,
-                  DeviceMemory<float>* output_data,
-                  ScratchAllocator* scratch_allocator,
-                  const dnn::AlgorithmConfig& algorithm_config,
-                  dnn::ProfileResult* output_profile_result) override;
-
-  bool DoConvolve(Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
-                  const DeviceMemory<double>& input_data,
-                  const dnn::FilterDescriptor& filter_descriptor,
-                  const DeviceMemory<double>& filter_data,
-                  const dnn::ConvolutionDescriptor& convolution_descriptor,
-                  const dnn::BatchDescriptor& output_descriptor,
-                  DeviceMemory<double>* output_data,
-                  ScratchAllocator* scratch_allocator,
-                  const dnn::AlgorithmConfig& algorithm_config,
-                  dnn::ProfileResult* output_profile_result) override;
-
-  bool DoConvolve(Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
-                  const DeviceMemory<Eigen::half>& input_data,
-                  const dnn::FilterDescriptor& filter_descriptor,
-                  const DeviceMemory<Eigen::half>& filter_data,
-                  const dnn::ConvolutionDescriptor& convolution_descriptor,
-                  const dnn::BatchDescriptor& output_descriptor,
-                  DeviceMemory<Eigen::half>* output_data,
-                  ScratchAllocator* scratch_allocator,
-                  const dnn::AlgorithmConfig& algorithm_config,
-                  dnn::ProfileResult* output_profile_result) override;
+  port::Status DoConvolve(
+      dnn::ConvolutionKind kind, dnn::DataType element_type, Stream* stream,
+      const dnn::BatchDescriptor& input_descriptor, DeviceMemoryBase input_data,
+      const dnn::FilterDescriptor& filter_descriptor,
+      DeviceMemoryBase filter_data,
+      const dnn::BatchDescriptor& output_descriptor,
+      DeviceMemoryBase output_data,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      dnn::AlgorithmDesc algorithm_desc, DeviceMemory<uint8> scratch_memory,
+      dnn::ProfileResult* output_profile_result) override;
 
   bool DoFusedConvolve(
       Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
@@ -384,78 +368,6 @@ class CudnnSupport : public dnn::DnnSupport {
     return false;
   }
 
-  bool DoConvolveBackwardData(
-      Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
-      const DeviceMemory<double>& filter_data,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<double> backward_output_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const dnn::BatchDescriptor& input_descriptor,
-      DeviceMemory<double>* backward_input_data,
-      ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
-      dnn::ProfileResult* output_profile_result) override;
-
-  bool DoConvolveBackwardData(
-      Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
-      const DeviceMemory<float>& filter_data,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<float> backward_output_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const dnn::BatchDescriptor& input_descriptor,
-      DeviceMemory<float>* backward_input_data,
-      ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
-      dnn::ProfileResult* output_profile_result) override;
-
-  bool DoConvolveBackwardData(
-      Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
-      const DeviceMemory<Eigen::half>& filter_data,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<Eigen::half> backward_output_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const dnn::BatchDescriptor& input_descriptor,
-      DeviceMemory<Eigen::half>* backward_input_data,
-      ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
-      dnn::ProfileResult* output_profile_result) override;
-
-  bool DoConvolveBackwardFilter(
-      Stream* stream, const dnn::BatchDescriptor& input_descriptor,
-      const DeviceMemory<double>& input_data,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<double> backward_output_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const dnn::FilterDescriptor& filter_descriptor,
-      DeviceMemory<double>* backward_filter_data,
-      ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
-      dnn::ProfileResult* output_profile_result) override;
-
-  bool DoConvolveBackwardFilter(
-      Stream* stream, const dnn::BatchDescriptor& input_descriptor,
-      const DeviceMemory<float>& input_data,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<float> backward_output_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const dnn::FilterDescriptor& filter_descriptor,
-      DeviceMemory<float>* backward_filter_data,
-      ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
-      dnn::ProfileResult* output_profile_result) override;
-
-  bool DoConvolveBackwardFilter(
-      Stream* stream, const dnn::BatchDescriptor& input_descriptor,
-      const DeviceMemory<Eigen::half>& input_data,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<Eigen::half> backward_output_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const dnn::FilterDescriptor& filter_descriptor,
-      DeviceMemory<Eigen::half>* backward_filter_data,
-      ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
-      dnn::ProfileResult* output_profile_result) override;
-
   bool DoConvolveBackwardBias(
       Stream* stream, const dnn::BatchDescriptor& input_descriptor,
       const DeviceMemory<double>& input_data,
@@ -534,6 +446,14 @@ class CudnnSupport : public dnn::DnnSupport {
                      DeviceMemory<Eigen::half>* output_data,
                      ScratchAllocator* workspace_allocator) override;
 
+  bool DoPoolForward(Stream* stream,
+                     const dnn::PoolingDescriptor& pooling_dimensions,
+                     const dnn::BatchDescriptor& input_dimensions,
+                     const DeviceMemory<int8>& input_data,
+                     const dnn::BatchDescriptor& output_dimensions,
+                     DeviceMemory<int8>* output_data,
+                     ScratchAllocator* workspace_allocator) override;
+
   bool DoPoolBackward(Stream* stream,
                       const dnn::PoolingDescriptor& pooling_dimensions,
                       const dnn::BatchDescriptor& input_dimensions,
@@ -564,11 +484,6 @@ class CudnnSupport : public dnn::DnnSupport {
                       DeviceMemory<Eigen::half>* output_diff_data,
                       ScratchAllocator* workspace_allocator) override;
 
-  bool DoNormalize(Stream* stream,
-                   const dnn::NormalizeDescriptor& normalize_descriptor,
-                   const DeviceMemory<float>& input_data,
-                   DeviceMemory<float>* output_data) override;
-
   bool DoNormalizeWithDimensions(
       Stream* stream, const dnn::NormalizeDescriptor& normalize_descriptor,
       const dnn::BatchDescriptor& dimensions,
@@ -632,7 +547,7 @@ class CudnnSupport : public dnn::DnnSupport {
                          DeviceMemoryBase* output_data) override;
 
  private:
-  CUDAExecutor* parent_;  // Parent executor object. Not owned.
+  GpuExecutor* parent_;  // Parent executor object. Not owned.
 
   // Provides access to the cuDNN handle.
   std::unique_ptr<class CudnnAccess> cudnn_;
@@ -662,19 +577,6 @@ class CudnnSupport : public dnn::DnnSupport {
       DeviceMemory<T>* x_backprop, DeviceMemory<U>* scale_backprop,
       DeviceMemory<U>* offset_backprop);
 
-  template <class T>
-  port::Status DoConvolveImpl(
-      Stream* stream, const dnn::BatchDescriptor& input_descriptor,
-      const DeviceMemory<T>& input_data,
-      const dnn::FilterDescriptor& filter_descriptor,
-      const DeviceMemory<T>& filter_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<T>* output_data, dnn::DataType accumulator_type,
-      ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
-      dnn::ProfileResult* output_profile_result);
-
   template <typename ElementType, typename BiasType, typename ScaleType>
   port::Status DoFusedConvolveImpl(
       Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
@@ -692,32 +594,6 @@ class CudnnSupport : public dnn::DnnSupport {
       const dnn::AlgorithmConfig& algorithm_config,
       dnn::ProfileResult* output_profile_result);
 
-  template <class T>
-  port::Status DoConvolveBackwardDataImpl(
-      Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
-      const DeviceMemory<T>& filter_data,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<T> backward_output_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const dnn::BatchDescriptor& input_descriptor,
-      DeviceMemory<T>* backward_input_data, dnn::DataType accumulator_type,
-      ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
-      dnn::ProfileResult* output_profile_result);
-
-  template <class T>
-  port::Status DoConvolveBackwardFilterImpl(
-      Stream* stream, const dnn::BatchDescriptor& input_descriptor,
-      const DeviceMemory<T>& input_data,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<T> backward_output_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const dnn::FilterDescriptor& filter_descriptor,
-      DeviceMemory<T>* backward_filter_data, dnn::DataType accumulator_type,
-      ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
-      dnn::ProfileResult* output_profile_result);
-
   template <class T>
   port::Status DoConvolveBackwardBiasImpl(
       Stream* stream, const dnn::BatchDescriptor& input_descriptor,
@@ -770,10 +646,23 @@ class CudnnSupport : public dnn::DnnSupport {
       ScratchAllocator* workspace_allocator,
       dnn::ProfileResult* output_profile_result);
 
+ private:
+  port::Status DoPrepareForConvolution(
+      dnn::ConvolutionKind kind, dnn::DataType element_type, Stream* stream,
+      const dnn::BatchDescriptor& input_descriptor, DeviceMemoryBase input_data,
+      const dnn::FilterDescriptor& filter_descriptor,
+      DeviceMemoryBase filter_data,
+      const dnn::BatchDescriptor& output_descriptor,
+      DeviceMemoryBase output_data,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      const dnn::AlgorithmConfig& algorithm_config,
+      ScratchAllocator* scratch_allocator, dnn::AlgorithmDesc* algorithm_desc,
+      DeviceMemory<uint8>* scratch_memory) override;
+
   SE_DISALLOW_COPY_AND_ASSIGN(CudnnSupport);
 };
 
-}  // namespace cuda
+}  // namespace gpu
 }  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DNN_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
index b34d1f722eaf60b21f2289a4b87b5653bfd43bb9..5ef821a4f121c58d1c2f64b602345063165ac58e 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -24,7 +24,9 @@ limitations under the License.
 #include "absl/base/casts.h"
 #include "absl/container/inlined_vector.h"
 #include "absl/strings/str_cat.h"
+#include "cuda/include/cuda_runtime_api.h"
 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
+#include "tensorflow/stream_executor/cuda/cuda_driver_wrapper.h"
 #include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/error.h"
 #include "tensorflow/stream_executor/lib/human_readable.h"
@@ -44,21 +46,20 @@ bool FLAGS_gpuexec_cuda_device_0_only = false;
 
 // Debugging: on each push and pop of a cuda context, verify the current context
 // matches the expected one.
-constexpr bool kVerifyCudaContext = false;
+constexpr bool kVerifyGpuContext = false;
 
 namespace stream_executor {
-namespace cuda {
-
+namespace gpu {
 namespace {
 
 // Manages the singleton map of contexts that we've created, mapping
-// from the CUcontext to the CudaContext* that we pass around internally.
-// This also manages assignment of unique ids to CudaContexts, to allow
+// from the CUcontext to the GpuContext* that we pass around internally.
+// This also manages assignment of unique ids to GpuContexts, to allow
 // for fast comparison of a context against the current context.
 //
 // CUDA-runtime-created contexts are avoided, if triple angle
 // brace launches are required, by using the scoped activations in
-// cuda_activation.h.
+// gpu/gpu_activation.h.
 class CreatedContexts {
  public:
   // Returns whether context is a member of the live set.
@@ -68,14 +69,14 @@ class CreatedContexts {
   }
 
   // Adds context to the live set, or returns it if it's already present.
-  static CudaContext* Add(CUcontext context) {
+  static GpuContext* Add(CUcontext context) {
     CHECK(context != nullptr);
     mutex_lock lock(mu_);
     auto insert_result = Live()->insert(std::make_pair(context, nullptr));
     auto it = insert_result.first;
     if (insert_result.second) {
       // context was not present in the map.  Add it.
-      it->second = MakeUnique<CudaContext>(context, next_id_++);
+      it->second = MakeUnique<GpuContext>(context, next_id_++);
     }
     return it->second.get();
   }
@@ -91,9 +92,9 @@ class CreatedContexts {
 
  private:
   // Returns the live map singleton.
-  static std::map<CUcontext, std::unique_ptr<CudaContext>> *Live() {
+  static std::map<CUcontext, std::unique_ptr<GpuContext>>* Live() {
     static auto singleton =
-        new std::map<CUcontext, std::unique_ptr<CudaContext>>;
+        new std::map<CUcontext, std::unique_ptr<GpuContext>>;
     return singleton;
   }
 
@@ -107,12 +108,12 @@ class CreatedContexts {
 
 // Formats CUresult to output prettified values into a log stream.
 string ToString(CUresult result) {
-  const char *error_name;
-  if (cuGetErrorName(result, &error_name)) {
+  const char* error_name;
+  if (tensorflow::wrap::cuGetErrorName(result, &error_name)) {
     return absl::StrCat("UNKNOWN ERROR (", static_cast<int>(result), ")");
   }
-  const char *error_string;
-  if (cuGetErrorString(result, &error_string)) {
+  const char* error_string;
+  if (tensorflow::wrap::cuGetErrorString(result, &error_string)) {
     return error_name;
   }
   return absl::StrCat(error_name, ": ", error_string);
@@ -122,7 +123,7 @@ string ToString(CUresult result) {
 // created by StreamExecutor (to ensure that the CUDA runtime didn't create a
 // context behind our backs).
 CUcontext CurrentContext() {
-  CUcontext current = CUDADriver::CurrentContextOrDie();
+  CUcontext current = cuda::CurrentContextOrDie();
   if (current != nullptr && !CreatedContexts::Has(current)) {
     LOG(FATAL) << "current context was not created by the StreamExecutor "
                   "cuda_driver API: "
@@ -139,14 +140,14 @@ CUcontext CurrentContext() {
 // thread::ThreadPool on some platforms), we run certain routines in this pool
 // and wait for completion.
 static mutex driver_executor_threadpool_mu(LINKER_INITIALIZED);
-static port::ThreadPool *InitializeDriverExecutor() {
+static port::ThreadPool* InitializeDriverExecutor() {
   return new port::ThreadPool(port::Env::Default(), port::ThreadOptions(),
                               "cuda_driver", 1);
 }
 
-port::ThreadPool *GetDriverExecutor() {
+port::ThreadPool* GetDriverExecutor() {
   mutex_lock lock(driver_executor_threadpool_mu);
-  static port::ThreadPool *thread_pool = InitializeDriverExecutor();
+  static port::ThreadPool* thread_pool = InitializeDriverExecutor();
   return thread_pool;
 }
 
@@ -165,18 +166,36 @@ string MemorySpaceString(MemorySpace memory_space) {
 
 namespace {
 
+template <typename PtrT>
+bool PointerIsValid(const PtrT ptr) {
+  // Checks that the pointer is to a location on the device it purports to be.
+  // PtrT is one of CUdeviceptr or void*.  If it's a CUdeviceptr, then
+  // cudaPointerGetAttributes should not fail, and return a memoryType of
+  // cudaMemoryTypeDevice.
+
+  bool is_host_ptr = !std::is_same<PtrT, CUdeviceptr>::value;
+  cudaPointerAttributes attributes;
+  cudaError_t err =
+      cudaPointerGetAttributes(&attributes, reinterpret_cast<const void*>(ptr));
+  // If we failed, reset cuda error status to avoid poisoning cuda streams.
+  if (err != cudaSuccess) cudaGetLastError();
+  bool points_to_host_memory = (err == cudaErrorInvalidValue ||
+                                attributes.memoryType != cudaMemoryTypeDevice);
+  return (is_host_ptr == points_to_host_memory);
+}
+
 // Call cuCtxtSynchronize and crash if it doesn't succeed.
 void SynchronizeOrDie() {
-  auto res = cuCtxSynchronize();
+  auto res = tensorflow::wrap::cuCtxSynchronize();
   if (res != CUDA_SUCCESS) {
-    LOG(FATAL) << "Synchronize found "
-               << ToString(res) << " :: " << port::CurrentStackTrace();
+    LOG(FATAL) << "Synchronize found " << ToString(res)
+               << " :: " << port::CurrentStackTrace();
   }
 }
 
 struct ThreadLocalData {
   int64 id;
-  CudaContext* context;  // Only valid if id == a known good context.
+  GpuContext* context;  // Only valid if id == a known good context.
   int depth;
 };
 
@@ -184,13 +203,13 @@ SE_STATIC_THREAD_LOCAL_POD(ThreadLocalData, tls_data);
 
 }  // namespace
 
-ScopedActivateContext::ScopedActivateContext(CudaContext* cuda_context) {
+ScopedActivateContext::ScopedActivateContext(GpuContext* cuda_context) {
   if (FLAGS_gpuexec_cuda_sync_around_driver_calls) SynchronizeOrDie();
 
   auto* tls = &tls_data.get();
   tls->depth++;
   if (tls->id == cuda_context->id()) {
-    if (kVerifyCudaContext) {
+    if (kVerifyGpuContext) {
       CHECK_EQ(CurrentContext(), cuda_context->context());
     }
     DCHECK_EQ(CurrentContext(), cuda_context->context());
@@ -203,7 +222,8 @@ ScopedActivateContext::ScopedActivateContext(CudaContext* cuda_context) {
   to_restore_ = (tls->depth == 1 ? nullptr : tls->context);
 
   // Set the context and update thread local.
-  CHECK_EQ(CUDA_SUCCESS, cuCtxSetCurrent(cuda_context->context()));
+  CHECK_EQ(CUDA_SUCCESS,
+           tensorflow::wrap::cuCtxSetCurrent(cuda_context->context()));
   tls->id = cuda_context->id();
   tls->context = cuda_context;
 }
@@ -213,8 +233,8 @@ ScopedActivateContext::~ScopedActivateContext() {
 
   auto* tls = &tls_data.get();
 
-  if (kVerifyCudaContext) {
-    // Note that if kVerifyCudaContext is used, and contexts are deleted, it's
+  if (kVerifyGpuContext) {
+    // Note that if kVerifyGpuContext is used, and contexts are deleted, it's
     // possible this could fail in the CurrentContext() call.
     CHECK_EQ(CurrentContext(),
              tls->context == nullptr ? nullptr : tls->context->context());
@@ -228,7 +248,8 @@ ScopedActivateContext::~ScopedActivateContext() {
   }
 
   // Set context and update thread local.
-  CHECK_EQ(CUDA_SUCCESS, cuCtxSetCurrent(to_restore_->context()));
+  CHECK_EQ(CUDA_SUCCESS,
+           tensorflow::wrap::cuCtxSetCurrent(to_restore_->context()));
   tls->id = to_restore_->id();
   tls->context = to_restore_;
 }
@@ -239,7 +260,7 @@ namespace {
 // logging purposes. Returns "?" if the device could not be successfully
 // queried.
 string CUDAPointerToDeviceString(CUdeviceptr pointer) {
-  auto value = CUDADriver::GetPointerDevice(pointer);
+  auto value = GpuDriver::GetPointerDevice(pointer);
   if (value.ok()) {
     return absl::StrCat(value.ValueOrDie());
   }
@@ -251,7 +272,7 @@ string CUDAPointerToDeviceString(CUdeviceptr pointer) {
 // logging purposes. Returns "?" if the memory space could not be successfully
 // queried.
 string CUDAPointerToMemorySpaceString(CUdeviceptr pointer) {
-  auto value = CUDADriver::GetPointerMemorySpace(pointer);
+  auto value = GpuDriver::GetPointerMemorySpace(pointer);
   if (value.ok()) {
     return MemorySpaceString(value.ValueOrDie());
   }
@@ -264,25 +285,24 @@ string CUDAPointerToMemorySpaceString(CUdeviceptr pointer) {
 // primarily for logging purposes. Returns "error" if an error is encountered
 // in the process of querying.
 string CUDAPointersToCanAccessString(CUdeviceptr from, CUdeviceptr to) {
-  auto from_context = CUDADriver::GetPointerContext(from);
+  auto from_context = GpuDriver::GetPointerContext(from);
   if (!from_context.ok()) {
     LOG(ERROR) << "could not retrieve source pointer's context: "
                << from_context.status();
     return "error";
   }
-  auto to_context = CUDADriver::GetPointerContext(to);
+  auto to_context = GpuDriver::GetPointerContext(to);
   if (!to_context.ok()) {
     LOG(ERROR) << "could not retrieve destination pointer's context: "
                << to_context.status();
     return "error";
   }
-  return CUDADriver::CanEnablePeerAccess(from_context.ValueOrDie(),
-                                         to_context.ValueOrDie())
+  return GpuDriver::CanEnablePeerAccess(from_context.ValueOrDie(),
+                                        to_context.ValueOrDie())
              ? "true"
              : "false";
 }
 
-
 // Actually performs the work of CUDA initialization. Wrapped up in one-time
 // execution guard.
 static port::Status InternalInit() {
@@ -290,7 +310,7 @@ static port::Status InternalInit() {
   if (FLAGS_gpuexec_cuda_driver_inject_init_error) {
     LOG(ERROR) << "injecting CUDA init error; initialization will fail";
   } else {
-    res = cuInit(0 /* = flags */);
+    res = tensorflow::wrap::cuInit(0 /* = flags */);
   }
 
   if (res == CUDA_SUCCESS) {
@@ -305,12 +325,12 @@ static port::Status InternalInit() {
 
 }  // namespace
 
-/* static */ port::Status CUDADriver::Init() {
+/* static */ port::Status GpuDriver::Init() {
   // Cached return value from calling InternalInit(), as cuInit need only be
-  // called once, but CUDADriver::Init may be called many times.
+  // called once, but GpuDriver::Init may be called many times.
   static port::Status init_retval;
   static bool set = false;
-  static mutex *init_mu = new mutex;
+  static mutex* init_mu = new mutex;
 
   mutex_lock lock(*init_mu);
   if (!set) {
@@ -321,9 +341,9 @@ static port::Status InternalInit() {
   return init_retval;
 }
 
-/* static */ port::Status CUDADriver::GetDevice(int device_ordinal,
-                                                CUdevice *device) {
-  CUresult res = cuDeviceGet(device, device_ordinal);
+/* static */ port::Status GpuDriver::GetDevice(int device_ordinal,
+                                               CUdevice* device) {
+  CUresult res = tensorflow::wrap::cuDeviceGet(device, device_ordinal);
   if (res == CUDA_SUCCESS) {
     return port::Status::OK();
   }
@@ -333,11 +353,12 @@ static port::Status InternalInit() {
       absl::StrCat("failed call to cuDeviceGet: ", ToString(res)));
 }
 
-/* static */ bool CUDADriver::GetDeviceName(CUdevice device,
-                                            string *device_name) {
+/* static */ bool GpuDriver::GetDeviceName(CUdevice device,
+                                           string* device_name) {
   static const size_t kCharLimit = 64;
   absl::InlinedVector<char, 4> chars(kCharLimit);
-  CUresult res = cuDeviceGetName(chars.begin(), kCharLimit - 1, device);
+  CUresult res =
+      tensorflow::wrap::cuDeviceGetName(chars.begin(), kCharLimit - 1, device);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to get device name for " << device << ": "
                << ToString(res);
@@ -348,8 +369,8 @@ static port::Status InternalInit() {
   return true;
 }
 
-bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
-                                 int *flags) {
+bool DeviceOptionsToContextFlags(const DeviceOptions& device_options,
+                                 int* flags) {
   static_assert(DeviceOptions::kMask == 0xf,
                 "needs update for new device options");
 
@@ -372,9 +393,9 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
   return true;
 }
 
-/* static */ port::Status CUDADriver::CreateContext(
-    CUdevice device, const DeviceOptions &device_options,
-    CudaContext **context) {
+/* static */ port::Status GpuDriver::CreateContext(
+    int device_ordinal, CUdevice device, const DeviceOptions& device_options,
+    GpuContext** context) {
   *context = nullptr;
 
   int flags = 0;
@@ -388,9 +409,9 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
 
   unsigned int former_primary_context_flags;
   int former_primary_context_is_active;
-  CHECK_EQ(CUDA_SUCCESS,
-           cuDevicePrimaryCtxGetState(device, &former_primary_context_flags,
-                                      &former_primary_context_is_active));
+  CHECK_EQ(CUDA_SUCCESS, tensorflow::wrap::cuDevicePrimaryCtxGetState(
+                             device, &former_primary_context_flags,
+                             &former_primary_context_is_active));
   if (former_primary_context_flags != flags) {
     if (former_primary_context_is_active) {
       LOG(ERROR)
@@ -398,15 +419,16 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
           << former_primary_context_flags << ") than the desired flag set ("
           << flags << ").";
     } else {
-      CHECK_EQ(CUDA_SUCCESS, cuDevicePrimaryCtxSetFlags(device, flags));
+      CHECK_EQ(CUDA_SUCCESS,
+               tensorflow::wrap::cuDevicePrimaryCtxSetFlags(device, flags));
     }
   }
 
-  former_context = CUDADriver::CurrentContextOrDie();
-  res = cuDevicePrimaryCtxRetain(&new_context, device);
+  former_context = cuda::CurrentContextOrDie();
+  res = tensorflow::wrap::cuDevicePrimaryCtxRetain(&new_context, device);
   if (former_context != nullptr) {
     CUdevice former_device;
-    if (cuCtxGetDevice(&former_device) == CUDA_SUCCESS) {
+    if (tensorflow::wrap::cuCtxGetDevice(&former_device) == CUDA_SUCCESS) {
       if (former_device == device) {
         if (former_context == new_context) {
           VLOG(2) << "The primary context " << former_context << " for device "
@@ -425,13 +447,14 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
                  << former_context;
     }
   }
-  CHECK_EQ(CUDA_SUCCESS, cuCtxSetCurrent(former_context));
+  CHECK_EQ(CUDA_SUCCESS, tensorflow::wrap::cuCtxSetCurrent(former_context));
 
   if (res == CUDA_SUCCESS) {
     *context = CreatedContexts::Add(new_context);
     CHECK(*context != nullptr)
         << "success in this call must entail non-null result";
-    VLOG(2) << "created or reused context " << context << " for this thread";
+    VLOG(2) << "created or reused context " << new_context
+            << " for this thread";
     return port::Status::OK();
   }
 
@@ -448,17 +471,17 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
   return port::Status(port::error::INTERNAL, message);
 }
 
-/* static */ void CUDADriver::DestroyContext(CudaContext* context) {
+/* static */ void GpuDriver::DestroyContext(GpuContext* context) {
   if (context == nullptr) {
     return;
   }
   CUcontext former_context = CurrentContext();
-  CUresult res = cuCtxSetCurrent(context->context());
+  CUresult res = tensorflow::wrap::cuCtxSetCurrent(context->context());
   CUdevice device;
-  cuCtxGetDevice(&device);
-  cuCtxSetCurrent(former_context);
+  tensorflow::wrap::cuCtxGetDevice(&device);
+  tensorflow::wrap::cuCtxSetCurrent(former_context);
 
-  res = cuDevicePrimaryCtxRelease(device);
+  res = tensorflow::wrap::cuDevicePrimaryCtxRelease(device);
 
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to release CUDA context; leaking: " << ToString(res);
@@ -467,10 +490,11 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
   CreatedContexts::Remove(context->context());
 }
 
-/* static */ bool CUDADriver::FuncGetAttribute(CUfunction_attribute attribute,
-                                               CUfunction func,
-                                               int *attribute_value) {
-  CUresult res = cuFuncGetAttribute(attribute_value, attribute, func);
+/* static */ bool GpuDriver::FuncGetAttribute(CUfunction_attribute attribute,
+                                              CUfunction func,
+                                              int* attribute_value) {
+  CUresult res =
+      tensorflow::wrap::cuFuncGetAttribute(attribute_value, attribute, func);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to query kernel attribute. kernel: " << func
                << ", attribute: " << attribute;
@@ -479,9 +503,9 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
   return true;
 }
 
-/* static */ bool CUDADriver::FuncSetCacheConfig(CUfunction function,
-                                                 CUfunc_cache cache_config) {
-  CUresult res = cuFuncSetCacheConfig(function, cache_config);
+/* static */ bool GpuDriver::FuncSetCacheConfig(CUfunction function,
+                                                CUfunc_cache cache_config) {
+  CUresult res = tensorflow::wrap::cuFuncSetCacheConfig(function, cache_config);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to set CUDA kernel cache config. kernel: " << function
                << ", config: " << cache_config << ", result: " << ToString(res);
@@ -492,13 +516,14 @@ bool DeviceOptionsToContextFlags(const DeviceOptions &device_options,
 }
 
 /* static */ port::StatusOr<CUsharedconfig>
-CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
+GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
   CUsharedconfig shared_mem_config;
   ScopedActivateContext activation(context);
-  CUresult result = cuCtxGetSharedMemConfig(&shared_mem_config);
+  CUresult result =
+      tensorflow::wrap::cuCtxGetSharedMemConfig(&shared_mem_config);
   if (result != CUDA_SUCCESS) {
     CUdevice device;
-    cuCtxGetDevice(&device);
+    tensorflow::wrap::cuCtxGetDevice(&device);
     LOG(ERROR) << "failed to get CUDA device shared memory config. "
                << "Context device ID: " << device
                << ", result: " << ToString(result);
@@ -509,13 +534,14 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return shared_mem_config;
 }
 
-/* static */ port::Status CUDADriver::ContextSetSharedMemConfig(
-    CudaContext* context, CUsharedconfig shared_mem_config) {
+/* static */ port::Status GpuDriver::ContextSetSharedMemConfig(
+    GpuContext* context, CUsharedconfig shared_mem_config) {
   ScopedActivateContext activation(context);
-  CUresult result = cuCtxSetSharedMemConfig(shared_mem_config);
+  CUresult result =
+      tensorflow::wrap::cuCtxSetSharedMemConfig(shared_mem_config);
   if (result != CUDA_SUCCESS) {
     CUdevice device;
-    cuCtxGetDevice(&device);
+    tensorflow::wrap::cuCtxGetDevice(&device);
     LOG(ERROR) << "failed to set CUDA device shared memory config. "
                << "Context device ID: " << device
                << ", config: " << shared_mem_config
@@ -527,20 +553,20 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return port::Status::OK();
 }
 
-/* static */ bool CUDADriver::LaunchKernel(
-    CudaContext* context, CUfunction function, unsigned int grid_dim_x,
+/* static */ bool GpuDriver::LaunchKernel(
+    GpuContext* context, CUfunction function, unsigned int grid_dim_x,
     unsigned int grid_dim_y, unsigned int grid_dim_z, unsigned int block_dim_x,
     unsigned int block_dim_y, unsigned int block_dim_z,
-    unsigned int shared_mem_bytes, CUstream stream, void **kernel_params,
-    void **extra) {
+    unsigned int shared_mem_bytes, CUstream stream, void** kernel_params,
+    void** extra) {
   ScopedActivateContext activation(context);
   VLOG(2) << "launching kernel: " << function << "; gdx: " << grid_dim_x
           << " gdy: " << grid_dim_y << " gdz: " << grid_dim_z
           << " bdx: " << block_dim_x << " bdy: " << block_dim_y
           << " bdz: " << block_dim_z;
-  CUresult res = cuLaunchKernel(function, grid_dim_x, grid_dim_y, grid_dim_z,
-                                block_dim_x, block_dim_y, block_dim_z,
-                                shared_mem_bytes, stream, kernel_params, extra);
+  CUresult res = tensorflow::wrap::cuLaunchKernel(
+      function, grid_dim_x, grid_dim_y, grid_dim_z, block_dim_x, block_dim_y,
+      block_dim_z, shared_mem_bytes, stream, kernel_params, extra);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to launch CUDA kernel: " << function
                << "; result: " << ToString(res);
@@ -550,11 +576,12 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ port::Status CUDADriver::LoadCubin(CudaContext* context,
-                                                const char *cubin_bytes,
-                                                CUmodule *module) {
+/* static */ port::Status GpuDriver::LoadCubin(GpuContext* context,
+                                               const char* cubin_bytes,
+                                               CUmodule* module) {
   ScopedActivateContext activation(context);
-  CUresult result = cuModuleLoadFatBinary(module, cubin_bytes);
+  CUresult result =
+      tensorflow::wrap::cuModuleLoadFatBinary(module, cubin_bytes);
   if (result != CUDA_SUCCESS) {
     return port::Status(port::error::INTERNAL,
                         "failed to load in-memory CUBIN: " + ToString(result));
@@ -563,15 +590,15 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return port::Status::OK();
 }
 
-/* static */ bool CUDADriver::LoadPtx(CudaContext* context,
-                                      const char *ptx_contents,
-                                      CUmodule *module) {
+/* static */ bool GpuDriver::LoadPtx(GpuContext* context,
+                                     const char* ptx_contents,
+                                     CUmodule* module) {
   port::Notification notification;
   bool ret = true;
   GetDriverExecutor()->Schedule([context, ptx_contents, module, &ret,
                                  &notification]() {
     ScopedActivateContext activation(context);
-    void *ptx_data = const_cast<char *>(ptx_contents);
+    void* ptx_data = const_cast<char*>(ptx_contents);
     static const unsigned int kLogBufferBytesLimit = 1024;
     unsigned int error_log_buffer_bytes = kLogBufferBytesLimit;
     unsigned int info_log_buffer_bytes = kLogBufferBytesLimit;
@@ -584,12 +611,12 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
                               CU_JIT_INFO_LOG_BUFFER, CU_JIT_LOG_VERBOSE};
     // Note that the driver API wants the contents of this values to be stored
     // in an array of void*s, so we coerce them accordingly.
-    void *option_values[] = {
-        absl::bit_cast<void *>(uintptr_t(error_log_buffer_bytes)),
-        absl::bit_cast<void *>(error_log_buffer.data()),
-        absl::bit_cast<void *>(uintptr_t(info_log_buffer_bytes)),
-        absl::bit_cast<void *>(info_log_buffer.data()),
-        absl::bit_cast<void *>(uintptr_t(log_verbose))};
+    void* option_values[] = {
+        absl::bit_cast<void*>(uintptr_t(error_log_buffer_bytes)),
+        absl::bit_cast<void*>(error_log_buffer.data()),
+        absl::bit_cast<void*>(uintptr_t(info_log_buffer_bytes)),
+        absl::bit_cast<void*>(info_log_buffer.data()),
+        absl::bit_cast<void*>(uintptr_t(log_verbose))};
     CHECK(TF_ARRAYSIZE(options) == TF_ARRAYSIZE(option_values));
 
     CUresult res;
@@ -597,8 +624,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
       // TODO(leary) Need to see if NVIDIA can expunge the leakiness in their
       // module loading: see http://b/13248943
 
-      res = cuModuleLoadDataEx(module, ptx_data, TF_ARRAYSIZE(options),
-                               options, option_values);
+      res = tensorflow::wrap::cuModuleLoadDataEx(
+          module, ptx_data, TF_ARRAYSIZE(options), options, option_values);
     }
 
     // The PTX JIT mutates the values in the option values array to reflect the
@@ -613,8 +640,8 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
       LOG(ERROR) << "failed to load PTX text as a module: " << ToString(res);
       // As a precaution for null termination of the API-provided value, ensure
       // that at least the last byte is null.
-      error_log_buffer[error_log_buffer_bytes ?
-                       error_log_buffer_bytes - 1 : 0] = '\0';
+      error_log_buffer[error_log_buffer_bytes ? error_log_buffer_bytes - 1
+                                              : 0] = '\0';
       LOG(ERROR) << "error log buffer (" << error_log_buffer_bytes
                  << " bytes): " << error_log_buffer.data();
       ret = false;
@@ -633,11 +660,18 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return ret;
 }
 
-/* static */ bool CUDADriver::SynchronousMemsetUint8(CudaContext* context,
-                                                     CUdeviceptr location,
-                                                     uint8 value, size_t size) {
+/* static */ bool GpuDriver::LoadHsaco(GpuContext* context,
+                                       const char* hsaco_contents,
+                                       CUmodule* module) {
+  LOG(ERROR) << "Feature not supported on CUDA platform (LoadHsaco)";
+  return false;
+}
+
+/* static */ bool GpuDriver::SynchronousMemsetUint8(GpuContext* context,
+                                                    CUdeviceptr location,
+                                                    uint8 value, size_t size) {
   ScopedActivateContext activation(context);
-  CUresult res = cuMemsetD8(location, value, size);
+  CUresult res = tensorflow::wrap::cuMemsetD8(location, value, size);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to memset memory: " << ToString(res);
     return false;
@@ -645,12 +679,12 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ bool CUDADriver::SynchronousMemsetUint32(CudaContext* context,
-                                                      CUdeviceptr location,
-                                                      uint32 value,
-                                                      size_t uint32_count) {
+/* static */ bool GpuDriver::SynchronousMemsetUint32(GpuContext* context,
+                                                     CUdeviceptr location,
+                                                     uint32 value,
+                                                     size_t uint32_count) {
   ScopedActivateContext activation(context);
-  CUresult res = cuMemsetD32(location, value, uint32_count);
+  CUresult res = tensorflow::wrap::cuMemsetD32(location, value, uint32_count);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to memset memory: " << ToString(res);
     return false;
@@ -658,13 +692,14 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ bool CUDADriver::AsynchronousMemsetUint8(CudaContext* context,
-                                                      CUdeviceptr location,
-                                                      uint8 value,
-                                                      size_t uint32_count,
-                                                      CUstream stream) {
+/* static */ bool GpuDriver::AsynchronousMemsetUint8(GpuContext* context,
+                                                     CUdeviceptr location,
+                                                     uint8 value,
+                                                     size_t uint32_count,
+                                                     CUstream stream) {
   ScopedActivateContext activation(context);
-  CUresult res = cuMemsetD8Async(location, value, uint32_count, stream);
+  CUresult res =
+      tensorflow::wrap::cuMemsetD8Async(location, value, uint32_count, stream);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res);
     return false;
@@ -673,13 +708,14 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ bool CUDADriver::AsynchronousMemsetUint32(CudaContext* context,
-                                                       CUdeviceptr location,
-                                                       uint32 value,
-                                                       size_t uint32_count,
-                                                       CUstream stream) {
+/* static */ bool GpuDriver::AsynchronousMemsetUint32(GpuContext* context,
+                                                      CUdeviceptr location,
+                                                      uint32 value,
+                                                      size_t uint32_count,
+                                                      CUstream stream) {
   ScopedActivateContext activation(context);
-  CUresult res = cuMemsetD32Async(location, value, uint32_count, stream);
+  CUresult res =
+      tensorflow::wrap::cuMemsetD32Async(location, value, uint32_count, stream);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res);
     return false;
@@ -688,12 +724,13 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ bool CUDADriver::AddStreamCallback(CudaContext* context,
-                                                CUstream stream,
-                                                StreamCallback callback,
-                                                void *data) {
+/* static */ bool GpuDriver::AddStreamCallback(GpuContext* context,
+                                               CUstream stream,
+                                               StreamCallback callback,
+                                               void* data) {
   // Note: flags param is required to be zero according to CUDA 6.0.
-  CUresult res = cuStreamAddCallback(stream, callback, data, 0 /* = flags */);
+  CUresult res = tensorflow::wrap::cuStreamAddCallback(stream, callback, data,
+                                                       0 /* = flags */);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "unable to add host callback: " << ToString(res);
     return false;
@@ -701,13 +738,14 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ bool CUDADriver::GetModuleFunction(CudaContext *context,
-                                                CUmodule module,
-                                                const char *kernel_name,
-                                                CUfunction *function) {
+/* static */ bool GpuDriver::GetModuleFunction(GpuContext* context,
+                                               CUmodule module,
+                                               const char* kernel_name,
+                                               CUfunction* function) {
   ScopedActivateContext activated{context};
   CHECK(module != nullptr && kernel_name != nullptr);
-  CUresult res = cuModuleGetFunction(function, module, kernel_name);
+  CUresult res =
+      tensorflow::wrap::cuModuleGetFunction(function, module, kernel_name);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to get PTX kernel \"" << kernel_name
                << "\" from module: " << ToString(res);
@@ -717,15 +755,15 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ bool CUDADriver::GetModuleSymbol(CudaContext* context,
-                                              CUmodule module,
-                                              const char *symbol_name,
-                                              CUdeviceptr *dptr,
-                                              size_t *bytes) {
+/* static */ bool GpuDriver::GetModuleSymbol(GpuContext* context,
+                                             CUmodule module,
+                                             const char* symbol_name,
+                                             CUdeviceptr* dptr, size_t* bytes) {
   ScopedActivateContext activated{context};
   CHECK(module != nullptr && symbol_name != nullptr &&
         (dptr != nullptr || bytes != nullptr));
-  CUresult res = cuModuleGetGlobal(dptr, bytes, module, symbol_name);
+  CUresult res =
+      tensorflow::wrap::cuModuleGetGlobal(dptr, bytes, module, symbol_name);
   if (res != CUDA_SUCCESS) {
     // symbol may not be found in the current module, but it may reside in
     // another module.
@@ -737,21 +775,21 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ void CUDADriver::UnloadModule(CudaContext *context,
-                                           CUmodule module) {
+/* static */ void GpuDriver::UnloadModule(GpuContext* context,
+                                          CUmodule module) {
   ScopedActivateContext activated{context};
-  CUresult res = cuModuleUnload(module);
+  CUresult res = tensorflow::wrap::cuModuleUnload(module);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to unload module " << module
                << "; leaking: " << ToString(res);
   }
 }
 
-/* static */ port::StatusOr<CUdevice> CUDADriver::DeviceFromContext(
-    CudaContext* context) {
+/* static */ port::StatusOr<CUdevice> GpuDriver::DeviceFromContext(
+    GpuContext* context) {
   ScopedActivateContext activated{context};
   CUdevice device = -1;
-  CUresult result = cuCtxGetDevice(&device);
+  CUresult result = tensorflow::wrap::cuCtxGetDevice(&device);
   if (result == CUDA_SUCCESS) {
     return device;
   }
@@ -761,109 +799,115 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
       absl::StrCat("failed to get device for context: ", ToString(result)));
 }
 
-/* static */ bool CUDADriver::CreateStream(CudaContext *context,
-                                           CUstream *out) {
+/* static */ bool GpuDriver::CreateStream(GpuContext* context,
+                                          CUstream* stream) {
   // TODO(leary) can we switch this to CU_STREAM_NON_BLOCKING or will that mess
   // up synchronization with respect to memsets and any other things that have
   // to occur on the default stream?
   ScopedActivateContext activated{context};
-  CUresult res = cuStreamCreate(out, 0);
+  CUresult res = tensorflow::wrap::cuStreamCreate(stream, 0);
   if (res != CUDA_SUCCESS) {
-    LOG(ERROR) << "could not allocate CUDA stream for context " << context
-               << ": " << ToString(res);
+    LOG(ERROR) << "could not allocate CUDA stream for context "
+               << context->context() << ": " << ToString(res);
     return false;
   }
 
-  VLOG(2) << "successfully created stream " << *out << " for context "
-          << context << " on thread";
+  VLOG(2) << "successfully created stream " << *stream << " for context "
+          << context->context() << " on thread";
   return true;
 }
 
-/* static */ void CUDADriver::DestroyStream(CudaContext* context,
-                                            CUstream *stream) {
+/* static */ void GpuDriver::DestroyStream(GpuContext* context,
+                                           CUstream* stream) {
   if (*stream == nullptr) {
     return;
   }
 
   ScopedActivateContext activated{context};
-  CUresult res = cuStreamDestroy(*stream);
+  CUresult res = tensorflow::wrap::cuStreamDestroy(*stream);
   if (res != CUDA_SUCCESS) {
-    LOG(ERROR) << "failed to destroy CUDA stream for context " << context
-               << ": " << ToString(res);
+    LOG(ERROR) << "failed to destroy CUDA stream for context "
+               << context->context() << ": " << ToString(res);
   } else {
     VLOG(2) << "successfully destroyed stream " << *stream << " for context "
-            << context;
+            << context->context();
     *stream = nullptr;
   }
 }
 
-/* static */ void *CUDADriver::DeviceAllocate(CudaContext *context,
-                                              uint64 bytes) {
+/* static */ void* GpuDriver::DeviceAllocate(GpuContext* context,
+                                             uint64 bytes) {
+  if (bytes == 0) {
+    return nullptr;
+  }
+
   ScopedActivateContext activated{context};
   CUdeviceptr result = 0;
-  CUresult res = cuMemAlloc(&result, bytes);
+  CUresult res = tensorflow::wrap::cuMemAlloc(&result, bytes);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to allocate "
                << port::HumanReadableNumBytes::ToString(bytes) << " (" << bytes
                << " bytes) from device: " << ToString(res);
     return nullptr;
   }
-  void *ptr = reinterpret_cast<void *>(result);
-  VLOG(2) << "allocated " << ptr << " for context " << context << " of "
-          << bytes << " bytes";
+  void* ptr = reinterpret_cast<void*>(result);
+  VLOG(2) << "allocated " << ptr << " for context " << context->context()
+          << " of " << bytes << " bytes";
   return ptr;
 }
 
-/* static */ void CUDADriver::DeviceDeallocate(CudaContext* context,
-                                               void *location) {
+/* static */ void GpuDriver::DeviceDeallocate(GpuContext* context,
+                                              void* location) {
   ScopedActivateContext activation(context);
   CUdeviceptr pointer = absl::bit_cast<CUdeviceptr>(location);
-  CUresult res = cuMemFree(pointer);
+  CUresult res = tensorflow::wrap::cuMemFree(pointer);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to free device memory at " << location
                << "; result: " << ToString(res);
   } else {
-    VLOG(2) << "deallocated " << location << " for context " << context;
+    VLOG(2) << "deallocated " << location << " for context "
+            << context->context();
   }
 }
 
-/* static */ void *CUDADriver::UnifiedMemoryAllocate(CudaContext *context,
-                                                     uint64 bytes) {
+/* static */ void* GpuDriver::UnifiedMemoryAllocate(GpuContext* context,
+                                                    uint64 bytes) {
   ScopedActivateContext activation(context);
   CUdeviceptr result = 0;
   // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
-  CUresult res = cuMemAllocManaged(&result, bytes, CU_MEM_ATTACH_GLOBAL);
+  CUresult res =
+      tensorflow::wrap::cuMemAllocManaged(&result, bytes, CU_MEM_ATTACH_GLOBAL);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to alloc " << bytes
                << " bytes unified memory; result: " << ToString(res);
     return nullptr;
   }
-  void *ptr = reinterpret_cast<void *>(result);
-  VLOG(2) << "allocated " << ptr << " for context " << context << " of "
-          << bytes << " bytes in unified memory";
+  void* ptr = reinterpret_cast<void*>(result);
+  VLOG(2) << "allocated " << ptr << " for context " << context->context()
+          << " of " << bytes << " bytes in unified memory";
   return ptr;
 }
 
-/* static */ void CUDADriver::UnifiedMemoryDeallocate(CudaContext *context,
-                                                      void *location) {
+/* static */ void GpuDriver::UnifiedMemoryDeallocate(GpuContext* context,
+                                                     void* location) {
   ScopedActivateContext activation(context);
   CUdeviceptr pointer = absl::bit_cast<CUdeviceptr>(location);
-  CUresult res = cuMemFree(pointer);
+  CUresult res = tensorflow::wrap::cuMemFree(pointer);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to free unified memory at " << location
                << "; result: " << ToString(res);
   } else {
     VLOG(2) << "deallocated unified memory at " << location << " for context "
-            << context;
+            << context->context();
   }
 }
 
-/* static */ void *CUDADriver::HostAllocate(CudaContext *context,
-                                            uint64 bytes) {
+/* static */ void* GpuDriver::HostAllocate(GpuContext* context, uint64 bytes) {
   ScopedActivateContext activation(context);
-  void *host_mem = nullptr;
+  void* host_mem = nullptr;
   // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
-  CUresult res = cuMemHostAlloc(&host_mem, bytes, CU_MEMHOSTALLOC_PORTABLE);
+  CUresult res = tensorflow::wrap::cuMemHostAlloc(&host_mem, bytes,
+                                                  CU_MEMHOSTALLOC_PORTABLE);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to alloc " << bytes
                << " bytes on host: " << ToString(res);
@@ -871,22 +915,22 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return host_mem;
 }
 
-/* static */ void CUDADriver::HostDeallocate(CudaContext* context,
-                                             void *location) {
+/* static */ void GpuDriver::HostDeallocate(GpuContext* context,
+                                            void* location) {
   ScopedActivateContext activation(context);
-  CUresult res = cuMemFreeHost(location);
+  CUresult res = tensorflow::wrap::cuMemFreeHost(location);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "error deallocating host memory at " << location << ": "
                << ToString(res);
   }
 }
 
-/* static */ bool CUDADriver::HostRegister(CudaContext* context, void *location,
-                                           uint64 bytes) {
+/* static */ bool GpuDriver::HostRegister(GpuContext* context, void* location,
+                                          uint64 bytes) {
   ScopedActivateContext activation(context);
   // "Portable" memory is visible to all CUDA contexts. Safe for our use model.
-  CUresult res =
-      cuMemHostRegister(location, bytes, CU_MEMHOSTREGISTER_PORTABLE);
+  CUresult res = tensorflow::wrap::cuMemHostRegister(
+      location, bytes, CU_MEMHOSTREGISTER_PORTABLE);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "error registering host memory at " << location << ": "
                << ToString(res);
@@ -895,10 +939,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ bool CUDADriver::HostUnregister(CudaContext* context,
-                                             void *location) {
+/* static */ bool GpuDriver::HostUnregister(GpuContext* context,
+                                            void* location) {
   ScopedActivateContext activation(context);
-  CUresult res = cuMemHostUnregister(location);
+  CUresult res = tensorflow::wrap::cuMemHostUnregister(location);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "error unregistering host memory at " << location << ": "
                << ToString(res);
@@ -907,15 +951,15 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ port::Status CUDADriver::DestroyEvent(CudaContext* context,
-                                                   CUevent *event) {
+/* static */ port::Status GpuDriver::DestroyEvent(GpuContext* context,
+                                                  CUevent* event) {
   if (*event == nullptr) {
     return port::Status(port::error::INVALID_ARGUMENT,
                         "input event cannot be null");
   }
 
   ScopedActivateContext activated{context};
-  CUresult res = cuEventDestroy(*event);
+  CUresult res = tensorflow::wrap::cuEventDestroy(*event);
   *event = nullptr;
 
   switch (res) {
@@ -935,11 +979,11 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   }
 }
 
-/* static */ port::Status CUDADriver::RecordEvent(CudaContext* context,
-                                                  CUevent event,
-                                                  CUstream stream) {
+/* static */ port::Status GpuDriver::RecordEvent(GpuContext* context,
+                                                 CUevent event,
+                                                 CUstream stream) {
   ScopedActivateContext activated{context};
-  CUresult res = cuEventRecord(event, stream);
+  CUresult res = tensorflow::wrap::cuEventRecord(event, stream);
   switch (res) {
     case CUDA_SUCCESS:
       return port::Status::OK();
@@ -957,10 +1001,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   }
 }
 
-/* static */ port::StatusOr<CUresult> CUDADriver::QueryEvent(
-    CudaContext *context, CUevent event) {
+/* static */ port::StatusOr<CUresult> GpuDriver::QueryEvent(GpuContext* context,
+                                                            CUevent event) {
   ScopedActivateContext activated{context};
-  CUresult res = cuEventQuery(event);
+  CUresult res = tensorflow::wrap::cuEventQuery(event);
   if (res != CUDA_SUCCESS && res != CUDA_ERROR_NOT_READY) {
     return port::Status(
         port::error::INTERNAL,
@@ -970,18 +1014,18 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return res;
 }
 
-/* static */ bool CUDADriver::GetEventElapsedTime(CudaContext* context,
-                                                  float *elapsed_milliseconds,
-                                                  CUevent start, CUevent stop) {
+/* static */ bool GpuDriver::GetEventElapsedTime(GpuContext* context,
+                                                 float* elapsed_milliseconds,
+                                                 CUevent start, CUevent stop) {
   ScopedActivateContext activated{context};
   // The stop event must have completed in order for cuEventElapsedTime to
   // work.
-  CUresult res = cuEventSynchronize(stop);
+  CUresult res = tensorflow::wrap::cuEventSynchronize(stop);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to synchronize the stop event: " << ToString(res);
     return false;
   }
-  res = cuEventElapsedTime(elapsed_milliseconds, start, stop);
+  res = tensorflow::wrap::cuEventElapsedTime(elapsed_milliseconds, start, stop);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to get elapsed time between events: "
                << ToString(res);
@@ -991,11 +1035,11 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ bool CUDADriver::WaitStreamOnEvent(CudaContext* context,
-                                                CUstream stream,
-                                                CUevent event) {
+/* static */ bool GpuDriver::WaitStreamOnEvent(GpuContext* context,
+                                               CUstream stream, CUevent event) {
   ScopedActivateContext activation(context);
-  CUresult res = cuStreamWaitEvent(stream, event, 0 /* = flags */);
+  CUresult res =
+      tensorflow::wrap::cuStreamWaitEvent(stream, event, 0 /* = flags */);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "could not wait stream on event: " << ToString(res);
     return false;
@@ -1004,9 +1048,9 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ bool CUDADriver::SynchronizeContext(CudaContext* context) {
+/* static */ bool GpuDriver::SynchronizeContext(GpuContext* context) {
   ScopedActivateContext activation(context);
-  CUresult res = cuCtxSynchronize();
+  CUresult res = tensorflow::wrap::cuCtxSynchronize();
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "could not synchronize on CUDA context: " << ToString(res)
                << " :: " << port::CurrentStackTrace();
@@ -1016,11 +1060,11 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ port::Status CUDADriver::SynchronizeStream(CudaContext *context,
-                                                        CUstream stream) {
+/* static */ port::Status GpuDriver::SynchronizeStream(GpuContext* context,
+                                                       CUstream stream) {
   ScopedActivateContext activated{context};
   CHECK(stream != nullptr);
-  CUresult res = cuStreamSynchronize(stream);
+  CUresult res = tensorflow::wrap::cuStreamSynchronize(stream);
   if (res != CUDA_SUCCESS) {
     port::Status status = port::InternalError(
         absl::StrCat("could not synchronize on CUDA stream: ", ToString(res)));
@@ -1032,11 +1076,11 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return port::Status::OK();
 }
 
-/* static */ bool CUDADriver::IsStreamIdle(CudaContext *context,
-                                           CUstream stream) {
+/* static */ bool GpuDriver::IsStreamIdle(GpuContext* context,
+                                          CUstream stream) {
   ScopedActivateContext activated{context};
   CHECK(stream != nullptr);
-  CUresult res = cuStreamQuery(stream);
+  CUresult res = tensorflow::wrap::cuStreamQuery(stream);
   if (res == CUDA_SUCCESS) {
     return true;
   }
@@ -1047,91 +1091,123 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return false;
 }
 
-/* static */ port::Status CUDADriver::SynchronousMemcpyD2H(CudaContext *context,
-                                                           void *host_dst,
-                                                           CUdeviceptr gpu_src,
-                                                           uint64 size) {
+/* static */ port::Status GpuDriver::SynchronousMemcpyD2H(GpuContext* context,
+                                                          void* host_dst,
+                                                          CUdeviceptr gpu_src,
+                                                          uint64 size) {
   ScopedActivateContext activation(context);
-  CUresult res = cuMemcpyDtoH(host_dst, gpu_src, size);
+  if (size > 0) {
+    CHECK(PointerIsValid(gpu_src))
+        << "Source pointer is not actually on GPU: " << gpu_src;
+    CHECK(PointerIsValid(host_dst))
+        << "Destination pointer is not actually on CPU: " << host_dst;
+  }
+  CUresult res = tensorflow::wrap::cuMemcpyDtoH(host_dst, gpu_src, size);
   if (res != CUDA_SUCCESS) {
     return port::InternalError(
         port::Printf("failed to synchronous memcpy from device to host: %s; "
                      "host dst: %p; GPU src: %p; size: %llu=0x%llx",
                      ToString(res).c_str(), host_dst,
-                     absl::bit_cast<void *>(gpu_src), size, size));
+                     absl::bit_cast<void*>(gpu_src), size, size));
   }
   VLOG(2) << "successfully sync memcpy'd d2h of " << size << " bytes to "
           << host_dst;
   return port::Status::OK();
 }
 
-/* static */ port::Status CUDADriver::SynchronousMemcpyH2D(CudaContext *context,
-                                                           CUdeviceptr gpu_dst,
-                                                           const void *host_src,
-                                                           uint64 size) {
+/* static */ port::Status GpuDriver::SynchronousMemcpyH2D(GpuContext* context,
+                                                          CUdeviceptr gpu_dst,
+                                                          const void* host_src,
+                                                          uint64 size) {
   ScopedActivateContext activation(context);
-  CUresult res = cuMemcpyHtoD(gpu_dst, host_src, size);
+  if (size > 0) {
+    CHECK(PointerIsValid(host_src))
+        << "Source pointer is not actually on CPU: " << host_src;
+    CHECK(PointerIsValid(gpu_dst))
+        << "Destination pointer is not actually on GPU: " << gpu_dst;
+  }
+  CUresult res = tensorflow::wrap::cuMemcpyHtoD(gpu_dst, host_src, size);
   if (res != CUDA_SUCCESS) {
     return port::InternalError(port::Printf(
         "failed to synchronous memcpy from host to device: %s; GPU dst: %p;"
         " host src: %p; size: %llu=0x%llx",
-        ToString(res).c_str(), absl::bit_cast<void *>(gpu_dst), host_src, size,
+        ToString(res).c_str(), absl::bit_cast<void*>(gpu_dst), host_src, size,
         size));
   }
   VLOG(2) << "successfully enqueued sync memcpy h2d of " << size << " bytes";
   return port::Status::OK();
 }
 
-/* static */ port::Status CUDADriver::SynchronousMemcpyD2D(CudaContext *context,
-                                                           CUdeviceptr gpu_dst,
-                                                           CUdeviceptr gpu_src,
-                                                           uint64 size) {
+/* static */ port::Status GpuDriver::SynchronousMemcpyD2D(GpuContext* context,
+                                                          CUdeviceptr gpu_dst,
+                                                          CUdeviceptr gpu_src,
+                                                          uint64 size) {
   ScopedActivateContext activation(context);
-  CUresult res = cuMemcpyDtoD(gpu_dst, gpu_src, size);
+  if (size > 0) {
+    CHECK(PointerIsValid(gpu_src))
+        << "Source pointer is not actually on GPU: " << gpu_src;
+    CHECK(PointerIsValid(gpu_dst))
+        << "Destination pointer is not actually on GPU: " << gpu_dst;
+  }
+  CUresult res = tensorflow::wrap::cuMemcpyDtoD(gpu_dst, gpu_src, size);
   if (res != CUDA_SUCCESS) {
     return port::InternalError(port::Printf(
         "failed to synchronous memcpy from host to device: %s; GPU dst: %p; "
         "GPU src: %p; size: %llu=0x%llx",
-        ToString(res).c_str(), absl::bit_cast<void *>(gpu_dst),
-        absl::bit_cast<void *>(gpu_src), size, size));
+        ToString(res).c_str(), absl::bit_cast<void*>(gpu_dst),
+        absl::bit_cast<void*>(gpu_src), size, size));
   }
   VLOG(2) << "successfully sync memcpy'd d2d of " << size << " bytes";
   return port::Status::OK();
 }
 
-/* static */ bool CUDADriver::AsynchronousMemcpyD2H(CudaContext* context,
-                                                    void *host_dst,
-                                                    CUdeviceptr gpu_src,
-                                                    uint64 size,
-                                                    CUstream stream) {
+/* static */ bool GpuDriver::AsynchronousMemcpyD2H(GpuContext* context,
+                                                   void* host_dst,
+                                                   CUdeviceptr gpu_src,
+                                                   uint64 size,
+                                                   CUstream stream) {
   ScopedActivateContext activation(context);
-  CUresult res = cuMemcpyDtoHAsync(host_dst, gpu_src, size, stream);
+  if (size > 0) {
+    CHECK(PointerIsValid(gpu_src))
+        << "Source pointer is not actually on GPU: " << gpu_src;
+    CHECK(PointerIsValid(host_dst))
+        << "Destination pointer is not actually on CPU: " << host_dst;
+  }
+  CUresult res =
+      tensorflow::wrap::cuMemcpyDtoHAsync(host_dst, gpu_src, size, stream);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << port::Printf(
         "failed to enqueue async memcpy from device to host: %s; host dst: %p; "
         "GPU src: %p; size: %llu=0x%llx",
-        ToString(res).c_str(), host_dst, absl::bit_cast<void *>(gpu_src), size,
+        ToString(res).c_str(), host_dst, absl::bit_cast<void*>(gpu_src), size,
         size);
     return false;
   }
   VLOG(2) << "successfully enqueued async memcpy d2h of " << size
-          << " bytes from " << absl::bit_cast<void *>(gpu_src) << " to "
+          << " bytes from " << absl::bit_cast<void*>(gpu_src) << " to "
           << host_dst << " on stream " << stream;
   return true;
 }
 
-/* static */ bool CUDADriver::AsynchronousMemcpyH2D(CudaContext* context,
-                                                    CUdeviceptr gpu_dst,
-                                                    const void *host_src,
-                                                    uint64 size,
-                                                    CUstream stream) {
+/* static */ bool GpuDriver::AsynchronousMemcpyH2D(GpuContext* context,
+                                                   CUdeviceptr gpu_dst,
+                                                   const void* host_src,
+                                                   uint64 size,
+                                                   CUstream stream) {
   ScopedActivateContext activation(context);
-  CUresult res = cuMemcpyHtoDAsync(gpu_dst, host_src, size, stream);
+  if (size > 0) {
+    CHECK(PointerIsValid(host_src))
+        << "Source pointer is not actually on CPU: " << host_src;
+    CHECK(PointerIsValid(gpu_dst))
+        << "Destination pointer is not actually on GPU: " << gpu_dst;
+  }
+  CUresult res =
+      tensorflow::wrap::cuMemcpyHtoDAsync(gpu_dst, host_src, size, stream);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << port::Printf(
         "failed to enqueue async memcpy from host to device: %s; GPU dst: %p; "
         "host src: %p; size: %llu=0x%llx",
-        ToString(res).c_str(), absl::bit_cast<void *>(gpu_dst), host_src, size,
+        ToString(res).c_str(), absl::bit_cast<void*>(gpu_dst), host_src, size,
         size);
     return false;
   }
@@ -1140,23 +1216,30 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ bool CUDADriver::AsynchronousMemcpyD2D(CudaContext* context,
-                                                    CUdeviceptr gpu_dst,
-                                                    CUdeviceptr gpu_src,
-                                                    uint64 size,
-                                                    CUstream stream) {
+/* static */ bool GpuDriver::AsynchronousMemcpyD2D(GpuContext* context,
+                                                   CUdeviceptr gpu_dst,
+                                                   CUdeviceptr gpu_src,
+                                                   uint64 size,
+                                                   CUstream stream) {
   ScopedActivateContext activation(context);
-  CUresult result = cuMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream);
+  if (size > 0) {
+    CHECK(PointerIsValid(gpu_src))
+        << "Source pointer is not actually on GPU: " << gpu_src;
+    CHECK(PointerIsValid(gpu_dst))
+        << "Destination pointer is not actually on GPU: " << gpu_dst;
+  }
+  CUresult result =
+      tensorflow::wrap::cuMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream);
   if (result != CUDA_SUCCESS) {
     LOG(ERROR) << port::Printf(
         "failed to enqueue async memcpy from device to device: %s"
         "; GPU dst: %p on %s %s"
         "; GPU src: %p on %s %s"
         "; can access? %s; size: %llu=0x%llx",
-        ToString(result).c_str(), absl::bit_cast<void *>(gpu_dst),
+        ToString(result).c_str(), absl::bit_cast<void*>(gpu_dst),
         CUDAPointerToMemorySpaceString(gpu_dst).c_str(),
         CUDAPointerToDeviceString(gpu_dst).c_str(),
-        absl::bit_cast<void *>(gpu_src),
+        absl::bit_cast<void*>(gpu_src),
         CUDAPointerToMemorySpaceString(gpu_src).c_str(),
         CUDAPointerToDeviceString(gpu_src).c_str(),
         CUDAPointersToCanAccessString(gpu_src, gpu_dst).c_str(), size, size);
@@ -1167,9 +1250,9 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return true;
 }
 
-/* static */ port::Status CUDADriver::CreateEvent(CudaContext* context,
-                                                  CUevent *result,
-                                                  EventFlags flags) {
+/* static */ port::Status GpuDriver::CreateEvent(GpuContext* context,
+                                                 CUevent* result,
+                                                 EventFlags flags) {
   int cuflags;
   switch (flags) {
     case EventFlags::kDefault:
@@ -1183,7 +1266,7 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   }
 
   ScopedActivateContext activated{context};
-  CUresult res = cuEventCreate(result, cuflags);
+  CUresult res = tensorflow::wrap::cuEventCreate(result, cuflags);
 
   if (res == CUDA_SUCCESS) {
     return port::Status::OK();
@@ -1197,9 +1280,9 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   }
 }
 
-/* static */ int CUDADriver::GetDeviceCount() {
+/* static */ int GpuDriver::GetDeviceCount() {
   int device_count = 0;
-  CUresult res = cuDeviceGetCount(&device_count);
+  CUresult res = tensorflow::wrap::cuDeviceGetCount(&device_count);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "could not retrieve CUDA device count: " << ToString(res);
     return 0;
@@ -1211,11 +1294,11 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return device_count;
 }
 
-/* static */ port::StatusOr<CudaContext*> CUDADriver::GetPointerContext(
+/* static */ port::StatusOr<GpuContext*> GpuDriver::GetPointerContext(
     CUdeviceptr pointer) {
-  CudaContext* context = nullptr;
-  CUresult result =
-      cuPointerGetAttribute(&context, CU_POINTER_ATTRIBUTE_CONTEXT, pointer);
+  GpuContext* context = nullptr;
+  CUresult result = tensorflow::wrap::cuPointerGetAttribute(
+      &context, CU_POINTER_ATTRIBUTE_CONTEXT, pointer);
   if (result == CUDA_SUCCESS) {
     CHECK(context != nullptr) << "success should entail non-null context";
     return context;
@@ -1227,11 +1310,11 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
                    ToString(result)));
 }
 
-/* static */ port::StatusOr<MemorySpace> CUDADriver::GetPointerMemorySpace(
+/* static */ port::StatusOr<MemorySpace> GpuDriver::GetPointerMemorySpace(
     CUdeviceptr pointer) {
   unsigned int value;
-  CUresult result =
-      cuPointerGetAttribute(&value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, pointer);
+  CUresult result = tensorflow::wrap::cuPointerGetAttribute(
+      &value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, pointer);
   if (result == CUDA_SUCCESS) {
     switch (value) {
       case CU_MEMORYTYPE_DEVICE:
@@ -1251,10 +1334,10 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
                    ToString(result)));
 }
 
-/* static */ port::Status CUDADriver::GetPointerAddressRange(CUdeviceptr dptr,
-                                                             CUdeviceptr *base,
-                                                             size_t *size) {
-  CUresult result = cuMemGetAddressRange(base, size, dptr);
+/* static */ port::Status GpuDriver::GetPointerAddressRange(CUdeviceptr dptr,
+                                                            CUdeviceptr* base,
+                                                            size_t* size) {
+  CUresult result = tensorflow::wrap::cuMemGetAddressRange(base, size, dptr);
   if (result == CUDA_SUCCESS) {
     return port::Status::OK();
   } else if (result == CUDA_ERROR_NOT_FOUND) {
@@ -1264,16 +1347,16 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
     return port::Status(
         port::error::NOT_FOUND,
         port::Printf("not a device pointer %p; %s",
-                     reinterpret_cast<void *>(dptr), ToString(result).c_str()));
+                     reinterpret_cast<void*>(dptr), ToString(result).c_str()));
   }
 
   return port::Status(
       port::error::INTERNAL,
       port::Printf("failed to get pointer into for device pointer %p; %s",
-                   reinterpret_cast<void *>(dptr), ToString(result).c_str()));
+                   reinterpret_cast<void*>(dptr), ToString(result).c_str()));
 }
 
-/* static */ port::StatusOr<CUdevice> CUDADriver::GetPointerDevice(
+/* static */ port::StatusOr<CUdevice> GpuDriver::GetPointerDevice(
     CUdeviceptr pointer) {
   auto result = GetPointerContext(pointer);
   if (!result.ok()) {
@@ -1283,20 +1366,40 @@ CUDADriver::ContextGetSharedMemConfig(CudaContext* context) {
   return DeviceFromContext(result.ValueOrDie());
 }
 
-/* static */ port::Status CUDADriver::GetComputeCapability(int *cc_major,
-                                                           int *cc_minor,
-                                                           CUdevice device) {
+/* static */ port::Status GpuDriver::GetComputeCapability(int* cc_major,
+                                                          int* cc_minor,
+                                                          CUdevice device) {
   *cc_major = 0;
   *cc_minor = 0;
-  CUresult result = cuDeviceComputeCapability(cc_major, cc_minor, device);
-  if (result == CUDA_SUCCESS) {
-    return port::Status::OK();
+
+  CUresult res = tensorflow::wrap::cuDeviceGetAttribute(
+      cc_major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
+  if (res != CUDA_SUCCESS) {
+    return port::Status(
+        port::error::INTERNAL,
+        port::Printf(
+            "failed to get compute capability major for device: %s; %d",
+            ToString(res).c_str(), device));
   }
 
-  return port::Status(
+  res = tensorflow::wrap::cuDeviceGetAttribute(
+      cc_minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
+  if (res != CUDA_SUCCESS) {
+    return port::Status(
+        port::error::INTERNAL,
+        port::Printf(
+            "failed to get compute capability minor for device: %s; %d",
+            ToString(res).c_str(), device));
+  }
+
+  return port::Status::OK();
+}
+
+/* static */ port::Status GpuDriver::GetGpuISAVersion(int* version,
+                                                      CUdevice device) {
+  return port::Status{
       port::error::INTERNAL,
-      port::Printf("failed to get compute capability for device: %s; %d",
-                   ToString(result).c_str(), device));
+      "Feature not supported on CUDA platform (GetGpuISAVersion)"};
 }
 
 // Helper function that turns the integer output of cuDeviceGetAttribute to type
@@ -1305,7 +1408,8 @@ template <typename T>
 static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
                                             CUdevice_attribute attribute) {
   int value = -1;
-  CUresult result = cuDeviceGetAttribute(&value, attribute, device);
+  CUresult result =
+      tensorflow::wrap::cuDeviceGetAttribute(&value, attribute, device);
   if (result != CUDA_SUCCESS) {
     return port::Status(
         port::error::NOT_FOUND,
@@ -1316,68 +1420,68 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   return converted;
 }
 
-/* static */ port::StatusOr<int> CUDADriver::GetMultiprocessorCount(
+/* static */ port::StatusOr<int> GpuDriver::GetMultiprocessorCount(
     CUdevice device) {
   return GetSimpleAttribute<int>(device,
                                  CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT);
 }
 
-/* static */ port::StatusOr<int64> CUDADriver::GetMaxSharedMemoryPerCore(
+/* static */ port::StatusOr<int64> GpuDriver::GetMaxSharedMemoryPerCore(
     CUdevice device) {
   return GetSimpleAttribute<int64>(
       device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR);
 }
 
-/* static */ port::StatusOr<int64> CUDADriver::GetMaxSharedMemoryPerBlock(
+/* static */ port::StatusOr<int64> GpuDriver::GetMaxSharedMemoryPerBlock(
     CUdevice device) {
   return GetSimpleAttribute<int64>(
       device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK);
 }
 
-/* static */ port::StatusOr<int64> CUDADriver::GetMaxThreadsPerMultiprocessor(
+/* static */ port::StatusOr<int64> GpuDriver::GetMaxThreadsPerMultiprocessor(
     CUdevice device) {
   return GetSimpleAttribute<int64>(
       device, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR);
 }
 
-/* static */ port::StatusOr<int64> CUDADriver::GetMaxThreadsPerBlock(
+/* static */ port::StatusOr<int64> GpuDriver::GetMaxThreadsPerBlock(
     CUdevice device) {
   return GetSimpleAttribute<int64>(device,
                                    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
 }
 
-/* static */ port::StatusOr<int64> CUDADriver::GetMaxRegistersPerBlock(
+/* static */ port::StatusOr<int64> GpuDriver::GetMaxRegistersPerBlock(
     CUdevice device) {
   return GetSimpleAttribute<int64>(device,
                                    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK);
 }
 
-/* static */ port::StatusOr<int64> CUDADriver::GetThreadsPerWarp(
+/* static */ port::StatusOr<int64> GpuDriver::GetThreadsPerWarp(
     CUdevice device) {
   return GetSimpleAttribute<int64>(device, CU_DEVICE_ATTRIBUTE_WARP_SIZE);
 }
 
-/* static */ bool CUDADriver::GetGridLimits(int *x, int *y, int *z,
-                                            CUdevice device) {
+/* static */ bool GpuDriver::GetGridLimits(int* x, int* y, int* z,
+                                           CUdevice device) {
   int value;
-  CUresult res =
-      cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device);
+  CUresult res = tensorflow::wrap::cuDeviceGetAttribute(
+      &value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to query max grid dim x: " << ToString(res);
     return false;
   }
   *x = value;
 
-  res =
-      cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, device);
+  res = tensorflow::wrap::cuDeviceGetAttribute(
+      &value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, device);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to query max grid dim y: " << ToString(res);
     return false;
   }
   *y = value;
 
-  res =
-      cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, device);
+  res = tensorflow::wrap::cuDeviceGetAttribute(
+      &value, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, device);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to query max grid dim z: " << ToString(res);
     return false;
@@ -1386,8 +1490,8 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   return true;
 }
 
-/* static */ bool CUDADriver::GetDriverVersion(int *driver_version) {
-  CUresult res = cuDriverGetVersion(driver_version);
+/* static */ bool GpuDriver::GetDriverVersion(int* driver_version) {
+  CUresult res = tensorflow::wrap::cuDriverGetVersion(driver_version);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to query driver version: " << ToString(res);
     return false;
@@ -1396,9 +1500,10 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   return true;
 }
 
-/* static */ bool CUDADriver::GetDeviceProperties(CUdevprop *device_properties,
-                                                  int device_ordinal) {
-  CUresult res = cuDeviceGetProperties(device_properties, device_ordinal);
+/* static */ bool GpuDriver::GetDeviceProperties(CUdevprop* device_properties,
+                                                 int device_ordinal) {
+  CUresult res = tensorflow::wrap::cuDeviceGetProperties(device_properties,
+                                                         device_ordinal);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to query device properties: " << ToString(res);
     return false;
@@ -1407,10 +1512,11 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   return true;
 }
 
-/* static */ port::StatusOr<int> CUDADriver::GetDeviceAttribute(
+/* static */ port::StatusOr<int> GpuDriver::GetDeviceAttribute(
     CUdevice_attribute attribute, CUdevice device) {
   int val;
-  CUresult res = cuDeviceGetAttribute(&val, attribute, device);
+  CUresult res =
+      tensorflow::wrap::cuDeviceGetAttribute(&val, attribute, device);
   if (res != CUDA_SUCCESS) {
     return port::Status(
         port::error::INTERNAL,
@@ -1420,10 +1526,10 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   return val;
 }
 
-/* static */ bool CUDADriver::IsEccEnabled(CUdevice device, bool *result) {
+/* static */ bool GpuDriver::IsEccEnabled(CUdevice device, bool* result) {
   int value = -1;
-  CUresult res =
-      cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, device);
+  CUresult res = tensorflow::wrap::cuDeviceGetAttribute(
+      &value, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, device);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to query ECC status: " << ToString(res);
     return false;
@@ -1433,13 +1539,13 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   return true;
 }
 
-/* static */ bool CUDADriver::GetDeviceMemoryInfo(CudaContext* context,
-                                                  int64 *free_out,
-                                                  int64 *total_out) {
+/* static */ bool GpuDriver::GetDeviceMemoryInfo(GpuContext* context,
+                                                 int64* free_out,
+                                                 int64* total_out) {
   ScopedActivateContext activation(context);
   size_t free = 0;
   size_t total = 0;
-  CUresult res = cuMemGetInfo(&free, &total);
+  CUresult res = tensorflow::wrap::cuMemGetInfo(&free, &total);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to query device memory info: " << ToString(res);
     return false;
@@ -1450,10 +1556,10 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   return true;
 }
 
-/* static */ bool CUDADriver::GetDeviceTotalMemory(CUdevice device,
-                                                   uint64 *result) {
+/* static */ bool GpuDriver::GetDeviceTotalMemory(CUdevice device,
+                                                  uint64* result) {
   size_t value = -1;
-  CUresult res = cuDeviceTotalMem(&value, device);
+  CUresult res = tensorflow::wrap::cuDeviceTotalMem(&value, device);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to query total available memory: " << ToString(res);
     return false;
@@ -1463,12 +1569,13 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   return true;
 }
 
-/* static */ string CUDADriver::GetPCIBusID(CUdevice device) {
+/* static */ string GpuDriver::GetPCIBusID(CUdevice device) {
   string pci_bus_id;
   static const int kBufferSize = 64;
   absl::InlinedVector<char, 4> chars(kBufferSize);
   chars[kBufferSize - 1] = '\0';
-  CUresult res = cuDeviceGetPCIBusId(chars.begin(), kBufferSize - 1, device);
+  CUresult res = tensorflow::wrap::cuDeviceGetPCIBusId(chars.begin(),
+                                                       kBufferSize - 1, device);
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to query PCI bus id for device: " << ToString(res);
     return pci_bus_id;
@@ -1477,8 +1584,8 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   return pci_bus_id;
 }
 
-/* static */ bool CUDADriver::CanEnablePeerAccess(CudaContext* from,
-                                                  CudaContext* to) {
+/* static */ bool GpuDriver::CanEnablePeerAccess(GpuContext* from,
+                                                 GpuContext* to) {
   if (from == to) {
     return true;  // A context can always access its own memory.
   }
@@ -1496,7 +1603,7 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
                << to_device.status();
     return false;
   }
-  CUresult res = cuDeviceCanAccessPeer(
+  CUresult res = tensorflow::wrap::cuDeviceCanAccessPeer(
       &can_access_peer, from_device.ValueOrDie(), to_device.ValueOrDie());
   if (res != CUDA_SUCCESS) {
     LOG(ERROR) << "failed to detect peer access capability: " << ToString(res);
@@ -1506,14 +1613,15 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   return can_access_peer;
 }
 
-/* static */ port::Status CUDADriver::EnablePeerAccess(CudaContext* from,
-                                                       CudaContext* to) {
+/* static */ port::Status GpuDriver::EnablePeerAccess(GpuContext* from,
+                                                      GpuContext* to) {
   if (from == to) {
     return port::Status::OK();  // A context can always access its own memory.
   }
 
   ScopedActivateContext activated{from};
-  CUresult result = cuCtxEnablePeerAccess(to->context(), 0 /* = flags */);
+  CUresult result =
+      tensorflow::wrap::cuCtxEnablePeerAccess(to->context(), 0 /* = flags */);
   if (result != CUDA_SUCCESS &&
       result != CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED) {
     return port::Status(
@@ -1525,14 +1633,15 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   return port::Status::OK();
 }
 
-/* static */ port::StatusOr<int> CUDADriver::GetMaxOccupiedBlocksPerCore(
-    CudaContext* context, CUfunction kernel, int threads_per_block,
+/* static */ port::StatusOr<int> GpuDriver::GetMaxOccupiedBlocksPerCore(
+    GpuContext* context, CUfunction kernel, int threads_per_block,
     size_t dynamic_shared_memory_bytes) {
   ScopedActivateContext activation(context);
 
   int max_blocks;
-  CUresult result = cuOccupancyMaxActiveBlocksPerMultiprocessor(
-      &max_blocks, kernel, threads_per_block, dynamic_shared_memory_bytes);
+  CUresult result =
+      tensorflow::wrap::cuOccupancyMaxActiveBlocksPerMultiprocessor(
+          &max_blocks, kernel, threads_per_block, dynamic_shared_memory_bytes);
   if (result != CUDA_SUCCESS) {
     return port::Status(
         port::error::INTERNAL,
@@ -1543,11 +1652,15 @@ static port::StatusOr<T> GetSimpleAttribute(CUdevice device,
   return max_blocks;
 }
 
-/* static */ CUcontext CUDADriver::CurrentContextOrDie() {
+}  // namespace gpu
+
+namespace cuda {
+
+CUcontext CurrentContextOrDie() {
   CUcontext current = nullptr;
-  CUresult result = cuCtxGetCurrent(&current);
+  CUresult result = tensorflow::wrap::cuCtxGetCurrent(&current);
   if (result != CUDA_SUCCESS) {
-    LOG(FATAL) << "failed to query current context: " << ToString(result);
+    LOG(FATAL) << "failed to query current context: " << gpu::ToString(result);
   }
   return current;
 }
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.h b/tensorflow/stream_executor/cuda/cuda_driver.h
index 3713a5b7b98f8bd5173d649fa592107f06bda27d..5bbe6f6e627e8b4e217345b0e014e95c08df2fb0 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.h
+++ b/tensorflow/stream_executor/cuda/cuda_driver.h
@@ -18,502 +18,46 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DRIVER_H_
 #define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DRIVER_H_
 
-#include <stddef.h>
-#include "tensorflow/stream_executor/platform/port.h"
-
-#include "tensorflow/stream_executor/device_options.h"
-#include "tensorflow/stream_executor/lib/status.h"
-#include "tensorflow/stream_executor/lib/statusor.h"
-#include "tensorflow/stream_executor/platform/port.h"
-#include "cuda/include/cuda.h"
+#include "tensorflow/stream_executor/gpu/gpu_driver.h"
 
 namespace stream_executor {
-namespace cuda {
-
-// Identifies the memory space where an allocation resides. See
-// CUDADriver::GetPointerMemorySpace().
-enum class MemorySpace { kHost, kDevice };
-
-// Returns a casual string, such as "host" for the provided memory space.
-string MemorySpaceString(MemorySpace memory_space);
-
-class CudaContext;
-
-// CUDADriver contains wrappers for calls to the userspace library driver. It's
-// useful to isolate these calls and put basic wrappers around them to separate
-// userspace library driver behaviors from the rest of the program.
-//
-// At the moment it's simply used as a namespace.
-//
-// The calls log any specific errors internally and return whether the operation
-// was successful to the caller.
-//
-// The order of parameters is generally kept symmetric with the underlying CUDA
-// driver API.
-//
-// Links on functions are to specific documentation under
-// http://docs.nvidia.com/cuda/cuda-driver-api/
-//
-// Thread safety: these functions should not be used from signal handlers.
-class CUDADriver {
- public:
-  // Wraps a call to cuInit with logging to help indicate what has gone wrong in
-  // the case of failure. Safe to call multiple times; will be fast on all calls
-  // after the first.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__INITIALIZE.html#group__CUDA__INITIALIZE_1g0a2f1517e1bd8502c7194c3a8c134bc3
-  static port::Status Init();
-
-  // Returns the device associated with the given context.
-  // device is an outparam owned by the caller, must not be null.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g4e84b109eba36cdaaade167f34ae881e
-  static port::StatusOr<CUdevice> DeviceFromContext(CudaContext* context);
-
-  // Creates a new CUDA stream associated with the given context via
-  // cuStreamCreate.
-  // stream is an outparam owned by the caller, must not be null.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1ga581f0c5833e21ded8b5a56594e243f4
-  static bool CreateStream(CudaContext* context, CUstream *stream);
-
-  // Destroys a CUDA stream associated with the given context.
-  // stream is owned by the caller, must not be null, and *stream is set to null
-  // if the stream is successfully destroyed.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g244c8833de4596bcd31a06cdf21ee758
-  static void DestroyStream(CudaContext* context, CUstream *stream);
-
-  // CUDA events can explicitly disable event TSC retrieval for some presumed
-  // performance improvement if timing is unnecessary.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g450687e75f3ff992fe01662a43d9d3db
-  enum class EventFlags { kDefault, kDisableTiming };
-
-  // Creates a new event associated with the given context.
-  // result is an outparam owned by the caller and must not be null.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g450687e75f3ff992fe01662a43d9d3db
-  static port::Status CreateEvent(CudaContext* context, CUevent *result,
-                                  EventFlags flags);
-
-  // Destroys *event and turns it into a nullptr. event may not be null, but
-  // *event may be, via cuEventDestroy
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g593ec73a8ec5a5fc031311d3e4dca1ef
-  static port::Status DestroyEvent(CudaContext* context, CUevent *event);
-
-  // Allocates a GPU memory space of size bytes associated with the given
-  // context via cuMemAlloc.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb82d2a09844a58dd9e744dc31e8aa467
-  static void *DeviceAllocate(CudaContext* context, uint64 bytes);
-
-  // Deallocates a GPU memory space of size bytes associated with the given
-  // context via cuMemFree.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a
-  static void DeviceDeallocate(CudaContext* context, void *location);
-
-  // Allocates a unified memory space of size bytes associated with the given
-  // context via cuMemAllocManaged.
-  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb347ded34dc326af404aa02af5388a32
-  static void* UnifiedMemoryAllocate(CudaContext* context, uint64 bytes);
-
-  // Deallocates a unified memory space of size bytes associated with the given
-  // context via cuMemFree.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a
-  static void UnifiedMemoryDeallocate(CudaContext* context, void* location);
-
-  // Allocates page-locked and CUDA-registered memory on the host via
-  // cuMemAllocHost.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gdd8311286d2c2691605362c689bc64e0
-  static void *HostAllocate(CudaContext* context, uint64 bytes);
-
-  // Deallocates a location created by HostAllocate, via cuMemFreeHost.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g62e0fdbe181dab6b1c90fa1a51c7b92c
-  static void HostDeallocate(CudaContext* context, void *location);
-
-  // Registers a memory region at location of size bytes via cuMemHostRegister.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gf0a9fe11544326dabd743b7aa6b54223
-  static bool HostRegister(CudaContext* context, void *location, uint64 bytes);
-
-  // Unregisters a memory region that was previously registered at location via
-  // cuMemHostUnregister.
-  //
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g63f450c8125359be87b7623b1c0b2a14
-  //
-  // TODO(leary) verify an error will be returned if the location wasn't
-  // previously registered.
-  static bool HostUnregister(CudaContext* context, void *location);
-
-  // Given a device ordinal, returns a device handle into the device outparam,
-  // which must not be null.
-  //
-  // N.B. these device handles do not have a corresponding destroy function in
-  // the CUDA driver API.
-  static port::Status GetDevice(int device_ordinal, CUdevice *device);
-
-  // Given a device handle, returns the name reported by the driver for the
-  // device.
-  static bool GetDeviceName(CUdevice device, string *name_out);
-
-  // Given a device to create a context for, returns a context handle into the
-  // context outparam, which must not be null.
-  //
-  // N.B. CUDA contexts are weird. They are implicitly associated with the
-  // calling thread. Current documentation on contexts and their influence on
-  // userspace processes is given here:
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g65dc0012348bc84810e2103a40d8e2cf
-  static port::Status CreateContext(CUdevice device,
-                                    const DeviceOptions& device_options,
-                                    CudaContext** context);
-
-  // Destroys the provided context via cuCtxDestroy.
-  // Don't do this while clients could still be using the context, per the docs
-  // bad things will happen.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g27a365aebb0eb548166309f58a1e8b8e
-  static void DestroyContext(CudaContext* context);
-
-  // Queries the runtime for the specified attribute of the specified function.
-  // cuFuncGetAttribute (the underlying CUDA driver API routine) only operates
-  // in terms of integer-sized values, so there's no potential for overrun (as
-  // of CUDA 5.5).
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g5e92a1b0d8d1b82cb00dcfb2de15961b
-  static bool FuncGetAttribute(CUfunction_attribute attribute,
-                               CUfunction function, int *attribute_value);
-
-  // Sets the preferred cache configuration for the specified function.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g40f8c11e81def95dc0072a375f965681
-  static bool FuncSetCacheConfig(CUfunction function,
-                                 CUfunc_cache cache_config);
-
-  // Gets the preferred shared memory bank configuration for the specified
-  // CONTEXT (not function!), either default or four- or eight-byte bank size.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g17153a1b8b8c756f7ab8505686a4ad74
-  static port::StatusOr<CUsharedconfig> ContextGetSharedMemConfig(
-      CudaContext* context);
-
-  // Sets the preferred shared memory bank configuration for the specified
-  // CONTEXT (not function!), either default or four- or eight-byte bank size.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g2574235fa643f8f251bf7bc28fac3692
-  static port::Status ContextSetSharedMemConfig(
-      CudaContext* context, CUsharedconfig shared_mem_config);
-
-  // Launches a CUDA kernel via cuLaunchKernel.
-  // TODO(leary) describe the structure of kernel_params and extra in a readable
-  // way.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1gb8f3dc3031b40da29d5f9a7139e52e15
-  static bool LaunchKernel(CudaContext* context, CUfunction function,
-                           unsigned int grid_dim_x, unsigned int grid_dim_y,
-                           unsigned int grid_dim_z, unsigned int block_dim_x,
-                           unsigned int block_dim_y, unsigned int block_dim_z,
-                           unsigned int shared_mem_bytes, CUstream stream,
-                           void **kernel_params, void **extra);
-
-  // Loads ptx_contents with the CUDA driver's PTX JIT and stores the resulting
-  // handle in "module". Any error logs that are produced are logged internally.
-  static bool LoadPtx(CudaContext* context, const char *ptx_contents,
-                      CUmodule *module);
-
-  // Loads cubin_bytes with the CUDA driver's blob loading interface and stores
-  // the resulting handle in "module".
-  static port::Status LoadCubin(CudaContext* context, const char *cubin_bytes,
-                                CUmodule *module);
-
-  // Retrieves a named kernel from a loaded module, and places the resulting
-  // handle into function (outparam) on success. Neither kernel_name nor
-  // function may be null. No ownership is taken of kernel_name.
-  static bool GetModuleFunction(CudaContext* context, CUmodule module,
-                                const char *kernel_name, CUfunction *function);
-
-  // Retrieves a named global/constant symbol from a loaded module, and returns
-  // a device pointer and size of the symbol on success. symbol_name may not be
-  // null. At least one of dptr or bytes should not be null. No ownership is
-  // taken of symbol_name.
-  static bool GetModuleSymbol(CudaContext* context, CUmodule module,
-                              const char *symbol_name, CUdeviceptr *dptr,
-                              size_t *bytes);
-
-  // Unloads module from the current context via cuModuleUnload.
-  // TODO(leary) the documentation doesn't say what kind of disasters happen
-  // if you try to unload a module while its CUfunctions are in use.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g8ea3d716524369de3763104ced4ea57b
-  static void UnloadModule(CudaContext* context, CUmodule module);
-
-  // Performs a synchronous memset of the device memory segment via cuMemsetD8.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g6e582bf866e9e2fb014297bfaf354d7b
-  static bool SynchronousMemsetUint8(CudaContext* context, CUdeviceptr location,
-                                     uint8 value, size_t size);
-
-  // Performs a synchronous memset of the device memory segment via cuMemsetD32.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g983e8d8759acd1b64326317481fbf132
-  static bool SynchronousMemsetUint32(CudaContext* context,
-                                      CUdeviceptr location, uint32 value,
-                                      size_t uint32_count);
-
-  // Performs an asynchronous memset of the device memory segment via
-  // cuMemsetD8Async.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gaef08a7ccd61112f94e82f2b30d43627
-  static bool AsynchronousMemsetUint8(CudaContext* context, CUdeviceptr location,
-                                      uint8 value, size_t uint32_count,
-                                      CUstream stream);
-
-  // Performs an asynchronous memset of the device memory segment via
-  // cuMemsetD32Async.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g58229da5d30f1c0cdf667b320ec2c0f5
-  static bool AsynchronousMemsetUint32(CudaContext* context,
-                                       CUdeviceptr location, uint32 value,
-                                       size_t uint32_count, CUstream stream);
-
-  // -- Synchronous memcopies.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g4d32266788c440b0220b1a9ba5795169
-
-  static port::Status SynchronousMemcpyD2H(CudaContext* context, void* host_dst,
-                                           CUdeviceptr gpu_src, uint64 size);
-  static port::Status SynchronousMemcpyH2D(CudaContext* context,
-                                           CUdeviceptr gpu_dst,
-                                           const void* host_src, uint64 size);
-  static port::Status SynchronousMemcpyD2D(CudaContext* context,
-                                           CUdeviceptr gpu_dst,
-                                           CUdeviceptr gpu_src, uint64 size);
-
-  // -- Asynchronous memcopies.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g56f30236c7c5247f8e061b59d3268362
-
-  static bool AsynchronousMemcpyD2H(CudaContext* context, void *host_dst,
-                                    CUdeviceptr gpu_src, uint64 size,
-                                    CUstream stream);
-  static bool AsynchronousMemcpyH2D(CudaContext* context, CUdeviceptr gpu_dst,
-                                    const void *host_src, uint64 size,
-                                    CUstream stream);
-  static bool AsynchronousMemcpyD2D(CudaContext* context, CUdeviceptr gpu_dst,
-                                    CUdeviceptr gpu_src, uint64 size,
-                                    CUstream stream);
-
-  // The CUDA stream callback type signature.
-  // The data passed to AddStreamCallback is subsequently passed to this
-  // callback when it fires.
-  //
-  // Some notable things:
-  // * Callbacks must not make any CUDA API calls.
-  // * Callbacks from independent streams execute in an undefined order and may
-  //   be serialized.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g613d97a277d7640f4cb1c03bd51c2483
-  typedef void (*StreamCallback)(CUstream stream, CUresult status, void *data);
-
-  // Enqueues a callback operation into stream.
-  // See StreamCallback above and the NVIDIA documentation for additional
-  // details.
-  static bool AddStreamCallback(CudaContext* context, CUstream stream,
-                                StreamCallback callback, void *data);
-
-  // Causes stream to wait for event to trigger before proceeding via
-  // cuStreamWaitEvent.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#axzz334nAXAhM
-  static bool WaitStreamOnEvent(CudaContext* context, CUstream stream,
-                                CUevent event);
-
-  // Blocks the calling thread until the operations enqueued onto stream have
-  // been completed, via cuStreamSynchronize.
-  //
-  // TODO(leary) if a pathological thread enqueues operations onto the stream
-  // while another thread blocks like this, can you wind up waiting an unbounded
-  // amount of time?
-  //
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g15e49dd91ec15991eb7c0a741beb7dad
-  static port::Status SynchronizeStream(CudaContext* context, CUstream stream);
-
-  // Blocks the calling thread until the operations associated with the context
-  // have been completed, via cuCtxSynchronize.
-  //
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g7a54725f28d34b8c6299f0c6ca579616
-  static bool SynchronizeContext(CudaContext* context);
-
-  // Returns true if all stream tasks have completed at time of the call. Note
-  // the potential for races around this call (if another thread adds work to
-  // the stream immediately after this returns).
-  static bool IsStreamIdle(CudaContext* context, CUstream stream);
-
-  // Returns whether code in the from context can access memory in the to
-  // context via cuDeviceCanAccessPeer.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g496bdaae1f632ebfb695b99d2c40f19e
-  static bool CanEnablePeerAccess(CudaContext* from, CudaContext* to);
-
-  // Enables peer access per CanEnablePeerAccess, via cuCtxEnablePeerAccess.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g0889ec6728e61c05ed359551d67b3f5a
-  static port::Status EnablePeerAccess(CudaContext* from, CudaContext* to);
-
-  // Returns the elapsed milliseconds between start and stop via
-  // cuEventElapsedTime.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1gdfb1178807353bbcaa9e245da497cf97
-  static bool GetEventElapsedTime(CudaContext* context,
-                                  float *elapsed_milliseconds, CUevent start,
-                                  CUevent stop);
-
-  // Records that an event occurred when execution reaches the current point in
-  // thestream via cuEventRecord.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g95424d3be52c4eb95d83861b70fb89d1
-  static port::Status RecordEvent(CudaContext* context, CUevent event,
-                                  CUstream stream);
-
-  // Polls (without blocking) to determine the status of an event - pending or
-  // complete (or an error status).
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g6f0704d755066b0ee705749ae911deef
-  static port::StatusOr<CUresult> QueryEvent(CudaContext* context,
-                                             CUevent event);
-
-  // -- Pointer-specific calls.
-
-  // Returns the context in which pointer was allocated or registered.
-  static port::StatusOr<CudaContext*> GetPointerContext(CUdeviceptr pointer);
-
-  // Returns the device associated with the context from GetPointerContext().
-  static port::StatusOr<CUdevice> GetPointerDevice(CUdeviceptr pointer);
-
-  // Returns the memory space addressed by pointer.
-  static port::StatusOr<MemorySpace> GetPointerMemorySpace(CUdeviceptr pointer);
-
-  // Returns the base address and size of the device pointer dptr.
-  static port::Status GetPointerAddressRange(CUdeviceptr dptr,
-                                             CUdeviceptr *base, size_t *size);
-
-  // -- Device-specific calls.
-
-  // Returns the compute capability for the device; i.e (3, 5).
-  // This is currently done via the deprecated device API.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE__DEPRECATED.html#group__CUDA__DEVICE__DEPRECATED_1ge2091bbac7e1fb18c2821612115607ea
-  static port::Status GetComputeCapability(int *cc_major, int *cc_minor,
-                                           CUdevice device);
-
-  // Returns the number of multiprocessors on the device (note that the device
-  // may be multi-GPU-per-board).
-  static port::StatusOr<int> GetMultiprocessorCount(CUdevice device);
-
-  // Returns the limit on number of threads that can be resident in a single
-  // multiprocessor.
-  static port::StatusOr<int64> GetMaxThreadsPerMultiprocessor(CUdevice device);
-
-  // Returns the limit on number of threads which may be resident for a single
-  // block (cooperative thread array).
-  static port::StatusOr<int64> GetMaxThreadsPerBlock(CUdevice device);
-
-  // Returns the amount of shared memory available on a single GPU core (i.e.
-  // SM on NVIDIA devices).
-  static port::StatusOr<int64> GetMaxSharedMemoryPerCore(CUdevice device);
-
-  // Returns the amount of shared memory available for a single block
-  // (cooperative thread array).
-  static port::StatusOr<int64> GetMaxSharedMemoryPerBlock(CUdevice device);
-
-  // Returns the maximum supported number of registers per block.
-  static port::StatusOr<int64> GetMaxRegistersPerBlock(CUdevice device);
-
-  // Returns the number of threads per warp.
-  static port::StatusOr<int64> GetThreadsPerWarp(CUdevice device);
-
-  // Queries the grid limits for device with cuDeviceGetAttribute calls.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
-  static bool GetGridLimits(int *x, int *y, int *z, CUdevice device);
-
-  // Returns a grab-bag of device properties in a caller-owned device_properties
-  // structure for device_ordinal via cuDeviceGetProperties.
-  //
-  // This call is deprecated in the NVIDIA driver API; its replacement is
-  // GetDeviceAttribute
-  //
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE__DEPRECATED.html#group__CUDA__DEVICE__DEPRECATED_1g65a5b4e25186bd257df80b98c98cffe6
-  static bool GetDeviceProperties(CUdevprop *device_properties,
-                                  int device_ordinal);
-
-  // Gets a specific integer-valued property about the given device.
-  //
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
-  static port::StatusOr<int> GetDeviceAttribute(CUdevice_attribute attribute,
-                                                CUdevice device);
-
-  // Returns whether ECC is enabled for the given CUdevice via
-  // cuDeviceGetattribute with CU_DEVICE_ATTRIBUTE_ECC_ENABLED.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
-  static bool IsEccEnabled(CUdevice device, bool *result);
-
-  // Returns the total amount of memory available for allocation by the CUDA
-  // context, in bytes, via cuDeviceTotalMem.
-  static bool GetDeviceTotalMemory(CUdevice device, uint64 *result);
-
-  // Returns the free amount of memory and total amount of memory, as reported
-  // by cuMemGetInfo.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g808f555540d0143a331cc42aa98835c0
-  static bool GetDeviceMemoryInfo(CudaContext* context, int64* free,
-                                  int64* total);
-
-  // Returns a PCI bus id string for the device.
-  // [domain]:[bus]:[device].[function]
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g85295e7d9745ab8f0aa80dd1e172acfc
-  static string GetPCIBusID(CUdevice device);
-
-  // -- Context- and device-independent calls.
-
-  // Returns the number of visible CUDA device via cuDeviceGetCount.
-  // This should correspond to the set of device ordinals available.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g52b5ce05cb8c5fb6831b2c0ff2887c74
-  static int GetDeviceCount();
-
-  // Returns the driver version number via cuDriverGetVersion.
-  // This is, surprisingly, NOT the actual driver version (e.g. 331.79) but,
-  // instead, the CUDA toolkit release number that this driver is compatible
-  // with; e.g. 6000 (for a CUDA 6.0 compatible driver) or 6050 (for a CUDA 6.5
-  // compatible driver).
-  //
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VERSION.html#group__CUDA__VERSION_1g8b7a10395392e049006e61bcdc8ebe71
-  static bool GetDriverVersion(int *driver_version);
-
-  // -- Other calls
-
-  // Returns the maximum number of blocks (per multiprocessor) occupied by the
-  // specified kernel/CUfunction when launched with the specified parameters.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__OCCUPANCY.html#group__CUDA__OCCUPANCY_1gcc6e1094d05cba2cee17fe33ddd04a98
-  static port::StatusOr<int> GetMaxOccupiedBlocksPerCore(
-      CudaContext* context, CUfunction kernel, int threads_per_block,
-      size_t dynamic_shared_memory_bytes);
-
-  // Returns the current context set in CUDA. This is done by calling the cuda
-  // driver (e.g., this value is not our cached view of the current context).
-  static CUcontext CurrentContextOrDie();
-
-  // Seam for injecting an error at CUDA initialization time for testing
-  // purposes.
-  static bool driver_inject_init_error_;
-};
-
-// Ensures a context is activated within a scope.
-class ScopedActivateContext {
- public:
-  // Activates the context via cuCtxSetCurrent, if it is not the currently
-  // active context (a la cuCtxGetCurrent). Note the alternative push/pop
-  // mechanism is said by NVIDIA to be relatively slow and deprecated.
-  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1gbe562ee6258b4fcc272ca6478ca2a2f7
-  explicit ScopedActivateContext(CudaContext* context);
-
-  // Checks that the context has remained activated for the duration of the
-  // scope.
-  ~ScopedActivateContext();
-
- private:
-  CudaContext* to_restore_ = nullptr;
-};
-
-// CudaContext wraps a cuda CUcontext handle, and includes a unique id. The
+namespace gpu {
+// CUDAContext wraps a cuda CUcontext handle, and includes a unique id. The
 // unique id is positive, and ids are not repeated within the process.
-class CudaContext {
+class GpuContext {
  public:
-  CudaContext(CUcontext context, int64 id) : context_(context), id_(id) { }
+  GpuContext(CUcontext context, int64 id) : context_(context), id_(id) {}
 
   CUcontext context() const { return context_; }
   int64 id() const { return id_; }
 
   // Disallow copying and moving.
-  CudaContext(CudaContext&&) = delete;
-  CudaContext(const CudaContext&) = delete;
-  CudaContext& operator=(CudaContext&&) = delete;
-  CudaContext& operator=(const CudaContext&) = delete;
+  GpuContext(GpuContext&&) = delete;
+  GpuContext(const GpuContext&) = delete;
+  GpuContext& operator=(GpuContext&&) = delete;
+  GpuContext& operator=(const GpuContext&) = delete;
 
  private:
   CUcontext const context_;
   const int64 id_;
 };
 
+}  // namespace gpu
+
+namespace cuda {
+
+using MemorySpace = gpu::MemorySpace;
+
+using CUDADriver = gpu::GpuDriver;
+
+using ScopedActivateContext = gpu::ScopedActivateContext;
+
+using CudaContext = gpu::GpuContext;
+
+// Returns the current context set in CUDA. This is done by calling the cuda
+// driver (e.g., this value is not our cached view of the current context).
+CUcontext CurrentContextOrDie();
+
 }  // namespace cuda
 }  // namespace stream_executor
 
diff --git a/tensorflow/stream_executor/cuda/cuda_driver_wrapper.h b/tensorflow/stream_executor/cuda/cuda_driver_wrapper.h
new file mode 100644
index 0000000000000000000000000000000000000000..0de27d5cd3c858b0990618d7ec74be31088918f5
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cuda_driver_wrapper.h
@@ -0,0 +1,144 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This file wraps cuda driver calls with dso loader so that we don't need to
+// have explicit linking to libcuda. All TF cuda driver usage should route
+// through this wrapper.
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DRIVER_WRAPPER_H_
+#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DRIVER_WRAPPER_H_
+
+#include "cuda/include/cuda.h"
+#include "tensorflow/stream_executor/lib/env.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
+#include "tensorflow/stream_executor/platform/port.h"
+
+namespace tensorflow {
+namespace wrap {
+#ifdef PLATFORM_GOOGLE
+// Use static linked library
+#define STREAM_EXECUTOR_LIBCUDA_WRAP(cudaSymbolName)                       \
+  template <typename... Args>                                              \
+  auto cudaSymbolName(Args... args)->decltype(::cudaSymbolName(args...)) { \
+    return ::cudaSymbolName(args...);                                      \
+  }
+
+// This macro wraps a global identifier, given by cudaSymbolName, in a callable
+// structure that loads the DLL symbol out of the DSO handle in a thread-safe
+// manner on first use. This dynamic loading technique is used to avoid DSO
+// dependencies on vendor libraries which may or may not be available in the
+// deployed binary environment.
+#else
+#define TO_STR_(x) #x
+#define TO_STR(x) TO_STR_(x)
+
+#define STREAM_EXECUTOR_LIBCUDA_WRAP(cudaSymbolName)                           \
+  template <typename... Args>                                                  \
+  auto cudaSymbolName(Args... args)->decltype(::cudaSymbolName(args...)) {     \
+    using FuncPtrT = std::add_pointer<decltype(::cudaSymbolName)>::type;       \
+    static FuncPtrT loaded = []() -> FuncPtrT {                                \
+      static const char *kName = TO_STR(cudaSymbolName);                       \
+      void *f;                                                                 \
+      auto s = stream_executor::port::Env::Default()->GetSymbolFromLibrary(    \
+          stream_executor::internal::CachedDsoLoader::GetCudaDriverDsoHandle() \
+              .ValueOrDie(),                                                   \
+          kName, &f);                                                          \
+      CHECK(s.ok()) << "could not find " << kName                              \
+                    << " in libcuda DSO; dlerror: " << s.error_message();      \
+      return reinterpret_cast<FuncPtrT>(f);                                    \
+    }();                                                                       \
+    return loaded(args...);                                                    \
+  }
+#endif
+
+// clang-format off
+#define LIBCUDA_ROUTINE_EACH(__macro)                   \
+  __macro(cuCtxEnablePeerAccess)                        \
+  __macro(cuCtxGetCurrent)                              \
+  __macro(cuCtxGetDevice)                               \
+  __macro(cuCtxGetSharedMemConfig)                      \
+  __macro(cuCtxSetCurrent)                              \
+  __macro(cuCtxSetSharedMemConfig)                      \
+  __macro(cuCtxSynchronize)                             \
+  __macro(cuDeviceCanAccessPeer)                        \
+  __macro(cuDeviceGet)                                  \
+  __macro(cuDeviceGetAttribute)                         \
+  __macro(cuDeviceGetCount)                             \
+  __macro(cuDeviceGetName)                              \
+  __macro(cuDeviceGetPCIBusId)                          \
+  __macro(cuDeviceGetProperties)                        \
+  __macro(cuDevicePrimaryCtxGetState)                   \
+  __macro(cuDevicePrimaryCtxRelease)                    \
+  __macro(cuDevicePrimaryCtxRetain)                     \
+  __macro(cuDevicePrimaryCtxSetFlags)                   \
+  __macro(cuDeviceTotalMem)                             \
+  __macro(cuDriverGetVersion)                           \
+  __macro(cuEventCreate)                                \
+  __macro(cuEventDestroy)                               \
+  __macro(cuEventElapsedTime)                           \
+  __macro(cuEventQuery)                                 \
+  __macro(cuEventRecord)                                \
+  __macro(cuEventSynchronize)                           \
+  __macro(cuFuncGetAttribute)                           \
+  __macro(cuFuncSetCacheConfig)                         \
+  __macro(cuGetErrorName)                               \
+  __macro(cuGetErrorString)                             \
+  __macro(cuInit)                                       \
+  __macro(cuLaunchKernel)                               \
+  __macro(cuMemAlloc)                                   \
+  __macro(cuMemAllocManaged)                            \
+  __macro(cuMemFree)                                    \
+  __macro(cuMemFreeHost)                                \
+  __macro(cuMemGetAddressRange)                         \
+  __macro(cuMemGetInfo)                                 \
+  __macro(cuMemHostAlloc)                               \
+  __macro(cuMemHostRegister)                            \
+  __macro(cuMemHostUnregister)                          \
+  __macro(cuMemcpyDtoD)                                 \
+  __macro(cuMemcpyDtoDAsync)                            \
+  __macro(cuMemcpyDtoH)                                 \
+  __macro(cuMemcpyDtoHAsync)                            \
+  __macro(cuMemcpyHtoD)                                 \
+  __macro(cuMemcpyHtoDAsync)                            \
+  __macro(cuMemsetD32)                                  \
+  __macro(cuMemsetD32Async)                             \
+  __macro(cuMemsetD8)                                   \
+  __macro(cuMemsetD8Async)                              \
+  __macro(cuModuleGetFunction)                          \
+  __macro(cuModuleGetGlobal)                            \
+  __macro(cuModuleLoadDataEx)                           \
+  __macro(cuModuleLoadFatBinary)                        \
+  __macro(cuModuleUnload)                               \
+  __macro(cuOccupancyMaxActiveBlocksPerMultiprocessor)  \
+  __macro(cuOccupancyMaxPotentialBlockSize)             \
+  __macro(cuPointerGetAttribute)                        \
+  __macro(cuStreamAddCallback)                          \
+  __macro(cuStreamCreate)                               \
+  __macro(cuStreamDestroy)                              \
+  __macro(cuStreamQuery)                                \
+  __macro(cuStreamSynchronize)                          \
+  __macro(cuStreamWaitEvent)
+
+// clang-format on
+
+LIBCUDA_ROUTINE_EACH(STREAM_EXECUTOR_LIBCUDA_WRAP)
+#undef LIBCUDA_ROUTINE_EACH
+#undef STREAM_EXECUTOR_LIBCUDA_WRAP
+#undef TO_STR
+#undef TO_STR_
+}  // namespace wrap
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_DRIVER_WRAPPER_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_event.cc b/tensorflow/stream_executor/cuda/cuda_event.cc
index 96dcf173566087db475e3b237591d19f06128d92..fd9d4741e01082ee46c9f1ba77a089ee2cc8fad5 100644
--- a/tensorflow/stream_executor/cuda/cuda_event.cc
+++ b/tensorflow/stream_executor/cuda/cuda_event.cc
@@ -20,30 +20,11 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/statusor.h"
 
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 
-CUDAEvent::CUDAEvent(CUDAExecutor* parent)
-    : parent_(parent), cuda_event_(nullptr) {}
-
-CUDAEvent::~CUDAEvent() {}
-
-port::Status CUDAEvent::Init() {
-  return CUDADriver::CreateEvent(parent_->cuda_context(), &cuda_event_,
-                                 CUDADriver::EventFlags::kDisableTiming);
-}
-
-port::Status CUDAEvent::Destroy() {
-  return CUDADriver::DestroyEvent(parent_->cuda_context(), &cuda_event_);
-}
-
-port::Status CUDAEvent::Record(CUDAStream* stream) {
-  return CUDADriver::RecordEvent(parent_->cuda_context(), cuda_event_,
-                                 stream->cuda_stream());
-}
-
-Event::Status CUDAEvent::PollForStatus() {
+Event::Status GpuEvent::PollForStatus() {
   port::StatusOr<CUresult> status =
-      CUDADriver::QueryEvent(parent_->cuda_context(), cuda_event_);
+      GpuDriver::QueryEvent(parent_->gpu_context(), gpu_event_);
   if (!status.ok()) {
     LOG(ERROR) << "Error polling for event status: "
                << status.status().error_message();
@@ -62,9 +43,5 @@ Event::Status CUDAEvent::PollForStatus() {
   }
 }
 
-const CUevent& CUDAEvent::cuda_event() {
-  return cuda_event_;
-}
-
-}  // namespace cuda
+}  // namespace gpu
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_event.h b/tensorflow/stream_executor/cuda/cuda_event.h
index f62344672ed624f1ed60b5452d33b6f8273f2b47..e3596e0261acc1f6225c610db33dbbcdc38fd7e4 100644
--- a/tensorflow/stream_executor/cuda/cuda_event.h
+++ b/tensorflow/stream_executor/cuda/cuda_event.h
@@ -16,45 +16,12 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_EVENT_H_
 #define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_EVENT_H_
 
-#include "tensorflow/stream_executor/cuda/cuda_driver.h"
-#include "tensorflow/stream_executor/cuda/cuda_stream.h"
-#include "tensorflow/stream_executor/event.h"
-#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/gpu/gpu_event.h"
 
 namespace stream_executor {
 namespace cuda {
 
-// CUDAEvent wraps a CUevent in the platform-independent EventInterface
-// interface.
-class CUDAEvent : public internal::EventInterface {
- public:
-  explicit CUDAEvent(CUDAExecutor* parent);
-
-  ~CUDAEvent() override;
-
-  // Populates the CUDA-platform-specific elements of this object.
-  port::Status Init();
-
-  // Deallocates any platform-specific elements of this object. This is broken
-  // out (not part of the destructor) to allow for error reporting.
-  port::Status Destroy();
-
-  // Inserts the event at the current position into the specified stream.
-  port::Status Record(CUDAStream* stream);
-
-  // Polls the CUDA platform for the event's current status.
-  Event::Status PollForStatus();
-
-  // The underlying CUDA event element.
-  const CUevent& cuda_event();
-
- private:
-  // The Executor used to which this object and CUevent are bound.
-  CUDAExecutor* parent_;
-
-  // The underlying CUDA event element.
-  CUevent cuda_event_;
-};
+using CUDAEvent = gpu::GpuEvent;
 
 }  // namespace cuda
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_fft.cc b/tensorflow/stream_executor/cuda/cuda_fft.cc
index acac7d6368885537b1f5727779388d550680e90d..054b43b5b7a39702ce22891028f547b39c778fc1 100644
--- a/tensorflow/stream_executor/cuda/cuda_fft.cc
+++ b/tensorflow/stream_executor/cuda/cuda_fft.cc
@@ -23,21 +23,17 @@ limitations under the License.
 #include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
 #include "tensorflow/stream_executor/cuda/cuda_stream.h"
 #include "tensorflow/stream_executor/device_memory.h"
-
-#ifndef PLATFORM_GOOGLE
-#include "tensorflow/stream_executor/dso_loader.h"
-#endif
-
 #include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/initialize.h"
 #include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/plugin_registry.h"
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 
 PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuFftPlugin);
 
@@ -49,13 +45,13 @@ namespace wrap {
 // manner on first use. This dynamic loading technique is used to avoid DSO
 // dependencies on vendor libraries which may or may not be available in the
 // deployed binary environment.
-#define STREAM_EXECUTOR_CUFFT_WRAP(__name)                       \
-  struct WrapperShim__##__name {                                 \
-    template <typename... Args>                                  \
-    cufftResult operator()(CUDAExecutor *parent, Args... args) { \
-      cuda::ScopedActivateExecutorContext sac{parent};           \
-      return ::__name(args...);                                  \
-    }                                                            \
+#define STREAM_EXECUTOR_CUFFT_WRAP(__name)                      \
+  struct WrapperShim__##__name {                                \
+    template <typename... Args>                                 \
+    cufftResult operator()(GpuExecutor *parent, Args... args) { \
+      gpu::ScopedActivateExecutorContext sac{parent};           \
+      return ::__name(args...);                                 \
+    }                                                           \
   } __name;
 
 #else
@@ -81,8 +77,8 @@ namespace wrap {
       return f;                                                           \
     }                                                                     \
     template <typename... Args>                                           \
-    cufftResult operator()(CUDAExecutor *parent, Args... args) {          \
-      cuda::ScopedActivateExecutorContext sac{parent};                    \
+    cufftResult operator()(GpuExecutor *parent, Args... args) {           \
+      gpu::ScopedActivateExecutorContext sac{parent};                     \
       return DynLoad()(args...);                                          \
     }                                                                     \
   } __name;                                                               \
@@ -149,8 +145,8 @@ cufftType CUDAFftType(fft::Type type) {
 }
 
 // Associates the given stream with the given cuFFT plan.
-bool SetStream(CUDAExecutor *parent, cufftHandle plan, Stream *stream) {
-  auto ret = wrap::cufftSetStream(parent, plan, AsCUDAStreamValue(stream));
+bool SetStream(GpuExecutor *parent, cufftHandle plan, Stream *stream) {
+  auto ret = wrap::cufftSetStream(parent, plan, AsGpuStreamValue(stream));
   if (ret != CUFFT_SUCCESS) {
     LOG(ERROR) << "failed to run cuFFT routine cufftSetStream: " << ret;
     return false;
@@ -161,7 +157,7 @@ bool SetStream(CUDAExecutor *parent, cufftHandle plan, Stream *stream) {
 }  // namespace
 
 port::Status CUDAFftPlan::Initialize(
-    CUDAExecutor *parent, Stream *stream, int rank, uint64 *elem_count,
+    GpuExecutor *parent, Stream *stream, int rank, uint64 *elem_count,
     uint64 *input_embed, uint64 input_stride, uint64 input_distance,
     uint64 *output_embed, uint64 output_stride, uint64 output_distance,
     fft::Type type, int batch_count, ScratchAllocator *scratch_allocator) {
@@ -321,7 +317,7 @@ port::Status CUDAFftPlan::Initialize(
   return port::Status::OK();
 }
 
-port::Status CUDAFftPlan::Initialize(CUDAExecutor *parent, Stream *stream,
+port::Status CUDAFftPlan::Initialize(GpuExecutor *parent, Stream *stream,
                                      int rank, uint64 *elem_count,
                                      fft::Type type,
                                      ScratchAllocator *scratch_allocator) {
@@ -553,8 +549,8 @@ bool CUDAFft::DoFftInternal(Stream *stream, fft::Plan *plan, FuncT cufftExec,
   }
 
   auto ret = cufftExec(parent_, cuda_fft_plan->GetPlan(),
-                       CUDAComplex(const_cast<InputT *>(CUDAMemory(input))),
-                       CUDAComplex(CUDAMemoryMutable(output)));
+                       GpuComplex(const_cast<InputT *>(GpuMemory(input))),
+                       GpuComplex(GpuMemoryMutable(output)));
 
   if (ret != CUFFT_SUCCESS) {
     LOG(ERROR) << "failed to run cuFFT routine: " << ret;
@@ -580,8 +576,8 @@ bool CUDAFft::DoFftWithDirectionInternal(Stream *stream, fft::Plan *plan,
   }
 
   auto ret = cufftExec(parent_, cuda_fft_plan->GetPlan(),
-                       CUDAComplex(const_cast<InputT *>(CUDAMemory(input))),
-                       CUDAComplex(CUDAMemoryMutable(output)),
+                       GpuComplex(const_cast<InputT *>(GpuMemory(input))),
+                       GpuComplex(GpuMemoryMutable(output)),
                        cuda_fft_plan->GetFftDirection());
 
   if (ret != CUFFT_SUCCESS) {
@@ -618,22 +614,22 @@ STREAM_EXECUTOR_CUDA_DEFINE_FFT(double, Z2Z, D2Z, Z2D)
 
 #undef STREAM_EXECUTOR_CUDA_DEFINE_FFT
 
-}  // namespace cuda
+}  // namespace gpu
 
 void initialize_cufft() {
   port::Status status =
       PluginRegistry::Instance()->RegisterFactory<PluginRegistry::FftFactory>(
-          cuda::kCudaPlatformId, cuda::kCuFftPlugin, "cuFFT",
+          cuda::kCudaPlatformId, gpu::kCuFftPlugin, "cuFFT",
           [](internal::StreamExecutorInterface *parent) -> fft::FftSupport * {
-            cuda::CUDAExecutor *cuda_executor =
-                dynamic_cast<cuda::CUDAExecutor *>(parent);
+            gpu::GpuExecutor *cuda_executor =
+                dynamic_cast<gpu::GpuExecutor *>(parent);
             if (cuda_executor == nullptr) {
               LOG(ERROR) << "Attempting to initialize an instance of the cuFFT "
                          << "support library with a non-CUDA StreamExecutor";
               return nullptr;
             }
 
-            return new cuda::CUDAFft(cuda_executor);
+            return new gpu::CUDAFft(cuda_executor);
           });
   if (!status.ok()) {
     LOG(ERROR) << "Unable to register cuFFT factory: "
@@ -641,7 +637,7 @@ void initialize_cufft() {
   }
 
   PluginRegistry::Instance()->SetDefaultFactory(
-      cuda::kCudaPlatformId, PluginKind::kFft, cuda::kCuFftPlugin);
+      cuda::kCudaPlatformId, PluginKind::kFft, gpu::kCuFftPlugin);
 }
 
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_fft.h b/tensorflow/stream_executor/cuda/cuda_fft.h
index 8171e61418a3185455e50ee76315eb2493c36c01..0f3baeab6fa8b26b18c22854e8c95aadbb02f1ba 100644
--- a/tensorflow/stream_executor/cuda/cuda_fft.h
+++ b/tensorflow/stream_executor/cuda/cuda_fft.h
@@ -30,9 +30,9 @@ namespace stream_executor {
 
 class Stream;
 
-namespace cuda {
+namespace gpu {
 
-class CUDAExecutor;
+class GpuExecutor;
 
 // Opaque and unique indentifier for the cuFFT plugin.
 extern const PluginId kCuFftPlugin;
@@ -64,17 +64,17 @@ class CUDAFftPlan : public fft::Plan {
   }
 
   // Initialize function for batched plan
-  port::Status Initialize(CUDAExecutor *parent, Stream *stream, int rank,
-                          uint64 *elem_count, uint64 *input_embed,
+  port::Status Initialize(GpuExecutor* parent, Stream* stream, int rank,
+                          uint64* elem_count, uint64* input_embed,
                           uint64 input_stride, uint64 input_distance,
-                          uint64 *output_embed, uint64 output_stride,
+                          uint64* output_embed, uint64 output_stride,
                           uint64 output_distance, fft::Type type,
-                          int batch_count, ScratchAllocator *scratch_allocator);
+                          int batch_count, ScratchAllocator* scratch_allocator);
 
   // Initialize function for 1d,2d, and 3d plan
-  port::Status Initialize(CUDAExecutor *parent, Stream *stream, int rank,
-                          uint64 *elem_count, fft::Type type,
-                          ScratchAllocator *scratch_allocator);
+  port::Status Initialize(GpuExecutor* parent, Stream* stream, int rank,
+                          uint64* elem_count, fft::Type type,
+                          ScratchAllocator* scratch_allocator);
 
   port::Status UpdateScratchAllocator(Stream *stream,
                                       ScratchAllocator *scratch_allocator);
@@ -83,7 +83,7 @@ class CUDAFftPlan : public fft::Plan {
   bool IsInitialized() const { return is_initialized_; }
 
  private:
-  CUDAExecutor *parent_;
+  GpuExecutor* parent_;
   cufftHandle plan_;
   fft::Type fft_type_;
   DeviceMemory<uint8> scratch_;
@@ -96,7 +96,7 @@ class CUDAFftPlan : public fft::Plan {
 // This satisfies the platform-agnostic FftSupport interface.
 //
 // Note that the cuFFT handle that this encapsulates is implicitly tied to the
-// context (and, as a result, the device) that the parent CUDAExecutor is tied
+// context (and, as a result, the device) that the parent GpuExecutor is tied
 // to. This simply happens as an artifact of creating the cuFFT handle when a
 // CUDA context is active.
 //
@@ -104,13 +104,13 @@ class CUDAFftPlan : public fft::Plan {
 // context of parent_, so all context is explicit.
 class CUDAFft : public fft::FftSupport {
  public:
-  explicit CUDAFft(CUDAExecutor *parent) : parent_(parent) {}
+  explicit CUDAFft(GpuExecutor* parent) : parent_(parent) {}
   ~CUDAFft() override {}
 
   TENSORFLOW_STREAM_EXECUTOR_GPU_FFT_SUPPORT_OVERRIDES
 
  private:
-  CUDAExecutor *parent_;
+  GpuExecutor* parent_;
 
   // Two helper functions that execute dynload::cufftExec?2?.
 
@@ -131,7 +131,7 @@ class CUDAFft : public fft::FftSupport {
   SE_DISALLOW_COPY_AND_ASSIGN(CUDAFft);
 };
 
-}  // namespace cuda
+}  // namespace gpu
 }  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_FFT_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index 4874d096ad54fa352fd6e9ad3b7b87c1fff59f73..420f2591b82f48f0ff2bb713aca3e000083b4774 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
 #include "tensorflow/stream_executor/cuda/cuda_driver.h"
+#include "tensorflow/stream_executor/cuda/cuda_driver_wrapper.h"
 #include "tensorflow/stream_executor/cuda/cuda_event.h"
 #include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
 #include "tensorflow/stream_executor/cuda/cuda_stream.h"
@@ -53,6 +54,10 @@ limitations under the License.
 #include "tensorflow/stream_executor/stream_executor_pimpl.h"
 #include "tensorflow/stream_executor/timer.h"
 
+// LOG(ERROR) uses a const named ERROR, so a macro with the same name is
+// always unwanted. This happens on Windows that defines such a macro.
+#undef ERROR
+
 #ifdef PLATFORMS_GPUS_CUDA_DYNAMIC_LIBCUDA_DYNAMIC_LIBCUDA_H_
 #error \
     "No driver calls in this file, wrap driver functionality in cuda_driver.cc."
@@ -67,7 +72,7 @@ extern bool FLAGS_check_gpu_leaks;
 bool FLAGS_prefer_cubin_to_ptx = true;
 
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 
 // Hook that can be used to CUBIN-ate PTX before it is loaded into the driver.
 // It has been observed that loading both PTX and cubins into the driver library
@@ -79,17 +84,16 @@ namespace cuda {
 // variable with extern linkage and populate it from another translation unit.
 std::function<string(const string &)> g_cubinate;
 
-static CUDAEvent *AsCUDAEvent(Event *event) {
+static GpuEvent* AsGpuEvent(Event* event) {
   DCHECK(event != nullptr);
-  return static_cast<CUDAEvent *>(event->implementation());
+  return static_cast<GpuEvent*>(event->implementation());
 }
 
-
 // Given a platform-independent timer datatype, returns the internal CUDA
 // platform implementation pointer.
-static CUDATimer *AsCUDATimer(Timer *timer) {
+static GpuTimer* AsGpuTimer(Timer* timer) {
   DCHECK(timer != nullptr);
-  return static_cast<CUDATimer *>(timer->implementation());
+  return static_cast<GpuTimer*>(timer->implementation());
 }
 
 // Given const GPU memory, returns a libcuda device pointer datatype, suitable
@@ -107,48 +111,49 @@ static CUdeviceptr AsCudaDevicePtr(DeviceMemoryBase *gpu_mem) {
   return AsCudaDevicePtr(*gpu_mem);
 }
 
-CudaContext* ExtractCudaContext(CUDAExecutor *cuda_exec) {
+GpuContext* ExtractGpuContext(GpuExecutor* cuda_exec) {
   CHECK(cuda_exec != nullptr);
-  return cuda_exec->cuda_context();
+  return cuda_exec->gpu_context();
 }
 
-CUDAExecutor *ExtractCudaExecutor(StreamExecutor *stream_exec) {
-  return static_cast<CUDAExecutor *>(stream_exec->implementation());
+GpuExecutor* ExtractGpuExecutor(StreamExecutor* stream_exec) {
+  return static_cast<GpuExecutor*>(stream_exec->implementation());
 }
 
-CUDAExecutor::~CUDAExecutor() {
-  CHECK(kernel_to_gpu_binary_.empty()) << "CUDAExecutor has live kernels.";
-  CHECK(gpu_binary_to_module_.empty()) << "CUDAExecutor has loaded modules.";
+GpuExecutor::~GpuExecutor() {
+  CHECK(kernel_to_gpu_binary_.empty()) << "GpuExecutor has live kernels.";
+  CHECK(gpu_binary_to_module_.empty()) << "GpuExecutor has loaded modules.";
   if (context_ != nullptr) {
-    CUDADriver::DestroyContext(context_);
+    GpuDriver::DestroyContext(context_);
   }
 }
 
-port::Status CUDAExecutor::Init(int device_ordinal,
-                                DeviceOptions device_options) {
+port::Status GpuExecutor::Init(int device_ordinal,
+                               DeviceOptions device_options) {
   device_ordinal_ = device_ordinal;
 
-  auto status = CUDADriver::Init();
+  auto status = GpuDriver::Init();
   if (!status.ok()) {
     return status;
   }
 
-  status = CUDADriver::GetDevice(device_ordinal_, &device_);
+  status = GpuDriver::GetDevice(device_ordinal_, &device_);
   if (!status.ok()) {
     return status;
   }
 
-  status = CUDADriver::CreateContext(device_, device_options, &context_);
+  status = GpuDriver::CreateContext(device_ordinal_, device_, device_options,
+                                    &context_);
   if (!status.ok()) {
     return status;
   }
 
-  return CUDADriver::GetComputeCapability(&cc_major_, &cc_minor_, device_);
+  return GpuDriver::GetComputeCapability(&cc_major_, &cc_minor_, device_);
 }
 
-bool CUDAExecutor::FindOnDiskForComputeCapability(
+bool GpuExecutor::FindOnDiskForComputeCapability(
     absl::string_view filename, absl::string_view canonical_suffix,
-    string *found_filename) const {
+    string* found_filename) const {
   if (cc_major_ == 0 && cc_minor_ == 0) {
     return false;
   }
@@ -172,6 +177,13 @@ bool CUDAExecutor::FindOnDiskForComputeCapability(
   return false;
 }
 
+bool GpuExecutor::FindOnDiskForISAVersion(absl::string_view filename,
+                                          absl::string_view canonical_suffix,
+                                          string* found_filename) const {
+  LOG(ERROR)
+      << "Feature not supported on CUDA platform (FindOnDiskForISAVersion)";
+  return false;
+}
 // Returns the path to the running executable.
 // N.B. Derived from //knowledge/smalltalk/background_kb.cc
 // Arg: strip_exe: if true, remove the name of the executable itself from the
@@ -206,12 +218,12 @@ static string GetBinaryDir(bool strip_exe) {
   return exe_path;
 }
 
-bool CUDAExecutor::LoadModuleFromCuBin(const char *cubin, CUmodule *module) {
+bool GpuExecutor::LoadModuleFromCuBin(const char* cubin, CUmodule* module) {
   uint64_t module_refcount;
   std::tie(*module, module_refcount) = gpu_binary_to_module_[cubin];
 
   if (*module == nullptr) {
-    auto load_status = CUDADriver::LoadCubin(context_, cubin, module);
+    auto load_status = GpuDriver::LoadCubin(context_, cubin, module);
     if (!load_status.ok()) {
       LOG(ERROR) << "failed to load CUBIN: " << load_status;
       return false;
@@ -228,12 +240,12 @@ bool CUDAExecutor::LoadModuleFromCuBin(const char *cubin, CUmodule *module) {
   return true;
 }
 
-bool CUDAExecutor::LoadModuleFromPtx(const char *ptx, CUmodule *module) {
+bool GpuExecutor::LoadModuleFromPtx(const char* ptx, CUmodule* module) {
   uint64_t module_refcount;
   std::tie(*module, module_refcount) = gpu_binary_to_module_[ptx];
 
   if (*module == nullptr) {
-    if (!CUDADriver::LoadPtx(context_, ptx, module)) {
+    if (!GpuDriver::LoadPtx(context_, ptx, module)) {
       return false;
     }
     VLOG(3) << "Loaded PTX " << static_cast<const void *>(ptx) << " as module "
@@ -248,9 +260,14 @@ bool CUDAExecutor::LoadModuleFromPtx(const char *ptx, CUmodule *module) {
   return true;
 }
 
-bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
-                             KernelBase *kernel) {
-  CUDAKernel *cuda_kernel = AsCUDAKernel(kernel);
+bool GpuExecutor::LoadModuleFromHsaco(const char* hsaco, CUmodule* module) {
+  LOG(ERROR) << "Feature not supported on CUDA platform (LoadModuleFromHsaco)";
+  return false;
+}
+
+bool GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
+                            KernelBase* kernel) {
+  GpuKernel* cuda_kernel = AsGpuKernel(kernel);
   CUmodule module;
   const string *kernelname;
 
@@ -290,8 +307,8 @@ bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
     return false;
   }
   VLOG(2) << "getting function " << *kernelname << " from module " << module;
-  if (!CUDADriver::GetModuleFunction(context_, module, kernelname->c_str(),
-                                     cuda_kernel->cuda_function_ptr())) {
+  if (!GpuDriver::GetModuleFunction(context_, module, kernelname->c_str(),
+                                    cuda_kernel->gpu_function_ptr())) {
     return false;
   }
 
@@ -308,7 +325,7 @@ bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
   return true;
 }
 
-bool CUDAExecutor::UnloadGpuBinary(const void *gpu_binary) {
+bool GpuExecutor::UnloadGpuBinary(const void* gpu_binary) {
   auto module_it = gpu_binary_to_module_.find(gpu_binary);
   if (gpu_binary_to_module_.end() == module_it) {
     VLOG(3) << "No loaded CUDA module for " << gpu_binary;
@@ -319,13 +336,13 @@ bool CUDAExecutor::UnloadGpuBinary(const void *gpu_binary) {
   VLOG(3) << "Found CUDA module " << module << " with refcount " << refcount;
   if (--refcount == 0) {
     VLOG(3) << "Unloading CUDA module " << module;
-    CUDADriver::UnloadModule(context_, module);
+    GpuDriver::UnloadModule(context_, module);
     gpu_binary_to_module_.erase(module_it);
   }
   return true;
 }
 
-void CUDAExecutor::UnloadKernel(const KernelBase *kernel) {
+void GpuExecutor::UnloadKernel(const KernelBase* kernel) {
   VLOG(3) << "Unloading kernel " << kernel << " : " << kernel->name();
 
   mutex_lock lock{in_memory_modules_mu_};
@@ -341,9 +358,9 @@ void CUDAExecutor::UnloadKernel(const KernelBase *kernel) {
   kernel_to_gpu_binary_.erase(gpu_binary_it);
 }
 
-bool CUDAExecutor::LoadModule(const MultiModuleLoaderSpec &spec,
-                              ModuleHandle *module_handle) {
-  // In CUDAExecutor we store the pointer to the GPU binary (PTX or CUBIN) as
+bool GpuExecutor::LoadModule(const MultiModuleLoaderSpec& spec,
+                             ModuleHandle* module_handle) {
+  // In GpuExecutor we store the pointer to the GPU binary (PTX or CUBIN) as
   // ModuleHandle::id().
   CUmodule cu_module;
   if (spec.has_cuda_cubin_in_memory()) {
@@ -377,25 +394,23 @@ bool CUDAExecutor::LoadModule(const MultiModuleLoaderSpec &spec,
   return false;
 }
 
-bool CUDAExecutor::UnloadModule(ModuleHandle module_handle) {
+bool GpuExecutor::UnloadModule(ModuleHandle module_handle) {
   const char *gpu_binary = reinterpret_cast<const char *>(module_handle.id());
   mutex_lock lock{in_memory_modules_mu_};
   return UnloadGpuBinary(gpu_binary);
 }
 
-bool CUDAExecutor::GetKernelMetadata(CUDAKernel *cuda_kernel,
-                                     KernelMetadata *kernel_metadata) {
+bool GpuExecutor::GetKernelMetadata(GpuKernel* cuda_kernel,
+                                    KernelMetadata* kernel_metadata) {
   int value;
-  if (!CUDADriver::FuncGetAttribute(CU_FUNC_ATTRIBUTE_NUM_REGS,
-                                    *cuda_kernel->cuda_function_ptr(),
-                                    &value)) {
+  if (!GpuDriver::FuncGetAttribute(CU_FUNC_ATTRIBUTE_NUM_REGS,
+                                   *cuda_kernel->gpu_function_ptr(), &value)) {
     return false;
   }
   kernel_metadata->set_registers_per_thread(value);
 
-  if (!CUDADriver::FuncGetAttribute(CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
-                                    *cuda_kernel->cuda_function_ptr(),
-                                    &value)) {
+  if (!GpuDriver::FuncGetAttribute(CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
+                                   *cuda_kernel->gpu_function_ptr(), &value)) {
     return false;
   }
   kernel_metadata->set_shared_memory_bytes(value);
@@ -403,13 +418,13 @@ bool CUDAExecutor::GetKernelMetadata(CUDAKernel *cuda_kernel,
   return true;
 }
 
-bool CUDAExecutor::Launch(Stream *stream, const ThreadDim &thread_dims,
-                          const BlockDim &block_dims, const KernelBase &kernel,
-                          const KernelArgsArrayBase &args) {
+bool GpuExecutor::Launch(Stream* stream, const ThreadDim& thread_dims,
+                         const BlockDim& block_dims, const KernelBase& kernel,
+                         const KernelArgsArrayBase& args) {
   CHECK_EQ(kernel.Arity(), args.number_of_arguments());
-  CUstream custream = AsCUDAStreamValue(stream);
-  const CUDAKernel *cuda_kernel = AsCUDAKernel(&kernel);
-  CUfunction cufunc = cuda_kernel->AsCUDAFunctionValue();
+  CUstream custream = AsGpuStreamValue(stream);
+  const GpuKernel* cuda_kernel = AsGpuKernel(&kernel);
+  CUfunction cufunc = cuda_kernel->AsGpuFunctionHandle();
 
   // Only perform/print the occupancy check once.  Even just checking to see
   // whether we've done an occupancy check on this kernel before isn't free
@@ -426,16 +441,16 @@ bool CUDAExecutor::Launch(Stream *stream, const ThreadDim &thread_dims,
 
   if (cuda_kernel->GetPreferredCacheConfig() !=
       KernelCacheConfig::kNoPreference) {
-    CUDADriver::FuncSetCacheConfig(cufunc, cuda_kernel->GetCUDACacheConfig());
+    GpuDriver::FuncSetCacheConfig(cufunc, cuda_kernel->GetGpuCacheConfig());
   }
 
   void **kernel_params = const_cast<void **>(args.argument_addresses().data());
 
-  if (!CUDADriver::LaunchKernel(context_, cufunc, block_dims.x, block_dims.y,
-                                block_dims.z, thread_dims.x, thread_dims.y,
-                                thread_dims.z, args.number_of_shared_bytes(),
-                                custream, kernel_params,
-                                nullptr /* = extra */)) {
+  if (!GpuDriver::LaunchKernel(context_, cufunc, block_dims.x, block_dims.y,
+                               block_dims.z, thread_dims.x, thread_dims.y,
+                               thread_dims.z, args.number_of_shared_bytes(),
+                               custream, kernel_params,
+                               nullptr /* = extra */)) {
     LOG(ERROR) << "failed to launch CUDA kernel " << kernel.name() << " with "
                << args.number_of_arguments()
                << " args; thread dim: " << thread_dims.ToString()
@@ -449,9 +464,9 @@ bool CUDAExecutor::Launch(Stream *stream, const ThreadDim &thread_dims,
 // This is a non-essential operation; if there's a failure, proceed without
 // logging an error. It's nearly certain that in case of failures, we'd never
 // get here in the first place; these are very low-impact routines.
-void CUDAExecutor::VlogOccupancyInfo(const KernelBase &kernel,
-                                     const ThreadDim &thread_dims,
-                                     const BlockDim &block_dims) {
+void GpuExecutor::VlogOccupancyInfo(const KernelBase& kernel,
+                                    const ThreadDim& thread_dims,
+                                    const BlockDim& block_dims) {
   VLOG(2) << "Computing kernel occupancy for kernel "
           << kernel.demangled_name();
   VLOG(2) << "Thread dimensions (" << thread_dims.x << ", " << thread_dims.y
@@ -470,8 +485,8 @@ void CUDAExecutor::VlogOccupancyInfo(const KernelBase &kernel,
   const DeviceDescription &device_description =
       kernel.parent()->GetDeviceDescription();
 
-  const CUDAKernel *cuda_kernel = AsCUDAKernel(&kernel);
-  CUfunction cufunc = cuda_kernel->AsCUDAFunctionValue();
+  const GpuKernel* cuda_kernel = AsGpuKernel(&kernel);
+  CUfunction cufunc = cuda_kernel->AsGpuFunctionHandle();
 
   int blocks_per_sm = CalculateOccupancy(device_description, regs_per_thread,
                                          smem_per_block, thread_dims, cufunc);
@@ -491,13 +506,14 @@ void CUDAExecutor::VlogOccupancyInfo(const KernelBase &kernel,
 // Compute and return maximum blocks per core (occupancy) based on the
 // device description, some kernel characteristics and the number of threads per
 // block.  If unable to compute occupancy, zero is returned.
-int CUDAExecutor::CalculateOccupancy(
-    const DeviceDescription &device_description, uint64 registers_per_thread,
-    uint64 shared_memory_per_block, const ThreadDim &thread_dims,
-    CUfunction func) {
+int GpuExecutor::CalculateOccupancy(const DeviceDescription& device_description,
+                                    uint64 registers_per_thread,
+                                    uint64 shared_memory_per_block,
+                                    const ThreadDim& thread_dims,
+                                    CUfunction func) {
   int suggested_blocks = 0;
   int suggested_threads = 0;
-  CUresult err = cuOccupancyMaxPotentialBlockSize(
+  CUresult err = tensorflow::wrap::cuOccupancyMaxPotentialBlockSize(
       &suggested_blocks, &suggested_threads, func, nullptr,
       shared_memory_per_block, 0);
   CHECK_EQ(err, CUDA_SUCCESS);
@@ -506,15 +522,15 @@ int CUDAExecutor::CalculateOccupancy(
 
 // Compute and return the suggested thread count to achieve ideal occupancy.
 // If the provided thread dimensions match this number, zero is returned.
-int CUDAExecutor::CompareOccupancy(int *initial_blocks,
-                                   const DeviceDescription &device_description,
-                                   uint64 registers_per_thread,
-                                   uint64 shared_memory_per_block,
-                                   const ThreadDim &thread_dims,
-                                   CUfunction func) {
+int GpuExecutor::CompareOccupancy(int* initial_blocks,
+                                  const DeviceDescription& device_description,
+                                  uint64 registers_per_thread,
+                                  uint64 shared_memory_per_block,
+                                  const ThreadDim& thread_dims,
+                                  CUfunction func) {
   int suggested_blocks = 0;
   int suggested_threads = 0;
-  CUresult err = cuOccupancyMaxPotentialBlockSize(
+  CUresult err = tensorflow::wrap::cuOccupancyMaxPotentialBlockSize(
       &suggested_blocks, &suggested_threads, func, nullptr,
       shared_memory_per_block, 0);
   CHECK_EQ(err, CUDA_SUCCESS);
@@ -526,88 +542,87 @@ int CUDAExecutor::CompareOccupancy(int *initial_blocks,
   }
 }
 
-void *CUDAExecutor::Allocate(uint64 size) {
-  return CUDADriver::DeviceAllocate(context_, size);
+void* GpuExecutor::Allocate(uint64 size) {
+  return GpuDriver::DeviceAllocate(context_, size);
 }
 
-void *CUDAExecutor::AllocateSubBuffer(DeviceMemoryBase *mem,
-                                      uint64 offset_bytes, uint64 size_bytes) {
+void* GpuExecutor::AllocateSubBuffer(DeviceMemoryBase* mem, uint64 offset_bytes,
+                                     uint64 size_bytes) {
   // offset and size are in bytes, so char* works as the pointer type.
   return reinterpret_cast<char *>(mem->opaque()) + offset_bytes;
 }
 
-void CUDAExecutor::Deallocate(DeviceMemoryBase *mem) {
+void GpuExecutor::Deallocate(DeviceMemoryBase* mem) {
   // CUDA "sub-buffers" are just pointer + offset, so no dealloc is necessary.
   if (!mem->is_sub_buffer()) {
-    CUDADriver::DeviceDeallocate(context_, mem->opaque());
+    GpuDriver::DeviceDeallocate(context_, mem->opaque());
   }
 }
 
-bool CUDAExecutor::HostMemoryRegister(void *location, uint64 size) {
+bool GpuExecutor::HostMemoryRegister(void* location, uint64 size) {
   if (location == nullptr || size == 0) {
     LOG(WARNING) << "attempting to register null or zero-sized memory: "
                  << location << "; size " << size;
   }
   VLOG(2) << "registering " << location << " size " << size;
-  return CUDADriver::HostRegister(context_, location, size);
+  return GpuDriver::HostRegister(context_, location, size);
 }
 
-bool CUDAExecutor::HostMemoryUnregister(void *location) {
+bool GpuExecutor::HostMemoryUnregister(void* location) {
   VLOG(2) << "unregistering " << location;
-  return CUDADriver::HostUnregister(context_, location);
+  return GpuDriver::HostUnregister(context_, location);
 }
 
-bool CUDAExecutor::SynchronizeAllActivity() {
-  return CUDADriver::SynchronizeContext(context_);
+bool GpuExecutor::SynchronizeAllActivity() {
+  return GpuDriver::SynchronizeContext(context_);
 }
 
-bool CUDAExecutor::SynchronousMemZero(DeviceMemoryBase *location, uint64 size) {
+bool GpuExecutor::SynchronousMemZero(DeviceMemoryBase* location, uint64 size) {
   if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
       size % 4 == 0) {
-    return CUDADriver::SynchronousMemsetUint32(
+    return GpuDriver::SynchronousMemsetUint32(
         context_, AsCudaDevicePtr(location), 0x0, size / 4);
   }
-  return CUDADriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
-                                            0x0, size);
+  return GpuDriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
+                                           0x0, size);
 }
 
-bool CUDAExecutor::SynchronousMemSet(DeviceMemoryBase *location, int value,
-                                     uint64 size) {
+bool GpuExecutor::SynchronousMemSet(DeviceMemoryBase* location, int value,
+                                    uint64 size) {
   if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
       size % 4 == 0) {
     // cudaMemset reinterprets "value" as a uint8.
     uint8 byte_value = static_cast<uint8>(value);
     uint32 pattern = (byte_value << 24) | (byte_value << 16) |
                      (byte_value << 8) | byte_value;
-    return CUDADriver::SynchronousMemsetUint32(
+    return GpuDriver::SynchronousMemsetUint32(
         context_, AsCudaDevicePtr(location), pattern, size / 4);
   }
-  return CUDADriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
-                                            value, size);
+  return GpuDriver::SynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
+                                           value, size);
 }
 
-port::Status CUDAExecutor::SynchronousMemcpy(DeviceMemoryBase *gpu_dst,
-                                             const void *host_src,
-                                             uint64 size) {
-  return CUDADriver::SynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
-                                          host_src, size);
+port::Status GpuExecutor::SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
+                                            const void* host_src, uint64 size) {
+  return GpuDriver::SynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
+                                         host_src, size);
 }
 
-port::Status CUDAExecutor::SynchronousMemcpy(void *host_dst,
-                                             const DeviceMemoryBase &gpu_src,
-                                             uint64 size) {
-  return CUDADriver::SynchronousMemcpyD2H(context_, host_dst,
-                                          AsCudaDevicePtr(gpu_src), size);
+port::Status GpuExecutor::SynchronousMemcpy(void* host_dst,
+                                            const DeviceMemoryBase& gpu_src,
+                                            uint64 size) {
+  return GpuDriver::SynchronousMemcpyD2H(context_, host_dst,
+                                         AsCudaDevicePtr(gpu_src), size);
 }
 
-port::Status CUDAExecutor::SynchronousMemcpyDeviceToDevice(
-    DeviceMemoryBase *gpu_dst, const DeviceMemoryBase &gpu_src, uint64 size) {
-  return CUDADriver::SynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
-                                          AsCudaDevicePtr(gpu_src), size);
+port::Status GpuExecutor::SynchronousMemcpyDeviceToDevice(
+    DeviceMemoryBase* gpu_dst, const DeviceMemoryBase& gpu_src, uint64 size) {
+  return GpuDriver::SynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
+                                         AsCudaDevicePtr(gpu_src), size);
 }
 
-bool CUDAExecutor::MemZero(Stream *stream, DeviceMemoryBase *location,
-                           uint64 size) {
+bool GpuExecutor::MemZero(Stream* stream, DeviceMemoryBase* location,
+                          uint64 size) {
   if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
       size % 4 == 0) {
     return Memset32(stream, location, 0x0, size);
@@ -616,88 +631,87 @@ bool CUDAExecutor::MemZero(Stream *stream, DeviceMemoryBase *location,
   }
 }
 
-bool CUDAExecutor::Memset(Stream *stream, DeviceMemoryBase *location,
-                           uint8 pattern, uint64 size) {
+bool GpuExecutor::Memset(Stream* stream, DeviceMemoryBase* location,
+                         uint8 pattern, uint64 size) {
   VLOG(2) << "enqueueing memset8 operation onto stream " << stream
           << " at location " << location << " with size " << size
           << " and pattern " << std::hex << pattern;
-  return CUDADriver::AsynchronousMemsetUint8(
-      context_, AsCudaDevicePtr(location), pattern, size,
-      AsCUDAStreamValue(stream));
+  return GpuDriver::AsynchronousMemsetUint8(context_, AsCudaDevicePtr(location),
+                                            pattern, size,
+                                            AsGpuStreamValue(stream));
 }
 
-bool CUDAExecutor::Memset32(Stream *stream, DeviceMemoryBase *location,
-                            uint32 pattern, uint64 size) {
+bool GpuExecutor::Memset32(Stream* stream, DeviceMemoryBase* location,
+                           uint32 pattern, uint64 size) {
   VLOG(2) << "enqueueing memset32 operation onto stream " << stream
           << " at location " << location << " with size " << size
           << " and pattern " << std::hex << pattern;
   CHECK(reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
         size % 4 == 0);
-  return CUDADriver::AsynchronousMemsetUint32(
+  return GpuDriver::AsynchronousMemsetUint32(
       context_, AsCudaDevicePtr(location), pattern, size / 4,
-      AsCUDAStreamValue(stream));
+      AsGpuStreamValue(stream));
 }
 
-bool CUDAExecutor::Memcpy(Stream *stream, void *host_dst,
-                          const DeviceMemoryBase &gpu_src, uint64 size) {
-  return CUDADriver::AsynchronousMemcpyD2H(context_, host_dst,
-                                           AsCudaDevicePtr(gpu_src), size,
-                                           AsCUDAStreamValue(stream));
+bool GpuExecutor::Memcpy(Stream* stream, void* host_dst,
+                         const DeviceMemoryBase& gpu_src, uint64 size) {
+  return GpuDriver::AsynchronousMemcpyD2H(context_, host_dst,
+                                          AsCudaDevicePtr(gpu_src), size,
+                                          AsGpuStreamValue(stream));
 }
 
-bool CUDAExecutor::Memcpy(Stream *stream, DeviceMemoryBase *gpu_dst,
-                          const void *host_src, uint64 size) {
-  return CUDADriver::AsynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
-                                           host_src, size,
-                                           AsCUDAStreamValue(stream));
+bool GpuExecutor::Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst,
+                         const void* host_src, uint64 size) {
+  return GpuDriver::AsynchronousMemcpyH2D(context_, AsCudaDevicePtr(gpu_dst),
+                                          host_src, size,
+                                          AsGpuStreamValue(stream));
 }
 
-bool CUDAExecutor::MemcpyDeviceToDevice(Stream *stream,
-                                        DeviceMemoryBase *gpu_dst,
-                                        const DeviceMemoryBase &gpu_src,
-                                        uint64 size) {
-  return CUDADriver::AsynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
-                                           AsCudaDevicePtr(gpu_src), size,
-                                           AsCUDAStreamValue(stream));
+bool GpuExecutor::MemcpyDeviceToDevice(Stream* stream,
+                                       DeviceMemoryBase* gpu_dst,
+                                       const DeviceMemoryBase& gpu_src,
+                                       uint64 size) {
+  return GpuDriver::AsynchronousMemcpyD2D(context_, AsCudaDevicePtr(gpu_dst),
+                                          AsCudaDevicePtr(gpu_src), size,
+                                          AsGpuStreamValue(stream));
 }
 
-bool CUDAExecutor::HostCallback(Stream *stream,
-                                std::function<port::Status()> callback) {
+bool GpuExecutor::HostCallback(Stream* stream,
+                               std::function<port::Status()> callback) {
   auto callback_ptr = new std::function<void()>([callback]() {
     port::Status s = callback();
     if (!s.ok()) {
       LOG(WARNING) << "Host callback failed: " << s;
     }
   });
-  return CUDADriver::AddStreamCallback(context_, AsCUDAStreamValue(stream),
-                                       InternalHostCallback, callback_ptr);
+  return GpuDriver::AddStreamCallback(context_, AsGpuStreamValue(stream),
+                                      InternalHostCallback, callback_ptr);
 }
 
-/* static */ void CUDAExecutor::InternalHostCallback(CUstream stream,
-                                                     CUresult status,
-                                                     void *data) {
+/* static */ void GpuExecutor::InternalHostCallback(CUstream stream,
+                                                    CUresult status,
+                                                    void* data) {
   std::function<void()> *callback =
       reinterpret_cast<std::function<void()> *>(data);
   (*callback)();
   delete callback;
 }
 
-port::Status CUDAExecutor::AllocateEvent(Event *event) {
-  return AsCUDAEvent(event)->Init();
+port::Status GpuExecutor::AllocateEvent(Event* event) {
+  return AsGpuEvent(event)->Init();
 }
 
-port::Status CUDAExecutor::DeallocateEvent(Event *event) {
-  return AsCUDAEvent(event)->Destroy();
+port::Status GpuExecutor::DeallocateEvent(Event* event) {
+  return AsGpuEvent(event)->Destroy();
 }
 
-port::Status CUDAExecutor::RecordEvent(Stream *stream, Event *event) {
-  return AsCUDAEvent(event)->Record(AsCUDAStream(stream));
+port::Status GpuExecutor::RecordEvent(Stream* stream, Event* event) {
+  return AsGpuEvent(event)->Record(AsGpuStream(stream));
 }
 
-port::Status CUDAExecutor::WaitForEvent(Stream *stream, Event *event) {
-  if (CUDADriver::WaitStreamOnEvent(context_,
-                                    AsCUDAStream(stream)->cuda_stream(),
-                                    AsCUDAEvent(event)->cuda_event())) {
+port::Status GpuExecutor::WaitForEvent(Stream* stream, Event* event) {
+  if (GpuDriver::WaitStreamOnEvent(context_, AsGpuStream(stream)->gpu_stream(),
+                                   AsGpuEvent(event)->gpu_event())) {
     return port::Status::OK();
   } else {
     return port::Status(
@@ -707,61 +721,61 @@ port::Status CUDAExecutor::WaitForEvent(Stream *stream, Event *event) {
   }
 }
 
-Event::Status CUDAExecutor::PollForEventStatus(Event *event) {
-  return AsCUDAEvent(event)->PollForStatus();
+Event::Status GpuExecutor::PollForEventStatus(Event* event) {
+  return AsGpuEvent(event)->PollForStatus();
 }
 
-bool CUDAExecutor::AllocateStream(Stream *stream) {
-  return AsCUDAStream(stream)->Init();
+bool GpuExecutor::AllocateStream(Stream* stream) {
+  return AsGpuStream(stream)->Init();
 }
 
-void CUDAExecutor::DeallocateStream(Stream *stream) {
-  CUDAStream *cuda_stream = AsCUDAStream(stream);
+void GpuExecutor::DeallocateStream(Stream* stream) {
+  GpuStream* cuda_stream = AsGpuStream(stream);
   if (!cuda_stream->IsIdle()) {
     LOG(ERROR) << "Deallocating stream with pending work";
   }
   cuda_stream->Destroy();
 }
 
-bool CUDAExecutor::AllocateTimer(Timer *timer) {
-  return AsCUDATimer(timer)->Init();
+bool GpuExecutor::AllocateTimer(Timer* timer) {
+  return AsGpuTimer(timer)->Init();
 }
 
-void CUDAExecutor::DeallocateTimer(Timer *timer) {
-  AsCUDATimer(timer)->Destroy();
+void GpuExecutor::DeallocateTimer(Timer* timer) {
+  AsGpuTimer(timer)->Destroy();
 }
 
-bool CUDAExecutor::CreateStreamDependency(Stream *dependent, Stream *other) {
-  CUevent other_completed_event = *AsCUDAStream(other)->completed_event();
-  bool ok = CUDADriver::RecordEvent(context_, other_completed_event,
-                                    AsCUDAStreamValue(other))
-      .ok();
+bool GpuExecutor::CreateStreamDependency(Stream* dependent, Stream* other) {
+  CUevent other_completed_event = *AsGpuStream(other)->completed_event();
+  bool ok = GpuDriver::RecordEvent(context_, other_completed_event,
+                                   AsGpuStreamValue(other))
+                .ok();
   if (!ok) {
     LOG(ERROR) << "failed to record completion event; "
                   "therefore, failed to create inter-stream dependency";
     return false;
   }
 
-  return CUDADriver::WaitStreamOnEvent(context_, AsCUDAStreamValue(dependent),
-                                       other_completed_event);
+  return GpuDriver::WaitStreamOnEvent(context_, AsGpuStreamValue(dependent),
+                                      other_completed_event);
 }
 
-bool CUDAExecutor::StartTimer(Stream *stream, Timer *timer) {
-  return AsCUDATimer(timer)->Start(AsCUDAStream(stream));
+bool GpuExecutor::StartTimer(Stream* stream, Timer* timer) {
+  return AsGpuTimer(timer)->Start(AsGpuStream(stream));
 }
 
-bool CUDAExecutor::StopTimer(Stream *stream, Timer *timer) {
-  return AsCUDATimer(timer)->Stop(AsCUDAStream(stream));
+bool GpuExecutor::StopTimer(Stream* stream, Timer* timer) {
+  return AsGpuTimer(timer)->Stop(AsGpuStream(stream));
 }
 
-port::Status CUDAExecutor::BlockHostUntilDone(Stream *stream) {
-  return CUDADriver::SynchronizeStream(context_, AsCUDAStreamValue(stream));
+port::Status GpuExecutor::BlockHostUntilDone(Stream* stream) {
+  return GpuDriver::SynchronizeStream(context_, AsGpuStreamValue(stream));
 }
 
-blas::BlasSupport *CUDAExecutor::CreateBlas() {
+blas::BlasSupport* GpuExecutor::CreateBlas() {
   PluginRegistry *registry = PluginRegistry::Instance();
   port::StatusOr<PluginRegistry::BlasFactory> status =
-      registry->GetFactory<PluginRegistry::BlasFactory>(kCudaPlatformId,
+      registry->GetFactory<PluginRegistry::BlasFactory>(cuda::kCudaPlatformId,
                                                         plugin_config_.blas());
   if (!status.ok()) {
     LOG(ERROR) << "Unable to retrieve BLAS factory: "
@@ -772,10 +786,10 @@ blas::BlasSupport *CUDAExecutor::CreateBlas() {
   return status.ValueOrDie()(this);
 }
 
-dnn::DnnSupport *CUDAExecutor::CreateDnn() {
+dnn::DnnSupport* GpuExecutor::CreateDnn() {
   PluginRegistry *registry = PluginRegistry::Instance();
   port::StatusOr<PluginRegistry::DnnFactory> status =
-      registry->GetFactory<PluginRegistry::DnnFactory>(kCudaPlatformId,
+      registry->GetFactory<PluginRegistry::DnnFactory>(cuda::kCudaPlatformId,
                                                        plugin_config_.dnn());
   if (!status.ok()) {
     LOG(ERROR) << "Unable to retrieve DNN factory: "
@@ -786,10 +800,10 @@ dnn::DnnSupport *CUDAExecutor::CreateDnn() {
   return status.ValueOrDie()(this);
 }
 
-fft::FftSupport *CUDAExecutor::CreateFft() {
+fft::FftSupport* GpuExecutor::CreateFft() {
   PluginRegistry *registry = PluginRegistry::Instance();
   port::StatusOr<PluginRegistry::FftFactory> status =
-      registry->GetFactory<PluginRegistry::FftFactory>(kCudaPlatformId,
+      registry->GetFactory<PluginRegistry::FftFactory>(cuda::kCudaPlatformId,
                                                        plugin_config_.fft());
   if (!status.ok()) {
     LOG(ERROR) << "Unable to retrieve FFT factory: "
@@ -800,10 +814,10 @@ fft::FftSupport *CUDAExecutor::CreateFft() {
   return status.ValueOrDie()(this);
 }
 
-rng::RngSupport *CUDAExecutor::CreateRng() {
+rng::RngSupport* GpuExecutor::CreateRng() {
   PluginRegistry *registry = PluginRegistry::Instance();
   port::StatusOr<PluginRegistry::RngFactory> status =
-      registry->GetFactory<PluginRegistry::RngFactory>(kCudaPlatformId,
+      registry->GetFactory<PluginRegistry::RngFactory>(cuda::kCudaPlatformId,
                                                        plugin_config_.rng());
   if (!status.ok()) {
     LOG(ERROR) << "Unable to retrieve RNG factory: "
@@ -815,23 +829,21 @@ rng::RngSupport *CUDAExecutor::CreateRng() {
 }
 
 // TODO(rspringer): Remove in b/18544742.
-bool CUDAExecutor::SupportsDnn() const {
-  return true;
-}
+bool GpuExecutor::SupportsDnn() const { return true; }
 
-bool CUDAExecutor::CanEnablePeerAccessTo(StreamExecutorInterface *other) {
-  CUDAExecutor *cuda_other = static_cast<CUDAExecutor *>(other);
-  return CUDADriver::CanEnablePeerAccess(context_, cuda_other->context_);
+bool GpuExecutor::CanEnablePeerAccessTo(StreamExecutorInterface* other) {
+  GpuExecutor* cuda_other = static_cast<GpuExecutor*>(other);
+  return GpuDriver::CanEnablePeerAccess(context_, cuda_other->context_);
 }
 
-port::Status CUDAExecutor::EnablePeerAccessTo(StreamExecutorInterface *other) {
-  CUDAExecutor *cuda_other = static_cast<CUDAExecutor *>(other);
-  return CUDADriver::EnablePeerAccess(context_, cuda_other->context_);
+port::Status GpuExecutor::EnablePeerAccessTo(StreamExecutorInterface* other) {
+  GpuExecutor* cuda_other = static_cast<GpuExecutor*>(other);
+  return GpuDriver::EnablePeerAccess(context_, cuda_other->context_);
 }
 
-SharedMemoryConfig CUDAExecutor::GetDeviceSharedMemoryConfig() {
+SharedMemoryConfig GpuExecutor::GetDeviceSharedMemoryConfig() {
   port::StatusOr<CUsharedconfig> cuda_config =
-      CUDADriver::ContextGetSharedMemConfig(context_);
+      GpuDriver::ContextGetSharedMemConfig(context_);
   if (!cuda_config.ok()) {
     // Don't log; the failed call will log necessary output.
     return SharedMemoryConfig::kDefault;
@@ -850,7 +862,7 @@ SharedMemoryConfig CUDAExecutor::GetDeviceSharedMemoryConfig() {
   }
 }
 
-port::Status CUDAExecutor::SetDeviceSharedMemoryConfig(
+port::Status GpuExecutor::SetDeviceSharedMemoryConfig(
     SharedMemoryConfig config) {
   CUsharedconfig cuda_config;
   switch (config) {
@@ -867,21 +879,21 @@ port::Status CUDAExecutor::SetDeviceSharedMemoryConfig(
       LOG(FATAL) << "Invalid shared memory configuration specified: "
                  << static_cast<int>(config);
   }
-  return CUDADriver::ContextSetSharedMemConfig(context_, cuda_config);
+  return GpuDriver::ContextSetSharedMemConfig(context_, cuda_config);
 }
 
-bool CUDAExecutor::DeviceMemoryUsage(int64 *free, int64 *total) const {
-  return CUDADriver::GetDeviceMemoryInfo(context_, free, total);
+bool GpuExecutor::DeviceMemoryUsage(int64* free, int64* total) const {
+  return GpuDriver::GetDeviceMemoryInfo(context_, free, total);
 }
 
-bool CUDAExecutor::GetSymbol(const string &symbol_name,
-                             ModuleHandle module_handle, void **mem,
-                             size_t *bytes) {
+bool GpuExecutor::GetSymbol(const string& symbol_name,
+                            ModuleHandle module_handle, void** mem,
+                            size_t* bytes) {
   auto lookup_in_module = [&](CUmodule module) {
     CHECK(module != nullptr);
-    return CUDADriver::GetModuleSymbol(context_, module, symbol_name.c_str(),
-                                       reinterpret_cast<CUdeviceptr *>(mem),
-                                       bytes);
+    return GpuDriver::GetModuleSymbol(context_, module, symbol_name.c_str(),
+                                      reinterpret_cast<CUdeviceptr*>(mem),
+                                      bytes);
   };
 
   {  // give limited scope to mutex_lock
@@ -903,13 +915,13 @@ bool CUDAExecutor::GetSymbol(const string &symbol_name,
   return false;
 }
 
-bool CUDAExecutor::FillBlockDimLimit(BlockDim *block_dim_limit) const {
+bool GpuExecutor::FillBlockDimLimit(BlockDim* block_dim_limit) const {
   // The BlockDim name is a mismatch against these GRID_DIM_* queries because
   // we use BlockDims to express the dimensions of blocks within a grid
   // (as opposed to ThreadDim which expresses the dimensions of threads
   // within a block).
   int x, y, z;
-  if (!CUDADriver::GetGridLimits(&x, &y, &z, device_)) {
+  if (!GpuDriver::GetGridLimits(&x, &y, &z, device_)) {
     return false;
   }
 
@@ -919,35 +931,35 @@ bool CUDAExecutor::FillBlockDimLimit(BlockDim *block_dim_limit) const {
   return true;
 }
 
-bool CUDAExecutor::SupportsBlas() const { return true; }
+bool GpuExecutor::SupportsBlas() const { return true; }
 
-bool CUDAExecutor::SupportsFft() const { return true; }
+bool GpuExecutor::SupportsFft() const { return true; }
 
-bool CUDAExecutor::SupportsRng() const { return true; }
+bool GpuExecutor::SupportsRng() const { return true; }
 
 std::unique_ptr<internal::EventInterface>
-CUDAExecutor::CreateEventImplementation() {
-  return std::unique_ptr<internal::EventInterface>(new CUDAEvent(this));
+GpuExecutor::CreateEventImplementation() {
+  return std::unique_ptr<internal::EventInterface>(new GpuEvent(this));
 }
 
 std::unique_ptr<internal::KernelInterface>
-CUDAExecutor::CreateKernelImplementation() {
-  return std::unique_ptr<internal::KernelInterface>(new CUDAKernel());
+GpuExecutor::CreateKernelImplementation() {
+  return std::unique_ptr<internal::KernelInterface>(new GpuKernel());
 }
 
 std::unique_ptr<internal::StreamInterface>
-CUDAExecutor::GetStreamImplementation() {
-  return std::unique_ptr<internal::StreamInterface>(new CUDAStream(this));
+GpuExecutor::GetStreamImplementation() {
+  return std::unique_ptr<internal::StreamInterface>(new GpuStream(this));
 }
 
 std::unique_ptr<internal::TimerInterface>
-CUDAExecutor::GetTimerImplementation() {
-  return std::unique_ptr<internal::TimerInterface>(new CUDATimer(this));
+GpuExecutor::GetTimerImplementation() {
+  return std::unique_ptr<internal::TimerInterface>(new GpuTimer(this));
 }
 
-void *CUDAExecutor::GpuContextHack() { return context_; }
+void* GpuExecutor::GpuContextHack() { return context_; }
 
-CudaContext* CUDAExecutor::cuda_context() { return context_; }
+GpuContext* GpuExecutor::gpu_context() { return context_; }
 
 // Attempts to read the NUMA node corresponding to the GPU device's PCI bus out
 // of SysFS. Returns -1 if it cannot.
@@ -1014,21 +1026,21 @@ static int TryToReadNumaNode(const string &pci_bus_id, int device_ordinal) {
 #endif
 }
 
-
-DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
+DeviceDescription* GpuExecutor::PopulateDeviceDescription() const {
   internal::DeviceDescriptionBuilder builder;
 
   {
     int driver_version = 0;
-    (void)CUDADriver::GetDriverVersion(&driver_version);
+    (void)GpuDriver::GetDriverVersion(&driver_version);
     string augmented_driver_version = port::Printf(
         "%d (%s)", driver_version,
-        DriverVersionStatusToString(Diagnostician::FindDsoVersion()).c_str());
+        cuda::DriverVersionStatusToString(Diagnostician::FindDsoVersion())
+            .c_str());
     builder.set_driver_version(augmented_driver_version);
   }
 
   {
-    string pci_bus_id = CUDADriver::GetPCIBusID(device_);
+    string pci_bus_id = GpuDriver::GetPCIBusID(device_);
 
     // Lower the hex characters to match sysfs.
     pci_bus_id = port::Lowercase(pci_bus_id);
@@ -1039,35 +1051,45 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
     builder.set_numa_node(numa_node);
   }
 
-  CUdevprop prop;
-  if (CUDADriver::GetDeviceProperties(&prop, device_ordinal_)) {
-    builder.set_threads_per_block_limit(prop.maxThreadsPerBlock);
+  {
+    builder.set_threads_per_block_limit(
+        GpuDriver::GetDeviceAttribute(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
+                                      device_)
+            .ValueOrDie());
 
     ThreadDim thread_dim_limit;
-    thread_dim_limit.x = prop.maxThreadsDim[0];
-    thread_dim_limit.y = prop.maxThreadsDim[1];
-    thread_dim_limit.z = prop.maxThreadsDim[2];
+    thread_dim_limit.x = GpuDriver::GetDeviceAttribute(
+                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, device_)
+                             .ValueOrDie();
+    thread_dim_limit.y = GpuDriver::GetDeviceAttribute(
+                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, device_)
+                             .ValueOrDie();
+    thread_dim_limit.z = GpuDriver::GetDeviceAttribute(
+                             CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, device_)
+                             .ValueOrDie();
     builder.set_thread_dim_limit(thread_dim_limit);
 
-    float clock_rate_ghz = static_cast<float>(prop.clockRate) / 1e6;
-    builder.set_clock_rate_ghz(clock_rate_ghz);
+    int clock_rate =
+        GpuDriver::GetDeviceAttribute(CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device_)
+            .ValueOrDie();
+    builder.set_clock_rate_ghz(static_cast<float>(clock_rate) / 1e6);
   }
 
   {
     bool ecc_enabled = false;
-    (void)CUDADriver::IsEccEnabled(device_, &ecc_enabled);
+    (void)GpuDriver::IsEccEnabled(device_, &ecc_enabled);
     builder.set_ecc_enabled(ecc_enabled);
   }
 
   {
     uint64 device_memory_size = -1;
-    (void)CUDADriver::GetDeviceTotalMemory(device_, &device_memory_size);
+    (void)GpuDriver::GetDeviceTotalMemory(device_, &device_memory_size);
     builder.set_device_memory_size(device_memory_size);
   }
 
-  port::StatusOr<int> mem_clock_khz = CUDADriver::GetDeviceAttribute(
+  port::StatusOr<int> mem_clock_khz = GpuDriver::GetDeviceAttribute(
       CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device_ordinal_);
-  port::StatusOr<int> mem_bus_width_bits = CUDADriver::GetDeviceAttribute(
+  port::StatusOr<int> mem_bus_width_bits = GpuDriver::GetDeviceAttribute(
       CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, device_ordinal_);
   if (mem_clock_khz.ok() && mem_bus_width_bits.ok()) {
     // Times 2 because HBM is DDR memory; it gets two data bits per each data
@@ -1085,7 +1107,7 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
 
   {
     string device_name;
-    (void)CUDADriver::GetDeviceName(device_, &device_name);
+    (void)GpuDriver::GetDeviceName(device_, &device_name);
     builder.set_name(device_name);
   }
 
@@ -1099,19 +1121,19 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
   builder.set_device_vendor("NVIDIA Corporation");
   builder.set_cuda_compute_capability(cc_major_, cc_minor_);
   builder.set_shared_memory_per_core(
-      CUDADriver::GetMaxSharedMemoryPerCore(device_).ValueOrDie());
+      GpuDriver::GetMaxSharedMemoryPerCore(device_).ValueOrDie());
   builder.set_shared_memory_per_block(
-      CUDADriver::GetMaxSharedMemoryPerBlock(device_).ValueOrDie());
+      GpuDriver::GetMaxSharedMemoryPerBlock(device_).ValueOrDie());
   builder.set_core_count(
-      CUDADriver::GetMultiprocessorCount(device_).ValueOrDie());
+      GpuDriver::GetMultiprocessorCount(device_).ValueOrDie());
   builder.set_threads_per_core_limit(
-      CUDADriver::GetMaxThreadsPerMultiprocessor(device_).ValueOrDie());
+      GpuDriver::GetMaxThreadsPerMultiprocessor(device_).ValueOrDie());
   builder.set_registers_per_block_limit(
-      CUDADriver::GetMaxRegistersPerBlock(device_).ValueOrDie());
+      GpuDriver::GetMaxRegistersPerBlock(device_).ValueOrDie());
   builder.set_threads_per_warp(
-      CUDADriver::GetThreadsPerWarp(device_).ValueOrDie());
+      GpuDriver::GetThreadsPerWarp(device_).ValueOrDie());
   builder.set_registers_per_core_limit(
-      CUDADriver::GetDeviceAttribute(
+      GpuDriver::GetDeviceAttribute(
           CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR, device_)
           .ValueOrDie());
 
@@ -1119,11 +1141,11 @@ DeviceDescription *CUDAExecutor::PopulateDeviceDescription() const {
   return built.release();
 }
 
-}  // namespace cuda
+}  // namespace gpu
 
 void initialize_cuda_gpu_executor() {
-  *internal::MakeCUDAExecutorImplementation() = [](const PluginConfig &config) {
-    return new cuda::CUDAExecutor{config};
+  *internal::MakeCUDAExecutorImplementation() = [](const PluginConfig& config) {
+    return new gpu::GpuExecutor{config};
   };
 }
 
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
index ae8e4abf92024626bf3d2bd3d334244708f55737..9d02c7516cfd9aa1e86a7e534e41d54f8d8e5de3 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
@@ -22,289 +22,12 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_
 #define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_GPU_EXECUTOR_H_
 
-#include <set>
-#include <unordered_map>
-
-#include "absl/strings/string_view.h"
-#include "tensorflow/stream_executor/cuda/cuda_kernel.h"
-#include "tensorflow/stream_executor/event.h"
-#include "tensorflow/stream_executor/lib/status.h"
-#include "tensorflow/stream_executor/lib/statusor.h"
-#include "tensorflow/stream_executor/platform.h"
-#include "tensorflow/stream_executor/platform/mutex.h"
-#include "tensorflow/stream_executor/platform/port.h"
-#include "tensorflow/stream_executor/platform/thread_annotations.h"
-#include "tensorflow/stream_executor/stream_executor_internal.h"
+#include "tensorflow/stream_executor/gpu/gpu_executor.h"
 
 namespace stream_executor {
 namespace cuda {
 
-// CUDA-platform implementation of the platform-agnostic
-// StreamExecutorInferface.
-class CUDAExecutor : public internal::StreamExecutorInterface {
- public:
-  // sub_platform indicates the subplatform used in this executor; it must
-  // be a CUDA type.
-  explicit CUDAExecutor(const PluginConfig &plugin_config)
-      : device_(0),
-        context_(nullptr),
-        device_ordinal_(0),
-        cc_major_(0),
-        cc_minor_(0),
-        plugin_config_(plugin_config) {}
-
-  // See the corresponding StreamExecutor methods for method comments on the
-  // following overrides.
-
-  ~CUDAExecutor() override;
-
-  port::Status Init(int device_ordinal, DeviceOptions device_options) override;
-
-  bool GetKernel(const MultiKernelLoaderSpec &spec,
-                 KernelBase *kernel) override;
-  void UnloadKernel(const KernelBase *kernel) override;
-  bool LoadModule(const MultiModuleLoaderSpec &spec,
-                  ModuleHandle *module_handle) override;
-  bool UnloadModule(ModuleHandle module_handle) override;
-
-  bool Launch(Stream *stream, const ThreadDim &thread_dims,
-              const BlockDim &block_dims, const KernelBase &k,
-              const KernelArgsArrayBase &args) override;
-
-  int CalculateOccupancy(const DeviceDescription &device_description,
-                         uint64 registers_per_thread,
-                         uint64 shared_memory_per_block,
-                         const ThreadDim &thread_dims, CUfunction func);
-
-  int CompareOccupancy(int *initial_blocks,
-                       const DeviceDescription &device_description,
-                       uint64 registers_per_thread,
-                       uint64 shared_memory_per_block,
-                       const ThreadDim &thread_dims, CUfunction func);
-
-  void *Allocate(uint64 size) override;
-
-  void *AllocateSubBuffer(DeviceMemoryBase *mem, uint64 offset_bytes,
-                          uint64 size_bytes) override;
-
-  void Deallocate(DeviceMemoryBase *mem) override;
-
-  void *UnifiedMemoryAllocate(uint64 size) override {
-    return CUDADriver::UnifiedMemoryAllocate(context_, size);
-  }
-
-  void UnifiedMemoryDeallocate(void *location) override {
-    return CUDADriver::UnifiedMemoryDeallocate(context_, location);
-  }
-
-  // CUDA allocation/registration functions are necessary because the driver
-  // internally sets up buffers for DMA operations (and page locks them).
-  // There's no external interface for us to otherwise control these DMA
-  // settings.
-  void *HostMemoryAllocate(uint64 size) override {
-    return CUDADriver::HostAllocate(context_, size);
-  }
-
-  void HostMemoryDeallocate(void *location) override {
-    return CUDADriver::HostDeallocate(context_, location);
-  }
-
-  bool HostMemoryRegister(void *location, uint64 size) override;
-
-  bool HostMemoryUnregister(void *location) override;
-
-  bool SynchronizeAllActivity() override;
-
-  bool SynchronousMemZero(DeviceMemoryBase *location, uint64 size) override;
-
-  bool SynchronousMemSet(DeviceMemoryBase *location, int value,
-                         uint64 size) override;
-
-  port::Status SynchronousMemcpy(DeviceMemoryBase *gpu_dst,
-                                 const void *host_src, uint64 size) override;
-
-  port::Status SynchronousMemcpy(void *host_dst,
-                                 const DeviceMemoryBase &gpu_src,
-                                 uint64 size) override;
-
-  port::Status SynchronousMemcpyDeviceToDevice(DeviceMemoryBase *gpu_dst,
-                                               const DeviceMemoryBase &gpu_src,
-                                               uint64 size) override;
-
-  bool MemZero(Stream *stream, DeviceMemoryBase *location,
-               uint64 size) override;
-  bool Memset(Stream *stream, DeviceMemoryBase *location, uint8 pattern,
-              uint64 size) override;
-  bool Memset32(Stream *stream, DeviceMemoryBase *location, uint32 pattern,
-                uint64 size) override;
-
-  bool Memcpy(Stream *stream, void *host_dst, const DeviceMemoryBase &gpu_src,
-              uint64 size) override;
-
-  bool Memcpy(Stream *stream, DeviceMemoryBase *gpu_dst, const void *host_src,
-              uint64 size) override;
-
-  bool MemcpyDeviceToDevice(Stream *stream, DeviceMemoryBase *gpu_dst,
-                            const DeviceMemoryBase &gpu_src,
-                            uint64 size) override;
-
-  bool HostCallback(Stream *stream,
-                    std::function<port::Status()> callback) override;
-
-  bool AllocateStream(Stream *stream) override;
-
-  void DeallocateStream(Stream *stream) override;
-
-  bool CreateStreamDependency(Stream *dependent, Stream *other) override;
-
-  bool AllocateTimer(Timer *timer) override;
-
-  void DeallocateTimer(Timer *timer) override;
-
-  bool StartTimer(Stream *stream, Timer *timer) override;
-
-  bool StopTimer(Stream *stream, Timer *timer) override;
-
-  port::Status AllocateEvent(Event *event) override;
-
-  port::Status DeallocateEvent(Event *event) override;
-
-  port::Status RecordEvent(Stream *stream, Event *event) override;
-
-  port::Status WaitForEvent(Stream *stream, Event *event) override;
-
-  Event::Status PollForEventStatus(Event *event) override;
-
-  port::Status BlockHostUntilDone(Stream *stream) override;
-
-  int PlatformDeviceCount() override { return CUDADriver::GetDeviceCount(); }
-
-  port::Status EnablePeerAccessTo(StreamExecutorInterface *other) override;
-
-  bool CanEnablePeerAccessTo(StreamExecutorInterface *other) override;
-
-  SharedMemoryConfig GetDeviceSharedMemoryConfig() override;
-
-  port::Status SetDeviceSharedMemoryConfig(SharedMemoryConfig config) override;
-
-  bool DeviceMemoryUsage(int64 *free, int64 *total) const override;
-
-  // Search for the symbol and returns a device pointer and size.
-  // Returns false if symbol does not exist.
-  bool GetSymbol(const string &symbol_name, ModuleHandle module_handle,
-                 void **mem, size_t *bytes) override;
-
-  DeviceDescription *PopulateDeviceDescription() const override;
-
-  // Populates the block_dim_limit by querying the device driver API. If an
-  // error occurs at any point while asking the driver for block dim limits, it
-  // will be only partially populated as a result, and an error will be logged.
-  bool FillBlockDimLimit(BlockDim *block_dim_limit) const;
-
-  bool SupportsBlas() const override;
-
-  blas::BlasSupport *CreateBlas() override;
-
-  bool SupportsFft() const override;
-
-  fft::FftSupport *CreateFft() override;
-
-  bool SupportsRng() const override;
-
-  rng::RngSupport *CreateRng() override;
-
-  bool SupportsDnn() const override;
-
-  dnn::DnnSupport *CreateDnn() override;
-
-  std::unique_ptr<internal::EventInterface> CreateEventImplementation()
-      override;
-
-  std::unique_ptr<internal::KernelInterface> CreateKernelImplementation()
-      override;
-
-  std::unique_ptr<internal::StreamInterface> GetStreamImplementation() override;
-
-  std::unique_ptr<internal::TimerInterface> GetTimerImplementation() override;
-
-  void *GpuContextHack() override;
-
-  CudaContext* cuda_context();
-
- private:
-  // Attempts to find a more specific version of the file indicated by
-  // filename by looking for compute-capability-specific suffixed versions; i.e.
-  // looking for "foo.ptx" will check to see if "foo.ptx.cc30.ptx" is present if
-  // we're on a compute capability 3.0 machine.
-  bool FindOnDiskForComputeCapability(absl::string_view filename,
-                                      absl::string_view canonical_suffix,
-                                      string *found_filename) const;
-
-  // Host callback landing routine invoked by CUDA.
-  // data: User-provided callback provided to HostCallback() above, captured
-  //       as a std::function<void()>. Allocated/initialized inside
-  //       HostCallback() and owned and deleted by this call.
-  static void InternalHostCallback(CUstream stream, CUresult status,
-                                   void *data);
-
-  // Collects metadata for the specified kernel.
-  bool GetKernelMetadata(CUDAKernel *cuda_kernel,
-                         KernelMetadata *kernel_metadata);
-
-  // Prints to VLOG(2) information about the kernel's occupancy and how it might
-  // be improved.
-  void VlogOccupancyInfo(const KernelBase &kernel, const ThreadDim &thread_dims,
-                         const BlockDim &block_dims);
-
-  bool LoadModuleFromCuBin(const char *cubin, CUmodule *module)
-      EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
-
-  // Loads the PTX text `ptx` as a CUDA module.  `ptx` must be null terminated.
-  bool LoadModuleFromPtx(const char *ptx, CUmodule *module)
-      EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
-
-  bool UnloadGpuBinary(const void *gpu_binary)
-      EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
-
-  // Guards the in-memory-module mapping.
-  mutex in_memory_modules_mu_;
-
-  // Kernel -> loaded GPU binary. Many kernels may load the same binary.
-  std::unordered_map<const KernelBase *, const void *> kernel_to_gpu_binary_
-      GUARDED_BY(in_memory_modules_mu_);
-  // GPU binary (PTX or CUBIN) -> {CUDA module, reference count}.
-  std::unordered_map<const void *, std::pair<CUmodule, uint64>>
-      gpu_binary_to_module_ GUARDED_BY(in_memory_modules_mu_);
-
-  // Guards the launched kernel set.
-  mutex launched_kernels_mu_;
-
-  // Keeps track of the set of launched kernels. Currently used to suppress the
-  // occupancy check on subsequent launches.
-  std::set<CUfunction> launched_kernels_ GUARDED_BY(launched_kernels_mu_);
-
-  // Handle for the CUDA device being operated on. Immutable
-  // post-initialization.
-  CUdevice device_;
-
-  // Handle for session with the library/driver. Immutable post-initialization.
-  CudaContext* context_;
-
-  // The device ordinal value that this executor was initialized with; recorded
-  // for use in getting device metadata. Immutable post-initialization.
-  int device_ordinal_;
-
-  // The major verion of the compute capability for device_.
-  int cc_major_;
-
-  // The minor verion of the compute capability for device_.
-  int cc_minor_;
-
-  // The plugin configuration associated with this instance.
-  PluginConfig plugin_config_;
-
-  SE_DISALLOW_COPY_AND_ASSIGN(CUDAExecutor);
-};
+using CUDAExecutor = gpu::GpuExecutor;
 
 }  // namespace cuda
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_helpers.h b/tensorflow/stream_executor/cuda/cuda_helpers.h
index d55706c66a9b47abfe125eaaa09e4b0cc543622a..af6dcf3549748ef74674b5362c86dc284c6712c8 100644
--- a/tensorflow/stream_executor/cuda/cuda_helpers.h
+++ b/tensorflow/stream_executor/cuda/cuda_helpers.h
@@ -17,89 +17,9 @@ limitations under the License.
 //
 // These are typically placed here for use by multiple source components (for
 // example, BLAS and executor components).
-
 #ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_HELPERS_H_
 #define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_HELPERS_H_
 
-#include <stddef.h>
-#include <complex>
-
-#include "cuda/include/cuComplex.h"
-#include "cuda/include/cuda.h"
-
-namespace stream_executor {
-
-template <typename ElemT>
-class DeviceMemory;
-
-namespace cuda {
-
-// Converts a const DeviceMemory reference to its underlying typed pointer in
-// CUDA
-// device memory.
-template <typename T>
-const T *CUDAMemory(const DeviceMemory<T> &mem) {
-  return static_cast<const T *>(mem.opaque());
-}
-
-// Converts a (non-const) DeviceMemory pointer reference to its underlying typed
-// pointer in CUDA device memory.
-template <typename T>
-T *CUDAMemoryMutable(DeviceMemory<T> *mem) {
-  return static_cast<T *>(mem->opaque());
-}
-
-static_assert(sizeof(std::complex<float>) == sizeof(cuComplex),
-              "std::complex<float> and cuComplex should have the same size");
-static_assert(offsetof(cuComplex, x) == 0,
-              "The real part of cuComplex should appear first.");
-static_assert(sizeof(std::complex<double>) == sizeof(cuDoubleComplex),
-              "std::complex<double> and cuDoubleComplex should have the same "
-              "size");
-static_assert(offsetof(cuDoubleComplex, x) == 0,
-              "The real part of cuDoubleComplex should appear first.");
-
-// Type traits to get CUDA complex types from std::complex<>.
-
-template <typename T>
-struct CUDAComplexT {
-  typedef T type;
-};
-
-template <>
-struct CUDAComplexT<std::complex<float>> {
-  typedef cuComplex type;
-};
-
-template <>
-struct CUDAComplexT<std::complex<double>> {
-  typedef cuDoubleComplex type;
-};
-
-// Converts pointers of std::complex<> to pointers of
-// cuComplex/cuDoubleComplex. No type conversion for non-complex types.
-
-template <typename T>
-inline const typename CUDAComplexT<T>::type *CUDAComplex(const T *p) {
-  return reinterpret_cast<const typename CUDAComplexT<T>::type *>(p);
-}
-
-template <typename T>
-inline typename CUDAComplexT<T>::type *CUDAComplex(T *p) {
-  return reinterpret_cast<typename CUDAComplexT<T>::type *>(p);
-}
-
-// Converts values of std::complex<float/double> to values of
-// cuComplex/cuDoubleComplex.
-inline cuComplex CUDAComplexValue(std::complex<float> val) {
-  return {val.real(), val.imag()};
-}
-
-inline cuDoubleComplex CUDAComplexValue(std::complex<double> val) {
-  return {val.real(), val.imag()};
-}
-
-}  // namespace cuda
-}  // namespace stream_executor
+#include "tensorflow/stream_executor/gpu/gpu_helpers.h"
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_HELPERS_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_kernel.cc b/tensorflow/stream_executor/cuda/cuda_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1b4e9a178fbcab63adb0a14bc806ac3ee3a60416
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cuda_kernel.cc
@@ -0,0 +1,38 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/stream_executor/cuda/cuda_kernel.h"
+
+namespace stream_executor {
+namespace gpu {
+
+CUfunc_cache GpuKernel::GetGpuCacheConfig() const {
+  switch (preferred_cache_config_) {
+    case KernelCacheConfig::kNoPreference:
+      return CU_FUNC_CACHE_PREFER_NONE;
+    case KernelCacheConfig::kPreferShared:
+      return CU_FUNC_CACHE_PREFER_SHARED;
+    case KernelCacheConfig::kPreferL1:
+      return CU_FUNC_CACHE_PREFER_L1;
+    case KernelCacheConfig::kPreferEqual:
+      return CU_FUNC_CACHE_PREFER_EQUAL;
+    default:
+      LOG(FATAL) << "Unknown KernelCacheConfig"
+                 << static_cast<int32>(preferred_cache_config_);
+  }
+}
+
+}  // namespace gpu
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_kernel.h b/tensorflow/stream_executor/cuda/cuda_kernel.h
index ec1dc51e57f5a928d54cb86b1cbcc217100df6d4..a8a18d200d93168660d70746db442aeaed146290 100644
--- a/tensorflow/stream_executor/cuda/cuda_kernel.h
+++ b/tensorflow/stream_executor/cuda/cuda_kernel.h
@@ -22,104 +22,12 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_KERNEL_H_
 #define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_KERNEL_H_
 
-#include "tensorflow/stream_executor/kernel_cache_config.h"
-#include "tensorflow/stream_executor/stream_executor_internal.h"
-#include "tensorflow/stream_executor/cuda/cuda_driver.h"
-#include "tensorflow/stream_executor/platform/port.h"
-#include "tensorflow/stream_executor/platform/logging.h"
-#include "cuda/include/cuda.h"
-
-#ifdef PLATFORMS_GPUS_CUDA_DYNAMIC_LIBCUDA_DYNAMIC_LIBCUDA_H_
-#error \
-    "No driver calls in this file, wrap driver functionality in cuda_driver.cc."
-#endif
-
-#ifdef __CUDA_RUNTIME_H__
-#error \
-    "CUDA runtime being included into CUDA GPU executor; should be driver only."
-#endif
+#include "tensorflow/stream_executor/gpu/gpu_kernel.h"
 
 namespace stream_executor {
 namespace cuda {
 
-// Wraps a CUfunction to implement the platform-independent KernelInterface.
-class CUDAKernel : public internal::KernelInterface {
- public:
-  CUDAKernel() : cuda_function_(nullptr), arity_(0),
-                 preferred_cache_config_(KernelCacheConfig::kNoPreference) {}
-
-  // Note that the function is unloaded when the module is unloaded, and the
-  // module that the function is contained in is owned by the CUDAExecutor.
-  ~CUDAKernel() override {}
-
-  // As arity cannot be reflected upon using the CUDA API, the arity is
-  // explicitly set during the CUDAExecutor::GetKernel initialization process.
-  void set_arity(unsigned arity) { arity_ = arity; }
-  unsigned Arity() const override { return arity_; }
-
-  // Returns the CUfunction value for passing to the CUDA API.
-  CUfunction AsCUDAFunctionValue() const {
-    DCHECK(cuda_function_ != nullptr);
-    return const_cast<CUfunction>(cuda_function_);
-  }
-
-  // Returns the slot that the CUfunction is stored within for this object,
-  // for the CUDA API which wants to load into a CUfunction*.
-  CUfunction *cuda_function_ptr() { return &cuda_function_; }
-
-  // CUDA supports setting the preferred cache configuration of a CUfunction
-  // (more-or-less equivalent to a CUDAKernel). We support this via the below
-  // functions; users can set a preference, and that is applied when the kernel
-  // is [lazy-]loaded (in CUDAExecutor::Launch). The alternative would be to
-  // load the kernel & set the preference when the user calls the setter below;
-  // either approach is valid.
-  // Sets the current kernel cache configuration preference.
-  void SetPreferredCacheConfig(KernelCacheConfig config) override {
-    preferred_cache_config_ = config;
-  }
-
-  // Returns the current kernel cache configuration preference.
-  KernelCacheConfig GetPreferredCacheConfig() const override {
-    return preferred_cache_config_;
-  }
-
-  // Returns the current kernel cache configuration preference as a
-  // CUfunc_cache.
-  CUfunc_cache GetCUDACacheConfig() const {
-    switch (preferred_cache_config_) {
-      case KernelCacheConfig::kNoPreference:
-        return CU_FUNC_CACHE_PREFER_NONE;
-      case KernelCacheConfig::kPreferShared:
-        return CU_FUNC_CACHE_PREFER_SHARED;
-      case KernelCacheConfig::kPreferL1:
-        return CU_FUNC_CACHE_PREFER_L1;
-      case KernelCacheConfig::kPreferEqual:
-        return CU_FUNC_CACHE_PREFER_EQUAL;
-      default:
-        LOG(FATAL) << "Unknown KernelCacheConfig"
-                   << static_cast<int32>(preferred_cache_config_);
-    }
-  }
-
- private:
-  CUfunction cuda_function_;  // Wrapped CUDA kernel handle.
-  unsigned arity_;            // Number of formal parameters the kernel takes.
-
-  // Preferred (but not required) cache configuration for this kernel.
-  KernelCacheConfig preferred_cache_config_;
-};
-
-// Given a platform-independent kernel datatype, returns the (const) internal
-// CUDA platform implementation pointer.
-inline const CUDAKernel *AsCUDAKernel(const KernelBase *kernel) {
-  return static_cast<const CUDAKernel *>(kernel->implementation());
-}
-
-// Given a platform-independent kernel datatype, returns the (non-const)
-// internal CUDA platform implementation pointer.
-inline CUDAKernel *AsCUDAKernel(KernelBase *kernel) {
-  return static_cast<CUDAKernel *>(kernel->implementation());
-}
+using CUDAKernel = gpu::GpuKernel;
 
 }  // namespace cuda
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_platform.cc b/tensorflow/stream_executor/cuda/cuda_platform.cc
index b342e71bdd94f6112d500d86f6ed4051821d2d54..54aba01278d17505a33d190fba85eb543dd624e1 100644
--- a/tensorflow/stream_executor/cuda/cuda_platform.cc
+++ b/tensorflow/stream_executor/cuda/cuda_platform.cc
@@ -25,7 +25,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/stringprintf.h"
 
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 namespace {
 
 // Synchronize with spinlocks.
@@ -129,16 +129,16 @@ port::StatusOr<StreamExecutor*> CudaPlatform::FirstExecutorForBus(
       port::Printf("Executor for bus %d not found.", bus_ordinal));
 }
 
-Platform::Id CudaPlatform::id() const { return kCudaPlatformId; }
+Platform::Id CudaPlatform::id() const { return cuda::kCudaPlatformId; }
 
 int CudaPlatform::VisibleDeviceCount() const {
   // Throw away the result - it logs internally, and this [containing] function
   // isn't in the path of user control. It's safe to call this > 1x.
-  if (!cuda::CUDADriver::Init().ok()) {
+  if (!gpu::GpuDriver::Init().ok()) {
     return -1;
   }
 
-  return CUDADriver::GetDeviceCount();
+  return GpuDriver::GetDeviceCount();
 }
 
 const string& CudaPlatform::Name() const { return name_; }
@@ -169,7 +169,7 @@ port::StatusOr<StreamExecutor*> CudaPlatform::GetExecutor(
 port::StatusOr<std::unique_ptr<StreamExecutor>>
 CudaPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) {
   auto executor = MakeUnique<StreamExecutor>(
-      this, MakeUnique<CUDAExecutor>(config.plugin_config));
+      this, MakeUnique<GpuExecutor>(config.plugin_config));
   auto init_status = executor->Init(config.ordinal, config.device_options);
   if (!init_status.ok()) {
     return port::Status(
@@ -191,13 +191,13 @@ void CudaPlatform::UnregisterTraceListener(TraceListener* listener) {
   LOG(FATAL) << "not yet implemented: unregister CUDA trace listener";
 }
 
-}  // namespace cuda
+}  // namespace gpu
 
 static void InitializeCudaPlatform() {
   // Disabling leak checking, MultiPlatformManager does not destroy its
   // registered platforms.
 
-  std::unique_ptr<cuda::CudaPlatform> platform(new cuda::CudaPlatform);
+  std::unique_ptr<gpu::CudaPlatform> platform(new gpu::CudaPlatform);
   SE_CHECK_OK(MultiPlatformManager::RegisterPlatform(std::move(platform)));
 }
 
diff --git a/tensorflow/stream_executor/cuda/cuda_platform.h b/tensorflow/stream_executor/cuda/cuda_platform.h
index fc0e15d5a6a9142f064085d34fcfaedfb25f433a..b21e9797be719fe9fe9ce4ebd75c36a485efb69b 100644
--- a/tensorflow/stream_executor/cuda/cuda_platform.h
+++ b/tensorflow/stream_executor/cuda/cuda_platform.h
@@ -33,12 +33,13 @@ limitations under the License.
 
 namespace stream_executor {
 namespace cuda {
-
 // Opaque and unique identifier for the CUDA platform plugin.
 // This is needed so that plugins can refer to/identify this platform without
 // instantiating a CudaPlatform object.
 extern const Platform::Id kCudaPlatformId;
+}  // namespace cuda
 
+namespace gpu {
 // Cuda-specific platform plugin, registered as a singleton value via module
 // initializer.
 class CudaPlatform : public Platform {
@@ -102,6 +103,12 @@ class CudaPlatform : public Platform {
   SE_DISALLOW_COPY_AND_ASSIGN(CudaPlatform);
 };
 
+}  // namespace gpu
+
+namespace cuda {
+
+using CudaPlatform = gpu::CudaPlatform;
+
 }  // namespace cuda
 }  // namespace stream_executor
 
diff --git a/tensorflow/stream_executor/cuda/cuda_rng.cc b/tensorflow/stream_executor/cuda/cuda_rng.cc
index 7f920719321637360fdf5c098e83dfaa49164e6c..395b30b4916b68594ac35f96ec08f361c877a2de 100644
--- a/tensorflow/stream_executor/cuda/cuda_rng.cc
+++ b/tensorflow/stream_executor/cuda/cuda_rng.cc
@@ -21,17 +21,15 @@ limitations under the License.
 #include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
 #include "tensorflow/stream_executor/cuda/cuda_stream.h"
 #include "tensorflow/stream_executor/device_memory.h"
-
-#ifndef PLATFORM_GOOGLE
-#include "tensorflow/stream_executor/dso_loader.h"
-#endif
-
 #include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/initialize.h"
 #include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
 #include "tensorflow/stream_executor/platform/logging.h"
 #include "tensorflow/stream_executor/rng.h"
+// clang-format off
 #include "cuda/include/curand.h"
+// clang-format on
 
 // Formats curandStatus_t to output prettified values into a log stream.
 std::ostream &operator<<(std::ostream &in, const curandStatus_t &status) {
@@ -60,33 +58,33 @@ std::ostream &operator<<(std::ostream &in, const curandStatus_t &status) {
 }
 
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 
-PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuRandPlugin);
+PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kGpuRandPlugin);
 
 namespace wrap {
 
 #ifdef PLATFORM_GOOGLE
-#define STREAM_EXECUTOR_CURAND_WRAP(__name)                         \
-  struct WrapperShim__##__name {                                    \
-    template <typename... Args>                                     \
-    curandStatus_t operator()(CUDAExecutor *parent, Args... args) { \
-      cuda::ScopedActivateExecutorContext sac{parent};              \
-      return ::__name(args...);                                     \
-    }                                                               \
+#define STREAM_EXECUTOR_CURAND_WRAP(__name)                        \
+  struct WrapperShim__##__name {                                   \
+    template <typename... Args>                                    \
+    curandStatus_t operator()(GpuExecutor* parent, Args... args) { \
+      gpu::ScopedActivateExecutorContext sac{parent};              \
+      return ::__name(args...);                                    \
+    }                                                              \
   } __name;
 
 #else
 #define STREAM_EXECUTOR_CURAND_WRAP(__name)                               \
   struct DynLoadShim__##__name {                                          \
-    static const char *kName;                                             \
+    static const char* kName;                                             \
     using FuncPtrT = std::add_pointer<decltype(::__name)>::type;          \
-    static void *GetDsoHandle() {                                         \
+    static void* GetDsoHandle() {                                         \
       auto s = internal::CachedDsoLoader::GetCurandDsoHandle();           \
       return s.ValueOrDie();                                              \
     }                                                                     \
     static FuncPtrT LoadOrDie() {                                         \
-      void *f;                                                            \
+      void* f;                                                            \
       auto s = port::Env::Default()->GetSymbolFromLibrary(GetDsoHandle(), \
                                                           kName, &f);     \
       CHECK(s.ok()) << "could not find " << kName                         \
@@ -98,12 +96,12 @@ namespace wrap {
       return f;                                                           \
     }                                                                     \
     template <typename... Args>                                           \
-    curandStatus_t operator()(CUDAExecutor *parent, Args... args) {       \
-      cuda::ScopedActivateExecutorContext sac{parent};                    \
+    curandStatus_t operator()(GpuExecutor* parent, Args... args) {        \
+      gpu::ScopedActivateExecutorContext sac{parent};                     \
       return DynLoad()(args...);                                          \
     }                                                                     \
   } __name;                                                               \
-  const char *DynLoadShim__##__name::kName = #__name;
+  const char* DynLoadShim__##__name::kName = #__name;
 #endif
 
 STREAM_EXECUTOR_CURAND_WRAP(curandCreateGenerator);
@@ -118,38 +116,15 @@ STREAM_EXECUTOR_CURAND_WRAP(curandGenerateNormalDouble);
 
 }  // namespace wrap
 
-template <typename T>
-string TypeString();
-
-template <>
-string TypeString<float>() {
-  return "float";
-}
-
-template <>
-string TypeString<double>() {
-  return "double";
-}
-
-template <>
-string TypeString<std::complex<float>>() {
-  return "std::complex<float>";
-}
-
-template <>
-string TypeString<std::complex<double>>() {
-  return "std::complex<double>";
-}
-
-CUDARng::CUDARng(CUDAExecutor *parent) : parent_(parent), rng_(nullptr) {}
+GpuRng::GpuRng(GpuExecutor* parent) : parent_(parent), rng_(nullptr) {}
 
-CUDARng::~CUDARng() {
+GpuRng::~GpuRng() {
   if (rng_ != nullptr) {
     wrap::curandDestroyGenerator(parent_, rng_);
   }
 }
 
-bool CUDARng::Init() {
+bool GpuRng::Init() {
   mutex_lock lock(mu_);
   CHECK(rng_ == nullptr);
 
@@ -164,9 +139,9 @@ bool CUDARng::Init() {
   return true;
 }
 
-bool CUDARng::SetStream(Stream *stream) {
+bool GpuRng::SetStream(Stream* stream) {
   curandStatus_t ret =
-      wrap::curandSetStream(parent_, rng_, AsCUDAStreamValue(stream));
+      wrap::curandSetStream(parent_, rng_, AsGpuStreamValue(stream));
   if (ret != CURAND_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to set stream for random generation: " << ret;
     return false;
@@ -184,8 +159,7 @@ constexpr bool ComplexIsConsecutiveFloats() {
 }
 
 template <typename T>
-bool CUDARng::DoPopulateRandUniformInternal(Stream *stream,
-                                            DeviceMemory<T> *v) {
+bool GpuRng::DoPopulateRandUniformInternal(Stream* stream, DeviceMemory<T>* v) {
   mutex_lock lock(mu_);
   static_assert(ComplexIsConsecutiveFloats(),
                 "std::complex values are not stored as consecutive values");
@@ -205,11 +179,11 @@ bool CUDARng::DoPopulateRandUniformInternal(Stream *stream,
   if (std::is_same<T, float>::value ||
       std::is_same<T, std::complex<float>>::value) {
     ret = wrap::curandGenerateUniform(
-        parent_, rng_, reinterpret_cast<float *>(CUDAMemoryMutable(v)),
+        parent_, rng_, reinterpret_cast<float*>(GpuMemoryMutable(v)),
         element_count);
   } else {
     ret = wrap::curandGenerateUniformDouble(
-        parent_, rng_, reinterpret_cast<double *>(CUDAMemoryMutable(v)),
+        parent_, rng_, reinterpret_cast<double*>(GpuMemoryMutable(v)),
         element_count);
   }
   if (ret != CURAND_STATUS_SUCCESS) {
@@ -222,29 +196,29 @@ bool CUDARng::DoPopulateRandUniformInternal(Stream *stream,
   return true;
 }
 
-bool CUDARng::DoPopulateRandUniform(Stream *stream, DeviceMemory<float> *v) {
+bool GpuRng::DoPopulateRandUniform(Stream* stream, DeviceMemory<float>* v) {
   return DoPopulateRandUniformInternal(stream, v);
 }
 
-bool CUDARng::DoPopulateRandUniform(Stream *stream, DeviceMemory<double> *v) {
+bool GpuRng::DoPopulateRandUniform(Stream* stream, DeviceMemory<double>* v) {
   return DoPopulateRandUniformInternal(stream, v);
 }
 
-bool CUDARng::DoPopulateRandUniform(Stream *stream,
-                                    DeviceMemory<std::complex<float>> *v) {
+bool GpuRng::DoPopulateRandUniform(Stream* stream,
+                                   DeviceMemory<std::complex<float>>* v) {
   return DoPopulateRandUniformInternal(stream, v);
 }
 
-bool CUDARng::DoPopulateRandUniform(Stream *stream,
-                                    DeviceMemory<std::complex<double>> *v) {
+bool GpuRng::DoPopulateRandUniform(Stream* stream,
+                                   DeviceMemory<std::complex<double>>* v) {
   return DoPopulateRandUniformInternal(stream, v);
 }
 
 template <typename ElemT, typename FuncT>
-bool CUDARng::DoPopulateRandGaussianInternal(Stream *stream, ElemT mean,
-                                             ElemT stddev,
-                                             DeviceMemory<ElemT> *v,
-                                             FuncT func) {
+bool GpuRng::DoPopulateRandGaussianInternal(Stream* stream, ElemT mean,
+                                            ElemT stddev,
+                                            DeviceMemory<ElemT>* v,
+                                            FuncT func) {
   mutex_lock lock(mu_);
 
   if (!SetStream(stream)) {
@@ -253,7 +227,7 @@ bool CUDARng::DoPopulateRandGaussianInternal(Stream *stream, ElemT mean,
 
   uint64 element_count = v->ElementCount();
   curandStatus_t ret =
-      func(parent_, rng_, CUDAMemoryMutable(v), element_count, mean, stddev);
+      func(parent_, rng_, GpuMemoryMutable(v), element_count, mean, stddev);
 
   if (ret != CURAND_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to do gaussian generation of " << v->ElementCount()
@@ -264,19 +238,19 @@ bool CUDARng::DoPopulateRandGaussianInternal(Stream *stream, ElemT mean,
   return true;
 }
 
-bool CUDARng::DoPopulateRandGaussian(Stream *stream, float mean, float stddev,
-                                     DeviceMemory<float> *v) {
+bool GpuRng::DoPopulateRandGaussian(Stream* stream, float mean, float stddev,
+                                    DeviceMemory<float>* v) {
   return DoPopulateRandGaussianInternal(stream, mean, stddev, v,
                                         wrap::curandGenerateNormal);
 }
 
-bool CUDARng::DoPopulateRandGaussian(Stream *stream, double mean, double stddev,
-                                     DeviceMemory<double> *v) {
+bool GpuRng::DoPopulateRandGaussian(Stream* stream, double mean, double stddev,
+                                    DeviceMemory<double>* v) {
   return DoPopulateRandGaussianInternal(stream, mean, stddev, v,
                                         wrap::curandGenerateNormalDouble);
 }
 
-bool CUDARng::SetSeed(Stream *stream, const uint8 *seed, uint64 seed_bytes) {
+bool GpuRng::SetSeed(Stream* stream, const uint8* seed, uint64 seed_bytes) {
   mutex_lock lock(mu_);
   CHECK(rng_ != nullptr);
 
@@ -305,15 +279,15 @@ bool CUDARng::SetSeed(Stream *stream, const uint8 *seed, uint64 seed_bytes) {
   return true;
 }
 
-}  // namespace cuda
+}  // namespace gpu
 
 void initialize_curand() {
   port::Status status =
       PluginRegistry::Instance()->RegisterFactory<PluginRegistry::RngFactory>(
-          cuda::kCudaPlatformId, cuda::kCuRandPlugin, "cuRAND",
-          [](internal::StreamExecutorInterface *parent) -> rng::RngSupport * {
-            cuda::CUDAExecutor *cuda_executor =
-                dynamic_cast<cuda::CUDAExecutor *>(parent);
+          cuda::kCudaPlatformId, gpu::kGpuRandPlugin, "cuRAND",
+          [](internal::StreamExecutorInterface* parent) -> rng::RngSupport* {
+            gpu::GpuExecutor* cuda_executor =
+                dynamic_cast<gpu::GpuExecutor*>(parent);
             if (cuda_executor == nullptr) {
               LOG(ERROR)
                   << "Attempting to initialize an instance of the cuRAND "
@@ -321,7 +295,7 @@ void initialize_curand() {
               return nullptr;
             }
 
-            cuda::CUDARng *rng = new cuda::CUDARng(cuda_executor);
+            gpu::GpuRng* rng = new gpu::GpuRng(cuda_executor);
             if (!rng->Init()) {
               // Note: Init() will log a more specific error.
               delete rng;
@@ -336,7 +310,7 @@ void initialize_curand() {
   }
 
   PluginRegistry::Instance()->SetDefaultFactory(
-      cuda::kCudaPlatformId, PluginKind::kRng, cuda::kCuRandPlugin);
+      cuda::kCudaPlatformId, PluginKind::kRng, gpu::kGpuRandPlugin);
 }
 
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_rng.h b/tensorflow/stream_executor/cuda/cuda_rng.h
index 57ef398aaa88da7de769c49820325c6c9feb4d70..d7f6b0e8e034967ed2919332aafca9c7a8081eba 100644
--- a/tensorflow/stream_executor/cuda/cuda_rng.h
+++ b/tensorflow/stream_executor/cuda/cuda_rng.h
@@ -16,85 +16,13 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_RNG_H_
 #define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_RNG_H_
 
-#include "tensorflow/stream_executor/platform/mutex.h"
-#include "tensorflow/stream_executor/platform/port.h"
-#include "tensorflow/stream_executor/platform/thread_annotations.h"
-#include "tensorflow/stream_executor/plugin_registry.h"
-#include "tensorflow/stream_executor/rng.h"
-
-typedef struct curandGenerator_st *curandGenerator_t;
+#include "tensorflow/stream_executor/gpu/gpu_rng.h"
 
 namespace stream_executor {
 
-class Stream;
-template <typename ElemT>
-class DeviceMemory;
-
 namespace cuda {
 
-// Opaque and unique identifier for the cuRAND plugin.
-extern const PluginId kCuRandPlugin;
-
-class CUDAExecutor;
-
-// CUDA-platform implementation of the random number generation support
-// interface.
-//
-// Thread-safe post-initialization.
-class CUDARng : public rng::RngSupport {
- public:
-  explicit CUDARng(CUDAExecutor *parent);
-
-  // Retrieves a curand library generator handle. This is necessary for
-  // enqueuing random number generation work onto the device.
-  // TODO(leary) provide a way for users to select the RNG algorithm.
-  bool Init();
-
-  // Releases a curand library generator handle, if one was acquired.
-  ~CUDARng() override;
-
-  // See rng::RngSupport for details on the following overrides.
-  bool DoPopulateRandUniform(Stream *stream, DeviceMemory<float> *v) override;
-  bool DoPopulateRandUniform(Stream *stream, DeviceMemory<double> *v) override;
-  bool DoPopulateRandUniform(Stream *stream,
-                             DeviceMemory<std::complex<float>> *v) override;
-  bool DoPopulateRandUniform(Stream *stream,
-                             DeviceMemory<std::complex<double>> *v) override;
-  bool DoPopulateRandGaussian(Stream *stream, float mean, float stddev,
-                              DeviceMemory<float> *v) override;
-  bool DoPopulateRandGaussian(Stream *stream, double mean, double stddev,
-                              DeviceMemory<double> *v) override;
-
-  bool SetSeed(Stream *stream, const uint8 *seed, uint64 seed_bytes) override;
-
- private:
-  // Actually performs the work of generating random numbers - the public
-  // methods are thin wrappers to this interface.
-  template <typename T>
-  bool DoPopulateRandUniformInternal(Stream *stream, DeviceMemory<T> *v);
-  template <typename ElemT, typename FuncT>
-  bool DoPopulateRandGaussianInternal(Stream *stream, ElemT mean, ElemT stddev,
-                                      DeviceMemory<ElemT> *v, FuncT func);
-
-  // Sets the stream for the internal curand generator.
-  //
-  // This is a stateful operation, as the handle can only have one stream set at
-  // a given time, so it is usually performed right before enqueuing work to do
-  // with random number generation.
-  bool SetStream(Stream *stream) EXCLUSIVE_LOCKS_REQUIRED(mu_);
-
-  // mutex that guards the cuRAND handle for this device.
-  mutex mu_;
-
-  // CUDAExecutor which instantiated this CUDARng.
-  // Immutable post-initialization.
-  CUDAExecutor *parent_;
-
-  // cuRANDalibrary handle on the device.
-  curandGenerator_t rng_ GUARDED_BY(mu_);
-
-  SE_DISALLOW_COPY_AND_ASSIGN(CUDARng);
-};
+using CUDARng = gpu::GpuRng;
 
 }  // namespace cuda
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_stream.h b/tensorflow/stream_executor/cuda/cuda_stream.h
index bb8bda4755344d859668425f89614cc87d7e2d3e..4460351368894a009eaa4d7186e809ddf3fa3aed 100644
--- a/tensorflow/stream_executor/cuda/cuda_stream.h
+++ b/tensorflow/stream_executor/cuda/cuda_stream.h
@@ -13,79 +13,22 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// Defines the CUDAStream type - the CUDA-specific implementation of the generic
+// Defines the GpuStream type - the CUDA-specific implementation of the generic
 // StreamExecutor Stream interface.
 
 #ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_STREAM_H_
 #define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_STREAM_H_
 
-#include "tensorflow/stream_executor/cuda/cuda_driver.h"
-#include "tensorflow/stream_executor/platform/thread_annotations.h"
-#include "tensorflow/stream_executor/stream_executor_internal.h"
+#include "tensorflow/stream_executor/gpu/gpu_stream.h"
 
 namespace stream_executor {
 namespace cuda {
 
-class CUDAExecutor;
+using CUDAStream = gpu::GpuStream;
 
-// Wraps a CUstream in order to satisfy the platform-independent
-// StreamInterface.
-//
-// Thread-safe post-initialization.
-class CUDAStream : public internal::StreamInterface {
- public:
-  explicit CUDAStream(CUDAExecutor *parent)
-      : parent_(parent), cuda_stream_(nullptr), completed_event_(nullptr) {}
-
-  // Note: teardown is handled by a parent's call to DeallocateStream.
-  ~CUDAStream() override {}
-
-  void *GpuStreamHack() override { return cuda_stream_; }
-  void **GpuStreamMemberHack() override {
-    return reinterpret_cast<void **>(&cuda_stream_);
-  }
-
-  // Explicitly initialize the CUDA resources associated with this stream, used
-  // by StreamExecutor::AllocateStream().
-  bool Init();
-
-  // Explicitly destroy the CUDA resources associated with this stream, used by
-  // StreamExecutor::DeallocateStream().
-  void Destroy();
-
-  // Returns true if no work is pending or executing on the stream.
-  bool IsIdle() const;
-
-  // Retrieves an event which indicates that all work enqueued into the stream
-  // has completed. Ownership of the event is not transferred to the caller, the
-  // event is owned by this stream.
-  CUevent* completed_event() { return &completed_event_; }
-
-  // Returns the CUstream value for passing to the CUDA API.
-  //
-  // Precond: this CUDAStream has been allocated (otherwise passing a nullptr
-  // into the NVIDIA library causes difficult-to-understand faults).
-  CUstream cuda_stream() const {
-    DCHECK(cuda_stream_ != nullptr);
-    return const_cast<CUstream>(cuda_stream_);
-  }
-
-  CUDAExecutor *parent() const { return parent_; }
-
- private:
-  CUDAExecutor *parent_;  // Executor that spawned this stream.
-  CUstream cuda_stream_;  // Wrapped CUDA stream handle.
-
-  // Event that indicates this stream has completed.
-  CUevent completed_event_ = nullptr;
-};
-
-// Helper functions to simplify extremely common flows.
-// Converts a Stream to the underlying CUDAStream implementation.
-CUDAStream *AsCUDAStream(Stream *stream);
-
-// Extracts a CUstream from a CUDAStream-backed Stream object.
-CUstream AsCUDAStreamValue(Stream *stream);
+inline CUDAStream* AsCUDAStream(Stream* stream) {
+  return gpu::AsGpuStream(stream);
+}
 
 }  // namespace cuda
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cuda_timer.h b/tensorflow/stream_executor/cuda/cuda_timer.h
index e040cf86fad1f40a708ad4ca28693e31908393f0..01b722e888687c0e199d7fe8ace92aec407f3a4b 100644
--- a/tensorflow/stream_executor/cuda/cuda_timer.h
+++ b/tensorflow/stream_executor/cuda/cuda_timer.h
@@ -13,76 +13,18 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// Defines the CUDATimer type - the CUDA-specific implementation of the generic
+// Defines the GpuTimer type - the CUDA-specific implementation of the generic
 // StreamExecutor Timer interface.
 
 #ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_TIMER_H_
 #define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDA_TIMER_H_
 
-#include "tensorflow/stream_executor/stream_executor_internal.h"
-#include "tensorflow/stream_executor/cuda/cuda_driver.h"
-#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
+#include "tensorflow/stream_executor/gpu/gpu_timer.h"
 
 namespace stream_executor {
 namespace cuda {
 
-class CUDAExecutor;
-class CUDAStream;
-
-// Wraps a pair of CUevents in order to satisfy the platform-independent
-// TimerInferface -- both a start and a stop event are present which may be
-// recorded in a stream.
-class CUDATimer : public internal::TimerInterface {
- public:
-  explicit CUDATimer(CUDAExecutor *parent)
-      : parent_(parent), start_event_(nullptr), stop_event_(nullptr) {}
-
-  // Note: teardown needs to be explicitly handled in this API by a call to
-  // StreamExecutor::DeallocateTimer(), which invokes Destroy().
-  // TODO(csigg): Change to RAII.
-  ~CUDATimer() override {}
-
-  // Allocates the platform-specific pieces of the timer, called as part of
-  // StreamExecutor::AllocateTimer().
-  bool Init();
-
-  // Deallocates the platform-specific pieces of the timer, called as part of
-  // StreamExecutor::DeallocateTimer().
-  void Destroy();
-
-  // Records the "timer start" event at the current point in the stream.
-  bool Start(CUDAStream *stream);
-
-  // Records the "timer stop" event at the current point in the stream.
-  bool Stop(CUDAStream *stream);
-
-  // Returns the elapsed time, in milliseconds, between the start and stop
-  // events.
-  float GetElapsedMilliseconds() const;
-
-  // See Timer::Microseconds().
-  // TODO(leary) make this into an error code interface...
-  uint64 Microseconds() const override {
-    return GetElapsedMilliseconds() * 1e3;
-  }
-
-  // See Timer::Nanoseconds().
-  uint64 Nanoseconds() const override { return GetElapsedMilliseconds() * 1e6; }
-
- private:
-  CUDAExecutor *parent_;
-  CUevent start_event_;  // Event recorded to indicate the "start" timestamp
-                         // executing in a stream.
-  CUevent stop_event_;   // Event recorded to indicate the "stop" timestamp
-                         // executing in a stream.
-};
-
-struct TimerDeleter {
-  void operator()(CUDATimer *t) {
-    t->Destroy();
-    delete t;
-  }
-};
+using CUDATimer = gpu::GpuTimer;
 
 }  // namespace cuda
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cudart_stub.cc b/tensorflow/stream_executor/cuda/cudart_stub.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8878700c5ea9b48f0bad2038d803e61c71313dad
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cudart_stub.cc
@@ -0,0 +1,121 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This file wraps cuda runtime calls with dso loader so that we don't need to
+// have explicit linking to libcuda.
+
+#include "cuda/include/cuda_runtime_api.h"
+#include "tensorflow/stream_executor/lib/env.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
+
+namespace {
+void* GetDsoHandle() {
+  static auto handle = []() -> void* {
+    auto handle_or =
+        stream_executor::internal::DsoLoader::GetCudaRuntimeDsoHandle();
+    if (!handle_or.ok()) return nullptr;
+    return handle_or.ValueOrDie();
+  }();
+  return handle;
+}
+
+template <typename T>
+T LoadSymbol(const char* symbol_name) {
+  void* symbol = nullptr;
+  auto env = stream_executor::port::Env::Default();
+  env->GetSymbolFromLibrary(GetDsoHandle(), symbol_name, &symbol).IgnoreError();
+  return reinterpret_cast<T>(symbol);
+}
+cudaError_t GetSymbolNotFoundError() {
+  return cudaErrorSharedObjectSymbolNotFound;
+}
+const char* GetSymbolNotFoundStrError() {
+  return "cudaErrorSharedObjectSymbolNotFound";
+}
+}  // namespace
+
+// Code below is auto-generated.
+extern "C" {
+cudaError_t CUDART_CB cudaFree(void* devPtr) {
+  using FuncPtr = cudaError_t (*)(void* devPtr);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFree");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(devPtr);
+}
+
+cudaError_t CUDART_CB cudaGetDevice(int* device) {
+  using FuncPtr = cudaError_t (*)(int* device);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDevice");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(device);
+}
+
+cudaError_t CUDART_CB cudaGetDeviceProperties(cudaDeviceProp* prop,
+                                              int device) {
+  using FuncPtr = cudaError_t (*)(cudaDeviceProp * prop, int device);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDeviceProperties");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(prop, device);
+}
+
+const char* CUDART_CB cudaGetErrorString(cudaError_t error) {
+  using FuncPtr = const char* (*)(cudaError_t error);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetErrorString");
+  if (!func_ptr) return GetSymbolNotFoundStrError();
+  return func_ptr(error);
+}
+
+cudaError_t CUDART_CB cudaSetDevice(int device) {
+  using FuncPtr = cudaError_t (*)(int device);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaSetDevice");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(device);
+}
+
+cudaError_t CUDART_CB cudaStreamAddCallback(cudaStream_t stream,
+                                            cudaStreamCallback_t callback,
+                                            void* userData,
+                                            unsigned int flags) {
+  using FuncPtr =
+      cudaError_t (*)(cudaStream_t stream, cudaStreamCallback_t callback,
+                      void* userData, unsigned int flags);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaStreamAddCallback");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(stream, callback, userData, flags);
+}
+
+cudaError_t CUDART_CB cudaGetDeviceCount(int* count) {
+  using FuncPtr = cudaError_t (*)(int* count);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetDeviceCount");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(count);
+}
+
+cudaError_t CUDART_CB cudaPointerGetAttributes(
+    struct cudaPointerAttributes* attributes, const void* ptr) {
+  using FuncPtr = cudaError_t (*)(struct cudaPointerAttributes * attributes,
+                                  const void* ptr);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaPointerGetAttributes");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(attributes, ptr);
+}
+
+cudaError_t CUDART_CB cudaGetLastError() {
+  using FuncPtr = cudaError_t (*)();
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaGetLastError");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr();
+}
+}  // extern "C"
diff --git a/tensorflow/stream_executor/cuda/cudnn_version.cc b/tensorflow/stream_executor/cuda/cudnn_version.cc
index e8fcc0361850a561928d09f29f78fb57071c24b2..9ef8bc95e5644ed060d88335de4f9d1abd5f719d 100644
--- a/tensorflow/stream_executor/cuda/cudnn_version.cc
+++ b/tensorflow/stream_executor/cuda/cudnn_version.cc
@@ -16,7 +16,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/cuda/cudnn_version.h"
 
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 
 bool IsSourceCompatibleWithCudnnLibrary(CudnnVersion source_version,
                                         CudnnVersion loaded_version) {
@@ -36,5 +36,5 @@ bool IsSourceCompatibleWithCudnnLibrary(CudnnVersion source_version,
            loaded_version.minor_version >= source_version.minor_version));
 }
 
-}  // namespace cuda
+}  // namespace gpu
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/cuda/cudnn_version.h b/tensorflow/stream_executor/cuda/cudnn_version.h
index 6464e7f8e8755b5b46b90a4b35d50509eb0cfde7..4607a9bff87bf29a00a9f5e0f112f9389fa12972 100644
--- a/tensorflow/stream_executor/cuda/cudnn_version.h
+++ b/tensorflow/stream_executor/cuda/cudnn_version.h
@@ -21,7 +21,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/strcat.h"
 
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 
 struct CudnnVersion {
   CudnnVersion() = default;
@@ -44,7 +44,7 @@ struct CudnnVersion {
 bool IsSourceCompatibleWithCudnnLibrary(CudnnVersion source_version,
                                         CudnnVersion loaded_version);
 
-}  // namespace cuda
+}  // namespace gpu
 }  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDNN_VERSION_H_
diff --git a/tensorflow/stream_executor/cuda/cudnn_version_test.cc b/tensorflow/stream_executor/cuda/cudnn_version_test.cc
index 7d4c6399d040e9bcddff5d98d202ab00fdeffa58..cfe114662d4515c68ffdab46918db09f631e9343 100644
--- a/tensorflow/stream_executor/cuda/cudnn_version_test.cc
+++ b/tensorflow/stream_executor/cuda/cudnn_version_test.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include "tensorflow/core/platform/test.h"
 
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 namespace {
 
 TEST(CuDNNVersion, ToString) {
@@ -68,5 +68,5 @@ TEST(IsSourceCompatibleWithCudnnLibraryTest, Basic) {
 }
 
 }  // namespace
-}  // namespace cuda
+}  // namespace gpu
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/device_description.cc b/tensorflow/stream_executor/device_description.cc
index 0b991b7ba8cdad7f342adc6c8ff25b88d91e2bd2..2595d216b4f97b36fe82e6d020c4f7afde4d4274 100644
--- a/tensorflow/stream_executor/device_description.cc
+++ b/tensorflow/stream_executor/device_description.cc
@@ -50,6 +50,7 @@ DeviceDescription::DeviceDescription()
       clock_rate_ghz_(-1.0),
       cuda_compute_capability_major_(-1),
       cuda_compute_capability_minor_(-1),
+      rocm_amdgpu_isa_version_(-1),
       numa_node_(-1),
       core_count_(-1),
       ecc_enabled_(false) {}
@@ -112,6 +113,15 @@ bool DeviceDescription::cuda_compute_capability(int *major, int *minor) const {
   return cuda_compute_capability_major_ != 0;
 }
 
+bool DeviceDescription::rocm_amdgpu_isa_version(int *version) const {
+  bool status = false;
+  if (rocm_amdgpu_isa_version_ > 0) {
+    *version = rocm_amdgpu_isa_version_;
+    status = true;
+  }
+  return status;
+}
+
 bool ThreadDimOk(const DeviceDescription &device_description,
                  const ThreadDim &thread_dim) {
   auto total_threads = thread_dim.x * thread_dim.y * thread_dim.z;
diff --git a/tensorflow/stream_executor/device_description.h b/tensorflow/stream_executor/device_description.h
index 8ddf18629d554112631c3d9c09dbb7afd8505c76..cccc209e1c8bedde5bdba8f454bbe95d0f9f3458 100644
--- a/tensorflow/stream_executor/device_description.h
+++ b/tensorflow/stream_executor/device_description.h
@@ -133,6 +133,11 @@ class DeviceDescription {
   // zero, and the return value will be false.
   bool cuda_compute_capability(int *major, int *minor) const;
 
+  // Returns the AMDGPU ISA version if we're running on the ROCm platform.
+  // If the information is not available, the version is not modified,
+  // and the return value will be false.
+  bool rocm_amdgpu_isa_version(int *version) const;
+
   // Returns the maximum amount of shared memory present on a single core
   // (i.e. Streaming Multiprocessor on NVIDIA GPUs; Compute Unit for OpenCL
   // devices). Note that some devices, such as NVIDIA's have a configurable
@@ -195,6 +200,9 @@ class DeviceDescription {
   int cuda_compute_capability_major_;
   int cuda_compute_capability_minor_;
 
+  // ROCM AMDGPU ISA version, 0 if not available.
+  int rocm_amdgpu_isa_version_;
+
   int numa_node_;
   int core_count_;
   bool ecc_enabled_;
@@ -280,6 +288,10 @@ class DeviceDescriptionBuilder {
     device_description_->cuda_compute_capability_minor_ = minor;
   }
 
+  void set_rocm_amdgpu_isa_version(int version) {
+    device_description_->rocm_amdgpu_isa_version_ = version;
+  }
+
   void set_numa_node(int value) { device_description_->numa_node_ = value; }
   void set_core_count(int value) { device_description_->core_count_ = value; }
   void set_ecc_enabled(bool value) {
diff --git a/tensorflow/stream_executor/dnn.cc b/tensorflow/stream_executor/dnn.cc
index faa662211ebb366b8e20cdc3e33ca651c64cf73a..fcc3db928b1daaca33bef2e518aa6a4c1d8e5373 100644
--- a/tensorflow/stream_executor/dnn.cc
+++ b/tensorflow/stream_executor/dnn.cc
@@ -368,6 +368,16 @@ BatchDescriptor BatchDescriptor::DepthConcatenateOutputDescriptor(
   return output;
 }
 
+TensorDescriptorProto BatchDescriptor::ToProto(DataType data_type) const {
+  CHECK_EQ(0.0, value_max_);
+  CHECK_EQ(0.0, value_min_);
+  CHECK(quantized_activation_mode_ == QuantizedActivationMode::k8Bit);
+
+  TensorDescriptorProto ret = tensor_;
+  ret.set_data_type(data_type);
+  return ret;
+}
+
 // -- FilterDescriptor
 
 FilterDescriptor::FilterDescriptor(int ndims) {
@@ -434,6 +444,12 @@ int64 FilterDescriptor::ComputeWeightCount() const {
   return ret;
 }
 
+TensorDescriptorProto FilterDescriptor::ToProto(DataType data_type) const {
+  TensorDescriptorProto ret = tensor_;
+  ret.set_data_type(data_type);
+  return ret;
+}
+
 // -- ConvolutionDescriptor
 
 ConvolutionDescriptor::ConvolutionDescriptor(int ndims) {
@@ -565,5 +581,15 @@ string NormalizeDescriptor::ToShortString() const {
                       "_size:", segment_size_);
 }
 
+bool DnnSupport::IsStatusOk(const port::Status& status, bool report_error) {
+  if (status.ok()) {
+    return true;
+  }
+  if (report_error) {
+    LOG(ERROR) << status.error_message();
+  }
+  return false;
+}
+
 }  // namespace dnn
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index c044a356efb38c333c3153f024092a22fbdf56db..24c29486d069069c580878b00f1a86c70ca63a01 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -248,6 +248,12 @@ class BatchDescriptor {
   string ToString() const;
   string ToShortString() const;
 
+  // Pre-condition:
+  //   value_max_ == 0
+  //   value_min_ == 0
+  //   quantized_activation_mode_ == QuantizedActivationMode::k8Bit
+  TensorDescriptorProto ToProto(DataType data_type) const;
+
   // Accessors.
   int64 count() const { return tensor_.dimensions(0); }
   int64 feature_map_count() const { return tensor_.dimensions(1); }
@@ -420,6 +426,7 @@ class FilterDescriptor {
 
   string ToString() const;
   string ToShortString() const;
+  TensorDescriptorProto ToProto(DataType data_type) const;
 
   // Returns the number of weights required as parameters for a convolution
   // using this filter descriptor.
@@ -509,6 +516,7 @@ class ConvolutionDescriptor {
 
   string ToString() const;
   string ToShortString() const;
+  ConvolutionDescriptorProto ToProto() const { return proto_; }
 
   ConvolutionDescriptor& set_zero_padding_height(int64 value) {
     SetDim(padding(), DimIndex::Y, value);
@@ -730,6 +738,7 @@ class PoolingDescriptor {
 class AlgorithmDesc {
  public:
   typedef int64 Index;
+  AlgorithmDesc() : AlgorithmDesc(0, false) {}
   AlgorithmDesc(Index a, bool use_tensor_ops) {
     proto_.set_algo_id(a);
     proto_.set_math_type(use_tensor_ops ? AlgorithmProto::TENSOR_OP_MATH
@@ -745,6 +754,8 @@ class AlgorithmDesc {
   }
   uint64 hash() const;
 
+  AlgorithmProto ToProto() const { return proto_; }
+
  private:
   AlgorithmProto proto_;
 };
@@ -906,9 +917,10 @@ class VersionInfo {
  public:
   VersionInfo(int major = 0, int minor = 0, int patch = 0)
       : major_(major), minor_(minor), patch_(patch) {}
-  int major_version() { return major_; }
-  int minor_version() { return minor_; }
-  int patch() { return patch_; }
+  int major_version() const { return major_; }
+  int minor_version() const { return minor_; }
+  int patch() const { return patch_; }
+
  private:
   int major_;
   int minor_;
@@ -930,11 +942,7 @@ class VersionInfo {
 //   burden.
 // * Poor error handling: the API should return Status objects.
 //
-// Things worth trying:
-// * Move functions that are not actually common back to the backends. Then,
-//   callers may use dynamic_cast to access specific backends. This may not be
-//   that hard, as many of the callers are Stream::ThenXxx functions.
-// * Change all the returned bools to Status.
+// PrepareForConvolution is an example for how new APIs should be written.
 class DnnSupport {
  public:
   DnnSupport() {}
@@ -1175,6 +1183,26 @@ class DnnSupport {
     return false;
   }
 
+  template <typename ElementType>
+  port::Status PrepareForConvolution(
+      ConvolutionKind kind, Stream* stream,
+      const BatchDescriptor& batch_descriptor,
+      DeviceMemory<ElementType> input_data,
+      const FilterDescriptor& filter_descriptor,
+      DeviceMemory<ElementType> filter_data,
+      const BatchDescriptor& output_descriptor,
+      DeviceMemory<ElementType> output_data,
+      const ConvolutionDescriptor& convolution_descriptor,
+      const AlgorithmConfig& algorithm_config,
+      ScratchAllocator* scratch_allocator, AlgorithmDesc* algorithm_desc,
+      DeviceMemory<uint8>* scratch_memory) {
+    return DoPrepareForConvolution(
+        kind, ToDataType<ElementType>::value, stream, batch_descriptor,
+        input_data, filter_descriptor, filter_data, output_descriptor,
+        output_data, convolution_descriptor, algorithm_config,
+        scratch_allocator, algorithm_desc, scratch_memory);
+  }
+
   // Enqueues a single-precision convolution operation onto the stream.
   //
   // Arguments (all borrowed):
@@ -1188,10 +1216,10 @@ class DnnSupport {
   //  output_descriptor: dimensions of the output layer.
   //  output_data: un-owned device memory region in which to place the
   //    convolution result.
-  //  scratch_allocator: un-owned, may-be-null object that may allocate scratch
-  //    space in order to speed up the convolution operation.
-  //  algorithm_config: specifies which algorithm should be used for the
+  //  algorithm_desc: specifies which algorithm should be used for the
   //    operation.
+  //  scratch: un-owned device memory for scratch space in order to speed up
+  //    the convolution operation.
   //  output_profile_result: the output profile result for this call. The
   //    profiling is only enabled when this is not nullptr.
   //
@@ -1209,43 +1237,34 @@ class DnnSupport {
   //   that if the inverse of the filter is applied to the output in VALID mode
   //   the result is the same size as the input - this requires even more
   //   padding of the input.
-  virtual bool DoConvolve(
-      Stream* stream, const dnn::BatchDescriptor& input_descriptor,
-      const DeviceMemory<float>& input_data,
-      const dnn::FilterDescriptor& filter_descriptor,
-      const DeviceMemory<float>& filter_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<float>* output_data, ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
+  virtual port::Status DoConvolve(
+      ConvolutionKind kind, DataType element_type, Stream* stream,
+      const BatchDescriptor& input_descriptor, DeviceMemoryBase input_data,
+      const FilterDescriptor& filter_descriptor, DeviceMemoryBase filter_data,
+      const BatchDescriptor& output_descriptor, DeviceMemoryBase output_data,
+      const ConvolutionDescriptor& convolution_descriptor,
+      AlgorithmDesc algorithm_desc, DeviceMemory<uint8> scratch_memory,
       ProfileResult* output_profile_result) = 0;
 
-  // Enqueues a double-precision convolution operation onto the stream.
-  // See DoConvolve above for argument details.
-  virtual bool DoConvolve(
-      Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
-      const DeviceMemory<double>& input_data,
-      const dnn::FilterDescriptor& filter_descriptor,
-      const DeviceMemory<double>& filter_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<double>* output_data, ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
-      dnn::ProfileResult* output_profile_result) = 0;
-
-  // Enqueues a half-precision convolution operation onto the stream.
-  // See DoConvolve above for argument details.
-  virtual bool DoConvolve(
-      Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
-      const DeviceMemory<Eigen::half>& input_data,
-      const dnn::FilterDescriptor& filter_descriptor,
-      const DeviceMemory<Eigen::half>& filter_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<Eigen::half>* output_data,
-      ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
-      ProfileResult* output_profile_result) = 0;
+  template <typename ElementType>
+  bool DoConvolve(Stream* stream, const dnn::BatchDescriptor& input_descriptor,
+                  const DeviceMemory<ElementType>& input_data,
+                  const dnn::FilterDescriptor& filter_descriptor,
+                  const DeviceMemory<ElementType>& filter_data,
+                  const dnn::ConvolutionDescriptor& convolution_descriptor,
+                  const dnn::BatchDescriptor& output_descriptor,
+                  DeviceMemory<ElementType>* output_data,
+                  const dnn::AlgorithmDesc& algorithm_desc,
+                  DeviceMemory<uint8>* scratch_memory,
+                  ProfileResult* output_profile_result) {
+    return IsStatusOk(
+        DoConvolve(ConvolutionKind::FORWARD, ToDataType<ElementType>::value,
+                   stream, input_descriptor, input_data, filter_descriptor,
+                   filter_data, output_descriptor, *output_data,
+                   convolution_descriptor, algorithm_desc, *scratch_memory,
+                   output_profile_result),
+        !output_profile_result);
+  }
 
   // Return a list of algorithms supported by the forward convolution pass.
   // cc_major and cc_minor are the compute capabilities of the device.
@@ -1318,17 +1337,27 @@ class DnnSupport {
   //    backprop of the input.
   //  scratch_allocator: un-owned, may-be-null object that may allocate scratch
   //    space in order to speed up the convolution operation.
-  virtual bool DoConvolveBackwardData(
-      Stream* stream, const FilterDescriptor& filter_descriptor,
-      const DeviceMemory<float>& filter_data,
-      const BatchDescriptor& output_descriptor,
-      DeviceMemory<float> backward_output_data,
-      const ConvolutionDescriptor& convolution_descriptor,
-      const BatchDescriptor& input_descriptor,
-      DeviceMemory<float>* backward_input_data,
-      ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
-      ProfileResult* output_profile_result) = 0;
+  template <typename ElementType>
+  bool DoConvolveBackwardData(
+      Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
+      const DeviceMemory<ElementType>& filter_data,
+      const dnn::BatchDescriptor& output_descriptor,
+      const DeviceMemory<ElementType>& backward_output_data,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      const dnn::BatchDescriptor& input_descriptor,
+      DeviceMemory<ElementType>* backward_input_data,
+      const dnn::AlgorithmDesc& algorithm_desc,
+      DeviceMemory<uint8>* scratch_memory,
+      ProfileResult* output_profile_result) {
+    return IsStatusOk(
+        DoConvolve(ConvolutionKind::BACKWARD_DATA,
+                   ToDataType<ElementType>::value, stream, input_descriptor,
+                   *backward_input_data, filter_descriptor, filter_data,
+                   output_descriptor, backward_output_data,
+                   convolution_descriptor, algorithm_desc, *scratch_memory,
+                   output_profile_result),
+        !output_profile_result);
+  }
 
   // Return a list of algorithms supported by the backward convolution pass for
   // data.
@@ -1336,30 +1365,6 @@ class DnnSupport {
       bool with_winograd_nonfused, int cc_major, int cc_minor,
       std::vector<AlgorithmDesc>* out_algorithms);
 
-  virtual bool DoConvolveBackwardData(
-      Stream* stream, const FilterDescriptor& filter_descriptor,
-      const DeviceMemory<double>& filter_data,
-      const BatchDescriptor& output_descriptor,
-      DeviceMemory<double> backward_output_data,
-      const ConvolutionDescriptor& convolution_descriptor,
-      const BatchDescriptor& input_descriptor,
-      DeviceMemory<double>* backward_input_data,
-      ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
-      ProfileResult* output_profile_result) = 0;
-
-  virtual bool DoConvolveBackwardData(
-      Stream* stream, const FilterDescriptor& filter_descriptor,
-      const DeviceMemory<Eigen::half>& filter_data,
-      const BatchDescriptor& output_descriptor,
-      DeviceMemory<Eigen::half> backward_output_data,
-      const ConvolutionDescriptor& convolution_descriptor,
-      const BatchDescriptor& input_descriptor,
-      DeviceMemory<Eigen::half>* backward_input_data,
-      ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
-      ProfileResult* output_profile_result) = 0;
-
   // Enqueues a single-precision backward convolution (for filter) operation
   // onto the stream.
   //
@@ -1379,17 +1384,27 @@ class DnnSupport {
   //    backprop of the filter.
   //  scratch_allocator: un-owned, may-be-null object that may allocate scratch
   //    space in order to speed up the convolution operation.
-  virtual bool DoConvolveBackwardFilter(
+  template <typename ElementType>
+  bool DoConvolveBackwardFilter(
       Stream* stream, const BatchDescriptor& input_descriptor,
-      const DeviceMemory<float>& input_data,
+      const DeviceMemory<ElementType>& input_data,
       const BatchDescriptor& output_descriptor,
-      DeviceMemory<float> backward_output_data,
+      const DeviceMemory<ElementType>& backward_output_data,
       const ConvolutionDescriptor& convolution_descriptor,
       const FilterDescriptor& filter_descriptor,
-      DeviceMemory<float>* backward_filter_data,
-      ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
-      ProfileResult* output_profile_result) = 0;
+      DeviceMemory<ElementType>* backward_filter_data,
+      const dnn::AlgorithmDesc& algorithm_desc,
+      DeviceMemory<uint8>* scratch_memory,
+      ProfileResult* output_profile_result) {
+    return IsStatusOk(
+        DoConvolve(ConvolutionKind::BACKWARD_FILTER,
+                   ToDataType<ElementType>::value, stream, input_descriptor,
+                   input_data, filter_descriptor, *backward_filter_data,
+                   output_descriptor, backward_output_data,
+                   convolution_descriptor, algorithm_desc, *scratch_memory,
+                   output_profile_result),
+        !output_profile_result);
+  }
 
   // Return a list of algorithms supported by the backward convolution pass for
   // filters.
@@ -1397,30 +1412,6 @@ class DnnSupport {
       bool with_winograd_nonfused, int cc_major, int cc_minor,
       std::vector<AlgorithmDesc>* out_algorithms);
 
-  virtual bool DoConvolveBackwardFilter(
-      Stream* stream, const BatchDescriptor& input_descriptor,
-      const DeviceMemory<double>& input_data,
-      const BatchDescriptor& output_descriptor,
-      DeviceMemory<double> backward_output_data,
-      const ConvolutionDescriptor& convolution_descriptor,
-      const FilterDescriptor& filter_descriptor,
-      DeviceMemory<double>* backward_filter_data,
-      ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
-      ProfileResult* output_profile_result) = 0;
-
-  virtual bool DoConvolveBackwardFilter(
-      Stream* stream, const BatchDescriptor& input_descriptor,
-      const DeviceMemory<Eigen::half>& input_data,
-      const BatchDescriptor& output_descriptor,
-      DeviceMemory<Eigen::half> backward_output_data,
-      const ConvolutionDescriptor& convolution_descriptor,
-      const FilterDescriptor& filter_descriptor,
-      DeviceMemory<Eigen::half>* backward_filter_data,
-      ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
-      ProfileResult* output_profile_result) = 0;
-
   // Enqueues a single-precision backward convolution (for bias) operation onto
   // the stream.
   //
@@ -1606,6 +1597,17 @@ class DnnSupport {
     return false;
   }
 
+  virtual bool DoPoolForward(Stream* stream,
+                             const dnn::PoolingDescriptor& pooling_dimensions,
+                             const dnn::BatchDescriptor& input_dimensions,
+                             const DeviceMemory<int8>& input_data,
+                             const dnn::BatchDescriptor& output_dimensions,
+                             DeviceMemory<int8>* output_data,
+                             ScratchAllocator* workspace_allocator) {
+    LOG(FATAL) << "DoPoolForward not implemented for int8.";
+    return false;
+  }
+
   // Performs differentiation of the pooling operation.
   virtual bool DoPoolBackward(Stream* stream,
                               const dnn::PoolingDescriptor& pooling_dimensions,
@@ -1646,21 +1648,9 @@ class DnnSupport {
     return false;
   }
 
-  // Applies local response normalization to the values from
-  // input_data and writes the result to output_data. See comments on
-  // NormalizeDescriptor for a description of local response
-  // normalization.
-  virtual bool DoNormalize(Stream* stream,
-                           const dnn::NormalizeDescriptor& normalize_descriptor,
-                           const DeviceMemory<float>& input_data,
-                           DeviceMemory<float>* output_data) = 0;
-
   // Applies local response normalization to the values from input_data and
   // writes the result to output_data.
   //
-  // Similar to DoNormalize, but normalizes across feature maps and allows for
-  // specifying the dimensions of the tensor.
-  //
   // See comments on NormalizeDescriptor for a description of local response
   // normalization.
   virtual bool DoNormalizeWithDimensions(
@@ -2028,22 +2018,6 @@ class DnnSupport {
       QuantizedActivationMode mode,
       DeviceMemory<float>* gpu_unquantized_dst) = 0;
 
-  // Enqueues an asynchronous copy of the contents of buffer_src to
-  // gpu_unquantized_dst.
-  virtual bool DoCopyHostBuffer2Device(
-      Stream* stream, HostBuffer* buffer_src,
-      DeviceMemory<float>* gpu_unquantized_dst) {
-    return false;
-  }
-
-  // Enqueues an asynchronous copy of the contents of gpu_unquantized_src to
-  // buffer_dst.
-  virtual bool DoCopyDevice2HostBuffer(
-      Stream* stream, const DeviceMemory<float>& gpu_unquantized_src,
-      HostBuffer* buffer_dst) {
-    return false;
-  }
-
   // Create an RNN descriptor based on model shapes and configurations.
   // The caller retains the ownership of the descriptor.
   //
@@ -2080,17 +2054,27 @@ class DnnSupport {
   // sequence. The caller retains the ownership of the returned descriptor.
   //
   // Arguments:
-  //  seq_length: the length of the sequence.
+  //  max_seq_length: the max length of the sequences.
   //  batch_size: the size of a minibatch.
   //  data_size: the size of the state.
+  //  seq_lenghs: the lengths of sequences in a batch.
   //  data_type: an enum to specify the type for the underlying data.
   virtual port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>>
-  createRnnSequenceTensorDescriptor(int seq_length, int batch_size,
+  createRnnSequenceTensorDescriptor(int max_seq_length, int batch_size,
                                     int data_size, dnn::DataType data_type) {
     return port::Status(port::error::UNIMPLEMENTED,
                         "createRnnSequenceTensorDescriptor is unimplemented");
   }
 
+  virtual port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>>
+  createRnnSequenceTensorDescriptor(int max_seq_length, int batch_size,
+                                    int data_size,
+                                    const absl::Span<const int>& seq_lengths,
+                                    dnn::DataType data_type) {
+    return port::Status(port::error::UNIMPLEMENTED,
+                        "createRnnSequenceTensorDescriptor is unimplemented");
+  }
+
   // Create an RNN state descriptor that specifies the input or hidden state.
   // The caller retains the ownership of the returned descriptor.
   virtual port::StatusOr<std::unique_ptr<dnn::RnnStateTensorDescriptor>>
@@ -2338,7 +2322,25 @@ class DnnSupport {
     return false;
   }
 
+ protected:
+  // Returns whether status is 'ok', and potentially logs the error.
+  static bool IsStatusOk(const port::Status& status, bool report_error);
+
  private:
+  virtual port::Status DoPrepareForConvolution(
+      ConvolutionKind kind, DataType element_type, Stream* stream,
+      const BatchDescriptor& batch_descriptor, DeviceMemoryBase input_data,
+      const FilterDescriptor& filter_descriptor, DeviceMemoryBase filter_data,
+      const BatchDescriptor& output_descriptor, DeviceMemoryBase output_data,
+      const ConvolutionDescriptor& convolution_descriptor,
+      const AlgorithmConfig& algorithm_config,
+      ScratchAllocator* scratch_allocator, AlgorithmDesc* algorithm_desc,
+      DeviceMemory<uint8>* scratch_memory) {
+    *algorithm_desc = {};
+    *scratch_memory = {};
+    return port::Status::OK();
+  }
+
   SE_DISALLOW_COPY_AND_ASSIGN(DnnSupport);
 };
 
diff --git a/tensorflow/stream_executor/dnn.proto b/tensorflow/stream_executor/dnn.proto
index 56b079c3f5b962636e7c75b46449adca8e13a43e..11fb5d0f6a02a32fd3c958133136b078ac848ac3 100644
--- a/tensorflow/stream_executor/dnn.proto
+++ b/tensorflow/stream_executor/dnn.proto
@@ -66,6 +66,13 @@ enum ConvolutionMode {
   CONVOLUTION = 1;
 }
 
+enum ConvolutionKind {
+  INVALID = 0;
+  FORWARD = 1;
+  BACKWARD_FILTER = 2;
+  BACKWARD_DATA = 3;
+}
+
 // Generic tensor representation.
 message TensorDescriptorProto {
   repeated int64 dimensions = 1;
@@ -101,3 +108,22 @@ message ConvolutionDescriptorProto {
   int32 group_count = 5;
   ConvolutionMode convolution_mode = 6;
 }
+
+// A convolution. Currently it's only used for logging. In the future, we may
+// want to use it in the API as well.
+message ConvolutionProto {
+  ConvolutionKind kind = 1;
+  TensorDescriptorProto input = 2;
+  TensorDescriptorProto filter = 3;
+  TensorDescriptorProto output = 4;
+  AlgorithmProto algorithm = 5;
+  ConvolutionDescriptorProto conv_desc = 6;
+
+  // result = conv_scale * conv(...) + side_value_scale * side_value.
+  // side_value is an arbitrary buffer if activation is not none. Otherwise, it
+  // has to be the result buffer (using its old values).
+  double conv_scale = 7;
+  double side_value_scale = 8;
+
+  ActivationMode activation = 9;
+}
diff --git a/tensorflow/stream_executor/dso_loader.cc b/tensorflow/stream_executor/dso_loader.cc
deleted file mode 100644
index 6dda5d63155d8f9cf8d068b3feae51b1fba88a51..0000000000000000000000000000000000000000
--- a/tensorflow/stream_executor/dso_loader.cc
+++ /dev/null
@@ -1,294 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// TODO(jhen): Replace hardcoded, platform specific path strings in GetXXXPath()
-// with a function in e.g. cuda.h.
-
-#include "tensorflow/stream_executor/dso_loader.h"
-
-#include <limits.h>
-#include <stdlib.h>
-#include <initializer_list>
-#include <vector>
-
-#include "absl/strings/str_cat.h"
-#include "tensorflow/core/platform/load_library.h"
-#include "tensorflow/stream_executor/lib/env.h"
-#include "tensorflow/stream_executor/lib/error.h"
-#include "tensorflow/stream_executor/lib/path.h"
-#include "tensorflow/stream_executor/lib/str_util.h"
-#include "tensorflow/stream_executor/lib/stringprintf.h"
-#include "tensorflow/stream_executor/platform/logging.h"
-#include "tensorflow/stream_executor/platform/port.h"
-
-#if !defined(PLATFORM_GOOGLE)
-#include "absl/strings/string_view.h"
-#include "cuda/cuda_config.h"
-#endif
-
-namespace stream_executor {
-namespace internal {
-
-string GetCudaVersion() { return TF_CUDA_VERSION; }
-string GetCudnnVersion() { return TF_CUDNN_VERSION; }
-
-/* static */ port::Status DsoLoader::GetCublasDsoHandle(void** dso_handle) {
-  return GetDsoHandle(FindDsoPath(port::Env::Default()->FormatLibraryFileName(
-                                      "cublas", GetCudaVersion()),
-                                  GetCudaLibraryDirPath()),
-                      dso_handle);
-}
-
-/* static */ port::Status DsoLoader::GetCudnnDsoHandle(void** dso_handle) {
-  // libcudnn is versioned differently than the other libraries and may have a
-  // different version number than other CUDA libraries.  See b/22397368 for
-  // some details about the complications surrounding this.
-  return GetDsoHandle(FindDsoPath(port::Env::Default()->FormatLibraryFileName(
-                                      "cudnn", GetCudnnVersion()),
-                                  GetCudaLibraryDirPath()),
-                      dso_handle);
-}
-
-/* static */ port::Status DsoLoader::GetCufftDsoHandle(void** dso_handle) {
-  return GetDsoHandle(FindDsoPath(port::Env::Default()->FormatLibraryFileName(
-                                      "cufft", GetCudaVersion()),
-                                  GetCudaLibraryDirPath()),
-                      dso_handle);
-}
-
-/* static */ port::Status DsoLoader::GetCurandDsoHandle(void** dso_handle) {
-  return GetDsoHandle(FindDsoPath(port::Env::Default()->FormatLibraryFileName(
-                                      "curand", GetCudaVersion()),
-                                  GetCudaLibraryDirPath()),
-                      dso_handle);
-}
-
-/* static */ port::Status DsoLoader::GetLibcudaDsoHandle(void** dso_handle) {
-#if defined(PLATFORM_WINDOWS)
-  return GetDsoHandle(
-      FindDsoPath(port::Env::Default()->FormatLibraryFileName("nvcuda", ""),
-                  GetCudaDriverLibraryPath()),
-      dso_handle);
-#else
-  port::Status status = GetDsoHandle(
-      FindDsoPath(port::Env::Default()->FormatLibraryFileName("cuda", "1"),
-                  GetCudaDriverLibraryPath()),
-      dso_handle);
-#if defined(__APPLE__)
-  // On Mac OS X, CUDA sometimes installs libcuda.dylib instead of
-  // libcuda.1.dylib.
-  return status.ok() ? status : GetDsoHandle(
-     FindDsoPath(port::Env::Default()->FormatLibraryFileName("cuda", ""),
-                 GetCudaDriverLibraryPath()),
-     dso_handle);
-#else
-  return status;
-#endif
-#endif
-}
-
-/* static */ port::Status DsoLoader::GetLibcuptiDsoHandle(void** dso_handle) {
-#if defined(ANDROID_TEGRA)
-  // On Android devices the CUDA version number is not added to the library
-  // name.
-  return GetDsoHandle(
-      FindDsoPath(port::Env::Default()->FormatLibraryFileName("cupti", ""),
-                  GetCudaCuptiLibraryPath()),
-      dso_handle);
-#else
-  return GetDsoHandle(FindDsoPath(port::Env::Default()->FormatLibraryFileName(
-                                      "cupti", GetCudaVersion()),
-                                  GetCudaCuptiLibraryPath()),
-                      dso_handle);
-#endif
-}
-
-static mutex& GetRpathMutex() {
-  static mutex* mu = new mutex;
-  return *mu;
-}
-
-/* static */ void DsoLoader::RegisterRpath(absl::string_view path) {
-  mutex_lock lock{GetRpathMutex()};
-  GetRpaths()->emplace_back(path);
-}
-
-/* static */ port::Status DsoLoader::GetDsoHandle(absl::string_view path,
-                                                  void** dso_handle,
-                                                  LoadKind load_kind) {
-  if (load_kind != LoadKind::kLocal) {
-    return port::Status(port::error::INVALID_ARGUMENT,
-                        "Only LoadKind::kLocal is currently supported");
-  }
-  string path_string(path);
-  port::Status s =
-      port::Env::Default()->LoadLibrary(path_string.c_str(), dso_handle);
-  if (!s.ok()) {
-#if !defined(PLATFORM_WINDOWS)
-    char* ld_library_path = getenv("LD_LIBRARY_PATH");
-#endif
-    LOG(INFO) << "Couldn't open CUDA library " << path
-#if !defined(PLATFORM_WINDOWS)
-              << ". LD_LIBRARY_PATH: "
-              << (ld_library_path != nullptr ? ld_library_path : "")
-#endif
-    ;
-    return port::Status(port::error::FAILED_PRECONDITION,
-                        absl::StrCat("could not dlopen DSO: ", path,
-                                     "; dlerror: ", s.error_message()));
-  }
-  LOG(INFO) << "successfully opened CUDA library " << path << " locally";
-  return port::Status::OK();
-}
-
-/* static */ string DsoLoader::GetBinaryDirectory(bool strip_executable_name) {
-  string exe_path = port::Env::Default()->GetExecutablePath();
-  return strip_executable_name ? string(port::Dirname(exe_path)) : exe_path;
-}
-
-// Creates a heap-allocated vector for initial rpaths.
-// Ownership is transferred to the caller.
-static std::vector<string>* CreatePrimordialRpaths() {
-  auto rpaths = new std::vector<string>;
-#if defined(__APPLE__)
-  rpaths->push_back("driver/driver_sh.runfiles/local_config_cuda/cuda/lib");
-#else
-  rpaths->push_back("driver/driver_sh.runfiles/local_config_cuda/cuda/lib64");
-#endif
-  return rpaths;
-}
-
-/* static */ std::vector<string>* DsoLoader::GetRpaths() {
-  static std::vector<string>* rpaths = CreatePrimordialRpaths();
-  return rpaths;
-}
-
-/* static */ bool DsoLoader::TrySymbolicDereference(string* candidate) {
-#if defined(PLATFORM_WINDOWS)
-  return false;
-#else
-  char buf[PATH_MAX];
-  char* result = realpath(candidate->c_str(), buf);
-  if (result == nullptr) {
-    return false;
-  }
-  VLOG(3) << "realpath resolved candidate path \"" << *candidate << "\" to \""
-          << result << "\"";
-  *candidate = result;
-  return true;
-#endif
-}
-
-/* static */ string DsoLoader::FindDsoPath(absl::string_view library_name,
-                                           absl::string_view runfiles_relpath) {
-  // Keep a record of the paths we attempted so we can dump out meaningful
-  // diagnostics if no path is found.
-  std::vector<string> attempted;
-
-  using StringPieces = std::vector<absl::string_view>;
-  string candidate;
-
-  // Otherwise, try binary-plus-rpath locations.
-  string binary_directory =
-      GetBinaryDirectory(true /* = strip_executable_name */);
-  mutex_lock lock{GetRpathMutex()};
-  for (const string& rpath : *GetRpaths()) {
-    candidate =
-        port::Join(StringPieces{binary_directory, rpath, library_name}, "/");
-    if (TrySymbolicDereference(&candidate)) {
-      return candidate;
-    }
-  }
-  attempted.push_back(candidate);
-
-  return string(library_name);
-}
-
-/* static */ string DsoLoader::GetCudaLibraryDirPath() {
-#if defined(__APPLE__)
-  return "external/local_config_cuda/cuda/lib";
-#else
-  return "external/local_config_cuda/cuda/lib64";
-#endif
-}
-
-/* static */ string DsoLoader::GetCudaDriverLibraryPath() {
-#if defined(__APPLE__)
-  return "external/local_config_cuda/cuda/driver/lib";
-#elif defined(PLATFORM_WINDOWS)
-  return "";
-#else
-  return "external/local_config_cuda/cuda/driver/lib64";
-#endif
-}
-
-/* static */ string DsoLoader::GetCudaCuptiLibraryPath() {
-#if defined(__APPLE__)
-  return "external/local_config_cuda/cuda/extras/CUPTI/lib";
-#else
-  return "external/local_config_cuda/cuda/extras/CUPTI/lib64";
-#endif
-}
-
-// -- CachedDsoLoader
-
-/* static */ port::StatusOr<void*> CachedDsoLoader::GetCublasDsoHandle() {
-  static port::StatusOr<void*> result =
-      FetchHandleResult(DsoLoader::GetCublasDsoHandle);
-  return result;
-}
-
-/* static */ port::StatusOr<void*> CachedDsoLoader::GetCurandDsoHandle() {
-  static port::StatusOr<void*> result =
-      FetchHandleResult(DsoLoader::GetCurandDsoHandle);
-  return result;
-}
-
-/* static */ port::StatusOr<void*> CachedDsoLoader::GetCudnnDsoHandle() {
-  static port::StatusOr<void*> result =
-      FetchHandleResult(DsoLoader::GetCudnnDsoHandle);
-  return result;
-}
-
-/* static */ port::StatusOr<void*> CachedDsoLoader::GetCufftDsoHandle() {
-  static port::StatusOr<void*> result =
-      FetchHandleResult(DsoLoader::GetCufftDsoHandle);
-  return result;
-}
-
-/* static */ port::StatusOr<void*> CachedDsoLoader::GetLibcudaDsoHandle() {
-  static port::StatusOr<void*> result =
-      FetchHandleResult(DsoLoader::GetLibcudaDsoHandle);
-  return result;
-}
-
-/* static */ port::StatusOr<void*> CachedDsoLoader::GetLibcuptiDsoHandle() {
-  static port::StatusOr<void*> result =
-      FetchHandleResult(DsoLoader::GetLibcuptiDsoHandle);
-  return result;
-}
-
-/* static */ port::StatusOr<void*> CachedDsoLoader::FetchHandleResult(
-    std::function<port::Status(void**)> load_dso) {
-  void* handle;
-  auto status = load_dso(&handle);
-  if (!status.ok()) {
-    return status;
-  }
-  return handle;
-}
-
-}  // namespace internal
-}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/dso_loader.h b/tensorflow/stream_executor/dso_loader.h
deleted file mode 100644
index f063b68d6058f7b1faecfd83d3d21b899cf027a3..0000000000000000000000000000000000000000
--- a/tensorflow/stream_executor/dso_loader.h
+++ /dev/null
@@ -1,117 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Common DSO loading functionality: exposes callables that dlopen DSOs
-// in either the runfiles directories
-
-#ifndef TENSORFLOW_STREAM_EXECUTOR_DSO_LOADER_H_
-#define TENSORFLOW_STREAM_EXECUTOR_DSO_LOADER_H_
-
-#include "tensorflow/stream_executor/platform/port.h"
-#include <vector>
-
-#include "absl/strings/string_view.h"
-#include "tensorflow/stream_executor/lib/status.h"
-#include "tensorflow/stream_executor/lib/statusor.h"
-#include "tensorflow/stream_executor/platform.h"
-#include "tensorflow/stream_executor/platform/mutex.h"
-
-namespace stream_executor {
-namespace internal {
-
-// Permits StreamExecutor code to dynamically load a pre-determined set of
-// relevant DSOs via dlopen.
-//
-// Thread-safe.
-class DsoLoader {
- public:
-  // The following methods either load the DSO of interest and return a dlopen
-  // handle or error status in the canonical namespace.
-
-  static port::Status GetCublasDsoHandle(void** dso_handle);
-  static port::Status GetCudnnDsoHandle(void** dso_handle);
-  static port::Status GetCufftDsoHandle(void** dso_handle);
-  static port::Status GetCurandDsoHandle(void** dso_handle);
-  static port::Status GetLibcudaDsoHandle(void** dso_handle);
-  static port::Status GetLibcuptiDsoHandle(void** dso_handle);
-
-  // Registers a new binary-relative path to use as a dlopen search path.
-  static void RegisterRpath(absl::string_view path);
-
- private:
-  // Registered rpaths (singleton vector) and a mutex that guards it.
-  static std::vector<string>* GetRpaths();
-
-  // Descriptive boolean wrapper to indicate whether symbols are made available
-  // to resolve in later-loaded libraries.
-  enum class LoadKind { kLocal, kGlobal };
-
-  // Loads a DSO from the given "path" (which can technically be any dlopen-able
-  // name). If the load kind is global, the symbols in the loaded DSO are
-  // visible to subsequent DSO loading operations.
-  static port::Status GetDsoHandle(absl::string_view path, void** dso_handle,
-                                   LoadKind load_kind = LoadKind::kLocal);
-
-  // Returns the binary directory (or binary path) associated with the currently
-  // executing program. If strip_executable_name is true, the executable file is
-  // stripped off of the path.
-  static string GetBinaryDirectory(bool strip_executable_name);
-
-  // Invokes realpath on the original path; updates candidate and returns true
-  // if it succeeds (i.e. a file exists at the path); otherwise, returns false.
-  static bool TrySymbolicDereference(string* candidate);
-
-  // Attempts to find a path to the DSO of interest, otherwise returns the
-  // bare library name:
-  // Arguments:
-  //   library_name: the filename in tree; e.g. libOpenCL.so.1.0.0
-  //   runfiles_relpath: where to look for the library relative to the runfiles
-  //      root; e.g. third_party/gpus/cuda/lib64
-  static string FindDsoPath(absl::string_view library_name,
-                            absl::string_view runfiles_relpath);
-
-  // Return platform dependent paths for DSOs
-  static string GetCudaLibraryDirPath();
-  static string GetCudaDriverLibraryPath();
-  static string GetCudaCuptiLibraryPath();
-
-  SE_DISALLOW_COPY_AND_ASSIGN(DsoLoader);
-};
-
-// Wrapper around the DsoLoader that prevents us from dlopen'ing any of the DSOs
-// more than once.
-class CachedDsoLoader {
- public:
-  // Cached versions of the corresponding DsoLoader methods above.
-  static port::StatusOr<void*> GetCublasDsoHandle();
-  static port::StatusOr<void*> GetCudnnDsoHandle();
-  static port::StatusOr<void*> GetCufftDsoHandle();
-  static port::StatusOr<void*> GetCurandDsoHandle();
-  static port::StatusOr<void*> GetLibcudaDsoHandle();
-  static port::StatusOr<void*> GetLibcuptiDsoHandle();
-
- private:
-  // Fetches a DSO handle via "load_dso" and returns the StatusOr form of the
-  // result.
-  static port::StatusOr<void*> FetchHandleResult(
-      std::function<port::Status(void**)> load_dso);
-
-  SE_DISALLOW_COPY_AND_ASSIGN(CachedDsoLoader);
-};
-
-}  // namespace internal
-}  // namespace stream_executor
-
-#endif  // TENSORFLOW_STREAM_EXECUTOR_DSO_LOADER_H_
diff --git a/tensorflow/stream_executor/gpu/BUILD b/tensorflow/stream_executor/gpu/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..e6812389e3eafc40365861722be3edb414dd05c7
--- /dev/null
+++ b/tensorflow/stream_executor/gpu/BUILD
@@ -0,0 +1,189 @@
+# Description:
+#   GPU-platform specific StreamExecutor support code.
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow/stream_executor:build_defs.bzl",
+    "if_gpu_is_configured",
+)
+load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured")
+load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm_is_configured")
+
+package(
+    default_visibility = ["//tensorflow/stream_executor:__subpackages__"],
+)
+
+# Filegroup used to collect source files for the dependency check.
+filegroup(
+    name = "c_srcs",
+    data = glob([
+        "**/*.cc",
+        "**/*.h",
+    ]),
+)
+
+cc_library(
+    name = "gpu_activation_header",
+    hdrs = ["gpu_activation.h"],
+    deps = ["//tensorflow/stream_executor/platform"],
+)
+
+cc_library(
+    name = "gpu_activation",
+    srcs = if_gpu_is_configured(["gpu_activation.cc"]),
+    hdrs = if_gpu_is_configured(["gpu_activation.h"]),
+    deps = if_gpu_is_configured([
+        ":gpu_activation_header",
+        ":gpu_driver_header",
+        "//tensorflow/stream_executor",
+        "//tensorflow/stream_executor:stream_executor_internal",
+        "//tensorflow/stream_executor/platform",
+    ]),
+)
+
+cc_library(
+    name = "gpu_diagnostics_header",
+    hdrs = if_gpu_is_configured(["gpu_diagnostics.h"]),
+    deps = [
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+    ],
+)
+
+cc_library(
+    name = "gpu_driver_header",
+    hdrs = if_gpu_is_configured(["gpu_driver.h"]),
+    deps = [
+        ":gpu_types_header",
+        "//tensorflow/stream_executor:device_options",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@local_config_cuda//cuda:cuda_headers",
+    ],
+)
+
+cc_library(
+    name = "gpu_event_header",
+    hdrs = if_gpu_is_configured(["gpu_event.h"]),
+    deps = [
+        ":gpu_driver_header",
+        ":gpu_stream_header",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor/lib",
+    ],
+)
+
+cc_library(
+    name = "gpu_event",
+    srcs = if_gpu_is_configured(["gpu_event.cc"]),
+    hdrs = if_gpu_is_configured(["gpu_event.h"]),
+    deps = [
+        ":gpu_driver_header",
+        ":gpu_executor_header",
+        ":gpu_stream",
+        "//tensorflow/stream_executor:stream_executor_headers",
+        "//tensorflow/stream_executor/lib",
+    ],
+)
+
+cc_library(
+    name = "gpu_executor_header",
+    hdrs = if_gpu_is_configured(["gpu_executor.h"]),
+    deps = [
+        ":gpu_kernel_header",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:platform",
+        "//tensorflow/stream_executor:stream_executor_internal",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
+    name = "gpu_helpers_header",
+    hdrs = if_gpu_is_configured(["gpu_helpers.h"]),
+    deps = [":gpu_types_header"],
+)
+
+cc_library(
+    name = "gpu_kernel_header",
+    hdrs = if_gpu_is_configured(["gpu_kernel.h"]),
+    deps = [
+        ":gpu_driver_header",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:stream_executor_pimpl_header",
+        "//tensorflow/stream_executor/platform",
+    ],
+)
+
+cc_library(
+    name = "gpu_rng_header",
+    hdrs = if_gpu_is_configured(["gpu_rng.h"]),
+    deps = [
+        ":gpu_types_header",
+        "//tensorflow/stream_executor:plugin_registry",
+        "//tensorflow/stream_executor:rng",
+        "//tensorflow/stream_executor/platform",
+    ],
+)
+
+cc_library(
+    name = "gpu_stream_header",
+    hdrs = if_gpu_is_configured(["gpu_stream.h"]),
+    deps = [
+        ":gpu_driver_header",
+        "//tensorflow/stream_executor:stream_executor_internal",
+        "//tensorflow/stream_executor/platform",
+    ],
+)
+
+cc_library(
+    name = "gpu_stream",
+    srcs = if_gpu_is_configured(["gpu_stream.cc"]),
+    hdrs = if_gpu_is_configured(["gpu_stream.h"]),
+    deps = [
+        ":gpu_driver_header",
+        ":gpu_executor_header",
+        "//tensorflow/stream_executor:stream_executor_headers",
+        "//tensorflow/stream_executor:stream_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+    ],
+)
+
+cc_library(
+    name = "gpu_timer_header",
+    hdrs = if_gpu_is_configured(["gpu_timer.h"]),
+    deps = [
+        ":gpu_driver_header",
+        ":gpu_executor_header",
+        "//tensorflow/stream_executor:stream_executor_internal",
+    ],
+)
+
+cc_library(
+    name = "gpu_timer",
+    srcs = if_gpu_is_configured(["gpu_timer.cc"]),
+    hdrs = if_gpu_is_configured(["gpu_timer.h"]),
+    deps = [
+        ":gpu_driver_header",
+        ":gpu_executor_header",
+        ":gpu_stream",
+        "//tensorflow/stream_executor:stream_executor_headers",
+        "//tensorflow/stream_executor/lib",
+    ],
+)
+
+cc_library(
+    name = "gpu_types_header",
+    hdrs = if_gpu_is_configured(["gpu_types.h"]),
+    deps = [
+        "//tensorflow/stream_executor/platform",
+    ] + if_cuda_is_configured([
+        "@local_config_cuda//cuda:cuda_headers",
+    ]) + if_rocm_is_configured([
+        "@local_config_rocm//rocm:rocm_headers",
+    ]),
+)
diff --git a/tensorflow/stream_executor/cuda/cuda_activation.cc b/tensorflow/stream_executor/gpu/gpu_activation.cc
similarity index 62%
rename from tensorflow/stream_executor/cuda/cuda_activation.cc
rename to tensorflow/stream_executor/gpu/gpu_activation.cc
index 02371c3c3ab403e9b3303fbbafdef18c30196f4f..6f74eef2dbc106c14f04736418f3e42adb68f0b9 100644
--- a/tensorflow/stream_executor/cuda/cuda_activation.cc
+++ b/tensorflow/stream_executor/gpu/gpu_activation.cc
@@ -1,4 +1,4 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,36 +13,36 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/stream_executor/cuda/cuda_activation.h"
+#include "tensorflow/stream_executor/gpu/gpu_activation.h"
 
-#include "tensorflow/stream_executor/cuda/cuda_driver.h"
+#include "tensorflow/stream_executor/gpu/gpu_driver.h"
 #include "tensorflow/stream_executor/stream_executor.h"
 #include "tensorflow/stream_executor/stream_executor_internal.h"
 
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 
-CudaContext* ExtractCudaContext(CUDAExecutor *cuda_exec);
-CUDAExecutor *ExtractCudaExecutor(StreamExecutor *stream_exec);
+GpuContext* ExtractGpuContext(GpuExecutor* gpu_exec);
+GpuExecutor* ExtractGpuExecutor(StreamExecutor* stream_exec);
 
 ScopedActivateExecutorContext::ScopedActivateExecutorContext(
-    CUDAExecutor *cuda_exec):
-      driver_scoped_activate_context_(
-          new ScopedActivateContext{ExtractCudaContext(cuda_exec)}) { }
+    GpuExecutor* gpu_exec)
+    : driver_scoped_activate_context_(
+          new ScopedActivateContext{ExtractGpuContext(gpu_exec)}) {}
 
 ScopedActivateExecutorContext::ScopedActivateExecutorContext(
-    StreamExecutor *stream_exec)
-    : ScopedActivateExecutorContext(ExtractCudaExecutor(stream_exec)) {}
+    StreamExecutor* stream_exec)
+    : ScopedActivateExecutorContext(ExtractGpuExecutor(stream_exec)) {}
 
 ScopedActivateExecutorContext::~ScopedActivateExecutorContext() {
-  delete static_cast<ScopedActivateContext *>(driver_scoped_activate_context_);
+  delete static_cast<ScopedActivateContext*>(driver_scoped_activate_context_);
 }
 
 ScopedActivateExecutorContext::ScopedActivateExecutorContext(
-    ScopedActivateExecutorContext &&other)
+    ScopedActivateExecutorContext&& other)
     : driver_scoped_activate_context_(other.driver_scoped_activate_context_) {
   other.driver_scoped_activate_context_ = nullptr;
 }
 
-}  // namespace cuda
+}  // namespace gpu
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/gpu/gpu_activation.h b/tensorflow/stream_executor/gpu/gpu_activation.h
new file mode 100644
index 0000000000000000000000000000000000000000..3409304d7796bfac92295b2eecc10e2f9487c018
--- /dev/null
+++ b/tensorflow/stream_executor/gpu/gpu_activation.h
@@ -0,0 +1,61 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This file contains APIs that assume a StreamExecutor is backed by CUDA.
+// It reaches into the CUDA implementation to activate an underlying CUDA
+// context.
+//
+// Having this file separate from gpu/gpu_executor.h means that dependent
+// code does not also have to depend on cuda.h.
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_ACTIVATION_H_
+#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_ACTIVATION_H_
+
+#include "tensorflow/stream_executor/platform/port.h"
+
+namespace stream_executor {
+
+class StreamExecutor;
+
+namespace gpu {
+
+class GpuExecutor;
+class ScopedActivateContext;
+
+// Activates a CUDA context within an enclosing scope.
+class ScopedActivateExecutorContext {
+ public:
+  // Form that takes a CUDA executor implementation.
+  explicit ScopedActivateExecutorContext(GpuExecutor* gpu_exec);
+
+  // Form that takes a pImpl executor and extracts a CUDA implementation --
+  // fatal failure if it is not CUDA inside.
+  explicit ScopedActivateExecutorContext(StreamExecutor* stream_exec);
+
+  ScopedActivateExecutorContext(ScopedActivateExecutorContext&& other);
+
+  ~ScopedActivateExecutorContext();
+
+ private:
+  // The cuda.h-using datatype that we wrap.
+  ScopedActivateContext* driver_scoped_activate_context_;
+
+  SE_DISALLOW_COPY_AND_ASSIGN(ScopedActivateExecutorContext);
+};
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_ACTIVATION_H_
diff --git a/tensorflow/stream_executor/gpu/gpu_diagnostics.h b/tensorflow/stream_executor/gpu/gpu_diagnostics.h
new file mode 100644
index 0000000000000000000000000000000000000000..71642109b57fd9b4e0a0a3dbc4efee7991bb6f03
--- /dev/null
+++ b/tensorflow/stream_executor/gpu/gpu_diagnostics.h
@@ -0,0 +1,99 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DIAGNOSTICS_H_
+#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DIAGNOSTICS_H_
+
+#include <tuple>
+#include "tensorflow/stream_executor/platform/port.h"
+
+#include "tensorflow/stream_executor/lib/statusor.h"
+#include "tensorflow/stream_executor/platform/port.h"
+
+namespace stream_executor {
+namespace gpu {
+
+// e.g. DriverVersion{346, 3, 4}
+using DriverVersion = std::tuple<int, int, int>;
+
+// FIXME: These functions are in stream_executor::cuda namespaces for now
+// Will move to stream_executor::gpu namespace in the near future
+//
+//// Converts a parsed driver version to string form.
+// string DriverVersionToString(DriverVersion version);
+//
+//// Converts a parsed driver version or status value to natural string form.
+// string DriverVersionStatusToString(port::StatusOr<DriverVersion> version);
+//
+//// Converts a string of a form like "331.79" to a DriverVersion{331, 79}.
+// port::StatusOr<DriverVersion> StringToDriverVersion(const string& value);
+
+class Diagnostician {
+ public:
+  // Logs diagnostic information when CUDA appears to be misconfigured (e.g. is
+  // not initializing).
+  //
+  // Note: if we're running on a machine that has no GPUs, we don't want to
+  // produce very much log spew beyond saying, "looks like there's no CUDA
+  // kernel
+  // module running".
+  //
+  // Note: we use non-Google-File:: API here because we may be called before
+  // InitGoogle has completed.
+  static void LogDiagnosticInformation();
+
+  // Given the driver version file contents, finds the kernel module version and
+  // returns it as a string.
+  //
+  // This is solely used for more informative log messages when the user is
+  // running on a machine that happens to have a libcuda/kernel driver mismatch.
+  static port::StatusOr<DriverVersion> FindKernelModuleVersion(
+      const string& driver_version_file_contents);
+
+  // Extracts the kernel driver version from the current host.
+  static port::StatusOr<DriverVersion> FindKernelDriverVersion();
+
+  // Iterates through loaded DSOs with DlIteratePhdrCallback to find the
+  // driver-interfacing DSO version number. Returns it as a string.
+  static port::StatusOr<DriverVersion> FindDsoVersion();
+
+  // Logs information about the kernel driver version and userspace driver
+  // library version.
+  static void LogDriverVersionInformation();
+
+ private:
+  // Given the DSO version number and the driver version file contents, extracts
+  // the driver version and compares, warning the user in the case of
+  // incompatibility.
+  //
+  // This is solely used for more informative log messages when the user is
+  // running on a machine that happens to have a libcuda/kernel driver mismatch.
+  static void WarnOnDsoKernelMismatch(
+      port::StatusOr<DriverVersion> dso_version,
+      port::StatusOr<DriverVersion> kernel_version);
+
+  // Logs information about the dev nodes present on this machine: their
+  // existence, permissions, accessibility from this uid/gid.
+  static void LogDevNodeDiagnosticInformation();
+
+  static string GetDevNodePath(int dev_node_ordinal);
+
+  SE_DISALLOW_COPY_AND_ASSIGN(Diagnostician);
+};
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DIAGNOSTICS_H_
diff --git a/tensorflow/stream_executor/gpu/gpu_driver.h b/tensorflow/stream_executor/gpu/gpu_driver.h
new file mode 100644
index 0000000000000000000000000000000000000000..a5ef48db4704015c51fb1d0e203e541b6b79afc5
--- /dev/null
+++ b/tensorflow/stream_executor/gpu/gpu_driver.h
@@ -0,0 +1,525 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// CUDA userspace driver library wrapper functionality.
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DRIVER_H_
+#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DRIVER_H_
+
+#include <stddef.h>
+#include "tensorflow/stream_executor/platform/port.h"
+
+#include "cuda/include/cuda.h"
+#include "tensorflow/stream_executor/device_options.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/lib/statusor.h"
+#include "tensorflow/stream_executor/platform/port.h"
+
+#include "tensorflow/stream_executor/gpu/gpu_types.h"
+
+namespace stream_executor {
+namespace gpu {
+
+// Identifies the memory space where an allocation resides. See
+// GpuDriver::GetPointerMemorySpace().
+enum class MemorySpace { kHost, kDevice };
+
+// Returns a casual string, such as "host" for the provided memory space.
+string MemorySpaceString(MemorySpace memory_space);
+
+class GpuContext;
+
+// GpuDriver contains wrappers for calls to the userspace library driver. It's
+// useful to isolate these calls and put basic wrappers around them to separate
+// userspace library driver behaviors from the rest of the program.
+//
+// At the moment it's simply used as a namespace.
+//
+// The calls log any specific errors internally and return whether the operation
+// was successful to the caller.
+//
+// The order of parameters is generally kept symmetric with the underlying CUDA
+// driver API.
+//
+// Links on functions are to specific documentation under
+// http://docs.nvidia.com/cuda/cuda-driver-api/
+//
+// Thread safety: these functions should not be used from signal handlers.
+class GpuDriver {
+ public:
+  // Wraps a call to cuInit with logging to help indicate what has gone wrong in
+  // the case of failure. Safe to call multiple times; will be fast on all calls
+  // after the first.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__INITIALIZE.html#group__CUDA__INITIALIZE_1g0a2f1517e1bd8502c7194c3a8c134bc3
+  static port::Status Init();
+
+  // Returns the device associated with the given context.
+  // device is an outparam owned by the caller, must not be null.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g4e84b109eba36cdaaade167f34ae881e
+  static port::StatusOr<GpuDeviceHandle> DeviceFromContext(GpuContext* context);
+
+  // Creates a new CUDA stream associated with the given context via
+  // cuStreamCreate.
+  // stream is an outparam owned by the caller, must not be null.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1ga581f0c5833e21ded8b5a56594e243f4
+  static bool CreateStream(GpuContext* context, GpuStreamHandle* stream);
+
+  // Destroys a CUDA stream associated with the given context.
+  // stream is owned by the caller, must not be null, and *stream is set to null
+  // if the stream is successfully destroyed.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g244c8833de4596bcd31a06cdf21ee758
+  static void DestroyStream(GpuContext* context, GpuStreamHandle* stream);
+
+  // CUDA events can explicitly disable event TSC retrieval for some presumed
+  // performance improvement if timing is unnecessary.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g450687e75f3ff992fe01662a43d9d3db
+  enum class EventFlags { kDefault, kDisableTiming };
+
+  // Creates a new event associated with the given context.
+  // result is an outparam owned by the caller and must not be null.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g450687e75f3ff992fe01662a43d9d3db
+  static port::Status CreateEvent(GpuContext* context, GpuEventHandle* result,
+                                  EventFlags flags);
+
+  // Destroys *event and turns it into a nullptr. event may not be null, but
+  // *event may be, via cuEventDestroy
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g593ec73a8ec5a5fc031311d3e4dca1ef
+  static port::Status DestroyEvent(GpuContext* context, GpuEventHandle* event);
+
+  // Allocates a GPU memory space of size bytes associated with the given
+  // context via cuMemAlloc.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb82d2a09844a58dd9e744dc31e8aa467
+  static void* DeviceAllocate(GpuContext* context, uint64 bytes);
+
+  // Deallocates a GPU memory space of size bytes associated with the given
+  // context via cuMemFree.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a
+  static void DeviceDeallocate(GpuContext* context, void* location);
+
+  // Allocates a unified memory space of size bytes associated with the given
+  // context via cuMemAllocManaged.
+  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb347ded34dc326af404aa02af5388a32
+  // (supported on CUDA only)
+  static void* UnifiedMemoryAllocate(GpuContext* context, uint64 bytes);
+
+  // Deallocates a unified memory space of size bytes associated with the given
+  // context via cuMemFree.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a
+  // (supported on CUDA only)
+  static void UnifiedMemoryDeallocate(GpuContext* context, void* location);
+
+  // Allocates page-locked and CUDA-registered memory on the host via
+  // cuMemAllocHost.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gdd8311286d2c2691605362c689bc64e0
+  static void* HostAllocate(GpuContext* context, uint64 bytes);
+
+  // Deallocates a location created by HostAllocate, via cuMemFreeHost.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g62e0fdbe181dab6b1c90fa1a51c7b92c
+  static void HostDeallocate(GpuContext* context, void* location);
+
+  // Registers a memory region at location of size bytes via cuMemHostRegister.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gf0a9fe11544326dabd743b7aa6b54223
+  static bool HostRegister(GpuContext* context, void* location, uint64 bytes);
+
+  // Unregisters a memory region that was previously registered at location via
+  // cuMemHostUnregister.
+  //
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g63f450c8125359be87b7623b1c0b2a14
+  //
+  // TODO(leary) verify an error will be returned if the location wasn't
+  // previously registered.
+  static bool HostUnregister(GpuContext* context, void* location);
+
+  // Given a device ordinal, returns a device handle into the device outparam,
+  // which must not be null.
+  //
+  // N.B. these device handles do not have a corresponding destroy function in
+  // the CUDA driver API.
+  static port::Status GetDevice(int device_ordinal, GpuDeviceHandle* device);
+
+  // Given a device handle, returns the name reported by the driver for the
+  // device.
+  static bool GetDeviceName(GpuDeviceHandle device, string* device_name);
+
+  // Given a device to create a context for, returns a context handle into the
+  // context outparam, which must not be null.
+  //
+  // N.B. CUDA contexts are weird. They are implicitly associated with the
+  // calling thread. Current documentation on contexts and their influence on
+  // userspace processes is given here:
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g65dc0012348bc84810e2103a40d8e2cf
+  static port::Status CreateContext(int device_ordinal, GpuDeviceHandle device,
+                                    const DeviceOptions& device_options,
+                                    GpuContext** context);
+
+  // Destroys the provided context via cuCtxDestroy.
+  // Don't do this while clients could still be using the context, per the docs
+  // bad things will happen.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g27a365aebb0eb548166309f58a1e8b8e
+  static void DestroyContext(GpuContext* context);
+
+  // Queries the runtime for the specified attribute of the specified function.
+  // cuFuncGetAttribute (the underlying CUDA driver API routine) only operates
+  // in terms of integer-sized values, so there's no potential for overrun (as
+  // of CUDA 5.5).
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g5e92a1b0d8d1b82cb00dcfb2de15961b
+  static bool FuncGetAttribute(GpuFunctionAttribute attribute,
+                               GpuFunctionHandle function,
+                               int* attribute_value);
+
+  // Sets the preferred cache configuration for the specified function.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g40f8c11e81def95dc0072a375f965681
+  static bool FuncSetCacheConfig(GpuFunctionHandle function,
+                                 GpuFuncCachePreference cache_config);
+
+  // Gets the preferred shared memory bank configuration for the specified
+  // CONTEXT (not function!), either default or four- or eight-byte bank size.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g17153a1b8b8c756f7ab8505686a4ad74
+  static port::StatusOr<GpuSharedMemConfig> ContextGetSharedMemConfig(
+      GpuContext* context);
+
+  // Sets the preferred shared memory bank configuration for the specified
+  // CONTEXT (not function!), either default or four- or eight-byte bank size.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g2574235fa643f8f251bf7bc28fac3692
+  static port::Status ContextSetSharedMemConfig(
+      GpuContext* context, GpuSharedMemConfig shared_mem_config);
+
+  // Launches a CUDA kernel via cuLaunchKernel.
+  // TODO(leary) describe the structure of kernel_params and extra in a readable
+  // way.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1gb8f3dc3031b40da29d5f9a7139e52e15
+  static bool LaunchKernel(GpuContext* context, GpuFunctionHandle function,
+                           unsigned int grid_dim_x, unsigned int grid_dim_y,
+                           unsigned int grid_dim_z, unsigned int block_dim_x,
+                           unsigned int block_dim_y, unsigned int block_dim_z,
+                           unsigned int shared_mem_bytes,
+                           GpuStreamHandle stream, void** kernel_params,
+                           void** extra);
+
+  // Loads ptx_contents with the CUDA driver's PTX JIT and stores the resulting
+  // handle in "module". Any error logs that are produced are logged internally.
+  // (supported on CUDA only)
+  static bool LoadPtx(GpuContext* context, const char* ptx_contents,
+                      GpuModuleHandle* module);
+
+  // Loads cubin_bytes with the CUDA driver's blob loading interface and stores
+  // the resulting handle in "module".
+  // (supported on CUDA only)
+  static port::Status LoadCubin(GpuContext* context, const char* cubin_bytes,
+                                GpuModuleHandle* module);
+
+  // Loads HSACO with the ROCM runtime and stores the resulting handle in
+  // "module". Any error logs that are produced are logged internally.
+  // (supported on ROCm only)
+  static bool LoadHsaco(GpuContext* context, const char* hsaco_contents,
+                        GpuModuleHandle* module);
+
+  // Retrieves a named kernel from a loaded module, and places the resulting
+  // handle into function (outparam) on success. Neither kernel_name nor
+  // function may be null. No ownership is taken of kernel_name.
+  static bool GetModuleFunction(GpuContext* context, GpuModuleHandle module,
+                                const char* kernel_name,
+                                GpuFunctionHandle* function);
+
+  // Retrieves a named global/constant symbol from a loaded module, and returns
+  // a device pointer and size of the symbol on success. symbol_name may not be
+  // null. At least one of dptr or bytes should not be null. No ownership is
+  // taken of symbol_name.
+  static bool GetModuleSymbol(GpuContext* context, GpuModuleHandle module,
+                              const char* symbol_name, GpuDevicePtr* dptr,
+                              size_t* bytes);
+
+  // Unloads module from the current context via cuModuleUnload.
+  // TODO(leary) the documentation doesn't say what kind of disasters happen
+  // if you try to unload a module while its GpuFunctionHandles are in use.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g8ea3d716524369de3763104ced4ea57b
+  static void UnloadModule(GpuContext* context, GpuModuleHandle module);
+
+  // Performs a synchronous memset of the device memory segment via cuMemsetD8.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g6e582bf866e9e2fb014297bfaf354d7b
+  static bool SynchronousMemsetUint8(GpuContext* context, GpuDevicePtr location,
+                                     uint8 value, size_t size);
+
+  // Performs a synchronous memset of the device memory segment via cuMemsetD32.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g983e8d8759acd1b64326317481fbf132
+  static bool SynchronousMemsetUint32(GpuContext* context,
+                                      GpuDevicePtr location, uint32 value,
+                                      size_t uint32_count);
+
+  // Performs an asynchronous memset of the device memory segment via
+  // cuMemsetD8Async.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gaef08a7ccd61112f94e82f2b30d43627
+  static bool AsynchronousMemsetUint8(GpuContext* context,
+                                      GpuDevicePtr location, uint8 value,
+                                      size_t uint32_count,
+                                      GpuStreamHandle stream);
+
+  // Performs an asynchronous memset of the device memory segment via
+  // cuMemsetD32Async.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g58229da5d30f1c0cdf667b320ec2c0f5
+  static bool AsynchronousMemsetUint32(GpuContext* context,
+                                       GpuDevicePtr location, uint32 value,
+                                       size_t uint32_count,
+                                       GpuStreamHandle stream);
+
+  // -- Synchronous memcopies.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g4d32266788c440b0220b1a9ba5795169
+
+  static port::Status SynchronousMemcpyD2H(GpuContext* context, void* host_dst,
+                                           GpuDevicePtr gpu_src, uint64 size);
+  static port::Status SynchronousMemcpyH2D(GpuContext* context,
+                                           GpuDevicePtr gpu_dst,
+                                           const void* host_src, uint64 size);
+  static port::Status SynchronousMemcpyD2D(GpuContext* context,
+                                           GpuDevicePtr gpu_dst,
+                                           GpuDevicePtr gpu_src, uint64 size);
+
+  // -- Asynchronous memcopies.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g56f30236c7c5247f8e061b59d3268362
+
+  static bool AsynchronousMemcpyD2H(GpuContext* context, void* host_dst,
+                                    GpuDevicePtr gpu_src, uint64 size,
+                                    GpuStreamHandle stream);
+  static bool AsynchronousMemcpyH2D(GpuContext* context, GpuDevicePtr gpu_dst,
+                                    const void* host_src, uint64 size,
+                                    GpuStreamHandle stream);
+  static bool AsynchronousMemcpyD2D(GpuContext* context, GpuDevicePtr gpu_dst,
+                                    GpuDevicePtr gpu_src, uint64 size,
+                                    GpuStreamHandle stream);
+
+  // The CUDA stream callback type signature.
+  // The data passed to AddStreamCallback is subsequently passed to this
+  // callback when it fires.
+  //
+  // Some notable things:
+  // * Callbacks must not make any CUDA API calls.
+  // * Callbacks from independent streams execute in an undefined order and may
+  //   be serialized.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g613d97a277d7640f4cb1c03bd51c2483
+  typedef void (*StreamCallback)(GpuStreamHandle stream, GpuStatus status,
+                                 void* data);
+
+  // Enqueues a callback operation into stream.
+  // See StreamCallback above and the NVIDIA documentation for additional
+  // details.
+  static bool AddStreamCallback(GpuContext* context, GpuStreamHandle stream,
+                                StreamCallback callback, void* data);
+
+  // Causes stream to wait for event to trigger before proceeding via
+  // cuStreamWaitEvent.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#axzz334nAXAhM
+  static bool WaitStreamOnEvent(GpuContext* context, GpuStreamHandle stream,
+                                GpuEventHandle event);
+
+  // Blocks the calling thread until the operations enqueued onto stream have
+  // been completed, via cuStreamSynchronize.
+  //
+  // TODO(leary) if a pathological thread enqueues operations onto the stream
+  // while another thread blocks like this, can you wind up waiting an unbounded
+  // amount of time?
+  //
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g15e49dd91ec15991eb7c0a741beb7dad
+  static port::Status SynchronizeStream(GpuContext* context,
+                                        GpuStreamHandle stream);
+
+  // Blocks the calling thread until the operations associated with the context
+  // have been completed, via cuCtxSynchronize.
+  //
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g7a54725f28d34b8c6299f0c6ca579616
+  static bool SynchronizeContext(GpuContext* context);
+
+  // Returns true if all stream tasks have completed at time of the call. Note
+  // the potential for races around this call (if another thread adds work to
+  // the stream immediately after this returns).
+  static bool IsStreamIdle(GpuContext* context, GpuStreamHandle stream);
+
+  // Returns whether code in the from context can access memory in the to
+  // context via cuDeviceCanAccessPeer.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g496bdaae1f632ebfb695b99d2c40f19e
+  static bool CanEnablePeerAccess(GpuContext* from, GpuContext* to);
+
+  // Enables peer access per CanEnablePeerAccess, via cuCtxEnablePeerAccess.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g0889ec6728e61c05ed359551d67b3f5a
+  static port::Status EnablePeerAccess(GpuContext* from, GpuContext* to);
+
+  // Returns the elapsed milliseconds between start and stop via
+  // cuEventElapsedTime.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1gdfb1178807353bbcaa9e245da497cf97
+  static bool GetEventElapsedTime(GpuContext* context,
+                                  float* elapsed_milliseconds,
+                                  GpuEventHandle start, GpuEventHandle stop);
+
+  // Records that an event occurred when execution reaches the current point in
+  // thestream via cuEventRecord.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g95424d3be52c4eb95d83861b70fb89d1
+  static port::Status RecordEvent(GpuContext* context, GpuEventHandle event,
+                                  GpuStreamHandle stream);
+
+  // Polls (without blocking) to determine the status of an event - pending or
+  // complete (or an error status).
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g6f0704d755066b0ee705749ae911deef
+  static port::StatusOr<GpuStatus> QueryEvent(GpuContext* context,
+                                              GpuEventHandle event);
+
+  // -- Pointer-specific calls.
+
+  // Returns the context in which pointer was allocated or registered.
+  static port::StatusOr<GpuContext*> GetPointerContext(GpuDevicePtr pointer);
+
+  // Returns the device associated with the context from GetPointerContext().
+  static port::StatusOr<GpuDeviceHandle> GetPointerDevice(GpuDevicePtr pointer);
+
+  // Returns the memory space addressed by pointer.
+  static port::StatusOr<MemorySpace> GetPointerMemorySpace(
+      GpuDevicePtr pointer);
+
+  // Returns the base address and size of the device pointer dptr.
+  static port::Status GetPointerAddressRange(GpuDevicePtr dptr,
+                                             GpuDevicePtr* base, size_t* size);
+
+  // -- Device-specific calls.
+
+  // Returns the compute capability for the device; i.e (3, 5).
+  // This is currently done via the deprecated device API.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE__DEPRECATED.html#group__CUDA__DEVICE__DEPRECATED_1ge2091bbac7e1fb18c2821612115607ea
+  // (supported on CUDA only)
+  static port::Status GetComputeCapability(int* cc_major, int* cc_minor,
+                                           GpuDeviceHandle device);
+
+  // Returns Gpu ISA version for the device; i.e 803, 900.
+  // (supported on ROCm only)
+  static port::Status GetGpuISAVersion(int* version, GpuDeviceHandle device);
+
+  // Returns the number of multiprocessors on the device (note that the device
+  // may be multi-GPU-per-board).
+  static port::StatusOr<int> GetMultiprocessorCount(GpuDeviceHandle device);
+
+  // Returns the limit on number of threads that can be resident in a single
+  // multiprocessor.
+  static port::StatusOr<int64> GetMaxThreadsPerMultiprocessor(
+      GpuDeviceHandle device);
+
+  // Returns the limit on number of threads which may be resident for a single
+  // block (cooperative thread array).
+  static port::StatusOr<int64> GetMaxThreadsPerBlock(GpuDeviceHandle device);
+
+  // Returns the amount of shared memory available on a single GPU core (i.e.
+  // SM on NVIDIA devices).
+  static port::StatusOr<int64> GetMaxSharedMemoryPerCore(
+      GpuDeviceHandle device);
+
+  // Returns the amount of shared memory available for a single block
+  // (cooperative thread array).
+  static port::StatusOr<int64> GetMaxSharedMemoryPerBlock(
+      GpuDeviceHandle device);
+
+  // Returns the maximum supported number of registers per block.
+  static port::StatusOr<int64> GetMaxRegistersPerBlock(GpuDeviceHandle device);
+
+  // Returns the number of threads per warp.
+  static port::StatusOr<int64> GetThreadsPerWarp(GpuDeviceHandle device);
+
+  // Queries the grid limits for device with cuDeviceGetAttribute calls.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
+  static bool GetGridLimits(int* x, int* y, int* z, GpuDeviceHandle device);
+
+  // Returns a grab-bag of device properties in a caller-owned device_properties
+  // structure for device_ordinal via cuDeviceGetProperties.
+  //
+  // This call is deprecated in the NVIDIA driver API; its replacement is
+  // GetDeviceAttribute
+  //
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE__DEPRECATED.html#group__CUDA__DEVICE__DEPRECATED_1g65a5b4e25186bd257df80b98c98cffe6
+  static bool GetDeviceProperties(GpuDeviceProperty* device_properties,
+                                  int device_ordinal);
+
+  // Gets a specific integer-valued property about the given device.
+  //
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
+  static port::StatusOr<int> GetDeviceAttribute(GpuDeviceAttribute attribute,
+                                                GpuDeviceHandle device);
+
+  // Returns whether ECC is enabled for the given GpuDeviceHandle via
+  // cuDeviceGetattribute with CU_DEVICE_ATTRIBUTE_ECC_ENABLED.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266
+  static bool IsEccEnabled(GpuDeviceHandle device, bool* result);
+
+  // Returns the total amount of memory available for allocation by the CUDA
+  // context, in bytes, via cuDeviceTotalMem.
+  static bool GetDeviceTotalMemory(GpuDeviceHandle device, uint64* result);
+
+  // Returns the free amount of memory and total amount of memory, as reported
+  // by cuMemGetInfo.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g808f555540d0143a331cc42aa98835c0
+  static bool GetDeviceMemoryInfo(GpuContext* context, int64* free,
+                                  int64* total);
+
+  // Returns a PCI bus id string for the device.
+  // [domain]:[bus]:[device].[function]
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g85295e7d9745ab8f0aa80dd1e172acfc
+  static string GetPCIBusID(GpuDeviceHandle device);
+
+  // -- Context- and device-independent calls.
+
+  // Returns the number of visible CUDA device via cuDeviceGetCount.
+  // This should correspond to the set of device ordinals available.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g52b5ce05cb8c5fb6831b2c0ff2887c74
+  static int GetDeviceCount();
+
+  // Returns the driver version number via cuDriverGetVersion.
+  // This is, surprisingly, NOT the actual driver version (e.g. 331.79) but,
+  // instead, the CUDA toolkit release number that this driver is compatible
+  // with; e.g. 6000 (for a CUDA 6.0 compatible driver) or 6050 (for a CUDA 6.5
+  // compatible driver).
+  //
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VERSION.html#group__CUDA__VERSION_1g8b7a10395392e049006e61bcdc8ebe71
+  static bool GetDriverVersion(int* driver_version);
+
+  // -- Other calls
+
+  // Returns the maximum number of blocks (per multiprocessor) occupied by the
+  // specified kernel/GpuFunctionHandle when launched with the specified
+  // parameters.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__OCCUPANCY.html#group__CUDA__OCCUPANCY_1gcc6e1094d05cba2cee17fe33ddd04a98
+  static port::StatusOr<int> GetMaxOccupiedBlocksPerCore(
+      GpuContext* context, GpuFunctionHandle kernel, int threads_per_block,
+      size_t dynamic_shared_memory_bytes);
+
+  // Seam for injecting an error at CUDA initialization time for testing
+  // purposes.
+  static bool driver_inject_init_error_;
+};
+
+// Ensures a context is activated within a scope.
+class ScopedActivateContext {
+ public:
+  // Activates the context via cuCtxSetCurrent, if it is not the currently
+  // active context (a la cuCtxGetCurrent). Note the alternative push/pop
+  // mechanism is said by NVIDIA to be relatively slow and deprecated.
+  // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1gbe562ee6258b4fcc272ca6478ca2a2f7
+  explicit ScopedActivateContext(GpuContext* context);
+
+  // Checks that the context has remained activated for the duration of the
+  // scope.
+  ~ScopedActivateContext();
+
+ private:
+  GpuContext* to_restore_ = nullptr;
+};
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_DRIVER_H_
diff --git a/tensorflow/stream_executor/gpu/gpu_event.cc b/tensorflow/stream_executor/gpu/gpu_event.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a523958550d10c13624b729076a3fd271e68243a
--- /dev/null
+++ b/tensorflow/stream_executor/gpu/gpu_event.cc
@@ -0,0 +1,47 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/stream_executor/gpu/gpu_event.h"
+
+#include "tensorflow/stream_executor/gpu/gpu_executor.h"
+#include "tensorflow/stream_executor/gpu/gpu_stream.h"
+#include "tensorflow/stream_executor/lib/statusor.h"
+
+namespace stream_executor {
+namespace gpu {
+
+GpuEvent::GpuEvent(GpuExecutor* parent)
+    : parent_(parent), gpu_event_(nullptr) {}
+
+GpuEvent::~GpuEvent() {}
+
+port::Status GpuEvent::Init() {
+  return GpuDriver::CreateEvent(parent_->gpu_context(), &gpu_event_,
+                                GpuDriver::EventFlags::kDisableTiming);
+}
+
+port::Status GpuEvent::Destroy() {
+  return GpuDriver::DestroyEvent(parent_->gpu_context(), &gpu_event_);
+}
+
+port::Status GpuEvent::Record(GpuStream* stream) {
+  return GpuDriver::RecordEvent(parent_->gpu_context(), gpu_event_,
+                                stream->gpu_stream());
+}
+
+GpuEventHandle GpuEvent::gpu_event() { return gpu_event_; }
+
+}  // namespace gpu
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/gpu/gpu_event.h b/tensorflow/stream_executor/gpu/gpu_event.h
new file mode 100644
index 0000000000000000000000000000000000000000..61f39d42fe7344b3b092b8fbcc5615da99564300
--- /dev/null
+++ b/tensorflow/stream_executor/gpu/gpu_event.h
@@ -0,0 +1,62 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EVENT_H_
+#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EVENT_H_
+
+#include "tensorflow/stream_executor/event.h"
+#include "tensorflow/stream_executor/gpu/gpu_driver.h"
+#include "tensorflow/stream_executor/gpu/gpu_stream.h"
+#include "tensorflow/stream_executor/lib/status.h"
+
+namespace stream_executor {
+namespace gpu {
+
+// GpuEvent wraps a GpuEventHandle in the platform-independent EventInterface
+// interface.
+class GpuEvent : public internal::EventInterface {
+ public:
+  explicit GpuEvent(GpuExecutor* parent);
+
+  ~GpuEvent() override;
+
+  // Populates the CUDA-platform-specific elements of this object.
+  port::Status Init();
+
+  // Deallocates any platform-specific elements of this object. This is broken
+  // out (not part of the destructor) to allow for error reporting.
+  port::Status Destroy();
+
+  // Inserts the event at the current position into the specified stream.
+  port::Status Record(GpuStream* stream);
+
+  // Polls the CUDA platform for the event's current status.
+  Event::Status PollForStatus();
+
+  // The underlying CUDA event element.
+  GpuEventHandle gpu_event();
+
+ private:
+  // The Executor used to which this object and GpuEventHandle are bound.
+  GpuExecutor* parent_;
+
+  // The underlying CUDA event element.
+  GpuEventHandle gpu_event_;
+};
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EVENT_H_
diff --git a/tensorflow/stream_executor/gpu/gpu_executor.h b/tensorflow/stream_executor/gpu/gpu_executor.h
new file mode 100644
index 0000000000000000000000000000000000000000..6f969a98d2f42b5be0f6d29e8e19c006540e3b8b
--- /dev/null
+++ b/tensorflow/stream_executor/gpu/gpu_executor.h
@@ -0,0 +1,347 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// The CUDA implementation of the StreamExecutorInterface functionality.
+// CUDA inclusions are ideally confined to this implementation file.
+//
+// The notions from the StreamExecutor basically correspond to the CUDA streams
+// programming model provided by the libcuda.so driver APIs, so we don't have
+// to do much more than wrap the calls to the libraries appropriately.
+#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_
+#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_
+
+#include <set>
+#include <unordered_map>
+
+#include "absl/strings/string_view.h"
+#include "tensorflow/stream_executor/event.h"
+#include "tensorflow/stream_executor/gpu/gpu_kernel.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/lib/statusor.h"
+#include "tensorflow/stream_executor/platform.h"
+#include "tensorflow/stream_executor/platform/mutex.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/platform/thread_annotations.h"
+#include "tensorflow/stream_executor/stream_executor_internal.h"
+
+namespace stream_executor {
+namespace gpu {
+
+// CUDA-platform implementation of the platform-agnostic
+// StreamExecutorInferface.
+class GpuExecutor : public internal::StreamExecutorInterface {
+ public:
+  // sub_platform indicates the subplatform used in this executor; it must
+  // be a CUDA type.
+  explicit GpuExecutor(const PluginConfig& plugin_config)
+      : device_(0),
+        context_(nullptr),
+        device_ordinal_(0),
+        cc_major_(0),
+        cc_minor_(0),
+        version_(0),
+        plugin_config_(plugin_config) {}
+
+  // See the corresponding StreamExecutor methods for method comments on the
+  // following overrides.
+
+  ~GpuExecutor() override;
+
+  port::Status Init(int device_ordinal, DeviceOptions device_options) override;
+
+  bool GetKernel(const MultiKernelLoaderSpec& spec,
+                 KernelBase* kernel) override;
+  // (supported on CUDA only)
+  void UnloadKernel(const KernelBase* kernel) override;
+  bool LoadModule(const MultiModuleLoaderSpec& spec,
+                  ModuleHandle* module_handle) override;
+  bool UnloadModule(ModuleHandle module_handle) override;
+
+  bool Launch(Stream* stream, const ThreadDim& thread_dims,
+              const BlockDim& block_dims, const KernelBase& k,
+              const KernelArgsArrayBase& args) override;
+
+  // (supported on CUDA only)
+  int CalculateOccupancy(const DeviceDescription& device_description,
+                         uint64 registers_per_thread,
+                         uint64 shared_memory_per_block,
+                         const ThreadDim& thread_dims, GpuFunctionHandle func);
+
+  // (supported on CUDA only)
+  int CompareOccupancy(int* initial_blocks,
+                       const DeviceDescription& device_description,
+                       uint64 registers_per_thread,
+                       uint64 shared_memory_per_block,
+                       const ThreadDim& thread_dims, GpuFunctionHandle func);
+
+  void* Allocate(uint64 size) override;
+
+  void* AllocateSubBuffer(DeviceMemoryBase* mem, uint64 offset_bytes,
+                          uint64 size_bytes) override;
+
+  void Deallocate(DeviceMemoryBase* mem) override;
+
+  void* UnifiedMemoryAllocate(uint64 size) override {
+    return GpuDriver::UnifiedMemoryAllocate(context_, size);
+  }
+
+  void UnifiedMemoryDeallocate(void* location) override {
+    return GpuDriver::UnifiedMemoryDeallocate(context_, location);
+  }
+
+  // CUDA allocation/registration functions are necessary because the driver
+  // internally sets up buffers for DMA operations (and page locks them).
+  // There's no external interface for us to otherwise control these DMA
+  // settings.
+  void* HostMemoryAllocate(uint64 size) override {
+    return GpuDriver::HostAllocate(context_, size);
+  }
+
+  void HostMemoryDeallocate(void* location) override {
+    return GpuDriver::HostDeallocate(context_, location);
+  }
+
+  bool HostMemoryRegister(void* location, uint64 size) override;
+
+  bool HostMemoryUnregister(void* location) override;
+
+  bool SynchronizeAllActivity() override;
+
+  bool SynchronousMemZero(DeviceMemoryBase* location, uint64 size) override;
+
+  bool SynchronousMemSet(DeviceMemoryBase* location, int value,
+                         uint64 size) override;
+
+  port::Status SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
+                                 const void* host_src, uint64 size) override;
+
+  port::Status SynchronousMemcpy(void* host_dst,
+                                 const DeviceMemoryBase& gpu_src,
+                                 uint64 size) override;
+
+  port::Status SynchronousMemcpyDeviceToDevice(DeviceMemoryBase* gpu_dst,
+                                               const DeviceMemoryBase& gpu_src,
+                                               uint64 size) override;
+
+  bool MemZero(Stream* stream, DeviceMemoryBase* location,
+               uint64 size) override;
+  bool Memset(Stream* stream, DeviceMemoryBase* location, uint8 pattern,
+              uint64 size) override;
+  bool Memset32(Stream* stream, DeviceMemoryBase* location, uint32 pattern,
+                uint64 size) override;
+
+  bool Memcpy(Stream* stream, void* host_dst, const DeviceMemoryBase& gpu_src,
+              uint64 size) override;
+
+  bool Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst, const void* host_src,
+              uint64 size) override;
+
+  bool MemcpyDeviceToDevice(Stream* stream, DeviceMemoryBase* gpu_dst,
+                            const DeviceMemoryBase& gpu_src,
+                            uint64 size) override;
+
+  bool HostCallback(Stream* stream,
+                    std::function<port::Status()> callback) override;
+
+  bool AllocateStream(Stream* stream) override;
+
+  void DeallocateStream(Stream* stream) override;
+
+  bool CreateStreamDependency(Stream* dependent, Stream* other) override;
+
+  bool AllocateTimer(Timer* timer) override;
+
+  void DeallocateTimer(Timer* timer) override;
+
+  bool StartTimer(Stream* stream, Timer* timer) override;
+
+  bool StopTimer(Stream* stream, Timer* timer) override;
+
+  port::Status AllocateEvent(Event* event) override;
+
+  port::Status DeallocateEvent(Event* event) override;
+
+  port::Status RecordEvent(Stream* stream, Event* event) override;
+
+  port::Status WaitForEvent(Stream* stream, Event* event) override;
+
+  Event::Status PollForEventStatus(Event* event) override;
+
+  port::Status BlockHostUntilDone(Stream* stream) override;
+
+  int PlatformDeviceCount() override { return GpuDriver::GetDeviceCount(); }
+
+  port::Status EnablePeerAccessTo(StreamExecutorInterface* other) override;
+
+  bool CanEnablePeerAccessTo(StreamExecutorInterface* other) override;
+
+  SharedMemoryConfig GetDeviceSharedMemoryConfig() override;
+
+  port::Status SetDeviceSharedMemoryConfig(SharedMemoryConfig config) override;
+
+  bool DeviceMemoryUsage(int64* free, int64* total) const override;
+
+  // Search for the symbol and returns a device pointer and size.
+  // Returns false if symbol does not exist.
+  bool GetSymbol(const string& symbol_name, ModuleHandle module_handle,
+                 void** mem, size_t* bytes) override;
+
+  DeviceDescription* PopulateDeviceDescription() const override;
+
+  // Populates the block_dim_limit by querying the device driver API. If an
+  // error occurs at any point while asking the driver for block dim limits, it
+  // will be only partially populated as a result, and an error will be logged.
+  bool FillBlockDimLimit(BlockDim* block_dim_limit) const;
+
+  bool SupportsBlas() const override;
+
+  blas::BlasSupport* CreateBlas() override;
+
+  bool SupportsFft() const override;
+
+  fft::FftSupport* CreateFft() override;
+
+  bool SupportsRng() const override;
+
+  rng::RngSupport* CreateRng() override;
+
+  bool SupportsDnn() const override;
+
+  dnn::DnnSupport* CreateDnn() override;
+
+  std::unique_ptr<internal::EventInterface> CreateEventImplementation()
+      override;
+
+  std::unique_ptr<internal::KernelInterface> CreateKernelImplementation()
+      override;
+
+  std::unique_ptr<internal::StreamInterface> GetStreamImplementation() override;
+
+  std::unique_ptr<internal::TimerInterface> GetTimerImplementation() override;
+
+  void* GpuContextHack() override;
+
+  GpuContext* gpu_context();
+
+ private:
+  // Attempts to find a more specific version of the file indicated by
+  // filename by looking for compute-capability-specific suffixed versions; i.e.
+  // looking for "foo.ptx" will check to see if "foo.ptx.cc30.ptx" is present if
+  // we're on a compute capability 3.0 machine.
+  // (supported on CUDA only)
+  bool FindOnDiskForComputeCapability(absl::string_view filename,
+                                      absl::string_view canonical_suffix,
+                                      string* found_filename) const;
+
+  // Attempts to find a more specific version of the file indicated by
+  // filename by looking for AMDGPU ISA-specific suffixed versions.
+  // (supported on ROCm only)
+
+  bool FindOnDiskForISAVersion(absl::string_view filename,
+                               absl::string_view canonical_suffix,
+                               string* found_filename) const;
+
+  // Host callback landing routine invoked by CUDA.
+  // data: User-provided callback provided to HostCallback() above, captured
+  //       as a std::function<void()>. Allocated/initialized inside
+  //       HostCallback() and owned and deleted by this call.
+  static void InternalHostCallback(GpuStreamHandle stream, GpuStatus status,
+                                   void* data);
+
+  // Collects metadata for the specified kernel.
+  bool GetKernelMetadata(GpuKernel* cuda_kernel,
+                         KernelMetadata* kernel_metadata);
+
+  // Prints to VLOG(2) information about the kernel's occupancy and how it might
+  // be improved.
+  void VlogOccupancyInfo(const KernelBase& kernel, const ThreadDim& thread_dims,
+                         const BlockDim& block_dims);
+
+  // (supported on CUDA only)
+  bool LoadModuleFromCuBin(const char* cubin, GpuModuleHandle* module)
+      EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
+
+  // Loads the PTX text `ptx` as a CUDA module.  `ptx` must be null terminated.
+  // (supported on CUDA only)
+  bool LoadModuleFromPtx(const char* ptx, GpuModuleHandle* module)
+      EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
+
+  // (supported on ROCm only)
+  bool LoadModuleFromHsaco(const char* hsaco, GpuModuleHandle* module)
+      EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
+
+  bool UnloadGpuBinary(const void* gpu_binary)
+      EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
+
+  // Guards the on-disk-module mapping.
+  mutex disk_modules_mu_;
+
+  // Mapping from filename to GPUModuleHandle, if it was already retrieved.
+  // Multiple GPUFunctionHandle are usually obtained from a single
+  // GPUModuleHandle so we attempt to hit in this mapping first, before
+  // retrieving it.
+  std::map<string, GpuModuleHandle> disk_modules_ GUARDED_BY(disk_modules_mu_);
+
+  // Guards the in-memory-module mapping.
+  mutex in_memory_modules_mu_;
+
+  std::map<const char*, GpuModuleHandle> in_memory_modules_
+      GUARDED_BY(in_memory_modules_mu_);
+
+  // Kernel -> loaded GPU binary. Many kernels may load the same binary.
+  std::unordered_map<const KernelBase*, const void*> kernel_to_gpu_binary_
+      GUARDED_BY(in_memory_modules_mu_);
+  // GPU binary (PTX or CUBIN or HSACO) -> {CUDA module, reference count}.
+  std::unordered_map<const void*, std::pair<GpuModuleHandle, uint64>>
+      gpu_binary_to_module_ GUARDED_BY(in_memory_modules_mu_);
+
+  // Guards the launched kernel set.
+  mutex launched_kernels_mu_;
+
+  // Keeps track of the set of launched kernels. Currently used to suppress the
+  // occupancy check on subsequent launches.
+  std::set<GpuFunctionHandle> launched_kernels_
+      GUARDED_BY(launched_kernels_mu_);
+
+  // Handle for the CUDA device being operated on. Immutable
+  // post-initialization.
+  GpuDeviceHandle device_;
+
+  // Handle for session with the library/driver. Immutable post-initialization.
+  GpuContext* context_;
+
+  // The device ordinal value that this executor was initialized with; recorded
+  // for use in getting device metadata. Immutable post-initialization.
+  int device_ordinal_;
+
+  // The major verion of the compute capability for device_.
+  int cc_major_;
+
+  // The minor verion of the compute capability for device_.
+  int cc_minor_;
+
+  // GPU ISA version for device_.
+  int version_;
+
+  // The plugin configuration associated with this instance.
+  PluginConfig plugin_config_;
+
+  SE_DISALLOW_COPY_AND_ASSIGN(GpuExecutor);
+};
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_
diff --git a/tensorflow/stream_executor/gpu/gpu_helpers.h b/tensorflow/stream_executor/gpu/gpu_helpers.h
new file mode 100644
index 0000000000000000000000000000000000000000..117a71718f269d8ffd724d55ae269fea95dac366
--- /dev/null
+++ b/tensorflow/stream_executor/gpu/gpu_helpers.h
@@ -0,0 +1,107 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Common helper functions used for dealing with CUDA API datatypes.
+//
+// These are typically placed here for use by multiple source components (for
+// example, BLAS and executor components).
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_HELPERS_H_
+#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_HELPERS_H_
+
+#include <stddef.h>
+#include <complex>
+
+#include "tensorflow/stream_executor/gpu/gpu_types.h"
+
+namespace stream_executor {
+
+template <typename ElemT>
+class DeviceMemory;
+
+namespace gpu {
+
+// Converts a const DeviceMemory reference to its underlying typed pointer in
+// CUDA
+// device memory.
+template <typename T>
+const T* GpuMemory(const DeviceMemory<T>& mem) {
+  return static_cast<const T*>(mem.opaque());
+}
+
+// Converts a (non-const) DeviceMemory pointer reference to its underlying typed
+// pointer in CUDA device memory.
+template <typename T>
+T* GpuMemoryMutable(DeviceMemory<T>* mem) {
+  return static_cast<T*>(mem->opaque());
+}
+
+static_assert(
+    sizeof(std::complex<float>) == sizeof(GpuComplexType),
+    "std::complex<float> and GpuComplexType should have the same size");
+static_assert(offsetof(GpuComplexType, x) == 0,
+              "The real part of GpuComplexType should appear first.");
+static_assert(
+    sizeof(std::complex<double>) == sizeof(GpuDoubleComplexType),
+    "std::complex<double> and GpuDoubleComplexType should have the same "
+    "size");
+static_assert(offsetof(GpuDoubleComplexType, x) == 0,
+              "The real part of GpuDoubleComplexType should appear first.");
+
+// Type traits to get CUDA complex types from std::complex<>.
+
+template <typename T>
+struct GpuComplexT {
+  typedef T type;
+};
+
+template <>
+struct GpuComplexT<std::complex<float>> {
+  typedef GpuComplexType type;
+};
+
+template <>
+struct GpuComplexT<std::complex<double>> {
+  typedef GpuDoubleComplexType type;
+};
+
+// Converts pointers of std::complex<> to pointers of
+// GpuComplexType/GpuDoubleComplexType. No type conversion for non-complex
+// types.
+
+template <typename T>
+inline const typename GpuComplexT<T>::type* GpuComplex(const T* p) {
+  return reinterpret_cast<const typename GpuComplexT<T>::type*>(p);
+}
+
+template <typename T>
+inline typename GpuComplexT<T>::type* GpuComplex(T* p) {
+  return reinterpret_cast<typename GpuComplexT<T>::type*>(p);
+}
+
+// Converts values of std::complex<float/double> to values of
+// GpuComplexType/GpuDoubleComplexType.
+inline GpuComplexType GpuComplexValue(std::complex<float> val) {
+  return {val.real(), val.imag()};
+}
+
+inline GpuDoubleComplexType GpuComplexValue(std::complex<double> val) {
+  return {val.real(), val.imag()};
+}
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_HELPERS_H_
diff --git a/tensorflow/stream_executor/gpu/gpu_kernel.h b/tensorflow/stream_executor/gpu/gpu_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..5b89b20097323c02fc9cf7492d54657789956ca7
--- /dev/null
+++ b/tensorflow/stream_executor/gpu/gpu_kernel.h
@@ -0,0 +1,105 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// The CUDA implementation of the StreamExecutorInterface functionality.
+// CUDA inclusions are ideally confined to this implementation file.
+//
+// The notions from the StreamExecutor basically correspond to the CUDA streams
+// programming model provided by the libcuda.so driver APIs, so we don't have
+// to do much more than wrap the calls to the libraries appropriately.
+#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_KERNEL_H_
+#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_KERNEL_H_
+
+#include "tensorflow/stream_executor/gpu/gpu_driver.h"
+#include "tensorflow/stream_executor/kernel_cache_config.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/stream_executor_internal.h"
+
+namespace stream_executor {
+namespace gpu {
+
+// Wraps a GpuFunctionHandle to implement the platform-independent
+// KernelInterface.
+class GpuKernel : public internal::KernelInterface {
+ public:
+  GpuKernel()
+      : gpu_function_(nullptr),
+        arity_(0),
+        preferred_cache_config_(KernelCacheConfig::kNoPreference) {}
+
+  // Note that the function is unloaded when the module is unloaded, and the
+  // module that the function is contained in is owned by the GpuExecutor.
+  ~GpuKernel() override {}
+
+  // As arity cannot be reflected upon using the CUDA API, the arity is
+  // explicitly set during the GpuExecutor::GetKernel initialization process.
+  void set_arity(unsigned arity) { arity_ = arity; }
+  unsigned Arity() const override { return arity_; }
+
+  // Returns the GpuFunctionHandle value for passing to the CUDA API.
+  GpuFunctionHandle AsGpuFunctionHandle() const {
+    DCHECK(gpu_function_ != nullptr);
+    return const_cast<GpuFunctionHandle>(gpu_function_);
+  }
+
+  // Returns the slot that the GpuFunctionHandle is stored within for this
+  // object, for the CUDA API which wants to load into a GpuFunctionHandle*.
+  GpuFunctionHandle* gpu_function_ptr() { return &gpu_function_; }
+
+  // CUDA supports setting the preferred cache configuration of a
+  // GpuFunctionHandle (more-or-less equivalent to a GpuKernel). We support this
+  // via the below functions; users can set a preference, and that is applied
+  // when the kernel is [lazy-]loaded (in GpuExecutor::Launch). The alternative
+  // would be to load the kernel & set the preference when the user calls the
+  // setter below; either approach is valid. Sets the current kernel cache
+  // configuration preference.
+  void SetPreferredCacheConfig(KernelCacheConfig config) override {
+    preferred_cache_config_ = config;
+  }
+
+  // Returns the current kernel cache configuration preference.
+  KernelCacheConfig GetPreferredCacheConfig() const override {
+    return preferred_cache_config_;
+  }
+
+  // Returns the current kernel cache configuration preference as a
+  // CUfunc_cache.
+  GpuFuncCachePreference GetGpuCacheConfig() const;
+
+ private:
+  GpuFunctionHandle gpu_function_;  // Wrapped CUDA kernel handle.
+  unsigned arity_;  // Number of formal parameters the kernel takes.
+
+  // Preferred (but not required) cache configuration for this kernel.
+  KernelCacheConfig preferred_cache_config_;
+};
+
+// Given a platform-independent kernel datatype, returns the (const) internal
+// CUDA platform implementation pointer.
+inline const GpuKernel* AsGpuKernel(const KernelBase* kernel) {
+  return static_cast<const GpuKernel*>(kernel->implementation());
+}
+
+// Given a platform-independent kernel datatype, returns the (non-const)
+// internal CUDA platform implementation pointer.
+inline GpuKernel* AsGpuKernel(KernelBase* kernel) {
+  return static_cast<GpuKernel*>(kernel->implementation());
+}
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_KERNEL_H_
diff --git a/tensorflow/stream_executor/gpu/gpu_rng.h b/tensorflow/stream_executor/gpu/gpu_rng.h
new file mode 100644
index 0000000000000000000000000000000000000000..d4bf1e1963044a9a54fb92b6a324d3fadd5e6c0b
--- /dev/null
+++ b/tensorflow/stream_executor/gpu/gpu_rng.h
@@ -0,0 +1,125 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_RNG_H_
+#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_RNG_H_
+
+#include "tensorflow/stream_executor/platform/mutex.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/platform/thread_annotations.h"
+#include "tensorflow/stream_executor/plugin_registry.h"
+#include "tensorflow/stream_executor/rng.h"
+
+#include "tensorflow/stream_executor/gpu/gpu_types.h"
+
+namespace stream_executor {
+
+class Stream;
+template <typename ElemT>
+class DeviceMemory;
+
+namespace gpu {
+
+// Opaque and unique identifier for the GPU RNG plugin.
+extern const PluginId kGpuRandPlugin;
+
+class GpuExecutor;
+
+// GPU-platform implementation of the random number generation support
+// interface.
+//
+// Thread-safe post-initialization.
+class GpuRng : public rng::RngSupport {
+ public:
+  explicit GpuRng(GpuExecutor* parent);
+
+  // Retrieves a gpu rng library generator handle. This is necessary for
+  // enqueuing random number generation work onto the device.
+  // TODO(leary) provide a way for users to select the RNG algorithm.
+  bool Init();
+
+  // Releases a gpu rng library generator handle, if one was acquired.
+  ~GpuRng() override;
+
+  // See rng::RngSupport for details on the following overrides.
+  bool DoPopulateRandUniform(Stream* stream, DeviceMemory<float>* v) override;
+  bool DoPopulateRandUniform(Stream* stream, DeviceMemory<double>* v) override;
+  bool DoPopulateRandUniform(Stream* stream,
+                             DeviceMemory<std::complex<float>>* v) override;
+  bool DoPopulateRandUniform(Stream* stream,
+                             DeviceMemory<std::complex<double>>* v) override;
+  bool DoPopulateRandGaussian(Stream* stream, float mean, float stddev,
+                              DeviceMemory<float>* v) override;
+  bool DoPopulateRandGaussian(Stream* stream, double mean, double stddev,
+                              DeviceMemory<double>* v) override;
+
+  bool SetSeed(Stream* stream, const uint8* seed, uint64 seed_bytes) override;
+
+ private:
+  // Actually performs the work of generating random numbers - the public
+  // methods are thin wrappers to this interface.
+  template <typename T>
+  bool DoPopulateRandUniformInternal(Stream* stream, DeviceMemory<T>* v);
+  template <typename ElemT, typename FuncT>
+  bool DoPopulateRandGaussianInternal(Stream* stream, ElemT mean, ElemT stddev,
+                                      DeviceMemory<ElemT>* v, FuncT func);
+
+  // Sets the stream for the internal gpu rng generator.
+  //
+  // This is a stateful operation, as the handle can only have one stream set at
+  // a given time, so it is usually performed right before enqueuing work to do
+  // with random number generation.
+  bool SetStream(Stream* stream) EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+  // mutex that guards the gpu rng library handle for this device.
+  mutex mu_;
+
+  // GpuExecutor which instantiated this GpuRng.
+  // Immutable post-initialization.
+  GpuExecutor* parent_;
+
+  // gpu rng library handle on the device.
+  GpuRngHandle rng_ GUARDED_BY(mu_);
+
+  SE_DISALLOW_COPY_AND_ASSIGN(GpuRng);
+};
+
+template <typename T>
+string TypeString();
+
+template <>
+string TypeString<float>() {
+  return "float";
+}
+
+template <>
+string TypeString<double>() {
+  return "double";
+}
+
+template <>
+string TypeString<std::complex<float>>() {
+  return "std::complex<float>";
+}
+
+template <>
+string TypeString<std::complex<double>>() {
+  return "std::complex<double>";
+}
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_RNG_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_stream.cc b/tensorflow/stream_executor/gpu/gpu_stream.cc
similarity index 51%
rename from tensorflow/stream_executor/cuda/cuda_stream.cc
rename to tensorflow/stream_executor/gpu/gpu_stream.cc
index b5aa7694f7e1d8d47f3252d3ba679292155119b5..f43500370fc6a7a3e919d2c7af0a92e98100284b 100644
--- a/tensorflow/stream_executor/cuda/cuda_stream.cc
+++ b/tensorflow/stream_executor/gpu/gpu_stream.cc
@@ -1,4 +1,4 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,49 +13,49 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/stream_executor/cuda/cuda_stream.h"
+#include "tensorflow/stream_executor/gpu/gpu_stream.h"
 
-#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
+#include "tensorflow/stream_executor/gpu/gpu_executor.h"
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/stream.h"
 
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 
-bool CUDAStream::Init() {
-  if (!CUDADriver::CreateStream(parent_->cuda_context(), &cuda_stream_)) {
+bool GpuStream::Init() {
+  if (!GpuDriver::CreateStream(parent_->gpu_context(), &gpu_stream_)) {
     return false;
   }
-  return CUDADriver::CreateEvent(parent_->cuda_context(), &completed_event_,
-                                 CUDADriver::EventFlags::kDisableTiming)
+  return GpuDriver::CreateEvent(parent_->gpu_context(), &completed_event_,
+                                GpuDriver::EventFlags::kDisableTiming)
       .ok();
 }
 
-void CUDAStream::Destroy() {
+void GpuStream::Destroy() {
   if (completed_event_ != nullptr) {
     port::Status status =
-        CUDADriver::DestroyEvent(parent_->cuda_context(), &completed_event_);
+        GpuDriver::DestroyEvent(parent_->gpu_context(), &completed_event_);
     if (!status.ok()) {
       LOG(ERROR) << status.error_message();
     }
   }
 
-  CUDADriver::DestroyStream(parent_->cuda_context(), &cuda_stream_);
+  GpuDriver::DestroyStream(parent_->gpu_context(), &gpu_stream_);
 }
 
-bool CUDAStream::IsIdle() const {
-  return CUDADriver::IsStreamIdle(parent_->cuda_context(), cuda_stream_);
+bool GpuStream::IsIdle() const {
+  return GpuDriver::IsStreamIdle(parent_->gpu_context(), gpu_stream_);
 }
 
-CUDAStream *AsCUDAStream(Stream *stream) {
+GpuStream* AsGpuStream(Stream* stream) {
   DCHECK(stream != nullptr);
-  return static_cast<CUDAStream *>(stream->implementation());
+  return static_cast<GpuStream*>(stream->implementation());
 }
 
-CUstream AsCUDAStreamValue(Stream *stream) {
+GpuStreamHandle AsGpuStreamValue(Stream* stream) {
   DCHECK(stream != nullptr);
-  return AsCUDAStream(stream)->cuda_stream();
+  return AsGpuStream(stream)->gpu_stream();
 }
 
-}  // namespace cuda
+}  // namespace gpu
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/gpu/gpu_stream.h b/tensorflow/stream_executor/gpu/gpu_stream.h
new file mode 100644
index 0000000000000000000000000000000000000000..c38f6c132a571bb42b31c9649440fd0ff2aaa777
--- /dev/null
+++ b/tensorflow/stream_executor/gpu/gpu_stream.h
@@ -0,0 +1,96 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Defines the GpuStream type - the CUDA-specific implementation of the generic
+// StreamExecutor Stream interface.
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_STREAM_H_
+#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_STREAM_H_
+
+#include "tensorflow/stream_executor/gpu/gpu_driver.h"
+#include "tensorflow/stream_executor/platform/thread_annotations.h"
+#include "tensorflow/stream_executor/stream_executor_internal.h"
+
+namespace stream_executor {
+namespace gpu {
+
+class GpuExecutor;
+
+// Wraps a GpuStreamHandle in order to satisfy the platform-independent
+// StreamInterface.
+//
+// Thread-safe post-initialization.
+class GpuStream : public internal::StreamInterface {
+ public:
+  explicit GpuStream(GpuExecutor* parent)
+      : parent_(parent), gpu_stream_(nullptr), completed_event_(nullptr) {}
+
+  // Note: teardown is handled by a parent's call to DeallocateStream.
+  ~GpuStream() override {}
+
+  void* GpuStreamHack() override { return gpu_stream_; }
+  void** GpuStreamMemberHack() override {
+    return reinterpret_cast<void**>(&gpu_stream_);
+  }
+
+  // Explicitly initialize the CUDA resources associated with this stream, used
+  // by StreamExecutor::AllocateStream().
+  bool Init();
+
+  // Explicitly destroy the CUDA resources associated with this stream, used by
+  // StreamExecutor::DeallocateStream().
+  void Destroy();
+
+  // Returns true if no work is pending or executing on the stream.
+  bool IsIdle() const;
+
+  // Retrieves an event which indicates that all work enqueued into the stream
+  // has completed. Ownership of the event is not transferred to the caller, the
+  // event is owned by this stream.
+  GpuEventHandle* completed_event() { return &completed_event_; }
+
+  // Returns the GpuStreamHandle value for passing to the CUDA API.
+  //
+  // Precond: this GpuStream has been allocated (otherwise passing a nullptr
+  // into the NVIDIA library causes difficult-to-understand faults).
+  GpuStreamHandle gpu_stream() const {
+    DCHECK(gpu_stream_ != nullptr);
+    return const_cast<GpuStreamHandle>(gpu_stream_);
+  }
+
+  // TODO(timshen): Migrate away and remove this function.
+  GpuStreamHandle cuda_stream() const { return gpu_stream(); }
+
+  GpuExecutor* parent() const { return parent_; }
+
+ private:
+  GpuExecutor* parent_;         // Executor that spawned this stream.
+  GpuStreamHandle gpu_stream_;  // Wrapped CUDA stream handle.
+
+  // Event that indicates this stream has completed.
+  GpuEventHandle completed_event_ = nullptr;
+};
+
+// Helper functions to simplify extremely common flows.
+// Converts a Stream to the underlying GpuStream implementation.
+GpuStream* AsGpuStream(Stream* stream);
+
+// Extracts a GpuStreamHandle from a GpuStream-backed Stream object.
+GpuStreamHandle AsGpuStreamValue(Stream* stream);
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_STREAM_H_
diff --git a/tensorflow/stream_executor/cuda/cuda_timer.cc b/tensorflow/stream_executor/gpu/gpu_timer.cc
similarity index 51%
rename from tensorflow/stream_executor/cuda/cuda_timer.cc
rename to tensorflow/stream_executor/gpu/gpu_timer.cc
index 991a12a23d632bd9fb4c97a340e244f6ffb4c7d3..cc4b50d9baa0af70410baad582d210e90bdb7b03 100644
--- a/tensorflow/stream_executor/cuda/cuda_timer.cc
+++ b/tensorflow/stream_executor/gpu/gpu_timer.cc
@@ -1,4 +1,4 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,31 +13,31 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/stream_executor/cuda/cuda_timer.h"
+#include "tensorflow/stream_executor/gpu/gpu_timer.h"
 
-#include "tensorflow/stream_executor/cuda/cuda_driver.h"
-#include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
-#include "tensorflow/stream_executor/cuda/cuda_stream.h"
+#include "tensorflow/stream_executor/gpu/gpu_driver.h"
+#include "tensorflow/stream_executor/gpu/gpu_executor.h"
+#include "tensorflow/stream_executor/gpu/gpu_stream.h"
 #include "tensorflow/stream_executor/lib/status.h"
 
 namespace stream_executor {
-namespace cuda {
+namespace gpu {
 
-bool CUDATimer::Init() {
+bool GpuTimer::Init() {
   CHECK(start_event_ == nullptr && stop_event_ == nullptr);
-  CudaContext* context = parent_->cuda_context();
-  port::Status status = CUDADriver::CreateEvent(
-      context, &start_event_, CUDADriver::EventFlags::kDefault);
+  GpuContext* context = parent_->gpu_context();
+  port::Status status = GpuDriver::CreateEvent(context, &start_event_,
+                                               GpuDriver::EventFlags::kDefault);
   if (!status.ok()) {
     LOG(ERROR) << status;
     return false;
   }
 
-  status = CUDADriver::CreateEvent(context, &stop_event_,
-                                   CUDADriver::EventFlags::kDefault);
+  status = GpuDriver::CreateEvent(context, &stop_event_,
+                                  GpuDriver::EventFlags::kDefault);
   if (!status.ok()) {
     LOG(ERROR) << status;
-    status = CUDADriver::DestroyEvent(context, &start_event_);
+    status = GpuDriver::DestroyEvent(context, &start_event_);
     if (!status.ok()) {
       LOG(ERROR) << status;
     }
@@ -48,47 +48,46 @@ bool CUDATimer::Init() {
   return true;
 }
 
-void CUDATimer::Destroy() {
-  CudaContext* context = parent_->cuda_context();
-  port::Status status = CUDADriver::DestroyEvent(context, &start_event_);
+void GpuTimer::Destroy() {
+  GpuContext* context = parent_->gpu_context();
+  port::Status status = GpuDriver::DestroyEvent(context, &start_event_);
   if (!status.ok()) {
     LOG(ERROR) << status;
   }
 
-  status = CUDADriver::DestroyEvent(context, &stop_event_);
+  status = GpuDriver::DestroyEvent(context, &stop_event_);
   if (!status.ok()) {
     LOG(ERROR) << status;
   }
 }
 
-float CUDATimer::GetElapsedMilliseconds() const {
+float GpuTimer::GetElapsedMilliseconds() const {
   CHECK(start_event_ != nullptr && stop_event_ != nullptr);
   // TODO(leary) provide a way to query timer resolution?
   // CUDA docs say a resolution of about 0.5us
   float elapsed_milliseconds = NAN;
-  (void)CUDADriver::GetEventElapsedTime(parent_->cuda_context(),
-                                        &elapsed_milliseconds, start_event_,
-                                        stop_event_);
+  (void)GpuDriver::GetEventElapsedTime(
+      parent_->gpu_context(), &elapsed_milliseconds, start_event_, stop_event_);
   return elapsed_milliseconds;
 }
 
-bool CUDATimer::Start(CUDAStream* stream) {
-  port::Status status = CUDADriver::RecordEvent(
-      parent_->cuda_context(), start_event_, stream->cuda_stream());
+bool GpuTimer::Start(GpuStream* stream) {
+  port::Status status = GpuDriver::RecordEvent(
+      parent_->gpu_context(), start_event_, stream->gpu_stream());
   if (!status.ok()) {
     LOG(ERROR) << status;
   }
   return status.ok();
 }
 
-bool CUDATimer::Stop(CUDAStream* stream) {
-  port::Status status = CUDADriver::RecordEvent(
-      parent_->cuda_context(), stop_event_, stream->cuda_stream());
+bool GpuTimer::Stop(GpuStream* stream) {
+  port::Status status = GpuDriver::RecordEvent(
+      parent_->gpu_context(), stop_event_, stream->gpu_stream());
   if (!status.ok()) {
     LOG(ERROR) << status;
   }
   return status.ok();
 }
 
-}  // namespace cuda
+}  // namespace gpu
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/gpu/gpu_timer.h b/tensorflow/stream_executor/gpu/gpu_timer.h
new file mode 100644
index 0000000000000000000000000000000000000000..886f0c2d57729270b9a87635ddffd1a4be4acfdb
--- /dev/null
+++ b/tensorflow/stream_executor/gpu/gpu_timer.h
@@ -0,0 +1,90 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Defines the GpuTimer type - the CUDA-specific implementation of the generic
+// StreamExecutor Timer interface.
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_TIMER_H_
+#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_TIMER_H_
+
+#include "tensorflow/stream_executor/gpu/gpu_driver.h"
+#include "tensorflow/stream_executor/gpu/gpu_executor.h"
+#include "tensorflow/stream_executor/stream_executor_internal.h"
+
+namespace stream_executor {
+namespace gpu {
+
+class GpuExecutor;
+class GpuStream;
+
+// Wraps a pair of GpuEventHandles in order to satisfy the platform-independent
+// TimerInferface -- both a start and a stop event are present which may be
+// recorded in a stream.
+class GpuTimer : public internal::TimerInterface {
+ public:
+  explicit GpuTimer(GpuExecutor* parent)
+      : parent_(parent), start_event_(nullptr), stop_event_(nullptr) {}
+
+  // Note: teardown needs to be explicitly handled in this API by a call to
+  // StreamExecutor::DeallocateTimer(), which invokes Destroy().
+  // TODO(csigg): Change to RAII.
+  ~GpuTimer() override {}
+
+  // Allocates the platform-specific pieces of the timer, called as part of
+  // StreamExecutor::AllocateTimer().
+  bool Init();
+
+  // Deallocates the platform-specific pieces of the timer, called as part of
+  // StreamExecutor::DeallocateTimer().
+  void Destroy();
+
+  // Records the "timer start" event at the current point in the stream.
+  bool Start(GpuStream* stream);
+
+  // Records the "timer stop" event at the current point in the stream.
+  bool Stop(GpuStream* stream);
+
+  // Returns the elapsed time, in milliseconds, between the start and stop
+  // events.
+  float GetElapsedMilliseconds() const;
+
+  // See Timer::Microseconds().
+  // TODO(leary) make this into an error code interface...
+  uint64 Microseconds() const override {
+    return GetElapsedMilliseconds() * 1e3;
+  }
+
+  // See Timer::Nanoseconds().
+  uint64 Nanoseconds() const override { return GetElapsedMilliseconds() * 1e6; }
+
+ private:
+  GpuExecutor* parent_;
+  GpuEventHandle start_event_;  // Event recorded to indicate the "start"
+                                // timestamp executing in a stream.
+  GpuEventHandle stop_event_;   // Event recorded to indicate the "stop"
+                                // timestamp executing in a stream.
+};
+
+struct GpuTimerDeleter {
+  void operator()(GpuTimer* t) {
+    t->Destroy();
+    delete t;
+  }
+};
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_TIMER_H_
diff --git a/tensorflow/stream_executor/gpu/gpu_types.h b/tensorflow/stream_executor/gpu/gpu_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..c69177d0760eb225a78bf7531070d007a93d377a
--- /dev/null
+++ b/tensorflow/stream_executor/gpu/gpu_types.h
@@ -0,0 +1,84 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// GPU (ROCm / CUDA) specific type handle resolution
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_TYPES_H_
+#define TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_TYPES_H_
+
+#if TENSORFLOW_USE_ROCM
+
+#include "rocm/include/hip/hip_complex.h"
+#include "rocm/include/hip/hip_runtime.h"
+#include "rocm/include/hiprand/hiprand.h"
+
+#else  // CUDA
+
+#include "cuda/include/cuComplex.h"
+#include "cuda/include/cuda.h"
+
+// cannot include curand.h here
+//   because it triggers the #error in cuda/cuda_gpu_executor.cc
+//     (because curand.h includes cuda_runtime.h)
+// so explicitly adding the lone typedef we need from that file
+typedef struct curandGenerator_st* curandGenerator_t;
+
+#endif
+
+namespace stream_executor {
+namespace gpu {
+
+#if TENSORFLOW_USE_ROCM
+
+using GpuStreamHandle = hipStream_t;
+using GpuEventHandle = hipEvent_t;
+using GpuFunctionHandle = hipFunction_t;
+using GpuFunctionAttribute = hipDeviceAttribute_t;  // not a typo!
+using GpuDeviceHandle = hipDevice_t;
+using GpuDevicePtr = hipDeviceptr_t;
+using GpuDeviceAttribute = hipDeviceAttribute_t;
+using GpuDeviceProperty = hipDeviceProp_t;
+using GpuModuleHandle = hipModule_t;
+using GpuStatus = hipError_t;
+using GpuFuncCachePreference = hipFuncCache_t;
+using GpuSharedMemConfig = hipSharedMemConfig;
+using GpuComplexType = hipComplex;
+using GpuDoubleComplexType = hipDoubleComplex;
+using GpuRngHandle = hiprandGenerator_t;
+
+#else  // CUDA
+
+using GpuStreamHandle = CUstream;
+using GpuEventHandle = CUevent;
+using GpuFunctionHandle = CUfunction;
+using GpuFunctionAttribute = CUfunction_attribute;
+using GpuDeviceHandle = CUdevice;
+using GpuDevicePtr = CUdeviceptr;
+using GpuDeviceAttribute = CUdevice_attribute;
+using GpuDeviceProperty = CUdevprop;
+using GpuModuleHandle = CUmodule;
+using GpuStatus = CUresult;
+using GpuFuncCachePreference = CUfunc_cache;
+using GpuSharedMemConfig = CUsharedconfig;
+using GpuComplexType = cuComplex;
+using GpuDoubleComplexType = cuDoubleComplex;
+using GpuRngHandle = curandGenerator_t;
+
+#endif
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_GPU_TYPES_H_
diff --git a/tensorflow/stream_executor/host/BUILD b/tensorflow/stream_executor/host/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..127452aee9f7a0d528ec2a7b80bc488406b99030
--- /dev/null
+++ b/tensorflow/stream_executor/host/BUILD
@@ -0,0 +1,110 @@
+# Description:
+#   Host-platform specific StreamExecutor support code.
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow/stream_executor:build_defs.bzl", "stream_executor_friends")
+
+package_group(
+    name = "friends",
+    packages = stream_executor_friends(),
+)
+
+package(default_visibility = [":friends"])
+
+# Filegroup used to collect source files for the dependency check.
+filegroup(
+    name = "c_srcs",
+    data = glob([
+        "**/*.cc",
+        "**/*.h",
+    ]),
+)
+
+cc_library(
+    name = "host_platform_id",
+    srcs = [
+        "host_platform_id.cc",
+    ],
+    hdrs = [
+        "host_platform_id.h",
+    ],
+    deps = [
+        "//tensorflow/stream_executor:platform",
+    ],
+)
+
+cc_library(
+    name = "host_platform",
+    srcs = [
+        "host_platform.cc",
+    ],
+    hdrs = [
+        "host_platform.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":host_gpu_executor",
+        ":host_platform_id",
+        "//tensorflow/stream_executor:executor_cache",
+        "//tensorflow/stream_executor:multi_platform_manager",
+        "//tensorflow/stream_executor:stream_executor_headers",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+    ],
+    alwayslink = True,  # Registers itself with the MultiPlatformManager.
+)
+
+cc_library(
+    name = "host_stream",
+    srcs = [
+        "host_stream.cc",
+    ],
+    hdrs = [
+        "host_stream.h",
+    ],
+    deps = [
+        "//tensorflow/stream_executor:kernel",
+        "//tensorflow/stream_executor/lib",
+    ],
+)
+
+cc_library(
+    name = "host_timer",
+    srcs = [
+        "host_timer.cc",
+    ],
+    hdrs = [
+        "host_timer.h",
+    ],
+    deps = [
+        "//tensorflow/stream_executor:stream_executor_internal",
+        "//tensorflow/stream_executor:timer",
+        "//tensorflow/stream_executor/platform",
+    ],
+)
+
+# TODO(22689637): Rename this target.
+cc_library(
+    name = "host_gpu_executor",
+    srcs = [
+        "host_gpu_executor.cc",
+    ],
+    hdrs = [
+        "host_gpu_executor.h",
+    ],
+    deps = [
+        ":host_platform_id",
+        ":host_stream",
+        ":host_timer",
+        "//tensorflow/core:lib",
+        "//tensorflow/stream_executor:kernel",
+        "//tensorflow/stream_executor:rng",
+        "//tensorflow/stream_executor:stream",
+        "//tensorflow/stream_executor:stream_executor_internal",
+        "//tensorflow/stream_executor:stream_executor_pimpl",
+        "//tensorflow/stream_executor:timer",
+        "//tensorflow/stream_executor/lib",
+    ],
+    alwayslink = True,
+)
diff --git a/tensorflow/stream_executor/host_buffer.h b/tensorflow/stream_executor/host_buffer.h
deleted file mode 100644
index 20299da5172f20b9b73c31b6491806dc57b1d2f0..0000000000000000000000000000000000000000
--- a/tensorflow/stream_executor/host_buffer.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_STREAM_EXECUTOR_HOST_BUFFER_H_
-#define TENSORFLOW_STREAM_EXECUTOR_HOST_BUFFER_H_
-
-#include "tensorflow/stream_executor/dnn.h"
-
-namespace stream_executor {
-
-// A HostBuffer is a block of memory in host memory containing the data for a
-// dnn::BatchDescriptor using a device-dependent memory layout.
-// Derived classes provide methods to construct a HostBuffer for a specific
-// device, and to copy data in and out of the buffer.
-class HostBuffer {
- public:
-  const dnn::BatchDescriptor& descriptor() const { return descriptor_; }
-
-  // Returns a string describing the HostBuffer.
-  virtual string AsString() const = 0;
-
- protected:
-  // Construct a HostBuffer from the supplied dnn::BatchDescriptor.
-  explicit HostBuffer(const dnn::BatchDescriptor& descriptor)
-      : descriptor_(descriptor) {}
-  virtual ~HostBuffer() {}
-
- private:
-  const dnn::BatchDescriptor descriptor_;
-};
-
-}  // namespace stream_executor
-
-#endif  // TENSORFLOW_STREAM_EXECUTOR_HOST_BUFFER_H_
diff --git a/tensorflow/stream_executor/lib/BUILD b/tensorflow/stream_executor/lib/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..133ff2b161b9db227a6a4921865f56bfc4b9bece
--- /dev/null
+++ b/tensorflow/stream_executor/lib/BUILD
@@ -0,0 +1,62 @@
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow/stream_executor:build_defs.bzl", "stream_executor_friends")
+
+package_group(
+    name = "friends",
+    packages = stream_executor_friends(),
+)
+
+package(default_visibility = [":friends"])
+
+filegroup(
+    name = "c_srcs",
+    data = glob([
+        "**/*.cc",
+        "**/*.h",
+    ]),
+)
+
+cc_library(
+    name = "lib",
+    srcs = glob(
+        [
+            "**/*.cc",
+        ],
+        exclude = [
+            "**/*test*",
+        ],
+    ),
+    hdrs = glob(["**/*.h"]),
+    deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:ptr_util",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+tf_cc_test(
+    name = "statusor_test",
+    size = "small",
+    srcs = ["statusor_test.cc"],
+    deps = [
+        ":lib",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+cc_library(
+    name = "utility_headers",
+    hdrs = [
+        "ptr_util.h",
+    ],
+    deps = [
+        "//tensorflow/core:ptr_util",
+    ],
+)
diff --git a/tensorflow/stream_executor/lib/initialize.h b/tensorflow/stream_executor/lib/initialize.h
index 688b0214694478e9be1b1d14e58fda94367f547b..cd0b9dad19bf1d0e4e07bc153d94664fda12bd98 100644
--- a/tensorflow/stream_executor/lib/initialize.h
+++ b/tensorflow/stream_executor/lib/initialize.h
@@ -16,55 +16,6 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_INITIALIZE_H_
 #define TENSORFLOW_STREAM_EXECUTOR_LIB_INITIALIZE_H_
 
-#include "tensorflow/stream_executor/platform/port.h"
-
-#if defined(PLATFORM_GOOGLE)
-#include "tensorflow/stream_executor/platform/google/initialize.h"
-#else
-
-#undef REGISTER_MODULE_INITIALIZER
-#undef DECLARE_MODULE_INITIALIZER
-#undef REGISTER_MODULE_INITIALIZER_SEQUENCE
-
-namespace stream_executor {
-namespace port {
-
-class Initializer {
- public:
-  typedef void (*InitializerFunc)();
-  explicit Initializer(InitializerFunc func) { func(); }
-
-  struct Dependency {
-    Dependency(const char *n, Initializer *i) : name(n), initializer(i) {}
-    const char *const name;
-    Initializer *const initializer;
-  };
-
-  struct DependencyRegisterer {
-    DependencyRegisterer(const char *type, const char *name,
-                         Initializer *initializer,
-                         const Dependency &dependency);
-  };
-};
-
-}  // namespace port
-}  // namespace stream_executor
-
-#define REGISTER_INITIALIZER(type, name, body)                             \
-  static void google_init_##type##_##name() { body; }                      \
-  ::stream_executor::port::Initializer google_initializer_##type##_##name( \
-      google_init_##type##_##name)
-
-#define REGISTER_MODULE_INITIALIZER(name, body) \
-  REGISTER_INITIALIZER(module, name, body)
-
-#define DECLARE_INITIALIZER(type, name) \
-  extern ::stream_executor::port::Initializer google_initializer_##type##_##name
-
-#define DECLARE_MODULE_INITIALIZER(name) DECLARE_INITIALIZER(module, name)
-
-#define REGISTER_MODULE_INITIALIZER_SEQUENCE(name1, name2)
-
-#endif  // !defined(PLATFORM_GOOGLE)
+#include "tensorflow/stream_executor/platform/initialize.h"
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_LIB_INITIALIZE_H_
diff --git a/tensorflow/stream_executor/logging.proto b/tensorflow/stream_executor/logging.proto
new file mode 100644
index 0000000000000000000000000000000000000000..3351752788c610cdcb2baa15b83f9fd0f7492b5c
--- /dev/null
+++ b/tensorflow/stream_executor/logging.proto
@@ -0,0 +1,41 @@
+syntax = "proto3";
+
+package stream_executor;
+
+import "tensorflow/stream_executor/dnn.proto";
+
+message CudnnVersion {
+  int32 major = 1;
+  int32 minor = 2;
+  int32 patch = 3;
+};
+
+message ComputeCapability {
+  int32 major = 1;
+  int32 minor = 2;
+}
+
+// NOTE: this proto is temporarily duplicated in other places, outside of
+// stream_executor. The plan is to move all custom logging (tensorflow::Logger
+// related) behavior out of StreamExecutor. There are two reasons:
+// * Technical: stream_executor is part of libtensorflow_framework.so. It's
+//   extremely hard to have a single definition of the protos in the .so, and
+//   let the callers call into those definitions. The complication lives in
+//   cc_proto_library where we have a header-only version and impl version.
+// * Functional: we want to log autotuning stats from the callers. The
+//   autotuning stats are not available in SE.
+//
+// TODO(timshen): remove this proto once both XLA and TF log autotuning
+// results.
+message CudaInfo {
+  CudnnVersion cudnn_version = 1;
+  ComputeCapability compute_capability = 2;
+}
+
+message ConvLogEntry {
+  CudaInfo cuda_info = 1;
+  dnn.ConvolutionProto convolution = 2;
+
+  // Profiled time in ms. 0.0 if the convolution is not profiled.
+  float profile_time_ms = 3;
+}
diff --git a/tensorflow/stream_executor/platform.cc b/tensorflow/stream_executor/platform.cc
index c0205abbee305edc23e24d79c53f9ed3b84049b5..9c99581438653a55223a5ebee6173d2a5fefb3ab 100644
--- a/tensorflow/stream_executor/platform.cc
+++ b/tensorflow/stream_executor/platform.cc
@@ -28,6 +28,8 @@ string PlatformKindString(PlatformKind kind) {
   switch (kind) {
     case PlatformKind::kCuda:
       return "CUDA";
+    case PlatformKind::kROCm:
+      return "ROCm";
     case PlatformKind::kOpenCL:
       return "OpenCL";
     case PlatformKind::kHost:
@@ -52,6 +54,7 @@ PlatformKind PlatformKindFromString(string kind) {
 bool PlatformIsRunnable(PlatformKind kind) {
   switch (kind) {
     case PlatformKind::kCuda:
+    case PlatformKind::kROCm:
     case PlatformKind::kOpenCL:
     case PlatformKind::kHost:
       return true;
@@ -63,6 +66,7 @@ bool PlatformIsRunnable(PlatformKind kind) {
 bool PlatformIsRunnableOnDevice(PlatformKind kind) {
   switch (kind) {
     case PlatformKind::kCuda:
+    case PlatformKind::kROCm:
     case PlatformKind::kOpenCL:
       return true;
     default:
diff --git a/tensorflow/stream_executor/platform.h b/tensorflow/stream_executor/platform.h
index 5cb7047b6f39483f237b5bb249906d9ce8a06b9e..2c2cd77ad21aaeb700a7cffe598112237204b418 100644
--- a/tensorflow/stream_executor/platform.h
+++ b/tensorflow/stream_executor/platform.h
@@ -40,6 +40,7 @@ class StreamExecutor;
 enum class PlatformKind {
   kInvalid,
   kCuda,
+  kROCm,
   kOpenCL,
   kHost,
   kMock,
diff --git a/tensorflow/stream_executor/platform/BUILD b/tensorflow/stream_executor/platform/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..702b2cdfe0dd41997f99daf1bcdcbf8a6994edd8
--- /dev/null
+++ b/tensorflow/stream_executor/platform/BUILD
@@ -0,0 +1,47 @@
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow/stream_executor:build_defs.bzl", "stream_executor_friends")
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_platform_hdrs")
+
+package_group(
+    name = "friends",
+    packages = stream_executor_friends(),
+)
+
+package(
+    default_visibility = [":friends"],
+)
+
+cc_library(
+    name = "platform",
+    textual_hdrs = [
+        "logging.h",
+        "mutex.h",
+        "platform.h",
+        "port.h",
+        "thread_annotations.h",
+        "initialize.h",
+    ],
+    deps = [
+        "//tensorflow/core:lib",
+        "//tensorflow/stream_executor/platform/default:platform",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
+    name = "dso_loader",
+    hdrs = ["dso_loader.h"],
+    deps = [
+        ":platform",
+        "//tensorflow/stream_executor/platform/default:dso_loader",
+    ],
+)
+
+filegroup(
+    name = "c_srcs",
+    data = glob([
+        "**/*.cc",
+        "**/*.h",
+    ]),
+)
diff --git a/tensorflow/stream_executor/platform/default/BUILD b/tensorflow/stream_executor/platform/default/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..f1ae7d86ff78a50da51ef730098cee2fc9e30aad
--- /dev/null
+++ b/tensorflow/stream_executor/platform/default/BUILD
@@ -0,0 +1,25 @@
+licenses(["notice"])  # Apache 2.0
+
+package(default_visibility = ["//tensorflow/stream_executor:__subpackages__"])
+
+cc_library(
+    name = "platform",
+    textual_hdrs = [
+        "initialize.h",
+        "mutex.h",
+    ],
+    deps = ["//tensorflow/core:lib"],
+)
+
+cc_library(
+    name = "dso_loader",
+    srcs = ["dso_loader.cc"],
+    hdrs = ["dso_loader.h"],
+    deps = [
+        "//tensorflow/stream_executor:platform",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "@com_google_absl//absl/strings",
+        "@local_config_cuda//cuda:cuda_headers",
+    ],
+)
diff --git a/tensorflow/stream_executor/platform/default/dso_loader.cc b/tensorflow/stream_executor/platform/default/dso_loader.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a2a9ad67534921c3349dbccaa2f6843bfadc787f
--- /dev/null
+++ b/tensorflow/stream_executor/platform/default/dso_loader.cc
@@ -0,0 +1,186 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/stream_executor/platform/default/dso_loader.h"
+
+#include <stdlib.h>
+
+#include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
+#include "cuda/cuda_config.h"
+#include "tensorflow/core/platform/load_library.h"
+#include "tensorflow/stream_executor/lib/env.h"
+#include "tensorflow/stream_executor/lib/error.h"
+#include "tensorflow/stream_executor/lib/path.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/platform/port.h"
+
+namespace stream_executor {
+namespace internal {
+
+namespace {
+string GetCudaVersion() { return TF_CUDA_VERSION; }
+string GetCudnnVersion() { return TF_CUDNN_VERSION; }
+
+port::StatusOr<void*> GetDsoHandle(const string& name, const string& version) {
+  auto filename = port::Env::Default()->FormatLibraryFileName(name, version);
+  void* dso_handle;
+  port::Status status =
+      port::Env::Default()->LoadLibrary(filename.c_str(), &dso_handle);
+  if (status.ok()) {
+    LOG(INFO) << "Successfully opened dynamic library " << filename;
+    return dso_handle;
+  }
+
+  auto message = absl::StrCat("Could not dlopen library '", filename,
+                              "'; dlerror: ", status.error_message());
+#if !defined(PLATFORM_WINDOWS)
+  if (const char* ld_library_path = getenv("LD_LIBRARY_PATH")) {
+    message += absl::StrCat("; LD_LIRARY_PATH: ", ld_library_path);
+  }
+#endif
+  LOG(INFO) << message;
+  return port::Status(port::error::FAILED_PRECONDITION, message);
+}
+}  // namespace
+
+namespace DsoLoader {
+port::StatusOr<void*> GetCudaDriverDsoHandle() {
+#if defined(PLATFORM_WINDOWS)
+  return GetDsoHandle("nvcuda", "");
+#elif defined(__APPLE__)
+  // On Mac OS X, CUDA sometimes installs libcuda.dylib instead of
+  // libcuda.1.dylib.
+  auto handle_or = GetDsoHandle("cuda", "");
+  if (handle_or.ok()) {
+    return handle_or;
+  }
+#endif
+  return GetDsoHandle("cuda", "1");
+}
+
+port::StatusOr<void*> GetCudaRuntimeDsoHandle() {
+  return GetDsoHandle("cudart", GetCudaVersion());
+}
+
+port::StatusOr<void*> GetCublasDsoHandle() {
+  return GetDsoHandle("cublas", GetCudaVersion());
+}
+
+port::StatusOr<void*> GetCufftDsoHandle() {
+  return GetDsoHandle("cufft", GetCudaVersion());
+}
+
+port::StatusOr<void*> GetCurandDsoHandle() {
+  return GetDsoHandle("curand", GetCudaVersion());
+}
+
+port::StatusOr<void*> GetCuptiDsoHandle() {
+#if defined(ANDROID_TEGRA)
+  // On Android devices the CUDA version number is not added to the library
+  // name.
+  return GetDsoHandle("cupti", "");
+#else
+  return GetDsoHandle("cupti", GetCudaVersion());
+#endif
+}
+
+port::StatusOr<void*> GetCudnnDsoHandle() {
+  return GetDsoHandle("cudnn", GetCudnnVersion());
+}
+
+port::StatusOr<void*> GetRocblasDsoHandle() {
+  return GetDsoHandle("rocblas", "");
+}
+
+port::StatusOr<void*> GetMiopenDsoHandle() {
+  return GetDsoHandle("MIOpen", "");
+}
+
+port::StatusOr<void*> GetRocfftDsoHandle() {
+  return GetDsoHandle("rocfft", "");
+}
+
+port::StatusOr<void*> GetRocrandDsoHandle() {
+  return GetDsoHandle("rocrand", "");
+}
+
+port::StatusOr<void*> GetHipDsoHandle() { return GetDsoHandle("hip_hcc", ""); }
+
+}  // namespace DsoLoader
+
+namespace CachedDsoLoader {
+port::StatusOr<void*> GetCudaDriverDsoHandle() {
+  static auto result = new auto(DsoLoader::GetCudaDriverDsoHandle());
+  return *result;
+}
+
+port::StatusOr<void*> GetCudaRuntimeDsoHandle() {
+  static auto result = new auto(DsoLoader::GetCudaRuntimeDsoHandle());
+  return *result;
+}
+
+port::StatusOr<void*> GetCublasDsoHandle() {
+  static auto result = new auto(DsoLoader::GetCublasDsoHandle());
+  return *result;
+}
+
+port::StatusOr<void*> GetCurandDsoHandle() {
+  static auto result = new auto(DsoLoader::GetCurandDsoHandle());
+  return *result;
+}
+
+port::StatusOr<void*> GetCufftDsoHandle() {
+  static auto result = new auto(DsoLoader::GetCufftDsoHandle());
+  return *result;
+}
+
+port::StatusOr<void*> GetCuptiDsoHandle() {
+  static auto result = new auto(DsoLoader::GetCuptiDsoHandle());
+  return *result;
+}
+
+port::StatusOr<void*> GetCudnnDsoHandle() {
+  static auto result = new auto(DsoLoader::GetCudnnDsoHandle());
+  return *result;
+}
+
+port::StatusOr<void*> GetRocblasDsoHandle() {
+  static auto result = new auto(DsoLoader::GetRocblasDsoHandle());
+  return result;
+}
+
+port::StatusOr<void*> GetMiopenDsoHandle() {
+  static auto result = new auto(DsoLoader::GetMiopenDsoHandle());
+  return result;
+}
+
+port::StatusOr<void*> GetRocfftDsoHandle() {
+  static auto result = new auto(DsoLoader::GetRocfftDsoHandle());
+  return result;
+}
+
+port::StatusOr<void*> GetRocrandDsoHandle() {
+  static auto result = new auto(DsoLoader::GetRocrandDsoHandle());
+  return result;
+}
+
+port::StatusOr<void*> GetHipDsoHandle() {
+  static auto result = new auto(DsoLoader::GetHipDsoHandle());
+  return result;
+}
+
+}  // namespace CachedDsoLoader
+}  // namespace internal
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/platform/default/dso_loader.h b/tensorflow/stream_executor/platform/default/dso_loader.h
new file mode 100644
index 0000000000000000000000000000000000000000..89f23324dcdcfc0ca3d9d8c1382b566fcd1fd79e
--- /dev/null
+++ b/tensorflow/stream_executor/platform/default/dso_loader.h
@@ -0,0 +1,74 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Common DSO loading functionality: exposes callables that dlopen DSOs
+// in either the runfiles directories
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_DSO_LOADER_H_
+#define TENSORFLOW_STREAM_EXECUTOR_DSO_LOADER_H_
+
+#include <vector>
+#include "tensorflow/stream_executor/platform/port.h"
+
+#include "absl/strings/string_view.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/lib/statusor.h"
+#include "tensorflow/stream_executor/platform.h"
+#include "tensorflow/stream_executor/platform/mutex.h"
+
+namespace stream_executor {
+namespace internal {
+
+namespace DsoLoader {
+// The following methods either load the DSO of interest and return a dlopen
+// handle or error status.
+port::StatusOr<void*> GetCudaDriverDsoHandle();
+port::StatusOr<void*> GetCudaRuntimeDsoHandle();
+port::StatusOr<void*> GetCublasDsoHandle();
+port::StatusOr<void*> GetCufftDsoHandle();
+port::StatusOr<void*> GetCurandDsoHandle();
+port::StatusOr<void*> GetCuptiDsoHandle();
+port::StatusOr<void*> GetCudnnDsoHandle();
+
+port::StatusOr<void*> GetRocblasDsoHandle();
+port::StatusOr<void*> GetMiopenDsoHandle();
+port::StatusOr<void*> GetRocfftDsoHandle();
+port::StatusOr<void*> GetRocrandDsoHandle();
+port::StatusOr<void*> GetHipDsoHandle();
+}  // namespace DsoLoader
+
+// Wrapper around the DsoLoader that prevents us from dlopen'ing any of the DSOs
+// more than once.
+namespace CachedDsoLoader {
+// Cached versions of the corresponding DsoLoader methods above.
+port::StatusOr<void*> GetCudaDriverDsoHandle();
+port::StatusOr<void*> GetCudaRuntimeDsoHandle();
+port::StatusOr<void*> GetCublasDsoHandle();
+port::StatusOr<void*> GetCufftDsoHandle();
+port::StatusOr<void*> GetCurandDsoHandle();
+port::StatusOr<void*> GetCuptiDsoHandle();
+port::StatusOr<void*> GetCudnnDsoHandle();
+
+port::StatusOr<void*> GetRocblasDsoHandle();
+port::StatusOr<void*> GetMiopenDsoHandle();
+port::StatusOr<void*> GetRocfftDsoHandle();
+port::StatusOr<void*> GetRocrandDsoHandle();
+port::StatusOr<void*> GetHipDsoHandle();
+}  // namespace CachedDsoLoader
+
+}  // namespace internal
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_DSO_LOADER_H_
diff --git a/tensorflow/stream_executor/platform/default/initialize.h b/tensorflow/stream_executor/platform/default/initialize.h
new file mode 100644
index 0000000000000000000000000000000000000000..2d27c85336e1ca64ebcc6969f2179399529e8b37
--- /dev/null
+++ b/tensorflow/stream_executor/platform/default/initialize.h
@@ -0,0 +1,62 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_PLATFORM_DEFAULT_INITIALIZE_H_
+#define TENSORFLOW_STREAM_EXECUTOR_PLATFORM_DEFAULT_INITIALIZE_H_
+
+#undef REGISTER_MODULE_INITIALIZER
+#undef DECLARE_MODULE_INITIALIZER
+#undef REGISTER_MODULE_INITIALIZER_SEQUENCE
+
+namespace stream_executor {
+namespace port {
+
+class Initializer {
+ public:
+  typedef void (*InitializerFunc)();
+  explicit Initializer(InitializerFunc func) { func(); }
+
+  struct Dependency {
+    Dependency(const char *n, Initializer *i) : name(n), initializer(i) {}
+    const char *const name;
+    Initializer *const initializer;
+  };
+
+  struct DependencyRegisterer {
+    DependencyRegisterer(const char *type, const char *name,
+                         Initializer *initializer,
+                         const Dependency &dependency);
+  };
+};
+
+}  // namespace port
+}  // namespace stream_executor
+
+#define REGISTER_INITIALIZER(type, name, body)                             \
+  static void google_init_##type##_##name() { body; }                      \
+  ::stream_executor::port::Initializer google_initializer_##type##_##name( \
+      google_init_##type##_##name)
+
+#define REGISTER_MODULE_INITIALIZER(name, body) \
+  REGISTER_INITIALIZER(module, name, body)
+
+#define DECLARE_INITIALIZER(type, name) \
+  extern ::stream_executor::port::Initializer google_initializer_##type##_##name
+
+#define DECLARE_MODULE_INITIALIZER(name) DECLARE_INITIALIZER(module, name)
+
+#define REGISTER_MODULE_INITIALIZER_SEQUENCE(name1, name2)
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_PLATFORM_DEFAULT_INITIALIZE_H_
diff --git a/tensorflow/stream_executor/platform/default/mutex.h b/tensorflow/stream_executor/platform/default/mutex.h
index c9f5a7c609e5bbe59ea456e30d575b991aa37b65..2f8f0636ba7bd037f356525047f2dd7c0eda789d 100644
--- a/tensorflow/stream_executor/platform/default/mutex.h
+++ b/tensorflow/stream_executor/platform/default/mutex.h
@@ -16,7 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_PLATFORM_DEFAULT_MUTEX_H_
 #define TENSORFLOW_STREAM_EXECUTOR_PLATFORM_DEFAULT_MUTEX_H_
 
-#include "tensorflow/stream_executor/platform/mutex.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/types.h"
 
 namespace stream_executor {
 
diff --git a/tensorflow/stream_executor/platform/dso_loader.h b/tensorflow/stream_executor/platform/dso_loader.h
new file mode 100644
index 0000000000000000000000000000000000000000..1dd56684b1917b07ba6e421479b14ac22af5d335
--- /dev/null
+++ b/tensorflow/stream_executor/platform/dso_loader.h
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_PLATFORM_DSO_LOADER_H_
+#define TENSORFLOW_STREAM_EXECUTOR_PLATFORM_DSO_LOADER_H_
+
+#include "tensorflow/stream_executor/platform/platform.h"
+
+// Include appropriate platform-dependent implementations
+#if defined(PLATFORM_GOOGLE)
+#include "tensorflow/stream_executor/platform/google/dso_loader.h"
+#elif defined(PLATFORM_POSIX) || defined(PLATFORM_POSIX_ANDROID) || \
+    defined(PLATFORM_GOOGLE_ANDROID)
+#include "tensorflow/stream_executor/platform/default/dso_loader.h"
+#else
+#error Define the appropriate PLATFORM_<foo> macro for this platform
+#endif
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_PLATFORM_DSO_LOADER_H_
diff --git a/tensorflow/stream_executor/platform/initialize.h b/tensorflow/stream_executor/platform/initialize.h
new file mode 100644
index 0000000000000000000000000000000000000000..fb13132afff7c9f6d4c57176eef8d7180bb45a93
--- /dev/null
+++ b/tensorflow/stream_executor/platform/initialize.h
@@ -0,0 +1,27 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_PLATFORM_INITIALIZE_H_
+#define TENSORFLOW_STREAM_EXECUTOR_PLATFORM_INITIALIZE_H_
+
+#include "tensorflow/stream_executor/platform/platform.h"
+
+#if defined(PLATFORM_GOOGLE)
+#include "tensorflow/stream_executor/platform/google/initialize.h"
+#else
+#include "tensorflow/stream_executor/platform/default/initialize.h"
+#endif
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_PLATFORM_INITIALIZE_H_
diff --git a/tensorflow/stream_executor/platform/mutex.h b/tensorflow/stream_executor/platform/mutex.h
index 28828951de521752e8debfc1b6cfd2de73a09828..fa6c8c017c30b66baf07e1ee19f4326d7c01b9c3 100644
--- a/tensorflow/stream_executor/platform/mutex.h
+++ b/tensorflow/stream_executor/platform/mutex.h
@@ -16,8 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_STREAM_EXECUTOR_PLATFORM_MUTEX_H_
 #define TENSORFLOW_STREAM_EXECUTOR_PLATFORM_MUTEX_H_
 
-#include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/platform/types.h"
+#include "tensorflow/stream_executor/platform/platform.h"
 
 #if defined(PLATFORM_GOOGLE)
 #include "tensorflow/stream_executor/platform/google/mutex.h"
diff --git a/tensorflow/stream_executor/platform/platform.h b/tensorflow/stream_executor/platform/platform.h
new file mode 100644
index 0000000000000000000000000000000000000000..5bf0e120d39f8bfa8e1a62ae3749beac076335c6
--- /dev/null
+++ b/tensorflow/stream_executor/platform/platform.h
@@ -0,0 +1,36 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_PLATFORM_PLATFORM_H_
+#define TENSORFLOW_STREAM_EXECUTOR_PLATFORM_PLATFORM_H_
+
+#if !defined(PLATFORM_POSIX) && !defined(PLATFORM_GOOGLE) && \
+    !defined(PLATFORM_POSIX_ANDROID) && !defined(PLATFORM_GOOGLE_ANDROID)
+
+// Choose which platform we are on.
+#if defined(ANDROID) || defined(__ANDROID__)
+#define PLATFORM_POSIX_ANDROID
+
+#elif defined(__APPLE__)
+#define PLATFORM_POSIX
+
+#else
+// If no platform specified, use:
+#define PLATFORM_POSIX
+
+#endif
+#endif
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_PLATFORM_PLATFORM_H_
diff --git a/tensorflow/stream_executor/rocm/BUILD b/tensorflow/stream_executor/rocm/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..5190b551f801d75f809277351b72b9ca6b9cc522
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/BUILD
@@ -0,0 +1,279 @@
+# Description:
+#   ROCm-platform specific StreamExecutor support code.
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load(
+    "//tensorflow/stream_executor:build_defs.bzl",
+    "stream_executor_friends",
+)
+load("//tensorflow:tensorflow.bzl", "tf_copts")
+load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm_is_configured")
+load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
+
+package_group(
+    name = "friends",
+    packages = stream_executor_friends(),
+)
+
+package(
+    default_visibility = [":friends"],
+)
+
+# Filegroup used to collect source files for the dependency check.
+filegroup(
+    name = "c_srcs",
+    data = glob([
+        "**/*.cc",
+        "**/*.h",
+    ]),
+)
+
+cc_library(
+    name = "rocm_diagnostics",
+    srcs = if_rocm_is_configured(["rocm_diagnostics.cc"]),
+    hdrs = if_rocm_is_configured(["rocm_diagnostics.h"]),
+    deps = if_rocm_is_configured([
+        "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:str_format",
+        "//tensorflow/stream_executor/gpu:gpu_diagnostics_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+    ]),
+)
+
+cc_library(
+    name = "rocm_driver",
+    srcs = if_rocm_is_configured(["rocm_driver.cc"]),
+    hdrs = if_rocm_is_configured(["rocm_driver_wrapper.h"]),
+    deps = if_rocm_is_configured([
+        ":rocm_diagnostics",
+        "@com_google_absl//absl/base",
+        "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/strings",
+        "//tensorflow/stream_executor:device_options",
+        "//tensorflow/stream_executor/gpu:gpu_driver_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "//tensorflow/stream_executor/platform:dso_loader",
+        "@local_config_rocm//rocm:rocm_headers",
+    ]),
+)
+
+cc_library(
+    name = "rocm_event",
+    srcs = if_rocm_is_configured(["rocm_event.cc"]),
+    hdrs = [],
+    deps = if_rocm_is_configured([
+        ":rocm_driver",
+        "//tensorflow/stream_executor:stream_executor_headers",
+        "//tensorflow/stream_executor/gpu:gpu_event_header",
+        "//tensorflow/stream_executor/gpu:gpu_executor_header",
+        "//tensorflow/stream_executor/gpu:gpu_stream_header",
+        "//tensorflow/stream_executor/lib",
+    ]),
+)
+
+cc_library(
+    name = "rocm_gpu_executor",
+    srcs = if_rocm_is_configured(["rocm_gpu_executor.cc"]),
+    hdrs = [],
+    deps = if_rocm_is_configured([
+        ":rocm_diagnostics",
+        ":rocm_driver",
+        ":rocm_event",
+        ":rocm_kernel",
+        ":rocm_platform_id",
+        "@com_google_absl//absl/strings",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:plugin_registry",
+        "//tensorflow/stream_executor:stream_executor_internal",
+        "//tensorflow/stream_executor:stream_executor_pimpl_header",
+        "//tensorflow/stream_executor:timer",
+        "//tensorflow/stream_executor/gpu:gpu_activation_header",
+        "//tensorflow/stream_executor/gpu:gpu_event",
+        "//tensorflow/stream_executor/gpu:gpu_kernel_header",
+        "//tensorflow/stream_executor/gpu:gpu_stream",
+        "//tensorflow/stream_executor/gpu:gpu_timer",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "//tensorflow/stream_executor/platform:dso_loader",
+    ]),
+    alwayslink = True,
+)
+
+cc_library(
+    name = "rocm_kernel",
+    srcs = if_rocm_is_configured(["rocm_kernel.cc"]),
+    hdrs = [],
+    visibility = ["//visibility:public"],
+    deps = if_rocm_is_configured([
+        "//tensorflow/stream_executor/gpu:gpu_kernel_header",
+    ]),
+    alwayslink = True,
+)
+
+cc_library(
+    name = "rocm_platform",
+    srcs = if_rocm_is_configured(["rocm_platform.cc"]),
+    hdrs = if_rocm_is_configured(["rocm_platform.h"]),
+    visibility = ["//visibility:public"],
+    deps = if_rocm_is_configured([
+        ":rocm_driver",
+        ":rocm_gpu_executor",
+        ":rocm_platform_id",
+        "//tensorflow/stream_executor",  # buildcleaner: keep
+        "//tensorflow/stream_executor:executor_cache",
+        "//tensorflow/stream_executor:multi_platform_manager",
+        "//tensorflow/stream_executor:stream_executor_pimpl_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+    ]),
+    alwayslink = True,  # Registers itself with the MultiPlatformManager.
+)
+
+cc_library(
+    name = "rocm_platform_id",
+    srcs = ["rocm_platform_id.cc"],
+    hdrs = ["rocm_platform_id.h"],
+    deps = ["//tensorflow/stream_executor:platform"],
+)
+
+cc_library(
+    name = "rocblas_plugin",
+    srcs = if_rocm_is_configured(["rocm_blas.cc"]),
+    hdrs = if_rocm_is_configured(["rocm_blas.h"]),
+    visibility = ["//visibility:public"],
+    deps = if_rocm_is_configured([
+        ":rocm_gpu_executor",
+        ":rocm_platform_id",
+        "//third_party/eigen3",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/stream_executor",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:host_or_device_scalar",
+        "//tensorflow/stream_executor:plugin_registry",
+        "//tensorflow/stream_executor:scratch_allocator",
+        "//tensorflow/stream_executor:timer",
+        "//tensorflow/stream_executor/gpu:gpu_activation",
+        "//tensorflow/stream_executor/gpu:gpu_helpers_header",
+        "//tensorflow/stream_executor/gpu:gpu_stream_header",
+        "//tensorflow/stream_executor/gpu:gpu_timer_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "//tensorflow/stream_executor/platform:dso_loader",
+        "@com_google_absl//absl/strings",
+        "@local_config_rocm//rocm:rocm_headers",
+    ] + if_static([
+        "@local_config_rocm//rocm:rocblas",
+    ])),
+    alwayslink = True,
+)
+
+cc_library(
+    name = "rocfft_plugin",
+    srcs = if_rocm_is_configured(["rocm_fft.cc"]),
+    hdrs = if_rocm_is_configured(["rocm_fft.h"]),
+    visibility = ["//visibility:public"],
+    deps = if_rocm_is_configured([
+        ":rocm_platform_id",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:fft",
+        "//tensorflow/stream_executor:plugin_registry",
+        "//tensorflow/stream_executor:scratch_allocator",
+        "//tensorflow/stream_executor/gpu:gpu_activation",
+        "//tensorflow/stream_executor/gpu:gpu_helpers_header",
+        "//tensorflow/stream_executor/gpu:gpu_executor_header",
+        "//tensorflow/stream_executor/gpu:gpu_stream_header",
+        "//tensorflow/stream_executor/gpu:gpu_kernel_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "//tensorflow/stream_executor/platform:dso_loader",
+        "@local_config_rocm//rocm:rocm_headers",
+    ] + if_static([
+        "@local_config_rocm//rocm:rocfft",
+    ])),
+    alwayslink = True,
+)
+
+# FIXME: enable in future PRs
+#cc_library(
+#    name = "miopen_plugin",
+#    srcs = ["rocm_dnn.cc"],
+#    hdrs = [],
+#    copts = [
+#        # STREAM_EXECUTOR_CUDNN_WRAP would fail on Clang with the default
+#        # setting of template depth 256
+#        "-ftemplate-depth-512",
+#    ],
+#    visibility = ["//visibility:public"],
+#    deps = [
+#        ":rocm_diagnostics",
+#        ":rocm_driver",
+#        ":rocm_gpu_executor",
+#        ":rocm_platform_id",
+#        "//third_party/eigen3",
+#        "//tensorflow/core:lib",
+#        "//tensorflow/core:lib_internal",
+#        "//tensorflow/core:logger",
+#        "//tensorflow/stream_executor:dnn",
+#        "//tensorflow/stream_executor:event",
+#        "//tensorflow/stream_executor:logging_proto_cc",
+#        "//tensorflow/stream_executor:plugin_registry",
+#        "//tensorflow/stream_executor:scratch_allocator",
+#        "//tensorflow/stream_executor:stream_executor_pimpl_header",
+#        "//tensorflow/stream_executor:temporary_device_memory",
+#        "//tenosrflow/stream_executor/gpu:gpu_activation_header",
+#        "//tenosrflow/stream_executor/gpu:gpu_stream_header",
+#        "//tenosrflow/stream_executor/gpu:gpu_timer_header",
+#        "//tensorflow/stream_executor/lib",
+#        "//tensorflow/stream_executor/platform",
+#        "//tensorflow/stream_executor/platform:dso_loader",
+#        "@com_google_absl//absl/strings",
+#        "@local_config_rocm//rocm:rocm_headers",
+#    ] + tf_additional_miopen_plugin_deps() + if_static(["@local_config_rocm//rocm:miopen"]),
+#    alwayslink = True,
+#)
+
+cc_library(
+    name = "rocrand_plugin",
+    srcs = if_rocm_is_configured(["rocm_rng.cc"]),
+    hdrs = if_rocm_is_configured([]),
+    deps = if_rocm_is_configured([
+        ":rocm_gpu_executor",
+        ":rocm_platform_id",
+        "@local_config_rocm//rocm:rocm_headers",
+        "//tensorflow/stream_executor:event",
+        "//tensorflow/stream_executor:plugin_registry",
+        "//tensorflow/stream_executor:rng",
+        "//tensorflow/stream_executor/gpu:gpu_activation_header",
+        "//tensorflow/stream_executor/gpu:gpu_helpers_header",
+        "//tensorflow/stream_executor/gpu:gpu_executor_header",
+        "//tensorflow/stream_executor/gpu:gpu_rng_header",
+        "//tensorflow/stream_executor/gpu:gpu_stream_header",
+        "//tensorflow/stream_executor/lib",
+        "//tensorflow/stream_executor/platform",
+        "//tensorflow/stream_executor/platform:dso_loader",
+    ] + if_static([
+        "@local_config_rocm//rocm:hiprand",
+    ])),
+    alwayslink = True,
+)
+
+cc_library(
+    name = "all_runtime",
+    copts = tf_copts(),
+    visibility = ["//visibility:public"],
+    deps = if_rocm_is_configured([
+        # FIXME: enable in future PRs
+        #":miopen_plugin",
+        ":rocfft_plugin",
+        ":rocblas_plugin",
+        ":rocrand_plugin",
+        ":rocm_driver",
+        ":rocm_platform",
+    ]),
+    alwayslink = 1,
+)
diff --git a/tensorflow/stream_executor/rocm/rocm_blas.cc b/tensorflow/stream_executor/rocm/rocm_blas.cc
new file mode 100644
index 0000000000000000000000000000000000000000..82b966644c573c542c3c59320b0cfe7149ee3dbb
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_blas.cc
@@ -0,0 +1,2374 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "rocm/include/rocblas.h"
+
+#include "tensorflow/stream_executor/rocm/rocm_blas.h"
+
+#define EIGEN_USE_GPU
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
+#include <assert.h>
+#include <complex>
+
+#include "absl/strings/str_cat.h"
+#include "tensorflow/stream_executor/device_memory.h"
+#include "tensorflow/stream_executor/gpu/gpu_activation.h"
+#include "tensorflow/stream_executor/gpu/gpu_executor.h"
+#include "tensorflow/stream_executor/gpu/gpu_helpers.h"
+#include "tensorflow/stream_executor/gpu/gpu_stream.h"
+#include "tensorflow/stream_executor/gpu/gpu_timer.h"
+#include "tensorflow/stream_executor/lib/env.h"
+#include "tensorflow/stream_executor/lib/initialize.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/lib/status_macros.h"
+#include "tensorflow/stream_executor/lib/stringprintf.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/plugin_registry.h"
+#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
+#include "tensorflow/stream_executor/scratch_allocator.h"
+#include "tensorflow/stream_executor/stream_executor.h"
+
+namespace stream_executor {
+namespace gpu {
+
+PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kRocBlasPlugin);
+
+namespace wrap {
+
+#ifdef PLATFORM_GOOGLE
+#define STREAM_EXECUTOR_ROCBLAS_WRAP(__name)                       \
+  struct WrapperShim__##__name {                                   \
+    static const char *kName;                                      \
+    template <typename... Args>                                    \
+    rocblas_status operator()(GpuExecutor *parent, Args... args) { \
+      gpu::ScopedActivateExecutorContext sac{parent};              \
+      return ::__name(args...);                                    \
+    }                                                              \
+  } __name;                                                        \
+  const char *WrapperShim__##__name::kName = #__name;
+
+#define STREAM_EXECUTOR_ROCBLAS_V2_WRAP(__name) \
+  STREAM_EXECUTOR_ROCBLAS_WRAP(__name)
+
+#else
+
+#define STREAM_EXECUTOR_ROCBLAS_WRAP(__name)                              \
+  struct DynLoadShim__##__name {                                          \
+    static const char *kName;                                             \
+    using FuncPtrT = std::add_pointer<decltype(::__name)>::type;          \
+    static void *GetDsoHandle() {                                         \
+      auto s = internal::CachedDsoLoader::GetRocblasDsoHandle();          \
+      return s.ValueOrDie();                                              \
+    }                                                                     \
+    static FuncPtrT LoadOrDie() {                                         \
+      void *f;                                                            \
+      auto s = port::Env::Default()->GetSymbolFromLibrary(GetDsoHandle(), \
+                                                          kName, &f);     \
+      CHECK(s.ok()) << "could not find " << kName                         \
+                    << " in rocblas DSO; dlerror: " << s.error_message(); \
+      return reinterpret_cast<FuncPtrT>(f);                               \
+    }                                                                     \
+    static FuncPtrT DynLoad() {                                           \
+      static FuncPtrT f = LoadOrDie();                                    \
+      return f;                                                           \
+    }                                                                     \
+    template <typename... Args>                                           \
+    rocblas_status operator()(GpuExecutor *parent, Args... args) {        \
+      gpu::ScopedActivateExecutorContext sac{parent};                     \
+      return DynLoad()(args...);                                          \
+    }                                                                     \
+  } __name;                                                               \
+  const char *DynLoadShim__##__name::kName = #__name;
+
+#define STREAM_EXECUTOR_ROCBLAS_V2_WRAP(__name) \
+  STREAM_EXECUTOR_ROCBLAS_WRAP(__name)
+
+#endif
+
+#define ROCBLAS_BLAS_ROUTINE_EACH(__macro)                                     \
+  __macro(rocblas_snrm2) __macro(rocblas_dnrm2) /*  __macro(rocblas_scnrm2)    \
+                                                  __macro(rocblas_dznrm2) */   \
+      __macro(rocblas_sdot)                                                    \
+          __macro(rocblas_ddot) /*  __macro(rocblas_cdotu)                     \
+                                  __macro(rocblas_cdotc)                       \
+                                  __macro(rocblas_zdotu)                       \
+                                  __macro(rocblas_zdotc)                    */ \
+      __macro(rocblas_sscal)                                                   \
+          __macro(rocblas_dscal) /*  __macro(rocblas_cscal)                    \
+                                   __macro(rocblas_csscal)                     \
+                                   __macro(rocblas_zscal)                      \
+                                   __macro(rocblas_zdscal) */                  \
+      __macro(rocblas_saxpy)                                                   \
+          __macro(rocblas_daxpy) /*  __macro(rocblas_caxpy)                    \
+                                   __macro(rocblas_zaxpy) */                   \
+      __macro(rocblas_scopy)                                                   \
+          __macro(rocblas_dcopy) /*  __macro(rocblas_ccopy)                    \
+                                   __macro(rocblas_zcopy) */                   \
+      __macro(rocblas_sswap)                                                   \
+          __macro(rocblas_dswap) /*  __macro(rocblas_cswap)                    \
+                                   __macro(rocblas_zswap) */                   \
+      __macro(rocblas_isamax)                                                  \
+          __macro(rocblas_idamax) /*  __macro(rocblas_icamax)                  \
+                                    __macro(rocblas_izamax) */                 \
+      __macro(rocblas_isamin)                                                  \
+          __macro(rocblas_idamin) /*  __macro(rocblas_icamin)                  \
+                                    __macro(rocblas_izamin) */                 \
+      __macro(rocblas_sasum)                                                   \
+          __macro(rocblas_dasum) /*  __macro(rocblas_scasum)                   \
+                                   __macro(rocblas_dzasum)                     \
+                                   __macro(rocblas_srot)                       \
+                                   __macro(rocblas_drot)                       \
+                                   __macro(rocblas_crot)                       \
+                                   __macro(rocblas_csrot)                      \
+                                   __macro(rocblas_zrot)                       \
+                                   __macro(rocblas_zdrot)                      \
+                                   __macro(rocblas_srotg)                      \
+                                   __macro(rocblas_drotg)                      \
+                                   __macro(rocblas_Crotg)                      \
+                                   __macro(rocblas_crotg)                      \
+                                   __macro(rocblas_zrotm)                      \
+                                   __macro(rocblas_drotm)                      \
+                                   __macro(rocblas_srotmg)                     \
+                                   __macro(rocblas_drotmg) */                  \
+      __macro(rocblas_sgemv)                                                   \
+          __macro(rocblas_dgemv) /*  __macro(rocblas_cgemv)                    \
+                                   __macro(rocblas_zgemv)                      \
+                                   __macro(rocblas_sgbmv)                      \
+                                   __macro(rocblas_dgbmv)                      \
+                                   __macro(rocblas_cgbmv)                      \
+                                   __macro(rocblas_zgbmv)                      \
+                                   __macro(rocblas_strmv)                      \
+                                   __macro(rocblas_dtrmv)                      \
+                                   __macro(rocblas_ctrmv)                      \
+                                   __macro(rocblas_ztrmv)                      \
+                                   __macro(rocblas_stbmv)                      \
+                                   __macro(rocblas_dtbmv)                      \
+                                   __macro(rocblas_ctbmv)                      \
+                                   __macro(rocblas_ztbmv)                      \
+                                   __macro(rocblas_stpmv)                      \
+                                   __macro(rocblas_dtpmv)                      \
+                                   __macro(rocblas_ctpmv)                      \
+                                   __macro(rocblas_ztpmv)                      \
+                                   __macro(rocblas_strsv)                      \
+                                   __macro(rocblas_dtrsv)                      \
+                                   __macro(rocblas_ctrsv)                      \
+                                   __macro(rocblas_ztrsv)                      \
+                                   __macro(rocblas_stpsv)                      \
+                                   __macro(rocblas_dtpsv)                      \
+                                   __macro(rocblas_ctpsv)                      \
+                                   __macro(rocblas_ztpsv)                      \
+                                   __macro(rocblas_stbsv)                      \
+                                   __macro(rocblas_dtbsv)                      \
+                                   __macro(rocblas_ctbsv)                      \
+                                   __macro(rocblas_ztbsv)                      \
+                                   __macro(rocblas_ssymv)                      \
+                                   __macro(rocblas_dsymv)                      \
+                                   __macro(rocblas_csymv)                      \
+                                   __macro(rocblas_zsymv)                      \
+                                   __macro(rocblas_chemv)                      \
+                                   __macro(rocblas_zhemv)                      \
+                                   __macro(rocblas_ssbmv)                      \
+                                   __macro(rocblas_dsbmv)                      \
+                                   __macro(rocblas_chbmv)                      \
+                                   __macro(rocblas_zhbmv)                      \
+                                   __macro(rocblas_sspmv)                      \
+                                   __macro(rocblas_dspmv)                      \
+                                   __macro(rocblas_chpmv)                      \
+                                   __macro(rocblas_zhpmv) */                   \
+      __macro(rocblas_sger)                                                    \
+          __macro(rocblas_dger) /*  __macro(rocblas_cgeru)                     \
+                                  __macro(rocblas_cgerc)                       \
+                                  __macro(rocblas_zgeru)                       \
+                                  __macro(rocblas_zgerc)                    */ \
+      __macro(rocblas_ssyr)                                                    \
+          __macro(rocblas_dsyr) /*  __macro(rocblas_csyr)                      \
+                                  __macro(rocblas_zsyr)                        \
+                                  __macro(rocblas_cher)                        \
+                                  __macro(rocblas_zher)                        \
+                                  __macro(rocblas_sspr)                        \
+                                  __macro(rocblas_dspr)                        \
+                                  __macro(rocblas_chpr)                        \
+                                  __macro(rocblas_zhpr)                        \
+                                  __macro(rocblas_ssyr2)                       \
+                                  __macro(rocblas_dsyr2)                       \
+                                  __macro(rocblas_csyr2)                       \
+                                  __macro(rocblas_zsyr2)                       \
+                                  __macro(rocblas_cher2)                       \
+                                  __macro(rocblas_zher2)                       \
+                                  __macro(rocblas_sspr2)                       \
+                                  __macro(rocblas_dspr2)                       \
+                                  __macro(rocblas_chpr2)                       \
+                                  __macro(rocblas_zhpr2)                    */ \
+      __macro(rocblas_sgemm) __macro(rocblas_dgemm)                            \
+          __macro(rocblas_hgemm) /*  __macro(rocblas_cgemm)                    \
+                                   __macro(rocblas_zgemm)                      \
+                                   __macro(rocblas_ssyrk)                      \
+                                   __macro(rocblas_dsyrk)                      \
+                                   __macro(rocblas_csyrk)                      \
+                                   __macro(rocblas_zsyrk)                      \
+                                   __macro(rocblas_cherk)                      \
+                                   __macro(rocblas_zherk)                      \
+                                   __macro(rocblas_ssyr2k)                     \
+                                   __macro(rocblas_dsyr2k)                     \
+                                   __macro(rocblas_csyr2k)                     \
+                                   __macro(rocblas_zsyr2k)                     \
+                                   __macro(rocblas_cher2k)                     \
+                                   __macro(rocblas_zher2k)                     \
+                                   __macro(rocblas_ssyrkx)                     \
+                                   __macro(rocblas_dsyrkx)                     \
+                                   __macro(rocblas_csyrkx)                     \
+                                   __macro(rocblas_zsyrkx)                     \
+                                   __macro(rocblas_cherkx)                     \
+                                   __macro(rocblas_zherkx)                     \
+                                   __macro(rocblas_ssymm)                      \
+                                   __macro(rocblas_dsymm)                      \
+                                   __macro(rocblas_csymm)                      \
+                                   __macro(rocblas_zsymm)                      \
+                                   __macro(rocblas_chemm)                      \
+                                   __macro(rocblas_zhemm) */                   \
+      __macro(rocblas_strsm)                                                   \
+          __macro(rocblas_dtrsm) /*  __macro(rocblas_ctrsm)                    \
+                                   __macro(rocblas_ztrsm)                      \
+                                   __macro(rocblas_strmm)                      \
+                                   __macro(rocblas_dtrmm)                      \
+                                   __macro(rocblas_ctrmm)                      \
+                                   __macro(rocblas_ztrmm) */                   \
+      __macro(rocblas_sgeam)                                                   \
+          __macro(rocblas_dgeam) /*  __macro(rocblas_cgeam)                    \
+                                   __macro(rocblas_zgeam)                      \
+                                   __macro(rocblas_sdgmm)                      \
+                                   __macro(rocblas_ddgmm)                      \
+                                   __macro(rocblas_cdgmm)                      \
+                                   __macro(rocblas_zdgmm) */
+
+STREAM_EXECUTOR_ROCBLAS_V2_WRAP(rocblas_create_handle)
+STREAM_EXECUTOR_ROCBLAS_V2_WRAP(rocblas_destroy_handle)
+STREAM_EXECUTOR_ROCBLAS_V2_WRAP(rocblas_set_stream)
+// STREAM_EXECUTOR_ROCBLAS_V2_WRAP(rocblas_set_pointer_mode)
+// STREAM_EXECUTOR_ROCBLAS_V2_WRAP(rocblas_get_pointer_mode)
+// STREAM_EXECUTOR_ROCBLAS_WRAP(rocblas_sgemm_batched)
+STREAM_EXECUTOR_ROCBLAS_WRAP(rocblas_hgemm_strided_batched)
+STREAM_EXECUTOR_ROCBLAS_WRAP(rocblas_sgemm_strided_batched)
+// STREAM_EXECUTOR_ROCBLAS_WRAP(rocblas_dgemm_batched)
+STREAM_EXECUTOR_ROCBLAS_WRAP(rocblas_dgemm_strided_batched)
+// STREAM_EXECUTOR_ROCBLAS_WRAP(rocblas_cgemm_batched)
+// STREAM_EXECUTOR_ROCBLAS_WRAP(rocblas_zgemm_batched)
+ROCBLAS_BLAS_ROUTINE_EACH(STREAM_EXECUTOR_ROCBLAS_V2_WRAP)
+
+}  // namespace wrap
+
+static string ToString(rocblas_status status) {
+  switch (status) {
+    case rocblas_status_success:
+      return "rocblas_status_success";
+    case rocblas_status_invalid_handle:
+      return "rocblas_status_invalid_handle";
+    case rocblas_status_not_implemented:
+      return "rocblas_status_not_implemented";
+    case rocblas_status_invalid_pointer:
+      return "rocblas_status_invalid_pointer";
+    case rocblas_status_invalid_size:
+      return "rocblas_status_invalid_size";
+    case rocblas_status_memory_error:
+      return "rocblas_status_memory_error";
+    case rocblas_status_internal_error:
+      return "rocblas_status_internal_error";
+    default:
+      return absl::StrCat("<invalid rocBLAS status: ", status, ">");
+  }
+}
+
+bool ROCMBlas::Init() {
+  rocblas_status ret = wrap::rocblas_create_handle(parent_, &blas_);
+  if (ret != rocblas_status_success) {
+    LOG(ERROR) << "failed to create rocBLAS handle: " << ToString(ret);
+    return false;
+  }
+
+  return true;
+}
+
+ROCMBlas::ROCMBlas(gpu::GpuExecutor *parent)
+    : parent_(CHECK_NOTNULL(parent)), blas_(nullptr) {}
+
+ROCMBlas::~ROCMBlas() {
+  if (blas_ != nullptr) {
+    wrap::rocblas_destroy_handle(parent_, blas_);
+  }
+}
+
+bool ROCMBlas::SetStream(Stream *stream) {
+  CHECK(stream != nullptr);
+  CHECK(AsGpuStreamValue(stream) != nullptr);
+  CHECK(blas_ != nullptr);
+  rocblas_status ret =
+      wrap::rocblas_set_stream(parent_, blas_, AsGpuStreamValue(stream));
+  if (ret != rocblas_status_success) {
+    LOG(ERROR) << "failed to set stream for rocBLAS calls: " << ToString(ret);
+    return false;
+  }
+
+  return true;
+}
+
+namespace {
+
+// Helper functions transforming blas arguments into rocBLAS arguments.
+
+rocblas_operation ROCMBlasTranspose(blas::Transpose trans) {
+  switch (trans) {
+    case blas::Transpose::kNoTranspose:
+      return rocblas_operation_none;
+    case blas::Transpose::kTranspose:
+      return rocblas_operation_transpose;
+    case blas::Transpose::kConjugateTranspose:
+      return rocblas_operation_conjugate_transpose;
+    default:
+      LOG(FATAL) << "Invalid value of blas::Transpose.";
+  }
+}
+
+rocblas_fill ROCMBlasUpperLower(blas::UpperLower uplo) {
+  switch (uplo) {
+    case blas::UpperLower::kUpper:
+      return rocblas_fill_upper;
+    case blas::UpperLower::kLower:
+      return rocblas_fill_lower;
+    default:
+      LOG(FATAL) << "Invalid value of blas::UpperLower.";
+  }
+}
+
+rocblas_diagonal ROCMBlasDiagonal(blas::Diagonal diag) {
+  switch (diag) {
+    case blas::Diagonal::kUnit:
+      return rocblas_diagonal_unit;
+    case blas::Diagonal::kNonUnit:
+      return rocblas_diagonal_non_unit;
+    default:
+      LOG(FATAL) << "Invalid value of blas::Diagonal.";
+  }
+}
+
+rocblas_side ROCMBlasSide(blas::Side side) {
+  switch (side) {
+    case blas::Side::kLeft:
+      return rocblas_side_left;
+    case blas::Side::kRight:
+      return rocblas_side_right;
+    default:
+      LOG(FATAL) << "Invalid value of blas::Side.";
+  }
+}
+
+}  // namespace
+
+template <typename FuncT, typename... Args>
+bool ROCMBlas::DoBlasInternalImpl(FuncT rocblas_func, Stream *stream,
+                                  bool pointer_mode_host, bool err_on_failure,
+                                  Args... args) {
+  mutex_lock lock{mu_};
+
+  CHECK(blas_ != nullptr);
+  if (!SetStream(stream)) {
+    return false;
+  }
+
+  rocblas_status ret = rocblas_func(parent_, blas_, args...);
+  if (err_on_failure && ret != rocblas_status_success) {
+    LOG(ERROR) << "failed to run ROCBLAS routine " << rocblas_func.kName << ": "
+               << ToString(ret);
+  }
+  return ret == rocblas_status_success;
+}
+
+bool ROCMBlas::DoBlasAsum(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<float> &x, int incx,
+                          DeviceMemory<float> *result) {
+  return DoBlasInternal(wrap::rocblas_sasum, stream,
+                        false /* = pointer_mode_host */, elem_count,
+                        GpuMemory(x), incx, GpuMemoryMutable(result));
+}
+
+bool ROCMBlas::DoBlasAsum(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<double> &x, int incx,
+                          DeviceMemory<double> *result) {
+  return DoBlasInternal(wrap::rocblas_dasum, stream,
+                        false /* = pointer_mode_host */, elem_count,
+                        GpuMemory(x), incx, GpuMemoryMutable(result));
+}
+
+bool ROCMBlas::DoBlasAsum(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          DeviceMemory<float> *result) {
+  LOG(ERROR) << "rocBLAS does not currently support the ASUM operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasAsum(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          DeviceMemory<double> *result) {
+  LOG(ERROR) << "rocBLAS does not currently support the ASUM operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasAxpy(Stream *stream, uint64 elem_count, float alpha,
+                          const DeviceMemory<float> &x, int incx,
+                          DeviceMemory<float> *y, int incy) {
+  return DoBlasInternal(wrap::rocblas_saxpy, stream,
+                        true /* = pointer_mode_host */, elem_count, &alpha,
+                        GpuMemory(x), incx, GpuMemoryMutable(y), incy);
+}
+
+bool ROCMBlas::DoBlasAxpy(Stream *stream, uint64 elem_count, double alpha,
+                          const DeviceMemory<double> &x, int incx,
+                          DeviceMemory<double> *y, int incy) {
+  return DoBlasInternal(wrap::rocblas_daxpy, stream,
+                        true /* = pointer_mode_host */, elem_count, &alpha,
+                        GpuMemory(x), incx, GpuMemoryMutable(y), incy);
+}
+
+bool ROCMBlas::DoBlasAxpy(Stream *stream, uint64 elem_count,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          DeviceMemory<std::complex<float>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the AXPY operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasAxpy(Stream *stream, uint64 elem_count,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          DeviceMemory<std::complex<double>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the AXPY operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasCopy(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<float> &x, int incx,
+                          DeviceMemory<float> *y, int incy) {
+  return DoBlasInternal(wrap::rocblas_scopy, stream,
+                        true /* = pointer_mode_host */, elem_count,
+                        GpuMemory(x), incx, GpuMemoryMutable(y), incy);
+}
+
+bool ROCMBlas::DoBlasCopy(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<double> &x, int incx,
+                          DeviceMemory<double> *y, int incy) {
+  return DoBlasInternal(wrap::rocblas_dcopy, stream,
+                        true /* = pointer_mode_host */, elem_count,
+                        GpuMemory(x), incx, GpuMemoryMutable(y), incy);
+}
+
+bool ROCMBlas::DoBlasCopy(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          DeviceMemory<std::complex<float>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the COPY operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasCopy(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          DeviceMemory<std::complex<double>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the COPY operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasDot(Stream *stream, uint64 elem_count,
+                         const DeviceMemory<float> &x, int incx,
+                         const DeviceMemory<float> &y, int incy,
+                         DeviceMemory<float> *result) {
+  return DoBlasInternal(
+      wrap::rocblas_sdot, stream, false /* = pointer_mode_host */, elem_count,
+      GpuMemory(x), incx, GpuMemory(y), incy, GpuMemoryMutable(result));
+}
+
+bool ROCMBlas::DoBlasDot(Stream *stream, uint64 elem_count,
+                         const DeviceMemory<double> &x, int incx,
+                         const DeviceMemory<double> &y, int incy,
+                         DeviceMemory<double> *result) {
+  return DoBlasInternal(
+      wrap::rocblas_ddot, stream, false /* = pointer_mode_host */, elem_count,
+      GpuMemory(x), incx, GpuMemory(y), incy, GpuMemoryMutable(result));
+}
+
+bool ROCMBlas::DoBlasDotc(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          const DeviceMemory<std::complex<float>> &y, int incy,
+                          DeviceMemory<std::complex<float>> *result) {
+  LOG(ERROR) << "rocBLAS does not currently support the DOT operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasDotc(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          const DeviceMemory<std::complex<double>> &y, int incy,
+                          DeviceMemory<std::complex<double>> *result) {
+  LOG(ERROR) << "rocBLAS does not currently support the DOT operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasDotu(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          const DeviceMemory<std::complex<float>> &y, int incy,
+                          DeviceMemory<std::complex<float>> *result) {
+  LOG(ERROR) << "rocBLAS does not currently support the DOT operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasDotu(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          const DeviceMemory<std::complex<double>> &y, int incy,
+                          DeviceMemory<std::complex<double>> *result) {
+  LOG(ERROR) << "rocBLAS does not currently support the DOT operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<float> &x, int incx,
+                          DeviceMemory<float> *result) {
+  return DoBlasInternal(wrap::rocblas_snrm2, stream,
+                        false /* = pointer_mode_host */, elem_count,
+                        GpuMemory(x), incx, GpuMemoryMutable(result));
+}
+
+bool ROCMBlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<double> &x, int incx,
+                          DeviceMemory<double> *result) {
+  return DoBlasInternal(wrap::rocblas_dnrm2, stream,
+                        false /* = pointer_mode_host */, elem_count,
+                        GpuMemory(x), incx, GpuMemoryMutable(result));
+}
+
+bool ROCMBlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          DeviceMemory<float> *result) {
+  LOG(ERROR) << "rocBLAS does not currently support the NRM2 operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          DeviceMemory<double> *result) {
+  LOG(ERROR) << "rocBLAS does not currently support the NRM2 operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasRot(Stream *stream, uint64 elem_count,
+                         DeviceMemory<float> *x, int incx,
+                         DeviceMemory<float> *y, int incy, float c, float s) {
+  LOG(ERROR) << "rocBLAS does not currently support the ROT operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasRot(Stream *stream, uint64 elem_count,
+                         DeviceMemory<double> *x, int incx,
+                         DeviceMemory<double> *y, int incy, double c,
+                         double s) {
+  LOG(ERROR) << "rocBLAS does not currently support the ROT operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasRot(Stream *stream, uint64 elem_count,
+                         DeviceMemory<std::complex<float>> *x, int incx,
+                         DeviceMemory<std::complex<float>> *y, int incy,
+                         float c, float s) {
+  LOG(ERROR) << "rocBLAS does not currently support the ROT operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasRot(Stream *stream, uint64 elem_count,
+                         DeviceMemory<std::complex<double>> *x, int incx,
+                         DeviceMemory<std::complex<double>> *y, int incy,
+                         double c, double s) {
+  LOG(ERROR) << "rocBLAS does not currently support the ROT operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasRotg(Stream *stream, DeviceMemory<float> *a,
+                          DeviceMemory<float> *b, DeviceMemory<float> *c,
+                          DeviceMemory<float> *s) {
+  LOG(ERROR) << "rocBLAS does not currently support the ROTG operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasRotg(Stream *stream, DeviceMemory<double> *a,
+                          DeviceMemory<double> *b, DeviceMemory<double> *c,
+                          DeviceMemory<double> *s) {
+  LOG(ERROR) << "rocBLAS does not currently support the ROTG operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasRotg(Stream *stream, DeviceMemory<std::complex<float>> *a,
+                          DeviceMemory<std::complex<float>> *b,
+                          DeviceMemory<float> *c,
+                          DeviceMemory<std::complex<float>> *s) {
+  LOG(ERROR) << "rocBLAS does not currently support the ROTG operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasRotg(Stream *stream, DeviceMemory<std::complex<double>> *a,
+                          DeviceMemory<std::complex<double>> *b,
+                          DeviceMemory<double> *c,
+                          DeviceMemory<std::complex<double>> *s) {
+  LOG(ERROR) << "rocBLAS does not currently support the ROTG operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasRotm(Stream *stream, uint64 elem_count,
+                          DeviceMemory<float> *x, int incx,
+                          DeviceMemory<float> *y, int incy,
+                          const DeviceMemory<float> &param) {
+  LOG(ERROR) << "rocBLAS does not currently support the ROTM operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasRotm(Stream *stream, uint64 elem_count,
+                          DeviceMemory<double> *x, int incx,
+                          DeviceMemory<double> *y, int incy,
+                          const DeviceMemory<double> &param) {
+  LOG(ERROR) << "rocBLAS does not currently support the ROTM operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasRotmg(Stream *stream, DeviceMemory<float> *d1,
+                           DeviceMemory<float> *d2, DeviceMemory<float> *x1,
+                           const DeviceMemory<float> &y1,
+                           DeviceMemory<float> *param) {
+  LOG(ERROR) << "rocBLAS does not currently support the ROTMG operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasRotmg(Stream *stream, DeviceMemory<double> *d1,
+                           DeviceMemory<double> *d2, DeviceMemory<double> *x1,
+                           const DeviceMemory<double> &y1,
+                           DeviceMemory<double> *param) {
+  LOG(ERROR) << "rocBLAS does not currently support the ROTMG operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasScal(Stream *stream, uint64 elem_count, float alpha,
+                          DeviceMemory<float> *x, int incx) {
+  return DoBlasInternal(wrap::rocblas_sscal, stream,
+                        true /* = pointer_mode_host */, elem_count, &alpha,
+                        GpuMemoryMutable(x), incx);
+}
+
+bool ROCMBlas::DoBlasScal(Stream *stream, uint64 elem_count, double alpha,
+                          DeviceMemory<double> *x, int incx) {
+  return DoBlasInternal(wrap::rocblas_dscal, stream,
+                        true /* = pointer_mode_host */, elem_count, &alpha,
+                        GpuMemoryMutable(x), incx);
+}
+
+bool ROCMBlas::DoBlasScal(Stream *stream, uint64 elem_count, float alpha,
+                          DeviceMemory<std::complex<float>> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the SCAL operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasScal(Stream *stream, uint64 elem_count, double alpha,
+                          DeviceMemory<std::complex<double>> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the SCAL operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasScal(Stream *stream, uint64 elem_count,
+                          std::complex<float> alpha,
+                          DeviceMemory<std::complex<float>> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the SCAL operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasScal(Stream *stream, uint64 elem_count,
+                          std::complex<double> alpha,
+                          DeviceMemory<std::complex<double>> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the SCAL operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSwap(Stream *stream, uint64 elem_count,
+                          DeviceMemory<float> *x, int incx,
+                          DeviceMemory<float> *y, int incy) {
+  return DoBlasInternal(wrap::rocblas_sswap, stream,
+                        true /* = pointer_mode_host */, elem_count,
+                        GpuMemoryMutable(x), incx, GpuMemoryMutable(y), incy);
+}
+
+bool ROCMBlas::DoBlasSwap(Stream *stream, uint64 elem_count,
+                          DeviceMemory<double> *x, int incx,
+                          DeviceMemory<double> *y, int incy) {
+  return DoBlasInternal(wrap::rocblas_dswap, stream,
+                        true /* = pointer_mode_host */, elem_count,
+                        GpuMemoryMutable(x), incx, GpuMemoryMutable(y), incy);
+}
+
+bool ROCMBlas::DoBlasSwap(Stream *stream, uint64 elem_count,
+                          DeviceMemory<std::complex<float>> *x, int incx,
+                          DeviceMemory<std::complex<float>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the SWAP operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSwap(Stream *stream, uint64 elem_count,
+                          DeviceMemory<std::complex<double>> *x, int incx,
+                          DeviceMemory<std::complex<double>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the SWAP operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasIamax(Stream *stream, uint64 elem_count,
+                           const DeviceMemory<float> &x, int incx,
+                           DeviceMemory<int> *result) {
+  return DoBlasInternal(wrap::rocblas_isamax, stream,
+                        false /* = pointer_mode_host */, elem_count,
+                        GpuMemory(x), incx, GpuMemoryMutable(result));
+}
+
+bool ROCMBlas::DoBlasIamax(Stream *stream, uint64 elem_count,
+                           const DeviceMemory<double> &x, int incx,
+                           DeviceMemory<int> *result) {
+  return DoBlasInternal(wrap::rocblas_idamax, stream,
+                        false /* = pointer_mode_host */, elem_count,
+                        GpuMemory(x), incx, GpuMemoryMutable(result));
+}
+
+bool ROCMBlas::DoBlasIamax(Stream *stream, uint64 elem_count,
+                           const DeviceMemory<std::complex<float>> &x, int incx,
+                           DeviceMemory<int> *result) {
+  LOG(ERROR) << "rocBLAS does not currently support the AMAX operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasIamax(Stream *stream, uint64 elem_count,
+                           const DeviceMemory<std::complex<double>> &x,
+                           int incx, DeviceMemory<int> *result) {
+  LOG(ERROR) << "rocBLAS does not currently support the AMAX operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasIamin(Stream *stream, uint64 elem_count,
+                           const DeviceMemory<float> &x, int incx,
+                           DeviceMemory<int> *result) {
+  return DoBlasInternal(
+      wrap::rocblas_isamin, stream, false /* = pointer_mode_host */, elem_count,
+      GpuComplex(GpuMemory(x)), incx, GpuMemoryMutable(result));
+}
+
+bool ROCMBlas::DoBlasIamin(Stream *stream, uint64 elem_count,
+                           const DeviceMemory<double> &x, int incx,
+                           DeviceMemory<int> *result) {
+  return DoBlasInternal(
+      wrap::rocblas_idamin, stream, false /* = pointer_mode_host */, elem_count,
+      GpuComplex(GpuMemory(x)), incx, GpuMemoryMutable(result));
+}
+
+bool ROCMBlas::DoBlasIamin(Stream *stream, uint64 elem_count,
+                           const DeviceMemory<std::complex<float>> &x, int incx,
+                           DeviceMemory<int> *result) {
+  LOG(ERROR) << "rocBLAS does not currently support the AMIN operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasIamin(Stream *stream, uint64 elem_count,
+                           const DeviceMemory<std::complex<double>> &x,
+                           int incx, DeviceMemory<int> *result) {
+  LOG(ERROR) << "rocBLAS does not currently support the AMIN operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
+                          uint64 n, uint64 kl, uint64 ku, float alpha,
+                          const DeviceMemory<float> &a, int lda,
+                          const DeviceMemory<float> &x, int incx, float beta,
+                          DeviceMemory<float> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the GBMV operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
+                          uint64 n, uint64 kl, uint64 ku, double alpha,
+                          const DeviceMemory<double> &a, int lda,
+                          const DeviceMemory<double> &x, int incx, double beta,
+                          DeviceMemory<double> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the GBMV operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
+                          uint64 n, uint64 kl, uint64 ku,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &a, int lda,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          std::complex<float> beta,
+                          DeviceMemory<std::complex<float>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the GBMV operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
+                          uint64 n, uint64 kl, uint64 ku,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &a, int lda,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          std::complex<double> beta,
+                          DeviceMemory<std::complex<double>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the GBMV operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
+                          uint64 n, float alpha, const DeviceMemory<float> &a,
+                          int lda, const DeviceMemory<float> &x, int incx,
+                          float beta, DeviceMemory<float> *y, int incy) {
+  return DoBlasInternal(
+      wrap::rocblas_sgemv, stream, true /* = pointer_mode_host */,
+      ROCMBlasTranspose(trans), m, n, &alpha, GpuMemory(a), lda, GpuMemory(x),
+      incx, &beta, GpuMemoryMutable(y), incy);
+}
+
+bool ROCMBlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
+                          uint64 n, double alpha, const DeviceMemory<double> &a,
+                          int lda, const DeviceMemory<double> &x, int incx,
+                          double beta, DeviceMemory<double> *y, int incy) {
+  return DoBlasInternal(
+      wrap::rocblas_dgemv, stream, true /* = pointer_mode_host */,
+      ROCMBlasTranspose(trans), m, n, &alpha, GpuMemory(a), lda, GpuMemory(x),
+      incx, &beta, GpuMemoryMutable(y), incy);
+}
+
+bool ROCMBlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
+                          uint64 n, std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &a, int lda,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          std::complex<float> beta,
+                          DeviceMemory<std::complex<float>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the GEMV operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGemv(Stream *stream, blas::Transpose trans, uint64 m,
+                          uint64 n, std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &a, int lda,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          std::complex<double> beta,
+                          DeviceMemory<std::complex<double>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the GEMV operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGer(Stream *stream, uint64 m, uint64 n, float alpha,
+                         const DeviceMemory<float> &x, int incx,
+                         const DeviceMemory<float> &y, int incy,
+                         DeviceMemory<float> *a, int lda) {
+  return DoBlasInternal(
+      wrap::rocblas_sger, stream, true /* = pointer_mode_host */, m, n, &alpha,
+      GpuMemory(x), incx, GpuMemory(y), incy, GpuMemoryMutable(a), lda);
+}
+
+bool ROCMBlas::DoBlasGer(Stream *stream, uint64 m, uint64 n, double alpha,
+                         const DeviceMemory<double> &x, int incx,
+                         const DeviceMemory<double> &y, int incy,
+                         DeviceMemory<double> *a, int lda) {
+  return DoBlasInternal(
+      wrap::rocblas_dger, stream, true /* = pointer_mode_host */, m, n, &alpha,
+      GpuMemory(x), incx, GpuMemory(y), incy, GpuMemoryMutable(a), lda);
+}
+
+bool ROCMBlas::DoBlasGerc(Stream *stream, uint64 m, uint64 n,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          const DeviceMemory<std::complex<float>> &y, int incy,
+                          DeviceMemory<std::complex<float>> *a, int lda) {
+  LOG(ERROR) << "rocBLAS does not currently support the GER operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGerc(Stream *stream, uint64 m, uint64 n,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          const DeviceMemory<std::complex<double>> &y, int incy,
+                          DeviceMemory<std::complex<double>> *a, int lda) {
+  LOG(ERROR) << "rocBLAS does not currently support the GER operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGeru(Stream *stream, uint64 m, uint64 n,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          const DeviceMemory<std::complex<float>> &y, int incy,
+                          DeviceMemory<std::complex<float>> *a, int lda) {
+  LOG(ERROR) << "rocBLAS does not currently support the GERU operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGeru(Stream *stream, uint64 m, uint64 n,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          const DeviceMemory<std::complex<double>> &y, int incy,
+                          DeviceMemory<std::complex<double>> *a, int lda) {
+  LOG(ERROR) << "rocBLAS does not currently support the GERU operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          uint64 k, std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &a, int lda,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          std::complex<float> beta,
+                          DeviceMemory<std::complex<float>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the HBMV operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          uint64 k, std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &a, int lda,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          std::complex<double> beta,
+                          DeviceMemory<std::complex<double>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the HBMV operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &a, int lda,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          std::complex<float> beta,
+                          DeviceMemory<std::complex<float>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the HEMV operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHemv(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &a, int lda,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          std::complex<double> beta,
+                          DeviceMemory<std::complex<double>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the HEMV operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n,
+                         float alpha,
+                         const DeviceMemory<std::complex<float>> &x, int incx,
+                         DeviceMemory<std::complex<float>> *a, int lda) {
+  LOG(ERROR) << "rocBLAS does not currently support the HER operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHer(Stream *stream, blas::UpperLower uplo, uint64 n,
+                         double alpha,
+                         const DeviceMemory<std::complex<double>> &x, int incx,
+                         DeviceMemory<std::complex<double>> *a, int lda) {
+  LOG(ERROR) << "rocBLAS does not currently support the HER operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          const DeviceMemory<std::complex<float>> &y, int incy,
+                          DeviceMemory<std::complex<float>> *a, int lda) {
+  LOG(ERROR) << "rocBLAS does not currently support the HER2 operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHer2(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          const DeviceMemory<std::complex<double>> &y, int incy,
+                          DeviceMemory<std::complex<double>> *a, int lda) {
+  LOG(ERROR) << "rocBLAS does not currently support the HER2 operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &ap,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          std::complex<float> beta,
+                          DeviceMemory<std::complex<float>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the HPMV operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &ap,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          std::complex<double> beta,
+                          DeviceMemory<std::complex<double>> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the HPMV operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n,
+                         float alpha,
+                         const DeviceMemory<std::complex<float>> &x, int incx,
+                         DeviceMemory<std::complex<float>> *ap) {
+  LOG(ERROR) << "rocBLAS does not currently support the HPR operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHpr(Stream *stream, blas::UpperLower uplo, uint64 n,
+                         double alpha,
+                         const DeviceMemory<std::complex<double>> &x, int incx,
+                         DeviceMemory<std::complex<double>> *ap) {
+  LOG(ERROR) << "rocBLAS does not currently support the HPR operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &x, int incx,
+                          const DeviceMemory<std::complex<float>> &y, int incy,
+                          DeviceMemory<std::complex<float>> *ap) {
+  LOG(ERROR) << "rocBLAS does not currently support the HPR2 operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &x, int incx,
+                          const DeviceMemory<std::complex<double>> &y, int incy,
+                          DeviceMemory<std::complex<double>> *ap) {
+  LOG(ERROR) << "rocBLAS does not currently support the HPR2 operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          uint64 k, float alpha, const DeviceMemory<float> &a,
+                          int lda, const DeviceMemory<float> &x, int incx,
+                          float beta, DeviceMemory<float> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the SBMV operation "
+             << "for the \"complex<float>\" dataype";
+
+  return false;
+}
+
+bool ROCMBlas::DoBlasSbmv(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          uint64 k, double alpha, const DeviceMemory<double> &a,
+                          int lda, const DeviceMemory<double> &x, int incx,
+                          double beta, DeviceMemory<double> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the SBMV operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          float alpha, const DeviceMemory<float> &ap,
+                          const DeviceMemory<float> &x, int incx, float beta,
+                          DeviceMemory<float> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the SPMV operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSpmv(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          double alpha, const DeviceMemory<double> &ap,
+                          const DeviceMemory<double> &x, int incx, double beta,
+                          DeviceMemory<double> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the SPMV operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n,
+                         float alpha, const DeviceMemory<float> &x, int incx,
+                         DeviceMemory<float> *ap) {
+  LOG(ERROR) << "rocBLAS does not currently support the SPR operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSpr(Stream *stream, blas::UpperLower uplo, uint64 n,
+                         double alpha, const DeviceMemory<double> &x, int incx,
+                         DeviceMemory<double> *ap) {
+  LOG(ERROR) << "rocBLAS does not currently support the SPR operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          float alpha, const DeviceMemory<float> &x, int incx,
+                          const DeviceMemory<float> &y, int incy,
+                          DeviceMemory<float> *ap) {
+  LOG(ERROR) << "rocBLAS does not currently support the SPR2 operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSpr2(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          double alpha, const DeviceMemory<double> &x, int incx,
+                          const DeviceMemory<double> &y, int incy,
+                          DeviceMemory<double> *ap) {
+  LOG(ERROR) << "rocBLAS does not currently support the SPR2 operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          float alpha, const DeviceMemory<float> &a, int lda,
+                          const DeviceMemory<float> &x, int incx, float beta,
+                          DeviceMemory<float> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYMV operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSymv(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          double alpha, const DeviceMemory<double> &a, int lda,
+                          const DeviceMemory<double> &x, int incx, double beta,
+                          DeviceMemory<double> *y, int incy) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYMV operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n,
+                         float alpha, const DeviceMemory<float> &x, int incx,
+                         DeviceMemory<float> *a, int lda) {
+  return DoBlasInternal(wrap::rocblas_ssyr, stream,
+                        true /* = pointer_mode_host */,
+                        ROCMBlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
+                        GpuMemoryMutable(a), lda);
+}
+
+bool ROCMBlas::DoBlasSyr(Stream *stream, blas::UpperLower uplo, uint64 n,
+                         double alpha, const DeviceMemory<double> &x, int incx,
+                         DeviceMemory<double> *a, int lda) {
+  return DoBlasInternal(wrap::rocblas_dsyr, stream,
+                        true /* = pointer_mode_host */,
+                        ROCMBlasUpperLower(uplo), n, &alpha, GpuMemory(x), incx,
+                        GpuMemoryMutable(a), lda);
+}
+
+bool ROCMBlas::DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          float alpha, const DeviceMemory<float> &x, int incx,
+                          const DeviceMemory<float> &y, int incy,
+                          DeviceMemory<float> *a, int lda) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYR2 operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSyr2(Stream *stream, blas::UpperLower uplo, uint64 n,
+                          double alpha, const DeviceMemory<double> &x, int incx,
+                          const DeviceMemory<double> &y, int incy,
+                          DeviceMemory<double> *a, int lda) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYR2 operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          uint64 k, const DeviceMemory<float> &a, int lda,
+                          DeviceMemory<float> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TBMV operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          uint64 k, const DeviceMemory<double> &a, int lda,
+                          DeviceMemory<double> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TBMV operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          uint64 k, const DeviceMemory<std::complex<float>> &a,
+                          int lda, DeviceMemory<std::complex<float>> *x,
+                          int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TBMV operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTbmv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          uint64 k, const DeviceMemory<std::complex<double>> &a,
+                          int lda, DeviceMemory<std::complex<double>> *x,
+                          int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TBMV operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          uint64 k, const DeviceMemory<float> &a, int lda,
+                          DeviceMemory<float> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TBSV operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          uint64 k, const DeviceMemory<double> &a, int lda,
+                          DeviceMemory<double> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TBSV operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          uint64 k, const DeviceMemory<std::complex<float>> &a,
+                          int lda, DeviceMemory<std::complex<float>> *x,
+                          int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TBSV operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTbsv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          uint64 k, const DeviceMemory<std::complex<double>> &a,
+                          int lda, DeviceMemory<std::complex<double>> *x,
+                          int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TBSV operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<float> &ap, DeviceMemory<float> *x,
+                          int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TPMV operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<double> &ap,
+                          DeviceMemory<double> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TPMV operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<std::complex<float>> &ap,
+                          DeviceMemory<std::complex<float>> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TPMV operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTpmv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<std::complex<double>> &ap,
+                          DeviceMemory<std::complex<double>> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TPMV operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<float> &ap, DeviceMemory<float> *x,
+                          int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TPSV operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<double> &ap,
+                          DeviceMemory<double> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TPSV operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<std::complex<float>> &ap,
+                          DeviceMemory<std::complex<float>> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TPSV operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTpsv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<std::complex<double>> &ap,
+                          DeviceMemory<std::complex<double>> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TPSV operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<float> &a, int lda,
+                          DeviceMemory<float> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRMV operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<double> &a, int lda,
+                          DeviceMemory<double> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRMV operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<std::complex<float>> &a, int lda,
+                          DeviceMemory<std::complex<float>> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRMV operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrmv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<std::complex<double>> &a, int lda,
+                          DeviceMemory<std::complex<double>> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRMV operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<float> &a, int lda,
+                          DeviceMemory<float> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRSV operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<double> &a, int lda,
+                          DeviceMemory<double> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRSV operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<std::complex<float>> &a, int lda,
+                          DeviceMemory<std::complex<float>> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRSV operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrsv(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, blas::Diagonal diag, uint64 n,
+                          const DeviceMemory<std::complex<double>> &a, int lda,
+                          DeviceMemory<std::complex<double>> *x, int incx) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRSV operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
+                          blas::Transpose transb, uint64 m, uint64 n, uint64 k,
+                          float alpha, const DeviceMemory<Eigen::half> &a,
+                          int lda, const DeviceMemory<Eigen::half> &b, int ldb,
+                          float beta, DeviceMemory<Eigen::half> *c, int ldc) {
+  VLOG(1) << port::Printf(
+      "doing rocBLAS SGEMM: at=%d bt=%d m=%llu n=%llu "
+      "k=%llu alpha=%f a=%p lda=%d b=%p ldb=%d beta=%f "
+      "c=%p ldc=%d",
+      static_cast<int>(transa), static_cast<int>(transb), m, n, k, alpha,
+      a.opaque(), lda, b.opaque(), ldb, beta, c->opaque(), ldc);
+  if (transa == blas::Transpose::kNoTranspose) {
+    if (lda < static_cast<int64>(m)) {
+      LOG(WARNING) << "GEMM lda was smaller than m (no transpose case); "
+                      "precondition violation";
+    }
+  } else {
+    if (lda < static_cast<int64>(k)) {
+      LOG(WARNING) << "GEMM lda (" << lda << ") was smaller than k (" << k
+                   << ") (transpose case); precondition violation";
+    }
+  }
+  if (transb == blas::Transpose::kNoTranspose) {
+    if (ldb < static_cast<int64>(k)) {
+      LOG(WARNING) << "GEMM ldb (" << ldb << ") was smaller than k (" << k
+                   << ") (no transpose case); precondition violation";
+    }
+  } else {
+    if (ldb < static_cast<int64>(n)) {
+      LOG(WARNING) << "GEMM ldb was smaller than n (transpose case); "
+                      "precondition violation";
+    }
+  }
+  const Eigen::half alpha_half(alpha);
+  const Eigen::half beta_half(beta);
+  return DoBlasInternal(
+      wrap::rocblas_hgemm, stream, true /* = pointer_mode_host */,
+      ROCMBlasTranspose(transa), ROCMBlasTranspose(transb), m, n, k,
+      reinterpret_cast<const rocblas_half *>(&alpha_half),
+      reinterpret_cast<const rocblas_half *>(GpuMemory(a)), lda,
+      reinterpret_cast<const rocblas_half *>(GpuMemory(b)), ldb,
+      reinterpret_cast<const rocblas_half *>(&beta_half),
+      reinterpret_cast<rocblas_half *>(GpuMemoryMutable(c)), ldc);
+}
+
+bool ROCMBlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
+                          blas::Transpose transb, uint64 m, uint64 n, uint64 k,
+                          float alpha, const DeviceMemory<float> &a, int lda,
+                          const DeviceMemory<float> &b, int ldb, float beta,
+                          DeviceMemory<float> *c, int ldc) {
+  VLOG(1) << port::Printf(
+      "doing rocBLAS SGEMM: at=%d bt=%d m=%llu n=%llu "
+      "k=%llu alpha=%f a=%p lda=%d b=%p ldb=%d beta=%f "
+      "c=%p ldc=%d",
+      static_cast<int>(transa), static_cast<int>(transb), m, n, k, alpha,
+      a.opaque(), lda, b.opaque(), ldb, beta, c->opaque(), ldc);
+  if (transa == blas::Transpose::kNoTranspose) {
+    if (lda < static_cast<int64>(m)) {
+      LOG(WARNING) << "GEMM lda was smaller than m (no transpose case); "
+                      "precondition violation";
+    }
+  } else {
+    if (lda < static_cast<int64>(k)) {
+      LOG(WARNING) << "GEMM lda (" << lda << ") was smaller than k (" << k
+                   << ") (transpose case); precondition violation";
+    }
+  }
+  if (transb == blas::Transpose::kNoTranspose) {
+    if (ldb < static_cast<int64>(k)) {
+      LOG(WARNING) << "GEMM ldb (" << ldb << ") was smaller than k (" << k
+                   << ") (no transpose case); precondition violation";
+    }
+  } else {
+    if (ldb < static_cast<int64>(n)) {
+      LOG(WARNING) << "GEMM ldb was smaller than n (transpose case); "
+                      "precondition violation";
+    }
+  }
+  return DoBlasInternal(
+      wrap::rocblas_sgemm, stream, true /* = pointer_mode_host */,
+      ROCMBlasTranspose(transa), ROCMBlasTranspose(transb), m, n, k, &alpha,
+      GpuMemory(a), lda, GpuMemory(b), ldb, &beta, GpuMemoryMutable(c), ldc);
+}
+
+bool ROCMBlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
+                          blas::Transpose transb, uint64 m, uint64 n, uint64 k,
+                          double alpha, const DeviceMemory<double> &a, int lda,
+                          const DeviceMemory<double> &b, int ldb, double beta,
+                          DeviceMemory<double> *c, int ldc) {
+  return DoBlasInternal(
+      wrap::rocblas_dgemm, stream, true /* = pointer_mode_host */,
+      ROCMBlasTranspose(transa), ROCMBlasTranspose(transb), m, n, k, &alpha,
+      GpuMemory(a), lda, GpuMemory(b), ldb, &beta, GpuMemoryMutable(c), ldc);
+}
+
+bool ROCMBlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
+                          blas::Transpose transb, uint64 m, uint64 n, uint64 k,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &a, int lda,
+                          const DeviceMemory<std::complex<float>> &b, int ldb,
+                          std::complex<float> beta,
+                          DeviceMemory<std::complex<float>> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the GEMM operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGemm(Stream *stream, blas::Transpose transa,
+                          blas::Transpose transb, uint64 m, uint64 n, uint64 k,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &a, int lda,
+                          const DeviceMemory<std::complex<double>> &b, int ldb,
+                          std::complex<double> beta,
+                          DeviceMemory<std::complex<double>> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the GEMM operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGemvWithProfiling(
+    Stream *stream, blas::Transpose trans, uint64 m, uint64 n, float alpha,
+    const DeviceMemory<float> &a, int lda, const DeviceMemory<float> &x,
+    int incx, float beta, DeviceMemory<float> *y, int incy,
+    blas::ProfileResult *output_profile_result) {
+  return DoBlasGemvWithProfilingImpl(stream, trans, m, n, alpha, a, lda, x,
+                                     incx, beta, y, incy,
+                                     output_profile_result);
+}
+
+bool ROCMBlas::DoBlasGemvWithProfiling(
+    Stream *stream, blas::Transpose trans, uint64 m, uint64 n, double alpha,
+    const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &x,
+    int incx, double beta, DeviceMemory<double> *y, int incy,
+    blas::ProfileResult *output_profile_result) {
+  return DoBlasGemvWithProfilingImpl(stream, trans, m, n, alpha, a, lda, x,
+                                     incx, beta, y, incy,
+                                     output_profile_result);
+}
+
+bool ROCMBlas::DoBlasGemvWithProfiling(
+    Stream *stream, blas::Transpose trans, uint64 m, uint64 n,
+    std::complex<float> alpha, const DeviceMemory<std::complex<float>> &a,
+    int lda, const DeviceMemory<std::complex<float>> &x, int incx,
+    std::complex<float> beta, DeviceMemory<std::complex<float>> *y, int incy,
+    blas::ProfileResult *output_profile_result) {
+  return DoBlasGemvWithProfilingImpl(stream, trans, m, n, alpha, a, lda, x,
+                                     incx, beta, y, incy,
+                                     output_profile_result);
+}
+
+bool ROCMBlas::DoBlasGemvWithProfiling(
+    Stream *stream, blas::Transpose trans, uint64 m, uint64 n,
+    std::complex<double> alpha, const DeviceMemory<std::complex<double>> &a,
+    int lda, const DeviceMemory<std::complex<double>> &x, int incx,
+    std::complex<double> beta, DeviceMemory<std::complex<double>> *y, int incy,
+    blas::ProfileResult *output_profile_result) {
+  return DoBlasGemvWithProfilingImpl(stream, trans, m, n, alpha, a, lda, x,
+                                     incx, beta, y, incy,
+                                     output_profile_result);
+}
+
+bool ROCMBlas::DoBlasGemmWithProfiling(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, float alpha, const DeviceMemory<Eigen::half> &a,
+    int lda, const DeviceMemory<Eigen::half> &b, int ldb, float beta,
+    DeviceMemory<Eigen::half> *c, int ldc,
+    blas::ProfileResult *output_profile_result) {
+  return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a,
+                                     lda, b, ldb, beta, c, ldc,
+                                     output_profile_result);
+}
+
+bool ROCMBlas::DoBlasGemmWithProfiling(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, float alpha, const DeviceMemory<float> &a, int lda,
+    const DeviceMemory<float> &b, int ldb, float beta, DeviceMemory<float> *c,
+    int ldc, blas::ProfileResult *output_profile_result) {
+  return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a,
+                                     lda, b, ldb, beta, c, ldc,
+                                     output_profile_result);
+}
+
+bool ROCMBlas::DoBlasGemmWithProfiling(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, double alpha, const DeviceMemory<double> &a, int lda,
+    const DeviceMemory<double> &b, int ldb, double beta,
+    DeviceMemory<double> *c, int ldc,
+    blas::ProfileResult *output_profile_result) {
+  return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a,
+                                     lda, b, ldb, beta, c, ldc,
+                                     output_profile_result);
+}
+
+bool ROCMBlas::DoBlasGemmWithProfiling(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, std::complex<float> alpha,
+    const DeviceMemory<std::complex<float>> &a, int lda,
+    const DeviceMemory<std::complex<float>> &b, int ldb,
+    std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc,
+    blas::ProfileResult *output_profile_result) {
+  return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a,
+                                     lda, b, ldb, beta, c, ldc,
+                                     output_profile_result);
+}
+
+bool ROCMBlas::DoBlasGemmWithProfiling(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, std::complex<double> alpha,
+    const DeviceMemory<std::complex<double>> &a, int lda,
+    const DeviceMemory<std::complex<double>> &b, int ldb,
+    std::complex<double> beta, DeviceMemory<std::complex<double>> *c, int ldc,
+    blas::ProfileResult *output_profile_result) {
+  return DoBlasGemmWithProfilingImpl(stream, transa, transb, m, n, k, alpha, a,
+                                     lda, b, ldb, beta, c, ldc,
+                                     output_profile_result);
+}
+
+template <typename T>
+bool ROCMBlas::DoBlasGemvWithProfilingImpl(
+    Stream *stream, blas::Transpose trans, uint64 m, uint64 n, const T &alpha,
+    const DeviceMemory<T> &a, int lda, const DeviceMemory<T> &x, int incx,
+    const T &beta, DeviceMemory<T> *y, int incy,
+    blas::ProfileResult *output_profile_result) {
+  // ROCM TODO: properly implement the interface
+  return false;
+}
+
+template <typename T, typename ParamType>
+bool ROCMBlas::DoBlasGemmWithProfilingImpl(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, const ParamType &alpha, const DeviceMemory<T> &a,
+    int lda, const DeviceMemory<T> &b, int ldb, const ParamType &beta,
+    DeviceMemory<T> *c, int ldc, blas::ProfileResult *output_profile_result) {
+  // ROCM TODO: properly implement the interface
+  return false;
+}
+
+template <typename InT, typename OutT, typename CompT>
+bool ROCMBlas::DoBlasGemmWithAlgorithmImpl(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, const CompT &alpha, const DeviceMemory<InT> &a, int lda,
+    const DeviceMemory<InT> &b, int ldb, const CompT &beta,
+    DeviceMemory<OutT> *c, int ldc, blas::ComputationType computation_type,
+    blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
+  // ROCM TODO: properly implement the interface
+  return false;
+}
+
+bool ROCMBlas::GetBlasGemmAlgorithms(
+    std::vector<blas::AlgorithmType> *out_algorithms) {
+  // ROCM TODO: properly implement the interface
+  return true;
+}
+
+bool ROCMBlas::DoBlasGemmWithAlgorithm(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, const HostOrDeviceScalar<int> &alpha,
+    const DeviceMemory<int8> &a, int lda, const DeviceMemory<int8> &b, int ldb,
+    const HostOrDeviceScalar<int> &beta, DeviceMemory<int32> *c, int ldc,
+    blas::ComputationType computation_type, blas::AlgorithmType algorithm,
+    blas::ProfileResult *output_profile_result) {
+  LOG(ERROR)
+      << "rocBLAS does not currently support the GEMMwithAlgorithm operation "
+      << "for the \"int8\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGemmWithAlgorithm(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, const HostOrDeviceScalar<Eigen::half> &alpha,
+    const DeviceMemory<Eigen::half> &a, int lda,
+    const DeviceMemory<Eigen::half> &b, int ldb,
+    const HostOrDeviceScalar<Eigen::half> &beta, DeviceMemory<Eigen::half> *c,
+    int ldc, blas::ComputationType computation_type,
+    blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
+  LOG(ERROR)
+      << "rocBLAS does not currently support the GEMMwithAlgorithm operation "
+      << "for the \"half\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGemmWithAlgorithm(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, const HostOrDeviceScalar<float> &alpha,
+    const DeviceMemory<float> &a, int lda, const DeviceMemory<float> &b,
+    int ldb, const HostOrDeviceScalar<float> &beta, DeviceMemory<float> *c,
+    int ldc, blas::ComputationType computation_type,
+    blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
+  LOG(ERROR)
+      << "rocBLAS does not currently support the GEMMwithAlgorithm operation "
+      << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGemmWithAlgorithm(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, const HostOrDeviceScalar<double> &alpha,
+    const DeviceMemory<double> &a, int lda, const DeviceMemory<double> &b,
+    int ldb, const HostOrDeviceScalar<double> &beta, DeviceMemory<double> *c,
+    int ldc, blas::ComputationType computation_type,
+    blas::AlgorithmType algorithm, blas::ProfileResult *output_profile_result) {
+  LOG(ERROR)
+      << "rocBLAS does not currently support the GEMMwithAlgorithm operation "
+      << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGemmWithAlgorithm(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, const HostOrDeviceScalar<std::complex<float>> &alpha,
+    const DeviceMemory<std::complex<float>> &a, int lda,
+    const DeviceMemory<std::complex<float>> &b, int ldb,
+    const HostOrDeviceScalar<std::complex<float>> &beta,
+    DeviceMemory<std::complex<float>> *c, int ldc,
+    blas::ComputationType computation_type, blas::AlgorithmType algorithm,
+    blas::ProfileResult *output_profile_result) {
+  LOG(ERROR)
+      << "rocBLAS does not currently support the GEMMwithAlgorithm operation "
+      << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGemmWithAlgorithm(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, const HostOrDeviceScalar<std::complex<double>> &alpha,
+    const DeviceMemory<std::complex<double>> &a, int lda,
+    const DeviceMemory<std::complex<double>> &b, int ldb,
+    const HostOrDeviceScalar<std::complex<double>> &beta,
+    DeviceMemory<std::complex<double>> *c, int ldc,
+    blas::ComputationType computation_type, blas::AlgorithmType algorithm,
+    blas::ProfileResult *output_profile_result) {
+  LOG(ERROR)
+      << "rocBLAS does not currently support the GEMMwithAlgorithm operation "
+      << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+template <typename T>
+struct EigenHalfToRocBlasHalf {
+  using type = T;
+};
+
+template <>
+struct EigenHalfToRocBlasHalf<Eigen::half> {
+  using type = rocblas_half;
+};
+
+template <typename T, typename FuncT>
+port::Status ROCMBlas::DoBlasGemmBatchedInternal(
+    FuncT rocblas_func, Stream *stream, blas::Transpose transa,
+    blas::Transpose transb, uint64 m, uint64 n, uint64 k, T alpha,
+    const port::ArraySlice<DeviceMemory<T> *> &a_ptrs_to_wrappers, int lda,
+    const port::ArraySlice<DeviceMemory<T> *> &b_ptrs_to_wrappers, int ldb,
+    T beta, const port::ArraySlice<DeviceMemory<T> *> &c_ptrs_to_wrappers,
+    int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
+  // MAPPED_T will be same as T for all types except Eigen::Half
+  // for T = Eigen::half, MAPPED_T = rocblas_half
+  using MAPPED_T = typename EigenHalfToRocBlasHalf<T>::type;
+
+  // Alocate local vectors to hold device pointers to matrices
+  std::vector<MAPPED_T *> a_raw_ptrs, b_raw_ptrs, c_raw_ptrs;
+  for (int i = 0; i < batch_count; ++i) {
+    // static_cast does work when converting Eigen::half* to rocblas_half*,
+    // hence the use od reinterpret_cast
+    a_raw_ptrs.push_back(
+        reinterpret_cast<MAPPED_T *>(a_ptrs_to_wrappers[i]->opaque()));
+    b_raw_ptrs.push_back(
+        reinterpret_cast<MAPPED_T *>(b_ptrs_to_wrappers[i]->opaque()));
+    c_raw_ptrs.push_back(
+        reinterpret_cast<MAPPED_T *>(c_ptrs_to_wrappers[i]->opaque()));
+  }
+
+  //  batch_count <= 1 is base case, no definable matrix stride, set it same as
+  //  ld*
+  long long bsa = lda;
+  long long bsb = ldb;
+  long long bsc = ldc;
+  bool bsa_is_constant = true;
+  bool bsb_is_constant = true;
+  bool bsc_is_constant = true;
+
+  if (batch_count > 1) {
+    // Remember first stride; if any other stride is different that this one,
+    // KABLAM
+    bsa = a_raw_ptrs[1] - a_raw_ptrs[0];
+    bsb = b_raw_ptrs[1] - b_raw_ptrs[0];
+    bsc = c_raw_ptrs[1] - c_raw_ptrs[0];
+
+    //  Loop to verify that batched strides are constant
+    //  All the test cases from batch_matmul_op_test.py seem to satisfy this
+    //  requirement of a constant stride.  If this can be proven globally, then
+    //  this loop check can be safely removed
+    for (int i = 1; i < batch_count - 1; ++i) {
+      long long iterative_bsa = a_raw_ptrs[i + 1] - a_raw_ptrs[i];
+      if (iterative_bsa != bsa) {
+        bsa_is_constant = false;
+        break;
+      }
+
+      long long iterative_bsb = b_raw_ptrs[i + 1] - b_raw_ptrs[i];
+      if (iterative_bsb != bsb) {
+        bsb_is_constant = false;
+        break;
+      }
+
+      long long iterative_bsc = c_raw_ptrs[i + 1] - c_raw_ptrs[i];
+      if (iterative_bsc != bsc) {
+        bsc_is_constant = false;
+        break;
+      }
+    }
+  }
+
+  assert(!(ldc < m || bsc < ldc * n));
+
+  if (ROCMBlasTranspose(transa) == rocblas_operation_none)
+    assert(!(lda < m || bsa < lda * k));
+  else
+    assert(!(lda < k || bsa < lda * m));
+
+  if (ROCMBlasTranspose(transb) == rocblas_operation_none)
+    assert(!(ldb < k || bsb < ldb * n));
+  else
+    assert(!(ldb < n || bsc < ldc * k));
+
+  MAPPED_T *alpha_ptr = reinterpret_cast<MAPPED_T *>(&alpha);
+  MAPPED_T *beta_ptr = reinterpret_cast<MAPPED_T *>(&beta);
+
+  if (bsa_is_constant && bsb_is_constant && bsc_is_constant) {
+    bool ok = DoBlasInternal(
+        rocblas_func, stream, true /* = pointer_mode_host */,
+        ROCMBlasTranspose(transa), ROCMBlasTranspose(transb), m, n, k,
+        GpuComplex(alpha_ptr), a_raw_ptrs[0], lda, bsa, b_raw_ptrs[0], ldb, bsb,
+        GpuComplex(beta_ptr), c_raw_ptrs[0], ldc, bsc, batch_count);
+
+    if (ok) {
+      return port::Status::OK();
+    }
+  }
+
+  return port::Status(port::error::INTERNAL,
+                      "failed BLAS call, see log for details");
+}
+
+bool ROCMBlas::DoBlasGemmBatched(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, float alpha,
+    const port::ArraySlice<DeviceMemory<Eigen::half> *> &a, int lda,
+    const port::ArraySlice<DeviceMemory<Eigen::half> *> &b, int ldb, float beta,
+    const port::ArraySlice<DeviceMemory<Eigen::half> *> &c, int ldc,
+    int batch_count, ScratchAllocator *scratch_allocator) {
+  const Eigen::half alpha_half(alpha);
+  const Eigen::half beta_half(beta);
+
+  port::Status status = DoBlasGemmBatchedInternal(
+      wrap::rocblas_hgemm_strided_batched, stream, transa, transb, m, n, k,
+      alpha_half, a, lda, b, ldb, beta_half, c, ldc, batch_count,
+      scratch_allocator);
+  if (!status.ok()) {
+    LOG(ERROR) << status;
+  }
+
+  return status.ok();
+}
+
+bool ROCMBlas::DoBlasGemmBatched(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, float alpha,
+    const port::ArraySlice<DeviceMemory<float> *> &a_array, int lda,
+    const port::ArraySlice<DeviceMemory<float> *> &b_array, int ldb, float beta,
+    const port::ArraySlice<DeviceMemory<float> *> &c_array, int ldc,
+    int batch_count, ScratchAllocator *scratch_allocator) {
+  port::Status status = DoBlasGemmBatchedInternal(
+      wrap::rocblas_sgemm_strided_batched, stream, transa, transb, m, n, k,
+      alpha, a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count,
+      scratch_allocator);
+  if (!status.ok()) {
+    LOG(ERROR) << status;
+  }
+  return status.ok();
+}
+
+bool ROCMBlas::DoBlasGemmBatched(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, double alpha,
+    const port::ArraySlice<DeviceMemory<double> *> &a_array, int lda,
+    const port::ArraySlice<DeviceMemory<double> *> &b_array, int ldb,
+    double beta, const port::ArraySlice<DeviceMemory<double> *> &c_array,
+    int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
+  port::Status status = DoBlasGemmBatchedInternal(
+      wrap::rocblas_dgemm_strided_batched, stream, transa, transb, m, n, k,
+      alpha, a_array, lda, b_array, ldb, beta, c_array, ldc, batch_count,
+      scratch_allocator);
+  if (!status.ok()) {
+    LOG(ERROR) << status;
+  }
+  return status.ok();
+}
+
+bool ROCMBlas::DoBlasGemmBatched(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, std::complex<float> alpha,
+    const port::ArraySlice<DeviceMemory<std::complex<float>> *> &a_array,
+    int lda,
+    const port::ArraySlice<DeviceMemory<std::complex<float>> *> &b_array,
+    int ldb, std::complex<float> beta,
+    const port::ArraySlice<DeviceMemory<std::complex<float>> *> &c_array,
+    int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
+  LOG(ERROR) << "rocBLAS does not currently support the GEMMBatched operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasGemmBatched(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, std::complex<double> alpha,
+    const port::ArraySlice<DeviceMemory<std::complex<double>> *> &a_array,
+    int lda,
+    const port::ArraySlice<DeviceMemory<std::complex<double>> *> &b_array,
+    int ldb, std::complex<double> beta,
+    const port::ArraySlice<DeviceMemory<std::complex<double>> *> &c_array,
+    int ldc, int batch_count, ScratchAllocator *scratch_allocator) {
+  LOG(ERROR) << "rocBLAS does not currently support the GEMMBatched operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHemm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, uint64 m, uint64 n,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &a, int lda,
+                          const DeviceMemory<std::complex<float>> &b, int ldb,
+                          std::complex<float> beta,
+                          DeviceMemory<std::complex<float>> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the HEMM operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHemm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, uint64 m, uint64 n,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &a, int lda,
+                          const DeviceMemory<std::complex<double>> &b, int ldb,
+                          std::complex<double> beta,
+                          DeviceMemory<std::complex<double>> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the HEMM operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHerk(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, uint64 n, uint64 k,
+                          float alpha,
+                          const DeviceMemory<std::complex<float>> &a, int lda,
+                          float beta, DeviceMemory<std::complex<float>> *c,
+                          int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the HERK operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHerk(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, uint64 n, uint64 k,
+                          double alpha,
+                          const DeviceMemory<std::complex<double>> &a, int lda,
+                          double beta, DeviceMemory<std::complex<double>> *c,
+                          int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the HERK operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHer2k(Stream *stream, blas::UpperLower uplo,
+                           blas::Transpose trans, uint64 n, uint64 k,
+                           std::complex<float> alpha,
+                           const DeviceMemory<std::complex<float>> &a, int lda,
+                           const DeviceMemory<std::complex<float>> &b, int ldb,
+                           float beta, DeviceMemory<std::complex<float>> *c,
+                           int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the HER2K operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasHer2k(Stream *stream, blas::UpperLower uplo,
+                           blas::Transpose trans, uint64 n, uint64 k,
+                           std::complex<double> alpha,
+                           const DeviceMemory<std::complex<double>> &a, int lda,
+                           const DeviceMemory<std::complex<double>> &b, int ldb,
+                           double beta, DeviceMemory<std::complex<double>> *c,
+                           int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the HER2K operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSymm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, uint64 m, uint64 n,
+                          float alpha, const DeviceMemory<float> &a, int lda,
+                          const DeviceMemory<float> &b, int ldb, float beta,
+                          DeviceMemory<float> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYMM operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSymm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, uint64 m, uint64 n,
+                          double alpha, const DeviceMemory<double> &a, int lda,
+                          const DeviceMemory<double> &b, int ldb, double beta,
+                          DeviceMemory<double> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYMM operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSymm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, uint64 m, uint64 n,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &a, int lda,
+                          const DeviceMemory<std::complex<float>> &b, int ldb,
+                          std::complex<float> beta,
+                          DeviceMemory<std::complex<float>> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYMM operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSymm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, uint64 m, uint64 n,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &a, int lda,
+                          const DeviceMemory<std::complex<double>> &b, int ldb,
+                          std::complex<double> beta,
+                          DeviceMemory<std::complex<double>> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYMM operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, uint64 n, uint64 k,
+                          float alpha, const DeviceMemory<float> &a, int lda,
+                          float beta, DeviceMemory<float> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYRK operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, uint64 n, uint64 k,
+                          double alpha, const DeviceMemory<double> &a, int lda,
+                          double beta, DeviceMemory<double> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYRK operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, uint64 n, uint64 k,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &a, int lda,
+                          std::complex<float> beta,
+                          DeviceMemory<std::complex<float>> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYRK operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSyrk(Stream *stream, blas::UpperLower uplo,
+                          blas::Transpose trans, uint64 n, uint64 k,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &a, int lda,
+                          std::complex<double> beta,
+                          DeviceMemory<std::complex<double>> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYRK operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
+                           blas::Transpose trans, uint64 n, uint64 k,
+                           float alpha, const DeviceMemory<float> &a, int lda,
+                           const DeviceMemory<float> &b, int ldb, float beta,
+                           DeviceMemory<float> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYR2K operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
+                           blas::Transpose trans, uint64 n, uint64 k,
+                           double alpha, const DeviceMemory<double> &a, int lda,
+                           const DeviceMemory<double> &b, int ldb, double beta,
+                           DeviceMemory<double> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYR2K operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
+                           blas::Transpose trans, uint64 n, uint64 k,
+                           std::complex<float> alpha,
+                           const DeviceMemory<std::complex<float>> &a, int lda,
+                           const DeviceMemory<std::complex<float>> &b, int ldb,
+                           std::complex<float> beta,
+                           DeviceMemory<std::complex<float>> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYR2K operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasSyr2k(Stream *stream, blas::UpperLower uplo,
+                           blas::Transpose trans, uint64 n, uint64 k,
+                           std::complex<double> alpha,
+                           const DeviceMemory<std::complex<double>> &a, int lda,
+                           const DeviceMemory<std::complex<double>> &b, int ldb,
+                           std::complex<double> beta,
+                           DeviceMemory<std::complex<double>> *c, int ldc) {
+  LOG(ERROR) << "rocBLAS does not currently support the SYR2K operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrmm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, blas::Transpose transa,
+                          blas::Diagonal diag, uint64 m, uint64 n, float alpha,
+                          const DeviceMemory<float> &a, int lda,
+                          DeviceMemory<float> *b, int ldb) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRMM operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrmm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, blas::Transpose transa,
+                          blas::Diagonal diag, uint64 m, uint64 n, double alpha,
+                          const DeviceMemory<double> &a, int lda,
+                          DeviceMemory<double> *b, int ldb) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRMM operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrmm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, blas::Transpose transa,
+                          blas::Diagonal diag, uint64 m, uint64 n,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &a, int lda,
+                          DeviceMemory<std::complex<float>> *b, int ldb) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRMM operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrmm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, blas::Transpose transa,
+                          blas::Diagonal diag, uint64 m, uint64 n,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &a, int lda,
+                          DeviceMemory<std::complex<double>> *b, int ldb) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRMM operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrsm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, blas::Transpose transa,
+                          blas::Diagonal diag, uint64 m, uint64 n, float alpha,
+                          const DeviceMemory<float> &a, int lda,
+                          DeviceMemory<float> *b, int ldb) {
+  return DoBlasInternal(
+      wrap::rocblas_strsm, stream, true /* = pointer_mode_host */,
+      ROCMBlasSide(side), ROCMBlasUpperLower(uplo), ROCMBlasTranspose(transa),
+      ROCMBlasDiagonal(diag), m, n, &alpha, const_cast<float *>(GpuMemory(a)),
+      lda, GpuMemoryMutable(b), ldb);
+}
+
+bool ROCMBlas::DoBlasTrsm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, blas::Transpose transa,
+                          blas::Diagonal diag, uint64 m, uint64 n, double alpha,
+                          const DeviceMemory<double> &a, int lda,
+                          DeviceMemory<double> *b, int ldb) {
+  return DoBlasInternal(
+      wrap::rocblas_dtrsm, stream, true /* = pointer_mode_host */,
+      ROCMBlasSide(side), ROCMBlasUpperLower(uplo), ROCMBlasTranspose(transa),
+      ROCMBlasDiagonal(diag), m, n, &alpha, const_cast<double *>(GpuMemory(a)),
+      lda, GpuMemoryMutable(b), ldb);
+}
+
+bool ROCMBlas::DoBlasTrsm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, blas::Transpose transa,
+                          blas::Diagonal diag, uint64 m, uint64 n,
+                          std::complex<float> alpha,
+                          const DeviceMemory<std::complex<float>> &a, int lda,
+                          DeviceMemory<std::complex<float>> *b, int ldb) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRSM operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+
+bool ROCMBlas::DoBlasTrsm(Stream *stream, blas::Side side,
+                          blas::UpperLower uplo, blas::Transpose transa,
+                          blas::Diagonal diag, uint64 m, uint64 n,
+                          std::complex<double> alpha,
+                          const DeviceMemory<std::complex<double>> &a, int lda,
+                          DeviceMemory<std::complex<double>> *b, int ldb) {
+  LOG(ERROR) << "rocBLAS does not currently support the TRSM operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+bool ROCMBlas::DoBlasGemmStridedBatched(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, float alpha, const DeviceMemory<Eigen::half> &a,
+    int lda, int64 stride_a, const DeviceMemory<Eigen::half> &b, int ldb,
+    int64 stride_b, float beta, DeviceMemory<Eigen::half> *c, int ldc,
+    int64 stride_c, int batch_count) {
+  LOG(ERROR) << "rocBLAS does not currently support the "
+                "DoBlasGemmStridedBatched operation "
+             << "for the \"Eigen::half\" dataype";
+  return false;
+}
+bool ROCMBlas::DoBlasGemmStridedBatched(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, float alpha, const DeviceMemory<float> &a, int lda,
+    int64 stride_a, const DeviceMemory<float> &b, int ldb, int64 stride_b,
+    float beta, DeviceMemory<float> *c, int ldc, int64 stride_c,
+    int batch_count) {
+  LOG(ERROR) << "rocBLAS does not currently support the "
+                "DoBlasGemmStridedBatched operation "
+             << "for the \"float\" dataype";
+  return false;
+}
+bool ROCMBlas::DoBlasGemmStridedBatched(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, double alpha, const DeviceMemory<double> &a, int lda,
+    int64 stride_a, const DeviceMemory<double> &b, int ldb, int64 stride_b,
+    double beta, DeviceMemory<double> *c, int ldc, int64 stride_c,
+    int batch_count) {
+  LOG(ERROR) << "rocBLAS does not currently support the "
+                "DoBlasGemmStridedBatched operation "
+             << "for the \"double\" dataype";
+  return false;
+}
+bool ROCMBlas::DoBlasGemmStridedBatched(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, std::complex<float> alpha,
+    const DeviceMemory<std::complex<float>> &a, int lda, int64 stride_a,
+    const DeviceMemory<std::complex<float>> &b, int ldb, int64 stride_b,
+    std::complex<float> beta, DeviceMemory<std::complex<float>> *c, int ldc,
+    int64 stride_c, int batch_count) {
+  LOG(ERROR) << "rocBLAS does not currently support the "
+                "DoBlasGemmStridedBatched operation "
+             << "for the \"complex<float>\" dataype";
+  return false;
+}
+bool ROCMBlas::DoBlasGemmStridedBatched(
+    Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+    uint64 n, uint64 k, std::complex<double> alpha,
+    const DeviceMemory<std::complex<double>> &a, int lda, int64 stride_a,
+    const DeviceMemory<std::complex<double>> &b, int ldb, int64 stride_b,
+    std::complex<double> beta, DeviceMemory<std::complex<double>> *c, int ldc,
+    int64 stride_c, int batch_count) {
+  LOG(ERROR) << "rocBLAS does not currently support the "
+                "DoBlasGemmStridedBatched operation "
+             << "for the \"complex<double>\" dataype";
+  return false;
+}
+}  // namespace gpu
+
+void initialize_rocblas() {
+  auto rocBlasAlreadyRegistered = PluginRegistry::Instance()->HasFactory(
+      rocm::kROCmPlatformId, PluginKind::kBlas, gpu::kRocBlasPlugin);
+
+  if (!rocBlasAlreadyRegistered) {
+    port::Status status =
+        PluginRegistry::Instance()
+            ->RegisterFactory<PluginRegistry::BlasFactory>(
+                rocm::kROCmPlatformId, gpu::kRocBlasPlugin, "rocBLAS",
+                [](internal::StreamExecutorInterface *parent)
+                    -> blas::BlasSupport * {
+                  gpu::GpuExecutor *rocm_executor =
+                      dynamic_cast<gpu::GpuExecutor *>(parent);
+                  if (rocm_executor == nullptr) {
+                    LOG(ERROR)
+                        << "Attempting to initialize an instance of the "
+                           "rocBLAS "
+                        << "support library with a non-ROCM StreamExecutor";
+                    return nullptr;
+                  }
+
+                  gpu::ROCMBlas *blas = new gpu::ROCMBlas(rocm_executor);
+                  if (!blas->Init()) {
+                    // Note: Init() will log a more specific error.
+                    delete blas;
+                    return nullptr;
+                  }
+                  return blas;
+                });
+
+    if (!status.ok()) {
+      LOG(ERROR) << "Unable to register rocBLAS factory: "
+                 << status.error_message();
+    }
+
+    PluginRegistry::Instance()->SetDefaultFactory(
+        rocm::kROCmPlatformId, PluginKind::kBlas, gpu::kRocBlasPlugin);
+  }
+}
+
+}  // namespace stream_executor
+
+REGISTER_MODULE_INITIALIZER(register_rocblas,
+                            { stream_executor::initialize_rocblas(); });
diff --git a/tensorflow/stream_executor/rocm/rocm_blas.h b/tensorflow/stream_executor/rocm/rocm_blas.h
new file mode 100644
index 0000000000000000000000000000000000000000..75c68481cee5f6123c80e9751c06392f1835a5ce
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_blas.h
@@ -0,0 +1,159 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// ROCM-specific support for BLAS functionality -- this wraps the rocBLAS
+// library capabilities, and is only included into ROCM implementation code --
+// it will not introduce rocm headers into other code.
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_BLAS_H_
+#define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_BLAS_H_
+
+#include "tensorflow/stream_executor/blas.h"
+#include "tensorflow/stream_executor/platform/mutex.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/platform/thread_annotations.h"
+#include "tensorflow/stream_executor/plugin_registry.h"
+
+namespace stream_executor {
+
+class Stream;
+
+namespace gpu {
+
+// Opaque and unique identifier for the rocBLAS plugin.
+extern const PluginId kRocBlasPlugin;
+
+class GpuExecutor;
+
+// BLAS plugin for ROCM platform via rocBLAS library.
+//
+// This satisfies the platform-agnostic BlasSupport interface.
+//
+// Note that the rocBLAS handle that this encapsulates is implicitly tied to the
+// context (and, as a result, the device) that the parent GpuExecutor is tied
+// to. This simply happens as an artifact of creating the rocBLAS handle when a
+// ROCM context is active.
+//
+// Thread-safe post-initialization.
+class ROCMBlas : public blas::BlasSupport {
+ public:
+  explicit ROCMBlas(GpuExecutor *parent);
+
+  // Allocates a rocBLAS handle.
+  bool Init();
+
+  // Releases the rocBLAS handle, if present.
+  ~ROCMBlas() override;
+
+  TENSORFLOW_STREAM_EXECUTOR_GPU_BLAS_SUPPORT_OVERRIDES
+
+ private:
+  // Tells rocBLAS to enqueue the BLAS operation onto a particular Stream.
+  //
+  // rocBLAS is stateful, and only be associated with one stream (in order to
+  // enqueue dispatch) at a given time. As a result, this generally must be
+  // invoked before calling into rocBLAS.
+  bool SetStream(Stream *stream) EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+  // A helper function that calls the real rocBLAS function together with error
+  // handling.
+  //
+  // rocblas_func:       rocBLAS function pointer.
+  // rocblas_name:       rocBLAS function name.
+  // stream:             Stream to enqueue the BLAS operation onto.
+  // pointer_mode_host:  Indicate if the pointer to a scalar value is from host
+  //                     (true) or device (false).
+  // err_on_failure:     Whether to print an error if the rocBLAS function
+  // fails. args:               Arguments of rocBLAS function.
+  template <typename FuncT, typename... Args>
+  bool DoBlasInternalImpl(FuncT rocblas_func, Stream *stream,
+                          bool pointer_mode_host, bool err_on_failure,
+                          Args... args);
+
+  // Convenience functions that call DoBlasInternalImpl with different values
+  // for err_on_failure.
+  template <typename FuncT, typename... Args>
+  bool DoBlasInternal(FuncT rocblas_func, Stream *stream,
+                      bool pointer_mode_host, Args... args) {
+    return DoBlasInternalImpl(rocblas_func, stream, pointer_mode_host,
+                              /*err_on_failure=*/true, args...);
+  }
+  template <typename FuncT, typename... Args>
+  bool DoBlasInternalFailureOK(FuncT rocblas_func, Stream *stream,
+                               bool pointer_mode_host, Args... args) {
+    return DoBlasInternalImpl(rocblas_func, stream, pointer_mode_host,
+                              /*err_on_failure=*/false, args...);
+  }
+
+  // A helper function to implement DoBlasGemmBatched interfaces for generic
+  // types.
+  template <typename T, typename FuncT>
+  port::Status DoBlasGemmBatchedInternal(
+      FuncT rocblas_func, Stream *stream, blas::Transpose transa,
+      blas::Transpose transb, uint64 m, uint64 n, uint64 k, T alpha,
+      const port::ArraySlice<DeviceMemory<T> *> &a_array, int lda,
+      const port::ArraySlice<DeviceMemory<T> *> &b_array, int ldb, T beta,
+      const port::ArraySlice<DeviceMemory<T> *> &c_array, int ldc,
+      int batch_count, ScratchAllocator *scratch_allocator);
+
+  // Helper function for implementing DoBlasGemmWithAlgorithm.
+  //
+  // We take alpha and beta by const reference because T might be Eigen::half,
+  // and we want to avoid pulling in a dependency on Eigen.  When we pass the
+  // references to rocBLAS, we essentially reinterpret_cast to __half, which is
+  // safe because Eigen::half inherits from __half.
+  template <typename InT, typename OutT, typename CompT>
+  bool DoBlasGemmWithAlgorithmImpl(
+      Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+      uint64 n, uint64 k, const CompT &alpha, const DeviceMemory<InT> &a,
+      int lda, const DeviceMemory<InT> &b, int ldb, const CompT &beta,
+      DeviceMemory<OutT> *c, int ldc, blas::ComputationType computation_type,
+      blas::AlgorithmType algorithm,
+      blas::ProfileResult *output_profile_result);
+
+  // Helper function for implementing DoBlasGemmWithProfiling.
+  template <typename T, typename ParamType>
+  bool DoBlasGemmWithProfilingImpl(
+      Stream *stream, blas::Transpose transa, blas::Transpose transb, uint64 m,
+      uint64 n, uint64 k, const ParamType &alpha, const DeviceMemory<T> &a,
+      int lda, const DeviceMemory<T> &b, int ldb, const ParamType &beta,
+      DeviceMemory<T> *c, int ldc, blas::ProfileResult *output_profile_result);
+
+  // Helper function for implementing DoBlasGemvWithProfiling.
+  template <typename T>
+  bool DoBlasGemvWithProfilingImpl(Stream *stream, blas::Transpose trans,
+                                   uint64 m, uint64 n, const T &alpha,
+                                   const DeviceMemory<T> &a, int lda,
+                                   const DeviceMemory<T> &x, int incx,
+                                   const T &beta, DeviceMemory<T> *y, int incy,
+                                   blas::ProfileResult *output_profile_result);
+
+  // mutex that guards the rocBLAS handle for this device.
+  mutex mu_;
+
+  // GpuExecutor which instantiated this ROCMBlas.
+  // Immutable post-initialization.
+  GpuExecutor *parent_;
+
+  // rocBLAS library handle on the device.
+  rocblas_handle blas_ GUARDED_BY(mu_);
+
+  SE_DISALLOW_COPY_AND_ASSIGN(ROCMBlas);
+};
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_BLAS_H_
diff --git a/tensorflow/stream_executor/rocm/rocm_diagnostics.cc b/tensorflow/stream_executor/rocm/rocm_diagnostics.cc
new file mode 100644
index 0000000000000000000000000000000000000000..812974a9debb88e7db924680089acc5dc2ccc5a2
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_diagnostics.cc
@@ -0,0 +1,240 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <dirent.h>
+
+#include <limits.h>
+#include <link.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/sysmacros.h>
+#include <unistd.h>
+#include <algorithm>
+#include <memory>
+#include <vector>
+
+#include "absl/container/inlined_vector.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
+#include "tensorflow/stream_executor/lib/error.h"
+#include "tensorflow/stream_executor/lib/numbers.h"
+#include "tensorflow/stream_executor/lib/process_state.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/lib/str_util.h"
+#include "tensorflow/stream_executor/lib/stringprintf.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/rocm/rocm_diagnostics.h"
+
+namespace stream_executor {
+namespace rocm {
+
+string DriverVersionToString(DriverVersion version) {
+  return absl::StrFormat("%d.%d.%d", std::get<0>(version), std::get<1>(version),
+                         std::get<2>(version));
+}
+
+string DriverVersionStatusToString(port::StatusOr<DriverVersion> version) {
+  if (!version.ok()) {
+    return version.status().ToString();
+  }
+
+  return DriverVersionToString(version.ValueOrDie());
+}
+
+port::StatusOr<DriverVersion> StringToDriverVersion(const string& value) {
+  std::vector<string> pieces = port::Split(value, '.');
+  if (pieces.size() != 2 && pieces.size() != 3) {
+    return port::Status{port::error::INVALID_ARGUMENT,
+                        absl::StrFormat("expected %%d.%%d or %%d.%%d.%%d form "
+                                        "for driver version; got \"%s\"",
+                                        value.c_str())};
+  }
+
+  int major;
+  int minor;
+  int patch = 0;
+  if (!port::safe_strto32(pieces[0], &major)) {
+    return port::Status{
+        port::error::INVALID_ARGUMENT,
+        absl::StrFormat("could not parse major version number \"%s\" as an "
+                        "integer from string \"%s\"",
+                        pieces[0].c_str(), value.c_str())};
+  }
+  if (!port::safe_strto32(pieces[1], &minor)) {
+    return port::Status{
+        port::error::INVALID_ARGUMENT,
+        absl::StrFormat("could not parse minor version number \"%s\" as an "
+                        "integer from string \"%s\"",
+                        pieces[1].c_str(), value.c_str())};
+  }
+  if (pieces.size() == 3 && !port::safe_strto32(pieces[2], &patch)) {
+    return port::Status{
+        port::error::INVALID_ARGUMENT,
+        absl::StrFormat("could not parse patch version number \"%s\" as an "
+                        "integer from string \"%s\"",
+                        pieces[2].c_str(), value.c_str())};
+  }
+
+  DriverVersion result{major, minor, patch};
+  VLOG(2) << "version string \"" << value << "\" made value "
+          << DriverVersionToString(result);
+  return result;
+}
+
+}  // namespace rocm
+}  // namespace stream_executor
+
+namespace stream_executor {
+namespace gpu {
+
+// -- class Diagnostician
+
+string Diagnostician::GetDevNodePath(int dev_node_ordinal) {
+  return absl::StrCat("/dev/kfd", dev_node_ordinal);
+}
+
+void Diagnostician::LogDiagnosticInformation() {
+  LOG(INFO) << "retrieving ROCM diagnostic information for host: "
+            << port::Hostname();
+
+  LogDriverVersionInformation();
+}
+
+/* static */ void Diagnostician::LogDriverVersionInformation() {
+  LOG(INFO) << "hostname: " << port::Hostname();
+  if (VLOG_IS_ON(1)) {
+    const char* value = getenv("LD_LIBRARY_PATH");
+    string library_path = value == nullptr ? "" : value;
+    VLOG(1) << "LD_LIBRARY_PATH is: \"" << library_path << "\"";
+
+    std::vector<string> pieces = port::Split(library_path, ':');
+    for (const auto& piece : pieces) {
+      if (piece.empty()) {
+        continue;
+      }
+      DIR* dir = opendir(piece.c_str());
+      if (dir == nullptr) {
+        VLOG(1) << "could not open \"" << piece << "\"";
+        continue;
+      }
+      while (dirent* entity = readdir(dir)) {
+        VLOG(1) << piece << " :: " << entity->d_name;
+      }
+      closedir(dir);
+    }
+  }
+  port::StatusOr<DriverVersion> dso_version = FindDsoVersion();
+  LOG(INFO) << "librocm reported version is: "
+            << rocm::DriverVersionStatusToString(dso_version);
+
+  port::StatusOr<DriverVersion> kernel_version = FindKernelDriverVersion();
+  LOG(INFO) << "kernel reported version is: "
+            << rocm::DriverVersionStatusToString(kernel_version);
+
+  if (kernel_version.ok() && dso_version.ok()) {
+    WarnOnDsoKernelMismatch(dso_version, kernel_version);
+  }
+}
+
+// Iterates through loaded DSOs with DlIteratePhdrCallback to find the
+// driver-interfacing DSO version number. Returns it as a string.
+port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() {
+  port::StatusOr<DriverVersion> result{port::Status{
+      port::error::NOT_FOUND,
+      "was unable to find librocm.so DSO loaded into this program"}};
+
+  // Callback used when iterating through DSOs. Looks for the driver-interfacing
+  // DSO and yields its version number into the callback data, when found.
+  auto iterate_phdr = [](struct dl_phdr_info* info, size_t size,
+                         void* data) -> int {
+    if (strstr(info->dlpi_name, "librocm.so.1")) {
+      VLOG(1) << "found DLL info with name: " << info->dlpi_name;
+      char resolved_path[PATH_MAX] = {0};
+      if (realpath(info->dlpi_name, resolved_path) == nullptr) {
+        return 0;
+      }
+      VLOG(1) << "found DLL info with resolved path: " << resolved_path;
+      const char* slash = rindex(resolved_path, '/');
+      if (slash == nullptr) {
+        return 0;
+      }
+      const char* so_suffix = ".so.";
+      const char* dot = strstr(slash, so_suffix);
+      if (dot == nullptr) {
+        return 0;
+      }
+      string dso_version = dot + strlen(so_suffix);
+      // TODO(b/22689637): Eliminate the explicit namespace if possible.
+      auto stripped_dso_version = port::StripSuffixString(dso_version, ".ld64");
+      auto result = static_cast<port::StatusOr<DriverVersion>*>(data);
+      *result = rocm::StringToDriverVersion(stripped_dso_version);
+      return 1;
+    }
+    return 0;
+  };
+
+  dl_iterate_phdr(iterate_phdr, &result);
+
+  return result;
+}
+
+port::StatusOr<DriverVersion> Diagnostician::FindKernelModuleVersion(
+    const string& driver_version_file_contents) {
+  static const char* kDriverFilePrelude = "Kernel Module  ";
+  size_t offset = driver_version_file_contents.find(kDriverFilePrelude);
+  if (offset == string::npos) {
+    return port::Status{
+        port::error::NOT_FOUND,
+        absl::StrCat("could not find kernel module information in "
+                     "driver version file contents: \"",
+                     driver_version_file_contents, "\"")};
+  }
+
+  string version_and_rest = driver_version_file_contents.substr(
+      offset + strlen(kDriverFilePrelude), string::npos);
+  size_t space_index = version_and_rest.find(" ");
+  auto kernel_version = version_and_rest.substr(0, space_index);
+  // TODO(b/22689637): Eliminate the explicit namespace if possible.
+  auto stripped_kernel_version =
+      port::StripSuffixString(kernel_version, ".ld64");
+  return rocm::StringToDriverVersion(stripped_kernel_version);
+}
+
+void Diagnostician::WarnOnDsoKernelMismatch(
+    port::StatusOr<DriverVersion> dso_version,
+    port::StatusOr<DriverVersion> kernel_version) {
+  if (kernel_version.ok() && dso_version.ok() &&
+      dso_version.ValueOrDie() == kernel_version.ValueOrDie()) {
+    LOG(INFO) << "kernel version seems to match DSO: "
+              << rocm::DriverVersionToString(kernel_version.ValueOrDie());
+  } else {
+    LOG(ERROR) << "kernel version "
+               << rocm::DriverVersionStatusToString(kernel_version)
+               << " does not match DSO version "
+               << rocm::DriverVersionStatusToString(dso_version)
+               << " -- cannot find working devices in this configuration";
+  }
+}
+
+port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() {
+  auto status = port::Status{port::error::UNIMPLEMENTED,
+                             "kernel reported driver version not implemented"};
+  return status;
+}
+
+}  // namespace gpu
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/rocm/rocm_diagnostics.h b/tensorflow/stream_executor/rocm/rocm_diagnostics.h
new file mode 100644
index 0000000000000000000000000000000000000000..233c6bdade68e19e02a30c92e92d5961d9ca260b
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_diagnostics.h
@@ -0,0 +1,41 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DIAGNOSTICS_H_
+#define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DIAGNOSTICS_H_
+
+#include "tensorflow/stream_executor/gpu/gpu_diagnostics.h"
+
+namespace stream_executor {
+namespace rocm {
+
+// e.g. DriverVersion{346, 3, 4}
+using DriverVersion = gpu::DriverVersion;
+
+// Converts a parsed driver version to string form.
+string DriverVersionToString(DriverVersion version);
+
+// Converts a parsed driver version or status value to natural string form.
+string DriverVersionStatusToString(port::StatusOr<DriverVersion> version);
+
+// Converts a string of a form like "331.79" to a DriverVersion{331, 79}.
+port::StatusOr<DriverVersion> StringToDriverVersion(const string& value);
+
+using Diagnostician = gpu::Diagnostician;
+
+}  // namespace rocm
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DIAGNOSTICS_H_
diff --git a/tensorflow/stream_executor/rocm/rocm_driver.cc b/tensorflow/stream_executor/rocm/rocm_driver.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1b0b91426aae0ed7cca445143186a9b5daeced6a
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_driver.cc
@@ -0,0 +1,1391 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <map>
+#include <set>
+#include <utility>
+
+#include "absl/base/casts.h"
+#include "absl/container/inlined_vector.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
+#include "tensorflow/stream_executor/gpu/gpu_diagnostics.h"
+#include "tensorflow/stream_executor/gpu/gpu_driver.h"
+#include "tensorflow/stream_executor/lib/env.h"
+#include "tensorflow/stream_executor/lib/error.h"
+#include "tensorflow/stream_executor/lib/human_readable.h"
+#include "tensorflow/stream_executor/lib/notification.h"
+#include "tensorflow/stream_executor/lib/stacktrace.h"
+#include "tensorflow/stream_executor/lib/static_threadlocal.h"
+#include "tensorflow/stream_executor/lib/stringprintf.h"
+#include "tensorflow/stream_executor/lib/threadpool.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/platform/mutex.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/rocm/rocm_driver_wrapper.h"
+
+bool FLAGS_gpuexec_rocm_driver_inject_init_error = false;
+bool FLAGS_gpuexec_rocm_sync_around_driver_calls = false;
+bool FLAGS_gpuexec_rocm_device_0_only = false;
+
+// Debugging: on each push and pop of a rocm context, verify the current device
+// matches the expected one.
+constexpr bool kVerifyGpuContext = false;
+
+namespace stream_executor {
+namespace gpu {
+
+// GpuContext wraps the device_ordinal.
+// Only reason we need this wrapper class is to make the GpuDriver* API
+class GpuContext {
+ public:
+  GpuContext(const int v) : device_ordinal_(v) {}
+
+  int device_ordinal() const { return device_ordinal_; }
+
+  // Disallow copying and moving.
+  GpuContext(GpuContext&&) = delete;
+  GpuContext(const GpuContext&) = delete;
+  GpuContext& operator=(GpuContext&&) = delete;
+  GpuContext& operator=(const GpuContext&) = delete;
+
+ private:
+  const int device_ordinal_;
+};
+
+namespace {
+
+// Formats hipError_t to output prettified values into a log stream.
+// Error summaries taken from:
+//
+// TODO(leary) switch to cuGetErrorName when updated rocm.h is available.
+string ToString(hipError_t result) {
+#define OSTREAM_ROCM_ERROR(__name) \
+  case hipError##__name:           \
+    return "HIP_ERROR_" #__name;
+
+  switch (result) {
+    OSTREAM_ROCM_ERROR(InvalidValue)
+    OSTREAM_ROCM_ERROR(OutOfMemory)
+    OSTREAM_ROCM_ERROR(NotInitialized)
+    OSTREAM_ROCM_ERROR(Deinitialized)
+    OSTREAM_ROCM_ERROR(NoDevice)
+    OSTREAM_ROCM_ERROR(InvalidDevice)
+    OSTREAM_ROCM_ERROR(InvalidImage)
+    OSTREAM_ROCM_ERROR(InvalidContext)
+    OSTREAM_ROCM_ERROR(InvalidHandle)
+    OSTREAM_ROCM_ERROR(NotFound)
+    OSTREAM_ROCM_ERROR(NotReady)
+    OSTREAM_ROCM_ERROR(NoBinaryForGpu)
+
+    // Encountered an uncorrectable ECC error during execution.
+    OSTREAM_ROCM_ERROR(ECCNotCorrectable)
+
+    // Load/store on an invalid address. Must reboot all context.
+    case 700:
+      return "ROCM_ERROR_ILLEGAL_ADDRESS";
+    // Passed too many / wrong arguments, too many threads for register count.
+    case 701:
+      return "ROCM_ERROR_LAUNCH_OUT_OF_RESOURCES";
+
+      OSTREAM_ROCM_ERROR(ContextAlreadyInUse)
+      OSTREAM_ROCM_ERROR(PeerAccessUnsupported)
+      OSTREAM_ROCM_ERROR(Unknown)  // Unknown internal error to ROCM.
+    default:
+      return absl::StrCat("hipError_t(", static_cast<int>(result), ")");
+  }
+}
+
+// ROCM driver routines may require a large amount of stack (particularly
+// hipModuleLoadDataEx, in our experience). To avoid stack overflow when using
+// stack-limited threads (such as those spawned by a default-argument
+// thread::ThreadPool on some platforms), we run certain routines in this pool
+// and wait for completion.
+static mutex driver_executor_threadpool_mu(LINKER_INITIALIZED);
+static port::ThreadPool* InitializeDriverExecutor() {
+  return new port::ThreadPool(port::Env::Default(), port::ThreadOptions(),
+                              "rocm_driver", 1);
+}
+
+port::ThreadPool* GetDriverExecutor() {
+  mutex_lock lock(driver_executor_threadpool_mu);
+  static port::ThreadPool* thread_pool = InitializeDriverExecutor();
+  return thread_pool;
+}
+
+}  // namespace
+
+string MemorySpaceString(MemorySpace memory_space) {
+  switch (memory_space) {
+    case MemorySpace::kHost:
+      return "host";
+    case MemorySpace::kDevice:
+      return "device";
+    default:
+      LOG(FATAL) << "impossible memory space";
+  }
+}
+
+// Returns the current device set in HIP. This is done by calling the
+// HIP driver (e.g., this value is not our cached view of the current device).
+static int CurrentDeviceOrDie() {
+  int current = -1;
+  hipError_t result = tensorflow::wrap::hipGetDevice(&current);
+  if (result != hipSuccess) {
+    LOG(FATAL) << "failed to query current device: " << ToString(result);
+  }
+  return current;
+}
+
+namespace {
+
+// Call hipDeviceSynchronize and crash if it doesn't succeed.
+void SynchronizeOrDie() {
+  auto res = tensorflow::wrap::hipDeviceSynchronize();
+  if (res != hipSuccess) {
+    LOG(FATAL) << "Synchronize found " << ToString(res)
+               << " :: " << port::CurrentStackTrace();
+  }
+}
+
+struct ThreadLocalData {
+  int current_device_ordinal;
+  int depth;
+};
+
+SE_STATIC_THREAD_LOCAL_POD(ThreadLocalData, tls_data);
+
+}  // namespace
+
+ScopedActivateContext::ScopedActivateContext(GpuContext* context) {
+  if (FLAGS_gpuexec_rocm_sync_around_driver_calls) {
+    SynchronizeOrDie();
+  }
+
+  auto* tls = &tls_data.get();
+  if (tls->depth == 0) {
+    tls->current_device_ordinal = CurrentDeviceOrDie();
+  }
+
+  if (kVerifyGpuContext) {
+    CHECK_EQ(CurrentDeviceOrDie(), tls->current_device_ordinal);
+  }
+
+  tls->depth++;
+
+  to_restore_ = context;
+
+  if (context->device_ordinal() == tls->current_device_ordinal) {
+    DCHECK_EQ(CurrentDeviceOrDie(), context->device_ordinal());
+    return;
+  }
+
+  VLOG(3) << "ScopedActivateContext switching device from "
+          << tls->current_device_ordinal << " to " << context->device_ordinal();
+
+  // Set the device and update thread local.
+  CHECK_EQ(hipSuccess,
+           tensorflow::wrap::hipSetDevice(context->device_ordinal()));
+  tls->current_device_ordinal = context->device_ordinal();
+}
+
+ScopedActivateContext::~ScopedActivateContext() {
+  if (FLAGS_gpuexec_rocm_sync_around_driver_calls) {
+    SynchronizeOrDie();
+  }
+
+  auto* tls = &tls_data.get();
+
+  if (kVerifyGpuContext) {
+    CHECK_EQ(CurrentDeviceOrDie(), tls->current_device_ordinal);
+  }
+
+  tls->depth--;
+  DCHECK_GE(tls->depth, 0);
+
+  if (to_restore_->device_ordinal() == tls->current_device_ordinal) {
+    DCHECK_EQ(CurrentDeviceOrDie(), to_restore_->device_ordinal());
+    return;
+  }
+
+  VLOG(3) << "ScopedActivateContext switching device from "
+          << tls->current_device_ordinal << " to "
+          << to_restore_->device_ordinal();
+
+  // Set context and update thread local.
+  CHECK_EQ(hipSuccess,
+           tensorflow::wrap::hipSetDevice(to_restore_->device_ordinal()));
+  tls->current_device_ordinal = to_restore_->device_ordinal();
+}
+
+namespace {
+
+// Returns a stringified device number associated with pointer, primarily for
+// logging purposes. Returns "?" if the device could not be successfully
+// queried.
+string ROCMPointerToDeviceString(hipDeviceptr_t pointer) {
+  auto value = GpuDriver::GetPointerDevice(pointer);
+  if (value.ok()) {
+    return absl::StrCat(value.ValueOrDie());
+  }
+  LOG(ERROR) << "could not query device: " << value.status();
+  return "?";
+}
+
+// Returns a stringified memory space associated with pointer, primarily for
+// logging purposes. Returns "?" if the memory space could not be successfully
+// queried.
+string ROCMPointerToMemorySpaceString(hipDeviceptr_t pointer) {
+  auto value = GpuDriver::GetPointerMemorySpace(pointer);
+  if (value.ok()) {
+    return MemorySpaceString(value.ValueOrDie());
+  }
+  LOG(ERROR) << "could not query device: " << value.status();
+  return "?";
+}
+
+// Returns a stringified representation of whether or not peer access is
+// permitted between the "from" and "to" pointers' associated contexts,
+// primarily for logging purposes. Returns "error" if an error is encountered
+// in the process of querying.
+string ROCMPointersToCanAccessString(hipDeviceptr_t from, hipDeviceptr_t to) {
+  hipPointerAttribute_t from_pointerAttributes;
+  hipError_t result =
+      tensorflow::wrap::hipPointerGetAttributes(&from_pointerAttributes, from);
+  if (result != hipSuccess) {
+    LOG(ERROR) << "could not retrieve source pointer's device: "
+               << ToString(result);
+    return "error";
+  }
+
+  hipPointerAttribute_t to_pointerAttributes;
+  result = tensorflow::wrap::hipPointerGetAttributes(&to_pointerAttributes, to);
+  if (result != hipSuccess) {
+    LOG(ERROR) << "could not retrieve destination pointer's device: "
+               << ToString(result);
+    return "error";
+  }
+
+  GpuContext fromCtx(from_pointerAttributes.device);
+  GpuContext toCtx(to_pointerAttributes.device);
+
+  return GpuDriver::CanEnablePeerAccess(&fromCtx, &toCtx) ? "true" : "false";
+}
+
+// Actually performs the work of ROCM initialization. Wrapped up in one-time
+// execution guard.
+static port::Status InternalInit() {
+  hipError_t res = hipErrorNoDevice;
+  if (FLAGS_gpuexec_rocm_driver_inject_init_error) {
+    LOG(ERROR) << "injecting ROCM init error; initialization will fail";
+  } else {
+    res = tensorflow::wrap::hipInit(0 /* = flags */);
+  }
+
+  if (res == hipSuccess) {
+    return port::Status::OK();
+  }
+
+  LOG(ERROR) << "failed call to hipInit: " << ToString(res);
+  Diagnostician::LogDiagnosticInformation();
+  return port::Status{port::error::ABORTED,
+                      absl::StrCat("failed call to hipInit: ", ToString(res))};
+}
+
+}  // namespace
+
+/* static */ port::Status GpuDriver::Init() {
+  // Cached return value from calling InternalInit(), as hipInit need only be
+  // called once, but GpuDriver::Init may be called many times.
+  static port::Status init_retval;
+  static bool set = false;
+  static mutex* init_mu = new mutex;
+
+  mutex_lock lock(*init_mu);
+  if (!set) {
+    init_retval = InternalInit();
+    set = true;
+  }
+
+  return init_retval;
+}
+
+/* static */ port::Status GpuDriver::GetDevice(int device_ordinal,
+                                               hipDevice_t* device) {
+  hipError_t res = tensorflow::wrap::hipDeviceGet(device, device_ordinal);
+  if (res == hipSuccess) {
+    return port::Status::OK();
+  }
+
+  return port::Status{
+      port::error::INTERNAL,
+      absl::StrCat("failed call to hipDeviceGet: ", ToString(res))};
+}
+
+/* static */ bool GpuDriver::GetDeviceName(hipDevice_t device,
+                                           string* device_name) {
+  static const size_t kCharLimit = 64;
+  absl::InlinedVector<char, 4> chars(kCharLimit);
+  hipError_t res =
+      tensorflow::wrap::hipDeviceGetName(chars.begin(), kCharLimit - 1, device);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to get device name for " << device << ": "
+               << ToString(res);
+    return false;
+  }
+  chars[kCharLimit - 1] = '\0';
+  *device_name = chars.begin();
+  return true;
+}
+
+bool DeviceOptionsToContextFlags(const DeviceOptions& device_options,
+                                 int* flags) {
+  static_assert(DeviceOptions::kMask == 0xf,
+                "needs update for new device options");
+  return true;
+}
+
+/* static */ port::Status GpuDriver::CreateContext(
+    int device_ordinal, hipDevice_t device, const DeviceOptions& device_options,
+    GpuContext** context) {
+  *context = new GpuContext(device_ordinal);
+  return port::Status::OK();
+}
+/* static */ void GpuDriver::DestroyContext(GpuContext* context) {
+  if (context == nullptr) {
+    return;
+  }
+  delete context;
+}
+
+/* static */ bool GpuDriver::FuncGetAttribute(hipDeviceAttribute_t attribute,
+                                              hipFunction_t func,
+                                              int* attribute_value) {
+  // TODO(ROCm) properly implement this feature in HIP
+  hipError_t res = hipSuccess;
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to query kernel attribute. kernel: " << func
+               << ", attribute: " << attribute;
+    return false;
+  }
+  return true;
+}
+
+/* static */ bool GpuDriver::FuncSetCacheConfig(hipFunction_t function,
+                                                hipFuncCache_t cache_config) {
+  hipError_t res =
+      tensorflow::wrap::hipFuncSetCacheConfig(function, cache_config);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to set ROCM kernel cache config. kernel: " << function
+               << ", config: " << cache_config << ", result: " << ToString(res);
+    return false;
+  }
+
+  return true;
+}
+
+/* static */ port::StatusOr<hipSharedMemConfig>
+GpuDriver::ContextGetSharedMemConfig(GpuContext* context) {
+  hipSharedMemConfig shared_mem_config;
+  ScopedActivateContext activation{context};
+  hipError_t result =
+      tensorflow::wrap::hipDeviceGetSharedMemConfig(&shared_mem_config);
+  if (result != hipSuccess) {
+    LOG(ERROR) << "failed to get ROCM device shared memory config. "
+               << "Context device ID: " << context->device_ordinal()
+               << ", result: " << ToString(result);
+    return port::Status{
+        port::error::INTERNAL,
+        absl::StrCat("failed to get shared memory config: ", ToString(result))};
+  }
+  return shared_mem_config;
+}
+
+/* static */ port::Status GpuDriver::ContextSetSharedMemConfig(
+    GpuContext* context, hipSharedMemConfig shared_mem_config) {
+  ScopedActivateContext activation{context};
+  hipError_t result =
+      tensorflow::wrap::hipDeviceSetSharedMemConfig(shared_mem_config);
+  if (result != hipSuccess) {
+    LOG(ERROR) << "failed to set ROCM device shared memory config. "
+               << "Context device ID: " << context->device_ordinal()
+               << ", config: " << shared_mem_config
+               << ", result: " << ToString(result);
+    return port::Status{
+        port::error::INTERNAL,
+        absl::StrCat("failed to set shared memory config: ", ToString(result))};
+  }
+  return port::Status::OK();
+}
+
+/* static */ bool GpuDriver::LaunchKernel(
+    GpuContext* context, hipFunction_t function, unsigned int grid_dim_x,
+    unsigned int grid_dim_y, unsigned int grid_dim_z, unsigned int block_dim_x,
+    unsigned int block_dim_y, unsigned int block_dim_z,
+    unsigned int shared_mem_bytes, GpuStreamHandle stream, void** kernel_params,
+    void** extra) {
+  ScopedActivateContext activation{context};
+  VLOG(2) << "launching kernel: " << function << "; gdx: " << grid_dim_x
+          << " gdy: " << grid_dim_y << " gdz: " << grid_dim_z
+          << " bdx: " << block_dim_x << " bdy: " << block_dim_y
+          << " bdz: " << block_dim_z << " smem: " << shared_mem_bytes;
+  hipError_t res = tensorflow::wrap::hipModuleLaunchKernel(
+      function, grid_dim_x, grid_dim_y, grid_dim_z, block_dim_x, block_dim_y,
+      block_dim_z, shared_mem_bytes, stream, kernel_params, extra);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to launch ROCM kernel: " << function
+               << "; result: " << ToString(res);
+    return false;
+  }
+  VLOG(2) << "successfully launched kernel";
+  return true;
+}
+
+/* static */ bool GpuDriver::LoadPtx(GpuContext* context,
+                                     const char* ptx_contents,
+                                     hipModule_t* module) {
+  LOG(ERROR) << "Feature not supported on ROCm platform (LoadPtx)";
+  return false;
+}
+
+/* static */ port::Status GpuDriver::LoadCubin(GpuContext* context,
+                                               const char* cubin_bytes,
+                                               hipModule_t* module) {
+  return port::Status{port::error::INTERNAL,
+                      "Feature not supported on ROCm platform (LoadCubin)"};
+}
+
+/* static */ bool GpuDriver::LoadHsaco(GpuContext* context,
+                                       const char* hsaco_contents,
+                                       hipModule_t* module) {
+  port::Notification notification;
+  bool ret = true;
+  GetDriverExecutor()->Schedule(
+      [context, hsaco_contents, module, &ret, &notification]() {
+        ScopedActivateContext activation{context};
+        void* hsaco_data = const_cast<char*>(hsaco_contents);
+
+        hipError_t res =
+            tensorflow::wrap::hipModuleLoadData(module, hsaco_data);
+
+        if (res != hipSuccess) {
+          LOG(ERROR) << "failed to load HSACO: " << ToString(res);
+          ret = false;
+          notification.Notify();
+        }
+
+        CHECK(module != nullptr);
+        notification.Notify();
+      });
+  notification.WaitForNotification();
+
+  return ret;
+}
+
+/* static */ bool GpuDriver::SynchronousMemsetUint8(GpuContext* context,
+                                                    hipDeviceptr_t location,
+                                                    uint8 value, size_t size) {
+  ScopedActivateContext activation{context};
+  hipError_t res = tensorflow::wrap::hipMemset(location, value, size);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to memset memory: " << ToString(res);
+    return false;
+  }
+  return true;
+}
+
+/* static */ bool GpuDriver::SynchronousMemsetUint32(GpuContext* context,
+                                                     hipDeviceptr_t location,
+                                                     uint32 value,
+                                                     size_t uint32_count) {
+  ScopedActivateContext activation{context};
+  void* pointer = absl::bit_cast<void*>(location);
+  unsigned char valueC = static_cast<unsigned char>(value);
+  uint32_t value32 = (valueC << 24) | (valueC << 16) | (valueC << 8) | (valueC);
+  if (value32 != value) {
+    //  mismatch indicates case where hipMemsetAsyc can't emulate hipMemSetD32
+    LOG(ERROR) << "failed to memset memory";
+    return false;
+  }
+  hipError_t res = tensorflow::wrap::hipMemset(pointer, static_cast<int>(value),
+                                               uint32_count * 4);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to memset memory: " << ToString(res);
+    return false;
+  }
+  return true;
+}
+
+/* static */ bool GpuDriver::AsynchronousMemsetUint8(GpuContext* context,
+                                                     hipDeviceptr_t location,
+                                                     uint8 value,
+                                                     size_t uint32_count,
+                                                     GpuStreamHandle stream) {
+  ScopedActivateContext activation{context};
+  hipError_t res =
+      tensorflow::wrap::hipMemsetAsync(location, value, uint32_count, stream);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res);
+    return false;
+  }
+  VLOG(2) << "successfully enqueued async memset operation";
+  return true;
+}
+
+/* static */ bool GpuDriver::AsynchronousMemsetUint32(GpuContext* context,
+                                                      hipDeviceptr_t location,
+                                                      uint32 value,
+                                                      size_t uint32_count,
+                                                      GpuStreamHandle stream) {
+  ScopedActivateContext activation{context};
+  void* pointer = absl::bit_cast<void*>(location);
+
+  // FIXME - need to set a 32-bit value here
+  unsigned char valueC = static_cast<unsigned char>(value);
+  uint32_t value32 = (valueC << 24) | (valueC << 16) | (valueC << 8) | (valueC);
+  if (value32 != value) {
+    // mismatch indicates case where hipMemsetAsyc can't emulate hipMemSetD32
+    LOG(ERROR) << "failed to memset memory";
+    return false;
+  }
+  hipError_t res = tensorflow::wrap::hipMemsetAsync(pointer, value,
+                                                    uint32_count * 4, stream);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to enqueue async memset operation: " << ToString(res);
+    return false;
+  }
+  VLOG(2) << "successfully enqueued async memset operation";
+  return true;
+}
+
+/* static */ bool GpuDriver::AddStreamCallback(GpuContext* context,
+                                               GpuStreamHandle stream,
+                                               StreamCallback callback,
+                                               void* data) {
+  hipError_t res = tensorflow::wrap::hipStreamAddCallback(
+      stream, (hipStreamCallback_t)callback, data, 0 /* = flags */);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "unable to add host callback: " << ToString(res);
+    return false;
+  }
+  return true;
+}
+
+/* static */ bool GpuDriver::GetModuleFunction(GpuContext* context,
+                                               hipModule_t module,
+                                               const char* kernel_name,
+                                               hipFunction_t* function) {
+  ScopedActivateContext activated{context};
+  CHECK(module != nullptr && kernel_name != nullptr);
+  hipError_t res =
+      tensorflow::wrap::hipModuleGetFunction(function, module, kernel_name);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to get kernel \"" << kernel_name
+               << "\" from module: " << ToString(res);
+    return false;
+  }
+
+  return true;
+}
+
+/* static */ bool GpuDriver::GetModuleSymbol(GpuContext* context,
+                                             hipModule_t module,
+                                             const char* symbol_name,
+                                             hipDeviceptr_t* dptr,
+                                             size_t* bytes) {
+  ScopedActivateContext activated{context};
+  CHECK(module != nullptr && symbol_name != nullptr &&
+        (dptr != nullptr || bytes != nullptr));
+  hipError_t res =
+      tensorflow::wrap::hipModuleGetGlobal(dptr, bytes, module, symbol_name);
+  if (res != hipSuccess) {
+    // symbol may not be found in the current module, but it may reside in
+    // another module.
+    VLOG(2) << "failed to get symbol \"" << symbol_name
+            << "\" from module: " << ToString(res);
+    return false;
+  }
+
+  return true;
+}
+
+/* static */ void GpuDriver::UnloadModule(GpuContext* context,
+                                          hipModule_t module) {
+  ScopedActivateContext activated{context};
+  hipError_t res = tensorflow::wrap::hipModuleUnload(module);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to unload module " << module
+               << "; leaking: " << ToString(res);
+  }
+}
+
+/* static */ bool GpuDriver::CreateStream(GpuContext* context,
+                                          GpuStreamHandle* stream) {
+  ScopedActivateContext activated{context};
+  hipError_t res = tensorflow::wrap::hipStreamCreateWithFlags(
+      stream, hipStreamDefault);  // switch to hipStreamNonBlocking?
+  if (res != hipSuccess) {
+    LOG(ERROR) << "could not allocate ROCM stream for device "
+               << context->device_ordinal() << ": " << ToString(res);
+    return false;
+  }
+
+  VLOG(2) << "successfully created stream " << *stream << " for device "
+          << context->device_ordinal() << " on thread";
+  return true;
+}
+
+/* static */ void GpuDriver::DestroyStream(GpuContext* context,
+                                           GpuStreamHandle* stream) {
+  if (*stream == nullptr) {
+    return;
+  }
+
+  ScopedActivateContext activated{context};
+  hipError_t res = tensorflow::wrap::hipStreamDestroy(*stream);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to destroy ROCM stream for device "
+               << context->device_ordinal() << ": " << ToString(res);
+  } else {
+    VLOG(2) << "successfully destroyed stream " << *stream << " for device "
+            << context->device_ordinal();
+    *stream = nullptr;
+  }
+}
+
+/* static */ void* GpuDriver::DeviceAllocate(GpuContext* context,
+                                             uint64 bytes) {
+  ScopedActivateContext activated{context};
+  hipDeviceptr_t result = 0;
+  hipError_t res = tensorflow::wrap::hipMallocVanilla(&result, bytes);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to allocate "
+               << port::HumanReadableNumBytes::ToString(bytes) << " (" << bytes
+               << " bytes) from device: " << ToString(res);
+    return nullptr;
+  }
+  void* ptr = reinterpret_cast<void*>(result);
+  VLOG(2) << "allocated " << ptr << " for device " << context->device_ordinal()
+          << " of " << bytes << " bytes";
+  return ptr;
+}
+
+/* static */ void GpuDriver::DeviceDeallocate(GpuContext* context,
+                                              void* location) {
+  ScopedActivateContext activation{context};
+  hipDeviceptr_t pointer = absl::bit_cast<hipDeviceptr_t>(location);
+  hipError_t res = tensorflow::wrap::hipFree(pointer);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to free device memory at " << location
+               << "; result: " << ToString(res);
+  } else {
+    VLOG(2) << "deallocated " << location << " for device "
+            << context->device_ordinal();
+  }
+}
+
+/* static */ void* GpuDriver::UnifiedMemoryAllocate(GpuContext* context,
+                                                    uint64 bytes) {
+  ScopedActivateContext activated{context};
+
+  LOG(ERROR)
+      << "Feature not supported on ROCm platform (UnifiedMemoryAllocate)";
+  return nullptr;
+}
+
+/* static */ void GpuDriver::UnifiedMemoryDeallocate(GpuContext* context,
+                                                     void* location) {
+  LOG(ERROR)
+      << "Feature not supported on ROCm platform (UnifiedMemoryDeallocate)";
+}
+
+/* static */ void* GpuDriver::HostAllocate(GpuContext* context, uint64 bytes) {
+  ScopedActivateContext activation{context};
+  void* host_mem = nullptr;
+  // "Portable" memory is visible to all ROCM contexts. Safe for our use model.
+  hipError_t res = tensorflow::wrap::hipHostMallocVanilla(
+      &host_mem, bytes, hipHostMallocPortable);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to alloc " << bytes
+               << " bytes on host: " << ToString(res);
+  }
+  return host_mem;
+}
+
+/* static */ void GpuDriver::HostDeallocate(GpuContext* context,
+                                            void* location) {
+  ScopedActivateContext activation{context};
+  hipError_t res = tensorflow::wrap::hipHostFree(location);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "error deallocating host memory at " << location << ": "
+               << ToString(res);
+  }
+}
+
+/* static */ bool GpuDriver::HostRegister(GpuContext* context, void* location,
+                                          uint64 bytes) {
+  ScopedActivateContext activation{context};
+  // "Portable" memory is visible to all ROCM contexts. Safe for our use model.
+  hipError_t res = tensorflow::wrap::hipHostRegister(location, bytes,
+                                                     hipHostRegisterPortable);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "error registering host memory at " << location << ": "
+               << ToString(res);
+    return false;
+  }
+  return true;
+}
+
+/* static */ bool GpuDriver::HostUnregister(GpuContext* context,
+                                            void* location) {
+  ScopedActivateContext activation{context};
+  hipError_t res = tensorflow::wrap::hipHostUnregister(location);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "error unregistering host memory at " << location << ": "
+               << ToString(res);
+    return false;
+  }
+  return true;
+}
+
+/* static */ port::Status GpuDriver::DestroyEvent(GpuContext* context,
+                                                  GpuEventHandle* event) {
+  if (*event == nullptr) {
+    return port::Status{port::error::INVALID_ARGUMENT,
+                        "input event cannot be null"};
+  }
+
+  ScopedActivateContext activated{context};
+  hipError_t res = tensorflow::wrap::hipEventDestroy(*event);
+  *event = nullptr;
+
+  switch (res) {
+    case hipSuccess:
+      return port::Status::OK();
+    case hipErrorDeinitialized:
+    case hipErrorNotInitialized:
+      return port::Status{
+          port::error::FAILED_PRECONDITION,
+          absl::StrFormat("error destroying ROCM event in device %d: %s",
+                          context->device_ordinal(), ToString(res).c_str())};
+    default:
+      return port::Status{
+          port::error::INTERNAL,
+          absl::StrFormat("error destroying ROCM event in device %d: %s",
+                          context->device_ordinal(), ToString(res).c_str())};
+  }
+}
+
+/* static */ port::Status GpuDriver::RecordEvent(GpuContext* context,
+                                                 GpuEventHandle event,
+                                                 GpuStreamHandle stream) {
+  ScopedActivateContext activated{context};
+  hipError_t res = tensorflow::wrap::hipEventRecord(event, stream);
+  switch (res) {
+    case hipSuccess:
+      return port::Status::OK();
+    case hipErrorDeinitialized:
+    case hipErrorNotInitialized:
+      return port::Status{
+          port::error::FAILED_PRECONDITION,
+          absl::StrFormat("error recording ROCM event on stream %p: %s", stream,
+                          ToString(res).c_str())};
+    default:
+      return port::Status{
+          port::error::INVALID_ARGUMENT,
+          absl::StrFormat("error recording ROCM event on stream %p: %s", stream,
+                          ToString(res).c_str())};
+  }
+}
+
+/* static */ port::StatusOr<hipError_t> GpuDriver::QueryEvent(
+    GpuContext* context, GpuEventHandle event) {
+  ScopedActivateContext activated{context};
+  hipError_t res = tensorflow::wrap::hipEventQuery(event);
+  if (res != hipSuccess && res != hipErrorNotReady) {
+    return port::Status{
+        port::error::INTERNAL,
+        absl::StrFormat("failed to query event: %s", ToString(res).c_str())};
+  }
+
+  return res;
+}
+
+/* static */ bool GpuDriver::GetEventElapsedTime(GpuContext* context,
+                                                 float* elapsed_milliseconds,
+                                                 GpuEventHandle start,
+                                                 GpuEventHandle stop) {
+  ScopedActivateContext activated{context};
+  // The stop event must have completed in order for hipEventElapsedTime to
+  // work.
+  hipError_t res = tensorflow::wrap::hipEventSynchronize(stop);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to synchronize the stop event: " << ToString(res);
+    return false;
+  }
+  res =
+      tensorflow::wrap::hipEventElapsedTime(elapsed_milliseconds, start, stop);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to get elapsed time between events: "
+               << ToString(res);
+    return false;
+  }
+
+  return true;
+}
+
+/* static */ bool GpuDriver::WaitStreamOnEvent(GpuContext* context,
+                                               GpuStreamHandle stream,
+                                               GpuEventHandle event) {
+  ScopedActivateContext activation{context};
+  hipError_t res =
+      tensorflow::wrap::hipStreamWaitEvent(stream, event, 0 /* = flags */);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "could not wait stream on event: " << ToString(res);
+    return false;
+  }
+
+  return true;
+}
+
+/* static */ bool GpuDriver::SynchronizeContext(GpuContext* context) {
+  ScopedActivateContext activation{context};
+  hipError_t res = tensorflow::wrap::hipDeviceSynchronize();
+  if (res != hipSuccess) {
+    LOG(ERROR) << "could not synchronize on ROCM device: " << ToString(res)
+               << " :: " << port::CurrentStackTrace();
+    return false;
+  }
+
+  return true;
+}
+
+/* static */ port::Status GpuDriver::SynchronizeStream(GpuContext* context,
+                                                       GpuStreamHandle stream) {
+  ScopedActivateContext activated{context};
+  CHECK(stream != nullptr);
+  hipError_t res = tensorflow::wrap::hipStreamSynchronize(stream);
+  if (res != hipSuccess) {
+    port::Status status = port::InternalError(
+        absl::StrCat("could not synchronize on ROCM stream: ", ToString(res)));
+    LOG(ERROR) << status << " :: " << port::CurrentStackTrace();
+    return status;
+  }
+  VLOG(2) << "successfully synchronized stream " << stream << " on device "
+          << context->device_ordinal();
+  return port::Status::OK();
+}
+
+/* static */ bool GpuDriver::IsStreamIdle(GpuContext* context,
+                                          GpuStreamHandle stream) {
+  ScopedActivateContext activated{context};
+  CHECK(stream != nullptr);
+  hipError_t res = tensorflow::wrap::hipStreamQuery(stream);
+  if (res == hipSuccess) {
+    return true;
+  }
+
+  if (res != hipErrorNotReady) {
+    LOG(ERROR) << "stream in bad state on status query: " << ToString(res);
+  }
+  return false;
+}
+
+/* static */ port::Status GpuDriver::SynchronousMemcpyD2H(
+    GpuContext* context, void* host_dst, hipDeviceptr_t gpu_src, uint64 size) {
+  ScopedActivateContext activation{context};
+  hipError_t res = tensorflow::wrap::hipMemcpyDtoH(host_dst, gpu_src, size);
+  if (res != hipSuccess) {
+    return port::InternalError(
+        absl::StrFormat("failed to synchronous memcpy from device to host: %s; "
+                        "host dst: %p; Gpu src: %p; size: %llu=0x%llx",
+                        ToString(res).c_str(), host_dst,
+                        absl::bit_cast<void*>(gpu_src), size, size));
+  }
+  VLOG(2) << "successfully sync memcpy'd d2h of " << size << " bytes to "
+          << host_dst;
+  return port::Status::OK();
+}
+
+/* static */ port::Status GpuDriver::SynchronousMemcpyH2D(
+    GpuContext* context, hipDeviceptr_t gpu_dst, const void* host_src,
+    uint64 size) {
+  ScopedActivateContext activation{context};
+  hipError_t res = tensorflow::wrap::hipMemcpyHtoD(
+      gpu_dst, const_cast<void*>(host_src), size);
+  if (res != hipSuccess) {
+    return port::InternalError(absl::StrFormat(
+        "failed to synchronous memcpy from host to device: %s; Gpu dst: %p;"
+        " host src: %p; size: %llu=0x%llx",
+        ToString(res).c_str(), absl::bit_cast<void*>(gpu_dst), host_src, size,
+        size));
+  }
+  VLOG(2) << "successfully enqueued sync memcpy h2d of " << size << " bytes";
+  return port::Status::OK();
+}
+
+/* static */ port::Status GpuDriver::SynchronousMemcpyD2D(
+    GpuContext* context, hipDeviceptr_t gpu_dst, hipDeviceptr_t gpu_src,
+    uint64 size) {
+  ScopedActivateContext activation{context};
+  hipError_t res = tensorflow::wrap::hipMemcpyDtoD(gpu_dst, gpu_src, size);
+  if (res != hipSuccess) {
+    return port::InternalError(absl::StrFormat(
+        "failed to synchronous memcpy from host to device: %s; Gpu dst: %p; "
+        "Gpu src: %p; size: %llu=0x%llx",
+        ToString(res).c_str(), absl::bit_cast<void*>(gpu_dst),
+        absl::bit_cast<void*>(gpu_src), size, size));
+  }
+  VLOG(2) << "successfully sync memcpy'd d2d of " << size << " bytes";
+  return port::Status::OK();
+}
+
+/* static */ bool GpuDriver::AsynchronousMemcpyD2H(GpuContext* context,
+                                                   void* host_dst,
+                                                   hipDeviceptr_t gpu_src,
+                                                   uint64 size,
+                                                   GpuStreamHandle stream) {
+  ScopedActivateContext activation{context};
+  hipError_t res =
+      tensorflow::wrap::hipMemcpyDtoHAsync(host_dst, gpu_src, size, stream);
+  if (res != hipSuccess) {
+    LOG(ERROR) << absl::StrFormat(
+        "failed to enqueue async memcpy from device to host: %s; host dst: %p; "
+        "Gpu src: %p; size: %llu=0x%llx",
+        ToString(res).c_str(), host_dst, absl::bit_cast<void*>(gpu_src), size,
+        size);
+    return false;
+  }
+  VLOG(2) << "successfully enqueued async memcpy d2h of " << size
+          << " bytes from " << absl::bit_cast<void*>(gpu_src) << " to "
+          << host_dst << " on stream " << stream;
+  return true;
+}
+
+/* static */ bool GpuDriver::AsynchronousMemcpyH2D(GpuContext* context,
+                                                   hipDeviceptr_t gpu_dst,
+                                                   const void* host_src,
+                                                   uint64 size,
+                                                   GpuStreamHandle stream) {
+  ScopedActivateContext activation{context};
+  hipError_t res = tensorflow::wrap::hipMemcpyHtoDAsync(
+      gpu_dst, const_cast<void*>(host_src), size, stream);
+  if (res != hipSuccess) {
+    LOG(ERROR) << absl::StrFormat(
+        "failed to enqueue async memcpy from host to device: %s; Gpu dst: %p; "
+        "host src: %p; size: %llu=0x%llx",
+        ToString(res).c_str(), absl::bit_cast<void*>(gpu_dst), host_src, size,
+        size);
+    return false;
+  }
+  VLOG(2) << "successfully enqueued async memcpy h2d of " << size << " bytes"
+          << " on stream " << stream;
+  return true;
+}
+
+/* static */ bool GpuDriver::AsynchronousMemcpyD2D(GpuContext* context,
+                                                   hipDeviceptr_t gpu_dst,
+                                                   hipDeviceptr_t gpu_src,
+                                                   uint64 size,
+                                                   GpuStreamHandle stream) {
+  ScopedActivateContext activation{context};
+  hipError_t result =
+      tensorflow::wrap::hipMemcpyDtoDAsync(gpu_dst, gpu_src, size, stream);
+  if (result != hipSuccess) {
+    LOG(ERROR) << absl::StrFormat(
+        "failed to enqueue async memcpy from device to device: %s"
+        "; Gpu dst: %p on %s %s"
+        "; Gpu src: %p on %s %s"
+        "; can access? %s; size: %llu=0x%llx",
+        ToString(result).c_str(), absl::bit_cast<void*>(gpu_dst),
+        ROCMPointerToMemorySpaceString(gpu_dst).c_str(),
+        ROCMPointerToDeviceString(gpu_dst).c_str(),
+        absl::bit_cast<void*>(gpu_src),
+        ROCMPointerToMemorySpaceString(gpu_src).c_str(),
+        ROCMPointerToDeviceString(gpu_src).c_str(),
+        ROCMPointersToCanAccessString(gpu_src, gpu_dst).c_str(), size, size);
+
+    return false;
+  }
+  VLOG(2) << "successfully enqueued async memcpy d2d of " << size << " bytes";
+  return true;
+}
+
+/* static */ port::Status GpuDriver::CreateEvent(GpuContext* context,
+                                                 GpuEventHandle* event,
+                                                 EventFlags flags) {
+  int hipflags;
+  switch (flags) {
+    case EventFlags::kDefault:
+      hipflags = hipEventDefault;
+      break;
+    case EventFlags::kDisableTiming:
+      hipflags = hipEventDisableTiming | hipEventReleaseToSystem;
+      break;
+    default:
+      LOG(FATAL) << "impossible event flags: " << int(hipflags);
+  }
+
+  ScopedActivateContext activated{context};
+  hipError_t res = tensorflow::wrap::hipEventCreateWithFlags(event, hipflags);
+
+  if (res == hipSuccess) {
+    return port::Status::OK();
+  } else if (res == hipErrorMemoryAllocation) {
+    return port::Status{port::error::RESOURCE_EXHAUSTED,
+                        "could not create ROCM event: out of device memory"};
+  } else {
+    return port::Status{
+        port::error::FAILED_PRECONDITION,
+        absl::StrCat("could not create ROCM event: ", ToString(res))};
+  }
+}
+
+/* static */ int GpuDriver::GetDeviceCount() {
+  int device_count = 0;
+  hipError_t res = tensorflow::wrap::hipGetDeviceCount(&device_count);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "could not retrieve ROCM device count: " << ToString(res);
+    return 0;
+  }
+
+  if (FLAGS_gpuexec_rocm_device_0_only && device_count > 1) {
+    device_count = 1;
+  }
+  return device_count;
+}
+
+/* static */ port::Status GpuDriver::GetComputeCapability(int* cc_major,
+                                                          int* cc_minor,
+                                                          hipDevice_t device) {
+  return port::Status(
+      port::error::INTERNAL,
+      absl::StrFormat("failed to get compute capability for device: %d "
+                      "(unsupported API on AMD Gpus)",
+                      device));
+}
+
+/* static */ port::Status GpuDriver::GetPointerAddressRange(
+    hipDeviceptr_t dptr, hipDeviceptr_t* base, size_t* size) {
+  hipError_t result = tensorflow::wrap::hipMemGetAddressRange(base, size, dptr);
+  if (result == hipSuccess) {
+    return port::Status::OK();
+  } else if (result == hipErrorNotFound) {
+    // We differentiate between "this pointer is unknown" (return here) and
+    // "there was an internal error while performing this operation" (return
+    // below).
+    return port::Status{port::error::NOT_FOUND,
+                        absl::StrFormat("not a device pointer %p; %s",
+                                        reinterpret_cast<void*>(dptr),
+                                        ToString(result).c_str())};
+  }
+
+  return port::Status{
+      port::error::INTERNAL,
+      absl::StrFormat("failed to get pointer into for device pointer %p; %s",
+                      reinterpret_cast<void*>(dptr), ToString(result).c_str())};
+}
+
+/* static */ port::StatusOr<MemorySpace> GpuDriver::GetPointerMemorySpace(
+    hipDeviceptr_t pointer) {
+  unsigned int value;
+  hipError_t result = hipSuccess;
+  if (result == hipSuccess) {
+    switch (value) {
+      case hipMemoryTypeDevice:
+        return MemorySpace::kDevice;
+      case hipMemoryTypeHost:
+        return MemorySpace::kHost;
+      default:
+        return port::Status{
+            port::error::INTERNAL,
+            absl::StrCat("unknown memory space provided by ROCM API: ", value)};
+    }
+  }
+
+  return port::Status{
+      port::error::INTERNAL,
+      absl::StrCat("failed to query device pointer for memory space: ",
+                   ToString(result))};
+}
+
+/* static */ port::StatusOr<hipDevice_t> GpuDriver::GetPointerDevice(
+    hipDeviceptr_t pointer) {
+  hipPointerAttribute_t pointerAttributes;
+  hipError_t result =
+      tensorflow::wrap::hipPointerGetAttributes(&pointerAttributes, pointer);
+  if (result != hipSuccess) {
+    return port::Status{
+        port::error::INTERNAL,
+        absl::StrCat("failed to get device for pointer: ", ToString(result))};
+  }
+
+  hipDevice_t device;
+  result = tensorflow::wrap::hipDeviceGet(&device, pointerAttributes.device);
+  if (result != hipSuccess) {
+    return port::Status{
+        port::error::INTERNAL,
+        absl::StrCat("failed to get device for pointer: ", ToString(result))};
+  }
+
+  return device;
+}
+
+/* static */ port::Status GpuDriver::GetGpuISAVersion(int* version,
+                                                      hipDevice_t device) {
+  hipDeviceProp_t props;
+  hipError_t result = tensorflow::wrap::hipGetDeviceProperties(&props, device);
+  if (result == hipSuccess) {
+    *version = props.gcnArch;
+    return port::Status::OK();
+  }
+  *version = 0;
+  return port::Status{
+      port::error::INTERNAL,
+      absl::StrFormat("failed to determine AMDGpu ISA version for device %d",
+                      device)};
+}
+
+// Helper function that turns the integer output of hipDeviceGetAttribute to
+// type T and wraps it in a StatusOr.
+template <typename T>
+static port::StatusOr<T> GetSimpleAttribute(hipDevice_t device,
+                                            hipDeviceAttribute_t attribute) {
+  int value = -1;
+  hipError_t result =
+      tensorflow::wrap::hipDeviceGetAttribute(&value, attribute, device);
+  if (result != hipSuccess) {
+    return port::Status{
+        port::error::NOT_FOUND,
+        absl::StrCat("could not retrieve ROCM device attribute (", attribute,
+                     "): ", ToString(result))};
+  }
+  T converted = value;
+  return converted;
+}
+
+/* static */ port::StatusOr<int> GpuDriver::GetMultiprocessorCount(
+    hipDevice_t device) {
+  return GetSimpleAttribute<int>(device, hipDeviceAttributeMultiprocessorCount);
+}
+
+/* static */ port::StatusOr<int64> GpuDriver::GetMaxSharedMemoryPerCore(
+    hipDevice_t device) {
+  return GetSimpleAttribute<int64>(
+      device, hipDeviceAttributeMaxSharedMemoryPerMultiprocessor);
+}
+
+/* static */ port::StatusOr<int64> GpuDriver::GetMaxSharedMemoryPerBlock(
+    hipDevice_t device) {
+  return GetSimpleAttribute<int64>(device,
+                                   hipDeviceAttributeMaxSharedMemoryPerBlock);
+}
+
+/* static */ port::StatusOr<int64> GpuDriver::GetMaxThreadsPerMultiprocessor(
+    hipDevice_t device) {
+  return GetSimpleAttribute<int64>(
+      device, hipDeviceAttributeMaxThreadsPerMultiProcessor);
+}
+
+/* static */ port::StatusOr<int64> GpuDriver::GetMaxThreadsPerBlock(
+    hipDevice_t device) {
+  return GetSimpleAttribute<int64>(device,
+                                   hipDeviceAttributeMaxThreadsPerBlock);
+}
+
+/* static */ port::StatusOr<int64> GpuDriver::GetMaxRegistersPerBlock(
+    hipDevice_t device) {
+  return GetSimpleAttribute<int64>(device,
+                                   hipDeviceAttributeMaxRegistersPerBlock);
+}
+
+/* static */ port::StatusOr<int64> GpuDriver::GetThreadsPerWarp(
+    hipDevice_t device) {
+  return GetSimpleAttribute<int64>(device, hipDeviceAttributeWarpSize);
+}
+
+/* static */ bool GpuDriver::GetGridLimits(int* x, int* y, int* z,
+                                           hipDevice_t device) {
+  int value;
+  hipError_t res = tensorflow::wrap::hipDeviceGetAttribute(
+      &value, hipDeviceAttributeMaxGridDimX, device);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to query max grid dim x: " << ToString(res);
+    return false;
+  }
+  *x = value;
+
+  res = tensorflow::wrap::hipDeviceGetAttribute(
+      &value, hipDeviceAttributeMaxGridDimY, device);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to query max grid dim y: " << ToString(res);
+    return false;
+  }
+  *y = value;
+
+  res = tensorflow::wrap::hipDeviceGetAttribute(
+      &value, hipDeviceAttributeMaxGridDimZ, device);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to query max grid dim z: " << ToString(res);
+    return false;
+  }
+  *z = value;
+  return true;
+}
+
+/* static */ bool GpuDriver::GetDriverVersion(int* driver_version) {
+  hipError_t res = tensorflow::wrap::hipDriverGetVersion(driver_version);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to query driver version: " << ToString(res);
+    return false;
+  }
+
+  return true;
+}
+
+/* static */ bool GpuDriver::GetDeviceProperties(
+    hipDeviceProp_t* device_properties, int device_ordinal) {
+  hipError_t res = tensorflow::wrap::hipGetDeviceProperties(device_properties,
+                                                            device_ordinal);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to query device properties: " << ToString(res);
+    return false;
+  }
+
+  return true;
+}
+
+/* static */ port::StatusOr<int> GpuDriver::GetDeviceAttribute(
+    hipDeviceAttribute_t attribute, hipDevice_t device) {
+  return GetSimpleAttribute<int>(device, attribute);
+}
+
+/* static */ bool GpuDriver::IsEccEnabled(hipDevice_t device, bool* result) {
+  int value = -1;
+  hipError_t res = hipSuccess;
+  // TODO(ROCm) implement this feature in HIP
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to query ECC status: " << ToString(res);
+    return false;
+  }
+
+  *result = value;
+  return true;
+}
+
+/* static */ bool GpuDriver::GetDeviceMemoryInfo(GpuContext* context,
+                                                 int64* free_out,
+                                                 int64* total_out) {
+  ScopedActivateContext activation{context};
+  size_t free = 0;
+  size_t total = 0;
+  hipError_t res = tensorflow::wrap::hipMemGetInfo(&free, &total);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to query device memory info: " << ToString(res);
+    return false;
+  }
+
+  *free_out = free;
+  *total_out = total;
+  return true;
+}
+
+/* static */ bool GpuDriver::GetDeviceTotalMemory(hipDevice_t device,
+                                                  uint64* result) {
+  size_t value = -1;
+  hipError_t res = tensorflow::wrap::hipDeviceTotalMem(&value, device);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to query total available memory: " << ToString(res);
+    return false;
+  }
+
+  *result = value;
+  return true;
+}
+
+/* static */ string GpuDriver::GetPCIBusID(hipDevice_t device) {
+  string pci_bus_id;
+  static const int kBufferSize = 64;
+  absl::InlinedVector<char, 4> chars(kBufferSize);
+  chars[kBufferSize - 1] = '\0';
+  hipError_t res = tensorflow::wrap::hipDeviceGetPCIBusId(
+      chars.begin(), kBufferSize - 1, device);
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to query PCI bus id for device: " << ToString(res);
+    return pci_bus_id;
+  }
+  pci_bus_id = chars.begin();
+  return pci_bus_id;
+}
+
+/* static */ bool GpuDriver::CanEnablePeerAccess(GpuContext* from,
+                                                 GpuContext* to) {
+  if (from->device_ordinal() == to->device_ordinal()) {
+    return true;  // A device can always access its own memory.
+  }
+
+  int can_access_peer = -1;
+  hipError_t res = tensorflow::wrap::hipDeviceCanAccessPeer(
+      &can_access_peer, from->device_ordinal(), to->device_ordinal());
+  if (res != hipSuccess) {
+    LOG(ERROR) << "failed to detect peer access capability: " << ToString(res);
+    return false;
+  }
+
+  return can_access_peer;
+}
+
+/* static */ port::Status GpuDriver::EnablePeerAccess(GpuContext* from,
+                                                      GpuContext* to) {
+  if (from->device_ordinal() == to->device_ordinal()) {
+    return port::Status::OK();  // A device can always access its own memory.
+  }
+
+  ScopedActivateContext activated{from};
+  hipError_t result = tensorflow::wrap::hipDeviceEnablePeerAccess(
+      to->device_ordinal(), 0 /* = flags */);
+  if (result != hipSuccess && result != hipErrorPeerAccessAlreadyEnabled) {
+    return port::Status{
+        port::error::INTERNAL,
+        absl::StrFormat("failed to enable peer access from %d to %d: %s",
+                        from->device_ordinal(), to->device_ordinal(),
+                        ToString(result).c_str())};
+  }
+
+  return port::Status::OK();
+}
+
+/* static */ port::StatusOr<int> GpuDriver::GetMaxOccupiedBlocksPerCore(
+    GpuContext* context, hipFunction_t kernel, int threads_per_block,
+    size_t dynamic_shared_memory_bytes) {
+  ScopedActivateContext activation{context};
+
+  int max_blocks = 0;
+  hipError_t result = hipSuccess;
+  // TODO(ROCm) implement this feature in HIP
+  if (result != hipSuccess) {
+    return port::Status{
+        port::error::INTERNAL,
+        absl::StrFormat("failed to calculate occupancy of kernel %p: %s",
+                        kernel, ToString(result).c_str())};
+  }
+
+  return max_blocks;
+}
+
+}  // namespace gpu
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/rocm/rocm_driver_wrapper.h b/tensorflow/stream_executor/rocm/rocm_driver_wrapper.h
new file mode 100644
index 0000000000000000000000000000000000000000..27495c2cbc04e7b6171814e142414a9817c26226
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_driver_wrapper.h
@@ -0,0 +1,146 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// This file wraps rocm driver calls with dso loader so that we don't need to
+// have explicit linking to librocm. All TF rocm driver usage should route
+// through this wrapper.
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DRIVER_WRAPPER_H_
+#define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DRIVER_WRAPPER_H_
+
+#include "rocm/include/hip/hip_runtime.h"
+#include "tensorflow/stream_executor/lib/env.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
+#include "tensorflow/stream_executor/platform/port.h"
+
+#if defined(TENSORFLOW_USE_ROCM)
+
+#endif
+
+namespace tensorflow {
+namespace wrap {
+#ifdef PLATFORM_GOOGLE
+// Use static linked library
+#define STREAM_EXECUTOR_HIP_WRAP(hipSymbolName)                          \
+  template <typename... Args>                                            \
+  auto hipSymbolName(Args... args)->decltype(::hipSymbolName(args...)) { \
+    return ::hipSymbolName(args...);                                     \
+  }
+
+// This macro wraps a global identifier, given by hipSymbolName, in a callable
+// structure that loads the DLL symbol out of the DSO handle in a thread-safe
+// manner on first use. This dynamic loading technique is used to avoid DSO
+// dependencies on vendor libraries which may or may not be available in the
+// deployed binary environment.
+#else
+#define TO_STR_(x) #x
+#define TO_STR(x) TO_STR_(x)
+
+// hipMalloc and hipHostMalloc are defined as funtion templates in the
+// HIP header files, and hence their names get mangled and the attempt
+// to resolve their name when trying to dynamically load them will fail
+// Updating the HIP header files to make them C functions is underway.
+// Until that change flows through, we will workaround the issue by
+// creating dummy wrappers for them here
+
+hipError_t hipMallocVanilla(void** ptr, size_t size) {
+  return hipErrorNotInitialized;
+}
+
+hipError_t hipHostMallocVanilla(void** ptr, size_t size, unsigned int flags) {
+  return hipErrorNotInitialized;
+}
+
+#define STREAM_EXECUTOR_HIP_WRAP(hipSymbolName)                             \
+  template <typename... Args>                                               \
+  auto hipSymbolName(Args... args)->decltype(::hipSymbolName(args...)) {    \
+    using FuncPtrT = std::add_pointer<decltype(::hipSymbolName)>::type;     \
+    static FuncPtrT loaded = []() -> FuncPtrT {                             \
+      static const char *kName = TO_STR(hipSymbolName);                     \
+      void *f;                                                              \
+      auto s = stream_executor::port::Env::Default()->GetSymbolFromLibrary( \
+          stream_executor::internal::CachedDsoLoader::GetHipDsoHandle()     \
+              .ValueOrDie(),                                                \
+          kName, &f);                                                       \
+      CHECK(s.ok()) << "could not find " << kName                           \
+                    << " in HIP DSO; dlerror: " << s.error_message();       \
+      return reinterpret_cast<FuncPtrT>(f);                                 \
+    }();                                                                    \
+    return loaded(args...);                                                 \
+  }
+#endif
+
+// clang-format off
+#define HIP_ROUTINE_EACH(__macro)                   \
+  __macro(hipDeviceCanAccessPeer)                   \
+  __macro(hipDeviceEnablePeerAccess)                \
+  __macro(hipDeviceGet)                             \
+  __macro(hipDeviceGetAttribute)                    \
+  __macro(hipDeviceGetName)                         \
+  __macro(hipDeviceGetPCIBusId)                     \
+  __macro(hipDeviceGetSharedMemConfig)              \
+  __macro(hipDeviceSetSharedMemConfig)              \
+  __macro(hipDeviceSynchronize)                     \
+  __macro(hipDeviceTotalMem)                        \
+  __macro(hipDriverGetVersion)                      \
+  __macro(hipEventCreateWithFlags)                  \
+  __macro(hipEventElapsedTime)                      \
+  __macro(hipEventDestroy)                          \
+  __macro(hipEventQuery)                            \
+  __macro(hipEventRecord)                           \
+  __macro(hipEventSynchronize)                      \
+  __macro(hipFree)                                  \
+  __macro(hipFuncSetCacheConfig)                    \
+  __macro(hipGetDevice)                             \
+  __macro(hipGetDeviceCount)                        \
+  __macro(hipGetDeviceProperties)                   \
+  __macro(hipHostFree)                              \
+  __macro(hipHostRegister)                          \
+  __macro(hipHostUnregister)                        \
+  __macro(hipInit)                                  \
+  __macro(hipMemGetAddressRange)                    \
+  __macro(hipMemGetInfo)                            \
+  __macro(hipMemcpyDtoD)                            \
+  __macro(hipMemcpyDtoDAsync)                       \
+  __macro(hipMemcpyDtoH)                            \
+  __macro(hipMemcpyDtoHAsync)                       \
+  __macro(hipMemcpyHtoD)                            \
+  __macro(hipMemcpyHtoDAsync)                       \
+  __macro(hipMemset)                                \
+  __macro(hipMemsetAsync)                           \
+  __macro(hipModuleGetFunction)                     \
+  __macro(hipModuleGetGlobal)                       \
+  __macro(hipModuleLaunchKernel)                    \
+  __macro(hipModuleLoadData)                        \
+  __macro(hipModuleUnload)                          \
+  __macro(hipPointerGetAttributes)                  \
+  __macro(hipSetDevice)                             \
+  __macro(hipStreamAddCallback)                     \
+  __macro(hipStreamCreateWithFlags)                 \
+  __macro(hipStreamDestroy)                         \
+  __macro(hipStreamQuery)                           \
+  __macro(hipStreamSynchronize)                     \
+  __macro(hipStreamWaitEvent)                       \
+// clang-format on
+
+HIP_ROUTINE_EACH(STREAM_EXECUTOR_HIP_WRAP)
+#undef HIP_ROUTINE_EACH
+#undef STREAM_EXECUTOR_HIP_WRAP
+#undef TO_STR
+#undef TO_STR_
+}  // namespace wrap
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DRIVER_WRAPPER_H_
diff --git a/tensorflow/stream_executor/rocm/rocm_event.cc b/tensorflow/stream_executor/rocm/rocm_event.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c0ffd74c177bf5149f98cc045a51559b9acf1d94
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_event.cc
@@ -0,0 +1,46 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/stream_executor/gpu/gpu_event.h"
+#include "tensorflow/stream_executor/gpu/gpu_executor.h"
+#include "tensorflow/stream_executor/gpu/gpu_stream.h"
+#include "tensorflow/stream_executor/lib/statusor.h"
+
+namespace stream_executor {
+namespace gpu {
+
+Event::Status GpuEvent::PollForStatus() {
+  port::StatusOr<hipError_t> status =
+      GpuDriver::QueryEvent(parent_->gpu_context(), gpu_event_);
+  if (!status.ok()) {
+    LOG(ERROR) << "Error polling for event status: "
+               << status.status().error_message();
+    return Event::Status::kError;
+  }
+
+  switch (status.ValueOrDie()) {
+    case hipSuccess:
+      return Event::Status::kComplete;
+    case hipErrorNotReady:
+      return Event::Status::kPending;
+    default:
+      LOG(INFO) << "Error condition returned for event status: "
+                << status.ValueOrDie();
+      return Event::Status::kError;
+  }
+}
+
+}  // namespace gpu
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/rocm/rocm_fft.cc b/tensorflow/stream_executor/rocm/rocm_fft.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2af973309c01ec67de0d7022e5a8cefd18c5063a
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_fft.cc
@@ -0,0 +1,618 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/stream_executor/rocm/rocm_fft.h"
+
+#include <complex>
+
+#include "tensorflow/stream_executor/device_memory.h"
+#include "tensorflow/stream_executor/gpu/gpu_activation.h"
+#include "tensorflow/stream_executor/gpu/gpu_executor.h"
+#include "tensorflow/stream_executor/gpu/gpu_helpers.h"
+#include "tensorflow/stream_executor/gpu/gpu_stream.h"
+#include "tensorflow/stream_executor/lib/env.h"
+#include "tensorflow/stream_executor/lib/initialize.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/plugin_registry.h"
+#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
+#include "tensorflow/stream_executor/stream_executor_internal.h"
+
+namespace stream_executor {
+namespace gpu {
+
+PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kRocFftPlugin);
+
+namespace wrap {
+
+#ifdef PLATFORM_GOOGLE
+// This macro wraps a global identifier, given by __name, in a callable
+// structure that loads the DLL symbol out of the DSO handle in a thread-safe
+// manner on first use. This dynamic loading technique is used to avoid DSO
+// dependencies on vendor libraries which may or may not be available in the
+// deployed binary environment.
+#define STREAM_EXECUTOR_ROCFFT_WRAP(__name)                      \
+  struct WrapperShim__##__name {                                 \
+    template <typename... Args>                                  \
+    hipfftResult operator()(GpuExecutor *parent, Args... args) { \
+      gpu::ScopedActivateExecutorContext sac{parent};            \
+      return ::__name(args...);                                  \
+    }                                                            \
+  } __name;
+
+#else
+
+#define STREAM_EXECUTOR_ROCFFT_WRAP(__name)                               \
+  struct DynLoadShim__##__name {                                          \
+    static const char *kName;                                             \
+    using FuncPtrT = std::add_pointer<decltype(::__name)>::type;          \
+    static void *GetDsoHandle() {                                         \
+      auto s = internal::CachedDsoLoader::GetRocfftDsoHandle();           \
+      return s.ValueOrDie();                                              \
+    }                                                                     \
+    static FuncPtrT LoadOrDie() {                                         \
+      void *f;                                                            \
+      auto s = port::Env::Default()->GetSymbolFromLibrary(GetDsoHandle(), \
+                                                          kName, &f);     \
+      CHECK(s.ok()) << "could not find " << kName                         \
+                    << " in rocfft DSO; dlerror: " << s.error_message();  \
+      return reinterpret_cast<FuncPtrT>(f);                               \
+    }                                                                     \
+    static FuncPtrT DynLoad() {                                           \
+      static FuncPtrT f = LoadOrDie();                                    \
+      return f;                                                           \
+    }                                                                     \
+    template <typename... Args>                                           \
+    hipfftResult operator()(GpuExecutor *parent, Args... args) {          \
+      gpu::ScopedActivateExecutorContext sac{parent};                     \
+      return DynLoad()(args...);                                          \
+    }                                                                     \
+  } __name;                                                               \
+  const char *DynLoadShim__##__name::kName = #__name;
+
+#endif
+
+#define ROCFFT_ROUTINE_EACH(__macro)                                           \
+  __macro(hipfftDestroy) __macro(hipfftSetStream) __macro(hipfftPlan1d)        \
+      __macro(hipfftPlan2d) __macro(hipfftPlan3d) __macro(hipfftPlanMany)      \
+          __macro(hipfftCreate) __macro(hipfftSetAutoAllocation)               \
+              __macro(hipfftSetWorkArea) __macro(hipfftGetSize1d)              \
+                  __macro(hipfftMakePlan1d) __macro(hipfftGetSize2d)           \
+                      __macro(hipfftMakePlan2d) __macro(hipfftGetSize3d)       \
+                          __macro(hipfftMakePlan3d) __macro(hipfftGetSizeMany) \
+                              __macro(hipfftMakePlanMany)                      \
+                                  __macro(hipfftExecD2Z)                       \
+                                      __macro(hipfftExecZ2D)                   \
+                                          __macro(hipfftExecC2C)               \
+                                              __macro(hipfftExecC2R)           \
+                                                  __macro(hipfftExecZ2Z)       \
+                                                      __macro(hipfftExecR2C)
+
+ROCFFT_ROUTINE_EACH(STREAM_EXECUTOR_ROCFFT_WRAP)
+
+}  // namespace wrap
+
+namespace {
+
+// A helper function transforming gpu_fft arguments into rocFFT arguments.
+hipfftType ROCMFftType(fft::Type type) {
+  switch (type) {
+    case fft::Type::kC2CForward:
+    case fft::Type::kC2CInverse:
+      return HIPFFT_C2C;
+    case fft::Type::kC2R:
+      return HIPFFT_C2R;
+    case fft::Type::kR2C:
+      return HIPFFT_R2C;
+    case fft::Type::kZ2ZForward:
+    case fft::Type::kZ2ZInverse:
+      return HIPFFT_Z2Z;
+    case fft::Type::kZ2D:
+      return HIPFFT_Z2D;
+    case fft::Type::kD2Z:
+      return HIPFFT_D2Z;
+    default:
+      LOG(FATAL) << "Invalid value of fft::Type.";
+  }
+}
+
+// Associates the given stream with the given rocFFT plan.
+bool SetStream(GpuExecutor *parent, hipfftHandle plan, Stream *stream) {
+  auto ret = wrap::hipfftSetStream(parent, plan, AsGpuStreamValue(stream));
+  if (ret != HIPFFT_SUCCESS) {
+    LOG(ERROR) << "failed to run rocFFT routine hipfftSetStream: " << ret;
+    return false;
+  }
+  return true;
+}
+
+}  // namespace
+
+port::Status ROCMFftPlan::Initialize(
+    GpuExecutor *parent, Stream *stream, int rank, uint64 *elem_count,
+    uint64 *input_embed, uint64 input_stride, uint64 input_distance,
+    uint64 *output_embed, uint64 output_stride, uint64 output_distance,
+    fft::Type type, int batch_count, ScratchAllocator *scratch_allocator) {
+  if (IsInitialized()) {
+    LOG(FATAL) << "Try to repeatedly initialize.";
+  }
+  is_initialized_ = true;
+  int elem_count_[3], input_embed_[3], output_embed_[3];
+  for (int i = 0; i < rank; ++i) {
+    elem_count_[i] = elem_count[i];
+    if (input_embed) {
+      input_embed_[i] = input_embed[i];
+    }
+    if (output_embed) {
+      output_embed_[i] = output_embed[i];
+    }
+  }
+  parent_ = parent;
+  fft_type_ = type;
+  if (batch_count == 1 && input_embed == nullptr && output_embed == nullptr) {
+    hipfftResult_t ret;
+    if (scratch_allocator == nullptr) {
+      switch (rank) {
+        case 1:
+          // hipfftPlan1d
+          ret = wrap::hipfftPlan1d(parent, &plan_, elem_count_[0],
+                                   ROCMFftType(type), 1 /* = batch */);
+          if (ret != HIPFFT_SUCCESS) {
+            LOG(ERROR) << "failed to create rocFFT 1d plan:" << ret;
+            return port::Status{port::error::INTERNAL,
+                                "Failed to create rocFFT 1d plan."};
+          }
+          return port::Status::OK();
+        case 2:
+          // hipfftPlan2d
+          ret = wrap::hipfftPlan2d(parent, &plan_, elem_count_[0],
+                                   elem_count_[1], ROCMFftType(type));
+          if (ret != HIPFFT_SUCCESS) {
+            LOG(ERROR) << "failed to create rocFFT 2d plan:" << ret;
+            return port::Status{port::error::INTERNAL,
+                                "Failed to create rocFFT 2d plan."};
+          }
+          return port::Status::OK();
+        case 3:
+          // hipfftPlan3d
+          ret =
+              wrap::hipfftPlan3d(parent, &plan_, elem_count_[0], elem_count_[1],
+                                 elem_count_[2], ROCMFftType(type));
+          if (ret != HIPFFT_SUCCESS) {
+            LOG(ERROR) << "failed to create rocFFT 3d plan:" << ret;
+            return port::Status{port::error::INTERNAL,
+                                "Failed to create rocFFT 3d plan."};
+          }
+          return port::Status::OK();
+        default:
+          LOG(ERROR) << "Invalid rank value for hipfftPlan. "
+                        "Requested 1, 2, or 3, given: "
+                     << rank;
+          return port::Status{port::error::INVALID_ARGUMENT,
+                              "hipfftPlan only takes rank 1, 2, or 3."};
+      }
+    } else {
+      ret = wrap::hipfftCreate(parent, &plan_);
+      if (ret != HIPFFT_SUCCESS) {
+        LOG(ERROR) << "failed to create rocFFT plan:" << ret;
+        return port::Status{port::error::INTERNAL,
+                            "Failed to create rocFFT plan."};
+      }
+      ret = wrap::hipfftSetAutoAllocation(parent, plan_, 0);
+      if (ret != HIPFFT_SUCCESS) {
+        LOG(ERROR) << "failed to set auto allocation for rocFFT plan:" << ret;
+        return port::Status{port::error::INTERNAL,
+                            "Failed to set auto allocation for rocFFT plan."};
+      }
+      size_t size_in_bytes;
+      switch (rank) {
+        case 1:
+          ret = wrap::hipfftMakePlan1d(parent, plan_, elem_count_[0],
+                                       ROCMFftType(type), /*batch=*/1,
+                                       &size_in_bytes);
+          if (ret != HIPFFT_SUCCESS) {
+            LOG(ERROR) << "failed to make rocFFT 1d plan:" << ret;
+            return port::Status{port::error::INTERNAL,
+                                "Failed to make rocFFT 1d plan."};
+          }
+          break;
+        case 2:
+          ret = wrap::hipfftMakePlan2d(parent, plan_, elem_count_[0],
+                                       elem_count_[1], ROCMFftType(type),
+                                       &size_in_bytes);
+          if (ret != HIPFFT_SUCCESS) {
+            LOG(ERROR) << "failed to make rocFFT 2d plan:" << ret;
+            return port::Status{port::error::INTERNAL,
+                                "Failed to make rocFFT 2d plan."};
+          }
+          break;
+        case 3:
+          ret = wrap::hipfftMakePlan3d(parent, plan_, elem_count_[0],
+                                       elem_count_[1], elem_count_[2],
+                                       ROCMFftType(type), &size_in_bytes);
+          if (ret != HIPFFT_SUCCESS) {
+            LOG(ERROR) << "failed to make rocFFT 3d plan:" << ret;
+            return port::Status{port::error::INTERNAL,
+                                "Failed to make rocFFT 3d plan."};
+          }
+          break;
+        default:
+          LOG(ERROR) << "Invalid rank value for hipfftPlan. "
+                        "Requested 1, 2, or 3, given: "
+                     << rank;
+          return port::Status{port::error::INVALID_ARGUMENT,
+                              "hipfftPlan only takes rank 1, 2, or 3."};
+      }
+      // TODO(yangzihao): refactor this code and the one with the same function
+      // in the batch mode.
+      if (size_in_bytes != 0) {
+        auto allocated =
+            scratch_allocator->AllocateBytes(stream, size_in_bytes);
+        if (!allocated.ok() || (scratch_ = allocated.ValueOrDie()) == nullptr) {
+          LOG(ERROR) << "failed to allocate work area.";
+          return allocated.status();
+        }
+      }
+      // Connect work area with allocated space.
+      ret = wrap::hipfftSetWorkArea(parent, plan_, scratch_.opaque());
+      if (ret != HIPFFT_SUCCESS) {
+        LOG(ERROR) << "failed to set work area for rocFFT plan:" << ret;
+        return port::Status{port::error::INTERNAL,
+                            "Failed to set work area for rocFFT plan."};
+      }
+      return port::Status::OK();
+    }
+  } else {
+    // For either multiple batches or rank higher than 3, use hipfftPlanMany().
+    if (scratch_allocator == nullptr) {
+      auto ret = wrap::hipfftPlanMany(
+          parent, &plan_, rank, elem_count_,
+          input_embed ? input_embed_ : nullptr, input_stride, input_distance,
+          output_embed ? output_embed_ : nullptr, output_stride,
+          output_distance, ROCMFftType(type), batch_count);
+      if (ret != HIPFFT_SUCCESS) {
+        LOG(ERROR) << "failed to create rocFFT batched plan:" << ret;
+        return port::Status{port::error::INTERNAL,
+                            "Failed to create rocFFT bacthed plan."};
+      }
+    } else {
+      auto ret = wrap::hipfftCreate(parent, &plan_);
+      if (ret != HIPFFT_SUCCESS) {
+        LOG(ERROR) << "failed to create rocFFT batched plan:" << ret;
+        return port::Status{port::error::INTERNAL,
+                            "Failed to create rocFFT bacthed plan."};
+      }
+      ret = wrap::hipfftSetAutoAllocation(parent, plan_, 0);
+      if (ret != HIPFFT_SUCCESS) {
+        LOG(ERROR) << "failed to set auto allocation for rocFFT batched plan:"
+                   << ret;
+        return port::Status{
+            port::error::INTERNAL,
+            "Failed to set auto allocation for rocFFT bacthed plan."};
+      }
+      size_t size_in_bytes;
+      ret = wrap::hipfftMakePlanMany(
+          parent, plan_, rank, elem_count_,
+          input_embed ? input_embed_ : nullptr, input_stride, input_distance,
+          output_embed ? output_embed_ : nullptr, output_stride,
+          output_distance, ROCMFftType(type), batch_count, &size_in_bytes);
+      if (ret != HIPFFT_SUCCESS) {
+        LOG(ERROR) << "failed to make rocFFT batched plan:" << ret;
+        return port::Status{port::error::INTERNAL,
+                            "Failed to make rocFFT bacthed plan."};
+      }
+      if (size_in_bytes != 0) {
+        auto allocated =
+            scratch_allocator->AllocateBytes(stream, size_in_bytes);
+        if (!allocated.ok() || (scratch_ = allocated.ValueOrDie()) == nullptr) {
+          LOG(ERROR) << "failed to allocate work area.";
+          return allocated.status();
+        }
+      }
+      // Connect work area with allocated space.
+      ret = wrap::hipfftSetWorkArea(parent, plan_, scratch_.opaque());
+      if (ret != HIPFFT_SUCCESS) {
+        LOG(ERROR) << "failed to set work area for rocFFT batched plan:" << ret;
+        return port::Status{port::error::INTERNAL,
+                            "Failed to set work area for rocFFT bacthed plan."};
+      }
+    }
+  }
+  return port::Status::OK();
+}
+
+port::Status ROCMFftPlan::Initialize(GpuExecutor *parent, Stream *stream,
+                                     int rank, uint64 *elem_count,
+                                     fft::Type type,
+                                     ScratchAllocator *scratch_allocator) {
+  return Initialize(parent_, stream, rank, elem_count,
+                    /*input_embed=*/nullptr, /*input_stride=*/0,
+                    /*input_distance=*/0,
+                    /*output_embed=*/nullptr, /*output_stride=*/0,
+                    /*output_distance=*/0, type, 1, scratch_allocator);
+}
+
+ROCMFftPlan::~ROCMFftPlan() { wrap::hipfftDestroy(parent_, plan_); }
+
+int ROCMFftPlan::GetFftDirection() const {
+  if (!IsInitialized()) {
+    LOG(FATAL) << "Try to get fft direction before initialization.";
+  } else {
+    switch (fft_type_) {
+      case fft::Type::kC2CForward:
+      case fft::Type::kZ2ZForward:
+      case fft::Type::kR2C:
+      case fft::Type::kD2Z:
+        return HIPFFT_FORWARD;
+      case fft::Type::kC2CInverse:
+      case fft::Type::kZ2ZInverse:
+      case fft::Type::kC2R:
+      case fft::Type::kZ2D:
+        return HIPFFT_BACKWARD;
+      default:
+        LOG(FATAL) << "Invalid value of fft::Type.";
+    }
+  }
+}
+
+std::unique_ptr<fft::Plan> ROCMFft::Create1dPlan(Stream *stream, uint64 num_x,
+                                                 fft::Type type,
+                                                 bool in_place_fft) {
+  std::unique_ptr<ROCMFftPlan> fft_plan_ptr{new ROCMFftPlan()};
+  uint64 elem_count[1] = {num_x};
+  port::Status status = fft_plan_ptr->Initialize(
+      parent_, stream, 1, elem_count, type, /*scratch_allocator=*/nullptr);
+  // TODO(yangzihao): In the future, send error msg back to TensorFlow
+  // so it can fail gracefully,
+  if (!status.ok()) {
+    LOG(FATAL) << "failed to initialize hipfft 1d plan: "
+               << status.error_message();
+  }
+  return std::move(fft_plan_ptr);
+}
+
+std::unique_ptr<fft::Plan> ROCMFft::Create1dPlanWithScratchAllocator(
+    Stream *stream, uint64 num_x, fft::Type type, bool in_place_fft,
+    ScratchAllocator *scratch_allocator) {
+  std::unique_ptr<ROCMFftPlan> fft_plan_ptr{new ROCMFftPlan()};
+  uint64 elem_count[1] = {num_x};
+  port::Status status = fft_plan_ptr->Initialize(parent_, stream, 1, elem_count,
+                                                 type, scratch_allocator);
+  if (!status.ok()) {
+    LOG(FATAL)
+        << "failed to initialize hipfft 1d plan with customized allocator: "
+        << status.error_message();
+  }
+  return std::move(fft_plan_ptr);
+}
+
+std::unique_ptr<fft::Plan> ROCMFft::Create2dPlan(Stream *stream, uint64 num_x,
+                                                 uint64 num_y, fft::Type type,
+                                                 bool in_place_fft) {
+  std::unique_ptr<ROCMFftPlan> fft_plan_ptr{new ROCMFftPlan()};
+  uint64 elem_count[2] = {num_x, num_y};
+  port::Status status = fft_plan_ptr->Initialize(
+      parent_, stream, 1, elem_count, type, /*scratch_allocator=*/nullptr);
+  if (!status.ok()) {
+    LOG(FATAL) << "failed to initialize hipfft 2d plan: "
+               << status.error_message();
+  }
+  return std::move(fft_plan_ptr);
+}
+
+std::unique_ptr<fft::Plan> ROCMFft::Create2dPlanWithScratchAllocator(
+    Stream *stream, uint64 num_x, uint64 num_y, fft::Type type,
+    bool in_place_fft, ScratchAllocator *scratch_allocator) {
+  std::unique_ptr<ROCMFftPlan> fft_plan_ptr{new ROCMFftPlan()};
+  uint64 elem_count[2] = {num_x, num_y};
+  port::Status status = fft_plan_ptr->Initialize(parent_, stream, 2, elem_count,
+                                                 type, scratch_allocator);
+  if (!status.ok()) {
+    LOG(FATAL)
+        << "failed to initialize hipfft 2d plan with customized allocator: "
+        << status.error_message();
+  }
+  return std::move(fft_plan_ptr);
+}
+
+std::unique_ptr<fft::Plan> ROCMFft::Create3dPlan(Stream *stream, uint64 num_x,
+                                                 uint64 num_y, uint64 num_z,
+                                                 fft::Type type,
+                                                 bool in_place_fft) {
+  std::unique_ptr<ROCMFftPlan> fft_plan_ptr{new ROCMFftPlan()};
+  uint64 elem_count[3] = {num_x, num_y, num_z};
+  port::Status status = fft_plan_ptr->Initialize(
+      parent_, stream, 3, elem_count, type, /*scratch_allocator=*/nullptr);
+  if (!status.ok()) {
+    LOG(FATAL) << "failed to initialize hipfft 3d plan: "
+               << status.error_message();
+  }
+  return std::move(fft_plan_ptr);
+}
+
+std::unique_ptr<fft::Plan> ROCMFft::Create3dPlanWithScratchAllocator(
+    Stream *stream, uint64 num_x, uint64 num_y, uint64 num_z, fft::Type type,
+    bool in_place_fft, ScratchAllocator *scratch_allocator) {
+  std::unique_ptr<ROCMFftPlan> fft_plan_ptr{new ROCMFftPlan()};
+  uint64 elem_count[3] = {num_x, num_y, num_z};
+  port::Status status = fft_plan_ptr->Initialize(parent_, stream, 3, elem_count,
+                                                 type, scratch_allocator);
+  if (!status.ok()) {
+    LOG(FATAL)
+        << "failed to initialize hipfft 3d plan with customized allocator: "
+        << status.error_message();
+  }
+  return std::move(fft_plan_ptr);
+}
+
+std::unique_ptr<fft::Plan> ROCMFft::CreateBatchedPlan(
+    Stream *stream, int rank, uint64 *elem_count, uint64 *input_embed,
+    uint64 input_stride, uint64 input_distance, uint64 *output_embed,
+    uint64 output_stride, uint64 output_distance, fft::Type type,
+    bool in_place_fft, int batch_count) {
+  std::unique_ptr<ROCMFftPlan> fft_plan_ptr{new ROCMFftPlan()};
+  port::Status status = fft_plan_ptr->Initialize(
+      parent_, stream, rank, elem_count, input_embed, input_stride,
+      input_distance, output_embed, output_stride, output_distance, type,
+      batch_count, /*scratch_allocator=*/nullptr);
+  if (!status.ok()) {
+    LOG(FATAL) << "failed to initialize batched hipfft plan: "
+               << status.error_message();
+  }
+
+  return std::move(fft_plan_ptr);
+}
+
+std::unique_ptr<fft::Plan> ROCMFft::CreateBatchedPlanWithScratchAllocator(
+    Stream *stream, int rank, uint64 *elem_count, uint64 *input_embed,
+    uint64 input_stride, uint64 input_distance, uint64 *output_embed,
+    uint64 output_stride, uint64 output_distance, fft::Type type,
+    bool in_place_fft, int batch_count, ScratchAllocator *scratch_allocator) {
+  std::unique_ptr<ROCMFftPlan> fft_plan_ptr{new ROCMFftPlan()};
+  port::Status status = fft_plan_ptr->Initialize(
+      parent_, stream, rank, elem_count, input_embed, input_stride,
+      input_distance, output_embed, output_stride, output_distance, type,
+      batch_count, scratch_allocator);
+  if (!status.ok()) {
+    LOG(FATAL) << "failed to initialize batched hipfft plan with customized "
+                  "allocator: "
+               << status.error_message();
+  }
+  return std::move(fft_plan_ptr);
+}
+
+void ROCMFft::UpdatePlanWithScratchAllocator(
+    Stream *stream, fft::Plan *plan, ScratchAllocator *scratch_allocator) {
+  LOG(ERROR) << "update plan with scratch allocator not implemented";
+}
+
+template <typename FuncT, typename InputT, typename OutputT>
+bool ROCMFft::DoFftInternal(Stream *stream, fft::Plan *plan, FuncT hipfftExec,
+                            const DeviceMemory<InputT> &input,
+                            DeviceMemory<OutputT> *output) {
+  ROCMFftPlan *rocm_fft_plan = dynamic_cast<ROCMFftPlan *>(plan);
+  if (rocm_fft_plan == nullptr) {
+    LOG(ERROR) << "the passed-in plan is not a ROCMFftPlan object.";
+    return false;
+  }
+
+  if (!SetStream(parent_, rocm_fft_plan->GetPlan(), stream)) {
+    return false;
+  }
+
+  auto ret = hipfftExec(parent_, rocm_fft_plan->GetPlan(),
+                        GpuComplex(const_cast<InputT *>(GpuMemory(input))),
+                        GpuComplex(GpuMemoryMutable(output)));
+
+  if (ret != HIPFFT_SUCCESS) {
+    LOG(ERROR) << "failed to run rocFFT routine: " << ret;
+    return false;
+  }
+
+  return true;
+}
+
+template <typename FuncT, typename InputT, typename OutputT>
+bool ROCMFft::DoFftWithDirectionInternal(Stream *stream, fft::Plan *plan,
+                                         FuncT hipfftExec,
+                                         const DeviceMemory<InputT> &input,
+                                         DeviceMemory<OutputT> *output) {
+  ROCMFftPlan *rocm_fft_plan = dynamic_cast<ROCMFftPlan *>(plan);
+  if (rocm_fft_plan == nullptr) {
+    LOG(ERROR) << "the passed-in plan is not a ROCMFftPlan object.";
+    return false;
+  }
+
+  if (!SetStream(parent_, rocm_fft_plan->GetPlan(), stream)) {
+    return false;
+  }
+
+  auto ret = hipfftExec(parent_, rocm_fft_plan->GetPlan(),
+                        GpuComplex(const_cast<InputT *>(GpuMemory(input))),
+                        GpuComplex(GpuMemoryMutable(output)),
+                        rocm_fft_plan->GetFftDirection());
+
+  if (ret != HIPFFT_SUCCESS) {
+    LOG(ERROR) << "failed to run rocFFT routine: " << ret;
+    return false;
+  }
+
+  return true;
+}
+
+#define STREAM_EXECUTOR_ROCM_DEFINE_FFT(__type, __fft_type1, __fft_type2,    \
+                                        __fft_type3)                         \
+  bool ROCMFft::DoFft(Stream *stream, fft::Plan *plan,                       \
+                      const DeviceMemory<std::complex<__type>> &input,       \
+                      DeviceMemory<std::complex<__type>> *output) {          \
+    return DoFftWithDirectionInternal(                                       \
+        stream, plan, wrap::hipfftExec##__fft_type1, input, output);         \
+  }                                                                          \
+  bool ROCMFft::DoFft(Stream *stream, fft::Plan *plan,                       \
+                      const DeviceMemory<__type> &input,                     \
+                      DeviceMemory<std::complex<__type>> *output) {          \
+    return DoFftInternal(stream, plan, wrap::hipfftExec##__fft_type2, input, \
+                         output);                                            \
+  }                                                                          \
+  bool ROCMFft::DoFft(Stream *stream, fft::Plan *plan,                       \
+                      const DeviceMemory<std::complex<__type>> &input,       \
+                      DeviceMemory<__type> *output) {                        \
+    return DoFftInternal(stream, plan, wrap::hipfftExec##__fft_type3, input, \
+                         output);                                            \
+  }
+
+STREAM_EXECUTOR_ROCM_DEFINE_FFT(float, C2C, R2C, C2R)
+STREAM_EXECUTOR_ROCM_DEFINE_FFT(double, Z2Z, D2Z, Z2D)
+
+#undef STREAM_EXECUTOR_ROCM_DEFINE_FFT
+
+}  // namespace gpu
+
+void initialize_rocfft() {
+  auto rocFftAlreadyRegistered = PluginRegistry::Instance()->HasFactory(
+      rocm::kROCmPlatformId, PluginKind::kFft, gpu::kRocFftPlugin);
+
+  if (!rocFftAlreadyRegistered) {
+    port::Status status =
+        PluginRegistry::Instance()->RegisterFactory<PluginRegistry::FftFactory>(
+            rocm::kROCmPlatformId, gpu::kRocFftPlugin, "rocFFT",
+            [](internal::StreamExecutorInterface *parent) -> fft::FftSupport * {
+              gpu::GpuExecutor *rocm_executor =
+                  dynamic_cast<gpu::GpuExecutor *>(parent);
+              if (rocm_executor == nullptr) {
+                LOG(ERROR)
+                    << "Attempting to initialize an instance of the rocFFT "
+                    << "support library with a non-ROCM StreamExecutor";
+                return nullptr;
+              }
+
+              return new gpu::ROCMFft(rocm_executor);
+            });
+    if (!status.ok()) {
+      LOG(ERROR) << "Unable to register rocFFT factory: "
+                 << status.error_message();
+    }
+
+    PluginRegistry::Instance()->SetDefaultFactory(
+        rocm::kROCmPlatformId, PluginKind::kFft, gpu::kRocFftPlugin);
+  }
+}
+
+}  // namespace stream_executor
+
+REGISTER_MODULE_INITIALIZER(register_rocfft,
+                            { stream_executor::initialize_rocfft(); });
diff --git a/tensorflow/stream_executor/rocm/rocm_fft.h b/tensorflow/stream_executor/rocm/rocm_fft.h
new file mode 100644
index 0000000000000000000000000000000000000000..7086d8a4b129a5807fdbde6d9ace6ee437edc3ce
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_fft.h
@@ -0,0 +1,132 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// ROCM-specific support for FFT functionality -- this wraps the rocFFT library
+// capabilities, and is only included into ROCM implementation code -- it will
+// not introduce rocm headers into other code.
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_FFT_H_
+#define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_FFT_H_
+
+#include "rocm/include/rocfft/hipfft.h"
+#include "tensorflow/stream_executor/fft.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/plugin_registry.h"
+#include "tensorflow/stream_executor/scratch_allocator.h"
+
+namespace stream_executor {
+
+class Stream;
+
+namespace gpu {
+
+class GpuExecutor;
+
+// Opaque and unique indentifier for the rocFFT plugin.
+extern const PluginId kRocFftPlugin;
+
+// ROCMFftPlan uses deferred initialization. Only a single call of
+// Initialize() is allowed to properly create hipfft plan and set member
+// variable is_initialized_ to true. Newly added interface that uses member
+// variables should first check is_initialized_ to make sure that the values of
+// member variables are valid.
+class ROCMFftPlan : public fft::Plan {
+ public:
+  ROCMFftPlan()
+      : parent_(nullptr),
+        plan_(),
+        fft_type_(fft::Type::kInvalid),
+        scratch_(nullptr),
+        is_initialized_(false) {}
+  ~ROCMFftPlan() override;
+
+  // Get FFT direction in hipFFT based on FFT type.
+  int GetFftDirection() const;
+  hipfftHandle GetPlan() const {
+    if (IsInitialized()) {
+      return plan_;
+    } else {
+      LOG(FATAL) << "Try to get hipfftHandle value before initialization.";
+    }
+  }
+
+  // Initialize function for batched plan
+  port::Status Initialize(GpuExecutor *parent, Stream *stream, int rank,
+                          uint64 *elem_count, uint64 *input_embed,
+                          uint64 input_stride, uint64 input_distance,
+                          uint64 *output_embed, uint64 output_stride,
+                          uint64 output_distance, fft::Type type,
+                          int batch_count, ScratchAllocator *scratch_allocator);
+
+  // Initialize function for 1d,2d, and 3d plan
+  port::Status Initialize(GpuExecutor *parent, Stream *stream, int rank,
+                          uint64 *elem_count, fft::Type type,
+                          ScratchAllocator *scratch_allocator);
+
+ protected:
+  bool IsInitialized() const { return is_initialized_; }
+
+ private:
+  GpuExecutor *parent_;
+  hipfftHandle plan_;
+  fft::Type fft_type_;
+  DeviceMemory<uint8> scratch_;
+  bool is_initialized_;
+};
+
+// FFT support for ROCM platform via rocFFT library.
+//
+// This satisfies the platform-agnostic FftSupport interface.
+//
+// Note that the hipFFT handle that this encapsulates is implicitly tied to the
+// context (and, as a result, the device) that the parent GpuExecutor is tied
+// to. This simply happens as an artifact of creating the hipFFT handle when a
+// ROCM context is active.
+//
+// Thread-safe. The ROCM context associated with all operations is the ROCM
+// context of parent_, so all context is explicit.
+class ROCMFft : public fft::FftSupport {
+ public:
+  explicit ROCMFft(GpuExecutor *parent) : parent_(parent) {}
+  ~ROCMFft() override {}
+
+  TENSORFLOW_STREAM_EXECUTOR_GPU_FFT_SUPPORT_OVERRIDES
+
+ private:
+  GpuExecutor *parent_;
+
+  // Two helper functions that execute dynload::hipfftExec?2?.
+
+  // This is for complex to complex FFT, when the direction is required.
+  template <typename FuncT, typename InputT, typename OutputT>
+  bool DoFftWithDirectionInternal(Stream *stream, fft::Plan *plan,
+                                  FuncT hipfft_exec,
+                                  const DeviceMemory<InputT> &input,
+                                  DeviceMemory<OutputT> *output);
+
+  // This is for complex to real or real to complex FFT, when the direction
+  // is implied.
+  template <typename FuncT, typename InputT, typename OutputT>
+  bool DoFftInternal(Stream *stream, fft::Plan *plan, FuncT hipfft_exec,
+                     const DeviceMemory<InputT> &input,
+                     DeviceMemory<OutputT> *output);
+
+  SE_DISALLOW_COPY_AND_ASSIGN(ROCMFft);
+};
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_FFT_H_
diff --git a/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc b/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0f6af695dfdf7ee8623478fbc2250c2fb7d39e64
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc
@@ -0,0 +1,977 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <unistd.h>
+
+#include "absl/base/casts.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
+#include "tensorflow/stream_executor/gpu/gpu_driver.h"
+#include "tensorflow/stream_executor/gpu/gpu_event.h"
+#include "tensorflow/stream_executor/gpu/gpu_executor.h"
+#include "tensorflow/stream_executor/gpu/gpu_stream.h"
+#include "tensorflow/stream_executor/gpu/gpu_timer.h"
+#include "tensorflow/stream_executor/kernel_cache_config.h"
+#include "tensorflow/stream_executor/lib/env.h"
+#include "tensorflow/stream_executor/lib/error.h"
+#include "tensorflow/stream_executor/lib/initialize.h"
+#include "tensorflow/stream_executor/lib/mathutil.h"
+#include "tensorflow/stream_executor/lib/numbers.h"
+#include "tensorflow/stream_executor/lib/path.h"
+#include "tensorflow/stream_executor/lib/process_state.h"
+#include "tensorflow/stream_executor/lib/ptr_util.h"
+#include "tensorflow/stream_executor/lib/statusor.h"
+#include "tensorflow/stream_executor/lib/str_util.h"
+#include "tensorflow/stream_executor/lib/stringprintf.h"
+#include "tensorflow/stream_executor/platform.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/plugin_registry.h"
+#include "tensorflow/stream_executor/rocm/rocm_diagnostics.h"
+#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
+#include "tensorflow/stream_executor/stream.h"
+#include "tensorflow/stream_executor/stream_executor_internal.h"
+#include "tensorflow/stream_executor/stream_executor_pimpl.h"
+#include "tensorflow/stream_executor/timer.h"
+
+#ifdef PLATFORMS_GPUS_ROCM_DYNAMIC_LIBROCM_DYNAMIC_LIBROCM_H_
+#error \
+    "No driver calls in this file, wrap driver functionality in rocm_driver.cc."
+#endif
+
+#ifdef __ROCM_RUNTIME_H__
+#error \
+    "ROCM runtime being included into ROCM GPU executor; should be driver only."
+#endif
+
+namespace stream_executor {
+namespace gpu {
+
+static GpuEvent* AsGpuEvent(Event* event) {
+  DCHECK(event != nullptr);
+  return static_cast<GpuEvent*>(event->implementation());
+}
+
+// Given a platform-independent timer datatype, returns the internal ROCM
+// platform implementation pointer.
+static GpuTimer* AsGpuTimer(Timer* timer) {
+  DCHECK(timer != nullptr);
+  return static_cast<GpuTimer*>(timer->implementation());
+}
+
+// Given const GPU memory, returns a librocm device pointer datatype, suitable
+// for passing directly to librocm APIs.
+//
+// N.B. we must lose constness in order to pass a suitable type to the existing
+// librocm APIs, so the caller should take care to only pass the result of const
+// GPU memory conversions to librocm functions which will honor constness.
+static hipDeviceptr_t AsROCmDevicePtr(const DeviceMemoryBase& gpu_mem) {
+  return const_cast<hipDeviceptr_t>(gpu_mem.opaque());
+}
+
+// See description on const version above.
+static hipDeviceptr_t AsROCmDevicePtr(DeviceMemoryBase* gpu_mem) {
+  return AsROCmDevicePtr(*gpu_mem);
+}
+
+static GpuContext* GetGpuContext(Stream* stream) {
+  return static_cast<GpuExecutor*>(stream->parent()->implementation())
+      ->gpu_context();
+}
+
+GpuContext* ExtractGpuContext(GpuExecutor* rocm_exec) {
+  CHECK(rocm_exec != nullptr);
+  return rocm_exec->gpu_context();
+}
+
+GpuExecutor* ExtractGpuExecutor(StreamExecutor* stream_exec) {
+  return static_cast<GpuExecutor*>(stream_exec->implementation());
+}
+
+GpuExecutor::~GpuExecutor() {
+  for (auto& it : disk_modules_) {
+    GpuDriver::UnloadModule(context_, it.second);
+  }
+  for (auto& it : in_memory_modules_) {
+    GpuDriver::UnloadModule(context_, it.second);
+  }
+  if (context_ != nullptr) {
+    GpuDriver::DestroyContext(context_);
+  }
+  CHECK(gpu_binary_to_module_.empty()) << "GpuExecutor has loaded modules.";
+}
+bool GpuExecutor::UnloadModule(ModuleHandle module_handle) {
+  const char* gpu_binary = reinterpret_cast<const char*>(module_handle.id());
+  mutex_lock lock{in_memory_modules_mu_};
+  return UnloadGpuBinary(gpu_binary);
+}
+
+bool GpuExecutor::UnloadGpuBinary(const void* gpu_binary) {
+  auto module_it = gpu_binary_to_module_.find(gpu_binary);
+  if (gpu_binary_to_module_.end() == module_it) {
+    VLOG(3) << "No loaded  HSACO module for " << gpu_binary;
+    return false;
+  }
+  auto& module = module_it->second.first;
+  auto& refcount = module_it->second.second;
+  VLOG(3) << "Found HSACO module " << module << " with refcount " << refcount;
+  if (--refcount == 0) {
+    VLOG(3) << "Unloading  HSACO module " << module;
+    GpuDriver::UnloadModule(context_, module);
+    gpu_binary_to_module_.erase(module_it);
+  }
+  return true;
+}
+
+void GpuExecutor::UnloadKernel(const KernelBase* kernel) {
+  LOG(FATAL) << "Feature not supported on ROCM platform (UnloadKernel)";
+}
+
+port::Status GpuExecutor::Init(int device_ordinal,
+                               DeviceOptions device_options) {
+  device_ordinal_ = device_ordinal;
+
+  auto status = GpuDriver::Init();
+  if (!status.ok()) {
+    return status;
+  }
+
+  status = GpuDriver::GetDevice(device_ordinal_, &device_);
+  if (!status.ok()) {
+    return status;
+  }
+
+  status = GpuDriver::CreateContext(device_ordinal_, device_, device_options,
+                                    &context_);
+  if (!status.ok()) {
+    return status;
+  }
+
+  return GpuDriver::GetGpuISAVersion(&version_, device_);
+}
+
+bool GpuExecutor::FindOnDiskForComputeCapability(
+    absl::string_view filename, absl::string_view canonical_suffix,
+    string* found_filename) const {
+  LOG(FATAL) << "Feature not supported on ROCM platform "
+                "(FindOnDiskForComputeCapability)";
+  return false;
+}
+
+bool GpuExecutor::FindOnDiskForISAVersion(absl::string_view filename,
+                                          absl::string_view canonical_suffix,
+                                          string* found_filename) const {
+  if (version_ == 0) {
+    return false;
+  }
+
+  string cc_specific =
+      absl::StrCat(filename, ".cc", version_, canonical_suffix);
+  if (port::FileExists(cc_specific).ok()) {
+    VLOG(2) << "found AMDGPU ISA version-specific file, using that: "
+            << cc_specific;
+    *found_filename = cc_specific;
+    return true;
+  }
+
+  VLOG(2) << "could not find AMDGPU ISA version-specific file at: "
+          << cc_specific;
+  if (port::FileExists(string(filename)).ok()) {
+    *found_filename = string(filename);
+    return true;
+  }
+
+  return false;
+}
+
+// Returns the path to the running executable.
+// N.B. Derived from //knowledge/smalltalk/background_kb.cc
+// Arg: strip_exe: if true, remove the name of the executable itself from the
+//                 returned string. Example: calling this from /usr/bin/foo
+//                 would return /usr/bin.
+static string GetBinaryDir(bool strip_exe) {
+  char exe_path[PATH_MAX] = {0};
+  CHECK_ERR(readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1));
+  // Make sure it's null-terminated:
+  exe_path[sizeof(exe_path) - 1] = 0;
+
+  if (strip_exe) {
+    // The exe is the last component of the path, so remove one component.
+    string ret = exe_path;
+    std::vector<string> components = port::Split(exe_path, '/');
+    components.pop_back();
+    return port::Join(components, "/");
+  }
+  return exe_path;
+}
+
+bool GpuExecutor::GetKernel(const MultiKernelLoaderSpec& spec,
+                            KernelBase* kernel) {
+  GpuKernel* rocm_kernel = AsGpuKernel(kernel);
+  hipModule_t module = nullptr;
+  const string* kernelname;
+
+  const OnDiskKernelLoaderSpec* on_disk_spec = nullptr;
+  bool has_cubin = spec.has_cuda_cubin_on_disk();
+  if (has_cubin) {
+    on_disk_spec = &spec.cuda_cubin_on_disk();
+  }
+
+  if (on_disk_spec != nullptr) {
+    LOG(WARNING) << "loading ROCM kernel from disk is not supported";
+    return false;
+  } else if (spec.has_cuda_cubin_in_memory()) {
+    kernelname = &spec.cuda_cubin_in_memory().kernelname();
+
+    const char* hsaco = spec.cuda_cubin_in_memory().bytes();
+    mutex_lock lock{in_memory_modules_mu_};
+    module = in_memory_modules_[hsaco];
+
+    if (module == nullptr) {
+      if (!GpuDriver::LoadHsaco(context_, hsaco, &module)) {
+        LOG(ERROR) << "failed to load HSACO\n";
+        return false;
+      }
+      in_memory_modules_[hsaco] = module;
+    }
+  } else {
+    LOG(WARNING) << "no method of loading ROCM kernel provided";
+    return false;
+  }
+
+  VLOG(2) << "getting function " << *kernelname << " from module " << module;
+  if (!GpuDriver::GetModuleFunction(context_, module, kernelname->c_str(),
+                                    rocm_kernel->gpu_function_ptr())) {
+    return false;
+  }
+
+  // We have to trust the kernel loader spec arity because there doesn't appear
+  // to be a way to reflect on the number of expected arguments w/the ROCM API.
+  rocm_kernel->set_arity(spec.arity());
+
+  KernelMetadata kernel_metadata;
+  if (!GetKernelMetadata(rocm_kernel, &kernel_metadata)) {
+    LOG(WARNING) << "Unable to get metadata for kernel " << kernelname;
+  }
+  kernel->set_metadata(kernel_metadata);
+  kernel->set_name(*kernelname);
+  return true;
+}
+
+bool GpuExecutor::GetKernelMetadata(GpuKernel* rocm_kernel,
+                                    KernelMetadata* kernel_metadata) {
+  int value = 0;
+  // TODO(ROCm) implement this feature in HIP
+  kernel_metadata->set_registers_per_thread(value);
+
+  // TODO(ROCm) implement this feature in HIP
+  kernel_metadata->set_shared_memory_bytes(value);
+
+  return true;
+}
+
+bool GpuExecutor::Launch(Stream* stream, const ThreadDim& thread_dims,
+                         const BlockDim& block_dims, const KernelBase& kernel,
+                         const KernelArgsArrayBase& args) {
+  CHECK_EQ(kernel.Arity(), args.number_of_arguments());
+  GpuStreamHandle hipstream = AsGpuStreamValue(stream);
+  const GpuKernel* rocm_kernel = AsGpuKernel(&kernel);
+  hipFunction_t hipfunc = rocm_kernel->AsGpuFunctionHandle();
+
+  // Only perform/print the occupancy check once.  Even just checking to see
+  // whether we've done an occupancy check on this kernel before isn't free
+  // (because we have to synchronize), so we only do this at -v 2+.
+  if (VLOG_IS_ON(2)) {
+    mutex_lock lock(launched_kernels_mu_);
+    if (!launched_kernels_.count(hipfunc)) {
+      VlogOccupancyInfo(kernel, thread_dims, block_dims);
+      // TODO(rspringer): Remove elements from launched_kernels_...if we ever
+      // expose a kernel/module deallocation method.
+      launched_kernels_.insert(hipfunc);
+    }
+  }
+
+  if (rocm_kernel->GetPreferredCacheConfig() !=
+      KernelCacheConfig::kNoPreference) {
+    GpuDriver::FuncSetCacheConfig(hipfunc, rocm_kernel->GetGpuCacheConfig());
+  }
+
+  // prepare kernargs
+  // KernelArgsArrayBase keeps the pointer of arguments
+  // deference them here
+  std::vector<void*> kernargs;
+  KernelArgIterator iter = args.arg_iterator();
+  while (iter.has_next()) {
+    KernelArg arg = iter.next();
+    VLOG(2) << "*(arg.address): "
+            << reinterpret_cast<void*>(
+                   *static_cast<const uint64_t*>(arg.address));
+    kernargs.push_back(
+        reinterpret_cast<void*>(*static_cast<const uint64_t*>(arg.address)));
+  }
+
+  size_t size = sizeof(void*) * kernargs.size();
+  void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, kernargs.data(),
+                    HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, HIP_LAUNCH_PARAM_END};
+
+  if (!GpuDriver::LaunchKernel(
+          GetGpuContext(stream), hipfunc, block_dims.x, block_dims.y,
+          block_dims.z, thread_dims.x, thread_dims.y, thread_dims.z,
+          args.number_of_shared_bytes(), hipstream, nullptr, (void**)&config)) {
+    LOG(ERROR) << "failed to launch ROCM kernel with args: "
+               << args.number_of_arguments()
+               << "; thread dim: " << thread_dims.ToString()
+               << "; block dim: " << block_dims.ToString();
+    return false;
+  }
+
+  return true;
+}
+
+int GpuExecutor::CalculateOccupancy(const DeviceDescription& device_description,
+                                    uint64 registers_per_thread,
+                                    uint64 shared_memory_per_block,
+                                    const ThreadDim& thread_dims,
+                                    GpuFunctionHandle func) {
+  LOG(FATAL) << "Feature not supported on ROCM platform (CalculateOccupancy)";
+  return 0;
+}
+
+int GpuExecutor::CompareOccupancy(int* initial_blocks,
+                                  const DeviceDescription& device_description,
+                                  uint64 registers_per_thread,
+                                  uint64 shared_memory_per_block,
+                                  const ThreadDim& thread_dims,
+                                  GpuFunctionHandle func) {
+  LOG(FATAL) << "Feature not supported on ROCM platform (CompareOccupancy)";
+  return 0;
+}
+
+bool GpuExecutor::LoadModule(const MultiModuleLoaderSpec& spec,
+                             ModuleHandle* module_handle) {
+  // In GpuExecutor we store the pointer to the  HSACO binary  as
+  // ModuleHandle::id().
+  hipModule_t hip_module = nullptr;
+  // TODO(ROCm): Need  generic term instead of cubin/cuda/ptx
+  if (spec.has_cuda_cubin_in_memory()) {
+    mutex_lock lock{in_memory_modules_mu_};
+    if (!LoadModuleFromHsaco(
+            reinterpret_cast<const char*>(spec.cuda_cubin_in_memory().data()),
+            &hip_module)) {
+      return false;
+    }
+    *module_handle = ModuleHandle(const_cast<void*>(
+        static_cast<const void*>(spec.cuda_cubin_in_memory().data())));
+    return true;
+  } else {
+    LOG(ERROR) << "No HSACO binary found \n";
+    return false;
+  }
+}
+
+bool GpuExecutor::LoadModuleFromCuBin(const char* cubin, hipModule_t* module) {
+  LOG(FATAL) << "Feature not supported on ROCM platform (LoadModuleFromCuBin)";
+  return false;
+}
+
+bool GpuExecutor::LoadModuleFromPtx(const char* ptx, hipModule_t* module) {
+  LOG(FATAL) << "Feature not supported on ROCM platform (LoadModuleFromPtx)";
+  return false;
+}
+
+bool GpuExecutor::LoadModuleFromHsaco(const char* hsaco, hipModule_t* module) {
+  uint64_t module_refcount;
+  std::tie(*module, module_refcount) = gpu_binary_to_module_[hsaco];
+
+  if (*module == nullptr) {
+    if (!GpuDriver::LoadHsaco(context_, hsaco, module)) {
+      LOG(ERROR) << "failed to load : HSACO \n";
+      return false;
+    }
+    module_refcount = 1;
+    VLOG(3) << "Loaded HSACO " << static_cast<const void*>(hsaco)
+            << " as module " << *module;
+  } else {
+    ++module_refcount;
+    VLOG(3) << "HSACO " << static_cast<const void*>(hsaco)
+            << " is already loaded as module " << *module;
+  }
+  gpu_binary_to_module_[hsaco] = {*module, module_refcount};
+  return true;
+}
+
+// This is a non-essential operation; if there's a failure, proceed without
+// logging an error. It's nearly certain that in case of failures, we'd never
+// get here in the first place; these are very low-impact routines.
+void GpuExecutor::VlogOccupancyInfo(const KernelBase& kernel,
+                                    const ThreadDim& thread_dims,
+                                    const BlockDim& block_dims) {
+  // TODO(ROCm) implement this feature in HIP
+}
+
+void* GpuExecutor::Allocate(uint64 size) {
+  return GpuDriver::DeviceAllocate(context_, size);
+}
+
+void* GpuExecutor::AllocateSubBuffer(DeviceMemoryBase* mem, uint64 offset_bytes,
+                                     uint64 size_bytes) {
+  // offset and size are in bytes, so char* works as the pointer type.
+  return reinterpret_cast<char*>(mem->opaque()) + offset_bytes;
+}
+
+void GpuExecutor::Deallocate(DeviceMemoryBase* mem) {
+  // ROCM "sub-buffers" are just pointer + offset, so no dealloc is necessary.
+  if (!mem->is_sub_buffer()) {
+    GpuDriver::DeviceDeallocate(context_, mem->opaque());
+  }
+}
+
+bool GpuExecutor::HostMemoryRegister(void* location, uint64 size) {
+  if (location == nullptr || size == 0) {
+    LOG(WARNING) << "attempting to register null or zero-sized memory: "
+                 << location << "; size " << size;
+  }
+  VLOG(2) << "registering " << location << " size " << size;
+  return GpuDriver::HostRegister(context_, location, size);
+}
+
+bool GpuExecutor::HostMemoryUnregister(void* location) {
+  VLOG(2) << "unregistering " << location;
+  return GpuDriver::HostUnregister(context_, location);
+}
+
+bool GpuExecutor::SynchronizeAllActivity() {
+  return GpuDriver::SynchronizeContext(context_);
+}
+
+bool GpuExecutor::SynchronousMemZero(DeviceMemoryBase* location, uint64 size) {
+  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
+      size % 4 == 0) {
+    return GpuDriver::SynchronousMemsetUint32(
+        context_, AsROCmDevicePtr(location), 0x0, size / 4);
+  }
+  return GpuDriver::SynchronousMemsetUint8(context_, AsROCmDevicePtr(location),
+                                           0x0, size);
+}
+
+bool GpuExecutor::SynchronousMemSet(DeviceMemoryBase* location, int value,
+                                    uint64 size) {
+  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
+      size % 4 == 0) {
+    // hipMemset reinterprets "value" as a uint8.
+    uint8 byte_value = static_cast<uint8>(value);
+    uint32 pattern = (byte_value << 24) | (byte_value << 16) |
+                     (byte_value << 8) | byte_value;
+    return GpuDriver::SynchronousMemsetUint32(
+        context_, AsROCmDevicePtr(location), pattern, size / 4);
+  }
+  return GpuDriver::SynchronousMemsetUint8(context_, AsROCmDevicePtr(location),
+                                           value, size);
+}
+
+port::Status GpuExecutor::SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
+                                            const void* host_src, uint64 size) {
+  return GpuDriver::SynchronousMemcpyH2D(context_, AsROCmDevicePtr(gpu_dst),
+                                         host_src, size);
+}
+
+port::Status GpuExecutor::SynchronousMemcpy(void* host_dst,
+                                            const DeviceMemoryBase& gpu_src,
+                                            uint64 size) {
+  return GpuDriver::SynchronousMemcpyD2H(context_, host_dst,
+                                         AsROCmDevicePtr(gpu_src), size);
+}
+
+port::Status GpuExecutor::SynchronousMemcpyDeviceToDevice(
+    DeviceMemoryBase* gpu_dst, const DeviceMemoryBase& gpu_src, uint64 size) {
+  return GpuDriver::SynchronousMemcpyD2D(context_, AsROCmDevicePtr(gpu_dst),
+                                         AsROCmDevicePtr(gpu_src), size);
+}
+
+bool GpuExecutor::MemZero(Stream* stream, DeviceMemoryBase* location,
+                          uint64 size) {
+  if (reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
+      size % 4 == 0) {
+    return Memset32(stream, location, 0x0, size);
+  } else {
+    return Memset(stream, location, 0x0, size);
+  }
+}
+
+bool GpuExecutor::Memset(Stream* stream, DeviceMemoryBase* location,
+                         uint8 pattern, uint64 size) {
+  VLOG(2) << "enqueueing memset8 operation onto stream " << stream
+          << " at location " << location << " with size " << size
+          << " and pattern " << std::hex << pattern;
+  return GpuDriver::AsynchronousMemsetUint8(context_, AsROCmDevicePtr(location),
+                                            pattern, size,
+                                            AsGpuStreamValue(stream));
+}
+
+bool GpuExecutor::Memset32(Stream* stream, DeviceMemoryBase* location,
+                           uint32 pattern, uint64 size) {
+  VLOG(2) << "enqueueing memset32 operation onto stream " << stream
+          << " at location " << location << " with size " << size
+          << " and pattern " << std::hex << pattern;
+  CHECK(reinterpret_cast<uintptr_t>(location->opaque()) % 4 == 0 &&
+        size % 4 == 0);
+  return GpuDriver::AsynchronousMemsetUint32(
+      context_, AsROCmDevicePtr(location), pattern, size / 4,
+      AsGpuStreamValue(stream));
+}
+
+bool GpuExecutor::Memcpy(Stream* stream, void* host_dst,
+                         const DeviceMemoryBase& gpu_src, uint64 size) {
+  return GpuDriver::AsynchronousMemcpyD2H(context_, host_dst,
+                                          AsROCmDevicePtr(gpu_src), size,
+                                          AsGpuStreamValue(stream));
+}
+
+bool GpuExecutor::Memcpy(Stream* stream, DeviceMemoryBase* gpu_dst,
+                         const void* host_src, uint64 size) {
+  return GpuDriver::AsynchronousMemcpyH2D(context_, AsROCmDevicePtr(gpu_dst),
+                                          host_src, size,
+                                          AsGpuStreamValue(stream));
+}
+
+bool GpuExecutor::MemcpyDeviceToDevice(Stream* stream,
+                                       DeviceMemoryBase* gpu_dst,
+                                       const DeviceMemoryBase& gpu_src,
+                                       uint64 size) {
+  return GpuDriver::AsynchronousMemcpyD2D(context_, AsROCmDevicePtr(gpu_dst),
+                                          AsROCmDevicePtr(gpu_src), size,
+                                          AsGpuStreamValue(stream));
+}
+
+bool GpuExecutor::HostCallback(Stream* stream,
+                               std::function<port::Status()> callback) {
+  auto callback_ptr = new std::function<void()>([callback]() {
+    port::Status s = callback();
+    if (!s.ok()) {
+      LOG(WARNING) << "Host callback failed: " << s;
+    }
+  });
+  return GpuDriver::AddStreamCallback(context_, AsGpuStreamValue(stream),
+                                      InternalHostCallback, callback_ptr);
+}
+
+/* static */ void GpuExecutor::InternalHostCallback(GpuStreamHandle stream,
+                                                    hipError_t status,
+                                                    void* data) {
+  std::function<void()>* callback =
+      reinterpret_cast<std::function<void()>*>(data);
+  (*callback)();
+  delete callback;
+}
+
+port::Status GpuExecutor::AllocateEvent(Event* event) {
+  return AsGpuEvent(event)->Init();
+}
+
+port::Status GpuExecutor::DeallocateEvent(Event* event) {
+  return AsGpuEvent(event)->Destroy();
+}
+
+port::Status GpuExecutor::RecordEvent(Stream* stream, Event* event) {
+  return AsGpuEvent(event)->Record(AsGpuStream(stream));
+}
+
+port::Status GpuExecutor::WaitForEvent(Stream* stream, Event* event) {
+  if (GpuDriver::WaitStreamOnEvent(context_, AsGpuStream(stream)->gpu_stream(),
+                                   AsGpuEvent(event)->gpu_event())) {
+    return port::Status::OK();
+  } else {
+    return port::Status{
+        port::error::INTERNAL,
+        absl::StrFormat("error recording waiting for ROCM event on stream %p",
+                        stream)};
+  }
+}
+
+Event::Status GpuExecutor::PollForEventStatus(Event* event) {
+  return AsGpuEvent(event)->PollForStatus();
+}
+
+bool GpuExecutor::AllocateStream(Stream* stream) {
+  return AsGpuStream(stream)->Init();
+}
+
+void GpuExecutor::DeallocateStream(Stream* stream) {
+  GpuStream* rocm_stream = AsGpuStream(stream);
+  if (!rocm_stream->IsIdle()) {
+    LOG(ERROR) << "Deallocating stream with pending work";
+  }
+  rocm_stream->Destroy();
+}
+
+bool GpuExecutor::AllocateTimer(Timer* timer) {
+  return AsGpuTimer(timer)->Init();
+}
+
+void GpuExecutor::DeallocateTimer(Timer* timer) {
+  AsGpuTimer(timer)->Destroy();
+}
+
+bool GpuExecutor::CreateStreamDependency(Stream* dependent, Stream* other) {
+  GpuEventHandle other_completed_event = *AsGpuStream(other)->completed_event();
+  bool ok = GpuDriver::RecordEvent(context_, other_completed_event,
+                                   AsGpuStreamValue(other))
+                .ok();
+  if (!ok) {
+    LOG(ERROR) << "failed to record completion event; "
+                  "therefore, failed to create inter-stream dependency";
+    return false;
+  }
+
+  return GpuDriver::WaitStreamOnEvent(context_, AsGpuStreamValue(dependent),
+                                      other_completed_event);
+}
+
+bool GpuExecutor::StartTimer(Stream* stream, Timer* timer) {
+  return AsGpuTimer(timer)->Start(AsGpuStream(stream));
+}
+
+bool GpuExecutor::StopTimer(Stream* stream, Timer* timer) {
+  return AsGpuTimer(timer)->Stop(AsGpuStream(stream));
+}
+
+port::Status GpuExecutor::BlockHostUntilDone(Stream* stream) {
+  return GpuDriver::SynchronizeStream(context_, AsGpuStreamValue(stream));
+}
+
+blas::BlasSupport* GpuExecutor::CreateBlas() {
+  PluginRegistry* registry = PluginRegistry::Instance();
+  port::StatusOr<PluginRegistry::BlasFactory> status =
+      registry->GetFactory<PluginRegistry::BlasFactory>(rocm::kROCmPlatformId,
+                                                        plugin_config_.blas());
+  if (!status.ok()) {
+    LOG(ERROR) << "Unable to retrieve BLAS factory: "
+               << status.status().error_message();
+    return nullptr;
+  }
+
+  return status.ValueOrDie()(this);
+}
+
+dnn::DnnSupport* GpuExecutor::CreateDnn() {
+  PluginRegistry* registry = PluginRegistry::Instance();
+  port::StatusOr<PluginRegistry::DnnFactory> status =
+      registry->GetFactory<PluginRegistry::DnnFactory>(rocm::kROCmPlatformId,
+                                                       plugin_config_.dnn());
+  if (!status.ok()) {
+    LOG(ERROR) << "Unable to retrieve DNN factory: "
+               << status.status().error_message();
+    return nullptr;
+  }
+
+  return status.ValueOrDie()(this);
+}
+
+fft::FftSupport* GpuExecutor::CreateFft() {
+  PluginRegistry* registry = PluginRegistry::Instance();
+  port::StatusOr<PluginRegistry::FftFactory> status =
+      registry->GetFactory<PluginRegistry::FftFactory>(rocm::kROCmPlatformId,
+                                                       plugin_config_.fft());
+  if (!status.ok()) {
+    LOG(ERROR) << "Unable to retrieve FFT factory: "
+               << status.status().error_message();
+    return nullptr;
+  }
+
+  return status.ValueOrDie()(this);
+}
+
+rng::RngSupport* GpuExecutor::CreateRng() {
+  PluginRegistry* registry = PluginRegistry::Instance();
+  port::StatusOr<PluginRegistry::RngFactory> status =
+      registry->GetFactory<PluginRegistry::RngFactory>(rocm::kROCmPlatformId,
+                                                       plugin_config_.rng());
+  if (!status.ok()) {
+    LOG(ERROR) << "Unable to retrieve RNG factory: "
+               << status.status().error_message();
+    return nullptr;
+  }
+
+  return status.ValueOrDie()(this);
+}
+
+// TODO(rspringer): Remove in b/18544742.
+bool GpuExecutor::SupportsDnn() const { return true; }
+
+bool GpuExecutor::CanEnablePeerAccessTo(StreamExecutorInterface* other) {
+  GpuExecutor* rocm_other = static_cast<GpuExecutor*>(other);
+  return GpuDriver::CanEnablePeerAccess(context_, rocm_other->context_);
+}
+
+port::Status GpuExecutor::EnablePeerAccessTo(StreamExecutorInterface* other) {
+  GpuExecutor* rocm_other = static_cast<GpuExecutor*>(other);
+  return GpuDriver::EnablePeerAccess(context_, rocm_other->context_);
+}
+
+SharedMemoryConfig GpuExecutor::GetDeviceSharedMemoryConfig() {
+  port::StatusOr<hipSharedMemConfig> rocm_config =
+      GpuDriver::ContextGetSharedMemConfig(context_);
+  if (!rocm_config.ok()) {
+    // Don't log; the failed call will log necessary output.
+    return SharedMemoryConfig::kDefault;
+  }
+
+  switch (rocm_config.ValueOrDie()) {
+    case hipSharedMemBankSizeDefault:
+      return SharedMemoryConfig::kDefault;
+    case hipSharedMemBankSizeFourByte:
+      return SharedMemoryConfig::kFourByte;
+    case hipSharedMemBankSizeEightByte:
+      return SharedMemoryConfig::kEightByte;
+    default:
+      LOG(FATAL) << "Invalid shared memory configuration returned: "
+                 << rocm_config.ValueOrDie();
+  }
+}
+
+port::Status GpuExecutor::SetDeviceSharedMemoryConfig(
+    SharedMemoryConfig config) {
+  hipSharedMemConfig rocm_config;
+  switch (config) {
+    case SharedMemoryConfig::kDefault:
+      rocm_config = hipSharedMemBankSizeDefault;
+      break;
+    case SharedMemoryConfig::kFourByte:
+      rocm_config = hipSharedMemBankSizeFourByte;
+      break;
+    case SharedMemoryConfig::kEightByte:
+      rocm_config = hipSharedMemBankSizeEightByte;
+      break;
+    default:
+      LOG(FATAL) << "Invalid shared memory configuration specified: "
+                 << static_cast<int>(config);
+  }
+  return GpuDriver::ContextSetSharedMemConfig(context_, rocm_config);
+}
+
+bool GpuExecutor::DeviceMemoryUsage(int64* free, int64* total) const {
+  return GpuDriver::GetDeviceMemoryInfo(context_, free, total);
+}
+
+bool GpuExecutor::GetSymbol(const string& symbol_name,
+                            ModuleHandle module_handle, void** mem,
+                            size_t* bytes) {
+  {  // give limited scope to mutex_lock
+    mutex_lock lock{disk_modules_mu_};
+    for (auto& it : disk_modules_) {
+      if (GpuDriver::GetModuleSymbol(context_, it.second, symbol_name.c_str(),
+                                     reinterpret_cast<hipDeviceptr_t*>(mem),
+                                     bytes)) {
+        return true;
+      }
+    }
+  }
+
+  {  // give limited scope to mutex_lock
+    mutex_lock lock{in_memory_modules_mu_};
+    for (auto& it : in_memory_modules_) {
+      if (GpuDriver::GetModuleSymbol(context_, it.second, symbol_name.c_str(),
+                                     reinterpret_cast<hipDeviceptr_t*>(mem),
+                                     bytes)) {
+        return true;
+      }
+    }
+  }
+
+  {  // give limited scope to mutex_lock
+    mutex_lock lock{in_memory_modules_mu_};
+    if (static_cast<bool>(module_handle)) {
+      auto it = gpu_binary_to_module_.find(module_handle.id());
+      CHECK(it != gpu_binary_to_module_.end());
+      if (GpuDriver::GetModuleSymbol(
+              context_, it->second.first, symbol_name.c_str(),
+              reinterpret_cast<hipDeviceptr_t*>(mem), bytes)) {
+        return true;
+      }
+    }
+
+    for (auto& it : gpu_binary_to_module_) {
+      if (GpuDriver::GetModuleSymbol(
+              context_, it.second.first, symbol_name.c_str(),
+              reinterpret_cast<hipDeviceptr_t*>(mem), bytes)) {
+        return true;
+      }
+    }
+  }
+
+  LOG(INFO) << "Falied to find symbol in any modules: " << symbol_name;
+  return false;
+}
+
+bool GpuExecutor::FillBlockDimLimit(BlockDim* block_dim_limit) const {
+  // The BlockDim name is a mismatch against these GRID_DIM_* queries because
+  // we use BlockDims to express the dimensions of blocks within a grid
+  // (as opposed to ThreadDim which expresses the dimensions of threads
+  // within a block).
+  int x, y, z;
+  if (!GpuDriver::GetGridLimits(&x, &y, &z, device_)) {
+    return false;
+  }
+
+  block_dim_limit->x = x;
+  block_dim_limit->y = y;
+  block_dim_limit->z = z;
+  return true;
+}
+
+bool GpuExecutor::SupportsBlas() const { return true; }
+
+bool GpuExecutor::SupportsFft() const { return true; }
+
+bool GpuExecutor::SupportsRng() const { return true; }
+
+std::unique_ptr<internal::EventInterface>
+GpuExecutor::CreateEventImplementation() {
+  return std::unique_ptr<internal::EventInterface>(new GpuEvent(this));
+}
+
+std::unique_ptr<internal::KernelInterface>
+GpuExecutor::CreateKernelImplementation() {
+  return std::unique_ptr<internal::KernelInterface>(new GpuKernel());
+}
+
+std::unique_ptr<internal::StreamInterface>
+GpuExecutor::GetStreamImplementation() {
+  return std::unique_ptr<internal::StreamInterface>(new GpuStream(this));
+}
+
+std::unique_ptr<internal::TimerInterface>
+GpuExecutor::GetTimerImplementation() {
+  return std::unique_ptr<internal::TimerInterface>(new GpuTimer(this));
+}
+
+void* GpuExecutor::GpuContextHack() { return context_; }
+
+GpuContext* GpuExecutor::gpu_context() { return context_; }
+
+// Attempts to read the NUMA node corresponding to the GPU device's PCI bus out
+// of SysFS. Returns -1 if it cannot.
+//
+// For anything more complicated/prod-focused than this, you'll likely want to
+// turn to gsys' topology modeling.
+static int TryToReadNumaNode(const string& pci_bus_id, int device_ordinal) {
+  // TODO(ROCm) implement this feature in HIP
+  return 1;
+}
+
+DeviceDescription* GpuExecutor::PopulateDeviceDescription() const {
+  internal::DeviceDescriptionBuilder builder;
+
+  {
+    int driver_version = 0;
+    (void)GpuDriver::GetDriverVersion(&driver_version);
+    string augmented_driver_version = absl::StrFormat(
+        "%d (%s)", driver_version,
+        rocm::DriverVersionStatusToString(Diagnostician::FindDsoVersion())
+            .c_str());
+    builder.set_driver_version(augmented_driver_version);
+  }
+
+  {
+    string pci_bus_id = GpuDriver::GetPCIBusID(device_);
+
+    // Lower the hex characters to match sysfs.
+    pci_bus_id = port::Lowercase(pci_bus_id);
+    builder.set_pci_bus_id(pci_bus_id);
+
+    // Read the NUMA node corresponding to the PCI bus ID out of sysfs.
+    int numa_node = TryToReadNumaNode(pci_bus_id, device_ordinal_);
+    builder.set_numa_node(numa_node);
+  }
+
+  hipDeviceProp_t prop;
+  if (GpuDriver::GetDeviceProperties(&prop, device_ordinal_)) {
+    builder.set_threads_per_block_limit(prop.maxThreadsPerBlock);
+
+    ThreadDim thread_dim_limit;
+    thread_dim_limit.x = prop.maxThreadsDim[0];
+    thread_dim_limit.y = prop.maxThreadsDim[1];
+    thread_dim_limit.z = prop.maxThreadsDim[2];
+    builder.set_thread_dim_limit(thread_dim_limit);
+
+    float clock_rate_ghz = static_cast<float>(prop.clockRate) / 1e6;
+    builder.set_clock_rate_ghz(clock_rate_ghz);
+  }
+
+  {
+    bool ecc_enabled = false;
+    (void)GpuDriver::IsEccEnabled(device_, &ecc_enabled);
+    builder.set_ecc_enabled(ecc_enabled);
+  }
+
+  {
+    uint64 device_memory_size = -1;
+    (void)GpuDriver::GetDeviceTotalMemory(device_, &device_memory_size);
+    builder.set_device_memory_size(device_memory_size);
+  }
+
+  {
+    BlockDim block_dim_limit;
+    FillBlockDimLimit(&block_dim_limit);
+    builder.set_block_dim_limit(block_dim_limit);
+  }
+
+  {
+    string device_name;
+    (void)GpuDriver::GetDeviceName(device_, &device_name);
+    builder.set_name(device_name);
+  }
+
+  builder.set_platform_version(
+      absl::StrCat("AMDGPU ISA version: gfx", version_));
+
+  // TODO(leary) should be a way to query this from the driver, but this is
+  // unlikely to change for us any time soon.
+  builder.set_device_address_bits(64);
+
+  builder.set_device_vendor("Advanced Micro Devices, Inc");
+  builder.set_rocm_amdgpu_isa_version(version_);
+  builder.set_shared_memory_per_core(
+      GpuDriver::GetMaxSharedMemoryPerCore(device_).ValueOrDie());
+  builder.set_shared_memory_per_block(
+      GpuDriver::GetMaxSharedMemoryPerBlock(device_).ValueOrDie());
+  builder.set_core_count(
+      GpuDriver::GetMultiprocessorCount(device_).ValueOrDie());
+  builder.set_threads_per_core_limit(
+      GpuDriver::GetMaxThreadsPerMultiprocessor(device_).ValueOrDie());
+  builder.set_registers_per_block_limit(
+      GpuDriver::GetMaxRegistersPerBlock(device_).ValueOrDie());
+  builder.set_threads_per_warp(
+      GpuDriver::GetThreadsPerWarp(device_).ValueOrDie());
+  builder.set_registers_per_core_limit(64 * 1024);
+
+  auto built = builder.Build();
+  return built.release();
+}
+
+}  // namespace gpu
+
+void initialize_rocm_gpu_executor() {
+  *internal::MakeROCMExecutorImplementation() = [](const PluginConfig& config) {
+    return new gpu::GpuExecutor{config};
+  };
+}
+
+}  // namespace stream_executor
+
+REGISTER_MODULE_INITIALIZER(rocm_gpu_executor, {
+  stream_executor::initialize_rocm_gpu_executor();
+});
diff --git a/tensorflow/stream_executor/rocm/rocm_kernel.cc b/tensorflow/stream_executor/rocm/rocm_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..162b2bdc71574e7dc30f5a3ed2d5a15a45d97206
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_kernel.cc
@@ -0,0 +1,38 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/stream_executor/gpu/gpu_kernel.h"
+
+namespace stream_executor {
+namespace gpu {
+
+hipFuncCache_t GpuKernel::GetGpuCacheConfig() const {
+  switch (preferred_cache_config_) {
+    case KernelCacheConfig::kNoPreference:
+      return hipFuncCachePreferNone;
+    case KernelCacheConfig::kPreferShared:
+      return hipFuncCachePreferShared;
+    case KernelCacheConfig::kPreferL1:
+      return hipFuncCachePreferL1;
+    case KernelCacheConfig::kPreferEqual:
+      return hipFuncCachePreferEqual;
+    default:
+      LOG(FATAL) << "Unknown KernelCacheConfig"
+                 << static_cast<int32>(preferred_cache_config_);
+  }
+}
+
+}  // namespace gpu
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/rocm/rocm_platform.cc b/tensorflow/stream_executor/rocm/rocm_platform.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ce091658da4db4087f2b1078ad46b67afce5695e
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_platform.cc
@@ -0,0 +1,180 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/stream_executor/rocm/rocm_platform.h"
+
+#include "absl/strings/str_format.h"
+#include "tensorflow/stream_executor/gpu/gpu_driver.h"
+#include "tensorflow/stream_executor/gpu/gpu_executor.h"
+#include "tensorflow/stream_executor/lib/error.h"
+#include "tensorflow/stream_executor/lib/initialize.h"
+#include "tensorflow/stream_executor/lib/ptr_util.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/lib/stringprintf.h"
+#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
+
+namespace stream_executor {
+namespace gpu {
+
+ROCmPlatform::ROCmPlatform()
+    : name_("ROCM"), min_numa_node_(0), limit_numa_node_(0) {}
+
+ROCmPlatform::~ROCmPlatform() {}
+
+// Due to legacy issues in user code, we can't currently call InpectNumaNodes
+// at module initialization time, because non-GPU programs still include this
+// plugin via various methods, so instead, it has to be init-on-reference.
+void ROCmPlatform::InspectNumaNodes() {
+  // To get NUMA node information, we need to create all executors, so we can
+  // examine their device descriptions to see their bus assignments.
+  static bool initialized = false;
+  static mutex numa_mutex(LINKER_INITIALIZED);
+  mutex_lock lock(numa_mutex);
+  if (initialized) {
+    return;
+  }
+
+  StreamExecutorConfig config;
+  for (int i = 0; i < VisibleDeviceCount(); i++) {
+    config.ordinal = i;
+    StreamExecutor* exec = GetExecutor(config).ValueOrDie();
+    if (i == 0) {
+      // NUMA nodes may not start at 0, so set the minimum node  based on the
+      // first executor we see.
+      min_numa_node_ = exec->GetDeviceDescription().numa_node();
+      limit_numa_node_ = min_numa_node_ + 1;
+    } else {
+      min_numa_node_ =
+          std::min(min_numa_node_, exec->GetDeviceDescription().numa_node());
+      limit_numa_node_ = std::max(limit_numa_node_,
+                                  exec->GetDeviceDescription().numa_node() + 1);
+    }
+  }
+  initialized = true;
+}
+
+int ROCmPlatform::BusCount() {
+  InspectNumaNodes();
+  return limit_numa_node_ - min_numa_node_;
+}
+
+int ROCmPlatform::DeviceToBus(int device_ordinal) {
+  StreamExecutorConfig config;
+  config.ordinal = device_ordinal;
+  StreamExecutor* exec = GetExecutor(config).ValueOrDie();
+  return exec->GetDeviceDescription().numa_node() - min_numa_node_;
+}
+
+port::StatusOr<StreamExecutor*> ROCmPlatform::FirstExecutorForBus(
+    int bus_ordinal) {
+  InspectNumaNodes();
+  CHECK_LT(bus_ordinal, BusCount()) << "bus ordinal out of available range";
+  for (int i = 0; i < VisibleDeviceCount(); i++) {
+    if (DeviceToBus(i) == bus_ordinal) {
+      StreamExecutorConfig config;
+      config.ordinal = i;
+      return GetExecutor(config).ValueOrDie();
+    }
+  }
+
+  return port::Status{
+      port::error::NOT_FOUND,
+      absl::StrFormat("Executor for bus %d not found.", bus_ordinal)};
+}
+
+Platform::Id ROCmPlatform::id() const { return rocm::kROCmPlatformId; }
+
+int ROCmPlatform::VisibleDeviceCount() const {
+  // Throw away the result - it logs internally, and this [containing] function
+  // isn't in the path of user control. It's safe to call this > 1x.
+
+  if (!gpu::GpuDriver::Init().ok()) {
+    return -1;
+  }
+
+  return GpuDriver::GetDeviceCount();
+}
+
+const string& ROCmPlatform::Name() const { return name_; }
+
+port::StatusOr<StreamExecutor*> ROCmPlatform::ExecutorForDevice(int ordinal) {
+  StreamExecutorConfig config;
+  config.ordinal = ordinal;
+  config.plugin_config = PluginConfig();
+  config.device_options = DeviceOptions::Default();
+  return GetExecutor(config);
+}
+
+port::StatusOr<StreamExecutor*> ROCmPlatform::ExecutorForDeviceWithPluginConfig(
+    int device_ordinal, const PluginConfig& plugin_config) {
+  StreamExecutorConfig config;
+  config.ordinal = device_ordinal;
+  config.plugin_config = plugin_config;
+  config.device_options = DeviceOptions::Default();
+  return GetExecutor(config);
+}
+
+port::StatusOr<StreamExecutor*> ROCmPlatform::GetExecutor(
+    const StreamExecutorConfig& config) {
+  return executor_cache_.GetOrCreate(
+      config, [&]() { return GetUncachedExecutor(config); });
+}
+
+port::StatusOr<std::unique_ptr<StreamExecutor>>
+ROCmPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) {
+  auto executor = MakeUnique<StreamExecutor>(
+      this, MakeUnique<GpuExecutor>(config.plugin_config));
+  auto init_status = executor->Init(config.ordinal, config.device_options);
+  if (!init_status.ok()) {
+    return port::Status{
+        port::error::INTERNAL,
+        absl::StrFormat(
+            "failed initializing StreamExecutor for ROCM device ordinal %d: %s",
+            config.ordinal, init_status.ToString().c_str())};
+  }
+
+  return std::move(executor);
+}
+
+void ROCmPlatform::RegisterTraceListener(
+    std::unique_ptr<TraceListener> listener) {
+  LOG(FATAL) << "not yet implemented: register ROCM trace listener";
+}
+
+void ROCmPlatform::UnregisterTraceListener(TraceListener* listener) {
+  LOG(FATAL) << "not yet implemented: unregister ROCM trace listener";
+}
+
+}  // namespace gpu
+
+static void InitializeROCmPlatform() {
+  // Disabling leak checking, MultiPlatformManager does not destroy its
+  // registered platforms.
+  auto status = MultiPlatformManager::PlatformWithName("ROCM");
+  if (!status.ok()) {
+    std::unique_ptr<gpu::ROCmPlatform> platform(new gpu::ROCmPlatform);
+    SE_CHECK_OK(MultiPlatformManager::RegisterPlatform(std::move(platform)));
+  }
+}
+
+}  // namespace stream_executor
+
+REGISTER_MODULE_INITIALIZER(rocm_platform,
+                            stream_executor::InitializeROCmPlatform());
+
+DECLARE_MODULE_INITIALIZER(multi_platform_manager);
+// Note that module initialization sequencing is not supported in the
+// open-source project, so this will be a no-op there.
+REGISTER_MODULE_INITIALIZER_SEQUENCE(rocm_platform, multi_platform_manager);
diff --git a/tensorflow/stream_executor/rocm/rocm_platform.h b/tensorflow/stream_executor/rocm/rocm_platform.h
new file mode 100644
index 0000000000000000000000000000000000000000..d498e5fdb1e9ef1f31b2fea13625aba995d9acad
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_platform.h
@@ -0,0 +1,110 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_PLATFORM_H_
+#define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_PLATFORM_H_
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/stream_executor/executor_cache.h"
+#include "tensorflow/stream_executor/lib/statusor.h"
+#include "tensorflow/stream_executor/multi_platform_manager.h"
+#include "tensorflow/stream_executor/platform.h"
+#include "tensorflow/stream_executor/platform/mutex.h"
+#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/stream_executor/platform/thread_annotations.h"
+#include "tensorflow/stream_executor/stream_executor_internal.h"
+#include "tensorflow/stream_executor/stream_executor_pimpl.h"
+#include "tensorflow/stream_executor/trace_listener.h"
+
+namespace stream_executor {
+namespace gpu {
+
+// Opaque and unique identifier for the ROCM platform plugin.
+// This is needed so that plugins can refer to/identify this platform without
+// instantiating a ROCmPlatform object.
+extern const Platform::Id kROCmPlatformId;
+
+// ROCm-specific platform plugin, registered as a singleton value via module
+// initializer.
+class ROCmPlatform : public Platform {
+ public:
+  ROCmPlatform();
+  ~ROCmPlatform() override;
+
+  // ROCmPlatform-specific functionality
+  // Returns the number of distinct buses / NUMA nodes on the machine.
+  int BusCount();
+
+  // Returns the bus/NUMA node for the specified device ordinal.
+  int DeviceToBus(int device_ordinal);
+
+  // Returns the lowest-ordinal-number StreamExecutor on the specified bus.
+  port::StatusOr<StreamExecutor*> FirstExecutorForBus(int bus_ordinal);
+
+  // Platform interface implementation:
+  // Returns the same value as kROCmPlatform above.
+  Platform::Id id() const override;
+
+  // Returns -1 as a sentinel on internal failure (and logs the error).
+  int VisibleDeviceCount() const override;
+
+  const string& Name() const override;
+
+  port::StatusOr<StreamExecutor*> ExecutorForDevice(int ordinal) override;
+
+  port::StatusOr<StreamExecutor*> ExecutorForDeviceWithPluginConfig(
+      int ordinal, const PluginConfig& config) override;
+
+  port::StatusOr<StreamExecutor*> GetExecutor(
+      const StreamExecutorConfig& config) override;
+
+  port::StatusOr<std::unique_ptr<StreamExecutor>> GetUncachedExecutor(
+      const StreamExecutorConfig& config) override;
+
+  void RegisterTraceListener(std::unique_ptr<TraceListener> listener) override;
+
+  void UnregisterTraceListener(TraceListener* listener) override;
+
+ private:
+  // Determines the number of NUMA nodes and the assignment of executor to each.
+  void InspectNumaNodes();
+
+  // This platform's name.
+  string name_;
+
+  // mutex that guards internal state.
+  mutable mutex mu_;
+
+  // Cache of created executors.
+  ExecutorCache executor_cache_;
+
+  // The smallest NUMA node value for any device managed by this machine
+  // manager. Used, along with limit_numa_node_, to convert NUMA nodes into bus
+  // ordinals. The NUMA node space occupied by GPUs is assumed to be dense./
+  int min_numa_node_;
+
+  // Larger than the NUMA node value for any device managed by this machine
+  // manager.
+  int limit_numa_node_;
+
+  SE_DISALLOW_COPY_AND_ASSIGN(ROCmPlatform);
+};
+
+}  // namespace gpu
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_PLATFORM_H_
diff --git a/tensorflow/stream_executor/rocm/rocm_platform_id.cc b/tensorflow/stream_executor/rocm/rocm_platform_id.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bb07858a96babaed11c991c59ff9644e0933ac6b
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_platform_id.cc
@@ -0,0 +1,24 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
+
+namespace stream_executor {
+namespace rocm {
+
+PLATFORM_DEFINE_ID(kROCmPlatformId);
+
+}  // namespace rocm
+}  // namespace stream_executor
diff --git a/tensorflow/stream_executor/rocm/rocm_platform_id.h b/tensorflow/stream_executor/rocm/rocm_platform_id.h
new file mode 100644
index 0000000000000000000000000000000000000000..a17d4f97bbcb91e883f89d107da40aebcb6fba95
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_platform_id.h
@@ -0,0 +1,34 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_PLATFORM_ID_H_
+#define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_PLATFORM_ID_H_
+
+#include "tensorflow/stream_executor/platform.h"
+
+namespace stream_executor {
+namespace rocm {
+
+// Opaque and unique identifier for the ROCm platform.
+// This is needed so that plugins can refer to/identify this platform without
+// instantiating a ROCmPlatform object.
+// This is broken out here to avoid a circular dependency between ROCmPlatform
+// and ROCmExecutor.
+extern const Platform::Id kROCmPlatformId;
+
+}  // namespace rocm
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_PLATFORM_ID_H_
diff --git a/tensorflow/stream_executor/rocm/rocm_rng.cc b/tensorflow/stream_executor/rocm/rocm_rng.cc
new file mode 100644
index 0000000000000000000000000000000000000000..99bfc49d10fb0dd71aaf4ead6ece0f9336920545
--- /dev/null
+++ b/tensorflow/stream_executor/rocm/rocm_rng.cc
@@ -0,0 +1,325 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "rocm/include/hiprand/hiprand.h"
+#include "tensorflow/stream_executor/gpu/gpu_rng.h"
+
+#include "tensorflow/stream_executor/device_memory.h"
+#include "tensorflow/stream_executor/gpu/gpu_activation.h"
+#include "tensorflow/stream_executor/gpu/gpu_executor.h"
+#include "tensorflow/stream_executor/gpu/gpu_helpers.h"
+#include "tensorflow/stream_executor/gpu/gpu_stream.h"
+#include "tensorflow/stream_executor/lib/env.h"
+#include "tensorflow/stream_executor/lib/initialize.h"
+#include "tensorflow/stream_executor/lib/status.h"
+#include "tensorflow/stream_executor/platform/dso_loader.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/rng.h"
+#include "tensorflow/stream_executor/rocm/rocm_platform_id.h"
+
+// Formats hiprandStatus_t to output prettified values into a log stream.
+std::ostream& operator<<(std::ostream& in, const hiprandStatus_t& status) {
+#define OSTREAM_HIPRAND_STATUS(__name) \
+  case HIPRAND_STATUS_##__name:        \
+    in << "HIPRAND_STATUS_" #__name;   \
+    return in;
+
+  switch (status) {
+    OSTREAM_HIPRAND_STATUS(SUCCESS)
+    OSTREAM_HIPRAND_STATUS(VERSION_MISMATCH)
+    OSTREAM_HIPRAND_STATUS(NOT_INITIALIZED)
+    OSTREAM_HIPRAND_STATUS(ALLOCATION_FAILED)
+    OSTREAM_HIPRAND_STATUS(TYPE_ERROR)
+    OSTREAM_HIPRAND_STATUS(OUT_OF_RANGE)
+    OSTREAM_HIPRAND_STATUS(LENGTH_NOT_MULTIPLE)
+    OSTREAM_HIPRAND_STATUS(LAUNCH_FAILURE)
+    OSTREAM_HIPRAND_STATUS(PREEXISTING_FAILURE)
+    OSTREAM_HIPRAND_STATUS(INITIALIZATION_FAILED)
+    OSTREAM_HIPRAND_STATUS(ARCH_MISMATCH)
+    OSTREAM_HIPRAND_STATUS(INTERNAL_ERROR)
+    default:
+      in << "hiprandStatus_t(" << static_cast<int>(status) << ")";
+      return in;
+  }
+}
+
+namespace stream_executor {
+namespace gpu {
+
+PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kGpuRandPlugin);
+
+namespace wrap {
+
+#ifdef PLATFORM_GOOGLE
+
+#define STREAM_EXECUTOR_HIPRAND_WRAP(__name)                        \
+  struct WrapperShim__##__name {                                    \
+    template <typename... Args>                                     \
+    hiprandStatus_t operator()(GpuExecutor* parent, Args... args) { \
+      gpu::ScopedActivateExecutorContext sac{parent};               \
+      return ::__name(args...);                                     \
+    }                                                               \
+  } __name;
+
+#else
+
+#define STREAM_EXECUTOR_HIPRAND_WRAP(__name)                              \
+  struct DynLoadShim__##__name {                                          \
+    static const char* kName;                                             \
+    using FuncPtrT = std::add_pointer<decltype(::__name)>::type;          \
+    static void* GetDsoHandle() {                                         \
+      auto s = internal::CachedDsoLoader::GetRocrandDsoHandle();          \
+      return s.ValueOrDie();                                              \
+    }                                                                     \
+    static FuncPtrT LoadOrDie() {                                         \
+      void* f;                                                            \
+      auto s = port::Env::Default()->GetSymbolFromLibrary(GetDsoHandle(), \
+                                                          kName, &f);     \
+      CHECK(s.ok()) << "could not find " << kName                         \
+                    << " in rocrand DSO; dlerror: " << s.error_message(); \
+      return reinterpret_cast<FuncPtrT>(f);                               \
+    }                                                                     \
+    static FuncPtrT DynLoad() {                                           \
+      static FuncPtrT f = LoadOrDie();                                    \
+      return f;                                                           \
+    }                                                                     \
+    template <typename... Args>                                           \
+    hiprandStatus operator()(GpuExecutor* parent, Args... args) {         \
+      gpu::ScopedActivateExecutorContext sac{parent};                     \
+      return DynLoad()(args...);                                          \
+    }                                                                     \
+  } __name;                                                               \
+  const char* DynLoadShim__##__name::kName = #__name;
+
+#endif
+
+STREAM_EXECUTOR_HIPRAND_WRAP(hiprandCreateGenerator);
+STREAM_EXECUTOR_HIPRAND_WRAP(hiprandDestroyGenerator);
+STREAM_EXECUTOR_HIPRAND_WRAP(hiprandSetStream);
+STREAM_EXECUTOR_HIPRAND_WRAP(hiprandGenerateUniform);
+STREAM_EXECUTOR_HIPRAND_WRAP(hiprandGenerateUniformDouble);
+STREAM_EXECUTOR_HIPRAND_WRAP(hiprandSetPseudoRandomGeneratorSeed);
+STREAM_EXECUTOR_HIPRAND_WRAP(hiprandSetGeneratorOffset);
+STREAM_EXECUTOR_HIPRAND_WRAP(hiprandGenerateNormal);
+STREAM_EXECUTOR_HIPRAND_WRAP(hiprandGenerateNormalDouble);
+
+}  // namespace wrap
+
+GpuRng::GpuRng(GpuExecutor* parent) : parent_(parent), rng_(nullptr) {}
+
+GpuRng::~GpuRng() {
+  if (rng_ != nullptr) {
+    wrap::hiprandDestroyGenerator(parent_, rng_);
+  }
+}
+
+bool GpuRng::Init() {
+  mutex_lock lock{mu_};
+  CHECK(rng_ == nullptr);
+
+  hiprandStatus_t ret =
+      wrap::hiprandCreateGenerator(parent_, &rng_, HIPRAND_RNG_PSEUDO_DEFAULT);
+  if (ret != HIPRAND_STATUS_SUCCESS) {
+    LOG(ERROR) << "failed to create random number generator: " << ret;
+    return false;
+  }
+
+  CHECK(rng_ != nullptr);
+  return true;
+}
+
+bool GpuRng::SetStream(Stream* stream) {
+  hiprandStatus_t ret =
+      wrap::hiprandSetStream(parent_, rng_, AsGpuStreamValue(stream));
+  if (ret != HIPRAND_STATUS_SUCCESS) {
+    LOG(ERROR) << "failed to set stream for random generation: " << ret;
+    return false;
+  }
+
+  return true;
+}
+
+// Returns true if std::complex stores its contents as two consecutive
+// elements. Tests int, float and double, as the last two are independent
+// specializations.
+constexpr bool ComplexIsConsecutiveFloats() {
+  return sizeof(std::complex<int>) == 8 && sizeof(std::complex<float>) == 8 &&
+         sizeof(std::complex<double>) == 16;
+}
+
+template <typename T>
+bool GpuRng::DoPopulateRandUniformInternal(Stream* stream, DeviceMemory<T>* v) {
+  mutex_lock lock{mu_};
+  static_assert(ComplexIsConsecutiveFloats(),
+                "std::complex values are not stored as consecutive values");
+
+  if (!SetStream(stream)) {
+    return false;
+  }
+
+  // std::complex<T> is currently implemented as two consecutive T variables.
+  uint64 element_count = v->ElementCount();
+  if (std::is_same<T, std::complex<float>>::value ||
+      std::is_same<T, std::complex<double>>::value) {
+    element_count *= 2;
+  }
+
+  hiprandStatus_t ret;
+  if (std::is_same<T, float>::value ||
+      std::is_same<T, std::complex<float>>::value) {
+    ret = wrap::hiprandGenerateUniform(
+        parent_, rng_, reinterpret_cast<float*>(GpuMemoryMutable(v)),
+        element_count);
+  } else {
+    ret = wrap::hiprandGenerateUniformDouble(
+        parent_, rng_, reinterpret_cast<double*>(GpuMemoryMutable(v)),
+        element_count);
+  }
+  if (ret != HIPRAND_STATUS_SUCCESS) {
+    LOG(ERROR) << "failed to do uniform generation of " << v->ElementCount()
+               << " " << TypeString<T>() << "s at " << v->opaque() << ": "
+               << ret;
+    return false;
+  }
+
+  return true;
+}
+
+bool GpuRng::DoPopulateRandUniform(Stream* stream, DeviceMemory<float>* v) {
+  return DoPopulateRandUniformInternal(stream, v);
+}
+
+bool GpuRng::DoPopulateRandUniform(Stream* stream, DeviceMemory<double>* v) {
+  return DoPopulateRandUniformInternal(stream, v);
+}
+
+bool GpuRng::DoPopulateRandUniform(Stream* stream,
+                                   DeviceMemory<std::complex<float>>* v) {
+  return DoPopulateRandUniformInternal(stream, v);
+}
+
+bool GpuRng::DoPopulateRandUniform(Stream* stream,
+                                   DeviceMemory<std::complex<double>>* v) {
+  return DoPopulateRandUniformInternal(stream, v);
+}
+
+template <typename ElemT, typename FuncT>
+bool GpuRng::DoPopulateRandGaussianInternal(Stream* stream, ElemT mean,
+                                            ElemT stddev,
+                                            DeviceMemory<ElemT>* v,
+                                            FuncT func) {
+  mutex_lock lock{mu_};
+
+  if (!SetStream(stream)) {
+    return false;
+  }
+
+  uint64 element_count = v->ElementCount();
+  hiprandStatus_t ret =
+      func(parent_, rng_, GpuMemoryMutable(v), element_count, mean, stddev);
+
+  if (ret != HIPRAND_STATUS_SUCCESS) {
+    LOG(ERROR) << "failed to do gaussian generation of " << v->ElementCount()
+               << " floats at " << v->opaque() << ": " << ret;
+    return false;
+  }
+
+  return true;
+}
+
+bool GpuRng::DoPopulateRandGaussian(Stream* stream, float mean, float stddev,
+                                    DeviceMemory<float>* v) {
+  return DoPopulateRandGaussianInternal(stream, mean, stddev, v,
+                                        wrap::hiprandGenerateNormal);
+}
+
+bool GpuRng::DoPopulateRandGaussian(Stream* stream, double mean, double stddev,
+                                    DeviceMemory<double>* v) {
+  return DoPopulateRandGaussianInternal(stream, mean, stddev, v,
+                                        wrap::hiprandGenerateNormalDouble);
+}
+
+bool GpuRng::SetSeed(Stream* stream, const uint8* seed, uint64 seed_bytes) {
+  mutex_lock lock{mu_};
+  CHECK(rng_ != nullptr);
+
+  if (!CheckSeed(seed, seed_bytes)) {
+    return false;
+  }
+
+  if (!SetStream(stream)) {
+    return false;
+  }
+
+  // Requires 8 bytes of seed data; checked in RngSupport::CheckSeed (above)
+  // (which itself requires 16 for API consistency with host RNG fallbacks).
+  hiprandStatus_t ret = wrap::hiprandSetPseudoRandomGeneratorSeed(
+      parent_, rng_, *(reinterpret_cast<const uint64*>(seed)));
+  if (ret != HIPRAND_STATUS_SUCCESS) {
+    LOG(ERROR) << "failed to set rng seed: " << ret;
+    return false;
+  }
+
+  ret = wrap::hiprandSetGeneratorOffset(parent_, rng_, 0);
+  if (ret != HIPRAND_STATUS_SUCCESS) {
+    LOG(ERROR) << "failed to reset rng position: " << ret;
+    return false;
+  }
+  return true;
+}
+
+}  // namespace gpu
+
+void initialize_rocrand() {
+  auto rocRandAlreadyRegistered = PluginRegistry::Instance()->HasFactory(
+      rocm::kROCmPlatformId, PluginKind::kRng, gpu::kGpuRandPlugin);
+
+  if (!rocRandAlreadyRegistered) {
+    port::Status status =
+        PluginRegistry::Instance()->RegisterFactory<PluginRegistry::RngFactory>(
+            rocm::kROCmPlatformId, gpu::kGpuRandPlugin, "rocRAND",
+            [](internal::StreamExecutorInterface* parent) -> rng::RngSupport* {
+              gpu::GpuExecutor* rocm_executor =
+                  dynamic_cast<gpu::GpuExecutor*>(parent);
+              if (rocm_executor == nullptr) {
+                LOG(ERROR)
+                    << "Attempting to initialize an instance of the hipRAND "
+                    << "support library with a non-ROCM StreamExecutor";
+                return nullptr;
+              }
+
+              gpu::GpuRng* rng = new gpu::GpuRng(rocm_executor);
+              if (!rng->Init()) {
+                // Note: Init() will log a more specific error.
+                delete rng;
+                return nullptr;
+              }
+              return rng;
+            });
+
+    if (!status.ok()) {
+      LOG(ERROR) << "Unable to register rocRAND factory: "
+                 << status.error_message();
+    }
+
+    PluginRegistry::Instance()->SetDefaultFactory(
+        rocm::kROCmPlatformId, PluginKind::kRng, gpu::kGpuRandPlugin);
+  }
+}
+
+}  // namespace stream_executor
+
+REGISTER_MODULE_INITIALIZER(register_rocrand,
+                            { stream_executor::initialize_rocrand(); });
diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc
index 3edc66cde8045d7f6ae53095e8136d1697fb1d23..2577d3825fb448ac332d109990b3d556c4535835 100644
--- a/tensorflow/stream_executor/stream.cc
+++ b/tensorflow/stream_executor/stream.cc
@@ -20,7 +20,6 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/stream_executor/blas.h"
-#include "tensorflow/stream_executor/host_buffer.h"
 #include "tensorflow/stream_executor/host_or_device_scalar.h"
 #include "tensorflow/stream_executor/lib/stacktrace.h"
 #include "tensorflow/stream_executor/platform.h"
@@ -95,8 +94,6 @@ string ToVlogString(const void *ptr) {
   return out.str();
 }
 
-string ToVlogString(const HostBuffer &buffer) { return buffer.AsString(); }
-
 template <class T>
 string ToVlogString(const std::complex<T> &c) {
   // StrCat does not convert std::complex to text.
@@ -284,6 +281,12 @@ Stream::~Stream() {
   }
 }
 
+port::Status Stream::RefreshStatus() {
+  port::Status status = parent_->GetStatus(this);
+  CheckStatus(status);
+  return status;
+}
+
 Stream &Stream::Init() {
   VLOG_CALL();
 
@@ -434,160 +437,6 @@ Stream &Stream::ThenBatchNormalizationBackward(
   return *this;
 }
 
-Stream &Stream::ThenFusedConvolveWithScratch(
-    const dnn::BatchDescriptor &conv_input_descriptor,
-    const DeviceMemory<int8> &conv_input_data, float conv_input_scale,
-    const dnn::FilterDescriptor &filter_descriptor,
-    const DeviceMemory<int8> &filter_data,
-    const dnn::ConvolutionDescriptor &convolution_descriptor,
-    const DeviceMemory<int8> &side_input_data, float side_input_scale,
-    const dnn::BatchDescriptor &bias_descriptor,
-    const DeviceMemory<float> &biases, dnn::ActivationMode activation_mode,
-    const dnn::BatchDescriptor &output_descriptor, DeviceMemory<int8> *output,
-    ScratchAllocator *scratch_allocator) {
-  VLOG_CALL(PARAM(conv_input_descriptor), PARAM(conv_input_data),
-            PARAM(conv_input_scale), PARAM(filter_descriptor),
-            PARAM(filter_data), PARAM(convolution_descriptor),
-            PARAM(side_input_data), PARAM(side_input_scale),
-            PARAM(bias_descriptor), PARAM(biases), PARAM(activation_mode),
-            PARAM(output_descriptor), PARAM(output));
-
-  if (ok()) {
-    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      CheckError(dnn->DoFusedConvolve(
-          this, conv_input_descriptor, conv_input_data, conv_input_scale,
-          filter_descriptor, filter_data, convolution_descriptor,
-          side_input_data, side_input_scale, bias_descriptor, biases,
-          activation_mode, output_descriptor, output, scratch_allocator,
-          dnn::AlgorithmConfig(), /*output_profile_result=*/nullptr));
-    } else {
-      SetErrorAndLogNoDnnSupport();
-    }
-  }
-  return *this;
-}
-
-Stream &Stream::ThenFusedConvolveWithScratch(
-    const dnn::BatchDescriptor &conv_input_descriptor,
-    const DeviceMemory<Eigen::half> &conv_input_data, float conv_input_scale,
-    const dnn::FilterDescriptor &filter_descriptor,
-    const DeviceMemory<Eigen::half> &filter_data,
-    const dnn::ConvolutionDescriptor &convolution_descriptor,
-    const DeviceMemory<Eigen::half> &side_input_data, float side_input_scale,
-    const dnn::BatchDescriptor &bias_descriptor,
-    const DeviceMemory<Eigen::half> &biases,
-    dnn::ActivationMode activation_mode,
-    const dnn::BatchDescriptor &output_descriptor,
-    DeviceMemory<Eigen::half> *output, ScratchAllocator *scratch_allocator) {
-  VLOG_CALL(PARAM(conv_input_descriptor), PARAM(conv_input_data),
-            PARAM(conv_input_scale), PARAM(filter_descriptor),
-            PARAM(filter_data), PARAM(convolution_descriptor),
-            PARAM(side_input_data), PARAM(side_input_scale),
-            PARAM(bias_descriptor), PARAM(biases), PARAM(activation_mode),
-            PARAM(output_descriptor), PARAM(output));
-
-  if (ok()) {
-    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      CheckError(dnn->DoFusedConvolve(
-          this, conv_input_descriptor, conv_input_data, conv_input_scale,
-          filter_descriptor, filter_data, convolution_descriptor,
-          side_input_data, side_input_scale, bias_descriptor, biases,
-          activation_mode, output_descriptor, output, scratch_allocator,
-          dnn::AlgorithmConfig(), /*output_profile_result=*/nullptr));
-    } else {
-      SetErrorAndLogNoDnnSupport();
-    }
-  }
-  return *this;
-}
-
-Stream &Stream::ThenFusedConvolveWithScratch(
-    const dnn::BatchDescriptor &conv_input_descriptor,
-    const DeviceMemory<float> &conv_input_data, float conv_input_scale,
-    const dnn::FilterDescriptor &filter_descriptor,
-    const DeviceMemory<float> &filter_data,
-    const dnn::ConvolutionDescriptor &convolution_descriptor,
-    const DeviceMemory<float> &side_input_data, float side_input_scale,
-    const dnn::BatchDescriptor &bias_descriptor,
-    const DeviceMemory<float> &biases, dnn::ActivationMode activation_mode,
-    const dnn::BatchDescriptor &output_descriptor, DeviceMemory<float> *output,
-    ScratchAllocator *scratch_allocator) {
-  VLOG_CALL(PARAM(conv_input_descriptor), PARAM(conv_input_data),
-            PARAM(conv_input_scale), PARAM(filter_descriptor),
-            PARAM(filter_data), PARAM(convolution_descriptor),
-            PARAM(side_input_data), PARAM(side_input_scale),
-            PARAM(bias_descriptor), PARAM(biases), PARAM(activation_mode),
-            PARAM(output_descriptor), PARAM(output));
-
-  if (ok()) {
-    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      CheckError(dnn->DoFusedConvolve(
-          this, conv_input_descriptor, conv_input_data, conv_input_scale,
-          filter_descriptor, filter_data, convolution_descriptor,
-          side_input_data, side_input_scale, bias_descriptor, biases,
-          activation_mode, output_descriptor, output, scratch_allocator,
-          dnn::AlgorithmConfig(), /*output_profile_result=*/nullptr));
-    } else {
-      SetErrorAndLogNoDnnSupport();
-    }
-  }
-  return *this;
-}
-
-Stream &Stream::ThenConvolveWithScratch(
-    const dnn::BatchDescriptor &input_descriptor,
-    const DeviceMemory<Eigen::half> &input_data,
-    const dnn::FilterDescriptor &filter_descriptor,
-    const DeviceMemory<Eigen::half> &filter_data,
-    const dnn::ConvolutionDescriptor &convolution_descriptor,
-    const dnn::BatchDescriptor &output_descriptor,
-    DeviceMemory<Eigen::half> *output, ScratchAllocator *scratch_allocator) {
-  VLOG_CALL(PARAM(input_descriptor), PARAM(input_data),
-            PARAM(filter_descriptor), PARAM(filter_data),
-            PARAM(convolution_descriptor), PARAM(output_descriptor),
-            PARAM(output));
-
-  if (ok()) {
-    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      CheckError(dnn->DoConvolve(
-          this, input_descriptor, input_data, filter_descriptor, filter_data,
-          convolution_descriptor, output_descriptor, output, scratch_allocator,
-          dnn::AlgorithmConfig(),
-          /*output_profile_result=*/nullptr));
-    } else {
-      SetErrorAndLogNoDnnSupport();
-    }
-  }
-  return *this;
-}
-
-Stream &Stream::ThenConvolveWithScratch(
-    const dnn::BatchDescriptor &input_descriptor,
-    const DeviceMemory<float> &input_data,
-    const dnn::FilterDescriptor &filter_descriptor,
-    const DeviceMemory<float> &filter_data,
-    const dnn::ConvolutionDescriptor &convolution_descriptor,
-    const dnn::BatchDescriptor &output_descriptor, DeviceMemory<float> *output,
-    ScratchAllocator *scratch_allocator) {
-  VLOG_CALL(PARAM(input_descriptor), PARAM(input_data),
-            PARAM(filter_descriptor), PARAM(filter_data),
-            PARAM(convolution_descriptor), PARAM(output_descriptor),
-            PARAM(output));
-
-  if (ok()) {
-    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      CheckError(dnn->DoConvolve(
-          this, input_descriptor, input_data, filter_descriptor, filter_data,
-          convolution_descriptor, output_descriptor, output, scratch_allocator,
-          dnn::AlgorithmConfig(),
-          /*output_profile_result=*/nullptr));
-    } else {
-      SetErrorAndLogNoDnnSupport();
-    }
-  }
-  return *this;
-}
-
 Stream &Stream::ThenFusedConvolveWithAlgorithm(
     const dnn::BatchDescriptor &conv_input_descriptor,
     const DeviceMemory<double> &conv_input_data, double conv_input_scale,
@@ -758,10 +607,21 @@ Stream &Stream::ThenConvolveWithAlgorithm(
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      auto status = dnn->DoConvolve(
-          this, input_descriptor, input_data, filter_descriptor, filter_data,
-          convolution_descriptor, output_descriptor, output, scratch_allocator,
-          algorithm_config, output_profile_result);
+      DeviceMemory<uint8> scratch_memory;
+      dnn::AlgorithmDesc algorithm_desc;
+      auto status =
+          dnn->PrepareForConvolution(
+                 dnn::ConvolutionKind::FORWARD, this, input_descriptor,
+                 input_data, filter_descriptor, filter_data, output_descriptor,
+                 *output, convolution_descriptor, algorithm_config,
+                 scratch_allocator, &algorithm_desc, &scratch_memory)
+              .ok();
+      if (status) {
+        status = dnn->DoConvolve(
+            this, input_descriptor, input_data, filter_descriptor, filter_data,
+            convolution_descriptor, output_descriptor, output, algorithm_desc,
+            &scratch_memory, output_profile_result);
+      }
       if (!status && !output_profile_result) {
         SetError();
       }
@@ -789,10 +649,21 @@ Stream &Stream::ThenConvolveWithAlgorithm(
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      auto status = dnn->DoConvolve(
-          this, input_descriptor, input_data, filter_descriptor, filter_data,
-          convolution_descriptor, output_descriptor, output, scratch_allocator,
-          algorithm_config, output_profile_result);
+      DeviceMemory<uint8> scratch_memory;
+      dnn::AlgorithmDesc algorithm_desc;
+      auto status =
+          dnn->PrepareForConvolution(
+                 dnn::ConvolutionKind::FORWARD, this, input_descriptor,
+                 input_data, filter_descriptor, filter_data, output_descriptor,
+                 *output, convolution_descriptor, algorithm_config,
+                 scratch_allocator, &algorithm_desc, &scratch_memory)
+              .ok();
+      if (status) {
+        status = dnn->DoConvolve(
+            this, input_descriptor, input_data, filter_descriptor, filter_data,
+            convolution_descriptor, output_descriptor, output, algorithm_desc,
+            &scratch_memory, output_profile_result);
+      }
       if (!status && !output_profile_result) {
         SetError();
       }
@@ -820,10 +691,21 @@ Stream &Stream::ThenConvolveWithAlgorithm(
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      auto status = dnn->DoConvolve(
-          this, input_descriptor, input_data, filter_descriptor, filter_data,
-          convolution_descriptor, output_descriptor, output, scratch_allocator,
-          algorithm_config, output_profile_result);
+      DeviceMemory<uint8> scratch_memory;
+      dnn::AlgorithmDesc algorithm_desc;
+      auto status =
+          dnn->PrepareForConvolution(
+                 dnn::ConvolutionKind::FORWARD, this, input_descriptor,
+                 input_data, filter_descriptor, filter_data, output_descriptor,
+                 *output, convolution_descriptor, algorithm_config,
+                 scratch_allocator, &algorithm_desc, &scratch_memory)
+              .ok();
+      if (status) {
+        status = dnn->DoConvolve(
+            this, input_descriptor, input_data, filter_descriptor, filter_data,
+            convolution_descriptor, output_descriptor, output, algorithm_desc,
+            &scratch_memory, output_profile_result);
+      }
       if (!status && !output_profile_result) {
         SetError();
       }
@@ -834,24 +716,6 @@ Stream &Stream::ThenConvolveWithAlgorithm(
   return *this;
 }
 
-Stream &Stream::ThenFusedConvolve(
-    const dnn::BatchDescriptor &conv_input_descriptor,
-    const DeviceMemory<int8> &conv_input_data, float conv_input_scale,
-    const dnn::FilterDescriptor &filter_descriptor,
-    const DeviceMemory<int8> &filter_data,
-    const dnn::ConvolutionDescriptor &convolution_descriptor,
-    const DeviceMemory<int8> &side_input_data, float side_input_scale,
-    const dnn::BatchDescriptor &bias_descriptor,
-    const DeviceMemory<float> &biases, dnn::ActivationMode activation_mode,
-    const dnn::BatchDescriptor &output_descriptor, DeviceMemory<int8> *output) {
-  return ThenFusedConvolveWithScratch(
-      conv_input_descriptor, conv_input_data, conv_input_scale,
-      filter_descriptor, filter_data, convolution_descriptor, side_input_data,
-      side_input_scale, bias_descriptor, biases, activation_mode,
-      output_descriptor, output,
-      /*scratch_allocator=*/nullptr);
-}
-
 Stream &Stream::ThenConvolve(
     const dnn::BatchDescriptor &input_descriptor,
     const DeviceMemory<float> &input_data,
@@ -860,10 +724,11 @@ Stream &Stream::ThenConvolve(
     const dnn::ConvolutionDescriptor &convolution_descriptor,
     const dnn::BatchDescriptor &output_descriptor,
     DeviceMemory<float> *output) {
-  return ThenConvolveWithScratch(input_descriptor, input_data,
-                                 filter_descriptor, filter_data,
-                                 convolution_descriptor, output_descriptor,
-                                 output, /*scratch_allocator=*/nullptr);
+  return ThenConvolveWithAlgorithm(
+      input_descriptor, input_data, filter_descriptor, filter_data,
+      convolution_descriptor, output_descriptor, output,
+      /*scratch_allocator=*/nullptr, dnn::AlgorithmConfig(),
+      /*output_profile_result=*/nullptr);
 }
 
 Stream &Stream::ThenConvolveQuantized(
@@ -953,34 +818,6 @@ Stream &Stream::ThenSeparableConvolve(
   return *this;
 }
 
-Stream &Stream::ThenConvolveBackwardDataWithScratch(
-    const dnn::FilterDescriptor &filter_descriptor,
-    const DeviceMemory<float> &filter_data,
-    const dnn::BatchDescriptor &output_descriptor,
-    DeviceMemory<float> backward_output_data,
-    const dnn::ConvolutionDescriptor &convolution_descriptor,
-    const dnn::BatchDescriptor &input_descriptor,
-    DeviceMemory<float> *backward_input_data,
-    ScratchAllocator *scratch_allocator) {
-  VLOG_CALL(PARAM(filter_descriptor), PARAM(filter_data),
-            PARAM(output_descriptor), PARAM(backward_output_data),
-            PARAM(convolution_descriptor), PARAM(input_descriptor),
-            PARAM(backward_input_data));
-
-  if (ok()) {
-    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      CheckError(dnn->DoConvolveBackwardData(
-          this, filter_descriptor, filter_data, output_descriptor,
-          backward_output_data, convolution_descriptor, input_descriptor,
-          backward_input_data, scratch_allocator, dnn::AlgorithmConfig(),
-          /*output_profile_result=*/nullptr));
-    } else {
-      SetErrorAndLogNoDnnSupport();
-    }
-  }
-  return *this;
-}
-
 Stream &Stream::ThenConvolveBackwardDataWithAlgorithm(
     const dnn::FilterDescriptor &filter_descriptor,
     const DeviceMemory<double> &filter_data,
@@ -999,11 +836,23 @@ Stream &Stream::ThenConvolveBackwardDataWithAlgorithm(
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      auto status = dnn->DoConvolveBackwardData(
-          this, filter_descriptor, filter_data, output_descriptor,
-          backward_output_data, convolution_descriptor, input_descriptor,
-          backward_input_data, scratch_allocator, algorithm_config,
-          output_profile_result);
+      DeviceMemory<uint8> scratch_memory;
+      dnn::AlgorithmDesc algorithm_desc;
+      auto status =
+          dnn->PrepareForConvolution(
+                 dnn::ConvolutionKind::BACKWARD_DATA, this, input_descriptor,
+                 *backward_input_data, filter_descriptor, filter_data,
+                 output_descriptor, backward_output_data,
+                 convolution_descriptor, algorithm_config, scratch_allocator,
+                 &algorithm_desc, &scratch_memory)
+              .ok();
+      if (status) {
+        status = dnn->DoConvolveBackwardData(
+            this, filter_descriptor, filter_data, output_descriptor,
+            backward_output_data, convolution_descriptor, input_descriptor,
+            backward_input_data, algorithm_desc, &scratch_memory,
+            output_profile_result);
+      }
       if (!status && !output_profile_result) {
         SetError();
       }
@@ -1032,11 +881,23 @@ Stream &Stream::ThenConvolveBackwardDataWithAlgorithm(
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      auto status = dnn->DoConvolveBackwardData(
-          this, filter_descriptor, filter_data, output_descriptor,
-          backward_output_data, convolution_descriptor, input_descriptor,
-          backward_input_data, scratch_allocator, algorithm_config,
-          output_profile_result);
+      DeviceMemory<uint8> scratch_memory;
+      dnn::AlgorithmDesc algorithm_desc;
+      auto status =
+          dnn->PrepareForConvolution(
+                 dnn::ConvolutionKind::BACKWARD_DATA, this, input_descriptor,
+                 *backward_input_data, filter_descriptor, filter_data,
+                 output_descriptor, backward_output_data,
+                 convolution_descriptor, algorithm_config, scratch_allocator,
+                 &algorithm_desc, &scratch_memory)
+              .ok();
+      if (status) {
+        status = dnn->DoConvolveBackwardData(
+            this, filter_descriptor, filter_data, output_descriptor,
+            backward_output_data, convolution_descriptor, input_descriptor,
+            backward_input_data, algorithm_desc, &scratch_memory,
+            output_profile_result);
+      }
       if (!status && !output_profile_result) {
         SetError();
       }
@@ -1065,11 +926,23 @@ Stream &Stream::ThenConvolveBackwardDataWithAlgorithm(
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      auto status = dnn->DoConvolveBackwardData(
-          this, filter_descriptor, filter_data, output_descriptor,
-          backward_output_data, convolution_descriptor, input_descriptor,
-          backward_input_data, scratch_allocator, algorithm_config,
-          output_profile_result);
+      DeviceMemory<uint8> scratch_memory;
+      dnn::AlgorithmDesc algorithm_desc;
+      auto status =
+          dnn->PrepareForConvolution(
+                 dnn::ConvolutionKind::BACKWARD_DATA, this, input_descriptor,
+                 *backward_input_data, filter_descriptor, filter_data,
+                 output_descriptor, backward_output_data,
+                 convolution_descriptor, algorithm_config, scratch_allocator,
+                 &algorithm_desc, &scratch_memory)
+              .ok();
+      if (status) {
+        status = dnn->DoConvolveBackwardData(
+            this, filter_descriptor, filter_data, output_descriptor,
+            backward_output_data, convolution_descriptor, input_descriptor,
+            backward_input_data, algorithm_desc, &scratch_memory,
+            output_profile_result);
+      }
       if (!status && !output_profile_result) {
         SetError();
       }
@@ -1080,76 +953,6 @@ Stream &Stream::ThenConvolveBackwardDataWithAlgorithm(
   return *this;
 }
 
-Stream &Stream::ThenConvolveBackwardDataWithScratch(
-    const dnn::FilterDescriptor &filter_descriptor,
-    const DeviceMemory<Eigen::half> &filter_data,
-    const dnn::BatchDescriptor &output_descriptor,
-    DeviceMemory<Eigen::half> backward_output_data,
-    const dnn::ConvolutionDescriptor &convolution_descriptor,
-    const dnn::BatchDescriptor &input_descriptor,
-    DeviceMemory<Eigen::half> *backward_input_data,
-    ScratchAllocator *scratch_allocator) {
-  VLOG_CALL(PARAM(filter_descriptor), PARAM(filter_data),
-            PARAM(output_descriptor), PARAM(backward_output_data),
-            PARAM(convolution_descriptor), PARAM(input_descriptor),
-            PARAM(backward_input_data));
-
-  if (ok()) {
-    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      CheckError(dnn->DoConvolveBackwardData(
-          this, filter_descriptor, filter_data, output_descriptor,
-          backward_output_data, convolution_descriptor, input_descriptor,
-          backward_input_data, scratch_allocator, dnn::AlgorithmConfig(),
-          /*output_profile_result=*/nullptr));
-    } else {
-      SetErrorAndLogNoDnnSupport();
-    }
-  }
-  return *this;
-}
-
-Stream &Stream::ThenConvolveBackwardData(
-    const dnn::FilterDescriptor &filter_descriptor,
-    const DeviceMemory<float> &filter_data,
-    const dnn::BatchDescriptor &output_descriptor,
-    DeviceMemory<float> backward_output_data,
-    const dnn::ConvolutionDescriptor &convolution_descriptor,
-    const dnn::BatchDescriptor &input_descriptor,
-    DeviceMemory<float> *backward_input_data) {
-  return ThenConvolveBackwardDataWithScratch(
-      filter_descriptor, filter_data, output_descriptor, backward_output_data,
-      convolution_descriptor, input_descriptor, backward_input_data,
-      /*scratch_allocator=*/nullptr);
-}
-
-Stream &Stream::ThenConvolveBackwardFilterWithScratch(
-    const dnn::BatchDescriptor &input_descriptor,
-    const DeviceMemory<float> &input_data,
-    const dnn::BatchDescriptor &output_descriptor,
-    DeviceMemory<float> backward_output_data,
-    const dnn::ConvolutionDescriptor &convolution_descriptor,
-    const dnn::FilterDescriptor &filter_descriptor,
-    DeviceMemory<float> *backward_filter_data,
-    ScratchAllocator *scratch_allocator) {
-  VLOG_CALL(PARAM(input_descriptor), PARAM(input_data),
-            PARAM(output_descriptor), PARAM(backward_output_data),
-            PARAM(convolution_descriptor), PARAM(filter_descriptor),
-            PARAM(backward_filter_data));
-
-  if (ok()) {
-    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      CheckError(dnn->DoConvolveBackwardFilter(
-          this, input_descriptor, input_data, output_descriptor,
-          backward_output_data, convolution_descriptor, filter_descriptor,
-          backward_filter_data, scratch_allocator, dnn::AlgorithmConfig(),
-          /*output_profile_result=*/nullptr));
-    } else {
-      SetErrorAndLogNoDnnSupport();
-    }
-  }
-  return *this;
-}
-
 Stream &Stream::ThenConvolveBackwardFilterWithAlgorithm(
     const dnn::BatchDescriptor &input_descriptor,
     const DeviceMemory<double> &input_data,
@@ -1168,11 +971,23 @@ Stream &Stream::ThenConvolveBackwardFilterWithAlgorithm(
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      auto status = dnn->DoConvolveBackwardFilter(
-          this, input_descriptor, input_data, output_descriptor,
-          backward_output_data, convolution_descriptor, filter_descriptor,
-          backward_filter_data, scratch_allocator, algorithm_config,
-          output_profile_result);
+      DeviceMemory<uint8> scratch_memory;
+      dnn::AlgorithmDesc algorithm_desc;
+      auto status =
+          dnn->PrepareForConvolution(
+                 dnn::ConvolutionKind::BACKWARD_FILTER, this, input_descriptor,
+                 input_data, filter_descriptor, *backward_filter_data,
+                 output_descriptor, backward_output_data,
+                 convolution_descriptor, algorithm_config, scratch_allocator,
+                 &algorithm_desc, &scratch_memory)
+              .ok();
+      if (status) {
+        status = dnn->DoConvolveBackwardFilter(
+            this, input_descriptor, input_data, output_descriptor,
+            backward_output_data, convolution_descriptor, filter_descriptor,
+            backward_filter_data, algorithm_desc, &scratch_memory,
+            output_profile_result);
+      }
       if (!status && !output_profile_result) {
         SetError();
       }
@@ -1201,11 +1016,23 @@ Stream &Stream::ThenConvolveBackwardFilterWithAlgorithm(
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      auto status = dnn->DoConvolveBackwardFilter(
-          this, input_descriptor, input_data, output_descriptor,
-          backward_output_data, convolution_descriptor, filter_descriptor,
-          backward_filter_data, scratch_allocator, algorithm_config,
-          output_profile_result);
+      DeviceMemory<uint8> scratch_memory;
+      dnn::AlgorithmDesc algorithm_desc;
+      auto status =
+          dnn->PrepareForConvolution(
+                 dnn::ConvolutionKind::BACKWARD_FILTER, this, input_descriptor,
+                 input_data, filter_descriptor, *backward_filter_data,
+                 output_descriptor, backward_output_data,
+                 convolution_descriptor, algorithm_config, scratch_allocator,
+                 &algorithm_desc, &scratch_memory)
+              .ok();
+      if (status) {
+        status = dnn->DoConvolveBackwardFilter(
+            this, input_descriptor, input_data, output_descriptor,
+            backward_output_data, convolution_descriptor, filter_descriptor,
+            backward_filter_data, algorithm_desc, &scratch_memory,
+            output_profile_result);
+      }
       if (!status && !output_profile_result) {
         SetError();
       }
@@ -1216,34 +1043,6 @@ Stream &Stream::ThenConvolveBackwardFilterWithAlgorithm(
   return *this;
 }
 
-Stream &Stream::ThenConvolveBackwardFilterWithScratch(
-    const dnn::BatchDescriptor &input_descriptor,
-    const DeviceMemory<Eigen::half> &input_data,
-    const dnn::BatchDescriptor &output_descriptor,
-    DeviceMemory<Eigen::half> backward_output_data,
-    const dnn::ConvolutionDescriptor &convolution_descriptor,
-    const dnn::FilterDescriptor &filter_descriptor,
-    DeviceMemory<Eigen::half> *backward_filter_data,
-    ScratchAllocator *scratch_allocator) {
-  VLOG_CALL(PARAM(input_descriptor), PARAM(input_data),
-            PARAM(output_descriptor), PARAM(backward_output_data),
-            PARAM(convolution_descriptor), PARAM(filter_descriptor),
-            PARAM(backward_filter_data));
-
-  if (ok()) {
-    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      CheckError(dnn->DoConvolveBackwardFilter(
-          this, input_descriptor, input_data, output_descriptor,
-          backward_output_data, convolution_descriptor, filter_descriptor,
-          backward_filter_data, scratch_allocator, dnn::AlgorithmConfig(),
-          /*output_profile_result=*/nullptr));
-    } else {
-      SetErrorAndLogNoDnnSupport();
-    }
-  }
-  return *this;
-}
-
 Stream &Stream::ThenConvolveBackwardFilterWithAlgorithm(
     const dnn::BatchDescriptor &input_descriptor,
     const DeviceMemory<Eigen::half> &input_data,
@@ -1262,11 +1061,23 @@ Stream &Stream::ThenConvolveBackwardFilterWithAlgorithm(
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      auto status = dnn->DoConvolveBackwardFilter(
-          this, input_descriptor, input_data, output_descriptor,
-          backward_output_data, convolution_descriptor, filter_descriptor,
-          backward_filter_data, scratch_allocator, algorithm_config,
-          output_profile_result);
+      DeviceMemory<uint8> scratch_memory;
+      dnn::AlgorithmDesc algorithm_desc;
+      auto status =
+          dnn->PrepareForConvolution(
+                 dnn::ConvolutionKind::BACKWARD_FILTER, this, input_descriptor,
+                 input_data, filter_descriptor, *backward_filter_data,
+                 output_descriptor, backward_output_data,
+                 convolution_descriptor, algorithm_config, scratch_allocator,
+                 &algorithm_desc, &scratch_memory)
+              .ok();
+      if (status) {
+        status = dnn->DoConvolveBackwardFilter(
+            this, input_descriptor, input_data, output_descriptor,
+            backward_output_data, convolution_descriptor, filter_descriptor,
+            backward_filter_data, algorithm_desc, &scratch_memory,
+            output_profile_result);
+      }
       if (!status && !output_profile_result) {
         SetError();
       }
@@ -1277,20 +1088,6 @@ Stream &Stream::ThenConvolveBackwardFilterWithAlgorithm(
   return *this;
 }
 
-Stream &Stream::ThenConvolveBackwardFilter(
-    const dnn::BatchDescriptor &input_descriptor,
-    const DeviceMemory<float> &input_data,
-    const dnn::BatchDescriptor &output_descriptor,
-    DeviceMemory<float> backward_output_data,
-    const dnn::ConvolutionDescriptor &convolution_descriptor,
-    const dnn::FilterDescriptor &filter_descriptor,
-    DeviceMemory<float> *backward_filter_data) {
-  return ThenConvolveBackwardFilterWithScratch(
-      input_descriptor, input_data, output_descriptor, backward_output_data,
-      convolution_descriptor, filter_descriptor, backward_filter_data,
-      /*scratch_allocator=*/nullptr);
-}
-
 template <typename T>
 Stream &Stream::ThenConvolveBackwardBiasImpl(
     const dnn::BatchDescriptor &input_descriptor,
@@ -1490,6 +1287,28 @@ Stream &Stream::ThenPoolForward(
   return *this;
 }
 
+Stream &Stream::ThenPoolForward(
+    const dnn::PoolingDescriptor &pooling_dimensions,
+    const dnn::BatchDescriptor &input_dimensions,
+    const DeviceMemory<int8> &input_data,
+    const dnn::BatchDescriptor &output_dimensions,
+    DeviceMemory<int8> *output_data, ScratchAllocator *workspace_allocator) {
+  VLOG_CALL(PARAM(pooling_dimensions), PARAM(input_dimensions),
+            PARAM(input_data), PARAM(output_dimensions), PARAM(output_data),
+            PARAM(workspace_allocator));
+
+  if (ok()) {
+    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
+      CheckError(dnn->DoPoolForward(this, pooling_dimensions, input_dimensions,
+                                    input_data, output_dimensions, output_data,
+                                    workspace_allocator));
+    } else {
+      SetErrorAndLogNoDnnSupport();
+    }
+  }
+  return *this;
+}
+
 Stream &Stream::ThenPoolBackward(
     const dnn::PoolingDescriptor &pooling_dimensions,
     const dnn::BatchDescriptor &input_dimensions,
@@ -1574,22 +1393,6 @@ Stream &Stream::ThenPoolBackward(
   return *this;
 }
 
-Stream &Stream::ThenNormalize(
-    const dnn::NormalizeDescriptor &normalize_descriptor,
-    const DeviceMemory<float> &input_data, DeviceMemory<float> *output_data) {
-  VLOG_CALL(PARAM(normalize_descriptor), PARAM(input_data), PARAM(output_data));
-
-  if (ok()) {
-    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      CheckError(dnn->DoNormalize(this, normalize_descriptor, input_data,
-                                  output_data));
-    } else {
-      SetErrorAndLogNoDnnSupport();
-    }
-  }
-  return *this;
-}
-
 Stream &Stream::ThenNormalizeWithDimensions(
     const dnn::NormalizeDescriptor &normalize_descriptor,
     const dnn::BatchDescriptor &dimensions,
@@ -1932,36 +1735,6 @@ Stream &Stream::ThenMemcpyH2DQuantized(
   return *this;
 }
 
-Stream &Stream::ThenCopyHostBuffer2Device(
-    HostBuffer *buffer_src, DeviceMemory<float> *gpu_unquantized_dst) {
-  VLOG_CALL(PARAM(*buffer_src), PARAM(gpu_unquantized_dst));
-
-  if (ok()) {
-    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      CheckError(
-          dnn->DoCopyHostBuffer2Device(this, buffer_src, gpu_unquantized_dst));
-    } else {
-      SetErrorAndLogNoDnnSupport();
-    }
-  }
-  return *this;
-}
-
-Stream &Stream::ThenCopyDevice2HostBuffer(
-    const DeviceMemory<float> &gpu_unquantized_src, HostBuffer *buffer_dst) {
-  VLOG_CALL(PARAM(gpu_unquantized_src), PARAM(*buffer_dst));
-
-  if (ok()) {
-    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      CheckError(
-          dnn->DoCopyDevice2HostBuffer(this, gpu_unquantized_src, buffer_dst));
-    } else {
-      SetErrorAndLogNoDnnSupport();
-    }
-  }
-  return *this;
-}
-
 Stream *Stream::GetOrCreateSubStream() {
   mutex_lock lock(mu_);
 
@@ -5507,4 +5280,13 @@ string Stream::DebugStreamPointers() const {
                       ",impl=", ToVlogString(implementation_.get()), "]");
 }
 
+void Stream::CheckStatus(port::Status status) {
+  if (status.ok()) {
+    return;
+  }
+  LOG(ERROR) << status;
+  mutex_lock lock(mu_);
+  ok_ = false;
+}
+
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h
index 0fc90cf83d6b4e3e0ede84747f8149c1a25289ca..3e67d55922d1d4fa231b8f30caa95c2b782efbd8 100644
--- a/tensorflow/stream_executor/stream.h
+++ b/tensorflow/stream_executor/stream.h
@@ -109,6 +109,17 @@ class Stream {
   // stream.
   bool ok() const { return !InErrorState(); }
 
+  // Retrieves execution status back into the stream from the underlying
+  // implementation without blocking the stream.
+  //
+  // Normally, Stream::BlockHostUntilDone is used to get execution status.
+  // However, some devices use out-of-band mechnanisms to ensure their streams
+  // have finished on-device work, without needing to block the streams. (These
+  // devices should also override AllowsSyncOnCompletion to return false.) For
+  // these devices, this method can be used after work is finished to retrieve
+  // execution status.
+  port::Status RefreshStatus() LOCKS_EXCLUDED(mu_);
+
   // Initialize the stream. This must be performed before entraining any other
   // operations.
   Stream &Init() LOCKS_EXCLUDED(mu_);
@@ -262,19 +273,6 @@ class Stream {
       DeviceMemory<float> *scale_backprop,
       DeviceMemory<float> *offset_backprop);
 
-  // TODO(leary) add double-precision version of this interface.
-  Stream &ThenFusedConvolve(
-      const dnn::BatchDescriptor &conv_input_descriptor,
-      const DeviceMemory<int8> &conv_input_data, float conv_input_scale,
-      const dnn::FilterDescriptor &filter_descriptor,
-      const DeviceMemory<int8> &filter_data,
-      const dnn::ConvolutionDescriptor &convolution_descriptor,
-      const DeviceMemory<int8> &side_input_data, float side_input_scale,
-      const dnn::BatchDescriptor &bias_descriptor,
-      const DeviceMemory<float> &biases, dnn::ActivationMode activation_mode,
-      const dnn::BatchDescriptor &output_descriptor,
-      DeviceMemory<int8> *output);
-
   Stream &ThenConvolve(const dnn::BatchDescriptor &input_descriptor,
                        const DeviceMemory<float> &input_data,
                        const dnn::FilterDescriptor &filter_descriptor,
@@ -303,61 +301,6 @@ class Stream {
       const dnn::BatchDescriptor &output_descriptor,
       DeviceMemory<float> *output_data);
 
-  Stream &ThenFusedConvolveWithScratch(
-      const dnn::BatchDescriptor &conv_input_descriptor,
-      const DeviceMemory<int8> &conv_input_data, float conv_input_scale,
-      const dnn::FilterDescriptor &filter_descriptor,
-      const DeviceMemory<int8> &filter_data,
-      const dnn::ConvolutionDescriptor &convolution_descriptor,
-      const DeviceMemory<int8> &side_input_data, float side_input_scale,
-      const dnn::BatchDescriptor &bias_descriptor,
-      const DeviceMemory<float> &biases, dnn::ActivationMode activation_mode,
-      const dnn::BatchDescriptor &output_descriptor, DeviceMemory<int8> *output,
-      ScratchAllocator *scratch_allocator);
-
-  Stream &ThenFusedConvolveWithScratch(
-      const dnn::BatchDescriptor &conv_input_descriptor,
-      const DeviceMemory<Eigen::half> &conv_input_data, float conv_input_scale,
-      const dnn::FilterDescriptor &filter_descriptor,
-      const DeviceMemory<Eigen::half> &filter_data,
-      const dnn::ConvolutionDescriptor &convolution_descriptor,
-      const DeviceMemory<Eigen::half> &side_input_data, float side_input_scale,
-      const dnn::BatchDescriptor &bias_descriptor,
-      const DeviceMemory<Eigen::half> &biases,
-      dnn::ActivationMode activation_mode,
-      const dnn::BatchDescriptor &output_descriptor,
-      DeviceMemory<Eigen::half> *output, ScratchAllocator *scratch_allocator);
-
-  Stream &ThenFusedConvolveWithScratch(
-      const dnn::BatchDescriptor &conv_input_descriptor,
-      const DeviceMemory<float> &conv_input_data, float conv_input_scale,
-      const dnn::FilterDescriptor &filter_descriptor,
-      const DeviceMemory<float> &filter_data,
-      const dnn::ConvolutionDescriptor &convolution_descriptor,
-      const DeviceMemory<float> &side_input_data, float side_input_scale,
-      const dnn::BatchDescriptor &bias_descriptor,
-      const DeviceMemory<float> &biases, dnn::ActivationMode activation_mode,
-      const dnn::BatchDescriptor &output_descriptor,
-      DeviceMemory<float> *output, ScratchAllocator *scratch_allocator);
-
-  Stream &ThenConvolveWithScratch(
-      const dnn::BatchDescriptor &input_descriptor,
-      const DeviceMemory<Eigen::half> &input_data,
-      const dnn::FilterDescriptor &filter_descriptor,
-      const DeviceMemory<Eigen::half> &filter_data,
-      const dnn::ConvolutionDescriptor &convolution_descriptor,
-      const dnn::BatchDescriptor &output_descriptor,
-      DeviceMemory<Eigen::half> *output, ScratchAllocator *scratch_allocator);
-
-  Stream &ThenConvolveWithScratch(
-      const dnn::BatchDescriptor &input_descriptor,
-      const DeviceMemory<float> &input_data,
-      const dnn::FilterDescriptor &filter_descriptor,
-      const DeviceMemory<float> &filter_data,
-      const dnn::ConvolutionDescriptor &convolution_descriptor,
-      const dnn::BatchDescriptor &output_descriptor,
-      DeviceMemory<float> *output, ScratchAllocator *scratch_allocator);
-
   Stream &ThenConvolveWithAlgorithm(
       const dnn::BatchDescriptor &input_descriptor,
       const DeviceMemory<double> &input_data,
@@ -458,35 +401,6 @@ class Stream {
       const dnn::BatchDescriptor &output_descriptor,
       DeviceMemory<float> *output);
 
-  Stream &ThenConvolveBackwardData(
-      const dnn::FilterDescriptor &filter_descriptor,
-      const DeviceMemory<float> &filter_data,
-      const dnn::BatchDescriptor &output_descriptor,
-      DeviceMemory<float> backward_output_data,
-      const dnn::ConvolutionDescriptor &convolution_descriptor,
-      const dnn::BatchDescriptor &input_descriptor,
-      DeviceMemory<float> *backward_input_data);
-
-  Stream &ThenConvolveBackwardDataWithScratch(
-      const dnn::FilterDescriptor &filter_descriptor,
-      const DeviceMemory<float> &filter_data,
-      const dnn::BatchDescriptor &output_descriptor,
-      DeviceMemory<float> backward_output_data,
-      const dnn::ConvolutionDescriptor &convolution_descriptor,
-      const dnn::BatchDescriptor &input_descriptor,
-      DeviceMemory<float> *backward_input_data,
-      ScratchAllocator *scratch_allocator);
-
-  Stream &ThenConvolveBackwardDataWithScratch(
-      const dnn::FilterDescriptor &filter_descriptor,
-      const DeviceMemory<Eigen::half> &filter_data,
-      const dnn::BatchDescriptor &output_descriptor,
-      DeviceMemory<Eigen::half> backward_output_data,
-      const dnn::ConvolutionDescriptor &convolution_descriptor,
-      const dnn::BatchDescriptor &input_descriptor,
-      DeviceMemory<Eigen::half> *backward_input_data,
-      ScratchAllocator *scratch_allocator);
-
   Stream &ThenConvolveBackwardDataWithAlgorithm(
       const dnn::FilterDescriptor &filter_descriptor,
       const DeviceMemory<double> &filter_data,
@@ -523,35 +437,6 @@ class Stream {
       const dnn::AlgorithmConfig &algorithm_config,
       dnn::ProfileResult *output_profile_result);
 
-  Stream &ThenConvolveBackwardFilter(
-      const dnn::BatchDescriptor &input_descriptor,
-      const DeviceMemory<float> &input_data,
-      const dnn::BatchDescriptor &output_descriptor,
-      DeviceMemory<float> backward_output_data,
-      const dnn::ConvolutionDescriptor &convolution_descriptor,
-      const dnn::FilterDescriptor &filter_descriptor,
-      DeviceMemory<float> *backward_filter_data);
-
-  Stream &ThenConvolveBackwardFilterWithScratch(
-      const dnn::BatchDescriptor &input_descriptor,
-      const DeviceMemory<float> &input_data,
-      const dnn::BatchDescriptor &output_descriptor,
-      DeviceMemory<float> backward_output_data,
-      const dnn::ConvolutionDescriptor &convolution_descriptor,
-      const dnn::FilterDescriptor &filter_descriptor,
-      DeviceMemory<float> *backward_filter_data,
-      ScratchAllocator *scratch_allocator);
-
-  Stream &ThenConvolveBackwardFilterWithScratch(
-      const dnn::BatchDescriptor &input_descriptor,
-      const DeviceMemory<Eigen::half> &input_data,
-      const dnn::BatchDescriptor &output_descriptor,
-      DeviceMemory<Eigen::half> backward_output_data,
-      const dnn::ConvolutionDescriptor &convolution_descriptor,
-      const dnn::FilterDescriptor &filter_descriptor,
-      DeviceMemory<Eigen::half> *backward_filter_data,
-      ScratchAllocator *scratch_allocator);
-
   Stream &ThenConvolveBackwardFilterWithAlgorithm(
       const dnn::BatchDescriptor &input_descriptor,
       const DeviceMemory<double> &input_data,
@@ -650,6 +535,13 @@ class Stream {
                           DeviceMemory<Eigen::half> *output_data,
                           ScratchAllocator *workspace_allocator = nullptr);
 
+  Stream &ThenPoolForward(const dnn::PoolingDescriptor &pooling_dimensions,
+                          const dnn::BatchDescriptor &input_dimensions,
+                          const DeviceMemory<int8> &input_data,
+                          const dnn::BatchDescriptor &output_dimensions,
+                          DeviceMemory<int8> *output_data,
+                          ScratchAllocator *workspace_allocator = nullptr);
+
   Stream &ThenPoolBackward(const dnn::PoolingDescriptor &pooling_dimensions,
                            const dnn::BatchDescriptor &input_dimensions,
                            const DeviceMemory<double> &input_data,
@@ -677,12 +569,6 @@ class Stream {
                            DeviceMemory<Eigen::half> *output_diff_data,
                            ScratchAllocator *workspace_allocator = nullptr);
 
-  Stream &ThenNormalize(const dnn::NormalizeDescriptor &normalize_descriptor,
-                        const DeviceMemory<float> &input_data,
-                        DeviceMemory<float> *output_data);
-
-  // Similar to ThenNormalize, but normalizes across feature maps and allows for
-  // specifying the dimensions of the tensor.
   Stream &ThenNormalizeWithDimensions(
       const dnn::NormalizeDescriptor &normalize_descriptor,
       const dnn::BatchDescriptor &dimensions,
@@ -2092,6 +1978,9 @@ class Stream {
     ok_ = false;
   }
 
+  // Checks the status and logs the error message, if any.
+  void CheckStatus(port::Status status) LOCKS_EXCLUDED(mu_);
+
   void SetError() { CheckError(false /* = operation_retcode */); }
 
   void SetErrorAndLogNoDnnSupport() {
diff --git a/tensorflow/stream_executor/stream_executor_internal.cc b/tensorflow/stream_executor/stream_executor_internal.cc
index 341c6edccd3c1bfd314127c5356f03a15a85e1d3..46afedef3316bcd6b23c6f7b081af10db43d58f6 100644
--- a/tensorflow/stream_executor/stream_executor_internal.cc
+++ b/tensorflow/stream_executor/stream_executor_internal.cc
@@ -25,6 +25,13 @@ StreamExecutorFactory* MakeCUDAExecutorImplementation() {
   return &instance;
 }
 
+// -- ROCm
+
+StreamExecutorFactory* MakeROCMExecutorImplementation() {
+  static StreamExecutorFactory instance;
+  return &instance;
+}
+
 // -- OpenCL
 
 StreamExecutorFactory* MakeOpenCLExecutorImplementation() {
diff --git a/tensorflow/stream_executor/stream_executor_internal.h b/tensorflow/stream_executor/stream_executor_internal.h
index 0c2c33cfca227b2d67fcdc633dd94274a65b92bb..36eabda459cdec634de8542961d24942e943d4d1 100644
--- a/tensorflow/stream_executor/stream_executor_internal.h
+++ b/tensorflow/stream_executor/stream_executor_internal.h
@@ -27,6 +27,8 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/types/optional.h"
+#include "tensorflow/stream_executor/allocator_stats.h"
 #include "tensorflow/stream_executor/device_description.h"
 #include "tensorflow/stream_executor/device_memory.h"
 #include "tensorflow/stream_executor/device_options.h"
@@ -253,6 +255,10 @@ class StreamExecutorInterface {
   virtual bool StartTimer(Stream *stream, Timer *timer) = 0;
   virtual bool StopTimer(Stream *stream, Timer *timer) = 0;
   virtual port::Status BlockHostUntilDone(Stream *stream) = 0;
+  virtual port::Status GetStatus(Stream *stream) {
+    return port::Status(port::error::UNIMPLEMENTED,
+                        "GetStatus is not supported on this executor.");
+  }
   virtual int PlatformDeviceCount() = 0;
   virtual port::Status EnablePeerAccessTo(StreamExecutorInterface *other) = 0;
   virtual bool CanEnablePeerAccessTo(StreamExecutorInterface *other) = 0;
@@ -363,6 +369,11 @@ class StreamExecutorInterface {
   // as a platform.
   virtual void *GpuContextHack() { return nullptr; }
 
+  // Return allocator statistics.
+  virtual absl::optional<AllocatorStats> GetAllocatorStats() {
+    return absl::nullopt;
+  }
+
  private:
   SE_DISALLOW_COPY_AND_ASSIGN(StreamExecutorInterface);
 };
@@ -374,9 +385,11 @@ using StreamFactory = std::function<StreamInterface *(StreamExecutor *)>;
 using TimerFactory = std::function<TimerInterface *(StreamExecutor *)>;
 using KernelFactory = std::function<KernelInterface*()>;
 
-StreamExecutorFactory* MakeCUDAExecutorImplementation();
+StreamExecutorFactory *MakeCUDAExecutorImplementation();
+
+StreamExecutorFactory *MakeROCMExecutorImplementation();
 
-StreamExecutorFactory* MakeOpenCLExecutorImplementation();
+StreamExecutorFactory *MakeOpenCLExecutorImplementation();
 
 extern StreamExecutorFactory MakeHostExecutorImplementation;
 
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc
index d1d0bd9bc21e0cdf6d5bb3dc4fc58bc42b30378f..aae1efc7c4fa730da620b30c7536a8774a94bce2 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.cc
+++ b/tensorflow/stream_executor/stream_executor_pimpl.cc
@@ -71,6 +71,9 @@ internal::StreamExecutorInterface *StreamExecutorImplementationFromPlatformKind(
     case PlatformKind::kCuda:
       factory = *internal::MakeCUDAExecutorImplementation();
       break;
+    case PlatformKind::kROCm:
+      factory = *internal::MakeROCMExecutorImplementation();
+      break;
     case PlatformKind::kOpenCL:
       factory = *internal::MakeOpenCLExecutorImplementation();
       break;
@@ -188,10 +191,14 @@ StreamExecutor::StreamExecutor(
       memory_limit_bytes_(GetMemoryLimitBytes()) {
   if (port::Lowercase(platform_->Name()) == "cuda") {
     platform_kind_ = PlatformKind::kCuda;
+  } else if (port::Lowercase(platform_->Name()) == "rocm") {
+    platform_kind_ = PlatformKind::kROCm;
   } else if (port::Lowercase(platform_->Name()) == "opencl") {
     platform_kind_ = PlatformKind::kOpenCL;
   } else if (port::Lowercase(platform_->Name()) == "host") {
     platform_kind_ = PlatformKind::kHost;
+  } else {
+    platform_kind_ = PlatformKind::kInvalid;
   }
 }
 
@@ -389,7 +396,7 @@ StreamExecutor::createRnnDescriptor(
 }
 
 port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>>
-StreamExecutor::createRnnSequenceTensorDescriptor(int seq_length,
+StreamExecutor::createRnnSequenceTensorDescriptor(int max_seq_length,
                                                   int batch_size, int data_size,
                                                   dnn::DataType data_type) {
   dnn::DnnSupport *dnn_support = AsDnn();
@@ -397,8 +404,21 @@ StreamExecutor::createRnnSequenceTensorDescriptor(int seq_length,
     return port::Status(port::error::UNKNOWN,
                         "Fail to find the dnn implementation.");
   }
-  return dnn_support->createRnnSequenceTensorDescriptor(seq_length, batch_size,
-                                                        data_size, data_type);
+  return dnn_support->createRnnSequenceTensorDescriptor(
+      max_seq_length, batch_size, data_size, data_type);
+}
+
+port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>>
+StreamExecutor::createRnnSequenceTensorDescriptor(
+    int max_seq_length, int batch_size, int data_size,
+    const absl::Span<const int> &seq_lengths, dnn::DataType data_type) {
+  dnn::DnnSupport *dnn_support = AsDnn();
+  if (!dnn_support) {
+    return port::Status(port::error::UNKNOWN,
+                        "Fail to find the dnn implementation.");
+  }
+  return dnn_support->createRnnSequenceTensorDescriptor(
+      max_seq_length, batch_size, data_size, seq_lengths, data_type);
 }
 
 port::StatusOr<std::unique_ptr<dnn::RnnStateTensorDescriptor>>
@@ -472,6 +492,10 @@ port::Status StreamExecutor::BlockHostUntilDone(Stream *stream) {
   return result;
 }
 
+port::Status StreamExecutor::GetStatus(Stream *stream) {
+  return implementation_->GetStatus(stream);
+}
+
 void *StreamExecutor::Allocate(uint64 size) {
   if (memory_limit_bytes_ > 0 &&
       mem_alloc_bytes_ + size > memory_limit_bytes_) {
@@ -847,6 +871,10 @@ bool StreamExecutor::UnregisterTraceListener(TraceListener *listener) {
   return true;
 }
 
+absl::optional<AllocatorStats> StreamExecutor::GetAllocatorStats() {
+  return implementation_->GetAllocatorStats();
+}
+
 template <typename TraceCallT, typename... ArgsT>
 void StreamExecutor::SubmitTrace(TraceCallT trace_call, ArgsT &&... args) {
   if (tracing_enabled_) {
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h
index d259a4ab635660982e9308bbf8f934fc5950d909..09fe0a5c5df30a03c27e075902c847e2517c57e2 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.h
+++ b/tensorflow/stream_executor/stream_executor_pimpl.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/base/macros.h"
+#include "absl/types/optional.h"
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/lib/statusor.h"
 #include "tensorflow/stream_executor/lib/threadpool.h"
@@ -413,9 +414,15 @@ class StreamExecutor {
   // Create a RNN sequence descriptor that specifies either the input or output
   // sequence. The caller retains the ownership of the returned descriptor.
   port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>>
-  createRnnSequenceTensorDescriptor(int seq_length, int batch_size,
+  createRnnSequenceTensorDescriptor(int max_seq_length, int batch_size,
                                     int data_size, dnn::DataType data_type);
 
+  port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>>
+  createRnnSequenceTensorDescriptor(int max_seq_length, int batch_size,
+                                    int data_size,
+                                    const absl::Span<const int> &seq_lengths,
+                                    dnn::DataType data_type);
+
   // Create an RNN state descriptor that specifies the input or hidden state.
   // The caller retains the ownership of the returned descriptor.
   port::StatusOr<std::unique_ptr<dnn::RnnStateTensorDescriptor>>
@@ -479,6 +486,9 @@ class StreamExecutor {
   // previously registered.
   bool UnregisterTraceListener(TraceListener* listener);
 
+  // Return allocator statistics.
+  absl::optional<AllocatorStats> GetAllocatorStats();
+
  private:
   template <typename BeginCallT, typename CompleteCallT,
             typename ReturnT, typename... BeginArgsT>
@@ -518,6 +528,9 @@ class StreamExecutor {
   // operations enqueued on the stream before this program point.
   port::Status BlockHostUntilDone(Stream *stream);
 
+  // Without blocking the device, retrieve the current stream status.
+  port::Status GetStatus(Stream *stream);
+
   // Synchronously allocates size bytes on the underlying platform and returns
   // an opaque void* representing that allocation. In the case of failure,
   // nullptr is returned.
@@ -847,7 +860,7 @@ DeviceMemory<T> StreamExecutor::AllocateSubBuffer(DeviceMemory<T> *parent,
   }
   CreateAllocRecord(opaque, sizeof(T) * element_count);
   return DeviceMemory<T>(DeviceMemoryBase(opaque, sizeof(T) * element_count,
-                                    true /* = is_sub_buffer */));
+                                          true /* = is_sub_buffer */));
 }
 
 template <typename... Params, typename... Args>
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index ed1de5a31cae98bf5855fde0676162f0264d998e..a42d3c90183954b0616749e9b4b10b9a265001fb 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -9,6 +9,7 @@ load(
     "tf_additional_grpc_deps_py",
     "tf_additional_xla_deps_py",
     "tf_cuda_tests_tags",
+    "tf_exec_compatible_with",
     "tf_sycl_tests_tags",
 )
 load(
@@ -44,9 +45,22 @@ load(
     "//third_party/ngraph:build_defs.bzl",
     "if_ngraph",
 )
+
 def register_extension_info(**kwargs):
     pass
 
+def if_v2(a):
+    return select({
+        clean_dep("//tensorflow:api_version_2"): a,
+        "//conditions:default": [],
+    })
+
+def if_not_v2(a):
+    return select({
+        clean_dep("//tensorflow:api_version_2"): [],
+        "//conditions:default": a,
+    })
+
 # if_cuda_is_configured def placeholder
 
 def if_cuda_is_configured_compat(x):
@@ -84,6 +98,11 @@ def tf_android_core_proto_headers(core_proto_sources_relative):
         for p in core_proto_sources_relative
     ])
 
+# Wrapper for portable protos which currently just creates an empty rule.
+def tf_portable_proto_library(name, proto_deps, **kwargs):
+    _ignore = [kwargs]
+    native.cc_library(name = name, deps = proto_deps)
+
 # Sanitize a dependency so that it works correctly from code that includes
 # TensorFlow as a submodule.
 def clean_dep(dep):
@@ -133,6 +152,12 @@ def if_android(a):
         "//conditions:default": [],
     })
 
+def if_emscripten(a):
+    return select({
+        clean_dep("//tensorflow:emscripten"): a,
+        "//conditions:default": [],
+    })
+
 def if_ios(a):
     return select({
         clean_dep("//tensorflow:ios"): a,
@@ -293,9 +318,19 @@ def tf_opts_nortti_if_android():
 
 # LINT.ThenChange(//tensorflow/contrib/android/cmake/CMakeLists.txt)
 
+def tf_opts_nortti_if_emscripten():
+    return if_emscripten([
+        "-fno-rtti",
+        "-DGOOGLE_PROTOBUF_NO_RTTI",
+        "-DGOOGLE_PROTOBUF_NO_STATIC_INITIALIZER",
+    ])
+
 def tf_features_nomodules_if_android():
     return if_android(["-use_header_modules"])
 
+def tf_features_nomodules_if_emscripten():
+    return if_emscripten(["-use_header_modules"])
+
 # Given a list of "op_lib_names" (a list of files in the ops directory
 # without their .cc extensions), generate a library for that file.
 def tf_gen_op_libs(op_lib_names, deps = None, is_external = True):
@@ -351,16 +386,13 @@ def tf_binary_additional_srcs():
         ],
     )
 
-def _linux_kernel_dso_name(kernel_build_target):
-    """Given a build target, construct the dso name for linux."""
-    parts = kernel_build_target.split(":")
-    return "%s:libtfkernel_%s.so" % (parts[0], parts[1])
-
 # Helper functions to add kernel dependencies to tf binaries when using dynamic
 # kernel linking.
-def tf_binary_dynamic_kernel_dsos(kernels):
+def tf_binary_dynamic_kernel_dsos():
     return if_dynamic_kernels(
-        extra_deps = [_linux_kernel_dso_name(k) for k in kernels],
+        extra_deps = [
+            "//tensorflow/core/kernels:libtfkernel_all_kernels.so",
+        ],
         otherwise = [],
     )
 
@@ -384,9 +416,9 @@ def tf_cc_shared_object(
     native.cc_binary(
         name = name,
         srcs = srcs + framework_so,
-        deps = deps + tf_binary_dynamic_kernel_deps(kernels),
+        deps = deps,
         linkshared = 1,
-        data = data + tf_binary_dynamic_kernel_dsos(kernels),
+        data = data,
         linkopts = linkopts + _rpath_linkopts(name) + select({
             clean_dep("//tensorflow:darwin"): [
                 "-Wl,-install_name,@rpath/" + name.split("/")[-1],
@@ -417,6 +449,11 @@ def tf_cc_binary(
         copts = tf_copts(),
         kernels = [],
         **kwargs):
+    if kernels:
+        added_data_deps = tf_binary_dynamic_kernel_dsos()
+    else:
+        added_data_deps = []
+
     native.cc_binary(
         name = name,
         copts = copts,
@@ -426,7 +463,7 @@ def tf_cc_binary(
                 clean_dep("//third_party/mkl:intel_binary_blob"),
             ],
         ),
-        data = data + tf_binary_dynamic_kernel_dsos(kernels),
+        data = depset(data + added_data_deps),
         linkopts = linkopts + _rpath_linkopts(name),
         **kwargs
     )
@@ -578,6 +615,7 @@ def tf_gen_op_wrappers_cc(
             clean_dep("//tensorflow/core:core_cpu"),
             clean_dep("//tensorflow/core:framework"),
             clean_dep("//tensorflow/core:lib"),
+            clean_dep("//tensorflow/core:ops"),
             clean_dep("//tensorflow/core:protos_all_cc"),
         ]) + if_android([
             clean_dep("//tensorflow/core:android_tensorflow_lib"),
@@ -594,6 +632,7 @@ def tf_gen_op_wrappers_cc(
             clean_dep("//tensorflow/core:core_cpu"),
             clean_dep("//tensorflow/core:framework"),
             clean_dep("//tensorflow/core:lib"),
+            clean_dep("//tensorflow/core:ops"),
             clean_dep("//tensorflow/core:protos_all_cc"),
         ]) + if_android([
             clean_dep("//tensorflow/core:android_tensorflow_lib"),
@@ -772,7 +811,8 @@ def tf_cc_test(
                 clean_dep("//third_party/mkl:intel_binary_blob"),
             ],
         ),
-        data = data + tf_binary_dynamic_kernel_dsos(kernels),
+        data = data + tf_binary_dynamic_kernel_dsos(),
+        exec_compatible_with = tf_exec_compatible_with(kwargs),
         # Nested select() statements seem not to be supported when passed to
         # linkstatic, and we already have a cuda select() passed in to this
         # function.
@@ -885,6 +925,7 @@ def tf_cuda_only_cc_test(
         args = [],
         kernels = [],
         linkopts = []):
+    tags = tags + tf_cuda_tests_tags()
     native.cc_test(
         name = "%s%s" % (name, "_gpu"),
         srcs = srcs + tf_binary_additional_srcs(),
@@ -892,7 +933,7 @@ def tf_cuda_only_cc_test(
         args = args,
         copts = _cuda_copts() + rocm_copts() + tf_copts(),
         features = if_cuda(["-use_header_modules"]),
-        data = data + tf_binary_dynamic_kernel_dsos(kernels),
+        data = data + tf_binary_dynamic_kernel_dsos(),
         deps = deps + tf_binary_dynamic_kernel_deps(kernels) + if_cuda_is_configured([
             clean_dep("//tensorflow/core:cuda"),
             clean_dep("//tensorflow/core:gpu_lib"),
@@ -907,7 +948,8 @@ def tf_cuda_only_cc_test(
             clean_dep("//tensorflow:darwin"): 1,
             "//conditions:default": 0,
         }),
-        tags = tags + tf_cuda_tests_tags(),
+        tags = tags,
+        exec_compatible_with = tf_exec_compatible_with({"tags": tags}),
     )
 
 register_extension_info(
@@ -970,7 +1012,8 @@ def tf_cc_test_mkl(
                 ],
             }) + _rpath_linkopts(src_to_test_name(src)),
             deps = deps + tf_binary_dynamic_kernel_deps(kernels) + mkl_deps(),
-            data = data + tf_binary_dynamic_kernel_dsos(kernels),
+            data = data + tf_binary_dynamic_kernel_dsos(),
+            exec_compatible_with = tf_exec_compatible_with({"tags": tags}),
             linkstatic = linkstatic,
             tags = tags,
             size = size,
@@ -1023,7 +1066,7 @@ def tf_java_test(
     native.java_test(
         name = name,
         srcs = srcs,
-        deps = deps + tf_binary_additional_srcs() + tf_binary_dynamic_kernel_dsos(kernels) + tf_binary_dynamic_kernel_deps(kernels),
+        deps = deps + tf_binary_additional_srcs() + tf_binary_dynamic_kernel_dsos() + tf_binary_dynamic_kernel_deps(kernels),
         *args,
         **kwargs
     )
@@ -1115,7 +1158,7 @@ def tf_cuda_library(deps = None, cuda_deps = None, copts = tf_copts(), **kwargs)
     kwargs["features"] = kwargs.get("features", []) + ["-use_header_modules"]
     native.cc_library(
         deps = deps + if_cuda_is_configured_compat(cuda_deps + [
-            clean_dep("//tensorflow/core:cuda"),
+            clean_dep("//tensorflow/stream_executor/cuda:cudart_stub"),
             "@local_config_cuda//cuda:cuda_headers",
         ]) + if_rocm_is_configured(cuda_deps + [
             # rocm_header placeholder
@@ -1423,7 +1466,7 @@ def cc_header_only_library(name, deps = [], includes = [], extra_deps = [], **kw
 
 def tf_custom_op_library_additional_deps():
     return [
-      "@protobuf_archive//:protobuf_headers",
+        "@protobuf_archive//:protobuf_headers",
         clean_dep("//third_party/eigen3"),
         clean_dep("//tensorflow/core:framework_headers_lib"),
     ] + if_windows(["//tensorflow/python:pywrap_tensorflow_import_lib"])
@@ -1433,8 +1476,8 @@ def tf_custom_op_library_additional_deps():
 # exporting symbols from _pywrap_tensorflow.dll on Windows.
 def tf_custom_op_library_additional_deps_impl():
     return [
-      "@protobuf_archive//:protobuf",
-      "@nsync//:nsync_cpp",
+        "@protobuf_archive//:protobuf",
+        "@nsync//:nsync_cpp",
         # for //third_party/eigen3
         clean_dep("//third_party/eigen3"),
         # for //tensorflow/core:framework_headers_lib
@@ -1632,7 +1675,9 @@ def tf_py_wrap_cc(
         swig_includes = [],
         deps = [],
         copts = [],
+        version_script = None,
         **kwargs):
+    """Builds a Python extension module."""
     module_name = name.split("/")[-1]
 
     # Convert a rule name such as foo/bar/baz to foo/bar/_baz.so
@@ -1651,6 +1696,11 @@ def tf_py_wrap_cc(
         toolchain_deps = ["@bazel_tools//tools/cpp:current_cc_toolchain"],
         deps = deps + extra_deps,
     )
+    if not version_script:
+        version_script = select({
+            "@local_config_cuda//cuda:darwin": clean_dep("//tensorflow:tf_exported_symbols.lds"),
+            "//conditions:default": clean_dep("//tensorflow:tf_version_script.lds"),
+        })
     vscriptname = name + "_versionscript"
     _append_init_to_versionscript(
         name = vscriptname,
@@ -1659,15 +1709,11 @@ def tf_py_wrap_cc(
             "//conditions:default": True,
         }),
         module_name = module_name,
-        template_file = select({
-            "@local_config_cuda//cuda:darwin": clean_dep("//tensorflow:tf_exported_symbols.lds"),
-            "//conditions:default": clean_dep("//tensorflow:tf_version_script.lds"),
-        }),
+        template_file = version_script,
     )
     extra_linkopts = select({
         "@local_config_cuda//cuda:darwin": [
-            "-Wl,-exported_symbols_list",
-            "$(location %s.lds)" % vscriptname,
+            "-Wl,-exported_symbols_list,$(location %s.lds)" % vscriptname,
         ],
         clean_dep("//tensorflow:windows"): [],
         "//conditions:default": [
@@ -1740,7 +1786,8 @@ def py_test(deps = [], data = [], kernels = [], **kwargs):
         data = data + select({
             "//conditions:default": [],
             clean_dep("//tensorflow:no_tensorflow_py_deps"): ["//tensorflow/tools/pip_package:win_pip_package_marker"],
-        }) + tf_binary_dynamic_kernel_dsos(kernels),
+        }) + tf_binary_dynamic_kernel_dsos(),
+        exec_compatible_with = tf_exec_compatible_with(kwargs),
         **kwargs
     )
 
@@ -1782,10 +1829,20 @@ def tf_py_test(
         tags = [],
         shard_count = 1,
         additional_deps = [],
+        additional_visibility = [],
         kernels = [],
         flaky = 0,
+        xla_enable_strict_auto_jit = False,
         xla_enabled = False,
         grpc_enabled = False):
+    """Create one or more python tests with extra tensorflow dependencies."""
+    xla_test_true_list = []
+
+    # xla_enable_strict_auto_jit is used to run Tensorflow unit tests with all XLA compilable
+    # kernels compiled with XLA.
+    if xla_enable_strict_auto_jit:
+        xla_enabled = True
+        xla_test_true_list += ["//tensorflow/python:is_xla_test_true"]
     if xla_enabled:
         additional_deps = additional_deps + tf_additional_xla_deps_py()
     if grpc_enabled:
@@ -1802,11 +1859,11 @@ def tf_py_test(
         shard_count = shard_count,
         srcs_version = "PY2AND3",
         tags = tags,
-        visibility = [clean_dep("//tensorflow:internal")],
+        visibility = [clean_dep("//tensorflow:internal")] + additional_visibility,
         deps = [
             clean_dep("//tensorflow/python:extra_py_tests_deps"),
             clean_dep("//tensorflow/python:gradient_checker"),
-        ] + additional_deps,
+        ] + additional_deps + xla_test_true_list,
     )
 
 register_extension_info(
@@ -1826,8 +1883,12 @@ def cuda_py_test(
         kernels = [],
         tags = [],
         flaky = 0,
+        xla_enable_strict_auto_jit = False,
         xla_enabled = False,
         grpc_enabled = False):
+    # TODO(b/122522101): Don't ignore xla_enable_strict_auto_jit and enable additional
+    # XLA tests once enough compute resources are available.
+    _ignored = [xla_enable_strict_auto_jit]
     if main == None:
         main = name + ".py"
     for config in ["cpu", "gpu"]:
@@ -1850,6 +1911,7 @@ def cuda_py_test(
             shard_count = shard_count,
             tags = test_tags,
             xla_enabled = xla_enabled,
+            xla_enable_strict_auto_jit = False,
         )
 
 register_extension_info(
@@ -1903,6 +1965,7 @@ def py_tests(
         tags = [],
         shard_count = 1,
         prefix = "",
+        xla_enable_strict_auto_jit = False,
         xla_enabled = False,
         grpc_enabled = False):
     for src in srcs:
@@ -1921,6 +1984,7 @@ def py_tests(
             shard_count = shard_count,
             tags = tags,
             xla_enabled = xla_enabled,
+            xla_enable_strict_auto_jit = xla_enable_strict_auto_jit,
         )
 
 def cuda_py_tests(
@@ -1933,8 +1997,12 @@ def cuda_py_tests(
         shard_count = 1,
         tags = [],
         prefix = "",
+        xla_enable_strict_auto_jit = False,
         xla_enabled = False,
         grpc_enabled = False):
+    # TODO(b/122522101): Don't ignore xla_enable_strict_auto_jit and enable additional
+    # XLA tests once enough compute resources are available.
+    _ignored = [xla_enable_strict_auto_jit]
     test_tags = tags + tf_cuda_tests_tags()
     py_tests(
         name = name,
@@ -1948,6 +2016,7 @@ def cuda_py_tests(
         shard_count = shard_count,
         tags = test_tags,
         xla_enabled = xla_enabled,
+        xla_enable_strict_auto_jit = False,
     )
 
 # Creates a genrule named <name> for running tools/proto_text's generator to
diff --git a/tensorflow/tf_exported_symbols.lds b/tensorflow/tf_exported_symbols.lds
index 9f6114f503467fc12fcfb5dae07e75d2113e410d..04632330c56c69a359d2e8fad424a1fb5afff74b 100644
--- a/tensorflow/tf_exported_symbols.lds
+++ b/tensorflow/tf_exported_symbols.lds
@@ -4,5 +4,4 @@
 *TF_*
 *TFE_*
 *nsync_*
-*pywrap_xla*
 *stream_executor*
diff --git a/tensorflow/tf_version_script.lds b/tensorflow/tf_version_script.lds
index 39d258c3b7edd1f5f7d0805c080e832aa1d6109a..563d178de7396fbae6127d9dcfbfa8cf00c65038 100644
--- a/tensorflow/tf_version_script.lds
+++ b/tensorflow/tf_version_script.lds
@@ -5,7 +5,6 @@ tensorflow {
     *TF_*;
     *TFE_*;
     *nsync_*;
-    *pywrap_xla*;
     *stream_executor*;
   local:
     *;
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-aggregation-method.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-aggregation-method.pbtxt
index f79029d3fe0b88a454b11456b3785c3ae28a253c..cc2d5c87d667fb5c4af6b6fc435ae626334fe2d1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-aggregation-method.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-aggregation-method.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.AggregationMethod"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.gradients_impl.AggregationMethod\'>"
+  is_instance: "<class \'tensorflow.python.ops.gradients_util.AggregationMethod\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "ADD_N"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt
index a1083d732a1bb1b3212457f445323e5e868ef162..2e8ece122c572f3430afd073179c4d61f532303a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt
@@ -26,7 +26,19 @@ tf_proto {
       label: LABEL_OPTIONAL
       type: TYPE_BOOL
     }
-     reserved_range {
+    field {
+      name: "collective_deterministic_sequential_execution"
+      number: 6
+      label: LABEL_OPTIONAL
+      type: TYPE_BOOL
+    }
+    field {
+      name: "collective_nccl"
+      number: 7
+      label: LABEL_OPTIONAL
+      type: TYPE_BOOL
+    }
+    reserved_range {
       start: 2
       end: 3
     }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt
index b505d813509c2049fa6e3f60df553492d6f66613..9c7de2c5719350d6c30ac27c08712f326b014e3c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt
@@ -149,6 +149,18 @@ tf_proto {
         label: LABEL_OPTIONAL
         type: TYPE_BOOL
       }
+      field {
+        name: "collective_deterministic_sequential_execution"
+        number: 6
+        label: LABEL_OPTIONAL
+        type: TYPE_BOOL
+      }
+      field {
+        name: "collective_nccl"
+        number: 7
+        label: LABEL_OPTIONAL
+        type: TYPE_BOOL
+      }
       reserved_range {
         start: 2
         end: 3
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-critical-section.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-critical-section.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..024a20834635e2fc75ad7e6a393fc8f092d9631a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-critical-section.pbtxt
@@ -0,0 +1,17 @@
+path: "tensorflow.CriticalSection"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.critical_section_ops.CriticalSection\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'shared_name\', \'critical_section_def\', \'import_scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "execute"
+    argspec: "args=[\'self\', \'fn\', \'exclusive_resource_access\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt
index a2cc07483a4e10918891f555ca9459fb7503bb32..6c528dd16200e870ca860312defcb43155247979 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt
@@ -84,6 +84,18 @@ tf_proto {
         label: LABEL_OPTIONAL
         type: TYPE_STRING
       }
+      field {
+        name: "timestamped_allocator"
+        number: 5
+        label: LABEL_OPTIONAL
+        type: TYPE_BOOL
+      }
+      field {
+        name: "pending_cap"
+        number: 6
+        label: LABEL_OPTIONAL
+        type: TYPE_INT32
+      }
       nested_type {
         name: "VirtualDevices"
         field {
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-indexed-slices.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-indexed-slices.pbtxt
index fee84d85307dffb675b507a31c4f1fda60de869d..5b47c718a5753905a4fa426b739dad4b01678c3f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-indexed-slices.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-indexed-slices.pbtxt
@@ -2,6 +2,7 @@ path: "tensorflow.IndexedSlices"
 tf_class {
   is_instance: "<class \'tensorflow.python.framework.ops.IndexedSlices\'>"
   is_instance: "<class \'tensorflow.python.framework.ops._TensorLike\'>"
+  is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "dense_shape"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-meta-graph-def.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-meta-graph-def.pbtxt
index d71c2358c93e9597726665fdf8f92e648b2ea772..b453f7e9903bf66d19b06974da016c8c2971372d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-meta-graph-def.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-meta-graph-def.pbtxt
@@ -44,6 +44,13 @@ tf_proto {
       type: TYPE_MESSAGE
       type_name: ".tensorflow.AssetFileDef"
     }
+    field {
+      name: "object_graph_def"
+      number: 7
+      label: LABEL_OPTIONAL
+      type: TYPE_MESSAGE
+      type_name: ".tensorflow.SavedObjectGraph"
+    }
     nested_type {
       name: "MetaInfoDef"
       field {
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-module.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-module.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8d599d73b84fc51b8bf3001f6773011e42f09456
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-module.pbtxt
@@ -0,0 +1,35 @@
+path: "tensorflow.Module"
+tf_class {
+  is_instance: "<class \'tensorflow.python.module.module.Module\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.tracking.AutoTrackable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name_scope"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "submodules"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "no_name_scope"
+    argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-node-def.-experimental-debug-info.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-node-def.-experimental-debug-info.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..73483e2b6e2239dc35b25e2057b75a56ef010c3d
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-node-def.-experimental-debug-info.pbtxt
@@ -0,0 +1,12 @@
+path: "tensorflow.NodeDef.ExperimentalDebugInfo"
+tf_proto {
+  descriptor {
+    name: "ExperimentalDebugInfo"
+    field {
+      name: "original_node_names"
+      number: 1
+      label: LABEL_REPEATED
+      type: TYPE_STRING
+    }
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-node-def.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-node-def.pbtxt
index 646fa8abb9b22dbd908ff821cbe66a33ad02ba64..18548632c9cb1cc227aec6f893bfc487ef2cd864 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-node-def.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-node-def.pbtxt
@@ -33,6 +33,13 @@ tf_proto {
       type: TYPE_MESSAGE
       type_name: ".tensorflow.NodeDef.AttrEntry"
     }
+    field {
+      name: "experimental_debug_info"
+      number: 6
+      label: LABEL_OPTIONAL
+      type: TYPE_MESSAGE
+      type_name: ".tensorflow.NodeDef.ExperimentalDebugInfo"
+    }
     nested_type {
       name: "AttrEntry"
       field {
@@ -52,5 +59,14 @@ tf_proto {
         map_entry: true
       }
     }
+    nested_type {
+      name: "ExperimentalDebugInfo"
+      field {
+        name: "original_node_names"
+        number: 1
+        label: LABEL_REPEATED
+        type: TYPE_STRING
+      }
+    }
   }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-ragged-tensor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-ragged-tensor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..481a8c73ac351cc0ef38ee3681d5134f06334421
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-ragged-tensor.pbtxt
@@ -0,0 +1,126 @@
+path: "tensorflow.RaggedTensor"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.ragged.ragged_tensor.RaggedTensor\'>"
+  is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "flat_values"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "nested_row_splits"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "ragged_rank"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "row_splits"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "values"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'values\', \'row_splits\', \'cached_row_lengths\', \'cached_value_rowids\', \'cached_nrows\', \'internal\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "bounding_shape"
+    argspec: "args=[\'self\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "from_nested_row_lengths"
+    argspec: "args=[\'cls\', \'flat_values\', \'nested_row_lengths\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_nested_row_splits"
+    argspec: "args=[\'cls\', \'flat_values\', \'nested_row_splits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_nested_value_rowids"
+    argspec: "args=[\'cls\', \'flat_values\', \'nested_value_rowids\', \'nested_nrows\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "from_row_lengths"
+    argspec: "args=[\'cls\', \'values\', \'row_lengths\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_row_limits"
+    argspec: "args=[\'cls\', \'values\', \'row_limits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_row_splits"
+    argspec: "args=[\'cls\', \'values\', \'row_splits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_row_starts"
+    argspec: "args=[\'cls\', \'values\', \'row_starts\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_sparse"
+    argspec: "args=[\'cls\', \'st_input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_tensor"
+    argspec: "args=[\'cls\', \'tensor\', \'lengths\', \'padding\', \'ragged_rank\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\'], "
+  }
+  member_method {
+    name: "from_value_rowids"
+    argspec: "args=[\'cls\', \'values\', \'value_rowids\', \'nrows\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "nested_row_lengths"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "nrows"
+    argspec: "args=[\'self\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'None\'], "
+  }
+  member_method {
+    name: "row_lengths"
+    argspec: "args=[\'self\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
+  }
+  member_method {
+    name: "row_limits"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "row_starts"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "to_list"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "to_sparse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "to_tensor"
+    argspec: "args=[\'self\', \'default_value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "value_rowids"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "with_flat_values"
+    argspec: "args=[\'self\', \'new_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "with_values"
+    argspec: "args=[\'self\', \'new_values\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-run-metadata.-function-graphs.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-run-metadata.-function-graphs.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d2e2f583d21a30fd1e97fc20dadd58d3a62a5141
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-run-metadata.-function-graphs.pbtxt
@@ -0,0 +1,27 @@
+path: "tensorflow.RunMetadata.FunctionGraphs"
+tf_proto {
+  descriptor {
+    name: "FunctionGraphs"
+    field {
+      name: "partition_graphs"
+      number: 1
+      label: LABEL_REPEATED
+      type: TYPE_MESSAGE
+      type_name: ".tensorflow.GraphDef"
+    }
+    field {
+      name: "pre_optimization_graph"
+      number: 2
+      label: LABEL_OPTIONAL
+      type: TYPE_MESSAGE
+      type_name: ".tensorflow.GraphDef"
+    }
+    field {
+      name: "post_optimization_graph"
+      number: 3
+      label: LABEL_OPTIONAL
+      type: TYPE_MESSAGE
+      type_name: ".tensorflow.GraphDef"
+    }
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-run-metadata.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-run-metadata.pbtxt
index 1287940326c0196e76fff2cf6363622226092504..777b889745fefd69e628ea26d3ca0a0fefc743b4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-run-metadata.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-run-metadata.pbtxt
@@ -23,5 +23,36 @@ tf_proto {
       type: TYPE_MESSAGE
       type_name: ".tensorflow.GraphDef"
     }
+    field {
+      name: "function_graphs"
+      number: 4
+      label: LABEL_REPEATED
+      type: TYPE_MESSAGE
+      type_name: ".tensorflow.RunMetadata.FunctionGraphs"
+    }
+    nested_type {
+      name: "FunctionGraphs"
+      field {
+        name: "partition_graphs"
+        number: 1
+        label: LABEL_REPEATED
+        type: TYPE_MESSAGE
+        type_name: ".tensorflow.GraphDef"
+      }
+      field {
+        name: "pre_optimization_graph"
+        number: 2
+        label: LABEL_OPTIONAL
+        type: TYPE_MESSAGE
+        type_name: ".tensorflow.GraphDef"
+      }
+      field {
+        name: "post_optimization_graph"
+        number: 3
+        label: LABEL_OPTIONAL
+        type: TYPE_MESSAGE
+        type_name: ".tensorflow.GraphDef"
+      }
+    }
   }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-sparse-tensor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-sparse-tensor.pbtxt
index 3add49e90d7eb5094ad68d1474e834404549c988..64f7260369d7cbc656ad3d23b69cc9079e030f95 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-sparse-tensor.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-sparse-tensor.pbtxt
@@ -2,6 +2,7 @@ path: "tensorflow.SparseTensor"
 tf_class {
   is_instance: "<class \'tensorflow.python.framework.sparse_tensor.SparseTensor\'>"
   is_instance: "<class \'tensorflow.python.framework.ops._TensorLike\'>"
+  is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "dense_shape"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-tensor-shape.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-tensor-shape.pbtxt
index d11e927bd55cea52d0dbdfd4b28b2c1bc24fdaa5..60518ffadc833b0ab07e25c9b738aed7f7e08f20 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-tensor-shape.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-tensor-shape.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.TensorShape"
 tf_class {
-  is_instance: "<class \'tensorflow.python.framework.tensor_shape.TensorShapeV1\'>"
+  is_instance: "<class \'tensorflow.python.framework.tensor_shape.TensorShape\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "dims"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt
index 62d8ea9208f7f5f031b80be168cedfd538f18a22..9a4363829c20d10aeaed28abfa2146355ba12f46 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.Variable"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.variables.VariableV1\'>"
   is_instance: "<class \'tensorflow.python.ops.variables.Variable\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "SaveSliceInfo"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.audio.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.audio.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6c5724078357125255acd413902c4a5e57cb719e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.audio.pbtxt
@@ -0,0 +1,11 @@
+path: "tensorflow.audio"
+tf_module {
+  member_method {
+    name: "decode_wav"
+    argspec: "args=[\'contents\', \'desired_channels\', \'desired_samples\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], "
+  }
+  member_method {
+    name: "encode_wav"
+    argspec: "args=[\'audio\', \'sample_rate\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.autograph.experimental.-feature.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.autograph.experimental.-feature.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1f04d028efdc895e493c9e60e1c9025fc26de4f3
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.autograph.experimental.-feature.pbtxt
@@ -0,0 +1,36 @@
+path: "tensorflow.autograph.experimental.Feature"
+tf_class {
+  is_instance: "<enum \'Feature\'>"
+  member {
+    name: "ALL"
+    mtype: "<enum \'Feature\'>"
+  }
+  member {
+    name: "ASSERT_STATEMENTS"
+    mtype: "<enum \'Feature\'>"
+  }
+  member {
+    name: "AUTO_CONTROL_DEPS"
+    mtype: "<enum \'Feature\'>"
+  }
+  member {
+    name: "BUILTIN_FUNCTIONS"
+    mtype: "<enum \'Feature\'>"
+  }
+  member {
+    name: "ERROR_REWRITING"
+    mtype: "<enum \'Feature\'>"
+  }
+  member {
+    name: "LISTS"
+    mtype: "<enum \'Feature\'>"
+  }
+  member {
+    name: "LOGICAL_EXPRESSIONS"
+    mtype: "<enum \'Feature\'>"
+  }
+  member {
+    name: "NAME_SCOPES"
+    mtype: "<enum \'Feature\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.autograph.experimental.-verbosity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.autograph.experimental.-verbosity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c4d5b77c0738feb1fa6ea69672ee3fafa51de5be
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.autograph.experimental.-verbosity.pbtxt
@@ -0,0 +1,12 @@
+path: "tensorflow.autograph.experimental.Verbosity"
+tf_class {
+  is_instance: "<enum \'Verbosity\'>"
+  member {
+    name: "BRIEF"
+    mtype: "<enum \'Verbosity\'>"
+  }
+  member {
+    name: "VERBOSE"
+    mtype: "<enum \'Verbosity\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.autograph.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.autograph.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5747dac7ab201443d1f237415cd280aee672a8ff
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.autograph.experimental.pbtxt
@@ -0,0 +1,11 @@
+path: "tensorflow.autograph.experimental"
+tf_module {
+  member {
+    name: "Feature"
+    mtype: "<class \'enum.EnumMeta\'>"
+  }
+  member {
+    name: "Verbosity"
+    mtype: "<class \'enum.EnumMeta\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.autograph.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.autograph.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0baf6e03552f5b12e5f2e48f87cf1ec7332787bb
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.autograph.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.autograph"
+tf_module {
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
+  }
+  member_method {
+    name: "set_verbosity"
+    argspec: "args=[\'level\', \'alsologtostdout\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "to_code"
+    argspec: "args=[\'entity\', \'recursive\', \'arg_values\', \'arg_types\', \'indentation\', \'experimental_optional_features\', \'experimental_partial_types\'], varargs=None, keywords=None, defaults=[\'True\', \'None\', \'None\', \'  \', \'Feature.ALL\', \'None\'], "
+  }
+  member_method {
+    name: "to_graph"
+    argspec: "args=[\'entity\', \'recursive\', \'arg_values\', \'arg_types\', \'experimental_optional_features\', \'experimental_strip_decorators\', \'experimental_verbose\', \'experimental_partial_types\'], varargs=None, keywords=None, defaults=[\'True\', \'None\', \'None\', \'Feature.ALL\', \'None\', \'Verbosity.BRIEF\', \'None\'], "
+  }
+  member_method {
+    name: "trace"
+    argspec: "args=[], varargs=args, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.compat.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.compat.pbtxt
index f1d760603e981a0b9a72fdc379dc81932ac71d67..95352dff3a6b9341857b3f3b82dcb6817e4553ab 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.compat.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.compat.pbtxt
@@ -32,6 +32,14 @@ tf_module {
     name: "as_text"
     argspec: "args=[\'bytes_or_text\', \'encoding\'], varargs=None, keywords=None, defaults=[\'utf-8\'], "
   }
+  member_method {
+    name: "dimension_at_index"
+    argspec: "args=[\'shape\', \'index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "dimension_value"
+    argspec: "args=[\'dimension\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "forward_compatibility_horizon"
     argspec: "args=[\'year\', \'month\', \'day\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.config.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.config.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..09076a8302610d769e811d8e34333d946d51bc8c
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.config.pbtxt
@@ -0,0 +1,15 @@
+path: "tensorflow.config"
+tf_module {
+  member_method {
+    name: "experimental_connect_to_host"
+    argspec: "args=[\'remote_host\', \'job_name\'], varargs=None, keywords=None, defaults=[\'None\', \'worker\'], "
+  }
+  member_method {
+    name: "experimental_list_devices"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_run_functions_eagerly"
+    argspec: "args=[\'run_eagerly\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt
index f7d388d33d050eac2c9f14682bc7068c745a46bc..1c55f81bed19929276cc17d31d333ff53712bc8f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt
@@ -39,6 +39,10 @@ tf_class {
     name: "filter"
     argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "filter_with_legacy_function"
+    argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "flat_map"
     argspec: "args=[\'self\', \'map_func\'], varargs=None, keywords=None, defaults=None"
@@ -79,6 +83,10 @@ tf_class {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "map_with_legacy_function"
+    argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "options"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt
index d73168b070e374a749a00f74b24b77a715d2f37e..5488449044a021ea58c92a028b109f701f6f1b79 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt
@@ -41,6 +41,10 @@ tf_class {
     name: "filter"
     argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "filter_with_legacy_function"
+    argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "flat_map"
     argspec: "args=[\'self\', \'map_func\'], varargs=None, keywords=None, defaults=None"
@@ -81,6 +85,10 @@ tf_class {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "map_with_legacy_function"
+    argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "options"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-iterator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-iterator.pbtxt
index 4f0147a52381c748eccbfee29df0d3537ba5d14a..87af1123d77a22a362abced5605beddf12395723 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-iterator.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-iterator.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.data.Iterator"
 tf_class {
   is_instance: "<class \'tensorflow.python.data.ops.iterator_ops.Iterator\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "initializer"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt
index 51224cd6b45f0a1efdfbb3ba6a3ca377d37fd00b..0927dd01a3f4629f6caf39f097085547638f0cf1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt
@@ -41,6 +41,10 @@ tf_class {
     name: "filter"
     argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "filter_with_legacy_function"
+    argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "flat_map"
     argspec: "args=[\'self\', \'map_func\'], varargs=None, keywords=None, defaults=None"
@@ -81,6 +85,10 @@ tf_class {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "map_with_legacy_function"
+    argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "options"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt
index a10add1b7e38f9875e699903b3e3c103d73e647e..bab1e399210ea20ffcaaa417a4f709d74c5e8a00 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt
@@ -41,6 +41,10 @@ tf_class {
     name: "filter"
     argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "filter_with_legacy_function"
+    argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "flat_map"
     argspec: "args=[\'self\', \'map_func\'], varargs=None, keywords=None, defaults=None"
@@ -81,6 +85,10 @@ tf_class {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "map_with_legacy_function"
+    argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "options"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt
index 71b597c19c512879b8f18b34843b160efecc6bec..68cf02333f05aa7f068686773ff3ce5d2f0d3ca5 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt
@@ -41,6 +41,10 @@ tf_class {
     name: "filter"
     argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "filter_with_legacy_function"
+    argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "flat_map"
     argspec: "args=[\'self\', \'map_func\'], varargs=None, keywords=None, defaults=None"
@@ -81,6 +85,10 @@ tf_class {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "map_with_legacy_function"
+    argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "options"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt
index 20646e87b5fbe23d89ad31ca632a64bf958339f6..6d3f88eded4a055b7047886c3245f28866ec59ee 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt
@@ -41,6 +41,10 @@ tf_class {
     name: "filter"
     argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "filter_with_legacy_function"
+    argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "flat_map"
     argspec: "args=[\'self\', \'map_func\'], varargs=None, keywords=None, defaults=None"
@@ -81,6 +85,10 @@ tf_class {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "map_with_legacy_function"
+    argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "options"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt
index 86c5ff5b0bd7b42d61a92a44c8888852a48677be..bc4943e73788c8c59669e8f2fe1145e3d1c1fb01 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt
@@ -41,6 +41,10 @@ tf_class {
     name: "filter"
     argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "filter_with_legacy_function"
+    argspec: "args=[\'self\', \'predicate\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "flat_map"
     argspec: "args=[\'self\', \'map_func\'], varargs=None, keywords=None, defaults=None"
@@ -81,6 +85,10 @@ tf_class {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "map_with_legacy_function"
+    argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "options"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt
index 2d115904925eb96164484300baf628d41d3fcff4..0e3999c0f4618f2b09a23ea0a32b5ae89990258d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt
@@ -86,7 +86,7 @@ tf_module {
   }
   member_method {
     name: "bucket_by_sequence_length"
-    argspec: "args=[\'element_length_func\', \'bucket_boundaries\', \'bucket_batch_sizes\', \'padded_shapes\', \'padding_values\', \'pad_to_bucket_boundary\', \'no_padding\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'False\'], "
+    argspec: "args=[\'element_length_func\', \'bucket_boundaries\', \'bucket_batch_sizes\', \'padded_shapes\', \'padding_values\', \'pad_to_bucket_boundary\', \'no_padding\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'False\', \'False\'], "
   }
   member_method {
     name: "cardinality"
@@ -142,7 +142,7 @@ tf_module {
   }
   member_method {
     name: "make_csv_dataset"
-    argspec: "args=[\'file_pattern\', \'batch_size\', \'column_names\', \'column_defaults\', \'label_name\', \'select_columns\', \'field_delim\', \'use_quote_delim\', \'na_value\', \'header\', \'num_epochs\', \'shuffle\', \'shuffle_buffer_size\', \'shuffle_seed\', \'prefetch_buffer_size\', \'num_parallel_reads\', \'sloppy\', \'num_rows_for_inference\', \'compression_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \',\', \'True\', \'\', \'True\', \'None\', \'True\', \'10000\', \'None\', \'-1\', \'1\', \'False\', \'100\', \'None\'], "
+    argspec: "args=[\'file_pattern\', \'batch_size\', \'column_names\', \'column_defaults\', \'label_name\', \'select_columns\', \'field_delim\', \'use_quote_delim\', \'na_value\', \'header\', \'num_epochs\', \'shuffle\', \'shuffle_buffer_size\', \'shuffle_seed\', \'prefetch_buffer_size\', \'num_parallel_reads\', \'sloppy\', \'num_rows_for_inference\', \'compression_type\', \'ignore_errors\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \',\', \'True\', \'\', \'True\', \'None\', \'True\', \'10000\', \'None\', \'-1\', \'1\', \'False\', \'100\', \'None\', \'False\'], "
   }
   member_method {
     name: "make_saveable_from_iterator"
@@ -152,6 +152,10 @@ tf_module {
     name: "map_and_batch"
     argspec: "args=[\'map_func\', \'batch_size\', \'num_parallel_batches\', \'drop_remainder\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
+  member_method {
+    name: "map_and_batch_with_legacy_function"
+    argspec: "args=[\'map_func\', \'batch_size\', \'num_parallel_batches\', \'drop_remainder\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "parallel_interleave"
     argspec: "args=[\'map_func\', \'cycle_length\', \'block_length\', \'sloppy\', \'buffer_output_elements\', \'prefetch_input_elements\'], varargs=None, keywords=None, defaults=[\'1\', \'False\', \'None\', \'None\'], "
@@ -180,6 +184,10 @@ tf_module {
     name: "shuffle_and_repeat"
     argspec: "args=[\'buffer_size\', \'count\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
+  member_method {
+    name: "take_while"
+    argspec: "args=[\'predicate\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "unbatch"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt
index aa474680592a1a3996ca3db970b814ba167cd801..272963382a009c837427176859994f5c603a05a2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt
@@ -30,7 +30,7 @@ tf_module {
   }
   member_method {
     name: "make_initializable_iterator"
-    argspec: "args=[\'dataset\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'dataset\', \'shared_name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "make_one_shot_iterator"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.debugging.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.debugging.pbtxt
index 8a7f1e9363b8211d83d39d31da11507cb4c805eb..bf6816a0ab83a89b961618b037965c0f4861da1c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.debugging.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.debugging.pbtxt
@@ -6,7 +6,7 @@ tf_module {
   }
   member_method {
     name: "assert_all_finite"
-    argspec: "args=[\'t\', \'msg\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'t\', \'msg\', \'name\', \'x\', \'message\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_equal"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-cross-device-ops.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-cross-device-ops.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a2ea23432416ee5f012e6f0c725d5f57841400f7
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-cross-device-ops.pbtxt
@@ -0,0 +1,33 @@
+path: "tensorflow.distribute.CrossDeviceOps"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.CrossDeviceOps\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast_implementation"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-hierarchical-copy-all-reduce.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-hierarchical-copy-all-reduce.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a38c4b21d563f0ef67eb2b2614fae678f0d97dce
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-hierarchical-copy-all-reduce.pbtxt
@@ -0,0 +1,35 @@
+path: "tensorflow.distribute.HierarchicalCopyAllReduce"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.HierarchicalCopyAllReduce\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.AllReduceCrossDeviceOps\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.CrossDeviceOps\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_packs\'], varargs=None, keywords=None, defaults=[\'1\'], "
+  }
+  member_method {
+    name: "batch_reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast_implementation"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-mirrored-strategy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-mirrored-strategy.pbtxt
index a613e2d3d1dcefacdf0ec336587a46ff7e0bcb90..89748f7713fd813ab56d0e07780da33ca8ff14bc 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-mirrored-strategy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-mirrored-strategy.pbtxt
@@ -3,10 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.distribute.mirrored_strategy.MirroredStrategy\'>"
   is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategy\'>"
   is_instance: "<type \'object\'>"
-  member {
-    name: "between_graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "extended"
     mtype: "<type \'property\'>"
@@ -15,45 +11,13 @@ tf_class {
     name: "num_replicas_in_sync"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "parameter_devices"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "require_static_shapes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "should_checkpoint"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "should_init"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "should_save_summary"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "worker_devices"
-    mtype: "<type \'property\'>"
-  }
   member_method {
     name: "__init__"
     argspec: "args=[\'self\', \'devices\', \'cross_device_ops\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "batch_reduce"
-    argspec: "args=[\'self\', \'aggregation\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "broadcast"
-    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "call_for_each_replica"
-    argspec: "args=[\'self\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'tensor\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "colocate_vars_with"
@@ -64,29 +28,17 @@ tf_class {
     argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "distribute_dataset"
-    argspec: "args=[\'self\', \'dataset_fn\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "experimental_finalize"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "experimental_initialize"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+    name: "experimental_make_numpy_iterator"
+    argspec: "args=[\'self\', \'numpy_input\', \'batch_size\', \'num_epochs\', \'shuffle\', \'session\'], varargs=None, keywords=None, defaults=[\'1\', \'1024\', \'None\'], "
   }
   member_method {
-    name: "finalize"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+    name: "experimental_run"
+    argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "group"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "initialize"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "make_dataset_iterator"
     argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
@@ -95,22 +47,10 @@ tf_class {
     name: "make_input_fn_iterator"
     argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
   }
-  member_method {
-    name: "non_slot_devices"
-    argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "read_var"
-    argspec: "args=[\'self\', \'v\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "reduce"
     argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "run_steps_on_dataset"
-    argspec: "args=[\'self\', \'fn\', \'iterator\', \'iterations\', \'initial_loop_values\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
-  }
   member_method {
     name: "scope"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -119,20 +59,8 @@ tf_class {
     name: "unwrap"
     argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "update"
-    argspec: "args=[\'self\', \'var\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
-  }
   member_method {
     name: "update_config_proto"
     argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "update_non_slot"
-    argspec: "args=[\'self\', \'colocate_with\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "value_container"
-    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
-  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-nccl-all-reduce.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-nccl-all-reduce.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bdc09bcd84b96ee8475d3ef87ec5be686fc449ec
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-nccl-all-reduce.pbtxt
@@ -0,0 +1,35 @@
+path: "tensorflow.distribute.NcclAllReduce"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.NcclAllReduce\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.AllReduceCrossDeviceOps\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.CrossDeviceOps\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_packs\'], varargs=None, keywords=None, defaults=[\'1\'], "
+  }
+  member_method {
+    name: "batch_reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast_implementation"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-one-device-strategy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-one-device-strategy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5d7943af2dc82e59b4ccd34816f9ecb6fd77dae9
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-one-device-strategy.pbtxt
@@ -0,0 +1,66 @@
+path: "tensorflow.distribute.OneDeviceStrategy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.one_device_strategy.OneDeviceStrategy\'>"
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategy\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "extended"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'device\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "colocate_vars_with"
+    argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "configure"
+    argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_make_numpy_iterator"
+    argspec: "args=[\'self\', \'numpy_input\', \'batch_size\', \'num_epochs\', \'shuffle\', \'session\'], varargs=None, keywords=None, defaults=[\'1\', \'1024\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_run"
+    argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "group"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "make_dataset_iterator"
+    argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_input_fn_iterator"
+    argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "scope"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "unwrap"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_config_proto"
+    argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-reduction-to-one-device.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-reduction-to-one-device.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f5ade9f86ba24779061bce3aa3e6f019d26741aa
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-reduction-to-one-device.pbtxt
@@ -0,0 +1,34 @@
+path: "tensorflow.distribute.ReductionToOneDevice"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.ReductionToOneDevice\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.CrossDeviceOps\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduce_to_device\', \'accumulation_fn\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "batch_reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast_implementation"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-replica-context.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-replica-context.pbtxt
index df707e8920e4488ed6b40a7f93f56b5624188c84..c3b7991175769f473acf929d656cd52ccca7bf4f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-replica-context.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-replica-context.pbtxt
@@ -6,10 +6,6 @@ tf_class {
     name: "devices"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "distribution_strategy"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "num_replicas_in_sync"
     mtype: "<type \'property\'>"
@@ -26,6 +22,10 @@ tf_class {
     name: "__init__"
     argspec: "args=[\'self\', \'strategy\', \'replica_id_in_sync_group\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "all_reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "merge_call"
     argspec: "args=[\'self\', \'merge_fn\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'()\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-server.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-server.pbtxt
similarity index 96%
rename from tensorflow/tools/api/golden/v2/tensorflow.train.-server.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.distribute.-server.pbtxt
index 9b8f185f5b699e860c6fbb50b8d2912984908982..6c39bf4fc4099a753ceee4de0df990a887d2ab4e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-server.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-server.pbtxt
@@ -1,4 +1,4 @@
-path: "tensorflow.train.Server"
+path: "tensorflow.distribute.Server"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.server_lib.Server\'>"
   is_instance: "<type \'object\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy-extended.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy-extended.pbtxt
index 77706e57133e1186d9e98fcf9205ed4c91772eda..5c4f09075316150b3118f048091d3c68a60a232d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy-extended.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy-extended.pbtxt
@@ -50,6 +50,10 @@ tf_class {
     name: "colocate_vars_with"
     argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "experimental_make_numpy_dataset"
+    argspec: "args=[\'self\', \'numpy_input\', \'session\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "experimental_run_steps_on_iterator"
     argspec: "args=[\'self\', \'fn\', \'iterator\', \'iterations\', \'initial_loop_values\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
@@ -78,4 +82,8 @@ tf_class {
     name: "value_container"
     argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "variable_created_in_scope"
+    argspec: "args=[\'self\', \'v\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy.pbtxt
index 9eb73d2c0d9069ec4b818abe1825503f0ea36fc9..6ed49d339d7af7b2d05dfa57121805a7dce48090 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy.pbtxt
@@ -2,10 +2,6 @@ path: "tensorflow.distribute.Strategy"
 tf_class {
   is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategy\'>"
   is_instance: "<type \'object\'>"
-  member {
-    name: "between_graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "extended"
     mtype: "<type \'property\'>"
@@ -14,45 +10,13 @@ tf_class {
     name: "num_replicas_in_sync"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "parameter_devices"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "require_static_shapes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "should_checkpoint"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "should_init"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "should_save_summary"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "worker_devices"
-    mtype: "<type \'property\'>"
-  }
   member_method {
     name: "__init__"
     argspec: "args=[\'self\', \'extended\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "batch_reduce"
-    argspec: "args=[\'self\', \'aggregation\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "broadcast"
-    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "call_for_each_replica"
-    argspec: "args=[\'self\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'tensor\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "colocate_vars_with"
@@ -63,29 +27,17 @@ tf_class {
     argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "distribute_dataset"
-    argspec: "args=[\'self\', \'dataset_fn\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "experimental_finalize"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "experimental_initialize"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+    name: "experimental_make_numpy_iterator"
+    argspec: "args=[\'self\', \'numpy_input\', \'batch_size\', \'num_epochs\', \'shuffle\', \'session\'], varargs=None, keywords=None, defaults=[\'1\', \'1024\', \'None\'], "
   }
   member_method {
-    name: "finalize"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+    name: "experimental_run"
+    argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "group"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "initialize"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "make_dataset_iterator"
     argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
@@ -94,22 +46,10 @@ tf_class {
     name: "make_input_fn_iterator"
     argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
   }
-  member_method {
-    name: "non_slot_devices"
-    argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "read_var"
-    argspec: "args=[\'self\', \'v\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "reduce"
     argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "run_steps_on_dataset"
-    argspec: "args=[\'self\', \'fn\', \'iterator\', \'iterations\', \'initial_loop_values\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
-  }
   member_method {
     name: "scope"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -118,20 +58,8 @@ tf_class {
     name: "unwrap"
     argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "update"
-    argspec: "args=[\'self\', \'var\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
-  }
   member_method {
     name: "update_config_proto"
     argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "update_non_slot"
-    argspec: "args=[\'self\', \'colocate_with\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "value_container"
-    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
-  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-cluster-resolver.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-cluster-resolver.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0b35b61b4c08868feaf501e1f09b37d02da09cd0
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-cluster-resolver.pbtxt
@@ -0,0 +1,24 @@
+path: "tensorflow.distribute.cluster_resolver.ClusterResolver"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.cluster_resolver.ClusterResolver\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "environment"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "cluster_spec"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "master"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "num_accelerators"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'config_proto\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-g-c-e-cluster-resolver.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-g-c-e-cluster-resolver.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5c2cc522f1cac65611ffc3f09ce1513d186da27a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-g-c-e-cluster-resolver.pbtxt
@@ -0,0 +1,38 @@
+path: "tensorflow.distribute.cluster_resolver.GCEClusterResolver"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.gce_cluster_resolver.GCEClusterResolver\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.cluster_resolver.ClusterResolver\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "environment"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "rpc_layer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "task_id"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "task_type"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'project\', \'zone\', \'instance_group\', \'port\', \'task_type\', \'task_id\', \'rpc_layer\', \'credentials\', \'service\'], varargs=None, keywords=None, defaults=[\'worker\', \'0\', \'grpc\', \'default\', \'None\'], "
+  }
+  member_method {
+    name: "cluster_spec"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "master"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "num_accelerators"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'config_proto\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-kubernetes-cluster-resolver.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-kubernetes-cluster-resolver.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3220d68e05458da3cda4e36c63bc5dc79cde93af
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-kubernetes-cluster-resolver.pbtxt
@@ -0,0 +1,26 @@
+path: "tensorflow.distribute.cluster_resolver.KubernetesClusterResolver"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.kubernetes_cluster_resolver.KubernetesClusterResolver\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.cluster_resolver.ClusterResolver\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "environment"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'job_to_label_mapping\', \'tf_server_port\', \'rpc_layer\', \'override_client\'], varargs=None, keywords=None, defaults=[\'None\', \'8470\', \'grpc\', \'None\'], "
+  }
+  member_method {
+    name: "cluster_spec"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "master"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "num_accelerators"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'config_proto\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-simple-cluster-resolver.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-simple-cluster-resolver.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4e80e3af308e1a80eb9fa7491eabbe1454b8edf6
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-simple-cluster-resolver.pbtxt
@@ -0,0 +1,38 @@
+path: "tensorflow.distribute.cluster_resolver.SimpleClusterResolver"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.cluster_resolver.SimpleClusterResolver\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.cluster_resolver.ClusterResolver\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "environment"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "rpc_layer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "task_id"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "task_type"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'cluster_spec\', \'master\', \'task_type\', \'task_id\', \'environment\', \'num_accelerators\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'\', \'None\', \'None\', \'\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "cluster_spec"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "master"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "num_accelerators"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'accelerator_type\', \'config_proto\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'GPU\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-slurm-cluster-resolver.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-slurm-cluster-resolver.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..971ea3dca41a82b4a737a8c2468f2e1f6ffb2033
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-slurm-cluster-resolver.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.distribute.cluster_resolver.SlurmClusterResolver"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.slurm_cluster_resolver.SlurmClusterResolver\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.cluster_resolver.ClusterResolver\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "environment"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'jobs\', \'port_base\', \'gpus_per_node\', \'gpus_per_task\', \'tasks_per_node\', \'auto_set_gpu\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'8888\', \'1\', \'1\', \'None\', \'True\', \'grpc\'], "
+  }
+  member_method {
+    name: "cluster_spec"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_task_info"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "master"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "num_accelerators"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'accelerator_type\', \'config_proto\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'GPU\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-t-f-config-cluster-resolver.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-t-f-config-cluster-resolver.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5f9a430c0f84c9caba29dee514f1f3a3391d8588
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-t-f-config-cluster-resolver.pbtxt
@@ -0,0 +1,38 @@
+path: "tensorflow.distribute.cluster_resolver.TFConfigClusterResolver"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.tfconfig_cluster_resolver.TFConfigClusterResolver\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.cluster_resolver.ClusterResolver\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "environment"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "rpc_layer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "task_id"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "task_type"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'rpc_layer\', \'environment\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "cluster_spec"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "master"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "num_accelerators"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'config_proto\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-t-p-u-cluster-resolver.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-t-p-u-cluster-resolver.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ca22c85ac0ac871538b2820df39c6636fa23873c
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-t-p-u-cluster-resolver.pbtxt
@@ -0,0 +1,34 @@
+path: "tensorflow.distribute.cluster_resolver.TPUClusterResolver"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.tpu_cluster_resolver.TPUClusterResolver\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.cluster_resolver.ClusterResolver\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "environment"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'tpu\', \'zone\', \'project\', \'job_name\', \'coordinator_name\', \'coordinator_address\', \'credentials\', \'service\', \'discovery_url\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'worker\', \'None\', \'None\', \'default\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "cluster_spec"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_job_name"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_master"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "master"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "num_accelerators"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'accelerator_type\', \'config_proto\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'TPU\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-union-resolver.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-union-resolver.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..179848aca39cbff879fbab7791a1a7bc1692c488
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-union-resolver.pbtxt
@@ -0,0 +1,38 @@
+path: "tensorflow.distribute.cluster_resolver.UnionResolver"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.cluster_resolver.UnionClusterResolver\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.cluster_resolver.ClusterResolver\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "environment"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "rpc_layer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "task_id"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "task_type"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "cluster_spec"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "master"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "num_accelerators"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'accelerator_type\', \'config_proto\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'GPU\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5906ffa850a360889e26fe0230618ad60cf01231
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.pbtxt
@@ -0,0 +1,35 @@
+path: "tensorflow.distribute.cluster_resolver"
+tf_module {
+  member {
+    name: "ClusterResolver"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "GCEClusterResolver"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "KubernetesClusterResolver"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SimpleClusterResolver"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SlurmClusterResolver"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TFConfigClusterResolver"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TPUClusterResolver"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "UnionResolver"
+    mtype: "<type \'type\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-multi-worker-mirrored-strategy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-multi-worker-mirrored-strategy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..619c3744d6cae858eb501392e962fbb94751355e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-multi-worker-mirrored-strategy.pbtxt
@@ -0,0 +1,66 @@
+path: "tensorflow.distribute.experimental.MultiWorkerMirroredStrategy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.collective_all_reduce_strategy.CollectiveAllReduceStrategy\'>"
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategy\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "extended"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "colocate_vars_with"
+    argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "configure"
+    argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_make_numpy_iterator"
+    argspec: "args=[\'self\', \'numpy_input\', \'batch_size\', \'num_epochs\', \'shuffle\', \'session\'], varargs=None, keywords=None, defaults=[\'1\', \'1024\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_run"
+    argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "group"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "make_dataset_iterator"
+    argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_input_fn_iterator"
+    argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "scope"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "unwrap"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_config_proto"
+    argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-parameter-server-strategy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-parameter-server-strategy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c4cfa2f4ed2e8e346e847318e409dcf16b5f34f8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.-parameter-server-strategy.pbtxt
@@ -0,0 +1,66 @@
+path: "tensorflow.distribute.experimental.ParameterServerStrategy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.parameter_server_strategy.ParameterServerStrategy\'>"
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategy\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "extended"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "colocate_vars_with"
+    argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "configure"
+    argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_make_numpy_iterator"
+    argspec: "args=[\'self\', \'numpy_input\', \'batch_size\', \'num_epochs\', \'shuffle\', \'session\'], varargs=None, keywords=None, defaults=[\'1\', \'1024\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_run"
+    argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "group"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "make_dataset_iterator"
+    argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_input_fn_iterator"
+    argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "scope"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "unwrap"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_config_proto"
+    argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8859fbd38c43577c5f6040d717d11fb23941e4e2
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.experimental.pbtxt
@@ -0,0 +1,11 @@
+path: "tensorflow.distribute.experimental"
+tf_module {
+  member {
+    name: "MultiWorkerMirroredStrategy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "ParameterServerStrategy"
+    mtype: "<type \'type\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.pbtxt
index b0dd73ca1d4179b4a3323fa0a9be2fde4e22799c..430e81166067cb9599b03fba43e066e099d928fc 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.distribute.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.pbtxt
@@ -1,5 +1,13 @@
 path: "tensorflow.distribute"
 tf_module {
+  member {
+    name: "CrossDeviceOps"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "HierarchicalCopyAllReduce"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "InputContext"
     mtype: "<type \'type\'>"
@@ -12,14 +20,30 @@ tf_module {
     name: "MirroredStrategy"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "NcclAllReduce"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "OneDeviceStrategy"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "ReduceOp"
     mtype: "<class \'enum.EnumMeta\'>"
   }
+  member {
+    name: "ReductionToOneDevice"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "ReplicaContext"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "Server"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Strategy"
     mtype: "<type \'type\'>"
@@ -28,6 +52,14 @@ tf_module {
     name: "StrategyExtended"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "cluster_resolver"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
+  }
   member_method {
     name: "get_loss_reduction"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-mode-keys.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-mode-keys.pbtxt
index bf7c1abcd89b29c29f3487cab58cfdf28103119c..d53752780090c2d621dcabfc8c31e4f1192bd7c7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-mode-keys.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-mode-keys.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.estimator.ModeKeys"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.model_fn.ModeKeys\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.mode_keys.EstimatorModeKeys\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "EVAL"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-session-run-args.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-session-run-args.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b375c7429469d2a8b89d1bcd048599d6478624ae
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-session-run-args.pbtxt
@@ -0,0 +1,27 @@
+path: "tensorflow.estimator.SessionRunArgs"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunArgs\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunArgs\'>"
+  is_instance: "<type \'tuple\'>"
+  member {
+    name: "feed_dict"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "fetches"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "options"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "count"
+  }
+  member_method {
+    name: "index"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-session-run-context.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-session-run-context.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cb4ac9f50ec9aa9d6531a16ebb48a9223cbc5188
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-session-run-context.pbtxt
@@ -0,0 +1,25 @@
+path: "tensorflow.estimator.SessionRunContext"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunContext\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "original_args"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "session"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "stop_requested"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'original_args\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "request_stop"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-session-run-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-session-run-hook.pbtxt
similarity index 95%
rename from tensorflow/tools/api/golden/v2/tensorflow.train.-session-run-hook.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-session-run-hook.pbtxt
index db1aa24acf0e295b4b787eef68250401dd6a6e27..54e9ad9ed44b64e2c1c49b5ade4c7d3bb35563de 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-session-run-hook.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-session-run-hook.pbtxt
@@ -1,4 +1,4 @@
-path: "tensorflow.train.SessionRunHook"
+path: "tensorflow.estimator.SessionRunHook"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
   is_instance: "<type \'object\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-session-run-values.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-session-run-values.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..678814169635bfa9997db26df23acc79c2d84881
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-session-run-values.pbtxt
@@ -0,0 +1,27 @@
+path: "tensorflow.estimator.SessionRunValues"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunValues\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunValues\'>"
+  is_instance: "<type \'tuple\'>"
+  member {
+    name: "options"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "results"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "run_metadata"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "count"
+  }
+  member_method {
+    name: "index"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.pbtxt
index f0fd7ce782db71ff5e790fe50e93556bf5d19e1e..b1bd5a2661d44d9b36b965ba160874e6142628ea 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.pbtxt
@@ -36,4 +36,16 @@ tf_module {
     name: "stop_if_higher_hook"
     argspec: "args=[\'estimator\', \'metric_name\', \'threshold\', \'eval_dir\', \'min_steps\', \'run_every_secs\', \'run_every_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'60\', \'None\'], "
   }
+  member_method {
+    name: "stop_if_lower_hook"
+    argspec: "args=[\'estimator\', \'metric_name\', \'threshold\', \'eval_dir\', \'min_steps\', \'run_every_secs\', \'run_every_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'60\', \'None\'], "
+  }
+  member_method {
+    name: "stop_if_no_decrease_hook"
+    argspec: "args=[\'estimator\', \'metric_name\', \'max_steps_without_decrease\', \'eval_dir\', \'min_steps\', \'run_every_secs\', \'run_every_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'60\', \'None\'], "
+  }
+  member_method {
+    name: "stop_if_no_increase_hook"
+    argspec: "args=[\'estimator\', \'metric_name\', \'max_steps_without_increase\', \'eval_dir\', \'min_steps\', \'run_every_secs\', \'run_every_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'60\', \'None\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-classification-output.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-classification-output.pbtxt
index 52874dd9b9316d9815c5aef51e272e6ffddb5224..ce486807a47031999ec4c9082bbf4b2e4af910ed 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-classification-output.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-classification-output.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.export.ClassificationOutput"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.ClassificationOutput\'>"
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.ExportOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.ClassificationOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.ExportOutput\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "classes"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-export-output.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-export-output.pbtxt
index 964c315e9730effac38d60f7242527e71cbf9846..9775b2cca812d3fd47a3e821f032ed67c62d0078 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-export-output.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-export-output.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.estimator.export.ExportOutput"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.ExportOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.ExportOutput\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-predict-output.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-predict-output.pbtxt
index bb82bc9e58627318b897f0610c7d852db7f98c07..a4389fb998e86cc291bb02d2ae04d220f1e152cc 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-predict-output.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-predict-output.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.export.PredictOutput"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.PredictOutput\'>"
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.ExportOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.PredictOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.ExportOutput\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "outputs"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-regression-output.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-regression-output.pbtxt
index 8522834433f214e5d646ef6265b1047fb7f2cc4f..fc7f8447893c99f4f68bf12f0790e8a549232dc3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-regression-output.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.export.-regression-output.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.export.RegressionOutput"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.RegressionOutput\'>"
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.ExportOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.RegressionOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.ExportOutput\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "value"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.pbtxt
index d3656ae0455971ccd98062a52ec0412bf6af06f7..6f57505afe84f3982a8beb402783f35b3e699241 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.pbtxt
@@ -132,6 +132,22 @@ tf_module {
     name: "SecondOrStepTimer"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SessionRunArgs"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SessionRunContext"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SessionRunHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SessionRunValues"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "StepCounterHook"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.feature_column.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.feature_column.pbtxt
index f06e7989537eef2b0e6fa4b720e90614366b41ee..79ed45cfba325e0749c823765a181f413c0e2617 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.feature_column.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.feature_column.pbtxt
@@ -48,6 +48,26 @@ tf_module {
     name: "numeric_column"
     argspec: "args=[\'key\', \'shape\', \'default_value\', \'dtype\', \'normalizer_fn\'], varargs=None, keywords=None, defaults=[\'(1,)\', \'None\', \"<dtype: \'float32\'>\", \'None\'], "
   }
+  member_method {
+    name: "sequence_categorical_column_with_hash_bucket"
+    argspec: "args=[\'key\', \'hash_bucket_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\"<dtype: \'string\'>\"], "
+  }
+  member_method {
+    name: "sequence_categorical_column_with_identity"
+    argspec: "args=[\'key\', \'num_buckets\', \'default_value\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "sequence_categorical_column_with_vocabulary_file"
+    argspec: "args=[\'key\', \'vocabulary_file\', \'vocabulary_size\', \'num_oov_buckets\', \'default_value\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \"<dtype: \'string\'>\"], "
+  }
+  member_method {
+    name: "sequence_categorical_column_with_vocabulary_list"
+    argspec: "args=[\'key\', \'vocabulary_list\', \'dtype\', \'default_value\', \'num_oov_buckets\'], varargs=None, keywords=None, defaults=[\'None\', \'-1\', \'0\'], "
+  }
+  member_method {
+    name: "sequence_numeric_column"
+    argspec: "args=[\'key\', \'shape\', \'default_value\', \'dtype\', \'normalizer_fn\'], varargs=None, keywords=None, defaults=[\'(1,)\', \'0.0\', \"<dtype: \'float32\'>\", \'None\'], "
+  }
   member_method {
     name: "shared_embedding_columns"
     argspec: "args=[\'categorical_columns\', \'dimension\', \'combiner\', \'initializer\', \'shared_embedding_collection_name\', \'ckpt_to_load_from\', \'tensor_name_in_ckpt\', \'max_norm\', \'trainable\'], varargs=None, keywords=None, defaults=[\'mean\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.image.pbtxt
index 15d0e099bab3052553671d52d396239b27383a8d..102ca132a214b1d45420f7727182b45cb3c37894 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.image.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.image.pbtxt
@@ -32,13 +32,17 @@ tf_module {
     name: "central_crop"
     argspec: "args=[\'image\', \'central_fraction\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "combined_non_max_suppression"
+    argspec: "args=[\'boxes\', \'scores\', \'max_output_size_per_class\', \'max_total_size\', \'iou_threshold\', \'score_threshold\', \'pad_per_class\', \'name\'], varargs=None, keywords=None, defaults=[\'0.5\', \'-inf\', \'False\', \'None\'], "
+  }
   member_method {
     name: "convert_image_dtype"
     argspec: "args=[\'image\', \'dtype\', \'saturate\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
   }
   member_method {
     name: "crop_and_resize"
-    argspec: "args=[\'image\', \'boxes\', \'box_ind\', \'crop_size\', \'method\', \'extrapolation_value\', \'name\'], varargs=None, keywords=None, defaults=[\'bilinear\', \'0\', \'None\'], "
+    argspec: "args=[\'image\', \'boxes\', \'box_ind\', \'crop_size\', \'method\', \'extrapolation_value\', \'name\', \'box_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'bilinear\', \'0\', \'None\', \'None\'], "
   }
   member_method {
     name: "crop_to_bounding_box"
@@ -86,7 +90,7 @@ tf_module {
   }
   member_method {
     name: "extract_image_patches"
-    argspec: "args=[\'images\', \'ksizes\', \'strides\', \'rates\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'images\', \'ksizes\', \'strides\', \'rates\', \'padding\', \'name\', \'sizes\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "extract_jpeg_shape"
@@ -194,7 +198,7 @@ tf_module {
   }
   member_method {
     name: "resize_image_with_pad"
-    argspec: "args=[\'image\', \'target_height\', \'target_width\', \'method\'], varargs=None, keywords=None, defaults=[\'0\'], "
+    argspec: "args=[\'image\', \'target_height\', \'target_width\', \'method\', \'align_corners\'], varargs=None, keywords=None, defaults=[\'0\', \'False\'], "
   }
   member_method {
     name: "resize_images"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.io.gfile.-g-file.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.io.gfile.-g-file.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c6bf57a88fc1295da13e0b58671191c9d8ba8caa
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.io.gfile.-g-file.pbtxt
@@ -0,0 +1,58 @@
+path: "tensorflow.io.gfile.GFile"
+tf_class {
+  is_instance: "<class \'tensorflow.python.platform.gfile.GFile\'>"
+  is_instance: "<class \'tensorflow.python.lib.io.file_io.FileIO\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "mode"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'mode\'], varargs=None, keywords=None, defaults=[\'r\'], "
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "flush"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "next"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "read"
+    argspec: "args=[\'self\', \'n\'], varargs=None, keywords=None, defaults=[\'-1\'], "
+  }
+  member_method {
+    name: "readline"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "readlines"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "seek"
+    argspec: "args=[\'self\', \'offset\', \'whence\', \'position\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "size"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "tell"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "write"
+    argspec: "args=[\'self\', \'file_content\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.io.gfile.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.io.gfile.pbtxt
index cfa3372b12bfe32eed4311c89b6448c0359c0913..a797c06ff337cffe503d89c09497996ea64c6ad2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.io.gfile.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.io.gfile.pbtxt
@@ -1,5 +1,9 @@
 path: "tensorflow.io.gfile"
 tf_module {
+  member {
+    name: "GFile"
+    mtype: "<type \'type\'>"
+  }
   member_method {
     name: "copy"
     argspec: "args=[\'src\', \'dst\', \'overwrite\'], varargs=None, keywords=None, defaults=[\'False\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt
index a3254cbd947d9ef70617131e9f4b17f44f059840..5e1371815469974b91b1a4de16fa20d19404eee8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -131,7 +135,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -163,19 +167,19 @@ tf_class {
   }
   member_method {
     name: "evaluate"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "fit"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_freq\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "fit_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'validation_freq\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'1\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
   }
   member_method {
     name: "from_config"
@@ -231,11 +235,11 @@ tf_class {
   }
   member_method {
     name: "predict"
-    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "predict_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "predict_on_batch"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt
index b70e9ee98d5bc4900420ddb1307abf9adcd8cad0..4bbe98b789935318a901d84502cb763a60ddc92b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -136,7 +140,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -168,19 +172,19 @@ tf_class {
   }
   member_method {
     name: "evaluate"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "fit"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_freq\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "fit_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'validation_freq\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'1\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
   }
   member_method {
     name: "from_config"
@@ -240,7 +244,7 @@ tf_class {
   }
   member_method {
     name: "predict"
-    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "predict_classes"
@@ -248,7 +252,7 @@ tf_class {
   }
   member_method {
     name: "predict_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "predict_on_batch"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.backend.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.backend.pbtxt
index 8cd0c6ea5f027fa1f30b60a742450b651242d406..c64a90890f13e4b14678005cb71460e427272416 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.backend.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.backend.pbtxt
@@ -124,6 +124,14 @@ tf_module {
     name: "ctc_label_dense_to_sparse"
     argspec: "args=[\'labels\', \'label_lengths\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "cumprod"
+    argspec: "args=[\'x\', \'axis\'], varargs=None, keywords=None, defaults=[\'0\'], "
+  }
+  member_method {
+    name: "cumsum"
+    argspec: "args=[\'x\', \'axis\'], varargs=None, keywords=None, defaults=[\'0\'], "
+  }
   member_method {
     name: "dot"
     argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
@@ -190,7 +198,7 @@ tf_module {
   }
   member_method {
     name: "get_session"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'op_input_list\'], varargs=None, keywords=None, defaults=[\'()\'], "
   }
   member_method {
     name: "get_uid"
@@ -248,6 +256,10 @@ tf_module {
     name: "learning_phase"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "learning_phase_scope"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "less"
     argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-base-logger.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-base-logger.pbtxt
index 7d298e95135ebf41230d72ff488fef30be682edb..9dbdaf0f5f3db292feb98fe06092b6f7a6b8f034 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-base-logger.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-base-logger.pbtxt
@@ -23,6 +23,38 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt
index 133205ab88b47afad32fc70ceca93513768a3b19..0725f606e2923ff1bd5a8814febdfe7de8a2602c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt
@@ -23,6 +23,38 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-callback.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-callback.pbtxt
index d766c09ac5efaa9d0e4ffba4e495385130c7e770..14bfc3bedbfb5a379e28a0cb9cd2f7f744539fa1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-callback.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-callback.pbtxt
@@ -22,6 +22,38 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-early-stopping.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-early-stopping.pbtxt
index 605f74e5602a63f5a18c31cb26113d300ec76e7a..9812bad8f66f3d5afe365287feca748f9e6efd5d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-early-stopping.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-early-stopping.pbtxt
@@ -27,6 +27,38 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-history.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-history.pbtxt
index cd893e67269164781d6a6b6294a199014d40fed8..5aa739391ef894cdede1db17f903a50111f25eca 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-history.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-history.pbtxt
@@ -23,6 +23,38 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-lambda-callback.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-lambda-callback.pbtxt
index 50f2054cabb1b8f6c46a9537ea923a18f87e5c80..bf5bcb68df47ed8661509598d3bc59f01dfcefe6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-lambda-callback.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-lambda-callback.pbtxt
@@ -23,6 +23,38 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
index 9ed9db0a89b49b88098e15baca414ff78b6f10e6..a04ffb92eb9e32b2473355f140d68537b80074df 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
@@ -23,6 +23,38 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-model-checkpoint.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-model-checkpoint.pbtxt
index 3d8d1363bb4e4de818788efbf3c997594350006a..5ae176017b3cf1ac019ecdc0f1c255f23b32fcec 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-model-checkpoint.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-model-checkpoint.pbtxt
@@ -23,6 +23,38 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-progbar-logger.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-progbar-logger.pbtxt
index 5012f1517d57dd646d82ab669cb279b6363dd6ec..624f856d2752e1f375154664a892d6c1d600ecbd 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-progbar-logger.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-progbar-logger.pbtxt
@@ -23,6 +23,38 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt
index 73652c2b61259f768eca76b995ae4592df868392..0fed6fd23670a16acd8d770269090c3dda0eee30 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt
@@ -27,6 +27,38 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-remote-monitor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-remote-monitor.pbtxt
index 24db71de1182d58b78fec0419aa9cb48a2e315d2..71cf7f4a4922752c0ba154a8d3fe29b37c305675 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-remote-monitor.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-remote-monitor.pbtxt
@@ -23,6 +23,38 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-tensor-board.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-tensor-board.pbtxt
index c5503c69a5f3cb6765c984778c0e3626369ee815..66bacd809408029e40ba8009dd9ea6889bb24e0e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-tensor-board.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-tensor-board.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.keras.callbacks.TensorBoard"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.callbacks.TensorBoard\'>"
+  is_instance: "<class \'tensorflow.python.keras.callbacks_v1.TensorBoard\'>"
   is_instance: "<class \'tensorflow.python.keras.callbacks.Callback\'>"
   is_instance: "<type \'object\'>"
   member_method {
@@ -23,6 +23,38 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt
index de6e8ef072558e6d926ea125aa5056e3c229d37f..d5a59d870a390a6f5632332c12534f83c686e2dd 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt
@@ -23,6 +23,38 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-cosine-decay-restarts.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-cosine-decay-restarts.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..58bede556dfd4d8988d92e99e402d9b3b3bf5adb
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-cosine-decay-restarts.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.experimental.CosineDecayRestarts"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.CosineDecayRestarts\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initial_learning_rate\', \'first_decay_steps\', \'t_mul\', \'m_mul\', \'alpha\', \'name\'], varargs=None, keywords=None, defaults=[\'2.0\', \'1.0\', \'0.0\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-cosine-decay.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-cosine-decay.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e2549a2ac627421ecc80df2d6235c1a22ab5e3ff
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-cosine-decay.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.experimental.CosineDecay"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.CosineDecay\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initial_learning_rate\', \'decay_steps\', \'alpha\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-linear-cosine-decay.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-linear-cosine-decay.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f083120b52ce483f46cc92390b53180bc3bd65ed
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-linear-cosine-decay.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.experimental.LinearCosineDecay"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LinearCosineDecay\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initial_learning_rate\', \'decay_steps\', \'num_periods\', \'alpha\', \'beta\', \'name\'], varargs=None, keywords=None, defaults=[\'0.5\', \'0.0\', \'0.001\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-noisy-linear-cosine-decay.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-noisy-linear-cosine-decay.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8ea3c6beb1c0f8fffaa442956c0cc134f70a5e84
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-noisy-linear-cosine-decay.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.experimental.NoisyLinearCosineDecay"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.NoisyLinearCosineDecay\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initial_learning_rate\', \'decay_steps\', \'initial_variance\', \'variance_decay\', \'num_periods\', \'alpha\', \'beta\', \'name\'], varargs=None, keywords=None, defaults=[\'1.0\', \'0.55\', \'0.5\', \'0.0\', \'0.001\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
index 1d814b2c8b553f1b2a07f9d9b97dc70ec0674969..2f3cb0b7c51e119da6a122dd6672109789c1e73c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.PeepholeLSTMCell\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.LSTMCell\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-sequence-features.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-sequence-features.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5a75f44fcc3a1ecc65b27cc52d61256b6e69e0af
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-sequence-features.pbtxt
@@ -0,0 +1,184 @@
+path: "tensorflow.keras.experimental.SequenceFeatures"
+tf_class {
+  is_instance: "<class \'tensorflow.python.feature_column.sequence_feature_column.SequenceFeatures\'>"
+  is_instance: "<class \'tensorflow.python.feature_column.feature_column_v2._BaseFeaturesLayer\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'feature_columns\', \'trainable\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'features\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.pbtxt
index 164edbd66ab2487a980155eabcf18ed8446e2c14..65b82a3f3222c51c4a419918ad1e74dd52152aed 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.pbtxt
@@ -1,7 +1,35 @@
 path: "tensorflow.keras.experimental"
 tf_module {
+  member {
+    name: "CosineDecay"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CosineDecayRestarts"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "LinearCosineDecay"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "NoisyLinearCosineDecay"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "PeepholeLSTMCell"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SequenceFeatures"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "export_saved_model"
+    argspec: "args=[\'model\', \'saved_model_path\', \'custom_objects\', \'as_text\', \'input_signature\', \'serving_only\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "load_from_saved_model"
+    argspec: "args=[\'saved_model_path\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt
index b84629540e700f242f885064c92309c294693a11..0c6c0a34b9f606398831c2a82e9b049fed96957a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Activation"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Activation\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt
index 5918a13ad8629582829049485e896688ecad9579..15bf03977dbc03660971fc7343cb0388d8696326 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ActivityRegularization"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.ActivityRegularization\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt
index 599da06427dfe4f28e757a7aac8d8a14856a4556..b265384d5980f4c4982ced19af0208427da56817 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Add\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt
index f9ff1538c8134d96051ad81d35c73e59c6a8cc57..3a0882daf1eefaf17f893c3b565784bdd60ac689 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.AlphaDropout"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.noise.AlphaDropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt
index 723fc9cdb0d0ad93470e22fd8c147d3ecc92af91..d2ee310d68e5da9f07f9cb6656165c97fccaa469 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt
index 957ce2f0ce86f8df3eb8b57606229fb661eb52f7..1da079f39e25b020406dabca46966d57dfb4451d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt
index a52c0af68175420dc2a1993d1f025d36705538e1..d96751ccf916aa9a9522f341a3befbc987f25125 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt
index a004db62ddcaaae02a411d8db51f4026ece1384d..3819e525a99b67337f1d3a36b30f6ee0dfefa03b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Average\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt
index 44f83d1387cb2ec681f50f7b1f0297f3f74594ed..47f6b397a70109e9cdb833eae0cbec8a753831f7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt
index 8378faf7188ec594865d4b68c8ea8cae284183ca..4b8cadca5344acd7fe279d0132184d666a94f448 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt
index 9d5655c9644e3a2394a346bed78fc478cf60ba8d..5c66da42e64b3cf57c2e03002bb95bc33eeb3ac2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt
index b3d3c84f92e6491601f670739b2b45f79313e8f5..203fea1d9ff251a45826b13cbbb557c67a549622 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.normalization.BatchNormalizationV1\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.normalization.BatchNormalizationV2\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt
index d37a6b47105225d7b83b6a264b944ceeb583a6c4..95eb6f69ecc4fa313da9eca9b6bee185b647be03 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.wrappers.Bidirectional\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.wrappers.Wrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -17,6 +17,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -111,7 +115,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt
index 1ad7a91be0ba48d0dbab19da8c7cd9ca89095918..09410135435100779208ff1c3db9198fdc88b178 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Concatenate\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
index cb9abc25396bb63a3c40de5cc52f9df7ed20071e..941b6aa3dd3316d15edede430ef25da913bf0ce6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional_recurrent.ConvRNN2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activation"
@@ -42,6 +42,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "filters"
     mtype: "<type \'property\'>"
@@ -192,7 +196,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt
index 47dba1d81f8f97a60fe72ec521f82a78ee5f3505..4bf8336fb32013258ec17edb810a68e7c0d3cfb3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
index fd649418961301f150aac3dabc1bdf0ade4a9c28..221addf20aec476c7533284f9d7437cb0e1a6bad 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -104,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt
index 1b1425d53197db8b59abf51fe93c0b0c45299956..1c95fcc8b543c3da318119f7c9ed64952ba5b819 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
index 1741063fe8b09acf3865e0a135e96bb715dcdcfa..994a507bfb20a675ef2b4f8f5a9b2284ee0c21ad 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -104,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt
index 50feb4f458ad1a9cb2b2bfe5d67997b7551eed74..ae251b529bd6243733263acad98367a1a64e6530 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt
index faaa535df9fe03ad07862f0793f8ebea67b405ca..1d73eecc37c30b0ce6ef723f56691cc502a5698e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
index 4079329d1ee2a61270fee38426bb8a0859c38ce3..d37ec0f5603237cb5c397dd8193f51d5e50db0ed 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -104,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt
index 32e56696e1617f7810792e3416a2ebb2037d23c2..bb3c37d573fc2477a7327ea4dfec45a7a66e7c22 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
index 381abe73401fa3a588873d643324fc020c159e30..fc29f1c913adbfde33827d8095181763a43126d8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -104,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt
index b3e4bf9689dc7e9db63de7f43e9dfa9ac4d42b02..2658fb4e9984969c60257c17361799e2f56b1bb7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt
index 7aeff8003c322e8a8168dd70481a8b30b08762a8..58567eafa4134b747b4ca7ed9b71aa77d3a4c2e3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Cropping1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Cropping1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt
index a1728d9d4f9a1e677646db04c4d0df9572e21208..42be76f4189d6fe299acb6fda744e268881de6f0 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Cropping2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Cropping2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt
index 8d8fd142cc64ee113c4b6a7e4e2462ecc69b6028..11092f225d1c350ef81653555fb3cec8f6bda85a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Cropping3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Cropping3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
index 7758209adf8fe7a1306fa5ef125935dafd925c3e..e618a111b6f60cd73fa64ab47d4d4422580a73ab 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.cudnn_recurrent._CuDNNRNN\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -18,6 +18,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -112,7 +116,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
index 7c463ff1257599366be049edce6cc06140906286..cf8a67b54983f17773e59de2e0661fd9896d2420 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.cudnn_recurrent._CuDNNRNN\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -18,6 +18,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -112,7 +116,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense-features.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense-features.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d6f7f3033ecf8e226b961dabfe59e751639e5b98
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense-features.pbtxt
@@ -0,0 +1,184 @@
+path: "tensorflow.keras.layers.DenseFeatures"
+tf_class {
+  is_instance: "<class \'tensorflow.python.feature_column.feature_column_v2.DenseFeatures\'>"
+  is_instance: "<class \'tensorflow.python.feature_column.feature_column_v2._BaseFeaturesLayer\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'feature_columns\', \'trainable\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'features\', \'cols_to_output_tensors\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt
index 4960d0264e96e872ea5c49a8841cef20bd5eb37c..339c9f52b7ff45ebebce128980b3ed9fc47bfc5b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Dense"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dense\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
index 8fad7535f882718462a11e27e75732e3097cb87d..c2992de5a8f9757266fe681943e7a6b84039a893 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -104,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt
index 5b425f2d4d7a8a897280490e26922766d8bf7065..b37f4c845a45418ba8fc47c89718debb97de8f88 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Dot\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt
index f6c4d0a438ed027635b40ec992eb1bbcb5c9a3a1..d21f577721c3052507969c208870b9d69dd3313d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Dropout"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt
index 82b761fc1761bb3e7638f7a80bc80c6433162d04..f1e086b5dc50a3e47bbf2e9d3d130cade60b8902 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ELU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.ELU\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt
index c9ff323877e06b6dff274644744d425e3a9b7932..eb3496aa48fa43066f4f394ba2039416b4157f33 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Embedding"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.embeddings.Embedding\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt
index 9b4165d4cbf88fefd2bb684dae70ea8afc01357b..bbe324c77255fe5ccba650566d4a455196ce49ac 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Flatten"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Flatten\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt
index f225f7c4309615919fb05df05f2ae664bde80097..dd93e32ddcecb70203798c65483638a328019d3a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.GRUCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.GRUCell\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt
index 855d001700179fb634d1dff78585d340420abe7f..9f25b3c6574e08b96e4947210c3c7d174e7293f3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.GRU\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activation"
@@ -33,6 +33,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "implementation"
     mtype: "<type \'property\'>"
@@ -175,7 +179,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt
index 2c404c99cd2175cdc8b60b229e4410bf280ebcb7..e24862632e73c9030dbce521df445da2521f322d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.GaussianDropout"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.noise.GaussianDropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt
index 6f109d59d0f6fcd2b4650719e3b4f653baec7d23..b1b5759881792810b4b11e279bd699652c256334 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.GaussianNoise"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.noise.GaussianNoise\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
index 69f8a9031d32eb73bb44291cdf330d738d745cf9..9c4087a06f1776c3372bcc8524f1ea48af1b7447 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
index 4299f765e525b136e289bba169becec06e19ffb1..d56429442c626340b0f38becf452668ff1c4535f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
index 9153a1a2406b6fc4ab60c80fee2f8d6d69b00b72..089145ead9f0aa35f4a6e2d41b5dfb8425c68557 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
index 625e81fd2322ceba153fa65c138948ce43843089..2bc02b6f69f2468449c8d551d42e3b12e219964a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
index 2fc769742c70c5665c9cb77ad246fcdb49366d5a..c2510d129460b8819da832b3354321db49868a29 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
index e307a65c7c565660e1f2b6b6b74dc5970425eaa4..845d6b17015a4168426b430add12d303fe14cf5a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
index 4394ad0364e89fd3531d6625e52540991cadf973..f6fa8659dc4cf316968dd9f48572532fc0b4e5aa 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
index 050ed39fe98dc7cfdf6febe45e235d3ae7cbf486..1285e21f714e3cac3288e4d5c8b883c5ec909d7c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
index 436191821ef4689351b6124cf2a20afad917e4ab..a1417e46ceb713440bfcb16d7f29d415883a3633 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
index 4ba540aa6adc72b572aa9340f89967d69ab78a3c..ff4da8ba542732f48388428f3dcf9452d41a3320 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
index a2e9322cb3fd4e56af708d5c4e17b660f7bc2247..7140d5718ba61f508a1d00729a4777745994bee4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
index 5d16a57fc1aeff9939220de8043fcae39e3d953e..4edeb9788d88000634ee1aadab9fea69fb2c83c6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt
index 9dd29c1251ef2eacaf535a3f10f3d42dc36624a2..48609567d5bed7246d0162ddfb8da92090c5640f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.InputLayer"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.input_layer.InputLayer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
index 0045d5775e2c19df21428bd4420b6e5612c8002b..7398613812d0b5dc7d3f9bb62fdda0bd08d11b60 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.LSTMCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.LSTMCell\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt
index 529c750f98715ec30313ed34c9023a845061a3df..a8f60e83b9bc92c2e0759d120c84b5a61b7431aa 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.LSTM\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activation"
@@ -33,6 +33,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "implementation"
     mtype: "<type \'property\'>"
@@ -175,7 +179,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt
index d4d1bc6b6bbf0ce39742b740aff6dc0c1cd464a1..88f1f8b06d1e887385ce44903ada3dfb3c934e21 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Lambda"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Lambda\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt
index e1f5491180903f7d6931cc09755cabb715bbf233..c95f9159cdf2549cce9d645d9cb9b4802ea87018 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.layers.Layer"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -11,6 +11,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -81,7 +85,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'trainable\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'trainable\', \'name\', \'dtype\', \'dynamic\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\', \'None\', \'False\'], "
   }
   member_method {
     name: "add_loss"
@@ -101,7 +105,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
index 9b69d9a9447f42907236b5cc8c7672012f96c38a..80d1c32dede3d72a4bb5f150d9c9d51143b7f5ff 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.LeakyReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.LeakyReLU\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt
index fd52259432577ac94dc702d4411ad5c0eed1ff10..b050302861485258c0043d97bb325860f770fb06 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.LocallyConnected1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.local.LocallyConnected1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt
index 5fc8af0d03564c649dff6e9df70d10731319de40..3bb780cdb0ae3b2f2c8b95f1e41c524f12d49162 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.LocallyConnected2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.local.LocallyConnected2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt
index 7f8932270e63bc02852c5b64e53694e7e26be08b..690208be83a58bf107f02fa342812c9d29319183 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Masking"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Masking\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt
index 4723b99cb0792e1ce0bdc45e46908da8c2b5359c..02f3186dc60056844a43821411e4438eb123fb88 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt
index 173c5d4a8b149c4e23683cf375e8d793db7faa5a..f2e9a3b13665024e8102f1db84293b73fa094d42 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt
index 14e1899e145224e411d65cbf481060a3b2cec0f1..868faa03919a0d656394a691200e622d987f4be1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt
index a708e652bf0e82dea0f58034a81a040a39550dc9..8e1662630c18d8662145226c6a54a05f49cab24b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt
index e6706b5cf9f32bda78adc4e2db5916a5750cc82e..ab96640936137bbcd3e73b1dbb56060dfe4943d8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt
index a73c082d1bba0453b742f76bacf0ad6116ba79a7..4f492f50fbd7b571b5fc853e6523c9cb56d73e82 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt
index f3f195554bbf4a43efaf2af0fd278a23bf270994..702f2e88d0b32776ab2e45bf968843e8df1e5578 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Maximum\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt
index f345d1d67b2ce0200c64b1aeea5f39821d070bac..fef939090d8d1e4ffc17b290754b16b87a07f38a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Minimum\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt
index 31cb8bc177c7a9e365101e75108a29900fbda124..1e2db3fb135ded60a33f8f4f3ffa92322d38dbd6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Multiply\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt
index 44cccc92bd2f1ff0335c22f2967865dc88a96ff7..1450047d7dd716b97a824c6718bef26373214114 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.PReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.PReLU\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt
index b55e191ff1ad6997550966bbb6154a81a489575d..d5d3e0333bfc5ef8bf56a153d04ccc9c9df85dc7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Permute"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Permute\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt
index e9575436e5b14ac8c52a0b59c86937886eab5f40..f1151f43c9df50850e61b5ab6e9ea7b7bc0184d9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.RNN"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -106,7 +110,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt
index 98223b207f2ecfd5b7af8a53390166e53a7d4f73..0874240fa6f60bd8c345fc1c2219f9f7bf843831 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.ReLU\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt
index 2df918b16b2552323d75083bfa80e328c0639cfe..de9f8fb5939fd11ccdde97e383e1f4a7f1d99e73 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.RepeatVector"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.RepeatVector\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt
index ce5f9e21290eeddc0052257191ac4a6d068c1366..a125754f43a31806b94b4775b4339e9c610a8ee0 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Reshape"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Reshape\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt
index a0bb917775fd9edb5d909bf850310e0596a88209..01fac3a2595838bdd17b8ae5b0dd93a7ed70d495 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -104,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt
index d7942f201bdbfa8d1577813be461a5905b5c6c90..80628d76d8224af71ebc12a1d735481ea6c0c654 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -104,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
index f7ac9042d46f46ab35d18c62e5d8841679a18ca9..41e96fde4b2a06c67fd2007f0b597104b4cf5b93 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -104,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
index e5a92688220f6e227b317d71a70fde01df4c432b..f48b0b3517f04dcba9ace267b4ffcfc5abb70c23 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -104,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
index 0fe2c974a762784a82a6b97e116357be2a61d84f..5e799329c032a1e72a61378623e2844fa7e89401 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.SimpleRNNCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.SimpleRNNCell\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt
index 2ee5873f0f11688019dec3a6cd69db06d99b9caa..60893bbf1aa5a508cb4fedabc8cd90927b488f71 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.SimpleRNN\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activation"
@@ -33,6 +33,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -163,7 +167,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt
index 5b8f64aa35725d0ea44fc5c5b81952fd839503e7..c96405a434c9ba3cb770b3dcaf6cf551ac4f200c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Softmax"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.Softmax\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
index 240cb6e562f77467d94ef95db2374150e318bc04..153b7bc876738ef180901080bef3f8fdf2dd80f9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.SpatialDropout1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
index 6226c469f8a534f96f6ea991fa5e7d2cf0019e3f..44e08117c3130df83572dd8409e5af273ac5b290 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.SpatialDropout2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
index 34dabce6d8dd0b1b6fe50a008a981e1f06a77edf..6e9f624999444a54b5d7dd2e372f9d5d470a6011 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.SpatialDropout3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
index 0ddf628ace582db259ebe0b211aba6e6362b5d5b..57da4c0ba9d62aad702a6204b3ce41b48bf6d161 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.StackedRNNCells"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.StackedRNNCells\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -110,7 +114,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt
index 12eb35ad154a514afd9c900cb2dbece8af28c49f..27eb794485a8c1c46cf918782b97724e9774eef4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Subtract\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
index c41020c2b45cc88c9b63f3b7a45c35066794dfe2..733070e50bc283729ecbd91b1b79af9521eba678 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ThresholdedReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.ThresholdedReLU\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt
index 479f89cf6ae93e8d6ae02e304a51a145164df7de..009ecca9a7f490f4792c3c0649e3a9003d0e247d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.wrappers.TimeDistributed\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.wrappers.Wrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -107,7 +111,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt
index 233363ce02614f184b43a059889c7475b6a8c50b..f465aa64b91f644d5d2f292f0a67a3300ed3f488 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.UpSampling1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.UpSampling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt
index cb6228ac446bd236df88f94eb6e9e717ea38463d..049da3deddc0fa56d4ae2bfd6552452473d0b7d4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.UpSampling2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.UpSampling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt
index 03bad3ccb613a225ad56e128ea680fc9312151e1..1d50c891154dd86e4ebe5b481a137e4c7ce248a0 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.UpSampling3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.UpSampling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt
index 158996792a47fab0e7aa26d21d4bb7f281ca76d2..6604ac05d910abd89e3c04f6a8194adac582963e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Wrapper"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.wrappers.Wrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -106,7 +110,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt
index 63a56cd3eebe271f66258c9a0acb974764555b34..2c8d52765d55357fb2f02f5268b089b27fa20429 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ZeroPadding1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.ZeroPadding1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt
index 965a4cca04651e123c5bd93484200a58b39918ba..bf9f43c1dbfbddb0ddd19b26d554df86ba493d25 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ZeroPadding2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.ZeroPadding2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt
index 1a624308878a68f1b48cb0f8b5e08dafbbfa0333..a78cfa826bfdf5f49947cd9ae2f9879bf36328e2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ZeroPadding3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.ZeroPadding3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.-layer-normalization.pbtxt
similarity index 76%
rename from tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.-layer-normalization.pbtxt
index 7758209adf8fe7a1306fa5ef125935dafd925c3e..9b5598eed07c9f04feb0d90820381abc12dbb456 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.-layer-normalization.pbtxt
@@ -1,21 +1,19 @@
-path: "tensorflow.keras.layers.CuDNNGRU"
+path: "tensorflow.keras.layers.experimental.LayerNormalization"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.layers.cudnn_recurrent.CuDNNGRU\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.cudnn_recurrent._CuDNNRNN\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.normalization.LayerNormalization\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
     mtype: "<type \'property\'>"
   }
   member {
-    name: "cell"
+    name: "dtype"
     mtype: "<type \'property\'>"
   }
   member {
-    name: "dtype"
+    name: "dynamic"
     mtype: "<type \'property\'>"
   }
   member {
@@ -66,10 +64,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "states"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -92,7 +86,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'units\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\'], varargs=None, keywords=kwargs, defaults=[\'glorot_uniform\', \'orthogonal\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'False\', \'False\', \'False\', \'False\'], "
+    argspec: "args=[\'self\', \'norm_axis\', \'params_axis\', \'epsilon\', \'center\', \'scale\', \'beta_initializer\', \'gamma_initializer\', \'beta_regularizer\', \'gamma_regularizer\', \'beta_constraint\', \'gamma_constraint\', \'trainable\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'-1\', \'1e-12\', \'True\', \'True\', \'zeros\', \'ones\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -112,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -124,11 +118,11 @@ tf_class {
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'initial_state\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "compute_output_shape"
@@ -146,10 +140,6 @@ tf_class {
     name: "get_config"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "get_initial_state"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -164,7 +154,7 @@ tf_class {
   }
   member_method {
     name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "get_output_at"
@@ -186,10 +176,6 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "reset_states"
-    argspec: "args=[\'self\', \'states\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0f229615461dc7b781c0ba2ec6f81692d65354bf
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.pbtxt
@@ -0,0 +1,7 @@
+path: "tensorflow.keras.layers.experimental"
+tf_module {
+  member {
+    name: "LayerNormalization"
+    mtype: "<type \'type\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.pbtxt
index 9d7e5bb8c7808689bedd8abb835e61c1f38fdb1d..cc0fdabee1db04ec48f1c4e23b765a2cf89d4ad8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.pbtxt
@@ -124,6 +124,10 @@ tf_module {
     name: "Dense"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "DenseFeatures"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "DepthwiseConv2D"
     mtype: "<type \'type\'>"
@@ -396,6 +400,10 @@ tf_module {
     name: "ZeroPadding3D"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
+  }
   member_method {
     name: "Input"
     argspec: "args=[\'shape\', \'batch_size\', \'name\', \'dtype\', \'sparse\', \'tensor\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'False\', \'None\'], "
@@ -412,6 +420,10 @@ tf_module {
     name: "concatenate"
     argspec: "args=[\'inputs\', \'axis\'], varargs=None, keywords=kwargs, defaults=[\'-1\'], "
   }
+  member_method {
+    name: "deserialize"
+    argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "dot"
     argspec: "args=[\'inputs\', \'axes\', \'normalize\'], varargs=None, keywords=kwargs, defaults=[\'False\'], "
@@ -428,6 +440,10 @@ tf_module {
     name: "multiply"
     argspec: "args=[\'inputs\'], varargs=None, keywords=kwargs, defaults=None"
   }
+  member_method {
+    name: "serialize"
+    argspec: "args=[\'layer\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "subtract"
     argspec: "args=[\'inputs\'], varargs=None, keywords=kwargs, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-binary-crossentropy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-binary-crossentropy.pbtxt
index 2f7da93f6f412ca559aec2f6acde2b80a5c93c86..1242eec68f1414f1c8e67bb95602687f4a58412f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-binary-crossentropy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-binary-crossentropy.pbtxt
@@ -1,11 +1,12 @@
 path: "tensorflow.keras.losses.BinaryCrossentropy"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.losses.BinaryCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'sum_over_batch_size\', \'None\'], "
+    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'sum_over_batch_size\', \'binary_crossentropy\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-categorical-crossentropy.pbtxt
index b3a7cd80973259bd5cdfe382c656a9478f8933d8..cf3c2de840450de8e9467269ec446172583e8ffd 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-categorical-crossentropy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-categorical-crossentropy.pbtxt
@@ -1,11 +1,12 @@
 path: "tensorflow.keras.losses.CategoricalCrossentropy"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.losses.CategoricalCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'sum_over_batch_size\', \'None\'], "
+    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'sum_over_batch_size\', \'categorical_crossentropy\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-categorical-hinge.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-categorical-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fa374afb28bc4d7fe226456743c285b4f539ced1
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-categorical-hinge.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.keras.losses.CategoricalHinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.CategoricalHinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'categorical_hinge\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-cosine-similarity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-cosine-similarity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..aa14c44fa3628236033e952b69f3a160c49a36fc
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-cosine-similarity.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.keras.losses.CosineSimilarity"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.CosineSimilarity\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'axis\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'sum_over_batch_size\', \'cosine_similarity\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-hinge.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a4c25eefcbbc75afb3765b11e325f6bd830ccba8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-hinge.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.keras.losses.Hinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.Hinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-huber.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-huber.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1fa8ffa95726f72b620c3908b48fe20dfae1dc17
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-huber.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.keras.losses.Huber"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.Huber\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'delta\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'1.0\', \'sum_over_batch_size\', \'huber_loss\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-k-l-divergence.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-k-l-divergence.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d950c789eb44fcad792a9d11856ce11143715807
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-k-l-divergence.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.keras.losses.KLDivergence"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.KLDivergence\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'kullback_leibler_divergence\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-log-cosh.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-log-cosh.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fbbd5317f89f801e8a4f4cc80e700e2b478ebf40
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-log-cosh.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.keras.losses.LogCosh"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.LogCosh\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'logcosh\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-log-loss.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-log-loss.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..40ca239cceec8a726f6fbb0b2a2c633d1499afa7
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-log-loss.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.keras.losses.LogLoss"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.LogLoss\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'logloss\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-loss.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-loss.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..04a9cc94201a5472a7c6158acfc4bfd48d4f74db
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-loss.pbtxt
@@ -0,0 +1,21 @@
+path: "tensorflow.keras.losses.Loss"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-absolute-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-absolute-error.pbtxt
index 712bb2ecd3526c354cbcf640e689526b2e415a13..9da6b59ec83bb5b74336a122a791a0d5ea3eb079 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-absolute-error.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-absolute-error.pbtxt
@@ -1,11 +1,12 @@
 path: "tensorflow.keras.losses.MeanAbsoluteError"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.losses.MeanAbsoluteError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'mean_absolute_error\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-absolute-percentage-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-absolute-percentage-error.pbtxt
index 7fe362da89b47a925cd4708909e1c882a9a23aca..7c3ae9b49a415c1586df01984bd73af38ee97558 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-absolute-percentage-error.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-absolute-percentage-error.pbtxt
@@ -1,11 +1,12 @@
 path: "tensorflow.keras.losses.MeanAbsolutePercentageError"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.losses.MeanAbsolutePercentageError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'mean_absolute_percentage_error\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-squared-error.pbtxt
index a5718533500d9508c558d25d13fc6b61518a73a0..2126ac68d2a4cd8f1b68466e073ec573d13f2cda 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-squared-error.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-squared-error.pbtxt
@@ -1,11 +1,12 @@
 path: "tensorflow.keras.losses.MeanSquaredError"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.losses.MeanSquaredError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'mean_squared_error\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-squared-logarithmic-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-squared-logarithmic-error.pbtxt
index 200006db355ca4dc8eb2f509bcb9da7543145548..6ef9610546a0ec662313534f424d49879187f302 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-squared-logarithmic-error.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-squared-logarithmic-error.pbtxt
@@ -1,11 +1,12 @@
 path: "tensorflow.keras.losses.MeanSquaredLogarithmicError"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.losses.MeanSquaredLogarithmicError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'mean_squared_logarithmic_error\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-poisson.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-poisson.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..61c90c3140e2b68b9796873b0de73668f1508476
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-poisson.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.keras.losses.Poisson"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.Poisson\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'poisson\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-sparse-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-sparse-categorical-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c13f9f967db7014548de1283c5d59bbac403299a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-sparse-categorical-crossentropy.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.keras.losses.SparseCategoricalCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.SparseCategoricalCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'from_logits\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-squared-hinge.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-squared-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fabe4c7814462b91a12062bac5c2119cfd45bccf
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-squared-hinge.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.keras.losses.SquaredHinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.SquaredHinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'squared_hinge\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.pbtxt
index 9e26ddbdca0c45df195dd566952379887dcfcff3..7e90b36ead9b0a8b58a621afaf379eebc2cfdc8d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.pbtxt
@@ -8,6 +8,38 @@ tf_module {
     name: "CategoricalCrossentropy"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "CategoricalHinge"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CosineSimilarity"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Hinge"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Huber"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "KLDivergence"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "LogCosh"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "LogLoss"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Loss"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "MeanAbsoluteError"
     mtype: "<type \'type\'>"
@@ -24,6 +56,18 @@ tf_module {
     name: "MeanSquaredLogarithmicError"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "Poisson"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SparseCategoricalCrossentropy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SquaredHinge"
+    mtype: "<type \'type\'>"
+  }
   member_method {
     name: "KLD"
     argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
@@ -46,11 +90,11 @@ tf_module {
   }
   member_method {
     name: "binary_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'False\', \'0\'], "
   }
   member_method {
     name: "categorical_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'False\', \'0\'], "
   }
   member_method {
     name: "categorical_hinge"
@@ -58,11 +102,15 @@ tf_module {
   }
   member_method {
     name: "cosine"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], "
   }
   member_method {
     name: "cosine_proximity"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], "
+  }
+  member_method {
+    name: "cosine_similarity"
+    argspec: "args=[\'y_true\', \'y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], "
   }
   member_method {
     name: "deserialize"
@@ -130,7 +178,7 @@ tf_module {
   }
   member_method {
     name: "sparse_categorical_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'axis\'], varargs=None, keywords=None, defaults=[\'False\', \'-1\'], "
   }
   member_method {
     name: "squared_hinge"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-a-u-c.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-a-u-c.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6e00a3a355269a0ccc5d69b3fcea106c4908e115
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-a-u-c.pbtxt
@@ -0,0 +1,200 @@
+path: "tensorflow.keras.metrics.AUC"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.AUC\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_thresholds\', \'curve\', \'summation_method\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'200\', \'ROC\', \'interpolation\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "interpolate_pr_auc"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-accuracy.pbtxt
index 2db07df5235e150f691a12d6b332c6d0d241ac19..18cde2f32aa10100c63d81470cb6e0fd2e61d4f0 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-accuracy.pbtxt
@@ -3,9 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Accuracy\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -15,6 +16,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -105,7 +110,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-accuracy.pbtxt
index 904ad3a21a05895b23e30dab82a89a31c74dcfca..c5d2fc9c5397ae20dbd0c7f8f7ce7801c63c3997 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-accuracy.pbtxt
@@ -3,9 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.BinaryAccuracy\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -15,6 +16,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -105,7 +110,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-crossentropy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a8662459c9ce52da3a42e9c5e47c52b6deb6ab06
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-crossentropy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.BinaryCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.BinaryCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'binary_crossentropy\', \'None\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
index 17b74924fab4f596a010d6b9731b474433a8153e..998c4cbb1fad2352cfb9a510ba6e9b153860fcf8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
@@ -3,9 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.CategoricalAccuracy\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -15,6 +16,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -105,7 +110,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..04f1794aba61aae085a7580806e524eea8b2a791
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.CategoricalCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.CategoricalCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'categorical_crossentropy\', \'None\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-hinge.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..19442b5028dda68548c19c74e0828abf4fd54534
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-hinge.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.CategoricalHinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.CategoricalHinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'categorical_hinge\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-cosine-similarity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-cosine-similarity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..678c7b0681fe4281893fba70b4652233a91e2a0c
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-cosine-similarity.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.CosineSimilarity"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.CosineSimilarity\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'axis\'], varargs=None, keywords=None, defaults=[\'cosine_similarity\', \'None\', \'-1\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-negatives.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-negatives.pbtxt
index 49f577e1367aece126449923f77f4f6c89493e99..5a94569660fdc31f1889b5ca64f1483970cb5235 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-negatives.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-negatives.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -104,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-positives.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-positives.pbtxt
index e8baf858669a446a11b44e044f36bfde61e440bb..9033d9e655b2f2b80836153c23d9927315360de6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-positives.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-positives.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -104,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-hinge.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dedc64f1375b66b90f655f280c1a56ba165cfa17
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-hinge.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.Hinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Hinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'hinge\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-k-l-divergence.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-k-l-divergence.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..af8366b60876cb31f840c5f5007e67980be8dc3a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-k-l-divergence.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.KLDivergence"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.KLDivergence\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'kullback_leibler_divergence\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-log-cosh-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-log-cosh-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a7e072e21cc94492ed27186f44b92863cd791d62
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-log-cosh-error.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.LogCoshError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.LogCoshError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'logcosh\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..75173ad17a9c1fa02451287adad10870a60d653b
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-error.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.MeanAbsoluteError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanAbsoluteError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_absolute_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7be81b63bbe01b8534bd64d163e735d735ff88f3
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.MeanAbsolutePercentageError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanAbsolutePercentageError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_absolute_percentage_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-io-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-io-u.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..21e44ed988494119662e5e1a5101edbe4d7a35fd
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-io-u.pbtxt
@@ -0,0 +1,196 @@
+path: "tensorflow.keras.metrics.MeanIoU"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanIoU\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_classes\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-relative-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-relative-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8ef17fc34566e8ab6c5cc73781b40cb0f7396067
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-relative-error.pbtxt
@@ -0,0 +1,198 @@
+path: "tensorflow.keras.metrics.MeanRelativeError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanRelativeError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'normalizer\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..363f532ba410f1ebae5f105769a0e69c2e2d0166
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-error.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.MeanSquaredError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanSquaredError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_squared_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..712f10cd3051fe3de82472cb0eef2ec5fb53b6dd
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.MeanSquaredLogarithmicError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanSquaredLogarithmicError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_squared_logarithmic_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-tensor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-tensor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fff91d2b44c6e1e7c1fa0339c737c4a44b9566b6
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean-tensor.pbtxt
@@ -0,0 +1,204 @@
+path: "tensorflow.keras.metrics.MeanTensor"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanTensor\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "count"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "total"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_tensor\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'values\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean.pbtxt
index 40fe64bbd2cec45b9a8c4e9b041d3fa858af1327..cffb444835c58c28953f85c61a8f2d98f2e74716 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean.pbtxt
@@ -1,9 +1,10 @@
 path: "tensorflow.keras.metrics.Mean"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-metric.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-metric.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ce746ab350bfa0534bf7f9ac7d6e8255c7749894
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-metric.pbtxt
@@ -0,0 +1,195 @@
+path: "tensorflow.keras.metrics.Metric"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-poisson.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-poisson.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..570b77408cbaa2b7a0089f9de8a528e604799abe
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-poisson.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.Poisson"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Poisson\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'poisson\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-precision.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-precision.pbtxt
index ae6a85026da80cd071984aede8d0ec4e9cd571c5..83535d56cfc37932be785684825bed0e29a4fa5e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-precision.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-precision.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Precision\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -83,7 +87,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'thresholds\', \'top_k\', \'class_id\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-recall.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-recall.pbtxt
index 31068a51d510a7b95f62f61f03d37176c0fca55d..9ec2bbc4a3c8709f162dc0407408b2fe29b695a8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-recall.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-recall.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Recall\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -83,7 +87,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'thresholds\', \'top_k\', \'class_id\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e2bdbd54e22756b823716c149cf0f24661acc812
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt
@@ -0,0 +1,198 @@
+path: "tensorflow.keras.metrics.RootMeanSquaredError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.RootMeanSquaredError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'root_mean_squared_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
index aa77d1972cea42184fbbdb91e117b08ba38328fd..172c40eb2777d5504968de225718c270a0ce4e99 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.SensitivitySpecificityBase\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -104,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
index 0c17452292a031d42f3da0d5844e99d1272dad25..8a24088257a423c18f347eb256915bda10459e1f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
@@ -3,9 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.SparseCategoricalAccuracy\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -15,6 +16,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -105,7 +110,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0cadc9dcd99c03d81907ad5b1c03fd3cba25f833
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.SparseCategoricalCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SparseCategoricalCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'from_logits\', \'axis\'], varargs=None, keywords=None, defaults=[\'sparse_categorical_crossentropy\', \'None\', \'False\', \'-1\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c135b8f680061a1e79fedd9d705d0fb54344823b
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.SparseTopKCategoricalAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SparseTopKCategoricalAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'k\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'5\', \'sparse_top_k_categorical_accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
index 67857aa89f1769c736d810cf5f73739021afeddf..4f6818797e1e55362a35d37e70a05bbb3b8d65b2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.SensitivitySpecificityBase\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -104,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-squared-hinge.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-squared-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..90bbb087fafcdcde5dee048c45adbc45e3be2e55
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-squared-hinge.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.SquaredHinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SquaredHinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'squared_hinge\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sum.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..30ef19e02cfc99d117e6a396beeaf6422a105013
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sum.pbtxt
@@ -0,0 +1,197 @@
+path: "tensorflow.keras.metrics.Sum"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Sum\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'sum\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'values\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e59476a2410f859dff7171162a2cab123d5e853d
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.TopKCategoricalAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.TopKCategoricalAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'k\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'5\', \'top_k_categorical_accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-negatives.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-negatives.pbtxt
index 1b5eb8d0de53960c3a98409119709c1307aa6379..6627a460c7522358a6f44d415a2ad8ce59b97427 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-negatives.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-negatives.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -104,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-positives.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-positives.pbtxt
index 5b9c470e32d7e038f9ba11e4f96ab6eaa6b60a87..8c3c2cb03a8b28db6212e29e0cb9b7b61fca7174 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-positives.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-positives.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -104,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.pbtxt
index 905021dd790205e64a6f9839218200db98941927..3f7fe4a2f4033faf1e4f79705fe78475866e80f9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.pbtxt
@@ -1,5 +1,9 @@
 path: "tensorflow.keras.metrics"
 tf_module {
+  member {
+    name: "AUC"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Accuracy"
     mtype: "<type \'type\'>"
@@ -8,10 +12,26 @@ tf_module {
     name: "BinaryAccuracy"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "BinaryCrossentropy"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "CategoricalAccuracy"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "CategoricalCrossentropy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CategoricalHinge"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CosineSimilarity"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "FalseNegatives"
     mtype: "<type \'type\'>"
@@ -20,10 +40,58 @@ tf_module {
     name: "FalsePositives"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "Hinge"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "KLDivergence"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "LogCoshError"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Mean"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "MeanAbsoluteError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanAbsolutePercentageError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanIoU"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanRelativeError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanSquaredError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanSquaredLogarithmicError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanTensor"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Metric"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Poisson"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Precision"
     mtype: "<type \'type\'>"
@@ -32,6 +100,10 @@ tf_module {
     name: "Recall"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "RootMeanSquaredError"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "SensitivityAtSpecificity"
     mtype: "<type \'type\'>"
@@ -40,10 +112,30 @@ tf_module {
     name: "SparseCategoricalAccuracy"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SparseCategoricalCrossentropy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SparseTopKCategoricalAccuracy"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "SpecificityAtSensitivity"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SquaredHinge"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Sum"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TopKCategoricalAccuracy"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "TrueNegatives"
     mtype: "<type \'type\'>"
@@ -78,7 +170,7 @@ tf_module {
   }
   member_method {
     name: "binary_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'False\', \'0\'], "
   }
   member_method {
     name: "categorical_accuracy"
@@ -86,15 +178,15 @@ tf_module {
   }
   member_method {
     name: "categorical_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'False\', \'0\'], "
   }
   member_method {
     name: "cosine"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], "
   }
   member_method {
     name: "cosine_proximity"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], "
   }
   member_method {
     name: "deserialize"
@@ -162,7 +254,7 @@ tf_module {
   }
   member_method {
     name: "sparse_categorical_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'axis\'], varargs=None, keywords=None, defaults=[\'False\', \'-1\'], "
   }
   member_method {
     name: "sparse_top_k_categorical_accuracy"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt
index c58c7bef22dd4bff95d8ff07a10e20bb1bc463ad..63100a2176869e0a4ff30355c3df7ceaefaef65e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -131,7 +135,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -163,19 +167,19 @@ tf_class {
   }
   member_method {
     name: "evaluate"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "fit"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_freq\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "fit_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'validation_freq\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'1\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
   }
   member_method {
     name: "from_config"
@@ -231,11 +235,11 @@ tf_class {
   }
   member_method {
     name: "predict"
-    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "predict_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "predict_on_batch"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt
index 473a1c16fb1edfbf37a7752e273566c1310853af..5c9ba04296ee39e1fdf014396d166d91ef5f714c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -136,7 +140,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -168,19 +172,19 @@ tf_class {
   }
   member_method {
     name: "evaluate"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "fit"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_freq\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "fit_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'validation_freq\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'1\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
   }
   member_method {
     name: "from_config"
@@ -240,7 +244,7 @@ tf_class {
   }
   member_method {
     name: "predict"
-    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "predict_classes"
@@ -248,7 +252,7 @@ tf_class {
   }
   member_method {
     name: "predict_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "predict_on_batch"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adadelta.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adadelta.pbtxt
index b9ce154bddef609e0aaf6627d6f59de551e51e3b..8471803624634eb2d3bacd79e236e51d4488a764 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adadelta.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adadelta.pbtxt
@@ -1,15 +1,36 @@
 path: "tensorflow.keras.optimizers.Adadelta"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Adadelta\'>"
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adadelta.Adadelta\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'rho\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'1.0\', \'0.95\', \'None\', \'0.0\'], "
+    argspec: "args=[\'self\', \'learning_rate\', \'rho\', \'epsilon\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.95\', \'1e-07\', \'Adadelta\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "get_config"
@@ -19,6 +40,14 @@ tf_class {
     name: "get_gradients"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_updates"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
@@ -27,8 +56,16 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adagrad.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adagrad.pbtxt
index d0dc9e37a386a26143365eb443d5ba5fce8a87d9..0466ea65fa3ccaab1459841def55d6f907c7b14c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adagrad.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adagrad.pbtxt
@@ -1,15 +1,36 @@
 path: "tensorflow.keras.optimizers.Adagrad"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Adagrad\'>"
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adagrad.Adagrad\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'0.01\', \'None\', \'0.0\'], "
+    argspec: "args=[\'self\', \'learning_rate\', \'initial_accumulator_value\', \'epsilon\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.1\', \'1e-07\', \'Adagrad\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "get_config"
@@ -19,6 +40,14 @@ tf_class {
     name: "get_gradients"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_updates"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
@@ -27,8 +56,16 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adam.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adam.pbtxt
index 06815fa99a4a474ec131c29d0cbc78bb2b9cb72d..9762fad5d0fb7690e041b853eba65bee1583ad14 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adam.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adam.pbtxt
@@ -1,15 +1,36 @@
 path: "tensorflow.keras.optimizers.Adam"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Adam\'>"
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adam.Adam\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'beta_1\', \'beta_2\', \'epsilon\', \'decay\', \'amsgrad\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'None\', \'0.0\', \'False\'], "
+    argspec: "args=[\'self\', \'learning_rate\', \'beta_1\', \'beta_2\', \'epsilon\', \'amsgrad\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-07\', \'False\', \'Adam\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "get_config"
@@ -19,6 +40,14 @@ tf_class {
     name: "get_gradients"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_updates"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
@@ -27,8 +56,16 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adamax.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adamax.pbtxt
index 47b55fdb44e79e976b6de13d760a7cf175323c6c..f477a60d237f5801b8ac8713c150cd83f2b3d768 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adamax.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-adamax.pbtxt
@@ -1,15 +1,36 @@
 path: "tensorflow.keras.optimizers.Adamax"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Adamax\'>"
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adamax.Adamax\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'beta_1\', \'beta_2\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'0.002\', \'0.9\', \'0.999\', \'None\', \'0.0\'], "
+    argspec: "args=[\'self\', \'learning_rate\', \'beta_1\', \'beta_2\', \'epsilon\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-07\', \'Adamax\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "get_config"
@@ -19,6 +40,14 @@ tf_class {
     name: "get_gradients"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_updates"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
@@ -27,8 +56,16 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-ftrl.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-ftrl.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ad42c6b75b32947635ec1098a3d639e011ec3765
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-ftrl.pbtxt
@@ -0,0 +1,71 @@
+path: "tensorflow.keras.optimizers.Ftrl"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.ftrl.Ftrl\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'learning_rate\', \'learning_rate_power\', \'initial_accumulator_value\', \'l1_regularization_strength\', \'l2_regularization_strength\', \'name\', \'l2_shrinkage_regularization_strength\'], varargs=None, keywords=kwargs, defaults=[\'-0.5\', \'0.1\', \'0.0\', \'0.0\', \'Ftrl\', \'0.0\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_gradients"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-nadam.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-nadam.pbtxt
index 8c63a7dda98568b24ea1b3cda15d4c840fbfd804..3ffb4bb8b4dea5840013e830efc7eec6699f71ed 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-nadam.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-nadam.pbtxt
@@ -1,15 +1,36 @@
 path: "tensorflow.keras.optimizers.Nadam"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Nadam\'>"
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.nadam.Nadam\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'beta_1\', \'beta_2\', \'epsilon\', \'schedule_decay\'], varargs=None, keywords=kwargs, defaults=[\'0.002\', \'0.9\', \'0.999\', \'None\', \'0.004\'], "
+    argspec: "args=[\'self\', \'learning_rate\', \'beta_1\', \'beta_2\', \'epsilon\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-07\', \'Nadam\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "get_config"
@@ -19,6 +40,14 @@ tf_class {
     name: "get_gradients"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_updates"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
@@ -27,8 +56,16 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-optimizer.pbtxt
index 53d64dae932e250b9d81b2767a833de3bac8c403..9639c71ce415f5a942485fdc0d40f32c24f16b7d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-optimizer.pbtxt
@@ -1,14 +1,35 @@
 path: "tensorflow.keras.optimizers.Optimizer"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "get_config"
@@ -18,6 +39,14 @@ tf_class {
     name: "get_gradients"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_updates"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
@@ -26,8 +55,16 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-r-m-sprop.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
index a1e9b8cceb95e8f25ac5f414fadacf237be33cd9..2a7603d69b4f55d23e03e6e3d4fa5e60aeaac4c6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
@@ -1,15 +1,36 @@
 path: "tensorflow.keras.optimizers.RMSprop"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.optimizers.RMSprop\'>"
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.rmsprop.RMSprop\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'rho\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'None\', \'0.0\'], "
+    argspec: "args=[\'self\', \'learning_rate\', \'rho\', \'momentum\', \'epsilon\', \'centered\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.0\', \'1e-07\', \'False\', \'RMSprop\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "get_config"
@@ -19,6 +40,14 @@ tf_class {
     name: "get_gradients"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_updates"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
@@ -27,8 +56,16 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-s-g-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-s-g-d.pbtxt
index a67fefb1bafebd62db9f6108f0fe1847b5d2e0cb..41635553347f5f1c04c221574ce7e5c6ac05275d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-s-g-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.-s-g-d.pbtxt
@@ -1,15 +1,36 @@
 path: "tensorflow.keras.optimizers.SGD"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.optimizers.SGD\'>"
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.gradient_descent.SGD\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'momentum\', \'decay\', \'nesterov\'], varargs=None, keywords=kwargs, defaults=[\'0.01\', \'0.0\', \'0.0\', \'False\'], "
+    argspec: "args=[\'self\', \'learning_rate\', \'momentum\', \'nesterov\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.0\', \'False\', \'SGD\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "get_config"
@@ -19,6 +40,14 @@ tf_class {
     name: "get_gradients"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_updates"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
@@ -27,8 +56,16 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.pbtxt
index 7257b02087e237eaa47ed6a042559aa1332fc87b..7a333834c267e59f7a09c4936b8ed59776be7ee5 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.pbtxt
@@ -16,6 +16,10 @@ tf_module {
     name: "Adamax"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "Ftrl"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Nadam"
     mtype: "<type \'type\'>"
@@ -32,6 +36,10 @@ tf_module {
     name: "SGD"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "schedules"
+    mtype: "<type \'module\'>"
+  }
   member_method {
     name: "deserialize"
     argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.schedules.-exponential-decay.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.schedules.-exponential-decay.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..25ae478cb2c663b8a856bd29146558b808499079
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.schedules.-exponential-decay.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.optimizers.schedules.ExponentialDecay"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.ExponentialDecay\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initial_learning_rate\', \'decay_steps\', \'decay_rate\', \'staircase\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.schedules.-inverse-time-decay.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.schedules.-inverse-time-decay.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b2fe61f4d2cb8f76fe1c8d6261b5f383b79281f0
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.schedules.-inverse-time-decay.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.optimizers.schedules.InverseTimeDecay"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.InverseTimeDecay\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initial_learning_rate\', \'decay_steps\', \'decay_rate\', \'staircase\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.ones.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.schedules.-learning-rate-schedule.pbtxt
similarity index 54%
rename from tensorflow/tools/api/golden/v2/tensorflow.initializers.ones.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.schedules.-learning-rate-schedule.pbtxt
index 18481d48150d2dcf7d6908ab1914ab217da93c10..3b33bd7526bd3f67f54450f97adf3d1d4d717051 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.initializers.ones.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.schedules.-learning-rate-schedule.pbtxt
@@ -1,11 +1,9 @@
-path: "tensorflow.initializers.ones"
+path: "tensorflow.keras.optimizers.schedules.LearningRateSchedule"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Ones\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'dtype\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\"], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.schedules.-piecewise-constant-decay.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.schedules.-piecewise-constant-decay.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6f1496492abfabb04bd47834d434ab8df05af705
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.schedules.-piecewise-constant-decay.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.optimizers.schedules.PiecewiseConstantDecay"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.PiecewiseConstantDecay\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'boundaries\', \'values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.schedules.-polynomial-decay.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.schedules.-polynomial-decay.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..728436c36111de60c3752e09049ffb5678e4b2d1
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.schedules.-polynomial-decay.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.optimizers.schedules.PolynomialDecay"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.PolynomialDecay\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initial_learning_rate\', \'decay_steps\', \'end_learning_rate\', \'power\', \'cycle\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0001\', \'1.0\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.schedules.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.schedules.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..024e472a734935e668b9d6ee6e9c115cc90bdcd8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.optimizers.schedules.pbtxt
@@ -0,0 +1,31 @@
+path: "tensorflow.keras.optimizers.schedules"
+tf_module {
+  member {
+    name: "ExponentialDecay"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "InverseTimeDecay"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "LearningRateSchedule"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "PiecewiseConstantDecay"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "PolynomialDecay"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "deserialize"
+    argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "serialize"
+    argspec: "args=[\'learning_rate_schedule\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt
index 059c91f724aae187055f8323c7748dc99f153302..d012bd97efe8748463fb7e465ad3ce7c015d841e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -15,6 +15,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt
index d06c8e81ee5d2a8b487d7c3c3714a1f4ed2c8e80..90a27e5d66af196074c8c8ca44830ad6ce073da3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -15,6 +15,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt
index 6be8e7c210f3f0a28ed8ad8a6672bc4323eb7f9d..d653a0cec41e6c6e459b54c9126db20d13750112 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -15,6 +15,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt
index 16d9ecce10cfb3c28cd1cf47fd65c987680bda41..32f9345ea40915c27682e4e7274e4a5852a72078 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.normalization.BatchNormalizationV2\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -15,6 +15,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt
index 21c695935ce7751df67e09091c961e9e0cfbbf7c..0e7adfe26b66ec5e877fc13090311ffb35cfd6d7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -15,6 +15,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt
index f24d0307207588610c1f764bf43912b64c3ea2c6..5296597dc55bbba51d5c11ad3256f65580e37374 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -16,6 +16,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt
index 0a510ece355435d8e75e39d5f7cdc6cebefe32cf..5ae9568e6421919c97ea7ec2300d33ccddc53e02 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -15,6 +15,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt
index d0ee44bed3c739da27cc83f0e643e1ea9dd98078..aa0da6d68ca1eb32dc47fc3a378c953fde165c79 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -16,6 +16,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt
index 546de3cdab3aa0519450f74c6c6d0fe74ddc000c..516f0faea98550eb3d85fcbf0185aeddbea74ca4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -15,6 +15,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt
index 3ad311581eba815c2d1b0155a1380db80dd61c5d..d92af8f326484f99dbf08f9695e257a67e1697f2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dense\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt
index 9b83271350cf90a2d430303dfecfd28facad272b..614643fc9945a4f1c1cc23b6dadafe3a47d4a82c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt
index 87a7fb3d843e3e8e3e2fe5a56ec0b181355a6d7b..31022d3049e91d5026625dbf89c79ba424d5949f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Flatten\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt
index 32b17e90ade7aa0054a390256e3abadfc7011cbe..03bbf39022d366afa67f62e5299a0ad26ba7d4db 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.layers.Layer"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt
index 643c469717c258207046ddd93a318f47753de46b..63a301e3e6eb5370a832b35a5399752d3fb68fa9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -15,6 +15,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt
index 434e25adc12c2f2f704b07087b8552781ac2d024..d81a3368ced00d6090ebeab1ffec85e19953846b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -15,6 +15,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt
index 089fc6f9243c85937500b6275da034eb0748ecd4..48d93d503e8939dd6bc2896014146a1151ae1c46 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -15,6 +15,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt
index bc3d58b9ca9789b43bc91f9283a81811f2b6a4e9..2f1f1c1e3fd5c1528c7848cf93f8ff123a0cb743 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -16,6 +16,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt
index fe7d71af3a4a46bed4ea9e62cbd7ad17987517c7..bd7549af4c420f69ea2992a9c9dfa39471d083d9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt
@@ -6,7 +6,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -16,6 +16,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-adjoint.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-adjoint.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..37344f70311bd225856ce72c52dfd0ac1fb09075
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-adjoint.pbtxt
@@ -0,0 +1,150 @@
+path: "tensorflow.linalg.LinearOperatorAdjoint"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_adjoint.LinearOperatorAdjoint\'>"
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "batch_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "domain_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph_parents"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_non_singular"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_positive_definite"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_self_adjoint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_square"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "operator"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "range_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "tensor_rank"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'operator\', \'is_non_singular\', \'is_self_adjoint\', \'is_positive_definite\', \'is_square\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_to_tensor"
+    argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
+  }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
+  member_method {
+    name: "assert_non_singular"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
+  }
+  member_method {
+    name: "assert_positive_definite"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_positive_definite\'], "
+  }
+  member_method {
+    name: "assert_self_adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_self_adjoint\'], "
+  }
+  member_method {
+    name: "batch_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
+  }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
+  member_method {
+    name: "determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
+  }
+  member_method {
+    name: "diag_part"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'diag_part\'], "
+  }
+  member_method {
+    name: "domain_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
+  }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
+  member_method {
+    name: "log_abs_determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
+  }
+  member_method {
+    name: "matmul"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'matmul\'], "
+  }
+  member_method {
+    name: "matvec"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'matvec\'], "
+  }
+  member_method {
+    name: "range_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'range_dimension_tensor\'], "
+  }
+  member_method {
+    name: "shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'shape_tensor\'], "
+  }
+  member_method {
+    name: "solve"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'solve\'], "
+  }
+  member_method {
+    name: "solvevec"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'solve\'], "
+  }
+  member_method {
+    name: "tensor_rank_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'tensor_rank_tensor\'], "
+  }
+  member_method {
+    name: "to_dense"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'to_dense\'], "
+  }
+  member_method {
+    name: "trace"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'trace\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-block-diag.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-block-diag.pbtxt
index 773c74e64d13ca4a840b7f599fc2cbe9c161cd03..ddef774a75157401354d29b75c7a00fbedfd9ec5 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-block-diag.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-block-diag.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_block_diag.LinearOperatorBlockDiag\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -63,6 +67,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
@@ -95,6 +103,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant.pbtxt
index 533544d21f2753f785113a30518f4fcbcff96cd7..97a6b1a475b9555f6f5ded273050b5751625b78b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant.pbtxt
@@ -4,6 +4,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_circulant._BaseLinearOperatorCirculant\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -72,6 +76,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_hermitian_spectrum"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_hermitian_spectrum\'], "
@@ -116,6 +124,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
index e3926eb6d4714731d09ff9c5b75a89830c06e7c1..e2bfe7e7d852e825f826f9f785c40b9550f706be 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
@@ -4,6 +4,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_circulant._BaseLinearOperatorCirculant\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -72,6 +76,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_hermitian_spectrum"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_hermitian_spectrum\'], "
@@ -116,6 +124,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
index ba209df7824a9cc076499458e35acd7dcf1eaf35..8885526669065e5a5506bfe1bf93076f4584f9d9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
@@ -4,6 +4,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_circulant._BaseLinearOperatorCirculant\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -72,6 +76,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_hermitian_spectrum"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_hermitian_spectrum\'], "
@@ -116,6 +124,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-composition.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-composition.pbtxt
index 081fb0e08bcd1b35ab44459d1c8eb0857dd14956..2a017fcb8987d46d8e24d2d21b43ae9962ad2075 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-composition.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-composition.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_composition.LinearOperatorComposition\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -63,6 +67,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
@@ -95,6 +103,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-diag.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-diag.pbtxt
index 2014a04301618c20af5cf6f1144eb4dbda2479e1..31dcf7b0a6b4699e7009746fe62ec5551ee3e11b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-diag.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-diag.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_diag.LinearOperatorDiag\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -63,6 +67,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
@@ -95,6 +103,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-full-matrix.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
index 9a87ae9687741090485bd8d4d0d07d359a2015e7..0ad39b4ba6006a1efa6b16e650ef3140516775f6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_full_matrix.LinearOperatorFullMatrix\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -59,6 +63,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
@@ -91,6 +99,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-identity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-identity.pbtxt
index 33afb835ce1d524991c0024bfb87c29a72aac08e..f66a5a833a42c06f32696abc0a3114aa89f73a7e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-identity.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-identity.pbtxt
@@ -4,6 +4,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_identity.BaseLinearOperatorIdentity\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -60,6 +64,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'mat\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
@@ -92,6 +100,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-inversion.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-inversion.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a7eb144d83aaeb2997d44b703b46de9a01c3a478
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-inversion.pbtxt
@@ -0,0 +1,150 @@
+path: "tensorflow.linalg.LinearOperatorInversion"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_inversion.LinearOperatorInversion\'>"
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "batch_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "domain_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph_parents"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_non_singular"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_positive_definite"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_self_adjoint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_square"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "operator"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "range_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "tensor_rank"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'operator\', \'is_non_singular\', \'is_self_adjoint\', \'is_positive_definite\', \'is_square\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_to_tensor"
+    argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
+  }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
+  member_method {
+    name: "assert_non_singular"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
+  }
+  member_method {
+    name: "assert_positive_definite"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_positive_definite\'], "
+  }
+  member_method {
+    name: "assert_self_adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_self_adjoint\'], "
+  }
+  member_method {
+    name: "batch_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
+  }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
+  member_method {
+    name: "determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
+  }
+  member_method {
+    name: "diag_part"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'diag_part\'], "
+  }
+  member_method {
+    name: "domain_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
+  }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
+  member_method {
+    name: "log_abs_determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
+  }
+  member_method {
+    name: "matmul"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'matmul\'], "
+  }
+  member_method {
+    name: "matvec"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'matvec\'], "
+  }
+  member_method {
+    name: "range_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'range_dimension_tensor\'], "
+  }
+  member_method {
+    name: "shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'shape_tensor\'], "
+  }
+  member_method {
+    name: "solve"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'solve\'], "
+  }
+  member_method {
+    name: "solvevec"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'solve\'], "
+  }
+  member_method {
+    name: "tensor_rank_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'tensor_rank_tensor\'], "
+  }
+  member_method {
+    name: "to_dense"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'to_dense\'], "
+  }
+  member_method {
+    name: "trace"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'trace\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-kronecker.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-kronecker.pbtxt
index a9078c8ab5cca078237a29febabdbbd4a8b6c89c..c983f8c6e6aa53716d1c2d07f219baccda99bd04 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-kronecker.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-kronecker.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_kronecker.LinearOperatorKronecker\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -63,6 +67,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
@@ -95,6 +103,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
index 4cfa3bb30d7382f3cf3cc0d5ce412d230d2a4287..813aec2a137ccaaea9718b7d0254ed0d60500247 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_low_rank_update.LinearOperatorLowRankUpdate\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "base_operator"
     mtype: "<type \'property\'>"
@@ -83,6 +87,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
@@ -115,6 +123,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
index a87649133fd207ad59f2124c6b0b5aa44916e5a5..0bb7a15e1342aeb4be94e9a40e1e6b1828e397b8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_lower_triangular.LinearOperatorLowerTriangular\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -59,6 +63,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
@@ -91,6 +99,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
index 32656467840fbbc0c8708ea68aac5aa75c11a540..7747c985404e54f93d012aba86a39503a855c76d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
@@ -4,6 +4,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_identity.BaseLinearOperatorIdentity\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -64,6 +68,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'mat\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
@@ -96,6 +104,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-zeros.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-zeros.pbtxt
index 49d8890c8942bc0021886ee6c9bc4e7625452655..590782bbc1d57ed4efb1cfb68b145b49d64c4545 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-zeros.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-zeros.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_zeros.LinearOperatorZeros\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -59,6 +63,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'mat\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
@@ -91,6 +99,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator.pbtxt
index c89dc067b331603e227d9d578147e2dd1ee4a900..ed6bfdff288220fc0bcdf9fb6c4c78abfe5e43b2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator.pbtxt
@@ -2,6 +2,10 @@ path: "tensorflow.linalg.LinearOperator"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -58,6 +62,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
@@ -90,6 +98,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt
index 9f7b422fabcd55aed98bc93f01143d35698c0399..53564e33596c1789493f7eaa00ca74d491b041db 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt
@@ -4,6 +4,10 @@ tf_module {
     name: "LinearOperator"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "LinearOperatorAdjoint"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "LinearOperatorBlockDiag"
     mtype: "<type \'type\'>"
@@ -36,6 +40,10 @@ tf_module {
     name: "LinearOperatorIdentity"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "LinearOperatorInversion"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "LinearOperatorKronecker"
     mtype: "<type \'type\'>"
@@ -196,4 +204,8 @@ tf_module {
     name: "triangular_solve"
     argspec: "args=[\'matrix\', \'rhs\', \'lower\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'False\', \'None\'], "
   }
+  member_method {
+    name: "tridiagonal_solve"
+    argspec: "args=[\'diagonals\', \'rhs\', \'diagonals_format\', \'transpose_rhs\', \'conjugate_rhs\', \'name\'], varargs=None, keywords=None, defaults=[\'compact\', \'False\', \'False\', \'None\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.-op-hint.-op-hint-argument-tracker.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.-op-hint.-op-hint-argument-tracker.pbtxt
index 1fe179f6c1b64ebc2f7535719bc1598577ee7f03..68cb07ea6fab85824400cce8408ebcb1dc030f8c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.lite.-op-hint.-op-hint-argument-tracker.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.-op-hint.-op-hint-argument-tracker.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'function_name\', \'unique_function_id\', \'node_name_prefix\', \'attr_name\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'function_name\', \'unique_function_id\', \'node_name_prefix\', \'attr_name\', \'level\', \'children_inputs_mappings\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
   }
   member_method {
     name: "add"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.-op-hint.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.-op-hint.pbtxt
index 66e692a5a379203cb491980802b7003072bfe76c..3ac478f7626556574983aed4e5d284cb758406c8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.lite.-op-hint.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.-op-hint.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "AGGREGATE_STACK"
     mtype: "<type \'str\'>"
   }
+  member {
+    name: "CHILDREN_INPUTS_MAPPINGS"
+    mtype: "<type \'str\'>"
+  }
   member {
     name: "FUNCTION_AGGREGATE_ATTR"
     mtype: "<type \'str\'>"
@@ -22,6 +26,10 @@ tf_class {
     name: "FUNCTION_INPUT_INDEX_ATTR"
     mtype: "<type \'str\'>"
   }
+  member {
+    name: "FUNCTION_LEVEL_ATTR"
+    mtype: "<type \'str\'>"
+  }
   member {
     name: "FUNCTION_NAME_ATTR"
     mtype: "<type \'str\'>"
@@ -48,7 +56,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'function_name\'], varargs=None, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'function_name\', \'level\', \'children_inputs_mappings\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'None\'], "
   }
   member_method {
     name: "add_input"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.-optimize.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.-optimize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fedb5ee9fa4a31f25133bef55b980c18ed74fb79
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.-optimize.pbtxt
@@ -0,0 +1,12 @@
+path: "tensorflow.lite.Optimize"
+tf_class {
+  is_instance: "<enum \'Optimize\'>"
+  member {
+    name: "OPTIMIZE_FOR_LATENCY"
+    mtype: "<enum \'Optimize\'>"
+  }
+  member {
+    name: "OPTIMIZE_FOR_SIZE"
+    mtype: "<enum \'Optimize\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.-representative-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.-representative-dataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d14b69531d183faa35d19f379d6b20c29b02e6e8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.-representative-dataset.pbtxt
@@ -0,0 +1,9 @@
+path: "tensorflow.lite.RepresentativeDataset"
+tf_class {
+  is_instance: "<class \'tensorflow.lite.python.lite.RepresentativeDataset\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'input_gen\', \'output_gen\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.-t-f-lite-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.-t-f-lite-l-s-t-m-cell.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..34b4133d0ca6edb929c780babd0652187d41c76b
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.-t-f-lite-l-s-t-m-cell.pbtxt
@@ -0,0 +1,210 @@
+path: "tensorflow.lite.experimental.nn.TFLiteLSTMCell"
+tf_class {
+  is_instance: "<class \'tensorflow.lite.experimental.examples.lstm.rnn_cell.TFLiteLSTMCell\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
+  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_size"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "scope_name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "state_size"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_units\', \'use_peepholes\', \'cell_clip\', \'initializer\', \'num_proj\', \'proj_clip\', \'num_unit_shards\', \'num_proj_shards\', \'forget_bias\', \'state_is_tuple\', \'activation\', \'reuse\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'1.0\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'inputs_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "zero_state"
+    argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.-tf-lite-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.-tf-lite-r-n-n-cell.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2fff2b8606c98ddffabbd3d27e7a7848d1fce86a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.-tf-lite-r-n-n-cell.pbtxt
@@ -0,0 +1,210 @@
+path: "tensorflow.lite.experimental.nn.TfLiteRNNCell"
+tf_class {
+  is_instance: "<class \'tensorflow.lite.experimental.examples.lstm.rnn_cell.TfLiteRNNCell\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
+  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_size"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "scope_name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "state_size"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_units\', \'activation\', \'reuse\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'inputs_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "zero_state"
+    argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5ce858c46ba304b7cc3ce6b257518cdcc9aac646
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.nn.pbtxt
@@ -0,0 +1,15 @@
+path: "tensorflow.lite.experimental.nn"
+tf_module {
+  member {
+    name: "TFLiteLSTMCell"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TfLiteRNNCell"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "dynamic_rnn"
+    argspec: "args=[\'cell\', \'inputs\', \'sequence_length\', \'initial_state\', \'dtype\', \'parallel_iterations\', \'swap_memory\', \'time_major\', \'scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'False\', \'True\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8d585ea6643b731fa4767301f13654fc699d6e23
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.experimental.pbtxt
@@ -0,0 +1,7 @@
+path: "tensorflow.lite.experimental"
+tf_module {
+  member {
+    name: "nn"
+    mtype: "<type \'module\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.pbtxt
index 154dd00821794ef4a5118e98d67e32beca38bebf..b508c3255e3817c14d4cfb6adf0e67606c744e2f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.lite.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.pbtxt
@@ -12,6 +12,14 @@ tf_module {
     name: "OpsSet"
     mtype: "<class \'enum.EnumMeta\'>"
   }
+  member {
+    name: "Optimize"
+    mtype: "<class \'enum.EnumMeta\'>"
+  }
+  member {
+    name: "RepresentativeDataset"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "TFLiteConverter"
     mtype: "<type \'type\'>"
@@ -24,6 +32,10 @@ tf_module {
     name: "constants"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
+  }
   member_method {
     name: "toco_convert"
     argspec: "args=[\'input_data\', \'input_tensors\', \'output_tensors\'], varargs=args, keywords=kwargs, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt
index f34e2c2aa5a5b30e037157bc84894da5dce78538..6fea38d03acaf1c6c3ec60109b6e16f0a2d3f11f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt
@@ -102,7 +102,7 @@ tf_module {
   }
   member_method {
     name: "count_nonzero"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'dtype\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \"<dtype: \'int64\'>\", \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'dtype\', \'name\', \'reduction_indices\', \'keep_dims\', \'input\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \"<dtype: \'int64\'>\", \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "cumprod"
@@ -120,6 +120,10 @@ tf_module {
     name: "divide"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "divide_no_nan"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "equal"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -260,10 +264,18 @@ tf_module {
     name: "multiply"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "multiply_no_nan"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "negative"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "nextafter"
+    argspec: "args=[\'x1\', \'x2\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "not_equal"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -296,6 +308,10 @@ tf_module {
     name: "reduce_any"
     argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "reduce_euclidean_norm"
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "reduce_logsumexp"
     argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nest.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nest.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..70bb6d760bc7a6c55bbdfd5c05cde4c08769786d
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nest.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.nest"
+tf_module {
+  member_method {
+    name: "assert_same_structure"
+    argspec: "args=[\'nest1\', \'nest2\', \'check_types\', \'expand_composites\'], varargs=None, keywords=None, defaults=[\'True\', \'False\'], "
+  }
+  member_method {
+    name: "flatten"
+    argspec: "args=[\'structure\', \'expand_composites\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "is_nested"
+    argspec: "args=[\'seq\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "map_structure"
+    argspec: "args=[\'func\'], varargs=structure, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "pack_sequence_as"
+    argspec: "args=[\'structure\', \'flat_sequence\', \'expand_composites\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.pbtxt
index 40e20f8c919e64362e5697bd00ded70d0c2292a0..53f8697a0ed67024d9332b9367bf76745f45644e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.pbtxt
@@ -22,15 +22,27 @@ tf_module {
   }
   member_method {
     name: "avg_pool"
-    argspec: "args=[\'value\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+    argspec: "args=[\'value\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\', \'input\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "avg_pool1d"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NWC\', \'None\'], "
+  }
+  member_method {
+    name: "avg_pool2d"
+    argspec: "args=[\'value\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\', \'input\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\', \'None\'], "
   }
   member_method {
     name: "avg_pool3d"
     argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'None\'], "
   }
+  member_method {
+    name: "avg_pool_v2"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "batch_norm_with_global_normalization"
-    argspec: "args=[\'t\', \'m\', \'v\', \'beta\', \'gamma\', \'variance_epsilon\', \'scale_after_normalization\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'t\', \'m\', \'v\', \'beta\', \'gamma\', \'variance_epsilon\', \'scale_after_normalization\', \'name\', \'input\', \'mean\', \'variance\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "batch_normalization"
@@ -54,11 +66,15 @@ tf_module {
   }
   member_method {
     name: "conv1d"
-    argspec: "args=[\'value\', \'filters\', \'stride\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'value\', \'filters\', \'stride\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'name\', \'input\', \'dilations\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "conv1d_transpose"
+    argspec: "args=[\'input\', \'filters\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'NWC\', \'None\', \'None\'], "
   }
   member_method {
     name: "conv2d"
-    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
+    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'dilations\', \'name\', \'filters\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'NHWC\', \'[1, 1, 1, 1]\', \'None\', \'None\'], "
   }
   member_method {
     name: "conv2d_backprop_filter"
@@ -66,15 +82,15 @@ tf_module {
   }
   member_method {
     name: "conv2d_backprop_input"
-    argspec: "args=[\'input_sizes\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
+    argspec: "args=[\'input_sizes\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'dilations\', \'name\', \'filters\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'True\', \'NHWC\', \'[1, 1, 1, 1]\', \'None\', \'None\'], "
   }
   member_method {
     name: "conv2d_transpose"
-    argspec: "args=[\'value\', \'filter\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'NHWC\', \'None\'], "
+    argspec: "args=[\'value\', \'filter\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'name\', \'input\', \'filters\', \'dilations\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'SAME\', \'NHWC\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "conv3d"
-    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'[1, 1, 1, 1, 1]\', \'None\'], "
+    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\', \'filters\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'NDHWC\', \'[1, 1, 1, 1, 1]\', \'None\', \'None\'], "
   }
   member_method {
     name: "conv3d_backprop_filter"
@@ -86,11 +102,15 @@ tf_module {
   }
   member_method {
     name: "conv3d_transpose"
-    argspec: "args=[\'value\', \'filter\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'NDHWC\', \'None\'], "
+    argspec: "args=[\'value\', \'filter\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'name\', \'input\', \'filters\', \'dilations\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'SAME\', \'NDHWC\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "conv_transpose"
+    argspec: "args=[\'input\', \'filters\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "convolution"
-    argspec: "args=[\'input\', \'filter\', \'padding\', \'strides\', \'dilation_rate\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input\', \'filter\', \'padding\', \'strides\', \'dilation_rate\', \'name\', \'data_format\', \'filters\', \'dilations\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "crelu"
@@ -110,7 +130,7 @@ tf_module {
   }
   member_method {
     name: "ctc_loss"
-    argspec: "args=[\'labels\', \'inputs\', \'sequence_length\', \'preprocess_collapse_repeated\', \'ctc_merge_repeated\', \'ignore_longer_outputs_than_inputs\', \'time_major\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'False\', \'True\'], "
+    argspec: "args=[\'labels\', \'inputs\', \'sequence_length\', \'preprocess_collapse_repeated\', \'ctc_merge_repeated\', \'ignore_longer_outputs_than_inputs\', \'time_major\', \'logits\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'True\', \'False\', \'True\', \'None\'], "
   }
   member_method {
     name: "ctc_loss_v2"
@@ -126,7 +146,7 @@ tf_module {
   }
   member_method {
     name: "depthwise_conv2d"
-    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'rate\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'rate\', \'name\', \'data_format\', \'dilations\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "depthwise_conv2d_backprop_filter"
@@ -150,7 +170,7 @@ tf_module {
   }
   member_method {
     name: "dilation2d"
-    argspec: "args=[\'input\', \'filter\', \'strides\', \'rates\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'input\', \'filter\', \'strides\', \'rates\', \'padding\', \'name\', \'filters\', \'dilations\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "dropout"
@@ -234,19 +254,31 @@ tf_module {
   }
   member_method {
     name: "max_pool"
-    argspec: "args=[\'value\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+    argspec: "args=[\'value\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\', \'input\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "max_pool1d"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NWC\', \'None\'], "
+  }
+  member_method {
+    name: "max_pool2d"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
   }
   member_method {
     name: "max_pool3d"
     argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'None\'], "
   }
+  member_method {
+    name: "max_pool_v2"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "max_pool_with_argmax"
-    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'Targmax\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'None\'], "
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'Targmax\', \'name\', \'output_dtype\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "moments"
-    argspec: "args=[\'x\', \'axes\', \'shift\', \'name\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\'], "
+    argspec: "args=[\'x\', \'axes\', \'shift\', \'name\', \'keep_dims\', \'keepdims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "nce_loss"
@@ -258,7 +290,7 @@ tf_module {
   }
   member_method {
     name: "pool"
-    argspec: "args=[\'input\', \'window_shape\', \'pooling_type\', \'padding\', \'dilation_rate\', \'strides\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input\', \'window_shape\', \'pooling_type\', \'padding\', \'dilation_rate\', \'strides\', \'name\', \'data_format\', \'dilations\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "quantized_avg_pool"
@@ -306,7 +338,7 @@ tf_module {
   }
   member_method {
     name: "separable_conv2d"
-    argspec: "args=[\'input\', \'depthwise_filter\', \'pointwise_filter\', \'strides\', \'padding\', \'rate\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input\', \'depthwise_filter\', \'pointwise_filter\', \'strides\', \'padding\', \'rate\', \'name\', \'data_format\', \'dilations\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "sigmoid"
@@ -322,7 +354,7 @@ tf_module {
   }
   member_method {
     name: "softmax_cross_entropy_with_logits"
-    argspec: "args=[\'_sentinel\', \'labels\', \'logits\', \'dim\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'-1\', \'None\'], "
+    argspec: "args=[\'_sentinel\', \'labels\', \'logits\', \'dim\', \'name\', \'axis\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'-1\', \'None\', \'None\'], "
   }
   member_method {
     name: "softmax_cross_entropy_with_logits_v2"
@@ -338,7 +370,7 @@ tf_module {
   }
   member_method {
     name: "space_to_batch"
-    argspec: "args=[\'input\', \'paddings\', \'block_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'input\', \'paddings\', \'block_size\', \'name\', \'block_shape\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "space_to_depth"
@@ -362,7 +394,7 @@ tf_module {
   }
   member_method {
     name: "sufficient_statistics"
-    argspec: "args=[\'x\', \'axes\', \'shift\', \'keep_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'x\', \'axes\', \'shift\', \'keep_dims\', \'name\', \'keepdims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "tanh"
@@ -382,7 +414,7 @@ tf_module {
   }
   member_method {
     name: "weighted_moments"
-    argspec: "args=[\'x\', \'axes\', \'frequency_weights\', \'name\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
+    argspec: "args=[\'x\', \'axes\', \'frequency_weights\', \'name\', \'keep_dims\', \'keepdims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "with_space_to_batch"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
index f7f9978c063ceae89c7228b476f54694e25bc249..95136152775dafedc6e276e15d426de595a4d983 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -15,6 +15,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
index f9e898484b9813373a49e6f117578f822cdeb156..912f78fac15eaeaeb7c260ab3e4c57e059befac3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -15,6 +15,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
index 9e52a4252619ffc19b287fc1818fa6f772847335..58d004b3d5d10332065216b4a816febb673a4853 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
index 9836433d08cba809107f9bb5dbccf2e971865b8a..a7b63a7c2b43f3364431081e0c71e366e8bc0c8a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
@@ -1,10 +1,11 @@
 path: "tensorflow.nn.rnn_cell.DropoutWrapper"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.DropoutWrapper\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl._RNNCellWrapperV1\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +15,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
index 5fd9b329bdeb40b5a57fe68564977f61b5349ae5..3f17805af25df993332b594c05be14d4bcaa5b7b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -15,6 +15,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
index 76c8cff22b1e65e65d0ac3d6705541dc3f16f80c..055485f3e90c0e48bb4f8d0ffc88b8dbca11b635 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -15,6 +15,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
index f53567af52f7ed6baa78bcc75bfc0e38de02e548..23272f442279cf25b3833d19e94f646fe2144830 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
index d3b68e4f2976912ed65ba7916284c951fda03b05..a9f7e85b1488dc49a52c3ac4d5a7ed55bf605ab5 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
index 1f7840ab919baeeb0077904592ba8dcc1d4c91fb..ecf43616741383e3565186c6f9a29ac17b9a2c4d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
@@ -1,10 +1,11 @@
 path: "tensorflow.nn.rnn_cell.ResidualWrapper"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.ResidualWrapper\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl._RNNCellWrapperV1\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +15,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
index 584c74f99d896e45de06fa020413b8edd4440afb..103fdd0c1a5a1e4a2a601a17a5577de53a911d49 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
@@ -32,6 +32,10 @@ tf_module {
     name: "ConfigProto"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
+  member {
+    name: "CriticalSection"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "DType"
     mtype: "<type \'type\'>"
@@ -136,6 +140,10 @@ tf_module {
     name: "MetaGraphDef"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
+  member {
+    name: "Module"
+    mtype: "<class \'tensorflow.python.module.module.ModuleMetaclass\'>"
+  }
   member {
     name: "NameAttrList"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
@@ -172,6 +180,10 @@ tf_module {
     name: "QueueBase"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "RaggedTensor"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "RandomShuffleQueue"
     mtype: "<type \'type\'>"
@@ -288,6 +300,14 @@ tf_module {
     name: "app"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "audio"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "autograph"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "bfloat16"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
@@ -312,6 +332,10 @@ tf_module {
     name: "complex64"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
   }
+  member {
+    name: "config"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "constant_initializer"
     mtype: "<type \'type\'>"
@@ -460,6 +484,10 @@ tf_module {
     name: "name_scope"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "nest"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "newaxis"
     mtype: "<type \'NoneType\'>"
@@ -504,6 +532,10 @@ tf_module {
     name: "quantization"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "queue"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "quint16"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
@@ -512,6 +544,10 @@ tf_module {
     name: "quint8"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
   }
+  member {
+    name: "ragged"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "random"
     mtype: "<type \'module\'>"
@@ -524,6 +560,10 @@ tf_module {
     name: "random_uniform_initializer"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "raw_ops"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "resource"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
@@ -834,7 +874,7 @@ tf_module {
   }
   member_method {
     name: "batch_to_space"
-    argspec: "args=[\'input\', \'crops\', \'block_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'input\', \'crops\', \'block_size\', \'name\', \'block_shape\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "batch_to_space_nd"
@@ -912,6 +952,10 @@ tf_module {
     name: "colocate_with"
     argspec: "args=[\'op\', \'ignore_existing\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
+  member_method {
+    name: "combined_non_max_suppression"
+    argspec: "args=[\'boxes\', \'scores\', \'max_output_size_per_class\', \'max_total_size\', \'iou_threshold\', \'score_threshold\', \'pad_per_class\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
   member_method {
     name: "complex"
     argspec: "args=[\'real\', \'imag\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -946,7 +990,7 @@ tf_module {
   }
   member_method {
     name: "convert_to_tensor"
-    argspec: "args=[\'value\', \'dtype\', \'name\', \'preferred_dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'value\', \'dtype\', \'name\', \'preferred_dtype\', \'dtype_hint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "convert_to_tensor_or_indexed_slices"
@@ -966,7 +1010,7 @@ tf_module {
   }
   member_method {
     name: "count_nonzero"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'dtype\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \"<dtype: \'int64\'>\", \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'dtype\', \'name\', \'reduction_indices\', \'keep_dims\', \'input\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \"<dtype: \'int64\'>\", \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "count_up_to"
@@ -1060,6 +1104,10 @@ tf_module {
     name: "disable_resource_variables"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "disable_v2_batch_normalization"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "disable_v2_behavior"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
@@ -1104,6 +1152,10 @@ tf_module {
     name: "enable_resource_variables"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "enable_v2_batch_normalization"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "enable_v2_behavior"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
@@ -1150,7 +1202,7 @@ tf_module {
   }
   member_method {
     name: "extract_image_patches"
-    argspec: "args=[\'images\', \'ksizes\', \'strides\', \'rates\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'images\', \'ksizes\', \'strides\', \'rates\', \'padding\', \'name\', \'sizes\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "extract_volume_patches"
@@ -1228,9 +1280,13 @@ tf_module {
     name: "foldr"
     argspec: "args=[\'fn\', \'elems\', \'initializer\', \'parallel_iterations\', \'back_prop\', \'swap_memory\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'True\', \'False\', \'None\'], "
   }
+  member_method {
+    name: "function"
+    argspec: "args=[\'func\', \'input_signature\', \'autograph\', \'experimental_autograph_options\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\', \'None\'], "
+  }
   member_method {
     name: "gather"
-    argspec: "args=[\'params\', \'indices\', \'validate_indices\', \'name\', \'axis\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'0\'], "
+    argspec: "args=[\'params\', \'indices\', \'validate_indices\', \'name\', \'axis\', \'batch_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'0\'], "
   }
   member_method {
     name: "gather_nd"
@@ -1272,6 +1328,10 @@ tf_module {
     name: "get_session_tensor"
     argspec: "args=[\'handle\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "get_static_value"
+    argspec: "args=[\'tensor\', \'partial\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
   member_method {
     name: "get_variable"
     argspec: "args=[\'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'collections\', \'caching_device\', \'partitioner\', \'validate_shape\', \'use_resource\', \'custom_getter\', \'constraint\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
@@ -1408,6 +1468,10 @@ tf_module {
     name: "is_strictly_increasing"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "is_tensor"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "is_variable_initialized"
     argspec: "args=[\'variable\'], varargs=None, keywords=None, defaults=None"
@@ -1754,7 +1818,7 @@ tf_module {
   }
   member_method {
     name: "reduce_join"
-    argspec: "args=[\'inputs\', \'axis\', \'keep_dims\', \'separator\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'\', \'None\', \'None\'], "
+    argspec: "args=[\'inputs\', \'axis\', \'keep_dims\', \'separator\', \'name\', \'reduction_indices\', \'keepdims\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "reduce_logsumexp"
@@ -1982,7 +2046,7 @@ tf_module {
   }
   member_method {
     name: "space_to_batch"
-    argspec: "args=[\'input\', \'paddings\', \'block_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'input\', \'paddings\', \'block_size\', \'name\', \'block_shape\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "space_to_batch_nd"
@@ -1998,7 +2062,7 @@ tf_module {
   }
   member_method {
     name: "sparse_concat"
-    argspec: "args=[\'axis\', \'sp_inputs\', \'name\', \'expand_nonconcat_dim\', \'concat_dim\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'axis\', \'sp_inputs\', \'name\', \'expand_nonconcat_dim\', \'concat_dim\', \'expand_nonconcat_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'None\'], "
   }
   member_method {
     name: "sparse_fill_empty_rows"
@@ -2150,7 +2214,7 @@ tf_module {
   }
   member_method {
     name: "string_to_hash_bucket"
-    argspec: "args=[\'string_tensor\', \'num_buckets\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'string_tensor\', \'num_buckets\', \'name\', \'input\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "string_to_hash_bucket_fast"
@@ -2162,7 +2226,7 @@ tf_module {
   }
   member_method {
     name: "string_to_number"
-    argspec: "args=[\'string_tensor\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
+    argspec: "args=[\'string_tensor\', \'out_type\', \'name\', \'input\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'float32\'>\", \'None\', \'None\'], "
   }
   member_method {
     name: "substr"
@@ -2330,7 +2394,7 @@ tf_module {
   }
   member_method {
     name: "verify_tensor_all_finite"
-    argspec: "args=[\'t\', \'msg\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'t\', \'msg\', \'name\', \'x\', \'message\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "where"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-f-i-f-o-queue.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.queue.-f-i-f-o-queue.pbtxt
similarity index 98%
rename from tensorflow/tools/api/golden/v2/tensorflow.-f-i-f-o-queue.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.queue.-f-i-f-o-queue.pbtxt
index a095616c00cfe8fb64413e2078ae1589a423d2f4..724ab5fe8283de44b20b059042f8d6744b11da19 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-f-i-f-o-queue.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.queue.-f-i-f-o-queue.pbtxt
@@ -1,4 +1,4 @@
-path: "tensorflow.FIFOQueue"
+path: "tensorflow.queue.FIFOQueue"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.data_flow_ops.FIFOQueue\'>"
   is_instance: "<class \'tensorflow.python.ops.data_flow_ops.QueueBase\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.io.-padding-f-i-f-o-queue.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.queue.-padding-f-i-f-o-queue.pbtxt
similarity index 98%
rename from tensorflow/tools/api/golden/v2/tensorflow.io.-padding-f-i-f-o-queue.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.queue.-padding-f-i-f-o-queue.pbtxt
index 85306fdcac519820fce8d254d9aaaf504b830b7a..9ef0a4d9eb6bbfb69fddf3fe696e3f60ac3ef67b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.io.-padding-f-i-f-o-queue.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.queue.-padding-f-i-f-o-queue.pbtxt
@@ -1,4 +1,4 @@
-path: "tensorflow.io.PaddingFIFOQueue"
+path: "tensorflow.queue.PaddingFIFOQueue"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.data_flow_ops.PaddingFIFOQueue\'>"
   is_instance: "<class \'tensorflow.python.ops.data_flow_ops.QueueBase\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.io.-priority-queue.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.queue.-priority-queue.pbtxt
similarity index 98%
rename from tensorflow/tools/api/golden/v2/tensorflow.io.-priority-queue.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.queue.-priority-queue.pbtxt
index 02d8037b34a57b5d1c1309b7cbcfd290a6091e04..bb66beb13af18501912fda85b9c3dc67cdf21683 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.io.-priority-queue.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.queue.-priority-queue.pbtxt
@@ -1,4 +1,4 @@
-path: "tensorflow.io.PriorityQueue"
+path: "tensorflow.queue.PriorityQueue"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.data_flow_ops.PriorityQueue\'>"
   is_instance: "<class \'tensorflow.python.ops.data_flow_ops.QueueBase\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.io.-queue-base.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.queue.-queue-base.pbtxt
similarity index 98%
rename from tensorflow/tools/api/golden/v2/tensorflow.io.-queue-base.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.queue.-queue-base.pbtxt
index a30481a0ea8f1cb71f5695be2099f5e5ae3f644c..8faaad22af6e0f920e26a44e1ebf294fc4b109c4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.io.-queue-base.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.queue.-queue-base.pbtxt
@@ -1,4 +1,4 @@
-path: "tensorflow.io.QueueBase"
+path: "tensorflow.queue.QueueBase"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.data_flow_ops.QueueBase\'>"
   is_instance: "<type \'object\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.io.-random-shuffle-queue.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.queue.-random-shuffle-queue.pbtxt
similarity index 97%
rename from tensorflow/tools/api/golden/v2/tensorflow.io.-random-shuffle-queue.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.queue.-random-shuffle-queue.pbtxt
index 82cbf9884f77ed70d9f3191875daeb8b6f9f72ec..31cd503b13040b119d4028f813c94689f8e2ebb3 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.io.-random-shuffle-queue.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.queue.-random-shuffle-queue.pbtxt
@@ -1,4 +1,4 @@
-path: "tensorflow.io.RandomShuffleQueue"
+path: "tensorflow.queue.RandomShuffleQueue"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.data_flow_ops.RandomShuffleQueue\'>"
   is_instance: "<class \'tensorflow.python.ops.data_flow_ops.QueueBase\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.queue.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.queue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c16e95e2116b703434ed91106eb29e4beb5668f2
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.queue.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.queue"
+tf_module {
+  member {
+    name: "FIFOQueue"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "PaddingFIFOQueue"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "PriorityQueue"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "QueueBase"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "RandomShuffleQueue"
+    mtype: "<type \'type\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.ragged.-ragged-tensor-value.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.ragged.-ragged-tensor-value.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..96c895e0a49364b37d1578ff1a1e9214a10189df
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.ragged.-ragged-tensor-value.pbtxt
@@ -0,0 +1,41 @@
+path: "tensorflow.ragged.RaggedTensorValue"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.ragged.ragged_tensor_value.RaggedTensorValue\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "flat_values"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "nested_row_splits"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "ragged_rank"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "row_splits"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "values"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'values\', \'row_splits\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "to_list"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.ragged.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.ragged.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..22ca7e931f3589f11b7fc5c655d633c86716b4d8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.ragged.pbtxt
@@ -0,0 +1,31 @@
+path: "tensorflow.ragged"
+tf_module {
+  member {
+    name: "RaggedTensorValue"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "constant"
+    argspec: "args=[\'pylist\', \'dtype\', \'ragged_rank\', \'inner_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "constant_value"
+    argspec: "args=[\'pylist\', \'dtype\', \'ragged_rank\', \'inner_shape\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "map_flat_values"
+    argspec: "args=[\'op\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "range"
+    argspec: "args=[\'starts\', \'limits\', \'deltas\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "row_splits_to_segment_ids"
+    argspec: "args=[\'splits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "segment_ids_to_row_splits"
+    argspec: "args=[\'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..080afef7645fb4cf7a700b2da062311fd14871f7
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt
@@ -0,0 +1,4067 @@
+path: "tensorflow.raw_ops"
+tf_module {
+  member_method {
+    name: "Abort"
+    argspec: "args=[\'error_msg\', \'exit_without_error\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Abs"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AccumulateNV2"
+    argspec: "args=[\'inputs\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AccumulatorApplyGradient"
+    argspec: "args=[\'handle\', \'local_step\', \'gradient\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AccumulatorNumAccumulated"
+    argspec: "args=[\'handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AccumulatorSetGlobalStep"
+    argspec: "args=[\'handle\', \'new_global_step\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AccumulatorTakeGradient"
+    argspec: "args=[\'handle\', \'num_required\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Acos"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Acosh"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Add"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AddManySparseToTensorsMap"
+    argspec: "args=[\'sparse_indices\', \'sparse_values\', \'sparse_shape\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AddN"
+    argspec: "args=[\'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AddSparseToTensorsMap"
+    argspec: "args=[\'sparse_indices\', \'sparse_values\', \'sparse_shape\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AddV2"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AdjustContrast"
+    argspec: "args=[\'images\', \'contrast_factor\', \'min_value\', \'max_value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AdjustContrastv2"
+    argspec: "args=[\'images\', \'contrast_factor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AdjustHue"
+    argspec: "args=[\'images\', \'delta\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AdjustSaturation"
+    argspec: "args=[\'images\', \'scale\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "All"
+    argspec: "args=[\'input\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AllCandidateSampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Angle"
+    argspec: "args=[\'input\', \'Tout\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AnonymousIterator"
+    argspec: "args=[\'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Any"
+    argspec: "args=[\'input\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ApplyAdaMax"
+    argspec: "args=[\'var\', \'m\', \'v\', \'beta1_power\', \'lr\', \'beta1\', \'beta2\', \'epsilon\', \'grad\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ApplyAdadelta"
+    argspec: "args=[\'var\', \'accum\', \'accum_update\', \'lr\', \'rho\', \'epsilon\', \'grad\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ApplyAdagrad"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'grad\', \'use_locking\', \'update_slots\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ApplyAdagradDA"
+    argspec: "args=[\'var\', \'gradient_accumulator\', \'gradient_squared_accumulator\', \'grad\', \'lr\', \'l1\', \'l2\', \'global_step\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ApplyAdam"
+    argspec: "args=[\'var\', \'m\', \'v\', \'beta1_power\', \'beta2_power\', \'lr\', \'beta1\', \'beta2\', \'epsilon\', \'grad\', \'use_locking\', \'use_nesterov\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ApplyAddSign"
+    argspec: "args=[\'var\', \'m\', \'lr\', \'alpha\', \'sign_decay\', \'beta\', \'grad\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ApplyCenteredRMSProp"
+    argspec: "args=[\'var\', \'mg\', \'ms\', \'mom\', \'lr\', \'rho\', \'momentum\', \'epsilon\', \'grad\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ApplyFtrl"
+    argspec: "args=[\'var\', \'accum\', \'linear\', \'grad\', \'lr\', \'l1\', \'l2\', \'lr_power\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ApplyFtrlV2"
+    argspec: "args=[\'var\', \'accum\', \'linear\', \'grad\', \'lr\', \'l1\', \'l2\', \'l2_shrinkage\', \'lr_power\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ApplyGradientDescent"
+    argspec: "args=[\'var\', \'alpha\', \'delta\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ApplyMomentum"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'grad\', \'momentum\', \'use_locking\', \'use_nesterov\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ApplyPowerSign"
+    argspec: "args=[\'var\', \'m\', \'lr\', \'logbase\', \'sign_decay\', \'beta\', \'grad\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ApplyProximalAdagrad"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'l1\', \'l2\', \'grad\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ApplyProximalGradientDescent"
+    argspec: "args=[\'var\', \'alpha\', \'l1\', \'l2\', \'delta\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ApplyRMSProp"
+    argspec: "args=[\'var\', \'ms\', \'mom\', \'lr\', \'rho\', \'momentum\', \'epsilon\', \'grad\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ApproximateEqual"
+    argspec: "args=[\'x\', \'y\', \'tolerance\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ArgMax"
+    argspec: "args=[\'input\', \'dimension\', \'output_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ArgMin"
+    argspec: "args=[\'input\', \'dimension\', \'output_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AsString"
+    argspec: "args=[\'input\', \'precision\', \'scientific\', \'shortest\', \'width\', \'fill\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Asin"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Asinh"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Assert"
+    argspec: "args=[\'condition\', \'data\', \'summarize\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Assign"
+    argspec: "args=[\'ref\', \'value\', \'validate_shape\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AssignAdd"
+    argspec: "args=[\'ref\', \'value\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AssignAddVariableOp"
+    argspec: "args=[\'resource\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AssignSub"
+    argspec: "args=[\'ref\', \'value\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AssignSubVariableOp"
+    argspec: "args=[\'resource\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AssignVariableOp"
+    argspec: "args=[\'resource\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Atan"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Atan2"
+    argspec: "args=[\'y\', \'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Atanh"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AudioSpectrogram"
+    argspec: "args=[\'input\', \'window_size\', \'stride\', \'magnitude_squared\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AudioSummary"
+    argspec: "args=[\'tag\', \'tensor\', \'sample_rate\', \'max_outputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AudioSummaryV2"
+    argspec: "args=[\'tag\', \'tensor\', \'sample_rate\', \'max_outputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AvgPool"
+    argspec: "args=[\'value\', \'ksize\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AvgPool3D"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AvgPool3DGrad"
+    argspec: "args=[\'orig_input_shape\', \'grad\', \'ksize\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AvgPoolGrad"
+    argspec: "args=[\'orig_input_shape\', \'grad\', \'ksize\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Barrier"
+    argspec: "args=[\'component_types\', \'shapes\', \'capacity\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BarrierClose"
+    argspec: "args=[\'handle\', \'cancel_pending_enqueues\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BarrierIncompleteSize"
+    argspec: "args=[\'handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BarrierInsertMany"
+    argspec: "args=[\'handle\', \'keys\', \'values\', \'component_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BarrierReadySize"
+    argspec: "args=[\'handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BarrierTakeMany"
+    argspec: "args=[\'handle\', \'num_elements\', \'component_types\', \'allow_small_batch\', \'wait_for_incomplete\', \'timeout_ms\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchCholesky"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchCholeskyGrad"
+    argspec: "args=[\'l\', \'grad\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchDataset"
+    argspec: "args=[\'input_dataset\', \'batch_size\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchDatasetV2"
+    argspec: "args=[\'input_dataset\', \'batch_size\', \'drop_remainder\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchFFT"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchFFT2D"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchFFT3D"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchIFFT"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchIFFT2D"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchIFFT3D"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchMatMul"
+    argspec: "args=[\'x\', \'y\', \'adj_x\', \'adj_y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchMatrixBandPart"
+    argspec: "args=[\'input\', \'num_lower\', \'num_upper\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchMatrixDeterminant"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchMatrixDiag"
+    argspec: "args=[\'diagonal\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchMatrixDiagPart"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchMatrixInverse"
+    argspec: "args=[\'input\', \'adjoint\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchMatrixSetDiag"
+    argspec: "args=[\'input\', \'diagonal\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchMatrixSolve"
+    argspec: "args=[\'matrix\', \'rhs\', \'adjoint\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchMatrixSolveLs"
+    argspec: "args=[\'matrix\', \'rhs\', \'l2_regularizer\', \'fast\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchMatrixTriangularSolve"
+    argspec: "args=[\'matrix\', \'rhs\', \'lower\', \'adjoint\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchNormWithGlobalNormalization"
+    argspec: "args=[\'t\', \'m\', \'v\', \'beta\', \'gamma\', \'variance_epsilon\', \'scale_after_normalization\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchNormWithGlobalNormalizationGrad"
+    argspec: "args=[\'t\', \'m\', \'v\', \'gamma\', \'backprop\', \'variance_epsilon\', \'scale_after_normalization\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchSelfAdjointEig"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchSelfAdjointEigV2"
+    argspec: "args=[\'input\', \'compute_v\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchSvd"
+    argspec: "args=[\'input\', \'compute_uv\', \'full_matrices\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchToSpace"
+    argspec: "args=[\'input\', \'crops\', \'block_size\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchToSpaceND"
+    argspec: "args=[\'input\', \'block_shape\', \'crops\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BesselI0e"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BesselI1e"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Betainc"
+    argspec: "args=[\'a\', \'b\', \'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BiasAdd"
+    argspec: "args=[\'value\', \'bias\', \'data_format\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BiasAddGrad"
+    argspec: "args=[\'out_backprop\', \'data_format\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BiasAddV1"
+    argspec: "args=[\'value\', \'bias\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Bincount"
+    argspec: "args=[\'arr\', \'size\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Bitcast"
+    argspec: "args=[\'input\', \'type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BitwiseAnd"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BitwiseOr"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BitwiseXor"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BoostedTreesBucketize"
+    argspec: "args=[\'float_values\', \'bucket_boundaries\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BoostedTreesCalculateBestGainsPerFeature"
+    argspec: "args=[\'node_id_range\', \'stats_summary_list\', \'l1\', \'l2\', \'tree_complexity\', \'min_node_weight\', \'max_splits\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BoostedTreesCenterBias"
+    argspec: "args=[\'tree_ensemble_handle\', \'mean_gradients\', \'mean_hessians\', \'l1\', \'l2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BoostedTreesCreateEnsemble"
+    argspec: "args=[\'tree_ensemble_handle\', \'stamp_token\', \'tree_ensemble_serialized\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BoostedTreesCreateQuantileStreamResource"
+    argspec: "args=[\'quantile_stream_resource_handle\', \'epsilon\', \'num_streams\', \'max_elements\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BoostedTreesDeserializeEnsemble"
+    argspec: "args=[\'tree_ensemble_handle\', \'stamp_token\', \'tree_ensemble_serialized\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BoostedTreesEnsembleResourceHandleOp"
+    argspec: "args=[\'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BoostedTreesExampleDebugOutputs"
+    argspec: "args=[\'tree_ensemble_handle\', \'bucketized_features\', \'logits_dimension\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BoostedTreesGetEnsembleStates"
+    argspec: "args=[\'tree_ensemble_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BoostedTreesMakeQuantileSummaries"
+    argspec: "args=[\'float_values\', \'example_weights\', \'epsilon\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BoostedTreesMakeStatsSummary"
+    argspec: "args=[\'node_ids\', \'gradients\', \'hessians\', \'bucketized_features_list\', \'max_splits\', \'num_buckets\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BoostedTreesPredict"
+    argspec: "args=[\'tree_ensemble_handle\', \'bucketized_features\', \'logits_dimension\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BoostedTreesQuantileStreamResourceAddSummaries"
+    argspec: "args=[\'quantile_stream_resource_handle\', \'summaries\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BoostedTreesQuantileStreamResourceDeserialize"
+    argspec: "args=[\'quantile_stream_resource_handle\', \'bucket_boundaries\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BoostedTreesQuantileStreamResourceFlush"
+    argspec: "args=[\'quantile_stream_resource_handle\', \'num_buckets\', \'generate_quantiles\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BoostedTreesQuantileStreamResourceGetBucketBoundaries"
+    argspec: "args=[\'quantile_stream_resource_handle\', \'num_features\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BoostedTreesQuantileStreamResourceHandleOp"
+    argspec: "args=[\'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BoostedTreesSerializeEnsemble"
+    argspec: "args=[\'tree_ensemble_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BoostedTreesTrainingPredict"
+    argspec: "args=[\'tree_ensemble_handle\', \'cached_tree_ids\', \'cached_node_ids\', \'bucketized_features\', \'logits_dimension\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BoostedTreesUpdateEnsemble"
+    argspec: "args=[\'tree_ensemble_handle\', \'feature_ids\', \'node_ids\', \'gains\', \'thresholds\', \'left_node_contribs\', \'right_node_contribs\', \'max_depth\', \'learning_rate\', \'pruning_mode\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BroadcastArgs"
+    argspec: "args=[\'s0\', \'s1\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BroadcastGradientArgs"
+    argspec: "args=[\'s0\', \'s1\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BroadcastTo"
+    argspec: "args=[\'input\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Bucketize"
+    argspec: "args=[\'input\', \'boundaries\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CTCBeamSearchDecoder"
+    argspec: "args=[\'inputs\', \'sequence_length\', \'beam_width\', \'top_paths\', \'merge_repeated\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CTCGreedyDecoder"
+    argspec: "args=[\'inputs\', \'sequence_length\', \'merge_repeated\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CTCLoss"
+    argspec: "args=[\'inputs\', \'labels_indices\', \'labels_values\', \'sequence_length\', \'preprocess_collapse_repeated\', \'ctc_merge_repeated\', \'ignore_longer_outputs_than_inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CacheDataset"
+    argspec: "args=[\'input_dataset\', \'filename\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Case"
+    argspec: "args=[\'branch_index\', \'input\', \'Tout\', \'branches\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Cast"
+    argspec: "args=[\'x\', \'DstT\', \'Truncate\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Ceil"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CheckNumerics"
+    argspec: "args=[\'tensor\', \'message\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Cholesky"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CholeskyGrad"
+    argspec: "args=[\'l\', \'grad\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ClipByValue"
+    argspec: "args=[\'t\', \'clip_value_min\', \'clip_value_max\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CloseSummaryWriter"
+    argspec: "args=[\'writer\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CollectiveBcastRecv"
+    argspec: "args=[\'T\', \'group_size\', \'group_key\', \'instance_key\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CollectiveBcastSend"
+    argspec: "args=[\'input\', \'group_size\', \'group_key\', \'instance_key\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CollectiveGather"
+    argspec: "args=[\'input\', \'group_size\', \'group_key\', \'instance_key\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CollectiveReduce"
+    argspec: "args=[\'input\', \'group_size\', \'group_key\', \'instance_key\', \'merge_op\', \'final_op\', \'subdiv_offsets\', \'wait_for\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CombinedNonMaxSuppression"
+    argspec: "args=[\'boxes\', \'scores\', \'max_output_size_per_class\', \'max_total_size\', \'iou_threshold\', \'score_threshold\', \'pad_per_class\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CompareAndBitpack"
+    argspec: "args=[\'input\', \'threshold\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Complex"
+    argspec: "args=[\'real\', \'imag\', \'Tout\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ComplexAbs"
+    argspec: "args=[\'x\', \'Tout\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ComputeAccidentalHits"
+    argspec: "args=[\'true_classes\', \'sampled_candidates\', \'num_true\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Concat"
+    argspec: "args=[\'concat_dim\', \'values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ConcatOffset"
+    argspec: "args=[\'concat_dim\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ConcatV2"
+    argspec: "args=[\'values\', \'axis\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ConcatenateDataset"
+    argspec: "args=[\'input_dataset\', \'another_dataset\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ConditionalAccumulator"
+    argspec: "args=[\'dtype\', \'shape\', \'container\', \'shared_name\', \'reduction_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Conj"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ConjugateTranspose"
+    argspec: "args=[\'x\', \'perm\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Const"
+    argspec: "args=[\'value\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ConsumeMutexLock"
+    argspec: "args=[\'mutex_lock\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ControlTrigger"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Conv2D"
+    argspec: "args=[\'input\', \'filter\', \'strides\', \'use_cudnn_on_gpu\', \'padding\', \'explicit_paddings\', \'data_format\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Conv2DBackpropFilter"
+    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'use_cudnn_on_gpu\', \'padding\', \'explicit_paddings\', \'data_format\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Conv2DBackpropInput"
+    argspec: "args=[\'input_sizes\', \'filter\', \'out_backprop\', \'strides\', \'use_cudnn_on_gpu\', \'padding\', \'explicit_paddings\', \'data_format\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Conv3D"
+    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Conv3DBackpropFilter"
+    argspec: "args=[\'input\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Conv3DBackpropFilterV2"
+    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Conv3DBackpropInput"
+    argspec: "args=[\'input\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Conv3DBackpropInputV2"
+    argspec: "args=[\'input_sizes\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Cos"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Cosh"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CountUpTo"
+    argspec: "args=[\'ref\', \'limit\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CreateSummaryDbWriter"
+    argspec: "args=[\'writer\', \'db_uri\', \'experiment_name\', \'run_name\', \'user_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CreateSummaryFileWriter"
+    argspec: "args=[\'writer\', \'logdir\', \'max_queue\', \'flush_millis\', \'filename_suffix\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CropAndResize"
+    argspec: "args=[\'image\', \'boxes\', \'box_ind\', \'crop_size\', \'method\', \'extrapolation_value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CropAndResizeGradBoxes"
+    argspec: "args=[\'grads\', \'image\', \'boxes\', \'box_ind\', \'method\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CropAndResizeGradImage"
+    argspec: "args=[\'grads\', \'boxes\', \'box_ind\', \'image_size\', \'T\', \'method\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Cross"
+    argspec: "args=[\'a\', \'b\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CudnnRNN"
+    argspec: "args=[\'input\', \'input_h\', \'input_c\', \'params\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'is_training\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CudnnRNNBackprop"
+    argspec: "args=[\'input\', \'input_h\', \'input_c\', \'params\', \'output\', \'output_h\', \'output_c\', \'output_backprop\', \'output_h_backprop\', \'output_c_backprop\', \'reserve_space\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CudnnRNNBackpropV2"
+    argspec: "args=[\'input\', \'input_h\', \'input_c\', \'params\', \'output\', \'output_h\', \'output_c\', \'output_backprop\', \'output_h_backprop\', \'output_c_backprop\', \'reserve_space\', \'host_reserved\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CudnnRNNBackpropV3"
+    argspec: "args=[\'input\', \'input_h\', \'input_c\', \'params\', \'sequence_lengths\', \'output\', \'output_h\', \'output_c\', \'output_backprop\', \'output_h_backprop\', \'output_c_backprop\', \'reserve_space\', \'host_reserved\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CudnnRNNCanonicalToParams"
+    argspec: "args=[\'num_layers\', \'num_units\', \'input_size\', \'weights\', \'biases\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CudnnRNNParamsSize"
+    argspec: "args=[\'num_layers\', \'num_units\', \'input_size\', \'T\', \'S\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CudnnRNNParamsToCanonical"
+    argspec: "args=[\'num_layers\', \'num_units\', \'input_size\', \'params\', \'num_params\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CudnnRNNV2"
+    argspec: "args=[\'input\', \'input_h\', \'input_c\', \'params\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'is_training\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CudnnRNNV3"
+    argspec: "args=[\'input\', \'input_h\', \'input_c\', \'params\', \'sequence_lengths\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'is_training\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Cumprod"
+    argspec: "args=[\'x\', \'axis\', \'exclusive\', \'reverse\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Cumsum"
+    argspec: "args=[\'x\', \'axis\', \'exclusive\', \'reverse\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DataFormatDimMap"
+    argspec: "args=[\'x\', \'src_format\', \'dst_format\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DataFormatVecPermute"
+    argspec: "args=[\'x\', \'src_format\', \'dst_format\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DatasetToGraph"
+    argspec: "args=[\'input_dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DatasetToSingleElement"
+    argspec: "args=[\'dataset\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DebugGradientIdentity"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DebugGradientRefIdentity"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DecodeAndCropJpeg"
+    argspec: "args=[\'contents\', \'crop_window\', \'channels\', \'ratio\', \'fancy_upscaling\', \'try_recover_truncated\', \'acceptable_fraction\', \'dct_method\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DecodeBase64"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DecodeBmp"
+    argspec: "args=[\'contents\', \'channels\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DecodeCSV"
+    argspec: "args=[\'records\', \'record_defaults\', \'field_delim\', \'use_quote_delim\', \'na_value\', \'select_cols\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DecodeCompressed"
+    argspec: "args=[\'bytes\', \'compression_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DecodeGif"
+    argspec: "args=[\'contents\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DecodeJSONExample"
+    argspec: "args=[\'json_examples\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DecodeJpeg"
+    argspec: "args=[\'contents\', \'channels\', \'ratio\', \'fancy_upscaling\', \'try_recover_truncated\', \'acceptable_fraction\', \'dct_method\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DecodePng"
+    argspec: "args=[\'contents\', \'channels\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DecodeRaw"
+    argspec: "args=[\'bytes\', \'out_type\', \'little_endian\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DecodeWav"
+    argspec: "args=[\'contents\', \'desired_channels\', \'desired_samples\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DeepCopy"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DeleteSessionTensor"
+    argspec: "args=[\'handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DenseToDenseSetOperation"
+    argspec: "args=[\'set1\', \'set2\', \'set_operation\', \'validate_indices\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DenseToSparseSetOperation"
+    argspec: "args=[\'set1\', \'set2_indices\', \'set2_values\', \'set2_shape\', \'set_operation\', \'validate_indices\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DepthToSpace"
+    argspec: "args=[\'input\', \'block_size\', \'data_format\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DepthwiseConv2dNative"
+    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DepthwiseConv2dNativeBackpropFilter"
+    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DepthwiseConv2dNativeBackpropInput"
+    argspec: "args=[\'input_sizes\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Dequantize"
+    argspec: "args=[\'input\', \'min_range\', \'max_range\', \'mode\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DeserializeIterator"
+    argspec: "args=[\'resource_handle\', \'serialized\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DeserializeManySparse"
+    argspec: "args=[\'serialized_sparse\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DeserializeSparse"
+    argspec: "args=[\'serialized_sparse\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DestroyResourceOp"
+    argspec: "args=[\'resource\', \'ignore_lookup_error\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DestroyTemporaryVariable"
+    argspec: "args=[\'ref\', \'var_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Diag"
+    argspec: "args=[\'diagonal\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DiagPart"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Digamma"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Dilation2D"
+    argspec: "args=[\'input\', \'filter\', \'strides\', \'rates\', \'padding\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Dilation2DBackpropFilter"
+    argspec: "args=[\'input\', \'filter\', \'out_backprop\', \'strides\', \'rates\', \'padding\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Dilation2DBackpropInput"
+    argspec: "args=[\'input\', \'filter\', \'out_backprop\', \'strides\', \'rates\', \'padding\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Div"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DivNoNan"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DrawBoundingBoxes"
+    argspec: "args=[\'images\', \'boxes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DynamicPartition"
+    argspec: "args=[\'data\', \'partitions\', \'num_partitions\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DynamicStitch"
+    argspec: "args=[\'indices\', \'data\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "EagerPyFunc"
+    argspec: "args=[\'input\', \'token\', \'Tout\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "EditDistance"
+    argspec: "args=[\'hypothesis_indices\', \'hypothesis_values\', \'hypothesis_shape\', \'truth_indices\', \'truth_values\', \'truth_shape\', \'normalize\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Elu"
+    argspec: "args=[\'features\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "EluGrad"
+    argspec: "args=[\'gradients\', \'outputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Empty"
+    argspec: "args=[\'shape\', \'dtype\', \'init\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "EmptyTensorList"
+    argspec: "args=[\'element_shape\', \'max_num_elements\', \'element_dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "EncodeBase64"
+    argspec: "args=[\'input\', \'pad\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "EncodeJpeg"
+    argspec: "args=[\'image\', \'format\', \'quality\', \'progressive\', \'optimize_size\', \'chroma_downsampling\', \'density_unit\', \'x_density\', \'y_density\', \'xmp_metadata\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "EncodePng"
+    argspec: "args=[\'image\', \'compression\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "EncodeWav"
+    argspec: "args=[\'audio\', \'sample_rate\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "EnsureShape"
+    argspec: "args=[\'input\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Enter"
+    argspec: "args=[\'data\', \'frame_name\', \'is_constant\', \'parallel_iterations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Equal"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Erf"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Erfc"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "EuclideanNorm"
+    argspec: "args=[\'input\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Exit"
+    argspec: "args=[\'data\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Exp"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExpandDims"
+    argspec: "args=[\'input\', \'dim\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalAssertNextDataset"
+    argspec: "args=[\'input_dataset\', \'transformations\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalBytesProducedStatsDataset"
+    argspec: "args=[\'input_dataset\', \'tag\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalCSVDataset"
+    argspec: "args=[\'filenames\', \'compression_type\', \'buffer_size\', \'header\', \'field_delim\', \'use_quote_delim\', \'na_value\', \'select_cols\', \'record_defaults\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalChooseFastestDataset"
+    argspec: "args=[\'input_datasets\', \'num_experiments\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalDatasetCardinality"
+    argspec: "args=[\'input_dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalDatasetToTFRecord"
+    argspec: "args=[\'input_dataset\', \'filename\', \'compression_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalDenseToSparseBatchDataset"
+    argspec: "args=[\'input_dataset\', \'batch_size\', \'row_shape\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalDirectedInterleaveDataset"
+    argspec: "args=[\'selector_input_dataset\', \'data_input_datasets\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalGroupByReducerDataset"
+    argspec: "args=[\'input_dataset\', \'key_func_other_arguments\', \'init_func_other_arguments\', \'reduce_func_other_arguments\', \'finalize_func_other_arguments\', \'key_func\', \'init_func\', \'reduce_func\', \'finalize_func\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalGroupByWindowDataset"
+    argspec: "args=[\'input_dataset\', \'key_func_other_arguments\', \'reduce_func_other_arguments\', \'window_size_func_other_arguments\', \'key_func\', \'reduce_func\', \'window_size_func\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalIdentityIndexedDataset"
+    argspec: "args=[\'size\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalIgnoreErrorsDataset"
+    argspec: "args=[\'input_dataset\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalIndexedDatasetGet"
+    argspec: "args=[\'materialized\', \'index\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalIndexedDatasetMaterialize"
+    argspec: "args=[\'dataset\', \'materialized\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalIteratorGetDevice"
+    argspec: "args=[\'resource\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalLMDBDataset"
+    argspec: "args=[\'filenames\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalLatencyStatsDataset"
+    argspec: "args=[\'input_dataset\', \'tag\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalMapAndBatchDataset"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'batch_size\', \'num_parallel_calls\', \'drop_remainder\', \'f\', \'output_types\', \'output_shapes\', \'preserve_cardinality\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalMapDataset"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'f\', \'output_types\', \'output_shapes\', \'use_inter_op_parallelism\', \'preserve_cardinality\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalMatchingFilesDataset"
+    argspec: "args=[\'patterns\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalMaterializedIndexDatasetHandle"
+    argspec: "args=[\'container\', \'shared_name\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalMaxIntraOpParallelismDataset"
+    argspec: "args=[\'input_dataset\', \'max_intra_op_parallelism\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalNonSerializableDataset"
+    argspec: "args=[\'input_dataset\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalNumaMapAndBatchDataset"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'batch_size\', \'num_parallel_calls\', \'drop_remainder\', \'f\', \'output_types\', \'output_shapes\', \'preserve_cardinality\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalParallelInterleaveDataset"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'cycle_length\', \'block_length\', \'sloppy\', \'buffer_output_elements\', \'prefetch_input_elements\', \'f\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalParseExampleDataset"
+    argspec: "args=[\'input_dataset\', \'num_parallel_calls\', \'dense_defaults\', \'sparse_keys\', \'dense_keys\', \'sparse_types\', \'dense_shapes\', \'output_types\', \'output_shapes\', \'sloppy\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalPrivateThreadPoolDataset"
+    argspec: "args=[\'input_dataset\', \'num_threads\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalRandomDataset"
+    argspec: "args=[\'seed\', \'seed2\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalRebatchDataset"
+    argspec: "args=[\'input_dataset\', \'num_workers\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalScanDataset"
+    argspec: "args=[\'input_dataset\', \'initial_state\', \'other_arguments\', \'f\', \'output_types\', \'output_shapes\', \'preserve_cardinality\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalSetStatsAggregatorDataset"
+    argspec: "args=[\'input_dataset\', \'stats_aggregator\', \'tag\', \'counter_prefix\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalSleepDataset"
+    argspec: "args=[\'input_dataset\', \'sleep_microseconds\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalSlidingWindowDataset"
+    argspec: "args=[\'input_dataset\', \'window_size\', \'window_shift\', \'window_stride\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalSqlDataset"
+    argspec: "args=[\'driver_name\', \'data_source_name\', \'query\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalStatsAggregatorHandle"
+    argspec: "args=[\'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalStatsAggregatorSummary"
+    argspec: "args=[\'iterator\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalTakeWhileDataset"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'predicate\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalThreadPoolDataset"
+    argspec: "args=[\'input_dataset\', \'thread_pool\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalThreadPoolHandle"
+    argspec: "args=[\'num_threads\', \'max_intra_op_parallelism\', \'display_name\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalUnbatchDataset"
+    argspec: "args=[\'input_dataset\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalUniqueDataset"
+    argspec: "args=[\'input_dataset\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Expm1"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExtractGlimpse"
+    argspec: "args=[\'input\', \'size\', \'offsets\', \'centered\', \'normalized\', \'uniform_noise\', \'noise\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExtractImagePatches"
+    argspec: "args=[\'images\', \'ksizes\', \'strides\', \'rates\', \'padding\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExtractJpegShape"
+    argspec: "args=[\'contents\', \'output_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExtractVolumePatches"
+    argspec: "args=[\'input\', \'ksizes\', \'strides\', \'padding\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FFT"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FFT2D"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FFT3D"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FIFOQueue"
+    argspec: "args=[\'component_types\', \'shapes\', \'capacity\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FIFOQueueV2"
+    argspec: "args=[\'component_types\', \'shapes\', \'capacity\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Fact"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FakeParam"
+    argspec: "args=[\'dtype\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FakeQuantWithMinMaxArgs"
+    argspec: "args=[\'inputs\', \'min\', \'max\', \'num_bits\', \'narrow_range\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FakeQuantWithMinMaxArgsGradient"
+    argspec: "args=[\'gradients\', \'inputs\', \'min\', \'max\', \'num_bits\', \'narrow_range\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FakeQuantWithMinMaxVars"
+    argspec: "args=[\'inputs\', \'min\', \'max\', \'num_bits\', \'narrow_range\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FakeQuantWithMinMaxVarsGradient"
+    argspec: "args=[\'gradients\', \'inputs\', \'min\', \'max\', \'num_bits\', \'narrow_range\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FakeQuantWithMinMaxVarsPerChannel"
+    argspec: "args=[\'inputs\', \'min\', \'max\', \'num_bits\', \'narrow_range\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FakeQuantWithMinMaxVarsPerChannelGradient"
+    argspec: "args=[\'gradients\', \'inputs\', \'min\', \'max\', \'num_bits\', \'narrow_range\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FakeQueue"
+    argspec: "args=[\'resource\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Fill"
+    argspec: "args=[\'dims\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FilterByLastComponentDataset"
+    argspec: "args=[\'input_dataset\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FilterDataset"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'predicate\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FixedLengthRecordDataset"
+    argspec: "args=[\'filenames\', \'header_bytes\', \'record_bytes\', \'footer_bytes\', \'buffer_size\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FixedLengthRecordDatasetV2"
+    argspec: "args=[\'filenames\', \'header_bytes\', \'record_bytes\', \'footer_bytes\', \'buffer_size\', \'compression_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FixedLengthRecordReader"
+    argspec: "args=[\'header_bytes\', \'record_bytes\', \'footer_bytes\', \'hop_bytes\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FixedLengthRecordReaderV2"
+    argspec: "args=[\'header_bytes\', \'record_bytes\', \'footer_bytes\', \'hop_bytes\', \'container\', \'shared_name\', \'encoding\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FixedUnigramCandidateSampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'vocab_file\', \'distortion\', \'num_reserved_ids\', \'num_shards\', \'shard\', \'unigrams\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FlatMapDataset"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'f\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Floor"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FloorDiv"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FloorMod"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FlushSummaryWriter"
+    argspec: "args=[\'writer\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "For"
+    argspec: "args=[\'start\', \'limit\', \'delta\', \'input\', \'body\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FractionalAvgPool"
+    argspec: "args=[\'value\', \'pooling_ratio\', \'pseudo_random\', \'overlapping\', \'deterministic\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FractionalAvgPoolGrad"
+    argspec: "args=[\'orig_input_tensor_shape\', \'out_backprop\', \'row_pooling_sequence\', \'col_pooling_sequence\', \'overlapping\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FractionalMaxPool"
+    argspec: "args=[\'value\', \'pooling_ratio\', \'pseudo_random\', \'overlapping\', \'deterministic\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FractionalMaxPoolGrad"
+    argspec: "args=[\'orig_input\', \'orig_output\', \'out_backprop\', \'row_pooling_sequence\', \'col_pooling_sequence\', \'overlapping\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FusedBatchNorm"
+    argspec: "args=[\'x\', \'scale\', \'offset\', \'mean\', \'variance\', \'epsilon\', \'data_format\', \'is_training\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FusedBatchNormGrad"
+    argspec: "args=[\'y_backprop\', \'x\', \'scale\', \'reserve_space_1\', \'reserve_space_2\', \'epsilon\', \'data_format\', \'is_training\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FusedBatchNormGradV2"
+    argspec: "args=[\'y_backprop\', \'x\', \'scale\', \'reserve_space_1\', \'reserve_space_2\', \'epsilon\', \'data_format\', \'is_training\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FusedBatchNormV2"
+    argspec: "args=[\'x\', \'scale\', \'offset\', \'mean\', \'variance\', \'epsilon\', \'data_format\', \'is_training\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FusedPadConv2D"
+    argspec: "args=[\'input\', \'paddings\', \'filter\', \'mode\', \'strides\', \'padding\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FusedResizeAndPadConv2D"
+    argspec: "args=[\'input\', \'size\', \'paddings\', \'filter\', \'resize_align_corners\', \'mode\', \'strides\', \'padding\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Gather"
+    argspec: "args=[\'params\', \'indices\', \'validate_indices\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "GatherNd"
+    argspec: "args=[\'params\', \'indices\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "GatherV2"
+    argspec: "args=[\'params\', \'indices\', \'axis\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "GenerateVocabRemapping"
+    argspec: "args=[\'new_vocab_file\', \'old_vocab_file\', \'new_vocab_offset\', \'num_new_vocab\', \'old_vocab_size\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "GeneratorDataset"
+    argspec: "args=[\'init_func_other_args\', \'next_func_other_args\', \'finalize_func_other_args\', \'init_func\', \'next_func\', \'finalize_func\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "GetSessionHandle"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "GetSessionHandleV2"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "GetSessionTensor"
+    argspec: "args=[\'handle\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Greater"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "GreaterEqual"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "GuaranteeConst"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "HSVToRGB"
+    argspec: "args=[\'images\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "HashTable"
+    argspec: "args=[\'container\', \'shared_name\', \'use_node_name_sharing\', \'key_dtype\', \'value_dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "HashTableV2"
+    argspec: "args=[\'container\', \'shared_name\', \'use_node_name_sharing\', \'key_dtype\', \'value_dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "HistogramFixedWidth"
+    argspec: "args=[\'values\', \'value_range\', \'nbins\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "HistogramSummary"
+    argspec: "args=[\'tag\', \'values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IFFT"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IFFT2D"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IFFT3D"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IRFFT"
+    argspec: "args=[\'input\', \'fft_length\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IRFFT2D"
+    argspec: "args=[\'input\', \'fft_length\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IRFFT3D"
+    argspec: "args=[\'input\', \'fft_length\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Identity"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IdentityN"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IdentityReader"
+    argspec: "args=[\'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IdentityReaderV2"
+    argspec: "args=[\'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "If"
+    argspec: "args=[\'cond\', \'input\', \'Tout\', \'then_branch\', \'else_branch\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Igamma"
+    argspec: "args=[\'a\', \'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IgammaGradA"
+    argspec: "args=[\'a\', \'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Igammac"
+    argspec: "args=[\'a\', \'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Imag"
+    argspec: "args=[\'input\', \'Tout\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ImageSummary"
+    argspec: "args=[\'tag\', \'tensor\', \'max_images\', \'bad_color\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ImmutableConst"
+    argspec: "args=[\'dtype\', \'shape\', \'memory_region_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ImportEvent"
+    argspec: "args=[\'writer\', \'event\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "InTopK"
+    argspec: "args=[\'predictions\', \'targets\', \'k\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "InTopKV2"
+    argspec: "args=[\'predictions\', \'targets\', \'k\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "InitializeTable"
+    argspec: "args=[\'table_handle\', \'keys\', \'values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "InitializeTableFromTextFile"
+    argspec: "args=[\'table_handle\', \'filename\', \'key_index\', \'value_index\', \'vocab_size\', \'delimiter\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "InitializeTableFromTextFileV2"
+    argspec: "args=[\'table_handle\', \'filename\', \'key_index\', \'value_index\', \'vocab_size\', \'delimiter\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "InitializeTableV2"
+    argspec: "args=[\'table_handle\', \'keys\', \'values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "InplaceAdd"
+    argspec: "args=[\'x\', \'i\', \'v\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "InplaceSub"
+    argspec: "args=[\'x\', \'i\', \'v\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "InplaceUpdate"
+    argspec: "args=[\'x\', \'i\', \'v\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "InterleaveDataset"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'cycle_length\', \'block_length\', \'f\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Inv"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "InvGrad"
+    argspec: "args=[\'y\', \'dy\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Invert"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "InvertPermutation"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IsBoostedTreesEnsembleInitialized"
+    argspec: "args=[\'tree_ensemble_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IsBoostedTreesQuantileStreamResourceInitialized"
+    argspec: "args=[\'quantile_stream_resource_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IsFinite"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IsInf"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IsNan"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IsVariableInitialized"
+    argspec: "args=[\'ref\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Iterator"
+    argspec: "args=[\'shared_name\', \'container\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IteratorFromStringHandle"
+    argspec: "args=[\'string_handle\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IteratorFromStringHandleV2"
+    argspec: "args=[\'string_handle\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IteratorGetNext"
+    argspec: "args=[\'iterator\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IteratorGetNextAsOptional"
+    argspec: "args=[\'iterator\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IteratorGetNextSync"
+    argspec: "args=[\'iterator\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IteratorToStringHandle"
+    argspec: "args=[\'resource_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IteratorV2"
+    argspec: "args=[\'shared_name\', \'container\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "L2Loss"
+    argspec: "args=[\'t\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LMDBReader"
+    argspec: "args=[\'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LRN"
+    argspec: "args=[\'input\', \'depth_radius\', \'bias\', \'alpha\', \'beta\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LRNGrad"
+    argspec: "args=[\'input_grads\', \'input_image\', \'output_image\', \'depth_radius\', \'bias\', \'alpha\', \'beta\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LeakyRelu"
+    argspec: "args=[\'features\', \'alpha\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LeakyReluGrad"
+    argspec: "args=[\'gradients\', \'features\', \'alpha\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LearnedUnigramCandidateSampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LeftShift"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Less"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LessEqual"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Lgamma"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LinSpace"
+    argspec: "args=[\'start\', \'stop\', \'num\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ListDiff"
+    argspec: "args=[\'x\', \'y\', \'out_idx\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LoadAndRemapMatrix"
+    argspec: "args=[\'ckpt_path\', \'old_tensor_name\', \'row_remapping\', \'col_remapping\', \'initializing_values\', \'num_rows\', \'num_cols\', \'max_rows_in_memory\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Log"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Log1p"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LogMatrixDeterminant"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LogSoftmax"
+    argspec: "args=[\'logits\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LogUniformCandidateSampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LogicalAnd"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LogicalNot"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LogicalOr"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LookupTableExport"
+    argspec: "args=[\'table_handle\', \'Tkeys\', \'Tvalues\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LookupTableExportV2"
+    argspec: "args=[\'table_handle\', \'Tkeys\', \'Tvalues\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LookupTableFind"
+    argspec: "args=[\'table_handle\', \'keys\', \'default_value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LookupTableFindV2"
+    argspec: "args=[\'table_handle\', \'keys\', \'default_value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LookupTableImport"
+    argspec: "args=[\'table_handle\', \'keys\', \'values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LookupTableImportV2"
+    argspec: "args=[\'table_handle\', \'keys\', \'values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LookupTableInsert"
+    argspec: "args=[\'table_handle\', \'keys\', \'values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LookupTableInsertV2"
+    argspec: "args=[\'table_handle\', \'keys\', \'values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LookupTableRemoveV2"
+    argspec: "args=[\'table_handle\', \'keys\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LookupTableSize"
+    argspec: "args=[\'table_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LookupTableSizeV2"
+    argspec: "args=[\'table_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LoopCond"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LowerBound"
+    argspec: "args=[\'sorted_inputs\', \'values\', \'out_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Lu"
+    argspec: "args=[\'input\', \'output_idx_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MakeIterator"
+    argspec: "args=[\'dataset\', \'iterator\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MapClear"
+    argspec: "args=[\'capacity\', \'memory_limit\', \'dtypes\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MapDataset"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'f\', \'output_types\', \'output_shapes\', \'use_inter_op_parallelism\', \'preserve_cardinality\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MapDefun"
+    argspec: "args=[\'arguments\', \'captured_inputs\', \'output_types\', \'output_shapes\', \'f\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MapIncompleteSize"
+    argspec: "args=[\'capacity\', \'memory_limit\', \'dtypes\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MapPeek"
+    argspec: "args=[\'key\', \'indices\', \'capacity\', \'memory_limit\', \'dtypes\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MapSize"
+    argspec: "args=[\'capacity\', \'memory_limit\', \'dtypes\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MapStage"
+    argspec: "args=[\'key\', \'indices\', \'values\', \'capacity\', \'memory_limit\', \'dtypes\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MapUnstage"
+    argspec: "args=[\'key\', \'indices\', \'capacity\', \'memory_limit\', \'dtypes\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MapUnstageNoKey"
+    argspec: "args=[\'indices\', \'capacity\', \'memory_limit\', \'dtypes\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MatMul"
+    argspec: "args=[\'a\', \'b\', \'transpose_a\', \'transpose_b\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MatchingFiles"
+    argspec: "args=[\'pattern\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MatrixBandPart"
+    argspec: "args=[\'input\', \'num_lower\', \'num_upper\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MatrixDeterminant"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MatrixDiag"
+    argspec: "args=[\'diagonal\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MatrixDiagPart"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MatrixExponential"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MatrixInverse"
+    argspec: "args=[\'input\', \'adjoint\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MatrixLogarithm"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MatrixSetDiag"
+    argspec: "args=[\'input\', \'diagonal\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MatrixSolve"
+    argspec: "args=[\'matrix\', \'rhs\', \'adjoint\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MatrixSolveLs"
+    argspec: "args=[\'matrix\', \'rhs\', \'l2_regularizer\', \'fast\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MatrixSquareRoot"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MatrixTriangularSolve"
+    argspec: "args=[\'matrix\', \'rhs\', \'lower\', \'adjoint\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Max"
+    argspec: "args=[\'input\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MaxPool"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MaxPool3D"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MaxPool3DGrad"
+    argspec: "args=[\'orig_input\', \'orig_output\', \'grad\', \'ksize\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MaxPool3DGradGrad"
+    argspec: "args=[\'orig_input\', \'orig_output\', \'grad\', \'ksize\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MaxPoolGrad"
+    argspec: "args=[\'orig_input\', \'orig_output\', \'grad\', \'ksize\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MaxPoolGradGrad"
+    argspec: "args=[\'orig_input\', \'orig_output\', \'grad\', \'ksize\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MaxPoolGradGradV2"
+    argspec: "args=[\'orig_input\', \'orig_output\', \'grad\', \'ksize\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MaxPoolGradGradWithArgmax"
+    argspec: "args=[\'input\', \'grad\', \'argmax\', \'ksize\', \'strides\', \'padding\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MaxPoolGradV2"
+    argspec: "args=[\'orig_input\', \'orig_output\', \'grad\', \'ksize\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MaxPoolGradWithArgmax"
+    argspec: "args=[\'input\', \'grad\', \'argmax\', \'ksize\', \'strides\', \'padding\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MaxPoolV2"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MaxPoolWithArgmax"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'Targmax\', \'padding\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Maximum"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Mean"
+    argspec: "args=[\'input\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Merge"
+    argspec: "args=[\'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MergeSummary"
+    argspec: "args=[\'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MergeV2Checkpoints"
+    argspec: "args=[\'checkpoint_prefixes\', \'destination_prefix\', \'delete_old_dirs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Mfcc"
+    argspec: "args=[\'spectrogram\', \'sample_rate\', \'upper_frequency_limit\', \'lower_frequency_limit\', \'filterbank_channel_count\', \'dct_coefficient_count\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Min"
+    argspec: "args=[\'input\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Minimum"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MirrorPad"
+    argspec: "args=[\'input\', \'paddings\', \'mode\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MirrorPadGrad"
+    argspec: "args=[\'input\', \'paddings\', \'mode\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Mod"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ModelDataset"
+    argspec: "args=[\'input_dataset\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Mul"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MulNoNan"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MultiDeviceIterator"
+    argspec: "args=[\'devices\', \'shared_name\', \'container\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MultiDeviceIteratorFromStringHandle"
+    argspec: "args=[\'string_handle\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MultiDeviceIteratorGetNextFromShard"
+    argspec: "args=[\'multi_device_iterator\', \'shard_num\', \'incarnation_id\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MultiDeviceIteratorInit"
+    argspec: "args=[\'dataset\', \'multi_device_iterator\', \'max_buffer_size\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MultiDeviceIteratorToStringHandle"
+    argspec: "args=[\'multi_device_iterator\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Multinomial"
+    argspec: "args=[\'logits\', \'num_samples\', \'seed\', \'seed2\', \'output_dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MutableDenseHashTable"
+    argspec: "args=[\'empty_key\', \'container\', \'shared_name\', \'use_node_name_sharing\', \'value_dtype\', \'value_shape\', \'initial_num_buckets\', \'max_load_factor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MutableDenseHashTableV2"
+    argspec: "args=[\'empty_key\', \'deleted_key\', \'container\', \'shared_name\', \'use_node_name_sharing\', \'value_dtype\', \'value_shape\', \'initial_num_buckets\', \'max_load_factor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MutableHashTable"
+    argspec: "args=[\'container\', \'shared_name\', \'use_node_name_sharing\', \'key_dtype\', \'value_dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MutableHashTableOfTensors"
+    argspec: "args=[\'container\', \'shared_name\', \'use_node_name_sharing\', \'key_dtype\', \'value_dtype\', \'value_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MutableHashTableOfTensorsV2"
+    argspec: "args=[\'container\', \'shared_name\', \'use_node_name_sharing\', \'key_dtype\', \'value_dtype\', \'value_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MutableHashTableV2"
+    argspec: "args=[\'container\', \'shared_name\', \'use_node_name_sharing\', \'key_dtype\', \'value_dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MutexLock"
+    argspec: "args=[\'mutex\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MutexV2"
+    argspec: "args=[\'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "NcclAllReduce"
+    argspec: "args=[\'input\', \'reduction\', \'num_devices\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "NcclBroadcast"
+    argspec: "args=[\'input\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "NcclReduce"
+    argspec: "args=[\'input\', \'reduction\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Neg"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "NextAfter"
+    argspec: "args=[\'x1\', \'x2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "NextIteration"
+    argspec: "args=[\'data\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "NoOp"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "NonMaxSuppression"
+    argspec: "args=[\'boxes\', \'scores\', \'max_output_size\', \'iou_threshold\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "NonMaxSuppressionV2"
+    argspec: "args=[\'boxes\', \'scores\', \'max_output_size\', \'iou_threshold\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "NonMaxSuppressionV3"
+    argspec: "args=[\'boxes\', \'scores\', \'max_output_size\', \'iou_threshold\', \'score_threshold\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "NonMaxSuppressionV4"
+    argspec: "args=[\'boxes\', \'scores\', \'max_output_size\', \'iou_threshold\', \'score_threshold\', \'pad_to_max_output_size\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "NonMaxSuppressionWithOverlaps"
+    argspec: "args=[\'overlaps\', \'scores\', \'max_output_size\', \'overlap_threshold\', \'score_threshold\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "NotEqual"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "NthElement"
+    argspec: "args=[\'input\', \'n\', \'reverse\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "OneHot"
+    argspec: "args=[\'indices\', \'depth\', \'on_value\', \'off_value\', \'axis\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "OneShotIterator"
+    argspec: "args=[\'dataset_factory\', \'output_types\', \'output_shapes\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "OnesLike"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "OptimizeDataset"
+    argspec: "args=[\'input_dataset\', \'optimizations\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "OptionalFromValue"
+    argspec: "args=[\'components\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "OptionalGetValue"
+    argspec: "args=[\'optional\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "OptionalHasValue"
+    argspec: "args=[\'optional\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "OptionalNone"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "OrderedMapClear"
+    argspec: "args=[\'capacity\', \'memory_limit\', \'dtypes\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "OrderedMapIncompleteSize"
+    argspec: "args=[\'capacity\', \'memory_limit\', \'dtypes\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "OrderedMapPeek"
+    argspec: "args=[\'key\', \'indices\', \'capacity\', \'memory_limit\', \'dtypes\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "OrderedMapSize"
+    argspec: "args=[\'capacity\', \'memory_limit\', \'dtypes\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "OrderedMapStage"
+    argspec: "args=[\'key\', \'indices\', \'values\', \'capacity\', \'memory_limit\', \'dtypes\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "OrderedMapUnstage"
+    argspec: "args=[\'key\', \'indices\', \'capacity\', \'memory_limit\', \'dtypes\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "OrderedMapUnstageNoKey"
+    argspec: "args=[\'indices\', \'capacity\', \'memory_limit\', \'dtypes\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Pack"
+    argspec: "args=[\'values\', \'axis\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Pad"
+    argspec: "args=[\'input\', \'paddings\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "PadV2"
+    argspec: "args=[\'input\', \'paddings\', \'constant_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "PaddedBatchDataset"
+    argspec: "args=[\'input_dataset\', \'batch_size\', \'padded_shapes\', \'padding_values\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "PaddedBatchDatasetV2"
+    argspec: "args=[\'input_dataset\', \'batch_size\', \'padded_shapes\', \'padding_values\', \'drop_remainder\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "PaddingFIFOQueue"
+    argspec: "args=[\'component_types\', \'shapes\', \'capacity\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "PaddingFIFOQueueV2"
+    argspec: "args=[\'component_types\', \'shapes\', \'capacity\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ParallelConcat"
+    argspec: "args=[\'values\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ParallelDynamicStitch"
+    argspec: "args=[\'indices\', \'data\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ParallelInterleaveDatasetV2"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'cycle_length\', \'block_length\', \'num_parallel_calls\', \'f\', \'output_types\', \'output_shapes\', \'sloppy\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ParallelMapDataset"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'num_parallel_calls\', \'f\', \'output_types\', \'output_shapes\', \'use_inter_op_parallelism\', \'sloppy\', \'preserve_cardinality\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ParameterizedTruncatedNormal"
+    argspec: "args=[\'shape\', \'means\', \'stdevs\', \'minvals\', \'maxvals\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ParseExample"
+    argspec: "args=[\'serialized\', \'names\', \'sparse_keys\', \'dense_keys\', \'dense_defaults\', \'sparse_types\', \'dense_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ParseSequenceExample"
+    argspec: "args=[\'serialized\', \'debug_name\', \'context_dense_defaults\', \'feature_list_dense_missing_assumed_empty\', \'context_sparse_keys\', \'context_dense_keys\', \'feature_list_sparse_keys\', \'feature_list_dense_keys\', \'Ncontext_sparse\', \'Ncontext_dense\', \'Nfeature_list_sparse\', \'Nfeature_list_dense\', \'context_sparse_types\', \'feature_list_dense_types\', \'context_dense_shapes\', \'feature_list_sparse_types\', \'feature_list_dense_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ParseSingleExample"
+    argspec: "args=[\'serialized\', \'dense_defaults\', \'num_sparse\', \'sparse_keys\', \'dense_keys\', \'sparse_types\', \'dense_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ParseSingleSequenceExample"
+    argspec: "args=[\'serialized\', \'feature_list_dense_missing_assumed_empty\', \'context_sparse_keys\', \'context_dense_keys\', \'feature_list_sparse_keys\', \'feature_list_dense_keys\', \'context_dense_defaults\', \'debug_name\', \'context_sparse_types\', \'feature_list_dense_types\', \'context_dense_shapes\', \'feature_list_sparse_types\', \'feature_list_dense_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ParseTensor"
+    argspec: "args=[\'serialized\', \'out_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "PartitionedCall"
+    argspec: "args=[\'args\', \'Tout\', \'f\', \'config\', \'config_proto\', \'executor_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Placeholder"
+    argspec: "args=[\'dtype\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "PlaceholderV2"
+    argspec: "args=[\'dtype\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "PlaceholderWithDefault"
+    argspec: "args=[\'input\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Polygamma"
+    argspec: "args=[\'a\', \'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "PopulationCount"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Pow"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "PrefetchDataset"
+    argspec: "args=[\'input_dataset\', \'buffer_size\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "PreventGradient"
+    argspec: "args=[\'input\', \'message\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Print"
+    argspec: "args=[\'input\', \'data\', \'message\', \'first_n\', \'summarize\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "PrintV2"
+    argspec: "args=[\'input\', \'output_stream\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "PriorityQueue"
+    argspec: "args=[\'component_types\', \'shapes\', \'capacity\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "PriorityQueueV2"
+    argspec: "args=[\'component_types\', \'shapes\', \'capacity\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Prod"
+    argspec: "args=[\'input\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "PyFunc"
+    argspec: "args=[\'input\', \'token\', \'Tout\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "PyFuncStateless"
+    argspec: "args=[\'input\', \'token\', \'Tout\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Qr"
+    argspec: "args=[\'input\', \'full_matrices\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizeAndDequantize"
+    argspec: "args=[\'input\', \'signed_input\', \'num_bits\', \'range_given\', \'input_min\', \'input_max\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizeAndDequantizeV2"
+    argspec: "args=[\'input\', \'input_min\', \'input_max\', \'signed_input\', \'num_bits\', \'range_given\', \'round_mode\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizeAndDequantizeV3"
+    argspec: "args=[\'input\', \'input_min\', \'input_max\', \'num_bits\', \'signed_input\', \'range_given\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizeDownAndShrinkRange"
+    argspec: "args=[\'input\', \'input_min\', \'input_max\', \'out_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizeV2"
+    argspec: "args=[\'input\', \'min_range\', \'max_range\', \'T\', \'mode\', \'round_mode\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedAdd"
+    argspec: "args=[\'x\', \'y\', \'min_x\', \'max_x\', \'min_y\', \'max_y\', \'Toutput\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedAvgPool"
+    argspec: "args=[\'input\', \'min_input\', \'max_input\', \'ksize\', \'strides\', \'padding\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedBatchNormWithGlobalNormalization"
+    argspec: "args=[\'t\', \'t_min\', \'t_max\', \'m\', \'m_min\', \'m_max\', \'v\', \'v_min\', \'v_max\', \'beta\', \'beta_min\', \'beta_max\', \'gamma\', \'gamma_min\', \'gamma_max\', \'out_type\', \'variance_epsilon\', \'scale_after_normalization\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedBiasAdd"
+    argspec: "args=[\'input\', \'bias\', \'min_input\', \'max_input\', \'min_bias\', \'max_bias\', \'out_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedConcat"
+    argspec: "args=[\'concat_dim\', \'values\', \'input_mins\', \'input_maxes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedConv2D"
+    argspec: "args=[\'input\', \'filter\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'out_type\', \'strides\', \'padding\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedConv2DAndRelu"
+    argspec: "args=[\'input\', \'filter\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'out_type\', \'strides\', \'padding\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedConv2DAndReluAndRequantize"
+    argspec: "args=[\'input\', \'filter\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'min_freezed_output\', \'max_freezed_output\', \'out_type\', \'strides\', \'padding\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedConv2DAndRequantize"
+    argspec: "args=[\'input\', \'filter\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'min_freezed_output\', \'max_freezed_output\', \'out_type\', \'strides\', \'padding\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedConv2DWithBias"
+    argspec: "args=[\'input\', \'filter\', \'bias\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'out_type\', \'strides\', \'padding\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedConv2DWithBiasAndRelu"
+    argspec: "args=[\'input\', \'filter\', \'bias\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'out_type\', \'strides\', \'padding\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedConv2DWithBiasAndReluAndRequantize"
+    argspec: "args=[\'input\', \'filter\', \'bias\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'min_freezed_output\', \'max_freezed_output\', \'out_type\', \'strides\', \'padding\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedConv2DWithBiasAndRequantize"
+    argspec: "args=[\'input\', \'filter\', \'bias\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'min_freezed_output\', \'max_freezed_output\', \'out_type\', \'strides\', \'padding\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedConv2DWithBiasSignedSumAndReluAndRequantize"
+    argspec: "args=[\'input\', \'filter\', \'bias\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'min_freezed_output\', \'max_freezed_output\', \'summand\', \'min_summand\', \'max_summand\', \'out_type\', \'strides\', \'padding\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedConv2DWithBiasSumAndRelu"
+    argspec: "args=[\'input\', \'filter\', \'bias\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'summand\', \'out_type\', \'strides\', \'padding\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedConv2DWithBiasSumAndReluAndRequantize"
+    argspec: "args=[\'input\', \'filter\', \'bias\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'min_freezed_output\', \'max_freezed_output\', \'summand\', \'min_summand\', \'max_summand\', \'out_type\', \'strides\', \'padding\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedInstanceNorm"
+    argspec: "args=[\'x\', \'x_min\', \'x_max\', \'output_range_given\', \'given_y_min\', \'given_y_max\', \'variance_epsilon\', \'min_separation\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedMatMul"
+    argspec: "args=[\'a\', \'b\', \'min_a\', \'max_a\', \'min_b\', \'max_b\', \'Toutput\', \'transpose_a\', \'transpose_b\', \'Tactivation\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedMaxPool"
+    argspec: "args=[\'input\', \'min_input\', \'max_input\', \'ksize\', \'strides\', \'padding\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedMul"
+    argspec: "args=[\'x\', \'y\', \'min_x\', \'max_x\', \'min_y\', \'max_y\', \'Toutput\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedRelu"
+    argspec: "args=[\'features\', \'min_features\', \'max_features\', \'out_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedRelu6"
+    argspec: "args=[\'features\', \'min_features\', \'max_features\', \'out_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedReluX"
+    argspec: "args=[\'features\', \'max_value\', \'min_features\', \'max_features\', \'out_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedReshape"
+    argspec: "args=[\'tensor\', \'shape\', \'input_min\', \'input_max\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedResizeBilinear"
+    argspec: "args=[\'images\', \'size\', \'min\', \'max\', \'align_corners\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QueueClose"
+    argspec: "args=[\'handle\', \'cancel_pending_enqueues\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QueueCloseV2"
+    argspec: "args=[\'handle\', \'cancel_pending_enqueues\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QueueDequeue"
+    argspec: "args=[\'handle\', \'component_types\', \'timeout_ms\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QueueDequeueMany"
+    argspec: "args=[\'handle\', \'n\', \'component_types\', \'timeout_ms\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QueueDequeueManyV2"
+    argspec: "args=[\'handle\', \'n\', \'component_types\', \'timeout_ms\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QueueDequeueUpTo"
+    argspec: "args=[\'handle\', \'n\', \'component_types\', \'timeout_ms\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QueueDequeueUpToV2"
+    argspec: "args=[\'handle\', \'n\', \'component_types\', \'timeout_ms\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QueueDequeueV2"
+    argspec: "args=[\'handle\', \'component_types\', \'timeout_ms\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QueueEnqueue"
+    argspec: "args=[\'handle\', \'components\', \'timeout_ms\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QueueEnqueueMany"
+    argspec: "args=[\'handle\', \'components\', \'timeout_ms\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QueueEnqueueManyV2"
+    argspec: "args=[\'handle\', \'components\', \'timeout_ms\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QueueEnqueueV2"
+    argspec: "args=[\'handle\', \'components\', \'timeout_ms\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QueueIsClosed"
+    argspec: "args=[\'handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QueueIsClosedV2"
+    argspec: "args=[\'handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QueueSize"
+    argspec: "args=[\'handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QueueSizeV2"
+    argspec: "args=[\'handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RFFT"
+    argspec: "args=[\'input\', \'fft_length\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RFFT2D"
+    argspec: "args=[\'input\', \'fft_length\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RFFT3D"
+    argspec: "args=[\'input\', \'fft_length\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RGBToHSV"
+    argspec: "args=[\'images\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RaggedGather"
+    argspec: "args=[\'params_nested_splits\', \'params_dense_values\', \'indices\', \'OUTPUT_RAGGED_RANK\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RaggedRange"
+    argspec: "args=[\'starts\', \'limits\', \'deltas\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RaggedTensorToSparse"
+    argspec: "args=[\'rt_nested_splits\', \'rt_dense_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RandomCrop"
+    argspec: "args=[\'image\', \'size\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RandomGamma"
+    argspec: "args=[\'shape\', \'alpha\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RandomGammaGrad"
+    argspec: "args=[\'alpha\', \'sample\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RandomPoisson"
+    argspec: "args=[\'shape\', \'rate\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RandomPoissonV2"
+    argspec: "args=[\'shape\', \'rate\', \'seed\', \'seed2\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RandomShuffle"
+    argspec: "args=[\'value\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RandomShuffleQueue"
+    argspec: "args=[\'component_types\', \'shapes\', \'capacity\', \'min_after_dequeue\', \'seed\', \'seed2\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RandomShuffleQueueV2"
+    argspec: "args=[\'component_types\', \'shapes\', \'capacity\', \'min_after_dequeue\', \'seed\', \'seed2\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RandomStandardNormal"
+    argspec: "args=[\'shape\', \'seed\', \'seed2\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RandomUniform"
+    argspec: "args=[\'shape\', \'seed\', \'seed2\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RandomUniformInt"
+    argspec: "args=[\'shape\', \'minval\', \'maxval\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Range"
+    argspec: "args=[\'start\', \'limit\', \'delta\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RangeDataset"
+    argspec: "args=[\'start\', \'stop\', \'step\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Rank"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReadFile"
+    argspec: "args=[\'filename\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReadVariableOp"
+    argspec: "args=[\'resource\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReaderNumRecordsProduced"
+    argspec: "args=[\'reader_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReaderNumRecordsProducedV2"
+    argspec: "args=[\'reader_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReaderNumWorkUnitsCompleted"
+    argspec: "args=[\'reader_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReaderNumWorkUnitsCompletedV2"
+    argspec: "args=[\'reader_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReaderRead"
+    argspec: "args=[\'reader_handle\', \'queue_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReaderReadUpTo"
+    argspec: "args=[\'reader_handle\', \'queue_handle\', \'num_records\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReaderReadUpToV2"
+    argspec: "args=[\'reader_handle\', \'queue_handle\', \'num_records\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReaderReadV2"
+    argspec: "args=[\'reader_handle\', \'queue_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReaderReset"
+    argspec: "args=[\'reader_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReaderResetV2"
+    argspec: "args=[\'reader_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReaderRestoreState"
+    argspec: "args=[\'reader_handle\', \'state\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReaderRestoreStateV2"
+    argspec: "args=[\'reader_handle\', \'state\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReaderSerializeState"
+    argspec: "args=[\'reader_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReaderSerializeStateV2"
+    argspec: "args=[\'reader_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Real"
+    argspec: "args=[\'input\', \'Tout\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RealDiv"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Reciprocal"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReciprocalGrad"
+    argspec: "args=[\'y\', \'dy\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RecordInput"
+    argspec: "args=[\'file_pattern\', \'file_random_seed\', \'file_shuffle_shift_ratio\', \'file_buffer_size\', \'file_parallelism\', \'batch_size\', \'compression_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReduceDataset"
+    argspec: "args=[\'input_dataset\', \'initial_state\', \'other_arguments\', \'f\', \'output_types\', \'output_shapes\', \'use_inter_op_parallelism\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReduceJoin"
+    argspec: "args=[\'inputs\', \'reduction_indices\', \'keep_dims\', \'separator\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RefEnter"
+    argspec: "args=[\'data\', \'frame_name\', \'is_constant\', \'parallel_iterations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RefExit"
+    argspec: "args=[\'data\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RefIdentity"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RefMerge"
+    argspec: "args=[\'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RefNextIteration"
+    argspec: "args=[\'data\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RefSelect"
+    argspec: "args=[\'index\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RefSwitch"
+    argspec: "args=[\'data\', \'pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RegexFullMatch"
+    argspec: "args=[\'input\', \'pattern\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RegexReplace"
+    argspec: "args=[\'input\', \'pattern\', \'rewrite\', \'replace_global\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Relu"
+    argspec: "args=[\'features\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Relu6"
+    argspec: "args=[\'features\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Relu6Grad"
+    argspec: "args=[\'gradients\', \'features\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReluGrad"
+    argspec: "args=[\'gradients\', \'features\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RemoteCall"
+    argspec: "args=[\'target\', \'args\', \'Tout\', \'f\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RepeatDataset"
+    argspec: "args=[\'input_dataset\', \'count\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RequantizationRange"
+    argspec: "args=[\'input\', \'input_min\', \'input_max\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RequantizationRangePerChannel"
+    argspec: "args=[\'input\', \'input_min\', \'input_max\', \'clip_value_max\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Requantize"
+    argspec: "args=[\'input\', \'input_min\', \'input_max\', \'requested_output_min\', \'requested_output_max\', \'out_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RequantizePerChannel"
+    argspec: "args=[\'input\', \'input_min\', \'input_max\', \'requested_output_min\', \'requested_output_max\', \'out_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Reshape"
+    argspec: "args=[\'tensor\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResizeArea"
+    argspec: "args=[\'images\', \'size\', \'align_corners\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResizeBicubic"
+    argspec: "args=[\'images\', \'size\', \'align_corners\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResizeBicubicGrad"
+    argspec: "args=[\'grads\', \'original_image\', \'align_corners\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResizeBilinear"
+    argspec: "args=[\'images\', \'size\', \'align_corners\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResizeBilinearGrad"
+    argspec: "args=[\'grads\', \'original_image\', \'align_corners\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResizeNearestNeighbor"
+    argspec: "args=[\'images\', \'size\', \'align_corners\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResizeNearestNeighborGrad"
+    argspec: "args=[\'grads\', \'size\', \'align_corners\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceApplyAdaMax"
+    argspec: "args=[\'var\', \'m\', \'v\', \'beta1_power\', \'lr\', \'beta1\', \'beta2\', \'epsilon\', \'grad\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceApplyAdadelta"
+    argspec: "args=[\'var\', \'accum\', \'accum_update\', \'lr\', \'rho\', \'epsilon\', \'grad\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceApplyAdagrad"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'grad\', \'use_locking\', \'update_slots\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceApplyAdagradDA"
+    argspec: "args=[\'var\', \'gradient_accumulator\', \'gradient_squared_accumulator\', \'grad\', \'lr\', \'l1\', \'l2\', \'global_step\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceApplyAdam"
+    argspec: "args=[\'var\', \'m\', \'v\', \'beta1_power\', \'beta2_power\', \'lr\', \'beta1\', \'beta2\', \'epsilon\', \'grad\', \'use_locking\', \'use_nesterov\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceApplyAdamWithAmsgrad"
+    argspec: "args=[\'var\', \'m\', \'v\', \'vhat\', \'beta1_power\', \'beta2_power\', \'lr\', \'beta1\', \'beta2\', \'epsilon\', \'grad\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceApplyAddSign"
+    argspec: "args=[\'var\', \'m\', \'lr\', \'alpha\', \'sign_decay\', \'beta\', \'grad\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceApplyCenteredRMSProp"
+    argspec: "args=[\'var\', \'mg\', \'ms\', \'mom\', \'lr\', \'rho\', \'momentum\', \'epsilon\', \'grad\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceApplyFtrl"
+    argspec: "args=[\'var\', \'accum\', \'linear\', \'grad\', \'lr\', \'l1\', \'l2\', \'lr_power\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceApplyFtrlV2"
+    argspec: "args=[\'var\', \'accum\', \'linear\', \'grad\', \'lr\', \'l1\', \'l2\', \'l2_shrinkage\', \'lr_power\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceApplyGradientDescent"
+    argspec: "args=[\'var\', \'alpha\', \'delta\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceApplyKerasMomentum"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'grad\', \'momentum\', \'use_locking\', \'use_nesterov\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceApplyMomentum"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'grad\', \'momentum\', \'use_locking\', \'use_nesterov\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceApplyPowerSign"
+    argspec: "args=[\'var\', \'m\', \'lr\', \'logbase\', \'sign_decay\', \'beta\', \'grad\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceApplyProximalAdagrad"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'l1\', \'l2\', \'grad\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceApplyProximalGradientDescent"
+    argspec: "args=[\'var\', \'alpha\', \'l1\', \'l2\', \'delta\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceApplyRMSProp"
+    argspec: "args=[\'var\', \'ms\', \'mom\', \'lr\', \'rho\', \'momentum\', \'epsilon\', \'grad\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceCountUpTo"
+    argspec: "args=[\'resource\', \'limit\', \'T\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceGather"
+    argspec: "args=[\'resource\', \'indices\', \'validate_indices\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceScatterAdd"
+    argspec: "args=[\'resource\', \'indices\', \'updates\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceScatterDiv"
+    argspec: "args=[\'resource\', \'indices\', \'updates\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceScatterMax"
+    argspec: "args=[\'resource\', \'indices\', \'updates\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceScatterMin"
+    argspec: "args=[\'resource\', \'indices\', \'updates\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceScatterMul"
+    argspec: "args=[\'resource\', \'indices\', \'updates\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceScatterNdAdd"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceScatterNdSub"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceScatterNdUpdate"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceScatterSub"
+    argspec: "args=[\'resource\', \'indices\', \'updates\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceScatterUpdate"
+    argspec: "args=[\'resource\', \'indices\', \'updates\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceSparseApplyAdadelta"
+    argspec: "args=[\'var\', \'accum\', \'accum_update\', \'lr\', \'rho\', \'epsilon\', \'grad\', \'indices\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceSparseApplyAdagrad"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'grad\', \'indices\', \'use_locking\', \'update_slots\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceSparseApplyAdagradDA"
+    argspec: "args=[\'var\', \'gradient_accumulator\', \'gradient_squared_accumulator\', \'grad\', \'indices\', \'lr\', \'l1\', \'l2\', \'global_step\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceSparseApplyCenteredRMSProp"
+    argspec: "args=[\'var\', \'mg\', \'ms\', \'mom\', \'lr\', \'rho\', \'momentum\', \'epsilon\', \'grad\', \'indices\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceSparseApplyFtrl"
+    argspec: "args=[\'var\', \'accum\', \'linear\', \'grad\', \'indices\', \'lr\', \'l1\', \'l2\', \'lr_power\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceSparseApplyFtrlV2"
+    argspec: "args=[\'var\', \'accum\', \'linear\', \'grad\', \'indices\', \'lr\', \'l1\', \'l2\', \'l2_shrinkage\', \'lr_power\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceSparseApplyKerasMomentum"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'grad\', \'indices\', \'momentum\', \'use_locking\', \'use_nesterov\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceSparseApplyMomentum"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'grad\', \'indices\', \'momentum\', \'use_locking\', \'use_nesterov\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceSparseApplyProximalAdagrad"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'l1\', \'l2\', \'grad\', \'indices\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceSparseApplyProximalGradientDescent"
+    argspec: "args=[\'var\', \'alpha\', \'l1\', \'l2\', \'grad\', \'indices\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceSparseApplyRMSProp"
+    argspec: "args=[\'var\', \'ms\', \'mom\', \'lr\', \'rho\', \'momentum\', \'epsilon\', \'grad\', \'indices\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceStridedSliceAssign"
+    argspec: "args=[\'ref\', \'begin\', \'end\', \'strides\', \'value\', \'begin_mask\', \'end_mask\', \'ellipsis_mask\', \'new_axis_mask\', \'shrink_axis_mask\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Restore"
+    argspec: "args=[\'file_pattern\', \'tensor_name\', \'dt\', \'preferred_shard\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RestoreSlice"
+    argspec: "args=[\'file_pattern\', \'tensor_name\', \'shape_and_slice\', \'dt\', \'preferred_shard\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RestoreV2"
+    argspec: "args=[\'prefix\', \'tensor_names\', \'shape_and_slices\', \'dtypes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Reverse"
+    argspec: "args=[\'tensor\', \'dims\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReverseSequence"
+    argspec: "args=[\'input\', \'seq_lengths\', \'seq_dim\', \'batch_dim\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReverseV2"
+    argspec: "args=[\'tensor\', \'axis\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RightShift"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Rint"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Roll"
+    argspec: "args=[\'input\', \'shift\', \'axis\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Round"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Rsqrt"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RsqrtGrad"
+    argspec: "args=[\'y\', \'dy\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SampleDistortedBoundingBox"
+    argspec: "args=[\'image_size\', \'bounding_boxes\', \'seed\', \'seed2\', \'min_object_covered\', \'aspect_ratio_range\', \'area_range\', \'max_attempts\', \'use_image_if_no_bounding_boxes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SampleDistortedBoundingBoxV2"
+    argspec: "args=[\'image_size\', \'bounding_boxes\', \'min_object_covered\', \'seed\', \'seed2\', \'aspect_ratio_range\', \'area_range\', \'max_attempts\', \'use_image_if_no_bounding_boxes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Save"
+    argspec: "args=[\'filename\', \'tensor_names\', \'data\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SaveSlices"
+    argspec: "args=[\'filename\', \'tensor_names\', \'shapes_and_slices\', \'data\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SaveV2"
+    argspec: "args=[\'prefix\', \'tensor_names\', \'shape_and_slices\', \'tensors\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ScalarSummary"
+    argspec: "args=[\'tags\', \'values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ScaleAndTranslate"
+    argspec: "args=[\'images\', \'size\', \'scale\', \'translation\', \'kernel_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ScaleAndTranslateGrad"
+    argspec: "args=[\'grads\', \'original_image\', \'scale\', \'translation\', \'kernel_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ScatterAdd"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ScatterDiv"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ScatterMax"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ScatterMin"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ScatterMul"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ScatterNd"
+    argspec: "args=[\'indices\', \'updates\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ScatterNdAdd"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ScatterNdNonAliasingAdd"
+    argspec: "args=[\'input\', \'indices\', \'updates\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ScatterNdSub"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ScatterNdUpdate"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ScatterSub"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ScatterUpdate"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SdcaFprint"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SdcaOptimizer"
+    argspec: "args=[\'sparse_example_indices\', \'sparse_feature_indices\', \'sparse_feature_values\', \'dense_features\', \'example_weights\', \'example_labels\', \'sparse_indices\', \'sparse_weights\', \'dense_weights\', \'example_state_data\', \'loss_type\', \'adaptative\', \'l1\', \'l2\', \'num_loss_partitions\', \'num_inner_iterations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SdcaOptimizerV2"
+    argspec: "args=[\'sparse_example_indices\', \'sparse_feature_indices\', \'sparse_feature_values\', \'dense_features\', \'example_weights\', \'example_labels\', \'sparse_indices\', \'sparse_weights\', \'dense_weights\', \'example_state_data\', \'loss_type\', \'adaptive\', \'l1\', \'l2\', \'num_loss_partitions\', \'num_inner_iterations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SdcaShrinkL1"
+    argspec: "args=[\'weights\', \'l1\', \'l2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SegmentMax"
+    argspec: "args=[\'data\', \'segment_ids\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SegmentMean"
+    argspec: "args=[\'data\', \'segment_ids\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SegmentMin"
+    argspec: "args=[\'data\', \'segment_ids\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SegmentProd"
+    argspec: "args=[\'data\', \'segment_ids\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SegmentSum"
+    argspec: "args=[\'data\', \'segment_ids\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Select"
+    argspec: "args=[\'condition\', \'t\', \'e\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SelfAdjointEig"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SelfAdjointEigV2"
+    argspec: "args=[\'input\', \'compute_v\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Selu"
+    argspec: "args=[\'features\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SeluGrad"
+    argspec: "args=[\'gradients\', \'outputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SerializeIterator"
+    argspec: "args=[\'resource_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SerializeManySparse"
+    argspec: "args=[\'sparse_indices\', \'sparse_values\', \'sparse_shape\', \'out_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SerializeSparse"
+    argspec: "args=[\'sparse_indices\', \'sparse_values\', \'sparse_shape\', \'out_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SerializeTensor"
+    argspec: "args=[\'tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SetSize"
+    argspec: "args=[\'set_indices\', \'set_values\', \'set_shape\', \'validate_indices\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Shape"
+    argspec: "args=[\'input\', \'out_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ShapeN"
+    argspec: "args=[\'input\', \'out_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ShardDataset"
+    argspec: "args=[\'input_dataset\', \'num_shards\', \'index\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ShardedFilename"
+    argspec: "args=[\'basename\', \'shard\', \'num_shards\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ShardedFilespec"
+    argspec: "args=[\'basename\', \'num_shards\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ShuffleAndRepeatDataset"
+    argspec: "args=[\'input_dataset\', \'buffer_size\', \'seed\', \'seed2\', \'count\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ShuffleDataset"
+    argspec: "args=[\'input_dataset\', \'buffer_size\', \'seed\', \'seed2\', \'reshuffle_each_iteration\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Sigmoid"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SigmoidGrad"
+    argspec: "args=[\'y\', \'dy\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Sign"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Sin"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Sinh"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Size"
+    argspec: "args=[\'input\', \'out_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SkipDataset"
+    argspec: "args=[\'input_dataset\', \'count\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Slice"
+    argspec: "args=[\'input\', \'begin\', \'size\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Snapshot"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Softmax"
+    argspec: "args=[\'logits\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SoftmaxCrossEntropyWithLogits"
+    argspec: "args=[\'features\', \'labels\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Softplus"
+    argspec: "args=[\'features\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SoftplusGrad"
+    argspec: "args=[\'gradients\', \'features\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Softsign"
+    argspec: "args=[\'features\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SoftsignGrad"
+    argspec: "args=[\'gradients\', \'features\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SpaceToBatch"
+    argspec: "args=[\'input\', \'paddings\', \'block_size\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SpaceToBatchND"
+    argspec: "args=[\'input\', \'block_shape\', \'paddings\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SpaceToDepth"
+    argspec: "args=[\'input\', \'block_size\', \'data_format\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseAccumulatorApplyGradient"
+    argspec: "args=[\'handle\', \'local_step\', \'gradient_indices\', \'gradient_values\', \'gradient_shape\', \'has_known_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseAccumulatorTakeGradient"
+    argspec: "args=[\'handle\', \'num_required\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseAdd"
+    argspec: "args=[\'a_indices\', \'a_values\', \'a_shape\', \'b_indices\', \'b_values\', \'b_shape\', \'thresh\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseAddGrad"
+    argspec: "args=[\'backprop_val_grad\', \'a_indices\', \'b_indices\', \'sum_indices\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseApplyAdadelta"
+    argspec: "args=[\'var\', \'accum\', \'accum_update\', \'lr\', \'rho\', \'epsilon\', \'grad\', \'indices\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseApplyAdagrad"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'grad\', \'indices\', \'use_locking\', \'update_slots\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseApplyAdagradDA"
+    argspec: "args=[\'var\', \'gradient_accumulator\', \'gradient_squared_accumulator\', \'grad\', \'indices\', \'lr\', \'l1\', \'l2\', \'global_step\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseApplyCenteredRMSProp"
+    argspec: "args=[\'var\', \'mg\', \'ms\', \'mom\', \'lr\', \'rho\', \'momentum\', \'epsilon\', \'grad\', \'indices\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseApplyFtrl"
+    argspec: "args=[\'var\', \'accum\', \'linear\', \'grad\', \'indices\', \'lr\', \'l1\', \'l2\', \'lr_power\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseApplyFtrlV2"
+    argspec: "args=[\'var\', \'accum\', \'linear\', \'grad\', \'indices\', \'lr\', \'l1\', \'l2\', \'l2_shrinkage\', \'lr_power\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseApplyMomentum"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'grad\', \'indices\', \'momentum\', \'use_locking\', \'use_nesterov\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseApplyProximalAdagrad"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'l1\', \'l2\', \'grad\', \'indices\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseApplyProximalGradientDescent"
+    argspec: "args=[\'var\', \'alpha\', \'l1\', \'l2\', \'grad\', \'indices\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseApplyRMSProp"
+    argspec: "args=[\'var\', \'ms\', \'mom\', \'lr\', \'rho\', \'momentum\', \'epsilon\', \'grad\', \'indices\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseConcat"
+    argspec: "args=[\'indices\', \'values\', \'shapes\', \'concat_dim\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseConditionalAccumulator"
+    argspec: "args=[\'dtype\', \'shape\', \'container\', \'shared_name\', \'reduction_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseCross"
+    argspec: "args=[\'indices\', \'values\', \'shapes\', \'dense_inputs\', \'hashed_output\', \'num_buckets\', \'hash_key\', \'out_type\', \'internal_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseDenseCwiseAdd"
+    argspec: "args=[\'sp_indices\', \'sp_values\', \'sp_shape\', \'dense\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseDenseCwiseDiv"
+    argspec: "args=[\'sp_indices\', \'sp_values\', \'sp_shape\', \'dense\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseDenseCwiseMul"
+    argspec: "args=[\'sp_indices\', \'sp_values\', \'sp_shape\', \'dense\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseFillEmptyRows"
+    argspec: "args=[\'indices\', \'values\', \'dense_shape\', \'default_value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseFillEmptyRowsGrad"
+    argspec: "args=[\'reverse_index_map\', \'grad_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseMatMul"
+    argspec: "args=[\'a\', \'b\', \'transpose_a\', \'transpose_b\', \'a_is_sparse\', \'b_is_sparse\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseReduceMax"
+    argspec: "args=[\'input_indices\', \'input_values\', \'input_shape\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseReduceMaxSparse"
+    argspec: "args=[\'input_indices\', \'input_values\', \'input_shape\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseReduceSum"
+    argspec: "args=[\'input_indices\', \'input_values\', \'input_shape\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseReduceSumSparse"
+    argspec: "args=[\'input_indices\', \'input_values\', \'input_shape\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseReorder"
+    argspec: "args=[\'input_indices\', \'input_values\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseReshape"
+    argspec: "args=[\'input_indices\', \'input_shape\', \'new_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseSegmentMean"
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseSegmentMeanGrad"
+    argspec: "args=[\'grad\', \'indices\', \'segment_ids\', \'output_dim0\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseSegmentMeanWithNumSegments"
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'num_segments\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseSegmentSqrtN"
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseSegmentSqrtNGrad"
+    argspec: "args=[\'grad\', \'indices\', \'segment_ids\', \'output_dim0\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseSegmentSqrtNWithNumSegments"
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'num_segments\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseSegmentSum"
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseSegmentSumWithNumSegments"
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'num_segments\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseSlice"
+    argspec: "args=[\'indices\', \'values\', \'shape\', \'start\', \'size\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseSliceGrad"
+    argspec: "args=[\'backprop_val_grad\', \'input_indices\', \'input_start\', \'output_indices\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseSoftmax"
+    argspec: "args=[\'sp_indices\', \'sp_values\', \'sp_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseSoftmaxCrossEntropyWithLogits"
+    argspec: "args=[\'features\', \'labels\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseSparseMaximum"
+    argspec: "args=[\'a_indices\', \'a_values\', \'a_shape\', \'b_indices\', \'b_values\', \'b_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseSparseMinimum"
+    argspec: "args=[\'a_indices\', \'a_values\', \'a_shape\', \'b_indices\', \'b_values\', \'b_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseSplit"
+    argspec: "args=[\'split_dim\', \'indices\', \'values\', \'shape\', \'num_split\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseTensorDenseAdd"
+    argspec: "args=[\'a_indices\', \'a_values\', \'a_shape\', \'b\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseTensorDenseMatMul"
+    argspec: "args=[\'a_indices\', \'a_values\', \'a_shape\', \'b\', \'adjoint_a\', \'adjoint_b\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseTensorSliceDataset"
+    argspec: "args=[\'indices\', \'values\', \'dense_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseToDense"
+    argspec: "args=[\'sparse_indices\', \'output_shape\', \'sparse_values\', \'default_value\', \'validate_indices\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseToSparseSetOperation"
+    argspec: "args=[\'set1_indices\', \'set1_values\', \'set1_shape\', \'set2_indices\', \'set2_values\', \'set2_shape\', \'set_operation\', \'validate_indices\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Split"
+    argspec: "args=[\'split_dim\', \'value\', \'num_split\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SplitV"
+    argspec: "args=[\'value\', \'size_splits\', \'split_dim\', \'num_split\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Sqrt"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SqrtGrad"
+    argspec: "args=[\'y\', \'dy\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Square"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SquaredDifference"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Squeeze"
+    argspec: "args=[\'input\', \'squeeze_dims\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Stack"
+    argspec: "args=[\'elem_type\', \'stack_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StackClose"
+    argspec: "args=[\'handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StackCloseV2"
+    argspec: "args=[\'handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StackPop"
+    argspec: "args=[\'handle\', \'elem_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StackPopV2"
+    argspec: "args=[\'handle\', \'elem_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StackPush"
+    argspec: "args=[\'handle\', \'elem\', \'swap_memory\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StackPushV2"
+    argspec: "args=[\'handle\', \'elem\', \'swap_memory\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StackV2"
+    argspec: "args=[\'max_size\', \'elem_type\', \'stack_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Stage"
+    argspec: "args=[\'values\', \'capacity\', \'memory_limit\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StageClear"
+    argspec: "args=[\'capacity\', \'memory_limit\', \'dtypes\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StagePeek"
+    argspec: "args=[\'index\', \'capacity\', \'memory_limit\', \'dtypes\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StageSize"
+    argspec: "args=[\'capacity\', \'memory_limit\', \'dtypes\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StatefulPartitionedCall"
+    argspec: "args=[\'args\', \'Tout\', \'f\', \'config\', \'config_proto\', \'executor_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StatelessIf"
+    argspec: "args=[\'cond\', \'input\', \'Tout\', \'then_branch\', \'else_branch\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StatelessMultinomial"
+    argspec: "args=[\'logits\', \'num_samples\', \'seed\', \'output_dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StatelessRandomNormal"
+    argspec: "args=[\'shape\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StatelessRandomUniform"
+    argspec: "args=[\'shape\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StatelessRandomUniformInt"
+    argspec: "args=[\'shape\', \'seed\', \'minval\', \'maxval\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StatelessTruncatedNormal"
+    argspec: "args=[\'shape\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StatelessWhile"
+    argspec: "args=[\'input\', \'cond\', \'body\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StaticRegexFullMatch"
+    argspec: "args=[\'input\', \'pattern\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StaticRegexReplace"
+    argspec: "args=[\'input\', \'pattern\', \'rewrite\', \'replace_global\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StopGradient"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StridedSlice"
+    argspec: "args=[\'input\', \'begin\', \'end\', \'strides\', \'begin_mask\', \'end_mask\', \'ellipsis_mask\', \'new_axis_mask\', \'shrink_axis_mask\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StridedSliceAssign"
+    argspec: "args=[\'ref\', \'begin\', \'end\', \'strides\', \'value\', \'begin_mask\', \'end_mask\', \'ellipsis_mask\', \'new_axis_mask\', \'shrink_axis_mask\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StridedSliceGrad"
+    argspec: "args=[\'shape\', \'begin\', \'end\', \'strides\', \'dy\', \'begin_mask\', \'end_mask\', \'ellipsis_mask\', \'new_axis_mask\', \'shrink_axis_mask\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StringFormat"
+    argspec: "args=[\'inputs\', \'template\', \'placeholder\', \'summarize\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StringJoin"
+    argspec: "args=[\'inputs\', \'separator\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StringLength"
+    argspec: "args=[\'input\', \'unit\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StringSplit"
+    argspec: "args=[\'input\', \'delimiter\', \'skip_empty\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StringSplitV2"
+    argspec: "args=[\'input\', \'sep\', \'maxsplit\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StringStrip"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StringToHashBucket"
+    argspec: "args=[\'string_tensor\', \'num_buckets\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StringToHashBucketFast"
+    argspec: "args=[\'input\', \'num_buckets\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StringToHashBucketStrong"
+    argspec: "args=[\'input\', \'num_buckets\', \'key\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StringToNumber"
+    argspec: "args=[\'string_tensor\', \'out_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Sub"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Substr"
+    argspec: "args=[\'input\', \'pos\', \'len\', \'unit\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Sum"
+    argspec: "args=[\'input\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SummaryWriter"
+    argspec: "args=[\'shared_name\', \'container\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Svd"
+    argspec: "args=[\'input\', \'compute_uv\', \'full_matrices\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Switch"
+    argspec: "args=[\'data\', \'pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SymbolicGradient"
+    argspec: "args=[\'input\', \'Tout\', \'f\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TFRecordDataset"
+    argspec: "args=[\'filenames\', \'compression_type\', \'buffer_size\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TFRecordReader"
+    argspec: "args=[\'container\', \'shared_name\', \'compression_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TFRecordReaderV2"
+    argspec: "args=[\'container\', \'shared_name\', \'compression_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TakeDataset"
+    argspec: "args=[\'input_dataset\', \'count\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TakeManySparseFromTensorsMap"
+    argspec: "args=[\'sparse_handles\', \'dtype\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Tan"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Tanh"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TanhGrad"
+    argspec: "args=[\'y\', \'dy\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TemporaryVariable"
+    argspec: "args=[\'shape\', \'dtype\', \'var_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArray"
+    argspec: "args=[\'size\', \'dtype\', \'dynamic_size\', \'clear_after_read\', \'tensor_array_name\', \'element_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayClose"
+    argspec: "args=[\'handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayCloseV2"
+    argspec: "args=[\'handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayCloseV3"
+    argspec: "args=[\'handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayConcat"
+    argspec: "args=[\'handle\', \'flow_in\', \'dtype\', \'element_shape_except0\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayConcatV2"
+    argspec: "args=[\'handle\', \'flow_in\', \'dtype\', \'element_shape_except0\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayConcatV3"
+    argspec: "args=[\'handle\', \'flow_in\', \'dtype\', \'element_shape_except0\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayGather"
+    argspec: "args=[\'handle\', \'indices\', \'flow_in\', \'dtype\', \'element_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayGatherV2"
+    argspec: "args=[\'handle\', \'indices\', \'flow_in\', \'dtype\', \'element_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayGatherV3"
+    argspec: "args=[\'handle\', \'indices\', \'flow_in\', \'dtype\', \'element_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayGrad"
+    argspec: "args=[\'handle\', \'flow_in\', \'source\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayGradV2"
+    argspec: "args=[\'handle\', \'flow_in\', \'source\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayGradV3"
+    argspec: "args=[\'handle\', \'flow_in\', \'source\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayGradWithShape"
+    argspec: "args=[\'handle\', \'flow_in\', \'shape_to_prepend\', \'source\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayPack"
+    argspec: "args=[\'handle\', \'flow_in\', \'dtype\', \'element_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayRead"
+    argspec: "args=[\'handle\', \'index\', \'flow_in\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayReadV2"
+    argspec: "args=[\'handle\', \'index\', \'flow_in\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayReadV3"
+    argspec: "args=[\'handle\', \'index\', \'flow_in\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayScatter"
+    argspec: "args=[\'handle\', \'indices\', \'value\', \'flow_in\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayScatterV2"
+    argspec: "args=[\'handle\', \'indices\', \'value\', \'flow_in\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayScatterV3"
+    argspec: "args=[\'handle\', \'indices\', \'value\', \'flow_in\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArraySize"
+    argspec: "args=[\'handle\', \'flow_in\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArraySizeV2"
+    argspec: "args=[\'handle\', \'flow_in\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArraySizeV3"
+    argspec: "args=[\'handle\', \'flow_in\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArraySplit"
+    argspec: "args=[\'handle\', \'value\', \'lengths\', \'flow_in\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArraySplitV2"
+    argspec: "args=[\'handle\', \'value\', \'lengths\', \'flow_in\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArraySplitV3"
+    argspec: "args=[\'handle\', \'value\', \'lengths\', \'flow_in\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayUnpack"
+    argspec: "args=[\'handle\', \'value\', \'flow_in\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayV2"
+    argspec: "args=[\'size\', \'dtype\', \'element_shape\', \'dynamic_size\', \'clear_after_read\', \'tensor_array_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayV3"
+    argspec: "args=[\'size\', \'dtype\', \'element_shape\', \'dynamic_size\', \'clear_after_read\', \'identical_element_shapes\', \'tensor_array_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayWrite"
+    argspec: "args=[\'handle\', \'index\', \'value\', \'flow_in\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayWriteV2"
+    argspec: "args=[\'handle\', \'index\', \'value\', \'flow_in\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayWriteV3"
+    argspec: "args=[\'handle\', \'index\', \'value\', \'flow_in\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorDataset"
+    argspec: "args=[\'components\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorListConcat"
+    argspec: "args=[\'input_handle\', \'element_dtype\', \'element_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorListConcatLists"
+    argspec: "args=[\'input_a\', \'input_b\', \'element_dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorListConcatV2"
+    argspec: "args=[\'input_handle\', \'element_shape\', \'leading_dims\', \'element_dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorListElementShape"
+    argspec: "args=[\'input_handle\', \'shape_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorListFromTensor"
+    argspec: "args=[\'tensor\', \'element_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorListGather"
+    argspec: "args=[\'input_handle\', \'indices\', \'element_shape\', \'element_dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorListGetItem"
+    argspec: "args=[\'input_handle\', \'index\', \'element_shape\', \'element_dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorListLength"
+    argspec: "args=[\'input_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorListPopBack"
+    argspec: "args=[\'input_handle\', \'element_shape\', \'element_dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorListPushBack"
+    argspec: "args=[\'input_handle\', \'tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorListPushBackBatch"
+    argspec: "args=[\'input_handles\', \'tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorListReserve"
+    argspec: "args=[\'element_shape\', \'num_elements\', \'element_dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorListResize"
+    argspec: "args=[\'input_handle\', \'size\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorListScatter"
+    argspec: "args=[\'tensor\', \'indices\', \'element_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorListScatterIntoExistingList"
+    argspec: "args=[\'input_handle\', \'tensor\', \'indices\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorListScatterV2"
+    argspec: "args=[\'tensor\', \'indices\', \'element_shape\', \'num_elements\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorListSetItem"
+    argspec: "args=[\'input_handle\', \'index\', \'item\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorListSplit"
+    argspec: "args=[\'tensor\', \'element_shape\', \'lengths\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorListStack"
+    argspec: "args=[\'input_handle\', \'element_shape\', \'element_dtype\', \'num_elements\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorScatterAdd"
+    argspec: "args=[\'tensor\', \'indices\', \'updates\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorScatterSub"
+    argspec: "args=[\'tensor\', \'indices\', \'updates\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorScatterUpdate"
+    argspec: "args=[\'tensor\', \'indices\', \'updates\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorSliceDataset"
+    argspec: "args=[\'components\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorSummary"
+    argspec: "args=[\'tensor\', \'description\', \'labels\', \'display_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorSummaryV2"
+    argspec: "args=[\'tag\', \'tensor\', \'serialized_summary_metadata\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TextLineDataset"
+    argspec: "args=[\'filenames\', \'compression_type\', \'buffer_size\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TextLineReader"
+    argspec: "args=[\'skip_header_lines\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TextLineReaderV2"
+    argspec: "args=[\'skip_header_lines\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ThreadUnsafeUnigramCandidateSampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Tile"
+    argspec: "args=[\'input\', \'multiples\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TileGrad"
+    argspec: "args=[\'input\', \'multiples\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Timestamp"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TopK"
+    argspec: "args=[\'input\', \'k\', \'sorted\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TopKV2"
+    argspec: "args=[\'input\', \'k\', \'sorted\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Transpose"
+    argspec: "args=[\'x\', \'perm\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TridiagonalSolve"
+    argspec: "args=[\'diagonals\', \'rhs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TruncateDiv"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TruncateMod"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TruncatedNormal"
+    argspec: "args=[\'shape\', \'seed\', \'seed2\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "UnicodeDecode"
+    argspec: "args=[\'input\', \'input_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "UnicodeDecodeWithOffsets"
+    argspec: "args=[\'input\', \'input_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "UnicodeEncode"
+    argspec: "args=[\'input_values\', \'input_splits\', \'errors\', \'output_encoding\', \'replacement_char\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "UnicodeScript"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "UnicodeTranscode"
+    argspec: "args=[\'input\', \'input_encoding\', \'output_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "UniformCandidateSampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Unique"
+    argspec: "args=[\'x\', \'out_idx\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "UniqueV2"
+    argspec: "args=[\'x\', \'axis\', \'out_idx\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "UniqueWithCounts"
+    argspec: "args=[\'x\', \'out_idx\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "UniqueWithCountsV2"
+    argspec: "args=[\'x\', \'axis\', \'out_idx\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Unpack"
+    argspec: "args=[\'value\', \'num\', \'axis\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "UnravelIndex"
+    argspec: "args=[\'indices\', \'dims\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "UnsortedSegmentMax"
+    argspec: "args=[\'data\', \'segment_ids\', \'num_segments\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "UnsortedSegmentMin"
+    argspec: "args=[\'data\', \'segment_ids\', \'num_segments\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "UnsortedSegmentProd"
+    argspec: "args=[\'data\', \'segment_ids\', \'num_segments\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "UnsortedSegmentSum"
+    argspec: "args=[\'data\', \'segment_ids\', \'num_segments\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Unstage"
+    argspec: "args=[\'capacity\', \'memory_limit\', \'dtypes\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "UnwrapDatasetVariant"
+    argspec: "args=[\'input_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "UpperBound"
+    argspec: "args=[\'sorted_inputs\', \'values\', \'out_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "VarHandleOp"
+    argspec: "args=[\'container\', \'shared_name\', \'dtype\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "VarIsInitializedOp"
+    argspec: "args=[\'resource\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Variable"
+    argspec: "args=[\'shape\', \'dtype\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "VariableShape"
+    argspec: "args=[\'input\', \'out_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "VariableV2"
+    argspec: "args=[\'shape\', \'dtype\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Where"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "While"
+    argspec: "args=[\'input\', \'cond\', \'body\', \'output_shapes\', \'parallel_iterations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "WholeFileReader"
+    argspec: "args=[\'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "WholeFileReaderV2"
+    argspec: "args=[\'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "WindowDataset"
+    argspec: "args=[\'input_dataset\', \'size\', \'shift\', \'stride\', \'drop_remainder\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "WrapDatasetVariant"
+    argspec: "args=[\'input_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "WriteAudioSummary"
+    argspec: "args=[\'writer\', \'step\', \'tag\', \'tensor\', \'sample_rate\', \'max_outputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "WriteFile"
+    argspec: "args=[\'filename\', \'contents\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "WriteGraphSummary"
+    argspec: "args=[\'writer\', \'step\', \'tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "WriteHistogramSummary"
+    argspec: "args=[\'writer\', \'step\', \'tag\', \'values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "WriteImageSummary"
+    argspec: "args=[\'writer\', \'step\', \'tag\', \'tensor\', \'bad_color\', \'max_images\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "WriteScalarSummary"
+    argspec: "args=[\'writer\', \'step\', \'tag\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "WriteSummary"
+    argspec: "args=[\'writer\', \'step\', \'tensor\', \'tag\', \'summary_metadata\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Xdivy"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Xlogy"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ZerosLike"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Zeta"
+    argspec: "args=[\'x\', \'q\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ZipDataset"
+    argspec: "args=[\'input_datasets\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.saved_model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.pbtxt
index 2a7c78910526f83fdfcd963c21996b4f4dc4bc28..5216f4e2ed063c72999654de022db37bbedb6a63 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.saved_model.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.pbtxt
@@ -164,6 +164,10 @@ tf_module {
     name: "load"
     argspec: "args=[\'sess\', \'tags\', \'export_dir\', \'import_scope\'], varargs=None, keywords=saver_kwargs, defaults=[\'None\'], "
   }
+  member_method {
+    name: "load_v2"
+    argspec: "args=[\'export_dir\', \'tags\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "main_op_with_restore"
     argspec: "args=[\'restore_op_name\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.sparse.-sparse-tensor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.sparse.-sparse-tensor.pbtxt
index 02e59a63e10b1a24bfe0c275044bf807b433f62e..a31689a58bceb91ccfb3fa91d8b778c6c25cc929 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.sparse.-sparse-tensor.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.sparse.-sparse-tensor.pbtxt
@@ -2,6 +2,7 @@ path: "tensorflow.sparse.SparseTensor"
 tf_class {
   is_instance: "<class \'tensorflow.python.framework.sparse_tensor.SparseTensor\'>"
   is_instance: "<class \'tensorflow.python.framework.ops._TensorLike\'>"
+  is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "dense_shape"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt
index 33e342bc75486be0bccffc1e36a94e147f934432..d3543e2e19def45db6b6d627b54475af2ae28c28 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt
@@ -14,7 +14,7 @@ tf_module {
   }
   member_method {
     name: "concat"
-    argspec: "args=[\'axis\', \'sp_inputs\', \'name\', \'expand_nonconcat_dim\', \'concat_dim\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'axis\', \'sp_inputs\', \'name\', \'expand_nonconcat_dim\', \'concat_dim\', \'expand_nonconcat_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'None\'], "
   }
   member_method {
     name: "cross"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt
index a1cd581a86bc2132bfa04ac3f3433e84b6365b19..ada8be91454b190875f6f078328c8f5279bd4784 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt
@@ -1,5 +1,9 @@
 path: "tensorflow.strings"
 tf_module {
+  member_method {
+    name: "as_string"
+    argspec: "args=[\'input\', \'precision\', \'scientific\', \'shortest\', \'width\', \'fill\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'False\', \'False\', \'-1\', \'\', \'None\'], "
+  }
   member_method {
     name: "format"
     argspec: "args=[\'template\', \'inputs\', \'placeholder\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'{}\', \'3\', \'None\'], "
@@ -14,7 +18,7 @@ tf_module {
   }
   member_method {
     name: "reduce_join"
-    argspec: "args=[\'inputs\', \'axis\', \'keep_dims\', \'separator\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'\', \'None\', \'None\'], "
+    argspec: "args=[\'inputs\', \'axis\', \'keep_dims\', \'separator\', \'name\', \'reduction_indices\', \'keepdims\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "regex_full_match"
@@ -38,7 +42,7 @@ tf_module {
   }
   member_method {
     name: "to_hash_bucket"
-    argspec: "args=[\'string_tensor\', \'num_buckets\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'string_tensor\', \'num_buckets\', \'name\', \'input\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "to_hash_bucket_fast"
@@ -50,7 +54,15 @@ tf_module {
   }
   member_method {
     name: "to_number"
-    argspec: "args=[\'string_tensor\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
+    argspec: "args=[\'string_tensor\', \'out_type\', \'name\', \'input\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'float32\'>\", \'None\', \'None\'], "
+  }
+  member_method {
+    name: "unicode_decode"
+    argspec: "args=[\'input\', \'input_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "unicode_decode_with_offsets"
+    argspec: "args=[\'input\', \'input_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'False\', \'None\'], "
   }
   member_method {
     name: "unicode_encode"
@@ -60,6 +72,14 @@ tf_module {
     name: "unicode_script"
     argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "unicode_split"
+    argspec: "args=[\'input\', \'input_encoding\', \'errors\', \'replacement_char\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'None\'], "
+  }
+  member_method {
+    name: "unicode_split_with_offsets"
+    argspec: "args=[\'input\', \'input_encoding\', \'errors\', \'replacement_char\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'None\'], "
+  }
   member_method {
     name: "unicode_transcode"
     argspec: "args=[\'input\', \'input_encoding\', \'output_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'False\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.summary.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.summary.pbtxt
index 7ed9cd77a01c2eadb5ea43a02306d60d505127a0..3879645d60249b18664b77125917d2066a063662 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.summary.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.summary.pbtxt
@@ -44,6 +44,10 @@ tf_module {
     name: "image"
     argspec: "args=[\'name\', \'tensor\', \'max_outputs\', \'collections\', \'family\'], varargs=None, keywords=None, defaults=[\'3\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "initialize"
+    argspec: "args=[\'graph\', \'session\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "merge"
     argspec: "args=[\'inputs\', \'collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-adadelta-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-adadelta-optimizer.pbtxt
index 1f1d8b6f9e2cde4800cdef9c417191b1a0ce07b5..6ed8f934f307b88c26993176b1838d202d187b17 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-adadelta-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-adadelta-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.AdadeltaOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.adadelta.AdadeltaOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-adagrad-d-a-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-adagrad-d-a-optimizer.pbtxt
index a7c05d484905a0af26c80a52d92623ef4a3eb6c4..c57b3d8ed5a78b0417fd0e927e447cb3c0d9dd96 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-adagrad-d-a-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-adagrad-d-a-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.AdagradDAOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.adagrad_da.AdagradDAOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-adagrad-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-adagrad-optimizer.pbtxt
index bc8b92389c6ed7dcb0fa23ff3abd86bb0d1c488a..897df3ed231d5e91c417e78c48e062a591308cf8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-adagrad-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-adagrad-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.AdagradOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.adagrad.AdagradOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-adam-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-adam-optimizer.pbtxt
index 5d17be9378fd130b89e199544f85e03a23a71d3c..cb8b5d366c4c5e853e99ca79737886b62f9503ce 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-adam-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-adam-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.AdamOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.adam.AdamOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-checkpoint.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-checkpoint.pbtxt
index 5be37200f368b1823093c67ad7042db534b0df93..629bc13612ab567006436bd95fee49c4e3acdefe 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-checkpoint.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-checkpoint.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.train.Checkpoint"
 tf_class {
-  is_instance: "<class \'tensorflow.python.training.checkpointable.util.Checkpoint\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.tracking.Checkpointable\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.util.Checkpoint\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.tracking.AutoTrackable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "save_counter"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-ftrl-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-ftrl-optimizer.pbtxt
index d265fdeb01c38d8a1347e630d7f7bff111999634..1d1aceb0138d264501758a26eba75791d5b9f735 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-ftrl-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-ftrl-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.FtrlOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.ftrl.FtrlOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-gradient-descent-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-gradient-descent-optimizer.pbtxt
index c673e29cd4dd6cd3c01582abfbc306c092818892..b998e848c2ba6a585d3820549d1d873bf04538cf 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-gradient-descent-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-gradient-descent-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.GradientDescentOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.gradient_descent.GradientDescentOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-momentum-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-momentum-optimizer.pbtxt
index 8199f63b9b8c64c73a3d62294277838cdc240280..2de61d67f717786152515d414bed6ccd574aa58f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-momentum-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-momentum-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.MomentumOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.momentum.MomentumOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-optimizer.pbtxt
index 876bb35e391885e751066a415967af848280c714..8baa56902581d8ac405f95992daa29ae4a9fd1e6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-optimizer.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.train.Optimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-proximal-adagrad-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-proximal-adagrad-optimizer.pbtxt
index 14349a74efb61124fc7b5568d5ec023f08b1b62f..626b75335461fc13a0bdc73b220d7e562a5a6c46 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-proximal-adagrad-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-proximal-adagrad-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.ProximalAdagradOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.proximal_adagrad.ProximalAdagradOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt
index 7d982dc51f6edce1cf691671e31ddd07664f0dc1..9c0dca030d2fa4c75315fb60df4cc2019271b41e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.ProximalGradientDescentOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.proximal_gradient_descent.ProximalGradientDescentOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-r-m-s-prop-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-r-m-s-prop-optimizer.pbtxt
index 906384a2875bf7b05ac26fc43207f4ef9b5a7472..61ae458c01750493d87bc53f3be5c660ab912f5d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-r-m-s-prop-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-r-m-s-prop-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.RMSPropOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.rmsprop.RMSPropOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-saver.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-saver.pbtxt
index 2cda458f468b2d748b43954b14b670df7145243f..a91ba5b6722d9929fc857091a5b2c6ae5cf188f9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-saver.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-saver.pbtxt
@@ -20,7 +20,7 @@ tf_class {
   }
   member_method {
     name: "export_meta_graph"
-    argspec: "args=[\'self\', \'filename\', \'collection_list\', \'as_text\', \'export_scope\', \'clear_devices\', \'clear_extraneous_savers\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'None\', \'False\', \'False\', \'False\'], "
+    argspec: "args=[\'self\', \'filename\', \'collection_list\', \'as_text\', \'export_scope\', \'clear_devices\', \'clear_extraneous_savers\', \'strip_default_attrs\', \'save_debug_info\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'None\', \'False\', \'False\', \'False\', \'False\'], "
   }
   member_method {
     name: "from_proto"
@@ -36,7 +36,7 @@ tf_class {
   }
   member_method {
     name: "save"
-    argspec: "args=[\'self\', \'sess\', \'save_path\', \'global_step\', \'latest_filename\', \'meta_graph_suffix\', \'write_meta_graph\', \'write_state\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'meta\', \'True\', \'True\', \'False\'], "
+    argspec: "args=[\'self\', \'sess\', \'save_path\', \'global_step\', \'latest_filename\', \'meta_graph_suffix\', \'write_meta_graph\', \'write_state\', \'strip_default_attrs\', \'save_debug_info\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'meta\', \'True\', \'True\', \'False\', \'False\'], "
   }
   member_method {
     name: "set_last_checkpoints"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-sync-replicas-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-sync-replicas-optimizer.pbtxt
index 2c0fda3c72b7e1f02265827b9dc1929500935cd1..b812d6f1ef4ae8f4173c2b23a010935dcc6cabcf 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.-sync-replicas-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-sync-replicas-optimizer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.train.SyncReplicasOptimizer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.sync_replicas_optimizer.SyncReplicasOptimizer\'>"
   is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "GATE_GRAPH"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.pbtxt
index bdb3ea2197c78dd17357f2753f05638c3c054bd6..ab7b199d8050ccda036eebfd590d99720e415b92 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.pbtxt
@@ -298,7 +298,7 @@ tf_module {
   }
   member_method {
     name: "export_meta_graph"
-    argspec: "args=[\'filename\', \'meta_info_def\', \'graph_def\', \'saver_def\', \'collection_list\', \'as_text\', \'graph\', \'export_scope\', \'clear_devices\', \'clear_extraneous_savers\', \'strip_default_attrs\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'False\', \'None\', \'None\', \'False\', \'False\', \'False\'], "
+    argspec: "args=[\'filename\', \'meta_info_def\', \'graph_def\', \'saver_def\', \'collection_list\', \'as_text\', \'graph\', \'export_scope\', \'clear_devices\', \'clear_extraneous_savers\', \'strip_default_attrs\', \'save_debug_info\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'False\', \'None\', \'None\', \'False\', \'False\', \'False\', \'False\'], "
   }
   member_method {
     name: "generate_checkpoint_state_proto"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-aggregation-method.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-aggregation-method.pbtxt
index f79029d3fe0b88a454b11456b3785c3ae28a253c..cc2d5c87d667fb5c4af6b6fc435ae626334fe2d1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-aggregation-method.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-aggregation-method.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.AggregationMethod"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.gradients_impl.AggregationMethod\'>"
+  is_instance: "<class \'tensorflow.python.ops.gradients_util.AggregationMethod\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "ADD_N"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-critical-section.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-critical-section.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..024a20834635e2fc75ad7e6a393fc8f092d9631a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-critical-section.pbtxt
@@ -0,0 +1,17 @@
+path: "tensorflow.CriticalSection"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.critical_section_ops.CriticalSection\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'shared_name\', \'critical_section_def\', \'import_scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "execute"
+    argspec: "args=[\'self\', \'fn\', \'exclusive_resource_access\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-event.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-event.pbtxt
deleted file mode 100644
index 3b75a1735be76fe77689736e492c42c54ab795c1..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-event.pbtxt
+++ /dev/null
@@ -1,74 +0,0 @@
-path: "tensorflow.Event"
-tf_proto {
-  descriptor {
-    name: "Event"
-    field {
-      name: "wall_time"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_DOUBLE
-    }
-    field {
-      name: "step"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "file_version"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-      oneof_index: 0
-    }
-    field {
-      name: "graph_def"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_BYTES
-      oneof_index: 0
-    }
-    field {
-      name: "summary"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.Summary"
-      oneof_index: 0
-    }
-    field {
-      name: "log_message"
-      number: 6
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.LogMessage"
-      oneof_index: 0
-    }
-    field {
-      name: "session_log"
-      number: 7
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.SessionLog"
-      oneof_index: 0
-    }
-    field {
-      name: "tagged_run_metadata"
-      number: 8
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.TaggedRunMetadata"
-      oneof_index: 0
-    }
-    field {
-      name: "meta_graph_def"
-      number: 9
-      label: LABEL_OPTIONAL
-      type: TYPE_BYTES
-      oneof_index: 0
-    }
-    oneof_decl {
-      name: "what"
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-indexed-slices.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-indexed-slices.pbtxt
index fee84d85307dffb675b507a31c4f1fda60de869d..5b47c718a5753905a4fa426b739dad4b01678c3f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-indexed-slices.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-indexed-slices.pbtxt
@@ -2,6 +2,7 @@ path: "tensorflow.IndexedSlices"
 tf_class {
   is_instance: "<class \'tensorflow.python.framework.ops.IndexedSlices\'>"
   is_instance: "<class \'tensorflow.python.framework.ops._TensorLike\'>"
+  is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "dense_shape"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-module.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-module.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8d599d73b84fc51b8bf3001f6773011e42f09456
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-module.pbtxt
@@ -0,0 +1,35 @@
+path: "tensorflow.Module"
+tf_class {
+  is_instance: "<class \'tensorflow.python.module.module.Module\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.tracking.AutoTrackable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name_scope"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "submodules"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "no_name_scope"
+    argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-ragged-tensor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-ragged-tensor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..481a8c73ac351cc0ef38ee3681d5134f06334421
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-ragged-tensor.pbtxt
@@ -0,0 +1,126 @@
+path: "tensorflow.RaggedTensor"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.ragged.ragged_tensor.RaggedTensor\'>"
+  is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "flat_values"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "nested_row_splits"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "ragged_rank"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "row_splits"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "values"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'values\', \'row_splits\', \'cached_row_lengths\', \'cached_value_rowids\', \'cached_nrows\', \'internal\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "bounding_shape"
+    argspec: "args=[\'self\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "from_nested_row_lengths"
+    argspec: "args=[\'cls\', \'flat_values\', \'nested_row_lengths\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_nested_row_splits"
+    argspec: "args=[\'cls\', \'flat_values\', \'nested_row_splits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_nested_value_rowids"
+    argspec: "args=[\'cls\', \'flat_values\', \'nested_value_rowids\', \'nested_nrows\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "from_row_lengths"
+    argspec: "args=[\'cls\', \'values\', \'row_lengths\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_row_limits"
+    argspec: "args=[\'cls\', \'values\', \'row_limits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_row_splits"
+    argspec: "args=[\'cls\', \'values\', \'row_splits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_row_starts"
+    argspec: "args=[\'cls\', \'values\', \'row_starts\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_sparse"
+    argspec: "args=[\'cls\', \'st_input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_tensor"
+    argspec: "args=[\'cls\', \'tensor\', \'lengths\', \'padding\', \'ragged_rank\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\'], "
+  }
+  member_method {
+    name: "from_value_rowids"
+    argspec: "args=[\'cls\', \'values\', \'value_rowids\', \'nrows\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "nested_row_lengths"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "nrows"
+    argspec: "args=[\'self\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'None\'], "
+  }
+  member_method {
+    name: "row_lengths"
+    argspec: "args=[\'self\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
+  }
+  member_method {
+    name: "row_limits"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "row_starts"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "to_list"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "to_sparse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "to_tensor"
+    argspec: "args=[\'self\', \'default_value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "value_rowids"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "with_flat_values"
+    argspec: "args=[\'self\', \'new_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "with_values"
+    argspec: "args=[\'self\', \'new_values\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-sparse-tensor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-sparse-tensor.pbtxt
index 3add49e90d7eb5094ad68d1474e834404549c988..64f7260369d7cbc656ad3d23b69cc9079e030f95 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-sparse-tensor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-sparse-tensor.pbtxt
@@ -2,6 +2,7 @@ path: "tensorflow.SparseTensor"
 tf_class {
   is_instance: "<class \'tensorflow.python.framework.sparse_tensor.SparseTensor\'>"
   is_instance: "<class \'tensorflow.python.framework.ops._TensorLike\'>"
+  is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "dense_shape"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-summary-metadata.-plugin-data.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-summary-metadata.-plugin-data.pbtxt
deleted file mode 100644
index a66b74b315c6132e8f884bd52e7a3b5bd7f52ccd..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-summary-metadata.-plugin-data.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-path: "tensorflow.SummaryMetadata.PluginData"
-tf_proto {
-  descriptor {
-    name: "PluginData"
-    field {
-      name: "plugin_name"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "content"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_BYTES
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-summary-metadata.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-summary-metadata.pbtxt
deleted file mode 100644
index c02575b9626c848e9b871d2cc6febb26a5142f08..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-summary-metadata.pbtxt
+++ /dev/null
@@ -1,40 +0,0 @@
-path: "tensorflow.SummaryMetadata"
-tf_proto {
-  descriptor {
-    name: "SummaryMetadata"
-    field {
-      name: "plugin_data"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.SummaryMetadata.PluginData"
-    }
-    field {
-      name: "display_name"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "summary_description"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    nested_type {
-      name: "PluginData"
-      field {
-        name: "plugin_name"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "content"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_BYTES
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-summary.-audio.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-summary.-audio.pbtxt
deleted file mode 100644
index 94f712073e0d0dda201fcf7adba849dd45a1229b..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-summary.-audio.pbtxt
+++ /dev/null
@@ -1,36 +0,0 @@
-path: "tensorflow.Summary.Audio"
-tf_proto {
-  descriptor {
-    name: "Audio"
-    field {
-      name: "sample_rate"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_FLOAT
-    }
-    field {
-      name: "num_channels"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "length_frames"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "encoded_audio_string"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_BYTES
-    }
-    field {
-      name: "content_type"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-summary.-image.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-summary.-image.pbtxt
deleted file mode 100644
index fc1acb483b3051cba01f5d9bc8501a61965bbc37..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-summary.-image.pbtxt
+++ /dev/null
@@ -1,30 +0,0 @@
-path: "tensorflow.Summary.Image"
-tf_proto {
-  descriptor {
-    name: "Image"
-    field {
-      name: "height"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "width"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "colorspace"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "encoded_image_string"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_BYTES
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-summary.-value.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-summary.-value.pbtxt
deleted file mode 100644
index feb84b6ee996549ac58aa0e8a4ac560f947b6339..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-summary.-value.pbtxt
+++ /dev/null
@@ -1,74 +0,0 @@
-path: "tensorflow.Summary.Value"
-tf_proto {
-  descriptor {
-    name: "Value"
-    field {
-      name: "node_name"
-      number: 7
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "tag"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "metadata"
-      number: 9
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.SummaryMetadata"
-    }
-    field {
-      name: "simple_value"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_FLOAT
-      oneof_index: 0
-    }
-    field {
-      name: "obsolete_old_style_histogram"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_BYTES
-      oneof_index: 0
-    }
-    field {
-      name: "image"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.Summary.Image"
-      oneof_index: 0
-    }
-    field {
-      name: "histo"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.HistogramProto"
-      oneof_index: 0
-    }
-    field {
-      name: "audio"
-      number: 6
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.Summary.Audio"
-      oneof_index: 0
-    }
-    field {
-      name: "tensor"
-      number: 8
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.TensorProto"
-      oneof_index: 0
-    }
-    oneof_decl {
-      name: "value"
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-summary.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-summary.pbtxt
deleted file mode 100644
index b2bdff7171804aae114d1e3631e3074b1e4006ba..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-summary.pbtxt
+++ /dev/null
@@ -1,144 +0,0 @@
-path: "tensorflow.Summary"
-tf_proto {
-  descriptor {
-    name: "Summary"
-    field {
-      name: "value"
-      number: 1
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.Summary.Value"
-    }
-    nested_type {
-      name: "Image"
-      field {
-        name: "height"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_INT32
-      }
-      field {
-        name: "width"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_INT32
-      }
-      field {
-        name: "colorspace"
-        number: 3
-        label: LABEL_OPTIONAL
-        type: TYPE_INT32
-      }
-      field {
-        name: "encoded_image_string"
-        number: 4
-        label: LABEL_OPTIONAL
-        type: TYPE_BYTES
-      }
-    }
-    nested_type {
-      name: "Audio"
-      field {
-        name: "sample_rate"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_FLOAT
-      }
-      field {
-        name: "num_channels"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_INT64
-      }
-      field {
-        name: "length_frames"
-        number: 3
-        label: LABEL_OPTIONAL
-        type: TYPE_INT64
-      }
-      field {
-        name: "encoded_audio_string"
-        number: 4
-        label: LABEL_OPTIONAL
-        type: TYPE_BYTES
-      }
-      field {
-        name: "content_type"
-        number: 5
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-    }
-    nested_type {
-      name: "Value"
-      field {
-        name: "node_name"
-        number: 7
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "tag"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "metadata"
-        number: 9
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.SummaryMetadata"
-      }
-      field {
-        name: "simple_value"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_FLOAT
-        oneof_index: 0
-      }
-      field {
-        name: "obsolete_old_style_histogram"
-        number: 3
-        label: LABEL_OPTIONAL
-        type: TYPE_BYTES
-        oneof_index: 0
-      }
-      field {
-        name: "image"
-        number: 4
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.Summary.Image"
-        oneof_index: 0
-      }
-      field {
-        name: "histo"
-        number: 5
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.HistogramProto"
-        oneof_index: 0
-      }
-      field {
-        name: "audio"
-        number: 6
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.Summary.Audio"
-        oneof_index: 0
-      }
-      field {
-        name: "tensor"
-        number: 8
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.TensorProto"
-        oneof_index: 0
-      }
-      oneof_decl {
-        name: "value"
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-tensor-shape.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-tensor-shape.pbtxt
index bee19520b7736967533c6d30a1862e3c48d03fc2..60518ffadc833b0ab07e25c9b738aed7f7e08f20 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-tensor-shape.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-tensor-shape.pbtxt
@@ -1,7 +1,6 @@
 path: "tensorflow.TensorShape"
 tf_class {
-  is_instance: "<class \'tensorflow.python.framework.tensor_shape.TensorShapeV2\'>"
-  is_instance: "<class \'tensorflow.python.framework.tensor_shape.TensorShapeV1\'>"
+  is_instance: "<class \'tensorflow.python.framework.tensor_shape.TensorShape\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "dims"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-variable.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-variable.pbtxt
index 6136c8fbe79ef8d3851c39b8f11ac3c33f6050f2..03fd32fdebf80745cb264afd81d08bf7054aebaf 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-variable.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-variable.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.Variable"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.variables.Variable\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "SaveSliceInfo"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.audio.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.audio.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6c5724078357125255acd413902c4a5e57cb719e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.audio.pbtxt
@@ -0,0 +1,11 @@
+path: "tensorflow.audio"
+tf_module {
+  member_method {
+    name: "decode_wav"
+    argspec: "args=[\'contents\', \'desired_channels\', \'desired_samples\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], "
+  }
+  member_method {
+    name: "encode_wav"
+    argspec: "args=[\'audio\', \'sample_rate\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.autograph.experimental.-feature.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.autograph.experimental.-feature.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1f04d028efdc895e493c9e60e1c9025fc26de4f3
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.autograph.experimental.-feature.pbtxt
@@ -0,0 +1,36 @@
+path: "tensorflow.autograph.experimental.Feature"
+tf_class {
+  is_instance: "<enum \'Feature\'>"
+  member {
+    name: "ALL"
+    mtype: "<enum \'Feature\'>"
+  }
+  member {
+    name: "ASSERT_STATEMENTS"
+    mtype: "<enum \'Feature\'>"
+  }
+  member {
+    name: "AUTO_CONTROL_DEPS"
+    mtype: "<enum \'Feature\'>"
+  }
+  member {
+    name: "BUILTIN_FUNCTIONS"
+    mtype: "<enum \'Feature\'>"
+  }
+  member {
+    name: "ERROR_REWRITING"
+    mtype: "<enum \'Feature\'>"
+  }
+  member {
+    name: "LISTS"
+    mtype: "<enum \'Feature\'>"
+  }
+  member {
+    name: "LOGICAL_EXPRESSIONS"
+    mtype: "<enum \'Feature\'>"
+  }
+  member {
+    name: "NAME_SCOPES"
+    mtype: "<enum \'Feature\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.autograph.experimental.-verbosity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.autograph.experimental.-verbosity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c4d5b77c0738feb1fa6ea69672ee3fafa51de5be
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.autograph.experimental.-verbosity.pbtxt
@@ -0,0 +1,12 @@
+path: "tensorflow.autograph.experimental.Verbosity"
+tf_class {
+  is_instance: "<enum \'Verbosity\'>"
+  member {
+    name: "BRIEF"
+    mtype: "<enum \'Verbosity\'>"
+  }
+  member {
+    name: "VERBOSE"
+    mtype: "<enum \'Verbosity\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.autograph.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.autograph.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5747dac7ab201443d1f237415cd280aee672a8ff
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.autograph.experimental.pbtxt
@@ -0,0 +1,11 @@
+path: "tensorflow.autograph.experimental"
+tf_module {
+  member {
+    name: "Feature"
+    mtype: "<class \'enum.EnumMeta\'>"
+  }
+  member {
+    name: "Verbosity"
+    mtype: "<class \'enum.EnumMeta\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.autograph.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.autograph.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0baf6e03552f5b12e5f2e48f87cf1ec7332787bb
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.autograph.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.autograph"
+tf_module {
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
+  }
+  member_method {
+    name: "set_verbosity"
+    argspec: "args=[\'level\', \'alsologtostdout\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "to_code"
+    argspec: "args=[\'entity\', \'recursive\', \'arg_values\', \'arg_types\', \'indentation\', \'experimental_optional_features\', \'experimental_partial_types\'], varargs=None, keywords=None, defaults=[\'True\', \'None\', \'None\', \'  \', \'Feature.ALL\', \'None\'], "
+  }
+  member_method {
+    name: "to_graph"
+    argspec: "args=[\'entity\', \'recursive\', \'arg_values\', \'arg_types\', \'experimental_optional_features\', \'experimental_strip_decorators\', \'experimental_verbose\', \'experimental_partial_types\'], varargs=None, keywords=None, defaults=[\'True\', \'None\', \'None\', \'Feature.ALL\', \'None\', \'Verbosity.BRIEF\', \'None\'], "
+  }
+  member_method {
+    name: "trace"
+    argspec: "args=[], varargs=args, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.compat.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.compat.pbtxt
index f1d760603e981a0b9a72fdc379dc81932ac71d67..95352dff3a6b9341857b3f3b82dcb6817e4553ab 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.compat.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.compat.pbtxt
@@ -32,6 +32,14 @@ tf_module {
     name: "as_text"
     argspec: "args=[\'bytes_or_text\', \'encoding\'], varargs=None, keywords=None, defaults=[\'utf-8\'], "
   }
+  member_method {
+    name: "dimension_at_index"
+    argspec: "args=[\'shape\', \'index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "dimension_value"
+    argspec: "args=[\'dimension\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "forward_compatibility_horizon"
     argspec: "args=[\'year\', \'month\', \'day\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.config.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.config.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..09076a8302610d769e811d8e34333d946d51bc8c
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.config.pbtxt
@@ -0,0 +1,15 @@
+path: "tensorflow.config"
+tf_module {
+  member_method {
+    name: "experimental_connect_to_host"
+    argspec: "args=[\'remote_host\', \'job_name\'], varargs=None, keywords=None, defaults=[\'None\', \'worker\'], "
+  }
+  member_method {
+    name: "experimental_list_devices"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_run_functions_eagerly"
+    argspec: "args=[\'run_eagerly\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.constant_initializer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.constant_initializer.pbtxt
index 00ec669b1685f3cbdacd676bac61755bebb9f6da..437131abb2d4512c547635117ee0f9c2e1c3b284 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.constant_initializer.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.constant_initializer.pbtxt
@@ -1,11 +1,11 @@
 path: "tensorflow.constant_initializer"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Constant\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Constant\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'value\', \'dtype\', \'verify_shape\'], varargs=None, keywords=None, defaults=[\'0\', \"<dtype: \'float32\'>\", \'False\'], "
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=[\'0\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt
index d877339409d781f95f7ff75a553d21d82c27fc40..195c10445d4991616c11f66dcda2ad4b744940f2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt
@@ -16,6 +16,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
+    argspec: "args=[\'self\', \'variant_tensor\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "apply"
@@ -89,6 +90,10 @@ tf_class {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "shard"
+    argspec: "args=[\'self\', \'num_shards\', \'index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "shuffle"
     argspec: "args=[\'self\', \'buffer_size\', \'seed\', \'reshuffle_each_iteration\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt
index f1573512438b3f40db7653bf94fd4ad282a40acd..043584c23305975ac90ceb25572ed07b65552406 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt
@@ -92,6 +92,10 @@ tf_class {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "shard"
+    argspec: "args=[\'self\', \'num_shards\', \'index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "shuffle"
     argspec: "args=[\'self\', \'buffer_size\', \'seed\', \'reshuffle_each_iteration\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt
index 690da98b1ac2097c4241ba3218caa3b476dbf397..76d15f46344812927b76e5edaf5c59d547eaa62d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt
@@ -91,6 +91,10 @@ tf_class {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "shard"
+    argspec: "args=[\'self\', \'num_shards\', \'index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "shuffle"
     argspec: "args=[\'self\', \'buffer_size\', \'seed\', \'reshuffle_each_iteration\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt
index fe0bc1a4db5d4a5e78ec7479e414545b522ec2df..a6c7a2d7c58fafa08b5d9993eb83a0c68b01b691 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt
@@ -92,6 +92,10 @@ tf_class {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "shard"
+    argspec: "args=[\'self\', \'num_shards\', \'index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "shuffle"
     argspec: "args=[\'self\', \'buffer_size\', \'seed\', \'reshuffle_each_iteration\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt
index 261129b132189ef504678058f11651dd22bdce8c..ae177ebc6b8d0c18edccbad21b897fda6e6722e5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt
@@ -92,6 +92,10 @@ tf_class {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "shard"
+    argspec: "args=[\'self\', \'num_shards\', \'index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "shuffle"
     argspec: "args=[\'self\', \'buffer_size\', \'seed\', \'reshuffle_each_iteration\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt
index 0b34bbc94269280d6cca77bca789fb74f76629be..c15c73f3f78e5abe27395b9598005588a33b569d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt
@@ -92,6 +92,10 @@ tf_class {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "shard"
+    argspec: "args=[\'self\', \'num_shards\', \'index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "shuffle"
     argspec: "args=[\'self\', \'buffer_size\', \'seed\', \'reshuffle_each_iteration\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt
index 0e61890eee42a8b5b0df7bda0f99d189c4911eb9..567f48d1aa5d82529bd9d044405b951e447c5567 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt
@@ -92,6 +92,10 @@ tf_class {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "shard"
+    argspec: "args=[\'self\', \'num_shards\', \'index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "shuffle"
     argspec: "args=[\'self\', \'buffer_size\', \'seed\', \'reshuffle_each_iteration\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt
index 2d115904925eb96164484300baf628d41d3fcff4..48d90028646da07d95478abc9a3ddb5e918599e6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt
@@ -86,7 +86,7 @@ tf_module {
   }
   member_method {
     name: "bucket_by_sequence_length"
-    argspec: "args=[\'element_length_func\', \'bucket_boundaries\', \'bucket_batch_sizes\', \'padded_shapes\', \'padding_values\', \'pad_to_bucket_boundary\', \'no_padding\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'False\'], "
+    argspec: "args=[\'element_length_func\', \'bucket_boundaries\', \'bucket_batch_sizes\', \'padded_shapes\', \'padding_values\', \'pad_to_bucket_boundary\', \'no_padding\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'False\', \'False\'], "
   }
   member_method {
     name: "cardinality"
@@ -142,7 +142,7 @@ tf_module {
   }
   member_method {
     name: "make_csv_dataset"
-    argspec: "args=[\'file_pattern\', \'batch_size\', \'column_names\', \'column_defaults\', \'label_name\', \'select_columns\', \'field_delim\', \'use_quote_delim\', \'na_value\', \'header\', \'num_epochs\', \'shuffle\', \'shuffle_buffer_size\', \'shuffle_seed\', \'prefetch_buffer_size\', \'num_parallel_reads\', \'sloppy\', \'num_rows_for_inference\', \'compression_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \',\', \'True\', \'\', \'True\', \'None\', \'True\', \'10000\', \'None\', \'-1\', \'1\', \'False\', \'100\', \'None\'], "
+    argspec: "args=[\'file_pattern\', \'batch_size\', \'column_names\', \'column_defaults\', \'label_name\', \'select_columns\', \'field_delim\', \'use_quote_delim\', \'na_value\', \'header\', \'num_epochs\', \'shuffle\', \'shuffle_buffer_size\', \'shuffle_seed\', \'prefetch_buffer_size\', \'num_parallel_reads\', \'sloppy\', \'num_rows_for_inference\', \'compression_type\', \'ignore_errors\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \',\', \'True\', \'\', \'True\', \'None\', \'True\', \'10000\', \'None\', \'-1\', \'1\', \'False\', \'100\', \'None\', \'False\'], "
   }
   member_method {
     name: "make_saveable_from_iterator"
@@ -180,6 +180,10 @@ tf_module {
     name: "shuffle_and_repeat"
     argspec: "args=[\'buffer_size\', \'count\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
+  member_method {
+    name: "take_while"
+    argspec: "args=[\'predicate\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "unbatch"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-cross-device-ops.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-cross-device-ops.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a2ea23432416ee5f012e6f0c725d5f57841400f7
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-cross-device-ops.pbtxt
@@ -0,0 +1,33 @@
+path: "tensorflow.distribute.CrossDeviceOps"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.CrossDeviceOps\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast_implementation"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-hierarchical-copy-all-reduce.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-hierarchical-copy-all-reduce.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a38c4b21d563f0ef67eb2b2614fae678f0d97dce
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-hierarchical-copy-all-reduce.pbtxt
@@ -0,0 +1,35 @@
+path: "tensorflow.distribute.HierarchicalCopyAllReduce"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.HierarchicalCopyAllReduce\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.AllReduceCrossDeviceOps\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.CrossDeviceOps\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_packs\'], varargs=None, keywords=None, defaults=[\'1\'], "
+  }
+  member_method {
+    name: "batch_reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast_implementation"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-mirrored-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-mirrored-strategy.pbtxt
index a613e2d3d1dcefacdf0ec336587a46ff7e0bcb90..89748f7713fd813ab56d0e07780da33ca8ff14bc 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-mirrored-strategy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-mirrored-strategy.pbtxt
@@ -3,10 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.distribute.mirrored_strategy.MirroredStrategy\'>"
   is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategy\'>"
   is_instance: "<type \'object\'>"
-  member {
-    name: "between_graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "extended"
     mtype: "<type \'property\'>"
@@ -15,45 +11,13 @@ tf_class {
     name: "num_replicas_in_sync"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "parameter_devices"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "require_static_shapes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "should_checkpoint"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "should_init"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "should_save_summary"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "worker_devices"
-    mtype: "<type \'property\'>"
-  }
   member_method {
     name: "__init__"
     argspec: "args=[\'self\', \'devices\', \'cross_device_ops\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "batch_reduce"
-    argspec: "args=[\'self\', \'aggregation\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "broadcast"
-    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "call_for_each_replica"
-    argspec: "args=[\'self\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'tensor\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "colocate_vars_with"
@@ -64,29 +28,17 @@ tf_class {
     argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "distribute_dataset"
-    argspec: "args=[\'self\', \'dataset_fn\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "experimental_finalize"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "experimental_initialize"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+    name: "experimental_make_numpy_iterator"
+    argspec: "args=[\'self\', \'numpy_input\', \'batch_size\', \'num_epochs\', \'shuffle\', \'session\'], varargs=None, keywords=None, defaults=[\'1\', \'1024\', \'None\'], "
   }
   member_method {
-    name: "finalize"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+    name: "experimental_run"
+    argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "group"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "initialize"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "make_dataset_iterator"
     argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
@@ -95,22 +47,10 @@ tf_class {
     name: "make_input_fn_iterator"
     argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
   }
-  member_method {
-    name: "non_slot_devices"
-    argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "read_var"
-    argspec: "args=[\'self\', \'v\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "reduce"
     argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "run_steps_on_dataset"
-    argspec: "args=[\'self\', \'fn\', \'iterator\', \'iterations\', \'initial_loop_values\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
-  }
   member_method {
     name: "scope"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -119,20 +59,8 @@ tf_class {
     name: "unwrap"
     argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "update"
-    argspec: "args=[\'self\', \'var\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
-  }
   member_method {
     name: "update_config_proto"
     argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "update_non_slot"
-    argspec: "args=[\'self\', \'colocate_with\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "value_container"
-    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
-  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-nccl-all-reduce.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-nccl-all-reduce.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bdc09bcd84b96ee8475d3ef87ec5be686fc449ec
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-nccl-all-reduce.pbtxt
@@ -0,0 +1,35 @@
+path: "tensorflow.distribute.NcclAllReduce"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.NcclAllReduce\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.AllReduceCrossDeviceOps\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.CrossDeviceOps\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_packs\'], varargs=None, keywords=None, defaults=[\'1\'], "
+  }
+  member_method {
+    name: "batch_reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast_implementation"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-one-device-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-one-device-strategy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5d7943af2dc82e59b4ccd34816f9ecb6fd77dae9
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-one-device-strategy.pbtxt
@@ -0,0 +1,66 @@
+path: "tensorflow.distribute.OneDeviceStrategy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.one_device_strategy.OneDeviceStrategy\'>"
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategy\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "extended"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'device\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "colocate_vars_with"
+    argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "configure"
+    argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_make_numpy_iterator"
+    argspec: "args=[\'self\', \'numpy_input\', \'batch_size\', \'num_epochs\', \'shuffle\', \'session\'], varargs=None, keywords=None, defaults=[\'1\', \'1024\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_run"
+    argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "group"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "make_dataset_iterator"
+    argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_input_fn_iterator"
+    argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "scope"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "unwrap"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_config_proto"
+    argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-reduction-to-one-device.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-reduction-to-one-device.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f5ade9f86ba24779061bce3aa3e6f019d26741aa
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-reduction-to-one-device.pbtxt
@@ -0,0 +1,34 @@
+path: "tensorflow.distribute.ReductionToOneDevice"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.ReductionToOneDevice\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cross_device_ops.CrossDeviceOps\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduce_to_device\', \'accumulation_fn\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "batch_reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast_implementation"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce_implementation"
+    argspec: "args=[\'self\', \'reduce_op\', \'per_replica_value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-replica-context.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-replica-context.pbtxt
index df707e8920e4488ed6b40a7f93f56b5624188c84..c3b7991175769f473acf929d656cd52ccca7bf4f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-replica-context.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-replica-context.pbtxt
@@ -6,10 +6,6 @@ tf_class {
     name: "devices"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "distribution_strategy"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "num_replicas_in_sync"
     mtype: "<type \'property\'>"
@@ -26,6 +22,10 @@ tf_class {
     name: "__init__"
     argspec: "args=[\'self\', \'strategy\', \'replica_id_in_sync_group\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "all_reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "merge_call"
     argspec: "args=[\'self\', \'merge_fn\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'()\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-server.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-server.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6c39bf4fc4099a753ceee4de0df990a887d2ab4e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-server.pbtxt
@@ -0,0 +1,29 @@
+path: "tensorflow.distribute.Server"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.server_lib.Server\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "server_def"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "target"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'server_or_cluster_def\', \'job_name\', \'task_index\', \'protocol\', \'config\', \'start\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "create_local_server"
+    argspec: "args=[\'config\', \'start\'], varargs=None, keywords=None, defaults=[\'None\', \'True\'], "
+  }
+  member_method {
+    name: "join"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "start"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy-extended.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy-extended.pbtxt
index 77706e57133e1186d9e98fcf9205ed4c91772eda..5c4f09075316150b3118f048091d3c68a60a232d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy-extended.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy-extended.pbtxt
@@ -50,6 +50,10 @@ tf_class {
     name: "colocate_vars_with"
     argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "experimental_make_numpy_dataset"
+    argspec: "args=[\'self\', \'numpy_input\', \'session\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "experimental_run_steps_on_iterator"
     argspec: "args=[\'self\', \'fn\', \'iterator\', \'iterations\', \'initial_loop_values\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
@@ -78,4 +82,8 @@ tf_class {
     name: "value_container"
     argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "variable_created_in_scope"
+    argspec: "args=[\'self\', \'v\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy.pbtxt
index 9eb73d2c0d9069ec4b818abe1825503f0ea36fc9..6ed49d339d7af7b2d05dfa57121805a7dce48090 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy.pbtxt
@@ -2,10 +2,6 @@ path: "tensorflow.distribute.Strategy"
 tf_class {
   is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategy\'>"
   is_instance: "<type \'object\'>"
-  member {
-    name: "between_graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "extended"
     mtype: "<type \'property\'>"
@@ -14,45 +10,13 @@ tf_class {
     name: "num_replicas_in_sync"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "parameter_devices"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "require_static_shapes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "should_checkpoint"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "should_init"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "should_save_summary"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "worker_devices"
-    mtype: "<type \'property\'>"
-  }
   member_method {
     name: "__init__"
     argspec: "args=[\'self\', \'extended\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "batch_reduce"
-    argspec: "args=[\'self\', \'aggregation\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "broadcast"
-    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "call_for_each_replica"
-    argspec: "args=[\'self\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'tensor\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "colocate_vars_with"
@@ -63,29 +27,17 @@ tf_class {
     argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "distribute_dataset"
-    argspec: "args=[\'self\', \'dataset_fn\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "experimental_finalize"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "experimental_initialize"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+    name: "experimental_make_numpy_iterator"
+    argspec: "args=[\'self\', \'numpy_input\', \'batch_size\', \'num_epochs\', \'shuffle\', \'session\'], varargs=None, keywords=None, defaults=[\'1\', \'1024\', \'None\'], "
   }
   member_method {
-    name: "finalize"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+    name: "experimental_run"
+    argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "group"
     argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "initialize"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "make_dataset_iterator"
     argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
@@ -94,22 +46,10 @@ tf_class {
     name: "make_input_fn_iterator"
     argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
   }
-  member_method {
-    name: "non_slot_devices"
-    argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "read_var"
-    argspec: "args=[\'self\', \'v\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "reduce"
     argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "run_steps_on_dataset"
-    argspec: "args=[\'self\', \'fn\', \'iterator\', \'iterations\', \'initial_loop_values\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
-  }
   member_method {
     name: "scope"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -118,20 +58,8 @@ tf_class {
     name: "unwrap"
     argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "update"
-    argspec: "args=[\'self\', \'var\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
-  }
   member_method {
     name: "update_config_proto"
     argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "update_non_slot"
-    argspec: "args=[\'self\', \'colocate_with\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "value_container"
-    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
-  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-cluster-resolver.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-cluster-resolver.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0b35b61b4c08868feaf501e1f09b37d02da09cd0
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-cluster-resolver.pbtxt
@@ -0,0 +1,24 @@
+path: "tensorflow.distribute.cluster_resolver.ClusterResolver"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.cluster_resolver.ClusterResolver\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "environment"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "cluster_spec"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "master"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "num_accelerators"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'config_proto\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-g-c-e-cluster-resolver.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-g-c-e-cluster-resolver.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5c2cc522f1cac65611ffc3f09ce1513d186da27a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-g-c-e-cluster-resolver.pbtxt
@@ -0,0 +1,38 @@
+path: "tensorflow.distribute.cluster_resolver.GCEClusterResolver"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.gce_cluster_resolver.GCEClusterResolver\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.cluster_resolver.ClusterResolver\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "environment"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "rpc_layer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "task_id"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "task_type"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'project\', \'zone\', \'instance_group\', \'port\', \'task_type\', \'task_id\', \'rpc_layer\', \'credentials\', \'service\'], varargs=None, keywords=None, defaults=[\'worker\', \'0\', \'grpc\', \'default\', \'None\'], "
+  }
+  member_method {
+    name: "cluster_spec"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "master"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "num_accelerators"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'config_proto\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-kubernetes-cluster-resolver.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-kubernetes-cluster-resolver.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3220d68e05458da3cda4e36c63bc5dc79cde93af
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-kubernetes-cluster-resolver.pbtxt
@@ -0,0 +1,26 @@
+path: "tensorflow.distribute.cluster_resolver.KubernetesClusterResolver"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.kubernetes_cluster_resolver.KubernetesClusterResolver\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.cluster_resolver.ClusterResolver\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "environment"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'job_to_label_mapping\', \'tf_server_port\', \'rpc_layer\', \'override_client\'], varargs=None, keywords=None, defaults=[\'None\', \'8470\', \'grpc\', \'None\'], "
+  }
+  member_method {
+    name: "cluster_spec"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "master"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "num_accelerators"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'config_proto\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-simple-cluster-resolver.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-simple-cluster-resolver.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4e80e3af308e1a80eb9fa7491eabbe1454b8edf6
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-simple-cluster-resolver.pbtxt
@@ -0,0 +1,38 @@
+path: "tensorflow.distribute.cluster_resolver.SimpleClusterResolver"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.cluster_resolver.SimpleClusterResolver\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.cluster_resolver.ClusterResolver\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "environment"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "rpc_layer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "task_id"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "task_type"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'cluster_spec\', \'master\', \'task_type\', \'task_id\', \'environment\', \'num_accelerators\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'\', \'None\', \'None\', \'\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "cluster_spec"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "master"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "num_accelerators"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'accelerator_type\', \'config_proto\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'GPU\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-slurm-cluster-resolver.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-slurm-cluster-resolver.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..971ea3dca41a82b4a737a8c2468f2e1f6ffb2033
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-slurm-cluster-resolver.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.distribute.cluster_resolver.SlurmClusterResolver"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.slurm_cluster_resolver.SlurmClusterResolver\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.cluster_resolver.ClusterResolver\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "environment"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'jobs\', \'port_base\', \'gpus_per_node\', \'gpus_per_task\', \'tasks_per_node\', \'auto_set_gpu\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'8888\', \'1\', \'1\', \'None\', \'True\', \'grpc\'], "
+  }
+  member_method {
+    name: "cluster_spec"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_task_info"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "master"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "num_accelerators"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'accelerator_type\', \'config_proto\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'GPU\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-t-f-config-cluster-resolver.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-t-f-config-cluster-resolver.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5f9a430c0f84c9caba29dee514f1f3a3391d8588
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-t-f-config-cluster-resolver.pbtxt
@@ -0,0 +1,38 @@
+path: "tensorflow.distribute.cluster_resolver.TFConfigClusterResolver"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.tfconfig_cluster_resolver.TFConfigClusterResolver\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.cluster_resolver.ClusterResolver\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "environment"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "rpc_layer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "task_id"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "task_type"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'rpc_layer\', \'environment\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "cluster_spec"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "master"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "num_accelerators"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'config_proto\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-t-p-u-cluster-resolver.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-t-p-u-cluster-resolver.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ca22c85ac0ac871538b2820df39c6636fa23873c
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-t-p-u-cluster-resolver.pbtxt
@@ -0,0 +1,34 @@
+path: "tensorflow.distribute.cluster_resolver.TPUClusterResolver"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.tpu_cluster_resolver.TPUClusterResolver\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.cluster_resolver.ClusterResolver\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "environment"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'tpu\', \'zone\', \'project\', \'job_name\', \'coordinator_name\', \'coordinator_address\', \'credentials\', \'service\', \'discovery_url\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'worker\', \'None\', \'None\', \'default\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "cluster_spec"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_job_name"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_master"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "master"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "num_accelerators"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'accelerator_type\', \'config_proto\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'TPU\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-union-resolver.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-union-resolver.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..179848aca39cbff879fbab7791a1a7bc1692c488
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-union-resolver.pbtxt
@@ -0,0 +1,38 @@
+path: "tensorflow.distribute.cluster_resolver.UnionResolver"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.cluster_resolver.UnionClusterResolver\'>"
+  is_instance: "<class \'tensorflow.python.distribute.cluster_resolver.cluster_resolver.ClusterResolver\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "environment"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "rpc_layer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "task_id"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "task_type"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "cluster_spec"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "master"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'rpc_layer\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "num_accelerators"
+    argspec: "args=[\'self\', \'task_type\', \'task_id\', \'accelerator_type\', \'config_proto\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'GPU\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5906ffa850a360889e26fe0230618ad60cf01231
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.pbtxt
@@ -0,0 +1,35 @@
+path: "tensorflow.distribute.cluster_resolver"
+tf_module {
+  member {
+    name: "ClusterResolver"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "GCEClusterResolver"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "KubernetesClusterResolver"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SimpleClusterResolver"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SlurmClusterResolver"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TFConfigClusterResolver"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TPUClusterResolver"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "UnionResolver"
+    mtype: "<type \'type\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-multi-worker-mirrored-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-multi-worker-mirrored-strategy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..619c3744d6cae858eb501392e962fbb94751355e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-multi-worker-mirrored-strategy.pbtxt
@@ -0,0 +1,66 @@
+path: "tensorflow.distribute.experimental.MultiWorkerMirroredStrategy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.collective_all_reduce_strategy.CollectiveAllReduceStrategy\'>"
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategy\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "extended"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "colocate_vars_with"
+    argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "configure"
+    argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_make_numpy_iterator"
+    argspec: "args=[\'self\', \'numpy_input\', \'batch_size\', \'num_epochs\', \'shuffle\', \'session\'], varargs=None, keywords=None, defaults=[\'1\', \'1024\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_run"
+    argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "group"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "make_dataset_iterator"
+    argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_input_fn_iterator"
+    argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "scope"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "unwrap"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_config_proto"
+    argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-parameter-server-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-parameter-server-strategy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c4cfa2f4ed2e8e346e847318e409dcf16b5f34f8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.-parameter-server-strategy.pbtxt
@@ -0,0 +1,66 @@
+path: "tensorflow.distribute.experimental.ParameterServerStrategy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.parameter_server_strategy.ParameterServerStrategy\'>"
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategy\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "extended"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "colocate_vars_with"
+    argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "configure"
+    argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_make_numpy_iterator"
+    argspec: "args=[\'self\', \'numpy_input\', \'batch_size\', \'num_epochs\', \'shuffle\', \'session\'], varargs=None, keywords=None, defaults=[\'1\', \'1024\', \'None\'], "
+  }
+  member_method {
+    name: "experimental_run"
+    argspec: "args=[\'self\', \'fn\', \'input_iterator\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "group"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "make_dataset_iterator"
+    argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_input_fn_iterator"
+    argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "scope"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "unwrap"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_config_proto"
+    argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8859fbd38c43577c5f6040d717d11fb23941e4e2
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.experimental.pbtxt
@@ -0,0 +1,11 @@
+path: "tensorflow.distribute.experimental"
+tf_module {
+  member {
+    name: "MultiWorkerMirroredStrategy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "ParameterServerStrategy"
+    mtype: "<type \'type\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.pbtxt
index b0dd73ca1d4179b4a3323fa0a9be2fde4e22799c..7339bee6cd85ba9d474e55b3952468113d28a27c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.pbtxt
@@ -1,5 +1,13 @@
 path: "tensorflow.distribute"
 tf_module {
+  member {
+    name: "CrossDeviceOps"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "HierarchicalCopyAllReduce"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "InputContext"
     mtype: "<type \'type\'>"
@@ -12,14 +20,30 @@ tf_module {
     name: "MirroredStrategy"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "NcclAllReduce"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "OneDeviceStrategy"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "ReduceOp"
     mtype: "<class \'enum.EnumMeta\'>"
   }
+  member {
+    name: "ReductionToOneDevice"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "ReplicaContext"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "Server"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Strategy"
     mtype: "<type \'type\'>"
@@ -28,9 +52,13 @@ tf_module {
     name: "StrategyExtended"
     mtype: "<type \'type\'>"
   }
-  member_method {
-    name: "get_loss_reduction"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  member {
+    name: "cluster_resolver"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
   }
   member_method {
     name: "get_replica_context"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.dtypes.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.dtypes.pbtxt
index 01b870a81639807489ec2a09dcc185137aae1665..956e4d93e57069b6936413a3a432d45a22e4ed1b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.dtypes.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.dtypes.pbtxt
@@ -112,10 +112,6 @@ tf_module {
     name: "as_dtype"
     argspec: "args=[\'type_value\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "as_string"
-    argspec: "args=[\'input\', \'precision\', \'scientific\', \'shortest\', \'width\', \'fill\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'False\', \'False\', \'-1\', \'\', \'None\'], "
-  }
   member_method {
     name: "cast"
     argspec: "args=[\'x\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-mode-keys.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-mode-keys.pbtxt
index bf7c1abcd89b29c29f3487cab58cfdf28103119c..d53752780090c2d621dcabfc8c31e4f1192bd7c7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-mode-keys.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-mode-keys.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.estimator.ModeKeys"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.model_fn.ModeKeys\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.mode_keys.EstimatorModeKeys\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "EVAL"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-session-run-args.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-session-run-args.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b375c7429469d2a8b89d1bcd048599d6478624ae
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-session-run-args.pbtxt
@@ -0,0 +1,27 @@
+path: "tensorflow.estimator.SessionRunArgs"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunArgs\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunArgs\'>"
+  is_instance: "<type \'tuple\'>"
+  member {
+    name: "feed_dict"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "fetches"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "options"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "count"
+  }
+  member_method {
+    name: "index"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-session-run-context.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-session-run-context.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cb4ac9f50ec9aa9d6531a16ebb48a9223cbc5188
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-session-run-context.pbtxt
@@ -0,0 +1,25 @@
+path: "tensorflow.estimator.SessionRunContext"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunContext\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "original_args"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "session"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "stop_requested"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'original_args\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "request_stop"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-session-run-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-session-run-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..54e9ad9ed44b64e2c1c49b5ade4c7d3bb35563de
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-session-run-hook.pbtxt
@@ -0,0 +1,28 @@
+path: "tensorflow.estimator.SessionRunHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-session-run-values.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-session-run-values.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..678814169635bfa9997db26df23acc79c2d84881
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-session-run-values.pbtxt
@@ -0,0 +1,27 @@
+path: "tensorflow.estimator.SessionRunValues"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunValues\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunValues\'>"
+  is_instance: "<type \'tuple\'>"
+  member {
+    name: "options"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "results"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "run_metadata"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "count"
+  }
+  member_method {
+    name: "index"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.pbtxt
index f0fd7ce782db71ff5e790fe50e93556bf5d19e1e..b1bd5a2661d44d9b36b965ba160874e6142628ea 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.pbtxt
@@ -36,4 +36,16 @@ tf_module {
     name: "stop_if_higher_hook"
     argspec: "args=[\'estimator\', \'metric_name\', \'threshold\', \'eval_dir\', \'min_steps\', \'run_every_secs\', \'run_every_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'60\', \'None\'], "
   }
+  member_method {
+    name: "stop_if_lower_hook"
+    argspec: "args=[\'estimator\', \'metric_name\', \'threshold\', \'eval_dir\', \'min_steps\', \'run_every_secs\', \'run_every_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'60\', \'None\'], "
+  }
+  member_method {
+    name: "stop_if_no_decrease_hook"
+    argspec: "args=[\'estimator\', \'metric_name\', \'max_steps_without_decrease\', \'eval_dir\', \'min_steps\', \'run_every_secs\', \'run_every_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'60\', \'None\'], "
+  }
+  member_method {
+    name: "stop_if_no_increase_hook"
+    argspec: "args=[\'estimator\', \'metric_name\', \'max_steps_without_increase\', \'eval_dir\', \'min_steps\', \'run_every_secs\', \'run_every_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'60\', \'None\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-classification-output.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-classification-output.pbtxt
index 52874dd9b9316d9815c5aef51e272e6ffddb5224..ce486807a47031999ec4c9082bbf4b2e4af910ed 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-classification-output.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-classification-output.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.export.ClassificationOutput"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.ClassificationOutput\'>"
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.ExportOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.ClassificationOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.ExportOutput\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "classes"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-export-output.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-export-output.pbtxt
index 964c315e9730effac38d60f7242527e71cbf9846..9775b2cca812d3fd47a3e821f032ed67c62d0078 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-export-output.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-export-output.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.estimator.export.ExportOutput"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.ExportOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.ExportOutput\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-predict-output.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-predict-output.pbtxt
index bb82bc9e58627318b897f0610c7d852db7f98c07..a4389fb998e86cc291bb02d2ae04d220f1e152cc 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-predict-output.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-predict-output.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.export.PredictOutput"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.PredictOutput\'>"
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.ExportOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.PredictOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.ExportOutput\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "outputs"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-regression-output.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-regression-output.pbtxt
index 8522834433f214e5d646ef6265b1047fb7f2cc4f..fc7f8447893c99f4f68bf12f0790e8a549232dc3 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-regression-output.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.export.-regression-output.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.export.RegressionOutput"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.RegressionOutput\'>"
-  is_instance: "<class \'tensorflow_estimator.python.estimator.export.export_output.ExportOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.RegressionOutput\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.model_utils.export_output.ExportOutput\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "value"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.inputs.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.inputs.pbtxt
deleted file mode 100644
index b318fea1f82077c3924a843dd6b3857a3fdc0e8e..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.inputs.pbtxt
+++ /dev/null
@@ -1,11 +0,0 @@
-path: "tensorflow.estimator.inputs"
-tf_module {
-  member_method {
-    name: "numpy_input_fn"
-    argspec: "args=[\'x\', \'y\', \'batch_size\', \'num_epochs\', \'shuffle\', \'queue_capacity\', \'num_threads\'], varargs=None, keywords=None, defaults=[\'None\', \'128\', \'1\', \'None\', \'1000\', \'1\'], "
-  }
-  member_method {
-    name: "pandas_input_fn"
-    argspec: "args=[\'x\', \'y\', \'batch_size\', \'num_epochs\', \'shuffle\', \'queue_capacity\', \'num_threads\', \'target_column\'], varargs=None, keywords=None, defaults=[\'None\', \'128\', \'1\', \'None\', \'1000\', \'1\', \'target\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.pbtxt
index d3656ae0455971ccd98062a52ec0412bf6af06f7..add8ef5e65dce5d0fffa82805e465c46eeb3f3ab 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.pbtxt
@@ -132,6 +132,22 @@ tf_module {
     name: "SecondOrStepTimer"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SessionRunArgs"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SessionRunContext"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SessionRunHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SessionRunValues"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "StepCounterHook"
     mtype: "<type \'type\'>"
@@ -164,10 +180,6 @@ tf_module {
     name: "export"
     mtype: "<type \'module\'>"
   }
-  member {
-    name: "inputs"
-    mtype: "<type \'module\'>"
-  }
   member_method {
     name: "add_metrics"
     argspec: "args=[\'estimator\', \'metric_fn\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.feature_column.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.feature_column.pbtxt
index 3aadd7dc341ae97fdbfa83cd3fc96fc75249a4c2..4e4fd78b598c91b98a121e3751f1e61d67f14419 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.feature_column.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.feature_column.pbtxt
@@ -41,7 +41,27 @@ tf_module {
     argspec: "args=[\'key\', \'shape\', \'default_value\', \'dtype\', \'normalizer_fn\'], varargs=None, keywords=None, defaults=[\'(1,)\', \'None\', \"<dtype: \'float32\'>\", \'None\'], "
   }
   member_method {
-    name: "shared_embedding_columns"
+    name: "sequence_categorical_column_with_hash_bucket"
+    argspec: "args=[\'key\', \'hash_bucket_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\"<dtype: \'string\'>\"], "
+  }
+  member_method {
+    name: "sequence_categorical_column_with_identity"
+    argspec: "args=[\'key\', \'num_buckets\', \'default_value\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "sequence_categorical_column_with_vocabulary_file"
+    argspec: "args=[\'key\', \'vocabulary_file\', \'vocabulary_size\', \'num_oov_buckets\', \'default_value\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \"<dtype: \'string\'>\"], "
+  }
+  member_method {
+    name: "sequence_categorical_column_with_vocabulary_list"
+    argspec: "args=[\'key\', \'vocabulary_list\', \'dtype\', \'default_value\', \'num_oov_buckets\'], varargs=None, keywords=None, defaults=[\'None\', \'-1\', \'0\'], "
+  }
+  member_method {
+    name: "sequence_numeric_column"
+    argspec: "args=[\'key\', \'shape\', \'default_value\', \'dtype\', \'normalizer_fn\'], varargs=None, keywords=None, defaults=[\'(1,)\', \'0.0\', \"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "shared_embeddings"
     argspec: "args=[\'categorical_columns\', \'dimension\', \'combiner\', \'initializer\', \'shared_embedding_collection_name\', \'ckpt_to_load_from\', \'tensor_name_in_ckpt\', \'max_norm\', \'trainable\'], varargs=None, keywords=None, defaults=[\'mean\', \'None\', \'None\', \'None\', \'None\', \'None\', \'True\'], "
   }
   member_method {
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.glorot_uniform_initializer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.glorot_uniform_initializer.pbtxt
deleted file mode 100644
index bb8540d0fd8b4a737bce8d23404616f3f51d2c79..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.glorot_uniform_initializer.pbtxt
+++ /dev/null
@@ -1,19 +0,0 @@
-path: "tensorflow.glorot_uniform_initializer"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.GlorotUniform\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.VarianceScaling\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.image.pbtxt
index 3c6ed1cfb8340b6e8f2599360e3c321c562e37ff..80c0cb7ae8440ab25a7f8c39faea002acae0d643 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.image.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.image.pbtxt
@@ -32,6 +32,10 @@ tf_module {
     name: "central_crop"
     argspec: "args=[\'image\', \'central_fraction\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "combined_non_max_suppression"
+    argspec: "args=[\'boxes\', \'scores\', \'max_output_size_per_class\', \'max_total_size\', \'iou_threshold\', \'score_threshold\', \'pad_per_class\', \'name\'], varargs=None, keywords=None, defaults=[\'0.5\', \'-inf\', \'False\', \'None\'], "
+  }
   member_method {
     name: "convert_image_dtype"
     argspec: "args=[\'image\', \'dtype\', \'saturate\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
@@ -82,7 +86,7 @@ tf_module {
   }
   member_method {
     name: "extract_glimpse"
-    argspec: "args=[\'input\', \'size\', \'offsets\', \'centered\', \'normalized\', \'uniform_noise\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'True\', \'True\', \'None\'], "
+    argspec: "args=[\'input\', \'size\', \'offsets\', \'centered\', \'normalized\', \'noise\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'True\', \'uniform\', \'None\'], "
   }
   member_method {
     name: "extract_image_patches"
@@ -182,7 +186,7 @@ tf_module {
   }
   member_method {
     name: "resize_image_with_pad"
-    argspec: "args=[\'image\', \'target_height\', \'target_width\', \'method\'], varargs=None, keywords=None, defaults=[\'0\'], "
+    argspec: "args=[\'image\', \'target_height\', \'target_width\', \'method\', \'align_corners\'], varargs=None, keywords=None, defaults=[\'0\', \'False\'], "
   }
   member_method {
     name: "rgb_to_grayscale"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.constant.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.constant.pbtxt
deleted file mode 100644
index 607a5aae21ff7299fc96aee3b932c10d622f1127..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.initializers.constant.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-path: "tensorflow.initializers.constant"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Constant\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'value\', \'dtype\', \'verify_shape\'], varargs=None, keywords=None, defaults=[\'0\', \"<dtype: \'float32\'>\", \'False\'], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.glorot_normal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.glorot_normal.pbtxt
deleted file mode 100644
index 4a81e52df966d0af93b097fe07ec642eb81f7edb..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.initializers.glorot_normal.pbtxt
+++ /dev/null
@@ -1,19 +0,0 @@
-path: "tensorflow.initializers.glorot_normal"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.GlorotNormal\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.VarianceScaling\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.glorot_uniform.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.glorot_uniform.pbtxt
deleted file mode 100644
index 815dc81dff5d5c3f89bc6e1d39b8fa7c4c15c914..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.initializers.glorot_uniform.pbtxt
+++ /dev/null
@@ -1,19 +0,0 @@
-path: "tensorflow.initializers.glorot_uniform"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.GlorotUniform\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.VarianceScaling\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.orthogonal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.orthogonal.pbtxt
deleted file mode 100644
index ff64efd60cf1197bb9032912eb5cba48a63609a0..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.initializers.orthogonal.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-path: "tensorflow.initializers.orthogonal"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Orthogonal\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'gain\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\', \"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.pbtxt
deleted file mode 100644
index e3c63fe737ee655169c00c7c0b2882c84f566244..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.initializers.pbtxt
+++ /dev/null
@@ -1,67 +0,0 @@
-path: "tensorflow.initializers"
-tf_module {
-  member {
-    name: "constant"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "glorot_normal"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "glorot_uniform"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "identity"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "ones"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "orthogonal"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "random_normal"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "random_uniform"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "truncated_normal"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "uniform_unit_scaling"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "variance_scaling"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "zeros"
-    mtype: "<type \'type\'>"
-  }
-  member_method {
-    name: "he_normal"
-    argspec: "args=[\'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "he_uniform"
-    argspec: "args=[\'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "lecun_normal"
-    argspec: "args=[\'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "lecun_uniform"
-    argspec: "args=[\'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.random_normal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.random_normal.pbtxt
deleted file mode 100644
index 133e61c1d9869bdd00948df3877be990b30b7cc3..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.initializers.random_normal.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-path: "tensorflow.initializers.random_normal"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.RandomNormal\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'mean\', \'stddev\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \'None\', \"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.random_uniform.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.random_uniform.pbtxt
deleted file mode 100644
index 0cfa0080f5a936bc80f69c2b5c15f671096ba350..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.initializers.random_uniform.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-path: "tensorflow.initializers.random_uniform"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.RandomUniform\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'minval\', \'maxval\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0\', \'None\', \'None\', \"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.truncated_normal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.truncated_normal.pbtxt
deleted file mode 100644
index 730390fba274f9dc25eea7a53bb8145a2ade8613..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.initializers.truncated_normal.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-path: "tensorflow.initializers.truncated_normal"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.TruncatedNormal\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'mean\', \'stddev\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \'None\', \"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.uniform_unit_scaling.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.uniform_unit_scaling.pbtxt
deleted file mode 100644
index 13295ef375a4002f8fece5ebb5d2a5d5d26c68eb..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.initializers.uniform_unit_scaling.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-path: "tensorflow.initializers.uniform_unit_scaling"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.UniformUnitScaling\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'factor\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\', \"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.variance_scaling.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.initializers.variance_scaling.pbtxt
deleted file mode 100644
index 86340913e2506c96499aae05a3ed0d5273c93bba..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.initializers.variance_scaling.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-path: "tensorflow.initializers.variance_scaling"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.VarianceScaling\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'scale\', \'mode\', \'distribution\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'1.0\', \'fan_in\', \'truncated_normal\', \'None\', \"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.io.gfile.-g-file.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.io.gfile.-g-file.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c6bf57a88fc1295da13e0b58671191c9d8ba8caa
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.io.gfile.-g-file.pbtxt
@@ -0,0 +1,58 @@
+path: "tensorflow.io.gfile.GFile"
+tf_class {
+  is_instance: "<class \'tensorflow.python.platform.gfile.GFile\'>"
+  is_instance: "<class \'tensorflow.python.lib.io.file_io.FileIO\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "mode"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'mode\'], varargs=None, keywords=None, defaults=[\'r\'], "
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "flush"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "next"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "read"
+    argspec: "args=[\'self\', \'n\'], varargs=None, keywords=None, defaults=[\'-1\'], "
+  }
+  member_method {
+    name: "readline"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "readlines"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "seek"
+    argspec: "args=[\'self\', \'offset\', \'whence\', \'position\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
+  }
+  member_method {
+    name: "size"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "tell"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "write"
+    argspec: "args=[\'self\', \'file_content\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.io.gfile.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.io.gfile.pbtxt
index cfa3372b12bfe32eed4311c89b6448c0359c0913..a797c06ff337cffe503d89c09497996ea64c6ad2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.io.gfile.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.io.gfile.pbtxt
@@ -1,5 +1,9 @@
 path: "tensorflow.io.gfile"
 tf_module {
+  member {
+    name: "GFile"
+    mtype: "<type \'type\'>"
+  }
   member_method {
     name: "copy"
     argspec: "args=[\'src\', \'dst\', \'overwrite\'], varargs=None, keywords=None, defaults=[\'False\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.io.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.io.pbtxt
index 8906329742c61ed08a25bcc252ec0d1dfa9e374e..2d9c759e3cf92a2368fd904fa57eec2413dbba8f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.io.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.io.pbtxt
@@ -8,22 +8,6 @@ tf_module {
     name: "FixedLenSequenceFeature"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "PaddingFIFOQueue"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "PriorityQueue"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "QueueBase"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "RandomShuffleQueue"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "SparseFeature"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt
index a3254cbd947d9ef70617131e9f4b17f44f059840..5e1371815469974b91b1a4de16fa20d19404eee8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -131,7 +135,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -163,19 +167,19 @@ tf_class {
   }
   member_method {
     name: "evaluate"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "fit"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_freq\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "fit_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'validation_freq\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'1\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
   }
   member_method {
     name: "from_config"
@@ -231,11 +235,11 @@ tf_class {
   }
   member_method {
     name: "predict"
-    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "predict_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "predict_on_batch"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt
index b70e9ee98d5bc4900420ddb1307abf9adcd8cad0..4bbe98b789935318a901d84502cb763a60ddc92b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -136,7 +140,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -168,19 +172,19 @@ tf_class {
   }
   member_method {
     name: "evaluate"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "fit"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_freq\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "fit_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'validation_freq\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'1\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
   }
   member_method {
     name: "from_config"
@@ -240,7 +244,7 @@ tf_class {
   }
   member_method {
     name: "predict"
-    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "predict_classes"
@@ -248,7 +252,7 @@ tf_class {
   }
   member_method {
     name: "predict_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "predict_on_batch"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt
index d200d3d26d7c1b7d54eda596a8056a66e29be0b6..49e3d1155c3cb711676c4e67b8c47a8ffbe7615d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt
@@ -124,6 +124,14 @@ tf_module {
     name: "ctc_label_dense_to_sparse"
     argspec: "args=[\'labels\', \'label_lengths\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "cumprod"
+    argspec: "args=[\'x\', \'axis\'], varargs=None, keywords=None, defaults=[\'0\'], "
+  }
+  member_method {
+    name: "cumsum"
+    argspec: "args=[\'x\', \'axis\'], varargs=None, keywords=None, defaults=[\'0\'], "
+  }
   member_method {
     name: "dot"
     argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
@@ -244,6 +252,10 @@ tf_module {
     name: "learning_phase"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "learning_phase_scope"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "less"
     argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-base-logger.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-base-logger.pbtxt
index 7d298e95135ebf41230d72ff488fef30be682edb..9dbdaf0f5f3db292feb98fe06092b6f7a6b8f034 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-base-logger.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-base-logger.pbtxt
@@ -23,6 +23,38 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt
index 133205ab88b47afad32fc70ceca93513768a3b19..0725f606e2923ff1bd5a8814febdfe7de8a2602c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-c-s-v-logger.pbtxt
@@ -23,6 +23,38 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-callback.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-callback.pbtxt
index d766c09ac5efaa9d0e4ffba4e495385130c7e770..14bfc3bedbfb5a379e28a0cb9cd2f7f744539fa1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-callback.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-callback.pbtxt
@@ -22,6 +22,38 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-early-stopping.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-early-stopping.pbtxt
index 605f74e5602a63f5a18c31cb26113d300ec76e7a..9812bad8f66f3d5afe365287feca748f9e6efd5d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-early-stopping.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-early-stopping.pbtxt
@@ -27,6 +27,38 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-history.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-history.pbtxt
index cd893e67269164781d6a6b6294a199014d40fed8..5aa739391ef894cdede1db17f903a50111f25eca 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-history.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-history.pbtxt
@@ -23,6 +23,38 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-lambda-callback.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-lambda-callback.pbtxt
index 50f2054cabb1b8f6c46a9537ea923a18f87e5c80..bf5bcb68df47ed8661509598d3bc59f01dfcefe6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-lambda-callback.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-lambda-callback.pbtxt
@@ -23,6 +23,38 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
index 9ed9db0a89b49b88098e15baca414ff78b6f10e6..a04ffb92eb9e32b2473355f140d68537b80074df 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-learning-rate-scheduler.pbtxt
@@ -23,6 +23,38 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-model-checkpoint.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-model-checkpoint.pbtxt
index 3d8d1363bb4e4de818788efbf3c997594350006a..5ae176017b3cf1ac019ecdc0f1c255f23b32fcec 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-model-checkpoint.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-model-checkpoint.pbtxt
@@ -23,6 +23,38 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-progbar-logger.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-progbar-logger.pbtxt
index 5012f1517d57dd646d82ab669cb279b6363dd6ec..624f856d2752e1f375154664a892d6c1d600ecbd 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-progbar-logger.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-progbar-logger.pbtxt
@@ -23,6 +23,38 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt
index 73652c2b61259f768eca76b995ae4592df868392..0fed6fd23670a16acd8d770269090c3dda0eee30 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-reduce-l-r-on-plateau.pbtxt
@@ -27,6 +27,38 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-remote-monitor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-remote-monitor.pbtxt
index 24db71de1182d58b78fec0419aa9cb48a2e315d2..71cf7f4a4922752c0ba154a8d3fe29b37c305675 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-remote-monitor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-remote-monitor.pbtxt
@@ -23,6 +23,38 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-tensor-board.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-tensor-board.pbtxt
index c5503c69a5f3cb6765c984778c0e3626369ee815..abf4286b691b4f8a231df030c292a0f14a3d6f65 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-tensor-board.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-tensor-board.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'log_dir\', \'histogram_freq\', \'batch_size\', \'write_graph\', \'write_grads\', \'write_images\', \'embeddings_freq\', \'embeddings_layer_names\', \'embeddings_metadata\', \'embeddings_data\', \'update_freq\'], varargs=None, keywords=None, defaults=[\'./logs\', \'0\', \'32\', \'True\', \'False\', \'False\', \'0\', \'None\', \'None\', \'None\', \'epoch\'], "
+    argspec: "args=[\'self\', \'log_dir\', \'histogram_freq\', \'write_graph\', \'write_images\', \'update_freq\'], varargs=None, keywords=kwargs, defaults=[\'./logs\', \'0\', \'True\', \'False\', \'epoch\'], "
   }
   member_method {
     name: "on_batch_begin"
@@ -23,6 +23,38 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt
index de6e8ef072558e6d926ea125aa5056e3c229d37f..d5a59d870a390a6f5632332c12534f83c686e2dd 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.-terminate-on-na-n.pbtxt
@@ -23,6 +23,38 @@ tf_class {
     name: "on_epoch_end"
     argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "on_predict_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_predict_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_begin"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_batch_end"
+    argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_begin"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "on_test_end"
+    argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "on_train_batch_begin"
     argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-cosine-decay-restarts.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-cosine-decay-restarts.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..58bede556dfd4d8988d92e99e402d9b3b3bf5adb
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-cosine-decay-restarts.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.experimental.CosineDecayRestarts"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.CosineDecayRestarts\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initial_learning_rate\', \'first_decay_steps\', \'t_mul\', \'m_mul\', \'alpha\', \'name\'], varargs=None, keywords=None, defaults=[\'2.0\', \'1.0\', \'0.0\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-cosine-decay.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-cosine-decay.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e2549a2ac627421ecc80df2d6235c1a22ab5e3ff
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-cosine-decay.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.experimental.CosineDecay"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.CosineDecay\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initial_learning_rate\', \'decay_steps\', \'alpha\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-linear-cosine-decay.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-linear-cosine-decay.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f083120b52ce483f46cc92390b53180bc3bd65ed
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-linear-cosine-decay.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.experimental.LinearCosineDecay"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LinearCosineDecay\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initial_learning_rate\', \'decay_steps\', \'num_periods\', \'alpha\', \'beta\', \'name\'], varargs=None, keywords=None, defaults=[\'0.5\', \'0.0\', \'0.001\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-noisy-linear-cosine-decay.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-noisy-linear-cosine-decay.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8ea3c6beb1c0f8fffaa442956c0cc134f70a5e84
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-noisy-linear-cosine-decay.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.experimental.NoisyLinearCosineDecay"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.NoisyLinearCosineDecay\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initial_learning_rate\', \'decay_steps\', \'initial_variance\', \'variance_decay\', \'num_periods\', \'alpha\', \'beta\', \'name\'], varargs=None, keywords=None, defaults=[\'1.0\', \'0.55\', \'0.5\', \'0.0\', \'0.001\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
index 1d814b2c8b553f1b2a07f9d9b97dc70ec0674969..2f3cb0b7c51e119da6a122dd6672109789c1e73c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.PeepholeLSTMCell\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.LSTMCell\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-sequence-features.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-sequence-features.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5a75f44fcc3a1ecc65b27cc52d61256b6e69e0af
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-sequence-features.pbtxt
@@ -0,0 +1,184 @@
+path: "tensorflow.keras.experimental.SequenceFeatures"
+tf_class {
+  is_instance: "<class \'tensorflow.python.feature_column.sequence_feature_column.SequenceFeatures\'>"
+  is_instance: "<class \'tensorflow.python.feature_column.feature_column_v2._BaseFeaturesLayer\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'feature_columns\', \'trainable\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'features\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.pbtxt
index 164edbd66ab2487a980155eabcf18ed8446e2c14..65b82a3f3222c51c4a419918ad1e74dd52152aed 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.pbtxt
@@ -1,7 +1,35 @@
 path: "tensorflow.keras.experimental"
 tf_module {
+  member {
+    name: "CosineDecay"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CosineDecayRestarts"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "LinearCosineDecay"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "NoisyLinearCosineDecay"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "PeepholeLSTMCell"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SequenceFeatures"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "export_saved_model"
+    argspec: "args=[\'model\', \'saved_model_path\', \'custom_objects\', \'as_text\', \'input_signature\', \'serving_only\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "load_from_saved_model"
+    argspec: "args=[\'saved_model_path\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-constant.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-constant.pbtxt
index cbaba78ed5a851c3d6e29ab67c89fdfd5db01754..71b5acc38fdf6a0246053f3260fc7e9c17d3f204 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-constant.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-constant.pbtxt
@@ -1,11 +1,11 @@
 path: "tensorflow.keras.initializers.Constant"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Constant\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Constant\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'value\', \'dtype\', \'verify_shape\'], varargs=None, keywords=None, defaults=[\'0\', \"<dtype: \'float32\'>\", \'False\'], "
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=[\'0\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-glorot-normal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-glorot-normal.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..edff37e3a15b198839a6729d75f190e88491f057
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-glorot-normal.pbtxt
@@ -0,0 +1,19 @@
+path: "tensorflow.keras.initializers.GlorotNormal"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.GlorotNormal\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.VarianceScaling\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-glorot-uniform.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-glorot-uniform.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bc685ce0d58f5ec4afb058508886d8d14c393c24
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-glorot-uniform.pbtxt
@@ -0,0 +1,19 @@
+path: "tensorflow.keras.initializers.GlorotUniform"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.GlorotUniform\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.VarianceScaling\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-identity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-identity.pbtxt
index a5f7f348de9d9899d962e7647d7943ddb6a60604..e0f0f3a93dac6c5e63822bbddb0d88ffcc0cfa70 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-identity.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-identity.pbtxt
@@ -1,11 +1,11 @@
 path: "tensorflow.keras.initializers.Identity"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Identity\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Identity\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'gain\', \'dtype\'], varargs=None, keywords=None, defaults=[\'1.0\', \"<dtype: \'float32\'>\"], "
+    argspec: "args=[\'self\', \'gain\'], varargs=None, keywords=None, defaults=[\'1.0\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-initializer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-initializer.pbtxt
index 8f10d1698e7b7b2afa9c2664c7dca38045eda85b..ae5ea9e48c9bf4fc478f60968cf1d83dd9c43762 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-initializer.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-initializer.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.keras.initializers.Initializer"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-ones.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-ones.pbtxt
index 2fbfa774f8ed020164e32bb3cfb69b8a235609ba..57c0b0917d1fd50b7817e575f55cfbb2e3e1781b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-ones.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-ones.pbtxt
@@ -1,11 +1,10 @@
 path: "tensorflow.keras.initializers.Ones"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Ones\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Ones\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'dtype\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\"], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-orthogonal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-orthogonal.pbtxt
index 874d320d73d1f1cdbd817db587ea9dcfea4d352b..b24844fa35c555294f899ebe68a2ee180de149cc 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-orthogonal.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-orthogonal.pbtxt
@@ -1,11 +1,11 @@
 path: "tensorflow.keras.initializers.Orthogonal"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Orthogonal\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Orthogonal\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'gain\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\', \"<dtype: \'float32\'>\"], "
+    argspec: "args=[\'self\', \'gain\', \'seed\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-random-normal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-random-normal.pbtxt
index 26784ce55d087d7d4fea6e6e0989d4490c95c6c1..0753827aa67434cd5670a41bd09e61ae7acb28dc 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-random-normal.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-random-normal.pbtxt
@@ -1,12 +1,11 @@
 path: "tensorflow.keras.initializers.RandomNormal"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.initializers.RandomNormal\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.RandomNormal\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.RandomNormal\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'mean\', \'stddev\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0.0\', \'0.05\', \'None\', \"<dtype: \'float32\'>\"], "
+    argspec: "args=[\'self\', \'mean\', \'stddev\', \'seed\'], varargs=None, keywords=None, defaults=[\'0.0\', \'0.05\', \'None\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-random-uniform.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-random-uniform.pbtxt
index 4110bda5f6d54eb6853a10b5e31123e369ce1514..280b0a0243d5c0e4f595f6d0f8b8bcda8202cb5c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-random-uniform.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-random-uniform.pbtxt
@@ -1,12 +1,11 @@
 path: "tensorflow.keras.initializers.RandomUniform"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.initializers.RandomUniform\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.RandomUniform\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.RandomUniform\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'minval\', \'maxval\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'-0.05\', \'0.05\', \'None\', \"<dtype: \'float32\'>\"], "
+    argspec: "args=[\'self\', \'minval\', \'maxval\', \'seed\'], varargs=None, keywords=None, defaults=[\'-0.05\', \'0.05\', \'None\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-truncated-normal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-truncated-normal.pbtxt
index 0451d0d73a0b3ed718c4a95eaaecabbe51448b63..4076aa595fe9be1c77e25be3f9a09469cae9298b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-truncated-normal.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-truncated-normal.pbtxt
@@ -1,12 +1,11 @@
 path: "tensorflow.keras.initializers.TruncatedNormal"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.initializers.TruncatedNormal\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.TruncatedNormal\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.TruncatedNormal\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'mean\', \'stddev\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0.0\', \'0.05\', \'None\', \"<dtype: \'float32\'>\"], "
+    argspec: "args=[\'self\', \'mean\', \'stddev\', \'seed\'], varargs=None, keywords=None, defaults=[\'0.0\', \'0.05\', \'None\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-variance-scaling.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-variance-scaling.pbtxt
index 03f4064b9ef5093044a9cbb897043d643cf7f83e..a68219def66c5d68a189262408a7d1e16ac0c109 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-variance-scaling.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-variance-scaling.pbtxt
@@ -1,11 +1,11 @@
 path: "tensorflow.keras.initializers.VarianceScaling"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.VarianceScaling\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.VarianceScaling\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'scale\', \'mode\', \'distribution\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'1.0\', \'fan_in\', \'truncated_normal\', \'None\', \"<dtype: \'float32\'>\"], "
+    argspec: "args=[\'self\', \'scale\', \'mode\', \'distribution\', \'seed\'], varargs=None, keywords=None, defaults=[\'1.0\', \'fan_in\', \'truncated_normal\', \'None\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-zeros.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-zeros.pbtxt
index b6ab68e5beb47c9bcfbc52f9808255bbb03d2dc0..129fa18c6171cd04fbd2d023fe8c67b75de4e542 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-zeros.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.-zeros.pbtxt
@@ -1,11 +1,10 @@
 path: "tensorflow.keras.initializers.Zeros"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Zeros\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Zeros\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'dtype\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\"], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.constant.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.constant.pbtxt
index bddc37b907e7573c9fff27a0c3a5f7e199b88a9a..b03cbb8eb804ad80ce5c2d6e43fe07d4ac9db9cd 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.constant.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.constant.pbtxt
@@ -1,11 +1,11 @@
 path: "tensorflow.keras.initializers.constant"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Constant\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Constant\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'value\', \'dtype\', \'verify_shape\'], varargs=None, keywords=None, defaults=[\'0\', \"<dtype: \'float32\'>\", \'False\'], "
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=[\'0\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.glorot_normal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.glorot_normal.pbtxt
index ef0815972d219e7fee1e2a02f5eb53d26a41c734..02f8c252bdafc6ae5e0db1162ba2185c04981b63 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.glorot_normal.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.glorot_normal.pbtxt
@@ -1,12 +1,12 @@
 path: "tensorflow.keras.initializers.glorot_normal"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.GlorotNormal\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.VarianceScaling\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.GlorotNormal\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.VarianceScaling\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'float32\'>\"], "
+    argspec: "args=[\'self\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.glorot_uniform.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.glorot_uniform.pbtxt
index 439b5ada9bb3ff1f6267922a8c755d8f097b004a..6d18a3b6e7e063b5b3e172228a93b934a41736bf 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.glorot_uniform.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.glorot_uniform.pbtxt
@@ -1,12 +1,12 @@
 path: "tensorflow.keras.initializers.glorot_uniform"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.GlorotUniform\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.VarianceScaling\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.GlorotUniform\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.VarianceScaling\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'float32\'>\"], "
+    argspec: "args=[\'self\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.identity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.identity.pbtxt
index a4c5a6149047ffdaadde1243e4c80feae05cd77b..dcdb6ddf5f0c6256c7f1160996a08565330aa6c5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.identity.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.identity.pbtxt
@@ -1,11 +1,11 @@
 path: "tensorflow.keras.initializers.identity"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Identity\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Identity\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'gain\', \'dtype\'], varargs=None, keywords=None, defaults=[\'1.0\', \"<dtype: \'float32\'>\"], "
+    argspec: "args=[\'self\', \'gain\'], varargs=None, keywords=None, defaults=[\'1.0\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.normal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.normal.pbtxt
deleted file mode 100644
index 8d0b5c242bd97f6b85b34408fd6d96fadec530e5..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.normal.pbtxt
+++ /dev/null
@@ -1,19 +0,0 @@
-path: "tensorflow.keras.initializers.normal"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.initializers.RandomNormal\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.RandomNormal\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'mean\', \'stddev\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0.0\', \'0.05\', \'None\', \"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.ones.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.ones.pbtxt
index a89f78d1e1a47c7cd5a252cfd0a7b2fa23979e90..cc2dd171dfceba916fed1a02bbfb19f26497adca 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.ones.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.ones.pbtxt
@@ -1,11 +1,10 @@
 path: "tensorflow.keras.initializers.ones"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Ones\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Ones\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'dtype\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\"], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.orthogonal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.orthogonal.pbtxt
index ee1e9bbae2b7130db5b96309e2d87719169d788a..855065c1634abe2c794fec705a5dbc004fd3e597 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.orthogonal.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.orthogonal.pbtxt
@@ -1,11 +1,11 @@
 path: "tensorflow.keras.initializers.orthogonal"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Orthogonal\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Orthogonal\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'gain\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\', \"<dtype: \'float32\'>\"], "
+    argspec: "args=[\'self\', \'gain\', \'seed\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.pbtxt
index 1540c2915bff8b49ab1619223a54c67814c69551..15a56fbb973ccc729d7b377ef6a20b426687690c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.pbtxt
@@ -4,6 +4,14 @@ tf_module {
     name: "Constant"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "GlorotNormal"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "GlorotUniform"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Identity"
     mtype: "<type \'type\'>"
@@ -56,10 +64,6 @@ tf_module {
     name: "identity"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "normal"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "ones"
     mtype: "<type \'type\'>"
@@ -68,22 +72,6 @@ tf_module {
     name: "orthogonal"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "random_normal"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "random_uniform"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "truncated_normal"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "uniform"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "zeros"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.random_normal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.random_normal.pbtxt
deleted file mode 100644
index bac8211a10a50a33f19f36bb3f6370f38518903f..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.random_normal.pbtxt
+++ /dev/null
@@ -1,19 +0,0 @@
-path: "tensorflow.keras.initializers.random_normal"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.initializers.RandomNormal\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.RandomNormal\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'mean\', \'stddev\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0.0\', \'0.05\', \'None\', \"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.random_uniform.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.random_uniform.pbtxt
deleted file mode 100644
index ab0d74d07171e3863be09b0d79045af7a7095587..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.random_uniform.pbtxt
+++ /dev/null
@@ -1,19 +0,0 @@
-path: "tensorflow.keras.initializers.random_uniform"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.initializers.RandomUniform\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.RandomUniform\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'minval\', \'maxval\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'-0.05\', \'0.05\', \'None\', \"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.truncated_normal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.truncated_normal.pbtxt
deleted file mode 100644
index 358cca2b9cf657f5db6533a5523bfb6393d1f36f..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.truncated_normal.pbtxt
+++ /dev/null
@@ -1,19 +0,0 @@
-path: "tensorflow.keras.initializers.truncated_normal"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.initializers.TruncatedNormal\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.TruncatedNormal\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'mean\', \'stddev\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0.0\', \'0.05\', \'None\', \"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.uniform.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.uniform.pbtxt
deleted file mode 100644
index e6c731361acde102dfc049a750637385555f9f43..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.uniform.pbtxt
+++ /dev/null
@@ -1,19 +0,0 @@
-path: "tensorflow.keras.initializers.uniform"
-tf_class {
-  is_instance: "<class \'tensorflow.python.keras.initializers.RandomUniform\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.RandomUniform\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'minval\', \'maxval\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'-0.05\', \'0.05\', \'None\', \"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.zeros.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.zeros.pbtxt
index a262390687f31a5fb79822e69273306b9e1897b5..f9b3359d7a9b7deda1c80ffce17b061d87f31235 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.zeros.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.initializers.zeros.pbtxt
@@ -1,11 +1,10 @@
 path: "tensorflow.keras.initializers.zeros"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Zeros\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Zeros\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'dtype\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\"], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt
index b84629540e700f242f885064c92309c294693a11..0c6c0a34b9f606398831c2a82e9b049fed96957a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Activation"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Activation\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt
index 5918a13ad8629582829049485e896688ecad9579..15bf03977dbc03660971fc7343cb0388d8696326 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ActivityRegularization"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.ActivityRegularization\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt
index 599da06427dfe4f28e757a7aac8d8a14856a4556..b265384d5980f4c4982ced19af0208427da56817 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Add\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt
index f9ff1538c8134d96051ad81d35c73e59c6a8cc57..3a0882daf1eefaf17f893c3b565784bdd60ac689 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.AlphaDropout"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.noise.AlphaDropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt
index 723fc9cdb0d0ad93470e22fd8c147d3ecc92af91..d2ee310d68e5da9f07f9cb6656165c97fccaa469 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt
index 957ce2f0ce86f8df3eb8b57606229fb661eb52f7..1da079f39e25b020406dabca46966d57dfb4451d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt
index a52c0af68175420dc2a1993d1f025d36705538e1..d96751ccf916aa9a9522f341a3befbc987f25125 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt
index a004db62ddcaaae02a411d8db51f4026ece1384d..3819e525a99b67337f1d3a36b30f6ee0dfefa03b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Average\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt
index 44f83d1387cb2ec681f50f7b1f0297f3f74594ed..47f6b397a70109e9cdb833eae0cbec8a753831f7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt
index 8378faf7188ec594865d4b68c8ea8cae284183ca..4b8cadca5344acd7fe279d0132184d666a94f448 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt
index 9d5655c9644e3a2394a346bed78fc478cf60ba8d..5c66da42e64b3cf57c2e03002bb95bc33eeb3ac2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.AveragePooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt
index 5da79268129fc5c08cbd37686333847cbb32730d..9ae69aa163c33359850e5a290613929fb65ad689 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.BatchNormalization"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.normalization.BatchNormalizationV2\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt
index d37a6b47105225d7b83b6a264b944ceeb583a6c4..95eb6f69ecc4fa313da9eca9b6bee185b647be03 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.wrappers.Bidirectional\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.wrappers.Wrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -17,6 +17,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -111,7 +115,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt
index 1ad7a91be0ba48d0dbab19da8c7cd9ca89095918..09410135435100779208ff1c3db9198fdc88b178 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Concatenate\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
index cb9abc25396bb63a3c40de5cc52f9df7ed20071e..941b6aa3dd3316d15edede430ef25da913bf0ce6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional_recurrent.ConvRNN2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activation"
@@ -42,6 +42,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "filters"
     mtype: "<type \'property\'>"
@@ -192,7 +196,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt
index 47dba1d81f8f97a60fe72ec521f82a78ee5f3505..4bf8336fb32013258ec17edb810a68e7c0d3cfb3 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
index fd649418961301f150aac3dabc1bdf0ade4a9c28..221addf20aec476c7533284f9d7437cb0e1a6bad 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -104,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt
index 1b1425d53197db8b59abf51fe93c0b0c45299956..1c95fcc8b543c3da318119f7c9ed64952ba5b819 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
index 1741063fe8b09acf3865e0a135e96bb715dcdcfa..994a507bfb20a675ef2b4f8f5a9b2284ee0c21ad 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -104,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt
index 50feb4f458ad1a9cb2b2bfe5d67997b7551eed74..ae251b529bd6243733263acad98367a1a64e6530 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt
index faaa535df9fe03ad07862f0793f8ebea67b405ca..1d73eecc37c30b0ce6ef723f56691cc502a5698e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
index 4079329d1ee2a61270fee38426bb8a0859c38ce3..d37ec0f5603237cb5c397dd8193f51d5e50db0ed 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -104,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt
index 32e56696e1617f7810792e3416a2ebb2037d23c2..bb3c37d573fc2477a7327ea4dfec45a7a66e7c22 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
index 381abe73401fa3a588873d643324fc020c159e30..fc29f1c913adbfde33827d8095181763a43126d8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -104,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt
index b3e4bf9689dc7e9db63de7f43e9dfa9ac4d42b02..2658fb4e9984969c60257c17361799e2f56b1bb7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt
index 7aeff8003c322e8a8168dd70481a8b30b08762a8..58567eafa4134b747b4ca7ed9b71aa77d3a4c2e3 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Cropping1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Cropping1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt
index a1728d9d4f9a1e677646db04c4d0df9572e21208..42be76f4189d6fe299acb6fda744e268881de6f0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Cropping2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Cropping2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt
index 8d8fd142cc64ee113c4b6a7e4e2462ecc69b6028..11092f225d1c350ef81653555fb3cec8f6bda85a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Cropping3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Cropping3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt
index 0781a93bd56c5ebc77e1fb650497621e49d7ee1f..d6f7f3033ecf8e226b961dabfe59e751639e5b98 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt
@@ -1,8 +1,9 @@
 path: "tensorflow.keras.layers.DenseFeatures"
 tf_class {
   is_instance: "<class \'tensorflow.python.feature_column.feature_column_v2.DenseFeatures\'>"
+  is_instance: "<class \'tensorflow.python.feature_column.feature_column_v2._BaseFeaturesLayer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt
index 4960d0264e96e872ea5c49a8841cef20bd5eb37c..339c9f52b7ff45ebebce128980b3ed9fc47bfc5b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Dense"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dense\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
index 8fad7535f882718462a11e27e75732e3097cb87d..c2992de5a8f9757266fe681943e7a6b84039a893 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -104,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt
index 5b425f2d4d7a8a897280490e26922766d8bf7065..b37f4c845a45418ba8fc47c89718debb97de8f88 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Dot\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt
index f6c4d0a438ed027635b40ec992eb1bbcb5c9a3a1..d21f577721c3052507969c208870b9d69dd3313d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Dropout"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt
index 82b761fc1761bb3e7638f7a80bc80c6433162d04..f1e086b5dc50a3e47bbf2e9d3d130cade60b8902 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ELU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.ELU\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt
index c9ff323877e06b6dff274644744d425e3a9b7932..eb3496aa48fa43066f4f394ba2039416b4157f33 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Embedding"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.embeddings.Embedding\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt
index 9b4165d4cbf88fefd2bb684dae70ea8afc01357b..bbe324c77255fe5ccba650566d4a455196ce49ac 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Flatten"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Flatten\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt
index f225f7c4309615919fb05df05f2ae664bde80097..dd93e32ddcecb70203798c65483638a328019d3a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.GRUCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.GRUCell\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt
index 855d001700179fb634d1dff78585d340420abe7f..32e69856b9e569718e96ea1e0054b4229d47707f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt
@@ -1,9 +1,10 @@
 path: "tensorflow.keras.layers.GRU"
 tf_class {
+  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.UnifiedGRU\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.GRU\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activation"
@@ -33,6 +34,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "implementation"
     mtype: "<type \'property\'>"
@@ -155,7 +160,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'units\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\', \'implementation\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\', \'unroll\', \'reset_after\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\', \'1\', \'False\', \'False\', \'False\', \'False\', \'False\', \'False\'], "
+    argspec: "args=[\'self\', \'units\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\', \'implementation\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\', \'unroll\', \'time_major\', \'reset_after\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\', \'1\', \'False\', \'False\', \'False\', \'False\', \'False\', \'False\', \'True\'], "
   }
   member_method {
     name: "add_loss"
@@ -175,7 +180,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt
index 2c404c99cd2175cdc8b60b229e4410bf280ebcb7..e24862632e73c9030dbce521df445da2521f322d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.GaussianDropout"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.noise.GaussianDropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt
index 6f109d59d0f6fcd2b4650719e3b4f653baec7d23..b1b5759881792810b4b11e279bd699652c256334 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.GaussianNoise"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.noise.GaussianNoise\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
index 69f8a9031d32eb73bb44291cdf330d738d745cf9..9c4087a06f1776c3372bcc8524f1ea48af1b7447 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
index 4299f765e525b136e289bba169becec06e19ffb1..d56429442c626340b0f38becf452668ff1c4535f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
index 9153a1a2406b6fc4ab60c80fee2f8d6d69b00b72..089145ead9f0aa35f4a6e2d41b5dfb8425c68557 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
index 625e81fd2322ceba153fa65c138948ce43843089..2bc02b6f69f2468449c8d551d42e3b12e219964a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
index 2fc769742c70c5665c9cb77ad246fcdb49366d5a..c2510d129460b8819da832b3354321db49868a29 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
index e307a65c7c565660e1f2b6b6b74dc5970425eaa4..845d6b17015a4168426b430add12d303fe14cf5a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalAveragePooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
index 4394ad0364e89fd3531d6625e52540991cadf973..f6fa8659dc4cf316968dd9f48572532fc0b4e5aa 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
index 050ed39fe98dc7cfdf6febe45e235d3ae7cbf486..1285e21f714e3cac3288e4d5c8b883c5ec909d7c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
index 436191821ef4689351b6124cf2a20afad917e4ab..a1417e46ceb713440bfcb16d7f29d415883a3633 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
index 4ba540aa6adc72b572aa9340f89967d69ab78a3c..ff4da8ba542732f48388428f3dcf9452d41a3320 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
index a2e9322cb3fd4e56af708d5c4e17b660f7bc2247..7140d5718ba61f508a1d00729a4777745994bee4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
index 5d16a57fc1aeff9939220de8043fcae39e3d953e..4edeb9788d88000634ee1aadab9fea69fb2c83c6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalMaxPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt
index 9dd29c1251ef2eacaf535a3f10f3d42dc36624a2..48609567d5bed7246d0162ddfb8da92090c5640f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.InputLayer"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.input_layer.InputLayer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
index 0045d5775e2c19df21428bd4420b6e5612c8002b..7398613812d0b5dc7d3f9bb62fdda0bd08d11b60 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.LSTMCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.LSTMCell\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt
index 529c750f98715ec30313ed34c9023a845061a3df..c9b759d792752a372e1ba35b77fc964c9da4eea1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt
@@ -1,9 +1,10 @@
 path: "tensorflow.keras.layers.LSTM"
 tf_class {
+  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.UnifiedLSTM\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.LSTM\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activation"
@@ -33,6 +34,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "implementation"
     mtype: "<type \'property\'>"
@@ -155,7 +160,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'units\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'unit_forget_bias\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\', \'implementation\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\', \'unroll\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'True\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\', \'1\', \'False\', \'False\', \'False\', \'False\', \'False\'], "
+    argspec: "args=[\'self\', \'units\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'unit_forget_bias\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\', \'implementation\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\', \'time_major\', \'unroll\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'True\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\', \'1\', \'False\', \'False\', \'False\', \'False\', \'False\', \'False\'], "
   }
   member_method {
     name: "add_loss"
@@ -175,7 +180,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt
index d4d1bc6b6bbf0ce39742b740aff6dc0c1cd464a1..88f1f8b06d1e887385ce44903ada3dfb3c934e21 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Lambda"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Lambda\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt
index e1f5491180903f7d6931cc09755cabb715bbf233..c95f9159cdf2549cce9d645d9cb9b4802ea87018 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.keras.layers.Layer"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -11,6 +11,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -81,7 +85,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'trainable\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'trainable\', \'name\', \'dtype\', \'dynamic\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\', \'None\', \'False\'], "
   }
   member_method {
     name: "add_loss"
@@ -101,7 +105,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
index 9b69d9a9447f42907236b5cc8c7672012f96c38a..80d1c32dede3d72a4bb5f150d9c9d51143b7f5ff 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.LeakyReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.LeakyReLU\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-linear-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-linear-model.pbtxt
index 2b66576c96b8503d3ebb90f02ed19233223a269a..9bafe013efed1d59e1a6c43600cbe35593b04f97 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-linear-model.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-linear-model.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -18,6 +18,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -136,7 +140,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -168,19 +172,19 @@ tf_class {
   }
   member_method {
     name: "evaluate"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "fit"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_freq\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "fit_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'validation_freq\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'1\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
   }
   member_method {
     name: "from_config"
@@ -236,11 +240,11 @@ tf_class {
   }
   member_method {
     name: "predict"
-    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "predict_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "predict_on_batch"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt
index fd52259432577ac94dc702d4411ad5c0eed1ff10..b050302861485258c0043d97bb325860f770fb06 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.LocallyConnected1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.local.LocallyConnected1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt
index 5fc8af0d03564c649dff6e9df70d10731319de40..3bb780cdb0ae3b2f2c8b95f1e41c524f12d49162 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.LocallyConnected2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.local.LocallyConnected2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt
index 7f8932270e63bc02852c5b64e53694e7e26be08b..690208be83a58bf107f02fa342812c9d29319183 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Masking"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Masking\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt
index 4723b99cb0792e1ce0bdc45e46908da8c2b5359c..02f3186dc60056844a43821411e4438eb123fb88 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt
index 173c5d4a8b149c4e23683cf375e8d793db7faa5a..f2e9a3b13665024e8102f1db84293b73fa094d42 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt
index 14e1899e145224e411d65cbf481060a3b2cec0f1..868faa03919a0d656394a691200e622d987f4be1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt
index a708e652bf0e82dea0f58034a81a040a39550dc9..8e1662630c18d8662145226c6a54a05f49cab24b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt
index e6706b5cf9f32bda78adc4e2db5916a5750cc82e..ab96640936137bbcd3e73b1dbb56060dfe4943d8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt
index a73c082d1bba0453b742f76bacf0ad6116ba79a7..4f492f50fbd7b571b5fc853e6523c9cb56d73e82 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.MaxPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt
index f3f195554bbf4a43efaf2af0fd278a23bf270994..702f2e88d0b32776ab2e45bf968843e8df1e5578 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Maximum\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt
index f345d1d67b2ce0200c64b1aeea5f39821d070bac..fef939090d8d1e4ffc17b290754b16b87a07f38a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Minimum\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt
index 31cb8bc177c7a9e365101e75108a29900fbda124..1e2db3fb135ded60a33f8f4f3ffa92322d38dbd6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Multiply\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt
index 44cccc92bd2f1ff0335c22f2967865dc88a96ff7..1450047d7dd716b97a824c6718bef26373214114 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.PReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.PReLU\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt
index b55e191ff1ad6997550966bbb6154a81a489575d..d5d3e0333bfc5ef8bf56a153d04ccc9c9df85dc7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Permute"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Permute\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt
index e9575436e5b14ac8c52a0b59c86937886eab5f40..f1151f43c9df50850e61b5ab6e9ea7b7bc0184d9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.RNN"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -106,7 +110,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt
index 98223b207f2ecfd5b7af8a53390166e53a7d4f73..0874240fa6f60bd8c345fc1c2219f9f7bf843831 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.ReLU\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt
index 2df918b16b2552323d75083bfa80e328c0639cfe..de9f8fb5939fd11ccdde97e383e1f4a7f1d99e73 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.RepeatVector"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.RepeatVector\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt
index ce5f9e21290eeddc0052257191ac4a6d068c1366..a125754f43a31806b94b4775b4339e9c610a8ee0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Reshape"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.Reshape\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt
index a0bb917775fd9edb5d909bf850310e0596a88209..01fac3a2595838bdd17b8ae5b0dd93a7ed70d495 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -104,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt
index d7942f201bdbfa8d1577813be461a5905b5c6c90..80628d76d8224af71ebc12a1d735481ea6c0c654 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -104,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
index f7ac9042d46f46ab35d18c62e5d8841679a18ca9..41e96fde4b2a06c67fd2007f0b597104b4cf5b93 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -104,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
index e5a92688220f6e227b317d71a70fde01df4c432b..f48b0b3517f04dcba9ace267b4ffcfc5abb70c23 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.SeparableConv\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -104,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
index 0fe2c974a762784a82a6b97e116357be2a61d84f..5e799329c032a1e72a61378623e2844fa7e89401 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.SimpleRNNCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.SimpleRNNCell\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt
index 2ee5873f0f11688019dec3a6cd69db06d99b9caa..60893bbf1aa5a508cb4fedabc8cd90927b488f71 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.SimpleRNN\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activation"
@@ -33,6 +33,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -163,7 +167,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt
index 5b8f64aa35725d0ea44fc5c5b81952fd839503e7..c96405a434c9ba3cb770b3dcaf6cf551ac4f200c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Softmax"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.Softmax\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
index 240cb6e562f77467d94ef95db2374150e318bc04..153b7bc876738ef180901080bef3f8fdf2dd80f9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.SpatialDropout1D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
index 6226c469f8a534f96f6ea991fa5e7d2cf0019e3f..44e08117c3130df83572dd8409e5af273ac5b290 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.SpatialDropout2D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
index 34dabce6d8dd0b1b6fe50a008a981e1f06a77edf..6e9f624999444a54b5d7dd2e372f9d5d470a6011 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.core.SpatialDropout3D\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
index 0ddf628ace582db259ebe0b211aba6e6362b5d5b..57da4c0ba9d62aad702a6204b3ce41b48bf6d161 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.StackedRNNCells"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.recurrent.StackedRNNCells\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -110,7 +114,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt
index 12eb35ad154a514afd9c900cb2dbece8af28c49f..27eb794485a8c1c46cf918782b97724e9774eef4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.merge.Subtract\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
index c41020c2b45cc88c9b63f3b7a45c35066794dfe2..733070e50bc283729ecbd91b1b79af9521eba678 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ThresholdedReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.advanced_activations.ThresholdedReLU\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt
index 479f89cf6ae93e8d6ae02e304a51a145164df7de..009ecca9a7f490f4792c3c0649e3a9003d0e247d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.wrappers.TimeDistributed\'>"
   is_instance: "<class \'tensorflow.python.keras.layers.wrappers.Wrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -107,7 +111,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt
index 233363ce02614f184b43a059889c7475b6a8c50b..f465aa64b91f644d5d2f292f0a67a3300ed3f488 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.UpSampling1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.UpSampling1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt
index cb6228ac446bd236df88f94eb6e9e717ea38463d..049da3deddc0fa56d4ae2bfd6552452473d0b7d4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.UpSampling2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.UpSampling2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt
index 03bad3ccb613a225ad56e128ea680fc9312151e1..1d50c891154dd86e4ebe5b481a137e4c7ce248a0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.UpSampling3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.UpSampling3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt
index 158996792a47fab0e7aa26d21d4bb7f281ca76d2..6604ac05d910abd89e3c04f6a8194adac582963e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.Wrapper"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.wrappers.Wrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -106,7 +110,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt
index 63a56cd3eebe271f66258c9a0acb974764555b34..2c8d52765d55357fb2f02f5268b089b27fa20429 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ZeroPadding1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.ZeroPadding1D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt
index 965a4cca04651e123c5bd93484200a58b39918ba..bf9f43c1dbfbddb0ddd19b26d554df86ba493d25 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ZeroPadding2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.ZeroPadding2D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt
index 1a624308878a68f1b48cb0f8b5e08dafbbfa0333..a78cfa826bfdf5f49947cd9ae2f9879bf36328e2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.keras.layers.ZeroPadding3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.layers.convolutional.ZeroPadding3D\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -12,6 +12,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -102,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.-layer-normalization.pbtxt
similarity index 75%
rename from tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
rename to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.-layer-normalization.pbtxt
index 7c463ff1257599366be049edce6cc06140906286..9b5598eed07c9f04feb0d90820381abc12dbb456 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.-layer-normalization.pbtxt
@@ -1,21 +1,19 @@
-path: "tensorflow.keras.layers.CuDNNLSTM"
+path: "tensorflow.keras.layers.experimental.LayerNormalization"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.layers.cudnn_recurrent.CuDNNLSTM\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.cudnn_recurrent._CuDNNRNN\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.recurrent.RNN\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.normalization.LayerNormalization\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
     mtype: "<type \'property\'>"
   }
   member {
-    name: "cell"
+    name: "dtype"
     mtype: "<type \'property\'>"
   }
   member {
-    name: "dtype"
+    name: "dynamic"
     mtype: "<type \'property\'>"
   }
   member {
@@ -66,10 +64,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "states"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -92,7 +86,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'units\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'unit_forget_bias\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\'], varargs=None, keywords=kwargs, defaults=[\'glorot_uniform\', \'orthogonal\', \'zeros\', \'True\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'False\', \'False\', \'False\', \'False\'], "
+    argspec: "args=[\'self\', \'norm_axis\', \'params_axis\', \'epsilon\', \'center\', \'scale\', \'beta_initializer\', \'gamma_initializer\', \'beta_regularizer\', \'gamma_regularizer\', \'beta_constraint\', \'gamma_constraint\', \'trainable\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'-1\', \'1e-12\', \'True\', \'True\', \'zeros\', \'ones\', \'None\', \'None\', \'None\', \'None\', \'True\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -112,7 +106,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -124,11 +118,11 @@ tf_class {
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\', \'initial_state\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "compute_mask"
-    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "compute_output_shape"
@@ -146,10 +140,6 @@ tf_class {
     name: "get_config"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "get_initial_state"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -164,7 +154,7 @@ tf_class {
   }
   member_method {
     name: "get_losses_for"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "get_output_at"
@@ -186,10 +176,6 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "reset_states"
-    argspec: "args=[\'self\', \'states\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0f229615461dc7b781c0ba2ec6f81692d65354bf
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.pbtxt
@@ -0,0 +1,7 @@
+path: "tensorflow.keras.layers.experimental"
+tf_module {
+  member {
+    name: "LayerNormalization"
+    mtype: "<type \'type\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.pbtxt
index 3b4724ef104878df0caada75b0ba68740dc93f8a..f878c460596ccda1ce24417f2a260f7a5e69b755 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.pbtxt
@@ -112,14 +112,6 @@ tf_module {
     name: "Cropping3D"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "CuDNNGRU"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "CuDNNLSTM"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "Dense"
     mtype: "<type \'type\'>"
@@ -404,6 +396,10 @@ tf_module {
     name: "ZeroPadding3D"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
+  }
   member_method {
     name: "Input"
     argspec: "args=[\'shape\', \'batch_size\', \'name\', \'dtype\', \'sparse\', \'tensor\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'False\', \'None\'], "
@@ -420,6 +416,10 @@ tf_module {
     name: "concatenate"
     argspec: "args=[\'inputs\', \'axis\'], varargs=None, keywords=kwargs, defaults=[\'-1\'], "
   }
+  member_method {
+    name: "deserialize"
+    argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "dot"
     argspec: "args=[\'inputs\', \'axes\', \'normalize\'], varargs=None, keywords=kwargs, defaults=[\'False\'], "
@@ -436,6 +436,10 @@ tf_module {
     name: "multiply"
     argspec: "args=[\'inputs\'], varargs=None, keywords=kwargs, defaults=None"
   }
+  member_method {
+    name: "serialize"
+    argspec: "args=[\'layer\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "subtract"
     argspec: "args=[\'inputs\'], varargs=None, keywords=kwargs, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-binary-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-binary-crossentropy.pbtxt
index 2f7da93f6f412ca559aec2f6acde2b80a5c93c86..1242eec68f1414f1c8e67bb95602687f4a58412f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-binary-crossentropy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-binary-crossentropy.pbtxt
@@ -1,11 +1,12 @@
 path: "tensorflow.keras.losses.BinaryCrossentropy"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.losses.BinaryCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'sum_over_batch_size\', \'None\'], "
+    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'sum_over_batch_size\', \'binary_crossentropy\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-categorical-crossentropy.pbtxt
index b3a7cd80973259bd5cdfe382c656a9478f8933d8..cf3c2de840450de8e9467269ec446172583e8ffd 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-categorical-crossentropy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-categorical-crossentropy.pbtxt
@@ -1,11 +1,12 @@
 path: "tensorflow.keras.losses.CategoricalCrossentropy"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.losses.CategoricalCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'sum_over_batch_size\', \'None\'], "
+    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'sum_over_batch_size\', \'categorical_crossentropy\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-categorical-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-categorical-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fa374afb28bc4d7fe226456743c285b4f539ced1
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-categorical-hinge.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.keras.losses.CategoricalHinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.CategoricalHinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'categorical_hinge\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-cosine-similarity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-cosine-similarity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..aa14c44fa3628236033e952b69f3a160c49a36fc
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-cosine-similarity.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.keras.losses.CosineSimilarity"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.CosineSimilarity\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'axis\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'sum_over_batch_size\', \'cosine_similarity\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a4c25eefcbbc75afb3765b11e325f6bd830ccba8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-hinge.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.keras.losses.Hinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.Hinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-huber.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-huber.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1fa8ffa95726f72b620c3908b48fe20dfae1dc17
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-huber.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.keras.losses.Huber"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.Huber\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'delta\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'1.0\', \'sum_over_batch_size\', \'huber_loss\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-k-l-divergence.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-k-l-divergence.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d950c789eb44fcad792a9d11856ce11143715807
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-k-l-divergence.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.keras.losses.KLDivergence"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.KLDivergence\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'kullback_leibler_divergence\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-log-cosh.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-log-cosh.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fbbd5317f89f801e8a4f4cc80e700e2b478ebf40
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-log-cosh.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.keras.losses.LogCosh"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.LogCosh\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'logcosh\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-log-loss.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-log-loss.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..40ca239cceec8a726f6fbb0b2a2c633d1499afa7
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-log-loss.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.keras.losses.LogLoss"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.LogLoss\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'logloss\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-loss.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-loss.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..04a9cc94201a5472a7c6158acfc4bfd48d4f74db
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-loss.pbtxt
@@ -0,0 +1,21 @@
+path: "tensorflow.keras.losses.Loss"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-absolute-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-absolute-error.pbtxt
index 712bb2ecd3526c354cbcf640e689526b2e415a13..9da6b59ec83bb5b74336a122a791a0d5ea3eb079 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-absolute-error.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-absolute-error.pbtxt
@@ -1,11 +1,12 @@
 path: "tensorflow.keras.losses.MeanAbsoluteError"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.losses.MeanAbsoluteError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'mean_absolute_error\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-absolute-percentage-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-absolute-percentage-error.pbtxt
index 7fe362da89b47a925cd4708909e1c882a9a23aca..7c3ae9b49a415c1586df01984bd73af38ee97558 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-absolute-percentage-error.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-absolute-percentage-error.pbtxt
@@ -1,11 +1,12 @@
 path: "tensorflow.keras.losses.MeanAbsolutePercentageError"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.losses.MeanAbsolutePercentageError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'mean_absolute_percentage_error\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-squared-error.pbtxt
index a5718533500d9508c558d25d13fc6b61518a73a0..2126ac68d2a4cd8f1b68466e073ec573d13f2cda 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-squared-error.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-squared-error.pbtxt
@@ -1,11 +1,12 @@
 path: "tensorflow.keras.losses.MeanSquaredError"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.losses.MeanSquaredError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'mean_squared_error\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-squared-logarithmic-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-squared-logarithmic-error.pbtxt
index 200006db355ca4dc8eb2f509bcb9da7543145548..6ef9610546a0ec662313534f424d49879187f302 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-squared-logarithmic-error.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-squared-logarithmic-error.pbtxt
@@ -1,11 +1,12 @@
 path: "tensorflow.keras.losses.MeanSquaredLogarithmicError"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.losses.MeanSquaredLogarithmicError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'mean_squared_logarithmic_error\'], "
   }
   member_method {
     name: "call"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-poisson.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-poisson.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..61c90c3140e2b68b9796873b0de73668f1508476
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-poisson.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.keras.losses.Poisson"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.Poisson\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'poisson\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-reduction.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-reduction.pbtxt
index f20ed26e2ea2819554159a9bcecb4141601e4a19..e93be80f1f702eacda20e4eefbec12dad724edaa 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-reduction.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-reduction.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.keras.losses.Reduction"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.losses.losses_impl.ReductionV2\'>"
+  is_instance: "<class \'tensorflow.python.keras.utils.losses_utils.ReductionV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "NONE"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-sparse-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-sparse-categorical-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c13f9f967db7014548de1283c5d59bbac403299a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-sparse-categorical-crossentropy.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.keras.losses.SparseCategoricalCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.SparseCategoricalCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'from_logits\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-squared-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-squared-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fabe4c7814462b91a12062bac5c2119cfd45bccf
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-squared-hinge.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.keras.losses.SquaredHinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.SquaredHinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'squared_hinge\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.pbtxt
index c198096d252cd9a3706bcbf6f1e4a1199ec7a1f7..4ec0e887fc07ce66c554e2b4e23a06fc68a3e794 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.pbtxt
@@ -8,6 +8,38 @@ tf_module {
     name: "CategoricalCrossentropy"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "CategoricalHinge"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CosineSimilarity"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Hinge"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Huber"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "KLDivergence"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "LogCosh"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "LogLoss"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Loss"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "MeanAbsoluteError"
     mtype: "<type \'type\'>"
@@ -24,10 +56,22 @@ tf_module {
     name: "MeanSquaredLogarithmicError"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "Poisson"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Reduction"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SparseCategoricalCrossentropy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SquaredHinge"
+    mtype: "<type \'type\'>"
+  }
   member_method {
     name: "KLD"
     argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
@@ -50,23 +94,19 @@ tf_module {
   }
   member_method {
     name: "binary_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'False\', \'0\'], "
   }
   member_method {
     name: "categorical_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'False\', \'0\'], "
   }
   member_method {
     name: "categorical_hinge"
     argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
-    name: "cosine"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "cosine_proximity"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    name: "cosine_similarity"
+    argspec: "args=[\'y_true\', \'y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], "
   }
   member_method {
     name: "deserialize"
@@ -134,7 +174,7 @@ tf_module {
   }
   member_method {
     name: "sparse_categorical_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'axis\'], varargs=None, keywords=None, defaults=[\'False\', \'-1\'], "
   }
   member_method {
     name: "squared_hinge"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-a-u-c.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-a-u-c.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6e00a3a355269a0ccc5d69b3fcea106c4908e115
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-a-u-c.pbtxt
@@ -0,0 +1,200 @@
+path: "tensorflow.keras.metrics.AUC"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.AUC\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_thresholds\', \'curve\', \'summation_method\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'200\', \'ROC\', \'interpolation\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "interpolate_pr_auc"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-accuracy.pbtxt
index 2db07df5235e150f691a12d6b332c6d0d241ac19..18cde2f32aa10100c63d81470cb6e0fd2e61d4f0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-accuracy.pbtxt
@@ -3,9 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Accuracy\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -15,6 +16,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -105,7 +110,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-accuracy.pbtxt
index 904ad3a21a05895b23e30dab82a89a31c74dcfca..c5d2fc9c5397ae20dbd0c7f8f7ce7801c63c3997 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-accuracy.pbtxt
@@ -3,9 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.BinaryAccuracy\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -15,6 +16,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -105,7 +110,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a8662459c9ce52da3a42e9c5e47c52b6deb6ab06
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-crossentropy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.BinaryCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.BinaryCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'binary_crossentropy\', \'None\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
index 17b74924fab4f596a010d6b9731b474433a8153e..998c4cbb1fad2352cfb9a510ba6e9b153860fcf8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
@@ -3,9 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.CategoricalAccuracy\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -15,6 +16,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -105,7 +110,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..04f1794aba61aae085a7580806e524eea8b2a791
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-crossentropy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.CategoricalCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.CategoricalCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'categorical_crossentropy\', \'None\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..19442b5028dda68548c19c74e0828abf4fd54534
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-hinge.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.CategoricalHinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.CategoricalHinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'categorical_hinge\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-cosine-similarity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-cosine-similarity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..678c7b0681fe4281893fba70b4652233a91e2a0c
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-cosine-similarity.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.CosineSimilarity"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.CosineSimilarity\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'axis\'], varargs=None, keywords=None, defaults=[\'cosine_similarity\', \'None\', \'-1\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-negatives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-negatives.pbtxt
index 49f577e1367aece126449923f77f4f6c89493e99..5a94569660fdc31f1889b5ca64f1483970cb5235 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-negatives.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-negatives.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -104,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-positives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-positives.pbtxt
index e8baf858669a446a11b44e044f36bfde61e440bb..9033d9e655b2f2b80836153c23d9927315360de6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-positives.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-positives.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -104,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dedc64f1375b66b90f655f280c1a56ba165cfa17
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-hinge.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.Hinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Hinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'hinge\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-k-l-divergence.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-k-l-divergence.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..af8366b60876cb31f840c5f5007e67980be8dc3a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-k-l-divergence.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.KLDivergence"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.KLDivergence\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'kullback_leibler_divergence\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-log-cosh-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-log-cosh-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a7e072e21cc94492ed27186f44b92863cd791d62
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-log-cosh-error.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.LogCoshError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.LogCoshError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'logcosh\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..75173ad17a9c1fa02451287adad10870a60d653b
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-error.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.MeanAbsoluteError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanAbsoluteError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_absolute_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7be81b63bbe01b8534bd64d163e735d735ff88f3
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-absolute-percentage-error.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.MeanAbsolutePercentageError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanAbsolutePercentageError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_absolute_percentage_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-io-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-io-u.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..21e44ed988494119662e5e1a5101edbe4d7a35fd
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-io-u.pbtxt
@@ -0,0 +1,196 @@
+path: "tensorflow.keras.metrics.MeanIoU"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanIoU\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_classes\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-relative-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-relative-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8ef17fc34566e8ab6c5cc73781b40cb0f7396067
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-relative-error.pbtxt
@@ -0,0 +1,198 @@
+path: "tensorflow.keras.metrics.MeanRelativeError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanRelativeError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'normalizer\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..363f532ba410f1ebae5f105769a0e69c2e2d0166
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-error.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.MeanSquaredError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanSquaredError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_squared_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..712f10cd3051fe3de82472cb0eef2ec5fb53b6dd
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-squared-logarithmic-error.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.MeanSquaredLogarithmicError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanSquaredLogarithmicError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_squared_logarithmic_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-tensor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-tensor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fff91d2b44c6e1e7c1fa0339c737c4a44b9566b6
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean-tensor.pbtxt
@@ -0,0 +1,204 @@
+path: "tensorflow.keras.metrics.MeanTensor"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanTensor\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "count"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "total"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_tensor\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'values\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean.pbtxt
index 40fe64bbd2cec45b9a8c4e9b041d3fa858af1327..cffb444835c58c28953f85c61a8f2d98f2e74716 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean.pbtxt
@@ -1,9 +1,10 @@
 path: "tensorflow.keras.metrics.Mean"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -103,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-metric.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-metric.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ce746ab350bfa0534bf7f9ac7d6e8255c7749894
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-metric.pbtxt
@@ -0,0 +1,195 @@
+path: "tensorflow.keras.metrics.Metric"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-poisson.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-poisson.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..570b77408cbaa2b7a0089f9de8a528e604799abe
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-poisson.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.Poisson"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Poisson\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'poisson\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-precision.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-precision.pbtxt
index ae6a85026da80cd071984aede8d0ec4e9cd571c5..83535d56cfc37932be785684825bed0e29a4fa5e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-precision.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-precision.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Precision\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -83,7 +87,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'thresholds\', \'top_k\', \'class_id\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-recall.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-recall.pbtxt
index 31068a51d510a7b95f62f61f03d37176c0fca55d..9ec2bbc4a3c8709f162dc0407408b2fe29b695a8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-recall.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-recall.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.Recall\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -83,7 +87,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'thresholds\', \'top_k\', \'class_id\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -103,7 +107,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e2bdbd54e22756b823716c149cf0f24661acc812
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-root-mean-squared-error.pbtxt
@@ -0,0 +1,198 @@
+path: "tensorflow.keras.metrics.RootMeanSquaredError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.RootMeanSquaredError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'root_mean_squared_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
index aa77d1972cea42184fbbdb91e117b08ba38328fd..172c40eb2777d5504968de225718c270a0ce4e99 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.SensitivitySpecificityBase\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -104,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
index 0c17452292a031d42f3da0d5844e99d1272dad25..8a24088257a423c18f347eb256915bda10459e1f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
@@ -3,9 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.SparseCategoricalAccuracy\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -15,6 +16,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -105,7 +110,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0cadc9dcd99c03d81907ad5b1c03fd3cba25f833
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-crossentropy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.SparseCategoricalCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SparseCategoricalCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'from_logits\', \'axis\'], varargs=None, keywords=None, defaults=[\'sparse_categorical_crossentropy\', \'None\', \'False\', \'-1\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c135b8f680061a1e79fedd9d705d0fb54344823b
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-top-k-categorical-accuracy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.SparseTopKCategoricalAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SparseTopKCategoricalAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'k\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'5\', \'sparse_top_k_categorical_accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
index 67857aa89f1769c736d810cf5f73739021afeddf..4f6818797e1e55362a35d37e70a05bbb3b8d65b2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics.SensitivitySpecificityBase\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -104,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-squared-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-squared-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..90bbb087fafcdcde5dee048c45adbc45e3be2e55
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-squared-hinge.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.SquaredHinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SquaredHinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'squared_hinge\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sum.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..30ef19e02cfc99d117e6a396beeaf6422a105013
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sum.pbtxt
@@ -0,0 +1,197 @@
+path: "tensorflow.keras.metrics.Sum"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Sum\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'sum\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'values\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e59476a2410f859dff7171162a2cab123d5e853d
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-top-k-categorical-accuracy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.keras.metrics.TopKCategoricalAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.TopKCategoricalAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'k\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'5\', \'top_k_categorical_accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-negatives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-negatives.pbtxt
index 1b5eb8d0de53960c3a98409119709c1307aa6379..6627a460c7522358a6f44d415a2ad8ce59b97427 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-negatives.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-negatives.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -104,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-positives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-positives.pbtxt
index 5b9c470e32d7e038f9ba11e4f96ab6eaa6b60a87..8c3c2cb03a8b28db6212e29e0cb9b7b61fca7174 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-positives.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-positives.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
   is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -104,7 +108,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.pbtxt
index 905021dd790205e64a6f9839218200db98941927..71e89765cb8a660e4843362f912d7d011ca4ec14 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.pbtxt
@@ -1,5 +1,9 @@
 path: "tensorflow.keras.metrics"
 tf_module {
+  member {
+    name: "AUC"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Accuracy"
     mtype: "<type \'type\'>"
@@ -8,10 +12,26 @@ tf_module {
     name: "BinaryAccuracy"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "BinaryCrossentropy"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "CategoricalAccuracy"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "CategoricalCrossentropy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CategoricalHinge"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CosineSimilarity"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "FalseNegatives"
     mtype: "<type \'type\'>"
@@ -20,10 +40,58 @@ tf_module {
     name: "FalsePositives"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "Hinge"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "KLDivergence"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "LogCoshError"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Mean"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "MeanAbsoluteError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanAbsolutePercentageError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanIoU"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanRelativeError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanSquaredError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanSquaredLogarithmicError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanTensor"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Metric"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Poisson"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Precision"
     mtype: "<type \'type\'>"
@@ -32,6 +100,10 @@ tf_module {
     name: "Recall"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "RootMeanSquaredError"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "SensitivityAtSpecificity"
     mtype: "<type \'type\'>"
@@ -40,10 +112,30 @@ tf_module {
     name: "SparseCategoricalAccuracy"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SparseCategoricalCrossentropy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SparseTopKCategoricalAccuracy"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "SpecificityAtSensitivity"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SquaredHinge"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Sum"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TopKCategoricalAccuracy"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "TrueNegatives"
     mtype: "<type \'type\'>"
@@ -78,7 +170,7 @@ tf_module {
   }
   member_method {
     name: "binary_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'False\', \'0\'], "
   }
   member_method {
     name: "categorical_accuracy"
@@ -86,15 +178,7 @@ tf_module {
   }
   member_method {
     name: "categorical_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
-  }
-  member_method {
-    name: "cosine"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "cosine_proximity"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'False\', \'0\'], "
   }
   member_method {
     name: "deserialize"
@@ -162,7 +246,7 @@ tf_module {
   }
   member_method {
     name: "sparse_categorical_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'axis\'], varargs=None, keywords=None, defaults=[\'False\', \'-1\'], "
   }
   member_method {
     name: "sparse_top_k_categorical_accuracy"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt
index c58c7bef22dd4bff95d8ff07a10e20bb1bc463ad..63100a2176869e0a4ff30355c3df7ceaefaef65e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -131,7 +135,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -163,19 +167,19 @@ tf_class {
   }
   member_method {
     name: "evaluate"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "fit"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_freq\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "fit_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'validation_freq\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'1\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
   }
   member_method {
     name: "from_config"
@@ -231,11 +235,11 @@ tf_class {
   }
   member_method {
     name: "predict"
-    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "predict_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "predict_on_batch"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt
index 473a1c16fb1edfbf37a7752e273566c1310853af..5c9ba04296ee39e1fdf014396d166d91ef5f714c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -136,7 +140,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -168,19 +172,19 @@ tf_class {
   }
   member_method {
     name: "evaluate"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "evaluate_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "fit"
-    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'validation_freq\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'1\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "fit_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'validation_freq\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'1\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
   }
   member_method {
     name: "from_config"
@@ -240,7 +244,7 @@ tf_class {
   }
   member_method {
     name: "predict"
-    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'10\', \'1\', \'False\'], "
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
   }
   member_method {
     name: "predict_classes"
@@ -248,7 +252,7 @@ tf_class {
   }
   member_method {
     name: "predict_generator"
-    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'callbacks\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'10\', \'1\', \'False\', \'0\'], "
   }
   member_method {
     name: "predict_on_batch"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adadelta.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adadelta.pbtxt
index b9ce154bddef609e0aaf6627d6f59de551e51e3b..8471803624634eb2d3bacd79e236e51d4488a764 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adadelta.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adadelta.pbtxt
@@ -1,15 +1,36 @@
 path: "tensorflow.keras.optimizers.Adadelta"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Adadelta\'>"
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adadelta.Adadelta\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'rho\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'1.0\', \'0.95\', \'None\', \'0.0\'], "
+    argspec: "args=[\'self\', \'learning_rate\', \'rho\', \'epsilon\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.95\', \'1e-07\', \'Adadelta\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "get_config"
@@ -19,6 +40,14 @@ tf_class {
     name: "get_gradients"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_updates"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
@@ -27,8 +56,16 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adagrad.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adagrad.pbtxt
index d0dc9e37a386a26143365eb443d5ba5fce8a87d9..0466ea65fa3ccaab1459841def55d6f907c7b14c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adagrad.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adagrad.pbtxt
@@ -1,15 +1,36 @@
 path: "tensorflow.keras.optimizers.Adagrad"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Adagrad\'>"
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adagrad.Adagrad\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'0.01\', \'None\', \'0.0\'], "
+    argspec: "args=[\'self\', \'learning_rate\', \'initial_accumulator_value\', \'epsilon\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.1\', \'1e-07\', \'Adagrad\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "get_config"
@@ -19,6 +40,14 @@ tf_class {
     name: "get_gradients"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_updates"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
@@ -27,8 +56,16 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adam.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adam.pbtxt
index 06815fa99a4a474ec131c29d0cbc78bb2b9cb72d..9762fad5d0fb7690e041b853eba65bee1583ad14 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adam.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adam.pbtxt
@@ -1,15 +1,36 @@
 path: "tensorflow.keras.optimizers.Adam"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Adam\'>"
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adam.Adam\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'beta_1\', \'beta_2\', \'epsilon\', \'decay\', \'amsgrad\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'None\', \'0.0\', \'False\'], "
+    argspec: "args=[\'self\', \'learning_rate\', \'beta_1\', \'beta_2\', \'epsilon\', \'amsgrad\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-07\', \'False\', \'Adam\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "get_config"
@@ -19,6 +40,14 @@ tf_class {
     name: "get_gradients"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_updates"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
@@ -27,8 +56,16 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adamax.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adamax.pbtxt
index 47b55fdb44e79e976b6de13d760a7cf175323c6c..f477a60d237f5801b8ac8713c150cd83f2b3d768 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adamax.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-adamax.pbtxt
@@ -1,15 +1,36 @@
 path: "tensorflow.keras.optimizers.Adamax"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Adamax\'>"
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adamax.Adamax\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'beta_1\', \'beta_2\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'0.002\', \'0.9\', \'0.999\', \'None\', \'0.0\'], "
+    argspec: "args=[\'self\', \'learning_rate\', \'beta_1\', \'beta_2\', \'epsilon\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-07\', \'Adamax\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "get_config"
@@ -19,6 +40,14 @@ tf_class {
     name: "get_gradients"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_updates"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
@@ -27,8 +56,16 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-ftrl.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-ftrl.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ad42c6b75b32947635ec1098a3d639e011ec3765
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-ftrl.pbtxt
@@ -0,0 +1,71 @@
+path: "tensorflow.keras.optimizers.Ftrl"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.ftrl.Ftrl\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'learning_rate\', \'learning_rate_power\', \'initial_accumulator_value\', \'l1_regularization_strength\', \'l2_regularization_strength\', \'name\', \'l2_shrinkage_regularization_strength\'], varargs=None, keywords=kwargs, defaults=[\'-0.5\', \'0.1\', \'0.0\', \'0.0\', \'Ftrl\', \'0.0\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_gradients"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-nadam.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-nadam.pbtxt
index 8c63a7dda98568b24ea1b3cda15d4c840fbfd804..3ffb4bb8b4dea5840013e830efc7eec6699f71ed 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-nadam.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-nadam.pbtxt
@@ -1,15 +1,36 @@
 path: "tensorflow.keras.optimizers.Nadam"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Nadam\'>"
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.nadam.Nadam\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'beta_1\', \'beta_2\', \'epsilon\', \'schedule_decay\'], varargs=None, keywords=kwargs, defaults=[\'0.002\', \'0.9\', \'0.999\', \'None\', \'0.004\'], "
+    argspec: "args=[\'self\', \'learning_rate\', \'beta_1\', \'beta_2\', \'epsilon\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-07\', \'Nadam\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "get_config"
@@ -19,6 +40,14 @@ tf_class {
     name: "get_gradients"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_updates"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
@@ -27,8 +56,16 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-optimizer.pbtxt
index 53d64dae932e250b9d81b2767a833de3bac8c403..9639c71ce415f5a942485fdc0d40f32c24f16b7d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-optimizer.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-optimizer.pbtxt
@@ -1,14 +1,35 @@
 path: "tensorflow.keras.optimizers.Optimizer"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "get_config"
@@ -18,6 +39,14 @@ tf_class {
     name: "get_gradients"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_updates"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
@@ -26,8 +55,16 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-r-m-sprop.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
index a1e9b8cceb95e8f25ac5f414fadacf237be33cd9..2a7603d69b4f55d23e03e6e3d4fa5e60aeaac4c6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-r-m-sprop.pbtxt
@@ -1,15 +1,36 @@
 path: "tensorflow.keras.optimizers.RMSprop"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.optimizers.RMSprop\'>"
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.rmsprop.RMSprop\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'rho\', \'epsilon\', \'decay\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'None\', \'0.0\'], "
+    argspec: "args=[\'self\', \'learning_rate\', \'rho\', \'momentum\', \'epsilon\', \'centered\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.0\', \'1e-07\', \'False\', \'RMSprop\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "get_config"
@@ -19,6 +40,14 @@ tf_class {
     name: "get_gradients"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_updates"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
@@ -27,8 +56,16 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-s-g-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-s-g-d.pbtxt
index a67fefb1bafebd62db9f6108f0fe1847b5d2e0cb..41635553347f5f1c04c221574ce7e5c6ac05275d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-s-g-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.-s-g-d.pbtxt
@@ -1,15 +1,36 @@
 path: "tensorflow.keras.optimizers.SGD"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.optimizers.SGD\'>"
-  is_instance: "<class \'tensorflow.python.keras.optimizers.Optimizer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.gradient_descent.SGD\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'lr\', \'momentum\', \'decay\', \'nesterov\'], varargs=None, keywords=kwargs, defaults=[\'0.01\', \'0.0\', \'0.0\', \'False\'], "
+    argspec: "args=[\'self\', \'learning_rate\', \'momentum\', \'nesterov\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.0\', \'False\', \'SGD\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "get_config"
@@ -19,6 +40,14 @@ tf_class {
     name: "get_gradients"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_updates"
     argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
@@ -27,8 +56,16 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.pbtxt
index 7257b02087e237eaa47ed6a042559aa1332fc87b..7a333834c267e59f7a09c4936b8ed59776be7ee5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.pbtxt
@@ -16,6 +16,10 @@ tf_module {
     name: "Adamax"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "Ftrl"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Nadam"
     mtype: "<type \'type\'>"
@@ -32,6 +36,10 @@ tf_module {
     name: "SGD"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "schedules"
+    mtype: "<type \'module\'>"
+  }
   member_method {
     name: "deserialize"
     argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.schedules.-exponential-decay.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.schedules.-exponential-decay.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..25ae478cb2c663b8a856bd29146558b808499079
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.schedules.-exponential-decay.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.optimizers.schedules.ExponentialDecay"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.ExponentialDecay\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initial_learning_rate\', \'decay_steps\', \'decay_rate\', \'staircase\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.schedules.-inverse-time-decay.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.schedules.-inverse-time-decay.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b2fe61f4d2cb8f76fe1c8d6261b5f383b79281f0
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.schedules.-inverse-time-decay.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.optimizers.schedules.InverseTimeDecay"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.InverseTimeDecay\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initial_learning_rate\', \'decay_steps\', \'decay_rate\', \'staircase\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.zeros.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.schedules.-learning-rate-schedule.pbtxt
similarity index 54%
rename from tensorflow/tools/api/golden/v2/tensorflow.initializers.zeros.pbtxt
rename to tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.schedules.-learning-rate-schedule.pbtxt
index 7df4237bb6537b39f42f7b3894beb1bec6641f6f..3b33bd7526bd3f67f54450f97adf3d1d4d717051 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.initializers.zeros.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.schedules.-learning-rate-schedule.pbtxt
@@ -1,11 +1,9 @@
-path: "tensorflow.initializers.zeros"
+path: "tensorflow.keras.optimizers.schedules.LearningRateSchedule"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Zeros\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'dtype\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\"], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.schedules.-piecewise-constant-decay.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.schedules.-piecewise-constant-decay.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6f1496492abfabb04bd47834d434ab8df05af705
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.schedules.-piecewise-constant-decay.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.optimizers.schedules.PiecewiseConstantDecay"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.PiecewiseConstantDecay\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'boundaries\', \'values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.schedules.-polynomial-decay.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.schedules.-polynomial-decay.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..728436c36111de60c3752e09049ffb5678e4b2d1
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.schedules.-polynomial-decay.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.optimizers.schedules.PolynomialDecay"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.PolynomialDecay\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initial_learning_rate\', \'decay_steps\', \'end_learning_rate\', \'power\', \'cycle\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0001\', \'1.0\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.schedules.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.schedules.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..024e472a734935e668b9d6ee6e9c115cc90bdcd8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.optimizers.schedules.pbtxt
@@ -0,0 +1,31 @@
+path: "tensorflow.keras.optimizers.schedules"
+tf_module {
+  member {
+    name: "ExponentialDecay"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "InverseTimeDecay"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "LearningRateSchedule"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "PiecewiseConstantDecay"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "PolynomialDecay"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "deserialize"
+    argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "serialize"
+    argspec: "args=[\'learning_rate_schedule\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-adjoint.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-adjoint.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..37344f70311bd225856ce72c52dfd0ac1fb09075
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-adjoint.pbtxt
@@ -0,0 +1,150 @@
+path: "tensorflow.linalg.LinearOperatorAdjoint"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_adjoint.LinearOperatorAdjoint\'>"
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "batch_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "domain_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph_parents"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_non_singular"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_positive_definite"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_self_adjoint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_square"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "operator"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "range_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "tensor_rank"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'operator\', \'is_non_singular\', \'is_self_adjoint\', \'is_positive_definite\', \'is_square\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_to_tensor"
+    argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
+  }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
+  member_method {
+    name: "assert_non_singular"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
+  }
+  member_method {
+    name: "assert_positive_definite"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_positive_definite\'], "
+  }
+  member_method {
+    name: "assert_self_adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_self_adjoint\'], "
+  }
+  member_method {
+    name: "batch_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
+  }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
+  member_method {
+    name: "determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
+  }
+  member_method {
+    name: "diag_part"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'diag_part\'], "
+  }
+  member_method {
+    name: "domain_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
+  }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
+  member_method {
+    name: "log_abs_determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
+  }
+  member_method {
+    name: "matmul"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'matmul\'], "
+  }
+  member_method {
+    name: "matvec"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'matvec\'], "
+  }
+  member_method {
+    name: "range_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'range_dimension_tensor\'], "
+  }
+  member_method {
+    name: "shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'shape_tensor\'], "
+  }
+  member_method {
+    name: "solve"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'solve\'], "
+  }
+  member_method {
+    name: "solvevec"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'solve\'], "
+  }
+  member_method {
+    name: "tensor_rank_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'tensor_rank_tensor\'], "
+  }
+  member_method {
+    name: "to_dense"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'to_dense\'], "
+  }
+  member_method {
+    name: "trace"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'trace\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-block-diag.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-block-diag.pbtxt
index 773c74e64d13ca4a840b7f599fc2cbe9c161cd03..ddef774a75157401354d29b75c7a00fbedfd9ec5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-block-diag.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-block-diag.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_block_diag.LinearOperatorBlockDiag\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -63,6 +67,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
@@ -95,6 +103,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant.pbtxt
index 533544d21f2753f785113a30518f4fcbcff96cd7..97a6b1a475b9555f6f5ded273050b5751625b78b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant.pbtxt
@@ -4,6 +4,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_circulant._BaseLinearOperatorCirculant\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -72,6 +76,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_hermitian_spectrum"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_hermitian_spectrum\'], "
@@ -116,6 +124,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
index e3926eb6d4714731d09ff9c5b75a89830c06e7c1..e2bfe7e7d852e825f826f9f785c40b9550f706be 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
@@ -4,6 +4,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_circulant._BaseLinearOperatorCirculant\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -72,6 +76,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_hermitian_spectrum"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_hermitian_spectrum\'], "
@@ -116,6 +124,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
index ba209df7824a9cc076499458e35acd7dcf1eaf35..8885526669065e5a5506bfe1bf93076f4584f9d9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
@@ -4,6 +4,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_circulant._BaseLinearOperatorCirculant\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -72,6 +76,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_hermitian_spectrum"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_hermitian_spectrum\'], "
@@ -116,6 +124,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-composition.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-composition.pbtxt
index 081fb0e08bcd1b35ab44459d1c8eb0857dd14956..2a017fcb8987d46d8e24d2d21b43ae9962ad2075 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-composition.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-composition.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_composition.LinearOperatorComposition\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -63,6 +67,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
@@ -95,6 +103,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-diag.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-diag.pbtxt
index 2014a04301618c20af5cf6f1144eb4dbda2479e1..31dcf7b0a6b4699e7009746fe62ec5551ee3e11b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-diag.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-diag.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_diag.LinearOperatorDiag\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -63,6 +67,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
@@ -95,6 +103,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-full-matrix.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
index 9a87ae9687741090485bd8d4d0d07d359a2015e7..0ad39b4ba6006a1efa6b16e650ef3140516775f6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_full_matrix.LinearOperatorFullMatrix\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -59,6 +63,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
@@ -91,6 +99,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-identity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-identity.pbtxt
index 33afb835ce1d524991c0024bfb87c29a72aac08e..f66a5a833a42c06f32696abc0a3114aa89f73a7e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-identity.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-identity.pbtxt
@@ -4,6 +4,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_identity.BaseLinearOperatorIdentity\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -60,6 +64,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'mat\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
@@ -92,6 +100,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-inversion.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-inversion.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a7eb144d83aaeb2997d44b703b46de9a01c3a478
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-inversion.pbtxt
@@ -0,0 +1,150 @@
+path: "tensorflow.linalg.LinearOperatorInversion"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_inversion.LinearOperatorInversion\'>"
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "batch_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "domain_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph_parents"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_non_singular"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_positive_definite"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_self_adjoint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_square"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "operator"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "range_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "tensor_rank"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'operator\', \'is_non_singular\', \'is_self_adjoint\', \'is_positive_definite\', \'is_square\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_to_tensor"
+    argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
+  }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
+  member_method {
+    name: "assert_non_singular"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
+  }
+  member_method {
+    name: "assert_positive_definite"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_positive_definite\'], "
+  }
+  member_method {
+    name: "assert_self_adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_self_adjoint\'], "
+  }
+  member_method {
+    name: "batch_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
+  }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
+  member_method {
+    name: "determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
+  }
+  member_method {
+    name: "diag_part"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'diag_part\'], "
+  }
+  member_method {
+    name: "domain_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
+  }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
+  member_method {
+    name: "log_abs_determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
+  }
+  member_method {
+    name: "matmul"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'matmul\'], "
+  }
+  member_method {
+    name: "matvec"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'matvec\'], "
+  }
+  member_method {
+    name: "range_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'range_dimension_tensor\'], "
+  }
+  member_method {
+    name: "shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'shape_tensor\'], "
+  }
+  member_method {
+    name: "solve"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'solve\'], "
+  }
+  member_method {
+    name: "solvevec"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'solve\'], "
+  }
+  member_method {
+    name: "tensor_rank_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'tensor_rank_tensor\'], "
+  }
+  member_method {
+    name: "to_dense"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'to_dense\'], "
+  }
+  member_method {
+    name: "trace"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'trace\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-kronecker.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-kronecker.pbtxt
index a9078c8ab5cca078237a29febabdbbd4a8b6c89c..c983f8c6e6aa53716d1c2d07f219baccda99bd04 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-kronecker.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-kronecker.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_kronecker.LinearOperatorKronecker\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -63,6 +67,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
@@ -95,6 +103,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
index 4cfa3bb30d7382f3cf3cc0d5ce412d230d2a4287..813aec2a137ccaaea9718b7d0254ed0d60500247 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_low_rank_update.LinearOperatorLowRankUpdate\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "base_operator"
     mtype: "<type \'property\'>"
@@ -83,6 +87,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
@@ -115,6 +123,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
index a87649133fd207ad59f2124c6b0b5aa44916e5a5..0bb7a15e1342aeb4be94e9a40e1e6b1828e397b8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_lower_triangular.LinearOperatorLowerTriangular\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -59,6 +63,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
@@ -91,6 +99,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
index 32656467840fbbc0c8708ea68aac5aa75c11a540..7747c985404e54f93d012aba86a39503a855c76d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
@@ -4,6 +4,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_identity.BaseLinearOperatorIdentity\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -64,6 +68,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'mat\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
@@ -96,6 +104,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-zeros.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-zeros.pbtxt
index 49d8890c8942bc0021886ee6c9bc4e7625452655..590782bbc1d57ed4efb1cfb68b145b49d64c4545 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-zeros.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-zeros.pbtxt
@@ -3,6 +3,10 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_zeros.LinearOperatorZeros\'>"
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -59,6 +63,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'mat\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
@@ -91,6 +99,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator.pbtxt
index c89dc067b331603e227d9d578147e2dd1ee4a900..ed6bfdff288220fc0bcdf9fb6c4c78abfe5e43b2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator.pbtxt
@@ -2,6 +2,10 @@ path: "tensorflow.linalg.LinearOperator"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "H"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "batch_shape"
     mtype: "<type \'property\'>"
@@ -58,6 +62,10 @@ tf_class {
     name: "add_to_tensor"
     argspec: "args=[\'self\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
   }
+  member_method {
+    name: "adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'adjoint\'], "
+  }
   member_method {
     name: "assert_non_singular"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
@@ -90,6 +98,10 @@ tf_class {
     name: "domain_dimension_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
   }
+  member_method {
+    name: "inverse"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'inverse\'], "
+  }
   member_method {
     name: "log_abs_determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt
index 3e1e2e3d54de3e2442299a783f933a60dfd2db6d..e46cb44ba56c1df7b4c004b35c21f80326f963a7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt
@@ -4,6 +4,10 @@ tf_module {
     name: "LinearOperator"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "LinearOperatorAdjoint"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "LinearOperatorBlockDiag"
     mtype: "<type \'type\'>"
@@ -36,6 +40,10 @@ tf_module {
     name: "LinearOperatorIdentity"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "LinearOperatorInversion"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "LinearOperatorKronecker"
     mtype: "<type \'type\'>"
@@ -196,4 +204,8 @@ tf_module {
     name: "triangular_solve"
     argspec: "args=[\'matrix\', \'rhs\', \'lower\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'False\', \'None\'], "
   }
+  member_method {
+    name: "tridiagonal_solve"
+    argspec: "args=[\'diagonals\', \'rhs\', \'diagonals_format\', \'transpose_rhs\', \'conjugate_rhs\', \'name\'], varargs=None, keywords=None, defaults=[\'compact\', \'False\', \'False\', \'None\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.-op-hint.-op-hint-argument-tracker.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.-op-hint.-op-hint-argument-tracker.pbtxt
index 1fe179f6c1b64ebc2f7535719bc1598577ee7f03..68cb07ea6fab85824400cce8408ebcb1dc030f8c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.lite.-op-hint.-op-hint-argument-tracker.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.-op-hint.-op-hint-argument-tracker.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'function_name\', \'unique_function_id\', \'node_name_prefix\', \'attr_name\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'function_name\', \'unique_function_id\', \'node_name_prefix\', \'attr_name\', \'level\', \'children_inputs_mappings\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
   }
   member_method {
     name: "add"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.-op-hint.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.-op-hint.pbtxt
index 66e692a5a379203cb491980802b7003072bfe76c..3ac478f7626556574983aed4e5d284cb758406c8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.lite.-op-hint.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.-op-hint.pbtxt
@@ -14,6 +14,10 @@ tf_class {
     name: "AGGREGATE_STACK"
     mtype: "<type \'str\'>"
   }
+  member {
+    name: "CHILDREN_INPUTS_MAPPINGS"
+    mtype: "<type \'str\'>"
+  }
   member {
     name: "FUNCTION_AGGREGATE_ATTR"
     mtype: "<type \'str\'>"
@@ -22,6 +26,10 @@ tf_class {
     name: "FUNCTION_INPUT_INDEX_ATTR"
     mtype: "<type \'str\'>"
   }
+  member {
+    name: "FUNCTION_LEVEL_ATTR"
+    mtype: "<type \'str\'>"
+  }
   member {
     name: "FUNCTION_NAME_ATTR"
     mtype: "<type \'str\'>"
@@ -48,7 +56,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'function_name\'], varargs=None, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'function_name\', \'level\', \'children_inputs_mappings\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'None\'], "
   }
   member_method {
     name: "add_input"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.-optimize.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.-optimize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fedb5ee9fa4a31f25133bef55b980c18ed74fb79
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.-optimize.pbtxt
@@ -0,0 +1,12 @@
+path: "tensorflow.lite.Optimize"
+tf_class {
+  is_instance: "<enum \'Optimize\'>"
+  member {
+    name: "OPTIMIZE_FOR_LATENCY"
+    mtype: "<enum \'Optimize\'>"
+  }
+  member {
+    name: "OPTIMIZE_FOR_SIZE"
+    mtype: "<enum \'Optimize\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.-representative-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.-representative-dataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d14b69531d183faa35d19f379d6b20c29b02e6e8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.-representative-dataset.pbtxt
@@ -0,0 +1,9 @@
+path: "tensorflow.lite.RepresentativeDataset"
+tf_class {
+  is_instance: "<class \'tensorflow.lite.python.lite.RepresentativeDataset\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'input_gen\', \'output_gen\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.-toco-converter.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.-toco-converter.pbtxt
deleted file mode 100644
index 3ef90b8bc4646a2adfcbeca2258ff5aa7cbf8894..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.lite.-toco-converter.pbtxt
+++ /dev/null
@@ -1,24 +0,0 @@
-path: "tensorflow.lite.TocoConverter"
-tf_class {
-  is_instance: "<class \'tensorflow.lite.python.lite.TocoConverter\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "from_frozen_graph"
-    argspec: "args=[\'cls\', \'graph_def_file\', \'input_arrays\', \'output_arrays\', \'input_shapes\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "from_keras_model_file"
-    argspec: "args=[\'cls\', \'model_file\', \'input_arrays\', \'input_shapes\', \'output_arrays\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "from_saved_model"
-    argspec: "args=[\'cls\', \'saved_model_dir\', \'input_arrays\', \'input_shapes\', \'output_arrays\', \'tag_set\', \'signature_key\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "from_session"
-    argspec: "args=[\'cls\', \'sess\', \'input_tensors\', \'output_tensors\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.experimental.nn.-t-f-lite-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.experimental.nn.-t-f-lite-l-s-t-m-cell.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..34b4133d0ca6edb929c780babd0652187d41c76b
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.experimental.nn.-t-f-lite-l-s-t-m-cell.pbtxt
@@ -0,0 +1,210 @@
+path: "tensorflow.lite.experimental.nn.TFLiteLSTMCell"
+tf_class {
+  is_instance: "<class \'tensorflow.lite.experimental.examples.lstm.rnn_cell.TFLiteLSTMCell\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
+  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_size"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "scope_name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "state_size"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_units\', \'use_peepholes\', \'cell_clip\', \'initializer\', \'num_proj\', \'proj_clip\', \'num_unit_shards\', \'num_proj_shards\', \'forget_bias\', \'state_is_tuple\', \'activation\', \'reuse\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'1.0\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'inputs_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "zero_state"
+    argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.experimental.nn.-tf-lite-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.experimental.nn.-tf-lite-r-n-n-cell.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2fff2b8606c98ddffabbd3d27e7a7848d1fce86a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.experimental.nn.-tf-lite-r-n-n-cell.pbtxt
@@ -0,0 +1,210 @@
+path: "tensorflow.lite.experimental.nn.TfLiteRNNCell"
+tf_class {
+  is_instance: "<class \'tensorflow.lite.experimental.examples.lstm.rnn_cell.TfLiteRNNCell\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
+  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_size"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "scope_name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "state_size"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_units\', \'activation\', \'reuse\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'inputs_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_initial_state"
+    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "zero_state"
+    argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.experimental.nn.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.experimental.nn.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5ce858c46ba304b7cc3ce6b257518cdcc9aac646
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.experimental.nn.pbtxt
@@ -0,0 +1,15 @@
+path: "tensorflow.lite.experimental.nn"
+tf_module {
+  member {
+    name: "TFLiteLSTMCell"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TfLiteRNNCell"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "dynamic_rnn"
+    argspec: "args=[\'cell\', \'inputs\', \'sequence_length\', \'initial_state\', \'dtype\', \'parallel_iterations\', \'swap_memory\', \'time_major\', \'scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'False\', \'True\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.experimental.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8d585ea6643b731fa4767301f13654fc699d6e23
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.experimental.pbtxt
@@ -0,0 +1,7 @@
+path: "tensorflow.lite.experimental"
+tf_module {
+  member {
+    name: "nn"
+    mtype: "<type \'module\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.pbtxt
index 154dd00821794ef4a5118e98d67e32beca38bebf..5fccb608abd6dc03b91707fcfcb403a9f0ebceca 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.lite.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.pbtxt
@@ -13,19 +13,23 @@ tf_module {
     mtype: "<class \'enum.EnumMeta\'>"
   }
   member {
-    name: "TFLiteConverter"
+    name: "Optimize"
+    mtype: "<class \'enum.EnumMeta\'>"
+  }
+  member {
+    name: "RepresentativeDataset"
     mtype: "<type \'type\'>"
   }
   member {
-    name: "TocoConverter"
+    name: "TFLiteConverter"
     mtype: "<type \'type\'>"
   }
   member {
     name: "constants"
     mtype: "<type \'module\'>"
   }
-  member_method {
-    name: "toco_convert"
-    argspec: "args=[\'input_data\', \'input_tensors\', \'output_tensors\'], varargs=args, keywords=kwargs, defaults=None"
+  member {
+    name: "experimental"
+    mtype: "<type \'module\'>"
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-binary-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-binary-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1d180a9fbad492089f37bc98de50904c8bfa4d38
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-binary-crossentropy.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.BinaryCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.BinaryCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'sum_over_batch_size\', \'binary_crossentropy\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-categorical-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3937dfa153a08843c12f1098c05e49e1b1a01c79
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-categorical-crossentropy.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.CategoricalCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.CategoricalCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'sum_over_batch_size\', \'categorical_crossentropy\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-categorical-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-categorical-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d2a064dd649535336ddfeda4f24b2594771bbbd5
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-categorical-hinge.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.CategoricalHinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.CategoricalHinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'categorical_hinge\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-cosine-similarity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-cosine-similarity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7829f0f327be676b1fe81775bf3a8a368e88c7db
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-cosine-similarity.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.CosineSimilarity"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.CosineSimilarity\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'axis\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'sum_over_batch_size\', \'cosine_similarity\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..155154c312a8b9bf034d5600bd2d859036934ce1
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-hinge.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.Hinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.Hinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-huber.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-huber.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5052c19a049c0defa2f5b64f3d16626dc95374cc
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-huber.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.Huber"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.Huber\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'delta\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'1.0\', \'sum_over_batch_size\', \'huber_loss\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-k-l-divergence.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-k-l-divergence.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b739c057b238b6a3dfa14e325deac3c4f4b46fc1
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-k-l-divergence.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.KLDivergence"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.KLDivergence\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'kullback_leibler_divergence\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-log-cosh.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-log-cosh.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..557cc210450c2553668f914c84218bb762202668
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-log-cosh.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.LogCosh"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.LogCosh\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'logcosh\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-log-loss.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-log-loss.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..65b1feaefddc93bc5b6da966525ace33fb047e07
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-log-loss.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.LogLoss"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.LogLoss\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'logloss\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-loss.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-loss.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2bcc6f8854368f919cf9b4254caa6fc2e071dc6d
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-loss.pbtxt
@@ -0,0 +1,21 @@
+path: "tensorflow.losses.Loss"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-absolute-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-absolute-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a33db29d421ccaab3bb829e3924cc3307039ad1b
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-absolute-error.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.MeanAbsoluteError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.MeanAbsoluteError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'mean_absolute_error\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-absolute-percentage-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-absolute-percentage-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4c79a5e8b8a9fc3779f93fa8e8fd29f28df9ec52
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-absolute-percentage-error.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.MeanAbsolutePercentageError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.MeanAbsolutePercentageError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'mean_absolute_percentage_error\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-squared-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b99e19413ed7a87509b1ff2296046e718c5dcc82
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-squared-error.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.MeanSquaredError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.MeanSquaredError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'mean_squared_error\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-squared-logarithmic-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-squared-logarithmic-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e01827391a796614bd0b69f36f0ae268ffb31a7b
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-mean-squared-logarithmic-error.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.MeanSquaredLogarithmicError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.MeanSquaredLogarithmicError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'mean_squared_logarithmic_error\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-poisson.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-poisson.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b6603cb407265b59ad17c2e428306e6b025aa9a1
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-poisson.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.Poisson"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.Poisson\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'poisson\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-reduction.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-reduction.pbtxt
index 6a44e4ce66c9dfcb9912c96d0106e4f4fd9fdcff..e4ae87ea29365b13b0c49e3ea2329550b0e420ed 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.losses.-reduction.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-reduction.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.losses.Reduction"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.losses.losses_impl.ReductionV2\'>"
+  is_instance: "<class \'tensorflow.python.keras.utils.losses_utils.ReductionV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "NONE"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-sparse-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-sparse-categorical-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5e3ce6fc728d594a3bc0c7a0ffe83f078e81fac8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-sparse-categorical-crossentropy.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.SparseCategoricalCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.SparseCategoricalCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'from_logits\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-squared-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-squared-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b5e3757143871b04a63c258d49f8ef30f52304fe
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-squared-hinge.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.losses.SquaredHinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.SquaredHinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.LossFunctionWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'squared_hinge\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.pbtxt
index 233b1a0131a4d292574be161de2d547cb0060c23..f688a32db7ae7212f4356de3fbc0e54729edd71d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.losses.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.pbtxt
@@ -1,27 +1,183 @@
 path: "tensorflow.losses"
 tf_module {
+  member {
+    name: "BinaryCrossentropy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CategoricalCrossentropy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CategoricalHinge"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CosineSimilarity"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Hinge"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Huber"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "KLDivergence"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "LogCosh"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "LogLoss"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Loss"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanAbsoluteError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanAbsolutePercentageError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanSquaredError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanSquaredLogarithmicError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Poisson"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Reduction"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SparseCategoricalCrossentropy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SquaredHinge"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "KLD"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MAE"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MAPE"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MSE"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MSLE"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "binary_crossentropy"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'False\', \'0\'], "
+  }
+  member_method {
+    name: "categorical_crossentropy"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'False\', \'0\'], "
+  }
+  member_method {
+    name: "categorical_hinge"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "cosine_similarity"
+    argspec: "args=[\'y_true\', \'y_pred\', \'axis\'], varargs=None, keywords=None, defaults=[\'-1\'], "
+  }
+  member_method {
+    name: "deserialize"
+    argspec: "args=[\'name\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get"
+    argspec: "args=[\'identifier\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "hinge"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "kld"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "kullback_leibler_divergence"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "logcosh"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mae"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mape"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mean_absolute_error"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mean_absolute_percentage_error"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mean_squared_error"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mean_squared_logarithmic_error"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mse"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
-    name: "add_loss"
-    argspec: "args=[\'loss\', \'loss_collection\'], varargs=None, keywords=None, defaults=[\'losses\'], "
+    name: "msle"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
-    name: "get_losses"
-    argspec: "args=[\'scope\', \'loss_collection\'], varargs=None, keywords=None, defaults=[\'None\', \'losses\'], "
+    name: "poisson"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
-    name: "get_regularization_loss"
-    argspec: "args=[\'scope\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'total_regularization_loss\'], "
+    name: "serialize"
+    argspec: "args=[\'loss\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
-    name: "get_regularization_losses"
-    argspec: "args=[\'scope\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    name: "sparse_categorical_crossentropy"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'axis\'], varargs=None, keywords=None, defaults=[\'False\', \'-1\'], "
   }
   member_method {
-    name: "get_total_loss"
-    argspec: "args=[\'add_regularization_losses\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'total_loss\'], "
+    name: "squared_hinge"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt
index 4ac0484050054abee9496bcf09d90ff58bbfb9d7..c2c5bb50b59815a154c52407c82648cc28d4b31b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt
@@ -120,6 +120,10 @@ tf_module {
     name: "divide"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "divide_no_nan"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "equal"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -260,10 +264,18 @@ tf_module {
     name: "multiply"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "multiply_no_nan"
+    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "negative"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "nextafter"
+    argspec: "args=[\'x1\', \'x2\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "not_equal"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -296,6 +308,10 @@ tf_module {
     name: "reduce_any"
     argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
+  member_method {
+    name: "reduce_euclidean_norm"
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "reduce_logsumexp"
     argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-a-u-c.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-a-u-c.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9aa59370a3050d67e35324831d688f23ab444303
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-a-u-c.pbtxt
@@ -0,0 +1,200 @@
+path: "tensorflow.metrics.AUC"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.AUC\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_thresholds\', \'curve\', \'summation_method\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'200\', \'ROC\', \'interpolation\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "interpolate_pr_auc"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f4cfade42ece20e113bfd41744f05a451bbba34c
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-accuracy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.Accuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Accuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-binary-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-binary-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..84198f3cf3d841627aa88d690673b5e8fb1838a1
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-binary-accuracy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.BinaryAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.BinaryAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'threshold\'], varargs=None, keywords=None, defaults=[\'binary_accuracy\', \'None\', \'0.5\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-binary-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-binary-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d51c6a798bd20f25b523dc142bc6cb4734b5b6a0
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-binary-crossentropy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.BinaryCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.BinaryCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'binary_crossentropy\', \'None\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..67e14faf3f950ed4d52c45111fa8c4a7023f7019
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-accuracy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.CategoricalAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.CategoricalAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'categorical_accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..33cd4c574931e8de692bc69a7dc85d98ca432fcb
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-crossentropy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.CategoricalCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.CategoricalCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'categorical_crossentropy\', \'None\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f5c90fab3fdbd901235bd7b5b10259fe2a67e071
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-categorical-hinge.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.CategoricalHinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.CategoricalHinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'categorical_hinge\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-cosine-similarity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-cosine-similarity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..326df8fcc2aa4beeabe11c4566d5d77b6ed13981
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-cosine-similarity.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.CosineSimilarity"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.CosineSimilarity\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'axis\'], varargs=None, keywords=None, defaults=[\'cosine_similarity\', \'None\', \'-1\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-false-negatives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-false-negatives.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e9e32ad53c790a6753a1764c959575ff3eee7631
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-false-negatives.pbtxt
@@ -0,0 +1,197 @@
+path: "tensorflow.metrics.FalseNegatives"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.FalseNegatives\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-false-positives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-false-positives.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..45a2c48acba5be6c53ba5666cf9e308f374f6372
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-false-positives.pbtxt
@@ -0,0 +1,197 @@
+path: "tensorflow.metrics.FalsePositives"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.FalsePositives\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d2132fda36f44150e7154323c5d5f1e317173777
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-hinge.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.Hinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Hinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'hinge\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-k-l-divergence.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-k-l-divergence.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bbd0db90fa517982b85f4fe071c16c7cef4f2f70
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-k-l-divergence.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.KLDivergence"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.KLDivergence\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'kullback_leibler_divergence\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-log-cosh-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-log-cosh-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b66eda8523d026935d7edc380d4c9a00e41e4a6b
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-log-cosh-error.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.LogCoshError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.LogCoshError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'logcosh\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-absolute-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-absolute-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..90a6b0664165e37a61bccd7468dab19c7105d3d2
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-absolute-error.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.MeanAbsoluteError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanAbsoluteError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_absolute_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-absolute-percentage-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-absolute-percentage-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8710a45cccd402e6e298044a53c957bd6797342f
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-absolute-percentage-error.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.MeanAbsolutePercentageError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanAbsolutePercentageError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_absolute_percentage_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-io-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-io-u.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dcb48837860114f5073206abe447151fe130f5d4
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-io-u.pbtxt
@@ -0,0 +1,196 @@
+path: "tensorflow.metrics.MeanIoU"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanIoU\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_classes\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-relative-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-relative-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..207f262851adfcd6d751d3f33c3d528f44e8daae
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-relative-error.pbtxt
@@ -0,0 +1,198 @@
+path: "tensorflow.metrics.MeanRelativeError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanRelativeError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'normalizer\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-squared-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0ce4c959abc303ff77bd7b50738bc8dd30750cd7
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-squared-error.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.MeanSquaredError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanSquaredError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_squared_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-squared-logarithmic-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-squared-logarithmic-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6ad4089c2cb7d1eeeb796903e75731b675abdd5e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-squared-logarithmic-error.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.MeanSquaredLogarithmicError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanSquaredLogarithmicError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_squared_logarithmic_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-tensor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-tensor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..31dc0cc9d7a88b317faa0ba5df1d67a70e0955d8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean-tensor.pbtxt
@@ -0,0 +1,204 @@
+path: "tensorflow.metrics.MeanTensor"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanTensor\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "count"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "total"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean_tensor\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'values\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..794665197bfa453fd73405d1ca8b1986c1758257
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-mean.pbtxt
@@ -0,0 +1,197 @@
+path: "tensorflow.metrics.Mean"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'values\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-metric.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-metric.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d1826983476253122caf753c821a3ce331801856
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-metric.pbtxt
@@ -0,0 +1,195 @@
+path: "tensorflow.metrics.Metric"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-poisson.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-poisson.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..202739712fcf5471911158b8944203c36b80546b
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-poisson.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.Poisson"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Poisson\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'poisson\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-precision.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-precision.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2ebe442e8626a18729a1930ccad7a531c12553be
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-precision.pbtxt
@@ -0,0 +1,196 @@
+path: "tensorflow.metrics.Precision"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Precision\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'top_k\', \'class_id\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-recall.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-recall.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..74489f6446e533f89bc705a7657b0016abb4ab35
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-recall.pbtxt
@@ -0,0 +1,196 @@
+path: "tensorflow.metrics.Recall"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Recall\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'top_k\', \'class_id\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-root-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-root-mean-squared-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f408836b66fc3c0ccff64a625a58879b44b7498f
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-root-mean-squared-error.pbtxt
@@ -0,0 +1,198 @@
+path: "tensorflow.metrics.RootMeanSquaredError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.RootMeanSquaredError\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'root_mean_squared_error\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sensitivity-at-specificity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sensitivity-at-specificity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a6233b63a7161e52f4f40b306dd114e2aff5a2a7
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sensitivity-at-specificity.pbtxt
@@ -0,0 +1,197 @@
+path: "tensorflow.metrics.SensitivityAtSpecificity"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SensitivityAtSpecificity\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.SensitivitySpecificityBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'specificity\', \'num_thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'200\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-categorical-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7dfbf3f62e32b0a1ca6d8d698d0f563484927d53
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-categorical-accuracy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.SparseCategoricalAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SparseCategoricalAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'sparse_categorical_accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-categorical-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0cb0007b1bf7ce959c25413aa6097f5026b0267c
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-categorical-crossentropy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.SparseCategoricalCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SparseCategoricalCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'from_logits\', \'axis\'], varargs=None, keywords=None, defaults=[\'sparse_categorical_crossentropy\', \'None\', \'False\', \'-1\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-top-k-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-top-k-categorical-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d7e4344e43907e876020cf6fe58c0a997180a76a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sparse-top-k-categorical-accuracy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.SparseTopKCategoricalAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SparseTopKCategoricalAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'k\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'5\', \'sparse_top_k_categorical_accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-specificity-at-sensitivity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-specificity-at-sensitivity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a9dbf70f8f66debf98967b29690e087a473e57c6
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-specificity-at-sensitivity.pbtxt
@@ -0,0 +1,197 @@
+path: "tensorflow.metrics.SpecificityAtSensitivity"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SpecificityAtSensitivity\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.SensitivitySpecificityBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'sensitivity\', \'num_thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'200\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-squared-hinge.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-squared-hinge.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..76ecc8c41c68ddf84c185d12c446f7e4f83529c9
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-squared-hinge.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.SquaredHinge"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SquaredHinge\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'squared_hinge\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sum.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1642e1e62f2cbef1ffe2f3a01962e94c16030ca2
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-sum.pbtxt
@@ -0,0 +1,197 @@
+path: "tensorflow.metrics.Sum"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Sum\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'sum\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'values\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-top-k-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-top-k-categorical-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d47520fe113d2858d58bc1325a0fbbf74cb087b6
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-top-k-categorical-accuracy.pbtxt
@@ -0,0 +1,199 @@
+path: "tensorflow.metrics.TopKCategoricalAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.TopKCategoricalAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Reduce\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'k\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'5\', \'top_k_categorical_accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-true-negatives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-true-negatives.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e8b55f8a2eec45220435ad1fa0298e4684d00ad1
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-true-negatives.pbtxt
@@ -0,0 +1,197 @@
+path: "tensorflow.metrics.TrueNegatives"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.TrueNegatives\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.-true-positives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-true-positives.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9e7274d7e6171d3766555d8c6910de8599e568b1
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.-true-positives.pbtxt
@@ -0,0 +1,197 @@
+path: "tensorflow.metrics.TruePositives"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.TruePositives\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\', \'dtype\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4ae997c6bd4ec0c98631c439a01b085f7cfae8e4
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.metrics.pbtxt
@@ -0,0 +1,263 @@
+path: "tensorflow.metrics"
+tf_module {
+  member {
+    name: "AUC"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Accuracy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "BinaryAccuracy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "BinaryCrossentropy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CategoricalAccuracy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CategoricalCrossentropy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CategoricalHinge"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CosineSimilarity"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "FalseNegatives"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "FalsePositives"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Hinge"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "KLDivergence"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "LogCoshError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Mean"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanAbsoluteError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanAbsolutePercentageError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanIoU"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanRelativeError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanSquaredError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanSquaredLogarithmicError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanTensor"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Metric"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Poisson"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Precision"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Recall"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "RootMeanSquaredError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SensitivityAtSpecificity"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SparseCategoricalAccuracy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SparseCategoricalCrossentropy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SparseTopKCategoricalAccuracy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SpecificityAtSensitivity"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SquaredHinge"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Sum"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TopKCategoricalAccuracy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TrueNegatives"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TruePositives"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "KLD"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MAE"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MAPE"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MSE"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MSLE"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "binary_accuracy"
+    argspec: "args=[\'y_true\', \'y_pred\', \'threshold\'], varargs=None, keywords=None, defaults=[\'0.5\'], "
+  }
+  member_method {
+    name: "binary_crossentropy"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'False\', \'0\'], "
+  }
+  member_method {
+    name: "categorical_accuracy"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "categorical_crossentropy"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'label_smoothing\'], varargs=None, keywords=None, defaults=[\'False\', \'0\'], "
+  }
+  member_method {
+    name: "deserialize"
+    argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get"
+    argspec: "args=[\'identifier\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "hinge"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "kld"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "kullback_leibler_divergence"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mae"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mape"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mean_absolute_error"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mean_absolute_percentage_error"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mean_squared_error"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mean_squared_logarithmic_error"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mse"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "msle"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "poisson"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "serialize"
+    argspec: "args=[\'metric\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "sparse_categorical_accuracy"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "sparse_categorical_crossentropy"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\', \'axis\'], varargs=None, keywords=None, defaults=[\'False\', \'-1\'], "
+  }
+  member_method {
+    name: "sparse_top_k_categorical_accuracy"
+    argspec: "args=[\'y_true\', \'y_pred\', \'k\'], varargs=None, keywords=None, defaults=[\'5\'], "
+  }
+  member_method {
+    name: "squared_hinge"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "top_k_categorical_accuracy"
+    argspec: "args=[\'y_true\', \'y_pred\', \'k\'], varargs=None, keywords=None, defaults=[\'5\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nest.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nest.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..70bb6d760bc7a6c55bbdfd5c05cde4c08769786d
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nest.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.nest"
+tf_module {
+  member_method {
+    name: "assert_same_structure"
+    argspec: "args=[\'nest1\', \'nest2\', \'check_types\', \'expand_composites\'], varargs=None, keywords=None, defaults=[\'True\', \'False\'], "
+  }
+  member_method {
+    name: "flatten"
+    argspec: "args=[\'structure\', \'expand_composites\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "is_nested"
+    argspec: "args=[\'seq\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "map_structure"
+    argspec: "args=[\'func\'], varargs=structure, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "pack_sequence_as"
+    argspec: "args=[\'structure\', \'flat_sequence\', \'expand_composites\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.pbtxt
index c75c75f2ef7ca50cce15fe1dffb4d0de3f6815de..66424429d892514c5c4567c65fa083b0fd4faed6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.pbtxt
@@ -22,7 +22,15 @@ tf_module {
   }
   member_method {
     name: "avg_pool"
-    argspec: "args=[\'value\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "avg_pool1d"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NWC\', \'None\'], "
+  }
+  member_method {
+    name: "avg_pool2d"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
   }
   member_method {
     name: "avg_pool3d"
@@ -50,35 +58,31 @@ tf_module {
   }
   member_method {
     name: "conv1d"
-    argspec: "args=[\'input\', \'filters\', \'stride\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "conv2d"
-    argspec: "args=[\'input\', \'filters\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\', \'None\'], "
+    argspec: "args=[\'input\', \'filters\', \'stride\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NWC\', \'None\', \'None\'], "
   }
   member_method {
-    name: "conv2d_backprop_filter"
-    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\', \'None\'], "
+    name: "conv1d_transpose"
+    argspec: "args=[\'input\', \'filters\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'NWC\', \'None\', \'None\'], "
   }
   member_method {
-    name: "conv2d_backprop_input"
-    argspec: "args=[\'input_sizes\', \'filters\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\', \'None\'], "
+    name: "conv2d"
+    argspec: "args=[\'input\', \'filters\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\', \'None\'], "
   }
   member_method {
     name: "conv2d_transpose"
-    argspec: "args=[\'input\', \'filters\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'NHWC\', \'None\'], "
+    argspec: "args=[\'input\', \'filters\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'NHWC\', \'None\', \'None\'], "
   }
   member_method {
     name: "conv3d"
     argspec: "args=[\'input\', \'filters\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'None\', \'None\'], "
   }
   member_method {
-    name: "conv3d_backprop_filter"
-    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'[1, 1, 1, 1, 1]\', \'None\'], "
+    name: "conv3d_transpose"
+    argspec: "args=[\'input\', \'filters\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'NDHWC\', \'None\', \'None\'], "
   }
   member_method {
-    name: "conv3d_transpose"
-    argspec: "args=[\'input\', \'filters\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'NDHWC\', \'None\'], "
+    name: "conv_transpose"
+    argspec: "args=[\'input\', \'filters\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "convolution"
@@ -194,7 +198,15 @@ tf_module {
   }
   member_method {
     name: "max_pool"
-    argspec: "args=[\'value\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "max_pool1d"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NWC\', \'None\'], "
+  }
+  member_method {
+    name: "max_pool2d"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
   }
   member_method {
     name: "max_pool3d"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
index 9e52a4252619ffc19b287fc1818fa6f772847335..58d004b3d5d10332065216b4a816febb673a4853 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +14,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
index d3b68e4f2976912ed65ba7916284c951fda03b05..a9f7e85b1488dc49a52c3ac4d5a7ed55bf605ab5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
@@ -3,7 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -13,6 +13,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.pbtxt
index b1f687f52964e20a6dfa6f81f68e61d2a67513c9..953722467b5753ab5c138f402d0c6332c244688a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.pbtxt
@@ -4,10 +4,6 @@ tf_module {
     name: "DeviceWrapper"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "DropoutWrapper"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "LSTMStateTuple"
     mtype: "<type \'type\'>"
@@ -16,8 +12,4 @@ tf_module {
     name: "RNNCell"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "ResidualWrapper"
-    mtype: "<type \'type\'>"
-  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.ones_initializer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.ones_initializer.pbtxt
index 210b56242b27fe4a832cfe50a53626d716d8877e..b271db6a659108031aab42a397068a4a13967551 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.ones_initializer.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.ones_initializer.pbtxt
@@ -1,11 +1,10 @@
 path: "tensorflow.ones_initializer"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Ones\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Ones\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'dtype\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\"], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adadelta.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adadelta.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2b476fafa9a6e26c29d91e28ac2ee66b6e74f637
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adadelta.pbtxt
@@ -0,0 +1,71 @@
+path: "tensorflow.optimizers.Adadelta"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adadelta.Adadelta\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'learning_rate\', \'rho\', \'epsilon\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.95\', \'1e-07\', \'Adadelta\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_gradients"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adagrad.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adagrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..be2fedfe81f5e01bf86cdcfaccf19dbd1f367543
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adagrad.pbtxt
@@ -0,0 +1,71 @@
+path: "tensorflow.optimizers.Adagrad"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adagrad.Adagrad\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'learning_rate\', \'initial_accumulator_value\', \'epsilon\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.1\', \'1e-07\', \'Adagrad\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_gradients"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adam.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adam.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..919c433648ff5950d4ab0c0f2ff2295d33d1085c
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adam.pbtxt
@@ -0,0 +1,71 @@
+path: "tensorflow.optimizers.Adam"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adam.Adam\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'learning_rate\', \'beta_1\', \'beta_2\', \'epsilon\', \'amsgrad\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-07\', \'False\', \'Adam\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_gradients"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adamax.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adamax.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..67fce4f5c63c0dcd364a124929c9232de1887ae9
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-adamax.pbtxt
@@ -0,0 +1,71 @@
+path: "tensorflow.optimizers.Adamax"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.adamax.Adamax\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'learning_rate\', \'beta_1\', \'beta_2\', \'epsilon\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-07\', \'Adamax\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_gradients"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-ftrl.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-ftrl.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ee4c9ad25fc678652e33fd5423f228dd86b89816
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-ftrl.pbtxt
@@ -0,0 +1,71 @@
+path: "tensorflow.optimizers.Ftrl"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.ftrl.Ftrl\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'learning_rate\', \'learning_rate_power\', \'initial_accumulator_value\', \'l1_regularization_strength\', \'l2_regularization_strength\', \'name\', \'l2_shrinkage_regularization_strength\'], varargs=None, keywords=kwargs, defaults=[\'-0.5\', \'0.1\', \'0.0\', \'0.0\', \'Ftrl\', \'0.0\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_gradients"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-nadam.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-nadam.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..06363234ea68f192105295a1eec2b8487c4eb121
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-nadam.pbtxt
@@ -0,0 +1,71 @@
+path: "tensorflow.optimizers.Nadam"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.nadam.Nadam\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'learning_rate\', \'beta_1\', \'beta_2\', \'epsilon\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-07\', \'Nadam\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_gradients"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-optimizer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..041922bdfd121b5f161f2d1dea443fc32c592743
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-optimizer.pbtxt
@@ -0,0 +1,70 @@
+path: "tensorflow.optimizers.Optimizer"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_gradients"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-r-m-sprop.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-r-m-sprop.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5deef618248d608bf571a9548ea72f6837984a09
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-r-m-sprop.pbtxt
@@ -0,0 +1,71 @@
+path: "tensorflow.optimizers.RMSprop"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.rmsprop.RMSprop\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'learning_rate\', \'rho\', \'momentum\', \'epsilon\', \'centered\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.9\', \'0.0\', \'1e-07\', \'False\', \'RMSprop\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_gradients"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-s-g-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-s-g-d.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..381f72767b807bb495aeaa98c012e7fd97608c22
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.-s-g-d.pbtxt
@@ -0,0 +1,71 @@
+path: "tensorflow.optimizers.SGD"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.gradient_descent.SGD\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.optimizer_v2.OptimizerV2\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "iterations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'learning_rate\', \'momentum\', \'nesterov\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.001\', \'0.0\', \'False\', \'SGD\'], "
+  }
+  member_method {
+    name: "add_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\', \'initializer\'], varargs=None, keywords=None, defaults=[\'zeros\'], "
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'trainable\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'zeros\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply_gradients"
+    argspec: "args=[\'self\', \'grads_and_vars\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_gradients"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot"
+    argspec: "args=[\'self\', \'var\', \'slot_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_slot_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates"
+    argspec: "args=[\'self\', \'loss\', \'params\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "minimize"
+    argspec: "args=[\'self\', \'loss\', \'var_list\', \'grad_loss\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "variables"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2e03b42ac7f8b8400c6c09061e6d9d09f0ac3d9e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.pbtxt
@@ -0,0 +1,55 @@
+path: "tensorflow.optimizers"
+tf_module {
+  member {
+    name: "Adadelta"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Adagrad"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Adam"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Adamax"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Ftrl"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Nadam"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Optimizer"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "RMSprop"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SGD"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "schedules"
+    mtype: "<type \'module\'>"
+  }
+  member_method {
+    name: "deserialize"
+    argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get"
+    argspec: "args=[\'identifier\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "serialize"
+    argspec: "args=[\'optimizer\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-exponential-decay.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-exponential-decay.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f174d6cfc3ce20d1bf6d36e1cfff4d63d799a3d1
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-exponential-decay.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.optimizers.schedules.ExponentialDecay"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.ExponentialDecay\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initial_learning_rate\', \'decay_steps\', \'decay_rate\', \'staircase\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-inverse-time-decay.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-inverse-time-decay.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..851d21c5cc063c97a49e8dbe2611b2f903d216a4
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-inverse-time-decay.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.optimizers.schedules.InverseTimeDecay"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.InverseTimeDecay\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initial_learning_rate\', \'decay_steps\', \'decay_rate\', \'staircase\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.initializers.identity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-learning-rate-schedule.pbtxt
similarity index 52%
rename from tensorflow/tools/api/golden/v2/tensorflow.initializers.identity.pbtxt
rename to tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-learning-rate-schedule.pbtxt
index 37fcab95997bb7299675a387d08184fc1387eee1..36db36e4f432e8c0b87e306d3a35d3a0e5bdde0b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.initializers.identity.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-learning-rate-schedule.pbtxt
@@ -1,11 +1,9 @@
-path: "tensorflow.initializers.identity"
+path: "tensorflow.optimizers.schedules.LearningRateSchedule"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Identity\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'gain\', \'dtype\'], varargs=None, keywords=None, defaults=[\'1.0\', \"<dtype: \'float32\'>\"], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-piecewise-constant-decay.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-piecewise-constant-decay.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6613bedef5f638e4b7f3211827031d04b35cc5ad
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-piecewise-constant-decay.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.optimizers.schedules.PiecewiseConstantDecay"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.PiecewiseConstantDecay\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'boundaries\', \'values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-polynomial-decay.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-polynomial-decay.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fbd5bcef8f75e78c3bcaa87d82d45b2094bfed80
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.-polynomial-decay.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.optimizers.schedules.PolynomialDecay"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.PolynomialDecay\'>"
+  is_instance: "<class \'tensorflow.python.keras.optimizer_v2.learning_rate_schedule.LearningRateSchedule\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'initial_learning_rate\', \'decay_steps\', \'end_learning_rate\', \'power\', \'cycle\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0001\', \'1.0\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8e1e61b76628a5ef1638e5134a4687402d0e47c8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.optimizers.schedules.pbtxt
@@ -0,0 +1,31 @@
+path: "tensorflow.optimizers.schedules"
+tf_module {
+  member {
+    name: "ExponentialDecay"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "InverseTimeDecay"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "LearningRateSchedule"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "PiecewiseConstantDecay"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "PolynomialDecay"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "deserialize"
+    argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "serialize"
+    argspec: "args=[\'learning_rate_schedule\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
index 4432cae53b64b66e5a5c906f87af94f61bcf36bd..66489e68914341303eb1988a3ae9083f258cf3f2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
@@ -5,15 +5,11 @@ tf_module {
     mtype: "<type \'type\'>"
   }
   member {
-    name: "DType"
+    name: "CriticalSection"
     mtype: "<type \'type\'>"
   }
   member {
-    name: "Event"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "FIFOQueue"
+    name: "DType"
     mtype: "<type \'type\'>"
   }
   member {
@@ -29,24 +25,24 @@ tf_module {
     mtype: "<type \'type\'>"
   }
   member {
-    name: "Operation"
-    mtype: "<type \'type\'>"
+    name: "Module"
+    mtype: "<class \'tensorflow.python.module.module.ModuleMetaclass\'>"
   }
   member {
-    name: "RegisterGradient"
+    name: "Operation"
     mtype: "<type \'type\'>"
   }
   member {
-    name: "SparseTensor"
+    name: "RaggedTensor"
     mtype: "<type \'type\'>"
   }
   member {
-    name: "Summary"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+    name: "RegisterGradient"
+    mtype: "<type \'type\'>"
   }
   member {
-    name: "SummaryMetadata"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+    name: "SparseTensor"
+    mtype: "<type \'type\'>"
   }
   member {
     name: "Tensor"
@@ -80,6 +76,14 @@ tf_module {
     name: "VariableSynchronization"
     mtype: "<class \'enum.EnumMeta\'>"
   }
+  member {
+    name: "audio"
+    mtype: "<type \'module\'>"
+  }
+  member {
+    name: "autograph"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "bfloat16"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
@@ -104,6 +108,10 @@ tf_module {
     name: "complex64"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
   }
+  member {
+    name: "config"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "constant_initializer"
     mtype: "<type \'type\'>"
@@ -156,10 +164,6 @@ tf_module {
     name: "float64"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
   }
-  member {
-    name: "glorot_uniform_initializer"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "graph_util"
     mtype: "<type \'module\'>"
@@ -172,10 +176,6 @@ tf_module {
     name: "image"
     mtype: "<type \'module\'>"
   }
-  member {
-    name: "initializers"
-    mtype: "<type \'module\'>"
-  }
   member {
     name: "int16"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
@@ -216,10 +216,18 @@ tf_module {
     name: "math"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "metrics"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "name_scope"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "nest"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "newaxis"
     mtype: "<type \'NoneType\'>"
@@ -232,6 +240,10 @@ tf_module {
     name: "ones_initializer"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "optimizers"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "qint16"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
@@ -248,6 +260,10 @@ tf_module {
     name: "quantization"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "queue"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "quint16"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
@@ -256,6 +272,10 @@ tf_module {
     name: "quint8"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
   }
+  member {
+    name: "ragged"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "random"
     mtype: "<type \'module\'>"
@@ -268,10 +288,18 @@ tf_module {
     name: "random_uniform_initializer"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "raw_ops"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "resource"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
   }
+  member {
+    name: "rnn"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "saved_model"
     mtype: "<type \'module\'>"
@@ -312,10 +340,6 @@ tf_module {
     name: "train"
     mtype: "<type \'module\'>"
   }
-  member {
-    name: "truncated_normal_initializer"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "uint16"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
@@ -424,10 +448,6 @@ tf_module {
     name: "atanh"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "batch_gather"
-    argspec: "args=[\'params\', \'indices\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "batch_to_space"
     argspec: "args=[\'input\', \'block_shape\', \'crops\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -472,6 +492,10 @@ tf_module {
     name: "clip_by_value"
     argspec: "args=[\'t\', \'clip_value_min\', \'clip_value_max\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "combined_non_max_suppression"
+    argspec: "args=[\'boxes\', \'scores\', \'max_output_size_per_class\', \'max_total_size\', \'iou_threshold\', \'score_threshold\', \'pad_per_class\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
   member_method {
     name: "complex"
     argspec: "args=[\'real\', \'imag\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -516,10 +540,6 @@ tf_module {
     name: "device"
     argspec: "args=[\'device_name\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "div_no_nan"
-    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "divide"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -598,7 +618,7 @@ tf_module {
   }
   member_method {
     name: "gather"
-    argspec: "args=[\'params\', \'indices\', \'validate_indices\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
+    argspec: "args=[\'params\', \'indices\', \'validate_indices\', \'axis\', \'batch_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'0\', \'None\'], "
   }
   member_method {
     name: "gather_nd"
@@ -608,6 +628,10 @@ tf_module {
     name: "get_logger"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_static_value"
+    argspec: "args=[\'tensor\', \'partial\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
   member_method {
     name: "gradients"
     argspec: "args=[\'ys\', \'xs\', \'grad_ys\', \'name\', \'gate_gradients\', \'aggregation_method\', \'stop_gradients\', \'unconnected_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'gradients\', \'False\', \'None\', \'None\', \'UnconnectedGradients.NONE\'], "
@@ -656,6 +680,10 @@ tf_module {
     name: "init_scope"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "is_tensor"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "less"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -748,6 +776,10 @@ tf_module {
     name: "not_equal"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "numpy_function"
+    argspec: "args=[\'func\', \'inp\', \'Tout\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "one_hot"
     argspec: "args=[\'indices\', \'depth\', \'on_value\', \'off_value\', \'axis\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
@@ -968,10 +1000,6 @@ tf_module {
     name: "string_split"
     argspec: "args=[\'source\', \'delimiter\', \'skip_empty\'], varargs=None, keywords=None, defaults=[\' \', \'True\'], "
   }
-  member_method {
-    name: "substr"
-    argspec: "args=[\'input\', \'pos\', \'len\', \'name\', \'unit\'], varargs=None, keywords=None, defaults=[\'None\', \'BYTE\'], "
-  }
   member_method {
     name: "subtract"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.queue.-f-i-f-o-queue.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.queue.-f-i-f-o-queue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..724ab5fe8283de44b20b059042f8d6744b11da19
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.queue.-f-i-f-o-queue.pbtxt
@@ -0,0 +1,66 @@
+path: "tensorflow.queue.FIFOQueue"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.FIFOQueue\'>"
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.QueueBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dtypes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "names"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "queue_ref"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shapes"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'capacity\', \'dtypes\', \'shapes\', \'names\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'fifo_queue\'], "
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\', \'cancel_pending_enqueues\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "dequeue"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_many"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_up_to"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue_many"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_list"
+    argspec: "args=[\'index\', \'queues\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_closed"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "size"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.queue.-padding-f-i-f-o-queue.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.queue.-padding-f-i-f-o-queue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9ef0a4d9eb6bbfb69fddf3fe696e3f60ac3ef67b
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.queue.-padding-f-i-f-o-queue.pbtxt
@@ -0,0 +1,66 @@
+path: "tensorflow.queue.PaddingFIFOQueue"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.PaddingFIFOQueue\'>"
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.QueueBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dtypes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "names"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "queue_ref"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shapes"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'capacity\', \'dtypes\', \'shapes\', \'names\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'padding_fifo_queue\'], "
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\', \'cancel_pending_enqueues\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "dequeue"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_many"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_up_to"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue_many"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_list"
+    argspec: "args=[\'index\', \'queues\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_closed"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "size"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.queue.-priority-queue.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.queue.-priority-queue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bb66beb13af18501912fda85b9c3dc67cdf21683
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.queue.-priority-queue.pbtxt
@@ -0,0 +1,66 @@
+path: "tensorflow.queue.PriorityQueue"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.PriorityQueue\'>"
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.QueueBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dtypes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "names"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "queue_ref"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shapes"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'capacity\', \'types\', \'shapes\', \'names\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'priority_queue\'], "
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\', \'cancel_pending_enqueues\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "dequeue"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_many"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_up_to"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue_many"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_list"
+    argspec: "args=[\'index\', \'queues\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_closed"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "size"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.queue.-queue-base.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.queue.-queue-base.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8faaad22af6e0f920e26a44e1ebf294fc4b109c4
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.queue.-queue-base.pbtxt
@@ -0,0 +1,65 @@
+path: "tensorflow.queue.QueueBase"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.QueueBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dtypes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "names"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "queue_ref"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shapes"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'dtypes\', \'shapes\', \'names\', \'queue_ref\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\', \'cancel_pending_enqueues\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "dequeue"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_many"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_up_to"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue_many"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_list"
+    argspec: "args=[\'index\', \'queues\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_closed"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "size"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.queue.-random-shuffle-queue.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.queue.-random-shuffle-queue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..31cd503b13040b119d4028f813c94689f8e2ebb3
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.queue.-random-shuffle-queue.pbtxt
@@ -0,0 +1,66 @@
+path: "tensorflow.queue.RandomShuffleQueue"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.RandomShuffleQueue\'>"
+  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.QueueBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dtypes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "names"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "queue_ref"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shapes"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'capacity\', \'min_after_dequeue\', \'dtypes\', \'shapes\', \'names\', \'seed\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'random_shuffle_queue\'], "
+  }
+  member_method {
+    name: "close"
+    argspec: "args=[\'self\', \'cancel_pending_enqueues\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
+  member_method {
+    name: "dequeue"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_many"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "dequeue_up_to"
+    argspec: "args=[\'self\', \'n\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "enqueue_many"
+    argspec: "args=[\'self\', \'vals\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_list"
+    argspec: "args=[\'index\', \'queues\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_closed"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "size"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.queue.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.queue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c16e95e2116b703434ed91106eb29e4beb5668f2
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.queue.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.queue"
+tf_module {
+  member {
+    name: "FIFOQueue"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "PaddingFIFOQueue"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "PriorityQueue"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "QueueBase"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "RandomShuffleQueue"
+    mtype: "<type \'type\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.ragged.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.ragged.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5fde488ffdd4dc30695407b5eba097585c885f65
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.ragged.pbtxt
@@ -0,0 +1,23 @@
+path: "tensorflow.ragged"
+tf_module {
+  member_method {
+    name: "constant"
+    argspec: "args=[\'pylist\', \'dtype\', \'ragged_rank\', \'inner_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "map_flat_values"
+    argspec: "args=[\'op\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "range"
+    argspec: "args=[\'starts\', \'limits\', \'deltas\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "row_splits_to_segment_ids"
+    argspec: "args=[\'splits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "segment_ids_to_row_splits"
+    argspec: "args=[\'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.random_normal_initializer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.random_normal_initializer.pbtxt
index 5993fdeb9c232ebc4090d9fffd8857da8ca6ada4..b1dfc444113d28e6dad160be8786872dabc70cc8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.random_normal_initializer.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.random_normal_initializer.pbtxt
@@ -1,11 +1,11 @@
 path: "tensorflow.random_normal_initializer"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.RandomNormal\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.RandomNormal\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'mean\', \'stddev\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \'None\', \"<dtype: \'float32\'>\"], "
+    argspec: "args=[\'self\', \'mean\', \'stddev\', \'seed\'], varargs=None, keywords=None, defaults=[\'0.0\', \'0.05\', \'None\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.random_uniform_initializer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.random_uniform_initializer.pbtxt
index a434ed1599ef8b99b6e0496be388aa0e44755249..d16924a07a5f6fc11bb71f1786a691255e5c67e6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.random_uniform_initializer.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.random_uniform_initializer.pbtxt
@@ -1,11 +1,11 @@
 path: "tensorflow.random_uniform_initializer"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.RandomUniform\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.RandomUniform\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'minval\', \'maxval\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0\', \'None\', \'None\', \"<dtype: \'float32\'>\"], "
+    argspec: "args=[\'self\', \'minval\', \'maxval\', \'seed\'], varargs=None, keywords=None, defaults=[\'-0.05\', \'0.05\', \'None\'], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..080afef7645fb4cf7a700b2da062311fd14871f7
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt
@@ -0,0 +1,4067 @@
+path: "tensorflow.raw_ops"
+tf_module {
+  member_method {
+    name: "Abort"
+    argspec: "args=[\'error_msg\', \'exit_without_error\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Abs"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AccumulateNV2"
+    argspec: "args=[\'inputs\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AccumulatorApplyGradient"
+    argspec: "args=[\'handle\', \'local_step\', \'gradient\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AccumulatorNumAccumulated"
+    argspec: "args=[\'handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AccumulatorSetGlobalStep"
+    argspec: "args=[\'handle\', \'new_global_step\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AccumulatorTakeGradient"
+    argspec: "args=[\'handle\', \'num_required\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Acos"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Acosh"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Add"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AddManySparseToTensorsMap"
+    argspec: "args=[\'sparse_indices\', \'sparse_values\', \'sparse_shape\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AddN"
+    argspec: "args=[\'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AddSparseToTensorsMap"
+    argspec: "args=[\'sparse_indices\', \'sparse_values\', \'sparse_shape\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AddV2"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AdjustContrast"
+    argspec: "args=[\'images\', \'contrast_factor\', \'min_value\', \'max_value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AdjustContrastv2"
+    argspec: "args=[\'images\', \'contrast_factor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AdjustHue"
+    argspec: "args=[\'images\', \'delta\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AdjustSaturation"
+    argspec: "args=[\'images\', \'scale\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "All"
+    argspec: "args=[\'input\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AllCandidateSampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Angle"
+    argspec: "args=[\'input\', \'Tout\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AnonymousIterator"
+    argspec: "args=[\'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Any"
+    argspec: "args=[\'input\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ApplyAdaMax"
+    argspec: "args=[\'var\', \'m\', \'v\', \'beta1_power\', \'lr\', \'beta1\', \'beta2\', \'epsilon\', \'grad\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ApplyAdadelta"
+    argspec: "args=[\'var\', \'accum\', \'accum_update\', \'lr\', \'rho\', \'epsilon\', \'grad\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ApplyAdagrad"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'grad\', \'use_locking\', \'update_slots\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ApplyAdagradDA"
+    argspec: "args=[\'var\', \'gradient_accumulator\', \'gradient_squared_accumulator\', \'grad\', \'lr\', \'l1\', \'l2\', \'global_step\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ApplyAdam"
+    argspec: "args=[\'var\', \'m\', \'v\', \'beta1_power\', \'beta2_power\', \'lr\', \'beta1\', \'beta2\', \'epsilon\', \'grad\', \'use_locking\', \'use_nesterov\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ApplyAddSign"
+    argspec: "args=[\'var\', \'m\', \'lr\', \'alpha\', \'sign_decay\', \'beta\', \'grad\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ApplyCenteredRMSProp"
+    argspec: "args=[\'var\', \'mg\', \'ms\', \'mom\', \'lr\', \'rho\', \'momentum\', \'epsilon\', \'grad\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ApplyFtrl"
+    argspec: "args=[\'var\', \'accum\', \'linear\', \'grad\', \'lr\', \'l1\', \'l2\', \'lr_power\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ApplyFtrlV2"
+    argspec: "args=[\'var\', \'accum\', \'linear\', \'grad\', \'lr\', \'l1\', \'l2\', \'l2_shrinkage\', \'lr_power\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ApplyGradientDescent"
+    argspec: "args=[\'var\', \'alpha\', \'delta\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ApplyMomentum"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'grad\', \'momentum\', \'use_locking\', \'use_nesterov\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ApplyPowerSign"
+    argspec: "args=[\'var\', \'m\', \'lr\', \'logbase\', \'sign_decay\', \'beta\', \'grad\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ApplyProximalAdagrad"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'l1\', \'l2\', \'grad\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ApplyProximalGradientDescent"
+    argspec: "args=[\'var\', \'alpha\', \'l1\', \'l2\', \'delta\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ApplyRMSProp"
+    argspec: "args=[\'var\', \'ms\', \'mom\', \'lr\', \'rho\', \'momentum\', \'epsilon\', \'grad\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ApproximateEqual"
+    argspec: "args=[\'x\', \'y\', \'tolerance\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ArgMax"
+    argspec: "args=[\'input\', \'dimension\', \'output_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ArgMin"
+    argspec: "args=[\'input\', \'dimension\', \'output_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AsString"
+    argspec: "args=[\'input\', \'precision\', \'scientific\', \'shortest\', \'width\', \'fill\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Asin"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Asinh"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Assert"
+    argspec: "args=[\'condition\', \'data\', \'summarize\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Assign"
+    argspec: "args=[\'ref\', \'value\', \'validate_shape\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AssignAdd"
+    argspec: "args=[\'ref\', \'value\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AssignAddVariableOp"
+    argspec: "args=[\'resource\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AssignSub"
+    argspec: "args=[\'ref\', \'value\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AssignSubVariableOp"
+    argspec: "args=[\'resource\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AssignVariableOp"
+    argspec: "args=[\'resource\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Atan"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Atan2"
+    argspec: "args=[\'y\', \'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Atanh"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AudioSpectrogram"
+    argspec: "args=[\'input\', \'window_size\', \'stride\', \'magnitude_squared\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AudioSummary"
+    argspec: "args=[\'tag\', \'tensor\', \'sample_rate\', \'max_outputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AudioSummaryV2"
+    argspec: "args=[\'tag\', \'tensor\', \'sample_rate\', \'max_outputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AvgPool"
+    argspec: "args=[\'value\', \'ksize\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AvgPool3D"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AvgPool3DGrad"
+    argspec: "args=[\'orig_input_shape\', \'grad\', \'ksize\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "AvgPoolGrad"
+    argspec: "args=[\'orig_input_shape\', \'grad\', \'ksize\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Barrier"
+    argspec: "args=[\'component_types\', \'shapes\', \'capacity\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BarrierClose"
+    argspec: "args=[\'handle\', \'cancel_pending_enqueues\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BarrierIncompleteSize"
+    argspec: "args=[\'handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BarrierInsertMany"
+    argspec: "args=[\'handle\', \'keys\', \'values\', \'component_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BarrierReadySize"
+    argspec: "args=[\'handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BarrierTakeMany"
+    argspec: "args=[\'handle\', \'num_elements\', \'component_types\', \'allow_small_batch\', \'wait_for_incomplete\', \'timeout_ms\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchCholesky"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchCholeskyGrad"
+    argspec: "args=[\'l\', \'grad\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchDataset"
+    argspec: "args=[\'input_dataset\', \'batch_size\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchDatasetV2"
+    argspec: "args=[\'input_dataset\', \'batch_size\', \'drop_remainder\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchFFT"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchFFT2D"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchFFT3D"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchIFFT"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchIFFT2D"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchIFFT3D"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchMatMul"
+    argspec: "args=[\'x\', \'y\', \'adj_x\', \'adj_y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchMatrixBandPart"
+    argspec: "args=[\'input\', \'num_lower\', \'num_upper\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchMatrixDeterminant"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchMatrixDiag"
+    argspec: "args=[\'diagonal\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchMatrixDiagPart"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchMatrixInverse"
+    argspec: "args=[\'input\', \'adjoint\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchMatrixSetDiag"
+    argspec: "args=[\'input\', \'diagonal\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchMatrixSolve"
+    argspec: "args=[\'matrix\', \'rhs\', \'adjoint\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchMatrixSolveLs"
+    argspec: "args=[\'matrix\', \'rhs\', \'l2_regularizer\', \'fast\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchMatrixTriangularSolve"
+    argspec: "args=[\'matrix\', \'rhs\', \'lower\', \'adjoint\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchNormWithGlobalNormalization"
+    argspec: "args=[\'t\', \'m\', \'v\', \'beta\', \'gamma\', \'variance_epsilon\', \'scale_after_normalization\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchNormWithGlobalNormalizationGrad"
+    argspec: "args=[\'t\', \'m\', \'v\', \'gamma\', \'backprop\', \'variance_epsilon\', \'scale_after_normalization\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchSelfAdjointEig"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchSelfAdjointEigV2"
+    argspec: "args=[\'input\', \'compute_v\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchSvd"
+    argspec: "args=[\'input\', \'compute_uv\', \'full_matrices\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchToSpace"
+    argspec: "args=[\'input\', \'crops\', \'block_size\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BatchToSpaceND"
+    argspec: "args=[\'input\', \'block_shape\', \'crops\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BesselI0e"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BesselI1e"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Betainc"
+    argspec: "args=[\'a\', \'b\', \'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BiasAdd"
+    argspec: "args=[\'value\', \'bias\', \'data_format\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BiasAddGrad"
+    argspec: "args=[\'out_backprop\', \'data_format\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BiasAddV1"
+    argspec: "args=[\'value\', \'bias\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Bincount"
+    argspec: "args=[\'arr\', \'size\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Bitcast"
+    argspec: "args=[\'input\', \'type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BitwiseAnd"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BitwiseOr"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BitwiseXor"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BoostedTreesBucketize"
+    argspec: "args=[\'float_values\', \'bucket_boundaries\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BoostedTreesCalculateBestGainsPerFeature"
+    argspec: "args=[\'node_id_range\', \'stats_summary_list\', \'l1\', \'l2\', \'tree_complexity\', \'min_node_weight\', \'max_splits\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BoostedTreesCenterBias"
+    argspec: "args=[\'tree_ensemble_handle\', \'mean_gradients\', \'mean_hessians\', \'l1\', \'l2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BoostedTreesCreateEnsemble"
+    argspec: "args=[\'tree_ensemble_handle\', \'stamp_token\', \'tree_ensemble_serialized\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BoostedTreesCreateQuantileStreamResource"
+    argspec: "args=[\'quantile_stream_resource_handle\', \'epsilon\', \'num_streams\', \'max_elements\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BoostedTreesDeserializeEnsemble"
+    argspec: "args=[\'tree_ensemble_handle\', \'stamp_token\', \'tree_ensemble_serialized\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BoostedTreesEnsembleResourceHandleOp"
+    argspec: "args=[\'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BoostedTreesExampleDebugOutputs"
+    argspec: "args=[\'tree_ensemble_handle\', \'bucketized_features\', \'logits_dimension\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BoostedTreesGetEnsembleStates"
+    argspec: "args=[\'tree_ensemble_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BoostedTreesMakeQuantileSummaries"
+    argspec: "args=[\'float_values\', \'example_weights\', \'epsilon\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BoostedTreesMakeStatsSummary"
+    argspec: "args=[\'node_ids\', \'gradients\', \'hessians\', \'bucketized_features_list\', \'max_splits\', \'num_buckets\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BoostedTreesPredict"
+    argspec: "args=[\'tree_ensemble_handle\', \'bucketized_features\', \'logits_dimension\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BoostedTreesQuantileStreamResourceAddSummaries"
+    argspec: "args=[\'quantile_stream_resource_handle\', \'summaries\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BoostedTreesQuantileStreamResourceDeserialize"
+    argspec: "args=[\'quantile_stream_resource_handle\', \'bucket_boundaries\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BoostedTreesQuantileStreamResourceFlush"
+    argspec: "args=[\'quantile_stream_resource_handle\', \'num_buckets\', \'generate_quantiles\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BoostedTreesQuantileStreamResourceGetBucketBoundaries"
+    argspec: "args=[\'quantile_stream_resource_handle\', \'num_features\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BoostedTreesQuantileStreamResourceHandleOp"
+    argspec: "args=[\'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BoostedTreesSerializeEnsemble"
+    argspec: "args=[\'tree_ensemble_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BoostedTreesTrainingPredict"
+    argspec: "args=[\'tree_ensemble_handle\', \'cached_tree_ids\', \'cached_node_ids\', \'bucketized_features\', \'logits_dimension\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BoostedTreesUpdateEnsemble"
+    argspec: "args=[\'tree_ensemble_handle\', \'feature_ids\', \'node_ids\', \'gains\', \'thresholds\', \'left_node_contribs\', \'right_node_contribs\', \'max_depth\', \'learning_rate\', \'pruning_mode\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BroadcastArgs"
+    argspec: "args=[\'s0\', \'s1\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BroadcastGradientArgs"
+    argspec: "args=[\'s0\', \'s1\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "BroadcastTo"
+    argspec: "args=[\'input\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Bucketize"
+    argspec: "args=[\'input\', \'boundaries\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CTCBeamSearchDecoder"
+    argspec: "args=[\'inputs\', \'sequence_length\', \'beam_width\', \'top_paths\', \'merge_repeated\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CTCGreedyDecoder"
+    argspec: "args=[\'inputs\', \'sequence_length\', \'merge_repeated\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CTCLoss"
+    argspec: "args=[\'inputs\', \'labels_indices\', \'labels_values\', \'sequence_length\', \'preprocess_collapse_repeated\', \'ctc_merge_repeated\', \'ignore_longer_outputs_than_inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CacheDataset"
+    argspec: "args=[\'input_dataset\', \'filename\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Case"
+    argspec: "args=[\'branch_index\', \'input\', \'Tout\', \'branches\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Cast"
+    argspec: "args=[\'x\', \'DstT\', \'Truncate\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Ceil"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CheckNumerics"
+    argspec: "args=[\'tensor\', \'message\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Cholesky"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CholeskyGrad"
+    argspec: "args=[\'l\', \'grad\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ClipByValue"
+    argspec: "args=[\'t\', \'clip_value_min\', \'clip_value_max\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CloseSummaryWriter"
+    argspec: "args=[\'writer\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CollectiveBcastRecv"
+    argspec: "args=[\'T\', \'group_size\', \'group_key\', \'instance_key\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CollectiveBcastSend"
+    argspec: "args=[\'input\', \'group_size\', \'group_key\', \'instance_key\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CollectiveGather"
+    argspec: "args=[\'input\', \'group_size\', \'group_key\', \'instance_key\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CollectiveReduce"
+    argspec: "args=[\'input\', \'group_size\', \'group_key\', \'instance_key\', \'merge_op\', \'final_op\', \'subdiv_offsets\', \'wait_for\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CombinedNonMaxSuppression"
+    argspec: "args=[\'boxes\', \'scores\', \'max_output_size_per_class\', \'max_total_size\', \'iou_threshold\', \'score_threshold\', \'pad_per_class\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CompareAndBitpack"
+    argspec: "args=[\'input\', \'threshold\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Complex"
+    argspec: "args=[\'real\', \'imag\', \'Tout\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ComplexAbs"
+    argspec: "args=[\'x\', \'Tout\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ComputeAccidentalHits"
+    argspec: "args=[\'true_classes\', \'sampled_candidates\', \'num_true\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Concat"
+    argspec: "args=[\'concat_dim\', \'values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ConcatOffset"
+    argspec: "args=[\'concat_dim\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ConcatV2"
+    argspec: "args=[\'values\', \'axis\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ConcatenateDataset"
+    argspec: "args=[\'input_dataset\', \'another_dataset\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ConditionalAccumulator"
+    argspec: "args=[\'dtype\', \'shape\', \'container\', \'shared_name\', \'reduction_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Conj"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ConjugateTranspose"
+    argspec: "args=[\'x\', \'perm\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Const"
+    argspec: "args=[\'value\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ConsumeMutexLock"
+    argspec: "args=[\'mutex_lock\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ControlTrigger"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Conv2D"
+    argspec: "args=[\'input\', \'filter\', \'strides\', \'use_cudnn_on_gpu\', \'padding\', \'explicit_paddings\', \'data_format\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Conv2DBackpropFilter"
+    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'use_cudnn_on_gpu\', \'padding\', \'explicit_paddings\', \'data_format\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Conv2DBackpropInput"
+    argspec: "args=[\'input_sizes\', \'filter\', \'out_backprop\', \'strides\', \'use_cudnn_on_gpu\', \'padding\', \'explicit_paddings\', \'data_format\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Conv3D"
+    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Conv3DBackpropFilter"
+    argspec: "args=[\'input\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Conv3DBackpropFilterV2"
+    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Conv3DBackpropInput"
+    argspec: "args=[\'input\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Conv3DBackpropInputV2"
+    argspec: "args=[\'input_sizes\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Cos"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Cosh"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CountUpTo"
+    argspec: "args=[\'ref\', \'limit\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CreateSummaryDbWriter"
+    argspec: "args=[\'writer\', \'db_uri\', \'experiment_name\', \'run_name\', \'user_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CreateSummaryFileWriter"
+    argspec: "args=[\'writer\', \'logdir\', \'max_queue\', \'flush_millis\', \'filename_suffix\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CropAndResize"
+    argspec: "args=[\'image\', \'boxes\', \'box_ind\', \'crop_size\', \'method\', \'extrapolation_value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CropAndResizeGradBoxes"
+    argspec: "args=[\'grads\', \'image\', \'boxes\', \'box_ind\', \'method\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CropAndResizeGradImage"
+    argspec: "args=[\'grads\', \'boxes\', \'box_ind\', \'image_size\', \'T\', \'method\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Cross"
+    argspec: "args=[\'a\', \'b\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CudnnRNN"
+    argspec: "args=[\'input\', \'input_h\', \'input_c\', \'params\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'is_training\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CudnnRNNBackprop"
+    argspec: "args=[\'input\', \'input_h\', \'input_c\', \'params\', \'output\', \'output_h\', \'output_c\', \'output_backprop\', \'output_h_backprop\', \'output_c_backprop\', \'reserve_space\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CudnnRNNBackpropV2"
+    argspec: "args=[\'input\', \'input_h\', \'input_c\', \'params\', \'output\', \'output_h\', \'output_c\', \'output_backprop\', \'output_h_backprop\', \'output_c_backprop\', \'reserve_space\', \'host_reserved\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CudnnRNNBackpropV3"
+    argspec: "args=[\'input\', \'input_h\', \'input_c\', \'params\', \'sequence_lengths\', \'output\', \'output_h\', \'output_c\', \'output_backprop\', \'output_h_backprop\', \'output_c_backprop\', \'reserve_space\', \'host_reserved\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CudnnRNNCanonicalToParams"
+    argspec: "args=[\'num_layers\', \'num_units\', \'input_size\', \'weights\', \'biases\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CudnnRNNParamsSize"
+    argspec: "args=[\'num_layers\', \'num_units\', \'input_size\', \'T\', \'S\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CudnnRNNParamsToCanonical"
+    argspec: "args=[\'num_layers\', \'num_units\', \'input_size\', \'params\', \'num_params\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CudnnRNNV2"
+    argspec: "args=[\'input\', \'input_h\', \'input_c\', \'params\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'is_training\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "CudnnRNNV3"
+    argspec: "args=[\'input\', \'input_h\', \'input_c\', \'params\', \'sequence_lengths\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'is_training\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Cumprod"
+    argspec: "args=[\'x\', \'axis\', \'exclusive\', \'reverse\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Cumsum"
+    argspec: "args=[\'x\', \'axis\', \'exclusive\', \'reverse\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DataFormatDimMap"
+    argspec: "args=[\'x\', \'src_format\', \'dst_format\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DataFormatVecPermute"
+    argspec: "args=[\'x\', \'src_format\', \'dst_format\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DatasetToGraph"
+    argspec: "args=[\'input_dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DatasetToSingleElement"
+    argspec: "args=[\'dataset\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DebugGradientIdentity"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DebugGradientRefIdentity"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DecodeAndCropJpeg"
+    argspec: "args=[\'contents\', \'crop_window\', \'channels\', \'ratio\', \'fancy_upscaling\', \'try_recover_truncated\', \'acceptable_fraction\', \'dct_method\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DecodeBase64"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DecodeBmp"
+    argspec: "args=[\'contents\', \'channels\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DecodeCSV"
+    argspec: "args=[\'records\', \'record_defaults\', \'field_delim\', \'use_quote_delim\', \'na_value\', \'select_cols\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DecodeCompressed"
+    argspec: "args=[\'bytes\', \'compression_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DecodeGif"
+    argspec: "args=[\'contents\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DecodeJSONExample"
+    argspec: "args=[\'json_examples\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DecodeJpeg"
+    argspec: "args=[\'contents\', \'channels\', \'ratio\', \'fancy_upscaling\', \'try_recover_truncated\', \'acceptable_fraction\', \'dct_method\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DecodePng"
+    argspec: "args=[\'contents\', \'channels\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DecodeRaw"
+    argspec: "args=[\'bytes\', \'out_type\', \'little_endian\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DecodeWav"
+    argspec: "args=[\'contents\', \'desired_channels\', \'desired_samples\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DeepCopy"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DeleteSessionTensor"
+    argspec: "args=[\'handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DenseToDenseSetOperation"
+    argspec: "args=[\'set1\', \'set2\', \'set_operation\', \'validate_indices\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DenseToSparseSetOperation"
+    argspec: "args=[\'set1\', \'set2_indices\', \'set2_values\', \'set2_shape\', \'set_operation\', \'validate_indices\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DepthToSpace"
+    argspec: "args=[\'input\', \'block_size\', \'data_format\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DepthwiseConv2dNative"
+    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DepthwiseConv2dNativeBackpropFilter"
+    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DepthwiseConv2dNativeBackpropInput"
+    argspec: "args=[\'input_sizes\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Dequantize"
+    argspec: "args=[\'input\', \'min_range\', \'max_range\', \'mode\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DeserializeIterator"
+    argspec: "args=[\'resource_handle\', \'serialized\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DeserializeManySparse"
+    argspec: "args=[\'serialized_sparse\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DeserializeSparse"
+    argspec: "args=[\'serialized_sparse\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DestroyResourceOp"
+    argspec: "args=[\'resource\', \'ignore_lookup_error\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DestroyTemporaryVariable"
+    argspec: "args=[\'ref\', \'var_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Diag"
+    argspec: "args=[\'diagonal\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DiagPart"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Digamma"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Dilation2D"
+    argspec: "args=[\'input\', \'filter\', \'strides\', \'rates\', \'padding\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Dilation2DBackpropFilter"
+    argspec: "args=[\'input\', \'filter\', \'out_backprop\', \'strides\', \'rates\', \'padding\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Dilation2DBackpropInput"
+    argspec: "args=[\'input\', \'filter\', \'out_backprop\', \'strides\', \'rates\', \'padding\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Div"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DivNoNan"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DrawBoundingBoxes"
+    argspec: "args=[\'images\', \'boxes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DynamicPartition"
+    argspec: "args=[\'data\', \'partitions\', \'num_partitions\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "DynamicStitch"
+    argspec: "args=[\'indices\', \'data\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "EagerPyFunc"
+    argspec: "args=[\'input\', \'token\', \'Tout\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "EditDistance"
+    argspec: "args=[\'hypothesis_indices\', \'hypothesis_values\', \'hypothesis_shape\', \'truth_indices\', \'truth_values\', \'truth_shape\', \'normalize\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Elu"
+    argspec: "args=[\'features\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "EluGrad"
+    argspec: "args=[\'gradients\', \'outputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Empty"
+    argspec: "args=[\'shape\', \'dtype\', \'init\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "EmptyTensorList"
+    argspec: "args=[\'element_shape\', \'max_num_elements\', \'element_dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "EncodeBase64"
+    argspec: "args=[\'input\', \'pad\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "EncodeJpeg"
+    argspec: "args=[\'image\', \'format\', \'quality\', \'progressive\', \'optimize_size\', \'chroma_downsampling\', \'density_unit\', \'x_density\', \'y_density\', \'xmp_metadata\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "EncodePng"
+    argspec: "args=[\'image\', \'compression\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "EncodeWav"
+    argspec: "args=[\'audio\', \'sample_rate\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "EnsureShape"
+    argspec: "args=[\'input\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Enter"
+    argspec: "args=[\'data\', \'frame_name\', \'is_constant\', \'parallel_iterations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Equal"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Erf"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Erfc"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "EuclideanNorm"
+    argspec: "args=[\'input\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Exit"
+    argspec: "args=[\'data\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Exp"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExpandDims"
+    argspec: "args=[\'input\', \'dim\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalAssertNextDataset"
+    argspec: "args=[\'input_dataset\', \'transformations\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalBytesProducedStatsDataset"
+    argspec: "args=[\'input_dataset\', \'tag\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalCSVDataset"
+    argspec: "args=[\'filenames\', \'compression_type\', \'buffer_size\', \'header\', \'field_delim\', \'use_quote_delim\', \'na_value\', \'select_cols\', \'record_defaults\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalChooseFastestDataset"
+    argspec: "args=[\'input_datasets\', \'num_experiments\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalDatasetCardinality"
+    argspec: "args=[\'input_dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalDatasetToTFRecord"
+    argspec: "args=[\'input_dataset\', \'filename\', \'compression_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalDenseToSparseBatchDataset"
+    argspec: "args=[\'input_dataset\', \'batch_size\', \'row_shape\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalDirectedInterleaveDataset"
+    argspec: "args=[\'selector_input_dataset\', \'data_input_datasets\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalGroupByReducerDataset"
+    argspec: "args=[\'input_dataset\', \'key_func_other_arguments\', \'init_func_other_arguments\', \'reduce_func_other_arguments\', \'finalize_func_other_arguments\', \'key_func\', \'init_func\', \'reduce_func\', \'finalize_func\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalGroupByWindowDataset"
+    argspec: "args=[\'input_dataset\', \'key_func_other_arguments\', \'reduce_func_other_arguments\', \'window_size_func_other_arguments\', \'key_func\', \'reduce_func\', \'window_size_func\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalIdentityIndexedDataset"
+    argspec: "args=[\'size\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalIgnoreErrorsDataset"
+    argspec: "args=[\'input_dataset\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalIndexedDatasetGet"
+    argspec: "args=[\'materialized\', \'index\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalIndexedDatasetMaterialize"
+    argspec: "args=[\'dataset\', \'materialized\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalIteratorGetDevice"
+    argspec: "args=[\'resource\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalLMDBDataset"
+    argspec: "args=[\'filenames\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalLatencyStatsDataset"
+    argspec: "args=[\'input_dataset\', \'tag\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalMapAndBatchDataset"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'batch_size\', \'num_parallel_calls\', \'drop_remainder\', \'f\', \'output_types\', \'output_shapes\', \'preserve_cardinality\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalMapDataset"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'f\', \'output_types\', \'output_shapes\', \'use_inter_op_parallelism\', \'preserve_cardinality\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalMatchingFilesDataset"
+    argspec: "args=[\'patterns\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalMaterializedIndexDatasetHandle"
+    argspec: "args=[\'container\', \'shared_name\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalMaxIntraOpParallelismDataset"
+    argspec: "args=[\'input_dataset\', \'max_intra_op_parallelism\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalNonSerializableDataset"
+    argspec: "args=[\'input_dataset\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalNumaMapAndBatchDataset"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'batch_size\', \'num_parallel_calls\', \'drop_remainder\', \'f\', \'output_types\', \'output_shapes\', \'preserve_cardinality\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalParallelInterleaveDataset"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'cycle_length\', \'block_length\', \'sloppy\', \'buffer_output_elements\', \'prefetch_input_elements\', \'f\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalParseExampleDataset"
+    argspec: "args=[\'input_dataset\', \'num_parallel_calls\', \'dense_defaults\', \'sparse_keys\', \'dense_keys\', \'sparse_types\', \'dense_shapes\', \'output_types\', \'output_shapes\', \'sloppy\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalPrivateThreadPoolDataset"
+    argspec: "args=[\'input_dataset\', \'num_threads\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalRandomDataset"
+    argspec: "args=[\'seed\', \'seed2\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalRebatchDataset"
+    argspec: "args=[\'input_dataset\', \'num_workers\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalScanDataset"
+    argspec: "args=[\'input_dataset\', \'initial_state\', \'other_arguments\', \'f\', \'output_types\', \'output_shapes\', \'preserve_cardinality\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalSetStatsAggregatorDataset"
+    argspec: "args=[\'input_dataset\', \'stats_aggregator\', \'tag\', \'counter_prefix\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalSleepDataset"
+    argspec: "args=[\'input_dataset\', \'sleep_microseconds\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalSlidingWindowDataset"
+    argspec: "args=[\'input_dataset\', \'window_size\', \'window_shift\', \'window_stride\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalSqlDataset"
+    argspec: "args=[\'driver_name\', \'data_source_name\', \'query\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalStatsAggregatorHandle"
+    argspec: "args=[\'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalStatsAggregatorSummary"
+    argspec: "args=[\'iterator\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalTakeWhileDataset"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'predicate\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalThreadPoolDataset"
+    argspec: "args=[\'input_dataset\', \'thread_pool\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalThreadPoolHandle"
+    argspec: "args=[\'num_threads\', \'max_intra_op_parallelism\', \'display_name\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalUnbatchDataset"
+    argspec: "args=[\'input_dataset\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExperimentalUniqueDataset"
+    argspec: "args=[\'input_dataset\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Expm1"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExtractGlimpse"
+    argspec: "args=[\'input\', \'size\', \'offsets\', \'centered\', \'normalized\', \'uniform_noise\', \'noise\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExtractImagePatches"
+    argspec: "args=[\'images\', \'ksizes\', \'strides\', \'rates\', \'padding\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExtractJpegShape"
+    argspec: "args=[\'contents\', \'output_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ExtractVolumePatches"
+    argspec: "args=[\'input\', \'ksizes\', \'strides\', \'padding\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FFT"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FFT2D"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FFT3D"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FIFOQueue"
+    argspec: "args=[\'component_types\', \'shapes\', \'capacity\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FIFOQueueV2"
+    argspec: "args=[\'component_types\', \'shapes\', \'capacity\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Fact"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FakeParam"
+    argspec: "args=[\'dtype\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FakeQuantWithMinMaxArgs"
+    argspec: "args=[\'inputs\', \'min\', \'max\', \'num_bits\', \'narrow_range\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FakeQuantWithMinMaxArgsGradient"
+    argspec: "args=[\'gradients\', \'inputs\', \'min\', \'max\', \'num_bits\', \'narrow_range\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FakeQuantWithMinMaxVars"
+    argspec: "args=[\'inputs\', \'min\', \'max\', \'num_bits\', \'narrow_range\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FakeQuantWithMinMaxVarsGradient"
+    argspec: "args=[\'gradients\', \'inputs\', \'min\', \'max\', \'num_bits\', \'narrow_range\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FakeQuantWithMinMaxVarsPerChannel"
+    argspec: "args=[\'inputs\', \'min\', \'max\', \'num_bits\', \'narrow_range\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FakeQuantWithMinMaxVarsPerChannelGradient"
+    argspec: "args=[\'gradients\', \'inputs\', \'min\', \'max\', \'num_bits\', \'narrow_range\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FakeQueue"
+    argspec: "args=[\'resource\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Fill"
+    argspec: "args=[\'dims\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FilterByLastComponentDataset"
+    argspec: "args=[\'input_dataset\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FilterDataset"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'predicate\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FixedLengthRecordDataset"
+    argspec: "args=[\'filenames\', \'header_bytes\', \'record_bytes\', \'footer_bytes\', \'buffer_size\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FixedLengthRecordDatasetV2"
+    argspec: "args=[\'filenames\', \'header_bytes\', \'record_bytes\', \'footer_bytes\', \'buffer_size\', \'compression_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FixedLengthRecordReader"
+    argspec: "args=[\'header_bytes\', \'record_bytes\', \'footer_bytes\', \'hop_bytes\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FixedLengthRecordReaderV2"
+    argspec: "args=[\'header_bytes\', \'record_bytes\', \'footer_bytes\', \'hop_bytes\', \'container\', \'shared_name\', \'encoding\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FixedUnigramCandidateSampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'vocab_file\', \'distortion\', \'num_reserved_ids\', \'num_shards\', \'shard\', \'unigrams\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FlatMapDataset"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'f\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Floor"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FloorDiv"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FloorMod"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FlushSummaryWriter"
+    argspec: "args=[\'writer\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "For"
+    argspec: "args=[\'start\', \'limit\', \'delta\', \'input\', \'body\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FractionalAvgPool"
+    argspec: "args=[\'value\', \'pooling_ratio\', \'pseudo_random\', \'overlapping\', \'deterministic\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FractionalAvgPoolGrad"
+    argspec: "args=[\'orig_input_tensor_shape\', \'out_backprop\', \'row_pooling_sequence\', \'col_pooling_sequence\', \'overlapping\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FractionalMaxPool"
+    argspec: "args=[\'value\', \'pooling_ratio\', \'pseudo_random\', \'overlapping\', \'deterministic\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FractionalMaxPoolGrad"
+    argspec: "args=[\'orig_input\', \'orig_output\', \'out_backprop\', \'row_pooling_sequence\', \'col_pooling_sequence\', \'overlapping\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FusedBatchNorm"
+    argspec: "args=[\'x\', \'scale\', \'offset\', \'mean\', \'variance\', \'epsilon\', \'data_format\', \'is_training\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FusedBatchNormGrad"
+    argspec: "args=[\'y_backprop\', \'x\', \'scale\', \'reserve_space_1\', \'reserve_space_2\', \'epsilon\', \'data_format\', \'is_training\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FusedBatchNormGradV2"
+    argspec: "args=[\'y_backprop\', \'x\', \'scale\', \'reserve_space_1\', \'reserve_space_2\', \'epsilon\', \'data_format\', \'is_training\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FusedBatchNormV2"
+    argspec: "args=[\'x\', \'scale\', \'offset\', \'mean\', \'variance\', \'epsilon\', \'data_format\', \'is_training\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FusedPadConv2D"
+    argspec: "args=[\'input\', \'paddings\', \'filter\', \'mode\', \'strides\', \'padding\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "FusedResizeAndPadConv2D"
+    argspec: "args=[\'input\', \'size\', \'paddings\', \'filter\', \'resize_align_corners\', \'mode\', \'strides\', \'padding\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Gather"
+    argspec: "args=[\'params\', \'indices\', \'validate_indices\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "GatherNd"
+    argspec: "args=[\'params\', \'indices\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "GatherV2"
+    argspec: "args=[\'params\', \'indices\', \'axis\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "GenerateVocabRemapping"
+    argspec: "args=[\'new_vocab_file\', \'old_vocab_file\', \'new_vocab_offset\', \'num_new_vocab\', \'old_vocab_size\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "GeneratorDataset"
+    argspec: "args=[\'init_func_other_args\', \'next_func_other_args\', \'finalize_func_other_args\', \'init_func\', \'next_func\', \'finalize_func\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "GetSessionHandle"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "GetSessionHandleV2"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "GetSessionTensor"
+    argspec: "args=[\'handle\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Greater"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "GreaterEqual"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "GuaranteeConst"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "HSVToRGB"
+    argspec: "args=[\'images\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "HashTable"
+    argspec: "args=[\'container\', \'shared_name\', \'use_node_name_sharing\', \'key_dtype\', \'value_dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "HashTableV2"
+    argspec: "args=[\'container\', \'shared_name\', \'use_node_name_sharing\', \'key_dtype\', \'value_dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "HistogramFixedWidth"
+    argspec: "args=[\'values\', \'value_range\', \'nbins\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "HistogramSummary"
+    argspec: "args=[\'tag\', \'values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IFFT"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IFFT2D"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IFFT3D"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IRFFT"
+    argspec: "args=[\'input\', \'fft_length\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IRFFT2D"
+    argspec: "args=[\'input\', \'fft_length\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IRFFT3D"
+    argspec: "args=[\'input\', \'fft_length\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Identity"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IdentityN"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IdentityReader"
+    argspec: "args=[\'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IdentityReaderV2"
+    argspec: "args=[\'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "If"
+    argspec: "args=[\'cond\', \'input\', \'Tout\', \'then_branch\', \'else_branch\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Igamma"
+    argspec: "args=[\'a\', \'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IgammaGradA"
+    argspec: "args=[\'a\', \'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Igammac"
+    argspec: "args=[\'a\', \'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Imag"
+    argspec: "args=[\'input\', \'Tout\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ImageSummary"
+    argspec: "args=[\'tag\', \'tensor\', \'max_images\', \'bad_color\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ImmutableConst"
+    argspec: "args=[\'dtype\', \'shape\', \'memory_region_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ImportEvent"
+    argspec: "args=[\'writer\', \'event\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "InTopK"
+    argspec: "args=[\'predictions\', \'targets\', \'k\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "InTopKV2"
+    argspec: "args=[\'predictions\', \'targets\', \'k\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "InitializeTable"
+    argspec: "args=[\'table_handle\', \'keys\', \'values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "InitializeTableFromTextFile"
+    argspec: "args=[\'table_handle\', \'filename\', \'key_index\', \'value_index\', \'vocab_size\', \'delimiter\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "InitializeTableFromTextFileV2"
+    argspec: "args=[\'table_handle\', \'filename\', \'key_index\', \'value_index\', \'vocab_size\', \'delimiter\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "InitializeTableV2"
+    argspec: "args=[\'table_handle\', \'keys\', \'values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "InplaceAdd"
+    argspec: "args=[\'x\', \'i\', \'v\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "InplaceSub"
+    argspec: "args=[\'x\', \'i\', \'v\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "InplaceUpdate"
+    argspec: "args=[\'x\', \'i\', \'v\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "InterleaveDataset"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'cycle_length\', \'block_length\', \'f\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Inv"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "InvGrad"
+    argspec: "args=[\'y\', \'dy\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Invert"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "InvertPermutation"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IsBoostedTreesEnsembleInitialized"
+    argspec: "args=[\'tree_ensemble_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IsBoostedTreesQuantileStreamResourceInitialized"
+    argspec: "args=[\'quantile_stream_resource_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IsFinite"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IsInf"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IsNan"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IsVariableInitialized"
+    argspec: "args=[\'ref\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Iterator"
+    argspec: "args=[\'shared_name\', \'container\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IteratorFromStringHandle"
+    argspec: "args=[\'string_handle\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IteratorFromStringHandleV2"
+    argspec: "args=[\'string_handle\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IteratorGetNext"
+    argspec: "args=[\'iterator\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IteratorGetNextAsOptional"
+    argspec: "args=[\'iterator\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IteratorGetNextSync"
+    argspec: "args=[\'iterator\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IteratorToStringHandle"
+    argspec: "args=[\'resource_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "IteratorV2"
+    argspec: "args=[\'shared_name\', \'container\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "L2Loss"
+    argspec: "args=[\'t\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LMDBReader"
+    argspec: "args=[\'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LRN"
+    argspec: "args=[\'input\', \'depth_radius\', \'bias\', \'alpha\', \'beta\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LRNGrad"
+    argspec: "args=[\'input_grads\', \'input_image\', \'output_image\', \'depth_radius\', \'bias\', \'alpha\', \'beta\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LeakyRelu"
+    argspec: "args=[\'features\', \'alpha\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LeakyReluGrad"
+    argspec: "args=[\'gradients\', \'features\', \'alpha\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LearnedUnigramCandidateSampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LeftShift"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Less"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LessEqual"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Lgamma"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LinSpace"
+    argspec: "args=[\'start\', \'stop\', \'num\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ListDiff"
+    argspec: "args=[\'x\', \'y\', \'out_idx\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LoadAndRemapMatrix"
+    argspec: "args=[\'ckpt_path\', \'old_tensor_name\', \'row_remapping\', \'col_remapping\', \'initializing_values\', \'num_rows\', \'num_cols\', \'max_rows_in_memory\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Log"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Log1p"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LogMatrixDeterminant"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LogSoftmax"
+    argspec: "args=[\'logits\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LogUniformCandidateSampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LogicalAnd"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LogicalNot"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LogicalOr"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LookupTableExport"
+    argspec: "args=[\'table_handle\', \'Tkeys\', \'Tvalues\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LookupTableExportV2"
+    argspec: "args=[\'table_handle\', \'Tkeys\', \'Tvalues\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LookupTableFind"
+    argspec: "args=[\'table_handle\', \'keys\', \'default_value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LookupTableFindV2"
+    argspec: "args=[\'table_handle\', \'keys\', \'default_value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LookupTableImport"
+    argspec: "args=[\'table_handle\', \'keys\', \'values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LookupTableImportV2"
+    argspec: "args=[\'table_handle\', \'keys\', \'values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LookupTableInsert"
+    argspec: "args=[\'table_handle\', \'keys\', \'values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LookupTableInsertV2"
+    argspec: "args=[\'table_handle\', \'keys\', \'values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LookupTableRemoveV2"
+    argspec: "args=[\'table_handle\', \'keys\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LookupTableSize"
+    argspec: "args=[\'table_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LookupTableSizeV2"
+    argspec: "args=[\'table_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LoopCond"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "LowerBound"
+    argspec: "args=[\'sorted_inputs\', \'values\', \'out_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Lu"
+    argspec: "args=[\'input\', \'output_idx_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MakeIterator"
+    argspec: "args=[\'dataset\', \'iterator\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MapClear"
+    argspec: "args=[\'capacity\', \'memory_limit\', \'dtypes\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MapDataset"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'f\', \'output_types\', \'output_shapes\', \'use_inter_op_parallelism\', \'preserve_cardinality\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MapDefun"
+    argspec: "args=[\'arguments\', \'captured_inputs\', \'output_types\', \'output_shapes\', \'f\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MapIncompleteSize"
+    argspec: "args=[\'capacity\', \'memory_limit\', \'dtypes\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MapPeek"
+    argspec: "args=[\'key\', \'indices\', \'capacity\', \'memory_limit\', \'dtypes\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MapSize"
+    argspec: "args=[\'capacity\', \'memory_limit\', \'dtypes\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MapStage"
+    argspec: "args=[\'key\', \'indices\', \'values\', \'capacity\', \'memory_limit\', \'dtypes\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MapUnstage"
+    argspec: "args=[\'key\', \'indices\', \'capacity\', \'memory_limit\', \'dtypes\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MapUnstageNoKey"
+    argspec: "args=[\'indices\', \'capacity\', \'memory_limit\', \'dtypes\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MatMul"
+    argspec: "args=[\'a\', \'b\', \'transpose_a\', \'transpose_b\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MatchingFiles"
+    argspec: "args=[\'pattern\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MatrixBandPart"
+    argspec: "args=[\'input\', \'num_lower\', \'num_upper\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MatrixDeterminant"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MatrixDiag"
+    argspec: "args=[\'diagonal\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MatrixDiagPart"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MatrixExponential"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MatrixInverse"
+    argspec: "args=[\'input\', \'adjoint\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MatrixLogarithm"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MatrixSetDiag"
+    argspec: "args=[\'input\', \'diagonal\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MatrixSolve"
+    argspec: "args=[\'matrix\', \'rhs\', \'adjoint\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MatrixSolveLs"
+    argspec: "args=[\'matrix\', \'rhs\', \'l2_regularizer\', \'fast\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MatrixSquareRoot"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MatrixTriangularSolve"
+    argspec: "args=[\'matrix\', \'rhs\', \'lower\', \'adjoint\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Max"
+    argspec: "args=[\'input\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MaxPool"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MaxPool3D"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MaxPool3DGrad"
+    argspec: "args=[\'orig_input\', \'orig_output\', \'grad\', \'ksize\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MaxPool3DGradGrad"
+    argspec: "args=[\'orig_input\', \'orig_output\', \'grad\', \'ksize\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MaxPoolGrad"
+    argspec: "args=[\'orig_input\', \'orig_output\', \'grad\', \'ksize\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MaxPoolGradGrad"
+    argspec: "args=[\'orig_input\', \'orig_output\', \'grad\', \'ksize\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MaxPoolGradGradV2"
+    argspec: "args=[\'orig_input\', \'orig_output\', \'grad\', \'ksize\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MaxPoolGradGradWithArgmax"
+    argspec: "args=[\'input\', \'grad\', \'argmax\', \'ksize\', \'strides\', \'padding\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MaxPoolGradV2"
+    argspec: "args=[\'orig_input\', \'orig_output\', \'grad\', \'ksize\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MaxPoolGradWithArgmax"
+    argspec: "args=[\'input\', \'grad\', \'argmax\', \'ksize\', \'strides\', \'padding\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MaxPoolV2"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MaxPoolWithArgmax"
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'Targmax\', \'padding\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Maximum"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Mean"
+    argspec: "args=[\'input\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Merge"
+    argspec: "args=[\'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MergeSummary"
+    argspec: "args=[\'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MergeV2Checkpoints"
+    argspec: "args=[\'checkpoint_prefixes\', \'destination_prefix\', \'delete_old_dirs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Mfcc"
+    argspec: "args=[\'spectrogram\', \'sample_rate\', \'upper_frequency_limit\', \'lower_frequency_limit\', \'filterbank_channel_count\', \'dct_coefficient_count\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Min"
+    argspec: "args=[\'input\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Minimum"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MirrorPad"
+    argspec: "args=[\'input\', \'paddings\', \'mode\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MirrorPadGrad"
+    argspec: "args=[\'input\', \'paddings\', \'mode\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Mod"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ModelDataset"
+    argspec: "args=[\'input_dataset\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Mul"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MulNoNan"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MultiDeviceIterator"
+    argspec: "args=[\'devices\', \'shared_name\', \'container\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MultiDeviceIteratorFromStringHandle"
+    argspec: "args=[\'string_handle\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MultiDeviceIteratorGetNextFromShard"
+    argspec: "args=[\'multi_device_iterator\', \'shard_num\', \'incarnation_id\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MultiDeviceIteratorInit"
+    argspec: "args=[\'dataset\', \'multi_device_iterator\', \'max_buffer_size\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MultiDeviceIteratorToStringHandle"
+    argspec: "args=[\'multi_device_iterator\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Multinomial"
+    argspec: "args=[\'logits\', \'num_samples\', \'seed\', \'seed2\', \'output_dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MutableDenseHashTable"
+    argspec: "args=[\'empty_key\', \'container\', \'shared_name\', \'use_node_name_sharing\', \'value_dtype\', \'value_shape\', \'initial_num_buckets\', \'max_load_factor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MutableDenseHashTableV2"
+    argspec: "args=[\'empty_key\', \'deleted_key\', \'container\', \'shared_name\', \'use_node_name_sharing\', \'value_dtype\', \'value_shape\', \'initial_num_buckets\', \'max_load_factor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MutableHashTable"
+    argspec: "args=[\'container\', \'shared_name\', \'use_node_name_sharing\', \'key_dtype\', \'value_dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MutableHashTableOfTensors"
+    argspec: "args=[\'container\', \'shared_name\', \'use_node_name_sharing\', \'key_dtype\', \'value_dtype\', \'value_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MutableHashTableOfTensorsV2"
+    argspec: "args=[\'container\', \'shared_name\', \'use_node_name_sharing\', \'key_dtype\', \'value_dtype\', \'value_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MutableHashTableV2"
+    argspec: "args=[\'container\', \'shared_name\', \'use_node_name_sharing\', \'key_dtype\', \'value_dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MutexLock"
+    argspec: "args=[\'mutex\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MutexV2"
+    argspec: "args=[\'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "NcclAllReduce"
+    argspec: "args=[\'input\', \'reduction\', \'num_devices\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "NcclBroadcast"
+    argspec: "args=[\'input\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "NcclReduce"
+    argspec: "args=[\'input\', \'reduction\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Neg"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "NextAfter"
+    argspec: "args=[\'x1\', \'x2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "NextIteration"
+    argspec: "args=[\'data\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "NoOp"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "NonMaxSuppression"
+    argspec: "args=[\'boxes\', \'scores\', \'max_output_size\', \'iou_threshold\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "NonMaxSuppressionV2"
+    argspec: "args=[\'boxes\', \'scores\', \'max_output_size\', \'iou_threshold\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "NonMaxSuppressionV3"
+    argspec: "args=[\'boxes\', \'scores\', \'max_output_size\', \'iou_threshold\', \'score_threshold\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "NonMaxSuppressionV4"
+    argspec: "args=[\'boxes\', \'scores\', \'max_output_size\', \'iou_threshold\', \'score_threshold\', \'pad_to_max_output_size\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "NonMaxSuppressionWithOverlaps"
+    argspec: "args=[\'overlaps\', \'scores\', \'max_output_size\', \'overlap_threshold\', \'score_threshold\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "NotEqual"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "NthElement"
+    argspec: "args=[\'input\', \'n\', \'reverse\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "OneHot"
+    argspec: "args=[\'indices\', \'depth\', \'on_value\', \'off_value\', \'axis\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "OneShotIterator"
+    argspec: "args=[\'dataset_factory\', \'output_types\', \'output_shapes\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "OnesLike"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "OptimizeDataset"
+    argspec: "args=[\'input_dataset\', \'optimizations\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "OptionalFromValue"
+    argspec: "args=[\'components\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "OptionalGetValue"
+    argspec: "args=[\'optional\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "OptionalHasValue"
+    argspec: "args=[\'optional\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "OptionalNone"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "OrderedMapClear"
+    argspec: "args=[\'capacity\', \'memory_limit\', \'dtypes\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "OrderedMapIncompleteSize"
+    argspec: "args=[\'capacity\', \'memory_limit\', \'dtypes\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "OrderedMapPeek"
+    argspec: "args=[\'key\', \'indices\', \'capacity\', \'memory_limit\', \'dtypes\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "OrderedMapSize"
+    argspec: "args=[\'capacity\', \'memory_limit\', \'dtypes\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "OrderedMapStage"
+    argspec: "args=[\'key\', \'indices\', \'values\', \'capacity\', \'memory_limit\', \'dtypes\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "OrderedMapUnstage"
+    argspec: "args=[\'key\', \'indices\', \'capacity\', \'memory_limit\', \'dtypes\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "OrderedMapUnstageNoKey"
+    argspec: "args=[\'indices\', \'capacity\', \'memory_limit\', \'dtypes\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Pack"
+    argspec: "args=[\'values\', \'axis\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Pad"
+    argspec: "args=[\'input\', \'paddings\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "PadV2"
+    argspec: "args=[\'input\', \'paddings\', \'constant_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "PaddedBatchDataset"
+    argspec: "args=[\'input_dataset\', \'batch_size\', \'padded_shapes\', \'padding_values\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "PaddedBatchDatasetV2"
+    argspec: "args=[\'input_dataset\', \'batch_size\', \'padded_shapes\', \'padding_values\', \'drop_remainder\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "PaddingFIFOQueue"
+    argspec: "args=[\'component_types\', \'shapes\', \'capacity\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "PaddingFIFOQueueV2"
+    argspec: "args=[\'component_types\', \'shapes\', \'capacity\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ParallelConcat"
+    argspec: "args=[\'values\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ParallelDynamicStitch"
+    argspec: "args=[\'indices\', \'data\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ParallelInterleaveDatasetV2"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'cycle_length\', \'block_length\', \'num_parallel_calls\', \'f\', \'output_types\', \'output_shapes\', \'sloppy\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ParallelMapDataset"
+    argspec: "args=[\'input_dataset\', \'other_arguments\', \'num_parallel_calls\', \'f\', \'output_types\', \'output_shapes\', \'use_inter_op_parallelism\', \'sloppy\', \'preserve_cardinality\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ParameterizedTruncatedNormal"
+    argspec: "args=[\'shape\', \'means\', \'stdevs\', \'minvals\', \'maxvals\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ParseExample"
+    argspec: "args=[\'serialized\', \'names\', \'sparse_keys\', \'dense_keys\', \'dense_defaults\', \'sparse_types\', \'dense_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ParseSequenceExample"
+    argspec: "args=[\'serialized\', \'debug_name\', \'context_dense_defaults\', \'feature_list_dense_missing_assumed_empty\', \'context_sparse_keys\', \'context_dense_keys\', \'feature_list_sparse_keys\', \'feature_list_dense_keys\', \'Ncontext_sparse\', \'Ncontext_dense\', \'Nfeature_list_sparse\', \'Nfeature_list_dense\', \'context_sparse_types\', \'feature_list_dense_types\', \'context_dense_shapes\', \'feature_list_sparse_types\', \'feature_list_dense_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ParseSingleExample"
+    argspec: "args=[\'serialized\', \'dense_defaults\', \'num_sparse\', \'sparse_keys\', \'dense_keys\', \'sparse_types\', \'dense_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ParseSingleSequenceExample"
+    argspec: "args=[\'serialized\', \'feature_list_dense_missing_assumed_empty\', \'context_sparse_keys\', \'context_dense_keys\', \'feature_list_sparse_keys\', \'feature_list_dense_keys\', \'context_dense_defaults\', \'debug_name\', \'context_sparse_types\', \'feature_list_dense_types\', \'context_dense_shapes\', \'feature_list_sparse_types\', \'feature_list_dense_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ParseTensor"
+    argspec: "args=[\'serialized\', \'out_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "PartitionedCall"
+    argspec: "args=[\'args\', \'Tout\', \'f\', \'config\', \'config_proto\', \'executor_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Placeholder"
+    argspec: "args=[\'dtype\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "PlaceholderV2"
+    argspec: "args=[\'dtype\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "PlaceholderWithDefault"
+    argspec: "args=[\'input\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Polygamma"
+    argspec: "args=[\'a\', \'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "PopulationCount"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Pow"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "PrefetchDataset"
+    argspec: "args=[\'input_dataset\', \'buffer_size\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "PreventGradient"
+    argspec: "args=[\'input\', \'message\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Print"
+    argspec: "args=[\'input\', \'data\', \'message\', \'first_n\', \'summarize\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "PrintV2"
+    argspec: "args=[\'input\', \'output_stream\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "PriorityQueue"
+    argspec: "args=[\'component_types\', \'shapes\', \'capacity\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "PriorityQueueV2"
+    argspec: "args=[\'component_types\', \'shapes\', \'capacity\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Prod"
+    argspec: "args=[\'input\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "PyFunc"
+    argspec: "args=[\'input\', \'token\', \'Tout\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "PyFuncStateless"
+    argspec: "args=[\'input\', \'token\', \'Tout\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Qr"
+    argspec: "args=[\'input\', \'full_matrices\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizeAndDequantize"
+    argspec: "args=[\'input\', \'signed_input\', \'num_bits\', \'range_given\', \'input_min\', \'input_max\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizeAndDequantizeV2"
+    argspec: "args=[\'input\', \'input_min\', \'input_max\', \'signed_input\', \'num_bits\', \'range_given\', \'round_mode\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizeAndDequantizeV3"
+    argspec: "args=[\'input\', \'input_min\', \'input_max\', \'num_bits\', \'signed_input\', \'range_given\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizeDownAndShrinkRange"
+    argspec: "args=[\'input\', \'input_min\', \'input_max\', \'out_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizeV2"
+    argspec: "args=[\'input\', \'min_range\', \'max_range\', \'T\', \'mode\', \'round_mode\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedAdd"
+    argspec: "args=[\'x\', \'y\', \'min_x\', \'max_x\', \'min_y\', \'max_y\', \'Toutput\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedAvgPool"
+    argspec: "args=[\'input\', \'min_input\', \'max_input\', \'ksize\', \'strides\', \'padding\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedBatchNormWithGlobalNormalization"
+    argspec: "args=[\'t\', \'t_min\', \'t_max\', \'m\', \'m_min\', \'m_max\', \'v\', \'v_min\', \'v_max\', \'beta\', \'beta_min\', \'beta_max\', \'gamma\', \'gamma_min\', \'gamma_max\', \'out_type\', \'variance_epsilon\', \'scale_after_normalization\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedBiasAdd"
+    argspec: "args=[\'input\', \'bias\', \'min_input\', \'max_input\', \'min_bias\', \'max_bias\', \'out_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedConcat"
+    argspec: "args=[\'concat_dim\', \'values\', \'input_mins\', \'input_maxes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedConv2D"
+    argspec: "args=[\'input\', \'filter\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'out_type\', \'strides\', \'padding\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedConv2DAndRelu"
+    argspec: "args=[\'input\', \'filter\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'out_type\', \'strides\', \'padding\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedConv2DAndReluAndRequantize"
+    argspec: "args=[\'input\', \'filter\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'min_freezed_output\', \'max_freezed_output\', \'out_type\', \'strides\', \'padding\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedConv2DAndRequantize"
+    argspec: "args=[\'input\', \'filter\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'min_freezed_output\', \'max_freezed_output\', \'out_type\', \'strides\', \'padding\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedConv2DWithBias"
+    argspec: "args=[\'input\', \'filter\', \'bias\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'out_type\', \'strides\', \'padding\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedConv2DWithBiasAndRelu"
+    argspec: "args=[\'input\', \'filter\', \'bias\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'out_type\', \'strides\', \'padding\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedConv2DWithBiasAndReluAndRequantize"
+    argspec: "args=[\'input\', \'filter\', \'bias\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'min_freezed_output\', \'max_freezed_output\', \'out_type\', \'strides\', \'padding\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedConv2DWithBiasAndRequantize"
+    argspec: "args=[\'input\', \'filter\', \'bias\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'min_freezed_output\', \'max_freezed_output\', \'out_type\', \'strides\', \'padding\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedConv2DWithBiasSignedSumAndReluAndRequantize"
+    argspec: "args=[\'input\', \'filter\', \'bias\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'min_freezed_output\', \'max_freezed_output\', \'summand\', \'min_summand\', \'max_summand\', \'out_type\', \'strides\', \'padding\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedConv2DWithBiasSumAndRelu"
+    argspec: "args=[\'input\', \'filter\', \'bias\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'summand\', \'out_type\', \'strides\', \'padding\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedConv2DWithBiasSumAndReluAndRequantize"
+    argspec: "args=[\'input\', \'filter\', \'bias\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'min_freezed_output\', \'max_freezed_output\', \'summand\', \'min_summand\', \'max_summand\', \'out_type\', \'strides\', \'padding\', \'dilations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedInstanceNorm"
+    argspec: "args=[\'x\', \'x_min\', \'x_max\', \'output_range_given\', \'given_y_min\', \'given_y_max\', \'variance_epsilon\', \'min_separation\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedMatMul"
+    argspec: "args=[\'a\', \'b\', \'min_a\', \'max_a\', \'min_b\', \'max_b\', \'Toutput\', \'transpose_a\', \'transpose_b\', \'Tactivation\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedMaxPool"
+    argspec: "args=[\'input\', \'min_input\', \'max_input\', \'ksize\', \'strides\', \'padding\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedMul"
+    argspec: "args=[\'x\', \'y\', \'min_x\', \'max_x\', \'min_y\', \'max_y\', \'Toutput\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedRelu"
+    argspec: "args=[\'features\', \'min_features\', \'max_features\', \'out_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedRelu6"
+    argspec: "args=[\'features\', \'min_features\', \'max_features\', \'out_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedReluX"
+    argspec: "args=[\'features\', \'max_value\', \'min_features\', \'max_features\', \'out_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedReshape"
+    argspec: "args=[\'tensor\', \'shape\', \'input_min\', \'input_max\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QuantizedResizeBilinear"
+    argspec: "args=[\'images\', \'size\', \'min\', \'max\', \'align_corners\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QueueClose"
+    argspec: "args=[\'handle\', \'cancel_pending_enqueues\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QueueCloseV2"
+    argspec: "args=[\'handle\', \'cancel_pending_enqueues\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QueueDequeue"
+    argspec: "args=[\'handle\', \'component_types\', \'timeout_ms\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QueueDequeueMany"
+    argspec: "args=[\'handle\', \'n\', \'component_types\', \'timeout_ms\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QueueDequeueManyV2"
+    argspec: "args=[\'handle\', \'n\', \'component_types\', \'timeout_ms\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QueueDequeueUpTo"
+    argspec: "args=[\'handle\', \'n\', \'component_types\', \'timeout_ms\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QueueDequeueUpToV2"
+    argspec: "args=[\'handle\', \'n\', \'component_types\', \'timeout_ms\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QueueDequeueV2"
+    argspec: "args=[\'handle\', \'component_types\', \'timeout_ms\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QueueEnqueue"
+    argspec: "args=[\'handle\', \'components\', \'timeout_ms\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QueueEnqueueMany"
+    argspec: "args=[\'handle\', \'components\', \'timeout_ms\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QueueEnqueueManyV2"
+    argspec: "args=[\'handle\', \'components\', \'timeout_ms\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QueueEnqueueV2"
+    argspec: "args=[\'handle\', \'components\', \'timeout_ms\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QueueIsClosed"
+    argspec: "args=[\'handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QueueIsClosedV2"
+    argspec: "args=[\'handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QueueSize"
+    argspec: "args=[\'handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "QueueSizeV2"
+    argspec: "args=[\'handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RFFT"
+    argspec: "args=[\'input\', \'fft_length\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RFFT2D"
+    argspec: "args=[\'input\', \'fft_length\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RFFT3D"
+    argspec: "args=[\'input\', \'fft_length\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RGBToHSV"
+    argspec: "args=[\'images\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RaggedGather"
+    argspec: "args=[\'params_nested_splits\', \'params_dense_values\', \'indices\', \'OUTPUT_RAGGED_RANK\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RaggedRange"
+    argspec: "args=[\'starts\', \'limits\', \'deltas\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RaggedTensorToSparse"
+    argspec: "args=[\'rt_nested_splits\', \'rt_dense_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RandomCrop"
+    argspec: "args=[\'image\', \'size\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RandomGamma"
+    argspec: "args=[\'shape\', \'alpha\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RandomGammaGrad"
+    argspec: "args=[\'alpha\', \'sample\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RandomPoisson"
+    argspec: "args=[\'shape\', \'rate\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RandomPoissonV2"
+    argspec: "args=[\'shape\', \'rate\', \'seed\', \'seed2\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RandomShuffle"
+    argspec: "args=[\'value\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RandomShuffleQueue"
+    argspec: "args=[\'component_types\', \'shapes\', \'capacity\', \'min_after_dequeue\', \'seed\', \'seed2\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RandomShuffleQueueV2"
+    argspec: "args=[\'component_types\', \'shapes\', \'capacity\', \'min_after_dequeue\', \'seed\', \'seed2\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RandomStandardNormal"
+    argspec: "args=[\'shape\', \'seed\', \'seed2\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RandomUniform"
+    argspec: "args=[\'shape\', \'seed\', \'seed2\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RandomUniformInt"
+    argspec: "args=[\'shape\', \'minval\', \'maxval\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Range"
+    argspec: "args=[\'start\', \'limit\', \'delta\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RangeDataset"
+    argspec: "args=[\'start\', \'stop\', \'step\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Rank"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReadFile"
+    argspec: "args=[\'filename\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReadVariableOp"
+    argspec: "args=[\'resource\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReaderNumRecordsProduced"
+    argspec: "args=[\'reader_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReaderNumRecordsProducedV2"
+    argspec: "args=[\'reader_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReaderNumWorkUnitsCompleted"
+    argspec: "args=[\'reader_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReaderNumWorkUnitsCompletedV2"
+    argspec: "args=[\'reader_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReaderRead"
+    argspec: "args=[\'reader_handle\', \'queue_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReaderReadUpTo"
+    argspec: "args=[\'reader_handle\', \'queue_handle\', \'num_records\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReaderReadUpToV2"
+    argspec: "args=[\'reader_handle\', \'queue_handle\', \'num_records\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReaderReadV2"
+    argspec: "args=[\'reader_handle\', \'queue_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReaderReset"
+    argspec: "args=[\'reader_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReaderResetV2"
+    argspec: "args=[\'reader_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReaderRestoreState"
+    argspec: "args=[\'reader_handle\', \'state\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReaderRestoreStateV2"
+    argspec: "args=[\'reader_handle\', \'state\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReaderSerializeState"
+    argspec: "args=[\'reader_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReaderSerializeStateV2"
+    argspec: "args=[\'reader_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Real"
+    argspec: "args=[\'input\', \'Tout\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RealDiv"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Reciprocal"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReciprocalGrad"
+    argspec: "args=[\'y\', \'dy\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RecordInput"
+    argspec: "args=[\'file_pattern\', \'file_random_seed\', \'file_shuffle_shift_ratio\', \'file_buffer_size\', \'file_parallelism\', \'batch_size\', \'compression_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReduceDataset"
+    argspec: "args=[\'input_dataset\', \'initial_state\', \'other_arguments\', \'f\', \'output_types\', \'output_shapes\', \'use_inter_op_parallelism\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReduceJoin"
+    argspec: "args=[\'inputs\', \'reduction_indices\', \'keep_dims\', \'separator\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RefEnter"
+    argspec: "args=[\'data\', \'frame_name\', \'is_constant\', \'parallel_iterations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RefExit"
+    argspec: "args=[\'data\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RefIdentity"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RefMerge"
+    argspec: "args=[\'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RefNextIteration"
+    argspec: "args=[\'data\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RefSelect"
+    argspec: "args=[\'index\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RefSwitch"
+    argspec: "args=[\'data\', \'pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RegexFullMatch"
+    argspec: "args=[\'input\', \'pattern\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RegexReplace"
+    argspec: "args=[\'input\', \'pattern\', \'rewrite\', \'replace_global\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Relu"
+    argspec: "args=[\'features\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Relu6"
+    argspec: "args=[\'features\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Relu6Grad"
+    argspec: "args=[\'gradients\', \'features\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReluGrad"
+    argspec: "args=[\'gradients\', \'features\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RemoteCall"
+    argspec: "args=[\'target\', \'args\', \'Tout\', \'f\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RepeatDataset"
+    argspec: "args=[\'input_dataset\', \'count\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RequantizationRange"
+    argspec: "args=[\'input\', \'input_min\', \'input_max\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RequantizationRangePerChannel"
+    argspec: "args=[\'input\', \'input_min\', \'input_max\', \'clip_value_max\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Requantize"
+    argspec: "args=[\'input\', \'input_min\', \'input_max\', \'requested_output_min\', \'requested_output_max\', \'out_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RequantizePerChannel"
+    argspec: "args=[\'input\', \'input_min\', \'input_max\', \'requested_output_min\', \'requested_output_max\', \'out_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Reshape"
+    argspec: "args=[\'tensor\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResizeArea"
+    argspec: "args=[\'images\', \'size\', \'align_corners\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResizeBicubic"
+    argspec: "args=[\'images\', \'size\', \'align_corners\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResizeBicubicGrad"
+    argspec: "args=[\'grads\', \'original_image\', \'align_corners\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResizeBilinear"
+    argspec: "args=[\'images\', \'size\', \'align_corners\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResizeBilinearGrad"
+    argspec: "args=[\'grads\', \'original_image\', \'align_corners\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResizeNearestNeighbor"
+    argspec: "args=[\'images\', \'size\', \'align_corners\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResizeNearestNeighborGrad"
+    argspec: "args=[\'grads\', \'size\', \'align_corners\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceApplyAdaMax"
+    argspec: "args=[\'var\', \'m\', \'v\', \'beta1_power\', \'lr\', \'beta1\', \'beta2\', \'epsilon\', \'grad\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceApplyAdadelta"
+    argspec: "args=[\'var\', \'accum\', \'accum_update\', \'lr\', \'rho\', \'epsilon\', \'grad\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceApplyAdagrad"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'grad\', \'use_locking\', \'update_slots\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceApplyAdagradDA"
+    argspec: "args=[\'var\', \'gradient_accumulator\', \'gradient_squared_accumulator\', \'grad\', \'lr\', \'l1\', \'l2\', \'global_step\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceApplyAdam"
+    argspec: "args=[\'var\', \'m\', \'v\', \'beta1_power\', \'beta2_power\', \'lr\', \'beta1\', \'beta2\', \'epsilon\', \'grad\', \'use_locking\', \'use_nesterov\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceApplyAdamWithAmsgrad"
+    argspec: "args=[\'var\', \'m\', \'v\', \'vhat\', \'beta1_power\', \'beta2_power\', \'lr\', \'beta1\', \'beta2\', \'epsilon\', \'grad\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceApplyAddSign"
+    argspec: "args=[\'var\', \'m\', \'lr\', \'alpha\', \'sign_decay\', \'beta\', \'grad\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceApplyCenteredRMSProp"
+    argspec: "args=[\'var\', \'mg\', \'ms\', \'mom\', \'lr\', \'rho\', \'momentum\', \'epsilon\', \'grad\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceApplyFtrl"
+    argspec: "args=[\'var\', \'accum\', \'linear\', \'grad\', \'lr\', \'l1\', \'l2\', \'lr_power\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceApplyFtrlV2"
+    argspec: "args=[\'var\', \'accum\', \'linear\', \'grad\', \'lr\', \'l1\', \'l2\', \'l2_shrinkage\', \'lr_power\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceApplyGradientDescent"
+    argspec: "args=[\'var\', \'alpha\', \'delta\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceApplyKerasMomentum"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'grad\', \'momentum\', \'use_locking\', \'use_nesterov\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceApplyMomentum"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'grad\', \'momentum\', \'use_locking\', \'use_nesterov\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceApplyPowerSign"
+    argspec: "args=[\'var\', \'m\', \'lr\', \'logbase\', \'sign_decay\', \'beta\', \'grad\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceApplyProximalAdagrad"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'l1\', \'l2\', \'grad\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceApplyProximalGradientDescent"
+    argspec: "args=[\'var\', \'alpha\', \'l1\', \'l2\', \'delta\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceApplyRMSProp"
+    argspec: "args=[\'var\', \'ms\', \'mom\', \'lr\', \'rho\', \'momentum\', \'epsilon\', \'grad\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceCountUpTo"
+    argspec: "args=[\'resource\', \'limit\', \'T\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceGather"
+    argspec: "args=[\'resource\', \'indices\', \'validate_indices\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceScatterAdd"
+    argspec: "args=[\'resource\', \'indices\', \'updates\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceScatterDiv"
+    argspec: "args=[\'resource\', \'indices\', \'updates\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceScatterMax"
+    argspec: "args=[\'resource\', \'indices\', \'updates\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceScatterMin"
+    argspec: "args=[\'resource\', \'indices\', \'updates\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceScatterMul"
+    argspec: "args=[\'resource\', \'indices\', \'updates\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceScatterNdAdd"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceScatterNdSub"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceScatterNdUpdate"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceScatterSub"
+    argspec: "args=[\'resource\', \'indices\', \'updates\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceScatterUpdate"
+    argspec: "args=[\'resource\', \'indices\', \'updates\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceSparseApplyAdadelta"
+    argspec: "args=[\'var\', \'accum\', \'accum_update\', \'lr\', \'rho\', \'epsilon\', \'grad\', \'indices\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceSparseApplyAdagrad"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'grad\', \'indices\', \'use_locking\', \'update_slots\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceSparseApplyAdagradDA"
+    argspec: "args=[\'var\', \'gradient_accumulator\', \'gradient_squared_accumulator\', \'grad\', \'indices\', \'lr\', \'l1\', \'l2\', \'global_step\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceSparseApplyCenteredRMSProp"
+    argspec: "args=[\'var\', \'mg\', \'ms\', \'mom\', \'lr\', \'rho\', \'momentum\', \'epsilon\', \'grad\', \'indices\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceSparseApplyFtrl"
+    argspec: "args=[\'var\', \'accum\', \'linear\', \'grad\', \'indices\', \'lr\', \'l1\', \'l2\', \'lr_power\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceSparseApplyFtrlV2"
+    argspec: "args=[\'var\', \'accum\', \'linear\', \'grad\', \'indices\', \'lr\', \'l1\', \'l2\', \'l2_shrinkage\', \'lr_power\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceSparseApplyKerasMomentum"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'grad\', \'indices\', \'momentum\', \'use_locking\', \'use_nesterov\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceSparseApplyMomentum"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'grad\', \'indices\', \'momentum\', \'use_locking\', \'use_nesterov\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceSparseApplyProximalAdagrad"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'l1\', \'l2\', \'grad\', \'indices\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceSparseApplyProximalGradientDescent"
+    argspec: "args=[\'var\', \'alpha\', \'l1\', \'l2\', \'grad\', \'indices\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceSparseApplyRMSProp"
+    argspec: "args=[\'var\', \'ms\', \'mom\', \'lr\', \'rho\', \'momentum\', \'epsilon\', \'grad\', \'indices\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ResourceStridedSliceAssign"
+    argspec: "args=[\'ref\', \'begin\', \'end\', \'strides\', \'value\', \'begin_mask\', \'end_mask\', \'ellipsis_mask\', \'new_axis_mask\', \'shrink_axis_mask\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Restore"
+    argspec: "args=[\'file_pattern\', \'tensor_name\', \'dt\', \'preferred_shard\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RestoreSlice"
+    argspec: "args=[\'file_pattern\', \'tensor_name\', \'shape_and_slice\', \'dt\', \'preferred_shard\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RestoreV2"
+    argspec: "args=[\'prefix\', \'tensor_names\', \'shape_and_slices\', \'dtypes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Reverse"
+    argspec: "args=[\'tensor\', \'dims\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReverseSequence"
+    argspec: "args=[\'input\', \'seq_lengths\', \'seq_dim\', \'batch_dim\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ReverseV2"
+    argspec: "args=[\'tensor\', \'axis\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RightShift"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Rint"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Roll"
+    argspec: "args=[\'input\', \'shift\', \'axis\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Round"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Rsqrt"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "RsqrtGrad"
+    argspec: "args=[\'y\', \'dy\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SampleDistortedBoundingBox"
+    argspec: "args=[\'image_size\', \'bounding_boxes\', \'seed\', \'seed2\', \'min_object_covered\', \'aspect_ratio_range\', \'area_range\', \'max_attempts\', \'use_image_if_no_bounding_boxes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SampleDistortedBoundingBoxV2"
+    argspec: "args=[\'image_size\', \'bounding_boxes\', \'min_object_covered\', \'seed\', \'seed2\', \'aspect_ratio_range\', \'area_range\', \'max_attempts\', \'use_image_if_no_bounding_boxes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Save"
+    argspec: "args=[\'filename\', \'tensor_names\', \'data\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SaveSlices"
+    argspec: "args=[\'filename\', \'tensor_names\', \'shapes_and_slices\', \'data\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SaveV2"
+    argspec: "args=[\'prefix\', \'tensor_names\', \'shape_and_slices\', \'tensors\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ScalarSummary"
+    argspec: "args=[\'tags\', \'values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ScaleAndTranslate"
+    argspec: "args=[\'images\', \'size\', \'scale\', \'translation\', \'kernel_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ScaleAndTranslateGrad"
+    argspec: "args=[\'grads\', \'original_image\', \'scale\', \'translation\', \'kernel_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ScatterAdd"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ScatterDiv"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ScatterMax"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ScatterMin"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ScatterMul"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ScatterNd"
+    argspec: "args=[\'indices\', \'updates\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ScatterNdAdd"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ScatterNdNonAliasingAdd"
+    argspec: "args=[\'input\', \'indices\', \'updates\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ScatterNdSub"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ScatterNdUpdate"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ScatterSub"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ScatterUpdate"
+    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SdcaFprint"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SdcaOptimizer"
+    argspec: "args=[\'sparse_example_indices\', \'sparse_feature_indices\', \'sparse_feature_values\', \'dense_features\', \'example_weights\', \'example_labels\', \'sparse_indices\', \'sparse_weights\', \'dense_weights\', \'example_state_data\', \'loss_type\', \'adaptative\', \'l1\', \'l2\', \'num_loss_partitions\', \'num_inner_iterations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SdcaOptimizerV2"
+    argspec: "args=[\'sparse_example_indices\', \'sparse_feature_indices\', \'sparse_feature_values\', \'dense_features\', \'example_weights\', \'example_labels\', \'sparse_indices\', \'sparse_weights\', \'dense_weights\', \'example_state_data\', \'loss_type\', \'adaptive\', \'l1\', \'l2\', \'num_loss_partitions\', \'num_inner_iterations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SdcaShrinkL1"
+    argspec: "args=[\'weights\', \'l1\', \'l2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SegmentMax"
+    argspec: "args=[\'data\', \'segment_ids\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SegmentMean"
+    argspec: "args=[\'data\', \'segment_ids\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SegmentMin"
+    argspec: "args=[\'data\', \'segment_ids\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SegmentProd"
+    argspec: "args=[\'data\', \'segment_ids\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SegmentSum"
+    argspec: "args=[\'data\', \'segment_ids\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Select"
+    argspec: "args=[\'condition\', \'t\', \'e\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SelfAdjointEig"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SelfAdjointEigV2"
+    argspec: "args=[\'input\', \'compute_v\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Selu"
+    argspec: "args=[\'features\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SeluGrad"
+    argspec: "args=[\'gradients\', \'outputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SerializeIterator"
+    argspec: "args=[\'resource_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SerializeManySparse"
+    argspec: "args=[\'sparse_indices\', \'sparse_values\', \'sparse_shape\', \'out_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SerializeSparse"
+    argspec: "args=[\'sparse_indices\', \'sparse_values\', \'sparse_shape\', \'out_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SerializeTensor"
+    argspec: "args=[\'tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SetSize"
+    argspec: "args=[\'set_indices\', \'set_values\', \'set_shape\', \'validate_indices\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Shape"
+    argspec: "args=[\'input\', \'out_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ShapeN"
+    argspec: "args=[\'input\', \'out_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ShardDataset"
+    argspec: "args=[\'input_dataset\', \'num_shards\', \'index\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ShardedFilename"
+    argspec: "args=[\'basename\', \'shard\', \'num_shards\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ShardedFilespec"
+    argspec: "args=[\'basename\', \'num_shards\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ShuffleAndRepeatDataset"
+    argspec: "args=[\'input_dataset\', \'buffer_size\', \'seed\', \'seed2\', \'count\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ShuffleDataset"
+    argspec: "args=[\'input_dataset\', \'buffer_size\', \'seed\', \'seed2\', \'reshuffle_each_iteration\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Sigmoid"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SigmoidGrad"
+    argspec: "args=[\'y\', \'dy\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Sign"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Sin"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Sinh"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Size"
+    argspec: "args=[\'input\', \'out_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SkipDataset"
+    argspec: "args=[\'input_dataset\', \'count\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Slice"
+    argspec: "args=[\'input\', \'begin\', \'size\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Snapshot"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Softmax"
+    argspec: "args=[\'logits\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SoftmaxCrossEntropyWithLogits"
+    argspec: "args=[\'features\', \'labels\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Softplus"
+    argspec: "args=[\'features\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SoftplusGrad"
+    argspec: "args=[\'gradients\', \'features\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Softsign"
+    argspec: "args=[\'features\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SoftsignGrad"
+    argspec: "args=[\'gradients\', \'features\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SpaceToBatch"
+    argspec: "args=[\'input\', \'paddings\', \'block_size\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SpaceToBatchND"
+    argspec: "args=[\'input\', \'block_shape\', \'paddings\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SpaceToDepth"
+    argspec: "args=[\'input\', \'block_size\', \'data_format\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseAccumulatorApplyGradient"
+    argspec: "args=[\'handle\', \'local_step\', \'gradient_indices\', \'gradient_values\', \'gradient_shape\', \'has_known_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseAccumulatorTakeGradient"
+    argspec: "args=[\'handle\', \'num_required\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseAdd"
+    argspec: "args=[\'a_indices\', \'a_values\', \'a_shape\', \'b_indices\', \'b_values\', \'b_shape\', \'thresh\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseAddGrad"
+    argspec: "args=[\'backprop_val_grad\', \'a_indices\', \'b_indices\', \'sum_indices\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseApplyAdadelta"
+    argspec: "args=[\'var\', \'accum\', \'accum_update\', \'lr\', \'rho\', \'epsilon\', \'grad\', \'indices\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseApplyAdagrad"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'grad\', \'indices\', \'use_locking\', \'update_slots\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseApplyAdagradDA"
+    argspec: "args=[\'var\', \'gradient_accumulator\', \'gradient_squared_accumulator\', \'grad\', \'indices\', \'lr\', \'l1\', \'l2\', \'global_step\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseApplyCenteredRMSProp"
+    argspec: "args=[\'var\', \'mg\', \'ms\', \'mom\', \'lr\', \'rho\', \'momentum\', \'epsilon\', \'grad\', \'indices\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseApplyFtrl"
+    argspec: "args=[\'var\', \'accum\', \'linear\', \'grad\', \'indices\', \'lr\', \'l1\', \'l2\', \'lr_power\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseApplyFtrlV2"
+    argspec: "args=[\'var\', \'accum\', \'linear\', \'grad\', \'indices\', \'lr\', \'l1\', \'l2\', \'l2_shrinkage\', \'lr_power\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseApplyMomentum"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'grad\', \'indices\', \'momentum\', \'use_locking\', \'use_nesterov\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseApplyProximalAdagrad"
+    argspec: "args=[\'var\', \'accum\', \'lr\', \'l1\', \'l2\', \'grad\', \'indices\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseApplyProximalGradientDescent"
+    argspec: "args=[\'var\', \'alpha\', \'l1\', \'l2\', \'grad\', \'indices\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseApplyRMSProp"
+    argspec: "args=[\'var\', \'ms\', \'mom\', \'lr\', \'rho\', \'momentum\', \'epsilon\', \'grad\', \'indices\', \'use_locking\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseConcat"
+    argspec: "args=[\'indices\', \'values\', \'shapes\', \'concat_dim\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseConditionalAccumulator"
+    argspec: "args=[\'dtype\', \'shape\', \'container\', \'shared_name\', \'reduction_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseCross"
+    argspec: "args=[\'indices\', \'values\', \'shapes\', \'dense_inputs\', \'hashed_output\', \'num_buckets\', \'hash_key\', \'out_type\', \'internal_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseDenseCwiseAdd"
+    argspec: "args=[\'sp_indices\', \'sp_values\', \'sp_shape\', \'dense\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseDenseCwiseDiv"
+    argspec: "args=[\'sp_indices\', \'sp_values\', \'sp_shape\', \'dense\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseDenseCwiseMul"
+    argspec: "args=[\'sp_indices\', \'sp_values\', \'sp_shape\', \'dense\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseFillEmptyRows"
+    argspec: "args=[\'indices\', \'values\', \'dense_shape\', \'default_value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseFillEmptyRowsGrad"
+    argspec: "args=[\'reverse_index_map\', \'grad_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseMatMul"
+    argspec: "args=[\'a\', \'b\', \'transpose_a\', \'transpose_b\', \'a_is_sparse\', \'b_is_sparse\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseReduceMax"
+    argspec: "args=[\'input_indices\', \'input_values\', \'input_shape\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseReduceMaxSparse"
+    argspec: "args=[\'input_indices\', \'input_values\', \'input_shape\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseReduceSum"
+    argspec: "args=[\'input_indices\', \'input_values\', \'input_shape\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseReduceSumSparse"
+    argspec: "args=[\'input_indices\', \'input_values\', \'input_shape\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseReorder"
+    argspec: "args=[\'input_indices\', \'input_values\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseReshape"
+    argspec: "args=[\'input_indices\', \'input_shape\', \'new_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseSegmentMean"
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseSegmentMeanGrad"
+    argspec: "args=[\'grad\', \'indices\', \'segment_ids\', \'output_dim0\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseSegmentMeanWithNumSegments"
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'num_segments\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseSegmentSqrtN"
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseSegmentSqrtNGrad"
+    argspec: "args=[\'grad\', \'indices\', \'segment_ids\', \'output_dim0\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseSegmentSqrtNWithNumSegments"
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'num_segments\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseSegmentSum"
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseSegmentSumWithNumSegments"
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'num_segments\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseSlice"
+    argspec: "args=[\'indices\', \'values\', \'shape\', \'start\', \'size\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseSliceGrad"
+    argspec: "args=[\'backprop_val_grad\', \'input_indices\', \'input_start\', \'output_indices\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseSoftmax"
+    argspec: "args=[\'sp_indices\', \'sp_values\', \'sp_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseSoftmaxCrossEntropyWithLogits"
+    argspec: "args=[\'features\', \'labels\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseSparseMaximum"
+    argspec: "args=[\'a_indices\', \'a_values\', \'a_shape\', \'b_indices\', \'b_values\', \'b_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseSparseMinimum"
+    argspec: "args=[\'a_indices\', \'a_values\', \'a_shape\', \'b_indices\', \'b_values\', \'b_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseSplit"
+    argspec: "args=[\'split_dim\', \'indices\', \'values\', \'shape\', \'num_split\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseTensorDenseAdd"
+    argspec: "args=[\'a_indices\', \'a_values\', \'a_shape\', \'b\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseTensorDenseMatMul"
+    argspec: "args=[\'a_indices\', \'a_values\', \'a_shape\', \'b\', \'adjoint_a\', \'adjoint_b\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseTensorSliceDataset"
+    argspec: "args=[\'indices\', \'values\', \'dense_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseToDense"
+    argspec: "args=[\'sparse_indices\', \'output_shape\', \'sparse_values\', \'default_value\', \'validate_indices\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SparseToSparseSetOperation"
+    argspec: "args=[\'set1_indices\', \'set1_values\', \'set1_shape\', \'set2_indices\', \'set2_values\', \'set2_shape\', \'set_operation\', \'validate_indices\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Split"
+    argspec: "args=[\'split_dim\', \'value\', \'num_split\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SplitV"
+    argspec: "args=[\'value\', \'size_splits\', \'split_dim\', \'num_split\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Sqrt"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SqrtGrad"
+    argspec: "args=[\'y\', \'dy\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Square"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SquaredDifference"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Squeeze"
+    argspec: "args=[\'input\', \'squeeze_dims\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Stack"
+    argspec: "args=[\'elem_type\', \'stack_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StackClose"
+    argspec: "args=[\'handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StackCloseV2"
+    argspec: "args=[\'handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StackPop"
+    argspec: "args=[\'handle\', \'elem_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StackPopV2"
+    argspec: "args=[\'handle\', \'elem_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StackPush"
+    argspec: "args=[\'handle\', \'elem\', \'swap_memory\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StackPushV2"
+    argspec: "args=[\'handle\', \'elem\', \'swap_memory\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StackV2"
+    argspec: "args=[\'max_size\', \'elem_type\', \'stack_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Stage"
+    argspec: "args=[\'values\', \'capacity\', \'memory_limit\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StageClear"
+    argspec: "args=[\'capacity\', \'memory_limit\', \'dtypes\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StagePeek"
+    argspec: "args=[\'index\', \'capacity\', \'memory_limit\', \'dtypes\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StageSize"
+    argspec: "args=[\'capacity\', \'memory_limit\', \'dtypes\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StatefulPartitionedCall"
+    argspec: "args=[\'args\', \'Tout\', \'f\', \'config\', \'config_proto\', \'executor_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StatelessIf"
+    argspec: "args=[\'cond\', \'input\', \'Tout\', \'then_branch\', \'else_branch\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StatelessMultinomial"
+    argspec: "args=[\'logits\', \'num_samples\', \'seed\', \'output_dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StatelessRandomNormal"
+    argspec: "args=[\'shape\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StatelessRandomUniform"
+    argspec: "args=[\'shape\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StatelessRandomUniformInt"
+    argspec: "args=[\'shape\', \'seed\', \'minval\', \'maxval\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StatelessTruncatedNormal"
+    argspec: "args=[\'shape\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StatelessWhile"
+    argspec: "args=[\'input\', \'cond\', \'body\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StaticRegexFullMatch"
+    argspec: "args=[\'input\', \'pattern\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StaticRegexReplace"
+    argspec: "args=[\'input\', \'pattern\', \'rewrite\', \'replace_global\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StopGradient"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StridedSlice"
+    argspec: "args=[\'input\', \'begin\', \'end\', \'strides\', \'begin_mask\', \'end_mask\', \'ellipsis_mask\', \'new_axis_mask\', \'shrink_axis_mask\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StridedSliceAssign"
+    argspec: "args=[\'ref\', \'begin\', \'end\', \'strides\', \'value\', \'begin_mask\', \'end_mask\', \'ellipsis_mask\', \'new_axis_mask\', \'shrink_axis_mask\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StridedSliceGrad"
+    argspec: "args=[\'shape\', \'begin\', \'end\', \'strides\', \'dy\', \'begin_mask\', \'end_mask\', \'ellipsis_mask\', \'new_axis_mask\', \'shrink_axis_mask\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StringFormat"
+    argspec: "args=[\'inputs\', \'template\', \'placeholder\', \'summarize\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StringJoin"
+    argspec: "args=[\'inputs\', \'separator\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StringLength"
+    argspec: "args=[\'input\', \'unit\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StringSplit"
+    argspec: "args=[\'input\', \'delimiter\', \'skip_empty\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StringSplitV2"
+    argspec: "args=[\'input\', \'sep\', \'maxsplit\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StringStrip"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StringToHashBucket"
+    argspec: "args=[\'string_tensor\', \'num_buckets\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StringToHashBucketFast"
+    argspec: "args=[\'input\', \'num_buckets\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StringToHashBucketStrong"
+    argspec: "args=[\'input\', \'num_buckets\', \'key\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "StringToNumber"
+    argspec: "args=[\'string_tensor\', \'out_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Sub"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Substr"
+    argspec: "args=[\'input\', \'pos\', \'len\', \'unit\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Sum"
+    argspec: "args=[\'input\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SummaryWriter"
+    argspec: "args=[\'shared_name\', \'container\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Svd"
+    argspec: "args=[\'input\', \'compute_uv\', \'full_matrices\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Switch"
+    argspec: "args=[\'data\', \'pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "SymbolicGradient"
+    argspec: "args=[\'input\', \'Tout\', \'f\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TFRecordDataset"
+    argspec: "args=[\'filenames\', \'compression_type\', \'buffer_size\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TFRecordReader"
+    argspec: "args=[\'container\', \'shared_name\', \'compression_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TFRecordReaderV2"
+    argspec: "args=[\'container\', \'shared_name\', \'compression_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TakeDataset"
+    argspec: "args=[\'input_dataset\', \'count\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TakeManySparseFromTensorsMap"
+    argspec: "args=[\'sparse_handles\', \'dtype\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Tan"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Tanh"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TanhGrad"
+    argspec: "args=[\'y\', \'dy\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TemporaryVariable"
+    argspec: "args=[\'shape\', \'dtype\', \'var_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArray"
+    argspec: "args=[\'size\', \'dtype\', \'dynamic_size\', \'clear_after_read\', \'tensor_array_name\', \'element_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayClose"
+    argspec: "args=[\'handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayCloseV2"
+    argspec: "args=[\'handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayCloseV3"
+    argspec: "args=[\'handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayConcat"
+    argspec: "args=[\'handle\', \'flow_in\', \'dtype\', \'element_shape_except0\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayConcatV2"
+    argspec: "args=[\'handle\', \'flow_in\', \'dtype\', \'element_shape_except0\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayConcatV3"
+    argspec: "args=[\'handle\', \'flow_in\', \'dtype\', \'element_shape_except0\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayGather"
+    argspec: "args=[\'handle\', \'indices\', \'flow_in\', \'dtype\', \'element_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayGatherV2"
+    argspec: "args=[\'handle\', \'indices\', \'flow_in\', \'dtype\', \'element_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayGatherV3"
+    argspec: "args=[\'handle\', \'indices\', \'flow_in\', \'dtype\', \'element_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayGrad"
+    argspec: "args=[\'handle\', \'flow_in\', \'source\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayGradV2"
+    argspec: "args=[\'handle\', \'flow_in\', \'source\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayGradV3"
+    argspec: "args=[\'handle\', \'flow_in\', \'source\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayGradWithShape"
+    argspec: "args=[\'handle\', \'flow_in\', \'shape_to_prepend\', \'source\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayPack"
+    argspec: "args=[\'handle\', \'flow_in\', \'dtype\', \'element_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayRead"
+    argspec: "args=[\'handle\', \'index\', \'flow_in\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayReadV2"
+    argspec: "args=[\'handle\', \'index\', \'flow_in\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayReadV3"
+    argspec: "args=[\'handle\', \'index\', \'flow_in\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayScatter"
+    argspec: "args=[\'handle\', \'indices\', \'value\', \'flow_in\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayScatterV2"
+    argspec: "args=[\'handle\', \'indices\', \'value\', \'flow_in\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayScatterV3"
+    argspec: "args=[\'handle\', \'indices\', \'value\', \'flow_in\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArraySize"
+    argspec: "args=[\'handle\', \'flow_in\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArraySizeV2"
+    argspec: "args=[\'handle\', \'flow_in\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArraySizeV3"
+    argspec: "args=[\'handle\', \'flow_in\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArraySplit"
+    argspec: "args=[\'handle\', \'value\', \'lengths\', \'flow_in\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArraySplitV2"
+    argspec: "args=[\'handle\', \'value\', \'lengths\', \'flow_in\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArraySplitV3"
+    argspec: "args=[\'handle\', \'value\', \'lengths\', \'flow_in\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayUnpack"
+    argspec: "args=[\'handle\', \'value\', \'flow_in\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayV2"
+    argspec: "args=[\'size\', \'dtype\', \'element_shape\', \'dynamic_size\', \'clear_after_read\', \'tensor_array_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayV3"
+    argspec: "args=[\'size\', \'dtype\', \'element_shape\', \'dynamic_size\', \'clear_after_read\', \'identical_element_shapes\', \'tensor_array_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayWrite"
+    argspec: "args=[\'handle\', \'index\', \'value\', \'flow_in\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayWriteV2"
+    argspec: "args=[\'handle\', \'index\', \'value\', \'flow_in\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorArrayWriteV3"
+    argspec: "args=[\'handle\', \'index\', \'value\', \'flow_in\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorDataset"
+    argspec: "args=[\'components\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorListConcat"
+    argspec: "args=[\'input_handle\', \'element_dtype\', \'element_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorListConcatLists"
+    argspec: "args=[\'input_a\', \'input_b\', \'element_dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorListConcatV2"
+    argspec: "args=[\'input_handle\', \'element_shape\', \'leading_dims\', \'element_dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorListElementShape"
+    argspec: "args=[\'input_handle\', \'shape_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorListFromTensor"
+    argspec: "args=[\'tensor\', \'element_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorListGather"
+    argspec: "args=[\'input_handle\', \'indices\', \'element_shape\', \'element_dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorListGetItem"
+    argspec: "args=[\'input_handle\', \'index\', \'element_shape\', \'element_dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorListLength"
+    argspec: "args=[\'input_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorListPopBack"
+    argspec: "args=[\'input_handle\', \'element_shape\', \'element_dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorListPushBack"
+    argspec: "args=[\'input_handle\', \'tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorListPushBackBatch"
+    argspec: "args=[\'input_handles\', \'tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorListReserve"
+    argspec: "args=[\'element_shape\', \'num_elements\', \'element_dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorListResize"
+    argspec: "args=[\'input_handle\', \'size\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorListScatter"
+    argspec: "args=[\'tensor\', \'indices\', \'element_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorListScatterIntoExistingList"
+    argspec: "args=[\'input_handle\', \'tensor\', \'indices\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorListScatterV2"
+    argspec: "args=[\'tensor\', \'indices\', \'element_shape\', \'num_elements\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorListSetItem"
+    argspec: "args=[\'input_handle\', \'index\', \'item\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorListSplit"
+    argspec: "args=[\'tensor\', \'element_shape\', \'lengths\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorListStack"
+    argspec: "args=[\'input_handle\', \'element_shape\', \'element_dtype\', \'num_elements\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorScatterAdd"
+    argspec: "args=[\'tensor\', \'indices\', \'updates\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorScatterSub"
+    argspec: "args=[\'tensor\', \'indices\', \'updates\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorScatterUpdate"
+    argspec: "args=[\'tensor\', \'indices\', \'updates\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorSliceDataset"
+    argspec: "args=[\'components\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorSummary"
+    argspec: "args=[\'tensor\', \'description\', \'labels\', \'display_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TensorSummaryV2"
+    argspec: "args=[\'tag\', \'tensor\', \'serialized_summary_metadata\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TextLineDataset"
+    argspec: "args=[\'filenames\', \'compression_type\', \'buffer_size\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TextLineReader"
+    argspec: "args=[\'skip_header_lines\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TextLineReaderV2"
+    argspec: "args=[\'skip_header_lines\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ThreadUnsafeUnigramCandidateSampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Tile"
+    argspec: "args=[\'input\', \'multiples\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TileGrad"
+    argspec: "args=[\'input\', \'multiples\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Timestamp"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TopK"
+    argspec: "args=[\'input\', \'k\', \'sorted\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TopKV2"
+    argspec: "args=[\'input\', \'k\', \'sorted\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Transpose"
+    argspec: "args=[\'x\', \'perm\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TridiagonalSolve"
+    argspec: "args=[\'diagonals\', \'rhs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TruncateDiv"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TruncateMod"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "TruncatedNormal"
+    argspec: "args=[\'shape\', \'seed\', \'seed2\', \'dtype\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "UnicodeDecode"
+    argspec: "args=[\'input\', \'input_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "UnicodeDecodeWithOffsets"
+    argspec: "args=[\'input\', \'input_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "UnicodeEncode"
+    argspec: "args=[\'input_values\', \'input_splits\', \'errors\', \'output_encoding\', \'replacement_char\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "UnicodeScript"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "UnicodeTranscode"
+    argspec: "args=[\'input\', \'input_encoding\', \'output_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "UniformCandidateSampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'seed2\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Unique"
+    argspec: "args=[\'x\', \'out_idx\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "UniqueV2"
+    argspec: "args=[\'x\', \'axis\', \'out_idx\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "UniqueWithCounts"
+    argspec: "args=[\'x\', \'out_idx\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "UniqueWithCountsV2"
+    argspec: "args=[\'x\', \'axis\', \'out_idx\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Unpack"
+    argspec: "args=[\'value\', \'num\', \'axis\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "UnravelIndex"
+    argspec: "args=[\'indices\', \'dims\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "UnsortedSegmentMax"
+    argspec: "args=[\'data\', \'segment_ids\', \'num_segments\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "UnsortedSegmentMin"
+    argspec: "args=[\'data\', \'segment_ids\', \'num_segments\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "UnsortedSegmentProd"
+    argspec: "args=[\'data\', \'segment_ids\', \'num_segments\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "UnsortedSegmentSum"
+    argspec: "args=[\'data\', \'segment_ids\', \'num_segments\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Unstage"
+    argspec: "args=[\'capacity\', \'memory_limit\', \'dtypes\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "UnwrapDatasetVariant"
+    argspec: "args=[\'input_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "UpperBound"
+    argspec: "args=[\'sorted_inputs\', \'values\', \'out_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "VarHandleOp"
+    argspec: "args=[\'container\', \'shared_name\', \'dtype\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "VarIsInitializedOp"
+    argspec: "args=[\'resource\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Variable"
+    argspec: "args=[\'shape\', \'dtype\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "VariableShape"
+    argspec: "args=[\'input\', \'out_type\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "VariableV2"
+    argspec: "args=[\'shape\', \'dtype\', \'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Where"
+    argspec: "args=[\'input\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "While"
+    argspec: "args=[\'input\', \'cond\', \'body\', \'output_shapes\', \'parallel_iterations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "WholeFileReader"
+    argspec: "args=[\'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "WholeFileReaderV2"
+    argspec: "args=[\'container\', \'shared_name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "WindowDataset"
+    argspec: "args=[\'input_dataset\', \'size\', \'shift\', \'stride\', \'drop_remainder\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "WrapDatasetVariant"
+    argspec: "args=[\'input_handle\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "WriteAudioSummary"
+    argspec: "args=[\'writer\', \'step\', \'tag\', \'tensor\', \'sample_rate\', \'max_outputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "WriteFile"
+    argspec: "args=[\'filename\', \'contents\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "WriteGraphSummary"
+    argspec: "args=[\'writer\', \'step\', \'tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "WriteHistogramSummary"
+    argspec: "args=[\'writer\', \'step\', \'tag\', \'values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "WriteImageSummary"
+    argspec: "args=[\'writer\', \'step\', \'tag\', \'tensor\', \'bad_color\', \'max_images\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "WriteScalarSummary"
+    argspec: "args=[\'writer\', \'step\', \'tag\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "WriteSummary"
+    argspec: "args=[\'writer\', \'step\', \'tensor\', \'tag\', \'summary_metadata\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Xdivy"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Xlogy"
+    argspec: "args=[\'x\', \'y\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ZerosLike"
+    argspec: "args=[\'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "Zeta"
+    argspec: "args=[\'x\', \'q\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "ZipDataset"
+    argspec: "args=[\'input_datasets\', \'output_types\', \'output_shapes\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.rnn.-dropout-wrapper.pbtxt
similarity index 89%
rename from tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
rename to tensorflow/tools/api/golden/v2/tensorflow.rnn.-dropout-wrapper.pbtxt
index 9836433d08cba809107f9bb5dbccf2e971865b8a..53b61f03f96a1d4b4ef81032d716f3e913f0fd44 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.rnn.-dropout-wrapper.pbtxt
@@ -1,10 +1,14 @@
-path: "tensorflow.nn.rnn_cell.DropoutWrapper"
+path: "tensorflow.rnn.DropoutWrapper"
 tf_class {
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.DropoutWrapperV2\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl._RNNCellWrapperV2\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.DropoutWrapper\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl._RNNCellWrapperV1\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +18,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
@@ -132,11 +140,11 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'inputs_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "compute_mask"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.rnn.-residual-wrapper.pbtxt
similarity index 89%
rename from tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
rename to tensorflow/tools/api/golden/v2/tensorflow.rnn.-residual-wrapper.pbtxt
index 1f7840ab919baeeb0077904592ba8dcc1d4c91fb..9a303dfd6fde05e724d4012d1f9c09d69cc658a4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.rnn.-residual-wrapper.pbtxt
@@ -1,10 +1,14 @@
-path: "tensorflow.nn.rnn_cell.ResidualWrapper"
+path: "tensorflow.rnn.ResidualWrapper"
 tf_class {
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.ResidualWrapperV2\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl._RNNCellWrapperV2\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.ResidualWrapper\'>"
+  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl._RNNCellWrapperV1\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "activity_regularizer"
@@ -14,6 +18,10 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "dynamic"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "graph"
     mtype: "<type \'property\'>"
@@ -128,11 +136,11 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'_\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'inputs_shape\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "compute_mask"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.rnn.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.rnn.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..32be6c7018feee84d961afe37ef1eaea333abc73
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.rnn.pbtxt
@@ -0,0 +1,11 @@
+path: "tensorflow.rnn"
+tf_module {
+  member {
+    name: "DropoutWrapper"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "ResidualWrapper"
+    mtype: "<type \'type\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.saved_model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.pbtxt
index 63bebb20bcae08c645d9aaaecab2ea2de4cc49aa..17275d4d8d7f4cebdca43ed0f2fed6a9841a7ea2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.saved_model.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.pbtxt
@@ -77,7 +77,7 @@ tf_module {
     mtype: "<type \'str\'>"
   }
   member {
-    name: "TRANING"
+    name: "TRAINING"
     mtype: "<type \'str\'>"
   }
   member {
@@ -88,29 +88,13 @@ tf_module {
     name: "VARIABLES_FILENAME"
     mtype: "<type \'str\'>"
   }
-  member_method {
-    name: "build_signature_def"
-    argspec: "args=[\'inputs\', \'outputs\', \'method_name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "classification_signature_def"
-    argspec: "args=[\'examples\', \'classes\', \'scores\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "contains_saved_model"
     argspec: "args=[\'export_dir\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
-    name: "is_valid_signature"
-    argspec: "args=[\'signature_def\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "predict_signature_def"
-    argspec: "args=[\'inputs\', \'outputs\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "regression_signature_def"
-    argspec: "args=[\'examples\', \'predictions\'], varargs=None, keywords=None, defaults=None"
+    name: "load"
+    argspec: "args=[\'export_dir\', \'tags\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "save"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.sparse.-sparse-tensor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.sparse.-sparse-tensor.pbtxt
index 02e59a63e10b1a24bfe0c275044bf807b433f62e..a31689a58bceb91ccfb3fa91d8b778c6c25cc929 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.sparse.-sparse-tensor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.sparse.-sparse-tensor.pbtxt
@@ -2,6 +2,7 @@ path: "tensorflow.sparse.SparseTensor"
 tf_class {
   is_instance: "<class \'tensorflow.python.framework.sparse_tensor.SparseTensor\'>"
   is_instance: "<class \'tensorflow.python.framework.ops._TensorLike\'>"
+  is_instance: "<class \'tensorflow.python.framework.composite_tensor.CompositeTensor\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "dense_shape"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
index f6e32ed08c8339413374c11c6fc75aec92bffec2..e2da65eee41905c7b7c67eade11e1775a2408ca0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
@@ -1,5 +1,9 @@
 path: "tensorflow.strings"
 tf_module {
+  member_method {
+    name: "as_string"
+    argspec: "args=[\'input\', \'precision\', \'scientific\', \'shortest\', \'width\', \'fill\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'False\', \'False\', \'-1\', \'\', \'None\'], "
+  }
   member_method {
     name: "format"
     argspec: "args=[\'template\', \'inputs\', \'placeholder\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'{}\', \'3\', \'None\'], "
@@ -52,6 +56,14 @@ tf_module {
     name: "to_number"
     argspec: "args=[\'input\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
   }
+  member_method {
+    name: "unicode_decode"
+    argspec: "args=[\'input\', \'input_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "unicode_decode_with_offsets"
+    argspec: "args=[\'input\', \'input_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'False\', \'None\'], "
+  }
   member_method {
     name: "unicode_encode"
     argspec: "args=[\'input\', \'output_encoding\', \'errors\', \'replacement_char\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'None\'], "
@@ -60,6 +72,14 @@ tf_module {
     name: "unicode_script"
     argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "unicode_split"
+    argspec: "args=[\'input\', \'input_encoding\', \'errors\', \'replacement_char\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'None\'], "
+  }
+  member_method {
+    name: "unicode_split_with_offsets"
+    argspec: "args=[\'input\', \'input_encoding\', \'errors\', \'replacement_char\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'None\'], "
+  }
   member_method {
     name: "unicode_transcode"
     argspec: "args=[\'input\', \'input_encoding\', \'output_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'False\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.-event.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-event.pbtxt
deleted file mode 100644
index eb99d0f5334457aa654fed0553af143839328dba..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.summary.-event.pbtxt
+++ /dev/null
@@ -1,74 +0,0 @@
-path: "tensorflow.summary.Event"
-tf_proto {
-  descriptor {
-    name: "Event"
-    field {
-      name: "wall_time"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_DOUBLE
-    }
-    field {
-      name: "step"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "file_version"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-      oneof_index: 0
-    }
-    field {
-      name: "graph_def"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_BYTES
-      oneof_index: 0
-    }
-    field {
-      name: "summary"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.Summary"
-      oneof_index: 0
-    }
-    field {
-      name: "log_message"
-      number: 6
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.LogMessage"
-      oneof_index: 0
-    }
-    field {
-      name: "session_log"
-      number: 7
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.SessionLog"
-      oneof_index: 0
-    }
-    field {
-      name: "tagged_run_metadata"
-      number: 8
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.TaggedRunMetadata"
-      oneof_index: 0
-    }
-    field {
-      name: "meta_graph_def"
-      number: 9
-      label: LABEL_OPTIONAL
-      type: TYPE_BYTES
-      oneof_index: 0
-    }
-    oneof_decl {
-      name: "what"
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.-file-writer-cache.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-file-writer-cache.pbtxt
deleted file mode 100644
index 2a5b63dceae3c0ac27b34c2e896ee3b90bbd7f75..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.summary.-file-writer-cache.pbtxt
+++ /dev/null
@@ -1,16 +0,0 @@
-path: "tensorflow.summary.FileWriterCache"
-tf_class {
-  is_instance: "<class \'tensorflow.python.summary.writer.writer_cache.FileWriterCache\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "clear"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get"
-    argspec: "args=[\'logdir\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.-file-writer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-file-writer.pbtxt
deleted file mode 100644
index 6b65b0ace3cf7740ab03390841c941592000d127..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.summary.-file-writer.pbtxt
+++ /dev/null
@@ -1,50 +0,0 @@
-path: "tensorflow.summary.FileWriter"
-tf_class {
-  is_instance: "<class \'tensorflow.python.summary.writer.writer.FileWriter\'>"
-  is_instance: "<class \'tensorflow.python.summary.writer.writer.SummaryToEventTransformer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'logdir\', \'graph\', \'max_queue\', \'flush_secs\', \'graph_def\', \'filename_suffix\', \'session\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'120\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_event"
-    argspec: "args=[\'self\', \'event\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "add_graph"
-    argspec: "args=[\'self\', \'graph\', \'global_step\', \'graph_def\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_meta_graph"
-    argspec: "args=[\'self\', \'meta_graph_def\', \'global_step\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_run_metadata"
-    argspec: "args=[\'self\', \'run_metadata\', \'tag\', \'global_step\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_session_log"
-    argspec: "args=[\'self\', \'session_log\', \'global_step\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "add_summary"
-    argspec: "args=[\'self\', \'summary\', \'global_step\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "close"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "flush"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_logdir"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "reopen"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary-description.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary-description.pbtxt
deleted file mode 100644
index 4a8b59cf02ed46ef70f22564f3134214840600fe..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary-description.pbtxt
+++ /dev/null
@@ -1,12 +0,0 @@
-path: "tensorflow.summary.SummaryDescription"
-tf_proto {
-  descriptor {
-    name: "SummaryDescription"
-    field {
-      name: "type_hint"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.-audio.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.-audio.pbtxt
deleted file mode 100644
index 8b271cf58fc11c8666abd456021afeedc0b14c7a..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.-audio.pbtxt
+++ /dev/null
@@ -1,36 +0,0 @@
-path: "tensorflow.summary.Summary.Audio"
-tf_proto {
-  descriptor {
-    name: "Audio"
-    field {
-      name: "sample_rate"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_FLOAT
-    }
-    field {
-      name: "num_channels"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "length_frames"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "encoded_audio_string"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_BYTES
-    }
-    field {
-      name: "content_type"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.-image.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.-image.pbtxt
deleted file mode 100644
index dbbc02dd0506dbcebd1690602b5786b02c3ed4a0..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.-image.pbtxt
+++ /dev/null
@@ -1,30 +0,0 @@
-path: "tensorflow.summary.Summary.Image"
-tf_proto {
-  descriptor {
-    name: "Image"
-    field {
-      name: "height"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "width"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "colorspace"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "encoded_image_string"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_BYTES
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.-value.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.-value.pbtxt
deleted file mode 100644
index 4176171cd938e383fe5366153364d8e8e8c1a1ee..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.-value.pbtxt
+++ /dev/null
@@ -1,74 +0,0 @@
-path: "tensorflow.summary.Summary.Value"
-tf_proto {
-  descriptor {
-    name: "Value"
-    field {
-      name: "node_name"
-      number: 7
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "tag"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "metadata"
-      number: 9
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.SummaryMetadata"
-    }
-    field {
-      name: "simple_value"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_FLOAT
-      oneof_index: 0
-    }
-    field {
-      name: "obsolete_old_style_histogram"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_BYTES
-      oneof_index: 0
-    }
-    field {
-      name: "image"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.Summary.Image"
-      oneof_index: 0
-    }
-    field {
-      name: "histo"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.HistogramProto"
-      oneof_index: 0
-    }
-    field {
-      name: "audio"
-      number: 6
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.Summary.Audio"
-      oneof_index: 0
-    }
-    field {
-      name: "tensor"
-      number: 8
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.TensorProto"
-      oneof_index: 0
-    }
-    oneof_decl {
-      name: "value"
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.pbtxt
deleted file mode 100644
index d6c5e3a87a115b9bdcfd044abe93177eda2af275..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary.pbtxt
+++ /dev/null
@@ -1,144 +0,0 @@
-path: "tensorflow.summary.Summary"
-tf_proto {
-  descriptor {
-    name: "Summary"
-    field {
-      name: "value"
-      number: 1
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.Summary.Value"
-    }
-    nested_type {
-      name: "Image"
-      field {
-        name: "height"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_INT32
-      }
-      field {
-        name: "width"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_INT32
-      }
-      field {
-        name: "colorspace"
-        number: 3
-        label: LABEL_OPTIONAL
-        type: TYPE_INT32
-      }
-      field {
-        name: "encoded_image_string"
-        number: 4
-        label: LABEL_OPTIONAL
-        type: TYPE_BYTES
-      }
-    }
-    nested_type {
-      name: "Audio"
-      field {
-        name: "sample_rate"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_FLOAT
-      }
-      field {
-        name: "num_channels"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_INT64
-      }
-      field {
-        name: "length_frames"
-        number: 3
-        label: LABEL_OPTIONAL
-        type: TYPE_INT64
-      }
-      field {
-        name: "encoded_audio_string"
-        number: 4
-        label: LABEL_OPTIONAL
-        type: TYPE_BYTES
-      }
-      field {
-        name: "content_type"
-        number: 5
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-    }
-    nested_type {
-      name: "Value"
-      field {
-        name: "node_name"
-        number: 7
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "tag"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "metadata"
-        number: 9
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.SummaryMetadata"
-      }
-      field {
-        name: "simple_value"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_FLOAT
-        oneof_index: 0
-      }
-      field {
-        name: "obsolete_old_style_histogram"
-        number: 3
-        label: LABEL_OPTIONAL
-        type: TYPE_BYTES
-        oneof_index: 0
-      }
-      field {
-        name: "image"
-        number: 4
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.Summary.Image"
-        oneof_index: 0
-      }
-      field {
-        name: "histo"
-        number: 5
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.HistogramProto"
-        oneof_index: 0
-      }
-      field {
-        name: "audio"
-        number: 6
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.Summary.Audio"
-        oneof_index: 0
-      }
-      field {
-        name: "tensor"
-        number: 8
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.TensorProto"
-        oneof_index: 0
-      }
-      oneof_decl {
-        name: "value"
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.-tagged-run-metadata.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-tagged-run-metadata.pbtxt
deleted file mode 100644
index 27c8873320403cb2e7402ef9f1bb0e7134d5f96b..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.summary.-tagged-run-metadata.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-path: "tensorflow.summary.TaggedRunMetadata"
-tf_proto {
-  descriptor {
-    name: "TaggedRunMetadata"
-    field {
-      name: "tag"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "run_metadata"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_BYTES
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt
index 5cf4d7cfd9ac54eeccea5094ad789aede29540b8..85edef9d7e4c62a0a4f0c77f19e17d8c45535fc6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt
@@ -1,32 +1,12 @@
 path: "tensorflow.summary"
 tf_module {
-  member {
-    name: "Event"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "FileWriter"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "FileWriterCache"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "Summary"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "SummaryDescription"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
   member {
     name: "SummaryWriter"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "TaggedRunMetadata"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+  member_method {
+    name: "audio"
+    argspec: "args=[\'name\', \'data\', \'sample_rate\', \'step\', \'max_outputs\', \'encoding\', \'description\'], varargs=None, keywords=None, defaults=[\'3\', \'None\', \'None\'], "
   }
   member_method {
     name: "create_file_writer"
@@ -36,8 +16,36 @@ tf_module {
     name: "flush"
     argspec: "args=[\'writer\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
+  member_method {
+    name: "histogram"
+    argspec: "args=[\'name\', \'data\', \'step\', \'buckets\', \'description\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "image"
+    argspec: "args=[\'name\', \'data\', \'step\', \'max_outputs\', \'description\'], varargs=None, keywords=None, defaults=[\'3\', \'None\'], "
+  }
   member_method {
     name: "import_event"
     argspec: "args=[\'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "record_if"
+    argspec: "args=[\'condition\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "scalar"
+    argspec: "args=[\'name\', \'data\', \'step\', \'description\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "summary_scope"
+    argspec: "args=[\'name\', \'default_name\', \'values\'], varargs=None, keywords=None, defaults=[\'summary\', \'None\'], "
+  }
+  member_method {
+    name: "text"
+    argspec: "args=[\'name\', \'data\', \'step\', \'description\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "write"
+    argspec: "args=[\'tag\', \'tensor\', \'step\', \'metadata\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.test.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.test.pbtxt
index 980e96ac254aebf229ae52d98f607ed87d334e7a..ac9dd8f7189799cbf9b061677cd88058cb9d799e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.test.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.test.pbtxt
@@ -10,12 +10,16 @@ tf_module {
   }
   member_method {
     name: "assert_equal_graph_def"
-    argspec: "args=[\'actual\', \'expected\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'expected\', \'actual\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "benchmark_config"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_gradient"
+    argspec: "args=[\'f\', \'x\', \'delta\'], varargs=None, keywords=None, defaults=[\'0.001\'], "
+  }
   member_method {
     name: "create_local_cluster"
     argspec: "args=[\'num_workers\', \'num_ps\', \'protocol\', \'worker_config\', \'ps_config\'], varargs=None, keywords=None, defaults=[\'grpc\', \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint.pbtxt
index 5be37200f368b1823093c67ad7042db534b0df93..629bc13612ab567006436bd95fee49c4e3acdefe 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.train.Checkpoint"
 tf_class {
-  is_instance: "<class \'tensorflow.python.training.checkpointable.util.Checkpoint\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.tracking.Checkpointable\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.util.Checkpoint\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.tracking.AutoTrackable\'>"
+  is_instance: "<class \'tensorflow.python.training.tracking.base.Trackable\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "save_counter"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt
deleted file mode 100644
index 7d982dc51f6edce1cf691671e31ddd07664f0dc1..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-proximal-gradient-descent-optimizer.pbtxt
+++ /dev/null
@@ -1,51 +0,0 @@
-path: "tensorflow.train.ProximalGradientDescentOptimizer"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.proximal_gradient_descent.ProximalGradientDescentOptimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "GATE_GRAPH"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_NONE"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_OP"
-    mtype: "<type \'int\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'learning_rate\', \'l1_regularization_strength\', \'l2_regularization_strength\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'0.0\', \'False\', \'ProximalGradientDescent\'], "
-  }
-  member_method {
-    name: "apply_gradients"
-    argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "compute_gradients"
-    argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "get_name"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot"
-    argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot_names"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "minimize"
-    argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "variables"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt
index 8c327f88f32357bc15b1cdcbbc2ffad674063f6b..4f293fb40d42c9169d667d2dbe117373c27ec893 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt
@@ -60,54 +60,22 @@ tf_module {
     name: "JobDef"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
-  member {
-    name: "ProximalGradientDescentOptimizer"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "SequenceExample"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
-  member {
-    name: "Server"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "ServerDef"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
-  member {
-    name: "SessionRunHook"
-    mtype: "<type \'type\'>"
-  }
-  member_method {
-    name: "cosine_decay"
-    argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'alpha\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\'], "
-  }
-  member_method {
-    name: "cosine_decay_restarts"
-    argspec: "args=[\'learning_rate\', \'global_step\', \'first_decay_steps\', \'t_mul\', \'m_mul\', \'alpha\', \'name\'], varargs=None, keywords=None, defaults=[\'2.0\', \'1.0\', \'0.0\', \'None\'], "
-  }
-  member_method {
-    name: "exponential_decay"
-    argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'decay_rate\', \'staircase\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
-  }
   member_method {
     name: "get_checkpoint_state"
     argspec: "args=[\'checkpoint_dir\', \'latest_filename\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "inverse_time_decay"
-    argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'decay_rate\', \'staircase\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
-  }
   member_method {
     name: "latest_checkpoint"
     argspec: "args=[\'checkpoint_dir\', \'latest_filename\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "linear_cosine_decay"
-    argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'num_periods\', \'alpha\', \'beta\', \'name\'], varargs=None, keywords=None, defaults=[\'0.5\', \'0.0\', \'0.001\', \'None\'], "
-  }
   member_method {
     name: "list_variables"
     argspec: "args=[\'ckpt_dir_or_file\'], varargs=None, keywords=None, defaults=None"
@@ -120,22 +88,6 @@ tf_module {
     name: "load_variable"
     argspec: "args=[\'ckpt_dir_or_file\', \'name\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "natural_exp_decay"
-    argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'decay_rate\', \'staircase\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
-  }
-  member_method {
-    name: "noisy_linear_cosine_decay"
-    argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'initial_variance\', \'variance_decay\', \'num_periods\', \'alpha\', \'beta\', \'name\'], varargs=None, keywords=None, defaults=[\'1.0\', \'0.55\', \'0.5\', \'0.0\', \'0.001\', \'None\'], "
-  }
-  member_method {
-    name: "piecewise_constant_decay"
-    argspec: "args=[\'x\', \'boundaries\', \'values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "polynomial_decay"
-    argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'end_learning_rate\', \'power\', \'cycle\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0001\', \'1.0\', \'False\', \'None\'], "
-  }
   member_method {
     name: "sdca_fprint"
     argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -148,8 +100,4 @@ tf_module {
     name: "sdca_shrink_l1"
     argspec: "args=[\'weights\', \'l1\', \'l2\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "summary_iterator"
-    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
-  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.truncated_normal_initializer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.truncated_normal_initializer.pbtxt
deleted file mode 100644
index c1e1c230a9f79e87294eb6038f870726a0ba85a4..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.truncated_normal_initializer.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-path: "tensorflow.truncated_normal_initializer"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.TruncatedNormal\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'mean\', \'stddev\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \'None\', \"<dtype: \'float32\'>\"], "
-  }
-  member_method {
-    name: "from_config"
-    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_config"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.zeros_initializer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.zeros_initializer.pbtxt
index e229b02ceec6739974d3b4ae2bb02ef273398c45..53c5ac89cf79527522ae2f1cc69b451c405c90d4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.zeros_initializer.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.zeros_initializer.pbtxt
@@ -1,11 +1,10 @@
 path: "tensorflow.zeros_initializer"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Zeros\'>"
-  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Zeros\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops_v2.Initializer\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'dtype\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\"], "
   }
   member_method {
     name: "from_config"
diff --git a/tensorflow/tools/api/tests/BUILD b/tensorflow/tools/api/tests/BUILD
index 3cbea41dcab4f8453ef2598fbfd4f8bee65c9b65..b81ce692bbdb28f7c039c32af7f803423d7e86c1 100644
--- a/tensorflow/tools/api/tests/BUILD
+++ b/tensorflow/tools/api/tests/BUILD
@@ -15,11 +15,7 @@ load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
 
 py_test(
     name = "api_compatibility_test",
-    srcs = [
-        "api_compatibility_test.py",
-        "//tensorflow:tf_python_api_gen_v2",
-    ],
-    args = ["--only_test_core_api=true"],
+    srcs = ["api_compatibility_test.py"],
     data = [
         "//tensorflow/tools/api/golden:api_golden_v1",
         "//tensorflow/tools/api/golden:api_golden_v2",
@@ -27,6 +23,7 @@ py_test(
         "//tensorflow/tools/api/tests:README.txt",
     ],
     srcs_version = "PY2AND3",
+    tags = ["no_rocm"],
     deps = [
         "//tensorflow:tensorflow_py",
         "//tensorflow/python:client_testlib",
@@ -35,6 +32,7 @@ py_test(
         "//tensorflow/tools/api/lib:python_object_to_proto_visitor",
         "//tensorflow/tools/common:public_api",
         "//tensorflow/tools/common:traverse",
+        "@six_archive//:six",
     ],
 )
 
diff --git a/tensorflow/tools/api/tests/api_compatibility_test.py b/tensorflow/tools/api/tests/api_compatibility_test.py
index 723fceef413d86675e885debd37e73e5facd7f7c..fe52a2bde7df16c4ae2ad911fa89bf82890c42a6 100644
--- a/tensorflow/tools/api/tests/api_compatibility_test.py
+++ b/tensorflow/tools/api/tests/api_compatibility_test.py
@@ -32,8 +32,8 @@ import os
 import re
 import sys
 
+import six
 import tensorflow as tf
-from tensorflow._api.v2 import v2 as tf_v2
 
 from google.protobuf import message
 from google.protobuf import text_format
@@ -59,7 +59,7 @@ _UPDATE_GOLDENS_HELP = """
 # DEFINE_boolean, only_test_core_api, default False:
 _ONLY_TEST_CORE_API_HELP = """
     Some TF APIs are being moved outside of the tensorflow/ directory. There is
-    no garuntee which versions of these APIs will be present when running this
+    no guarantee which versions of these APIs will be present when running this
     test. Therefore, do not error out on API changes in non-core TF code
     if this flag is set.
 """
@@ -78,6 +78,13 @@ _UPDATE_WARNING_FILE = 'tensorflow/tools/api/tests/API_UPDATE_WARNING.txt'
 _NON_CORE_PACKAGES = ['estimator']
 
 
+# TODO(annarev): remove this once we test with newer version of
+# estimator that actually has compat v1 version.
+if not hasattr(tf.compat.v1, 'estimator'):
+  tf.compat.v1.estimator = tf.estimator
+  tf.compat.v2.estimator = tf.estimator
+
+
 def _KeyToFilePath(key, api_version):
   """From a given key, construct a filepath.
 
@@ -135,6 +142,29 @@ def _FilterNonCoreGoldenFiles(golden_file_list):
   return filtered_file_list
 
 
+def _FilterGoldenProtoDict(golden_proto_dict, omit_golden_symbols_map):
+  """Filter out golden proto dict symbols that should be omitted."""
+  if not omit_golden_symbols_map:
+    return golden_proto_dict
+  filtered_proto_dict = dict(golden_proto_dict)
+  for key, symbol_list in six.iteritems(omit_golden_symbols_map):
+    api_object = api_objects_pb2.TFAPIObject()
+    api_object.CopyFrom(filtered_proto_dict[key])
+    filtered_proto_dict[key] = api_object
+    module_or_class = None
+    if api_object.HasField('tf_module'):
+      module_or_class = api_object.tf_module
+    elif api_object.HasField('tf_class'):
+      module_or_class = api_object.tf_class
+    if module_or_class is not None:
+      for members in (module_or_class.member, module_or_class.member_method):
+        filtered_members = [m for m in members if m.name not in symbol_list]
+        # Two steps because protobuf repeated fields disallow slice assignment.
+        del members[:]
+        members.extend(filtered_members)
+  return filtered_proto_dict
+
+
 class ApiCompatibilityTest(test.TestCase):
 
   def __init__(self, *args, **kwargs):
@@ -256,7 +286,7 @@ class ApiCompatibilityTest(test.TestCase):
     visitor.do_not_descend_map['tf'].append('contrib')
     if FLAGS.only_test_core_api:
       visitor.do_not_descend_map['tf'].extend(_NON_CORE_PACKAGES)
-    traverse.traverse(tf_v2.compat.v1, visitor)
+    traverse.traverse(tf.compat.v1, visitor)
 
   def testNoSubclassOfMessageV2(self):
     if not hasattr(tf.compat, 'v2'):
@@ -265,13 +295,14 @@ class ApiCompatibilityTest(test.TestCase):
     visitor.do_not_descend_map['tf'].append('contrib')
     if FLAGS.only_test_core_api:
       visitor.do_not_descend_map['tf'].extend(_NON_CORE_PACKAGES)
-    traverse.traverse(tf_v2, visitor)
+    traverse.traverse(tf.compat.v2, visitor)
 
   def _checkBackwardsCompatibility(self,
                                    root,
                                    golden_file_pattern,
                                    api_version,
-                                   additional_private_map=None):
+                                   additional_private_map=None,
+                                   omit_golden_symbols_map=None):
     # Extract all API stuff.
     visitor = python_object_to_proto_visitor.PythonObjectToProtoVisitor()
 
@@ -304,6 +335,8 @@ class ApiCompatibilityTest(test.TestCase):
         _FileNameToKey(filename): _ReadFileToProto(filename)
         for filename in golden_file_list
     }
+    golden_proto_dict = _FilterGoldenProtoDict(golden_proto_dict,
+                                               omit_golden_symbols_map)
 
     # Diff them. Do not fail if called with update.
     # If the test is run to update goldens, only report diffs but do not fail.
@@ -316,7 +349,7 @@ class ApiCompatibilityTest(test.TestCase):
 
   @test_util.run_v1_only('b/120545219')
   def testAPIBackwardsCompatibility(self):
-    api_version = 1
+    api_version = 2 if '_api.v2' in tf.__name__ else 1
     golden_file_pattern = os.path.join(
         resource_loader.get_root_dir_with_all_resources(),
         _KeyToFilePath('*', api_version))
@@ -339,7 +372,7 @@ class ApiCompatibilityTest(test.TestCase):
     golden_file_pattern = os.path.join(
         resource_loader.get_root_dir_with_all_resources(),
         _KeyToFilePath('*', api_version))
-    self._checkBackwardsCompatibility(tf_v2.compat.v1, golden_file_pattern,
+    self._checkBackwardsCompatibility(tf.compat.v1, golden_file_pattern,
                                       api_version)
 
   def testAPIBackwardsCompatibilityV2(self):
@@ -347,11 +380,17 @@ class ApiCompatibilityTest(test.TestCase):
     golden_file_pattern = os.path.join(
         resource_loader.get_root_dir_with_all_resources(),
         _KeyToFilePath('*', api_version))
+    omit_golden_symbols_map = {}
+    if FLAGS.only_test_core_api:
+      # In TF 2.0 these summary symbols are imported from TensorBoard.
+      omit_golden_symbols_map['tensorflow.summary'] = [
+          'audio', 'histogram', 'image', 'scalar', 'text']
     self._checkBackwardsCompatibility(
-        tf_v2,
+        tf.compat.v2,
         golden_file_pattern,
         api_version,
-        additional_private_map={'tf.compat': ['v1']})
+        additional_private_map={'tf.compat': ['v1', 'v2']},
+        omit_golden_symbols_map=omit_golden_symbols_map)
 
 
 if __name__ == '__main__':
@@ -363,7 +402,7 @@ if __name__ == '__main__':
   parser.add_argument(
       '--only_test_core_api',
       type=bool,
-      default=False,
+      default=True,  # only_test_core_api default value
       help=_ONLY_TEST_CORE_API_HELP)
   parser.add_argument(
       '--verbose_diffs', type=bool, default=True, help=_VERBOSE_DIFFS_HELP)
diff --git a/tensorflow/tools/benchmark/benchmark_model.cc b/tensorflow/tools/benchmark/benchmark_model.cc
index de93b12b97081feea5be96edf3b6e6dfbe5599b4..e5187ab8727b2af1853972417c2fb1b890b59a4a 100644
--- a/tensorflow/tools/benchmark/benchmark_model.cc
+++ b/tensorflow/tools/benchmark/benchmark_model.cc
@@ -254,6 +254,7 @@ Status InitializeSession(int num_threads, const string& graph,
   tensorflow::ConfigProto& config = options.config;
   if (num_threads > 0) {
     config.set_intra_op_parallelism_threads(num_threads);
+    config.set_inter_op_parallelism_threads(num_threads);
   }
   LOG(INFO) << "Got config, " << config.device_count_size() << " devices";
 
diff --git a/tensorflow/tools/ci_build/Dockerfile.debian.jessie.cpu b/tensorflow/tools/ci_build/Dockerfile.debian.jessie.cpu
index eb9d0d4dd01c8b39fd108c88d690a2c08efa3760..ad82c88b4a6fa88d5375e66d44e31c2f4e17cea4 100644
--- a/tensorflow/tools/ci_build/Dockerfile.debian.jessie.cpu
+++ b/tensorflow/tools/ci_build/Dockerfile.debian.jessie.cpu
@@ -5,7 +5,7 @@ LABEL maintainer="Jan Prach <jendap@google.com>"
 # Copy and run the install scripts.
 COPY install/*.sh /install/
 RUN /install/install_bootstrap_deb_packages.sh
-RUN echo "deb http://http.debian.net/debian jessie-backports main" | \
+RUN echo "deb http://www.debian.net/debian jessie-backports main" | \
     tee -a /etc/apt/sources.list
 # Workaround bug in Jessie backport repository deb packages
 # http://serverfault.com/questions/830636/cannot-install-openjdk-8-jre-headless-on-debian-jessie
diff --git a/tensorflow/tools/ci_build/Dockerfile.gpu b/tensorflow/tools/ci_build/Dockerfile.gpu
index a4cad4b6c65c35651e58495c8f1b8b4c5b5f38d8..f5a28ff16352d5428ac698f2cc7f73b0b1ba3394 100644
--- a/tensorflow/tools/ci_build/Dockerfile.gpu
+++ b/tensorflow/tools/ci_build/Dockerfile.gpu
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
+FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu16.04
 
 LABEL maintainer="Jan Prach <jendap@google.com>"
 
@@ -7,6 +7,12 @@ LABEL maintainer="Jan Prach <jendap@google.com>"
 RUN cp -P /usr/include/cudnn.h /usr/local/cuda/include
 RUN cp -P /usr/lib/x86_64-linux-gnu/libcudnn* /usr/local/cuda/lib64
 
+# Installs TensorRT, which is not included in NVIDIA Docker containers.
+RUN apt-get update \
+        && apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda10.0 \
+        && apt-get update \
+        && apt-get install -y --no-install-recommends libnvinfer-dev=5.0.2-1+cuda10.0
+
 # Copy and run the install scripts.
 COPY install/*.sh /install/
 ARG DEBIAN_FRONTEND=noninteractive
@@ -24,7 +30,7 @@ COPY install/.bazelrc /etc/bazel.bazelrc
 ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
 
 # Link NCCL libray and header where the build script expects them.
-RUN mkdir /usr/local/cuda-9.0/lib &&  \
+RUN mkdir /usr/local/cuda/lib &&  \
     ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/local/cuda/lib/libnccl.so.2 && \
     ln -s /usr/include/nccl.h /usr/local/cuda/include/nccl.h
 
diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cpu b/tensorflow/tools/ci_build/Dockerfile.rbe.cpu
index 7e5860aeec186d908e5d2884bd690b2e5e43cffa..500fb6e0b3a995a91f0faf6555e2e248babbfda1 100644
--- a/tensorflow/tools/ci_build/Dockerfile.rbe.cpu
+++ b/tensorflow/tools/ci_build/Dockerfile.rbe.cpu
@@ -1,3 +1,8 @@
+# To push a new version, run:
+# $ docker build -f Dockerfile.rbe.cpu \
+#       --tag "gcr.io/tensorflow-testing/nosla-ubuntu16.04" .
+# $ docker push gcr.io/tensorflow-testing/nosla-ubuntu16.04
+
 FROM launcher.gcr.io/google/rbe-ubuntu16-04:r327695
 LABEL maintainer="Yu Yi <yiyu@google.com>"
 
diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.0-cudnn7-ubuntu14.04 b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.0-cudnn7-ubuntu14.04
index 03de89b7176b702cf8fdee84bb4372002ad94707..c6099c9e45115bfb84be6d3721fbf62088614801 100644
--- a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.0-cudnn7-ubuntu14.04
+++ b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.0-cudnn7-ubuntu14.04
@@ -1,9 +1,9 @@
 # To push a new version, run:
 # $ docker build -f Dockerfile.rbe.cuda10.0-cudnn7-ubuntu14.04 \
-#       --tag "gcr.io/asci-toolchain/nosla-cuda10.0-cudnn7-ubuntu14.04" .
-# $ docker push gcr.io/asci-toolchain/nosla-cuda10.0-cudnn7-ubuntu14.04
+#       --tag "gcr.io/tensorflow-testing/nosla-cuda10.0-cudnn7-ubuntu14.04" .
+# $ docker push gcr.io/tensorflow-testing/nosla-cuda10.0-cudnn7-ubuntu14.04
 
-FROM ubuntu:14.04
+FROM gcr.io/clang-docker-builder/clang-ubuntu14_04
 LABEL maintainer="Manuel Klimek <klimek@google.com>"
 
 RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates apt-transport-https gnupg-curl && \
@@ -19,7 +19,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates
 ENV CUDA_VERSION 10.0.130
 ENV CUDA_PKG_VERSION 10-0=$CUDA_VERSION-1
 ENV CUDNN_VERSION 7.3.1.20
-ENV NCCL_VERSION 2.3.5
+ENV TENSORRT_VERSION 5.0.2
 ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
 ENV NVIDIA_REQUIRE_CUDA "cuda>=10.0,driver>=410"
 ENV NVIDIA_VISIBLE_DEVICES all
@@ -47,21 +47,29 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libcudnn7=$CUDNN_VERSION-1+cuda10.0 \
         libcudnn7=$CUDNN_VERSION-1+cuda10.0 \
         libcudnn7-dev=$CUDNN_VERSION-1+cuda10.0 \
-        libnccl2=$NCCL_VERSION-2+cuda10.0 \
-        libnccl-dev=$NCCL_VERSION-2+cuda10.0 && \
+        nvinfer-runtime-trt-repo-ubuntu1604-$TENSORRT_VERSION-ga-cuda10.0 && \
+    apt-get update && apt-get install -y --no-install-recommends \
+        libnvinfer5=$TENSORRT_VERSION-1+cuda10.0 \
+        libnvinfer-dev=$TENSORRT_VERSION-1+cuda10.0 && \
     ln -s cuda-10.0 /usr/local/cuda && \
     apt-mark hold libcudnn7 && \
-    apt-mark hold libnccl2 && \
     rm -rf /var/lib/apt/lists/*
 
 # TODO(b/110903506): Provide a link to the SONAME of libcuda.so.
 # https://github.com/NVIDIA/nvidia-docker/issues/775
 RUN ln -s libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1
 
-# TODO(klimek): Once the TODO in tensorflow's configure.py to correctly find
-# libnccl is resolved, delete this block.
-RUN ln -s /usr/lib/x86_64-linux-gnu/libnccl.so /usr/lib/libnccl.so \
- && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so /usr/lib/libnccl.so.2
+# Install a newer version of g++:
+# - we need a new libstdc++, because new clang versions do not work with a stock
+#   ubuntu 14.04 libstdc++.
+# - we want to compile with g++-7 to get ahead of LLVM dropping support for
+#   gcc 4.8.
+RUN apt-get update && \
+    apt-get install -y software-properties-common && \
+    add-apt-repository ppa:ubuntu-toolchain-r/test -y && \
+    apt-get update && \
+    apt-get install -y --no-install-recommends g++-7 && \
+    rm -rf /var/lib/apt/lists/*
 
 # Copy and run the install scripts.
 COPY install/*.sh /install/
diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda9.0-cudnn7-ubuntu14.04 b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda9.0-cudnn7-ubuntu14.04
index eb6ca7c8f0fe27bd8bb9e5b11cf14e98ad67e530..4ce4214065fbddd4769a4a35941e3b752aa49c9c 100644
--- a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda9.0-cudnn7-ubuntu14.04
+++ b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda9.0-cudnn7-ubuntu14.04
@@ -1,7 +1,7 @@
 # To push a new version, run:
 # $ docker build -f Dockerfile.rbe.cuda9.0-cudnn7-ubuntu14.04 \
-#       --tag "gcr.io/asci-toolchain/nosla-cuda9.0-cudnn7-ubuntu14.04" .
-# $ docker push gcr.io/asci-toolchain/nosla-cuda9.0-cudnn7-ubuntu14.04
+#       --tag "gcr.io/tensorflow-testing/nosla-cuda9.0-cudnn7-ubuntu14.04" .
+# $ docker push gcr.io/tensorflow-testing/nosla-cuda9.0-cudnn7-ubuntu14.04
 #
 # TODO(klimek): Include clang in this image so we can also target clang
 # builds.
@@ -25,7 +25,7 @@ ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH}
 ENV NVIDIA_VISIBLE_DEVICES all
 ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
 ENV NVIDIA_REQUIRE_CUDA "cuda>=9.0"
-ENV NCCL_VERSION 2.2.13
+ENV TENSORRT_VERSION 5.0.2
 ENV CUDNN_VERSION 7.1.4.18
 
 # TODO(b/110903506): /usr/loca/cuda/lib64/stubs should not be needed in
@@ -44,18 +44,19 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         cuda-cudart-$CUDA_PKG_VERSION \
         cuda-libraries-$CUDA_PKG_VERSION \
         cuda-cublas-9-0=9.0.176.4-1 \
-        libnccl2=$NCCL_VERSION-1+cuda9.0 \
         cuda-libraries-dev-$CUDA_PKG_VERSION \
         cuda-nvml-dev-$CUDA_PKG_VERSION \
         cuda-minimal-build-$CUDA_PKG_VERSION \
         cuda-command-line-tools-$CUDA_PKG_VERSION \
         cuda-core-9-0=9.0.176.3-1 \
         cuda-cublas-dev-9-0=9.0.176.4-1 \
-        libnccl-dev=$NCCL_VERSION-1+cuda9.0 \
         libcudnn7-dev=$CUDNN_VERSION-1+cuda9.0 \
-        libcudnn7=$CUDNN_VERSION-1+cuda9.0 && \
+        libcudnn7=$CUDNN_VERSION-1+cuda9.0 \
+        nvinfer-runtime-trt-repo-ubuntu1604-$TENSORRT_VERSION-ga-cuda9.0 && \
+    apt-get update && apt-get install -y --no-install-recommends \
+        libnvinfer5=$TENSORRT_VERSION-1+cuda9.0 \
+        libnvinfer-dev=$TENSORRT_VERSION-1+cuda9.0 && \
     ln -s cuda-9.0 /usr/local/cuda && \
-    apt-mark hold libnccl2 && \
     apt-mark hold libcudnn7 libcudnn7-dev && \
     rm -rf /var/lib/apt/lists/*
 
@@ -66,11 +67,6 @@ RUN echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf && \
 # https://github.com/NVIDIA/nvidia-docker/issues/775
 RUN ln -s libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1
 
-# TODO(klimek): Once the TODO in tensorflow's configure.py to correctly find
-# libnccl is resolved, delete this block.
-RUN ln -s /usr/lib/x86_64-linux-gnu/libnccl.so /usr/lib/libnccl.so \
- && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so /usr/lib/libnccl.so.2
-
 # Install a newer version of libstdc++, as new clang versions do not work
 # with the stock ubuntu 14.04 libstdc++.
 RUN apt-get update && \
diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.gpu b/tensorflow/tools/ci_build/Dockerfile.rbe.gpu
index b65620583676f7ae2a4e849e33df05a18c4c9a24..c4912a65b65d61c6154be5083805d430d697f662 100644
--- a/tensorflow/tools/ci_build/Dockerfile.rbe.gpu
+++ b/tensorflow/tools/ci_build/Dockerfile.rbe.gpu
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
+FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu16.04
 
 LABEL maintainer="Nick Lopez <ngiraldo@google.com>"
 
diff --git a/tensorflow/tools/ci_build/Dockerfile.rocm b/tensorflow/tools/ci_build/Dockerfile.rocm
index aadaa8bac11cc80d1af11905d88116c8df677c2f..f190199643a14d42bb16113568d02ef2e6c85127 100644
--- a/tensorflow/tools/ci_build/Dockerfile.rocm
+++ b/tensorflow/tools/ci_build/Dockerfile.rocm
@@ -44,9 +44,12 @@ RUN apt-get update --allow-insecure-repositories && DEBIAN_FRONTEND=noninteracti
   libboost-filesystem-dev \
   rpm \
   libnuma-dev \
+  pciutils \
   virtualenv \
   python-pip \
   python3-pip \
+  libxml2 \
+  libxml2-dev \
   wget && \
   apt-get clean && \
   rm -rf /var/lib/apt/lists/*
@@ -54,15 +57,12 @@ RUN apt-get update --allow-insecure-repositories && DEBIAN_FRONTEND=noninteracti
 # Install rocm pkgs
 RUN apt-get update --allow-insecure-repositories && \
     DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
-    rocm-dev rocm-libs rocm-utils \
+    rocm-dev rocm-libs rocm-utils rocm-cmake \
     rocfft miopen-hip miopengemm rocblas hipblas rocrand \
     rocm-profiler cxlactivitylogger && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
-RUN cd ~ && git clone https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP.git
-RUN cd ~/HIP && mkdir -p build && cd build && cmake .. && make package -j && dpkg -i *.deb
-
 ENV HCC_HOME=$ROCM_PATH/hcc
 ENV HIP_PATH=$ROCM_PATH/hip
 ENV OPENCL_ROOT=$ROCM_PATH/opencl
@@ -71,12 +71,7 @@ ENV PATH="$ROCM_PATH/bin:${PATH}"
 ENV PATH="$OPENCL_ROOT/bin:${PATH}"
 
 # Add target file to help determine which device(s) to build for
-RUN echo -e "gfx803\ngfx900" >> /opt/rocm/bin/target.lst
-
-# Setup environment variables, and add those environment variables at the end of ~/.bashrc 
-ARG HCC_HOME=/opt/rocm/hcc
-ARG HIP_PATH=/opt/rocm/hip
-ARG PATH=$HCC_HOME/bin:$HIP_PATH/bin:$PATH
+RUN bash -c 'echo -e "gfx803\ngfx900\ngfx906" >> /opt/rocm/bin/target.lst'
 
 # Copy and run the install scripts.
 COPY install/*.sh /install/
diff --git a/tensorflow/tools/ci_build/builds/pip_new.sh b/tensorflow/tools/ci_build/builds/pip_new.sh
new file mode 100755
index 0000000000000000000000000000000000000000..ed687bf8463cd9f6039f6ff98bb276048457c8b9
--- /dev/null
+++ b/tensorflow/tools/ci_build/builds/pip_new.sh
@@ -0,0 +1,673 @@
+#!/usr/bin/env bash
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# Build the Python PIP installation package for TensorFlow and install
+# the package.
+#
+# Usage:
+#   pip_new.sh
+#
+# Required step(s):
+#   Run configure.py prior to running this script.
+#
+# Required environment variable(s):
+#   CONTAINER_TYPE:      (CPU | GPU)
+#   OS_TYPE:             (UBUNTU | MACOS)
+#   TF_PYTHON_VERSION:   (python2 | python2.7 | python3.5 | python3.7)
+#
+# Optional environment variables. If provided, overwrites any default values.
+#   TF_BUILD_FLAGS:      Bazel build flags.
+#                          e.g. TF_BUILD_FLAGS="--config=opt"
+#   TF_TEST_FLAGS:       Bazel test flags.
+#                          e.g. TF_TEST_FLAGS="--verbose_failures=true \
+#                               --build_tests_only --test_output=errors"
+#   TF_TEST_FILTER_TAGS: Filtering tags for bazel tests. More specifically,
+#                        input tags for `--test_filter_tags` flag.
+#                          e.g. TF_TEST_FILTER_TAGS="no_pip,-nomac,no_oss"
+#   TF_TEST_TARGETS:     Bazel test targets.
+#                          e.g. TF_TEST_TARGETS="//tensorflow/contrib/... \
+#                               //tensorflow/... \
+#                               //tensorflow/python/..."
+#   TF_PIP_TESTS:        PIP tests to run. If NOT specified, skips all tests.
+#                          e.g. TF_PIP_TESTS="test_pip_virtualenv_clean \
+#                               test_pip_virtualenv_clean \
+#                               test_pip_virtualenv_oss_serial"
+#   IS_NIGHTLY:          Nightly run flag.
+#                          e.g. IS_NIGHTLY=1  # nightly runs
+#                          e.g. IS_NIGHTLY=0  # non-nightly runs
+#   TF_PROJECT_NAME:     Name of the project. This string will be pass onto
+#                        the wheel file name. For nightly builds, it will be
+#                        overwritten to 'tf_nightly'. For gpu builds, '_gpu'
+#                        will be appended.
+#                          e.g. TF_PROJECT_NAME="tensorflow"
+#                          e.g. TF_PROJECT_NAME="tf_nightly_gpu"
+#   TF_PIP_TEST_ROOT:    Root directory for building and testing pip pkgs.
+#                          e.g. TF_PIP_TEST_ROOT="pip_test"
+#
+# To-be-deprecated variable(s).
+#   GIT_TAG_OVERRIDE:    Values for `--git_tag_override`. This flag gets passed
+#                        in as `--action_env` for bazel build and tests.
+#   TF_BUILD_INSTALL_EXTRA_PIP_PACKAGES
+#                        Additonal pip packages to be installed.
+#                        Caveat: pip version needs to be checked prior.
+
+# set bash options
+set -e
+set -x
+
+###########################################################################
+# General helper function(s)
+###########################################################################
+
+# Strip leading and trailing whitespaces
+str_strip () {
+  echo -e "$1" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//'
+}
+
+# Convert string to all lower case
+lowercase() {
+  if [[ -z "${1}" ]]; then
+    die "Nothing to convert to lowercase. No argument given."
+  fi
+  echo "${1}" | tr '[:upper:]' '[:lower:]'
+}
+
+check_global_vars() {
+  # Check container type
+  if ! [[ ${CONTAINER_TYPE} == "cpu" ]] && \
+     ! [[ ${CONTAINER_TYPE} == "rocm" ]] && \
+     ! [[ ${CONTAINER_TYPE} == "gpu" ]]; then
+    die "Error: Provided CONTAINER_TYPE \"${CONTAINER_TYPE}\" "\
+        "is not supported."
+  fi
+  # Check OS type
+  if ! [[ ${OS_TYPE} == "ubuntu" ]] && \
+     ! [[ ${OS_TYPE} == "macos" ]]; then
+    die"Error: Provided OS_TYPE \"${OS_TYPE}\" is not supported."
+  fi
+}
+
+add_test_filter_tag() {
+  EMPTY=""
+  while true; do
+    FILTER="${1:$EMPTY}"
+    if ! [[ $BAZEL_TEST_FILTER_TAGS == *"${FILTER}"* ]]; then
+      BAZEL_TEST_FILTER_TAGS="${FILTER},${BAZEL_TEST_FILTER_TAGS}"
+    fi
+    shift
+    if [[ -z "${1}" ]]; then
+      break
+    fi
+  done
+}
+
+remove_test_filter_tag() {
+  EMPTY=""
+  while true; do
+    FILTER="${1:$EMPTY}"
+    BAZEL_TEST_FILTER_TAGS="$(echo ${BAZEL_TEST_FILTER_TAGS} | sed -e 's/^'${FILTER}',//g' -e 's/,'${FILTER}'//g')"
+    shift
+    if [[ -z "${1}" ]]; then
+      break
+    fi
+  done
+}
+
+# Clean up bazel build & test flags with proper configuration.
+update_bazel_flags() {
+  # Add git tag override flag if necessary.
+  GIT_TAG_STR=" --action_env=GIT_TAG_OVERRIDE"
+  if [[ -z "${GIT_TAG_OVERRIDE}" ]] && \
+    ! [[ ${BAZEL_BUILD_FLAGS} = *${GIT_TAG_STR}* ]]; then
+    BAZEL_BUILD_FLAGS+="${GIT_TAG_STR}"
+  fi
+  # Clean up whitespaces
+  BAZEL_BUILD_FLAGS=$(str_strip "${BAZEL_BUILD_FLAGS}")
+  BAZEL_TEST_FLAGS=$(str_strip "${BAZEL_TEST_FLAGS}")
+  # Cleaned bazel flags
+  echo "Bazel build flags (cleaned):\n" "${BAZEL_BUILD_FLAGS}"
+  echo "Bazel test flags (cleaned):\n" "${BAZEL_TEST_FLAGS}"
+}
+
+update_test_filter_tags() {
+  # Add test filter tags
+  # This script is for validating built PIP packages. Add pip tags.
+  add_test_filter_tag -no_pip -nopip
+  # MacOS filter tags
+  if [[ ${OS_TYPE} == "macos" ]]; then
+    remove_test_filter_tag nomac no_mac
+    add_test_filter_tag -nomac -no_mac
+  fi
+  echo "Final test filter tags: ${BAZEL_TEST_FILTER_TAGS}"
+}
+
+# Check currently running python and pip version
+check_python_pip_version() {
+  # Check if only the major version of python is provided by the user.
+  MAJOR_VER_ONLY=0
+  if [[ ${#PYTHON_VER} -lt 9 ]]; then
+    # User only provided major version (e.g. 'python2' instead of 'python2.7')
+    MAJOR_VER_ONLY=1
+  fi
+
+  # Retrieve only the version number of the user requested python.
+  PYTHON_VER_REQUESTED=${PYTHON_VER:6:3}
+  echo "PYTHON_VER_REQUESTED: ${PYTHON_VER_REQUESTED}"
+
+  # Retrieve only the version numbers of the python & pip in use currently.
+  PYTHON_VER_IN_USE=$(python --version 2>&1)
+  PYTHON_VER_IN_USE=${PYTHON_VER_IN_USE:7:3}
+  PIP_VER_IN_USE=$(pip --version)
+  PIP_VER_IN_USE=${PIP_VER_IN_USE:${#PIP_VER_IN_USE}-4:3}
+
+  # If only major versions are applied, drop minor versions.
+  if [[ $MAJOR_VER_ONLY == 1 ]]; then
+    PYTHON_VER_IN_USE=${PYTHON_VER_IN_USE:0:1}
+    PIP_VER_IN_USE=${PIP_VER_IN_USE:0:1}
+  fi
+
+  # Check if all versions match.
+  echo -e "User requested python version: '${PYTHON_VER_REQUESTED}'\n" \
+    "Detected python version in use: '${PYTHON_VER_IN_USE}'\n"\
+    "Detected pip version in use: '${PIP_VER_IN_USE}'"
+  if ! [[ $PYTHON_VER_REQUESTED == $PYTHON_VER_IN_USE ]]; then
+    die "Error: Mismatch in python versions detected."
+  else:
+    echo "Python and PIP versions in use match the requested."
+  fi
+}
+
+###########################################################################
+# Setup: directories, local/global variables
+###########################################################################
+
+# Script directory and source necessary files.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "${SCRIPT_DIR}/builds_common.sh"
+
+# Required global variables
+# Checks on values for these vars are done in "Build TF PIP Package" section.
+CONTAINER_TYPE=$(lowercase "${CONTAINER_TYPE}")
+OS_TYPE=$(lowercase "${OS_TYPE}")
+PYTHON_VER=$(lowercase "${TF_PYTHON_VERSION}")
+
+# Python bin path
+if [[ -z "$PYTHON_BIN_PATH" ]]; then
+  die "Error: PYTHON_BIN_PATH was not provided. Did you run configure?"
+fi
+# Get python version for configuring pip later in installation.
+PYTHON_VER_CFG=$(${PYTHON_BIN_PATH} -V 2>&1 | awk '{print $NF}' | cut -d. -f-2)
+echo "PYTHON_BIN_PATH: ${PYTHON_BIN_PATH} (version: ${PYTHON_VER_CFG})"
+
+# Default values for optional global variables in case they are not user
+# defined.
+DEFAULT_BAZEL_BUILD_FLAGS='--config=opt'
+DEFAULT_BAZEL_TEST_FLAGS='--test_output=errors --verbose_failures=true'
+DEFAULT_BAZEL_TEST_FILTERS='-no_oss,-oss_serial'
+DEFAULT_BAZEL_TEST_TARGETS='//tensorflow/python/... -//tensorflow/core/... -//tensorflow/compiler/... '
+DEFAULT_PIP_TESTS="" # Do not run any tests by default
+DEFAULT_IS_NIGHTLY=0 # Not nightly by default
+DEFAULT_PROJECT_NAME="tensorflow"
+DEFAULT_PIP_TEST_ROOT="pip_test"
+
+# Take in optional global variables
+BAZEL_BUILD_FLAGS=${TF_BUILD_FLAGS:-$DEFAULT_BAZEL_BUILD_FLAGS}
+BAZEL_TEST_FLAGS=${TF_TEST_FLAGS:-$DEFAULT_BAZEL_TEST_FLAGS}
+BAZEL_TEST_TARGETS=${TF_TEST_TARGETS:-$DEFAULT_BAZEL_TEST_TARGETS}
+BAZEL_TEST_FILTER_TAGS=${TF_TEST_FILTER_TAGS:-$DEFAULT_BAZEL_TEST_FILTERS}
+PIP_TESTS=${TF_PIP_TESTS:-$DEFAULT_PIP_TESTS}
+IS_NIGHTLY=${IS_NIGHTLY:-$DEFAULT_IS_NIGHTLY}
+PROJECT_NAME=${TF_PROJECT_NAME:-$DEFAULT_PROJECT_NAME}
+PIP_TEST_ROOT=${TF_PIP_TEST_ROOT:-$DEFAULT_PIP_TEST_ROOT}
+
+# Local variables
+PIP_WHL_DIR="${KOKORO_ARTIFACTS_DIR}/tensorflow/${PIP_TEST_ROOT}/whl"
+mkdir -p "${PIP_WHL_DIR}"
+PIP_WHL_DIR=$(realpath "${PIP_WHL_DIR}") # Get absolute path
+WHL_PATH=""
+# Determine the major.minor versions of python being used (e.g., 2.7).
+# Useful for determining the directory of the local pip installation.
+PY_MAJOR_MINOR_VER=$(${PYTHON_BIN_PATH} -V 2>&1 | awk '{print $NF}' | cut -d. -f-2)
+if [[ -z "${PY_MAJOR_MINOR_VER}" ]]; then
+  die "ERROR: Unable to determine the major.minor version of Python."
+fi
+echo "Python binary path to be used in PIP install: ${PYTHON_BIN_PATH} "\
+"(Major.Minor version: ${PY_MAJOR_MINOR_VER})"
+PYTHON_BIN_PATH_INIT=${PYTHON_BIN_PATH}
+PIP_BIN_PATH="$(which pip${PYTHON_VER_CFG})"
+PIP_BIN_PATH_INIT=${PIP_BIN_PATH}
+
+# PIP packages
+INSTALL_EXTRA_PIP_PACKAGES=${TF_BUILD_INSTALL_EXTRA_PIP_PACKAGES}
+
+###########################################################################
+# Build TF PIP Package
+###########################################################################
+
+# First remove any already existing binaries for a clean start and test.
+if [[ -d ${PIP_TEST_ROOT} ]]; then
+  echo "Test root directory ${PIP_TEST_ROOT} already exists. Deleting it."
+  sudo rm -rf ${PIP_TEST_ROOT}
+fi
+
+# Check that global variables are properly set.
+check_global_vars
+
+# Check if in a virtualenv and exit if yes.
+IN_VENV=$(python -c 'import sys; print("1" if hasattr(sys, "real_prefix") else "0")')
+if [[ "$IN_VENV" == "1" ]]; then
+  echo "It appears that we are already in a virtualenv. Deactivating..."
+  deactivate || source deactivate || die "FAILED: Unable to deactivate from existing virtualenv."
+fi
+
+# Configure python. Obtain the path to python binary.
+source tools/python_bin_path.sh
+# Assume PYTHON_BIN_PATH is exported by the script above.
+if [[ -z "$PYTHON_BIN_PATH" ]]; then
+  die "PYTHON_BIN_PATH was not provided. Did you run configure?"
+fi
+
+# Bazel build the file.
+PIP_BUILD_TARGET="//tensorflow/tools/pip_package:build_pip_package"
+# Clean bazel cache.
+bazel clean
+# Clean up and update bazel flags
+update_bazel_flags
+# Build. This outputs the file `build_pip_package`.
+bazel build ${BAZEL_BUILD_FLAGS} ${PIP_BUILD_TARGET} || \
+  die "Error: Bazel build failed for target: '${PIP_BUILD_TARGET}'"
+
+###########################################################################
+# Test function(s)
+###########################################################################
+
+test_pip_virtualenv_clean() {
+  # Create a clean directory.
+  CLEAN_VENV_DIR="${PIP_TEST_ROOT}/venv_clean"
+
+  # activate virtual environment and install tensorflow with PIP.
+  create_activate_virtualenv --clean "${CLEAN_VENV_DIR}"
+  # Install TF with pip
+  install_tensorflow_pip "${WHL_PATH}"
+
+  # cd to a temporary directory to avoid picking up Python files in the source
+  # tree.
+  TMP_DIR=$(mktemp -d)
+  pushd "${TMP_DIR}"
+
+  # Run a quick check on tensorflow installation.
+  RET_VAL=$(python -c "import tensorflow as tf; t1=tf.constant([1,2,3,4]); t2=tf.constant([5,6,7,8]); print(tf.add(t1,t2))")
+
+  # Deactivate virtualenv.
+  deactivate || source deactivate || die "FAILED: Unable to deactivate from existing virtualenv."
+
+  # Return to original directory. Remove temp dirs.
+  popd
+  sudo rm -rf "${TMP_DIR}" "${CLEAN_VENV_DIR}"
+
+  # Check result to see if tensorflow is properly installed.
+  if [[ ${RET_VAL} == *'Tensor("Add:0", shape=(4,), dtype=int32)'* ]]; then
+    echo "PIP test on clean virtualenv PASSED."
+    return 0
+  else
+    echo "PIP test on clean virtualenv FAILED."
+    return 1
+  fi
+}
+
+test_pip_virtualenv_non_clean() {
+  # Create virtualenv directory for install test
+  VENV_DIR="${PIP_TEST_ROOT}/venv"
+
+  # Activate virtualenv
+  create_activate_virtualenv "${VENV_DIR}"
+  # Install TF with pip
+  install_tensorflow_pip "${WHL_PATH}"
+
+  # cd to a temporary directory to avoid picking up Python files in the source
+  # tree.
+  TMP_DIR=$(mktemp -d)
+  pushd "${TMP_DIR}"
+
+  # Run a quick check on tensorflow installation.
+  RET_VAL=$(python -c "import tensorflow as tf; t1=tf.constant([1,2,3,4]); t2=tf.constant([5,6,7,8]); print(tf.add(t1,t2))")
+
+  # Return to original directory. Remove temp dirs.
+  popd
+  sudo rm -rf "${TMP_DIR}"
+
+  # Check result to see if tensorflow is properly installed.
+  if ! [[ ${RET_VAL} == *'Tensor("Add:0", shape=(4,), dtype=int32)'* ]]; then
+    echo "PIP test on virtualenv (non-clean) FAILED"
+    return 1
+  fi
+
+  # Install extra pip packages, if specified.
+  for PACKAGE in ${INSTALL_EXTRA_PIP_PACKAGES}; do
+    echo "Installing extra pip package required by test-on-install: ${PACKAGE}"
+
+    ${PIP_BIN_PATH} install ${PACKAGE}
+    if [[ $? != 0 ]]; then
+      echo "${PIP_BIN_PATH} install ${PACKAGE} FAILED."
+      deactivate || source deactivate || die "FAILED: Unable to deactivate from existing virtualenv."
+      return 1
+    fi
+  done
+
+  # Run bazel test.
+  run_test_with_bazel
+  RESULT=$?
+
+  # Deactivate from virtualenv.
+  deactivate || source deactivate || die "FAILED: Unable to deactivate from existing virtualenv."
+  sudo rm -rf "${VENV_DIR}"
+
+  if [[ $RESULT -ne 0 ]]; then
+    echo "PIP test on virtualenv (non-clean) FAILED."
+    return 1
+  else
+    echo "PIP test on virtualenv (non-clean) PASSED."
+    return 0
+  fi
+}
+
+test_pip_virtualenv_oss_serial() {
+  # Create virtualenv directory
+  VENV_DIR="${PIP_TEST_ROOT}/venv"
+
+  create_activate_virtualenv "${VENV_DIR}"
+  run_test_with_bazel --oss_serial
+  RESULT=$?
+
+  # deactivate virtualenv
+  deactivate || source deactivate || die "FAILED: Unable to deactivate from existing virtualenv."
+
+  if [[ ${RESULT} -ne 0 ]]; then
+    echo "PIP test on virtualenv (oss-serial) FAILED."
+    return 1
+  else
+    echo "PIP test on virtualenv (oss-serial) PASSED."
+    return 0
+  fi
+}
+
+###########################################################################
+# Test helper function(s)
+###########################################################################
+
+create_activate_virtualenv() {
+  VIRTUALENV_FLAGS="--system-site-packages"
+  if [[ "${1}" == "--clean" ]]; then
+    shift
+  fi
+
+  VIRTUALENV_DIR="${1}"
+  if [[ -d "${VIRTUALENV_DIR}" ]]; then
+    if sudo rm -rf "${VIRTUALENV_DIR}"
+    then
+      echo "Removed existing virtualenv directory: ${VIRTUALENV_DIR}"
+    else
+      die "Failed to remove existing virtualenv directory: ${VIRTUALENV_DIR}"
+    fi
+  fi
+
+  if mkdir -p "${VIRTUALENV_DIR}"
+  then
+    echo "Created virtualenv directory: ${VIRTUALENV_DIR}"
+  else
+    die "FAILED to create virtualenv directory: ${VIRTUALENV_DIR}"
+  fi
+
+  # Use the virtualenv from the default python version (i.e., python-virtualenv)
+  # to create the virtualenv directory for testing. Use the -p flag to specify
+  # the python version inside the to-be-created virtualenv directory.
+  ${PYTHON_BIN_PATH_INIT} -m virtualenv -p ${PYTHON_BIN_PATH_INIT} ${VIRTUALENV_FLAGS} ${VIRTUALENV_DIR} || \
+    die "FAILED: Unable to create virtualenv"
+
+  source "${VIRTUALENV_DIR}/bin/activate" || \
+    die "FAILED: Unable to activate virtualenv in ${VIRTUALENV_DIR}"
+
+  # Update .tf_configure.bazelrc with venv python path for bazel test.
+  PYTHON_BIN_PATH="$(which python)"
+  yes "" | ./configure
+}
+
+install_tensorflow_pip() {
+  if [[ -z "${1}" ]]; then
+    die "Please provide a proper wheel file path."
+  fi
+
+  # Set path to pip.
+  PIP_BIN_PATH="$(which pip${PYTHON_VER_CFG})"
+
+  # Print python and pip bin paths
+  echo "PYTHON_BIN_PATH to be used to install the .whl: ${PYTHON_BIN_PATH}"
+  echo "PIP_BIN_PATH to be used to install the .whl: ${PIP_BIN_PATH}"
+
+  # Upgrade pip so it supports tags such as cp27mu, manylinux1 etc.
+  echo "Upgrade pip in virtualenv"
+
+  # NOTE: pip install --upgrade pip leads to a documented TLS issue for
+  # some versions in python
+  curl https://bootstrap.pypa.io/get-pip.py | ${PYTHON_BIN_PATH} || \
+    die "Error: pip install (get-pip.py) FAILED"
+
+  # Check that requested python version matches configured one.
+  check_python_pip_version
+
+  # Force upgrade of setuptools. This must happen before the pip install of the
+  # WHL_PATH, which pulls in absl-py, which uses install_requires notation
+  # introduced in setuptools >=20.5. The default version of setuptools is 5.5.1,
+  # which is too old for absl-py.
+  ${PIP_BIN_PATH} install --upgrade setuptools==39.1.0 || \
+    die "Error: setuptools install, upgrade FAILED"
+
+  # Force tensorflow reinstallation. Otherwise it may not get installed from
+  # last build if it had the same version number as previous build.
+  PIP_FLAGS="--upgrade --force-reinstall"
+  ${PIP_BIN_PATH} install -v ${PIP_FLAGS} ${WHL_PATH} || \
+    die "pip install (forcing to reinstall tensorflow) FAILED"
+  echo "Successfully installed pip package ${WHL_PATH}"
+
+  # Force downgrade of setuptools. This must happen after the pip install of the
+  # WHL_PATH, which ends up upgrading to the latest version of setuptools.
+  # Versions of setuptools >= 39.1.0 will cause tests to fail like this:
+  #   ImportError: cannot import name py31compat
+  ${PIP_BIN_PATH} install --upgrade setuptools==39.1.0 || \
+    die "Error: setuptools install, upgrade FAILED"
+}
+
+run_test_with_bazel() {
+  IS_OSS_SERIAL=0
+  if [[ "${1}" == "--oss_serial" ]]; then
+    IS_OSS_SERIAL=1
+  fi
+  TF_GPU_COUNT=${TF_GPU_COUNT:-4}
+
+  # PIP tests should have a "different" path. Different than the one we place
+  # virtualenv, because we are deleting and recreating it here.
+  PIP_TEST_PREFIX=bazel_pip
+  TEST_ROOT=$(pwd)/${PIP_TEST_PREFIX}
+  sudo rm -rf $TEST_ROOT
+  mkdir -p $TEST_ROOT
+  ln -s $(pwd)/tensorflow $TEST_ROOT/tensorflow
+
+  if [[ "${IS_OSS_SERIAL}" == "1" ]]; then
+    remove_test_filter_tag -no_oss
+    add_test_filter_tag oss_serial
+  else
+    add_test_filter_tag -oss_serial
+  fi
+
+  # Clean the bazel cache
+  bazel clean
+  # Clean up flags before running bazel commands
+  update_bazel_flags
+  # Clean up and update test filter tags
+  update_test_filter_tags
+
+  # Figure out how many concurrent tests we can run and do run the tests.
+  BAZEL_PARALLEL_TEST_FLAGS=""
+  if [[ $CONTAINER_TYPE == "gpu" ]]; then
+    # Number of test threads is the number of GPU cards available.
+    if [[ $OS_TYPE == "macos" ]]; then
+      BAZEL_PARALLEL_TEST_FLAGS="--local_test_jobs=1"
+    else
+      BAZEL_PARALLEL_TEST_FLAGS="--local_test_jobs=${TF_GPU_COUNT} \
+        --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute"
+    fi
+  else
+    # Number of test threads is the number of physical CPUs.
+    if [[ $OS_TYPE == "macos" ]]; then
+      BAZEL_PARALLEL_TEST_FLAGS="--local_test_jobs=$(sysctl -n hw.ncpu)"
+    else
+      BAZEL_PARALLEL_TEST_FLAGS="--local_test_jobs=$(grep -c ^processor /proc/cpuinfo)"
+    fi
+  fi
+
+  if [[ ${IS_OSS_SERIAL} == 1 ]]; then
+    BAZEL_PARALLEL_TEST_FLAGS="--local_test_jobs=1"
+  fi
+
+  # TODO(hyey): Update test target after validation.
+  # Run the test.
+  bazel test --build_tests_only ${BAZEL_TEST_FLAGS} ${BAZEL_PARALLEL_TEST_FLAGS} --test_tag_filters=${BAZEL_TEST_FILTER_TAGS} -k -- //$PIP_TEST_PREFIX/tensorflow/python/...
+
+  unlink ${TEST_ROOT}/tensorflow
+}
+
+run_all_tests() {
+  if [[ -z "${PIP_TESTS}" ]]; then
+    echo "No test was specified to run. Skipping all tests."
+    return 0
+  fi
+  FAIL_COUNTER=0
+  PASS_COUNTER=0
+  for TEST in ${PIP_TESTS[@]}; do
+
+    # Run tests.
+    case "${TEST}" in
+    "test_pip_virtualenv_clean")
+      test_pip_virtualenv_clean
+      ;;
+    "test_pip_virtualenv_non_clean")
+      test_pip_virtualenv_non_clean
+      ;;
+    "test_pip_virtualenv_oss_serial")
+      test_pip_virtualenv_oss_serial
+      ;;
+    *)
+      die "No matching test ${TEST} was found. Stopping test."
+      ;;
+    esac
+
+    # Check and update the results.
+    RETVAL=$?
+
+    # Update results counter
+    if [ ${RETVAL} -eq 0 ]; then
+      echo "Test (${TEST}) PASSED. (PASS COUNTER: ${PASS_COUNTER})"
+      PASS_COUNTER=$(($PASS_COUNTER+1))
+    else
+      echo "Test (${TEST}) FAILED. (FAIL COUNTER: ${FAIL_COUNTER})"
+      FAIL_COUNTER=$(($FAIL_COUNTER+1))
+    fi
+  done
+  printf "${PASS_COUNTER} PASSED | ${FAIL_COUNTER} FAILED"
+  if [[ "${FAIL_COUNTER}" == "0" ]]; then
+    printf "PIP tests ${COLOR_GREEN}PASSED${COLOR_NC}\n"
+    return 0
+  else:
+    printf "PIP tests ${COLOR_RED}FAILED${COLOR_NC}\n"
+    return 1
+  fi
+}
+
+###########################################################################
+# Build TF PIP Wheel file
+###########################################################################
+
+# Update the build flags for building whl.
+# Flags: GPU, OS, tf_nightly, project name
+GPU_FLAG=""
+NIGHTLY_FLAG=""
+
+# TF Nightly flag
+if [[ "$IS_NIGHTLY" == 1 ]]; then
+  # If 'nightly' is not specified in the project name already, then add.
+  if ! [[ $PROJECT_NAME == *"nightly"* ]]; then
+    echo "WARNING: IS_NIGHTLY=${IS_NIGHTLY} but requested project name \
+    (PROJECT_NAME=${PROJECT_NAME}) does not include 'nightly' string. \
+    Renaming it to 'tf_nightly'."
+    PROJECT_NAME="tf_nightly"
+  fi
+  NIGHTLY_FLAG="--nightly_flag"
+fi
+
+# CPU / GPU flag
+if [[ ${CONTAINER_TYPE} == "gpu" ]]; then
+  GPU_FLAG="--gpu"
+  if ! [[ $PROJECT_NAME == *"gpu"* ]]; then
+    echo "WARNING: GPU is specified but requested project name (PROJECT_NAME=${PROJECT_NAME}) \
+    does not include 'gpu'. Appending '_gpu' to the project name."
+    PROJECT_NAME="${PROJECT_NAME}_gpu"
+  fi
+fi
+
+./bazel-bin/tensorflow/tools/pip_package/build_pip_package ${PIP_WHL_DIR} ${GPU_FLAG} ${NIGHTLY_FLAG} "--project_name" ${PROJECT_NAME} || die "build_pip_package FAILED"
+
+PY_MAJOR_MINOR_VER=$(echo $PY_MAJOR_MINOR_VER | tr -d '.')
+if [[ $PY_MAJOR_MINOR_VER == "2" ]]; then
+  PY_MAJOR_MINOR_VER="27"
+fi
+
+# Set wheel path and verify that there is only one .whl file in the path.
+WHL_PATH=$(ls "${PIP_WHL_DIR}"/"${PROJECT_NAME}"-*"${PY_MAJOR_MINOR_VER}"*"${PY_MAJOR_MINOR_VER}"*.whl)
+if [[ $(echo "${WHL_PATH}" | wc -w) -ne 1 ]]; then
+  echo "ERROR: Failed to find exactly one built TensorFlow .whl file in "\
+  "directory: ${PIP_WHL_DIR}"
+fi
+
+WHL_DIR=$(dirname "${WHL_PATH}")
+WHL_BASE_NAME=$(basename "${WHL_PATH}")
+AUDITED_WHL_NAME="${WHL_DIR}"/$(echo "${WHL_BASE_NAME//linux/manylinux1}")
+
+# Print the size of the wheel file.
+echo "Size of the PIP wheel file built: $(ls -l ${WHL_PATH} | awk '{print $5}')"
+
+# Run tests (if any is specified).
+run_all_tests
+
+for WHL_PATH in $(ls ${PIP_WHL_DIR}/${PROJECT_NAME}*.whl); do
+  if [[ "${TF_NEED_CUDA}" -eq "1" ]]; then
+    # Copy and rename for gpu manylinux as we do not want auditwheel to package in libcudart.so
+    WHL_PATH=${AUDITED_WHL_NAME}
+    cp "${WHL_DIR}"/"${WHL_BASE_NAME}" "${WHL_PATH}"
+    echo "Copied manylinux1 wheel file at ${WHL_PATH}"
+  else
+    # Repair the wheels for cpu manylinux1
+    echo "auditwheel repairing ${WHL_PATH}"
+    auditwheel repair -w "${WHL_DIR}" "${WHL_PATH}"
+
+    if [[ -f ${AUDITED_WHL_NAME} ]]; then
+      WHL_PATH=${AUDITED_WHL_NAME}
+      echo "Repaired manylinux1 wheel file at: ${WHL_PATH}"
+    else
+      die "WARNING: Cannot find repaired wheel."
+    fi
+  fi
+done
+
+echo "EOF: Successfully ran pip_new.sh"
diff --git a/tensorflow/tools/ci_build/builds/run_pip_tests.sh b/tensorflow/tools/ci_build/builds/run_pip_tests.sh
index 7d5cf3f8439e223e0e8591333e727b2e58ca275c..a095633a22e8b24a4561ad3e13902a34424717ae 100755
--- a/tensorflow/tools/ci_build/builds/run_pip_tests.sh
+++ b/tensorflow/tools/ci_build/builds/run_pip_tests.sh
@@ -88,7 +88,8 @@ if [[ ${IS_GPU} == "1" ]]; then
   PIP_TEST_FILTER_TAG="-no_gpu,-no_pip_gpu,${PIP_TEST_FILTER_TAG}"
 fi
 if [[ ${IS_MAC} == "1" ]]; then
-  PIP_TEST_FILTER_TAG="-nomac,${PIP_TEST_FILTER_TAG}"
+  # TODO(b/122370901): Fix nomac, no_mac inconsistency.
+  PIP_TEST_FILTER_TAG="-nomac,-no_mac,${PIP_TEST_FILTER_TAG}"
 fi
 
 # Bazel flags we need for all tests:
diff --git a/tensorflow/tools/ci_build/builds/test_tutorials.sh b/tensorflow/tools/ci_build/builds/test_tutorials.sh
index db335f14ca4f88ade7a540ffab7ed9de67f1248e..a12827a2d3c9d4bf643d26ae854f544e614934dd 100755
--- a/tensorflow/tools/ci_build/builds/test_tutorials.sh
+++ b/tensorflow/tools/ci_build/builds/test_tutorials.sh
@@ -33,7 +33,7 @@
 #
 
 # List of all tutorial tests to run, separated by spaces
-TUT_TESTS="mnist_softmax mnist_with_summaries word2vec estimator_abalone"
+TUT_TESTS="mnist_with_summaries word2vec"
 
 if [[ -z "${TUT_TESTS_BLACKLIST}" ]]; then
   TF_BUILD_TUT_TEST_BLACKLIST=""
@@ -212,16 +212,6 @@ test_word2vec() {
 }
 
 
-# -----------------------------------------------------------
-# Estimator: abalone
-test_estimator_abalone() {
-  LOG_FILE=$1
-
-  run_in_directory "${TEST_DIR}" "${LOG_FILE}" \
-    "tensorflow/examples/tutorials/estimators/abalone.py"
-}
-
-
 # -----------------------------------------------------------
 # ptb_word_lm
 test_ptb_word_lm() {
diff --git a/tensorflow/tools/ci_build/builds/test_user_ops.sh b/tensorflow/tools/ci_build/builds/test_user_ops.sh
index 25ecee472524d5346252772b3058a5e824eef217..9da9c3b881ed14c4cebd3dd641c23d9cfd6f6708 100755
--- a/tensorflow/tools/ci_build/builds/test_user_ops.sh
+++ b/tensorflow/tools/ci_build/builds/test_user_ops.sh
@@ -239,8 +239,15 @@ function run_op() {
   fi
 }
 
+printf "\nTesting execution of user-defined op under graph mode:\n\n"
 run_op "$("${PYTHON_BIN_PATH}" -c "import tensorflow as tf; print(tf.Session('').run(tf.load_op_library('./${USER_OP_SO}').${USER_OP}(${OP_INPUT})))")"
-run_op "$("${PYTHON_BIN_PATH}" -c "import tensorflow as tf; tf.enable_eager_execution(); print(tf.load_op_library('./${USER_OP_SO}').${USER_OP}(${OP_INPUT}).numpy())")" " in eager mode"
+
+if [[ "${IS_GPU}" == "0" ]]; then
+  printf "\nTesting execution of user-defined op under eager mode:\n\n"
+  run_op "$("${PYTHON_BIN_PATH}" -c "import tensorflow as tf; tf.enable_eager_execution(); print(tf.load_op_library('./${USER_OP_SO}').${USER_OP}(${OP_INPUT}).numpy())")" " in eager mode"
+else
+  printf "\nSKIPPING the testing of execution of user-defined GPU kernel under eager mode. See b/122972785.\n\n"
+fi
 
 
 popd
diff --git a/tensorflow/tools/ci_build/ci_parameterized_build.sh b/tensorflow/tools/ci_build/ci_parameterized_build.sh
index 435ec7ca68fc28362b9b546f977b24e003e55d2f..62c1e014d5eedecbbb07ed349914e8428c58785b 100755
--- a/tensorflow/tools/ci_build/ci_parameterized_build.sh
+++ b/tensorflow/tools/ci_build/ci_parameterized_build.sh
@@ -398,7 +398,8 @@ if [[ "${TF_BUILD_APPEND_ARGUMENTS}" == *"--test_tag_filters="* ]]; then
         NEW_ITEM="${NEW_ITEM},-benchmark-test"
       fi
       if [[ ${IS_MAC} == "1" ]] && [[ ${NEW_ITEM} != *"nomac"* ]]; then
-        NEW_ITEM="${NEW_ITEM},-nomac"
+        # TODO(b/122370901): Fix nomac, no_mac inconsistency.
+        NEW_ITEM="${NEW_ITEM},-nomac,-no_mac"
       fi
       EXTRA_ARGS="${EXTRA_ARGS} ${NEW_ITEM}"
     else
@@ -408,11 +409,13 @@ if [[ "${TF_BUILD_APPEND_ARGUMENTS}" == *"--test_tag_filters="* ]]; then
 else
   EXTRA_ARGS="${EXTRA_ARGS} ${TF_BUILD_APPEND_ARGUMENTS} --test_tag_filters=-no_oss,-oss_serial,-benchmark-test"
   if [[ ${IS_MAC} == "1" ]]; then
-    EXTRA_ARGS="${EXTRA_ARGS},-nomac"
+    # TODO(b/122370901): Fix nomac, no_mac inconsistency.
+    EXTRA_ARGS="${EXTRA_ARGS},-nomac,-no_mac"
   fi
   EXTRA_ARGS="${EXTRA_ARGS} --build_tag_filters=-no_oss,-oss_serial,-benchmark-test"
   if [[ ${IS_MAC} == "1" ]]; then
-    EXTRA_ARGS="${EXTRA_ARGS},-nomac"
+    # TODO(b/122370901): Fix nomac, no_mac inconsistency.
+    EXTRA_ARGS="${EXTRA_ARGS},-nomac,-no_mac"
   fi
 fi
 
@@ -610,6 +613,13 @@ if [[ "${DO_DOCKER}" == "1" ]]; then
   fi
 fi
 
+# Set a disk usage trap.
+function debug_disk_usage {
+    echo "Finished script... disk usage report in ${TMP_DIR}"
+    du -k -d 2 ${TMP_DIR} | sort -n -r
+}
+# trap debug_disk_usage EXIT
+
 chmod +x ${TMP_SCRIPT}
 
 # Map TF_BUILD container types to containers we actually have.
@@ -645,6 +655,8 @@ echo ""
 echo "Parameterized build ends with ${RESULT} at: $(date) "\
 "(Elapsed time: $((END_TIME - START_TIME)) s)"
 
+# Dump disk usage
+debug_disk_usage
 
 # Clean up temporary directory if it exists
 if [[ ! -z "${TMP_DIR}" ]]; then
diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh
index 2c348a0e3390af05cffff5d9a73d0bd57caa92b4..afb282715bc9197eb2250cb823af2a0f36a0d233 100755
--- a/tensorflow/tools/ci_build/ci_sanity.sh
+++ b/tensorflow/tools/ci_build/ci_sanity.sh
@@ -540,9 +540,12 @@ do_check_file_name_test() {
   python file_name_test.py
 }
 
+do_libtensorflow_framework_not_depend_on_cuda_check() {
+  bazel build --action_env=TF_NEED_CUDA=1 --define framework_shared_object=true --config=cuda --nobuild_tests_only tensorflow/core/platform/default/build_config:libtensorflow_cuda_check_deps
+}
 # Supply all sanity step commands and descriptions
-SANITY_STEPS=("do_pylint PYTHON2" "do_pylint PYTHON3" "do_check_futures_test" "do_buildifier" "do_bazel_nobuild" "do_pip_package_licenses_check" "do_lib_package_licenses_check" "do_java_package_licenses_check" "do_pip_smoke_test" "do_check_load_py_test" "do_code_link_check" "do_check_file_name_test")
-SANITY_STEPS_DESC=("Python 2 pylint" "Python 3 pylint" "Check that python files have certain __future__ imports" "buildifier check" "bazel nobuild" "pip: license check for external dependencies" "C library: license check for external dependencies" "Java Native Library: license check for external dependencies" "Pip Smoke Test: Checking py_test dependencies exist in pip package" "Check load py_test: Check that BUILD files with py_test target properly load py_test" "Code Link Check: Check there are no broken links" "Check file names for cases")
+SANITY_STEPS=("do_pylint PYTHON2" "do_pylint PYTHON3" "do_check_futures_test" "do_buildifier" "do_bazel_nobuild" "do_pip_package_licenses_check" "do_lib_package_licenses_check" "do_java_package_licenses_check" "do_pip_smoke_test" "do_check_load_py_test" "do_code_link_check" "do_check_file_name_test" "do_libtensorflow_framework_not_depend_on_cuda_check")
+SANITY_STEPS_DESC=("Python 2 pylint" "Python 3 pylint" "Check that python files have certain __future__ imports" "buildifier check" "bazel nobuild" "pip: license check for external dependencies" "C library: license check for external dependencies" "Java Native Library: license check for external dependencies" "Pip Smoke Test: Checking py_test dependencies exist in pip package" "Check load py_test: Check that BUILD files with py_test target properly load py_test" "Code Link Check: Check there are no broken links" "Check file names for cases" "Check gpu libtensorflow_framework.so does not depend on cuda shared libraries.")
 
 INCREMENTAL_FLAG=""
 DEFAULT_BAZEL_CONFIGS=""
diff --git a/tensorflow/tools/ci_build/copy_binary.py b/tensorflow/tools/ci_build/copy_binary.py
index 148526492d25e9acebe036294175e2814b2ead12..856d64eb82f48525d4851b1f2167e44c849eaa27 100755
--- a/tensorflow/tools/ci_build/copy_binary.py
+++ b/tensorflow/tools/ci_build/copy_binary.py
@@ -32,8 +32,8 @@ import shutil
 import tempfile
 import zipfile
 
-TF_NIGHTLY_REGEX = (r"(.+)tf_nightly(|_gpu)-(\d\.[\d]{1,2}"
-                    "\.\d.dev[\d]{0,8})-(.+)\.whl")
+TF_NIGHTLY_REGEX = (r"(.+)(tf_nightly.*)-(\d\.[\d]{1,2}"
+                    r"\.\d.dev[\d]{0,8})-(.+)\.whl")
 BINARY_STRING_TEMPLATE = "%s-%s-%s.whl"
 
 
@@ -43,7 +43,7 @@ def check_existence(filename):
     raise RuntimeError("%s not found." % filename)
 
 
-def copy_binary(directory, origin_tag, new_tag, version, gpu=False):
+def copy_binary(directory, origin_tag, new_tag, version, package):
   """Rename and copy binaries for different python versions.
 
   Arguments:
@@ -51,14 +51,10 @@ def copy_binary(directory, origin_tag, new_tag, version, gpu=False):
     origin_tag: str of the old python version tag
     new_tag: str of the new tag
     version: the version of the package
-    gpu: bool if its a gpu build or not
+    package: str, name of the package
 
   """
   print("Rename and copy binaries with %s to %s." % (origin_tag, new_tag))
-  if gpu:
-    package = "tf_nightly_gpu"
-  else:
-    package = "tf_nightly"
   origin_binary = BINARY_STRING_TEMPLATE % (package, version, origin_tag)
   new_binary = BINARY_STRING_TEMPLATE % (package, version, new_tag)
   zip_ref = zipfile.ZipFile(os.path.join(directory, origin_binary), "r")
@@ -120,7 +116,7 @@ def main():
   check_existence(args.filename)
   regex_groups = re.search(TF_NIGHTLY_REGEX, args.filename)
   directory = regex_groups.group(1)
-  gpu = regex_groups.group(2)
+  package = regex_groups.group(2)
   version = regex_groups.group(3)
   origin_tag = regex_groups.group(4)
   old_py_ver = re.search(r"(cp\d\d)", origin_tag).group(1)
@@ -129,7 +125,7 @@ def main():
   new_tag = origin_tag.replace(old_py_ver, "cp" + args.new_py_ver)
 
   # Copy the binary with the info we have
-  copy_binary(directory, origin_tag, new_tag, version, gpu)
+  copy_binary(directory, origin_tag, new_tag, version, package)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/tools/ci_build/install/.bazelrc b/tensorflow/tools/ci_build/install/.bazelrc
index 2060babd4a450a0fc318f8b9ee5cb8536d57189c..4662e2e60a16e8dd675ee2131a94fb313b9d5b8b 100644
--- a/tensorflow/tools/ci_build/install/.bazelrc
+++ b/tensorflow/tools/ci_build/install/.bazelrc
@@ -5,7 +5,7 @@ startup --batch
 
 # Similarly, we need to workaround sandboxing issues:
 #   https://github.com/bazelbuild/bazel/issues/418
-build  --verbose_failures --spawn_strategy=standalone --genrule_strategy=standalone
+build  --verbose_failures --spawn_strategy=standalone --strategy=Genrule=standalone
 test --spawn_strategy=standalone
 
 # Force bazel output to use colors (good for jenkins) and print useful errors.
diff --git a/tensorflow/tools/ci_build/install/install_bazel.sh b/tensorflow/tools/ci_build/install/install_bazel.sh
index e284401b8aa469ebcbed856cd09dd597be242d7a..f45ac3eab37bdb2a51c44f68d51fbdb42b5f82d1 100755
--- a/tensorflow/tools/ci_build/install/install_bazel.sh
+++ b/tensorflow/tools/ci_build/install/install_bazel.sh
@@ -15,7 +15,7 @@
 # ==============================================================================
 
 # Select bazel version.
-BAZEL_VERSION="0.15.0"
+BAZEL_VERSION="0.20.0"
 
 set +e
 local_bazel_ver=$(bazel version 2>&1 | grep -i label | awk '{print $3}')
diff --git a/tensorflow/tools/ci_build/install/install_bazel_from_source.sh b/tensorflow/tools/ci_build/install/install_bazel_from_source.sh
index 87be81577d0efb395a12afc85109f10ad4178c27..9501a6d94b026774753bbd162fddec3c20753740 100755
--- a/tensorflow/tools/ci_build/install/install_bazel_from_source.sh
+++ b/tensorflow/tools/ci_build/install/install_bazel_from_source.sh
@@ -18,7 +18,7 @@
 # It will compile bazel from source and install it in /usr/local/bin
 
 # Select bazel version.
-BAZEL_VERSION="0.15.0"
+BAZEL_VERSION="0.20.0"
 
 set +e
 local_bazel_ver=$(bazel version 2>&1 | grep -i label | awk '{print $3}')
diff --git a/tensorflow/tools/ci_build/install/install_deb_packages.sh b/tensorflow/tools/ci_build/install/install_deb_packages.sh
index 989f2a92eb6e5940b0557452080c3b0f3cf706ae..bd810016d2a05071e73cf3a8c72600b315e99679 100755
--- a/tensorflow/tools/ci_build/install/install_deb_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_deb_packages.sh
@@ -68,12 +68,6 @@ apt-get install -y --no-install-recommends \
     zip \
     zlib1g-dev
 
-apt-get update && \
-  apt-get install nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 && \
-  apt-get update && \
-  apt-get install libnvinfer4=4.1.2-1+cuda9.0 && \
-  apt-get install libnvinfer-dev=4.1.2-1+cuda9.0
-
 # populate the database
 updatedb
 
diff --git a/tensorflow/tools/ci_build/install/install_pi_python3_toolchain.sh b/tensorflow/tools/ci_build/install/install_pi_python3_toolchain.sh
index 9d8e3df3b5c3e192b987718318465c14184d4045..b6734e55226842fc54667fbdf3a349c321e45edd 100755
--- a/tensorflow/tools/ci_build/install/install_pi_python3_toolchain.sh
+++ b/tensorflow/tools/ci_build/install/install_pi_python3_toolchain.sh
@@ -25,5 +25,4 @@ apt-get install -y libpython3-all-dev:armhf
 echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | sudo tee /etc/apt/sources.list.d/bazel.list
 curl https://bazel.build/bazel-release.pub.gpg | sudo apt-key add -
 apt-get update
-rm -rf /usr/local/bin/bazel
-apt-get install -y bazel python3 python3-numpy python3-dev python3-pip
+apt-get install -y python3 python3-numpy python3-dev python3-pip
diff --git a/tensorflow/tools/ci_build/install/install_pi_toolchain.sh b/tensorflow/tools/ci_build/install/install_pi_toolchain.sh
index 03c43cc83805fbde8576b9d170c1d3d6c3993625..0238cc5895ff3e848f974ee464f77450f92a3f22 100755
--- a/tensorflow/tools/ci_build/install/install_pi_toolchain.sh
+++ b/tensorflow/tools/ci_build/install/install_pi_toolchain.sh
@@ -25,5 +25,4 @@ apt-get install -y libpython-all-dev:armhf
 echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | sudo tee /etc/apt/sources.list.d/bazel.list
 curl https://bazel.build/bazel-release.pub.gpg | sudo apt-key add -
 apt-get update
-rm -rf /usr/local/bin/bazel
-apt-get install -y bazel python python-numpy python-dev python-pip
+apt-get install -y python python-numpy python-dev python-pip
diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index 3470488cc55d3ec54af3546d33f4d1f8fc5e94d6..131950dc0d3e2499ffe77ea424db83c68e713130 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -18,10 +18,10 @@ set -e
 
 # We don't apt-get install so that we can install a newer version of pip.
 # Only needed for Ubuntu 14.04 and 16.04; not needed for 18.04 and Debian 8,9?
-# Run easy_install before easy_install3, so that the default pip points to pip2,
+# Run easy_install after easy_install3, so that the default pip points to pip2,
 # to match the default python version of 2.7.
-easy_install3 -U pip==9.0.3
-easy_install -U pip==9.0.3
+easy_install3 -U pip==18.1
+easy_install -U pip==18.1
 
 # Install pip packages from whl files to avoid the time-consuming process of
 # building from source.
@@ -40,8 +40,8 @@ pip2 install virtualenv
 pip3 install virtualenv
 
 # Install six.
-pip2 install --upgrade six==1.10.0
-pip3 install --upgrade six==1.10.0
+pip2 install --upgrade six==1.12.0
+pip3 install --upgrade six==1.12.0
 
 # Install absl-py.
 pip2 install --upgrade absl-py
@@ -60,8 +60,8 @@ pip2 install --upgrade markdown==2.6.8
 pip3 install --upgrade markdown==2.6.8
 
 # Install protobuf.
-pip2 install --upgrade protobuf==3.6.0
-pip3 install --upgrade protobuf==3.6.0
+pip2 install --upgrade protobuf==3.6.1
+pip3 install --upgrade protobuf==3.6.1
 
 # Remove obsolete version of six, which can sometimes confuse virtualenv.
 rm -rf /usr/lib/python3/dist-packages/six*
@@ -97,9 +97,9 @@ pip3 install py-cpuinfo
 pip2 install pylint==1.6.4
 pip3 install pylint==1.6.4
 
-# pep8 tests require the following:
-pip2 install pep8
-pip3 install pep8
+# pycodestyle tests require the following:
+pip2 install pycodestyle
+pip3 install pycodestyle
 
 # tf.mock require the following for python2:
 pip2 install mock
@@ -128,5 +128,5 @@ pip2 install --upgrade h5py==2.8.0
 pip3 install --upgrade h5py==2.8.0
 
 # Estimator
-pip2 install tensorflow_estimator --no-deps
-pip3 install tensorflow_estimator --no-deps
+pip2 install tf-estimator-nightly==1.12.0.dev20181203 --no-deps
+pip3 install tf-estimator-nightly==1.12.0.dev20181203 --no-deps
diff --git a/tensorflow/tools/ci_build/install/install_proto3.sh b/tensorflow/tools/ci_build/install/install_proto3.sh
index 821d50baff325106fceca368d46042401d13c336..3cb100856706558cacf6d2b601e2b34fd194082e 100755
--- a/tensorflow/tools/ci_build/install/install_proto3.sh
+++ b/tensorflow/tools/ci_build/install/install_proto3.sh
@@ -17,7 +17,7 @@
 # Install protobuf3.
 
 # Select protobuf version.
-PROTOBUF_VERSION="3.6.0"
+PROTOBUF_VERSION="3.6.1"
 protobuf_ver_flat=$(echo $PROTOBUF_VERSION | sed 's/\.//g' | sed 's/^0*//g')
 local_protobuf_ver=$(protoc --version)
 local_protobuf_ver_flat=$(echo $local_protobuf_ver | sed 's/\.//g' | sed 's/^0*//g')
diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
index 62e04df717316ffc8cf211a6887730be115623be..a58f49af2867812961675b7db61415b94febef39 100755
--- a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
@@ -52,7 +52,7 @@ pip3.5 install --upgrade absl-py
 pip3.5 install --upgrade six==1.10.0
 
 # Install protobuf.
-pip3.5 install --upgrade protobuf==3.6.0
+pip3.5 install --upgrade protobuf==3.6.1
 
 # Remove obsolete version of six, which can sometimes confuse virtualenv.
 rm -rf /usr/lib/python3/dist-packages/six*
@@ -90,4 +90,7 @@ pip3.5 install keras_applications==1.0.6
 pip3.5 install keras_preprocessing==1.0.5
 pip3.5 install --upgrade h5py==2.8.0
 
+# Estimator
+pip3.5 install tf-estimator-nightly==1.12.0.dev20181203 --no-deps
+
 # LINT.ThenChange(//tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh)
diff --git a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
index 48d556b1dd8e3e17b763b9c71e78e1d551554703..b1c2a0ab00a344df2dd26c74440bdb4a95ac410a 100755
--- a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
@@ -64,7 +64,7 @@ pip3 install --upgrade absl-py
 pip3 install --upgrade six==1.10.0
 
 # Install protobuf.
-pip3 install --upgrade protobuf==3.6.0
+pip3 install --upgrade protobuf==3.6.1
 
 # Remove obsolete version of six, which can sometimes confuse virtualenv.
 rm -rf /usr/lib/python3/dist-packages/six*
@@ -107,4 +107,7 @@ pip3 install --upgrade h5py==2.8.0
 pip3 install keras_applications==1.0.6
 pip3 install keras_preprocessing==1.0.5
 
+# Estimator
+pip3 install tf-estimator-nightly==1.12.0.dev20181203 --no-deps
+
 # LINT.ThenChange(//tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh)
diff --git a/tensorflow/tools/ci_build/linux/cpu/run_mkl.sh b/tensorflow/tools/ci_build/linux/cpu/run_mkl.sh
index 7be5f454ecd6344cc1b0b79789c2b18acefc448d..a8b73cbe0cfe7fda70483a8b10fee2a7648b138a 100755
--- a/tensorflow/tools/ci_build/linux/cpu/run_mkl.sh
+++ b/tensorflow/tools/ci_build/linux/cpu/run_mkl.sh
@@ -36,4 +36,4 @@ yes "" | $PYTHON_BIN_PATH configure.py
 bazel test --test_tag_filters=-no_oss,-oss_serial,-gpu,-benchmark-test --test_lang_filters=cc,py -k \
     --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 --build_tests_only \
     --config=mkl --test_env=KMP_BLOCKTIME=0 --config=opt --test_output=errors -- \
-    //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/...
+    //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/... -//tensorflow/lite/...
diff --git a/tensorflow/tools/ci_build/linux/rocm/run_py3_core.sh b/tensorflow/tools/ci_build/linux/rocm/run_py3_core.sh
index 1d0b838c1b5c101e202fcb3cb64c3ce4a9bd34d6..1944183c0e8124c2ed4e572ac4a63f1f82f5c380 100755
--- a/tensorflow/tools/ci_build/linux/rocm/run_py3_core.sh
+++ b/tensorflow/tools/ci_build/linux/rocm/run_py3_core.sh
@@ -19,9 +19,10 @@ set -e
 set -x
 
 N_JOBS=$(grep -c ^processor /proc/cpuinfo)
+N_GPUS=$(lspci|grep 'VGA'|grep 'AMD/ATI'|wc -l)
 
 echo ""
-echo "Bazel will use ${N_JOBS} concurrent job(s)."
+echo "Bazel will use ${N_JOBS} concurrent build job(s) and ${N_GPUS} concurrent test job(s)."
 echo ""
 
 # Run configure.
@@ -29,11 +30,14 @@ export PYTHON_BIN_PATH=`which python3`
 export CC_OPT_FLAGS='-mavx'
 
 export TF_NEED_ROCM=1
+export TF_GPU_COUNT=${N_GPUS}
 
 yes "" | $PYTHON_BIN_PATH configure.py
 
 # Run bazel test command. Double test timeouts to avoid flakes.
-bazel test --config=rocm --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-benchmark-test -k \
-    --test_lang_filters=py --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 \
-    --build_tests_only --test_output=errors --local_test_jobs=1 --config=opt \
+bazel test --config=rocm --test_tag_filters=-no_oss,-oss_serial,-no_gpu,-no_rocm,-benchmark-test -k \
+    --test_lang_filters=py --jobs=${N_JOBS} --test_timeout 600,900,2400,7200 \
+    --build_tests_only --test_output=errors --local_test_jobs=${TF_GPU_COUNT} --config=opt \
+    --test_sharding_strategy=disabled \
+    --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute -- \
     //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/...
diff --git a/tensorflow/tools/ci_build/osx/cpu/run_contrib.sh b/tensorflow/tools/ci_build/osx/cpu/run_contrib.sh
index 3efd994d783d8f47b3471cc5ce177293b1e017cc..1184d4acec61f36cc630df313d403d33d73e1e7a 100755
--- a/tensorflow/tools/ci_build/osx/cpu/run_contrib.sh
+++ b/tensorflow/tools/ci_build/osx/cpu/run_contrib.sh
@@ -31,6 +31,7 @@ export CC_OPT_FLAGS='-mavx'
 export PYTHON_BIN_PATH=$(which python2)
 yes "" | $PYTHON_BIN_PATH configure.py
 which bazel
+# TODO(b/122370901): Fix nomac, no_mac inconsistency.
 bazel test --test_tag_filters=-no_oss,-gpu,-benchmark-test,-nomac,-no_mac \
     --test_timeout 300,450,1200,3600 \
     --test_size_filters=small,medium --config=opt \
diff --git a/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh b/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh
index adee0d3171fe13261f177a6f8a3b55aeb5789cc5..d39340b1d83dde254a00fea1ff6090e1df2d10ae 100755
--- a/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh
+++ b/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh
@@ -32,6 +32,7 @@ export CC_OPT_FLAGS='-mavx'
 export PYTHON_BIN_PATH=$(which python2)
 yes "" | $PYTHON_BIN_PATH configure.py
 which bazel
+# TODO(b/122370901): Fix nomac, no_mac inconsistency.
 bazel test --test_tag_filters=-no_oss,-gpu,-benchmark-test,-nomac,-no_mac \
     --test_timeout 300,450,1200,3600 --config=opt \
     --announce_rc \
diff --git a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
index 864278c6477b4b1e7e9bc3836e3e3d102d086530..987f0769b2d6da4631b6f408af4dbf62d9099f76 100755
--- a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
+++ b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
@@ -107,6 +107,7 @@ bazel build -c opt ${PI_COPTS} \
   --copt=-funsafe-math-optimizations --copt=-ftree-vectorize \
   --copt=-fomit-frame-pointer --cpu=armeabi \
   --crosstool_top=@local_config_arm_compiler//:toolchain \
+  --define tensorflow_mkldnn_contraction_kernel=0 \
   --verbose_failures \
   //tensorflow:libtensorflow.so \
   //tensorflow:libtensorflow_framework.so \
diff --git a/tensorflow/tools/ci_build/update_version.py b/tensorflow/tools/ci_build/update_version.py
index 4373d464b6a9f8cf6d498652d7afeed507a666ba..117627c458c3326735deb667b484c642b25a2ba9 100755
--- a/tensorflow/tools/ci_build/update_version.py
+++ b/tensorflow/tools/ci_build/update_version.py
@@ -84,19 +84,26 @@ class Version(object):
       identifier_string: extension string eg. (-rc0)
       version_type: version parameter ((REGULAR|NIGHTLY)_VERSION)
     """
-    self.string = "%s.%s.%s%s" % (major,
-                                  minor,
-                                  patch,
-                                  identifier_string)
     self.major = major
     self.minor = minor
     self.patch = patch
     self.identifier_string = identifier_string
     self.version_type = version_type
+    self._update_string()
+
+  def _update_string(self):
+    self.string = "%s.%s.%s%s" % (self.major,
+                                  self.minor,
+                                  self.patch,
+                                  self.identifier_string)
 
   def __str__(self):
     return self.string
 
+  def set_identifier_string(self, identifier_string):
+    self.identifier_string = identifier_string
+    self._update_string()
+
   @property
   def pep_440_str(self):
     if self.version_type == REGULAR_VERSION:
@@ -283,15 +290,14 @@ def main():
   """
 
   parser = argparse.ArgumentParser(description="Cherry picking automation.")
-  group = parser.add_mutually_exclusive_group(required=True)
 
   # Arg information
-  group.add_argument("--version",
-                     help="<new_major_ver>.<new_minor_ver>.<new_patch_ver>",
-                     default="")
-  group.add_argument("--nightly",
-                     help="disable the service provisioning step",
-                     action="store_true")
+  parser.add_argument("--version",
+                      help="<new_major_ver>.<new_minor_ver>.<new_patch_ver>",
+                      default="")
+  parser.add_argument("--nightly",
+                      help="disable the service provisioning step",
+                      action="store_true")
 
   args = parser.parse_args()
 
@@ -299,13 +305,17 @@ def main():
   old_version = get_current_semver_version()
 
   if args.nightly:
-    # Dev minor version is one ahead of official.
-    nightly_minor_ver = int(old_version.minor) + 1
-    new_version = Version(old_version.major,
-                          str(nightly_minor_ver),
-                          old_version.patch,
-                          "-dev" + time.strftime("%Y%m%d"),
-                          NIGHTLY_VERSION)
+    if args.version:
+      new_version = Version.parse_from_string(args.version, NIGHTLY_VERSION)
+      new_version.set_identifier_string("-dev" + time.strftime("%Y%m%d"))
+    else:
+      # Dev minor version is one ahead of official.
+      nightly_minor_ver = int(old_version.minor) + 1
+      new_version = Version(old_version.major,
+                            str(nightly_minor_ver),
+                            old_version.patch,
+                            "-dev" + time.strftime("%Y%m%d"),
+                            NIGHTLY_VERSION)
   else:
     new_version = Version.parse_from_string(args.version, REGULAR_VERSION)
 
diff --git a/tensorflow/tools/ci_build/windows/bazel/common_env.sh b/tensorflow/tools/ci_build/windows/bazel/common_env.sh
index c18f0d6e69d98ac50f0aa850f1c78ceaab4c36e2..9c6825f27164bdc694fbd694a90792f6a58f852c 100644
--- a/tensorflow/tools/ci_build/windows/bazel/common_env.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/common_env.sh
@@ -30,6 +30,9 @@ export TMPDIR=${TMPDIR:-"C:/tmp"}
 export TMPDIR=$(cygpath -m "$TMPDIR")
 mkdir -p "$TMPDIR"
 
+# Add timestamps before each command.
+export PS4='+ $(date) + '
+
 # Set bash path
 export BAZEL_SH=${BAZEL_SH:-"C:/tools/msys64/usr/bin/bash"}
 
@@ -52,9 +55,9 @@ export PATH="/c/Program Files/Git/cmd:$PATH"
 export PATH="/c/${PYTHON_BASE_PATH}/Scripts:$PATH"
 
 # Setting default values to CUDA related environment variables
-export TF_CUDA_VERSION=${TF_CUDA_VERSION:-9.0}
+export TF_CUDA_VERSION=${TF_CUDA_VERSION:-10.0}
 export TF_CUDNN_VERSION=${TF_CUDNN_VERSION:-7}
-export TF_CUDA_COMPUTE_CAPABILITIES=${TF_CUDA_COMPUTE_CAPABILITIES:-3.7}
+export TF_CUDA_COMPUTE_CAPABILITIES=${TF_CUDA_COMPUTE_CAPABILITIES:-6.0}
 export CUDA_TOOLKIT_PATH=${CUDA_TOOLKIT_PATH:-"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${TF_CUDA_VERSION}"}
 export CUDNN_INSTALL_PATH=${CUDNN_INSTALL_PATH:-"C:/tools/cuda"}
 
diff --git a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
index 62e1eaa366865616c063d9f9785b863033a32706..a938928baab9e010a5f2d7b8c209146fdc424932 100644
--- a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
@@ -60,6 +60,7 @@ RELEASE_BUILD=0
 TEST_TARGET="//${PY_TEST_DIR}/tensorflow/python/..."
 PROJECT_NAME=""
 EXTRA_BUILD_FLAGS=""
+EXTRA_TEST_FLAGS=""
 
 # --skip_test            Skip running tests
 # --enable_remote_cache  Add options to enable remote cache for build and test
@@ -67,7 +68,6 @@ EXTRA_BUILD_FLAGS=""
 #                        ensure performance
 # --test_core_only       Use tensorflow/python/... as test target
 # --test_contrib_only    Use tensorflow/contrib/... as test target
-#for ARG in "$@"; do
 while [[ $# -gt 0 ]]; do
   case "$1" in
     --tf_nightly) TF_NIGHTLY=1 ;;
@@ -90,6 +90,13 @@ while [[ $# -gt 0 ]]; do
       fi
       PROJECT_NAME="$1"
       ;;
+    --extra_test_flags)
+      shift
+      if [[ -z "$1" ]]; then
+        break
+      fi
+      EXTRA_TEST_FLAGS="$1"
+      ;;
     *)
   esac
   shift
@@ -105,7 +112,11 @@ else
 fi
 
 if [[ "$TF_NIGHTLY" == 1 ]]; then
-  python tensorflow/tools/ci_build/update_version.py --nightly
+  if [[ ${PROJECT_NAME} == *"2.0_preview"* ]]; then
+    python tensorflow/tools/ci_build/update_version.py --version=2.0.0 --nightly
+  else
+    python tensorflow/tools/ci_build/update_version.py --nightly
+  fi
   if [ -z ${PROJECT_NAME} ]; then
     EXTRA_PIP_FLAGS="--nightly_flag"
   else
@@ -122,6 +133,10 @@ fi
 
 run_configure_for_cpu_build
 
+bazel build --announce_rc --config=opt ${EXTRA_BUILD_FLAGS}  \
+  --build_tag_filters=-no_pip,-no_windows,-no_oss,-gpu \
+  tensorflow/lite:framework tensorflow/lite/examples/minimal:minimal || exit $?
+
 bazel build --announce_rc --config=opt ${EXTRA_BUILD_FLAGS} \
   tensorflow/tools/pip_package:build_pip_package \
   --incompatible_remove_native_http_archive=false || exit $?
@@ -133,7 +148,7 @@ fi
 # Create a python test directory to avoid package name conflict
 create_python_test_dir "${PY_TEST_DIR}"
 
-./bazel-bin/tensorflow/tools/pip_package/build_pip_package "$PWD/${PY_TEST_DIR}" "${EXTRA_PIP_FLAGS}"
+./bazel-bin/tensorflow/tools/pip_package/build_pip_package "$PWD/${PY_TEST_DIR}" ${EXTRA_PIP_FLAGS}
 
 if [[ "$TF_NIGHTLY" == 1 ]]; then
   exit 0
@@ -149,6 +164,7 @@ N_JOBS="${NUMBER_OF_PROCESSORS}"
 # Define no_tensorflow_py_deps=true so that every py_test has no deps anymore,
 # which will result testing system installed tensorflow
 bazel test --announce_rc --config=opt -k --test_output=errors \
+  ${EXTRA_TEST_FLAGS} \
   --define=no_tensorflow_py_deps=true --test_lang_filters=py \
   --test_tag_filters=-no_pip,-no_windows,-no_oss,-gpu \
   --build_tag_filters=-no_pip,-no_windows,-no_oss,-gpu --build_tests_only \
diff --git a/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
index acafd9ebce3afa634c1a1aafd4d9ac5c57935d80..efdd5f13c87e187c84e6e1d11770ebdb91e9df41 100644
--- a/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
@@ -68,7 +68,7 @@ EXTRA_BUILD_FLAGS=""
 # --test_core_only       Use tensorflow/python/... as test target
 # --test_contrib_only    Use tensorflow/contrib/... as test target
 while [[ $# -gt 0 ]]; do
-  case "$ARG" in
+  case "$1" in
     --tf_nightly) TF_NIGHTLY=1 ;;
     --skip_test) SKIP_TEST=1 ;;
     --enable_remote_cache) set_remote_cache_options ;;
@@ -104,11 +104,15 @@ else
 fi
 
 if [[ "$TF_NIGHTLY" == 1 ]]; then
-  python tensorflow/tools/ci_build/update_version.py --nightly
+  if [[ ${PROJECT_NAME} == *"2.0_preview"* ]]; then
+    python tensorflow/tools/ci_build/update_version.py --version=2.0.0 --nightly
+  else
+    python tensorflow/tools/ci_build/update_version.py --nightly
+  fi
   if [ -z ${PROJECT_NAME} ]; then
     EXTRA_PIP_FLAGS="--nightly_flag"
   else
-    EXTRA_PIP_FLAGS="--project_name=${PROJECT_NAME} --nightly_flag"
+    EXTRA_PIP_FLAGS="--project_name ${PROJECT_NAME} --nightly_flag"
   fi
 fi
 
@@ -136,7 +140,7 @@ fi
 create_python_test_dir "${PY_TEST_DIR}"
 
 ./bazel-bin/tensorflow/tools/pip_package/build_pip_package "$PWD/${PY_TEST_DIR}" \
-  --gpu "${EXTRA_PIP_FLAGS}"
+  --gpu ${EXTRA_PIP_FLAGS}
 
 if [[ "$TF_NIGHTLY" == 1 ]]; then
   exit 0
diff --git a/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh b/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh
index 7dfee8f371b8c4795fe748d1fd02ee8d884f18f9..9c05db974b4e30c2997a9c0d11f792ae52587eb5 100644
--- a/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh
+++ b/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh
@@ -41,7 +41,7 @@ run_configure_for_gpu_build
 # build_libtensorflow_tarball in ../builds/libtensorflow.sh
 # cannot be used on Windows since it relies on pkg_tar rules.
 # So we do something special here
-bazel build -c opt --copt=/arch:AVX --announce_rc \
+bazel --output_user_root=${TMPDIR} build -c opt --copt=/arch:AVX --announce_rc \
   tensorflow:libtensorflow.so \
   tensorflow/tools/lib_package:clicenses_generate \
   tensorflow/java:libtensorflow_jni.so \
diff --git a/tensorflow/tools/compatibility/BUILD b/tensorflow/tools/compatibility/BUILD
index a9902d77f5ec103fe2000a4a470d425e3998f45e..74ef9ec98beaa4ca444e9022e1112704014509b8 100644
--- a/tensorflow/tools/compatibility/BUILD
+++ b/tensorflow/tools/compatibility/BUILD
@@ -1,17 +1,21 @@
-licenses(["notice"])  # Apache 2.0
-
-package(default_visibility = ["//tensorflow:internal"])
-
 load(
     "//tensorflow:tensorflow.bzl",
     "tf_copts",  # @unused
     "tf_cc_test",  # @unused
 )
 
+licenses(["notice"])  # Apache 2.0
+
+package(default_visibility = ["//tensorflow:internal"])
+
 py_library(
     name = "ast_edits",
     srcs = ["ast_edits.py"],
     srcs_version = "PY2AND3",
+    deps = [
+        "@pasta",
+        "@six_archive//:six",
+    ],
 )
 
 py_test(
@@ -30,6 +34,13 @@ py_binary(
     name = "tf_upgrade",
     srcs = ["tf_upgrade.py"],
     srcs_version = "PY2AND3",
+    deps = [":tf_upgrade_lib"],
+)
+
+py_library(
+    name = "tf_upgrade_lib",
+    srcs = ["tf_upgrade.py"],
+    srcs_version = "PY2AND3",
     deps = [":ast_edits"],
 )
 
@@ -38,7 +49,7 @@ py_test(
     srcs = ["tf_upgrade_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":tf_upgrade",
+        ":tf_upgrade_lib",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_test_lib",
         "@six_archive//:six",
@@ -65,6 +76,7 @@ py_library(
         ":ast_edits",
         ":renames_v2",
         ":reorders_v2",
+        "@six_archive//:six",
     ],
 )
 
@@ -84,7 +96,7 @@ py_test(
     srcs = ["tf_upgrade_v2_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":tf_upgrade_v2",
+        ":tf_upgrade_v2_lib",
         "//tensorflow:tensorflow_py",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_test_lib",
diff --git a/tensorflow/tools/compatibility/README.md b/tensorflow/tools/compatibility/README.md
index 6ff42b1fefe983d2119ddc7841d14d888443b49a..5e2de35338a9460205272112195ff5560d6e503c 100644
--- a/tensorflow/tools/compatibility/README.md
+++ b/tensorflow/tools/compatibility/README.md
@@ -7,7 +7,7 @@ Specifically: \
 
 ## Running the script from pip package
 
-First, install TensorFlow pip package. See
+First, install TensorFlow pip package*. See
 https://www.tensorflow.org/install/pip.
 
 Upgrade script can be run on a single Python file:
@@ -27,6 +27,8 @@ tf_upgrade_v2 --intree coolcode --outtree coolcode-upgraded
 tf_upgrade_v2 --intree coolcode --outtree coolcode-upgraded --copyotherfiles False
 ```
 
+*Note: `tf_upgrade_v2` is installed automatically as a script by the pip install 
+after TensorFlow 1.12.
 
 ## Report
 
diff --git a/tensorflow/tools/compatibility/ast_edits.py b/tensorflow/tools/compatibility/ast_edits.py
index eac2150502d6511da127a42fbb46c92bea7fe364..3d421f6704ffd26e1c5c34787aa0c29213e53cac 100644
--- a/tensorflow/tools/compatibility/ast_edits.py
+++ b/tensorflow/tools/compatibility/ast_edits.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 import ast
-import collections
 import os
 import re
 import shutil
@@ -27,279 +26,365 @@ import sys
 import tempfile
 import traceback
 
+import pasta
+import six
+
 # Some regular expressions we will need for parsing
 FIND_OPEN = re.compile(r"^\s*(\[).*$")
 FIND_STRING_CHARS = re.compile(r"['\"]")
 
 
-class APIChangeSpec(object):
-  """This class defines the transformations that need to happen.
+INFO = "INFO"
+WARNING = "WARNING"
+ERROR = "ERROR"
 
-  This class must provide the following fields:
 
-  * `function_keyword_renames`: maps function names to a map of old -> new
-    argument names
-  * `symbol_renames`: maps function names to new function names
-  * `change_to_function`: a set of function names that have changed (for
-    notifications)
-  * `function_reorders`: maps functions whose argument order has changed to the
-    list of arguments in the new order
-  * `function_handle`: maps function names to custom handlers for the function
-  * `function_warnings`: maps full names of functions to warnings that will be
-    printed out if the function is used. (e.g. tf.nn.convolution())
-  * `unrestricted_function_warnings`: maps names of functions to warnings that
-    will be printed out when the function is used (e.g. foo.convolution()).
+def full_name_node(name, ctx=ast.Load()):
+  """Make an Attribute or Name node for name.
 
-  For an example, see `TFAPIChangeSpec`.
+  Translate a qualified name into nested Attribute nodes (and a Name node).
+
+  Args:
+    name: The name to translate to a node.
+    ctx: What context this name is used in. Defaults to Load()
+
+  Returns:
+    A Name or Attribute node.
   """
+  names = name.split(".")
+  names.reverse()
+  node = ast.Name(id=names.pop(), ctx=ast.Load())
+  while names:
+    node = ast.Attribute(value=node, attr=names.pop(), ctx=ast.Load())
 
+  # Change outermost ctx to the one given to us (inner ones should be Load).
+  node.ctx = ctx
+  return node
 
-class _FileEditTuple(
-    collections.namedtuple("_FileEditTuple",
-                           ["comment", "line", "start", "old", "new"])):
-  """Each edit that is recorded by a _FileEditRecorder.
 
-  Fields:
-    comment: A description of the edit and why it was made.
-    line: The line number in the file where the edit occurs (1-indexed).
-    start: The column number in the file where the edit occurs (0-indexed).
-    old: text string to remove (this must match what was in file).
-    new: text string to add in place of `old`.
-  """
+def get_arg_value(node, arg_name, arg_pos=None):
+  """Get the value of an argument from a ast.Call node.
 
-  __slots__ = ()
+  This function goes through the positional and keyword arguments to check
+  whether a given argument was used, and if so, returns its value (the node
+  representing its value).
 
+  This cannot introspect *args or **args, but it safely handles *args in
+  Python3.5+.
 
-class _FileEditRecorder(object):
-  """Record changes that need to be done to the file."""
+  Args:
+    node: The ast.Call node to extract arg values from.
+    arg_name: The name of the argument to extract.
+    arg_pos: The position of the argument (in case it's passed as a positional
+      argument).
 
-  def __init__(self, filename):
-    # all edits are lists of chars
-    self._filename = filename
+  Returns:
+    A tuple (arg_present, arg_value) containing a boolean indicating whether
+    the argument is present, and its value in case it is.
+  """
+  # Check keyword args
+  if arg_name is not None:
+    for kw in node.keywords:
+      if kw.arg == arg_name:
+        return (True, kw.value)
 
-    self._line_to_edit = collections.defaultdict(list)
-    self._errors = []
+  # Check positional args
+  if arg_pos is not None:
+    idx = 0
+    for arg in node.args:
+      if sys.version_info[:2] >= (3, 5) and isinstance(arg, ast.Starred):
+        continue  # Can't parse Starred
+      if idx == arg_pos:
+        return (True, arg)
+      idx += 1
 
-  def process(self, text):
-    """Process a list of strings, each corresponding to the recorded changes.
+  return (False, None)
 
-    Args:
-      text: A list of lines of text (assumed to contain newlines)
-    Returns:
-      A tuple of the modified text and a textual description of what is done.
-    Raises:
-      ValueError: if substitution source location does not have expected text.
-    """
 
-    change_report = ""
-
-    # Iterate of each line
-    for line, edits in self._line_to_edit.items():
-      offset = 0
-      # sort by column so that edits are processed in order in order to make
-      # indexing adjustments cumulative for changes that change the string
-      # length
-      edits.sort(key=lambda x: x.start)
-
-      # Extract each line to a list of characters, because mutable lists
-      # are editable, unlike immutable strings.
-      char_array = list(text[line - 1])
-
-      # Record a description of the change
-      change_report += "%r Line %d\n" % (self._filename, line)
-      change_report += "-" * 80 + "\n\n"
-      for e in edits:
-        change_report += "%s\n" % e.comment
-      change_report += "\n    Old: %s" % (text[line - 1])
-
-      # Make underscore buffers for underlining where in the line the edit was
-      change_list = [" "] * len(text[line - 1])
-      change_list_new = [" "] * len(text[line - 1])
-
-      # Iterate for each edit
-      for e in edits:
-        # Create effective start, end by accounting for change in length due
-        # to previous edits
-        start_eff = e.start + offset
-        end_eff = start_eff + len(e.old)
-
-        # Make sure the edit is changing what it should be changing
-        old_actual = "".join(char_array[start_eff:end_eff])
-        if old_actual != e.old:
-          raise ValueError("Expected text %r but got %r" %
-                           ("".join(e.old), "".join(old_actual)))
-        # Make the edit
-        char_array[start_eff:end_eff] = list(e.new)
-
-        # Create the underline highlighting of the before and after
-        change_list[e.start:e.start + len(e.old)] = "~" * len(e.old)
-        change_list_new[start_eff:end_eff] = "~" * len(e.new)
-
-        # Keep track of how to generate effective ranges
-        offset += len(e.new) - len(e.old)
-
-      # Finish the report comment
-      change_report += "         %s\n" % "".join(change_list)
-      text[line - 1] = "".join(char_array)
-      change_report += "    New: %s" % (text[line - 1])
-      change_report += "         %s\n\n" % "".join(change_list_new)
-    return "".join(text), change_report, self._errors
-
-  def add(self, comment, line, start, old, new, error=None):
-    """Add a new change that is needed.
+class APIChangeSpec(object):
+  """This class defines the transformations that need to happen.
 
-    Args:
-      comment: A description of what was changed
-      line: Line number (1 indexed)
-      start: Column offset (0 indexed)
-      old: old text
-      new: new text
-      error: this "edit" is something that cannot be fixed automatically
-    Returns:
-      None
-    """
+  This class must provide the following fields:
 
-    self._line_to_edit[line].append(
-        _FileEditTuple(comment, line, start, old, new))
-    if error:
-      self._errors.append("%s:%d: %s" % (self._filename, line, error))
+  * `function_keyword_renames`: maps function names to a map of old -> new
+    argument names
+  * `symbol_renames`: maps function names to new function names
+  * `change_to_function`: a set of function names that have changed (for
+    notifications)
+  * `function_reorders`: maps functions whose argument order has changed to the
+    list of arguments in the new order
+  * `function_warnings`: maps full names of functions to warnings that will be
+    printed out if the function is used. (e.g. tf.nn.convolution())
+  * `function_transformers`: maps function names to custom handlers
+  * `module_deprecations`: maps module names to warnings that will be printed
+    if the module is still used after all other transformations have run
 
+  For an example, see `TFAPIChangeSpec`.
+  """
 
-class _ASTCallVisitor(ast.NodeVisitor):
+
+class _PastaEditVisitor(ast.NodeVisitor):
   """AST Visitor that processes function calls.
 
   Updates function calls from old API version to new API version using a given
   change spec.
   """
 
-  def __init__(self, filename, lines, api_change_spec):
-    self._filename = filename
-    self._file_edit = _FileEditRecorder(filename)
-    self._lines = lines
+  def __init__(self, api_change_spec):
     self._api_change_spec = api_change_spec
+    self._log = []   # Holds 4-tuples: severity, line, col, msg.
+    self._stack = []  # Allow easy access to parents.
 
-  def process(self, lines):
-    return self._file_edit.process(lines)
+  # Overridden to maintain a stack of nodes to allow for parent access
+  def visit(self, node):
+    self._stack.append(node)
+    super(_PastaEditVisitor, self).visit(node)
+    self._stack.pop()
 
-  def generic_visit(self, node):
-    ast.NodeVisitor.generic_visit(self, node)
+  @property
+  def errors(self):
+    return [log for log in self._log if log[0] == ERROR]
 
-  def _rename_functions(self, node, full_name):
-    symbol_renames = self._api_change_spec.symbol_renames
-    try:
-      new_name = symbol_renames[full_name]
-      self._file_edit.add("Renamed function %r to %r" % (full_name, new_name),
-                          node.lineno, node.col_offset, full_name, new_name)
-    except KeyError:
-      pass
+  @property
+  def warnings(self):
+    return [log for log in self._log if log[0] == WARNING]
 
-  def _print_warning_for_function(self, node, full_name):
-    function_warnings = self._api_change_spec.function_warnings
-    try:
-      warning_message = function_warnings[full_name]
-      warning_message = warning_message.replace("<function name>", full_name)
-      self._file_edit.add(warning_message,
-                          node.lineno, node.col_offset, full_name, full_name,
-                          error="%s requires manual check." % full_name)
-    except KeyError:
-      pass
+  @property
+  def warnings_and_errors(self):
+    return [log for log in self._log if log[0] in (WARNING, ERROR)]
 
-  def _print_warning_for_function_unrestricted(self, node):
-    """Print a warning when specific functions are called.
+  @property
+  def info(self):
+    return [log for log in self._log if log[0] == INFO]
 
-    The function _print_warning_for_function matches the full name of the called
-    function, e.g., tf.foo.bar(). This function matches the function name that
-    is called, as long as the function is an attribute. For example,
-    `tf.foo.bar()` and `foo.bar()` are matched, but not `bar()`.
+  @property
+  def log(self):
+    return self._log
+
+  def add_log(self, severity, lineno, col, msg):
+    self._log.append((severity, lineno, col, msg))
+    print("%s line %d:%d: %s" % (severity, lineno, col, msg))
+
+  def add_logs(self, logs):
+    """Record a log and print it.
+
+    The log should be a tuple `(severity, lineno, col_offset, msg)`, which will
+    be printed and recorded. It is part of the log available in the `self.log`
+    property.
 
     Args:
-      node: ast.Call object
+      logs: The logs to add. Must be a list of tuples
+        `(severity, lineno, col_offset, msg)`.
     """
-    function_warnings = getattr(
-        self._api_change_spec, "unrestricted_function_warnings", {})
-    if isinstance(node.func, ast.Attribute):
-      function_name = node.func.attr
-      try:
-        warning_message = function_warnings[function_name]
-        self._file_edit.add(warning_message,
-                            node.lineno, node.col_offset, "", "",
-                            error="%s requires manual check." % function_name)
-      except KeyError:
-        pass
-
-  def _get_attribute_full_path(self, node):
-    """Traverse an attribute to generate a full name e.g. tf.foo.bar.
+    self._log.extend(logs)
+    for log in logs:
+      print("%s line %d:%d: %s" % log)
+
+  def _get_applicable_entries(self, transformer_field, full_name, name):
+    """Get all list entries indexed by name that apply to full_name or name."""
+    # Transformers are indexed to full name, name, or no name
+    # as a performance optimization.
+    function_transformers = getattr(self._api_change_spec,
+                                    transformer_field, {})
+
+    glob_name = "*." + name if name else None
+    transformers = []
+    if full_name in function_transformers:
+      transformers.append(function_transformers[full_name])
+    if glob_name in function_transformers:
+      transformers.append(function_transformers[glob_name])
+    if "*" in function_transformers:
+      transformers.append(function_transformers["*"])
+    return transformers
+
+  def _get_applicable_dict(self, transformer_field, full_name, name):
+    """Get all dict entries indexed by name that apply to full_name or name."""
+    # Transformers are indexed to full name, name, or no name
+    # as a performance optimization.
+    function_transformers = getattr(self._api_change_spec,
+                                    transformer_field, {})
+
+    glob_name = "*." + name if name else None
+    transformers = function_transformers.get("*", {}).copy()
+    transformers.update(function_transformers.get(glob_name, {}))
+    transformers.update(function_transformers.get(full_name, {}))
+    return transformers
+
+  def _get_full_name(self, node):
+    """Traverse an Attribute node to generate a full name, e.g., "tf.foo.bar".
+
+    This is the inverse of `full_name_node`.
 
     Args:
       node: A Node of type Attribute.
 
     Returns:
-      a '.'-delimited full-name or None if the tree was not a simple form.
+      a '.'-delimited full-name or None if node was not Attribute or Name.
       i.e. `foo()+b).bar` returns None, while `a.b.c` would return "a.b.c".
     """
     curr = node
     items = []
     while not isinstance(curr, ast.Name):
       if not isinstance(curr, ast.Attribute):
-        return None, None
+        return None
       items.append(curr.attr)
       curr = curr.value
     items.append(curr.id)
-    return ".".join(reversed(items)), items[0]
-
-  def _find_true_position(self, node):
-    """Return correct line number and column offset for a given node.
+    return ".".join(reversed(items))
 
-    This is necessary mainly because ListComp's location reporting reports
-    the next token after the list comprehension list opening.
+  def _maybe_add_warning(self, node, full_name):
+    """Adds an error to be printed about full_name at node."""
+    function_warnings = self._api_change_spec.function_warnings
+    if full_name in function_warnings:
+      level, message = function_warnings[full_name]
+      message = message.replace("<function name>", full_name)
+      self.add_log(level, node.lineno, node.col_offset,
+                   "%s requires manual check. %s" % (full_name, message))
+      return True
+    else:
+      return False
+
+  def _maybe_add_module_deprecation_warning(self, node, full_name, whole_name):
+    """Adds a warning if full_name is a deprecated module."""
+    warnings = self._api_change_spec.module_deprecations
+    if full_name in warnings:
+      level, message = warnings[full_name]
+      message = message.replace("<function name>", whole_name)
+      self.add_log(level, node.lineno, node.col_offset,
+                   "Using member %s in deprecated module %s. %s" % (whole_name,
+                                                                    full_name,
+                                                                    message))
+      return True
+    else:
+      return False
+
+  def _maybe_add_call_warning(self, node, full_name, name):
+    """Print a warning when specific functions are called with selected args.
 
-    Returns:
-      lineno, offset for the given node
+    The function _print_warning_for_function matches the full name of the called
+    function, e.g., tf.foo.bar(). This function matches the function name that
+    is called, as long as the function is an attribute. For example,
+    `tf.foo.bar()` and `foo.bar()` are matched, but not `bar()`.
 
     Args:
-      node: Node for which we wish to know the lineno and col_offset
+      node: ast.Call object
+      full_name: The precomputed full name of the callable, if one exists, None
+        otherwise.
+      name: The precomputed name of the callable, if one exists, None otherwise.
+
+    Returns:
+      Whether an error was recorded.
     """
-    if isinstance(node, ast.ListComp):
-      # Strangely, ast.ListComp returns the col_offset of the first token
-      # after the '[' token which appears to be a bug. Workaround by
-      # explicitly finding the real start of the list comprehension.
-      line = node.lineno
-      col = node.col_offset
-      # loop over lines
-      while 1:
-        # Reverse the text to and regular expression search for whitespace
-        text = self._lines[line - 1]
-        reversed_preceding_text = text[:col][::-1]
-        # First find if a [ can be found with only whitespace between it and
-        # col.
-        m = FIND_OPEN.match(reversed_preceding_text)
-        if m:
-          new_col_offset = col - m.start(1) - 1
-          return line, new_col_offset
+    # Only look for *.-warnings here, the other will be handled by the Attribute
+    # visitor. Also, do not warn for bare functions, only if the call func is
+    # an attribute.
+    warned = False
+    if isinstance(node.func, ast.Attribute):
+      warned = self._maybe_add_warning(node, "*." + name)
+
+    # All arg warnings are handled here, since only we have the args
+    arg_warnings = self._get_applicable_dict("function_arg_warnings",
+                                             full_name, name)
+
+    for (kwarg, arg), (level, warning) in sorted(arg_warnings.items()):
+      present, _ = get_arg_value(node, kwarg, arg)
+      if present:
+        warned = True
+        warning_message = warning.replace("<function name>", full_name or name)
+        self.add_log(level, node.lineno, node.col_offset,
+                     "%s called with %s argument requires manual check: %s" %
+                     (full_name or name, kwarg, warning_message))
+
+    return warned
+
+  def _maybe_rename(self, parent, node, full_name):
+    """Replace node (Attribute or Name) with a node representing full_name."""
+    new_name = self._api_change_spec.symbol_renames.get(full_name, None)
+    if new_name:
+      self.add_log(INFO, node.lineno, node.col_offset,
+                   "Renamed %r to %r" % (full_name, new_name))
+      new_node = full_name_node(new_name, node.ctx)
+      ast.copy_location(new_node, node)
+      pasta.ast_utils.replace_child(parent, node, new_node)
+      return True
+    else:
+      return False
+
+  def _maybe_change_to_function_call(self, parent, node, full_name):
+    """Wraps node (typically, an Attribute or Expr) in a Call."""
+    if full_name in self._api_change_spec.change_to_function:
+      if not isinstance(parent, ast.Call):
+        # ast.Call's constructor is really picky about how many arguments it
+        # wants, and also, it changed between Py2 and Py3.
+        if six.PY2:
+          new_node = ast.Call(node, [], [], None, None)
+        else:
+          new_node = ast.Call(node, [], [])
+        pasta.ast_utils.replace_child(parent, node, new_node)
+        ast.copy_location(new_node, node)
+        self.add_log(INFO, node.lineno, node.col_offset,
+                     "Changed %r to a function call" % full_name)
+        return True
+    return False
+
+  def _maybe_add_arg_names(self, node, full_name):
+    """Make args into keyword args if function called full_name requires it."""
+    function_reorders = self._api_change_spec.function_reorders
+
+    if full_name in function_reorders:
+      reordered = function_reorders[full_name]
+      new_keywords = []
+      idx = 0
+      for arg in node.args:
+        if sys.version_info[:2] >= (3, 5) and isinstance(arg, ast.Starred):
+          continue  # Can't move Starred to keywords
+        keyword_arg = reordered[idx]
+        keyword = ast.keyword(arg=keyword_arg, value=arg)
+        new_keywords.append(keyword)
+        idx += 1
+
+      if new_keywords:
+        self.add_log(INFO, node.lineno, node.col_offset,
+                     "Added keywords to args of function %r" % full_name)
+        node.args = []
+        node.keywords = new_keywords + (node.keywords or [])
+        return True
+    return False
+
+  def _maybe_modify_args(self, node, full_name, name):
+    """Rename keyword args if the function called full_name requires it."""
+    renamed_keywords = self._get_applicable_dict("function_keyword_renames",
+                                                 full_name, name)
+
+    if not renamed_keywords:
+      return False
+
+    modified = False
+    new_keywords = []
+    for keyword in node.keywords:
+      argkey = keyword.arg
+      if argkey in renamed_keywords:
+        modified = True
+        if renamed_keywords[argkey] is None:
+          lineno = getattr(keyword, "lineno", node.lineno)
+          col_offset = getattr(keyword, "col_offset", node.col_offset)
+          self.add_log(INFO, lineno, col_offset,
+                       "Removed argument %s for function %s" % (
+                           argkey, full_name or name))
         else:
-          if (reversed_preceding_text == "" or
-              reversed_preceding_text.isspace()):
-            line = line - 1
-            prev_line = self._lines[line - 1]
-            # TODO(aselle):
-            # this is poor comment detection, but it is good enough for
-            # cases where the comment does not contain string literal starting/
-            # ending characters. If ast gave us start and end locations of the
-            # ast nodes rather than just start, we could use string literal
-            # node ranges to filter out spurious #'s that appear in string
-            # literals.
-            comment_start = prev_line.find("#")
-            if comment_start == -1:
-              col = len(prev_line) - 1
-            elif FIND_STRING_CHARS.search(prev_line[comment_start:]) is None:
-              col = comment_start
-            else:
-              return None, None
-          else:
-            return None, None
-    # Most other nodes return proper locations (with notably does not), but
-    # it is not possible to use that in an argument.
-    return node.lineno, node.col_offset
+          keyword.arg = renamed_keywords[argkey]
+          lineno = getattr(keyword, "lineno", node.lineno)
+          col_offset = getattr(keyword, "col_offset", node.col_offset)
+          self.add_log(INFO, lineno, col_offset,
+                       "Renamed keyword argument for %s from %s to %s" % (
+                           full_name, argkey, renamed_keywords[argkey]))
+          new_keywords.append(keyword)
+      else:
+        new_keywords.append(keyword)
+
+    if modified:
+      node.keywords = new_keywords
+    return modified
 
   def visit_Call(self, node):  # pylint: disable=invalid-name
     """Handle visiting a call node in the AST.
@@ -307,104 +392,79 @@ class _ASTCallVisitor(ast.NodeVisitor):
     Args:
       node: Current Node
     """
-    self._print_warning_for_function_unrestricted(node)
-
-    # Find a simple attribute name path e.g. "tf.foo.bar"
-    full_name, name = self._get_attribute_full_path(node.func)
-
-    # Make sure the func is marked as being part of a call
-    node.func.is_function_for_call = True
+    assert self._stack[-1] is node
 
+    # Get the name for this call, so we can index stuff with it.
+    full_name = self._get_full_name(node.func)
     if full_name:
-      # Call special handlers
-      function_handles = self._api_change_spec.function_handle
-      glob_name = "*.{}".format(name)
-      if glob_name in function_handles:
-        function_handles[glob_name](self._file_edit, node)
-      if full_name in function_handles:
-        function_handles[full_name](self._file_edit, node)
-
-      # Examine any non-keyword argument and make it into a keyword argument
-      # if reordering required.
-      function_reorders = self._api_change_spec.function_reorders
-      function_keyword_renames = (
-          self._api_change_spec.function_keyword_renames)
-
-      if full_name in function_reorders:
-        reordered = function_reorders[full_name]
-        for idx, arg in enumerate(node.args):
-          lineno, col_offset = self._find_true_position(arg)
-          if lineno is None or col_offset is None:
-            self._file_edit.add(
-                "Failed to add keyword %r to reordered function %r" %
-                (reordered[idx], full_name),
-                arg.lineno,
-                arg.col_offset,
-                "",
-                "",
-                error="A necessary keyword argument failed to be inserted.")
-          else:
-            keyword_arg = reordered[idx]
-            if (full_name in function_keyword_renames and
-                keyword_arg in function_keyword_renames[full_name]):
-              keyword_arg = function_keyword_renames[full_name][keyword_arg]
-            self._file_edit.add("Added keyword %r to reordered function %r" %
-                                (reordered[idx], full_name), lineno, col_offset,
-                                "", keyword_arg + "=")
-
-      # Examine each keyword argument and convert it to the final renamed form
-      renamed_keywords = ({} if full_name not in function_keyword_renames else
-                          function_keyword_renames[full_name])
-      for keyword in node.keywords:
-        argkey = keyword.arg
-        argval = keyword.value
-
-        if argkey in renamed_keywords:
-          argval_lineno, argval_col_offset = self._find_true_position(argval)
-          if argval_lineno is not None and argval_col_offset is not None:
-            # TODO(aselle): We should scan backward to find the start of the
-            # keyword key. Unfortunately ast does not give you the location of
-            # keyword keys, so we are forced to infer it from the keyword arg
-            # value.
-            key_start = argval_col_offset - len(argkey) - 1
-            key_end = key_start + len(argkey) + 1
-            if (self._lines[argval_lineno - 1][key_start:key_end] == argkey +
-                "="):
-              self._file_edit.add("Renamed keyword argument from %r to %r" %
-                                  (argkey,
-                                   renamed_keywords[argkey]), argval_lineno,
-                                  argval_col_offset - len(argkey) - 1,
-                                  argkey + "=", renamed_keywords[argkey] + "=")
-              continue
-          self._file_edit.add(
-              "Failed to rename keyword argument from %r to %r" %
-              (argkey, renamed_keywords[argkey]),
-              argval.lineno,
-              argval.col_offset - len(argkey) - 1,
-              "",
-              "",
-              error="Failed to find keyword lexographically. Fix manually.")
-
-    ast.NodeVisitor.generic_visit(self, node)
+      name = full_name.split(".")[-1]
+    elif isinstance(node.func, ast.Name):
+      name = node.func.id
+    elif isinstance(node.func, ast.Attribute):
+      name = node.func.attr
+    else:
+      name = None
+
+    # Call standard transformers for this node.
+    # Make sure warnings come first, since args or names triggering warnings
+    # may be removed by the other transformations.
+    self._maybe_add_call_warning(node, full_name, name)
+    # Make all args into kwargs
+    self._maybe_add_arg_names(node, full_name)
+    # Argument name changes or deletions
+    self._maybe_modify_args(node, full_name, name)
+
+    # Call transformers. These have the ability to modify the node, and if they
+    # do, will return the new node they created (or the same node if they just
+    # changed it). The are given the parent, but we will take care of
+    # integrating their changes into the parent if they return a new node.
+    #
+    # These are matched on the old name, since renaming is performed by the
+    # Attribute visitor, which happens later.
+    transformers = self._get_applicable_entries("function_transformers",
+                                                full_name, name)
+
+    parent = self._stack[-2]
+
+    for transformer in transformers:
+      logs = []
+      new_node = transformer(parent, node, full_name, name, logs)
+      self.add_logs(logs)
+      if new_node and new_node is not node:
+        pasta.ast_utils.replace_child(parent, node, new_node)
+        node = new_node
+        self._stack[-1] = node
+
+    self.generic_visit(node)
 
   def visit_Attribute(self, node):  # pylint: disable=invalid-name
-    """Handle bare Attributes i.e. [tf.foo, tf.bar].
+    """Handle bare Attributes i.e. [tf.foo, tf.bar]."""
+    assert self._stack[-1] is node
 
-    Args:
-      node: Node that is of type ast.Attribute
-    """
-    full_name, _ = self._get_attribute_full_path(node)
+    full_name = self._get_full_name(node)
     if full_name:
+      parent = self._stack[-2]
+
       # Make sure the warning comes first, otherwise the name may have changed
-      self._print_warning_for_function(node, full_name)
-      self._rename_functions(node, full_name)
-    if full_name in self._api_change_spec.change_to_function:
-      if not hasattr(node, "is_function_for_call"):
-        new_text = full_name + "()"
-        self._file_edit.add("Changed %r to %r" % (full_name, new_text),
-                            node.lineno, node.col_offset, full_name, new_text)
+      self._maybe_add_warning(node, full_name)
+
+      # Once we did a modification, node is invalid and not worth inspecting
+      # further. Also, we only perform modifications for simple nodes, so
+      # There'd be no point in descending further.
+      if self._maybe_rename(parent, node, full_name):
+        return
+      if self._maybe_change_to_function_call(parent, node, full_name):
+        return
 
-    ast.NodeVisitor.generic_visit(self, node)
+      # The isinstance check is enough -- a bare Attribute is never root.
+      i = 2
+      while isinstance(self._stack[-i], ast.Attribute):
+        i += 1
+      whole_name = pasta.dump(self._stack[-(i-1)])
+
+      self._maybe_add_module_deprecation_warning(node, full_name, whole_name)
+
+    self.generic_visit(node)
 
 
 class ASTCodeUpgrader(object):
@@ -427,16 +487,48 @@ class ASTCodeUpgrader(object):
     """
 
     # Write to a temporary file, just in case we are doing an implace modify.
+    # pylint: disable=g-backslash-continuation
     with open(in_filename, "r") as in_file, \
         tempfile.NamedTemporaryFile("w", delete=False) as temp_file:
       ret = self.process_opened_file(in_filename, in_file, out_filename,
                                      temp_file)
+    # pylint: enable=g-backslash-continuation
 
     shutil.move(temp_file.name, out_filename)
     return ret
 
-  # Broad exceptions are required here because ast throws whatever it wants.
-  # pylint: disable=broad-except
+  def format_log(self, log, in_filename):
+    log_string = "%d:%d: %s: %s" % (log[1], log[2], log[0], log[3])
+    if in_filename:
+      return in_filename + ":" + log_string
+    else:
+      return log_string
+
+  def update_string_pasta(self, text, in_filename):
+    """Updates a file using pasta."""
+    try:
+      t = pasta.parse(text)
+    except (SyntaxError, ValueError, TypeError):
+      log = ["ERROR: Failed to parse.\n" + traceback.format_exc()]
+      return 0, "", log, []
+
+    visitor = _PastaEditVisitor(self._api_change_spec)
+    visitor.visit(t)
+
+    logs = [self.format_log(log, None) for log in visitor.log]
+    errors = [self.format_log(error, in_filename)
+              for error in visitor.warnings_and_errors]
+    return 1, pasta.dump(t), logs, errors
+
+  def _format_log(self, log, in_filename, out_filename):
+    text = "-" * 80 + "\n"
+    text += "Processing file %r\n outputting to %r\n" % (in_filename,
+                                                         out_filename)
+    text += "-" * 80 + "\n\n"
+    text += "\n".join(log) + "\n"
+    text += "-" * 80 + "\n\n"
+    return text
+
   def process_opened_file(self, in_filename, in_file, out_filename, out_file):
     """Process the given python file for incompatible changes.
 
@@ -451,33 +543,19 @@ class ASTCodeUpgrader(object):
     Returns:
       A tuple representing number of files processed, log of actions, errors
     """
-    process_errors = []
-    text = "-" * 80 + "\n"
-    text += "Processing file %r\n outputting to %r\n" % (in_filename,
-                                                         out_filename)
-    text += "-" * 80 + "\n\n"
-
-    parsed_ast = None
     lines = in_file.readlines()
-    try:
-      parsed_ast = ast.parse("".join(lines))
-    except Exception:
-      text += "Failed to parse %r\n\n" % in_filename
-      text += traceback.format_exc()
-    if parsed_ast:
-      visitor = _ASTCallVisitor(in_filename, lines, self._api_change_spec)
-      visitor.visit(parsed_ast)
-      out_text, new_text, process_errors = visitor.process(lines)
-      text += new_text
-      if out_file:
-        out_file.write(out_text)
-    text += "\n"
-    return 1, text, process_errors
-
-  # pylint: enable=broad-except
+    processed_file, new_file_content, log, process_errors = (
+        self.update_string_pasta("".join(lines), in_filename))
+
+    if out_file and processed_file:
+      out_file.write(new_file_content)
+
+    return (processed_file,
+            self._format_log(log, in_filename, out_filename),
+            process_errors)
 
   def process_tree(self, root_directory, output_root_directory,
-                   copy_other_files):
+                   copy_other_files, in_place):
     """Processes upgrades on an entire tree of python files in place.
 
     Note that only Python files. If you have custom code in other languages,
@@ -487,11 +565,21 @@ class ASTCodeUpgrader(object):
       root_directory: Directory to walk and process.
       output_root_directory: Directory to use as base.
       copy_other_files: Copy files that are not touched by this converter.
+      in_place: Allow the conversion of an entire directory in place.
 
     Returns:
-      A tuple of files processed, the report string ofr all files, and errors
+      A tuple of files processed, the report string for all files, and a dict
+        mapping filenames to errors encountered in that file.
     """
 
+    if output_root_directory == root_directory:
+      if in_place:
+        return self.process_tree_inplace(root_directory)
+      else:
+        print("In order to copy a directory in place the `--inplace` input "
+              "arg must be set to `True`.")
+        sys.exit(1)
+
     # make sure output directory doesn't exist
     if output_root_directory and os.path.exists(output_root_directory):
       print("Output directory %r must not already exist." %
@@ -528,7 +616,7 @@ class ASTCodeUpgrader(object):
           files_to_copy.append((fullpath, fullpath_output))
 
     file_count = 0
-    tree_errors = []
+    tree_errors = {}
     report = ""
     report += ("=" * 80) + "\n"
     report += "Input tree: %r\n" % root_directory
@@ -540,7 +628,7 @@ class ASTCodeUpgrader(object):
         os.makedirs(output_directory)
       file_count += 1
       _, l_report, l_errors = self.process_file(input_path, output_path)
-      tree_errors += l_errors
+      tree_errors[input_path] = l_errors
       report += l_report
     for input_path, output_path in files_to_copy:
       output_directory = os.path.dirname(output_path)
@@ -548,3 +636,26 @@ class ASTCodeUpgrader(object):
         os.makedirs(output_directory)
       shutil.copy(input_path, output_path)
     return file_count, report, tree_errors
+
+  def process_tree_inplace(self, root_directory):
+    """Process a directory of python files in place."""
+    files_to_process = []
+    for dir_name, _, file_list in os.walk(root_directory):
+      py_files = [os.path.join(dir_name,
+                               f) for f in file_list if f.endswith(".py")]
+      files_to_process += py_files
+
+    file_count = 0
+    tree_errors = {}
+    report = ""
+    report += ("=" * 80) + "\n"
+    report += "Input tree: %r\n" % root_directory
+    report += ("=" * 80) + "\n"
+
+    for path in files_to_process:
+      file_count += 1
+      _, l_report, l_errors = self.process_file(path, path)
+      tree_errors[path] = l_errors
+      report += l_report
+
+    return file_count, report, tree_errors
diff --git a/tensorflow/tools/compatibility/ast_edits_test.py b/tensorflow/tools/compatibility/ast_edits_test.py
index 99f20a026fcb9b60e0d4365dd2690946f0d833fc..a9307f9f83bb8f1ccfb965ca4570c5fba4c1e047 100644
--- a/tensorflow/tools/compatibility/ast_edits_test.py
+++ b/tensorflow/tools/compatibility/ast_edits_test.py
@@ -39,7 +39,10 @@ following new APIs:
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+
+import ast
 import six
+
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test as test_lib
 from tensorflow.tools.compatibility import ast_edits
@@ -54,8 +57,16 @@ class NoUpdateSpec(ast_edits.APIChangeSpec):
     self.function_keyword_renames = {}
     self.symbol_renames = {}
     self.function_warnings = {}
-    self.unrestricted_function_warnings = {}
     self.change_to_function = {}
+    self.module_deprecations = {}
+
+
+class ModuleDeprecationSpec(NoUpdateSpec):
+  """A specification which deprecates 'a.b'."""
+
+  def __init__(self):
+    NoUpdateSpec.__init__(self)
+    self.module_deprecations.update({"a.b": (ast_edits.ERROR, "a.b is evil.")})
 
 
 class RenameKeywordSpec(NoUpdateSpec):
@@ -170,6 +181,15 @@ class TestAstEdits(test_util.TensorFlowTestCase):
                                      "test_out.py", out_file))
     return (count, report, errors), out_file.getvalue()
 
+  def testModuleDeprecation(self):
+    text = "a.b.c(a.b.x)"
+    (_, _, errors), new_text = self._upgrade(ModuleDeprecationSpec(), text)
+    self.assertEqual(text, new_text)
+    self.assertIn("Using member a.b.c", errors[0])
+    self.assertIn("1:0", errors[0])
+    self.assertIn("Using member a.b.c", errors[0])
+    self.assertIn("1:6", errors[1])
+
   def testNoTransformIfNothingIsSupplied(self):
     text = "f(a, b, kw1=c, kw2=d)\n"
     _, new_text = self._upgrade(NoUpdateSpec(), text)
@@ -191,6 +211,20 @@ class TestAstEdits(test_util.TensorFlowTestCase):
     _, new_text = self._upgrade(RenameKeywordSpec(), text)
     self.assertEqual(new_text, text)
 
+  def testKeywordReorderWithParens(self):
+    """Test that we get the expected result if there are parens around args."""
+    text = "f((a), ( ( b ) ))\n"
+    acceptable_outputs = [
+        # No change is a valid output
+        text,
+        # Also cases where all arguments are fully specified are allowed
+        "f(a=(a), b=( ( b ) ))\n",
+        # Making the parens canonical is ok
+        "f(a=(a), b=((b)))\n",
+    ]
+    _, new_text = self._upgrade(ReorderKeywordSpec(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
   def testKeywordReorder(self):
     """Test that we get the expected result if kw2 is now before kw1."""
     text = "f(a, b, kw1=c, kw2=d)\n"
@@ -401,7 +435,8 @@ class TestAstEdits(test_util.TensorFlowTestCase):
 
       def __init__(self):
         NoUpdateSpec.__init__(self)
-        self.unrestricted_function_warnings = {"foo": "not good"}
+        self.function_warnings = {"*.foo": (ast_edits.WARNING, "not good")}
+
     texts = ["object.foo()", "get_object().foo()",
              "get_object().foo()", "object.foo().bar()"]
     for text in texts:
@@ -415,6 +450,13 @@ class TestAstEdits(test_util.TensorFlowTestCase):
       (_, report, _), _ = self._upgrade(FooWarningSpec(), text)
       self.assertNotIn("not good", report)
 
+  def testFullNameNode(self):
+    t = ast_edits.full_name_node("a.b.c")
+    self.assertEquals(
+        ast.dump(t),
+        "Attribute(value=Attribute(value=Name(id='a', ctx=Load()), attr='b', "
+        "ctx=Load()), attr='c', ctx=Load())"
+    )
 
 if __name__ == "__main__":
   test_lib.main()
diff --git a/tensorflow/tools/compatibility/renames_v2.py b/tensorflow/tools/compatibility/renames_v2.py
index b757ad4647c6d92e21feccd7d90da887df379531..67425bba6c463338527ac99907ffd12927b8ac70 100644
--- a/tensorflow/tools/compatibility/renames_v2.py
+++ b/tensorflow/tools/compatibility/renames_v2.py
@@ -34,6 +34,8 @@ renames = {
     'tf.ConfigProto': 'tf.compat.v1.ConfigProto',
     'tf.DeviceSpec': 'tf.compat.v1.DeviceSpec',
     'tf.Dimension': 'tf.compat.v1.Dimension',
+    'tf.Event': 'tf.compat.v1.Event',
+    'tf.FIFOQueue': 'tf.queue.FIFOQueue',
     'tf.FixedLenFeature': 'tf.io.FixedLenFeature',
     'tf.FixedLenSequenceFeature': 'tf.io.FixedLenSequenceFeature',
     'tf.FixedLengthRecordReader': 'tf.compat.v1.FixedLengthRecordReader',
@@ -58,12 +60,12 @@ renames = {
     'tf.NotDifferentiable': 'tf.no_gradient',
     'tf.OpError': 'tf.errors.OpError',
     'tf.OptimizerOptions': 'tf.compat.v1.OptimizerOptions',
-    'tf.PaddingFIFOQueue': 'tf.io.PaddingFIFOQueue',
+    'tf.PaddingFIFOQueue': 'tf.queue.PaddingFIFOQueue',
     'tf.Print': 'tf.compat.v1.Print',
-    'tf.PriorityQueue': 'tf.io.PriorityQueue',
+    'tf.PriorityQueue': 'tf.queue.PriorityQueue',
     'tf.QUANTIZED_DTYPES': 'tf.dtypes.QUANTIZED_DTYPES',
-    'tf.QueueBase': 'tf.io.QueueBase',
-    'tf.RandomShuffleQueue': 'tf.io.RandomShuffleQueue',
+    'tf.QueueBase': 'tf.queue.QueueBase',
+    'tf.RandomShuffleQueue': 'tf.queue.RandomShuffleQueue',
     'tf.ReaderBase': 'tf.compat.v1.ReaderBase',
     'tf.RunMetadata': 'tf.compat.v1.RunMetadata',
     'tf.RunOptions': 'tf.compat.v1.RunOptions',
@@ -72,6 +74,8 @@ renames = {
     'tf.SparseConditionalAccumulator': 'tf.sparse.SparseConditionalAccumulator',
     'tf.SparseFeature': 'tf.io.SparseFeature',
     'tf.SparseTensorValue': 'tf.compat.v1.SparseTensorValue',
+    'tf.Summary': 'tf.compat.v1.Summary',
+    'tf.SummaryMetadata': 'tf.compat.v1.SummaryMetadata',
     'tf.TFRecordReader': 'tf.compat.v1.TFRecordReader',
     'tf.TensorInfo': 'tf.compat.v1.TensorInfo',
     'tf.TextLineReader': 'tf.compat.v1.TextLineReader',
@@ -117,7 +121,6 @@ renames = {
     'tf.container': 'tf.compat.v1.container',
     'tf.convert_to_tensor_or_indexed_slices': 'tf.compat.v1.convert_to_tensor_or_indexed_slices',
     'tf.convert_to_tensor_or_sparse_tensor': 'tf.compat.v1.convert_to_tensor_or_sparse_tensor',
-    'tf.count_nonzero': 'tf.compat.v1.count_nonzero',
     'tf.count_up_to': 'tf.compat.v1.count_up_to',
     'tf.create_partitioned_variables': 'tf.compat.v1.create_partitioned_variables',
     'tf.cross': 'tf.linalg.cross',
@@ -140,12 +143,14 @@ renames = {
     'tf.diag': 'tf.linalg.tensor_diag',
     'tf.diag_part': 'tf.linalg.tensor_diag_part',
     'tf.digamma': 'tf.math.digamma',
-    'tf.dimension_at_index': 'tf.compat.v1.dimension_at_index',
-    'tf.dimension_value': 'tf.compat.v1.dimension_value',
+    'tf.dimension_at_index': 'tf.compat.dimension_at_index',
+    'tf.dimension_value': 'tf.compat.dimension_value',
     'tf.disable_eager_execution': 'tf.compat.v1.disable_eager_execution',
     'tf.disable_resource_variables': 'tf.compat.v1.disable_resource_variables',
+    'tf.disable_v2_batch_normalization': 'tf.compat.v1.disable_v2_batch_normalization',
     'tf.disable_v2_behavior': 'tf.compat.v1.disable_v2_behavior',
     'tf.disable_v2_tensorshape': 'tf.compat.v1.disable_v2_tensorshape',
+    'tf.distribute.get_loss_reduction': 'tf.compat.v1.distribute.get_loss_reduction',
     'tf.distributions.Bernoulli': 'tf.compat.v1.distributions.Bernoulli',
     'tf.distributions.Beta': 'tf.compat.v1.distributions.Beta',
     'tf.distributions.Categorical': 'tf.compat.v1.distributions.Categorical',
@@ -165,8 +170,11 @@ renames = {
     'tf.distributions.Uniform': 'tf.compat.v1.distributions.Uniform',
     'tf.distributions.kl_divergence': 'tf.compat.v1.distributions.kl_divergence',
     'tf.div': 'tf.compat.v1.div',
+    'tf.div_no_nan': 'tf.math.divide_no_nan',
+    'tf.dtypes.as_string': 'tf.strings.as_string',
     'tf.enable_eager_execution': 'tf.compat.v1.enable_eager_execution',
     'tf.enable_resource_variables': 'tf.compat.v1.enable_resource_variables',
+    'tf.enable_v2_batch_normalization': 'tf.compat.v1.enable_v2_batch_normalization',
     'tf.enable_v2_behavior': 'tf.compat.v1.enable_v2_behavior',
     'tf.enable_v2_tensorshape': 'tf.compat.v1.enable_v2_tensorshape',
     'tf.encode_base64': 'tf.io.encode_base64',
@@ -181,6 +189,7 @@ renames = {
     'tf.fake_quant_with_min_max_vars_per_channel_gradient': 'tf.quantization.fake_quant_with_min_max_vars_per_channel_gradient',
     'tf.feature_column.input_layer': 'tf.compat.v1.feature_column.input_layer',
     'tf.feature_column.linear_model': 'tf.compat.v1.feature_column.linear_model',
+    'tf.feature_column.shared_embedding_columns': 'tf.compat.v1.feature_column.shared_embedding_columns',
     'tf.fft': 'tf.signal.fft',
     'tf.fft2d': 'tf.signal.fft2d',
     'tf.fft3d': 'tf.signal.fft3d',
@@ -197,12 +206,11 @@ renames = {
     'tf.get_variable': 'tf.compat.v1.get_variable',
     'tf.get_variable_scope': 'tf.compat.v1.get_variable_scope',
     'tf.gfile.FastGFile': 'tf.compat.v1.gfile.FastGFile',
-    'tf.gfile.GFile': 'tf.compat.v1.gfile.GFile',
-    'tf.gfile.Open': 'tf.compat.v1.gfile.Open',
     'tf.global_norm': 'tf.linalg.global_norm',
     'tf.global_variables': 'tf.compat.v1.global_variables',
     'tf.global_variables_initializer': 'tf.compat.v1.global_variables_initializer',
-    'tf.glorot_normal_initializer': 'tf.keras.initializers.glorot_normal',
+    'tf.glorot_normal_initializer': 'tf.compat.v1.glorot_normal_initializer',
+    'tf.glorot_uniform_initializer': 'tf.compat.v1.glorot_uniform_initializer',
     'tf.graph_util.convert_variables_to_constants': 'tf.compat.v1.graph_util.convert_variables_to_constants',
     'tf.graph_util.extract_sub_graph': 'tf.compat.v1.graph_util.extract_sub_graph',
     'tf.graph_util.must_run_on_cpu': 'tf.compat.v1.graph_util.must_run_on_cpu',
@@ -217,18 +225,37 @@ renames = {
     'tf.image.resize_area': 'tf.compat.v1.image.resize_area',
     'tf.image.resize_bicubic': 'tf.compat.v1.image.resize_bicubic',
     'tf.image.resize_bilinear': 'tf.compat.v1.image.resize_bilinear',
-    'tf.image.resize_images': 'tf.compat.v1.image.resize_images',
     'tf.image.resize_nearest_neighbor': 'tf.compat.v1.image.resize_nearest_neighbor',
     'tf.image.transpose_image': 'tf.compat.v1.image.transpose_image',
     'tf.initialize_all_tables': 'tf.compat.v1.initialize_all_tables',
     'tf.initialize_all_variables': 'tf.compat.v1.initialize_all_variables',
     'tf.initialize_local_variables': 'tf.compat.v1.initialize_local_variables',
     'tf.initialize_variables': 'tf.compat.v1.initialize_variables',
+    'tf.initializers.constant': 'tf.compat.v1.initializers.constant',
     'tf.initializers.global_variables': 'tf.compat.v1.initializers.global_variables',
+    'tf.initializers.glorot_normal': 'tf.compat.v1.initializers.glorot_normal',
+    'tf.initializers.glorot_uniform': 'tf.compat.v1.initializers.glorot_uniform',
+    'tf.initializers.he_normal': 'tf.compat.v1.initializers.he_normal',
+    'tf.initializers.he_uniform': 'tf.compat.v1.initializers.he_uniform',
+    'tf.initializers.identity': 'tf.compat.v1.initializers.identity',
+    'tf.initializers.lecun_normal': 'tf.compat.v1.initializers.lecun_normal',
+    'tf.initializers.lecun_uniform': 'tf.compat.v1.initializers.lecun_uniform',
     'tf.initializers.local_variables': 'tf.compat.v1.initializers.local_variables',
+    'tf.initializers.ones': 'tf.compat.v1.initializers.ones',
+    'tf.initializers.orthogonal': 'tf.compat.v1.initializers.orthogonal',
+    'tf.initializers.random_normal': 'tf.compat.v1.initializers.random_normal',
+    'tf.initializers.random_uniform': 'tf.compat.v1.initializers.random_uniform',
     'tf.initializers.tables_initializer': 'tf.compat.v1.initializers.tables_initializer',
+    'tf.initializers.truncated_normal': 'tf.compat.v1.initializers.truncated_normal',
+    'tf.initializers.uniform_unit_scaling': 'tf.compat.v1.initializers.uniform_unit_scaling',
     'tf.initializers.variables': 'tf.compat.v1.initializers.variables',
+    'tf.initializers.variance_scaling': 'tf.compat.v1.initializers.variance_scaling',
+    'tf.initializers.zeros': 'tf.compat.v1.initializers.zeros',
     'tf.invert_permutation': 'tf.math.invert_permutation',
+    'tf.io.PaddingFIFOQueue': 'tf.queue.PaddingFIFOQueue',
+    'tf.io.PriorityQueue': 'tf.queue.PriorityQueue',
+    'tf.io.QueueBase': 'tf.queue.QueueBase',
+    'tf.io.RandomShuffleQueue': 'tf.queue.RandomShuffleQueue',
     'tf.io.tf_record_iterator': 'tf.compat.v1.io.tf_record_iterator',
     'tf.is_finite': 'tf.math.is_finite',
     'tf.is_inf': 'tf.math.is_inf',
@@ -237,7 +264,23 @@ renames = {
     'tf.is_numeric_tensor': 'tf.debugging.is_numeric_tensor',
     'tf.is_strictly_increasing': 'tf.math.is_strictly_increasing',
     'tf.is_variable_initialized': 'tf.compat.v1.is_variable_initialized',
-    'tf.keras.backend.get_session': 'tf.compat.v1.keras.backend.get_session',
+    'tf.keras.initializers.Identity': 'tf.compat.v1.keras.initializers.Identity',
+    'tf.keras.initializers.Orthogonal': 'tf.compat.v1.keras.initializers.Orthogonal',
+    'tf.keras.initializers.TruncatedNormal': 'tf.compat.v1.keras.initializers.TruncatedNormal',
+    'tf.keras.initializers.VarianceScaling': 'tf.compat.v1.keras.initializers.VarianceScaling',
+    'tf.keras.initializers.glorot_normal': 'tf.compat.v1.keras.initializers.glorot_normal',
+    'tf.keras.initializers.glorot_uniform': 'tf.compat.v1.keras.initializers.glorot_uniform',
+    'tf.keras.initializers.he_normal': 'tf.compat.v1.keras.initializers.he_normal',
+    'tf.keras.initializers.he_uniform': 'tf.compat.v1.keras.initializers.he_uniform',
+    'tf.keras.initializers.identity': 'tf.compat.v1.keras.initializers.identity',
+    'tf.keras.initializers.lecun_normal': 'tf.compat.v1.keras.initializers.lecun_normal',
+    'tf.keras.initializers.lecun_uniform': 'tf.compat.v1.keras.initializers.lecun_uniform',
+    'tf.keras.initializers.normal': 'tf.compat.v1.keras.initializers.normal',
+    'tf.keras.initializers.orthogonal': 'tf.compat.v1.keras.initializers.orthogonal',
+    'tf.keras.initializers.random_normal': 'tf.compat.v1.keras.initializers.random_normal',
+    'tf.keras.initializers.random_uniform': 'tf.compat.v1.keras.initializers.random_uniform',
+    'tf.keras.initializers.truncated_normal': 'tf.compat.v1.keras.initializers.truncated_normal',
+    'tf.keras.initializers.uniform': 'tf.compat.v1.keras.initializers.uniform',
     'tf.layers.AveragePooling1D': 'tf.compat.v1.layers.AveragePooling1D',
     'tf.layers.AveragePooling2D': 'tf.compat.v1.layers.AveragePooling2D',
     'tf.layers.AveragePooling3D': 'tf.compat.v1.layers.AveragePooling3D',
@@ -279,6 +322,8 @@ renames = {
     'tf.lbeta': 'tf.math.lbeta',
     'tf.lgamma': 'tf.math.lgamma',
     'tf.lin_space': 'tf.linspace',
+    'tf.lite.TocoConverter': 'tf.compat.v1.lite.TocoConverter',
+    'tf.lite.toco_convert': 'tf.compat.v1.lite.toco_convert',
     'tf.local_variables': 'tf.compat.v1.local_variables',
     'tf.local_variables_initializer': 'tf.compat.v1.local_variables_initializer',
     'tf.log': 'tf.math.log',
@@ -305,9 +350,15 @@ renames = {
     'tf.logging.warn': 'tf.compat.v1.logging.warn',
     'tf.logging.warning': 'tf.compat.v1.logging.warning',
     'tf.logical_xor': 'tf.math.logical_xor',
+    'tf.losses.Reduction': 'tf.compat.v1.losses.Reduction',
     'tf.losses.absolute_difference': 'tf.compat.v1.losses.absolute_difference',
+    'tf.losses.add_loss': 'tf.compat.v1.losses.add_loss',
     'tf.losses.compute_weighted_loss': 'tf.compat.v1.losses.compute_weighted_loss',
     'tf.losses.cosine_distance': 'tf.compat.v1.losses.cosine_distance',
+    'tf.losses.get_losses': 'tf.compat.v1.losses.get_losses',
+    'tf.losses.get_regularization_loss': 'tf.compat.v1.losses.get_regularization_loss',
+    'tf.losses.get_regularization_losses': 'tf.compat.v1.losses.get_regularization_losses',
+    'tf.losses.get_total_loss': 'tf.compat.v1.losses.get_total_loss',
     'tf.losses.hinge_loss': 'tf.compat.v1.losses.hinge_loss',
     'tf.losses.huber_loss': 'tf.compat.v1.losses.huber_loss',
     'tf.losses.log_loss': 'tf.compat.v1.losses.log_loss',
@@ -372,8 +423,11 @@ renames = {
     'tf.min_max_variable_partitioner': 'tf.compat.v1.min_max_variable_partitioner',
     'tf.model_variables': 'tf.compat.v1.model_variables',
     'tf.moving_average_variables': 'tf.compat.v1.moving_average_variables',
+    'tf.nn.avg_pool_v2': 'tf.nn.avg_pool',
     'tf.nn.bidirectional_dynamic_rnn': 'tf.compat.v1.nn.bidirectional_dynamic_rnn',
-    'tf.nn.conv3d_backprop_filter_v2': 'tf.nn.conv3d_backprop_filter',
+    'tf.nn.conv2d_backprop_filter': 'tf.compat.v1.nn.conv2d_backprop_filter',
+    'tf.nn.conv3d_backprop_filter': 'tf.compat.v1.nn.conv3d_backprop_filter',
+    'tf.nn.conv3d_backprop_filter_v2': 'tf.compat.v1.nn.conv3d_backprop_filter_v2',
     'tf.nn.ctc_beam_search_decoder_v2': 'tf.nn.ctc_beam_search_decoder',
     'tf.nn.ctc_loss_v2': 'tf.nn.ctc_loss',
     'tf.nn.depthwise_conv2d_native': 'tf.compat.v1.nn.depthwise_conv2d_native',
@@ -381,6 +435,7 @@ renames = {
     'tf.nn.depthwise_conv2d_native_backprop_input': 'tf.nn.depthwise_conv2d_backprop_input',
     'tf.nn.dynamic_rnn': 'tf.compat.v1.nn.dynamic_rnn',
     'tf.nn.log_uniform_candidate_sampler': 'tf.random.log_uniform_candidate_sampler',
+    'tf.nn.max_pool_v2': 'tf.nn.max_pool',
     'tf.nn.quantized_avg_pool': 'tf.compat.v1.nn.quantized_avg_pool',
     'tf.nn.quantized_conv2d': 'tf.compat.v1.nn.quantized_conv2d',
     'tf.nn.quantized_max_pool': 'tf.compat.v1.nn.quantized_max_pool',
@@ -389,15 +444,17 @@ renames = {
     'tf.nn.relu_layer': 'tf.compat.v1.nn.relu_layer',
     'tf.nn.rnn_cell.BasicLSTMCell': 'tf.compat.v1.nn.rnn_cell.BasicLSTMCell',
     'tf.nn.rnn_cell.BasicRNNCell': 'tf.compat.v1.nn.rnn_cell.BasicRNNCell',
+    'tf.nn.rnn_cell.DropoutWrapper': 'tf.compat.v1.nn.rnn_cell.DropoutWrapper',
     'tf.nn.rnn_cell.GRUCell': 'tf.compat.v1.nn.rnn_cell.GRUCell',
     'tf.nn.rnn_cell.LSTMCell': 'tf.compat.v1.nn.rnn_cell.LSTMCell',
     'tf.nn.rnn_cell.MultiRNNCell': 'tf.compat.v1.nn.rnn_cell.MultiRNNCell',
+    'tf.nn.rnn_cell.ResidualWrapper': 'tf.compat.v1.nn.rnn_cell.ResidualWrapper',
     'tf.nn.static_bidirectional_rnn': 'tf.compat.v1.nn.static_bidirectional_rnn',
     'tf.nn.static_rnn': 'tf.compat.v1.nn.static_rnn',
     'tf.nn.uniform_candidate_sampler': 'tf.random.uniform_candidate_sampler',
     'tf.nn.xw_plus_b': 'tf.compat.v1.nn.xw_plus_b',
     'tf.op_scope': 'tf.compat.v1.op_scope',
-    'tf.orthogonal_initializer': 'tf.keras.initializers.Orthogonal',
+    'tf.orthogonal_initializer': 'tf.compat.v1.orthogonal_initializer',
     'tf.parse_single_sequence_example': 'tf.io.parse_single_sequence_example',
     'tf.parse_tensor': 'tf.io.parse_tensor',
     'tf.placeholder': 'tf.compat.v1.placeholder',
@@ -420,12 +477,13 @@ renames = {
     'tf.qr': 'tf.linalg.qr',
     'tf.quantize': 'tf.quantization.quantize',
     'tf.quantized_concat': 'tf.quantization.quantized_concat',
+    'tf.ragged.RaggedTensorValue': 'tf.compat.v1.ragged.RaggedTensorValue',
+    'tf.ragged.constant_value': 'tf.compat.v1.ragged.constant_value',
     'tf.random.get_seed': 'tf.compat.v1.random.get_seed',
     'tf.random.set_random_seed': 'tf.compat.v1.random.set_random_seed',
     'tf.random_crop': 'tf.image.random_crop',
     'tf.random_gamma': 'tf.random.gamma',
     'tf.random_normal': 'tf.random.normal',
-    'tf.random_poisson': 'tf.compat.v1.random_poisson',
     'tf.random_shuffle': 'tf.random.shuffle',
     'tf.random_uniform': 'tf.random.uniform',
     'tf.read_file': 'tf.io.read_file',
@@ -445,9 +503,10 @@ renames = {
     'tf.saved_model.Builder': 'tf.compat.v1.saved_model.Builder',
     'tf.saved_model.LEGACY_INIT_OP_KEY': 'tf.compat.v1.saved_model.LEGACY_INIT_OP_KEY',
     'tf.saved_model.MAIN_OP_KEY': 'tf.compat.v1.saved_model.MAIN_OP_KEY',
-    'tf.saved_model.TRAINING': 'tf.saved_model.TRANING',
+    'tf.saved_model.build_signature_def': 'tf.compat.v1.saved_model.build_signature_def',
     'tf.saved_model.build_tensor_info': 'tf.compat.v1.saved_model.build_tensor_info',
     'tf.saved_model.builder.SavedModelBuilder': 'tf.compat.v1.saved_model.builder.SavedModelBuilder',
+    'tf.saved_model.classification_signature_def': 'tf.compat.v1.saved_model.classification_signature_def',
     'tf.saved_model.constants.ASSETS_DIRECTORY': 'tf.saved_model.ASSETS_DIRECTORY',
     'tf.saved_model.constants.ASSETS_KEY': 'tf.saved_model.ASSETS_KEY',
     'tf.saved_model.constants.LEGACY_INIT_OP_KEY': 'tf.compat.v1.saved_model.constants.LEGACY_INIT_OP_KEY',
@@ -459,6 +518,7 @@ renames = {
     'tf.saved_model.constants.VARIABLES_FILENAME': 'tf.saved_model.VARIABLES_FILENAME',
     'tf.saved_model.experimental.save': 'tf.saved_model.save',
     'tf.saved_model.get_tensor_from_tensor_info': 'tf.compat.v1.saved_model.get_tensor_from_tensor_info',
+    'tf.saved_model.is_valid_signature': 'tf.compat.v1.saved_model.is_valid_signature',
     'tf.saved_model.load': 'tf.compat.v1.saved_model.load',
     'tf.saved_model.loader.load': 'tf.compat.v1.saved_model.loader.load',
     'tf.saved_model.loader.maybe_saved_model_directory': 'tf.compat.v1.saved_model.loader.maybe_saved_model_directory',
@@ -466,6 +526,8 @@ renames = {
     'tf.saved_model.main_op.main_op_with_restore': 'tf.compat.v1.saved_model.main_op.main_op_with_restore',
     'tf.saved_model.main_op_with_restore': 'tf.compat.v1.saved_model.main_op_with_restore',
     'tf.saved_model.maybe_saved_model_directory': 'tf.compat.v1.saved_model.maybe_saved_model_directory',
+    'tf.saved_model.predict_signature_def': 'tf.compat.v1.saved_model.predict_signature_def',
+    'tf.saved_model.regression_signature_def': 'tf.compat.v1.saved_model.regression_signature_def',
     'tf.saved_model.signature_constants.CLASSIFY_INPUTS': 'tf.saved_model.CLASSIFY_INPUTS',
     'tf.saved_model.signature_constants.CLASSIFY_METHOD_NAME': 'tf.saved_model.CLASSIFY_METHOD_NAME',
     'tf.saved_model.signature_constants.CLASSIFY_OUTPUT_CLASSES': 'tf.saved_model.CLASSIFY_OUTPUT_CLASSES',
@@ -477,16 +539,16 @@ renames = {
     'tf.saved_model.signature_constants.REGRESS_INPUTS': 'tf.saved_model.REGRESS_INPUTS',
     'tf.saved_model.signature_constants.REGRESS_METHOD_NAME': 'tf.saved_model.REGRESS_METHOD_NAME',
     'tf.saved_model.signature_constants.REGRESS_OUTPUTS': 'tf.saved_model.REGRESS_OUTPUTS',
-    'tf.saved_model.signature_def_utils.build_signature_def': 'tf.saved_model.build_signature_def',
-    'tf.saved_model.signature_def_utils.classification_signature_def': 'tf.saved_model.classification_signature_def',
-    'tf.saved_model.signature_def_utils.is_valid_signature': 'tf.saved_model.is_valid_signature',
-    'tf.saved_model.signature_def_utils.predict_signature_def': 'tf.saved_model.predict_signature_def',
-    'tf.saved_model.signature_def_utils.regression_signature_def': 'tf.saved_model.regression_signature_def',
+    'tf.saved_model.signature_def_utils.build_signature_def': 'tf.compat.v1.saved_model.signature_def_utils.build_signature_def',
+    'tf.saved_model.signature_def_utils.classification_signature_def': 'tf.compat.v1.saved_model.signature_def_utils.classification_signature_def',
+    'tf.saved_model.signature_def_utils.is_valid_signature': 'tf.compat.v1.saved_model.signature_def_utils.is_valid_signature',
+    'tf.saved_model.signature_def_utils.predict_signature_def': 'tf.compat.v1.saved_model.signature_def_utils.predict_signature_def',
+    'tf.saved_model.signature_def_utils.regression_signature_def': 'tf.compat.v1.saved_model.signature_def_utils.regression_signature_def',
     'tf.saved_model.simple_save': 'tf.compat.v1.saved_model.simple_save',
     'tf.saved_model.tag_constants.GPU': 'tf.saved_model.GPU',
     'tf.saved_model.tag_constants.SERVING': 'tf.saved_model.SERVING',
     'tf.saved_model.tag_constants.TPU': 'tf.saved_model.TPU',
-    'tf.saved_model.tag_constants.TRAINING': 'tf.saved_model.TRANING',
+    'tf.saved_model.tag_constants.TRAINING': 'tf.saved_model.TRAINING',
     'tf.saved_model.utils.build_tensor_info': 'tf.compat.v1.saved_model.utils.build_tensor_info',
     'tf.saved_model.utils.get_tensor_from_tensor_info': 'tf.compat.v1.saved_model.utils.get_tensor_from_tensor_info',
     'tf.scatter_add': 'tf.compat.v1.scatter_add',
@@ -523,9 +585,7 @@ renames = {
     'tf.sparse_merge': 'tf.compat.v1.sparse_merge',
     'tf.sparse_minimum': 'tf.sparse.minimum',
     'tf.sparse_placeholder': 'tf.compat.v1.sparse_placeholder',
-    'tf.sparse_reduce_max': 'tf.compat.v1.sparse_reduce_max',
     'tf.sparse_reduce_max_sparse': 'tf.compat.v1.sparse_reduce_max_sparse',
-    'tf.sparse_reduce_sum': 'tf.compat.v1.sparse_reduce_sum',
     'tf.sparse_reduce_sum_sparse': 'tf.compat.v1.sparse_reduce_sum_sparse',
     'tf.sparse_reorder': 'tf.sparse.reorder',
     'tf.sparse_reset_shape': 'tf.sparse.reset_shape',
@@ -560,11 +620,18 @@ renames = {
     'tf.string_strip': 'tf.strings.strip',
     'tf.string_to_hash_bucket_fast': 'tf.strings.to_hash_bucket_fast',
     'tf.string_to_hash_bucket_strong': 'tf.strings.to_hash_bucket_strong',
+    'tf.summary.Event': 'tf.compat.v1.summary.Event',
+    'tf.summary.FileWriter': 'tf.compat.v1.summary.FileWriter',
+    'tf.summary.FileWriterCache': 'tf.compat.v1.summary.FileWriterCache',
     'tf.summary.SessionLog': 'tf.compat.v1.summary.SessionLog',
+    'tf.summary.Summary': 'tf.compat.v1.summary.Summary',
+    'tf.summary.SummaryDescription': 'tf.compat.v1.summary.SummaryDescription',
+    'tf.summary.TaggedRunMetadata': 'tf.compat.v1.summary.TaggedRunMetadata',
     'tf.summary.audio': 'tf.compat.v1.summary.audio',
     'tf.summary.get_summary_description': 'tf.compat.v1.summary.get_summary_description',
     'tf.summary.histogram': 'tf.compat.v1.summary.histogram',
     'tf.summary.image': 'tf.compat.v1.summary.image',
+    'tf.summary.initialize': 'tf.compat.v1.summary.initialize',
     'tf.summary.merge': 'tf.compat.v1.summary.merge',
     'tf.summary.merge_all': 'tf.compat.v1.summary.merge_all',
     'tf.summary.scalar': 'tf.compat.v1.summary.scalar',
@@ -609,17 +676,20 @@ renames = {
     'tf.train.Optimizer': 'tf.compat.v1.train.Optimizer',
     'tf.train.ProfilerHook': 'tf.estimator.ProfilerHook',
     'tf.train.ProximalAdagradOptimizer': 'tf.compat.v1.train.ProximalAdagradOptimizer',
+    'tf.train.ProximalGradientDescentOptimizer': 'tf.compat.v1.train.ProximalGradientDescentOptimizer',
     'tf.train.QueueRunner': 'tf.compat.v1.train.QueueRunner',
     'tf.train.RMSPropOptimizer': 'tf.compat.v1.train.RMSPropOptimizer',
     'tf.train.Saver': 'tf.compat.v1.train.Saver',
     'tf.train.SaverDef': 'tf.compat.v1.train.SaverDef',
     'tf.train.Scaffold': 'tf.compat.v1.train.Scaffold',
     'tf.train.SecondOrStepTimer': 'tf.estimator.SecondOrStepTimer',
+    'tf.train.Server': 'tf.distribute.Server',
     'tf.train.SessionCreator': 'tf.compat.v1.train.SessionCreator',
     'tf.train.SessionManager': 'tf.compat.v1.train.SessionManager',
-    'tf.train.SessionRunArgs': 'tf.compat.v1.train.SessionRunArgs',
-    'tf.train.SessionRunContext': 'tf.compat.v1.train.SessionRunContext',
-    'tf.train.SessionRunValues': 'tf.compat.v1.train.SessionRunValues',
+    'tf.train.SessionRunArgs': 'tf.estimator.SessionRunArgs',
+    'tf.train.SessionRunContext': 'tf.estimator.SessionRunContext',
+    'tf.train.SessionRunHook': 'tf.estimator.SessionRunHook',
+    'tf.train.SessionRunValues': 'tf.estimator.SessionRunValues',
     'tf.train.SingularMonitoredSession': 'tf.compat.v1.train.SingularMonitoredSession',
     'tf.train.StepCounterHook': 'tf.estimator.StepCounterHook',
     'tf.train.StopAtStepHook': 'tf.estimator.StopAtStepHook',
@@ -634,8 +704,11 @@ renames = {
     'tf.train.batch': 'tf.compat.v1.train.batch',
     'tf.train.batch_join': 'tf.compat.v1.train.batch_join',
     'tf.train.checkpoint_exists': 'tf.compat.v1.train.checkpoint_exists',
+    'tf.train.cosine_decay': 'tf.compat.v1.train.cosine_decay',
+    'tf.train.cosine_decay_restarts': 'tf.compat.v1.train.cosine_decay_restarts',
     'tf.train.create_global_step': 'tf.compat.v1.train.create_global_step',
     'tf.train.do_quantize_training_on_graphdef': 'tf.compat.v1.train.do_quantize_training_on_graphdef',
+    'tf.train.exponential_decay': 'tf.compat.v1.train.exponential_decay',
     'tf.train.export_meta_graph': 'tf.compat.v1.train.export_meta_graph',
     'tf.train.generate_checkpoint_state_proto': 'tf.compat.v1.train.generate_checkpoint_state_proto',
     'tf.train.get_checkpoint_mtimes': 'tf.compat.v1.train.get_checkpoint_mtimes',
@@ -645,13 +718,19 @@ renames = {
     'tf.train.import_meta_graph': 'tf.compat.v1.train.import_meta_graph',
     'tf.train.init_from_checkpoint': 'tf.compat.v1.train.init_from_checkpoint',
     'tf.train.input_producer': 'tf.compat.v1.train.input_producer',
+    'tf.train.inverse_time_decay': 'tf.compat.v1.train.inverse_time_decay',
     'tf.train.limit_epochs': 'tf.compat.v1.train.limit_epochs',
+    'tf.train.linear_cosine_decay': 'tf.compat.v1.train.linear_cosine_decay',
     'tf.train.match_filenames_once': 'tf.io.match_filenames_once',
     'tf.train.maybe_batch': 'tf.compat.v1.train.maybe_batch',
     'tf.train.maybe_batch_join': 'tf.compat.v1.train.maybe_batch_join',
     'tf.train.maybe_shuffle_batch': 'tf.compat.v1.train.maybe_shuffle_batch',
     'tf.train.maybe_shuffle_batch_join': 'tf.compat.v1.train.maybe_shuffle_batch_join',
+    'tf.train.natural_exp_decay': 'tf.compat.v1.train.natural_exp_decay',
+    'tf.train.noisy_linear_cosine_decay': 'tf.compat.v1.train.noisy_linear_cosine_decay',
     'tf.train.piecewise_constant': 'tf.compat.v1.train.piecewise_constant',
+    'tf.train.piecewise_constant_decay': 'tf.compat.v1.train.piecewise_constant_decay',
+    'tf.train.polynomial_decay': 'tf.compat.v1.train.polynomial_decay',
     'tf.train.queue_runner.QueueRunner': 'tf.compat.v1.train.queue_runner.QueueRunner',
     'tf.train.queue_runner.add_queue_runner': 'tf.compat.v1.train.queue_runner.add_queue_runner',
     'tf.train.queue_runner.start_queue_runners': 'tf.compat.v1.train.queue_runner.start_queue_runners',
@@ -663,12 +742,13 @@ renames = {
     'tf.train.slice_input_producer': 'tf.compat.v1.train.slice_input_producer',
     'tf.train.start_queue_runners': 'tf.compat.v1.train.start_queue_runners',
     'tf.train.string_input_producer': 'tf.compat.v1.train.string_input_producer',
+    'tf.train.summary_iterator': 'tf.compat.v1.train.summary_iterator',
     'tf.train.update_checkpoint_state': 'tf.compat.v1.train.update_checkpoint_state',
     'tf.train.warm_start': 'tf.compat.v1.train.warm_start',
     'tf.train.write_graph': 'tf.io.write_graph',
     'tf.trainable_variables': 'tf.compat.v1.trainable_variables',
     'tf.truncated_normal': 'tf.random.truncated_normal',
-    'tf.uniform_unit_scaling_initializer': 'tf.initializers.uniform_unit_scaling',
+    'tf.uniform_unit_scaling_initializer': 'tf.compat.v1.uniform_unit_scaling_initializer',
     'tf.unsorted_segment_max': 'tf.math.unsorted_segment_max',
     'tf.unsorted_segment_mean': 'tf.math.unsorted_segment_mean',
     'tf.unsorted_segment_min': 'tf.math.unsorted_segment_min',
@@ -679,7 +759,7 @@ renames = {
     'tf.variable_op_scope': 'tf.compat.v1.variable_op_scope',
     'tf.variable_scope': 'tf.compat.v1.variable_scope',
     'tf.variables_initializer': 'tf.compat.v1.variables_initializer',
-    'tf.variance_scaling_initializer': 'tf.keras.initializers.VarianceScaling',
+    'tf.variance_scaling_initializer': 'tf.compat.v1.variance_scaling_initializer',
     'tf.verify_tensor_all_finite': 'tf.compat.v1.verify_tensor_all_finite',
     'tf.wrap_function': 'tf.compat.v1.wrap_function',
     'tf.write_file': 'tf.io.write_file',
diff --git a/tensorflow/tools/compatibility/reorders_v2.py b/tensorflow/tools/compatibility/reorders_v2.py
index 44494ac148cb878d500ef504eae8a6c388cc89df..14c2d0c09f953c73000c1595012423176fa48999 100644
--- a/tensorflow/tools/compatibility/reorders_v2.py
+++ b/tensorflow/tools/compatibility/reorders_v2.py
@@ -28,13 +28,17 @@ from __future__ import print_function
 reorders = {
     'tf.argmax': ['input', 'axis', 'name', 'dimension', 'output_type'],
     'tf.argmin': ['input', 'axis', 'name', 'dimension', 'output_type'],
-    'tf.batch_to_space': ['input', 'crops', 'block_size', 'name'],
+    'tf.batch_to_space': ['input', 'crops', 'block_size', 'name', 'block_shape'],
     'tf.boolean_mask': ['tensor', 'mask', 'name', 'axis'],
+    'tf.cond': ['pred', 'true_fn', 'false_fn', 'strict', 'name', 'fn1', 'fn2'],
     'tf.confusion_matrix': ['labels', 'predictions', 'num_classes', 'dtype', 'name', 'weights'],
-    'tf.convert_to_tensor': ['value', 'dtype', 'name', 'preferred_dtype'],
+    'tf.convert_to_tensor': ['value', 'dtype', 'name', 'preferred_dtype', 'dtype_hint'],
     'tf.decode_csv': ['records', 'record_defaults', 'field_delim', 'use_quote_delim', 'name', 'na_value', 'select_cols'],
     'tf.depth_to_space': ['input', 'block_size', 'name', 'data_format'],
     'tf.feature_column.categorical_column_with_vocabulary_file': ['key', 'vocabulary_file', 'vocabulary_size', 'num_oov_buckets', 'default_value', 'dtype'],
+    'tf.gradients': ['ys', 'xs', 'grad_ys', 'name', 'colocate_gradients_with_ops', 'gate_gradients', 'aggregation_method', 'stop_gradients', 'unconnected_gradients'],
+    'tf.hessians': ['ys', 'xs', 'name', 'colocate_gradients_with_ops', 'gate_gradients', 'aggregation_method'],
+    'tf.image.sample_distorted_bounding_box': ['image_size', 'bounding_boxes', 'seed', 'seed2', 'min_object_covered', 'aspect_ratio_range', 'area_range', 'max_attempts', 'use_image_if_no_bounding_boxes', 'name'],
     'tf.io.decode_csv': ['records', 'record_defaults', 'field_delim', 'use_quote_delim', 'name', 'na_value', 'select_cols'],
     'tf.io.parse_example': ['serialized', 'features', 'name', 'example_names'],
     'tf.io.parse_single_example': ['serialized', 'features', 'name', 'example_names'],
@@ -54,19 +58,28 @@ reorders = {
     'tf.math.reduce_prod': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
     'tf.math.reduce_sum': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
     'tf.multinomial': ['logits', 'num_samples', 'seed', 'name', 'output_dtype'],
-    'tf.nn.convolution': ['input', 'filter', 'padding', 'strides', 'dilation_rate', 'name', 'data_format'],
+    'tf.nn.avg_pool': ['value', 'ksize', 'strides', 'padding', 'data_format', 'name', 'input'],
+    'tf.nn.conv1d': ['value', 'filters', 'stride', 'padding', 'use_cudnn_on_gpu', 'data_format', 'name', 'input', 'dilations'],
+    'tf.nn.conv2d': ['input', 'filter', 'strides', 'padding', 'use_cudnn_on_gpu', 'data_format', 'dilations', 'name', 'filters'],
+    'tf.nn.conv2d_backprop_input': ['input_sizes', 'filter', 'out_backprop', 'strides', 'padding', 'use_cudnn_on_gpu', 'data_format', 'dilations', 'name', 'filters'],
+    'tf.nn.convolution': ['input', 'filter', 'padding', 'strides', 'dilation_rate', 'name', 'data_format', 'filters', 'dilations'],
     'tf.nn.crelu': ['features', 'name', 'axis'],
+    'tf.nn.ctc_beam_search_decoder': ['inputs', 'sequence_length', 'beam_width', 'top_paths', 'merge_repeated'],
     'tf.nn.depth_to_space': ['input', 'block_size', 'name', 'data_format'],
-    'tf.nn.depthwise_conv2d': ['input', 'filter', 'strides', 'padding', 'rate', 'name', 'data_format'],
+    'tf.nn.depthwise_conv2d': ['input', 'filter', 'strides', 'padding', 'rate', 'name', 'data_format', 'dilations'],
     'tf.nn.embedding_lookup': ['params', 'ids', 'partition_strategy', 'name', 'validate_indices', 'max_norm'],
     'tf.nn.embedding_lookup_sparse': ['params', 'sp_ids', 'sp_weights', 'partition_strategy', 'name', 'combiner', 'max_norm'],
+    'tf.nn.fractional_avg_pool': ['value', 'pooling_ratio', 'pseudo_random', 'overlapping', 'deterministic', 'seed', 'seed2', 'name'],
+    'tf.nn.fractional_max_pool': ['value', 'pooling_ratio', 'pseudo_random', 'overlapping', 'deterministic', 'seed', 'seed2', 'name'],
     'tf.nn.in_top_k': ['predictions', 'targets', 'k', 'name'],
-    'tf.nn.moments': ['x', 'axes', 'shift', 'name', 'keep_dims'],
-    'tf.nn.pool': ['input', 'window_shape', 'pooling_type', 'padding', 'dilation_rate', 'strides', 'name', 'data_format'],
-    'tf.nn.separable_conv2d': ['input', 'depthwise_filter', 'pointwise_filter', 'strides', 'padding', 'rate', 'name', 'data_format'],
-    'tf.nn.space_to_batch': ['input', 'paddings', 'block_size', 'name'],
+    'tf.nn.max_pool': ['value', 'ksize', 'strides', 'padding', 'data_format', 'name', 'input'],
+    'tf.nn.moments': ['x', 'axes', 'shift', 'name', 'keep_dims', 'keepdims'],
+    'tf.nn.pool': ['input', 'window_shape', 'pooling_type', 'padding', 'dilation_rate', 'strides', 'name', 'data_format', 'dilations'],
+    'tf.nn.separable_conv2d': ['input', 'depthwise_filter', 'pointwise_filter', 'strides', 'padding', 'rate', 'name', 'data_format', 'dilations'],
+    'tf.nn.softmax_cross_entropy_with_logits': ['_sentinel', 'labels', 'logits', 'dim', 'name', 'axis'],
+    'tf.nn.space_to_batch': ['input', 'paddings', 'block_size', 'name', 'block_shape'],
     'tf.nn.space_to_depth': ['input', 'block_size', 'name', 'data_format'],
-    'tf.nn.weighted_moments': ['x', 'axes', 'frequency_weights', 'name', 'keep_dims'],
+    'tf.nn.weighted_moments': ['x', 'axes', 'frequency_weights', 'name', 'keep_dims', 'keepdims'],
     'tf.norm': ['tensor', 'ord', 'axis', 'keepdims', 'name', 'keep_dims'],
     'tf.pad': ['tensor', 'paddings', 'mode', 'name', 'constant_values'],
     'tf.parse_example': ['serialized', 'features', 'name', 'example_names'],
@@ -77,7 +90,7 @@ reorders = {
     'tf.random_poisson': ['lam', 'shape', 'dtype', 'seed', 'name'],
     'tf.reduce_all': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
     'tf.reduce_any': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
-    'tf.reduce_join': ['inputs', 'axis', 'keep_dims', 'separator', 'name', 'reduction_indices'],
+    'tf.reduce_join': ['inputs', 'axis', 'keep_dims', 'separator', 'name', 'reduction_indices', 'keepdims'],
     'tf.reduce_logsumexp': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
     'tf.reduce_max': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
     'tf.reduce_mean': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
@@ -89,17 +102,17 @@ reorders = {
     'tf.serialize_sparse': ['sp_input', 'name', 'out_type'],
     'tf.shape': ['input', 'name', 'out_type'],
     'tf.size': ['input', 'name', 'out_type'],
-    'tf.space_to_batch': ['input', 'paddings', 'block_size', 'name'],
+    'tf.space_to_batch': ['input', 'paddings', 'block_size', 'name', 'block_shape'],
     'tf.space_to_depth': ['input', 'block_size', 'name', 'data_format'],
     'tf.sparse.add': ['a', 'b', 'threshold', 'thresh'],
-    'tf.sparse.concat': ['axis', 'sp_inputs', 'name', 'expand_nonconcat_dim', 'concat_dim'],
+    'tf.sparse.concat': ['axis', 'sp_inputs', 'name', 'expand_nonconcat_dim', 'concat_dim', 'expand_nonconcat_dims'],
     'tf.sparse.reduce_max': ['sp_input', 'axis', 'keepdims', 'reduction_axes', 'keep_dims'],
     'tf.sparse.segment_mean': ['data', 'indices', 'segment_ids', 'name', 'num_segments'],
     'tf.sparse.segment_sqrt_n': ['data', 'indices', 'segment_ids', 'name', 'num_segments'],
     'tf.sparse.segment_sum': ['data', 'indices', 'segment_ids', 'name', 'num_segments'],
     'tf.sparse.split': ['keyword_required', 'sp_input', 'num_split', 'axis', 'name', 'split_dim'],
     'tf.sparse_add': ['a', 'b', 'threshold', 'thresh'],
-    'tf.sparse_concat': ['axis', 'sp_inputs', 'name', 'expand_nonconcat_dim', 'concat_dim'],
+    'tf.sparse_concat': ['axis', 'sp_inputs', 'name', 'expand_nonconcat_dim', 'concat_dim', 'expand_nonconcat_dims'],
     'tf.sparse_matmul': ['a', 'b', 'transpose_a', 'transpose_b', 'a_is_sparse', 'b_is_sparse', 'name'],
     'tf.sparse_reduce_max': ['sp_input', 'axis', 'keepdims', 'reduction_axes', 'keep_dims'],
     'tf.sparse_segment_mean': ['data', 'indices', 'segment_ids', 'name', 'num_segments'],
@@ -107,8 +120,10 @@ reorders = {
     'tf.sparse_segment_sum': ['data', 'indices', 'segment_ids', 'name', 'num_segments'],
     'tf.sparse_split': ['keyword_required', 'sp_input', 'num_split', 'axis', 'name', 'split_dim'],
     'tf.strings.length': ['input', 'name', 'unit'],
-    'tf.strings.reduce_join': ['inputs', 'axis', 'keep_dims', 'separator', 'name', 'reduction_indices'],
+    'tf.strings.reduce_join': ['inputs', 'axis', 'keep_dims', 'separator', 'name', 'reduction_indices', 'keepdims'],
     'tf.strings.substr': ['input', 'pos', 'len', 'name', 'unit'],
+    'tf.substr': ['input', 'pos', 'len', 'name', 'unit'],
+    'tf.test.assert_equal_graph_def': ['actual', 'expected', 'checkpoint_v2'],
     'tf.transpose': ['a', 'perm', 'name', 'conjugate'],
     'tf.tuple': ['tensors', 'name', 'control_inputs'],
     'tf.while_loop': ['cond', 'body', 'loop_vars', 'shape_invariants', 'parallel_iterations', 'back_prop', 'swap_memory', 'name', 'maximum_iterations', 'return_same_structure']
diff --git a/tensorflow/tools/compatibility/testdata/test_file_v1_12.py b/tensorflow/tools/compatibility/testdata/test_file_v1_12.py
index 5ce4dd49adc940dbc56e19915a188cdb6b8de1d1..2663762aa70253f54037393c0cb3cd791a040d56 100644
--- a/tensorflow/tools/compatibility/testdata/test_file_v1_12.py
+++ b/tensorflow/tools/compatibility/testdata/test_file_v1_12.py
@@ -70,6 +70,15 @@ class TestUpgrade(test_util.TensorFlowTestCase):
         [0],
         tf.argmin([[1, 3, 2]], name='abc', dimension=1))
 
+  @test_util.run_v1_only("b/120545219")
+  def testSoftmaxCrossEntropyWithLogits(self):
+    out = tf.nn.softmax_cross_entropy_with_logits(
+        logits=[0.1, 0.8], labels=[0, 1])
+    self.assertAllClose(out, 0.40318608)
+    out = tf.nn.softmax_cross_entropy_with_logits_v2(
+        logits=[0.1, 0.8], labels=[0, 1])
+    self.assertAllClose(out, 0.40318608)
+
 
 if __name__ == "__main__":
   test_lib.main()
diff --git a/tensorflow/tools/compatibility/tf_upgrade.py b/tensorflow/tools/compatibility/tf_upgrade.py
index 287d1a5483c32379da1dc651aba62a86a3f6d0f9..5dd548c8214992e95774c477f52ac6fc22b1fb4c 100644
--- a/tensorflow/tools/compatibility/tf_upgrade.py
+++ b/tensorflow/tools/compatibility/tf_upgrade.py
@@ -175,26 +175,16 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         "tf.op_scope": ["values", "name", "default_name"],
     }
 
-    # Specially handled functions.
-    self.function_handle = {"tf.reverse": self._reverse_handler}
-
     # Warnings that should be printed if corresponding functions are used.
-    self.function_warnings = {}
+    self.function_warnings = {
+        "tf.reverse": (
+            ast_edits.ERROR,
+            "tf.reverse has had its argument semantics changed "
+            "significantly. The converter cannot detect this reliably, so "
+            "you need to inspect this usage manually.\n"),
+    }
 
-  @staticmethod
-  def _reverse_handler(file_edit_recorder, node):
-    # TODO(aselle): Could check for a literal list of bools and try to convert
-    # them to indices.
-    comment = ("ERROR: tf.reverse has had its argument semantics changed "
-               "significantly the converter cannot detect this reliably, so "
-               "you need to inspect this usage manually.\n")
-    file_edit_recorder.add(
-        comment,
-        node.lineno,
-        node.col_offset,
-        "tf.reverse",
-        "tf.reverse",
-        error="tf.reverse requires manual check.")
+    self.module_deprecations = {}
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/tools/compatibility/tf_upgrade_test.py b/tensorflow/tools/compatibility/tf_upgrade_test.py
index 66325ea2ad36265c6c3779b414774abab8213a84..cf05575a9dd0cf6940a18e801fc76b667dbda233 100644
--- a/tensorflow/tools/compatibility/tf_upgrade_test.py
+++ b/tensorflow/tools/compatibility/tf_upgrade_test.py
@@ -112,7 +112,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
     text = "tf.reverse(a, b)\n"
     _, unused_report, errors, new_text = self._upgrade(text)
     self.assertEqual(new_text, new_text)
-    self.assertEqual(errors, ["test.py:1: tf.reverse requires manual check."])
+    self.assertIn("tf.reverse requires manual check", errors[0])
 
   def testListComprehension(self):
     def _test(input, output):  # pylint: disable=redefined-builtin
diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2.py b/tensorflow/tools/compatibility/tf_upgrade_v2.py
index ea86da42f6bbb8170c56d08e02ab38cf72acf3f7..c3ca68be6043530ab7034c148986335e1a52fb92 100644
--- a/tensorflow/tools/compatibility/tf_upgrade_v2.py
+++ b/tensorflow/tools/compatibility/tf_upgrade_v2.py
@@ -18,10 +18,19 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import ast
+import functools
+import sys
+
+import pasta
+
 from tensorflow.tools.compatibility import ast_edits
 from tensorflow.tools.compatibility import renames_v2
 from tensorflow.tools.compatibility import reorders_v2
 
+# These pylint warnings are a mistake.
+# pylint: disable=g-explicit-bool-comparison,g-bool-id-comparison
+
 
 class TFAPIChangeSpec(ast_edits.APIChangeSpec):
   """List of maps that describe what changed in the API."""
@@ -29,7 +38,36 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
   def __init__(self):
     # Maps from a function name to a dictionary that describes how to
     # map from an old argument keyword to the new argument keyword.
+    # If the new argument is None, it will be removed.
+    # Only keyword args are handled, so make sure to also put any function in
+    # function_reorders to ensure that all args are made into keywords first.
     self.function_keyword_renames = {
+        "tf.test.assert_equal_graph_def": {
+            "checkpoint_v2": None,
+        },
+        "tf.nn.embedding_lookup": {
+            "validate_indices": None,
+        },
+        "tf.image.sample_distorted_bounding_box": {
+            "seed2": None,
+        },
+        "tf.gradients": {
+            "colocate_gradients_with_ops": None,
+        },
+        "tf.hessians": {
+            "colocate_gradients_with_ops": None,
+        },
+        "*.minimize": {
+            "colocate_gradients_with_ops": None,
+        },
+        "*.compute_gradients": {
+            "colocate_gradients_with_ops": None,
+        },
+        "tf.cond": {
+            "strict": None,
+            "fn1": "true_fn",
+            "fn2": "false_fn"
+        },
         "tf.argmin": {
             "dimension": "axis",
         },
@@ -75,6 +113,10 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         "tf.convert_to_tensor": {
             "preferred_dtype": "dtype_hint"
         },
+        "tf.nn.softmax_cross_entropy_with_logits": {
+            "dim": "axis",
+            "_sentinel": None,
+        },
         "tf.nn.softmax_cross_entropy_with_logits_v2": {
             "dim": "axis"
         },
@@ -90,6 +132,11 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         "tf.load_file_system_library": {
             "library_filename": "library_location",
         },
+        "tf.count_nonzero": {
+            "input_tensor": "input",
+            "keep_dims": "keepdims",
+            "reduction_indices": "axis",
+        },
         "tf.math.count_nonzero": {
             "input_tensor": "input",
             "keep_dims": "keepdims",
@@ -175,6 +222,17 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         "tf.nn.max_pool_with_argmax": {
             "Targmax": "output_dtype",
         },
+        "tf.nn.max_pool": {
+            "value": "input"
+        },
+
+        "tf.nn.avg_pool": {
+            "value": "input"
+        },
+
+        "tf.nn.avg_pool2d": {
+            "value": "input"
+        },
         "tf.multinomial": {
             "output_dtype": "dtype",
         },
@@ -346,6 +404,38 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         "tf.nn.weighted_moments": {
             "keep_dims": "keepdims"
         },
+        "tf.nn.conv1d": {
+            "value": "input",
+            "use_cudnn_on_gpu": None,
+        },
+        "tf.nn.conv2d": {
+            "filter": "filters",
+            "use_cudnn_on_gpu": None,
+        },
+        "tf.nn.conv2d_backprop_input": {
+            "use_cudnn_on_gpu": None,
+            "input_sizes": "output_shape",
+            "out_backprop": "input",
+            "filter": "filters",
+        },
+        "tf.contrib.summary.audio": {
+            "tensor": "data",
+            "family": None,
+        },
+        "tf.contrib.summary.histogram": {
+            "tensor": "data",
+            "family": None,
+        },
+        "tf.contrib.summary.image": {
+            "tensor": "data",
+            "bad_color": None,
+            "max_images": "max_outputs",
+            "family": None,
+        },
+        "tf.contrib.summary.scalar": {
+            "tensor": "data",
+            "family": None,
+        },
     }
 
     # pylint: disable=line-too-long
@@ -356,10 +446,14 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
     self.manual_symbol_renames = {
         "tf.batch_to_space_nd":
             "tf.batch_to_space",
+        "tf.batch_gather":
+            "tf.compat.v1.batch_gather",
         "tf.space_to_batch_nd":
             "tf.space_to_batch",
         "tf.nn.space_to_batch":
             "tf.space_to_batch",
+        "tf.estimator.inputs":
+            "tf.compat.v1.estimator.inputs",
         "tf.extract_image_patches":
             "tf.image.extract_image_patches",
         "tf.gfile.Copy":
@@ -370,6 +464,8 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
             "tf.io.gfile.exists",
         "tf.gfile.Glob":
             "tf.io.gfile.glob",
+        "tf.gfile.GFile":
+            "tf.io.gfile.GFile",
         "tf.gfile.IsDirectory":
             "tf.io.gfile.isdir",
         "tf.gfile.ListDirectory":
@@ -378,6 +474,8 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
             "tf.io.gfile.makedirs",
         "tf.gfile.MkDir":
             "tf.io.gfile.mkdir",
+        "tf.gfile.Open":
+            "tf.io.gfile.GFile",
         "tf.gfile.Remove":
             "tf.io.gfile.remove",
         "tf.gfile.Rename":
@@ -470,10 +568,66 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
             "tf.data.experimental.unbatch",
         "tf.contrib.data.unique":
             "tf.data.experimental.unique",
+        "tf.contrib.estimator.make_early_stopping_hook":
+            "tf.estimator.experimental.make_early_stopping_hook",
+        "tf.contrib.estimator.stop_if_higher_hook":
+            "tf.estimator.experimental.stop_if_higher_hook",
+        "tf.contrib.estimator.stop_if_lower_hook":
+            "tf.estimator.experimental.stop_if_lower_hook",
+        "tf.contrib.estimator.stop_if_no_decrease_hook":
+            "tf.estimator.experimental.stop_if_no_decrease_hook",
+        "tf.contrib.estimator.stop_if_no_increase_hook":
+            "tf.estimator.experimental.stop_if_no_increase_hook",
+        "tf.contrib.framework.CriticalSection":
+            "tf.CriticalSection",
+        "tf.contrib.framework.is_tensor":
+            "tf.is_tensor",
+        "tf.contrib.framework.nest.assert_same_structure":
+            "tf.nest.assert_same_structure",
+        "tf.contrib.framework.nest.flatten":
+            "tf.nest.flatten",
+        "tf.contrib.framework.nest.is_sequence":
+            "tf.nest.is_nested",
+        "tf.contrib.framework.nest.map_structure":
+            "tf.nest.map_structure",
+        "tf.contrib.framework.nest.pack_sequence_as":
+            "tf.nest.pack_sequence_as",
+        "tf.contrib.util.constant_value":
+            "tf.get_static_value",
+        "tf.contrib.saved_model.load_keras_model":
+            "tf.keras.experimental.load_from_saved_model",
+        "tf.contrib.saved_model.save_keras_model":
+            "tf.keras.experimental.export_saved_model",
+        "tf.contrib.rnn.RNNCell":
+            "tf.nn.rnn_cell.RNNCell",
+        "tf.contrib.rnn.LSTMStateTuple":
+            "tf.nn.rnn_cell.LSTMStateTuple",
+        "tf.contrib.rnn.BasicLSTMCell":
+            "tf.compat.v1.nn.rnn_cell.BasicLSTMCell",
+        "tf.contrib.rnn.BasicRNNCell":
+            "tf.compat.v1.nn.rnn_cell.BasicRNNCell",
+        "tf.contrib.rnn.GRUCell":
+            "tf.compat.v1.nn.rnn_cell.GRUCell",
+        "tf.contrib.rnn.LSTMCell":
+            "tf.compat.v1.nn.rnn_cell.LSTMCell",
+        "tf.contrib.rnn.MultiRNNCell":
+            "tf.compat.v1.nn.rnn_cell.MultiRNNCell",
         "tf.contrib.framework.sort":
             "tf.sort",
         "tf.contrib.framework.argsort":
             "tf.argsort",
+        "tf.contrib.summary.audio":
+            "tf.compat.v2.summary.audio",
+        "tf.contrib.summary.histogram":
+            "tf.compat.v2.summary.histogram",
+        "tf.contrib.summary.image":
+            "tf.compat.v2.summary.image",
+        "tf.contrib.summary.initialize":
+            "tf.compat.v1.summary.initialize",
+        "tf.contrib.summary.scalar":
+            "tf.compat.v2.summary.scalar",
+        "tf.count_nonzero":
+            "tf.math.count_nonzero",
         "tf.manip.batch_to_space_nd":
             "tf.batch_to_space",
         "tf.quantize_v2":
@@ -492,6 +646,8 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
             "tf.sparse.reduce_max",
         "tf.random.stateless_multinomial":
             "tf.random.stateless_categorical",
+        "tf.substr":
+            "tf.strings.substr",
         "tf.string_to_hash_bucket":
             "tf.strings.to_hash_bucket",
         "tf.string_to_number":
@@ -548,12 +704,94 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         # changed significantly.
         "tf.nn.ctc_loss":
             "tf.compat.v1.nn.ctc_loss",
+        # tf.saved_model.load in 1.x has no equivalent in 2.x, but there is a
+        # symbol with the same name.
+        "tf.saved_model.load":
+            "tf.compat.v1.saved_model.load",
+        "tf.saved_model.load_v2":
+            "tf.compat.v2.saved_model.load",
+        "tf.zeros_initializer":
+            "tf.compat.v1.initializers.zeros",
+        "tf.ones_initializer":
+            "tf.compat.v1.initializers.ones",
+        "tf.constant_initializer":
+            "tf.compat.v1.initializers.constant",
+        "tf.random_uniform_initializer":
+            "tf.compat.v1.initializers.random_uniform",
+        "tf.random_normal_initializer":
+            "tf.compat.v1.initializers.random_normal",
+        "tf.truncated_normal_initializer":
+            "tf.compat.v1.initializers.truncated_normal",
+        "tf.image.resize_images":
+            "tf.image.resize",
+        "tf.random_poisson":
+            "tf.random.poisson",
+        "tf.debugging.assert_greater":
+            "tf.compat.v1.debugging.assert_greater",
+        "tf.debugging.assert_greater_equal":
+            "tf.compat.v1.debugging.assert_greater_equal",
+        "tf.debugging.assert_integer":
+            "tf.compat.v1.debugging.assert_integer",
+        "tf.debugging.assert_less":
+            "tf.compat.v1.debugging.assert_less",
+        "tf.debugging.assert_less_equal":
+            "tf.compat.v1.debugging.assert_less_equal",
+        "tf.debugging.assert_near":
+            "tf.compat.v1.debugging.assert_near",
+        "tf.debugging.assert_negative":
+            "tf.compat.v1.debugging.assert_negative",
+        "tf.debugging.assert_non_negative":
+            "tf.compat.v1.debugging.assert_non_negative",
+        "tf.debugging.assert_non_positive":
+            "tf.compat.v1.debugging.assert_non_positive",
+        "tf.debugging.assert_none_equal":
+            "tf.compat.v1.debugging.assert_none_equal",
+        "tf.debugging.assert_type":
+            "tf.compat.v1.debugging.assert_type",
+        "tf.debugging.assert_positive":
+            "tf.compat.v1.debugging.assert_positive",
+        "tf.debugging.assert_equal":
+            "tf.compat.v1.debugging.assert_equal",
+        "tf.debugging.assert_scalar":
+            "tf.compat.v1.debugging.assert_scalar",
+        "tf.assert_equal":
+            "tf.compat.v1.assert_equal",
+        "tf.assert_less":
+            "tf.compat.v1.assert_less",
+        "tf.assert_greater":
+            "tf.compat.v1.assert_greater",
+        "tf.debugging.assert_rank":
+            "tf.compat.v1.debugging.assert_rank",
+        "tf.debugging.assert_rank_at_least":
+            "tf.compat.v1.debugging.assert_rank_at_least",
+        "tf.debugging.assert_rank_in":
+            "tf.compat.v1.debugging.assert_rank_in",
+        "tf.assert_rank":
+            "tf.compat.v1.assert_rank",
+        "tf.nn.max_pool":
+            "tf.nn.max_pool2d",
+        "tf.nn.avg_pool":
+            "tf.nn.avg_pool2d",
+        "tf.keras.initializers.zeros":
+            "tf.compat.v1.keras.initializers.zeros",
+        "tf.keras.initializers.ones":
+            "tf.compat.v1.keras.initializers.ones",
+        "tf.keras.initializers.constant":
+            "tf.compat.v1.keras.initializers.constant",
+        "tf.data.experimental.map_and_batch_with_legacy_function":
+            "tf.compat.v1.data.experimental.map_and_batch_with_legacy_function",
+        "tf.nn.conv2d_backprop_input":
+            "tf.nn.conv2d_transpose"
     }
     # pylint: enable=line-too-long
 
     # Mapping from function to the new name of the function
     self.symbol_renames = renames_v2.renames
     self.symbol_renames.update(self.manual_symbol_renames)
+    self.symbol_renames = {
+        name: new_name
+        for name, new_name in self.symbol_renames.items()
+    }
 
     # Variables that should be changed to functions.
     self.change_to_function = {}
@@ -571,9 +809,14 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         "tf.argmax",
         "tf.argmin",
         "tf.batch_to_space",
+        "tf.cond",
         "tf.nn.space_to_batch",
         "tf.boolean_mask",
         "tf.convert_to_tensor",
+        "tf.nn.conv1d",
+        "tf.nn.conv2d",
+        "tf.nn.conv2d_backprop_input",
+        "tf.nn.ctc_beam_search_decoder",
         "tf.nn.moments",
         "tf.nn.convolution",
         "tf.nn.crelu",
@@ -600,9 +843,10 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         "tf.sparse.reduce_max",
         "tf.sparse_reduce_max",
         "tf.io.decode_csv",
-        "tf.strings.substr",
-        "tf.strings.reduce_join",
         "tf.strings.length",
+        "tf.strings.reduce_join",
+        "tf.strings.substr",
+        "tf.substr",
         "tf.transpose",
         "tf.tuple",
         "tf.parse_example",
@@ -635,81 +879,187 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         "tf.nn.embedding_lookup_sparse",
         "tf.nn.in_top_k",
         "tf.nn.space_to_depth",
+        "tf.test.assert_equal_graph_def",
         "tf.linalg.norm",
         "tf.norm",
         "tf.reverse_sequence",
         "tf.sparse_split",
+        # tf.nn.softmax_cross_entropy_with_logits *must* be called with
+        # keyword arguments. Add keyword arguments in rare case when they
+        # are not specified.
+        "tf.nn.softmax_cross_entropy_with_logits",
+        "tf.nn.fractional_avg_pool",
+        "tf.nn.fractional_max_pool",
+        "tf.image.sample_distorted_bounding_box",
+        "tf.gradients",
+        "tf.hessians",
+        "tf.nn.max_pool",
+        "tf.nn.avg_pool",
     }
 
+    # Manual mapping of function names to be reordered to their list of argument
+    # names, in order. Only use this if argument names cannot be autodetected,
+    # e.g. if the functions are in contrib.
+    self.manual_function_reorders = {
+        "tf.contrib.summary.audio": [
+            "name", "tensor", "sample_rate", "max_outputs", "family", "step"],
+        "tf.contrib.summary.histogram": [
+            "name", "tensor", "family", "step"],
+        "tf.contrib.summary.image": [
+            "name", "tensor", "bad_color", "max_images", "family", "step"],
+        "tf.contrib.summary.scalar": [
+            "name", "tensor", "family", "step"],
+    }
     # Functions that were reordered should be changed to the new keyword args
     # for safety, if positional arguments are used. If you have reversed the
     # positional arguments yourself, this could do the wrong thing.
-    self.function_reorders = reorders_v2.reorders
-
-    # Specially handled functions.
-    self.function_handle = {
-        "tf.nn.dropout": self._dropout_handler,
-        "tf.gradients": self._colocate_handler("tf.gradients"),
-        "*.minimize": self._colocate_handler("Optimizer.minimize"),
-        "*.compute_gradients":
-            self._colocate_handler("Optimizer.compute_gradients"),
-    }
+    self.function_reorders = dict(reorders_v2.reorders)
+    self.function_reorders.update(self.manual_function_reorders)
 
-    decay_function_comment = (
-        "WARNING: <function name> has been changed to return a callable instead"
-        " of a tensor when graph building, but its functionality remains "
-        "unchanged during eager execution (returns a callable like "
-        "before). The converter cannot detect and fix this reliably, so "
-        "this usage has been converted to compat.v1 (even though it may already"
-        " be correct).\n"
+    contrib_warning = (
+        ast_edits.ERROR,
+        "<function name> cannot be converted automatically. tf.contrib will not"
+        " be distributed with TensorFlow 2.0, please consider an alternative in"
+        " non-contrib TensorFlow, a community-maintained repository, or fork "
+        "the required code."
     )
 
-    # TODO(b/118888586): add default value change to update script.
-    default_loss_reduction_changed = (
-        "WARNING: default value of loss_reduction has been changed to "
-        "SUM_OVER_BATCH_SIZE.\n"
+    flags_warning = (
+        ast_edits.ERROR,
+        "tf.flags has been removed, please use the argparse or absl"
+        " modules if you need command line parsing.")
+
+    decay_function_comment = (
+        ast_edits.INFO,
+        "To use learning rate decay schedules with TensorFlow 2.0, switch to "
+        "the schedules in `tf.keras.optimizers.schedules`.\n"
     )
 
     assert_return_type_comment = (
-        "WARNING: assert_* functions have been changed to return None, the "
+        ast_edits.INFO,
+        "<function name> has been changed to return None, the "
         "data argument has been removed, and arguments have been reordered."
         "\nThe calls have been converted to compat.v1 for safety (even though "
         " they may already have been correct)."
     )
 
     assert_rank_comment = (
-        "WARNING: assert_rank_* functions have been changed to return None, and"
+        ast_edits.INFO,
+        "<function name> has been changed to return None, and"
         " the data and summarize arguments have been removed."
         "\nThe calls have been converted to compat.v1 for safety (even though "
         " they may already have been correct)."
     )
 
-    tf_01s_like_no_optimize_comment = (
-        "WARNING: tf.zeros_like and tf.ones_like no longer have the optimize "
-        "argument in TF 2.0 or after (also, `tensor' argument is renamed to "
-        "`input')."
-        "\nThe calls have been converted to compat.v1 for safety (even though "
-        " they may already have been correct)."
-    )
+    initializers_no_dtype_comment = (
+        ast_edits.INFO,
+        "Initializers no longer have the "
+        "dtype argument in the constructor or partition_info argument in the "
+        "__call__ method.\nThe calls have been converted to compat.v1 for"
+        "safety (even though they may already have been correct).")
+
+    metrics_comment = (
+        ast_edits.INFO,
+        "tf.metrics have been replaced with object oriented versions in"
+        " TF 2.0 and after. The metric function calls have been converted to "
+        "compat.v1 for backward compatibility. Please update these calls to "
+        "the TF 2.0 versions.")
+
+    losses_comment = (
+        ast_edits.INFO,
+        "tf.losses have been replaced with object oriented versions in"
+        " TF 2.0 and after. The loss function calls have been converted to "
+        "compat.v1 for backward compatibility. Please update these calls to "
+        "the TF 2.0 versions.")
 
+    # This could be done with a _rename_if_arg_not_found_transformer
     deprecate_partition_strategy_comment = (
-        "WARNING: `partition_strategy` has been removed from `%s` "
-        " The 'div' strategy is used by default.")
+        ast_edits.WARNING,
+        "`partition_strategy` has been removed from <function name>. "
+        " The 'div' strategy will be used by default.")
+
+    # TODO(b/118888586): add default value change to update script.
+    default_loss_reduction_changed = (
+        ast_edits.WARNING,
+        "default value of loss_reduction has been changed to "
+        "SUM_OVER_BATCH_SIZE.\n"
+    )
+
+    # make change instead
+    uniform_unit_scaling_initializer_comment = (
+        ast_edits.ERROR,
+        "uniform_unit_scaling_initializer has been removed. Please use"
+        " tf.initializers.variance_scaling instead with distribution=uniform "
+        "to get equivalent behaviour.")
+
+    # Make change instead (issue warning about strip_...)
+    export_saved_model_renamed = (
+        ast_edits.ERROR,
+        "(Manual edit required) Please rename the method export_savedmodel() "
+        "to export_saved_model(). Two things to note:\n\t(1) The argument "
+        "strip_default_attributes has been removed. The function will always "
+        "strip the default attributes from ops. If this breaks your code, "
+        "please switch to tf.compat.v1.estimator.Estimator.\n\t(2) This change "
+        "only effects core estimator. If you are using "
+        "tf.contrib.learn.Estimator, please switch to using core estimator.")
+
+    # TODO(b/124529441): if possible eliminate need for manual checking.
+    contrib_summary_comment = (
+        ast_edits.WARNING,
+        "(Manual check required) tf.contrib.summary.* functions have been "
+        "migrated best-effort to tf.compat.v2.summary.* equivalents where "
+        "possible, but the resulting code may not always work. Please check "
+        "manually; you can report migration failures on b/124529441.")
 
     # Function warnings. <function name> placeholder inside warnings will be
     # replaced by function name.
+    # You can use *. to add items which do not check the FQN, and apply to e.g.,
+    # methods.
     self.function_warnings = {
-        "tf.assert_greater":
-            assert_return_type_comment,
+        "*.export_savedmodel":
+            export_saved_model_renamed,
         "tf.assert_equal":
             assert_return_type_comment,
+        "tf.assert_none_equal":
+            assert_return_type_comment,
+        "tf.assert_negative":
+            assert_return_type_comment,
+        "tf.assert_positive":
+            assert_return_type_comment,
+        "tf.assert_non_negative":
+            assert_return_type_comment,
+        "tf.assert_non_positive":
+            assert_return_type_comment,
+        "tf.assert_near":
+            assert_return_type_comment,
         "tf.assert_less":
             assert_return_type_comment,
+        "tf.assert_less_equal":
+            assert_return_type_comment,
+        "tf.assert_greater":
+            assert_return_type_comment,
+        "tf.assert_greater_equal":
+            assert_return_type_comment,
+        "tf.assert_integer":
+            assert_return_type_comment,
+        "tf.assert_type":
+            assert_return_type_comment,
+        "tf.assert_scalar":
+            assert_return_type_comment,
         "tf.assert_rank":
             assert_rank_comment,
-        "tf.cond": "tf.cond no longer takes 'strict'. "
-                   "Now 'strict' defaults to True."
-                   "fn1/fn2 arguments are replaced by true_fn/false_fn.",
+        "tf.assert_rank_at_least":
+            assert_rank_comment,
+        "tf.assert_rank_in":
+            assert_rank_comment,
+        "tf.contrib.summary.audio":
+            contrib_summary_comment,
+        "tf.contrib.summary.histogram":
+            contrib_summary_comment,
+        "tf.contrib.summary.image":
+            contrib_summary_comment,
+        "tf.contrib.summary.scalar":
+            contrib_summary_comment,
         "tf.debugging.assert_equal":
             assert_return_type_comment,
         "tf.debugging.assert_greater":
@@ -734,18 +1084,16 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
             assert_return_type_comment,
         "tf.debugging.assert_positive":
             assert_return_type_comment,
+        "tf.debugging.assert_type":
+            assert_return_type_comment,
+        "tf.debugging.assert_scalar":
+            assert_return_type_comment,
         "tf.debugging.assert_rank":
             assert_rank_comment,
         "tf.debugging.assert_rank_at_least":
             assert_rank_comment,
         "tf.debugging.assert_rank_in":
             assert_rank_comment,
-        "tf.device": "tf.device no longer takes function as an argument. "
-                     "'devide_name_or_function' argument has been renamed to "
-                     "'device_name'.",
-        "tf.flags":
-            "tf.flags has been removed, please use the argparse or absl"
-            " module if you need command line parsing.",
         "tf.train.exponential_decay":
             decay_function_comment,
         "tf.train.piecewise_constant_decay":
@@ -780,146 +1128,759 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
             default_loss_reduction_changed,
         "tf.estimator.BaselineRegressor":
             default_loss_reduction_changed,
-        "tf.hessians": "tf.hessians no longer takes "
-                       "'colocate_gradients_with_ops' argument. Also, "
-                       "arguments have been reordered so that 'name' is the "
-                       "last argument.",
-        "tf.nn.conv1d":
-            "WARNING: use_cudnn_on_gpu argument has been removed and \"value\""
-            " was renamed to \"input\"",
-        "tf.nn.conv2d":
-            "WARNING: use_cudnn_on_gpu argument has been removed and "
-            "\"filter\" was renamed to \"filters\"",
-        "tf.nn.conv2d_backprop_filter":
-            "WARNING: use_cudnn_on_gpu argument has been removed",
-        "tf.nn.conv2d_backprop_input":
-            "WARNING: use_cudnn_on_gpu argument has been removed and "
-            "\"filter\" was renamed to \"filters\"",
-        "tf.nn.erosion2d":
-            "WARNING: <function name> now requires a data_format argument",
         "tf.nn.nce_loss":
-            deprecate_partition_strategy_comment % "tf.nn.nce_loss",
+            deprecate_partition_strategy_comment,
         "tf.nn.safe_embedding_lookup_sparse":
-            deprecate_partition_strategy_comment %
-            "tf.nn.safe_embedding_lookup_sparse",
+            deprecate_partition_strategy_comment,
         "tf.nn.sampled_softmax_loss":
-            deprecate_partition_strategy_comment % "tf.nn.sampled_softmax_loss",
-        "tf.zeros_like":
-            tf_01s_like_no_optimize_comment,
-        "tf.ones_like":
-            tf_01s_like_no_optimize_comment,
-        "tf.nn.embedding_lookup":
-            "WARNING: validate_indices argument has been removed.",
-        "tf.while_loop":
-            "tf.while_loop no longer takes 'return_same_structure' argument. "
-            "'return_same_structure' now defaults to True. Also, 'name'"
-            "argument is now the last argument.",
-        "tf.image.sample_distorted_bounding_box":
-            "tf.image.sample_distorted_bounding_box no longer takes 'seed2' "
-            "argument.",
-        "tf.nn.ctc_beam_search_decoder":
-            "tf.nn.ctc_beam_search_decoder no longer takes 'merge_repeated' "
-            "argument. 'merge_repeated' now defaults to False.",
-        "tf.nn.fractional_avg_pool":
-            "tf.nn.fractional_avg_pool no longer takes 'seed2' and "
-            "'deterministic' arguments. Now it takes a single 'seed' arg. If "
-            "'seed' is zero, the execution is random and deterministic "
-            "otherwise",
-        "tf.nn.fractional_max_pool":
-            "tf.nn.fractional_max_pool no longer takes 'seed2' and "
-            "'deterministic' arguments. Now it takes a single 'seed' arg. If "
-            "'seed' is zero, the execution is random and deterministic "
-            "otherwise",
-        "tf.nn.softmax_cross_entropy_with_logits":
-            "tf.nn.softmax_cross_entropy_with_logits behavior has changed. "
-            "'labels' needs to be wrapped with tf.stop_gradient to keep the "
-            "old behavior. Also, 'dim' argument has been renamed to 'axis'.",
-        "tf.test.assert_equal_graph_def":
-            "tf.assert_equal_graph_def no longer takes 'checkpoint_v2' "
-            "argument. 'checkpoint_v2' now defaults to True.",
+            deprecate_partition_strategy_comment,
+        "tf.keras.initializers.Zeros":
+            initializers_no_dtype_comment,
+        "tf.keras.initializers.zeros":
+            initializers_no_dtype_comment,
+        "tf.keras.initializers.Ones":
+            initializers_no_dtype_comment,
+        "tf.keras.initializers.ones":
+            initializers_no_dtype_comment,
+        "tf.keras.initializers.Constant":
+            initializers_no_dtype_comment,
+        "tf.keras.initializers.constant":
+            initializers_no_dtype_comment,
+        "tf.keras.initializers.VarianceScaling":
+            initializers_no_dtype_comment,
+        "tf.keras.initializers.Orthogonal":
+            initializers_no_dtype_comment,
+        "tf.keras.initializers.orthogonal":
+            initializers_no_dtype_comment,
+        "tf.keras.initializers.Identity":
+            initializers_no_dtype_comment,
+        "tf.keras.initializers.identity":
+            initializers_no_dtype_comment,
+        "tf.keras.initializers.glorot_uniform":
+            initializers_no_dtype_comment,
+        "tf.keras.initializers.glorot_normal":
+            initializers_no_dtype_comment,
+        "tf.initializers.zeros":
+            initializers_no_dtype_comment,
+        "tf.zeros_initializer":
+            initializers_no_dtype_comment,
+        "tf.initializers.ones":
+            initializers_no_dtype_comment,
+        "tf.ones_initializer":
+            initializers_no_dtype_comment,
+        "tf.initializers.constant":
+            initializers_no_dtype_comment,
+        "tf.constant_initializer":
+            initializers_no_dtype_comment,
+        "tf.initializers.random_uniform":
+            initializers_no_dtype_comment,
+        "tf.random_uniform_initializer":
+            initializers_no_dtype_comment,
+        "tf.initializers.random_normal":
+            initializers_no_dtype_comment,
+        "tf.random_normal_initializer":
+            initializers_no_dtype_comment,
+        "tf.initializers.truncated_normal":
+            initializers_no_dtype_comment,
+        "tf.truncated_normal_initializer":
+            initializers_no_dtype_comment,
+        "tf.initializers.variance_scaling":
+            initializers_no_dtype_comment,
+        "tf.variance_scaling_initializer":
+            initializers_no_dtype_comment,
+        "tf.initializers.orthogonal":
+            initializers_no_dtype_comment,
+        "tf.orthogonal_initializer":
+            initializers_no_dtype_comment,
+        "tf.initializers.identity":
+            initializers_no_dtype_comment,
+        "tf.glorot_uniform_initializer":
+            initializers_no_dtype_comment,
+        "tf.initializers.glorot_uniform":
+            initializers_no_dtype_comment,
+        "tf.glorot_normal_initializer":
+            initializers_no_dtype_comment,
+        "tf.initializers.glorot_normal":
+            initializers_no_dtype_comment,
+        "tf.initializers.uniform_unit_scaling":
+            uniform_unit_scaling_initializer_comment,
+        "tf.uniform_unit_scaling_initializer":
+            uniform_unit_scaling_initializer_comment,
+        "tf.losses.absolute_difference":
+            losses_comment,
+        "tf.losses.add_loss":
+            losses_comment,
+        "tf.losses.compute_weighted_loss":
+            losses_comment,
+        "tf.losses.cosine_distance":
+            losses_comment,
+        "tf.losses.get_losses":
+            losses_comment,
+        "tf.losses.get_regularization_loss":
+            losses_comment,
+        "tf.losses.get_regularization_losses":
+            losses_comment,
+        "tf.losses.get_total_loss":
+            losses_comment,
+        "tf.losses.hinge_loss":
+            losses_comment,
+        "tf.losses.huber_loss":
+            losses_comment,
+        "tf.losses.log_loss":
+            losses_comment,
+        "tf.losses.mean_pairwise_squared_error":
+            losses_comment,
+        "tf.losses.mean_squared_error":
+            losses_comment,
+        "tf.losses.sigmoid_cross_entropy":
+            losses_comment,
+        "tf.losses.softmax_cross_entropy":
+            losses_comment,
+        "tf.losses.sparse_softmax_cross_entropy":
+            losses_comment,
+        "tf.metrics.accuracy":
+            metrics_comment,
+        "tf.metrics.auc":
+            metrics_comment,
+        "tf.metrics.average_precision_at_k":
+            metrics_comment,
+        "tf.metrics.false_negatives":
+            metrics_comment,
+        "tf.metrics.false_negatives_at_thresholds":
+            metrics_comment,
+        "tf.metrics.false_positives":
+            metrics_comment,
+        "tf.metrics.false_positives_at_thresholds":
+            metrics_comment,
+        "tf.metrics.mean":
+            metrics_comment,
+        "tf.metrics.mean_absolute_error":
+            metrics_comment,
+        "tf.metrics.mean_cosine_distance":
+            metrics_comment,
+        "tf.metrics.mean_iou":
+            metrics_comment,
+        "tf.metrics.mean_per_class_accuracy":
+            metrics_comment,
+        "tf.metrics.mean_relative_error":
+            metrics_comment,
+        "tf.metrics.mean_squared_error":
+            metrics_comment,
+        "tf.metrics.mean_tensor":
+            metrics_comment,
+        "tf.metrics.percentage_below":
+            metrics_comment,
+        "tf.metrics.precision":
+            metrics_comment,
+        "tf.metrics.precision_at_k":
+            metrics_comment,
+        "tf.metrics.precision_at_thresholds":
+            metrics_comment,
+        "tf.metrics.precision_at_top_k":
+            metrics_comment,
+        "tf.metrics.recall":
+            metrics_comment,
+        "tf.metrics.recall_at_k":
+            metrics_comment,
+        "tf.metrics.recall_at_thresholds":
+            metrics_comment,
+        "tf.metrics.recall_at_top_k":
+            metrics_comment,
+        "tf.metrics.root_mean_squared_error":
+            metrics_comment,
+        "tf.metrics.sensitivity_at_specificity":
+            metrics_comment,
+        "tf.metrics.sparse_average_precision_at_k":
+            metrics_comment,
+        "tf.metrics.sparse_precision_at_k":
+            metrics_comment,
+        "tf.metrics.specificity_at_sensitivity":
+            metrics_comment,
+        "tf.metrics.true_negatives":
+            metrics_comment,
+        "tf.metrics.true_negatives_at_thresholds":
+            metrics_comment,
+        "tf.metrics.true_positives":
+            metrics_comment,
+        "tf.metrics.true_positives_at_thresholds":
+            metrics_comment,
     }
 
-    self.symbol_renames = {
-        name: new_name
-        for name, new_name in self.symbol_renames.items()
+    # Warnings that are emitted only if a specific arg is found.
+    self.function_arg_warnings = {
+        "tf.nn.conv1d": {
+            ("use_cudnn_on_gpu", 4): (
+                ast_edits.WARNING,
+                "use_cudnn_on_gpu has been removed, behavior is now equivalent"
+                "to setting it to True."),
+        },
+        "tf.nn.conv2d": {
+            ("use_cudnn_on_gpu", 4): (
+                ast_edits.WARNING,
+                "use_cudnn_on_gpu has been removed, behavior is now equivalent"
+                "to setting it to True."),
+        },
+        "tf.nn.conv2d_backprop_filter": {
+            ("use_cudnn_on_gpu", 5): (
+                ast_edits.WARNING,
+                "use_cudnn_on_gpu has been removed, behavior is now equivalent"
+                "to setting it to True."),
+        },
+        "tf.nn.conv2d_backprop_input": {
+            ("use_cudnn_on_gpu", 5): (
+                ast_edits.WARNING,
+                "use_cudnn_on_gpu has been removed, behavior is now equivalent"
+                "to setting it to True."),
+        },
+        "tf.gradients": {
+            ("colocate_gradients_with_ops", 4): (
+                ast_edits.INFO,
+                "tf.gradients no longer takes "
+                "'colocate_gradients_with_ops' argument, it behaves as if it "
+                "was set to True."),
+        },
+        "*.minimize": {
+            ("colocate_gradients_with_ops", 5): (
+                ast_edits.INFO,
+                "Optimizer.minimize no longer takes "
+                "'colocate_gradients_with_ops' argument, it behaves as if it "
+                "was set to True."),
+        },
+        "*.compute_gradients": {
+            ("colocate_gradients_with_ops", 4): (
+                ast_edits.INFO,
+                "Optimizer.compute_gradients no "
+                "longer takes 'colocate_gradients_with_ops' argument, it "
+                "behaves as if it was set to True."),
+        },
+        "tf.cond": {
+            ("strict", 3): (
+                ast_edits.WARNING,
+                "tf.cond no longer takes 'strict' argument, it behaves as "
+                "if was set to True.")
+        },
+        "tf.contrib.summary.audio": {
+            ("family", 4): (
+                ast_edits.WARNING,
+                "tf.contrib.summary.* functions no longer take the 'family' "
+                "argument; instead name scoping should be used. This call site "
+                "specifies a family argument so it cannot be converted safely.")
+        },
+        "tf.contrib.summary.histogram": {
+            ("family", 2): (
+                ast_edits.WARNING,
+                "tf.contrib.summary.* functions no longer take the 'family' "
+                "argument; instead name scoping should be used. This call site "
+                "specifies a family argument so it cannot be converted safely.")
+        },
+        "tf.contrib.summary.image": {
+            ("bad_color", 2): (
+                ast_edits.WARNING,
+                "tf.contrib.summary.image no longer takes the 'bad_color' "
+                "argument; caller must now preprocess if needed. This call "
+                "site specifies a bad_color argument so it cannot be converted "
+                "safely."),
+            ("family", 4): (
+                ast_edits.WARNING,
+                "tf.contrib.summary.* functions no longer take the 'family' "
+                "argument; instead name scoping should be used. This call site "
+                "specifies a family argument so it cannot be converted safely.")
+        },
+        "tf.contrib.summary.scalar": {
+            ("family", 2): (
+                ast_edits.WARNING,
+                "tf.contrib.summary.* functions no longer take the 'family' "
+                "argument; instead name scoping should be used. This call site "
+                "specifies a family argument so it cannot be converted safely.")
+        },
     }
 
-    export_saved_model_renamed = (
-        "(Manual edit required) Please rename the method export_savedmodel() "
-        "to export_saved_model(). Two things to note:\n\t(1) The argument "
-        "strip_default_attributes has been removed. The function will always "
-        "strip the default attributes from ops. If this breaks your code, "
-        "please switch to tf.compat.v1.estimator.Estimator.\n\t(2) This change "
-        "only effects core estimator. If you are using "
-        "tf.contrib.learn.Estimator, please switch to using core estimator.")
+    # Specially handled functions
+    # Each transformer is a callable which will be called with the arguments
+    #   transformer(parent, node, full_name, name, logs)
+    # Where logs is a list to which (level, line, col, msg) tuples can be
+    # appended, full_name is the FQN of the function called (or None if that is
+    # unknown), name is the name of the function called (or None is that is
+    # unknown). node is an ast.Call node representing this function call, and
+    # parent is its parent in the AST.
+    # The function may modify node (but not parent), and must return
+    # - none, if nothing was modified
+    # - node, if node was modified in place (make sure to use
+    #   pasta.ast_utils.replace_child to swap out children, otherwise formatting
+    #   may get messy)
+    # - a replacement for node, if the whole call node was replaced. The caller
+    #   will take care of changing parent.
+    self.function_transformers = {
+        "*.make_initializable_iterator": _iterator_transformer,
+        "*.make_one_shot_iterator": _iterator_transformer,
+        "tf.nn.dropout": _dropout_transformer,
+        "tf.to_bfloat16": _cast_transformer,
+        "tf.to_complex128": _cast_transformer,
+        "tf.to_complex64": _cast_transformer,
+        "tf.to_double": _cast_transformer,
+        "tf.to_float": _cast_transformer,
+        "tf.to_int32": _cast_transformer,
+        "tf.to_int64": _cast_transformer,
+        "tf.nn.softmax_cross_entropy_with_logits":
+            _softmax_cross_entropy_with_logits_transformer,
+        "tf.image.extract_glimpse": _extract_glimpse_transformer,
+        "tf.image.resize_area": _image_resize_transformer,
+        "tf.image.resize_bicubic": _image_resize_transformer,
+        "tf.image.resize_bilinear": _image_resize_transformer,
+        "tf.image.resize_nearest_neighbor": _image_resize_transformer,
+        "tf.nn.fractional_avg_pool": _pool_seed_transformer,
+        "tf.nn.fractional_max_pool": _pool_seed_transformer,
+        "tf.device": functools.partial(
+            _rename_if_arg_found_transformer, arg_name="device_name",
+            arg_ok_predicate=_is_ast_str, remove_if_ok=False,
+            message="tf.device no longer takes functions as an argument. "
+            "We could not determine that the argument value is a string, so "
+            "the call was converted to compat.v1."),
+        "tf.zeros_like": functools.partial(
+            _rename_if_arg_found_transformer, arg_name="optimize",
+            arg_ok_predicate=_is_ast_true, remove_if_ok=True,
+            message="tf.zeros_like no longer takes an optimize argument, and "
+            "behaves as if optimize=True. This call site specifies something "
+            "other than optimize=True, so it was converted to compat.v1."),
+        "tf.ones_like": functools.partial(
+            _rename_if_arg_found_transformer, arg_name="optimize",
+            arg_ok_predicate=_is_ast_true, remove_if_ok=True,
+            message="tf.ones_like no longer takes an optimize argument, and "
+            "behaves as if optimize=True. This call site specifies something "
+            "other than optimize=True, so it was converted to compat.v1."),
+        "tf.while_loop": functools.partial(
+            _rename_if_arg_found_transformer,
+            arg_name="return_same_structure",
+            arg_ok_predicate=_is_ast_true, remove_if_ok=True,
+            message="tf.while_loop no longer takes 'return_same_structure' "
+            "argument and behaves as if return_same_structure=True. This call "
+            "site specifies something other than return_same_structure=True, "
+            "so it was converted to compat.v1."),
+        "tf.nn.ctc_beam_search_decoder": functools.partial(
+            _rename_if_arg_found_transformer,
+            arg_name="merge_repeated",
+            arg_ok_predicate=_is_ast_false, remove_if_ok=True,
+            message="tf.nn.ctc_beam_search_decoder no longer takes the "
+            "'merge_repeated' argument and behaves as if merge_repeated=False. "
+            "This call site specifies something other than "
+            "merge_repeated=False, so it was converted to compat.v1."),
+        "tf.nn.erosion2d": functools.partial(
+            _add_argument_transformer,
+            arg_name="data_format",
+            arg_value_ast=ast.Str("NHWC")),
+        "tf.contrib.summary.audio": _add_summary_step_transformer,
+        "tf.contrib.summary.histogram": _add_summary_step_transformer,
+        "tf.contrib.summary.image": _add_summary_step_transformer,
+        "tf.contrib.summary.scalar": _add_summary_step_transformer,
+    }
 
-    make_initializable_iterator_deprecation = (
-        "(Manual edit required) The "
-        "`tf.data.Dataset.make_initializable_iterator()` method has been "
-        "removed. If you are using the Estimator API, you can return a dataset "
-        "directly from your input functions without creating an iterator. "
-        "As a last resort, please replace calls to that method on `dataset` "
-        "with a call to "
-        "`tf.compat.v1.data.make_initializable_iterator(dataset)`.")
-
-    make_one_shot_iterator_deprecation = (
-        "(Manual edit required) The "
-        "`tf.data.Dataset.make_one_shot_iterator()` method has been "
-        "removed. If you are using eager execution, you can iterate over "
-        "`dataset` using a Python `for` loop. If you are using the Estimator "
-        "API, you can return a dataset directly from your input functions "
-        "without creating an iterator. As a last resort, please replace calls "
-        "to that method on `dataset` with a call to "
-        "`tf.compat.v1.data.make_one_shot_iterator(dataset)`.")
-
-    # Specify warnings for functions that aren't restricted to the tf.x.y.z
-    # format. This should only be used for methods with unique names, e.g.
-    # export_savedmodel, which is only defined in Estimator objects.
-    self.unrestricted_function_warnings = {
-        "export_savedmodel": export_saved_model_renamed,
-        "make_initializable_iterator": make_initializable_iterator_deprecation,
-        "make_one_shot_iterator": make_one_shot_iterator_deprecation,
+    self.module_deprecations = {
+        "tf.contrib": contrib_warning,
+        "tf.flags": flags_warning,
     }
 
-  @staticmethod
-  def _dropout_handler(file_edit_recorder, node):
-    if len(node.args) < 2:
-      comment = ("ERROR: tf.nn.dropout did not take arguments, so automatic "
-                 "transformation was disabled. tf.nn.dropout has changed "
-                 "the semantics of the second argument.")
-      file_edit_recorder.add(
-          comment,
-          node.lineno,
-          node.col_offset,
-          "tf.nn.dropout",
-          "tf.nn.dropout",
-          error="tf.nn.dropout requires manual check.")
+
+def _is_ast_str(node):
+  """Determine whether this node represents a string."""
+  allowed_types = [ast.Str]
+  if hasattr(ast, "Bytes"):
+    allowed_types += [ast.Bytes]
+  if hasattr(ast, "JoinedStr"):
+    allowed_types += [ast.JoinedStr]
+  if hasattr(ast, "FormattedValue"):
+    allowed_types += [ast.FormattedValue]
+  return isinstance(node, allowed_types)
+
+
+def _is_ast_true(node):
+  if hasattr(ast, "NameConstant"):
+    return isinstance(node, ast.NameConstant) and node.value is True
+  else:
+    return isinstance(node, ast.Name) and node.id == "True"
+
+
+def _is_ast_false(node):
+  if hasattr(ast, "NameConstant"):
+    return isinstance(node, ast.NameConstant) and node.value is False
+  else:
+    return isinstance(node, ast.Name) and node.id == "False"
+
+
+# Lots of unused arguments below, since these are called in a standard manner.
+# pylint: disable=unused-argument
+
+
+def _rename_if_arg_found_transformer(parent, node, full_name, name, logs,
+                                     arg_name=None,
+                                     arg_ok_predicate=None,
+                                     remove_if_ok=False,
+                                     message=None):
+  """Replaces the given call with tf.compat.v1 if the given arg is found.
+
+  This requires the function to be called with all named args, so for using
+  this transformer, the function should also be added to renames.
+
+  If the arg is not found, the call site is left alone.
+
+  If the arg is found, and if arg_ok_predicate is given, it is called with
+  the ast Expression representing the argument value found. If it returns
+  True, the function is left alone.
+
+  If the arg is found, arg_ok_predicate is not None and returns ok, and
+  remove_if_ok is True, the argument is removed from the call.
+
+  Otherwise, `compat.v1` is inserted between tf and the function name.
+
+  Args:
+    parent: Parent of node.
+    node: ast.Call node to maybe modify.
+    full_name: full name of function to modify
+    name: name of function to modify
+    logs: list of logs to append to
+    arg_name: name of the argument to look for
+    arg_ok_predicate: predicate callable with the ast of the argument value,
+      returns whether the argument value is allowed.
+    remove_if_ok: remove the argument if present and ok as determined by
+      arg_ok_predicate.
+    message: message to print if a non-ok arg is found (and hence, the function
+      is renamed to its compat.v1 version).
+
+  Returns:
+    node, if it was modified, else None.
+  """
+  # Check whether arg is there.
+  arg_present, arg_value = ast_edits.get_arg_value(node, arg_name)
+  if not arg_present:
+    return
+
+  # Check whether arg is problematic (and if not, maybe remove it).
+  if arg_ok_predicate and arg_ok_predicate(arg_value):
+    if remove_if_ok:
+      for i, kw in enumerate(node.keywords):
+        if kw.arg == arg_name:
+          node.keywords.pop(i)
+          logs.append((ast_edits.INFO, node.lineno, node.col_offset,
+                       "Removed argument %s for function %s" % (
+                           arg_name, full_name or name)))
+          break
+      return node
+    else:
+      return
+
+  # All conditions met, insert v1 and log what we did.
+  # We must have a full name, so the func is an attribute.
+  new_name = full_name.replace("tf.", "tf.compat.v1.", 1)
+  node.func = ast_edits.full_name_node(new_name)
+  logs.append((
+      ast_edits.INFO, node.lineno, node.col_offset,
+      "Renaming %s to %s because argument %s is present. %s" %
+      (full_name, new_name, arg_name, message if message is not None else "")
+  ))
+  return node
+
+
+def _add_argument_transformer(parent, node, full_name, name, logs,
+                              arg_name, arg_value_ast):
+  """Adds an argument (as a final kwarg arg_name=arg_value_ast)."""
+  node.keywords.append(ast.keyword(arg=arg_name, value=arg_value_ast))
+  logs.append((
+      ast_edits.INFO, node.lineno, node.col_offset,
+      "Adding argument '%s' to call to %s." % (pasta.dump(node.keywords[-1],
+                                                          full_name or name))
+  ))
+  return node
+
+
+def _iterator_transformer(parent, node, full_name, name, logs):
+  """Transform iterator methods to compat function calls."""
+  # First, check that node.func.value is not already something we like
+  # (tf.compat.v1.data), or something which is handled in the rename
+  # (tf.data). This transformer only handles the method call to function call
+  # conversion.
+  if full_name and (full_name.startswith("tf.compat.v1.data") or
+                    full_name.startswith("tf.data")):
+    return
+
+  # This should never happen, since we're only called for Attribute nodes.
+  if not isinstance(node.func, ast.Attribute):
+    return
+
+  # Transform from x.f(y) to tf.compat.v1.data.f(x, y)
+  # Fortunately, node.func.value should already have valid position info
+  node.args = [node.func.value] + node.args
+  node.func.value = ast_edits.full_name_node("tf.compat.v1.data")
+
+  logs.append((ast_edits.WARNING, node.lineno, node.col_offset,
+               "Changing dataset.%s() to tf.compat.v1.data.%s(dataset). "
+               "Please check this transformation.\n" % (name, name)))
+
+  return node
+
+
+def _dropout_transformer(parent, node, full_name, name, logs):
+  """Replace keep_prob with 1-rate."""
+  def _replace_keep_prob_node(parent, old_value):
+    """Replaces old_value with 1-(old_value)."""
+    one = ast.Num(n=1)
+    one.lineno = 0
+    one.col_offset = 0
+    new_value = ast.BinOp(left=one, op=ast.Sub(),
+                          right=old_value)
+    # This copies the prefix and suffix on old_value to new_value.
+    pasta.ast_utils.replace_child(parent, old_value, new_value)
+    ast.copy_location(new_value, old_value)
+    # Put parentheses around keep_prob.value (and remove the old prefix/
+    # suffix, they should only be around new_value).
+    pasta.base.formatting.set(old_value, "prefix", "(")
+    pasta.base.formatting.set(old_value, "suffix", ")")
+
+  # Check if we have a keep_prob keyword arg
+  for keep_prob in node.keywords:
+    if keep_prob.arg == "keep_prob":
+      logs.append((ast_edits.INFO, node.lineno, node.col_offset,
+                   "Changing keep_prob arg of tf.nn.dropout to rate\n"))
+      keep_prob.arg = "rate"
+      _replace_keep_prob_node(keep_prob, keep_prob.value)
+      return node
+
+  # Maybe it was a positional arg
+  if len(node.args) < 2:
+    logs.append((ast_edits.ERROR, node.lineno, node.col_offset,
+                 "tf.nn.dropout called without arguments, so "
+                 "automatic fix was disabled. tf.nn.dropout has changed "
+                 "the semantics of the second argument."))
+  else:
+    _replace_keep_prob_node(node, node.args[1])
+    logs.append((ast_edits.INFO, node.lineno, node.col_offset,
+                 "Changing keep_prob arg of tf.nn.dropout to rate, and "
+                 "recomputing value.\n"))
+
+    return node
+
+
+def _cast_transformer(parent, node, full_name, name, logs):
+  """Transforms to_int and to_float to cast(..., dtype=...)."""
+
+  # Find out the dtype to cast to from the function name
+  dtype_str = name[3:]
+  # Special cases where the full dtype is not given
+  if dtype_str == "float":
+    dtype_str = "float32"
+  elif dtype_str == "double":
+    dtype_str = "float64"
+  new_arg = ast.keyword(arg="dtype",
+                        value=ast.Attribute(value=ast.Name(id="tf",
+                                                           ctx=ast.Load()),
+                                            attr=dtype_str, ctx=ast.Load()))
+  # Ensures a valid transformation when a positional name arg is given
+  if len(node.args) == 2:
+    name_arg = ast.keyword(arg="name",
+                           value=node.args[-1])
+    node.args = node.args[:-1]
+    node.keywords.append(name_arg)
+
+  # Python3 ast requires the args for the Attribute, but codegen will mess up
+  # the arg order if we just set them to 0.
+  new_arg.value.lineno = node.lineno
+  new_arg.value.col_offset = node.col_offset+100
+
+  node.keywords.append(new_arg)
+  if isinstance(node.func, ast.Attribute):
+    node.func.attr = "cast"
+  else:
+    assert isinstance(node.func, ast.Name)
+    node.func.id = "cast"
+
+  logs.append((ast_edits.INFO, node.lineno, node.col_offset,
+               "Changed %s call to tf.cast(..., dtype=tf.%s)." % (full_name,
+                                                                  dtype_str)))
+  return node
+
+
+def _softmax_cross_entropy_with_logits_transformer(
+    parent, node, full_name, name, logs):
+  """Wrap labels argument with stop_gradients."""
+  def _wrap_label(parent, old_value):
+    """Wrap labels with tf.stop_gradient."""
+    already_stop_grad = (isinstance(old_value, ast.Call) and
+                         isinstance(old_value.func, ast.Attribute) and
+                         old_value.func.attr == "stop_gradient" and
+                         isinstance(old_value.func.value, ast.Name) and
+                         old_value.func.value.id == "tf")
+    if already_stop_grad:
+      return False
+    try:
+      new_value = ast.Call(
+          ast.Name(id="tf.stop_gradient", ctx=ast.Load()),
+          [old_value], [])
+    except TypeError:
+      new_value = ast.Call(
+          ast.Name(id="tf.stop_gradient", ctx=ast.Load()),
+          [old_value], [], None, None)
+
+    # This copies the prefix and suffix on old_value to new_value.
+    pasta.ast_utils.replace_child(parent, old_value, new_value)
+    ast.copy_location(new_value, old_value)
+    return True
+
+  # Check if we have a labels keyword arg
+  for karg in node.keywords:
+    if karg.arg == "labels":
+      if _wrap_label(karg, karg.value):
+        logs.append((ast_edits.INFO, node.lineno, node.col_offset,
+                     "Changing labels arg of "
+                     "tf.nn.softmax_cross_entropy_with_logits to "
+                     "tf.stop_gradient(labels). Please check this "
+                     "transformation.\n"))
+      return node
+  return node
+
+
+def _image_resize_transformer(parent, node, full_name, name, logs):
+  """Transforms image.resize_* to image.resize(..., method=*, ...)."""
+  resize_method = name[7:].upper()
+  new_arg = ast.keyword(arg="method",
+                        value=ast.Attribute(
+                            value=ast.Attribute(
+                                value=ast.Attribute(
+                                    value=ast.Name(id="tf", ctx=ast.Load()),
+                                    attr="image", ctx=ast.Load()),
+                                attr="ResizeMethod", ctx=ast.Load()),
+                            attr=resize_method, ctx=ast.Load()))
+
+  # Ensures a valid transformation when a positional name arg is given
+  if len(node.args) == 4:
+    pos_arg = ast.keyword(arg="preserve_aspect_ratio",
+                          value=node.args[-1])
+    node.args = node.args[:-1]
+    node.keywords.append(pos_arg)
+  if len(node.args) == 3:
+    pos_arg = ast.keyword(arg="align_corners",
+                          value=node.args[-1])
+    node.args = node.args[:-1]
+    node.keywords.append(pos_arg)
+
+  # Python3 ast requires the args for the Attribute, but codegen will mess up
+  # the arg order if we just set them to 0.
+  new_arg.value.lineno = node.lineno
+  new_arg.value.col_offset = node.col_offset+100
+
+  node.keywords.append(new_arg)
+  if isinstance(node.func, ast.Attribute):
+    node.func.attr = "resize"
+  else:
+    assert isinstance(node.func, ast.Name)
+    node.func.id = "resize"
+
+  logs.append((ast_edits.INFO, node.lineno, node.col_offset,
+               "Changed %s call to tf.image.resize(..., "
+               "method=tf.image.ResizeMethod.%s)." % (full_name,
+                                                      resize_method)))
+  return node
+
+
+def _pool_seed_transformer(parent, node, full_name, name, logs):
+  """Removes seed2 and deterministic, and adds non-zero seed if needed."""
+  # This requires that this function uses all kwargs (add to renames!).
+  seed_arg = None
+  deterministic = False
+  modified = False
+  new_keywords = []
+
+  for kw in node.keywords:
+    if sys.version_info[:2] >= (3, 5) and isinstance(kw, ast.Starred):
+      pass
+    elif kw.arg == "seed":
+      seed_arg = kw
+    elif kw.arg == "seed2" or kw.arg == "deterministic":
+      lineno = getattr(kw, "lineno", node.lineno)
+      col_offset = getattr(kw, "col_offset", node.col_offset)
+      logs.append((ast_edits.INFO, lineno, col_offset,
+                   "Removed argument %s for function %s" % (
+                       kw.arg, full_name or name)))
+      if kw.arg == "deterministic":
+        if not _is_ast_false(kw.value):
+          deterministic = True
+      modified = True
+      continue
+    new_keywords.append(kw)
+
+  if deterministic:
+    if seed_arg is None:
+      new_keywords.append(ast.keyword(arg="seed", value=ast.Num(42)))
+      logs.add((
+          ast_edits.INFO, node.lineno, node.col_offset,
+          "Adding seed=42 to call to %s since determinism was requested" % (
+              full_name or name)
+      ))
     else:
-      comment = ("WARNING: tf.nn.dropout has changed the semantics of the "
-                 "second argument. Please check the transformation.\n")
-      file_edit_recorder.add(
-          comment,
-          node.args[1].lineno,
-          node.args[1].col_offset,
-          "",
-          "1 - ")
-
-  @staticmethod
-  def _colocate_handler(name):
-    def _helper(file_edit_recorder, node):
-      for keyword in node.keywords:
-        if keyword.arg == "colocate_gradients_with_ops":
-          # TODO(jhseu): Since ast_edit.py does string replacement, there's no
-          # straightforward way to remove the argument. Try to fix before 2.0 is
-          # final.
-          comment = ("For tf.gradients and tf.Optimizer.minimize, "
-                     "colocate_gradients_with_op has been removed and now "
-                     "defaults to True.")
-          file_edit_recorder.add(
-              comment,
-              node.lineno,
-              node.col_offset,
-              "",
-              "",
-              error="{} requires manual check.".format(name))
-    return _helper
+      logs.add((
+          ast_edits.WARNING, node.lineno, node.col_offset,
+          "The deterministic argument is deprecated for %s, pass a "
+          "non-zero seed for determinism. The deterministic argument is "
+          "present, possibly not False, and the seed is already set. The "
+          "converter cannot determine whether it is nonzero, please check."
+      ))
+
+  if modified:
+    node.keywords = new_keywords
+    return node
+  else:
+    return
+
+
+def _extract_glimpse_transformer(parent, node, full_name, name, logs):
+
+  def _replace_uniform_noise_node(parent, old_value):
+    """Replaces old_value with 'uniform' or 'guassian'."""
+    uniform = ast.Str(s="uniform")
+    gaussian = ast.Str(s="gaussian")
+    new_value = ast.IfExp(body=uniform, test=old_value, orelse=gaussian)
+    # This copies the prefix and suffix on old_value to new_value.
+    pasta.ast_utils.replace_child(parent, old_value, new_value)
+    ast.copy_location(new_value, old_value)
+    # Put parentheses around noise.value.test (and remove the old prefix/
+    # suffix, they should only be around new_value.test), so that:
+    # "uniform" if (a if b else c) else "gaussian" is valid.
+    pasta.base.formatting.set(new_value.test, "prefix", "(")
+    pasta.base.formatting.set(new_value.test, "suffix", ")")
+
+  # Check if we have a uniform_noise keyword arg
+  for uniform_noise in node.keywords:
+    if uniform_noise.arg == "uniform_noise":
+      logs.append((ast_edits.INFO, node.lineno, node.col_offset,
+                   "Changing uniform_noise arg of tf.image.extract_glimpse "
+                   "to noise, and recomputing value. Please check this "
+                   "transformation.\n"))
+      uniform_noise.arg = "noise"
+      value = "uniform" if uniform_noise.value else "gaussian"
+      _replace_uniform_noise_node(uniform_noise, uniform_noise.value)
+      return node
+
+  # Since `noise`/`uniform_noise` is optional arg, nothing needs to be
+  # done if len(node.args) < 5.
+  if len(node.args) >= 5:
+    _replace_uniform_noise_node(node, node.args[5])
+    logs.append((ast_edits.INFO, node.lineno, node.col_offset,
+                 "Changing uniform_noise arg of tf.image.extract_glimpse to "
+                 "noise, and recomputing value.\n"))
+    return node
+
+
+def _add_summary_step_transformer(parent, node, full_name, name, logs):
+  """Adds a step argument to the summary API call if not specified.
+
+  The inserted argument value is tf.compat.v1.train.get_or_create_global_step().
+  """
+  for keyword_arg in node.keywords:
+    if keyword_arg.arg == "step":
+      return node
+  default_value = "tf.compat.v1.train.get_or_create_global_step()"
+  # Parse with pasta instead of ast to avoid emitting a spurious trailing \n.
+  ast_value = pasta.parse(default_value)
+  node.keywords.append(ast.keyword(arg="step", value=ast_value))
+  logs.append((
+      ast_edits.WARNING, node.lineno, node.col_offset,
+      "Summary API writing function %s now requires a 'step' argument; "
+      "inserting default of %s." % (full_name or name, default_value)))
+  return node
diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2_main.py b/tensorflow/tools/compatibility/tf_upgrade_v2_main.py
index 543d0786423f5b3f9bc59895c1325d19b6241cf7..c34d659eebf0c90361dbd3ae3da0d6b1b83ead09 100644
--- a/tensorflow/tools/compatibility/tf_upgrade_v2_main.py
+++ b/tensorflow/tools/compatibility/tf_upgrade_v2_main.py
@@ -27,7 +27,7 @@ from tensorflow.tools.compatibility import tf_upgrade_v2
 def main():
   parser = argparse.ArgumentParser(
       formatter_class=argparse.RawDescriptionHelpFormatter,
-      description="""Convert a TensorFlow Python file to 2.0
+      description="""Convert a TensorFlow Python file from 1.* to 2.0
 
 Simple usage:
   tf_upgrade_v2.py --infile foo.py --outfile bar.py
@@ -59,6 +59,14 @@ Simple usage:
             "copy the other files."),
       type=bool,
       default=True)
+  parser.add_argument(
+      "--inplace",
+      dest="in_place",
+      help=("If converting a whole tree of files, whether to "
+            "allow the conversion to be performed on the "
+            "files in the input tree."),
+      type=bool,
+      default=False)
   parser.add_argument(
       "--reportfile",
       dest="report_filename",
@@ -79,6 +87,7 @@ Simple usage:
           "single file.")
     files_processed, report_text, errors = upgrade.process_file(
         args.input_file, args.output_file)
+    errors = {args.input_file: errors}
     files_processed = 1
   elif args.input_tree:
     if not args.output_tree:
@@ -86,19 +95,34 @@ Simple usage:
           "--outtree=<output directory> argument is required when converting a "
           "file tree.")
     files_processed, report_text, errors = upgrade.process_tree(
-        args.input_tree, args.output_tree, args.copy_other_files)
+        args.input_tree, args.output_tree, args.copy_other_files, args.in_place)
   else:
     parser.print_help()
   if report_text:
-    open(report_filename, "w").write(report_text)
-    print("TensorFlow 2.0 Upgrade Script")
-    print("-----------------------------")
-    print("Converted %d files\n" % files_processed)
-    print("Detected %d errors that require attention" % len(errors))
-    print("-" * 80)
-    print("\n".join(errors))
-    print("\nMake sure to read the detailed log %r\n" % report_filename)
+    num_errors = 0
+    report = []
+    for f in errors:
+      if errors[f]:
+        num_errors += len(errors[f])
+        report.append("-" * 80 + "\n")
+        report.append("File: %s\n" % f)
+        report.append("-" * 80 + "\n")
+        report.append("\n".join(errors[f]) + "\n")
+
+    report = ("TensorFlow 2.0 Upgrade Script\n"
+              "-----------------------------\n"
+              "Converted %d files\n" % files_processed +
+              "Detected %d issues that require attention" % num_errors + "\n" +
+              "-" * 80 + "\n") + "".join(report)
+    with open(report_filename, "w") as report_file:
+      report_file.write(report)
+      report_file.write("=" * 80 + "\n")
+      report_file.write("Detailed log follows:\n\n")
+      report_file.write("=" * 80 + "\n")
+      report_file.write(report_text)
 
+    print(report)
+    print("\nMake sure to read the detailed log %r\n" % report_filename)
 
 if __name__ == "__main__":
   main()
diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2_test.py b/tensorflow/tools/compatibility/tf_upgrade_v2_test.py
index 0fc7a18734219cd0216816873768dd9dada16cc5..57d938f749b34cb14dd2b9d299a0be7646805eea 100644
--- a/tensorflow/tools/compatibility/tf_upgrade_v2_test.py
+++ b/tensorflow/tools/compatibility/tf_upgrade_v2_test.py
@@ -37,32 +37,6 @@ from tensorflow.tools.compatibility import ast_edits
 from tensorflow.tools.compatibility import tf_upgrade_v2
 
 
-_TENSORFLOW_API_ATTR_V1 = (
-    tf_export.API_ATTRS_V1[tf_export.TENSORFLOW_API_NAME].names)
-_TENSORFLOW_API_ATTR = tf_export.API_ATTRS[tf_export.TENSORFLOW_API_NAME].names
-_ESTIMATOR_API_ATTR_V1 = (
-    tf_export.API_ATTRS_V1[tf_export.ESTIMATOR_API_NAME].names)
-_ESTIMATOR_API_ATTR = tf_export.API_ATTRS[tf_export.ESTIMATOR_API_NAME].names
-
-
-def get_v1_names(symbol):
-  names_v1 = []
-  if hasattr(symbol, _TENSORFLOW_API_ATTR_V1):
-    names_v1.extend(getattr(symbol, _TENSORFLOW_API_ATTR_V1))
-  if hasattr(symbol, _ESTIMATOR_API_ATTR_V1):
-    names_v1.extend(getattr(symbol, _ESTIMATOR_API_ATTR_V1))
-  return names_v1
-
-
-def get_v2_names(symbol):
-  names_v2 = set()
-  if hasattr(symbol, _TENSORFLOW_API_ATTR):
-    names_v2.update(getattr(symbol, _TENSORFLOW_API_ATTR))
-  if hasattr(symbol, _ESTIMATOR_API_ATTR):
-    names_v2.update(getattr(symbol, _ESTIMATOR_API_ATTR))
-  return list(names_v2)
-
-
 def get_symbol_for_name(root, name):
   name_parts = name.split(".")
   symbol = root
@@ -112,18 +86,30 @@ class TestUpgrade(test_util.TensorFlowTestCase):
   @classmethod
   def setUpClass(cls):
     cls.v2_symbols = {}
-    if not hasattr(tf.compat, "v2"):
-      return
+    cls.v1_symbols = {}
+    if hasattr(tf.compat, "v2"):
 
-    def symbol_collector(unused_path, unused_parent, children):
-      for child in children:
-        _, attr = tf_decorator.unwrap(child[1])
-        api_names_v2 = get_v2_names(attr)
-        for name in api_names_v2:
-          cls.v2_symbols["tf." + name] = attr
+      def symbol_collector(unused_path, unused_parent, children):
+        for child in children:
+          _, attr = tf_decorator.unwrap(child[1])
+          api_names_v2 = tf_export.get_v2_names(attr)
+          for name in api_names_v2:
+            cls.v2_symbols["tf." + name] = attr
+
+      visitor = public_api.PublicAPIVisitor(symbol_collector)
+      traverse.traverse(tf.compat.v2, visitor)
+
+    if hasattr(tf.compat, "v1"):
 
-    visitor = public_api.PublicAPIVisitor(symbol_collector)
-    traverse.traverse(tf.compat.v2, visitor)
+      def symbol_collector_v1(unused_path, unused_parent, children):
+        for child in children:
+          _, attr = tf_decorator.unwrap(child[1])
+          api_names_v1 = tf_export.get_v1_names(attr)
+          for name in api_names_v1:
+            cls.v1_symbols["tf." + name] = attr
+
+      visitor = public_api.PublicAPIVisitor(symbol_collector_v1)
+      traverse.traverse(tf.compat.v1, visitor)
 
   def _upgrade(self, old_file_text):
     in_file = six.StringIO(old_file_text)
@@ -140,12 +126,12 @@ class TestUpgrade(test_util.TensorFlowTestCase):
     self.assertTrue(report.find("Failed to parse") != -1)
 
   def testReport(self):
-    text = "tf.assert_near(a)\n"
+    text = "tf.angle(a)\n"
     _, report, unused_errors, unused_new_text = self._upgrade(text)
     # This is not a complete test, but it is a sanity test that a report
     # is generating information.
-    self.assertTrue(report.find("Renamed function `tf.assert_near` to "
-                                "`tf.debugging.assert_near`"))
+    self.assertTrue(report.find("Renamed function `tf.angle` to "
+                                "`tf.math.angle`"))
 
   def testRename(self):
     text = "tf.conj(a)\n"
@@ -166,12 +152,16 @@ class TestUpgrade(test_util.TensorFlowTestCase):
     def conversion_visitor(unused_path, unused_parent, children):
       for child in children:
         _, attr = tf_decorator.unwrap(child[1])
-        api_names = get_v1_names(attr)
+        api_names = tf_export.get_v1_names(attr)
         for name in api_names:
           _, _, _, text = self._upgrade("tf." + name)
           if (text and
               not text.startswith("tf.compat.v1") and
-              text not in self.v2_symbols):
+              not text.startswith("tf.compat.v2") and
+              text not in self.v2_symbols and
+              # Builds currently install old version of estimator that doesn't
+              # have some 2.0 symbols.
+              not text.startswith("tf.estimator")):
             self.assertFalse(
                 True, "Symbol %s generated from %s not in v2 API" % (
                     text, name))
@@ -190,7 +180,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
     def conversion_visitor(unused_path, unused_parent, children):
       for child in children:
         _, attr = tf_decorator.unwrap(child[1])
-        api_names = get_v1_names(attr)
+        api_names = tf_export.get_v1_names(attr)
         for name in api_names:
           if collect:
             v1_symbols.add("tf." + name)
@@ -198,6 +188,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
             _, _, _, text = self._upgrade("tf." + name)
             if (text and
                 not text.startswith("tf.compat.v1") and
+                not text.startswith("tf.compat.v2") and
                 not text.startswith("tf.estimator") and
                 text not in v1_symbols):
               self.assertFalse(
@@ -219,7 +210,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
     def arg_test_visitor(unused_path, unused_parent, children):
       for child in children:
         _, attr = tf_decorator.unwrap(child[1])
-        names_v1 = get_v1_names(attr)
+        names_v1 = tf_export.get_v1_names(attr)
 
         for name in names_v1:
           name = "tf.%s" % name
@@ -259,8 +250,8 @@ class TestUpgrade(test_util.TensorFlowTestCase):
     }
     function_warnings = (
         tf_upgrade_v2.TFAPIChangeSpec().function_warnings)
-    function_handles = (
-        tf_upgrade_v2.TFAPIChangeSpec().function_handle)
+    function_transformers = (
+        tf_upgrade_v2.TFAPIChangeSpec().function_transformers)
     keyword_renames = (
         tf_upgrade_v2.TFAPIChangeSpec().function_keyword_renames)
 
@@ -270,12 +261,12 @@ class TestUpgrade(test_util.TensorFlowTestCase):
         _, attr = tf_decorator.unwrap(child[1])
         if not tf_inspect.isfunction(attr):
           continue
-        names_v1 = get_v1_names(attr)
+        names_v1 = tf_export.get_v1_names(attr)
         arg_names_v1 = get_args(attr)
 
         for name in names_v1:
           tf_name = "tf.%s" % name
-          if tf_name in function_warnings or tf_name in function_handles:
+          if tf_name in function_warnings or tf_name in function_transformers:
             continue  # These require manual change
           if tf_name in v1_name_exceptions:
             continue
@@ -298,6 +289,10 @@ class TestUpgrade(test_util.TensorFlowTestCase):
                   "Function '%s' is not in 2.0 when converting\n%s\nto\n%s" %
                   (new_function_name, text_input, text))
             continue
+          if new_function_name.startswith("tf.compat.v2"):
+            self.assertIn(new_function_name.replace("tf.compat.v2.", "tf."),
+                          self.v2_symbols)
+            continue
           # 3. Verify V2 function and arguments.
           args_v2 = get_args(self.v2_symbols[new_function_name])
           args_v2.extend(v2_arg_exceptions)
@@ -307,17 +302,49 @@ class TestUpgrade(test_util.TensorFlowTestCase):
                 "Invalid argument '%s' in 2.0 when converting\n%s\nto\n%s.\n"
                 "Supported arguments: %s" % (
                     new_arg, text_input, text, str(args_v2)))
+          # 4. Verify that the argument exists in v1 as well.
+          if new_function_name in set(["tf.nn.ctc_loss",
+                                       "tf.saved_model.save"]):
+            continue
+          args_v1 = get_args(self.v1_symbols[new_function_name])
+          args_v1.extend(v2_arg_exceptions)
+          for new_arg in new_args:
+            self.assertIn(
+                new_arg, args_v1,
+                "Invalid argument '%s' in 1.0 when converting\n%s\nto\n%s.\n"
+                "Supported arguments: %s" % (
+                    new_arg, text_input, text, str(args_v1)))
 
     visitor = public_api.PublicAPIVisitor(conversion_visitor)
     visitor.do_not_descend_map["tf"].append("contrib")
     visitor.private_map["tf.compat"] = ["v1", "v2"]
     traverse.traverse(tf.compat.v1, visitor)
 
+  def testPositionsMatchArgGiven(self):
+    full_dict = tf_upgrade_v2.TFAPIChangeSpec().function_arg_warnings
+    method_names = full_dict.keys()
+    for method_name in method_names:
+      args = full_dict[method_name].keys()
+      # special case for optimizer methods
+      if method_name.startswith("*."):
+        method = method_name.replace("*", "tf.train.Optimizer")
+      else:
+        method = method_name
+      method = get_symbol_for_name(tf, method)
+      arg_spec = tf_inspect.getfullargspec(method)
+      for (arg, pos) in args:
+        # to deal with the self argument on methods on objects
+        if method_name.startswith("*."):
+          pos += 1
+        self.assertEqual(arg_spec[0][pos], arg)
+
   def testReorderFileNeedsUpdate(self):
     reordered_function_names = (
         tf_upgrade_v2.TFAPIChangeSpec().reordered_function_names)
     function_reorders = (
         tf_upgrade_v2.TFAPIChangeSpec().function_reorders)
+    manual_function_reorders = (
+        tf_upgrade_v2.TFAPIChangeSpec().manual_function_reorders)
 
     added_names_message = """Some function names in
 self.reordered_function_names are not in reorders_v2.py.
@@ -337,10 +364,12 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
     # function_reorders should contain reordered_function_names
     # and their TensorFlow V1 aliases.
     for name in function_reorders:
+      if name in manual_function_reorders:
+        continue
       # get other names for this function
       attr = get_symbol_for_name(tf.compat.v1, name)
       _, attr = tf_decorator.unwrap(attr)
-      v1_names = get_v1_names(attr)
+      v1_names = tf_export.get_v1_names(attr)
       self.assertTrue(v1_names)
       v1_names = ["tf.%s" % n for n in v1_names]
       # check if any other name is in
@@ -378,20 +407,83 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
                   "tf.train.inverse_time_decay", "tf.train.cosine_decay",
                   "tf.train.cosine_decay_restarts",
                   "tf.train.linear_cosine_decay",
-                  "tf.train.noisy_linear_cosine_decay"]:
+                  "tf.train.noisy_linear_cosine_decay",
+                  "tf.train.piecewise_constant_decay",
+                 ]:
 
       text = "%s(a, b)\n" % decay
-      _, report, errors, _ = self._upgrade(text)
-      self.assertEqual(errors, ["test.py:1: %s requires manual check." % decay])
-      self.assertIn("%s has been changed" % decay, report)
-
-  def testPiecewiseDecay(self):
-    text = "tf.train.piecewise_constant_decay(a, b)\n"
-    _, report, errors, _ = self._upgrade(text)
-    self.assertEqual(
-        errors,
-        ["test.py:1: tf.train.piecewise_constant_decay requires manual check."])
-    self.assertIn("tf.train.piecewise_constant_decay has been changed", report)
+      _, report, unused_errors, _ = self._upgrade(text)
+      self.assertIn("switch to the schedules in "
+                    "`tf.keras.optimizers.schedules`", report)
+
+  def testMetrics(self):
+    metrics = [
+        "accuracy",
+        "auc",
+        "average_precision_at_k",
+        "false_negatives",
+        "false_negatives_at_thresholds",
+        "false_positives",
+        "false_positives_at_thresholds",
+        "mean",
+        "mean_absolute_error",
+        "mean_cosine_distance",
+        "mean_iou",
+        "mean_per_class_accuracy",
+        "mean_relative_error",
+        "mean_squared_error",
+        "mean_tensor",
+        "percentage_below",
+        "precision",
+        "precision_at_k",
+        "precision_at_thresholds",
+        "precision_at_top_k",
+        "recall",
+        "recall_at_k",
+        "recall_at_thresholds",
+        "recall_at_top_k",
+        "root_mean_squared_error",
+        "sensitivity_at_specificity",
+        "sparse_average_precision_at_k",
+        "sparse_precision_at_k",
+        "specificity_at_sensitivity",
+        "true_negatives",
+        "true_negatives_at_thresholds",
+        "true_positives",
+        "true_positives_at_thresholds",
+    ]
+    for m in metrics:
+      text = "tf.metrics." + m + "(a, b)"
+      _, report, unused_errors, new_text = self._upgrade(text)
+      self.assertEqual("tf.compat.v1.metrics." + m + "(a, b)", new_text)
+      self.assertIn(
+          "tf.metrics have been replaced with object oriented versions", report)
+
+  def testLosses(self):
+    losses = [
+        "absolute_difference",
+        "add_loss",
+        "compute_weighted_loss",
+        "cosine_distance",
+        "get_losses",
+        "get_regularization_loss",
+        "get_regularization_losses",
+        "get_total_loss",
+        "hinge_loss",
+        "huber_loss",
+        "log_loss",
+        "mean_pairwise_squared_error",
+        "mean_squared_error",
+        "sigmoid_cross_entropy",
+        "softmax_cross_entropy",
+        "sparse_softmax_cross_entropy",
+    ]
+    for l in losses:
+      text = "tf.losses." + l + "(a, b)"
+      _, report, unused_errors, new_text = self._upgrade(text)
+      self.assertEqual("tf.compat.v1.losses." + l + "(a, b)", new_text)
+      self.assertIn(
+          "tf.losses have been replaced with object oriented versions", report)
 
   def testEstimatorLossReductionChange(self):
     classes = [
@@ -404,26 +496,94 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
       text = ns + "(a, b)"
       _, report, errors, new_text = self._upgrade(text)
       self.assertEqual(text, new_text)
-      self.assertEqual(errors, ["test.py:1: %s requires manual check." % ns])
+      self.assertIn("%s requires manual check" % ns, errors[0])
       self.assertIn("loss_reduction has been changed", report)
 
+  def testExtractGlimpse(self):
+    text = ("tf.image.extract_glimpse(x, size, off, False, "
+            "False, False, name=\"foo\")\n")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(
+        new_text,
+        "tf.image.extract_glimpse(x, size, off, False, "
+        "False, 'uniform' if (False) else 'gaussian', name=\"foo\")\n",
+    )
+
+    text = ("tf.image.extract_glimpse(x, size, off, centered=False, "
+            "normalized=False, uniform_noise=True if uniform_noise else "
+            "False, name=\"foo\")\n")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(
+        new_text,
+        "tf.image.extract_glimpse(x, size, off, centered=False, "
+        "normalized=False, noise='uniform' if (True if uniform_noise else "
+        "False) else 'gaussian', name=\"foo\")\n",
+    )
+
+    text = ("tf.image.extract_glimpse(x,\n"
+            "                         size,\n"
+            "                         off,\n"
+            "                         centered=True,\n"
+            "                         normalized=True, # Stuff before\n"
+            "                         uniform_noise=False,\n"
+            "                         name=\"foo\")# Stuff after\n")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(
+        new_text, "tf.image.extract_glimpse(x,\n"
+        "                         size,\n"
+        "                         off,\n"
+        "                         centered=True,\n"
+        "                         normalized=True, # Stuff before\n"
+        "                         noise='uniform' if (False) else 'gaussian',\n"
+        "                         name=\"foo\")# Stuff after\n")
+
+    text = "tf.image.extract_glimpse(x)\n"
+    _, unused_report, errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, text)
+    self.assertEqual(errors, [])
+
   def testDropout(self):
     text = "tf.nn.dropout(x, keep_prob, name=\"foo\")\n"
     _, unused_report, unused_errors, new_text = self._upgrade(text)
     self.assertEqual(
         new_text,
-        "tf.nn.dropout(x, 1 - keep_prob, name=\"foo\")\n",
+        "tf.nn.dropout(x, 1 - (keep_prob), name=\"foo\")\n",
+    )
+
+    text = "tf.nn.dropout(x, keep_prob=.4, name=\"foo\")\n"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(
+        new_text,
+        "tf.nn.dropout(x, rate=1 - (.4), name=\"foo\")\n",
+    )
+
+    text = (
+        "tf.nn.dropout(x,  # Stuff before\n"
+        "              keep_prob=.4,  # Stuff after\n"
+        "              name=\"foo\")\n"
+    )
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(
+        new_text,
+        "tf.nn.dropout(x,  # Stuff before\n"
+        "              rate=1 - (.4),  # Stuff after\n"
+        "              name=\"foo\")\n",
     )
 
     text = "tf.nn.dropout(x)\n"
     _, unused_report, errors, new_text = self._upgrade(text)
     self.assertEqual(new_text, text)
+    self.assertIn("tf.nn.dropout called without arguments", errors[0])
+
+  def testDropoutExpr(self):
+    text = "tf.nn.dropout(x, 1 - func(3 + 4.), name=\"foo\")\n"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
     self.assertEqual(
-        errors,
-        ["test.py:1: tf.nn.dropout requires manual check."]
+        new_text,
+        "tf.nn.dropout(x, 1 - (1 - func(3 + 4.)), name=\"foo\")\n",
     )
 
-  def testCountNonZeroChanges(self):
+  def testMathCountNonZeroChanges(self):
     text = (
         "tf.math.count_nonzero(input_tensor=input, dtype=dtype, name=name, "
         "reduction_indices=axis, keep_dims=keepdims)\n"
@@ -435,6 +595,18 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
         )
     self.assertEqual(new_text, expected_text)
 
+  def testCountNonZeroChanges(self):
+    text = (
+        "tf.count_nonzero(input_tensor=input, dtype=dtype, name=name, "
+        "reduction_indices=axis, keep_dims=keepdims)\n"
+        )
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    expected_text = (
+        "tf.math.count_nonzero(input=input, dtype=dtype, name=name, "
+        "axis=axis, keepdims=keepdims)\n"
+        )
+    self.assertEqual(new_text, expected_text)
+
   def testRandomMultinomialToRandomCategorical(self):
     text = (
         "tf.random.multinomial(logits, samples, seed, name, output_dtype)\n"
@@ -456,6 +628,15 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
         )
     self.assertEqual(new_text, expected_text)
 
+  def testRandomPoissonConversion(self):
+    text1 = "tf.random_poisson(lam, shape, dtype)"
+    text2 = "tf.random.poisson(lam, shape, dtype)"
+    expected_text = "tf.random.poisson(lam=lam, shape=shape, dtype=dtype)"
+    _, unused_report, unused_errors, new_text1 = self._upgrade(text1)
+    self.assertEqual(new_text1, expected_text)
+    _, unused_report, unused_errors, new_text2 = self._upgrade(text2)
+    self.assertEqual(new_text2, expected_text)
+
   def testConvolutionOpUpdate(self):
     text = (
         "tf.nn.convolution(input, filter, padding, strides, dilation_rate, "
@@ -469,39 +650,51 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
     )
     self.assertEqual(new_text, expected_text)
 
+  def test_substr(self):
+    text = "tf.substr(input, pos, len, name, unit)\n"
+    _, unused_report, errors, new_text = self._upgrade(text)
+    self.assertEqual("tf.strings.substr(input=input, pos=pos, len=len, "
+                     "name=name, unit=unit)\n", new_text)
+    self.assertEqual(errors, [])
+
   def testColocateGradientsWithOps(self):
-    text = "tf.gradients(a, foo=False)\n"
+    text = "tf.gradients(yx=a, foo=False)\n"
     _, unused_report, errors, new_text = self._upgrade(text)
     self.assertEqual(text, new_text)
     self.assertEqual(errors, [])
 
-    text = "tf.gradients(a, colocate_gradients_with_ops=False)\n"
+    text = "tf.gradients(yx=a, colocate_gradients_with_ops=False)\n"
+    _, report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual("tf.gradients(yx=a)\n", new_text)
+    self.assertIn("tf.gradients no longer takes", report)
+
+    text = "tf.gradients(y, x, grad_ys, name, colocate, gate)\n"
+    expected = ("tf.gradients(ys=y, xs=x, grad_ys=grad_ys, name=name, "
+                "gate_gradients=gate)\n")
     _, unused_report, errors, new_text = self._upgrade(text)
-    self.assertEqual(text, new_text)
-    self.assertEqual(errors, ["test.py:1: tf.gradients requires manual check."])
+    self.assertEqual(expected, new_text)
 
+  def testColocateGradientsWithOpsMinimize(self):
     text = "optimizer.minimize(a, foo=False)\n"
     _, unused_report, errors, new_text = self._upgrade(text)
     self.assertEqual(text, new_text)
     self.assertEqual(errors, [])
 
     text = "optimizer.minimize(a, colocate_gradients_with_ops=False)\n"
-    _, unused_report, errors, new_text = self._upgrade(text)
-    self.assertEqual(text, new_text)
-    self.assertEqual(errors,
-                     ["test.py:1: Optimizer.minimize requires manual check."])
+    _, report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual("optimizer.minimize(a)\n", new_text)
+    self.assertIn("Optimizer.minimize no longer takes", report)
 
+  def testColocateGradientsWithOpsComputeGradients(self):
     text = "optimizer.compute_gradients(a, foo=False)\n"
     _, unused_report, errors, new_text = self._upgrade(text)
     self.assertEqual(text, new_text)
     self.assertEqual(errors, [])
 
     text = "optimizer.compute_gradients(a, colocate_gradients_with_ops=False)\n"
-    _, unused_report, errors, new_text = self._upgrade(text)
-    self.assertEqual(text, new_text)
-    self.assertEqual(errors,
-                     ["test.py:1: Optimizer.compute_gradients "
-                      "requires manual check."])
+    _, report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual("optimizer.compute_gradients(a)\n", new_text)
+    self.assertIn("Optimizer.compute_gradients no longer takes", report)
 
   def testExportSavedModelRename(self):
     text = "self.est.export_savedmodel(path)"
@@ -542,6 +735,17 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
     _, unused_report, unused_errors, new_text = self._upgrade(text)
     self.assertEqual(new_text, expected_text)
 
+  def testEstimatorInputs(self):
+    text = "tf.estimator.inputs.numpy_input_fn(0)"
+    expected_text = "tf.compat.v1.estimator.inputs.numpy_input_fn(0)"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+    text = "tf.estimator.inputs.pandas_input_fn(0)"
+    expected_text = "tf.compat.v1.estimator.inputs.pandas_input_fn(0)"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
   def testBatchToSpace(self):
     text = "tf.batch_to_space_nd(input, block_shape, crops, name)"
     expected_text = "tf.batch_to_space(input, block_shape, crops, name)"
@@ -570,6 +774,16 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
     _, unused_report, unused_errors, new_text = self._upgrade(text)
     self.assertEqual(new_text, expected_text)
 
+  def testKerasSavedModel(self):
+    text = (
+        "tf.contrib.saved_model.save_keras_model(model, './saved_models')\n"
+        "tf.contrib.saved_model.load_keras_model(saved_model_path)\n")
+    expected_text = (
+        "tf.keras.experimental.export_saved_model(model, './saved_models')\n"
+        "tf.keras.experimental.load_from_saved_model(saved_model_path)\n")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
   def testStatelessMultinomial(self):
     text = (
         "tf.random.stateless_multinomial(logits, num_samples, seed, "
@@ -581,26 +795,62 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
     self.assertEqual(new_text, expected_text)
 
   def testSoftMaxCrossEntropyWithLogitsV2(self):
-    text = "tf.nn.softmax_cross_entropy_with_logits_v2(labels, logits, dim=2)"
+    text = (
+        "tf.nn.softmax_cross_entropy_with_logits_v2("
+        "labels=labels, logits=logits, dim=2)")
     expected_text = (
-        "tf.nn.softmax_cross_entropy_with_logits(labels, logits, axis=2)")
+        "tf.nn.softmax_cross_entropy_with_logits("
+        "labels=labels, logits=logits, axis=2)")
     _, unused_report, errors, new_text = self._upgrade(text)
     self.assertEqual(new_text, expected_text)
 
     self.assertFalse(errors)
 
   def testSoftMaxCrossEntropyWithLogits(self):
-    text = "tf.nn.softmax_cross_entropy_with_logits(labels, logits, dim=2)"
+    text = ("tf.nn.softmax_cross_entropy_with_logits("
+            "labels=labels, logits=logits, dim=2)")
     expected_text = (
-        "tf.nn.softmax_cross_entropy_with_logits(labels, logits, dim=2)")
-    _, report, errors, new_text = self._upgrade(text)
+        "tf.nn.softmax_cross_entropy_with_logits("
+        "labels=tf.stop_gradient(labels), logits=logits, axis=2)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
     self.assertEqual(new_text, expected_text)
-    self.assertIn(
-        "tf.nn.softmax_cross_entropy_with_logits requires manual check.",
-        errors[0])
-    self.assertIn(
-        "tf.nn.softmax_cross_entropy_with_logits behavior has changed. ",
-        report)
+
+    text = ("tf.nn.softmax_cross_entropy_with_logits("
+            "labels=foo(bar))")
+    expected_text = ("tf.nn.softmax_cross_entropy_with_logits("
+                     "labels=tf.stop_gradient(foo(bar)))")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(expected_text, new_text)
+
+  def testSoftMaxCrossEntropyWithLogitsDoesntNest(self):
+    text = ("tf.nn.softmax_cross_entropy_with_logits("
+            "labels=tf.stop_gradient(labels), logits=logits, dim=2)")
+    expected_text = (
+        "tf.nn.softmax_cross_entropy_with_logits("
+        "labels=tf.stop_gradient(labels), logits=logits, axis=2)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+    text = ("tf.nn.softmax_cross_entropy_with_logits("
+            "labels=tf.stop_gradient(foo(bar)))")
+    expected_text = ("tf.nn.softmax_cross_entropy_with_logits("
+                     "labels=tf.stop_gradient(foo(bar)))")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(expected_text, new_text)
+
+    text = ("tf.nn.softmax_cross_entropy_with_logits("
+            "labels=foo())")
+    expected_text = ("tf.nn.softmax_cross_entropy_with_logits("
+                     "labels=tf.stop_gradient(foo()))")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(expected_text, new_text)
+
+    text = ("tf.nn.softmax_cross_entropy_with_logits("
+            "labels=foo().zz())")
+    expected_text = ("tf.nn.softmax_cross_entropy_with_logits("
+                     "labels=tf.stop_gradient(foo().zz()))")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(expected_text, new_text)
 
   def testSparseMatmul(self):
     text = ("tf.sparse_matmul(a, b, c, d, e, f, g)\n")
@@ -640,6 +890,45 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
     _, unused_report, unused_errors, new_text = self._upgrade(text)
     self.assertEqual(new_text, expected_text)
 
+  def testConv2D(self):
+    text = (
+        "tf.nn.conv2d(input, filter, strides, padding, use_cudnn_on_gpu, "
+        "data_format)")
+    expected_text = (
+        "tf.nn.conv2d(input=input, filters=filter, strides=strides, "
+        "padding=padding, data_format=data_format)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+    text = (
+        "tf.nn.conv2d(input, filter=filter, strides=strides, padding=padding, "
+        "use_cudnn_on_gpu=use_cudnn_on_gpu)")
+    expected_text = ("tf.nn.conv2d(input=input, filters=filter, "
+                     "strides=strides, padding=padding)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testConv2DBackpropFilter(self):
+    text = (
+        "tf.nn.conv2d_backprop_filter(input, filter_sizes, out_backprop, "
+        "strides, padding, use_cudnn_on_gpu, data_format)")
+    expected_text = (
+        "tf.compat.v1.nn.conv2d_backprop_filter(input, filter_sizes, "
+        "out_backprop, strides, padding, use_cudnn_on_gpu, data_format)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testConv2DBackpropInput(self):
+    text = (
+        "tf.nn.conv2d_backprop_input(input_sizes, filter, out_backprop, "
+        "strides, padding, use_cudnn_on_gpu, data_format)")
+    expected_text = (
+        "tf.nn.conv2d_transpose(output_shape=input_sizes, filters=filter, "
+        "input=out_backprop, strides=strides, padding=padding, "
+        "data_format=data_format)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
   def testSpacetoBatch(self):
     text = "tf.space_to_batch_nd(input, shape, paddings, name)"
     expected_text = "tf.space_to_batch(input, shape, paddings, name)"
@@ -673,7 +962,7 @@ bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
             "validate_indices, max_norm)")
     expected_text = ("tf.nn.embedding_lookup(params=params, ids=ids, "
                      "partition_strategy=partition_strategy, name=name, "
-                     "validate_indices=validate_indices, max_norm=max_norm)")
+                     "max_norm=max_norm)")
     _, unused_report, unused_errors, new_text = self._upgrade(text)
     self.assertEqual(new_text, expected_text)
 
@@ -738,6 +1027,322 @@ tf.print('abc')
     _, unused_report, unused_errors, new_text = self._upgrade(text)
     self.assertEqual(new_text, expected_text)
 
+  def testIterators(self):
+    for (text, expected) in [
+        ("(expr + yielding(data)).make_one_shot_iterator()",
+         "tf.compat.v1.data.make_one_shot_iterator((expr + yielding(data)))"),
+        ("dataset.make_one_shot_iterator()",
+         "tf.compat.v1.data.make_one_shot_iterator(dataset)"),
+        ("dataset.make_one_shot_iterator(shared_name=foo)",
+         "tf.compat.v1.data.make_one_shot_iterator(dataset, shared_name=foo)"),
+        ("dataset.make_one_shot_iterator(x, y, z)",
+         "tf.compat.v1.data.make_one_shot_iterator(dataset, x, y, z)"),
+        ("dataset.make_initializable_iterator()",
+         "tf.compat.v1.data.make_initializable_iterator(dataset)"),
+        ("ds.make_initializable_iterator(shared_name=foo)",
+         "tf.compat.v1.data.make_initializable_iterator(ds, shared_name=foo)"),
+        ("dataset.make_initializable_iterator(x, y, z)",
+         "tf.compat.v1.data.make_initializable_iterator(dataset, x, y, z)"),
+        ("tf.data.make_one_shot_iterator(dataset)",
+         "tf.compat.v1.data.make_one_shot_iterator(dataset)"),
+        ("tf.data.make_one_shot_iterator(dataset, shared_name=foo)",
+         "tf.compat.v1.data.make_one_shot_iterator(dataset, shared_name=foo)"),
+        ("tf.data.make_one_shot_iterator(dataset, x, y, z)",
+         "tf.compat.v1.data.make_one_shot_iterator(dataset, x, y, z)"),
+        ("tf.data.make_initializable_iterator(dataset)",
+         "tf.compat.v1.data.make_initializable_iterator(dataset)"),
+        ("tf.data.make_initializable_iterator(ds, shared_name=foo)",
+         "tf.compat.v1.data.make_initializable_iterator(ds, shared_name=foo)"),
+        ("tf.data.make_initializable_iterator(dataset, x, y, z)",
+         "tf.compat.v1.data.make_initializable_iterator(dataset, x, y, z)"),
+        ("tf.compat.v1.data.make_one_shot_iterator(dataset)",
+         "tf.compat.v1.data.make_one_shot_iterator(dataset)"),
+        ("tf.compat.v1.data.make_one_shot_iterator(dataset, shared_name=foo)",
+         "tf.compat.v1.data.make_one_shot_iterator(dataset, shared_name=foo)"),
+        ("tf.compat.v1.data.make_one_shot_iterator(dataset, x, y, z)",
+         "tf.compat.v1.data.make_one_shot_iterator(dataset, x, y, z)"),
+        ("tf.compat.v1.data.make_initializable_iterator(dataset)",
+         "tf.compat.v1.data.make_initializable_iterator(dataset)"),
+        ("tf.compat.v1.data.make_initializable_iterator(ds, shared_name=foo)",
+         "tf.compat.v1.data.make_initializable_iterator(ds, shared_name=foo)"),
+        ("tf.compat.v1.data.make_initializable_iterator(dataset, x, y, z)",
+         "tf.compat.v1.data.make_initializable_iterator(dataset, x, y, z)")]:
+      _, unused_report, unused_errors, actual = self._upgrade(text)
+      self.assertEqual(actual, expected)
+
+  def testMapAndBatch(self):
+    suffix = ".data.experimental.map_and_batch_with_legacy_function(args)"
+    text = "tf" + suffix
+    expected = "tf.compat.v1" + suffix
+    _, unused_report, unused_errors, actual = self._upgrade(text)
+    self.assertEqual(actual, expected)
+
+  def testCast(self):
+    for (name, dtype) in [("int32", "int32"),
+                          ("int64", "int64"),
+                          ("float", "float32"),
+                          ("double", "float64"),
+                          ("complex64", "complex64"),
+                          ("complex128", "complex128"),
+                          ("bfloat16", "bfloat16")]:
+      text = "tf.to_%s(x, name='test')" % name
+      expected_text = "tf.cast(x, name='test', dtype=tf.%s)" % dtype
+      _, unused_report, unused_errors, new_text = self._upgrade(text)
+      self.assertEqual(expected_text, new_text)
+
+  def testCastPositionalSecondArgument(self):
+    for (name, dtype) in [("int32", "int32"),
+                          ("int64", "int64"),
+                          ("float", "float32"),
+                          ("double", "float64"),
+                          ("complex64", "complex64"),
+                          ("complex128", "complex128"),
+                          ("bfloat16", "bfloat16")]:
+      text = "tf.to_%s(x, 'test')" % name
+      expected_text = "tf.cast(x, name='test', dtype=tf.%s)" % dtype
+      _, unused_report, unused_errors, new_text = self._upgrade(text)
+      self.assertEqual(expected_text, new_text)
+
+  def testImageResize(self):
+    for method in ["bilinear", "area", "bicubic", "nearest_neighbor"]:
+      text = "tf.image.resize_%s(i, s)" % method
+      expected_text = ("tf.image.resize(i, s, "
+                       "method=tf.image.ResizeMethod.%s)" % method.upper())
+      _, unused_report, unused_errors, new_text = self._upgrade(text)
+      self.assertEqual(expected_text, new_text)
+
+  def testImageResizeExtraPositionalArgs(self):
+    for method in ["bilinear", "area", "bicubic", "nearest_neighbor"]:
+      text = "tf.image.resize_%s(i, s, a, p)" % method
+      expected_text = ["tf.image.resize(i, s, ", "align_corners=a, ",
+                       "preserve_aspect_ratio=p, ",
+                       "method=tf.image.ResizeMethod.%s)" % method.upper()]
+      _, unused_report, unused_errors, new_text = self._upgrade(text)
+      for s in expected_text:
+        self.assertIn(s, new_text)
+
+  def testCond(self):
+    text = "tf.cond(a, b, c, True)"
+    expected_text = "tf.cond(pred=a, true_fn=b, false_fn=c)"
+    _, unused_report, errors, new_text = self._upgrade(text)
+    self.assertEqual(expected_text, new_text)
+    self.assertIn("tf.cond", errors[0])
+    self.assertIn("requires manual check", errors[0])
+
+  def testParens(self):
+    text = """
+def _log_prob(self, x):
+  return tf.reduce_logsumexp(
+      (self.mixture_distribution.logits + self.distribution.log_prob(
+          x[..., tf.newaxis])),
+          axis=-1)"""
+    expected_text = """
+def _log_prob(self, x):
+  return tf.reduce_logsumexp(
+      input_tensor=(self.mixture_distribution.logits + self.distribution.log_prob(
+          x[..., tf.newaxis])),
+          axis=-1)"""
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(expected_text, new_text)
+
+  def testAssertStatements(self):
+    for name in ["assert_greater", "assert_equal", "assert_none_equal",
+                 "assert_less", "assert_negative", "assert_positive",
+                 "assert_non_negative", "assert_non_positive", "assert_near",
+                 "assert_less", "assert_less_equal", "assert_greater",
+                 "assert_greater_equal", "assert_integer", "assert_type",
+                 "assert_scalar"]:
+      text = "tf.%s(a)" % name
+      expected_text = "tf.compat.v1.%s(a)" % name
+      _, report, unused_errors, new_text = self._upgrade(text)
+      self.assertEqual(expected_text, new_text)
+      self.assertIn("%s has been" % name, report)
+
+      text = "tf.debugging.%s(a)" % name
+      expected_text = "tf.compat.v1.debugging.%s(a)" % name
+      _, report, unused_errors, new_text = self._upgrade(text)
+      self.assertEqual(expected_text, new_text)
+      self.assertIn("%s has been" % name, report)
+
+  def testAssertRankStatements(self):
+    for name in ["assert_rank", "assert_rank_at_least", "assert_rank_in"]:
+      text = "tf.%s(a)" % name
+      expected_text = "tf.compat.v1.%s(a)" % name
+      _, report, unused_errors, new_text = self._upgrade(text)
+      self.assertEqual(expected_text, new_text)
+      self.assertIn("%s has been" % name, report)
+
+      text = "tf.debugging.%s(a)" % name
+      expected_text = "tf.compat.v1.debugging.%s(a)" % name
+      _, report, unused_errors, new_text = self._upgrade(text)
+      self.assertEqual(expected_text, new_text)
+      self.assertIn("%s has been" % name, report)
+
+  def test_assert_equal_graph_def(self):
+    text = "tf.test.assert_equal_graph_def(a, b, checkpoint_v2=x)"
+    expected = "tf.test.assert_equal_graph_def(actual=a, expected=b)"
+    _, _, _, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+
+  def test_is_tensor_upgrade(self):
+    text = "tf.contrib.framework.is_tensor(x)"
+    expected = "tf.is_tensor(x)"
+    _, _, _, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+
+  def test_CriticalSection_upgrade(self):
+    text = "tf.contrib.framework.CriticalSection(shared_name='blah')"
+    expected = "tf.CriticalSection(shared_name='blah')"
+    _, _, _, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+
+  def test_sample_distorted_bounding_box(self):
+    # pylint: disable=line-too-long
+    text = "tf.image.sample_distorted_bounding_box(a, b, c, d, e, f, g, h, i, j)"
+    expected = "tf.image.sample_distorted_bounding_box(image_size=a, bounding_boxes=b, seed=c, min_object_covered=e, aspect_ratio_range=f, area_range=g, max_attempts=h, use_image_if_no_bounding_boxes=i, name=j)"
+    # pylint: enable=line-too-long
+    _, _, _, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+
+  def test_contrib_initialize(self):
+    text = "tf.contrib.summary.initialize"
+    expected = "tf.compat.v1.summary.initialize"
+    _, _, _, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+
+  def test_contrib_framework_argsort(self):
+    text = "tf.contrib.framework.argsort"
+    expected = "tf.argsort"
+    # pylint: enable=line-too-long
+    _, _, _, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+
+  def test_flags_bare(self):
+    _, _, errors, _ = self._upgrade("tf.flags")
+    self.assertIn("tf.flags has been removed", errors[0])
+
+  def test_flags_flags(self):
+    _, _, errors, _ = self._upgrade("tf.flags.FLAGS")
+    self.assertIn("tf.flags has been removed", errors[0])
+
+  def test_max_pool_2d(self):
+    text = "tf.nn.max_pool(value=4)"
+    expected_text = "tf.nn.max_pool2d(input=4)"
+    _, _, _, new_text = self._upgrade(text)
+    self.assertEqual(expected_text, new_text)
+
+  def test_contrib_estimator_early_stopping(self):
+    api_symbols = [
+        "make_early_stopping_hook", "stop_if_higher_hook", "stop_if_lower_hook",
+        "stop_if_no_decrease_hook", "stop_if_no_increase_hook"
+    ]
+    for symbol in api_symbols:
+      text = "tf.contrib.estimator." + symbol
+      expected_text = "tf.estimator.experimental." + symbol
+      _, _, _, new_text = self._upgrade(text)
+      self.assertEqual(expected_text, new_text)
+
+  def test_contrib_rnn(self):
+    api_symbols = ["BasicLSTMCell", "BasicRNNCell", "GRUCell", "LSTMCell",
+                   "MultiRNNCell"]
+    for symbol in api_symbols:
+      text = "tf.contrib.rnn." + symbol
+      expected_text = "tf.compat.v1.nn.rnn_cell." + symbol
+      _, _, _, new_text = self._upgrade(text)
+      self.assertEqual(expected_text, new_text)
+
+  def test_contrib_summary_audio(self):
+    text = "tf.contrib.summary.audio('foo', myval, 44100, 3, 'fam', 42)"
+    expected = ("tf.compat.v2.summary.audio(name='foo', data=myval, "
+                "sample_rate=44100, max_outputs=3, step=42)")
+    _, _, errors, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+    self.assertIn("'family' argument", errors[0])
+    self.assertIn("Manual check required", errors[1])
+
+  def test_contrib_summary_histogram(self):
+    text = "tf.contrib.summary.histogram('foo', myval, 'fam', 42)"
+    expected = ("tf.compat.v2.summary.histogram(name='foo', data=myval, "
+                "step=42)")
+    _, _, errors, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+    self.assertIn("'family' argument", errors[0])
+    self.assertIn("Manual check required", errors[1])
+
+  def test_contrib_summary_image(self):
+    text = "tf.contrib.summary.image('foo', myval, red, 3, 'fam', 42)"
+    expected = ("tf.compat.v2.summary.image(name='foo', data=myval, "
+                "max_outputs=3, step=42)")
+    _, _, errors, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+    self.assertIn("'bad_color' argument", errors[0])
+    self.assertIn("'family' argument", errors[1])
+    self.assertIn("Manual check required", errors[2])
+
+  def test_contrib_summary_scalar(self):
+    text = "tf.contrib.summary.scalar('foo', myval, 'fam', 42)"
+    expected = ("tf.compat.v2.summary.scalar(name='foo', data=myval, "
+                "step=42)")
+    _, _, errors, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+    self.assertIn("'family' argument", errors[0])
+    self.assertIn("Manual check required", errors[1])
+
+  def test_contrib_summary_audio_nostep(self):
+    text = "tf.contrib.summary.audio('foo', myval, 44100)"
+    expected = ("tf.compat.v2.summary.audio(name='foo', data=myval, "
+                "sample_rate=44100, "
+                "step=tf.compat.v1.train.get_or_create_global_step())")
+    _, _, errors, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+    self.assertIn("'step' argument", errors[0])
+    self.assertIn("Manual check required", errors[1])
+
+  def test_contrib_summary_histogram_nostep(self):
+    text = "tf.contrib.summary.histogram('foo', myval)"
+    expected = ("tf.compat.v2.summary.histogram(name='foo', data=myval, "
+                "step=tf.compat.v1.train.get_or_create_global_step())")
+    _, _, errors, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+    self.assertIn("'step' argument", errors[0])
+    self.assertIn("Manual check required", errors[1])
+
+  def test_contrib_summary_image_nostep(self):
+    text = "tf.contrib.summary.image('foo', myval)"
+    expected = ("tf.compat.v2.summary.image(name='foo', data=myval, "
+                "step=tf.compat.v1.train.get_or_create_global_step())")
+    _, _, errors, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+    self.assertIn("'step' argument", errors[0])
+    self.assertIn("Manual check required", errors[1])
+
+  def test_contrib_summary_scalar_nostep(self):
+    text = "tf.contrib.summary.scalar('foo', myval)"
+    expected = ("tf.compat.v2.summary.scalar(name='foo', data=myval, "
+                "step=tf.compat.v1.train.get_or_create_global_step())")
+    _, _, errors, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+    self.assertIn("'step' argument", errors[0])
+    self.assertIn("Manual check required", errors[1])
+
+  def test_avg_pool_2d(self):
+    text = "tf.nn.avg_pool(value=4)"
+    expected_text = "tf.nn.avg_pool2d(input=4)"
+    _, _, _, new_text = self._upgrade(text)
+    self.assertEqual(expected_text, new_text)
+
+  def test_saved_model_load(self):
+    text = "tf.saved_model.load(sess, ['foo_graph'])"
+    expected = "tf.compat.v1.saved_model.load(sess, ['foo_graph'])"
+    _, _, _, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+
+  def test_saved_model_load_v2(self):
+    text = "tf.saved_model.load_v2('/tmp/blah')"
+    expected = "tf.compat.v2.saved_model.load('/tmp/blah')"
+    _, _, _, new_text = self._upgrade(text)
+    self.assertEqual(expected, new_text)
+
 
 class TestUpgradeFiles(test_util.TensorFlowTestCase):
 
diff --git a/tensorflow/tools/compatibility/update/generate_v2_renames_map.py b/tensorflow/tools/compatibility/update/generate_v2_renames_map.py
index 19ad6c3a2a5c723cbbff2c76c8bfe6517ca4a4f0..a2c5e7cf82dd8dfb5cb150a7e4e4a58a7a2e1631 100644
--- a/tensorflow/tools/compatibility/update/generate_v2_renames_map.py
+++ b/tensorflow/tools/compatibility/update/generate_v2_renames_map.py
@@ -64,58 +64,6 @@ from __future__ import print_function
 
 """
 
-_TENSORFLOW_API_ATTR_V1 = (
-    tf_export.API_ATTRS_V1[tf_export.TENSORFLOW_API_NAME].names)
-_TENSORFLOW_API_ATTR = tf_export.API_ATTRS[tf_export.TENSORFLOW_API_NAME].names
-_TENSORFLOW_CONSTANTS_ATTR_V1 = (
-    tf_export.API_ATTRS_V1[tf_export.TENSORFLOW_API_NAME].constants)
-_TENSORFLOW_CONSTANTS_ATTR = (
-    tf_export.API_ATTRS[tf_export.TENSORFLOW_API_NAME].constants)
-
-_ESTIMATOR_API_ATTR_V1 = (
-    tf_export.API_ATTRS_V1[tf_export.ESTIMATOR_API_NAME].names)
-_ESTIMATOR_API_ATTR = tf_export.API_ATTRS[tf_export.ESTIMATOR_API_NAME].names
-_ESTIMATOR_CONSTANTS_ATTR_V1 = (
-    tf_export.API_ATTRS_V1[tf_export.ESTIMATOR_API_NAME].constants)
-_ESTIMATOR_CONSTANTS_ATTR = (
-    tf_export.API_ATTRS[tf_export.ESTIMATOR_API_NAME].constants)
-
-
-def get_v1_names(symbol):
-  names_v1 = []
-  if hasattr(symbol, _TENSORFLOW_API_ATTR_V1):
-    names_v1.extend(getattr(symbol, _TENSORFLOW_API_ATTR_V1))
-  if hasattr(symbol, _ESTIMATOR_API_ATTR_V1):
-    names_v1.extend(getattr(symbol, _ESTIMATOR_API_ATTR_V1))
-  return names_v1
-
-
-def get_v2_names(symbol):
-  names_v2 = []
-  if hasattr(symbol, _TENSORFLOW_API_ATTR):
-    names_v2.extend(getattr(symbol, _TENSORFLOW_API_ATTR))
-  if hasattr(symbol, _ESTIMATOR_API_ATTR):
-    names_v2.extend(getattr(symbol, _ESTIMATOR_API_ATTR))
-  return list(names_v2)
-
-
-def get_v1_constants(module):
-  constants_v1 = []
-  if hasattr(module, _TENSORFLOW_CONSTANTS_ATTR_V1):
-    constants_v1.extend(getattr(module, _TENSORFLOW_CONSTANTS_ATTR_V1))
-  if hasattr(module, _ESTIMATOR_CONSTANTS_ATTR_V1):
-    constants_v1.extend(getattr(module, _ESTIMATOR_CONSTANTS_ATTR_V1))
-  return constants_v1
-
-
-def get_v2_constants(module):
-  constants_v2 = []
-  if hasattr(module, _TENSORFLOW_CONSTANTS_ATTR):
-    constants_v2.extend(getattr(module, _TENSORFLOW_CONSTANTS_ATTR))
-  if hasattr(module, _ESTIMATOR_CONSTANTS_ATTR):
-    constants_v2.extend(getattr(module, _ESTIMATOR_CONSTANTS_ATTR))
-  return constants_v2
-
 
 def get_canonical_name(v2_names, v1_name):
   if v2_names:
@@ -131,7 +79,7 @@ def get_all_v2_names():
     """Visitor that collects TF 2.0 names."""
     for child in children:
       _, attr = tf_decorator.unwrap(child[1])
-      api_names_v2 = get_v2_names(attr)
+      api_names_v2 = tf_export.get_v2_names(attr)
       for name in api_names_v2:
         v2_names.add(name)
 
@@ -149,8 +97,8 @@ def collect_constant_renames():
   """
   renames = set()
   for module in sys.modules.values():
-    constants_v1_list = get_v1_constants(module)
-    constants_v2_list = get_v2_constants(module)
+    constants_v1_list = tf_export.get_v1_constants(module)
+    constants_v2_list = tf_export.get_v2_constants(module)
 
     # _tf_api_constants attribute contains a list of tuples:
     # (api_names_list, constant_name)
@@ -186,8 +134,8 @@ def collect_function_renames():
     """Visitor that collects rename strings to add to rename_line_set."""
     for child in children:
       _, attr = tf_decorator.unwrap(child[1])
-      api_names_v1 = get_v1_names(attr)
-      api_names_v2 = get_v2_names(attr)
+      api_names_v1 = tf_export.get_v1_names(attr)
+      api_names_v2 = tf_export.get_v2_names(attr)
       deprecated_api_names = set(api_names_v1) - set(api_names_v2)
       for name in deprecated_api_names:
         renames.add((name, get_canonical_name(api_names_v2, name)))
diff --git a/tensorflow/tools/compatibility/update/generate_v2_reorders_map.py b/tensorflow/tools/compatibility/update/generate_v2_reorders_map.py
index 63541771bf36fb243ae241fbf1b4c4a83cf19fd7..0eb942d39617c7fe17bc62ff19c98047900d33cf 100644
--- a/tensorflow/tools/compatibility/update/generate_v2_reorders_map.py
+++ b/tensorflow/tools/compatibility/update/generate_v2_reorders_map.py
@@ -64,40 +64,6 @@ from __future__ import print_function
 
 """
 
-_TENSORFLOW_API_ATTR_V1 = (
-    tf_export.API_ATTRS_V1[tf_export.TENSORFLOW_API_NAME].names)
-_TENSORFLOW_API_ATTR = tf_export.API_ATTRS[tf_export.TENSORFLOW_API_NAME].names
-_TENSORFLOW_CONSTANTS_ATTR_V1 = (
-    tf_export.API_ATTRS_V1[tf_export.TENSORFLOW_API_NAME].constants)
-_TENSORFLOW_CONSTANTS_ATTR = (
-    tf_export.API_ATTRS[tf_export.TENSORFLOW_API_NAME].constants)
-
-_ESTIMATOR_API_ATTR_V1 = (
-    tf_export.API_ATTRS_V1[tf_export.ESTIMATOR_API_NAME].names)
-_ESTIMATOR_API_ATTR = tf_export.API_ATTRS[tf_export.ESTIMATOR_API_NAME].names
-_ESTIMATOR_CONSTANTS_ATTR_V1 = (
-    tf_export.API_ATTRS_V1[tf_export.ESTIMATOR_API_NAME].constants)
-_ESTIMATOR_CONSTANTS_ATTR = (
-    tf_export.API_ATTRS[tf_export.ESTIMATOR_API_NAME].constants)
-
-
-def get_v1_names(symbol):
-  names_v1 = []
-  if hasattr(symbol, _TENSORFLOW_API_ATTR_V1):
-    names_v1.extend(getattr(symbol, _TENSORFLOW_API_ATTR_V1))
-  if hasattr(symbol, _ESTIMATOR_API_ATTR_V1):
-    names_v1.extend(getattr(symbol, _ESTIMATOR_API_ATTR_V1))
-  return names_v1
-
-
-def get_v2_names(symbol):
-  names_v2 = []
-  if hasattr(symbol, _TENSORFLOW_API_ATTR):
-    names_v2.extend(getattr(symbol, _TENSORFLOW_API_ATTR))
-  if hasattr(symbol, _ESTIMATOR_API_ATTR):
-    names_v2.extend(getattr(symbol, _ESTIMATOR_API_ATTR))
-  return list(names_v2)
-
 
 def collect_function_arg_names(function_names):
   """Determines argument names for reordered function signatures.
@@ -115,7 +81,7 @@ def collect_function_arg_names(function_names):
     """Visitor that collects arguments for reordered functions."""
     for child in children:
       _, attr = tf_decorator.unwrap(child[1])
-      api_names_v1 = get_v1_names(attr)
+      api_names_v1 = tf_export.get_v1_names(attr)
       api_names_v1 = ['tf.%s' % name for name in api_names_v1]
       matches_function_names = any(
           name in function_names for name in api_names_v1)
diff --git a/tensorflow/tools/dist_test/README.md b/tensorflow/tools/dist_test/README.md
index 1e29977788176477492a03c4683cc489ec9fae44..6e7769b42aef040b93e1ed17f89417604a56c863 100644
--- a/tensorflow/tools/dist_test/README.md
+++ b/tensorflow/tools/dist_test/README.md
@@ -12,9 +12,8 @@ For example:
 
     ./local_test.sh
 
-By default, local_test.sh runs the MNIST-with-replicas model as a test.
-However, you can use the --model_name flag to run the tf-learn/wide&deep
-cesnsu model:
+By default, local_test.sh runs the MNIST-with-replicas model as a test. However,
+you can use the --model_name flag to run the tf-learn/wide&deep census model:
 
     ./local_test.sh --model_name CENSUS_WIDENDEEP
 
diff --git a/tensorflow/tools/dist_test/scripts_allreduce/k8s_generate_yaml_lib.py b/tensorflow/tools/dist_test/scripts_allreduce/k8s_generate_yaml_lib.py
index c570d1a9f834bd9df57df62088a0c4562be9512c..038a712d538fbaeb8d0d176287704993cff07799 100644
--- a/tensorflow/tools/dist_test/scripts_allreduce/k8s_generate_yaml_lib.py
+++ b/tensorflow/tools/dist_test/scripts_allreduce/k8s_generate_yaml_lib.py
@@ -195,7 +195,7 @@ def generate_RSA(bits=2048, exponent=65537):
 
 def get_change_ssh_port(use_hostnet, port):
   if use_hostnet == 1:
-    return "sed -i '/Port 22/c\Port {}' /etc/ssh/sshd_config".format(port)
+    return r"sed -i '/Port 22/c\Port {}' /etc/ssh/sshd_config".format(port)
 
   return ''
 
diff --git a/tensorflow/tools/dist_test/server/BUILD b/tensorflow/tools/dist_test/server/BUILD
index 3aa53a5615db27fd5d3c32bbbbee68ccc7dc4f2f..56810ae15778cb5394a67fd9f13c5d13e03d3635 100644
--- a/tensorflow/tools/dist_test/server/BUILD
+++ b/tensorflow/tools/dist_test/server/BUILD
@@ -12,6 +12,14 @@ load("//tensorflow:tensorflow.bzl", "py_binary")
 
 py_binary(
     name = "grpc_tensorflow_server",
+    srcs = ["grpc_tensorflow_server.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [":grpc_tensorflow_server_lib"],
+)
+
+py_library(
+    name = "grpc_tensorflow_server_lib",
     srcs = [
         "grpc_tensorflow_server.py",
     ],
@@ -33,7 +41,7 @@ py_test(
     main = "parse_cluster_spec_test.py",
     srcs_version = "PY2AND3",
     deps = [
-        ":grpc_tensorflow_server",
+        ":grpc_tensorflow_server_lib",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client_testlib",
     ],
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index c256dd364ef5a29ba7f8a2afa6e772ee9c566cb8..9ea29c0e201e9cb1630e7bb682d1d7694665decd 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -65,7 +65,7 @@ RUN echo "startup --batch" >>/etc/bazel.bazelrc
 RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \
     >>/etc/bazel.bazelrc
 # Install the most recent bazel release.
-ENV BAZEL_VERSION 0.15.0
+ENV BAZEL_VERSION 0.20.0
 WORKDIR /
 RUN mkdir /bazel && \
     cd /bazel && \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index 7f9b55b45595bc74b51e14883d1fd1dc19b9099c..e085ee7170c83729cb103811d5e2ba45e3d8cb96 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -15,8 +15,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         git \
         libcudnn7=7.2.1.38-1+cuda9.0 \
         libcudnn7-dev=7.2.1.38-1+cuda9.0 \
-        libnccl2=2.2.13-1+cuda9.0 \
-        libnccl-dev=2.2.13-1+cuda9.0 \
         libcurl3-dev \
         libfreetype6-dev \
         libhdf5-serial-dev \
@@ -41,11 +39,6 @@ RUN apt-get update && \
         apt-get install libnvinfer4=4.1.2-1+cuda9.0 && \
         apt-get install libnvinfer-dev=4.1.2-1+cuda9.0
 
-# Link NCCL libray and header where the build script expects them.
-RUN mkdir /usr/local/cuda-9.0/lib &&  \
-    ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/local/cuda/lib/libnccl.so.2 && \
-    ln -s /usr/include/nccl.h /usr/local/cuda/include/nccl.h
-
 RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \
     python get-pip.py && \
     rm get-pip.py
@@ -87,7 +80,7 @@ RUN echo "startup --batch" >>/etc/bazel.bazelrc
 RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \
     >>/etc/bazel.bazelrc
 # Install the most recent bazel release.
-ENV BAZEL_VERSION 0.15.0
+ENV BAZEL_VERSION 0.20.0
 WORKDIR /
 RUN mkdir /bazel && \
     cd /bazel && \
@@ -111,9 +104,6 @@ ENV TF_CUDA_COMPUTE_CAPABILITIES=3.5,5.2,6.0,6.1,7.0
 ENV TF_CUDA_VERSION=9.0
 ENV TF_CUDNN_VERSION=7
 
-# NCCL 2.x
-ENV TF_NCCL_VERSION=2
-
 RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 && \
     LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:${LD_LIBRARY_PATH} \
     tensorflow/tools/ci_build/builds/configured GPU \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-mkl b/tensorflow/tools/docker/Dockerfile.devel-mkl
index 2341c0e8ccfc5f88356ed38f33cca356c207214f..32aa00bdffb53d8491ce531dee29d0b14ffffab9 100755
--- a/tensorflow/tools/docker/Dockerfile.devel-mkl
+++ b/tensorflow/tools/docker/Dockerfile.devel-mkl
@@ -3,13 +3,18 @@ FROM ubuntu:18.04
 LABEL maintainer="Clayne Robison <clayne.b.robison@intel.com>"
 
 # These parameters can be overridden by parameterized_docker_build.sh
-ARG TF_BUILD_VERSION=r1.12
+ARG TF_BUILD_VERSION=r1.13
 ARG PYTHON="python"
 ARG PYTHON3_DEV=""
 ARG WHL_DIR="/tmp/pip"
 ARG PIP="pip"
 
-RUN apt-get update && apt-get install -y --no-install-recommends \
+RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
+        ${PYTHON} \
+        ${PYTHON}-dev \
+        ${PYTHON}-pip \
+        ${PYTHON}-setuptools \
+        ${PYTHON}-wheel \
         build-essential \
         curl \
         git \
@@ -17,35 +22,20 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libfreetype6-dev \
         libhdf5-serial-dev \
         libpng-dev \
-        libzmq3-dev \
         libssl-dev \
+        libzmq3-dev \
+        openjdk-8-jdk \
+        openjdk-8-jre-headless \
         pkg-config \
         rsync \
         software-properties-common \
         unzip \
         zip \
         zlib1g-dev \
-        openjdk-8-jdk \
-        openjdk-8-jre-headless
-
-#install Python 3
-RUN if [ ${PYTHON} = "python3.6" ]; then \
-      curl https://www.python.org/ftp/python/3.6.5/Python-3.6.5.tar.xz -o /opt/python.tar.xz && \
-      cd /opt && tar xvf python.tar.xz && \
-      cd /opt/*/ && ./configure && \
-      make && make install; \
-    else \
-      apt-get install -y --no-install-recommends \
-        python-dev \
-        ${PYTHON3_DEV}; \
-    fi
-
-RUN    apt-get clean && \
+        && \
+    apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
-RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \
-    ${PYTHON} get-pip.py && \
-    rm get-pip.py
 
 RUN ${PIP} --no-cache-dir install \
         Pillow \
@@ -57,17 +47,12 @@ RUN ${PIP} --no-cache-dir install \
         matplotlib \
         mock \
         numpy \
+        pandas \
         scipy \
         sklearn \
-        pandas \
         && \
     ${PYTHON} -m ipykernel.kernelspec
 
-RUN if [ "${PYTHON}" = "python3" ]; then \
-      ln -s -f /usr/bin/python3 /usr/bin/python; \
-  elif [ "${PYTHON}" = "python3.6" ]; then \
-      ln -s -f /usr/local/bin/python3.6 /usr/bin/python; \
-  fi
 
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
@@ -88,7 +73,7 @@ RUN echo "startup --batch" >>/etc/bazel.bazelrc
 RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \
     >>/etc/bazel.bazelrc
 # Install the most recent bazel release.
-ENV BAZEL_VERSION 0.15.0
+ENV BAZEL_VERSION 0.20.0
 WORKDIR /
 RUN mkdir /bazel && \
     cd /bazel && \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-mkl-horovod b/tensorflow/tools/docker/Dockerfile.devel-mkl-horovod
index 5e24617b2190f1d564d63f4c9be6321aa03cd8fb..21140918aa9ddf752ee8b24fd9a21f19ccace506 100755
--- a/tensorflow/tools/docker/Dockerfile.devel-mkl-horovod
+++ b/tensorflow/tools/docker/Dockerfile.devel-mkl-horovod
@@ -3,42 +3,43 @@ FROM ubuntu:18.04
 LABEL maintainer="Cong Xu <cong.xu@intel.com>"
 
 # These parameters can be overridden by parameterized_docker_build.sh
-ARG TF_BUILD_VERSION=r1.11
+ARG TF_BUILD_VERSION=r1.13
 ARG PYTHON="python"
 ARG PYTHON3_DEV=""
 ARG WHL_DIR="/tmp/pip"
 ARG PIP="pip"
 
-RUN apt-get update && apt-get install -y --no-install-recommends \
+
+RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
+        ${PYTHON} \
+        ${PYTHON}-dev \
+        ${PYTHON}-pip \
+        ${PYTHON}-setuptools \
+        ${PYTHON}-wheel \
         build-essential \
         curl \
         git \
         libcurl3-dev \
         libfreetype6-dev \
         libhdf5-serial-dev \
+        libnuma-dev \
         libpng-dev \
         libzmq3-dev \
+        openjdk-8-jdk \
+        openjdk-8-jre-headless \
+        openssh-client \
+        openssh-server \
         pkg-config \
-        python-dev \
-        ${PYTHON3_DEV} \
         rsync \
         software-properties-common \
         unzip \
+        wget \
         zip \
         zlib1g-dev \
-        openjdk-8-jdk \
-        openjdk-8-jre-headless \
-        wget \
-        libnuma-dev \
-        openssh-client \
-        openssh-server \
         && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
-RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \
-    ${PYTHON} get-pip.py && \
-    rm get-pip.py
 
 RUN ${PIP} --no-cache-dir install \
         Pillow \
@@ -56,9 +57,6 @@ RUN ${PIP} --no-cache-dir install \
         && \
     ${PYTHON} -m ipykernel.kernelspec
 
-RUN if [ "${PYTHON}" = "python3" ]; then \
-  ln -s -f /usr/bin/python3 /usr/bin/python; \
-  fi
 
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
@@ -79,7 +77,7 @@ RUN echo "startup --batch" >>/etc/bazel.bazelrc
 RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \
     >>/etc/bazel.bazelrc
 # Install the most recent bazel release.
-ENV BAZEL_VERSION 0.15.0
+ENV BAZEL_VERSION 0.20.0
 WORKDIR /
 RUN mkdir /bazel && \
     cd /bazel && \
diff --git a/tensorflow/tools/docker/Dockerfile.mkl b/tensorflow/tools/docker/Dockerfile.mkl
index dad27697fa142ac80d7237510b8b7d7ebda2b621..3f7729ba59d88d6eefb939a397558f7c0bbcd79b 100755
--- a/tensorflow/tools/docker/Dockerfile.mkl
+++ b/tensorflow/tools/docker/Dockerfile.mkl
@@ -6,13 +6,18 @@ LABEL maintainer="Clayne Robison <clayne.b.robison@intel.com>"
 ARG TF_WHL_URL
 
 # Optional parameters
-ARG TF_BUILD_VERSION=r1.9
+ARG TF_BUILD_VERSION=r1.13
 ARG PYTHON="python"
 ARG PYTHON_DEV="python-dev"
 ARG PIP="pip"
 
 # Pick up some TF dependencies
-RUN apt-get update && apt-get install -y --no-install-recommends \
+RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
+        ${PYTHON} \
+        ${PYTHON}-dev \
+        ${PYTHON}-pip \
+        ${PYTHON}-setuptools \
+        ${PYTHON}-wheel \
         build-essential \
         curl \
         libfreetype6-dev \
@@ -20,8 +25,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libpng-dev \
         libzmq3-dev \
         pkg-config \
-        ${PYTHON} \
-        ${PYTHON_DEV} \
         rsync \
         software-properties-common \
         unzip \
@@ -29,9 +32,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
-RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
-    ${PYTHON} get-pip.py && \
-    rm get-pip.py
 
 RUN ${PIP} --no-cache-dir install \
         Pillow \
@@ -48,13 +48,11 @@ RUN ${PIP} --no-cache-dir install \
         && \
     ${PYTHON} -m ipykernel.kernelspec
 
+
 COPY ${TF_WHL_URL} /
 RUN ${PIP} install --no-cache-dir --force-reinstall /${TF_WHL_URL} && \
     rm -rf /${TF_WHL_URL}
 
-RUN if [ "${PYTHON}" = "python3" ]; then \
-  ln -s -f /usr/bin/python3 /usr/bin/python; \
-  fi
 
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
diff --git a/tensorflow/tools/docker/Dockerfile.mkl-horovod b/tensorflow/tools/docker/Dockerfile.mkl-horovod
index 19dc45c62cbc79bf931d89f275b5a7816e9924c8..b0afd637279f7016060559dd678d75cab2451300 100755
--- a/tensorflow/tools/docker/Dockerfile.mkl-horovod
+++ b/tensorflow/tools/docker/Dockerfile.mkl-horovod
@@ -6,36 +6,36 @@ LABEL maintainer="Cong Xu <cong.xu@intel.com>"
 ARG TF_WHL_URL
 
 # Optional parameters
-ARG TF_BUILD_VERSION=r1.11
+ARG TF_BUILD_VERSION=r1.13
 ARG PYTHON="python"
 ARG PYTHON_DEV="python-dev"
 ARG PIP="pip"
 
 # Pick up some TF dependencies
-RUN apt-get update && apt-get install -y --no-install-recommends \
+# RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
+        ${PYTHON} \
+        ${PYTHON}-dev \
+        ${PYTHON}-pip \
+        ${PYTHON}-setuptools \
+        ${PYTHON}-wheel \
         build-essential \
         curl \
         libfreetype6-dev \
         libhdf5-serial-dev \
+        libnuma-dev \
         libpng-dev \
         libzmq3-dev \
+        openssh-client \
+        openssh-server \
         pkg-config \
-        python \
-        ${PYTHON_DEV} \
         rsync \
         software-properties-common \
         unzip \
         wget \
-        libnuma-dev \
-        openssh-client \
-        openssh-server \
         && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
-RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
-    python get-pip.py && \
-    rm get-pip.py
 
 RUN ${PIP} --no-cache-dir install \
         Pillow \
@@ -50,15 +50,13 @@ RUN ${PIP} --no-cache-dir install \
         scipy \
         sklearn \
         && \
-    python -m ipykernel.kernelspec
+    ${PYTHON} -m ipykernel.kernelspec
+
 
 COPY ${TF_WHL_URL} /
 RUN ${PIP} install --no-cache-dir --force-reinstall /${TF_WHL_URL} && \
     rm -rf /${TF_WHL_URL}
 
-RUN if [ "${PYTHON}" = "python3" ]; then \
-  ln -s -f /usr/bin/python3 /usr/bin/python; \
-  fi
 
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
diff --git a/tensorflow/tools/dockerfiles/README.md b/tensorflow/tools/dockerfiles/README.md
index 07bfd5960e686d1198548c080df9c733955a2903..50b0cc5870ecccc216336fe5cdcbdd1a2a0e94b0 100644
--- a/tensorflow/tools/dockerfiles/README.md
+++ b/tensorflow/tools/dockerfiles/README.md
@@ -29,12 +29,13 @@ in the Dockerfile itself.
 After building the image with the tag `tf` (for example), use `docker run` to
 run the images.
 
-Note for new Docker users: the `-v` and `-u` flags share directories between
-the Docker container and your machine, and very important. Without
-`-v`, your work will be wiped once the container quits, and without `-u`, files
-created by the container will have the wrong file permissions on your host
-machine. If you are confused, check out the [Docker run
-documentation](https://docs.docker.com/engine/reference/run/).
+Note for new Docker users: the `-v` and `-u` flags share directories and
+permissions between the Docker container and your machine. Without `-v`, your
+work will be wiped once the container quits, and without `-u`, files created by
+the container will have the wrong file permissions on your host machine. Check
+out the
+[Docker run documentation](https://docs.docker.com/engine/reference/run/) for
+more info.
 
 ```bash
 # Volume mount (-v) is optional but highly recommended, especially for Jupyter.
@@ -83,15 +84,21 @@ $ alias asm_images="docker run --rm -v $(pwd):/tf -v /var/run/docker.sock:/var/r
 # If you're REBUILDING OR ADDING DOCKERFILES, remove docker.sock and add -u:
 $ alias asm_dockerfiles="docker run --rm -u $(id -u):$(id -g) -v $(pwd):/tf tf-tools python3 assembler.py "
 
-# Check flags
+# Check assembler flags
 $ asm_dockerfiles --help
 
 # Assemble all of the Dockerfiles
-$ asm_dockerfiles --release ubuntu-dockerfiles --construct_dockerfiles
+$ asm_dockerfiles --release dockerfiles --construct_dockerfiles
 
 # Build all of the "nightly" images on your local machine:
 $ asm_images --release nightly --build_images
 
+# Save the list of built images to a file:
+$ asm_images --release nightly --build_images > tf-built.txt
+
 # Build version release for version 99.0, except "gpu" tags:
-$ asm_images --release versioned --arg _TAG_PREFIX=99.0 --build_images --exclude_tags_matching '*.gpu.*'
+$ asm_images --release versioned --arg _TAG_PREFIX=99.0 --build_images --exclude_tags_matching '.*gpu.*'
+
+# Test your changes to the devel images:
+$ asm_images --release nightly --build_images --run_tests_path=$(realpath tests) --only_tags_matching="^devel-gpu-py3$"
 ```
diff --git a/tensorflow/tools/dockerfiles/assembler.py b/tensorflow/tools/dockerfiles/assembler.py
index 67a0320241d273bbb7a2439b2e09723905db0765..83b72cb5bb8d9686efe37f11357fc610902ebcb9 100644
--- a/tensorflow/tools/dockerfiles/assembler.py
+++ b/tensorflow/tools/dockerfiles/assembler.py
@@ -18,6 +18,9 @@
 - Builds images (and optionally runs image tests)
 - Pushes images to Docker Hub (provided with credentials)
 
+Logs are written to stderr; the list of successfully built images is
+written to stdout.
+
 Read README.md (in this directory) for instructions!
 """
 
@@ -31,6 +34,7 @@ import errno
 import itertools
 import multiprocessing
 import os
+import platform
 import re
 import shutil
 import sys
@@ -49,7 +53,7 @@ flags.DEFINE_string('hub_username', None,
 flags.DEFINE_string(
     'hub_password', None,
     ('Dockerhub password, only used with --upload_to_hub. Use from an env param'
-     'so your password isn\'t in your history.'))
+     ' so your password isn\'t in your history.'))
 
 flags.DEFINE_integer('hub_timeout', 3600,
                      'Abort Hub upload if it takes longer than this.')
@@ -142,6 +146,10 @@ flags.DEFINE_multi_string(
      'args will print a warning).'),
     short_name='a')
 
+flags.DEFINE_boolean(
+    'nocache', False,
+    'Disable the Docker build cache; identical to "docker build --no-cache"')
+
 flags.DEFINE_string(
     'spec_file',
     './spec.yml',
@@ -513,6 +521,7 @@ def main(argv):
   # Each tag has a name ('tag') and a definition consisting of the contents
   # of its Dockerfile, its build arg list, etc.
   failed_tags = []
+  succeeded_tags = []
   for tag, tag_defs in all_tags.items():
     for tag_def in tag_defs:
       eprint('> Working on {}'.format(tag))
@@ -544,6 +553,13 @@ def main(argv):
       if not FLAGS.build_images:
         continue
 
+      # Only build images for host architecture
+      proc_arch = platform.processor()
+      is_x86 = proc_arch.startswith('x86')
+      if (is_x86 and any([arch in tag for arch in ['ppc64le']]) or
+          not is_x86 and proc_arch not in tag):
+        continue
+
       # Generate a temporary Dockerfile to use to build, since docker-py
       # needs a filepath relative to the build context (i.e. the current
       # directory)
@@ -569,6 +585,7 @@ def main(argv):
           image, logs = dock.images.build(
               timeout=FLAGS.hub_timeout,
               path='.',
+              nocache=FLAGS.nocache,
               dockerfile=dockerfile,
               buildargs=tag_def['cli_args'],
               tag=repo_tag)
@@ -656,12 +673,20 @@ def main(argv):
               args=(FLAGS.hub_repository, dock, image, tag))
           p.start()
 
+      if not tag_failed:
+        succeeded_tags.append(tag)
+
   if failed_tags:
     eprint(
         '> Some tags failed to build or failed testing, check scrollback for '
         'errors: {}'.format(','.join(failed_tags)))
     exit(1)
 
+  eprint('> Writing built{} tags to standard out.'.format(
+      ' and tested' if FLAGS.run_tests_path else ''))
+  for tag in succeeded_tags:
+    print('{}:{}'.format(FLAGS.repository, tag))
+
 
 if __name__ == '__main__':
   app.run(main)
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile
index d8fabadec280cc136bd6cc9a30e79390a9a167bd..c806fa4eacd38d6676333153e45631ed5f96ff42 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile
@@ -47,13 +47,18 @@ RUN ln -s $(which ${PYTHON}) /usr/local/bin/python
 #   tensorflow-gpu
 #   tf-nightly
 #   tf-nightly-gpu
+# Set --build-arg TF_PACKAGE_VERSION=1.11.0rc0 to install a specific version.
+# Installs the latest version by default.
 ARG TF_PACKAGE=tensorflow
-RUN ${PIP} install ${TF_PACKAGE}
+ARG TF_PACKAGE_VERSION=
+RUN ${PIP} install ${TF_PACKAGE}${TF_PACKAGE_VERSION:+==${TF_PACKAGE_VERSION}}
 
 COPY bashrc /etc/bash.bashrc
 RUN chmod a+rwx /etc/bash.bashrc
 
 RUN ${PIP} install jupyter matplotlib
+RUN ${PIP} install jupyter_http_over_ws
+RUN jupyter serverextension enable --py jupyter_http_over_ws
 
 RUN mkdir -p /tf/tensorflow-tutorials && chmod -R a+rwx /tf/
 RUN mkdir /.local && chmod a+rwx /.local
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/cpu.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/cpu.Dockerfile
index 857b5e20471a82bd162e55b146854d0a5c165db8..a82577b53be4045171b685bbe4076ffdad4d3824 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/cpu.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/cpu.Dockerfile
@@ -47,8 +47,11 @@ RUN ln -s $(which ${PYTHON}) /usr/local/bin/python
 #   tensorflow-gpu
 #   tf-nightly
 #   tf-nightly-gpu
+# Set --build-arg TF_PACKAGE_VERSION=1.11.0rc0 to install a specific version.
+# Installs the latest version by default.
 ARG TF_PACKAGE=tensorflow
-RUN ${PIP} install ${TF_PACKAGE}
+ARG TF_PACKAGE_VERSION=
+RUN ${PIP} install ${TF_PACKAGE}${TF_PACKAGE_VERSION:+==${TF_PACKAGE_VERSION}}
 
 COPY bashrc /etc/bash.bashrc
 RUN chmod a+rwx /etc/bash.bashrc
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu-jupyter.Dockerfile
index 43265676f8b7ab19dc14f2c1475de1af67054c6a..dc5b5d49b90e57e8a6acec1e8cd883d62a40dfba 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu-jupyter.Dockerfile
@@ -30,7 +30,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libcurl3-dev \
         libfreetype6-dev \
         libhdf5-serial-dev \
-        libpng12-dev \
         libzmq3-dev \
         pkg-config \
         rsync \
@@ -43,12 +42,14 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
- 
+
 ENV CI_BUILD_PYTHON python
 
-# Check out TensorFlow source code if --build_arg CHECKOUT_TENSORFLOW=1
+# CACHE_STOP is used to rerun future commands, otherwise cloning tensorflow will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
+# Check out TensorFlow source code if --build-arg CHECKOUT_TF_SRC=1
 ARG CHECKOUT_TF_SRC=0
-RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src || true
 
 ARG USE_PYTHON_3_NOT_2
 ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
@@ -73,6 +74,7 @@ RUN apt-get update && apt-get install -y \
     build-essential \
     curl \
     git \
+    wget \
     openjdk-8-jdk \
     ${PYTHON}-dev \
     swig
@@ -92,15 +94,20 @@ RUN ${PIP} --no-cache-dir install \
     enum34
 
 # Install bazel
-RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \
-    curl https://bazel.build/bazel-release.pub.gpg | apt-key add - && \
-    apt-get update && \
-    apt-get install -y bazel
+ARG BAZEL_VERSION=0.19.2
+RUN mkdir /bazel && \
+    wget -O /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
+    wget -O /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
+    chmod +x /bazel/installer.sh && \
+    /bazel/installer.sh && \
+    rm -f /bazel/installer.sh
 
 COPY bashrc /etc/bash.bashrc
 RUN chmod a+rwx /etc/bash.bashrc
 
 RUN ${PIP} install jupyter matplotlib
+RUN ${PIP} install jupyter_http_over_ws
+RUN jupyter serverextension enable --py jupyter_http_over_ws
 
 RUN mkdir -p /tf/tensorflow-tutorials && chmod -R a+rwx /tf/
 RUN mkdir /.local && chmod a+rwx /.local
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu.Dockerfile
index 5c5b2f91634ff43fb2a047c66a856ac787858a47..da813970a83e289ec7f9237c9b050fd37d7c1e55 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu.Dockerfile
@@ -30,7 +30,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libcurl3-dev \
         libfreetype6-dev \
         libhdf5-serial-dev \
-        libpng12-dev \
         libzmq3-dev \
         pkg-config \
         rsync \
@@ -43,12 +42,14 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
- 
+
 ENV CI_BUILD_PYTHON python
 
-# Check out TensorFlow source code if --build_arg CHECKOUT_TENSORFLOW=1
+# CACHE_STOP is used to rerun future commands, otherwise cloning tensorflow will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
+# Check out TensorFlow source code if --build-arg CHECKOUT_TF_SRC=1
 ARG CHECKOUT_TF_SRC=0
-RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src || true
 
 ARG USE_PYTHON_3_NOT_2
 ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
@@ -73,6 +74,7 @@ RUN apt-get update && apt-get install -y \
     build-essential \
     curl \
     git \
+    wget \
     openjdk-8-jdk \
     ${PYTHON}-dev \
     swig
@@ -92,10 +94,13 @@ RUN ${PIP} --no-cache-dir install \
     enum34
 
 # Install bazel
-RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \
-    curl https://bazel.build/bazel-release.pub.gpg | apt-key add - && \
-    apt-get update && \
-    apt-get install -y bazel
+ARG BAZEL_VERSION=0.19.2
+RUN mkdir /bazel && \
+    wget -O /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
+    wget -O /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
+    chmod +x /bazel/installer.sh && \
+    /bazel/installer.sh && \
+    rm -f /bazel/installer.sh
 
 COPY bashrc /etc/bash.bashrc
 RUN chmod a+rwx /etc/bash.bashrc
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu-jupyter.Dockerfile
index 8769e4e9cd619a2c31e37ee838e45ea050e42712..24309e3ba06a417c7dfc1bf6ce4802576fb83115 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu-jupyter.Dockerfile
@@ -21,67 +21,66 @@
 
 ARG UBUNTU_VERSION=16.04
 
-FROM nvidia/cuda:9.0-base-ubuntu${UBUNTU_VERSION} as base
-
+ARG ARCH=
+ARG CUDA=10.0
+FROM nvidia/cuda${ARCH:+-$ARCH}:${CUDA}-base-ubuntu${UBUNTU_VERSION} as base
+# ARCH and CUDA are specified again because the FROM directive resets ARGs
+# (but their default value is retained if set previously)
+ARG ARCH
+ARG CUDA
+ARG CUDNN=7.4.1.5-1
+ARG CUDNN_MAJOR_VERSION=7
+ARG LIB_DIR_PREFIX=x86_64
+
+# Needed for string substitution 
+SHELL ["/bin/bash", "-c"]
 RUN apt-get update && apt-get install -y --no-install-recommends \
         build-essential \
-        cuda-command-line-tools-9-0 \
-        cuda-cublas-dev-9-0 \
-        cuda-cudart-dev-9-0 \
-        cuda-cufft-dev-9-0 \
-        cuda-curand-dev-9-0 \
-        cuda-cusolver-dev-9-0 \
-        cuda-cusparse-dev-9-0 \
-        curl \
-        git \
-        libcudnn7=7.2.1.38-1+cuda9.0 \
-        libcudnn7-dev=7.2.1.38-1+cuda9.0 \
-        libnccl2=2.2.13-1+cuda9.0 \
-        libnccl-dev=2.2.13-1+cuda9.0 \
+        cuda-command-line-tools-${CUDA/./-} \
+        cuda-cublas-dev-${CUDA/./-} \
+        cuda-cudart-dev-${CUDA/./-} \
+        cuda-cufft-dev-${CUDA/./-} \
+        cuda-curand-dev-${CUDA/./-} \
+        cuda-cusolver-dev-${CUDA/./-} \
+        cuda-cusparse-dev-${CUDA/./-} \
+        libcudnn7=${CUDNN}+cuda${CUDA} \
+        libcudnn7-dev=${CUDNN}+cuda${CUDA} \
         libcurl3-dev \
         libfreetype6-dev \
         libhdf5-serial-dev \
-        libpng12-dev \
         libzmq3-dev \
         pkg-config \
-        python-dev \
         rsync \
         software-properties-common \
         unzip \
         zip \
         zlib1g-dev \
         wget \
+        git \
         && \
-    rm -rf /var/lib/apt/lists/* && \
-    find /usr/local/cuda-9.0/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
-    rm /usr/lib/x86_64-linux-gnu/libcudnn_static_v7.a
+    find /usr/local/cuda-${CUDA}/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
+    rm /usr/lib/${LIB_DIR_PREFIX}-linux-gnu/libcudnn_static_v7.a
 
-RUN apt-get update && \
-        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 && \
-        apt-get update && \
-        apt-get install libnvinfer4=4.1.2-1+cuda9.0 && \
-        apt-get install libnvinfer-dev=4.1.2-1+cuda9.0
-
-# Link NCCL libray and header where the build script expects them.
-RUN mkdir /usr/local/cuda-9.0/lib &&  \
-    ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/local/cuda/lib/libnccl.so.2 && \
-    ln -s /usr/include/nccl.h /usr/local/cuda/include/nccl.h
+RUN [[ "${ARCH}" = "ppc64le" ]] || { apt-get update && \
+        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda${CUDA} \
+        && apt-get update \
+        && apt-get install -y --no-install-recommends libnvinfer5=5.0.2-1+cuda${CUDA} \
+        && apt-get clean \
+        && rm -rf /var/lib/apt/lists/*; }
 
 # Configure the build for our CUDA configuration.
 ENV CI_BUILD_PYTHON python
 ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
 ENV TF_NEED_CUDA 1
-ENV TF_NEED_TENSORRT 1
+ENV TF_NEED_TENSORRT 0
 ENV TF_CUDA_COMPUTE_CAPABILITIES=3.5,5.2,6.0,6.1,7.0
-ENV TF_CUDA_VERSION=9.0
-ENV TF_CUDNN_VERSION=7
-
-# NCCL 2.x
-ENV TF_NCCL_VERSION=2
-
-# Check out TensorFlow source code if --build_arg CHECKOUT_TENSORFLOW=1
+ENV TF_CUDA_VERSION=${CUDA}
+ENV TF_CUDNN_VERSION=${CUDNN_MAJOR_VERSION}
+# CACHE_STOP is used to rerun future commands, otherwise cloning tensorflow will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
+# Check out TensorFlow source code if --build-arg CHECKOUT_TF_SRC=1
 ARG CHECKOUT_TF_SRC=0
-RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src || true
 
 ARG USE_PYTHON_3_NOT_2
 ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
@@ -106,6 +105,7 @@ RUN apt-get update && apt-get install -y \
     build-essential \
     curl \
     git \
+    wget \
     openjdk-8-jdk \
     ${PYTHON}-dev \
     swig
@@ -125,15 +125,20 @@ RUN ${PIP} --no-cache-dir install \
     enum34
 
 # Install bazel
-RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \
-    curl https://bazel.build/bazel-release.pub.gpg | apt-key add - && \
-    apt-get update && \
-    apt-get install -y bazel
+ARG BAZEL_VERSION=0.19.2
+RUN mkdir /bazel && \
+    wget -O /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
+    wget -O /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
+    chmod +x /bazel/installer.sh && \
+    /bazel/installer.sh && \
+    rm -f /bazel/installer.sh
 
 COPY bashrc /etc/bash.bashrc
 RUN chmod a+rwx /etc/bash.bashrc
 
 RUN ${PIP} install jupyter matplotlib
+RUN ${PIP} install jupyter_http_over_ws
+RUN jupyter serverextension enable --py jupyter_http_over_ws
 
 RUN mkdir -p /tf/tensorflow-tutorials && chmod -R a+rwx /tf/
 RUN mkdir /.local && chmod a+rwx /.local
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu.Dockerfile
index 809cda679ea7e33b64e4b4180cfa1af2d05f8ff3..6bc4e32efb1681c4287758b5e2c70849321971b1 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu.Dockerfile
@@ -21,67 +21,66 @@
 
 ARG UBUNTU_VERSION=16.04
 
-FROM nvidia/cuda:9.0-base-ubuntu${UBUNTU_VERSION} as base
-
+ARG ARCH=
+ARG CUDA=10.0
+FROM nvidia/cuda${ARCH:+-$ARCH}:${CUDA}-base-ubuntu${UBUNTU_VERSION} as base
+# ARCH and CUDA are specified again because the FROM directive resets ARGs
+# (but their default value is retained if set previously)
+ARG ARCH
+ARG CUDA
+ARG CUDNN=7.4.1.5-1
+ARG CUDNN_MAJOR_VERSION=7
+ARG LIB_DIR_PREFIX=x86_64
+
+# Needed for string substitution 
+SHELL ["/bin/bash", "-c"]
 RUN apt-get update && apt-get install -y --no-install-recommends \
         build-essential \
-        cuda-command-line-tools-9-0 \
-        cuda-cublas-dev-9-0 \
-        cuda-cudart-dev-9-0 \
-        cuda-cufft-dev-9-0 \
-        cuda-curand-dev-9-0 \
-        cuda-cusolver-dev-9-0 \
-        cuda-cusparse-dev-9-0 \
-        curl \
-        git \
-        libcudnn7=7.2.1.38-1+cuda9.0 \
-        libcudnn7-dev=7.2.1.38-1+cuda9.0 \
-        libnccl2=2.2.13-1+cuda9.0 \
-        libnccl-dev=2.2.13-1+cuda9.0 \
+        cuda-command-line-tools-${CUDA/./-} \
+        cuda-cublas-dev-${CUDA/./-} \
+        cuda-cudart-dev-${CUDA/./-} \
+        cuda-cufft-dev-${CUDA/./-} \
+        cuda-curand-dev-${CUDA/./-} \
+        cuda-cusolver-dev-${CUDA/./-} \
+        cuda-cusparse-dev-${CUDA/./-} \
+        libcudnn7=${CUDNN}+cuda${CUDA} \
+        libcudnn7-dev=${CUDNN}+cuda${CUDA} \
         libcurl3-dev \
         libfreetype6-dev \
         libhdf5-serial-dev \
-        libpng12-dev \
         libzmq3-dev \
         pkg-config \
-        python-dev \
         rsync \
         software-properties-common \
         unzip \
         zip \
         zlib1g-dev \
         wget \
+        git \
         && \
-    rm -rf /var/lib/apt/lists/* && \
-    find /usr/local/cuda-9.0/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
-    rm /usr/lib/x86_64-linux-gnu/libcudnn_static_v7.a
+    find /usr/local/cuda-${CUDA}/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
+    rm /usr/lib/${LIB_DIR_PREFIX}-linux-gnu/libcudnn_static_v7.a
 
-RUN apt-get update && \
-        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 && \
-        apt-get update && \
-        apt-get install libnvinfer4=4.1.2-1+cuda9.0 && \
-        apt-get install libnvinfer-dev=4.1.2-1+cuda9.0
-
-# Link NCCL libray and header where the build script expects them.
-RUN mkdir /usr/local/cuda-9.0/lib &&  \
-    ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/local/cuda/lib/libnccl.so.2 && \
-    ln -s /usr/include/nccl.h /usr/local/cuda/include/nccl.h
+RUN [[ "${ARCH}" = "ppc64le" ]] || { apt-get update && \
+        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda${CUDA} \
+        && apt-get update \
+        && apt-get install -y --no-install-recommends libnvinfer5=5.0.2-1+cuda${CUDA} \
+        && apt-get clean \
+        && rm -rf /var/lib/apt/lists/*; }
 
 # Configure the build for our CUDA configuration.
 ENV CI_BUILD_PYTHON python
 ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
 ENV TF_NEED_CUDA 1
-ENV TF_NEED_TENSORRT 1
+ENV TF_NEED_TENSORRT 0
 ENV TF_CUDA_COMPUTE_CAPABILITIES=3.5,5.2,6.0,6.1,7.0
-ENV TF_CUDA_VERSION=9.0
-ENV TF_CUDNN_VERSION=7
-
-# NCCL 2.x
-ENV TF_NCCL_VERSION=2
-
-# Check out TensorFlow source code if --build_arg CHECKOUT_TENSORFLOW=1
+ENV TF_CUDA_VERSION=${CUDA}
+ENV TF_CUDNN_VERSION=${CUDNN_MAJOR_VERSION}
+# CACHE_STOP is used to rerun future commands, otherwise cloning tensorflow will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
+# Check out TensorFlow source code if --build-arg CHECKOUT_TF_SRC=1
 ARG CHECKOUT_TF_SRC=0
-RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src || true
 
 ARG USE_PYTHON_3_NOT_2
 ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
@@ -106,6 +105,7 @@ RUN apt-get update && apt-get install -y \
     build-essential \
     curl \
     git \
+    wget \
     openjdk-8-jdk \
     ${PYTHON}-dev \
     swig
@@ -125,10 +125,13 @@ RUN ${PIP} --no-cache-dir install \
     enum34
 
 # Install bazel
-RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \
-    curl https://bazel.build/bazel-release.pub.gpg | apt-key add - && \
-    apt-get update && \
-    apt-get install -y bazel
+ARG BAZEL_VERSION=0.19.2
+RUN mkdir /bazel && \
+    wget -O /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
+    wget -O /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
+    chmod +x /bazel/installer.sh && \
+    /bazel/installer.sh && \
+    rm -f /bazel/installer.sh
 
 COPY bashrc /etc/bash.bashrc
 RUN chmod a+rwx /etc/bash.bashrc
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/gpu-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/gpu-jupyter.Dockerfile
index acfe4d8607d56b6192926eb50ef9a3d58a07efe2..85a32fae1b10038956699db276e0c41973a77996 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/gpu-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/gpu-jupyter.Dockerfile
@@ -21,35 +21,41 @@
 
 ARG UBUNTU_VERSION=16.04
 
-FROM nvidia/cuda:9.0-base-ubuntu${UBUNTU_VERSION} as base
-
+ARG ARCH=
+ARG CUDA=10.0
+FROM nvidia/cuda${ARCH:+-$ARCH}:${CUDA}-base-ubuntu${UBUNTU_VERSION} as base
+# ARCH and CUDA are specified again because the FROM directive resets ARGs
+# (but their default value is retained if set previously)
+ARG ARCH
+ARG CUDA
+ARG CUDNN=7.4.1.5-1
+
+# Needed for string substitution 
+SHELL ["/bin/bash", "-c"]
+# Pick up some TF dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
         build-essential \
-        cuda-command-line-tools-9-0 \
-        cuda-cublas-9-0 \
-        cuda-cufft-9-0 \
-        cuda-curand-9-0 \
-        cuda-cusolver-9-0 \
-        cuda-cusparse-9-0 \
+        cuda-command-line-tools-${CUDA/./-} \
+        cuda-cublas-${CUDA/./-} \
+        cuda-cufft-${CUDA/./-} \
+        cuda-curand-${CUDA/./-} \
+        cuda-cusolver-${CUDA/./-} \
+        cuda-cusparse-${CUDA/./-} \
         curl \
-        libcudnn7=7.2.1.38-1+cuda9.0 \
-        libnccl2=2.2.13-1+cuda9.0 \
+        libcudnn7=${CUDNN}+cuda${CUDA} \
         libfreetype6-dev \
         libhdf5-serial-dev \
-        libpng12-dev \
         libzmq3-dev \
         pkg-config \
-        rsync \
         software-properties-common \
-        unzip \
-        && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
+        unzip
 
-RUN apt-get update && \
-        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 && \
-        apt-get update && \
-        apt-get install libnvinfer4=4.1.2-1+cuda9.0
+RUN [ ${ARCH} = ppc64le ] || (apt-get update && \
+        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda${CUDA} \
+        && apt-get update \
+        && apt-get install -y --no-install-recommends libnvinfer5=5.0.2-1+cuda${CUDA} \
+        && apt-get clean \
+        && rm -rf /var/lib/apt/lists/*)
 
 # For CUDA profiling, TensorFlow requires CUPTI.
 ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
@@ -78,13 +84,18 @@ RUN ln -s $(which ${PYTHON}) /usr/local/bin/python
 #   tensorflow-gpu
 #   tf-nightly
 #   tf-nightly-gpu
+# Set --build-arg TF_PACKAGE_VERSION=1.11.0rc0 to install a specific version.
+# Installs the latest version by default.
 ARG TF_PACKAGE=tensorflow
-RUN ${PIP} install ${TF_PACKAGE}
+ARG TF_PACKAGE_VERSION=
+RUN ${PIP} install ${TF_PACKAGE}${TF_PACKAGE_VERSION:+==${TF_PACKAGE_VERSION}}
 
 COPY bashrc /etc/bash.bashrc
 RUN chmod a+rwx /etc/bash.bashrc
 
 RUN ${PIP} install jupyter matplotlib
+RUN ${PIP} install jupyter_http_over_ws
+RUN jupyter serverextension enable --py jupyter_http_over_ws
 
 RUN mkdir -p /tf/tensorflow-tutorials && chmod -R a+rwx /tf/
 RUN mkdir /.local && chmod a+rwx /.local
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/gpu.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/gpu.Dockerfile
index f36a21eaf0cce02cf77db7c88358696c6f392cf4..c661341eaeeb5ec331055943397fa5245aea76fb 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/gpu.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/gpu.Dockerfile
@@ -21,35 +21,41 @@
 
 ARG UBUNTU_VERSION=16.04
 
-FROM nvidia/cuda:9.0-base-ubuntu${UBUNTU_VERSION} as base
+ARG ARCH=
+ARG CUDA=10.0
+FROM nvidia/cuda${ARCH:+-$ARCH}:${CUDA}-base-ubuntu${UBUNTU_VERSION} as base
+# ARCH and CUDA are specified again because the FROM directive resets ARGs
+# (but their default value is retained if set previously)
+ARG ARCH
+ARG CUDA
+ARG CUDNN=7.4.1.5-1
 
+# Needed for string substitution 
+SHELL ["/bin/bash", "-c"]
+# Pick up some TF dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
         build-essential \
-        cuda-command-line-tools-9-0 \
-        cuda-cublas-9-0 \
-        cuda-cufft-9-0 \
-        cuda-curand-9-0 \
-        cuda-cusolver-9-0 \
-        cuda-cusparse-9-0 \
+        cuda-command-line-tools-${CUDA/./-} \
+        cuda-cublas-${CUDA/./-} \
+        cuda-cufft-${CUDA/./-} \
+        cuda-curand-${CUDA/./-} \
+        cuda-cusolver-${CUDA/./-} \
+        cuda-cusparse-${CUDA/./-} \
         curl \
-        libcudnn7=7.2.1.38-1+cuda9.0 \
-        libnccl2=2.2.13-1+cuda9.0 \
+        libcudnn7=${CUDNN}+cuda${CUDA} \
         libfreetype6-dev \
         libhdf5-serial-dev \
-        libpng12-dev \
         libzmq3-dev \
         pkg-config \
-        rsync \
         software-properties-common \
-        unzip \
-        && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
+        unzip
 
-RUN apt-get update && \
-        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 && \
-        apt-get update && \
-        apt-get install libnvinfer4=4.1.2-1+cuda9.0
+RUN [ ${ARCH} = ppc64le ] || (apt-get update && \
+        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda${CUDA} \
+        && apt-get update \
+        && apt-get install -y --no-install-recommends libnvinfer5=5.0.2-1+cuda${CUDA} \
+        && apt-get clean \
+        && rm -rf /var/lib/apt/lists/*)
 
 # For CUDA profiling, TensorFlow requires CUPTI.
 ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
@@ -78,8 +84,11 @@ RUN ln -s $(which ${PYTHON}) /usr/local/bin/python
 #   tensorflow-gpu
 #   tf-nightly
 #   tf-nightly-gpu
+# Set --build-arg TF_PACKAGE_VERSION=1.11.0rc0 to install a specific version.
+# Installs the latest version by default.
 ARG TF_PACKAGE=tensorflow
-RUN ${PIP} install ${TF_PACKAGE}
+ARG TF_PACKAGE_VERSION=
+RUN ${PIP} install ${TF_PACKAGE}${TF_PACKAGE_VERSION:+==${TF_PACKAGE_VERSION}}
 
 COPY bashrc /etc/bash.bashrc
 RUN chmod a+rwx /etc/bash.bashrc
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/cpu-ppc64le-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/cpu-ppc64le-jupyter.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..63bf205a7ae0392a3ab30e142d9e6b100608fa86
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/cpu-ppc64le-jupyter.Dockerfile
@@ -0,0 +1,94 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+
+ARG UBUNTU_VERSION=16.04
+
+FROM ubuntu:${UBUNTU_VERSION} as base
+
+ARG USE_PYTHON_3_NOT_2
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+# See http://bugs.python.org/issue19846
+ENV LANG C.UTF-8
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+# Options:
+#   tensorflow
+#   tensorflow-gpu
+#   tf-nightly
+#   tf-nightly-gpu
+ARG TF_PACKAGE=tensorflow
+RUN apt-get update && apt-get install -y wget libhdf5-dev
+RUN ${PIP} install --global-option=build_ext \
+            --global-option=-I/usr/include/hdf5/serial/ \
+            --global-option=-L/usr/lib/powerpc64le-linux-gnu/hdf5/serial \
+            h5py
+
+# CACHE_STOP is used to rerun future commands, otherwise downloading the .whl will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
+RUN if [ ${TF_PACKAGE} = tensorflow-gpu ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tf-nightly-gpu ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tensorflow ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tf-nightly ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Nightly_Artifact/lastSuccessfulBuild/; \
+    fi; \
+    MAJOR=`${PYTHON} -c 'import sys; print(sys.version_info[0])'`; \
+    MINOR=`${PYTHON} -c 'import sys; print(sys.version_info[1])'`; \
+    PACKAGE=$(wget -qO- ${BASE}"api/xml?xpath=//fileName&wrapper=artifacts" | grep -o "[^<>]*cp${MAJOR}${MINOR}[^<>]*.whl"); \
+    wget ${BASE}"artifact/tensorflow_pkg/"${PACKAGE}; \
+    ${PIP} install ${PACKAGE}
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
+
+RUN ${PIP} install jupyter matplotlib
+RUN ${PIP} install jupyter_http_over_ws
+RUN jupyter serverextension enable --py jupyter_http_over_ws
+
+RUN mkdir -p /tf/tensorflow-tutorials && chmod -R a+rwx /tf/
+RUN mkdir /.local && chmod a+rwx /.local
+RUN apt-get install -y --no-install-recommends wget
+WORKDIR /tf/tensorflow-tutorials
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_classification.ipynb
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_text_classification.ipynb
+COPY readme-for-jupyter.md README.md
+RUN apt-get autoremove -y && apt-get remove -y wget
+WORKDIR /tf
+EXPOSE 8888
+
+RUN ${PYTHON} -m ipykernel.kernelspec
+
+CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/tf --ip 0.0.0.0 --no-browser --allow-root"]
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/cpu-ppc64le.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/cpu-ppc64le.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..083d61bf9a2adc69ac821841f628096c91af3524
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/cpu-ppc64le.Dockerfile
@@ -0,0 +1,75 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+
+ARG UBUNTU_VERSION=16.04
+
+FROM ubuntu:${UBUNTU_VERSION} as base
+
+ARG USE_PYTHON_3_NOT_2
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+# See http://bugs.python.org/issue19846
+ENV LANG C.UTF-8
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+# Options:
+#   tensorflow
+#   tensorflow-gpu
+#   tf-nightly
+#   tf-nightly-gpu
+ARG TF_PACKAGE=tensorflow
+RUN apt-get update && apt-get install -y wget libhdf5-dev
+RUN ${PIP} install --global-option=build_ext \
+            --global-option=-I/usr/include/hdf5/serial/ \
+            --global-option=-L/usr/lib/powerpc64le-linux-gnu/hdf5/serial \
+            h5py
+
+# CACHE_STOP is used to rerun future commands, otherwise downloading the .whl will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
+RUN if [ ${TF_PACKAGE} = tensorflow-gpu ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tf-nightly-gpu ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tensorflow ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tf-nightly ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Nightly_Artifact/lastSuccessfulBuild/; \
+    fi; \
+    MAJOR=`${PYTHON} -c 'import sys; print(sys.version_info[0])'`; \
+    MINOR=`${PYTHON} -c 'import sys; print(sys.version_info[1])'`; \
+    PACKAGE=$(wget -qO- ${BASE}"api/xml?xpath=//fileName&wrapper=artifacts" | grep -o "[^<>]*cp${MAJOR}${MINOR}[^<>]*.whl"); \
+    wget ${BASE}"artifact/tensorflow_pkg/"${PACKAGE}; \
+    ${PIP} install ${PACKAGE}
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le-jupyter.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..c8384f7af2020e142ec0a2c28418252e6028b083
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le-jupyter.Dockerfile
@@ -0,0 +1,127 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+
+ARG UBUNTU_VERSION=16.04
+
+FROM ubuntu:${UBUNTU_VERSION} AS base
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        curl \
+        git \
+        libcurl3-dev \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libzmq3-dev \
+        pkg-config \
+        rsync \
+        software-properties-common \
+        unzip \
+        zip \
+        zlib1g-dev \
+        openjdk-8-jdk \
+        openjdk-8-jre-headless \
+        && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+ENV CI_BUILD_PYTHON python
+
+# CACHE_STOP is used to rerun future commands, otherwise cloning tensorflow will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
+# Check out TensorFlow source code if --build-arg CHECKOUT_TF_SRC=1
+ARG CHECKOUT_TF_SRC=0
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src || true
+
+ARG USE_PYTHON_3_NOT_2
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+# See http://bugs.python.org/issue19846
+ENV LANG C.UTF-8
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    git \
+    openjdk-8-jdk \
+    ${PYTHON}-dev \
+    swig
+
+RUN ${PIP} --no-cache-dir install \
+    Pillow \
+    h5py \
+    keras_applications \
+    keras_preprocessing \
+    matplotlib \
+    mock \
+    numpy \
+    scipy \
+    sklearn \
+    pandas \
+    && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
+    enum34
+
+ # Build and install bazel
+ENV BAZEL_VERSION 0.15.0
+WORKDIR /
+RUN mkdir /bazel && \
+    cd /bazel && \
+    curl -fSsL -O https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-dist.zip && \
+    unzip bazel-$BAZEL_VERSION-dist.zip && \
+    bash ./compile.sh && \
+    cp output/bazel /usr/local/bin/ && \
+    rm -rf /bazel && \
+    cd -
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
+
+RUN ${PIP} install jupyter matplotlib
+RUN ${PIP} install jupyter_http_over_ws
+RUN jupyter serverextension enable --py jupyter_http_over_ws
+
+RUN mkdir -p /tf/tensorflow-tutorials && chmod -R a+rwx /tf/
+RUN mkdir /.local && chmod a+rwx /.local
+RUN apt-get install -y --no-install-recommends wget
+WORKDIR /tf/tensorflow-tutorials
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_classification.ipynb
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_text_classification.ipynb
+COPY readme-for-jupyter.md README.md
+RUN apt-get autoremove -y && apt-get remove -y wget
+WORKDIR /tf
+EXPOSE 8888
+
+RUN ${PYTHON} -m ipykernel.kernelspec
+
+CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/tf --ip 0.0.0.0 --no-browser --allow-root"]
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..08f880e7b530cf6dd0e38dd0ff1e46eecbda17b1
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le.Dockerfile
@@ -0,0 +1,108 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+
+ARG UBUNTU_VERSION=16.04
+
+FROM ubuntu:${UBUNTU_VERSION} AS base
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        curl \
+        git \
+        libcurl3-dev \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libzmq3-dev \
+        pkg-config \
+        rsync \
+        software-properties-common \
+        unzip \
+        zip \
+        zlib1g-dev \
+        openjdk-8-jdk \
+        openjdk-8-jre-headless \
+        && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+ENV CI_BUILD_PYTHON python
+
+# CACHE_STOP is used to rerun future commands, otherwise cloning tensorflow will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
+# Check out TensorFlow source code if --build-arg CHECKOUT_TF_SRC=1
+ARG CHECKOUT_TF_SRC=0
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src || true
+
+ARG USE_PYTHON_3_NOT_2
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+# See http://bugs.python.org/issue19846
+ENV LANG C.UTF-8
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    git \
+    openjdk-8-jdk \
+    ${PYTHON}-dev \
+    swig
+
+RUN ${PIP} --no-cache-dir install \
+    Pillow \
+    h5py \
+    keras_applications \
+    keras_preprocessing \
+    matplotlib \
+    mock \
+    numpy \
+    scipy \
+    sklearn \
+    pandas \
+    && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
+    enum34
+
+ # Build and install bazel
+ENV BAZEL_VERSION 0.15.0
+WORKDIR /
+RUN mkdir /bazel && \
+    cd /bazel && \
+    curl -fSsL -O https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-dist.zip && \
+    unzip bazel-$BAZEL_VERSION-dist.zip && \
+    bash ./compile.sh && \
+    cp output/bazel /usr/local/bin/ && \
+    rm -rf /bazel && \
+    cd -
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le-jupyter.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..c508a0f73f2dabb21c937b00c9653b33fbc108fe
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le-jupyter.Dockerfile
@@ -0,0 +1,158 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+
+ARG UBUNTU_VERSION=16.04
+
+ARG ARCH=
+ARG CUDA=10.0
+FROM nvidia/cuda${ARCH:+-$ARCH}:${CUDA}-base-ubuntu${UBUNTU_VERSION} as base
+# ARCH and CUDA are specified again because the FROM directive resets ARGs
+# (but their default value is retained if set previously)
+ARG ARCH
+ARG CUDA
+ARG CUDNN=7.4.1.5-1
+ARG CUDNN_MAJOR_VERSION=7
+ARG LIB_DIR_PREFIX=x86_64
+
+# Needed for string substitution 
+SHELL ["/bin/bash", "-c"]
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        cuda-command-line-tools-${CUDA/./-} \
+        cuda-cublas-dev-${CUDA/./-} \
+        cuda-cudart-dev-${CUDA/./-} \
+        cuda-cufft-dev-${CUDA/./-} \
+        cuda-curand-dev-${CUDA/./-} \
+        cuda-cusolver-dev-${CUDA/./-} \
+        cuda-cusparse-dev-${CUDA/./-} \
+        libcudnn7=${CUDNN}+cuda${CUDA} \
+        libcudnn7-dev=${CUDNN}+cuda${CUDA} \
+        libcurl3-dev \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libzmq3-dev \
+        pkg-config \
+        rsync \
+        software-properties-common \
+        unzip \
+        zip \
+        zlib1g-dev \
+        wget \
+        git \
+        && \
+    find /usr/local/cuda-${CUDA}/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
+    rm /usr/lib/${LIB_DIR_PREFIX}-linux-gnu/libcudnn_static_v7.a
+
+RUN [[ "${ARCH}" = "ppc64le" ]] || { apt-get update && \
+        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda${CUDA} \
+        && apt-get update \
+        && apt-get install -y --no-install-recommends libnvinfer5=5.0.2-1+cuda${CUDA} \
+        && apt-get clean \
+        && rm -rf /var/lib/apt/lists/*; }
+
+# Configure the build for our CUDA configuration.
+ENV CI_BUILD_PYTHON python
+ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
+ENV TF_NEED_CUDA 1
+ENV TF_NEED_TENSORRT 0
+ENV TF_CUDA_COMPUTE_CAPABILITIES=3.5,5.2,6.0,6.1,7.0
+ENV TF_CUDA_VERSION=${CUDA}
+ENV TF_CUDNN_VERSION=${CUDNN_MAJOR_VERSION}
+# CACHE_STOP is used to rerun future commands, otherwise cloning tensorflow will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
+# Check out TensorFlow source code if --build-arg CHECKOUT_TF_SRC=1
+ARG CHECKOUT_TF_SRC=0
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src || true
+
+ARG USE_PYTHON_3_NOT_2
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+# See http://bugs.python.org/issue19846
+ENV LANG C.UTF-8
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    git \
+    openjdk-8-jdk \
+    ${PYTHON}-dev \
+    swig
+
+RUN ${PIP} --no-cache-dir install \
+    Pillow \
+    h5py \
+    keras_applications \
+    keras_preprocessing \
+    matplotlib \
+    mock \
+    numpy \
+    scipy \
+    sklearn \
+    pandas \
+    && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
+    enum34
+
+ # Build and install bazel
+ENV BAZEL_VERSION 0.15.0
+WORKDIR /
+RUN mkdir /bazel && \
+    cd /bazel && \
+    curl -fSsL -O https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-dist.zip && \
+    unzip bazel-$BAZEL_VERSION-dist.zip && \
+    bash ./compile.sh && \
+    cp output/bazel /usr/local/bin/ && \
+    rm -rf /bazel && \
+    cd -
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
+
+RUN ${PIP} install jupyter matplotlib
+RUN ${PIP} install jupyter_http_over_ws
+RUN jupyter serverextension enable --py jupyter_http_over_ws
+
+RUN mkdir -p /tf/tensorflow-tutorials && chmod -R a+rwx /tf/
+RUN mkdir /.local && chmod a+rwx /.local
+RUN apt-get install -y --no-install-recommends wget
+WORKDIR /tf/tensorflow-tutorials
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_classification.ipynb
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_text_classification.ipynb
+COPY readme-for-jupyter.md README.md
+RUN apt-get autoremove -y && apt-get remove -y wget
+WORKDIR /tf
+EXPOSE 8888
+
+RUN ${PYTHON} -m ipykernel.kernelspec
+
+CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/tf --ip 0.0.0.0 --no-browser --allow-root"]
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..f910cb2e91b7f20d9b399d4c032b63d3b991fd72
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le.Dockerfile
@@ -0,0 +1,139 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+
+ARG UBUNTU_VERSION=16.04
+
+ARG ARCH=
+ARG CUDA=10.0
+FROM nvidia/cuda${ARCH:+-$ARCH}:${CUDA}-base-ubuntu${UBUNTU_VERSION} as base
+# ARCH and CUDA are specified again because the FROM directive resets ARGs
+# (but their default value is retained if set previously)
+ARG ARCH
+ARG CUDA
+ARG CUDNN=7.4.1.5-1
+ARG CUDNN_MAJOR_VERSION=7
+ARG LIB_DIR_PREFIX=x86_64
+
+# Needed for string substitution 
+SHELL ["/bin/bash", "-c"]
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        cuda-command-line-tools-${CUDA/./-} \
+        cuda-cublas-dev-${CUDA/./-} \
+        cuda-cudart-dev-${CUDA/./-} \
+        cuda-cufft-dev-${CUDA/./-} \
+        cuda-curand-dev-${CUDA/./-} \
+        cuda-cusolver-dev-${CUDA/./-} \
+        cuda-cusparse-dev-${CUDA/./-} \
+        libcudnn7=${CUDNN}+cuda${CUDA} \
+        libcudnn7-dev=${CUDNN}+cuda${CUDA} \
+        libcurl3-dev \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libzmq3-dev \
+        pkg-config \
+        rsync \
+        software-properties-common \
+        unzip \
+        zip \
+        zlib1g-dev \
+        wget \
+        git \
+        && \
+    find /usr/local/cuda-${CUDA}/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
+    rm /usr/lib/${LIB_DIR_PREFIX}-linux-gnu/libcudnn_static_v7.a
+
+RUN [[ "${ARCH}" = "ppc64le" ]] || { apt-get update && \
+        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda${CUDA} \
+        && apt-get update \
+        && apt-get install -y --no-install-recommends libnvinfer5=5.0.2-1+cuda${CUDA} \
+        && apt-get clean \
+        && rm -rf /var/lib/apt/lists/*; }
+
+# Configure the build for our CUDA configuration.
+ENV CI_BUILD_PYTHON python
+ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
+ENV TF_NEED_CUDA 1
+ENV TF_NEED_TENSORRT 0
+ENV TF_CUDA_COMPUTE_CAPABILITIES=3.5,5.2,6.0,6.1,7.0
+ENV TF_CUDA_VERSION=${CUDA}
+ENV TF_CUDNN_VERSION=${CUDNN_MAJOR_VERSION}
+# CACHE_STOP is used to rerun future commands, otherwise cloning tensorflow will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
+# Check out TensorFlow source code if --build-arg CHECKOUT_TF_SRC=1
+ARG CHECKOUT_TF_SRC=0
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src || true
+
+ARG USE_PYTHON_3_NOT_2
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+# See http://bugs.python.org/issue19846
+ENV LANG C.UTF-8
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    git \
+    openjdk-8-jdk \
+    ${PYTHON}-dev \
+    swig
+
+RUN ${PIP} --no-cache-dir install \
+    Pillow \
+    h5py \
+    keras_applications \
+    keras_preprocessing \
+    matplotlib \
+    mock \
+    numpy \
+    scipy \
+    sklearn \
+    pandas \
+    && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
+    enum34
+
+ # Build and install bazel
+ENV BAZEL_VERSION 0.15.0
+WORKDIR /
+RUN mkdir /bazel && \
+    cd /bazel && \
+    curl -fSsL -O https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-dist.zip && \
+    unzip bazel-$BAZEL_VERSION-dist.zip && \
+    bash ./compile.sh && \
+    cp output/bazel /usr/local/bin/ && \
+    rm -rf /bazel && \
+    cd -
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/gpu-ppc64le-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/gpu-ppc64le-jupyter.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..1e82ca282208373f57923fda2619b51a43a0d52c
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/gpu-ppc64le-jupyter.Dockerfile
@@ -0,0 +1,131 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+
+ARG UBUNTU_VERSION=16.04
+
+ARG ARCH=
+ARG CUDA=10.0
+FROM nvidia/cuda${ARCH:+-$ARCH}:${CUDA}-base-ubuntu${UBUNTU_VERSION} as base
+# ARCH and CUDA are specified again because the FROM directive resets ARGs
+# (but their default value is retained if set previously)
+ARG ARCH
+ARG CUDA
+ARG CUDNN=7.4.1.5-1
+
+# Needed for string substitution 
+SHELL ["/bin/bash", "-c"]
+# Pick up some TF dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        cuda-command-line-tools-${CUDA/./-} \
+        cuda-cublas-${CUDA/./-} \
+        cuda-cufft-${CUDA/./-} \
+        cuda-curand-${CUDA/./-} \
+        cuda-cusolver-${CUDA/./-} \
+        cuda-cusparse-${CUDA/./-} \
+        curl \
+        libcudnn7=${CUDNN}+cuda${CUDA} \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libzmq3-dev \
+        pkg-config \
+        software-properties-common \
+        unzip
+
+RUN [ ${ARCH} = ppc64le ] || (apt-get update && \
+        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda${CUDA} \
+        && apt-get update \
+        && apt-get install -y --no-install-recommends libnvinfer5=5.0.2-1+cuda${CUDA} \
+        && apt-get clean \
+        && rm -rf /var/lib/apt/lists/*)
+
+# For CUDA profiling, TensorFlow requires CUPTI.
+ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
+
+ARG USE_PYTHON_3_NOT_2
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+# See http://bugs.python.org/issue19846
+ENV LANG C.UTF-8
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+# Options:
+#   tensorflow
+#   tensorflow-gpu
+#   tf-nightly
+#   tf-nightly-gpu
+ARG TF_PACKAGE=tensorflow
+RUN apt-get update && apt-get install -y wget libhdf5-dev
+RUN ${PIP} install --global-option=build_ext \
+            --global-option=-I/usr/include/hdf5/serial/ \
+            --global-option=-L/usr/lib/powerpc64le-linux-gnu/hdf5/serial \
+            h5py
+
+# CACHE_STOP is used to rerun future commands, otherwise downloading the .whl will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
+RUN if [ ${TF_PACKAGE} = tensorflow-gpu ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tf-nightly-gpu ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tensorflow ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tf-nightly ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Nightly_Artifact/lastSuccessfulBuild/; \
+    fi; \
+    MAJOR=`${PYTHON} -c 'import sys; print(sys.version_info[0])'`; \
+    MINOR=`${PYTHON} -c 'import sys; print(sys.version_info[1])'`; \
+    PACKAGE=$(wget -qO- ${BASE}"api/xml?xpath=//fileName&wrapper=artifacts" | grep -o "[^<>]*cp${MAJOR}${MINOR}[^<>]*.whl"); \
+    wget ${BASE}"artifact/tensorflow_pkg/"${PACKAGE}; \
+    ${PIP} install ${PACKAGE}
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
+
+RUN ${PIP} install jupyter matplotlib
+RUN ${PIP} install jupyter_http_over_ws
+RUN jupyter serverextension enable --py jupyter_http_over_ws
+
+RUN mkdir -p /tf/tensorflow-tutorials && chmod -R a+rwx /tf/
+RUN mkdir /.local && chmod a+rwx /.local
+RUN apt-get install -y --no-install-recommends wget
+WORKDIR /tf/tensorflow-tutorials
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_classification.ipynb
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_text_classification.ipynb
+COPY readme-for-jupyter.md README.md
+RUN apt-get autoremove -y && apt-get remove -y wget
+WORKDIR /tf
+EXPOSE 8888
+
+RUN ${PYTHON} -m ipykernel.kernelspec
+
+CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/tf --ip 0.0.0.0 --no-browser --allow-root"]
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/gpu-ppc64le.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/gpu-ppc64le.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..e9572756e53d6c400948b2f82e792826c19c7543
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/gpu-ppc64le.Dockerfile
@@ -0,0 +1,112 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+
+ARG UBUNTU_VERSION=16.04
+
+ARG ARCH=
+ARG CUDA=10.0
+FROM nvidia/cuda${ARCH:+-$ARCH}:${CUDA}-base-ubuntu${UBUNTU_VERSION} as base
+# ARCH and CUDA are specified again because the FROM directive resets ARGs
+# (but their default value is retained if set previously)
+ARG ARCH
+ARG CUDA
+ARG CUDNN=7.4.1.5-1
+
+# Needed for string substitution 
+SHELL ["/bin/bash", "-c"]
+# Pick up some TF dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        cuda-command-line-tools-${CUDA/./-} \
+        cuda-cublas-${CUDA/./-} \
+        cuda-cufft-${CUDA/./-} \
+        cuda-curand-${CUDA/./-} \
+        cuda-cusolver-${CUDA/./-} \
+        cuda-cusparse-${CUDA/./-} \
+        curl \
+        libcudnn7=${CUDNN}+cuda${CUDA} \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libzmq3-dev \
+        pkg-config \
+        software-properties-common \
+        unzip
+
+RUN [ ${ARCH} = ppc64le ] || (apt-get update && \
+        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda${CUDA} \
+        && apt-get update \
+        && apt-get install -y --no-install-recommends libnvinfer5=5.0.2-1+cuda${CUDA} \
+        && apt-get clean \
+        && rm -rf /var/lib/apt/lists/*)
+
+# For CUDA profiling, TensorFlow requires CUPTI.
+ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
+
+ARG USE_PYTHON_3_NOT_2
+ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
+ARG PYTHON=python${_PY_SUFFIX}
+ARG PIP=pip${_PY_SUFFIX}
+
+# See http://bugs.python.org/issue19846
+ENV LANG C.UTF-8
+
+RUN apt-get update && apt-get install -y \
+    ${PYTHON} \
+    ${PYTHON}-pip
+
+RUN ${PIP} --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+# Options:
+#   tensorflow
+#   tensorflow-gpu
+#   tf-nightly
+#   tf-nightly-gpu
+ARG TF_PACKAGE=tensorflow
+RUN apt-get update && apt-get install -y wget libhdf5-dev
+RUN ${PIP} install --global-option=build_ext \
+            --global-option=-I/usr/include/hdf5/serial/ \
+            --global-option=-L/usr/lib/powerpc64le-linux-gnu/hdf5/serial \
+            h5py
+
+# CACHE_STOP is used to rerun future commands, otherwise downloading the .whl will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
+RUN if [ ${TF_PACKAGE} = tensorflow-gpu ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tf-nightly-gpu ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tensorflow ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tf-nightly ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Nightly_Artifact/lastSuccessfulBuild/; \
+    fi; \
+    MAJOR=`${PYTHON} -c 'import sys; print(sys.version_info[0])'`; \
+    MINOR=`${PYTHON} -c 'import sys; print(sys.version_info[1])'`; \
+    PACKAGE=$(wget -qO- ${BASE}"api/xml?xpath=//fileName&wrapper=artifacts" | grep -o "[^<>]*cp${MAJOR}${MINOR}[^<>]*.whl"); \
+    wget ${BASE}"artifact/tensorflow_pkg/"${PACKAGE}; \
+    ${PIP} install ${PACKAGE}
+
+COPY bashrc /etc/bash.bashrc
+RUN chmod a+rwx /etc/bash.bashrc
diff --git a/tensorflow/tools/dockerfiles/partials/jupyter.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/jupyter.partial.Dockerfile
index c4ec6095c0cae43b9d5756cd4391ca3ddd329fbe..c056d915d655965583f9f256297a538fbd51ba8c 100644
--- a/tensorflow/tools/dockerfiles/partials/jupyter.partial.Dockerfile
+++ b/tensorflow/tools/dockerfiles/partials/jupyter.partial.Dockerfile
@@ -1,4 +1,6 @@
 RUN ${PIP} install jupyter matplotlib
+RUN ${PIP} install jupyter_http_over_ws
+RUN jupyter serverextension enable --py jupyter_http_over_ws
 
 RUN mkdir -p /tf/tensorflow-tutorials && chmod -R a+rwx /tf/
 RUN mkdir /.local && chmod a+rwx /.local
diff --git a/tensorflow/tools/dockerfiles/partials/tensorflow-ppc64le.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/tensorflow-ppc64le.partial.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..1e79574a34de7e15bccc68136269962d375459a0
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/partials/tensorflow-ppc64le.partial.Dockerfile
@@ -0,0 +1,28 @@
+# Options:
+#   tensorflow
+#   tensorflow-gpu
+#   tf-nightly
+#   tf-nightly-gpu
+ARG TF_PACKAGE=tensorflow
+RUN apt-get update && apt-get install -y wget libhdf5-dev
+RUN ${PIP} install --global-option=build_ext \
+            --global-option=-I/usr/include/hdf5/serial/ \
+            --global-option=-L/usr/lib/powerpc64le-linux-gnu/hdf5/serial \
+            h5py
+
+# CACHE_STOP is used to rerun future commands, otherwise downloading the .whl will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
+RUN if [ ${TF_PACKAGE} = tensorflow-gpu ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tf-nightly-gpu ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tensorflow ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/lastSuccessfulBuild/; \
+    elif [ ${TF_PACKAGE} = tf-nightly ]; then \
+        BASE=https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Nightly_Artifact/lastSuccessfulBuild/; \
+    fi; \
+    MAJOR=`${PYTHON} -c 'import sys; print(sys.version_info[0])'`; \
+    MINOR=`${PYTHON} -c 'import sys; print(sys.version_info[1])'`; \
+    PACKAGE=$(wget -qO- ${BASE}"api/xml?xpath=//fileName&wrapper=artifacts" | grep -o "[^<>]*cp${MAJOR}${MINOR}[^<>]*.whl"); \
+    wget ${BASE}"artifact/tensorflow_pkg/"${PACKAGE}; \
+    ${PIP} install ${PACKAGE}
diff --git a/tensorflow/tools/dockerfiles/partials/tensorflow.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/tensorflow.partial.Dockerfile
index 76758bd147ef9d52b3db072bd0091190e132667c..2ae840687df4fa2419f92b73adc11dca5b3a9f7b 100644
--- a/tensorflow/tools/dockerfiles/partials/tensorflow.partial.Dockerfile
+++ b/tensorflow/tools/dockerfiles/partials/tensorflow.partial.Dockerfile
@@ -3,5 +3,8 @@
 #   tensorflow-gpu
 #   tf-nightly
 #   tf-nightly-gpu
+# Set --build-arg TF_PACKAGE_VERSION=1.11.0rc0 to install a specific version.
+# Installs the latest version by default.
 ARG TF_PACKAGE=tensorflow
-RUN ${PIP} install ${TF_PACKAGE}
+ARG TF_PACKAGE_VERSION=
+RUN ${PIP} install ${TF_PACKAGE}${TF_PACKAGE_VERSION:+==${TF_PACKAGE_VERSION}}
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/bazel.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/bazel.partial.Dockerfile
index 156bb019914554e650421fb23bcebc935658abdb..855a01c379b3c3b26f0bd50b3b3513cdf363f135 100644
--- a/tensorflow/tools/dockerfiles/partials/ubuntu/bazel.partial.Dockerfile
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu/bazel.partial.Dockerfile
@@ -2,6 +2,7 @@ RUN apt-get update && apt-get install -y \
     build-essential \
     curl \
     git \
+    wget \
     openjdk-8-jdk \
     ${PYTHON}-dev \
     swig
@@ -21,7 +22,10 @@ RUN ${PIP} --no-cache-dir install \
     enum34
 
 # Install bazel
-RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \
-    curl https://bazel.build/bazel-release.pub.gpg | apt-key add - && \
-    apt-get update && \
-    apt-get install -y bazel
+ARG BAZEL_VERSION=0.19.2
+RUN mkdir /bazel && \
+    wget -O /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
+    wget -O /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
+    chmod +x /bazel/installer.sh && \
+    /bazel/installer.sh && \
+    rm -f /bazel/installer.sh
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild.partial.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..0397ab5fa8569dc0274f9550cf4ecae65489c248
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild.partial.Dockerfile
@@ -0,0 +1,33 @@
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    git \
+    openjdk-8-jdk \
+    ${PYTHON}-dev \
+    swig
+
+RUN ${PIP} --no-cache-dir install \
+    Pillow \
+    h5py \
+    keras_applications \
+    keras_preprocessing \
+    matplotlib \
+    mock \
+    numpy \
+    scipy \
+    sklearn \
+    pandas \
+    && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
+    enum34
+
+ # Build and install bazel
+ENV BAZEL_VERSION 0.15.0
+WORKDIR /
+RUN mkdir /bazel && \
+    cd /bazel && \
+    curl -fSsL -O https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-dist.zip && \
+    unzip bazel-$BAZEL_VERSION-dist.zip && \
+    bash ./compile.sh && \
+    cp output/bazel /usr/local/bin/ && \
+    rm -rf /bazel && \
+    cd -
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/devel-cpu.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/devel-cpu.partial.Dockerfile
index a61dfbbe54eb163b25160490f3ee245c36d21ffe..a1fd901b343bd80bde2061e29ee7f3abbf7e762d 100644
--- a/tensorflow/tools/dockerfiles/partials/ubuntu/devel-cpu.partial.Dockerfile
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu/devel-cpu.partial.Dockerfile
@@ -7,7 +7,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libcurl3-dev \
         libfreetype6-dev \
         libhdf5-serial-dev \
-        libpng12-dev \
         libzmq3-dev \
         pkg-config \
         rsync \
@@ -20,9 +19,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
- 
+
 ENV CI_BUILD_PYTHON python
 
-# Check out TensorFlow source code if --build_arg CHECKOUT_TENSORFLOW=1
+# CACHE_STOP is used to rerun future commands, otherwise cloning tensorflow will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
+# Check out TensorFlow source code if --build-arg CHECKOUT_TF_SRC=1
 ARG CHECKOUT_TF_SRC=0
-RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src || true
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/devel-nvidia.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/devel-nvidia.partial.Dockerfile
index 95f9875012d2a552be4af6f59cb6a5c60d99dce5..cf3e38b8c8b1072c5c1633003d4f1669192ce191 100644
--- a/tensorflow/tools/dockerfiles/partials/ubuntu/devel-nvidia.partial.Dockerfile
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu/devel-nvidia.partial.Dockerfile
@@ -1,61 +1,60 @@
-FROM nvidia/cuda:9.0-base-ubuntu${UBUNTU_VERSION} as base
+ARG ARCH=
+ARG CUDA=10.0
+FROM nvidia/cuda${ARCH:+-$ARCH}:${CUDA}-base-ubuntu${UBUNTU_VERSION} as base
+# ARCH and CUDA are specified again because the FROM directive resets ARGs
+# (but their default value is retained if set previously)
+ARG ARCH
+ARG CUDA
+ARG CUDNN=7.4.1.5-1
+ARG CUDNN_MAJOR_VERSION=7
+ARG LIB_DIR_PREFIX=x86_64
 
+# Needed for string substitution 
+SHELL ["/bin/bash", "-c"]
 RUN apt-get update && apt-get install -y --no-install-recommends \
         build-essential \
-        cuda-command-line-tools-9-0 \
-        cuda-cublas-dev-9-0 \
-        cuda-cudart-dev-9-0 \
-        cuda-cufft-dev-9-0 \
-        cuda-curand-dev-9-0 \
-        cuda-cusolver-dev-9-0 \
-        cuda-cusparse-dev-9-0 \
-        curl \
-        git \
-        libcudnn7=7.2.1.38-1+cuda9.0 \
-        libcudnn7-dev=7.2.1.38-1+cuda9.0 \
-        libnccl2=2.2.13-1+cuda9.0 \
-        libnccl-dev=2.2.13-1+cuda9.0 \
+        cuda-command-line-tools-${CUDA/./-} \
+        cuda-cublas-dev-${CUDA/./-} \
+        cuda-cudart-dev-${CUDA/./-} \
+        cuda-cufft-dev-${CUDA/./-} \
+        cuda-curand-dev-${CUDA/./-} \
+        cuda-cusolver-dev-${CUDA/./-} \
+        cuda-cusparse-dev-${CUDA/./-} \
+        libcudnn7=${CUDNN}+cuda${CUDA} \
+        libcudnn7-dev=${CUDNN}+cuda${CUDA} \
         libcurl3-dev \
         libfreetype6-dev \
         libhdf5-serial-dev \
-        libpng12-dev \
         libzmq3-dev \
         pkg-config \
-        python-dev \
         rsync \
         software-properties-common \
         unzip \
         zip \
         zlib1g-dev \
         wget \
+        git \
         && \
-    rm -rf /var/lib/apt/lists/* && \
-    find /usr/local/cuda-9.0/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
-    rm /usr/lib/x86_64-linux-gnu/libcudnn_static_v7.a
-
-RUN apt-get update && \
-        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 && \
-        apt-get update && \
-        apt-get install libnvinfer4=4.1.2-1+cuda9.0 && \
-        apt-get install libnvinfer-dev=4.1.2-1+cuda9.0
+    find /usr/local/cuda-${CUDA}/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
+    rm /usr/lib/${LIB_DIR_PREFIX}-linux-gnu/libcudnn_static_v7.a
 
-# Link NCCL libray and header where the build script expects them.
-RUN mkdir /usr/local/cuda-9.0/lib &&  \
-    ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/local/cuda/lib/libnccl.so.2 && \
-    ln -s /usr/include/nccl.h /usr/local/cuda/include/nccl.h
+RUN [[ "${ARCH}" = "ppc64le" ]] || { apt-get update && \
+        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda${CUDA} \
+        && apt-get update \
+        && apt-get install -y --no-install-recommends libnvinfer5=5.0.2-1+cuda${CUDA} \
+        && apt-get clean \
+        && rm -rf /var/lib/apt/lists/*; }
 
 # Configure the build for our CUDA configuration.
 ENV CI_BUILD_PYTHON python
 ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
 ENV TF_NEED_CUDA 1
-ENV TF_NEED_TENSORRT 1
+ENV TF_NEED_TENSORRT 0
 ENV TF_CUDA_COMPUTE_CAPABILITIES=3.5,5.2,6.0,6.1,7.0
-ENV TF_CUDA_VERSION=9.0
-ENV TF_CUDNN_VERSION=7
-
-# NCCL 2.x
-ENV TF_NCCL_VERSION=2
-
-# Check out TensorFlow source code if --build_arg CHECKOUT_TENSORFLOW=1
+ENV TF_CUDA_VERSION=${CUDA}
+ENV TF_CUDNN_VERSION=${CUDNN_MAJOR_VERSION}
+# CACHE_STOP is used to rerun future commands, otherwise cloning tensorflow will be cached and will not pull the most recent version
+ARG CACHE_STOP=1
+# Check out TensorFlow source code if --build-arg CHECKOUT_TF_SRC=1
 ARG CHECKOUT_TF_SRC=0
-RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src || true
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/nvidia.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/nvidia.partial.Dockerfile
index 1dc8e43aaddc606efde2cbd84215f7ef7131e251..041ee87839938b80489f750530c47c8519ab6171 100644
--- a/tensorflow/tools/dockerfiles/partials/ubuntu/nvidia.partial.Dockerfile
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu/nvidia.partial.Dockerfile
@@ -1,32 +1,38 @@
-FROM nvidia/cuda:9.0-base-ubuntu${UBUNTU_VERSION} as base
+ARG ARCH=
+ARG CUDA=10.0
+FROM nvidia/cuda${ARCH:+-$ARCH}:${CUDA}-base-ubuntu${UBUNTU_VERSION} as base
+# ARCH and CUDA are specified again because the FROM directive resets ARGs
+# (but their default value is retained if set previously)
+ARG ARCH
+ARG CUDA
+ARG CUDNN=7.4.1.5-1
 
+# Needed for string substitution 
+SHELL ["/bin/bash", "-c"]
+# Pick up some TF dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
         build-essential \
-        cuda-command-line-tools-9-0 \
-        cuda-cublas-9-0 \
-        cuda-cufft-9-0 \
-        cuda-curand-9-0 \
-        cuda-cusolver-9-0 \
-        cuda-cusparse-9-0 \
+        cuda-command-line-tools-${CUDA/./-} \
+        cuda-cublas-${CUDA/./-} \
+        cuda-cufft-${CUDA/./-} \
+        cuda-curand-${CUDA/./-} \
+        cuda-cusolver-${CUDA/./-} \
+        cuda-cusparse-${CUDA/./-} \
         curl \
-        libcudnn7=7.2.1.38-1+cuda9.0 \
-        libnccl2=2.2.13-1+cuda9.0 \
+        libcudnn7=${CUDNN}+cuda${CUDA} \
         libfreetype6-dev \
         libhdf5-serial-dev \
-        libpng12-dev \
         libzmq3-dev \
         pkg-config \
-        rsync \
         software-properties-common \
-        unzip \
-        && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
+        unzip
 
-RUN apt-get update && \
-        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 && \
-        apt-get update && \
-        apt-get install libnvinfer4=4.1.2-1+cuda9.0
+RUN [ ${ARCH} = ppc64le ] || (apt-get update && \
+        apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda${CUDA} \
+        && apt-get update \
+        && apt-get install -y --no-install-recommends libnvinfer5=5.0.2-1+cuda${CUDA} \
+        && apt-get clean \
+        && rm -rf /var/lib/apt/lists/*)
 
 # For CUDA profiling, TensorFlow requires CUPTI.
 ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
diff --git a/tensorflow/tools/dockerfiles/spec.yml b/tensorflow/tools/dockerfiles/spec.yml
index 19d96e7a3df4468ff82f2029a1945a02b1e58932..6fddfe000c60dadd05ff172d4cb036e648377deb 100644
--- a/tensorflow/tools/dockerfiles/spec.yml
+++ b/tensorflow/tools/dockerfiles/spec.yml
@@ -55,6 +55,8 @@ releases:
         tag_specs:
             - "{ubuntu}{jupyter}"
             - "{ubuntu-devel}{jupyter}"
+            - "{ubuntu-ppc64le}{jupyter}"
+            - "{ubuntu-devel-ppc64le}{jupyter}"
 
 slice_sets:
 
@@ -122,6 +124,70 @@ slice_sets:
           args:
               - CHECKOUT_TF_SRC=1
 
+    ubuntu-ppc64le:
+        - add_to_name: "-ppc64le"
+          dockerfile_exclusive_name: "cpu-ppc64le"
+          dockerfile_subdirectory: "ppc64le"
+          args:
+              - UBUNTU_VERSION=18.04
+          partials:
+              - ubuntu/version
+              - ubuntu/cpu
+              - ubuntu/python
+              - tensorflow-ppc64le
+              - shell
+        - add_to_name: "-gpu-ppc64le"
+          dockerfile_exclusive_name: "gpu-ppc64le"
+          dockerfile_subdirectory: "ppc64le"
+          args:
+              - UBUNTU_VERSION=18.04
+              - ARCH=ppc64le
+              - CUDA=10.0
+              - TF_PACKAGE=tensorflow-gpu
+          partials:
+              - ubuntu/version
+              - ubuntu/nvidia
+              - ubuntu/python
+              - tensorflow-ppc64le
+              - shell
+          tests:
+              - import-gpu.sh
+          test_runtime: nvidia
+
+    ubuntu-devel-ppc64le:
+        - add_to_name: "devel-ppc64le"
+          dockerfile_exclusive_name: "devel-cpu-ppc64le"
+          dockerfile_subdirectory: "ppc64le"
+          partials:
+              - ubuntu/version
+              - ubuntu/devel-cpu
+              - ubuntu/python
+              - ubuntu/bazelbuild
+              - shell
+          tests:
+              - build-cpu.sh
+          args:
+              - UBUNTU_VERSION=18.04
+              - CHECKOUT_TF_SRC=1
+        - add_to_name: "devel-gpu-ppc64le"
+          dockerfile_exclusive_name: "devel-gpu-ppc64le"
+          dockerfile_subdirectory: "ppc64le"
+          args:
+              - UBUNTU_VERSION=18.04
+              - ARCH=ppc64le
+              - CUDA=10.0
+              - LIB_DIR_PREFIX=powerpc64le
+              - CHECKOUT_TF_SRC=1
+          partials:
+              - ubuntu/version
+              - ubuntu/devel-nvidia
+              - ubuntu/python
+              - ubuntu/bazelbuild
+              - shell
+          tests:
+              - build-gpu.sh
+          test_runtime: nvidia
+
     nightly:
         - add_to_name: "nightly"
           partials:
diff --git a/tensorflow/tools/dockerfiles/tools.Dockerfile b/tensorflow/tools/dockerfiles/tools.Dockerfile
index e8929295a5ee397acbe46ebf96894174ca01fca2..a96b2578cba7579c605d25ee6068d2cde278e1f4 100644
--- a/tensorflow/tools/dockerfiles/tools.Dockerfile
+++ b/tensorflow/tools/dockerfiles/tools.Dockerfile
@@ -17,7 +17,7 @@
 #
 # You can use this image to quickly develop changes to the Dockerfile assembler
 # or set of TF Docker partials. See README.md for usage instructions.
-FROM debian:stretch
+FROM ubuntu:16.04
 LABEL maintainer="Austin Anderson <angerson@google.com>"
 
 RUN apt-get update && apt-get install -y python3 python3-pip bash curl
diff --git a/tensorflow/tools/docs/BUILD b/tensorflow/tools/docs/BUILD
index b072853a4ec298ce5c15afc1307a966ecefb743f..cc106b5955ba07f4f166638ba51699060788e6ae 100644
--- a/tensorflow/tools/docs/BUILD
+++ b/tensorflow/tools/docs/BUILD
@@ -81,7 +81,7 @@ py_library(
     srcs_version = "PY2AND3",
 )
 
-py_binary(
+py_library(
     name = "generate_lib",
     srcs = ["generate_lib.py"],
     srcs_version = "PY2AND3",
@@ -155,7 +155,7 @@ py_test(
         "optonly",
     ],
     deps = [
-        ":generate2",
+        ":generate2_lib",
     ],
 )
 
@@ -163,7 +163,17 @@ py_binary(
     name = "generate2",
     srcs = ["generate2.py"],
     srcs_version = "PY2AND3",
-    deps = ["//tensorflow:tensorflow_py"],
+    deps = [":generate2_lib"],
+)
+
+py_library(
+    name = "generate2_lib",
+    srcs = ["generate2.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/python:util",
+    ],
 )
 
 py_library(
diff --git a/tensorflow/tools/docs/generate2.py b/tensorflow/tools/docs/generate2.py
index fba909d26defffad2d7dbaffa4463695685ae50c..0a50eb6c2392b37932705b6481055d49d66417b2 100644
--- a/tensorflow/tools/docs/generate2.py
+++ b/tensorflow/tools/docs/generate2.py
@@ -31,10 +31,22 @@ from os import path
 
 from absl import app
 from absl import flags
-
 import tensorflow as tf
 
+from tensorflow_docs.api_generator import doc_generator_visitor
 from tensorflow_docs.api_generator import generate_lib
+from tensorflow_docs.api_generator import parser
+
+from tensorflow.python.util import tf_export
+from tensorflow.python.util import tf_inspect
+
+# Use tensorflow's `tf_inspect`, which is aware of `tf_decorator`.
+parser.tf_inspect = tf_inspect
+
+# `tf` has an `__all__` that doesn't list important things like `keras`.
+# The doc generator recognizes `__all__` as the list of public symbols.
+# So patch `tf.__all__` to list everything.
+tf.__all__ = [item_name for item_name, value in tf_inspect.getmembers(tf)]
 
 FLAGS = flags.FLAGS
 
@@ -50,6 +62,28 @@ flags.DEFINE_string(
 flags.DEFINE_bool("search_hints", True,
                   "Include meta-data search hints at the top of each file.")
 
+flags.DEFINE_string("site_path", "",
+                    "The prefix ({site-path}/api_docs/python/...) used in the "
+                    "`_toc.yaml` and `_redirects.yaml` files")
+
+
+# The doc generator isn't aware of tf_export.
+# So prefix the score tuples with -1 when this is the canonical name, +1
+# otherwise. The generator chooses the name with the lowest score.
+class TfExportAwareDocGeneratorVisitor(
+    doc_generator_visitor.DocGeneratorVisitor):
+  """A `tf_export` aware doc_visitor."""
+
+  def _score_name(self, name):
+    canonical = tf_export.get_canonical_name_for_symbol(self._index[name])
+
+    canonical_score = 1
+    if canonical is not None and name == "tf." + canonical:
+      canonical_score = -1
+
+    scores = super(TfExportAwareDocGeneratorVisitor, self)._score_name(name)
+    return (canonical_score,) + scores
+
 
 def build_docs(output_dir, code_url_prefix, search_hints=True):
   """Build api docs for tensorflow v2.
@@ -66,7 +100,8 @@ def build_docs(output_dir, code_url_prefix, search_hints=True):
       base_dir=base_dir,
       search_hints=search_hints,
       code_url_prefix=code_url_prefix,
-      site_path="api_docs/")
+      site_path=FLAGS.site_path,
+      visitor_cls=TfExportAwareDocGeneratorVisitor)
 
   doc_generator.build(output_dir)
 
diff --git a/tensorflow/tools/docs/parser.py b/tensorflow/tools/docs/parser.py
index 83b4bf812881f423195f65cc98dc8f3189af3931..ff32d089514c0b514377e064b5af74555eb273af 100644
--- a/tensorflow/tools/docs/parser.py
+++ b/tensorflow/tools/docs/parser.py
@@ -1681,7 +1681,7 @@ def _get_defined_in(py_object, parser_config):
     path = path[:-1]
 
   # Never include links outside this code base.
-  if path.startswith('..'):
+  if path.startswith('..') or re.search(r'\b_api\b', path):
     return None
 
   if re.match(r'.*/gen_[^/]*\.py$', path):
diff --git a/tensorflow/tools/git/gen_git_source.py b/tensorflow/tools/git/gen_git_source.py
index 8e7cd9b10415740a554445edbb634706dd97857c..4d52c1fccf957e201ac64c1964a9822aad255815 100755
--- a/tensorflow/tools/git/gen_git_source.py
+++ b/tensorflow/tools/git/gen_git_source.py
@@ -29,8 +29,8 @@ from __future__ import print_function
 import argparse
 import json
 import os
-import subprocess
 import shutil
+import subprocess
 
 
 def parse_branch_ref(filename):
@@ -159,12 +159,14 @@ def get_git_version(git_base_path, git_tag_override):
   """
   unknown_label = b"unknown"
   try:
+    # Force to bytes so this works on python 2 and python 3
     val = bytes(subprocess.check_output([
         "git", str("--git-dir=%s/.git" % git_base_path),
         str("--work-tree=" + git_base_path), "describe", "--long", "--tags"
     ]).strip())
+    version_separator = b"-"
     if git_tag_override and val:
-      split_val = val.split("-")
+      split_val = val.split(version_separator)
       if len(split_val) < 3:
         raise Exception(
             ("Expected git version in format 'TAG-COMMITS AFTER TAG-HASH' "
@@ -172,8 +174,8 @@ def get_git_version(git_base_path, git_tag_override):
       # There might be "-" in the tag name. But we can be sure that the final
       # two "-" are those inserted by the git describe command.
       abbrev_commit = split_val[-1]
-      val = bytes(
-          "-".join([git_tag_override, "0", abbrev_commit]))
+      val = version_separator.join(
+          [bytes(git_tag_override, "utf-8"), b"0", abbrev_commit])
     return val if val else unknown_label
   except (subprocess.CalledProcessError, OSError):
     return unknown_label
@@ -187,7 +189,7 @@ def write_version_info(filename, git_version):
     git_version: the result of a git describe.
   """
   if b"\"" in git_version or b"\\" in git_version:
-    git_version = "git_version_is_invalid"  # do not cause build to fail!
+    git_version = b"git_version_is_invalid"  # do not cause build to fail!
   contents = """/*  Generated by gen_git_source.py  */
 #include <string>
 const char* tf_git_version() {return "%s";}
@@ -214,7 +216,7 @@ const int tf_monolithic_build() {
   return 0;
 #endif
 }
-""" % git_version
+""" % git_version.decode("utf-8")
   open(filename, "w").write(contents)
 
 
diff --git a/tensorflow/tools/graph_transforms/BUILD b/tensorflow/tools/graph_transforms/BUILD
index eb1ed1f2ca859df42809084c1ea47a6f3b21012e..2145b3b0d5bfb788cea05c348f4fb881f7d12fb7 100644
--- a/tensorflow/tools/graph_transforms/BUILD
+++ b/tensorflow/tools/graph_transforms/BUILD
@@ -12,6 +12,7 @@ load(
     "tf_cc_binary",
     "tf_cc_test",
     "tf_py_test",
+    "if_not_v2",
 )
 
 exports_files(["LICENSE"])
@@ -131,12 +132,35 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:tensorflow",
-        "//tensorflow/contrib/rnn:gru_ops_op_lib",
-        "//tensorflow/contrib/rnn:lstm_ops_op_lib",
         "//tensorflow/core/kernels:quantization_utils",
     ] + if_not_windows([
         "//tensorflow/core/kernels:remote_fused_graph_rewriter_transform",
         "//tensorflow/core/kernels/hexagon:hexagon_rewriter_transform",
+        "//tensorflow/core:sparse_ops_op_lib",
+        "//tensorflow/core:parsing_ops_op_lib",
+        "//tensorflow/core:sendrecv_ops_op_lib",
+        "//tensorflow/core:io_ops_op_lib",
+        "//tensorflow/core:logging_ops_op_lib",
+        "//tensorflow/core:lookup_ops_op_lib",
+        "//tensorflow/core:data_flow_ops_op_lib",
+        "//tensorflow/core:no_op_op_lib",
+        "//tensorflow/core:state_ops_op_lib",
+        "//tensorflow/core:user_ops_op_lib",
+        "//tensorflow/core:training_ops_op_lib",
+        "//tensorflow/core:string_ops_op_lib",
+        "//tensorflow/core:remote_fused_graph_ops_op_lib",
+        "//tensorflow/core:random_ops_op_lib",
+        "//tensorflow/core:nn_ops_op_lib",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:manip_ops_op_lib",
+        "//tensorflow/core:list_ops_op_lib",
+        "//tensorflow/core:functional_ops_op_lib",
+        "//tensorflow/core:control_flow_ops_op_lib",
+        "//tensorflow/core:candidate_sampling_ops_op_lib",
+        "//tensorflow/core:array_ops_op_lib",
+    ]) + if_not_v2([
+        "//tensorflow/contrib/rnn:gru_ops_op_lib",
+        "//tensorflow/contrib/rnn:lstm_ops_op_lib",
     ]),
     alwayslink = 1,
 )
@@ -173,6 +197,7 @@ tf_cc_test(
         ":transforms_lib",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/cc:sendrecv_ops",
+        "//tensorflow/core:bitwise_ops_op_lib",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
diff --git a/tensorflow/tools/graph_transforms/README.md b/tensorflow/tools/graph_transforms/README.md
index 9f6f553ba1e4c69f1b01d7686f043345be953ef2..c5c0f2da896378405b8b0da72935d5d677cfe741 100644
--- a/tensorflow/tools/graph_transforms/README.md
+++ b/tensorflow/tools/graph_transforms/README.md
@@ -1086,7 +1086,7 @@ in the future.
 
 The Graph Transform Tool associates names of transforms with the code to
 implement them using the `REGISTER_GRAPH_TRANSFORM()` macro. This takes a string
-and a function, and automagically registers the transform with the tool. You
+and a function, and automatically registers the transform with the tool. You
 will need to watch out for a few things though:
 
 *   Because it's using global C++ objects in each file under the hood, the
diff --git a/tensorflow/tools/graph_transforms/fold_batch_norms.cc b/tensorflow/tools/graph_transforms/fold_batch_norms.cc
index 16a0f7d58df66be06224d58de623ee7e2dc41880..f59a7abbea93d7b9c838938689009d4d90c68095 100644
--- a/tensorflow/tools/graph_transforms/fold_batch_norms.cc
+++ b/tensorflow/tools/graph_transforms/fold_batch_norms.cc
@@ -37,7 +37,7 @@ Status FoldBatchNorms(const GraphDef& input_graph_def,
       input_graph_def,  // clang-format off
       {"Mul",                // mul_node
         {
-          {"Conv2D|MatMul",  // conv_node
+          {"Conv2D|MatMul|DepthwiseConv2dNative",  // conv_node
             {
               {"*"},         // input_node
               {"Const"},     // weights_node
@@ -72,8 +72,15 @@ Status FoldBatchNorms(const GraphDef& input_graph_def,
 
         // Make sure all the inputs really are vectors, with as many entries as
         // there are columns in the weights.
-        const int weights_cols_index = conv_node.op() == "Conv2D" ? 3 : 1;
-        const int64 weights_cols = weights.shape().dim_size(weights_cols_index);
+        int64 weights_cols;
+        if (conv_node.op() == "Conv2D") {
+          weights_cols = weights.shape().dim_size(3);
+        } else if (conv_node.op() == "DepthwiseConv2dNative") {
+          weights_cols =
+              weights.shape().dim_size(2) * weights.shape().dim_size(3);
+        } else {
+          weights_cols = weights.shape().dim_size(1);
+        }
         if ((mul_values.shape().dims() != 1) ||
             (mul_values.shape().dim_size(0) != weights_cols)) {
           return errors::InvalidArgument(
@@ -82,14 +89,13 @@ Status FoldBatchNorms(const GraphDef& input_graph_def,
         }
 
         // Multiply the original weights by the scale vector.
-        auto weights_matrix = weights.flat_inner_dims<float>();
+        auto weights_vector = weights.flat<float>();
         Tensor scaled_weights(DT_FLOAT, weights.shape());
-        auto scaled_weights_matrix = scaled_weights.flat_inner_dims<float>();
-        for (int64 row = 0; row < weights_matrix.dimension(0); ++row) {
-          for (int64 col = 0; col < weights_cols; ++col) {
-            scaled_weights_matrix(row, col) =
-                weights_matrix(row, col) * mul_values.flat<float>()(col);
-          }
+        auto scaled_weights_vector = scaled_weights.flat<float>();
+        for (int64 row = 0; row < weights_vector.dimension(0); ++row) {
+          scaled_weights_vector(row) =
+              weights_vector(row) *
+              mul_values.flat<float>()(row % weights_cols);
         }
 
         // Construct the new nodes.
diff --git a/tensorflow/tools/graph_transforms/fold_batch_norms_test.cc b/tensorflow/tools/graph_transforms/fold_batch_norms_test.cc
index a5d541feb6f4dbfd5a0f61b171fd05160a6d67c8..885fbd59b7797c35639d0a33dbb895d8589b6b4d 100644
--- a/tensorflow/tools/graph_transforms/fold_batch_norms_test.cc
+++ b/tensorflow/tools/graph_transforms/fold_batch_norms_test.cc
@@ -87,6 +87,57 @@ class FoldBatchNormsTest : public ::testing::Test {
     }
   }
 
+  void TestFoldBatchNormsDepthwiseConv2dNative() {
+    auto root = tensorflow::Scope::NewRootScope();
+    using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
+
+    Tensor input_data(DT_FLOAT, TensorShape({1, 1, 6, 2}));
+    test::FillValues<float>(
+        &input_data, {1.0f, 4.0f, 2.0f, 5.0f, 3.0f, 6.0f, -1.0f, -4.0f, -2.0f,
+                      -5.0f, -3.0f, -6.0f});
+    Output input_op =
+        Const(root.WithOpName("input_op"), Input::Initializer(input_data));
+
+    Tensor weights_data(DT_FLOAT, TensorShape({1, 2, 2, 2}));
+    test::FillValues<float>(&weights_data,
+                            {1.0f, 2.0f, 3.0f, 4.0f, 0.1f, 0.2f, 0.3f, 0.4f});
+    Output weights_op =
+        Const(root.WithOpName("weights_op"), Input::Initializer(weights_data));
+
+    Output conv_op = DepthwiseConv2dNative(root.WithOpName("conv_op"), input_op,
+                                           weights_op, {1, 1, 1, 1}, "VALID");
+
+    Tensor mul_values_data(DT_FLOAT, TensorShape({4}));
+    test::FillValues<float>(&mul_values_data, {2.0f, 3.0f, 4.0f, 5.0f});
+    Output mul_values_op = Const(root.WithOpName("mul_values"),
+                                 Input::Initializer(mul_values_data));
+
+    Output mul_op = Mul(root.WithOpName("output"), conv_op, mul_values_op);
+
+    GraphDef original_graph_def;
+    TF_ASSERT_OK(root.ToGraphDef(&original_graph_def));
+
+    std::unique_ptr<Session> original_session(NewSession(SessionOptions()));
+    TF_ASSERT_OK(original_session->Create(original_graph_def));
+    std::vector<Tensor> original_outputs;
+    TF_ASSERT_OK(original_session->Run({}, {"output"}, {}, &original_outputs));
+
+    GraphDef fused_graph_def;
+    TF_ASSERT_OK(
+        FoldBatchNorms(original_graph_def, {{}, {"output"}}, &fused_graph_def));
+
+    std::unique_ptr<Session> fused_session(NewSession(SessionOptions()));
+    TF_ASSERT_OK(fused_session->Create(fused_graph_def));
+    std::vector<Tensor> fused_outputs;
+    TF_ASSERT_OK(fused_session->Run({}, {"output"}, {}, &fused_outputs));
+
+    test::ExpectTensorNear<float>(original_outputs[0], fused_outputs[0], 1e-5);
+
+    for (const NodeDef& node : fused_graph_def.node()) {
+      EXPECT_NE("Mul", node.op());
+    }
+  }
+
   void TestFoldBatchNormsConv2DShared() {
     auto root = tensorflow::Scope::NewRootScope();
     using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
@@ -202,6 +253,9 @@ TEST_F(FoldBatchNormsTest, TestFoldBatchNormsConv2D) {
 TEST_F(FoldBatchNormsTest, TestFoldBatchNormsMatMul) {
   TestFoldBatchNormsMatMul();
 }
+TEST_F(FoldBatchNormsTest, TestFoldBatchNormsDepthwiseConv2dNative) {
+  TestFoldBatchNormsDepthwiseConv2dNative();
+}
 
 }  // namespace graph_transforms
 }  // namespace tensorflow
diff --git a/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc b/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
index fd546f812c0dafc5d2e71c94710c3c3f5b75250e..532b4600973cbc2ef2826be1bf551984a1f1f8d6 100644
--- a/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
+++ b/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
@@ -109,24 +109,29 @@ Status FuseScaleOffsetToConvWeights(const std::vector<float>& scale_values,
                                     const string& conv_output_name,
                                     std::vector<NodeDef>* new_nodes) {
   const NodeDef& conv_node = conv_node_match.node;
-  CHECK_EQ("Conv2D", conv_node.op());
+  // CHECK_EQ("Conv2D", conv_node.op());
   const NodeDef& input_node = conv_node_match.inputs[0].node;
   const NodeDef& weights_node = conv_node_match.inputs[1].node;
   CHECK_EQ("Const", weights_node.op());
 
   Tensor weights = GetNodeTensorAttr(weights_node, "value");
-  const int64 weights_cols = weights.shape().dim_size(3);
+  int64 weights_cols;
+  if (conv_node.op() == "Conv2D") {
+    weights_cols = weights.shape().dim_size(3);
+  } else if (conv_node.op() == "DepthwiseConv2dNative") {
+    weights_cols = weights.shape().dim_size(2) * weights.shape().dim_size(3);
+  } else {
+    weights_cols = weights.shape().dim_size(1);
+  }
   CHECK_EQ(weights_cols, scale_values.size());
 
   // Multiply the original weights by the scale vector.
-  auto weights_matrix = weights.flat_inner_dims<float>();
+  auto weights_vector = weights.flat<float>();
   Tensor scaled_weights(DT_FLOAT, weights.shape());
-  auto scaled_weights_matrix = scaled_weights.flat_inner_dims<float>();
-  for (int64 row = 0; row < weights_matrix.dimension(0); ++row) {
-    for (int64 col = 0; col < weights_cols; ++col) {
-      scaled_weights_matrix(row, col) =
-          weights_matrix(row, col) * scale_values[col];
-    }
+  auto scaled_weights_vector = scaled_weights.flat<float>();
+  for (int64 row = 0; row < weights_vector.dimension(0); ++row) {
+    scaled_weights_vector(row) =
+        weights_vector(row) * scale_values[row % weights_cols];
   }
   // Figure out the remaining bias to add on.
   Tensor bias_offset(DT_FLOAT, {weights_cols});
@@ -158,7 +163,7 @@ Status FuseScaleOffsetToConvWeights(const std::vector<float>& scale_values,
   NodeDef bias_add_node;
   bias_add_node.set_op("BiasAdd");
   bias_add_node.set_name(conv_output_name);
-  if (conv_node.attr().count("data_format") > 0) {
+  if (!conv_node.attr().count("data_format")) {
     CopyNodeAttr(conv_node, "data_format", "data_format", &bias_add_node);
   }
   CopyNodeAttr(conv_node, "T", "T", &bias_add_node);
@@ -185,7 +190,7 @@ Status FuseBatchNormWithConv(const NodeMatch& match,
 }
 
 Status FuseBatchNormWithBatchToSpace(const NodeMatch& match,
-                             std::vector<NodeDef>* new_nodes) {
+                                     std::vector<NodeDef>* new_nodes) {
   // Calculate the scale and offset values to apply.
   std::vector<float> scale_values;
   std::vector<float> offset_values;
@@ -200,9 +205,8 @@ Status FuseBatchNormWithBatchToSpace(const NodeMatch& match,
   const NodeDef& conv_node = conv_node_match.node;
 
   string biasadd_name = conv_node.name() + "/biasadd";
-  TF_RETURN_IF_ERROR(
-      FuseScaleOffsetToConvWeights(scale_values, offset_values, conv_node_match,
-                                   biasadd_name , new_nodes));
+  TF_RETURN_IF_ERROR(FuseScaleOffsetToConvWeights(
+      scale_values, offset_values, conv_node_match, biasadd_name, new_nodes));
 
   NodeDef new_batch_to_space_node = batch_to_space_node;
   // reuse batch_norm node name
@@ -292,7 +296,7 @@ Status FoldOldBatchNorms(const GraphDef& input_graph_def,
         current_graph_def,  // clang-format off
       {"BatchNormWithGlobalNormalization|FusedBatchNorm",    // batch_norm_node
         {
-          {"Conv2D",                          // conv_node
+          {"Conv2D|DepthwiseConv2dNative",                          // conv_node
             {
               {"*"},                          // input_node
               {"Const"},                      // weights_node
@@ -325,7 +329,7 @@ Status FoldOldBatchNorms(const GraphDef& input_graph_def,
          {
              {"BatchToSpaceND",                  // batch_to_space_node
               {
-                  {"Conv2D",                     // conv_node
+                  {"Conv2D|DepthwiseConv2dNative",                     // conv_node
                    {
                        {"*"},                    // input_node
                        {"Const"},                // weights_node
@@ -363,13 +367,13 @@ Status FoldOldBatchNorms(const GraphDef& input_graph_def,
         {
           {"ConcatV2|Concat",                     // concat two conv2d.
             {
-              {"Conv2D",                          // conv_node
+              {"Conv2D|DepthwiseConv2dNative",                          // conv_node
                 {
                   {"*"},                          // input_node
                   {"Const"},                      // weights_node
                 }
               },
-              {"Conv2D",                          // conv_node
+              {"Conv2D|DepthwiseConv2dNative",                          // conv_node
                 {
                   {"*"},                          // input_node
                   {"Const"},                      // weights_node
diff --git a/tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc b/tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc
index 435f46c107cd9b0a6d64d4c0d52607ec5f41eb4f..c5fa9b16b0c91e6c069462f0663908737bb6f835 100644
--- a/tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc
+++ b/tensorflow/tools/graph_transforms/fold_old_batch_norms_test.cc
@@ -121,6 +121,84 @@ class FoldOldBatchNormsTest : public ::testing::Test {
     }
   }
 
+  void TestFoldOldBatchNormsAfterDepthwiseConv2dNative() {
+    auto root = tensorflow::Scope::NewRootScope();
+    using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
+
+    Tensor input_data(DT_FLOAT, TensorShape({1, 1, 6, 2}));
+    test::FillValues<float>(
+        &input_data, {1.0f, 4.0f, 2.0f, 5.0f, 3.0f, 6.0f, -1.0f, -4.0f, -2.0f,
+                      -5.0f, -3.0f, -6.0f});
+    Output input_op =
+        Const(root.WithOpName("input_op"), Input::Initializer(input_data));
+
+    Tensor weights_data(DT_FLOAT, TensorShape({1, 2, 2, 2}));
+    test::FillValues<float>(&weights_data,
+                            {1.0f, 2.0f, 3.0f, 4.0f, 0.1f, 0.2f, 0.3f, 0.4f});
+    Output weights_op =
+        Const(root.WithOpName("weights_op"), Input::Initializer(weights_data));
+
+    Output conv_op = DepthwiseConv2dNative(root.WithOpName("conv_op"), input_op,
+                                           weights_op, {1, 1, 1, 1}, "VALID");
+
+    Tensor mean_data(DT_FLOAT, TensorShape({4}));
+    test::FillValues<float>(&mean_data, {10.0f, 20.0f, 30.0f, 40.0f});
+    Output mean_op =
+        Const(root.WithOpName("mean_op"), Input::Initializer(mean_data));
+
+    Tensor variance_data(DT_FLOAT, TensorShape({4}));
+    test::FillValues<float>(&variance_data, {0.25f, 0.5f, 0.75f, 1.0f});
+    Output variance_op = Const(root.WithOpName("variance_op"),
+                               Input::Initializer(variance_data));
+
+    Tensor beta_data(DT_FLOAT, TensorShape({4}));
+    test::FillValues<float>(&beta_data, {0.1f, 0.6f, 1.1f, 1.6f});
+    Output beta_op =
+        Const(root.WithOpName("beta_op"), Input::Initializer(beta_data));
+
+    Tensor gamma_data(DT_FLOAT, TensorShape({4}));
+    test::FillValues<float>(&gamma_data, {1.0f, 2.0f, 3.0f, 4.0f});
+    Output gamma_op =
+        Const(root.WithOpName("gamma_op"), Input::Initializer(gamma_data));
+
+    GraphDef original_graph_def;
+    TF_ASSERT_OK(root.ToGraphDef(&original_graph_def));
+
+    NodeDef batch_norm_node;
+    batch_norm_node.set_op("BatchNormWithGlobalNormalization");
+    batch_norm_node.set_name("output");
+    AddNodeInput("conv_op", &batch_norm_node);
+    AddNodeInput("mean_op", &batch_norm_node);
+    AddNodeInput("variance_op", &batch_norm_node);
+    AddNodeInput("beta_op", &batch_norm_node);
+    AddNodeInput("gamma_op", &batch_norm_node);
+    SetNodeAttr("T", DT_FLOAT, &batch_norm_node);
+    SetNodeAttr("variance_epsilon", 0.00001f, &batch_norm_node);
+    SetNodeAttr("scale_after_normalization", false, &batch_norm_node);
+    *(original_graph_def.mutable_node()->Add()) = batch_norm_node;
+    original_graph_def.mutable_versions()->set_producer(8);
+
+    std::unique_ptr<Session> original_session(NewSession(SessionOptions()));
+    TF_ASSERT_OK(original_session->Create(original_graph_def));
+    std::vector<Tensor> original_outputs;
+    TF_ASSERT_OK(original_session->Run({}, {"output"}, {}, &original_outputs));
+
+    GraphDef fused_graph_def;
+    TF_ASSERT_OK(FoldOldBatchNorms(original_graph_def, {{}, {"output"}},
+                                   &fused_graph_def));
+
+    std::unique_ptr<Session> fused_session(NewSession(SessionOptions()));
+    TF_ASSERT_OK(fused_session->Create(fused_graph_def));
+    std::vector<Tensor> fused_outputs;
+    TF_ASSERT_OK(fused_session->Run({}, {"output"}, {}, &fused_outputs));
+
+    test::ExpectTensorNear<float>(original_outputs[0], fused_outputs[0], 1e-5);
+
+    for (const NodeDef& node : fused_graph_def.node()) {
+      EXPECT_NE("BatchNormWithGlobalNormalization", node.op());
+    }
+  }
+
   void TestFoldFusedBatchNorms() {
     auto root = tensorflow::Scope::NewRootScope();
     using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
@@ -198,6 +276,83 @@ class FoldOldBatchNormsTest : public ::testing::Test {
     }
   }
 
+  void TestFoldFusedBatchNormsAfterDepthwiseConv2dNative() {
+    auto root = tensorflow::Scope::NewRootScope();
+    using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
+
+    Tensor input_data(DT_FLOAT, TensorShape({1, 1, 6, 2}));
+    test::FillValues<float>(
+        &input_data, {1.0f, 4.0f, 2.0f, 5.0f, 3.0f, 6.0f, -1.0f, -4.0f, -2.0f,
+                      -5.0f, -3.0f, -6.0f});
+    Output input_op =
+        Const(root.WithOpName("input_op"), Input::Initializer(input_data));
+
+    Tensor weights_data(DT_FLOAT, TensorShape({1, 2, 2, 2}));
+    test::FillValues<float>(&weights_data,
+                            {1.0f, 2.0f, 3.0f, 4.0f, 0.1f, 0.2f, 0.3f, 0.4f});
+    Output weights_op =
+        Const(root.WithOpName("weights_op"), Input::Initializer(weights_data));
+
+    Output conv_op = DepthwiseConv2dNative(root.WithOpName("conv_op"), input_op,
+                                           weights_op, {1, 1, 1, 1}, "VALID");
+
+    Tensor mean_data(DT_FLOAT, TensorShape({4}));
+    test::FillValues<float>(&mean_data, {10.0f, 20.0f, 30.0f, 40.0f});
+    Output mean_op =
+        Const(root.WithOpName("mean_op"), Input::Initializer(mean_data));
+
+    Tensor variance_data(DT_FLOAT, TensorShape({4}));
+    test::FillValues<float>(&variance_data, {0.25f, 0.5f, 0.75f, 1.0f});
+    Output variance_op = Const(root.WithOpName("variance_op"),
+                               Input::Initializer(variance_data));
+
+    Tensor beta_data(DT_FLOAT, TensorShape({4}));
+    test::FillValues<float>(&beta_data, {0.1f, 0.6f, 1.1f, 1.6f});
+    Output beta_op =
+        Const(root.WithOpName("beta_op"), Input::Initializer(beta_data));
+
+    Tensor gamma_data(DT_FLOAT, TensorShape({4}));
+    test::FillValues<float>(&gamma_data, {1.0f, 2.0f, 3.0f, 4.0f});
+    Output gamma_op =
+        Const(root.WithOpName("gamma_op"), Input::Initializer(gamma_data));
+
+    GraphDef original_graph_def;
+    TF_ASSERT_OK(root.ToGraphDef(&original_graph_def));
+
+    NodeDef batch_norm_node;
+    batch_norm_node.set_op("FusedBatchNorm");
+    batch_norm_node.set_name("output");
+    AddNodeInput("conv_op", &batch_norm_node);
+    AddNodeInput("gamma_op", &batch_norm_node);
+    AddNodeInput("beta_op", &batch_norm_node);
+    AddNodeInput("mean_op", &batch_norm_node);
+    AddNodeInput("variance_op", &batch_norm_node);
+    SetNodeAttr("T", DT_FLOAT, &batch_norm_node);
+    SetNodeAttr("epsilon", 0.00001f, &batch_norm_node);
+    SetNodeAttr("is_training", false, &batch_norm_node);
+    *(original_graph_def.mutable_node()->Add()) = batch_norm_node;
+
+    std::unique_ptr<Session> original_session(NewSession(SessionOptions()));
+    TF_ASSERT_OK(original_session->Create(original_graph_def));
+    std::vector<Tensor> original_outputs;
+    TF_ASSERT_OK(original_session->Run({}, {"output"}, {}, &original_outputs));
+
+    GraphDef fused_graph_def;
+    TF_ASSERT_OK(FoldOldBatchNorms(original_graph_def, {{}, {"output"}},
+                                   &fused_graph_def));
+
+    std::unique_ptr<Session> fused_session(NewSession(SessionOptions()));
+    TF_ASSERT_OK(fused_session->Create(fused_graph_def));
+    std::vector<Tensor> fused_outputs;
+    TF_ASSERT_OK(fused_session->Run({}, {"output"}, {}, &fused_outputs));
+
+    test::ExpectTensorNear<float>(original_outputs[0], fused_outputs[0], 2e-5);
+
+    for (const NodeDef& node : fused_graph_def.node()) {
+      EXPECT_NE("FusedBatchNorm", node.op());
+    }
+  }
+
   void TestFoldFusedBatchNormsWithConcat(const bool split) {
     auto root = tensorflow::Scope::NewRootScope();
     using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
@@ -291,7 +446,7 @@ class FoldOldBatchNormsTest : public ::testing::Test {
     std::vector<Tensor> fused_outputs;
     TF_ASSERT_OK(fused_session->Run({}, {"output"}, {}, &fused_outputs));
 
-    test::ExpectTensorNear<float>(original_outputs[0], fused_outputs[0], 1e-5);
+    test::ExpectClose(original_outputs[0], fused_outputs[0]);
 
     for (const NodeDef& node : fused_graph_def.node()) {
       EXPECT_NE("FusedBatchNorm", node.op());
@@ -321,16 +476,17 @@ void TestFoldFusedBatchNormsWithBatchToSpace() {
 
   Tensor block_shape_data(DT_INT32, TensorShape({2}));
   test::FillValues<int32>(&block_shape_data, {1, 2});
-  Output block_shape_op =
-      Const(root.WithOpName("block_shape_op"), Input::Initializer(block_shape_data));
+  Output block_shape_op = Const(root.WithOpName("block_shape_op"),
+                                Input::Initializer(block_shape_data));
 
   Tensor crops_data(DT_INT32, TensorShape({2, 2}));
   test::FillValues<int32>(&crops_data, {0, 0, 0, 1});
   Output crops_op =
       Const(root.WithOpName("crops_op"), Input::Initializer(crops_data));
 
-  Output batch_to_space_op = BatchToSpaceND(root.WithOpName("batch_to_space_op"),
-                                            conv_op, block_shape_op, crops_data);
+  Output batch_to_space_op =
+      BatchToSpaceND(root.WithOpName("batch_to_space_op"), conv_op,
+                     block_shape_op, crops_data);
 
   Tensor mean_data(DT_FLOAT, TensorShape({2}));
   test::FillValues<float>(&mean_data, {10.0f, 20.0f});
@@ -339,8 +495,8 @@ void TestFoldFusedBatchNormsWithBatchToSpace() {
 
   Tensor variance_data(DT_FLOAT, TensorShape({2}));
   test::FillValues<float>(&variance_data, {0.25f, 0.5f});
-  Output variance_op = Const(root.WithOpName("variance_op"),
-                             Input::Initializer(variance_data));
+  Output variance_op =
+      Const(root.WithOpName("variance_op"), Input::Initializer(variance_data));
 
   Tensor beta_data(DT_FLOAT, TensorShape({2}));
   test::FillValues<float>(&beta_data, {0.1f, 0.6f});
@@ -410,5 +566,14 @@ TEST_F(FoldOldBatchNormsTest, TestFoldFusedBatchNormsWithBatchToSpace) {
   TestFoldFusedBatchNormsWithBatchToSpace();
 }
 
+TEST_F(FoldOldBatchNormsTest, TestFoldOldBatchNormsAfterDepthwiseConv2dNative) {
+  TestFoldOldBatchNormsAfterDepthwiseConv2dNative();
+}
+
+TEST_F(FoldOldBatchNormsTest,
+       TestFoldFusedBatchNormsAfterDepthwiseConv2dNative) {
+  TestFoldFusedBatchNormsAfterDepthwiseConv2dNative();
+}
+
 }  // namespace graph_transforms
 }  // namespace tensorflow
diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD
index 1186189844aa887ba011b532df3a73d89ffe52b8..86bd5107924ec4627b955264b179a06231ef8532 100644
--- a/tensorflow/tools/lib_package/BUILD
+++ b/tensorflow/tools/lib_package/BUILD
@@ -162,6 +162,7 @@ genrule(
         "//conditions:default": [],
     }) + if_cuda([
         "@cub_archive//:LICENSE.TXT",
+        "@local_config_nccl//:LICENSE",
     ]) + if_mkl([
         "//third_party/mkl:LICENSE",
         "//third_party/mkl_dnn:LICENSE",
@@ -232,6 +233,7 @@ genrule(
         "//conditions:default": [],
     }) + if_cuda([
         "@cub_archive//:LICENSE.TXT",
+        "@local_config_nccl//:LICENSE",
     ]) + if_mkl([
         "//third_party/mkl:LICENSE",
         "//third_party/mkl_dnn:LICENSE",
diff --git a/tensorflow/tools/optimization/BUILD b/tensorflow/tools/optimization/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..aa6c850b0b3abb3351e3225e0c3a66ab4272846e
--- /dev/null
+++ b/tensorflow/tools/optimization/BUILD
@@ -0,0 +1,52 @@
+# Description:
+#   Utilities that perform useful transformations on graphs
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cc_binary",
+    "tf_cuda_library",
+)
+
+exports_files(["LICENSE"])
+
+tf_cuda_library(
+    name = "optimization_pass_runner_lib",
+    srcs = ["optimization_pass_runner.cc"],
+    hdrs = ["optimization_pass_runner.h"],
+    deps = [
+        "//tensorflow/contrib:contrib_ops_op_lib",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_base",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:framework_lite",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:tensorflow",
+    ],
+)
+
+tf_cc_binary(
+    name = "gpu_optimization_pass_runner",
+    srcs = ["gpu_optimization_pass_runner_main.cc"],
+    deps = [
+        ":optimization_pass_runner_lib",
+        "//tensorflow/compiler/jit:xla_cpu_jit",
+        "//tensorflow/compiler/jit:xla_gpu_jit",
+        "//tensorflow/compiler/tf2xla:xla_compiler",
+        "//tensorflow/contrib:contrib_ops_op_lib",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:core_cpu_base",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:framework_lite",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:tensorflow",
+        "@com_google_absl//absl/strings",
+    ],
+)
diff --git a/tensorflow/tools/optimization/gpu_optimization_pass_runner_main.cc b/tensorflow/tools/optimization/gpu_optimization_pass_runner_main.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0d9f26cd5a42f7315cc1d074e8b6ec19caa75f30
--- /dev/null
+++ b/tensorflow/tools/optimization/gpu_optimization_pass_runner_main.cc
@@ -0,0 +1,60 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// This file creates a binary that can run any registered optimization pass.
+// ./xla_gpu_opt  --input_file_path=/tmp/input.pbtxt
+// --output_file_path=/tmp/output.pbtxt
+// --optimization_pass=NameOfGraphOptimizationPass
+
+#include "absl/strings/str_cat.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+#include "tensorflow/tools/optimization/optimization_pass_runner.h"
+
+int main(int argc, char** argv) {
+  tensorflow::OptimizationPassRunner runner;
+  // Add fake devices for CPU, GPU, and XLA to ensure we have all devices we
+  // need.
+  // Most machines in our servers currently use 8 gpus. There is nothing special
+  // about this number and it can be decreased or increased to test other
+  // configurations.
+  int num_gpus_per_machine = 8;
+  for (int i = 0; i < num_gpus_per_machine; i++) {
+    TF_CHECK_OK(runner.AddDevice(
+        absl::StrCat("/job:localhost/replica:0/task:0/device:CPU:", i),
+        tensorflow::DEVICE_CPU));
+    TF_CHECK_OK(runner.AddDevice(
+        absl::StrCat("/job:localhost/replica:0/task:0/device:GPU:", i),
+        tensorflow::DEVICE_GPU));
+    TF_CHECK_OK(runner.AddDevice(
+        absl::StrCat("/job:localhost/replica:0/task:0/device:XLA_CPU:", i),
+        tensorflow::DEVICE_XLA_CPU));
+    TF_CHECK_OK(runner.AddDevice(
+        absl::StrCat("/job:localhost/replica:0/task:0/device:XLA_GPU:", i),
+        tensorflow::DEVICE_XLA_GPU));
+    TF_CHECK_OK(runner.AddDevice(
+        absl::StrCat("/job:localhost/replica:0/task:0/device:CPU_XLA_JIT:", i),
+        tensorflow::DEVICE_CPU_XLA_JIT));
+    TF_CHECK_OK(runner.AddDevice(
+        absl::StrCat("/job:localhost/replica:0/task:0/device:GPU_XLA_JIT:", i),
+        tensorflow::DEVICE_GPU_XLA_JIT));
+  }
+  // This binary is used to test TF:XLA behavior, so turn on auto_jit.
+  TF_CHECK_OK(runner.SetJitLevel(tensorflow::OptimizerOptions::GlobalJitLevel::
+                                     OptimizerOptions_GlobalJitLevel_ON_2));
+  // Run the actual "main" function.
+  TF_CHECK_OK(runner.RunMain(argc, argv));
+}
diff --git a/tensorflow/tools/optimization/optimization_pass_runner.cc b/tensorflow/tools/optimization/optimization_pass_runner.cc
new file mode 100644
index 0000000000000000000000000000000000000000..231ff083813870819c23729e4308e0215661afcd
--- /dev/null
+++ b/tensorflow/tools/optimization/optimization_pass_runner.cc
@@ -0,0 +1,167 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// This file creates a library that can run any registered optimization pass.
+// The binary that uses this will be run in a form similar to:
+// ./optimization_pass_runner  --input_file_path=/tmp/input.pbtxt
+// --output_file_path=/tmp/output.pbtxt
+// --optimization_pass=NameOfGraphOptimizationPass
+#include "tensorflow/tools/optimization/optimization_pass_runner.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_set.h"
+#include "tensorflow/core/common_runtime/optimization_registry.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/device_attributes.pb.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/function.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+#include "tensorflow/core/public/session_options.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+namespace tensorflow {
+
+namespace {
+// A fake device used to populate a DeviceSet.
+class FakeDevice : public Device {
+ private:
+  explicit FakeDevice(const DeviceAttributes& device_attributes)
+      : Device(nullptr, device_attributes) {}
+
+ public:
+  Status Sync() override;
+  static std::unique_ptr<Device> Make(const string& name, const string& type);
+};
+
+Status FakeDevice::Sync() {
+  return errors::Unimplemented("FakeDevice::Sync()");
+}
+
+std::unique_ptr<Device> FakeDevice::Make(const string& name,
+                                         const string& type) {
+  DeviceAttributes device_attributes;
+  device_attributes.set_name(name);
+  device_attributes.set_device_type(DeviceType(type).type());
+  return std::unique_ptr<Device>(new FakeDevice(device_attributes));
+}
+}  // namespace
+
+Status OptimizationPassRunner::RunMain(int argc, char** argv) {
+  string input_file_path;
+  string output_file_path;
+  string optimization_pass;
+
+  const std::vector<Flag> flag_list = {
+      Flag("input_file_path", &input_file_path, "Location of the input graph."),
+      Flag("output_file_path", &output_file_path,
+           "Location to write the resulting graph."),
+      // For now only a single optimization pass can be run.
+      Flag("optimization_pass", &optimization_pass,
+           "Which optimization pass to run."),
+  };
+  if (!Flags::Parse(&argc, argv, flag_list)) {
+    return errors::FailedPrecondition("Invalid flags passed");
+  }
+  port::InitMain(argv[0], &argc, &argv);
+
+  if (input_file_path.empty()) {
+    return errors::FailedPrecondition("input_file_path is a required flag.");
+  }
+  if (output_file_path.empty()) {
+    return errors::FailedPrecondition("output_file_path is a required flag.");
+  }
+  if (optimization_pass.empty()) {
+    return errors::FailedPrecondition("optimization_pass is a required flag.");
+  }
+
+  // Turn on XLA Auto-Jit.
+  auto session_options = absl::make_unique<SessionOptions>();
+  session_options->config.mutable_graph_options()
+      ->mutable_optimizer_options()
+      ->set_global_jit_level(jit_level_);
+  FunctionDefLibrary flib;
+  std::unique_ptr<Graph> graph = absl::make_unique<Graph>(OpRegistry::Global());
+
+  GraphOptimizationPassOptions options;
+  options.session_options = session_options.release();
+  options.graph = &graph;
+  options.flib_def =
+      new FunctionLibraryDefinition((*options.graph)->op_registry(), flib);
+
+  // Grab the data
+  GraphDef graphdef;
+  GraphConstructorOptions graph_opts;
+  graph_opts.expect_device_spec = true;
+  graph_opts.allow_internal_ops = true;
+  TF_RETURN_IF_ERROR(ReadTextProto(Env::Default(), input_file_path, &graphdef));
+  TF_RETURN_IF_ERROR(
+      ConvertGraphDefToGraph(graph_opts, graphdef, options.graph->get()));
+
+  // Add all devices that were previously configured with AddDevice.
+  DeviceSet device_set;
+  for (auto& device : devices_) {
+    device_set.AddDevice(device.get());
+  }
+  options.device_set = &device_set;
+
+  Status result = errors::NotFound(
+      "An OptimizationPass was not found with the desired name.");
+
+  // Run the optimization pass specified by the command line flag.
+  for (const auto& groups_and_passes :
+       OptimizationPassRegistry::Global()->groups()) {
+    for (const auto& phase_and_passes : groups_and_passes.second) {
+      for (const auto& pass : phase_and_passes.second) {
+        if (pass->name() == optimization_pass) {
+          result = pass->Run(options);
+        }
+      }
+    }
+  }
+
+  TF_RETURN_IF_ERROR(result);
+
+  // Write out the result.
+  options.graph->get()->ToGraphDef(&graphdef);
+  TF_RETURN_IF_ERROR(
+      WriteTextProto(Env::Default(), output_file_path, graphdef));
+  return Status::OK();
+}
+
+Status OptimizationPassRunner::SetJitLevel(
+    OptimizerOptions::GlobalJitLevel jit_level) {
+  jit_level_ = jit_level;
+  return Status::OK();
+}
+
+Status OptimizationPassRunner::AddDevice(const string& name,
+                                         const string& type) {
+  devices_.push_back(FakeDevice::Make(name, type));
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/tools/optimization/optimization_pass_runner.h b/tensorflow/tools/optimization/optimization_pass_runner.h
new file mode 100644
index 0000000000000000000000000000000000000000..3b26f64bcfb86e5e7fd6b6fe31b20cf75f931da1
--- /dev/null
+++ b/tensorflow/tools/optimization/optimization_pass_runner.h
@@ -0,0 +1,61 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_TOOLS_OPTIMIZATION_OPTIMIZATION_PASS_RUNNER_H_
+#define TENSORFLOW_TOOLS_OPTIMIZATION_OPTIMIZATION_PASS_RUNNER_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/optimization_registry.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+
+namespace tensorflow {
+
+// OptimizationPassRunner can be initialized, populated with devices, then run
+// to test individual Tensorflow Optimization passes.
+class OptimizationPassRunner {
+ public:
+  explicit OptimizationPassRunner()
+      : jit_level_(OptimizerOptions::GlobalJitLevel::
+                       OptimizerOptions_GlobalJitLevel_DEFAULT) {}
+
+  // Add a fake device to the (initially empty) DeviceSet used for optimization.
+  // Names are of the form: "/job:localhost/replica:0/task:0/device:CPU:0"
+  Status AddDevice(const string& name, const string& type);
+
+  // Increasing the Jit level will cause XLA to compile parts of the tensorflow
+  // graph that it is able to.
+  Status SetJitLevel(OptimizerOptions::GlobalJitLevel jit_level);
+
+  // This can be called after adding devices and setting the jit level to parse
+  // command line flags and run the specified job. All 3 flags are required:
+  // input_file_path, output_file_path, optimization_pass.
+  //
+  // If this library becomes heavily used, the caller should be responsible for
+  // parsing any command line flags desired rather than this Method handling the
+  // work of a main() function.
+  Status RunMain(int argc, char** argv);
+
+ private:
+  OptimizerOptions::GlobalJitLevel jit_level_;
+  std::vector<std::unique_ptr<Device>> devices_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_TOOLS_OPTIMIZATION_OPTIMIZATION_PASS_RUNNER_H_
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index baacb8723961d0a78b29338f1c4f212e46573b2c..90dfca2b444fd48a340716a43176c1308810ce26 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -59,36 +59,7 @@ COMMON_PIP_DEPS = [
     "setup.py",
     ":included_headers",
     "//tensorflow:tensorflow_py",
-    "//tensorflow/contrib/autograph:autograph",
-    "//tensorflow/contrib/boosted_trees:boosted_trees_pip",
-    "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip",
-    "//tensorflow/contrib/compiler:xla",
-    "//tensorflow/contrib/constrained_optimization:constrained_optimization_pip",
-    "//tensorflow/contrib/eager/python/examples:examples_pip",
-    "//tensorflow/contrib/eager/python:evaluator",
-    "//tensorflow/contrib/gan:gan",
-    "//tensorflow/contrib/graph_editor:graph_editor_pip",
-    "//tensorflow/contrib/keras:keras",
-    "//tensorflow/contrib/labeled_tensor:labeled_tensor_pip",
-    "//tensorflow/contrib/nn:nn_py",
-    "//tensorflow/contrib/predictor:predictor_pip",
-    "//tensorflow/contrib/proto:proto",
-    "//tensorflow/contrib/receptive_field:receptive_field_pip",
-    "//tensorflow/contrib/rate:rate",
-    "//tensorflow/contrib/rpc:rpc_pip",
-    "//tensorflow/contrib/session_bundle:session_bundle_pip",
-    "//tensorflow/contrib/signal:signal_py",
-    "//tensorflow/contrib/slim:slim",
-    "//tensorflow/contrib/slim/python/slim/data:data_pip",
-    "//tensorflow/contrib/slim/python/slim/nets:nets_pip",
-    "//tensorflow/contrib/specs:specs",
-    "//tensorflow/contrib/summary:summary_test_util",
-    "//tensorflow/contrib/tensor_forest:init_py",
-    "//tensorflow/contrib/tensor_forest/hybrid:hybrid_pip",
-    "//tensorflow/contrib/timeseries:timeseries_pip",
-    "//tensorflow/contrib/tpu",
-    "//tensorflow/examples/tutorials/mnist:package",
-    "//tensorflow/lite/python:interpreter_test_data",
+    "//tensorflow/lite/python/testdata:interpreter_test_data",
     "//tensorflow/lite/python:tflite_convert",
     "//tensorflow/lite/toco/python:toco_from_protos",
     # "//tensorflow/python/autograph/converters:converters",
@@ -106,11 +77,14 @@ COMMON_PIP_DEPS = [
     "//tensorflow/python:meta_graph_testdata",
     "//tensorflow/python:spectral_ops_test_util",
     "//tensorflow/python:util_example_parser_configuration",
+    "//tensorflow/python/data/benchmarks:benchmark_base",
     "//tensorflow/python/data/experimental/kernel_tests/serialization:dataset_serialization_test_base",
     "//tensorflow/python/data/experimental/kernel_tests:stats_dataset_test_base",
+    "//tensorflow/python/data/kernel_tests:filter_test_base",
     "//tensorflow/python/data/kernel_tests:test_base",
     "//tensorflow/python/debug:debug_pip",
     "//tensorflow/python/eager:eager_pip",
+    "//tensorflow/python/kernel_tests/random:util",
     "//tensorflow/python/kernel_tests/signal:test_util",
     "//tensorflow/python/kernel_tests/testdata:self_adjoint_eig_op_test_files",
     "//tensorflow/python/ops/ragged:ragged_test_util",
@@ -122,13 +96,49 @@ COMMON_PIP_DEPS = [
     "//tensorflow/tools/dist_test/server:grpc_tensorflow_server",
 ]
 
+COMMON_PIP_DEPS_V1 = COMMON_PIP_DEPS + [
+    "//tensorflow/contrib/autograph:autograph",
+    "//tensorflow/contrib/boosted_trees:boosted_trees_pip",
+    "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip",
+    "//tensorflow/contrib/compiler:xla",
+    "//tensorflow/contrib/constrained_optimization:constrained_optimization_pip",
+    "//tensorflow/contrib/distribute/python:distribute_test_lib_pip",
+    "//tensorflow/contrib/eager/python/examples:examples_pip",
+    "//tensorflow/contrib/eager/python:evaluator",
+    "//tensorflow/contrib/gan:gan",
+    "//tensorflow/contrib/graph_editor:graph_editor_pip",
+    "//tensorflow/contrib/keras:keras",
+    "//tensorflow/contrib/labeled_tensor:labeled_tensor_pip",
+    "//tensorflow/contrib/nn:nn_py",
+    "//tensorflow/contrib/predictor:predictor_pip",
+    "//tensorflow/contrib/proto:proto",
+    "//tensorflow/contrib/receptive_field:receptive_field_pip",
+    "//tensorflow/contrib/rate:rate",
+    "//tensorflow/contrib/rpc:rpc_pip",
+    "//tensorflow/contrib/session_bundle:session_bundle_pip",
+    "//tensorflow/contrib/signal:signal_py",
+    "//tensorflow/contrib/slim:slim",
+    "//tensorflow/contrib/slim/python/slim/data:data_pip",
+    "//tensorflow/contrib/slim/python/slim/nets:nets_pip",
+    "//tensorflow/contrib/specs:specs",
+    "//tensorflow/contrib/summary:summary_test_util",
+    "//tensorflow/contrib/tensor_forest:init_py",
+    "//tensorflow/contrib/tensor_forest/hybrid:hybrid_pip",
+    "//tensorflow/contrib/timeseries:timeseries_pip",
+    "//tensorflow/contrib/tpu",
+    "//tensorflow/examples/tutorials/mnist:package",
+]
+
 # On Windows, python binary is a zip file of runfiles tree.
 # Add everything to its data dependency for generating a runfiles tree
 # for building the pip package on Windows.
 py_binary(
     name = "simple_console_for_windows",
     srcs = ["simple_console_for_windows.py"],
-    data = COMMON_PIP_DEPS + ["//tensorflow/python:pywrap_tensorflow_import_lib_file"],
+    data = select({
+        "//conditions:default": COMMON_PIP_DEPS_V1,
+        "//tensorflow:api_version_2": COMMON_PIP_DEPS,
+    }) + ["//tensorflow/python:pywrap_tensorflow_import_lib_file"],
     srcs_version = "PY2AND3",
     deps = ["//tensorflow:tensorflow_py"],
 )
@@ -140,7 +150,11 @@ filegroup(
         "//third_party/eigen3:LICENSE",
         "//third_party/fft2d:LICENSE",
         "//third_party/hadoop:LICENSE.txt",
+        "@absl_py//absl:LICENSE",
+        "@absl_py//absl/logging:LICENSE",
         "@absl_py//absl/flags:LICENSE",
+        "@absl_py//absl/testing:LICENSE",
+        "@absl_py//absl/third_party/unittest3_backport:LICENSE",
         "@arm_neon_2_x86_sse//:LICENSE",
         "@astor_archive//:LICENSE",
         "@boringssl//:LICENSE",
@@ -149,6 +163,7 @@ filegroup(
         "@curl//:COPYING",
         "@double_conversion//:LICENSE",
         "@eigen_archive//:COPYING.MPL2",
+        "@enum34_archive//:LICENSE",
         "@farmhash_archive//:COPYING",
         "@fft2d//:fft/readme.txt",
         "@flatbuffers//:LICENSE.txt",
@@ -163,6 +178,7 @@ filegroup(
         "@local_config_sycl//sycl:LICENSE.text",
         "@nasm//:LICENSE",
         "@nsync//:LICENSE",
+        "@pasta//:LICENSE",
         "@pcre//:LICENCE",
         "@png_archive//:LICENSE",
         "@protobuf_archive//:LICENSE",
@@ -229,13 +245,17 @@ sh_binary(
     name = "build_pip_package",
     srcs = ["build_pip_package.sh"],
     data = select({
-        "//tensorflow:windows": [
-            ":simple_console_for_windows",
-        ],
-        "//conditions:default": COMMON_PIP_DEPS + [
-            ":simple_console",
-        ],
-    }) + if_mkl_ml(["//third_party/mkl:intel_binary_blob"]),
+               "//tensorflow:api_version_2": COMMON_PIP_DEPS,
+               "//conditions:default": COMMON_PIP_DEPS_V1,
+           }) +
+           select({
+               "//tensorflow:windows": [
+                   ":simple_console_for_windows",
+               ],
+               "//conditions:default": [
+                   ":simple_console",
+               ],
+           }) + if_mkl_ml(["//third_party/mkl:intel_binary_blob"]),
 )
 
 # A genrule for generating a marker file for the pip package on Windows
diff --git a/tensorflow/tools/pip_package/MANIFEST.in b/tensorflow/tools/pip_package/MANIFEST.in
index 272ff4735c34b319589bd9302fcdb5cd91b6d1ec..c304e8cf6ebe1739c1cc9011dafd8f89cae9baac 100644
--- a/tensorflow/tools/pip_package/MANIFEST.in
+++ b/tensorflow/tools/pip_package/MANIFEST.in
@@ -6,7 +6,6 @@ recursive-include * *.so
 recursive-include * *.dll
 recursive-include * *.lib
 recursive-include * *.csv
-recursive-include tensorflow/aux-bin *
 recursive-include tensorflow/include/tensorflow *.h
 recursive-include tensorflow/include/Eigen *
 recursive-include tensorflow/include/external *
diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh
index 439b5428b3b7bff651689e08e783bf7875f16319..27815491d23a6ec294f08b1b5eee5ed2d11e9766 100755
--- a/tensorflow/tools/pip_package/build_pip_package.sh
+++ b/tensorflow/tools/pip_package/build_pip_package.sh
@@ -118,9 +118,6 @@ function prepare_src() {
         fi
       fi
     fi
-    mkdir "${TMPDIR}/tensorflow/aux-bin"
-    # Install toco as a binary in aux-bin.
-    cp bazel-bin/tensorflow/lite/python/tflite_convert ${TMPDIR}/tensorflow/aux-bin/
   fi
 
   # protobuf pip package doesn't ship with header files. Copy the headers
diff --git a/tensorflow/tools/pip_package/pip_smoke_test.py b/tensorflow/tools/pip_package/pip_smoke_test.py
index ff821b864300c1eeb2f9d290ae47a25ce87a0884..3bcc4fc81bdbefcc3c1e5481d5a1c18ee8f15768 100644
--- a/tensorflow/tools/pip_package/pip_smoke_test.py
+++ b/tensorflow/tools/pip_package/pip_smoke_test.py
@@ -30,14 +30,20 @@ os.chdir(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../..")))
 PIP_PACKAGE_QUERY_EXPRESSION = (
     "deps(//tensorflow/tools/pip_package:build_pip_package)")
 
+# List of file paths containing BUILD files that should not be included for the
+# pip smoke test.
+BUILD_BLACKLIST = [
+    "tensorflow/lite/examples/android",
+    "tensorflow/lite/experimental/objc",
+    "tensorflow/lite/experimental/swift",
+]
 
 def GetBuild(dir_base):
   """Get the list of BUILD file all targets recursively startind at dir_base."""
   items = []
   for root, _, files in os.walk(dir_base):
     for name in files:
-      if (name == "BUILD" and
-          root.find("tensorflow/lite/examples/android") == -1):
+      if (name == "BUILD" and root not in BUILD_BLACKLIST):
         items.append("//" + root + ":all")
   return items
 
@@ -67,9 +73,9 @@ def BuildPyTestDependencies():
 
 PYTHON_TARGETS, PY_TEST_QUERY_EXPRESSION = BuildPyTestDependencies()
 
-# Hard-coded blacklist of files if not included in pip package
 # TODO(amitpatankar): Clean up blacklist.
-BLACKLIST = [
+# List of dependencies that should not included in the pip package.
+DEPENDENCY_BLACKLIST = [
     "//tensorflow/python:extra_py_tests_deps",
     "//tensorflow/cc/saved_model:saved_model_half_plus_two",
     "//tensorflow:no_tensorflow_py_deps",
@@ -82,17 +88,18 @@ BLACKLIST = [
     "//tensorflow/core/kernels/cloud:bigquery_reader_ops",
     "//tensorflow/python/feature_column:vocabulary_testdata",
     "//tensorflow/python:framework/test_file_system.so",
-    # contrib
-    "//tensorflow/contrib/session_bundle:session_bundle_half_plus_two",
-    "//tensorflow/contrib/keras:testing_utils",
-    "//tensorflow/lite/experimental/examples/lstm:tflite_lstm",
-    "//tensorflow/lite/experimental/examples/lstm:tflite_lstm.py",
+    # lite
+    "//tensorflow/lite/experimental/examples/lstm:rnn_cell",
+    "//tensorflow/lite/experimental/examples/lstm:rnn_cell.py",
     "//tensorflow/lite/experimental/examples/lstm:unidirectional_sequence_lstm_test",  # pylint:disable=line-too-long
     "//tensorflow/lite/experimental/examples/lstm:unidirectional_sequence_lstm_test.py",  # pylint:disable=line-too-long
     "//tensorflow/lite/python:interpreter",
     "//tensorflow/lite/python:interpreter_test",
     "//tensorflow/lite/python:interpreter.py",
     "//tensorflow/lite/python:interpreter_test.py",
+    # contrib
+    "//tensorflow/contrib/session_bundle:session_bundle_half_plus_two",
+    "//tensorflow/contrib/keras:testing_utils",
     "//tensorflow/contrib/ffmpeg:test_data",
     "//tensorflow/contrib/fused_conv:fused_conv2d_bias_activation_op_test_base",
     "//tensorflow/contrib/hadoop:test_data",
@@ -102,6 +109,7 @@ BLACKLIST = [
     "//tensorflow/contrib/framework:checkpoint_ops_testdata",
     "//tensorflow/contrib/bayesflow:reinforce_simple_example",
     "//tensorflow/contrib/bayesflow:examples/reinforce_simple/reinforce_simple_example.py",  # pylint:disable=line-too-long
+    "//tensorflow/contrib/saved_model:reader",  # Not present in v2
     "//tensorflow/contrib/timeseries/examples:predict",
     "//tensorflow/contrib/timeseries/examples:multivariate",
     "//tensorflow/contrib/timeseries/examples:known_anomaly",
@@ -148,8 +156,8 @@ def main():
   # File extensions and endings to ignore
   ignore_extensions = ["_test", "_test.py", "_test_gpu", "_test_gpu.py"]
 
-  ignored_files = 0
-  blacklisted_files = len(BLACKLIST)
+  ignored_files_count = 0
+  blacklisted_dependencies_count = len(DEPENDENCY_BLACKLIST)
   # Compare dependencies
   for dependency in tf_py_test_dependencies_list:
     if dependency and dependency.startswith("//tensorflow"):
@@ -157,16 +165,16 @@ def main():
       # Ignore extensions
       if any(dependency.endswith(ext) for ext in ignore_extensions):
         ignore = True
-        ignored_files += 1
+        ignored_files_count += 1
 
-      # Check if the dependency is in the pip package, the blacklist, or
-      # should be ignored because of its file extension
+      # Check if the dependency is in the pip package, the dependency blacklist,
+      # or should be ignored because of its file extension.
       if not (ignore or dependency in pip_package_dependencies_list or
-              dependency in BLACKLIST):
+              dependency in DEPENDENCY_BLACKLIST):
         missing_dependencies.append(dependency)
 
-  print("Ignored files: %d" % ignored_files)
-  print("Blacklisted files: %d" % blacklisted_files)
+  print("Ignored files count: %d" % ignored_files_count)
+  print("Blacklisted dependencies count: %d" % blacklisted_dependencies_count)
   if missing_dependencies:
     print("Missing the following dependencies from pip_packages:")
     for missing_dependency in missing_dependencies:
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 85c913f158863c5ff3718ae3f305829e15237b22..83081a1ff3868795a8e2be4b33e654056b025f0e 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -45,19 +45,20 @@ DOCLINES = __doc__.split('\n')
 # This version string is semver compatible, but incompatible with pip.
 # For pip, we will remove all '-' characters from this string, and use the
 # result for pip.
-_VERSION = '1.12.0-rc0'
+_VERSION = '1.12.0'
 
 REQUIRED_PACKAGES = [
     'absl-py >= 0.1.6',
     'astor >= 0.6.0',
     'gast >= 0.2.0',
+    'google_pasta >= 0.1.2',
     'keras_applications >= 1.0.6',
     'keras_preprocessing >= 1.0.5',
-    'numpy >= 1.13.3',
+    'numpy >= 1.14.5, < 2.0',
     'six >= 1.10.0',
     'protobuf >= 3.6.1',
     'tensorboard >= 1.12.0, < 1.13.0',
-    'tensorflow_estimator >= 1.10.0',
+    'tensorflow_estimator >= 1.13.0rc0, < 1.14.0rc0',
     'termcolor >= 1.1.0',
 ]
 
@@ -87,7 +88,9 @@ if 'tf_nightly' in project_name:
   for i, pkg in enumerate(REQUIRED_PACKAGES):
     if 'tensorboard' in pkg:
       REQUIRED_PACKAGES[i] = 'tb-nightly >= 1.13.0a0, < 1.14.0a0'
-    if 'tensorflow_estimator' in pkg:
+    elif 'tensorflow_estimator' in pkg and '2.0' in project_name:
+      REQUIRED_PACKAGES[i] = 'tensorflow-estimator-2.0-preview'
+    elif 'tensorflow_estimator' in pkg:
       REQUIRED_PACKAGES[i] = 'tf-estimator-nightly'
 
 # weakref.finalize and enum were introduced in Python 3.4
@@ -246,7 +249,7 @@ setup(
     url='https://www.tensorflow.org/',
     download_url='https://github.com/tensorflow/tensorflow/tags',
     author='Google Inc.',
-    author_email='opensource@google.com',
+    author_email='packages@tensorflow.org',
     # Contained modules and scripts.
     packages=find_packages(),
     entry_points={
@@ -281,6 +284,7 @@ setup(
         'Programming Language :: Python :: 3.4',
         'Programming Language :: Python :: 3.5',
         'Programming Language :: Python :: 3.6',
+        'Programming Language :: Python :: 3.7',
         'Topic :: Scientific/Engineering',
         'Topic :: Scientific/Engineering :: Mathematics',
         'Topic :: Scientific/Engineering :: Artificial Intelligence',
diff --git a/tensorflow/tools/test/BUILD b/tensorflow/tools/test/BUILD
index 4b2026b9472b651f8e0571155dab8952d20aa8b2..ef12226ec001cc2ddcb09980fcf38a0aeb794742 100644
--- a/tensorflow/tools/test/BUILD
+++ b/tensorflow/tools/test/BUILD
@@ -57,6 +57,14 @@ py_binary(
     srcs = ["run_and_gather_logs.py"],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
+    deps = [":run_and_gather_logs_main_lib"],
+)
+
+py_library(
+    name = "run_and_gather_logs_main_lib",
+    srcs = ["run_and_gather_logs.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
     deps = [
         ":run_and_gather_logs_lib",
         "//tensorflow/core:protos_all_py",
diff --git a/tensorflow/tools/test/performance.bzl b/tensorflow/tools/test/performance.bzl
index 3486871080c78dc7a1cc201ea2a4d45ebc342758..97861110346b62659ac97da95727250abaf3b928 100644
--- a/tensorflow/tools/test/performance.bzl
+++ b/tensorflow/tools/test/performance.bzl
@@ -4,60 +4,66 @@ load("//tensorflow:tensorflow.bzl", "tf_py_test")
 
 # Create a benchmark test target of a TensorFlow C++ test (tf_cc_*_test)
 def tf_cc_logged_benchmark(
-    name=None,
-    target=None,
-    benchmarks="..",
-    tags=[],
-    test_log_output_prefix="",
-    benchmark_type="cpp_microbenchmark"):
-  if not name:
-    fail("Must provide a name")
-  if not target:
-    fail("Must provide a target")
-  if (not ":" in target
-      or not target.startswith("//")
-      or target.endswith(":all")
-      or target.endswith(".")):
-    fail(" ".join(("Target must be a single well-defined test, e.g.,",
-                   "//path/to:test. Received: %s" % target)))
+        name = None,
+        target = None,
+        benchmarks = "..",
+        tags = [],
+        test_log_output_prefix = "",
+        benchmark_type = "cpp_microbenchmark"):
+    if not name:
+        fail("Must provide a name")
+    if not target:
+        fail("Must provide a target")
+    if (not ":" in target or
+        not target.startswith("//") or
+        target.endswith(":all") or
+        target.endswith(".")):
+        fail(" ".join((
+            "Target must be a single well-defined test, e.g.,",
+            "//path/to:test. Received: %s" % target,
+        )))
 
-  all_tags = (
-    depset(tags) + depset(
-      ["benchmark-test", "local", "manual", "regression-test"])).to_list()
+    all_tags = (
+        depset(tags) + depset(
+            ["benchmark-test", "local", "manual", "regression-test"],
+        )
+    ).to_list()
 
-  tf_py_test(
-      name = name,
-      tags = all_tags,
-      size = "large",
-      srcs = ["//tensorflow/tools/test:run_and_gather_logs"],
-      args = [
-          "--name=//%s:%s" % (native.package_name(), name),
-          "--test_name=" + target,
-          "--test_args=--benchmarks=%s" % benchmarks,
-          "--benchmark_type=%s" % benchmark_type,
-      ],
-      data = [
-        target,
-      ],
-      main = "run_and_gather_logs.py",
-      additional_deps = [
-          "//tensorflow/tools/test:run_and_gather_logs"
-      ])
+    tf_py_test(
+        name = name,
+        tags = all_tags,
+        size = "large",
+        srcs = ["//tensorflow/tools/test:run_and_gather_logs"],
+        args = [
+            "--name=//%s:%s" % (native.package_name(), name),
+            "--test_name=" + target,
+            "--test_args=--benchmarks=%s" % benchmarks,
+            "--benchmark_type=%s" % benchmark_type,
+        ],
+        data = [
+            target,
+        ],
+        main = "run_and_gather_logs.py",
+        additional_deps = [
+            "//tensorflow/tools/test:run_and_gather_logs",
+        ],
+    )
 
 # Create a benchmark test target of a TensorFlow python test (*py_tests)
 def tf_py_logged_benchmark(
-    name=None,
-    target=None,
-    benchmarks="..",
-    tags=[],
-    test_log_output_prefix=""):
-  # For now generating a py benchmark is the same as generating a C++
-  # benchmark target. In the future this may change, so we have
-  # two macros just in case
-  tf_cc_logged_benchmark(
-    name=name,
-    target=target,
-    benchmarks=benchmarks,
-    tags=tags,
-    test_log_output_prefix=test_log_output_prefix,
-    benchmark_type="python_benchmark")
+        name = None,
+        target = None,
+        benchmarks = "..",
+        tags = [],
+        test_log_output_prefix = ""):
+    # For now generating a py benchmark is the same as generating a C++
+    # benchmark target. In the future this may change, so we have
+    # two macros just in case
+    tf_cc_logged_benchmark(
+        name = name,
+        target = target,
+        benchmarks = benchmarks,
+        tags = tags,
+        test_log_output_prefix = test_log_output_prefix,
+        benchmark_type = "python_benchmark",
+    )
diff --git a/tensorflow/version_check.bzl b/tensorflow/version_check.bzl
index 79e721dab422c1449214acbe5fc1643edc3a9db0..74feaa19ff1523375249adbb7397c3d082d9f96c 100644
--- a/tensorflow/version_check.bzl
+++ b/tensorflow/version_check.bzl
@@ -1,48 +1,52 @@
 """ Helpers to check minimum version of bazel."""
 
 def _extract_version_number(bazel_version):
-  """Extracts the semantic version number from a version string
+    """Extracts the semantic version number from a version string
 
-  Args:
-    bazel_version: the version string that begins with the semantic version
-      e.g. "1.2.3rc1 abc1234" where "abc1234" is a commit hash.
+    Args:
+      bazel_version: the version string that begins with the semantic version
+        e.g. "1.2.3rc1 abc1234" where "abc1234" is a commit hash.
 
-  Returns:
-    The semantic version string, like "1.2.3".
-  """
-  for i in range(len(bazel_version)):
-    c = bazel_version[i]
-    if not (c.isdigit() or c == "."):
-      return bazel_version[:i]
-  return bazel_version
+    Returns:
+      The semantic version string, like "1.2.3".
+    """
+    for i in range(len(bazel_version)):
+        c = bazel_version[i]
+        if not (c.isdigit() or c == "."):
+            return bazel_version[:i]
+    return bazel_version
 
 # Parse the bazel version string from `native.bazel_version`.
 # e.g.
 # "0.10.0rc1 abc123d" => (0, 10, 0)
 # "0.3.0" => (0, 3, 0)
 def _parse_bazel_version(bazel_version):
-  """Parses a version string into a 3-tuple of ints
+    """Parses a version string into a 3-tuple of ints
 
-  int tuples can be compared directly using binary operators (<, >).
+    int tuples can be compared directly using binary operators (<, >).
 
-  Args:
-    bazel_version: the Bazel version string
+    Args:
+      bazel_version: the Bazel version string
 
-  Returns:
-    An int 3-tuple of a (major, minor, patch) version.
-  """
+    Returns:
+      An int 3-tuple of a (major, minor, patch) version.
+    """
 
-  version = _extract_version_number(bazel_version)
-  return tuple([int(n) for n in version.split(".")])
+    version = _extract_version_number(bazel_version)
+    return tuple([int(n) for n in version.split(".")])
 
 def check_bazel_version_at_least(minimum_bazel_version):
-  if "bazel_version" not in dir(native):
-    fail("\nCurrent Bazel version is lower than 0.2.1, expected at least %s\n" % minimum_bazel_version)
-  elif not native.bazel_version:
-    print("\nCurrent Bazel is not a release version, cannot check for compatibility.")
-    print("Make sure that you are running at least Bazel %s.\n" % minimum_bazel_version)
-    return
-
-  if _parse_bazel_version(native.bazel_version) < _parse_bazel_version(minimum_bazel_version):
-    fail("\nCurrent Bazel version is {}, expected at least {}\n".format(
-        native.bazel_version, minimum_bazel_version))
+    if "bazel_version" not in dir(native):
+        fail("\nCurrent Bazel version is lower than 0.2.1, expected at least %s\n" % minimum_bazel_version)
+    elif not native.bazel_version:
+        print("\nCurrent Bazel is not a release version, cannot check for compatibility.")
+        print("Make sure that you are running at least Bazel %s.\n" % minimum_bazel_version)
+        return
+
+    if _parse_bazel_version(native.bazel_version) < _parse_bazel_version(minimum_bazel_version):
+        fail("\nCurrent Bazel version is {}, expected at least {}\n".format(
+            native.bazel_version,
+            minimum_bazel_version,
+        ))
+
+parse_bazel_version = _parse_bazel_version
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 60dcca3207f88f4bba9e0d11c263f657d44ed1b5..937962fa22d781de5b69310c99f91ff78a5d00c1 100755
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -7,9 +7,9 @@ load("//third_party:nccl/nccl_configure.bzl", "nccl_configure")
 load("//third_party/mkl:build_defs.bzl", "mkl_repository")
 load("//third_party/git:git_configure.bzl", "git_configure")
 load("//third_party/py:python_configure.bzl", "python_configure")
-
 load("//third_party/sycl:sycl_configure.bzl", "sycl_configure")
 load("//third_party/systemlibs:syslibs_configure.bzl", "syslibs_configure")
+load("//third_party/toolchains/remote:configure.bzl", "remote_execution_configure")
 load("//third_party/toolchains/clang6:repo.bzl", "clang6_configure")
 load("//third_party/toolchains/cpus/arm:arm_compiler_configure.bzl", "arm_compiler_configure")
 load("//third_party:repo.bzl", "tf_http_archive")
@@ -23,22 +23,26 @@ load(
 load("//third_party/aws:workspace.bzl", aws = "repo")
 load("//third_party/flatbuffers:workspace.bzl", flatbuffers = "repo")
 load("//third_party/highwayhash:workspace.bzl", highwayhash = "repo")
+load("//third_party/hwloc:workspace.bzl", hwloc = "repo")
 load("//third_party/icu:workspace.bzl", icu = "repo")
 load("//third_party/jpeg:workspace.bzl", jpeg = "repo")
 load("//third_party/nasm:workspace.bzl", nasm = "repo")
 load("//third_party/kissfft:workspace.bzl", kissfft = "repo")
 load("//third_party/keras_applications_archive:workspace.bzl", keras_applications = "repo")
+load("//third_party/pasta:workspace.bzl", pasta = "repo")
 
 def initialize_third_party():
     """ Load third party repositories.  See above load() statements. """
     aws()
     flatbuffers()
     highwayhash()
+    hwloc()
     icu()
     keras_applications()
     kissfft()
     jpeg()
     nasm()
+    pasta()
 
 # Sanitize a dependency so that it works correctly from code that includes
 # TensorFlow as a submodule.
@@ -60,6 +64,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     syslibs_configure(name = "local_config_syslibs")
     python_configure(name = "local_config_python")
     rocm_configure(name = "local_config_rocm")
+    remote_execution_configure(name = "local_config_remote_execution")
 
     initialize_third_party()
 
@@ -123,22 +128,23 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     tf_http_archive(
         name = "com_google_absl",
         build_file = clean_dep("//third_party:com_google_absl.BUILD"),
-        sha256 = "3ad76de484192b2d5afd49d90492b5ed0bc59eb1a4e8e0deecc7a2a077a90251",
-        strip_prefix = "abseil-cpp-f197d7c72a54064cfde5a2058f1513a4a0ee36fb",
+        sha256 = "583e5801372a0bb12eb561858532e3bb9a3528f15f65cfc87b2c0f4c1ab1a0ca",
+        strip_prefix = "abseil-cpp-111ca7060a6ff50115ca85b59f6b5d8c8c5e9105",
         urls = [
-            "https://mirror.bazel.build/github.com/abseil/abseil-cpp/archive/f197d7c72a54064cfde5a2058f1513a4a0ee36fb.tar.gz",
-            "https://github.com/abseil/abseil-cpp/archive/f197d7c72a54064cfde5a2058f1513a4a0ee36fb.tar.gz",
+            "https://mirror.bazel.build/github.com/abseil/abseil-cpp/archive/111ca7060a6ff50115ca85b59f6b5d8c8c5e9105.tar.gz",
+            "https://github.com/abseil/abseil-cpp/archive/111ca7060a6ff50115ca85b59f6b5d8c8c5e9105.tar.gz",
         ],
     )
 
     tf_http_archive(
         name = "eigen_archive",
         build_file = clean_dep("//third_party:eigen.BUILD"),
-        sha256 = "aae7a680d141c978301dfae2c7945c06039f65849fcf64269595a9cdbba82638",
-        strip_prefix = "eigen-eigen-729d33d11c81",
+        patch_file = clean_dep("//third_party/eigen3:gpu_packet_math.patch"),
+        sha256 = "61f0017318a24cf940db14e57febecc524b24a9faa8ff4fa7f9f91630c4cd09d",
+        strip_prefix = "eigen-eigen-5a4931dafc1c",
         urls = [
-            "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/729d33d11c81.tar.gz",
-            "https://bitbucket.org/eigen/eigen/get/729d33d11c81.tar.gz",
+            "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/5a4931dafc1c.tar.gz",
+            "https://bitbucket.org/eigen/eigen/get/5a4931dafc1c.tar.gz",
         ],
     )
 
@@ -179,15 +185,15 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
 
     tf_http_archive(
         name = "com_github_googlecloudplatform_google_cloud_cpp",
-        sha256 = "3ade2072e6588ff56c0434abe6c63aa5f3f2d56be15a299bafc7e9cdf0a12c17",
-        strip_prefix = "google-cloud-cpp-0.3.0",
+        sha256 = "8e3a302d37f232dec041bf3f3916ca3fa5689216d42112898a4e36581f2f4ce5",
+        strip_prefix = "google-cloud-cpp-0.6.1",
         system_build_file = clean_dep("//third_party/systemlibs:google_cloud_cpp.BUILD"),
         system_link_files = {
             "//third_party/systemlibs:google_cloud_cpp.google.cloud.bigtable.BUILD": "google/cloud/bigtable/BUILD",
         },
         urls = [
-            "https://mirror.bazel.build/github.com/GoogleCloudPlatform/google-cloud-cpp/archive/v0.3.0.tar.gz",
-            "https://github.com/GoogleCloudPlatform/google-cloud-cpp/archive/v0.3.0.tar.gz",
+            "https://mirror.bazel.build/github.com/googleapis/google-cloud-cpp/archive/v0.6.1.tar.gz",
+            "https://github.com/googleapis/google-cloud-cpp/archive/v0.6.1.tar.gz",
         ],
     )
 
@@ -276,12 +282,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     tf_http_archive(
         name = "astor_archive",
         build_file = clean_dep("//third_party:astor.BUILD"),
-        sha256 = "ff6d2e2962d834acb125cc4dcc80c54a8c17c253f4cc9d9c43b5102a560bb75d",
-        strip_prefix = "astor-0.6.2",
+        sha256 = "95c30d87a6c2cf89aa628b87398466840f0ad8652f88eb173125a6df8533fb8d",
+        strip_prefix = "astor-0.7.1",
         system_build_file = clean_dep("//third_party/systemlibs:astor.BUILD"),
         urls = [
-            "https://mirror.bazel.build/pypi.python.org/packages/d8/be/c4276b3199ec3feee2a88bc64810fbea8f26d961e0a4cd9c68387a9f35de/astor-0.6.2.tar.gz",
-            "https://pypi.python.org/packages/d8/be/c4276b3199ec3feee2a88bc64810fbea8f26d961e0a4cd9c68387a9f35de/astor-0.6.2.tar.gz",
+            "https://mirror.bazel.build/pypi.python.org/packages/99/80/f9482277c919d28bebd85813c0a70117214149a96b08981b72b63240b84c/astor-0.7.1.tar.gz",
+            "https://pypi.python.org/packages/99/80/f9482277c919d28bebd85813c0a70117214149a96b08981b72b63240b84c/astor-0.7.1.tar.gz",
         ],
     )
 
@@ -311,17 +317,28 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
 
     tf_http_archive(
         name = "absl_py",
-        sha256 = "95160f778a62c7a60ddeadc7bf2d83f85a23a27359814aca12cf949e896fa82c",
-        strip_prefix = "abseil-py-pypi-v0.2.2",
+        sha256 = "595726be4bf3f7e6d64a1a255fa03717b693c01b913768abd52649cbb7ddf2bd",
+        strip_prefix = "abseil-py-pypi-v0.7.0",
         system_build_file = clean_dep("//third_party/systemlibs:absl_py.BUILD"),
         system_link_files = {
             "//third_party/systemlibs:absl_py.absl.flags.BUILD": "absl/flags/BUILD",
             "//third_party/systemlibs:absl_py.absl.testing.BUILD": "absl/testing/BUILD",
         },
         urls = [
-            "https://mirror.bazel.build/github.com/abseil/abseil-py/archive/pypi-v0.2.2.tar.gz",
-            "https://github.com/abseil/abseil-py/archive/pypi-v0.2.2.tar.gz",
+            "https://mirror.bazel.build/github.com/abseil/abseil-py/archive/pypi-v0.7.0.tar.gz",
+            "https://github.com/abseil/abseil-py/archive/pypi-v0.7.0.tar.gz",
+        ],
+    )
+
+    tf_http_archive(
+        name = "enum34_archive",
+        urls = [
+            "https://mirror.bazel.build/pypi.python.org/packages/bf/3e/31d502c25302814a7c2f1d3959d2a3b3f78e509002ba91aea64993936876/enum34-1.1.6.tar.gz",
+            "https://pypi.python.org/packages/bf/3e/31d502c25302814a7c2f1d3959d2a3b3f78e509002ba91aea64993936876/enum34-1.1.6.tar.gz",
         ],
+        sha256 = "8ad8c4783bf61ded74527bffb48ed9b54166685e4230386a9ed9b1279e2df5b1",
+        build_file = clean_dep("//third_party:enum34.BUILD"),
+        strip_prefix = "enum34-1.1.6/enum",
     )
 
     tf_http_archive(
@@ -391,22 +408,22 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
 
     tf_http_archive(
         name = "nsync",
-        sha256 = "692f9b30e219f71a6371b98edd39cef3cbda35ac3abc4cd99ce19db430a5591a",
-        strip_prefix = "nsync-1.20.1",
+        sha256 = "704be7f58afa47b99476bbac7aafd1a9db4357cef519db361716f13538547ffd",
+        strip_prefix = "nsync-1.20.2",
         system_build_file = clean_dep("//third_party/systemlibs:nsync.BUILD"),
         urls = [
-            "https://mirror.bazel.build/github.com/google/nsync/archive/1.20.1.tar.gz",
-            "https://github.com/google/nsync/archive/1.20.1.tar.gz",
+            "https://mirror.bazel.build/github.com/google/nsync/archive/1.20.2.tar.gz",
+            "https://github.com/google/nsync/archive/1.20.2.tar.gz",
         ],
     )
 
     tf_http_archive(
         name = "com_google_googletest",
-        sha256 = "353ab86e35cea1cd386115279cf4b16695bbf21b897bfbf2721cf4cb5f64ade8",
-        strip_prefix = "googletest-997d343dd680e541ef96ce71ee54a91daf2577a0",
+        sha256 = "ff7a82736e158c077e76188232eac77913a15dac0b22508c390ab3f88e6d6d86",
+        strip_prefix = "googletest-b6cd405286ed8635ece71c72f118e659f4ade3fb",
         urls = [
-            "https://mirror.bazel.build/github.com/google/googletest/archive/997d343dd680e541ef96ce71ee54a91daf2577a0.zip",
-            "https://github.com/google/googletest/archive/997d343dd680e541ef96ce71ee54a91daf2577a0.zip",
+            "https://mirror.bazel.build/github.com/google/googletest/archive/b6cd405286ed8635ece71c72f118e659f4ade3fb.zip",
+            "https://github.com/google/googletest/archive/b6cd405286ed8635ece71c72f118e659f4ade3fb.zip",
         ],
     )
 
@@ -460,12 +477,12 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     # WARNING: make sure ncteisen@ and vpai@ are cc-ed on any CL to change the below rule
     tf_http_archive(
         name = "grpc",
-        sha256 = "1aa84387232dda273ea8fdfe722622084f72c16f7b84bfc519ac7759b71cdc91",
-        strip_prefix = "grpc-69b6c047bc767b4d80e7af4d00ccb7c45b683dae",
+        sha256 = "e1e3a9edbfbe4230bee174d4aa45a15c1ec2b203cedb02d20df3e6345d8fa63e",
+        strip_prefix = "grpc-62688b6a05cc85b47fb77dd408611734253e47e2",
         system_build_file = clean_dep("//third_party/systemlibs:grpc.BUILD"),
         urls = [
-            "https://mirror.bazel.build/github.com/grpc/grpc/archive/69b6c047bc767b4d80e7af4d00ccb7c45b683dae.tar.gz",
-            "https://github.com/grpc/grpc/archive/69b6c047bc767b4d80e7af4d00ccb7c45b683dae.tar.gz",
+            "https://mirror.bazel.build/github.com/grpc/grpc/archive/62688b6a05cc85b47fb77dd408611734253e47e2.tar.gz",
+            "https://github.com/grpc/grpc/archive/62688b6a05cc85b47fb77dd408611734253e47e2.tar.gz",
         ],
     )
 
@@ -496,11 +513,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     tf_http_archive(
         name = "llvm",
         build_file = clean_dep("//third_party/llvm:llvm.autogenerated.BUILD"),
-        sha256 = "34170a4aa07e434dd537d98a705dcf1b3901f73820fe1d6b9370e8c1c94e9157",
-        strip_prefix = "llvm-0487bd8f42c8b38166ff825d56014d0ff49db604",
+        sha256 = "8399a5e0111f52c5d8d3df3638cb997b3f01e1bc7288ee95d77e97b62a47ba18",
+        strip_prefix = "llvm-f2cd363b4b915b54ba0f25066f876c16d59fede6",
         urls = [
-            "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/0487bd8f42c8b38166ff825d56014d0ff49db604.tar.gz",
-            "https://github.com/llvm-mirror/llvm/archive/0487bd8f42c8b38166ff825d56014d0ff49db604.tar.gz",
+            "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/f2cd363b4b915b54ba0f25066f876c16d59fede6.tar.gz",
+            "https://github.com/llvm-mirror/llvm/archive/f2cd363b4b915b54ba0f25066f876c16d59fede6.tar.gz",
         ],
     )
 
@@ -700,16 +717,6 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
         ],
     )
 
-    tf_http_archive(
-        name = "bazel_toolchains",
-        sha256 = "07dfbe80638eb1fe681f7c07e61b34b579c6710c691e49ee90ccdc6e9e75ebbb",
-        strip_prefix = "bazel-toolchains-9a111bd82161c1fbe8ed17a593ca1023fd941c70",
-        urls = [
-            "https://mirror.bazel.build/github.com/bazelbuild/bazel-toolchains/archive/9a111bd82161c1fbe8ed17a593ca1023fd941c70.tar.gz",
-            "https://github.com/bazelbuild/bazel-toolchains/archive/9a111bd82161c1fbe8ed17a593ca1023fd941c70.tar.gz",
-        ],
-    )
-
     tf_http_archive(
         name = "arm_neon_2_x86_sse",
         build_file = clean_dep("//third_party:arm_neon_2_x86_sse.BUILD"),
@@ -828,44 +835,44 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     tf_http_archive(
         name = "tbb",
         build_file = clean_dep("//third_party/ngraph:tbb.BUILD"),
-        sha256 = "724686f90bcda78f13b76f297d964008737ccd6399328143c1c0093e73ae6a13",
-        strip_prefix = "tbb-tbb_2018",
+        sha256 = "c3245012296f09f1418b78a8c2f17df5188b3bd0db620f7fd5fabe363320805a",
+        strip_prefix = "tbb-2019_U1",
         urls = [
-            "https://mirror.bazel.build/github.com/01org/tbb/archive/tbb_2018.zip",
-            "https://github.com/01org/tbb/archive/tbb_2018.zip",
+            "https://mirror.bazel.build/github.com/01org/tbb/archive/2019_U1.zip",
+            "https://github.com/01org/tbb/archive/2019_U1.zip",
         ],
     )
 
     tf_http_archive(
         name = "ngraph",
         build_file = clean_dep("//third_party/ngraph:ngraph.BUILD"),
-        sha256 = "2b28f9c9f063b96825a96d56d7f7978c9a1c55c9b25175c20dd49a8a77cb0305",
-        strip_prefix = "ngraph-0.9.1",
+        sha256 = "a1780f24a1381fc25e323b4b2d08b6ef5129f42e011305b2a34dcf43a48030d5",
+        strip_prefix = "ngraph-0.11.0",
         urls = [
-            "https://mirror.bazel.build/github.com/NervanaSystems/ngraph/archive/v0.9.1.tar.gz",
-            "https://github.com/NervanaSystems/ngraph/archive/v0.9.1.tar.gz",
+            "https://mirror.bazel.build/github.com/NervanaSystems/ngraph/archive/v0.11.0.tar.gz",
+            "https://github.com/NervanaSystems/ngraph/archive/v0.11.0.tar.gz",
         ],
     )
 
     tf_http_archive(
         name = "nlohmann_json_lib",
         build_file = clean_dep("//third_party/ngraph:nlohmann_json.BUILD"),
-        sha256 = "9f3549824af3ca7e9707a2503959886362801fb4926b869789d6929098a79e47",
-        strip_prefix = "json-3.1.1",
+        sha256 = "c377963a95989270c943d522bfefe7b889ef5ed0e1e15d535fd6f6f16ed70732",
+        strip_prefix = "json-3.4.0",
         urls = [
-            "https://mirror.bazel.build/github.com/nlohmann/json/archive/v3.1.1.tar.gz",
-            "https://github.com/nlohmann/json/archive/v3.1.1.tar.gz",
+            "https://mirror.bazel.build/github.com/nlohmann/json/archive/v3.4.0.tar.gz",
+            "https://github.com/nlohmann/json/archive/v3.4.0.tar.gz",
         ],
     )
 
     tf_http_archive(
         name = "ngraph_tf",
         build_file = clean_dep("//third_party/ngraph:ngraph_tf.BUILD"),
-        sha256 = "89accbc702e68a09775f1011a99dd16561038fd1ce59d566d64450176abaae5c",
-        strip_prefix = "ngraph-tf-0.7.0",
+        sha256 = "742a642d2c6622277df4c902b6830d616d0539cc8cd843d6cdb899bb99e66e36",
+        strip_prefix = "ngraph-tf-0.9.0",
         urls = [
-            "https://mirror.bazel.build/github.com/NervanaSystems/ngraph-tf/archive/v0.7.0.tar.gz",
-            "https://github.com/NervanaSystems/ngraph-tf/archive/v0.7.0.tar.gz",
+            "https://mirror.bazel.build/github.com/NervanaSystems/ngraph-tf/archive/v0.9.0.zip",
+            "https://github.com/NervanaSystems/ngraph-tf/archive/v0.9.0.zip",
         ],
     )
 
diff --git a/third_party/android/android.bzl.tpl b/third_party/android/android.bzl.tpl
index e6ed4994f3ba6d721d717a04b0bd22f54dbb1d79..c498f585abfa5212464cdb91e839165643d9b005 100644
--- a/third_party/android/android.bzl.tpl
+++ b/third_party/android/android.bzl.tpl
@@ -1,9 +1,9 @@
 """Set up configurable Android SDK and NDK dependencies."""
 
 def android_workspace():
-  # String for replacement in Bazel template.
-  # These will either be replaced by android_sdk_repository if various ENV
-  # variables are set when `local_config_android` repo_rule is run, or they
-  # will be replaced by noops otherwise.
-  MAYBE_ANDROID_SDK_REPOSITORY
-  MAYBE_ANDROID_NDK_REPOSITORY
+    # String for replacement in Bazel template.
+    # These will either be replaced by android_sdk_repository if various ENV
+    # variables are set when `local_config_android` repo_rule is run, or they
+    # will be replaced by noops otherwise.
+    MAYBE_ANDROID_SDK_REPOSITORY
+    MAYBE_ANDROID_NDK_REPOSITORY
diff --git a/third_party/android/android_configure.bzl b/third_party/android/android_configure.bzl
index da09bdf39eed90b648ca8f47c79d16e3ec3804bb..646ed732a1c7c765a67e1a2d3daa1183cd6aede0 100644
--- a/third_party/android/android_configure.bzl
+++ b/third_party/android/android_configure.bzl
@@ -36,33 +36,39 @@ _ANDROID_NDK_REPO_TEMPLATE = """
 """
 
 def _android_autoconf_impl(repository_ctx):
-  """Implementation of the android_autoconf repository rule."""
-  sdk_home = repository_ctx.os.environ.get(_ANDROID_SDK_HOME)
-  sdk_api_level = repository_ctx.os.environ.get(_ANDROID_SDK_API_VERSION)
-  build_tools_version = repository_ctx.os.environ.get(
-      _ANDROID_BUILD_TOOLS_VERSION)
-  ndk_home = repository_ctx.os.environ.get(_ANDROID_NDK_HOME)
-  ndk_api_level = repository_ctx.os.environ.get(_ANDROID_NDK_API_VERSION)
+    """Implementation of the android_autoconf repository rule."""
+    sdk_home = repository_ctx.os.environ.get(_ANDROID_SDK_HOME)
+    sdk_api_level = repository_ctx.os.environ.get(_ANDROID_SDK_API_VERSION)
+    build_tools_version = repository_ctx.os.environ.get(
+        _ANDROID_BUILD_TOOLS_VERSION,
+    )
+    ndk_home = repository_ctx.os.environ.get(_ANDROID_NDK_HOME)
+    ndk_api_level = repository_ctx.os.environ.get(_ANDROID_NDK_API_VERSION)
 
-  sdk_rule = "pass"
-  if all([sdk_home, sdk_api_level, build_tools_version]):
-    sdk_rule = _ANDROID_SDK_REPO_TEMPLATE % (
-        sdk_home, sdk_api_level, build_tools_version)
+    sdk_rule = "pass"
+    if all([sdk_home, sdk_api_level, build_tools_version]):
+        sdk_rule = _ANDROID_SDK_REPO_TEMPLATE % (
+            sdk_home,
+            sdk_api_level,
+            build_tools_version,
+        )
 
-  ndk_rule = "pass"
-  if all([ndk_home, ndk_api_level]):
-    ndk_rule = _ANDROID_NDK_REPO_TEMPLATE % (ndk_home, ndk_api_level)
+    ndk_rule = "pass"
+    if all([ndk_home, ndk_api_level]):
+        ndk_rule = _ANDROID_NDK_REPO_TEMPLATE % (ndk_home, ndk_api_level)
 
-  repository_ctx.template(
-      "BUILD",
-      Label("//third_party/android:android_configure.BUILD.tpl"))
-  repository_ctx.template(
-      "android.bzl",
-      Label("//third_party/android:android.bzl.tpl"),
-      substitutions={
-          "MAYBE_ANDROID_SDK_REPOSITORY": sdk_rule,
-          "MAYBE_ANDROID_NDK_REPOSITORY": ndk_rule,
-      })
+    repository_ctx.template(
+        "BUILD",
+        Label("//third_party/android:android_configure.BUILD.tpl"),
+    )
+    repository_ctx.template(
+        "android.bzl",
+        Label("//third_party/android:android.bzl.tpl"),
+        substitutions = {
+            "MAYBE_ANDROID_SDK_REPOSITORY": sdk_rule,
+            "MAYBE_ANDROID_NDK_REPOSITORY": ndk_rule,
+        },
+    )
 
 android_configure = repository_rule(
     implementation = _android_autoconf_impl,
diff --git a/third_party/aws/BUILD.bazel b/third_party/aws/BUILD.bazel
index 5426f79e4650a1ce4dcb4a8408691310c864f06c..fde3072403f428830763d3b11542b8b39436b4ca 100644
--- a/third_party/aws/BUILD.bazel
+++ b/third_party/aws/BUILD.bazel
@@ -54,6 +54,11 @@ cc_library(
     hdrs = [
         "aws-cpp-sdk-core/include/aws/core/SDKConfig.h",
     ],
+    copts = [
+        "-DAWS_SDK_VERSION_MAJOR=1",
+        "-DAWS_SDK_VERSION_MINOR=5",
+        "-DAWS_SDK_VERSION_PATCH=8",
+    ],
     defines = select({
         "@org_tensorflow//tensorflow:linux_x86_64": [
             "PLATFORM_LINUX",
diff --git a/third_party/aws/workspace.bzl b/third_party/aws/workspace.bzl
index c2166381549a5cf6fb44912081ae9479bff69645..1d269f4d43ec4cc9d39f3c89ff40e07b4e4947c4 100644
--- a/third_party/aws/workspace.bzl
+++ b/third_party/aws/workspace.bzl
@@ -2,14 +2,17 @@
 
 load("//third_party:repo.bzl", "third_party_http_archive")
 
+# NOTE: version updates here should also update the major, minor, and patch variables declared in
+# the  copts field of the //third_party/aws:aws target
+
 def repo():
     third_party_http_archive(
         name = "aws",
         urls = [
-            "https://mirror.bazel.build/github.com/aws/aws-sdk-cpp/archive/1.3.15.tar.gz",
-            "https://github.com/aws/aws-sdk-cpp/archive/1.3.15.tar.gz",
+            "https://mirror.bazel.build/github.com/aws/aws-sdk-cpp/archive/1.5.8.tar.gz",
+            "https://github.com/aws/aws-sdk-cpp/archive/1.5.8.tar.gz",
         ],
-        sha256 = "b888d8ce5fc10254c3dd6c9020c7764dd53cf39cf011249d0b4deda895de1b7c",
-        strip_prefix = "aws-sdk-cpp-1.3.15",
+        sha256 = "89905075fe50aa13e0337ff905c2e8c1ce9caf77a3504484a7cda39179120ffc",
+        strip_prefix = "aws-sdk-cpp-1.5.8",
         build_file = "//third_party/aws:BUILD.bazel",
     )
diff --git a/third_party/clang_toolchain/download_clang.bzl b/third_party/clang_toolchain/download_clang.bzl
index 7ced9027473e39ad9870ce138b64c7f7ec64ad01..20ac3a8d5a4bd243151484da7cbcdf31b5dc3a36 100644
--- a/third_party/clang_toolchain/download_clang.bzl
+++ b/third_party/clang_toolchain/download_clang.bzl
@@ -39,15 +39,15 @@ def download_clang(repo_ctx, out_folder):
 
     # Latest CLANG_REVISION and CLANG_SUB_REVISION of the Chromiums's release
     # can be found in https://chromium.googlesource.com/chromium/src/tools/clang/+/master/scripts/update.py
-    CLANG_REVISION = "347933"
+    CLANG_REVISION = "348507"
     CLANG_SUB_REVISION = 1
 
     package_version = "%s-%s" % (CLANG_REVISION, CLANG_SUB_REVISION)
 
     checksums = {
-        "Linux_x64": "cae3643fdf5d46fc9bc8731212bb37573547148d90b64b083165e090133d11b0",
-        "Mac": "083a0e91a38c06e568652313ac7372b17a101268f7d65533d721ca30413442b4",
-        "Win": "43160487cfc7e88076a369a2b6e8e4a0f42e104c28d8903f3aaa62d630aba949",
+        "Linux_x64": "85a24f215737af91e0054d3a1cb435bd8ff06178cef14241c029c8a04ff16a79",
+        "Mac": "16a96a3c4b599d0418e812307087a223d5fee2ee3c7fd96f5cbc2a9e5bf8607d",
+        "Win": "4c144f24d3a82d546845c680f5b029ff02dd4de7614e93d1b21cfc6e20a26dad",
     }
 
     platform_folder = _get_platform_folder(repo_ctx.os.name)
diff --git a/third_party/common.bzl b/third_party/common.bzl
index db981a5e314c08f8612578161fcc1f63b4662a69..8134bf3d2562f5405ece9f50537b88d7fb27865f 100644
--- a/third_party/common.bzl
+++ b/third_party/common.bzl
@@ -21,11 +21,11 @@
 #   substitutions: A dictionary mapping strings to their substitutions
 
 def template_rule_impl(ctx):
-  ctx.template_action(
-      template = ctx.file.src,
-      output = ctx.outputs.out,
-      substitutions = ctx.attr.substitutions,
-  )
+    ctx.template_action(
+        template = ctx.file.src,
+        output = ctx.outputs.out,
+        substitutions = ctx.attr.substitutions,
+    )
 
 template_rule = rule(
     attrs = {
diff --git a/third_party/eigen3/gpu_packet_math.patch b/third_party/eigen3/gpu_packet_math.patch
new file mode 100644
index 0000000000000000000000000000000000000000..06347538e179383bbe991a49506b1656a5d21ab3
--- /dev/null
+++ b/third_party/eigen3/gpu_packet_math.patch
@@ -0,0 +1,18 @@
+--- a/Eigen/src/Core/arch/GPU/PacketMath.h
++++ b/Eigen/src/Core/arch/GPU/PacketMath.h
+@@ -100,6 +100,7 @@
+   return make_double2(from, from);
+ }
+ 
++#if defined(EIGEN_CUDA_ARCH)
+ namespace {
+ 
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a,
+@@ -211,6 +212,7 @@
+ pcmp_eq<double2>(const double2& a, const double2& b) {
+   return make_double2(eq_mask(a.x, b.x), eq_mask(a.y, b.y));
+ }
++#endif  // EIGEN_CUDA_ARCH
+ 
+ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
+   return make_float4(a, a+1, a+2, a+3);
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h
index 223ea4d58bf4c40b2790e2f5d73e2a4fc1a79eec..8df6782551779acded8b74fab30dc78f74c51948 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h
@@ -178,25 +178,37 @@ template <>
 struct unpacket_traits<Packet32q8i> {
   typedef QInt8 type;
   typedef Packet16q8i half;
-  enum { size = 32, alignment = Aligned32 };
+  enum { size = 32, alignment = Aligned32, vectorizable = true };
+};
+template <>
+struct unpacket_traits<Packet16q8i> {
+  typedef QInt8 type;
+  typedef Packet16q8i half;
+  enum { size = 16, alignment = Aligned32, vectorizable = true };
 };
 template <>
 struct unpacket_traits<Packet16q16i> {
   typedef QInt16 type;
   typedef Packet8q16i half;
-  enum { size = 16, alignment = Aligned32 };
+  enum { size = 16, alignment = Aligned32, vectorizable = true };
+};
+template <>
+struct unpacket_traits<Packet8q16i> {
+  typedef QInt16 type;
+  typedef Packet8q16i half;
+  enum { size = 8, alignment = Aligned32, vectorizable = true };
 };
 template <>
 struct unpacket_traits<Packet32q8u> {
   typedef QUInt8 type;
   typedef Packet16q8u half;
-  enum { size = 32, alignment = Aligned32 };
+  enum { size = 32, alignment = Aligned32, vectorizable = true };
 };
 template <>
 struct unpacket_traits<Packet8q32i> {
   typedef QInt32 type;
   typedef Packet4q32i half;
-  enum { size = 8, alignment = Aligned32 };
+  enum { size = 8, alignment = Aligned32, vectorizable = true };
 };
 
 // Unaligned load
@@ -206,6 +218,11 @@ EIGEN_STRONG_INLINE Packet32q8i ploadu<Packet32q8i>(const QInt8* from) {
       reinterpret_cast<const __m256i*>(from));
 }
 template <>
+EIGEN_STRONG_INLINE Packet16q8i ploadu<Packet16q8i>(const QInt8* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(
+      reinterpret_cast<const __m128i*>(from));
+}
+template <>
 EIGEN_STRONG_INLINE Packet32q8u ploadu<Packet32q8u>(const QUInt8* from) {
   EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(
       reinterpret_cast<const __m256i*>(from));
@@ -215,6 +232,11 @@ EIGEN_STRONG_INLINE Packet16q16i ploadu<Packet16q16i>(const QInt16* from) {
   EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(
       reinterpret_cast<const __m256i*>(from));
 }
+template<>
+EIGEN_STRONG_INLINE Packet8q16i ploadu<Packet8q16i>(const QInt16* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(
+      reinterpret_cast<const __m128i*>(from));
+}
 template <>
 EIGEN_STRONG_INLINE Packet8q32i ploadu<Packet8q32i>(const QInt32* from) {
   EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(
@@ -228,6 +250,11 @@ EIGEN_STRONG_INLINE Packet32q8i pload<Packet32q8i>(const QInt8* from) {
       reinterpret_cast<const __m256i*>(from));
 }
 template <>
+EIGEN_STRONG_INLINE Packet16q8i pload<Packet16q8i>(const QInt8* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(
+      reinterpret_cast<const __m128i*>(from));
+}
+template <>
 EIGEN_STRONG_INLINE Packet32q8u pload<Packet32q8u>(const QUInt8* from) {
   EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(
       reinterpret_cast<const __m256i*>(from));
@@ -238,6 +265,11 @@ EIGEN_STRONG_INLINE Packet16q16i pload<Packet16q16i>(const QInt16* from) {
       reinterpret_cast<const __m256i*>(from));
 }
 template <>
+EIGEN_STRONG_INLINE Packet8q16i pload<Packet8q16i>(const QInt16* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(
+      reinterpret_cast<const __m128i*>(from));
+}
+template <>
 EIGEN_STRONG_INLINE Packet8q32i pload<Packet8q32i>(const QInt32* from) {
   EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(
       reinterpret_cast<const __m256i*>(from));
@@ -250,6 +282,11 @@ EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet32q8i& from) {
       reinterpret_cast<__m256i*>(to), from.val);
 }
 template <>
+EIGEN_STRONG_INLINE void pstoreu<QInt8>(QInt8* to, const Packet16q8i& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(
+      reinterpret_cast<__m128i*>(to), from.val);
+}
+template <>
 EIGEN_STRONG_INLINE void pstoreu<QUInt8>(QUInt8* to, const Packet32q8u& from) {
   EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
       reinterpret_cast<__m256i*>(to), from.val);
@@ -260,6 +297,11 @@ EIGEN_STRONG_INLINE void pstoreu<QInt16>(QInt16* to, const Packet16q16i& from) {
       reinterpret_cast<__m256i*>(to), from.val);
 }
 template <>
+EIGEN_STRONG_INLINE void pstoreu<QInt16>(QInt16* to, const Packet8q16i& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(
+      reinterpret_cast<__m128i*>(to), from.val);
+}
+template <>
 EIGEN_STRONG_INLINE void pstoreu<QInt32>(QInt32* to, const Packet8q32i& from) {
   EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(
       reinterpret_cast<__m256i*>(to), from.val);
@@ -277,6 +319,11 @@ EIGEN_STRONG_INLINE void pstore<QInt16>(QInt16* to, const Packet16q16i& from) {
                                                from.val);
 }
 template <>
+EIGEN_STRONG_INLINE void pstore<QInt16>(QInt16* to, const Packet8q16i& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to),
+                                            from.val);
+}
+template <>
 EIGEN_STRONG_INLINE void pstore<QUInt8>(QUInt8* to, const Packet32q8u& from) {
   EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
                                                from.val);
@@ -286,6 +333,11 @@ EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet32q8i& from) {
   EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to),
                                                from.val);
 }
+template <>
+EIGEN_STRONG_INLINE void pstore<QInt8>(QInt8* to, const Packet16q8i& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to),
+                                            from.val);
+}
 
 // Extract first element.
 template <>
diff --git a/third_party/enum34.BUILD b/third_party/enum34.BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..85262b07f6478bc91cfe8eb178a85c963feb4b79
--- /dev/null
+++ b/third_party/enum34.BUILD
@@ -0,0 +1,13 @@
+# Description:
+#   enum34 provides a backport of the enum module for Python 2.
+
+licenses(["notice"])  # MIT
+
+exports_files(["LICENSE"])
+
+py_library(
+    name = "enum",
+    srcs = ["__init__.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+)
diff --git a/third_party/examples/eager/spinn/BUILD b/third_party/examples/eager/spinn/BUILD
index 0e39d4696fb5b4efafc94b4b96965d232ae4e473..640bcb230c8f3bcdf6f4c905e45cf32768b32418 100644
--- a/third_party/examples/eager/spinn/BUILD
+++ b/third_party/examples/eager/spinn/BUILD
@@ -1,6 +1,6 @@
 licenses(["notice"])  # 3-clause BSD.
 
-py_binary(
+py_library(
     name = "spinn",
     srcs = ["spinn.py"],
     srcs_version = "PY2AND3",
diff --git a/third_party/git/git_configure.bzl b/third_party/git/git_configure.bzl
index 8e2839bdc254acb42cf551cf561c40f8402c311a..fc18fdb98837e7ebf9ca812412a95dac8a2a7984 100644
--- a/third_party/git/git_configure.bzl
+++ b/third_party/git/git_configure.bzl
@@ -8,49 +8,57 @@
 _PYTHON_BIN_PATH = "PYTHON_BIN_PATH"
 
 def _fail(msg):
-  """Output failure message when auto configuration fails."""
-  red = "\033[0;31m"
-  no_color = "\033[0m"
-  fail("%sGit Configuration Error:%s %s\n" % (red, no_color, msg))
+    """Output failure message when auto configuration fails."""
+    red = "\033[0;31m"
+    no_color = "\033[0m"
+    fail("%sGit Configuration Error:%s %s\n" % (red, no_color, msg))
 
 def _get_python_bin(repository_ctx):
-  """Gets the python bin path."""
-  python_bin = repository_ctx.os.environ.get(_PYTHON_BIN_PATH)
-  if python_bin != None:
-    return python_bin
-  python_bin_path = repository_ctx.which("python")
-  if python_bin_path != None:
-    return str(python_bin_path)
-  _fail("Cannot find python in PATH, please make sure " +
-        "python is installed and add its directory in PATH, or --define " +
-        "%s='/something/else'.\nPATH=%s" % (
-            _PYTHON_BIN_PATH, repository_ctx.os.environ.get("PATH", "")))
-
+    """Gets the python bin path."""
+    python_bin = repository_ctx.os.environ.get(_PYTHON_BIN_PATH)
+    if python_bin != None:
+        return python_bin
+    python_bin_path = repository_ctx.which("python")
+    if python_bin_path != None:
+        return str(python_bin_path)
+    _fail("Cannot find python in PATH, please make sure " +
+          "python is installed and add its directory in PATH, or --define " +
+          "%s='/something/else'.\nPATH=%s" % (
+              _PYTHON_BIN_PATH,
+              repository_ctx.os.environ.get("PATH", ""),
+          ))
 
 def _git_conf_impl(repository_ctx):
-  repository_ctx.template(
-      "BUILD",
-      Label("//third_party/git:BUILD.tpl"))
-
-  tensorflow_root_path = str(repository_ctx.path(
-      Label("@org_tensorflow//:BUILD")))[:-len("BUILD")]
-  python_script_path = repository_ctx.path(
-      Label("@org_tensorflow//tensorflow/tools/git:gen_git_source.py"))
-  generated_files_path = repository_ctx.path("gen")
+    repository_ctx.template(
+        "BUILD",
+        Label("//third_party/git:BUILD.tpl"),
+    )
 
-  r = repository_ctx.execute(
-      ["test", "-f", "%s/.git/logs/HEAD" % tensorflow_root_path])
-  if r.return_code == 0:
-    unused_var = repository_ctx.path(Label("//:.git/HEAD")) # pylint: disable=unused-variable
+    tensorflow_root_path = str(repository_ctx.path(
+        Label("@org_tensorflow//:BUILD"),
+    ))[:-len("BUILD")]
+    python_script_path = repository_ctx.path(
+        Label("@org_tensorflow//tensorflow/tools/git:gen_git_source.py"),
+    )
+    generated_files_path = repository_ctx.path("gen")
 
-  result = repository_ctx.execute([
-      _get_python_bin(repository_ctx),
-      python_script_path, "--configure", tensorflow_root_path,
-      "--gen_root_path", generated_files_path], quiet=False)
+    r = repository_ctx.execute(
+        ["test", "-f", "%s/.git/logs/HEAD" % tensorflow_root_path],
+    )
+    if r.return_code == 0:
+        unused_var = repository_ctx.path(Label("//:.git/HEAD"))  # pylint: disable=unused-variable
 
-  if not result.return_code == 0:
-    _fail(result.stderr)
+    result = repository_ctx.execute([
+        _get_python_bin(repository_ctx),
+        python_script_path,
+        "--configure",
+        tensorflow_root_path,
+        "--gen_root_path",
+        generated_files_path,
+    ], quiet = False)
 
+    if not result.return_code == 0:
+        _fail(result.stderr)
 
 git_configure = repository_rule(
     implementation = _git_conf_impl,
diff --git a/third_party/gpus/crosstool/BUILD.tpl b/third_party/gpus/crosstool/BUILD.tpl
index c8812fab3378328e44504598257a8860b45d1671..db76306ffbe9244a59d2e28e8e7c2a2f03e56f49 100644
--- a/third_party/gpus/crosstool/BUILD.tpl
+++ b/third_party/gpus/crosstool/BUILD.tpl
@@ -22,6 +22,12 @@ cc_toolchain_suite(
         "local|compiler": ":cc-compiler-local",
         "darwin|compiler": ":cc-compiler-darwin",
         "x64_windows|msvc-cl": ":cc-compiler-windows",
+        "x64_windows": ":cc-compiler-windows",
+        "arm": ":cc-compiler-local",
+        "k8": ":cc-compiler-local",
+        "piii": ":cc-compiler-local",
+        "ppc": ":cc-compiler-local",
+        "darwin": ":cc-compiler-darwin",
     },
 )
 
@@ -41,6 +47,7 @@ cc_toolchain(
     # last on the command line and contain all shared libraries to link, so all
     # regular options will be left of them.
     supports_param_files = 1,
+    toolchain_identifier = "local_linux",
 )
 
 cc_toolchain(
@@ -55,6 +62,7 @@ cc_toolchain(
     static_runtime_libs = [":empty"],
     strip_files = ":empty",
     supports_param_files = 0,
+    toolchain_identifier = "local_darwin",
 )
 
 cc_toolchain(
@@ -69,6 +77,7 @@ cc_toolchain(
     static_runtime_libs = [":empty"],
     strip_files = ":empty",
     supports_param_files = 1,
+    toolchain_identifier = "local_windows",
 )
 
 filegroup(
diff --git a/third_party/gpus/crosstool/CROSSTOOL.tpl b/third_party/gpus/crosstool/CROSSTOOL.tpl
index 921188cbb431d925df69fbd0cc06aac07fe1a1a9..1a13ac844caa4b46f030ef904537b3295a017418 100644
--- a/third_party/gpus/crosstool/CROSSTOOL.tpl
+++ b/third_party/gpus/crosstool/CROSSTOOL.tpl
@@ -2,31 +2,6 @@ major_version: "local"
 minor_version: ""
 default_target_cpu: "same_as_host"
 
-default_toolchain {
-  cpu: "k8"
-  toolchain_identifier: "local_linux"
-}
-default_toolchain {
-  cpu: "piii"
-  toolchain_identifier: "local_linux"
-}
-default_toolchain {
-  cpu: "arm"
-  toolchain_identifier: "local_linux"
-}
-default_toolchain {
-  cpu: "darwin"
-  toolchain_identifier: "local_darwin"
-}
-default_toolchain {
-  cpu: "ppc"
-  toolchain_identifier: "local_linux"
-}
-default_toolchain {
-  cpu: "x64_windows"
-  toolchain_identifier: "local_windows"
-}
-
 toolchain {
   abi_version: "local"
   abi_libc_version: "local"
@@ -642,6 +617,31 @@ toolchain {
     name: "no_legacy_features"
   }
 
+  # TODO(klimek): Previously we were using a .bat file to start python to run
+  # the python script that can redirect to nvcc - unfortunately .bat files
+  # have a rather short maximum length for command lines (8k). Instead, we
+  # now use the python binary as the compiler and pass the python script to
+  # it at the start of the command line. Investigate different possibilities
+  # to run the nvcc wrapper, either using pyinstaller --onefile, or writing
+  # a small C++ wrapper to redirect.
+  feature {
+    name: "redirector"
+    enabled: true
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-module-compile"
+      action: "c++-module-codegen"
+      action: "c++-header-parsing"
+      action: "assemble"
+      action: "preprocess-assemble"
+      flag_group {
+        flag: "-B"
+        flag: "external/local_config_cuda/crosstool/windows/msvc_wrapper_for_nvcc.py"
+      }
+    }
+  }
+
   # Suppress startup banner.
   feature {
     name: "nologo"
diff --git a/third_party/gpus/crosstool/CROSSTOOL_hipcc.tpl b/third_party/gpus/crosstool/CROSSTOOL_hipcc.tpl
index 0e175b3ef66b95c963cdface9417b64ede728bd2..221ccb2fe404f2b254d4f2c6c35b1b8d6f81e9b3 100644
--- a/third_party/gpus/crosstool/CROSSTOOL_hipcc.tpl
+++ b/third_party/gpus/crosstool/CROSSTOOL_hipcc.tpl
@@ -152,7 +152,11 @@ toolchain {
     # Removal of unused code and data at link time (can this increase binary size in some cases?).
     compiler_flag: "-ffunction-sections"
     compiler_flag: "-fdata-sections"
-    linker_flag: "-Wl,--gc-sections"
+
+    # With hipcc -fno-gpu-rdc, objects and libraries would contain a .kernel
+    # section which would be removed by ld. Therefore --gc-sections shall be
+    # abolished here.
+    #linker_flag: "-Wl,--gc-sections"
   }
   linking_mode_flags { mode: DYNAMIC }
 }
diff --git a/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl b/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl
index 824238022b055d76618efca41da0e06054e160f1..c159531263270d21d78a9faac579b00596889fc6 100755
--- a/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl
+++ b/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl
@@ -30,6 +30,12 @@ GCC_HOST_COMPILER_PATH = ('%{gcc_host_compiler_path}')
 
 HIPCC_PATH = '%{hipcc_path}'
 PREFIX_DIR = os.path.dirname(GCC_HOST_COMPILER_PATH)
+HIPCC_ENV = '%{hipcc_env}'
+HIP_RUNTIME_PATH = '%{hip_runtime_path}'
+HIP_RUNTIME_LIBRARY = '%{hip_runtime_library}'
+HCC_RUNTIME_PATH = '%{hcc_runtime_path}'
+HCC_RUNTIME_LIBRARY = '%{hcc_runtime_library}'
+VERBOSE = '%{crosstool_verbose}'=='1'
 
 def Log(s):
   print('gpus/crosstool: {0}'.format(s))
@@ -165,6 +171,11 @@ def InvokeHipcc(argv, log=False):
 
   hipccopts = ' '
   hipccopts += ' ' + hipcc_compiler_options
+  # Use -fno-gpu-rdc by default for early GPU kernel finalization
+  # This flag would trigger GPU kernels be generated at compile time, instead
+  # of link time. This allows the default host compiler (gcc) be used as the
+  # linker for TensorFlow on ROCm platform.
+  hipccopts += ' -fno-gpu-rdc '
   hipccopts += undefines
   hipccopts += defines
   hipccopts += std_options
@@ -205,37 +216,46 @@ def main():
   args, leftover = parser.parse_known_args(sys.argv[1:])
 
   if args.x and args.x[0] == 'rocm':
+    # compilation for GPU objects
     if args.rocm_log: Log('-x rocm')
     leftover = [pipes.quote(s) for s in leftover]
     if args.rocm_log: Log('using hipcc')
     return InvokeHipcc(leftover, log=args.rocm_log)
 
-  # XXX use hipcc to link
-  if args.pass_exit_codes:
-    gpu_compiler_flags = [flag for flag in sys.argv[1:]
-                               if not flag.startswith(('-pass-exit-codes'))]
+  elif args.pass_exit_codes:
+    # link
+    # with hipcc compiler invoked with -fno-gpu-rdc by default now, it's ok to 
+    # use host compiler as linker, but we have to link with HCC/HIP runtime.
+    # Such restriction would be revised further as the bazel script get
+    # improved to fine tune dependencies to ROCm libraries.
+    gpu_linker_flags = [flag for flag in sys.argv[1:]
+                               if not flag.startswith(('--rocm_log'))]
+
+    gpu_linker_flags.append('-L' + HCC_RUNTIME_PATH)
+    gpu_linker_flags.append('-Wl,-rpath=' + HCC_RUNTIME_PATH)
+    gpu_linker_flags.append('-l' + HCC_RUNTIME_LIBRARY)
+    gpu_linker_flags.append('-L' + HIP_RUNTIME_PATH)
+    gpu_linker_flags.append('-Wl,-rpath=' + HIP_RUNTIME_PATH)
+    gpu_linker_flags.append('-l' + HIP_RUNTIME_LIBRARY)
+
+    if VERBOSE: print(' '.join([CPU_COMPILER] + gpu_linker_flags))
+    return subprocess.call([CPU_COMPILER] + gpu_linker_flags)
 
-    # special handling for $ORIGIN
-    # - guard every argument with ''
-    modified_gpu_compiler_flags = []
-    for flag in gpu_compiler_flags:
-      modified_gpu_compiler_flags.append("'" + flag + "'")
-
-    if args.rocm_log: Log('Link with hipcc: %s' % (' '.join([HIPCC_PATH] + modified_gpu_compiler_flags)))
-    return subprocess.call([HIPCC_PATH] + modified_gpu_compiler_flags)
-
-  # Strip our flags before passing through to the CPU compiler for files which
-  # are not -x rocm. We can't just pass 'leftover' because it also strips -x.
-  # We not only want to pass -x to the CPU compiler, but also keep it in its
-  # relative location in the argv list (the compiler is actually sensitive to
-  # this).
-  cpu_compiler_flags = [flag for flag in sys.argv[1:]
-                             if not flag.startswith(('--rocm_log'))]
-
-  # XXX: SE codes need to be built with gcc, but need this macro defined
-  cpu_compiler_flags.append("-D__HIP_PLATFORM_HCC__")
-
-  return subprocess.call([CPU_COMPILER] + cpu_compiler_flags)
+  else:
+    # compilation for host objects
+
+    # Strip our flags before passing through to the CPU compiler for files which
+    # are not -x rocm. We can't just pass 'leftover' because it also strips -x.
+    # We not only want to pass -x to the CPU compiler, but also keep it in its
+    # relative location in the argv list (the compiler is actually sensitive to
+    # this).
+    cpu_compiler_flags = [flag for flag in sys.argv[1:]
+                               if not flag.startswith(('--rocm_log'))]
+
+    # XXX: SE codes need to be built with gcc, but need this macro defined
+    cpu_compiler_flags.append("-D__HIP_PLATFORM_HCC__")
+    if VERBOSE: print(' '.join([CPU_COMPILER] + cpu_compiler_flags))
+    return subprocess.call([CPU_COMPILER] + cpu_compiler_flags)
 
 if __name__ == '__main__':
   sys.exit(main())
diff --git a/third_party/gpus/crosstool/remote.BUILD.tpl b/third_party/gpus/crosstool/remote.BUILD.tpl
deleted file mode 100644
index b2316331db257a39086bdd5ca02b5ca6848cebcb..0000000000000000000000000000000000000000
--- a/third_party/gpus/crosstool/remote.BUILD.tpl
+++ /dev/null
@@ -1,10 +0,0 @@
-# Description:
-#   Template for crosstool Build file to use a pre-generated config.
-licenses(["restricted"])
-
-package(default_visibility = ["//visibility:public"])
-
-alias(
-    name = "toolchain",
-    actual = "%{remote_cuda_repo}:toolchain",
-)
diff --git a/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.bat.tpl b/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.bat.tpl
deleted file mode 100644
index 8f8fb3e4231bf1b689cf9b21c53e990d5b9ee354..0000000000000000000000000000000000000000
--- a/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.bat.tpl
+++ /dev/null
@@ -1,20 +0,0 @@
-:: Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-::
-:: Licensed under the Apache License, Version 2.0 (the "License");
-:: you may not use this file except in compliance with the License.
-:: You may obtain a copy of the License at
-::
-::     http://www.apache.org/licenses/LICENSE-2.0
-::
-:: Unless required by applicable law or agreed to in writing, software
-:: distributed under the License is distributed on an "AS IS" BASIS,
-:: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-:: See the License for the specific language governing permissions and
-:: limitations under the License.
-:: =============================================================================
-
-:: Invoke msvc_wrapper_for_nvcc.py, which is located in the same directory.
-@echo OFF
-set arg0=%~0
-for %%F in ("%arg0%") do set DRIVER_BIN=%%~dpF
-"%{python_binary}" -B "%DRIVER_BIN%\msvc_wrapper_for_nvcc.py" %*
diff --git a/third_party/gpus/cuda/BUILD.tpl b/third_party/gpus/cuda/BUILD.tpl
index f6b497f813185f82108de470ae39fac60d5d9f34..1921ef7c1fa0f13d336d3dae9e0eddf59ae25b44 100644
--- a/third_party/gpus/cuda/BUILD.tpl
+++ b/third_party/gpus/cuda/BUILD.tpl
@@ -202,4 +202,4 @@ cc_library(
     visibility = ["//visibility:public"],
 )
 
-%{cuda_include_genrules}
+%{copy_rules}
diff --git a/third_party/gpus/cuda/BUILD.windows.tpl b/third_party/gpus/cuda/BUILD.windows.tpl
index 325d18b9cb8a7c7c18c3df9e0630e67a9a28a937..3ed4fd415c33d3719307e3520084956f44430b0b 100644
--- a/third_party/gpus/cuda/BUILD.windows.tpl
+++ b/third_party/gpus/cuda/BUILD.windows.tpl
@@ -161,4 +161,4 @@ cc_library(
     visibility = ["//visibility:public"],
 )
 
-%{cuda_include_genrules}
+%{copy_rules}
diff --git a/third_party/gpus/cuda/remote.BUILD.tpl b/third_party/gpus/cuda/remote.BUILD.tpl
deleted file mode 100644
index 100c7bb7c41bd3f2a4e7e0eba865573d30422b45..0000000000000000000000000000000000000000
--- a/third_party/gpus/cuda/remote.BUILD.tpl
+++ /dev/null
@@ -1,110 +0,0 @@
-# Description:
-#   Template for cuda Build file to use a pre-generated config.
-licenses(["restricted"])  # MPL2, portions GPL v3, LGPL v3, BSD-like
-
-package(default_visibility = ["//visibility:public"])
-
-config_setting(
-    name = "using_nvcc",
-    values = {
-        "define": "using_cuda_nvcc=true",
-    },
-)
-
-config_setting(
-    name = "using_clang",
-    values = {
-        "define": "using_cuda_clang=true",
-    },
-)
-
-# Equivalent to using_clang && -c opt.
-config_setting(
-    name = "using_clang_opt",
-    values = {
-        "define": "using_cuda_clang=true",
-        "compilation_mode": "opt",
-    },
-)
-
-config_setting(
-    name = "darwin",
-    values = {"cpu": "darwin"},
-    visibility = ["//visibility:public"],
-)
-
-config_setting(
-    name = "freebsd",
-    values = {"cpu": "freebsd"},
-    visibility = ["//visibility:public"],
-)
-
-alias(
-    name = "cuda_headers",
-    actual = "%{remote_cuda_repo}/cuda:cuda_headers",
-)
-
-alias(
-    name = "cudart_static",
-    actual = "%{remote_cuda_repo}/cuda:cudart_static",
-)
-
-alias(
-    name = "cuda_driver",
-    actual = "%{remote_cuda_repo}/cuda:cuda_driver",
-)
-
-alias(
-    name = "cudart",
-    actual = "%{remote_cuda_repo}/cuda:cudart",
-)
-
-alias(
-    name = "cublas",
-    actual = "%{remote_cuda_repo}/cuda:cublas",
-)
-
-alias(
-    name = "cusolver",
-    actual = "%{remote_cuda_repo}/cuda:cusolver",
-)
-
-alias(
-    name = "cudnn",
-    actual = "%{remote_cuda_repo}/cuda:cudnn",
-)
-
-alias(
-    name = "cudnn_header",
-    actual = "%{remote_cuda_repo}/cuda:cudnn_header",
-)
-
-alias(
-    name = "cufft",
-    actual = "%{remote_cuda_repo}/cuda:cufft",
-)
-
-alias(
-    name = "curand",
-    actual = "%{remote_cuda_repo}/cuda:curand",
-)
-
-alias(
-    name = "cuda",
-    actual = "%{remote_cuda_repo}/cuda:cuda",
-)
-
-alias(
-    name = "cupti_headers",
-    actual = "%{remote_cuda_repo}/cuda:cupti_headers",
-)
-
-alias(
-    name = "cupti_dsos",
-    actual = "%{remote_cuda_repo}/cuda:cupti_dsos",
-)
-
-alias(
-    name = "libdevice_root",
-    actual = "%{remote_cuda_repo}/cuda:libdevice_root",
-)
diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl
index 03c67bcb3d75aca19bcad8b824d79283193dc115..f13aacb93d59a4a61c8a1734385962b94e8a59a1 100644
--- a/third_party/gpus/cuda_configure.bzl
+++ b/third_party/gpus/cuda_configure.bzl
@@ -126,141 +126,142 @@ load(
 )
 
 def _get_python_bin(repository_ctx):
-  """Gets the python bin path."""
-  python_bin = repository_ctx.os.environ.get(_PYTHON_BIN_PATH)
-  if python_bin != None:
-    return python_bin
-  python_bin_name = "python.exe" if _is_windows(repository_ctx) else "python"
-  python_bin_path = repository_ctx.which(python_bin_name)
-  if python_bin_path != None:
-    return str(python_bin_path)
-  auto_configure_fail(
-      "Cannot find python in PATH, please make sure " +
-      "python is installed and add its directory in PATH, or --define " +
-      "%s='/something/else'.\nPATH=%s" % (
-          _PYTHON_BIN_PATH,
-          repository_ctx.os.environ.get("PATH", ""),
-      ))
-
+    """Gets the python bin path."""
+    python_bin = repository_ctx.os.environ.get(_PYTHON_BIN_PATH)
+    if python_bin != None:
+        return python_bin
+    python_bin_name = "python.exe" if _is_windows(repository_ctx) else "python"
+    python_bin_path = repository_ctx.which(python_bin_name)
+    if python_bin_path != None:
+        return str(python_bin_path)
+    auto_configure_fail(
+        "Cannot find python in PATH, please make sure " +
+        "python is installed and add its directory in PATH, or --define " +
+        "%s='/something/else'.\nPATH=%s" % (
+            _PYTHON_BIN_PATH,
+            repository_ctx.os.environ.get("PATH", ""),
+        ),
+    )
 
 def _get_nvcc_tmp_dir_for_windows(repository_ctx):
-  """Return the tmp directory for nvcc to generate intermediate source files."""
-  escaped_tmp_dir = escape_string(
-      get_env_var(repository_ctx, "TMP", "C:\\Windows\\Temp").replace(
-          "\\", "\\\\"),)
-  return escaped_tmp_dir + "\\\\nvcc_inter_files_tmp_dir"
-
+    """Return the tmp directory for nvcc to generate intermediate source files."""
+    escaped_tmp_dir = escape_string(
+        get_env_var(repository_ctx, "TMP", "C:\\Windows\\Temp").replace(
+            "\\",
+            "\\\\",
+        ),
+    )
+    return escaped_tmp_dir + "\\\\nvcc_inter_files_tmp_dir"
 
 def _get_msvc_compiler(repository_ctx):
-  vc_path = find_vc_path(repository_ctx)
-  return find_msvc_tool(repository_ctx, vc_path, "cl.exe").replace("\\", "/")
-
+    vc_path = find_vc_path(repository_ctx)
+    return find_msvc_tool(repository_ctx, vc_path, "cl.exe").replace("\\", "/")
 
 def _get_win_cuda_defines(repository_ctx):
-  """Return CROSSTOOL defines for Windows"""
+    """Return CROSSTOOL defines for Windows"""
+
+    # If we are not on Windows, return empty vaules for Windows specific fields.
+    # This ensures the CROSSTOOL file parser is happy.
+    if not _is_windows(repository_ctx):
+        return {
+            "%{msvc_env_tmp}": "",
+            "%{msvc_env_path}": "",
+            "%{msvc_env_include}": "",
+            "%{msvc_env_lib}": "",
+            "%{msvc_cl_path}": "",
+            "%{msvc_ml_path}": "",
+            "%{msvc_link_path}": "",
+            "%{msvc_lib_path}": "",
+            "%{cxx_builtin_include_directory}": "",
+        }
+
+    vc_path = find_vc_path(repository_ctx)
+    if not vc_path:
+        auto_configure_fail(
+            "Visual C++ build tools not found on your machine." +
+            "Please check your installation following https://docs.bazel.build/versions/master/windows.html#using",
+        )
+        return {}
+
+    env = setup_vc_env_vars(repository_ctx, vc_path)
+    escaped_paths = escape_string(env["PATH"])
+    escaped_include_paths = escape_string(env["INCLUDE"])
+    escaped_lib_paths = escape_string(env["LIB"])
+    escaped_tmp_dir = escape_string(
+        get_env_var(repository_ctx, "TMP", "C:\\Windows\\Temp").replace(
+            "\\",
+            "\\\\",
+        ),
+    )
+
+    msvc_cl_path = _get_python_bin(repository_ctx)
+    msvc_ml_path = find_msvc_tool(repository_ctx, vc_path, "ml64.exe").replace(
+        "\\",
+        "/",
+    )
+    msvc_link_path = find_msvc_tool(repository_ctx, vc_path, "link.exe").replace(
+        "\\",
+        "/",
+    )
+    msvc_lib_path = find_msvc_tool(repository_ctx, vc_path, "lib.exe").replace(
+        "\\",
+        "/",
+    )
+
+    # nvcc will generate some temporary source files under %{nvcc_tmp_dir}
+    # The generated files are guranteed to have unique name, so they can share the same tmp directory
+    escaped_cxx_include_directories = [
+        "cxx_builtin_include_directory: \"%s\"" %
+        _get_nvcc_tmp_dir_for_windows(repository_ctx),
+    ]
+    for path in escaped_include_paths.split(";"):
+        if path:
+            escaped_cxx_include_directories.append(
+                "cxx_builtin_include_directory: \"%s\"" % path,
+            )
 
-  # If we are not on Windows, return empty vaules for Windows specific fields.
-  # This ensures the CROSSTOOL file parser is happy.
-  if not _is_windows(repository_ctx):
     return {
-        "%{msvc_env_tmp}": "",
-        "%{msvc_env_path}": "",
-        "%{msvc_env_include}": "",
-        "%{msvc_env_lib}": "",
-        "%{msvc_cl_path}": "",
-        "%{msvc_ml_path}": "",
-        "%{msvc_link_path}": "",
-        "%{msvc_lib_path}": "",
-        "%{cxx_builtin_include_directory}": "",
+        "%{msvc_env_tmp}": escaped_tmp_dir,
+        "%{msvc_env_path}": escaped_paths,
+        "%{msvc_env_include}": escaped_include_paths,
+        "%{msvc_env_lib}": escaped_lib_paths,
+        "%{msvc_cl_path}": msvc_cl_path,
+        "%{msvc_ml_path}": msvc_ml_path,
+        "%{msvc_link_path}": msvc_link_path,
+        "%{msvc_lib_path}": msvc_lib_path,
+        "%{cxx_builtin_include_directory}": "\n".join(escaped_cxx_include_directories),
     }
 
-  vc_path = find_vc_path(repository_ctx)
-  if not vc_path:
-    auto_configure_fail(
-        "Visual C++ build tools not found on your machine." +
-        "Please check your installation following https://docs.bazel.build/versions/master/windows.html#using"
-    )
-    return {}
-
-  env = setup_vc_env_vars(repository_ctx, vc_path)
-  escaped_paths = escape_string(env["PATH"])
-  escaped_include_paths = escape_string(env["INCLUDE"])
-  escaped_lib_paths = escape_string(env["LIB"])
-  escaped_tmp_dir = escape_string(
-      get_env_var(repository_ctx, "TMP", "C:\\Windows\\Temp").replace(
-          "\\", "\\\\"),)
-
-  msvc_cl_path = "windows/msvc_wrapper_for_nvcc.bat"
-  msvc_ml_path = find_msvc_tool(repository_ctx, vc_path, "ml64.exe").replace(
-      "\\", "/")
-  msvc_link_path = find_msvc_tool(repository_ctx, vc_path, "link.exe").replace(
-      "\\", "/")
-  msvc_lib_path = find_msvc_tool(repository_ctx, vc_path, "lib.exe").replace(
-      "\\", "/")
-
-  # nvcc will generate some temporary source files under %{nvcc_tmp_dir}
-  # The generated files are guranteed to have unique name, so they can share the same tmp directory
-  escaped_cxx_include_directories = [
-      "cxx_builtin_include_directory: \"%s\"" %
-      _get_nvcc_tmp_dir_for_windows(repository_ctx)
-  ]
-  for path in escaped_include_paths.split(";"):
-    if path:
-      escaped_cxx_include_directories.append(
-          "cxx_builtin_include_directory: \"%s\"" % path)
-
-  return {
-      "%{msvc_env_tmp}":
-          escaped_tmp_dir,
-      "%{msvc_env_path}":
-          escaped_paths,
-      "%{msvc_env_include}":
-          escaped_include_paths,
-      "%{msvc_env_lib}":
-          escaped_lib_paths,
-      "%{msvc_cl_path}":
-          msvc_cl_path,
-      "%{msvc_ml_path}":
-          msvc_ml_path,
-      "%{msvc_link_path}":
-          msvc_link_path,
-      "%{msvc_lib_path}":
-          msvc_lib_path,
-      "%{cxx_builtin_include_directory}":
-          "\n".join(escaped_cxx_include_directories),
-  }
-
 # TODO(dzc): Once these functions have been factored out of Bazel's
 # cc_configure.bzl, load them from @bazel_tools instead.
 # BEGIN cc_configure common functions.
 def find_cc(repository_ctx):
-  """Find the C++ compiler."""
-  if _is_windows(repository_ctx):
-    return _get_msvc_compiler(repository_ctx)
-
-  if _use_cuda_clang(repository_ctx):
-    target_cc_name = "clang"
-    cc_path_envvar = _CLANG_CUDA_COMPILER_PATH
-    if _flag_enabled(repository_ctx, _TF_DOWNLOAD_CLANG):
-      return "extra_tools/bin/clang"
-  else:
-    target_cc_name = "gcc"
-    cc_path_envvar = _GCC_HOST_COMPILER_PATH
-  cc_name = target_cc_name
-
-  if cc_path_envvar in repository_ctx.os.environ:
-    cc_name_from_env = repository_ctx.os.environ[cc_path_envvar].strip()
-    if cc_name_from_env:
-      cc_name = cc_name_from_env
-  if cc_name.startswith("/"):
-    # Absolute path, maybe we should make this supported by our which function.
-    return cc_name
-  cc = repository_ctx.which(cc_name)
-  if cc == None:
-    fail(("Cannot find {}, either correct your path or set the {}" +
-          " environment variable").format(target_cc_name, cc_path_envvar))
-  return cc
-
+    """Find the C++ compiler."""
+    if _is_windows(repository_ctx):
+        return _get_msvc_compiler(repository_ctx)
+
+    if _use_cuda_clang(repository_ctx):
+        target_cc_name = "clang"
+        cc_path_envvar = _CLANG_CUDA_COMPILER_PATH
+        if _flag_enabled(repository_ctx, _TF_DOWNLOAD_CLANG):
+            return "extra_tools/bin/clang"
+    else:
+        target_cc_name = "gcc"
+        cc_path_envvar = _GCC_HOST_COMPILER_PATH
+    cc_name = target_cc_name
+
+    if cc_path_envvar in repository_ctx.os.environ:
+        cc_name_from_env = repository_ctx.os.environ[cc_path_envvar].strip()
+        if cc_name_from_env:
+            cc_name = cc_name_from_env
+    if cc_name.startswith("/"):
+        # Absolute path, maybe we should make this supported by our which function.
+        return cc_name
+    cc = repository_ctx.which(cc_name)
+    if cc == None:
+        fail(("Cannot find {}, either correct your path or set the {}" +
+              " environment variable").format(target_cc_name, cc_path_envvar))
+    return cc
 
 _INC_DIR_MARKER_BEGIN = "#include <...>"
 
@@ -269,729 +270,743 @@ _OSX_FRAMEWORK_SUFFIX = " (framework directory)"
 _OSX_FRAMEWORK_SUFFIX_LEN = len(_OSX_FRAMEWORK_SUFFIX)
 
 def _cxx_inc_convert(path):
-  """Convert path returned by cc -E xc++ in a complete path."""
-  path = path.strip()
-  if path.endswith(_OSX_FRAMEWORK_SUFFIX):
-    path = path[:-_OSX_FRAMEWORK_SUFFIX_LEN].strip()
-  return path
-
+    """Convert path returned by cc -E xc++ in a complete path."""
+    path = path.strip()
+    if path.endswith(_OSX_FRAMEWORK_SUFFIX):
+        path = path[:-_OSX_FRAMEWORK_SUFFIX_LEN].strip()
+    return path
 
 def _normalize_include_path(repository_ctx, path):
-  """Normalizes include paths before writing them to the crosstool.
-
-    If path points inside the 'crosstool' folder of the repository, a relative
-    path is returned.
-    If path points outside the 'crosstool' folder, an absolute path is returned.
-    """
-  path = str(repository_ctx.path(path))
-  crosstool_folder = str(repository_ctx.path(".").get_child("crosstool"))
+    """Normalizes include paths before writing them to the crosstool.
 
-  if path.startswith(crosstool_folder):
-    # We drop the path to "$REPO/crosstool" and a trailing path separator.
-    return path[len(crosstool_folder) + 1:]
-  return path
+      If path points inside the 'crosstool' folder of the repository, a relative
+      path is returned.
+      If path points outside the 'crosstool' folder, an absolute path is returned.
+      """
+    path = str(repository_ctx.path(path))
+    crosstool_folder = str(repository_ctx.path(".").get_child("crosstool"))
 
+    if path.startswith(crosstool_folder):
+        # We drop the path to "$REPO/crosstool" and a trailing path separator.
+        return path[len(crosstool_folder) + 1:]
+    return path
 
 def _get_cxx_inc_directories_impl(repository_ctx, cc, lang_is_cpp):
-  """Compute the list of default C or C++ include directories."""
-  if lang_is_cpp:
-    lang = "c++"
-  else:
-    lang = "c"
-  result = repository_ctx.execute([cc, "-E", "-x" + lang, "-", "-v"])
-  index1 = result.stderr.find(_INC_DIR_MARKER_BEGIN)
-  if index1 == -1:
-    return []
-  index1 = result.stderr.find("\n", index1)
-  if index1 == -1:
-    return []
-  index2 = result.stderr.rfind("\n ")
-  if index2 == -1 or index2 < index1:
-    return []
-  index2 = result.stderr.find("\n", index2 + 1)
-  if index2 == -1:
-    inc_dirs = result.stderr[index1 + 1:]
-  else:
-    inc_dirs = result.stderr[index1 + 1:index2].strip()
-
-  return [
-      _normalize_include_path(repository_ctx, _cxx_inc_convert(p))
-      for p in inc_dirs.split("\n")
-  ]
+    """Compute the list of default C or C++ include directories."""
+    if lang_is_cpp:
+        lang = "c++"
+    else:
+        lang = "c"
+    result = repository_ctx.execute([cc, "-E", "-x" + lang, "-", "-v"])
+    index1 = result.stderr.find(_INC_DIR_MARKER_BEGIN)
+    if index1 == -1:
+        return []
+    index1 = result.stderr.find("\n", index1)
+    if index1 == -1:
+        return []
+    index2 = result.stderr.rfind("\n ")
+    if index2 == -1 or index2 < index1:
+        return []
+    index2 = result.stderr.find("\n", index2 + 1)
+    if index2 == -1:
+        inc_dirs = result.stderr[index1 + 1:]
+    else:
+        inc_dirs = result.stderr[index1 + 1:index2].strip()
 
+    return [
+        _normalize_include_path(repository_ctx, _cxx_inc_convert(p))
+        for p in inc_dirs.split("\n")
+    ]
 
 def get_cxx_inc_directories(repository_ctx, cc):
-  """Compute the list of default C and C++ include directories."""
-
-  # For some reason `clang -xc` sometimes returns include paths that are
-  # different from the ones from `clang -xc++`. (Symlink and a dir)
-  # So we run the compiler with both `-xc` and `-xc++` and merge resulting lists
-  includes_cpp = _get_cxx_inc_directories_impl(repository_ctx, cc, True)
-  includes_c = _get_cxx_inc_directories_impl(repository_ctx, cc, False)
-
-  includes_cpp_set = depset(includes_cpp)
-  return includes_cpp + [
-      inc for inc in includes_c if inc not in includes_cpp_set
-  ]
-
+    """Compute the list of default C and C++ include directories."""
+
+    # For some reason `clang -xc` sometimes returns include paths that are
+    # different from the ones from `clang -xc++`. (Symlink and a dir)
+    # So we run the compiler with both `-xc` and `-xc++` and merge resulting lists
+    includes_cpp = _get_cxx_inc_directories_impl(repository_ctx, cc, True)
+    includes_c = _get_cxx_inc_directories_impl(repository_ctx, cc, False)
+
+    includes_cpp_set = depset(includes_cpp)
+    return includes_cpp + [
+        inc
+        for inc in includes_c
+        if inc not in includes_cpp_set
+    ]
 
 def auto_configure_fail(msg):
-  """Output failure message when cuda configuration fails."""
-  red = "\033[0;31m"
-  no_color = "\033[0m"
-  fail("\n%sCuda Configuration Error:%s %s\n" % (red, no_color, msg))
+    """Output failure message when cuda configuration fails."""
+    red = "\033[0;31m"
+    no_color = "\033[0m"
+    fail("\n%sCuda Configuration Error:%s %s\n" % (red, no_color, msg))
 
 # END cc_configure common functions (see TODO above).
 
 def _host_compiler_includes(repository_ctx, cc):
-  """Generates the cxx_builtin_include_directory entries for gcc inc dirs.
-
-    Args:
-      repository_ctx: The repository context.
-      cc: The path to the gcc host compiler.
-
-    Returns:
-      A string containing the cxx_builtin_include_directory for each of the gcc
-      host compiler include directories, which can be added to the CROSSTOOL
-      file.
-    """
-  inc_dirs = get_cxx_inc_directories(repository_ctx, cc)
-  inc_entries = []
-  for inc_dir in inc_dirs:
-    inc_entries.append("  cxx_builtin_include_directory: \"%s\"" % inc_dir)
-  return "\n".join(inc_entries)
-
+    """Generates the cxx_builtin_include_directory entries for gcc inc dirs.
+
+      Args:
+        repository_ctx: The repository context.
+        cc: The path to the gcc host compiler.
+
+      Returns:
+        A string containing the cxx_builtin_include_directory for each of the gcc
+        host compiler include directories, which can be added to the CROSSTOOL
+        file.
+      """
+    inc_dirs = get_cxx_inc_directories(repository_ctx, cc)
+    inc_entries = []
+    for inc_dir in inc_dirs:
+        inc_entries.append("  cxx_builtin_include_directory: \"%s\"" % inc_dir)
+    return "\n".join(inc_entries)
 
 def _cuda_include_path(repository_ctx, cuda_config):
-  """Generates the cxx_builtin_include_directory entries for cuda inc dirs.
-
-    Args:
-      repository_ctx: The repository context.
-      cc: The path to the gcc host compiler.
-
-    Returns:
-      A string containing the cxx_builtin_include_directory for each of the gcc
-      host compiler include directories, which can be added to the CROSSTOOL
-      file.
-    """
-  nvcc_path = repository_ctx.path("%s/bin/nvcc%s" % (
-      cuda_config.cuda_toolkit_path,
-      ".exe" if cuda_config.cpu_value == "Windows" else "",
-  ))
-  result = repository_ctx.execute([
-      nvcc_path,
-      "-v",
-      "/dev/null",
-      "-o",
-      "/dev/null",
-  ])
-  target_dir = ""
-  for one_line in result.stderr.splitlines():
-    if one_line.startswith("#$ _TARGET_DIR_="):
-      target_dir = (
-          cuda_config.cuda_toolkit_path + "/" + one_line.replace(
-              "#$ _TARGET_DIR_=", "") + "/include")
-  inc_entries = []
-  if target_dir != "":
-    inc_entries.append("  cxx_builtin_include_directory: \"%s\"" % target_dir)
-  default_include = cuda_config.cuda_toolkit_path + "/include"
-  inc_entries.append(
-      "  cxx_builtin_include_directory: \"%s\"" % default_include)
-  return "\n".join(inc_entries)
-
-
-def _enable_cuda(repository_ctx):
-  if "TF_NEED_CUDA" in repository_ctx.os.environ:
-    enable_cuda = repository_ctx.os.environ["TF_NEED_CUDA"].strip()
-    return enable_cuda == "1"
-  return False
+    """Generates the cxx_builtin_include_directory entries for cuda inc dirs.
+
+      Args:
+        repository_ctx: The repository context.
+        cc: The path to the gcc host compiler.
+
+      Returns:
+        A string containing the cxx_builtin_include_directory for each of the gcc
+        host compiler include directories, which can be added to the CROSSTOOL
+        file.
+      """
+    nvcc_path = repository_ctx.path("%s/bin/nvcc%s" % (
+        cuda_config.cuda_toolkit_path,
+        ".exe" if cuda_config.cpu_value == "Windows" else "",
+    ))
+    result = repository_ctx.execute([
+        nvcc_path,
+        "-v",
+        "/dev/null",
+        "-o",
+        "/dev/null",
+    ])
+    target_dir = ""
+    for one_line in result.stderr.splitlines():
+        if one_line.startswith("#$ _TARGET_DIR_="):
+            target_dir = (
+                cuda_config.cuda_toolkit_path + "/" + one_line.replace(
+                    "#$ _TARGET_DIR_=",
+                    "",
+                ) + "/include"
+            )
+    inc_entries = []
+    if target_dir != "":
+        inc_entries.append("  cxx_builtin_include_directory: \"%s\"" % target_dir)
+    default_include = cuda_config.cuda_toolkit_path + "/include"
+    inc_entries.append(
+        "  cxx_builtin_include_directory: \"%s\"" % default_include,
+    )
+    return "\n".join(inc_entries)
 
+def enable_cuda(repository_ctx):
+    if "TF_NEED_CUDA" in repository_ctx.os.environ:
+        enable_cuda = repository_ctx.os.environ["TF_NEED_CUDA"].strip()
+        return enable_cuda == "1"
+    return False
 
 def cuda_toolkit_path(repository_ctx):
-  """Finds the cuda toolkit directory.
+    """Finds the cuda toolkit directory.
 
-    Args:
-      repository_ctx: The repository context.
-
-    Returns:
-      A speculative real path of the cuda toolkit install directory.
-    """
-  cuda_toolkit_path = _DEFAULT_CUDA_TOOLKIT_PATH
-  if _CUDA_TOOLKIT_PATH in repository_ctx.os.environ:
-    cuda_toolkit_path = repository_ctx.os.environ[_CUDA_TOOLKIT_PATH].strip()
-  if not repository_ctx.path(cuda_toolkit_path).exists:
-    auto_configure_fail("Cannot find cuda toolkit path.")
-  return str(repository_ctx.path(cuda_toolkit_path).realpath)
+      Args:
+        repository_ctx: The repository context.
 
+      Returns:
+        A speculative real path of the cuda toolkit install directory.
+      """
+    cuda_toolkit_path = _DEFAULT_CUDA_TOOLKIT_PATH
+    if _CUDA_TOOLKIT_PATH in repository_ctx.os.environ:
+        cuda_toolkit_path = repository_ctx.os.environ[_CUDA_TOOLKIT_PATH].strip()
+    if not repository_ctx.path(cuda_toolkit_path).exists:
+        auto_configure_fail("Cannot find cuda toolkit path.")
+    return str(repository_ctx.path(cuda_toolkit_path).realpath)
 
 def _cudnn_install_basedir(repository_ctx):
-  """Finds the cudnn install directory."""
-  cudnn_install_path = _DEFAULT_CUDNN_INSTALL_PATH
-  if _CUDNN_INSTALL_PATH in repository_ctx.os.environ:
-    cudnn_install_path = repository_ctx.os.environ[_CUDNN_INSTALL_PATH].strip()
-  if not repository_ctx.path(cudnn_install_path).exists:
-    auto_configure_fail("Cannot find cudnn install path.")
-  return cudnn_install_path
-
+    """Finds the cudnn install directory."""
+    cudnn_install_path = _DEFAULT_CUDNN_INSTALL_PATH
+    if _CUDNN_INSTALL_PATH in repository_ctx.os.environ:
+        cudnn_install_path = repository_ctx.os.environ[_CUDNN_INSTALL_PATH].strip()
+    if not repository_ctx.path(cudnn_install_path).exists:
+        auto_configure_fail("Cannot find cudnn install path.")
+    return cudnn_install_path
 
 def matches_version(environ_version, detected_version):
-  """Checks whether the user-specified version matches the detected version.
-
-    This function performs a weak matching so that if the user specifies only
-    the
-    major or major and minor versions, the versions are still considered
-    matching
-    if the version parts match. To illustrate:
-
-        environ_version  detected_version  result
-        -----------------------------------------
-        5.1.3            5.1.3             True
-        5.1              5.1.3             True
-        5                5.1               True
-        5.1.3            5.1               False
-        5.2.3            5.1.3             False
-
-    Args:
-      environ_version: The version specified by the user via environment
-        variables.
-      detected_version: The version autodetected from the CUDA installation on
-        the system.
-    Returns: True if user-specified version matches detected version and False
-      otherwise.
-  """
-  environ_version_parts = environ_version.split(".")
-  detected_version_parts = detected_version.split(".")
-  if len(detected_version_parts) < len(environ_version_parts):
-    return False
-  for i, part in enumerate(detected_version_parts):
-    if i >= len(environ_version_parts):
-      break
-    if part != environ_version_parts[i]:
-      return False
-  return True
-
+    """Checks whether the user-specified version matches the detected version.
+
+      This function performs a weak matching so that if the user specifies only
+      the
+      major or major and minor versions, the versions are still considered
+      matching
+      if the version parts match. To illustrate:
+
+          environ_version  detected_version  result
+          -----------------------------------------
+          5.1.3            5.1.3             True
+          5.1              5.1.3             True
+          5                5.1               True
+          5.1.3            5.1               False
+          5.2.3            5.1.3             False
+
+      Args:
+        environ_version: The version specified by the user via environment
+          variables.
+        detected_version: The version autodetected from the CUDA installation on
+          the system.
+      Returns: True if user-specified version matches detected version and False
+        otherwise.
+    """
+    environ_version_parts = environ_version.split(".")
+    detected_version_parts = detected_version.split(".")
+    if len(detected_version_parts) < len(environ_version_parts):
+        return False
+    for i, part in enumerate(detected_version_parts):
+        if i >= len(environ_version_parts):
+            break
+        if part != environ_version_parts[i]:
+            return False
+    return True
 
 _NVCC_VERSION_PREFIX = "Cuda compilation tools, release "
 
 def _cuda_version(repository_ctx, cuda_toolkit_path, cpu_value):
-  """Detects the version of CUDA installed on the system.
-
-    Args:
-      repository_ctx: The repository context.
-      cuda_toolkit_path: The CUDA install directory.
-
-    Returns:
-      String containing the version of CUDA.
-    """
-
-  # Run nvcc --version and find the line containing the CUDA version.
-  nvcc_path = repository_ctx.path("%s/bin/nvcc%s" % (
-      cuda_toolkit_path,
-      ".exe" if cpu_value == "Windows" else "",
-  ))
-  if not nvcc_path.exists:
-    auto_configure_fail("Cannot find nvcc at %s" % str(nvcc_path))
-  result = repository_ctx.execute([str(nvcc_path), "--version"])
-  if result.stderr:
-    auto_configure_fail("Error running nvcc --version: %s" % result.stderr)
-  lines = result.stdout.splitlines()
-  version_line = lines[len(lines) - 1]
-  if version_line.find(_NVCC_VERSION_PREFIX) == -1:
-    auto_configure_fail(
-        "Could not parse CUDA version from nvcc --version. Got: %s" %
-        result.stdout,)
-
-  # Parse the CUDA version from the line containing the CUDA version.
-  prefix_removed = version_line.replace(_NVCC_VERSION_PREFIX, "")
-  parts = prefix_removed.split(",")
-  if len(parts) != 2 or len(parts[0]) < 2:
-    auto_configure_fail(
-        "Could not parse CUDA version from nvcc --version. Got: %s" %
-        result.stdout,)
-  full_version = parts[1].strip()
-  if full_version.startswith("V"):
-    full_version = full_version[1:]
-
-  # Check whether TF_CUDA_VERSION was set by the user and fail if it does not
-  # match the detected version.
-  environ_version = ""
-  if _TF_CUDA_VERSION in repository_ctx.os.environ:
-    environ_version = repository_ctx.os.environ[_TF_CUDA_VERSION].strip()
-  if environ_version and not matches_version(environ_version, full_version):
-    auto_configure_fail(
-        ("CUDA version detected from nvcc (%s) does not match " +
-         "TF_CUDA_VERSION (%s)") % (full_version, environ_version),)
-
-  # We only use the version consisting of the major and minor version numbers.
-  version_parts = full_version.split(".")
-  if len(version_parts) < 2:
-    auto_configure_fail("CUDA version detected from nvcc (%s) is incomplete.")
-  if cpu_value == "Windows":
-    version = "64_%s%s" % (version_parts[0], version_parts[1])
-  else:
-    version = "%s.%s" % (version_parts[0], version_parts[1])
-  return version
-
+    """Detects the version of CUDA installed on the system.
+
+      Args:
+        repository_ctx: The repository context.
+        cuda_toolkit_path: The CUDA install directory.
+
+      Returns:
+        String containing the version of CUDA.
+      """
+
+    # Run nvcc --version and find the line containing the CUDA version.
+    nvcc_path = repository_ctx.path("%s/bin/nvcc%s" % (
+        cuda_toolkit_path,
+        ".exe" if cpu_value == "Windows" else "",
+    ))
+    if not nvcc_path.exists:
+        auto_configure_fail("Cannot find nvcc at %s" % str(nvcc_path))
+    result = repository_ctx.execute([str(nvcc_path), "--version"])
+    if result.stderr:
+        auto_configure_fail("Error running nvcc --version: %s" % result.stderr)
+    lines = result.stdout.splitlines()
+    version_line = lines[len(lines) - 1]
+    if version_line.find(_NVCC_VERSION_PREFIX) == -1:
+        auto_configure_fail(
+            "Could not parse CUDA version from nvcc --version. Got: %s" %
+            result.stdout,
+        )
+
+    # Parse the CUDA version from the line containing the CUDA version.
+    prefix_removed = version_line.replace(_NVCC_VERSION_PREFIX, "")
+    parts = prefix_removed.split(",")
+    if len(parts) != 2 or len(parts[0]) < 2:
+        auto_configure_fail(
+            "Could not parse CUDA version from nvcc --version. Got: %s" %
+            result.stdout,
+        )
+    full_version = parts[1].strip()
+    if full_version.startswith("V"):
+        full_version = full_version[1:]
+
+    # Check whether TF_CUDA_VERSION was set by the user and fail if it does not
+    # match the detected version.
+    environ_version = ""
+    if _TF_CUDA_VERSION in repository_ctx.os.environ:
+        environ_version = repository_ctx.os.environ[_TF_CUDA_VERSION].strip()
+    if environ_version and not matches_version(environ_version, full_version):
+        auto_configure_fail(
+            ("CUDA version detected from nvcc (%s) does not match " +
+             "TF_CUDA_VERSION (%s)") % (full_version, environ_version),
+        )
+
+    # We only use the version consisting of the major and minor version numbers.
+    version_parts = full_version.split(".")
+    if len(version_parts) < 2:
+        auto_configure_fail("CUDA version detected from nvcc (%s) is incomplete.")
+    if cpu_value == "Windows":
+        version = "64_%s%s" % (version_parts[0], version_parts[1])
+    else:
+        version = "%s.%s" % (version_parts[0], version_parts[1])
+    return version
 
 _DEFINE_CUDNN_MAJOR = "#define CUDNN_MAJOR"
 _DEFINE_CUDNN_MINOR = "#define CUDNN_MINOR"
 _DEFINE_CUDNN_PATCHLEVEL = "#define CUDNN_PATCHLEVEL"
 
 def find_cuda_define(repository_ctx, header_dir, header_file, define):
-  """Returns the value of a #define in a header file.
-
-    Greps through a header file and returns the value of the specified #define.
-    If the #define is not found, then raise an error.
-
-    Args:
-      repository_ctx: The repository context.
-      header_dir: The directory containing the header file.
-      header_file: The header file name.
-      define: The #define to search for.
-
-    Returns:
-      The value of the #define found in the header.
-    """
-
-  # Confirm location of the header and grep for the line defining the macro.
-  h_path = repository_ctx.path("%s/%s" % (header_dir, header_file))
-  if not h_path.exists:
-    auto_configure_fail("Cannot find %s at %s" % (header_file, str(h_path)))
-  result = repository_ctx.execute(
-      # Grep one more lines as some #defines are splitted into two lines.
-      ["grep", "--color=never", "-A1", "-E", define,
-       str(h_path)],)
-  if result.stderr:
-    auto_configure_fail("Error reading %s: %s" % (str(h_path), result.stderr))
-
-  # Parse the version from the line defining the macro.
-  if result.stdout.find(define) == -1:
-    auto_configure_fail(
-        "Cannot find line containing '%s' in %s" % (define, h_path))
-
-  # Split results to lines
-  lines = result.stdout.split("\n")
-  num_lines = len(lines)
-  for l in range(num_lines):
-    line = lines[l]
-    if define in line:  # Find the line with define
-      version = line
-      if l != num_lines - 1 and line[-1] == "\\":  # Add next line, if multiline
-        version = version[:-1] + lines[l + 1]
-      break
-
-  # Remove any comments
-  version = version.split("//")[0]
-
-  # Remove define name
-  version = version.replace(define, "").strip()
-
-  # Remove the code after the version number.
-  version_end = version.find(" ")
-  if version_end != -1:
-    if version_end == 0:
-      auto_configure_fail(
-          "Cannot extract the version from line containing '%s' in %s" %
-          (define, str(h_path)),)
-    version = version[:version_end].strip()
-  return version
-
+    """Returns the value of a #define in a header file.
+
+      Greps through a header file and returns the value of the specified #define.
+      If the #define is not found, then raise an error.
+
+      Args:
+        repository_ctx: The repository context.
+        header_dir: The directory containing the header file.
+        header_file: The header file name.
+        define: The #define to search for.
+
+      Returns:
+        The value of the #define found in the header.
+      """
+
+    # Confirm location of the header and grep for the line defining the macro.
+    h_path = repository_ctx.path("%s/%s" % (header_dir, header_file))
+    if not h_path.exists:
+        auto_configure_fail("Cannot find %s at %s" % (header_file, str(h_path)))
+    result = repository_ctx.execute(
+        # Grep one more lines as some #defines are splitted into two lines.
+        [
+            "grep",
+            "--color=never",
+            "-A1",
+            "-E",
+            define,
+            str(h_path),
+        ],
+    )
+    if result.stderr:
+        auto_configure_fail("Error reading %s: %s" % (str(h_path), result.stderr))
+
+    # Parse the version from the line defining the macro.
+    if result.stdout.find(define) == -1:
+        auto_configure_fail(
+            "Cannot find line containing '%s' in %s" % (define, h_path),
+        )
+
+    # Split results to lines
+    lines = result.stdout.split("\n")
+    num_lines = len(lines)
+    for l in range(num_lines):
+        line = lines[l]
+        if define in line:  # Find the line with define
+            version = line
+            if l != num_lines - 1 and line[-1] == "\\":  # Add next line, if multiline
+                version = version[:-1] + lines[l + 1]
+            break
+
+    # Remove any comments
+    version = version.split("//")[0]
+
+    # Remove define name
+    version = version.replace(define, "").strip()
+
+    # Remove the code after the version number.
+    version_end = version.find(" ")
+    if version_end != -1:
+        if version_end == 0:
+            auto_configure_fail(
+                "Cannot extract the version from line containing '%s' in %s" %
+                (define, str(h_path)),
+            )
+        version = version[:version_end].strip()
+    return version
 
 def _cudnn_version(repository_ctx, cudnn_install_basedir, cpu_value):
-  """Detects the version of cuDNN installed on the system.
-
-    Args:
-      repository_ctx: The repository context.
-      cpu_value: The name of the host operating system.
-      cudnn_install_basedir: The cuDNN install directory.
+    """Detects the version of cuDNN installed on the system.
 
-    Returns:
-      A string containing the version of cuDNN.
-    """
-  cudnn_header_dir = _find_cudnn_header_dir(
-      repository_ctx,
-      cudnn_install_basedir,
-  )
-  major_version = find_cuda_define(
-      repository_ctx,
-      cudnn_header_dir,
-      "cudnn.h",
-      _DEFINE_CUDNN_MAJOR,
-  )
-  minor_version = find_cuda_define(
-      repository_ctx,
-      cudnn_header_dir,
-      "cudnn.h",
-      _DEFINE_CUDNN_MINOR,
-  )
-  patch_version = find_cuda_define(
-      repository_ctx,
-      cudnn_header_dir,
-      "cudnn.h",
-      _DEFINE_CUDNN_PATCHLEVEL,
-  )
-  full_version = "%s.%s.%s" % (major_version, minor_version, patch_version)
-
-  # Check whether TF_CUDNN_VERSION was set by the user and fail if it does not
-  # match the detected version.
-  environ_version = ""
-  if _TF_CUDNN_VERSION in repository_ctx.os.environ:
-    environ_version = repository_ctx.os.environ[_TF_CUDNN_VERSION].strip()
-  if environ_version and not matches_version(environ_version, full_version):
-    cudnn_h_path = repository_ctx.path(
-        "%s/include/cudnn.h" % cudnn_install_basedir)
-    auto_configure_fail(("cuDNN version detected from %s (%s) does not match " +
-                         "TF_CUDNN_VERSION (%s)") %
-                        (str(cudnn_h_path), full_version, environ_version),)
-
-  # We only use the major version since we use the libcudnn libraries that are
-  # only versioned with the major version (e.g. libcudnn.so.5).
-  version = major_version
-  if cpu_value == "Windows":
-    version = "64_" + version
-  return version
+      Args:
+        repository_ctx: The repository context.
+        cpu_value: The name of the host operating system.
+        cudnn_install_basedir: The cuDNN install directory.
 
+      Returns:
+        A string containing the version of cuDNN.
+      """
+    cudnn_header_dir = _find_cudnn_header_dir(
+        repository_ctx,
+        cudnn_install_basedir,
+    )
+    major_version = find_cuda_define(
+        repository_ctx,
+        cudnn_header_dir,
+        "cudnn.h",
+        _DEFINE_CUDNN_MAJOR,
+    )
+    minor_version = find_cuda_define(
+        repository_ctx,
+        cudnn_header_dir,
+        "cudnn.h",
+        _DEFINE_CUDNN_MINOR,
+    )
+    patch_version = find_cuda_define(
+        repository_ctx,
+        cudnn_header_dir,
+        "cudnn.h",
+        _DEFINE_CUDNN_PATCHLEVEL,
+    )
+    full_version = "%s.%s.%s" % (major_version, minor_version, patch_version)
+
+    # Check whether TF_CUDNN_VERSION was set by the user and fail if it does not
+    # match the detected version.
+    environ_version = ""
+    if _TF_CUDNN_VERSION in repository_ctx.os.environ:
+        environ_version = repository_ctx.os.environ[_TF_CUDNN_VERSION].strip()
+    if environ_version and not matches_version(environ_version, full_version):
+        cudnn_h_path = repository_ctx.path(
+            "%s/include/cudnn.h" % cudnn_install_basedir,
+        )
+        auto_configure_fail(("cuDNN version detected from %s (%s) does not match " +
+                             "TF_CUDNN_VERSION (%s)") %
+                            (str(cudnn_h_path), full_version, environ_version))
+
+    # Only use the major version to match the SONAME of the library.
+    version = major_version
+    if cpu_value == "Windows":
+        version = "64_" + version
+    return version
 
 def compute_capabilities(repository_ctx):
-  """Returns a list of strings representing cuda compute capabilities."""
-  if _TF_CUDA_COMPUTE_CAPABILITIES not in repository_ctx.os.environ:
-    return _DEFAULT_CUDA_COMPUTE_CAPABILITIES
-  capabilities_str = repository_ctx.os.environ[_TF_CUDA_COMPUTE_CAPABILITIES]
-  capabilities = capabilities_str.split(",")
-  for capability in capabilities:
-    # Workaround for Skylark's lack of support for regex. This check should
-    # be equivalent to checking:
-    #     if re.match("[0-9]+.[0-9]+", capability) == None:
-    parts = capability.split(".")
-    if len(parts) != 2 or not parts[0].isdigit() or not parts[1].isdigit():
-      auto_configure_fail("Invalid compute capability: %s" % capability)
-  return capabilities
-
+    """Returns a list of strings representing cuda compute capabilities."""
+    if _TF_CUDA_COMPUTE_CAPABILITIES not in repository_ctx.os.environ:
+        return _DEFAULT_CUDA_COMPUTE_CAPABILITIES
+    capabilities_str = repository_ctx.os.environ[_TF_CUDA_COMPUTE_CAPABILITIES]
+    capabilities = capabilities_str.split(",")
+    for capability in capabilities:
+        # Workaround for Skylark's lack of support for regex. This check should
+        # be equivalent to checking:
+        #     if re.match("[0-9]+.[0-9]+", capability) == None:
+        parts = capability.split(".")
+        if len(parts) != 2 or not parts[0].isdigit() or not parts[1].isdigit():
+            auto_configure_fail("Invalid compute capability: %s" % capability)
+    return capabilities
 
 def get_cpu_value(repository_ctx):
-  """Returns the name of the host operating system.
-
-    Args:
-      repository_ctx: The repository context.
-
-    Returns:
-      A string containing the name of the host operating system.
-    """
-  os_name = repository_ctx.os.name.lower()
-  if os_name.startswith("mac os"):
-    return "Darwin"
-  if os_name.find("windows") != -1:
-    return "Windows"
-  result = repository_ctx.execute(["uname", "-s"])
-  return result.stdout.strip()
-
+    """Returns the name of the host operating system.
+
+      Args:
+        repository_ctx: The repository context.
+
+      Returns:
+        A string containing the name of the host operating system.
+      """
+    os_name = repository_ctx.os.name.lower()
+    if os_name.startswith("mac os"):
+        return "Darwin"
+    if os_name.find("windows") != -1:
+        return "Windows"
+    result = repository_ctx.execute(["uname", "-s"])
+    return result.stdout.strip()
 
 def _is_windows(repository_ctx):
-  """Returns true if the host operating system is windows."""
-  return get_cpu_value(repository_ctx) == "Windows"
+    """Returns true if the host operating system is windows."""
+    return get_cpu_value(repository_ctx) == "Windows"
 
+def lib_name(base_name, cpu_value, version = None, static = False):
+    """Constructs the platform-specific name of a library.
 
-def _lib_name(lib, cpu_value, version = "", static = False):
-  """Constructs the platform-specific name of a library.
-
-    Args:
-      lib: The name of the library, such as "cudart"
-      cpu_value: The name of the host operating system.
-      version: The version of the library.
-      static: True the library is static or False if it is a shared object.
+      Args:
+        base_name: The name of the library, such as "cudart"
+        cpu_value: The name of the host operating system.
+        version: The version of the library.
+        static: True the library is static or False if it is a shared object.
+
+      Returns:
+        The platform-specific name of the library.
+      """
+    version = "" if not version else "." + version
+    if cpu_value in ("Linux", "FreeBSD"):
+        if static:
+            return "lib%s.a" % base_name
+        return "lib%s.so%s" % (base_name, version)
+    elif cpu_value == "Windows":
+        return "%s.lib" % base_name
+    elif cpu_value == "Darwin":
+        if static:
+            return "lib%s.a" % base_name
+        return "lib%s%s.dylib" % (base_name, version)
+    else:
+        auto_configure_fail("Invalid cpu_value: %s" % cpu_value)
 
-    Returns:
-      The platform-specific name of the library.
+def find_lib(repository_ctx, paths, check_soname = True):
     """
-  if cpu_value in ("Linux", "FreeBSD"):
-    if static:
-      return "lib%s.a" % lib
-    else:
-      if version:
-        version = ".%s" % version
-      return "lib%s.so%s" % (lib, version)
-  elif cpu_value == "Windows":
-    return "%s.lib" % lib
-  elif cpu_value == "Darwin":
-    if static:
-      return "lib%s.a" % lib
-    elif version:
-      version = ".%s" % version
-    return "lib%s%s.dylib" % (lib, version)
-  else:
-    auto_configure_fail("Invalid cpu_value: %s" % cpu_value)
+      Finds a library among a list of potential paths.
 
+      Args:
+        paths: List of paths to inspect.
+
+      Returns:
+        Returns the first path in paths that exist.
+    """
+    objdump = repository_ctx.which("objdump")
+    mismatches = []
+    for path in [repository_ctx.path(path) for path in paths]:
+        if not path.exists:
+            continue
+        if check_soname and objdump != None:
+            output = repository_ctx.execute([objdump, "-p", str(path)]).stdout
+            output = [line for line in output.splitlines() if "SONAME" in line]
+            sonames = [line.strip().split(" ")[-1] for line in output]
+            if not any([soname == path.basename for soname in sonames]):
+                mismatches.append(str(path))
+                continue
+        return path
+    if mismatches:
+        auto_configure_fail(
+            "None of the libraries match their SONAME: " + ", ".join(mismatches),
+        )
+    auto_configure_fail("No library found under: " + ", ".join(paths))
 
 def _find_cuda_lib(
         lib,
         repository_ctx,
         cpu_value,
         basedir,
-        version = "",
+        version,
         static = False):
-  """Finds the given CUDA or cuDNN library on the system.
-
-    Args:
-      lib: The name of the library, such as "cudart"
-      repository_ctx: The repository context.
-      cpu_value: The name of the host operating system.
-      basedir: The install directory of CUDA or cuDNN.
-      version: The version of the library.
-      static: True if static library, False if shared object.
-
-    Returns:
-      Returns a struct with the following fields:
-        file_name: The basename of the library found on the system.
-        path: The full path to the library.
-    """
-  file_name = _lib_name(lib, cpu_value, version, static)
-  for relative_path in CUDA_LIB_PATHS:
-    path = repository_ctx.path("%s/%s%s" % (basedir, relative_path, file_name))
-    if path.exists:
-      return struct(file_name=file_name, path=str(path.realpath))
-  auto_configure_fail("Cannot find cuda library %s" % file_name)
+    """Finds the given CUDA or cuDNN library on the system.
 
+      Args:
+        lib: The name of the library, such as "cudart"
+        repository_ctx: The repository context.
+        cpu_value: The name of the host operating system.
+        basedir: The install directory of CUDA or cuDNN.
+        version: The version of the library.
+        static: True if static library, False if shared object.
+
+      Returns:
+        Returns the path to the library.
+      """
+    file_name = lib_name(lib, cpu_value, version, static)
+    return find_lib(repository_ctx, [
+        "%s/%s%s" % (basedir, path, file_name)
+        for path in CUDA_LIB_PATHS
+    ], check_soname = version and not static)
 
 def _find_cupti_header_dir(repository_ctx, cuda_config):
-  """Returns the path to the directory containing cupti.h
-
-    On most systems, the cupti library is not installed in the same directory as
-    the other CUDA libraries but rather in a special extras/CUPTI directory.
-
-    Args:
-      repository_ctx: The repository context.
-      cuda_config: The CUDA config as returned by _get_cuda_config
-
-    Returns:
-      The path of the directory containing the cupti header.
-    """
-  cuda_toolkit_path = cuda_config.cuda_toolkit_path
-  for relative_path in CUPTI_HEADER_PATHS:
-    if repository_ctx.path(
-        "%s/%scupti.h" % (cuda_toolkit_path, relative_path)).exists:
-      return ("%s/%s" % (cuda_toolkit_path, relative_path))[:-1]
-  auto_configure_fail("Cannot find cupti.h under %s" % ", ".join(
-      [cuda_toolkit_path + "/" + s for s in CUPTI_HEADER_PATHS]))
-
+    """Returns the path to the directory containing cupti.h
+
+      On most systems, the cupti library is not installed in the same directory as
+      the other CUDA libraries but rather in a special extras/CUPTI directory.
+
+      Args:
+        repository_ctx: The repository context.
+        cuda_config: The CUDA config as returned by _get_cuda_config
+
+      Returns:
+        The path of the directory containing the cupti header.
+      """
+    cuda_toolkit_path = cuda_config.cuda_toolkit_path
+    for relative_path in CUPTI_HEADER_PATHS:
+        if repository_ctx.path(
+            "%s/%scupti.h" % (cuda_toolkit_path, relative_path),
+        ).exists:
+            return ("%s/%s" % (cuda_toolkit_path, relative_path))[:-1]
+    auto_configure_fail("Cannot find cupti.h under %s" % ", ".join(
+        [cuda_toolkit_path + "/" + s for s in CUPTI_HEADER_PATHS],
+    ))
 
 def _find_cupti_lib(repository_ctx, cuda_config):
-  """Finds the cupti library on the system.
-
-    On most systems, the cupti library is not installed in the same directory as
-    the other CUDA libraries but rather in a special extras/CUPTI directory.
-
-    Args:
-      repository_ctx: The repository context.
-      cuda_config: The cuda configuration as returned by _get_cuda_config.
-
-    Returns:
-      Returns a struct with the following fields:
-        file_name: The basename of the library found on the system.
-        path: The full path to the library.
-    """
-  file_name = _lib_name(
-      "cupti",
-      cuda_config.cpu_value,
-      cuda_config.cuda_version,
-  )
-  cuda_toolkit_path = cuda_config.cuda_toolkit_path
-  for relative_path in CUPTI_LIB_PATHS:
-    path = repository_ctx.path(
-        "%s/%s%s" % (cuda_toolkit_path, relative_path, file_name),)
-    if path.exists:
-      return struct(file_name=file_name, path=str(path.realpath))
-
-  auto_configure_fail("Cannot find cupti library %s" % file_name)
-
+    """Finds the cupti library on the system.
+
+      On most systems, the cupti library is not installed in the same directory as
+      the other CUDA libraries but rather in a special extras/CUPTI directory.
+
+      Args:
+        repository_ctx: The repository context.
+        cuda_config: The cuda configuration as returned by _get_cuda_config.
+
+      Returns:
+        Returns the path to the library.
+      """
+    file_name = lib_name(
+        "cupti",
+        cuda_config.cpu_value,
+        cuda_config.cuda_version,
+    )
+    basedir = cuda_config.cuda_toolkit_path
+    return find_lib(repository_ctx, [
+        "%s/%s%s" % (basedir, path, file_name)
+        for path in CUPTI_LIB_PATHS
+    ])
 
 def _find_libs(repository_ctx, cuda_config):
-  """Returns the CUDA and cuDNN libraries on the system.
+    """Returns the CUDA and cuDNN libraries on the system.
 
-    Args:
-      repository_ctx: The repository context.
-      cuda_config: The CUDA config as returned by _get_cuda_config
-
-    Returns:
-      Map of library names to structs of filename and path.
-    """
-  cpu_value = cuda_config.cpu_value
-  return {
-      "cuda":
-          _find_cuda_lib("cuda", repository_ctx, cpu_value,
-                         cuda_config.cuda_toolkit_path),
-      "cudart":
-          _find_cuda_lib(
-              "cudart",
-              repository_ctx,
-              cpu_value,
-              cuda_config.cuda_toolkit_path,
-              cuda_config.cuda_version,
-          ),
-      "cudart_static":
-          _find_cuda_lib(
-              "cudart_static",
-              repository_ctx,
-              cpu_value,
-              cuda_config.cuda_toolkit_path,
-              cuda_config.cuda_version,
-              static=True,
-          ),
-      "cublas":
-          _find_cuda_lib(
-              "cublas",
-              repository_ctx,
-              cpu_value,
-              cuda_config.cuda_toolkit_path,
-              cuda_config.cuda_version,
-          ),
-      "cusolver":
-          _find_cuda_lib(
-              "cusolver",
-              repository_ctx,
-              cpu_value,
-              cuda_config.cuda_toolkit_path,
-              cuda_config.cuda_version,
-          ),
-      "curand":
-          _find_cuda_lib(
-              "curand",
-              repository_ctx,
-              cpu_value,
-              cuda_config.cuda_toolkit_path,
-              cuda_config.cuda_version,
-          ),
-      "cufft":
-          _find_cuda_lib(
-              "cufft",
-              repository_ctx,
-              cpu_value,
-              cuda_config.cuda_toolkit_path,
-              cuda_config.cuda_version,
-          ),
-      "cudnn":
-          _find_cuda_lib(
-              "cudnn",
-              repository_ctx,
-              cpu_value,
-              cuda_config.cudnn_install_basedir,
-              cuda_config.cudnn_version,
-          ),
-      "cupti":
-          _find_cupti_lib(repository_ctx, cuda_config),
-  }
+      Args:
+        repository_ctx: The repository context.
+        cuda_config: The CUDA config as returned by _get_cuda_config
 
+      Returns:
+        Map of library names to structs of filename and path.
+      """
+    cpu_value = cuda_config.cpu_value
+    return {
+        "cuda": _find_cuda_lib(
+            "cuda",
+            repository_ctx,
+            cpu_value,
+            cuda_config.cuda_toolkit_path,
+            None,
+        ),
+        "cudart": _find_cuda_lib(
+            "cudart",
+            repository_ctx,
+            cpu_value,
+            cuda_config.cuda_toolkit_path,
+            cuda_config.cuda_version,
+        ),
+        "cudart_static": _find_cuda_lib(
+            "cudart_static",
+            repository_ctx,
+            cpu_value,
+            cuda_config.cuda_toolkit_path,
+            cuda_config.cuda_version,
+            static = True,
+        ),
+        "cublas": _find_cuda_lib(
+            "cublas",
+            repository_ctx,
+            cpu_value,
+            cuda_config.cuda_toolkit_path,
+            cuda_config.cuda_version,
+        ),
+        "cusolver": _find_cuda_lib(
+            "cusolver",
+            repository_ctx,
+            cpu_value,
+            cuda_config.cuda_toolkit_path,
+            cuda_config.cuda_version,
+        ),
+        "curand": _find_cuda_lib(
+            "curand",
+            repository_ctx,
+            cpu_value,
+            cuda_config.cuda_toolkit_path,
+            cuda_config.cuda_version,
+        ),
+        "cufft": _find_cuda_lib(
+            "cufft",
+            repository_ctx,
+            cpu_value,
+            cuda_config.cuda_toolkit_path,
+            cuda_config.cuda_version,
+        ),
+        "cudnn": _find_cuda_lib(
+            "cudnn",
+            repository_ctx,
+            cpu_value,
+            cuda_config.cudnn_install_basedir,
+            cuda_config.cudnn_version,
+        ),
+        "cupti": _find_cupti_lib(repository_ctx, cuda_config),
+    }
 
 def _find_cuda_include_path(repository_ctx, cuda_config):
-  """Returns the path to the directory containing cuda.h
-
-    Args:
-      repository_ctx: The repository context.
-      cuda_config: The CUDA config as returned by _get_cuda_config
-
-    Returns:
-      The path of the directory containing the CUDA headers.
-    """
-  cuda_toolkit_path = cuda_config.cuda_toolkit_path
-  for relative_path in CUDA_INCLUDE_PATHS:
-    if repository_ctx.path(
-        "%s/%scuda.h" % (cuda_toolkit_path, relative_path)).exists:
-      return ("%s/%s" % (cuda_toolkit_path, relative_path))[:-1]
-  auto_configure_fail("Cannot find cuda.h under %s" % cuda_toolkit_path)
-
+    """Returns the path to the directory containing cuda.h
+
+      Args:
+        repository_ctx: The repository context.
+        cuda_config: The CUDA config as returned by _get_cuda_config
+
+      Returns:
+        The path of the directory containing the CUDA headers.
+      """
+    cuda_toolkit_path = cuda_config.cuda_toolkit_path
+    for relative_path in CUDA_INCLUDE_PATHS:
+        if repository_ctx.path(
+            "%s/%scuda.h" % (cuda_toolkit_path, relative_path),
+        ).exists:
+            return ("%s/%s" % (cuda_toolkit_path, relative_path))[:-1]
+    auto_configure_fail("Cannot find cuda.h under %s" % cuda_toolkit_path)
 
 def _find_cudnn_header_dir(repository_ctx, cudnn_install_basedir):
-  """Returns the path to the directory containing cudnn.h
-
-    Args:
-      repository_ctx: The repository context.
-      cudnn_install_basedir: The cudnn install directory as returned by
-        _cudnn_install_basedir.
-
-    Returns:
-      The path of the directory containing the cudnn header.
-    """
-  for relative_path in CUDA_INCLUDE_PATHS:
-    if repository_ctx.path(
-        "%s/%scudnn.h" % (cudnn_install_basedir, relative_path)).exists:
-      return ("%s/%s" % (cudnn_install_basedir, relative_path))[:-1]
-  if repository_ctx.path("/usr/include/cudnn.h").exists:
-    return "/usr/include"
-  auto_configure_fail("Cannot find cudnn.h under %s" % cudnn_install_basedir)
-
+    """Returns the path to the directory containing cudnn.h
+
+      Args:
+        repository_ctx: The repository context.
+        cudnn_install_basedir: The cudnn install directory as returned by
+          _cudnn_install_basedir.
+
+      Returns:
+        The path of the directory containing the cudnn header.
+      """
+    for relative_path in CUDA_INCLUDE_PATHS:
+        if repository_ctx.path(
+            "%s/%scudnn.h" % (cudnn_install_basedir, relative_path),
+        ).exists:
+            return ("%s/%s" % (cudnn_install_basedir, relative_path))[:-1]
+    if repository_ctx.path("/usr/include/cudnn.h").exists:
+        return "/usr/include"
+    auto_configure_fail("Cannot find cudnn.h under %s" % cudnn_install_basedir)
 
 def _find_nvvm_libdevice_dir(repository_ctx, cuda_config):
-  """Returns the path to the directory containing libdevice in bitcode format.
-
-    Args:
-      repository_ctx: The repository context.
-      cuda_config: The CUDA config as returned by _get_cuda_config
-
-    Returns:
-      The path of the directory containing the CUDA headers.
-    """
-  cuda_toolkit_path = cuda_config.cuda_toolkit_path
-  for libdevice_file in NVVM_LIBDEVICE_FILES:
-    for relative_path in NVVM_LIBDEVICE_PATHS:
-      if repository_ctx.path("%s/%s%s" % (cuda_toolkit_path, relative_path,
-                                          libdevice_file)).exists:
-        return ("%s/%s" % (cuda_toolkit_path, relative_path))[:-1]
-  auto_configure_fail(
-      "Cannot find libdevice*.bc files under %s" % cuda_toolkit_path)
-
+    """Returns the path to the directory containing libdevice in bitcode format.
+
+      Args:
+        repository_ctx: The repository context.
+        cuda_config: The CUDA config as returned by _get_cuda_config
+
+      Returns:
+        The path of the directory containing the CUDA headers.
+      """
+    cuda_toolkit_path = cuda_config.cuda_toolkit_path
+    for libdevice_file in NVVM_LIBDEVICE_FILES:
+        for relative_path in NVVM_LIBDEVICE_PATHS:
+            if repository_ctx.path("%s/%s%s" % (
+                cuda_toolkit_path,
+                relative_path,
+                libdevice_file,
+            )).exists:
+                return ("%s/%s" % (cuda_toolkit_path, relative_path))[:-1]
+    auto_configure_fail(
+        "Cannot find libdevice*.bc files under %s" % cuda_toolkit_path,
+    )
 
 def _cudart_static_linkopt(cpu_value):
-  """Returns additional platform-specific linkopts for cudart."""
-  return "" if cpu_value == "Darwin" else "\"-lrt\","
-
+    """Returns additional platform-specific linkopts for cudart."""
+    return "" if cpu_value == "Darwin" else "\"-lrt\","
 
 def _get_cuda_config(repository_ctx):
-  """Detects and returns information about the CUDA installation on the system.
-
-    Args:
-      repository_ctx: The repository context.
-
-    Returns:
-      A struct containing the following fields:
-        cuda_toolkit_path: The CUDA toolkit installation directory.
-        cudnn_install_basedir: The cuDNN installation directory.
-        cuda_version: The version of CUDA on the system.
-        cudnn_version: The version of cuDNN on the system.
-        compute_capabilities: A list of the system's CUDA compute capabilities.
-        cpu_value: The name of the host operating system.
-    """
-  cpu_value = get_cpu_value(repository_ctx)
-  toolkit_path = cuda_toolkit_path(repository_ctx)
-  cuda_version = _cuda_version(repository_ctx, toolkit_path, cpu_value)
-  cudnn_install_basedir = _cudnn_install_basedir(repository_ctx)
-  cudnn_version = _cudnn_version(repository_ctx, cudnn_install_basedir,
-                                 cpu_value)
-  return struct(
-      cuda_toolkit_path=toolkit_path,
-      cudnn_install_basedir=cudnn_install_basedir,
-      cuda_version=cuda_version,
-      cudnn_version=cudnn_version,
-      compute_capabilities=compute_capabilities(repository_ctx),
-      cpu_value=cpu_value,
-  )
-
+    """Detects and returns information about the CUDA installation on the system.
+
+      Args:
+        repository_ctx: The repository context.
+
+      Returns:
+        A struct containing the following fields:
+          cuda_toolkit_path: The CUDA toolkit installation directory.
+          cudnn_install_basedir: The cuDNN installation directory.
+          cuda_version: The version of CUDA on the system.
+          cudnn_version: The version of cuDNN on the system.
+          compute_capabilities: A list of the system's CUDA compute capabilities.
+          cpu_value: The name of the host operating system.
+      """
+    cpu_value = get_cpu_value(repository_ctx)
+    toolkit_path = cuda_toolkit_path(repository_ctx)
+    cuda_version = _cuda_version(repository_ctx, toolkit_path, cpu_value)
+    cudnn_install_basedir = _cudnn_install_basedir(repository_ctx)
+    cudnn_version = _cudnn_version(
+        repository_ctx,
+        cudnn_install_basedir,
+        cpu_value,
+    )
+    return struct(
+        cuda_toolkit_path = toolkit_path,
+        cudnn_install_basedir = cudnn_install_basedir,
+        cuda_version = cuda_version,
+        cudnn_version = cudnn_version,
+        compute_capabilities = compute_capabilities(repository_ctx),
+        cpu_value = cpu_value,
+    )
 
 def _tpl(repository_ctx, tpl, substitutions = {}, out = None):
-  if not out:
-    out = tpl.replace(":", "/")
-  repository_ctx.template(
-      out,
-      Label("//third_party/gpus/%s.tpl" % tpl),
-      substitutions,
-  )
-
+    if not out:
+        out = tpl.replace(":", "/")
+    repository_ctx.template(
+        out,
+        Label("//third_party/gpus/%s.tpl" % tpl),
+        substitutions,
+    )
 
 def _file(repository_ctx, label):
-  repository_ctx.template(
-      label.replace(":", "/"),
-      Label("//third_party/gpus/%s.tpl" % label),
-      {},
-  )
-
+    repository_ctx.template(
+        label.replace(":", "/"),
+        Label("//third_party/gpus/%s.tpl" % label),
+        {},
+    )
 
 _DUMMY_CROSSTOOL_BZL_FILE = """
 def error_gpu_disabled():
@@ -1019,99 +1034,83 @@ error_gpu_disabled()
 """
 
 def _create_dummy_repository(repository_ctx):
-  cpu_value = get_cpu_value(repository_ctx)
-
-  # Set up BUILD file for cuda/.
-  _tpl(
-      repository_ctx,
-      "cuda:build_defs.bzl",
-      {
-          "%{cuda_is_configured}": "False",
-          "%{cuda_extra_copts}": "[]",
-      },
-  )
-  _tpl(
-      repository_ctx,
-      "cuda:BUILD",
-      {
-          "%{cuda_driver_lib}":
-              _lib_name("cuda", cpu_value),
-          "%{cudart_static_lib}":
-              _lib_name(
-                  "cudart_static",
-                  cpu_value,
-                  static=True,
-              ),
-          "%{cudart_static_linkopt}":
-              _cudart_static_linkopt(cpu_value),
-          "%{cudart_lib}":
-              _lib_name("cudart", cpu_value),
-          "%{cublas_lib}":
-              _lib_name("cublas", cpu_value),
-          "%{cusolver_lib}":
-              _lib_name("cusolver", cpu_value),
-          "%{cudnn_lib}":
-              _lib_name("cudnn", cpu_value),
-          "%{cufft_lib}":
-              _lib_name("cufft", cpu_value),
-          "%{curand_lib}":
-              _lib_name("curand", cpu_value),
-          "%{cupti_lib}":
-              _lib_name("cupti", cpu_value),
-          "%{cuda_include_genrules}":
-              "",
-          "%{cuda_headers}":
-              "",
-      },
-  )
+    cpu_value = get_cpu_value(repository_ctx)
 
-  # Create dummy files for the CUDA toolkit since they are still required by
-  # tensorflow/core/platform/default/build_config:cuda.
-  repository_ctx.file("cuda/cuda/include/cuda.h", "")
-  repository_ctx.file("cuda/cuda/include/cublas.h", "")
-  repository_ctx.file("cuda/cuda/include/cudnn.h", "")
-  repository_ctx.file("cuda/cuda/extras/CUPTI/include/cupti.h", "")
-  repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cuda", cpu_value))
-  repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cudart", cpu_value))
-  repository_ctx.file(
-      "cuda/cuda/lib/%s" % _lib_name("cudart_static", cpu_value))
-  repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cublas", cpu_value))
-  repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cusolver", cpu_value))
-  repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cudnn", cpu_value))
-  repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("curand", cpu_value))
-  repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cufft", cpu_value))
-  repository_ctx.file("cuda/cuda/lib/%s" % _lib_name("cupti", cpu_value))
-
-  # Set up cuda_config.h, which is used by
-  # tensorflow/stream_executor/dso_loader.cc.
-  _tpl(
-      repository_ctx,
-      "cuda:cuda_config.h",
-      {
-          "%{cuda_version}":
-              _DEFAULT_CUDA_VERSION,
-          "%{cudnn_version}":
-              _DEFAULT_CUDNN_VERSION,
-          "%{cuda_compute_capabilities}":
-              ",".join([
-                  "CudaVersion(\"%s\")" % c
-                  for c in _DEFAULT_CUDA_COMPUTE_CAPABILITIES
-              ]),
-          "%{cuda_toolkit_path}":
-              _DEFAULT_CUDA_TOOLKIT_PATH,
-      },
-      "cuda/cuda/cuda_config.h",
-  )
+    # Set up BUILD file for cuda/.
+    _tpl(
+        repository_ctx,
+        "cuda:build_defs.bzl",
+        {
+            "%{cuda_is_configured}": "False",
+            "%{cuda_extra_copts}": "[]",
+        },
+    )
+    _tpl(
+        repository_ctx,
+        "cuda:BUILD",
+        {
+            "%{cuda_driver_lib}": lib_name("cuda", cpu_value),
+            "%{cudart_static_lib}": lib_name(
+                "cudart_static",
+                cpu_value,
+                static = True,
+            ),
+            "%{cudart_static_linkopt}": _cudart_static_linkopt(cpu_value),
+            "%{cudart_lib}": lib_name("cudart", cpu_value),
+            "%{cublas_lib}": lib_name("cublas", cpu_value),
+            "%{cusolver_lib}": lib_name("cusolver", cpu_value),
+            "%{cudnn_lib}": lib_name("cudnn", cpu_value),
+            "%{cufft_lib}": lib_name("cufft", cpu_value),
+            "%{curand_lib}": lib_name("curand", cpu_value),
+            "%{cupti_lib}": lib_name("cupti", cpu_value),
+            "%{copy_rules}": "",
+            "%{cuda_headers}": "",
+        },
+    )
 
-  # If cuda_configure is not configured to build with GPU support, and the user
-  # attempts to build with --config=cuda, add a dummy build rule to intercept
-  # this and fail with an actionable error message.
-  repository_ctx.file(
-      "crosstool/error_gpu_disabled.bzl",
-      _DUMMY_CROSSTOOL_BZL_FILE,
-  )
-  repository_ctx.file("crosstool/BUILD", _DUMMY_CROSSTOOL_BUILD_FILE)
+    # Create dummy files for the CUDA toolkit since they are still required by
+    # tensorflow/core/platform/default/build_config:cuda.
+    repository_ctx.file("cuda/cuda/include/cuda.h")
+    repository_ctx.file("cuda/cuda/include/cublas.h")
+    repository_ctx.file("cuda/cuda/include/cudnn.h")
+    repository_ctx.file("cuda/cuda/extras/CUPTI/include/cupti.h")
+    repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cuda", cpu_value))
+    repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cudart", cpu_value))
+    repository_ctx.file(
+        "cuda/cuda/lib/%s" % lib_name("cudart_static", cpu_value),
+    )
+    repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cublas", cpu_value))
+    repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cusolver", cpu_value))
+    repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cudnn", cpu_value))
+    repository_ctx.file("cuda/cuda/lib/%s" % lib_name("curand", cpu_value))
+    repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cufft", cpu_value))
+    repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cupti", cpu_value))
+
+    # Set up cuda_config.h, which is used by
+    # tensorflow/stream_executor/dso_loader.cc.
+    _tpl(
+        repository_ctx,
+        "cuda:cuda_config.h",
+        {
+            "%{cuda_version}": _DEFAULT_CUDA_VERSION,
+            "%{cudnn_version}": _DEFAULT_CUDNN_VERSION,
+            "%{cuda_compute_capabilities}": ",".join([
+                "CudaVersion(\"%s\")" % c
+                for c in _DEFAULT_CUDA_COMPUTE_CAPABILITIES
+            ]),
+            "%{cuda_toolkit_path}": _DEFAULT_CUDA_TOOLKIT_PATH,
+        },
+        "cuda/cuda/cuda_config.h",
+    )
 
+    # If cuda_configure is not configured to build with GPU support, and the user
+    # attempts to build with --config=cuda, add a dummy build rule to intercept
+    # this and fail with an actionable error message.
+    repository_ctx.file(
+        "crosstool/error_gpu_disabled.bzl",
+        _DUMMY_CROSSTOOL_BZL_FILE,
+    )
+    repository_ctx.file("crosstool/BUILD", _DUMMY_CROSSTOOL_BUILD_FILE)
 
 def _execute(
         repository_ctx,
@@ -1119,450 +1118,387 @@ def _execute(
         error_msg = None,
         error_details = None,
         empty_stdout_fine = False):
-  """Executes an arbitrary shell command.
-
-    Args:
-      repository_ctx: the repository_ctx object
-      cmdline: list of strings, the command to execute
-      error_msg: string, a summary of the error if the command fails
-      error_details: string, details about the error or steps to fix it
-      empty_stdout_fine: bool, if True, an empty stdout result is fine,
-        otherwise it's an error
-    Return: the result of repository_ctx.execute(cmdline)
-  """
-  result = repository_ctx.execute(cmdline)
-  if result.stderr or not (empty_stdout_fine or result.stdout):
-    auto_configure_fail(
-        "\n".join([
-            error_msg.strip() if error_msg else "Repository command failed",
-            result.stderr.strip(),
-            error_details if error_details else "",
-        ]),)
-  return result
-
+    """Executes an arbitrary shell command.
+
+      Args:
+        repository_ctx: the repository_ctx object
+        cmdline: list of strings, the command to execute
+        error_msg: string, a summary of the error if the command fails
+        error_details: string, details about the error or steps to fix it
+        empty_stdout_fine: bool, if True, an empty stdout result is fine,
+          otherwise it's an error
+      Return: the result of repository_ctx.execute(cmdline)
+    """
+    result = repository_ctx.execute(cmdline)
+    if result.stderr or not (empty_stdout_fine or result.stdout):
+        auto_configure_fail(
+            "\n".join([
+                error_msg.strip() if error_msg else "Repository command failed",
+                result.stderr.strip(),
+                error_details if error_details else "",
+            ]),
+        )
+    return result
 
 def _norm_path(path):
-  """Returns a path with '/' and remove the trailing slash."""
-  path = path.replace("\\", "/")
-  if path[-1] == "/":
-    path = path[:-1]
-  return path
-
+    """Returns a path with '/' and remove the trailing slash."""
+    path = path.replace("\\", "/")
+    if path[-1] == "/":
+        path = path[:-1]
+    return path
+
+def make_copy_files_rule(repository_ctx, name, srcs, outs):
+    """Returns a rule to copy a set of files."""
+    cmds = []
+
+    # Copy files.
+    for src, out in zip(srcs, outs):
+        cmds.append('cp -f "%s" $(location %s)' % (src, out))
+    outs = [('        "%s",' % out) for out in outs]
+    return """genrule(
+    name = "%s",
+    outs = [
+%s
+    ],
+    cmd = \"""%s \""",
+)""" % (name, "\n".join(outs), " && ".join(cmds))
 
-def symlink_genrule_for_dir(
-        repository_ctx,
-        src_dir,
-        dest_dir,
-        genrule_name,
-        src_files = [],
-        dest_files = []):
-  """Returns a genrule to symlink(or copy if on Windows) a set of files.
-
-    If src_dir is passed, files will be read from the given directory; otherwise
-    we assume files are in src_files and dest_files
-    """
-  if src_dir != None:
+def make_copy_dir_rule(repository_ctx, name, src_dir, out_dir):
+    """Returns a rule to recursively copy a directory."""
     src_dir = _norm_path(src_dir)
-    dest_dir = _norm_path(dest_dir)
-    files = "\n".join(sorted(_read_dir(repository_ctx, src_dir).splitlines()))
-
-    # Create a list with the src_dir stripped to use for outputs.
-    dest_files = files.replace(src_dir, "").splitlines()
-    src_files = files.splitlines()
-  command = []
-  if not _is_windows(repository_ctx):
-    # We clear folders that might have been generated previously to avoid
-    # undesired inclusions
-    command.append('if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi')
-    command.append('if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi')
-    command.append('if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi')
-    command.append('if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi')
-  outs = []
-  for i in range(len(dest_files)):
-    if dest_files[i] != "":
-      # If we have only one file to link we do not want to use the dest_dir, as
-      # $(@D) will include the full path to the file.
-      dest = "$(@D)/" + dest_dir + dest_files[i] if len(
-          dest_files) != 1 else "$(@D)/" + dest_files[i]
-
-      # Copy the headers to create a sandboxable setup.
-      cmd = "cp -f"
-      command.append(cmd + ' "%s" "%s"' % (src_files[i], dest))
-      outs.append('        "' + dest_dir + dest_files[i] + '",')
-  genrule = _genrule(
-      src_dir,
-      genrule_name,
-      " && ".join(command),
-      "\n".join(outs),
-  )
-  return genrule
+    out_dir = _norm_path(out_dir)
+    outs = _read_dir(repository_ctx, src_dir)
+    outs = [('        "%s",' % out.replace(src_dir, out_dir)) for out in outs]
+
+    # '@D' already contains the relative path for a single file, see
+    # http://docs.bazel.build/versions/master/be/make-variables.html#predefined_genrule_variables
+    out_dir = "$(@D)/%s" % out_dir if len(outs) > 1 else "$(@D)"
+    return """genrule(
+    name = "%s",
+    outs = [
+%s
+    ],
+    cmd = \"""cp -rLf "%s/." "%s/" \""",
+)""" % (name, "\n".join(outs), src_dir, out_dir)
 
+def _read_dir(repository_ctx, src_dir):
+    """Returns a string with all files in a directory.
+
+      Finds all files inside a directory, traversing subfolders and following
+      symlinks. The returned string contains the full path of all files
+      separated by line breaks.
+      """
+    if _is_windows(repository_ctx):
+        src_dir = src_dir.replace("/", "\\")
+        find_result = _execute(
+            repository_ctx,
+            ["cmd.exe", "/c", "dir", src_dir, "/b", "/s", "/a-d"],
+            empty_stdout_fine = True,
+        )
 
-def _genrule(src_dir, genrule_name, command, outs):
-  """Returns a string with a genrule.
+        # src_files will be used in genrule.outs where the paths must
+        # use forward slashes.
+        result = find_result.stdout.replace("\\", "/")
+    else:
+        find_result = _execute(
+            repository_ctx,
+            ["find", src_dir, "-follow", "-type", "f"],
+            empty_stdout_fine = True,
+        )
+        result = find_result.stdout
+    return sorted(result.splitlines())
 
-    Genrule executes the given command and produces the given outputs.
-    """
-  return (
-      "genrule(\n" + '    name = "' + genrule_name + '",\n' + "    outs = [\n" +
-      outs + "\n    ],\n" + '    cmd = """\n' + command + '\n   """,\n' + ")\n")
+def _flag_enabled(repository_ctx, flag_name):
+    if flag_name in repository_ctx.os.environ:
+        value = repository_ctx.os.environ[flag_name].strip()
+        return value == "1"
+    return False
 
+def _use_cuda_clang(repository_ctx):
+    return _flag_enabled(repository_ctx, "TF_CUDA_CLANG")
 
-def _read_dir(repository_ctx, src_dir):
-  """Returns a string with all files in a directory.
+def _compute_cuda_extra_copts(repository_ctx, compute_capabilities):
+    if _use_cuda_clang(repository_ctx):
+        capability_flags = [
+            "--cuda-gpu-arch=sm_" + cap.replace(".", "")
+            for cap in compute_capabilities
+        ]
+    else:
+        # Capabilities are handled in the "crosstool_wrapper_driver_is_not_gcc" for nvcc
+        # TODO(csigg): Make this consistent with cuda clang and pass to crosstool.
+        capability_flags = []
+    return str(capability_flags)
 
-    Finds all files inside a directory, traversing subfolders and following
-    symlinks. The returned string contains the full path of all files
-    separated by line breaks.
-    """
-  if _is_windows(repository_ctx):
-    src_dir = src_dir.replace("/", "\\")
-    find_result = _execute(
-        repository_ctx,
-        ["cmd.exe", "/c", "dir", src_dir, "/b", "/s", "/a-d"],
-        empty_stdout_fine=True,
-    )
+def _create_local_cuda_repository(repository_ctx):
+    """Creates the repository containing files set up to build with CUDA."""
+    cuda_config = _get_cuda_config(repository_ctx)
 
-    # src_files will be used in genrule.outs where the paths must
-    # use forward slashes.
-    result = find_result.stdout.replace("\\", "/")
-  else:
-    find_result = _execute(
+    cuda_include_path = _find_cuda_include_path(repository_ctx, cuda_config)
+    cudnn_header_dir = _find_cudnn_header_dir(
         repository_ctx,
-        ["find", src_dir, "-follow", "-type", "f"],
-        empty_stdout_fine=True,
+        cuda_config.cudnn_install_basedir,
     )
-    result = find_result.stdout
-  return result
+    cupti_header_dir = _find_cupti_header_dir(repository_ctx, cuda_config)
+    nvvm_libdevice_dir = _find_nvvm_libdevice_dir(repository_ctx, cuda_config)
 
-
-def _flag_enabled(repository_ctx, flag_name):
-  if flag_name in repository_ctx.os.environ:
-    value = repository_ctx.os.environ[flag_name].strip()
-    return value == "1"
-  return False
-
-
-def _use_cuda_clang(repository_ctx):
-  return _flag_enabled(repository_ctx, "TF_CUDA_CLANG")
-
-
-def _compute_cuda_extra_copts(repository_ctx, compute_capabilities):
-  if _use_cuda_clang(repository_ctx):
-    capability_flags = [
-        "--cuda-gpu-arch=sm_" + cap.replace(".", "")
-        for cap in compute_capabilities
+    # Create genrule to copy files from the installed CUDA toolkit into execroot.
+    copy_rules = [
+        make_copy_dir_rule(
+            repository_ctx,
+            name = "cuda-include",
+            src_dir = cuda_include_path,
+            out_dir = "cuda/include",
+        ),
+        make_copy_dir_rule(
+            repository_ctx,
+            name = "cuda-nvvm",
+            src_dir = nvvm_libdevice_dir,
+            out_dir = "cuda/nvvm/libdevice",
+        ),
+        make_copy_dir_rule(
+            repository_ctx,
+            name = "cuda-extras",
+            src_dir = cupti_header_dir,
+            out_dir = "cuda/extras/CUPTI/include",
+        ),
     ]
-  else:
-    # Capabilities are handled in the "crosstool_wrapper_driver_is_not_gcc" for nvcc
-    # TODO(csigg): Make this consistent with cuda clang and pass to crosstool.
-    capability_flags = []
-  return str(capability_flags)
 
+    cuda_libs = _find_libs(repository_ctx, cuda_config)
+    cuda_lib_srcs = []
+    cuda_lib_outs = []
+    for path in cuda_libs.values():
+        cuda_lib_srcs.append(str(path))
+        cuda_lib_outs.append("cuda/lib/" + path.basename)
+    copy_rules.append(make_copy_files_rule(
+        repository_ctx,
+        name = "cuda-lib",
+        srcs = cuda_lib_srcs,
+        outs = cuda_lib_outs,
+    ))
 
-def _create_local_cuda_repository(repository_ctx):
-  """Creates the repository containing files set up to build with CUDA."""
-  cuda_config = _get_cuda_config(repository_ctx)
-
-  cuda_include_path = _find_cuda_include_path(repository_ctx, cuda_config)
-  cudnn_header_dir = _find_cudnn_header_dir(
-      repository_ctx,
-      cuda_config.cudnn_install_basedir,
-  )
-  cupti_header_dir = _find_cupti_header_dir(repository_ctx, cuda_config)
-  nvvm_libdevice_dir = _find_nvvm_libdevice_dir(repository_ctx, cuda_config)
-
-  # Set up symbolic links for the cuda toolkit by creating genrules to do
-  # symlinking. We create one genrule for each directory we want to track under
-  # cuda_toolkit_path
-  cuda_toolkit_path = cuda_config.cuda_toolkit_path
-  genrules = [
-      symlink_genrule_for_dir(
-          repository_ctx,
-          cuda_include_path,
-          "cuda/include",
-          "cuda-include",
-      )
-  ]
-  genrules.append(
-      symlink_genrule_for_dir(
-          repository_ctx,
-          nvvm_libdevice_dir,
-          "cuda/nvvm/libdevice",
-          "cuda-nvvm",
-      ))
-  genrules.append(
-      symlink_genrule_for_dir(
-          repository_ctx,
-          cupti_header_dir,
-          "cuda/extras/CUPTI/include",
-          "cuda-extras",
-      ))
-
-  cuda_libs = _find_libs(repository_ctx, cuda_config)
-  cuda_lib_src = []
-  cuda_lib_dest = []
-  for lib in cuda_libs.values():
-    cuda_lib_src.append(lib.path)
-    cuda_lib_dest.append("cuda/lib/" + lib.file_name)
-  genrules.append(
-      symlink_genrule_for_dir(
-          repository_ctx,
-          None,
-          "",
-          "cuda-lib",
-          cuda_lib_src,
-          cuda_lib_dest,
-      ))
-
-  # Set up the symbolic links for cudnn if cndnn was not installed to
-  # CUDA_TOOLKIT_PATH.
-  included_files = _read_dir(repository_ctx, cuda_include_path).replace(
-      cuda_include_path,
-      "",
-  ).splitlines()
-  if "/cudnn.h" not in included_files:
-    genrules.append(
-        symlink_genrule_for_dir(
+    copy_rules.append(make_copy_dir_rule(
+        repository_ctx,
+        name = "cuda-bin",
+        src_dir = cuda_config.cuda_toolkit_path + "/bin",
+        out_dir = "cuda/bin",
+    ))
+
+    # Copy cudnn.h if cuDNN was not installed to CUDA_TOOLKIT_PATH.
+    included_files = _read_dir(repository_ctx, cuda_include_path)
+    if not any([file.endswith("cudnn.h") for file in included_files]):
+        copy_rules.append(make_copy_files_rule(
             repository_ctx,
-            None,
-            "cuda/include/",
-            "cudnn-include",
-            [cudnn_header_dir + "/cudnn.h"],
-            ["cudnn.h"],
+            name = "cudnn-include",
+            srcs = [cudnn_header_dir + "/cudnn.h"],
+            outs = ["cuda/include/cudnn.h"],
         ))
-  else:
-    genrules.append(
-        "filegroup(\n" + '    name = "cudnn-include",\n' + "    srcs = [],\n" +
-        ")\n",)
-
-  # Set up BUILD file for cuda/
-  _tpl(
-      repository_ctx,
-      "cuda:build_defs.bzl",
-      {
-          "%{cuda_is_configured}":
-              "True",
-          "%{cuda_extra_copts}":
-              _compute_cuda_extra_copts(
-                  repository_ctx,
-                  cuda_config.compute_capabilities,
-              ),
-      },
-  )
-  _tpl(
-      repository_ctx,
-      "cuda:BUILD.windows" if _is_windows(repository_ctx) else "cuda:BUILD",
-      {
-          "%{cuda_driver_lib}":
-              cuda_libs["cuda"].file_name,
-          "%{cudart_static_lib}":
-              cuda_libs["cudart_static"].file_name,
-          "%{cudart_static_linkopt}":
-              _cudart_static_linkopt(cuda_config.cpu_value,),
-          "%{cudart_lib}":
-              cuda_libs["cudart"].file_name,
-          "%{cublas_lib}":
-              cuda_libs["cublas"].file_name,
-          "%{cusolver_lib}":
-              cuda_libs["cusolver"].file_name,
-          "%{cudnn_lib}":
-              cuda_libs["cudnn"].file_name,
-          "%{cufft_lib}":
-              cuda_libs["cufft"].file_name,
-          "%{curand_lib}":
-              cuda_libs["curand"].file_name,
-          "%{cupti_lib}":
-              cuda_libs["cupti"].file_name,
-          "%{cuda_include_genrules}":
-              "\n".join(genrules),
-          "%{cuda_headers}": ('":cuda-include",\n' + '        ":cudnn-include",'
-                             ),
-      },
-      "cuda/BUILD",
-  )
-
-  is_cuda_clang = _use_cuda_clang(repository_ctx)
+    else:
+        copy_rules.append("filegroup(name = 'cudnn-include')\n")
 
-  should_download_clang = is_cuda_clang and _flag_enabled(
-      repository_ctx,
-      _TF_DOWNLOAD_CLANG,
-  )
-  if should_download_clang:
-    download_clang(repository_ctx, "crosstool/extra_tools")
-
-  # Set up crosstool/
-  cc = find_cc(repository_ctx)
-  cc_fullpath = cc if not should_download_clang else "crosstool/" + cc
-
-  host_compiler_includes = _host_compiler_includes(repository_ctx, cc_fullpath)
-  cuda_defines = {}
-  # Bazel sets '-B/usr/bin' flag to workaround build errors on RHEL (see
-  # https://github.com/bazelbuild/bazel/issues/760).
-  # However, this stops our custom clang toolchain from picking the provided
-  # LLD linker, so we're only adding '-B/usr/bin' when using non-downloaded
-  # toolchain.
-  # TODO: when bazel stops adding '-B/usr/bin' by default, remove this
-  #       flag from the CROSSTOOL completely (see
-  #       https://github.com/bazelbuild/bazel/issues/5634)
-  if should_download_clang:
-    cuda_defines["%{linker_bin_path_flag}"] = ""
-  else:
-    cuda_defines["%{linker_bin_path_flag}"] = 'flag: "-B/usr/bin"'
-
-  if is_cuda_clang:
-    cuda_defines["%{host_compiler_path}"] = str(cc)
-    cuda_defines["%{host_compiler_warnings}"] = """
-        # Some parts of the codebase set -Werror and hit this warning, so
-        # switch it off for now.
-        flag: "-Wno-invalid-partial-specialization"
-    """
-    cuda_defines["%{host_compiler_includes}"] = host_compiler_includes
-    cuda_defines["%{extra_no_canonical_prefixes_flags}"] = ""
-    _tpl(repository_ctx, "crosstool:BUILD", {
-        "%{linker_files}": ":empty",
-        "%{win_linker_files}": ":empty"
-    })
-    repository_ctx.file(
-        "crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc", "")
-    repository_ctx.file("crosstool/windows/msvc_wrapper_for_nvcc.py", "")
-    repository_ctx.file("crosstool/windows/msvc_wrapper_for_nvcc.bat", "")
-  else:
-    cuda_defines[
-        "%{host_compiler_path}"] = "clang/bin/crosstool_wrapper_driver_is_not_gcc"
-    cuda_defines["%{host_compiler_warnings}"] = ""
-
-    # nvcc has the system include paths built in and will automatically
-    # search them; we cannot work around that, so we add the relevant cuda
-    # system paths to the allowed compiler specific include paths.
-    cuda_defines["%{host_compiler_includes}"] = (
-        host_compiler_includes + "\n" + _cuda_include_path(
-            repository_ctx, cuda_config) +
-        "\n  cxx_builtin_include_directory: \"%s\"" % cupti_header_dir +
-        "\n  cxx_builtin_include_directory: \"%s\"" % cudnn_header_dir)
-
-    # For gcc, do not canonicalize system header paths; some versions of gcc
-    # pick the shortest possible path for system includes when creating the
-    # .d file - given that includes that are prefixed with "../" multiple
-    # time quickly grow longer than the root of the tree, this can lead to
-    # bazel's header check failing.
-    cuda_defines["%{extra_no_canonical_prefixes_flags}"] = (
-        "flag: \"-fno-canonical-system-headers\"")
-    nvcc_path = str(
-        repository_ctx.path("%s/bin/nvcc%s" % (
-            cuda_config.cuda_toolkit_path,
-            ".exe" if _is_windows(repository_ctx) else "",
-        )))
+    # Set up BUILD file for cuda/
     _tpl(
         repository_ctx,
-        "crosstool:BUILD",
+        "cuda:build_defs.bzl",
         {
-            "%{linker_files}": ":crosstool_wrapper_driver_is_not_gcc",
-            "%{win_linker_files}": ":windows_msvc_wrapper_files",
+            "%{cuda_is_configured}": "True",
+            "%{cuda_extra_copts}": _compute_cuda_extra_copts(
+                repository_ctx,
+                cuda_config.compute_capabilities,
+            ),
         },
     )
-    wrapper_defines = {
-        "%{cpu_compiler}":
-            str(cc),
-        "%{cuda_version}":
-            cuda_config.cuda_version,
-        "%{nvcc_path}":
-            nvcc_path,
-        "%{gcc_host_compiler_path}":
-            str(cc),
-        "%{cuda_compute_capabilities}":
-            ", ".join(
-                ["\"%s\"" % c for c in cuda_config.compute_capabilities],),
-        "%{nvcc_tmp_dir}":
-            _get_nvcc_tmp_dir_for_windows(repository_ctx),
-    }
     _tpl(
         repository_ctx,
-        "crosstool:clang/bin/crosstool_wrapper_driver_is_not_gcc",
-        wrapper_defines,
+        "cuda:BUILD.windows" if _is_windows(repository_ctx) else "cuda:BUILD",
+        {
+            "%{cuda_driver_lib}": cuda_libs["cuda"].basename,
+            "%{cudart_static_lib}": cuda_libs["cudart_static"].basename,
+            "%{cudart_static_linkopt}": _cudart_static_linkopt(cuda_config.cpu_value),
+            "%{cudart_lib}": cuda_libs["cudart"].basename,
+            "%{cublas_lib}": cuda_libs["cublas"].basename,
+            "%{cusolver_lib}": cuda_libs["cusolver"].basename,
+            "%{cudnn_lib}": cuda_libs["cudnn"].basename,
+            "%{cufft_lib}": cuda_libs["cufft"].basename,
+            "%{curand_lib}": cuda_libs["curand"].basename,
+            "%{cupti_lib}": cuda_libs["cupti"].basename,
+            "%{copy_rules}": "\n".join(copy_rules),
+            "%{cuda_headers}": (
+                '":cuda-include",\n' + '        ":cudnn-include",'
+            ),
+        },
+        "cuda/BUILD",
+    )
+
+    is_cuda_clang = _use_cuda_clang(repository_ctx)
+
+    should_download_clang = is_cuda_clang and _flag_enabled(
+        repository_ctx,
+        _TF_DOWNLOAD_CLANG,
     )
+    if should_download_clang:
+        download_clang(repository_ctx, "crosstool/extra_tools")
+
+    # Set up crosstool/
+    cc = find_cc(repository_ctx)
+    cc_fullpath = cc if not should_download_clang else "crosstool/" + cc
+
+    host_compiler_includes = _host_compiler_includes(repository_ctx, cc_fullpath)
+    cuda_defines = {}
+
+    # Bazel sets '-B/usr/bin' flag to workaround build errors on RHEL (see
+    # https://github.com/bazelbuild/bazel/issues/760).
+    # However, this stops our custom clang toolchain from picking the provided
+    # LLD linker, so we're only adding '-B/usr/bin' when using non-downloaded
+    # toolchain.
+    # TODO: when bazel stops adding '-B/usr/bin' by default, remove this
+    #       flag from the CROSSTOOL completely (see
+    #       https://github.com/bazelbuild/bazel/issues/5634)
+    if should_download_clang:
+        cuda_defines["%{linker_bin_path_flag}"] = ""
+    else:
+        cuda_defines["%{linker_bin_path_flag}"] = 'flag: "-B/usr/bin"'
+
+    if is_cuda_clang:
+        cuda_defines["%{host_compiler_path}"] = str(cc)
+        cuda_defines["%{host_compiler_warnings}"] = """
+        # Some parts of the codebase set -Werror and hit this warning, so
+        # switch it off for now.
+        flag: "-Wno-invalid-partial-specialization"
+    """
+        cuda_defines["%{host_compiler_includes}"] = host_compiler_includes
+        cuda_defines["%{extra_no_canonical_prefixes_flags}"] = ""
+        _tpl(repository_ctx, "crosstool:BUILD", {
+            "%{linker_files}": ":empty",
+            "%{win_linker_files}": ":empty",
+        })
+        repository_ctx.file(
+            "crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc",
+            "",
+        )
+        repository_ctx.file("crosstool/windows/msvc_wrapper_for_nvcc.py", "")
+    else:
+        cuda_defines["%{host_compiler_path}"] = "clang/bin/crosstool_wrapper_driver_is_not_gcc"
+        cuda_defines["%{host_compiler_warnings}"] = ""
+
+        # nvcc has the system include paths built in and will automatically
+        # search them; we cannot work around that, so we add the relevant cuda
+        # system paths to the allowed compiler specific include paths.
+        cuda_defines["%{host_compiler_includes}"] = (
+            host_compiler_includes + "\n" + _cuda_include_path(
+                repository_ctx,
+                cuda_config,
+            ) +
+            "\n  cxx_builtin_include_directory: \"%s\"" % cupti_header_dir +
+            "\n  cxx_builtin_include_directory: \"%s\"" % cudnn_header_dir
+        )
+
+        # For gcc, do not canonicalize system header paths; some versions of gcc
+        # pick the shortest possible path for system includes when creating the
+        # .d file - given that includes that are prefixed with "../" multiple
+        # time quickly grow longer than the root of the tree, this can lead to
+        # bazel's header check failing.
+        cuda_defines["%{extra_no_canonical_prefixes_flags}"] = (
+            "flag: \"-fno-canonical-system-headers\""
+        )
+        nvcc_path = str(
+            repository_ctx.path("%s/bin/nvcc%s" % (
+                cuda_config.cuda_toolkit_path,
+                ".exe" if _is_windows(repository_ctx) else "",
+            )),
+        )
+        _tpl(
+            repository_ctx,
+            "crosstool:BUILD",
+            {
+                "%{linker_files}": ":crosstool_wrapper_driver_is_not_gcc",
+                "%{win_linker_files}": ":windows_msvc_wrapper_files",
+            },
+        )
+        wrapper_defines = {
+            "%{cpu_compiler}": str(cc),
+            "%{cuda_version}": cuda_config.cuda_version,
+            "%{nvcc_path}": nvcc_path,
+            "%{gcc_host_compiler_path}": str(cc),
+            "%{cuda_compute_capabilities}": ", ".join(
+                ["\"%s\"" % c for c in cuda_config.compute_capabilities],
+            ),
+            "%{nvcc_tmp_dir}": _get_nvcc_tmp_dir_for_windows(repository_ctx),
+        }
+        _tpl(
+            repository_ctx,
+            "crosstool:clang/bin/crosstool_wrapper_driver_is_not_gcc",
+            wrapper_defines,
+        )
+        _tpl(
+            repository_ctx,
+            "crosstool:windows/msvc_wrapper_for_nvcc.py",
+            wrapper_defines,
+        )
+
     _tpl(
         repository_ctx,
-        "crosstool:windows/msvc_wrapper_for_nvcc.py",
-        wrapper_defines,
+        "crosstool:CROSSTOOL",
+        cuda_defines + _get_win_cuda_defines(repository_ctx),
+        out = "crosstool/CROSSTOOL",
     )
+
+    # Set up cuda_config.h, which is used by
+    # tensorflow/stream_executor/dso_loader.cc.
     _tpl(
         repository_ctx,
-        "crosstool:windows/msvc_wrapper_for_nvcc.bat",
+        "cuda:cuda_config.h",
         {
-            "%{python_binary}": _get_python_bin(repository_ctx),
+            "%{cuda_version}": cuda_config.cuda_version,
+            "%{cudnn_version}": cuda_config.cudnn_version,
+            "%{cuda_compute_capabilities}": ",".join([
+                "CudaVersion(\"%s\")" % c
+                for c in cuda_config.compute_capabilities
+            ]),
+            "%{cuda_toolkit_path}": cuda_config.cuda_toolkit_path,
         },
+        "cuda/cuda/cuda_config.h",
     )
 
-  _tpl(
-      repository_ctx,
-      "crosstool:CROSSTOOL",
-      cuda_defines + _get_win_cuda_defines(repository_ctx),
-      out="crosstool/CROSSTOOL",
-  )
-
-  # Set up cuda_config.h, which is used by
-  # tensorflow/stream_executor/dso_loader.cc.
-  _tpl(
-      repository_ctx,
-      "cuda:cuda_config.h",
-      {
-          "%{cuda_version}":
-              cuda_config.cuda_version,
-          "%{cudnn_version}":
-              cuda_config.cudnn_version,
-          "%{cuda_compute_capabilities}":
-              ",".join([
-                  "CudaVersion(\"%s\")" % c
-                  for c in cuda_config.compute_capabilities
-              ],),
-          "%{cuda_toolkit_path}":
-              cuda_config.cuda_toolkit_path,
-      },
-      "cuda/cuda/cuda_config.h",
-  )
-
-
 def _create_remote_cuda_repository(repository_ctx, remote_config_repo):
-  """Creates pointers to a remotely configured repo set up to build with CUDA."""
-  _tpl(
-      repository_ctx,
-      "cuda:build_defs.bzl",
-      {
-          "%{cuda_is_configured}":
-              "True",
-          "%{cuda_extra_copts}":
-              _compute_cuda_extra_copts(
-                  repository_ctx,
-                  compute_capabilities(repository_ctx),
-              ),
-      },
-  )
-  _tpl(
-      repository_ctx,
-      "cuda:remote.BUILD",
-      {
-          "%{remote_cuda_repo}": remote_config_repo,
-      },
-      "cuda/BUILD",
-  )
-  _tpl(repository_ctx, "crosstool:remote.BUILD", {
-      "%{remote_cuda_repo}": remote_config_repo,
-  }, "crosstool/BUILD")
-
-
-def _cuda_autoconf_impl(repository_ctx):
-  """Implementation of the cuda_autoconf repository rule."""
-  if not _enable_cuda(repository_ctx):
-    _create_dummy_repository(repository_ctx)
-  elif _TF_CUDA_CONFIG_REPO in repository_ctx.os.environ:
-    _create_remote_cuda_repository(
+    """Creates pointers to a remotely configured repo set up to build with CUDA."""
+    _tpl(
         repository_ctx,
-        repository_ctx.os.environ[_TF_CUDA_CONFIG_REPO],
+        "cuda:build_defs.bzl",
+        {
+            "%{cuda_is_configured}": "True",
+            "%{cuda_extra_copts}": _compute_cuda_extra_copts(
+                repository_ctx,
+                compute_capabilities(repository_ctx),
+            ),
+        },
+    )
+    repository_ctx.template(
+        "cuda/BUILD",
+        Label(remote_config_repo + "/cuda:BUILD"),
+        {},
+    )
+    repository_ctx.template(
+        "cuda/build_defs.bzl",
+        Label(remote_config_repo + "/cuda:build_defs.bzl"),
+        {},
+    )
+    repository_ctx.template(
+        "cuda/cuda/cuda_config.h",
+        Label(remote_config_repo + "/cuda:cuda/cuda_config.h"),
+        {},
     )
-  else:
-    _create_local_cuda_repository(repository_ctx)
 
+def _cuda_autoconf_impl(repository_ctx):
+    """Implementation of the cuda_autoconf repository rule."""
+    if not enable_cuda(repository_ctx):
+        _create_dummy_repository(repository_ctx)
+    elif _TF_CUDA_CONFIG_REPO in repository_ctx.os.environ:
+        _create_remote_cuda_repository(
+            repository_ctx,
+            repository_ctx.os.environ[_TF_CUDA_CONFIG_REPO],
+        )
+    else:
+        _create_local_cuda_repository(repository_ctx)
 
 cuda_configure = repository_rule(
     implementation = _cuda_autoconf_impl,
diff --git a/third_party/gpus/rocm/BUILD.tpl b/third_party/gpus/rocm/BUILD.tpl
index 8258bb35897ac47c2e95c84a14089c73a075335d..502b6b8de2f520650c54f1ff01b1146b00a5c3f5 100644
--- a/third_party/gpus/rocm/BUILD.tpl
+++ b/third_party/gpus/rocm/BUILD.tpl
@@ -18,6 +18,7 @@ cc_library(
     includes = [
         ".",
         "rocm/include",
+        "rocm/include/rocrand",
     ],
     visibility = ["//visibility:public"],
 )
@@ -96,4 +97,4 @@ cc_library(
     ],
 )
 
-%{rocm_include_genrules}
+%{copy_rules}
diff --git a/third_party/gpus/rocm_configure.bzl b/third_party/gpus/rocm_configure.bzl
index 6df6799bd7696d5dbcc70345bf7b5e19f709b8d4..63b8cd47c7122980845d91aec8b3f7b8c831657c 100644
--- a/third_party/gpus/rocm_configure.bzl
+++ b/third_party/gpus/rocm_configure.bzl
@@ -14,6 +14,12 @@
     `gfx803,gfx900`.
 """
 
+load(
+    ":cuda_configure.bzl",
+    "make_copy_dir_rule",
+    "make_copy_files_rule",
+)
+
 _GCC_HOST_COMPILER_PATH = "GCC_HOST_COMPILER_PATH"
 _ROCM_TOOLKIT_PATH = "ROCM_TOOLKIT_PATH"
 _TF_ROCM_VERSION = "TF_ROCM_VERSION"
@@ -226,6 +232,42 @@ def _amdgpu_targets(repository_ctx):
             auto_configure_fail("Invalid AMDGPU target: %s" % amdgpu_target)
     return amdgpu_targets
 
+def _hipcc_env(repository_ctx):
+    """Returns the environment variable string for hipcc.
+
+    Args:
+        repository_ctx: The repository context.
+
+    Returns:
+        A string containing environment variables for hipcc.
+    """
+    hipcc_env = ""
+    for name in [
+        "HIP_CLANG_PATH",
+        "DEVICE_LIB_PATH",
+        "HIP_VDI_HOME",
+        "HIPCC_VERBOSE",
+        "HIPCC_COMPILE_FLAGS_APPEND",
+    ]:
+        if name in repository_ctx.os.environ:
+            hipcc_env = (hipcc_env + " " + name + "=\"" +
+                         repository_ctx.os.environ[name].strip() + "\";")
+    return hipcc_env.strip()
+
+def _crosstool_verbose(repository_ctx):
+    """Returns the environment variable value CROSSTOOL_VERBOSE.
+
+    Args:
+        repository_ctx: The repository context.
+
+    Returns:
+        A string containing value of environment variable CROSSTOOL_VERBOSE.
+    """
+    name = "CROSSTOOL_VERBOSE"
+    if name in repository_ctx.os.environ:
+        return repository_ctx.os.environ[name].strip()
+    return "0"
+
 def _cpu_value(repository_ctx):
     """Returns the name of the host operating system.
 
@@ -445,7 +487,7 @@ def _create_dummy_repository(repository_ctx):
             "%{miopen_lib}": _lib_name("miopen", cpu_value),
             "%{rocfft_lib}": _lib_name("rocfft", cpu_value),
             "%{hiprand_lib}": _lib_name("hiprand", cpu_value),
-            "%{rocm_include_genrules}": "",
+            "%{copy_rules}": "",
             "%{rocm_headers}": "",
         },
     )
@@ -510,51 +552,6 @@ def _norm_path(path):
         path = path[:-1]
     return path
 
-def _symlink_genrule_for_dir(
-        repository_ctx,
-        src_dir,
-        dest_dir,
-        genrule_name,
-        src_files = [],
-        dest_files = []):
-    """Returns a genrule to symlink(or copy if on Windows) a set of files.
-
-    If src_dir is passed, files will be read from the given directory; otherwise
-    we assume files are in src_files and dest_files
-    """
-    if src_dir != None:
-        src_dir = _norm_path(src_dir)
-        dest_dir = _norm_path(dest_dir)
-        files = _read_dir(repository_ctx, src_dir)
-
-        # Create a list with the src_dir stripped to use for outputs.
-        dest_files = files.replace(src_dir, "").splitlines()
-        src_files = files.splitlines()
-    command = []
-
-    # We clear folders that might have been generated previously to avoid
-    # undesired inclusions
-    command.append('if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi')
-    command.append('if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi')
-    outs = []
-    for i in range(len(dest_files)):
-        if dest_files[i] != "":
-            # If we have only one file to link we do not want to use the dest_dir, as
-            # $(@D) will include the full path to the file.
-            dest = "$(@D)/" + dest_dir + dest_files[i] if len(dest_files) != 1 else "$(@D)/" + dest_files[i]
-
-            # On Windows, symlink is not supported, so we just copy all the files.
-            cmd = "ln -s"
-            command.append(cmd + ' "%s" "%s"' % (src_files[i], dest))
-            outs.append('        "' + dest_dir + dest_files[i] + '",')
-    genrule = _genrule(
-        src_dir,
-        genrule_name,
-        " && ".join(command),
-        "\n".join(outs),
-    )
-    return genrule
-
 def _genrule(src_dir, genrule_name, command, outs):
     """Returns a string with a genrule.
 
@@ -601,56 +598,49 @@ def _create_local_rocm_repository(repository_ctx):
     """Creates the repository containing files set up to build with ROCm."""
     rocm_config = _get_rocm_config(repository_ctx)
 
-    # Set up symbolic links for the rocm toolkit by creating genrules to do
-    # symlinking. We create one genrule for each directory we want to track under
+    # Copy header and library files to execroot.
     # rocm_toolkit_path
     rocm_toolkit_path = rocm_config.rocm_toolkit_path
-    rocm_include_path = rocm_toolkit_path + "/include"
-    genrules = [_symlink_genrule_for_dir(
-        repository_ctx,
-        rocm_include_path,
-        "rocm/include",
-        "rocm-include",
-    )]
-    genrules.append(_symlink_genrule_for_dir(
-        repository_ctx,
-        rocm_toolkit_path + "/rocfft/include",
-        "rocm/include/rocfft",
-        "rocfft-include",
-    ))
-    genrules.append(_symlink_genrule_for_dir(
-        repository_ctx,
-        rocm_toolkit_path + "/rocblas/include",
-        "rocm/include/rocblas",
-        "rocblas-include",
-    ))
-    genrules.append(_symlink_genrule_for_dir(
-        repository_ctx,
-        rocm_toolkit_path + "/miopen/include",
-        "rocm/include/miopen",
-        "miopen-include",
-    ))
+    copy_rules = [
+        make_copy_dir_rule(
+            repository_ctx,
+            name = "rocm-include",
+            src_dir = rocm_toolkit_path + "/include",
+            out_dir = "rocm/include",
+        ),
+        make_copy_dir_rule(
+            repository_ctx,
+            name = "rocfft-include",
+            src_dir = rocm_toolkit_path + "/rocfft/include",
+            out_dir = "rocm/include/rocfft",
+        ),
+        make_copy_dir_rule(
+            repository_ctx,
+            name = "rocblas-include",
+            src_dir = rocm_toolkit_path + "/rocblas/include",
+            out_dir = "rocm/include/rocblas",
+        ),
+        make_copy_dir_rule(
+            repository_ctx,
+            name = "miopen-include",
+            src_dir = rocm_toolkit_path + "/miopen/include",
+            out_dir = "rocm/include/miopen",
+        ),
+    ]
 
     rocm_libs = _find_libs(repository_ctx, rocm_config)
-    rocm_lib_src = []
-    rocm_lib_dest = []
+    rocm_lib_srcs = []
+    rocm_lib_outs = []
     for lib in rocm_libs.values():
-        rocm_lib_src.append(lib.path)
-        rocm_lib_dest.append("rocm/lib/" + lib.file_name)
-    genrules.append(_symlink_genrule_for_dir(
+        rocm_lib_srcs.append(lib.path)
+        rocm_lib_outs.append("rocm/lib/" + lib.file_name)
+    copy_rules.append(make_copy_files_rule(
         repository_ctx,
-        None,
-        "",
-        "rocm-lib",
-        rocm_lib_src,
-        rocm_lib_dest,
+        name = "rocm-lib",
+        srcs = rocm_lib_srcs,
+        outs = rocm_lib_outs,
     ))
 
-    included_files = _read_dir(repository_ctx, rocm_include_path).replace(
-        rocm_include_path,
-        "",
-    ).splitlines()
-
     # Set up BUILD file for rocm/
     _tpl(
         repository_ctx,
@@ -672,7 +662,7 @@ def _create_local_rocm_repository(repository_ctx):
             "%{rocfft_lib}": rocm_libs["rocfft"].file_name,
             "%{hiprand_lib}": rocm_libs["hiprand"].file_name,
             "%{miopen_lib}": rocm_libs["miopen"].file_name,
-            "%{rocm_include_genrules}": "\n".join(genrules),
+            "%{copy_rules}": "\n".join(copy_rules),
             "%{rocm_headers}": ('":rocm-include",\n' +
                                 '":rocfft-include",\n' +
                                 '":rocblas-include",\n' +
@@ -701,6 +691,12 @@ def _create_local_rocm_repository(repository_ctx):
         {
             "%{cpu_compiler}": str(cc),
             "%{hipcc_path}": "/opt/rocm/bin/hipcc",
+            "%{hipcc_env}": _hipcc_env(repository_ctx),
+            "%{hip_runtime_path}": "/opt/rocm/hip/lib",
+            "%{hip_runtime_library}": "hip_hcc",
+            "%{hcc_runtime_path}": "/opt/rocm/hcc/lib",
+            "%{hcc_runtime_library}": "mcwamp",
+            "%{crosstool_verbose}": _crosstool_verbose(repository_ctx),
             "%{gcc_host_compiler_path}": str(cc),
             "%{rocm_amdgpu_targets}": ",".join(
                 ["\"%s\"" % c for c in rocm_config.amdgpu_targets],
diff --git a/third_party/hwloc/BUILD b/third_party/hwloc/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..2469c95668a55342da17c5dfb9d942d1a788d38e
--- /dev/null
+++ b/third_party/hwloc/BUILD
@@ -0,0 +1,7 @@
+# BUILD file to make this directory a package.
+
+licenses(["notice"])
+
+exports_files(
+    ["static-components.h"],
+)
diff --git a/third_party/hwloc/BUILD.bazel b/third_party/hwloc/BUILD.bazel
new file mode 100644
index 0000000000000000000000000000000000000000..5d636cabd39f83f9d63e481c81eedd4d078ca0df
--- /dev/null
+++ b/third_party/hwloc/BUILD.bazel
@@ -0,0 +1,278 @@
+# hwloc: Portable Hardware Locality Library
+
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])
+
+exports_files(["LICENSE"])
+
+load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
+load("@org_tensorflow//third_party:common.bzl", "template_rule")
+
+COMMON_INCLUDE_COPTS = [
+    "-I.",
+    "-Ihwloc",
+    "-Iinclude",
+]
+
+DISABLE_WARNINGS_COPTS = [
+    "-Wno-vla",
+]
+
+VAR_SETTINGS_COPTS = [
+    "-DHWLOC_DUMPED_HWDATA_DIR=",
+    "-DRUNSTATEDIR=",
+]
+
+template_rule(
+    name = "include_hwloc_autogen_config_h",
+    src = "include/hwloc/autogen/config.h.in",
+    out = "include/hwloc/autogen/config.h",
+    substitutions = {
+        "#undef HWLOC_VERSION_MAJOR": "#define HWLOC_VERSION_MAJOR 2",
+        "#undef HWLOC_VERSION_MINOR": "#define HWLOC_VERSION_MINOR 0",
+        "#undef HWLOC_VERSION_RELEASE": "#define HWLOC_VERSION_RELEASE 3",
+        "#undef HWLOC_VERSION_GREEK": "#define HWLOC_VERSION_GREEK \"\"",
+        "#undef HWLOC_VERSION": "#define HWLOC_VERSION \"2.0.3\"",
+        "#undef HWLOC_LINUX_SYS": "#define HWLOC_LINUX_SYS 1",
+        "#undef hwloc_pid_t": "#define hwloc_pid_t pid_t",
+        "#undef hwloc_thread_t": "#define hwloc_thread_t pthread_t",
+        "#  undef HWLOC_HAVE_STDINT_H": "#  define HWLOC_HAVE_STDINT_H 1 ",
+        "#undef HWLOC_SYM_TRANSFORM": "#define HWLOC_SYM_TRANSFORM 0",
+        "#undef HWLOC_SYM_PREFIX": "#define HWLOC_SYM_PREFIX hwloc_",
+        "#undef HWLOC_SYM_PREFIX_CAPS": "#define HWLOC_SYM_PREFIX_CAPS HWLOC_",
+    },
+)
+
+_INCLUDE_PRIVATE_HWLOC_AUTOIGEN_CONFIG_H_COMMON_SUBS = {
+    "#undef HAVE_CLOCK_GETTIME": "#define HAVE_CLOCK_GETTIME 1",
+    "#undef HAVE_CTYPE_H": "#define HAVE_CTYPE_H 1",
+    "#undef HAVE_DECL_CTL_HW": "#define HAVE_DECL_CTL_HW 0",
+    "#undef HAVE_DECL_FABSF": "#define HAVE_DECL_FABSF 1",
+    "#undef HAVE_DECL_GETEXECNAME": "#define HAVE_DECL_GETEXECNAME 0",
+    "#undef HAVE_DECL_GETMODULEFILENAME": "#define HAVE_DECL_GETMODULEFILENAME 0",
+    "#undef HAVE_DECL_GETPROGNAME": "#define HAVE_DECL_GETPROGNAME 0",
+    "#undef HAVE_DECL_HW_NCPU": "#define HAVE_DECL_HW_NCPU 0",
+    "#undef HAVE_DECL_MODFF": "#define HAVE_DECL_MODFF 1",
+    "#undef HAVE_DECL_PTHREAD_GETAFFINITY_NP": "#define HAVE_DECL_PTHREAD_GETAFFINITY_NP 1",
+    "#undef HAVE_DECL_PTHREAD_SETAFFINITY_NP": "#define HAVE_DECL_PTHREAD_SETAFFINITY_NP 1",
+    "#undef HAVE_DECL_RUNNING_ON_VALGRIND": "#define HAVE_DECL_RUNNING_ON_VALGRIND 0",
+    "#undef HAVE_DECL_SCHED_GETCPU": "#define HAVE_DECL_SCHED_GETCPU 1",
+    "#undef HAVE_DECL_SNPRINTF": "#define HAVE_DECL_SNPRINTF 1",
+    "#undef HAVE_DECL_STRTOULL": "#define HAVE_DECL_STRTOULL 1",
+    "#undef HAVE_DECL__PUTENV": "#define HAVE_DECL__PUTENV 0",
+    "#undef HAVE_DECL__SC_LARGE_PAGESIZE": "#define HAVE_DECL__SC_LARGE_PAGESIZE 0",
+    "#undef HAVE_DECL__SC_NPROCESSORS_CONF": "#define HAVE_DECL__SC_NPROCESSORS_CONF 1",
+    "#undef HAVE_DECL__SC_NPROCESSORS_ONLN": "#define HAVE_DECL__SC_NPROCESSORS_ONLN 1",
+    "#undef HAVE_DECL__SC_NPROC_CONF": "#define HAVE_DECL__SC_NPROC_CONF 0",
+    "#undef HAVE_DECL__SC_NPROC_ONLN": "#define HAVE_DECL__SC_NPROC_ONLN 0",
+    "#undef HAVE_DECL__SC_PAGESIZE": "#define HAVE_DECL__SC_PAGESIZE 1",
+    "#undef HAVE_DECL__SC_PAGE_SIZE": "#define HAVE_DECL__SC_PAGE_SIZE 1",
+    "#undef HAVE_DECL__STRDUP": "#define HAVE_DECL__STRDUP 0",
+    "#undef HAVE_DIRENT_H": "#define HAVE_DIRENT_H 1",
+    "#undef HAVE_DLFCN_H": "#define HAVE_DLFCN_H 1",
+    "#undef HAVE_FFS": "#define HAVE_FFS 1",
+    "#undef HAVE_FFSL": "#define HAVE_FFSL 1",
+    "#undef HAVE_GETPAGESIZE": "#define HAVE_GETPAGESIZE 1",
+    "#undef HAVE_INTTYPES_H": "#define HAVE_INTTYPES_H 1",
+    "#undef HAVE_LANGINFO_H": "#define HAVE_LANGINFO_H 1",
+    "#undef HAVE_LOCALE_H": "#define HAVE_LOCALE_H 1",
+    "#undef HAVE_MALLOC_H": "#define HAVE_MALLOC_H 1",
+    "#undef HAVE_MEMALIGN": "#define HAVE_MEMALIGN 1",
+    "#undef HAVE_MEMORY_H": "#define HAVE_MEMORY_H 1",
+    "#undef HAVE_MKSTEMP": "#define HAVE_MKSTEMP 1",
+    "#undef HAVE_NL_LANGINFO": "#define HAVE_NL_LANGINFO 1",
+    "#undef HAVE_OPENAT": "#define HAVE_OPENAT 1",
+    "#undef HAVE_POSIX_MEMALIGN": "#define HAVE_POSIX_MEMALIGN 1",
+    "#undef HAVE_PROGRAM_INVOCATION_NAME": "#define HAVE_PROGRAM_INVOCATION_NAME 1",
+    "#undef HAVE_PTHREAD_T": "#define HAVE_PTHREAD_T 1",
+    "#undef HAVE_PUTWC": "#define HAVE_PUTWC 1",
+    "#undef HAVE_SETLOCALE": "#define HAVE_SETLOCALE 1",
+    "#undef HAVE_SSIZE_T": "#define HAVE_SSIZE_T 1",
+    "#undef HAVE_STDINT_H": "#define HAVE_STDINT_H 1",
+    "#undef HAVE_STDLIB_H": "#define HAVE_STDLIB_H 1",
+    "#undef HAVE_STRCASECMP": "#define HAVE_STRCASECMP 1",
+    "#undef HAVE_STRFTIME": "#define HAVE_STRFTIME 1",
+    "#undef HAVE_STRINGS_H": "#define HAVE_STRINGS_H 1",
+    "#undef HAVE_STRING_H": "#define HAVE_STRING_H 1",
+    "#undef HAVE_STRNCASECMP": "#define HAVE_STRNCASECMP 1",
+    "#undef HAVE_SYS_MMAN_H": "#define HAVE_SYS_MMAN_H 1",
+    "#undef HAVE_SYS_PARAM_H": "#define HAVE_SYS_PARAM_H 1",
+    "#undef HAVE_SYS_STAT_H": "#define HAVE_SYS_STAT_H 1",
+    "#undef HAVE_SYS_SYSCTL_H": "#define HAVE_SYS_SYSCTL_H 1",
+    "#undef HAVE_SYS_TYPES_H": "#define HAVE_SYS_TYPES_H 1",
+    "#undef HAVE_SYS_UTSNAME_H": "#define HAVE_SYS_UTSNAME_H 1",
+    "#undef HAVE_TIME_H": "#define HAVE_TIME_H 1",
+    "#undef HAVE_UNAME": "#define HAVE_UNAME 1",
+    "#undef HAVE_UNISTD_H": "#define HAVE_UNISTD_H 1",
+    "#undef HAVE_USELOCALE": "#define HAVE_USELOCALE 1",
+    "#undef HAVE_WCHAR_T": "#define HAVE_WCHAR_T 1",
+    "#undef HAVE_X11_KEYSYM_H": "#define HAVE_X11_KEYSYM_H 1",
+    "#undef HAVE_X11_XLIB_H": "#define HAVE_X11_XLIB_H 1",
+    "#undef HAVE_X11_XUTIL_H": "#define HAVE_X11_XUTIL_H 1",
+    "#undef HAVE_XLOCALE_H": "#define HAVE_XLOCALE_H 1",
+    "#undef HAVE___PROGNAME": "#define HAVE___PROGNAME 1",
+    "#undef HWLOC_C_HAVE_VISIBILITY": "#define HWLOC_C_HAVE_VISIBILITY 1",
+    "#undef HWLOC_HAVE_ATTRIBUTE": "#define HWLOC_HAVE_ATTRIBUTE 1",
+    "#undef HWLOC_HAVE_ATTRIBUTE_ALIGNED": "#define HWLOC_HAVE_ATTRIBUTE_ALIGNED 1",
+    "#undef HWLOC_HAVE_ATTRIBUTE_ALWAYS_INLINE": "#define HWLOC_HAVE_ATTRIBUTE_ALWAYS_INLINE 1",
+    "#undef HWLOC_HAVE_ATTRIBUTE_COLD": "#define HWLOC_HAVE_ATTRIBUTE_COLD 1",
+    "#undef HWLOC_HAVE_ATTRIBUTE_CONST": "#define HWLOC_HAVE_ATTRIBUTE_CONST 1",
+    "#undef HWLOC_HAVE_ATTRIBUTE_DEPRECATED": "#define HWLOC_HAVE_ATTRIBUTE_DEPRECATED 1",
+    "#undef HWLOC_HAVE_ATTRIBUTE_FORMAT": "#define HWLOC_HAVE_ATTRIBUTE_FORMAT 1",
+    "#undef HWLOC_HAVE_ATTRIBUTE_HOT": "#define HWLOC_HAVE_ATTRIBUTE_HOT 1",
+    "#undef HWLOC_HAVE_ATTRIBUTE_MALLOC": "#define HWLOC_HAVE_ATTRIBUTE_MALLOC 1",
+    "#undef HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS": "#define HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS 1",
+    "#undef HWLOC_HAVE_ATTRIBUTE_NONNULL": "#define HWLOC_HAVE_ATTRIBUTE_NONNULL 1",
+    "#undef HWLOC_HAVE_ATTRIBUTE_NORETURN": "#define HWLOC_HAVE_ATTRIBUTE_NORETURN 1",
+    "#undef HWLOC_HAVE_ATTRIBUTE_NO_INSTRUMENT_FUNCTION": "#define HWLOC_HAVE_ATTRIBUTE_NO_INSTRUMENT_FUNCTION 1",
+    "#undef HWLOC_HAVE_ATTRIBUTE_PACKED": "#define HWLOC_HAVE_ATTRIBUTE_PACKED 1",
+    "#undef HWLOC_HAVE_ATTRIBUTE_PURE": "#define HWLOC_HAVE_ATTRIBUTE_PURE 1",
+    "#undef HWLOC_HAVE_ATTRIBUTE_SENTINEL": "#define HWLOC_HAVE_ATTRIBUTE_SENTINEL 1",
+    "#undef HWLOC_HAVE_ATTRIBUTE_UNUSED": "#define HWLOC_HAVE_ATTRIBUTE_UNUSED 1",
+    "#undef HWLOC_HAVE_ATTRIBUTE_WARN_UNUSED_RESULT": "#define HWLOC_HAVE_ATTRIBUTE_WARN_UNUSED_RESULT 1",
+    "#undef HWLOC_HAVE_ATTRIBUTE_WEAK_ALIAS": "#define HWLOC_HAVE_ATTRIBUTE_WEAK_ALIAS 1",
+    "#undef HWLOC_HAVE_CPU_SET": "#define HWLOC_HAVE_CPU_SET 1",
+    "#undef HWLOC_HAVE_CPU_SET_S": "#define HWLOC_HAVE_CPU_SET_S 1",
+    "#undef HWLOC_HAVE_DECL_FFS": "#define HWLOC_HAVE_DECL_FFS 1",
+    "#undef HWLOC_HAVE_DECL_FFSL": "#define HWLOC_HAVE_DECL_FFSL 1",
+    "#undef HWLOC_HAVE_DECL_STRCASECMP": "#define HWLOC_HAVE_DECL_STRCASECMP 1",
+    "#undef HWLOC_HAVE_DECL_STRNCASECMP": "#define HWLOC_HAVE_DECL_STRNCASECMP 1",
+    "#undef HWLOC_HAVE_FFS": "#define HWLOC_HAVE_FFS 1",
+    "#undef HWLOC_HAVE_FFSL": "#define HWLOC_HAVE_FFSL 1",
+    "#undef HWLOC_HAVE_LIBTERMCAP": "#define HWLOC_HAVE_LIBTERMCAP 1",
+    "#undef HWLOC_HAVE_LINUXIO": "#define HWLOC_HAVE_LINUXIO 1",
+    "#undef HWLOC_HAVE_PTHREAD_MUTEX": "#define HWLOC_HAVE_PTHREAD_MUTEX 1",
+    "#undef HWLOC_HAVE_SCHED_SETAFFINITY": "#define HWLOC_HAVE_SCHED_SETAFFINITY 1",
+    "#undef HWLOC_HAVE_STDINT_H": "#define HWLOC_HAVE_STDINT_H 1",
+    "#undef HWLOC_HAVE_SYSCALL": "#define HWLOC_HAVE_SYSCALL 1",
+    "#undef HWLOC_HAVE_X11_KEYSYM": "#define HWLOC_HAVE_X11_KEYSYM 1",
+    "#undef HWLOC_HAVE_X86_CPUID": "#define HWLOC_HAVE_X86_CPUID 1",
+    "#undef HWLOC_LINUX_SYS": "#define HWLOC_LINUX_SYS 1",
+    "#undef HWLOC_SIZEOF_UNSIGNED_INT": "#define HWLOC_SIZEOF_UNSIGNED_INT 4",
+    "#undef HWLOC_SIZEOF_UNSIGNED_LONG": "#define HWLOC_SIZEOF_UNSIGNED_LONG 8",
+    "#undef HWLOC_SYM_PREFIX": "#define HWLOC_SYM_PREFIX hwloc_",
+    "#undef HWLOC_SYM_PREFIX_CAPS": "#define HWLOC_SYM_PREFIX_CAPS HWLOC_",
+    "#undef HWLOC_SYM_TRANSFORM": "#define HWLOC_SYM_TRANSFORM 0",
+    "#undef HWLOC_USE_NCURSES": "#define HWLOC_USE_NCURSES 1",
+    "#undef HWLOC_VERSION": "#define HWLOC_VERSION \"2.0.3\"",
+    "#undef HWLOC_VERSION_GREEK": "#define HWLOC_VERSION_GREEK \"\"",
+    "#undef HWLOC_VERSION_MAJOR": "#define HWLOC_VERSION_MAJOR 2",
+    "#undef HWLOC_VERSION_MINOR": "#define HWLOC_VERSION_MINOR 0",
+    "#undef HWLOC_VERSION_RELEASE": "#define HWLOC_VERSION_RELEASE 3",
+    "#undef HWLOC_X86_64_ARCH": "#define HWLOC_X86_64_ARCH 1",
+    "#undef LT_OBJDIR": "#define LT_OBJDIR \".libs/\"",
+    "#undef PACKAGE": "#define PACKAGE \"hwloc\"",
+    "#undef PACKAGE_BUGREPORT": "#define PACKAGE_BUGREPORT \"http://github.com/open-mpi/hwloc/i",
+    "#undef PACKAGE_NAME": "#define PACKAGE_NAME \"hwloc\"",
+    "#undef PACKAGE_STRING": "#define PACKAGE_STRING \"hwloc 2.0.3\"",
+    "#undef PACKAGE_TARNAME": "#define PACKAGE_TARNAME \"hwloc\"",
+    "#undef PACKAGE_URL": "#define PACKAGE_URL \"\"",
+    "#undef PACKAGE_VERSION": "#define PACKAGE_VERSION \"2.0.3\"",
+    "#undef SIZEOF_UNSIGNED_INT": "#define SIZEOF_UNSIGNED_INT 4",
+    "#undef SIZEOF_UNSIGNED_LONG": "#define SIZEOF_UNSIGNED_LONG 8",
+    "#undef SIZEOF_VOID_P": "#define SIZEOF_VOID_P 8",
+    "#undef STDC_HEADERS": "#define STDC_HEADERS 1",
+    "# undef _HPUX_SOURCE": "# define _HPUX_SOURCE 1",
+    "# undef _ALL_SOURCE": "# define _ALL_SOURCE 1",
+    "# undef _GNU_SOURCE": "# define _GNU_SOURCE 1",
+    "# undef _POSIX_PTHREAD_SEMANTICS": "# define _POSIX_PTHREAD_SEMANTICS 1",
+    "# undef _TANDEM_SOURCE": "# define _TANDEM_SOURCE 1",
+    "# undef __EXTENSIONS__": "# define __EXTENSIONS__ 1",
+    "#undef VERSION": "#define VERSION \"2.0.3\"",
+    "#undef _HPUX_SOURCE": "#define _HPUX_SOURCE 1",
+    "#undef hwloc_pid_t": "#define hwloc_pid_t pid_t",
+    "#undef hwloc_thread_t": "#define hwloc_thread_t pthread_t",
+}
+
+_INCLUDE_PRIVATE_HWLOC_AUTOIGEN_CONFIG_H_CUDA_SUBS = {
+    "#undef HAVE_CUDA": "#undef HAVE_CUDA 1",
+    "#undef HAVE_CUDA_H": "#undef HAVE_CUDA_H 1",
+    "#undef HAVE_CUDA_RUNTIME_API_H": "#undef HAVE_CUDA_RUNTIME_API_H 1",
+} + _INCLUDE_PRIVATE_HWLOC_AUTOIGEN_CONFIG_H_COMMON_SUBS
+
+template_rule(
+    name = "include_private_hwloc_autogen__config_h",
+    src = "include/private/autogen/config.h.in",
+    out = "include/private/autogen/config.h",
+    substitutions = if_cuda(
+        _INCLUDE_PRIVATE_HWLOC_AUTOIGEN_CONFIG_H_CUDA_SUBS,
+        if_false = _INCLUDE_PRIVATE_HWLOC_AUTOIGEN_CONFIG_H_COMMON_SUBS,
+    ),
+)
+
+template_rule(
+    name = "move_static_components_h",
+    src = "@org_tensorflow//third_party/hwloc:static-components.h",
+    out = "hwloc/static-components.h",
+    substitutions = {},
+)
+
+cc_library(
+    name = "hwloc",
+    srcs = [
+        "hwloc/base64.c",
+        "hwloc/bind.c",
+        "hwloc/bitmap.c",
+        "hwloc/components.c",
+        "hwloc/diff.c",
+        "hwloc/distances.c",
+        "hwloc/misc.c",
+        "hwloc/pci-common.c",
+        "hwloc/shmem.c",
+        "hwloc/static-components.h",
+        "hwloc/topology.c",
+        "hwloc/topology-hardwired.c",
+        "hwloc/topology-linux.c",
+        "hwloc/topology-noos.c",
+        "hwloc/topology-synthetic.c",
+        "hwloc/topology-x86.c",
+        "hwloc/topology-xml.c",
+        "hwloc/topology-xml-nolibxml.c",
+        "hwloc/traversal.c",
+        "include/hwloc/linux.h",
+        "include/hwloc/plugins.h",
+        "include/hwloc/shmem.h",
+        "include/private/autogen/config.h",
+        "include/private/components.h",
+        "include/private/cpuid-x86.h",
+        "include/private/debug.h",
+        "include/private/internal-components.h",
+        "include/private/misc.h",
+        "include/private/private.h",
+        "include/private/xml.h",
+    ],
+    hdrs = [
+        "include/hwloc.h",
+        "include/hwloc/autogen/config.h",
+        "include/hwloc/bitmap.h",
+        "include/hwloc/deprecated.h",
+        "include/hwloc/diff.h",
+        "include/hwloc/distances.h",
+        "include/hwloc/export.h",
+        "include/hwloc/helper.h",
+        "include/hwloc/inlines.h",
+        "include/hwloc/rename.h",
+    ],
+    copts = COMMON_INCLUDE_COPTS + DISABLE_WARNINGS_COPTS + VAR_SETTINGS_COPTS,
+    features = [
+        "-parse_headers",
+        "-layering_check",
+    ],
+    includes = [
+        "hwloc",
+        "include",
+    ],
+    deps = [],
+)
+
+cc_binary(
+    name = "hwloc_print",
+    srcs = ["hwloc_print.cc"],
+    copts = COMMON_INCLUDE_COPTS,
+    deps = [
+        ":hwloc",
+    ],
+)
diff --git a/third_party/hwloc/static-components.h b/third_party/hwloc/static-components.h
new file mode 100644
index 0000000000000000000000000000000000000000..8cae42a9c1faf7343f1eb1785ae725497034d8c2
--- /dev/null
+++ b/third_party/hwloc/static-components.h
@@ -0,0 +1,26 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_HWLOC_STATIC_COMPONENTS_H_
+#define THIRD_PARTY_HWLOC_STATIC_COMPONENTS_H_
+
+#include <private/internal-components.h>
+static const struct hwloc_component* hwloc_static_components[] = {
+    &hwloc_noos_component,      &hwloc_xml_component,
+    &hwloc_synthetic_component, &hwloc_xml_nolibxml_component,
+    &hwloc_linux_component,     &hwloc_linuxio_component,
+    &hwloc_x86_component,       NULL};
+
+#endif  // THIRD_PARTY_HWLOC_STATIC_COMPONENTS_H_
diff --git a/third_party/hwloc/workspace.bzl b/third_party/hwloc/workspace.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..47a143c8a0e0cee70b1c9087f78170adabe40ed9
--- /dev/null
+++ b/third_party/hwloc/workspace.bzl
@@ -0,0 +1,15 @@
+"""loads the hwloc library, used by TF."""
+
+load("//third_party:repo.bzl", "third_party_http_archive")
+
+def repo():
+    third_party_http_archive(
+        name = "hwloc",
+        urls = [
+            "http://mirror.bazel.build/download.open-mpi.org/release/hwloc/v2.0/hwloc-2.0.3.tar.gz",
+            "https://download.open-mpi.org/release/hwloc/v2.0/hwloc-2.0.3.tar.gz",
+        ],
+        sha256 = "64def246aaa5b3a6e411ce10932a22e2146c3031b735c8f94739534f06ad071c",
+        strip_prefix = "hwloc-2.0.3",
+        build_file = "//third_party/hwloc:BUILD.bazel",
+    )
diff --git a/third_party/icu/BUILD.system b/third_party/icu/BUILD.system
index 328e412a8c29f6f7c2f5ecc5b6e8bbec7613972c..8a88a6ef7e0a51448e5c6157be2c277a60c53198 100644
--- a/third_party/icu/BUILD.system
+++ b/third_party/icu/BUILD.system
@@ -1,13 +1,19 @@
+package(
+    default_visibility = ["//visibility:public"],
+)
+
 licenses(["notice"])  # Apache 2.0
 
 filegroup(
     name = "icu4c/LICENSE",
-    visibility = ["//visibility:public"],
 )
 
 filegroup(
     name = "icu4j/main/shared/licenses/LICENSE",
-    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "headers",
 )
 
 cc_library(
@@ -15,7 +21,6 @@ cc_library(
     deps = [
         ":icuuc",
     ],
-    visibility = ["//visibility:public"],
 )
 
 cc_library(
diff --git a/third_party/keras_applications_archive/BUILD.system b/third_party/keras_applications_archive/BUILD.system
new file mode 100644
index 0000000000000000000000000000000000000000..a3b58f15030bb0648f73064c214b939856961d90
--- /dev/null
+++ b/third_party/keras_applications_archive/BUILD.system
@@ -0,0 +1,13 @@
+# Description: Keras Applications: set of pre-trained deep learning models.
+
+licenses(["notice"])  # MIT
+
+filegroup(
+    name = "LICENSE",
+    visibility = ["//visibility:public"],
+)
+
+py_library(
+    name = "keras_applications",
+    visibility = ["//visibility:public"],
+)
diff --git a/third_party/keras_applications_archive/workspace.bzl b/third_party/keras_applications_archive/workspace.bzl
index e90630fa974fb97f4c7d5a72c045a44c237a6ace..cf9d15ca28874439d5d8e78f87d8b502908d07fe 100644
--- a/third_party/keras_applications_archive/workspace.bzl
+++ b/third_party/keras_applications_archive/workspace.bzl
@@ -12,4 +12,5 @@ def repo():
             "https://github.com/keras-team/keras-applications/archive/1.0.6.tar.gz",
         ],
         build_file = "//third_party/keras_applications_archive:BUILD.bazel",
+        system_build_file = "//third_party/keras_applications_archive:BUILD.system",
     )
diff --git a/third_party/llvm/llvm.autogenerated.BUILD b/third_party/llvm/llvm.autogenerated.BUILD
index eb468aa65fce9c014bc7b53f1fb69729eb2a3718..f0ee086a7e0342fc884d81bfe5393c8cf9793e8d 100644
--- a/third_party/llvm/llvm.autogenerated.BUILD
+++ b/third_party/llvm/llvm.autogenerated.BUILD
@@ -646,6 +646,7 @@ cc_library(
         ":amdgpu_asm_printer",
         ":amdgpu_info",
         ":amdgpu_utils",
+        ":binary_format",
         ":config",
         ":core",
         ":mc",
@@ -793,6 +794,7 @@ cc_library(
         ":amdgpu_utils",
         ":analysis",
         ":asm_printer",
+        ":binary_format",
         ":code_gen",
         ":config",
         ":core",
@@ -2241,7 +2243,6 @@ cc_library(
     deps = [
         ":code_gen",
         ":config",
-        ":core",
         ":support",
     ],
 )
diff --git a/third_party/mkl/mkl.BUILD b/third_party/mkl/mkl.BUILD
index c3a71e4ff9b33a06a87f0f90978eaf3a718c7de6..3f3c9e9336afa551ed904150319a92e378288a5f 100644
--- a/third_party/mkl/mkl.BUILD
+++ b/third_party/mkl/mkl.BUILD
@@ -12,7 +12,7 @@ filegroup(
 
 cc_library(
     name = "mkl_headers",
-    srcs = glob(["include/*"]),
+    srcs = glob(["include/*(.cc|.cpp|.cxx|.c++|.C|.c|.h|.hh|.hpp|.ipp|.hxx|.inc|.S|.s|.asm|.a|.lib|.pic.a|.lo|.lo.lib|.pic.lo|.so|.dylib|.dll|.o|.obj|.pic.o)"]),
     includes = ["include"],
     visibility = ["//visibility:public"],
 )
diff --git a/third_party/mkl_dnn/mkldnn.BUILD b/third_party/mkl_dnn/mkldnn.BUILD
index d80c7135d6fd47f45a00b35bb29ceae0c0d1d003..bd842b87f8d28941072b1d11fb4ab6d3c54c28e0 100644
--- a/third_party/mkl_dnn/mkldnn.BUILD
+++ b/third_party/mkl_dnn/mkldnn.BUILD
@@ -17,8 +17,12 @@ cc_library(
     name = "mkl_dnn",
     srcs = glob([
         "src/common/*.cpp",
+        "src/common/*.hpp",
         "src/cpu/*.cpp",
+        "src/cpu/*.hpp",
         "src/cpu/gemm/*.cpp",
+        "src/cpu/gemm/*.hpp",
+        "src/cpu/xbyak/*.h",
     ]),
     hdrs = glob(["include/*"]),
     copts = [
@@ -68,8 +72,12 @@ cc_library(
     name = "mkldnn_single_threaded",
     srcs = glob([
         "src/common/*.cpp",
+        "src/common/*.hpp",
         "src/cpu/*.cpp",
+        "src/cpu/*.hpp",
         "src/cpu/gemm/*.cpp",
+        "src/cpu/gemm/*.hpp",
+        "src/cpu/xbyak/*.h",
     ]),
     hdrs = glob(["include/*"]),
     copts = [
diff --git a/third_party/nccl/archive.BUILD b/third_party/nccl/archive.BUILD
index 7a08f97ef328a7a731d7c76de8bda70c8d004dac..5901c6b296fa0f4da8061b2b44daed18cd0b3558 100644
--- a/third_party/nccl/archive.BUILD
+++ b/third_party/nccl/archive.BUILD
@@ -1,174 +1,106 @@
 # NVIDIA NCCL 2
 # A package of optimized primitives for collective multi-GPU communication.
 
-licenses(["restricted"])
+licenses(["notice"])
 
 exports_files(["LICENSE.txt"])
 
 load(
     "@local_config_nccl//:build_defs.bzl",
-    "gen_nccl_h",
-    "nccl_library",
-    "rdc_copts",
-    "rdc_library",
-)
-load(
-    "@local_config_cuda//cuda:build_defs.bzl",
-    "cuda_default_copts",
-)
-
-# Generate the nccl.h header file.
-gen_nccl_h(
-    name = "nccl_h",
-    output = "src/nccl.h",
-    template = "src/nccl.h.in",
+    "cuda_rdc_library",
+    "gen_device_srcs",
+    "process_srcs",
 )
+load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts")
 
-nccl_library(
+cc_library(
     name = "src_hdrs",
-    hdrs = [
-        "src/nccl.h",
-        # src/include/common_coll.h #includes "collectives/collectives.h".
-        # All other #includes of collectives.h are patched in process_srcs.
+    hdrs = process_srcs([
         "src/collectives/collectives.h",
-    ],
-    strip_include_prefix = "src",
+        "src/nccl.h.in",
+    ]),
 )
 
-nccl_library(
+cc_library(
     name = "include_hdrs",
-    hdrs = glob(["src/include/*.h"]),
-    strip_include_prefix = "src/include",
-)
-
-filegroup(
-    name = "device_hdrs",
-    srcs = glob(["src/collectives/device/*.h"]),
+    hdrs = process_srcs(glob(["src/include/*.h"])),
+    strip_include_prefix = "include",
 )
 
-filegroup(
-    name = "device_srcs",
-    srcs = [
-        "src/collectives/device/all_gather.cu",
-        "src/collectives/device/all_reduce.cu",
-        "src/collectives/device/broadcast.cu",
-        "src/collectives/device/reduce.cu",
-        "src/collectives/device/reduce_scatter.cu",
-    ],
-)
+device_srcs = process_srcs([
+    "src/collectives/device/all_gather.cu",
+    "src/collectives/device/all_reduce.cu",
+    "src/collectives/device/broadcast.cu",
+    "src/collectives/device/reduce.cu",
+    "src/collectives/device/reduce_scatter.cu",
+])
 
-nccl_library(
+# NCCL compiles the same source files with different NCCL_OP defines. RDC
+# compilation requires that each compiled module has a unique ID. Clang derives
+# the module ID from the path only so we need to rename the files to get
+# different IDs for different parts of compilation. NVCC does not have that
+# problem because it generates IDs based on preprocessed content.
+gen_device_srcs(
     name = "sum",
-    srcs = [
-        ":device_hdrs",
-        ":device_srcs",
-    ],
-    copts = ["-DNCCL_OP=0"] + rdc_copts(),
-    linkstatic = True,
-    prefix = "sum_",
-    deps = [
-        ":include_hdrs",
-        ":src_hdrs",
-        "@local_config_cuda//cuda:cuda_headers",
-    ],
+    srcs = device_srcs,
+    NCCL_OP = 0,
 )
 
-nccl_library(
+gen_device_srcs(
     name = "prod",
-    srcs = [
-        ":device_hdrs",
-        ":device_srcs",
-    ],
-    copts = ["-DNCCL_OP=1"] + rdc_copts(),
-    linkstatic = True,
-    prefix = "_prod",
-    deps = [
-        ":include_hdrs",
-        ":src_hdrs",
-        "@local_config_cuda//cuda:cuda_headers",
-    ],
+    srcs = device_srcs,
+    NCCL_OP = 1,
 )
 
-nccl_library(
+gen_device_srcs(
     name = "min",
-    srcs = [
-        ":device_hdrs",
-        ":device_srcs",
-    ],
-    copts = ["-DNCCL_OP=2"] + rdc_copts(),
-    linkstatic = True,
-    prefix = "min_",
-    deps = [
-        ":include_hdrs",
-        ":src_hdrs",
-        "@local_config_cuda//cuda:cuda_headers",
-    ],
+    srcs = device_srcs,
+    NCCL_OP = 2,
 )
 
-nccl_library(
+gen_device_srcs(
     name = "max",
-    srcs = [
-        ":device_hdrs",
-        ":device_srcs",
-    ],
-    copts = ["-DNCCL_OP=3"] + rdc_copts(),
-    linkstatic = True,
-    prefix = "max_",
-    deps = [
-        ":include_hdrs",
-        ":src_hdrs",
-        "@local_config_cuda//cuda:cuda_headers",
-    ],
+    srcs = device_srcs,
+    NCCL_OP = 3,
 )
 
-nccl_library(
-    name = "functions",
+cuda_rdc_library(
+    name = "device",
     srcs = [
-        "src/collectives/device/functions.cu",
-        ":device_hdrs",
-    ],
-    copts = rdc_copts(),
-    linkstatic = True,
-    deps = [
-        ":include_hdrs",
-        ":src_hdrs",
-        "@local_config_cuda//cuda:cuda_headers",
-    ],
-)
-
-rdc_library(
-    name = "device_code",
-    deps = [
-        ":functions",
         ":max",
         ":min",
         ":prod",
         ":sum",
+    ] + process_srcs(glob([
+        "src/collectives/device/*.h",
+        "src/collectives/device/functions.cu",
+    ])),
+    deps = [
+        ":include_hdrs",
+        ":src_hdrs",
     ],
 )
 
 # Primary NCCL target.
-nccl_library(
+cc_library(
     name = "nccl",
-    srcs = glob(
+    srcs = process_srcs(glob(
         include = ["src/**/*.cu"],
         # Exclude device-library code.
         exclude = ["src/collectives/device/**"],
-    ) + [
+    )) + [
         # Required for header inclusion checking (see
         # http://docs.bazel.build/versions/master/be/c-cpp.html#hdrs).
-        # Files in src/ which #include "nccl.h" load it from there rather than
-        # from the virtual includes directory.
-        "src/nccl.h",
+        "nccl.h",
+        "collectives/collectives.h",
     ],
-    hdrs = ["src/nccl.h"],
-    copts = cuda_default_copts(),
+    hdrs = ["nccl.h"],
+    copts = cuda_default_copts() + ["-Wno-vla"],
     include_prefix = "third_party/nccl",
-    strip_include_prefix = "src",
     visibility = ["//visibility:public"],
     deps = [
-        ":device_code",
+        ":device",
         ":include_hdrs",
-        ":src_hdrs",
+        "@local_config_cuda//cuda:cudart_static",
     ],
 )
diff --git a/third_party/nccl/build_defs.bzl.tpl b/third_party/nccl/build_defs.bzl.tpl
index 42de79c411c844d48982c47753337102b915aefd..245f180a91b6f2661fc35d834aa13f9347b1f330 100644
--- a/third_party/nccl/build_defs.bzl.tpl
+++ b/third_party/nccl/build_defs.bzl.tpl
@@ -1,87 +1,97 @@
 """Repository rule for NCCL."""
 
 load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts")
+load("@bazel_tools//tools/cpp:toolchain_utils.bzl", "find_cpp_toolchain")
 
-def _gen_nccl_h_impl(ctx):
-    """Creates nccl.h from a template."""
+def _process_src_impl(ctx):
+    """Applies various patches to the NCCL source."""
+    substitutions = {
+        "\"collectives.h": "\"collectives/collectives.h",
+        "\"../collectives.h": "\"collectives/collectives.h",
+        # Clang does not define __CUDACC_VER_*__, use CUDA_VERSION instead.
+        # TODO(csigg): Apply substitutions upstream and remove here.
+        "#if __CUDACC_VER_MAJOR__ >= 10 || (__CUDACC_VER_MAJOR__ >= 9 && __CUDACC_VER_MINOR__ >= 2)": "#if CUDART_VERSION >= 9200",
+        "#if __CUDACC_VER_MAJOR__ >= 10": "#if CUDART_VERSION >= 10000",
+        "#if __CUDACC_VER_MAJOR__ >= 9": "#if CUDART_VERSION >= 9000",
+        "#if __CUDACC_VER_MAJOR__ < 9": "#if CUDART_VERSION < 9000",
+        "nullptr_t": "std::nullptr_t",
+    }
+    if ctx.file.src.basename == "nccl.h.in":
+        substitutions.update({
+          "${nccl:Major}": "2",
+          "${nccl:Minor}": "3",
+          "${nccl:Patch}": "5",
+          "${nccl:Suffix}": "",
+          "${nccl:Version}": "2305",
+        })
+    if ctx.file.src.basename == "function.cu":
+        substitutions.update({
+            # Don't try to initialize the host shadow copy of this device-side
+            # global variable. There is no host pointer to a device-side
+            # function, which confuses clang.
+            # TODO(csigg): remove when fixed in clang.
+            "NCCL_FUNCS2B(ncclBroadcast),": "#if __CUDA_ARCH__\nNCCL_FUNCS2B(ncclBroadcast),",
+            "NCCL_FUNCS2A(ncclAllReduce)": "NCCL_FUNCS2A(ncclAllReduce)\n#endif",
+        })
     ctx.actions.expand_template(
-        output = ctx.outputs.output,
-        template = ctx.file.template,
-        substitutions = {
-            "${nccl:Major}": "2",
-            "${nccl:Minor}": "3",
-            "${nccl:Patch}": "5",
-            "${nccl:Suffix}": "",
-            "${nccl:Version}": "2305",
-        },
+        output = ctx.outputs.out,
+        template = ctx.file.src,
+        substitutions = substitutions,
     )
 
-gen_nccl_h = rule(
-    implementation = _gen_nccl_h_impl,
+_process_src = rule(
+    implementation = _process_src_impl,
     attrs = {
-        "template": attr.label(allow_single_file = True),
-        "output": attr.output(),
+        "src": attr.label(allow_single_file = True),
+        "out": attr.output(),
     },
 )
-"""Creates the NCCL header file."""
+"""Processes one NCCL source file so it can be compiled with bazel and clang."""
+
+def _out(src):
+    if not src.startswith("src/"):
+      fail("Source file not under src/...:", src)
+    src = src[4:]  # Strip 'src/'
+    if src == "nccl.h.in":
+      return "nccl.h"
+    if src.endswith(".cu"):
+      return src + ".cc"
+    return src
+
+def process_srcs(srcs):
+    """Processes files under src/ and copies them to the parent directory."""
+    [_process_src(
+      name = "_" + src,
+      src = src,
+      out = _out(src),
+    ) for src in srcs]
+    return ["_" + src for src in srcs]
 
-def _process_srcs_impl(ctx):
-    """Appends .cc to .cu files, patches include directives."""
+def _gen_device_srcs_impl(ctx):
     files = []
     for src in ctx.files.srcs:
-        if not src.is_source:
-            # Process only once, specifically "src/nccl.h".
-            files.append(src)
-            continue
-        name = src.basename
-        if src.extension == "cu":
-            name = ctx.attr.prefix + name + ".cc"
+        name = "%s_%s" % (ctx.attr.name, src.basename)
         file = ctx.actions.declare_file(name, sibling = src)
         ctx.actions.expand_template(
             output = file,
             template = src,
             substitutions = {
-                "\"collectives.h": "\"collectives/collectives.h",
-                "\"../collectives.h": "\"collectives/collectives.h",
-                "#if __CUDACC_VER_MAJOR__": "#if defined __CUDACC_VER_MAJOR__ && __CUDACC_VER_MAJOR__",
-                # Substitutions are applied in order.
-                "std::nullptr_t": "nullptr_t",
-                "nullptr_t": "std::nullptr_t",
+                "#define UNROLL 4": "#define UNROLL 4\n#define NCCL_OP %d" % ctx.attr.NCCL_OP,
             },
         )
         files.append(file)
     return [DefaultInfo(files = depset(files))]
 
-_process_srcs = rule(
-    implementation = _process_srcs_impl,
+gen_device_srcs = rule(
+    implementation = _gen_device_srcs_impl,
     attrs = {
         "srcs": attr.label_list(allow_files = True),
-        "prefix": attr.string(default = ""),
+        "NCCL_OP": attr.int(),
     },
 )
-"""Processes the NCCL srcs so they can be compiled with bazel and clang."""
-
-def nccl_library(name, srcs = None, hdrs = None, prefix = None, **kwargs):
-    """Processes the srcs and hdrs and creates a cc_library."""
-
-    _process_srcs(
-        name = name + "_srcs",
-        srcs = srcs,
-        prefix = prefix,
-    )
-    _process_srcs(
-        name = name + "_hdrs",
-        srcs = hdrs,
-    )
+"""Adds prefix to each file name in srcs and adds #define NCCL_OP."""
 
-    native.cc_library(
-        name = name,
-        srcs = [name + "_srcs"] if srcs else [],
-        hdrs = [name + "_hdrs"] if hdrs else [],
-        **kwargs
-    )
-
-def rdc_copts():
+def _rdc_copts():
     """Returns copts for compiling relocatable device code."""
 
     # The global functions can not have a lower register count than the
@@ -100,118 +110,255 @@ def rdc_copts():
             "-fcuda-rdc",
             "-Xcuda-ptxas",
             maxrregcount,
+            # Work around for clang bug (fixed in r348662), declaring
+            # '__device__ operator delete(void*, std::size_t)' non-inline.
+            # TODO(csigg): Only add this option for older clang versions.
+            "-std=gnu++11",
         ],
         "//conditions:default": [],
-    }) + ["-fvisibility=hidden"]
+    })
 
-def _filter_impl(ctx):
-    suffix = ctx.attr.suffix
-    files = [src for src in ctx.files.srcs if src.path.endswith(suffix)]
-    return [DefaultInfo(files = depset(files))]
+def _lookup_file(filegroup, path):
+    """Extracts file at (relative) path in filegroup."""
+    for file in filegroup.files:
+        if file.path.endswith(path):
+            return file
+    return None
 
-_filter = rule(
-    implementation = _filter_impl,
-    attrs = {
-        "srcs": attr.label_list(allow_files = True),
-        "suffix": attr.string(),
-    },
-)
-"""Filters the srcs to the ones ending with suffix."""
+def _pic_only(files):
+    """Returns the PIC files if there are any in 'files', otherwise 'files'."""
+    pic_only = [f for f in files if f.basename.find(".pic.") >= 0]
+    return pic_only if pic_only else files
+
+def _device_link_impl(ctx):
+    if not ctx.attr.gpu_archs:
+        fail("No GPU architecture specified. NCCL requires --config=cuda or similar.")
+
+    inputs = []
+    for dep in ctx.attr.deps:
+        inputs += dep.files.to_list()
+    inputs = _pic_only(inputs)
 
-def _gen_link_src_impl(ctx):
+    # Device-link to cubins for each architecture.
+    name = ctx.attr.name
+    register_h = None
+    cubins = []
+    images = []
+    for arch in ctx.attr.gpu_archs:
+        cubin = ctx.actions.declare_file("%s_%s.cubin" % (name, arch))
+        register_h = ctx.actions.declare_file("%s_register_%s.h" % (name, arch))
+        ctx.actions.run(
+            outputs = [register_h, cubin],
+            inputs = inputs,
+            executable = ctx.file._nvlink,
+            arguments = ctx.attr.nvlink_args + [
+                "--arch=%s" % arch,
+                "--register-link-binaries=%s" % register_h.path,
+                "--output-file=%s" % cubin.path,
+            ] + [file.path for file in inputs],
+            mnemonic = "nvlink",
+        )
+        cubins.append(cubin)
+        images.append("--image=profile=%s,file=%s" % (arch, cubin.path))
+
+    # Generate fatbin header from all cubins.
+    tmp_fatbin = ctx.actions.declare_file("%s.fatbin" % name)
+    fatbin_h = ctx.actions.declare_file("%s_fatbin.h" % name)
+    bin2c = ctx.file._bin2c
+    ctx.actions.run(
+        outputs = [tmp_fatbin, fatbin_h],
+        inputs = cubins,
+        executable = ctx.file._fatbinary,
+        arguments = [
+            "-64",
+            "--cmdline=--compile-only",
+            "--link",
+            "--compress-all",
+            "--bin2c-path=%s" % bin2c.dirname,
+            "--create=%s" % tmp_fatbin.path,
+            "--embedded-fatbin=%s" % fatbin_h.path,
+        ] + images,
+        tools = [bin2c],
+        mnemonic = "fatbinary",
+    )
+
+    # Generate the source file #including the headers generated above.
     ctx.actions.expand_template(
-        output = ctx.outputs.output,
-        template = ctx.file.template,
+        output = ctx.outputs.out,
+        template = ctx.file._link_stub,
         substitutions = {
-            "REGISTERLINKBINARYFILE": '"%s"' % ctx.file.register_hdr.short_path,
-            "FATBINFILE": '"%s"' % ctx.file.fatbin_hdr.short_path,
+            "REGISTERLINKBINARYFILE": '"%s"' % register_h.short_path,
+            "FATBINFILE": '"%s"' % fatbin_h.short_path,
         },
     )
 
-_gen_link_src = rule(
-    implementation = _gen_link_src_impl,
+    return [DefaultInfo(files = depset([register_h, fatbin_h]))]
+
+_device_link = rule(
+    implementation = _device_link_impl,
     attrs = {
-        "register_hdr": attr.label(allow_single_file = True),
-        "fatbin_hdr": attr.label(allow_single_file = True),
-        "template": attr.label(allow_single_file = True),
-        "output": attr.output(),
+        "deps": attr.label_list(),
+        "out": attr.output(mandatory = True),
+        "gpu_archs": attr.string_list(),
+        "nvlink_args": attr.string_list(),
+        "_nvlink": attr.label(
+            default = Label("@local_config_cuda//cuda:cuda/bin/nvlink"),
+            allow_single_file = True,
+            executable = True,
+            cfg = "host",
+        ),
+        "_fatbinary": attr.label(
+            default = Label("@local_config_cuda//cuda:cuda/bin/fatbinary"),
+            allow_single_file = True,
+            executable = True,
+            cfg = "host",
+        ),
+        "_bin2c": attr.label(
+            default = Label("@local_config_cuda//cuda:cuda/bin/bin2c"),
+            allow_single_file = True,
+            executable = True,
+            cfg = "host",
+        ),
+        "_link_stub": attr.label(
+            default = Label("@local_config_cuda//cuda:cuda/bin/crt/link.stub"),
+            allow_single_file = True,
+        ),
     },
 )
-"""Patches the include directives for the link.stub file."""
-
-def rdc_library(name, deps):
-    """Produces a cc_library from deps containing relocatable device code."""
-
-    # From .a and .pic.a archives, just use the latter. Otherwise we get
-    # multiply defined symbols.
-    # TODO(csigg): C++ Sandwich once available should allow passing this target
-    # to a cc_library dependency, which would avoid the linking order issue.
-    _filter(
-        name = name + "_deps_a",
-        srcs = deps,
-        suffix = ".pic.a",
+"""Links device code and generates source code for kernel registration."""
+
+def _merge_archive_impl(ctx):
+    # Generate an mri script to the merge archives in srcs and pass it to 'ar'.
+    # See https://stackoverflow.com/a/23621751.
+    files = _pic_only(ctx.files.srcs)
+    mri_script = "create " + ctx.outputs.out.path
+    for f in files:
+        mri_script += "\\naddlib " + f.path
+    mri_script += "\\nsave\\nend"
+
+    cc_toolchain = find_cpp_toolchain(ctx)
+    ctx.actions.run_shell(
+        inputs = ctx.files.srcs,  # + ctx.files._crosstool,
+        outputs = [ctx.outputs.out],
+        command = ("printf \"%s\" " % mri_script +
+                   "| %s -M" % cc_toolchain.ar_executable),
     )
 
-    # Device-link to cubins for each architecture.
-    images = []
-    cubins = []
-    for arch in %{gpu_architectures}:
-        cubin = "%s_%s.cubin" % (name, arch)
-        register_hdr = "%s_%s.h" % (name, arch)
-        nvlink = "@local_config_nccl//:nvlink"
-        cmd = ("$(location %s) " % nvlink +
-               select({
-                   # NCCL is only supported on Linux.
-                   "@org_tensorflow//tensorflow:linux_x86_64": "--cpu-arch=X86_64 ",
-                   "@org_tensorflow//tensorflow:linux_ppc64le": "--cpu-arch=PPC64LE ",
-                   "//conditions:default": "",
-               }) +
-               "--arch=%s $(SRCS) " % arch +
-               "--register-link-binaries=$(location %s) " % register_hdr +
-               "--output-file=$(location %s)" % cubin)
-        native.genrule(
-            name = "%s_%s" % (name, arch),
-            outs = [register_hdr, cubin],
-            srcs = [name + "_deps_a"],
-            cmd = cmd,
-            tools = [nvlink],
-        )
-        images.append("--image=profile=%s,file=$(location %s)" % (arch, cubin))
-        cubins.append(cubin)
+_merge_archive = rule(
+    implementation = _merge_archive_impl,
+    attrs = {
+        "srcs": attr.label_list(mandatory = True, allow_files = True),
+        "_cc_toolchain": attr.label(default = "@bazel_tools//tools/cpp:current_cc_toolchain"),
+        # "_crosstool": attr.label_list(cfg = "host", default = ["@bazel_tools//tools/cpp:crosstool"]),
+    },
+    outputs = {"out": "lib%{name}.a"},
+)
+"""Merges srcs into a single archive."""
 
-    # Generate fatbin header from all cubins.
-    fatbin_hdr = name + ".fatbin.h"
-    fatbinary = "@local_config_nccl//:cuda/bin/fatbinary"
-    bin2c = "@local_config_nccl//:cuda/bin/bin2c"
-    cmd = ("$(location %s) -64 --cmdline=--compile-only " % fatbinary +
-           "--link --bin2c-path $$(dirname $(location %s)) " % bin2c +
-           "--compress-all %s --create=%%{name}.fatbin " % " ".join(images) +
-           "--embedded-fatbin=$@")
-    native.genrule(
-        name = name + "_fatbin_h",
-        outs = [fatbin_hdr],
-        srcs = cubins,
-        cmd = cmd,
-        tools = [fatbinary, bin2c],
+def cuda_rdc_library(name, hdrs = None, copts = None, linkstatic = True, **kwargs):
+    """Produces a cuda_library using separate compilation and linking.
+
+    CUDA separate compilation and linking allows device function calls across
+    translation units. This is different from the normal whole program
+    compilation where each translation unit contains all device code. For more
+    background, see
+    https://devblogs.nvidia.com/separate-compilation-linking-cuda-device-code/,
+    https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#nvcc-options-for-separate-compilation
+
+    During separate compilation, the different CUDA source files are compiled
+    to 'relocatable device code' (RDC) and embedded in the host object files.
+    When using nvcc, linking the device code for each supported GPU
+    architecture and generating kernel registration code for the CUDA runtime
+    is handled automatically. Clang supports generating relocatable device
+    code, but it can't link it. We therefore rely on tools provided by the CUDA
+    SDK to link the device code and generate the host code to register the
+    kernels.
+
+    The nvlink tool extracts the RDC code from the object files and links it
+    into cubin files, one per GPU architecture. It also produces a header file
+    with a list of kernel names to register. The cubins are merged into a
+    binary blob using the fatbinary tool, and converted to a C header file with
+    the help of the bin2c tool. The registration header file, the fatbinary
+    header file, and the link.stub file (shipped with the CUDA SDK) are
+    compiled as ordinary host code.
+
+    Here is a diagram of the CUDA separate compilation trajectory:
+
+     x.cu.cc    y.cu.cc
+           \    /            cc_library (compile RDC and archive)
+            xy.a
+           /    \            * nvlink
+    register.h  xy.cubin
+          :      |           * fatbinary and bin2c
+          :     xy.fatbin.h
+          :      :           * #include
+          dlink.cc           * Expanded from crt/dlink.stub template
+             |               cc_library (host compile and archive)
+          dlink.a
+
+    The steps marked with '*' are implemented in the _device_link rule.
+
+    The object files in both xy.a and dlink.a reference symbols defined in the
+    other archive. The separate archives are a side effect of using two
+    cc_library targets to implement a single compilation trajectory. We could
+    fix this once bazel supports C++ sandwich. For now, we just merge the two
+    archives to avoid unresolved symbols:
+
+    xy.a      dlink.a
+        \    /           merge archive
+      xy_dlink.a
+           |             cc_library (or alternatively, cc_import)
+     final target
+
+    Another complication is that cc_library produces (depending on the
+    configuration) both PIC and non-PIC archives, but the distinction
+    is hidden from Starlark until C++ sandwich becomes available. We work
+    around this by dropping the non-PIC files if PIC files are available.
+
+    Args:
+      name: Target name.
+      hdrs: Header files.
+      copts: Compiler options.
+      linkstatic: Must be true.
+      **kwargs: Any other arguments.
+    """
+
+    if not hdrs:
+        hdrs = []
+    if not copts:
+        copts = []
+
+    # Compile host and device code into library.
+    lib = name + "_lib"
+    native.cc_library(
+        name = lib,
+        hdrs = hdrs,
+        copts = _rdc_copts() + copts,
+        linkstatic = linkstatic,
+        **kwargs
     )
 
-    # Generate the source file #including the headers generated above.
-    _gen_link_src(
-        name = name + "_dlink_src",
-        # Include just the last one, they are equivalent.
-        register_hdr = register_hdr,
-        fatbin_hdr = fatbin_hdr,
-        template = "@local_config_nccl//:cuda/bin/crt/link.stub",
-        output = name + ".cc",
+    # Generate source file containing linked device code.
+    dlink_hdrs = name + "_dlink_hdrs"
+    dlink_cc = name + "_dlink.cc"
+    _device_link(
+        name = dlink_hdrs,
+        deps = [lib],
+        out = dlink_cc,
+        gpu_archs = %{gpu_architectures},
+        nvlink_args = select({
+            "@org_tensorflow//tensorflow:linux_x86_64": ["--cpu-arch=X86_64"],
+            "@org_tensorflow//tensorflow:linux_ppc64le": ["--cpu-arch=PPC64LE"],
+            "//conditions:default": [],
+        }),
     )
 
-    # Compile the source file into the cc_library.
+    # Compile the source file into a library.
+    dlink = name + "_dlink"
     native.cc_library(
-        name = name + "_dlink_a",
-        srcs = [
-            name + "_dlink_src",
-        ],
-        textual_hdrs = [register_hdr, fatbin_hdr],
+        name = dlink,
+        srcs = [dlink_cc],
+        textual_hdrs = [dlink_hdrs],
         deps = [
             "@local_config_cuda//cuda:cuda_headers",
         ],
@@ -222,31 +369,22 @@ def rdc_library(name, deps):
             "__NV_EXTRA_INITIALIZATION=",
             "__NV_EXTRA_FINALIZATION=",
         ],
-        linkstatic = True,
+        linkstatic = linkstatic,
     )
 
-    # Repackage deps into a single archive. This avoid unresolved symbols when
-    # the archives happen to be linked in the wrong order. For more details, see
+    # Repackage the two libs into a single archive. This is required because
+    # both libs reference symbols defined in the other one. For details, see
     # https://eli.thegreenplace.net/2013/07/09/library-order-in-static-linking
-    native.genrule(
-        name = name + "_a",
-        srcs = [
-            name + "_deps_a",
-            name + "_dlink_a",
-        ],
-        outs = [name + ".a"],
-        # See https://stackoverflow.com/a/23621751
-        cmd = """
-addlibs=$$(echo $(SRCS) | sed "s/[^ ]* */\\naddlib &/g")
-printf "create $@$${addlibs}\\nsave\\nend" | $(AR) -M
-""",
+    archive = name + "_a"
+    _merge_archive(
+        name = archive,
+        srcs = [lib, dlink],
     )
 
+    # Create cc target from archive.
     native.cc_library(
         name = name,
-        srcs = [name + "_a"],
-        deps = [
-            "@local_config_cuda//cuda:cudart_static",
-        ],
-        linkstatic = True,
+        srcs = [archive],
+        hdrs = hdrs,
+        linkstatic = linkstatic,
     )
diff --git a/third_party/nccl/nccl_configure.bzl b/third_party/nccl/nccl_configure.bzl
index 1e6422b49ef4d7ce97b3b38f3b3964281a158b7c..07e4ad7beac7928e6326b04c9d3e2545a07f6fdd 100644
--- a/third_party/nccl/nccl_configure.bzl
+++ b/third_party/nccl/nccl_configure.bzl
@@ -13,7 +13,9 @@ load(
     "auto_configure_fail",
     "compute_capabilities",
     "cuda_toolkit_path",
+    "enable_cuda",
     "find_cuda_define",
+    "get_cpu_value",
     "matches_version",
 )
 
@@ -22,7 +24,7 @@ _NCCL_HDR_PATH = "NCCL_HDR_PATH"
 _NCCL_INSTALL_PATH = "NCCL_INSTALL_PATH"
 _TF_CUDA_COMPUTE_CAPABILITIES = "TF_CUDA_COMPUTE_CAPABILITIES"
 _TF_NCCL_VERSION = "TF_NCCL_VERSION"
-_TF_NCCL_CONFIG_REPO = "TF_NCCL_CONFIG_REPO"
+_TF_NEED_CUDA = "TF_NEED_CUDA"
 
 _DEFINE_NCCL_MAJOR = "#define NCCL_MAJOR"
 _DEFINE_NCCL_MINOR = "#define NCCL_MINOR"
@@ -41,13 +43,6 @@ cc_library(
 """
 
 _NCCL_ARCHIVE_BUILD_CONTENT = """
-exports_files([
-    "cuda/bin/crt/link.stub",
-    "cuda/bin/fatbinary",
-    "cuda/bin/bin2c",
-    "nvlink",
-])
-
 filegroup(
   name = "LICENSE",
   data = ["@nccl_archive//:LICENSE.txt"],
@@ -116,26 +111,24 @@ def _check_nccl_version(repository_ctx, nccl_install_path, nccl_hdr_path, nccl_v
     header_version = "%s.%s.%s" % (major_version, minor_version, patch_version)
     if not matches_version(nccl_version, header_version):
         auto_configure_fail(
-            ("NCCL library version detected from %s/nccl.h (%s) does not match " +
-             "TF_NCCL_VERSION (%s). To fix this rerun configure again.") %
+            ("NCCL library version detected from %s/nccl.h (%s) does not " +
+             "match TF_NCCL_VERSION (%s). To fix this rerun configure again.") %
             (header_dir, header_version, nccl_version),
         )
 
 def _nccl_configure_impl(repository_ctx):
     """Implementation of the nccl_configure repository rule."""
-    if _TF_NCCL_VERSION not in repository_ctx.os.environ:
+    if (not enable_cuda(repository_ctx) or
+        get_cpu_value(repository_ctx) not in ("Linux", "FreeBSD")):
         # Add a dummy build file to make bazel query happy.
         repository_ctx.file("BUILD", _NCCL_DUMMY_BUILD_CONTENT)
         return
 
-    if _TF_NCCL_CONFIG_REPO in repository_ctx.os.environ:
-        # Forward to the pre-configured remote repository.
-        repository_ctx.template("BUILD", _label("remote.BUILD.tpl"), {
-            "%{target}": repository_ctx.os.environ[_TF_NCCL_CONFIG_REPO],
-        })
-        return
+    nccl_version = ""
+    if _TF_NCCL_VERSION in repository_ctx.os.environ:
+        nccl_version = repository_ctx.os.environ[_TF_NCCL_VERSION].strip()
+        nccl_version = nccl_version.split(".")[0]
 
-    nccl_version = repository_ctx.os.environ[_TF_NCCL_VERSION].strip()
     if nccl_version == "":
         # Alias to open source build from @nccl_archive.
         repository_ctx.file("BUILD", _NCCL_ARCHIVE_BUILD_CONTENT)
@@ -151,15 +144,6 @@ def _nccl_configure_impl(repository_ctx):
         repository_ctx.template("build_defs.bzl", _label("build_defs.bzl.tpl"), {
             "%{gpu_architectures}": str(gpu_architectures),
         })
-
-        repository_ctx.symlink(cuda_toolkit_path(repository_ctx), "cuda")
-
-        # Temporary work-around for setups which symlink ptxas to a newer
-        # version. The versions of nvlink and ptxas need to agree, so we find
-        # nvlink next to the real location of ptxas. This is only temporary and
-        # will be removed again soon.
-        nvlink_dir = repository_ctx.path("cuda/bin/ptxas").realpath.dirname
-        repository_ctx.symlink(nvlink_dir.get_child("nvlink"), "nvlink")
     else:
         # Create target for locally installed NCCL.
         nccl_install_path = repository_ctx.os.environ[_NCCL_INSTALL_PATH].strip()
@@ -179,7 +163,7 @@ nccl_configure = repository_rule(
         _NCCL_INSTALL_PATH,
         _TF_NCCL_VERSION,
         _TF_CUDA_COMPUTE_CAPABILITIES,
-        _TF_NCCL_CONFIG_REPO,
+        _TF_NEED_CUDA,
     ],
 )
 """Detects and configures the NCCL configuration.
diff --git a/third_party/nccl/remote.BUILD.tpl b/third_party/nccl/remote.BUILD.tpl
deleted file mode 100644
index d66fc5563d16edc81c9d883984e438f82e6820ae..0000000000000000000000000000000000000000
--- a/third_party/nccl/remote.BUILD.tpl
+++ /dev/null
@@ -1,6 +0,0 @@
-licenses(["restricted"])
-
-package(default_visibility = ["//visibility:public"])
-
-alias(name="LICENSE", actual = "%{target}:LICENSE")
-alias(name = "nccl", actual = "%{target}:nccl")
diff --git a/third_party/nccl/system.BUILD.tpl b/third_party/nccl/system.BUILD.tpl
index a07f54955fc5aa51aba4b6c079de9489b8e4cba1..970dddb117846004c22b9d0233c344d99c6aa4f0 100644
--- a/third_party/nccl/system.BUILD.tpl
+++ b/third_party/nccl/system.BUILD.tpl
@@ -1,26 +1,25 @@
 filegroup(
-  name = "LICENSE",
-  visibility = ["//visibility:public"],
+    name = "LICENSE",
+    visibility = ["//visibility:public"],
 )
 
 cc_library(
-  name = "nccl",
-  srcs = ["libnccl.so.%{version}"],
-  hdrs = ["nccl.h"],
-  include_prefix = "third_party/nccl",
-  deps = [
-      "@local_config_cuda//cuda:cuda_headers",
-  ],
-  visibility = ["//visibility:public"],
+    name = "nccl",
+    srcs = ["libnccl.so.%{version}"],
+    hdrs = ["nccl.h"],
+    include_prefix = "third_party/nccl",
+    visibility = ["//visibility:public"],
+    deps = [
+        "@local_config_cuda//cuda:cuda_headers",
+    ],
 )
 
 genrule(
-  name = "nccl-files",
-  outs = [
-    "libnccl.so.%{version}",
-    "nccl.h",
-  ],
-  cmd = """cp "%{hdr_path}/nccl.h" "$(@D)/nccl.h" &&
+    name = "nccl-files",
+    outs = [
+        "libnccl.so.%{version}",
+        "nccl.h",
+    ],
+    cmd = """cp "%{hdr_path}/nccl.h" "$(@D)/nccl.h" &&
            cp "%{install_path}/libnccl.so.%{version}" "$(@D)/libnccl.so.%{version}" """,
 )
-
diff --git a/third_party/ngraph/ngraph.BUILD b/third_party/ngraph/ngraph.BUILD
index 63e9548c53262461cfc9c3fd160f4f17430319c7..a7da325766cecc049065f9fe91d41d27f26ba1be 100644
--- a/third_party/ngraph/ngraph.BUILD
+++ b/third_party/ngraph/ngraph.BUILD
@@ -56,14 +56,16 @@ cc_library(
         "src/ngraph/runtime/cpu/cpu_backend.cpp",
         "src/ngraph/runtime/cpu/cpu_builder.cpp",
         "src/ngraph/runtime/cpu/cpu_call_frame.cpp",
+        "src/ngraph/runtime/cpu/cpu_cse.cpp",
+        "src/ngraph/runtime/cpu/cpu_executor.cpp",
         "src/ngraph/runtime/cpu/cpu_external_function.cpp",
         "src/ngraph/runtime/cpu/cpu_kernels.cpp",
         "src/ngraph/runtime/cpu/cpu_layout_descriptor.cpp",
+        "src/ngraph/runtime/cpu/cpu_op_annotations.cpp",
         "src/ngraph/runtime/cpu/cpu_tensor_view.cpp",
         "src/ngraph/runtime/cpu/cpu_tensor_view_wrapper.cpp",
         "src/ngraph/runtime/cpu/cpu_tracing.cpp",
         "src/ngraph/runtime/cpu/cpu_visualize_tree.cpp",
-        "src/ngraph/runtime/cpu/kernel/eigen_thread_pool.cpp",
         "src/ngraph/runtime/cpu/kernel/pad.cpp",
         "src/ngraph/runtime/cpu/kernel/reduce_max.cpp",
         "src/ngraph/runtime/cpu/kernel/reduce_sum.cpp",
@@ -79,20 +81,26 @@ cc_library(
         "src/ngraph/runtime/cpu/op/conv_relu.cpp",
         "src/ngraph/runtime/cpu/op/convert_layout.cpp",
         "src/ngraph/runtime/cpu/op/group_conv.cpp",
+        "src/ngraph/runtime/cpu/op/group_conv_bias.cpp",
+        "src/ngraph/runtime/cpu/op/halide_op.cpp",
+        "src/ngraph/runtime/cpu/op/leaky_relu.cpp",
         "src/ngraph/runtime/cpu/op/loop_kernel.cpp",
         "src/ngraph/runtime/cpu/op/lstm.cpp",
         "src/ngraph/runtime/cpu/op/matmul_bias.cpp",
         "src/ngraph/runtime/cpu/op/max_pool_with_indices.cpp",
         "src/ngraph/runtime/cpu/op/rnn.cpp",
         "src/ngraph/runtime/cpu/op/sigmoid_mul.cpp",
+        "src/ngraph/runtime/cpu/op/update_slice.cpp",
         "src/ngraph/runtime/cpu/pass/cpu_assignment.cpp",
         "src/ngraph/runtime/cpu/pass/cpu_collapse_dims.cpp",
-        "src/ngraph/runtime/cpu/pass/cpu_concat_inputs.cpp",
         "src/ngraph/runtime/cpu/pass/cpu_fusion.cpp",
+        "src/ngraph/runtime/cpu/pass/cpu_horizontal_fusion.cpp",
         "src/ngraph/runtime/cpu/pass/cpu_layout.cpp",
         "src/ngraph/runtime/cpu/pass/cpu_loop_kernel_fusion.cpp",
         "src/ngraph/runtime/cpu/pass/cpu_mat_fusion.cpp",
+        "src/ngraph/runtime/cpu/pass/cpu_memory_optimization.cpp",
         "src/ngraph/runtime/cpu/pass/cpu_post_layout_optimizations.cpp",
+        "src/ngraph/runtime/cpu/pass/cpu_reshape_sinking.cpp",
         "src/ngraph/runtime/cpu/pass/cpu_rnn_fusion.cpp",
         "src/ngraph/runtime/cpu/pass/cpu_workspace_insertion.cpp",
     ],
@@ -101,7 +109,7 @@ cc_library(
         "-I external/ngraph/src",
         "-I external/nlohmann_json_lib/include/",
         '-D SHARED_LIB_EXT=\\".so\\"',
-        '-D NGRAPH_VERSION=\\"0.9.1\\"',
+        '-D NGRAPH_VERSION=\\"0.11.0\\"',
         "-D NGRAPH_DEX_ONLY",
         '-D PROJECT_ROOT_DIR=\\"\\"',
     ],
@@ -124,11 +132,13 @@ cc_library(
         "src/ngraph/builder/*.cpp",
         "src/ngraph/descriptor/*.cpp",
         "src/ngraph/descriptor/layout/*.cpp",
+        "src/ngraph/op/experimental/generate_mask.cpp",
         "src/ngraph/op/experimental/quantized_avg_pool.cpp",
         "src/ngraph/op/experimental/quantized_conv_bias.cpp",
         "src/ngraph/op/experimental/quantized_conv_relu.cpp",
         "src/ngraph/op/experimental/quantized_conv.cpp",
         "src/ngraph/op/experimental/quantized_max_pool.cpp",
+        "src/ngraph/op/experimental/shape_of.cpp",
         "src/ngraph/op/*.cpp",
         "src/ngraph/op/util/*.cpp",
         "src/ngraph/pattern/*.cpp",
@@ -142,7 +152,7 @@ cc_library(
         "-I external/ngraph/src",
         "-I external/nlohmann_json_lib/include/",
         '-D SHARED_LIB_EXT=\\".so\\"',
-        '-D NGRAPH_VERSION=\\"0.9.1\\"',
+        '-D NGRAPH_VERSION=\\"0.11.0\\"',
         '-D PROJECT_ROOT_DIR=\\"\\"',
     ],
     visibility = ["//visibility:public"],
diff --git a/third_party/ngraph/ngraph_tf.BUILD b/third_party/ngraph/ngraph_tf.BUILD
index db9a66f9b5bcdaa29ec55175f1a8c76ac5f6f22a..6397e19e36aca5ea264a44ce5e92a1ca24ba46fc 100644
--- a/third_party/ngraph/ngraph_tf.BUILD
+++ b/third_party/ngraph/ngraph_tf.BUILD
@@ -18,6 +18,8 @@ cc_library(
         "src/ngraph_api.h",
         "src/ngraph_assign_clusters.cc",
         "src/ngraph_assign_clusters.h",
+        "src/ngraph_backend_manager.cc",
+        "src/ngraph_backend_manager.h",
         "src/ngraph_builder.cc",
         "src/ngraph_builder.h",
         "src/ngraph_capture_variables.cc",
diff --git a/third_party/pasta/BUILD b/third_party/pasta/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..9bd256a57939c402a1f2240f2ddc53f97794c56b
--- /dev/null
+++ b/third_party/pasta/BUILD
@@ -0,0 +1 @@
+# Empty BUILD file to force build system to see this directory at all.
diff --git a/third_party/pasta/BUILD.bazel b/third_party/pasta/BUILD.bazel
new file mode 100644
index 0000000000000000000000000000000000000000..ade681b606953b1df3e0140f83d714a39384c221
--- /dev/null
+++ b/third_party/pasta/BUILD.bazel
@@ -0,0 +1,30 @@
+# Description:
+#   AST-based python refactoring.
+load("@//third_party/pasta:build_defs.bzl", "copy_srcs")
+
+licenses(["notice"])  # Apache2
+
+exports_files(["LICENSE"])
+
+py_library(
+    name = "pasta",
+    srcs = copy_srcs([
+        "__init__.py",
+        "augment/__init__.py",
+        "augment/errors.py",
+        "augment/import_utils.py",
+        "augment/inline.py",
+        "augment/rename.py",
+        "base/__init__.py",
+        "base/annotate.py",
+        "base/ast_constants.py",
+        "base/ast_utils.py",
+        "base/codegen.py",
+        "base/formatting.py",
+        "base/scope.py",
+        "base/test_utils.py",
+        "base/token_generator.py",
+    ]),
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+)
diff --git a/third_party/pasta/BUILD.system b/third_party/pasta/BUILD.system
new file mode 100644
index 0000000000000000000000000000000000000000..6adc953c5abdc4bc5495fdf1bceef242a7bac61a
--- /dev/null
+++ b/third_party/pasta/BUILD.system
@@ -0,0 +1,13 @@
+# Description: Pasta, AST based python refactoring.
+
+licenses(["notice"])  # Apache2
+
+filegroup(
+    name = "LICENSE",
+    visibility = ["//visibility:public"],
+)
+
+py_library(
+    name = "pasta",
+    visibility = ["//visibility:public"],
+)
diff --git a/third_party/pasta/build_defs.bzl b/third_party/pasta/build_defs.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..0a5316de402b8cb6d59ba271400bf4d9bee9f033
--- /dev/null
+++ b/third_party/pasta/build_defs.bzl
@@ -0,0 +1,12 @@
+"""Skylark makros for building pasta."""
+
+def copy_srcs(srcs):
+    """Copies srcs from 'pasta' to parent directory."""
+    for src in srcs:
+        native.genrule(
+            name = src.replace(".", "_"),
+            srcs = ["pasta/" + src],
+            outs = [src],
+            cmd = "mkdir -p $$(dirname $@); cp $< $@",
+        )
+    return srcs
diff --git a/third_party/pasta/workspace.bzl b/third_party/pasta/workspace.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..9961835328ef4ee51e984976fa6a01cab8e5fe87
--- /dev/null
+++ b/third_party/pasta/workspace.bzl
@@ -0,0 +1,16 @@
+"""Loads pasta python package."""
+
+load("//third_party:repo.bzl", "third_party_http_archive")
+
+def repo():
+    third_party_http_archive(
+        name = "pasta",
+        urls = [
+            "https://mirror.bazel.build/github.com/google/pasta/archive/v0.1.2.tar.gz",
+            "https://github.com/google/pasta/archive/v0.1.2.tar.gz",
+        ],
+        strip_prefix = "pasta-0.1.2",
+        sha256 = "53e4c009a5eac38e942deb48bfc2d3cfca62cd457255fa86ffedb7e40f726a0c",
+        build_file = "//third_party/pasta:BUILD.bazel",
+        system_build_file = "//third_party/pasta:BUILD.system",
+    )
diff --git a/third_party/py/python_configure.bzl b/third_party/py/python_configure.bzl
index 53264630a1618e448182c946707fb5336930cf6d..d1b1e5949211cc7556c7020ca3b2a114358b276b 100644
--- a/third_party/py/python_configure.bzl
+++ b/third_party/py/python_configure.bzl
@@ -11,302 +11,337 @@ _PYTHON_BIN_PATH = "PYTHON_BIN_PATH"
 _PYTHON_LIB_PATH = "PYTHON_LIB_PATH"
 _TF_PYTHON_CONFIG_REPO = "TF_PYTHON_CONFIG_REPO"
 
-
-def _tpl(repository_ctx, tpl, substitutions={}, out=None):
-  if not out:
-    out = tpl
-  repository_ctx.template(
-      out,
-      Label("//third_party/py:%s.tpl" % tpl),
-      substitutions)
-
+def _tpl(repository_ctx, tpl, substitutions = {}, out = None):
+    if not out:
+        out = tpl
+    repository_ctx.template(
+        out,
+        Label("//third_party/py:%s.tpl" % tpl),
+        substitutions,
+    )
 
 def _fail(msg):
-  """Output failure message when auto configuration fails."""
-  red = "\033[0;31m"
-  no_color = "\033[0m"
-  fail("%sPython Configuration Error:%s %s\n" % (red, no_color, msg))
-
+    """Output failure message when auto configuration fails."""
+    red = "\033[0;31m"
+    no_color = "\033[0m"
+    fail("%sPython Configuration Error:%s %s\n" % (red, no_color, msg))
 
 def _is_windows(repository_ctx):
-  """Returns true if the host operating system is windows."""
-  os_name = repository_ctx.os.name.lower()
-  if os_name.find("windows") != -1:
-    return True
-  return False
-
-
-def _execute(repository_ctx, cmdline, error_msg=None, error_details=None,
-             empty_stdout_fine=False):
-  """Executes an arbitrary shell command.
-
-  Args:
-    repository_ctx: the repository_ctx object
-    cmdline: list of strings, the command to execute
-    error_msg: string, a summary of the error if the command fails
-    error_details: string, details about the error or steps to fix it
-    empty_stdout_fine: bool, if True, an empty stdout result is fine, otherwise
-      it's an error
-  Return:
-    the result of repository_ctx.execute(cmdline)
-  """
-  result = repository_ctx.execute(cmdline)
-  if result.stderr or not (empty_stdout_fine or result.stdout):
-    _fail("\n".join([
-        error_msg.strip() if error_msg else "Repository command failed",
-        result.stderr.strip(),
-        error_details if error_details else ""]))
-  return result
-
+    """Returns true if the host operating system is windows."""
+    os_name = repository_ctx.os.name.lower()
+    if os_name.find("windows") != -1:
+        return True
+    return False
+
+def _execute(
+        repository_ctx,
+        cmdline,
+        error_msg = None,
+        error_details = None,
+        empty_stdout_fine = False):
+    """Executes an arbitrary shell command.
+
+    Args:
+      repository_ctx: the repository_ctx object
+      cmdline: list of strings, the command to execute
+      error_msg: string, a summary of the error if the command fails
+      error_details: string, details about the error or steps to fix it
+      empty_stdout_fine: bool, if True, an empty stdout result is fine, otherwise
+        it's an error
+    Return:
+      the result of repository_ctx.execute(cmdline)
+    """
+    result = repository_ctx.execute(cmdline)
+    if result.stderr or not (empty_stdout_fine or result.stdout):
+        _fail("\n".join([
+            error_msg.strip() if error_msg else "Repository command failed",
+            result.stderr.strip(),
+            error_details if error_details else "",
+        ]))
+    return result
 
 def _read_dir(repository_ctx, src_dir):
-  """Returns a string with all files in a directory.
-
-  Finds all files inside a directory, traversing subfolders and following
-  symlinks. The returned string contains the full path of all files
-  separated by line breaks.
-  """
-  if _is_windows(repository_ctx):
-    src_dir = src_dir.replace("/", "\\")
-    find_result = _execute(
-        repository_ctx, ["cmd.exe", "/c", "dir", src_dir, "/b", "/s", "/a-d"],
-        empty_stdout_fine=True)
-    # src_files will be used in genrule.outs where the paths must
-    # use forward slashes.
-    result = find_result.stdout.replace("\\", "/")
-  else:
-    find_result = _execute(
-        repository_ctx, ["find", src_dir, "-follow", "-type", "f"],
-        empty_stdout_fine=True)
-    result = find_result.stdout
-  return result
-
+    """Returns a string with all files in a directory.
+
+    Finds all files inside a directory, traversing subfolders and following
+    symlinks. The returned string contains the full path of all files
+    separated by line breaks.
+    """
+    if _is_windows(repository_ctx):
+        src_dir = src_dir.replace("/", "\\")
+        find_result = _execute(
+            repository_ctx,
+            ["cmd.exe", "/c", "dir", src_dir, "/b", "/s", "/a-d"],
+            empty_stdout_fine = True,
+        )
+
+        # src_files will be used in genrule.outs where the paths must
+        # use forward slashes.
+        result = find_result.stdout.replace("\\", "/")
+    else:
+        find_result = _execute(
+            repository_ctx,
+            ["find", src_dir, "-follow", "-type", "f"],
+            empty_stdout_fine = True,
+        )
+        result = find_result.stdout
+    return result
 
 def _genrule(src_dir, genrule_name, command, outs):
-  """Returns a string with a genrule.
-
-  Genrule executes the given command and produces the given outputs.
-  """
-  return (
-      'genrule(\n' +
-      '    name = "' +
-      genrule_name + '",\n' +
-      '    outs = [\n' +
-      outs +
-      '\n    ],\n' +
-      '    cmd = """\n' +
-      command +
-      '\n   """,\n' +
-      ')\n'
-  )
-
+    """Returns a string with a genrule.
+
+    Genrule executes the given command and produces the given outputs.
+    """
+    return (
+        "genrule(\n" +
+        '    name = "' +
+        genrule_name + '",\n' +
+        "    outs = [\n" +
+        outs +
+        "\n    ],\n" +
+        '    cmd = """\n' +
+        command +
+        '\n   """,\n' +
+        ")\n"
+    )
 
 def _norm_path(path):
-  """Returns a path with '/' and remove the trailing slash."""
-  path = path.replace("\\", "/")
-  if path[-1] == "/":
-    path = path[:-1]
-  return path
-
-
-def _symlink_genrule_for_dir(repository_ctx, src_dir, dest_dir, genrule_name,
-    src_files = [], dest_files = []):
-  """Returns a genrule to symlink(or copy if on Windows) a set of files.
-
-  If src_dir is passed, files will be read from the given directory; otherwise
-  we assume files are in src_files and dest_files
-  """
-  if src_dir != None:
-    src_dir = _norm_path(src_dir)
-    dest_dir = _norm_path(dest_dir)
-    files = '\n'.join(sorted(_read_dir(repository_ctx, src_dir).splitlines()))
-    # Create a list with the src_dir stripped to use for outputs.
-    dest_files = files.replace(src_dir, '').splitlines()
-    src_files = files.splitlines()
-  command = []
-  outs = []
-  for i in range(len(dest_files)):
-    if dest_files[i] != "":
-      # If we have only one file to link we do not want to use the dest_dir, as
-      # $(@D) will include the full path to the file.
-      dest = '$(@D)/' + dest_dir + dest_files[i] if len(dest_files) != 1 else '$(@D)/' + dest_files[i]
-      # Copy the headers to create a sandboxable setup.
-      cmd = 'cp -f'
-      command.append(cmd + ' "%s" "%s"' % (src_files[i] , dest))
-      outs.append('        "' + dest_dir + dest_files[i] + '",')
-  genrule = _genrule(src_dir, genrule_name, " && ".join(command),
-                     "\n".join(outs))
-  return genrule
-
+    """Returns a path with '/' and remove the trailing slash."""
+    path = path.replace("\\", "/")
+    if path[-1] == "/":
+        path = path[:-1]
+    return path
+
+def _symlink_genrule_for_dir(
+        repository_ctx,
+        src_dir,
+        dest_dir,
+        genrule_name,
+        src_files = [],
+        dest_files = []):
+    """Returns a genrule to symlink(or copy if on Windows) a set of files.
+
+    If src_dir is passed, files will be read from the given directory; otherwise
+    we assume files are in src_files and dest_files
+    """
+    if src_dir != None:
+        src_dir = _norm_path(src_dir)
+        dest_dir = _norm_path(dest_dir)
+        files = "\n".join(sorted(_read_dir(repository_ctx, src_dir).splitlines()))
+
+        # Create a list with the src_dir stripped to use for outputs.
+        dest_files = files.replace(src_dir, "").splitlines()
+        src_files = files.splitlines()
+    command = []
+    outs = []
+    for i in range(len(dest_files)):
+        if dest_files[i] != "":
+            # If we have only one file to link we do not want to use the dest_dir, as
+            # $(@D) will include the full path to the file.
+            dest = "$(@D)/" + dest_dir + dest_files[i] if len(dest_files) != 1 else "$(@D)/" + dest_files[i]
+
+            # Copy the headers to create a sandboxable setup.
+            cmd = "cp -f"
+            command.append(cmd + ' "%s" "%s"' % (src_files[i], dest))
+            outs.append('        "' + dest_dir + dest_files[i] + '",')
+    genrule = _genrule(
+        src_dir,
+        genrule_name,
+        " && ".join(command),
+        "\n".join(outs),
+    )
+    return genrule
 
 def _get_python_bin(repository_ctx):
-  """Gets the python bin path."""
-  python_bin = repository_ctx.os.environ.get(_PYTHON_BIN_PATH)
-  if python_bin != None:
-    return python_bin
-  python_bin_path = repository_ctx.which("python")
-  if python_bin_path != None:
-    return str(python_bin_path)
-  _fail("Cannot find python in PATH, please make sure " +
-        "python is installed and add its directory in PATH, or --define " +
-        "%s='/something/else'.\nPATH=%s" % (
-            _PYTHON_BIN_PATH, repository_ctx.os.environ.get("PATH", "")))
-
+    """Gets the python bin path."""
+    python_bin = repository_ctx.os.environ.get(_PYTHON_BIN_PATH)
+    if python_bin != None:
+        return python_bin
+    python_bin_path = repository_ctx.which("python")
+    if python_bin_path != None:
+        return str(python_bin_path)
+    _fail("Cannot find python in PATH, please make sure " +
+          "python is installed and add its directory in PATH, or --define " +
+          "%s='/something/else'.\nPATH=%s" % (
+              _PYTHON_BIN_PATH,
+              repository_ctx.os.environ.get("PATH", ""),
+          ))
 
 def _get_bash_bin(repository_ctx):
-  """Gets the bash bin path."""
-  bash_bin = repository_ctx.os.environ.get(_BAZEL_SH)
-  if bash_bin != None:
-    return bash_bin
-  else:
-    bash_bin_path = repository_ctx.which("bash")
-    if bash_bin_path != None:
-      return str(bash_bin_path)
+    """Gets the bash bin path."""
+    bash_bin = repository_ctx.os.environ.get(_BAZEL_SH)
+    if bash_bin != None:
+        return bash_bin
     else:
-      _fail("Cannot find bash in PATH, please make sure " +
-            "bash is installed and add its directory in PATH, or --define " +
-            "%s='/path/to/bash'.\nPATH=%s" % (
-                _BAZEL_SH, repository_ctx.os.environ.get("PATH", "")))
-
+        bash_bin_path = repository_ctx.which("bash")
+        if bash_bin_path != None:
+            return str(bash_bin_path)
+        else:
+            _fail("Cannot find bash in PATH, please make sure " +
+                  "bash is installed and add its directory in PATH, or --define " +
+                  "%s='/path/to/bash'.\nPATH=%s" % (
+                      _BAZEL_SH,
+                      repository_ctx.os.environ.get("PATH", ""),
+                  ))
 
 def _get_python_lib(repository_ctx, python_bin):
-  """Gets the python lib path."""
-  python_lib = repository_ctx.os.environ.get(_PYTHON_LIB_PATH)
-  if python_lib != None:
-    return python_lib
-  print_lib = ("<<END\n" +
-      "from __future__ import print_function\n" +
-      "import site\n" +
-      "import os\n" +
-      "\n" +
-      "try:\n" +
-      "  input = raw_input\n" +
-      "except NameError:\n" +
-      "  pass\n" +
-      "\n" +
-      "python_paths = []\n" +
-      "if os.getenv('PYTHONPATH') is not None:\n" +
-      "  python_paths = os.getenv('PYTHONPATH').split(':')\n" +
-      "try:\n" +
-      "  library_paths = site.getsitepackages()\n" +
-      "except AttributeError:\n" +
-      " from distutils.sysconfig import get_python_lib\n" +
-      " library_paths = [get_python_lib()]\n" +
-      "all_paths = set(python_paths + library_paths)\n" +
-      "paths = []\n" +
-      "for path in all_paths:\n" +
-      "  if os.path.isdir(path):\n" +
-      "    paths.append(path)\n" +
-      "if len(paths) >=1:\n" +
-      "  print(paths[0])\n" +
-      "END")
-  cmd = '%s - %s' % (python_bin, print_lib)
-  result = repository_ctx.execute([_get_bash_bin(repository_ctx), "-c", cmd])
-  return result.stdout.strip('\n')
-
+    """Gets the python lib path."""
+    python_lib = repository_ctx.os.environ.get(_PYTHON_LIB_PATH)
+    if python_lib != None:
+        return python_lib
+    print_lib = ("<<END\n" +
+                 "from __future__ import print_function\n" +
+                 "import site\n" +
+                 "import os\n" +
+                 "\n" +
+                 "try:\n" +
+                 "  input = raw_input\n" +
+                 "except NameError:\n" +
+                 "  pass\n" +
+                 "\n" +
+                 "python_paths = []\n" +
+                 "if os.getenv('PYTHONPATH') is not None:\n" +
+                 "  python_paths = os.getenv('PYTHONPATH').split(':')\n" +
+                 "try:\n" +
+                 "  library_paths = site.getsitepackages()\n" +
+                 "except AttributeError:\n" +
+                 " from distutils.sysconfig import get_python_lib\n" +
+                 " library_paths = [get_python_lib()]\n" +
+                 "all_paths = set(python_paths + library_paths)\n" +
+                 "paths = []\n" +
+                 "for path in all_paths:\n" +
+                 "  if os.path.isdir(path):\n" +
+                 "    paths.append(path)\n" +
+                 "if len(paths) >=1:\n" +
+                 "  print(paths[0])\n" +
+                 "END")
+    cmd = "%s - %s" % (python_bin, print_lib)
+    result = repository_ctx.execute([_get_bash_bin(repository_ctx), "-c", cmd])
+    return result.stdout.strip("\n")
 
 def _check_python_lib(repository_ctx, python_lib):
-  """Checks the python lib path."""
-  cmd = 'test -d "%s" -a -x "%s"' % (python_lib, python_lib)
-  result = repository_ctx.execute([_get_bash_bin(repository_ctx), "-c", cmd])
-  if result.return_code == 1:
-    _fail("Invalid python library path: %s" % python_lib)
-
+    """Checks the python lib path."""
+    cmd = 'test -d "%s" -a -x "%s"' % (python_lib, python_lib)
+    result = repository_ctx.execute([_get_bash_bin(repository_ctx), "-c", cmd])
+    if result.return_code == 1:
+        _fail("Invalid python library path: %s" % python_lib)
 
 def _check_python_bin(repository_ctx, python_bin):
-  """Checks the python bin path."""
-  cmd =  '[[ -x "%s" ]] && [[ ! -d "%s" ]]' % (python_bin, python_bin)
-  result = repository_ctx.execute([_get_bash_bin(repository_ctx), "-c", cmd])
-  if result.return_code == 1:
-    _fail("--define %s='%s' is not executable. Is it the python binary?" % (
-        _PYTHON_BIN_PATH, python_bin))
-
+    """Checks the python bin path."""
+    cmd = '[[ -x "%s" ]] && [[ ! -d "%s" ]]' % (python_bin, python_bin)
+    result = repository_ctx.execute([_get_bash_bin(repository_ctx), "-c", cmd])
+    if result.return_code == 1:
+        _fail("--define %s='%s' is not executable. Is it the python binary?" % (
+            _PYTHON_BIN_PATH,
+            python_bin,
+        ))
 
 def _get_python_include(repository_ctx, python_bin):
-  """Gets the python include path."""
-  result = _execute(
-      repository_ctx,
-      [python_bin, "-c",
-       'from __future__ import print_function;' +
-       'from distutils import sysconfig;' +
-       'print(sysconfig.get_python_inc())'],
-      error_msg="Problem getting python include path.",
-      error_details=("Is the Python binary path set up right? " +
-                     "(See ./configure or " + _PYTHON_BIN_PATH + ".) " +
-                     "Is distutils installed?"))
-  return result.stdout.splitlines()[0]
-
+    """Gets the python include path."""
+    result = _execute(
+        repository_ctx,
+        [
+            python_bin,
+            "-c",
+            "from __future__ import print_function;" +
+            "from distutils import sysconfig;" +
+            "print(sysconfig.get_python_inc())",
+        ],
+        error_msg = "Problem getting python include path.",
+        error_details = ("Is the Python binary path set up right? " +
+                         "(See ./configure or " + _PYTHON_BIN_PATH + ".) " +
+                         "Is distutils installed?"),
+    )
+    return result.stdout.splitlines()[0]
 
 def _get_python_import_lib_name(repository_ctx, python_bin):
-  """Get Python import library name (pythonXY.lib) on Windows."""
-  result = _execute(
-      repository_ctx,
-      [python_bin, "-c",
-       'import sys;' +
-       'print("python" + str(sys.version_info[0]) + ' +
-       '      str(sys.version_info[1]) + ".lib")'],
-      error_msg="Problem getting python import library.",
-      error_details=("Is the Python binary path set up right? " +
-                     "(See ./configure or " + _PYTHON_BIN_PATH + ".) "))
-  return result.stdout.splitlines()[0]
-
+    """Get Python import library name (pythonXY.lib) on Windows."""
+    result = _execute(
+        repository_ctx,
+        [
+            python_bin,
+            "-c",
+            "import sys;" +
+            'print("python" + str(sys.version_info[0]) + ' +
+            '      str(sys.version_info[1]) + ".lib")',
+        ],
+        error_msg = "Problem getting python import library.",
+        error_details = ("Is the Python binary path set up right? " +
+                         "(See ./configure or " + _PYTHON_BIN_PATH + ".) "),
+    )
+    return result.stdout.splitlines()[0]
 
 def _get_numpy_include(repository_ctx, python_bin):
-  """Gets the numpy include path."""
-  return _execute(repository_ctx,
-                  [python_bin, "-c",
-                   'from __future__ import print_function;' +
-                   'import numpy;' +
-                   ' print(numpy.get_include());'],
-                  error_msg="Problem getting numpy include path.",
-                  error_details="Is numpy installed?").stdout.splitlines()[0]
-
+    """Gets the numpy include path."""
+    return _execute(
+        repository_ctx,
+        [
+            python_bin,
+            "-c",
+            "from __future__ import print_function;" +
+            "import numpy;" +
+            " print(numpy.get_include());",
+        ],
+        error_msg = "Problem getting numpy include path.",
+        error_details = "Is numpy installed?",
+    ).stdout.splitlines()[0]
 
 def _create_local_python_repository(repository_ctx):
-  """Creates the repository containing files set up to build with Python."""
-  python_bin = _get_python_bin(repository_ctx)
-  _check_python_bin(repository_ctx, python_bin)
-  python_lib = _get_python_lib(repository_ctx, python_bin)
-  _check_python_lib(repository_ctx, python_lib)
-  python_include = _get_python_include(repository_ctx, python_bin)
-  numpy_include = _get_numpy_include(repository_ctx, python_bin) + '/numpy'
-  python_include_rule = _symlink_genrule_for_dir(
-      repository_ctx, python_include, 'python_include', 'python_include')
-  python_import_lib_genrule = ""
-  # To build Python C/C++ extension on Windows, we need to link to python import library pythonXY.lib
-  # See https://docs.python.org/3/extending/windows.html
-  if _is_windows(repository_ctx):
-    python_include = _norm_path(python_include)
-    python_import_lib_name = _get_python_import_lib_name(repository_ctx, python_bin)
-    python_import_lib_src = python_include.rsplit('/', 1)[0] + "/libs/" + python_import_lib_name
-    python_import_lib_genrule = _symlink_genrule_for_dir(
-      repository_ctx, None, '', 'python_import_lib',
-      [python_import_lib_src], [python_import_lib_name])
-  numpy_include_rule = _symlink_genrule_for_dir(
-      repository_ctx, numpy_include, 'numpy_include/numpy', 'numpy_include')
-  _tpl(repository_ctx, "BUILD", {
-      "%{PYTHON_INCLUDE_GENRULE}": python_include_rule,
-      "%{PYTHON_IMPORT_LIB_GENRULE}": python_import_lib_genrule,
-      "%{NUMPY_INCLUDE_GENRULE}": numpy_include_rule,
-  })
-
+    """Creates the repository containing files set up to build with Python."""
+    python_bin = _get_python_bin(repository_ctx)
+    _check_python_bin(repository_ctx, python_bin)
+    python_lib = _get_python_lib(repository_ctx, python_bin)
+    _check_python_lib(repository_ctx, python_lib)
+    python_include = _get_python_include(repository_ctx, python_bin)
+    numpy_include = _get_numpy_include(repository_ctx, python_bin) + "/numpy"
+    python_include_rule = _symlink_genrule_for_dir(
+        repository_ctx,
+        python_include,
+        "python_include",
+        "python_include",
+    )
+    python_import_lib_genrule = ""
+
+    # To build Python C/C++ extension on Windows, we need to link to python import library pythonXY.lib
+    # See https://docs.python.org/3/extending/windows.html
+    if _is_windows(repository_ctx):
+        python_include = _norm_path(python_include)
+        python_import_lib_name = _get_python_import_lib_name(repository_ctx, python_bin)
+        python_import_lib_src = python_include.rsplit("/", 1)[0] + "/libs/" + python_import_lib_name
+        python_import_lib_genrule = _symlink_genrule_for_dir(
+            repository_ctx,
+            None,
+            "",
+            "python_import_lib",
+            [python_import_lib_src],
+            [python_import_lib_name],
+        )
+    numpy_include_rule = _symlink_genrule_for_dir(
+        repository_ctx,
+        numpy_include,
+        "numpy_include/numpy",
+        "numpy_include",
+    )
+    _tpl(repository_ctx, "BUILD", {
+        "%{PYTHON_INCLUDE_GENRULE}": python_include_rule,
+        "%{PYTHON_IMPORT_LIB_GENRULE}": python_import_lib_genrule,
+        "%{NUMPY_INCLUDE_GENRULE}": numpy_include_rule,
+    })
 
 def _create_remote_python_repository(repository_ctx, remote_config_repo):
-  """Creates pointers to a remotely configured repo set up to build with Python.
-  """
-  _tpl(repository_ctx, "remote.BUILD", {
-      "%{REMOTE_PYTHON_REPO}": remote_config_repo,
-  }, "BUILD")
-
+    """Creates pointers to a remotely configured repo set up to build with Python.
+    """
+    repository_ctx.template("BUILD", Label(remote_config_repo + ":BUILD"), {})
 
 def _python_autoconf_impl(repository_ctx):
-  """Implementation of the python_autoconf repository rule."""
-  if _TF_PYTHON_CONFIG_REPO in repository_ctx.os.environ:
-      _create_remote_python_repository(repository_ctx,
-          repository_ctx.os.environ[_TF_PYTHON_CONFIG_REPO])
-  else:
-    _create_local_python_repository(repository_ctx)
-
+    """Implementation of the python_autoconf repository rule."""
+    if _TF_PYTHON_CONFIG_REPO in repository_ctx.os.environ:
+        _create_remote_python_repository(
+            repository_ctx,
+            repository_ctx.os.environ[_TF_PYTHON_CONFIG_REPO],
+        )
+    else:
+        _create_local_python_repository(repository_ctx)
 
 python_configure = repository_rule(
     implementation = _python_autoconf_impl,
diff --git a/third_party/py/remote.BUILD.tpl b/third_party/py/remote.BUILD.tpl
deleted file mode 100644
index edcac41ec6fdd80151caa894aa50ba3f4f2aa536..0000000000000000000000000000000000000000
--- a/third_party/py/remote.BUILD.tpl
+++ /dev/null
@@ -1,13 +0,0 @@
-licenses(["restricted"])
-
-package(default_visibility = ["//visibility:public"])
-
-alias(
-    name = "python_headers",
-    actual = "%{REMOTE_PYTHON_REPO}:python_headers",
-)
-
-alias(
-    name = "numpy_headers",
-    actual = "%{REMOTE_PYTHON_REPO}:numpy_headers",
-)
diff --git a/third_party/sycl/sycl/build_defs.bzl.tpl b/third_party/sycl/sycl/build_defs.bzl.tpl
index 33386f8957c821ef579a2bc1dcfb71b94ceb0aa1..a726c8d953cb56c67281e2572edb4e64df05d064 100755
--- a/third_party/sycl/sycl/build_defs.bzl.tpl
+++ b/third_party/sycl/sycl/build_defs.bzl.tpl
@@ -11,7 +11,7 @@ def if_sycl(if_true, if_false = []):
     return select({
         "@local_config_sycl//sycl:using_sycl_ccpp": if_true,
         "@local_config_sycl//sycl:using_sycl_trisycl": if_true[0:1],
-        "//conditions:default": if_false
+        "//conditions:default": if_false,
     })
 
 def if_ccpp(if_true, if_false = []):
@@ -24,5 +24,5 @@ def if_ccpp(if_true, if_false = []):
     return select({
         "@local_config_sycl//sycl:using_sycl_ccpp": if_true,
         "@local_config_sycl//sycl:using_sycl_trisycl": if_false,
-        "//conditions:default": if_false
+        "//conditions:default": if_false,
     })
diff --git a/third_party/sycl/sycl_configure.bzl b/third_party/sycl/sycl_configure.bzl
index 5b9d0eb383d1b069c2107c2c22a59c3790cb721e..deba6c4116e763c80398c17ff3331cfa0ceb8a9b 100644
--- a/third_party/sycl/sycl_configure.bzl
+++ b/third_party/sycl/sycl_configure.bzl
@@ -11,122 +11,124 @@
 """
 
 _HOST_CXX_COMPILER = "HOST_CXX_COMPILER"
-_HOST_C_COMPILER= "HOST_C_COMPILER"
+_HOST_C_COMPILER = "HOST_C_COMPILER"
 _COMPUTECPP_TOOLKIT_PATH = "COMPUTECPP_TOOLKIT_PATH"
 _TRISYCL_INCLUDE_DIR = "TRISYCL_INCLUDE_DIR"
 _PYTHON_LIB_PATH = "PYTHON_LIB_PATH"
 
 def _enable_sycl(repository_ctx):
-  if "TF_NEED_OPENCL_SYCL" in repository_ctx.os.environ:
-    enable_sycl = repository_ctx.os.environ["TF_NEED_OPENCL_SYCL"].strip()
-    return enable_sycl == "1"
-  return False
+    if "TF_NEED_OPENCL_SYCL" in repository_ctx.os.environ:
+        enable_sycl = repository_ctx.os.environ["TF_NEED_OPENCL_SYCL"].strip()
+        return enable_sycl == "1"
+    return False
 
 def _enable_compute_cpp(repository_ctx):
-  return _COMPUTECPP_TOOLKIT_PATH in repository_ctx.os.environ
+    return _COMPUTECPP_TOOLKIT_PATH in repository_ctx.os.environ
 
 def auto_configure_fail(msg):
-  """Output failure message when auto configuration fails."""
-  red = "\033[0;31m"
-  no_color = "\033[0m"
-  fail("\n%sAuto-Configuration Error:%s %s\n" % (red, no_color, msg))
+    """Output failure message when auto configuration fails."""
+    red = "\033[0;31m"
+    no_color = "\033[0m"
+    fail("\n%sAuto-Configuration Error:%s %s\n" % (red, no_color, msg))
+
 # END cc_configure common functions (see TODO above).
 
 def find_c(repository_ctx):
-  """Find host C compiler."""
-  c_name = "gcc"
-  if _HOST_C_COMPILER in repository_ctx.os.environ:
-    c_name = repository_ctx.os.environ[_HOST_C_COMPILER].strip()
-  if c_name.startswith("/"):
-    return c_name
-  c = repository_ctx.which(c_name)
-  if c == None:
-    fail("Cannot find C compiler, please correct your path.")
-  return c
+    """Find host C compiler."""
+    c_name = "gcc"
+    if _HOST_C_COMPILER in repository_ctx.os.environ:
+        c_name = repository_ctx.os.environ[_HOST_C_COMPILER].strip()
+    if c_name.startswith("/"):
+        return c_name
+    c = repository_ctx.which(c_name)
+    if c == None:
+        fail("Cannot find C compiler, please correct your path.")
+    return c
 
 def find_cc(repository_ctx):
-  """Find host C++ compiler."""
-  cc_name = "g++"
-  if _HOST_CXX_COMPILER in repository_ctx.os.environ:
-    cc_name = repository_ctx.os.environ[_HOST_CXX_COMPILER].strip()
-  if cc_name.startswith("/"):
-    return cc_name
-  cc = repository_ctx.which(cc_name)
-  if cc == None:
-    fail("Cannot find C++ compiler, please correct your path.")
-  return cc
+    """Find host C++ compiler."""
+    cc_name = "g++"
+    if _HOST_CXX_COMPILER in repository_ctx.os.environ:
+        cc_name = repository_ctx.os.environ[_HOST_CXX_COMPILER].strip()
+    if cc_name.startswith("/"):
+        return cc_name
+    cc = repository_ctx.which(cc_name)
+    if cc == None:
+        fail("Cannot find C++ compiler, please correct your path.")
+    return cc
 
 def find_computecpp_root(repository_ctx):
-  """Find ComputeCpp compiler."""
-  sycl_name = ""
-  if _COMPUTECPP_TOOLKIT_PATH in repository_ctx.os.environ:
-    sycl_name = repository_ctx.os.environ[_COMPUTECPP_TOOLKIT_PATH].strip()
-  if sycl_name.startswith("/"):
-    return sycl_name
-  fail("Cannot find SYCL compiler, please correct your path")
+    """Find ComputeCpp compiler."""
+    sycl_name = ""
+    if _COMPUTECPP_TOOLKIT_PATH in repository_ctx.os.environ:
+        sycl_name = repository_ctx.os.environ[_COMPUTECPP_TOOLKIT_PATH].strip()
+    if sycl_name.startswith("/"):
+        return sycl_name
+    fail("Cannot find SYCL compiler, please correct your path")
 
 def find_trisycl_include_dir(repository_ctx):
-  """Find triSYCL include directory. """
-  if _TRISYCL_INCLUDE_DIR in repository_ctx.os.environ:
-    sycl_name = repository_ctx.os.environ[_TRISYCL_INCLUDE_DIR].strip()
-    if sycl_name.startswith("/"):
-      return sycl_name
-  fail( "Cannot find triSYCL include directory, please correct your path")
+    """Find triSYCL include directory. """
+    if _TRISYCL_INCLUDE_DIR in repository_ctx.os.environ:
+        sycl_name = repository_ctx.os.environ[_TRISYCL_INCLUDE_DIR].strip()
+        if sycl_name.startswith("/"):
+            return sycl_name
+    fail("Cannot find triSYCL include directory, please correct your path")
 
 def find_python_lib(repository_ctx):
-  """Returns python path."""
-  if _PYTHON_LIB_PATH in repository_ctx.os.environ:
-    return repository_ctx.os.environ[_PYTHON_LIB_PATH].strip()
-  fail("Environment variable PYTHON_LIB_PATH was not specified re-run ./configure")
-
+    """Returns python path."""
+    if _PYTHON_LIB_PATH in repository_ctx.os.environ:
+        return repository_ctx.os.environ[_PYTHON_LIB_PATH].strip()
+    fail("Environment variable PYTHON_LIB_PATH was not specified re-run ./configure")
 
 def _check_lib(repository_ctx, toolkit_path, lib):
-  """Checks if lib exists under sycl_toolkit_path or fail if it doesn't.
+    """Checks if lib exists under sycl_toolkit_path or fail if it doesn't.
 
-  Args:
-    repository_ctx: The repository context.
-    toolkit_path: The toolkit directory containing the libraries.
-    ib: The library to look for under toolkit_path.
-  """
-  lib_path = toolkit_path + "/" + lib
-  if not repository_ctx.path(lib_path).exists:
-    auto_configure_fail("Cannot find %s" % lib_path)
+    Args:
+      repository_ctx: The repository context.
+      toolkit_path: The toolkit directory containing the libraries.
+      ib: The library to look for under toolkit_path.
+    """
+    lib_path = toolkit_path + "/" + lib
+    if not repository_ctx.path(lib_path).exists:
+        auto_configure_fail("Cannot find %s" % lib_path)
 
 def _check_dir(repository_ctx, directory):
-  """Checks whether the directory exists and fail if it does not.
+    """Checks whether the directory exists and fail if it does not.
 
-  Args:
-    repository_ctx: The repository context.
-    directory: The directory to check the existence of.
-  """
-  if not repository_ctx.path(directory).exists:
-    auto_configure_fail("Cannot find dir: %s" % directory)
+    Args:
+      repository_ctx: The repository context.
+      directory: The directory to check the existence of.
+    """
+    if not repository_ctx.path(directory).exists:
+        auto_configure_fail("Cannot find dir: %s" % directory)
 
 def _symlink_dir(repository_ctx, src_dir, dest_dir):
-  """Symlinks all the files in a directory.
-
-  Args:
-    repository_ctx: The repository context.
-    src_dir: The source directory.
-    dest_dir: The destination directory to create the symlinks in.
-  """
-  files = repository_ctx.path(src_dir).readdir()
-  for src_file in files:
-    repository_ctx.symlink(src_file, dest_dir + "/" + src_file.basename)
-
-def _tpl(repository_ctx, tpl, substitutions={}, out=None):
-  if not out:
-    out = tpl.replace(":", "/")
-  repository_ctx.template(
-      out,
-      Label("//third_party/sycl/%s.tpl" % tpl),
-      substitutions)
+    """Symlinks all the files in a directory.
+
+    Args:
+      repository_ctx: The repository context.
+      src_dir: The source directory.
+      dest_dir: The destination directory to create the symlinks in.
+    """
+    files = repository_ctx.path(src_dir).readdir()
+    for src_file in files:
+        repository_ctx.symlink(src_file, dest_dir + "/" + src_file.basename)
+
+def _tpl(repository_ctx, tpl, substitutions = {}, out = None):
+    if not out:
+        out = tpl.replace(":", "/")
+    repository_ctx.template(
+        out,
+        Label("//third_party/sycl/%s.tpl" % tpl),
+        substitutions,
+    )
 
 def _file(repository_ctx, label):
-  repository_ctx.template(
-      label.replace(":", "/"),
-      Label("//third_party/sycl/%s" % label),
-      {})
+    repository_ctx.template(
+        label.replace(":", "/"),
+        Label("//third_party/sycl/%s" % label),
+        {},
+    )
 
 _DUMMY_CROSSTOOL_BZL_FILE = """
 def error_sycl_disabled():
@@ -147,7 +149,6 @@ def error_sycl_disabled():
   )
 """
 
-
 _DUMMY_CROSSTOOL_BUILD_FILE = """
 load("//crosstool:error_sycl_disabled.bzl", "error_sycl_disabled")
 
@@ -155,87 +156,97 @@ error_sycl_disabled()
 """
 
 def _create_dummy_repository(repository_ctx):
-  # Set up BUILD file for sycl/.
-  _tpl(repository_ctx, "sycl:build_defs.bzl")
-  _tpl(repository_ctx, "sycl:BUILD")
-  _file(repository_ctx, "sycl:LICENSE.text")
-  _tpl(repository_ctx, "sycl:platform.bzl")
-
-  # Create dummy files for the SYCL toolkit since they are still required by
-  # tensorflow/sycl/platform/default/build_config:sycl.
-  repository_ctx.file("sycl/include/sycl.hpp", "")
-  repository_ctx.file("sycl/lib/libComputeCpp.so", "")
-
-  # If sycl_configure is not configured to build with SYCL support, and the user
-  # attempts to build with --config=sycl, add a dummy build rule to intercept
-  # this and fail with an actionable error message.
-  repository_ctx.file("crosstool/error_sycl_disabled.bzl",
-                      _DUMMY_CROSSTOOL_BZL_FILE)
-  repository_ctx.file("crosstool/BUILD", _DUMMY_CROSSTOOL_BUILD_FILE)
-
-
-def _sycl_autoconf_imp(repository_ctx):
-  """Implementation of the sycl_autoconf rule."""
-  if not _enable_sycl(repository_ctx):
-    _create_dummy_repository(repository_ctx)
-  else:
-    # copy template files
+    # Set up BUILD file for sycl/.
     _tpl(repository_ctx, "sycl:build_defs.bzl")
     _tpl(repository_ctx, "sycl:BUILD")
-    _tpl(repository_ctx, "sycl:platform.bzl")
-    _tpl(repository_ctx, "crosstool:BUILD")
     _file(repository_ctx, "sycl:LICENSE.text")
+    _tpl(repository_ctx, "sycl:platform.bzl")
 
-    if _enable_compute_cpp(repository_ctx):
-      _tpl(repository_ctx, "crosstool:computecpp",
-      {
-        "%{host_cxx_compiler}" : find_cc(repository_ctx),
-        "%{host_c_compiler}" : find_c(repository_ctx)
-      })
-
-      computecpp_root = find_computecpp_root(repository_ctx);
-      _check_dir(repository_ctx, computecpp_root)
-
-      _tpl(repository_ctx, "crosstool:CROSSTOOL",
-      {
-        "%{sycl_include_dir}" : computecpp_root,
-        "%{sycl_impl}" : "computecpp",
-        "%{c++_std}" : "-std=c++11",
-        "%{python_lib_path}" : find_python_lib(repository_ctx),
-      })
-
-      # symlink libraries
-      _check_lib(repository_ctx, computecpp_root+"/lib", "libComputeCpp.so" )
-      _symlink_dir(repository_ctx, computecpp_root + "/lib", "sycl/lib")
-      _symlink_dir(repository_ctx, computecpp_root + "/include", "sycl/include")
-      _symlink_dir(repository_ctx, computecpp_root + "/bin", "sycl/bin")
-    else:
-
-      trisycl_include_dir = find_trisycl_include_dir(repository_ctx);
-      _check_dir(repository_ctx, trisycl_include_dir)
-
-      _tpl(repository_ctx, "crosstool:trisycl",
-      {
-        "%{host_cxx_compiler}" : find_cc(repository_ctx),
-        "%{host_c_compiler}" : find_c(repository_ctx),
-        "%{trisycl_include_dir}" : trisycl_include_dir
-      })
-
-
-      _tpl(repository_ctx, "crosstool:CROSSTOOL",
-      {
-        "%{sycl_include_dir}" : trisycl_include_dir,
-        "%{sycl_impl}" : "trisycl",
-        "%{c++_std}" : "-std=c++1y",
-        "%{python_lib_path}" : find_python_lib(repository_ctx),
-      })
+    # Create dummy files for the SYCL toolkit since they are still required by
+    # tensorflow/sycl/platform/default/build_config:sycl.
+    repository_ctx.file("sycl/include/sycl.hpp", "")
+    repository_ctx.file("sycl/lib/libComputeCpp.so", "")
 
-      _symlink_dir(repository_ctx, trisycl_include_dir, "sycl/include")
+    # If sycl_configure is not configured to build with SYCL support, and the user
+    # attempts to build with --config=sycl, add a dummy build rule to intercept
+    # this and fail with an actionable error message.
+    repository_ctx.file(
+        "crosstool/error_sycl_disabled.bzl",
+        _DUMMY_CROSSTOOL_BZL_FILE,
+    )
+    repository_ctx.file("crosstool/BUILD", _DUMMY_CROSSTOOL_BUILD_FILE)
 
+def _sycl_autoconf_imp(repository_ctx):
+    """Implementation of the sycl_autoconf rule."""
+    if not _enable_sycl(repository_ctx):
+        _create_dummy_repository(repository_ctx)
+    else:
+        # copy template files
+        _tpl(repository_ctx, "sycl:build_defs.bzl")
+        _tpl(repository_ctx, "sycl:BUILD")
+        _tpl(repository_ctx, "sycl:platform.bzl")
+        _tpl(repository_ctx, "crosstool:BUILD")
+        _file(repository_ctx, "sycl:LICENSE.text")
+
+        if _enable_compute_cpp(repository_ctx):
+            _tpl(
+                repository_ctx,
+                "crosstool:computecpp",
+                {
+                    "%{host_cxx_compiler}": find_cc(repository_ctx),
+                    "%{host_c_compiler}": find_c(repository_ctx),
+                },
+            )
+
+            computecpp_root = find_computecpp_root(repository_ctx)
+            _check_dir(repository_ctx, computecpp_root)
+
+            _tpl(
+                repository_ctx,
+                "crosstool:CROSSTOOL",
+                {
+                    "%{sycl_include_dir}": computecpp_root,
+                    "%{sycl_impl}": "computecpp",
+                    "%{c++_std}": "-std=c++11",
+                    "%{python_lib_path}": find_python_lib(repository_ctx),
+                },
+            )
+
+            # symlink libraries
+            _check_lib(repository_ctx, computecpp_root + "/lib", "libComputeCpp.so")
+            _symlink_dir(repository_ctx, computecpp_root + "/lib", "sycl/lib")
+            _symlink_dir(repository_ctx, computecpp_root + "/include", "sycl/include")
+            _symlink_dir(repository_ctx, computecpp_root + "/bin", "sycl/bin")
+        else:
+            trisycl_include_dir = find_trisycl_include_dir(repository_ctx)
+            _check_dir(repository_ctx, trisycl_include_dir)
+
+            _tpl(
+                repository_ctx,
+                "crosstool:trisycl",
+                {
+                    "%{host_cxx_compiler}": find_cc(repository_ctx),
+                    "%{host_c_compiler}": find_c(repository_ctx),
+                    "%{trisycl_include_dir}": trisycl_include_dir,
+                },
+            )
+
+            _tpl(
+                repository_ctx,
+                "crosstool:CROSSTOOL",
+                {
+                    "%{sycl_include_dir}": trisycl_include_dir,
+                    "%{sycl_impl}": "trisycl",
+                    "%{c++_std}": "-std=c++1y",
+                    "%{python_lib_path}": find_python_lib(repository_ctx),
+                },
+            )
+
+            _symlink_dir(repository_ctx, trisycl_include_dir, "sycl/include")
 
 sycl_configure = repository_rule(
-  implementation = _sycl_autoconf_imp,
-  local = True,
+    implementation = _sycl_autoconf_imp,
+    local = True,
 )
 """Detects and configures the SYCL toolchain.
 
diff --git a/third_party/systemlibs/absl_py.absl.testing.BUILD b/third_party/systemlibs/absl_py.absl.testing.BUILD
index c1b794c1e9cd43fbb4b9a2bd49000ae79d88531a..7629509ebb6aa0bb525081ab8eaae11639415ba6 100644
--- a/third_party/systemlibs/absl_py.absl.testing.BUILD
+++ b/third_party/systemlibs/absl_py.absl.testing.BUILD
@@ -2,6 +2,10 @@ licenses(["notice"])  # Apache 2.0
 
 py_library(
     name = "parameterized",
-    testonly = 1,
     visibility = ["//visibility:public"],
 )
+
+py_library(
+    name = "absltest",
+    visibility = ["//visiblity:public"],
+)
diff --git a/third_party/systemlibs/syslibs_configure.bzl b/third_party/systemlibs/syslibs_configure.bzl
index 645d242c96c02a6a90b84334af1ac2fd11e437da..1b971eca8ad342063106de904b624b3e3a9a7706 100644
--- a/third_party/systemlibs/syslibs_configure.bzl
+++ b/third_party/systemlibs/syslibs_configure.bzl
@@ -28,6 +28,7 @@ VALID_LIBS = [
     "icu",
     "jpeg",
     "jsoncpp_git",
+    "keras_applications_archive",
     "lmdb",
     "nasm",
     "nsync",
diff --git a/third_party/tensorrt/BUILD.tpl b/third_party/tensorrt/BUILD.tpl
index 57682e8735013544d76b14fc2c41dfff3d50f691..a41ab808c70cbe8f69653794afe5a7651f514252 100644
--- a/third_party/tensorrt/BUILD.tpl
+++ b/third_party/tensorrt/BUILD.tpl
@@ -12,19 +12,13 @@ package(default_visibility = ["//visibility:public"])
 cc_library(
     name = "tensorrt_headers",
     hdrs = [%{tensorrt_headers}],
-    includes = [
-        "include",
-    ],
     visibility = ["//visibility:public"],
 )
 
 cc_library(
-    name = "nv_infer",
-    srcs = [%{nv_infer}],
-    data = [%{nv_infer}],
-    includes = [
-        "include",
-    ],
+    name = "tensorrt",
+    srcs = %{tensorrt_libs},
+    data = %{tensorrt_libs},
     copts= cuda_default_copts(),
     deps = [
         "@local_config_cuda//cuda:cuda",
@@ -34,6 +28,5 @@ cc_library(
     visibility = ["//visibility:public"],
 )
 
-
-%{tensorrt_genrules}
+%{copy_rules}
 
diff --git a/third_party/tensorrt/build_defs.bzl.tpl b/third_party/tensorrt/build_defs.bzl.tpl
index 0dc3a7ba2d239cbeca5181ba20d0c98edb26bb94..6d00513827b3804c49ad1cb93e952c0338b886e9 100644
--- a/third_party/tensorrt/build_defs.bzl.tpl
+++ b/third_party/tensorrt/build_defs.bzl.tpl
@@ -2,6 +2,4 @@
 
 def if_tensorrt(if_true, if_false=[]):
   """Tests whether TensorRT was enabled during the configure process."""
-  if %{tensorrt_is_configured}:
-    return if_true
-  return if_false
+  return %{if_tensorrt}
diff --git a/third_party/tensorrt/tensorrt_configure.bzl b/third_party/tensorrt/tensorrt_configure.bzl
index 9b946505a615372aa7de317c8ee390a2cd4b60e9..1d019a1b4f531d077747203c38296b4aa0bd8d82 100644
--- a/third_party/tensorrt/tensorrt_configure.bzl
+++ b/third_party/tensorrt/tensorrt_configure.bzl
@@ -10,13 +10,17 @@
 load(
     "//third_party/gpus:cuda_configure.bzl",
     "auto_configure_fail",
-    "get_cpu_value",
     "find_cuda_define",
+    "find_lib",
+    "get_cpu_value",
+    "lib_name",
+    "make_copy_dir_rule",
+    "make_copy_files_rule",
     "matches_version",
-    "symlink_genrule_for_dir",
 )
 
 _TENSORRT_INSTALL_PATH = "TENSORRT_INSTALL_PATH"
+_TF_TENSORRT_CONFIG_REPO = "TF_TENSORRT_CONFIG_REPO"
 _TF_TENSORRT_VERSION = "TF_TENSORRT_VERSION"
 
 _TF_TENSORRT_LIBS = ["nvinfer"]
@@ -26,188 +30,200 @@ _DEFINE_TENSORRT_SONAME_MAJOR = "#define NV_TENSORRT_SONAME_MAJOR"
 _DEFINE_TENSORRT_SONAME_MINOR = "#define NV_TENSORRT_SONAME_MINOR"
 _DEFINE_TENSORRT_SONAME_PATCH = "#define NV_TENSORRT_SONAME_PATCH"
 
-
 def _headers_exist(repository_ctx, path):
-  """Returns whether all TensorRT header files could be found in 'path'.
+    """Returns whether all TensorRT header files could be found in 'path'.
 
-  Args:
-    repository_ctx: The repository context.
-    path: The TensorRT include path to check.
-
-  Returns:
-    True if all TensorRT header files can be found in the path.
-  """
-  for h in _TF_TENSORRT_HEADERS:
-    if not repository_ctx.path("%s/%s" % (path, h)).exists:
-      return False
-  return True
+    Args:
+      repository_ctx: The repository context.
+      path: The TensorRT include path to check.
 
+    Returns:
+      True if all TensorRT header files can be found in the path.
+    """
+    for h in _TF_TENSORRT_HEADERS:
+        if not repository_ctx.path("%s/%s" % (path, h)).exists:
+            return False
+    return True
 
 def _find_trt_header_dir(repository_ctx, trt_install_path):
-  """Returns the path to the directory containing headers of TensorRT.
-
-  Args:
-    repository_ctx: The repository context.
-    trt_install_path: The TensorRT library install directory.
-
-  Returns:
-    The path of the directory containing the TensorRT header.
-  """
-  if trt_install_path == "/usr/lib/x86_64-linux-gnu":
-    path = "/usr/include/x86_64-linux-gnu"
-    if _headers_exist(repository_ctx, path):
-      return path
-  if trt_install_path == "/usr/lib/aarch64-linux-gnu":
-    path = "/usr/include/aarch64-linux-gnu"
+    """Returns the path to the directory containing headers of TensorRT.
+
+    Args:
+      repository_ctx: The repository context.
+      trt_install_path: The TensorRT library install directory.
+
+    Returns:
+      The path of the directory containing the TensorRT header.
+    """
+    if trt_install_path == "/usr/lib/x86_64-linux-gnu":
+        path = "/usr/include/x86_64-linux-gnu"
+        if _headers_exist(repository_ctx, path):
+            return path
+    if trt_install_path == "/usr/lib/aarch64-linux-gnu":
+        path = "/usr/include/aarch64-linux-gnu"
+        if _headers_exist(repository_ctx, path):
+            return path
+    path = str(repository_ctx.path("%s/../include" % trt_install_path).realpath)
     if _headers_exist(repository_ctx, path):
-      return path
-  path = str(repository_ctx.path("%s/../include" % trt_install_path).realpath)
-  if _headers_exist(repository_ctx, path):
-    return path
-  auto_configure_fail(
-      "Cannot find NvInfer.h with TensorRT install path %s" % trt_install_path)
-
-
-def _trt_lib_version(repository_ctx, trt_install_path):
-  """Detects the library (e.g. libnvinfer) version of TensorRT.
-
-  Args:
-    repository_ctx: The repository context.
-    trt_install_path: The TensorRT library install directory.
-
-  Returns:
-    A string containing the library version of TensorRT.
-  """
-  trt_header_dir = _find_trt_header_dir(repository_ctx, trt_install_path)
-  major_version = find_cuda_define(repository_ctx, trt_header_dir, "NvInfer.h",
-                                   _DEFINE_TENSORRT_SONAME_MAJOR)
-  minor_version = find_cuda_define(repository_ctx, trt_header_dir, "NvInfer.h",
-                                   _DEFINE_TENSORRT_SONAME_MINOR)
-  patch_version = find_cuda_define(repository_ctx, trt_header_dir, "NvInfer.h",
-                                   _DEFINE_TENSORRT_SONAME_PATCH)
-  full_version = "%s.%s.%s" % (major_version, minor_version, patch_version)
-  environ_version = repository_ctx.os.environ[_TF_TENSORRT_VERSION].strip()
-  if not matches_version(environ_version, full_version):
+        return path
     auto_configure_fail(
-        ("TensorRT library version detected from %s/%s (%s) does not match " +
-         "TF_TENSORRT_VERSION (%s). To fix this rerun configure again.") %
-        (trt_header_dir, "NvInfer.h", full_version, environ_version))
-  return environ_version
-
-
-def _find_trt_libs(repository_ctx, trt_install_path, trt_lib_version):
-  """Finds the given TensorRT library on the system.
-
-  Adapted from code contributed by Sami Kama (https://github.com/samikama).
-
-  Args:
-    repository_ctx: The repository context.
-    trt_install_path: The TensorRT library installation directory.
-    trt_lib_version: The version of TensorRT library files as returned
-      by _trt_lib_version.
-
-  Returns:
-    Map of library names to structs with the following fields:
-      src_file_path: The full path to the library found on the system.
-      dst_file_name: The basename of the target library.
-  """
-  objdump = repository_ctx.which("objdump")
-  result = {}
-  for lib in _TF_TENSORRT_LIBS:
-    dst_file_name = "lib%s.so.%s" % (lib, trt_lib_version)
-    src_file_path = repository_ctx.path("%s/%s" % (trt_install_path,
-                                                   dst_file_name))
-    if not src_file_path.exists:
-      auto_configure_fail(
-          "Cannot find TensorRT library %s" % str(src_file_path))
-    if objdump != None:
-      objdump_out = repository_ctx.execute([objdump, "-p", str(src_file_path)])
-      for line in objdump_out.stdout.splitlines():
-        if "SONAME" in line:
-          dst_file_name = line.strip().split(" ")[-1]
-    result.update({
-        lib:
-            struct(
-                dst_file_name=dst_file_name,
-                src_file_path=str(src_file_path.realpath))
-    })
-  return result
+        "Cannot find NvInfer.h with TensorRT install path %s" % trt_install_path,
+    )
 
+def _trt_lib_version(repository_ctx, trt_install_path):
+    """Detects the library (e.g. libnvinfer) version of TensorRT.
+
+    Args:
+      repository_ctx: The repository context.
+      trt_install_path: The TensorRT library install directory.
+
+    Returns:
+      A string containing the library version of TensorRT.
+    """
+    trt_header_dir = _find_trt_header_dir(repository_ctx, trt_install_path)
+    major_version = find_cuda_define(
+        repository_ctx,
+        trt_header_dir,
+        "NvInfer.h",
+        _DEFINE_TENSORRT_SONAME_MAJOR,
+    )
+    minor_version = find_cuda_define(
+        repository_ctx,
+        trt_header_dir,
+        "NvInfer.h",
+        _DEFINE_TENSORRT_SONAME_MINOR,
+    )
+    patch_version = find_cuda_define(
+        repository_ctx,
+        trt_header_dir,
+        "NvInfer.h",
+        _DEFINE_TENSORRT_SONAME_PATCH,
+    )
+    full_version = "%s.%s.%s" % (major_version, minor_version, patch_version)
+    environ_version = repository_ctx.os.environ[_TF_TENSORRT_VERSION].strip()
+    if not matches_version(environ_version, full_version):
+        auto_configure_fail(
+            ("TensorRT library version detected from %s/%s (%s) does not match " +
+             "TF_TENSORRT_VERSION (%s). To fix this rerun configure again.") %
+            (trt_header_dir, "NvInfer.h", full_version, environ_version),
+        )
+
+    # Only use the major version to match the SONAME of the library.
+    return major_version
+
+def _find_trt_libs(repository_ctx, cpu_value, trt_install_path, trt_lib_version):
+    """Finds the given TensorRT library on the system.
+
+    Adapted from code contributed by Sami Kama (https://github.com/samikama).
+
+    Args:
+      repository_ctx: The repository context.
+      trt_install_path: The TensorRT library installation directory.
+      trt_lib_version: The version of TensorRT library files as returned
+        by _trt_lib_version.
+
+    Returns:
+      The path to the library.
+    """
+    result = {}
+    for lib in _TF_TENSORRT_LIBS:
+        file_name = lib_name("nvinfer", cpu_value, trt_lib_version)
+        path = find_lib(repository_ctx, ["%s/%s" % (trt_install_path, file_name)])
+        result[file_name] = path
+    return result
 
 def _tpl(repository_ctx, tpl, substitutions):
-  repository_ctx.template(tpl, Label("//third_party/tensorrt:%s.tpl" % tpl),
-                          substitutions)
-
+    repository_ctx.template(
+        tpl,
+        Label("//third_party/tensorrt:%s.tpl" % tpl),
+        substitutions,
+    )
 
 def _create_dummy_repository(repository_ctx):
-  """Create a dummy TensorRT repository."""
-  _tpl(repository_ctx, "build_defs.bzl", {"%{tensorrt_is_configured}": "False"})
-  substitutions = {
-      "%{tensorrt_genrules}": "",
-      "%{tensorrt_headers}": "",
-  }
-  for lib in _TF_TENSORRT_LIBS:
-    k = "%%{%s}" % lib.replace("nv", "nv_")
-    substitutions.update({k: ""})
-  _tpl(repository_ctx, "BUILD", substitutions)
+    """Create a dummy TensorRT repository."""
+    _tpl(repository_ctx, "build_defs.bzl", {"%{if_tensorrt}": "if_false"})
 
+    _tpl(repository_ctx, "BUILD", {
+        "%{tensorrt_genrules}": "",
+        "%{tensorrt_headers}": "[]",
+        "%{tensorrt_libs}": "[]",
+    })
 
 def _tensorrt_configure_impl(repository_ctx):
-  """Implementation of the tensorrt_configure repository rule."""
-  if _TENSORRT_INSTALL_PATH not in repository_ctx.os.environ:
-    _create_dummy_repository(repository_ctx)
-    return
-
-  if (get_cpu_value(repository_ctx) != "Linux"):
-    auto_configure_fail("TensorRT is supported only on Linux.")
-  if _TF_TENSORRT_VERSION not in repository_ctx.os.environ:
-    auto_configure_fail("TensorRT library (libnvinfer) version is not set.")
-  trt_install_path = repository_ctx.os.environ[_TENSORRT_INSTALL_PATH].strip()
-  if not repository_ctx.path(trt_install_path).exists:
-    auto_configure_fail(
-        "Cannot find TensorRT install path %s." % trt_install_path)
-
-  # Set up the symbolic links for the library files.
-  trt_lib_version = _trt_lib_version(repository_ctx, trt_install_path)
-  trt_libs = _find_trt_libs(repository_ctx, trt_install_path, trt_lib_version)
-  trt_lib_src = []
-  trt_lib_dest = []
-  for lib in trt_libs.values():
-    trt_lib_src.append(lib.src_file_path)
-    trt_lib_dest.append(lib.dst_file_name)
-  genrules = [
-      symlink_genrule_for_dir(repository_ctx, None, "tensorrt/lib/",
-                              "tensorrt_lib", trt_lib_src, trt_lib_dest)
-  ]
-
-  # Set up the symbolic links for the header files.
-  trt_header_dir = _find_trt_header_dir(repository_ctx, trt_install_path)
-  src_files = [
-      "%s/%s" % (trt_header_dir, header) for header in _TF_TENSORRT_HEADERS
-  ]
-  dest_files = _TF_TENSORRT_HEADERS
-  genrules.append(
-      symlink_genrule_for_dir(repository_ctx, None, "tensorrt/include/",
-                              "tensorrt_include", src_files, dest_files))
-
-  # Set up config file.
-  _tpl(repository_ctx, "build_defs.bzl", {"%{tensorrt_is_configured}": "True"})
-
-  # Set up BUILD file.
-  substitutions = {
-      "%{tensorrt_genrules}": "\n".join(genrules),
-      "%{tensorrt_headers}": '":tensorrt_include"',
-  }
-  for lib in _TF_TENSORRT_LIBS:
-    k = "%%{%s}" % lib.replace("nv", "nv_")
-    v = '"tensorrt/lib/%s"' % trt_libs[lib].dst_file_name
-    substitutions.update({k: v})
-  _tpl(repository_ctx, "BUILD", substitutions)
-
+    """Implementation of the tensorrt_configure repository rule."""
+    if _TF_TENSORRT_CONFIG_REPO in repository_ctx.os.environ:
+        # Forward to the pre-configured remote repository.
+        remote_config_repo = repository_ctx.os.environ[_TF_TENSORRT_CONFIG_REPO]
+        repository_ctx.template("BUILD", Label(remote_config_repo + ":BUILD"), {})
+        repository_ctx.template(
+            "build_defs.bzl",
+            Label(remote_config_repo + ":build_defs.bzl"),
+            {},
+        )
+        return
+
+    if _TENSORRT_INSTALL_PATH not in repository_ctx.os.environ:
+        _create_dummy_repository(repository_ctx)
+        return
+
+    cpu_value = get_cpu_value(repository_ctx)
+    if (cpu_value != "Linux"):
+        auto_configure_fail("TensorRT is supported only on Linux.")
+    if _TF_TENSORRT_VERSION not in repository_ctx.os.environ:
+        auto_configure_fail("TensorRT library (libnvinfer) version is not set.")
+    trt_install_path = repository_ctx.os.environ[_TENSORRT_INSTALL_PATH].strip()
+    if not repository_ctx.path(trt_install_path).exists:
+        auto_configure_fail(
+            "Cannot find TensorRT install path %s." % trt_install_path,
+        )
+
+    # Copy the library files.
+    trt_lib_version = _trt_lib_version(repository_ctx, trt_install_path)
+    trt_libs = _find_trt_libs(repository_ctx, cpu_value, trt_install_path, trt_lib_version)
+    trt_lib_srcs = []
+    trt_lib_outs = []
+    for path in trt_libs.values():
+        trt_lib_srcs.append(str(path))
+        trt_lib_outs.append("tensorrt/lib/" + path.basename)
+    copy_rules = [make_copy_files_rule(
+        repository_ctx,
+        name = "tensorrt_lib",
+        srcs = trt_lib_srcs,
+        outs = trt_lib_outs,
+    )]
+
+    # Copy the header files header files.
+    trt_header_dir = _find_trt_header_dir(repository_ctx, trt_install_path)
+    trt_header_srcs = [
+        "%s/%s" % (trt_header_dir, header)
+        for header in _TF_TENSORRT_HEADERS
+    ]
+    trt_header_outs = [
+        "tensorrt/include/" + header
+        for header in _TF_TENSORRT_HEADERS
+    ]
+    copy_rules.append(
+        make_copy_files_rule(
+            repository_ctx,
+            name = "tensorrt_include",
+            srcs = trt_header_srcs,
+            outs = trt_header_outs,
+        ),
+    )
+
+    # Set up config file.
+    _tpl(repository_ctx, "build_defs.bzl", {"%{if_tensorrt}": "if_true"})
+
+    # Set up BUILD file.
+    _tpl(repository_ctx, "BUILD", {
+        "%{copy_rules}": "\n".join(copy_rules),
+        "%{tensorrt_headers}": '":tensorrt_include"',
+        "%{tensorrt_libs}": str(trt_lib_outs),
+    })
 
 tensorrt_configure = repository_rule(
-    implementation=_tensorrt_configure_impl,
-    environ=[
+    implementation = _tensorrt_configure_impl,
+    environ = [
         _TENSORRT_INSTALL_PATH,
         _TF_TENSORRT_VERSION,
     ],
diff --git a/third_party/toolchains/BUILD b/third_party/toolchains/BUILD
index 9da417fd5fe18619de6dc51032b8e3cde21b6ffb..0e067708a8b27c07c16b7848a426e45f6e6bb605 100644
--- a/third_party/toolchains/BUILD
+++ b/third_party/toolchains/BUILD
@@ -4,10 +4,20 @@ package(default_visibility = ["//visibility:public"])
 
 load("//third_party/toolchains/preconfig/generate:containers.bzl", "container_digests")
 
-# Platform for use with remote execution with
-# custom container based off RBE Ubuntu16_04
-# http://gcr.io/cloud-marketplace/google/rbe-ubuntu16-04
-# Built with //tensorflow/tools/ci_build/Dockerfile.rbe.cpu
+# Constraint used for platforms below so we can force certain rules to be executed
+# on specific platforms.
+constraint_setting(name = "custom_platforms")
+
+# Constraint for platforms that allow GPU testing (i.e. have a GPU available).
+# This is used in exec_compatible_with of rules that need GPU access.
+constraint_value(
+    name = "gpu_test",
+    constraint_setting = ":custom_platforms",
+)
+
+# TODO(b/122347293): This is the RBE config based on the CPU configuration / image provided
+# in the asci-toolchain setup. Delete this once we switched CPU remote builds to the
+# new platform below.
 platform(
     name = "rbe_ubuntu16_04-tf",
     constraint_values = [
@@ -20,9 +30,39 @@ platform(
         properties: {
             name: "container-image"
             value:"docker://gcr.io/asci-toolchain/nosla-ubuntu16_04-tf@sha256:63a0e981a4e7ce5da2a851cf063e430f72947fd999d9336b7e54e2eebe8e0bf5"
-        }""",
+        }
+        properties: {
+            name: "Pool"
+            value: "default"
+        }
+        """,
+)
+
+# Remote build platforms.
+# Each of the platform rules here provide a platform definition that is bound to a docker image.
+# The result of the skylark configuration is checked into
+# //tensorflow/third_party/toolchains/preconfig.
+
+# Built with //tensorflow/tools/ci_build/Dockerfile.rbe.cpu.
+platform(
+    name = "rbe_ubuntu16.04",
+    constraint_values = [
+        "@bazel_tools//platforms:x86_64",
+        "@bazel_tools//platforms:linux",
+    ],
+    remote_execution_properties = """
+        properties: {
+            name: "container-image"
+            value:"docker://gcr.io/tensorflow-testing/nosla-ubuntu16.04@%s"
+        }
+        properties: {
+            name: "Pool"
+            value: "default"
+        }
+        """ % container_digests["ubuntu16.04"],
 )
 
+# Built with //tensorflow/tools/ci_build/Dockerfile.rbe.cuda9.0-cudnn7-ubuntu14.04.
 platform(
     name = "rbe_cuda9.0-cudnn7-ubuntu14.04",
     constraint_values = [
@@ -32,10 +72,16 @@ platform(
     remote_execution_properties = """
         properties: {
             name: "container-image"
-            value:"docker://gcr.io/asci-toolchain/nosla-cuda9.0-cudnn7-ubuntu14.04@%s"
-        }""" % container_digests["cuda9.0-cudnn7-ubuntu14.04"],
+            value:"docker://gcr.io/tensorflow-testing/nosla-cuda9.0-cudnn7-ubuntu14.04@%s"
+        }
+        properties: {
+            name: "Pool"
+            value: "default"
+        }
+        """ % container_digests["cuda9.0-cudnn7-ubuntu14.04"],
 )
 
+# Built with //tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.0-cudnn7-ubuntu14.04.
 platform(
     name = "rbe_cuda10.0-cudnn7-ubuntu14.04",
     constraint_values = [
@@ -45,6 +91,35 @@ platform(
     remote_execution_properties = """
         properties: {
             name: "container-image"
-            value:"docker://gcr.io/asci-toolchain/nosla-cuda10.0-cudnn7-ubuntu14.04@%s"
-        }""" % container_digests["cuda10.0-cudnn7-ubuntu14.04"],
+            value:"docker://gcr.io/tensorflow-testing/nosla-cuda10.0-cudnn7-ubuntu14.04@%s"
+        }
+        properties: {
+            name: "Pool"
+            value: "default"
+        }
+        """ % container_digests["cuda10.0-cudnn7-ubuntu14.04"],
+)
+
+# The above platform with GPU support.
+platform(
+    name = "rbe_cuda10.0-cudnn7-ubuntu14.04-gpu",
+    constraint_values = [
+        "@bazel_tools//platforms:x86_64",
+        "@bazel_tools//platforms:linux",
+        ":gpu_test",
+    ],
+    remote_execution_properties = """
+        properties: {
+            name: "container-image"
+            value: "docker://gcr.io/tensorflow-testing/nosla-cuda10.0-cudnn7-ubuntu14.04@%s"
+        }
+        properties: {
+            name: "dockerRuntime"
+            value: "nvidia"
+        }
+        properties: {
+            name: "Pool"
+            value: "gpu-pool"
+        }
+        """ % container_digests["cuda10.0-cudnn7-ubuntu14.04"],
 )
diff --git a/third_party/toolchains/clang6/repo.bzl b/third_party/toolchains/clang6/repo.bzl
index b81f44506f382a6f2e4bfc005249c1a56cf0ae2f..e4b6422c96d749f5b84211ee9911069bc7e62e9b 100644
--- a/third_party/toolchains/clang6/repo.bzl
+++ b/third_party/toolchains/clang6/repo.bzl
@@ -1,30 +1,37 @@
 """Repository rule for Debian 8 Jessie Clang-6.0 portable Linux builds."""
 
 def _clang6_configure(ctx):
-  # TODO(jart): It'd probably be better to use Bazel's struct.to_proto()
-  #             method to generate a gigantic CROSSTOOL file that allows
-  #             Clang to support everything.
-  ctx.symlink(
-      ctx.os.environ.get('TF_LLVM_PATH',
-                         '/usr/lib/llvm-6.0'),
-      'clang6/llvm')
-  ctx.symlink(
-      ctx.os.environ.get('STRIP', '/usr/bin/strip'),
-      'clang6/sbin/strip')
-  ctx.symlink(
-      ctx.os.environ.get('OBJDUMP', '/usr/bin/objdump'),
-      'clang6/sbin/objdump')
-  ctx.symlink(ctx.attr._build, 'clang6/BUILD')
-  ctx.template('clang6/CROSSTOOL', ctx.attr._crosstool, {
-      '%package(@local_config_clang6//clang6)%': str(ctx.path('clang6')),
-  })
+    # TODO(jart): It'd probably be better to use Bazel's struct.to_proto()
+    #             method to generate a gigantic CROSSTOOL file that allows
+    #             Clang to support everything.
+    ctx.symlink(
+        ctx.os.environ.get(
+            "TF_LLVM_PATH",
+            "/usr/lib/llvm-6.0",
+        ),
+        "clang6/llvm",
+    )
+    ctx.symlink(
+        ctx.os.environ.get("STRIP", "/usr/bin/strip"),
+        "clang6/sbin/strip",
+    )
+    ctx.symlink(
+        ctx.os.environ.get("OBJDUMP", "/usr/bin/objdump"),
+        "clang6/sbin/objdump",
+    )
+    ctx.symlink(ctx.attr._build, "clang6/BUILD")
+    ctx.template("clang6/CROSSTOOL", ctx.attr._crosstool, {
+        "%package(@local_config_clang6//clang6)%": str(ctx.path("clang6")),
+    })
 
 clang6_configure = repository_rule(
     implementation = _clang6_configure,
     attrs = {
-        '_build': attr.label(
-            default=str(Label('//third_party/toolchains/clang6:clang.BUILD'))),
-        '_crosstool': attr.label(
-            default=str(Label('//third_party/toolchains/clang6:CROSSTOOL.tpl'))),
+        "_build": attr.label(
+            default = str(Label("//third_party/toolchains/clang6:clang.BUILD")),
+        ),
+        "_crosstool": attr.label(
+            default = str(Label("//third_party/toolchains/clang6:CROSSTOOL.tpl")),
+        ),
     },
 )
diff --git a/third_party/toolchains/cpus/arm/BUILD b/third_party/toolchains/cpus/arm/BUILD
index 10c7867c233e9ffb865101eef081a38a493fd4d6..efed6972395c7e6f496f797e5e7463f2f2778438 100644
--- a/third_party/toolchains/cpus/arm/BUILD
+++ b/third_party/toolchains/cpus/arm/BUILD
@@ -5,6 +5,11 @@ cc_toolchain_suite(
     toolchains = {
         "armeabi|compiler": ":cc-compiler-armeabi",
         "local|compiler": ":cc-compiler-local",
+        "armeabi": ":cc-compiler-armeabi",
+        "k8": ":cc-compiler-local",
+        "piii": ":cc-compiler-local",
+        "arm": ":cc-compiler-local",
+        "s390x": ":cc-compiler-local",
     },
 )
 
@@ -32,6 +37,7 @@ cc_toolchain(
     static_runtime_libs = [":empty"],
     strip_files = ":empty",
     supports_param_files = 1,
+    toolchain_identifier = "local_linux",
 )
 
 cc_toolchain(
@@ -46,4 +52,5 @@ cc_toolchain(
     static_runtime_libs = [":empty"],
     strip_files = "arm_linux_all_files",
     supports_param_files = 1,
+    toolchain_identifier = "arm-linux-gnueabihf",
 )
diff --git a/third_party/toolchains/cpus/arm/CROSSTOOL.tpl b/third_party/toolchains/cpus/arm/CROSSTOOL.tpl
index f0e17d1fe065703e3ff5574cd1d1d94d322a66a8..8d51e9b0c6f9eb875d37b502a99327667f5078cc 100644
--- a/third_party/toolchains/cpus/arm/CROSSTOOL.tpl
+++ b/third_party/toolchains/cpus/arm/CROSSTOOL.tpl
@@ -2,14 +2,6 @@ major_version: "local"
 minor_version: ""
 default_target_cpu: "same_as_host"
 
-default_toolchain {
-  cpu: "k8"
-  toolchain_identifier: "local_linux"
-}
-default_toolchain {
-  cpu: "piii"
-  toolchain_identifier: "local_linux"
-}
 default_toolchain {
   cpu: "darwin"
   toolchain_identifier: "local_darwin"
@@ -18,14 +10,6 @@ default_toolchain {
   cpu: "freebsd"
   toolchain_identifier: "local_freebsd"
 }
-default_toolchain {
-  cpu: "armeabi"
-  toolchain_identifier: "arm-linux-gnueabihf"
-}
-default_toolchain {
-  cpu: "arm"
-  toolchain_identifier: "local_linux"
-}
 default_toolchain {
   cpu: "x64_windows"
   toolchain_identifier: "local_windows_msys64"
@@ -34,10 +18,6 @@ default_toolchain {
   cpu: "x64_windows_msvc"
   toolchain_identifier: "vc_14_0_x64"
 }
-default_toolchain {
-  cpu: "s390x"
-  toolchain_identifier: "local_linux"
-}
 
 toolchain {
   abi_version: "armeabi"
diff --git a/third_party/toolchains/cpus/arm/arm_compiler_configure.bzl b/third_party/toolchains/cpus/arm/arm_compiler_configure.bzl
index ab6eac115ce5523a66dd3db66a89dd7c24281ac1..d675e95f70fd31cc73c6c882e9decff3e8991df8 100644
--- a/third_party/toolchains/cpus/arm/arm_compiler_configure.bzl
+++ b/third_party/toolchains/cpus/arm/arm_compiler_configure.bzl
@@ -1,38 +1,38 @@
 # -*- Python -*-
 """Repository rule for arm compiler autoconfiguration."""
 
-def _tpl(repository_ctx, tpl, substitutions={}, out=None):
-  if not out:
-    out = tpl
-  repository_ctx.template(
-      out,
-      Label("//third_party/toolchains/cpus/arm:%s.tpl" % tpl),
-      substitutions)
-
+def _tpl(repository_ctx, tpl, substitutions = {}, out = None):
+    if not out:
+        out = tpl
+    repository_ctx.template(
+        out,
+        Label("//third_party/toolchains/cpus/arm:%s.tpl" % tpl),
+        substitutions,
+    )
 
 def _arm_compiler_configure_impl(repository_ctx):
-  # We need to find a cross-compilation include directory for Python, so look
-  # for an environment variable. Be warned, this crosstool template is only
-  # regenerated on the first run of Bazel, so if you change the variable after
-  # it may not be reflected in later builds. Doing a shutdown and clean of Bazel
-  # doesn't fix this, you'll need to delete the generated file at something like:
-  # external/local_config_arm_compiler/CROSSTOOL in your Bazel install.
-  if "CROSSTOOL_PYTHON_INCLUDE_PATH" in repository_ctx.os.environ:
-    python_include_path = repository_ctx.os.environ["CROSSTOOL_PYTHON_INCLUDE_PATH"]
-  else:
-    python_include_path = "/usr/include/python2.7"
-  _tpl(repository_ctx, "CROSSTOOL", {
-      "%{ARM_COMPILER_PATH}%": str(repository_ctx.path(
-          repository_ctx.attr.remote_config_repo)),
-      "%{PYTHON_INCLUDE_PATH}%": python_include_path,
-  })
-  repository_ctx.symlink(repository_ctx.attr.build_file, "BUILD")
-
+    # We need to find a cross-compilation include directory for Python, so look
+    # for an environment variable. Be warned, this crosstool template is only
+    # regenerated on the first run of Bazel, so if you change the variable after
+    # it may not be reflected in later builds. Doing a shutdown and clean of Bazel
+    # doesn't fix this, you'll need to delete the generated file at something like:
+    # external/local_config_arm_compiler/CROSSTOOL in your Bazel install.
+    if "CROSSTOOL_PYTHON_INCLUDE_PATH" in repository_ctx.os.environ:
+        python_include_path = repository_ctx.os.environ["CROSSTOOL_PYTHON_INCLUDE_PATH"]
+    else:
+        python_include_path = "/usr/include/python2.7"
+    _tpl(repository_ctx, "CROSSTOOL", {
+        "%{ARM_COMPILER_PATH}%": str(repository_ctx.path(
+            repository_ctx.attr.remote_config_repo,
+        )),
+        "%{PYTHON_INCLUDE_PATH}%": python_include_path,
+    })
+    repository_ctx.symlink(repository_ctx.attr.build_file, "BUILD")
 
 arm_compiler_configure = repository_rule(
     implementation = _arm_compiler_configure_impl,
     attrs = {
-        "remote_config_repo": attr.string(mandatory = False, default =""),
+        "remote_config_repo": attr.string(mandatory = False, default = ""),
         "build_file": attr.label(),
     },
 )
diff --git a/third_party/toolchains/gpus/cuda/BUILD b/third_party/toolchains/gpus/cuda/BUILD
index f63a0ea81925783085b1b551aab778d41ba1fb2c..8bb22c0269b5c4bfc21ea60c6605ac75ba072595 100644
--- a/third_party/toolchains/gpus/cuda/BUILD
+++ b/third_party/toolchains/gpus/cuda/BUILD
@@ -85,8 +85,8 @@ cc_library(
 
 cc_library(
     name = "cudart",
-    srcs = ["cuda/lib/libcudart.so.9.0"],
-    data = ["cuda/lib/libcudart.so.9.0"],
+    srcs = ["cuda/lib/libcudart.so.10.0"],
+    data = ["cuda/lib/libcudart.so.10.0"],
     includes = [
         ".",
         "cuda/include",
@@ -97,8 +97,8 @@ cc_library(
 
 cc_library(
     name = "cublas",
-    srcs = ["cuda/lib/libcublas.so.9.0"],
-    data = ["cuda/lib/libcublas.so.9.0"],
+    srcs = ["cuda/lib/libcublas.so.10.0"],
+    data = ["cuda/lib/libcublas.so.10.0"],
     includes = [
         ".",
         "cuda/include",
@@ -109,8 +109,8 @@ cc_library(
 
 cc_library(
     name = "cusolver",
-    srcs = ["cuda/lib/libcusolver.so.9.0"],
-    data = ["cuda/lib/libcusolver.so.9.0"],
+    srcs = ["cuda/lib/libcusolver.so.10.0"],
+    data = ["cuda/lib/libcusolver.so.10.0"],
     includes = [
         ".",
         "cuda/include",
@@ -143,8 +143,8 @@ cc_library(
 
 cc_library(
     name = "cufft",
-    srcs = ["cuda/lib/libcufft.so.9.0"],
-    data = ["cuda/lib/libcufft.so.9.0"],
+    srcs = ["cuda/lib/libcufft.so.10.0"],
+    data = ["cuda/lib/libcufft.so.10.0"],
     includes = [
         ".",
         "cuda/include",
@@ -155,8 +155,8 @@ cc_library(
 
 cc_library(
     name = "curand",
-    srcs = ["cuda/lib/libcurand.so.9.0"],
-    data = ["cuda/lib/libcurand.so.9.0"],
+    srcs = ["cuda/lib/libcurand.so.10.0"],
+    data = ["cuda/lib/libcurand.so.10.0"],
     includes = [
         ".",
         "cuda/include",
@@ -193,7 +193,7 @@ cc_library(
 
 cc_library(
     name = "cupti_dsos",
-    data = ["cuda/lib/libcupti.so.9.0"],
+    data = ["cuda/lib/libcupti.so.10.0"],
     includes = [
         ".",
         "cuda/include",
@@ -1193,7 +1193,7 @@ genrule(
         "cuda/include/vector_types.h",
     ],
     cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-9.0/include/CL/cl.h" "$(@D)/cuda/include/CL/cl.h" && cp "/usr/local/cuda-9.0/include/CL/cl.hpp" "$(@D)/cuda/include/CL/cl.hpp" && cp "/usr/local/cuda-9.0/include/CL/cl_egl.h" "$(@D)/cuda/include/CL/cl_egl.h" && cp "/usr/local/cuda-9.0/include/CL/cl_ext.h" "$(@D)/cuda/include/CL/cl_ext.h" && cp "/usr/local/cuda-9.0/include/CL/cl_gl.h" "$(@D)/cuda/include/CL/cl_gl.h" && cp "/usr/local/cuda-9.0/include/CL/cl_gl_ext.h" "$(@D)/cuda/include/CL/cl_gl_ext.h" && cp "/usr/local/cuda-9.0/include/CL/cl_platform.h" "$(@D)/cuda/include/CL/cl_platform.h" && cp "/usr/local/cuda-9.0/include/CL/opencl.h" "$(@D)/cuda/include/CL/opencl.h" && cp "/usr/local/cuda-9.0/include/builtin_types.h" "$(@D)/cuda/include/builtin_types.h" && cp "/usr/local/cuda-9.0/include/channel_descriptor.h" "$(@D)/cuda/include/channel_descriptor.h" && cp "/usr/local/cuda-9.0/include/common_functions.h" "$(@D)/cuda/include/common_functions.h" && cp "/usr/local/cuda-9.0/include/cooperative_groups.h" "$(@D)/cuda/include/cooperative_groups.h" && cp "/usr/local/cuda-9.0/include/cooperative_groups_helpers.h" "$(@D)/cuda/include/cooperative_groups_helpers.h" && cp "/usr/local/cuda-9.0/include/crt/common_functions.h" "$(@D)/cuda/include/crt/common_functions.h" && cp "/usr/local/cuda-9.0/include/crt/device_double_functions.h" "$(@D)/cuda/include/crt/device_double_functions.h" && cp "/usr/local/cuda-9.0/include/crt/device_double_functions.hpp" "$(@D)/cuda/include/crt/device_double_functions.hpp" && cp "/usr/local/cuda-9.0/include/crt/device_functions.h" "$(@D)/cuda/include/crt/device_functions.h" && cp "/usr/local/cuda-9.0/include/crt/device_functions.hpp" "$(@D)/cuda/include/crt/device_functions.hpp" && cp "/usr/local/cuda-9.0/include/crt/func_macro.h" "$(@D)/cuda/include/crt/func_macro.h" && cp "/usr/local/cuda-9.0/include/crt/host_config.h" "$(@D)/cuda/include/crt/host_config.h" && cp "/usr/local/cuda-9.0/include/crt/host_defines.h" "$(@D)/cuda/include/crt/host_defines.h" && cp "/usr/local/cuda-9.0/include/crt/host_runtime.h" "$(@D)/cuda/include/crt/host_runtime.h" && cp "/usr/local/cuda-9.0/include/crt/math_functions.h" "$(@D)/cuda/include/crt/math_functions.h" && cp "/usr/local/cuda-9.0/include/crt/math_functions.hpp" "$(@D)/cuda/include/crt/math_functions.hpp" && cp "/usr/local/cuda-9.0/include/crt/mma.h" "$(@D)/cuda/include/crt/mma.h" && cp "/usr/local/cuda-9.0/include/crt/mma.hpp" "$(@D)/cuda/include/crt/mma.hpp" && cp "/usr/local/cuda-9.0/include/crt/nvfunctional" "$(@D)/cuda/include/crt/nvfunctional" && cp "/usr/local/cuda-9.0/include/crt/sm_70_rt.h" "$(@D)/cuda/include/crt/sm_70_rt.h" && cp "/usr/local/cuda-9.0/include/crt/sm_70_rt.hpp" "$(@D)/cuda/include/crt/sm_70_rt.hpp" && cp "/usr/local/cuda-9.0/include/crt/storage_class.h" "$(@D)/cuda/include/crt/storage_class.h" && cp "/usr/local/cuda-9.0/include/cuComplex.h" "$(@D)/cuda/include/cuComplex.h" && cp "/usr/local/cuda-9.0/include/cublas.h" "$(@D)/cuda/include/cublas.h" && cp "/usr/local/cuda-9.0/include/cublasXt.h" "$(@D)/cuda/include/cublasXt.h" && cp "/usr/local/cuda-9.0/include/cublas_api.h" "$(@D)/cuda/include/cublas_api.h" && cp "/usr/local/cuda-9.0/include/cublas_v2.h" "$(@D)/cuda/include/cublas_v2.h" && cp "/usr/local/cuda-9.0/include/cuda.h" "$(@D)/cuda/include/cuda.h" && cp "/usr/local/cuda-9.0/include/cudaEGL.h" "$(@D)/cuda/include/cudaEGL.h" && cp "/usr/local/cuda-9.0/include/cudaGL.h" "$(@D)/cuda/include/cudaGL.h" && cp "/usr/local/cuda-9.0/include/cudaProfiler.h" "$(@D)/cuda/include/cudaProfiler.h" && cp "/usr/local/cuda-9.0/include/cudaVDPAU.h" "$(@D)/cuda/include/cudaVDPAU.h" && cp "/usr/local/cuda-9.0/include/cuda_device_runtime_api.h" "$(@D)/cuda/include/cuda_device_runtime_api.h" && cp "/usr/local/cuda-9.0/include/cuda_fp16.h" "$(@D)/cuda/include/cuda_fp16.h" && cp "/usr/local/cuda-9.0/include/cuda_fp16.hpp" "$(@D)/cuda/include/cuda_fp16.hpp" && cp "/usr/local/cuda-9.0/include/cuda_gl_interop.h" "$(@D)/cuda/include/cuda_gl_interop.h" && cp "/usr/local/cuda-9.0/include/cuda_occupancy.h" "$(@D)/cuda/include/cuda_occupancy.h" && cp "/usr/local/cuda-9.0/include/cuda_profiler_api.h" "$(@D)/cuda/include/cuda_profiler_api.h" && cp "/usr/local/cuda-9.0/include/cuda_runtime.h" "$(@D)/cuda/include/cuda_runtime.h" && cp "/usr/local/cuda-9.0/include/cuda_runtime_api.h" "$(@D)/cuda/include/cuda_runtime_api.h" && cp "/usr/local/cuda-9.0/include/cuda_surface_types.h" "$(@D)/cuda/include/cuda_surface_types.h" && cp "/usr/local/cuda-9.0/include/cuda_texture_types.h" "$(@D)/cuda/include/cuda_texture_types.h" && cp "/usr/local/cuda-9.0/include/cuda_vdpau_interop.h" "$(@D)/cuda/include/cuda_vdpau_interop.h" && cp "/usr/local/cuda-9.0/include/cudalibxt.h" "$(@D)/cuda/include/cudalibxt.h" && cp "/usr/local/cuda-9.0/include/cudnn.h" "$(@D)/cuda/include/cudnn.h" && cp "/usr/local/cuda-9.0/include/cufft.h" "$(@D)/cuda/include/cufft.h" && cp "/usr/local/cuda-9.0/include/cufftXt.h" "$(@D)/cuda/include/cufftXt.h" && cp "/usr/local/cuda-9.0/include/cufftw.h" "$(@D)/cuda/include/cufftw.h" && cp "/usr/local/cuda-9.0/include/curand.h" "$(@D)/cuda/include/curand.h" && cp "/usr/local/cuda-9.0/include/curand_discrete.h" "$(@D)/cuda/include/curand_discrete.h" && cp "/usr/local/cuda-9.0/include/curand_discrete2.h" "$(@D)/cuda/include/curand_discrete2.h" && cp "/usr/local/cuda-9.0/include/curand_globals.h" "$(@D)/cuda/include/curand_globals.h" && cp "/usr/local/cuda-9.0/include/curand_kernel.h" "$(@D)/cuda/include/curand_kernel.h" && cp "/usr/local/cuda-9.0/include/curand_lognormal.h" "$(@D)/cuda/include/curand_lognormal.h" && cp "/usr/local/cuda-9.0/include/curand_mrg32k3a.h" "$(@D)/cuda/include/curand_mrg32k3a.h" && cp "/usr/local/cuda-9.0/include/curand_mtgp32.h" "$(@D)/cuda/include/curand_mtgp32.h" && cp "/usr/local/cuda-9.0/include/curand_mtgp32_host.h" "$(@D)/cuda/include/curand_mtgp32_host.h" && cp "/usr/local/cuda-9.0/include/curand_mtgp32_kernel.h" "$(@D)/cuda/include/curand_mtgp32_kernel.h" && cp "/usr/local/cuda-9.0/include/curand_mtgp32dc_p_11213.h" "$(@D)/cuda/include/curand_mtgp32dc_p_11213.h" && cp "/usr/local/cuda-9.0/include/curand_normal.h" "$(@D)/cuda/include/curand_normal.h" && cp "/usr/local/cuda-9.0/include/curand_normal_static.h" "$(@D)/cuda/include/curand_normal_static.h" && cp "/usr/local/cuda-9.0/include/curand_philox4x32_x.h" "$(@D)/cuda/include/curand_philox4x32_x.h" && cp "/usr/local/cuda-9.0/include/curand_poisson.h" "$(@D)/cuda/include/curand_poisson.h" && cp "/usr/local/cuda-9.0/include/curand_precalc.h" "$(@D)/cuda/include/curand_precalc.h" && cp "/usr/local/cuda-9.0/include/curand_uniform.h" "$(@D)/cuda/include/curand_uniform.h" && cp "/usr/local/cuda-9.0/include/cusolverDn.h" "$(@D)/cuda/include/cusolverDn.h" && cp "/usr/local/cuda-9.0/include/cusolverRf.h" "$(@D)/cuda/include/cusolverRf.h" && cp "/usr/local/cuda-9.0/include/cusolverSp.h" "$(@D)/cuda/include/cusolverSp.h" && cp "/usr/local/cuda-9.0/include/cusolverSp_LOWLEVEL_PREVIEW.h" "$(@D)/cuda/include/cusolverSp_LOWLEVEL_PREVIEW.h" && cp "/usr/local/cuda-9.0/include/cusolver_common.h" "$(@D)/cuda/include/cusolver_common.h" && cp "/usr/local/cuda-9.0/include/cusparse.h" "$(@D)/cuda/include/cusparse.h" && cp "/usr/local/cuda-9.0/include/cusparse_v2.h" "$(@D)/cuda/include/cusparse_v2.h" && cp "/usr/local/cuda-9.0/include/device_atomic_functions.h" "$(@D)/cuda/include/device_atomic_functions.h" && cp "/usr/local/cuda-9.0/include/device_atomic_functions.hpp" "$(@D)/cuda/include/device_atomic_functions.hpp" && cp "/usr/local/cuda-9.0/include/device_double_functions.h" "$(@D)/cuda/include/device_double_functions.h" && cp "/usr/local/cuda-9.0/include/device_double_functions.hpp" "$(@D)/cuda/include/device_double_functions.hpp" && cp "/usr/local/cuda-9.0/include/device_functions.h" "$(@D)/cuda/include/device_functions.h" && cp "/usr/local/cuda-9.0/include/device_functions.hpp" "$(@D)/cuda/include/device_functions.hpp" && cp "/usr/local/cuda-9.0/include/device_functions_decls.h" "$(@D)/cuda/include/device_functions_decls.h" && cp "/usr/local/cuda-9.0/include/device_launch_parameters.h" "$(@D)/cuda/include/device_launch_parameters.h" && cp "/usr/local/cuda-9.0/include/device_types.h" "$(@D)/cuda/include/device_types.h" && cp "/usr/local/cuda-9.0/include/driver_functions.h" "$(@D)/cuda/include/driver_functions.h" && cp "/usr/local/cuda-9.0/include/driver_types.h" "$(@D)/cuda/include/driver_types.h" && cp "/usr/local/cuda-9.0/include/dynlink_cuda.h" "$(@D)/cuda/include/dynlink_cuda.h" && cp "/usr/local/cuda-9.0/include/dynlink_cuda_cuda.h" "$(@D)/cuda/include/dynlink_cuda_cuda.h" && cp "/usr/local/cuda-9.0/include/dynlink_cuviddec.h" "$(@D)/cuda/include/dynlink_cuviddec.h" && cp "/usr/local/cuda-9.0/include/dynlink_nvcuvid.h" "$(@D)/cuda/include/dynlink_nvcuvid.h" && cp "/usr/local/cuda-9.0/include/fatBinaryCtl.h" "$(@D)/cuda/include/fatBinaryCtl.h" && cp "/usr/local/cuda-9.0/include/fatbinary.h" "$(@D)/cuda/include/fatbinary.h" && cp "/usr/local/cuda-9.0/include/host_config.h" "$(@D)/cuda/include/host_config.h" && cp "/usr/local/cuda-9.0/include/host_defines.h" "$(@D)/cuda/include/host_defines.h" && cp "/usr/local/cuda-9.0/include/library_types.h" "$(@D)/cuda/include/library_types.h" && cp "/usr/local/cuda-9.0/include/math_constants.h" "$(@D)/cuda/include/math_constants.h" && cp "/usr/local/cuda-9.0/include/math_functions.h" "$(@D)/cuda/include/math_functions.h" && cp "/usr/local/cuda-9.0/include/math_functions.hpp" "$(@D)/cuda/include/math_functions.hpp" && cp "/usr/local/cuda-9.0/include/math_functions_dbl_ptx3.h" "$(@D)/cuda/include/math_functions_dbl_ptx3.h" && cp "/usr/local/cuda-9.0/include/math_functions_dbl_ptx3.hpp" "$(@D)/cuda/include/math_functions_dbl_ptx3.hpp" && cp "/usr/local/cuda-9.0/include/mma.h" "$(@D)/cuda/include/mma.h" && cp "/usr/local/cuda-9.0/include/npp.h" "$(@D)/cuda/include/npp.h" && cp "/usr/local/cuda-9.0/include/nppcore.h" "$(@D)/cuda/include/nppcore.h" && cp "/usr/local/cuda-9.0/include/nppdefs.h" "$(@D)/cuda/include/nppdefs.h" && cp "/usr/local/cuda-9.0/include/nppi.h" "$(@D)/cuda/include/nppi.h" && cp "/usr/local/cuda-9.0/include/nppi_arithmetic_and_logical_operations.h" "$(@D)/cuda/include/nppi_arithmetic_and_logical_operations.h" && cp "/usr/local/cuda-9.0/include/nppi_color_conversion.h" "$(@D)/cuda/include/nppi_color_conversion.h" && cp "/usr/local/cuda-9.0/include/nppi_compression_functions.h" "$(@D)/cuda/include/nppi_compression_functions.h" && cp "/usr/local/cuda-9.0/include/nppi_computer_vision.h" "$(@D)/cuda/include/nppi_computer_vision.h" && cp "/usr/local/cuda-9.0/include/nppi_data_exchange_and_initialization.h" "$(@D)/cuda/include/nppi_data_exchange_and_initialization.h" && cp "/usr/local/cuda-9.0/include/nppi_filtering_functions.h" "$(@D)/cuda/include/nppi_filtering_functions.h" && cp "/usr/local/cuda-9.0/include/nppi_geometry_transforms.h" "$(@D)/cuda/include/nppi_geometry_transforms.h" && cp "/usr/local/cuda-9.0/include/nppi_linear_transforms.h" "$(@D)/cuda/include/nppi_linear_transforms.h" && cp "/usr/local/cuda-9.0/include/nppi_morphological_operations.h" "$(@D)/cuda/include/nppi_morphological_operations.h" && cp "/usr/local/cuda-9.0/include/nppi_statistics_functions.h" "$(@D)/cuda/include/nppi_statistics_functions.h" && cp "/usr/local/cuda-9.0/include/nppi_support_functions.h" "$(@D)/cuda/include/nppi_support_functions.h" && cp "/usr/local/cuda-9.0/include/nppi_threshold_and_compare_operations.h" "$(@D)/cuda/include/nppi_threshold_and_compare_operations.h" && cp "/usr/local/cuda-9.0/include/npps.h" "$(@D)/cuda/include/npps.h" && cp "/usr/local/cuda-9.0/include/npps_arithmetic_and_logical_operations.h" "$(@D)/cuda/include/npps_arithmetic_and_logical_operations.h" && cp "/usr/local/cuda-9.0/include/npps_conversion_functions.h" "$(@D)/cuda/include/npps_conversion_functions.h" && cp "/usr/local/cuda-9.0/include/npps_filtering_functions.h" "$(@D)/cuda/include/npps_filtering_functions.h" && cp "/usr/local/cuda-9.0/include/npps_initialization.h" "$(@D)/cuda/include/npps_initialization.h" && cp "/usr/local/cuda-9.0/include/npps_statistics_functions.h" "$(@D)/cuda/include/npps_statistics_functions.h" && cp "/usr/local/cuda-9.0/include/npps_support_functions.h" "$(@D)/cuda/include/npps_support_functions.h" && cp "/usr/local/cuda-9.0/include/nppversion.h" "$(@D)/cuda/include/nppversion.h" && cp "/usr/local/cuda-9.0/include/nvToolsExt.h" "$(@D)/cuda/include/nvToolsExt.h" && cp "/usr/local/cuda-9.0/include/nvToolsExtCuda.h" "$(@D)/cuda/include/nvToolsExtCuda.h" && cp "/usr/local/cuda-9.0/include/nvToolsExtCudaRt.h" "$(@D)/cuda/include/nvToolsExtCudaRt.h" && cp "/usr/local/cuda-9.0/include/nvToolsExtMeta.h" "$(@D)/cuda/include/nvToolsExtMeta.h" && cp "/usr/local/cuda-9.0/include/nvToolsExtSync.h" "$(@D)/cuda/include/nvToolsExtSync.h" && cp "/usr/local/cuda-9.0/include/nvblas.h" "$(@D)/cuda/include/nvblas.h" && cp "/usr/local/cuda-9.0/include/nvfunctional" "$(@D)/cuda/include/nvfunctional" && cp "/usr/local/cuda-9.0/include/nvgraph.h" "$(@D)/cuda/include/nvgraph.h" && cp "/usr/local/cuda-9.0/include/nvml.h" "$(@D)/cuda/include/nvml.h" && cp "/usr/local/cuda-9.0/include/nvrtc.h" "$(@D)/cuda/include/nvrtc.h" && cp "/usr/local/cuda-9.0/include/sm_20_atomic_functions.h" "$(@D)/cuda/include/sm_20_atomic_functions.h" && cp "/usr/local/cuda-9.0/include/sm_20_atomic_functions.hpp" "$(@D)/cuda/include/sm_20_atomic_functions.hpp" && cp "/usr/local/cuda-9.0/include/sm_20_intrinsics.h" "$(@D)/cuda/include/sm_20_intrinsics.h" && cp "/usr/local/cuda-9.0/include/sm_20_intrinsics.hpp" "$(@D)/cuda/include/sm_20_intrinsics.hpp" && cp "/usr/local/cuda-9.0/include/sm_30_intrinsics.h" "$(@D)/cuda/include/sm_30_intrinsics.h" && cp "/usr/local/cuda-9.0/include/sm_30_intrinsics.hpp" "$(@D)/cuda/include/sm_30_intrinsics.hpp" && cp "/usr/local/cuda-9.0/include/sm_32_atomic_functions.h" "$(@D)/cuda/include/sm_32_atomic_functions.h" && cp "/usr/local/cuda-9.0/include/sm_32_atomic_functions.hpp" "$(@D)/cuda/include/sm_32_atomic_functions.hpp" && cp "/usr/local/cuda-9.0/include/sm_32_intrinsics.h" "$(@D)/cuda/include/sm_32_intrinsics.h" && cp "/usr/local/cuda-9.0/include/sm_32_intrinsics.hpp" "$(@D)/cuda/include/sm_32_intrinsics.hpp" && cp "/usr/local/cuda-9.0/include/sm_35_atomic_functions.h" "$(@D)/cuda/include/sm_35_atomic_functions.h" && cp "/usr/local/cuda-9.0/include/sm_35_intrinsics.h" "$(@D)/cuda/include/sm_35_intrinsics.h" && cp "/usr/local/cuda-9.0/include/sm_60_atomic_functions.h" "$(@D)/cuda/include/sm_60_atomic_functions.h" && cp "/usr/local/cuda-9.0/include/sm_60_atomic_functions.hpp" "$(@D)/cuda/include/sm_60_atomic_functions.hpp" && cp "/usr/local/cuda-9.0/include/sm_61_intrinsics.h" "$(@D)/cuda/include/sm_61_intrinsics.h" && cp "/usr/local/cuda-9.0/include/sm_61_intrinsics.hpp" "$(@D)/cuda/include/sm_61_intrinsics.hpp" && cp "/usr/local/cuda-9.0/include/sobol_direction_vectors.h" "$(@D)/cuda/include/sobol_direction_vectors.h" && cp "/usr/local/cuda-9.0/include/surface_functions.h" "$(@D)/cuda/include/surface_functions.h" && cp "/usr/local/cuda-9.0/include/surface_functions.hpp" "$(@D)/cuda/include/surface_functions.hpp" && cp "/usr/local/cuda-9.0/include/surface_indirect_functions.h" "$(@D)/cuda/include/surface_indirect_functions.h" && cp "/usr/local/cuda-9.0/include/surface_indirect_functions.hpp" "$(@D)/cuda/include/surface_indirect_functions.hpp" && cp "/usr/local/cuda-9.0/include/surface_types.h" "$(@D)/cuda/include/surface_types.h" && cp "/usr/local/cuda-9.0/include/texture_fetch_functions.h" "$(@D)/cuda/include/texture_fetch_functions.h" && cp "/usr/local/cuda-9.0/include/texture_fetch_functions.hpp" "$(@D)/cuda/include/texture_fetch_functions.hpp" && cp "/usr/local/cuda-9.0/include/texture_indirect_functions.h" "$(@D)/cuda/include/texture_indirect_functions.h" && cp "/usr/local/cuda-9.0/include/texture_indirect_functions.hpp" "$(@D)/cuda/include/texture_indirect_functions.hpp" && cp "/usr/local/cuda-9.0/include/texture_types.h" "$(@D)/cuda/include/texture_types.h" && cp "/usr/local/cuda-9.0/include/thrust/adjacent_difference.h" "$(@D)/cuda/include/thrust/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/advance.h" "$(@D)/cuda/include/thrust/advance.h" && cp "/usr/local/cuda-9.0/include/thrust/binary_search.h" "$(@D)/cuda/include/thrust/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/complex.h" "$(@D)/cuda/include/thrust/complex.h" && cp "/usr/local/cuda-9.0/include/thrust/copy.h" "$(@D)/cuda/include/thrust/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/count.h" "$(@D)/cuda/include/thrust/count.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/adjacent_difference.inl" "$(@D)/cuda/include/thrust/detail/adjacent_difference.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/advance.inl" "$(@D)/cuda/include/thrust/detail/advance.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/allocator_traits.h" "$(@D)/cuda/include/thrust/detail/allocator/allocator_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/allocator_traits.inl" "$(@D)/cuda/include/thrust/detail/allocator/allocator_traits.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/copy_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/copy_construct_range.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/copy_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/copy_construct_range.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/default_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/default_construct_range.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/default_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/default_construct_range.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/destroy_range.h" "$(@D)/cuda/include/thrust/detail/allocator/destroy_range.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/destroy_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/destroy_range.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/fill_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/fill_construct_range.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/fill_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/fill_construct_range.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/malloc_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/malloc_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/malloc_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/malloc_allocator.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/no_throw_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/no_throw_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/tagged_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/tagged_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/tagged_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/tagged_allocator.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/temporary_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/temporary_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/temporary_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/temporary_allocator.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/binary_search.inl" "$(@D)/cuda/include/thrust/detail/binary_search.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/arithmetic.h" "$(@D)/cuda/include/thrust/detail/complex/arithmetic.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/c99math.h" "$(@D)/cuda/include/thrust/detail/complex/c99math.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/catrig.h" "$(@D)/cuda/include/thrust/detail/complex/catrig.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/catrigf.h" "$(@D)/cuda/include/thrust/detail/complex/catrigf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/ccosh.h" "$(@D)/cuda/include/thrust/detail/complex/ccosh.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/ccoshf.h" "$(@D)/cuda/include/thrust/detail/complex/ccoshf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/cexp.h" "$(@D)/cuda/include/thrust/detail/complex/cexp.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/cexpf.h" "$(@D)/cuda/include/thrust/detail/complex/cexpf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/clog.h" "$(@D)/cuda/include/thrust/detail/complex/clog.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/clogf.h" "$(@D)/cuda/include/thrust/detail/complex/clogf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/complex.inl" "$(@D)/cuda/include/thrust/detail/complex/complex.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/cpow.h" "$(@D)/cuda/include/thrust/detail/complex/cpow.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/cpowf.h" "$(@D)/cuda/include/thrust/detail/complex/cpowf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/cproj.h" "$(@D)/cuda/include/thrust/detail/complex/cproj.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/csinh.h" "$(@D)/cuda/include/thrust/detail/complex/csinh.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/csinhf.h" "$(@D)/cuda/include/thrust/detail/complex/csinhf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/csqrt.h" "$(@D)/cuda/include/thrust/detail/complex/csqrt.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/csqrtf.h" "$(@D)/cuda/include/thrust/detail/complex/csqrtf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/ctanh.h" "$(@D)/cuda/include/thrust/detail/complex/ctanh.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/ctanhf.h" "$(@D)/cuda/include/thrust/detail/complex/ctanhf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/math_private.h" "$(@D)/cuda/include/thrust/detail/complex/math_private.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/stream.h" "$(@D)/cuda/include/thrust/detail/complex/stream.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config.h" "$(@D)/cuda/include/thrust/detail/config.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/compiler.h" "$(@D)/cuda/include/thrust/detail/config/compiler.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/compiler_fence.h" "$(@D)/cuda/include/thrust/detail/config/compiler_fence.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/config.h" "$(@D)/cuda/include/thrust/detail/config/config.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/debug.h" "$(@D)/cuda/include/thrust/detail/config/debug.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/device_system.h" "$(@D)/cuda/include/thrust/detail/config/device_system.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/exec_check_disable.h" "$(@D)/cuda/include/thrust/detail/config/exec_check_disable.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/forceinline.h" "$(@D)/cuda/include/thrust/detail/config/forceinline.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/global_workarounds.h" "$(@D)/cuda/include/thrust/detail/config/global_workarounds.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/host_device.h" "$(@D)/cuda/include/thrust/detail/config/host_device.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/host_system.h" "$(@D)/cuda/include/thrust/detail/config/host_system.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/simple_defines.h" "$(@D)/cuda/include/thrust/detail/config/simple_defines.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/contiguous_storage.h" "$(@D)/cuda/include/thrust/detail/contiguous_storage.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/contiguous_storage.inl" "$(@D)/cuda/include/thrust/detail/contiguous_storage.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/copy.h" "$(@D)/cuda/include/thrust/detail/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/copy.inl" "$(@D)/cuda/include/thrust/detail/copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/copy_if.h" "$(@D)/cuda/include/thrust/detail/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/copy_if.inl" "$(@D)/cuda/include/thrust/detail/copy_if.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/count.inl" "$(@D)/cuda/include/thrust/detail/count.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/cstdint.h" "$(@D)/cuda/include/thrust/detail/cstdint.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_delete.inl" "$(@D)/cuda/include/thrust/detail/device_delete.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_free.inl" "$(@D)/cuda/include/thrust/detail/device_free.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_malloc.inl" "$(@D)/cuda/include/thrust/detail/device_malloc.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_new.inl" "$(@D)/cuda/include/thrust/detail/device_new.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_ptr.inl" "$(@D)/cuda/include/thrust/detail/device_ptr.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_reference.inl" "$(@D)/cuda/include/thrust/detail/device_reference.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_vector.inl" "$(@D)/cuda/include/thrust/detail/device_vector.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/dispatch/is_trivial_copy.h" "$(@D)/cuda/include/thrust/detail/dispatch/is_trivial_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/distance.inl" "$(@D)/cuda/include/thrust/detail/distance.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/equal.inl" "$(@D)/cuda/include/thrust/detail/equal.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/execute_with_allocator.h" "$(@D)/cuda/include/thrust/detail/execute_with_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/execution_policy.h" "$(@D)/cuda/include/thrust/detail/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/extrema.inl" "$(@D)/cuda/include/thrust/detail/extrema.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/fill.inl" "$(@D)/cuda/include/thrust/detail/fill.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/find.inl" "$(@D)/cuda/include/thrust/detail/find.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/for_each.inl" "$(@D)/cuda/include/thrust/detail/for_each.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/function.h" "$(@D)/cuda/include/thrust/detail/function.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional.inl" "$(@D)/cuda/include/thrust/detail/functional.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/actor.h" "$(@D)/cuda/include/thrust/detail/functional/actor.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/actor.inl" "$(@D)/cuda/include/thrust/detail/functional/actor.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/argument.h" "$(@D)/cuda/include/thrust/detail/functional/argument.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/composite.h" "$(@D)/cuda/include/thrust/detail/functional/composite.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/arithmetic_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/arithmetic_operators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/assignment_operator.h" "$(@D)/cuda/include/thrust/detail/functional/operators/assignment_operator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/bitwise_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/bitwise_operators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/compound_assignment_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/compound_assignment_operators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/logical_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/logical_operators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/operator_adaptors.h" "$(@D)/cuda/include/thrust/detail/functional/operators/operator_adaptors.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/relational_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/relational_operators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/placeholder.h" "$(@D)/cuda/include/thrust/detail/functional/placeholder.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/value.h" "$(@D)/cuda/include/thrust/detail/functional/value.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/gather.inl" "$(@D)/cuda/include/thrust/detail/gather.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/generate.inl" "$(@D)/cuda/include/thrust/detail/generate.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/get_iterator_value.h" "$(@D)/cuda/include/thrust/detail/get_iterator_value.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/host_vector.inl" "$(@D)/cuda/include/thrust/detail/host_vector.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/inner_product.inl" "$(@D)/cuda/include/thrust/detail/inner_product.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/integer_math.h" "$(@D)/cuda/include/thrust/detail/integer_math.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/integer_traits.h" "$(@D)/cuda/include/thrust/detail/integer_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/internal_functional.h" "$(@D)/cuda/include/thrust/detail/internal_functional.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/logical.inl" "$(@D)/cuda/include/thrust/detail/logical.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/detail/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/merge.inl" "$(@D)/cuda/include/thrust/detail/merge.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/minmax.h" "$(@D)/cuda/include/thrust/detail/minmax.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/mismatch.inl" "$(@D)/cuda/include/thrust/detail/mismatch.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/mpl/math.h" "$(@D)/cuda/include/thrust/detail/mpl/math.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/numeric_traits.h" "$(@D)/cuda/include/thrust/detail/numeric_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/overlapped_copy.h" "$(@D)/cuda/include/thrust/detail/overlapped_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/pair.inl" "$(@D)/cuda/include/thrust/detail/pair.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/partition.inl" "$(@D)/cuda/include/thrust/detail/partition.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/pointer.h" "$(@D)/cuda/include/thrust/detail/pointer.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/pointer.inl" "$(@D)/cuda/include/thrust/detail/pointer.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/range/head_flags.h" "$(@D)/cuda/include/thrust/detail/range/head_flags.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/range/tail_flags.h" "$(@D)/cuda/include/thrust/detail/range/tail_flags.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/raw_pointer_cast.h" "$(@D)/cuda/include/thrust/detail/raw_pointer_cast.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/raw_reference_cast.h" "$(@D)/cuda/include/thrust/detail/raw_reference_cast.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/reduce.inl" "$(@D)/cuda/include/thrust/detail/reduce.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/reference.h" "$(@D)/cuda/include/thrust/detail/reference.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/reference.inl" "$(@D)/cuda/include/thrust/detail/reference.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/reference_forward_declaration.h" "$(@D)/cuda/include/thrust/detail/reference_forward_declaration.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/remove.inl" "$(@D)/cuda/include/thrust/detail/remove.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/replace.inl" "$(@D)/cuda/include/thrust/detail/replace.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/reverse.inl" "$(@D)/cuda/include/thrust/detail/reverse.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/scan.inl" "$(@D)/cuda/include/thrust/detail/scan.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/scatter.inl" "$(@D)/cuda/include/thrust/detail/scatter.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/seq.h" "$(@D)/cuda/include/thrust/detail/seq.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/sequence.inl" "$(@D)/cuda/include/thrust/detail/sequence.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/set_operations.inl" "$(@D)/cuda/include/thrust/detail/set_operations.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/sort.inl" "$(@D)/cuda/include/thrust/detail/sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/static_assert.h" "$(@D)/cuda/include/thrust/detail/static_assert.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/static_map.h" "$(@D)/cuda/include/thrust/detail/static_map.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/swap.h" "$(@D)/cuda/include/thrust/detail/swap.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/swap.inl" "$(@D)/cuda/include/thrust/detail/swap.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/swap_ranges.inl" "$(@D)/cuda/include/thrust/detail/swap_ranges.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/tabulate.inl" "$(@D)/cuda/include/thrust/detail/tabulate.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/temporary_array.h" "$(@D)/cuda/include/thrust/detail/temporary_array.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/temporary_array.inl" "$(@D)/cuda/include/thrust/detail/temporary_array.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/detail/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/transform.inl" "$(@D)/cuda/include/thrust/detail/transform.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/transform_reduce.inl" "$(@D)/cuda/include/thrust/detail/transform_reduce.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/transform_scan.inl" "$(@D)/cuda/include/thrust/detail/transform_scan.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/trivial_sequence.h" "$(@D)/cuda/include/thrust/detail/trivial_sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/tuple.inl" "$(@D)/cuda/include/thrust/detail/tuple.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/tuple_meta_transform.h" "$(@D)/cuda/include/thrust/detail/tuple_meta_transform.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/tuple_transform.h" "$(@D)/cuda/include/thrust/detail/tuple_transform.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h" "$(@D)/cuda/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/function_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits/function_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/has_member_function.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_member_function.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/has_nested_type.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_nested_type.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/has_trivial_assign.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_trivial_assign.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/is_call_possible.h" "$(@D)/cuda/include/thrust/detail/type_traits/is_call_possible.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/is_metafunction_defined.h" "$(@D)/cuda/include/thrust/detail/type_traits/is_metafunction_defined.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/iterator/is_discard_iterator.h" "$(@D)/cuda/include/thrust/detail/type_traits/iterator/is_discard_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/iterator/is_output_iterator.h" "$(@D)/cuda/include/thrust/detail/type_traits/iterator/is_output_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/minimum_type.h" "$(@D)/cuda/include/thrust/detail/type_traits/minimum_type.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/pointer_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits/pointer_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/result_of_adaptable_function.h" "$(@D)/cuda/include/thrust/detail/type_traits/result_of_adaptable_function.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/uninitialized_copy.inl" "$(@D)/cuda/include/thrust/detail/uninitialized_copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/uninitialized_fill.inl" "$(@D)/cuda/include/thrust/detail/uninitialized_fill.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/unique.inl" "$(@D)/cuda/include/thrust/detail/unique.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/use_default.h" "$(@D)/cuda/include/thrust/detail/use_default.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/util/align.h" "$(@D)/cuda/include/thrust/detail/util/align.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/util/blocking.h" "$(@D)/cuda/include/thrust/detail/util/blocking.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/vector_base.h" "$(@D)/cuda/include/thrust/detail/vector_base.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/vector_base.inl" "$(@D)/cuda/include/thrust/detail/vector_base.inl" && cp "/usr/local/cuda-9.0/include/thrust/device_allocator.h" "$(@D)/cuda/include/thrust/device_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/device_delete.h" "$(@D)/cuda/include/thrust/device_delete.h" && cp "/usr/local/cuda-9.0/include/thrust/device_free.h" "$(@D)/cuda/include/thrust/device_free.h" && cp "/usr/local/cuda-9.0/include/thrust/device_malloc.h" "$(@D)/cuda/include/thrust/device_malloc.h" && cp "/usr/local/cuda-9.0/include/thrust/device_malloc_allocator.h" "$(@D)/cuda/include/thrust/device_malloc_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/device_new.h" "$(@D)/cuda/include/thrust/device_new.h" && cp "/usr/local/cuda-9.0/include/thrust/device_new_allocator.h" "$(@D)/cuda/include/thrust/device_new_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/device_ptr.h" "$(@D)/cuda/include/thrust/device_ptr.h" && cp "/usr/local/cuda-9.0/include/thrust/device_reference.h" "$(@D)/cuda/include/thrust/device_reference.h" && cp "/usr/local/cuda-9.0/include/thrust/device_vector.h" "$(@D)/cuda/include/thrust/device_vector.h" && cp "/usr/local/cuda-9.0/include/thrust/distance.h" "$(@D)/cuda/include/thrust/distance.h" && cp "/usr/local/cuda-9.0/include/thrust/equal.h" "$(@D)/cuda/include/thrust/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/execution_policy.h" "$(@D)/cuda/include/thrust/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/extrema.h" "$(@D)/cuda/include/thrust/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/fill.h" "$(@D)/cuda/include/thrust/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/find.h" "$(@D)/cuda/include/thrust/find.h" && cp "/usr/local/cuda-9.0/include/thrust/for_each.h" "$(@D)/cuda/include/thrust/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/functional.h" "$(@D)/cuda/include/thrust/functional.h" && cp "/usr/local/cuda-9.0/include/thrust/gather.h" "$(@D)/cuda/include/thrust/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/generate.h" "$(@D)/cuda/include/thrust/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/host_vector.h" "$(@D)/cuda/include/thrust/host_vector.h" && cp "/usr/local/cuda-9.0/include/thrust/inner_product.h" "$(@D)/cuda/include/thrust/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/constant_iterator.h" "$(@D)/cuda/include/thrust/iterator/constant_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/counting_iterator.h" "$(@D)/cuda/include/thrust/iterator/counting_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/any_assign.h" "$(@D)/cuda/include/thrust/iterator/detail/any_assign.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/any_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/any_system_tag.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/constant_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/constant_iterator_base.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/counting_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/counting_iterator.inl" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/device_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/device_system_tag.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/discard_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/discard_iterator_base.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/distance_from_result.h" "$(@D)/cuda/include/thrust/iterator/detail/distance_from_result.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/host_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/host_system_tag.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/is_iterator_category.h" "$(@D)/cuda/include/thrust/iterator/detail/is_iterator_category.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/is_trivial_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/is_trivial_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_adaptor_base.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_adaptor_base.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_category_to_system.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_to_system.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_category_to_traversal.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_to_traversal.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_facade_category.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_facade_category.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_traits.inl" "$(@D)/cuda/include/thrust/iterator/detail/iterator_traits.inl" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_traversal_tags.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_traversal_tags.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/join_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/join_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/minimum_category.h" "$(@D)/cuda/include/thrust/iterator/detail/minimum_category.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/minimum_system.h" "$(@D)/cuda/include/thrust/iterator/detail/minimum_system.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/normal_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/normal_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/permutation_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/permutation_iterator_base.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/retag.h" "$(@D)/cuda/include/thrust/iterator/detail/retag.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/reverse_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/reverse_iterator.inl" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/reverse_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/reverse_iterator_base.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/tagged_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/tagged_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/transform_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/transform_iterator.inl" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/transform_output_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/transform_output_iterator.inl" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/tuple_of_iterator_references.h" "$(@D)/cuda/include/thrust/iterator/detail/tuple_of_iterator_references.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/universal_categories.h" "$(@D)/cuda/include/thrust/iterator/detail/universal_categories.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/zip_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/zip_iterator.inl" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/zip_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/zip_iterator_base.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/discard_iterator.h" "$(@D)/cuda/include/thrust/iterator/discard_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/iterator_adaptor.h" "$(@D)/cuda/include/thrust/iterator/iterator_adaptor.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/iterator_categories.h" "$(@D)/cuda/include/thrust/iterator/iterator_categories.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/iterator_facade.h" "$(@D)/cuda/include/thrust/iterator/iterator_facade.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/iterator_traits.h" "$(@D)/cuda/include/thrust/iterator/iterator_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/permutation_iterator.h" "$(@D)/cuda/include/thrust/iterator/permutation_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/retag.h" "$(@D)/cuda/include/thrust/iterator/retag.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/reverse_iterator.h" "$(@D)/cuda/include/thrust/iterator/reverse_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/transform_iterator.h" "$(@D)/cuda/include/thrust/iterator/transform_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/transform_output_iterator.h" "$(@D)/cuda/include/thrust/iterator/transform_output_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/zip_iterator.h" "$(@D)/cuda/include/thrust/iterator/zip_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/logical.h" "$(@D)/cuda/include/thrust/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/memory.h" "$(@D)/cuda/include/thrust/memory.h" && cp "/usr/local/cuda-9.0/include/thrust/merge.h" "$(@D)/cuda/include/thrust/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/mismatch.h" "$(@D)/cuda/include/thrust/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/pair.h" "$(@D)/cuda/include/thrust/pair.h" && cp "/usr/local/cuda-9.0/include/thrust/partition.h" "$(@D)/cuda/include/thrust/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/random.h" "$(@D)/cuda/include/thrust/random.h" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/discard_block_engine.inl" "$(@D)/cuda/include/thrust/random/detail/discard_block_engine.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/linear_congruential_engine.inl" "$(@D)/cuda/include/thrust/random/detail/linear_congruential_engine.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/linear_congruential_engine_discard.h" "$(@D)/cuda/include/thrust/random/detail/linear_congruential_engine_discard.h" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/linear_feedback_shift_engine.inl" "$(@D)/cuda/include/thrust/random/detail/linear_feedback_shift_engine.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h" "$(@D)/cuda/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/mod.h" "$(@D)/cuda/include/thrust/random/detail/mod.h" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/normal_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/normal_distribution.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/normal_distribution_base.h" "$(@D)/cuda/include/thrust/random/detail/normal_distribution_base.h" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/random_core_access.h" "$(@D)/cuda/include/thrust/random/detail/random_core_access.h" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/subtract_with_carry_engine.inl" "$(@D)/cuda/include/thrust/random/detail/subtract_with_carry_engine.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/uniform_int_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/uniform_int_distribution.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/uniform_real_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/uniform_real_distribution.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/xor_combine_engine.inl" "$(@D)/cuda/include/thrust/random/detail/xor_combine_engine.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/xor_combine_engine_max.h" "$(@D)/cuda/include/thrust/random/detail/xor_combine_engine_max.h" && cp "/usr/local/cuda-9.0/include/thrust/random/discard_block_engine.h" "$(@D)/cuda/include/thrust/random/discard_block_engine.h" && cp "/usr/local/cuda-9.0/include/thrust/random/linear_congruential_engine.h" "$(@D)/cuda/include/thrust/random/linear_congruential_engine.h" && cp "/usr/local/cuda-9.0/include/thrust/random/linear_feedback_shift_engine.h" "$(@D)/cuda/include/thrust/random/linear_feedback_shift_engine.h" && cp "/usr/local/cuda-9.0/include/thrust/random/normal_distribution.h" "$(@D)/cuda/include/thrust/random/normal_distribution.h" && cp "/usr/local/cuda-9.0/include/thrust/random/subtract_with_carry_engine.h" "$(@D)/cuda/include/thrust/random/subtract_with_carry_engine.h" && cp "/usr/local/cuda-9.0/include/thrust/random/uniform_int_distribution.h" "$(@D)/cuda/include/thrust/random/uniform_int_distribution.h" && cp "/usr/local/cuda-9.0/include/thrust/random/uniform_real_distribution.h" "$(@D)/cuda/include/thrust/random/uniform_real_distribution.h" && cp "/usr/local/cuda-9.0/include/thrust/random/xor_combine_engine.h" "$(@D)/cuda/include/thrust/random/xor_combine_engine.h" && cp "/usr/local/cuda-9.0/include/thrust/reduce.h" "$(@D)/cuda/include/thrust/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/remove.h" "$(@D)/cuda/include/thrust/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/replace.h" "$(@D)/cuda/include/thrust/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/reverse.h" "$(@D)/cuda/include/thrust/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/scan.h" "$(@D)/cuda/include/thrust/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/scatter.h" "$(@D)/cuda/include/thrust/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/sequence.h" "$(@D)/cuda/include/thrust/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/set_operations.h" "$(@D)/cuda/include/thrust/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/sort.h" "$(@D)/cuda/include/thrust/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/swap.h" "$(@D)/cuda/include/thrust/swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/cpp/detail/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/cpp/detail/assign_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/cpp/detail/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/copy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/cpp/detail/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/count.h" "$(@D)/cuda/include/thrust/system/cpp/detail/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/equal.h" "$(@D)/cuda/include/thrust/system/cpp/detail/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/extrema.h" "$(@D)/cuda/include/thrust/system/cpp/detail/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/fill.h" "$(@D)/cuda/include/thrust/system/cpp/detail/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/find.h" "$(@D)/cuda/include/thrust/system/cpp/detail/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/for_each.h" "$(@D)/cuda/include/thrust/system/cpp/detail/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/gather.h" "$(@D)/cuda/include/thrust/system/cpp/detail/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/generate.h" "$(@D)/cuda/include/thrust/system/cpp/detail/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/get_value.h" "$(@D)/cuda/include/thrust/system/cpp/detail/get_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/cpp/detail/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/cpp/detail/iter_swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/logical.h" "$(@D)/cuda/include/thrust/system/cpp/detail/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/cpp/detail/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/memory.inl" "$(@D)/cuda/include/thrust/system/cpp/detail/memory.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/merge.h" "$(@D)/cuda/include/thrust/system/cpp/detail/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/cpp/detail/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/par.h" "$(@D)/cuda/include/thrust/system/cpp/detail/par.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/partition.h" "$(@D)/cuda/include/thrust/system/cpp/detail/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/reduce.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/remove.h" "$(@D)/cuda/include/thrust/system/cpp/detail/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/replace.h" "$(@D)/cuda/include/thrust/system/cpp/detail/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/reverse.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/scan.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/scatter.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/sequence.h" "$(@D)/cuda/include/thrust/system/cpp/detail/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/cpp/detail/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/sort.h" "$(@D)/cuda/include/thrust/system/cpp/detail/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/cpp/detail/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/cpp/detail/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/cpp/detail/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/transform.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/cpp/detail/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/unique.h" "$(@D)/cuda/include/thrust/system/cpp/detail/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/vector.inl" "$(@D)/cuda/include/thrust/system/cpp/detail/vector.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/execution_policy.h" "$(@D)/cuda/include/thrust/system/cpp/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/memory.h" "$(@D)/cuda/include/thrust/system/cpp/memory.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/vector.h" "$(@D)/cuda/include/thrust/system/cpp/vector.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/config.h" "$(@D)/cuda/include/thrust/system/cuda/config.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/cuda/detail/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/cuda/detail/assign_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/cuda/detail/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/copy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/cuda/detail/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/agent_launcher.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/agent_launcher.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/alignment.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/alignment.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/triple_chevron_launch.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/triple_chevron_launch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/util.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/util.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/count.h" "$(@D)/cuda/include/thrust/system/cuda/detail/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cross_system.h" "$(@D)/cuda/include/thrust/system/cuda/detail/cross_system.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_csrt.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_csrt.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_row_based.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_row_based.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_load.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_load.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_scan.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_store.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_store.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/cub.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/cub.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_partition.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_partition.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_scan.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_select.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_select.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_csrt.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_csrt.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_row_based.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_row_based.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/host/mutex.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/host/mutex.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_allocator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_allocator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_arch.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_arch.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_debug.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_debug.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_device.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_device.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_macro.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_macro.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_namespace.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_namespace.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_ptx.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_ptx.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_type.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_type.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/equal.h" "$(@D)/cuda/include/thrust/system/cuda/detail/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/error.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/error.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/extrema.h" "$(@D)/cuda/include/thrust/system/cuda/detail/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/fill.h" "$(@D)/cuda/include/thrust/system/cuda/detail/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/find.h" "$(@D)/cuda/include/thrust/system/cuda/detail/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/for_each.h" "$(@D)/cuda/include/thrust/system/cuda/detail/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/gather.h" "$(@D)/cuda/include/thrust/system/cuda/detail/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/generate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/get_value.h" "$(@D)/cuda/include/thrust/system/cuda/detail/get_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h" "$(@D)/cuda/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/guarded_driver_types.h" "$(@D)/cuda/include/thrust/system/cuda/detail/guarded_driver_types.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/cuda/detail/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/internal/copy_cross_system.h" "$(@D)/cuda/include/thrust/system/cuda/detail/internal/copy_cross_system.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/internal/copy_device_to_device.h" "$(@D)/cuda/include/thrust/system/cuda/detail/internal/copy_device_to_device.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/cuda/detail/iter_swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/logical.h" "$(@D)/cuda/include/thrust/system/cuda/detail/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/cuda/detail/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/memory.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/memory.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/memory_buffer.h" "$(@D)/cuda/include/thrust/system/cuda/detail/memory_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/merge.h" "$(@D)/cuda/include/thrust/system/cuda/detail/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/cuda/detail/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/par.h" "$(@D)/cuda/include/thrust/system/cuda/detail/par.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/par_to_seq.h" "$(@D)/cuda/include/thrust/system/cuda/detail/par_to_seq.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/parallel_for.h" "$(@D)/cuda/include/thrust/system/cuda/detail/parallel_for.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/partition.h" "$(@D)/cuda/include/thrust/system/cuda/detail/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/reduce.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/remove.h" "$(@D)/cuda/include/thrust/system/cuda/detail/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/replace.h" "$(@D)/cuda/include/thrust/system/cuda/detail/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/reverse.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/scan.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/scatter.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/sequence.h" "$(@D)/cuda/include/thrust/system/cuda/detail/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/cuda/detail/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/sort.h" "$(@D)/cuda/include/thrust/system/cuda/detail/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/cuda/detail/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/cuda/detail/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/terminate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/terminate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/transform.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/cuda/detail/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/unique.h" "$(@D)/cuda/include/thrust/system/cuda/detail/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/util.h" "$(@D)/cuda/include/thrust/system/cuda/detail/util.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/vector.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/vector.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/error.h" "$(@D)/cuda/include/thrust/system/cuda/error.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/execution_policy.h" "$(@D)/cuda/include/thrust/system/cuda/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/experimental/pinned_allocator.h" "$(@D)/cuda/include/thrust/system/cuda/experimental/pinned_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/memory.h" "$(@D)/cuda/include/thrust/system/cuda/memory.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/vector.h" "$(@D)/cuda/include/thrust/system/cuda/vector.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/adl/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/assign_value.h" "$(@D)/cuda/include/thrust/system/detail/adl/assign_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/adl/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/copy.h" "$(@D)/cuda/include/thrust/system/detail/adl/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/adl/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/count.h" "$(@D)/cuda/include/thrust/system/detail/adl/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/equal.h" "$(@D)/cuda/include/thrust/system/detail/adl/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/extrema.h" "$(@D)/cuda/include/thrust/system/detail/adl/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/fill.h" "$(@D)/cuda/include/thrust/system/detail/adl/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/find.h" "$(@D)/cuda/include/thrust/system/detail/adl/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/for_each.h" "$(@D)/cuda/include/thrust/system/detail/adl/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/gather.h" "$(@D)/cuda/include/thrust/system/detail/adl/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/generate.h" "$(@D)/cuda/include/thrust/system/detail/adl/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/get_value.h" "$(@D)/cuda/include/thrust/system/detail/adl/get_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/adl/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/iter_swap.h" "$(@D)/cuda/include/thrust/system/detail/adl/iter_swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/logical.h" "$(@D)/cuda/include/thrust/system/detail/adl/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/detail/adl/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/merge.h" "$(@D)/cuda/include/thrust/system/detail/adl/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/adl/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/partition.h" "$(@D)/cuda/include/thrust/system/detail/adl/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/reduce.h" "$(@D)/cuda/include/thrust/system/detail/adl/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/remove.h" "$(@D)/cuda/include/thrust/system/detail/adl/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/replace.h" "$(@D)/cuda/include/thrust/system/detail/adl/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/reverse.h" "$(@D)/cuda/include/thrust/system/detail/adl/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/scan.h" "$(@D)/cuda/include/thrust/system/detail/adl/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/scatter.h" "$(@D)/cuda/include/thrust/system/detail/adl/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/sequence.h" "$(@D)/cuda/include/thrust/system/detail/adl/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/adl/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/sort.h" "$(@D)/cuda/include/thrust/system/detail/adl/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/adl/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/adl/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/adl/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/transform.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/adl/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/adl/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/unique.h" "$(@D)/cuda/include/thrust/system/detail/adl/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/bad_alloc.h" "$(@D)/cuda/include/thrust/system/detail/bad_alloc.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/errno.h" "$(@D)/cuda/include/thrust/system/detail/errno.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/error_category.inl" "$(@D)/cuda/include/thrust/system/detail/error_category.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/error_code.inl" "$(@D)/cuda/include/thrust/system/detail/error_code.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/error_condition.inl" "$(@D)/cuda/include/thrust/system/detail/error_condition.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/generic/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/adjacent_difference.inl" "$(@D)/cuda/include/thrust/system/detail/generic/adjacent_difference.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/advance.h" "$(@D)/cuda/include/thrust/system/detail/generic/advance.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/advance.inl" "$(@D)/cuda/include/thrust/system/detail/generic/advance.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/generic/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/binary_search.inl" "$(@D)/cuda/include/thrust/system/detail/generic/binary_search.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy.h" "$(@D)/cuda/include/thrust/system/detail/generic/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy.inl" "$(@D)/cuda/include/thrust/system/detail/generic/copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/generic/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy_if.inl" "$(@D)/cuda/include/thrust/system/detail/generic/copy_if.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/count.h" "$(@D)/cuda/include/thrust/system/detail/generic/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/count.inl" "$(@D)/cuda/include/thrust/system/detail/generic/count.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/distance.h" "$(@D)/cuda/include/thrust/system/detail/generic/distance.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/distance.inl" "$(@D)/cuda/include/thrust/system/detail/generic/distance.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/equal.h" "$(@D)/cuda/include/thrust/system/detail/generic/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/equal.inl" "$(@D)/cuda/include/thrust/system/detail/generic/equal.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/extrema.h" "$(@D)/cuda/include/thrust/system/detail/generic/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/extrema.inl" "$(@D)/cuda/include/thrust/system/detail/generic/extrema.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/fill.h" "$(@D)/cuda/include/thrust/system/detail/generic/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/find.h" "$(@D)/cuda/include/thrust/system/detail/generic/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/find.inl" "$(@D)/cuda/include/thrust/system/detail/generic/find.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/for_each.h" "$(@D)/cuda/include/thrust/system/detail/generic/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/gather.h" "$(@D)/cuda/include/thrust/system/detail/generic/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/gather.inl" "$(@D)/cuda/include/thrust/system/detail/generic/gather.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/generate.h" "$(@D)/cuda/include/thrust/system/detail/generic/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/generate.inl" "$(@D)/cuda/include/thrust/system/detail/generic/generate.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/generic/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/inner_product.inl" "$(@D)/cuda/include/thrust/system/detail/generic/inner_product.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/logical.h" "$(@D)/cuda/include/thrust/system/detail/generic/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/memory.h" "$(@D)/cuda/include/thrust/system/detail/generic/memory.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/memory.inl" "$(@D)/cuda/include/thrust/system/detail/generic/memory.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/merge.h" "$(@D)/cuda/include/thrust/system/detail/generic/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/merge.inl" "$(@D)/cuda/include/thrust/system/detail/generic/merge.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/generic/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/mismatch.inl" "$(@D)/cuda/include/thrust/system/detail/generic/mismatch.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/partition.h" "$(@D)/cuda/include/thrust/system/detail/generic/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/partition.inl" "$(@D)/cuda/include/thrust/system/detail/generic/partition.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce.h" "$(@D)/cuda/include/thrust/system/detail/generic/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reduce.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reduce_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/remove.h" "$(@D)/cuda/include/thrust/system/detail/generic/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/remove.inl" "$(@D)/cuda/include/thrust/system/detail/generic/remove.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/replace.h" "$(@D)/cuda/include/thrust/system/detail/generic/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/replace.inl" "$(@D)/cuda/include/thrust/system/detail/generic/replace.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reverse.h" "$(@D)/cuda/include/thrust/system/detail/generic/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reverse.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reverse.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scalar/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/generic/scalar/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scalar/binary_search.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scalar/binary_search.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan.h" "$(@D)/cuda/include/thrust/system/detail/generic/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scan.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scan_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scatter.h" "$(@D)/cuda/include/thrust/system/detail/generic/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scatter.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scatter.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/select_system.h" "$(@D)/cuda/include/thrust/system/detail/generic/select_system.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sequence.h" "$(@D)/cuda/include/thrust/system/detail/generic/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sequence.inl" "$(@D)/cuda/include/thrust/system/detail/generic/sequence.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/generic/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/set_operations.inl" "$(@D)/cuda/include/thrust/system/detail/generic/set_operations.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sort.h" "$(@D)/cuda/include/thrust/system/detail/generic/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sort.inl" "$(@D)/cuda/include/thrust/system/detail/generic/sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/generic/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/swap_ranges.inl" "$(@D)/cuda/include/thrust/system/detail/generic/swap_ranges.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/generic/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/tabulate.inl" "$(@D)/cuda/include/thrust/system/detail/generic/tabulate.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/tag.h" "$(@D)/cuda/include/thrust/system/detail/generic/tag.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/generic/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/temporary_buffer.inl" "$(@D)/cuda/include/thrust/system/detail/generic/temporary_buffer.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_reduce.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform_reduce.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_scan.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform_scan.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/type_traits.h" "$(@D)/cuda/include/thrust/system/detail/generic/type_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_copy.inl" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_fill.inl" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_fill.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique.h" "$(@D)/cuda/include/thrust/system/detail/generic/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique.inl" "$(@D)/cuda/include/thrust/system/detail/generic/unique.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/unique_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/internal/decompose.h" "$(@D)/cuda/include/thrust/system/detail/internal/decompose.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/sequential/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/assign_value.h" "$(@D)/cuda/include/thrust/system/detail/sequential/assign_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/sequential/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy_backward.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy_backward.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/count.h" "$(@D)/cuda/include/thrust/system/detail/sequential/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/equal.h" "$(@D)/cuda/include/thrust/system/detail/sequential/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/execution_policy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/extrema.h" "$(@D)/cuda/include/thrust/system/detail/sequential/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/fill.h" "$(@D)/cuda/include/thrust/system/detail/sequential/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/find.h" "$(@D)/cuda/include/thrust/system/detail/sequential/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/for_each.h" "$(@D)/cuda/include/thrust/system/detail/sequential/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/gather.h" "$(@D)/cuda/include/thrust/system/detail/sequential/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/general_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/general_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/generate.h" "$(@D)/cuda/include/thrust/system/detail/sequential/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/get_value.h" "$(@D)/cuda/include/thrust/system/detail/sequential/get_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/sequential/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/insertion_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/insertion_sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/iter_swap.h" "$(@D)/cuda/include/thrust/system/detail/sequential/iter_swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/logical.h" "$(@D)/cuda/include/thrust/system/detail/sequential/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/detail/sequential/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/merge.h" "$(@D)/cuda/include/thrust/system/detail/sequential/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/merge.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/merge.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/sequential/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/partition.h" "$(@D)/cuda/include/thrust/system/detail/sequential/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/reduce.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/remove.h" "$(@D)/cuda/include/thrust/system/detail/sequential/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/replace.h" "$(@D)/cuda/include/thrust/system/detail/sequential/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/reverse.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/scan.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/scatter.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/sequence.h" "$(@D)/cuda/include/thrust/system/detail/sequential/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/sequential/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_merge_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_merge_sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_merge_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_merge_sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_primitive_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_primitive_sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_primitive_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_primitive_sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_radix_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_radix_sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_radix_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_radix_sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/sequential/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/sequential/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/sequential/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/transform.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/trivial_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/trivial_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/sequential/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/unique.h" "$(@D)/cuda/include/thrust/system/detail/sequential/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/system_error.inl" "$(@D)/cuda/include/thrust/system/detail/system_error.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/error_code.h" "$(@D)/cuda/include/thrust/system/error_code.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/omp/detail/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/omp/detail/assign_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/omp/detail/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy.h" "$(@D)/cuda/include/thrust/system/omp/detail/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy.inl" "$(@D)/cuda/include/thrust/system/omp/detail/copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/omp/detail/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy_if.inl" "$(@D)/cuda/include/thrust/system/omp/detail/copy_if.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/count.h" "$(@D)/cuda/include/thrust/system/omp/detail/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/default_decomposition.h" "$(@D)/cuda/include/thrust/system/omp/detail/default_decomposition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/default_decomposition.inl" "$(@D)/cuda/include/thrust/system/omp/detail/default_decomposition.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/equal.h" "$(@D)/cuda/include/thrust/system/omp/detail/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/omp/detail/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/extrema.h" "$(@D)/cuda/include/thrust/system/omp/detail/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/fill.h" "$(@D)/cuda/include/thrust/system/omp/detail/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/find.h" "$(@D)/cuda/include/thrust/system/omp/detail/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/for_each.h" "$(@D)/cuda/include/thrust/system/omp/detail/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/for_each.inl" "$(@D)/cuda/include/thrust/system/omp/detail/for_each.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/gather.h" "$(@D)/cuda/include/thrust/system/omp/detail/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/generate.h" "$(@D)/cuda/include/thrust/system/omp/detail/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/get_value.h" "$(@D)/cuda/include/thrust/system/omp/detail/get_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/omp/detail/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/omp/detail/iter_swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/logical.h" "$(@D)/cuda/include/thrust/system/omp/detail/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/omp/detail/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/memory.inl" "$(@D)/cuda/include/thrust/system/omp/detail/memory.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/merge.h" "$(@D)/cuda/include/thrust/system/omp/detail/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/omp/detail/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/par.h" "$(@D)/cuda/include/thrust/system/omp/detail/par.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/partition.h" "$(@D)/cuda/include/thrust/system/omp/detail/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/partition.inl" "$(@D)/cuda/include/thrust/system/omp/detail/partition.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_intervals.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_intervals.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_intervals.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_intervals.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/remove.h" "$(@D)/cuda/include/thrust/system/omp/detail/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/remove.inl" "$(@D)/cuda/include/thrust/system/omp/detail/remove.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/replace.h" "$(@D)/cuda/include/thrust/system/omp/detail/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reverse.h" "$(@D)/cuda/include/thrust/system/omp/detail/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/scan.h" "$(@D)/cuda/include/thrust/system/omp/detail/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/scatter.h" "$(@D)/cuda/include/thrust/system/omp/detail/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/sequence.h" "$(@D)/cuda/include/thrust/system/omp/detail/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/omp/detail/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/sort.h" "$(@D)/cuda/include/thrust/system/omp/detail/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/sort.inl" "$(@D)/cuda/include/thrust/system/omp/detail/sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/omp/detail/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/omp/detail/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/omp/detail/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/transform.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/omp/detail/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/omp/detail/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique.h" "$(@D)/cuda/include/thrust/system/omp/detail/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique.inl" "$(@D)/cuda/include/thrust/system/omp/detail/unique.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/omp/detail/unique_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/vector.inl" "$(@D)/cuda/include/thrust/system/omp/detail/vector.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/execution_policy.h" "$(@D)/cuda/include/thrust/system/omp/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/memory.h" "$(@D)/cuda/include/thrust/system/omp/memory.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/vector.h" "$(@D)/cuda/include/thrust/system/omp/vector.h" && cp "/usr/local/cuda-9.0/include/thrust/system/system_error.h" "$(@D)/cuda/include/thrust/system/system_error.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/tbb/detail/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/tbb/detail/assign_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/tbb/detail/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/tbb/detail/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy_if.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/copy_if.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/count.h" "$(@D)/cuda/include/thrust/system/tbb/detail/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/equal.h" "$(@D)/cuda/include/thrust/system/tbb/detail/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/extrema.h" "$(@D)/cuda/include/thrust/system/tbb/detail/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/fill.h" "$(@D)/cuda/include/thrust/system/tbb/detail/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/find.h" "$(@D)/cuda/include/thrust/system/tbb/detail/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/for_each.h" "$(@D)/cuda/include/thrust/system/tbb/detail/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/for_each.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/for_each.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/gather.h" "$(@D)/cuda/include/thrust/system/tbb/detail/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/generate.h" "$(@D)/cuda/include/thrust/system/tbb/detail/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/get_value.h" "$(@D)/cuda/include/thrust/system/tbb/detail/get_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/tbb/detail/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/tbb/detail/iter_swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/logical.h" "$(@D)/cuda/include/thrust/system/tbb/detail/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/tbb/detail/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/memory.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/memory.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/merge.h" "$(@D)/cuda/include/thrust/system/tbb/detail/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/merge.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/merge.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/tbb/detail/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/par.h" "$(@D)/cuda/include/thrust/system/tbb/detail/par.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/partition.h" "$(@D)/cuda/include/thrust/system/tbb/detail/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/partition.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/partition.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce_intervals.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_intervals.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/remove.h" "$(@D)/cuda/include/thrust/system/tbb/detail/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/remove.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/remove.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/replace.h" "$(@D)/cuda/include/thrust/system/tbb/detail/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reverse.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scan.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scan.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/scan.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scatter.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/sequence.h" "$(@D)/cuda/include/thrust/system/tbb/detail/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/tbb/detail/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/sort.h" "$(@D)/cuda/include/thrust/system/tbb/detail/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/sort.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/tbb/detail/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/tbb/detail/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/tbb/detail/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/transform.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/tbb/detail/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique.h" "$(@D)/cuda/include/thrust/system/tbb/detail/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/unique.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/unique_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/vector.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/vector.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/execution_policy.h" "$(@D)/cuda/include/thrust/system/tbb/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/memory.h" "$(@D)/cuda/include/thrust/system/tbb/memory.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/vector.h" "$(@D)/cuda/include/thrust/system/tbb/vector.h" && cp "/usr/local/cuda-9.0/include/thrust/system_error.h" "$(@D)/cuda/include/thrust/system_error.h" && cp "/usr/local/cuda-9.0/include/thrust/tabulate.h" "$(@D)/cuda/include/thrust/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/transform.h" "$(@D)/cuda/include/thrust/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/transform_reduce.h" "$(@D)/cuda/include/thrust/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/transform_scan.h" "$(@D)/cuda/include/thrust/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/tuple.h" "$(@D)/cuda/include/thrust/tuple.h" && cp "/usr/local/cuda-9.0/include/thrust/uninitialized_copy.h" "$(@D)/cuda/include/thrust/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/uninitialized_fill.h" "$(@D)/cuda/include/thrust/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/unique.h" "$(@D)/cuda/include/thrust/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/version.h" "$(@D)/cuda/include/thrust/version.h" && cp "/usr/local/cuda-9.0/include/vector_functions.h" "$(@D)/cuda/include/vector_functions.h" && cp "/usr/local/cuda-9.0/include/vector_functions.hpp" "$(@D)/cuda/include/vector_functions.hpp" && cp "/usr/local/cuda-9.0/include/vector_types.h" "$(@D)/cuda/include/vector_types.h"
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-10.0/include/CL/cl.h" "$(@D)/cuda/include/CL/cl.h" && cp "/usr/local/cuda-10.0/include/CL/cl.hpp" "$(@D)/cuda/include/CL/cl.hpp" && cp "/usr/local/cuda-10.0/include/CL/cl_egl.h" "$(@D)/cuda/include/CL/cl_egl.h" && cp "/usr/local/cuda-10.0/include/CL/cl_ext.h" "$(@D)/cuda/include/CL/cl_ext.h" && cp "/usr/local/cuda-10.0/include/CL/cl_gl.h" "$(@D)/cuda/include/CL/cl_gl.h" && cp "/usr/local/cuda-10.0/include/CL/cl_gl_ext.h" "$(@D)/cuda/include/CL/cl_gl_ext.h" && cp "/usr/local/cuda-10.0/include/CL/cl_platform.h" "$(@D)/cuda/include/CL/cl_platform.h" && cp "/usr/local/cuda-10.0/include/CL/opencl.h" "$(@D)/cuda/include/CL/opencl.h" && cp "/usr/local/cuda-10.0/include/builtin_types.h" "$(@D)/cuda/include/builtin_types.h" && cp "/usr/local/cuda-10.0/include/channel_descriptor.h" "$(@D)/cuda/include/channel_descriptor.h" && cp "/usr/local/cuda-10.0/include/common_functions.h" "$(@D)/cuda/include/common_functions.h" && cp "/usr/local/cuda-10.0/include/cooperative_groups.h" "$(@D)/cuda/include/cooperative_groups.h" && cp "/usr/local/cuda-10.0/include/cooperative_groups_helpers.h" "$(@D)/cuda/include/cooperative_groups_helpers.h" && cp "/usr/local/cuda-10.0/include/crt/common_functions.h" "$(@D)/cuda/include/crt/common_functions.h" && cp "/usr/local/cuda-10.0/include/crt/device_double_functions.h" "$(@D)/cuda/include/crt/device_double_functions.h" && cp "/usr/local/cuda-10.0/include/crt/device_double_functions.hpp" "$(@D)/cuda/include/crt/device_double_functions.hpp" && cp "/usr/local/cuda-10.0/include/crt/device_functions.h" "$(@D)/cuda/include/crt/device_functions.h" && cp "/usr/local/cuda-10.0/include/crt/device_functions.hpp" "$(@D)/cuda/include/crt/device_functions.hpp" && cp "/usr/local/cuda-10.0/include/crt/func_macro.h" "$(@D)/cuda/include/crt/func_macro.h" && cp "/usr/local/cuda-10.0/include/crt/host_config.h" "$(@D)/cuda/include/crt/host_config.h" && cp "/usr/local/cuda-10.0/include/crt/host_defines.h" "$(@D)/cuda/include/crt/host_defines.h" && cp "/usr/local/cuda-10.0/include/crt/host_runtime.h" "$(@D)/cuda/include/crt/host_runtime.h" && cp "/usr/local/cuda-10.0/include/crt/math_functions.h" "$(@D)/cuda/include/crt/math_functions.h" && cp "/usr/local/cuda-10.0/include/crt/math_functions.hpp" "$(@D)/cuda/include/crt/math_functions.hpp" && cp "/usr/local/cuda-10.0/include/crt/mma.h" "$(@D)/cuda/include/crt/mma.h" && cp "/usr/local/cuda-10.0/include/crt/mma.hpp" "$(@D)/cuda/include/crt/mma.hpp" && cp "/usr/local/cuda-10.0/include/crt/nvfunctional" "$(@D)/cuda/include/crt/nvfunctional" && cp "/usr/local/cuda-10.0/include/crt/sm_70_rt.h" "$(@D)/cuda/include/crt/sm_70_rt.h" && cp "/usr/local/cuda-10.0/include/crt/sm_70_rt.hpp" "$(@D)/cuda/include/crt/sm_70_rt.hpp" && cp "/usr/local/cuda-10.0/include/crt/storage_class.h" "$(@D)/cuda/include/crt/storage_class.h" && cp "/usr/local/cuda-10.0/include/cuComplex.h" "$(@D)/cuda/include/cuComplex.h" && cp "/usr/local/cuda-10.0/include/cublas.h" "$(@D)/cuda/include/cublas.h" && cp "/usr/local/cuda-10.0/include/cublasXt.h" "$(@D)/cuda/include/cublasXt.h" && cp "/usr/local/cuda-10.0/include/cublas_api.h" "$(@D)/cuda/include/cublas_api.h" && cp "/usr/local/cuda-10.0/include/cublas_v2.h" "$(@D)/cuda/include/cublas_v2.h" && cp "/usr/local/cuda-10.0/include/cuda.h" "$(@D)/cuda/include/cuda.h" && cp "/usr/local/cuda-10.0/include/cudaEGL.h" "$(@D)/cuda/include/cudaEGL.h" && cp "/usr/local/cuda-10.0/include/cudaGL.h" "$(@D)/cuda/include/cudaGL.h" && cp "/usr/local/cuda-10.0/include/cudaProfiler.h" "$(@D)/cuda/include/cudaProfiler.h" && cp "/usr/local/cuda-10.0/include/cudaVDPAU.h" "$(@D)/cuda/include/cudaVDPAU.h" && cp "/usr/local/cuda-10.0/include/cuda_device_runtime_api.h" "$(@D)/cuda/include/cuda_device_runtime_api.h" && cp "/usr/local/cuda-10.0/include/cuda_fp16.h" "$(@D)/cuda/include/cuda_fp16.h" && cp "/usr/local/cuda-10.0/include/cuda_fp16.hpp" "$(@D)/cuda/include/cuda_fp16.hpp" && cp "/usr/local/cuda-10.0/include/cuda_gl_interop.h" "$(@D)/cuda/include/cuda_gl_interop.h" && cp "/usr/local/cuda-10.0/include/cuda_occupancy.h" "$(@D)/cuda/include/cuda_occupancy.h" && cp "/usr/local/cuda-10.0/include/cuda_profiler_api.h" "$(@D)/cuda/include/cuda_profiler_api.h" && cp "/usr/local/cuda-10.0/include/cuda_runtime.h" "$(@D)/cuda/include/cuda_runtime.h" && cp "/usr/local/cuda-10.0/include/cuda_runtime_api.h" "$(@D)/cuda/include/cuda_runtime_api.h" && cp "/usr/local/cuda-10.0/include/cuda_surface_types.h" "$(@D)/cuda/include/cuda_surface_types.h" && cp "/usr/local/cuda-10.0/include/cuda_texture_types.h" "$(@D)/cuda/include/cuda_texture_types.h" && cp "/usr/local/cuda-10.0/include/cuda_vdpau_interop.h" "$(@D)/cuda/include/cuda_vdpau_interop.h" && cp "/usr/local/cuda-10.0/include/cudalibxt.h" "$(@D)/cuda/include/cudalibxt.h" && cp "/usr/local/cuda-10.0/include/cudnn.h" "$(@D)/cuda/include/cudnn.h" && cp "/usr/local/cuda-10.0/include/cufft.h" "$(@D)/cuda/include/cufft.h" && cp "/usr/local/cuda-10.0/include/cufftXt.h" "$(@D)/cuda/include/cufftXt.h" && cp "/usr/local/cuda-10.0/include/cufftw.h" "$(@D)/cuda/include/cufftw.h" && cp "/usr/local/cuda-10.0/include/curand.h" "$(@D)/cuda/include/curand.h" && cp "/usr/local/cuda-10.0/include/curand_discrete.h" "$(@D)/cuda/include/curand_discrete.h" && cp "/usr/local/cuda-10.0/include/curand_discrete2.h" "$(@D)/cuda/include/curand_discrete2.h" && cp "/usr/local/cuda-10.0/include/curand_globals.h" "$(@D)/cuda/include/curand_globals.h" && cp "/usr/local/cuda-10.0/include/curand_kernel.h" "$(@D)/cuda/include/curand_kernel.h" && cp "/usr/local/cuda-10.0/include/curand_lognormal.h" "$(@D)/cuda/include/curand_lognormal.h" && cp "/usr/local/cuda-10.0/include/curand_mrg32k3a.h" "$(@D)/cuda/include/curand_mrg32k3a.h" && cp "/usr/local/cuda-10.0/include/curand_mtgp32.h" "$(@D)/cuda/include/curand_mtgp32.h" && cp "/usr/local/cuda-10.0/include/curand_mtgp32_host.h" "$(@D)/cuda/include/curand_mtgp32_host.h" && cp "/usr/local/cuda-10.0/include/curand_mtgp32_kernel.h" "$(@D)/cuda/include/curand_mtgp32_kernel.h" && cp "/usr/local/cuda-10.0/include/curand_mtgp32dc_p_11213.h" "$(@D)/cuda/include/curand_mtgp32dc_p_11213.h" && cp "/usr/local/cuda-10.0/include/curand_normal.h" "$(@D)/cuda/include/curand_normal.h" && cp "/usr/local/cuda-10.0/include/curand_normal_static.h" "$(@D)/cuda/include/curand_normal_static.h" && cp "/usr/local/cuda-10.0/include/curand_philox4x32_x.h" "$(@D)/cuda/include/curand_philox4x32_x.h" && cp "/usr/local/cuda-10.0/include/curand_poisson.h" "$(@D)/cuda/include/curand_poisson.h" && cp "/usr/local/cuda-10.0/include/curand_precalc.h" "$(@D)/cuda/include/curand_precalc.h" && cp "/usr/local/cuda-10.0/include/curand_uniform.h" "$(@D)/cuda/include/curand_uniform.h" && cp "/usr/local/cuda-10.0/include/cusolverDn.h" "$(@D)/cuda/include/cusolverDn.h" && cp "/usr/local/cuda-10.0/include/cusolverRf.h" "$(@D)/cuda/include/cusolverRf.h" && cp "/usr/local/cuda-10.0/include/cusolverSp.h" "$(@D)/cuda/include/cusolverSp.h" && cp "/usr/local/cuda-10.0/include/cusolverSp_LOWLEVEL_PREVIEW.h" "$(@D)/cuda/include/cusolverSp_LOWLEVEL_PREVIEW.h" && cp "/usr/local/cuda-10.0/include/cusolver_common.h" "$(@D)/cuda/include/cusolver_common.h" && cp "/usr/local/cuda-10.0/include/cusparse.h" "$(@D)/cuda/include/cusparse.h" && cp "/usr/local/cuda-10.0/include/cusparse_v2.h" "$(@D)/cuda/include/cusparse_v2.h" && cp "/usr/local/cuda-10.0/include/device_atomic_functions.h" "$(@D)/cuda/include/device_atomic_functions.h" && cp "/usr/local/cuda-10.0/include/device_atomic_functions.hpp" "$(@D)/cuda/include/device_atomic_functions.hpp" && cp "/usr/local/cuda-10.0/include/device_double_functions.h" "$(@D)/cuda/include/device_double_functions.h" && cp "/usr/local/cuda-10.0/include/device_double_functions.hpp" "$(@D)/cuda/include/device_double_functions.hpp" && cp "/usr/local/cuda-10.0/include/device_functions.h" "$(@D)/cuda/include/device_functions.h" && cp "/usr/local/cuda-10.0/include/device_functions.hpp" "$(@D)/cuda/include/device_functions.hpp" && cp "/usr/local/cuda-10.0/include/device_functions_decls.h" "$(@D)/cuda/include/device_functions_decls.h" && cp "/usr/local/cuda-10.0/include/device_launch_parameters.h" "$(@D)/cuda/include/device_launch_parameters.h" && cp "/usr/local/cuda-10.0/include/device_types.h" "$(@D)/cuda/include/device_types.h" && cp "/usr/local/cuda-10.0/include/driver_functions.h" "$(@D)/cuda/include/driver_functions.h" && cp "/usr/local/cuda-10.0/include/driver_types.h" "$(@D)/cuda/include/driver_types.h" && cp "/usr/local/cuda-10.0/include/dynlink_cuda.h" "$(@D)/cuda/include/dynlink_cuda.h" && cp "/usr/local/cuda-10.0/include/dynlink_cuda_cuda.h" "$(@D)/cuda/include/dynlink_cuda_cuda.h" && cp "/usr/local/cuda-10.0/include/dynlink_cuviddec.h" "$(@D)/cuda/include/dynlink_cuviddec.h" && cp "/usr/local/cuda-10.0/include/dynlink_nvcuvid.h" "$(@D)/cuda/include/dynlink_nvcuvid.h" && cp "/usr/local/cuda-10.0/include/fatBinaryCtl.h" "$(@D)/cuda/include/fatBinaryCtl.h" && cp "/usr/local/cuda-10.0/include/fatbinary.h" "$(@D)/cuda/include/fatbinary.h" && cp "/usr/local/cuda-10.0/include/host_config.h" "$(@D)/cuda/include/host_config.h" && cp "/usr/local/cuda-10.0/include/host_defines.h" "$(@D)/cuda/include/host_defines.h" && cp "/usr/local/cuda-10.0/include/library_types.h" "$(@D)/cuda/include/library_types.h" && cp "/usr/local/cuda-10.0/include/math_constants.h" "$(@D)/cuda/include/math_constants.h" && cp "/usr/local/cuda-10.0/include/math_functions.h" "$(@D)/cuda/include/math_functions.h" && cp "/usr/local/cuda-10.0/include/math_functions.hpp" "$(@D)/cuda/include/math_functions.hpp" && cp "/usr/local/cuda-10.0/include/math_functions_dbl_ptx3.h" "$(@D)/cuda/include/math_functions_dbl_ptx3.h" && cp "/usr/local/cuda-10.0/include/math_functions_dbl_ptx3.hpp" "$(@D)/cuda/include/math_functions_dbl_ptx3.hpp" && cp "/usr/local/cuda-10.0/include/mma.h" "$(@D)/cuda/include/mma.h" && cp "/usr/local/cuda-10.0/include/npp.h" "$(@D)/cuda/include/npp.h" && cp "/usr/local/cuda-10.0/include/nppcore.h" "$(@D)/cuda/include/nppcore.h" && cp "/usr/local/cuda-10.0/include/nppdefs.h" "$(@D)/cuda/include/nppdefs.h" && cp "/usr/local/cuda-10.0/include/nppi.h" "$(@D)/cuda/include/nppi.h" && cp "/usr/local/cuda-10.0/include/nppi_arithmetic_and_logical_operations.h" "$(@D)/cuda/include/nppi_arithmetic_and_logical_operations.h" && cp "/usr/local/cuda-10.0/include/nppi_color_conversion.h" "$(@D)/cuda/include/nppi_color_conversion.h" && cp "/usr/local/cuda-10.0/include/nppi_compression_functions.h" "$(@D)/cuda/include/nppi_compression_functions.h" && cp "/usr/local/cuda-10.0/include/nppi_computer_vision.h" "$(@D)/cuda/include/nppi_computer_vision.h" && cp "/usr/local/cuda-10.0/include/nppi_data_exchange_and_initialization.h" "$(@D)/cuda/include/nppi_data_exchange_and_initialization.h" && cp "/usr/local/cuda-10.0/include/nppi_filtering_functions.h" "$(@D)/cuda/include/nppi_filtering_functions.h" && cp "/usr/local/cuda-10.0/include/nppi_geometry_transforms.h" "$(@D)/cuda/include/nppi_geometry_transforms.h" && cp "/usr/local/cuda-10.0/include/nppi_linear_transforms.h" "$(@D)/cuda/include/nppi_linear_transforms.h" && cp "/usr/local/cuda-10.0/include/nppi_morphological_operations.h" "$(@D)/cuda/include/nppi_morphological_operations.h" && cp "/usr/local/cuda-10.0/include/nppi_statistics_functions.h" "$(@D)/cuda/include/nppi_statistics_functions.h" && cp "/usr/local/cuda-10.0/include/nppi_support_functions.h" "$(@D)/cuda/include/nppi_support_functions.h" && cp "/usr/local/cuda-10.0/include/nppi_threshold_and_compare_operations.h" "$(@D)/cuda/include/nppi_threshold_and_compare_operations.h" && cp "/usr/local/cuda-10.0/include/npps.h" "$(@D)/cuda/include/npps.h" && cp "/usr/local/cuda-10.0/include/npps_arithmetic_and_logical_operations.h" "$(@D)/cuda/include/npps_arithmetic_and_logical_operations.h" && cp "/usr/local/cuda-10.0/include/npps_conversion_functions.h" "$(@D)/cuda/include/npps_conversion_functions.h" && cp "/usr/local/cuda-10.0/include/npps_filtering_functions.h" "$(@D)/cuda/include/npps_filtering_functions.h" && cp "/usr/local/cuda-10.0/include/npps_initialization.h" "$(@D)/cuda/include/npps_initialization.h" && cp "/usr/local/cuda-10.0/include/npps_statistics_functions.h" "$(@D)/cuda/include/npps_statistics_functions.h" && cp "/usr/local/cuda-10.0/include/npps_support_functions.h" "$(@D)/cuda/include/npps_support_functions.h" && cp "/usr/local/cuda-10.0/include/nppversion.h" "$(@D)/cuda/include/nppversion.h" && cp "/usr/local/cuda-10.0/include/nvToolsExt.h" "$(@D)/cuda/include/nvToolsExt.h" && cp "/usr/local/cuda-10.0/include/nvToolsExtCuda.h" "$(@D)/cuda/include/nvToolsExtCuda.h" && cp "/usr/local/cuda-10.0/include/nvToolsExtCudaRt.h" "$(@D)/cuda/include/nvToolsExtCudaRt.h" && cp "/usr/local/cuda-10.0/include/nvToolsExtMeta.h" "$(@D)/cuda/include/nvToolsExtMeta.h" && cp "/usr/local/cuda-10.0/include/nvToolsExtSync.h" "$(@D)/cuda/include/nvToolsExtSync.h" && cp "/usr/local/cuda-10.0/include/nvblas.h" "$(@D)/cuda/include/nvblas.h" && cp "/usr/local/cuda-10.0/include/nvfunctional" "$(@D)/cuda/include/nvfunctional" && cp "/usr/local/cuda-10.0/include/nvgraph.h" "$(@D)/cuda/include/nvgraph.h" && cp "/usr/local/cuda-10.0/include/nvml.h" "$(@D)/cuda/include/nvml.h" && cp "/usr/local/cuda-10.0/include/nvrtc.h" "$(@D)/cuda/include/nvrtc.h" && cp "/usr/local/cuda-10.0/include/sm_20_atomic_functions.h" "$(@D)/cuda/include/sm_20_atomic_functions.h" && cp "/usr/local/cuda-10.0/include/sm_20_atomic_functions.hpp" "$(@D)/cuda/include/sm_20_atomic_functions.hpp" && cp "/usr/local/cuda-10.0/include/sm_20_intrinsics.h" "$(@D)/cuda/include/sm_20_intrinsics.h" && cp "/usr/local/cuda-10.0/include/sm_20_intrinsics.hpp" "$(@D)/cuda/include/sm_20_intrinsics.hpp" && cp "/usr/local/cuda-10.0/include/sm_30_intrinsics.h" "$(@D)/cuda/include/sm_30_intrinsics.h" && cp "/usr/local/cuda-10.0/include/sm_30_intrinsics.hpp" "$(@D)/cuda/include/sm_30_intrinsics.hpp" && cp "/usr/local/cuda-10.0/include/sm_32_atomic_functions.h" "$(@D)/cuda/include/sm_32_atomic_functions.h" && cp "/usr/local/cuda-10.0/include/sm_32_atomic_functions.hpp" "$(@D)/cuda/include/sm_32_atomic_functions.hpp" && cp "/usr/local/cuda-10.0/include/sm_32_intrinsics.h" "$(@D)/cuda/include/sm_32_intrinsics.h" && cp "/usr/local/cuda-10.0/include/sm_32_intrinsics.hpp" "$(@D)/cuda/include/sm_32_intrinsics.hpp" && cp "/usr/local/cuda-10.0/include/sm_35_atomic_functions.h" "$(@D)/cuda/include/sm_35_atomic_functions.h" && cp "/usr/local/cuda-10.0/include/sm_35_intrinsics.h" "$(@D)/cuda/include/sm_35_intrinsics.h" && cp "/usr/local/cuda-10.0/include/sm_60_atomic_functions.h" "$(@D)/cuda/include/sm_60_atomic_functions.h" && cp "/usr/local/cuda-10.0/include/sm_60_atomic_functions.hpp" "$(@D)/cuda/include/sm_60_atomic_functions.hpp" && cp "/usr/local/cuda-10.0/include/sm_61_intrinsics.h" "$(@D)/cuda/include/sm_61_intrinsics.h" && cp "/usr/local/cuda-10.0/include/sm_61_intrinsics.hpp" "$(@D)/cuda/include/sm_61_intrinsics.hpp" && cp "/usr/local/cuda-10.0/include/sobol_direction_vectors.h" "$(@D)/cuda/include/sobol_direction_vectors.h" && cp "/usr/local/cuda-10.0/include/surface_functions.h" "$(@D)/cuda/include/surface_functions.h" && cp "/usr/local/cuda-10.0/include/surface_functions.hpp" "$(@D)/cuda/include/surface_functions.hpp" && cp "/usr/local/cuda-10.0/include/surface_indirect_functions.h" "$(@D)/cuda/include/surface_indirect_functions.h" && cp "/usr/local/cuda-10.0/include/surface_indirect_functions.hpp" "$(@D)/cuda/include/surface_indirect_functions.hpp" && cp "/usr/local/cuda-10.0/include/surface_types.h" "$(@D)/cuda/include/surface_types.h" && cp "/usr/local/cuda-10.0/include/texture_fetch_functions.h" "$(@D)/cuda/include/texture_fetch_functions.h" && cp "/usr/local/cuda-10.0/include/texture_fetch_functions.hpp" "$(@D)/cuda/include/texture_fetch_functions.hpp" && cp "/usr/local/cuda-10.0/include/texture_indirect_functions.h" "$(@D)/cuda/include/texture_indirect_functions.h" && cp "/usr/local/cuda-10.0/include/texture_indirect_functions.hpp" "$(@D)/cuda/include/texture_indirect_functions.hpp" && cp "/usr/local/cuda-10.0/include/texture_types.h" "$(@D)/cuda/include/texture_types.h" && cp "/usr/local/cuda-10.0/include/thrust/adjacent_difference.h" "$(@D)/cuda/include/thrust/adjacent_difference.h" && cp "/usr/local/cuda-10.0/include/thrust/advance.h" "$(@D)/cuda/include/thrust/advance.h" && cp "/usr/local/cuda-10.0/include/thrust/binary_search.h" "$(@D)/cuda/include/thrust/binary_search.h" && cp "/usr/local/cuda-10.0/include/thrust/complex.h" "$(@D)/cuda/include/thrust/complex.h" && cp "/usr/local/cuda-10.0/include/thrust/copy.h" "$(@D)/cuda/include/thrust/copy.h" && cp "/usr/local/cuda-10.0/include/thrust/count.h" "$(@D)/cuda/include/thrust/count.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/adjacent_difference.inl" "$(@D)/cuda/include/thrust/detail/adjacent_difference.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/advance.inl" "$(@D)/cuda/include/thrust/detail/advance.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/allocator_traits.h" "$(@D)/cuda/include/thrust/detail/allocator/allocator_traits.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/allocator_traits.inl" "$(@D)/cuda/include/thrust/detail/allocator/allocator_traits.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/copy_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/copy_construct_range.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/copy_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/copy_construct_range.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/default_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/default_construct_range.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/default_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/default_construct_range.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/destroy_range.h" "$(@D)/cuda/include/thrust/detail/allocator/destroy_range.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/destroy_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/destroy_range.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/fill_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/fill_construct_range.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/fill_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/fill_construct_range.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/malloc_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/malloc_allocator.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/malloc_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/malloc_allocator.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/no_throw_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/no_throw_allocator.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/tagged_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/tagged_allocator.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/tagged_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/tagged_allocator.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/temporary_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/temporary_allocator.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/allocator/temporary_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/temporary_allocator.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/binary_search.inl" "$(@D)/cuda/include/thrust/detail/binary_search.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/arithmetic.h" "$(@D)/cuda/include/thrust/detail/complex/arithmetic.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/c99math.h" "$(@D)/cuda/include/thrust/detail/complex/c99math.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/catrig.h" "$(@D)/cuda/include/thrust/detail/complex/catrig.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/catrigf.h" "$(@D)/cuda/include/thrust/detail/complex/catrigf.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/ccosh.h" "$(@D)/cuda/include/thrust/detail/complex/ccosh.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/ccoshf.h" "$(@D)/cuda/include/thrust/detail/complex/ccoshf.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/cexp.h" "$(@D)/cuda/include/thrust/detail/complex/cexp.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/cexpf.h" "$(@D)/cuda/include/thrust/detail/complex/cexpf.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/clog.h" "$(@D)/cuda/include/thrust/detail/complex/clog.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/clogf.h" "$(@D)/cuda/include/thrust/detail/complex/clogf.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/complex.inl" "$(@D)/cuda/include/thrust/detail/complex/complex.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/cpow.h" "$(@D)/cuda/include/thrust/detail/complex/cpow.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/cpowf.h" "$(@D)/cuda/include/thrust/detail/complex/cpowf.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/cproj.h" "$(@D)/cuda/include/thrust/detail/complex/cproj.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/csinh.h" "$(@D)/cuda/include/thrust/detail/complex/csinh.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/csinhf.h" "$(@D)/cuda/include/thrust/detail/complex/csinhf.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/csqrt.h" "$(@D)/cuda/include/thrust/detail/complex/csqrt.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/csqrtf.h" "$(@D)/cuda/include/thrust/detail/complex/csqrtf.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/ctanh.h" "$(@D)/cuda/include/thrust/detail/complex/ctanh.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/ctanhf.h" "$(@D)/cuda/include/thrust/detail/complex/ctanhf.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/math_private.h" "$(@D)/cuda/include/thrust/detail/complex/math_private.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/complex/stream.h" "$(@D)/cuda/include/thrust/detail/complex/stream.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/config.h" "$(@D)/cuda/include/thrust/detail/config.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/config/compiler.h" "$(@D)/cuda/include/thrust/detail/config/compiler.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/config/compiler_fence.h" "$(@D)/cuda/include/thrust/detail/config/compiler_fence.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/config/config.h" "$(@D)/cuda/include/thrust/detail/config/config.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/config/debug.h" "$(@D)/cuda/include/thrust/detail/config/debug.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/config/device_system.h" "$(@D)/cuda/include/thrust/detail/config/device_system.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/config/exec_check_disable.h" "$(@D)/cuda/include/thrust/detail/config/exec_check_disable.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/config/forceinline.h" "$(@D)/cuda/include/thrust/detail/config/forceinline.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/config/global_workarounds.h" "$(@D)/cuda/include/thrust/detail/config/global_workarounds.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/config/host_device.h" "$(@D)/cuda/include/thrust/detail/config/host_device.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/config/host_system.h" "$(@D)/cuda/include/thrust/detail/config/host_system.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/config/simple_defines.h" "$(@D)/cuda/include/thrust/detail/config/simple_defines.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/contiguous_storage.h" "$(@D)/cuda/include/thrust/detail/contiguous_storage.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/contiguous_storage.inl" "$(@D)/cuda/include/thrust/detail/contiguous_storage.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/copy.h" "$(@D)/cuda/include/thrust/detail/copy.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/copy.inl" "$(@D)/cuda/include/thrust/detail/copy.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/copy_if.h" "$(@D)/cuda/include/thrust/detail/copy_if.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/copy_if.inl" "$(@D)/cuda/include/thrust/detail/copy_if.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/count.inl" "$(@D)/cuda/include/thrust/detail/count.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/cstdint.h" "$(@D)/cuda/include/thrust/detail/cstdint.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/device_delete.inl" "$(@D)/cuda/include/thrust/detail/device_delete.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/device_free.inl" "$(@D)/cuda/include/thrust/detail/device_free.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/device_malloc.inl" "$(@D)/cuda/include/thrust/detail/device_malloc.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/device_new.inl" "$(@D)/cuda/include/thrust/detail/device_new.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/device_ptr.inl" "$(@D)/cuda/include/thrust/detail/device_ptr.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/device_reference.inl" "$(@D)/cuda/include/thrust/detail/device_reference.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/device_vector.inl" "$(@D)/cuda/include/thrust/detail/device_vector.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/dispatch/is_trivial_copy.h" "$(@D)/cuda/include/thrust/detail/dispatch/is_trivial_copy.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/distance.inl" "$(@D)/cuda/include/thrust/detail/distance.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/equal.inl" "$(@D)/cuda/include/thrust/detail/equal.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/execute_with_allocator.h" "$(@D)/cuda/include/thrust/detail/execute_with_allocator.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/execution_policy.h" "$(@D)/cuda/include/thrust/detail/execution_policy.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/extrema.inl" "$(@D)/cuda/include/thrust/detail/extrema.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/fill.inl" "$(@D)/cuda/include/thrust/detail/fill.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/find.inl" "$(@D)/cuda/include/thrust/detail/find.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/for_each.inl" "$(@D)/cuda/include/thrust/detail/for_each.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/function.h" "$(@D)/cuda/include/thrust/detail/function.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional.inl" "$(@D)/cuda/include/thrust/detail/functional.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/actor.h" "$(@D)/cuda/include/thrust/detail/functional/actor.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/actor.inl" "$(@D)/cuda/include/thrust/detail/functional/actor.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/argument.h" "$(@D)/cuda/include/thrust/detail/functional/argument.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/composite.h" "$(@D)/cuda/include/thrust/detail/functional/composite.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/arithmetic_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/arithmetic_operators.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/assignment_operator.h" "$(@D)/cuda/include/thrust/detail/functional/operators/assignment_operator.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/bitwise_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/bitwise_operators.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/compound_assignment_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/compound_assignment_operators.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/logical_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/logical_operators.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/operator_adaptors.h" "$(@D)/cuda/include/thrust/detail/functional/operators/operator_adaptors.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/relational_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/relational_operators.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/placeholder.h" "$(@D)/cuda/include/thrust/detail/functional/placeholder.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/functional/value.h" "$(@D)/cuda/include/thrust/detail/functional/value.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/gather.inl" "$(@D)/cuda/include/thrust/detail/gather.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/generate.inl" "$(@D)/cuda/include/thrust/detail/generate.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/get_iterator_value.h" "$(@D)/cuda/include/thrust/detail/get_iterator_value.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/host_vector.inl" "$(@D)/cuda/include/thrust/detail/host_vector.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/inner_product.inl" "$(@D)/cuda/include/thrust/detail/inner_product.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/integer_math.h" "$(@D)/cuda/include/thrust/detail/integer_math.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/integer_traits.h" "$(@D)/cuda/include/thrust/detail/integer_traits.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/internal_functional.h" "$(@D)/cuda/include/thrust/detail/internal_functional.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/logical.inl" "$(@D)/cuda/include/thrust/detail/logical.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/detail/malloc_and_free.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/merge.inl" "$(@D)/cuda/include/thrust/detail/merge.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/minmax.h" "$(@D)/cuda/include/thrust/detail/minmax.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/mismatch.inl" "$(@D)/cuda/include/thrust/detail/mismatch.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/mpl/math.h" "$(@D)/cuda/include/thrust/detail/mpl/math.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/numeric_traits.h" "$(@D)/cuda/include/thrust/detail/numeric_traits.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/overlapped_copy.h" "$(@D)/cuda/include/thrust/detail/overlapped_copy.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/pair.inl" "$(@D)/cuda/include/thrust/detail/pair.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/partition.inl" "$(@D)/cuda/include/thrust/detail/partition.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/pointer.h" "$(@D)/cuda/include/thrust/detail/pointer.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/pointer.inl" "$(@D)/cuda/include/thrust/detail/pointer.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/range/head_flags.h" "$(@D)/cuda/include/thrust/detail/range/head_flags.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/range/tail_flags.h" "$(@D)/cuda/include/thrust/detail/range/tail_flags.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/raw_pointer_cast.h" "$(@D)/cuda/include/thrust/detail/raw_pointer_cast.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/raw_reference_cast.h" "$(@D)/cuda/include/thrust/detail/raw_reference_cast.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/reduce.inl" "$(@D)/cuda/include/thrust/detail/reduce.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/reference.h" "$(@D)/cuda/include/thrust/detail/reference.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/reference.inl" "$(@D)/cuda/include/thrust/detail/reference.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/reference_forward_declaration.h" "$(@D)/cuda/include/thrust/detail/reference_forward_declaration.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/remove.inl" "$(@D)/cuda/include/thrust/detail/remove.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/replace.inl" "$(@D)/cuda/include/thrust/detail/replace.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/reverse.inl" "$(@D)/cuda/include/thrust/detail/reverse.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/scan.inl" "$(@D)/cuda/include/thrust/detail/scan.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/scatter.inl" "$(@D)/cuda/include/thrust/detail/scatter.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/seq.h" "$(@D)/cuda/include/thrust/detail/seq.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/sequence.inl" "$(@D)/cuda/include/thrust/detail/sequence.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/set_operations.inl" "$(@D)/cuda/include/thrust/detail/set_operations.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/sort.inl" "$(@D)/cuda/include/thrust/detail/sort.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/static_assert.h" "$(@D)/cuda/include/thrust/detail/static_assert.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/static_map.h" "$(@D)/cuda/include/thrust/detail/static_map.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/swap.h" "$(@D)/cuda/include/thrust/detail/swap.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/swap.inl" "$(@D)/cuda/include/thrust/detail/swap.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/swap_ranges.inl" "$(@D)/cuda/include/thrust/detail/swap_ranges.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/tabulate.inl" "$(@D)/cuda/include/thrust/detail/tabulate.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/temporary_array.h" "$(@D)/cuda/include/thrust/detail/temporary_array.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/temporary_array.inl" "$(@D)/cuda/include/thrust/detail/temporary_array.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/detail/temporary_buffer.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/transform.inl" "$(@D)/cuda/include/thrust/detail/transform.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/transform_reduce.inl" "$(@D)/cuda/include/thrust/detail/transform_reduce.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/transform_scan.inl" "$(@D)/cuda/include/thrust/detail/transform_scan.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/trivial_sequence.h" "$(@D)/cuda/include/thrust/detail/trivial_sequence.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/tuple.inl" "$(@D)/cuda/include/thrust/detail/tuple.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/tuple_meta_transform.h" "$(@D)/cuda/include/thrust/detail/tuple_meta_transform.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/tuple_transform.h" "$(@D)/cuda/include/thrust/detail/tuple_transform.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/type_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h" "$(@D)/cuda/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/type_traits/function_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits/function_traits.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/type_traits/has_member_function.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_member_function.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/type_traits/has_nested_type.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_nested_type.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/type_traits/has_trivial_assign.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_trivial_assign.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/type_traits/is_call_possible.h" "$(@D)/cuda/include/thrust/detail/type_traits/is_call_possible.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/type_traits/is_metafunction_defined.h" "$(@D)/cuda/include/thrust/detail/type_traits/is_metafunction_defined.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/type_traits/iterator/is_discard_iterator.h" "$(@D)/cuda/include/thrust/detail/type_traits/iterator/is_discard_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/type_traits/iterator/is_output_iterator.h" "$(@D)/cuda/include/thrust/detail/type_traits/iterator/is_output_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/type_traits/minimum_type.h" "$(@D)/cuda/include/thrust/detail/type_traits/minimum_type.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/type_traits/pointer_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits/pointer_traits.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/type_traits/result_of_adaptable_function.h" "$(@D)/cuda/include/thrust/detail/type_traits/result_of_adaptable_function.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/uninitialized_copy.inl" "$(@D)/cuda/include/thrust/detail/uninitialized_copy.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/uninitialized_fill.inl" "$(@D)/cuda/include/thrust/detail/uninitialized_fill.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/unique.inl" "$(@D)/cuda/include/thrust/detail/unique.inl" && cp "/usr/local/cuda-10.0/include/thrust/detail/use_default.h" "$(@D)/cuda/include/thrust/detail/use_default.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/util/align.h" "$(@D)/cuda/include/thrust/detail/util/align.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/util/blocking.h" "$(@D)/cuda/include/thrust/detail/util/blocking.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/vector_base.h" "$(@D)/cuda/include/thrust/detail/vector_base.h" && cp "/usr/local/cuda-10.0/include/thrust/detail/vector_base.inl" "$(@D)/cuda/include/thrust/detail/vector_base.inl" && cp "/usr/local/cuda-10.0/include/thrust/device_allocator.h" "$(@D)/cuda/include/thrust/device_allocator.h" && cp "/usr/local/cuda-10.0/include/thrust/device_delete.h" "$(@D)/cuda/include/thrust/device_delete.h" && cp "/usr/local/cuda-10.0/include/thrust/device_free.h" "$(@D)/cuda/include/thrust/device_free.h" && cp "/usr/local/cuda-10.0/include/thrust/device_malloc.h" "$(@D)/cuda/include/thrust/device_malloc.h" && cp "/usr/local/cuda-10.0/include/thrust/device_malloc_allocator.h" "$(@D)/cuda/include/thrust/device_malloc_allocator.h" && cp "/usr/local/cuda-10.0/include/thrust/device_new.h" "$(@D)/cuda/include/thrust/device_new.h" && cp "/usr/local/cuda-10.0/include/thrust/device_new_allocator.h" "$(@D)/cuda/include/thrust/device_new_allocator.h" && cp "/usr/local/cuda-10.0/include/thrust/device_ptr.h" "$(@D)/cuda/include/thrust/device_ptr.h" && cp "/usr/local/cuda-10.0/include/thrust/device_reference.h" "$(@D)/cuda/include/thrust/device_reference.h" && cp "/usr/local/cuda-10.0/include/thrust/device_vector.h" "$(@D)/cuda/include/thrust/device_vector.h" && cp "/usr/local/cuda-10.0/include/thrust/distance.h" "$(@D)/cuda/include/thrust/distance.h" && cp "/usr/local/cuda-10.0/include/thrust/equal.h" "$(@D)/cuda/include/thrust/equal.h" && cp "/usr/local/cuda-10.0/include/thrust/execution_policy.h" "$(@D)/cuda/include/thrust/execution_policy.h" && cp "/usr/local/cuda-10.0/include/thrust/extrema.h" "$(@D)/cuda/include/thrust/extrema.h" && cp "/usr/local/cuda-10.0/include/thrust/fill.h" "$(@D)/cuda/include/thrust/fill.h" && cp "/usr/local/cuda-10.0/include/thrust/find.h" "$(@D)/cuda/include/thrust/find.h" && cp "/usr/local/cuda-10.0/include/thrust/for_each.h" "$(@D)/cuda/include/thrust/for_each.h" && cp "/usr/local/cuda-10.0/include/thrust/functional.h" "$(@D)/cuda/include/thrust/functional.h" && cp "/usr/local/cuda-10.0/include/thrust/gather.h" "$(@D)/cuda/include/thrust/gather.h" && cp "/usr/local/cuda-10.0/include/thrust/generate.h" "$(@D)/cuda/include/thrust/generate.h" && cp "/usr/local/cuda-10.0/include/thrust/host_vector.h" "$(@D)/cuda/include/thrust/host_vector.h" && cp "/usr/local/cuda-10.0/include/thrust/inner_product.h" "$(@D)/cuda/include/thrust/inner_product.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/constant_iterator.h" "$(@D)/cuda/include/thrust/iterator/constant_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/counting_iterator.h" "$(@D)/cuda/include/thrust/iterator/counting_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/any_assign.h" "$(@D)/cuda/include/thrust/iterator/detail/any_assign.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/any_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/any_system_tag.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/constant_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/constant_iterator_base.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/counting_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/counting_iterator.inl" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/device_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/device_system_tag.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/discard_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/discard_iterator_base.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/distance_from_result.h" "$(@D)/cuda/include/thrust/iterator/detail/distance_from_result.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/host_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/host_system_tag.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/is_iterator_category.h" "$(@D)/cuda/include/thrust/iterator/detail/is_iterator_category.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/is_trivial_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/is_trivial_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_adaptor_base.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_adaptor_base.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_category_to_system.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_to_system.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_category_to_traversal.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_to_traversal.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_facade_category.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_facade_category.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_traits.inl" "$(@D)/cuda/include/thrust/iterator/detail/iterator_traits.inl" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_traversal_tags.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_traversal_tags.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/join_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/join_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/minimum_category.h" "$(@D)/cuda/include/thrust/iterator/detail/minimum_category.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/minimum_system.h" "$(@D)/cuda/include/thrust/iterator/detail/minimum_system.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/normal_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/normal_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/permutation_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/permutation_iterator_base.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/retag.h" "$(@D)/cuda/include/thrust/iterator/detail/retag.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/reverse_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/reverse_iterator.inl" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/reverse_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/reverse_iterator_base.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/tagged_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/tagged_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/transform_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/transform_iterator.inl" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/transform_output_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/transform_output_iterator.inl" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/tuple_of_iterator_references.h" "$(@D)/cuda/include/thrust/iterator/detail/tuple_of_iterator_references.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/universal_categories.h" "$(@D)/cuda/include/thrust/iterator/detail/universal_categories.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/zip_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/zip_iterator.inl" && cp "/usr/local/cuda-10.0/include/thrust/iterator/detail/zip_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/zip_iterator_base.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/discard_iterator.h" "$(@D)/cuda/include/thrust/iterator/discard_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/iterator_adaptor.h" "$(@D)/cuda/include/thrust/iterator/iterator_adaptor.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/iterator_categories.h" "$(@D)/cuda/include/thrust/iterator/iterator_categories.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/iterator_facade.h" "$(@D)/cuda/include/thrust/iterator/iterator_facade.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/iterator_traits.h" "$(@D)/cuda/include/thrust/iterator/iterator_traits.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/permutation_iterator.h" "$(@D)/cuda/include/thrust/iterator/permutation_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/retag.h" "$(@D)/cuda/include/thrust/iterator/retag.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/reverse_iterator.h" "$(@D)/cuda/include/thrust/iterator/reverse_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/transform_iterator.h" "$(@D)/cuda/include/thrust/iterator/transform_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/transform_output_iterator.h" "$(@D)/cuda/include/thrust/iterator/transform_output_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/iterator/zip_iterator.h" "$(@D)/cuda/include/thrust/iterator/zip_iterator.h" && cp "/usr/local/cuda-10.0/include/thrust/logical.h" "$(@D)/cuda/include/thrust/logical.h" && cp "/usr/local/cuda-10.0/include/thrust/memory.h" "$(@D)/cuda/include/thrust/memory.h" && cp "/usr/local/cuda-10.0/include/thrust/merge.h" "$(@D)/cuda/include/thrust/merge.h" && cp "/usr/local/cuda-10.0/include/thrust/mismatch.h" "$(@D)/cuda/include/thrust/mismatch.h" && cp "/usr/local/cuda-10.0/include/thrust/pair.h" "$(@D)/cuda/include/thrust/pair.h" && cp "/usr/local/cuda-10.0/include/thrust/partition.h" "$(@D)/cuda/include/thrust/partition.h" && cp "/usr/local/cuda-10.0/include/thrust/random.h" "$(@D)/cuda/include/thrust/random.h" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/discard_block_engine.inl" "$(@D)/cuda/include/thrust/random/detail/discard_block_engine.inl" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/linear_congruential_engine.inl" "$(@D)/cuda/include/thrust/random/detail/linear_congruential_engine.inl" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/linear_congruential_engine_discard.h" "$(@D)/cuda/include/thrust/random/detail/linear_congruential_engine_discard.h" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/linear_feedback_shift_engine.inl" "$(@D)/cuda/include/thrust/random/detail/linear_feedback_shift_engine.inl" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h" "$(@D)/cuda/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/mod.h" "$(@D)/cuda/include/thrust/random/detail/mod.h" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/normal_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/normal_distribution.inl" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/normal_distribution_base.h" "$(@D)/cuda/include/thrust/random/detail/normal_distribution_base.h" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/random_core_access.h" "$(@D)/cuda/include/thrust/random/detail/random_core_access.h" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/subtract_with_carry_engine.inl" "$(@D)/cuda/include/thrust/random/detail/subtract_with_carry_engine.inl" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/uniform_int_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/uniform_int_distribution.inl" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/uniform_real_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/uniform_real_distribution.inl" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/xor_combine_engine.inl" "$(@D)/cuda/include/thrust/random/detail/xor_combine_engine.inl" && cp "/usr/local/cuda-10.0/include/thrust/random/detail/xor_combine_engine_max.h" "$(@D)/cuda/include/thrust/random/detail/xor_combine_engine_max.h" && cp "/usr/local/cuda-10.0/include/thrust/random/discard_block_engine.h" "$(@D)/cuda/include/thrust/random/discard_block_engine.h" && cp "/usr/local/cuda-10.0/include/thrust/random/linear_congruential_engine.h" "$(@D)/cuda/include/thrust/random/linear_congruential_engine.h" && cp "/usr/local/cuda-10.0/include/thrust/random/linear_feedback_shift_engine.h" "$(@D)/cuda/include/thrust/random/linear_feedback_shift_engine.h" && cp "/usr/local/cuda-10.0/include/thrust/random/normal_distribution.h" "$(@D)/cuda/include/thrust/random/normal_distribution.h" && cp "/usr/local/cuda-10.0/include/thrust/random/subtract_with_carry_engine.h" "$(@D)/cuda/include/thrust/random/subtract_with_carry_engine.h" && cp "/usr/local/cuda-10.0/include/thrust/random/uniform_int_distribution.h" "$(@D)/cuda/include/thrust/random/uniform_int_distribution.h" && cp "/usr/local/cuda-10.0/include/thrust/random/uniform_real_distribution.h" "$(@D)/cuda/include/thrust/random/uniform_real_distribution.h" && cp "/usr/local/cuda-10.0/include/thrust/random/xor_combine_engine.h" "$(@D)/cuda/include/thrust/random/xor_combine_engine.h" && cp "/usr/local/cuda-10.0/include/thrust/reduce.h" "$(@D)/cuda/include/thrust/reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/remove.h" "$(@D)/cuda/include/thrust/remove.h" && cp "/usr/local/cuda-10.0/include/thrust/replace.h" "$(@D)/cuda/include/thrust/replace.h" && cp "/usr/local/cuda-10.0/include/thrust/reverse.h" "$(@D)/cuda/include/thrust/reverse.h" && cp "/usr/local/cuda-10.0/include/thrust/scan.h" "$(@D)/cuda/include/thrust/scan.h" && cp "/usr/local/cuda-10.0/include/thrust/scatter.h" "$(@D)/cuda/include/thrust/scatter.h" && cp "/usr/local/cuda-10.0/include/thrust/sequence.h" "$(@D)/cuda/include/thrust/sequence.h" && cp "/usr/local/cuda-10.0/include/thrust/set_operations.h" "$(@D)/cuda/include/thrust/set_operations.h" && cp "/usr/local/cuda-10.0/include/thrust/sort.h" "$(@D)/cuda/include/thrust/sort.h" && cp "/usr/local/cuda-10.0/include/thrust/swap.h" "$(@D)/cuda/include/thrust/swap.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/cpp/detail/adjacent_difference.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/cpp/detail/assign_value.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/cpp/detail/binary_search.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/copy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/cpp/detail/copy_if.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/count.h" "$(@D)/cuda/include/thrust/system/cpp/detail/count.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/equal.h" "$(@D)/cuda/include/thrust/system/cpp/detail/equal.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/execution_policy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/extrema.h" "$(@D)/cuda/include/thrust/system/cpp/detail/extrema.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/fill.h" "$(@D)/cuda/include/thrust/system/cpp/detail/fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/find.h" "$(@D)/cuda/include/thrust/system/cpp/detail/find.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/for_each.h" "$(@D)/cuda/include/thrust/system/cpp/detail/for_each.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/gather.h" "$(@D)/cuda/include/thrust/system/cpp/detail/gather.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/generate.h" "$(@D)/cuda/include/thrust/system/cpp/detail/generate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/get_value.h" "$(@D)/cuda/include/thrust/system/cpp/detail/get_value.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/cpp/detail/inner_product.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/cpp/detail/iter_swap.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/logical.h" "$(@D)/cuda/include/thrust/system/cpp/detail/logical.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/cpp/detail/malloc_and_free.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/memory.inl" "$(@D)/cuda/include/thrust/system/cpp/detail/memory.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/merge.h" "$(@D)/cuda/include/thrust/system/cpp/detail/merge.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/cpp/detail/mismatch.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/par.h" "$(@D)/cuda/include/thrust/system/cpp/detail/par.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/partition.h" "$(@D)/cuda/include/thrust/system/cpp/detail/partition.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/reduce.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reduce_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/remove.h" "$(@D)/cuda/include/thrust/system/cpp/detail/remove.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/replace.h" "$(@D)/cuda/include/thrust/system/cpp/detail/replace.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/reverse.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reverse.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/scan.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scan_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/scatter.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scatter.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/sequence.h" "$(@D)/cuda/include/thrust/system/cpp/detail/sequence.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/cpp/detail/set_operations.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/sort.h" "$(@D)/cuda/include/thrust/system/cpp/detail/sort.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/cpp/detail/swap_ranges.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/cpp/detail/tabulate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/cpp/detail/temporary_buffer.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/transform.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform_reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform_scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/uninitialized_copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/cpp/detail/uninitialized_fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/unique.h" "$(@D)/cuda/include/thrust/system/cpp/detail/unique.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/unique_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/vector.inl" "$(@D)/cuda/include/thrust/system/cpp/detail/vector.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/execution_policy.h" "$(@D)/cuda/include/thrust/system/cpp/execution_policy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/memory.h" "$(@D)/cuda/include/thrust/system/cpp/memory.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cpp/vector.h" "$(@D)/cuda/include/thrust/system/cpp/vector.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/config.h" "$(@D)/cuda/include/thrust/system/cuda/config.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/cuda/detail/adjacent_difference.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/cuda/detail/assign_value.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/cuda/detail/binary_search.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/copy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/cuda/detail/copy_if.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/core/agent_launcher.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/agent_launcher.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/core/alignment.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/alignment.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/core/triple_chevron_launch.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/triple_chevron_launch.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/core/util.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/util.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/count.h" "$(@D)/cuda/include/thrust/system/cuda/detail/count.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cross_system.h" "$(@D)/cuda/include/thrust/system/cuda/detail/cross_system.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_csrt.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_csrt.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_row_based.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_row_based.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_load.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_load.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_scan.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_store.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_store.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/cub.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/cub.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_partition.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_partition.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_scan.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_select.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_select.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_csrt.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_csrt.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_row_based.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_row_based.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/host/mutex.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/host/mutex.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_allocator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_allocator.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_arch.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_arch.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_debug.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_debug.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_device.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_device.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_macro.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_macro.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_namespace.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_namespace.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_ptx.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_ptx.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_type.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_type.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/equal.h" "$(@D)/cuda/include/thrust/system/cuda/detail/equal.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/error.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/error.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/execution_policy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/extrema.h" "$(@D)/cuda/include/thrust/system/cuda/detail/extrema.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/fill.h" "$(@D)/cuda/include/thrust/system/cuda/detail/fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/find.h" "$(@D)/cuda/include/thrust/system/cuda/detail/find.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/for_each.h" "$(@D)/cuda/include/thrust/system/cuda/detail/for_each.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/gather.h" "$(@D)/cuda/include/thrust/system/cuda/detail/gather.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/generate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/generate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/get_value.h" "$(@D)/cuda/include/thrust/system/cuda/detail/get_value.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h" "$(@D)/cuda/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/guarded_driver_types.h" "$(@D)/cuda/include/thrust/system/cuda/detail/guarded_driver_types.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/cuda/detail/inner_product.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/internal/copy_cross_system.h" "$(@D)/cuda/include/thrust/system/cuda/detail/internal/copy_cross_system.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/internal/copy_device_to_device.h" "$(@D)/cuda/include/thrust/system/cuda/detail/internal/copy_device_to_device.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/cuda/detail/iter_swap.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/logical.h" "$(@D)/cuda/include/thrust/system/cuda/detail/logical.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/cuda/detail/malloc_and_free.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/memory.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/memory.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/memory_buffer.h" "$(@D)/cuda/include/thrust/system/cuda/detail/memory_buffer.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/merge.h" "$(@D)/cuda/include/thrust/system/cuda/detail/merge.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/cuda/detail/mismatch.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/par.h" "$(@D)/cuda/include/thrust/system/cuda/detail/par.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/par_to_seq.h" "$(@D)/cuda/include/thrust/system/cuda/detail/par_to_seq.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/parallel_for.h" "$(@D)/cuda/include/thrust/system/cuda/detail/parallel_for.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/partition.h" "$(@D)/cuda/include/thrust/system/cuda/detail/partition.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/reduce.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/remove.h" "$(@D)/cuda/include/thrust/system/cuda/detail/remove.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/replace.h" "$(@D)/cuda/include/thrust/system/cuda/detail/replace.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/reverse.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reverse.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/scan.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scan_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/scatter.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scatter.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/sequence.h" "$(@D)/cuda/include/thrust/system/cuda/detail/sequence.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/cuda/detail/set_operations.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/sort.h" "$(@D)/cuda/include/thrust/system/cuda/detail/sort.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/cuda/detail/swap_ranges.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/tabulate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/cuda/detail/temporary_buffer.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/terminate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/terminate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/transform.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform_reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform_scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/uninitialized_copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/cuda/detail/uninitialized_fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/unique.h" "$(@D)/cuda/include/thrust/system/cuda/detail/unique.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/unique_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/util.h" "$(@D)/cuda/include/thrust/system/cuda/detail/util.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/vector.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/vector.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/error.h" "$(@D)/cuda/include/thrust/system/cuda/error.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/execution_policy.h" "$(@D)/cuda/include/thrust/system/cuda/execution_policy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/experimental/pinned_allocator.h" "$(@D)/cuda/include/thrust/system/cuda/experimental/pinned_allocator.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/memory.h" "$(@D)/cuda/include/thrust/system/cuda/memory.h" && cp "/usr/local/cuda-10.0/include/thrust/system/cuda/vector.h" "$(@D)/cuda/include/thrust/system/cuda/vector.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/adl/adjacent_difference.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/assign_value.h" "$(@D)/cuda/include/thrust/system/detail/adl/assign_value.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/adl/binary_search.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/copy.h" "$(@D)/cuda/include/thrust/system/detail/adl/copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/adl/copy_if.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/count.h" "$(@D)/cuda/include/thrust/system/detail/adl/count.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/equal.h" "$(@D)/cuda/include/thrust/system/detail/adl/equal.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/extrema.h" "$(@D)/cuda/include/thrust/system/detail/adl/extrema.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/fill.h" "$(@D)/cuda/include/thrust/system/detail/adl/fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/find.h" "$(@D)/cuda/include/thrust/system/detail/adl/find.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/for_each.h" "$(@D)/cuda/include/thrust/system/detail/adl/for_each.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/gather.h" "$(@D)/cuda/include/thrust/system/detail/adl/gather.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/generate.h" "$(@D)/cuda/include/thrust/system/detail/adl/generate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/get_value.h" "$(@D)/cuda/include/thrust/system/detail/adl/get_value.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/adl/inner_product.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/iter_swap.h" "$(@D)/cuda/include/thrust/system/detail/adl/iter_swap.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/logical.h" "$(@D)/cuda/include/thrust/system/detail/adl/logical.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/detail/adl/malloc_and_free.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/merge.h" "$(@D)/cuda/include/thrust/system/detail/adl/merge.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/adl/mismatch.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/partition.h" "$(@D)/cuda/include/thrust/system/detail/adl/partition.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/reduce.h" "$(@D)/cuda/include/thrust/system/detail/adl/reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/reduce_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/remove.h" "$(@D)/cuda/include/thrust/system/detail/adl/remove.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/replace.h" "$(@D)/cuda/include/thrust/system/detail/adl/replace.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/reverse.h" "$(@D)/cuda/include/thrust/system/detail/adl/reverse.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/scan.h" "$(@D)/cuda/include/thrust/system/detail/adl/scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/scan_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/scatter.h" "$(@D)/cuda/include/thrust/system/detail/adl/scatter.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/sequence.h" "$(@D)/cuda/include/thrust/system/detail/adl/sequence.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/adl/set_operations.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/sort.h" "$(@D)/cuda/include/thrust/system/detail/adl/sort.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/adl/swap_ranges.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/adl/tabulate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/adl/temporary_buffer.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/transform.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform_reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform_scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/adl/uninitialized_copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/adl/uninitialized_fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/unique.h" "$(@D)/cuda/include/thrust/system/detail/adl/unique.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/adl/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/unique_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/bad_alloc.h" "$(@D)/cuda/include/thrust/system/detail/bad_alloc.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/errno.h" "$(@D)/cuda/include/thrust/system/detail/errno.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/error_category.inl" "$(@D)/cuda/include/thrust/system/detail/error_category.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/error_code.inl" "$(@D)/cuda/include/thrust/system/detail/error_code.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/error_condition.inl" "$(@D)/cuda/include/thrust/system/detail/error_condition.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/generic/adjacent_difference.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/adjacent_difference.inl" "$(@D)/cuda/include/thrust/system/detail/generic/adjacent_difference.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/advance.h" "$(@D)/cuda/include/thrust/system/detail/generic/advance.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/advance.inl" "$(@D)/cuda/include/thrust/system/detail/generic/advance.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/generic/binary_search.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/binary_search.inl" "$(@D)/cuda/include/thrust/system/detail/generic/binary_search.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/copy.h" "$(@D)/cuda/include/thrust/system/detail/generic/copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/copy.inl" "$(@D)/cuda/include/thrust/system/detail/generic/copy.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/generic/copy_if.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/copy_if.inl" "$(@D)/cuda/include/thrust/system/detail/generic/copy_if.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/count.h" "$(@D)/cuda/include/thrust/system/detail/generic/count.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/count.inl" "$(@D)/cuda/include/thrust/system/detail/generic/count.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/distance.h" "$(@D)/cuda/include/thrust/system/detail/generic/distance.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/distance.inl" "$(@D)/cuda/include/thrust/system/detail/generic/distance.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/equal.h" "$(@D)/cuda/include/thrust/system/detail/generic/equal.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/equal.inl" "$(@D)/cuda/include/thrust/system/detail/generic/equal.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/extrema.h" "$(@D)/cuda/include/thrust/system/detail/generic/extrema.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/extrema.inl" "$(@D)/cuda/include/thrust/system/detail/generic/extrema.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/fill.h" "$(@D)/cuda/include/thrust/system/detail/generic/fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/find.h" "$(@D)/cuda/include/thrust/system/detail/generic/find.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/find.inl" "$(@D)/cuda/include/thrust/system/detail/generic/find.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/for_each.h" "$(@D)/cuda/include/thrust/system/detail/generic/for_each.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/gather.h" "$(@D)/cuda/include/thrust/system/detail/generic/gather.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/gather.inl" "$(@D)/cuda/include/thrust/system/detail/generic/gather.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/generate.h" "$(@D)/cuda/include/thrust/system/detail/generic/generate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/generate.inl" "$(@D)/cuda/include/thrust/system/detail/generic/generate.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/generic/inner_product.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/inner_product.inl" "$(@D)/cuda/include/thrust/system/detail/generic/inner_product.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/logical.h" "$(@D)/cuda/include/thrust/system/detail/generic/logical.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/memory.h" "$(@D)/cuda/include/thrust/system/detail/generic/memory.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/memory.inl" "$(@D)/cuda/include/thrust/system/detail/generic/memory.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/merge.h" "$(@D)/cuda/include/thrust/system/detail/generic/merge.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/merge.inl" "$(@D)/cuda/include/thrust/system/detail/generic/merge.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/generic/mismatch.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/mismatch.inl" "$(@D)/cuda/include/thrust/system/detail/generic/mismatch.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/partition.h" "$(@D)/cuda/include/thrust/system/detail/generic/partition.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/partition.inl" "$(@D)/cuda/include/thrust/system/detail/generic/partition.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reduce.h" "$(@D)/cuda/include/thrust/system/detail/generic/reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reduce.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reduce.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/reduce_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reduce_by_key.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/remove.h" "$(@D)/cuda/include/thrust/system/detail/generic/remove.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/remove.inl" "$(@D)/cuda/include/thrust/system/detail/generic/remove.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/replace.h" "$(@D)/cuda/include/thrust/system/detail/generic/replace.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/replace.inl" "$(@D)/cuda/include/thrust/system/detail/generic/replace.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reverse.h" "$(@D)/cuda/include/thrust/system/detail/generic/reverse.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reverse.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reverse.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scalar/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/generic/scalar/binary_search.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scalar/binary_search.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scalar/binary_search.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scan.h" "$(@D)/cuda/include/thrust/system/detail/generic/scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scan.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scan.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/scan_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scan_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scan_by_key.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scatter.h" "$(@D)/cuda/include/thrust/system/detail/generic/scatter.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scatter.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scatter.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/select_system.h" "$(@D)/cuda/include/thrust/system/detail/generic/select_system.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/sequence.h" "$(@D)/cuda/include/thrust/system/detail/generic/sequence.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/sequence.inl" "$(@D)/cuda/include/thrust/system/detail/generic/sequence.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/generic/set_operations.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/set_operations.inl" "$(@D)/cuda/include/thrust/system/detail/generic/set_operations.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/sort.h" "$(@D)/cuda/include/thrust/system/detail/generic/sort.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/sort.inl" "$(@D)/cuda/include/thrust/system/detail/generic/sort.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/generic/swap_ranges.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/swap_ranges.inl" "$(@D)/cuda/include/thrust/system/detail/generic/swap_ranges.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/generic/tabulate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/tabulate.inl" "$(@D)/cuda/include/thrust/system/detail/generic/tabulate.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/tag.h" "$(@D)/cuda/include/thrust/system/detail/generic/tag.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/generic/temporary_buffer.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/temporary_buffer.inl" "$(@D)/cuda/include/thrust/system/detail/generic/temporary_buffer.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform_reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform_reduce.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform_reduce.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform_scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform_scan.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform_scan.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/type_traits.h" "$(@D)/cuda/include/thrust/system/detail/generic/type_traits.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/uninitialized_copy.inl" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_copy.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/uninitialized_fill.inl" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_fill.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/unique.h" "$(@D)/cuda/include/thrust/system/detail/generic/unique.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/unique.inl" "$(@D)/cuda/include/thrust/system/detail/generic/unique.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/unique_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/generic/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/unique_by_key.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/internal/decompose.h" "$(@D)/cuda/include/thrust/system/detail/internal/decompose.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/sequential/adjacent_difference.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/assign_value.h" "$(@D)/cuda/include/thrust/system/detail/sequential/assign_value.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/sequential/binary_search.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/copy.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/copy.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/copy_backward.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy_backward.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy_if.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/count.h" "$(@D)/cuda/include/thrust/system/detail/sequential/count.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/equal.h" "$(@D)/cuda/include/thrust/system/detail/sequential/equal.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/execution_policy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/execution_policy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/extrema.h" "$(@D)/cuda/include/thrust/system/detail/sequential/extrema.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/fill.h" "$(@D)/cuda/include/thrust/system/detail/sequential/fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/find.h" "$(@D)/cuda/include/thrust/system/detail/sequential/find.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/for_each.h" "$(@D)/cuda/include/thrust/system/detail/sequential/for_each.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/gather.h" "$(@D)/cuda/include/thrust/system/detail/sequential/gather.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/general_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/general_copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/generate.h" "$(@D)/cuda/include/thrust/system/detail/sequential/generate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/get_value.h" "$(@D)/cuda/include/thrust/system/detail/sequential/get_value.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/sequential/inner_product.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/insertion_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/insertion_sort.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/iter_swap.h" "$(@D)/cuda/include/thrust/system/detail/sequential/iter_swap.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/logical.h" "$(@D)/cuda/include/thrust/system/detail/sequential/logical.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/detail/sequential/malloc_and_free.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/merge.h" "$(@D)/cuda/include/thrust/system/detail/sequential/merge.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/merge.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/merge.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/sequential/mismatch.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/partition.h" "$(@D)/cuda/include/thrust/system/detail/sequential/partition.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/reduce.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reduce_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/remove.h" "$(@D)/cuda/include/thrust/system/detail/sequential/remove.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/replace.h" "$(@D)/cuda/include/thrust/system/detail/sequential/replace.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/reverse.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reverse.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/scan.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scan_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/scatter.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scatter.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/sequence.h" "$(@D)/cuda/include/thrust/system/detail/sequential/sequence.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/sequential/set_operations.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/sort.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/sort.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_merge_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_merge_sort.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_merge_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_merge_sort.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_primitive_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_primitive_sort.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_primitive_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_primitive_sort.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_radix_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_radix_sort.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_radix_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_radix_sort.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/sequential/swap_ranges.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/sequential/tabulate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/sequential/temporary_buffer.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/transform.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform_reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform_scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/trivial_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/trivial_copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/uninitialized_copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/sequential/uninitialized_fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/unique.h" "$(@D)/cuda/include/thrust/system/detail/sequential/unique.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/unique_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/detail/system_error.inl" "$(@D)/cuda/include/thrust/system/detail/system_error.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/error_code.h" "$(@D)/cuda/include/thrust/system/error_code.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/omp/detail/adjacent_difference.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/omp/detail/assign_value.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/omp/detail/binary_search.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/copy.h" "$(@D)/cuda/include/thrust/system/omp/detail/copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/copy.inl" "$(@D)/cuda/include/thrust/system/omp/detail/copy.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/omp/detail/copy_if.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/copy_if.inl" "$(@D)/cuda/include/thrust/system/omp/detail/copy_if.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/count.h" "$(@D)/cuda/include/thrust/system/omp/detail/count.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/default_decomposition.h" "$(@D)/cuda/include/thrust/system/omp/detail/default_decomposition.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/default_decomposition.inl" "$(@D)/cuda/include/thrust/system/omp/detail/default_decomposition.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/equal.h" "$(@D)/cuda/include/thrust/system/omp/detail/equal.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/omp/detail/execution_policy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/extrema.h" "$(@D)/cuda/include/thrust/system/omp/detail/extrema.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/fill.h" "$(@D)/cuda/include/thrust/system/omp/detail/fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/find.h" "$(@D)/cuda/include/thrust/system/omp/detail/find.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/for_each.h" "$(@D)/cuda/include/thrust/system/omp/detail/for_each.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/for_each.inl" "$(@D)/cuda/include/thrust/system/omp/detail/for_each.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/gather.h" "$(@D)/cuda/include/thrust/system/omp/detail/gather.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/generate.h" "$(@D)/cuda/include/thrust/system/omp/detail/generate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/get_value.h" "$(@D)/cuda/include/thrust/system/omp/detail/get_value.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/omp/detail/inner_product.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/omp/detail/iter_swap.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/logical.h" "$(@D)/cuda/include/thrust/system/omp/detail/logical.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/omp/detail/malloc_and_free.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/memory.inl" "$(@D)/cuda/include/thrust/system/omp/detail/memory.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/merge.h" "$(@D)/cuda/include/thrust/system/omp/detail/merge.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/omp/detail/mismatch.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/par.h" "$(@D)/cuda/include/thrust/system/omp/detail/par.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/partition.h" "$(@D)/cuda/include/thrust/system/omp/detail/partition.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/partition.inl" "$(@D)/cuda/include/thrust/system/omp/detail/partition.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_by_key.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce_intervals.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_intervals.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce_intervals.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_intervals.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/remove.h" "$(@D)/cuda/include/thrust/system/omp/detail/remove.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/remove.inl" "$(@D)/cuda/include/thrust/system/omp/detail/remove.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/replace.h" "$(@D)/cuda/include/thrust/system/omp/detail/replace.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reverse.h" "$(@D)/cuda/include/thrust/system/omp/detail/reverse.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/scan.h" "$(@D)/cuda/include/thrust/system/omp/detail/scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/scan_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/scatter.h" "$(@D)/cuda/include/thrust/system/omp/detail/scatter.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/sequence.h" "$(@D)/cuda/include/thrust/system/omp/detail/sequence.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/omp/detail/set_operations.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/sort.h" "$(@D)/cuda/include/thrust/system/omp/detail/sort.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/sort.inl" "$(@D)/cuda/include/thrust/system/omp/detail/sort.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/omp/detail/swap_ranges.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/omp/detail/tabulate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/omp/detail/temporary_buffer.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/transform.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform_reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform_scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/omp/detail/uninitialized_copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/omp/detail/uninitialized_fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/unique.h" "$(@D)/cuda/include/thrust/system/omp/detail/unique.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/unique.inl" "$(@D)/cuda/include/thrust/system/omp/detail/unique.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/unique_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/omp/detail/unique_by_key.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/detail/vector.inl" "$(@D)/cuda/include/thrust/system/omp/detail/vector.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/execution_policy.h" "$(@D)/cuda/include/thrust/system/omp/execution_policy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/memory.h" "$(@D)/cuda/include/thrust/system/omp/memory.h" && cp "/usr/local/cuda-10.0/include/thrust/system/omp/vector.h" "$(@D)/cuda/include/thrust/system/omp/vector.h" && cp "/usr/local/cuda-10.0/include/thrust/system/system_error.h" "$(@D)/cuda/include/thrust/system/system_error.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/tbb/detail/adjacent_difference.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/tbb/detail/assign_value.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/tbb/detail/binary_search.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/copy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/copy.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/copy.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/tbb/detail/copy_if.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/copy_if.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/copy_if.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/count.h" "$(@D)/cuda/include/thrust/system/tbb/detail/count.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/equal.h" "$(@D)/cuda/include/thrust/system/tbb/detail/equal.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/execution_policy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/extrema.h" "$(@D)/cuda/include/thrust/system/tbb/detail/extrema.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/fill.h" "$(@D)/cuda/include/thrust/system/tbb/detail/fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/find.h" "$(@D)/cuda/include/thrust/system/tbb/detail/find.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/for_each.h" "$(@D)/cuda/include/thrust/system/tbb/detail/for_each.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/for_each.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/for_each.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/gather.h" "$(@D)/cuda/include/thrust/system/tbb/detail/gather.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/generate.h" "$(@D)/cuda/include/thrust/system/tbb/detail/generate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/get_value.h" "$(@D)/cuda/include/thrust/system/tbb/detail/get_value.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/tbb/detail/inner_product.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/tbb/detail/iter_swap.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/logical.h" "$(@D)/cuda/include/thrust/system/tbb/detail/logical.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/tbb/detail/malloc_and_free.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/memory.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/memory.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/merge.h" "$(@D)/cuda/include/thrust/system/tbb/detail/merge.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/merge.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/merge.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/tbb/detail/mismatch.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/par.h" "$(@D)/cuda/include/thrust/system/tbb/detail/par.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/partition.h" "$(@D)/cuda/include/thrust/system/tbb/detail/partition.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/partition.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/partition.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reduce.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reduce.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_by_key.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reduce_intervals.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_intervals.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/remove.h" "$(@D)/cuda/include/thrust/system/tbb/detail/remove.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/remove.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/remove.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/replace.h" "$(@D)/cuda/include/thrust/system/tbb/detail/replace.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reverse.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reverse.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/scan.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/scan.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/scan.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scan_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/scatter.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scatter.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/sequence.h" "$(@D)/cuda/include/thrust/system/tbb/detail/sequence.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/tbb/detail/set_operations.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/sort.h" "$(@D)/cuda/include/thrust/system/tbb/detail/sort.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/sort.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/sort.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/tbb/detail/swap_ranges.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/tbb/detail/tabulate.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/tbb/detail/temporary_buffer.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/transform.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform_reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform_scan.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/uninitialized_copy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/tbb/detail/uninitialized_fill.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/unique.h" "$(@D)/cuda/include/thrust/system/tbb/detail/unique.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/unique.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/unique.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/unique_by_key.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/unique_by_key.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/vector.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/vector.inl" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/execution_policy.h" "$(@D)/cuda/include/thrust/system/tbb/execution_policy.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/memory.h" "$(@D)/cuda/include/thrust/system/tbb/memory.h" && cp "/usr/local/cuda-10.0/include/thrust/system/tbb/vector.h" "$(@D)/cuda/include/thrust/system/tbb/vector.h" && cp "/usr/local/cuda-10.0/include/thrust/system_error.h" "$(@D)/cuda/include/thrust/system_error.h" && cp "/usr/local/cuda-10.0/include/thrust/tabulate.h" "$(@D)/cuda/include/thrust/tabulate.h" && cp "/usr/local/cuda-10.0/include/thrust/transform.h" "$(@D)/cuda/include/thrust/transform.h" && cp "/usr/local/cuda-10.0/include/thrust/transform_reduce.h" "$(@D)/cuda/include/thrust/transform_reduce.h" && cp "/usr/local/cuda-10.0/include/thrust/transform_scan.h" "$(@D)/cuda/include/thrust/transform_scan.h" && cp "/usr/local/cuda-10.0/include/thrust/tuple.h" "$(@D)/cuda/include/thrust/tuple.h" && cp "/usr/local/cuda-10.0/include/thrust/uninitialized_copy.h" "$(@D)/cuda/include/thrust/uninitialized_copy.h" && cp "/usr/local/cuda-10.0/include/thrust/uninitialized_fill.h" "$(@D)/cuda/include/thrust/uninitialized_fill.h" && cp "/usr/local/cuda-10.0/include/thrust/unique.h" "$(@D)/cuda/include/thrust/unique.h" && cp "/usr/local/cuda-10.0/include/thrust/version.h" "$(@D)/cuda/include/thrust/version.h" && cp "/usr/local/cuda-10.0/include/vector_functions.h" "$(@D)/cuda/include/vector_functions.h" && cp "/usr/local/cuda-10.0/include/vector_functions.hpp" "$(@D)/cuda/include/vector_functions.hpp" && cp "/usr/local/cuda-10.0/include/vector_types.h" "$(@D)/cuda/include/vector_types.h"
    """,
 )
 
@@ -1203,7 +1203,7 @@ genrule(
         "cuda/nvvm/libdevice/libdevice.10.bc",
     ],
     cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-9.0/nvvm/libdevice/libdevice.10.bc" "$(@D)//libdevice.10.bc"
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-10.0/nvvm/libdevice/libdevice.10.bc" "$(@D)//libdevice.10.bc"
    """,
 )
 
@@ -1240,7 +1240,7 @@ genrule(
         "cuda/extras/CUPTI/include/openacc/cupti_openacc.h",
     ],
     cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/gl.h" "$(@D)/cuda/extras/CUPTI/include/GL/gl.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glew.h" "$(@D)/cuda/extras/CUPTI/include/GL/glew.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glext.h" "$(@D)/cuda/extras/CUPTI/include/GL/glext.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glu.h" "$(@D)/cuda/extras/CUPTI/include/GL/glu.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glut.h" "$(@D)/cuda/extras/CUPTI/include/GL/glut.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glx.h" "$(@D)/cuda/extras/CUPTI/include/GL/glx.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glxext.h" "$(@D)/cuda/extras/CUPTI/include/GL/glxext.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/wglew.h" "$(@D)/cuda/extras/CUPTI/include/GL/wglew.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/wglext.h" "$(@D)/cuda/extras/CUPTI/include/GL/wglext.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cuda_stdint.h" "$(@D)/cuda/extras/CUPTI/include/cuda_stdint.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti.h" "$(@D)/cuda/extras/CUPTI/include/cupti.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_activity.h" "$(@D)/cuda/extras/CUPTI/include/cupti_activity.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_callbacks.h" "$(@D)/cuda/extras/CUPTI/include/cupti_callbacks.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_driver_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_driver_cbid.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_events.h" "$(@D)/cuda/extras/CUPTI/include/cupti_events.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_metrics.h" "$(@D)/cuda/extras/CUPTI/include/cupti_metrics.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_nvtx_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_nvtx_cbid.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_result.h" "$(@D)/cuda/extras/CUPTI/include/cupti_result.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_runtime_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_runtime_cbid.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_version.h" "$(@D)/cuda/extras/CUPTI/include/cupti_version.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cudaGL_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cudaGL_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cudaVDPAU_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cudaVDPAU_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_gl_interop_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_gl_interop_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_runtime_api_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_runtime_api_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_nvtx_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_nvtx_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/openacc/cupti_openacc.h" "$(@D)/cuda/extras/CUPTI/include/openacc/cupti_openacc.h"
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-10.0/extras/CUPTI/include/GL/gl.h" "$(@D)/cuda/extras/CUPTI/include/GL/gl.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glew.h" "$(@D)/cuda/extras/CUPTI/include/GL/glew.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glext.h" "$(@D)/cuda/extras/CUPTI/include/GL/glext.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glu.h" "$(@D)/cuda/extras/CUPTI/include/GL/glu.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glut.h" "$(@D)/cuda/extras/CUPTI/include/GL/glut.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glx.h" "$(@D)/cuda/extras/CUPTI/include/GL/glx.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glxext.h" "$(@D)/cuda/extras/CUPTI/include/GL/glxext.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/GL/wglew.h" "$(@D)/cuda/extras/CUPTI/include/GL/wglew.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/GL/wglext.h" "$(@D)/cuda/extras/CUPTI/include/GL/wglext.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/cuda_stdint.h" "$(@D)/cuda/extras/CUPTI/include/cuda_stdint.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/cupti.h" "$(@D)/cuda/extras/CUPTI/include/cupti.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_activity.h" "$(@D)/cuda/extras/CUPTI/include/cupti_activity.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_callbacks.h" "$(@D)/cuda/extras/CUPTI/include/cupti_callbacks.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_driver_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_driver_cbid.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_events.h" "$(@D)/cuda/extras/CUPTI/include/cupti_events.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_metrics.h" "$(@D)/cuda/extras/CUPTI/include/cupti_metrics.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_nvtx_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_nvtx_cbid.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_result.h" "$(@D)/cuda/extras/CUPTI/include/cupti_result.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_runtime_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_runtime_cbid.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_version.h" "$(@D)/cuda/extras/CUPTI/include/cupti_version.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cudaGL_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cudaGL_meta.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cudaVDPAU_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cudaVDPAU_meta.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cuda_gl_interop_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_gl_interop_meta.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cuda_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_meta.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cuda_runtime_api_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_runtime_api_meta.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/generated_nvtx_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_nvtx_meta.h" && cp "/usr/local/cuda-10.0/extras/CUPTI/include/openacc/cupti_openacc.h" "$(@D)/cuda/extras/CUPTI/include/openacc/cupti_openacc.h"
    """,
 )
 
@@ -1248,17 +1248,17 @@ genrule(
     name = "cuda-lib",
     outs = [
         "cuda/lib/libcuda.so",
-        "cuda/lib/libcudart.so.9.0",
+        "cuda/lib/libcudart.so.10.0",
         "cuda/lib/libcudart_static.a",
-        "cuda/lib/libcublas.so.9.0",
-        "cuda/lib/libcusolver.so.9.0",
-        "cuda/lib/libcurand.so.9.0",
-        "cuda/lib/libcufft.so.9.0",
+        "cuda/lib/libcublas.so.10.0",
+        "cuda/lib/libcusolver.so.10.0",
+        "cuda/lib/libcurand.so.10.0",
+        "cuda/lib/libcufft.so.10.0",
         "cuda/lib/libcudnn.so.7",
-        "cuda/lib/libcupti.so.9.0",
+        "cuda/lib/libcupti.so.10.0",
     ],
     cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs/libcuda.so" "$(@D)/cuda/lib/libcuda.so" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcudart.so.9.0.176" "$(@D)/cuda/lib/libcudart.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcudart_static.a" "$(@D)/cuda/lib/libcudart_static.a" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcublas.so.9.0.480" "$(@D)/cuda/lib/libcublas.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcusolver.so.9.0.176" "$(@D)/cuda/lib/libcusolver.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcurand.so.9.0.176" "$(@D)/cuda/lib/libcurand.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcufft.so.9.0.176" "$(@D)/cuda/lib/libcufft.so.9.0" && cp "/usr/lib/x86_64-linux-gnu/libcudnn.so.7.2.1" "$(@D)/cuda/lib/libcudnn.so.7" && cp "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.9.0.176" "$(@D)/cuda/lib/libcupti.so.9.0"
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs/libcuda.so" "$(@D)/cuda/lib/libcuda.so" && cp "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcudart.so.10.0.176" "$(@D)/cuda/lib/libcudart.so.10.0" && cp "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcudart_static.a" "$(@D)/cuda/lib/libcudart_static.a" && cp "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcublas.so.10.0.480" "$(@D)/cuda/lib/libcublas.so.10.0" && cp "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcusolver.so.10.0.176" "$(@D)/cuda/lib/libcusolver.so.10.0" && cp "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcurand.so.10.0.176" "$(@D)/cuda/lib/libcurand.so.10.0" && cp "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcufft.so.10.0.176" "$(@D)/cuda/lib/libcufft.so.10.0" && cp "/usr/lib/x86_64-linux-gnu/libcudnn.so.7.2.1" "$(@D)/cuda/lib/libcudnn.so.7" && cp "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.10.0.176" "$(@D)/cuda/lib/libcupti.so.10.0"
    """,
 )
 
diff --git a/third_party/toolchains/gpus/cuda/cuda/cuda_config.h b/third_party/toolchains/gpus/cuda/cuda/cuda_config.h
index 7cdaf144ada77c93119f7412df93e8f3423872ee..b05bfb732651360581d2ef9d353f16b6f9e2d9a6 100644
--- a/third_party/toolchains/gpus/cuda/cuda/cuda_config.h
+++ b/third_party/toolchains/gpus/cuda/cuda/cuda_config.h
@@ -19,9 +19,9 @@ limitations under the License.
 
 #define TF_CUDA_CAPABILITIES CudaVersion("3.0")
 
-#define TF_CUDA_VERSION "9.0"
+#define TF_CUDA_VERSION "10.0"
 #define TF_CUDNN_VERSION "7"
 
-#define TF_CUDA_TOOLKIT_PATH "/usr/local/cuda-9.0"
+#define TF_CUDA_TOOLKIT_PATH "/usr/local/cuda-10.0"
 
 #endif  // CUDA_CUDA_CONFIG_H_
diff --git a/third_party/toolchains/preconfig/generate/BUILD b/third_party/toolchains/preconfig/generate/BUILD
index 7e3e93d6004894029135f3151a282bcc43b8938f..7bc5f2bb6057d40038445f99ae519a31b477b742 100644
--- a/third_party/toolchains/preconfig/generate/BUILD
+++ b/third_party/toolchains/preconfig/generate/BUILD
@@ -3,33 +3,52 @@ licenses(["restricted"])
 load(":generate.bzl", "tensorflow_rbe_config")
 
 tensorflow_rbe_config(
-    name = "ubuntu14.04-py3-gcc-cuda9.0-cudnn7-nccl2",
+    name = "ubuntu16.04-py3-clang",
+    compiler = "clang",
+    python_version = "3",
+)
+
+tensorflow_rbe_config(
+    name = "ubuntu14.04-py3-gcc-cuda9.0-cudnn7-tensorrt5",
     compiler = "gcc",
     cuda_version = "9.0",
     cudnn_version = "7",
     python_version = "3",
+    tensorrt_version = "5",
 )
 
 tensorflow_rbe_config(
-    name = "ubuntu14.04-py3-clang-cuda9.0-cudnn7-nccl2",
+    name = "ubuntu14.04-py3-clang-cuda9.0-cudnn7-tensorrt5",
     compiler = "clang",
     cuda_version = "9.0",
     cudnn_version = "7",
     python_version = "3",
+    tensorrt_version = "5",
+)
+
+tensorflow_rbe_config(
+    name = "ubuntu14.04-py3-gcc7-cuda10.0-cudnn7-tensorrt5",
+    compiler = "gcc-7",
+    cuda_version = "10.0",
+    cudnn_version = "7",
+    python_version = "3",
+    tensorrt_version = "5",
 )
 
 tensorflow_rbe_config(
-    name = "ubuntu14.04-py3-gcc-cuda10.0-cudnn7-nccl2",
+    name = "ubuntu14.04-py3-gcc-cuda10.0-cudnn7-tensorrt5",
     compiler = "gcc",
     cuda_version = "10.0",
     cudnn_version = "7",
     python_version = "3",
+    tensorrt_version = "5",
 )
 
 tensorflow_rbe_config(
-    name = "ubuntu14.04-py3-clang-cuda10.0-cudnn7-nccl2",
+    name = "ubuntu14.04-py3-clang-cuda10.0-cudnn7-tensorrt5",
     compiler = "clang",
     cuda_version = "10.0",
     cudnn_version = "7",
     python_version = "3",
+    tensorrt_version = "5",
 )
diff --git a/third_party/toolchains/preconfig/generate/archives.bzl b/third_party/toolchains/preconfig/generate/archives.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..a26d2c623c9b05a74787db2b9f5e218e2102ce8c
--- /dev/null
+++ b/third_party/toolchains/preconfig/generate/archives.bzl
@@ -0,0 +1,12 @@
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+
+def bazel_toolchains_archive():
+    http_archive(
+        name = "bazel_toolchains",
+        sha256 = "109a99384f9d08f9e75136d218ebaebc68cc810c56897aea2224c57932052d30",
+        strip_prefix = "bazel-toolchains-94d31935a2c94fe7e7c7379a0f3393e181928ff7",
+        urls = [
+            "https://mirror.bazel.build/github.com/bazelbuild/bazel-toolchains/archive/94d31935a2c94fe7e7c7379a0f3393e181928ff7.tar.gz",
+            "https://github.com/bazelbuild/bazel-toolchains/archive/94d31935a2c94fe7e7c7379a0f3393e181928ff7.tar.gz",
+        ],
+    )
diff --git a/third_party/toolchains/preconfig/generate/containers.bzl b/third_party/toolchains/preconfig/generate/containers.bzl
index 7099b9bf3e4715706cbe725373add4cc98d304b8..a86261328eb2c6a90236fb429d71fe3dcb9fddf9 100644
--- a/third_party/toolchains/preconfig/generate/containers.bzl
+++ b/third_party/toolchains/preconfig/generate/containers.bzl
@@ -1,4 +1,5 @@
 container_digests = {
-    "cuda9.0-cudnn7-ubuntu14.04": "sha256:c26138f4c38c754da2bad44a8a068523abf7fbd71d58a57ce92e5342c5431bf5",
-    "cuda10.0-cudnn7-ubuntu14.04": "sha256:66e7d592c8149291d5562a0f3093655a15b09c22e0eb30a87b3b6469b7a30ffc",
+    "ubuntu16.04": "sha256:d0d98c53111c3ec071aa81632a2b0d6f210e5c2411c5172e31f99002125ec4de",
+    "cuda9.0-cudnn7-ubuntu14.04": "sha256:006a76ee1838122ff7f21ebac85f24c1ef350d4dd79b3ceff0e4fe649ed90d33",
+    "cuda10.0-cudnn7-ubuntu14.04": "sha256:d433e1221f802dac393bc8652fabcc63aa46896cd920bb888ae0e2002fe6b756",
 }
diff --git a/third_party/toolchains/preconfig/generate/generate.bzl b/third_party/toolchains/preconfig/generate/generate.bzl
index 2fb3a94cdca7430b522939266a4b2b398a65df8d..7458c4dc46223332d643c37aeebbb418bd49b195 100644
--- a/third_party/toolchains/preconfig/generate/generate.bzl
+++ b/third_party/toolchains/preconfig/generate/generate.bzl
@@ -3,42 +3,59 @@ load(
     "docker_toolchain_autoconfig",
 )
 
-def _tensorflow_rbe_config(name, cuda_version, cudnn_version, python_version, compiler):
-    docker_toolchain_autoconfig(
-        name = name,
-        base = "@cuda%s-cudnn%s-ubuntu14.04//image" % (cuda_version, cudnn_version),
-        bazel_version = "0.16.1",
+def _tensorflow_rbe_config(name, compiler, python_version, cuda_version = None, cudnn_version = None, tensorrt_version = None):
+    base = "@ubuntu16.04//image"
+    config_repos = [
+        "local_config_python",
+        "local_config_cc",
+    ]
+    env = {
+        "ABI_VERSION": "gcc",
+        "ABI_LIBC_VERSION": "glibc_2.19",
+        "BAZEL_COMPILER": compiler,
+        "BAZEL_HOST_SYSTEM": "i686-unknown-linux-gnu",
+        "BAZEL_TARGET_LIBC": "glibc_2.19",
+        "BAZEL_TARGET_CPU": "k8",
+        "BAZEL_TARGET_SYSTEM": "x86_64-unknown-linux-gnu",
+        "CC_TOOLCHAIN_NAME": "linux_gnu_x86",
+        "CC": compiler,
+        "PYTHON_BIN_PATH": "/usr/bin/python%s" % python_version,
+        "CLEAR_CACHE": "1",
+        "HOST_CXX_COMPILER": compiler,
+        "HOST_C_COMPILER": compiler,
+    }
+
+    if cuda_version != None:
+        base = "@cuda%s-cudnn%s-ubuntu14.04//image" % (cuda_version, cudnn_version)
+
+        # The cuda toolchain currently contains its own C++ toolchain definition,
+        # so we do not fetch local_config_cc.
         config_repos = [
-            "local_config_cuda",
             "local_config_python",
-            "local_config_nccl",
-        ],
-        env = {
-            "ABI_VERSION": "gcc",
-            "ABI_LIBC_VERSION": "glibc_2.19",
-            "BAZEL_COMPILER": compiler,
-            "BAZEL_HOST_SYSTEM": "i686-unknown-linux-gnu",
-            "BAZEL_TARGET_LIBC": "glibc_2.19",
-            "BAZEL_TARGET_CPU": "k8",
-            "BAZEL_TARGET_SYSTEM": "x86_64-unknown-linux-gnu",
-            "CC_TOOLCHAIN_NAME": "linux_gnu_x86",
-            "CC": compiler,
-            "PYTHON_BIN_PATH": "/usr/bin/python%s" % python_version,
+            "local_config_cuda",
+            "local_config_tensorrt",
+        ]
+        env.update({
             "TF_NEED_CUDA": "1",
             "TF_CUDA_CLANG": "1" if compiler == "clang" else "0",
-            "CLEAR_CACHE": "1",
-            "TF_CUDA_COMPUTE_CAPABILITIES": "3.0",
+            "TF_CUDA_COMPUTE_CAPABILITIES": "3.0,6.0",
             "TF_ENABLE_XLA": "1",
             "TF_CUDNN_VERSION": cudnn_version,
             "TF_CUDA_VERSION": cuda_version,
-            "NCCL_INSTALL_PATH": "/usr/lib",
-            "NCCL_HDR_PATH": "/usr/include",
-            "TF_NCCL_VERSION": "2",
             "CUDNN_INSTALL_PATH": "/usr/lib/x86_64-linux-gnu",
-        },
-        # TODO(klimek): We should use the sources that we currently work on, not
-        # just the latest snapshot of tensorflow that is checked in.
-        git_repo = "https://github.com/tensorflow/tensorflow",
+            "TF_NEED_TENSORRT": "1",
+            "TF_TENSORRT_VERSION": tensorrt_version,
+            "TENSORRT_INSTALL_PATH": "/usr/lib/x86_64-linux-gnu",
+            "GCC_HOST_COMPILER_PATH": compiler if compiler != "clang" else "",
+        })
+
+    docker_toolchain_autoconfig(
+        name = name,
+        base = base,
+        bazel_version = "0.21.0",
+        config_repos = config_repos,
+        env = env,
+        mount_project = "$(mount_project)",
         tags = ["manual"],
         incompatible_changes_off = True,
     )
diff --git a/third_party/toolchains/preconfig/generate/generate.sh b/third_party/toolchains/preconfig/generate/generate.sh
index 37c5211278abf243ab388d83688e6c8c7888cea3..754ec2057e04ee527f06bacfcf5b44e4ab823344 100755
--- a/third_party/toolchains/preconfig/generate/generate.sh
+++ b/third_party/toolchains/preconfig/generate/generate.sh
@@ -33,10 +33,20 @@ PY_VERSION="${PLATFORM[1]}"
 COMPILER="${PLATFORM[2]}"
 CUDA_VERSION="${PLATFORM[3]}"
 CUDNN_VERSION="${PLATFORM[4]}"
-NCCL_VERSION="${PLATFORM[5]}"
-
-if [[ "${COMPILER}" == "gcc" ]]; then
-  COMPILER="gcc-nvcc-${CUDA_VERSION}"
+TENSORRT_VERSION="${PLATFORM[5]}"
+
+# TODO(klimek): Put this into the name.
+
+if [[ -n "${CUDA_VERSION}" ]]; then
+  if [[ "${COMPILER}" == gcc* ]]; then
+    COMPILER="${COMPILER}-nvcc-${CUDA_VERSION}"
+  fi
+  # Currently we create a special toolchain for clang when compiling with
+  # cuda enabled. We can get rid of this once the default toolchain bazel
+  # provides supports cuda.
+  if [[ "${COMPILER}" == "clang" ]]; then
+    COMPILER="cuda-clang"
+  fi
 fi
 
 echo "OS: ${OS}"
@@ -44,12 +54,19 @@ echo "Python: ${PY_VERSION}"
 echo "Compiler: ${COMPILER}"
 echo "CUDA: ${CUDA_VERSION}"
 echo "CUDNN: ${CUDNN_VERSION}"
-echo "NCCL: ${NCCL_VERSION}"
+echo "TensorRT: ${TENSORRT_VERSION}"
 
-bazel build "${PKG}/generate:${TARGET}"
+bazel build --define=mount_project="${PWD}" "${PKG}/generate:${TARGET}"
 cd "${TEMPDIR}"
 tar xvf "${ROOT}/bazel-bin/${PKG}/generate/${TARGET}_outputs.tar"
 
+# TODO(klimek): The skylark config rules should copy the files instead of
+# creating aliases.
+# Other than in @local_config_tensorrt, the header files in the remote config
+# repo are not relative to the repository root. Add a dummy include_prefix to
+# make them available as virtual includes.
+buildozer 'set include_prefix ""' //local_config_tensorrt:%cc_library
+
 # Delete all empty files: configurations leave empty files around when they are
 # unnecessary.
 find . -empty -delete
@@ -58,8 +75,8 @@ find . -empty -delete
 # <OS>/
 #   <CUDA>-<CUDNN>/
 #   <COMPILER>/
-#   <NCCL>/
 #   <PYTHON>/
+#   <TENSORRT>/
 
 # Create our toplevel output directory for the OS.
 mkdir "${OS}"
@@ -67,16 +84,22 @@ mkdir "${OS}"
 # Python:
 mv local_config_python "${OS}/${PY_VERSION}"
 
-# NCCL:
-mv local_config_nccl "${OS}/${NCCL_VERSION}"
+if [[ -n "${CUDA_VERSION}" ]]; then
+  # Compiler:
+  mv local_config_cuda/crosstool "${OS}/${COMPILER}"
 
-# Compiler:
-mv local_config_cuda/crosstool "${OS}/${COMPILER}"
+  # CUDA:
+  mv local_config_cuda "${OS}/${CUDA_VERSION}-${CUDNN_VERSION}"
 
-# CUDA:
-mv local_config_cuda "${OS}/${CUDA_VERSION}-${CUDNN_VERSION}"
+  # TensorRT:
+  mv local_config_tensorrt "${OS}/${TENSORRT_VERSION}"
+else
+  # Compiler:
+  mv local_config_cc "${OS}/${COMPILER}"
+fi
 
 # Cleanup for copybara.
+find "${OS}" -name '*.h' |xargs clang-format -i
 find "${OS}" -name 'BUILD' -o -name '*.bzl' |xargs buildifier
 find "${OS}" -name 'BUILD' -o -name '*.bzl' |xargs -I {} mv {} {}.oss
 
diff --git a/third_party/toolchains/preconfig/generate/workspace.bzl b/third_party/toolchains/preconfig/generate/workspace.bzl
index f30c2f1ae6318c645e174617a74b8fdadac1598e..bce2d5bdbf44ceacb8bf9ad7bee7c6110c1edba7 100644
--- a/third_party/toolchains/preconfig/generate/workspace.bzl
+++ b/third_party/toolchains/preconfig/generate/workspace.bzl
@@ -1,24 +1,34 @@
+load(
+    "@io_bazel_rules_docker//repositories:repositories.bzl",
+    container_repositories = "repositories",
+)
 load(
     "@io_bazel_rules_docker//container:container.bzl",
     "container_pull",
-    container_repositories = "repositories",
 )
 load(":containers.bzl", "container_digests")
 
 def _remote_config_workspace():
     container_repositories()
 
+    container_pull(
+        name = "ubuntu16.04",
+        registry = "gcr.io",
+        repository = "tensorflow-testing/nosla-ubuntu16.04",
+        digest = container_digests["ubuntu16.04"],
+    )
+
     container_pull(
         name = "cuda9.0-cudnn7-ubuntu14.04",
         registry = "gcr.io",
-        repository = "asci-toolchain/nosla-cuda9.0-cudnn7-ubuntu14.04",
+        repository = "tensorflow-testing/nosla-cuda9.0-cudnn7-ubuntu14.04",
         digest = container_digests["cuda9.0-cudnn7-ubuntu14.04"],
     )
 
     container_pull(
         name = "cuda10.0-cudnn7-ubuntu14.04",
         registry = "gcr.io",
-        repository = "asci-toolchain/nosla-cuda10.0-cudnn7-ubuntu14.04",
+        repository = "tensorflow-testing/nosla-cuda10.0-cudnn7-ubuntu14.04",
         digest = container_digests["cuda10.0-cudnn7-ubuntu14.04"],
     )
 
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/BUILD b/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/BUILD
index c813efccf9b82578984b33d04fd513030c83e0b1..a75170ed7239c48b9a9901c74f30825f3babafad 100755
--- a/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/BUILD
+++ b/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/BUILD
@@ -1192,9 +1192,7 @@ genrule(
         "cuda/include/vector_functions.hpp",
         "cuda/include/vector_types.h",
     ],
-    cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/local/cuda-10.0/include/CL/cl.h" "$(@D)/cuda/include/CL/cl.h" && cp -f "/usr/local/cuda-10.0/include/CL/cl.hpp" "$(@D)/cuda/include/CL/cl.hpp" && cp -f "/usr/local/cuda-10.0/include/CL/cl_egl.h" "$(@D)/cuda/include/CL/cl_egl.h" && cp -f "/usr/local/cuda-10.0/include/CL/cl_ext.h" "$(@D)/cuda/include/CL/cl_ext.h" && cp -f "/usr/local/cuda-10.0/include/CL/cl_gl.h" "$(@D)/cuda/include/CL/cl_gl.h" && cp -f "/usr/local/cuda-10.0/include/CL/cl_gl_ext.h" "$(@D)/cuda/include/CL/cl_gl_ext.h" && cp -f "/usr/local/cuda-10.0/include/CL/cl_platform.h" "$(@D)/cuda/include/CL/cl_platform.h" && cp -f "/usr/local/cuda-10.0/include/CL/opencl.h" "$(@D)/cuda/include/CL/opencl.h" && cp -f "/usr/local/cuda-10.0/include/builtin_types.h" "$(@D)/cuda/include/builtin_types.h" && cp -f "/usr/local/cuda-10.0/include/channel_descriptor.h" "$(@D)/cuda/include/channel_descriptor.h" && cp -f "/usr/local/cuda-10.0/include/common_functions.h" "$(@D)/cuda/include/common_functions.h" && cp -f "/usr/local/cuda-10.0/include/cooperative_groups.h" "$(@D)/cuda/include/cooperative_groups.h" && cp -f "/usr/local/cuda-10.0/include/cooperative_groups_helpers.h" "$(@D)/cuda/include/cooperative_groups_helpers.h" && cp -f "/usr/local/cuda-10.0/include/crt/common_functions.h" "$(@D)/cuda/include/crt/common_functions.h" && cp -f "/usr/local/cuda-10.0/include/crt/device_double_functions.h" "$(@D)/cuda/include/crt/device_double_functions.h" && cp -f "/usr/local/cuda-10.0/include/crt/device_double_functions.hpp" "$(@D)/cuda/include/crt/device_double_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/crt/device_functions.h" "$(@D)/cuda/include/crt/device_functions.h" && cp -f "/usr/local/cuda-10.0/include/crt/device_functions.hpp" "$(@D)/cuda/include/crt/device_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/crt/func_macro.h" "$(@D)/cuda/include/crt/func_macro.h" && cp -f "/usr/local/cuda-10.0/include/crt/host_config.h" "$(@D)/cuda/include/crt/host_config.h" && cp -f "/usr/local/cuda-10.0/include/crt/host_defines.h" "$(@D)/cuda/include/crt/host_defines.h" && cp -f "/usr/local/cuda-10.0/include/crt/host_runtime.h" "$(@D)/cuda/include/crt/host_runtime.h" && cp -f "/usr/local/cuda-10.0/include/crt/math_functions.h" "$(@D)/cuda/include/crt/math_functions.h" && cp -f "/usr/local/cuda-10.0/include/crt/math_functions.hpp" "$(@D)/cuda/include/crt/math_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/crt/mma.h" "$(@D)/cuda/include/crt/mma.h" && cp -f "/usr/local/cuda-10.0/include/crt/mma.hpp" "$(@D)/cuda/include/crt/mma.hpp" && cp -f "/usr/local/cuda-10.0/include/crt/nvfunctional" "$(@D)/cuda/include/crt/nvfunctional" && cp -f "/usr/local/cuda-10.0/include/crt/sm_70_rt.h" "$(@D)/cuda/include/crt/sm_70_rt.h" && cp -f "/usr/local/cuda-10.0/include/crt/sm_70_rt.hpp" "$(@D)/cuda/include/crt/sm_70_rt.hpp" && cp -f "/usr/local/cuda-10.0/include/crt/storage_class.h" "$(@D)/cuda/include/crt/storage_class.h" && cp -f "/usr/local/cuda-10.0/include/cuComplex.h" "$(@D)/cuda/include/cuComplex.h" && cp -f "/usr/local/cuda-10.0/include/cublas.h" "$(@D)/cuda/include/cublas.h" && cp -f "/usr/local/cuda-10.0/include/cublasXt.h" "$(@D)/cuda/include/cublasXt.h" && cp -f "/usr/local/cuda-10.0/include/cublas_api.h" "$(@D)/cuda/include/cublas_api.h" && cp -f "/usr/local/cuda-10.0/include/cublas_v2.h" "$(@D)/cuda/include/cublas_v2.h" && cp -f "/usr/local/cuda-10.0/include/cuda.h" "$(@D)/cuda/include/cuda.h" && cp -f "/usr/local/cuda-10.0/include/cudaEGL.h" "$(@D)/cuda/include/cudaEGL.h" && cp -f "/usr/local/cuda-10.0/include/cudaGL.h" "$(@D)/cuda/include/cudaGL.h" && cp -f "/usr/local/cuda-10.0/include/cudaProfiler.h" "$(@D)/cuda/include/cudaProfiler.h" && cp -f "/usr/local/cuda-10.0/include/cudaVDPAU.h" "$(@D)/cuda/include/cudaVDPAU.h" && cp -f "/usr/local/cuda-10.0/include/cuda_device_runtime_api.h" "$(@D)/cuda/include/cuda_device_runtime_api.h" && cp -f "/usr/local/cuda-10.0/include/cuda_egl_interop.h" "$(@D)/cuda/include/cuda_egl_interop.h" && cp -f "/usr/local/cuda-10.0/include/cuda_fp16.h" "$(@D)/cuda/include/cuda_fp16.h" && cp -f "/usr/local/cuda-10.0/include/cuda_fp16.hpp" "$(@D)/cuda/include/cuda_fp16.hpp" && cp -f "/usr/local/cuda-10.0/include/cuda_gl_interop.h" "$(@D)/cuda/include/cuda_gl_interop.h" && cp -f "/usr/local/cuda-10.0/include/cuda_occupancy.h" "$(@D)/cuda/include/cuda_occupancy.h" && cp -f "/usr/local/cuda-10.0/include/cuda_profiler_api.h" "$(@D)/cuda/include/cuda_profiler_api.h" && cp -f "/usr/local/cuda-10.0/include/cuda_runtime.h" "$(@D)/cuda/include/cuda_runtime.h" && cp -f "/usr/local/cuda-10.0/include/cuda_runtime_api.h" "$(@D)/cuda/include/cuda_runtime_api.h" && cp -f "/usr/local/cuda-10.0/include/cuda_surface_types.h" "$(@D)/cuda/include/cuda_surface_types.h" && cp -f "/usr/local/cuda-10.0/include/cuda_texture_types.h" "$(@D)/cuda/include/cuda_texture_types.h" && cp -f "/usr/local/cuda-10.0/include/cuda_vdpau_interop.h" "$(@D)/cuda/include/cuda_vdpau_interop.h" && cp -f "/usr/local/cuda-10.0/include/cudalibxt.h" "$(@D)/cuda/include/cudalibxt.h" && cp -f "/usr/local/cuda-10.0/include/cudart_platform.h" "$(@D)/cuda/include/cudart_platform.h" && cp -f "/usr/local/cuda-10.0/include/cufft.h" "$(@D)/cuda/include/cufft.h" && cp -f "/usr/local/cuda-10.0/include/cufftXt.h" "$(@D)/cuda/include/cufftXt.h" && cp -f "/usr/local/cuda-10.0/include/cufftw.h" "$(@D)/cuda/include/cufftw.h" && cp -f "/usr/local/cuda-10.0/include/curand.h" "$(@D)/cuda/include/curand.h" && cp -f "/usr/local/cuda-10.0/include/curand_discrete.h" "$(@D)/cuda/include/curand_discrete.h" && cp -f "/usr/local/cuda-10.0/include/curand_discrete2.h" "$(@D)/cuda/include/curand_discrete2.h" && cp -f "/usr/local/cuda-10.0/include/curand_globals.h" "$(@D)/cuda/include/curand_globals.h" && cp -f "/usr/local/cuda-10.0/include/curand_kernel.h" "$(@D)/cuda/include/curand_kernel.h" && cp -f "/usr/local/cuda-10.0/include/curand_lognormal.h" "$(@D)/cuda/include/curand_lognormal.h" && cp -f "/usr/local/cuda-10.0/include/curand_mrg32k3a.h" "$(@D)/cuda/include/curand_mrg32k3a.h" && cp -f "/usr/local/cuda-10.0/include/curand_mtgp32.h" "$(@D)/cuda/include/curand_mtgp32.h" && cp -f "/usr/local/cuda-10.0/include/curand_mtgp32_host.h" "$(@D)/cuda/include/curand_mtgp32_host.h" && cp -f "/usr/local/cuda-10.0/include/curand_mtgp32_kernel.h" "$(@D)/cuda/include/curand_mtgp32_kernel.h" && cp -f "/usr/local/cuda-10.0/include/curand_mtgp32dc_p_11213.h" "$(@D)/cuda/include/curand_mtgp32dc_p_11213.h" && cp -f "/usr/local/cuda-10.0/include/curand_normal.h" "$(@D)/cuda/include/curand_normal.h" && cp -f "/usr/local/cuda-10.0/include/curand_normal_static.h" "$(@D)/cuda/include/curand_normal_static.h" && cp -f "/usr/local/cuda-10.0/include/curand_philox4x32_x.h" "$(@D)/cuda/include/curand_philox4x32_x.h" && cp -f "/usr/local/cuda-10.0/include/curand_poisson.h" "$(@D)/cuda/include/curand_poisson.h" && cp -f "/usr/local/cuda-10.0/include/curand_precalc.h" "$(@D)/cuda/include/curand_precalc.h" && cp -f "/usr/local/cuda-10.0/include/curand_uniform.h" "$(@D)/cuda/include/curand_uniform.h" && cp -f "/usr/local/cuda-10.0/include/cusolverDn.h" "$(@D)/cuda/include/cusolverDn.h" && cp -f "/usr/local/cuda-10.0/include/cusolverRf.h" "$(@D)/cuda/include/cusolverRf.h" && cp -f "/usr/local/cuda-10.0/include/cusolverSp.h" "$(@D)/cuda/include/cusolverSp.h" && cp -f "/usr/local/cuda-10.0/include/cusolverSp_LOWLEVEL_PREVIEW.h" "$(@D)/cuda/include/cusolverSp_LOWLEVEL_PREVIEW.h" && cp -f "/usr/local/cuda-10.0/include/cusolver_common.h" "$(@D)/cuda/include/cusolver_common.h" && cp -f "/usr/local/cuda-10.0/include/cusparse.h" "$(@D)/cuda/include/cusparse.h" && cp -f "/usr/local/cuda-10.0/include/cusparse_v2.h" "$(@D)/cuda/include/cusparse_v2.h" && cp -f "/usr/local/cuda-10.0/include/device_atomic_functions.h" "$(@D)/cuda/include/device_atomic_functions.h" && cp -f "/usr/local/cuda-10.0/include/device_atomic_functions.hpp" "$(@D)/cuda/include/device_atomic_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/device_double_functions.h" "$(@D)/cuda/include/device_double_functions.h" && cp -f "/usr/local/cuda-10.0/include/device_functions.h" "$(@D)/cuda/include/device_functions.h" && cp -f "/usr/local/cuda-10.0/include/device_launch_parameters.h" "$(@D)/cuda/include/device_launch_parameters.h" && cp -f "/usr/local/cuda-10.0/include/device_types.h" "$(@D)/cuda/include/device_types.h" && cp -f "/usr/local/cuda-10.0/include/driver_functions.h" "$(@D)/cuda/include/driver_functions.h" && cp -f "/usr/local/cuda-10.0/include/driver_types.h" "$(@D)/cuda/include/driver_types.h" && cp -f "/usr/local/cuda-10.0/include/fatBinaryCtl.h" "$(@D)/cuda/include/fatBinaryCtl.h" && cp -f "/usr/local/cuda-10.0/include/fatbinary.h" "$(@D)/cuda/include/fatbinary.h" && cp -f "/usr/local/cuda-10.0/include/host_config.h" "$(@D)/cuda/include/host_config.h" && cp -f "/usr/local/cuda-10.0/include/host_defines.h" "$(@D)/cuda/include/host_defines.h" && cp -f "/usr/local/cuda-10.0/include/library_types.h" "$(@D)/cuda/include/library_types.h" && cp -f "/usr/local/cuda-10.0/include/math_constants.h" "$(@D)/cuda/include/math_constants.h" && cp -f "/usr/local/cuda-10.0/include/math_functions.h" "$(@D)/cuda/include/math_functions.h" && cp -f "/usr/local/cuda-10.0/include/mma.h" "$(@D)/cuda/include/mma.h" && cp -f "/usr/local/cuda-10.0/include/npp.h" "$(@D)/cuda/include/npp.h" && cp -f "/usr/local/cuda-10.0/include/nppcore.h" "$(@D)/cuda/include/nppcore.h" && cp -f "/usr/local/cuda-10.0/include/nppdefs.h" "$(@D)/cuda/include/nppdefs.h" && cp -f "/usr/local/cuda-10.0/include/nppi.h" "$(@D)/cuda/include/nppi.h" && cp -f "/usr/local/cuda-10.0/include/nppi_arithmetic_and_logical_operations.h" "$(@D)/cuda/include/nppi_arithmetic_and_logical_operations.h" && cp -f "/usr/local/cuda-10.0/include/nppi_color_conversion.h" "$(@D)/cuda/include/nppi_color_conversion.h" && cp -f "/usr/local/cuda-10.0/include/nppi_compression_functions.h" "$(@D)/cuda/include/nppi_compression_functions.h" && cp -f "/usr/local/cuda-10.0/include/nppi_computer_vision.h" "$(@D)/cuda/include/nppi_computer_vision.h" && cp -f "/usr/local/cuda-10.0/include/nppi_data_exchange_and_initialization.h" "$(@D)/cuda/include/nppi_data_exchange_and_initialization.h" && cp -f "/usr/local/cuda-10.0/include/nppi_filtering_functions.h" "$(@D)/cuda/include/nppi_filtering_functions.h" && cp -f "/usr/local/cuda-10.0/include/nppi_geometry_transforms.h" "$(@D)/cuda/include/nppi_geometry_transforms.h" && cp -f "/usr/local/cuda-10.0/include/nppi_linear_transforms.h" "$(@D)/cuda/include/nppi_linear_transforms.h" && cp -f "/usr/local/cuda-10.0/include/nppi_morphological_operations.h" "$(@D)/cuda/include/nppi_morphological_operations.h" && cp -f "/usr/local/cuda-10.0/include/nppi_statistics_functions.h" "$(@D)/cuda/include/nppi_statistics_functions.h" && cp -f "/usr/local/cuda-10.0/include/nppi_support_functions.h" "$(@D)/cuda/include/nppi_support_functions.h" && cp -f "/usr/local/cuda-10.0/include/nppi_threshold_and_compare_operations.h" "$(@D)/cuda/include/nppi_threshold_and_compare_operations.h" && cp -f "/usr/local/cuda-10.0/include/npps.h" "$(@D)/cuda/include/npps.h" && cp -f "/usr/local/cuda-10.0/include/npps_arithmetic_and_logical_operations.h" "$(@D)/cuda/include/npps_arithmetic_and_logical_operations.h" && cp -f "/usr/local/cuda-10.0/include/npps_conversion_functions.h" "$(@D)/cuda/include/npps_conversion_functions.h" && cp -f "/usr/local/cuda-10.0/include/npps_filtering_functions.h" "$(@D)/cuda/include/npps_filtering_functions.h" && cp -f "/usr/local/cuda-10.0/include/npps_initialization.h" "$(@D)/cuda/include/npps_initialization.h" && cp -f "/usr/local/cuda-10.0/include/npps_statistics_functions.h" "$(@D)/cuda/include/npps_statistics_functions.h" && cp -f "/usr/local/cuda-10.0/include/npps_support_functions.h" "$(@D)/cuda/include/npps_support_functions.h" && cp -f "/usr/local/cuda-10.0/include/nppversion.h" "$(@D)/cuda/include/nppversion.h" && cp -f "/usr/local/cuda-10.0/include/nvToolsExt.h" "$(@D)/cuda/include/nvToolsExt.h" && cp -f "/usr/local/cuda-10.0/include/nvToolsExtCuda.h" "$(@D)/cuda/include/nvToolsExtCuda.h" && cp -f "/usr/local/cuda-10.0/include/nvToolsExtCudaRt.h" "$(@D)/cuda/include/nvToolsExtCudaRt.h" && cp -f "/usr/local/cuda-10.0/include/nvToolsExtMeta.h" "$(@D)/cuda/include/nvToolsExtMeta.h" && cp -f "/usr/local/cuda-10.0/include/nvToolsExtSync.h" "$(@D)/cuda/include/nvToolsExtSync.h" && cp -f "/usr/local/cuda-10.0/include/nvblas.h" "$(@D)/cuda/include/nvblas.h" && cp -f "/usr/local/cuda-10.0/include/nvfunctional" "$(@D)/cuda/include/nvfunctional" && cp -f "/usr/local/cuda-10.0/include/nvgraph.h" "$(@D)/cuda/include/nvgraph.h" && cp -f "/usr/local/cuda-10.0/include/nvjpeg.h" "$(@D)/cuda/include/nvjpeg.h" && cp -f "/usr/local/cuda-10.0/include/nvml.h" "$(@D)/cuda/include/nvml.h" && cp -f "/usr/local/cuda-10.0/include/nvrtc.h" "$(@D)/cuda/include/nvrtc.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvToolsExt.h" "$(@D)/cuda/include/nvtx3/nvToolsExt.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvToolsExtCuda.h" "$(@D)/cuda/include/nvtx3/nvToolsExtCuda.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvToolsExtCudaRt.h" "$(@D)/cuda/include/nvtx3/nvToolsExtCudaRt.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvToolsExtOpenCL.h" "$(@D)/cuda/include/nvtx3/nvToolsExtOpenCL.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvToolsExtSync.h" "$(@D)/cuda/include/nvtx3/nvToolsExtSync.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxImpl.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxImpl.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxImplCore.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxImplCore.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxImplCuda_v3.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxImplCuda_v3.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxImplOpenCL_v3.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxImplOpenCL_v3.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxImplSync_v3.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxImplSync_v3.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxInit.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxInit.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxInitDecls.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxInitDecls.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxInitDefs.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxInitDefs.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxLinkOnce.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxLinkOnce.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxTypes.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxTypes.h" && cp -f "/usr/local/cuda-10.0/include/sm_20_atomic_functions.h" "$(@D)/cuda/include/sm_20_atomic_functions.h" && cp -f "/usr/local/cuda-10.0/include/sm_20_atomic_functions.hpp" "$(@D)/cuda/include/sm_20_atomic_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/sm_20_intrinsics.h" "$(@D)/cuda/include/sm_20_intrinsics.h" && cp -f "/usr/local/cuda-10.0/include/sm_20_intrinsics.hpp" "$(@D)/cuda/include/sm_20_intrinsics.hpp" && cp -f "/usr/local/cuda-10.0/include/sm_30_intrinsics.h" "$(@D)/cuda/include/sm_30_intrinsics.h" && cp -f "/usr/local/cuda-10.0/include/sm_30_intrinsics.hpp" "$(@D)/cuda/include/sm_30_intrinsics.hpp" && cp -f "/usr/local/cuda-10.0/include/sm_32_atomic_functions.h" "$(@D)/cuda/include/sm_32_atomic_functions.h" && cp -f "/usr/local/cuda-10.0/include/sm_32_atomic_functions.hpp" "$(@D)/cuda/include/sm_32_atomic_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/sm_32_intrinsics.h" "$(@D)/cuda/include/sm_32_intrinsics.h" && cp -f "/usr/local/cuda-10.0/include/sm_32_intrinsics.hpp" "$(@D)/cuda/include/sm_32_intrinsics.hpp" && cp -f "/usr/local/cuda-10.0/include/sm_35_atomic_functions.h" "$(@D)/cuda/include/sm_35_atomic_functions.h" && cp -f "/usr/local/cuda-10.0/include/sm_35_intrinsics.h" "$(@D)/cuda/include/sm_35_intrinsics.h" && cp -f "/usr/local/cuda-10.0/include/sm_60_atomic_functions.h" "$(@D)/cuda/include/sm_60_atomic_functions.h" && cp -f "/usr/local/cuda-10.0/include/sm_60_atomic_functions.hpp" "$(@D)/cuda/include/sm_60_atomic_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/sm_61_intrinsics.h" "$(@D)/cuda/include/sm_61_intrinsics.h" && cp -f "/usr/local/cuda-10.0/include/sm_61_intrinsics.hpp" "$(@D)/cuda/include/sm_61_intrinsics.hpp" && cp -f "/usr/local/cuda-10.0/include/sobol_direction_vectors.h" "$(@D)/cuda/include/sobol_direction_vectors.h" && cp -f "/usr/local/cuda-10.0/include/surface_functions.h" "$(@D)/cuda/include/surface_functions.h" && cp -f "/usr/local/cuda-10.0/include/surface_functions.hpp" "$(@D)/cuda/include/surface_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/surface_indirect_functions.h" "$(@D)/cuda/include/surface_indirect_functions.h" && cp -f "/usr/local/cuda-10.0/include/surface_indirect_functions.hpp" "$(@D)/cuda/include/surface_indirect_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/surface_types.h" "$(@D)/cuda/include/surface_types.h" && cp -f "/usr/local/cuda-10.0/include/texture_fetch_functions.h" "$(@D)/cuda/include/texture_fetch_functions.h" && cp -f "/usr/local/cuda-10.0/include/texture_fetch_functions.hpp" "$(@D)/cuda/include/texture_fetch_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/texture_indirect_functions.h" "$(@D)/cuda/include/texture_indirect_functions.h" && cp -f "/usr/local/cuda-10.0/include/texture_indirect_functions.hpp" "$(@D)/cuda/include/texture_indirect_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/texture_types.h" "$(@D)/cuda/include/texture_types.h" && cp -f "/usr/local/cuda-10.0/include/thrust/adjacent_difference.h" "$(@D)/cuda/include/thrust/adjacent_difference.h" && cp -f "/usr/local/cuda-10.0/include/thrust/advance.h" "$(@D)/cuda/include/thrust/advance.h" && cp -f "/usr/local/cuda-10.0/include/thrust/binary_search.h" "$(@D)/cuda/include/thrust/binary_search.h" && cp -f "/usr/local/cuda-10.0/include/thrust/complex.h" "$(@D)/cuda/include/thrust/complex.h" && cp -f "/usr/local/cuda-10.0/include/thrust/copy.h" "$(@D)/cuda/include/thrust/copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/count.h" "$(@D)/cuda/include/thrust/count.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/adjacent_difference.inl" "$(@D)/cuda/include/thrust/detail/adjacent_difference.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/advance.inl" "$(@D)/cuda/include/thrust/detail/advance.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/alignment.h" "$(@D)/cuda/include/thrust/detail/alignment.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/allocator_traits.h" "$(@D)/cuda/include/thrust/detail/allocator/allocator_traits.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/allocator_traits.inl" "$(@D)/cuda/include/thrust/detail/allocator/allocator_traits.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/copy_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/copy_construct_range.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/copy_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/copy_construct_range.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/default_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/default_construct_range.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/default_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/default_construct_range.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/destroy_range.h" "$(@D)/cuda/include/thrust/detail/allocator/destroy_range.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/destroy_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/destroy_range.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/fill_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/fill_construct_range.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/fill_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/fill_construct_range.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/malloc_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/malloc_allocator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/malloc_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/malloc_allocator.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/no_throw_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/no_throw_allocator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/tagged_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/tagged_allocator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/tagged_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/tagged_allocator.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/temporary_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/temporary_allocator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/temporary_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/temporary_allocator.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/binary_search.inl" "$(@D)/cuda/include/thrust/detail/binary_search.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/arithmetic.h" "$(@D)/cuda/include/thrust/detail/complex/arithmetic.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/c99math.h" "$(@D)/cuda/include/thrust/detail/complex/c99math.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/catrig.h" "$(@D)/cuda/include/thrust/detail/complex/catrig.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/catrigf.h" "$(@D)/cuda/include/thrust/detail/complex/catrigf.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/ccosh.h" "$(@D)/cuda/include/thrust/detail/complex/ccosh.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/ccoshf.h" "$(@D)/cuda/include/thrust/detail/complex/ccoshf.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/cexp.h" "$(@D)/cuda/include/thrust/detail/complex/cexp.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/cexpf.h" "$(@D)/cuda/include/thrust/detail/complex/cexpf.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/clog.h" "$(@D)/cuda/include/thrust/detail/complex/clog.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/clogf.h" "$(@D)/cuda/include/thrust/detail/complex/clogf.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/complex.inl" "$(@D)/cuda/include/thrust/detail/complex/complex.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/cpow.h" "$(@D)/cuda/include/thrust/detail/complex/cpow.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/cproj.h" "$(@D)/cuda/include/thrust/detail/complex/cproj.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/csinh.h" "$(@D)/cuda/include/thrust/detail/complex/csinh.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/csinhf.h" "$(@D)/cuda/include/thrust/detail/complex/csinhf.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/csqrt.h" "$(@D)/cuda/include/thrust/detail/complex/csqrt.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/csqrtf.h" "$(@D)/cuda/include/thrust/detail/complex/csqrtf.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/ctanh.h" "$(@D)/cuda/include/thrust/detail/complex/ctanh.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/ctanhf.h" "$(@D)/cuda/include/thrust/detail/complex/ctanhf.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/math_private.h" "$(@D)/cuda/include/thrust/detail/complex/math_private.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/stream.h" "$(@D)/cuda/include/thrust/detail/complex/stream.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config.h" "$(@D)/cuda/include/thrust/detail/config.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/compiler.h" "$(@D)/cuda/include/thrust/detail/config/compiler.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/compiler_fence.h" "$(@D)/cuda/include/thrust/detail/config/compiler_fence.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/config.h" "$(@D)/cuda/include/thrust/detail/config/config.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/debug.h" "$(@D)/cuda/include/thrust/detail/config/debug.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/device_system.h" "$(@D)/cuda/include/thrust/detail/config/device_system.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/exec_check_disable.h" "$(@D)/cuda/include/thrust/detail/config/exec_check_disable.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/forceinline.h" "$(@D)/cuda/include/thrust/detail/config/forceinline.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/global_workarounds.h" "$(@D)/cuda/include/thrust/detail/config/global_workarounds.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/host_device.h" "$(@D)/cuda/include/thrust/detail/config/host_device.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/host_system.h" "$(@D)/cuda/include/thrust/detail/config/host_system.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/simple_defines.h" "$(@D)/cuda/include/thrust/detail/config/simple_defines.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/contiguous_storage.h" "$(@D)/cuda/include/thrust/detail/contiguous_storage.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/contiguous_storage.inl" "$(@D)/cuda/include/thrust/detail/contiguous_storage.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/copy.h" "$(@D)/cuda/include/thrust/detail/copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/copy.inl" "$(@D)/cuda/include/thrust/detail/copy.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/copy_if.h" "$(@D)/cuda/include/thrust/detail/copy_if.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/copy_if.inl" "$(@D)/cuda/include/thrust/detail/copy_if.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/count.inl" "$(@D)/cuda/include/thrust/detail/count.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/cstdint.h" "$(@D)/cuda/include/thrust/detail/cstdint.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/device_delete.inl" "$(@D)/cuda/include/thrust/detail/device_delete.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/device_free.inl" "$(@D)/cuda/include/thrust/detail/device_free.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/device_malloc.inl" "$(@D)/cuda/include/thrust/detail/device_malloc.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/device_new.inl" "$(@D)/cuda/include/thrust/detail/device_new.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/device_ptr.inl" "$(@D)/cuda/include/thrust/detail/device_ptr.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/device_reference.inl" "$(@D)/cuda/include/thrust/detail/device_reference.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/device_vector.inl" "$(@D)/cuda/include/thrust/detail/device_vector.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/dispatch/is_trivial_copy.h" "$(@D)/cuda/include/thrust/detail/dispatch/is_trivial_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/distance.inl" "$(@D)/cuda/include/thrust/detail/distance.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/equal.inl" "$(@D)/cuda/include/thrust/detail/equal.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/execute_with_allocator.h" "$(@D)/cuda/include/thrust/detail/execute_with_allocator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/execution_policy.h" "$(@D)/cuda/include/thrust/detail/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/extrema.inl" "$(@D)/cuda/include/thrust/detail/extrema.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/fill.inl" "$(@D)/cuda/include/thrust/detail/fill.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/find.inl" "$(@D)/cuda/include/thrust/detail/find.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/for_each.inl" "$(@D)/cuda/include/thrust/detail/for_each.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/function.h" "$(@D)/cuda/include/thrust/detail/function.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional.inl" "$(@D)/cuda/include/thrust/detail/functional.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/actor.h" "$(@D)/cuda/include/thrust/detail/functional/actor.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/actor.inl" "$(@D)/cuda/include/thrust/detail/functional/actor.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/argument.h" "$(@D)/cuda/include/thrust/detail/functional/argument.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/composite.h" "$(@D)/cuda/include/thrust/detail/functional/composite.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/arithmetic_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/arithmetic_operators.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/assignment_operator.h" "$(@D)/cuda/include/thrust/detail/functional/operators/assignment_operator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/bitwise_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/bitwise_operators.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/compound_assignment_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/compound_assignment_operators.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/logical_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/logical_operators.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/operator_adaptors.h" "$(@D)/cuda/include/thrust/detail/functional/operators/operator_adaptors.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/relational_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/relational_operators.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/placeholder.h" "$(@D)/cuda/include/thrust/detail/functional/placeholder.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/value.h" "$(@D)/cuda/include/thrust/detail/functional/value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/gather.inl" "$(@D)/cuda/include/thrust/detail/gather.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/generate.inl" "$(@D)/cuda/include/thrust/detail/generate.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/get_iterator_value.h" "$(@D)/cuda/include/thrust/detail/get_iterator_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/host_vector.inl" "$(@D)/cuda/include/thrust/detail/host_vector.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/inner_product.inl" "$(@D)/cuda/include/thrust/detail/inner_product.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/integer_math.h" "$(@D)/cuda/include/thrust/detail/integer_math.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/integer_traits.h" "$(@D)/cuda/include/thrust/detail/integer_traits.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/internal_functional.h" "$(@D)/cuda/include/thrust/detail/internal_functional.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/logical.inl" "$(@D)/cuda/include/thrust/detail/logical.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/merge.inl" "$(@D)/cuda/include/thrust/detail/merge.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/minmax.h" "$(@D)/cuda/include/thrust/detail/minmax.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/mismatch.inl" "$(@D)/cuda/include/thrust/detail/mismatch.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/mpl/math.h" "$(@D)/cuda/include/thrust/detail/mpl/math.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/numeric_traits.h" "$(@D)/cuda/include/thrust/detail/numeric_traits.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/overlapped_copy.h" "$(@D)/cuda/include/thrust/detail/overlapped_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/pair.inl" "$(@D)/cuda/include/thrust/detail/pair.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/partition.inl" "$(@D)/cuda/include/thrust/detail/partition.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/pointer.h" "$(@D)/cuda/include/thrust/detail/pointer.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/pointer.inl" "$(@D)/cuda/include/thrust/detail/pointer.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/preprocessor.h" "$(@D)/cuda/include/thrust/detail/preprocessor.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/range/head_flags.h" "$(@D)/cuda/include/thrust/detail/range/head_flags.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/range/tail_flags.h" "$(@D)/cuda/include/thrust/detail/range/tail_flags.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/raw_pointer_cast.h" "$(@D)/cuda/include/thrust/detail/raw_pointer_cast.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/raw_reference_cast.h" "$(@D)/cuda/include/thrust/detail/raw_reference_cast.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/reduce.inl" "$(@D)/cuda/include/thrust/detail/reduce.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/reference.h" "$(@D)/cuda/include/thrust/detail/reference.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/reference.inl" "$(@D)/cuda/include/thrust/detail/reference.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/reference_forward_declaration.h" "$(@D)/cuda/include/thrust/detail/reference_forward_declaration.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/remove.inl" "$(@D)/cuda/include/thrust/detail/remove.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/replace.inl" "$(@D)/cuda/include/thrust/detail/replace.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/reverse.inl" "$(@D)/cuda/include/thrust/detail/reverse.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/scan.inl" "$(@D)/cuda/include/thrust/detail/scan.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/scatter.inl" "$(@D)/cuda/include/thrust/detail/scatter.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/seq.h" "$(@D)/cuda/include/thrust/detail/seq.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/sequence.inl" "$(@D)/cuda/include/thrust/detail/sequence.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/set_operations.inl" "$(@D)/cuda/include/thrust/detail/set_operations.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/sort.inl" "$(@D)/cuda/include/thrust/detail/sort.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/static_assert.h" "$(@D)/cuda/include/thrust/detail/static_assert.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/static_map.h" "$(@D)/cuda/include/thrust/detail/static_map.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/swap.h" "$(@D)/cuda/include/thrust/detail/swap.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/swap.inl" "$(@D)/cuda/include/thrust/detail/swap.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/swap_ranges.inl" "$(@D)/cuda/include/thrust/detail/swap_ranges.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/tabulate.inl" "$(@D)/cuda/include/thrust/detail/tabulate.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/temporary_array.h" "$(@D)/cuda/include/thrust/detail/temporary_array.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/temporary_array.inl" "$(@D)/cuda/include/thrust/detail/temporary_array.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/transform.inl" "$(@D)/cuda/include/thrust/detail/transform.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/transform_reduce.inl" "$(@D)/cuda/include/thrust/detail/transform_reduce.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/transform_scan.inl" "$(@D)/cuda/include/thrust/detail/transform_scan.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/trivial_sequence.h" "$(@D)/cuda/include/thrust/detail/trivial_sequence.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/tuple.inl" "$(@D)/cuda/include/thrust/detail/tuple.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/tuple_meta_transform.h" "$(@D)/cuda/include/thrust/detail/tuple_meta_transform.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/tuple_transform.h" "$(@D)/cuda/include/thrust/detail/tuple_transform.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h" "$(@D)/cuda/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/function_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits/function_traits.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/has_member_function.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_member_function.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/has_nested_type.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_nested_type.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/has_trivial_assign.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_trivial_assign.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/is_call_possible.h" "$(@D)/cuda/include/thrust/detail/type_traits/is_call_possible.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/is_metafunction_defined.h" "$(@D)/cuda/include/thrust/detail/type_traits/is_metafunction_defined.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/iterator/is_discard_iterator.h" "$(@D)/cuda/include/thrust/detail/type_traits/iterator/is_discard_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/iterator/is_output_iterator.h" "$(@D)/cuda/include/thrust/detail/type_traits/iterator/is_output_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/minimum_type.h" "$(@D)/cuda/include/thrust/detail/type_traits/minimum_type.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/pointer_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits/pointer_traits.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/result_of_adaptable_function.h" "$(@D)/cuda/include/thrust/detail/type_traits/result_of_adaptable_function.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/uninitialized_copy.inl" "$(@D)/cuda/include/thrust/detail/uninitialized_copy.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/uninitialized_fill.inl" "$(@D)/cuda/include/thrust/detail/uninitialized_fill.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/unique.inl" "$(@D)/cuda/include/thrust/detail/unique.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/use_default.h" "$(@D)/cuda/include/thrust/detail/use_default.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/util/align.h" "$(@D)/cuda/include/thrust/detail/util/align.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/util/blocking.h" "$(@D)/cuda/include/thrust/detail/util/blocking.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/vector_base.h" "$(@D)/cuda/include/thrust/detail/vector_base.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/vector_base.inl" "$(@D)/cuda/include/thrust/detail/vector_base.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/device_allocator.h" "$(@D)/cuda/include/thrust/device_allocator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/device_delete.h" "$(@D)/cuda/include/thrust/device_delete.h" && cp -f "/usr/local/cuda-10.0/include/thrust/device_free.h" "$(@D)/cuda/include/thrust/device_free.h" && cp -f "/usr/local/cuda-10.0/include/thrust/device_malloc.h" "$(@D)/cuda/include/thrust/device_malloc.h" && cp -f "/usr/local/cuda-10.0/include/thrust/device_malloc_allocator.h" "$(@D)/cuda/include/thrust/device_malloc_allocator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/device_new.h" "$(@D)/cuda/include/thrust/device_new.h" && cp -f "/usr/local/cuda-10.0/include/thrust/device_new_allocator.h" "$(@D)/cuda/include/thrust/device_new_allocator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/device_ptr.h" "$(@D)/cuda/include/thrust/device_ptr.h" && cp -f "/usr/local/cuda-10.0/include/thrust/device_reference.h" "$(@D)/cuda/include/thrust/device_reference.h" && cp -f "/usr/local/cuda-10.0/include/thrust/device_vector.h" "$(@D)/cuda/include/thrust/device_vector.h" && cp -f "/usr/local/cuda-10.0/include/thrust/distance.h" "$(@D)/cuda/include/thrust/distance.h" && cp -f "/usr/local/cuda-10.0/include/thrust/equal.h" "$(@D)/cuda/include/thrust/equal.h" && cp -f "/usr/local/cuda-10.0/include/thrust/execution_policy.h" "$(@D)/cuda/include/thrust/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/extrema.h" "$(@D)/cuda/include/thrust/extrema.h" && cp -f "/usr/local/cuda-10.0/include/thrust/fill.h" "$(@D)/cuda/include/thrust/fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/find.h" "$(@D)/cuda/include/thrust/find.h" && cp -f "/usr/local/cuda-10.0/include/thrust/for_each.h" "$(@D)/cuda/include/thrust/for_each.h" && cp -f "/usr/local/cuda-10.0/include/thrust/functional.h" "$(@D)/cuda/include/thrust/functional.h" && cp -f "/usr/local/cuda-10.0/include/thrust/gather.h" "$(@D)/cuda/include/thrust/gather.h" && cp -f "/usr/local/cuda-10.0/include/thrust/generate.h" "$(@D)/cuda/include/thrust/generate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/host_vector.h" "$(@D)/cuda/include/thrust/host_vector.h" && cp -f "/usr/local/cuda-10.0/include/thrust/inner_product.h" "$(@D)/cuda/include/thrust/inner_product.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/constant_iterator.h" "$(@D)/cuda/include/thrust/iterator/constant_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/counting_iterator.h" "$(@D)/cuda/include/thrust/iterator/counting_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/any_assign.h" "$(@D)/cuda/include/thrust/iterator/detail/any_assign.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/any_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/any_system_tag.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/constant_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/constant_iterator_base.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/counting_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/counting_iterator.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/device_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/device_system_tag.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/discard_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/discard_iterator_base.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/distance_from_result.h" "$(@D)/cuda/include/thrust/iterator/detail/distance_from_result.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/host_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/host_system_tag.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/is_iterator_category.h" "$(@D)/cuda/include/thrust/iterator/detail/is_iterator_category.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/is_trivial_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/is_trivial_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_adaptor_base.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_adaptor_base.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_category_to_system.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_to_system.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_category_to_traversal.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_to_traversal.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_facade_category.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_facade_category.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_traits.inl" "$(@D)/cuda/include/thrust/iterator/detail/iterator_traits.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_traversal_tags.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_traversal_tags.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/join_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/join_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/minimum_category.h" "$(@D)/cuda/include/thrust/iterator/detail/minimum_category.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/minimum_system.h" "$(@D)/cuda/include/thrust/iterator/detail/minimum_system.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/normal_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/normal_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/permutation_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/permutation_iterator_base.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/retag.h" "$(@D)/cuda/include/thrust/iterator/detail/retag.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/reverse_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/reverse_iterator.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/reverse_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/reverse_iterator_base.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/tagged_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/tagged_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/transform_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/transform_iterator.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/transform_output_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/transform_output_iterator.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/tuple_of_iterator_references.h" "$(@D)/cuda/include/thrust/iterator/detail/tuple_of_iterator_references.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/universal_categories.h" "$(@D)/cuda/include/thrust/iterator/detail/universal_categories.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/zip_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/zip_iterator.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/zip_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/zip_iterator_base.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/discard_iterator.h" "$(@D)/cuda/include/thrust/iterator/discard_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/iterator_adaptor.h" "$(@D)/cuda/include/thrust/iterator/iterator_adaptor.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/iterator_categories.h" "$(@D)/cuda/include/thrust/iterator/iterator_categories.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/iterator_facade.h" "$(@D)/cuda/include/thrust/iterator/iterator_facade.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/iterator_traits.h" "$(@D)/cuda/include/thrust/iterator/iterator_traits.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/permutation_iterator.h" "$(@D)/cuda/include/thrust/iterator/permutation_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/retag.h" "$(@D)/cuda/include/thrust/iterator/retag.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/reverse_iterator.h" "$(@D)/cuda/include/thrust/iterator/reverse_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/transform_iterator.h" "$(@D)/cuda/include/thrust/iterator/transform_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/transform_output_iterator.h" "$(@D)/cuda/include/thrust/iterator/transform_output_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/zip_iterator.h" "$(@D)/cuda/include/thrust/iterator/zip_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/logical.h" "$(@D)/cuda/include/thrust/logical.h" && cp -f "/usr/local/cuda-10.0/include/thrust/memory.h" "$(@D)/cuda/include/thrust/memory.h" && cp -f "/usr/local/cuda-10.0/include/thrust/merge.h" "$(@D)/cuda/include/thrust/merge.h" && cp -f "/usr/local/cuda-10.0/include/thrust/mismatch.h" "$(@D)/cuda/include/thrust/mismatch.h" && cp -f "/usr/local/cuda-10.0/include/thrust/pair.h" "$(@D)/cuda/include/thrust/pair.h" && cp -f "/usr/local/cuda-10.0/include/thrust/partition.h" "$(@D)/cuda/include/thrust/partition.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random.h" "$(@D)/cuda/include/thrust/random.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/discard_block_engine.inl" "$(@D)/cuda/include/thrust/random/detail/discard_block_engine.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/linear_congruential_engine.inl" "$(@D)/cuda/include/thrust/random/detail/linear_congruential_engine.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/linear_congruential_engine_discard.h" "$(@D)/cuda/include/thrust/random/detail/linear_congruential_engine_discard.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/linear_feedback_shift_engine.inl" "$(@D)/cuda/include/thrust/random/detail/linear_feedback_shift_engine.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h" "$(@D)/cuda/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/mod.h" "$(@D)/cuda/include/thrust/random/detail/mod.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/normal_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/normal_distribution.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/normal_distribution_base.h" "$(@D)/cuda/include/thrust/random/detail/normal_distribution_base.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/random_core_access.h" "$(@D)/cuda/include/thrust/random/detail/random_core_access.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/subtract_with_carry_engine.inl" "$(@D)/cuda/include/thrust/random/detail/subtract_with_carry_engine.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/uniform_int_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/uniform_int_distribution.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/uniform_real_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/uniform_real_distribution.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/xor_combine_engine.inl" "$(@D)/cuda/include/thrust/random/detail/xor_combine_engine.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/xor_combine_engine_max.h" "$(@D)/cuda/include/thrust/random/detail/xor_combine_engine_max.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/discard_block_engine.h" "$(@D)/cuda/include/thrust/random/discard_block_engine.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/linear_congruential_engine.h" "$(@D)/cuda/include/thrust/random/linear_congruential_engine.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/linear_feedback_shift_engine.h" "$(@D)/cuda/include/thrust/random/linear_feedback_shift_engine.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/normal_distribution.h" "$(@D)/cuda/include/thrust/random/normal_distribution.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/subtract_with_carry_engine.h" "$(@D)/cuda/include/thrust/random/subtract_with_carry_engine.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/uniform_int_distribution.h" "$(@D)/cuda/include/thrust/random/uniform_int_distribution.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/uniform_real_distribution.h" "$(@D)/cuda/include/thrust/random/uniform_real_distribution.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/xor_combine_engine.h" "$(@D)/cuda/include/thrust/random/xor_combine_engine.h" && cp -f "/usr/local/cuda-10.0/include/thrust/reduce.h" "$(@D)/cuda/include/thrust/reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/remove.h" "$(@D)/cuda/include/thrust/remove.h" && cp -f "/usr/local/cuda-10.0/include/thrust/replace.h" "$(@D)/cuda/include/thrust/replace.h" && cp -f "/usr/local/cuda-10.0/include/thrust/reverse.h" "$(@D)/cuda/include/thrust/reverse.h" && cp -f "/usr/local/cuda-10.0/include/thrust/scan.h" "$(@D)/cuda/include/thrust/scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/scatter.h" "$(@D)/cuda/include/thrust/scatter.h" && cp -f "/usr/local/cuda-10.0/include/thrust/sequence.h" "$(@D)/cuda/include/thrust/sequence.h" && cp -f "/usr/local/cuda-10.0/include/thrust/set_operations.h" "$(@D)/cuda/include/thrust/set_operations.h" && cp -f "/usr/local/cuda-10.0/include/thrust/sort.h" "$(@D)/cuda/include/thrust/sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/swap.h" "$(@D)/cuda/include/thrust/swap.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/cpp/detail/adjacent_difference.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/cpp/detail/assign_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/cpp/detail/binary_search.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/copy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/cpp/detail/copy_if.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/count.h" "$(@D)/cuda/include/thrust/system/cpp/detail/count.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/equal.h" "$(@D)/cuda/include/thrust/system/cpp/detail/equal.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/extrema.h" "$(@D)/cuda/include/thrust/system/cpp/detail/extrema.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/fill.h" "$(@D)/cuda/include/thrust/system/cpp/detail/fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/find.h" "$(@D)/cuda/include/thrust/system/cpp/detail/find.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/for_each.h" "$(@D)/cuda/include/thrust/system/cpp/detail/for_each.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/gather.h" "$(@D)/cuda/include/thrust/system/cpp/detail/gather.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/generate.h" "$(@D)/cuda/include/thrust/system/cpp/detail/generate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/get_value.h" "$(@D)/cuda/include/thrust/system/cpp/detail/get_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/cpp/detail/inner_product.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/cpp/detail/iter_swap.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/logical.h" "$(@D)/cuda/include/thrust/system/cpp/detail/logical.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/cpp/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/memory.inl" "$(@D)/cuda/include/thrust/system/cpp/detail/memory.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/merge.h" "$(@D)/cuda/include/thrust/system/cpp/detail/merge.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/cpp/detail/mismatch.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/par.h" "$(@D)/cuda/include/thrust/system/cpp/detail/par.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/partition.h" "$(@D)/cuda/include/thrust/system/cpp/detail/partition.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/reduce.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reduce_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/remove.h" "$(@D)/cuda/include/thrust/system/cpp/detail/remove.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/replace.h" "$(@D)/cuda/include/thrust/system/cpp/detail/replace.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/reverse.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reverse.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/scan.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scan_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/scatter.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scatter.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/sequence.h" "$(@D)/cuda/include/thrust/system/cpp/detail/sequence.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/cpp/detail/set_operations.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/sort.h" "$(@D)/cuda/include/thrust/system/cpp/detail/sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/cpp/detail/swap_ranges.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/cpp/detail/tabulate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/cpp/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/transform.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform_reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform_scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/uninitialized_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/cpp/detail/uninitialized_fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/unique.h" "$(@D)/cuda/include/thrust/system/cpp/detail/unique.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/unique_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/vector.inl" "$(@D)/cuda/include/thrust/system/cpp/detail/vector.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/execution_policy.h" "$(@D)/cuda/include/thrust/system/cpp/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/memory.h" "$(@D)/cuda/include/thrust/system/cpp/memory.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/vector.h" "$(@D)/cuda/include/thrust/system/cpp/vector.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/config.h" "$(@D)/cuda/include/thrust/system/cuda/config.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/cuda/detail/adjacent_difference.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/cuda/detail/assign_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/cuda/detail/binary_search.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/copy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/cuda/detail/copy_if.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/core/agent_launcher.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/agent_launcher.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/core/alignment.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/alignment.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/core/triple_chevron_launch.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/triple_chevron_launch.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/core/util.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/util.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/count.h" "$(@D)/cuda/include/thrust/system/cuda/detail/count.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cross_system.h" "$(@D)/cuda/include/thrust/system/cuda/detail/cross_system.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_load.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_load.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_scan.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_store.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_store.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/cub.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/cub.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_partition.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_partition.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_scan.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_select.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_select.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/host/mutex.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/host/mutex.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_allocator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_allocator.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_arch.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_arch.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_debug.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_debug.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_device.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_device.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_macro.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_macro.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_namespace.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_namespace.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_ptx.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_ptx.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_type.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_type.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/equal.h" "$(@D)/cuda/include/thrust/system/cuda/detail/equal.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/error.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/error.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/extrema.h" "$(@D)/cuda/include/thrust/system/cuda/detail/extrema.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/fill.h" "$(@D)/cuda/include/thrust/system/cuda/detail/fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/find.h" "$(@D)/cuda/include/thrust/system/cuda/detail/find.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/for_each.h" "$(@D)/cuda/include/thrust/system/cuda/detail/for_each.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/gather.h" "$(@D)/cuda/include/thrust/system/cuda/detail/gather.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/generate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/generate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/get_value.h" "$(@D)/cuda/include/thrust/system/cuda/detail/get_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h" "$(@D)/cuda/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/guarded_driver_types.h" "$(@D)/cuda/include/thrust/system/cuda/detail/guarded_driver_types.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/cuda/detail/inner_product.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/internal/copy_cross_system.h" "$(@D)/cuda/include/thrust/system/cuda/detail/internal/copy_cross_system.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/internal/copy_device_to_device.h" "$(@D)/cuda/include/thrust/system/cuda/detail/internal/copy_device_to_device.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/cuda/detail/iter_swap.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/logical.h" "$(@D)/cuda/include/thrust/system/cuda/detail/logical.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/cuda/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/memory.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/memory.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/merge.h" "$(@D)/cuda/include/thrust/system/cuda/detail/merge.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/cuda/detail/mismatch.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/par.h" "$(@D)/cuda/include/thrust/system/cuda/detail/par.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/par_to_seq.h" "$(@D)/cuda/include/thrust/system/cuda/detail/par_to_seq.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/parallel_for.h" "$(@D)/cuda/include/thrust/system/cuda/detail/parallel_for.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/partition.h" "$(@D)/cuda/include/thrust/system/cuda/detail/partition.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/reduce.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/remove.h" "$(@D)/cuda/include/thrust/system/cuda/detail/remove.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/replace.h" "$(@D)/cuda/include/thrust/system/cuda/detail/replace.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/reverse.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reverse.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/scan.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scan_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/scatter.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scatter.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/sequence.h" "$(@D)/cuda/include/thrust/system/cuda/detail/sequence.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/cuda/detail/set_operations.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/sort.h" "$(@D)/cuda/include/thrust/system/cuda/detail/sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/cuda/detail/swap_ranges.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/tabulate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/cuda/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/terminate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/terminate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/transform.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform_reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform_scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/uninitialized_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/cuda/detail/uninitialized_fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/unique.h" "$(@D)/cuda/include/thrust/system/cuda/detail/unique.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/unique_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/util.h" "$(@D)/cuda/include/thrust/system/cuda/detail/util.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/vector.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/vector.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/error.h" "$(@D)/cuda/include/thrust/system/cuda/error.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/execution_policy.h" "$(@D)/cuda/include/thrust/system/cuda/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/experimental/pinned_allocator.h" "$(@D)/cuda/include/thrust/system/cuda/experimental/pinned_allocator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/memory.h" "$(@D)/cuda/include/thrust/system/cuda/memory.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/vector.h" "$(@D)/cuda/include/thrust/system/cuda/vector.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/adl/adjacent_difference.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/assign_value.h" "$(@D)/cuda/include/thrust/system/detail/adl/assign_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/adl/binary_search.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/copy.h" "$(@D)/cuda/include/thrust/system/detail/adl/copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/adl/copy_if.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/count.h" "$(@D)/cuda/include/thrust/system/detail/adl/count.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/equal.h" "$(@D)/cuda/include/thrust/system/detail/adl/equal.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/extrema.h" "$(@D)/cuda/include/thrust/system/detail/adl/extrema.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/fill.h" "$(@D)/cuda/include/thrust/system/detail/adl/fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/find.h" "$(@D)/cuda/include/thrust/system/detail/adl/find.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/for_each.h" "$(@D)/cuda/include/thrust/system/detail/adl/for_each.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/gather.h" "$(@D)/cuda/include/thrust/system/detail/adl/gather.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/generate.h" "$(@D)/cuda/include/thrust/system/detail/adl/generate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/get_value.h" "$(@D)/cuda/include/thrust/system/detail/adl/get_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/adl/inner_product.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/iter_swap.h" "$(@D)/cuda/include/thrust/system/detail/adl/iter_swap.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/logical.h" "$(@D)/cuda/include/thrust/system/detail/adl/logical.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/detail/adl/malloc_and_free.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/merge.h" "$(@D)/cuda/include/thrust/system/detail/adl/merge.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/adl/mismatch.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/partition.h" "$(@D)/cuda/include/thrust/system/detail/adl/partition.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/reduce.h" "$(@D)/cuda/include/thrust/system/detail/adl/reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/reduce_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/remove.h" "$(@D)/cuda/include/thrust/system/detail/adl/remove.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/replace.h" "$(@D)/cuda/include/thrust/system/detail/adl/replace.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/reverse.h" "$(@D)/cuda/include/thrust/system/detail/adl/reverse.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/scan.h" "$(@D)/cuda/include/thrust/system/detail/adl/scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/scan_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/scatter.h" "$(@D)/cuda/include/thrust/system/detail/adl/scatter.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/sequence.h" "$(@D)/cuda/include/thrust/system/detail/adl/sequence.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/adl/set_operations.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/sort.h" "$(@D)/cuda/include/thrust/system/detail/adl/sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/adl/swap_ranges.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/adl/tabulate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/adl/temporary_buffer.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/transform.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform_reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform_scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/adl/uninitialized_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/adl/uninitialized_fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/unique.h" "$(@D)/cuda/include/thrust/system/detail/adl/unique.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/unique_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/bad_alloc.h" "$(@D)/cuda/include/thrust/system/detail/bad_alloc.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/errno.h" "$(@D)/cuda/include/thrust/system/detail/errno.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/error_category.inl" "$(@D)/cuda/include/thrust/system/detail/error_category.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/error_code.inl" "$(@D)/cuda/include/thrust/system/detail/error_code.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/error_condition.inl" "$(@D)/cuda/include/thrust/system/detail/error_condition.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/generic/adjacent_difference.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/adjacent_difference.inl" "$(@D)/cuda/include/thrust/system/detail/generic/adjacent_difference.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/advance.h" "$(@D)/cuda/include/thrust/system/detail/generic/advance.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/advance.inl" "$(@D)/cuda/include/thrust/system/detail/generic/advance.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/generic/binary_search.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/binary_search.inl" "$(@D)/cuda/include/thrust/system/detail/generic/binary_search.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/copy.h" "$(@D)/cuda/include/thrust/system/detail/generic/copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/copy.inl" "$(@D)/cuda/include/thrust/system/detail/generic/copy.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/generic/copy_if.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/copy_if.inl" "$(@D)/cuda/include/thrust/system/detail/generic/copy_if.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/count.h" "$(@D)/cuda/include/thrust/system/detail/generic/count.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/count.inl" "$(@D)/cuda/include/thrust/system/detail/generic/count.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/distance.h" "$(@D)/cuda/include/thrust/system/detail/generic/distance.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/distance.inl" "$(@D)/cuda/include/thrust/system/detail/generic/distance.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/equal.h" "$(@D)/cuda/include/thrust/system/detail/generic/equal.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/equal.inl" "$(@D)/cuda/include/thrust/system/detail/generic/equal.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/extrema.h" "$(@D)/cuda/include/thrust/system/detail/generic/extrema.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/extrema.inl" "$(@D)/cuda/include/thrust/system/detail/generic/extrema.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/fill.h" "$(@D)/cuda/include/thrust/system/detail/generic/fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/find.h" "$(@D)/cuda/include/thrust/system/detail/generic/find.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/find.inl" "$(@D)/cuda/include/thrust/system/detail/generic/find.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/for_each.h" "$(@D)/cuda/include/thrust/system/detail/generic/for_each.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/gather.h" "$(@D)/cuda/include/thrust/system/detail/generic/gather.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/gather.inl" "$(@D)/cuda/include/thrust/system/detail/generic/gather.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/generate.h" "$(@D)/cuda/include/thrust/system/detail/generic/generate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/generate.inl" "$(@D)/cuda/include/thrust/system/detail/generic/generate.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/generic/inner_product.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/inner_product.inl" "$(@D)/cuda/include/thrust/system/detail/generic/inner_product.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/logical.h" "$(@D)/cuda/include/thrust/system/detail/generic/logical.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/memory.h" "$(@D)/cuda/include/thrust/system/detail/generic/memory.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/memory.inl" "$(@D)/cuda/include/thrust/system/detail/generic/memory.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/merge.h" "$(@D)/cuda/include/thrust/system/detail/generic/merge.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/merge.inl" "$(@D)/cuda/include/thrust/system/detail/generic/merge.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/generic/mismatch.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/mismatch.inl" "$(@D)/cuda/include/thrust/system/detail/generic/mismatch.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/partition.h" "$(@D)/cuda/include/thrust/system/detail/generic/partition.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/partition.inl" "$(@D)/cuda/include/thrust/system/detail/generic/partition.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reduce.h" "$(@D)/cuda/include/thrust/system/detail/generic/reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reduce.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reduce.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/reduce_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reduce_by_key.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/remove.h" "$(@D)/cuda/include/thrust/system/detail/generic/remove.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/remove.inl" "$(@D)/cuda/include/thrust/system/detail/generic/remove.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/replace.h" "$(@D)/cuda/include/thrust/system/detail/generic/replace.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/replace.inl" "$(@D)/cuda/include/thrust/system/detail/generic/replace.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reverse.h" "$(@D)/cuda/include/thrust/system/detail/generic/reverse.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reverse.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reverse.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scalar/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/generic/scalar/binary_search.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scalar/binary_search.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scalar/binary_search.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scan.h" "$(@D)/cuda/include/thrust/system/detail/generic/scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scan.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scan.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/scan_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scan_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scan_by_key.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scatter.h" "$(@D)/cuda/include/thrust/system/detail/generic/scatter.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scatter.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scatter.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/select_system.h" "$(@D)/cuda/include/thrust/system/detail/generic/select_system.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/sequence.h" "$(@D)/cuda/include/thrust/system/detail/generic/sequence.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/sequence.inl" "$(@D)/cuda/include/thrust/system/detail/generic/sequence.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/generic/set_operations.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/set_operations.inl" "$(@D)/cuda/include/thrust/system/detail/generic/set_operations.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/sort.h" "$(@D)/cuda/include/thrust/system/detail/generic/sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/sort.inl" "$(@D)/cuda/include/thrust/system/detail/generic/sort.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/generic/swap_ranges.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/swap_ranges.inl" "$(@D)/cuda/include/thrust/system/detail/generic/swap_ranges.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/generic/tabulate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/tabulate.inl" "$(@D)/cuda/include/thrust/system/detail/generic/tabulate.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/tag.h" "$(@D)/cuda/include/thrust/system/detail/generic/tag.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/generic/temporary_buffer.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/temporary_buffer.inl" "$(@D)/cuda/include/thrust/system/detail/generic/temporary_buffer.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform_reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform_reduce.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform_reduce.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform_scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform_scan.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform_scan.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/type_traits.h" "$(@D)/cuda/include/thrust/system/detail/generic/type_traits.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/uninitialized_copy.inl" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_copy.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/uninitialized_fill.inl" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_fill.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/unique.h" "$(@D)/cuda/include/thrust/system/detail/generic/unique.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/unique.inl" "$(@D)/cuda/include/thrust/system/detail/generic/unique.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/unique_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/unique_by_key.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/internal/decompose.h" "$(@D)/cuda/include/thrust/system/detail/internal/decompose.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/sequential/adjacent_difference.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/assign_value.h" "$(@D)/cuda/include/thrust/system/detail/sequential/assign_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/sequential/binary_search.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/copy.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/copy.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/copy_backward.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy_backward.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy_if.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/count.h" "$(@D)/cuda/include/thrust/system/detail/sequential/count.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/equal.h" "$(@D)/cuda/include/thrust/system/detail/sequential/equal.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/execution_policy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/extrema.h" "$(@D)/cuda/include/thrust/system/detail/sequential/extrema.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/fill.h" "$(@D)/cuda/include/thrust/system/detail/sequential/fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/find.h" "$(@D)/cuda/include/thrust/system/detail/sequential/find.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/for_each.h" "$(@D)/cuda/include/thrust/system/detail/sequential/for_each.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/gather.h" "$(@D)/cuda/include/thrust/system/detail/sequential/gather.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/general_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/general_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/generate.h" "$(@D)/cuda/include/thrust/system/detail/sequential/generate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/get_value.h" "$(@D)/cuda/include/thrust/system/detail/sequential/get_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/sequential/inner_product.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/insertion_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/insertion_sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/iter_swap.h" "$(@D)/cuda/include/thrust/system/detail/sequential/iter_swap.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/logical.h" "$(@D)/cuda/include/thrust/system/detail/sequential/logical.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/detail/sequential/malloc_and_free.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/merge.h" "$(@D)/cuda/include/thrust/system/detail/sequential/merge.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/merge.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/merge.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/sequential/mismatch.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/partition.h" "$(@D)/cuda/include/thrust/system/detail/sequential/partition.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/reduce.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reduce_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/remove.h" "$(@D)/cuda/include/thrust/system/detail/sequential/remove.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/replace.h" "$(@D)/cuda/include/thrust/system/detail/sequential/replace.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/reverse.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reverse.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/scan.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scan_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/scatter.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scatter.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/sequence.h" "$(@D)/cuda/include/thrust/system/detail/sequential/sequence.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/sequential/set_operations.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/sort.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_merge_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_merge_sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_merge_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_merge_sort.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_primitive_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_primitive_sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_primitive_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_primitive_sort.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_radix_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_radix_sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_radix_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_radix_sort.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/sequential/swap_ranges.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/sequential/tabulate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/sequential/temporary_buffer.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/transform.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform_reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform_scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/trivial_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/trivial_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/uninitialized_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/sequential/uninitialized_fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/unique.h" "$(@D)/cuda/include/thrust/system/detail/sequential/unique.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/unique_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/system_error.inl" "$(@D)/cuda/include/thrust/system/detail/system_error.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/error_code.h" "$(@D)/cuda/include/thrust/system/error_code.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/omp/detail/adjacent_difference.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/omp/detail/assign_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/omp/detail/binary_search.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/copy.h" "$(@D)/cuda/include/thrust/system/omp/detail/copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/copy.inl" "$(@D)/cuda/include/thrust/system/omp/detail/copy.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/omp/detail/copy_if.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/copy_if.inl" "$(@D)/cuda/include/thrust/system/omp/detail/copy_if.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/count.h" "$(@D)/cuda/include/thrust/system/omp/detail/count.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/default_decomposition.h" "$(@D)/cuda/include/thrust/system/omp/detail/default_decomposition.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/default_decomposition.inl" "$(@D)/cuda/include/thrust/system/omp/detail/default_decomposition.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/equal.h" "$(@D)/cuda/include/thrust/system/omp/detail/equal.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/omp/detail/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/extrema.h" "$(@D)/cuda/include/thrust/system/omp/detail/extrema.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/fill.h" "$(@D)/cuda/include/thrust/system/omp/detail/fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/find.h" "$(@D)/cuda/include/thrust/system/omp/detail/find.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/for_each.h" "$(@D)/cuda/include/thrust/system/omp/detail/for_each.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/for_each.inl" "$(@D)/cuda/include/thrust/system/omp/detail/for_each.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/gather.h" "$(@D)/cuda/include/thrust/system/omp/detail/gather.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/generate.h" "$(@D)/cuda/include/thrust/system/omp/detail/generate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/get_value.h" "$(@D)/cuda/include/thrust/system/omp/detail/get_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/omp/detail/inner_product.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/omp/detail/iter_swap.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/logical.h" "$(@D)/cuda/include/thrust/system/omp/detail/logical.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/omp/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/memory.inl" "$(@D)/cuda/include/thrust/system/omp/detail/memory.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/merge.h" "$(@D)/cuda/include/thrust/system/omp/detail/merge.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/omp/detail/mismatch.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/par.h" "$(@D)/cuda/include/thrust/system/omp/detail/par.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/partition.h" "$(@D)/cuda/include/thrust/system/omp/detail/partition.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/partition.inl" "$(@D)/cuda/include/thrust/system/omp/detail/partition.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_by_key.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce_intervals.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_intervals.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce_intervals.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_intervals.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/remove.h" "$(@D)/cuda/include/thrust/system/omp/detail/remove.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/remove.inl" "$(@D)/cuda/include/thrust/system/omp/detail/remove.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/replace.h" "$(@D)/cuda/include/thrust/system/omp/detail/replace.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reverse.h" "$(@D)/cuda/include/thrust/system/omp/detail/reverse.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/scan.h" "$(@D)/cuda/include/thrust/system/omp/detail/scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/scan_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/scatter.h" "$(@D)/cuda/include/thrust/system/omp/detail/scatter.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/sequence.h" "$(@D)/cuda/include/thrust/system/omp/detail/sequence.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/omp/detail/set_operations.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/sort.h" "$(@D)/cuda/include/thrust/system/omp/detail/sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/sort.inl" "$(@D)/cuda/include/thrust/system/omp/detail/sort.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/omp/detail/swap_ranges.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/omp/detail/tabulate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/omp/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/transform.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform_reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform_scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/omp/detail/uninitialized_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/omp/detail/uninitialized_fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/unique.h" "$(@D)/cuda/include/thrust/system/omp/detail/unique.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/unique.inl" "$(@D)/cuda/include/thrust/system/omp/detail/unique.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/unique_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/omp/detail/unique_by_key.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/vector.inl" "$(@D)/cuda/include/thrust/system/omp/detail/vector.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/execution_policy.h" "$(@D)/cuda/include/thrust/system/omp/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/memory.h" "$(@D)/cuda/include/thrust/system/omp/memory.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/vector.h" "$(@D)/cuda/include/thrust/system/omp/vector.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/system_error.h" "$(@D)/cuda/include/thrust/system/system_error.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/tbb/detail/adjacent_difference.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/tbb/detail/assign_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/tbb/detail/binary_search.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/copy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/copy.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/copy.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/tbb/detail/copy_if.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/copy_if.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/copy_if.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/count.h" "$(@D)/cuda/include/thrust/system/tbb/detail/count.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/equal.h" "$(@D)/cuda/include/thrust/system/tbb/detail/equal.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/extrema.h" "$(@D)/cuda/include/thrust/system/tbb/detail/extrema.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/fill.h" "$(@D)/cuda/include/thrust/system/tbb/detail/fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/find.h" "$(@D)/cuda/include/thrust/system/tbb/detail/find.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/for_each.h" "$(@D)/cuda/include/thrust/system/tbb/detail/for_each.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/for_each.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/for_each.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/gather.h" "$(@D)/cuda/include/thrust/system/tbb/detail/gather.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/generate.h" "$(@D)/cuda/include/thrust/system/tbb/detail/generate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/get_value.h" "$(@D)/cuda/include/thrust/system/tbb/detail/get_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/tbb/detail/inner_product.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/tbb/detail/iter_swap.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/logical.h" "$(@D)/cuda/include/thrust/system/tbb/detail/logical.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/tbb/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/memory.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/memory.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/merge.h" "$(@D)/cuda/include/thrust/system/tbb/detail/merge.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/merge.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/merge.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/tbb/detail/mismatch.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/par.h" "$(@D)/cuda/include/thrust/system/tbb/detail/par.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/partition.h" "$(@D)/cuda/include/thrust/system/tbb/detail/partition.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/partition.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/partition.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reduce.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reduce.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_by_key.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reduce_intervals.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_intervals.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/remove.h" "$(@D)/cuda/include/thrust/system/tbb/detail/remove.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/remove.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/remove.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/replace.h" "$(@D)/cuda/include/thrust/system/tbb/detail/replace.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reverse.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reverse.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/scan.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/scan.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/scan.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scan_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/scatter.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scatter.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/sequence.h" "$(@D)/cuda/include/thrust/system/tbb/detail/sequence.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/tbb/detail/set_operations.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/sort.h" "$(@D)/cuda/include/thrust/system/tbb/detail/sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/sort.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/sort.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/tbb/detail/swap_ranges.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/tbb/detail/tabulate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/tbb/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/transform.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform_reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform_scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/uninitialized_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/tbb/detail/uninitialized_fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/unique.h" "$(@D)/cuda/include/thrust/system/tbb/detail/unique.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/unique.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/unique.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/unique_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/unique_by_key.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/vector.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/vector.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/execution_policy.h" "$(@D)/cuda/include/thrust/system/tbb/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/memory.h" "$(@D)/cuda/include/thrust/system/tbb/memory.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/vector.h" "$(@D)/cuda/include/thrust/system/tbb/vector.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system_error.h" "$(@D)/cuda/include/thrust/system_error.h" && cp -f "/usr/local/cuda-10.0/include/thrust/tabulate.h" "$(@D)/cuda/include/thrust/tabulate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/transform.h" "$(@D)/cuda/include/thrust/transform.h" && cp -f "/usr/local/cuda-10.0/include/thrust/transform_reduce.h" "$(@D)/cuda/include/thrust/transform_reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/transform_scan.h" "$(@D)/cuda/include/thrust/transform_scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/tuple.h" "$(@D)/cuda/include/thrust/tuple.h" && cp -f "/usr/local/cuda-10.0/include/thrust/uninitialized_copy.h" "$(@D)/cuda/include/thrust/uninitialized_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/uninitialized_fill.h" "$(@D)/cuda/include/thrust/uninitialized_fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/unique.h" "$(@D)/cuda/include/thrust/unique.h" && cp -f "/usr/local/cuda-10.0/include/thrust/version.h" "$(@D)/cuda/include/thrust/version.h" && cp -f "/usr/local/cuda-10.0/include/vector_functions.h" "$(@D)/cuda/include/vector_functions.h" && cp -f "/usr/local/cuda-10.0/include/vector_functions.hpp" "$(@D)/cuda/include/vector_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/vector_types.h" "$(@D)/cuda/include/vector_types.h"
-   """,
+    cmd = """cp -rLf "/usr/local/cuda-10.0/include/." "$(@D)/cuda/include/" """,
 )
 
 genrule(
@@ -1202,9 +1200,7 @@ genrule(
     outs = [
         "cuda/nvvm/libdevice/libdevice.10.bc",
     ],
-    cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/local/cuda-10.0/nvvm/libdevice/libdevice.10.bc" "$(@D)//libdevice.10.bc"
-   """,
+    cmd = """cp -rLf "/usr/local/cuda-10.0/nvvm/libdevice/." "$(@D)/" """,
 )
 
 genrule(
@@ -1241,9 +1237,7 @@ genrule(
         "cuda/extras/CUPTI/include/openmp/cupti_openmp.h",
         "cuda/extras/CUPTI/include/openmp/ompt.h",
     ],
-    cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/GL/gl.h" "$(@D)/cuda/extras/CUPTI/include/GL/gl.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glew.h" "$(@D)/cuda/extras/CUPTI/include/GL/glew.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glext.h" "$(@D)/cuda/extras/CUPTI/include/GL/glext.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glu.h" "$(@D)/cuda/extras/CUPTI/include/GL/glu.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glut.h" "$(@D)/cuda/extras/CUPTI/include/GL/glut.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glx.h" "$(@D)/cuda/extras/CUPTI/include/GL/glx.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glxext.h" "$(@D)/cuda/extras/CUPTI/include/GL/glxext.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/GL/wglew.h" "$(@D)/cuda/extras/CUPTI/include/GL/wglew.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/GL/wglext.h" "$(@D)/cuda/extras/CUPTI/include/GL/wglext.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cuda_stdint.h" "$(@D)/cuda/extras/CUPTI/include/cuda_stdint.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cupti.h" "$(@D)/cuda/extras/CUPTI/include/cupti.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_activity.h" "$(@D)/cuda/extras/CUPTI/include/cupti_activity.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_callbacks.h" "$(@D)/cuda/extras/CUPTI/include/cupti_callbacks.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_driver_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_driver_cbid.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_events.h" "$(@D)/cuda/extras/CUPTI/include/cupti_events.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_metrics.h" "$(@D)/cuda/extras/CUPTI/include/cupti_metrics.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_nvtx_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_nvtx_cbid.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_result.h" "$(@D)/cuda/extras/CUPTI/include/cupti_result.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_runtime_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_runtime_cbid.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_version.h" "$(@D)/cuda/extras/CUPTI/include/cupti_version.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cudaGL_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cudaGL_meta.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cudaVDPAU_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cudaVDPAU_meta.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cuda_gl_interop_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_gl_interop_meta.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cuda_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_meta.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cuda_runtime_api_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_runtime_api_meta.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/generated_nvtx_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_nvtx_meta.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/openacc/cupti_openacc.h" "$(@D)/cuda/extras/CUPTI/include/openacc/cupti_openacc.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/openmp/cupti_openmp.h" "$(@D)/cuda/extras/CUPTI/include/openmp/cupti_openmp.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/openmp/ompt.h" "$(@D)/cuda/extras/CUPTI/include/openmp/ompt.h"
-   """,
+    cmd = """cp -rLf "/usr/local/cuda-10.0/extras/CUPTI/include/." "$(@D)/cuda/extras/CUPTI/include/" """,
 )
 
 genrule(
@@ -1259,9 +1253,31 @@ genrule(
         "cuda/lib/libcudnn.so.7",
         "cuda/lib/libcupti.so.10.0",
     ],
-    cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs/libcuda.so" "$(@D)/cuda/lib/libcuda.so" && cp -f "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcudart.so.10.0.130" "$(@D)/cuda/lib/libcudart.so.10.0" && cp -f "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcudart_static.a" "$(@D)/cuda/lib/libcudart_static.a" && cp -f "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcublas.so.10.0.130" "$(@D)/cuda/lib/libcublas.so.10.0" && cp -f "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcusolver.so.10.0.130" "$(@D)/cuda/lib/libcusolver.so.10.0" && cp -f "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcurand.so.10.0.130" "$(@D)/cuda/lib/libcurand.so.10.0" && cp -f "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcufft.so.10.0.145" "$(@D)/cuda/lib/libcufft.so.10.0" && cp -f "/usr/lib/x86_64-linux-gnu/libcudnn.so.7.3.1" "$(@D)/cuda/lib/libcudnn.so.7" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/lib64/libcupti.so.10.0.130" "$(@D)/cuda/lib/libcupti.so.10.0"
-   """,
+    cmd = """cp -f "/usr/local/cuda-10.0/lib64/stubs/libcuda.so" $(location cuda/lib/libcuda.so) && cp -f "/usr/local/cuda-10.0/lib64/libcudart.so.10.0" $(location cuda/lib/libcudart.so.10.0) && cp -f "/usr/local/cuda-10.0/lib64/libcudart_static.a" $(location cuda/lib/libcudart_static.a) && cp -f "/usr/local/cuda-10.0/lib64/libcublas.so.10.0" $(location cuda/lib/libcublas.so.10.0) && cp -f "/usr/local/cuda-10.0/lib64/libcusolver.so.10.0" $(location cuda/lib/libcusolver.so.10.0) && cp -f "/usr/local/cuda-10.0/lib64/libcurand.so.10.0" $(location cuda/lib/libcurand.so.10.0) && cp -f "/usr/local/cuda-10.0/lib64/libcufft.so.10.0" $(location cuda/lib/libcufft.so.10.0) && cp -f "/usr/lib/x86_64-linux-gnu/libcudnn.so.7" $(location cuda/lib/libcudnn.so.7) && cp -f "/usr/local/cuda-10.0/extras/CUPTI/lib64/libcupti.so.10.0" $(location cuda/lib/libcupti.so.10.0) """,
+)
+
+genrule(
+    name = "cuda-bin",
+    outs = [
+        "cuda/bin/bin2c",
+        "cuda/bin/crt/link.stub",
+        "cuda/bin/crt/prelink.stub",
+        "cuda/bin/cuda-gdb",
+        "cuda/bin/cuda-gdbserver",
+        "cuda/bin/cuda-memcheck",
+        "cuda/bin/cudafe++",
+        "cuda/bin/cuobjdump",
+        "cuda/bin/fatbinary",
+        "cuda/bin/gpu-library-advisor",
+        "cuda/bin/nvcc",
+        "cuda/bin/nvcc.profile",
+        "cuda/bin/nvdisasm",
+        "cuda/bin/nvlink",
+        "cuda/bin/nvprof",
+        "cuda/bin/nvprune",
+        "cuda/bin/ptxas",
+    ],
+    cmd = """cp -rLf "/usr/local/cuda-10.0/bin/." "$(@D)/cuda/bin/" """,
 )
 
 genrule(
@@ -1269,7 +1285,5 @@ genrule(
     outs = [
         "cuda/include/cudnn.h",
     ],
-    cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/include/cudnn.h" "$(@D)/cudnn.h"
-   """,
+    cmd = """cp -f "/usr/include/cudnn.h" $(location cuda/include/cudnn.h) """,
 )
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/cuda/cuda_config.h b/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/cuda/cuda_config.h
old mode 100755
new mode 100644
index 0934618e0b538ab0db2a969870c85aa9c4053130..783d678417644be42150954116b03790813e3fea
--- a/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/cuda/cuda_config.h
+++ b/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/cuda/cuda_config.h
@@ -16,7 +16,7 @@ limitations under the License.
 #ifndef CUDA_CUDA_CONFIG_H_
 #define CUDA_CUDA_CONFIG_H_
 
-#define TF_CUDA_CAPABILITIES CudaVersion("3.0")
+#define TF_CUDA_CAPABILITIES CudaVersion("3.0"), CudaVersion("6.0")
 
 #define TF_CUDA_VERSION "10.0"
 #define TF_CUDNN_VERSION "7"
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/BUILD b/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/BUILD
index c6930904b564bf2cce70b484a0e7b0759f13b7c9..cfd0a08e93a2654f266b4a7e647cc03062074cd2 100755
--- a/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/BUILD
+++ b/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/BUILD
@@ -1187,9 +1187,7 @@ genrule(
         "cuda/include/vector_functions.hpp",
         "cuda/include/vector_types.h",
     ],
-    cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/local/cuda-9.0/include/CL/cl.h" "$(@D)/cuda/include/CL/cl.h" && cp -f "/usr/local/cuda-9.0/include/CL/cl.hpp" "$(@D)/cuda/include/CL/cl.hpp" && cp -f "/usr/local/cuda-9.0/include/CL/cl_egl.h" "$(@D)/cuda/include/CL/cl_egl.h" && cp -f "/usr/local/cuda-9.0/include/CL/cl_ext.h" "$(@D)/cuda/include/CL/cl_ext.h" && cp -f "/usr/local/cuda-9.0/include/CL/cl_gl.h" "$(@D)/cuda/include/CL/cl_gl.h" && cp -f "/usr/local/cuda-9.0/include/CL/cl_gl_ext.h" "$(@D)/cuda/include/CL/cl_gl_ext.h" && cp -f "/usr/local/cuda-9.0/include/CL/cl_platform.h" "$(@D)/cuda/include/CL/cl_platform.h" && cp -f "/usr/local/cuda-9.0/include/CL/opencl.h" "$(@D)/cuda/include/CL/opencl.h" && cp -f "/usr/local/cuda-9.0/include/builtin_types.h" "$(@D)/cuda/include/builtin_types.h" && cp -f "/usr/local/cuda-9.0/include/channel_descriptor.h" "$(@D)/cuda/include/channel_descriptor.h" && cp -f "/usr/local/cuda-9.0/include/common_functions.h" "$(@D)/cuda/include/common_functions.h" && cp -f "/usr/local/cuda-9.0/include/cooperative_groups.h" "$(@D)/cuda/include/cooperative_groups.h" && cp -f "/usr/local/cuda-9.0/include/cooperative_groups_helpers.h" "$(@D)/cuda/include/cooperative_groups_helpers.h" && cp -f "/usr/local/cuda-9.0/include/crt/common_functions.h" "$(@D)/cuda/include/crt/common_functions.h" && cp -f "/usr/local/cuda-9.0/include/crt/device_double_functions.h" "$(@D)/cuda/include/crt/device_double_functions.h" && cp -f "/usr/local/cuda-9.0/include/crt/device_double_functions.hpp" "$(@D)/cuda/include/crt/device_double_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/crt/device_functions.h" "$(@D)/cuda/include/crt/device_functions.h" && cp -f "/usr/local/cuda-9.0/include/crt/device_functions.hpp" "$(@D)/cuda/include/crt/device_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/crt/func_macro.h" "$(@D)/cuda/include/crt/func_macro.h" && cp -f "/usr/local/cuda-9.0/include/crt/host_config.h" "$(@D)/cuda/include/crt/host_config.h" && cp -f "/usr/local/cuda-9.0/include/crt/host_defines.h" "$(@D)/cuda/include/crt/host_defines.h" && cp -f "/usr/local/cuda-9.0/include/crt/host_runtime.h" "$(@D)/cuda/include/crt/host_runtime.h" && cp -f "/usr/local/cuda-9.0/include/crt/math_functions.h" "$(@D)/cuda/include/crt/math_functions.h" && cp -f "/usr/local/cuda-9.0/include/crt/math_functions.hpp" "$(@D)/cuda/include/crt/math_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/crt/mma.h" "$(@D)/cuda/include/crt/mma.h" && cp -f "/usr/local/cuda-9.0/include/crt/mma.hpp" "$(@D)/cuda/include/crt/mma.hpp" && cp -f "/usr/local/cuda-9.0/include/crt/nvfunctional" "$(@D)/cuda/include/crt/nvfunctional" && cp -f "/usr/local/cuda-9.0/include/crt/sm_70_rt.h" "$(@D)/cuda/include/crt/sm_70_rt.h" && cp -f "/usr/local/cuda-9.0/include/crt/sm_70_rt.hpp" "$(@D)/cuda/include/crt/sm_70_rt.hpp" && cp -f "/usr/local/cuda-9.0/include/crt/storage_class.h" "$(@D)/cuda/include/crt/storage_class.h" && cp -f "/usr/local/cuda-9.0/include/cuComplex.h" "$(@D)/cuda/include/cuComplex.h" && cp -f "/usr/local/cuda-9.0/include/cublas.h" "$(@D)/cuda/include/cublas.h" && cp -f "/usr/local/cuda-9.0/include/cublasXt.h" "$(@D)/cuda/include/cublasXt.h" && cp -f "/usr/local/cuda-9.0/include/cublas_api.h" "$(@D)/cuda/include/cublas_api.h" && cp -f "/usr/local/cuda-9.0/include/cublas_v2.h" "$(@D)/cuda/include/cublas_v2.h" && cp -f "/usr/local/cuda-9.0/include/cuda.h" "$(@D)/cuda/include/cuda.h" && cp -f "/usr/local/cuda-9.0/include/cudaEGL.h" "$(@D)/cuda/include/cudaEGL.h" && cp -f "/usr/local/cuda-9.0/include/cudaGL.h" "$(@D)/cuda/include/cudaGL.h" && cp -f "/usr/local/cuda-9.0/include/cudaProfiler.h" "$(@D)/cuda/include/cudaProfiler.h" && cp -f "/usr/local/cuda-9.0/include/cudaVDPAU.h" "$(@D)/cuda/include/cudaVDPAU.h" && cp -f "/usr/local/cuda-9.0/include/cuda_device_runtime_api.h" "$(@D)/cuda/include/cuda_device_runtime_api.h" && cp -f "/usr/local/cuda-9.0/include/cuda_fp16.h" "$(@D)/cuda/include/cuda_fp16.h" && cp -f "/usr/local/cuda-9.0/include/cuda_fp16.hpp" "$(@D)/cuda/include/cuda_fp16.hpp" && cp -f "/usr/local/cuda-9.0/include/cuda_gl_interop.h" "$(@D)/cuda/include/cuda_gl_interop.h" && cp -f "/usr/local/cuda-9.0/include/cuda_occupancy.h" "$(@D)/cuda/include/cuda_occupancy.h" && cp -f "/usr/local/cuda-9.0/include/cuda_profiler_api.h" "$(@D)/cuda/include/cuda_profiler_api.h" && cp -f "/usr/local/cuda-9.0/include/cuda_runtime.h" "$(@D)/cuda/include/cuda_runtime.h" && cp -f "/usr/local/cuda-9.0/include/cuda_runtime_api.h" "$(@D)/cuda/include/cuda_runtime_api.h" && cp -f "/usr/local/cuda-9.0/include/cuda_surface_types.h" "$(@D)/cuda/include/cuda_surface_types.h" && cp -f "/usr/local/cuda-9.0/include/cuda_texture_types.h" "$(@D)/cuda/include/cuda_texture_types.h" && cp -f "/usr/local/cuda-9.0/include/cuda_vdpau_interop.h" "$(@D)/cuda/include/cuda_vdpau_interop.h" && cp -f "/usr/local/cuda-9.0/include/cudalibxt.h" "$(@D)/cuda/include/cudalibxt.h" && cp -f "/usr/local/cuda-9.0/include/cufft.h" "$(@D)/cuda/include/cufft.h" && cp -f "/usr/local/cuda-9.0/include/cufftXt.h" "$(@D)/cuda/include/cufftXt.h" && cp -f "/usr/local/cuda-9.0/include/cufftw.h" "$(@D)/cuda/include/cufftw.h" && cp -f "/usr/local/cuda-9.0/include/curand.h" "$(@D)/cuda/include/curand.h" && cp -f "/usr/local/cuda-9.0/include/curand_discrete.h" "$(@D)/cuda/include/curand_discrete.h" && cp -f "/usr/local/cuda-9.0/include/curand_discrete2.h" "$(@D)/cuda/include/curand_discrete2.h" && cp -f "/usr/local/cuda-9.0/include/curand_globals.h" "$(@D)/cuda/include/curand_globals.h" && cp -f "/usr/local/cuda-9.0/include/curand_kernel.h" "$(@D)/cuda/include/curand_kernel.h" && cp -f "/usr/local/cuda-9.0/include/curand_lognormal.h" "$(@D)/cuda/include/curand_lognormal.h" && cp -f "/usr/local/cuda-9.0/include/curand_mrg32k3a.h" "$(@D)/cuda/include/curand_mrg32k3a.h" && cp -f "/usr/local/cuda-9.0/include/curand_mtgp32.h" "$(@D)/cuda/include/curand_mtgp32.h" && cp -f "/usr/local/cuda-9.0/include/curand_mtgp32_host.h" "$(@D)/cuda/include/curand_mtgp32_host.h" && cp -f "/usr/local/cuda-9.0/include/curand_mtgp32_kernel.h" "$(@D)/cuda/include/curand_mtgp32_kernel.h" && cp -f "/usr/local/cuda-9.0/include/curand_mtgp32dc_p_11213.h" "$(@D)/cuda/include/curand_mtgp32dc_p_11213.h" && cp -f "/usr/local/cuda-9.0/include/curand_normal.h" "$(@D)/cuda/include/curand_normal.h" && cp -f "/usr/local/cuda-9.0/include/curand_normal_static.h" "$(@D)/cuda/include/curand_normal_static.h" && cp -f "/usr/local/cuda-9.0/include/curand_philox4x32_x.h" "$(@D)/cuda/include/curand_philox4x32_x.h" && cp -f "/usr/local/cuda-9.0/include/curand_poisson.h" "$(@D)/cuda/include/curand_poisson.h" && cp -f "/usr/local/cuda-9.0/include/curand_precalc.h" "$(@D)/cuda/include/curand_precalc.h" && cp -f "/usr/local/cuda-9.0/include/curand_uniform.h" "$(@D)/cuda/include/curand_uniform.h" && cp -f "/usr/local/cuda-9.0/include/cusolverDn.h" "$(@D)/cuda/include/cusolverDn.h" && cp -f "/usr/local/cuda-9.0/include/cusolverRf.h" "$(@D)/cuda/include/cusolverRf.h" && cp -f "/usr/local/cuda-9.0/include/cusolverSp.h" "$(@D)/cuda/include/cusolverSp.h" && cp -f "/usr/local/cuda-9.0/include/cusolverSp_LOWLEVEL_PREVIEW.h" "$(@D)/cuda/include/cusolverSp_LOWLEVEL_PREVIEW.h" && cp -f "/usr/local/cuda-9.0/include/cusolver_common.h" "$(@D)/cuda/include/cusolver_common.h" && cp -f "/usr/local/cuda-9.0/include/cusparse.h" "$(@D)/cuda/include/cusparse.h" && cp -f "/usr/local/cuda-9.0/include/cusparse_v2.h" "$(@D)/cuda/include/cusparse_v2.h" && cp -f "/usr/local/cuda-9.0/include/device_atomic_functions.h" "$(@D)/cuda/include/device_atomic_functions.h" && cp -f "/usr/local/cuda-9.0/include/device_atomic_functions.hpp" "$(@D)/cuda/include/device_atomic_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/device_double_functions.h" "$(@D)/cuda/include/device_double_functions.h" && cp -f "/usr/local/cuda-9.0/include/device_double_functions.hpp" "$(@D)/cuda/include/device_double_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/device_functions.h" "$(@D)/cuda/include/device_functions.h" && cp -f "/usr/local/cuda-9.0/include/device_functions.hpp" "$(@D)/cuda/include/device_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/device_functions_decls.h" "$(@D)/cuda/include/device_functions_decls.h" && cp -f "/usr/local/cuda-9.0/include/device_launch_parameters.h" "$(@D)/cuda/include/device_launch_parameters.h" && cp -f "/usr/local/cuda-9.0/include/device_types.h" "$(@D)/cuda/include/device_types.h" && cp -f "/usr/local/cuda-9.0/include/driver_functions.h" "$(@D)/cuda/include/driver_functions.h" && cp -f "/usr/local/cuda-9.0/include/driver_types.h" "$(@D)/cuda/include/driver_types.h" && cp -f "/usr/local/cuda-9.0/include/dynlink_cuda.h" "$(@D)/cuda/include/dynlink_cuda.h" && cp -f "/usr/local/cuda-9.0/include/dynlink_cuda_cuda.h" "$(@D)/cuda/include/dynlink_cuda_cuda.h" && cp -f "/usr/local/cuda-9.0/include/dynlink_cuviddec.h" "$(@D)/cuda/include/dynlink_cuviddec.h" && cp -f "/usr/local/cuda-9.0/include/dynlink_nvcuvid.h" "$(@D)/cuda/include/dynlink_nvcuvid.h" && cp -f "/usr/local/cuda-9.0/include/fatBinaryCtl.h" "$(@D)/cuda/include/fatBinaryCtl.h" && cp -f "/usr/local/cuda-9.0/include/fatbinary.h" "$(@D)/cuda/include/fatbinary.h" && cp -f "/usr/local/cuda-9.0/include/host_config.h" "$(@D)/cuda/include/host_config.h" && cp -f "/usr/local/cuda-9.0/include/host_defines.h" "$(@D)/cuda/include/host_defines.h" && cp -f "/usr/local/cuda-9.0/include/library_types.h" "$(@D)/cuda/include/library_types.h" && cp -f "/usr/local/cuda-9.0/include/math_constants.h" "$(@D)/cuda/include/math_constants.h" && cp -f "/usr/local/cuda-9.0/include/math_functions.h" "$(@D)/cuda/include/math_functions.h" && cp -f "/usr/local/cuda-9.0/include/math_functions.hpp" "$(@D)/cuda/include/math_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/math_functions_dbl_ptx3.h" "$(@D)/cuda/include/math_functions_dbl_ptx3.h" && cp -f "/usr/local/cuda-9.0/include/math_functions_dbl_ptx3.hpp" "$(@D)/cuda/include/math_functions_dbl_ptx3.hpp" && cp -f "/usr/local/cuda-9.0/include/mma.h" "$(@D)/cuda/include/mma.h" && cp -f "/usr/local/cuda-9.0/include/npp.h" "$(@D)/cuda/include/npp.h" && cp -f "/usr/local/cuda-9.0/include/nppcore.h" "$(@D)/cuda/include/nppcore.h" && cp -f "/usr/local/cuda-9.0/include/nppdefs.h" "$(@D)/cuda/include/nppdefs.h" && cp -f "/usr/local/cuda-9.0/include/nppi.h" "$(@D)/cuda/include/nppi.h" && cp -f "/usr/local/cuda-9.0/include/nppi_arithmetic_and_logical_operations.h" "$(@D)/cuda/include/nppi_arithmetic_and_logical_operations.h" && cp -f "/usr/local/cuda-9.0/include/nppi_color_conversion.h" "$(@D)/cuda/include/nppi_color_conversion.h" && cp -f "/usr/local/cuda-9.0/include/nppi_compression_functions.h" "$(@D)/cuda/include/nppi_compression_functions.h" && cp -f "/usr/local/cuda-9.0/include/nppi_computer_vision.h" "$(@D)/cuda/include/nppi_computer_vision.h" && cp -f "/usr/local/cuda-9.0/include/nppi_data_exchange_and_initialization.h" "$(@D)/cuda/include/nppi_data_exchange_and_initialization.h" && cp -f "/usr/local/cuda-9.0/include/nppi_filtering_functions.h" "$(@D)/cuda/include/nppi_filtering_functions.h" && cp -f "/usr/local/cuda-9.0/include/nppi_geometry_transforms.h" "$(@D)/cuda/include/nppi_geometry_transforms.h" && cp -f "/usr/local/cuda-9.0/include/nppi_linear_transforms.h" "$(@D)/cuda/include/nppi_linear_transforms.h" && cp -f "/usr/local/cuda-9.0/include/nppi_morphological_operations.h" "$(@D)/cuda/include/nppi_morphological_operations.h" && cp -f "/usr/local/cuda-9.0/include/nppi_statistics_functions.h" "$(@D)/cuda/include/nppi_statistics_functions.h" && cp -f "/usr/local/cuda-9.0/include/nppi_support_functions.h" "$(@D)/cuda/include/nppi_support_functions.h" && cp -f "/usr/local/cuda-9.0/include/nppi_threshold_and_compare_operations.h" "$(@D)/cuda/include/nppi_threshold_and_compare_operations.h" && cp -f "/usr/local/cuda-9.0/include/npps.h" "$(@D)/cuda/include/npps.h" && cp -f "/usr/local/cuda-9.0/include/npps_arithmetic_and_logical_operations.h" "$(@D)/cuda/include/npps_arithmetic_and_logical_operations.h" && cp -f "/usr/local/cuda-9.0/include/npps_conversion_functions.h" "$(@D)/cuda/include/npps_conversion_functions.h" && cp -f "/usr/local/cuda-9.0/include/npps_filtering_functions.h" "$(@D)/cuda/include/npps_filtering_functions.h" && cp -f "/usr/local/cuda-9.0/include/npps_initialization.h" "$(@D)/cuda/include/npps_initialization.h" && cp -f "/usr/local/cuda-9.0/include/npps_statistics_functions.h" "$(@D)/cuda/include/npps_statistics_functions.h" && cp -f "/usr/local/cuda-9.0/include/npps_support_functions.h" "$(@D)/cuda/include/npps_support_functions.h" && cp -f "/usr/local/cuda-9.0/include/nppversion.h" "$(@D)/cuda/include/nppversion.h" && cp -f "/usr/local/cuda-9.0/include/nvToolsExt.h" "$(@D)/cuda/include/nvToolsExt.h" && cp -f "/usr/local/cuda-9.0/include/nvToolsExtCuda.h" "$(@D)/cuda/include/nvToolsExtCuda.h" && cp -f "/usr/local/cuda-9.0/include/nvToolsExtCudaRt.h" "$(@D)/cuda/include/nvToolsExtCudaRt.h" && cp -f "/usr/local/cuda-9.0/include/nvToolsExtMeta.h" "$(@D)/cuda/include/nvToolsExtMeta.h" && cp -f "/usr/local/cuda-9.0/include/nvToolsExtSync.h" "$(@D)/cuda/include/nvToolsExtSync.h" && cp -f "/usr/local/cuda-9.0/include/nvblas.h" "$(@D)/cuda/include/nvblas.h" && cp -f "/usr/local/cuda-9.0/include/nvfunctional" "$(@D)/cuda/include/nvfunctional" && cp -f "/usr/local/cuda-9.0/include/nvgraph.h" "$(@D)/cuda/include/nvgraph.h" && cp -f "/usr/local/cuda-9.0/include/nvml.h" "$(@D)/cuda/include/nvml.h" && cp -f "/usr/local/cuda-9.0/include/nvrtc.h" "$(@D)/cuda/include/nvrtc.h" && cp -f "/usr/local/cuda-9.0/include/sm_20_atomic_functions.h" "$(@D)/cuda/include/sm_20_atomic_functions.h" && cp -f "/usr/local/cuda-9.0/include/sm_20_atomic_functions.hpp" "$(@D)/cuda/include/sm_20_atomic_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/sm_20_intrinsics.h" "$(@D)/cuda/include/sm_20_intrinsics.h" && cp -f "/usr/local/cuda-9.0/include/sm_20_intrinsics.hpp" "$(@D)/cuda/include/sm_20_intrinsics.hpp" && cp -f "/usr/local/cuda-9.0/include/sm_30_intrinsics.h" "$(@D)/cuda/include/sm_30_intrinsics.h" && cp -f "/usr/local/cuda-9.0/include/sm_30_intrinsics.hpp" "$(@D)/cuda/include/sm_30_intrinsics.hpp" && cp -f "/usr/local/cuda-9.0/include/sm_32_atomic_functions.h" "$(@D)/cuda/include/sm_32_atomic_functions.h" && cp -f "/usr/local/cuda-9.0/include/sm_32_atomic_functions.hpp" "$(@D)/cuda/include/sm_32_atomic_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/sm_32_intrinsics.h" "$(@D)/cuda/include/sm_32_intrinsics.h" && cp -f "/usr/local/cuda-9.0/include/sm_32_intrinsics.hpp" "$(@D)/cuda/include/sm_32_intrinsics.hpp" && cp -f "/usr/local/cuda-9.0/include/sm_35_atomic_functions.h" "$(@D)/cuda/include/sm_35_atomic_functions.h" && cp -f "/usr/local/cuda-9.0/include/sm_35_intrinsics.h" "$(@D)/cuda/include/sm_35_intrinsics.h" && cp -f "/usr/local/cuda-9.0/include/sm_60_atomic_functions.h" "$(@D)/cuda/include/sm_60_atomic_functions.h" && cp -f "/usr/local/cuda-9.0/include/sm_60_atomic_functions.hpp" "$(@D)/cuda/include/sm_60_atomic_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/sm_61_intrinsics.h" "$(@D)/cuda/include/sm_61_intrinsics.h" && cp -f "/usr/local/cuda-9.0/include/sm_61_intrinsics.hpp" "$(@D)/cuda/include/sm_61_intrinsics.hpp" && cp -f "/usr/local/cuda-9.0/include/sobol_direction_vectors.h" "$(@D)/cuda/include/sobol_direction_vectors.h" && cp -f "/usr/local/cuda-9.0/include/surface_functions.h" "$(@D)/cuda/include/surface_functions.h" && cp -f "/usr/local/cuda-9.0/include/surface_functions.hpp" "$(@D)/cuda/include/surface_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/surface_indirect_functions.h" "$(@D)/cuda/include/surface_indirect_functions.h" && cp -f "/usr/local/cuda-9.0/include/surface_indirect_functions.hpp" "$(@D)/cuda/include/surface_indirect_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/surface_types.h" "$(@D)/cuda/include/surface_types.h" && cp -f "/usr/local/cuda-9.0/include/texture_fetch_functions.h" "$(@D)/cuda/include/texture_fetch_functions.h" && cp -f "/usr/local/cuda-9.0/include/texture_fetch_functions.hpp" "$(@D)/cuda/include/texture_fetch_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/texture_indirect_functions.h" "$(@D)/cuda/include/texture_indirect_functions.h" && cp -f "/usr/local/cuda-9.0/include/texture_indirect_functions.hpp" "$(@D)/cuda/include/texture_indirect_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/texture_types.h" "$(@D)/cuda/include/texture_types.h" && cp -f "/usr/local/cuda-9.0/include/thrust/adjacent_difference.h" "$(@D)/cuda/include/thrust/adjacent_difference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/advance.h" "$(@D)/cuda/include/thrust/advance.h" && cp -f "/usr/local/cuda-9.0/include/thrust/binary_search.h" "$(@D)/cuda/include/thrust/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/complex.h" "$(@D)/cuda/include/thrust/complex.h" && cp -f "/usr/local/cuda-9.0/include/thrust/copy.h" "$(@D)/cuda/include/thrust/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/count.h" "$(@D)/cuda/include/thrust/count.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/adjacent_difference.inl" "$(@D)/cuda/include/thrust/detail/adjacent_difference.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/advance.inl" "$(@D)/cuda/include/thrust/detail/advance.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/allocator_traits.h" "$(@D)/cuda/include/thrust/detail/allocator/allocator_traits.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/allocator_traits.inl" "$(@D)/cuda/include/thrust/detail/allocator/allocator_traits.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/copy_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/copy_construct_range.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/copy_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/copy_construct_range.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/default_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/default_construct_range.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/default_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/default_construct_range.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/destroy_range.h" "$(@D)/cuda/include/thrust/detail/allocator/destroy_range.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/destroy_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/destroy_range.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/fill_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/fill_construct_range.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/fill_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/fill_construct_range.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/malloc_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/malloc_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/malloc_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/malloc_allocator.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/no_throw_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/no_throw_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/tagged_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/tagged_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/tagged_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/tagged_allocator.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/temporary_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/temporary_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/temporary_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/temporary_allocator.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/binary_search.inl" "$(@D)/cuda/include/thrust/detail/binary_search.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/arithmetic.h" "$(@D)/cuda/include/thrust/detail/complex/arithmetic.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/c99math.h" "$(@D)/cuda/include/thrust/detail/complex/c99math.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/catrig.h" "$(@D)/cuda/include/thrust/detail/complex/catrig.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/catrigf.h" "$(@D)/cuda/include/thrust/detail/complex/catrigf.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/ccosh.h" "$(@D)/cuda/include/thrust/detail/complex/ccosh.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/ccoshf.h" "$(@D)/cuda/include/thrust/detail/complex/ccoshf.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/cexp.h" "$(@D)/cuda/include/thrust/detail/complex/cexp.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/cexpf.h" "$(@D)/cuda/include/thrust/detail/complex/cexpf.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/clog.h" "$(@D)/cuda/include/thrust/detail/complex/clog.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/clogf.h" "$(@D)/cuda/include/thrust/detail/complex/clogf.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/complex.inl" "$(@D)/cuda/include/thrust/detail/complex/complex.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/cpow.h" "$(@D)/cuda/include/thrust/detail/complex/cpow.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/cpowf.h" "$(@D)/cuda/include/thrust/detail/complex/cpowf.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/cproj.h" "$(@D)/cuda/include/thrust/detail/complex/cproj.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/csinh.h" "$(@D)/cuda/include/thrust/detail/complex/csinh.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/csinhf.h" "$(@D)/cuda/include/thrust/detail/complex/csinhf.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/csqrt.h" "$(@D)/cuda/include/thrust/detail/complex/csqrt.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/csqrtf.h" "$(@D)/cuda/include/thrust/detail/complex/csqrtf.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/ctanh.h" "$(@D)/cuda/include/thrust/detail/complex/ctanh.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/ctanhf.h" "$(@D)/cuda/include/thrust/detail/complex/ctanhf.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/math_private.h" "$(@D)/cuda/include/thrust/detail/complex/math_private.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/stream.h" "$(@D)/cuda/include/thrust/detail/complex/stream.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config.h" "$(@D)/cuda/include/thrust/detail/config.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/compiler.h" "$(@D)/cuda/include/thrust/detail/config/compiler.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/compiler_fence.h" "$(@D)/cuda/include/thrust/detail/config/compiler_fence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/config.h" "$(@D)/cuda/include/thrust/detail/config/config.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/debug.h" "$(@D)/cuda/include/thrust/detail/config/debug.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/device_system.h" "$(@D)/cuda/include/thrust/detail/config/device_system.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/exec_check_disable.h" "$(@D)/cuda/include/thrust/detail/config/exec_check_disable.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/forceinline.h" "$(@D)/cuda/include/thrust/detail/config/forceinline.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/global_workarounds.h" "$(@D)/cuda/include/thrust/detail/config/global_workarounds.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/host_device.h" "$(@D)/cuda/include/thrust/detail/config/host_device.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/host_system.h" "$(@D)/cuda/include/thrust/detail/config/host_system.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/simple_defines.h" "$(@D)/cuda/include/thrust/detail/config/simple_defines.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/contiguous_storage.h" "$(@D)/cuda/include/thrust/detail/contiguous_storage.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/contiguous_storage.inl" "$(@D)/cuda/include/thrust/detail/contiguous_storage.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/copy.h" "$(@D)/cuda/include/thrust/detail/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/copy.inl" "$(@D)/cuda/include/thrust/detail/copy.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/copy_if.h" "$(@D)/cuda/include/thrust/detail/copy_if.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/copy_if.inl" "$(@D)/cuda/include/thrust/detail/copy_if.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/count.inl" "$(@D)/cuda/include/thrust/detail/count.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/cstdint.h" "$(@D)/cuda/include/thrust/detail/cstdint.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/device_delete.inl" "$(@D)/cuda/include/thrust/detail/device_delete.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/device_free.inl" "$(@D)/cuda/include/thrust/detail/device_free.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/device_malloc.inl" "$(@D)/cuda/include/thrust/detail/device_malloc.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/device_new.inl" "$(@D)/cuda/include/thrust/detail/device_new.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/device_ptr.inl" "$(@D)/cuda/include/thrust/detail/device_ptr.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/device_reference.inl" "$(@D)/cuda/include/thrust/detail/device_reference.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/device_vector.inl" "$(@D)/cuda/include/thrust/detail/device_vector.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/dispatch/is_trivial_copy.h" "$(@D)/cuda/include/thrust/detail/dispatch/is_trivial_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/distance.inl" "$(@D)/cuda/include/thrust/detail/distance.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/equal.inl" "$(@D)/cuda/include/thrust/detail/equal.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/execute_with_allocator.h" "$(@D)/cuda/include/thrust/detail/execute_with_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/execution_policy.h" "$(@D)/cuda/include/thrust/detail/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/extrema.inl" "$(@D)/cuda/include/thrust/detail/extrema.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/fill.inl" "$(@D)/cuda/include/thrust/detail/fill.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/find.inl" "$(@D)/cuda/include/thrust/detail/find.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/for_each.inl" "$(@D)/cuda/include/thrust/detail/for_each.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/function.h" "$(@D)/cuda/include/thrust/detail/function.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional.inl" "$(@D)/cuda/include/thrust/detail/functional.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/actor.h" "$(@D)/cuda/include/thrust/detail/functional/actor.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/actor.inl" "$(@D)/cuda/include/thrust/detail/functional/actor.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/argument.h" "$(@D)/cuda/include/thrust/detail/functional/argument.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/composite.h" "$(@D)/cuda/include/thrust/detail/functional/composite.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/arithmetic_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/arithmetic_operators.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/assignment_operator.h" "$(@D)/cuda/include/thrust/detail/functional/operators/assignment_operator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/bitwise_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/bitwise_operators.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/compound_assignment_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/compound_assignment_operators.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/logical_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/logical_operators.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/operator_adaptors.h" "$(@D)/cuda/include/thrust/detail/functional/operators/operator_adaptors.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/relational_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/relational_operators.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/placeholder.h" "$(@D)/cuda/include/thrust/detail/functional/placeholder.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/value.h" "$(@D)/cuda/include/thrust/detail/functional/value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/gather.inl" "$(@D)/cuda/include/thrust/detail/gather.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/generate.inl" "$(@D)/cuda/include/thrust/detail/generate.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/get_iterator_value.h" "$(@D)/cuda/include/thrust/detail/get_iterator_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/host_vector.inl" "$(@D)/cuda/include/thrust/detail/host_vector.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/inner_product.inl" "$(@D)/cuda/include/thrust/detail/inner_product.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/integer_math.h" "$(@D)/cuda/include/thrust/detail/integer_math.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/integer_traits.h" "$(@D)/cuda/include/thrust/detail/integer_traits.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/internal_functional.h" "$(@D)/cuda/include/thrust/detail/internal_functional.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/logical.inl" "$(@D)/cuda/include/thrust/detail/logical.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/merge.inl" "$(@D)/cuda/include/thrust/detail/merge.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/minmax.h" "$(@D)/cuda/include/thrust/detail/minmax.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/mismatch.inl" "$(@D)/cuda/include/thrust/detail/mismatch.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/mpl/math.h" "$(@D)/cuda/include/thrust/detail/mpl/math.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/numeric_traits.h" "$(@D)/cuda/include/thrust/detail/numeric_traits.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/overlapped_copy.h" "$(@D)/cuda/include/thrust/detail/overlapped_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/pair.inl" "$(@D)/cuda/include/thrust/detail/pair.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/partition.inl" "$(@D)/cuda/include/thrust/detail/partition.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/pointer.h" "$(@D)/cuda/include/thrust/detail/pointer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/pointer.inl" "$(@D)/cuda/include/thrust/detail/pointer.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/range/head_flags.h" "$(@D)/cuda/include/thrust/detail/range/head_flags.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/range/tail_flags.h" "$(@D)/cuda/include/thrust/detail/range/tail_flags.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/raw_pointer_cast.h" "$(@D)/cuda/include/thrust/detail/raw_pointer_cast.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/raw_reference_cast.h" "$(@D)/cuda/include/thrust/detail/raw_reference_cast.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/reduce.inl" "$(@D)/cuda/include/thrust/detail/reduce.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/reference.h" "$(@D)/cuda/include/thrust/detail/reference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/reference.inl" "$(@D)/cuda/include/thrust/detail/reference.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/reference_forward_declaration.h" "$(@D)/cuda/include/thrust/detail/reference_forward_declaration.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/remove.inl" "$(@D)/cuda/include/thrust/detail/remove.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/replace.inl" "$(@D)/cuda/include/thrust/detail/replace.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/reverse.inl" "$(@D)/cuda/include/thrust/detail/reverse.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/scan.inl" "$(@D)/cuda/include/thrust/detail/scan.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/scatter.inl" "$(@D)/cuda/include/thrust/detail/scatter.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/seq.h" "$(@D)/cuda/include/thrust/detail/seq.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/sequence.inl" "$(@D)/cuda/include/thrust/detail/sequence.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/set_operations.inl" "$(@D)/cuda/include/thrust/detail/set_operations.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/sort.inl" "$(@D)/cuda/include/thrust/detail/sort.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/static_assert.h" "$(@D)/cuda/include/thrust/detail/static_assert.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/static_map.h" "$(@D)/cuda/include/thrust/detail/static_map.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/swap.h" "$(@D)/cuda/include/thrust/detail/swap.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/swap.inl" "$(@D)/cuda/include/thrust/detail/swap.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/swap_ranges.inl" "$(@D)/cuda/include/thrust/detail/swap_ranges.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/tabulate.inl" "$(@D)/cuda/include/thrust/detail/tabulate.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/temporary_array.h" "$(@D)/cuda/include/thrust/detail/temporary_array.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/temporary_array.inl" "$(@D)/cuda/include/thrust/detail/temporary_array.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/transform.inl" "$(@D)/cuda/include/thrust/detail/transform.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/transform_reduce.inl" "$(@D)/cuda/include/thrust/detail/transform_reduce.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/transform_scan.inl" "$(@D)/cuda/include/thrust/detail/transform_scan.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/trivial_sequence.h" "$(@D)/cuda/include/thrust/detail/trivial_sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/tuple.inl" "$(@D)/cuda/include/thrust/detail/tuple.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/tuple_meta_transform.h" "$(@D)/cuda/include/thrust/detail/tuple_meta_transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/tuple_transform.h" "$(@D)/cuda/include/thrust/detail/tuple_transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h" "$(@D)/cuda/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/function_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits/function_traits.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/has_member_function.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_member_function.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/has_nested_type.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_nested_type.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/has_trivial_assign.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_trivial_assign.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/is_call_possible.h" "$(@D)/cuda/include/thrust/detail/type_traits/is_call_possible.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/is_metafunction_defined.h" "$(@D)/cuda/include/thrust/detail/type_traits/is_metafunction_defined.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/iterator/is_discard_iterator.h" "$(@D)/cuda/include/thrust/detail/type_traits/iterator/is_discard_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/iterator/is_output_iterator.h" "$(@D)/cuda/include/thrust/detail/type_traits/iterator/is_output_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/minimum_type.h" "$(@D)/cuda/include/thrust/detail/type_traits/minimum_type.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/pointer_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits/pointer_traits.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/result_of_adaptable_function.h" "$(@D)/cuda/include/thrust/detail/type_traits/result_of_adaptable_function.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/uninitialized_copy.inl" "$(@D)/cuda/include/thrust/detail/uninitialized_copy.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/uninitialized_fill.inl" "$(@D)/cuda/include/thrust/detail/uninitialized_fill.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/unique.inl" "$(@D)/cuda/include/thrust/detail/unique.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/use_default.h" "$(@D)/cuda/include/thrust/detail/use_default.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/util/align.h" "$(@D)/cuda/include/thrust/detail/util/align.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/util/blocking.h" "$(@D)/cuda/include/thrust/detail/util/blocking.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/vector_base.h" "$(@D)/cuda/include/thrust/detail/vector_base.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/vector_base.inl" "$(@D)/cuda/include/thrust/detail/vector_base.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/device_allocator.h" "$(@D)/cuda/include/thrust/device_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_delete.h" "$(@D)/cuda/include/thrust/device_delete.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_free.h" "$(@D)/cuda/include/thrust/device_free.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_malloc.h" "$(@D)/cuda/include/thrust/device_malloc.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_malloc_allocator.h" "$(@D)/cuda/include/thrust/device_malloc_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_new.h" "$(@D)/cuda/include/thrust/device_new.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_new_allocator.h" "$(@D)/cuda/include/thrust/device_new_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_ptr.h" "$(@D)/cuda/include/thrust/device_ptr.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_reference.h" "$(@D)/cuda/include/thrust/device_reference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_vector.h" "$(@D)/cuda/include/thrust/device_vector.h" && cp -f "/usr/local/cuda-9.0/include/thrust/distance.h" "$(@D)/cuda/include/thrust/distance.h" && cp -f "/usr/local/cuda-9.0/include/thrust/equal.h" "$(@D)/cuda/include/thrust/equal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/execution_policy.h" "$(@D)/cuda/include/thrust/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/extrema.h" "$(@D)/cuda/include/thrust/extrema.h" && cp -f "/usr/local/cuda-9.0/include/thrust/fill.h" "$(@D)/cuda/include/thrust/fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/find.h" "$(@D)/cuda/include/thrust/find.h" && cp -f "/usr/local/cuda-9.0/include/thrust/for_each.h" "$(@D)/cuda/include/thrust/for_each.h" && cp -f "/usr/local/cuda-9.0/include/thrust/functional.h" "$(@D)/cuda/include/thrust/functional.h" && cp -f "/usr/local/cuda-9.0/include/thrust/gather.h" "$(@D)/cuda/include/thrust/gather.h" && cp -f "/usr/local/cuda-9.0/include/thrust/generate.h" "$(@D)/cuda/include/thrust/generate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/host_vector.h" "$(@D)/cuda/include/thrust/host_vector.h" && cp -f "/usr/local/cuda-9.0/include/thrust/inner_product.h" "$(@D)/cuda/include/thrust/inner_product.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/constant_iterator.h" "$(@D)/cuda/include/thrust/iterator/constant_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/counting_iterator.h" "$(@D)/cuda/include/thrust/iterator/counting_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/any_assign.h" "$(@D)/cuda/include/thrust/iterator/detail/any_assign.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/any_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/any_system_tag.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/constant_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/constant_iterator_base.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/counting_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/counting_iterator.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/device_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/device_system_tag.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/discard_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/discard_iterator_base.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/distance_from_result.h" "$(@D)/cuda/include/thrust/iterator/detail/distance_from_result.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/host_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/host_system_tag.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/is_iterator_category.h" "$(@D)/cuda/include/thrust/iterator/detail/is_iterator_category.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/is_trivial_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/is_trivial_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_adaptor_base.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_adaptor_base.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_category_to_system.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_to_system.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_category_to_traversal.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_to_traversal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_facade_category.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_facade_category.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_traits.inl" "$(@D)/cuda/include/thrust/iterator/detail/iterator_traits.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_traversal_tags.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_traversal_tags.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/join_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/join_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/minimum_category.h" "$(@D)/cuda/include/thrust/iterator/detail/minimum_category.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/minimum_system.h" "$(@D)/cuda/include/thrust/iterator/detail/minimum_system.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/normal_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/normal_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/permutation_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/permutation_iterator_base.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/retag.h" "$(@D)/cuda/include/thrust/iterator/detail/retag.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/reverse_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/reverse_iterator.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/reverse_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/reverse_iterator_base.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/tagged_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/tagged_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/transform_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/transform_iterator.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/transform_output_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/transform_output_iterator.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/tuple_of_iterator_references.h" "$(@D)/cuda/include/thrust/iterator/detail/tuple_of_iterator_references.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/universal_categories.h" "$(@D)/cuda/include/thrust/iterator/detail/universal_categories.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/zip_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/zip_iterator.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/zip_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/zip_iterator_base.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/discard_iterator.h" "$(@D)/cuda/include/thrust/iterator/discard_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/iterator_adaptor.h" "$(@D)/cuda/include/thrust/iterator/iterator_adaptor.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/iterator_categories.h" "$(@D)/cuda/include/thrust/iterator/iterator_categories.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/iterator_facade.h" "$(@D)/cuda/include/thrust/iterator/iterator_facade.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/iterator_traits.h" "$(@D)/cuda/include/thrust/iterator/iterator_traits.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/permutation_iterator.h" "$(@D)/cuda/include/thrust/iterator/permutation_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/retag.h" "$(@D)/cuda/include/thrust/iterator/retag.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/reverse_iterator.h" "$(@D)/cuda/include/thrust/iterator/reverse_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/transform_iterator.h" "$(@D)/cuda/include/thrust/iterator/transform_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/transform_output_iterator.h" "$(@D)/cuda/include/thrust/iterator/transform_output_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/zip_iterator.h" "$(@D)/cuda/include/thrust/iterator/zip_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/logical.h" "$(@D)/cuda/include/thrust/logical.h" && cp -f "/usr/local/cuda-9.0/include/thrust/memory.h" "$(@D)/cuda/include/thrust/memory.h" && cp -f "/usr/local/cuda-9.0/include/thrust/merge.h" "$(@D)/cuda/include/thrust/merge.h" && cp -f "/usr/local/cuda-9.0/include/thrust/mismatch.h" "$(@D)/cuda/include/thrust/mismatch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/pair.h" "$(@D)/cuda/include/thrust/pair.h" && cp -f "/usr/local/cuda-9.0/include/thrust/partition.h" "$(@D)/cuda/include/thrust/partition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random.h" "$(@D)/cuda/include/thrust/random.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/discard_block_engine.inl" "$(@D)/cuda/include/thrust/random/detail/discard_block_engine.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/linear_congruential_engine.inl" "$(@D)/cuda/include/thrust/random/detail/linear_congruential_engine.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/linear_congruential_engine_discard.h" "$(@D)/cuda/include/thrust/random/detail/linear_congruential_engine_discard.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/linear_feedback_shift_engine.inl" "$(@D)/cuda/include/thrust/random/detail/linear_feedback_shift_engine.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h" "$(@D)/cuda/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/mod.h" "$(@D)/cuda/include/thrust/random/detail/mod.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/normal_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/normal_distribution.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/normal_distribution_base.h" "$(@D)/cuda/include/thrust/random/detail/normal_distribution_base.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/random_core_access.h" "$(@D)/cuda/include/thrust/random/detail/random_core_access.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/subtract_with_carry_engine.inl" "$(@D)/cuda/include/thrust/random/detail/subtract_with_carry_engine.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/uniform_int_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/uniform_int_distribution.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/uniform_real_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/uniform_real_distribution.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/xor_combine_engine.inl" "$(@D)/cuda/include/thrust/random/detail/xor_combine_engine.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/xor_combine_engine_max.h" "$(@D)/cuda/include/thrust/random/detail/xor_combine_engine_max.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/discard_block_engine.h" "$(@D)/cuda/include/thrust/random/discard_block_engine.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/linear_congruential_engine.h" "$(@D)/cuda/include/thrust/random/linear_congruential_engine.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/linear_feedback_shift_engine.h" "$(@D)/cuda/include/thrust/random/linear_feedback_shift_engine.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/normal_distribution.h" "$(@D)/cuda/include/thrust/random/normal_distribution.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/subtract_with_carry_engine.h" "$(@D)/cuda/include/thrust/random/subtract_with_carry_engine.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/uniform_int_distribution.h" "$(@D)/cuda/include/thrust/random/uniform_int_distribution.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/uniform_real_distribution.h" "$(@D)/cuda/include/thrust/random/uniform_real_distribution.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/xor_combine_engine.h" "$(@D)/cuda/include/thrust/random/xor_combine_engine.h" && cp -f "/usr/local/cuda-9.0/include/thrust/reduce.h" "$(@D)/cuda/include/thrust/reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/remove.h" "$(@D)/cuda/include/thrust/remove.h" && cp -f "/usr/local/cuda-9.0/include/thrust/replace.h" "$(@D)/cuda/include/thrust/replace.h" && cp -f "/usr/local/cuda-9.0/include/thrust/reverse.h" "$(@D)/cuda/include/thrust/reverse.h" && cp -f "/usr/local/cuda-9.0/include/thrust/scan.h" "$(@D)/cuda/include/thrust/scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/scatter.h" "$(@D)/cuda/include/thrust/scatter.h" && cp -f "/usr/local/cuda-9.0/include/thrust/sequence.h" "$(@D)/cuda/include/thrust/sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/set_operations.h" "$(@D)/cuda/include/thrust/set_operations.h" && cp -f "/usr/local/cuda-9.0/include/thrust/sort.h" "$(@D)/cuda/include/thrust/sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/swap.h" "$(@D)/cuda/include/thrust/swap.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/cpp/detail/adjacent_difference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/cpp/detail/assign_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/cpp/detail/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/copy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/cpp/detail/copy_if.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/count.h" "$(@D)/cuda/include/thrust/system/cpp/detail/count.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/equal.h" "$(@D)/cuda/include/thrust/system/cpp/detail/equal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/extrema.h" "$(@D)/cuda/include/thrust/system/cpp/detail/extrema.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/fill.h" "$(@D)/cuda/include/thrust/system/cpp/detail/fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/find.h" "$(@D)/cuda/include/thrust/system/cpp/detail/find.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/for_each.h" "$(@D)/cuda/include/thrust/system/cpp/detail/for_each.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/gather.h" "$(@D)/cuda/include/thrust/system/cpp/detail/gather.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/generate.h" "$(@D)/cuda/include/thrust/system/cpp/detail/generate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/get_value.h" "$(@D)/cuda/include/thrust/system/cpp/detail/get_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/cpp/detail/inner_product.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/cpp/detail/iter_swap.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/logical.h" "$(@D)/cuda/include/thrust/system/cpp/detail/logical.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/cpp/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/memory.inl" "$(@D)/cuda/include/thrust/system/cpp/detail/memory.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/merge.h" "$(@D)/cuda/include/thrust/system/cpp/detail/merge.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/cpp/detail/mismatch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/par.h" "$(@D)/cuda/include/thrust/system/cpp/detail/par.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/partition.h" "$(@D)/cuda/include/thrust/system/cpp/detail/partition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/reduce.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reduce_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/remove.h" "$(@D)/cuda/include/thrust/system/cpp/detail/remove.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/replace.h" "$(@D)/cuda/include/thrust/system/cpp/detail/replace.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/reverse.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reverse.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/scan.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scan_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/scatter.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scatter.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/sequence.h" "$(@D)/cuda/include/thrust/system/cpp/detail/sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/cpp/detail/set_operations.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/sort.h" "$(@D)/cuda/include/thrust/system/cpp/detail/sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/cpp/detail/swap_ranges.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/cpp/detail/tabulate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/cpp/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/transform.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform_reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform_scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/uninitialized_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/cpp/detail/uninitialized_fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/unique.h" "$(@D)/cuda/include/thrust/system/cpp/detail/unique.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/unique_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/vector.inl" "$(@D)/cuda/include/thrust/system/cpp/detail/vector.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/execution_policy.h" "$(@D)/cuda/include/thrust/system/cpp/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/memory.h" "$(@D)/cuda/include/thrust/system/cpp/memory.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/vector.h" "$(@D)/cuda/include/thrust/system/cpp/vector.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/config.h" "$(@D)/cuda/include/thrust/system/cuda/config.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/cuda/detail/adjacent_difference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/cuda/detail/assign_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/cuda/detail/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/copy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/cuda/detail/copy_if.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/agent_launcher.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/agent_launcher.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/alignment.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/alignment.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/triple_chevron_launch.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/triple_chevron_launch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/util.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/util.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/count.h" "$(@D)/cuda/include/thrust/system/cuda/detail/count.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cross_system.h" "$(@D)/cuda/include/thrust/system/cuda/detail/cross_system.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_csrt.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_csrt.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_row_based.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_row_based.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_load.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_load.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_scan.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_store.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_store.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/cub.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/cub.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_partition.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_partition.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_scan.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_select.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_select.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_csrt.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_csrt.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_row_based.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_row_based.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/host/mutex.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/host/mutex.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_allocator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_allocator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_arch.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_arch.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_debug.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_debug.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_device.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_device.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_macro.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_macro.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_namespace.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_namespace.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_ptx.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_ptx.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_type.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_type.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/equal.h" "$(@D)/cuda/include/thrust/system/cuda/detail/equal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/error.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/error.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/extrema.h" "$(@D)/cuda/include/thrust/system/cuda/detail/extrema.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/fill.h" "$(@D)/cuda/include/thrust/system/cuda/detail/fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/find.h" "$(@D)/cuda/include/thrust/system/cuda/detail/find.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/for_each.h" "$(@D)/cuda/include/thrust/system/cuda/detail/for_each.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/gather.h" "$(@D)/cuda/include/thrust/system/cuda/detail/gather.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/generate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/generate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/get_value.h" "$(@D)/cuda/include/thrust/system/cuda/detail/get_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h" "$(@D)/cuda/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/guarded_driver_types.h" "$(@D)/cuda/include/thrust/system/cuda/detail/guarded_driver_types.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/cuda/detail/inner_product.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/internal/copy_cross_system.h" "$(@D)/cuda/include/thrust/system/cuda/detail/internal/copy_cross_system.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/internal/copy_device_to_device.h" "$(@D)/cuda/include/thrust/system/cuda/detail/internal/copy_device_to_device.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/cuda/detail/iter_swap.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/logical.h" "$(@D)/cuda/include/thrust/system/cuda/detail/logical.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/cuda/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/memory.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/memory.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/memory_buffer.h" "$(@D)/cuda/include/thrust/system/cuda/detail/memory_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/merge.h" "$(@D)/cuda/include/thrust/system/cuda/detail/merge.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/cuda/detail/mismatch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/par.h" "$(@D)/cuda/include/thrust/system/cuda/detail/par.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/par_to_seq.h" "$(@D)/cuda/include/thrust/system/cuda/detail/par_to_seq.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/parallel_for.h" "$(@D)/cuda/include/thrust/system/cuda/detail/parallel_for.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/partition.h" "$(@D)/cuda/include/thrust/system/cuda/detail/partition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/reduce.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/remove.h" "$(@D)/cuda/include/thrust/system/cuda/detail/remove.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/replace.h" "$(@D)/cuda/include/thrust/system/cuda/detail/replace.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/reverse.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reverse.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/scan.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scan_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/scatter.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scatter.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/sequence.h" "$(@D)/cuda/include/thrust/system/cuda/detail/sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/cuda/detail/set_operations.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/sort.h" "$(@D)/cuda/include/thrust/system/cuda/detail/sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/cuda/detail/swap_ranges.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/tabulate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/cuda/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/terminate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/terminate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/transform.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform_reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform_scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/uninitialized_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/cuda/detail/uninitialized_fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/unique.h" "$(@D)/cuda/include/thrust/system/cuda/detail/unique.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/unique_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/util.h" "$(@D)/cuda/include/thrust/system/cuda/detail/util.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/vector.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/vector.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/error.h" "$(@D)/cuda/include/thrust/system/cuda/error.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/execution_policy.h" "$(@D)/cuda/include/thrust/system/cuda/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/experimental/pinned_allocator.h" "$(@D)/cuda/include/thrust/system/cuda/experimental/pinned_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/memory.h" "$(@D)/cuda/include/thrust/system/cuda/memory.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/vector.h" "$(@D)/cuda/include/thrust/system/cuda/vector.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/adl/adjacent_difference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/assign_value.h" "$(@D)/cuda/include/thrust/system/detail/adl/assign_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/adl/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/copy.h" "$(@D)/cuda/include/thrust/system/detail/adl/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/adl/copy_if.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/count.h" "$(@D)/cuda/include/thrust/system/detail/adl/count.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/equal.h" "$(@D)/cuda/include/thrust/system/detail/adl/equal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/extrema.h" "$(@D)/cuda/include/thrust/system/detail/adl/extrema.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/fill.h" "$(@D)/cuda/include/thrust/system/detail/adl/fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/find.h" "$(@D)/cuda/include/thrust/system/detail/adl/find.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/for_each.h" "$(@D)/cuda/include/thrust/system/detail/adl/for_each.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/gather.h" "$(@D)/cuda/include/thrust/system/detail/adl/gather.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/generate.h" "$(@D)/cuda/include/thrust/system/detail/adl/generate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/get_value.h" "$(@D)/cuda/include/thrust/system/detail/adl/get_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/adl/inner_product.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/iter_swap.h" "$(@D)/cuda/include/thrust/system/detail/adl/iter_swap.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/logical.h" "$(@D)/cuda/include/thrust/system/detail/adl/logical.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/detail/adl/malloc_and_free.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/merge.h" "$(@D)/cuda/include/thrust/system/detail/adl/merge.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/adl/mismatch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/partition.h" "$(@D)/cuda/include/thrust/system/detail/adl/partition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/reduce.h" "$(@D)/cuda/include/thrust/system/detail/adl/reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/reduce_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/remove.h" "$(@D)/cuda/include/thrust/system/detail/adl/remove.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/replace.h" "$(@D)/cuda/include/thrust/system/detail/adl/replace.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/reverse.h" "$(@D)/cuda/include/thrust/system/detail/adl/reverse.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/scan.h" "$(@D)/cuda/include/thrust/system/detail/adl/scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/scan_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/scatter.h" "$(@D)/cuda/include/thrust/system/detail/adl/scatter.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/sequence.h" "$(@D)/cuda/include/thrust/system/detail/adl/sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/adl/set_operations.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/sort.h" "$(@D)/cuda/include/thrust/system/detail/adl/sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/adl/swap_ranges.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/adl/tabulate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/adl/temporary_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/transform.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform_reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform_scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/adl/uninitialized_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/adl/uninitialized_fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/unique.h" "$(@D)/cuda/include/thrust/system/detail/adl/unique.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/unique_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/bad_alloc.h" "$(@D)/cuda/include/thrust/system/detail/bad_alloc.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/errno.h" "$(@D)/cuda/include/thrust/system/detail/errno.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/error_category.inl" "$(@D)/cuda/include/thrust/system/detail/error_category.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/error_code.inl" "$(@D)/cuda/include/thrust/system/detail/error_code.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/error_condition.inl" "$(@D)/cuda/include/thrust/system/detail/error_condition.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/generic/adjacent_difference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/adjacent_difference.inl" "$(@D)/cuda/include/thrust/system/detail/generic/adjacent_difference.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/advance.h" "$(@D)/cuda/include/thrust/system/detail/generic/advance.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/advance.inl" "$(@D)/cuda/include/thrust/system/detail/generic/advance.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/generic/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/binary_search.inl" "$(@D)/cuda/include/thrust/system/detail/generic/binary_search.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy.h" "$(@D)/cuda/include/thrust/system/detail/generic/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy.inl" "$(@D)/cuda/include/thrust/system/detail/generic/copy.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/generic/copy_if.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy_if.inl" "$(@D)/cuda/include/thrust/system/detail/generic/copy_if.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/count.h" "$(@D)/cuda/include/thrust/system/detail/generic/count.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/count.inl" "$(@D)/cuda/include/thrust/system/detail/generic/count.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/distance.h" "$(@D)/cuda/include/thrust/system/detail/generic/distance.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/distance.inl" "$(@D)/cuda/include/thrust/system/detail/generic/distance.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/equal.h" "$(@D)/cuda/include/thrust/system/detail/generic/equal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/equal.inl" "$(@D)/cuda/include/thrust/system/detail/generic/equal.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/extrema.h" "$(@D)/cuda/include/thrust/system/detail/generic/extrema.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/extrema.inl" "$(@D)/cuda/include/thrust/system/detail/generic/extrema.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/fill.h" "$(@D)/cuda/include/thrust/system/detail/generic/fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/find.h" "$(@D)/cuda/include/thrust/system/detail/generic/find.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/find.inl" "$(@D)/cuda/include/thrust/system/detail/generic/find.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/for_each.h" "$(@D)/cuda/include/thrust/system/detail/generic/for_each.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/gather.h" "$(@D)/cuda/include/thrust/system/detail/generic/gather.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/gather.inl" "$(@D)/cuda/include/thrust/system/detail/generic/gather.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/generate.h" "$(@D)/cuda/include/thrust/system/detail/generic/generate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/generate.inl" "$(@D)/cuda/include/thrust/system/detail/generic/generate.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/generic/inner_product.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/inner_product.inl" "$(@D)/cuda/include/thrust/system/detail/generic/inner_product.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/logical.h" "$(@D)/cuda/include/thrust/system/detail/generic/logical.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/memory.h" "$(@D)/cuda/include/thrust/system/detail/generic/memory.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/memory.inl" "$(@D)/cuda/include/thrust/system/detail/generic/memory.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/merge.h" "$(@D)/cuda/include/thrust/system/detail/generic/merge.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/merge.inl" "$(@D)/cuda/include/thrust/system/detail/generic/merge.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/generic/mismatch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/mismatch.inl" "$(@D)/cuda/include/thrust/system/detail/generic/mismatch.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/partition.h" "$(@D)/cuda/include/thrust/system/detail/generic/partition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/partition.inl" "$(@D)/cuda/include/thrust/system/detail/generic/partition.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce.h" "$(@D)/cuda/include/thrust/system/detail/generic/reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reduce.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/reduce_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reduce_by_key.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/remove.h" "$(@D)/cuda/include/thrust/system/detail/generic/remove.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/remove.inl" "$(@D)/cuda/include/thrust/system/detail/generic/remove.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/replace.h" "$(@D)/cuda/include/thrust/system/detail/generic/replace.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/replace.inl" "$(@D)/cuda/include/thrust/system/detail/generic/replace.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reverse.h" "$(@D)/cuda/include/thrust/system/detail/generic/reverse.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reverse.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reverse.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scalar/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/generic/scalar/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scalar/binary_search.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scalar/binary_search.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan.h" "$(@D)/cuda/include/thrust/system/detail/generic/scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scan.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/scan_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scan_by_key.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scatter.h" "$(@D)/cuda/include/thrust/system/detail/generic/scatter.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scatter.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scatter.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/select_system.h" "$(@D)/cuda/include/thrust/system/detail/generic/select_system.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sequence.h" "$(@D)/cuda/include/thrust/system/detail/generic/sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sequence.inl" "$(@D)/cuda/include/thrust/system/detail/generic/sequence.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/generic/set_operations.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/set_operations.inl" "$(@D)/cuda/include/thrust/system/detail/generic/set_operations.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sort.h" "$(@D)/cuda/include/thrust/system/detail/generic/sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sort.inl" "$(@D)/cuda/include/thrust/system/detail/generic/sort.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/generic/swap_ranges.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/swap_ranges.inl" "$(@D)/cuda/include/thrust/system/detail/generic/swap_ranges.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/generic/tabulate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/tabulate.inl" "$(@D)/cuda/include/thrust/system/detail/generic/tabulate.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/tag.h" "$(@D)/cuda/include/thrust/system/detail/generic/tag.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/generic/temporary_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/temporary_buffer.inl" "$(@D)/cuda/include/thrust/system/detail/generic/temporary_buffer.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform_reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_reduce.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform_reduce.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform_scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_scan.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform_scan.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/type_traits.h" "$(@D)/cuda/include/thrust/system/detail/generic/type_traits.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_copy.inl" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_copy.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_fill.inl" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_fill.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique.h" "$(@D)/cuda/include/thrust/system/detail/generic/unique.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique.inl" "$(@D)/cuda/include/thrust/system/detail/generic/unique.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/unique_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/unique_by_key.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/internal/decompose.h" "$(@D)/cuda/include/thrust/system/detail/internal/decompose.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/sequential/adjacent_difference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/assign_value.h" "$(@D)/cuda/include/thrust/system/detail/sequential/assign_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/sequential/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/copy.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy_backward.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy_backward.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy_if.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/count.h" "$(@D)/cuda/include/thrust/system/detail/sequential/count.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/equal.h" "$(@D)/cuda/include/thrust/system/detail/sequential/equal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/execution_policy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/extrema.h" "$(@D)/cuda/include/thrust/system/detail/sequential/extrema.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/fill.h" "$(@D)/cuda/include/thrust/system/detail/sequential/fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/find.h" "$(@D)/cuda/include/thrust/system/detail/sequential/find.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/for_each.h" "$(@D)/cuda/include/thrust/system/detail/sequential/for_each.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/gather.h" "$(@D)/cuda/include/thrust/system/detail/sequential/gather.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/general_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/general_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/generate.h" "$(@D)/cuda/include/thrust/system/detail/sequential/generate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/get_value.h" "$(@D)/cuda/include/thrust/system/detail/sequential/get_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/sequential/inner_product.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/insertion_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/insertion_sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/iter_swap.h" "$(@D)/cuda/include/thrust/system/detail/sequential/iter_swap.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/logical.h" "$(@D)/cuda/include/thrust/system/detail/sequential/logical.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/detail/sequential/malloc_and_free.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/merge.h" "$(@D)/cuda/include/thrust/system/detail/sequential/merge.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/merge.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/merge.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/sequential/mismatch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/partition.h" "$(@D)/cuda/include/thrust/system/detail/sequential/partition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/reduce.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reduce_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/remove.h" "$(@D)/cuda/include/thrust/system/detail/sequential/remove.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/replace.h" "$(@D)/cuda/include/thrust/system/detail/sequential/replace.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/reverse.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reverse.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/scan.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scan_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/scatter.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scatter.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/sequence.h" "$(@D)/cuda/include/thrust/system/detail/sequential/sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/sequential/set_operations.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/sort.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_merge_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_merge_sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_merge_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_merge_sort.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_primitive_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_primitive_sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_primitive_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_primitive_sort.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_radix_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_radix_sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_radix_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_radix_sort.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/sequential/swap_ranges.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/sequential/tabulate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/sequential/temporary_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/transform.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform_reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform_scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/trivial_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/trivial_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/uninitialized_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/sequential/uninitialized_fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/unique.h" "$(@D)/cuda/include/thrust/system/detail/sequential/unique.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/unique_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/system_error.inl" "$(@D)/cuda/include/thrust/system/detail/system_error.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/error_code.h" "$(@D)/cuda/include/thrust/system/error_code.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/omp/detail/adjacent_difference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/omp/detail/assign_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/omp/detail/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy.h" "$(@D)/cuda/include/thrust/system/omp/detail/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy.inl" "$(@D)/cuda/include/thrust/system/omp/detail/copy.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/omp/detail/copy_if.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy_if.inl" "$(@D)/cuda/include/thrust/system/omp/detail/copy_if.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/count.h" "$(@D)/cuda/include/thrust/system/omp/detail/count.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/default_decomposition.h" "$(@D)/cuda/include/thrust/system/omp/detail/default_decomposition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/default_decomposition.inl" "$(@D)/cuda/include/thrust/system/omp/detail/default_decomposition.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/equal.h" "$(@D)/cuda/include/thrust/system/omp/detail/equal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/omp/detail/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/extrema.h" "$(@D)/cuda/include/thrust/system/omp/detail/extrema.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/fill.h" "$(@D)/cuda/include/thrust/system/omp/detail/fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/find.h" "$(@D)/cuda/include/thrust/system/omp/detail/find.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/for_each.h" "$(@D)/cuda/include/thrust/system/omp/detail/for_each.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/for_each.inl" "$(@D)/cuda/include/thrust/system/omp/detail/for_each.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/gather.h" "$(@D)/cuda/include/thrust/system/omp/detail/gather.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/generate.h" "$(@D)/cuda/include/thrust/system/omp/detail/generate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/get_value.h" "$(@D)/cuda/include/thrust/system/omp/detail/get_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/omp/detail/inner_product.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/omp/detail/iter_swap.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/logical.h" "$(@D)/cuda/include/thrust/system/omp/detail/logical.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/omp/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/memory.inl" "$(@D)/cuda/include/thrust/system/omp/detail/memory.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/merge.h" "$(@D)/cuda/include/thrust/system/omp/detail/merge.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/omp/detail/mismatch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/par.h" "$(@D)/cuda/include/thrust/system/omp/detail/par.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/partition.h" "$(@D)/cuda/include/thrust/system/omp/detail/partition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/partition.inl" "$(@D)/cuda/include/thrust/system/omp/detail/partition.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_by_key.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_intervals.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_intervals.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_intervals.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_intervals.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/remove.h" "$(@D)/cuda/include/thrust/system/omp/detail/remove.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/remove.inl" "$(@D)/cuda/include/thrust/system/omp/detail/remove.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/replace.h" "$(@D)/cuda/include/thrust/system/omp/detail/replace.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reverse.h" "$(@D)/cuda/include/thrust/system/omp/detail/reverse.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/scan.h" "$(@D)/cuda/include/thrust/system/omp/detail/scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/scan_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/scatter.h" "$(@D)/cuda/include/thrust/system/omp/detail/scatter.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/sequence.h" "$(@D)/cuda/include/thrust/system/omp/detail/sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/omp/detail/set_operations.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/sort.h" "$(@D)/cuda/include/thrust/system/omp/detail/sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/sort.inl" "$(@D)/cuda/include/thrust/system/omp/detail/sort.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/omp/detail/swap_ranges.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/omp/detail/tabulate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/omp/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/transform.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform_reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform_scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/omp/detail/uninitialized_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/omp/detail/uninitialized_fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique.h" "$(@D)/cuda/include/thrust/system/omp/detail/unique.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique.inl" "$(@D)/cuda/include/thrust/system/omp/detail/unique.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/unique_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/omp/detail/unique_by_key.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/vector.inl" "$(@D)/cuda/include/thrust/system/omp/detail/vector.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/execution_policy.h" "$(@D)/cuda/include/thrust/system/omp/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/memory.h" "$(@D)/cuda/include/thrust/system/omp/memory.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/vector.h" "$(@D)/cuda/include/thrust/system/omp/vector.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/system_error.h" "$(@D)/cuda/include/thrust/system/system_error.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/tbb/detail/adjacent_difference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/tbb/detail/assign_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/tbb/detail/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/copy.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/tbb/detail/copy_if.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy_if.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/copy_if.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/count.h" "$(@D)/cuda/include/thrust/system/tbb/detail/count.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/equal.h" "$(@D)/cuda/include/thrust/system/tbb/detail/equal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/extrema.h" "$(@D)/cuda/include/thrust/system/tbb/detail/extrema.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/fill.h" "$(@D)/cuda/include/thrust/system/tbb/detail/fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/find.h" "$(@D)/cuda/include/thrust/system/tbb/detail/find.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/for_each.h" "$(@D)/cuda/include/thrust/system/tbb/detail/for_each.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/for_each.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/for_each.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/gather.h" "$(@D)/cuda/include/thrust/system/tbb/detail/gather.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/generate.h" "$(@D)/cuda/include/thrust/system/tbb/detail/generate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/get_value.h" "$(@D)/cuda/include/thrust/system/tbb/detail/get_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/tbb/detail/inner_product.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/tbb/detail/iter_swap.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/logical.h" "$(@D)/cuda/include/thrust/system/tbb/detail/logical.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/tbb/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/memory.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/memory.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/merge.h" "$(@D)/cuda/include/thrust/system/tbb/detail/merge.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/merge.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/merge.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/tbb/detail/mismatch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/par.h" "$(@D)/cuda/include/thrust/system/tbb/detail/par.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/partition.h" "$(@D)/cuda/include/thrust/system/tbb/detail/partition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/partition.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/partition.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_by_key.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce_intervals.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_intervals.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/remove.h" "$(@D)/cuda/include/thrust/system/tbb/detail/remove.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/remove.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/remove.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/replace.h" "$(@D)/cuda/include/thrust/system/tbb/detail/replace.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reverse.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reverse.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scan.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scan.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/scan.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scan_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scatter.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scatter.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/sequence.h" "$(@D)/cuda/include/thrust/system/tbb/detail/sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/tbb/detail/set_operations.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/sort.h" "$(@D)/cuda/include/thrust/system/tbb/detail/sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/sort.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/sort.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/tbb/detail/swap_ranges.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/tbb/detail/tabulate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/tbb/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/transform.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform_reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform_scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/uninitialized_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/tbb/detail/uninitialized_fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique.h" "$(@D)/cuda/include/thrust/system/tbb/detail/unique.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/unique.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/unique_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/unique_by_key.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/vector.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/vector.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/execution_policy.h" "$(@D)/cuda/include/thrust/system/tbb/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/memory.h" "$(@D)/cuda/include/thrust/system/tbb/memory.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/vector.h" "$(@D)/cuda/include/thrust/system/tbb/vector.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system_error.h" "$(@D)/cuda/include/thrust/system_error.h" && cp -f "/usr/local/cuda-9.0/include/thrust/tabulate.h" "$(@D)/cuda/include/thrust/tabulate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/transform.h" "$(@D)/cuda/include/thrust/transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/transform_reduce.h" "$(@D)/cuda/include/thrust/transform_reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/transform_scan.h" "$(@D)/cuda/include/thrust/transform_scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/tuple.h" "$(@D)/cuda/include/thrust/tuple.h" && cp -f "/usr/local/cuda-9.0/include/thrust/uninitialized_copy.h" "$(@D)/cuda/include/thrust/uninitialized_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/uninitialized_fill.h" "$(@D)/cuda/include/thrust/uninitialized_fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/unique.h" "$(@D)/cuda/include/thrust/unique.h" && cp -f "/usr/local/cuda-9.0/include/thrust/version.h" "$(@D)/cuda/include/thrust/version.h" && cp -f "/usr/local/cuda-9.0/include/vector_functions.h" "$(@D)/cuda/include/vector_functions.h" && cp -f "/usr/local/cuda-9.0/include/vector_functions.hpp" "$(@D)/cuda/include/vector_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/vector_types.h" "$(@D)/cuda/include/vector_types.h"
-   """,
+    cmd = """cp -rf "/usr/local/cuda-9.0/include/." "$(@D)/cuda/include/" """,
 )
 
 genrule(
@@ -1197,9 +1195,7 @@ genrule(
     outs = [
         "cuda/nvvm/libdevice/libdevice.10.bc",
     ],
-    cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/local/cuda-9.0/nvvm/libdevice/libdevice.10.bc" "$(@D)//libdevice.10.bc"
-   """,
+    cmd = """cp -rf "/usr/local/cuda-9.0/nvvm/libdevice/." "$(@D)/" """,
 )
 
 genrule(
@@ -1234,9 +1230,7 @@ genrule(
         "cuda/extras/CUPTI/include/generated_nvtx_meta.h",
         "cuda/extras/CUPTI/include/openacc/cupti_openacc.h",
     ],
-    cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/gl.h" "$(@D)/cuda/extras/CUPTI/include/GL/gl.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glew.h" "$(@D)/cuda/extras/CUPTI/include/GL/glew.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glext.h" "$(@D)/cuda/extras/CUPTI/include/GL/glext.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glu.h" "$(@D)/cuda/extras/CUPTI/include/GL/glu.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glut.h" "$(@D)/cuda/extras/CUPTI/include/GL/glut.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glx.h" "$(@D)/cuda/extras/CUPTI/include/GL/glx.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glxext.h" "$(@D)/cuda/extras/CUPTI/include/GL/glxext.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/wglew.h" "$(@D)/cuda/extras/CUPTI/include/GL/wglew.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/wglext.h" "$(@D)/cuda/extras/CUPTI/include/GL/wglext.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cuda_stdint.h" "$(@D)/cuda/extras/CUPTI/include/cuda_stdint.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti.h" "$(@D)/cuda/extras/CUPTI/include/cupti.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_activity.h" "$(@D)/cuda/extras/CUPTI/include/cupti_activity.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_callbacks.h" "$(@D)/cuda/extras/CUPTI/include/cupti_callbacks.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_driver_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_driver_cbid.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_events.h" "$(@D)/cuda/extras/CUPTI/include/cupti_events.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_metrics.h" "$(@D)/cuda/extras/CUPTI/include/cupti_metrics.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_nvtx_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_nvtx_cbid.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_result.h" "$(@D)/cuda/extras/CUPTI/include/cupti_result.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_runtime_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_runtime_cbid.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_version.h" "$(@D)/cuda/extras/CUPTI/include/cupti_version.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cudaGL_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cudaGL_meta.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cudaVDPAU_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cudaVDPAU_meta.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_gl_interop_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_gl_interop_meta.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_meta.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_runtime_api_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_runtime_api_meta.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/generated_nvtx_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_nvtx_meta.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/openacc/cupti_openacc.h" "$(@D)/cuda/extras/CUPTI/include/openacc/cupti_openacc.h"
-   """,
+    cmd = """cp -rf "/usr/local/cuda-9.0/extras/CUPTI/include/." "$(@D)/cuda/extras/CUPTI/include/" """,
 )
 
 genrule(
@@ -1252,9 +1246,32 @@ genrule(
         "cuda/lib/libcudnn.so.7",
         "cuda/lib/libcupti.so.9.0",
     ],
-    cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs/libcuda.so" "$(@D)/cuda/lib/libcuda.so" && cp -f "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcudart.so.9.0.176" "$(@D)/cuda/lib/libcudart.so.9.0" && cp -f "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcudart_static.a" "$(@D)/cuda/lib/libcudart_static.a" && cp -f "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcublas.so.9.0.480" "$(@D)/cuda/lib/libcublas.so.9.0" && cp -f "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcusolver.so.9.0.176" "$(@D)/cuda/lib/libcusolver.so.9.0" && cp -f "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcurand.so.9.0.176" "$(@D)/cuda/lib/libcurand.so.9.0" && cp -f "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcufft.so.9.0.176" "$(@D)/cuda/lib/libcufft.so.9.0" && cp -f "/usr/lib/x86_64-linux-gnu/libcudnn.so.7.1.4" "$(@D)/cuda/lib/libcudnn.so.7" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/lib64/libcupti.so.9.0.176" "$(@D)/cuda/lib/libcupti.so.9.0"
-   """,
+    cmd = """cp -f "/usr/local/cuda-9.0/lib64/stubs/libcuda.so" $(location cuda/lib/libcuda.so) && cp -f "/usr/local/cuda-9.0/lib64/libcudart.so.9.0" $(location cuda/lib/libcudart.so.9.0) && cp -f "/usr/local/cuda-9.0/lib64/libcudart_static.a" $(location cuda/lib/libcudart_static.a) && cp -f "/usr/local/cuda-9.0/lib64/libcublas.so.9.0" $(location cuda/lib/libcublas.so.9.0) && cp -f "/usr/local/cuda-9.0/lib64/libcusolver.so.9.0" $(location cuda/lib/libcusolver.so.9.0) && cp -f "/usr/local/cuda-9.0/lib64/libcurand.so.9.0" $(location cuda/lib/libcurand.so.9.0) && cp -f "/usr/local/cuda-9.0/lib64/libcufft.so.9.0" $(location cuda/lib/libcufft.so.9.0) && cp -f "/usr/lib/x86_64-linux-gnu/libcudnn.so.7" $(location cuda/lib/libcudnn.so.7) && cp -f "/usr/local/cuda-9.0/extras/CUPTI/lib64/libcupti.so.9.0" $(location cuda/lib/libcupti.so.9.0) """,
+)
+
+genrule(
+    name = "cuda-bin",
+    outs = [
+        "cuda/bin/bin2c",
+        "cuda/bin/crt/link.stub",
+        "cuda/bin/crt/prelink.stub",
+        "cuda/bin/cuda-gdb",
+        "cuda/bin/cuda-gdbserver",
+        "cuda/bin/cuda-memcheck",
+        "cuda/bin/cudafe",
+        "cuda/bin/cudafe++",
+        "cuda/bin/cuobjdump",
+        "cuda/bin/fatbinary",
+        "cuda/bin/gpu-library-advisor",
+        "cuda/bin/nvcc",
+        "cuda/bin/nvcc.profile",
+        "cuda/bin/nvdisasm",
+        "cuda/bin/nvlink",
+        "cuda/bin/nvprof",
+        "cuda/bin/nvprune",
+        "cuda/bin/ptxas",
+    ],
+    cmd = """cp -rf "/usr/local/cuda-9.0/bin/." "$(@D)/cuda/bin/" """,
 )
 
 genrule(
@@ -1262,7 +1279,5 @@ genrule(
     outs = [
         "cuda/include/cudnn.h",
     ],
-    cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/include/cudnn.h" "$(@D)/cudnn.h"
-   """,
+    cmd = """cp -f "/usr/include/cudnn.h" $(location cuda/include/cudnn.h) """,
 )
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/BUILD b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/BUILD
index 6442e7628a416e3298cfd2579cee275459780145..2b84b761ff9fb0c8a803a8cae8d1f9b89c210008 100755
--- a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/BUILD
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/BUILD
@@ -22,6 +22,12 @@ cc_toolchain_suite(
         "local|compiler": ":cc-compiler-local",
         "darwin|compiler": ":cc-compiler-darwin",
         "x64_windows|msvc-cl": ":cc-compiler-windows",
+        "x64_windows": ":cc-compiler-windows",
+        "arm": ":cc-compiler-local",
+        "k8": ":cc-compiler-local",
+        "piii": ":cc-compiler-local",
+        "ppc": ":cc-compiler-local",
+        "darwin": ":cc-compiler-darwin",
     },
 )
 
@@ -41,6 +47,7 @@ cc_toolchain(
     # last on the command line and contain all shared libraries to link, so all
     # regular options will be left of them.
     supports_param_files = 1,
+    toolchain_identifier = "local_linux",
 )
 
 cc_toolchain(
@@ -55,6 +62,7 @@ cc_toolchain(
     static_runtime_libs = [":empty"],
     strip_files = ":empty",
     supports_param_files = 0,
+    toolchain_identifier = "local_darwin",
 )
 
 cc_toolchain(
@@ -69,6 +77,7 @@ cc_toolchain(
     static_runtime_libs = [":empty"],
     strip_files = ":empty",
     supports_param_files = 1,
+    toolchain_identifier = "local_windows",
 )
 
 filegroup(
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/CROSSTOOL b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/CROSSTOOL
index 1c2e8bcae63ebc9b1ee22b5d677c185589b547f8..b6b87e87d2bed20d2d6def051e7b7c32374189e9 100755
--- a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/CROSSTOOL
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/CROSSTOOL
@@ -2,31 +2,6 @@ major_version: "local"
 minor_version: ""
 default_target_cpu: "same_as_host"
 
-default_toolchain {
-  cpu: "k8"
-  toolchain_identifier: "local_linux"
-}
-default_toolchain {
-  cpu: "piii"
-  toolchain_identifier: "local_linux"
-}
-default_toolchain {
-  cpu: "arm"
-  toolchain_identifier: "local_linux"
-}
-default_toolchain {
-  cpu: "darwin"
-  toolchain_identifier: "local_darwin"
-}
-default_toolchain {
-  cpu: "ppc"
-  toolchain_identifier: "local_linux"
-}
-default_toolchain {
-  cpu: "x64_windows"
-  toolchain_identifier: "local_windows"
-}
-
 toolchain {
   abi_version: "local"
   abi_libc_version: "local"
@@ -664,6 +639,31 @@ toolchain {
     name: "no_legacy_features"
   }
 
+  # TODO(klimek): Previously we were using a .bat file to start python to run
+  # the python script that can redirect to nvcc - unfortunately .bat files
+  # have a rather short maximum length for command lines (8k). Instead, we
+  # now use the python binary as the compiler and pass the python script to
+  # it at the start of the command line. Investigate different possibilities
+  # to run the nvcc wrapper, either using pyinstaller --onefile, or writing
+  # a small C++ wrapper to redirect.
+  feature {
+    name: "redirector"
+    enabled: true
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-module-compile"
+      action: "c++-module-codegen"
+      action: "c++-header-parsing"
+      action: "assemble"
+      action: "preprocess-assemble"
+      flag_group {
+        flag: "-B"
+        flag: "external/local_config_cuda/crosstool/windows/msvc_wrapper_for_nvcc.py"
+      }
+    }
+  }
+
   # Suppress startup banner.
   feature {
     name: "nologo"
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/clang/bin/crosstool_wrapper_driver_is_not_gcc b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/clang/bin/crosstool_wrapper_driver_is_not_gcc
index 7ae59e9967adf9b1a980a8085e203459ba8a7c7b..c49b20f2eb9d9c8f9c66ca5096aad771377bf437 100755
--- a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/clang/bin/crosstool_wrapper_driver_is_not_gcc
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/clang/bin/crosstool_wrapper_driver_is_not_gcc
@@ -199,7 +199,7 @@ def InvokeNvcc(argv, log=False):
   srcs = ' '.join(src_files)
   out = ' -o ' + out_file[0]
 
-  supported_cuda_compute_capabilities = [ "3.0" ]
+  supported_cuda_compute_capabilities = [ "3.0", "6.0" ]
   nvccopts = '-D_FORCE_INLINES '
   for capability in supported_cuda_compute_capabilities:
     capability = capability.replace('.', '')
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py
index 00483951af966e0085e6f2b1d74290d9ee872963..510ba52fd5eaa328bd7514c86c1e5e2d7ddd7893 100755
--- a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py
@@ -36,7 +36,7 @@ GCC_HOST_COMPILER_PATH = ('/usr/bin/gcc')
 NVCC_PATH = '/usr/local/cuda-10.0/bin/nvcc'
 NVCC_VERSION = '10.0'
 NVCC_TEMP_DIR = "C:\\Windows\\Temp\\nvcc_inter_files_tmp_dir"
-supported_cuda_compute_capabilities = [ "3.0" ]
+supported_cuda_compute_capabilities = [ "3.0", "6.0" ]
 
 def Log(s):
   print('gpus/crosstool: {0}'.format(s))
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/windows/msvc_wrapper_for_nvcc.py b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/windows/msvc_wrapper_for_nvcc.py
index 859b3196d5dba9afadeae56f34be04247b00fe09..b0b4a53a805cba4e1be3b6b5438ca725a3599e78 100755
--- a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/windows/msvc_wrapper_for_nvcc.py
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/windows/msvc_wrapper_for_nvcc.py
@@ -103,8 +103,9 @@ def InvokeNvcc(argv, log=False):
     The return value of calling os.system('nvcc ' + args)
   """
 
-  src_files = [f for f in argv if
-               re.search('\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)]
+  src_files = [
+      f for f in argv if re.search(r'\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)
+  ]
   if len(src_files) == 0:
     raise Error('No source files found for cuda compilation.')
 
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc/clang/bin/crosstool_wrapper_driver_is_not_gcc b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc/clang/bin/crosstool_wrapper_driver_is_not_gcc
index 63893d3722f6b43579758e5f747076b1f1e73ed7..192314137d4f5ca178e350894550132d045d7a2b 100755
--- a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc/clang/bin/crosstool_wrapper_driver_is_not_gcc
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc/clang/bin/crosstool_wrapper_driver_is_not_gcc
@@ -49,9 +49,9 @@ import pipes
 CPU_COMPILER = ('/usr/bin/gcc')
 GCC_HOST_COMPILER_PATH = ('/usr/bin/gcc')
 
-NVCC_PATH = '/usr/local/cuda-9.0/bin/nvcc'
+NVCC_PATH = '/usr/local/cuda/bin/nvcc'
 PREFIX_DIR = os.path.dirname(GCC_HOST_COMPILER_PATH)
-NVCC_VERSION = '9.0'
+NVCC_VERSION = '10.0'
 
 def Log(s):
   print('gpus/crosstool: {0}'.format(s))
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc/windows/msvc_wrapper_for_nvcc.py b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc/windows/msvc_wrapper_for_nvcc.py
index 859b3196d5dba9afadeae56f34be04247b00fe09..b0b4a53a805cba4e1be3b6b5438ca725a3599e78 100755
--- a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc/windows/msvc_wrapper_for_nvcc.py
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc/windows/msvc_wrapper_for_nvcc.py
@@ -103,8 +103,9 @@ def InvokeNvcc(argv, log=False):
     The return value of calling os.system('nvcc ' + args)
   """
 
-  src_files = [f for f in argv if
-               re.search('\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)]
+  src_files = [
+      f for f in argv if re.search(r'\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)
+  ]
   if len(src_files) == 0:
     raise Error('No source files found for cuda compilation.')
 
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc7-nvcc-cuda10.0/BUILD b/third_party/toolchains/preconfig/ubuntu14.04/gcc7-nvcc-cuda10.0/BUILD
new file mode 100755
index 0000000000000000000000000000000000000000..2b84b761ff9fb0c8a803a8cae8d1f9b89c210008
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc7-nvcc-cuda10.0/BUILD
@@ -0,0 +1,96 @@
+licenses(["restricted"])
+
+package(default_visibility = ["//visibility:public"])
+
+toolchain(
+    name = "toolchain-linux-x86_64",
+    exec_compatible_with = [
+        "@bazel_tools//platforms:linux",
+        "@bazel_tools//platforms:x86_64",
+    ],
+    target_compatible_with = [
+        "@bazel_tools//platforms:linux",
+        "@bazel_tools//platforms:x86_64",
+    ],
+    toolchain = ":cc-compiler-local",
+    toolchain_type = "@bazel_tools//tools/cpp:toolchain_type",
+)
+
+cc_toolchain_suite(
+    name = "toolchain",
+    toolchains = {
+        "local|compiler": ":cc-compiler-local",
+        "darwin|compiler": ":cc-compiler-darwin",
+        "x64_windows|msvc-cl": ":cc-compiler-windows",
+        "x64_windows": ":cc-compiler-windows",
+        "arm": ":cc-compiler-local",
+        "k8": ":cc-compiler-local",
+        "piii": ":cc-compiler-local",
+        "ppc": ":cc-compiler-local",
+        "darwin": ":cc-compiler-darwin",
+    },
+)
+
+cc_toolchain(
+    name = "cc-compiler-local",
+    all_files = ":crosstool_wrapper_driver_is_not_gcc",
+    compiler_files = ":empty",
+    cpu = "local",
+    dwp_files = ":empty",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":crosstool_wrapper_driver_is_not_gcc",
+    objcopy_files = ":empty",
+    static_runtime_libs = [":empty"],
+    strip_files = ":empty",
+    # To support linker flags that need to go to the start of command line
+    # we need the toolchain to support parameter files. Parameter files are
+    # last on the command line and contain all shared libraries to link, so all
+    # regular options will be left of them.
+    supports_param_files = 1,
+    toolchain_identifier = "local_linux",
+)
+
+cc_toolchain(
+    name = "cc-compiler-darwin",
+    all_files = ":crosstool_wrapper_driver_is_not_gcc",
+    compiler_files = ":empty",
+    cpu = "darwin",
+    dwp_files = ":empty",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":crosstool_wrapper_driver_is_not_gcc",
+    objcopy_files = ":empty",
+    static_runtime_libs = [":empty"],
+    strip_files = ":empty",
+    supports_param_files = 0,
+    toolchain_identifier = "local_darwin",
+)
+
+cc_toolchain(
+    name = "cc-compiler-windows",
+    all_files = ":windows_msvc_wrapper_files",
+    compiler_files = ":empty",
+    cpu = "x64_windows",
+    dwp_files = ":empty",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":windows_msvc_wrapper_files",
+    objcopy_files = ":empty",
+    static_runtime_libs = [":empty"],
+    strip_files = ":empty",
+    supports_param_files = 1,
+    toolchain_identifier = "local_windows",
+)
+
+filegroup(
+    name = "empty",
+    srcs = [],
+)
+
+filegroup(
+    name = "crosstool_wrapper_driver_is_not_gcc",
+    srcs = ["clang/bin/crosstool_wrapper_driver_is_not_gcc"],
+)
+
+filegroup(
+    name = "windows_msvc_wrapper_files",
+    srcs = glob(["windows/msvc_*"]),
+)
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc7-nvcc-cuda10.0/CROSSTOOL b/third_party/toolchains/preconfig/ubuntu14.04/gcc7-nvcc-cuda10.0/CROSSTOOL
new file mode 100755
index 0000000000000000000000000000000000000000..059842f58cf748e972635db1a82d6ed4ef580f6c
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc7-nvcc-cuda10.0/CROSSTOOL
@@ -0,0 +1,1431 @@
+major_version: "local"
+minor_version: ""
+default_target_cpu: "same_as_host"
+
+toolchain {
+  abi_version: "local"
+  abi_libc_version: "local"
+  compiler: "compiler"
+  host_system_name: "local"
+  needsPic: true
+  target_libc: "local"
+  target_cpu: "local"
+  target_system_name: "local"
+  toolchain_identifier: "local_linux"
+
+  feature {
+    name: "c++11"
+    flag_set {
+      action: "c++-compile"
+      flag_group {
+        flag: "-std=c++11"
+      }
+    }
+  }
+
+  feature {
+    name: "stdlib"
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-lstdc++"
+      }
+    }
+  }
+
+  feature {
+    name: "determinism"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # Make C++ compilation deterministic. Use linkstamping instead of these
+        # compiler symbols.
+        flag: "-Wno-builtin-macro-redefined"
+        flag: "-D__DATE__=\"redacted\""
+        flag: "-D__TIMESTAMP__=\"redacted\""
+        flag: "-D__TIME__=\"redacted\""
+      }
+    }
+  }
+
+  feature {
+    name: "alwayslink"
+    flag_set {
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-Wl,-no-as-needed"
+      }
+    }
+  }
+
+  # This feature will be enabled for builds that support pic by bazel.
+  feature {
+    name: "pic"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        expand_if_all_available: "pic"
+        flag: "-fPIC"
+      }
+      flag_group {
+        expand_if_none_available: "pic"
+        flag: "-fPIE"
+      }
+    }
+  }
+
+  # Security hardening on by default.
+  feature {
+    name: "hardening"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # Conservative choice; -D_FORTIFY_SOURCE=2 may be unsafe in some cases.
+        # We need to undef it before redefining it as some distributions now
+        # have it enabled by default.
+        flag: "-U_FORTIFY_SOURCE"
+        flag: "-D_FORTIFY_SOURCE=1"
+        flag: "-fstack-protector"
+      }
+    }
+    flag_set {
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-Wl,-z,relro,-z,now"
+      }
+    }
+    flag_set {
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-pie"
+        flag: "-Wl,-z,relro,-z,now"
+      }
+    }
+  }
+
+  feature {
+    name: "warnings"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # All warnings are enabled. Maybe enable -Werror as well?
+        flag: "-Wall"
+        
+      }
+    }
+  }
+
+  # Keep stack frames for debugging, even in opt mode.
+  feature {
+    name: "frame-pointer"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-fno-omit-frame-pointer"
+      }
+    }
+  }
+
+  feature {
+    name: "build-id"
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        # Stamp the binary with a unique identifier.
+        flag: "-Wl,--build-id=md5"
+        flag: "-Wl,--hash-style=gnu"
+      }
+    }
+  }
+
+  feature {
+    name: "no-canonical-prefixes"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-no-canonical-prefixes"
+        flag: "-fno-canonical-system-headers"
+      }
+    }
+  }
+
+  feature {
+    name: "disable-assertions"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-DNDEBUG"
+      }
+    }
+  }
+
+  feature {
+    name: "linker-bin-path"
+
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-B/usr/bin"
+      }
+    }
+  }
+
+  feature {
+    name: "common"
+    implies: "stdlib"
+    implies: "c++11"
+    implies: "determinism"
+    implies: "alwayslink"
+    implies: "hardening"
+    implies: "warnings"
+    implies: "frame-pointer"
+    implies: "build-id"
+    implies: "no-canonical-prefixes"
+    implies: "linker-bin-path"
+  }
+
+  feature {
+    name: "opt"
+    implies: "common"
+    implies: "disable-assertions"
+
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # No debug symbols.
+        # Maybe we should enable https://gcc.gnu.org/wiki/DebugFission for opt
+        # or even generally? However, that can't happen here, as it requires
+        # special handling in Bazel.
+        flag: "-g0"
+
+        # Conservative choice for -O
+        # -O3 can increase binary size and even slow down the resulting binaries.
+        # Profile first and / or use FDO if you need better performance than this.
+        flag: "-O2"
+
+        # Removal of unused code and data at link time (can this increase binary size in some cases?).
+        flag: "-ffunction-sections"
+        flag: "-fdata-sections"
+      }
+    }
+    flag_set {
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-Wl,--gc-sections"
+      }
+    }
+  }
+
+  feature {
+    name: "fastbuild"
+    implies: "common"
+  }
+
+  feature {
+    name: "dbg"
+    implies: "common"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-g"
+      }
+    }
+  }
+
+  # Set clang as a C/C++ compiler.
+  tool_path { name: "gcc" path: "clang/bin/crosstool_wrapper_driver_is_not_gcc" }
+
+  # Use the default system toolchain for everything else.
+  tool_path { name: "ar" path: "/usr/bin/ar" }
+  tool_path { name: "compat-ld" path: "/usr/bin/ld" }
+  tool_path { name: "cpp" path: "/usr/bin/cpp" }
+  tool_path { name: "dwp" path: "/usr/bin/dwp" }
+  tool_path { name: "gcov" path: "/usr/bin/gcov" }
+  tool_path { name: "ld" path: "/usr/bin/ld" }
+  tool_path { name: "nm" path: "/usr/bin/nm" }
+  tool_path { name: "objcopy" path: "/usr/bin/objcopy" }
+  tool_path { name: "objdump" path: "/usr/bin/objdump" }
+  tool_path { name: "strip" path: "/usr/bin/strip" }
+
+  # Enabled dynamic linking.
+  linking_mode_flags { mode: DYNAMIC }
+
+  cxx_builtin_include_directory: "/usr/include/c++/7"
+  cxx_builtin_include_directory: "/usr/include/x86_64-linux-gnu/c++/7"
+  cxx_builtin_include_directory: "/usr/include/c++/7/backward"
+  cxx_builtin_include_directory: "/usr/lib/gcc/x86_64-linux-gnu/7/include"
+  cxx_builtin_include_directory: "/usr/local/include"
+  cxx_builtin_include_directory: "/usr/lib/gcc/x86_64-linux-gnu/7/include-fixed"
+  cxx_builtin_include_directory: "/usr/include/x86_64-linux-gnu"
+  cxx_builtin_include_directory: "/usr/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-10.0/targets/x86_64-linux/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-10.0/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-10.0/extras/CUPTI/include"
+  cxx_builtin_include_directory: "/usr/include"
+}
+
+toolchain {
+  abi_version: "local"
+  abi_libc_version: "local"
+  compiler: "compiler"
+  host_system_name: "local"
+  needsPic: true
+  target_libc: "macosx"
+  target_cpu: "darwin"
+  target_system_name: "local"
+  toolchain_identifier: "local_darwin"
+  feature {
+    name: "c++11"
+    flag_set {
+      action: "c++-compile"
+      flag_group {
+        flag: "-std=c++11"
+      }
+    }
+  }
+
+  feature {
+    name: "stdlib"
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-lc++"
+      }
+    }
+  }
+
+  feature {
+    name: "determinism"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # Make C++ compilation deterministic. Use linkstamping instead of these
+        # compiler symbols.
+        flag: "-Wno-builtin-macro-redefined"
+        flag: "-D__DATE__=\"redacted\""
+        flag: "-D__TIMESTAMP__=\"redacted\""
+        flag: "-D__TIME__=\"redacted\""
+      }
+    }
+  }
+
+  # This feature will be enabled for builds that support pic by bazel.
+  feature {
+    name: "pic"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        expand_if_all_available: "pic"
+        flag: "-fPIC"
+      }
+      flag_group {
+        expand_if_none_available: "pic"
+        flag: "-fPIE"
+      }
+    }
+  }
+
+  # Security hardening on by default.
+  feature {
+    name: "hardening"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # Conservative choice; -D_FORTIFY_SOURCE=2 may be unsafe in some cases.
+        # We need to undef it before redefining it as some distributions now
+        # have it enabled by default.
+        flag: "-U_FORTIFY_SOURCE"
+        flag: "-D_FORTIFY_SOURCE=1"
+        flag: "-fstack-protector"
+      }
+    }
+    flag_set {
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-pie"
+      }
+    }
+  }
+
+  feature {
+    name: "warnings"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # All warnings are enabled. Maybe enable -Werror as well?
+        flag: "-Wall"
+        
+      }
+    }
+  }
+
+  # Keep stack frames for debugging, even in opt mode.
+  feature {
+    name: "frame-pointer"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-fno-omit-frame-pointer"
+      }
+    }
+  }
+
+  feature {
+    name: "no-canonical-prefixes"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag:"-no-canonical-prefixes"
+      }
+    }
+  }
+
+  feature {
+    name: "disable-assertions"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-DNDEBUG"
+      }
+    }
+  }
+
+  feature {
+    name: "linker-bin-path"
+
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-B/usr/bin"
+      }
+    }
+  }
+
+  feature {
+    name: "undefined-dynamic"
+    flag_set {
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-undefined"
+        flag: "dynamic_lookup"
+      }
+    }
+  }
+
+  feature {
+    name: "common"
+    implies: "stdlib"
+    implies: "c++11"
+    implies: "determinism"
+    implies: "hardening"
+    implies: "warnings"
+    implies: "frame-pointer"
+    implies: "no-canonical-prefixes"
+    implies: "linker-bin-path"
+    implies: "undefined-dynamic"
+  }
+
+  feature {
+    name: "opt"
+    implies: "common"
+    implies: "disable-assertions"
+
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # No debug symbols.
+        # Maybe we should enable https://gcc.gnu.org/wiki/DebugFission for opt
+        # or even generally? However, that can't happen here, as it requires
+        # special handling in Bazel.
+        flag: "-g0"
+
+        # Conservative choice for -O
+        # -O3 can increase binary size and even slow down the resulting binaries.
+        # Profile first and / or use FDO if you need better performance than this.
+        flag: "-O2"
+
+        # Removal of unused code and data at link time (can this increase binary size in some cases?).
+        flag: "-ffunction-sections"
+        flag: "-fdata-sections"
+      }
+    }
+  }
+
+  feature {
+    name: "fastbuild"
+    implies: "common"
+  }
+
+  feature {
+    name: "dbg"
+    implies: "common"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-g"
+      }
+    }
+  }
+
+  # Set clang as a C/C++ compiler.
+  tool_path { name: "gcc" path: "clang/bin/crosstool_wrapper_driver_is_not_gcc" }
+
+  # Use the default system toolchain for everything else.
+  tool_path { name: "ar" path: "/usr/bin/libtool" }
+  tool_path { name: "compat-ld" path: "/usr/bin/ld" }
+  tool_path { name: "cpp" path: "/usr/bin/cpp" }
+  tool_path { name: "dwp" path: "/usr/bin/dwp" }
+  tool_path { name: "gcov" path: "/usr/bin/gcov" }
+  tool_path { name: "ld" path: "/usr/bin/ld" }
+  tool_path { name: "nm" path: "/usr/bin/nm" }
+  tool_path { name: "objcopy" path: "/usr/bin/objcopy" }
+  tool_path { name: "objdump" path: "/usr/bin/objdump" }
+  tool_path { name: "strip" path: "/usr/bin/strip" }
+
+  # Enabled dynamic linking.
+  linking_mode_flags { mode: DYNAMIC }
+
+  cxx_builtin_include_directory: "/usr/include/c++/7"
+  cxx_builtin_include_directory: "/usr/include/x86_64-linux-gnu/c++/7"
+  cxx_builtin_include_directory: "/usr/include/c++/7/backward"
+  cxx_builtin_include_directory: "/usr/lib/gcc/x86_64-linux-gnu/7/include"
+  cxx_builtin_include_directory: "/usr/local/include"
+  cxx_builtin_include_directory: "/usr/lib/gcc/x86_64-linux-gnu/7/include-fixed"
+  cxx_builtin_include_directory: "/usr/include/x86_64-linux-gnu"
+  cxx_builtin_include_directory: "/usr/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-10.0/targets/x86_64-linux/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-10.0/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-10.0/extras/CUPTI/include"
+  cxx_builtin_include_directory: "/usr/include"
+}
+
+toolchain {
+  toolchain_identifier: "local_windows"
+  host_system_name: "local"
+  target_system_name: "local"
+
+  abi_version: "local"
+  abi_libc_version: "local"
+  target_cpu: "x64_windows"
+  compiler: "msvc-cl"
+  target_libc: "msvcrt"
+
+
+
+  tool_path {
+    name: "ar"
+    path: ""
+  }
+  tool_path {
+    name: "ml"
+    path: ""
+  }
+  tool_path {
+    name: "cpp"
+    path: ""
+  }
+  tool_path {
+    name: "gcc"
+    path: ""
+  }
+  tool_path {
+    name: "gcov"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "ld"
+    path: ""
+  }
+  tool_path {
+    name: "nm"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "objcopy"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "objdump"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "strip"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  supports_interface_shared_objects: true
+
+  # TODO(pcloudy): Review those flags below, they should be defined by cl.exe
+  compiler_flag: "/DCOMPILER_MSVC"
+
+  # Don't define min/max macros in windows.h.
+  compiler_flag: "/DNOMINMAX"
+
+  # Platform defines.
+  compiler_flag: "/D_WIN32_WINNT=0x0600"
+  # Turn off warning messages.
+  compiler_flag: "/D_CRT_SECURE_NO_DEPRECATE"
+  compiler_flag: "/D_CRT_SECURE_NO_WARNINGS"
+  compiler_flag: "/D_SILENCE_STDEXT_HASH_DEPRECATION_WARNINGS"
+
+  # Useful options to have on for compilation.
+  # Increase the capacity of object files to 2^32 sections.
+  compiler_flag: "/bigobj"
+  # Allocate 500MB for precomputed headers.
+  compiler_flag: "/Zm500"
+  # Use unsigned char by default.
+  compiler_flag: "/J"
+  # Use function level linking.
+  compiler_flag: "/Gy"
+  # Use string pooling.
+  compiler_flag: "/GF"
+  # Catch C++ exceptions only and tell the compiler to assume that functions declared
+  # as extern "C" never throw a C++ exception.
+  compiler_flag: "/EHsc"
+
+  # Globally disabled warnings.
+  # Don't warn about elements of array being be default initialized.
+  compiler_flag: "/wd4351"
+  # Don't warn about no matching delete found.
+  compiler_flag: "/wd4291"
+  # Don't warn about diamond inheritance patterns.
+  compiler_flag: "/wd4250"
+  # Don't warn about insecure functions (e.g. non _s functions).
+  compiler_flag: "/wd4996"
+
+  linker_flag: "/MACHINE:X64"
+
+  feature {
+    name: "no_legacy_features"
+  }
+
+  # TODO(klimek): Previously we were using a .bat file to start python to run
+  # the python script that can redirect to nvcc - unfortunately .bat files
+  # have a rather short maximum length for command lines (8k). Instead, we
+  # now use the python binary as the compiler and pass the python script to
+  # it at the start of the command line. Investigate different possibilities
+  # to run the nvcc wrapper, either using pyinstaller --onefile, or writing
+  # a small C++ wrapper to redirect.
+  feature {
+    name: "redirector"
+    enabled: true
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-module-compile"
+      action: "c++-module-codegen"
+      action: "c++-header-parsing"
+      action: "assemble"
+      action: "preprocess-assemble"
+      flag_group {
+        flag: "-B"
+        flag: "external/local_config_cuda/crosstool/windows/msvc_wrapper_for_nvcc.py"
+      }
+    }
+  }
+
+  # Suppress startup banner.
+  feature {
+    name: "nologo"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-module-compile"
+      action: "c++-module-codegen"
+      action: "c++-header-parsing"
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-static-library"
+      flag_group {
+        flag: "/nologo"
+      }
+    }
+  }
+
+  feature {
+    name: 'has_configured_linker_path'
+  }
+
+  # This feature indicates strip is not supported, building stripped binary will just result a copy of orignial binary
+  feature {
+    name: 'no_stripping'
+  }
+
+  # This feature indicates this is a toolchain targeting Windows.
+  feature {
+    name: 'targets_windows'
+    implies: 'copy_dynamic_libraries_to_binary'
+    enabled: true
+  }
+
+  feature {
+    name: 'copy_dynamic_libraries_to_binary'
+  }
+
+  action_config {
+    config_name: 'assemble'
+    action_name: 'assemble'
+    tool {
+      tool_path: ''
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'sysroot'
+  }
+
+  action_config {
+    config_name: 'preprocess-assemble'
+    action_name: 'preprocess-assemble'
+    tool {
+      tool_path: ''
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'sysroot'
+  }
+
+  action_config {
+    config_name: 'c-compile'
+    action_name: 'c-compile'
+    tool {
+      tool_path: ''
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'legacy_compile_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'parse_showincludes'
+    implies: 'user_compile_flags'
+    implies: 'sysroot'
+    implies: 'unfiltered_compile_flags'
+  }
+
+  action_config {
+    config_name: 'c++-compile'
+    action_name: 'c++-compile'
+    tool {
+      tool_path: ''
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'legacy_compile_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'parse_showincludes'
+    implies: 'user_compile_flags'
+    implies: 'sysroot'
+    implies: 'unfiltered_compile_flags'
+  }
+
+  action_config {
+    config_name: 'c++-link-executable'
+    action_name: 'c++-link-executable'
+    tool {
+      tool_path: ''
+    }
+    implies: 'nologo'
+    implies: 'linkstamps'
+    implies: 'output_execpath_flags'
+    implies: 'input_param_flags'
+    implies: 'user_link_flags'
+    implies: 'legacy_link_flags'
+    implies: 'linker_subsystem_flag'
+    implies: 'linker_param_file'
+    implies: 'msvc_env'
+    implies: 'no_stripping'
+  }
+
+  action_config {
+    config_name: 'c++-link-dynamic-library'
+    action_name: 'c++-link-dynamic-library'
+    tool {
+      tool_path: ''
+    }
+    implies: 'nologo'
+    implies: 'shared_flag'
+    implies: 'linkstamps'
+    implies: 'output_execpath_flags'
+    implies: 'input_param_flags'
+    implies: 'user_link_flags'
+    implies: 'legacy_link_flags'
+    implies: 'linker_subsystem_flag'
+    implies: 'linker_param_file'
+    implies: 'msvc_env'
+    implies: 'no_stripping'
+    implies: 'has_configured_linker_path'
+    implies: 'def_file'
+  }
+
+  action_config {
+      config_name: 'c++-link-nodeps-dynamic-library'
+      action_name: 'c++-link-nodeps-dynamic-library'
+      tool {
+        tool_path: ''
+      }
+      implies: 'nologo'
+      implies: 'shared_flag'
+      implies: 'linkstamps'
+      implies: 'output_execpath_flags'
+      implies: 'input_param_flags'
+      implies: 'user_link_flags'
+      implies: 'legacy_link_flags'
+      implies: 'linker_subsystem_flag'
+      implies: 'linker_param_file'
+      implies: 'msvc_env'
+      implies: 'no_stripping'
+      implies: 'has_configured_linker_path'
+      implies: 'def_file'
+    }
+
+  action_config {
+    config_name: 'c++-link-static-library'
+    action_name: 'c++-link-static-library'
+    tool {
+      tool_path: ''
+    }
+    implies: 'nologo'
+    implies: 'archiver_flags'
+    implies: 'input_param_flags'
+    implies: 'linker_param_file'
+    implies: 'msvc_env'
+  }
+
+  # TODO(b/65151735): Remove legacy_compile_flags feature when legacy fields are
+  # not used in this crosstool
+  feature {
+    name: 'legacy_compile_flags'
+    flag_set {
+      expand_if_all_available: 'legacy_compile_flags'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        iterate_over: 'legacy_compile_flags'
+        flag: '%{legacy_compile_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: "msvc_env"
+    env_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-module-compile"
+      action: "c++-module-codegen"
+      action: "c++-header-parsing"
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-static-library"
+      env_entry {
+        key: "PATH"
+        value: ""
+      }
+      env_entry {
+        key: "INCLUDE"
+        value: ""
+      }
+      env_entry {
+        key: "LIB"
+        value: ""
+      }
+      env_entry {
+        key: "TMP"
+        value: ""
+      }
+      env_entry {
+        key: "TEMP"
+        value: ""
+      }
+    }
+  }
+
+  feature {
+    name: 'include_paths'
+    flag_set {
+      action: "assemble"
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      flag_group {
+        iterate_over: 'quote_include_paths'
+        flag: '/I%{quote_include_paths}'
+      }
+      flag_group {
+        iterate_over: 'include_paths'
+        flag: '/I%{include_paths}'
+      }
+      flag_group {
+        iterate_over: 'system_include_paths'
+        flag: '/I%{system_include_paths}'
+      }
+    }
+  }
+
+  feature {
+    name: "preprocessor_defines"
+    flag_set {
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-header-parsing"
+      action: "c++-module-compile"
+      flag_group {
+        flag: "/D%{preprocessor_defines}"
+        iterate_over: "preprocessor_defines"
+      }
+    }
+  }
+
+  # Tell Bazel to parse the output of /showIncludes
+  feature {
+    name: 'parse_showincludes'
+    flag_set {
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-module-compile'
+      action: 'c++-header-parsing'
+      flag_group {
+        flag: "/showIncludes"
+      }
+    }
+  }
+
+
+  feature {
+    name: 'generate_pdb_file'
+    requires: {
+      feature: 'dbg'
+    }
+    requires: {
+      feature: 'fastbuild'
+    }
+  }
+
+  feature {
+    name: 'shared_flag'
+    flag_set {
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: '/DLL'
+      }
+    }
+  }
+
+  feature {
+    name: 'linkstamps'
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      expand_if_all_available: 'linkstamp_paths'
+      flag_group {
+        iterate_over: 'linkstamp_paths'
+        flag: '%{linkstamp_paths}'
+      }
+    }
+  }
+
+  feature {
+    name: 'output_execpath_flags'
+    flag_set {
+      expand_if_all_available: 'output_execpath'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: '/OUT:%{output_execpath}'
+      }
+    }
+  }
+
+  feature {
+    name: 'archiver_flags'
+    flag_set {
+      expand_if_all_available: 'output_execpath'
+      action: 'c++-link-static-library'
+      flag_group {
+        flag: '/OUT:%{output_execpath}'
+      }
+    }
+  }
+
+  feature {
+    name: 'input_param_flags'
+    flag_set {
+      expand_if_all_available: 'interface_library_output_path'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/IMPLIB:%{interface_library_output_path}"
+      }
+    }
+    flag_set {
+      expand_if_all_available: 'libopts'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'libopts'
+        flag: '%{libopts}'
+      }
+    }
+    flag_set {
+      expand_if_all_available: 'libraries_to_link'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      action: 'c++-link-static-library'
+      flag_group {
+        iterate_over: 'libraries_to_link'
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'object_file_group'
+          }
+          iterate_over: 'libraries_to_link.object_files'
+          flag_group {
+            flag: '%{libraries_to_link.object_files}'
+          }
+        }
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'object_file'
+          }
+          flag_group {
+            flag: '%{libraries_to_link.name}'
+          }
+        }
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'interface_library'
+          }
+          flag_group {
+            flag: '%{libraries_to_link.name}'
+          }
+        }
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'static_library'
+          }
+          flag_group {
+            expand_if_false: 'libraries_to_link.is_whole_archive'
+            flag: '%{libraries_to_link.name}'
+          }
+          flag_group {
+            expand_if_true: 'libraries_to_link.is_whole_archive'
+            flag: '/WHOLEARCHIVE:%{libraries_to_link.name}'
+          }
+        }
+      }
+    }
+  }
+
+  # Since this feature is declared earlier in the CROSSTOOL than
+  # "user_link_flags", this feature will be applied prior to it anwyhere they
+  # are both implied. And since "user_link_flags" contains the linkopts from
+  # the build rule, this allows the user to override the /SUBSYSTEM in the BUILD
+  # file.
+  feature {
+    name: 'linker_subsystem_flag'
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: '/SUBSYSTEM:CONSOLE'
+      }
+    }
+  }
+
+  # The "user_link_flags" contains user-defined linkopts (from build rules)
+  # so it should be defined after features that declare user-overridable flags.
+  # For example the "linker_subsystem_flag" defines a default "/SUBSYSTEM" flag
+  # but we want to let the user override it, therefore "link_flag_subsystem" is
+  # defined earlier in the CROSSTOOL file than "user_link_flags".
+  feature {
+    name: 'user_link_flags'
+    flag_set {
+      expand_if_all_available: 'user_link_flags'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'user_link_flags'
+        flag: '%{user_link_flags}'
+      }
+    }
+  }
+  feature {
+    name: 'legacy_link_flags'
+    flag_set {
+      expand_if_all_available: 'legacy_link_flags'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'legacy_link_flags'
+        flag: '%{legacy_link_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: 'linker_param_file'
+    flag_set {
+      expand_if_all_available: 'linker_param_file'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      action: 'c++-link-static-library'
+      flag_group {
+        flag: '@%{linker_param_file}'
+      }
+    }
+  }
+
+  feature {
+    name: 'static_link_msvcrt'
+  }
+
+  feature {
+    name: 'static_link_msvcrt_no_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MT"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:libcmt.lib"
+      }
+    }
+    requires: { feature: 'fastbuild'}
+    requires: { feature: 'opt'}
+  }
+
+  feature {
+    name: 'dynamic_link_msvcrt_no_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MD"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:msvcrt.lib"
+      }
+    }
+    requires: { feature: 'fastbuild'}
+    requires: { feature: 'opt'}
+  }
+
+  feature {
+    name: 'static_link_msvcrt_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MTd"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:libcmtd.lib"
+      }
+    }
+    requires: { feature: 'dbg'}
+  }
+
+  feature {
+    name: 'dynamic_link_msvcrt_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MDd"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:msvcrtd.lib"
+      }
+    }
+    requires: { feature: 'dbg'}
+  }
+
+  feature {
+    name: 'dbg'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/Od"
+        flag: "/Z7"
+        flag: "/DDEBUG"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEBUG:FULL"
+        flag: "/INCREMENTAL:NO"
+      }
+    }
+    implies: 'generate_pdb_file'
+  }
+
+  feature {
+    name: 'fastbuild'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/Od"
+        flag: "/Z7"
+        flag: "/DDEBUG"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEBUG:FASTLINK"
+        flag: "/INCREMENTAL:NO"
+      }
+    }
+    implies: 'generate_pdb_file'
+  }
+
+  feature {
+    name: 'opt'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/O2"
+        flag: "/DNDEBUG"
+      }
+    }
+  }
+
+  feature {
+    name: 'user_compile_flags'
+    flag_set {
+      expand_if_all_available: 'user_compile_flags'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        iterate_over: 'user_compile_flags'
+        flag: '%{user_compile_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: 'sysroot'
+    flag_set {
+      expand_if_all_available: 'sysroot'
+      action: 'assemble'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'sysroot'
+        flag: '--sysroot=%{sysroot}'
+      }
+    }
+  }
+
+  feature {
+    name: 'unfiltered_compile_flags'
+    flag_set {
+      expand_if_all_available: 'unfiltered_compile_flags'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        iterate_over: 'unfiltered_compile_flags'
+        flag: '%{unfiltered_compile_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: 'compiler_output_flags'
+    flag_set {
+      action: 'assemble'
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_none_available: 'output_assembly_file'
+        expand_if_none_available: 'output_preprocess_file'
+        flag: '/Fo%{output_file}'
+        flag: '/Zi'
+      }
+    }
+    flag_set {
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_none_available: 'output_assembly_file'
+        expand_if_none_available: 'output_preprocess_file'
+        flag: '/Fo%{output_file}'
+      }
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_all_available: 'output_assembly_file'
+        flag: '/Fa%{output_file}'
+      }
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_all_available: 'output_preprocess_file'
+        flag: '/P'
+        flag: '/Fi%{output_file}'
+      }
+    }
+  }
+
+  feature {
+    name: 'compiler_input_flags'
+    flag_set {
+      action: 'assemble'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        expand_if_all_available: 'source_file'
+        flag: '/c'
+        flag: '%{source_file}'
+      }
+    }
+  }
+
+  feature {
+    name : 'def_file',
+    flag_set {
+      expand_if_all_available: 'def_file_path'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEF:%{def_file_path}"
+        # We can specify a different DLL name in DEF file, /ignore:4070 suppresses
+        # the warning message about DLL name doesn't match the default one.
+        # See https://msdn.microsoft.com/en-us/library/sfkk2fz7.aspx
+        flag: "/ignore:4070"
+      }
+    }
+  }
+
+  feature {
+    name: 'windows_export_all_symbols'
+  }
+
+  feature {
+    name: 'no_windows_export_all_symbols'
+  }
+
+  linking_mode_flags { mode: DYNAMIC }
+}
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc7-nvcc-cuda10.0/clang/bin/crosstool_wrapper_driver_is_not_gcc b/third_party/toolchains/preconfig/ubuntu14.04/gcc7-nvcc-cuda10.0/clang/bin/crosstool_wrapper_driver_is_not_gcc
new file mode 100755
index 0000000000000000000000000000000000000000..07742839ca5eabdeb7acd902aefa8ece4201347b
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc7-nvcc-cuda10.0/clang/bin/crosstool_wrapper_driver_is_not_gcc
@@ -0,0 +1,264 @@
+#!/usr/bin/env python
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Crosstool wrapper for compiling CUDA programs.
+
+SYNOPSIS:
+  crosstool_wrapper_is_not_gcc [options passed in by cc_library()
+                                or cc_binary() rule]
+
+DESCRIPTION:
+  This script is expected to be called by the cc_library() or cc_binary() bazel
+  rules. When the option "-x cuda" is present in the list of arguments passed
+  to this script, it invokes the nvcc CUDA compiler. Most arguments are passed
+  as is as a string to --compiler-options of nvcc. When "-x cuda" is not
+  present, this wrapper invokes hybrid_driver_is_not_gcc with the input
+  arguments as is.
+
+NOTES:
+  Changes to the contents of this file must be propagated from
+  //third_party/gpus/crosstool/crosstool_wrapper_is_not_gcc to
+  //third_party/gpus/crosstool/v*/*/clang/bin/crosstool_wrapper_is_not_gcc
+"""
+
+from __future__ import print_function
+
+__author__ = 'keveman@google.com (Manjunath Kudlur)'
+
+from argparse import ArgumentParser
+import os
+import subprocess
+import re
+import sys
+import pipes
+
+# Template values set by cuda_autoconf.
+CPU_COMPILER = ('/usr/bin/gcc-7')
+GCC_HOST_COMPILER_PATH = ('/usr/bin/gcc-7')
+
+NVCC_PATH = '/usr/local/cuda-10.0/bin/nvcc'
+PREFIX_DIR = os.path.dirname(GCC_HOST_COMPILER_PATH)
+NVCC_VERSION = '10.0'
+
+def Log(s):
+  print('gpus/crosstool: {0}'.format(s))
+
+
+def GetOptionValue(argv, option):
+  """Extract the list of values for option from the argv list.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+    option: The option whose value to extract, without the leading '-'.
+
+  Returns:
+    A list of values, either directly following the option,
+    (eg., -opt val1 val2) or values collected from multiple occurrences of
+    the option (eg., -opt val1 -opt val2).
+  """
+
+  parser = ArgumentParser()
+  parser.add_argument('-' + option, nargs='*', action='append')
+  args, _ = parser.parse_known_args(argv)
+  if not args or not vars(args)[option]:
+    return []
+  else:
+    return sum(vars(args)[option], [])
+
+
+def GetHostCompilerOptions(argv):
+  """Collect the -isystem, -iquote, and --sysroot option values from argv.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+
+  Returns:
+    The string that can be used as the --compiler-options to nvcc.
+  """
+
+  parser = ArgumentParser()
+  parser.add_argument('-isystem', nargs='*', action='append')
+  parser.add_argument('-iquote', nargs='*', action='append')
+  parser.add_argument('--sysroot', nargs=1)
+  parser.add_argument('-g', nargs='*', action='append')
+  parser.add_argument('-fno-canonical-system-headers', action='store_true')
+
+  args, _ = parser.parse_known_args(argv)
+
+  opts = ''
+
+  if args.isystem:
+    opts += ' -isystem ' + ' -isystem '.join(sum(args.isystem, []))
+  if args.iquote:
+    opts += ' -iquote ' + ' -iquote '.join(sum(args.iquote, []))
+  if args.g:
+    opts += ' -g' + ' -g'.join(sum(args.g, []))
+  if args.fno_canonical_system_headers:
+    opts += ' -fno-canonical-system-headers'
+  if args.sysroot:
+    opts += ' --sysroot ' + args.sysroot[0]
+
+  return opts
+
+def _update_options(nvcc_options):
+  if NVCC_VERSION in ("7.0",):
+    return nvcc_options
+
+  update_options = { "relaxed-constexpr" : "expt-relaxed-constexpr" }
+  return [ update_options[opt] if opt in update_options else opt
+                    for opt in nvcc_options ]
+
+def GetNvccOptions(argv):
+  """Collect the -nvcc_options values from argv.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+
+  Returns:
+    The string that can be passed directly to nvcc.
+  """
+
+  parser = ArgumentParser()
+  parser.add_argument('-nvcc_options', nargs='*', action='append')
+
+  args, _ = parser.parse_known_args(argv)
+
+  if args.nvcc_options:
+    options = _update_options(sum(args.nvcc_options, []))
+    return ' '.join(['--'+a for a in options])
+  return ''
+
+
+def InvokeNvcc(argv, log=False):
+  """Call nvcc with arguments assembled from argv.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+    log: True if logging is requested.
+
+  Returns:
+    The return value of calling os.system('nvcc ' + args)
+  """
+
+  host_compiler_options = GetHostCompilerOptions(argv)
+  nvcc_compiler_options = GetNvccOptions(argv)
+  opt_option = GetOptionValue(argv, 'O')
+  m_options = GetOptionValue(argv, 'm')
+  m_options = ''.join([' -m' + m for m in m_options if m in ['32', '64']])
+  include_options = GetOptionValue(argv, 'I')
+  out_file = GetOptionValue(argv, 'o')
+  depfiles = GetOptionValue(argv, 'MF')
+  defines = GetOptionValue(argv, 'D')
+  defines = ''.join([' -D' + define for define in defines])
+  undefines = GetOptionValue(argv, 'U')
+  undefines = ''.join([' -U' + define for define in undefines])
+  std_options = GetOptionValue(argv, 'std')
+  # currently only c++11 is supported by Cuda 7.0 std argument
+  nvcc_allowed_std_options = ["c++11"]
+  std_options = ''.join([' -std=' + define
+      for define in std_options if define in nvcc_allowed_std_options])
+
+  # The list of source files get passed after the -c option. I don't know of
+  # any other reliable way to just get the list of source files to be compiled.
+  src_files = GetOptionValue(argv, 'c')
+
+  # Pass -w through from host to nvcc, but don't do anything fancier with
+  # warnings-related flags, since they're not necessarily the same across
+  # compilers.
+  warning_options = ' -w' if '-w' in argv else ''
+
+  if len(src_files) == 0:
+    return 1
+  if len(out_file) != 1:
+    return 1
+
+  opt = (' -O2' if (len(opt_option) > 0 and int(opt_option[0]) > 0)
+         else ' -g -G')
+
+  includes = (' -I ' + ' -I '.join(include_options)
+              if len(include_options) > 0
+              else '')
+
+  # Unfortunately, there are other options that have -c prefix too.
+  # So allowing only those look like C/C++ files.
+  src_files = [f for f in src_files if
+               re.search('\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)]
+  srcs = ' '.join(src_files)
+  out = ' -o ' + out_file[0]
+
+  supported_cuda_compute_capabilities = [ "3.0" ]
+  nvccopts = '-D_FORCE_INLINES '
+  for capability in supported_cuda_compute_capabilities:
+    capability = capability.replace('.', '')
+    nvccopts += r'-gencode=arch=compute_%s,\"code=sm_%s,compute_%s\" ' % (
+        capability, capability, capability)
+  nvccopts += ' ' + nvcc_compiler_options
+  nvccopts += undefines
+  nvccopts += defines
+  nvccopts += std_options
+  nvccopts += m_options
+  nvccopts += warning_options
+
+  if depfiles:
+    # Generate the dependency file
+    depfile = depfiles[0]
+    cmd = (NVCC_PATH + ' ' + nvccopts +
+           ' --compiler-options "' + host_compiler_options + '"' +
+           ' --compiler-bindir=' + GCC_HOST_COMPILER_PATH +
+           ' -I .' +
+           ' -x cu ' + opt + includes + ' ' + srcs + ' -M -o ' + depfile)
+    if log: Log(cmd)
+    exit_status = os.system(cmd)
+    if exit_status != 0:
+      return exit_status
+
+  cmd = (NVCC_PATH + ' ' + nvccopts +
+         ' --compiler-options "' + host_compiler_options + ' -fPIC"' +
+         ' --compiler-bindir=' + GCC_HOST_COMPILER_PATH +
+         ' -I .' +
+         ' -x cu ' + opt + includes + ' -c ' + srcs + out)
+
+  # TODO(zhengxq): for some reason, 'gcc' needs this help to find 'as'.
+  # Need to investigate and fix.
+  cmd = 'PATH=' + PREFIX_DIR + ':$PATH ' + cmd
+  if log: Log(cmd)
+  return os.system(cmd)
+
+
+def main():
+  parser = ArgumentParser()
+  parser.add_argument('-x', nargs=1)
+  parser.add_argument('--cuda_log', action='store_true')
+  args, leftover = parser.parse_known_args(sys.argv[1:])
+
+  if args.x and args.x[0] == 'cuda':
+    if args.cuda_log: Log('-x cuda')
+    leftover = [pipes.quote(s) for s in leftover]
+    if args.cuda_log: Log('using nvcc')
+    return InvokeNvcc(leftover, log=args.cuda_log)
+
+  # Strip our flags before passing through to the CPU compiler for files which
+  # are not -x cuda. We can't just pass 'leftover' because it also strips -x.
+  # We not only want to pass -x to the CPU compiler, but also keep it in its
+  # relative location in the argv list (the compiler is actually sensitive to
+  # this).
+  cpu_compiler_flags = [flag for flag in sys.argv[1:]
+                             if not flag.startswith(('--cuda_log'))]
+
+  return subprocess.call([CPU_COMPILER] + cpu_compiler_flags)
+
+if __name__ == '__main__':
+  sys.exit(main())
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc7-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py b/third_party/toolchains/preconfig/ubuntu14.04/gcc7-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py
new file mode 100755
index 0000000000000000000000000000000000000000..cb1385dccdc7a9b0d40533c273825250edd8a13b
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc7-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py
@@ -0,0 +1,192 @@
+#!/usr/bin/env python
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Crosstool wrapper for compiling CUDA programs with nvcc on Windows.
+
+DESCRIPTION:
+  This script is the Windows version of //third_party/gpus/crosstool/crosstool_wrapper_is_not_gcc
+"""
+
+from __future__ import print_function
+
+from argparse import ArgumentParser
+import os
+import subprocess
+import re
+import sys
+import pipes
+
+# Template values set by cuda_autoconf.
+CPU_COMPILER = ('/usr/bin/gcc-7')
+GCC_HOST_COMPILER_PATH = ('/usr/bin/gcc-7')
+
+NVCC_PATH = '/usr/local/cuda-10.0/bin/nvcc'
+NVCC_VERSION = '10.0'
+NVCC_TEMP_DIR = "C:\\Windows\\Temp\\nvcc_inter_files_tmp_dir"
+supported_cuda_compute_capabilities = [ "3.0" ]
+
+def Log(s):
+  print('gpus/crosstool: {0}'.format(s))
+
+
+def GetOptionValue(argv, option):
+  """Extract the list of values for option from options.
+
+  Args:
+    option: The option whose value to extract, without the leading '/'.
+
+  Returns:
+    1. A list of values, either directly following the option,
+    (eg., /opt val1 val2) or values collected from multiple occurrences of
+    the option (eg., /opt val1 /opt val2).
+    2. The leftover options.
+  """
+
+  parser = ArgumentParser(prefix_chars='/')
+  parser.add_argument('/' + option, nargs='*', action='append')
+  args, leftover = parser.parse_known_args(argv)
+  if args and vars(args)[option]:
+    return (sum(vars(args)[option], []), leftover)
+  return ([], leftover)
+
+def _update_options(nvcc_options):
+  if NVCC_VERSION in ("7.0",):
+    return nvcc_options
+
+  update_options = { "relaxed-constexpr" : "expt-relaxed-constexpr" }
+  return [ update_options[opt] if opt in update_options else opt
+                    for opt in nvcc_options ]
+
+def GetNvccOptions(argv):
+  """Collect the -nvcc_options values from argv.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+
+  Returns:
+    1. The string that can be passed directly to nvcc.
+    2. The leftover options.
+  """
+
+  parser = ArgumentParser()
+  parser.add_argument('-nvcc_options', nargs='*', action='append')
+
+  args, leftover = parser.parse_known_args(argv)
+
+  if args.nvcc_options:
+    options = _update_options(sum(args.nvcc_options, []))
+    return (['--' + a for a in options], leftover)
+  return ([], leftover)
+
+
+def InvokeNvcc(argv, log=False):
+  """Call nvcc with arguments assembled from argv.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+    log: True if logging is requested.
+
+  Returns:
+    The return value of calling os.system('nvcc ' + args)
+  """
+
+  src_files = [f for f in argv if
+               re.search('\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)]
+  if len(src_files) == 0:
+    raise Error('No source files found for cuda compilation.')
+
+  out_file = [ f for f in argv if f.startswith('/Fo') ]
+  if len(out_file) != 1:
+    raise Error('Please sepecify exactly one output file for cuda compilation.')
+  out = ['-o', out_file[0][len('/Fo'):]]
+
+  nvcc_compiler_options, argv = GetNvccOptions(argv)
+
+  opt_option, argv = GetOptionValue(argv, 'O')
+  opt = ['-g', '-G']
+  if (len(opt_option) > 0 and opt_option[0] != 'd'):
+    opt = ['-O2']
+
+  include_options, argv = GetOptionValue(argv, 'I')
+  includes = ["-I " + include for include in include_options]
+
+  defines, argv = GetOptionValue(argv, 'D')
+  defines = ['-D' + define for define in defines]
+
+  undefines, argv = GetOptionValue(argv, 'U')
+  undefines = ['-U' + define for define in undefines]
+
+  # The rest of the unrecongized options should be passed to host compiler
+  host_compiler_options = [option for option in argv if option not in (src_files + out_file)]
+
+  m_options = ["-m64"]
+
+  nvccopts = ['-D_FORCE_INLINES']
+  for capability in supported_cuda_compute_capabilities:
+    capability = capability.replace('.', '')
+    nvccopts += [r'-gencode=arch=compute_%s,"code=sm_%s,compute_%s"' % (
+        capability, capability, capability)]
+  nvccopts += nvcc_compiler_options
+  nvccopts += undefines
+  nvccopts += defines
+  nvccopts += m_options
+  nvccopts += ['--compiler-options="' + " ".join(host_compiler_options) + '"']
+  nvccopts += ['-x', 'cu'] + opt + includes + out + ['-c'] + src_files
+  # If we don't specify --keep-dir, nvcc will generate intermediate files under TEMP
+  # Put them under NVCC_TEMP_DIR instead, then Bazel can ignore files under NVCC_TEMP_DIR during dependency check
+  # http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#options-for-guiding-compiler-driver
+  # Different actions are sharing NVCC_TEMP_DIR, so we cannot remove it if the directory already exists.
+  if os.path.isfile(NVCC_TEMP_DIR):
+    os.remove(NVCC_TEMP_DIR)
+  if not os.path.exists(NVCC_TEMP_DIR):
+    os.makedirs(NVCC_TEMP_DIR)
+  nvccopts += ['--keep', '--keep-dir', NVCC_TEMP_DIR]
+  cmd = [NVCC_PATH] + nvccopts
+  if log:
+    Log(cmd)
+  proc = subprocess.Popen(cmd,
+                          stdout=sys.stdout,
+                          stderr=sys.stderr,
+                          env=os.environ.copy(),
+                          shell=True)
+  proc.wait()
+  return proc.returncode
+
+def main():
+  parser = ArgumentParser()
+  parser.add_argument('-x', nargs=1)
+  parser.add_argument('--cuda_log', action='store_true')
+  args, leftover = parser.parse_known_args(sys.argv[1:])
+
+  if args.x and args.x[0] == 'cuda':
+    if args.cuda_log: Log('-x cuda')
+    leftover = [pipes.quote(s) for s in leftover]
+    if args.cuda_log: Log('using nvcc')
+    return InvokeNvcc(leftover, log=args.cuda_log)
+
+  # Strip our flags before passing through to the CPU compiler for files which
+  # are not -x cuda. We can't just pass 'leftover' because it also strips -x.
+  # We not only want to pass -x to the CPU compiler, but also keep it in its
+  # relative location in the argv list (the compiler is actually sensitive to
+  # this).
+  cpu_compiler_flags = [flag for flag in sys.argv[1:]
+                             if not flag.startswith(('--cuda_log'))
+                             and not flag.startswith(('-nvcc_options'))]
+
+  return subprocess.call([CPU_COMPILER] + cpu_compiler_flags)
+
+if __name__ == '__main__':
+  sys.exit(main())
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/nccl2/BUILD b/third_party/toolchains/preconfig/ubuntu14.04/nccl2/BUILD
deleted file mode 100755
index 96ed60d3cfe2e6e16b33b884c9be8749d7fd0a4f..0000000000000000000000000000000000000000
--- a/third_party/toolchains/preconfig/ubuntu14.04/nccl2/BUILD
+++ /dev/null
@@ -1,25 +0,0 @@
-filegroup(
-    name = "LICENSE",
-    visibility = ["//visibility:public"],
-)
-
-cc_library(
-    name = "nccl",
-    srcs = ["libnccl.so.2"],
-    hdrs = ["nccl.h"],
-    include_prefix = "third_party/nccl",
-    visibility = ["//visibility:public"],
-    deps = [
-        "@local_config_cuda//cuda:cuda_headers",
-    ],
-)
-
-genrule(
-    name = "nccl-files",
-    outs = [
-        "libnccl.so.2",
-        "nccl.h",
-    ],
-    cmd = """cp "/usr/include/nccl.h" "$(@D)/nccl.h" &&
-           cp "/usr/lib/libnccl.so.2" "$(@D)/libnccl.so.2" """,
-)
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/nccl2/WORKSPACE b/third_party/toolchains/preconfig/ubuntu14.04/nccl2/WORKSPACE
deleted file mode 100644
index 1e6662ac91669df808c82391c68a76292c1cd23d..0000000000000000000000000000000000000000
--- a/third_party/toolchains/preconfig/ubuntu14.04/nccl2/WORKSPACE
+++ /dev/null
@@ -1,2 +0,0 @@
-# DO NOT EDIT: automatically generated WORKSPACE file for nccl_configure rule
-workspace(name = "local_config_nccl")
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/BUILD b/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/BUILD
new file mode 100755
index 0000000000000000000000000000000000000000..75e2b7fd3361c079668e76db9fb416c324553e83
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/BUILD
@@ -0,0 +1,48 @@
+# NVIDIA TensorRT
+# A high-performance deep learning inference optimizer and runtime.
+
+licenses(["notice"])
+
+exports_files(["LICENSE"])
+
+load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts")
+
+package(default_visibility = ["//visibility:public"])
+
+cc_library(
+    name = "tensorrt_headers",
+    hdrs = [":tensorrt_include"],
+    include_prefix = "",
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "tensorrt",
+    srcs = ["tensorrt/lib/libnvinfer.so.5"],
+    copts = cuda_default_copts(),
+    data = ["tensorrt/lib/libnvinfer.so.5"],
+    include_prefix = "",
+    linkstatic = 1,
+    visibility = ["//visibility:public"],
+    deps = [
+        ":tensorrt_headers",
+        "@local_config_cuda//cuda",
+    ],
+)
+
+genrule(
+    name = "tensorrt_lib",
+    outs = [
+        "tensorrt/lib/libnvinfer.so.5",
+    ],
+    cmd = """cp -f "/usr/lib/x86_64-linux-gnu/libnvinfer.so.5" $(location tensorrt/lib/libnvinfer.so.5) """,
+)
+
+genrule(
+    name = "tensorrt_include",
+    outs = [
+        "tensorrt/include/NvInfer.h",
+        "tensorrt/include/NvUtils.h",
+    ],
+    cmd = """cp -f "/usr/include/x86_64-linux-gnu/NvInfer.h" $(location tensorrt/include/NvInfer.h) && cp -f "/usr/include/x86_64-linux-gnu/NvUtils.h" $(location tensorrt/include/NvUtils.h) """,
+)
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/WORKSPACE b/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/WORKSPACE
new file mode 100644
index 0000000000000000000000000000000000000000..ce47f14b91bf5249f9face7e486cde60b9d2d669
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/WORKSPACE
@@ -0,0 +1,2 @@
+# DO NOT EDIT: automatically generated WORKSPACE file for tensorrt_configure rule
+workspace(name = "local_config_tensorrt")
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/build_defs.bzl b/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/build_defs.bzl
new file mode 100755
index 0000000000000000000000000000000000000000..527be93834197de6d72fbc70a6dd25e4fb893900
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/build_defs.bzl
@@ -0,0 +1,5 @@
+# Build configurations for TensorRT.
+
+def if_tensorrt(if_true, if_false = []):
+    """Tests whether TensorRT was enabled during the configure process."""
+    return if_true
diff --git a/third_party/toolchains/preconfig/ubuntu16.04/clang/BUILD b/third_party/toolchains/preconfig/ubuntu16.04/clang/BUILD
new file mode 100755
index 0000000000000000000000000000000000000000..5a0c52f66ab2224c0b021875d0447ee638e833c4
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu16.04/clang/BUILD
@@ -0,0 +1,111 @@
+# Copyright 2016 The Bazel Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This becomes the BUILD file for @local_config_cc// under non-FreeBSD unixes.
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+cc_library(
+    name = "malloc",
+)
+
+cc_library(
+    name = "stl",
+)
+
+filegroup(
+    name = "empty",
+    srcs = [],
+)
+
+filegroup(
+    name = "cc_wrapper",
+    srcs = ["cc_wrapper.sh"],
+)
+
+filegroup(
+    name = "compiler_deps",
+    srcs = glob(["extra_tools/**"]) + [":empty"],
+)
+
+# This is the entry point for --crosstool_top.  Toolchains are found
+# by lopping off the name of --crosstool_top and searching for
+# the "${CPU}" entry in the toolchains attribute.
+cc_toolchain_suite(
+    name = "toolchain",
+    toolchains = {
+        "k8|clang": ":cc-compiler-k8",
+        "k8": ":cc-compiler-k8",
+        "armeabi-v7a|compiler": ":cc-compiler-armeabi-v7a",
+        "armeabi-v7a": ":cc-compiler-armeabi-v7a",
+    },
+)
+
+cc_toolchain(
+    name = "cc-compiler-k8",
+    all_files = ":compiler_deps",
+    compiler_files = ":compiler_deps",
+    cpu = "k8",
+    dwp_files = ":empty",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":compiler_deps",
+    objcopy_files = ":empty",
+    static_runtime_libs = [":empty"],
+    strip_files = ":empty",
+    supports_param_files = 1,
+    toolchain_identifier = "linux_gnu_x86",
+)
+
+toolchain(
+    name = "cc-toolchain-k8",
+    exec_compatible_with = [
+        # TODO(katre): add autodiscovered constraints for host CPU and OS.
+    ],
+    target_compatible_with = [
+        # TODO(katre): add autodiscovered constraints for host CPU and OS.
+    ],
+    toolchain = ":cc-compiler-k8",
+    toolchain_type = "@bazel_tools//tools/cpp:toolchain_type",
+)
+
+# Android tooling requires a default toolchain for the armeabi-v7a cpu.
+cc_toolchain(
+    name = "cc-compiler-armeabi-v7a",
+    all_files = ":empty",
+    compiler_files = ":empty",
+    cpu = "local",
+    dwp_files = ":empty",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":empty",
+    objcopy_files = ":empty",
+    static_runtime_libs = [":empty"],
+    strip_files = ":empty",
+    supports_param_files = 1,
+    toolchain_identifier = "stub_armeabi-v7a",
+)
+
+toolchain(
+    name = "cc-toolchain-armeabi-v7a",
+    exec_compatible_with = [
+        # TODO(katre): add autodiscovered constraints for host CPU and OS.
+    ],
+    target_compatible_with = [
+        "@bazel_tools//platforms:arm",
+        "@bazel_tools//platforms:android",
+    ],
+    toolchain = ":cc-compiler-armabi-v7a",
+    toolchain_type = "@bazel_tools//tools/cpp:toolchain_type",
+)
diff --git a/third_party/toolchains/preconfig/ubuntu16.04/clang/CROSSTOOL b/third_party/toolchains/preconfig/ubuntu16.04/clang/CROSSTOOL
new file mode 100755
index 0000000000000000000000000000000000000000..48f82eb35d5b2268a758bb0ebb36e243663ca372
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu16.04/clang/CROSSTOOL
@@ -0,0 +1,1209 @@
+# Copyright 2016 The Bazel Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+major_version: "local"
+minor_version: ""
+
+# Android tooling requires a default toolchain for the armeabi-v7a cpu.
+toolchain {
+  abi_version: "armeabi-v7a"
+  abi_libc_version: "armeabi-v7a"
+  builtin_sysroot: ""
+  compiler: "compiler"
+  host_system_name: "armeabi-v7a"
+  needsPic: true
+  supports_gold_linker: false
+  supports_incremental_linker: false
+  supports_fission: false
+  supports_interface_shared_objects: false
+  supports_normalizing_ar: false
+  supports_start_end_lib: false
+  target_libc: "armeabi-v7a"
+  target_cpu: "armeabi-v7a"
+  target_system_name: "armeabi-v7a"
+  toolchain_identifier: "stub_armeabi-v7a"
+
+  tool_path { name: "ar" path: "/bin/false" }
+  tool_path { name: "compat-ld" path: "/bin/false" }
+  tool_path { name: "cpp" path: "/bin/false" }
+  tool_path { name: "dwp" path: "/bin/false" }
+  tool_path { name: "gcc" path: "/bin/false" }
+  tool_path { name: "gcov" path: "/bin/false" }
+  tool_path { name: "ld" path: "/bin/false" }
+
+  tool_path { name: "nm" path: "/bin/false" }
+  tool_path { name: "objcopy" path: "/bin/false" }
+  tool_path { name: "objdump" path: "/bin/false" }
+  tool_path { name: "strip" path: "/bin/false" }
+  linking_mode_flags { mode: DYNAMIC }
+}
+
+toolchain {
+  toolchain_identifier: "linux_gnu_x86"
+  abi_version: "gcc"
+  abi_libc_version: "glibc_2.19"
+  builtin_sysroot: ""
+  compiler: "clang"
+  host_system_name: "i686-unknown-linux-gnu"
+  needsPic: true
+  supports_gold_linker: true
+  supports_incremental_linker: false
+  supports_fission: false
+  supports_interface_shared_objects: false
+  supports_normalizing_ar: false
+  supports_start_end_lib: true
+  target_libc: "glibc_2.19"
+  target_cpu: "k8"
+  target_system_name: "x86_64-unknown-linux-gnu"
+  cxx_flag: "-std=c++0x"
+  linker_flag: "-fuse-ld=gold"
+  linker_flag: "-Wl,-no-as-needed"
+  linker_flag: "-Wl,-z,relro,-z,now"
+  linker_flag: "-B/usr/local/bin"
+  linker_flag: "-lstdc++"
+  linker_flag: "-lm"
+  cxx_builtin_include_directory: "/usr/local/include"
+  cxx_builtin_include_directory: "/usr/local/lib/clang/7.0.0/include"
+  cxx_builtin_include_directory: "/usr/include/x86_64-linux-gnu"
+  cxx_builtin_include_directory: "/usr/include"
+  cxx_builtin_include_directory: "/usr/include/c++/4.9"
+  cxx_builtin_include_directory: "/usr/include/x86_64-linux-gnu/c++/4.9"
+  cxx_builtin_include_directory: "/usr/include/c++/4.9/backward"
+  objcopy_embed_flag: "-I"
+  objcopy_embed_flag: "binary"
+  unfiltered_cxx_flag: "-no-canonical-prefixes"
+  unfiltered_cxx_flag: "-Wno-builtin-macro-redefined"
+  unfiltered_cxx_flag: "-D__DATE__=\"redacted\""
+  unfiltered_cxx_flag: "-D__TIMESTAMP__=\"redacted\""
+  unfiltered_cxx_flag: "-D__TIME__=\"redacted\""
+  compiler_flag: "-U_FORTIFY_SOURCE"
+  compiler_flag: "-fstack-protector"
+  compiler_flag: "-Wall"
+  compiler_flag: "-Wthread-safety"
+  compiler_flag: "-Wself-assign"
+  compiler_flag: "-fcolor-diagnostics"
+  compiler_flag: "-fno-omit-frame-pointer"
+  tool_path {name: "ar" path: "/usr/bin/ar" }
+  tool_path {name: "ld" path: "/usr/bin/ld" }
+  tool_path {name: "cpp" path: "/usr/bin/cpp" }
+  tool_path {name: "gcc" path: "/usr/local/bin/clang" }
+  tool_path {name: "dwp" path: "/usr/bin/dwp" }
+  tool_path {name: "gcov" path: "None" }
+  tool_path {name: "nm" path: "/usr/bin/nm" }
+  tool_path {name: "objcopy" path: "/usr/bin/objcopy" }
+  tool_path {name: "objdump" path: "/usr/bin/objdump" }
+  tool_path {name: "strip" path: "/usr/bin/strip" }
+
+  compilation_mode_flags {
+    mode: DBG
+    compiler_flag: "-g"
+  }
+  compilation_mode_flags {
+    mode: OPT
+    compiler_flag: "-g0"
+    compiler_flag: "-O2"
+    compiler_flag: "-D_FORTIFY_SOURCE=1"
+    compiler_flag: "-DNDEBUG"
+    compiler_flag: "-ffunction-sections"
+    compiler_flag: "-fdata-sections"
+    linker_flag: "-Wl,--gc-sections"
+  }
+  linking_mode_flags { mode: DYNAMIC }
+
+
+    feature {
+      name: 'coverage'
+      provides: 'profile'
+      flag_set {
+        action: 'preprocess-assemble'
+        action: 'c-compile'
+        action: 'c++-compile'
+        action: 'c++-header-parsing'
+        action: 'c++-module-compile'
+        flag_group {
+        flag: '--coverage'
+      }
+      }
+      flag_set {
+        action: 'c++-link-dynamic-library'
+        action: 'c++-link-nodeps-dynamic-library'
+        action: 'c++-link-executable'
+        flag_group {
+        flag: '--coverage'
+      }
+      }
+    }
+  
+
+  feature {
+    name: 'fdo_optimize'
+    provides: 'profile'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      expand_if_all_available: 'fdo_profile_path'
+      flag_group {
+        flag: '-fprofile-use=%{fdo_profile_path}'
+        flag: '-fprofile-correction',
+      }
+    }
+  }
+}
+
+toolchain {
+  toolchain_identifier: "msys_x64_mingw"
+  abi_version: "local"
+  abi_libc_version: "local"
+  builtin_sysroot: ""
+  compiler: "mingw-gcc"
+  host_system_name: "local"
+  needsPic: false
+  target_libc: "mingw"
+  target_cpu: "x64_windows"
+  target_system_name: "local"
+
+  artifact_name_pattern {
+     category_name: 'executable'
+     prefix: ''
+     extension: '.exe'
+  }
+
+
+
+  linking_mode_flags { mode: DYNAMIC }
+}
+
+toolchain {
+  toolchain_identifier: "msvc_x64"
+  host_system_name: "local"
+  target_system_name: "local"
+
+  abi_version: "local"
+  abi_libc_version: "local"
+  target_cpu: "x64_windows"
+  compiler: "msvc-cl"
+  target_libc: "msvcrt"
+  default_python_version: "python2.7"
+
+
+
+  tool_path {
+    name: "ar"
+    path: ""
+  }
+  tool_path {
+    name: "ml"
+    path: ""
+  }
+  tool_path {
+    name: "cpp"
+    path: ""
+  }
+  tool_path {
+    name: "gcc"
+    path: ""
+  }
+  tool_path {
+    name: "gcov"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "ld"
+    path: ""
+  }
+  tool_path {
+    name: "nm"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "objcopy"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "objdump"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "strip"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  supports_gold_linker: false
+  supports_start_end_lib: false
+  supports_interface_shared_objects: true
+  supports_incremental_linker: false
+  supports_normalizing_ar: true
+  needsPic: false
+
+  # TODO(pcloudy): Review those flags below, they should be defined by cl.exe
+  compiler_flag: "/DCOMPILER_MSVC"
+
+  # Don't define min/max macros in windows.h.
+  compiler_flag: "/DNOMINMAX"
+
+  # Platform defines.
+  compiler_flag: "/D_WIN32_WINNT=0x0601"
+  # Turn off warning messages.
+  compiler_flag: "/D_CRT_SECURE_NO_DEPRECATE"
+  compiler_flag: "/D_CRT_SECURE_NO_WARNINGS"
+
+  # Useful options to have on for compilation.
+  # Increase the capacity of object files to 2^32 sections.
+  compiler_flag: "/bigobj"
+  # Allocate 500MB for precomputed headers.
+  compiler_flag: "/Zm500"
+  # Catch C++ exceptions only and tell the compiler to assume that functions declared
+  # as extern "C" never throw a C++ exception.
+  compiler_flag: "/EHsc"
+
+  # Globally disabled warnings.
+  # Don't warn about elements of array being be default initialized.
+  compiler_flag: "/wd4351"
+  # Don't warn about no matching delete found.
+  compiler_flag: "/wd4291"
+  # Don't warn about diamond inheritance patterns.
+  compiler_flag: "/wd4250"
+  # Don't warn about insecure functions (e.g. non _s functions).
+  compiler_flag: "/wd4996"
+
+  linker_flag: "/MACHINE:X64"
+
+  feature {
+    name: "no_legacy_features"
+  }
+
+  artifact_name_pattern {
+     category_name: 'object_file'
+     prefix: ''
+     extension: '.obj'
+  }
+
+  artifact_name_pattern {
+     category_name: 'static_library'
+     prefix: ''
+     extension: '.lib'
+  }
+
+  artifact_name_pattern {
+     category_name: 'alwayslink_static_library'
+     prefix: ''
+     extension: '.lo.lib'
+  }
+
+  artifact_name_pattern {
+     category_name: 'executable'
+     prefix: ''
+     extension: '.exe'
+  }
+
+  artifact_name_pattern {
+     category_name: 'dynamic_library'
+     prefix: ''
+     extension: '.dll'
+  }
+
+  artifact_name_pattern {
+     category_name: 'interface_library'
+     prefix: ''
+     extension: '.if.lib'
+  }
+
+  # Suppress startup banner.
+  feature {
+    name: "nologo"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-module-compile"
+      action: "c++-module-codegen"
+      action: "c++-header-parsing"
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-static-library"
+      flag_group {
+        flag: "/nologo"
+      }
+    }
+  }
+
+  feature {
+    name: 'has_configured_linker_path'
+  }
+
+  # This feature indicates strip is not supported, building stripped binary will just result a copy of orignial binary
+  feature {
+    name: 'no_stripping'
+  }
+
+  # This feature indicates this is a toolchain targeting Windows.
+  feature {
+    name: 'targets_windows'
+    implies: 'copy_dynamic_libraries_to_binary'
+    enabled: true
+  }
+
+  feature {
+    name: 'copy_dynamic_libraries_to_binary'
+  }
+
+  action_config {
+    config_name: 'assemble'
+    action_name: 'assemble'
+    tool {
+      tool_path: ''
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'sysroot'
+  }
+
+  action_config {
+    config_name: 'preprocess-assemble'
+    action_name: 'preprocess-assemble'
+    tool {
+      tool_path: ''
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'sysroot'
+  }
+
+  action_config {
+    config_name: 'c-compile'
+    action_name: 'c-compile'
+    tool {
+      tool_path: ''
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'legacy_compile_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'parse_showincludes'
+    implies: 'user_compile_flags'
+    implies: 'sysroot'
+    implies: 'unfiltered_compile_flags'
+  }
+
+  action_config {
+    config_name: 'c++-compile'
+    action_name: 'c++-compile'
+    tool {
+      tool_path: ''
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'legacy_compile_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'parse_showincludes'
+    implies: 'user_compile_flags'
+    implies: 'sysroot'
+    implies: 'unfiltered_compile_flags'
+  }
+
+  action_config {
+    config_name: 'c++-link-executable'
+    action_name: 'c++-link-executable'
+    tool {
+      tool_path: ''
+    }
+    implies: 'nologo'
+    implies: 'linkstamps'
+    implies: 'output_execpath_flags'
+    implies: 'input_param_flags'
+    implies: 'user_link_flags'
+    implies: 'legacy_link_flags'
+    implies: 'linker_subsystem_flag'
+    implies: 'linker_param_file'
+    implies: 'msvc_env'
+    implies: 'no_stripping'
+  }
+
+  action_config {
+    config_name: 'c++-link-dynamic-library'
+    action_name: 'c++-link-dynamic-library'
+    tool {
+      tool_path: ''
+    }
+    implies: 'nologo'
+    implies: 'shared_flag'
+    implies: 'linkstamps'
+    implies: 'output_execpath_flags'
+    implies: 'input_param_flags'
+    implies: 'user_link_flags'
+    implies: 'legacy_link_flags'
+    implies: 'linker_subsystem_flag'
+    implies: 'linker_param_file'
+    implies: 'msvc_env'
+    implies: 'no_stripping'
+    implies: 'has_configured_linker_path'
+    implies: 'def_file'
+  }
+
+  action_config {
+      config_name: 'c++-link-nodeps-dynamic-library'
+      action_name: 'c++-link-nodeps-dynamic-library'
+      tool {
+        tool_path: ''
+      }
+      implies: 'nologo'
+      implies: 'shared_flag'
+      implies: 'linkstamps'
+      implies: 'output_execpath_flags'
+      implies: 'input_param_flags'
+      implies: 'user_link_flags'
+      implies: 'legacy_link_flags'
+      implies: 'linker_subsystem_flag'
+      implies: 'linker_param_file'
+      implies: 'msvc_env'
+      implies: 'no_stripping'
+      implies: 'has_configured_linker_path'
+      implies: 'def_file'
+    }
+
+  action_config {
+    config_name: 'c++-link-static-library'
+    action_name: 'c++-link-static-library'
+    tool {
+      tool_path: ''
+    }
+    implies: 'nologo'
+    implies: 'archiver_flags'
+    implies: 'input_param_flags'
+    implies: 'linker_param_file'
+    implies: 'msvc_env'
+  }
+
+  # TODO(b/65151735): Remove legacy_compile_flags feature when legacy fields are
+  # not used in this crosstool
+  feature {
+    name: 'legacy_compile_flags'
+    flag_set {
+      expand_if_all_available: 'legacy_compile_flags'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        iterate_over: 'legacy_compile_flags'
+        flag: '%{legacy_compile_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: "msvc_env"
+    env_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-module-compile"
+      action: "c++-module-codegen"
+      action: "c++-header-parsing"
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-static-library"
+      env_entry {
+        key: "PATH"
+        value: ""
+      }
+      env_entry {
+        key: "TMP"
+        value: ""
+      }
+      env_entry {
+        key: "TEMP"
+        value: ""
+      }
+    }
+    implies: 'msvc_compile_env'
+    implies: 'msvc_link_env'
+  }
+
+  feature {
+    name: "msvc_compile_env"
+    env_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-module-compile"
+      action: "c++-module-codegen"
+      action: "c++-header-parsing"
+      action: "assemble"
+      action: "preprocess-assemble"
+      env_entry {
+        key: "INCLUDE"
+        value: ""
+      }
+    }
+  }
+
+  feature {
+    name: "msvc_link_env"
+    env_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-static-library"
+      env_entry {
+        key: "LIB"
+        value: ""
+      }
+    }
+  }
+
+  feature {
+    name: 'include_paths'
+    flag_set {
+      action: "assemble"
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      flag_group {
+        iterate_over: 'quote_include_paths'
+        flag: '/I%{quote_include_paths}'
+      }
+      flag_group {
+        iterate_over: 'include_paths'
+        flag: '/I%{include_paths}'
+      }
+      flag_group {
+        iterate_over: 'system_include_paths'
+        flag: '/I%{system_include_paths}'
+      }
+    }
+  }
+
+  feature {
+    name: "preprocessor_defines"
+    flag_set {
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-header-parsing"
+      action: "c++-module-compile"
+      flag_group {
+        flag: "/D%{preprocessor_defines}"
+        iterate_over: "preprocessor_defines"
+      }
+    }
+  }
+
+  # Tell Bazel to parse the output of /showIncludes
+  feature {
+    name: 'parse_showincludes'
+    flag_set {
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-module-compile'
+      action: 'c++-header-parsing'
+      flag_group {
+        flag: "/showIncludes"
+      }
+    }
+  }
+
+
+  feature {
+    name: 'generate_pdb_file'
+    requires: {
+      feature: 'dbg'
+    }
+    requires: {
+      feature: 'fastbuild'
+    }
+  }
+
+  feature {
+    name: 'shared_flag'
+    flag_set {
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: '/DLL'
+      }
+    }
+  }
+
+  feature {
+    name: 'linkstamps'
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      expand_if_all_available: 'linkstamp_paths'
+      flag_group {
+        iterate_over: 'linkstamp_paths'
+        flag: '%{linkstamp_paths}'
+      }
+    }
+  }
+
+  feature {
+    name: 'output_execpath_flags'
+    flag_set {
+      expand_if_all_available: 'output_execpath'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: '/OUT:%{output_execpath}'
+      }
+    }
+  }
+
+  feature {
+    name: 'archiver_flags'
+    flag_set {
+      expand_if_all_available: 'output_execpath'
+      action: 'c++-link-static-library'
+      flag_group {
+        flag: '/OUT:%{output_execpath}'
+      }
+    }
+  }
+
+  feature {
+    name: 'input_param_flags'
+    flag_set {
+      expand_if_all_available: 'interface_library_output_path'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/IMPLIB:%{interface_library_output_path}"
+      }
+    }
+    flag_set {
+      expand_if_all_available: 'libopts'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'libopts'
+        flag: '%{libopts}'
+      }
+    }
+    flag_set {
+      expand_if_all_available: 'libraries_to_link'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      action: 'c++-link-static-library'
+      flag_group {
+        iterate_over: 'libraries_to_link'
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'object_file_group'
+          }
+          iterate_over: 'libraries_to_link.object_files'
+          flag_group {
+            flag: '%{libraries_to_link.object_files}'
+          }
+        }
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'object_file'
+          }
+          flag_group {
+            flag: '%{libraries_to_link.name}'
+          }
+        }
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'interface_library'
+          }
+          flag_group {
+            flag: '%{libraries_to_link.name}'
+          }
+        }
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'static_library'
+          }
+          flag_group {
+            expand_if_false: 'libraries_to_link.is_whole_archive'
+            flag: '%{libraries_to_link.name}'
+          }
+          flag_group {
+            expand_if_true: 'libraries_to_link.is_whole_archive'
+            flag: '/WHOLEARCHIVE:%{libraries_to_link.name}'
+          }
+        }
+      }
+    }
+  }
+
+  # Since this feature is declared earlier in the CROSSTOOL than
+  # "user_link_flags", this feature will be applied prior to it anwyhere they
+  # are both implied. And since "user_link_flags" contains the linkopts from
+  # the build rule, this allows the user to override the /SUBSYSTEM in the BUILD
+  # file.
+  feature {
+    name: 'linker_subsystem_flag'
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: '/SUBSYSTEM:CONSOLE'
+      }
+    }
+  }
+
+  # The "user_link_flags" contains user-defined linkopts (from build rules)
+  # so it should be defined after features that declare user-overridable flags.
+  # For example the "linker_subsystem_flag" defines a default "/SUBSYSTEM" flag
+  # but we want to let the user override it, therefore "link_flag_subsystem" is
+  # defined earlier in the CROSSTOOL file than "user_link_flags".
+  feature {
+    name: 'user_link_flags'
+    flag_set {
+      expand_if_all_available: 'user_link_flags'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'user_link_flags'
+        flag: '%{user_link_flags}'
+      }
+    }
+  }
+  feature {
+    name: 'legacy_link_flags'
+    flag_set {
+      expand_if_all_available: 'legacy_link_flags'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'legacy_link_flags'
+        flag: '%{legacy_link_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: 'linker_param_file'
+    flag_set {
+      expand_if_all_available: 'linker_param_file'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      action: 'c++-link-static-library'
+      flag_group {
+        flag: '@%{linker_param_file}'
+      }
+    }
+  }
+
+  feature {
+    name: 'static_link_msvcrt'
+  }
+
+  feature {
+    name: 'static_link_msvcrt_no_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MT"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:libcmt.lib"
+      }
+    }
+    requires: { feature: 'fastbuild'}
+    requires: { feature: 'opt'}
+  }
+
+  feature {
+    name: 'dynamic_link_msvcrt_no_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MD"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:msvcrt.lib"
+      }
+    }
+    requires: { feature: 'fastbuild'}
+    requires: { feature: 'opt'}
+  }
+
+  feature {
+    name: 'static_link_msvcrt_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MTd"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:libcmtd.lib"
+      }
+    }
+    requires: { feature: 'dbg'}
+  }
+
+  feature {
+    name: 'dynamic_link_msvcrt_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MDd"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:msvcrtd.lib"
+      }
+    }
+    requires: { feature: 'dbg'}
+  }
+
+  feature {
+    name: 'dbg'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/Od"
+        flag: "/Z7"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: ""
+        flag: "/INCREMENTAL:NO"
+      }
+    }
+    implies: 'generate_pdb_file'
+  }
+
+  feature {
+    name: 'fastbuild'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/Od"
+        flag: "/Z7"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: ""
+        flag: "/INCREMENTAL:NO"
+      }
+    }
+    implies: 'generate_pdb_file'
+  }
+
+  feature {
+    name: 'opt'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/O2" # Implies /Og /Oi /Ot /Oy /Ob2 /Gs /GF /Gy
+      }
+    }
+    implies: 'frame_pointer'
+  }
+
+  # Keep stack frames for debugging, even in opt mode.
+  # Must come after /O1, /O2 and /Ox.
+  feature {
+    name: "frame_pointer"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "/Oy-"
+      }
+    }
+  }
+
+  # Remove assert/DCHECKs in opt mode.
+  # You can have them back with --features=-disable_assertions.
+  feature {
+    name: 'disable_assertions'
+    enabled: true
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      with_feature: {
+        feature: 'opt'
+      }
+      flag_group {
+        flag: "/DNDEBUG"
+      }
+    }
+  }
+
+  feature {
+    name: "determinism"
+    enabled: true
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # Make C++ compilation deterministic. Use linkstamping instead of these
+        # compiler symbols.
+        # TODO: detect clang on Windows and use "-Wno-builtin-macro-redefined"
+        flag: "/wd4117" # Trying to define or undefine a predefined macro
+        flag: "-D__DATE__=\"redacted\""
+        flag: "-D__TIMESTAMP__=\"redacted\""
+        flag: "-D__TIME__=\"redacted\""
+      }
+    }
+  }
+
+  feature {
+    name: 'treat_warnings_as_errors'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/WX"
+      }
+    }
+  }
+
+  # Trade slower build time for smaller binary
+  feature {
+    name: 'smaller_binary'
+    enabled: true
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      with_feature: {
+        feature: 'opt'
+      }
+      flag_group {
+        flag: "/Gy" # Enable function-level linking (-ffunction-sections)
+        flag: "/Gw" # Optimize global data (-fdata-sections)
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library',
+      action: 'c++-link-nodeps-dynamic-library'
+      with_feature: {
+        feature: 'opt'
+      }
+      flag_group {
+        flag: '/OPT:ICF' # Fold identical functions
+        flag: '/OPT:REF' # Eliminate unreferenced functions and data
+      }
+    }
+  }
+
+  # Suppress warnings that most users do not care
+  feature {
+    name: 'ignore_noisy_warnings'
+    enabled: true
+    flag_set {
+      action: 'c++-link-static-library'
+      flag_group {
+        # Suppress 'object file does not define any public symbols' warning
+        flag: '/ignore:4221'
+      }
+    }
+  }
+
+  feature {
+    name: 'user_compile_flags'
+    flag_set {
+      expand_if_all_available: 'user_compile_flags'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        iterate_over: 'user_compile_flags'
+        flag: '%{user_compile_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: 'sysroot'
+    flag_set {
+      expand_if_all_available: 'sysroot'
+      action: 'assemble'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'sysroot'
+        flag: '--sysroot=%{sysroot}'
+      }
+    }
+  }
+
+  feature {
+    name: 'unfiltered_compile_flags'
+    flag_set {
+      expand_if_all_available: 'unfiltered_compile_flags'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        iterate_over: 'unfiltered_compile_flags'
+        flag: '%{unfiltered_compile_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: 'compiler_output_flags'
+    flag_set {
+      action: 'assemble'
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_none_available: 'output_assembly_file'
+        expand_if_none_available: 'output_preprocess_file'
+        flag: '/Fo%{output_file}'
+        flag: '/Zi'
+      }
+    }
+    flag_set {
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_none_available: 'output_assembly_file'
+        expand_if_none_available: 'output_preprocess_file'
+        flag: '/Fo%{output_file}'
+      }
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_all_available: 'output_assembly_file'
+        flag: '/Fa%{output_file}'
+      }
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_all_available: 'output_preprocess_file'
+        flag: '/P'
+        flag: '/Fi%{output_file}'
+      }
+    }
+  }
+
+  feature {
+    name: 'compiler_input_flags'
+    flag_set {
+      action: 'assemble'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        expand_if_all_available: 'source_file'
+        flag: '/c'
+        flag: '%{source_file}'
+      }
+    }
+  }
+
+  feature {
+    name : 'def_file',
+    flag_set {
+      expand_if_all_available: 'def_file_path'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEF:%{def_file_path}"
+        # We can specify a different DLL name in DEF file, /ignore:4070 suppresses
+        # the warning message about DLL name doesn't match the default one.
+        # See https://msdn.microsoft.com/en-us/library/sfkk2fz7.aspx
+        flag: "/ignore:4070"
+      }
+    }
+  }
+
+  feature {
+    name: 'windows_export_all_symbols'
+  }
+
+  feature {
+    name: 'no_windows_export_all_symbols'
+  }
+
+  linking_mode_flags { mode: DYNAMIC }
+}
diff --git a/third_party/toolchains/preconfig/ubuntu16.04/clang/WORKSPACE b/third_party/toolchains/preconfig/ubuntu16.04/clang/WORKSPACE
new file mode 100644
index 0000000000000000000000000000000000000000..bc05b4c36ff49949e18a9c6f08b03d541149ede1
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu16.04/clang/WORKSPACE
@@ -0,0 +1,2 @@
+# DO NOT EDIT: automatically generated WORKSPACE file for cc_autoconf rule
+workspace(name = "local_config_cc")
diff --git a/third_party/toolchains/preconfig/ubuntu16.04/clang/cc_wrapper.sh b/third_party/toolchains/preconfig/ubuntu16.04/clang/cc_wrapper.sh
new file mode 100755
index 0000000000000000000000000000000000000000..42a751dccfb0d9c7115ef5ed5483335c0e0f129b
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu16.04/clang/cc_wrapper.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+#
+# Copyright 2015 The Bazel Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Ship the environment to the C++ action
+#
+set -eu
+
+# Set-up the environment
+
+
+# Call the C++ compiler
+/usr/local/bin/clang "$@"
diff --git a/third_party/toolchains/preconfig/ubuntu16.04/clang/dummy_toolchain.bzl b/third_party/toolchains/preconfig/ubuntu16.04/clang/dummy_toolchain.bzl
new file mode 100755
index 0000000000000000000000000000000000000000..45c0285d232806672e93cb6d9b860b2693e75d3d
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu16.04/clang/dummy_toolchain.bzl
@@ -0,0 +1,23 @@
+# pylint: disable=g-bad-file-header
+# Copyright 2017 The Bazel Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Skylark rule that stubs a toolchain."""
+
+def _dummy_toolchain_impl(ctx):
+    ctx = ctx  # unused argument
+    toolchain = platform_common.ToolchainInfo()
+    return [toolchain]
+
+dummy_toolchain = rule(_dummy_toolchain_impl, attrs = {})
diff --git a/third_party/toolchains/preconfig/ubuntu16.04/clang/tools/cpp/empty.cc b/third_party/toolchains/preconfig/ubuntu16.04/clang/tools/cpp/empty.cc
new file mode 100755
index 0000000000000000000000000000000000000000..c272dabaeb6829b5ded592b4b37194ef3af364dd
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu16.04/clang/tools/cpp/empty.cc
@@ -0,0 +1 @@
+int main() {}
\ No newline at end of file
diff --git a/third_party/toolchains/preconfig/ubuntu16.04/py3/BUILD b/third_party/toolchains/preconfig/ubuntu16.04/py3/BUILD
new file mode 100755
index 0000000000000000000000000000000000000000..77eaa4d5121c32f2a4d58f3bb0fb470b72c9f0f6
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu16.04/py3/BUILD
@@ -0,0 +1,205 @@
+licenses(["restricted"])
+
+package(default_visibility = ["//visibility:public"])
+
+# To build Python C/C++ extension on Windows, we need to link to python import library pythonXY.lib
+# See https://docs.python.org/3/extending/windows.html
+cc_import(
+    name = "python_lib",
+    interface_library = select({
+        ":windows": ":python_import_lib",
+        # A placeholder for Unix platforms which makes --no_build happy.
+        "//conditions:default": "not-existing.lib",
+    }),
+    system_provided = 1,
+)
+
+cc_library(
+    name = "python_headers",
+    hdrs = [":python_include"],
+    includes = ["python_include"],
+    deps = select({
+        ":windows": [":python_lib"],
+        "//conditions:default": [],
+    }),
+)
+
+cc_library(
+    name = "numpy_headers",
+    hdrs = [":numpy_include"],
+    includes = ["numpy_include"],
+)
+
+config_setting(
+    name = "windows",
+    values = {"cpu": "x64_windows"},
+    visibility = ["//visibility:public"],
+)
+
+genrule(
+    name = "python_include",
+    outs = [
+        "python_include/Python-ast.h",
+        "python_include/Python.h",
+        "python_include/abstract.h",
+        "python_include/accu.h",
+        "python_include/asdl.h",
+        "python_include/ast.h",
+        "python_include/bitset.h",
+        "python_include/bltinmodule.h",
+        "python_include/boolobject.h",
+        "python_include/bytearrayobject.h",
+        "python_include/bytes_methods.h",
+        "python_include/bytesobject.h",
+        "python_include/cellobject.h",
+        "python_include/ceval.h",
+        "python_include/classobject.h",
+        "python_include/code.h",
+        "python_include/codecs.h",
+        "python_include/compile.h",
+        "python_include/complexobject.h",
+        "python_include/datetime.h",
+        "python_include/descrobject.h",
+        "python_include/dictobject.h",
+        "python_include/dtoa.h",
+        "python_include/dynamic_annotations.h",
+        "python_include/enumobject.h",
+        "python_include/errcode.h",
+        "python_include/eval.h",
+        "python_include/fileobject.h",
+        "python_include/fileutils.h",
+        "python_include/floatobject.h",
+        "python_include/frameobject.h",
+        "python_include/funcobject.h",
+        "python_include/genobject.h",
+        "python_include/graminit.h",
+        "python_include/grammar.h",
+        "python_include/import.h",
+        "python_include/intrcheck.h",
+        "python_include/iterobject.h",
+        "python_include/listobject.h",
+        "python_include/longintrepr.h",
+        "python_include/longobject.h",
+        "python_include/marshal.h",
+        "python_include/memoryobject.h",
+        "python_include/metagrammar.h",
+        "python_include/methodobject.h",
+        "python_include/modsupport.h",
+        "python_include/moduleobject.h",
+        "python_include/namespaceobject.h",
+        "python_include/node.h",
+        "python_include/numpy/__multiarray_api.h",
+        "python_include/numpy/__ufunc_api.h",
+        "python_include/numpy/_neighborhood_iterator_imp.h",
+        "python_include/numpy/_numpyconfig.h",
+        "python_include/numpy/arrayobject.h",
+        "python_include/numpy/arrayscalars.h",
+        "python_include/numpy/halffloat.h",
+        "python_include/numpy/multiarray_api.txt",
+        "python_include/numpy/ndarrayobject.h",
+        "python_include/numpy/ndarraytypes.h",
+        "python_include/numpy/noprefix.h",
+        "python_include/numpy/npy_1_7_deprecated_api.h",
+        "python_include/numpy/npy_3kcompat.h",
+        "python_include/numpy/npy_common.h",
+        "python_include/numpy/npy_cpu.h",
+        "python_include/numpy/npy_endian.h",
+        "python_include/numpy/npy_interrupt.h",
+        "python_include/numpy/npy_math.h",
+        "python_include/numpy/npy_no_deprecated_api.h",
+        "python_include/numpy/npy_os.h",
+        "python_include/numpy/numpyconfig.h",
+        "python_include/numpy/old_defines.h",
+        "python_include/numpy/oldnumeric.h",
+        "python_include/numpy/ufunc_api.txt",
+        "python_include/numpy/ufuncobject.h",
+        "python_include/numpy/utils.h",
+        "python_include/object.h",
+        "python_include/objimpl.h",
+        "python_include/odictobject.h",
+        "python_include/opcode.h",
+        "python_include/osdefs.h",
+        "python_include/parsetok.h",
+        "python_include/patchlevel.h",
+        "python_include/pgen.h",
+        "python_include/pgenheaders.h",
+        "python_include/py_curses.h",
+        "python_include/pyarena.h",
+        "python_include/pyatomic.h",
+        "python_include/pycapsule.h",
+        "python_include/pyconfig.h",
+        "python_include/pyctype.h",
+        "python_include/pydebug.h",
+        "python_include/pyerrors.h",
+        "python_include/pyexpat.h",
+        "python_include/pyfpe.h",
+        "python_include/pygetopt.h",
+        "python_include/pyhash.h",
+        "python_include/pylifecycle.h",
+        "python_include/pymacconfig.h",
+        "python_include/pymacro.h",
+        "python_include/pymath.h",
+        "python_include/pymem.h",
+        "python_include/pyport.h",
+        "python_include/pystate.h",
+        "python_include/pystrcmp.h",
+        "python_include/pystrhex.h",
+        "python_include/pystrtod.h",
+        "python_include/pythonrun.h",
+        "python_include/pythread.h",
+        "python_include/pytime.h",
+        "python_include/rangeobject.h",
+        "python_include/setobject.h",
+        "python_include/sliceobject.h",
+        "python_include/structmember.h",
+        "python_include/structseq.h",
+        "python_include/symtable.h",
+        "python_include/sysmodule.h",
+        "python_include/token.h",
+        "python_include/traceback.h",
+        "python_include/tupleobject.h",
+        "python_include/typeslots.h",
+        "python_include/ucnhash.h",
+        "python_include/unicodeobject.h",
+        "python_include/warnings.h",
+        "python_include/weakrefobject.h",
+    ],
+    cmd = """
+cp -f "/usr/include/python3.5m/Python-ast.h" "$(@D)/python_include/Python-ast.h" && cp -f "/usr/include/python3.5m/Python.h" "$(@D)/python_include/Python.h" && cp -f "/usr/include/python3.5m/abstract.h" "$(@D)/python_include/abstract.h" && cp -f "/usr/include/python3.5m/accu.h" "$(@D)/python_include/accu.h" && cp -f "/usr/include/python3.5m/asdl.h" "$(@D)/python_include/asdl.h" && cp -f "/usr/include/python3.5m/ast.h" "$(@D)/python_include/ast.h" && cp -f "/usr/include/python3.5m/bitset.h" "$(@D)/python_include/bitset.h" && cp -f "/usr/include/python3.5m/bltinmodule.h" "$(@D)/python_include/bltinmodule.h" && cp -f "/usr/include/python3.5m/boolobject.h" "$(@D)/python_include/boolobject.h" && cp -f "/usr/include/python3.5m/bytearrayobject.h" "$(@D)/python_include/bytearrayobject.h" && cp -f "/usr/include/python3.5m/bytes_methods.h" "$(@D)/python_include/bytes_methods.h" && cp -f "/usr/include/python3.5m/bytesobject.h" "$(@D)/python_include/bytesobject.h" && cp -f "/usr/include/python3.5m/cellobject.h" "$(@D)/python_include/cellobject.h" && cp -f "/usr/include/python3.5m/ceval.h" "$(@D)/python_include/ceval.h" && cp -f "/usr/include/python3.5m/classobject.h" "$(@D)/python_include/classobject.h" && cp -f "/usr/include/python3.5m/code.h" "$(@D)/python_include/code.h" && cp -f "/usr/include/python3.5m/codecs.h" "$(@D)/python_include/codecs.h" && cp -f "/usr/include/python3.5m/compile.h" "$(@D)/python_include/compile.h" && cp -f "/usr/include/python3.5m/complexobject.h" "$(@D)/python_include/complexobject.h" && cp -f "/usr/include/python3.5m/datetime.h" "$(@D)/python_include/datetime.h" && cp -f "/usr/include/python3.5m/descrobject.h" "$(@D)/python_include/descrobject.h" && cp -f "/usr/include/python3.5m/dictobject.h" "$(@D)/python_include/dictobject.h" && cp -f "/usr/include/python3.5m/dtoa.h" "$(@D)/python_include/dtoa.h" && cp -f "/usr/include/python3.5m/dynamic_annotations.h" "$(@D)/python_include/dynamic_annotations.h" && cp -f "/usr/include/python3.5m/enumobject.h" "$(@D)/python_include/enumobject.h" && cp -f "/usr/include/python3.5m/errcode.h" "$(@D)/python_include/errcode.h" && cp -f "/usr/include/python3.5m/eval.h" "$(@D)/python_include/eval.h" && cp -f "/usr/include/python3.5m/fileobject.h" "$(@D)/python_include/fileobject.h" && cp -f "/usr/include/python3.5m/fileutils.h" "$(@D)/python_include/fileutils.h" && cp -f "/usr/include/python3.5m/floatobject.h" "$(@D)/python_include/floatobject.h" && cp -f "/usr/include/python3.5m/frameobject.h" "$(@D)/python_include/frameobject.h" && cp -f "/usr/include/python3.5m/funcobject.h" "$(@D)/python_include/funcobject.h" && cp -f "/usr/include/python3.5m/genobject.h" "$(@D)/python_include/genobject.h" && cp -f "/usr/include/python3.5m/graminit.h" "$(@D)/python_include/graminit.h" && cp -f "/usr/include/python3.5m/grammar.h" "$(@D)/python_include/grammar.h" && cp -f "/usr/include/python3.5m/import.h" "$(@D)/python_include/import.h" && cp -f "/usr/include/python3.5m/intrcheck.h" "$(@D)/python_include/intrcheck.h" && cp -f "/usr/include/python3.5m/iterobject.h" "$(@D)/python_include/iterobject.h" && cp -f "/usr/include/python3.5m/listobject.h" "$(@D)/python_include/listobject.h" && cp -f "/usr/include/python3.5m/longintrepr.h" "$(@D)/python_include/longintrepr.h" && cp -f "/usr/include/python3.5m/longobject.h" "$(@D)/python_include/longobject.h" && cp -f "/usr/include/python3.5m/marshal.h" "$(@D)/python_include/marshal.h" && cp -f "/usr/include/python3.5m/memoryobject.h" "$(@D)/python_include/memoryobject.h" && cp -f "/usr/include/python3.5m/metagrammar.h" "$(@D)/python_include/metagrammar.h" && cp -f "/usr/include/python3.5m/methodobject.h" "$(@D)/python_include/methodobject.h" && cp -f "/usr/include/python3.5m/modsupport.h" "$(@D)/python_include/modsupport.h" && cp -f "/usr/include/python3.5m/moduleobject.h" "$(@D)/python_include/moduleobject.h" && cp -f "/usr/include/python3.5m/namespaceobject.h" "$(@D)/python_include/namespaceobject.h" && cp -f "/usr/include/python3.5m/node.h" "$(@D)/python_include/node.h" && cp -f "/usr/include/python3.5m/numpy/__multiarray_api.h" "$(@D)/python_include/numpy/__multiarray_api.h" && cp -f "/usr/include/python3.5m/numpy/__ufunc_api.h" "$(@D)/python_include/numpy/__ufunc_api.h" && cp -f "/usr/include/python3.5m/numpy/_neighborhood_iterator_imp.h" "$(@D)/python_include/numpy/_neighborhood_iterator_imp.h" && cp -f "/usr/include/python3.5m/numpy/_numpyconfig.h" "$(@D)/python_include/numpy/_numpyconfig.h" && cp -f "/usr/include/python3.5m/numpy/arrayobject.h" "$(@D)/python_include/numpy/arrayobject.h" && cp -f "/usr/include/python3.5m/numpy/arrayscalars.h" "$(@D)/python_include/numpy/arrayscalars.h" && cp -f "/usr/include/python3.5m/numpy/halffloat.h" "$(@D)/python_include/numpy/halffloat.h" && cp -f "/usr/include/python3.5m/numpy/multiarray_api.txt" "$(@D)/python_include/numpy/multiarray_api.txt" && cp -f "/usr/include/python3.5m/numpy/ndarrayobject.h" "$(@D)/python_include/numpy/ndarrayobject.h" && cp -f "/usr/include/python3.5m/numpy/ndarraytypes.h" "$(@D)/python_include/numpy/ndarraytypes.h" && cp -f "/usr/include/python3.5m/numpy/noprefix.h" "$(@D)/python_include/numpy/noprefix.h" && cp -f "/usr/include/python3.5m/numpy/npy_1_7_deprecated_api.h" "$(@D)/python_include/numpy/npy_1_7_deprecated_api.h" && cp -f "/usr/include/python3.5m/numpy/npy_3kcompat.h" "$(@D)/python_include/numpy/npy_3kcompat.h" && cp -f "/usr/include/python3.5m/numpy/npy_common.h" "$(@D)/python_include/numpy/npy_common.h" && cp -f "/usr/include/python3.5m/numpy/npy_cpu.h" "$(@D)/python_include/numpy/npy_cpu.h" && cp -f "/usr/include/python3.5m/numpy/npy_endian.h" "$(@D)/python_include/numpy/npy_endian.h" && cp -f "/usr/include/python3.5m/numpy/npy_interrupt.h" "$(@D)/python_include/numpy/npy_interrupt.h" && cp -f "/usr/include/python3.5m/numpy/npy_math.h" "$(@D)/python_include/numpy/npy_math.h" && cp -f "/usr/include/python3.5m/numpy/npy_no_deprecated_api.h" "$(@D)/python_include/numpy/npy_no_deprecated_api.h" && cp -f "/usr/include/python3.5m/numpy/npy_os.h" "$(@D)/python_include/numpy/npy_os.h" && cp -f "/usr/include/python3.5m/numpy/numpyconfig.h" "$(@D)/python_include/numpy/numpyconfig.h" && cp -f "/usr/include/python3.5m/numpy/old_defines.h" "$(@D)/python_include/numpy/old_defines.h" && cp -f "/usr/include/python3.5m/numpy/oldnumeric.h" "$(@D)/python_include/numpy/oldnumeric.h" && cp -f "/usr/include/python3.5m/numpy/ufunc_api.txt" "$(@D)/python_include/numpy/ufunc_api.txt" && cp -f "/usr/include/python3.5m/numpy/ufuncobject.h" "$(@D)/python_include/numpy/ufuncobject.h" && cp -f "/usr/include/python3.5m/numpy/utils.h" "$(@D)/python_include/numpy/utils.h" && cp -f "/usr/include/python3.5m/object.h" "$(@D)/python_include/object.h" && cp -f "/usr/include/python3.5m/objimpl.h" "$(@D)/python_include/objimpl.h" && cp -f "/usr/include/python3.5m/odictobject.h" "$(@D)/python_include/odictobject.h" && cp -f "/usr/include/python3.5m/opcode.h" "$(@D)/python_include/opcode.h" && cp -f "/usr/include/python3.5m/osdefs.h" "$(@D)/python_include/osdefs.h" && cp -f "/usr/include/python3.5m/parsetok.h" "$(@D)/python_include/parsetok.h" && cp -f "/usr/include/python3.5m/patchlevel.h" "$(@D)/python_include/patchlevel.h" && cp -f "/usr/include/python3.5m/pgen.h" "$(@D)/python_include/pgen.h" && cp -f "/usr/include/python3.5m/pgenheaders.h" "$(@D)/python_include/pgenheaders.h" && cp -f "/usr/include/python3.5m/py_curses.h" "$(@D)/python_include/py_curses.h" && cp -f "/usr/include/python3.5m/pyarena.h" "$(@D)/python_include/pyarena.h" && cp -f "/usr/include/python3.5m/pyatomic.h" "$(@D)/python_include/pyatomic.h" && cp -f "/usr/include/python3.5m/pycapsule.h" "$(@D)/python_include/pycapsule.h" && cp -f "/usr/include/python3.5m/pyconfig.h" "$(@D)/python_include/pyconfig.h" && cp -f "/usr/include/python3.5m/pyctype.h" "$(@D)/python_include/pyctype.h" && cp -f "/usr/include/python3.5m/pydebug.h" "$(@D)/python_include/pydebug.h" && cp -f "/usr/include/python3.5m/pyerrors.h" "$(@D)/python_include/pyerrors.h" && cp -f "/usr/include/python3.5m/pyexpat.h" "$(@D)/python_include/pyexpat.h" && cp -f "/usr/include/python3.5m/pyfpe.h" "$(@D)/python_include/pyfpe.h" && cp -f "/usr/include/python3.5m/pygetopt.h" "$(@D)/python_include/pygetopt.h" && cp -f "/usr/include/python3.5m/pyhash.h" "$(@D)/python_include/pyhash.h" && cp -f "/usr/include/python3.5m/pylifecycle.h" "$(@D)/python_include/pylifecycle.h" && cp -f "/usr/include/python3.5m/pymacconfig.h" "$(@D)/python_include/pymacconfig.h" && cp -f "/usr/include/python3.5m/pymacro.h" "$(@D)/python_include/pymacro.h" && cp -f "/usr/include/python3.5m/pymath.h" "$(@D)/python_include/pymath.h" && cp -f "/usr/include/python3.5m/pymem.h" "$(@D)/python_include/pymem.h" && cp -f "/usr/include/python3.5m/pyport.h" "$(@D)/python_include/pyport.h" && cp -f "/usr/include/python3.5m/pystate.h" "$(@D)/python_include/pystate.h" && cp -f "/usr/include/python3.5m/pystrcmp.h" "$(@D)/python_include/pystrcmp.h" && cp -f "/usr/include/python3.5m/pystrhex.h" "$(@D)/python_include/pystrhex.h" && cp -f "/usr/include/python3.5m/pystrtod.h" "$(@D)/python_include/pystrtod.h" && cp -f "/usr/include/python3.5m/pythonrun.h" "$(@D)/python_include/pythonrun.h" && cp -f "/usr/include/python3.5m/pythread.h" "$(@D)/python_include/pythread.h" && cp -f "/usr/include/python3.5m/pytime.h" "$(@D)/python_include/pytime.h" && cp -f "/usr/include/python3.5m/rangeobject.h" "$(@D)/python_include/rangeobject.h" && cp -f "/usr/include/python3.5m/setobject.h" "$(@D)/python_include/setobject.h" && cp -f "/usr/include/python3.5m/sliceobject.h" "$(@D)/python_include/sliceobject.h" && cp -f "/usr/include/python3.5m/structmember.h" "$(@D)/python_include/structmember.h" && cp -f "/usr/include/python3.5m/structseq.h" "$(@D)/python_include/structseq.h" && cp -f "/usr/include/python3.5m/symtable.h" "$(@D)/python_include/symtable.h" && cp -f "/usr/include/python3.5m/sysmodule.h" "$(@D)/python_include/sysmodule.h" && cp -f "/usr/include/python3.5m/token.h" "$(@D)/python_include/token.h" && cp -f "/usr/include/python3.5m/traceback.h" "$(@D)/python_include/traceback.h" && cp -f "/usr/include/python3.5m/tupleobject.h" "$(@D)/python_include/tupleobject.h" && cp -f "/usr/include/python3.5m/typeslots.h" "$(@D)/python_include/typeslots.h" && cp -f "/usr/include/python3.5m/ucnhash.h" "$(@D)/python_include/ucnhash.h" && cp -f "/usr/include/python3.5m/unicodeobject.h" "$(@D)/python_include/unicodeobject.h" && cp -f "/usr/include/python3.5m/warnings.h" "$(@D)/python_include/warnings.h" && cp -f "/usr/include/python3.5m/weakrefobject.h" "$(@D)/python_include/weakrefobject.h"
+   """,
+)
+
+genrule(
+    name = "numpy_include",
+    outs = [
+        "numpy_include/numpy/__multiarray_api.h",
+        "numpy_include/numpy/__ufunc_api.h",
+        "numpy_include/numpy/_neighborhood_iterator_imp.h",
+        "numpy_include/numpy/_numpyconfig.h",
+        "numpy_include/numpy/arrayobject.h",
+        "numpy_include/numpy/arrayscalars.h",
+        "numpy_include/numpy/halffloat.h",
+        "numpy_include/numpy/multiarray_api.txt",
+        "numpy_include/numpy/ndarrayobject.h",
+        "numpy_include/numpy/ndarraytypes.h",
+        "numpy_include/numpy/noprefix.h",
+        "numpy_include/numpy/npy_1_7_deprecated_api.h",
+        "numpy_include/numpy/npy_3kcompat.h",
+        "numpy_include/numpy/npy_common.h",
+        "numpy_include/numpy/npy_cpu.h",
+        "numpy_include/numpy/npy_endian.h",
+        "numpy_include/numpy/npy_interrupt.h",
+        "numpy_include/numpy/npy_math.h",
+        "numpy_include/numpy/npy_no_deprecated_api.h",
+        "numpy_include/numpy/npy_os.h",
+        "numpy_include/numpy/numpyconfig.h",
+        "numpy_include/numpy/old_defines.h",
+        "numpy_include/numpy/oldnumeric.h",
+        "numpy_include/numpy/ufunc_api.txt",
+        "numpy_include/numpy/ufuncobject.h",
+        "numpy_include/numpy/utils.h",
+    ],
+    cmd = """
+cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/__multiarray_api.h" "$(@D)/numpy_include/numpy/__multiarray_api.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/__ufunc_api.h" "$(@D)/numpy_include/numpy/__ufunc_api.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/_neighborhood_iterator_imp.h" "$(@D)/numpy_include/numpy/_neighborhood_iterator_imp.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/_numpyconfig.h" "$(@D)/numpy_include/numpy/_numpyconfig.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/arrayobject.h" "$(@D)/numpy_include/numpy/arrayobject.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/arrayscalars.h" "$(@D)/numpy_include/numpy/arrayscalars.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/halffloat.h" "$(@D)/numpy_include/numpy/halffloat.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/multiarray_api.txt" "$(@D)/numpy_include/numpy/multiarray_api.txt" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/ndarrayobject.h" "$(@D)/numpy_include/numpy/ndarrayobject.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/ndarraytypes.h" "$(@D)/numpy_include/numpy/ndarraytypes.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/noprefix.h" "$(@D)/numpy_include/numpy/noprefix.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/npy_1_7_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_1_7_deprecated_api.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/npy_3kcompat.h" "$(@D)/numpy_include/numpy/npy_3kcompat.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/npy_common.h" "$(@D)/numpy_include/numpy/npy_common.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/npy_cpu.h" "$(@D)/numpy_include/numpy/npy_cpu.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/npy_endian.h" "$(@D)/numpy_include/numpy/npy_endian.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/npy_interrupt.h" "$(@D)/numpy_include/numpy/npy_interrupt.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/npy_math.h" "$(@D)/numpy_include/numpy/npy_math.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/npy_no_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_no_deprecated_api.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/npy_os.h" "$(@D)/numpy_include/numpy/npy_os.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/numpyconfig.h" "$(@D)/numpy_include/numpy/numpyconfig.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/old_defines.h" "$(@D)/numpy_include/numpy/old_defines.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/oldnumeric.h" "$(@D)/numpy_include/numpy/oldnumeric.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/ufunc_api.txt" "$(@D)/numpy_include/numpy/ufunc_api.txt" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/ufuncobject.h" "$(@D)/numpy_include/numpy/ufuncobject.h" && cp -f "/usr/lib/python3/dist-packages/numpy/core/include/numpy/utils.h" "$(@D)/numpy_include/numpy/utils.h"
+   """,
+)
diff --git a/third_party/toolchains/preconfig/ubuntu16.04/py3/WORKSPACE b/third_party/toolchains/preconfig/ubuntu16.04/py3/WORKSPACE
new file mode 100644
index 0000000000000000000000000000000000000000..1d298fefa3bf40b2c02605960d69c5974e9de7b7
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu16.04/py3/WORKSPACE
@@ -0,0 +1,2 @@
+# DO NOT EDIT: automatically generated WORKSPACE file for python_configure rule
+workspace(name = "local_config_python")
diff --git a/third_party/toolchains/remote/BUILD b/third_party/toolchains/remote/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/third_party/toolchains/remote/BUILD.tpl b/third_party/toolchains/remote/BUILD.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/third_party/toolchains/remote/configure.bzl b/third_party/toolchains/remote/configure.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..cc5b9842648e74a7aa5ac01721719abbd0752123
--- /dev/null
+++ b/third_party/toolchains/remote/configure.bzl
@@ -0,0 +1,43 @@
+"""Repository rule for remote GPU autoconfiguration.
+
+This rule creates the starlark file
+//third_party/toolchains/remote:execution.bzl
+providing the function `gpu_test_tags`.
+
+`gpu_test_tags` will return:
+
+  * `local`: if `REMOTE_GPU_TESTING` is false, allowing CPU tests to run
+    remotely and GPU tests to run locally in the same bazel invocation.
+  * `remote-gpu`: if `REMOTE_GPU_TESTING` is true; this allows rules to
+    set an execution requirement that enables a GPU-enabled remote platform.
+"""
+
+_REMOTE_GPU_TESTING = "REMOTE_GPU_TESTING"
+
+def _flag_enabled(repository_ctx, flag_name):
+    if flag_name not in repository_ctx.os.environ:
+        return False
+    return repository_ctx.os.environ[flag_name].strip() == "1"
+
+def _remote_execution_configure(repository_ctx):
+    # If we do not support remote gpu test execution, mark them as local, so we
+    # can combine remote builds with local gpu tests.
+    gpu_test_tags = "\"local\""
+    if _flag_enabled(repository_ctx, _REMOTE_GPU_TESTING):
+        gpu_test_tags = "\"remote-gpu\""
+    repository_ctx.template(
+        "remote_execution.bzl",
+        Label("//third_party/toolchains/remote:execution.bzl.tpl"),
+        {
+            "%{gpu_test_tags}": gpu_test_tags,
+        },
+    )
+    repository_ctx.template(
+        "BUILD",
+        Label("//third_party/toolchains/remote:BUILD.tpl"),
+    )
+
+remote_execution_configure = repository_rule(
+    implementation = _remote_execution_configure,
+    environ = [_REMOTE_GPU_TESTING],
+)
diff --git a/third_party/toolchains/remote/execution.bzl.tpl b/third_party/toolchains/remote/execution.bzl.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..18858cc0dc01fa50b70eb735768de613388dddda
--- /dev/null
+++ b/third_party/toolchains/remote/execution.bzl.tpl
@@ -0,0 +1,2 @@
+def gpu_test_tags():
+    return [%{gpu_test_tags}]